From: Vladimir Sokolovsky Date: Mon, 8 Dec 2014 15:07:59 +0000 (+0200) Subject: Initial commit. Based on v3.18 X-Git-Url: https://openfabrics.org/gitweb/?a=commitdiff_plain;h=HEAD;p=~emulex%2Ffor-vlad%2Flinux-3.18.git Initial commit. Based on v3.18 Signed-off-by: Vladimir Sokolovsky --- 4b9e97b1c859c8fe54a596b554215f39d22bc761 diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..e213b27 --- /dev/null +++ b/.gitignore @@ -0,0 +1,98 @@ +# +# NOTE! Don't add files that are generated in specific +# subdirectories here. Add them in the ".gitignore" file +# in that subdirectory instead. +# +# NOTE! Please use 'git ls-files -i --exclude-standard' +# command after changing this file, to see if there are +# any tracked files which get ignored after the change. +# +# Normal rules +# +.* +*.o +*.o.* +*.a +*.s +*.ko +*.so +*.so.dbg +*.mod.c +*.i +*.lst +*.symtypes +*.order +*.elf +*.bin +*.gz +*.bz2 +*.lzma +*.xz +*.lz4 +*.lzo +*.patch +*.gcno +modules.builtin +Module.symvers +*.dwo + +# +# Top-level generic files +# +/tags +/TAGS +/linux +/vmlinux +/vmlinuz +/System.map +/Module.markers + +# +# Debian directory (make deb-pkg) +# +/debian/ + +# +# git files that we don't want to ignore even it they are dot-files +# +!.gitignore +!.mailmap + +# +# Generated include files +# +include/config +include/generated +arch/*/include/generated + +# stgit generated dirs +patches-* + +# quilt's files +patches +series + +# cscope files +cscope.* +ncscope.* + +# gnu global files +GPATH +GRTAGS +GSYMS +GTAGS + +*.orig +*~ +\#*# + +# +# Leavings from module signing +# +extra_certificates +signing_key.priv +signing_key.x509 +x509.genkey + +# Kconfig presets +all.config diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig new file mode 100644 index 0000000..7708939 --- /dev/null +++ b/drivers/infiniband/Kconfig @@ -0,0 +1,67 @@ +menuconfig INFINIBAND + tristate "InfiniBand support" + depends on PCI || BROKEN + depends on HAS_IOMEM + depends on NET + depends on INET + depends on m || IPV6 != m + ---help--- + Core support for InfiniBand (IB). Make sure to also select + any protocols you wish to use as well as drivers for your + InfiniBand hardware. + +if INFINIBAND + +config INFINIBAND_USER_MAD + tristate "InfiniBand userspace MAD support" + depends on INFINIBAND + ---help--- + Userspace InfiniBand Management Datagram (MAD) support. This + is the kernel side of the userspace MAD support, which allows + userspace processes to send and receive MADs. You will also + need libibumad from . + +config INFINIBAND_USER_ACCESS + tristate "InfiniBand userspace access (verbs and CM)" + select ANON_INODES + ---help--- + Userspace InfiniBand access support. This enables the + kernel side of userspace verbs and the userspace + communication manager (CM). This allows userspace processes + to set up connections and directly access InfiniBand + hardware for fast-path operations. You will also need + libibverbs, libibcm and a hardware driver library from + . + +config INFINIBAND_USER_MEM + bool + depends on INFINIBAND_USER_ACCESS != n + default y + +config INFINIBAND_ADDR_TRANS + bool + depends on INFINIBAND + default y + +source "drivers/infiniband/hw/mthca/Kconfig" +source "drivers/infiniband/hw/ipath/Kconfig" +source "drivers/infiniband/hw/qib/Kconfig" +source "drivers/infiniband/hw/ehca/Kconfig" +source "drivers/infiniband/hw/amso1100/Kconfig" +source "drivers/infiniband/hw/cxgb3/Kconfig" +source "drivers/infiniband/hw/cxgb4/Kconfig" +source "drivers/infiniband/hw/mlx4/Kconfig" +source "drivers/infiniband/hw/mlx5/Kconfig" +source "drivers/infiniband/hw/nes/Kconfig" +source "drivers/infiniband/hw/ocrdma/Kconfig" +source "drivers/infiniband/hw/usnic/Kconfig" + +source "drivers/infiniband/ulp/ipoib/Kconfig" + +source "drivers/infiniband/ulp/srp/Kconfig" +source "drivers/infiniband/ulp/srpt/Kconfig" + +source "drivers/infiniband/ulp/iser/Kconfig" +source "drivers/infiniband/ulp/isert/Kconfig" + +endif # INFINIBAND diff --git a/drivers/infiniband/Makefile b/drivers/infiniband/Makefile new file mode 100644 index 0000000..dc21836 --- /dev/null +++ b/drivers/infiniband/Makefile @@ -0,0 +1,3 @@ +obj-$(CONFIG_INFINIBAND) += core/ +obj-$(CONFIG_INFINIBAND) += hw/ +obj-$(CONFIG_INFINIBAND) += ulp/ diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile new file mode 100644 index 0000000..ffd0af6 --- /dev/null +++ b/drivers/infiniband/core/Makefile @@ -0,0 +1,33 @@ +infiniband-$(CONFIG_INFINIBAND_ADDR_TRANS) := rdma_cm.o +user_access-$(CONFIG_INFINIBAND_ADDR_TRANS) := rdma_ucm.o + +obj-$(CONFIG_INFINIBAND) += ib_core.o ib_mad.o ib_sa.o \ + ib_cm.o iw_cm.o ib_addr.o \ + $(infiniband-y) +obj-$(CONFIG_INFINIBAND_USER_MAD) += ib_umad.o +obj-$(CONFIG_INFINIBAND_USER_ACCESS) += ib_uverbs.o ib_ucm.o \ + $(user_access-y) + +ib_core-y := packer.o ud_header.o verbs.o sysfs.o \ + device.o fmr_pool.o cache.o netlink.o +ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o + +ib_mad-y := mad.o smi.o agent.o mad_rmpp.o + +ib_sa-y := sa_query.o multicast.o + +ib_cm-y := cm.o + +iw_cm-y := iwcm.o iwpm_util.o iwpm_msg.o + +rdma_cm-y := cma.o + +rdma_ucm-y := ucma.o + +ib_addr-y := addr.o + +ib_umad-y := user_mad.o + +ib_ucm-y := ucm.o + +ib_uverbs-y := uverbs_main.o uverbs_cmd.o uverbs_marshall.o diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c new file mode 100644 index 0000000..8172d37 --- /dev/null +++ b/drivers/infiniband/core/addr.c @@ -0,0 +1,565 @@ +/* + * Copyright (c) 2005 Voltaire Inc. All rights reserved. + * Copyright (c) 2002-2005, Network Appliance, Inc. All rights reserved. + * Copyright (c) 1999-2005, Mellanox Technologies, Inc. All rights reserved. + * Copyright (c) 2005 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +MODULE_AUTHOR("Sean Hefty"); +MODULE_DESCRIPTION("IB Address Translation"); +MODULE_LICENSE("Dual BSD/GPL"); + +struct addr_req { + struct list_head list; + struct sockaddr_storage src_addr; + struct sockaddr_storage dst_addr; + struct rdma_dev_addr *addr; + struct rdma_addr_client *client; + void *context; + void (*callback)(int status, struct sockaddr *src_addr, + struct rdma_dev_addr *addr, void *context); + unsigned long timeout; + int status; +}; + +static void process_req(struct work_struct *work); + +static DEFINE_MUTEX(lock); +static LIST_HEAD(req_list); +static DECLARE_DELAYED_WORK(work, process_req); +static struct workqueue_struct *addr_wq; + +int rdma_addr_size(struct sockaddr *addr) +{ + switch (addr->sa_family) { + case AF_INET: + return sizeof(struct sockaddr_in); + case AF_INET6: + return sizeof(struct sockaddr_in6); + case AF_IB: + return sizeof(struct sockaddr_ib); + default: + return 0; + } +} +EXPORT_SYMBOL(rdma_addr_size); + +static struct rdma_addr_client self; + +void rdma_addr_register_client(struct rdma_addr_client *client) +{ + atomic_set(&client->refcount, 1); + init_completion(&client->comp); +} +EXPORT_SYMBOL(rdma_addr_register_client); + +static inline void put_client(struct rdma_addr_client *client) +{ + if (atomic_dec_and_test(&client->refcount)) + complete(&client->comp); +} + +void rdma_addr_unregister_client(struct rdma_addr_client *client) +{ + put_client(client); + wait_for_completion(&client->comp); +} +EXPORT_SYMBOL(rdma_addr_unregister_client); + +int rdma_copy_addr(struct rdma_dev_addr *dev_addr, struct net_device *dev, + const unsigned char *dst_dev_addr) +{ + dev_addr->dev_type = dev->type; + memcpy(dev_addr->src_dev_addr, dev->dev_addr, MAX_ADDR_LEN); + memcpy(dev_addr->broadcast, dev->broadcast, MAX_ADDR_LEN); + if (dst_dev_addr) + memcpy(dev_addr->dst_dev_addr, dst_dev_addr, MAX_ADDR_LEN); + dev_addr->bound_dev_if = dev->ifindex; + return 0; +} +EXPORT_SYMBOL(rdma_copy_addr); + +int rdma_translate_ip(struct sockaddr *addr, struct rdma_dev_addr *dev_addr, + u16 *vlan_id) +{ + struct net_device *dev; + int ret = -EADDRNOTAVAIL; + + if (dev_addr->bound_dev_if) { + dev = dev_get_by_index(&init_net, dev_addr->bound_dev_if); + if (!dev) + return -ENODEV; + ret = rdma_copy_addr(dev_addr, dev, NULL); + dev_put(dev); + return ret; + } + + switch (addr->sa_family) { + case AF_INET: + dev = ip_dev_find(&init_net, + ((struct sockaddr_in *) addr)->sin_addr.s_addr); + + if (!dev) + return ret; + + ret = rdma_copy_addr(dev_addr, dev, NULL); + if (vlan_id) + *vlan_id = rdma_vlan_dev_vlan_id(dev); + dev_put(dev); + break; + +#if IS_ENABLED(CONFIG_IPV6) + case AF_INET6: + rcu_read_lock(); + for_each_netdev_rcu(&init_net, dev) { + if (ipv6_chk_addr(&init_net, + &((struct sockaddr_in6 *) addr)->sin6_addr, + dev, 1)) { + ret = rdma_copy_addr(dev_addr, dev, NULL); + if (vlan_id) + *vlan_id = rdma_vlan_dev_vlan_id(dev); + break; + } + } + rcu_read_unlock(); + break; +#endif + } + return ret; +} +EXPORT_SYMBOL(rdma_translate_ip); + +static void set_timeout(unsigned long time) +{ + unsigned long delay; + + delay = time - jiffies; + if ((long)delay <= 0) + delay = 1; + + mod_delayed_work(addr_wq, &work, delay); +} + +static void queue_req(struct addr_req *req) +{ + struct addr_req *temp_req; + + mutex_lock(&lock); + list_for_each_entry_reverse(temp_req, &req_list, list) { + if (time_after_eq(req->timeout, temp_req->timeout)) + break; + } + + list_add(&req->list, &temp_req->list); + + if (req_list.next == &req->list) + set_timeout(req->timeout); + mutex_unlock(&lock); +} + +static int dst_fetch_ha(struct dst_entry *dst, struct rdma_dev_addr *dev_addr, void *daddr) +{ + struct neighbour *n; + int ret; + + n = dst_neigh_lookup(dst, daddr); + + rcu_read_lock(); + if (!n || !(n->nud_state & NUD_VALID)) { + if (n) + neigh_event_send(n, NULL); + ret = -ENODATA; + } else { + ret = rdma_copy_addr(dev_addr, dst->dev, n->ha); + } + rcu_read_unlock(); + + if (n) + neigh_release(n); + + return ret; +} + +static int addr4_resolve(struct sockaddr_in *src_in, + struct sockaddr_in *dst_in, + struct rdma_dev_addr *addr) +{ + __be32 src_ip = src_in->sin_addr.s_addr; + __be32 dst_ip = dst_in->sin_addr.s_addr; + struct rtable *rt; + struct flowi4 fl4; + int ret; + + memset(&fl4, 0, sizeof(fl4)); + fl4.daddr = dst_ip; + fl4.saddr = src_ip; + fl4.flowi4_oif = addr->bound_dev_if; + rt = ip_route_output_key(&init_net, &fl4); + if (IS_ERR(rt)) { + ret = PTR_ERR(rt); + goto out; + } + src_in->sin_family = AF_INET; + src_in->sin_addr.s_addr = fl4.saddr; + + if (rt->dst.dev->flags & IFF_LOOPBACK) { + ret = rdma_translate_ip((struct sockaddr *)dst_in, addr, NULL); + if (!ret) + memcpy(addr->dst_dev_addr, addr->src_dev_addr, MAX_ADDR_LEN); + goto put; + } + + /* If the device does ARP internally, return 'done' */ + if (rt->dst.dev->flags & IFF_NOARP) { + ret = rdma_copy_addr(addr, rt->dst.dev, NULL); + goto put; + } + + ret = dst_fetch_ha(&rt->dst, addr, &fl4.daddr); +put: + ip_rt_put(rt); +out: + return ret; +} + +#if IS_ENABLED(CONFIG_IPV6) +static int addr6_resolve(struct sockaddr_in6 *src_in, + struct sockaddr_in6 *dst_in, + struct rdma_dev_addr *addr) +{ + struct flowi6 fl6; + struct dst_entry *dst; + int ret; + + memset(&fl6, 0, sizeof fl6); + fl6.daddr = dst_in->sin6_addr; + fl6.saddr = src_in->sin6_addr; + fl6.flowi6_oif = addr->bound_dev_if; + + dst = ip6_route_output(&init_net, NULL, &fl6); + if ((ret = dst->error)) + goto put; + + if (ipv6_addr_any(&fl6.saddr)) { + ret = ipv6_dev_get_saddr(&init_net, ip6_dst_idev(dst)->dev, + &fl6.daddr, 0, &fl6.saddr); + if (ret) + goto put; + + src_in->sin6_family = AF_INET6; + src_in->sin6_addr = fl6.saddr; + } + + if (dst->dev->flags & IFF_LOOPBACK) { + ret = rdma_translate_ip((struct sockaddr *)dst_in, addr, NULL); + if (!ret) + memcpy(addr->dst_dev_addr, addr->src_dev_addr, MAX_ADDR_LEN); + goto put; + } + + /* If the device does ARP internally, return 'done' */ + if (dst->dev->flags & IFF_NOARP) { + ret = rdma_copy_addr(addr, dst->dev, NULL); + goto put; + } + + ret = dst_fetch_ha(dst, addr, &fl6.daddr); +put: + dst_release(dst); + return ret; +} +#else +static int addr6_resolve(struct sockaddr_in6 *src_in, + struct sockaddr_in6 *dst_in, + struct rdma_dev_addr *addr) +{ + return -EADDRNOTAVAIL; +} +#endif + +static int addr_resolve(struct sockaddr *src_in, + struct sockaddr *dst_in, + struct rdma_dev_addr *addr) +{ + if (src_in->sa_family == AF_INET) { + return addr4_resolve((struct sockaddr_in *) src_in, + (struct sockaddr_in *) dst_in, addr); + } else + return addr6_resolve((struct sockaddr_in6 *) src_in, + (struct sockaddr_in6 *) dst_in, addr); +} + +static void process_req(struct work_struct *work) +{ + struct addr_req *req, *temp_req; + struct sockaddr *src_in, *dst_in; + struct list_head done_list; + + INIT_LIST_HEAD(&done_list); + + mutex_lock(&lock); + list_for_each_entry_safe(req, temp_req, &req_list, list) { + if (req->status == -ENODATA) { + src_in = (struct sockaddr *) &req->src_addr; + dst_in = (struct sockaddr *) &req->dst_addr; + req->status = addr_resolve(src_in, dst_in, req->addr); + if (req->status && time_after_eq(jiffies, req->timeout)) + req->status = -ETIMEDOUT; + else if (req->status == -ENODATA) + continue; + } + list_move_tail(&req->list, &done_list); + } + + if (!list_empty(&req_list)) { + req = list_entry(req_list.next, struct addr_req, list); + set_timeout(req->timeout); + } + mutex_unlock(&lock); + + list_for_each_entry_safe(req, temp_req, &done_list, list) { + list_del(&req->list); + req->callback(req->status, (struct sockaddr *) &req->src_addr, + req->addr, req->context); + put_client(req->client); + kfree(req); + } +} + +int rdma_resolve_ip(struct rdma_addr_client *client, + struct sockaddr *src_addr, struct sockaddr *dst_addr, + struct rdma_dev_addr *addr, int timeout_ms, + void (*callback)(int status, struct sockaddr *src_addr, + struct rdma_dev_addr *addr, void *context), + void *context) +{ + struct sockaddr *src_in, *dst_in; + struct addr_req *req; + int ret = 0; + + req = kzalloc(sizeof *req, GFP_KERNEL); + if (!req) + return -ENOMEM; + + src_in = (struct sockaddr *) &req->src_addr; + dst_in = (struct sockaddr *) &req->dst_addr; + + if (src_addr) { + if (src_addr->sa_family != dst_addr->sa_family) { + ret = -EINVAL; + goto err; + } + + memcpy(src_in, src_addr, rdma_addr_size(src_addr)); + } else { + src_in->sa_family = dst_addr->sa_family; + } + + memcpy(dst_in, dst_addr, rdma_addr_size(dst_addr)); + req->addr = addr; + req->callback = callback; + req->context = context; + req->client = client; + atomic_inc(&client->refcount); + + req->status = addr_resolve(src_in, dst_in, addr); + switch (req->status) { + case 0: + req->timeout = jiffies; + queue_req(req); + break; + case -ENODATA: + req->timeout = msecs_to_jiffies(timeout_ms) + jiffies; + queue_req(req); + break; + default: + ret = req->status; + atomic_dec(&client->refcount); + goto err; + } + return ret; +err: + kfree(req); + return ret; +} +EXPORT_SYMBOL(rdma_resolve_ip); + +void rdma_addr_cancel(struct rdma_dev_addr *addr) +{ + struct addr_req *req, *temp_req; + + mutex_lock(&lock); + list_for_each_entry_safe(req, temp_req, &req_list, list) { + if (req->addr == addr) { + req->status = -ECANCELED; + req->timeout = jiffies; + list_move(&req->list, &req_list); + set_timeout(req->timeout); + break; + } + } + mutex_unlock(&lock); +} +EXPORT_SYMBOL(rdma_addr_cancel); + +struct resolve_cb_context { + struct rdma_dev_addr *addr; + struct completion comp; +}; + +static void resolve_cb(int status, struct sockaddr *src_addr, + struct rdma_dev_addr *addr, void *context) +{ + memcpy(((struct resolve_cb_context *)context)->addr, addr, sizeof(struct + rdma_dev_addr)); + complete(&((struct resolve_cb_context *)context)->comp); +} + +int rdma_addr_find_dmac_by_grh(union ib_gid *sgid, union ib_gid *dgid, u8 *dmac, + u16 *vlan_id) +{ + int ret = 0; + struct rdma_dev_addr dev_addr; + struct resolve_cb_context ctx; + struct net_device *dev; + + union { + struct sockaddr _sockaddr; + struct sockaddr_in _sockaddr_in; + struct sockaddr_in6 _sockaddr_in6; + } sgid_addr, dgid_addr; + + + ret = rdma_gid2ip(&sgid_addr._sockaddr, sgid); + if (ret) + return ret; + + ret = rdma_gid2ip(&dgid_addr._sockaddr, dgid); + if (ret) + return ret; + + memset(&dev_addr, 0, sizeof(dev_addr)); + + ctx.addr = &dev_addr; + init_completion(&ctx.comp); + ret = rdma_resolve_ip(&self, &sgid_addr._sockaddr, &dgid_addr._sockaddr, + &dev_addr, 1000, resolve_cb, &ctx); + if (ret) + return ret; + + wait_for_completion(&ctx.comp); + + memcpy(dmac, dev_addr.dst_dev_addr, ETH_ALEN); + dev = dev_get_by_index(&init_net, dev_addr.bound_dev_if); + if (!dev) + return -ENODEV; + if (vlan_id) + *vlan_id = rdma_vlan_dev_vlan_id(dev); + dev_put(dev); + return ret; +} +EXPORT_SYMBOL(rdma_addr_find_dmac_by_grh); + +int rdma_addr_find_smac_by_sgid(union ib_gid *sgid, u8 *smac, u16 *vlan_id) +{ + int ret = 0; + struct rdma_dev_addr dev_addr; + union { + struct sockaddr _sockaddr; + struct sockaddr_in _sockaddr_in; + struct sockaddr_in6 _sockaddr_in6; + } gid_addr; + + ret = rdma_gid2ip(&gid_addr._sockaddr, sgid); + + if (ret) + return ret; + memset(&dev_addr, 0, sizeof(dev_addr)); + ret = rdma_translate_ip(&gid_addr._sockaddr, &dev_addr, vlan_id); + if (ret) + return ret; + + memcpy(smac, dev_addr.src_dev_addr, ETH_ALEN); + return ret; +} +EXPORT_SYMBOL(rdma_addr_find_smac_by_sgid); + +static int netevent_callback(struct notifier_block *self, unsigned long event, + void *ctx) +{ + if (event == NETEVENT_NEIGH_UPDATE) { + struct neighbour *neigh = ctx; + + if (neigh->nud_state & NUD_VALID) { + set_timeout(jiffies); + } + } + return 0; +} + +static struct notifier_block nb = { + .notifier_call = netevent_callback +}; + +static int __init addr_init(void) +{ + addr_wq = create_singlethread_workqueue("ib_addr"); + if (!addr_wq) + return -ENOMEM; + + register_netevent_notifier(&nb); + rdma_addr_register_client(&self); + return 0; +} + +static void __exit addr_cleanup(void) +{ + rdma_addr_unregister_client(&self); + unregister_netevent_notifier(&nb); + destroy_workqueue(addr_wq); +} + +module_init(addr_init); +module_exit(addr_cleanup); diff --git a/drivers/infiniband/core/agent.c b/drivers/infiniband/core/agent.c new file mode 100644 index 0000000..f6d2961 --- /dev/null +++ b/drivers/infiniband/core/agent.c @@ -0,0 +1,217 @@ +/* + * Copyright (c) 2004, 2005 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2004, 2005 Infinicon Corporation. All rights reserved. + * Copyright (c) 2004, 2005 Intel Corporation. All rights reserved. + * Copyright (c) 2004, 2005 Topspin Corporation. All rights reserved. + * Copyright (c) 2004-2007 Voltaire Corporation. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include +#include + +#include "agent.h" +#include "smi.h" +#include "mad_priv.h" + +#define SPFX "ib_agent: " + +struct ib_agent_port_private { + struct list_head port_list; + struct ib_mad_agent *agent[2]; +}; + +static DEFINE_SPINLOCK(ib_agent_port_list_lock); +static LIST_HEAD(ib_agent_port_list); + +static struct ib_agent_port_private * +__ib_get_agent_port(struct ib_device *device, int port_num) +{ + struct ib_agent_port_private *entry; + + list_for_each_entry(entry, &ib_agent_port_list, port_list) { + if (entry->agent[1]->device == device && + entry->agent[1]->port_num == port_num) + return entry; + } + return NULL; +} + +static struct ib_agent_port_private * +ib_get_agent_port(struct ib_device *device, int port_num) +{ + struct ib_agent_port_private *entry; + unsigned long flags; + + spin_lock_irqsave(&ib_agent_port_list_lock, flags); + entry = __ib_get_agent_port(device, port_num); + spin_unlock_irqrestore(&ib_agent_port_list_lock, flags); + return entry; +} + +void agent_send_response(struct ib_mad *mad, struct ib_grh *grh, + struct ib_wc *wc, struct ib_device *device, + int port_num, int qpn) +{ + struct ib_agent_port_private *port_priv; + struct ib_mad_agent *agent; + struct ib_mad_send_buf *send_buf; + struct ib_ah *ah; + struct ib_mad_send_wr_private *mad_send_wr; + + if (device->node_type == RDMA_NODE_IB_SWITCH) + port_priv = ib_get_agent_port(device, 0); + else + port_priv = ib_get_agent_port(device, port_num); + + if (!port_priv) { + dev_err(&device->dev, "Unable to find port agent\n"); + return; + } + + agent = port_priv->agent[qpn]; + ah = ib_create_ah_from_wc(agent->qp->pd, wc, grh, port_num); + if (IS_ERR(ah)) { + dev_err(&device->dev, "ib_create_ah_from_wc error %ld\n", + PTR_ERR(ah)); + return; + } + + send_buf = ib_create_send_mad(agent, wc->src_qp, wc->pkey_index, 0, + IB_MGMT_MAD_HDR, IB_MGMT_MAD_DATA, + GFP_KERNEL); + if (IS_ERR(send_buf)) { + dev_err(&device->dev, "ib_create_send_mad error\n"); + goto err1; + } + + memcpy(send_buf->mad, mad, sizeof *mad); + send_buf->ah = ah; + + if (device->node_type == RDMA_NODE_IB_SWITCH) { + mad_send_wr = container_of(send_buf, + struct ib_mad_send_wr_private, + send_buf); + mad_send_wr->send_wr.wr.ud.port_num = port_num; + } + + if (ib_post_send_mad(send_buf, NULL)) { + dev_err(&device->dev, "ib_post_send_mad error\n"); + goto err2; + } + return; +err2: + ib_free_send_mad(send_buf); +err1: + ib_destroy_ah(ah); +} + +static void agent_send_handler(struct ib_mad_agent *mad_agent, + struct ib_mad_send_wc *mad_send_wc) +{ + ib_destroy_ah(mad_send_wc->send_buf->ah); + ib_free_send_mad(mad_send_wc->send_buf); +} + +int ib_agent_port_open(struct ib_device *device, int port_num) +{ + struct ib_agent_port_private *port_priv; + unsigned long flags; + int ret; + + /* Create new device info */ + port_priv = kzalloc(sizeof *port_priv, GFP_KERNEL); + if (!port_priv) { + dev_err(&device->dev, "No memory for ib_agent_port_private\n"); + ret = -ENOMEM; + goto error1; + } + + if (rdma_port_get_link_layer(device, port_num) == IB_LINK_LAYER_INFINIBAND) { + /* Obtain send only MAD agent for SMI QP */ + port_priv->agent[0] = ib_register_mad_agent(device, port_num, + IB_QPT_SMI, NULL, 0, + &agent_send_handler, + NULL, NULL, 0); + if (IS_ERR(port_priv->agent[0])) { + ret = PTR_ERR(port_priv->agent[0]); + goto error2; + } + } + + /* Obtain send only MAD agent for GSI QP */ + port_priv->agent[1] = ib_register_mad_agent(device, port_num, + IB_QPT_GSI, NULL, 0, + &agent_send_handler, + NULL, NULL, 0); + if (IS_ERR(port_priv->agent[1])) { + ret = PTR_ERR(port_priv->agent[1]); + goto error3; + } + + spin_lock_irqsave(&ib_agent_port_list_lock, flags); + list_add_tail(&port_priv->port_list, &ib_agent_port_list); + spin_unlock_irqrestore(&ib_agent_port_list_lock, flags); + + return 0; + +error3: + if (port_priv->agent[0]) + ib_unregister_mad_agent(port_priv->agent[0]); +error2: + kfree(port_priv); +error1: + return ret; +} + +int ib_agent_port_close(struct ib_device *device, int port_num) +{ + struct ib_agent_port_private *port_priv; + unsigned long flags; + + spin_lock_irqsave(&ib_agent_port_list_lock, flags); + port_priv = __ib_get_agent_port(device, port_num); + if (port_priv == NULL) { + spin_unlock_irqrestore(&ib_agent_port_list_lock, flags); + dev_err(&device->dev, "Port %d not found\n", port_num); + return -ENODEV; + } + list_del(&port_priv->port_list); + spin_unlock_irqrestore(&ib_agent_port_list_lock, flags); + + ib_unregister_mad_agent(port_priv->agent[1]); + if (port_priv->agent[0]) + ib_unregister_mad_agent(port_priv->agent[0]); + + kfree(port_priv); + return 0; +} diff --git a/drivers/infiniband/core/agent.h b/drivers/infiniband/core/agent.h new file mode 100644 index 0000000..6669287 --- /dev/null +++ b/drivers/infiniband/core/agent.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2004 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2004 Infinicon Corporation. All rights reserved. + * Copyright (c) 2004 Intel Corporation. All rights reserved. + * Copyright (c) 2004 Topspin Corporation. All rights reserved. + * Copyright (c) 2004 Voltaire Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __AGENT_H_ +#define __AGENT_H_ + +#include +#include + +extern int ib_agent_port_open(struct ib_device *device, int port_num); + +extern int ib_agent_port_close(struct ib_device *device, int port_num); + +extern void agent_send_response(struct ib_mad *mad, struct ib_grh *grh, + struct ib_wc *wc, struct ib_device *device, + int port_num, int qpn); + +#endif /* __AGENT_H_ */ diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c new file mode 100644 index 0000000..80f6cf2 --- /dev/null +++ b/drivers/infiniband/core/cache.c @@ -0,0 +1,439 @@ +/* + * Copyright (c) 2004 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Intel Corporation. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2005 Voltaire, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include + +#include + +#include "core_priv.h" + +struct ib_pkey_cache { + int table_len; + u16 table[0]; +}; + +struct ib_gid_cache { + int table_len; + union ib_gid table[0]; +}; + +struct ib_update_work { + struct work_struct work; + struct ib_device *device; + u8 port_num; +}; + +static inline int start_port(struct ib_device *device) +{ + return (device->node_type == RDMA_NODE_IB_SWITCH) ? 0 : 1; +} + +static inline int end_port(struct ib_device *device) +{ + return (device->node_type == RDMA_NODE_IB_SWITCH) ? + 0 : device->phys_port_cnt; +} + +int ib_get_cached_gid(struct ib_device *device, + u8 port_num, + int index, + union ib_gid *gid) +{ + struct ib_gid_cache *cache; + unsigned long flags; + int ret = 0; + + if (port_num < start_port(device) || port_num > end_port(device)) + return -EINVAL; + + read_lock_irqsave(&device->cache.lock, flags); + + cache = device->cache.gid_cache[port_num - start_port(device)]; + + if (index < 0 || index >= cache->table_len) + ret = -EINVAL; + else + *gid = cache->table[index]; + + read_unlock_irqrestore(&device->cache.lock, flags); + + return ret; +} +EXPORT_SYMBOL(ib_get_cached_gid); + +int ib_find_cached_gid(struct ib_device *device, + union ib_gid *gid, + u8 *port_num, + u16 *index) +{ + struct ib_gid_cache *cache; + unsigned long flags; + int p, i; + int ret = -ENOENT; + + *port_num = -1; + if (index) + *index = -1; + + read_lock_irqsave(&device->cache.lock, flags); + + for (p = 0; p <= end_port(device) - start_port(device); ++p) { + cache = device->cache.gid_cache[p]; + for (i = 0; i < cache->table_len; ++i) { + if (!memcmp(gid, &cache->table[i], sizeof *gid)) { + *port_num = p + start_port(device); + if (index) + *index = i; + ret = 0; + goto found; + } + } + } +found: + read_unlock_irqrestore(&device->cache.lock, flags); + + return ret; +} +EXPORT_SYMBOL(ib_find_cached_gid); + +int ib_get_cached_pkey(struct ib_device *device, + u8 port_num, + int index, + u16 *pkey) +{ + struct ib_pkey_cache *cache; + unsigned long flags; + int ret = 0; + + if (port_num < start_port(device) || port_num > end_port(device)) + return -EINVAL; + + read_lock_irqsave(&device->cache.lock, flags); + + cache = device->cache.pkey_cache[port_num - start_port(device)]; + + if (index < 0 || index >= cache->table_len) + ret = -EINVAL; + else + *pkey = cache->table[index]; + + read_unlock_irqrestore(&device->cache.lock, flags); + + return ret; +} +EXPORT_SYMBOL(ib_get_cached_pkey); + +int ib_find_cached_pkey(struct ib_device *device, + u8 port_num, + u16 pkey, + u16 *index) +{ + struct ib_pkey_cache *cache; + unsigned long flags; + int i; + int ret = -ENOENT; + int partial_ix = -1; + + if (port_num < start_port(device) || port_num > end_port(device)) + return -EINVAL; + + read_lock_irqsave(&device->cache.lock, flags); + + cache = device->cache.pkey_cache[port_num - start_port(device)]; + + *index = -1; + + for (i = 0; i < cache->table_len; ++i) + if ((cache->table[i] & 0x7fff) == (pkey & 0x7fff)) { + if (cache->table[i] & 0x8000) { + *index = i; + ret = 0; + break; + } else + partial_ix = i; + } + + if (ret && partial_ix >= 0) { + *index = partial_ix; + ret = 0; + } + + read_unlock_irqrestore(&device->cache.lock, flags); + + return ret; +} +EXPORT_SYMBOL(ib_find_cached_pkey); + +int ib_find_exact_cached_pkey(struct ib_device *device, + u8 port_num, + u16 pkey, + u16 *index) +{ + struct ib_pkey_cache *cache; + unsigned long flags; + int i; + int ret = -ENOENT; + + if (port_num < start_port(device) || port_num > end_port(device)) + return -EINVAL; + + read_lock_irqsave(&device->cache.lock, flags); + + cache = device->cache.pkey_cache[port_num - start_port(device)]; + + *index = -1; + + for (i = 0; i < cache->table_len; ++i) + if (cache->table[i] == pkey) { + *index = i; + ret = 0; + break; + } + + read_unlock_irqrestore(&device->cache.lock, flags); + + return ret; +} +EXPORT_SYMBOL(ib_find_exact_cached_pkey); + +int ib_get_cached_lmc(struct ib_device *device, + u8 port_num, + u8 *lmc) +{ + unsigned long flags; + int ret = 0; + + if (port_num < start_port(device) || port_num > end_port(device)) + return -EINVAL; + + read_lock_irqsave(&device->cache.lock, flags); + *lmc = device->cache.lmc_cache[port_num - start_port(device)]; + read_unlock_irqrestore(&device->cache.lock, flags); + + return ret; +} +EXPORT_SYMBOL(ib_get_cached_lmc); + +static void ib_cache_update(struct ib_device *device, + u8 port) +{ + struct ib_port_attr *tprops = NULL; + struct ib_pkey_cache *pkey_cache = NULL, *old_pkey_cache; + struct ib_gid_cache *gid_cache = NULL, *old_gid_cache; + int i; + int ret; + + tprops = kmalloc(sizeof *tprops, GFP_KERNEL); + if (!tprops) + return; + + ret = ib_query_port(device, port, tprops); + if (ret) { + printk(KERN_WARNING "ib_query_port failed (%d) for %s\n", + ret, device->name); + goto err; + } + + pkey_cache = kmalloc(sizeof *pkey_cache + tprops->pkey_tbl_len * + sizeof *pkey_cache->table, GFP_KERNEL); + if (!pkey_cache) + goto err; + + pkey_cache->table_len = tprops->pkey_tbl_len; + + gid_cache = kmalloc(sizeof *gid_cache + tprops->gid_tbl_len * + sizeof *gid_cache->table, GFP_KERNEL); + if (!gid_cache) + goto err; + + gid_cache->table_len = tprops->gid_tbl_len; + + for (i = 0; i < pkey_cache->table_len; ++i) { + ret = ib_query_pkey(device, port, i, pkey_cache->table + i); + if (ret) { + printk(KERN_WARNING "ib_query_pkey failed (%d) for %s (index %d)\n", + ret, device->name, i); + goto err; + } + } + + for (i = 0; i < gid_cache->table_len; ++i) { + ret = ib_query_gid(device, port, i, gid_cache->table + i); + if (ret) { + printk(KERN_WARNING "ib_query_gid failed (%d) for %s (index %d)\n", + ret, device->name, i); + goto err; + } + } + + write_lock_irq(&device->cache.lock); + + old_pkey_cache = device->cache.pkey_cache[port - start_port(device)]; + old_gid_cache = device->cache.gid_cache [port - start_port(device)]; + + device->cache.pkey_cache[port - start_port(device)] = pkey_cache; + device->cache.gid_cache [port - start_port(device)] = gid_cache; + + device->cache.lmc_cache[port - start_port(device)] = tprops->lmc; + + write_unlock_irq(&device->cache.lock); + + kfree(old_pkey_cache); + kfree(old_gid_cache); + kfree(tprops); + return; + +err: + kfree(pkey_cache); + kfree(gid_cache); + kfree(tprops); +} + +static void ib_cache_task(struct work_struct *_work) +{ + struct ib_update_work *work = + container_of(_work, struct ib_update_work, work); + + ib_cache_update(work->device, work->port_num); + kfree(work); +} + +static void ib_cache_event(struct ib_event_handler *handler, + struct ib_event *event) +{ + struct ib_update_work *work; + + if (event->event == IB_EVENT_PORT_ERR || + event->event == IB_EVENT_PORT_ACTIVE || + event->event == IB_EVENT_LID_CHANGE || + event->event == IB_EVENT_PKEY_CHANGE || + event->event == IB_EVENT_SM_CHANGE || + event->event == IB_EVENT_CLIENT_REREGISTER || + event->event == IB_EVENT_GID_CHANGE) { + work = kmalloc(sizeof *work, GFP_ATOMIC); + if (work) { + INIT_WORK(&work->work, ib_cache_task); + work->device = event->device; + work->port_num = event->element.port_num; + queue_work(ib_wq, &work->work); + } + } +} + +static void ib_cache_setup_one(struct ib_device *device) +{ + int p; + + rwlock_init(&device->cache.lock); + + device->cache.pkey_cache = + kmalloc(sizeof *device->cache.pkey_cache * + (end_port(device) - start_port(device) + 1), GFP_KERNEL); + device->cache.gid_cache = + kmalloc(sizeof *device->cache.gid_cache * + (end_port(device) - start_port(device) + 1), GFP_KERNEL); + + device->cache.lmc_cache = kmalloc(sizeof *device->cache.lmc_cache * + (end_port(device) - + start_port(device) + 1), + GFP_KERNEL); + + if (!device->cache.pkey_cache || !device->cache.gid_cache || + !device->cache.lmc_cache) { + printk(KERN_WARNING "Couldn't allocate cache " + "for %s\n", device->name); + goto err; + } + + for (p = 0; p <= end_port(device) - start_port(device); ++p) { + device->cache.pkey_cache[p] = NULL; + device->cache.gid_cache [p] = NULL; + ib_cache_update(device, p + start_port(device)); + } + + INIT_IB_EVENT_HANDLER(&device->cache.event_handler, + device, ib_cache_event); + if (ib_register_event_handler(&device->cache.event_handler)) + goto err_cache; + + return; + +err_cache: + for (p = 0; p <= end_port(device) - start_port(device); ++p) { + kfree(device->cache.pkey_cache[p]); + kfree(device->cache.gid_cache[p]); + } + +err: + kfree(device->cache.pkey_cache); + kfree(device->cache.gid_cache); + kfree(device->cache.lmc_cache); +} + +static void ib_cache_cleanup_one(struct ib_device *device) +{ + int p; + + ib_unregister_event_handler(&device->cache.event_handler); + flush_workqueue(ib_wq); + + for (p = 0; p <= end_port(device) - start_port(device); ++p) { + kfree(device->cache.pkey_cache[p]); + kfree(device->cache.gid_cache[p]); + } + + kfree(device->cache.pkey_cache); + kfree(device->cache.gid_cache); + kfree(device->cache.lmc_cache); +} + +static struct ib_client cache_client = { + .name = "cache", + .add = ib_cache_setup_one, + .remove = ib_cache_cleanup_one +}; + +int __init ib_cache_setup(void) +{ + return ib_register_client(&cache_client); +} + +void __exit ib_cache_cleanup(void) +{ + ib_unregister_client(&cache_client); +} diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c new file mode 100644 index 0000000..e28a494 --- /dev/null +++ b/drivers/infiniband/core/cm.c @@ -0,0 +1,3932 @@ +/* + * Copyright (c) 2004-2007 Intel Corporation. All rights reserved. + * Copyright (c) 2004 Topspin Corporation. All rights reserved. + * Copyright (c) 2004, 2005 Voltaire Corporation. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include "cm_msgs.h" + +MODULE_AUTHOR("Sean Hefty"); +MODULE_DESCRIPTION("InfiniBand CM"); +MODULE_LICENSE("Dual BSD/GPL"); + +static void cm_add_one(struct ib_device *device); +static void cm_remove_one(struct ib_device *device); + +static struct ib_client cm_client = { + .name = "cm", + .add = cm_add_one, + .remove = cm_remove_one +}; + +static struct ib_cm { + spinlock_t lock; + struct list_head device_list; + rwlock_t device_lock; + struct rb_root listen_service_table; + u64 listen_service_id; + /* struct rb_root peer_service_table; todo: fix peer to peer */ + struct rb_root remote_qp_table; + struct rb_root remote_id_table; + struct rb_root remote_sidr_table; + struct idr local_id_table; + __be32 random_id_operand; + struct list_head timewait_list; + struct workqueue_struct *wq; +} cm; + +/* Counter indexes ordered by attribute ID */ +enum { + CM_REQ_COUNTER, + CM_MRA_COUNTER, + CM_REJ_COUNTER, + CM_REP_COUNTER, + CM_RTU_COUNTER, + CM_DREQ_COUNTER, + CM_DREP_COUNTER, + CM_SIDR_REQ_COUNTER, + CM_SIDR_REP_COUNTER, + CM_LAP_COUNTER, + CM_APR_COUNTER, + CM_ATTR_COUNT, + CM_ATTR_ID_OFFSET = 0x0010, +}; + +enum { + CM_XMIT, + CM_XMIT_RETRIES, + CM_RECV, + CM_RECV_DUPLICATES, + CM_COUNTER_GROUPS +}; + +static char const counter_group_names[CM_COUNTER_GROUPS] + [sizeof("cm_rx_duplicates")] = { + "cm_tx_msgs", "cm_tx_retries", + "cm_rx_msgs", "cm_rx_duplicates" +}; + +struct cm_counter_group { + struct kobject obj; + atomic_long_t counter[CM_ATTR_COUNT]; +}; + +struct cm_counter_attribute { + struct attribute attr; + int index; +}; + +#define CM_COUNTER_ATTR(_name, _index) \ +struct cm_counter_attribute cm_##_name##_counter_attr = { \ + .attr = { .name = __stringify(_name), .mode = 0444 }, \ + .index = _index \ +} + +static CM_COUNTER_ATTR(req, CM_REQ_COUNTER); +static CM_COUNTER_ATTR(mra, CM_MRA_COUNTER); +static CM_COUNTER_ATTR(rej, CM_REJ_COUNTER); +static CM_COUNTER_ATTR(rep, CM_REP_COUNTER); +static CM_COUNTER_ATTR(rtu, CM_RTU_COUNTER); +static CM_COUNTER_ATTR(dreq, CM_DREQ_COUNTER); +static CM_COUNTER_ATTR(drep, CM_DREP_COUNTER); +static CM_COUNTER_ATTR(sidr_req, CM_SIDR_REQ_COUNTER); +static CM_COUNTER_ATTR(sidr_rep, CM_SIDR_REP_COUNTER); +static CM_COUNTER_ATTR(lap, CM_LAP_COUNTER); +static CM_COUNTER_ATTR(apr, CM_APR_COUNTER); + +static struct attribute *cm_counter_default_attrs[] = { + &cm_req_counter_attr.attr, + &cm_mra_counter_attr.attr, + &cm_rej_counter_attr.attr, + &cm_rep_counter_attr.attr, + &cm_rtu_counter_attr.attr, + &cm_dreq_counter_attr.attr, + &cm_drep_counter_attr.attr, + &cm_sidr_req_counter_attr.attr, + &cm_sidr_rep_counter_attr.attr, + &cm_lap_counter_attr.attr, + &cm_apr_counter_attr.attr, + NULL +}; + +struct cm_port { + struct cm_device *cm_dev; + struct ib_mad_agent *mad_agent; + struct kobject port_obj; + u8 port_num; + struct cm_counter_group counter_group[CM_COUNTER_GROUPS]; +}; + +struct cm_device { + struct list_head list; + struct ib_device *ib_device; + struct device *device; + u8 ack_delay; + struct cm_port *port[0]; +}; + +struct cm_av { + struct cm_port *port; + union ib_gid dgid; + struct ib_ah_attr ah_attr; + u16 pkey_index; + u8 timeout; + u8 valid; + u8 smac[ETH_ALEN]; +}; + +struct cm_work { + struct delayed_work work; + struct list_head list; + struct cm_port *port; + struct ib_mad_recv_wc *mad_recv_wc; /* Received MADs */ + __be32 local_id; /* Established / timewait */ + __be32 remote_id; + struct ib_cm_event cm_event; + struct ib_sa_path_rec path[0]; +}; + +struct cm_timewait_info { + struct cm_work work; /* Must be first. */ + struct list_head list; + struct rb_node remote_qp_node; + struct rb_node remote_id_node; + __be64 remote_ca_guid; + __be32 remote_qpn; + u8 inserted_remote_qp; + u8 inserted_remote_id; +}; + +struct cm_id_private { + struct ib_cm_id id; + + struct rb_node service_node; + struct rb_node sidr_id_node; + spinlock_t lock; /* Do not acquire inside cm.lock */ + struct completion comp; + atomic_t refcount; + + struct ib_mad_send_buf *msg; + struct cm_timewait_info *timewait_info; + /* todo: use alternate port on send failure */ + struct cm_av av; + struct cm_av alt_av; + struct ib_cm_compare_data *compare_data; + + void *private_data; + __be64 tid; + __be32 local_qpn; + __be32 remote_qpn; + enum ib_qp_type qp_type; + __be32 sq_psn; + __be32 rq_psn; + int timeout_ms; + enum ib_mtu path_mtu; + __be16 pkey; + u8 private_data_len; + u8 max_cm_retries; + u8 peer_to_peer; + u8 responder_resources; + u8 initiator_depth; + u8 retry_count; + u8 rnr_retry_count; + u8 service_timeout; + u8 target_ack_delay; + + struct list_head work_list; + atomic_t work_count; +}; + +static void cm_work_handler(struct work_struct *work); + +static inline void cm_deref_id(struct cm_id_private *cm_id_priv) +{ + if (atomic_dec_and_test(&cm_id_priv->refcount)) + complete(&cm_id_priv->comp); +} + +static int cm_alloc_msg(struct cm_id_private *cm_id_priv, + struct ib_mad_send_buf **msg) +{ + struct ib_mad_agent *mad_agent; + struct ib_mad_send_buf *m; + struct ib_ah *ah; + + mad_agent = cm_id_priv->av.port->mad_agent; + ah = ib_create_ah(mad_agent->qp->pd, &cm_id_priv->av.ah_attr); + if (IS_ERR(ah)) + return PTR_ERR(ah); + + m = ib_create_send_mad(mad_agent, cm_id_priv->id.remote_cm_qpn, + cm_id_priv->av.pkey_index, + 0, IB_MGMT_MAD_HDR, IB_MGMT_MAD_DATA, + GFP_ATOMIC); + if (IS_ERR(m)) { + ib_destroy_ah(ah); + return PTR_ERR(m); + } + + /* Timeout set by caller if response is expected. */ + m->ah = ah; + m->retries = cm_id_priv->max_cm_retries; + + atomic_inc(&cm_id_priv->refcount); + m->context[0] = cm_id_priv; + *msg = m; + return 0; +} + +static int cm_alloc_response_msg(struct cm_port *port, + struct ib_mad_recv_wc *mad_recv_wc, + struct ib_mad_send_buf **msg) +{ + struct ib_mad_send_buf *m; + struct ib_ah *ah; + + ah = ib_create_ah_from_wc(port->mad_agent->qp->pd, mad_recv_wc->wc, + mad_recv_wc->recv_buf.grh, port->port_num); + if (IS_ERR(ah)) + return PTR_ERR(ah); + + m = ib_create_send_mad(port->mad_agent, 1, mad_recv_wc->wc->pkey_index, + 0, IB_MGMT_MAD_HDR, IB_MGMT_MAD_DATA, + GFP_ATOMIC); + if (IS_ERR(m)) { + ib_destroy_ah(ah); + return PTR_ERR(m); + } + m->ah = ah; + *msg = m; + return 0; +} + +static void cm_free_msg(struct ib_mad_send_buf *msg) +{ + ib_destroy_ah(msg->ah); + if (msg->context[0]) + cm_deref_id(msg->context[0]); + ib_free_send_mad(msg); +} + +static void * cm_copy_private_data(const void *private_data, + u8 private_data_len) +{ + void *data; + + if (!private_data || !private_data_len) + return NULL; + + data = kmemdup(private_data, private_data_len, GFP_KERNEL); + if (!data) + return ERR_PTR(-ENOMEM); + + return data; +} + +static void cm_set_private_data(struct cm_id_private *cm_id_priv, + void *private_data, u8 private_data_len) +{ + if (cm_id_priv->private_data && cm_id_priv->private_data_len) + kfree(cm_id_priv->private_data); + + cm_id_priv->private_data = private_data; + cm_id_priv->private_data_len = private_data_len; +} + +static void cm_init_av_for_response(struct cm_port *port, struct ib_wc *wc, + struct ib_grh *grh, struct cm_av *av) +{ + av->port = port; + av->pkey_index = wc->pkey_index; + ib_init_ah_from_wc(port->cm_dev->ib_device, port->port_num, wc, + grh, &av->ah_attr); +} + +static int cm_init_av_by_path(struct ib_sa_path_rec *path, struct cm_av *av) +{ + struct cm_device *cm_dev; + struct cm_port *port = NULL; + unsigned long flags; + int ret; + u8 p; + + read_lock_irqsave(&cm.device_lock, flags); + list_for_each_entry(cm_dev, &cm.device_list, list) { + if (!ib_find_cached_gid(cm_dev->ib_device, &path->sgid, + &p, NULL)) { + port = cm_dev->port[p-1]; + break; + } + } + read_unlock_irqrestore(&cm.device_lock, flags); + + if (!port) + return -EINVAL; + + ret = ib_find_cached_pkey(cm_dev->ib_device, port->port_num, + be16_to_cpu(path->pkey), &av->pkey_index); + if (ret) + return ret; + + av->port = port; + ib_init_ah_from_path(cm_dev->ib_device, port->port_num, path, + &av->ah_attr); + av->timeout = path->packet_life_time + 1; + memcpy(av->smac, path->smac, sizeof(av->smac)); + + av->valid = 1; + return 0; +} + +static int cm_alloc_id(struct cm_id_private *cm_id_priv) +{ + unsigned long flags; + int id; + + idr_preload(GFP_KERNEL); + spin_lock_irqsave(&cm.lock, flags); + + id = idr_alloc_cyclic(&cm.local_id_table, cm_id_priv, 0, 0, GFP_NOWAIT); + + spin_unlock_irqrestore(&cm.lock, flags); + idr_preload_end(); + + cm_id_priv->id.local_id = (__force __be32)id ^ cm.random_id_operand; + return id < 0 ? id : 0; +} + +static void cm_free_id(__be32 local_id) +{ + spin_lock_irq(&cm.lock); + idr_remove(&cm.local_id_table, + (__force int) (local_id ^ cm.random_id_operand)); + spin_unlock_irq(&cm.lock); +} + +static struct cm_id_private * cm_get_id(__be32 local_id, __be32 remote_id) +{ + struct cm_id_private *cm_id_priv; + + cm_id_priv = idr_find(&cm.local_id_table, + (__force int) (local_id ^ cm.random_id_operand)); + if (cm_id_priv) { + if (cm_id_priv->id.remote_id == remote_id) + atomic_inc(&cm_id_priv->refcount); + else + cm_id_priv = NULL; + } + + return cm_id_priv; +} + +static struct cm_id_private * cm_acquire_id(__be32 local_id, __be32 remote_id) +{ + struct cm_id_private *cm_id_priv; + + spin_lock_irq(&cm.lock); + cm_id_priv = cm_get_id(local_id, remote_id); + spin_unlock_irq(&cm.lock); + + return cm_id_priv; +} + +static void cm_mask_copy(u8 *dst, u8 *src, u8 *mask) +{ + int i; + + for (i = 0; i < IB_CM_COMPARE_SIZE / sizeof(unsigned long); i++) + ((unsigned long *) dst)[i] = ((unsigned long *) src)[i] & + ((unsigned long *) mask)[i]; +} + +static int cm_compare_data(struct ib_cm_compare_data *src_data, + struct ib_cm_compare_data *dst_data) +{ + u8 src[IB_CM_COMPARE_SIZE]; + u8 dst[IB_CM_COMPARE_SIZE]; + + if (!src_data || !dst_data) + return 0; + + cm_mask_copy(src, src_data->data, dst_data->mask); + cm_mask_copy(dst, dst_data->data, src_data->mask); + return memcmp(src, dst, IB_CM_COMPARE_SIZE); +} + +static int cm_compare_private_data(u8 *private_data, + struct ib_cm_compare_data *dst_data) +{ + u8 src[IB_CM_COMPARE_SIZE]; + + if (!dst_data) + return 0; + + cm_mask_copy(src, private_data, dst_data->mask); + return memcmp(src, dst_data->data, IB_CM_COMPARE_SIZE); +} + +/* + * Trivial helpers to strip endian annotation and compare; the + * endianness doesn't actually matter since we just need a stable + * order for the RB tree. + */ +static int be32_lt(__be32 a, __be32 b) +{ + return (__force u32) a < (__force u32) b; +} + +static int be32_gt(__be32 a, __be32 b) +{ + return (__force u32) a > (__force u32) b; +} + +static int be64_lt(__be64 a, __be64 b) +{ + return (__force u64) a < (__force u64) b; +} + +static int be64_gt(__be64 a, __be64 b) +{ + return (__force u64) a > (__force u64) b; +} + +static struct cm_id_private * cm_insert_listen(struct cm_id_private *cm_id_priv) +{ + struct rb_node **link = &cm.listen_service_table.rb_node; + struct rb_node *parent = NULL; + struct cm_id_private *cur_cm_id_priv; + __be64 service_id = cm_id_priv->id.service_id; + __be64 service_mask = cm_id_priv->id.service_mask; + int data_cmp; + + while (*link) { + parent = *link; + cur_cm_id_priv = rb_entry(parent, struct cm_id_private, + service_node); + data_cmp = cm_compare_data(cm_id_priv->compare_data, + cur_cm_id_priv->compare_data); + if ((cur_cm_id_priv->id.service_mask & service_id) == + (service_mask & cur_cm_id_priv->id.service_id) && + (cm_id_priv->id.device == cur_cm_id_priv->id.device) && + !data_cmp) + return cur_cm_id_priv; + + if (cm_id_priv->id.device < cur_cm_id_priv->id.device) + link = &(*link)->rb_left; + else if (cm_id_priv->id.device > cur_cm_id_priv->id.device) + link = &(*link)->rb_right; + else if (be64_lt(service_id, cur_cm_id_priv->id.service_id)) + link = &(*link)->rb_left; + else if (be64_gt(service_id, cur_cm_id_priv->id.service_id)) + link = &(*link)->rb_right; + else if (data_cmp < 0) + link = &(*link)->rb_left; + else + link = &(*link)->rb_right; + } + rb_link_node(&cm_id_priv->service_node, parent, link); + rb_insert_color(&cm_id_priv->service_node, &cm.listen_service_table); + return NULL; +} + +static struct cm_id_private * cm_find_listen(struct ib_device *device, + __be64 service_id, + u8 *private_data) +{ + struct rb_node *node = cm.listen_service_table.rb_node; + struct cm_id_private *cm_id_priv; + int data_cmp; + + while (node) { + cm_id_priv = rb_entry(node, struct cm_id_private, service_node); + data_cmp = cm_compare_private_data(private_data, + cm_id_priv->compare_data); + if ((cm_id_priv->id.service_mask & service_id) == + cm_id_priv->id.service_id && + (cm_id_priv->id.device == device) && !data_cmp) + return cm_id_priv; + + if (device < cm_id_priv->id.device) + node = node->rb_left; + else if (device > cm_id_priv->id.device) + node = node->rb_right; + else if (be64_lt(service_id, cm_id_priv->id.service_id)) + node = node->rb_left; + else if (be64_gt(service_id, cm_id_priv->id.service_id)) + node = node->rb_right; + else if (data_cmp < 0) + node = node->rb_left; + else + node = node->rb_right; + } + return NULL; +} + +static struct cm_timewait_info * cm_insert_remote_id(struct cm_timewait_info + *timewait_info) +{ + struct rb_node **link = &cm.remote_id_table.rb_node; + struct rb_node *parent = NULL; + struct cm_timewait_info *cur_timewait_info; + __be64 remote_ca_guid = timewait_info->remote_ca_guid; + __be32 remote_id = timewait_info->work.remote_id; + + while (*link) { + parent = *link; + cur_timewait_info = rb_entry(parent, struct cm_timewait_info, + remote_id_node); + if (be32_lt(remote_id, cur_timewait_info->work.remote_id)) + link = &(*link)->rb_left; + else if (be32_gt(remote_id, cur_timewait_info->work.remote_id)) + link = &(*link)->rb_right; + else if (be64_lt(remote_ca_guid, cur_timewait_info->remote_ca_guid)) + link = &(*link)->rb_left; + else if (be64_gt(remote_ca_guid, cur_timewait_info->remote_ca_guid)) + link = &(*link)->rb_right; + else + return cur_timewait_info; + } + timewait_info->inserted_remote_id = 1; + rb_link_node(&timewait_info->remote_id_node, parent, link); + rb_insert_color(&timewait_info->remote_id_node, &cm.remote_id_table); + return NULL; +} + +static struct cm_timewait_info * cm_find_remote_id(__be64 remote_ca_guid, + __be32 remote_id) +{ + struct rb_node *node = cm.remote_id_table.rb_node; + struct cm_timewait_info *timewait_info; + + while (node) { + timewait_info = rb_entry(node, struct cm_timewait_info, + remote_id_node); + if (be32_lt(remote_id, timewait_info->work.remote_id)) + node = node->rb_left; + else if (be32_gt(remote_id, timewait_info->work.remote_id)) + node = node->rb_right; + else if (be64_lt(remote_ca_guid, timewait_info->remote_ca_guid)) + node = node->rb_left; + else if (be64_gt(remote_ca_guid, timewait_info->remote_ca_guid)) + node = node->rb_right; + else + return timewait_info; + } + return NULL; +} + +static struct cm_timewait_info * cm_insert_remote_qpn(struct cm_timewait_info + *timewait_info) +{ + struct rb_node **link = &cm.remote_qp_table.rb_node; + struct rb_node *parent = NULL; + struct cm_timewait_info *cur_timewait_info; + __be64 remote_ca_guid = timewait_info->remote_ca_guid; + __be32 remote_qpn = timewait_info->remote_qpn; + + while (*link) { + parent = *link; + cur_timewait_info = rb_entry(parent, struct cm_timewait_info, + remote_qp_node); + if (be32_lt(remote_qpn, cur_timewait_info->remote_qpn)) + link = &(*link)->rb_left; + else if (be32_gt(remote_qpn, cur_timewait_info->remote_qpn)) + link = &(*link)->rb_right; + else if (be64_lt(remote_ca_guid, cur_timewait_info->remote_ca_guid)) + link = &(*link)->rb_left; + else if (be64_gt(remote_ca_guid, cur_timewait_info->remote_ca_guid)) + link = &(*link)->rb_right; + else + return cur_timewait_info; + } + timewait_info->inserted_remote_qp = 1; + rb_link_node(&timewait_info->remote_qp_node, parent, link); + rb_insert_color(&timewait_info->remote_qp_node, &cm.remote_qp_table); + return NULL; +} + +static struct cm_id_private * cm_insert_remote_sidr(struct cm_id_private + *cm_id_priv) +{ + struct rb_node **link = &cm.remote_sidr_table.rb_node; + struct rb_node *parent = NULL; + struct cm_id_private *cur_cm_id_priv; + union ib_gid *port_gid = &cm_id_priv->av.dgid; + __be32 remote_id = cm_id_priv->id.remote_id; + + while (*link) { + parent = *link; + cur_cm_id_priv = rb_entry(parent, struct cm_id_private, + sidr_id_node); + if (be32_lt(remote_id, cur_cm_id_priv->id.remote_id)) + link = &(*link)->rb_left; + else if (be32_gt(remote_id, cur_cm_id_priv->id.remote_id)) + link = &(*link)->rb_right; + else { + int cmp; + cmp = memcmp(port_gid, &cur_cm_id_priv->av.dgid, + sizeof *port_gid); + if (cmp < 0) + link = &(*link)->rb_left; + else if (cmp > 0) + link = &(*link)->rb_right; + else + return cur_cm_id_priv; + } + } + rb_link_node(&cm_id_priv->sidr_id_node, parent, link); + rb_insert_color(&cm_id_priv->sidr_id_node, &cm.remote_sidr_table); + return NULL; +} + +static void cm_reject_sidr_req(struct cm_id_private *cm_id_priv, + enum ib_cm_sidr_status status) +{ + struct ib_cm_sidr_rep_param param; + + memset(¶m, 0, sizeof param); + param.status = status; + ib_send_cm_sidr_rep(&cm_id_priv->id, ¶m); +} + +struct ib_cm_id *ib_create_cm_id(struct ib_device *device, + ib_cm_handler cm_handler, + void *context) +{ + struct cm_id_private *cm_id_priv; + int ret; + + cm_id_priv = kzalloc(sizeof *cm_id_priv, GFP_KERNEL); + if (!cm_id_priv) + return ERR_PTR(-ENOMEM); + + cm_id_priv->id.state = IB_CM_IDLE; + cm_id_priv->id.device = device; + cm_id_priv->id.cm_handler = cm_handler; + cm_id_priv->id.context = context; + cm_id_priv->id.remote_cm_qpn = 1; + ret = cm_alloc_id(cm_id_priv); + if (ret) + goto error; + + spin_lock_init(&cm_id_priv->lock); + init_completion(&cm_id_priv->comp); + INIT_LIST_HEAD(&cm_id_priv->work_list); + atomic_set(&cm_id_priv->work_count, -1); + atomic_set(&cm_id_priv->refcount, 1); + return &cm_id_priv->id; + +error: + kfree(cm_id_priv); + return ERR_PTR(-ENOMEM); +} +EXPORT_SYMBOL(ib_create_cm_id); + +static struct cm_work * cm_dequeue_work(struct cm_id_private *cm_id_priv) +{ + struct cm_work *work; + + if (list_empty(&cm_id_priv->work_list)) + return NULL; + + work = list_entry(cm_id_priv->work_list.next, struct cm_work, list); + list_del(&work->list); + return work; +} + +static void cm_free_work(struct cm_work *work) +{ + if (work->mad_recv_wc) + ib_free_recv_mad(work->mad_recv_wc); + kfree(work); +} + +static inline int cm_convert_to_ms(int iba_time) +{ + /* approximate conversion to ms from 4.096us x 2^iba_time */ + return 1 << max(iba_time - 8, 0); +} + +/* + * calculate: 4.096x2^ack_timeout = 4.096x2^ack_delay + 2x4.096x2^life_time + * Because of how ack_timeout is stored, adding one doubles the timeout. + * To avoid large timeouts, select the max(ack_delay, life_time + 1), and + * increment it (round up) only if the other is within 50%. + */ +static u8 cm_ack_timeout(u8 ca_ack_delay, u8 packet_life_time) +{ + int ack_timeout = packet_life_time + 1; + + if (ack_timeout >= ca_ack_delay) + ack_timeout += (ca_ack_delay >= (ack_timeout - 1)); + else + ack_timeout = ca_ack_delay + + (ack_timeout >= (ca_ack_delay - 1)); + + return min(31, ack_timeout); +} + +static void cm_cleanup_timewait(struct cm_timewait_info *timewait_info) +{ + if (timewait_info->inserted_remote_id) { + rb_erase(&timewait_info->remote_id_node, &cm.remote_id_table); + timewait_info->inserted_remote_id = 0; + } + + if (timewait_info->inserted_remote_qp) { + rb_erase(&timewait_info->remote_qp_node, &cm.remote_qp_table); + timewait_info->inserted_remote_qp = 0; + } +} + +static struct cm_timewait_info * cm_create_timewait_info(__be32 local_id) +{ + struct cm_timewait_info *timewait_info; + + timewait_info = kzalloc(sizeof *timewait_info, GFP_KERNEL); + if (!timewait_info) + return ERR_PTR(-ENOMEM); + + timewait_info->work.local_id = local_id; + INIT_DELAYED_WORK(&timewait_info->work.work, cm_work_handler); + timewait_info->work.cm_event.event = IB_CM_TIMEWAIT_EXIT; + return timewait_info; +} + +static void cm_enter_timewait(struct cm_id_private *cm_id_priv) +{ + int wait_time; + unsigned long flags; + + spin_lock_irqsave(&cm.lock, flags); + cm_cleanup_timewait(cm_id_priv->timewait_info); + list_add_tail(&cm_id_priv->timewait_info->list, &cm.timewait_list); + spin_unlock_irqrestore(&cm.lock, flags); + + /* + * The cm_id could be destroyed by the user before we exit timewait. + * To protect against this, we search for the cm_id after exiting + * timewait before notifying the user that we've exited timewait. + */ + cm_id_priv->id.state = IB_CM_TIMEWAIT; + wait_time = cm_convert_to_ms(cm_id_priv->av.timeout); + queue_delayed_work(cm.wq, &cm_id_priv->timewait_info->work.work, + msecs_to_jiffies(wait_time)); + cm_id_priv->timewait_info = NULL; +} + +static void cm_reset_to_idle(struct cm_id_private *cm_id_priv) +{ + unsigned long flags; + + cm_id_priv->id.state = IB_CM_IDLE; + if (cm_id_priv->timewait_info) { + spin_lock_irqsave(&cm.lock, flags); + cm_cleanup_timewait(cm_id_priv->timewait_info); + spin_unlock_irqrestore(&cm.lock, flags); + kfree(cm_id_priv->timewait_info); + cm_id_priv->timewait_info = NULL; + } +} + +static void cm_destroy_id(struct ib_cm_id *cm_id, int err) +{ + struct cm_id_private *cm_id_priv; + struct cm_work *work; + + cm_id_priv = container_of(cm_id, struct cm_id_private, id); +retest: + spin_lock_irq(&cm_id_priv->lock); + switch (cm_id->state) { + case IB_CM_LISTEN: + cm_id->state = IB_CM_IDLE; + spin_unlock_irq(&cm_id_priv->lock); + spin_lock_irq(&cm.lock); + rb_erase(&cm_id_priv->service_node, &cm.listen_service_table); + spin_unlock_irq(&cm.lock); + break; + case IB_CM_SIDR_REQ_SENT: + cm_id->state = IB_CM_IDLE; + ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg); + spin_unlock_irq(&cm_id_priv->lock); + break; + case IB_CM_SIDR_REQ_RCVD: + spin_unlock_irq(&cm_id_priv->lock); + cm_reject_sidr_req(cm_id_priv, IB_SIDR_REJECT); + break; + case IB_CM_REQ_SENT: + ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg); + spin_unlock_irq(&cm_id_priv->lock); + ib_send_cm_rej(cm_id, IB_CM_REJ_TIMEOUT, + &cm_id_priv->id.device->node_guid, + sizeof cm_id_priv->id.device->node_guid, + NULL, 0); + break; + case IB_CM_REQ_RCVD: + if (err == -ENOMEM) { + /* Do not reject to allow future retries. */ + cm_reset_to_idle(cm_id_priv); + spin_unlock_irq(&cm_id_priv->lock); + } else { + spin_unlock_irq(&cm_id_priv->lock); + ib_send_cm_rej(cm_id, IB_CM_REJ_CONSUMER_DEFINED, + NULL, 0, NULL, 0); + } + break; + case IB_CM_MRA_REQ_RCVD: + case IB_CM_REP_SENT: + case IB_CM_MRA_REP_RCVD: + ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg); + /* Fall through */ + case IB_CM_MRA_REQ_SENT: + case IB_CM_REP_RCVD: + case IB_CM_MRA_REP_SENT: + spin_unlock_irq(&cm_id_priv->lock); + ib_send_cm_rej(cm_id, IB_CM_REJ_CONSUMER_DEFINED, + NULL, 0, NULL, 0); + break; + case IB_CM_ESTABLISHED: + spin_unlock_irq(&cm_id_priv->lock); + if (cm_id_priv->qp_type == IB_QPT_XRC_TGT) + break; + ib_send_cm_dreq(cm_id, NULL, 0); + goto retest; + case IB_CM_DREQ_SENT: + ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg); + cm_enter_timewait(cm_id_priv); + spin_unlock_irq(&cm_id_priv->lock); + break; + case IB_CM_DREQ_RCVD: + spin_unlock_irq(&cm_id_priv->lock); + ib_send_cm_drep(cm_id, NULL, 0); + break; + default: + spin_unlock_irq(&cm_id_priv->lock); + break; + } + + cm_free_id(cm_id->local_id); + cm_deref_id(cm_id_priv); + wait_for_completion(&cm_id_priv->comp); + while ((work = cm_dequeue_work(cm_id_priv)) != NULL) + cm_free_work(work); + kfree(cm_id_priv->compare_data); + kfree(cm_id_priv->private_data); + kfree(cm_id_priv); +} + +void ib_destroy_cm_id(struct ib_cm_id *cm_id) +{ + cm_destroy_id(cm_id, 0); +} +EXPORT_SYMBOL(ib_destroy_cm_id); + +int ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id, __be64 service_mask, + struct ib_cm_compare_data *compare_data) +{ + struct cm_id_private *cm_id_priv, *cur_cm_id_priv; + unsigned long flags; + int ret = 0; + + service_mask = service_mask ? service_mask : ~cpu_to_be64(0); + service_id &= service_mask; + if ((service_id & IB_SERVICE_ID_AGN_MASK) == IB_CM_ASSIGN_SERVICE_ID && + (service_id != IB_CM_ASSIGN_SERVICE_ID)) + return -EINVAL; + + cm_id_priv = container_of(cm_id, struct cm_id_private, id); + if (cm_id->state != IB_CM_IDLE) + return -EINVAL; + + if (compare_data) { + cm_id_priv->compare_data = kzalloc(sizeof *compare_data, + GFP_KERNEL); + if (!cm_id_priv->compare_data) + return -ENOMEM; + cm_mask_copy(cm_id_priv->compare_data->data, + compare_data->data, compare_data->mask); + memcpy(cm_id_priv->compare_data->mask, compare_data->mask, + IB_CM_COMPARE_SIZE); + } + + cm_id->state = IB_CM_LISTEN; + + spin_lock_irqsave(&cm.lock, flags); + if (service_id == IB_CM_ASSIGN_SERVICE_ID) { + cm_id->service_id = cpu_to_be64(cm.listen_service_id++); + cm_id->service_mask = ~cpu_to_be64(0); + } else { + cm_id->service_id = service_id; + cm_id->service_mask = service_mask; + } + cur_cm_id_priv = cm_insert_listen(cm_id_priv); + spin_unlock_irqrestore(&cm.lock, flags); + + if (cur_cm_id_priv) { + cm_id->state = IB_CM_IDLE; + kfree(cm_id_priv->compare_data); + cm_id_priv->compare_data = NULL; + ret = -EBUSY; + } + return ret; +} +EXPORT_SYMBOL(ib_cm_listen); + +static __be64 cm_form_tid(struct cm_id_private *cm_id_priv, + enum cm_msg_sequence msg_seq) +{ + u64 hi_tid, low_tid; + + hi_tid = ((u64) cm_id_priv->av.port->mad_agent->hi_tid) << 32; + low_tid = (u64) ((__force u32)cm_id_priv->id.local_id | + (msg_seq << 30)); + return cpu_to_be64(hi_tid | low_tid); +} + +static void cm_format_mad_hdr(struct ib_mad_hdr *hdr, + __be16 attr_id, __be64 tid) +{ + hdr->base_version = IB_MGMT_BASE_VERSION; + hdr->mgmt_class = IB_MGMT_CLASS_CM; + hdr->class_version = IB_CM_CLASS_VERSION; + hdr->method = IB_MGMT_METHOD_SEND; + hdr->attr_id = attr_id; + hdr->tid = tid; +} + +static void cm_format_req(struct cm_req_msg *req_msg, + struct cm_id_private *cm_id_priv, + struct ib_cm_req_param *param) +{ + struct ib_sa_path_rec *pri_path = param->primary_path; + struct ib_sa_path_rec *alt_path = param->alternate_path; + + cm_format_mad_hdr(&req_msg->hdr, CM_REQ_ATTR_ID, + cm_form_tid(cm_id_priv, CM_MSG_SEQUENCE_REQ)); + + req_msg->local_comm_id = cm_id_priv->id.local_id; + req_msg->service_id = param->service_id; + req_msg->local_ca_guid = cm_id_priv->id.device->node_guid; + cm_req_set_local_qpn(req_msg, cpu_to_be32(param->qp_num)); + cm_req_set_init_depth(req_msg, param->initiator_depth); + cm_req_set_remote_resp_timeout(req_msg, + param->remote_cm_response_timeout); + cm_req_set_qp_type(req_msg, param->qp_type); + cm_req_set_flow_ctrl(req_msg, param->flow_control); + cm_req_set_starting_psn(req_msg, cpu_to_be32(param->starting_psn)); + cm_req_set_local_resp_timeout(req_msg, + param->local_cm_response_timeout); + req_msg->pkey = param->primary_path->pkey; + cm_req_set_path_mtu(req_msg, param->primary_path->mtu); + cm_req_set_max_cm_retries(req_msg, param->max_cm_retries); + + if (param->qp_type != IB_QPT_XRC_INI) { + cm_req_set_resp_res(req_msg, param->responder_resources); + cm_req_set_retry_count(req_msg, param->retry_count); + cm_req_set_rnr_retry_count(req_msg, param->rnr_retry_count); + cm_req_set_srq(req_msg, param->srq); + } + + if (pri_path->hop_limit <= 1) { + req_msg->primary_local_lid = pri_path->slid; + req_msg->primary_remote_lid = pri_path->dlid; + } else { + /* Work-around until there's a way to obtain remote LID info */ + req_msg->primary_local_lid = IB_LID_PERMISSIVE; + req_msg->primary_remote_lid = IB_LID_PERMISSIVE; + } + req_msg->primary_local_gid = pri_path->sgid; + req_msg->primary_remote_gid = pri_path->dgid; + cm_req_set_primary_flow_label(req_msg, pri_path->flow_label); + cm_req_set_primary_packet_rate(req_msg, pri_path->rate); + req_msg->primary_traffic_class = pri_path->traffic_class; + req_msg->primary_hop_limit = pri_path->hop_limit; + cm_req_set_primary_sl(req_msg, pri_path->sl); + cm_req_set_primary_subnet_local(req_msg, (pri_path->hop_limit <= 1)); + cm_req_set_primary_local_ack_timeout(req_msg, + cm_ack_timeout(cm_id_priv->av.port->cm_dev->ack_delay, + pri_path->packet_life_time)); + + if (alt_path) { + if (alt_path->hop_limit <= 1) { + req_msg->alt_local_lid = alt_path->slid; + req_msg->alt_remote_lid = alt_path->dlid; + } else { + req_msg->alt_local_lid = IB_LID_PERMISSIVE; + req_msg->alt_remote_lid = IB_LID_PERMISSIVE; + } + req_msg->alt_local_gid = alt_path->sgid; + req_msg->alt_remote_gid = alt_path->dgid; + cm_req_set_alt_flow_label(req_msg, + alt_path->flow_label); + cm_req_set_alt_packet_rate(req_msg, alt_path->rate); + req_msg->alt_traffic_class = alt_path->traffic_class; + req_msg->alt_hop_limit = alt_path->hop_limit; + cm_req_set_alt_sl(req_msg, alt_path->sl); + cm_req_set_alt_subnet_local(req_msg, (alt_path->hop_limit <= 1)); + cm_req_set_alt_local_ack_timeout(req_msg, + cm_ack_timeout(cm_id_priv->av.port->cm_dev->ack_delay, + alt_path->packet_life_time)); + } + + if (param->private_data && param->private_data_len) + memcpy(req_msg->private_data, param->private_data, + param->private_data_len); +} + +static int cm_validate_req_param(struct ib_cm_req_param *param) +{ + /* peer-to-peer not supported */ + if (param->peer_to_peer) + return -EINVAL; + + if (!param->primary_path) + return -EINVAL; + + if (param->qp_type != IB_QPT_RC && param->qp_type != IB_QPT_UC && + param->qp_type != IB_QPT_XRC_INI) + return -EINVAL; + + if (param->private_data && + param->private_data_len > IB_CM_REQ_PRIVATE_DATA_SIZE) + return -EINVAL; + + if (param->alternate_path && + (param->alternate_path->pkey != param->primary_path->pkey || + param->alternate_path->mtu != param->primary_path->mtu)) + return -EINVAL; + + return 0; +} + +int ib_send_cm_req(struct ib_cm_id *cm_id, + struct ib_cm_req_param *param) +{ + struct cm_id_private *cm_id_priv; + struct cm_req_msg *req_msg; + unsigned long flags; + int ret; + + ret = cm_validate_req_param(param); + if (ret) + return ret; + + /* Verify that we're not in timewait. */ + cm_id_priv = container_of(cm_id, struct cm_id_private, id); + spin_lock_irqsave(&cm_id_priv->lock, flags); + if (cm_id->state != IB_CM_IDLE) { + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + ret = -EINVAL; + goto out; + } + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + + cm_id_priv->timewait_info = cm_create_timewait_info(cm_id_priv-> + id.local_id); + if (IS_ERR(cm_id_priv->timewait_info)) { + ret = PTR_ERR(cm_id_priv->timewait_info); + goto out; + } + + ret = cm_init_av_by_path(param->primary_path, &cm_id_priv->av); + if (ret) + goto error1; + if (param->alternate_path) { + ret = cm_init_av_by_path(param->alternate_path, + &cm_id_priv->alt_av); + if (ret) + goto error1; + } + cm_id->service_id = param->service_id; + cm_id->service_mask = ~cpu_to_be64(0); + cm_id_priv->timeout_ms = cm_convert_to_ms( + param->primary_path->packet_life_time) * 2 + + cm_convert_to_ms( + param->remote_cm_response_timeout); + cm_id_priv->max_cm_retries = param->max_cm_retries; + cm_id_priv->initiator_depth = param->initiator_depth; + cm_id_priv->responder_resources = param->responder_resources; + cm_id_priv->retry_count = param->retry_count; + cm_id_priv->path_mtu = param->primary_path->mtu; + cm_id_priv->pkey = param->primary_path->pkey; + cm_id_priv->qp_type = param->qp_type; + + ret = cm_alloc_msg(cm_id_priv, &cm_id_priv->msg); + if (ret) + goto error1; + + req_msg = (struct cm_req_msg *) cm_id_priv->msg->mad; + cm_format_req(req_msg, cm_id_priv, param); + cm_id_priv->tid = req_msg->hdr.tid; + cm_id_priv->msg->timeout_ms = cm_id_priv->timeout_ms; + cm_id_priv->msg->context[1] = (void *) (unsigned long) IB_CM_REQ_SENT; + + cm_id_priv->local_qpn = cm_req_get_local_qpn(req_msg); + cm_id_priv->rq_psn = cm_req_get_starting_psn(req_msg); + + spin_lock_irqsave(&cm_id_priv->lock, flags); + ret = ib_post_send_mad(cm_id_priv->msg, NULL); + if (ret) { + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + goto error2; + } + BUG_ON(cm_id->state != IB_CM_IDLE); + cm_id->state = IB_CM_REQ_SENT; + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + return 0; + +error2: cm_free_msg(cm_id_priv->msg); +error1: kfree(cm_id_priv->timewait_info); +out: return ret; +} +EXPORT_SYMBOL(ib_send_cm_req); + +static int cm_issue_rej(struct cm_port *port, + struct ib_mad_recv_wc *mad_recv_wc, + enum ib_cm_rej_reason reason, + enum cm_msg_response msg_rejected, + void *ari, u8 ari_length) +{ + struct ib_mad_send_buf *msg = NULL; + struct cm_rej_msg *rej_msg, *rcv_msg; + int ret; + + ret = cm_alloc_response_msg(port, mad_recv_wc, &msg); + if (ret) + return ret; + + /* We just need common CM header information. Cast to any message. */ + rcv_msg = (struct cm_rej_msg *) mad_recv_wc->recv_buf.mad; + rej_msg = (struct cm_rej_msg *) msg->mad; + + cm_format_mad_hdr(&rej_msg->hdr, CM_REJ_ATTR_ID, rcv_msg->hdr.tid); + rej_msg->remote_comm_id = rcv_msg->local_comm_id; + rej_msg->local_comm_id = rcv_msg->remote_comm_id; + cm_rej_set_msg_rejected(rej_msg, msg_rejected); + rej_msg->reason = cpu_to_be16(reason); + + if (ari && ari_length) { + cm_rej_set_reject_info_len(rej_msg, ari_length); + memcpy(rej_msg->ari, ari, ari_length); + } + + ret = ib_post_send_mad(msg, NULL); + if (ret) + cm_free_msg(msg); + + return ret; +} + +static inline int cm_is_active_peer(__be64 local_ca_guid, __be64 remote_ca_guid, + __be32 local_qpn, __be32 remote_qpn) +{ + return (be64_to_cpu(local_ca_guid) > be64_to_cpu(remote_ca_guid) || + ((local_ca_guid == remote_ca_guid) && + (be32_to_cpu(local_qpn) > be32_to_cpu(remote_qpn)))); +} + +static void cm_format_paths_from_req(struct cm_req_msg *req_msg, + struct ib_sa_path_rec *primary_path, + struct ib_sa_path_rec *alt_path) +{ + memset(primary_path, 0, sizeof *primary_path); + primary_path->dgid = req_msg->primary_local_gid; + primary_path->sgid = req_msg->primary_remote_gid; + primary_path->dlid = req_msg->primary_local_lid; + primary_path->slid = req_msg->primary_remote_lid; + primary_path->flow_label = cm_req_get_primary_flow_label(req_msg); + primary_path->hop_limit = req_msg->primary_hop_limit; + primary_path->traffic_class = req_msg->primary_traffic_class; + primary_path->reversible = 1; + primary_path->pkey = req_msg->pkey; + primary_path->sl = cm_req_get_primary_sl(req_msg); + primary_path->mtu_selector = IB_SA_EQ; + primary_path->mtu = cm_req_get_path_mtu(req_msg); + primary_path->rate_selector = IB_SA_EQ; + primary_path->rate = cm_req_get_primary_packet_rate(req_msg); + primary_path->packet_life_time_selector = IB_SA_EQ; + primary_path->packet_life_time = + cm_req_get_primary_local_ack_timeout(req_msg); + primary_path->packet_life_time -= (primary_path->packet_life_time > 0); + + if (req_msg->alt_local_lid) { + memset(alt_path, 0, sizeof *alt_path); + alt_path->dgid = req_msg->alt_local_gid; + alt_path->sgid = req_msg->alt_remote_gid; + alt_path->dlid = req_msg->alt_local_lid; + alt_path->slid = req_msg->alt_remote_lid; + alt_path->flow_label = cm_req_get_alt_flow_label(req_msg); + alt_path->hop_limit = req_msg->alt_hop_limit; + alt_path->traffic_class = req_msg->alt_traffic_class; + alt_path->reversible = 1; + alt_path->pkey = req_msg->pkey; + alt_path->sl = cm_req_get_alt_sl(req_msg); + alt_path->mtu_selector = IB_SA_EQ; + alt_path->mtu = cm_req_get_path_mtu(req_msg); + alt_path->rate_selector = IB_SA_EQ; + alt_path->rate = cm_req_get_alt_packet_rate(req_msg); + alt_path->packet_life_time_selector = IB_SA_EQ; + alt_path->packet_life_time = + cm_req_get_alt_local_ack_timeout(req_msg); + alt_path->packet_life_time -= (alt_path->packet_life_time > 0); + } +} + +static void cm_format_req_event(struct cm_work *work, + struct cm_id_private *cm_id_priv, + struct ib_cm_id *listen_id) +{ + struct cm_req_msg *req_msg; + struct ib_cm_req_event_param *param; + + req_msg = (struct cm_req_msg *)work->mad_recv_wc->recv_buf.mad; + param = &work->cm_event.param.req_rcvd; + param->listen_id = listen_id; + param->port = cm_id_priv->av.port->port_num; + param->primary_path = &work->path[0]; + if (req_msg->alt_local_lid) + param->alternate_path = &work->path[1]; + else + param->alternate_path = NULL; + param->remote_ca_guid = req_msg->local_ca_guid; + param->remote_qkey = be32_to_cpu(req_msg->local_qkey); + param->remote_qpn = be32_to_cpu(cm_req_get_local_qpn(req_msg)); + param->qp_type = cm_req_get_qp_type(req_msg); + param->starting_psn = be32_to_cpu(cm_req_get_starting_psn(req_msg)); + param->responder_resources = cm_req_get_init_depth(req_msg); + param->initiator_depth = cm_req_get_resp_res(req_msg); + param->local_cm_response_timeout = + cm_req_get_remote_resp_timeout(req_msg); + param->flow_control = cm_req_get_flow_ctrl(req_msg); + param->remote_cm_response_timeout = + cm_req_get_local_resp_timeout(req_msg); + param->retry_count = cm_req_get_retry_count(req_msg); + param->rnr_retry_count = cm_req_get_rnr_retry_count(req_msg); + param->srq = cm_req_get_srq(req_msg); + work->cm_event.private_data = &req_msg->private_data; +} + +static void cm_process_work(struct cm_id_private *cm_id_priv, + struct cm_work *work) +{ + int ret; + + /* We will typically only have the current event to report. */ + ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, &work->cm_event); + cm_free_work(work); + + while (!ret && !atomic_add_negative(-1, &cm_id_priv->work_count)) { + spin_lock_irq(&cm_id_priv->lock); + work = cm_dequeue_work(cm_id_priv); + spin_unlock_irq(&cm_id_priv->lock); + BUG_ON(!work); + ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, + &work->cm_event); + cm_free_work(work); + } + cm_deref_id(cm_id_priv); + if (ret) + cm_destroy_id(&cm_id_priv->id, ret); +} + +static void cm_format_mra(struct cm_mra_msg *mra_msg, + struct cm_id_private *cm_id_priv, + enum cm_msg_response msg_mraed, u8 service_timeout, + const void *private_data, u8 private_data_len) +{ + cm_format_mad_hdr(&mra_msg->hdr, CM_MRA_ATTR_ID, cm_id_priv->tid); + cm_mra_set_msg_mraed(mra_msg, msg_mraed); + mra_msg->local_comm_id = cm_id_priv->id.local_id; + mra_msg->remote_comm_id = cm_id_priv->id.remote_id; + cm_mra_set_service_timeout(mra_msg, service_timeout); + + if (private_data && private_data_len) + memcpy(mra_msg->private_data, private_data, private_data_len); +} + +static void cm_format_rej(struct cm_rej_msg *rej_msg, + struct cm_id_private *cm_id_priv, + enum ib_cm_rej_reason reason, + void *ari, + u8 ari_length, + const void *private_data, + u8 private_data_len) +{ + cm_format_mad_hdr(&rej_msg->hdr, CM_REJ_ATTR_ID, cm_id_priv->tid); + rej_msg->remote_comm_id = cm_id_priv->id.remote_id; + + switch(cm_id_priv->id.state) { + case IB_CM_REQ_RCVD: + rej_msg->local_comm_id = 0; + cm_rej_set_msg_rejected(rej_msg, CM_MSG_RESPONSE_REQ); + break; + case IB_CM_MRA_REQ_SENT: + rej_msg->local_comm_id = cm_id_priv->id.local_id; + cm_rej_set_msg_rejected(rej_msg, CM_MSG_RESPONSE_REQ); + break; + case IB_CM_REP_RCVD: + case IB_CM_MRA_REP_SENT: + rej_msg->local_comm_id = cm_id_priv->id.local_id; + cm_rej_set_msg_rejected(rej_msg, CM_MSG_RESPONSE_REP); + break; + default: + rej_msg->local_comm_id = cm_id_priv->id.local_id; + cm_rej_set_msg_rejected(rej_msg, CM_MSG_RESPONSE_OTHER); + break; + } + + rej_msg->reason = cpu_to_be16(reason); + if (ari && ari_length) { + cm_rej_set_reject_info_len(rej_msg, ari_length); + memcpy(rej_msg->ari, ari, ari_length); + } + + if (private_data && private_data_len) + memcpy(rej_msg->private_data, private_data, private_data_len); +} + +static void cm_dup_req_handler(struct cm_work *work, + struct cm_id_private *cm_id_priv) +{ + struct ib_mad_send_buf *msg = NULL; + int ret; + + atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES]. + counter[CM_REQ_COUNTER]); + + /* Quick state check to discard duplicate REQs. */ + if (cm_id_priv->id.state == IB_CM_REQ_RCVD) + return; + + ret = cm_alloc_response_msg(work->port, work->mad_recv_wc, &msg); + if (ret) + return; + + spin_lock_irq(&cm_id_priv->lock); + switch (cm_id_priv->id.state) { + case IB_CM_MRA_REQ_SENT: + cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv, + CM_MSG_RESPONSE_REQ, cm_id_priv->service_timeout, + cm_id_priv->private_data, + cm_id_priv->private_data_len); + break; + case IB_CM_TIMEWAIT: + cm_format_rej((struct cm_rej_msg *) msg->mad, cm_id_priv, + IB_CM_REJ_STALE_CONN, NULL, 0, NULL, 0); + break; + default: + goto unlock; + } + spin_unlock_irq(&cm_id_priv->lock); + + ret = ib_post_send_mad(msg, NULL); + if (ret) + goto free; + return; + +unlock: spin_unlock_irq(&cm_id_priv->lock); +free: cm_free_msg(msg); +} + +static struct cm_id_private * cm_match_req(struct cm_work *work, + struct cm_id_private *cm_id_priv) +{ + struct cm_id_private *listen_cm_id_priv, *cur_cm_id_priv; + struct cm_timewait_info *timewait_info; + struct cm_req_msg *req_msg; + + req_msg = (struct cm_req_msg *)work->mad_recv_wc->recv_buf.mad; + + /* Check for possible duplicate REQ. */ + spin_lock_irq(&cm.lock); + timewait_info = cm_insert_remote_id(cm_id_priv->timewait_info); + if (timewait_info) { + cur_cm_id_priv = cm_get_id(timewait_info->work.local_id, + timewait_info->work.remote_id); + spin_unlock_irq(&cm.lock); + if (cur_cm_id_priv) { + cm_dup_req_handler(work, cur_cm_id_priv); + cm_deref_id(cur_cm_id_priv); + } + return NULL; + } + + /* Check for stale connections. */ + timewait_info = cm_insert_remote_qpn(cm_id_priv->timewait_info); + if (timewait_info) { + cm_cleanup_timewait(cm_id_priv->timewait_info); + spin_unlock_irq(&cm.lock); + cm_issue_rej(work->port, work->mad_recv_wc, + IB_CM_REJ_STALE_CONN, CM_MSG_RESPONSE_REQ, + NULL, 0); + return NULL; + } + + /* Find matching listen request. */ + listen_cm_id_priv = cm_find_listen(cm_id_priv->id.device, + req_msg->service_id, + req_msg->private_data); + if (!listen_cm_id_priv) { + cm_cleanup_timewait(cm_id_priv->timewait_info); + spin_unlock_irq(&cm.lock); + cm_issue_rej(work->port, work->mad_recv_wc, + IB_CM_REJ_INVALID_SERVICE_ID, CM_MSG_RESPONSE_REQ, + NULL, 0); + goto out; + } + atomic_inc(&listen_cm_id_priv->refcount); + atomic_inc(&cm_id_priv->refcount); + cm_id_priv->id.state = IB_CM_REQ_RCVD; + atomic_inc(&cm_id_priv->work_count); + spin_unlock_irq(&cm.lock); +out: + return listen_cm_id_priv; +} + +/* + * Work-around for inter-subnet connections. If the LIDs are permissive, + * we need to override the LID/SL data in the REQ with the LID information + * in the work completion. + */ +static void cm_process_routed_req(struct cm_req_msg *req_msg, struct ib_wc *wc) +{ + if (!cm_req_get_primary_subnet_local(req_msg)) { + if (req_msg->primary_local_lid == IB_LID_PERMISSIVE) { + req_msg->primary_local_lid = cpu_to_be16(wc->slid); + cm_req_set_primary_sl(req_msg, wc->sl); + } + + if (req_msg->primary_remote_lid == IB_LID_PERMISSIVE) + req_msg->primary_remote_lid = cpu_to_be16(wc->dlid_path_bits); + } + + if (!cm_req_get_alt_subnet_local(req_msg)) { + if (req_msg->alt_local_lid == IB_LID_PERMISSIVE) { + req_msg->alt_local_lid = cpu_to_be16(wc->slid); + cm_req_set_alt_sl(req_msg, wc->sl); + } + + if (req_msg->alt_remote_lid == IB_LID_PERMISSIVE) + req_msg->alt_remote_lid = cpu_to_be16(wc->dlid_path_bits); + } +} + +static int cm_req_handler(struct cm_work *work) +{ + struct ib_cm_id *cm_id; + struct cm_id_private *cm_id_priv, *listen_cm_id_priv; + struct cm_req_msg *req_msg; + int ret; + + req_msg = (struct cm_req_msg *)work->mad_recv_wc->recv_buf.mad; + + cm_id = ib_create_cm_id(work->port->cm_dev->ib_device, NULL, NULL); + if (IS_ERR(cm_id)) + return PTR_ERR(cm_id); + + cm_id_priv = container_of(cm_id, struct cm_id_private, id); + cm_id_priv->id.remote_id = req_msg->local_comm_id; + cm_init_av_for_response(work->port, work->mad_recv_wc->wc, + work->mad_recv_wc->recv_buf.grh, + &cm_id_priv->av); + cm_id_priv->timewait_info = cm_create_timewait_info(cm_id_priv-> + id.local_id); + if (IS_ERR(cm_id_priv->timewait_info)) { + ret = PTR_ERR(cm_id_priv->timewait_info); + goto destroy; + } + cm_id_priv->timewait_info->work.remote_id = req_msg->local_comm_id; + cm_id_priv->timewait_info->remote_ca_guid = req_msg->local_ca_guid; + cm_id_priv->timewait_info->remote_qpn = cm_req_get_local_qpn(req_msg); + + listen_cm_id_priv = cm_match_req(work, cm_id_priv); + if (!listen_cm_id_priv) { + ret = -EINVAL; + kfree(cm_id_priv->timewait_info); + goto destroy; + } + + cm_id_priv->id.cm_handler = listen_cm_id_priv->id.cm_handler; + cm_id_priv->id.context = listen_cm_id_priv->id.context; + cm_id_priv->id.service_id = req_msg->service_id; + cm_id_priv->id.service_mask = ~cpu_to_be64(0); + + cm_process_routed_req(req_msg, work->mad_recv_wc->wc); + cm_format_paths_from_req(req_msg, &work->path[0], &work->path[1]); + + memcpy(work->path[0].dmac, cm_id_priv->av.ah_attr.dmac, ETH_ALEN); + work->path[0].vlan_id = cm_id_priv->av.ah_attr.vlan_id; + ret = cm_init_av_by_path(&work->path[0], &cm_id_priv->av); + if (ret) { + ib_get_cached_gid(work->port->cm_dev->ib_device, + work->port->port_num, 0, &work->path[0].sgid); + ib_send_cm_rej(cm_id, IB_CM_REJ_INVALID_GID, + &work->path[0].sgid, sizeof work->path[0].sgid, + NULL, 0); + goto rejected; + } + if (req_msg->alt_local_lid) { + ret = cm_init_av_by_path(&work->path[1], &cm_id_priv->alt_av); + if (ret) { + ib_send_cm_rej(cm_id, IB_CM_REJ_INVALID_ALT_GID, + &work->path[0].sgid, + sizeof work->path[0].sgid, NULL, 0); + goto rejected; + } + } + cm_id_priv->tid = req_msg->hdr.tid; + cm_id_priv->timeout_ms = cm_convert_to_ms( + cm_req_get_local_resp_timeout(req_msg)); + cm_id_priv->max_cm_retries = cm_req_get_max_cm_retries(req_msg); + cm_id_priv->remote_qpn = cm_req_get_local_qpn(req_msg); + cm_id_priv->initiator_depth = cm_req_get_resp_res(req_msg); + cm_id_priv->responder_resources = cm_req_get_init_depth(req_msg); + cm_id_priv->path_mtu = cm_req_get_path_mtu(req_msg); + cm_id_priv->pkey = req_msg->pkey; + cm_id_priv->sq_psn = cm_req_get_starting_psn(req_msg); + cm_id_priv->retry_count = cm_req_get_retry_count(req_msg); + cm_id_priv->rnr_retry_count = cm_req_get_rnr_retry_count(req_msg); + cm_id_priv->qp_type = cm_req_get_qp_type(req_msg); + + cm_format_req_event(work, cm_id_priv, &listen_cm_id_priv->id); + cm_process_work(cm_id_priv, work); + cm_deref_id(listen_cm_id_priv); + return 0; + +rejected: + atomic_dec(&cm_id_priv->refcount); + cm_deref_id(listen_cm_id_priv); +destroy: + ib_destroy_cm_id(cm_id); + return ret; +} + +static void cm_format_rep(struct cm_rep_msg *rep_msg, + struct cm_id_private *cm_id_priv, + struct ib_cm_rep_param *param) +{ + cm_format_mad_hdr(&rep_msg->hdr, CM_REP_ATTR_ID, cm_id_priv->tid); + rep_msg->local_comm_id = cm_id_priv->id.local_id; + rep_msg->remote_comm_id = cm_id_priv->id.remote_id; + cm_rep_set_starting_psn(rep_msg, cpu_to_be32(param->starting_psn)); + rep_msg->resp_resources = param->responder_resources; + cm_rep_set_target_ack_delay(rep_msg, + cm_id_priv->av.port->cm_dev->ack_delay); + cm_rep_set_failover(rep_msg, param->failover_accepted); + cm_rep_set_rnr_retry_count(rep_msg, param->rnr_retry_count); + rep_msg->local_ca_guid = cm_id_priv->id.device->node_guid; + + if (cm_id_priv->qp_type != IB_QPT_XRC_TGT) { + rep_msg->initiator_depth = param->initiator_depth; + cm_rep_set_flow_ctrl(rep_msg, param->flow_control); + cm_rep_set_srq(rep_msg, param->srq); + cm_rep_set_local_qpn(rep_msg, cpu_to_be32(param->qp_num)); + } else { + cm_rep_set_srq(rep_msg, 1); + cm_rep_set_local_eecn(rep_msg, cpu_to_be32(param->qp_num)); + } + + if (param->private_data && param->private_data_len) + memcpy(rep_msg->private_data, param->private_data, + param->private_data_len); +} + +int ib_send_cm_rep(struct ib_cm_id *cm_id, + struct ib_cm_rep_param *param) +{ + struct cm_id_private *cm_id_priv; + struct ib_mad_send_buf *msg; + struct cm_rep_msg *rep_msg; + unsigned long flags; + int ret; + + if (param->private_data && + param->private_data_len > IB_CM_REP_PRIVATE_DATA_SIZE) + return -EINVAL; + + cm_id_priv = container_of(cm_id, struct cm_id_private, id); + spin_lock_irqsave(&cm_id_priv->lock, flags); + if (cm_id->state != IB_CM_REQ_RCVD && + cm_id->state != IB_CM_MRA_REQ_SENT) { + ret = -EINVAL; + goto out; + } + + ret = cm_alloc_msg(cm_id_priv, &msg); + if (ret) + goto out; + + rep_msg = (struct cm_rep_msg *) msg->mad; + cm_format_rep(rep_msg, cm_id_priv, param); + msg->timeout_ms = cm_id_priv->timeout_ms; + msg->context[1] = (void *) (unsigned long) IB_CM_REP_SENT; + + ret = ib_post_send_mad(msg, NULL); + if (ret) { + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + cm_free_msg(msg); + return ret; + } + + cm_id->state = IB_CM_REP_SENT; + cm_id_priv->msg = msg; + cm_id_priv->initiator_depth = param->initiator_depth; + cm_id_priv->responder_resources = param->responder_resources; + cm_id_priv->rq_psn = cm_rep_get_starting_psn(rep_msg); + cm_id_priv->local_qpn = cpu_to_be32(param->qp_num & 0xFFFFFF); + +out: spin_unlock_irqrestore(&cm_id_priv->lock, flags); + return ret; +} +EXPORT_SYMBOL(ib_send_cm_rep); + +static void cm_format_rtu(struct cm_rtu_msg *rtu_msg, + struct cm_id_private *cm_id_priv, + const void *private_data, + u8 private_data_len) +{ + cm_format_mad_hdr(&rtu_msg->hdr, CM_RTU_ATTR_ID, cm_id_priv->tid); + rtu_msg->local_comm_id = cm_id_priv->id.local_id; + rtu_msg->remote_comm_id = cm_id_priv->id.remote_id; + + if (private_data && private_data_len) + memcpy(rtu_msg->private_data, private_data, private_data_len); +} + +int ib_send_cm_rtu(struct ib_cm_id *cm_id, + const void *private_data, + u8 private_data_len) +{ + struct cm_id_private *cm_id_priv; + struct ib_mad_send_buf *msg; + unsigned long flags; + void *data; + int ret; + + if (private_data && private_data_len > IB_CM_RTU_PRIVATE_DATA_SIZE) + return -EINVAL; + + data = cm_copy_private_data(private_data, private_data_len); + if (IS_ERR(data)) + return PTR_ERR(data); + + cm_id_priv = container_of(cm_id, struct cm_id_private, id); + spin_lock_irqsave(&cm_id_priv->lock, flags); + if (cm_id->state != IB_CM_REP_RCVD && + cm_id->state != IB_CM_MRA_REP_SENT) { + ret = -EINVAL; + goto error; + } + + ret = cm_alloc_msg(cm_id_priv, &msg); + if (ret) + goto error; + + cm_format_rtu((struct cm_rtu_msg *) msg->mad, cm_id_priv, + private_data, private_data_len); + + ret = ib_post_send_mad(msg, NULL); + if (ret) { + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + cm_free_msg(msg); + kfree(data); + return ret; + } + + cm_id->state = IB_CM_ESTABLISHED; + cm_set_private_data(cm_id_priv, data, private_data_len); + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + return 0; + +error: spin_unlock_irqrestore(&cm_id_priv->lock, flags); + kfree(data); + return ret; +} +EXPORT_SYMBOL(ib_send_cm_rtu); + +static void cm_format_rep_event(struct cm_work *work, enum ib_qp_type qp_type) +{ + struct cm_rep_msg *rep_msg; + struct ib_cm_rep_event_param *param; + + rep_msg = (struct cm_rep_msg *)work->mad_recv_wc->recv_buf.mad; + param = &work->cm_event.param.rep_rcvd; + param->remote_ca_guid = rep_msg->local_ca_guid; + param->remote_qkey = be32_to_cpu(rep_msg->local_qkey); + param->remote_qpn = be32_to_cpu(cm_rep_get_qpn(rep_msg, qp_type)); + param->starting_psn = be32_to_cpu(cm_rep_get_starting_psn(rep_msg)); + param->responder_resources = rep_msg->initiator_depth; + param->initiator_depth = rep_msg->resp_resources; + param->target_ack_delay = cm_rep_get_target_ack_delay(rep_msg); + param->failover_accepted = cm_rep_get_failover(rep_msg); + param->flow_control = cm_rep_get_flow_ctrl(rep_msg); + param->rnr_retry_count = cm_rep_get_rnr_retry_count(rep_msg); + param->srq = cm_rep_get_srq(rep_msg); + work->cm_event.private_data = &rep_msg->private_data; +} + +static void cm_dup_rep_handler(struct cm_work *work) +{ + struct cm_id_private *cm_id_priv; + struct cm_rep_msg *rep_msg; + struct ib_mad_send_buf *msg = NULL; + int ret; + + rep_msg = (struct cm_rep_msg *) work->mad_recv_wc->recv_buf.mad; + cm_id_priv = cm_acquire_id(rep_msg->remote_comm_id, + rep_msg->local_comm_id); + if (!cm_id_priv) + return; + + atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES]. + counter[CM_REP_COUNTER]); + ret = cm_alloc_response_msg(work->port, work->mad_recv_wc, &msg); + if (ret) + goto deref; + + spin_lock_irq(&cm_id_priv->lock); + if (cm_id_priv->id.state == IB_CM_ESTABLISHED) + cm_format_rtu((struct cm_rtu_msg *) msg->mad, cm_id_priv, + cm_id_priv->private_data, + cm_id_priv->private_data_len); + else if (cm_id_priv->id.state == IB_CM_MRA_REP_SENT) + cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv, + CM_MSG_RESPONSE_REP, cm_id_priv->service_timeout, + cm_id_priv->private_data, + cm_id_priv->private_data_len); + else + goto unlock; + spin_unlock_irq(&cm_id_priv->lock); + + ret = ib_post_send_mad(msg, NULL); + if (ret) + goto free; + goto deref; + +unlock: spin_unlock_irq(&cm_id_priv->lock); +free: cm_free_msg(msg); +deref: cm_deref_id(cm_id_priv); +} + +static int cm_rep_handler(struct cm_work *work) +{ + struct cm_id_private *cm_id_priv; + struct cm_rep_msg *rep_msg; + int ret; + + rep_msg = (struct cm_rep_msg *)work->mad_recv_wc->recv_buf.mad; + cm_id_priv = cm_acquire_id(rep_msg->remote_comm_id, 0); + if (!cm_id_priv) { + cm_dup_rep_handler(work); + return -EINVAL; + } + + cm_format_rep_event(work, cm_id_priv->qp_type); + + spin_lock_irq(&cm_id_priv->lock); + switch (cm_id_priv->id.state) { + case IB_CM_REQ_SENT: + case IB_CM_MRA_REQ_RCVD: + break; + default: + spin_unlock_irq(&cm_id_priv->lock); + ret = -EINVAL; + goto error; + } + + cm_id_priv->timewait_info->work.remote_id = rep_msg->local_comm_id; + cm_id_priv->timewait_info->remote_ca_guid = rep_msg->local_ca_guid; + cm_id_priv->timewait_info->remote_qpn = cm_rep_get_qpn(rep_msg, cm_id_priv->qp_type); + + spin_lock(&cm.lock); + /* Check for duplicate REP. */ + if (cm_insert_remote_id(cm_id_priv->timewait_info)) { + spin_unlock(&cm.lock); + spin_unlock_irq(&cm_id_priv->lock); + ret = -EINVAL; + goto error; + } + /* Check for a stale connection. */ + if (cm_insert_remote_qpn(cm_id_priv->timewait_info)) { + rb_erase(&cm_id_priv->timewait_info->remote_id_node, + &cm.remote_id_table); + cm_id_priv->timewait_info->inserted_remote_id = 0; + spin_unlock(&cm.lock); + spin_unlock_irq(&cm_id_priv->lock); + cm_issue_rej(work->port, work->mad_recv_wc, + IB_CM_REJ_STALE_CONN, CM_MSG_RESPONSE_REP, + NULL, 0); + ret = -EINVAL; + goto error; + } + spin_unlock(&cm.lock); + + cm_id_priv->id.state = IB_CM_REP_RCVD; + cm_id_priv->id.remote_id = rep_msg->local_comm_id; + cm_id_priv->remote_qpn = cm_rep_get_qpn(rep_msg, cm_id_priv->qp_type); + cm_id_priv->initiator_depth = rep_msg->resp_resources; + cm_id_priv->responder_resources = rep_msg->initiator_depth; + cm_id_priv->sq_psn = cm_rep_get_starting_psn(rep_msg); + cm_id_priv->rnr_retry_count = cm_rep_get_rnr_retry_count(rep_msg); + cm_id_priv->target_ack_delay = cm_rep_get_target_ack_delay(rep_msg); + cm_id_priv->av.timeout = + cm_ack_timeout(cm_id_priv->target_ack_delay, + cm_id_priv->av.timeout - 1); + cm_id_priv->alt_av.timeout = + cm_ack_timeout(cm_id_priv->target_ack_delay, + cm_id_priv->alt_av.timeout - 1); + + /* todo: handle peer_to_peer */ + + ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg); + ret = atomic_inc_and_test(&cm_id_priv->work_count); + if (!ret) + list_add_tail(&work->list, &cm_id_priv->work_list); + spin_unlock_irq(&cm_id_priv->lock); + + if (ret) + cm_process_work(cm_id_priv, work); + else + cm_deref_id(cm_id_priv); + return 0; + +error: + cm_deref_id(cm_id_priv); + return ret; +} + +static int cm_establish_handler(struct cm_work *work) +{ + struct cm_id_private *cm_id_priv; + int ret; + + /* See comment in cm_establish about lookup. */ + cm_id_priv = cm_acquire_id(work->local_id, work->remote_id); + if (!cm_id_priv) + return -EINVAL; + + spin_lock_irq(&cm_id_priv->lock); + if (cm_id_priv->id.state != IB_CM_ESTABLISHED) { + spin_unlock_irq(&cm_id_priv->lock); + goto out; + } + + ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg); + ret = atomic_inc_and_test(&cm_id_priv->work_count); + if (!ret) + list_add_tail(&work->list, &cm_id_priv->work_list); + spin_unlock_irq(&cm_id_priv->lock); + + if (ret) + cm_process_work(cm_id_priv, work); + else + cm_deref_id(cm_id_priv); + return 0; +out: + cm_deref_id(cm_id_priv); + return -EINVAL; +} + +static int cm_rtu_handler(struct cm_work *work) +{ + struct cm_id_private *cm_id_priv; + struct cm_rtu_msg *rtu_msg; + int ret; + + rtu_msg = (struct cm_rtu_msg *)work->mad_recv_wc->recv_buf.mad; + cm_id_priv = cm_acquire_id(rtu_msg->remote_comm_id, + rtu_msg->local_comm_id); + if (!cm_id_priv) + return -EINVAL; + + work->cm_event.private_data = &rtu_msg->private_data; + + spin_lock_irq(&cm_id_priv->lock); + if (cm_id_priv->id.state != IB_CM_REP_SENT && + cm_id_priv->id.state != IB_CM_MRA_REP_RCVD) { + spin_unlock_irq(&cm_id_priv->lock); + atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES]. + counter[CM_RTU_COUNTER]); + goto out; + } + cm_id_priv->id.state = IB_CM_ESTABLISHED; + + ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg); + ret = atomic_inc_and_test(&cm_id_priv->work_count); + if (!ret) + list_add_tail(&work->list, &cm_id_priv->work_list); + spin_unlock_irq(&cm_id_priv->lock); + + if (ret) + cm_process_work(cm_id_priv, work); + else + cm_deref_id(cm_id_priv); + return 0; +out: + cm_deref_id(cm_id_priv); + return -EINVAL; +} + +static void cm_format_dreq(struct cm_dreq_msg *dreq_msg, + struct cm_id_private *cm_id_priv, + const void *private_data, + u8 private_data_len) +{ + cm_format_mad_hdr(&dreq_msg->hdr, CM_DREQ_ATTR_ID, + cm_form_tid(cm_id_priv, CM_MSG_SEQUENCE_DREQ)); + dreq_msg->local_comm_id = cm_id_priv->id.local_id; + dreq_msg->remote_comm_id = cm_id_priv->id.remote_id; + cm_dreq_set_remote_qpn(dreq_msg, cm_id_priv->remote_qpn); + + if (private_data && private_data_len) + memcpy(dreq_msg->private_data, private_data, private_data_len); +} + +int ib_send_cm_dreq(struct ib_cm_id *cm_id, + const void *private_data, + u8 private_data_len) +{ + struct cm_id_private *cm_id_priv; + struct ib_mad_send_buf *msg; + unsigned long flags; + int ret; + + if (private_data && private_data_len > IB_CM_DREQ_PRIVATE_DATA_SIZE) + return -EINVAL; + + cm_id_priv = container_of(cm_id, struct cm_id_private, id); + spin_lock_irqsave(&cm_id_priv->lock, flags); + if (cm_id->state != IB_CM_ESTABLISHED) { + ret = -EINVAL; + goto out; + } + + if (cm_id->lap_state == IB_CM_LAP_SENT || + cm_id->lap_state == IB_CM_MRA_LAP_RCVD) + ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg); + + ret = cm_alloc_msg(cm_id_priv, &msg); + if (ret) { + cm_enter_timewait(cm_id_priv); + goto out; + } + + cm_format_dreq((struct cm_dreq_msg *) msg->mad, cm_id_priv, + private_data, private_data_len); + msg->timeout_ms = cm_id_priv->timeout_ms; + msg->context[1] = (void *) (unsigned long) IB_CM_DREQ_SENT; + + ret = ib_post_send_mad(msg, NULL); + if (ret) { + cm_enter_timewait(cm_id_priv); + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + cm_free_msg(msg); + return ret; + } + + cm_id->state = IB_CM_DREQ_SENT; + cm_id_priv->msg = msg; +out: spin_unlock_irqrestore(&cm_id_priv->lock, flags); + return ret; +} +EXPORT_SYMBOL(ib_send_cm_dreq); + +static void cm_format_drep(struct cm_drep_msg *drep_msg, + struct cm_id_private *cm_id_priv, + const void *private_data, + u8 private_data_len) +{ + cm_format_mad_hdr(&drep_msg->hdr, CM_DREP_ATTR_ID, cm_id_priv->tid); + drep_msg->local_comm_id = cm_id_priv->id.local_id; + drep_msg->remote_comm_id = cm_id_priv->id.remote_id; + + if (private_data && private_data_len) + memcpy(drep_msg->private_data, private_data, private_data_len); +} + +int ib_send_cm_drep(struct ib_cm_id *cm_id, + const void *private_data, + u8 private_data_len) +{ + struct cm_id_private *cm_id_priv; + struct ib_mad_send_buf *msg; + unsigned long flags; + void *data; + int ret; + + if (private_data && private_data_len > IB_CM_DREP_PRIVATE_DATA_SIZE) + return -EINVAL; + + data = cm_copy_private_data(private_data, private_data_len); + if (IS_ERR(data)) + return PTR_ERR(data); + + cm_id_priv = container_of(cm_id, struct cm_id_private, id); + spin_lock_irqsave(&cm_id_priv->lock, flags); + if (cm_id->state != IB_CM_DREQ_RCVD) { + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + kfree(data); + return -EINVAL; + } + + cm_set_private_data(cm_id_priv, data, private_data_len); + cm_enter_timewait(cm_id_priv); + + ret = cm_alloc_msg(cm_id_priv, &msg); + if (ret) + goto out; + + cm_format_drep((struct cm_drep_msg *) msg->mad, cm_id_priv, + private_data, private_data_len); + + ret = ib_post_send_mad(msg, NULL); + if (ret) { + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + cm_free_msg(msg); + return ret; + } + +out: spin_unlock_irqrestore(&cm_id_priv->lock, flags); + return ret; +} +EXPORT_SYMBOL(ib_send_cm_drep); + +static int cm_issue_drep(struct cm_port *port, + struct ib_mad_recv_wc *mad_recv_wc) +{ + struct ib_mad_send_buf *msg = NULL; + struct cm_dreq_msg *dreq_msg; + struct cm_drep_msg *drep_msg; + int ret; + + ret = cm_alloc_response_msg(port, mad_recv_wc, &msg); + if (ret) + return ret; + + dreq_msg = (struct cm_dreq_msg *) mad_recv_wc->recv_buf.mad; + drep_msg = (struct cm_drep_msg *) msg->mad; + + cm_format_mad_hdr(&drep_msg->hdr, CM_DREP_ATTR_ID, dreq_msg->hdr.tid); + drep_msg->remote_comm_id = dreq_msg->local_comm_id; + drep_msg->local_comm_id = dreq_msg->remote_comm_id; + + ret = ib_post_send_mad(msg, NULL); + if (ret) + cm_free_msg(msg); + + return ret; +} + +static int cm_dreq_handler(struct cm_work *work) +{ + struct cm_id_private *cm_id_priv; + struct cm_dreq_msg *dreq_msg; + struct ib_mad_send_buf *msg = NULL; + int ret; + + dreq_msg = (struct cm_dreq_msg *)work->mad_recv_wc->recv_buf.mad; + cm_id_priv = cm_acquire_id(dreq_msg->remote_comm_id, + dreq_msg->local_comm_id); + if (!cm_id_priv) { + atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES]. + counter[CM_DREQ_COUNTER]); + cm_issue_drep(work->port, work->mad_recv_wc); + return -EINVAL; + } + + work->cm_event.private_data = &dreq_msg->private_data; + + spin_lock_irq(&cm_id_priv->lock); + if (cm_id_priv->local_qpn != cm_dreq_get_remote_qpn(dreq_msg)) + goto unlock; + + switch (cm_id_priv->id.state) { + case IB_CM_REP_SENT: + case IB_CM_DREQ_SENT: + ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg); + break; + case IB_CM_ESTABLISHED: + if (cm_id_priv->id.lap_state == IB_CM_LAP_SENT || + cm_id_priv->id.lap_state == IB_CM_MRA_LAP_RCVD) + ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg); + break; + case IB_CM_MRA_REP_RCVD: + break; + case IB_CM_TIMEWAIT: + atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES]. + counter[CM_DREQ_COUNTER]); + if (cm_alloc_response_msg(work->port, work->mad_recv_wc, &msg)) + goto unlock; + + cm_format_drep((struct cm_drep_msg *) msg->mad, cm_id_priv, + cm_id_priv->private_data, + cm_id_priv->private_data_len); + spin_unlock_irq(&cm_id_priv->lock); + + if (ib_post_send_mad(msg, NULL)) + cm_free_msg(msg); + goto deref; + case IB_CM_DREQ_RCVD: + atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES]. + counter[CM_DREQ_COUNTER]); + goto unlock; + default: + goto unlock; + } + cm_id_priv->id.state = IB_CM_DREQ_RCVD; + cm_id_priv->tid = dreq_msg->hdr.tid; + ret = atomic_inc_and_test(&cm_id_priv->work_count); + if (!ret) + list_add_tail(&work->list, &cm_id_priv->work_list); + spin_unlock_irq(&cm_id_priv->lock); + + if (ret) + cm_process_work(cm_id_priv, work); + else + cm_deref_id(cm_id_priv); + return 0; + +unlock: spin_unlock_irq(&cm_id_priv->lock); +deref: cm_deref_id(cm_id_priv); + return -EINVAL; +} + +static int cm_drep_handler(struct cm_work *work) +{ + struct cm_id_private *cm_id_priv; + struct cm_drep_msg *drep_msg; + int ret; + + drep_msg = (struct cm_drep_msg *)work->mad_recv_wc->recv_buf.mad; + cm_id_priv = cm_acquire_id(drep_msg->remote_comm_id, + drep_msg->local_comm_id); + if (!cm_id_priv) + return -EINVAL; + + work->cm_event.private_data = &drep_msg->private_data; + + spin_lock_irq(&cm_id_priv->lock); + if (cm_id_priv->id.state != IB_CM_DREQ_SENT && + cm_id_priv->id.state != IB_CM_DREQ_RCVD) { + spin_unlock_irq(&cm_id_priv->lock); + goto out; + } + cm_enter_timewait(cm_id_priv); + + ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg); + ret = atomic_inc_and_test(&cm_id_priv->work_count); + if (!ret) + list_add_tail(&work->list, &cm_id_priv->work_list); + spin_unlock_irq(&cm_id_priv->lock); + + if (ret) + cm_process_work(cm_id_priv, work); + else + cm_deref_id(cm_id_priv); + return 0; +out: + cm_deref_id(cm_id_priv); + return -EINVAL; +} + +int ib_send_cm_rej(struct ib_cm_id *cm_id, + enum ib_cm_rej_reason reason, + void *ari, + u8 ari_length, + const void *private_data, + u8 private_data_len) +{ + struct cm_id_private *cm_id_priv; + struct ib_mad_send_buf *msg; + unsigned long flags; + int ret; + + if ((private_data && private_data_len > IB_CM_REJ_PRIVATE_DATA_SIZE) || + (ari && ari_length > IB_CM_REJ_ARI_LENGTH)) + return -EINVAL; + + cm_id_priv = container_of(cm_id, struct cm_id_private, id); + + spin_lock_irqsave(&cm_id_priv->lock, flags); + switch (cm_id->state) { + case IB_CM_REQ_SENT: + case IB_CM_MRA_REQ_RCVD: + case IB_CM_REQ_RCVD: + case IB_CM_MRA_REQ_SENT: + case IB_CM_REP_RCVD: + case IB_CM_MRA_REP_SENT: + ret = cm_alloc_msg(cm_id_priv, &msg); + if (!ret) + cm_format_rej((struct cm_rej_msg *) msg->mad, + cm_id_priv, reason, ari, ari_length, + private_data, private_data_len); + + cm_reset_to_idle(cm_id_priv); + break; + case IB_CM_REP_SENT: + case IB_CM_MRA_REP_RCVD: + ret = cm_alloc_msg(cm_id_priv, &msg); + if (!ret) + cm_format_rej((struct cm_rej_msg *) msg->mad, + cm_id_priv, reason, ari, ari_length, + private_data, private_data_len); + + cm_enter_timewait(cm_id_priv); + break; + default: + ret = -EINVAL; + goto out; + } + + if (ret) + goto out; + + ret = ib_post_send_mad(msg, NULL); + if (ret) + cm_free_msg(msg); + +out: spin_unlock_irqrestore(&cm_id_priv->lock, flags); + return ret; +} +EXPORT_SYMBOL(ib_send_cm_rej); + +static void cm_format_rej_event(struct cm_work *work) +{ + struct cm_rej_msg *rej_msg; + struct ib_cm_rej_event_param *param; + + rej_msg = (struct cm_rej_msg *)work->mad_recv_wc->recv_buf.mad; + param = &work->cm_event.param.rej_rcvd; + param->ari = rej_msg->ari; + param->ari_length = cm_rej_get_reject_info_len(rej_msg); + param->reason = __be16_to_cpu(rej_msg->reason); + work->cm_event.private_data = &rej_msg->private_data; +} + +static struct cm_id_private * cm_acquire_rejected_id(struct cm_rej_msg *rej_msg) +{ + struct cm_timewait_info *timewait_info; + struct cm_id_private *cm_id_priv; + __be32 remote_id; + + remote_id = rej_msg->local_comm_id; + + if (__be16_to_cpu(rej_msg->reason) == IB_CM_REJ_TIMEOUT) { + spin_lock_irq(&cm.lock); + timewait_info = cm_find_remote_id( *((__be64 *) rej_msg->ari), + remote_id); + if (!timewait_info) { + spin_unlock_irq(&cm.lock); + return NULL; + } + cm_id_priv = idr_find(&cm.local_id_table, (__force int) + (timewait_info->work.local_id ^ + cm.random_id_operand)); + if (cm_id_priv) { + if (cm_id_priv->id.remote_id == remote_id) + atomic_inc(&cm_id_priv->refcount); + else + cm_id_priv = NULL; + } + spin_unlock_irq(&cm.lock); + } else if (cm_rej_get_msg_rejected(rej_msg) == CM_MSG_RESPONSE_REQ) + cm_id_priv = cm_acquire_id(rej_msg->remote_comm_id, 0); + else + cm_id_priv = cm_acquire_id(rej_msg->remote_comm_id, remote_id); + + return cm_id_priv; +} + +static int cm_rej_handler(struct cm_work *work) +{ + struct cm_id_private *cm_id_priv; + struct cm_rej_msg *rej_msg; + int ret; + + rej_msg = (struct cm_rej_msg *)work->mad_recv_wc->recv_buf.mad; + cm_id_priv = cm_acquire_rejected_id(rej_msg); + if (!cm_id_priv) + return -EINVAL; + + cm_format_rej_event(work); + + spin_lock_irq(&cm_id_priv->lock); + switch (cm_id_priv->id.state) { + case IB_CM_REQ_SENT: + case IB_CM_MRA_REQ_RCVD: + case IB_CM_REP_SENT: + case IB_CM_MRA_REP_RCVD: + ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg); + /* fall through */ + case IB_CM_REQ_RCVD: + case IB_CM_MRA_REQ_SENT: + if (__be16_to_cpu(rej_msg->reason) == IB_CM_REJ_STALE_CONN) + cm_enter_timewait(cm_id_priv); + else + cm_reset_to_idle(cm_id_priv); + break; + case IB_CM_DREQ_SENT: + ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg); + /* fall through */ + case IB_CM_REP_RCVD: + case IB_CM_MRA_REP_SENT: + cm_enter_timewait(cm_id_priv); + break; + case IB_CM_ESTABLISHED: + if (cm_id_priv->id.lap_state == IB_CM_LAP_UNINIT || + cm_id_priv->id.lap_state == IB_CM_LAP_SENT) { + if (cm_id_priv->id.lap_state == IB_CM_LAP_SENT) + ib_cancel_mad(cm_id_priv->av.port->mad_agent, + cm_id_priv->msg); + cm_enter_timewait(cm_id_priv); + break; + } + /* fall through */ + default: + spin_unlock_irq(&cm_id_priv->lock); + ret = -EINVAL; + goto out; + } + + ret = atomic_inc_and_test(&cm_id_priv->work_count); + if (!ret) + list_add_tail(&work->list, &cm_id_priv->work_list); + spin_unlock_irq(&cm_id_priv->lock); + + if (ret) + cm_process_work(cm_id_priv, work); + else + cm_deref_id(cm_id_priv); + return 0; +out: + cm_deref_id(cm_id_priv); + return -EINVAL; +} + +int ib_send_cm_mra(struct ib_cm_id *cm_id, + u8 service_timeout, + const void *private_data, + u8 private_data_len) +{ + struct cm_id_private *cm_id_priv; + struct ib_mad_send_buf *msg; + enum ib_cm_state cm_state; + enum ib_cm_lap_state lap_state; + enum cm_msg_response msg_response; + void *data; + unsigned long flags; + int ret; + + if (private_data && private_data_len > IB_CM_MRA_PRIVATE_DATA_SIZE) + return -EINVAL; + + data = cm_copy_private_data(private_data, private_data_len); + if (IS_ERR(data)) + return PTR_ERR(data); + + cm_id_priv = container_of(cm_id, struct cm_id_private, id); + + spin_lock_irqsave(&cm_id_priv->lock, flags); + switch(cm_id_priv->id.state) { + case IB_CM_REQ_RCVD: + cm_state = IB_CM_MRA_REQ_SENT; + lap_state = cm_id->lap_state; + msg_response = CM_MSG_RESPONSE_REQ; + break; + case IB_CM_REP_RCVD: + cm_state = IB_CM_MRA_REP_SENT; + lap_state = cm_id->lap_state; + msg_response = CM_MSG_RESPONSE_REP; + break; + case IB_CM_ESTABLISHED: + if (cm_id->lap_state == IB_CM_LAP_RCVD) { + cm_state = cm_id->state; + lap_state = IB_CM_MRA_LAP_SENT; + msg_response = CM_MSG_RESPONSE_OTHER; + break; + } + default: + ret = -EINVAL; + goto error1; + } + + if (!(service_timeout & IB_CM_MRA_FLAG_DELAY)) { + ret = cm_alloc_msg(cm_id_priv, &msg); + if (ret) + goto error1; + + cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv, + msg_response, service_timeout, + private_data, private_data_len); + ret = ib_post_send_mad(msg, NULL); + if (ret) + goto error2; + } + + cm_id->state = cm_state; + cm_id->lap_state = lap_state; + cm_id_priv->service_timeout = service_timeout; + cm_set_private_data(cm_id_priv, data, private_data_len); + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + return 0; + +error1: spin_unlock_irqrestore(&cm_id_priv->lock, flags); + kfree(data); + return ret; + +error2: spin_unlock_irqrestore(&cm_id_priv->lock, flags); + kfree(data); + cm_free_msg(msg); + return ret; +} +EXPORT_SYMBOL(ib_send_cm_mra); + +static struct cm_id_private * cm_acquire_mraed_id(struct cm_mra_msg *mra_msg) +{ + switch (cm_mra_get_msg_mraed(mra_msg)) { + case CM_MSG_RESPONSE_REQ: + return cm_acquire_id(mra_msg->remote_comm_id, 0); + case CM_MSG_RESPONSE_REP: + case CM_MSG_RESPONSE_OTHER: + return cm_acquire_id(mra_msg->remote_comm_id, + mra_msg->local_comm_id); + default: + return NULL; + } +} + +static int cm_mra_handler(struct cm_work *work) +{ + struct cm_id_private *cm_id_priv; + struct cm_mra_msg *mra_msg; + int timeout, ret; + + mra_msg = (struct cm_mra_msg *)work->mad_recv_wc->recv_buf.mad; + cm_id_priv = cm_acquire_mraed_id(mra_msg); + if (!cm_id_priv) + return -EINVAL; + + work->cm_event.private_data = &mra_msg->private_data; + work->cm_event.param.mra_rcvd.service_timeout = + cm_mra_get_service_timeout(mra_msg); + timeout = cm_convert_to_ms(cm_mra_get_service_timeout(mra_msg)) + + cm_convert_to_ms(cm_id_priv->av.timeout); + + spin_lock_irq(&cm_id_priv->lock); + switch (cm_id_priv->id.state) { + case IB_CM_REQ_SENT: + if (cm_mra_get_msg_mraed(mra_msg) != CM_MSG_RESPONSE_REQ || + ib_modify_mad(cm_id_priv->av.port->mad_agent, + cm_id_priv->msg, timeout)) + goto out; + cm_id_priv->id.state = IB_CM_MRA_REQ_RCVD; + break; + case IB_CM_REP_SENT: + if (cm_mra_get_msg_mraed(mra_msg) != CM_MSG_RESPONSE_REP || + ib_modify_mad(cm_id_priv->av.port->mad_agent, + cm_id_priv->msg, timeout)) + goto out; + cm_id_priv->id.state = IB_CM_MRA_REP_RCVD; + break; + case IB_CM_ESTABLISHED: + if (cm_mra_get_msg_mraed(mra_msg) != CM_MSG_RESPONSE_OTHER || + cm_id_priv->id.lap_state != IB_CM_LAP_SENT || + ib_modify_mad(cm_id_priv->av.port->mad_agent, + cm_id_priv->msg, timeout)) { + if (cm_id_priv->id.lap_state == IB_CM_MRA_LAP_RCVD) + atomic_long_inc(&work->port-> + counter_group[CM_RECV_DUPLICATES]. + counter[CM_MRA_COUNTER]); + goto out; + } + cm_id_priv->id.lap_state = IB_CM_MRA_LAP_RCVD; + break; + case IB_CM_MRA_REQ_RCVD: + case IB_CM_MRA_REP_RCVD: + atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES]. + counter[CM_MRA_COUNTER]); + /* fall through */ + default: + goto out; + } + + cm_id_priv->msg->context[1] = (void *) (unsigned long) + cm_id_priv->id.state; + ret = atomic_inc_and_test(&cm_id_priv->work_count); + if (!ret) + list_add_tail(&work->list, &cm_id_priv->work_list); + spin_unlock_irq(&cm_id_priv->lock); + + if (ret) + cm_process_work(cm_id_priv, work); + else + cm_deref_id(cm_id_priv); + return 0; +out: + spin_unlock_irq(&cm_id_priv->lock); + cm_deref_id(cm_id_priv); + return -EINVAL; +} + +static void cm_format_lap(struct cm_lap_msg *lap_msg, + struct cm_id_private *cm_id_priv, + struct ib_sa_path_rec *alternate_path, + const void *private_data, + u8 private_data_len) +{ + cm_format_mad_hdr(&lap_msg->hdr, CM_LAP_ATTR_ID, + cm_form_tid(cm_id_priv, CM_MSG_SEQUENCE_LAP)); + lap_msg->local_comm_id = cm_id_priv->id.local_id; + lap_msg->remote_comm_id = cm_id_priv->id.remote_id; + cm_lap_set_remote_qpn(lap_msg, cm_id_priv->remote_qpn); + /* todo: need remote CM response timeout */ + cm_lap_set_remote_resp_timeout(lap_msg, 0x1F); + lap_msg->alt_local_lid = alternate_path->slid; + lap_msg->alt_remote_lid = alternate_path->dlid; + lap_msg->alt_local_gid = alternate_path->sgid; + lap_msg->alt_remote_gid = alternate_path->dgid; + cm_lap_set_flow_label(lap_msg, alternate_path->flow_label); + cm_lap_set_traffic_class(lap_msg, alternate_path->traffic_class); + lap_msg->alt_hop_limit = alternate_path->hop_limit; + cm_lap_set_packet_rate(lap_msg, alternate_path->rate); + cm_lap_set_sl(lap_msg, alternate_path->sl); + cm_lap_set_subnet_local(lap_msg, 1); /* local only... */ + cm_lap_set_local_ack_timeout(lap_msg, + cm_ack_timeout(cm_id_priv->av.port->cm_dev->ack_delay, + alternate_path->packet_life_time)); + + if (private_data && private_data_len) + memcpy(lap_msg->private_data, private_data, private_data_len); +} + +int ib_send_cm_lap(struct ib_cm_id *cm_id, + struct ib_sa_path_rec *alternate_path, + const void *private_data, + u8 private_data_len) +{ + struct cm_id_private *cm_id_priv; + struct ib_mad_send_buf *msg; + unsigned long flags; + int ret; + + if (private_data && private_data_len > IB_CM_LAP_PRIVATE_DATA_SIZE) + return -EINVAL; + + cm_id_priv = container_of(cm_id, struct cm_id_private, id); + spin_lock_irqsave(&cm_id_priv->lock, flags); + if (cm_id->state != IB_CM_ESTABLISHED || + (cm_id->lap_state != IB_CM_LAP_UNINIT && + cm_id->lap_state != IB_CM_LAP_IDLE)) { + ret = -EINVAL; + goto out; + } + + ret = cm_init_av_by_path(alternate_path, &cm_id_priv->alt_av); + if (ret) + goto out; + cm_id_priv->alt_av.timeout = + cm_ack_timeout(cm_id_priv->target_ack_delay, + cm_id_priv->alt_av.timeout - 1); + + ret = cm_alloc_msg(cm_id_priv, &msg); + if (ret) + goto out; + + cm_format_lap((struct cm_lap_msg *) msg->mad, cm_id_priv, + alternate_path, private_data, private_data_len); + msg->timeout_ms = cm_id_priv->timeout_ms; + msg->context[1] = (void *) (unsigned long) IB_CM_ESTABLISHED; + + ret = ib_post_send_mad(msg, NULL); + if (ret) { + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + cm_free_msg(msg); + return ret; + } + + cm_id->lap_state = IB_CM_LAP_SENT; + cm_id_priv->msg = msg; + +out: spin_unlock_irqrestore(&cm_id_priv->lock, flags); + return ret; +} +EXPORT_SYMBOL(ib_send_cm_lap); + +static void cm_format_path_from_lap(struct cm_id_private *cm_id_priv, + struct ib_sa_path_rec *path, + struct cm_lap_msg *lap_msg) +{ + memset(path, 0, sizeof *path); + path->dgid = lap_msg->alt_local_gid; + path->sgid = lap_msg->alt_remote_gid; + path->dlid = lap_msg->alt_local_lid; + path->slid = lap_msg->alt_remote_lid; + path->flow_label = cm_lap_get_flow_label(lap_msg); + path->hop_limit = lap_msg->alt_hop_limit; + path->traffic_class = cm_lap_get_traffic_class(lap_msg); + path->reversible = 1; + path->pkey = cm_id_priv->pkey; + path->sl = cm_lap_get_sl(lap_msg); + path->mtu_selector = IB_SA_EQ; + path->mtu = cm_id_priv->path_mtu; + path->rate_selector = IB_SA_EQ; + path->rate = cm_lap_get_packet_rate(lap_msg); + path->packet_life_time_selector = IB_SA_EQ; + path->packet_life_time = cm_lap_get_local_ack_timeout(lap_msg); + path->packet_life_time -= (path->packet_life_time > 0); +} + +static int cm_lap_handler(struct cm_work *work) +{ + struct cm_id_private *cm_id_priv; + struct cm_lap_msg *lap_msg; + struct ib_cm_lap_event_param *param; + struct ib_mad_send_buf *msg = NULL; + int ret; + + /* todo: verify LAP request and send reject APR if invalid. */ + lap_msg = (struct cm_lap_msg *)work->mad_recv_wc->recv_buf.mad; + cm_id_priv = cm_acquire_id(lap_msg->remote_comm_id, + lap_msg->local_comm_id); + if (!cm_id_priv) + return -EINVAL; + + param = &work->cm_event.param.lap_rcvd; + param->alternate_path = &work->path[0]; + cm_format_path_from_lap(cm_id_priv, param->alternate_path, lap_msg); + work->cm_event.private_data = &lap_msg->private_data; + + spin_lock_irq(&cm_id_priv->lock); + if (cm_id_priv->id.state != IB_CM_ESTABLISHED) + goto unlock; + + switch (cm_id_priv->id.lap_state) { + case IB_CM_LAP_UNINIT: + case IB_CM_LAP_IDLE: + break; + case IB_CM_MRA_LAP_SENT: + atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES]. + counter[CM_LAP_COUNTER]); + if (cm_alloc_response_msg(work->port, work->mad_recv_wc, &msg)) + goto unlock; + + cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv, + CM_MSG_RESPONSE_OTHER, + cm_id_priv->service_timeout, + cm_id_priv->private_data, + cm_id_priv->private_data_len); + spin_unlock_irq(&cm_id_priv->lock); + + if (ib_post_send_mad(msg, NULL)) + cm_free_msg(msg); + goto deref; + case IB_CM_LAP_RCVD: + atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES]. + counter[CM_LAP_COUNTER]); + goto unlock; + default: + goto unlock; + } + + cm_id_priv->id.lap_state = IB_CM_LAP_RCVD; + cm_id_priv->tid = lap_msg->hdr.tid; + cm_init_av_for_response(work->port, work->mad_recv_wc->wc, + work->mad_recv_wc->recv_buf.grh, + &cm_id_priv->av); + cm_init_av_by_path(param->alternate_path, &cm_id_priv->alt_av); + ret = atomic_inc_and_test(&cm_id_priv->work_count); + if (!ret) + list_add_tail(&work->list, &cm_id_priv->work_list); + spin_unlock_irq(&cm_id_priv->lock); + + if (ret) + cm_process_work(cm_id_priv, work); + else + cm_deref_id(cm_id_priv); + return 0; + +unlock: spin_unlock_irq(&cm_id_priv->lock); +deref: cm_deref_id(cm_id_priv); + return -EINVAL; +} + +static void cm_format_apr(struct cm_apr_msg *apr_msg, + struct cm_id_private *cm_id_priv, + enum ib_cm_apr_status status, + void *info, + u8 info_length, + const void *private_data, + u8 private_data_len) +{ + cm_format_mad_hdr(&apr_msg->hdr, CM_APR_ATTR_ID, cm_id_priv->tid); + apr_msg->local_comm_id = cm_id_priv->id.local_id; + apr_msg->remote_comm_id = cm_id_priv->id.remote_id; + apr_msg->ap_status = (u8) status; + + if (info && info_length) { + apr_msg->info_length = info_length; + memcpy(apr_msg->info, info, info_length); + } + + if (private_data && private_data_len) + memcpy(apr_msg->private_data, private_data, private_data_len); +} + +int ib_send_cm_apr(struct ib_cm_id *cm_id, + enum ib_cm_apr_status status, + void *info, + u8 info_length, + const void *private_data, + u8 private_data_len) +{ + struct cm_id_private *cm_id_priv; + struct ib_mad_send_buf *msg; + unsigned long flags; + int ret; + + if ((private_data && private_data_len > IB_CM_APR_PRIVATE_DATA_SIZE) || + (info && info_length > IB_CM_APR_INFO_LENGTH)) + return -EINVAL; + + cm_id_priv = container_of(cm_id, struct cm_id_private, id); + spin_lock_irqsave(&cm_id_priv->lock, flags); + if (cm_id->state != IB_CM_ESTABLISHED || + (cm_id->lap_state != IB_CM_LAP_RCVD && + cm_id->lap_state != IB_CM_MRA_LAP_SENT)) { + ret = -EINVAL; + goto out; + } + + ret = cm_alloc_msg(cm_id_priv, &msg); + if (ret) + goto out; + + cm_format_apr((struct cm_apr_msg *) msg->mad, cm_id_priv, status, + info, info_length, private_data, private_data_len); + ret = ib_post_send_mad(msg, NULL); + if (ret) { + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + cm_free_msg(msg); + return ret; + } + + cm_id->lap_state = IB_CM_LAP_IDLE; +out: spin_unlock_irqrestore(&cm_id_priv->lock, flags); + return ret; +} +EXPORT_SYMBOL(ib_send_cm_apr); + +static int cm_apr_handler(struct cm_work *work) +{ + struct cm_id_private *cm_id_priv; + struct cm_apr_msg *apr_msg; + int ret; + + apr_msg = (struct cm_apr_msg *)work->mad_recv_wc->recv_buf.mad; + cm_id_priv = cm_acquire_id(apr_msg->remote_comm_id, + apr_msg->local_comm_id); + if (!cm_id_priv) + return -EINVAL; /* Unmatched reply. */ + + work->cm_event.param.apr_rcvd.ap_status = apr_msg->ap_status; + work->cm_event.param.apr_rcvd.apr_info = &apr_msg->info; + work->cm_event.param.apr_rcvd.info_len = apr_msg->info_length; + work->cm_event.private_data = &apr_msg->private_data; + + spin_lock_irq(&cm_id_priv->lock); + if (cm_id_priv->id.state != IB_CM_ESTABLISHED || + (cm_id_priv->id.lap_state != IB_CM_LAP_SENT && + cm_id_priv->id.lap_state != IB_CM_MRA_LAP_RCVD)) { + spin_unlock_irq(&cm_id_priv->lock); + goto out; + } + cm_id_priv->id.lap_state = IB_CM_LAP_IDLE; + ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg); + cm_id_priv->msg = NULL; + + ret = atomic_inc_and_test(&cm_id_priv->work_count); + if (!ret) + list_add_tail(&work->list, &cm_id_priv->work_list); + spin_unlock_irq(&cm_id_priv->lock); + + if (ret) + cm_process_work(cm_id_priv, work); + else + cm_deref_id(cm_id_priv); + return 0; +out: + cm_deref_id(cm_id_priv); + return -EINVAL; +} + +static int cm_timewait_handler(struct cm_work *work) +{ + struct cm_timewait_info *timewait_info; + struct cm_id_private *cm_id_priv; + int ret; + + timewait_info = (struct cm_timewait_info *)work; + spin_lock_irq(&cm.lock); + list_del(&timewait_info->list); + spin_unlock_irq(&cm.lock); + + cm_id_priv = cm_acquire_id(timewait_info->work.local_id, + timewait_info->work.remote_id); + if (!cm_id_priv) + return -EINVAL; + + spin_lock_irq(&cm_id_priv->lock); + if (cm_id_priv->id.state != IB_CM_TIMEWAIT || + cm_id_priv->remote_qpn != timewait_info->remote_qpn) { + spin_unlock_irq(&cm_id_priv->lock); + goto out; + } + cm_id_priv->id.state = IB_CM_IDLE; + ret = atomic_inc_and_test(&cm_id_priv->work_count); + if (!ret) + list_add_tail(&work->list, &cm_id_priv->work_list); + spin_unlock_irq(&cm_id_priv->lock); + + if (ret) + cm_process_work(cm_id_priv, work); + else + cm_deref_id(cm_id_priv); + return 0; +out: + cm_deref_id(cm_id_priv); + return -EINVAL; +} + +static void cm_format_sidr_req(struct cm_sidr_req_msg *sidr_req_msg, + struct cm_id_private *cm_id_priv, + struct ib_cm_sidr_req_param *param) +{ + cm_format_mad_hdr(&sidr_req_msg->hdr, CM_SIDR_REQ_ATTR_ID, + cm_form_tid(cm_id_priv, CM_MSG_SEQUENCE_SIDR)); + sidr_req_msg->request_id = cm_id_priv->id.local_id; + sidr_req_msg->pkey = param->path->pkey; + sidr_req_msg->service_id = param->service_id; + + if (param->private_data && param->private_data_len) + memcpy(sidr_req_msg->private_data, param->private_data, + param->private_data_len); +} + +int ib_send_cm_sidr_req(struct ib_cm_id *cm_id, + struct ib_cm_sidr_req_param *param) +{ + struct cm_id_private *cm_id_priv; + struct ib_mad_send_buf *msg; + unsigned long flags; + int ret; + + if (!param->path || (param->private_data && + param->private_data_len > IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE)) + return -EINVAL; + + cm_id_priv = container_of(cm_id, struct cm_id_private, id); + ret = cm_init_av_by_path(param->path, &cm_id_priv->av); + if (ret) + goto out; + + cm_id->service_id = param->service_id; + cm_id->service_mask = ~cpu_to_be64(0); + cm_id_priv->timeout_ms = param->timeout_ms; + cm_id_priv->max_cm_retries = param->max_cm_retries; + ret = cm_alloc_msg(cm_id_priv, &msg); + if (ret) + goto out; + + cm_format_sidr_req((struct cm_sidr_req_msg *) msg->mad, cm_id_priv, + param); + msg->timeout_ms = cm_id_priv->timeout_ms; + msg->context[1] = (void *) (unsigned long) IB_CM_SIDR_REQ_SENT; + + spin_lock_irqsave(&cm_id_priv->lock, flags); + if (cm_id->state == IB_CM_IDLE) + ret = ib_post_send_mad(msg, NULL); + else + ret = -EINVAL; + + if (ret) { + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + cm_free_msg(msg); + goto out; + } + cm_id->state = IB_CM_SIDR_REQ_SENT; + cm_id_priv->msg = msg; + spin_unlock_irqrestore(&cm_id_priv->lock, flags); +out: + return ret; +} +EXPORT_SYMBOL(ib_send_cm_sidr_req); + +static void cm_format_sidr_req_event(struct cm_work *work, + struct ib_cm_id *listen_id) +{ + struct cm_sidr_req_msg *sidr_req_msg; + struct ib_cm_sidr_req_event_param *param; + + sidr_req_msg = (struct cm_sidr_req_msg *) + work->mad_recv_wc->recv_buf.mad; + param = &work->cm_event.param.sidr_req_rcvd; + param->pkey = __be16_to_cpu(sidr_req_msg->pkey); + param->listen_id = listen_id; + param->port = work->port->port_num; + work->cm_event.private_data = &sidr_req_msg->private_data; +} + +static int cm_sidr_req_handler(struct cm_work *work) +{ + struct ib_cm_id *cm_id; + struct cm_id_private *cm_id_priv, *cur_cm_id_priv; + struct cm_sidr_req_msg *sidr_req_msg; + struct ib_wc *wc; + + cm_id = ib_create_cm_id(work->port->cm_dev->ib_device, NULL, NULL); + if (IS_ERR(cm_id)) + return PTR_ERR(cm_id); + cm_id_priv = container_of(cm_id, struct cm_id_private, id); + + /* Record SGID/SLID and request ID for lookup. */ + sidr_req_msg = (struct cm_sidr_req_msg *) + work->mad_recv_wc->recv_buf.mad; + wc = work->mad_recv_wc->wc; + cm_id_priv->av.dgid.global.subnet_prefix = cpu_to_be64(wc->slid); + cm_id_priv->av.dgid.global.interface_id = 0; + cm_init_av_for_response(work->port, work->mad_recv_wc->wc, + work->mad_recv_wc->recv_buf.grh, + &cm_id_priv->av); + cm_id_priv->id.remote_id = sidr_req_msg->request_id; + cm_id_priv->tid = sidr_req_msg->hdr.tid; + atomic_inc(&cm_id_priv->work_count); + + spin_lock_irq(&cm.lock); + cur_cm_id_priv = cm_insert_remote_sidr(cm_id_priv); + if (cur_cm_id_priv) { + spin_unlock_irq(&cm.lock); + atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES]. + counter[CM_SIDR_REQ_COUNTER]); + goto out; /* Duplicate message. */ + } + cm_id_priv->id.state = IB_CM_SIDR_REQ_RCVD; + cur_cm_id_priv = cm_find_listen(cm_id->device, + sidr_req_msg->service_id, + sidr_req_msg->private_data); + if (!cur_cm_id_priv) { + spin_unlock_irq(&cm.lock); + cm_reject_sidr_req(cm_id_priv, IB_SIDR_UNSUPPORTED); + goto out; /* No match. */ + } + atomic_inc(&cur_cm_id_priv->refcount); + atomic_inc(&cm_id_priv->refcount); + spin_unlock_irq(&cm.lock); + + cm_id_priv->id.cm_handler = cur_cm_id_priv->id.cm_handler; + cm_id_priv->id.context = cur_cm_id_priv->id.context; + cm_id_priv->id.service_id = sidr_req_msg->service_id; + cm_id_priv->id.service_mask = ~cpu_to_be64(0); + + cm_format_sidr_req_event(work, &cur_cm_id_priv->id); + cm_process_work(cm_id_priv, work); + cm_deref_id(cur_cm_id_priv); + return 0; +out: + ib_destroy_cm_id(&cm_id_priv->id); + return -EINVAL; +} + +static void cm_format_sidr_rep(struct cm_sidr_rep_msg *sidr_rep_msg, + struct cm_id_private *cm_id_priv, + struct ib_cm_sidr_rep_param *param) +{ + cm_format_mad_hdr(&sidr_rep_msg->hdr, CM_SIDR_REP_ATTR_ID, + cm_id_priv->tid); + sidr_rep_msg->request_id = cm_id_priv->id.remote_id; + sidr_rep_msg->status = param->status; + cm_sidr_rep_set_qpn(sidr_rep_msg, cpu_to_be32(param->qp_num)); + sidr_rep_msg->service_id = cm_id_priv->id.service_id; + sidr_rep_msg->qkey = cpu_to_be32(param->qkey); + + if (param->info && param->info_length) + memcpy(sidr_rep_msg->info, param->info, param->info_length); + + if (param->private_data && param->private_data_len) + memcpy(sidr_rep_msg->private_data, param->private_data, + param->private_data_len); +} + +int ib_send_cm_sidr_rep(struct ib_cm_id *cm_id, + struct ib_cm_sidr_rep_param *param) +{ + struct cm_id_private *cm_id_priv; + struct ib_mad_send_buf *msg; + unsigned long flags; + int ret; + + if ((param->info && param->info_length > IB_CM_SIDR_REP_INFO_LENGTH) || + (param->private_data && + param->private_data_len > IB_CM_SIDR_REP_PRIVATE_DATA_SIZE)) + return -EINVAL; + + cm_id_priv = container_of(cm_id, struct cm_id_private, id); + spin_lock_irqsave(&cm_id_priv->lock, flags); + if (cm_id->state != IB_CM_SIDR_REQ_RCVD) { + ret = -EINVAL; + goto error; + } + + ret = cm_alloc_msg(cm_id_priv, &msg); + if (ret) + goto error; + + cm_format_sidr_rep((struct cm_sidr_rep_msg *) msg->mad, cm_id_priv, + param); + ret = ib_post_send_mad(msg, NULL); + if (ret) { + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + cm_free_msg(msg); + return ret; + } + cm_id->state = IB_CM_IDLE; + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + + spin_lock_irqsave(&cm.lock, flags); + rb_erase(&cm_id_priv->sidr_id_node, &cm.remote_sidr_table); + spin_unlock_irqrestore(&cm.lock, flags); + return 0; + +error: spin_unlock_irqrestore(&cm_id_priv->lock, flags); + return ret; +} +EXPORT_SYMBOL(ib_send_cm_sidr_rep); + +static void cm_format_sidr_rep_event(struct cm_work *work) +{ + struct cm_sidr_rep_msg *sidr_rep_msg; + struct ib_cm_sidr_rep_event_param *param; + + sidr_rep_msg = (struct cm_sidr_rep_msg *) + work->mad_recv_wc->recv_buf.mad; + param = &work->cm_event.param.sidr_rep_rcvd; + param->status = sidr_rep_msg->status; + param->qkey = be32_to_cpu(sidr_rep_msg->qkey); + param->qpn = be32_to_cpu(cm_sidr_rep_get_qpn(sidr_rep_msg)); + param->info = &sidr_rep_msg->info; + param->info_len = sidr_rep_msg->info_length; + work->cm_event.private_data = &sidr_rep_msg->private_data; +} + +static int cm_sidr_rep_handler(struct cm_work *work) +{ + struct cm_sidr_rep_msg *sidr_rep_msg; + struct cm_id_private *cm_id_priv; + + sidr_rep_msg = (struct cm_sidr_rep_msg *) + work->mad_recv_wc->recv_buf.mad; + cm_id_priv = cm_acquire_id(sidr_rep_msg->request_id, 0); + if (!cm_id_priv) + return -EINVAL; /* Unmatched reply. */ + + spin_lock_irq(&cm_id_priv->lock); + if (cm_id_priv->id.state != IB_CM_SIDR_REQ_SENT) { + spin_unlock_irq(&cm_id_priv->lock); + goto out; + } + cm_id_priv->id.state = IB_CM_IDLE; + ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg); + spin_unlock_irq(&cm_id_priv->lock); + + cm_format_sidr_rep_event(work); + cm_process_work(cm_id_priv, work); + return 0; +out: + cm_deref_id(cm_id_priv); + return -EINVAL; +} + +static void cm_process_send_error(struct ib_mad_send_buf *msg, + enum ib_wc_status wc_status) +{ + struct cm_id_private *cm_id_priv; + struct ib_cm_event cm_event; + enum ib_cm_state state; + int ret; + + memset(&cm_event, 0, sizeof cm_event); + cm_id_priv = msg->context[0]; + + /* Discard old sends or ones without a response. */ + spin_lock_irq(&cm_id_priv->lock); + state = (enum ib_cm_state) (unsigned long) msg->context[1]; + if (msg != cm_id_priv->msg || state != cm_id_priv->id.state) + goto discard; + + switch (state) { + case IB_CM_REQ_SENT: + case IB_CM_MRA_REQ_RCVD: + cm_reset_to_idle(cm_id_priv); + cm_event.event = IB_CM_REQ_ERROR; + break; + case IB_CM_REP_SENT: + case IB_CM_MRA_REP_RCVD: + cm_reset_to_idle(cm_id_priv); + cm_event.event = IB_CM_REP_ERROR; + break; + case IB_CM_DREQ_SENT: + cm_enter_timewait(cm_id_priv); + cm_event.event = IB_CM_DREQ_ERROR; + break; + case IB_CM_SIDR_REQ_SENT: + cm_id_priv->id.state = IB_CM_IDLE; + cm_event.event = IB_CM_SIDR_REQ_ERROR; + break; + default: + goto discard; + } + spin_unlock_irq(&cm_id_priv->lock); + cm_event.param.send_status = wc_status; + + /* No other events can occur on the cm_id at this point. */ + ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, &cm_event); + cm_free_msg(msg); + if (ret) + ib_destroy_cm_id(&cm_id_priv->id); + return; +discard: + spin_unlock_irq(&cm_id_priv->lock); + cm_free_msg(msg); +} + +static void cm_send_handler(struct ib_mad_agent *mad_agent, + struct ib_mad_send_wc *mad_send_wc) +{ + struct ib_mad_send_buf *msg = mad_send_wc->send_buf; + struct cm_port *port; + u16 attr_index; + + port = mad_agent->context; + attr_index = be16_to_cpu(((struct ib_mad_hdr *) + msg->mad)->attr_id) - CM_ATTR_ID_OFFSET; + + /* + * If the send was in response to a received message (context[0] is not + * set to a cm_id), and is not a REJ, then it is a send that was + * manually retried. + */ + if (!msg->context[0] && (attr_index != CM_REJ_COUNTER)) + msg->retries = 1; + + atomic_long_add(1 + msg->retries, + &port->counter_group[CM_XMIT].counter[attr_index]); + if (msg->retries) + atomic_long_add(msg->retries, + &port->counter_group[CM_XMIT_RETRIES]. + counter[attr_index]); + + switch (mad_send_wc->status) { + case IB_WC_SUCCESS: + case IB_WC_WR_FLUSH_ERR: + cm_free_msg(msg); + break; + default: + if (msg->context[0] && msg->context[1]) + cm_process_send_error(msg, mad_send_wc->status); + else + cm_free_msg(msg); + break; + } +} + +static void cm_work_handler(struct work_struct *_work) +{ + struct cm_work *work = container_of(_work, struct cm_work, work.work); + int ret; + + switch (work->cm_event.event) { + case IB_CM_REQ_RECEIVED: + ret = cm_req_handler(work); + break; + case IB_CM_MRA_RECEIVED: + ret = cm_mra_handler(work); + break; + case IB_CM_REJ_RECEIVED: + ret = cm_rej_handler(work); + break; + case IB_CM_REP_RECEIVED: + ret = cm_rep_handler(work); + break; + case IB_CM_RTU_RECEIVED: + ret = cm_rtu_handler(work); + break; + case IB_CM_USER_ESTABLISHED: + ret = cm_establish_handler(work); + break; + case IB_CM_DREQ_RECEIVED: + ret = cm_dreq_handler(work); + break; + case IB_CM_DREP_RECEIVED: + ret = cm_drep_handler(work); + break; + case IB_CM_SIDR_REQ_RECEIVED: + ret = cm_sidr_req_handler(work); + break; + case IB_CM_SIDR_REP_RECEIVED: + ret = cm_sidr_rep_handler(work); + break; + case IB_CM_LAP_RECEIVED: + ret = cm_lap_handler(work); + break; + case IB_CM_APR_RECEIVED: + ret = cm_apr_handler(work); + break; + case IB_CM_TIMEWAIT_EXIT: + ret = cm_timewait_handler(work); + break; + default: + ret = -EINVAL; + break; + } + if (ret) + cm_free_work(work); +} + +static int cm_establish(struct ib_cm_id *cm_id) +{ + struct cm_id_private *cm_id_priv; + struct cm_work *work; + unsigned long flags; + int ret = 0; + + work = kmalloc(sizeof *work, GFP_ATOMIC); + if (!work) + return -ENOMEM; + + cm_id_priv = container_of(cm_id, struct cm_id_private, id); + spin_lock_irqsave(&cm_id_priv->lock, flags); + switch (cm_id->state) + { + case IB_CM_REP_SENT: + case IB_CM_MRA_REP_RCVD: + cm_id->state = IB_CM_ESTABLISHED; + break; + case IB_CM_ESTABLISHED: + ret = -EISCONN; + break; + default: + ret = -EINVAL; + break; + } + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + + if (ret) { + kfree(work); + goto out; + } + + /* + * The CM worker thread may try to destroy the cm_id before it + * can execute this work item. To prevent potential deadlock, + * we need to find the cm_id once we're in the context of the + * worker thread, rather than holding a reference on it. + */ + INIT_DELAYED_WORK(&work->work, cm_work_handler); + work->local_id = cm_id->local_id; + work->remote_id = cm_id->remote_id; + work->mad_recv_wc = NULL; + work->cm_event.event = IB_CM_USER_ESTABLISHED; + queue_delayed_work(cm.wq, &work->work, 0); +out: + return ret; +} + +static int cm_migrate(struct ib_cm_id *cm_id) +{ + struct cm_id_private *cm_id_priv; + unsigned long flags; + int ret = 0; + + cm_id_priv = container_of(cm_id, struct cm_id_private, id); + spin_lock_irqsave(&cm_id_priv->lock, flags); + if (cm_id->state == IB_CM_ESTABLISHED && + (cm_id->lap_state == IB_CM_LAP_UNINIT || + cm_id->lap_state == IB_CM_LAP_IDLE)) { + cm_id->lap_state = IB_CM_LAP_IDLE; + cm_id_priv->av = cm_id_priv->alt_av; + } else + ret = -EINVAL; + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + + return ret; +} + +int ib_cm_notify(struct ib_cm_id *cm_id, enum ib_event_type event) +{ + int ret; + + switch (event) { + case IB_EVENT_COMM_EST: + ret = cm_establish(cm_id); + break; + case IB_EVENT_PATH_MIG: + ret = cm_migrate(cm_id); + break; + default: + ret = -EINVAL; + } + return ret; +} +EXPORT_SYMBOL(ib_cm_notify); + +static void cm_recv_handler(struct ib_mad_agent *mad_agent, + struct ib_mad_recv_wc *mad_recv_wc) +{ + struct cm_port *port = mad_agent->context; + struct cm_work *work; + enum ib_cm_event_type event; + u16 attr_id; + int paths = 0; + + switch (mad_recv_wc->recv_buf.mad->mad_hdr.attr_id) { + case CM_REQ_ATTR_ID: + paths = 1 + (((struct cm_req_msg *) mad_recv_wc->recv_buf.mad)-> + alt_local_lid != 0); + event = IB_CM_REQ_RECEIVED; + break; + case CM_MRA_ATTR_ID: + event = IB_CM_MRA_RECEIVED; + break; + case CM_REJ_ATTR_ID: + event = IB_CM_REJ_RECEIVED; + break; + case CM_REP_ATTR_ID: + event = IB_CM_REP_RECEIVED; + break; + case CM_RTU_ATTR_ID: + event = IB_CM_RTU_RECEIVED; + break; + case CM_DREQ_ATTR_ID: + event = IB_CM_DREQ_RECEIVED; + break; + case CM_DREP_ATTR_ID: + event = IB_CM_DREP_RECEIVED; + break; + case CM_SIDR_REQ_ATTR_ID: + event = IB_CM_SIDR_REQ_RECEIVED; + break; + case CM_SIDR_REP_ATTR_ID: + event = IB_CM_SIDR_REP_RECEIVED; + break; + case CM_LAP_ATTR_ID: + paths = 1; + event = IB_CM_LAP_RECEIVED; + break; + case CM_APR_ATTR_ID: + event = IB_CM_APR_RECEIVED; + break; + default: + ib_free_recv_mad(mad_recv_wc); + return; + } + + attr_id = be16_to_cpu(mad_recv_wc->recv_buf.mad->mad_hdr.attr_id); + atomic_long_inc(&port->counter_group[CM_RECV]. + counter[attr_id - CM_ATTR_ID_OFFSET]); + + work = kmalloc(sizeof *work + sizeof(struct ib_sa_path_rec) * paths, + GFP_KERNEL); + if (!work) { + ib_free_recv_mad(mad_recv_wc); + return; + } + + INIT_DELAYED_WORK(&work->work, cm_work_handler); + work->cm_event.event = event; + work->mad_recv_wc = mad_recv_wc; + work->port = port; + queue_delayed_work(cm.wq, &work->work, 0); +} + +static int cm_init_qp_init_attr(struct cm_id_private *cm_id_priv, + struct ib_qp_attr *qp_attr, + int *qp_attr_mask) +{ + unsigned long flags; + int ret; + + spin_lock_irqsave(&cm_id_priv->lock, flags); + switch (cm_id_priv->id.state) { + case IB_CM_REQ_SENT: + case IB_CM_MRA_REQ_RCVD: + case IB_CM_REQ_RCVD: + case IB_CM_MRA_REQ_SENT: + case IB_CM_REP_RCVD: + case IB_CM_MRA_REP_SENT: + case IB_CM_REP_SENT: + case IB_CM_MRA_REP_RCVD: + case IB_CM_ESTABLISHED: + *qp_attr_mask = IB_QP_STATE | IB_QP_ACCESS_FLAGS | + IB_QP_PKEY_INDEX | IB_QP_PORT; + qp_attr->qp_access_flags = IB_ACCESS_REMOTE_WRITE; + if (cm_id_priv->responder_resources) + qp_attr->qp_access_flags |= IB_ACCESS_REMOTE_READ | + IB_ACCESS_REMOTE_ATOMIC; + qp_attr->pkey_index = cm_id_priv->av.pkey_index; + qp_attr->port_num = cm_id_priv->av.port->port_num; + ret = 0; + break; + default: + ret = -EINVAL; + break; + } + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + return ret; +} + +static int cm_init_qp_rtr_attr(struct cm_id_private *cm_id_priv, + struct ib_qp_attr *qp_attr, + int *qp_attr_mask) +{ + unsigned long flags; + int ret; + + spin_lock_irqsave(&cm_id_priv->lock, flags); + switch (cm_id_priv->id.state) { + case IB_CM_REQ_RCVD: + case IB_CM_MRA_REQ_SENT: + case IB_CM_REP_RCVD: + case IB_CM_MRA_REP_SENT: + case IB_CM_REP_SENT: + case IB_CM_MRA_REP_RCVD: + case IB_CM_ESTABLISHED: + *qp_attr_mask = IB_QP_STATE | IB_QP_AV | IB_QP_PATH_MTU | + IB_QP_DEST_QPN | IB_QP_RQ_PSN; + qp_attr->ah_attr = cm_id_priv->av.ah_attr; + if (!cm_id_priv->av.valid) { + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + return -EINVAL; + } + if (cm_id_priv->av.ah_attr.vlan_id != 0xffff) { + qp_attr->vlan_id = cm_id_priv->av.ah_attr.vlan_id; + *qp_attr_mask |= IB_QP_VID; + } + if (!is_zero_ether_addr(cm_id_priv->av.smac)) { + memcpy(qp_attr->smac, cm_id_priv->av.smac, + sizeof(qp_attr->smac)); + *qp_attr_mask |= IB_QP_SMAC; + } + if (cm_id_priv->alt_av.valid) { + if (cm_id_priv->alt_av.ah_attr.vlan_id != 0xffff) { + qp_attr->alt_vlan_id = + cm_id_priv->alt_av.ah_attr.vlan_id; + *qp_attr_mask |= IB_QP_ALT_VID; + } + if (!is_zero_ether_addr(cm_id_priv->alt_av.smac)) { + memcpy(qp_attr->alt_smac, + cm_id_priv->alt_av.smac, + sizeof(qp_attr->alt_smac)); + *qp_attr_mask |= IB_QP_ALT_SMAC; + } + } + qp_attr->path_mtu = cm_id_priv->path_mtu; + qp_attr->dest_qp_num = be32_to_cpu(cm_id_priv->remote_qpn); + qp_attr->rq_psn = be32_to_cpu(cm_id_priv->rq_psn); + if (cm_id_priv->qp_type == IB_QPT_RC || + cm_id_priv->qp_type == IB_QPT_XRC_TGT) { + *qp_attr_mask |= IB_QP_MAX_DEST_RD_ATOMIC | + IB_QP_MIN_RNR_TIMER; + qp_attr->max_dest_rd_atomic = + cm_id_priv->responder_resources; + qp_attr->min_rnr_timer = 0; + } + if (cm_id_priv->alt_av.ah_attr.dlid) { + *qp_attr_mask |= IB_QP_ALT_PATH; + qp_attr->alt_port_num = cm_id_priv->alt_av.port->port_num; + qp_attr->alt_pkey_index = cm_id_priv->alt_av.pkey_index; + qp_attr->alt_timeout = cm_id_priv->alt_av.timeout; + qp_attr->alt_ah_attr = cm_id_priv->alt_av.ah_attr; + } + ret = 0; + break; + default: + ret = -EINVAL; + break; + } + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + return ret; +} + +static int cm_init_qp_rts_attr(struct cm_id_private *cm_id_priv, + struct ib_qp_attr *qp_attr, + int *qp_attr_mask) +{ + unsigned long flags; + int ret; + + spin_lock_irqsave(&cm_id_priv->lock, flags); + switch (cm_id_priv->id.state) { + /* Allow transition to RTS before sending REP */ + case IB_CM_REQ_RCVD: + case IB_CM_MRA_REQ_SENT: + + case IB_CM_REP_RCVD: + case IB_CM_MRA_REP_SENT: + case IB_CM_REP_SENT: + case IB_CM_MRA_REP_RCVD: + case IB_CM_ESTABLISHED: + if (cm_id_priv->id.lap_state == IB_CM_LAP_UNINIT) { + *qp_attr_mask = IB_QP_STATE | IB_QP_SQ_PSN; + qp_attr->sq_psn = be32_to_cpu(cm_id_priv->sq_psn); + switch (cm_id_priv->qp_type) { + case IB_QPT_RC: + case IB_QPT_XRC_INI: + *qp_attr_mask |= IB_QP_RETRY_CNT | IB_QP_RNR_RETRY | + IB_QP_MAX_QP_RD_ATOMIC; + qp_attr->retry_cnt = cm_id_priv->retry_count; + qp_attr->rnr_retry = cm_id_priv->rnr_retry_count; + qp_attr->max_rd_atomic = cm_id_priv->initiator_depth; + /* fall through */ + case IB_QPT_XRC_TGT: + *qp_attr_mask |= IB_QP_TIMEOUT; + qp_attr->timeout = cm_id_priv->av.timeout; + break; + default: + break; + } + if (cm_id_priv->alt_av.ah_attr.dlid) { + *qp_attr_mask |= IB_QP_PATH_MIG_STATE; + qp_attr->path_mig_state = IB_MIG_REARM; + } + } else { + *qp_attr_mask = IB_QP_ALT_PATH | IB_QP_PATH_MIG_STATE; + qp_attr->alt_port_num = cm_id_priv->alt_av.port->port_num; + qp_attr->alt_pkey_index = cm_id_priv->alt_av.pkey_index; + qp_attr->alt_timeout = cm_id_priv->alt_av.timeout; + qp_attr->alt_ah_attr = cm_id_priv->alt_av.ah_attr; + qp_attr->path_mig_state = IB_MIG_REARM; + } + ret = 0; + break; + default: + ret = -EINVAL; + break; + } + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + return ret; +} + +int ib_cm_init_qp_attr(struct ib_cm_id *cm_id, + struct ib_qp_attr *qp_attr, + int *qp_attr_mask) +{ + struct cm_id_private *cm_id_priv; + int ret; + + cm_id_priv = container_of(cm_id, struct cm_id_private, id); + switch (qp_attr->qp_state) { + case IB_QPS_INIT: + ret = cm_init_qp_init_attr(cm_id_priv, qp_attr, qp_attr_mask); + break; + case IB_QPS_RTR: + ret = cm_init_qp_rtr_attr(cm_id_priv, qp_attr, qp_attr_mask); + break; + case IB_QPS_RTS: + ret = cm_init_qp_rts_attr(cm_id_priv, qp_attr, qp_attr_mask); + break; + default: + ret = -EINVAL; + break; + } + return ret; +} +EXPORT_SYMBOL(ib_cm_init_qp_attr); + +static void cm_get_ack_delay(struct cm_device *cm_dev) +{ + struct ib_device_attr attr; + + if (ib_query_device(cm_dev->ib_device, &attr)) + cm_dev->ack_delay = 0; /* acks will rely on packet life time */ + else + cm_dev->ack_delay = attr.local_ca_ack_delay; +} + +static ssize_t cm_show_counter(struct kobject *obj, struct attribute *attr, + char *buf) +{ + struct cm_counter_group *group; + struct cm_counter_attribute *cm_attr; + + group = container_of(obj, struct cm_counter_group, obj); + cm_attr = container_of(attr, struct cm_counter_attribute, attr); + + return sprintf(buf, "%ld\n", + atomic_long_read(&group->counter[cm_attr->index])); +} + +static const struct sysfs_ops cm_counter_ops = { + .show = cm_show_counter +}; + +static struct kobj_type cm_counter_obj_type = { + .sysfs_ops = &cm_counter_ops, + .default_attrs = cm_counter_default_attrs +}; + +static void cm_release_port_obj(struct kobject *obj) +{ + struct cm_port *cm_port; + + cm_port = container_of(obj, struct cm_port, port_obj); + kfree(cm_port); +} + +static struct kobj_type cm_port_obj_type = { + .release = cm_release_port_obj +}; + +static char *cm_devnode(struct device *dev, umode_t *mode) +{ + if (mode) + *mode = 0666; + return kasprintf(GFP_KERNEL, "infiniband/%s", dev_name(dev)); +} + +struct class cm_class = { + .owner = THIS_MODULE, + .name = "infiniband_cm", + .devnode = cm_devnode, +}; +EXPORT_SYMBOL(cm_class); + +static int cm_create_port_fs(struct cm_port *port) +{ + int i, ret; + + ret = kobject_init_and_add(&port->port_obj, &cm_port_obj_type, + &port->cm_dev->device->kobj, + "%d", port->port_num); + if (ret) { + kfree(port); + return ret; + } + + for (i = 0; i < CM_COUNTER_GROUPS; i++) { + ret = kobject_init_and_add(&port->counter_group[i].obj, + &cm_counter_obj_type, + &port->port_obj, + "%s", counter_group_names[i]); + if (ret) + goto error; + } + + return 0; + +error: + while (i--) + kobject_put(&port->counter_group[i].obj); + kobject_put(&port->port_obj); + return ret; + +} + +static void cm_remove_port_fs(struct cm_port *port) +{ + int i; + + for (i = 0; i < CM_COUNTER_GROUPS; i++) + kobject_put(&port->counter_group[i].obj); + + kobject_put(&port->port_obj); +} + +static void cm_add_one(struct ib_device *ib_device) +{ + struct cm_device *cm_dev; + struct cm_port *port; + struct ib_mad_reg_req reg_req = { + .mgmt_class = IB_MGMT_CLASS_CM, + .mgmt_class_version = IB_CM_CLASS_VERSION, + }; + struct ib_port_modify port_modify = { + .set_port_cap_mask = IB_PORT_CM_SUP + }; + unsigned long flags; + int ret; + u8 i; + + if (rdma_node_get_transport(ib_device->node_type) != RDMA_TRANSPORT_IB) + return; + + cm_dev = kzalloc(sizeof(*cm_dev) + sizeof(*port) * + ib_device->phys_port_cnt, GFP_KERNEL); + if (!cm_dev) + return; + + cm_dev->ib_device = ib_device; + cm_get_ack_delay(cm_dev); + + cm_dev->device = device_create(&cm_class, &ib_device->dev, + MKDEV(0, 0), NULL, + "%s", ib_device->name); + if (IS_ERR(cm_dev->device)) { + kfree(cm_dev); + return; + } + + set_bit(IB_MGMT_METHOD_SEND, reg_req.method_mask); + for (i = 1; i <= ib_device->phys_port_cnt; i++) { + port = kzalloc(sizeof *port, GFP_KERNEL); + if (!port) + goto error1; + + cm_dev->port[i-1] = port; + port->cm_dev = cm_dev; + port->port_num = i; + + ret = cm_create_port_fs(port); + if (ret) + goto error1; + + port->mad_agent = ib_register_mad_agent(ib_device, i, + IB_QPT_GSI, + ®_req, + 0, + cm_send_handler, + cm_recv_handler, + port, + 0); + if (IS_ERR(port->mad_agent)) + goto error2; + + ret = ib_modify_port(ib_device, i, 0, &port_modify); + if (ret) + goto error3; + } + ib_set_client_data(ib_device, &cm_client, cm_dev); + + write_lock_irqsave(&cm.device_lock, flags); + list_add_tail(&cm_dev->list, &cm.device_list); + write_unlock_irqrestore(&cm.device_lock, flags); + return; + +error3: + ib_unregister_mad_agent(port->mad_agent); +error2: + cm_remove_port_fs(port); +error1: + port_modify.set_port_cap_mask = 0; + port_modify.clr_port_cap_mask = IB_PORT_CM_SUP; + while (--i) { + port = cm_dev->port[i-1]; + ib_modify_port(ib_device, port->port_num, 0, &port_modify); + ib_unregister_mad_agent(port->mad_agent); + cm_remove_port_fs(port); + } + device_unregister(cm_dev->device); + kfree(cm_dev); +} + +static void cm_remove_one(struct ib_device *ib_device) +{ + struct cm_device *cm_dev; + struct cm_port *port; + struct ib_port_modify port_modify = { + .clr_port_cap_mask = IB_PORT_CM_SUP + }; + unsigned long flags; + int i; + + cm_dev = ib_get_client_data(ib_device, &cm_client); + if (!cm_dev) + return; + + write_lock_irqsave(&cm.device_lock, flags); + list_del(&cm_dev->list); + write_unlock_irqrestore(&cm.device_lock, flags); + + for (i = 1; i <= ib_device->phys_port_cnt; i++) { + port = cm_dev->port[i-1]; + ib_modify_port(ib_device, port->port_num, 0, &port_modify); + ib_unregister_mad_agent(port->mad_agent); + flush_workqueue(cm.wq); + cm_remove_port_fs(port); + } + device_unregister(cm_dev->device); + kfree(cm_dev); +} + +static int __init ib_cm_init(void) +{ + int ret; + + memset(&cm, 0, sizeof cm); + INIT_LIST_HEAD(&cm.device_list); + rwlock_init(&cm.device_lock); + spin_lock_init(&cm.lock); + cm.listen_service_table = RB_ROOT; + cm.listen_service_id = be64_to_cpu(IB_CM_ASSIGN_SERVICE_ID); + cm.remote_id_table = RB_ROOT; + cm.remote_qp_table = RB_ROOT; + cm.remote_sidr_table = RB_ROOT; + idr_init(&cm.local_id_table); + get_random_bytes(&cm.random_id_operand, sizeof cm.random_id_operand); + INIT_LIST_HEAD(&cm.timewait_list); + + ret = class_register(&cm_class); + if (ret) { + ret = -ENOMEM; + goto error1; + } + + cm.wq = create_workqueue("ib_cm"); + if (!cm.wq) { + ret = -ENOMEM; + goto error2; + } + + ret = ib_register_client(&cm_client); + if (ret) + goto error3; + + return 0; +error3: + destroy_workqueue(cm.wq); +error2: + class_unregister(&cm_class); +error1: + idr_destroy(&cm.local_id_table); + return ret; +} + +static void __exit ib_cm_cleanup(void) +{ + struct cm_timewait_info *timewait_info, *tmp; + + spin_lock_irq(&cm.lock); + list_for_each_entry(timewait_info, &cm.timewait_list, list) + cancel_delayed_work(&timewait_info->work.work); + spin_unlock_irq(&cm.lock); + + ib_unregister_client(&cm_client); + destroy_workqueue(cm.wq); + + list_for_each_entry_safe(timewait_info, tmp, &cm.timewait_list, list) { + list_del(&timewait_info->list); + kfree(timewait_info); + } + + class_unregister(&cm_class); + idr_destroy(&cm.local_id_table); +} + +module_init(ib_cm_init); +module_exit(ib_cm_cleanup); + diff --git a/drivers/infiniband/core/cm_msgs.h b/drivers/infiniband/core/cm_msgs.h new file mode 100644 index 0000000..be068f4 --- /dev/null +++ b/drivers/infiniband/core/cm_msgs.h @@ -0,0 +1,836 @@ +/* + * Copyright (c) 2004, 2011 Intel Corporation. All rights reserved. + * Copyright (c) 2004 Topspin Corporation. All rights reserved. + * Copyright (c) 2004 Voltaire Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING the madirectory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use source and binary forms, with or + * withmodification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retathe above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHWARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS THE + * SOFTWARE. + */ +#if !defined(CM_MSGS_H) +#define CM_MSGS_H + +#include +#include + +/* + * Parameters to routines below should be in network-byte order, and values + * are returned in network-byte order. + */ + +#define IB_CM_CLASS_VERSION 2 /* IB specification 1.2 */ + +enum cm_msg_sequence { + CM_MSG_SEQUENCE_REQ, + CM_MSG_SEQUENCE_LAP, + CM_MSG_SEQUENCE_DREQ, + CM_MSG_SEQUENCE_SIDR +}; + +struct cm_req_msg { + struct ib_mad_hdr hdr; + + __be32 local_comm_id; + __be32 rsvd4; + __be64 service_id; + __be64 local_ca_guid; + __be32 rsvd24; + __be32 local_qkey; + /* local QPN:24, responder resources:8 */ + __be32 offset32; + /* local EECN:24, initiator depth:8 */ + __be32 offset36; + /* + * remote EECN:24, remote CM response timeout:5, + * transport service type:2, end-to-end flow control:1 + */ + __be32 offset40; + /* starting PSN:24, local CM response timeout:5, retry count:3 */ + __be32 offset44; + __be16 pkey; + /* path MTU:4, RDC exists:1, RNR retry count:3. */ + u8 offset50; + /* max CM Retries:4, SRQ:1, extended transport type:3 */ + u8 offset51; + + __be16 primary_local_lid; + __be16 primary_remote_lid; + union ib_gid primary_local_gid; + union ib_gid primary_remote_gid; + /* flow label:20, rsvd:6, packet rate:6 */ + __be32 primary_offset88; + u8 primary_traffic_class; + u8 primary_hop_limit; + /* SL:4, subnet local:1, rsvd:3 */ + u8 primary_offset94; + /* local ACK timeout:5, rsvd:3 */ + u8 primary_offset95; + + __be16 alt_local_lid; + __be16 alt_remote_lid; + union ib_gid alt_local_gid; + union ib_gid alt_remote_gid; + /* flow label:20, rsvd:6, packet rate:6 */ + __be32 alt_offset132; + u8 alt_traffic_class; + u8 alt_hop_limit; + /* SL:4, subnet local:1, rsvd:3 */ + u8 alt_offset138; + /* local ACK timeout:5, rsvd:3 */ + u8 alt_offset139; + + u8 private_data[IB_CM_REQ_PRIVATE_DATA_SIZE]; + +} __attribute__ ((packed)); + +static inline __be32 cm_req_get_local_qpn(struct cm_req_msg *req_msg) +{ + return cpu_to_be32(be32_to_cpu(req_msg->offset32) >> 8); +} + +static inline void cm_req_set_local_qpn(struct cm_req_msg *req_msg, __be32 qpn) +{ + req_msg->offset32 = cpu_to_be32((be32_to_cpu(qpn) << 8) | + (be32_to_cpu(req_msg->offset32) & + 0x000000FF)); +} + +static inline u8 cm_req_get_resp_res(struct cm_req_msg *req_msg) +{ + return (u8) be32_to_cpu(req_msg->offset32); +} + +static inline void cm_req_set_resp_res(struct cm_req_msg *req_msg, u8 resp_res) +{ + req_msg->offset32 = cpu_to_be32(resp_res | + (be32_to_cpu(req_msg->offset32) & + 0xFFFFFF00)); +} + +static inline u8 cm_req_get_init_depth(struct cm_req_msg *req_msg) +{ + return (u8) be32_to_cpu(req_msg->offset36); +} + +static inline void cm_req_set_init_depth(struct cm_req_msg *req_msg, + u8 init_depth) +{ + req_msg->offset36 = cpu_to_be32(init_depth | + (be32_to_cpu(req_msg->offset36) & + 0xFFFFFF00)); +} + +static inline u8 cm_req_get_remote_resp_timeout(struct cm_req_msg *req_msg) +{ + return (u8) ((be32_to_cpu(req_msg->offset40) & 0xF8) >> 3); +} + +static inline void cm_req_set_remote_resp_timeout(struct cm_req_msg *req_msg, + u8 resp_timeout) +{ + req_msg->offset40 = cpu_to_be32((resp_timeout << 3) | + (be32_to_cpu(req_msg->offset40) & + 0xFFFFFF07)); +} + +static inline enum ib_qp_type cm_req_get_qp_type(struct cm_req_msg *req_msg) +{ + u8 transport_type = (u8) (be32_to_cpu(req_msg->offset40) & 0x06) >> 1; + switch(transport_type) { + case 0: return IB_QPT_RC; + case 1: return IB_QPT_UC; + case 3: + switch (req_msg->offset51 & 0x7) { + case 1: return IB_QPT_XRC_TGT; + default: return 0; + } + default: return 0; + } +} + +static inline void cm_req_set_qp_type(struct cm_req_msg *req_msg, + enum ib_qp_type qp_type) +{ + switch(qp_type) { + case IB_QPT_UC: + req_msg->offset40 = cpu_to_be32((be32_to_cpu( + req_msg->offset40) & + 0xFFFFFFF9) | 0x2); + break; + case IB_QPT_XRC_INI: + req_msg->offset40 = cpu_to_be32((be32_to_cpu( + req_msg->offset40) & + 0xFFFFFFF9) | 0x6); + req_msg->offset51 = (req_msg->offset51 & 0xF8) | 1; + break; + default: + req_msg->offset40 = cpu_to_be32(be32_to_cpu( + req_msg->offset40) & + 0xFFFFFFF9); + } +} + +static inline u8 cm_req_get_flow_ctrl(struct cm_req_msg *req_msg) +{ + return be32_to_cpu(req_msg->offset40) & 0x1; +} + +static inline void cm_req_set_flow_ctrl(struct cm_req_msg *req_msg, + u8 flow_ctrl) +{ + req_msg->offset40 = cpu_to_be32((flow_ctrl & 0x1) | + (be32_to_cpu(req_msg->offset40) & + 0xFFFFFFFE)); +} + +static inline __be32 cm_req_get_starting_psn(struct cm_req_msg *req_msg) +{ + return cpu_to_be32(be32_to_cpu(req_msg->offset44) >> 8); +} + +static inline void cm_req_set_starting_psn(struct cm_req_msg *req_msg, + __be32 starting_psn) +{ + req_msg->offset44 = cpu_to_be32((be32_to_cpu(starting_psn) << 8) | + (be32_to_cpu(req_msg->offset44) & 0x000000FF)); +} + +static inline u8 cm_req_get_local_resp_timeout(struct cm_req_msg *req_msg) +{ + return (u8) ((be32_to_cpu(req_msg->offset44) & 0xF8) >> 3); +} + +static inline void cm_req_set_local_resp_timeout(struct cm_req_msg *req_msg, + u8 resp_timeout) +{ + req_msg->offset44 = cpu_to_be32((resp_timeout << 3) | + (be32_to_cpu(req_msg->offset44) & 0xFFFFFF07)); +} + +static inline u8 cm_req_get_retry_count(struct cm_req_msg *req_msg) +{ + return (u8) (be32_to_cpu(req_msg->offset44) & 0x7); +} + +static inline void cm_req_set_retry_count(struct cm_req_msg *req_msg, + u8 retry_count) +{ + req_msg->offset44 = cpu_to_be32((retry_count & 0x7) | + (be32_to_cpu(req_msg->offset44) & 0xFFFFFFF8)); +} + +static inline u8 cm_req_get_path_mtu(struct cm_req_msg *req_msg) +{ + return req_msg->offset50 >> 4; +} + +static inline void cm_req_set_path_mtu(struct cm_req_msg *req_msg, u8 path_mtu) +{ + req_msg->offset50 = (u8) ((req_msg->offset50 & 0xF) | (path_mtu << 4)); +} + +static inline u8 cm_req_get_rnr_retry_count(struct cm_req_msg *req_msg) +{ + return req_msg->offset50 & 0x7; +} + +static inline void cm_req_set_rnr_retry_count(struct cm_req_msg *req_msg, + u8 rnr_retry_count) +{ + req_msg->offset50 = (u8) ((req_msg->offset50 & 0xF8) | + (rnr_retry_count & 0x7)); +} + +static inline u8 cm_req_get_max_cm_retries(struct cm_req_msg *req_msg) +{ + return req_msg->offset51 >> 4; +} + +static inline void cm_req_set_max_cm_retries(struct cm_req_msg *req_msg, + u8 retries) +{ + req_msg->offset51 = (u8) ((req_msg->offset51 & 0xF) | (retries << 4)); +} + +static inline u8 cm_req_get_srq(struct cm_req_msg *req_msg) +{ + return (req_msg->offset51 & 0x8) >> 3; +} + +static inline void cm_req_set_srq(struct cm_req_msg *req_msg, u8 srq) +{ + req_msg->offset51 = (u8) ((req_msg->offset51 & 0xF7) | + ((srq & 0x1) << 3)); +} + +static inline __be32 cm_req_get_primary_flow_label(struct cm_req_msg *req_msg) +{ + return cpu_to_be32(be32_to_cpu(req_msg->primary_offset88) >> 12); +} + +static inline void cm_req_set_primary_flow_label(struct cm_req_msg *req_msg, + __be32 flow_label) +{ + req_msg->primary_offset88 = cpu_to_be32( + (be32_to_cpu(req_msg->primary_offset88) & + 0x00000FFF) | + (be32_to_cpu(flow_label) << 12)); +} + +static inline u8 cm_req_get_primary_packet_rate(struct cm_req_msg *req_msg) +{ + return (u8) (be32_to_cpu(req_msg->primary_offset88) & 0x3F); +} + +static inline void cm_req_set_primary_packet_rate(struct cm_req_msg *req_msg, + u8 rate) +{ + req_msg->primary_offset88 = cpu_to_be32( + (be32_to_cpu(req_msg->primary_offset88) & + 0xFFFFFFC0) | (rate & 0x3F)); +} + +static inline u8 cm_req_get_primary_sl(struct cm_req_msg *req_msg) +{ + return (u8) (req_msg->primary_offset94 >> 4); +} + +static inline void cm_req_set_primary_sl(struct cm_req_msg *req_msg, u8 sl) +{ + req_msg->primary_offset94 = (u8) ((req_msg->primary_offset94 & 0x0F) | + (sl << 4)); +} + +static inline u8 cm_req_get_primary_subnet_local(struct cm_req_msg *req_msg) +{ + return (u8) ((req_msg->primary_offset94 & 0x08) >> 3); +} + +static inline void cm_req_set_primary_subnet_local(struct cm_req_msg *req_msg, + u8 subnet_local) +{ + req_msg->primary_offset94 = (u8) ((req_msg->primary_offset94 & 0xF7) | + ((subnet_local & 0x1) << 3)); +} + +static inline u8 cm_req_get_primary_local_ack_timeout(struct cm_req_msg *req_msg) +{ + return (u8) (req_msg->primary_offset95 >> 3); +} + +static inline void cm_req_set_primary_local_ack_timeout(struct cm_req_msg *req_msg, + u8 local_ack_timeout) +{ + req_msg->primary_offset95 = (u8) ((req_msg->primary_offset95 & 0x07) | + (local_ack_timeout << 3)); +} + +static inline __be32 cm_req_get_alt_flow_label(struct cm_req_msg *req_msg) +{ + return cpu_to_be32(be32_to_cpu(req_msg->alt_offset132) >> 12); +} + +static inline void cm_req_set_alt_flow_label(struct cm_req_msg *req_msg, + __be32 flow_label) +{ + req_msg->alt_offset132 = cpu_to_be32( + (be32_to_cpu(req_msg->alt_offset132) & + 0x00000FFF) | + (be32_to_cpu(flow_label) << 12)); +} + +static inline u8 cm_req_get_alt_packet_rate(struct cm_req_msg *req_msg) +{ + return (u8) (be32_to_cpu(req_msg->alt_offset132) & 0x3F); +} + +static inline void cm_req_set_alt_packet_rate(struct cm_req_msg *req_msg, + u8 rate) +{ + req_msg->alt_offset132 = cpu_to_be32( + (be32_to_cpu(req_msg->alt_offset132) & + 0xFFFFFFC0) | (rate & 0x3F)); +} + +static inline u8 cm_req_get_alt_sl(struct cm_req_msg *req_msg) +{ + return (u8) (req_msg->alt_offset138 >> 4); +} + +static inline void cm_req_set_alt_sl(struct cm_req_msg *req_msg, u8 sl) +{ + req_msg->alt_offset138 = (u8) ((req_msg->alt_offset138 & 0x0F) | + (sl << 4)); +} + +static inline u8 cm_req_get_alt_subnet_local(struct cm_req_msg *req_msg) +{ + return (u8) ((req_msg->alt_offset138 & 0x08) >> 3); +} + +static inline void cm_req_set_alt_subnet_local(struct cm_req_msg *req_msg, + u8 subnet_local) +{ + req_msg->alt_offset138 = (u8) ((req_msg->alt_offset138 & 0xF7) | + ((subnet_local & 0x1) << 3)); +} + +static inline u8 cm_req_get_alt_local_ack_timeout(struct cm_req_msg *req_msg) +{ + return (u8) (req_msg->alt_offset139 >> 3); +} + +static inline void cm_req_set_alt_local_ack_timeout(struct cm_req_msg *req_msg, + u8 local_ack_timeout) +{ + req_msg->alt_offset139 = (u8) ((req_msg->alt_offset139 & 0x07) | + (local_ack_timeout << 3)); +} + +/* Message REJected or MRAed */ +enum cm_msg_response { + CM_MSG_RESPONSE_REQ = 0x0, + CM_MSG_RESPONSE_REP = 0x1, + CM_MSG_RESPONSE_OTHER = 0x2 +}; + + struct cm_mra_msg { + struct ib_mad_hdr hdr; + + __be32 local_comm_id; + __be32 remote_comm_id; + /* message MRAed:2, rsvd:6 */ + u8 offset8; + /* service timeout:5, rsvd:3 */ + u8 offset9; + + u8 private_data[IB_CM_MRA_PRIVATE_DATA_SIZE]; + +} __attribute__ ((packed)); + +static inline u8 cm_mra_get_msg_mraed(struct cm_mra_msg *mra_msg) +{ + return (u8) (mra_msg->offset8 >> 6); +} + +static inline void cm_mra_set_msg_mraed(struct cm_mra_msg *mra_msg, u8 msg) +{ + mra_msg->offset8 = (u8) ((mra_msg->offset8 & 0x3F) | (msg << 6)); +} + +static inline u8 cm_mra_get_service_timeout(struct cm_mra_msg *mra_msg) +{ + return (u8) (mra_msg->offset9 >> 3); +} + +static inline void cm_mra_set_service_timeout(struct cm_mra_msg *mra_msg, + u8 service_timeout) +{ + mra_msg->offset9 = (u8) ((mra_msg->offset9 & 0x07) | + (service_timeout << 3)); +} + +struct cm_rej_msg { + struct ib_mad_hdr hdr; + + __be32 local_comm_id; + __be32 remote_comm_id; + /* message REJected:2, rsvd:6 */ + u8 offset8; + /* reject info length:7, rsvd:1. */ + u8 offset9; + __be16 reason; + u8 ari[IB_CM_REJ_ARI_LENGTH]; + + u8 private_data[IB_CM_REJ_PRIVATE_DATA_SIZE]; + +} __attribute__ ((packed)); + +static inline u8 cm_rej_get_msg_rejected(struct cm_rej_msg *rej_msg) +{ + return (u8) (rej_msg->offset8 >> 6); +} + +static inline void cm_rej_set_msg_rejected(struct cm_rej_msg *rej_msg, u8 msg) +{ + rej_msg->offset8 = (u8) ((rej_msg->offset8 & 0x3F) | (msg << 6)); +} + +static inline u8 cm_rej_get_reject_info_len(struct cm_rej_msg *rej_msg) +{ + return (u8) (rej_msg->offset9 >> 1); +} + +static inline void cm_rej_set_reject_info_len(struct cm_rej_msg *rej_msg, + u8 len) +{ + rej_msg->offset9 = (u8) ((rej_msg->offset9 & 0x1) | (len << 1)); +} + +struct cm_rep_msg { + struct ib_mad_hdr hdr; + + __be32 local_comm_id; + __be32 remote_comm_id; + __be32 local_qkey; + /* local QPN:24, rsvd:8 */ + __be32 offset12; + /* local EECN:24, rsvd:8 */ + __be32 offset16; + /* starting PSN:24 rsvd:8 */ + __be32 offset20; + u8 resp_resources; + u8 initiator_depth; + /* target ACK delay:5, failover accepted:2, end-to-end flow control:1 */ + u8 offset26; + /* RNR retry count:3, SRQ:1, rsvd:5 */ + u8 offset27; + __be64 local_ca_guid; + + u8 private_data[IB_CM_REP_PRIVATE_DATA_SIZE]; + +} __attribute__ ((packed)); + +static inline __be32 cm_rep_get_local_qpn(struct cm_rep_msg *rep_msg) +{ + return cpu_to_be32(be32_to_cpu(rep_msg->offset12) >> 8); +} + +static inline void cm_rep_set_local_qpn(struct cm_rep_msg *rep_msg, __be32 qpn) +{ + rep_msg->offset12 = cpu_to_be32((be32_to_cpu(qpn) << 8) | + (be32_to_cpu(rep_msg->offset12) & 0x000000FF)); +} + +static inline __be32 cm_rep_get_local_eecn(struct cm_rep_msg *rep_msg) +{ + return cpu_to_be32(be32_to_cpu(rep_msg->offset16) >> 8); +} + +static inline void cm_rep_set_local_eecn(struct cm_rep_msg *rep_msg, __be32 eecn) +{ + rep_msg->offset16 = cpu_to_be32((be32_to_cpu(eecn) << 8) | + (be32_to_cpu(rep_msg->offset16) & 0x000000FF)); +} + +static inline __be32 cm_rep_get_qpn(struct cm_rep_msg *rep_msg, enum ib_qp_type qp_type) +{ + return (qp_type == IB_QPT_XRC_INI) ? + cm_rep_get_local_eecn(rep_msg) : cm_rep_get_local_qpn(rep_msg); +} + +static inline __be32 cm_rep_get_starting_psn(struct cm_rep_msg *rep_msg) +{ + return cpu_to_be32(be32_to_cpu(rep_msg->offset20) >> 8); +} + +static inline void cm_rep_set_starting_psn(struct cm_rep_msg *rep_msg, + __be32 starting_psn) +{ + rep_msg->offset20 = cpu_to_be32((be32_to_cpu(starting_psn) << 8) | + (be32_to_cpu(rep_msg->offset20) & 0x000000FF)); +} + +static inline u8 cm_rep_get_target_ack_delay(struct cm_rep_msg *rep_msg) +{ + return (u8) (rep_msg->offset26 >> 3); +} + +static inline void cm_rep_set_target_ack_delay(struct cm_rep_msg *rep_msg, + u8 target_ack_delay) +{ + rep_msg->offset26 = (u8) ((rep_msg->offset26 & 0x07) | + (target_ack_delay << 3)); +} + +static inline u8 cm_rep_get_failover(struct cm_rep_msg *rep_msg) +{ + return (u8) ((rep_msg->offset26 & 0x06) >> 1); +} + +static inline void cm_rep_set_failover(struct cm_rep_msg *rep_msg, u8 failover) +{ + rep_msg->offset26 = (u8) ((rep_msg->offset26 & 0xF9) | + ((failover & 0x3) << 1)); +} + +static inline u8 cm_rep_get_flow_ctrl(struct cm_rep_msg *rep_msg) +{ + return (u8) (rep_msg->offset26 & 0x01); +} + +static inline void cm_rep_set_flow_ctrl(struct cm_rep_msg *rep_msg, + u8 flow_ctrl) +{ + rep_msg->offset26 = (u8) ((rep_msg->offset26 & 0xFE) | + (flow_ctrl & 0x1)); +} + +static inline u8 cm_rep_get_rnr_retry_count(struct cm_rep_msg *rep_msg) +{ + return (u8) (rep_msg->offset27 >> 5); +} + +static inline void cm_rep_set_rnr_retry_count(struct cm_rep_msg *rep_msg, + u8 rnr_retry_count) +{ + rep_msg->offset27 = (u8) ((rep_msg->offset27 & 0x1F) | + (rnr_retry_count << 5)); +} + +static inline u8 cm_rep_get_srq(struct cm_rep_msg *rep_msg) +{ + return (u8) ((rep_msg->offset27 >> 4) & 0x1); +} + +static inline void cm_rep_set_srq(struct cm_rep_msg *rep_msg, u8 srq) +{ + rep_msg->offset27 = (u8) ((rep_msg->offset27 & 0xEF) | + ((srq & 0x1) << 4)); +} + +struct cm_rtu_msg { + struct ib_mad_hdr hdr; + + __be32 local_comm_id; + __be32 remote_comm_id; + + u8 private_data[IB_CM_RTU_PRIVATE_DATA_SIZE]; + +} __attribute__ ((packed)); + +struct cm_dreq_msg { + struct ib_mad_hdr hdr; + + __be32 local_comm_id; + __be32 remote_comm_id; + /* remote QPN/EECN:24, rsvd:8 */ + __be32 offset8; + + u8 private_data[IB_CM_DREQ_PRIVATE_DATA_SIZE]; + +} __attribute__ ((packed)); + +static inline __be32 cm_dreq_get_remote_qpn(struct cm_dreq_msg *dreq_msg) +{ + return cpu_to_be32(be32_to_cpu(dreq_msg->offset8) >> 8); +} + +static inline void cm_dreq_set_remote_qpn(struct cm_dreq_msg *dreq_msg, __be32 qpn) +{ + dreq_msg->offset8 = cpu_to_be32((be32_to_cpu(qpn) << 8) | + (be32_to_cpu(dreq_msg->offset8) & 0x000000FF)); +} + +struct cm_drep_msg { + struct ib_mad_hdr hdr; + + __be32 local_comm_id; + __be32 remote_comm_id; + + u8 private_data[IB_CM_DREP_PRIVATE_DATA_SIZE]; + +} __attribute__ ((packed)); + +struct cm_lap_msg { + struct ib_mad_hdr hdr; + + __be32 local_comm_id; + __be32 remote_comm_id; + + __be32 rsvd8; + /* remote QPN/EECN:24, remote CM response timeout:5, rsvd:3 */ + __be32 offset12; + __be32 rsvd16; + + __be16 alt_local_lid; + __be16 alt_remote_lid; + union ib_gid alt_local_gid; + union ib_gid alt_remote_gid; + /* flow label:20, rsvd:4, traffic class:8 */ + __be32 offset56; + u8 alt_hop_limit; + /* rsvd:2, packet rate:6 */ + u8 offset61; + /* SL:4, subnet local:1, rsvd:3 */ + u8 offset62; + /* local ACK timeout:5, rsvd:3 */ + u8 offset63; + + u8 private_data[IB_CM_LAP_PRIVATE_DATA_SIZE]; +} __attribute__ ((packed)); + +static inline __be32 cm_lap_get_remote_qpn(struct cm_lap_msg *lap_msg) +{ + return cpu_to_be32(be32_to_cpu(lap_msg->offset12) >> 8); +} + +static inline void cm_lap_set_remote_qpn(struct cm_lap_msg *lap_msg, __be32 qpn) +{ + lap_msg->offset12 = cpu_to_be32((be32_to_cpu(qpn) << 8) | + (be32_to_cpu(lap_msg->offset12) & + 0x000000FF)); +} + +static inline u8 cm_lap_get_remote_resp_timeout(struct cm_lap_msg *lap_msg) +{ + return (u8) ((be32_to_cpu(lap_msg->offset12) & 0xF8) >> 3); +} + +static inline void cm_lap_set_remote_resp_timeout(struct cm_lap_msg *lap_msg, + u8 resp_timeout) +{ + lap_msg->offset12 = cpu_to_be32((resp_timeout << 3) | + (be32_to_cpu(lap_msg->offset12) & + 0xFFFFFF07)); +} + +static inline __be32 cm_lap_get_flow_label(struct cm_lap_msg *lap_msg) +{ + return cpu_to_be32(be32_to_cpu(lap_msg->offset56) >> 12); +} + +static inline void cm_lap_set_flow_label(struct cm_lap_msg *lap_msg, + __be32 flow_label) +{ + lap_msg->offset56 = cpu_to_be32( + (be32_to_cpu(lap_msg->offset56) & 0x00000FFF) | + (be32_to_cpu(flow_label) << 12)); +} + +static inline u8 cm_lap_get_traffic_class(struct cm_lap_msg *lap_msg) +{ + return (u8) be32_to_cpu(lap_msg->offset56); +} + +static inline void cm_lap_set_traffic_class(struct cm_lap_msg *lap_msg, + u8 traffic_class) +{ + lap_msg->offset56 = cpu_to_be32(traffic_class | + (be32_to_cpu(lap_msg->offset56) & + 0xFFFFFF00)); +} + +static inline u8 cm_lap_get_packet_rate(struct cm_lap_msg *lap_msg) +{ + return lap_msg->offset61 & 0x3F; +} + +static inline void cm_lap_set_packet_rate(struct cm_lap_msg *lap_msg, + u8 packet_rate) +{ + lap_msg->offset61 = (packet_rate & 0x3F) | (lap_msg->offset61 & 0xC0); +} + +static inline u8 cm_lap_get_sl(struct cm_lap_msg *lap_msg) +{ + return lap_msg->offset62 >> 4; +} + +static inline void cm_lap_set_sl(struct cm_lap_msg *lap_msg, u8 sl) +{ + lap_msg->offset62 = (sl << 4) | (lap_msg->offset62 & 0x0F); +} + +static inline u8 cm_lap_get_subnet_local(struct cm_lap_msg *lap_msg) +{ + return (lap_msg->offset62 >> 3) & 0x1; +} + +static inline void cm_lap_set_subnet_local(struct cm_lap_msg *lap_msg, + u8 subnet_local) +{ + lap_msg->offset62 = ((subnet_local & 0x1) << 3) | + (lap_msg->offset61 & 0xF7); +} +static inline u8 cm_lap_get_local_ack_timeout(struct cm_lap_msg *lap_msg) +{ + return lap_msg->offset63 >> 3; +} + +static inline void cm_lap_set_local_ack_timeout(struct cm_lap_msg *lap_msg, + u8 local_ack_timeout) +{ + lap_msg->offset63 = (local_ack_timeout << 3) | + (lap_msg->offset63 & 0x07); +} + +struct cm_apr_msg { + struct ib_mad_hdr hdr; + + __be32 local_comm_id; + __be32 remote_comm_id; + + u8 info_length; + u8 ap_status; + __be16 rsvd; + u8 info[IB_CM_APR_INFO_LENGTH]; + + u8 private_data[IB_CM_APR_PRIVATE_DATA_SIZE]; +} __attribute__ ((packed)); + +struct cm_sidr_req_msg { + struct ib_mad_hdr hdr; + + __be32 request_id; + __be16 pkey; + __be16 rsvd; + __be64 service_id; + + u8 private_data[IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE]; +} __attribute__ ((packed)); + +struct cm_sidr_rep_msg { + struct ib_mad_hdr hdr; + + __be32 request_id; + u8 status; + u8 info_length; + __be16 rsvd; + /* QPN:24, rsvd:8 */ + __be32 offset8; + __be64 service_id; + __be32 qkey; + u8 info[IB_CM_SIDR_REP_INFO_LENGTH]; + + u8 private_data[IB_CM_SIDR_REP_PRIVATE_DATA_SIZE]; +} __attribute__ ((packed)); + +static inline __be32 cm_sidr_rep_get_qpn(struct cm_sidr_rep_msg *sidr_rep_msg) +{ + return cpu_to_be32(be32_to_cpu(sidr_rep_msg->offset8) >> 8); +} + +static inline void cm_sidr_rep_set_qpn(struct cm_sidr_rep_msg *sidr_rep_msg, + __be32 qpn) +{ + sidr_rep_msg->offset8 = cpu_to_be32((be32_to_cpu(qpn) << 8) | + (be32_to_cpu(sidr_rep_msg->offset8) & + 0x000000FF)); +} + +#endif /* CM_MSGS_H */ diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c new file mode 100644 index 0000000..d570030 --- /dev/null +++ b/drivers/infiniband/core/cma.c @@ -0,0 +1,3703 @@ +/* + * Copyright (c) 2005 Voltaire Inc. All rights reserved. + * Copyright (c) 2002-2005, Network Appliance, Inc. All rights reserved. + * Copyright (c) 1999-2005, Mellanox Technologies, Inc. All rights reserved. + * Copyright (c) 2005-2006 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +MODULE_AUTHOR("Sean Hefty"); +MODULE_DESCRIPTION("Generic RDMA CM Agent"); +MODULE_LICENSE("Dual BSD/GPL"); + +#define CMA_CM_RESPONSE_TIMEOUT 20 +#define CMA_MAX_CM_RETRIES 15 +#define CMA_CM_MRA_SETTING (IB_CM_MRA_FLAG_DELAY | 24) +#define CMA_IBOE_PACKET_LIFETIME 18 + +static void cma_add_one(struct ib_device *device); +static void cma_remove_one(struct ib_device *device); + +static struct ib_client cma_client = { + .name = "cma", + .add = cma_add_one, + .remove = cma_remove_one +}; + +static struct ib_sa_client sa_client; +static struct rdma_addr_client addr_client; +static LIST_HEAD(dev_list); +static LIST_HEAD(listen_any_list); +static DEFINE_MUTEX(lock); +static struct workqueue_struct *cma_wq; +static DEFINE_IDR(tcp_ps); +static DEFINE_IDR(udp_ps); +static DEFINE_IDR(ipoib_ps); +static DEFINE_IDR(ib_ps); + +struct cma_device { + struct list_head list; + struct ib_device *device; + struct completion comp; + atomic_t refcount; + struct list_head id_list; +}; + +struct rdma_bind_list { + struct idr *ps; + struct hlist_head owners; + unsigned short port; +}; + +enum { + CMA_OPTION_AFONLY, +}; + +/* + * Device removal can occur at anytime, so we need extra handling to + * serialize notifying the user of device removal with other callbacks. + * We do this by disabling removal notification while a callback is in process, + * and reporting it after the callback completes. + */ +struct rdma_id_private { + struct rdma_cm_id id; + + struct rdma_bind_list *bind_list; + struct hlist_node node; + struct list_head list; /* listen_any_list or cma_device.list */ + struct list_head listen_list; /* per device listens */ + struct cma_device *cma_dev; + struct list_head mc_list; + + int internal_id; + enum rdma_cm_state state; + spinlock_t lock; + struct mutex qp_mutex; + + struct completion comp; + atomic_t refcount; + struct mutex handler_mutex; + + int backlog; + int timeout_ms; + struct ib_sa_query *query; + int query_id; + union { + struct ib_cm_id *ib; + struct iw_cm_id *iw; + } cm_id; + + u32 seq_num; + u32 qkey; + u32 qp_num; + pid_t owner; + u32 options; + u8 srq; + u8 tos; + u8 reuseaddr; + u8 afonly; +}; + +struct cma_multicast { + struct rdma_id_private *id_priv; + union { + struct ib_sa_multicast *ib; + } multicast; + struct list_head list; + void *context; + struct sockaddr_storage addr; + struct kref mcref; +}; + +struct cma_work { + struct work_struct work; + struct rdma_id_private *id; + enum rdma_cm_state old_state; + enum rdma_cm_state new_state; + struct rdma_cm_event event; +}; + +struct cma_ndev_work { + struct work_struct work; + struct rdma_id_private *id; + struct rdma_cm_event event; +}; + +struct iboe_mcast_work { + struct work_struct work; + struct rdma_id_private *id; + struct cma_multicast *mc; +}; + +union cma_ip_addr { + struct in6_addr ip6; + struct { + __be32 pad[3]; + __be32 addr; + } ip4; +}; + +struct cma_hdr { + u8 cma_version; + u8 ip_version; /* IP version: 7:4 */ + __be16 port; + union cma_ip_addr src_addr; + union cma_ip_addr dst_addr; +}; + +#define CMA_VERSION 0x00 + +static int cma_comp(struct rdma_id_private *id_priv, enum rdma_cm_state comp) +{ + unsigned long flags; + int ret; + + spin_lock_irqsave(&id_priv->lock, flags); + ret = (id_priv->state == comp); + spin_unlock_irqrestore(&id_priv->lock, flags); + return ret; +} + +static int cma_comp_exch(struct rdma_id_private *id_priv, + enum rdma_cm_state comp, enum rdma_cm_state exch) +{ + unsigned long flags; + int ret; + + spin_lock_irqsave(&id_priv->lock, flags); + if ((ret = (id_priv->state == comp))) + id_priv->state = exch; + spin_unlock_irqrestore(&id_priv->lock, flags); + return ret; +} + +static enum rdma_cm_state cma_exch(struct rdma_id_private *id_priv, + enum rdma_cm_state exch) +{ + unsigned long flags; + enum rdma_cm_state old; + + spin_lock_irqsave(&id_priv->lock, flags); + old = id_priv->state; + id_priv->state = exch; + spin_unlock_irqrestore(&id_priv->lock, flags); + return old; +} + +static inline u8 cma_get_ip_ver(struct cma_hdr *hdr) +{ + return hdr->ip_version >> 4; +} + +static inline void cma_set_ip_ver(struct cma_hdr *hdr, u8 ip_ver) +{ + hdr->ip_version = (ip_ver << 4) | (hdr->ip_version & 0xF); +} + +static void cma_attach_to_dev(struct rdma_id_private *id_priv, + struct cma_device *cma_dev) +{ + atomic_inc(&cma_dev->refcount); + id_priv->cma_dev = cma_dev; + id_priv->id.device = cma_dev->device; + id_priv->id.route.addr.dev_addr.transport = + rdma_node_get_transport(cma_dev->device->node_type); + list_add_tail(&id_priv->list, &cma_dev->id_list); +} + +static inline void cma_deref_dev(struct cma_device *cma_dev) +{ + if (atomic_dec_and_test(&cma_dev->refcount)) + complete(&cma_dev->comp); +} + +static inline void release_mc(struct kref *kref) +{ + struct cma_multicast *mc = container_of(kref, struct cma_multicast, mcref); + + kfree(mc->multicast.ib); + kfree(mc); +} + +static void cma_release_dev(struct rdma_id_private *id_priv) +{ + mutex_lock(&lock); + list_del(&id_priv->list); + cma_deref_dev(id_priv->cma_dev); + id_priv->cma_dev = NULL; + mutex_unlock(&lock); +} + +static inline struct sockaddr *cma_src_addr(struct rdma_id_private *id_priv) +{ + return (struct sockaddr *) &id_priv->id.route.addr.src_addr; +} + +static inline struct sockaddr *cma_dst_addr(struct rdma_id_private *id_priv) +{ + return (struct sockaddr *) &id_priv->id.route.addr.dst_addr; +} + +static inline unsigned short cma_family(struct rdma_id_private *id_priv) +{ + return id_priv->id.route.addr.src_addr.ss_family; +} + +static int cma_set_qkey(struct rdma_id_private *id_priv, u32 qkey) +{ + struct ib_sa_mcmember_rec rec; + int ret = 0; + + if (id_priv->qkey) { + if (qkey && id_priv->qkey != qkey) + return -EINVAL; + return 0; + } + + if (qkey) { + id_priv->qkey = qkey; + return 0; + } + + switch (id_priv->id.ps) { + case RDMA_PS_UDP: + case RDMA_PS_IB: + id_priv->qkey = RDMA_UDP_QKEY; + break; + case RDMA_PS_IPOIB: + ib_addr_get_mgid(&id_priv->id.route.addr.dev_addr, &rec.mgid); + ret = ib_sa_get_mcmember_rec(id_priv->id.device, + id_priv->id.port_num, &rec.mgid, + &rec); + if (!ret) + id_priv->qkey = be32_to_cpu(rec.qkey); + break; + default: + break; + } + return ret; +} + +static void cma_translate_ib(struct sockaddr_ib *sib, struct rdma_dev_addr *dev_addr) +{ + dev_addr->dev_type = ARPHRD_INFINIBAND; + rdma_addr_set_sgid(dev_addr, (union ib_gid *) &sib->sib_addr); + ib_addr_set_pkey(dev_addr, ntohs(sib->sib_pkey)); +} + +static int cma_translate_addr(struct sockaddr *addr, struct rdma_dev_addr *dev_addr) +{ + int ret; + + if (addr->sa_family != AF_IB) { + ret = rdma_translate_ip(addr, dev_addr, NULL); + } else { + cma_translate_ib((struct sockaddr_ib *) addr, dev_addr); + ret = 0; + } + + return ret; +} + +static int cma_acquire_dev(struct rdma_id_private *id_priv, + struct rdma_id_private *listen_id_priv) +{ + struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; + struct cma_device *cma_dev; + union ib_gid gid, iboe_gid; + int ret = -ENODEV; + u8 port, found_port; + enum rdma_link_layer dev_ll = dev_addr->dev_type == ARPHRD_INFINIBAND ? + IB_LINK_LAYER_INFINIBAND : IB_LINK_LAYER_ETHERNET; + + if (dev_ll != IB_LINK_LAYER_INFINIBAND && + id_priv->id.ps == RDMA_PS_IPOIB) + return -EINVAL; + + mutex_lock(&lock); + rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr, + &iboe_gid); + + memcpy(&gid, dev_addr->src_dev_addr + + rdma_addr_gid_offset(dev_addr), sizeof gid); + if (listen_id_priv && + rdma_port_get_link_layer(listen_id_priv->id.device, + listen_id_priv->id.port_num) == dev_ll) { + cma_dev = listen_id_priv->cma_dev; + port = listen_id_priv->id.port_num; + if (rdma_node_get_transport(cma_dev->device->node_type) == RDMA_TRANSPORT_IB && + rdma_port_get_link_layer(cma_dev->device, port) == IB_LINK_LAYER_ETHERNET) + ret = ib_find_cached_gid(cma_dev->device, &iboe_gid, + &found_port, NULL); + else + ret = ib_find_cached_gid(cma_dev->device, &gid, + &found_port, NULL); + + if (!ret && (port == found_port)) { + id_priv->id.port_num = found_port; + goto out; + } + } + list_for_each_entry(cma_dev, &dev_list, list) { + for (port = 1; port <= cma_dev->device->phys_port_cnt; ++port) { + if (listen_id_priv && + listen_id_priv->cma_dev == cma_dev && + listen_id_priv->id.port_num == port) + continue; + if (rdma_port_get_link_layer(cma_dev->device, port) == dev_ll) { + if (rdma_node_get_transport(cma_dev->device->node_type) == RDMA_TRANSPORT_IB && + rdma_port_get_link_layer(cma_dev->device, port) == IB_LINK_LAYER_ETHERNET) + ret = ib_find_cached_gid(cma_dev->device, &iboe_gid, &found_port, NULL); + else + ret = ib_find_cached_gid(cma_dev->device, &gid, &found_port, NULL); + + if (!ret && (port == found_port)) { + id_priv->id.port_num = found_port; + goto out; + } + } + } + } + +out: + if (!ret) + cma_attach_to_dev(id_priv, cma_dev); + + mutex_unlock(&lock); + return ret; +} + +/* + * Select the source IB device and address to reach the destination IB address. + */ +static int cma_resolve_ib_dev(struct rdma_id_private *id_priv) +{ + struct cma_device *cma_dev, *cur_dev; + struct sockaddr_ib *addr; + union ib_gid gid, sgid, *dgid; + u16 pkey, index; + u8 p; + int i; + + cma_dev = NULL; + addr = (struct sockaddr_ib *) cma_dst_addr(id_priv); + dgid = (union ib_gid *) &addr->sib_addr; + pkey = ntohs(addr->sib_pkey); + + list_for_each_entry(cur_dev, &dev_list, list) { + if (rdma_node_get_transport(cur_dev->device->node_type) != RDMA_TRANSPORT_IB) + continue; + + for (p = 1; p <= cur_dev->device->phys_port_cnt; ++p) { + if (ib_find_cached_pkey(cur_dev->device, p, pkey, &index)) + continue; + + for (i = 0; !ib_get_cached_gid(cur_dev->device, p, i, &gid); i++) { + if (!memcmp(&gid, dgid, sizeof(gid))) { + cma_dev = cur_dev; + sgid = gid; + id_priv->id.port_num = p; + goto found; + } + + if (!cma_dev && (gid.global.subnet_prefix == + dgid->global.subnet_prefix)) { + cma_dev = cur_dev; + sgid = gid; + id_priv->id.port_num = p; + } + } + } + } + + if (!cma_dev) + return -ENODEV; + +found: + cma_attach_to_dev(id_priv, cma_dev); + addr = (struct sockaddr_ib *) cma_src_addr(id_priv); + memcpy(&addr->sib_addr, &sgid, sizeof sgid); + cma_translate_ib(addr, &id_priv->id.route.addr.dev_addr); + return 0; +} + +static void cma_deref_id(struct rdma_id_private *id_priv) +{ + if (atomic_dec_and_test(&id_priv->refcount)) + complete(&id_priv->comp); +} + +static int cma_disable_callback(struct rdma_id_private *id_priv, + enum rdma_cm_state state) +{ + mutex_lock(&id_priv->handler_mutex); + if (id_priv->state != state) { + mutex_unlock(&id_priv->handler_mutex); + return -EINVAL; + } + return 0; +} + +struct rdma_cm_id *rdma_create_id(rdma_cm_event_handler event_handler, + void *context, enum rdma_port_space ps, + enum ib_qp_type qp_type) +{ + struct rdma_id_private *id_priv; + + id_priv = kzalloc(sizeof *id_priv, GFP_KERNEL); + if (!id_priv) + return ERR_PTR(-ENOMEM); + + id_priv->owner = task_pid_nr(current); + id_priv->state = RDMA_CM_IDLE; + id_priv->id.context = context; + id_priv->id.event_handler = event_handler; + id_priv->id.ps = ps; + id_priv->id.qp_type = qp_type; + spin_lock_init(&id_priv->lock); + mutex_init(&id_priv->qp_mutex); + init_completion(&id_priv->comp); + atomic_set(&id_priv->refcount, 1); + mutex_init(&id_priv->handler_mutex); + INIT_LIST_HEAD(&id_priv->listen_list); + INIT_LIST_HEAD(&id_priv->mc_list); + get_random_bytes(&id_priv->seq_num, sizeof id_priv->seq_num); + + return &id_priv->id; +} +EXPORT_SYMBOL(rdma_create_id); + +static int cma_init_ud_qp(struct rdma_id_private *id_priv, struct ib_qp *qp) +{ + struct ib_qp_attr qp_attr; + int qp_attr_mask, ret; + + qp_attr.qp_state = IB_QPS_INIT; + ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask); + if (ret) + return ret; + + ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask); + if (ret) + return ret; + + qp_attr.qp_state = IB_QPS_RTR; + ret = ib_modify_qp(qp, &qp_attr, IB_QP_STATE); + if (ret) + return ret; + + qp_attr.qp_state = IB_QPS_RTS; + qp_attr.sq_psn = 0; + ret = ib_modify_qp(qp, &qp_attr, IB_QP_STATE | IB_QP_SQ_PSN); + + return ret; +} + +static int cma_init_conn_qp(struct rdma_id_private *id_priv, struct ib_qp *qp) +{ + struct ib_qp_attr qp_attr; + int qp_attr_mask, ret; + + qp_attr.qp_state = IB_QPS_INIT; + ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask); + if (ret) + return ret; + + return ib_modify_qp(qp, &qp_attr, qp_attr_mask); +} + +int rdma_create_qp(struct rdma_cm_id *id, struct ib_pd *pd, + struct ib_qp_init_attr *qp_init_attr) +{ + struct rdma_id_private *id_priv; + struct ib_qp *qp; + int ret; + + id_priv = container_of(id, struct rdma_id_private, id); + if (id->device != pd->device) + return -EINVAL; + + qp = ib_create_qp(pd, qp_init_attr); + if (IS_ERR(qp)) + return PTR_ERR(qp); + + if (id->qp_type == IB_QPT_UD) + ret = cma_init_ud_qp(id_priv, qp); + else + ret = cma_init_conn_qp(id_priv, qp); + if (ret) + goto err; + + id->qp = qp; + id_priv->qp_num = qp->qp_num; + id_priv->srq = (qp->srq != NULL); + return 0; +err: + ib_destroy_qp(qp); + return ret; +} +EXPORT_SYMBOL(rdma_create_qp); + +void rdma_destroy_qp(struct rdma_cm_id *id) +{ + struct rdma_id_private *id_priv; + + id_priv = container_of(id, struct rdma_id_private, id); + mutex_lock(&id_priv->qp_mutex); + ib_destroy_qp(id_priv->id.qp); + id_priv->id.qp = NULL; + mutex_unlock(&id_priv->qp_mutex); +} +EXPORT_SYMBOL(rdma_destroy_qp); + +static int cma_modify_qp_rtr(struct rdma_id_private *id_priv, + struct rdma_conn_param *conn_param) +{ + struct ib_qp_attr qp_attr; + int qp_attr_mask, ret; + union ib_gid sgid; + + mutex_lock(&id_priv->qp_mutex); + if (!id_priv->id.qp) { + ret = 0; + goto out; + } + + /* Need to update QP attributes from default values. */ + qp_attr.qp_state = IB_QPS_INIT; + ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask); + if (ret) + goto out; + + ret = ib_modify_qp(id_priv->id.qp, &qp_attr, qp_attr_mask); + if (ret) + goto out; + + qp_attr.qp_state = IB_QPS_RTR; + ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask); + if (ret) + goto out; + + ret = ib_query_gid(id_priv->id.device, id_priv->id.port_num, + qp_attr.ah_attr.grh.sgid_index, &sgid); + if (ret) + goto out; + + if (rdma_node_get_transport(id_priv->cma_dev->device->node_type) + == RDMA_TRANSPORT_IB && + rdma_port_get_link_layer(id_priv->id.device, id_priv->id.port_num) + == IB_LINK_LAYER_ETHERNET) { + ret = rdma_addr_find_smac_by_sgid(&sgid, qp_attr.smac, NULL); + + if (ret) + goto out; + } + if (conn_param) + qp_attr.max_dest_rd_atomic = conn_param->responder_resources; + ret = ib_modify_qp(id_priv->id.qp, &qp_attr, qp_attr_mask); +out: + mutex_unlock(&id_priv->qp_mutex); + return ret; +} + +static int cma_modify_qp_rts(struct rdma_id_private *id_priv, + struct rdma_conn_param *conn_param) +{ + struct ib_qp_attr qp_attr; + int qp_attr_mask, ret; + + mutex_lock(&id_priv->qp_mutex); + if (!id_priv->id.qp) { + ret = 0; + goto out; + } + + qp_attr.qp_state = IB_QPS_RTS; + ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask); + if (ret) + goto out; + + if (conn_param) + qp_attr.max_rd_atomic = conn_param->initiator_depth; + ret = ib_modify_qp(id_priv->id.qp, &qp_attr, qp_attr_mask); +out: + mutex_unlock(&id_priv->qp_mutex); + return ret; +} + +static int cma_modify_qp_err(struct rdma_id_private *id_priv) +{ + struct ib_qp_attr qp_attr; + int ret; + + mutex_lock(&id_priv->qp_mutex); + if (!id_priv->id.qp) { + ret = 0; + goto out; + } + + qp_attr.qp_state = IB_QPS_ERR; + ret = ib_modify_qp(id_priv->id.qp, &qp_attr, IB_QP_STATE); +out: + mutex_unlock(&id_priv->qp_mutex); + return ret; +} + +static int cma_ib_init_qp_attr(struct rdma_id_private *id_priv, + struct ib_qp_attr *qp_attr, int *qp_attr_mask) +{ + struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; + int ret; + u16 pkey; + + if (rdma_port_get_link_layer(id_priv->id.device, id_priv->id.port_num) == + IB_LINK_LAYER_INFINIBAND) + pkey = ib_addr_get_pkey(dev_addr); + else + pkey = 0xffff; + + ret = ib_find_cached_pkey(id_priv->id.device, id_priv->id.port_num, + pkey, &qp_attr->pkey_index); + if (ret) + return ret; + + qp_attr->port_num = id_priv->id.port_num; + *qp_attr_mask = IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_PORT; + + if (id_priv->id.qp_type == IB_QPT_UD) { + ret = cma_set_qkey(id_priv, 0); + if (ret) + return ret; + + qp_attr->qkey = id_priv->qkey; + *qp_attr_mask |= IB_QP_QKEY; + } else { + qp_attr->qp_access_flags = 0; + *qp_attr_mask |= IB_QP_ACCESS_FLAGS; + } + return 0; +} + +int rdma_init_qp_attr(struct rdma_cm_id *id, struct ib_qp_attr *qp_attr, + int *qp_attr_mask) +{ + struct rdma_id_private *id_priv; + int ret = 0; + + id_priv = container_of(id, struct rdma_id_private, id); + switch (rdma_node_get_transport(id_priv->id.device->node_type)) { + case RDMA_TRANSPORT_IB: + if (!id_priv->cm_id.ib || (id_priv->id.qp_type == IB_QPT_UD)) + ret = cma_ib_init_qp_attr(id_priv, qp_attr, qp_attr_mask); + else + ret = ib_cm_init_qp_attr(id_priv->cm_id.ib, qp_attr, + qp_attr_mask); + + if (qp_attr->qp_state == IB_QPS_RTR) + qp_attr->rq_psn = id_priv->seq_num; + break; + case RDMA_TRANSPORT_IWARP: + if (!id_priv->cm_id.iw) { + qp_attr->qp_access_flags = 0; + *qp_attr_mask = IB_QP_STATE | IB_QP_ACCESS_FLAGS; + } else + ret = iw_cm_init_qp_attr(id_priv->cm_id.iw, qp_attr, + qp_attr_mask); + break; + default: + ret = -ENOSYS; + break; + } + + return ret; +} +EXPORT_SYMBOL(rdma_init_qp_attr); + +static inline int cma_zero_addr(struct sockaddr *addr) +{ + switch (addr->sa_family) { + case AF_INET: + return ipv4_is_zeronet(((struct sockaddr_in *)addr)->sin_addr.s_addr); + case AF_INET6: + return ipv6_addr_any(&((struct sockaddr_in6 *) addr)->sin6_addr); + case AF_IB: + return ib_addr_any(&((struct sockaddr_ib *) addr)->sib_addr); + default: + return 0; + } +} + +static inline int cma_loopback_addr(struct sockaddr *addr) +{ + switch (addr->sa_family) { + case AF_INET: + return ipv4_is_loopback(((struct sockaddr_in *) addr)->sin_addr.s_addr); + case AF_INET6: + return ipv6_addr_loopback(&((struct sockaddr_in6 *) addr)->sin6_addr); + case AF_IB: + return ib_addr_loopback(&((struct sockaddr_ib *) addr)->sib_addr); + default: + return 0; + } +} + +static inline int cma_any_addr(struct sockaddr *addr) +{ + return cma_zero_addr(addr) || cma_loopback_addr(addr); +} + +static int cma_addr_cmp(struct sockaddr *src, struct sockaddr *dst) +{ + if (src->sa_family != dst->sa_family) + return -1; + + switch (src->sa_family) { + case AF_INET: + return ((struct sockaddr_in *) src)->sin_addr.s_addr != + ((struct sockaddr_in *) dst)->sin_addr.s_addr; + case AF_INET6: + return ipv6_addr_cmp(&((struct sockaddr_in6 *) src)->sin6_addr, + &((struct sockaddr_in6 *) dst)->sin6_addr); + default: + return ib_addr_cmp(&((struct sockaddr_ib *) src)->sib_addr, + &((struct sockaddr_ib *) dst)->sib_addr); + } +} + +static __be16 cma_port(struct sockaddr *addr) +{ + struct sockaddr_ib *sib; + + switch (addr->sa_family) { + case AF_INET: + return ((struct sockaddr_in *) addr)->sin_port; + case AF_INET6: + return ((struct sockaddr_in6 *) addr)->sin6_port; + case AF_IB: + sib = (struct sockaddr_ib *) addr; + return htons((u16) (be64_to_cpu(sib->sib_sid) & + be64_to_cpu(sib->sib_sid_mask))); + default: + return 0; + } +} + +static inline int cma_any_port(struct sockaddr *addr) +{ + return !cma_port(addr); +} + +static void cma_save_ib_info(struct rdma_cm_id *id, struct rdma_cm_id *listen_id, + struct ib_sa_path_rec *path) +{ + struct sockaddr_ib *listen_ib, *ib; + + listen_ib = (struct sockaddr_ib *) &listen_id->route.addr.src_addr; + ib = (struct sockaddr_ib *) &id->route.addr.src_addr; + ib->sib_family = listen_ib->sib_family; + ib->sib_pkey = path->pkey; + ib->sib_flowinfo = path->flow_label; + memcpy(&ib->sib_addr, &path->sgid, 16); + ib->sib_sid = listen_ib->sib_sid; + ib->sib_sid_mask = cpu_to_be64(0xffffffffffffffffULL); + ib->sib_scope_id = listen_ib->sib_scope_id; + + ib = (struct sockaddr_ib *) &id->route.addr.dst_addr; + ib->sib_family = listen_ib->sib_family; + ib->sib_pkey = path->pkey; + ib->sib_flowinfo = path->flow_label; + memcpy(&ib->sib_addr, &path->dgid, 16); +} + +static void cma_save_ip4_info(struct rdma_cm_id *id, struct rdma_cm_id *listen_id, + struct cma_hdr *hdr) +{ + struct sockaddr_in *listen4, *ip4; + + listen4 = (struct sockaddr_in *) &listen_id->route.addr.src_addr; + ip4 = (struct sockaddr_in *) &id->route.addr.src_addr; + ip4->sin_family = listen4->sin_family; + ip4->sin_addr.s_addr = hdr->dst_addr.ip4.addr; + ip4->sin_port = listen4->sin_port; + + ip4 = (struct sockaddr_in *) &id->route.addr.dst_addr; + ip4->sin_family = listen4->sin_family; + ip4->sin_addr.s_addr = hdr->src_addr.ip4.addr; + ip4->sin_port = hdr->port; +} + +static void cma_save_ip6_info(struct rdma_cm_id *id, struct rdma_cm_id *listen_id, + struct cma_hdr *hdr) +{ + struct sockaddr_in6 *listen6, *ip6; + + listen6 = (struct sockaddr_in6 *) &listen_id->route.addr.src_addr; + ip6 = (struct sockaddr_in6 *) &id->route.addr.src_addr; + ip6->sin6_family = listen6->sin6_family; + ip6->sin6_addr = hdr->dst_addr.ip6; + ip6->sin6_port = listen6->sin6_port; + + ip6 = (struct sockaddr_in6 *) &id->route.addr.dst_addr; + ip6->sin6_family = listen6->sin6_family; + ip6->sin6_addr = hdr->src_addr.ip6; + ip6->sin6_port = hdr->port; +} + +static int cma_save_net_info(struct rdma_cm_id *id, struct rdma_cm_id *listen_id, + struct ib_cm_event *ib_event) +{ + struct cma_hdr *hdr; + + if ((listen_id->route.addr.src_addr.ss_family == AF_IB) && + (ib_event->event == IB_CM_REQ_RECEIVED)) { + cma_save_ib_info(id, listen_id, ib_event->param.req_rcvd.primary_path); + return 0; + } + + hdr = ib_event->private_data; + if (hdr->cma_version != CMA_VERSION) + return -EINVAL; + + switch (cma_get_ip_ver(hdr)) { + case 4: + cma_save_ip4_info(id, listen_id, hdr); + break; + case 6: + cma_save_ip6_info(id, listen_id, hdr); + break; + default: + return -EINVAL; + } + return 0; +} + +static inline int cma_user_data_offset(struct rdma_id_private *id_priv) +{ + return cma_family(id_priv) == AF_IB ? 0 : sizeof(struct cma_hdr); +} + +static void cma_cancel_route(struct rdma_id_private *id_priv) +{ + switch (rdma_port_get_link_layer(id_priv->id.device, id_priv->id.port_num)) { + case IB_LINK_LAYER_INFINIBAND: + if (id_priv->query) + ib_sa_cancel_query(id_priv->query_id, id_priv->query); + break; + default: + break; + } +} + +static void cma_cancel_listens(struct rdma_id_private *id_priv) +{ + struct rdma_id_private *dev_id_priv; + + /* + * Remove from listen_any_list to prevent added devices from spawning + * additional listen requests. + */ + mutex_lock(&lock); + list_del(&id_priv->list); + + while (!list_empty(&id_priv->listen_list)) { + dev_id_priv = list_entry(id_priv->listen_list.next, + struct rdma_id_private, listen_list); + /* sync with device removal to avoid duplicate destruction */ + list_del_init(&dev_id_priv->list); + list_del(&dev_id_priv->listen_list); + mutex_unlock(&lock); + + rdma_destroy_id(&dev_id_priv->id); + mutex_lock(&lock); + } + mutex_unlock(&lock); +} + +static void cma_cancel_operation(struct rdma_id_private *id_priv, + enum rdma_cm_state state) +{ + switch (state) { + case RDMA_CM_ADDR_QUERY: + rdma_addr_cancel(&id_priv->id.route.addr.dev_addr); + break; + case RDMA_CM_ROUTE_QUERY: + cma_cancel_route(id_priv); + break; + case RDMA_CM_LISTEN: + if (cma_any_addr(cma_src_addr(id_priv)) && !id_priv->cma_dev) + cma_cancel_listens(id_priv); + break; + default: + break; + } +} + +static void cma_release_port(struct rdma_id_private *id_priv) +{ + struct rdma_bind_list *bind_list = id_priv->bind_list; + + if (!bind_list) + return; + + mutex_lock(&lock); + hlist_del(&id_priv->node); + if (hlist_empty(&bind_list->owners)) { + idr_remove(bind_list->ps, bind_list->port); + kfree(bind_list); + } + mutex_unlock(&lock); +} + +static void cma_leave_mc_groups(struct rdma_id_private *id_priv) +{ + struct cma_multicast *mc; + + while (!list_empty(&id_priv->mc_list)) { + mc = container_of(id_priv->mc_list.next, + struct cma_multicast, list); + list_del(&mc->list); + switch (rdma_port_get_link_layer(id_priv->cma_dev->device, id_priv->id.port_num)) { + case IB_LINK_LAYER_INFINIBAND: + ib_sa_free_multicast(mc->multicast.ib); + kfree(mc); + break; + case IB_LINK_LAYER_ETHERNET: + kref_put(&mc->mcref, release_mc); + break; + default: + break; + } + } +} + +void rdma_destroy_id(struct rdma_cm_id *id) +{ + struct rdma_id_private *id_priv; + enum rdma_cm_state state; + + id_priv = container_of(id, struct rdma_id_private, id); + state = cma_exch(id_priv, RDMA_CM_DESTROYING); + cma_cancel_operation(id_priv, state); + + /* + * Wait for any active callback to finish. New callbacks will find + * the id_priv state set to destroying and abort. + */ + mutex_lock(&id_priv->handler_mutex); + mutex_unlock(&id_priv->handler_mutex); + + if (id_priv->cma_dev) { + switch (rdma_node_get_transport(id_priv->id.device->node_type)) { + case RDMA_TRANSPORT_IB: + if (id_priv->cm_id.ib) + ib_destroy_cm_id(id_priv->cm_id.ib); + break; + case RDMA_TRANSPORT_IWARP: + if (id_priv->cm_id.iw) + iw_destroy_cm_id(id_priv->cm_id.iw); + break; + default: + break; + } + cma_leave_mc_groups(id_priv); + cma_release_dev(id_priv); + } + + cma_release_port(id_priv); + cma_deref_id(id_priv); + wait_for_completion(&id_priv->comp); + + if (id_priv->internal_id) + cma_deref_id(id_priv->id.context); + + kfree(id_priv->id.route.path_rec); + kfree(id_priv); +} +EXPORT_SYMBOL(rdma_destroy_id); + +static int cma_rep_recv(struct rdma_id_private *id_priv) +{ + int ret; + + ret = cma_modify_qp_rtr(id_priv, NULL); + if (ret) + goto reject; + + ret = cma_modify_qp_rts(id_priv, NULL); + if (ret) + goto reject; + + ret = ib_send_cm_rtu(id_priv->cm_id.ib, NULL, 0); + if (ret) + goto reject; + + return 0; +reject: + cma_modify_qp_err(id_priv); + ib_send_cm_rej(id_priv->cm_id.ib, IB_CM_REJ_CONSUMER_DEFINED, + NULL, 0, NULL, 0); + return ret; +} + +static void cma_set_rep_event_data(struct rdma_cm_event *event, + struct ib_cm_rep_event_param *rep_data, + void *private_data) +{ + event->param.conn.private_data = private_data; + event->param.conn.private_data_len = IB_CM_REP_PRIVATE_DATA_SIZE; + event->param.conn.responder_resources = rep_data->responder_resources; + event->param.conn.initiator_depth = rep_data->initiator_depth; + event->param.conn.flow_control = rep_data->flow_control; + event->param.conn.rnr_retry_count = rep_data->rnr_retry_count; + event->param.conn.srq = rep_data->srq; + event->param.conn.qp_num = rep_data->remote_qpn; +} + +static int cma_ib_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event) +{ + struct rdma_id_private *id_priv = cm_id->context; + struct rdma_cm_event event; + int ret = 0; + + if ((ib_event->event != IB_CM_TIMEWAIT_EXIT && + cma_disable_callback(id_priv, RDMA_CM_CONNECT)) || + (ib_event->event == IB_CM_TIMEWAIT_EXIT && + cma_disable_callback(id_priv, RDMA_CM_DISCONNECT))) + return 0; + + memset(&event, 0, sizeof event); + switch (ib_event->event) { + case IB_CM_REQ_ERROR: + case IB_CM_REP_ERROR: + event.event = RDMA_CM_EVENT_UNREACHABLE; + event.status = -ETIMEDOUT; + break; + case IB_CM_REP_RECEIVED: + if (id_priv->id.qp) { + event.status = cma_rep_recv(id_priv); + event.event = event.status ? RDMA_CM_EVENT_CONNECT_ERROR : + RDMA_CM_EVENT_ESTABLISHED; + } else { + event.event = RDMA_CM_EVENT_CONNECT_RESPONSE; + } + cma_set_rep_event_data(&event, &ib_event->param.rep_rcvd, + ib_event->private_data); + break; + case IB_CM_RTU_RECEIVED: + case IB_CM_USER_ESTABLISHED: + event.event = RDMA_CM_EVENT_ESTABLISHED; + break; + case IB_CM_DREQ_ERROR: + event.status = -ETIMEDOUT; /* fall through */ + case IB_CM_DREQ_RECEIVED: + case IB_CM_DREP_RECEIVED: + if (!cma_comp_exch(id_priv, RDMA_CM_CONNECT, + RDMA_CM_DISCONNECT)) + goto out; + event.event = RDMA_CM_EVENT_DISCONNECTED; + break; + case IB_CM_TIMEWAIT_EXIT: + event.event = RDMA_CM_EVENT_TIMEWAIT_EXIT; + break; + case IB_CM_MRA_RECEIVED: + /* ignore event */ + goto out; + case IB_CM_REJ_RECEIVED: + cma_modify_qp_err(id_priv); + event.status = ib_event->param.rej_rcvd.reason; + event.event = RDMA_CM_EVENT_REJECTED; + event.param.conn.private_data = ib_event->private_data; + event.param.conn.private_data_len = IB_CM_REJ_PRIVATE_DATA_SIZE; + break; + default: + printk(KERN_ERR "RDMA CMA: unexpected IB CM event: %d\n", + ib_event->event); + goto out; + } + + ret = id_priv->id.event_handler(&id_priv->id, &event); + if (ret) { + /* Destroy the CM ID by returning a non-zero value. */ + id_priv->cm_id.ib = NULL; + cma_exch(id_priv, RDMA_CM_DESTROYING); + mutex_unlock(&id_priv->handler_mutex); + rdma_destroy_id(&id_priv->id); + return ret; + } +out: + mutex_unlock(&id_priv->handler_mutex); + return ret; +} + +static struct rdma_id_private *cma_new_conn_id(struct rdma_cm_id *listen_id, + struct ib_cm_event *ib_event) +{ + struct rdma_id_private *id_priv; + struct rdma_cm_id *id; + struct rdma_route *rt; + int ret; + + id = rdma_create_id(listen_id->event_handler, listen_id->context, + listen_id->ps, ib_event->param.req_rcvd.qp_type); + if (IS_ERR(id)) + return NULL; + + id_priv = container_of(id, struct rdma_id_private, id); + if (cma_save_net_info(id, listen_id, ib_event)) + goto err; + + rt = &id->route; + rt->num_paths = ib_event->param.req_rcvd.alternate_path ? 2 : 1; + rt->path_rec = kmalloc(sizeof *rt->path_rec * rt->num_paths, + GFP_KERNEL); + if (!rt->path_rec) + goto err; + + rt->path_rec[0] = *ib_event->param.req_rcvd.primary_path; + if (rt->num_paths == 2) + rt->path_rec[1] = *ib_event->param.req_rcvd.alternate_path; + + if (cma_any_addr(cma_src_addr(id_priv))) { + rt->addr.dev_addr.dev_type = ARPHRD_INFINIBAND; + rdma_addr_set_sgid(&rt->addr.dev_addr, &rt->path_rec[0].sgid); + ib_addr_set_pkey(&rt->addr.dev_addr, be16_to_cpu(rt->path_rec[0].pkey)); + } else { + ret = cma_translate_addr(cma_src_addr(id_priv), &rt->addr.dev_addr); + if (ret) + goto err; + } + rdma_addr_set_dgid(&rt->addr.dev_addr, &rt->path_rec[0].dgid); + + id_priv->state = RDMA_CM_CONNECT; + return id_priv; + +err: + rdma_destroy_id(id); + return NULL; +} + +static struct rdma_id_private *cma_new_udp_id(struct rdma_cm_id *listen_id, + struct ib_cm_event *ib_event) +{ + struct rdma_id_private *id_priv; + struct rdma_cm_id *id; + int ret; + + id = rdma_create_id(listen_id->event_handler, listen_id->context, + listen_id->ps, IB_QPT_UD); + if (IS_ERR(id)) + return NULL; + + id_priv = container_of(id, struct rdma_id_private, id); + if (cma_save_net_info(id, listen_id, ib_event)) + goto err; + + if (!cma_any_addr((struct sockaddr *) &id->route.addr.src_addr)) { + ret = cma_translate_addr(cma_src_addr(id_priv), &id->route.addr.dev_addr); + if (ret) + goto err; + } + + id_priv->state = RDMA_CM_CONNECT; + return id_priv; +err: + rdma_destroy_id(id); + return NULL; +} + +static void cma_set_req_event_data(struct rdma_cm_event *event, + struct ib_cm_req_event_param *req_data, + void *private_data, int offset) +{ + event->param.conn.private_data = private_data + offset; + event->param.conn.private_data_len = IB_CM_REQ_PRIVATE_DATA_SIZE - offset; + event->param.conn.responder_resources = req_data->responder_resources; + event->param.conn.initiator_depth = req_data->initiator_depth; + event->param.conn.flow_control = req_data->flow_control; + event->param.conn.retry_count = req_data->retry_count; + event->param.conn.rnr_retry_count = req_data->rnr_retry_count; + event->param.conn.srq = req_data->srq; + event->param.conn.qp_num = req_data->remote_qpn; +} + +static int cma_check_req_qp_type(struct rdma_cm_id *id, struct ib_cm_event *ib_event) +{ + return (((ib_event->event == IB_CM_REQ_RECEIVED) && + (ib_event->param.req_rcvd.qp_type == id->qp_type)) || + ((ib_event->event == IB_CM_SIDR_REQ_RECEIVED) && + (id->qp_type == IB_QPT_UD)) || + (!id->qp_type)); +} + +static int cma_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event) +{ + struct rdma_id_private *listen_id, *conn_id; + struct rdma_cm_event event; + int offset, ret; + + listen_id = cm_id->context; + if (!cma_check_req_qp_type(&listen_id->id, ib_event)) + return -EINVAL; + + if (cma_disable_callback(listen_id, RDMA_CM_LISTEN)) + return -ECONNABORTED; + + memset(&event, 0, sizeof event); + offset = cma_user_data_offset(listen_id); + event.event = RDMA_CM_EVENT_CONNECT_REQUEST; + if (ib_event->event == IB_CM_SIDR_REQ_RECEIVED) { + conn_id = cma_new_udp_id(&listen_id->id, ib_event); + event.param.ud.private_data = ib_event->private_data + offset; + event.param.ud.private_data_len = + IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE - offset; + } else { + conn_id = cma_new_conn_id(&listen_id->id, ib_event); + cma_set_req_event_data(&event, &ib_event->param.req_rcvd, + ib_event->private_data, offset); + } + if (!conn_id) { + ret = -ENOMEM; + goto err1; + } + + mutex_lock_nested(&conn_id->handler_mutex, SINGLE_DEPTH_NESTING); + ret = cma_acquire_dev(conn_id, listen_id); + if (ret) + goto err2; + + conn_id->cm_id.ib = cm_id; + cm_id->context = conn_id; + cm_id->cm_handler = cma_ib_handler; + + /* + * Protect against the user destroying conn_id from another thread + * until we're done accessing it. + */ + atomic_inc(&conn_id->refcount); + ret = conn_id->id.event_handler(&conn_id->id, &event); + if (ret) + goto err3; + /* + * Acquire mutex to prevent user executing rdma_destroy_id() + * while we're accessing the cm_id. + */ + mutex_lock(&lock); + if (cma_comp(conn_id, RDMA_CM_CONNECT) && + (conn_id->id.qp_type != IB_QPT_UD)) + ib_send_cm_mra(cm_id, CMA_CM_MRA_SETTING, NULL, 0); + mutex_unlock(&lock); + mutex_unlock(&conn_id->handler_mutex); + mutex_unlock(&listen_id->handler_mutex); + cma_deref_id(conn_id); + return 0; + +err3: + cma_deref_id(conn_id); + /* Destroy the CM ID by returning a non-zero value. */ + conn_id->cm_id.ib = NULL; +err2: + cma_exch(conn_id, RDMA_CM_DESTROYING); + mutex_unlock(&conn_id->handler_mutex); +err1: + mutex_unlock(&listen_id->handler_mutex); + if (conn_id) + rdma_destroy_id(&conn_id->id); + return ret; +} + +__be64 rdma_get_service_id(struct rdma_cm_id *id, struct sockaddr *addr) +{ + if (addr->sa_family == AF_IB) + return ((struct sockaddr_ib *) addr)->sib_sid; + + return cpu_to_be64(((u64)id->ps << 16) + be16_to_cpu(cma_port(addr))); +} +EXPORT_SYMBOL(rdma_get_service_id); + +static void cma_set_compare_data(enum rdma_port_space ps, struct sockaddr *addr, + struct ib_cm_compare_data *compare) +{ + struct cma_hdr *cma_data, *cma_mask; + __be32 ip4_addr; + struct in6_addr ip6_addr; + + memset(compare, 0, sizeof *compare); + cma_data = (void *) compare->data; + cma_mask = (void *) compare->mask; + + switch (addr->sa_family) { + case AF_INET: + ip4_addr = ((struct sockaddr_in *) addr)->sin_addr.s_addr; + cma_set_ip_ver(cma_data, 4); + cma_set_ip_ver(cma_mask, 0xF); + if (!cma_any_addr(addr)) { + cma_data->dst_addr.ip4.addr = ip4_addr; + cma_mask->dst_addr.ip4.addr = htonl(~0); + } + break; + case AF_INET6: + ip6_addr = ((struct sockaddr_in6 *) addr)->sin6_addr; + cma_set_ip_ver(cma_data, 6); + cma_set_ip_ver(cma_mask, 0xF); + if (!cma_any_addr(addr)) { + cma_data->dst_addr.ip6 = ip6_addr; + memset(&cma_mask->dst_addr.ip6, 0xFF, + sizeof cma_mask->dst_addr.ip6); + } + break; + default: + break; + } +} + +static int cma_iw_handler(struct iw_cm_id *iw_id, struct iw_cm_event *iw_event) +{ + struct rdma_id_private *id_priv = iw_id->context; + struct rdma_cm_event event; + int ret = 0; + struct sockaddr *laddr = (struct sockaddr *)&iw_event->local_addr; + struct sockaddr *raddr = (struct sockaddr *)&iw_event->remote_addr; + + if (cma_disable_callback(id_priv, RDMA_CM_CONNECT)) + return 0; + + memset(&event, 0, sizeof event); + switch (iw_event->event) { + case IW_CM_EVENT_CLOSE: + event.event = RDMA_CM_EVENT_DISCONNECTED; + break; + case IW_CM_EVENT_CONNECT_REPLY: + memcpy(cma_src_addr(id_priv), laddr, + rdma_addr_size(laddr)); + memcpy(cma_dst_addr(id_priv), raddr, + rdma_addr_size(raddr)); + switch (iw_event->status) { + case 0: + event.event = RDMA_CM_EVENT_ESTABLISHED; + event.param.conn.initiator_depth = iw_event->ird; + event.param.conn.responder_resources = iw_event->ord; + break; + case -ECONNRESET: + case -ECONNREFUSED: + event.event = RDMA_CM_EVENT_REJECTED; + break; + case -ETIMEDOUT: + event.event = RDMA_CM_EVENT_UNREACHABLE; + break; + default: + event.event = RDMA_CM_EVENT_CONNECT_ERROR; + break; + } + break; + case IW_CM_EVENT_ESTABLISHED: + event.event = RDMA_CM_EVENT_ESTABLISHED; + event.param.conn.initiator_depth = iw_event->ird; + event.param.conn.responder_resources = iw_event->ord; + break; + default: + BUG_ON(1); + } + + event.status = iw_event->status; + event.param.conn.private_data = iw_event->private_data; + event.param.conn.private_data_len = iw_event->private_data_len; + ret = id_priv->id.event_handler(&id_priv->id, &event); + if (ret) { + /* Destroy the CM ID by returning a non-zero value. */ + id_priv->cm_id.iw = NULL; + cma_exch(id_priv, RDMA_CM_DESTROYING); + mutex_unlock(&id_priv->handler_mutex); + rdma_destroy_id(&id_priv->id); + return ret; + } + + mutex_unlock(&id_priv->handler_mutex); + return ret; +} + +static int iw_conn_req_handler(struct iw_cm_id *cm_id, + struct iw_cm_event *iw_event) +{ + struct rdma_cm_id *new_cm_id; + struct rdma_id_private *listen_id, *conn_id; + struct rdma_cm_event event; + int ret; + struct ib_device_attr attr; + struct sockaddr *laddr = (struct sockaddr *)&iw_event->local_addr; + struct sockaddr *raddr = (struct sockaddr *)&iw_event->remote_addr; + + listen_id = cm_id->context; + if (cma_disable_callback(listen_id, RDMA_CM_LISTEN)) + return -ECONNABORTED; + + /* Create a new RDMA id for the new IW CM ID */ + new_cm_id = rdma_create_id(listen_id->id.event_handler, + listen_id->id.context, + RDMA_PS_TCP, IB_QPT_RC); + if (IS_ERR(new_cm_id)) { + ret = -ENOMEM; + goto out; + } + conn_id = container_of(new_cm_id, struct rdma_id_private, id); + mutex_lock_nested(&conn_id->handler_mutex, SINGLE_DEPTH_NESTING); + conn_id->state = RDMA_CM_CONNECT; + + ret = rdma_translate_ip(laddr, &conn_id->id.route.addr.dev_addr, NULL); + if (ret) { + mutex_unlock(&conn_id->handler_mutex); + rdma_destroy_id(new_cm_id); + goto out; + } + + ret = cma_acquire_dev(conn_id, listen_id); + if (ret) { + mutex_unlock(&conn_id->handler_mutex); + rdma_destroy_id(new_cm_id); + goto out; + } + + conn_id->cm_id.iw = cm_id; + cm_id->context = conn_id; + cm_id->cm_handler = cma_iw_handler; + + memcpy(cma_src_addr(conn_id), laddr, rdma_addr_size(laddr)); + memcpy(cma_dst_addr(conn_id), raddr, rdma_addr_size(raddr)); + + ret = ib_query_device(conn_id->id.device, &attr); + if (ret) { + mutex_unlock(&conn_id->handler_mutex); + rdma_destroy_id(new_cm_id); + goto out; + } + + memset(&event, 0, sizeof event); + event.event = RDMA_CM_EVENT_CONNECT_REQUEST; + event.param.conn.private_data = iw_event->private_data; + event.param.conn.private_data_len = iw_event->private_data_len; + event.param.conn.initiator_depth = iw_event->ird; + event.param.conn.responder_resources = iw_event->ord; + + /* + * Protect against the user destroying conn_id from another thread + * until we're done accessing it. + */ + atomic_inc(&conn_id->refcount); + ret = conn_id->id.event_handler(&conn_id->id, &event); + if (ret) { + /* User wants to destroy the CM ID */ + conn_id->cm_id.iw = NULL; + cma_exch(conn_id, RDMA_CM_DESTROYING); + mutex_unlock(&conn_id->handler_mutex); + cma_deref_id(conn_id); + rdma_destroy_id(&conn_id->id); + goto out; + } + + mutex_unlock(&conn_id->handler_mutex); + cma_deref_id(conn_id); + +out: + mutex_unlock(&listen_id->handler_mutex); + return ret; +} + +static int cma_ib_listen(struct rdma_id_private *id_priv) +{ + struct ib_cm_compare_data compare_data; + struct sockaddr *addr; + struct ib_cm_id *id; + __be64 svc_id; + int ret; + + id = ib_create_cm_id(id_priv->id.device, cma_req_handler, id_priv); + if (IS_ERR(id)) + return PTR_ERR(id); + + id_priv->cm_id.ib = id; + + addr = cma_src_addr(id_priv); + svc_id = rdma_get_service_id(&id_priv->id, addr); + if (cma_any_addr(addr) && !id_priv->afonly) + ret = ib_cm_listen(id_priv->cm_id.ib, svc_id, 0, NULL); + else { + cma_set_compare_data(id_priv->id.ps, addr, &compare_data); + ret = ib_cm_listen(id_priv->cm_id.ib, svc_id, 0, &compare_data); + } + + if (ret) { + ib_destroy_cm_id(id_priv->cm_id.ib); + id_priv->cm_id.ib = NULL; + } + + return ret; +} + +static int cma_iw_listen(struct rdma_id_private *id_priv, int backlog) +{ + int ret; + struct iw_cm_id *id; + + id = iw_create_cm_id(id_priv->id.device, + iw_conn_req_handler, + id_priv); + if (IS_ERR(id)) + return PTR_ERR(id); + + id_priv->cm_id.iw = id; + + memcpy(&id_priv->cm_id.iw->local_addr, cma_src_addr(id_priv), + rdma_addr_size(cma_src_addr(id_priv))); + + ret = iw_cm_listen(id_priv->cm_id.iw, backlog); + + if (ret) { + iw_destroy_cm_id(id_priv->cm_id.iw); + id_priv->cm_id.iw = NULL; + } + + return ret; +} + +static int cma_listen_handler(struct rdma_cm_id *id, + struct rdma_cm_event *event) +{ + struct rdma_id_private *id_priv = id->context; + + id->context = id_priv->id.context; + id->event_handler = id_priv->id.event_handler; + return id_priv->id.event_handler(id, event); +} + +static void cma_listen_on_dev(struct rdma_id_private *id_priv, + struct cma_device *cma_dev) +{ + struct rdma_id_private *dev_id_priv; + struct rdma_cm_id *id; + int ret; + + if (cma_family(id_priv) == AF_IB && + rdma_node_get_transport(cma_dev->device->node_type) != RDMA_TRANSPORT_IB) + return; + + id = rdma_create_id(cma_listen_handler, id_priv, id_priv->id.ps, + id_priv->id.qp_type); + if (IS_ERR(id)) + return; + + dev_id_priv = container_of(id, struct rdma_id_private, id); + + dev_id_priv->state = RDMA_CM_ADDR_BOUND; + memcpy(cma_src_addr(dev_id_priv), cma_src_addr(id_priv), + rdma_addr_size(cma_src_addr(id_priv))); + + cma_attach_to_dev(dev_id_priv, cma_dev); + list_add_tail(&dev_id_priv->listen_list, &id_priv->listen_list); + atomic_inc(&id_priv->refcount); + dev_id_priv->internal_id = 1; + dev_id_priv->afonly = id_priv->afonly; + + ret = rdma_listen(id, id_priv->backlog); + if (ret) + printk(KERN_WARNING "RDMA CMA: cma_listen_on_dev, error %d, " + "listening on device %s\n", ret, cma_dev->device->name); +} + +static void cma_listen_on_all(struct rdma_id_private *id_priv) +{ + struct cma_device *cma_dev; + + mutex_lock(&lock); + list_add_tail(&id_priv->list, &listen_any_list); + list_for_each_entry(cma_dev, &dev_list, list) + cma_listen_on_dev(id_priv, cma_dev); + mutex_unlock(&lock); +} + +void rdma_set_service_type(struct rdma_cm_id *id, int tos) +{ + struct rdma_id_private *id_priv; + + id_priv = container_of(id, struct rdma_id_private, id); + id_priv->tos = (u8) tos; +} +EXPORT_SYMBOL(rdma_set_service_type); + +static void cma_query_handler(int status, struct ib_sa_path_rec *path_rec, + void *context) +{ + struct cma_work *work = context; + struct rdma_route *route; + + route = &work->id->id.route; + + if (!status) { + route->num_paths = 1; + *route->path_rec = *path_rec; + } else { + work->old_state = RDMA_CM_ROUTE_QUERY; + work->new_state = RDMA_CM_ADDR_RESOLVED; + work->event.event = RDMA_CM_EVENT_ROUTE_ERROR; + work->event.status = status; + } + + queue_work(cma_wq, &work->work); +} + +static int cma_query_ib_route(struct rdma_id_private *id_priv, int timeout_ms, + struct cma_work *work) +{ + struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; + struct ib_sa_path_rec path_rec; + ib_sa_comp_mask comp_mask; + struct sockaddr_in6 *sin6; + struct sockaddr_ib *sib; + + memset(&path_rec, 0, sizeof path_rec); + rdma_addr_get_sgid(dev_addr, &path_rec.sgid); + rdma_addr_get_dgid(dev_addr, &path_rec.dgid); + path_rec.pkey = cpu_to_be16(ib_addr_get_pkey(dev_addr)); + path_rec.numb_path = 1; + path_rec.reversible = 1; + path_rec.service_id = rdma_get_service_id(&id_priv->id, cma_dst_addr(id_priv)); + + comp_mask = IB_SA_PATH_REC_DGID | IB_SA_PATH_REC_SGID | + IB_SA_PATH_REC_PKEY | IB_SA_PATH_REC_NUMB_PATH | + IB_SA_PATH_REC_REVERSIBLE | IB_SA_PATH_REC_SERVICE_ID; + + switch (cma_family(id_priv)) { + case AF_INET: + path_rec.qos_class = cpu_to_be16((u16) id_priv->tos); + comp_mask |= IB_SA_PATH_REC_QOS_CLASS; + break; + case AF_INET6: + sin6 = (struct sockaddr_in6 *) cma_src_addr(id_priv); + path_rec.traffic_class = (u8) (be32_to_cpu(sin6->sin6_flowinfo) >> 20); + comp_mask |= IB_SA_PATH_REC_TRAFFIC_CLASS; + break; + case AF_IB: + sib = (struct sockaddr_ib *) cma_src_addr(id_priv); + path_rec.traffic_class = (u8) (be32_to_cpu(sib->sib_flowinfo) >> 20); + comp_mask |= IB_SA_PATH_REC_TRAFFIC_CLASS; + break; + } + + id_priv->query_id = ib_sa_path_rec_get(&sa_client, id_priv->id.device, + id_priv->id.port_num, &path_rec, + comp_mask, timeout_ms, + GFP_KERNEL, cma_query_handler, + work, &id_priv->query); + + return (id_priv->query_id < 0) ? id_priv->query_id : 0; +} + +static void cma_work_handler(struct work_struct *_work) +{ + struct cma_work *work = container_of(_work, struct cma_work, work); + struct rdma_id_private *id_priv = work->id; + int destroy = 0; + + mutex_lock(&id_priv->handler_mutex); + if (!cma_comp_exch(id_priv, work->old_state, work->new_state)) + goto out; + + if (id_priv->id.event_handler(&id_priv->id, &work->event)) { + cma_exch(id_priv, RDMA_CM_DESTROYING); + destroy = 1; + } +out: + mutex_unlock(&id_priv->handler_mutex); + cma_deref_id(id_priv); + if (destroy) + rdma_destroy_id(&id_priv->id); + kfree(work); +} + +static void cma_ndev_work_handler(struct work_struct *_work) +{ + struct cma_ndev_work *work = container_of(_work, struct cma_ndev_work, work); + struct rdma_id_private *id_priv = work->id; + int destroy = 0; + + mutex_lock(&id_priv->handler_mutex); + if (id_priv->state == RDMA_CM_DESTROYING || + id_priv->state == RDMA_CM_DEVICE_REMOVAL) + goto out; + + if (id_priv->id.event_handler(&id_priv->id, &work->event)) { + cma_exch(id_priv, RDMA_CM_DESTROYING); + destroy = 1; + } + +out: + mutex_unlock(&id_priv->handler_mutex); + cma_deref_id(id_priv); + if (destroy) + rdma_destroy_id(&id_priv->id); + kfree(work); +} + +static int cma_resolve_ib_route(struct rdma_id_private *id_priv, int timeout_ms) +{ + struct rdma_route *route = &id_priv->id.route; + struct cma_work *work; + int ret; + + work = kzalloc(sizeof *work, GFP_KERNEL); + if (!work) + return -ENOMEM; + + work->id = id_priv; + INIT_WORK(&work->work, cma_work_handler); + work->old_state = RDMA_CM_ROUTE_QUERY; + work->new_state = RDMA_CM_ROUTE_RESOLVED; + work->event.event = RDMA_CM_EVENT_ROUTE_RESOLVED; + + route->path_rec = kmalloc(sizeof *route->path_rec, GFP_KERNEL); + if (!route->path_rec) { + ret = -ENOMEM; + goto err1; + } + + ret = cma_query_ib_route(id_priv, timeout_ms, work); + if (ret) + goto err2; + + return 0; +err2: + kfree(route->path_rec); + route->path_rec = NULL; +err1: + kfree(work); + return ret; +} + +int rdma_set_ib_paths(struct rdma_cm_id *id, + struct ib_sa_path_rec *path_rec, int num_paths) +{ + struct rdma_id_private *id_priv; + int ret; + + id_priv = container_of(id, struct rdma_id_private, id); + if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_RESOLVED, + RDMA_CM_ROUTE_RESOLVED)) + return -EINVAL; + + id->route.path_rec = kmemdup(path_rec, sizeof *path_rec * num_paths, + GFP_KERNEL); + if (!id->route.path_rec) { + ret = -ENOMEM; + goto err; + } + + id->route.num_paths = num_paths; + return 0; +err: + cma_comp_exch(id_priv, RDMA_CM_ROUTE_RESOLVED, RDMA_CM_ADDR_RESOLVED); + return ret; +} +EXPORT_SYMBOL(rdma_set_ib_paths); + +static int cma_resolve_iw_route(struct rdma_id_private *id_priv, int timeout_ms) +{ + struct cma_work *work; + + work = kzalloc(sizeof *work, GFP_KERNEL); + if (!work) + return -ENOMEM; + + work->id = id_priv; + INIT_WORK(&work->work, cma_work_handler); + work->old_state = RDMA_CM_ROUTE_QUERY; + work->new_state = RDMA_CM_ROUTE_RESOLVED; + work->event.event = RDMA_CM_EVENT_ROUTE_RESOLVED; + queue_work(cma_wq, &work->work); + return 0; +} + +static int iboe_tos_to_sl(struct net_device *ndev, int tos) +{ + int prio; + struct net_device *dev; + + prio = rt_tos2priority(tos); + dev = ndev->priv_flags & IFF_802_1Q_VLAN ? + vlan_dev_real_dev(ndev) : ndev; + + if (dev->num_tc) + return netdev_get_prio_tc_map(dev, prio); + +#if IS_ENABLED(CONFIG_VLAN_8021Q) + if (ndev->priv_flags & IFF_802_1Q_VLAN) + return (vlan_dev_get_egress_qos_mask(ndev, prio) & + VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT; +#endif + return 0; +} + +static int cma_resolve_iboe_route(struct rdma_id_private *id_priv) +{ + struct rdma_route *route = &id_priv->id.route; + struct rdma_addr *addr = &route->addr; + struct cma_work *work; + int ret; + struct net_device *ndev = NULL; + + + work = kzalloc(sizeof *work, GFP_KERNEL); + if (!work) + return -ENOMEM; + + work->id = id_priv; + INIT_WORK(&work->work, cma_work_handler); + + route->path_rec = kzalloc(sizeof *route->path_rec, GFP_KERNEL); + if (!route->path_rec) { + ret = -ENOMEM; + goto err1; + } + + route->num_paths = 1; + + if (addr->dev_addr.bound_dev_if) + ndev = dev_get_by_index(&init_net, addr->dev_addr.bound_dev_if); + if (!ndev) { + ret = -ENODEV; + goto err2; + } + + route->path_rec->vlan_id = rdma_vlan_dev_vlan_id(ndev); + memcpy(route->path_rec->dmac, addr->dev_addr.dst_dev_addr, ETH_ALEN); + memcpy(route->path_rec->smac, ndev->dev_addr, ndev->addr_len); + + rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr, + &route->path_rec->sgid); + rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.dst_addr, + &route->path_rec->dgid); + + route->path_rec->hop_limit = 1; + route->path_rec->reversible = 1; + route->path_rec->pkey = cpu_to_be16(0xffff); + route->path_rec->mtu_selector = IB_SA_EQ; + route->path_rec->sl = iboe_tos_to_sl(ndev, id_priv->tos); + route->path_rec->mtu = iboe_get_mtu(ndev->mtu); + route->path_rec->rate_selector = IB_SA_EQ; + route->path_rec->rate = iboe_get_rate(ndev); + dev_put(ndev); + route->path_rec->packet_life_time_selector = IB_SA_EQ; + route->path_rec->packet_life_time = CMA_IBOE_PACKET_LIFETIME; + if (!route->path_rec->mtu) { + ret = -EINVAL; + goto err2; + } + + work->old_state = RDMA_CM_ROUTE_QUERY; + work->new_state = RDMA_CM_ROUTE_RESOLVED; + work->event.event = RDMA_CM_EVENT_ROUTE_RESOLVED; + work->event.status = 0; + + queue_work(cma_wq, &work->work); + + return 0; + +err2: + kfree(route->path_rec); + route->path_rec = NULL; +err1: + kfree(work); + return ret; +} + +int rdma_resolve_route(struct rdma_cm_id *id, int timeout_ms) +{ + struct rdma_id_private *id_priv; + int ret; + + id_priv = container_of(id, struct rdma_id_private, id); + if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_RESOLVED, RDMA_CM_ROUTE_QUERY)) + return -EINVAL; + + atomic_inc(&id_priv->refcount); + switch (rdma_node_get_transport(id->device->node_type)) { + case RDMA_TRANSPORT_IB: + switch (rdma_port_get_link_layer(id->device, id->port_num)) { + case IB_LINK_LAYER_INFINIBAND: + ret = cma_resolve_ib_route(id_priv, timeout_ms); + break; + case IB_LINK_LAYER_ETHERNET: + ret = cma_resolve_iboe_route(id_priv); + break; + default: + ret = -ENOSYS; + } + break; + case RDMA_TRANSPORT_IWARP: + ret = cma_resolve_iw_route(id_priv, timeout_ms); + break; + default: + ret = -ENOSYS; + break; + } + if (ret) + goto err; + + return 0; +err: + cma_comp_exch(id_priv, RDMA_CM_ROUTE_QUERY, RDMA_CM_ADDR_RESOLVED); + cma_deref_id(id_priv); + return ret; +} +EXPORT_SYMBOL(rdma_resolve_route); + +static void cma_set_loopback(struct sockaddr *addr) +{ + switch (addr->sa_family) { + case AF_INET: + ((struct sockaddr_in *) addr)->sin_addr.s_addr = htonl(INADDR_LOOPBACK); + break; + case AF_INET6: + ipv6_addr_set(&((struct sockaddr_in6 *) addr)->sin6_addr, + 0, 0, 0, htonl(1)); + break; + default: + ib_addr_set(&((struct sockaddr_ib *) addr)->sib_addr, + 0, 0, 0, htonl(1)); + break; + } +} + +static int cma_bind_loopback(struct rdma_id_private *id_priv) +{ + struct cma_device *cma_dev, *cur_dev; + struct ib_port_attr port_attr; + union ib_gid gid; + u16 pkey; + int ret; + u8 p; + + cma_dev = NULL; + mutex_lock(&lock); + list_for_each_entry(cur_dev, &dev_list, list) { + if (cma_family(id_priv) == AF_IB && + rdma_node_get_transport(cur_dev->device->node_type) != RDMA_TRANSPORT_IB) + continue; + + if (!cma_dev) + cma_dev = cur_dev; + + for (p = 1; p <= cur_dev->device->phys_port_cnt; ++p) { + if (!ib_query_port(cur_dev->device, p, &port_attr) && + port_attr.state == IB_PORT_ACTIVE) { + cma_dev = cur_dev; + goto port_found; + } + } + } + + if (!cma_dev) { + ret = -ENODEV; + goto out; + } + + p = 1; + +port_found: + ret = ib_get_cached_gid(cma_dev->device, p, 0, &gid); + if (ret) + goto out; + + ret = ib_get_cached_pkey(cma_dev->device, p, 0, &pkey); + if (ret) + goto out; + + id_priv->id.route.addr.dev_addr.dev_type = + (rdma_port_get_link_layer(cma_dev->device, p) == IB_LINK_LAYER_INFINIBAND) ? + ARPHRD_INFINIBAND : ARPHRD_ETHER; + + rdma_addr_set_sgid(&id_priv->id.route.addr.dev_addr, &gid); + ib_addr_set_pkey(&id_priv->id.route.addr.dev_addr, pkey); + id_priv->id.port_num = p; + cma_attach_to_dev(id_priv, cma_dev); + cma_set_loopback(cma_src_addr(id_priv)); +out: + mutex_unlock(&lock); + return ret; +} + +static void addr_handler(int status, struct sockaddr *src_addr, + struct rdma_dev_addr *dev_addr, void *context) +{ + struct rdma_id_private *id_priv = context; + struct rdma_cm_event event; + + memset(&event, 0, sizeof event); + mutex_lock(&id_priv->handler_mutex); + if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_QUERY, + RDMA_CM_ADDR_RESOLVED)) + goto out; + + memcpy(cma_src_addr(id_priv), src_addr, rdma_addr_size(src_addr)); + if (!status && !id_priv->cma_dev) + status = cma_acquire_dev(id_priv, NULL); + + if (status) { + if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_RESOLVED, + RDMA_CM_ADDR_BOUND)) + goto out; + event.event = RDMA_CM_EVENT_ADDR_ERROR; + event.status = status; + } else + event.event = RDMA_CM_EVENT_ADDR_RESOLVED; + + if (id_priv->id.event_handler(&id_priv->id, &event)) { + cma_exch(id_priv, RDMA_CM_DESTROYING); + mutex_unlock(&id_priv->handler_mutex); + cma_deref_id(id_priv); + rdma_destroy_id(&id_priv->id); + return; + } +out: + mutex_unlock(&id_priv->handler_mutex); + cma_deref_id(id_priv); +} + +static int cma_resolve_loopback(struct rdma_id_private *id_priv) +{ + struct cma_work *work; + union ib_gid gid; + int ret; + + work = kzalloc(sizeof *work, GFP_KERNEL); + if (!work) + return -ENOMEM; + + if (!id_priv->cma_dev) { + ret = cma_bind_loopback(id_priv); + if (ret) + goto err; + } + + rdma_addr_get_sgid(&id_priv->id.route.addr.dev_addr, &gid); + rdma_addr_set_dgid(&id_priv->id.route.addr.dev_addr, &gid); + + work->id = id_priv; + INIT_WORK(&work->work, cma_work_handler); + work->old_state = RDMA_CM_ADDR_QUERY; + work->new_state = RDMA_CM_ADDR_RESOLVED; + work->event.event = RDMA_CM_EVENT_ADDR_RESOLVED; + queue_work(cma_wq, &work->work); + return 0; +err: + kfree(work); + return ret; +} + +static int cma_resolve_ib_addr(struct rdma_id_private *id_priv) +{ + struct cma_work *work; + int ret; + + work = kzalloc(sizeof *work, GFP_KERNEL); + if (!work) + return -ENOMEM; + + if (!id_priv->cma_dev) { + ret = cma_resolve_ib_dev(id_priv); + if (ret) + goto err; + } + + rdma_addr_set_dgid(&id_priv->id.route.addr.dev_addr, (union ib_gid *) + &(((struct sockaddr_ib *) &id_priv->id.route.addr.dst_addr)->sib_addr)); + + work->id = id_priv; + INIT_WORK(&work->work, cma_work_handler); + work->old_state = RDMA_CM_ADDR_QUERY; + work->new_state = RDMA_CM_ADDR_RESOLVED; + work->event.event = RDMA_CM_EVENT_ADDR_RESOLVED; + queue_work(cma_wq, &work->work); + return 0; +err: + kfree(work); + return ret; +} + +static int cma_bind_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, + struct sockaddr *dst_addr) +{ + if (!src_addr || !src_addr->sa_family) { + src_addr = (struct sockaddr *) &id->route.addr.src_addr; + src_addr->sa_family = dst_addr->sa_family; + if (dst_addr->sa_family == AF_INET6) { + ((struct sockaddr_in6 *) src_addr)->sin6_scope_id = + ((struct sockaddr_in6 *) dst_addr)->sin6_scope_id; + } else if (dst_addr->sa_family == AF_IB) { + ((struct sockaddr_ib *) src_addr)->sib_pkey = + ((struct sockaddr_ib *) dst_addr)->sib_pkey; + } + } + return rdma_bind_addr(id, src_addr); +} + +int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, + struct sockaddr *dst_addr, int timeout_ms) +{ + struct rdma_id_private *id_priv; + int ret; + + id_priv = container_of(id, struct rdma_id_private, id); + if (id_priv->state == RDMA_CM_IDLE) { + ret = cma_bind_addr(id, src_addr, dst_addr); + if (ret) + return ret; + } + + if (cma_family(id_priv) != dst_addr->sa_family) + return -EINVAL; + + if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_ADDR_QUERY)) + return -EINVAL; + + atomic_inc(&id_priv->refcount); + memcpy(cma_dst_addr(id_priv), dst_addr, rdma_addr_size(dst_addr)); + if (cma_any_addr(dst_addr)) { + ret = cma_resolve_loopback(id_priv); + } else { + if (dst_addr->sa_family == AF_IB) { + ret = cma_resolve_ib_addr(id_priv); + } else { + ret = rdma_resolve_ip(&addr_client, cma_src_addr(id_priv), + dst_addr, &id->route.addr.dev_addr, + timeout_ms, addr_handler, id_priv); + } + } + if (ret) + goto err; + + return 0; +err: + cma_comp_exch(id_priv, RDMA_CM_ADDR_QUERY, RDMA_CM_ADDR_BOUND); + cma_deref_id(id_priv); + return ret; +} +EXPORT_SYMBOL(rdma_resolve_addr); + +int rdma_set_reuseaddr(struct rdma_cm_id *id, int reuse) +{ + struct rdma_id_private *id_priv; + unsigned long flags; + int ret; + + id_priv = container_of(id, struct rdma_id_private, id); + spin_lock_irqsave(&id_priv->lock, flags); + if (reuse || id_priv->state == RDMA_CM_IDLE) { + id_priv->reuseaddr = reuse; + ret = 0; + } else { + ret = -EINVAL; + } + spin_unlock_irqrestore(&id_priv->lock, flags); + return ret; +} +EXPORT_SYMBOL(rdma_set_reuseaddr); + +int rdma_set_afonly(struct rdma_cm_id *id, int afonly) +{ + struct rdma_id_private *id_priv; + unsigned long flags; + int ret; + + id_priv = container_of(id, struct rdma_id_private, id); + spin_lock_irqsave(&id_priv->lock, flags); + if (id_priv->state == RDMA_CM_IDLE || id_priv->state == RDMA_CM_ADDR_BOUND) { + id_priv->options |= (1 << CMA_OPTION_AFONLY); + id_priv->afonly = afonly; + ret = 0; + } else { + ret = -EINVAL; + } + spin_unlock_irqrestore(&id_priv->lock, flags); + return ret; +} +EXPORT_SYMBOL(rdma_set_afonly); + +static void cma_bind_port(struct rdma_bind_list *bind_list, + struct rdma_id_private *id_priv) +{ + struct sockaddr *addr; + struct sockaddr_ib *sib; + u64 sid, mask; + __be16 port; + + addr = cma_src_addr(id_priv); + port = htons(bind_list->port); + + switch (addr->sa_family) { + case AF_INET: + ((struct sockaddr_in *) addr)->sin_port = port; + break; + case AF_INET6: + ((struct sockaddr_in6 *) addr)->sin6_port = port; + break; + case AF_IB: + sib = (struct sockaddr_ib *) addr; + sid = be64_to_cpu(sib->sib_sid); + mask = be64_to_cpu(sib->sib_sid_mask); + sib->sib_sid = cpu_to_be64((sid & mask) | (u64) ntohs(port)); + sib->sib_sid_mask = cpu_to_be64(~0ULL); + break; + } + id_priv->bind_list = bind_list; + hlist_add_head(&id_priv->node, &bind_list->owners); +} + +static int cma_alloc_port(struct idr *ps, struct rdma_id_private *id_priv, + unsigned short snum) +{ + struct rdma_bind_list *bind_list; + int ret; + + bind_list = kzalloc(sizeof *bind_list, GFP_KERNEL); + if (!bind_list) + return -ENOMEM; + + ret = idr_alloc(ps, bind_list, snum, snum + 1, GFP_KERNEL); + if (ret < 0) + goto err; + + bind_list->ps = ps; + bind_list->port = (unsigned short)ret; + cma_bind_port(bind_list, id_priv); + return 0; +err: + kfree(bind_list); + return ret == -ENOSPC ? -EADDRNOTAVAIL : ret; +} + +static int cma_alloc_any_port(struct idr *ps, struct rdma_id_private *id_priv) +{ + static unsigned int last_used_port; + int low, high, remaining; + unsigned int rover; + + inet_get_local_port_range(&init_net, &low, &high); + remaining = (high - low) + 1; + rover = prandom_u32() % remaining + low; +retry: + if (last_used_port != rover && + !idr_find(ps, (unsigned short) rover)) { + int ret = cma_alloc_port(ps, id_priv, rover); + /* + * Remember previously used port number in order to avoid + * re-using same port immediately after it is closed. + */ + if (!ret) + last_used_port = rover; + if (ret != -EADDRNOTAVAIL) + return ret; + } + if (--remaining) { + rover++; + if ((rover < low) || (rover > high)) + rover = low; + goto retry; + } + return -EADDRNOTAVAIL; +} + +/* + * Check that the requested port is available. This is called when trying to + * bind to a specific port, or when trying to listen on a bound port. In + * the latter case, the provided id_priv may already be on the bind_list, but + * we still need to check that it's okay to start listening. + */ +static int cma_check_port(struct rdma_bind_list *bind_list, + struct rdma_id_private *id_priv, uint8_t reuseaddr) +{ + struct rdma_id_private *cur_id; + struct sockaddr *addr, *cur_addr; + + addr = cma_src_addr(id_priv); + hlist_for_each_entry(cur_id, &bind_list->owners, node) { + if (id_priv == cur_id) + continue; + + if ((cur_id->state != RDMA_CM_LISTEN) && reuseaddr && + cur_id->reuseaddr) + continue; + + cur_addr = cma_src_addr(cur_id); + if (id_priv->afonly && cur_id->afonly && + (addr->sa_family != cur_addr->sa_family)) + continue; + + if (cma_any_addr(addr) || cma_any_addr(cur_addr)) + return -EADDRNOTAVAIL; + + if (!cma_addr_cmp(addr, cur_addr)) + return -EADDRINUSE; + } + return 0; +} + +static int cma_use_port(struct idr *ps, struct rdma_id_private *id_priv) +{ + struct rdma_bind_list *bind_list; + unsigned short snum; + int ret; + + snum = ntohs(cma_port(cma_src_addr(id_priv))); + if (snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE)) + return -EACCES; + + bind_list = idr_find(ps, snum); + if (!bind_list) { + ret = cma_alloc_port(ps, id_priv, snum); + } else { + ret = cma_check_port(bind_list, id_priv, id_priv->reuseaddr); + if (!ret) + cma_bind_port(bind_list, id_priv); + } + return ret; +} + +static int cma_bind_listen(struct rdma_id_private *id_priv) +{ + struct rdma_bind_list *bind_list = id_priv->bind_list; + int ret = 0; + + mutex_lock(&lock); + if (bind_list->owners.first->next) + ret = cma_check_port(bind_list, id_priv, 0); + mutex_unlock(&lock); + return ret; +} + +static struct idr *cma_select_inet_ps(struct rdma_id_private *id_priv) +{ + switch (id_priv->id.ps) { + case RDMA_PS_TCP: + return &tcp_ps; + case RDMA_PS_UDP: + return &udp_ps; + case RDMA_PS_IPOIB: + return &ipoib_ps; + case RDMA_PS_IB: + return &ib_ps; + default: + return NULL; + } +} + +static struct idr *cma_select_ib_ps(struct rdma_id_private *id_priv) +{ + struct idr *ps = NULL; + struct sockaddr_ib *sib; + u64 sid_ps, mask, sid; + + sib = (struct sockaddr_ib *) cma_src_addr(id_priv); + mask = be64_to_cpu(sib->sib_sid_mask) & RDMA_IB_IP_PS_MASK; + sid = be64_to_cpu(sib->sib_sid) & mask; + + if ((id_priv->id.ps == RDMA_PS_IB) && (sid == (RDMA_IB_IP_PS_IB & mask))) { + sid_ps = RDMA_IB_IP_PS_IB; + ps = &ib_ps; + } else if (((id_priv->id.ps == RDMA_PS_IB) || (id_priv->id.ps == RDMA_PS_TCP)) && + (sid == (RDMA_IB_IP_PS_TCP & mask))) { + sid_ps = RDMA_IB_IP_PS_TCP; + ps = &tcp_ps; + } else if (((id_priv->id.ps == RDMA_PS_IB) || (id_priv->id.ps == RDMA_PS_UDP)) && + (sid == (RDMA_IB_IP_PS_UDP & mask))) { + sid_ps = RDMA_IB_IP_PS_UDP; + ps = &udp_ps; + } + + if (ps) { + sib->sib_sid = cpu_to_be64(sid_ps | ntohs(cma_port((struct sockaddr *) sib))); + sib->sib_sid_mask = cpu_to_be64(RDMA_IB_IP_PS_MASK | + be64_to_cpu(sib->sib_sid_mask)); + } + return ps; +} + +static int cma_get_port(struct rdma_id_private *id_priv) +{ + struct idr *ps; + int ret; + + if (cma_family(id_priv) != AF_IB) + ps = cma_select_inet_ps(id_priv); + else + ps = cma_select_ib_ps(id_priv); + if (!ps) + return -EPROTONOSUPPORT; + + mutex_lock(&lock); + if (cma_any_port(cma_src_addr(id_priv))) + ret = cma_alloc_any_port(ps, id_priv); + else + ret = cma_use_port(ps, id_priv); + mutex_unlock(&lock); + + return ret; +} + +static int cma_check_linklocal(struct rdma_dev_addr *dev_addr, + struct sockaddr *addr) +{ +#if IS_ENABLED(CONFIG_IPV6) + struct sockaddr_in6 *sin6; + + if (addr->sa_family != AF_INET6) + return 0; + + sin6 = (struct sockaddr_in6 *) addr; + + if (!(ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL)) + return 0; + + if (!sin6->sin6_scope_id) + return -EINVAL; + + dev_addr->bound_dev_if = sin6->sin6_scope_id; +#endif + return 0; +} + +int rdma_listen(struct rdma_cm_id *id, int backlog) +{ + struct rdma_id_private *id_priv; + int ret; + + id_priv = container_of(id, struct rdma_id_private, id); + if (id_priv->state == RDMA_CM_IDLE) { + id->route.addr.src_addr.ss_family = AF_INET; + ret = rdma_bind_addr(id, cma_src_addr(id_priv)); + if (ret) + return ret; + } + + if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_LISTEN)) + return -EINVAL; + + if (id_priv->reuseaddr) { + ret = cma_bind_listen(id_priv); + if (ret) + goto err; + } + + id_priv->backlog = backlog; + if (id->device) { + switch (rdma_node_get_transport(id->device->node_type)) { + case RDMA_TRANSPORT_IB: + ret = cma_ib_listen(id_priv); + if (ret) + goto err; + break; + case RDMA_TRANSPORT_IWARP: + ret = cma_iw_listen(id_priv, backlog); + if (ret) + goto err; + break; + default: + ret = -ENOSYS; + goto err; + } + } else + cma_listen_on_all(id_priv); + + return 0; +err: + id_priv->backlog = 0; + cma_comp_exch(id_priv, RDMA_CM_LISTEN, RDMA_CM_ADDR_BOUND); + return ret; +} +EXPORT_SYMBOL(rdma_listen); + +int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr) +{ + struct rdma_id_private *id_priv; + int ret; + + if (addr->sa_family != AF_INET && addr->sa_family != AF_INET6 && + addr->sa_family != AF_IB) + return -EAFNOSUPPORT; + + id_priv = container_of(id, struct rdma_id_private, id); + if (!cma_comp_exch(id_priv, RDMA_CM_IDLE, RDMA_CM_ADDR_BOUND)) + return -EINVAL; + + ret = cma_check_linklocal(&id->route.addr.dev_addr, addr); + if (ret) + goto err1; + + memcpy(cma_src_addr(id_priv), addr, rdma_addr_size(addr)); + if (!cma_any_addr(addr)) { + ret = cma_translate_addr(addr, &id->route.addr.dev_addr); + if (ret) + goto err1; + + ret = cma_acquire_dev(id_priv, NULL); + if (ret) + goto err1; + } + + if (!(id_priv->options & (1 << CMA_OPTION_AFONLY))) { + if (addr->sa_family == AF_INET) + id_priv->afonly = 1; +#if IS_ENABLED(CONFIG_IPV6) + else if (addr->sa_family == AF_INET6) + id_priv->afonly = init_net.ipv6.sysctl.bindv6only; +#endif + } + ret = cma_get_port(id_priv); + if (ret) + goto err2; + + return 0; +err2: + if (id_priv->cma_dev) + cma_release_dev(id_priv); +err1: + cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_IDLE); + return ret; +} +EXPORT_SYMBOL(rdma_bind_addr); + +static int cma_format_hdr(void *hdr, struct rdma_id_private *id_priv) +{ + struct cma_hdr *cma_hdr; + + cma_hdr = hdr; + cma_hdr->cma_version = CMA_VERSION; + if (cma_family(id_priv) == AF_INET) { + struct sockaddr_in *src4, *dst4; + + src4 = (struct sockaddr_in *) cma_src_addr(id_priv); + dst4 = (struct sockaddr_in *) cma_dst_addr(id_priv); + + cma_set_ip_ver(cma_hdr, 4); + cma_hdr->src_addr.ip4.addr = src4->sin_addr.s_addr; + cma_hdr->dst_addr.ip4.addr = dst4->sin_addr.s_addr; + cma_hdr->port = src4->sin_port; + } else if (cma_family(id_priv) == AF_INET6) { + struct sockaddr_in6 *src6, *dst6; + + src6 = (struct sockaddr_in6 *) cma_src_addr(id_priv); + dst6 = (struct sockaddr_in6 *) cma_dst_addr(id_priv); + + cma_set_ip_ver(cma_hdr, 6); + cma_hdr->src_addr.ip6 = src6->sin6_addr; + cma_hdr->dst_addr.ip6 = dst6->sin6_addr; + cma_hdr->port = src6->sin6_port; + } + return 0; +} + +static int cma_sidr_rep_handler(struct ib_cm_id *cm_id, + struct ib_cm_event *ib_event) +{ + struct rdma_id_private *id_priv = cm_id->context; + struct rdma_cm_event event; + struct ib_cm_sidr_rep_event_param *rep = &ib_event->param.sidr_rep_rcvd; + int ret = 0; + + if (cma_disable_callback(id_priv, RDMA_CM_CONNECT)) + return 0; + + memset(&event, 0, sizeof event); + switch (ib_event->event) { + case IB_CM_SIDR_REQ_ERROR: + event.event = RDMA_CM_EVENT_UNREACHABLE; + event.status = -ETIMEDOUT; + break; + case IB_CM_SIDR_REP_RECEIVED: + event.param.ud.private_data = ib_event->private_data; + event.param.ud.private_data_len = IB_CM_SIDR_REP_PRIVATE_DATA_SIZE; + if (rep->status != IB_SIDR_SUCCESS) { + event.event = RDMA_CM_EVENT_UNREACHABLE; + event.status = ib_event->param.sidr_rep_rcvd.status; + break; + } + ret = cma_set_qkey(id_priv, rep->qkey); + if (ret) { + event.event = RDMA_CM_EVENT_ADDR_ERROR; + event.status = ret; + break; + } + ib_init_ah_from_path(id_priv->id.device, id_priv->id.port_num, + id_priv->id.route.path_rec, + &event.param.ud.ah_attr); + event.param.ud.qp_num = rep->qpn; + event.param.ud.qkey = rep->qkey; + event.event = RDMA_CM_EVENT_ESTABLISHED; + event.status = 0; + break; + default: + printk(KERN_ERR "RDMA CMA: unexpected IB CM event: %d\n", + ib_event->event); + goto out; + } + + ret = id_priv->id.event_handler(&id_priv->id, &event); + if (ret) { + /* Destroy the CM ID by returning a non-zero value. */ + id_priv->cm_id.ib = NULL; + cma_exch(id_priv, RDMA_CM_DESTROYING); + mutex_unlock(&id_priv->handler_mutex); + rdma_destroy_id(&id_priv->id); + return ret; + } +out: + mutex_unlock(&id_priv->handler_mutex); + return ret; +} + +static int cma_resolve_ib_udp(struct rdma_id_private *id_priv, + struct rdma_conn_param *conn_param) +{ + struct ib_cm_sidr_req_param req; + struct ib_cm_id *id; + void *private_data; + int offset, ret; + + memset(&req, 0, sizeof req); + offset = cma_user_data_offset(id_priv); + req.private_data_len = offset + conn_param->private_data_len; + if (req.private_data_len < conn_param->private_data_len) + return -EINVAL; + + if (req.private_data_len) { + private_data = kzalloc(req.private_data_len, GFP_ATOMIC); + if (!private_data) + return -ENOMEM; + } else { + private_data = NULL; + } + + if (conn_param->private_data && conn_param->private_data_len) + memcpy(private_data + offset, conn_param->private_data, + conn_param->private_data_len); + + if (private_data) { + ret = cma_format_hdr(private_data, id_priv); + if (ret) + goto out; + req.private_data = private_data; + } + + id = ib_create_cm_id(id_priv->id.device, cma_sidr_rep_handler, + id_priv); + if (IS_ERR(id)) { + ret = PTR_ERR(id); + goto out; + } + id_priv->cm_id.ib = id; + + req.path = id_priv->id.route.path_rec; + req.service_id = rdma_get_service_id(&id_priv->id, cma_dst_addr(id_priv)); + req.timeout_ms = 1 << (CMA_CM_RESPONSE_TIMEOUT - 8); + req.max_cm_retries = CMA_MAX_CM_RETRIES; + + ret = ib_send_cm_sidr_req(id_priv->cm_id.ib, &req); + if (ret) { + ib_destroy_cm_id(id_priv->cm_id.ib); + id_priv->cm_id.ib = NULL; + } +out: + kfree(private_data); + return ret; +} + +static int cma_connect_ib(struct rdma_id_private *id_priv, + struct rdma_conn_param *conn_param) +{ + struct ib_cm_req_param req; + struct rdma_route *route; + void *private_data; + struct ib_cm_id *id; + int offset, ret; + + memset(&req, 0, sizeof req); + offset = cma_user_data_offset(id_priv); + req.private_data_len = offset + conn_param->private_data_len; + if (req.private_data_len < conn_param->private_data_len) + return -EINVAL; + + if (req.private_data_len) { + private_data = kzalloc(req.private_data_len, GFP_ATOMIC); + if (!private_data) + return -ENOMEM; + } else { + private_data = NULL; + } + + if (conn_param->private_data && conn_param->private_data_len) + memcpy(private_data + offset, conn_param->private_data, + conn_param->private_data_len); + + id = ib_create_cm_id(id_priv->id.device, cma_ib_handler, id_priv); + if (IS_ERR(id)) { + ret = PTR_ERR(id); + goto out; + } + id_priv->cm_id.ib = id; + + route = &id_priv->id.route; + if (private_data) { + ret = cma_format_hdr(private_data, id_priv); + if (ret) + goto out; + req.private_data = private_data; + } + + req.primary_path = &route->path_rec[0]; + if (route->num_paths == 2) + req.alternate_path = &route->path_rec[1]; + + req.service_id = rdma_get_service_id(&id_priv->id, cma_dst_addr(id_priv)); + req.qp_num = id_priv->qp_num; + req.qp_type = id_priv->id.qp_type; + req.starting_psn = id_priv->seq_num; + req.responder_resources = conn_param->responder_resources; + req.initiator_depth = conn_param->initiator_depth; + req.flow_control = conn_param->flow_control; + req.retry_count = min_t(u8, 7, conn_param->retry_count); + req.rnr_retry_count = min_t(u8, 7, conn_param->rnr_retry_count); + req.remote_cm_response_timeout = CMA_CM_RESPONSE_TIMEOUT; + req.local_cm_response_timeout = CMA_CM_RESPONSE_TIMEOUT; + req.max_cm_retries = CMA_MAX_CM_RETRIES; + req.srq = id_priv->srq ? 1 : 0; + + ret = ib_send_cm_req(id_priv->cm_id.ib, &req); +out: + if (ret && !IS_ERR(id)) { + ib_destroy_cm_id(id); + id_priv->cm_id.ib = NULL; + } + + kfree(private_data); + return ret; +} + +static int cma_connect_iw(struct rdma_id_private *id_priv, + struct rdma_conn_param *conn_param) +{ + struct iw_cm_id *cm_id; + int ret; + struct iw_cm_conn_param iw_param; + + cm_id = iw_create_cm_id(id_priv->id.device, cma_iw_handler, id_priv); + if (IS_ERR(cm_id)) + return PTR_ERR(cm_id); + + id_priv->cm_id.iw = cm_id; + + memcpy(&cm_id->local_addr, cma_src_addr(id_priv), + rdma_addr_size(cma_src_addr(id_priv))); + memcpy(&cm_id->remote_addr, cma_dst_addr(id_priv), + rdma_addr_size(cma_dst_addr(id_priv))); + + ret = cma_modify_qp_rtr(id_priv, conn_param); + if (ret) + goto out; + + if (conn_param) { + iw_param.ord = conn_param->initiator_depth; + iw_param.ird = conn_param->responder_resources; + iw_param.private_data = conn_param->private_data; + iw_param.private_data_len = conn_param->private_data_len; + iw_param.qpn = id_priv->id.qp ? id_priv->qp_num : conn_param->qp_num; + } else { + memset(&iw_param, 0, sizeof iw_param); + iw_param.qpn = id_priv->qp_num; + } + ret = iw_cm_connect(cm_id, &iw_param); +out: + if (ret) { + iw_destroy_cm_id(cm_id); + id_priv->cm_id.iw = NULL; + } + return ret; +} + +int rdma_connect(struct rdma_cm_id *id, struct rdma_conn_param *conn_param) +{ + struct rdma_id_private *id_priv; + int ret; + + id_priv = container_of(id, struct rdma_id_private, id); + if (!cma_comp_exch(id_priv, RDMA_CM_ROUTE_RESOLVED, RDMA_CM_CONNECT)) + return -EINVAL; + + if (!id->qp) { + id_priv->qp_num = conn_param->qp_num; + id_priv->srq = conn_param->srq; + } + + switch (rdma_node_get_transport(id->device->node_type)) { + case RDMA_TRANSPORT_IB: + if (id->qp_type == IB_QPT_UD) + ret = cma_resolve_ib_udp(id_priv, conn_param); + else + ret = cma_connect_ib(id_priv, conn_param); + break; + case RDMA_TRANSPORT_IWARP: + ret = cma_connect_iw(id_priv, conn_param); + break; + default: + ret = -ENOSYS; + break; + } + if (ret) + goto err; + + return 0; +err: + cma_comp_exch(id_priv, RDMA_CM_CONNECT, RDMA_CM_ROUTE_RESOLVED); + return ret; +} +EXPORT_SYMBOL(rdma_connect); + +static int cma_accept_ib(struct rdma_id_private *id_priv, + struct rdma_conn_param *conn_param) +{ + struct ib_cm_rep_param rep; + int ret; + + ret = cma_modify_qp_rtr(id_priv, conn_param); + if (ret) + goto out; + + ret = cma_modify_qp_rts(id_priv, conn_param); + if (ret) + goto out; + + memset(&rep, 0, sizeof rep); + rep.qp_num = id_priv->qp_num; + rep.starting_psn = id_priv->seq_num; + rep.private_data = conn_param->private_data; + rep.private_data_len = conn_param->private_data_len; + rep.responder_resources = conn_param->responder_resources; + rep.initiator_depth = conn_param->initiator_depth; + rep.failover_accepted = 0; + rep.flow_control = conn_param->flow_control; + rep.rnr_retry_count = min_t(u8, 7, conn_param->rnr_retry_count); + rep.srq = id_priv->srq ? 1 : 0; + + ret = ib_send_cm_rep(id_priv->cm_id.ib, &rep); +out: + return ret; +} + +static int cma_accept_iw(struct rdma_id_private *id_priv, + struct rdma_conn_param *conn_param) +{ + struct iw_cm_conn_param iw_param; + int ret; + + ret = cma_modify_qp_rtr(id_priv, conn_param); + if (ret) + return ret; + + iw_param.ord = conn_param->initiator_depth; + iw_param.ird = conn_param->responder_resources; + iw_param.private_data = conn_param->private_data; + iw_param.private_data_len = conn_param->private_data_len; + if (id_priv->id.qp) { + iw_param.qpn = id_priv->qp_num; + } else + iw_param.qpn = conn_param->qp_num; + + return iw_cm_accept(id_priv->cm_id.iw, &iw_param); +} + +static int cma_send_sidr_rep(struct rdma_id_private *id_priv, + enum ib_cm_sidr_status status, u32 qkey, + const void *private_data, int private_data_len) +{ + struct ib_cm_sidr_rep_param rep; + int ret; + + memset(&rep, 0, sizeof rep); + rep.status = status; + if (status == IB_SIDR_SUCCESS) { + ret = cma_set_qkey(id_priv, qkey); + if (ret) + return ret; + rep.qp_num = id_priv->qp_num; + rep.qkey = id_priv->qkey; + } + rep.private_data = private_data; + rep.private_data_len = private_data_len; + + return ib_send_cm_sidr_rep(id_priv->cm_id.ib, &rep); +} + +int rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param) +{ + struct rdma_id_private *id_priv; + int ret; + + id_priv = container_of(id, struct rdma_id_private, id); + + id_priv->owner = task_pid_nr(current); + + if (!cma_comp(id_priv, RDMA_CM_CONNECT)) + return -EINVAL; + + if (!id->qp && conn_param) { + id_priv->qp_num = conn_param->qp_num; + id_priv->srq = conn_param->srq; + } + + switch (rdma_node_get_transport(id->device->node_type)) { + case RDMA_TRANSPORT_IB: + if (id->qp_type == IB_QPT_UD) { + if (conn_param) + ret = cma_send_sidr_rep(id_priv, IB_SIDR_SUCCESS, + conn_param->qkey, + conn_param->private_data, + conn_param->private_data_len); + else + ret = cma_send_sidr_rep(id_priv, IB_SIDR_SUCCESS, + 0, NULL, 0); + } else { + if (conn_param) + ret = cma_accept_ib(id_priv, conn_param); + else + ret = cma_rep_recv(id_priv); + } + break; + case RDMA_TRANSPORT_IWARP: + ret = cma_accept_iw(id_priv, conn_param); + break; + default: + ret = -ENOSYS; + break; + } + + if (ret) + goto reject; + + return 0; +reject: + cma_modify_qp_err(id_priv); + rdma_reject(id, NULL, 0); + return ret; +} +EXPORT_SYMBOL(rdma_accept); + +int rdma_notify(struct rdma_cm_id *id, enum ib_event_type event) +{ + struct rdma_id_private *id_priv; + int ret; + + id_priv = container_of(id, struct rdma_id_private, id); + if (!id_priv->cm_id.ib) + return -EINVAL; + + switch (id->device->node_type) { + case RDMA_NODE_IB_CA: + ret = ib_cm_notify(id_priv->cm_id.ib, event); + break; + default: + ret = 0; + break; + } + return ret; +} +EXPORT_SYMBOL(rdma_notify); + +int rdma_reject(struct rdma_cm_id *id, const void *private_data, + u8 private_data_len) +{ + struct rdma_id_private *id_priv; + int ret; + + id_priv = container_of(id, struct rdma_id_private, id); + if (!id_priv->cm_id.ib) + return -EINVAL; + + switch (rdma_node_get_transport(id->device->node_type)) { + case RDMA_TRANSPORT_IB: + if (id->qp_type == IB_QPT_UD) + ret = cma_send_sidr_rep(id_priv, IB_SIDR_REJECT, 0, + private_data, private_data_len); + else + ret = ib_send_cm_rej(id_priv->cm_id.ib, + IB_CM_REJ_CONSUMER_DEFINED, NULL, + 0, private_data, private_data_len); + break; + case RDMA_TRANSPORT_IWARP: + ret = iw_cm_reject(id_priv->cm_id.iw, + private_data, private_data_len); + break; + default: + ret = -ENOSYS; + break; + } + return ret; +} +EXPORT_SYMBOL(rdma_reject); + +int rdma_disconnect(struct rdma_cm_id *id) +{ + struct rdma_id_private *id_priv; + int ret; + + id_priv = container_of(id, struct rdma_id_private, id); + if (!id_priv->cm_id.ib) + return -EINVAL; + + switch (rdma_node_get_transport(id->device->node_type)) { + case RDMA_TRANSPORT_IB: + ret = cma_modify_qp_err(id_priv); + if (ret) + goto out; + /* Initiate or respond to a disconnect. */ + if (ib_send_cm_dreq(id_priv->cm_id.ib, NULL, 0)) + ib_send_cm_drep(id_priv->cm_id.ib, NULL, 0); + break; + case RDMA_TRANSPORT_IWARP: + ret = iw_cm_disconnect(id_priv->cm_id.iw, 0); + break; + default: + ret = -EINVAL; + break; + } +out: + return ret; +} +EXPORT_SYMBOL(rdma_disconnect); + +static int cma_ib_mc_handler(int status, struct ib_sa_multicast *multicast) +{ + struct rdma_id_private *id_priv; + struct cma_multicast *mc = multicast->context; + struct rdma_cm_event event; + int ret; + + id_priv = mc->id_priv; + if (cma_disable_callback(id_priv, RDMA_CM_ADDR_BOUND) && + cma_disable_callback(id_priv, RDMA_CM_ADDR_RESOLVED)) + return 0; + + if (!status) + status = cma_set_qkey(id_priv, be32_to_cpu(multicast->rec.qkey)); + mutex_lock(&id_priv->qp_mutex); + if (!status && id_priv->id.qp) + status = ib_attach_mcast(id_priv->id.qp, &multicast->rec.mgid, + be16_to_cpu(multicast->rec.mlid)); + mutex_unlock(&id_priv->qp_mutex); + + memset(&event, 0, sizeof event); + event.status = status; + event.param.ud.private_data = mc->context; + if (!status) { + event.event = RDMA_CM_EVENT_MULTICAST_JOIN; + ib_init_ah_from_mcmember(id_priv->id.device, + id_priv->id.port_num, &multicast->rec, + &event.param.ud.ah_attr); + event.param.ud.qp_num = 0xFFFFFF; + event.param.ud.qkey = be32_to_cpu(multicast->rec.qkey); + } else + event.event = RDMA_CM_EVENT_MULTICAST_ERROR; + + ret = id_priv->id.event_handler(&id_priv->id, &event); + if (ret) { + cma_exch(id_priv, RDMA_CM_DESTROYING); + mutex_unlock(&id_priv->handler_mutex); + rdma_destroy_id(&id_priv->id); + return 0; + } + + mutex_unlock(&id_priv->handler_mutex); + return 0; +} + +static void cma_set_mgid(struct rdma_id_private *id_priv, + struct sockaddr *addr, union ib_gid *mgid) +{ + unsigned char mc_map[MAX_ADDR_LEN]; + struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; + struct sockaddr_in *sin = (struct sockaddr_in *) addr; + struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) addr; + + if (cma_any_addr(addr)) { + memset(mgid, 0, sizeof *mgid); + } else if ((addr->sa_family == AF_INET6) && + ((be32_to_cpu(sin6->sin6_addr.s6_addr32[0]) & 0xFFF0FFFF) == + 0xFF10A01B)) { + /* IPv6 address is an SA assigned MGID. */ + memcpy(mgid, &sin6->sin6_addr, sizeof *mgid); + } else if (addr->sa_family == AF_IB) { + memcpy(mgid, &((struct sockaddr_ib *) addr)->sib_addr, sizeof *mgid); + } else if ((addr->sa_family == AF_INET6)) { + ipv6_ib_mc_map(&sin6->sin6_addr, dev_addr->broadcast, mc_map); + if (id_priv->id.ps == RDMA_PS_UDP) + mc_map[7] = 0x01; /* Use RDMA CM signature */ + *mgid = *(union ib_gid *) (mc_map + 4); + } else { + ip_ib_mc_map(sin->sin_addr.s_addr, dev_addr->broadcast, mc_map); + if (id_priv->id.ps == RDMA_PS_UDP) + mc_map[7] = 0x01; /* Use RDMA CM signature */ + *mgid = *(union ib_gid *) (mc_map + 4); + } +} + +static int cma_join_ib_multicast(struct rdma_id_private *id_priv, + struct cma_multicast *mc) +{ + struct ib_sa_mcmember_rec rec; + struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; + ib_sa_comp_mask comp_mask; + int ret; + + ib_addr_get_mgid(dev_addr, &rec.mgid); + ret = ib_sa_get_mcmember_rec(id_priv->id.device, id_priv->id.port_num, + &rec.mgid, &rec); + if (ret) + return ret; + + ret = cma_set_qkey(id_priv, 0); + if (ret) + return ret; + + cma_set_mgid(id_priv, (struct sockaddr *) &mc->addr, &rec.mgid); + rec.qkey = cpu_to_be32(id_priv->qkey); + rdma_addr_get_sgid(dev_addr, &rec.port_gid); + rec.pkey = cpu_to_be16(ib_addr_get_pkey(dev_addr)); + rec.join_state = 1; + + comp_mask = IB_SA_MCMEMBER_REC_MGID | IB_SA_MCMEMBER_REC_PORT_GID | + IB_SA_MCMEMBER_REC_PKEY | IB_SA_MCMEMBER_REC_JOIN_STATE | + IB_SA_MCMEMBER_REC_QKEY | IB_SA_MCMEMBER_REC_SL | + IB_SA_MCMEMBER_REC_FLOW_LABEL | + IB_SA_MCMEMBER_REC_TRAFFIC_CLASS; + + if (id_priv->id.ps == RDMA_PS_IPOIB) + comp_mask |= IB_SA_MCMEMBER_REC_RATE | + IB_SA_MCMEMBER_REC_RATE_SELECTOR | + IB_SA_MCMEMBER_REC_MTU_SELECTOR | + IB_SA_MCMEMBER_REC_MTU | + IB_SA_MCMEMBER_REC_HOP_LIMIT; + + mc->multicast.ib = ib_sa_join_multicast(&sa_client, id_priv->id.device, + id_priv->id.port_num, &rec, + comp_mask, GFP_KERNEL, + cma_ib_mc_handler, mc); + return PTR_ERR_OR_ZERO(mc->multicast.ib); +} + +static void iboe_mcast_work_handler(struct work_struct *work) +{ + struct iboe_mcast_work *mw = container_of(work, struct iboe_mcast_work, work); + struct cma_multicast *mc = mw->mc; + struct ib_sa_multicast *m = mc->multicast.ib; + + mc->multicast.ib->context = mc; + cma_ib_mc_handler(0, m); + kref_put(&mc->mcref, release_mc); + kfree(mw); +} + +static void cma_iboe_set_mgid(struct sockaddr *addr, union ib_gid *mgid) +{ + struct sockaddr_in *sin = (struct sockaddr_in *)addr; + struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)addr; + + if (cma_any_addr(addr)) { + memset(mgid, 0, sizeof *mgid); + } else if (addr->sa_family == AF_INET6) { + memcpy(mgid, &sin6->sin6_addr, sizeof *mgid); + } else { + mgid->raw[0] = 0xff; + mgid->raw[1] = 0x0e; + mgid->raw[2] = 0; + mgid->raw[3] = 0; + mgid->raw[4] = 0; + mgid->raw[5] = 0; + mgid->raw[6] = 0; + mgid->raw[7] = 0; + mgid->raw[8] = 0; + mgid->raw[9] = 0; + mgid->raw[10] = 0xff; + mgid->raw[11] = 0xff; + *(__be32 *)(&mgid->raw[12]) = sin->sin_addr.s_addr; + } +} + +static int cma_iboe_join_multicast(struct rdma_id_private *id_priv, + struct cma_multicast *mc) +{ + struct iboe_mcast_work *work; + struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; + int err; + struct sockaddr *addr = (struct sockaddr *)&mc->addr; + struct net_device *ndev = NULL; + + if (cma_zero_addr((struct sockaddr *)&mc->addr)) + return -EINVAL; + + work = kzalloc(sizeof *work, GFP_KERNEL); + if (!work) + return -ENOMEM; + + mc->multicast.ib = kzalloc(sizeof(struct ib_sa_multicast), GFP_KERNEL); + if (!mc->multicast.ib) { + err = -ENOMEM; + goto out1; + } + + cma_iboe_set_mgid(addr, &mc->multicast.ib->rec.mgid); + + mc->multicast.ib->rec.pkey = cpu_to_be16(0xffff); + if (id_priv->id.ps == RDMA_PS_UDP) + mc->multicast.ib->rec.qkey = cpu_to_be32(RDMA_UDP_QKEY); + + if (dev_addr->bound_dev_if) + ndev = dev_get_by_index(&init_net, dev_addr->bound_dev_if); + if (!ndev) { + err = -ENODEV; + goto out2; + } + mc->multicast.ib->rec.rate = iboe_get_rate(ndev); + mc->multicast.ib->rec.hop_limit = 1; + mc->multicast.ib->rec.mtu = iboe_get_mtu(ndev->mtu); + dev_put(ndev); + if (!mc->multicast.ib->rec.mtu) { + err = -EINVAL; + goto out2; + } + rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr, + &mc->multicast.ib->rec.port_gid); + work->id = id_priv; + work->mc = mc; + INIT_WORK(&work->work, iboe_mcast_work_handler); + kref_get(&mc->mcref); + queue_work(cma_wq, &work->work); + + return 0; + +out2: + kfree(mc->multicast.ib); +out1: + kfree(work); + return err; +} + +int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr, + void *context) +{ + struct rdma_id_private *id_priv; + struct cma_multicast *mc; + int ret; + + id_priv = container_of(id, struct rdma_id_private, id); + if (!cma_comp(id_priv, RDMA_CM_ADDR_BOUND) && + !cma_comp(id_priv, RDMA_CM_ADDR_RESOLVED)) + return -EINVAL; + + mc = kmalloc(sizeof *mc, GFP_KERNEL); + if (!mc) + return -ENOMEM; + + memcpy(&mc->addr, addr, rdma_addr_size(addr)); + mc->context = context; + mc->id_priv = id_priv; + + spin_lock(&id_priv->lock); + list_add(&mc->list, &id_priv->mc_list); + spin_unlock(&id_priv->lock); + + switch (rdma_node_get_transport(id->device->node_type)) { + case RDMA_TRANSPORT_IB: + switch (rdma_port_get_link_layer(id->device, id->port_num)) { + case IB_LINK_LAYER_INFINIBAND: + ret = cma_join_ib_multicast(id_priv, mc); + break; + case IB_LINK_LAYER_ETHERNET: + kref_init(&mc->mcref); + ret = cma_iboe_join_multicast(id_priv, mc); + break; + default: + ret = -EINVAL; + } + break; + default: + ret = -ENOSYS; + break; + } + + if (ret) { + spin_lock_irq(&id_priv->lock); + list_del(&mc->list); + spin_unlock_irq(&id_priv->lock); + kfree(mc); + } + return ret; +} +EXPORT_SYMBOL(rdma_join_multicast); + +void rdma_leave_multicast(struct rdma_cm_id *id, struct sockaddr *addr) +{ + struct rdma_id_private *id_priv; + struct cma_multicast *mc; + + id_priv = container_of(id, struct rdma_id_private, id); + spin_lock_irq(&id_priv->lock); + list_for_each_entry(mc, &id_priv->mc_list, list) { + if (!memcmp(&mc->addr, addr, rdma_addr_size(addr))) { + list_del(&mc->list); + spin_unlock_irq(&id_priv->lock); + + if (id->qp) + ib_detach_mcast(id->qp, + &mc->multicast.ib->rec.mgid, + be16_to_cpu(mc->multicast.ib->rec.mlid)); + if (rdma_node_get_transport(id_priv->cma_dev->device->node_type) == RDMA_TRANSPORT_IB) { + switch (rdma_port_get_link_layer(id->device, id->port_num)) { + case IB_LINK_LAYER_INFINIBAND: + ib_sa_free_multicast(mc->multicast.ib); + kfree(mc); + break; + case IB_LINK_LAYER_ETHERNET: + kref_put(&mc->mcref, release_mc); + break; + default: + break; + } + } + return; + } + } + spin_unlock_irq(&id_priv->lock); +} +EXPORT_SYMBOL(rdma_leave_multicast); + +static int cma_netdev_change(struct net_device *ndev, struct rdma_id_private *id_priv) +{ + struct rdma_dev_addr *dev_addr; + struct cma_ndev_work *work; + + dev_addr = &id_priv->id.route.addr.dev_addr; + + if ((dev_addr->bound_dev_if == ndev->ifindex) && + memcmp(dev_addr->src_dev_addr, ndev->dev_addr, ndev->addr_len)) { + printk(KERN_INFO "RDMA CM addr change for ndev %s used by id %p\n", + ndev->name, &id_priv->id); + work = kzalloc(sizeof *work, GFP_KERNEL); + if (!work) + return -ENOMEM; + + INIT_WORK(&work->work, cma_ndev_work_handler); + work->id = id_priv; + work->event.event = RDMA_CM_EVENT_ADDR_CHANGE; + atomic_inc(&id_priv->refcount); + queue_work(cma_wq, &work->work); + } + + return 0; +} + +static int cma_netdev_callback(struct notifier_block *self, unsigned long event, + void *ptr) +{ + struct net_device *ndev = netdev_notifier_info_to_dev(ptr); + struct cma_device *cma_dev; + struct rdma_id_private *id_priv; + int ret = NOTIFY_DONE; + + if (dev_net(ndev) != &init_net) + return NOTIFY_DONE; + + if (event != NETDEV_BONDING_FAILOVER) + return NOTIFY_DONE; + + if (!(ndev->flags & IFF_MASTER) || !(ndev->priv_flags & IFF_BONDING)) + return NOTIFY_DONE; + + mutex_lock(&lock); + list_for_each_entry(cma_dev, &dev_list, list) + list_for_each_entry(id_priv, &cma_dev->id_list, list) { + ret = cma_netdev_change(ndev, id_priv); + if (ret) + goto out; + } + +out: + mutex_unlock(&lock); + return ret; +} + +static struct notifier_block cma_nb = { + .notifier_call = cma_netdev_callback +}; + +static void cma_add_one(struct ib_device *device) +{ + struct cma_device *cma_dev; + struct rdma_id_private *id_priv; + + cma_dev = kmalloc(sizeof *cma_dev, GFP_KERNEL); + if (!cma_dev) + return; + + cma_dev->device = device; + + init_completion(&cma_dev->comp); + atomic_set(&cma_dev->refcount, 1); + INIT_LIST_HEAD(&cma_dev->id_list); + ib_set_client_data(device, &cma_client, cma_dev); + + mutex_lock(&lock); + list_add_tail(&cma_dev->list, &dev_list); + list_for_each_entry(id_priv, &listen_any_list, list) + cma_listen_on_dev(id_priv, cma_dev); + mutex_unlock(&lock); +} + +static int cma_remove_id_dev(struct rdma_id_private *id_priv) +{ + struct rdma_cm_event event; + enum rdma_cm_state state; + int ret = 0; + + /* Record that we want to remove the device */ + state = cma_exch(id_priv, RDMA_CM_DEVICE_REMOVAL); + if (state == RDMA_CM_DESTROYING) + return 0; + + cma_cancel_operation(id_priv, state); + mutex_lock(&id_priv->handler_mutex); + + /* Check for destruction from another callback. */ + if (!cma_comp(id_priv, RDMA_CM_DEVICE_REMOVAL)) + goto out; + + memset(&event, 0, sizeof event); + event.event = RDMA_CM_EVENT_DEVICE_REMOVAL; + ret = id_priv->id.event_handler(&id_priv->id, &event); +out: + mutex_unlock(&id_priv->handler_mutex); + return ret; +} + +static void cma_process_remove(struct cma_device *cma_dev) +{ + struct rdma_id_private *id_priv; + int ret; + + mutex_lock(&lock); + while (!list_empty(&cma_dev->id_list)) { + id_priv = list_entry(cma_dev->id_list.next, + struct rdma_id_private, list); + + list_del(&id_priv->listen_list); + list_del_init(&id_priv->list); + atomic_inc(&id_priv->refcount); + mutex_unlock(&lock); + + ret = id_priv->internal_id ? 1 : cma_remove_id_dev(id_priv); + cma_deref_id(id_priv); + if (ret) + rdma_destroy_id(&id_priv->id); + + mutex_lock(&lock); + } + mutex_unlock(&lock); + + cma_deref_dev(cma_dev); + wait_for_completion(&cma_dev->comp); +} + +static void cma_remove_one(struct ib_device *device) +{ + struct cma_device *cma_dev; + + cma_dev = ib_get_client_data(device, &cma_client); + if (!cma_dev) + return; + + mutex_lock(&lock); + list_del(&cma_dev->list); + mutex_unlock(&lock); + + cma_process_remove(cma_dev); + kfree(cma_dev); +} + +static int cma_get_id_stats(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct nlmsghdr *nlh; + struct rdma_cm_id_stats *id_stats; + struct rdma_id_private *id_priv; + struct rdma_cm_id *id = NULL; + struct cma_device *cma_dev; + int i_dev = 0, i_id = 0; + + /* + * We export all of the IDs as a sequence of messages. Each + * ID gets its own netlink message. + */ + mutex_lock(&lock); + + list_for_each_entry(cma_dev, &dev_list, list) { + if (i_dev < cb->args[0]) { + i_dev++; + continue; + } + + i_id = 0; + list_for_each_entry(id_priv, &cma_dev->id_list, list) { + if (i_id < cb->args[1]) { + i_id++; + continue; + } + + id_stats = ibnl_put_msg(skb, &nlh, cb->nlh->nlmsg_seq, + sizeof *id_stats, RDMA_NL_RDMA_CM, + RDMA_NL_RDMA_CM_ID_STATS, + NLM_F_MULTI); + if (!id_stats) + goto out; + + memset(id_stats, 0, sizeof *id_stats); + id = &id_priv->id; + id_stats->node_type = id->route.addr.dev_addr.dev_type; + id_stats->port_num = id->port_num; + id_stats->bound_dev_if = + id->route.addr.dev_addr.bound_dev_if; + + if (ibnl_put_attr(skb, nlh, + rdma_addr_size(cma_src_addr(id_priv)), + cma_src_addr(id_priv), + RDMA_NL_RDMA_CM_ATTR_SRC_ADDR)) + goto out; + if (ibnl_put_attr(skb, nlh, + rdma_addr_size(cma_src_addr(id_priv)), + cma_dst_addr(id_priv), + RDMA_NL_RDMA_CM_ATTR_DST_ADDR)) + goto out; + + id_stats->pid = id_priv->owner; + id_stats->port_space = id->ps; + id_stats->cm_state = id_priv->state; + id_stats->qp_num = id_priv->qp_num; + id_stats->qp_type = id->qp_type; + + i_id++; + } + + cb->args[1] = 0; + i_dev++; + } + +out: + mutex_unlock(&lock); + cb->args[0] = i_dev; + cb->args[1] = i_id; + + return skb->len; +} + +static const struct ibnl_client_cbs cma_cb_table[] = { + [RDMA_NL_RDMA_CM_ID_STATS] = { .dump = cma_get_id_stats, + .module = THIS_MODULE }, +}; + +static int __init cma_init(void) +{ + int ret; + + cma_wq = create_singlethread_workqueue("rdma_cm"); + if (!cma_wq) + return -ENOMEM; + + ib_sa_register_client(&sa_client); + rdma_addr_register_client(&addr_client); + register_netdevice_notifier(&cma_nb); + + ret = ib_register_client(&cma_client); + if (ret) + goto err; + + if (ibnl_add_client(RDMA_NL_RDMA_CM, RDMA_NL_RDMA_CM_NUM_OPS, cma_cb_table)) + printk(KERN_WARNING "RDMA CMA: failed to add netlink callback\n"); + + return 0; + +err: + unregister_netdevice_notifier(&cma_nb); + rdma_addr_unregister_client(&addr_client); + ib_sa_unregister_client(&sa_client); + destroy_workqueue(cma_wq); + return ret; +} + +static void __exit cma_cleanup(void) +{ + ibnl_remove_client(RDMA_NL_RDMA_CM); + ib_unregister_client(&cma_client); + unregister_netdevice_notifier(&cma_nb); + rdma_addr_unregister_client(&addr_client); + ib_sa_unregister_client(&sa_client); + destroy_workqueue(cma_wq); + idr_destroy(&tcp_ps); + idr_destroy(&udp_ps); + idr_destroy(&ipoib_ps); + idr_destroy(&ib_ps); +} + +module_init(cma_init); +module_exit(cma_cleanup); diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h new file mode 100644 index 0000000..87d1936 --- /dev/null +++ b/drivers/infiniband/core/core_priv.h @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2004 Topspin Communications. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _CORE_PRIV_H +#define _CORE_PRIV_H + +#include +#include + +#include + +int ib_device_register_sysfs(struct ib_device *device, + int (*port_callback)(struct ib_device *, + u8, struct kobject *)); +void ib_device_unregister_sysfs(struct ib_device *device); + +int ib_sysfs_setup(void); +void ib_sysfs_cleanup(void); + +int ib_cache_setup(void); +void ib_cache_cleanup(void); + +int ib_resolve_eth_l2_attrs(struct ib_qp *qp, + struct ib_qp_attr *qp_attr, int *qp_attr_mask); +#endif /* _CORE_PRIV_H */ diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c new file mode 100644 index 0000000..18c1ece --- /dev/null +++ b/drivers/infiniband/core/device.c @@ -0,0 +1,785 @@ +/* + * Copyright (c) 2004 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "core_priv.h" + +MODULE_AUTHOR("Roland Dreier"); +MODULE_DESCRIPTION("core kernel InfiniBand API"); +MODULE_LICENSE("Dual BSD/GPL"); + +struct ib_client_data { + struct list_head list; + struct ib_client *client; + void * data; +}; + +struct workqueue_struct *ib_wq; +EXPORT_SYMBOL_GPL(ib_wq); + +static LIST_HEAD(device_list); +static LIST_HEAD(client_list); + +/* + * device_mutex protects access to both device_list and client_list. + * There's no real point to using multiple locks or something fancier + * like an rwsem: we always access both lists, and we're always + * modifying one list or the other list. In any case this is not a + * hot path so there's no point in trying to optimize. + */ +static DEFINE_MUTEX(device_mutex); + +static int ib_device_check_mandatory(struct ib_device *device) +{ +#define IB_MANDATORY_FUNC(x) { offsetof(struct ib_device, x), #x } + static const struct { + size_t offset; + char *name; + } mandatory_table[] = { + IB_MANDATORY_FUNC(query_device), + IB_MANDATORY_FUNC(query_port), + IB_MANDATORY_FUNC(query_pkey), + IB_MANDATORY_FUNC(query_gid), + IB_MANDATORY_FUNC(alloc_pd), + IB_MANDATORY_FUNC(dealloc_pd), + IB_MANDATORY_FUNC(create_ah), + IB_MANDATORY_FUNC(destroy_ah), + IB_MANDATORY_FUNC(create_qp), + IB_MANDATORY_FUNC(modify_qp), + IB_MANDATORY_FUNC(destroy_qp), + IB_MANDATORY_FUNC(post_send), + IB_MANDATORY_FUNC(post_recv), + IB_MANDATORY_FUNC(create_cq), + IB_MANDATORY_FUNC(destroy_cq), + IB_MANDATORY_FUNC(poll_cq), + IB_MANDATORY_FUNC(req_notify_cq), + IB_MANDATORY_FUNC(get_dma_mr), + IB_MANDATORY_FUNC(dereg_mr) + }; + int i; + + for (i = 0; i < ARRAY_SIZE(mandatory_table); ++i) { + if (!*(void **) ((void *) device + mandatory_table[i].offset)) { + printk(KERN_WARNING "Device %s is missing mandatory function %s\n", + device->name, mandatory_table[i].name); + return -EINVAL; + } + } + + return 0; +} + +static struct ib_device *__ib_device_get_by_name(const char *name) +{ + struct ib_device *device; + + list_for_each_entry(device, &device_list, core_list) + if (!strncmp(name, device->name, IB_DEVICE_NAME_MAX)) + return device; + + return NULL; +} + + +static int alloc_name(char *name) +{ + unsigned long *inuse; + char buf[IB_DEVICE_NAME_MAX]; + struct ib_device *device; + int i; + + inuse = (unsigned long *) get_zeroed_page(GFP_KERNEL); + if (!inuse) + return -ENOMEM; + + list_for_each_entry(device, &device_list, core_list) { + if (!sscanf(device->name, name, &i)) + continue; + if (i < 0 || i >= PAGE_SIZE * 8) + continue; + snprintf(buf, sizeof buf, name, i); + if (!strncmp(buf, device->name, IB_DEVICE_NAME_MAX)) + set_bit(i, inuse); + } + + i = find_first_zero_bit(inuse, PAGE_SIZE * 8); + free_page((unsigned long) inuse); + snprintf(buf, sizeof buf, name, i); + + if (__ib_device_get_by_name(buf)) + return -ENFILE; + + strlcpy(name, buf, IB_DEVICE_NAME_MAX); + return 0; +} + +static int start_port(struct ib_device *device) +{ + return (device->node_type == RDMA_NODE_IB_SWITCH) ? 0 : 1; +} + + +static int end_port(struct ib_device *device) +{ + return (device->node_type == RDMA_NODE_IB_SWITCH) ? + 0 : device->phys_port_cnt; +} + +/** + * ib_alloc_device - allocate an IB device struct + * @size:size of structure to allocate + * + * Low-level drivers should use ib_alloc_device() to allocate &struct + * ib_device. @size is the size of the structure to be allocated, + * including any private data used by the low-level driver. + * ib_dealloc_device() must be used to free structures allocated with + * ib_alloc_device(). + */ +struct ib_device *ib_alloc_device(size_t size) +{ + BUG_ON(size < sizeof (struct ib_device)); + + return kzalloc(size, GFP_KERNEL); +} +EXPORT_SYMBOL(ib_alloc_device); + +/** + * ib_dealloc_device - free an IB device struct + * @device:structure to free + * + * Free a structure allocated with ib_alloc_device(). + */ +void ib_dealloc_device(struct ib_device *device) +{ + if (device->reg_state == IB_DEV_UNINITIALIZED) { + kfree(device); + return; + } + + BUG_ON(device->reg_state != IB_DEV_UNREGISTERED); + + kobject_put(&device->dev.kobj); +} +EXPORT_SYMBOL(ib_dealloc_device); + +static int add_client_context(struct ib_device *device, struct ib_client *client) +{ + struct ib_client_data *context; + unsigned long flags; + + context = kmalloc(sizeof *context, GFP_KERNEL); + if (!context) { + printk(KERN_WARNING "Couldn't allocate client context for %s/%s\n", + device->name, client->name); + return -ENOMEM; + } + + context->client = client; + context->data = NULL; + + spin_lock_irqsave(&device->client_data_lock, flags); + list_add(&context->list, &device->client_data_list); + spin_unlock_irqrestore(&device->client_data_lock, flags); + + return 0; +} + +static int read_port_table_lengths(struct ib_device *device) +{ + struct ib_port_attr *tprops = NULL; + int num_ports, ret = -ENOMEM; + u8 port_index; + + tprops = kmalloc(sizeof *tprops, GFP_KERNEL); + if (!tprops) + goto out; + + num_ports = end_port(device) - start_port(device) + 1; + + device->pkey_tbl_len = kmalloc(sizeof *device->pkey_tbl_len * num_ports, + GFP_KERNEL); + device->gid_tbl_len = kmalloc(sizeof *device->gid_tbl_len * num_ports, + GFP_KERNEL); + if (!device->pkey_tbl_len || !device->gid_tbl_len) + goto err; + + for (port_index = 0; port_index < num_ports; ++port_index) { + ret = ib_query_port(device, port_index + start_port(device), + tprops); + if (ret) + goto err; + device->pkey_tbl_len[port_index] = tprops->pkey_tbl_len; + device->gid_tbl_len[port_index] = tprops->gid_tbl_len; + } + + ret = 0; + goto out; + +err: + kfree(device->gid_tbl_len); + kfree(device->pkey_tbl_len); +out: + kfree(tprops); + return ret; +} + +/** + * ib_register_device - Register an IB device with IB core + * @device:Device to register + * + * Low-level drivers use ib_register_device() to register their + * devices with the IB core. All registered clients will receive a + * callback for each device that is added. @device must be allocated + * with ib_alloc_device(). + */ +int ib_register_device(struct ib_device *device, + int (*port_callback)(struct ib_device *, + u8, struct kobject *)) +{ + int ret; + + mutex_lock(&device_mutex); + + if (strchr(device->name, '%')) { + ret = alloc_name(device->name); + if (ret) + goto out; + } + + if (ib_device_check_mandatory(device)) { + ret = -EINVAL; + goto out; + } + + INIT_LIST_HEAD(&device->event_handler_list); + INIT_LIST_HEAD(&device->client_data_list); + spin_lock_init(&device->event_handler_lock); + spin_lock_init(&device->client_data_lock); + + ret = read_port_table_lengths(device); + if (ret) { + printk(KERN_WARNING "Couldn't create table lengths cache for device %s\n", + device->name); + goto out; + } + + ret = ib_device_register_sysfs(device, port_callback); + if (ret) { + printk(KERN_WARNING "Couldn't register device %s with driver model\n", + device->name); + kfree(device->gid_tbl_len); + kfree(device->pkey_tbl_len); + goto out; + } + + list_add_tail(&device->core_list, &device_list); + + device->reg_state = IB_DEV_REGISTERED; + + { + struct ib_client *client; + + list_for_each_entry(client, &client_list, list) + if (client->add && !add_client_context(device, client)) + client->add(device); + } + + out: + mutex_unlock(&device_mutex); + return ret; +} +EXPORT_SYMBOL(ib_register_device); + +/** + * ib_unregister_device - Unregister an IB device + * @device:Device to unregister + * + * Unregister an IB device. All clients will receive a remove callback. + */ +void ib_unregister_device(struct ib_device *device) +{ + struct ib_client *client; + struct ib_client_data *context, *tmp; + unsigned long flags; + + mutex_lock(&device_mutex); + + list_for_each_entry_reverse(client, &client_list, list) + if (client->remove) + client->remove(device); + + list_del(&device->core_list); + + kfree(device->gid_tbl_len); + kfree(device->pkey_tbl_len); + + mutex_unlock(&device_mutex); + + ib_device_unregister_sysfs(device); + + spin_lock_irqsave(&device->client_data_lock, flags); + list_for_each_entry_safe(context, tmp, &device->client_data_list, list) + kfree(context); + spin_unlock_irqrestore(&device->client_data_lock, flags); + + device->reg_state = IB_DEV_UNREGISTERED; +} +EXPORT_SYMBOL(ib_unregister_device); + +/** + * ib_register_client - Register an IB client + * @client:Client to register + * + * Upper level users of the IB drivers can use ib_register_client() to + * register callbacks for IB device addition and removal. When an IB + * device is added, each registered client's add method will be called + * (in the order the clients were registered), and when a device is + * removed, each client's remove method will be called (in the reverse + * order that clients were registered). In addition, when + * ib_register_client() is called, the client will receive an add + * callback for all devices already registered. + */ +int ib_register_client(struct ib_client *client) +{ + struct ib_device *device; + + mutex_lock(&device_mutex); + + list_add_tail(&client->list, &client_list); + list_for_each_entry(device, &device_list, core_list) + if (client->add && !add_client_context(device, client)) + client->add(device); + + mutex_unlock(&device_mutex); + + return 0; +} +EXPORT_SYMBOL(ib_register_client); + +/** + * ib_unregister_client - Unregister an IB client + * @client:Client to unregister + * + * Upper level users use ib_unregister_client() to remove their client + * registration. When ib_unregister_client() is called, the client + * will receive a remove callback for each IB device still registered. + */ +void ib_unregister_client(struct ib_client *client) +{ + struct ib_client_data *context, *tmp; + struct ib_device *device; + unsigned long flags; + + mutex_lock(&device_mutex); + + list_for_each_entry(device, &device_list, core_list) { + if (client->remove) + client->remove(device); + + spin_lock_irqsave(&device->client_data_lock, flags); + list_for_each_entry_safe(context, tmp, &device->client_data_list, list) + if (context->client == client) { + list_del(&context->list); + kfree(context); + } + spin_unlock_irqrestore(&device->client_data_lock, flags); + } + list_del(&client->list); + + mutex_unlock(&device_mutex); +} +EXPORT_SYMBOL(ib_unregister_client); + +/** + * ib_get_client_data - Get IB client context + * @device:Device to get context for + * @client:Client to get context for + * + * ib_get_client_data() returns client context set with + * ib_set_client_data(). + */ +void *ib_get_client_data(struct ib_device *device, struct ib_client *client) +{ + struct ib_client_data *context; + void *ret = NULL; + unsigned long flags; + + spin_lock_irqsave(&device->client_data_lock, flags); + list_for_each_entry(context, &device->client_data_list, list) + if (context->client == client) { + ret = context->data; + break; + } + spin_unlock_irqrestore(&device->client_data_lock, flags); + + return ret; +} +EXPORT_SYMBOL(ib_get_client_data); + +/** + * ib_set_client_data - Set IB client context + * @device:Device to set context for + * @client:Client to set context for + * @data:Context to set + * + * ib_set_client_data() sets client context that can be retrieved with + * ib_get_client_data(). + */ +void ib_set_client_data(struct ib_device *device, struct ib_client *client, + void *data) +{ + struct ib_client_data *context; + unsigned long flags; + + spin_lock_irqsave(&device->client_data_lock, flags); + list_for_each_entry(context, &device->client_data_list, list) + if (context->client == client) { + context->data = data; + goto out; + } + + printk(KERN_WARNING "No client context found for %s/%s\n", + device->name, client->name); + +out: + spin_unlock_irqrestore(&device->client_data_lock, flags); +} +EXPORT_SYMBOL(ib_set_client_data); + +/** + * ib_register_event_handler - Register an IB event handler + * @event_handler:Handler to register + * + * ib_register_event_handler() registers an event handler that will be + * called back when asynchronous IB events occur (as defined in + * chapter 11 of the InfiniBand Architecture Specification). This + * callback may occur in interrupt context. + */ +int ib_register_event_handler (struct ib_event_handler *event_handler) +{ + unsigned long flags; + + spin_lock_irqsave(&event_handler->device->event_handler_lock, flags); + list_add_tail(&event_handler->list, + &event_handler->device->event_handler_list); + spin_unlock_irqrestore(&event_handler->device->event_handler_lock, flags); + + return 0; +} +EXPORT_SYMBOL(ib_register_event_handler); + +/** + * ib_unregister_event_handler - Unregister an event handler + * @event_handler:Handler to unregister + * + * Unregister an event handler registered with + * ib_register_event_handler(). + */ +int ib_unregister_event_handler(struct ib_event_handler *event_handler) +{ + unsigned long flags; + + spin_lock_irqsave(&event_handler->device->event_handler_lock, flags); + list_del(&event_handler->list); + spin_unlock_irqrestore(&event_handler->device->event_handler_lock, flags); + + return 0; +} +EXPORT_SYMBOL(ib_unregister_event_handler); + +/** + * ib_dispatch_event - Dispatch an asynchronous event + * @event:Event to dispatch + * + * Low-level drivers must call ib_dispatch_event() to dispatch the + * event to all registered event handlers when an asynchronous event + * occurs. + */ +void ib_dispatch_event(struct ib_event *event) +{ + unsigned long flags; + struct ib_event_handler *handler; + + spin_lock_irqsave(&event->device->event_handler_lock, flags); + + list_for_each_entry(handler, &event->device->event_handler_list, list) + handler->handler(handler, event); + + spin_unlock_irqrestore(&event->device->event_handler_lock, flags); +} +EXPORT_SYMBOL(ib_dispatch_event); + +/** + * ib_query_device - Query IB device attributes + * @device:Device to query + * @device_attr:Device attributes + * + * ib_query_device() returns the attributes of a device through the + * @device_attr pointer. + */ +int ib_query_device(struct ib_device *device, + struct ib_device_attr *device_attr) +{ + return device->query_device(device, device_attr); +} +EXPORT_SYMBOL(ib_query_device); + +/** + * ib_query_port - Query IB port attributes + * @device:Device to query + * @port_num:Port number to query + * @port_attr:Port attributes + * + * ib_query_port() returns the attributes of a port through the + * @port_attr pointer. + */ +int ib_query_port(struct ib_device *device, + u8 port_num, + struct ib_port_attr *port_attr) +{ + if (port_num < start_port(device) || port_num > end_port(device)) + return -EINVAL; + + return device->query_port(device, port_num, port_attr); +} +EXPORT_SYMBOL(ib_query_port); + +/** + * ib_query_gid - Get GID table entry + * @device:Device to query + * @port_num:Port number to query + * @index:GID table index to query + * @gid:Returned GID + * + * ib_query_gid() fetches the specified GID table entry. + */ +int ib_query_gid(struct ib_device *device, + u8 port_num, int index, union ib_gid *gid) +{ + return device->query_gid(device, port_num, index, gid); +} +EXPORT_SYMBOL(ib_query_gid); + +/** + * ib_query_pkey - Get P_Key table entry + * @device:Device to query + * @port_num:Port number to query + * @index:P_Key table index to query + * @pkey:Returned P_Key + * + * ib_query_pkey() fetches the specified P_Key table entry. + */ +int ib_query_pkey(struct ib_device *device, + u8 port_num, u16 index, u16 *pkey) +{ + return device->query_pkey(device, port_num, index, pkey); +} +EXPORT_SYMBOL(ib_query_pkey); + +/** + * ib_modify_device - Change IB device attributes + * @device:Device to modify + * @device_modify_mask:Mask of attributes to change + * @device_modify:New attribute values + * + * ib_modify_device() changes a device's attributes as specified by + * the @device_modify_mask and @device_modify structure. + */ +int ib_modify_device(struct ib_device *device, + int device_modify_mask, + struct ib_device_modify *device_modify) +{ + if (!device->modify_device) + return -ENOSYS; + + return device->modify_device(device, device_modify_mask, + device_modify); +} +EXPORT_SYMBOL(ib_modify_device); + +/** + * ib_modify_port - Modifies the attributes for the specified port. + * @device: The device to modify. + * @port_num: The number of the port to modify. + * @port_modify_mask: Mask used to specify which attributes of the port + * to change. + * @port_modify: New attribute values for the port. + * + * ib_modify_port() changes a port's attributes as specified by the + * @port_modify_mask and @port_modify structure. + */ +int ib_modify_port(struct ib_device *device, + u8 port_num, int port_modify_mask, + struct ib_port_modify *port_modify) +{ + if (!device->modify_port) + return -ENOSYS; + + if (port_num < start_port(device) || port_num > end_port(device)) + return -EINVAL; + + return device->modify_port(device, port_num, port_modify_mask, + port_modify); +} +EXPORT_SYMBOL(ib_modify_port); + +/** + * ib_find_gid - Returns the port number and GID table index where + * a specified GID value occurs. + * @device: The device to query. + * @gid: The GID value to search for. + * @port_num: The port number of the device where the GID value was found. + * @index: The index into the GID table where the GID was found. This + * parameter may be NULL. + */ +int ib_find_gid(struct ib_device *device, union ib_gid *gid, + u8 *port_num, u16 *index) +{ + union ib_gid tmp_gid; + int ret, port, i; + + for (port = start_port(device); port <= end_port(device); ++port) { + for (i = 0; i < device->gid_tbl_len[port - start_port(device)]; ++i) { + ret = ib_query_gid(device, port, i, &tmp_gid); + if (ret) + return ret; + if (!memcmp(&tmp_gid, gid, sizeof *gid)) { + *port_num = port; + if (index) + *index = i; + return 0; + } + } + } + + return -ENOENT; +} +EXPORT_SYMBOL(ib_find_gid); + +/** + * ib_find_pkey - Returns the PKey table index where a specified + * PKey value occurs. + * @device: The device to query. + * @port_num: The port number of the device to search for the PKey. + * @pkey: The PKey value to search for. + * @index: The index into the PKey table where the PKey was found. + */ +int ib_find_pkey(struct ib_device *device, + u8 port_num, u16 pkey, u16 *index) +{ + int ret, i; + u16 tmp_pkey; + int partial_ix = -1; + + for (i = 0; i < device->pkey_tbl_len[port_num - start_port(device)]; ++i) { + ret = ib_query_pkey(device, port_num, i, &tmp_pkey); + if (ret) + return ret; + if ((pkey & 0x7fff) == (tmp_pkey & 0x7fff)) { + /* if there is full-member pkey take it.*/ + if (tmp_pkey & 0x8000) { + *index = i; + return 0; + } + if (partial_ix < 0) + partial_ix = i; + } + } + + /*no full-member, if exists take the limited*/ + if (partial_ix >= 0) { + *index = partial_ix; + return 0; + } + return -ENOENT; +} +EXPORT_SYMBOL(ib_find_pkey); + +static int __init ib_core_init(void) +{ + int ret; + + ib_wq = alloc_workqueue("infiniband", 0, 0); + if (!ib_wq) + return -ENOMEM; + + ret = ib_sysfs_setup(); + if (ret) { + printk(KERN_WARNING "Couldn't create InfiniBand device class\n"); + goto err; + } + + ret = ibnl_init(); + if (ret) { + printk(KERN_WARNING "Couldn't init IB netlink interface\n"); + goto err_sysfs; + } + + ret = ib_cache_setup(); + if (ret) { + printk(KERN_WARNING "Couldn't set up InfiniBand P_Key/GID cache\n"); + goto err_nl; + } + + return 0; + +err_nl: + ibnl_cleanup(); + +err_sysfs: + ib_sysfs_cleanup(); + +err: + destroy_workqueue(ib_wq); + return ret; +} + +static void __exit ib_core_cleanup(void) +{ + ib_cache_cleanup(); + ibnl_cleanup(); + ib_sysfs_cleanup(); + /* Make sure that any pending umem accounting work is done. */ + destroy_workqueue(ib_wq); +} + +module_init(ib_core_init); +module_exit(ib_core_cleanup); diff --git a/drivers/infiniband/core/fmr_pool.c b/drivers/infiniband/core/fmr_pool.c new file mode 100644 index 0000000..9f5ad7c --- /dev/null +++ b/drivers/infiniband/core/fmr_pool.c @@ -0,0 +1,544 @@ +/* + * Copyright (c) 2004 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include + +#include + +#include "core_priv.h" + +#define PFX "fmr_pool: " + +enum { + IB_FMR_MAX_REMAPS = 32, + + IB_FMR_HASH_BITS = 8, + IB_FMR_HASH_SIZE = 1 << IB_FMR_HASH_BITS, + IB_FMR_HASH_MASK = IB_FMR_HASH_SIZE - 1 +}; + +/* + * If an FMR is not in use, then the list member will point to either + * its pool's free_list (if the FMR can be mapped again; that is, + * remap_count < pool->max_remaps) or its pool's dirty_list (if the + * FMR needs to be unmapped before being remapped). In either of + * these cases it is a bug if the ref_count is not 0. In other words, + * if ref_count is > 0, then the list member must not be linked into + * either free_list or dirty_list. + * + * The cache_node member is used to link the FMR into a cache bucket + * (if caching is enabled). This is independent of the reference + * count of the FMR. When a valid FMR is released, its ref_count is + * decremented, and if ref_count reaches 0, the FMR is placed in + * either free_list or dirty_list as appropriate. However, it is not + * removed from the cache and may be "revived" if a call to + * ib_fmr_register_physical() occurs before the FMR is remapped. In + * this case we just increment the ref_count and remove the FMR from + * free_list/dirty_list. + * + * Before we remap an FMR from free_list, we remove it from the cache + * (to prevent another user from obtaining a stale FMR). When an FMR + * is released, we add it to the tail of the free list, so that our + * cache eviction policy is "least recently used." + * + * All manipulation of ref_count, list and cache_node is protected by + * pool_lock to maintain consistency. + */ + +struct ib_fmr_pool { + spinlock_t pool_lock; + + int pool_size; + int max_pages; + int max_remaps; + int dirty_watermark; + int dirty_len; + struct list_head free_list; + struct list_head dirty_list; + struct hlist_head *cache_bucket; + + void (*flush_function)(struct ib_fmr_pool *pool, + void * arg); + void *flush_arg; + + struct task_struct *thread; + + atomic_t req_ser; + atomic_t flush_ser; + + wait_queue_head_t force_wait; +}; + +static inline u32 ib_fmr_hash(u64 first_page) +{ + return jhash_2words((u32) first_page, (u32) (first_page >> 32), 0) & + (IB_FMR_HASH_SIZE - 1); +} + +/* Caller must hold pool_lock */ +static inline struct ib_pool_fmr *ib_fmr_cache_lookup(struct ib_fmr_pool *pool, + u64 *page_list, + int page_list_len, + u64 io_virtual_address) +{ + struct hlist_head *bucket; + struct ib_pool_fmr *fmr; + + if (!pool->cache_bucket) + return NULL; + + bucket = pool->cache_bucket + ib_fmr_hash(*page_list); + + hlist_for_each_entry(fmr, bucket, cache_node) + if (io_virtual_address == fmr->io_virtual_address && + page_list_len == fmr->page_list_len && + !memcmp(page_list, fmr->page_list, + page_list_len * sizeof *page_list)) + return fmr; + + return NULL; +} + +static void ib_fmr_batch_release(struct ib_fmr_pool *pool) +{ + int ret; + struct ib_pool_fmr *fmr; + LIST_HEAD(unmap_list); + LIST_HEAD(fmr_list); + + spin_lock_irq(&pool->pool_lock); + + list_for_each_entry(fmr, &pool->dirty_list, list) { + hlist_del_init(&fmr->cache_node); + fmr->remap_count = 0; + list_add_tail(&fmr->fmr->list, &fmr_list); + +#ifdef DEBUG + if (fmr->ref_count !=0) { + printk(KERN_WARNING PFX "Unmapping FMR 0x%08x with ref count %d\n", + fmr, fmr->ref_count); + } +#endif + } + + list_splice_init(&pool->dirty_list, &unmap_list); + pool->dirty_len = 0; + + spin_unlock_irq(&pool->pool_lock); + + if (list_empty(&unmap_list)) { + return; + } + + ret = ib_unmap_fmr(&fmr_list); + if (ret) + printk(KERN_WARNING PFX "ib_unmap_fmr returned %d\n", ret); + + spin_lock_irq(&pool->pool_lock); + list_splice(&unmap_list, &pool->free_list); + spin_unlock_irq(&pool->pool_lock); +} + +static int ib_fmr_cleanup_thread(void *pool_ptr) +{ + struct ib_fmr_pool *pool = pool_ptr; + + do { + if (atomic_read(&pool->flush_ser) - atomic_read(&pool->req_ser) < 0) { + ib_fmr_batch_release(pool); + + atomic_inc(&pool->flush_ser); + wake_up_interruptible(&pool->force_wait); + + if (pool->flush_function) + pool->flush_function(pool, pool->flush_arg); + } + + set_current_state(TASK_INTERRUPTIBLE); + if (atomic_read(&pool->flush_ser) - atomic_read(&pool->req_ser) >= 0 && + !kthread_should_stop()) + schedule(); + __set_current_state(TASK_RUNNING); + } while (!kthread_should_stop()); + + return 0; +} + +/** + * ib_create_fmr_pool - Create an FMR pool + * @pd:Protection domain for FMRs + * @params:FMR pool parameters + * + * Create a pool of FMRs. Return value is pointer to new pool or + * error code if creation failed. + */ +struct ib_fmr_pool *ib_create_fmr_pool(struct ib_pd *pd, + struct ib_fmr_pool_param *params) +{ + struct ib_device *device; + struct ib_fmr_pool *pool; + struct ib_device_attr *attr; + int i; + int ret; + int max_remaps; + + if (!params) + return ERR_PTR(-EINVAL); + + device = pd->device; + if (!device->alloc_fmr || !device->dealloc_fmr || + !device->map_phys_fmr || !device->unmap_fmr) { + printk(KERN_INFO PFX "Device %s does not support FMRs\n", + device->name); + return ERR_PTR(-ENOSYS); + } + + attr = kmalloc(sizeof *attr, GFP_KERNEL); + if (!attr) { + printk(KERN_WARNING PFX "couldn't allocate device attr struct\n"); + return ERR_PTR(-ENOMEM); + } + + ret = ib_query_device(device, attr); + if (ret) { + printk(KERN_WARNING PFX "couldn't query device: %d\n", ret); + kfree(attr); + return ERR_PTR(ret); + } + + if (!attr->max_map_per_fmr) + max_remaps = IB_FMR_MAX_REMAPS; + else + max_remaps = attr->max_map_per_fmr; + + kfree(attr); + + pool = kmalloc(sizeof *pool, GFP_KERNEL); + if (!pool) { + printk(KERN_WARNING PFX "couldn't allocate pool struct\n"); + return ERR_PTR(-ENOMEM); + } + + pool->cache_bucket = NULL; + + pool->flush_function = params->flush_function; + pool->flush_arg = params->flush_arg; + + INIT_LIST_HEAD(&pool->free_list); + INIT_LIST_HEAD(&pool->dirty_list); + + if (params->cache) { + pool->cache_bucket = + kmalloc(IB_FMR_HASH_SIZE * sizeof *pool->cache_bucket, + GFP_KERNEL); + if (!pool->cache_bucket) { + printk(KERN_WARNING PFX "Failed to allocate cache in pool\n"); + ret = -ENOMEM; + goto out_free_pool; + } + + for (i = 0; i < IB_FMR_HASH_SIZE; ++i) + INIT_HLIST_HEAD(pool->cache_bucket + i); + } + + pool->pool_size = 0; + pool->max_pages = params->max_pages_per_fmr; + pool->max_remaps = max_remaps; + pool->dirty_watermark = params->dirty_watermark; + pool->dirty_len = 0; + spin_lock_init(&pool->pool_lock); + atomic_set(&pool->req_ser, 0); + atomic_set(&pool->flush_ser, 0); + init_waitqueue_head(&pool->force_wait); + + pool->thread = kthread_run(ib_fmr_cleanup_thread, + pool, + "ib_fmr(%s)", + device->name); + if (IS_ERR(pool->thread)) { + printk(KERN_WARNING PFX "couldn't start cleanup thread\n"); + ret = PTR_ERR(pool->thread); + goto out_free_pool; + } + + { + struct ib_pool_fmr *fmr; + struct ib_fmr_attr fmr_attr = { + .max_pages = params->max_pages_per_fmr, + .max_maps = pool->max_remaps, + .page_shift = params->page_shift + }; + int bytes_per_fmr = sizeof *fmr; + + if (pool->cache_bucket) + bytes_per_fmr += params->max_pages_per_fmr * sizeof (u64); + + for (i = 0; i < params->pool_size; ++i) { + fmr = kmalloc(bytes_per_fmr, GFP_KERNEL); + if (!fmr) { + printk(KERN_WARNING PFX "failed to allocate fmr " + "struct for FMR %d\n", i); + goto out_fail; + } + + fmr->pool = pool; + fmr->remap_count = 0; + fmr->ref_count = 0; + INIT_HLIST_NODE(&fmr->cache_node); + + fmr->fmr = ib_alloc_fmr(pd, params->access, &fmr_attr); + if (IS_ERR(fmr->fmr)) { + printk(KERN_WARNING PFX "fmr_create failed " + "for FMR %d\n", i); + kfree(fmr); + goto out_fail; + } + + list_add_tail(&fmr->list, &pool->free_list); + ++pool->pool_size; + } + } + + return pool; + + out_free_pool: + kfree(pool->cache_bucket); + kfree(pool); + + return ERR_PTR(ret); + + out_fail: + ib_destroy_fmr_pool(pool); + + return ERR_PTR(-ENOMEM); +} +EXPORT_SYMBOL(ib_create_fmr_pool); + +/** + * ib_destroy_fmr_pool - Free FMR pool + * @pool:FMR pool to free + * + * Destroy an FMR pool and free all associated resources. + */ +void ib_destroy_fmr_pool(struct ib_fmr_pool *pool) +{ + struct ib_pool_fmr *fmr; + struct ib_pool_fmr *tmp; + LIST_HEAD(fmr_list); + int i; + + kthread_stop(pool->thread); + ib_fmr_batch_release(pool); + + i = 0; + list_for_each_entry_safe(fmr, tmp, &pool->free_list, list) { + if (fmr->remap_count) { + INIT_LIST_HEAD(&fmr_list); + list_add_tail(&fmr->fmr->list, &fmr_list); + ib_unmap_fmr(&fmr_list); + } + ib_dealloc_fmr(fmr->fmr); + list_del(&fmr->list); + kfree(fmr); + ++i; + } + + if (i < pool->pool_size) + printk(KERN_WARNING PFX "pool still has %d regions registered\n", + pool->pool_size - i); + + kfree(pool->cache_bucket); + kfree(pool); +} +EXPORT_SYMBOL(ib_destroy_fmr_pool); + +/** + * ib_flush_fmr_pool - Invalidate all unmapped FMRs + * @pool:FMR pool to flush + * + * Ensure that all unmapped FMRs are fully invalidated. + */ +int ib_flush_fmr_pool(struct ib_fmr_pool *pool) +{ + int serial; + struct ib_pool_fmr *fmr, *next; + + /* + * The free_list holds FMRs that may have been used + * but have not been remapped enough times to be dirty. + * Put them on the dirty list now so that the cleanup + * thread will reap them too. + */ + spin_lock_irq(&pool->pool_lock); + list_for_each_entry_safe(fmr, next, &pool->free_list, list) { + if (fmr->remap_count > 0) + list_move(&fmr->list, &pool->dirty_list); + } + spin_unlock_irq(&pool->pool_lock); + + serial = atomic_inc_return(&pool->req_ser); + wake_up_process(pool->thread); + + if (wait_event_interruptible(pool->force_wait, + atomic_read(&pool->flush_ser) - serial >= 0)) + return -EINTR; + + return 0; +} +EXPORT_SYMBOL(ib_flush_fmr_pool); + +/** + * ib_fmr_pool_map_phys - + * @pool:FMR pool to allocate FMR from + * @page_list:List of pages to map + * @list_len:Number of pages in @page_list + * @io_virtual_address:I/O virtual address for new FMR + * + * Map an FMR from an FMR pool. + */ +struct ib_pool_fmr *ib_fmr_pool_map_phys(struct ib_fmr_pool *pool_handle, + u64 *page_list, + int list_len, + u64 io_virtual_address) +{ + struct ib_fmr_pool *pool = pool_handle; + struct ib_pool_fmr *fmr; + unsigned long flags; + int result; + + if (list_len < 1 || list_len > pool->max_pages) + return ERR_PTR(-EINVAL); + + spin_lock_irqsave(&pool->pool_lock, flags); + fmr = ib_fmr_cache_lookup(pool, + page_list, + list_len, + io_virtual_address); + if (fmr) { + /* found in cache */ + ++fmr->ref_count; + if (fmr->ref_count == 1) { + list_del(&fmr->list); + } + + spin_unlock_irqrestore(&pool->pool_lock, flags); + + return fmr; + } + + if (list_empty(&pool->free_list)) { + spin_unlock_irqrestore(&pool->pool_lock, flags); + return ERR_PTR(-EAGAIN); + } + + fmr = list_entry(pool->free_list.next, struct ib_pool_fmr, list); + list_del(&fmr->list); + hlist_del_init(&fmr->cache_node); + spin_unlock_irqrestore(&pool->pool_lock, flags); + + result = ib_map_phys_fmr(fmr->fmr, page_list, list_len, + io_virtual_address); + + if (result) { + spin_lock_irqsave(&pool->pool_lock, flags); + list_add(&fmr->list, &pool->free_list); + spin_unlock_irqrestore(&pool->pool_lock, flags); + + printk(KERN_WARNING PFX "fmr_map returns %d\n", result); + + return ERR_PTR(result); + } + + ++fmr->remap_count; + fmr->ref_count = 1; + + if (pool->cache_bucket) { + fmr->io_virtual_address = io_virtual_address; + fmr->page_list_len = list_len; + memcpy(fmr->page_list, page_list, list_len * sizeof(*page_list)); + + spin_lock_irqsave(&pool->pool_lock, flags); + hlist_add_head(&fmr->cache_node, + pool->cache_bucket + ib_fmr_hash(fmr->page_list[0])); + spin_unlock_irqrestore(&pool->pool_lock, flags); + } + + return fmr; +} +EXPORT_SYMBOL(ib_fmr_pool_map_phys); + +/** + * ib_fmr_pool_unmap - Unmap FMR + * @fmr:FMR to unmap + * + * Unmap an FMR. The FMR mapping may remain valid until the FMR is + * reused (or until ib_flush_fmr_pool() is called). + */ +int ib_fmr_pool_unmap(struct ib_pool_fmr *fmr) +{ + struct ib_fmr_pool *pool; + unsigned long flags; + + pool = fmr->pool; + + spin_lock_irqsave(&pool->pool_lock, flags); + + --fmr->ref_count; + if (!fmr->ref_count) { + if (fmr->remap_count < pool->max_remaps) { + list_add_tail(&fmr->list, &pool->free_list); + } else { + list_add_tail(&fmr->list, &pool->dirty_list); + if (++pool->dirty_len >= pool->dirty_watermark) { + atomic_inc(&pool->req_ser); + wake_up_process(pool->thread); + } + } + } + +#ifdef DEBUG + if (fmr->ref_count < 0) + printk(KERN_WARNING PFX "FMR %p has ref count %d < 0\n", + fmr, fmr->ref_count); +#endif + + spin_unlock_irqrestore(&pool->pool_lock, flags); + + return 0; +} +EXPORT_SYMBOL(ib_fmr_pool_unmap); diff --git a/drivers/infiniband/core/iwcm.c b/drivers/infiniband/core/iwcm.c new file mode 100644 index 0000000..ff9163d --- /dev/null +++ b/drivers/infiniband/core/iwcm.c @@ -0,0 +1,1069 @@ +/* + * Copyright (c) 2004, 2005 Intel Corporation. All rights reserved. + * Copyright (c) 2004 Topspin Corporation. All rights reserved. + * Copyright (c) 2004, 2005 Voltaire Corporation. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved. + * Copyright (c) 2005 Network Appliance, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "iwcm.h" + +MODULE_AUTHOR("Tom Tucker"); +MODULE_DESCRIPTION("iWARP CM"); +MODULE_LICENSE("Dual BSD/GPL"); + +static struct workqueue_struct *iwcm_wq; +struct iwcm_work { + struct work_struct work; + struct iwcm_id_private *cm_id; + struct list_head list; + struct iw_cm_event event; + struct list_head free_list; +}; + +static unsigned int default_backlog = 256; + +static struct ctl_table_header *iwcm_ctl_table_hdr; +static struct ctl_table iwcm_ctl_table[] = { + { + .procname = "default_backlog", + .data = &default_backlog, + .maxlen = sizeof(default_backlog), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { } +}; + +/* + * The following services provide a mechanism for pre-allocating iwcm_work + * elements. The design pre-allocates them based on the cm_id type: + * LISTENING IDS: Get enough elements preallocated to handle the + * listen backlog. + * ACTIVE IDS: 4: CONNECT_REPLY, ESTABLISHED, DISCONNECT, CLOSE + * PASSIVE IDS: 3: ESTABLISHED, DISCONNECT, CLOSE + * + * Allocating them in connect and listen avoids having to deal + * with allocation failures on the event upcall from the provider (which + * is called in the interrupt context). + * + * One exception is when creating the cm_id for incoming connection requests. + * There are two cases: + * 1) in the event upcall, cm_event_handler(), for a listening cm_id. If + * the backlog is exceeded, then no more connection request events will + * be processed. cm_event_handler() returns -ENOMEM in this case. Its up + * to the provider to reject the connection request. + * 2) in the connection request workqueue handler, cm_conn_req_handler(). + * If work elements cannot be allocated for the new connect request cm_id, + * then IWCM will call the provider reject method. This is ok since + * cm_conn_req_handler() runs in the workqueue thread context. + */ + +static struct iwcm_work *get_work(struct iwcm_id_private *cm_id_priv) +{ + struct iwcm_work *work; + + if (list_empty(&cm_id_priv->work_free_list)) + return NULL; + work = list_entry(cm_id_priv->work_free_list.next, struct iwcm_work, + free_list); + list_del_init(&work->free_list); + return work; +} + +static void put_work(struct iwcm_work *work) +{ + list_add(&work->free_list, &work->cm_id->work_free_list); +} + +static void dealloc_work_entries(struct iwcm_id_private *cm_id_priv) +{ + struct list_head *e, *tmp; + + list_for_each_safe(e, tmp, &cm_id_priv->work_free_list) + kfree(list_entry(e, struct iwcm_work, free_list)); +} + +static int alloc_work_entries(struct iwcm_id_private *cm_id_priv, int count) +{ + struct iwcm_work *work; + + BUG_ON(!list_empty(&cm_id_priv->work_free_list)); + while (count--) { + work = kmalloc(sizeof(struct iwcm_work), GFP_KERNEL); + if (!work) { + dealloc_work_entries(cm_id_priv); + return -ENOMEM; + } + work->cm_id = cm_id_priv; + INIT_LIST_HEAD(&work->list); + put_work(work); + } + return 0; +} + +/* + * Save private data from incoming connection requests to + * iw_cm_event, so the low level driver doesn't have to. Adjust + * the event ptr to point to the local copy. + */ +static int copy_private_data(struct iw_cm_event *event) +{ + void *p; + + p = kmemdup(event->private_data, event->private_data_len, GFP_ATOMIC); + if (!p) + return -ENOMEM; + event->private_data = p; + return 0; +} + +static void free_cm_id(struct iwcm_id_private *cm_id_priv) +{ + dealloc_work_entries(cm_id_priv); + kfree(cm_id_priv); +} + +/* + * Release a reference on cm_id. If the last reference is being + * released, enable the waiting thread (in iw_destroy_cm_id) to + * get woken up, and return 1 if a thread is already waiting. + */ +static int iwcm_deref_id(struct iwcm_id_private *cm_id_priv) +{ + BUG_ON(atomic_read(&cm_id_priv->refcount)==0); + if (atomic_dec_and_test(&cm_id_priv->refcount)) { + BUG_ON(!list_empty(&cm_id_priv->work_list)); + complete(&cm_id_priv->destroy_comp); + return 1; + } + + return 0; +} + +static void add_ref(struct iw_cm_id *cm_id) +{ + struct iwcm_id_private *cm_id_priv; + cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); + atomic_inc(&cm_id_priv->refcount); +} + +static void rem_ref(struct iw_cm_id *cm_id) +{ + struct iwcm_id_private *cm_id_priv; + int cb_destroy; + + cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); + + /* + * Test bit before deref in case the cm_id gets freed on another + * thread. + */ + cb_destroy = test_bit(IWCM_F_CALLBACK_DESTROY, &cm_id_priv->flags); + if (iwcm_deref_id(cm_id_priv) && cb_destroy) { + BUG_ON(!list_empty(&cm_id_priv->work_list)); + free_cm_id(cm_id_priv); + } +} + +static int cm_event_handler(struct iw_cm_id *cm_id, struct iw_cm_event *event); + +struct iw_cm_id *iw_create_cm_id(struct ib_device *device, + iw_cm_handler cm_handler, + void *context) +{ + struct iwcm_id_private *cm_id_priv; + + cm_id_priv = kzalloc(sizeof(*cm_id_priv), GFP_KERNEL); + if (!cm_id_priv) + return ERR_PTR(-ENOMEM); + + cm_id_priv->state = IW_CM_STATE_IDLE; + cm_id_priv->id.device = device; + cm_id_priv->id.cm_handler = cm_handler; + cm_id_priv->id.context = context; + cm_id_priv->id.event_handler = cm_event_handler; + cm_id_priv->id.add_ref = add_ref; + cm_id_priv->id.rem_ref = rem_ref; + spin_lock_init(&cm_id_priv->lock); + atomic_set(&cm_id_priv->refcount, 1); + init_waitqueue_head(&cm_id_priv->connect_wait); + init_completion(&cm_id_priv->destroy_comp); + INIT_LIST_HEAD(&cm_id_priv->work_list); + INIT_LIST_HEAD(&cm_id_priv->work_free_list); + + return &cm_id_priv->id; +} +EXPORT_SYMBOL(iw_create_cm_id); + + +static int iwcm_modify_qp_err(struct ib_qp *qp) +{ + struct ib_qp_attr qp_attr; + + if (!qp) + return -EINVAL; + + qp_attr.qp_state = IB_QPS_ERR; + return ib_modify_qp(qp, &qp_attr, IB_QP_STATE); +} + +/* + * This is really the RDMAC CLOSING state. It is most similar to the + * IB SQD QP state. + */ +static int iwcm_modify_qp_sqd(struct ib_qp *qp) +{ + struct ib_qp_attr qp_attr; + + BUG_ON(qp == NULL); + qp_attr.qp_state = IB_QPS_SQD; + return ib_modify_qp(qp, &qp_attr, IB_QP_STATE); +} + +/* + * CM_ID <-- CLOSING + * + * Block if a passive or active connection is currently being processed. Then + * process the event as follows: + * - If we are ESTABLISHED, move to CLOSING and modify the QP state + * based on the abrupt flag + * - If the connection is already in the CLOSING or IDLE state, the peer is + * disconnecting concurrently with us and we've already seen the + * DISCONNECT event -- ignore the request and return 0 + * - Disconnect on a listening endpoint returns -EINVAL + */ +int iw_cm_disconnect(struct iw_cm_id *cm_id, int abrupt) +{ + struct iwcm_id_private *cm_id_priv; + unsigned long flags; + int ret = 0; + struct ib_qp *qp = NULL; + + cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); + /* Wait if we're currently in a connect or accept downcall */ + wait_event(cm_id_priv->connect_wait, + !test_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags)); + + spin_lock_irqsave(&cm_id_priv->lock, flags); + switch (cm_id_priv->state) { + case IW_CM_STATE_ESTABLISHED: + cm_id_priv->state = IW_CM_STATE_CLOSING; + + /* QP could be for user-mode client */ + if (cm_id_priv->qp) + qp = cm_id_priv->qp; + else + ret = -EINVAL; + break; + case IW_CM_STATE_LISTEN: + ret = -EINVAL; + break; + case IW_CM_STATE_CLOSING: + /* remote peer closed first */ + case IW_CM_STATE_IDLE: + /* accept or connect returned !0 */ + break; + case IW_CM_STATE_CONN_RECV: + /* + * App called disconnect before/without calling accept after + * connect_request event delivered. + */ + break; + case IW_CM_STATE_CONN_SENT: + /* Can only get here if wait above fails */ + default: + BUG(); + } + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + + if (qp) { + if (abrupt) + ret = iwcm_modify_qp_err(qp); + else + ret = iwcm_modify_qp_sqd(qp); + + /* + * If both sides are disconnecting the QP could + * already be in ERR or SQD states + */ + ret = 0; + } + + return ret; +} +EXPORT_SYMBOL(iw_cm_disconnect); + +/* + * CM_ID <-- DESTROYING + * + * Clean up all resources associated with the connection and release + * the initial reference taken by iw_create_cm_id. + */ +static void destroy_cm_id(struct iw_cm_id *cm_id) +{ + struct iwcm_id_private *cm_id_priv; + unsigned long flags; + + cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); + /* + * Wait if we're currently in a connect or accept downcall. A + * listening endpoint should never block here. + */ + wait_event(cm_id_priv->connect_wait, + !test_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags)); + + spin_lock_irqsave(&cm_id_priv->lock, flags); + switch (cm_id_priv->state) { + case IW_CM_STATE_LISTEN: + cm_id_priv->state = IW_CM_STATE_DESTROYING; + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + /* destroy the listening endpoint */ + cm_id->device->iwcm->destroy_listen(cm_id); + spin_lock_irqsave(&cm_id_priv->lock, flags); + break; + case IW_CM_STATE_ESTABLISHED: + cm_id_priv->state = IW_CM_STATE_DESTROYING; + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + /* Abrupt close of the connection */ + (void)iwcm_modify_qp_err(cm_id_priv->qp); + spin_lock_irqsave(&cm_id_priv->lock, flags); + break; + case IW_CM_STATE_IDLE: + case IW_CM_STATE_CLOSING: + cm_id_priv->state = IW_CM_STATE_DESTROYING; + break; + case IW_CM_STATE_CONN_RECV: + /* + * App called destroy before/without calling accept after + * receiving connection request event notification or + * returned non zero from the event callback function. + * In either case, must tell the provider to reject. + */ + cm_id_priv->state = IW_CM_STATE_DESTROYING; + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + cm_id->device->iwcm->reject(cm_id, NULL, 0); + spin_lock_irqsave(&cm_id_priv->lock, flags); + break; + case IW_CM_STATE_CONN_SENT: + case IW_CM_STATE_DESTROYING: + default: + BUG(); + break; + } + if (cm_id_priv->qp) { + cm_id_priv->id.device->iwcm->rem_ref(cm_id_priv->qp); + cm_id_priv->qp = NULL; + } + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + + (void)iwcm_deref_id(cm_id_priv); +} + +/* + * This function is only called by the application thread and cannot + * be called by the event thread. The function will wait for all + * references to be released on the cm_id and then kfree the cm_id + * object. + */ +void iw_destroy_cm_id(struct iw_cm_id *cm_id) +{ + struct iwcm_id_private *cm_id_priv; + + cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); + BUG_ON(test_bit(IWCM_F_CALLBACK_DESTROY, &cm_id_priv->flags)); + + destroy_cm_id(cm_id); + + wait_for_completion(&cm_id_priv->destroy_comp); + + free_cm_id(cm_id_priv); +} +EXPORT_SYMBOL(iw_destroy_cm_id); + +/* + * CM_ID <-- LISTEN + * + * Start listening for connect requests. Generates one CONNECT_REQUEST + * event for each inbound connect request. + */ +int iw_cm_listen(struct iw_cm_id *cm_id, int backlog) +{ + struct iwcm_id_private *cm_id_priv; + unsigned long flags; + int ret; + + cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); + + if (!backlog) + backlog = default_backlog; + + ret = alloc_work_entries(cm_id_priv, backlog); + if (ret) + return ret; + + spin_lock_irqsave(&cm_id_priv->lock, flags); + switch (cm_id_priv->state) { + case IW_CM_STATE_IDLE: + cm_id_priv->state = IW_CM_STATE_LISTEN; + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + ret = cm_id->device->iwcm->create_listen(cm_id, backlog); + if (ret) + cm_id_priv->state = IW_CM_STATE_IDLE; + spin_lock_irqsave(&cm_id_priv->lock, flags); + break; + default: + ret = -EINVAL; + } + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + + return ret; +} +EXPORT_SYMBOL(iw_cm_listen); + +/* + * CM_ID <-- IDLE + * + * Rejects an inbound connection request. No events are generated. + */ +int iw_cm_reject(struct iw_cm_id *cm_id, + const void *private_data, + u8 private_data_len) +{ + struct iwcm_id_private *cm_id_priv; + unsigned long flags; + int ret; + + cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); + set_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); + + spin_lock_irqsave(&cm_id_priv->lock, flags); + if (cm_id_priv->state != IW_CM_STATE_CONN_RECV) { + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); + wake_up_all(&cm_id_priv->connect_wait); + return -EINVAL; + } + cm_id_priv->state = IW_CM_STATE_IDLE; + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + + ret = cm_id->device->iwcm->reject(cm_id, private_data, + private_data_len); + + clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); + wake_up_all(&cm_id_priv->connect_wait); + + return ret; +} +EXPORT_SYMBOL(iw_cm_reject); + +/* + * CM_ID <-- ESTABLISHED + * + * Accepts an inbound connection request and generates an ESTABLISHED + * event. Callers of iw_cm_disconnect and iw_destroy_cm_id will block + * until the ESTABLISHED event is received from the provider. + */ +int iw_cm_accept(struct iw_cm_id *cm_id, + struct iw_cm_conn_param *iw_param) +{ + struct iwcm_id_private *cm_id_priv; + struct ib_qp *qp; + unsigned long flags; + int ret; + + cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); + set_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); + + spin_lock_irqsave(&cm_id_priv->lock, flags); + if (cm_id_priv->state != IW_CM_STATE_CONN_RECV) { + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); + wake_up_all(&cm_id_priv->connect_wait); + return -EINVAL; + } + /* Get the ib_qp given the QPN */ + qp = cm_id->device->iwcm->get_qp(cm_id->device, iw_param->qpn); + if (!qp) { + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); + wake_up_all(&cm_id_priv->connect_wait); + return -EINVAL; + } + cm_id->device->iwcm->add_ref(qp); + cm_id_priv->qp = qp; + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + + ret = cm_id->device->iwcm->accept(cm_id, iw_param); + if (ret) { + /* An error on accept precludes provider events */ + BUG_ON(cm_id_priv->state != IW_CM_STATE_CONN_RECV); + cm_id_priv->state = IW_CM_STATE_IDLE; + spin_lock_irqsave(&cm_id_priv->lock, flags); + if (cm_id_priv->qp) { + cm_id->device->iwcm->rem_ref(qp); + cm_id_priv->qp = NULL; + } + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); + wake_up_all(&cm_id_priv->connect_wait); + } + + return ret; +} +EXPORT_SYMBOL(iw_cm_accept); + +/* + * Active Side: CM_ID <-- CONN_SENT + * + * If successful, results in the generation of a CONNECT_REPLY + * event. iw_cm_disconnect and iw_cm_destroy will block until the + * CONNECT_REPLY event is received from the provider. + */ +int iw_cm_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param) +{ + struct iwcm_id_private *cm_id_priv; + int ret; + unsigned long flags; + struct ib_qp *qp; + + cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); + + ret = alloc_work_entries(cm_id_priv, 4); + if (ret) + return ret; + + set_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); + spin_lock_irqsave(&cm_id_priv->lock, flags); + + if (cm_id_priv->state != IW_CM_STATE_IDLE) { + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); + wake_up_all(&cm_id_priv->connect_wait); + return -EINVAL; + } + + /* Get the ib_qp given the QPN */ + qp = cm_id->device->iwcm->get_qp(cm_id->device, iw_param->qpn); + if (!qp) { + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); + wake_up_all(&cm_id_priv->connect_wait); + return -EINVAL; + } + cm_id->device->iwcm->add_ref(qp); + cm_id_priv->qp = qp; + cm_id_priv->state = IW_CM_STATE_CONN_SENT; + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + + ret = cm_id->device->iwcm->connect(cm_id, iw_param); + if (ret) { + spin_lock_irqsave(&cm_id_priv->lock, flags); + if (cm_id_priv->qp) { + cm_id->device->iwcm->rem_ref(qp); + cm_id_priv->qp = NULL; + } + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + BUG_ON(cm_id_priv->state != IW_CM_STATE_CONN_SENT); + cm_id_priv->state = IW_CM_STATE_IDLE; + clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); + wake_up_all(&cm_id_priv->connect_wait); + } + + return ret; +} +EXPORT_SYMBOL(iw_cm_connect); + +/* + * Passive Side: new CM_ID <-- CONN_RECV + * + * Handles an inbound connect request. The function creates a new + * iw_cm_id to represent the new connection and inherits the client + * callback function and other attributes from the listening parent. + * + * The work item contains a pointer to the listen_cm_id and the event. The + * listen_cm_id contains the client cm_handler, context and + * device. These are copied when the device is cloned. The event + * contains the new four tuple. + * + * An error on the child should not affect the parent, so this + * function does not return a value. + */ +static void cm_conn_req_handler(struct iwcm_id_private *listen_id_priv, + struct iw_cm_event *iw_event) +{ + unsigned long flags; + struct iw_cm_id *cm_id; + struct iwcm_id_private *cm_id_priv; + int ret; + + /* + * The provider should never generate a connection request + * event with a bad status. + */ + BUG_ON(iw_event->status); + + cm_id = iw_create_cm_id(listen_id_priv->id.device, + listen_id_priv->id.cm_handler, + listen_id_priv->id.context); + /* If the cm_id could not be created, ignore the request */ + if (IS_ERR(cm_id)) + goto out; + + cm_id->provider_data = iw_event->provider_data; + cm_id->local_addr = iw_event->local_addr; + cm_id->remote_addr = iw_event->remote_addr; + + cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); + cm_id_priv->state = IW_CM_STATE_CONN_RECV; + + /* + * We could be destroying the listening id. If so, ignore this + * upcall. + */ + spin_lock_irqsave(&listen_id_priv->lock, flags); + if (listen_id_priv->state != IW_CM_STATE_LISTEN) { + spin_unlock_irqrestore(&listen_id_priv->lock, flags); + iw_cm_reject(cm_id, NULL, 0); + iw_destroy_cm_id(cm_id); + goto out; + } + spin_unlock_irqrestore(&listen_id_priv->lock, flags); + + ret = alloc_work_entries(cm_id_priv, 3); + if (ret) { + iw_cm_reject(cm_id, NULL, 0); + iw_destroy_cm_id(cm_id); + goto out; + } + + /* Call the client CM handler */ + ret = cm_id->cm_handler(cm_id, iw_event); + if (ret) { + iw_cm_reject(cm_id, NULL, 0); + set_bit(IWCM_F_CALLBACK_DESTROY, &cm_id_priv->flags); + destroy_cm_id(cm_id); + if (atomic_read(&cm_id_priv->refcount)==0) + free_cm_id(cm_id_priv); + } + +out: + if (iw_event->private_data_len) + kfree(iw_event->private_data); +} + +/* + * Passive Side: CM_ID <-- ESTABLISHED + * + * The provider generated an ESTABLISHED event which means that + * the MPA negotion has completed successfully and we are now in MPA + * FPDU mode. + * + * This event can only be received in the CONN_RECV state. If the + * remote peer closed, the ESTABLISHED event would be received followed + * by the CLOSE event. If the app closes, it will block until we wake + * it up after processing this event. + */ +static int cm_conn_est_handler(struct iwcm_id_private *cm_id_priv, + struct iw_cm_event *iw_event) +{ + unsigned long flags; + int ret; + + spin_lock_irqsave(&cm_id_priv->lock, flags); + + /* + * We clear the CONNECT_WAIT bit here to allow the callback + * function to call iw_cm_disconnect. Calling iw_destroy_cm_id + * from a callback handler is not allowed. + */ + clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); + BUG_ON(cm_id_priv->state != IW_CM_STATE_CONN_RECV); + cm_id_priv->state = IW_CM_STATE_ESTABLISHED; + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, iw_event); + wake_up_all(&cm_id_priv->connect_wait); + + return ret; +} + +/* + * Active Side: CM_ID <-- ESTABLISHED + * + * The app has called connect and is waiting for the established event to + * post it's requests to the server. This event will wake up anyone + * blocked in iw_cm_disconnect or iw_destroy_id. + */ +static int cm_conn_rep_handler(struct iwcm_id_private *cm_id_priv, + struct iw_cm_event *iw_event) +{ + unsigned long flags; + int ret; + + spin_lock_irqsave(&cm_id_priv->lock, flags); + /* + * Clear the connect wait bit so a callback function calling + * iw_cm_disconnect will not wait and deadlock this thread + */ + clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); + BUG_ON(cm_id_priv->state != IW_CM_STATE_CONN_SENT); + if (iw_event->status == 0) { + cm_id_priv->id.local_addr = iw_event->local_addr; + cm_id_priv->id.remote_addr = iw_event->remote_addr; + cm_id_priv->state = IW_CM_STATE_ESTABLISHED; + } else { + /* REJECTED or RESET */ + cm_id_priv->id.device->iwcm->rem_ref(cm_id_priv->qp); + cm_id_priv->qp = NULL; + cm_id_priv->state = IW_CM_STATE_IDLE; + } + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, iw_event); + + if (iw_event->private_data_len) + kfree(iw_event->private_data); + + /* Wake up waiters on connect complete */ + wake_up_all(&cm_id_priv->connect_wait); + + return ret; +} + +/* + * CM_ID <-- CLOSING + * + * If in the ESTABLISHED state, move to CLOSING. + */ +static void cm_disconnect_handler(struct iwcm_id_private *cm_id_priv, + struct iw_cm_event *iw_event) +{ + unsigned long flags; + + spin_lock_irqsave(&cm_id_priv->lock, flags); + if (cm_id_priv->state == IW_CM_STATE_ESTABLISHED) + cm_id_priv->state = IW_CM_STATE_CLOSING; + spin_unlock_irqrestore(&cm_id_priv->lock, flags); +} + +/* + * CM_ID <-- IDLE + * + * If in the ESTBLISHED or CLOSING states, the QP will have have been + * moved by the provider to the ERR state. Disassociate the CM_ID from + * the QP, move to IDLE, and remove the 'connected' reference. + * + * If in some other state, the cm_id was destroyed asynchronously. + * This is the last reference that will result in waking up + * the app thread blocked in iw_destroy_cm_id. + */ +static int cm_close_handler(struct iwcm_id_private *cm_id_priv, + struct iw_cm_event *iw_event) +{ + unsigned long flags; + int ret = 0; + spin_lock_irqsave(&cm_id_priv->lock, flags); + + if (cm_id_priv->qp) { + cm_id_priv->id.device->iwcm->rem_ref(cm_id_priv->qp); + cm_id_priv->qp = NULL; + } + switch (cm_id_priv->state) { + case IW_CM_STATE_ESTABLISHED: + case IW_CM_STATE_CLOSING: + cm_id_priv->state = IW_CM_STATE_IDLE; + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, iw_event); + spin_lock_irqsave(&cm_id_priv->lock, flags); + break; + case IW_CM_STATE_DESTROYING: + break; + default: + BUG(); + } + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + + return ret; +} + +static int process_event(struct iwcm_id_private *cm_id_priv, + struct iw_cm_event *iw_event) +{ + int ret = 0; + + switch (iw_event->event) { + case IW_CM_EVENT_CONNECT_REQUEST: + cm_conn_req_handler(cm_id_priv, iw_event); + break; + case IW_CM_EVENT_CONNECT_REPLY: + ret = cm_conn_rep_handler(cm_id_priv, iw_event); + break; + case IW_CM_EVENT_ESTABLISHED: + ret = cm_conn_est_handler(cm_id_priv, iw_event); + break; + case IW_CM_EVENT_DISCONNECT: + cm_disconnect_handler(cm_id_priv, iw_event); + break; + case IW_CM_EVENT_CLOSE: + ret = cm_close_handler(cm_id_priv, iw_event); + break; + default: + BUG(); + } + + return ret; +} + +/* + * Process events on the work_list for the cm_id. If the callback + * function requests that the cm_id be deleted, a flag is set in the + * cm_id flags to indicate that when the last reference is + * removed, the cm_id is to be destroyed. This is necessary to + * distinguish between an object that will be destroyed by the app + * thread asleep on the destroy_comp list vs. an object destroyed + * here synchronously when the last reference is removed. + */ +static void cm_work_handler(struct work_struct *_work) +{ + struct iwcm_work *work = container_of(_work, struct iwcm_work, work); + struct iw_cm_event levent; + struct iwcm_id_private *cm_id_priv = work->cm_id; + unsigned long flags; + int empty; + int ret = 0; + int destroy_id; + + spin_lock_irqsave(&cm_id_priv->lock, flags); + empty = list_empty(&cm_id_priv->work_list); + while (!empty) { + work = list_entry(cm_id_priv->work_list.next, + struct iwcm_work, list); + list_del_init(&work->list); + empty = list_empty(&cm_id_priv->work_list); + levent = work->event; + put_work(work); + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + + ret = process_event(cm_id_priv, &levent); + if (ret) { + set_bit(IWCM_F_CALLBACK_DESTROY, &cm_id_priv->flags); + destroy_cm_id(&cm_id_priv->id); + } + BUG_ON(atomic_read(&cm_id_priv->refcount)==0); + destroy_id = test_bit(IWCM_F_CALLBACK_DESTROY, &cm_id_priv->flags); + if (iwcm_deref_id(cm_id_priv)) { + if (destroy_id) { + BUG_ON(!list_empty(&cm_id_priv->work_list)); + free_cm_id(cm_id_priv); + } + return; + } + if (empty) + return; + spin_lock_irqsave(&cm_id_priv->lock, flags); + } + spin_unlock_irqrestore(&cm_id_priv->lock, flags); +} + +/* + * This function is called on interrupt context. Schedule events on + * the iwcm_wq thread to allow callback functions to downcall into + * the CM and/or block. Events are queued to a per-CM_ID + * work_list. If this is the first event on the work_list, the work + * element is also queued on the iwcm_wq thread. + * + * Each event holds a reference on the cm_id. Until the last posted + * event has been delivered and processed, the cm_id cannot be + * deleted. + * + * Returns: + * 0 - the event was handled. + * -ENOMEM - the event was not handled due to lack of resources. + */ +static int cm_event_handler(struct iw_cm_id *cm_id, + struct iw_cm_event *iw_event) +{ + struct iwcm_work *work; + struct iwcm_id_private *cm_id_priv; + unsigned long flags; + int ret = 0; + + cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); + + spin_lock_irqsave(&cm_id_priv->lock, flags); + work = get_work(cm_id_priv); + if (!work) { + ret = -ENOMEM; + goto out; + } + + INIT_WORK(&work->work, cm_work_handler); + work->cm_id = cm_id_priv; + work->event = *iw_event; + + if ((work->event.event == IW_CM_EVENT_CONNECT_REQUEST || + work->event.event == IW_CM_EVENT_CONNECT_REPLY) && + work->event.private_data_len) { + ret = copy_private_data(&work->event); + if (ret) { + put_work(work); + goto out; + } + } + + atomic_inc(&cm_id_priv->refcount); + if (list_empty(&cm_id_priv->work_list)) { + list_add_tail(&work->list, &cm_id_priv->work_list); + queue_work(iwcm_wq, &work->work); + } else + list_add_tail(&work->list, &cm_id_priv->work_list); +out: + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + return ret; +} + +static int iwcm_init_qp_init_attr(struct iwcm_id_private *cm_id_priv, + struct ib_qp_attr *qp_attr, + int *qp_attr_mask) +{ + unsigned long flags; + int ret; + + spin_lock_irqsave(&cm_id_priv->lock, flags); + switch (cm_id_priv->state) { + case IW_CM_STATE_IDLE: + case IW_CM_STATE_CONN_SENT: + case IW_CM_STATE_CONN_RECV: + case IW_CM_STATE_ESTABLISHED: + *qp_attr_mask = IB_QP_STATE | IB_QP_ACCESS_FLAGS; + qp_attr->qp_access_flags = IB_ACCESS_REMOTE_WRITE| + IB_ACCESS_REMOTE_READ; + ret = 0; + break; + default: + ret = -EINVAL; + break; + } + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + return ret; +} + +static int iwcm_init_qp_rts_attr(struct iwcm_id_private *cm_id_priv, + struct ib_qp_attr *qp_attr, + int *qp_attr_mask) +{ + unsigned long flags; + int ret; + + spin_lock_irqsave(&cm_id_priv->lock, flags); + switch (cm_id_priv->state) { + case IW_CM_STATE_IDLE: + case IW_CM_STATE_CONN_SENT: + case IW_CM_STATE_CONN_RECV: + case IW_CM_STATE_ESTABLISHED: + *qp_attr_mask = 0; + ret = 0; + break; + default: + ret = -EINVAL; + break; + } + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + return ret; +} + +int iw_cm_init_qp_attr(struct iw_cm_id *cm_id, + struct ib_qp_attr *qp_attr, + int *qp_attr_mask) +{ + struct iwcm_id_private *cm_id_priv; + int ret; + + cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); + switch (qp_attr->qp_state) { + case IB_QPS_INIT: + case IB_QPS_RTR: + ret = iwcm_init_qp_init_attr(cm_id_priv, + qp_attr, qp_attr_mask); + break; + case IB_QPS_RTS: + ret = iwcm_init_qp_rts_attr(cm_id_priv, + qp_attr, qp_attr_mask); + break; + default: + ret = -EINVAL; + break; + } + return ret; +} +EXPORT_SYMBOL(iw_cm_init_qp_attr); + +static int __init iw_cm_init(void) +{ + iwcm_wq = create_singlethread_workqueue("iw_cm_wq"); + if (!iwcm_wq) + return -ENOMEM; + + iwcm_ctl_table_hdr = register_net_sysctl(&init_net, "net/iw_cm", + iwcm_ctl_table); + if (!iwcm_ctl_table_hdr) { + pr_err("iw_cm: couldn't register sysctl paths\n"); + destroy_workqueue(iwcm_wq); + return -ENOMEM; + } + + return 0; +} + +static void __exit iw_cm_cleanup(void) +{ + unregister_net_sysctl_table(iwcm_ctl_table_hdr); + destroy_workqueue(iwcm_wq); +} + +module_init(iw_cm_init); +module_exit(iw_cm_cleanup); diff --git a/drivers/infiniband/core/iwcm.h b/drivers/infiniband/core/iwcm.h new file mode 100644 index 0000000..3f6cc82 --- /dev/null +++ b/drivers/infiniband/core/iwcm.h @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2005 Network Appliance, Inc. All rights reserved. + * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef IWCM_H +#define IWCM_H + +enum iw_cm_state { + IW_CM_STATE_IDLE, /* unbound, inactive */ + IW_CM_STATE_LISTEN, /* listen waiting for connect */ + IW_CM_STATE_CONN_RECV, /* inbound waiting for user accept */ + IW_CM_STATE_CONN_SENT, /* outbound waiting for peer accept */ + IW_CM_STATE_ESTABLISHED, /* established */ + IW_CM_STATE_CLOSING, /* disconnect */ + IW_CM_STATE_DESTROYING /* object being deleted */ +}; + +struct iwcm_id_private { + struct iw_cm_id id; + enum iw_cm_state state; + unsigned long flags; + struct ib_qp *qp; + struct completion destroy_comp; + wait_queue_head_t connect_wait; + struct list_head work_list; + spinlock_t lock; + atomic_t refcount; + struct list_head work_free_list; +}; + +#define IWCM_F_CALLBACK_DESTROY 1 +#define IWCM_F_CONNECT_WAIT 2 + +#endif /* IWCM_H */ diff --git a/drivers/infiniband/core/iwpm_msg.c b/drivers/infiniband/core/iwpm_msg.c new file mode 100644 index 0000000..b85ddbc --- /dev/null +++ b/drivers/infiniband/core/iwpm_msg.c @@ -0,0 +1,685 @@ +/* + * Copyright (c) 2014 Intel Corporation. All rights reserved. + * Copyright (c) 2014 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "iwpm_util.h" + +static const char iwpm_ulib_name[] = "iWarpPortMapperUser"; +static int iwpm_ulib_version = 3; +static int iwpm_user_pid = IWPM_PID_UNDEFINED; +static atomic_t echo_nlmsg_seq; + +int iwpm_valid_pid(void) +{ + return iwpm_user_pid > 0; +} +EXPORT_SYMBOL(iwpm_valid_pid); + +/* + * iwpm_register_pid - Send a netlink query to user space + * for the iwarp port mapper pid + * + * nlmsg attributes: + * [IWPM_NLA_REG_PID_SEQ] + * [IWPM_NLA_REG_IF_NAME] + * [IWPM_NLA_REG_IBDEV_NAME] + * [IWPM_NLA_REG_ULIB_NAME] + */ +int iwpm_register_pid(struct iwpm_dev_data *pm_msg, u8 nl_client) +{ + struct sk_buff *skb = NULL; + struct iwpm_nlmsg_request *nlmsg_request = NULL; + struct nlmsghdr *nlh; + u32 msg_seq; + const char *err_str = ""; + int ret = -EINVAL; + + if (!iwpm_valid_client(nl_client)) { + err_str = "Invalid port mapper client"; + goto pid_query_error; + } + if (iwpm_registered_client(nl_client)) + return 0; + skb = iwpm_create_nlmsg(RDMA_NL_IWPM_REG_PID, &nlh, nl_client); + if (!skb) { + err_str = "Unable to create a nlmsg"; + goto pid_query_error; + } + nlh->nlmsg_seq = iwpm_get_nlmsg_seq(); + nlmsg_request = iwpm_get_nlmsg_request(nlh->nlmsg_seq, nl_client, GFP_KERNEL); + if (!nlmsg_request) { + err_str = "Unable to allocate netlink request"; + goto pid_query_error; + } + msg_seq = atomic_read(&echo_nlmsg_seq); + + /* fill in the pid request message */ + err_str = "Unable to put attribute of the nlmsg"; + ret = ibnl_put_attr(skb, nlh, sizeof(u32), &msg_seq, IWPM_NLA_REG_PID_SEQ); + if (ret) + goto pid_query_error; + ret = ibnl_put_attr(skb, nlh, IWPM_IFNAME_SIZE, + pm_msg->if_name, IWPM_NLA_REG_IF_NAME); + if (ret) + goto pid_query_error; + ret = ibnl_put_attr(skb, nlh, IWPM_DEVNAME_SIZE, + pm_msg->dev_name, IWPM_NLA_REG_IBDEV_NAME); + if (ret) + goto pid_query_error; + ret = ibnl_put_attr(skb, nlh, IWPM_ULIBNAME_SIZE, + (char *)iwpm_ulib_name, IWPM_NLA_REG_ULIB_NAME); + if (ret) + goto pid_query_error; + + pr_debug("%s: Multicasting a nlmsg (dev = %s ifname = %s iwpm = %s)\n", + __func__, pm_msg->dev_name, pm_msg->if_name, iwpm_ulib_name); + + ret = ibnl_multicast(skb, nlh, RDMA_NL_GROUP_IWPM, GFP_KERNEL); + if (ret) { + skb = NULL; /* skb is freed in the netlink send-op handling */ + iwpm_set_registered(nl_client, 1); + iwpm_user_pid = IWPM_PID_UNAVAILABLE; + err_str = "Unable to send a nlmsg"; + goto pid_query_error; + } + nlmsg_request->req_buffer = pm_msg; + ret = iwpm_wait_complete_req(nlmsg_request); + return ret; +pid_query_error: + pr_info("%s: %s (client = %d)\n", __func__, err_str, nl_client); + if (skb) + dev_kfree_skb(skb); + if (nlmsg_request) + iwpm_free_nlmsg_request(&nlmsg_request->kref); + return ret; +} +EXPORT_SYMBOL(iwpm_register_pid); + +/* + * iwpm_add_mapping - Send a netlink add mapping message + * to the port mapper + * nlmsg attributes: + * [IWPM_NLA_MANAGE_MAPPING_SEQ] + * [IWPM_NLA_MANAGE_ADDR] + */ +int iwpm_add_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client) +{ + struct sk_buff *skb = NULL; + struct iwpm_nlmsg_request *nlmsg_request = NULL; + struct nlmsghdr *nlh; + u32 msg_seq; + const char *err_str = ""; + int ret = -EINVAL; + + if (!iwpm_valid_client(nl_client)) { + err_str = "Invalid port mapper client"; + goto add_mapping_error; + } + if (!iwpm_registered_client(nl_client)) { + err_str = "Unregistered port mapper client"; + goto add_mapping_error; + } + if (!iwpm_valid_pid()) + return 0; + skb = iwpm_create_nlmsg(RDMA_NL_IWPM_ADD_MAPPING, &nlh, nl_client); + if (!skb) { + err_str = "Unable to create a nlmsg"; + goto add_mapping_error; + } + nlh->nlmsg_seq = iwpm_get_nlmsg_seq(); + nlmsg_request = iwpm_get_nlmsg_request(nlh->nlmsg_seq, nl_client, GFP_KERNEL); + if (!nlmsg_request) { + err_str = "Unable to allocate netlink request"; + goto add_mapping_error; + } + msg_seq = atomic_read(&echo_nlmsg_seq); + /* fill in the add mapping message */ + err_str = "Unable to put attribute of the nlmsg"; + ret = ibnl_put_attr(skb, nlh, sizeof(u32), &msg_seq, + IWPM_NLA_MANAGE_MAPPING_SEQ); + if (ret) + goto add_mapping_error; + ret = ibnl_put_attr(skb, nlh, sizeof(struct sockaddr_storage), + &pm_msg->loc_addr, IWPM_NLA_MANAGE_ADDR); + if (ret) + goto add_mapping_error; + nlmsg_request->req_buffer = pm_msg; + + ret = ibnl_unicast(skb, nlh, iwpm_user_pid); + if (ret) { + skb = NULL; /* skb is freed in the netlink send-op handling */ + iwpm_user_pid = IWPM_PID_UNDEFINED; + err_str = "Unable to send a nlmsg"; + goto add_mapping_error; + } + ret = iwpm_wait_complete_req(nlmsg_request); + return ret; +add_mapping_error: + pr_info("%s: %s (client = %d)\n", __func__, err_str, nl_client); + if (skb) + dev_kfree_skb(skb); + if (nlmsg_request) + iwpm_free_nlmsg_request(&nlmsg_request->kref); + return ret; +} +EXPORT_SYMBOL(iwpm_add_mapping); + +/* + * iwpm_add_and_query_mapping - Send a netlink add and query + * mapping message to the port mapper + * nlmsg attributes: + * [IWPM_NLA_QUERY_MAPPING_SEQ] + * [IWPM_NLA_QUERY_LOCAL_ADDR] + * [IWPM_NLA_QUERY_REMOTE_ADDR] + */ +int iwpm_add_and_query_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client) +{ + struct sk_buff *skb = NULL; + struct iwpm_nlmsg_request *nlmsg_request = NULL; + struct nlmsghdr *nlh; + u32 msg_seq; + const char *err_str = ""; + int ret = -EINVAL; + + if (!iwpm_valid_client(nl_client)) { + err_str = "Invalid port mapper client"; + goto query_mapping_error; + } + if (!iwpm_registered_client(nl_client)) { + err_str = "Unregistered port mapper client"; + goto query_mapping_error; + } + if (!iwpm_valid_pid()) + return 0; + ret = -ENOMEM; + skb = iwpm_create_nlmsg(RDMA_NL_IWPM_QUERY_MAPPING, &nlh, nl_client); + if (!skb) { + err_str = "Unable to create a nlmsg"; + goto query_mapping_error; + } + nlh->nlmsg_seq = iwpm_get_nlmsg_seq(); + nlmsg_request = iwpm_get_nlmsg_request(nlh->nlmsg_seq, + nl_client, GFP_KERNEL); + if (!nlmsg_request) { + err_str = "Unable to allocate netlink request"; + goto query_mapping_error; + } + msg_seq = atomic_read(&echo_nlmsg_seq); + + /* fill in the query message */ + err_str = "Unable to put attribute of the nlmsg"; + ret = ibnl_put_attr(skb, nlh, sizeof(u32), &msg_seq, + IWPM_NLA_QUERY_MAPPING_SEQ); + if (ret) + goto query_mapping_error; + ret = ibnl_put_attr(skb, nlh, sizeof(struct sockaddr_storage), + &pm_msg->loc_addr, IWPM_NLA_QUERY_LOCAL_ADDR); + if (ret) + goto query_mapping_error; + ret = ibnl_put_attr(skb, nlh, sizeof(struct sockaddr_storage), + &pm_msg->rem_addr, IWPM_NLA_QUERY_REMOTE_ADDR); + if (ret) + goto query_mapping_error; + nlmsg_request->req_buffer = pm_msg; + + ret = ibnl_unicast(skb, nlh, iwpm_user_pid); + if (ret) { + skb = NULL; /* skb is freed in the netlink send-op handling */ + err_str = "Unable to send a nlmsg"; + goto query_mapping_error; + } + ret = iwpm_wait_complete_req(nlmsg_request); + return ret; +query_mapping_error: + pr_info("%s: %s (client = %d)\n", __func__, err_str, nl_client); + if (skb) + dev_kfree_skb(skb); + if (nlmsg_request) + iwpm_free_nlmsg_request(&nlmsg_request->kref); + return ret; +} +EXPORT_SYMBOL(iwpm_add_and_query_mapping); + +/* + * iwpm_remove_mapping - Send a netlink remove mapping message + * to the port mapper + * nlmsg attributes: + * [IWPM_NLA_MANAGE_MAPPING_SEQ] + * [IWPM_NLA_MANAGE_ADDR] + */ +int iwpm_remove_mapping(struct sockaddr_storage *local_addr, u8 nl_client) +{ + struct sk_buff *skb = NULL; + struct nlmsghdr *nlh; + u32 msg_seq; + const char *err_str = ""; + int ret = -EINVAL; + + if (!iwpm_valid_client(nl_client)) { + err_str = "Invalid port mapper client"; + goto remove_mapping_error; + } + if (!iwpm_registered_client(nl_client)) { + err_str = "Unregistered port mapper client"; + goto remove_mapping_error; + } + if (!iwpm_valid_pid()) + return 0; + skb = iwpm_create_nlmsg(RDMA_NL_IWPM_REMOVE_MAPPING, &nlh, nl_client); + if (!skb) { + ret = -ENOMEM; + err_str = "Unable to create a nlmsg"; + goto remove_mapping_error; + } + msg_seq = atomic_read(&echo_nlmsg_seq); + nlh->nlmsg_seq = iwpm_get_nlmsg_seq(); + err_str = "Unable to put attribute of the nlmsg"; + ret = ibnl_put_attr(skb, nlh, sizeof(u32), &msg_seq, + IWPM_NLA_MANAGE_MAPPING_SEQ); + if (ret) + goto remove_mapping_error; + ret = ibnl_put_attr(skb, nlh, sizeof(struct sockaddr_storage), + local_addr, IWPM_NLA_MANAGE_ADDR); + if (ret) + goto remove_mapping_error; + + ret = ibnl_unicast(skb, nlh, iwpm_user_pid); + if (ret) { + skb = NULL; /* skb is freed in the netlink send-op handling */ + iwpm_user_pid = IWPM_PID_UNDEFINED; + err_str = "Unable to send a nlmsg"; + goto remove_mapping_error; + } + iwpm_print_sockaddr(local_addr, + "remove_mapping: Local sockaddr:"); + return 0; +remove_mapping_error: + pr_info("%s: %s (client = %d)\n", __func__, err_str, nl_client); + if (skb) + dev_kfree_skb_any(skb); + return ret; +} +EXPORT_SYMBOL(iwpm_remove_mapping); + +/* netlink attribute policy for the received response to register pid request */ +static const struct nla_policy resp_reg_policy[IWPM_NLA_RREG_PID_MAX] = { + [IWPM_NLA_RREG_PID_SEQ] = { .type = NLA_U32 }, + [IWPM_NLA_RREG_IBDEV_NAME] = { .type = NLA_STRING, + .len = IWPM_DEVNAME_SIZE - 1 }, + [IWPM_NLA_RREG_ULIB_NAME] = { .type = NLA_STRING, + .len = IWPM_ULIBNAME_SIZE - 1 }, + [IWPM_NLA_RREG_ULIB_VER] = { .type = NLA_U16 }, + [IWPM_NLA_RREG_PID_ERR] = { .type = NLA_U16 } +}; + +/* + * iwpm_register_pid_cb - Process a port mapper response to + * iwpm_register_pid() + */ +int iwpm_register_pid_cb(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct iwpm_nlmsg_request *nlmsg_request = NULL; + struct nlattr *nltb[IWPM_NLA_RREG_PID_MAX]; + struct iwpm_dev_data *pm_msg; + char *dev_name, *iwpm_name; + u32 msg_seq; + u8 nl_client; + u16 iwpm_version; + const char *msg_type = "Register Pid response"; + + if (iwpm_parse_nlmsg(cb, IWPM_NLA_RREG_PID_MAX, + resp_reg_policy, nltb, msg_type)) + return -EINVAL; + + msg_seq = nla_get_u32(nltb[IWPM_NLA_RREG_PID_SEQ]); + nlmsg_request = iwpm_find_nlmsg_request(msg_seq); + if (!nlmsg_request) { + pr_info("%s: Could not find a matching request (seq = %u)\n", + __func__, msg_seq); + return -EINVAL; + } + pm_msg = nlmsg_request->req_buffer; + nl_client = nlmsg_request->nl_client; + dev_name = (char *)nla_data(nltb[IWPM_NLA_RREG_IBDEV_NAME]); + iwpm_name = (char *)nla_data(nltb[IWPM_NLA_RREG_ULIB_NAME]); + iwpm_version = nla_get_u16(nltb[IWPM_NLA_RREG_ULIB_VER]); + + /* check device name, ulib name and version */ + if (strcmp(pm_msg->dev_name, dev_name) || + strcmp(iwpm_ulib_name, iwpm_name) || + iwpm_version != iwpm_ulib_version) { + + pr_info("%s: Incorrect info (dev = %s name = %s version = %d)\n", + __func__, dev_name, iwpm_name, iwpm_version); + nlmsg_request->err_code = IWPM_USER_LIB_INFO_ERR; + goto register_pid_response_exit; + } + iwpm_user_pid = cb->nlh->nlmsg_pid; + atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq); + pr_debug("%s: iWarp Port Mapper (pid = %d) is available!\n", + __func__, iwpm_user_pid); + if (iwpm_valid_client(nl_client)) + iwpm_set_registered(nl_client, 1); +register_pid_response_exit: + nlmsg_request->request_done = 1; + /* always for found nlmsg_request */ + kref_put(&nlmsg_request->kref, iwpm_free_nlmsg_request); + barrier(); + wake_up(&nlmsg_request->waitq); + return 0; +} +EXPORT_SYMBOL(iwpm_register_pid_cb); + +/* netlink attribute policy for the received response to add mapping request */ +static const struct nla_policy resp_add_policy[IWPM_NLA_RMANAGE_MAPPING_MAX] = { + [IWPM_NLA_MANAGE_MAPPING_SEQ] = { .type = NLA_U32 }, + [IWPM_NLA_MANAGE_ADDR] = { .len = sizeof(struct sockaddr_storage) }, + [IWPM_NLA_MANAGE_MAPPED_LOC_ADDR] = { .len = sizeof(struct sockaddr_storage) }, + [IWPM_NLA_RMANAGE_MAPPING_ERR] = { .type = NLA_U16 } +}; + +/* + * iwpm_add_mapping_cb - Process a port mapper response to + * iwpm_add_mapping() + */ +int iwpm_add_mapping_cb(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct iwpm_sa_data *pm_msg; + struct iwpm_nlmsg_request *nlmsg_request = NULL; + struct nlattr *nltb[IWPM_NLA_RMANAGE_MAPPING_MAX]; + struct sockaddr_storage *local_sockaddr; + struct sockaddr_storage *mapped_sockaddr; + const char *msg_type; + u32 msg_seq; + + msg_type = "Add Mapping response"; + if (iwpm_parse_nlmsg(cb, IWPM_NLA_RMANAGE_MAPPING_MAX, + resp_add_policy, nltb, msg_type)) + return -EINVAL; + + atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq); + + msg_seq = nla_get_u32(nltb[IWPM_NLA_MANAGE_MAPPING_SEQ]); + nlmsg_request = iwpm_find_nlmsg_request(msg_seq); + if (!nlmsg_request) { + pr_info("%s: Could not find a matching request (seq = %u)\n", + __func__, msg_seq); + return -EINVAL; + } + pm_msg = nlmsg_request->req_buffer; + local_sockaddr = (struct sockaddr_storage *) + nla_data(nltb[IWPM_NLA_MANAGE_ADDR]); + mapped_sockaddr = (struct sockaddr_storage *) + nla_data(nltb[IWPM_NLA_MANAGE_MAPPED_LOC_ADDR]); + + if (iwpm_compare_sockaddr(local_sockaddr, &pm_msg->loc_addr)) { + nlmsg_request->err_code = IWPM_USER_LIB_INFO_ERR; + goto add_mapping_response_exit; + } + if (mapped_sockaddr->ss_family != local_sockaddr->ss_family) { + pr_info("%s: Sockaddr family doesn't match the requested one\n", + __func__); + nlmsg_request->err_code = IWPM_USER_LIB_INFO_ERR; + goto add_mapping_response_exit; + } + memcpy(&pm_msg->mapped_loc_addr, mapped_sockaddr, + sizeof(*mapped_sockaddr)); + iwpm_print_sockaddr(&pm_msg->loc_addr, + "add_mapping: Local sockaddr:"); + iwpm_print_sockaddr(&pm_msg->mapped_loc_addr, + "add_mapping: Mapped local sockaddr:"); + +add_mapping_response_exit: + nlmsg_request->request_done = 1; + /* always for found request */ + kref_put(&nlmsg_request->kref, iwpm_free_nlmsg_request); + barrier(); + wake_up(&nlmsg_request->waitq); + return 0; +} +EXPORT_SYMBOL(iwpm_add_mapping_cb); + +/* netlink attribute policy for the response to add and query mapping request */ +static const struct nla_policy resp_query_policy[IWPM_NLA_RQUERY_MAPPING_MAX] = { + [IWPM_NLA_QUERY_MAPPING_SEQ] = { .type = NLA_U32 }, + [IWPM_NLA_QUERY_LOCAL_ADDR] = { .len = sizeof(struct sockaddr_storage) }, + [IWPM_NLA_QUERY_REMOTE_ADDR] = { .len = sizeof(struct sockaddr_storage) }, + [IWPM_NLA_RQUERY_MAPPED_LOC_ADDR] = { .len = sizeof(struct sockaddr_storage) }, + [IWPM_NLA_RQUERY_MAPPED_REM_ADDR] = { .len = sizeof(struct sockaddr_storage) }, + [IWPM_NLA_RQUERY_MAPPING_ERR] = { .type = NLA_U16 } +}; + +/* + * iwpm_add_and_query_mapping_cb - Process a port mapper response to + * iwpm_add_and_query_mapping() + */ +int iwpm_add_and_query_mapping_cb(struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct iwpm_sa_data *pm_msg; + struct iwpm_nlmsg_request *nlmsg_request = NULL; + struct nlattr *nltb[IWPM_NLA_RQUERY_MAPPING_MAX]; + struct sockaddr_storage *local_sockaddr, *remote_sockaddr; + struct sockaddr_storage *mapped_loc_sockaddr, *mapped_rem_sockaddr; + const char *msg_type; + u32 msg_seq; + u16 err_code; + + msg_type = "Query Mapping response"; + if (iwpm_parse_nlmsg(cb, IWPM_NLA_RQUERY_MAPPING_MAX, + resp_query_policy, nltb, msg_type)) + return -EINVAL; + atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq); + + msg_seq = nla_get_u32(nltb[IWPM_NLA_QUERY_MAPPING_SEQ]); + nlmsg_request = iwpm_find_nlmsg_request(msg_seq); + if (!nlmsg_request) { + pr_info("%s: Could not find a matching request (seq = %u)\n", + __func__, msg_seq); + return -EINVAL; + } + pm_msg = nlmsg_request->req_buffer; + local_sockaddr = (struct sockaddr_storage *) + nla_data(nltb[IWPM_NLA_QUERY_LOCAL_ADDR]); + remote_sockaddr = (struct sockaddr_storage *) + nla_data(nltb[IWPM_NLA_QUERY_REMOTE_ADDR]); + mapped_loc_sockaddr = (struct sockaddr_storage *) + nla_data(nltb[IWPM_NLA_RQUERY_MAPPED_LOC_ADDR]); + mapped_rem_sockaddr = (struct sockaddr_storage *) + nla_data(nltb[IWPM_NLA_RQUERY_MAPPED_REM_ADDR]); + + err_code = nla_get_u16(nltb[IWPM_NLA_RQUERY_MAPPING_ERR]); + if (err_code == IWPM_REMOTE_QUERY_REJECT) { + pr_info("%s: Received a Reject (pid = %u, echo seq = %u)\n", + __func__, cb->nlh->nlmsg_pid, msg_seq); + nlmsg_request->err_code = IWPM_REMOTE_QUERY_REJECT; + } + if (iwpm_compare_sockaddr(local_sockaddr, &pm_msg->loc_addr) || + iwpm_compare_sockaddr(remote_sockaddr, &pm_msg->rem_addr)) { + pr_info("%s: Incorrect local sockaddr\n", __func__); + nlmsg_request->err_code = IWPM_USER_LIB_INFO_ERR; + goto query_mapping_response_exit; + } + if (mapped_loc_sockaddr->ss_family != local_sockaddr->ss_family || + mapped_rem_sockaddr->ss_family != remote_sockaddr->ss_family) { + pr_info("%s: Sockaddr family doesn't match the requested one\n", + __func__); + nlmsg_request->err_code = IWPM_USER_LIB_INFO_ERR; + goto query_mapping_response_exit; + } + memcpy(&pm_msg->mapped_loc_addr, mapped_loc_sockaddr, + sizeof(*mapped_loc_sockaddr)); + memcpy(&pm_msg->mapped_rem_addr, mapped_rem_sockaddr, + sizeof(*mapped_rem_sockaddr)); + + iwpm_print_sockaddr(&pm_msg->loc_addr, + "query_mapping: Local sockaddr:"); + iwpm_print_sockaddr(&pm_msg->mapped_loc_addr, + "query_mapping: Mapped local sockaddr:"); + iwpm_print_sockaddr(&pm_msg->rem_addr, + "query_mapping: Remote sockaddr:"); + iwpm_print_sockaddr(&pm_msg->mapped_rem_addr, + "query_mapping: Mapped remote sockaddr:"); +query_mapping_response_exit: + nlmsg_request->request_done = 1; + /* always for found request */ + kref_put(&nlmsg_request->kref, iwpm_free_nlmsg_request); + barrier(); + wake_up(&nlmsg_request->waitq); + return 0; +} +EXPORT_SYMBOL(iwpm_add_and_query_mapping_cb); + +/* netlink attribute policy for the received request for mapping info */ +static const struct nla_policy resp_mapinfo_policy[IWPM_NLA_MAPINFO_REQ_MAX] = { + [IWPM_NLA_MAPINFO_ULIB_NAME] = { .type = NLA_STRING, + .len = IWPM_ULIBNAME_SIZE - 1 }, + [IWPM_NLA_MAPINFO_ULIB_VER] = { .type = NLA_U16 } +}; + +/* + * iwpm_mapping_info_cb - Process a port mapper request for mapping info + */ +int iwpm_mapping_info_cb(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct nlattr *nltb[IWPM_NLA_MAPINFO_REQ_MAX]; + const char *msg_type = "Mapping Info response"; + int iwpm_pid; + u8 nl_client; + char *iwpm_name; + u16 iwpm_version; + int ret = -EINVAL; + + if (iwpm_parse_nlmsg(cb, IWPM_NLA_MAPINFO_REQ_MAX, + resp_mapinfo_policy, nltb, msg_type)) { + pr_info("%s: Unable to parse nlmsg\n", __func__); + return ret; + } + iwpm_name = (char *)nla_data(nltb[IWPM_NLA_MAPINFO_ULIB_NAME]); + iwpm_version = nla_get_u16(nltb[IWPM_NLA_MAPINFO_ULIB_VER]); + if (strcmp(iwpm_ulib_name, iwpm_name) || + iwpm_version != iwpm_ulib_version) { + pr_info("%s: Invalid port mapper name = %s version = %d\n", + __func__, iwpm_name, iwpm_version); + return ret; + } + nl_client = RDMA_NL_GET_CLIENT(cb->nlh->nlmsg_type); + if (!iwpm_valid_client(nl_client)) { + pr_info("%s: Invalid port mapper client = %d\n", + __func__, nl_client); + return ret; + } + iwpm_set_registered(nl_client, 0); + atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq); + if (!iwpm_mapinfo_available()) + return 0; + iwpm_pid = cb->nlh->nlmsg_pid; + pr_debug("%s: iWarp Port Mapper (pid = %d) is available!\n", + __func__, iwpm_pid); + ret = iwpm_send_mapinfo(nl_client, iwpm_pid); + return ret; +} +EXPORT_SYMBOL(iwpm_mapping_info_cb); + +/* netlink attribute policy for the received mapping info ack */ +static const struct nla_policy ack_mapinfo_policy[IWPM_NLA_MAPINFO_NUM_MAX] = { + [IWPM_NLA_MAPINFO_SEQ] = { .type = NLA_U32 }, + [IWPM_NLA_MAPINFO_SEND_NUM] = { .type = NLA_U32 }, + [IWPM_NLA_MAPINFO_ACK_NUM] = { .type = NLA_U32 } +}; + +/* + * iwpm_ack_mapping_info_cb - Process a port mapper ack for + * the provided mapping info records + */ +int iwpm_ack_mapping_info_cb(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct nlattr *nltb[IWPM_NLA_MAPINFO_NUM_MAX]; + u32 mapinfo_send, mapinfo_ack; + const char *msg_type = "Mapping Info Ack"; + + if (iwpm_parse_nlmsg(cb, IWPM_NLA_MAPINFO_NUM_MAX, + ack_mapinfo_policy, nltb, msg_type)) + return -EINVAL; + mapinfo_send = nla_get_u32(nltb[IWPM_NLA_MAPINFO_SEND_NUM]); + mapinfo_ack = nla_get_u32(nltb[IWPM_NLA_MAPINFO_ACK_NUM]); + if (mapinfo_ack != mapinfo_send) + pr_info("%s: Invalid mapinfo number (sent = %u ack-ed = %u)\n", + __func__, mapinfo_send, mapinfo_ack); + atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq); + return 0; +} +EXPORT_SYMBOL(iwpm_ack_mapping_info_cb); + +/* netlink attribute policy for the received port mapper error message */ +static const struct nla_policy map_error_policy[IWPM_NLA_ERR_MAX] = { + [IWPM_NLA_ERR_SEQ] = { .type = NLA_U32 }, + [IWPM_NLA_ERR_CODE] = { .type = NLA_U16 }, +}; + +/* + * iwpm_mapping_error_cb - Process a port mapper error message + */ +int iwpm_mapping_error_cb(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct iwpm_nlmsg_request *nlmsg_request = NULL; + int nl_client = RDMA_NL_GET_CLIENT(cb->nlh->nlmsg_type); + struct nlattr *nltb[IWPM_NLA_ERR_MAX]; + u32 msg_seq; + u16 err_code; + const char *msg_type = "Mapping Error Msg"; + + if (iwpm_parse_nlmsg(cb, IWPM_NLA_ERR_MAX, + map_error_policy, nltb, msg_type)) + return -EINVAL; + + msg_seq = nla_get_u32(nltb[IWPM_NLA_ERR_SEQ]); + err_code = nla_get_u16(nltb[IWPM_NLA_ERR_CODE]); + pr_info("%s: Received msg seq = %u err code = %u client = %d\n", + __func__, msg_seq, err_code, nl_client); + /* look for nlmsg_request */ + nlmsg_request = iwpm_find_nlmsg_request(msg_seq); + if (!nlmsg_request) { + /* not all errors have associated requests */ + pr_debug("Could not find matching req (seq = %u)\n", msg_seq); + return 0; + } + atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq); + nlmsg_request->err_code = err_code; + nlmsg_request->request_done = 1; + /* always for found request */ + kref_put(&nlmsg_request->kref, iwpm_free_nlmsg_request); + barrier(); + wake_up(&nlmsg_request->waitq); + return 0; +} +EXPORT_SYMBOL(iwpm_mapping_error_cb); diff --git a/drivers/infiniband/core/iwpm_util.c b/drivers/infiniband/core/iwpm_util.c new file mode 100644 index 0000000..69e9f84 --- /dev/null +++ b/drivers/infiniband/core/iwpm_util.c @@ -0,0 +1,607 @@ +/* + * Copyright (c) 2014 Chelsio, Inc. All rights reserved. + * Copyright (c) 2014 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "iwpm_util.h" + +#define IWPM_HASH_BUCKET_SIZE 512 +#define IWPM_HASH_BUCKET_MASK (IWPM_HASH_BUCKET_SIZE - 1) + +static LIST_HEAD(iwpm_nlmsg_req_list); +static DEFINE_SPINLOCK(iwpm_nlmsg_req_lock); + +static struct hlist_head *iwpm_hash_bucket; +static DEFINE_SPINLOCK(iwpm_mapinfo_lock); + +static DEFINE_MUTEX(iwpm_admin_lock); +static struct iwpm_admin_data iwpm_admin; + +int iwpm_init(u8 nl_client) +{ + if (iwpm_valid_client(nl_client)) + return -EINVAL; + mutex_lock(&iwpm_admin_lock); + if (atomic_read(&iwpm_admin.refcount) == 0) { + iwpm_hash_bucket = kzalloc(IWPM_HASH_BUCKET_SIZE * + sizeof(struct hlist_head), GFP_KERNEL); + if (!iwpm_hash_bucket) { + mutex_unlock(&iwpm_admin_lock); + pr_err("%s Unable to create mapinfo hash table\n", __func__); + return -ENOMEM; + } + } + atomic_inc(&iwpm_admin.refcount); + mutex_unlock(&iwpm_admin_lock); + iwpm_set_valid(nl_client, 1); + return 0; +} +EXPORT_SYMBOL(iwpm_init); + +static void free_hash_bucket(void); + +int iwpm_exit(u8 nl_client) +{ + + if (!iwpm_valid_client(nl_client)) + return -EINVAL; + mutex_lock(&iwpm_admin_lock); + if (atomic_read(&iwpm_admin.refcount) == 0) { + mutex_unlock(&iwpm_admin_lock); + pr_err("%s Incorrect usage - negative refcount\n", __func__); + return -EINVAL; + } + if (atomic_dec_and_test(&iwpm_admin.refcount)) { + free_hash_bucket(); + pr_debug("%s: Mapinfo hash table is destroyed\n", __func__); + } + mutex_unlock(&iwpm_admin_lock); + iwpm_set_valid(nl_client, 0); + return 0; +} +EXPORT_SYMBOL(iwpm_exit); + +static struct hlist_head *get_hash_bucket_head(struct sockaddr_storage *, + struct sockaddr_storage *); + +int iwpm_create_mapinfo(struct sockaddr_storage *local_sockaddr, + struct sockaddr_storage *mapped_sockaddr, + u8 nl_client) +{ + struct hlist_head *hash_bucket_head; + struct iwpm_mapping_info *map_info; + unsigned long flags; + + if (!iwpm_valid_client(nl_client)) + return -EINVAL; + map_info = kzalloc(sizeof(struct iwpm_mapping_info), GFP_KERNEL); + if (!map_info) { + pr_err("%s: Unable to allocate a mapping info\n", __func__); + return -ENOMEM; + } + memcpy(&map_info->local_sockaddr, local_sockaddr, + sizeof(struct sockaddr_storage)); + memcpy(&map_info->mapped_sockaddr, mapped_sockaddr, + sizeof(struct sockaddr_storage)); + map_info->nl_client = nl_client; + + spin_lock_irqsave(&iwpm_mapinfo_lock, flags); + if (iwpm_hash_bucket) { + hash_bucket_head = get_hash_bucket_head( + &map_info->local_sockaddr, + &map_info->mapped_sockaddr); + hlist_add_head(&map_info->hlist_node, hash_bucket_head); + } + spin_unlock_irqrestore(&iwpm_mapinfo_lock, flags); + return 0; +} +EXPORT_SYMBOL(iwpm_create_mapinfo); + +int iwpm_remove_mapinfo(struct sockaddr_storage *local_sockaddr, + struct sockaddr_storage *mapped_local_addr) +{ + struct hlist_node *tmp_hlist_node; + struct hlist_head *hash_bucket_head; + struct iwpm_mapping_info *map_info = NULL; + unsigned long flags; + int ret = -EINVAL; + + spin_lock_irqsave(&iwpm_mapinfo_lock, flags); + if (iwpm_hash_bucket) { + hash_bucket_head = get_hash_bucket_head( + local_sockaddr, + mapped_local_addr); + hlist_for_each_entry_safe(map_info, tmp_hlist_node, + hash_bucket_head, hlist_node) { + + if (!iwpm_compare_sockaddr(&map_info->mapped_sockaddr, + mapped_local_addr)) { + + hlist_del_init(&map_info->hlist_node); + kfree(map_info); + ret = 0; + break; + } + } + } + spin_unlock_irqrestore(&iwpm_mapinfo_lock, flags); + return ret; +} +EXPORT_SYMBOL(iwpm_remove_mapinfo); + +static void free_hash_bucket(void) +{ + struct hlist_node *tmp_hlist_node; + struct iwpm_mapping_info *map_info; + unsigned long flags; + int i; + + /* remove all the mapinfo data from the list */ + spin_lock_irqsave(&iwpm_mapinfo_lock, flags); + for (i = 0; i < IWPM_HASH_BUCKET_SIZE; i++) { + hlist_for_each_entry_safe(map_info, tmp_hlist_node, + &iwpm_hash_bucket[i], hlist_node) { + + hlist_del_init(&map_info->hlist_node); + kfree(map_info); + } + } + /* free the hash list */ + kfree(iwpm_hash_bucket); + iwpm_hash_bucket = NULL; + spin_unlock_irqrestore(&iwpm_mapinfo_lock, flags); +} + +struct iwpm_nlmsg_request *iwpm_get_nlmsg_request(__u32 nlmsg_seq, + u8 nl_client, gfp_t gfp) +{ + struct iwpm_nlmsg_request *nlmsg_request = NULL; + unsigned long flags; + + nlmsg_request = kzalloc(sizeof(struct iwpm_nlmsg_request), gfp); + if (!nlmsg_request) { + pr_err("%s Unable to allocate a nlmsg_request\n", __func__); + return NULL; + } + spin_lock_irqsave(&iwpm_nlmsg_req_lock, flags); + list_add_tail(&nlmsg_request->inprocess_list, &iwpm_nlmsg_req_list); + spin_unlock_irqrestore(&iwpm_nlmsg_req_lock, flags); + + kref_init(&nlmsg_request->kref); + kref_get(&nlmsg_request->kref); + nlmsg_request->nlmsg_seq = nlmsg_seq; + nlmsg_request->nl_client = nl_client; + nlmsg_request->request_done = 0; + nlmsg_request->err_code = 0; + return nlmsg_request; +} + +void iwpm_free_nlmsg_request(struct kref *kref) +{ + struct iwpm_nlmsg_request *nlmsg_request; + unsigned long flags; + + nlmsg_request = container_of(kref, struct iwpm_nlmsg_request, kref); + + spin_lock_irqsave(&iwpm_nlmsg_req_lock, flags); + list_del_init(&nlmsg_request->inprocess_list); + spin_unlock_irqrestore(&iwpm_nlmsg_req_lock, flags); + + if (!nlmsg_request->request_done) + pr_debug("%s Freeing incomplete nlmsg request (seq = %u).\n", + __func__, nlmsg_request->nlmsg_seq); + kfree(nlmsg_request); +} + +struct iwpm_nlmsg_request *iwpm_find_nlmsg_request(__u32 echo_seq) +{ + struct iwpm_nlmsg_request *nlmsg_request; + struct iwpm_nlmsg_request *found_request = NULL; + unsigned long flags; + + spin_lock_irqsave(&iwpm_nlmsg_req_lock, flags); + list_for_each_entry(nlmsg_request, &iwpm_nlmsg_req_list, + inprocess_list) { + if (nlmsg_request->nlmsg_seq == echo_seq) { + found_request = nlmsg_request; + kref_get(&nlmsg_request->kref); + break; + } + } + spin_unlock_irqrestore(&iwpm_nlmsg_req_lock, flags); + return found_request; +} + +int iwpm_wait_complete_req(struct iwpm_nlmsg_request *nlmsg_request) +{ + int ret; + init_waitqueue_head(&nlmsg_request->waitq); + + ret = wait_event_timeout(nlmsg_request->waitq, + (nlmsg_request->request_done != 0), IWPM_NL_TIMEOUT); + if (!ret) { + ret = -EINVAL; + pr_info("%s: Timeout %d sec for netlink request (seq = %u)\n", + __func__, (IWPM_NL_TIMEOUT/HZ), nlmsg_request->nlmsg_seq); + } else { + ret = nlmsg_request->err_code; + } + kref_put(&nlmsg_request->kref, iwpm_free_nlmsg_request); + return ret; +} + +int iwpm_get_nlmsg_seq(void) +{ + return atomic_inc_return(&iwpm_admin.nlmsg_seq); +} + +int iwpm_valid_client(u8 nl_client) +{ + if (nl_client >= RDMA_NL_NUM_CLIENTS) + return 0; + return iwpm_admin.client_list[nl_client]; +} + +void iwpm_set_valid(u8 nl_client, int valid) +{ + if (nl_client >= RDMA_NL_NUM_CLIENTS) + return; + iwpm_admin.client_list[nl_client] = valid; +} + +/* valid client */ +int iwpm_registered_client(u8 nl_client) +{ + return iwpm_admin.reg_list[nl_client]; +} + +/* valid client */ +void iwpm_set_registered(u8 nl_client, int reg) +{ + iwpm_admin.reg_list[nl_client] = reg; +} + +int iwpm_compare_sockaddr(struct sockaddr_storage *a_sockaddr, + struct sockaddr_storage *b_sockaddr) +{ + if (a_sockaddr->ss_family != b_sockaddr->ss_family) + return 1; + if (a_sockaddr->ss_family == AF_INET) { + struct sockaddr_in *a4_sockaddr = + (struct sockaddr_in *)a_sockaddr; + struct sockaddr_in *b4_sockaddr = + (struct sockaddr_in *)b_sockaddr; + if (!memcmp(&a4_sockaddr->sin_addr, + &b4_sockaddr->sin_addr, sizeof(struct in_addr)) + && a4_sockaddr->sin_port == b4_sockaddr->sin_port) + return 0; + + } else if (a_sockaddr->ss_family == AF_INET6) { + struct sockaddr_in6 *a6_sockaddr = + (struct sockaddr_in6 *)a_sockaddr; + struct sockaddr_in6 *b6_sockaddr = + (struct sockaddr_in6 *)b_sockaddr; + if (!memcmp(&a6_sockaddr->sin6_addr, + &b6_sockaddr->sin6_addr, sizeof(struct in6_addr)) + && a6_sockaddr->sin6_port == b6_sockaddr->sin6_port) + return 0; + + } else { + pr_err("%s: Invalid sockaddr family\n", __func__); + } + return 1; +} + +struct sk_buff *iwpm_create_nlmsg(u32 nl_op, struct nlmsghdr **nlh, + int nl_client) +{ + struct sk_buff *skb = NULL; + + skb = dev_alloc_skb(NLMSG_GOODSIZE); + if (!skb) { + pr_err("%s Unable to allocate skb\n", __func__); + goto create_nlmsg_exit; + } + if (!(ibnl_put_msg(skb, nlh, 0, 0, nl_client, nl_op, + NLM_F_REQUEST))) { + pr_warn("%s: Unable to put the nlmsg header\n", __func__); + dev_kfree_skb(skb); + skb = NULL; + } +create_nlmsg_exit: + return skb; +} + +int iwpm_parse_nlmsg(struct netlink_callback *cb, int policy_max, + const struct nla_policy *nlmsg_policy, + struct nlattr *nltb[], const char *msg_type) +{ + int nlh_len = 0; + int ret; + const char *err_str = ""; + + ret = nlmsg_validate(cb->nlh, nlh_len, policy_max-1, nlmsg_policy); + if (ret) { + err_str = "Invalid attribute"; + goto parse_nlmsg_error; + } + ret = nlmsg_parse(cb->nlh, nlh_len, nltb, policy_max-1, nlmsg_policy); + if (ret) { + err_str = "Unable to parse the nlmsg"; + goto parse_nlmsg_error; + } + ret = iwpm_validate_nlmsg_attr(nltb, policy_max); + if (ret) { + err_str = "Invalid NULL attribute"; + goto parse_nlmsg_error; + } + return 0; +parse_nlmsg_error: + pr_warn("%s: %s (msg type %s ret = %d)\n", + __func__, err_str, msg_type, ret); + return ret; +} + +void iwpm_print_sockaddr(struct sockaddr_storage *sockaddr, char *msg) +{ + struct sockaddr_in6 *sockaddr_v6; + struct sockaddr_in *sockaddr_v4; + + switch (sockaddr->ss_family) { + case AF_INET: + sockaddr_v4 = (struct sockaddr_in *)sockaddr; + pr_debug("%s IPV4 %pI4: %u(0x%04X)\n", + msg, &sockaddr_v4->sin_addr, + ntohs(sockaddr_v4->sin_port), + ntohs(sockaddr_v4->sin_port)); + break; + case AF_INET6: + sockaddr_v6 = (struct sockaddr_in6 *)sockaddr; + pr_debug("%s IPV6 %pI6: %u(0x%04X)\n", + msg, &sockaddr_v6->sin6_addr, + ntohs(sockaddr_v6->sin6_port), + ntohs(sockaddr_v6->sin6_port)); + break; + default: + break; + } +} + +static u32 iwpm_ipv6_jhash(struct sockaddr_in6 *ipv6_sockaddr) +{ + u32 ipv6_hash = jhash(&ipv6_sockaddr->sin6_addr, sizeof(struct in6_addr), 0); + u32 hash = jhash_2words(ipv6_hash, (__force u32) ipv6_sockaddr->sin6_port, 0); + return hash; +} + +static u32 iwpm_ipv4_jhash(struct sockaddr_in *ipv4_sockaddr) +{ + u32 ipv4_hash = jhash(&ipv4_sockaddr->sin_addr, sizeof(struct in_addr), 0); + u32 hash = jhash_2words(ipv4_hash, (__force u32) ipv4_sockaddr->sin_port, 0); + return hash; +} + +static struct hlist_head *get_hash_bucket_head(struct sockaddr_storage + *local_sockaddr, + struct sockaddr_storage + *mapped_sockaddr) +{ + u32 local_hash, mapped_hash, hash; + + if (local_sockaddr->ss_family == AF_INET) { + local_hash = iwpm_ipv4_jhash((struct sockaddr_in *) local_sockaddr); + mapped_hash = iwpm_ipv4_jhash((struct sockaddr_in *) mapped_sockaddr); + + } else if (local_sockaddr->ss_family == AF_INET6) { + local_hash = iwpm_ipv6_jhash((struct sockaddr_in6 *) local_sockaddr); + mapped_hash = iwpm_ipv6_jhash((struct sockaddr_in6 *) mapped_sockaddr); + } else { + pr_err("%s: Invalid sockaddr family\n", __func__); + return NULL; + } + + if (local_hash == mapped_hash) /* if port mapper isn't available */ + hash = local_hash; + else + hash = jhash_2words(local_hash, mapped_hash, 0); + + return &iwpm_hash_bucket[hash & IWPM_HASH_BUCKET_MASK]; +} + +static int send_mapinfo_num(u32 mapping_num, u8 nl_client, int iwpm_pid) +{ + struct sk_buff *skb = NULL; + struct nlmsghdr *nlh; + u32 msg_seq; + const char *err_str = ""; + int ret = -EINVAL; + + skb = iwpm_create_nlmsg(RDMA_NL_IWPM_MAPINFO_NUM, &nlh, nl_client); + if (!skb) { + err_str = "Unable to create a nlmsg"; + goto mapinfo_num_error; + } + nlh->nlmsg_seq = iwpm_get_nlmsg_seq(); + msg_seq = 0; + err_str = "Unable to put attribute of mapinfo number nlmsg"; + ret = ibnl_put_attr(skb, nlh, sizeof(u32), &msg_seq, IWPM_NLA_MAPINFO_SEQ); + if (ret) + goto mapinfo_num_error; + ret = ibnl_put_attr(skb, nlh, sizeof(u32), + &mapping_num, IWPM_NLA_MAPINFO_SEND_NUM); + if (ret) + goto mapinfo_num_error; + ret = ibnl_unicast(skb, nlh, iwpm_pid); + if (ret) { + skb = NULL; + err_str = "Unable to send a nlmsg"; + goto mapinfo_num_error; + } + pr_debug("%s: Sent mapping number = %d\n", __func__, mapping_num); + return 0; +mapinfo_num_error: + pr_info("%s: %s\n", __func__, err_str); + if (skb) + dev_kfree_skb(skb); + return ret; +} + +static int send_nlmsg_done(struct sk_buff *skb, u8 nl_client, int iwpm_pid) +{ + struct nlmsghdr *nlh = NULL; + int ret = 0; + + if (!skb) + return ret; + if (!(ibnl_put_msg(skb, &nlh, 0, 0, nl_client, + RDMA_NL_IWPM_MAPINFO, NLM_F_MULTI))) { + pr_warn("%s Unable to put NLMSG_DONE\n", __func__); + return -ENOMEM; + } + nlh->nlmsg_type = NLMSG_DONE; + ret = ibnl_unicast(skb, (struct nlmsghdr *)skb->data, iwpm_pid); + if (ret) + pr_warn("%s Unable to send a nlmsg\n", __func__); + return ret; +} + +int iwpm_send_mapinfo(u8 nl_client, int iwpm_pid) +{ + struct iwpm_mapping_info *map_info; + struct sk_buff *skb = NULL; + struct nlmsghdr *nlh; + int skb_num = 0, mapping_num = 0; + int i = 0, nlmsg_bytes = 0; + unsigned long flags; + const char *err_str = ""; + int ret; + + skb = dev_alloc_skb(NLMSG_GOODSIZE); + if (!skb) { + ret = -ENOMEM; + err_str = "Unable to allocate skb"; + goto send_mapping_info_exit; + } + skb_num++; + spin_lock_irqsave(&iwpm_mapinfo_lock, flags); + for (i = 0; i < IWPM_HASH_BUCKET_SIZE; i++) { + hlist_for_each_entry(map_info, &iwpm_hash_bucket[i], + hlist_node) { + if (map_info->nl_client != nl_client) + continue; + nlh = NULL; + if (!(ibnl_put_msg(skb, &nlh, 0, 0, nl_client, + RDMA_NL_IWPM_MAPINFO, NLM_F_MULTI))) { + ret = -ENOMEM; + err_str = "Unable to put the nlmsg header"; + goto send_mapping_info_unlock; + } + err_str = "Unable to put attribute of the nlmsg"; + ret = ibnl_put_attr(skb, nlh, + sizeof(struct sockaddr_storage), + &map_info->local_sockaddr, + IWPM_NLA_MAPINFO_LOCAL_ADDR); + if (ret) + goto send_mapping_info_unlock; + + ret = ibnl_put_attr(skb, nlh, + sizeof(struct sockaddr_storage), + &map_info->mapped_sockaddr, + IWPM_NLA_MAPINFO_MAPPED_ADDR); + if (ret) + goto send_mapping_info_unlock; + + iwpm_print_sockaddr(&map_info->local_sockaddr, + "send_mapping_info: Local sockaddr:"); + iwpm_print_sockaddr(&map_info->mapped_sockaddr, + "send_mapping_info: Mapped local sockaddr:"); + mapping_num++; + nlmsg_bytes += nlh->nlmsg_len; + + /* check if all mappings can fit in one skb */ + if (NLMSG_GOODSIZE - nlmsg_bytes < nlh->nlmsg_len * 2) { + /* and leave room for NLMSG_DONE */ + nlmsg_bytes = 0; + skb_num++; + spin_unlock_irqrestore(&iwpm_mapinfo_lock, + flags); + /* send the skb */ + ret = send_nlmsg_done(skb, nl_client, iwpm_pid); + skb = NULL; + if (ret) { + err_str = "Unable to send map info"; + goto send_mapping_info_exit; + } + if (skb_num == IWPM_MAPINFO_SKB_COUNT) { + ret = -ENOMEM; + err_str = "Insufficient skbs for map info"; + goto send_mapping_info_exit; + } + skb = dev_alloc_skb(NLMSG_GOODSIZE); + if (!skb) { + ret = -ENOMEM; + err_str = "Unable to allocate skb"; + goto send_mapping_info_exit; + } + spin_lock_irqsave(&iwpm_mapinfo_lock, flags); + } + } + } +send_mapping_info_unlock: + spin_unlock_irqrestore(&iwpm_mapinfo_lock, flags); +send_mapping_info_exit: + if (ret) { + pr_warn("%s: %s (ret = %d)\n", __func__, err_str, ret); + if (skb) + dev_kfree_skb(skb); + return ret; + } + send_nlmsg_done(skb, nl_client, iwpm_pid); + return send_mapinfo_num(mapping_num, nl_client, iwpm_pid); +} + +int iwpm_mapinfo_available(void) +{ + unsigned long flags; + int full_bucket = 0, i = 0; + + spin_lock_irqsave(&iwpm_mapinfo_lock, flags); + if (iwpm_hash_bucket) { + for (i = 0; i < IWPM_HASH_BUCKET_SIZE; i++) { + if (!hlist_empty(&iwpm_hash_bucket[i])) { + full_bucket = 1; + break; + } + } + } + spin_unlock_irqrestore(&iwpm_mapinfo_lock, flags); + return full_bucket; +} diff --git a/drivers/infiniband/core/iwpm_util.h b/drivers/infiniband/core/iwpm_util.h new file mode 100644 index 0000000..9777c86 --- /dev/null +++ b/drivers/infiniband/core/iwpm_util.h @@ -0,0 +1,238 @@ +/* + * Copyright (c) 2014 Intel Corporation. All rights reserved. + * Copyright (c) 2014 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef _IWPM_UTIL_H +#define _IWPM_UTIL_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#define IWPM_NL_RETRANS 3 +#define IWPM_NL_TIMEOUT (10*HZ) +#define IWPM_MAPINFO_SKB_COUNT 20 + +#define IWPM_PID_UNDEFINED -1 +#define IWPM_PID_UNAVAILABLE -2 + +struct iwpm_nlmsg_request { + struct list_head inprocess_list; + __u32 nlmsg_seq; + void *req_buffer; + u8 nl_client; + u8 request_done; + u16 err_code; + wait_queue_head_t waitq; + struct kref kref; +}; + +struct iwpm_mapping_info { + struct hlist_node hlist_node; + struct sockaddr_storage local_sockaddr; + struct sockaddr_storage mapped_sockaddr; + u8 nl_client; +}; + +struct iwpm_admin_data { + atomic_t refcount; + atomic_t nlmsg_seq; + int client_list[RDMA_NL_NUM_CLIENTS]; + int reg_list[RDMA_NL_NUM_CLIENTS]; +}; + +/** + * iwpm_get_nlmsg_request - Allocate and initialize netlink message request + * @nlmsg_seq: Sequence number of the netlink message + * @nl_client: The index of the netlink client + * @gfp: Indicates how the memory for the request should be allocated + * + * Returns the newly allocated netlink request object if successful, + * otherwise returns NULL + */ +struct iwpm_nlmsg_request *iwpm_get_nlmsg_request(__u32 nlmsg_seq, + u8 nl_client, gfp_t gfp); + +/** + * iwpm_free_nlmsg_request - Deallocate netlink message request + * @kref: Holds reference of netlink message request + */ +void iwpm_free_nlmsg_request(struct kref *kref); + +/** + * iwpm_find_nlmsg_request - Find netlink message request in the request list + * @echo_seq: Sequence number of the netlink request to find + * + * Returns the found netlink message request, + * if not found, returns NULL + */ +struct iwpm_nlmsg_request *iwpm_find_nlmsg_request(__u32 echo_seq); + +/** + * iwpm_wait_complete_req - Block while servicing the netlink request + * @nlmsg_request: Netlink message request to service + * + * Wakes up, after the request is completed or expired + * Returns 0 if the request is complete without error + */ +int iwpm_wait_complete_req(struct iwpm_nlmsg_request *nlmsg_request); + +/** + * iwpm_get_nlmsg_seq - Get the sequence number for a netlink + * message to send to the port mapper + * + * Returns the sequence number for the netlink message. + */ +int iwpm_get_nlmsg_seq(void); + +/** + * iwpm_valid_client - Check if the port mapper client is valid + * @nl_client: The index of the netlink client + * + * Valid clients need to call iwpm_init() before using + * the port mapper + */ +int iwpm_valid_client(u8 nl_client); + +/** + * iwpm_set_valid - Set the port mapper client to valid or not + * @nl_client: The index of the netlink client + * @valid: 1 if valid or 0 if invalid + */ +void iwpm_set_valid(u8 nl_client, int valid); + +/** + * iwpm_registered_client - Check if the port mapper client is registered + * @nl_client: The index of the netlink client + * + * Call iwpm_register_pid() to register a client + */ +int iwpm_registered_client(u8 nl_client); + +/** + * iwpm_set_registered - Set the port mapper client to registered or not + * @nl_client: The index of the netlink client + * @reg: 1 if registered or 0 if not + */ +void iwpm_set_registered(u8 nl_client, int reg); + +/** + * iwpm_send_mapinfo - Send local and mapped IPv4/IPv6 address info of + * a client to the user space port mapper + * @nl_client: The index of the netlink client + * @iwpm_pid: The pid of the user space port mapper + * + * If successful, returns the number of sent mapping info records + */ +int iwpm_send_mapinfo(u8 nl_client, int iwpm_pid); + +/** + * iwpm_mapinfo_available - Check if any mapping info records is available + * in the hash table + * + * Returns 1 if mapping information is available, otherwise returns 0 + */ +int iwpm_mapinfo_available(void); + +/** + * iwpm_compare_sockaddr - Compare two sockaddr storage structs + * + * Returns 0 if they are holding the same ip/tcp address info, + * otherwise returns 1 + */ +int iwpm_compare_sockaddr(struct sockaddr_storage *a_sockaddr, + struct sockaddr_storage *b_sockaddr); + +/** + * iwpm_validate_nlmsg_attr - Check for NULL netlink attributes + * @nltb: Holds address of each netlink message attributes + * @nla_count: Number of netlink message attributes + * + * Returns error if any of the nla_count attributes is NULL + */ +static inline int iwpm_validate_nlmsg_attr(struct nlattr *nltb[], + int nla_count) +{ + int i; + for (i = 1; i < nla_count; i++) { + if (!nltb[i]) + return -EINVAL; + } + return 0; +} + +/** + * iwpm_create_nlmsg - Allocate skb and form a netlink message + * @nl_op: Netlink message opcode + * @nlh: Holds address of the netlink message header in skb + * @nl_client: The index of the netlink client + * + * Returns the newly allcated skb, or NULL if the tailroom of the skb + * is insufficient to store the message header and payload + */ +struct sk_buff *iwpm_create_nlmsg(u32 nl_op, struct nlmsghdr **nlh, + int nl_client); + +/** + * iwpm_parse_nlmsg - Validate and parse the received netlink message + * @cb: Netlink callback structure + * @policy_max: Maximum attribute type to be expected + * @nlmsg_policy: Validation policy + * @nltb: Array to store policy_max parsed elements + * @msg_type: Type of netlink message + * + * Returns 0 on success or a negative error code + */ +int iwpm_parse_nlmsg(struct netlink_callback *cb, int policy_max, + const struct nla_policy *nlmsg_policy, + struct nlattr *nltb[], const char *msg_type); + +/** + * iwpm_print_sockaddr - Print IPv4/IPv6 address and TCP port + * @sockaddr: Socket address to print + * @msg: Message to print + */ +void iwpm_print_sockaddr(struct sockaddr_storage *sockaddr, char *msg); +#endif diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c new file mode 100644 index 0000000..74c30f4 --- /dev/null +++ b/drivers/infiniband/core/mad.c @@ -0,0 +1,3176 @@ +/* + * Copyright (c) 2004-2007 Voltaire, Inc. All rights reserved. + * Copyright (c) 2005 Intel Corporation. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2009 HNR Consulting. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include + +#include "mad_priv.h" +#include "mad_rmpp.h" +#include "smi.h" +#include "agent.h" + +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_DESCRIPTION("kernel IB MAD API"); +MODULE_AUTHOR("Hal Rosenstock"); +MODULE_AUTHOR("Sean Hefty"); + +static int mad_sendq_size = IB_MAD_QP_SEND_SIZE; +static int mad_recvq_size = IB_MAD_QP_RECV_SIZE; + +module_param_named(send_queue_size, mad_sendq_size, int, 0444); +MODULE_PARM_DESC(send_queue_size, "Size of send queue in number of work requests"); +module_param_named(recv_queue_size, mad_recvq_size, int, 0444); +MODULE_PARM_DESC(recv_queue_size, "Size of receive queue in number of work requests"); + +static struct kmem_cache *ib_mad_cache; + +static struct list_head ib_mad_port_list; +static u32 ib_mad_client_id = 0; + +/* Port list lock */ +static DEFINE_SPINLOCK(ib_mad_port_list_lock); + +/* Forward declarations */ +static int method_in_use(struct ib_mad_mgmt_method_table **method, + struct ib_mad_reg_req *mad_reg_req); +static void remove_mad_reg_req(struct ib_mad_agent_private *priv); +static struct ib_mad_agent_private *find_mad_agent( + struct ib_mad_port_private *port_priv, + struct ib_mad *mad); +static int ib_mad_post_receive_mads(struct ib_mad_qp_info *qp_info, + struct ib_mad_private *mad); +static void cancel_mads(struct ib_mad_agent_private *mad_agent_priv); +static void timeout_sends(struct work_struct *work); +static void local_completions(struct work_struct *work); +static int add_nonoui_reg_req(struct ib_mad_reg_req *mad_reg_req, + struct ib_mad_agent_private *agent_priv, + u8 mgmt_class); +static int add_oui_reg_req(struct ib_mad_reg_req *mad_reg_req, + struct ib_mad_agent_private *agent_priv); + +/* + * Returns a ib_mad_port_private structure or NULL for a device/port + * Assumes ib_mad_port_list_lock is being held + */ +static inline struct ib_mad_port_private * +__ib_get_mad_port(struct ib_device *device, int port_num) +{ + struct ib_mad_port_private *entry; + + list_for_each_entry(entry, &ib_mad_port_list, port_list) { + if (entry->device == device && entry->port_num == port_num) + return entry; + } + return NULL; +} + +/* + * Wrapper function to return a ib_mad_port_private structure or NULL + * for a device/port + */ +static inline struct ib_mad_port_private * +ib_get_mad_port(struct ib_device *device, int port_num) +{ + struct ib_mad_port_private *entry; + unsigned long flags; + + spin_lock_irqsave(&ib_mad_port_list_lock, flags); + entry = __ib_get_mad_port(device, port_num); + spin_unlock_irqrestore(&ib_mad_port_list_lock, flags); + + return entry; +} + +static inline u8 convert_mgmt_class(u8 mgmt_class) +{ + /* Alias IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE to 0 */ + return mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE ? + 0 : mgmt_class; +} + +static int get_spl_qp_index(enum ib_qp_type qp_type) +{ + switch (qp_type) + { + case IB_QPT_SMI: + return 0; + case IB_QPT_GSI: + return 1; + default: + return -1; + } +} + +static int vendor_class_index(u8 mgmt_class) +{ + return mgmt_class - IB_MGMT_CLASS_VENDOR_RANGE2_START; +} + +static int is_vendor_class(u8 mgmt_class) +{ + if ((mgmt_class < IB_MGMT_CLASS_VENDOR_RANGE2_START) || + (mgmt_class > IB_MGMT_CLASS_VENDOR_RANGE2_END)) + return 0; + return 1; +} + +static int is_vendor_oui(char *oui) +{ + if (oui[0] || oui[1] || oui[2]) + return 1; + return 0; +} + +static int is_vendor_method_in_use( + struct ib_mad_mgmt_vendor_class *vendor_class, + struct ib_mad_reg_req *mad_reg_req) +{ + struct ib_mad_mgmt_method_table *method; + int i; + + for (i = 0; i < MAX_MGMT_OUI; i++) { + if (!memcmp(vendor_class->oui[i], mad_reg_req->oui, 3)) { + method = vendor_class->method_table[i]; + if (method) { + if (method_in_use(&method, mad_reg_req)) + return 1; + else + break; + } + } + } + return 0; +} + +int ib_response_mad(struct ib_mad *mad) +{ + return ((mad->mad_hdr.method & IB_MGMT_METHOD_RESP) || + (mad->mad_hdr.method == IB_MGMT_METHOD_TRAP_REPRESS) || + ((mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_BM) && + (mad->mad_hdr.attr_mod & IB_BM_ATTR_MOD_RESP))); +} +EXPORT_SYMBOL(ib_response_mad); + +/* + * ib_register_mad_agent - Register to send/receive MADs + */ +struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device, + u8 port_num, + enum ib_qp_type qp_type, + struct ib_mad_reg_req *mad_reg_req, + u8 rmpp_version, + ib_mad_send_handler send_handler, + ib_mad_recv_handler recv_handler, + void *context, + u32 registration_flags) +{ + struct ib_mad_port_private *port_priv; + struct ib_mad_agent *ret = ERR_PTR(-EINVAL); + struct ib_mad_agent_private *mad_agent_priv; + struct ib_mad_reg_req *reg_req = NULL; + struct ib_mad_mgmt_class_table *class; + struct ib_mad_mgmt_vendor_class_table *vendor; + struct ib_mad_mgmt_vendor_class *vendor_class; + struct ib_mad_mgmt_method_table *method; + int ret2, qpn; + unsigned long flags; + u8 mgmt_class, vclass; + + /* Validate parameters */ + qpn = get_spl_qp_index(qp_type); + if (qpn == -1) { + dev_notice(&device->dev, + "ib_register_mad_agent: invalid QP Type %d\n", + qp_type); + goto error1; + } + + if (rmpp_version && rmpp_version != IB_MGMT_RMPP_VERSION) { + dev_notice(&device->dev, + "ib_register_mad_agent: invalid RMPP Version %u\n", + rmpp_version); + goto error1; + } + + /* Validate MAD registration request if supplied */ + if (mad_reg_req) { + if (mad_reg_req->mgmt_class_version >= MAX_MGMT_VERSION) { + dev_notice(&device->dev, + "ib_register_mad_agent: invalid Class Version %u\n", + mad_reg_req->mgmt_class_version); + goto error1; + } + if (!recv_handler) { + dev_notice(&device->dev, + "ib_register_mad_agent: no recv_handler\n"); + goto error1; + } + if (mad_reg_req->mgmt_class >= MAX_MGMT_CLASS) { + /* + * IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE is the only + * one in this range currently allowed + */ + if (mad_reg_req->mgmt_class != + IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) { + dev_notice(&device->dev, + "ib_register_mad_agent: Invalid Mgmt Class 0x%x\n", + mad_reg_req->mgmt_class); + goto error1; + } + } else if (mad_reg_req->mgmt_class == 0) { + /* + * Class 0 is reserved in IBA and is used for + * aliasing of IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE + */ + dev_notice(&device->dev, + "ib_register_mad_agent: Invalid Mgmt Class 0\n"); + goto error1; + } else if (is_vendor_class(mad_reg_req->mgmt_class)) { + /* + * If class is in "new" vendor range, + * ensure supplied OUI is not zero + */ + if (!is_vendor_oui(mad_reg_req->oui)) { + dev_notice(&device->dev, + "ib_register_mad_agent: No OUI specified for class 0x%x\n", + mad_reg_req->mgmt_class); + goto error1; + } + } + /* Make sure class supplied is consistent with RMPP */ + if (!ib_is_mad_class_rmpp(mad_reg_req->mgmt_class)) { + if (rmpp_version) { + dev_notice(&device->dev, + "ib_register_mad_agent: RMPP version for non-RMPP class 0x%x\n", + mad_reg_req->mgmt_class); + goto error1; + } + } + + /* Make sure class supplied is consistent with QP type */ + if (qp_type == IB_QPT_SMI) { + if ((mad_reg_req->mgmt_class != + IB_MGMT_CLASS_SUBN_LID_ROUTED) && + (mad_reg_req->mgmt_class != + IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)) { + dev_notice(&device->dev, + "ib_register_mad_agent: Invalid SM QP type: class 0x%x\n", + mad_reg_req->mgmt_class); + goto error1; + } + } else { + if ((mad_reg_req->mgmt_class == + IB_MGMT_CLASS_SUBN_LID_ROUTED) || + (mad_reg_req->mgmt_class == + IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)) { + dev_notice(&device->dev, + "ib_register_mad_agent: Invalid GS QP type: class 0x%x\n", + mad_reg_req->mgmt_class); + goto error1; + } + } + } else { + /* No registration request supplied */ + if (!send_handler) + goto error1; + if (registration_flags & IB_MAD_USER_RMPP) + goto error1; + } + + /* Validate device and port */ + port_priv = ib_get_mad_port(device, port_num); + if (!port_priv) { + dev_notice(&device->dev, "ib_register_mad_agent: Invalid port\n"); + ret = ERR_PTR(-ENODEV); + goto error1; + } + + /* Verify the QP requested is supported. For example, Ethernet devices + * will not have QP0 */ + if (!port_priv->qp_info[qpn].qp) { + dev_notice(&device->dev, + "ib_register_mad_agent: QP %d not supported\n", qpn); + ret = ERR_PTR(-EPROTONOSUPPORT); + goto error1; + } + + /* Allocate structures */ + mad_agent_priv = kzalloc(sizeof *mad_agent_priv, GFP_KERNEL); + if (!mad_agent_priv) { + ret = ERR_PTR(-ENOMEM); + goto error1; + } + + mad_agent_priv->agent.mr = ib_get_dma_mr(port_priv->qp_info[qpn].qp->pd, + IB_ACCESS_LOCAL_WRITE); + if (IS_ERR(mad_agent_priv->agent.mr)) { + ret = ERR_PTR(-ENOMEM); + goto error2; + } + + if (mad_reg_req) { + reg_req = kmemdup(mad_reg_req, sizeof *reg_req, GFP_KERNEL); + if (!reg_req) { + ret = ERR_PTR(-ENOMEM); + goto error3; + } + } + + /* Now, fill in the various structures */ + mad_agent_priv->qp_info = &port_priv->qp_info[qpn]; + mad_agent_priv->reg_req = reg_req; + mad_agent_priv->agent.rmpp_version = rmpp_version; + mad_agent_priv->agent.device = device; + mad_agent_priv->agent.recv_handler = recv_handler; + mad_agent_priv->agent.send_handler = send_handler; + mad_agent_priv->agent.context = context; + mad_agent_priv->agent.qp = port_priv->qp_info[qpn].qp; + mad_agent_priv->agent.port_num = port_num; + mad_agent_priv->agent.flags = registration_flags; + spin_lock_init(&mad_agent_priv->lock); + INIT_LIST_HEAD(&mad_agent_priv->send_list); + INIT_LIST_HEAD(&mad_agent_priv->wait_list); + INIT_LIST_HEAD(&mad_agent_priv->done_list); + INIT_LIST_HEAD(&mad_agent_priv->rmpp_list); + INIT_DELAYED_WORK(&mad_agent_priv->timed_work, timeout_sends); + INIT_LIST_HEAD(&mad_agent_priv->local_list); + INIT_WORK(&mad_agent_priv->local_work, local_completions); + atomic_set(&mad_agent_priv->refcount, 1); + init_completion(&mad_agent_priv->comp); + + spin_lock_irqsave(&port_priv->reg_lock, flags); + mad_agent_priv->agent.hi_tid = ++ib_mad_client_id; + + /* + * Make sure MAD registration (if supplied) + * is non overlapping with any existing ones + */ + if (mad_reg_req) { + mgmt_class = convert_mgmt_class(mad_reg_req->mgmt_class); + if (!is_vendor_class(mgmt_class)) { + class = port_priv->version[mad_reg_req-> + mgmt_class_version].class; + if (class) { + method = class->method_table[mgmt_class]; + if (method) { + if (method_in_use(&method, + mad_reg_req)) + goto error4; + } + } + ret2 = add_nonoui_reg_req(mad_reg_req, mad_agent_priv, + mgmt_class); + } else { + /* "New" vendor class range */ + vendor = port_priv->version[mad_reg_req-> + mgmt_class_version].vendor; + if (vendor) { + vclass = vendor_class_index(mgmt_class); + vendor_class = vendor->vendor_class[vclass]; + if (vendor_class) { + if (is_vendor_method_in_use( + vendor_class, + mad_reg_req)) + goto error4; + } + } + ret2 = add_oui_reg_req(mad_reg_req, mad_agent_priv); + } + if (ret2) { + ret = ERR_PTR(ret2); + goto error4; + } + } + + /* Add mad agent into port's agent list */ + list_add_tail(&mad_agent_priv->agent_list, &port_priv->agent_list); + spin_unlock_irqrestore(&port_priv->reg_lock, flags); + + return &mad_agent_priv->agent; + +error4: + spin_unlock_irqrestore(&port_priv->reg_lock, flags); + kfree(reg_req); +error3: + ib_dereg_mr(mad_agent_priv->agent.mr); +error2: + kfree(mad_agent_priv); +error1: + return ret; +} +EXPORT_SYMBOL(ib_register_mad_agent); + +static inline int is_snooping_sends(int mad_snoop_flags) +{ + return (mad_snoop_flags & + (/*IB_MAD_SNOOP_POSTED_SENDS | + IB_MAD_SNOOP_RMPP_SENDS |*/ + IB_MAD_SNOOP_SEND_COMPLETIONS /*| + IB_MAD_SNOOP_RMPP_SEND_COMPLETIONS*/)); +} + +static inline int is_snooping_recvs(int mad_snoop_flags) +{ + return (mad_snoop_flags & + (IB_MAD_SNOOP_RECVS /*| + IB_MAD_SNOOP_RMPP_RECVS*/)); +} + +static int register_snoop_agent(struct ib_mad_qp_info *qp_info, + struct ib_mad_snoop_private *mad_snoop_priv) +{ + struct ib_mad_snoop_private **new_snoop_table; + unsigned long flags; + int i; + + spin_lock_irqsave(&qp_info->snoop_lock, flags); + /* Check for empty slot in array. */ + for (i = 0; i < qp_info->snoop_table_size; i++) + if (!qp_info->snoop_table[i]) + break; + + if (i == qp_info->snoop_table_size) { + /* Grow table. */ + new_snoop_table = krealloc(qp_info->snoop_table, + sizeof mad_snoop_priv * + (qp_info->snoop_table_size + 1), + GFP_ATOMIC); + if (!new_snoop_table) { + i = -ENOMEM; + goto out; + } + + qp_info->snoop_table = new_snoop_table; + qp_info->snoop_table_size++; + } + qp_info->snoop_table[i] = mad_snoop_priv; + atomic_inc(&qp_info->snoop_count); +out: + spin_unlock_irqrestore(&qp_info->snoop_lock, flags); + return i; +} + +struct ib_mad_agent *ib_register_mad_snoop(struct ib_device *device, + u8 port_num, + enum ib_qp_type qp_type, + int mad_snoop_flags, + ib_mad_snoop_handler snoop_handler, + ib_mad_recv_handler recv_handler, + void *context) +{ + struct ib_mad_port_private *port_priv; + struct ib_mad_agent *ret; + struct ib_mad_snoop_private *mad_snoop_priv; + int qpn; + + /* Validate parameters */ + if ((is_snooping_sends(mad_snoop_flags) && !snoop_handler) || + (is_snooping_recvs(mad_snoop_flags) && !recv_handler)) { + ret = ERR_PTR(-EINVAL); + goto error1; + } + qpn = get_spl_qp_index(qp_type); + if (qpn == -1) { + ret = ERR_PTR(-EINVAL); + goto error1; + } + port_priv = ib_get_mad_port(device, port_num); + if (!port_priv) { + ret = ERR_PTR(-ENODEV); + goto error1; + } + /* Allocate structures */ + mad_snoop_priv = kzalloc(sizeof *mad_snoop_priv, GFP_KERNEL); + if (!mad_snoop_priv) { + ret = ERR_PTR(-ENOMEM); + goto error1; + } + + /* Now, fill in the various structures */ + mad_snoop_priv->qp_info = &port_priv->qp_info[qpn]; + mad_snoop_priv->agent.device = device; + mad_snoop_priv->agent.recv_handler = recv_handler; + mad_snoop_priv->agent.snoop_handler = snoop_handler; + mad_snoop_priv->agent.context = context; + mad_snoop_priv->agent.qp = port_priv->qp_info[qpn].qp; + mad_snoop_priv->agent.port_num = port_num; + mad_snoop_priv->mad_snoop_flags = mad_snoop_flags; + init_completion(&mad_snoop_priv->comp); + mad_snoop_priv->snoop_index = register_snoop_agent( + &port_priv->qp_info[qpn], + mad_snoop_priv); + if (mad_snoop_priv->snoop_index < 0) { + ret = ERR_PTR(mad_snoop_priv->snoop_index); + goto error2; + } + + atomic_set(&mad_snoop_priv->refcount, 1); + return &mad_snoop_priv->agent; + +error2: + kfree(mad_snoop_priv); +error1: + return ret; +} +EXPORT_SYMBOL(ib_register_mad_snoop); + +static inline void deref_mad_agent(struct ib_mad_agent_private *mad_agent_priv) +{ + if (atomic_dec_and_test(&mad_agent_priv->refcount)) + complete(&mad_agent_priv->comp); +} + +static inline void deref_snoop_agent(struct ib_mad_snoop_private *mad_snoop_priv) +{ + if (atomic_dec_and_test(&mad_snoop_priv->refcount)) + complete(&mad_snoop_priv->comp); +} + +static void unregister_mad_agent(struct ib_mad_agent_private *mad_agent_priv) +{ + struct ib_mad_port_private *port_priv; + unsigned long flags; + + /* Note that we could still be handling received MADs */ + + /* + * Canceling all sends results in dropping received response + * MADs, preventing us from queuing additional work + */ + cancel_mads(mad_agent_priv); + port_priv = mad_agent_priv->qp_info->port_priv; + cancel_delayed_work(&mad_agent_priv->timed_work); + + spin_lock_irqsave(&port_priv->reg_lock, flags); + remove_mad_reg_req(mad_agent_priv); + list_del(&mad_agent_priv->agent_list); + spin_unlock_irqrestore(&port_priv->reg_lock, flags); + + flush_workqueue(port_priv->wq); + ib_cancel_rmpp_recvs(mad_agent_priv); + + deref_mad_agent(mad_agent_priv); + wait_for_completion(&mad_agent_priv->comp); + + kfree(mad_agent_priv->reg_req); + ib_dereg_mr(mad_agent_priv->agent.mr); + kfree(mad_agent_priv); +} + +static void unregister_mad_snoop(struct ib_mad_snoop_private *mad_snoop_priv) +{ + struct ib_mad_qp_info *qp_info; + unsigned long flags; + + qp_info = mad_snoop_priv->qp_info; + spin_lock_irqsave(&qp_info->snoop_lock, flags); + qp_info->snoop_table[mad_snoop_priv->snoop_index] = NULL; + atomic_dec(&qp_info->snoop_count); + spin_unlock_irqrestore(&qp_info->snoop_lock, flags); + + deref_snoop_agent(mad_snoop_priv); + wait_for_completion(&mad_snoop_priv->comp); + + kfree(mad_snoop_priv); +} + +/* + * ib_unregister_mad_agent - Unregisters a client from using MAD services + */ +int ib_unregister_mad_agent(struct ib_mad_agent *mad_agent) +{ + struct ib_mad_agent_private *mad_agent_priv; + struct ib_mad_snoop_private *mad_snoop_priv; + + /* If the TID is zero, the agent can only snoop. */ + if (mad_agent->hi_tid) { + mad_agent_priv = container_of(mad_agent, + struct ib_mad_agent_private, + agent); + unregister_mad_agent(mad_agent_priv); + } else { + mad_snoop_priv = container_of(mad_agent, + struct ib_mad_snoop_private, + agent); + unregister_mad_snoop(mad_snoop_priv); + } + return 0; +} +EXPORT_SYMBOL(ib_unregister_mad_agent); + +static void dequeue_mad(struct ib_mad_list_head *mad_list) +{ + struct ib_mad_queue *mad_queue; + unsigned long flags; + + BUG_ON(!mad_list->mad_queue); + mad_queue = mad_list->mad_queue; + spin_lock_irqsave(&mad_queue->lock, flags); + list_del(&mad_list->list); + mad_queue->count--; + spin_unlock_irqrestore(&mad_queue->lock, flags); +} + +static void snoop_send(struct ib_mad_qp_info *qp_info, + struct ib_mad_send_buf *send_buf, + struct ib_mad_send_wc *mad_send_wc, + int mad_snoop_flags) +{ + struct ib_mad_snoop_private *mad_snoop_priv; + unsigned long flags; + int i; + + spin_lock_irqsave(&qp_info->snoop_lock, flags); + for (i = 0; i < qp_info->snoop_table_size; i++) { + mad_snoop_priv = qp_info->snoop_table[i]; + if (!mad_snoop_priv || + !(mad_snoop_priv->mad_snoop_flags & mad_snoop_flags)) + continue; + + atomic_inc(&mad_snoop_priv->refcount); + spin_unlock_irqrestore(&qp_info->snoop_lock, flags); + mad_snoop_priv->agent.snoop_handler(&mad_snoop_priv->agent, + send_buf, mad_send_wc); + deref_snoop_agent(mad_snoop_priv); + spin_lock_irqsave(&qp_info->snoop_lock, flags); + } + spin_unlock_irqrestore(&qp_info->snoop_lock, flags); +} + +static void snoop_recv(struct ib_mad_qp_info *qp_info, + struct ib_mad_recv_wc *mad_recv_wc, + int mad_snoop_flags) +{ + struct ib_mad_snoop_private *mad_snoop_priv; + unsigned long flags; + int i; + + spin_lock_irqsave(&qp_info->snoop_lock, flags); + for (i = 0; i < qp_info->snoop_table_size; i++) { + mad_snoop_priv = qp_info->snoop_table[i]; + if (!mad_snoop_priv || + !(mad_snoop_priv->mad_snoop_flags & mad_snoop_flags)) + continue; + + atomic_inc(&mad_snoop_priv->refcount); + spin_unlock_irqrestore(&qp_info->snoop_lock, flags); + mad_snoop_priv->agent.recv_handler(&mad_snoop_priv->agent, + mad_recv_wc); + deref_snoop_agent(mad_snoop_priv); + spin_lock_irqsave(&qp_info->snoop_lock, flags); + } + spin_unlock_irqrestore(&qp_info->snoop_lock, flags); +} + +static void build_smp_wc(struct ib_qp *qp, + u64 wr_id, u16 slid, u16 pkey_index, u8 port_num, + struct ib_wc *wc) +{ + memset(wc, 0, sizeof *wc); + wc->wr_id = wr_id; + wc->status = IB_WC_SUCCESS; + wc->opcode = IB_WC_RECV; + wc->pkey_index = pkey_index; + wc->byte_len = sizeof(struct ib_mad) + sizeof(struct ib_grh); + wc->src_qp = IB_QP0; + wc->qp = qp; + wc->slid = slid; + wc->sl = 0; + wc->dlid_path_bits = 0; + wc->port_num = port_num; +} + +/* + * Return 0 if SMP is to be sent + * Return 1 if SMP was consumed locally (whether or not solicited) + * Return < 0 if error + */ +static int handle_outgoing_dr_smp(struct ib_mad_agent_private *mad_agent_priv, + struct ib_mad_send_wr_private *mad_send_wr) +{ + int ret = 0; + struct ib_smp *smp = mad_send_wr->send_buf.mad; + unsigned long flags; + struct ib_mad_local_private *local; + struct ib_mad_private *mad_priv; + struct ib_mad_port_private *port_priv; + struct ib_mad_agent_private *recv_mad_agent = NULL; + struct ib_device *device = mad_agent_priv->agent.device; + u8 port_num; + struct ib_wc mad_wc; + struct ib_send_wr *send_wr = &mad_send_wr->send_wr; + + if (device->node_type == RDMA_NODE_IB_SWITCH && + smp->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) + port_num = send_wr->wr.ud.port_num; + else + port_num = mad_agent_priv->agent.port_num; + + /* + * Directed route handling starts if the initial LID routed part of + * a request or the ending LID routed part of a response is empty. + * If we are at the start of the LID routed part, don't update the + * hop_ptr or hop_cnt. See section 14.2.2, Vol 1 IB spec. + */ + if ((ib_get_smp_direction(smp) ? smp->dr_dlid : smp->dr_slid) == + IB_LID_PERMISSIVE && + smi_handle_dr_smp_send(smp, device->node_type, port_num) == + IB_SMI_DISCARD) { + ret = -EINVAL; + dev_err(&device->dev, "Invalid directed route\n"); + goto out; + } + + /* Check to post send on QP or process locally */ + if (smi_check_local_smp(smp, device) == IB_SMI_DISCARD && + smi_check_local_returning_smp(smp, device) == IB_SMI_DISCARD) + goto out; + + local = kmalloc(sizeof *local, GFP_ATOMIC); + if (!local) { + ret = -ENOMEM; + dev_err(&device->dev, "No memory for ib_mad_local_private\n"); + goto out; + } + local->mad_priv = NULL; + local->recv_mad_agent = NULL; + mad_priv = kmem_cache_alloc(ib_mad_cache, GFP_ATOMIC); + if (!mad_priv) { + ret = -ENOMEM; + dev_err(&device->dev, "No memory for local response MAD\n"); + kfree(local); + goto out; + } + + build_smp_wc(mad_agent_priv->agent.qp, + send_wr->wr_id, be16_to_cpu(smp->dr_slid), + send_wr->wr.ud.pkey_index, + send_wr->wr.ud.port_num, &mad_wc); + + /* No GRH for DR SMP */ + ret = device->process_mad(device, 0, port_num, &mad_wc, NULL, + (struct ib_mad *)smp, + (struct ib_mad *)&mad_priv->mad); + switch (ret) + { + case IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY: + if (ib_response_mad(&mad_priv->mad.mad) && + mad_agent_priv->agent.recv_handler) { + local->mad_priv = mad_priv; + local->recv_mad_agent = mad_agent_priv; + /* + * Reference MAD agent until receive + * side of local completion handled + */ + atomic_inc(&mad_agent_priv->refcount); + } else + kmem_cache_free(ib_mad_cache, mad_priv); + break; + case IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED: + kmem_cache_free(ib_mad_cache, mad_priv); + break; + case IB_MAD_RESULT_SUCCESS: + /* Treat like an incoming receive MAD */ + port_priv = ib_get_mad_port(mad_agent_priv->agent.device, + mad_agent_priv->agent.port_num); + if (port_priv) { + memcpy(&mad_priv->mad.mad, smp, sizeof(struct ib_mad)); + recv_mad_agent = find_mad_agent(port_priv, + &mad_priv->mad.mad); + } + if (!port_priv || !recv_mad_agent) { + /* + * No receiving agent so drop packet and + * generate send completion. + */ + kmem_cache_free(ib_mad_cache, mad_priv); + break; + } + local->mad_priv = mad_priv; + local->recv_mad_agent = recv_mad_agent; + break; + default: + kmem_cache_free(ib_mad_cache, mad_priv); + kfree(local); + ret = -EINVAL; + goto out; + } + + local->mad_send_wr = mad_send_wr; + /* Reference MAD agent until send side of local completion handled */ + atomic_inc(&mad_agent_priv->refcount); + /* Queue local completion to local list */ + spin_lock_irqsave(&mad_agent_priv->lock, flags); + list_add_tail(&local->completion_list, &mad_agent_priv->local_list); + spin_unlock_irqrestore(&mad_agent_priv->lock, flags); + queue_work(mad_agent_priv->qp_info->port_priv->wq, + &mad_agent_priv->local_work); + ret = 1; +out: + return ret; +} + +static int get_pad_size(int hdr_len, int data_len) +{ + int seg_size, pad; + + seg_size = sizeof(struct ib_mad) - hdr_len; + if (data_len && seg_size) { + pad = seg_size - data_len % seg_size; + return pad == seg_size ? 0 : pad; + } else + return seg_size; +} + +static void free_send_rmpp_list(struct ib_mad_send_wr_private *mad_send_wr) +{ + struct ib_rmpp_segment *s, *t; + + list_for_each_entry_safe(s, t, &mad_send_wr->rmpp_list, list) { + list_del(&s->list); + kfree(s); + } +} + +static int alloc_send_rmpp_list(struct ib_mad_send_wr_private *send_wr, + gfp_t gfp_mask) +{ + struct ib_mad_send_buf *send_buf = &send_wr->send_buf; + struct ib_rmpp_mad *rmpp_mad = send_buf->mad; + struct ib_rmpp_segment *seg = NULL; + int left, seg_size, pad; + + send_buf->seg_size = sizeof (struct ib_mad) - send_buf->hdr_len; + seg_size = send_buf->seg_size; + pad = send_wr->pad; + + /* Allocate data segments. */ + for (left = send_buf->data_len + pad; left > 0; left -= seg_size) { + seg = kmalloc(sizeof (*seg) + seg_size, gfp_mask); + if (!seg) { + dev_err(&send_buf->mad_agent->device->dev, + "alloc_send_rmpp_segs: RMPP mem alloc failed for len %zd, gfp %#x\n", + sizeof (*seg) + seg_size, gfp_mask); + free_send_rmpp_list(send_wr); + return -ENOMEM; + } + seg->num = ++send_buf->seg_count; + list_add_tail(&seg->list, &send_wr->rmpp_list); + } + + /* Zero any padding */ + if (pad) + memset(seg->data + seg_size - pad, 0, pad); + + rmpp_mad->rmpp_hdr.rmpp_version = send_wr->mad_agent_priv-> + agent.rmpp_version; + rmpp_mad->rmpp_hdr.rmpp_type = IB_MGMT_RMPP_TYPE_DATA; + ib_set_rmpp_flags(&rmpp_mad->rmpp_hdr, IB_MGMT_RMPP_FLAG_ACTIVE); + + send_wr->cur_seg = container_of(send_wr->rmpp_list.next, + struct ib_rmpp_segment, list); + send_wr->last_ack_seg = send_wr->cur_seg; + return 0; +} + +int ib_mad_kernel_rmpp_agent(struct ib_mad_agent *agent) +{ + return agent->rmpp_version && !(agent->flags & IB_MAD_USER_RMPP); +} +EXPORT_SYMBOL(ib_mad_kernel_rmpp_agent); + +struct ib_mad_send_buf * ib_create_send_mad(struct ib_mad_agent *mad_agent, + u32 remote_qpn, u16 pkey_index, + int rmpp_active, + int hdr_len, int data_len, + gfp_t gfp_mask) +{ + struct ib_mad_agent_private *mad_agent_priv; + struct ib_mad_send_wr_private *mad_send_wr; + int pad, message_size, ret, size; + void *buf; + + mad_agent_priv = container_of(mad_agent, struct ib_mad_agent_private, + agent); + pad = get_pad_size(hdr_len, data_len); + message_size = hdr_len + data_len + pad; + + if (ib_mad_kernel_rmpp_agent(mad_agent)) { + if (!rmpp_active && message_size > sizeof(struct ib_mad)) + return ERR_PTR(-EINVAL); + } else + if (rmpp_active || message_size > sizeof(struct ib_mad)) + return ERR_PTR(-EINVAL); + + size = rmpp_active ? hdr_len : sizeof(struct ib_mad); + buf = kzalloc(sizeof *mad_send_wr + size, gfp_mask); + if (!buf) + return ERR_PTR(-ENOMEM); + + mad_send_wr = buf + size; + INIT_LIST_HEAD(&mad_send_wr->rmpp_list); + mad_send_wr->send_buf.mad = buf; + mad_send_wr->send_buf.hdr_len = hdr_len; + mad_send_wr->send_buf.data_len = data_len; + mad_send_wr->pad = pad; + + mad_send_wr->mad_agent_priv = mad_agent_priv; + mad_send_wr->sg_list[0].length = hdr_len; + mad_send_wr->sg_list[0].lkey = mad_agent->mr->lkey; + mad_send_wr->sg_list[1].length = sizeof(struct ib_mad) - hdr_len; + mad_send_wr->sg_list[1].lkey = mad_agent->mr->lkey; + + mad_send_wr->send_wr.wr_id = (unsigned long) mad_send_wr; + mad_send_wr->send_wr.sg_list = mad_send_wr->sg_list; + mad_send_wr->send_wr.num_sge = 2; + mad_send_wr->send_wr.opcode = IB_WR_SEND; + mad_send_wr->send_wr.send_flags = IB_SEND_SIGNALED; + mad_send_wr->send_wr.wr.ud.remote_qpn = remote_qpn; + mad_send_wr->send_wr.wr.ud.remote_qkey = IB_QP_SET_QKEY; + mad_send_wr->send_wr.wr.ud.pkey_index = pkey_index; + + if (rmpp_active) { + ret = alloc_send_rmpp_list(mad_send_wr, gfp_mask); + if (ret) { + kfree(buf); + return ERR_PTR(ret); + } + } + + mad_send_wr->send_buf.mad_agent = mad_agent; + atomic_inc(&mad_agent_priv->refcount); + return &mad_send_wr->send_buf; +} +EXPORT_SYMBOL(ib_create_send_mad); + +int ib_get_mad_data_offset(u8 mgmt_class) +{ + if (mgmt_class == IB_MGMT_CLASS_SUBN_ADM) + return IB_MGMT_SA_HDR; + else if ((mgmt_class == IB_MGMT_CLASS_DEVICE_MGMT) || + (mgmt_class == IB_MGMT_CLASS_DEVICE_ADM) || + (mgmt_class == IB_MGMT_CLASS_BIS)) + return IB_MGMT_DEVICE_HDR; + else if ((mgmt_class >= IB_MGMT_CLASS_VENDOR_RANGE2_START) && + (mgmt_class <= IB_MGMT_CLASS_VENDOR_RANGE2_END)) + return IB_MGMT_VENDOR_HDR; + else + return IB_MGMT_MAD_HDR; +} +EXPORT_SYMBOL(ib_get_mad_data_offset); + +int ib_is_mad_class_rmpp(u8 mgmt_class) +{ + if ((mgmt_class == IB_MGMT_CLASS_SUBN_ADM) || + (mgmt_class == IB_MGMT_CLASS_DEVICE_MGMT) || + (mgmt_class == IB_MGMT_CLASS_DEVICE_ADM) || + (mgmt_class == IB_MGMT_CLASS_BIS) || + ((mgmt_class >= IB_MGMT_CLASS_VENDOR_RANGE2_START) && + (mgmt_class <= IB_MGMT_CLASS_VENDOR_RANGE2_END))) + return 1; + return 0; +} +EXPORT_SYMBOL(ib_is_mad_class_rmpp); + +void *ib_get_rmpp_segment(struct ib_mad_send_buf *send_buf, int seg_num) +{ + struct ib_mad_send_wr_private *mad_send_wr; + struct list_head *list; + + mad_send_wr = container_of(send_buf, struct ib_mad_send_wr_private, + send_buf); + list = &mad_send_wr->cur_seg->list; + + if (mad_send_wr->cur_seg->num < seg_num) { + list_for_each_entry(mad_send_wr->cur_seg, list, list) + if (mad_send_wr->cur_seg->num == seg_num) + break; + } else if (mad_send_wr->cur_seg->num > seg_num) { + list_for_each_entry_reverse(mad_send_wr->cur_seg, list, list) + if (mad_send_wr->cur_seg->num == seg_num) + break; + } + return mad_send_wr->cur_seg->data; +} +EXPORT_SYMBOL(ib_get_rmpp_segment); + +static inline void *ib_get_payload(struct ib_mad_send_wr_private *mad_send_wr) +{ + if (mad_send_wr->send_buf.seg_count) + return ib_get_rmpp_segment(&mad_send_wr->send_buf, + mad_send_wr->seg_num); + else + return mad_send_wr->send_buf.mad + + mad_send_wr->send_buf.hdr_len; +} + +void ib_free_send_mad(struct ib_mad_send_buf *send_buf) +{ + struct ib_mad_agent_private *mad_agent_priv; + struct ib_mad_send_wr_private *mad_send_wr; + + mad_agent_priv = container_of(send_buf->mad_agent, + struct ib_mad_agent_private, agent); + mad_send_wr = container_of(send_buf, struct ib_mad_send_wr_private, + send_buf); + + free_send_rmpp_list(mad_send_wr); + kfree(send_buf->mad); + deref_mad_agent(mad_agent_priv); +} +EXPORT_SYMBOL(ib_free_send_mad); + +int ib_send_mad(struct ib_mad_send_wr_private *mad_send_wr) +{ + struct ib_mad_qp_info *qp_info; + struct list_head *list; + struct ib_send_wr *bad_send_wr; + struct ib_mad_agent *mad_agent; + struct ib_sge *sge; + unsigned long flags; + int ret; + + /* Set WR ID to find mad_send_wr upon completion */ + qp_info = mad_send_wr->mad_agent_priv->qp_info; + mad_send_wr->send_wr.wr_id = (unsigned long)&mad_send_wr->mad_list; + mad_send_wr->mad_list.mad_queue = &qp_info->send_queue; + + mad_agent = mad_send_wr->send_buf.mad_agent; + sge = mad_send_wr->sg_list; + sge[0].addr = ib_dma_map_single(mad_agent->device, + mad_send_wr->send_buf.mad, + sge[0].length, + DMA_TO_DEVICE); + if (unlikely(ib_dma_mapping_error(mad_agent->device, sge[0].addr))) + return -ENOMEM; + + mad_send_wr->header_mapping = sge[0].addr; + + sge[1].addr = ib_dma_map_single(mad_agent->device, + ib_get_payload(mad_send_wr), + sge[1].length, + DMA_TO_DEVICE); + if (unlikely(ib_dma_mapping_error(mad_agent->device, sge[1].addr))) { + ib_dma_unmap_single(mad_agent->device, + mad_send_wr->header_mapping, + sge[0].length, DMA_TO_DEVICE); + return -ENOMEM; + } + mad_send_wr->payload_mapping = sge[1].addr; + + spin_lock_irqsave(&qp_info->send_queue.lock, flags); + if (qp_info->send_queue.count < qp_info->send_queue.max_active) { + ret = ib_post_send(mad_agent->qp, &mad_send_wr->send_wr, + &bad_send_wr); + list = &qp_info->send_queue.list; + } else { + ret = 0; + list = &qp_info->overflow_list; + } + + if (!ret) { + qp_info->send_queue.count++; + list_add_tail(&mad_send_wr->mad_list.list, list); + } + spin_unlock_irqrestore(&qp_info->send_queue.lock, flags); + if (ret) { + ib_dma_unmap_single(mad_agent->device, + mad_send_wr->header_mapping, + sge[0].length, DMA_TO_DEVICE); + ib_dma_unmap_single(mad_agent->device, + mad_send_wr->payload_mapping, + sge[1].length, DMA_TO_DEVICE); + } + return ret; +} + +/* + * ib_post_send_mad - Posts MAD(s) to the send queue of the QP associated + * with the registered client + */ +int ib_post_send_mad(struct ib_mad_send_buf *send_buf, + struct ib_mad_send_buf **bad_send_buf) +{ + struct ib_mad_agent_private *mad_agent_priv; + struct ib_mad_send_buf *next_send_buf; + struct ib_mad_send_wr_private *mad_send_wr; + unsigned long flags; + int ret = -EINVAL; + + /* Walk list of send WRs and post each on send list */ + for (; send_buf; send_buf = next_send_buf) { + + mad_send_wr = container_of(send_buf, + struct ib_mad_send_wr_private, + send_buf); + mad_agent_priv = mad_send_wr->mad_agent_priv; + + if (!send_buf->mad_agent->send_handler || + (send_buf->timeout_ms && + !send_buf->mad_agent->recv_handler)) { + ret = -EINVAL; + goto error; + } + + if (!ib_is_mad_class_rmpp(((struct ib_mad_hdr *) send_buf->mad)->mgmt_class)) { + if (mad_agent_priv->agent.rmpp_version) { + ret = -EINVAL; + goto error; + } + } + + /* + * Save pointer to next work request to post in case the + * current one completes, and the user modifies the work + * request associated with the completion + */ + next_send_buf = send_buf->next; + mad_send_wr->send_wr.wr.ud.ah = send_buf->ah; + + if (((struct ib_mad_hdr *) send_buf->mad)->mgmt_class == + IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) { + ret = handle_outgoing_dr_smp(mad_agent_priv, + mad_send_wr); + if (ret < 0) /* error */ + goto error; + else if (ret == 1) /* locally consumed */ + continue; + } + + mad_send_wr->tid = ((struct ib_mad_hdr *) send_buf->mad)->tid; + /* Timeout will be updated after send completes */ + mad_send_wr->timeout = msecs_to_jiffies(send_buf->timeout_ms); + mad_send_wr->max_retries = send_buf->retries; + mad_send_wr->retries_left = send_buf->retries; + send_buf->retries = 0; + /* Reference for work request to QP + response */ + mad_send_wr->refcount = 1 + (mad_send_wr->timeout > 0); + mad_send_wr->status = IB_WC_SUCCESS; + + /* Reference MAD agent until send completes */ + atomic_inc(&mad_agent_priv->refcount); + spin_lock_irqsave(&mad_agent_priv->lock, flags); + list_add_tail(&mad_send_wr->agent_list, + &mad_agent_priv->send_list); + spin_unlock_irqrestore(&mad_agent_priv->lock, flags); + + if (ib_mad_kernel_rmpp_agent(&mad_agent_priv->agent)) { + ret = ib_send_rmpp_mad(mad_send_wr); + if (ret >= 0 && ret != IB_RMPP_RESULT_CONSUMED) + ret = ib_send_mad(mad_send_wr); + } else + ret = ib_send_mad(mad_send_wr); + if (ret < 0) { + /* Fail send request */ + spin_lock_irqsave(&mad_agent_priv->lock, flags); + list_del(&mad_send_wr->agent_list); + spin_unlock_irqrestore(&mad_agent_priv->lock, flags); + atomic_dec(&mad_agent_priv->refcount); + goto error; + } + } + return 0; +error: + if (bad_send_buf) + *bad_send_buf = send_buf; + return ret; +} +EXPORT_SYMBOL(ib_post_send_mad); + +/* + * ib_free_recv_mad - Returns data buffers used to receive + * a MAD to the access layer + */ +void ib_free_recv_mad(struct ib_mad_recv_wc *mad_recv_wc) +{ + struct ib_mad_recv_buf *mad_recv_buf, *temp_recv_buf; + struct ib_mad_private_header *mad_priv_hdr; + struct ib_mad_private *priv; + struct list_head free_list; + + INIT_LIST_HEAD(&free_list); + list_splice_init(&mad_recv_wc->rmpp_list, &free_list); + + list_for_each_entry_safe(mad_recv_buf, temp_recv_buf, + &free_list, list) { + mad_recv_wc = container_of(mad_recv_buf, struct ib_mad_recv_wc, + recv_buf); + mad_priv_hdr = container_of(mad_recv_wc, + struct ib_mad_private_header, + recv_wc); + priv = container_of(mad_priv_hdr, struct ib_mad_private, + header); + kmem_cache_free(ib_mad_cache, priv); + } +} +EXPORT_SYMBOL(ib_free_recv_mad); + +struct ib_mad_agent *ib_redirect_mad_qp(struct ib_qp *qp, + u8 rmpp_version, + ib_mad_send_handler send_handler, + ib_mad_recv_handler recv_handler, + void *context) +{ + return ERR_PTR(-EINVAL); /* XXX: for now */ +} +EXPORT_SYMBOL(ib_redirect_mad_qp); + +int ib_process_mad_wc(struct ib_mad_agent *mad_agent, + struct ib_wc *wc) +{ + dev_err(&mad_agent->device->dev, + "ib_process_mad_wc() not implemented yet\n"); + return 0; +} +EXPORT_SYMBOL(ib_process_mad_wc); + +static int method_in_use(struct ib_mad_mgmt_method_table **method, + struct ib_mad_reg_req *mad_reg_req) +{ + int i; + + for_each_set_bit(i, mad_reg_req->method_mask, IB_MGMT_MAX_METHODS) { + if ((*method)->agent[i]) { + pr_err("Method %d already in use\n", i); + return -EINVAL; + } + } + return 0; +} + +static int allocate_method_table(struct ib_mad_mgmt_method_table **method) +{ + /* Allocate management method table */ + *method = kzalloc(sizeof **method, GFP_ATOMIC); + if (!*method) { + pr_err("No memory for ib_mad_mgmt_method_table\n"); + return -ENOMEM; + } + + return 0; +} + +/* + * Check to see if there are any methods still in use + */ +static int check_method_table(struct ib_mad_mgmt_method_table *method) +{ + int i; + + for (i = 0; i < IB_MGMT_MAX_METHODS; i++) + if (method->agent[i]) + return 1; + return 0; +} + +/* + * Check to see if there are any method tables for this class still in use + */ +static int check_class_table(struct ib_mad_mgmt_class_table *class) +{ + int i; + + for (i = 0; i < MAX_MGMT_CLASS; i++) + if (class->method_table[i]) + return 1; + return 0; +} + +static int check_vendor_class(struct ib_mad_mgmt_vendor_class *vendor_class) +{ + int i; + + for (i = 0; i < MAX_MGMT_OUI; i++) + if (vendor_class->method_table[i]) + return 1; + return 0; +} + +static int find_vendor_oui(struct ib_mad_mgmt_vendor_class *vendor_class, + char *oui) +{ + int i; + + for (i = 0; i < MAX_MGMT_OUI; i++) + /* Is there matching OUI for this vendor class ? */ + if (!memcmp(vendor_class->oui[i], oui, 3)) + return i; + + return -1; +} + +static int check_vendor_table(struct ib_mad_mgmt_vendor_class_table *vendor) +{ + int i; + + for (i = 0; i < MAX_MGMT_VENDOR_RANGE2; i++) + if (vendor->vendor_class[i]) + return 1; + + return 0; +} + +static void remove_methods_mad_agent(struct ib_mad_mgmt_method_table *method, + struct ib_mad_agent_private *agent) +{ + int i; + + /* Remove any methods for this mad agent */ + for (i = 0; i < IB_MGMT_MAX_METHODS; i++) { + if (method->agent[i] == agent) { + method->agent[i] = NULL; + } + } +} + +static int add_nonoui_reg_req(struct ib_mad_reg_req *mad_reg_req, + struct ib_mad_agent_private *agent_priv, + u8 mgmt_class) +{ + struct ib_mad_port_private *port_priv; + struct ib_mad_mgmt_class_table **class; + struct ib_mad_mgmt_method_table **method; + int i, ret; + + port_priv = agent_priv->qp_info->port_priv; + class = &port_priv->version[mad_reg_req->mgmt_class_version].class; + if (!*class) { + /* Allocate management class table for "new" class version */ + *class = kzalloc(sizeof **class, GFP_ATOMIC); + if (!*class) { + dev_err(&agent_priv->agent.device->dev, + "No memory for ib_mad_mgmt_class_table\n"); + ret = -ENOMEM; + goto error1; + } + + /* Allocate method table for this management class */ + method = &(*class)->method_table[mgmt_class]; + if ((ret = allocate_method_table(method))) + goto error2; + } else { + method = &(*class)->method_table[mgmt_class]; + if (!*method) { + /* Allocate method table for this management class */ + if ((ret = allocate_method_table(method))) + goto error1; + } + } + + /* Now, make sure methods are not already in use */ + if (method_in_use(method, mad_reg_req)) + goto error3; + + /* Finally, add in methods being registered */ + for_each_set_bit(i, mad_reg_req->method_mask, IB_MGMT_MAX_METHODS) + (*method)->agent[i] = agent_priv; + + return 0; + +error3: + /* Remove any methods for this mad agent */ + remove_methods_mad_agent(*method, agent_priv); + /* Now, check to see if there are any methods in use */ + if (!check_method_table(*method)) { + /* If not, release management method table */ + kfree(*method); + *method = NULL; + } + ret = -EINVAL; + goto error1; +error2: + kfree(*class); + *class = NULL; +error1: + return ret; +} + +static int add_oui_reg_req(struct ib_mad_reg_req *mad_reg_req, + struct ib_mad_agent_private *agent_priv) +{ + struct ib_mad_port_private *port_priv; + struct ib_mad_mgmt_vendor_class_table **vendor_table; + struct ib_mad_mgmt_vendor_class_table *vendor = NULL; + struct ib_mad_mgmt_vendor_class *vendor_class = NULL; + struct ib_mad_mgmt_method_table **method; + int i, ret = -ENOMEM; + u8 vclass; + + /* "New" vendor (with OUI) class */ + vclass = vendor_class_index(mad_reg_req->mgmt_class); + port_priv = agent_priv->qp_info->port_priv; + vendor_table = &port_priv->version[ + mad_reg_req->mgmt_class_version].vendor; + if (!*vendor_table) { + /* Allocate mgmt vendor class table for "new" class version */ + vendor = kzalloc(sizeof *vendor, GFP_ATOMIC); + if (!vendor) { + dev_err(&agent_priv->agent.device->dev, + "No memory for ib_mad_mgmt_vendor_class_table\n"); + goto error1; + } + + *vendor_table = vendor; + } + if (!(*vendor_table)->vendor_class[vclass]) { + /* Allocate table for this management vendor class */ + vendor_class = kzalloc(sizeof *vendor_class, GFP_ATOMIC); + if (!vendor_class) { + dev_err(&agent_priv->agent.device->dev, + "No memory for ib_mad_mgmt_vendor_class\n"); + goto error2; + } + + (*vendor_table)->vendor_class[vclass] = vendor_class; + } + for (i = 0; i < MAX_MGMT_OUI; i++) { + /* Is there matching OUI for this vendor class ? */ + if (!memcmp((*vendor_table)->vendor_class[vclass]->oui[i], + mad_reg_req->oui, 3)) { + method = &(*vendor_table)->vendor_class[ + vclass]->method_table[i]; + BUG_ON(!*method); + goto check_in_use; + } + } + for (i = 0; i < MAX_MGMT_OUI; i++) { + /* OUI slot available ? */ + if (!is_vendor_oui((*vendor_table)->vendor_class[ + vclass]->oui[i])) { + method = &(*vendor_table)->vendor_class[ + vclass]->method_table[i]; + BUG_ON(*method); + /* Allocate method table for this OUI */ + if ((ret = allocate_method_table(method))) + goto error3; + memcpy((*vendor_table)->vendor_class[vclass]->oui[i], + mad_reg_req->oui, 3); + goto check_in_use; + } + } + dev_err(&agent_priv->agent.device->dev, "All OUI slots in use\n"); + goto error3; + +check_in_use: + /* Now, make sure methods are not already in use */ + if (method_in_use(method, mad_reg_req)) + goto error4; + + /* Finally, add in methods being registered */ + for_each_set_bit(i, mad_reg_req->method_mask, IB_MGMT_MAX_METHODS) + (*method)->agent[i] = agent_priv; + + return 0; + +error4: + /* Remove any methods for this mad agent */ + remove_methods_mad_agent(*method, agent_priv); + /* Now, check to see if there are any methods in use */ + if (!check_method_table(*method)) { + /* If not, release management method table */ + kfree(*method); + *method = NULL; + } + ret = -EINVAL; +error3: + if (vendor_class) { + (*vendor_table)->vendor_class[vclass] = NULL; + kfree(vendor_class); + } +error2: + if (vendor) { + *vendor_table = NULL; + kfree(vendor); + } +error1: + return ret; +} + +static void remove_mad_reg_req(struct ib_mad_agent_private *agent_priv) +{ + struct ib_mad_port_private *port_priv; + struct ib_mad_mgmt_class_table *class; + struct ib_mad_mgmt_method_table *method; + struct ib_mad_mgmt_vendor_class_table *vendor; + struct ib_mad_mgmt_vendor_class *vendor_class; + int index; + u8 mgmt_class; + + /* + * Was MAD registration request supplied + * with original registration ? + */ + if (!agent_priv->reg_req) { + goto out; + } + + port_priv = agent_priv->qp_info->port_priv; + mgmt_class = convert_mgmt_class(agent_priv->reg_req->mgmt_class); + class = port_priv->version[ + agent_priv->reg_req->mgmt_class_version].class; + if (!class) + goto vendor_check; + + method = class->method_table[mgmt_class]; + if (method) { + /* Remove any methods for this mad agent */ + remove_methods_mad_agent(method, agent_priv); + /* Now, check to see if there are any methods still in use */ + if (!check_method_table(method)) { + /* If not, release management method table */ + kfree(method); + class->method_table[mgmt_class] = NULL; + /* Any management classes left ? */ + if (!check_class_table(class)) { + /* If not, release management class table */ + kfree(class); + port_priv->version[ + agent_priv->reg_req-> + mgmt_class_version].class = NULL; + } + } + } + +vendor_check: + if (!is_vendor_class(mgmt_class)) + goto out; + + /* normalize mgmt_class to vendor range 2 */ + mgmt_class = vendor_class_index(agent_priv->reg_req->mgmt_class); + vendor = port_priv->version[ + agent_priv->reg_req->mgmt_class_version].vendor; + + if (!vendor) + goto out; + + vendor_class = vendor->vendor_class[mgmt_class]; + if (vendor_class) { + index = find_vendor_oui(vendor_class, agent_priv->reg_req->oui); + if (index < 0) + goto out; + method = vendor_class->method_table[index]; + if (method) { + /* Remove any methods for this mad agent */ + remove_methods_mad_agent(method, agent_priv); + /* + * Now, check to see if there are + * any methods still in use + */ + if (!check_method_table(method)) { + /* If not, release management method table */ + kfree(method); + vendor_class->method_table[index] = NULL; + memset(vendor_class->oui[index], 0, 3); + /* Any OUIs left ? */ + if (!check_vendor_class(vendor_class)) { + /* If not, release vendor class table */ + kfree(vendor_class); + vendor->vendor_class[mgmt_class] = NULL; + /* Any other vendor classes left ? */ + if (!check_vendor_table(vendor)) { + kfree(vendor); + port_priv->version[ + agent_priv->reg_req-> + mgmt_class_version]. + vendor = NULL; + } + } + } + } + } + +out: + return; +} + +static struct ib_mad_agent_private * +find_mad_agent(struct ib_mad_port_private *port_priv, + struct ib_mad *mad) +{ + struct ib_mad_agent_private *mad_agent = NULL; + unsigned long flags; + + spin_lock_irqsave(&port_priv->reg_lock, flags); + if (ib_response_mad(mad)) { + u32 hi_tid; + struct ib_mad_agent_private *entry; + + /* + * Routing is based on high 32 bits of transaction ID + * of MAD. + */ + hi_tid = be64_to_cpu(mad->mad_hdr.tid) >> 32; + list_for_each_entry(entry, &port_priv->agent_list, agent_list) { + if (entry->agent.hi_tid == hi_tid) { + mad_agent = entry; + break; + } + } + } else { + struct ib_mad_mgmt_class_table *class; + struct ib_mad_mgmt_method_table *method; + struct ib_mad_mgmt_vendor_class_table *vendor; + struct ib_mad_mgmt_vendor_class *vendor_class; + struct ib_vendor_mad *vendor_mad; + int index; + + /* + * Routing is based on version, class, and method + * For "newer" vendor MADs, also based on OUI + */ + if (mad->mad_hdr.class_version >= MAX_MGMT_VERSION) + goto out; + if (!is_vendor_class(mad->mad_hdr.mgmt_class)) { + class = port_priv->version[ + mad->mad_hdr.class_version].class; + if (!class) + goto out; + if (convert_mgmt_class(mad->mad_hdr.mgmt_class) >= + IB_MGMT_MAX_METHODS) + goto out; + method = class->method_table[convert_mgmt_class( + mad->mad_hdr.mgmt_class)]; + if (method) + mad_agent = method->agent[mad->mad_hdr.method & + ~IB_MGMT_METHOD_RESP]; + } else { + vendor = port_priv->version[ + mad->mad_hdr.class_version].vendor; + if (!vendor) + goto out; + vendor_class = vendor->vendor_class[vendor_class_index( + mad->mad_hdr.mgmt_class)]; + if (!vendor_class) + goto out; + /* Find matching OUI */ + vendor_mad = (struct ib_vendor_mad *)mad; + index = find_vendor_oui(vendor_class, vendor_mad->oui); + if (index == -1) + goto out; + method = vendor_class->method_table[index]; + if (method) { + mad_agent = method->agent[mad->mad_hdr.method & + ~IB_MGMT_METHOD_RESP]; + } + } + } + + if (mad_agent) { + if (mad_agent->agent.recv_handler) + atomic_inc(&mad_agent->refcount); + else { + dev_notice(&port_priv->device->dev, + "No receive handler for client %p on port %d\n", + &mad_agent->agent, port_priv->port_num); + mad_agent = NULL; + } + } +out: + spin_unlock_irqrestore(&port_priv->reg_lock, flags); + + return mad_agent; +} + +static int validate_mad(struct ib_mad *mad, u32 qp_num) +{ + int valid = 0; + + /* Make sure MAD base version is understood */ + if (mad->mad_hdr.base_version != IB_MGMT_BASE_VERSION) { + pr_err("MAD received with unsupported base version %d\n", + mad->mad_hdr.base_version); + goto out; + } + + /* Filter SMI packets sent to other than QP0 */ + if ((mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED) || + (mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)) { + if (qp_num == 0) + valid = 1; + } else { + /* Filter GSI packets sent to QP0 */ + if (qp_num != 0) + valid = 1; + } + +out: + return valid; +} + +static int is_data_mad(struct ib_mad_agent_private *mad_agent_priv, + struct ib_mad_hdr *mad_hdr) +{ + struct ib_rmpp_mad *rmpp_mad; + + rmpp_mad = (struct ib_rmpp_mad *)mad_hdr; + return !mad_agent_priv->agent.rmpp_version || + !ib_mad_kernel_rmpp_agent(&mad_agent_priv->agent) || + !(ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) & + IB_MGMT_RMPP_FLAG_ACTIVE) || + (rmpp_mad->rmpp_hdr.rmpp_type == IB_MGMT_RMPP_TYPE_DATA); +} + +static inline int rcv_has_same_class(struct ib_mad_send_wr_private *wr, + struct ib_mad_recv_wc *rwc) +{ + return ((struct ib_mad *)(wr->send_buf.mad))->mad_hdr.mgmt_class == + rwc->recv_buf.mad->mad_hdr.mgmt_class; +} + +static inline int rcv_has_same_gid(struct ib_mad_agent_private *mad_agent_priv, + struct ib_mad_send_wr_private *wr, + struct ib_mad_recv_wc *rwc ) +{ + struct ib_ah_attr attr; + u8 send_resp, rcv_resp; + union ib_gid sgid; + struct ib_device *device = mad_agent_priv->agent.device; + u8 port_num = mad_agent_priv->agent.port_num; + u8 lmc; + + send_resp = ib_response_mad((struct ib_mad *)wr->send_buf.mad); + rcv_resp = ib_response_mad(rwc->recv_buf.mad); + + if (send_resp == rcv_resp) + /* both requests, or both responses. GIDs different */ + return 0; + + if (ib_query_ah(wr->send_buf.ah, &attr)) + /* Assume not equal, to avoid false positives. */ + return 0; + + if (!!(attr.ah_flags & IB_AH_GRH) != + !!(rwc->wc->wc_flags & IB_WC_GRH)) + /* one has GID, other does not. Assume different */ + return 0; + + if (!send_resp && rcv_resp) { + /* is request/response. */ + if (!(attr.ah_flags & IB_AH_GRH)) { + if (ib_get_cached_lmc(device, port_num, &lmc)) + return 0; + return (!lmc || !((attr.src_path_bits ^ + rwc->wc->dlid_path_bits) & + ((1 << lmc) - 1))); + } else { + if (ib_get_cached_gid(device, port_num, + attr.grh.sgid_index, &sgid)) + return 0; + return !memcmp(sgid.raw, rwc->recv_buf.grh->dgid.raw, + 16); + } + } + + if (!(attr.ah_flags & IB_AH_GRH)) + return attr.dlid == rwc->wc->slid; + else + return !memcmp(attr.grh.dgid.raw, rwc->recv_buf.grh->sgid.raw, + 16); +} + +static inline int is_direct(u8 class) +{ + return (class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE); +} + +struct ib_mad_send_wr_private* +ib_find_send_mad(struct ib_mad_agent_private *mad_agent_priv, + struct ib_mad_recv_wc *wc) +{ + struct ib_mad_send_wr_private *wr; + struct ib_mad *mad; + + mad = (struct ib_mad *)wc->recv_buf.mad; + + list_for_each_entry(wr, &mad_agent_priv->wait_list, agent_list) { + if ((wr->tid == mad->mad_hdr.tid) && + rcv_has_same_class(wr, wc) && + /* + * Don't check GID for direct routed MADs. + * These might have permissive LIDs. + */ + (is_direct(wc->recv_buf.mad->mad_hdr.mgmt_class) || + rcv_has_same_gid(mad_agent_priv, wr, wc))) + return (wr->status == IB_WC_SUCCESS) ? wr : NULL; + } + + /* + * It's possible to receive the response before we've + * been notified that the send has completed + */ + list_for_each_entry(wr, &mad_agent_priv->send_list, agent_list) { + if (is_data_mad(mad_agent_priv, wr->send_buf.mad) && + wr->tid == mad->mad_hdr.tid && + wr->timeout && + rcv_has_same_class(wr, wc) && + /* + * Don't check GID for direct routed MADs. + * These might have permissive LIDs. + */ + (is_direct(wc->recv_buf.mad->mad_hdr.mgmt_class) || + rcv_has_same_gid(mad_agent_priv, wr, wc))) + /* Verify request has not been canceled */ + return (wr->status == IB_WC_SUCCESS) ? wr : NULL; + } + return NULL; +} + +void ib_mark_mad_done(struct ib_mad_send_wr_private *mad_send_wr) +{ + mad_send_wr->timeout = 0; + if (mad_send_wr->refcount == 1) + list_move_tail(&mad_send_wr->agent_list, + &mad_send_wr->mad_agent_priv->done_list); +} + +static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv, + struct ib_mad_recv_wc *mad_recv_wc) +{ + struct ib_mad_send_wr_private *mad_send_wr; + struct ib_mad_send_wc mad_send_wc; + unsigned long flags; + + INIT_LIST_HEAD(&mad_recv_wc->rmpp_list); + list_add(&mad_recv_wc->recv_buf.list, &mad_recv_wc->rmpp_list); + if (ib_mad_kernel_rmpp_agent(&mad_agent_priv->agent)) { + mad_recv_wc = ib_process_rmpp_recv_wc(mad_agent_priv, + mad_recv_wc); + if (!mad_recv_wc) { + deref_mad_agent(mad_agent_priv); + return; + } + } + + /* Complete corresponding request */ + if (ib_response_mad(mad_recv_wc->recv_buf.mad)) { + spin_lock_irqsave(&mad_agent_priv->lock, flags); + mad_send_wr = ib_find_send_mad(mad_agent_priv, mad_recv_wc); + if (!mad_send_wr) { + spin_unlock_irqrestore(&mad_agent_priv->lock, flags); + if (!ib_mad_kernel_rmpp_agent(&mad_agent_priv->agent) + && ib_is_mad_class_rmpp(mad_recv_wc->recv_buf.mad->mad_hdr.mgmt_class) + && (ib_get_rmpp_flags(&((struct ib_rmpp_mad *)mad_recv_wc->recv_buf.mad)->rmpp_hdr) + & IB_MGMT_RMPP_FLAG_ACTIVE)) { + /* user rmpp is in effect + * and this is an active RMPP MAD + */ + mad_recv_wc->wc->wr_id = 0; + mad_agent_priv->agent.recv_handler(&mad_agent_priv->agent, + mad_recv_wc); + atomic_dec(&mad_agent_priv->refcount); + } else { + /* not user rmpp, revert to normal behavior and + * drop the mad */ + ib_free_recv_mad(mad_recv_wc); + deref_mad_agent(mad_agent_priv); + return; + } + } else { + ib_mark_mad_done(mad_send_wr); + spin_unlock_irqrestore(&mad_agent_priv->lock, flags); + + /* Defined behavior is to complete response before request */ + mad_recv_wc->wc->wr_id = (unsigned long) &mad_send_wr->send_buf; + mad_agent_priv->agent.recv_handler(&mad_agent_priv->agent, + mad_recv_wc); + atomic_dec(&mad_agent_priv->refcount); + + mad_send_wc.status = IB_WC_SUCCESS; + mad_send_wc.vendor_err = 0; + mad_send_wc.send_buf = &mad_send_wr->send_buf; + ib_mad_complete_send_wr(mad_send_wr, &mad_send_wc); + } + } else { + mad_agent_priv->agent.recv_handler(&mad_agent_priv->agent, + mad_recv_wc); + deref_mad_agent(mad_agent_priv); + } +} + +static bool generate_unmatched_resp(struct ib_mad_private *recv, + struct ib_mad_private *response) +{ + if (recv->mad.mad.mad_hdr.method == IB_MGMT_METHOD_GET || + recv->mad.mad.mad_hdr.method == IB_MGMT_METHOD_SET) { + memcpy(response, recv, sizeof *response); + response->header.recv_wc.wc = &response->header.wc; + response->header.recv_wc.recv_buf.mad = &response->mad.mad; + response->header.recv_wc.recv_buf.grh = &response->grh; + response->mad.mad.mad_hdr.method = IB_MGMT_METHOD_GET_RESP; + response->mad.mad.mad_hdr.status = + cpu_to_be16(IB_MGMT_MAD_STATUS_UNSUPPORTED_METHOD_ATTRIB); + if (recv->mad.mad.mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) + response->mad.mad.mad_hdr.status |= IB_SMP_DIRECTION; + + return true; + } else { + return false; + } +} +static void ib_mad_recv_done_handler(struct ib_mad_port_private *port_priv, + struct ib_wc *wc) +{ + struct ib_mad_qp_info *qp_info; + struct ib_mad_private_header *mad_priv_hdr; + struct ib_mad_private *recv, *response = NULL; + struct ib_mad_list_head *mad_list; + struct ib_mad_agent_private *mad_agent; + int port_num; + int ret = IB_MAD_RESULT_SUCCESS; + + mad_list = (struct ib_mad_list_head *)(unsigned long)wc->wr_id; + qp_info = mad_list->mad_queue->qp_info; + dequeue_mad(mad_list); + + mad_priv_hdr = container_of(mad_list, struct ib_mad_private_header, + mad_list); + recv = container_of(mad_priv_hdr, struct ib_mad_private, header); + ib_dma_unmap_single(port_priv->device, + recv->header.mapping, + sizeof(struct ib_mad_private) - + sizeof(struct ib_mad_private_header), + DMA_FROM_DEVICE); + + /* Setup MAD receive work completion from "normal" work completion */ + recv->header.wc = *wc; + recv->header.recv_wc.wc = &recv->header.wc; + recv->header.recv_wc.mad_len = sizeof(struct ib_mad); + recv->header.recv_wc.recv_buf.mad = &recv->mad.mad; + recv->header.recv_wc.recv_buf.grh = &recv->grh; + + if (atomic_read(&qp_info->snoop_count)) + snoop_recv(qp_info, &recv->header.recv_wc, IB_MAD_SNOOP_RECVS); + + /* Validate MAD */ + if (!validate_mad(&recv->mad.mad, qp_info->qp->qp_num)) + goto out; + + response = kmem_cache_alloc(ib_mad_cache, GFP_KERNEL); + if (!response) { + dev_err(&port_priv->device->dev, + "ib_mad_recv_done_handler no memory for response buffer\n"); + goto out; + } + + if (port_priv->device->node_type == RDMA_NODE_IB_SWITCH) + port_num = wc->port_num; + else + port_num = port_priv->port_num; + + if (recv->mad.mad.mad_hdr.mgmt_class == + IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) { + enum smi_forward_action retsmi; + + if (smi_handle_dr_smp_recv(&recv->mad.smp, + port_priv->device->node_type, + port_num, + port_priv->device->phys_port_cnt) == + IB_SMI_DISCARD) + goto out; + + retsmi = smi_check_forward_dr_smp(&recv->mad.smp); + if (retsmi == IB_SMI_LOCAL) + goto local; + + if (retsmi == IB_SMI_SEND) { /* don't forward */ + if (smi_handle_dr_smp_send(&recv->mad.smp, + port_priv->device->node_type, + port_num) == IB_SMI_DISCARD) + goto out; + + if (smi_check_local_smp(&recv->mad.smp, port_priv->device) == IB_SMI_DISCARD) + goto out; + } else if (port_priv->device->node_type == RDMA_NODE_IB_SWITCH) { + /* forward case for switches */ + memcpy(response, recv, sizeof(*response)); + response->header.recv_wc.wc = &response->header.wc; + response->header.recv_wc.recv_buf.mad = &response->mad.mad; + response->header.recv_wc.recv_buf.grh = &response->grh; + + agent_send_response(&response->mad.mad, + &response->grh, wc, + port_priv->device, + smi_get_fwd_port(&recv->mad.smp), + qp_info->qp->qp_num); + + goto out; + } + } + +local: + /* Give driver "right of first refusal" on incoming MAD */ + if (port_priv->device->process_mad) { + ret = port_priv->device->process_mad(port_priv->device, 0, + port_priv->port_num, + wc, &recv->grh, + &recv->mad.mad, + &response->mad.mad); + if (ret & IB_MAD_RESULT_SUCCESS) { + if (ret & IB_MAD_RESULT_CONSUMED) + goto out; + if (ret & IB_MAD_RESULT_REPLY) { + agent_send_response(&response->mad.mad, + &recv->grh, wc, + port_priv->device, + port_num, + qp_info->qp->qp_num); + goto out; + } + } + } + + mad_agent = find_mad_agent(port_priv, &recv->mad.mad); + if (mad_agent) { + ib_mad_complete_recv(mad_agent, &recv->header.recv_wc); + /* + * recv is freed up in error cases in ib_mad_complete_recv + * or via recv_handler in ib_mad_complete_recv() + */ + recv = NULL; + } else if ((ret & IB_MAD_RESULT_SUCCESS) && + generate_unmatched_resp(recv, response)) { + agent_send_response(&response->mad.mad, &recv->grh, wc, + port_priv->device, port_num, qp_info->qp->qp_num); + } + +out: + /* Post another receive request for this QP */ + if (response) { + ib_mad_post_receive_mads(qp_info, response); + if (recv) + kmem_cache_free(ib_mad_cache, recv); + } else + ib_mad_post_receive_mads(qp_info, recv); +} + +static void adjust_timeout(struct ib_mad_agent_private *mad_agent_priv) +{ + struct ib_mad_send_wr_private *mad_send_wr; + unsigned long delay; + + if (list_empty(&mad_agent_priv->wait_list)) { + cancel_delayed_work(&mad_agent_priv->timed_work); + } else { + mad_send_wr = list_entry(mad_agent_priv->wait_list.next, + struct ib_mad_send_wr_private, + agent_list); + + if (time_after(mad_agent_priv->timeout, + mad_send_wr->timeout)) { + mad_agent_priv->timeout = mad_send_wr->timeout; + delay = mad_send_wr->timeout - jiffies; + if ((long)delay <= 0) + delay = 1; + mod_delayed_work(mad_agent_priv->qp_info->port_priv->wq, + &mad_agent_priv->timed_work, delay); + } + } +} + +static void wait_for_response(struct ib_mad_send_wr_private *mad_send_wr) +{ + struct ib_mad_agent_private *mad_agent_priv; + struct ib_mad_send_wr_private *temp_mad_send_wr; + struct list_head *list_item; + unsigned long delay; + + mad_agent_priv = mad_send_wr->mad_agent_priv; + list_del(&mad_send_wr->agent_list); + + delay = mad_send_wr->timeout; + mad_send_wr->timeout += jiffies; + + if (delay) { + list_for_each_prev(list_item, &mad_agent_priv->wait_list) { + temp_mad_send_wr = list_entry(list_item, + struct ib_mad_send_wr_private, + agent_list); + if (time_after(mad_send_wr->timeout, + temp_mad_send_wr->timeout)) + break; + } + } + else + list_item = &mad_agent_priv->wait_list; + list_add(&mad_send_wr->agent_list, list_item); + + /* Reschedule a work item if we have a shorter timeout */ + if (mad_agent_priv->wait_list.next == &mad_send_wr->agent_list) + mod_delayed_work(mad_agent_priv->qp_info->port_priv->wq, + &mad_agent_priv->timed_work, delay); +} + +void ib_reset_mad_timeout(struct ib_mad_send_wr_private *mad_send_wr, + int timeout_ms) +{ + mad_send_wr->timeout = msecs_to_jiffies(timeout_ms); + wait_for_response(mad_send_wr); +} + +/* + * Process a send work completion + */ +void ib_mad_complete_send_wr(struct ib_mad_send_wr_private *mad_send_wr, + struct ib_mad_send_wc *mad_send_wc) +{ + struct ib_mad_agent_private *mad_agent_priv; + unsigned long flags; + int ret; + + mad_agent_priv = mad_send_wr->mad_agent_priv; + spin_lock_irqsave(&mad_agent_priv->lock, flags); + if (ib_mad_kernel_rmpp_agent(&mad_agent_priv->agent)) { + ret = ib_process_rmpp_send_wc(mad_send_wr, mad_send_wc); + if (ret == IB_RMPP_RESULT_CONSUMED) + goto done; + } else + ret = IB_RMPP_RESULT_UNHANDLED; + + if (mad_send_wc->status != IB_WC_SUCCESS && + mad_send_wr->status == IB_WC_SUCCESS) { + mad_send_wr->status = mad_send_wc->status; + mad_send_wr->refcount -= (mad_send_wr->timeout > 0); + } + + if (--mad_send_wr->refcount > 0) { + if (mad_send_wr->refcount == 1 && mad_send_wr->timeout && + mad_send_wr->status == IB_WC_SUCCESS) { + wait_for_response(mad_send_wr); + } + goto done; + } + + /* Remove send from MAD agent and notify client of completion */ + list_del(&mad_send_wr->agent_list); + adjust_timeout(mad_agent_priv); + spin_unlock_irqrestore(&mad_agent_priv->lock, flags); + + if (mad_send_wr->status != IB_WC_SUCCESS ) + mad_send_wc->status = mad_send_wr->status; + if (ret == IB_RMPP_RESULT_INTERNAL) + ib_rmpp_send_handler(mad_send_wc); + else + mad_agent_priv->agent.send_handler(&mad_agent_priv->agent, + mad_send_wc); + + /* Release reference on agent taken when sending */ + deref_mad_agent(mad_agent_priv); + return; +done: + spin_unlock_irqrestore(&mad_agent_priv->lock, flags); +} + +static void ib_mad_send_done_handler(struct ib_mad_port_private *port_priv, + struct ib_wc *wc) +{ + struct ib_mad_send_wr_private *mad_send_wr, *queued_send_wr; + struct ib_mad_list_head *mad_list; + struct ib_mad_qp_info *qp_info; + struct ib_mad_queue *send_queue; + struct ib_send_wr *bad_send_wr; + struct ib_mad_send_wc mad_send_wc; + unsigned long flags; + int ret; + + mad_list = (struct ib_mad_list_head *)(unsigned long)wc->wr_id; + mad_send_wr = container_of(mad_list, struct ib_mad_send_wr_private, + mad_list); + send_queue = mad_list->mad_queue; + qp_info = send_queue->qp_info; + +retry: + ib_dma_unmap_single(mad_send_wr->send_buf.mad_agent->device, + mad_send_wr->header_mapping, + mad_send_wr->sg_list[0].length, DMA_TO_DEVICE); + ib_dma_unmap_single(mad_send_wr->send_buf.mad_agent->device, + mad_send_wr->payload_mapping, + mad_send_wr->sg_list[1].length, DMA_TO_DEVICE); + queued_send_wr = NULL; + spin_lock_irqsave(&send_queue->lock, flags); + list_del(&mad_list->list); + + /* Move queued send to the send queue */ + if (send_queue->count-- > send_queue->max_active) { + mad_list = container_of(qp_info->overflow_list.next, + struct ib_mad_list_head, list); + queued_send_wr = container_of(mad_list, + struct ib_mad_send_wr_private, + mad_list); + list_move_tail(&mad_list->list, &send_queue->list); + } + spin_unlock_irqrestore(&send_queue->lock, flags); + + mad_send_wc.send_buf = &mad_send_wr->send_buf; + mad_send_wc.status = wc->status; + mad_send_wc.vendor_err = wc->vendor_err; + if (atomic_read(&qp_info->snoop_count)) + snoop_send(qp_info, &mad_send_wr->send_buf, &mad_send_wc, + IB_MAD_SNOOP_SEND_COMPLETIONS); + ib_mad_complete_send_wr(mad_send_wr, &mad_send_wc); + + if (queued_send_wr) { + ret = ib_post_send(qp_info->qp, &queued_send_wr->send_wr, + &bad_send_wr); + if (ret) { + dev_err(&port_priv->device->dev, + "ib_post_send failed: %d\n", ret); + mad_send_wr = queued_send_wr; + wc->status = IB_WC_LOC_QP_OP_ERR; + goto retry; + } + } +} + +static void mark_sends_for_retry(struct ib_mad_qp_info *qp_info) +{ + struct ib_mad_send_wr_private *mad_send_wr; + struct ib_mad_list_head *mad_list; + unsigned long flags; + + spin_lock_irqsave(&qp_info->send_queue.lock, flags); + list_for_each_entry(mad_list, &qp_info->send_queue.list, list) { + mad_send_wr = container_of(mad_list, + struct ib_mad_send_wr_private, + mad_list); + mad_send_wr->retry = 1; + } + spin_unlock_irqrestore(&qp_info->send_queue.lock, flags); +} + +static void mad_error_handler(struct ib_mad_port_private *port_priv, + struct ib_wc *wc) +{ + struct ib_mad_list_head *mad_list; + struct ib_mad_qp_info *qp_info; + struct ib_mad_send_wr_private *mad_send_wr; + int ret; + + /* Determine if failure was a send or receive */ + mad_list = (struct ib_mad_list_head *)(unsigned long)wc->wr_id; + qp_info = mad_list->mad_queue->qp_info; + if (mad_list->mad_queue == &qp_info->recv_queue) + /* + * Receive errors indicate that the QP has entered the error + * state - error handling/shutdown code will cleanup + */ + return; + + /* + * Send errors will transition the QP to SQE - move + * QP to RTS and repost flushed work requests + */ + mad_send_wr = container_of(mad_list, struct ib_mad_send_wr_private, + mad_list); + if (wc->status == IB_WC_WR_FLUSH_ERR) { + if (mad_send_wr->retry) { + /* Repost send */ + struct ib_send_wr *bad_send_wr; + + mad_send_wr->retry = 0; + ret = ib_post_send(qp_info->qp, &mad_send_wr->send_wr, + &bad_send_wr); + if (ret) + ib_mad_send_done_handler(port_priv, wc); + } else + ib_mad_send_done_handler(port_priv, wc); + } else { + struct ib_qp_attr *attr; + + /* Transition QP to RTS and fail offending send */ + attr = kmalloc(sizeof *attr, GFP_KERNEL); + if (attr) { + attr->qp_state = IB_QPS_RTS; + attr->cur_qp_state = IB_QPS_SQE; + ret = ib_modify_qp(qp_info->qp, attr, + IB_QP_STATE | IB_QP_CUR_STATE); + kfree(attr); + if (ret) + dev_err(&port_priv->device->dev, + "mad_error_handler - ib_modify_qp to RTS : %d\n", + ret); + else + mark_sends_for_retry(qp_info); + } + ib_mad_send_done_handler(port_priv, wc); + } +} + +/* + * IB MAD completion callback + */ +static void ib_mad_completion_handler(struct work_struct *work) +{ + struct ib_mad_port_private *port_priv; + struct ib_wc wc; + + port_priv = container_of(work, struct ib_mad_port_private, work); + ib_req_notify_cq(port_priv->cq, IB_CQ_NEXT_COMP); + + while (ib_poll_cq(port_priv->cq, 1, &wc) == 1) { + if (wc.status == IB_WC_SUCCESS) { + switch (wc.opcode) { + case IB_WC_SEND: + ib_mad_send_done_handler(port_priv, &wc); + break; + case IB_WC_RECV: + ib_mad_recv_done_handler(port_priv, &wc); + break; + default: + BUG_ON(1); + break; + } + } else + mad_error_handler(port_priv, &wc); + } +} + +static void cancel_mads(struct ib_mad_agent_private *mad_agent_priv) +{ + unsigned long flags; + struct ib_mad_send_wr_private *mad_send_wr, *temp_mad_send_wr; + struct ib_mad_send_wc mad_send_wc; + struct list_head cancel_list; + + INIT_LIST_HEAD(&cancel_list); + + spin_lock_irqsave(&mad_agent_priv->lock, flags); + list_for_each_entry_safe(mad_send_wr, temp_mad_send_wr, + &mad_agent_priv->send_list, agent_list) { + if (mad_send_wr->status == IB_WC_SUCCESS) { + mad_send_wr->status = IB_WC_WR_FLUSH_ERR; + mad_send_wr->refcount -= (mad_send_wr->timeout > 0); + } + } + + /* Empty wait list to prevent receives from finding a request */ + list_splice_init(&mad_agent_priv->wait_list, &cancel_list); + spin_unlock_irqrestore(&mad_agent_priv->lock, flags); + + /* Report all cancelled requests */ + mad_send_wc.status = IB_WC_WR_FLUSH_ERR; + mad_send_wc.vendor_err = 0; + + list_for_each_entry_safe(mad_send_wr, temp_mad_send_wr, + &cancel_list, agent_list) { + mad_send_wc.send_buf = &mad_send_wr->send_buf; + list_del(&mad_send_wr->agent_list); + mad_agent_priv->agent.send_handler(&mad_agent_priv->agent, + &mad_send_wc); + atomic_dec(&mad_agent_priv->refcount); + } +} + +static struct ib_mad_send_wr_private* +find_send_wr(struct ib_mad_agent_private *mad_agent_priv, + struct ib_mad_send_buf *send_buf) +{ + struct ib_mad_send_wr_private *mad_send_wr; + + list_for_each_entry(mad_send_wr, &mad_agent_priv->wait_list, + agent_list) { + if (&mad_send_wr->send_buf == send_buf) + return mad_send_wr; + } + + list_for_each_entry(mad_send_wr, &mad_agent_priv->send_list, + agent_list) { + if (is_data_mad(mad_agent_priv, mad_send_wr->send_buf.mad) && + &mad_send_wr->send_buf == send_buf) + return mad_send_wr; + } + return NULL; +} + +int ib_modify_mad(struct ib_mad_agent *mad_agent, + struct ib_mad_send_buf *send_buf, u32 timeout_ms) +{ + struct ib_mad_agent_private *mad_agent_priv; + struct ib_mad_send_wr_private *mad_send_wr; + unsigned long flags; + int active; + + mad_agent_priv = container_of(mad_agent, struct ib_mad_agent_private, + agent); + spin_lock_irqsave(&mad_agent_priv->lock, flags); + mad_send_wr = find_send_wr(mad_agent_priv, send_buf); + if (!mad_send_wr || mad_send_wr->status != IB_WC_SUCCESS) { + spin_unlock_irqrestore(&mad_agent_priv->lock, flags); + return -EINVAL; + } + + active = (!mad_send_wr->timeout || mad_send_wr->refcount > 1); + if (!timeout_ms) { + mad_send_wr->status = IB_WC_WR_FLUSH_ERR; + mad_send_wr->refcount -= (mad_send_wr->timeout > 0); + } + + mad_send_wr->send_buf.timeout_ms = timeout_ms; + if (active) + mad_send_wr->timeout = msecs_to_jiffies(timeout_ms); + else + ib_reset_mad_timeout(mad_send_wr, timeout_ms); + + spin_unlock_irqrestore(&mad_agent_priv->lock, flags); + return 0; +} +EXPORT_SYMBOL(ib_modify_mad); + +void ib_cancel_mad(struct ib_mad_agent *mad_agent, + struct ib_mad_send_buf *send_buf) +{ + ib_modify_mad(mad_agent, send_buf, 0); +} +EXPORT_SYMBOL(ib_cancel_mad); + +static void local_completions(struct work_struct *work) +{ + struct ib_mad_agent_private *mad_agent_priv; + struct ib_mad_local_private *local; + struct ib_mad_agent_private *recv_mad_agent; + unsigned long flags; + int free_mad; + struct ib_wc wc; + struct ib_mad_send_wc mad_send_wc; + + mad_agent_priv = + container_of(work, struct ib_mad_agent_private, local_work); + + spin_lock_irqsave(&mad_agent_priv->lock, flags); + while (!list_empty(&mad_agent_priv->local_list)) { + local = list_entry(mad_agent_priv->local_list.next, + struct ib_mad_local_private, + completion_list); + list_del(&local->completion_list); + spin_unlock_irqrestore(&mad_agent_priv->lock, flags); + free_mad = 0; + if (local->mad_priv) { + recv_mad_agent = local->recv_mad_agent; + if (!recv_mad_agent) { + dev_err(&mad_agent_priv->agent.device->dev, + "No receive MAD agent for local completion\n"); + free_mad = 1; + goto local_send_completion; + } + + /* + * Defined behavior is to complete response + * before request + */ + build_smp_wc(recv_mad_agent->agent.qp, + (unsigned long) local->mad_send_wr, + be16_to_cpu(IB_LID_PERMISSIVE), + 0, recv_mad_agent->agent.port_num, &wc); + + local->mad_priv->header.recv_wc.wc = &wc; + local->mad_priv->header.recv_wc.mad_len = + sizeof(struct ib_mad); + INIT_LIST_HEAD(&local->mad_priv->header.recv_wc.rmpp_list); + list_add(&local->mad_priv->header.recv_wc.recv_buf.list, + &local->mad_priv->header.recv_wc.rmpp_list); + local->mad_priv->header.recv_wc.recv_buf.grh = NULL; + local->mad_priv->header.recv_wc.recv_buf.mad = + &local->mad_priv->mad.mad; + if (atomic_read(&recv_mad_agent->qp_info->snoop_count)) + snoop_recv(recv_mad_agent->qp_info, + &local->mad_priv->header.recv_wc, + IB_MAD_SNOOP_RECVS); + recv_mad_agent->agent.recv_handler( + &recv_mad_agent->agent, + &local->mad_priv->header.recv_wc); + spin_lock_irqsave(&recv_mad_agent->lock, flags); + atomic_dec(&recv_mad_agent->refcount); + spin_unlock_irqrestore(&recv_mad_agent->lock, flags); + } + +local_send_completion: + /* Complete send */ + mad_send_wc.status = IB_WC_SUCCESS; + mad_send_wc.vendor_err = 0; + mad_send_wc.send_buf = &local->mad_send_wr->send_buf; + if (atomic_read(&mad_agent_priv->qp_info->snoop_count)) + snoop_send(mad_agent_priv->qp_info, + &local->mad_send_wr->send_buf, + &mad_send_wc, IB_MAD_SNOOP_SEND_COMPLETIONS); + mad_agent_priv->agent.send_handler(&mad_agent_priv->agent, + &mad_send_wc); + + spin_lock_irqsave(&mad_agent_priv->lock, flags); + atomic_dec(&mad_agent_priv->refcount); + if (free_mad) + kmem_cache_free(ib_mad_cache, local->mad_priv); + kfree(local); + } + spin_unlock_irqrestore(&mad_agent_priv->lock, flags); +} + +static int retry_send(struct ib_mad_send_wr_private *mad_send_wr) +{ + int ret; + + if (!mad_send_wr->retries_left) + return -ETIMEDOUT; + + mad_send_wr->retries_left--; + mad_send_wr->send_buf.retries++; + + mad_send_wr->timeout = msecs_to_jiffies(mad_send_wr->send_buf.timeout_ms); + + if (ib_mad_kernel_rmpp_agent(&mad_send_wr->mad_agent_priv->agent)) { + ret = ib_retry_rmpp(mad_send_wr); + switch (ret) { + case IB_RMPP_RESULT_UNHANDLED: + ret = ib_send_mad(mad_send_wr); + break; + case IB_RMPP_RESULT_CONSUMED: + ret = 0; + break; + default: + ret = -ECOMM; + break; + } + } else + ret = ib_send_mad(mad_send_wr); + + if (!ret) { + mad_send_wr->refcount++; + list_add_tail(&mad_send_wr->agent_list, + &mad_send_wr->mad_agent_priv->send_list); + } + return ret; +} + +static void timeout_sends(struct work_struct *work) +{ + struct ib_mad_agent_private *mad_agent_priv; + struct ib_mad_send_wr_private *mad_send_wr; + struct ib_mad_send_wc mad_send_wc; + unsigned long flags, delay; + + mad_agent_priv = container_of(work, struct ib_mad_agent_private, + timed_work.work); + mad_send_wc.vendor_err = 0; + + spin_lock_irqsave(&mad_agent_priv->lock, flags); + while (!list_empty(&mad_agent_priv->wait_list)) { + mad_send_wr = list_entry(mad_agent_priv->wait_list.next, + struct ib_mad_send_wr_private, + agent_list); + + if (time_after(mad_send_wr->timeout, jiffies)) { + delay = mad_send_wr->timeout - jiffies; + if ((long)delay <= 0) + delay = 1; + queue_delayed_work(mad_agent_priv->qp_info-> + port_priv->wq, + &mad_agent_priv->timed_work, delay); + break; + } + + list_del(&mad_send_wr->agent_list); + if (mad_send_wr->status == IB_WC_SUCCESS && + !retry_send(mad_send_wr)) + continue; + + spin_unlock_irqrestore(&mad_agent_priv->lock, flags); + + if (mad_send_wr->status == IB_WC_SUCCESS) + mad_send_wc.status = IB_WC_RESP_TIMEOUT_ERR; + else + mad_send_wc.status = mad_send_wr->status; + mad_send_wc.send_buf = &mad_send_wr->send_buf; + mad_agent_priv->agent.send_handler(&mad_agent_priv->agent, + &mad_send_wc); + + atomic_dec(&mad_agent_priv->refcount); + spin_lock_irqsave(&mad_agent_priv->lock, flags); + } + spin_unlock_irqrestore(&mad_agent_priv->lock, flags); +} + +static void ib_mad_thread_completion_handler(struct ib_cq *cq, void *arg) +{ + struct ib_mad_port_private *port_priv = cq->cq_context; + unsigned long flags; + + spin_lock_irqsave(&ib_mad_port_list_lock, flags); + if (!list_empty(&port_priv->port_list)) + queue_work(port_priv->wq, &port_priv->work); + spin_unlock_irqrestore(&ib_mad_port_list_lock, flags); +} + +/* + * Allocate receive MADs and post receive WRs for them + */ +static int ib_mad_post_receive_mads(struct ib_mad_qp_info *qp_info, + struct ib_mad_private *mad) +{ + unsigned long flags; + int post, ret; + struct ib_mad_private *mad_priv; + struct ib_sge sg_list; + struct ib_recv_wr recv_wr, *bad_recv_wr; + struct ib_mad_queue *recv_queue = &qp_info->recv_queue; + + /* Initialize common scatter list fields */ + sg_list.length = sizeof *mad_priv - sizeof mad_priv->header; + sg_list.lkey = (*qp_info->port_priv->mr).lkey; + + /* Initialize common receive WR fields */ + recv_wr.next = NULL; + recv_wr.sg_list = &sg_list; + recv_wr.num_sge = 1; + + do { + /* Allocate and map receive buffer */ + if (mad) { + mad_priv = mad; + mad = NULL; + } else { + mad_priv = kmem_cache_alloc(ib_mad_cache, GFP_KERNEL); + if (!mad_priv) { + dev_err(&qp_info->port_priv->device->dev, + "No memory for receive buffer\n"); + ret = -ENOMEM; + break; + } + } + sg_list.addr = ib_dma_map_single(qp_info->port_priv->device, + &mad_priv->grh, + sizeof *mad_priv - + sizeof mad_priv->header, + DMA_FROM_DEVICE); + if (unlikely(ib_dma_mapping_error(qp_info->port_priv->device, + sg_list.addr))) { + ret = -ENOMEM; + break; + } + mad_priv->header.mapping = sg_list.addr; + recv_wr.wr_id = (unsigned long)&mad_priv->header.mad_list; + mad_priv->header.mad_list.mad_queue = recv_queue; + + /* Post receive WR */ + spin_lock_irqsave(&recv_queue->lock, flags); + post = (++recv_queue->count < recv_queue->max_active); + list_add_tail(&mad_priv->header.mad_list.list, &recv_queue->list); + spin_unlock_irqrestore(&recv_queue->lock, flags); + ret = ib_post_recv(qp_info->qp, &recv_wr, &bad_recv_wr); + if (ret) { + spin_lock_irqsave(&recv_queue->lock, flags); + list_del(&mad_priv->header.mad_list.list); + recv_queue->count--; + spin_unlock_irqrestore(&recv_queue->lock, flags); + ib_dma_unmap_single(qp_info->port_priv->device, + mad_priv->header.mapping, + sizeof *mad_priv - + sizeof mad_priv->header, + DMA_FROM_DEVICE); + kmem_cache_free(ib_mad_cache, mad_priv); + dev_err(&qp_info->port_priv->device->dev, + "ib_post_recv failed: %d\n", ret); + break; + } + } while (post); + + return ret; +} + +/* + * Return all the posted receive MADs + */ +static void cleanup_recv_queue(struct ib_mad_qp_info *qp_info) +{ + struct ib_mad_private_header *mad_priv_hdr; + struct ib_mad_private *recv; + struct ib_mad_list_head *mad_list; + + if (!qp_info->qp) + return; + + while (!list_empty(&qp_info->recv_queue.list)) { + + mad_list = list_entry(qp_info->recv_queue.list.next, + struct ib_mad_list_head, list); + mad_priv_hdr = container_of(mad_list, + struct ib_mad_private_header, + mad_list); + recv = container_of(mad_priv_hdr, struct ib_mad_private, + header); + + /* Remove from posted receive MAD list */ + list_del(&mad_list->list); + + ib_dma_unmap_single(qp_info->port_priv->device, + recv->header.mapping, + sizeof(struct ib_mad_private) - + sizeof(struct ib_mad_private_header), + DMA_FROM_DEVICE); + kmem_cache_free(ib_mad_cache, recv); + } + + qp_info->recv_queue.count = 0; +} + +/* + * Start the port + */ +static int ib_mad_port_start(struct ib_mad_port_private *port_priv) +{ + int ret, i; + struct ib_qp_attr *attr; + struct ib_qp *qp; + u16 pkey_index; + + attr = kmalloc(sizeof *attr, GFP_KERNEL); + if (!attr) { + dev_err(&port_priv->device->dev, + "Couldn't kmalloc ib_qp_attr\n"); + return -ENOMEM; + } + + ret = ib_find_pkey(port_priv->device, port_priv->port_num, + IB_DEFAULT_PKEY_FULL, &pkey_index); + if (ret) + pkey_index = 0; + + for (i = 0; i < IB_MAD_QPS_CORE; i++) { + qp = port_priv->qp_info[i].qp; + if (!qp) + continue; + + /* + * PKey index for QP1 is irrelevant but + * one is needed for the Reset to Init transition + */ + attr->qp_state = IB_QPS_INIT; + attr->pkey_index = pkey_index; + attr->qkey = (qp->qp_num == 0) ? 0 : IB_QP1_QKEY; + ret = ib_modify_qp(qp, attr, IB_QP_STATE | + IB_QP_PKEY_INDEX | IB_QP_QKEY); + if (ret) { + dev_err(&port_priv->device->dev, + "Couldn't change QP%d state to INIT: %d\n", + i, ret); + goto out; + } + + attr->qp_state = IB_QPS_RTR; + ret = ib_modify_qp(qp, attr, IB_QP_STATE); + if (ret) { + dev_err(&port_priv->device->dev, + "Couldn't change QP%d state to RTR: %d\n", + i, ret); + goto out; + } + + attr->qp_state = IB_QPS_RTS; + attr->sq_psn = IB_MAD_SEND_Q_PSN; + ret = ib_modify_qp(qp, attr, IB_QP_STATE | IB_QP_SQ_PSN); + if (ret) { + dev_err(&port_priv->device->dev, + "Couldn't change QP%d state to RTS: %d\n", + i, ret); + goto out; + } + } + + ret = ib_req_notify_cq(port_priv->cq, IB_CQ_NEXT_COMP); + if (ret) { + dev_err(&port_priv->device->dev, + "Failed to request completion notification: %d\n", + ret); + goto out; + } + + for (i = 0; i < IB_MAD_QPS_CORE; i++) { + if (!port_priv->qp_info[i].qp) + continue; + + ret = ib_mad_post_receive_mads(&port_priv->qp_info[i], NULL); + if (ret) { + dev_err(&port_priv->device->dev, + "Couldn't post receive WRs\n"); + goto out; + } + } +out: + kfree(attr); + return ret; +} + +static void qp_event_handler(struct ib_event *event, void *qp_context) +{ + struct ib_mad_qp_info *qp_info = qp_context; + + /* It's worse than that! He's dead, Jim! */ + dev_err(&qp_info->port_priv->device->dev, + "Fatal error (%d) on MAD QP (%d)\n", + event->event, qp_info->qp->qp_num); +} + +static void init_mad_queue(struct ib_mad_qp_info *qp_info, + struct ib_mad_queue *mad_queue) +{ + mad_queue->qp_info = qp_info; + mad_queue->count = 0; + spin_lock_init(&mad_queue->lock); + INIT_LIST_HEAD(&mad_queue->list); +} + +static void init_mad_qp(struct ib_mad_port_private *port_priv, + struct ib_mad_qp_info *qp_info) +{ + qp_info->port_priv = port_priv; + init_mad_queue(qp_info, &qp_info->send_queue); + init_mad_queue(qp_info, &qp_info->recv_queue); + INIT_LIST_HEAD(&qp_info->overflow_list); + spin_lock_init(&qp_info->snoop_lock); + qp_info->snoop_table = NULL; + qp_info->snoop_table_size = 0; + atomic_set(&qp_info->snoop_count, 0); +} + +static int create_mad_qp(struct ib_mad_qp_info *qp_info, + enum ib_qp_type qp_type) +{ + struct ib_qp_init_attr qp_init_attr; + int ret; + + memset(&qp_init_attr, 0, sizeof qp_init_attr); + qp_init_attr.send_cq = qp_info->port_priv->cq; + qp_init_attr.recv_cq = qp_info->port_priv->cq; + qp_init_attr.sq_sig_type = IB_SIGNAL_ALL_WR; + qp_init_attr.cap.max_send_wr = mad_sendq_size; + qp_init_attr.cap.max_recv_wr = mad_recvq_size; + qp_init_attr.cap.max_send_sge = IB_MAD_SEND_REQ_MAX_SG; + qp_init_attr.cap.max_recv_sge = IB_MAD_RECV_REQ_MAX_SG; + qp_init_attr.qp_type = qp_type; + qp_init_attr.port_num = qp_info->port_priv->port_num; + qp_init_attr.qp_context = qp_info; + qp_init_attr.event_handler = qp_event_handler; + qp_info->qp = ib_create_qp(qp_info->port_priv->pd, &qp_init_attr); + if (IS_ERR(qp_info->qp)) { + dev_err(&qp_info->port_priv->device->dev, + "Couldn't create ib_mad QP%d\n", + get_spl_qp_index(qp_type)); + ret = PTR_ERR(qp_info->qp); + goto error; + } + /* Use minimum queue sizes unless the CQ is resized */ + qp_info->send_queue.max_active = mad_sendq_size; + qp_info->recv_queue.max_active = mad_recvq_size; + return 0; + +error: + return ret; +} + +static void destroy_mad_qp(struct ib_mad_qp_info *qp_info) +{ + if (!qp_info->qp) + return; + + ib_destroy_qp(qp_info->qp); + kfree(qp_info->snoop_table); +} + +/* + * Open the port + * Create the QP, PD, MR, and CQ if needed + */ +static int ib_mad_port_open(struct ib_device *device, + int port_num) +{ + int ret, cq_size; + struct ib_mad_port_private *port_priv; + unsigned long flags; + char name[sizeof "ib_mad123"]; + int has_smi; + + /* Create new device info */ + port_priv = kzalloc(sizeof *port_priv, GFP_KERNEL); + if (!port_priv) { + dev_err(&device->dev, "No memory for ib_mad_port_private\n"); + return -ENOMEM; + } + + port_priv->device = device; + port_priv->port_num = port_num; + spin_lock_init(&port_priv->reg_lock); + INIT_LIST_HEAD(&port_priv->agent_list); + init_mad_qp(port_priv, &port_priv->qp_info[0]); + init_mad_qp(port_priv, &port_priv->qp_info[1]); + + cq_size = mad_sendq_size + mad_recvq_size; + has_smi = rdma_port_get_link_layer(device, port_num) == IB_LINK_LAYER_INFINIBAND; + if (has_smi) + cq_size *= 2; + + port_priv->cq = ib_create_cq(port_priv->device, + ib_mad_thread_completion_handler, + NULL, port_priv, cq_size, 0); + if (IS_ERR(port_priv->cq)) { + dev_err(&device->dev, "Couldn't create ib_mad CQ\n"); + ret = PTR_ERR(port_priv->cq); + goto error3; + } + + port_priv->pd = ib_alloc_pd(device); + if (IS_ERR(port_priv->pd)) { + dev_err(&device->dev, "Couldn't create ib_mad PD\n"); + ret = PTR_ERR(port_priv->pd); + goto error4; + } + + port_priv->mr = ib_get_dma_mr(port_priv->pd, IB_ACCESS_LOCAL_WRITE); + if (IS_ERR(port_priv->mr)) { + dev_err(&device->dev, "Couldn't get ib_mad DMA MR\n"); + ret = PTR_ERR(port_priv->mr); + goto error5; + } + + if (has_smi) { + ret = create_mad_qp(&port_priv->qp_info[0], IB_QPT_SMI); + if (ret) + goto error6; + } + ret = create_mad_qp(&port_priv->qp_info[1], IB_QPT_GSI); + if (ret) + goto error7; + + snprintf(name, sizeof name, "ib_mad%d", port_num); + port_priv->wq = create_singlethread_workqueue(name); + if (!port_priv->wq) { + ret = -ENOMEM; + goto error8; + } + INIT_WORK(&port_priv->work, ib_mad_completion_handler); + + spin_lock_irqsave(&ib_mad_port_list_lock, flags); + list_add_tail(&port_priv->port_list, &ib_mad_port_list); + spin_unlock_irqrestore(&ib_mad_port_list_lock, flags); + + ret = ib_mad_port_start(port_priv); + if (ret) { + dev_err(&device->dev, "Couldn't start port\n"); + goto error9; + } + + return 0; + +error9: + spin_lock_irqsave(&ib_mad_port_list_lock, flags); + list_del_init(&port_priv->port_list); + spin_unlock_irqrestore(&ib_mad_port_list_lock, flags); + + destroy_workqueue(port_priv->wq); +error8: + destroy_mad_qp(&port_priv->qp_info[1]); +error7: + destroy_mad_qp(&port_priv->qp_info[0]); +error6: + ib_dereg_mr(port_priv->mr); +error5: + ib_dealloc_pd(port_priv->pd); +error4: + ib_destroy_cq(port_priv->cq); + cleanup_recv_queue(&port_priv->qp_info[1]); + cleanup_recv_queue(&port_priv->qp_info[0]); +error3: + kfree(port_priv); + + return ret; +} + +/* + * Close the port + * If there are no classes using the port, free the port + * resources (CQ, MR, PD, QP) and remove the port's info structure + */ +static int ib_mad_port_close(struct ib_device *device, int port_num) +{ + struct ib_mad_port_private *port_priv; + unsigned long flags; + + spin_lock_irqsave(&ib_mad_port_list_lock, flags); + port_priv = __ib_get_mad_port(device, port_num); + if (port_priv == NULL) { + spin_unlock_irqrestore(&ib_mad_port_list_lock, flags); + dev_err(&device->dev, "Port %d not found\n", port_num); + return -ENODEV; + } + list_del_init(&port_priv->port_list); + spin_unlock_irqrestore(&ib_mad_port_list_lock, flags); + + destroy_workqueue(port_priv->wq); + destroy_mad_qp(&port_priv->qp_info[1]); + destroy_mad_qp(&port_priv->qp_info[0]); + ib_dereg_mr(port_priv->mr); + ib_dealloc_pd(port_priv->pd); + ib_destroy_cq(port_priv->cq); + cleanup_recv_queue(&port_priv->qp_info[1]); + cleanup_recv_queue(&port_priv->qp_info[0]); + /* XXX: Handle deallocation of MAD registration tables */ + + kfree(port_priv); + + return 0; +} + +static void ib_mad_init_device(struct ib_device *device) +{ + int start, end, i; + + if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB) + return; + + if (device->node_type == RDMA_NODE_IB_SWITCH) { + start = 0; + end = 0; + } else { + start = 1; + end = device->phys_port_cnt; + } + + for (i = start; i <= end; i++) { + if (ib_mad_port_open(device, i)) { + dev_err(&device->dev, "Couldn't open port %d\n", i); + goto error; + } + if (ib_agent_port_open(device, i)) { + dev_err(&device->dev, + "Couldn't open port %d for agents\n", i); + goto error_agent; + } + } + return; + +error_agent: + if (ib_mad_port_close(device, i)) + dev_err(&device->dev, "Couldn't close port %d\n", i); + +error: + i--; + + while (i >= start) { + if (ib_agent_port_close(device, i)) + dev_err(&device->dev, + "Couldn't close port %d for agents\n", i); + if (ib_mad_port_close(device, i)) + dev_err(&device->dev, "Couldn't close port %d\n", i); + i--; + } +} + +static void ib_mad_remove_device(struct ib_device *device) +{ + int i, num_ports, cur_port; + + if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB) + return; + + if (device->node_type == RDMA_NODE_IB_SWITCH) { + num_ports = 1; + cur_port = 0; + } else { + num_ports = device->phys_port_cnt; + cur_port = 1; + } + for (i = 0; i < num_ports; i++, cur_port++) { + if (ib_agent_port_close(device, cur_port)) + dev_err(&device->dev, + "Couldn't close port %d for agents\n", + cur_port); + if (ib_mad_port_close(device, cur_port)) + dev_err(&device->dev, "Couldn't close port %d\n", + cur_port); + } +} + +static struct ib_client mad_client = { + .name = "mad", + .add = ib_mad_init_device, + .remove = ib_mad_remove_device +}; + +static int __init ib_mad_init_module(void) +{ + int ret; + + mad_recvq_size = min(mad_recvq_size, IB_MAD_QP_MAX_SIZE); + mad_recvq_size = max(mad_recvq_size, IB_MAD_QP_MIN_SIZE); + + mad_sendq_size = min(mad_sendq_size, IB_MAD_QP_MAX_SIZE); + mad_sendq_size = max(mad_sendq_size, IB_MAD_QP_MIN_SIZE); + + ib_mad_cache = kmem_cache_create("ib_mad", + sizeof(struct ib_mad_private), + 0, + SLAB_HWCACHE_ALIGN, + NULL); + if (!ib_mad_cache) { + pr_err("Couldn't create ib_mad cache\n"); + ret = -ENOMEM; + goto error1; + } + + INIT_LIST_HEAD(&ib_mad_port_list); + + if (ib_register_client(&mad_client)) { + pr_err("Couldn't register ib_mad client\n"); + ret = -EINVAL; + goto error2; + } + + return 0; + +error2: + kmem_cache_destroy(ib_mad_cache); +error1: + return ret; +} + +static void __exit ib_mad_cleanup_module(void) +{ + ib_unregister_client(&mad_client); + kmem_cache_destroy(ib_mad_cache); +} + +module_init(ib_mad_init_module); +module_exit(ib_mad_cleanup_module); diff --git a/drivers/infiniband/core/mad_priv.h b/drivers/infiniband/core/mad_priv.h new file mode 100644 index 0000000..d1a0b0e --- /dev/null +++ b/drivers/infiniband/core/mad_priv.h @@ -0,0 +1,227 @@ +/* + * Copyright (c) 2004, 2005, Voltaire, Inc. All rights reserved. + * Copyright (c) 2005 Intel Corporation. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2009 HNR Consulting. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __IB_MAD_PRIV_H__ +#define __IB_MAD_PRIV_H__ + +#include +#include +#include +#include +#include + +#define IB_MAD_QPS_CORE 2 /* Always QP0 and QP1 as a minimum */ + +/* QP and CQ parameters */ +#define IB_MAD_QP_SEND_SIZE 128 +#define IB_MAD_QP_RECV_SIZE 512 +#define IB_MAD_QP_MIN_SIZE 64 +#define IB_MAD_QP_MAX_SIZE 8192 +#define IB_MAD_SEND_REQ_MAX_SG 2 +#define IB_MAD_RECV_REQ_MAX_SG 1 + +#define IB_MAD_SEND_Q_PSN 0 + +/* Registration table sizes */ +#define MAX_MGMT_CLASS 80 +#define MAX_MGMT_VERSION 8 +#define MAX_MGMT_OUI 8 +#define MAX_MGMT_VENDOR_RANGE2 (IB_MGMT_CLASS_VENDOR_RANGE2_END - \ + IB_MGMT_CLASS_VENDOR_RANGE2_START + 1) + +struct ib_mad_list_head { + struct list_head list; + struct ib_mad_queue *mad_queue; +}; + +struct ib_mad_private_header { + struct ib_mad_list_head mad_list; + struct ib_mad_recv_wc recv_wc; + struct ib_wc wc; + u64 mapping; +} __attribute__ ((packed)); + +struct ib_mad_private { + struct ib_mad_private_header header; + struct ib_grh grh; + union { + struct ib_mad mad; + struct ib_rmpp_mad rmpp_mad; + struct ib_smp smp; + } mad; +} __attribute__ ((packed)); + +struct ib_rmpp_segment { + struct list_head list; + u32 num; + u8 data[0]; +}; + +struct ib_mad_agent_private { + struct list_head agent_list; + struct ib_mad_agent agent; + struct ib_mad_reg_req *reg_req; + struct ib_mad_qp_info *qp_info; + + spinlock_t lock; + struct list_head send_list; + struct list_head wait_list; + struct list_head done_list; + struct delayed_work timed_work; + unsigned long timeout; + struct list_head local_list; + struct work_struct local_work; + struct list_head rmpp_list; + + atomic_t refcount; + struct completion comp; +}; + +struct ib_mad_snoop_private { + struct ib_mad_agent agent; + struct ib_mad_qp_info *qp_info; + int snoop_index; + int mad_snoop_flags; + atomic_t refcount; + struct completion comp; +}; + +struct ib_mad_send_wr_private { + struct ib_mad_list_head mad_list; + struct list_head agent_list; + struct ib_mad_agent_private *mad_agent_priv; + struct ib_mad_send_buf send_buf; + u64 header_mapping; + u64 payload_mapping; + struct ib_send_wr send_wr; + struct ib_sge sg_list[IB_MAD_SEND_REQ_MAX_SG]; + __be64 tid; + unsigned long timeout; + int max_retries; + int retries_left; + int retry; + int refcount; + enum ib_wc_status status; + + /* RMPP control */ + struct list_head rmpp_list; + struct ib_rmpp_segment *last_ack_seg; + struct ib_rmpp_segment *cur_seg; + int last_ack; + int seg_num; + int newwin; + int pad; +}; + +struct ib_mad_local_private { + struct list_head completion_list; + struct ib_mad_private *mad_priv; + struct ib_mad_agent_private *recv_mad_agent; + struct ib_mad_send_wr_private *mad_send_wr; +}; + +struct ib_mad_mgmt_method_table { + struct ib_mad_agent_private *agent[IB_MGMT_MAX_METHODS]; +}; + +struct ib_mad_mgmt_class_table { + struct ib_mad_mgmt_method_table *method_table[MAX_MGMT_CLASS]; +}; + +struct ib_mad_mgmt_vendor_class { + u8 oui[MAX_MGMT_OUI][3]; + struct ib_mad_mgmt_method_table *method_table[MAX_MGMT_OUI]; +}; + +struct ib_mad_mgmt_vendor_class_table { + struct ib_mad_mgmt_vendor_class *vendor_class[MAX_MGMT_VENDOR_RANGE2]; +}; + +struct ib_mad_mgmt_version_table { + struct ib_mad_mgmt_class_table *class; + struct ib_mad_mgmt_vendor_class_table *vendor; +}; + +struct ib_mad_queue { + spinlock_t lock; + struct list_head list; + int count; + int max_active; + struct ib_mad_qp_info *qp_info; +}; + +struct ib_mad_qp_info { + struct ib_mad_port_private *port_priv; + struct ib_qp *qp; + struct ib_mad_queue send_queue; + struct ib_mad_queue recv_queue; + struct list_head overflow_list; + spinlock_t snoop_lock; + struct ib_mad_snoop_private **snoop_table; + int snoop_table_size; + atomic_t snoop_count; +}; + +struct ib_mad_port_private { + struct list_head port_list; + struct ib_device *device; + int port_num; + struct ib_cq *cq; + struct ib_pd *pd; + struct ib_mr *mr; + + spinlock_t reg_lock; + struct ib_mad_mgmt_version_table version[MAX_MGMT_VERSION]; + struct list_head agent_list; + struct workqueue_struct *wq; + struct work_struct work; + struct ib_mad_qp_info qp_info[IB_MAD_QPS_CORE]; +}; + +int ib_send_mad(struct ib_mad_send_wr_private *mad_send_wr); + +struct ib_mad_send_wr_private * +ib_find_send_mad(struct ib_mad_agent_private *mad_agent_priv, + struct ib_mad_recv_wc *mad_recv_wc); + +void ib_mad_complete_send_wr(struct ib_mad_send_wr_private *mad_send_wr, + struct ib_mad_send_wc *mad_send_wc); + +void ib_mark_mad_done(struct ib_mad_send_wr_private *mad_send_wr); + +void ib_reset_mad_timeout(struct ib_mad_send_wr_private *mad_send_wr, + int timeout_ms); + +#endif /* __IB_MAD_PRIV_H__ */ diff --git a/drivers/infiniband/core/mad_rmpp.c b/drivers/infiniband/core/mad_rmpp.c new file mode 100644 index 0000000..f37878c --- /dev/null +++ b/drivers/infiniband/core/mad_rmpp.c @@ -0,0 +1,953 @@ +/* + * Copyright (c) 2005 Intel Inc. All rights reserved. + * Copyright (c) 2005-2006 Voltaire, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include + +#include "mad_priv.h" +#include "mad_rmpp.h" + +enum rmpp_state { + RMPP_STATE_ACTIVE, + RMPP_STATE_TIMEOUT, + RMPP_STATE_COMPLETE, + RMPP_STATE_CANCELING +}; + +struct mad_rmpp_recv { + struct ib_mad_agent_private *agent; + struct list_head list; + struct delayed_work timeout_work; + struct delayed_work cleanup_work; + struct completion comp; + enum rmpp_state state; + spinlock_t lock; + atomic_t refcount; + + struct ib_ah *ah; + struct ib_mad_recv_wc *rmpp_wc; + struct ib_mad_recv_buf *cur_seg_buf; + int last_ack; + int seg_num; + int newwin; + int repwin; + + __be64 tid; + u32 src_qp; + u16 slid; + u8 mgmt_class; + u8 class_version; + u8 method; +}; + +static inline void deref_rmpp_recv(struct mad_rmpp_recv *rmpp_recv) +{ + if (atomic_dec_and_test(&rmpp_recv->refcount)) + complete(&rmpp_recv->comp); +} + +static void destroy_rmpp_recv(struct mad_rmpp_recv *rmpp_recv) +{ + deref_rmpp_recv(rmpp_recv); + wait_for_completion(&rmpp_recv->comp); + ib_destroy_ah(rmpp_recv->ah); + kfree(rmpp_recv); +} + +void ib_cancel_rmpp_recvs(struct ib_mad_agent_private *agent) +{ + struct mad_rmpp_recv *rmpp_recv, *temp_rmpp_recv; + unsigned long flags; + + spin_lock_irqsave(&agent->lock, flags); + list_for_each_entry(rmpp_recv, &agent->rmpp_list, list) { + if (rmpp_recv->state != RMPP_STATE_COMPLETE) + ib_free_recv_mad(rmpp_recv->rmpp_wc); + rmpp_recv->state = RMPP_STATE_CANCELING; + } + spin_unlock_irqrestore(&agent->lock, flags); + + list_for_each_entry(rmpp_recv, &agent->rmpp_list, list) { + cancel_delayed_work(&rmpp_recv->timeout_work); + cancel_delayed_work(&rmpp_recv->cleanup_work); + } + + flush_workqueue(agent->qp_info->port_priv->wq); + + list_for_each_entry_safe(rmpp_recv, temp_rmpp_recv, + &agent->rmpp_list, list) { + list_del(&rmpp_recv->list); + destroy_rmpp_recv(rmpp_recv); + } +} + +static void format_ack(struct ib_mad_send_buf *msg, + struct ib_rmpp_mad *data, + struct mad_rmpp_recv *rmpp_recv) +{ + struct ib_rmpp_mad *ack = msg->mad; + unsigned long flags; + + memcpy(ack, &data->mad_hdr, msg->hdr_len); + + ack->mad_hdr.method ^= IB_MGMT_METHOD_RESP; + ack->rmpp_hdr.rmpp_type = IB_MGMT_RMPP_TYPE_ACK; + ib_set_rmpp_flags(&ack->rmpp_hdr, IB_MGMT_RMPP_FLAG_ACTIVE); + + spin_lock_irqsave(&rmpp_recv->lock, flags); + rmpp_recv->last_ack = rmpp_recv->seg_num; + ack->rmpp_hdr.seg_num = cpu_to_be32(rmpp_recv->seg_num); + ack->rmpp_hdr.paylen_newwin = cpu_to_be32(rmpp_recv->newwin); + spin_unlock_irqrestore(&rmpp_recv->lock, flags); +} + +static void ack_recv(struct mad_rmpp_recv *rmpp_recv, + struct ib_mad_recv_wc *recv_wc) +{ + struct ib_mad_send_buf *msg; + int ret, hdr_len; + + hdr_len = ib_get_mad_data_offset(recv_wc->recv_buf.mad->mad_hdr.mgmt_class); + msg = ib_create_send_mad(&rmpp_recv->agent->agent, recv_wc->wc->src_qp, + recv_wc->wc->pkey_index, 1, hdr_len, + 0, GFP_KERNEL); + if (IS_ERR(msg)) + return; + + format_ack(msg, (struct ib_rmpp_mad *) recv_wc->recv_buf.mad, rmpp_recv); + msg->ah = rmpp_recv->ah; + ret = ib_post_send_mad(msg, NULL); + if (ret) + ib_free_send_mad(msg); +} + +static struct ib_mad_send_buf *alloc_response_msg(struct ib_mad_agent *agent, + struct ib_mad_recv_wc *recv_wc) +{ + struct ib_mad_send_buf *msg; + struct ib_ah *ah; + int hdr_len; + + ah = ib_create_ah_from_wc(agent->qp->pd, recv_wc->wc, + recv_wc->recv_buf.grh, agent->port_num); + if (IS_ERR(ah)) + return (void *) ah; + + hdr_len = ib_get_mad_data_offset(recv_wc->recv_buf.mad->mad_hdr.mgmt_class); + msg = ib_create_send_mad(agent, recv_wc->wc->src_qp, + recv_wc->wc->pkey_index, 1, + hdr_len, 0, GFP_KERNEL); + if (IS_ERR(msg)) + ib_destroy_ah(ah); + else { + msg->ah = ah; + msg->context[0] = ah; + } + + return msg; +} + +static void ack_ds_ack(struct ib_mad_agent_private *agent, + struct ib_mad_recv_wc *recv_wc) +{ + struct ib_mad_send_buf *msg; + struct ib_rmpp_mad *rmpp_mad; + int ret; + + msg = alloc_response_msg(&agent->agent, recv_wc); + if (IS_ERR(msg)) + return; + + rmpp_mad = msg->mad; + memcpy(rmpp_mad, recv_wc->recv_buf.mad, msg->hdr_len); + + rmpp_mad->mad_hdr.method ^= IB_MGMT_METHOD_RESP; + ib_set_rmpp_flags(&rmpp_mad->rmpp_hdr, IB_MGMT_RMPP_FLAG_ACTIVE); + rmpp_mad->rmpp_hdr.seg_num = 0; + rmpp_mad->rmpp_hdr.paylen_newwin = cpu_to_be32(1); + + ret = ib_post_send_mad(msg, NULL); + if (ret) { + ib_destroy_ah(msg->ah); + ib_free_send_mad(msg); + } +} + +void ib_rmpp_send_handler(struct ib_mad_send_wc *mad_send_wc) +{ + if (mad_send_wc->send_buf->context[0] == mad_send_wc->send_buf->ah) + ib_destroy_ah(mad_send_wc->send_buf->ah); + ib_free_send_mad(mad_send_wc->send_buf); +} + +static void nack_recv(struct ib_mad_agent_private *agent, + struct ib_mad_recv_wc *recv_wc, u8 rmpp_status) +{ + struct ib_mad_send_buf *msg; + struct ib_rmpp_mad *rmpp_mad; + int ret; + + msg = alloc_response_msg(&agent->agent, recv_wc); + if (IS_ERR(msg)) + return; + + rmpp_mad = msg->mad; + memcpy(rmpp_mad, recv_wc->recv_buf.mad, msg->hdr_len); + + rmpp_mad->mad_hdr.method ^= IB_MGMT_METHOD_RESP; + rmpp_mad->rmpp_hdr.rmpp_version = IB_MGMT_RMPP_VERSION; + rmpp_mad->rmpp_hdr.rmpp_type = IB_MGMT_RMPP_TYPE_ABORT; + ib_set_rmpp_flags(&rmpp_mad->rmpp_hdr, IB_MGMT_RMPP_FLAG_ACTIVE); + rmpp_mad->rmpp_hdr.rmpp_status = rmpp_status; + rmpp_mad->rmpp_hdr.seg_num = 0; + rmpp_mad->rmpp_hdr.paylen_newwin = 0; + + ret = ib_post_send_mad(msg, NULL); + if (ret) { + ib_destroy_ah(msg->ah); + ib_free_send_mad(msg); + } +} + +static void recv_timeout_handler(struct work_struct *work) +{ + struct mad_rmpp_recv *rmpp_recv = + container_of(work, struct mad_rmpp_recv, timeout_work.work); + struct ib_mad_recv_wc *rmpp_wc; + unsigned long flags; + + spin_lock_irqsave(&rmpp_recv->agent->lock, flags); + if (rmpp_recv->state != RMPP_STATE_ACTIVE) { + spin_unlock_irqrestore(&rmpp_recv->agent->lock, flags); + return; + } + rmpp_recv->state = RMPP_STATE_TIMEOUT; + list_del(&rmpp_recv->list); + spin_unlock_irqrestore(&rmpp_recv->agent->lock, flags); + + rmpp_wc = rmpp_recv->rmpp_wc; + nack_recv(rmpp_recv->agent, rmpp_wc, IB_MGMT_RMPP_STATUS_T2L); + destroy_rmpp_recv(rmpp_recv); + ib_free_recv_mad(rmpp_wc); +} + +static void recv_cleanup_handler(struct work_struct *work) +{ + struct mad_rmpp_recv *rmpp_recv = + container_of(work, struct mad_rmpp_recv, cleanup_work.work); + unsigned long flags; + + spin_lock_irqsave(&rmpp_recv->agent->lock, flags); + if (rmpp_recv->state == RMPP_STATE_CANCELING) { + spin_unlock_irqrestore(&rmpp_recv->agent->lock, flags); + return; + } + list_del(&rmpp_recv->list); + spin_unlock_irqrestore(&rmpp_recv->agent->lock, flags); + destroy_rmpp_recv(rmpp_recv); +} + +static struct mad_rmpp_recv * +create_rmpp_recv(struct ib_mad_agent_private *agent, + struct ib_mad_recv_wc *mad_recv_wc) +{ + struct mad_rmpp_recv *rmpp_recv; + struct ib_mad_hdr *mad_hdr; + + rmpp_recv = kmalloc(sizeof *rmpp_recv, GFP_KERNEL); + if (!rmpp_recv) + return NULL; + + rmpp_recv->ah = ib_create_ah_from_wc(agent->agent.qp->pd, + mad_recv_wc->wc, + mad_recv_wc->recv_buf.grh, + agent->agent.port_num); + if (IS_ERR(rmpp_recv->ah)) + goto error; + + rmpp_recv->agent = agent; + init_completion(&rmpp_recv->comp); + INIT_DELAYED_WORK(&rmpp_recv->timeout_work, recv_timeout_handler); + INIT_DELAYED_WORK(&rmpp_recv->cleanup_work, recv_cleanup_handler); + spin_lock_init(&rmpp_recv->lock); + rmpp_recv->state = RMPP_STATE_ACTIVE; + atomic_set(&rmpp_recv->refcount, 1); + + rmpp_recv->rmpp_wc = mad_recv_wc; + rmpp_recv->cur_seg_buf = &mad_recv_wc->recv_buf; + rmpp_recv->newwin = 1; + rmpp_recv->seg_num = 1; + rmpp_recv->last_ack = 0; + rmpp_recv->repwin = 1; + + mad_hdr = &mad_recv_wc->recv_buf.mad->mad_hdr; + rmpp_recv->tid = mad_hdr->tid; + rmpp_recv->src_qp = mad_recv_wc->wc->src_qp; + rmpp_recv->slid = mad_recv_wc->wc->slid; + rmpp_recv->mgmt_class = mad_hdr->mgmt_class; + rmpp_recv->class_version = mad_hdr->class_version; + rmpp_recv->method = mad_hdr->method; + return rmpp_recv; + +error: kfree(rmpp_recv); + return NULL; +} + +static struct mad_rmpp_recv * +find_rmpp_recv(struct ib_mad_agent_private *agent, + struct ib_mad_recv_wc *mad_recv_wc) +{ + struct mad_rmpp_recv *rmpp_recv; + struct ib_mad_hdr *mad_hdr = &mad_recv_wc->recv_buf.mad->mad_hdr; + + list_for_each_entry(rmpp_recv, &agent->rmpp_list, list) { + if (rmpp_recv->tid == mad_hdr->tid && + rmpp_recv->src_qp == mad_recv_wc->wc->src_qp && + rmpp_recv->slid == mad_recv_wc->wc->slid && + rmpp_recv->mgmt_class == mad_hdr->mgmt_class && + rmpp_recv->class_version == mad_hdr->class_version && + rmpp_recv->method == mad_hdr->method) + return rmpp_recv; + } + return NULL; +} + +static struct mad_rmpp_recv * +acquire_rmpp_recv(struct ib_mad_agent_private *agent, + struct ib_mad_recv_wc *mad_recv_wc) +{ + struct mad_rmpp_recv *rmpp_recv; + unsigned long flags; + + spin_lock_irqsave(&agent->lock, flags); + rmpp_recv = find_rmpp_recv(agent, mad_recv_wc); + if (rmpp_recv) + atomic_inc(&rmpp_recv->refcount); + spin_unlock_irqrestore(&agent->lock, flags); + return rmpp_recv; +} + +static struct mad_rmpp_recv * +insert_rmpp_recv(struct ib_mad_agent_private *agent, + struct mad_rmpp_recv *rmpp_recv) +{ + struct mad_rmpp_recv *cur_rmpp_recv; + + cur_rmpp_recv = find_rmpp_recv(agent, rmpp_recv->rmpp_wc); + if (!cur_rmpp_recv) + list_add_tail(&rmpp_recv->list, &agent->rmpp_list); + + return cur_rmpp_recv; +} + +static inline int get_last_flag(struct ib_mad_recv_buf *seg) +{ + struct ib_rmpp_mad *rmpp_mad; + + rmpp_mad = (struct ib_rmpp_mad *) seg->mad; + return ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) & IB_MGMT_RMPP_FLAG_LAST; +} + +static inline int get_seg_num(struct ib_mad_recv_buf *seg) +{ + struct ib_rmpp_mad *rmpp_mad; + + rmpp_mad = (struct ib_rmpp_mad *) seg->mad; + return be32_to_cpu(rmpp_mad->rmpp_hdr.seg_num); +} + +static inline struct ib_mad_recv_buf * get_next_seg(struct list_head *rmpp_list, + struct ib_mad_recv_buf *seg) +{ + if (seg->list.next == rmpp_list) + return NULL; + + return container_of(seg->list.next, struct ib_mad_recv_buf, list); +} + +static inline int window_size(struct ib_mad_agent_private *agent) +{ + return max(agent->qp_info->recv_queue.max_active >> 3, 1); +} + +static struct ib_mad_recv_buf * find_seg_location(struct list_head *rmpp_list, + int seg_num) +{ + struct ib_mad_recv_buf *seg_buf; + int cur_seg_num; + + list_for_each_entry_reverse(seg_buf, rmpp_list, list) { + cur_seg_num = get_seg_num(seg_buf); + if (seg_num > cur_seg_num) + return seg_buf; + if (seg_num == cur_seg_num) + break; + } + return NULL; +} + +static void update_seg_num(struct mad_rmpp_recv *rmpp_recv, + struct ib_mad_recv_buf *new_buf) +{ + struct list_head *rmpp_list = &rmpp_recv->rmpp_wc->rmpp_list; + + while (new_buf && (get_seg_num(new_buf) == rmpp_recv->seg_num + 1)) { + rmpp_recv->cur_seg_buf = new_buf; + rmpp_recv->seg_num++; + new_buf = get_next_seg(rmpp_list, new_buf); + } +} + +static inline int get_mad_len(struct mad_rmpp_recv *rmpp_recv) +{ + struct ib_rmpp_mad *rmpp_mad; + int hdr_size, data_size, pad; + + rmpp_mad = (struct ib_rmpp_mad *)rmpp_recv->cur_seg_buf->mad; + + hdr_size = ib_get_mad_data_offset(rmpp_mad->mad_hdr.mgmt_class); + data_size = sizeof(struct ib_rmpp_mad) - hdr_size; + pad = IB_MGMT_RMPP_DATA - be32_to_cpu(rmpp_mad->rmpp_hdr.paylen_newwin); + if (pad > IB_MGMT_RMPP_DATA || pad < 0) + pad = 0; + + return hdr_size + rmpp_recv->seg_num * data_size - pad; +} + +static struct ib_mad_recv_wc * complete_rmpp(struct mad_rmpp_recv *rmpp_recv) +{ + struct ib_mad_recv_wc *rmpp_wc; + + ack_recv(rmpp_recv, rmpp_recv->rmpp_wc); + if (rmpp_recv->seg_num > 1) + cancel_delayed_work(&rmpp_recv->timeout_work); + + rmpp_wc = rmpp_recv->rmpp_wc; + rmpp_wc->mad_len = get_mad_len(rmpp_recv); + /* 10 seconds until we can find the packet lifetime */ + queue_delayed_work(rmpp_recv->agent->qp_info->port_priv->wq, + &rmpp_recv->cleanup_work, msecs_to_jiffies(10000)); + return rmpp_wc; +} + +static struct ib_mad_recv_wc * +continue_rmpp(struct ib_mad_agent_private *agent, + struct ib_mad_recv_wc *mad_recv_wc) +{ + struct mad_rmpp_recv *rmpp_recv; + struct ib_mad_recv_buf *prev_buf; + struct ib_mad_recv_wc *done_wc; + int seg_num; + unsigned long flags; + + rmpp_recv = acquire_rmpp_recv(agent, mad_recv_wc); + if (!rmpp_recv) + goto drop1; + + seg_num = get_seg_num(&mad_recv_wc->recv_buf); + + spin_lock_irqsave(&rmpp_recv->lock, flags); + if ((rmpp_recv->state == RMPP_STATE_TIMEOUT) || + (seg_num > rmpp_recv->newwin)) + goto drop3; + + if ((seg_num <= rmpp_recv->last_ack) || + (rmpp_recv->state == RMPP_STATE_COMPLETE)) { + spin_unlock_irqrestore(&rmpp_recv->lock, flags); + ack_recv(rmpp_recv, mad_recv_wc); + goto drop2; + } + + prev_buf = find_seg_location(&rmpp_recv->rmpp_wc->rmpp_list, seg_num); + if (!prev_buf) + goto drop3; + + done_wc = NULL; + list_add(&mad_recv_wc->recv_buf.list, &prev_buf->list); + if (rmpp_recv->cur_seg_buf == prev_buf) { + update_seg_num(rmpp_recv, &mad_recv_wc->recv_buf); + if (get_last_flag(rmpp_recv->cur_seg_buf)) { + rmpp_recv->state = RMPP_STATE_COMPLETE; + spin_unlock_irqrestore(&rmpp_recv->lock, flags); + done_wc = complete_rmpp(rmpp_recv); + goto out; + } else if (rmpp_recv->seg_num == rmpp_recv->newwin) { + rmpp_recv->newwin += window_size(agent); + spin_unlock_irqrestore(&rmpp_recv->lock, flags); + ack_recv(rmpp_recv, mad_recv_wc); + goto out; + } + } + spin_unlock_irqrestore(&rmpp_recv->lock, flags); +out: + deref_rmpp_recv(rmpp_recv); + return done_wc; + +drop3: spin_unlock_irqrestore(&rmpp_recv->lock, flags); +drop2: deref_rmpp_recv(rmpp_recv); +drop1: ib_free_recv_mad(mad_recv_wc); + return NULL; +} + +static struct ib_mad_recv_wc * +start_rmpp(struct ib_mad_agent_private *agent, + struct ib_mad_recv_wc *mad_recv_wc) +{ + struct mad_rmpp_recv *rmpp_recv; + unsigned long flags; + + rmpp_recv = create_rmpp_recv(agent, mad_recv_wc); + if (!rmpp_recv) { + ib_free_recv_mad(mad_recv_wc); + return NULL; + } + + spin_lock_irqsave(&agent->lock, flags); + if (insert_rmpp_recv(agent, rmpp_recv)) { + spin_unlock_irqrestore(&agent->lock, flags); + /* duplicate first MAD */ + destroy_rmpp_recv(rmpp_recv); + return continue_rmpp(agent, mad_recv_wc); + } + atomic_inc(&rmpp_recv->refcount); + + if (get_last_flag(&mad_recv_wc->recv_buf)) { + rmpp_recv->state = RMPP_STATE_COMPLETE; + spin_unlock_irqrestore(&agent->lock, flags); + complete_rmpp(rmpp_recv); + } else { + spin_unlock_irqrestore(&agent->lock, flags); + /* 40 seconds until we can find the packet lifetimes */ + queue_delayed_work(agent->qp_info->port_priv->wq, + &rmpp_recv->timeout_work, + msecs_to_jiffies(40000)); + rmpp_recv->newwin += window_size(agent); + ack_recv(rmpp_recv, mad_recv_wc); + mad_recv_wc = NULL; + } + deref_rmpp_recv(rmpp_recv); + return mad_recv_wc; +} + +static int send_next_seg(struct ib_mad_send_wr_private *mad_send_wr) +{ + struct ib_rmpp_mad *rmpp_mad; + int timeout; + u32 paylen = 0; + + rmpp_mad = mad_send_wr->send_buf.mad; + ib_set_rmpp_flags(&rmpp_mad->rmpp_hdr, IB_MGMT_RMPP_FLAG_ACTIVE); + rmpp_mad->rmpp_hdr.seg_num = cpu_to_be32(++mad_send_wr->seg_num); + + if (mad_send_wr->seg_num == 1) { + rmpp_mad->rmpp_hdr.rmpp_rtime_flags |= IB_MGMT_RMPP_FLAG_FIRST; + paylen = mad_send_wr->send_buf.seg_count * IB_MGMT_RMPP_DATA - + mad_send_wr->pad; + } + + if (mad_send_wr->seg_num == mad_send_wr->send_buf.seg_count) { + rmpp_mad->rmpp_hdr.rmpp_rtime_flags |= IB_MGMT_RMPP_FLAG_LAST; + paylen = IB_MGMT_RMPP_DATA - mad_send_wr->pad; + } + rmpp_mad->rmpp_hdr.paylen_newwin = cpu_to_be32(paylen); + + /* 2 seconds for an ACK until we can find the packet lifetime */ + timeout = mad_send_wr->send_buf.timeout_ms; + if (!timeout || timeout > 2000) + mad_send_wr->timeout = msecs_to_jiffies(2000); + + return ib_send_mad(mad_send_wr); +} + +static void abort_send(struct ib_mad_agent_private *agent, + struct ib_mad_recv_wc *mad_recv_wc, u8 rmpp_status) +{ + struct ib_mad_send_wr_private *mad_send_wr; + struct ib_mad_send_wc wc; + unsigned long flags; + + spin_lock_irqsave(&agent->lock, flags); + mad_send_wr = ib_find_send_mad(agent, mad_recv_wc); + if (!mad_send_wr) + goto out; /* Unmatched send */ + + if ((mad_send_wr->last_ack == mad_send_wr->send_buf.seg_count) || + (!mad_send_wr->timeout) || (mad_send_wr->status != IB_WC_SUCCESS)) + goto out; /* Send is already done */ + + ib_mark_mad_done(mad_send_wr); + spin_unlock_irqrestore(&agent->lock, flags); + + wc.status = IB_WC_REM_ABORT_ERR; + wc.vendor_err = rmpp_status; + wc.send_buf = &mad_send_wr->send_buf; + ib_mad_complete_send_wr(mad_send_wr, &wc); + return; +out: + spin_unlock_irqrestore(&agent->lock, flags); +} + +static inline void adjust_last_ack(struct ib_mad_send_wr_private *wr, + int seg_num) +{ + struct list_head *list; + + wr->last_ack = seg_num; + list = &wr->last_ack_seg->list; + list_for_each_entry(wr->last_ack_seg, list, list) + if (wr->last_ack_seg->num == seg_num) + break; +} + +static void process_ds_ack(struct ib_mad_agent_private *agent, + struct ib_mad_recv_wc *mad_recv_wc, int newwin) +{ + struct mad_rmpp_recv *rmpp_recv; + + rmpp_recv = find_rmpp_recv(agent, mad_recv_wc); + if (rmpp_recv && rmpp_recv->state == RMPP_STATE_COMPLETE) + rmpp_recv->repwin = newwin; +} + +static void process_rmpp_ack(struct ib_mad_agent_private *agent, + struct ib_mad_recv_wc *mad_recv_wc) +{ + struct ib_mad_send_wr_private *mad_send_wr; + struct ib_rmpp_mad *rmpp_mad; + unsigned long flags; + int seg_num, newwin, ret; + + rmpp_mad = (struct ib_rmpp_mad *)mad_recv_wc->recv_buf.mad; + if (rmpp_mad->rmpp_hdr.rmpp_status) { + abort_send(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_BAD_STATUS); + nack_recv(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_BAD_STATUS); + return; + } + + seg_num = be32_to_cpu(rmpp_mad->rmpp_hdr.seg_num); + newwin = be32_to_cpu(rmpp_mad->rmpp_hdr.paylen_newwin); + if (newwin < seg_num) { + abort_send(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_W2S); + nack_recv(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_W2S); + return; + } + + spin_lock_irqsave(&agent->lock, flags); + mad_send_wr = ib_find_send_mad(agent, mad_recv_wc); + if (!mad_send_wr) { + if (!seg_num) + process_ds_ack(agent, mad_recv_wc, newwin); + goto out; /* Unmatched or DS RMPP ACK */ + } + + if ((mad_send_wr->last_ack == mad_send_wr->send_buf.seg_count) && + (mad_send_wr->timeout)) { + spin_unlock_irqrestore(&agent->lock, flags); + ack_ds_ack(agent, mad_recv_wc); + return; /* Repeated ACK for DS RMPP transaction */ + } + + if ((mad_send_wr->last_ack == mad_send_wr->send_buf.seg_count) || + (!mad_send_wr->timeout) || (mad_send_wr->status != IB_WC_SUCCESS)) + goto out; /* Send is already done */ + + if (seg_num > mad_send_wr->send_buf.seg_count || + seg_num > mad_send_wr->newwin) { + spin_unlock_irqrestore(&agent->lock, flags); + abort_send(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_S2B); + nack_recv(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_S2B); + return; + } + + if (newwin < mad_send_wr->newwin || seg_num < mad_send_wr->last_ack) + goto out; /* Old ACK */ + + if (seg_num > mad_send_wr->last_ack) { + adjust_last_ack(mad_send_wr, seg_num); + mad_send_wr->retries_left = mad_send_wr->max_retries; + } + mad_send_wr->newwin = newwin; + if (mad_send_wr->last_ack == mad_send_wr->send_buf.seg_count) { + /* If no response is expected, the ACK completes the send */ + if (!mad_send_wr->send_buf.timeout_ms) { + struct ib_mad_send_wc wc; + + ib_mark_mad_done(mad_send_wr); + spin_unlock_irqrestore(&agent->lock, flags); + + wc.status = IB_WC_SUCCESS; + wc.vendor_err = 0; + wc.send_buf = &mad_send_wr->send_buf; + ib_mad_complete_send_wr(mad_send_wr, &wc); + return; + } + if (mad_send_wr->refcount == 1) + ib_reset_mad_timeout(mad_send_wr, + mad_send_wr->send_buf.timeout_ms); + spin_unlock_irqrestore(&agent->lock, flags); + ack_ds_ack(agent, mad_recv_wc); + return; + } else if (mad_send_wr->refcount == 1 && + mad_send_wr->seg_num < mad_send_wr->newwin && + mad_send_wr->seg_num < mad_send_wr->send_buf.seg_count) { + /* Send failure will just result in a timeout/retry */ + ret = send_next_seg(mad_send_wr); + if (ret) + goto out; + + mad_send_wr->refcount++; + list_move_tail(&mad_send_wr->agent_list, + &mad_send_wr->mad_agent_priv->send_list); + } +out: + spin_unlock_irqrestore(&agent->lock, flags); +} + +static struct ib_mad_recv_wc * +process_rmpp_data(struct ib_mad_agent_private *agent, + struct ib_mad_recv_wc *mad_recv_wc) +{ + struct ib_rmpp_hdr *rmpp_hdr; + u8 rmpp_status; + + rmpp_hdr = &((struct ib_rmpp_mad *)mad_recv_wc->recv_buf.mad)->rmpp_hdr; + + if (rmpp_hdr->rmpp_status) { + rmpp_status = IB_MGMT_RMPP_STATUS_BAD_STATUS; + goto bad; + } + + if (rmpp_hdr->seg_num == cpu_to_be32(1)) { + if (!(ib_get_rmpp_flags(rmpp_hdr) & IB_MGMT_RMPP_FLAG_FIRST)) { + rmpp_status = IB_MGMT_RMPP_STATUS_BAD_SEG; + goto bad; + } + return start_rmpp(agent, mad_recv_wc); + } else { + if (ib_get_rmpp_flags(rmpp_hdr) & IB_MGMT_RMPP_FLAG_FIRST) { + rmpp_status = IB_MGMT_RMPP_STATUS_BAD_SEG; + goto bad; + } + return continue_rmpp(agent, mad_recv_wc); + } +bad: + nack_recv(agent, mad_recv_wc, rmpp_status); + ib_free_recv_mad(mad_recv_wc); + return NULL; +} + +static void process_rmpp_stop(struct ib_mad_agent_private *agent, + struct ib_mad_recv_wc *mad_recv_wc) +{ + struct ib_rmpp_mad *rmpp_mad; + + rmpp_mad = (struct ib_rmpp_mad *)mad_recv_wc->recv_buf.mad; + + if (rmpp_mad->rmpp_hdr.rmpp_status != IB_MGMT_RMPP_STATUS_RESX) { + abort_send(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_BAD_STATUS); + nack_recv(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_BAD_STATUS); + } else + abort_send(agent, mad_recv_wc, rmpp_mad->rmpp_hdr.rmpp_status); +} + +static void process_rmpp_abort(struct ib_mad_agent_private *agent, + struct ib_mad_recv_wc *mad_recv_wc) +{ + struct ib_rmpp_mad *rmpp_mad; + + rmpp_mad = (struct ib_rmpp_mad *)mad_recv_wc->recv_buf.mad; + + if (rmpp_mad->rmpp_hdr.rmpp_status < IB_MGMT_RMPP_STATUS_ABORT_MIN || + rmpp_mad->rmpp_hdr.rmpp_status > IB_MGMT_RMPP_STATUS_ABORT_MAX) { + abort_send(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_BAD_STATUS); + nack_recv(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_BAD_STATUS); + } else + abort_send(agent, mad_recv_wc, rmpp_mad->rmpp_hdr.rmpp_status); +} + +struct ib_mad_recv_wc * +ib_process_rmpp_recv_wc(struct ib_mad_agent_private *agent, + struct ib_mad_recv_wc *mad_recv_wc) +{ + struct ib_rmpp_mad *rmpp_mad; + + rmpp_mad = (struct ib_rmpp_mad *)mad_recv_wc->recv_buf.mad; + if (!(rmpp_mad->rmpp_hdr.rmpp_rtime_flags & IB_MGMT_RMPP_FLAG_ACTIVE)) + return mad_recv_wc; + + if (rmpp_mad->rmpp_hdr.rmpp_version != IB_MGMT_RMPP_VERSION) { + abort_send(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_UNV); + nack_recv(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_UNV); + goto out; + } + + switch (rmpp_mad->rmpp_hdr.rmpp_type) { + case IB_MGMT_RMPP_TYPE_DATA: + return process_rmpp_data(agent, mad_recv_wc); + case IB_MGMT_RMPP_TYPE_ACK: + process_rmpp_ack(agent, mad_recv_wc); + break; + case IB_MGMT_RMPP_TYPE_STOP: + process_rmpp_stop(agent, mad_recv_wc); + break; + case IB_MGMT_RMPP_TYPE_ABORT: + process_rmpp_abort(agent, mad_recv_wc); + break; + default: + abort_send(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_BADT); + nack_recv(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_BADT); + break; + } +out: + ib_free_recv_mad(mad_recv_wc); + return NULL; +} + +static int init_newwin(struct ib_mad_send_wr_private *mad_send_wr) +{ + struct ib_mad_agent_private *agent = mad_send_wr->mad_agent_priv; + struct ib_mad_hdr *mad_hdr = mad_send_wr->send_buf.mad; + struct mad_rmpp_recv *rmpp_recv; + struct ib_ah_attr ah_attr; + unsigned long flags; + int newwin = 1; + + if (!(mad_hdr->method & IB_MGMT_METHOD_RESP)) + goto out; + + spin_lock_irqsave(&agent->lock, flags); + list_for_each_entry(rmpp_recv, &agent->rmpp_list, list) { + if (rmpp_recv->tid != mad_hdr->tid || + rmpp_recv->mgmt_class != mad_hdr->mgmt_class || + rmpp_recv->class_version != mad_hdr->class_version || + (rmpp_recv->method & IB_MGMT_METHOD_RESP)) + continue; + + if (ib_query_ah(mad_send_wr->send_buf.ah, &ah_attr)) + continue; + + if (rmpp_recv->slid == ah_attr.dlid) { + newwin = rmpp_recv->repwin; + break; + } + } + spin_unlock_irqrestore(&agent->lock, flags); +out: + return newwin; +} + +int ib_send_rmpp_mad(struct ib_mad_send_wr_private *mad_send_wr) +{ + struct ib_rmpp_mad *rmpp_mad; + int ret; + + rmpp_mad = mad_send_wr->send_buf.mad; + if (!(ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) & + IB_MGMT_RMPP_FLAG_ACTIVE)) + return IB_RMPP_RESULT_UNHANDLED; + + if (rmpp_mad->rmpp_hdr.rmpp_type != IB_MGMT_RMPP_TYPE_DATA) { + mad_send_wr->seg_num = 1; + return IB_RMPP_RESULT_INTERNAL; + } + + mad_send_wr->newwin = init_newwin(mad_send_wr); + + /* We need to wait for the final ACK even if there isn't a response */ + mad_send_wr->refcount += (mad_send_wr->timeout == 0); + ret = send_next_seg(mad_send_wr); + if (!ret) + return IB_RMPP_RESULT_CONSUMED; + return ret; +} + +int ib_process_rmpp_send_wc(struct ib_mad_send_wr_private *mad_send_wr, + struct ib_mad_send_wc *mad_send_wc) +{ + struct ib_rmpp_mad *rmpp_mad; + int ret; + + rmpp_mad = mad_send_wr->send_buf.mad; + if (!(ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) & + IB_MGMT_RMPP_FLAG_ACTIVE)) + return IB_RMPP_RESULT_UNHANDLED; /* RMPP not active */ + + if (rmpp_mad->rmpp_hdr.rmpp_type != IB_MGMT_RMPP_TYPE_DATA) + return IB_RMPP_RESULT_INTERNAL; /* ACK, STOP, or ABORT */ + + if (mad_send_wc->status != IB_WC_SUCCESS || + mad_send_wr->status != IB_WC_SUCCESS) + return IB_RMPP_RESULT_PROCESSED; /* Canceled or send error */ + + if (!mad_send_wr->timeout) + return IB_RMPP_RESULT_PROCESSED; /* Response received */ + + if (mad_send_wr->last_ack == mad_send_wr->send_buf.seg_count) { + mad_send_wr->timeout = + msecs_to_jiffies(mad_send_wr->send_buf.timeout_ms); + return IB_RMPP_RESULT_PROCESSED; /* Send done */ + } + + if (mad_send_wr->seg_num == mad_send_wr->newwin || + mad_send_wr->seg_num == mad_send_wr->send_buf.seg_count) + return IB_RMPP_RESULT_PROCESSED; /* Wait for ACK */ + + ret = send_next_seg(mad_send_wr); + if (ret) { + mad_send_wc->status = IB_WC_GENERAL_ERR; + return IB_RMPP_RESULT_PROCESSED; + } + return IB_RMPP_RESULT_CONSUMED; +} + +int ib_retry_rmpp(struct ib_mad_send_wr_private *mad_send_wr) +{ + struct ib_rmpp_mad *rmpp_mad; + int ret; + + rmpp_mad = mad_send_wr->send_buf.mad; + if (!(ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) & + IB_MGMT_RMPP_FLAG_ACTIVE)) + return IB_RMPP_RESULT_UNHANDLED; /* RMPP not active */ + + if (mad_send_wr->last_ack == mad_send_wr->send_buf.seg_count) + return IB_RMPP_RESULT_PROCESSED; + + mad_send_wr->seg_num = mad_send_wr->last_ack; + mad_send_wr->cur_seg = mad_send_wr->last_ack_seg; + + ret = send_next_seg(mad_send_wr); + if (ret) + return IB_RMPP_RESULT_PROCESSED; + + return IB_RMPP_RESULT_CONSUMED; +} diff --git a/drivers/infiniband/core/mad_rmpp.h b/drivers/infiniband/core/mad_rmpp.h new file mode 100644 index 0000000..3d336bf --- /dev/null +++ b/drivers/infiniband/core/mad_rmpp.h @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2005 Intel Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __MAD_RMPP_H__ +#define __MAD_RMPP_H__ + +enum { + IB_RMPP_RESULT_PROCESSED, + IB_RMPP_RESULT_CONSUMED, + IB_RMPP_RESULT_INTERNAL, + IB_RMPP_RESULT_UNHANDLED +}; + +int ib_send_rmpp_mad(struct ib_mad_send_wr_private *mad_send_wr); + +struct ib_mad_recv_wc * +ib_process_rmpp_recv_wc(struct ib_mad_agent_private *agent, + struct ib_mad_recv_wc *mad_recv_wc); + +int ib_process_rmpp_send_wc(struct ib_mad_send_wr_private *mad_send_wr, + struct ib_mad_send_wc *mad_send_wc); + +void ib_rmpp_send_handler(struct ib_mad_send_wc *mad_send_wc); + +void ib_cancel_rmpp_recvs(struct ib_mad_agent_private *agent); + +int ib_retry_rmpp(struct ib_mad_send_wr_private *mad_send_wr); + +#endif /* __MAD_RMPP_H__ */ diff --git a/drivers/infiniband/core/multicast.c b/drivers/infiniband/core/multicast.c new file mode 100644 index 0000000..d2360a8 --- /dev/null +++ b/drivers/infiniband/core/multicast.c @@ -0,0 +1,898 @@ +/* + * Copyright (c) 2006 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include "sa.h" + +static void mcast_add_one(struct ib_device *device); +static void mcast_remove_one(struct ib_device *device); + +static struct ib_client mcast_client = { + .name = "ib_multicast", + .add = mcast_add_one, + .remove = mcast_remove_one +}; + +static struct ib_sa_client sa_client; +static struct workqueue_struct *mcast_wq; +static union ib_gid mgid0; + +struct mcast_device; + +struct mcast_port { + struct mcast_device *dev; + spinlock_t lock; + struct rb_root table; + atomic_t refcount; + struct completion comp; + u8 port_num; +}; + +struct mcast_device { + struct ib_device *device; + struct ib_event_handler event_handler; + int start_port; + int end_port; + struct mcast_port port[0]; +}; + +enum mcast_state { + MCAST_JOINING, + MCAST_MEMBER, + MCAST_ERROR, +}; + +enum mcast_group_state { + MCAST_IDLE, + MCAST_BUSY, + MCAST_GROUP_ERROR, + MCAST_PKEY_EVENT +}; + +enum { + MCAST_INVALID_PKEY_INDEX = 0xFFFF +}; + +struct mcast_member; + +struct mcast_group { + struct ib_sa_mcmember_rec rec; + struct rb_node node; + struct mcast_port *port; + spinlock_t lock; + struct work_struct work; + struct list_head pending_list; + struct list_head active_list; + struct mcast_member *last_join; + int members[3]; + atomic_t refcount; + enum mcast_group_state state; + struct ib_sa_query *query; + int query_id; + u16 pkey_index; + u8 leave_state; + int retries; +}; + +struct mcast_member { + struct ib_sa_multicast multicast; + struct ib_sa_client *client; + struct mcast_group *group; + struct list_head list; + enum mcast_state state; + atomic_t refcount; + struct completion comp; +}; + +static void join_handler(int status, struct ib_sa_mcmember_rec *rec, + void *context); +static void leave_handler(int status, struct ib_sa_mcmember_rec *rec, + void *context); + +static struct mcast_group *mcast_find(struct mcast_port *port, + union ib_gid *mgid) +{ + struct rb_node *node = port->table.rb_node; + struct mcast_group *group; + int ret; + + while (node) { + group = rb_entry(node, struct mcast_group, node); + ret = memcmp(mgid->raw, group->rec.mgid.raw, sizeof *mgid); + if (!ret) + return group; + + if (ret < 0) + node = node->rb_left; + else + node = node->rb_right; + } + return NULL; +} + +static struct mcast_group *mcast_insert(struct mcast_port *port, + struct mcast_group *group, + int allow_duplicates) +{ + struct rb_node **link = &port->table.rb_node; + struct rb_node *parent = NULL; + struct mcast_group *cur_group; + int ret; + + while (*link) { + parent = *link; + cur_group = rb_entry(parent, struct mcast_group, node); + + ret = memcmp(group->rec.mgid.raw, cur_group->rec.mgid.raw, + sizeof group->rec.mgid); + if (ret < 0) + link = &(*link)->rb_left; + else if (ret > 0) + link = &(*link)->rb_right; + else if (allow_duplicates) + link = &(*link)->rb_left; + else + return cur_group; + } + rb_link_node(&group->node, parent, link); + rb_insert_color(&group->node, &port->table); + return NULL; +} + +static void deref_port(struct mcast_port *port) +{ + if (atomic_dec_and_test(&port->refcount)) + complete(&port->comp); +} + +static void release_group(struct mcast_group *group) +{ + struct mcast_port *port = group->port; + unsigned long flags; + + spin_lock_irqsave(&port->lock, flags); + if (atomic_dec_and_test(&group->refcount)) { + rb_erase(&group->node, &port->table); + spin_unlock_irqrestore(&port->lock, flags); + kfree(group); + deref_port(port); + } else + spin_unlock_irqrestore(&port->lock, flags); +} + +static void deref_member(struct mcast_member *member) +{ + if (atomic_dec_and_test(&member->refcount)) + complete(&member->comp); +} + +static void queue_join(struct mcast_member *member) +{ + struct mcast_group *group = member->group; + unsigned long flags; + + spin_lock_irqsave(&group->lock, flags); + list_add_tail(&member->list, &group->pending_list); + if (group->state == MCAST_IDLE) { + group->state = MCAST_BUSY; + atomic_inc(&group->refcount); + queue_work(mcast_wq, &group->work); + } + spin_unlock_irqrestore(&group->lock, flags); +} + +/* + * A multicast group has three types of members: full member, non member, and + * send only member. We need to keep track of the number of members of each + * type based on their join state. Adjust the number of members the belong to + * the specified join states. + */ +static void adjust_membership(struct mcast_group *group, u8 join_state, int inc) +{ + int i; + + for (i = 0; i < 3; i++, join_state >>= 1) + if (join_state & 0x1) + group->members[i] += inc; +} + +/* + * If a multicast group has zero members left for a particular join state, but + * the group is still a member with the SA, we need to leave that join state. + * Determine which join states we still belong to, but that do not have any + * active members. + */ +static u8 get_leave_state(struct mcast_group *group) +{ + u8 leave_state = 0; + int i; + + for (i = 0; i < 3; i++) + if (!group->members[i]) + leave_state |= (0x1 << i); + + return leave_state & group->rec.join_state; +} + +static int check_selector(ib_sa_comp_mask comp_mask, + ib_sa_comp_mask selector_mask, + ib_sa_comp_mask value_mask, + u8 selector, u8 src_value, u8 dst_value) +{ + int err; + + if (!(comp_mask & selector_mask) || !(comp_mask & value_mask)) + return 0; + + switch (selector) { + case IB_SA_GT: + err = (src_value <= dst_value); + break; + case IB_SA_LT: + err = (src_value >= dst_value); + break; + case IB_SA_EQ: + err = (src_value != dst_value); + break; + default: + err = 0; + break; + } + + return err; +} + +static int cmp_rec(struct ib_sa_mcmember_rec *src, + struct ib_sa_mcmember_rec *dst, ib_sa_comp_mask comp_mask) +{ + /* MGID must already match */ + + if (comp_mask & IB_SA_MCMEMBER_REC_PORT_GID && + memcmp(&src->port_gid, &dst->port_gid, sizeof src->port_gid)) + return -EINVAL; + if (comp_mask & IB_SA_MCMEMBER_REC_QKEY && src->qkey != dst->qkey) + return -EINVAL; + if (comp_mask & IB_SA_MCMEMBER_REC_MLID && src->mlid != dst->mlid) + return -EINVAL; + if (check_selector(comp_mask, IB_SA_MCMEMBER_REC_MTU_SELECTOR, + IB_SA_MCMEMBER_REC_MTU, dst->mtu_selector, + src->mtu, dst->mtu)) + return -EINVAL; + if (comp_mask & IB_SA_MCMEMBER_REC_TRAFFIC_CLASS && + src->traffic_class != dst->traffic_class) + return -EINVAL; + if (comp_mask & IB_SA_MCMEMBER_REC_PKEY && src->pkey != dst->pkey) + return -EINVAL; + if (check_selector(comp_mask, IB_SA_MCMEMBER_REC_RATE_SELECTOR, + IB_SA_MCMEMBER_REC_RATE, dst->rate_selector, + src->rate, dst->rate)) + return -EINVAL; + if (check_selector(comp_mask, + IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME_SELECTOR, + IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME, + dst->packet_life_time_selector, + src->packet_life_time, dst->packet_life_time)) + return -EINVAL; + if (comp_mask & IB_SA_MCMEMBER_REC_SL && src->sl != dst->sl) + return -EINVAL; + if (comp_mask & IB_SA_MCMEMBER_REC_FLOW_LABEL && + src->flow_label != dst->flow_label) + return -EINVAL; + if (comp_mask & IB_SA_MCMEMBER_REC_HOP_LIMIT && + src->hop_limit != dst->hop_limit) + return -EINVAL; + if (comp_mask & IB_SA_MCMEMBER_REC_SCOPE && src->scope != dst->scope) + return -EINVAL; + + /* join_state checked separately, proxy_join ignored */ + + return 0; +} + +static int send_join(struct mcast_group *group, struct mcast_member *member) +{ + struct mcast_port *port = group->port; + int ret; + + group->last_join = member; + ret = ib_sa_mcmember_rec_query(&sa_client, port->dev->device, + port->port_num, IB_MGMT_METHOD_SET, + &member->multicast.rec, + member->multicast.comp_mask, + 3000, GFP_KERNEL, join_handler, group, + &group->query); + if (ret >= 0) { + group->query_id = ret; + ret = 0; + } + return ret; +} + +static int send_leave(struct mcast_group *group, u8 leave_state) +{ + struct mcast_port *port = group->port; + struct ib_sa_mcmember_rec rec; + int ret; + + rec = group->rec; + rec.join_state = leave_state; + group->leave_state = leave_state; + + ret = ib_sa_mcmember_rec_query(&sa_client, port->dev->device, + port->port_num, IB_SA_METHOD_DELETE, &rec, + IB_SA_MCMEMBER_REC_MGID | + IB_SA_MCMEMBER_REC_PORT_GID | + IB_SA_MCMEMBER_REC_JOIN_STATE, + 3000, GFP_KERNEL, leave_handler, + group, &group->query); + if (ret >= 0) { + group->query_id = ret; + ret = 0; + } + return ret; +} + +static void join_group(struct mcast_group *group, struct mcast_member *member, + u8 join_state) +{ + member->state = MCAST_MEMBER; + adjust_membership(group, join_state, 1); + group->rec.join_state |= join_state; + member->multicast.rec = group->rec; + member->multicast.rec.join_state = join_state; + list_move(&member->list, &group->active_list); +} + +static int fail_join(struct mcast_group *group, struct mcast_member *member, + int status) +{ + spin_lock_irq(&group->lock); + list_del_init(&member->list); + spin_unlock_irq(&group->lock); + return member->multicast.callback(status, &member->multicast); +} + +static void process_group_error(struct mcast_group *group) +{ + struct mcast_member *member; + int ret = 0; + u16 pkey_index; + + if (group->state == MCAST_PKEY_EVENT) + ret = ib_find_pkey(group->port->dev->device, + group->port->port_num, + be16_to_cpu(group->rec.pkey), &pkey_index); + + spin_lock_irq(&group->lock); + if (group->state == MCAST_PKEY_EVENT && !ret && + group->pkey_index == pkey_index) + goto out; + + while (!list_empty(&group->active_list)) { + member = list_entry(group->active_list.next, + struct mcast_member, list); + atomic_inc(&member->refcount); + list_del_init(&member->list); + adjust_membership(group, member->multicast.rec.join_state, -1); + member->state = MCAST_ERROR; + spin_unlock_irq(&group->lock); + + ret = member->multicast.callback(-ENETRESET, + &member->multicast); + deref_member(member); + if (ret) + ib_sa_free_multicast(&member->multicast); + spin_lock_irq(&group->lock); + } + + group->rec.join_state = 0; +out: + group->state = MCAST_BUSY; + spin_unlock_irq(&group->lock); +} + +static void mcast_work_handler(struct work_struct *work) +{ + struct mcast_group *group; + struct mcast_member *member; + struct ib_sa_multicast *multicast; + int status, ret; + u8 join_state; + + group = container_of(work, typeof(*group), work); +retest: + spin_lock_irq(&group->lock); + while (!list_empty(&group->pending_list) || + (group->state != MCAST_BUSY)) { + + if (group->state != MCAST_BUSY) { + spin_unlock_irq(&group->lock); + process_group_error(group); + goto retest; + } + + member = list_entry(group->pending_list.next, + struct mcast_member, list); + multicast = &member->multicast; + join_state = multicast->rec.join_state; + atomic_inc(&member->refcount); + + if (join_state == (group->rec.join_state & join_state)) { + status = cmp_rec(&group->rec, &multicast->rec, + multicast->comp_mask); + if (!status) + join_group(group, member, join_state); + else + list_del_init(&member->list); + spin_unlock_irq(&group->lock); + ret = multicast->callback(status, multicast); + } else { + spin_unlock_irq(&group->lock); + status = send_join(group, member); + if (!status) { + deref_member(member); + return; + } + ret = fail_join(group, member, status); + } + + deref_member(member); + if (ret) + ib_sa_free_multicast(&member->multicast); + spin_lock_irq(&group->lock); + } + + join_state = get_leave_state(group); + if (join_state) { + group->rec.join_state &= ~join_state; + spin_unlock_irq(&group->lock); + if (send_leave(group, join_state)) + goto retest; + } else { + group->state = MCAST_IDLE; + spin_unlock_irq(&group->lock); + release_group(group); + } +} + +/* + * Fail a join request if it is still active - at the head of the pending queue. + */ +static void process_join_error(struct mcast_group *group, int status) +{ + struct mcast_member *member; + int ret; + + spin_lock_irq(&group->lock); + member = list_entry(group->pending_list.next, + struct mcast_member, list); + if (group->last_join == member) { + atomic_inc(&member->refcount); + list_del_init(&member->list); + spin_unlock_irq(&group->lock); + ret = member->multicast.callback(status, &member->multicast); + deref_member(member); + if (ret) + ib_sa_free_multicast(&member->multicast); + } else + spin_unlock_irq(&group->lock); +} + +static void join_handler(int status, struct ib_sa_mcmember_rec *rec, + void *context) +{ + struct mcast_group *group = context; + u16 pkey_index = MCAST_INVALID_PKEY_INDEX; + + if (status) + process_join_error(group, status); + else { + ib_find_pkey(group->port->dev->device, group->port->port_num, + be16_to_cpu(rec->pkey), &pkey_index); + + spin_lock_irq(&group->port->lock); + group->rec = *rec; + if (group->state == MCAST_BUSY && + group->pkey_index == MCAST_INVALID_PKEY_INDEX) + group->pkey_index = pkey_index; + if (!memcmp(&mgid0, &group->rec.mgid, sizeof mgid0)) { + rb_erase(&group->node, &group->port->table); + mcast_insert(group->port, group, 1); + } + spin_unlock_irq(&group->port->lock); + } + mcast_work_handler(&group->work); +} + +static void leave_handler(int status, struct ib_sa_mcmember_rec *rec, + void *context) +{ + struct mcast_group *group = context; + + if (status && group->retries > 0 && + !send_leave(group, group->leave_state)) + group->retries--; + else + mcast_work_handler(&group->work); +} + +static struct mcast_group *acquire_group(struct mcast_port *port, + union ib_gid *mgid, gfp_t gfp_mask) +{ + struct mcast_group *group, *cur_group; + unsigned long flags; + int is_mgid0; + + is_mgid0 = !memcmp(&mgid0, mgid, sizeof mgid0); + if (!is_mgid0) { + spin_lock_irqsave(&port->lock, flags); + group = mcast_find(port, mgid); + if (group) + goto found; + spin_unlock_irqrestore(&port->lock, flags); + } + + group = kzalloc(sizeof *group, gfp_mask); + if (!group) + return NULL; + + group->retries = 3; + group->port = port; + group->rec.mgid = *mgid; + group->pkey_index = MCAST_INVALID_PKEY_INDEX; + INIT_LIST_HEAD(&group->pending_list); + INIT_LIST_HEAD(&group->active_list); + INIT_WORK(&group->work, mcast_work_handler); + spin_lock_init(&group->lock); + + spin_lock_irqsave(&port->lock, flags); + cur_group = mcast_insert(port, group, is_mgid0); + if (cur_group) { + kfree(group); + group = cur_group; + } else + atomic_inc(&port->refcount); +found: + atomic_inc(&group->refcount); + spin_unlock_irqrestore(&port->lock, flags); + return group; +} + +/* + * We serialize all join requests to a single group to make our lives much + * easier. Otherwise, two users could try to join the same group + * simultaneously, with different configurations, one could leave while the + * join is in progress, etc., which makes locking around error recovery + * difficult. + */ +struct ib_sa_multicast * +ib_sa_join_multicast(struct ib_sa_client *client, + struct ib_device *device, u8 port_num, + struct ib_sa_mcmember_rec *rec, + ib_sa_comp_mask comp_mask, gfp_t gfp_mask, + int (*callback)(int status, + struct ib_sa_multicast *multicast), + void *context) +{ + struct mcast_device *dev; + struct mcast_member *member; + struct ib_sa_multicast *multicast; + int ret; + + dev = ib_get_client_data(device, &mcast_client); + if (!dev) + return ERR_PTR(-ENODEV); + + member = kmalloc(sizeof *member, gfp_mask); + if (!member) + return ERR_PTR(-ENOMEM); + + ib_sa_client_get(client); + member->client = client; + member->multicast.rec = *rec; + member->multicast.comp_mask = comp_mask; + member->multicast.callback = callback; + member->multicast.context = context; + init_completion(&member->comp); + atomic_set(&member->refcount, 1); + member->state = MCAST_JOINING; + + member->group = acquire_group(&dev->port[port_num - dev->start_port], + &rec->mgid, gfp_mask); + if (!member->group) { + ret = -ENOMEM; + goto err; + } + + /* + * The user will get the multicast structure in their callback. They + * could then free the multicast structure before we can return from + * this routine. So we save the pointer to return before queuing + * any callback. + */ + multicast = &member->multicast; + queue_join(member); + return multicast; + +err: + ib_sa_client_put(client); + kfree(member); + return ERR_PTR(ret); +} +EXPORT_SYMBOL(ib_sa_join_multicast); + +void ib_sa_free_multicast(struct ib_sa_multicast *multicast) +{ + struct mcast_member *member; + struct mcast_group *group; + + member = container_of(multicast, struct mcast_member, multicast); + group = member->group; + + spin_lock_irq(&group->lock); + if (member->state == MCAST_MEMBER) + adjust_membership(group, multicast->rec.join_state, -1); + + list_del_init(&member->list); + + if (group->state == MCAST_IDLE) { + group->state = MCAST_BUSY; + spin_unlock_irq(&group->lock); + /* Continue to hold reference on group until callback */ + queue_work(mcast_wq, &group->work); + } else { + spin_unlock_irq(&group->lock); + release_group(group); + } + + deref_member(member); + wait_for_completion(&member->comp); + ib_sa_client_put(member->client); + kfree(member); +} +EXPORT_SYMBOL(ib_sa_free_multicast); + +int ib_sa_get_mcmember_rec(struct ib_device *device, u8 port_num, + union ib_gid *mgid, struct ib_sa_mcmember_rec *rec) +{ + struct mcast_device *dev; + struct mcast_port *port; + struct mcast_group *group; + unsigned long flags; + int ret = 0; + + dev = ib_get_client_data(device, &mcast_client); + if (!dev) + return -ENODEV; + + port = &dev->port[port_num - dev->start_port]; + spin_lock_irqsave(&port->lock, flags); + group = mcast_find(port, mgid); + if (group) + *rec = group->rec; + else + ret = -EADDRNOTAVAIL; + spin_unlock_irqrestore(&port->lock, flags); + + return ret; +} +EXPORT_SYMBOL(ib_sa_get_mcmember_rec); + +int ib_init_ah_from_mcmember(struct ib_device *device, u8 port_num, + struct ib_sa_mcmember_rec *rec, + struct ib_ah_attr *ah_attr) +{ + int ret; + u16 gid_index; + u8 p; + + ret = ib_find_cached_gid(device, &rec->port_gid, &p, &gid_index); + if (ret) + return ret; + + memset(ah_attr, 0, sizeof *ah_attr); + ah_attr->dlid = be16_to_cpu(rec->mlid); + ah_attr->sl = rec->sl; + ah_attr->port_num = port_num; + ah_attr->static_rate = rec->rate; + + ah_attr->ah_flags = IB_AH_GRH; + ah_attr->grh.dgid = rec->mgid; + + ah_attr->grh.sgid_index = (u8) gid_index; + ah_attr->grh.flow_label = be32_to_cpu(rec->flow_label); + ah_attr->grh.hop_limit = rec->hop_limit; + ah_attr->grh.traffic_class = rec->traffic_class; + + return 0; +} +EXPORT_SYMBOL(ib_init_ah_from_mcmember); + +static void mcast_groups_event(struct mcast_port *port, + enum mcast_group_state state) +{ + struct mcast_group *group; + struct rb_node *node; + unsigned long flags; + + spin_lock_irqsave(&port->lock, flags); + for (node = rb_first(&port->table); node; node = rb_next(node)) { + group = rb_entry(node, struct mcast_group, node); + spin_lock(&group->lock); + if (group->state == MCAST_IDLE) { + atomic_inc(&group->refcount); + queue_work(mcast_wq, &group->work); + } + if (group->state != MCAST_GROUP_ERROR) + group->state = state; + spin_unlock(&group->lock); + } + spin_unlock_irqrestore(&port->lock, flags); +} + +static void mcast_event_handler(struct ib_event_handler *handler, + struct ib_event *event) +{ + struct mcast_device *dev; + int index; + + dev = container_of(handler, struct mcast_device, event_handler); + if (rdma_port_get_link_layer(dev->device, event->element.port_num) != + IB_LINK_LAYER_INFINIBAND) + return; + + index = event->element.port_num - dev->start_port; + + switch (event->event) { + case IB_EVENT_PORT_ERR: + case IB_EVENT_LID_CHANGE: + case IB_EVENT_SM_CHANGE: + case IB_EVENT_CLIENT_REREGISTER: + mcast_groups_event(&dev->port[index], MCAST_GROUP_ERROR); + break; + case IB_EVENT_PKEY_CHANGE: + mcast_groups_event(&dev->port[index], MCAST_PKEY_EVENT); + break; + default: + break; + } +} + +static void mcast_add_one(struct ib_device *device) +{ + struct mcast_device *dev; + struct mcast_port *port; + int i; + int count = 0; + + if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB) + return; + + dev = kmalloc(sizeof *dev + device->phys_port_cnt * sizeof *port, + GFP_KERNEL); + if (!dev) + return; + + if (device->node_type == RDMA_NODE_IB_SWITCH) + dev->start_port = dev->end_port = 0; + else { + dev->start_port = 1; + dev->end_port = device->phys_port_cnt; + } + + for (i = 0; i <= dev->end_port - dev->start_port; i++) { + if (rdma_port_get_link_layer(device, dev->start_port + i) != + IB_LINK_LAYER_INFINIBAND) + continue; + port = &dev->port[i]; + port->dev = dev; + port->port_num = dev->start_port + i; + spin_lock_init(&port->lock); + port->table = RB_ROOT; + init_completion(&port->comp); + atomic_set(&port->refcount, 1); + ++count; + } + + if (!count) { + kfree(dev); + return; + } + + dev->device = device; + ib_set_client_data(device, &mcast_client, dev); + + INIT_IB_EVENT_HANDLER(&dev->event_handler, device, mcast_event_handler); + ib_register_event_handler(&dev->event_handler); +} + +static void mcast_remove_one(struct ib_device *device) +{ + struct mcast_device *dev; + struct mcast_port *port; + int i; + + dev = ib_get_client_data(device, &mcast_client); + if (!dev) + return; + + ib_unregister_event_handler(&dev->event_handler); + flush_workqueue(mcast_wq); + + for (i = 0; i <= dev->end_port - dev->start_port; i++) { + if (rdma_port_get_link_layer(device, dev->start_port + i) == + IB_LINK_LAYER_INFINIBAND) { + port = &dev->port[i]; + deref_port(port); + wait_for_completion(&port->comp); + } + } + + kfree(dev); +} + +int mcast_init(void) +{ + int ret; + + mcast_wq = create_singlethread_workqueue("ib_mcast"); + if (!mcast_wq) + return -ENOMEM; + + ib_sa_register_client(&sa_client); + + ret = ib_register_client(&mcast_client); + if (ret) + goto err; + return 0; + +err: + ib_sa_unregister_client(&sa_client); + destroy_workqueue(mcast_wq); + return ret; +} + +void mcast_cleanup(void) +{ + ib_unregister_client(&mcast_client); + ib_sa_unregister_client(&sa_client); + destroy_workqueue(mcast_wq); +} diff --git a/drivers/infiniband/core/netlink.c b/drivers/infiniband/core/netlink.c new file mode 100644 index 0000000..23dd5a5 --- /dev/null +++ b/drivers/infiniband/core/netlink.c @@ -0,0 +1,216 @@ +/* + * Copyright (c) 2010 Voltaire Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#define pr_fmt(fmt) "%s:%s: " fmt, KBUILD_MODNAME, __func__ + +#include +#include +#include +#include +#include + +struct ibnl_client { + struct list_head list; + int index; + int nops; + const struct ibnl_client_cbs *cb_table; +}; + +static DEFINE_MUTEX(ibnl_mutex); +static struct sock *nls; +static LIST_HEAD(client_list); + +int ibnl_add_client(int index, int nops, + const struct ibnl_client_cbs cb_table[]) +{ + struct ibnl_client *cur; + struct ibnl_client *nl_client; + + nl_client = kmalloc(sizeof *nl_client, GFP_KERNEL); + if (!nl_client) + return -ENOMEM; + + nl_client->index = index; + nl_client->nops = nops; + nl_client->cb_table = cb_table; + + mutex_lock(&ibnl_mutex); + + list_for_each_entry(cur, &client_list, list) { + if (cur->index == index) { + pr_warn("Client for %d already exists\n", index); + mutex_unlock(&ibnl_mutex); + kfree(nl_client); + return -EINVAL; + } + } + + list_add_tail(&nl_client->list, &client_list); + + mutex_unlock(&ibnl_mutex); + + return 0; +} +EXPORT_SYMBOL(ibnl_add_client); + +int ibnl_remove_client(int index) +{ + struct ibnl_client *cur, *next; + + mutex_lock(&ibnl_mutex); + list_for_each_entry_safe(cur, next, &client_list, list) { + if (cur->index == index) { + list_del(&(cur->list)); + mutex_unlock(&ibnl_mutex); + kfree(cur); + return 0; + } + } + pr_warn("Can't remove callback for client idx %d. Not found\n", index); + mutex_unlock(&ibnl_mutex); + + return -EINVAL; +} +EXPORT_SYMBOL(ibnl_remove_client); + +void *ibnl_put_msg(struct sk_buff *skb, struct nlmsghdr **nlh, int seq, + int len, int client, int op, int flags) +{ + unsigned char *prev_tail; + + prev_tail = skb_tail_pointer(skb); + *nlh = nlmsg_put(skb, 0, seq, RDMA_NL_GET_TYPE(client, op), + len, flags); + if (!*nlh) + goto out_nlmsg_trim; + (*nlh)->nlmsg_len = skb_tail_pointer(skb) - prev_tail; + return nlmsg_data(*nlh); + +out_nlmsg_trim: + nlmsg_trim(skb, prev_tail); + return NULL; +} +EXPORT_SYMBOL(ibnl_put_msg); + +int ibnl_put_attr(struct sk_buff *skb, struct nlmsghdr *nlh, + int len, void *data, int type) +{ + unsigned char *prev_tail; + + prev_tail = skb_tail_pointer(skb); + if (nla_put(skb, type, len, data)) + goto nla_put_failure; + nlh->nlmsg_len += skb_tail_pointer(skb) - prev_tail; + return 0; + +nla_put_failure: + nlmsg_trim(skb, prev_tail - nlh->nlmsg_len); + return -EMSGSIZE; +} +EXPORT_SYMBOL(ibnl_put_attr); + +static int ibnl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) +{ + struct ibnl_client *client; + int type = nlh->nlmsg_type; + int index = RDMA_NL_GET_CLIENT(type); + int op = RDMA_NL_GET_OP(type); + + list_for_each_entry(client, &client_list, list) { + if (client->index == index) { + if (op < 0 || op >= client->nops || + !client->cb_table[op].dump) + return -EINVAL; + + { + struct netlink_dump_control c = { + .dump = client->cb_table[op].dump, + .module = client->cb_table[op].module, + }; + return netlink_dump_start(nls, skb, nlh, &c); + } + } + } + + pr_info("Index %d wasn't found in client list\n", index); + return -EINVAL; +} + +static void ibnl_rcv(struct sk_buff *skb) +{ + mutex_lock(&ibnl_mutex); + netlink_rcv_skb(skb, &ibnl_rcv_msg); + mutex_unlock(&ibnl_mutex); +} + +int ibnl_unicast(struct sk_buff *skb, struct nlmsghdr *nlh, + __u32 pid) +{ + return nlmsg_unicast(nls, skb, pid); +} +EXPORT_SYMBOL(ibnl_unicast); + +int ibnl_multicast(struct sk_buff *skb, struct nlmsghdr *nlh, + unsigned int group, gfp_t flags) +{ + return nlmsg_multicast(nls, skb, 0, group, flags); +} +EXPORT_SYMBOL(ibnl_multicast); + +int __init ibnl_init(void) +{ + struct netlink_kernel_cfg cfg = { + .input = ibnl_rcv, + }; + + nls = netlink_kernel_create(&init_net, NETLINK_RDMA, &cfg); + if (!nls) { + pr_warn("Failed to create netlink socket\n"); + return -ENOMEM; + } + + return 0; +} + +void ibnl_cleanup(void) +{ + struct ibnl_client *cur, *next; + + mutex_lock(&ibnl_mutex); + list_for_each_entry_safe(cur, next, &client_list, list) { + list_del(&(cur->list)); + kfree(cur); + } + mutex_unlock(&ibnl_mutex); + + netlink_kernel_release(nls); +} diff --git a/drivers/infiniband/core/packer.c b/drivers/infiniband/core/packer.c new file mode 100644 index 0000000..1b65986 --- /dev/null +++ b/drivers/infiniband/core/packer.c @@ -0,0 +1,203 @@ +/* + * Copyright (c) 2004 Topspin Corporation. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include + +#include + +static u64 value_read(int offset, int size, void *structure) +{ + switch (size) { + case 1: return *(u8 *) (structure + offset); + case 2: return be16_to_cpup((__be16 *) (structure + offset)); + case 4: return be32_to_cpup((__be32 *) (structure + offset)); + case 8: return be64_to_cpup((__be64 *) (structure + offset)); + default: + printk(KERN_WARNING "Field size %d bits not handled\n", size * 8); + return 0; + } +} + +/** + * ib_pack - Pack a structure into a buffer + * @desc:Array of structure field descriptions + * @desc_len:Number of entries in @desc + * @structure:Structure to pack from + * @buf:Buffer to pack into + * + * ib_pack() packs a list of structure fields into a buffer, + * controlled by the array of fields in @desc. + */ +void ib_pack(const struct ib_field *desc, + int desc_len, + void *structure, + void *buf) +{ + int i; + + for (i = 0; i < desc_len; ++i) { + if (desc[i].size_bits <= 32) { + int shift; + u32 val; + __be32 mask; + __be32 *addr; + + shift = 32 - desc[i].offset_bits - desc[i].size_bits; + if (desc[i].struct_size_bytes) + val = value_read(desc[i].struct_offset_bytes, + desc[i].struct_size_bytes, + structure) << shift; + else + val = 0; + + mask = cpu_to_be32(((1ull << desc[i].size_bits) - 1) << shift); + addr = (__be32 *) buf + desc[i].offset_words; + *addr = (*addr & ~mask) | (cpu_to_be32(val) & mask); + } else if (desc[i].size_bits <= 64) { + int shift; + u64 val; + __be64 mask; + __be64 *addr; + + shift = 64 - desc[i].offset_bits - desc[i].size_bits; + if (desc[i].struct_size_bytes) + val = value_read(desc[i].struct_offset_bytes, + desc[i].struct_size_bytes, + structure) << shift; + else + val = 0; + + mask = cpu_to_be64((~0ull >> (64 - desc[i].size_bits)) << shift); + addr = (__be64 *) ((__be32 *) buf + desc[i].offset_words); + *addr = (*addr & ~mask) | (cpu_to_be64(val) & mask); + } else { + if (desc[i].offset_bits % 8 || + desc[i].size_bits % 8) { + printk(KERN_WARNING "Structure field %s of size %d " + "bits is not byte-aligned\n", + desc[i].field_name, desc[i].size_bits); + } + + if (desc[i].struct_size_bytes) + memcpy(buf + desc[i].offset_words * 4 + + desc[i].offset_bits / 8, + structure + desc[i].struct_offset_bytes, + desc[i].size_bits / 8); + else + memset(buf + desc[i].offset_words * 4 + + desc[i].offset_bits / 8, + 0, + desc[i].size_bits / 8); + } + } +} +EXPORT_SYMBOL(ib_pack); + +static void value_write(int offset, int size, u64 val, void *structure) +{ + switch (size * 8) { + case 8: *( u8 *) (structure + offset) = val; break; + case 16: *(__be16 *) (structure + offset) = cpu_to_be16(val); break; + case 32: *(__be32 *) (structure + offset) = cpu_to_be32(val); break; + case 64: *(__be64 *) (structure + offset) = cpu_to_be64(val); break; + default: + printk(KERN_WARNING "Field size %d bits not handled\n", size * 8); + } +} + +/** + * ib_unpack - Unpack a buffer into a structure + * @desc:Array of structure field descriptions + * @desc_len:Number of entries in @desc + * @buf:Buffer to unpack from + * @structure:Structure to unpack into + * + * ib_pack() unpacks a list of structure fields from a buffer, + * controlled by the array of fields in @desc. + */ +void ib_unpack(const struct ib_field *desc, + int desc_len, + void *buf, + void *structure) +{ + int i; + + for (i = 0; i < desc_len; ++i) { + if (!desc[i].struct_size_bytes) + continue; + + if (desc[i].size_bits <= 32) { + int shift; + u32 val; + u32 mask; + __be32 *addr; + + shift = 32 - desc[i].offset_bits - desc[i].size_bits; + mask = ((1ull << desc[i].size_bits) - 1) << shift; + addr = (__be32 *) buf + desc[i].offset_words; + val = (be32_to_cpup(addr) & mask) >> shift; + value_write(desc[i].struct_offset_bytes, + desc[i].struct_size_bytes, + val, + structure); + } else if (desc[i].size_bits <= 64) { + int shift; + u64 val; + u64 mask; + __be64 *addr; + + shift = 64 - desc[i].offset_bits - desc[i].size_bits; + mask = (~0ull >> (64 - desc[i].size_bits)) << shift; + addr = (__be64 *) buf + desc[i].offset_words; + val = (be64_to_cpup(addr) & mask) >> shift; + value_write(desc[i].struct_offset_bytes, + desc[i].struct_size_bytes, + val, + structure); + } else { + if (desc[i].offset_bits % 8 || + desc[i].size_bits % 8) { + printk(KERN_WARNING "Structure field %s of size %d " + "bits is not byte-aligned\n", + desc[i].field_name, desc[i].size_bits); + } + + memcpy(structure + desc[i].struct_offset_bytes, + buf + desc[i].offset_words * 4 + + desc[i].offset_bits / 8, + desc[i].size_bits / 8); + } + } +} +EXPORT_SYMBOL(ib_unpack); diff --git a/drivers/infiniband/core/sa.h b/drivers/infiniband/core/sa.h new file mode 100644 index 0000000..b1d4bbf --- /dev/null +++ b/drivers/infiniband/core/sa.h @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2004 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Voltaire, Inc. All rights reserved. + * Copyright (c) 2006 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef SA_H +#define SA_H + +#include + +static inline void ib_sa_client_get(struct ib_sa_client *client) +{ + atomic_inc(&client->users); +} + +static inline void ib_sa_client_put(struct ib_sa_client *client) +{ + if (atomic_dec_and_test(&client->users)) + complete(&client->comp); +} + +int ib_sa_mcmember_rec_query(struct ib_sa_client *client, + struct ib_device *device, u8 port_num, + u8 method, + struct ib_sa_mcmember_rec *rec, + ib_sa_comp_mask comp_mask, + int timeout_ms, gfp_t gfp_mask, + void (*callback)(int status, + struct ib_sa_mcmember_rec *resp, + void *context), + void *context, + struct ib_sa_query **sa_query); + +int mcast_init(void); +void mcast_cleanup(void); + +#endif /* SA_H */ diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c new file mode 100644 index 0000000..c38f030 --- /dev/null +++ b/drivers/infiniband/core/sa_query.c @@ -0,0 +1,1280 @@ +/* + * Copyright (c) 2004 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Voltaire, Inc. All rights reserved. + * Copyright (c) 2006 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "sa.h" + +MODULE_AUTHOR("Roland Dreier"); +MODULE_DESCRIPTION("InfiniBand subnet administration query support"); +MODULE_LICENSE("Dual BSD/GPL"); + +struct ib_sa_sm_ah { + struct ib_ah *ah; + struct kref ref; + u16 pkey_index; + u8 src_path_mask; +}; + +struct ib_sa_port { + struct ib_mad_agent *agent; + struct ib_sa_sm_ah *sm_ah; + struct work_struct update_task; + spinlock_t ah_lock; + u8 port_num; +}; + +struct ib_sa_device { + int start_port, end_port; + struct ib_event_handler event_handler; + struct ib_sa_port port[0]; +}; + +struct ib_sa_query { + void (*callback)(struct ib_sa_query *, int, struct ib_sa_mad *); + void (*release)(struct ib_sa_query *); + struct ib_sa_client *client; + struct ib_sa_port *port; + struct ib_mad_send_buf *mad_buf; + struct ib_sa_sm_ah *sm_ah; + int id; +}; + +struct ib_sa_service_query { + void (*callback)(int, struct ib_sa_service_rec *, void *); + void *context; + struct ib_sa_query sa_query; +}; + +struct ib_sa_path_query { + void (*callback)(int, struct ib_sa_path_rec *, void *); + void *context; + struct ib_sa_query sa_query; +}; + +struct ib_sa_guidinfo_query { + void (*callback)(int, struct ib_sa_guidinfo_rec *, void *); + void *context; + struct ib_sa_query sa_query; +}; + +struct ib_sa_mcmember_query { + void (*callback)(int, struct ib_sa_mcmember_rec *, void *); + void *context; + struct ib_sa_query sa_query; +}; + +static void ib_sa_add_one(struct ib_device *device); +static void ib_sa_remove_one(struct ib_device *device); + +static struct ib_client sa_client = { + .name = "sa", + .add = ib_sa_add_one, + .remove = ib_sa_remove_one +}; + +static DEFINE_SPINLOCK(idr_lock); +static DEFINE_IDR(query_idr); + +static DEFINE_SPINLOCK(tid_lock); +static u32 tid; + +#define PATH_REC_FIELD(field) \ + .struct_offset_bytes = offsetof(struct ib_sa_path_rec, field), \ + .struct_size_bytes = sizeof ((struct ib_sa_path_rec *) 0)->field, \ + .field_name = "sa_path_rec:" #field + +static const struct ib_field path_rec_table[] = { + { PATH_REC_FIELD(service_id), + .offset_words = 0, + .offset_bits = 0, + .size_bits = 64 }, + { PATH_REC_FIELD(dgid), + .offset_words = 2, + .offset_bits = 0, + .size_bits = 128 }, + { PATH_REC_FIELD(sgid), + .offset_words = 6, + .offset_bits = 0, + .size_bits = 128 }, + { PATH_REC_FIELD(dlid), + .offset_words = 10, + .offset_bits = 0, + .size_bits = 16 }, + { PATH_REC_FIELD(slid), + .offset_words = 10, + .offset_bits = 16, + .size_bits = 16 }, + { PATH_REC_FIELD(raw_traffic), + .offset_words = 11, + .offset_bits = 0, + .size_bits = 1 }, + { RESERVED, + .offset_words = 11, + .offset_bits = 1, + .size_bits = 3 }, + { PATH_REC_FIELD(flow_label), + .offset_words = 11, + .offset_bits = 4, + .size_bits = 20 }, + { PATH_REC_FIELD(hop_limit), + .offset_words = 11, + .offset_bits = 24, + .size_bits = 8 }, + { PATH_REC_FIELD(traffic_class), + .offset_words = 12, + .offset_bits = 0, + .size_bits = 8 }, + { PATH_REC_FIELD(reversible), + .offset_words = 12, + .offset_bits = 8, + .size_bits = 1 }, + { PATH_REC_FIELD(numb_path), + .offset_words = 12, + .offset_bits = 9, + .size_bits = 7 }, + { PATH_REC_FIELD(pkey), + .offset_words = 12, + .offset_bits = 16, + .size_bits = 16 }, + { PATH_REC_FIELD(qos_class), + .offset_words = 13, + .offset_bits = 0, + .size_bits = 12 }, + { PATH_REC_FIELD(sl), + .offset_words = 13, + .offset_bits = 12, + .size_bits = 4 }, + { PATH_REC_FIELD(mtu_selector), + .offset_words = 13, + .offset_bits = 16, + .size_bits = 2 }, + { PATH_REC_FIELD(mtu), + .offset_words = 13, + .offset_bits = 18, + .size_bits = 6 }, + { PATH_REC_FIELD(rate_selector), + .offset_words = 13, + .offset_bits = 24, + .size_bits = 2 }, + { PATH_REC_FIELD(rate), + .offset_words = 13, + .offset_bits = 26, + .size_bits = 6 }, + { PATH_REC_FIELD(packet_life_time_selector), + .offset_words = 14, + .offset_bits = 0, + .size_bits = 2 }, + { PATH_REC_FIELD(packet_life_time), + .offset_words = 14, + .offset_bits = 2, + .size_bits = 6 }, + { PATH_REC_FIELD(preference), + .offset_words = 14, + .offset_bits = 8, + .size_bits = 8 }, + { RESERVED, + .offset_words = 14, + .offset_bits = 16, + .size_bits = 48 }, +}; + +#define MCMEMBER_REC_FIELD(field) \ + .struct_offset_bytes = offsetof(struct ib_sa_mcmember_rec, field), \ + .struct_size_bytes = sizeof ((struct ib_sa_mcmember_rec *) 0)->field, \ + .field_name = "sa_mcmember_rec:" #field + +static const struct ib_field mcmember_rec_table[] = { + { MCMEMBER_REC_FIELD(mgid), + .offset_words = 0, + .offset_bits = 0, + .size_bits = 128 }, + { MCMEMBER_REC_FIELD(port_gid), + .offset_words = 4, + .offset_bits = 0, + .size_bits = 128 }, + { MCMEMBER_REC_FIELD(qkey), + .offset_words = 8, + .offset_bits = 0, + .size_bits = 32 }, + { MCMEMBER_REC_FIELD(mlid), + .offset_words = 9, + .offset_bits = 0, + .size_bits = 16 }, + { MCMEMBER_REC_FIELD(mtu_selector), + .offset_words = 9, + .offset_bits = 16, + .size_bits = 2 }, + { MCMEMBER_REC_FIELD(mtu), + .offset_words = 9, + .offset_bits = 18, + .size_bits = 6 }, + { MCMEMBER_REC_FIELD(traffic_class), + .offset_words = 9, + .offset_bits = 24, + .size_bits = 8 }, + { MCMEMBER_REC_FIELD(pkey), + .offset_words = 10, + .offset_bits = 0, + .size_bits = 16 }, + { MCMEMBER_REC_FIELD(rate_selector), + .offset_words = 10, + .offset_bits = 16, + .size_bits = 2 }, + { MCMEMBER_REC_FIELD(rate), + .offset_words = 10, + .offset_bits = 18, + .size_bits = 6 }, + { MCMEMBER_REC_FIELD(packet_life_time_selector), + .offset_words = 10, + .offset_bits = 24, + .size_bits = 2 }, + { MCMEMBER_REC_FIELD(packet_life_time), + .offset_words = 10, + .offset_bits = 26, + .size_bits = 6 }, + { MCMEMBER_REC_FIELD(sl), + .offset_words = 11, + .offset_bits = 0, + .size_bits = 4 }, + { MCMEMBER_REC_FIELD(flow_label), + .offset_words = 11, + .offset_bits = 4, + .size_bits = 20 }, + { MCMEMBER_REC_FIELD(hop_limit), + .offset_words = 11, + .offset_bits = 24, + .size_bits = 8 }, + { MCMEMBER_REC_FIELD(scope), + .offset_words = 12, + .offset_bits = 0, + .size_bits = 4 }, + { MCMEMBER_REC_FIELD(join_state), + .offset_words = 12, + .offset_bits = 4, + .size_bits = 4 }, + { MCMEMBER_REC_FIELD(proxy_join), + .offset_words = 12, + .offset_bits = 8, + .size_bits = 1 }, + { RESERVED, + .offset_words = 12, + .offset_bits = 9, + .size_bits = 23 }, +}; + +#define SERVICE_REC_FIELD(field) \ + .struct_offset_bytes = offsetof(struct ib_sa_service_rec, field), \ + .struct_size_bytes = sizeof ((struct ib_sa_service_rec *) 0)->field, \ + .field_name = "sa_service_rec:" #field + +static const struct ib_field service_rec_table[] = { + { SERVICE_REC_FIELD(id), + .offset_words = 0, + .offset_bits = 0, + .size_bits = 64 }, + { SERVICE_REC_FIELD(gid), + .offset_words = 2, + .offset_bits = 0, + .size_bits = 128 }, + { SERVICE_REC_FIELD(pkey), + .offset_words = 6, + .offset_bits = 0, + .size_bits = 16 }, + { SERVICE_REC_FIELD(lease), + .offset_words = 7, + .offset_bits = 0, + .size_bits = 32 }, + { SERVICE_REC_FIELD(key), + .offset_words = 8, + .offset_bits = 0, + .size_bits = 128 }, + { SERVICE_REC_FIELD(name), + .offset_words = 12, + .offset_bits = 0, + .size_bits = 64*8 }, + { SERVICE_REC_FIELD(data8), + .offset_words = 28, + .offset_bits = 0, + .size_bits = 16*8 }, + { SERVICE_REC_FIELD(data16), + .offset_words = 32, + .offset_bits = 0, + .size_bits = 8*16 }, + { SERVICE_REC_FIELD(data32), + .offset_words = 36, + .offset_bits = 0, + .size_bits = 4*32 }, + { SERVICE_REC_FIELD(data64), + .offset_words = 40, + .offset_bits = 0, + .size_bits = 2*64 }, +}; + +#define GUIDINFO_REC_FIELD(field) \ + .struct_offset_bytes = offsetof(struct ib_sa_guidinfo_rec, field), \ + .struct_size_bytes = sizeof((struct ib_sa_guidinfo_rec *) 0)->field, \ + .field_name = "sa_guidinfo_rec:" #field + +static const struct ib_field guidinfo_rec_table[] = { + { GUIDINFO_REC_FIELD(lid), + .offset_words = 0, + .offset_bits = 0, + .size_bits = 16 }, + { GUIDINFO_REC_FIELD(block_num), + .offset_words = 0, + .offset_bits = 16, + .size_bits = 8 }, + { GUIDINFO_REC_FIELD(res1), + .offset_words = 0, + .offset_bits = 24, + .size_bits = 8 }, + { GUIDINFO_REC_FIELD(res2), + .offset_words = 1, + .offset_bits = 0, + .size_bits = 32 }, + { GUIDINFO_REC_FIELD(guid_info_list), + .offset_words = 2, + .offset_bits = 0, + .size_bits = 512 }, +}; + +static void free_sm_ah(struct kref *kref) +{ + struct ib_sa_sm_ah *sm_ah = container_of(kref, struct ib_sa_sm_ah, ref); + + ib_destroy_ah(sm_ah->ah); + kfree(sm_ah); +} + +static void update_sm_ah(struct work_struct *work) +{ + struct ib_sa_port *port = + container_of(work, struct ib_sa_port, update_task); + struct ib_sa_sm_ah *new_ah; + struct ib_port_attr port_attr; + struct ib_ah_attr ah_attr; + + if (ib_query_port(port->agent->device, port->port_num, &port_attr)) { + printk(KERN_WARNING "Couldn't query port\n"); + return; + } + + new_ah = kmalloc(sizeof *new_ah, GFP_KERNEL); + if (!new_ah) { + printk(KERN_WARNING "Couldn't allocate new SM AH\n"); + return; + } + + kref_init(&new_ah->ref); + new_ah->src_path_mask = (1 << port_attr.lmc) - 1; + + new_ah->pkey_index = 0; + if (ib_find_pkey(port->agent->device, port->port_num, + IB_DEFAULT_PKEY_FULL, &new_ah->pkey_index)) + printk(KERN_ERR "Couldn't find index for default PKey\n"); + + memset(&ah_attr, 0, sizeof ah_attr); + ah_attr.dlid = port_attr.sm_lid; + ah_attr.sl = port_attr.sm_sl; + ah_attr.port_num = port->port_num; + + new_ah->ah = ib_create_ah(port->agent->qp->pd, &ah_attr); + if (IS_ERR(new_ah->ah)) { + printk(KERN_WARNING "Couldn't create new SM AH\n"); + kfree(new_ah); + return; + } + + spin_lock_irq(&port->ah_lock); + if (port->sm_ah) + kref_put(&port->sm_ah->ref, free_sm_ah); + port->sm_ah = new_ah; + spin_unlock_irq(&port->ah_lock); + +} + +static void ib_sa_event(struct ib_event_handler *handler, struct ib_event *event) +{ + if (event->event == IB_EVENT_PORT_ERR || + event->event == IB_EVENT_PORT_ACTIVE || + event->event == IB_EVENT_LID_CHANGE || + event->event == IB_EVENT_PKEY_CHANGE || + event->event == IB_EVENT_SM_CHANGE || + event->event == IB_EVENT_CLIENT_REREGISTER) { + unsigned long flags; + struct ib_sa_device *sa_dev = + container_of(handler, typeof(*sa_dev), event_handler); + struct ib_sa_port *port = + &sa_dev->port[event->element.port_num - sa_dev->start_port]; + + if (rdma_port_get_link_layer(handler->device, port->port_num) != IB_LINK_LAYER_INFINIBAND) + return; + + spin_lock_irqsave(&port->ah_lock, flags); + if (port->sm_ah) + kref_put(&port->sm_ah->ref, free_sm_ah); + port->sm_ah = NULL; + spin_unlock_irqrestore(&port->ah_lock, flags); + + queue_work(ib_wq, &sa_dev->port[event->element.port_num - + sa_dev->start_port].update_task); + } +} + +void ib_sa_register_client(struct ib_sa_client *client) +{ + atomic_set(&client->users, 1); + init_completion(&client->comp); +} +EXPORT_SYMBOL(ib_sa_register_client); + +void ib_sa_unregister_client(struct ib_sa_client *client) +{ + ib_sa_client_put(client); + wait_for_completion(&client->comp); +} +EXPORT_SYMBOL(ib_sa_unregister_client); + +/** + * ib_sa_cancel_query - try to cancel an SA query + * @id:ID of query to cancel + * @query:query pointer to cancel + * + * Try to cancel an SA query. If the id and query don't match up or + * the query has already completed, nothing is done. Otherwise the + * query is canceled and will complete with a status of -EINTR. + */ +void ib_sa_cancel_query(int id, struct ib_sa_query *query) +{ + unsigned long flags; + struct ib_mad_agent *agent; + struct ib_mad_send_buf *mad_buf; + + spin_lock_irqsave(&idr_lock, flags); + if (idr_find(&query_idr, id) != query) { + spin_unlock_irqrestore(&idr_lock, flags); + return; + } + agent = query->port->agent; + mad_buf = query->mad_buf; + spin_unlock_irqrestore(&idr_lock, flags); + + ib_cancel_mad(agent, mad_buf); +} +EXPORT_SYMBOL(ib_sa_cancel_query); + +static u8 get_src_path_mask(struct ib_device *device, u8 port_num) +{ + struct ib_sa_device *sa_dev; + struct ib_sa_port *port; + unsigned long flags; + u8 src_path_mask; + + sa_dev = ib_get_client_data(device, &sa_client); + if (!sa_dev) + return 0x7f; + + port = &sa_dev->port[port_num - sa_dev->start_port]; + spin_lock_irqsave(&port->ah_lock, flags); + src_path_mask = port->sm_ah ? port->sm_ah->src_path_mask : 0x7f; + spin_unlock_irqrestore(&port->ah_lock, flags); + + return src_path_mask; +} + +int ib_init_ah_from_path(struct ib_device *device, u8 port_num, + struct ib_sa_path_rec *rec, struct ib_ah_attr *ah_attr) +{ + int ret; + u16 gid_index; + int force_grh; + + memset(ah_attr, 0, sizeof *ah_attr); + ah_attr->dlid = be16_to_cpu(rec->dlid); + ah_attr->sl = rec->sl; + ah_attr->src_path_bits = be16_to_cpu(rec->slid) & + get_src_path_mask(device, port_num); + ah_attr->port_num = port_num; + ah_attr->static_rate = rec->rate; + + force_grh = rdma_port_get_link_layer(device, port_num) == IB_LINK_LAYER_ETHERNET; + + if (rec->hop_limit > 1 || force_grh) { + ah_attr->ah_flags = IB_AH_GRH; + ah_attr->grh.dgid = rec->dgid; + + ret = ib_find_cached_gid(device, &rec->sgid, &port_num, + &gid_index); + if (ret) + return ret; + + ah_attr->grh.sgid_index = gid_index; + ah_attr->grh.flow_label = be32_to_cpu(rec->flow_label); + ah_attr->grh.hop_limit = rec->hop_limit; + ah_attr->grh.traffic_class = rec->traffic_class; + } + if (force_grh) { + memcpy(ah_attr->dmac, rec->dmac, ETH_ALEN); + ah_attr->vlan_id = rec->vlan_id; + } else { + ah_attr->vlan_id = 0xffff; + } + + return 0; +} +EXPORT_SYMBOL(ib_init_ah_from_path); + +static int alloc_mad(struct ib_sa_query *query, gfp_t gfp_mask) +{ + unsigned long flags; + + spin_lock_irqsave(&query->port->ah_lock, flags); + if (!query->port->sm_ah) { + spin_unlock_irqrestore(&query->port->ah_lock, flags); + return -EAGAIN; + } + kref_get(&query->port->sm_ah->ref); + query->sm_ah = query->port->sm_ah; + spin_unlock_irqrestore(&query->port->ah_lock, flags); + + query->mad_buf = ib_create_send_mad(query->port->agent, 1, + query->sm_ah->pkey_index, + 0, IB_MGMT_SA_HDR, IB_MGMT_SA_DATA, + gfp_mask); + if (IS_ERR(query->mad_buf)) { + kref_put(&query->sm_ah->ref, free_sm_ah); + return -ENOMEM; + } + + query->mad_buf->ah = query->sm_ah->ah; + + return 0; +} + +static void free_mad(struct ib_sa_query *query) +{ + ib_free_send_mad(query->mad_buf); + kref_put(&query->sm_ah->ref, free_sm_ah); +} + +static void init_mad(struct ib_sa_mad *mad, struct ib_mad_agent *agent) +{ + unsigned long flags; + + memset(mad, 0, sizeof *mad); + + mad->mad_hdr.base_version = IB_MGMT_BASE_VERSION; + mad->mad_hdr.mgmt_class = IB_MGMT_CLASS_SUBN_ADM; + mad->mad_hdr.class_version = IB_SA_CLASS_VERSION; + + spin_lock_irqsave(&tid_lock, flags); + mad->mad_hdr.tid = + cpu_to_be64(((u64) agent->hi_tid) << 32 | tid++); + spin_unlock_irqrestore(&tid_lock, flags); +} + +static int send_mad(struct ib_sa_query *query, int timeout_ms, gfp_t gfp_mask) +{ + bool preload = !!(gfp_mask & __GFP_WAIT); + unsigned long flags; + int ret, id; + + if (preload) + idr_preload(gfp_mask); + spin_lock_irqsave(&idr_lock, flags); + + id = idr_alloc(&query_idr, query, 0, 0, GFP_NOWAIT); + + spin_unlock_irqrestore(&idr_lock, flags); + if (preload) + idr_preload_end(); + if (id < 0) + return id; + + query->mad_buf->timeout_ms = timeout_ms; + query->mad_buf->context[0] = query; + query->id = id; + + ret = ib_post_send_mad(query->mad_buf, NULL); + if (ret) { + spin_lock_irqsave(&idr_lock, flags); + idr_remove(&query_idr, id); + spin_unlock_irqrestore(&idr_lock, flags); + } + + /* + * It's not safe to dereference query any more, because the + * send may already have completed and freed the query in + * another context. + */ + return ret ? ret : id; +} + +void ib_sa_unpack_path(void *attribute, struct ib_sa_path_rec *rec) +{ + ib_unpack(path_rec_table, ARRAY_SIZE(path_rec_table), attribute, rec); +} +EXPORT_SYMBOL(ib_sa_unpack_path); + +void ib_sa_pack_path(struct ib_sa_path_rec *rec, void *attribute) +{ + ib_pack(path_rec_table, ARRAY_SIZE(path_rec_table), rec, attribute); +} +EXPORT_SYMBOL(ib_sa_pack_path); + +static void ib_sa_path_rec_callback(struct ib_sa_query *sa_query, + int status, + struct ib_sa_mad *mad) +{ + struct ib_sa_path_query *query = + container_of(sa_query, struct ib_sa_path_query, sa_query); + + if (mad) { + struct ib_sa_path_rec rec; + + ib_unpack(path_rec_table, ARRAY_SIZE(path_rec_table), + mad->data, &rec); + rec.vlan_id = 0xffff; + memset(rec.dmac, 0, ETH_ALEN); + memset(rec.smac, 0, ETH_ALEN); + query->callback(status, &rec, query->context); + } else + query->callback(status, NULL, query->context); +} + +static void ib_sa_path_rec_release(struct ib_sa_query *sa_query) +{ + kfree(container_of(sa_query, struct ib_sa_path_query, sa_query)); +} + +/** + * ib_sa_path_rec_get - Start a Path get query + * @client:SA client + * @device:device to send query on + * @port_num: port number to send query on + * @rec:Path Record to send in query + * @comp_mask:component mask to send in query + * @timeout_ms:time to wait for response + * @gfp_mask:GFP mask to use for internal allocations + * @callback:function called when query completes, times out or is + * canceled + * @context:opaque user context passed to callback + * @sa_query:query context, used to cancel query + * + * Send a Path Record Get query to the SA to look up a path. The + * callback function will be called when the query completes (or + * fails); status is 0 for a successful response, -EINTR if the query + * is canceled, -ETIMEDOUT is the query timed out, or -EIO if an error + * occurred sending the query. The resp parameter of the callback is + * only valid if status is 0. + * + * If the return value of ib_sa_path_rec_get() is negative, it is an + * error code. Otherwise it is a query ID that can be used to cancel + * the query. + */ +int ib_sa_path_rec_get(struct ib_sa_client *client, + struct ib_device *device, u8 port_num, + struct ib_sa_path_rec *rec, + ib_sa_comp_mask comp_mask, + int timeout_ms, gfp_t gfp_mask, + void (*callback)(int status, + struct ib_sa_path_rec *resp, + void *context), + void *context, + struct ib_sa_query **sa_query) +{ + struct ib_sa_path_query *query; + struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client); + struct ib_sa_port *port; + struct ib_mad_agent *agent; + struct ib_sa_mad *mad; + int ret; + + if (!sa_dev) + return -ENODEV; + + port = &sa_dev->port[port_num - sa_dev->start_port]; + agent = port->agent; + + query = kmalloc(sizeof *query, gfp_mask); + if (!query) + return -ENOMEM; + + query->sa_query.port = port; + ret = alloc_mad(&query->sa_query, gfp_mask); + if (ret) + goto err1; + + ib_sa_client_get(client); + query->sa_query.client = client; + query->callback = callback; + query->context = context; + + mad = query->sa_query.mad_buf->mad; + init_mad(mad, agent); + + query->sa_query.callback = callback ? ib_sa_path_rec_callback : NULL; + query->sa_query.release = ib_sa_path_rec_release; + mad->mad_hdr.method = IB_MGMT_METHOD_GET; + mad->mad_hdr.attr_id = cpu_to_be16(IB_SA_ATTR_PATH_REC); + mad->sa_hdr.comp_mask = comp_mask; + + ib_pack(path_rec_table, ARRAY_SIZE(path_rec_table), rec, mad->data); + + *sa_query = &query->sa_query; + + ret = send_mad(&query->sa_query, timeout_ms, gfp_mask); + if (ret < 0) + goto err2; + + return ret; + +err2: + *sa_query = NULL; + ib_sa_client_put(query->sa_query.client); + free_mad(&query->sa_query); + +err1: + kfree(query); + return ret; +} +EXPORT_SYMBOL(ib_sa_path_rec_get); + +static void ib_sa_service_rec_callback(struct ib_sa_query *sa_query, + int status, + struct ib_sa_mad *mad) +{ + struct ib_sa_service_query *query = + container_of(sa_query, struct ib_sa_service_query, sa_query); + + if (mad) { + struct ib_sa_service_rec rec; + + ib_unpack(service_rec_table, ARRAY_SIZE(service_rec_table), + mad->data, &rec); + query->callback(status, &rec, query->context); + } else + query->callback(status, NULL, query->context); +} + +static void ib_sa_service_rec_release(struct ib_sa_query *sa_query) +{ + kfree(container_of(sa_query, struct ib_sa_service_query, sa_query)); +} + +/** + * ib_sa_service_rec_query - Start Service Record operation + * @client:SA client + * @device:device to send request on + * @port_num: port number to send request on + * @method:SA method - should be get, set, or delete + * @rec:Service Record to send in request + * @comp_mask:component mask to send in request + * @timeout_ms:time to wait for response + * @gfp_mask:GFP mask to use for internal allocations + * @callback:function called when request completes, times out or is + * canceled + * @context:opaque user context passed to callback + * @sa_query:request context, used to cancel request + * + * Send a Service Record set/get/delete to the SA to register, + * unregister or query a service record. + * The callback function will be called when the request completes (or + * fails); status is 0 for a successful response, -EINTR if the query + * is canceled, -ETIMEDOUT is the query timed out, or -EIO if an error + * occurred sending the query. The resp parameter of the callback is + * only valid if status is 0. + * + * If the return value of ib_sa_service_rec_query() is negative, it is an + * error code. Otherwise it is a request ID that can be used to cancel + * the query. + */ +int ib_sa_service_rec_query(struct ib_sa_client *client, + struct ib_device *device, u8 port_num, u8 method, + struct ib_sa_service_rec *rec, + ib_sa_comp_mask comp_mask, + int timeout_ms, gfp_t gfp_mask, + void (*callback)(int status, + struct ib_sa_service_rec *resp, + void *context), + void *context, + struct ib_sa_query **sa_query) +{ + struct ib_sa_service_query *query; + struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client); + struct ib_sa_port *port; + struct ib_mad_agent *agent; + struct ib_sa_mad *mad; + int ret; + + if (!sa_dev) + return -ENODEV; + + port = &sa_dev->port[port_num - sa_dev->start_port]; + agent = port->agent; + + if (method != IB_MGMT_METHOD_GET && + method != IB_MGMT_METHOD_SET && + method != IB_SA_METHOD_DELETE) + return -EINVAL; + + query = kmalloc(sizeof *query, gfp_mask); + if (!query) + return -ENOMEM; + + query->sa_query.port = port; + ret = alloc_mad(&query->sa_query, gfp_mask); + if (ret) + goto err1; + + ib_sa_client_get(client); + query->sa_query.client = client; + query->callback = callback; + query->context = context; + + mad = query->sa_query.mad_buf->mad; + init_mad(mad, agent); + + query->sa_query.callback = callback ? ib_sa_service_rec_callback : NULL; + query->sa_query.release = ib_sa_service_rec_release; + mad->mad_hdr.method = method; + mad->mad_hdr.attr_id = cpu_to_be16(IB_SA_ATTR_SERVICE_REC); + mad->sa_hdr.comp_mask = comp_mask; + + ib_pack(service_rec_table, ARRAY_SIZE(service_rec_table), + rec, mad->data); + + *sa_query = &query->sa_query; + + ret = send_mad(&query->sa_query, timeout_ms, gfp_mask); + if (ret < 0) + goto err2; + + return ret; + +err2: + *sa_query = NULL; + ib_sa_client_put(query->sa_query.client); + free_mad(&query->sa_query); + +err1: + kfree(query); + return ret; +} +EXPORT_SYMBOL(ib_sa_service_rec_query); + +static void ib_sa_mcmember_rec_callback(struct ib_sa_query *sa_query, + int status, + struct ib_sa_mad *mad) +{ + struct ib_sa_mcmember_query *query = + container_of(sa_query, struct ib_sa_mcmember_query, sa_query); + + if (mad) { + struct ib_sa_mcmember_rec rec; + + ib_unpack(mcmember_rec_table, ARRAY_SIZE(mcmember_rec_table), + mad->data, &rec); + query->callback(status, &rec, query->context); + } else + query->callback(status, NULL, query->context); +} + +static void ib_sa_mcmember_rec_release(struct ib_sa_query *sa_query) +{ + kfree(container_of(sa_query, struct ib_sa_mcmember_query, sa_query)); +} + +int ib_sa_mcmember_rec_query(struct ib_sa_client *client, + struct ib_device *device, u8 port_num, + u8 method, + struct ib_sa_mcmember_rec *rec, + ib_sa_comp_mask comp_mask, + int timeout_ms, gfp_t gfp_mask, + void (*callback)(int status, + struct ib_sa_mcmember_rec *resp, + void *context), + void *context, + struct ib_sa_query **sa_query) +{ + struct ib_sa_mcmember_query *query; + struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client); + struct ib_sa_port *port; + struct ib_mad_agent *agent; + struct ib_sa_mad *mad; + int ret; + + if (!sa_dev) + return -ENODEV; + + port = &sa_dev->port[port_num - sa_dev->start_port]; + agent = port->agent; + + query = kmalloc(sizeof *query, gfp_mask); + if (!query) + return -ENOMEM; + + query->sa_query.port = port; + ret = alloc_mad(&query->sa_query, gfp_mask); + if (ret) + goto err1; + + ib_sa_client_get(client); + query->sa_query.client = client; + query->callback = callback; + query->context = context; + + mad = query->sa_query.mad_buf->mad; + init_mad(mad, agent); + + query->sa_query.callback = callback ? ib_sa_mcmember_rec_callback : NULL; + query->sa_query.release = ib_sa_mcmember_rec_release; + mad->mad_hdr.method = method; + mad->mad_hdr.attr_id = cpu_to_be16(IB_SA_ATTR_MC_MEMBER_REC); + mad->sa_hdr.comp_mask = comp_mask; + + ib_pack(mcmember_rec_table, ARRAY_SIZE(mcmember_rec_table), + rec, mad->data); + + *sa_query = &query->sa_query; + + ret = send_mad(&query->sa_query, timeout_ms, gfp_mask); + if (ret < 0) + goto err2; + + return ret; + +err2: + *sa_query = NULL; + ib_sa_client_put(query->sa_query.client); + free_mad(&query->sa_query); + +err1: + kfree(query); + return ret; +} + +/* Support GuidInfoRecord */ +static void ib_sa_guidinfo_rec_callback(struct ib_sa_query *sa_query, + int status, + struct ib_sa_mad *mad) +{ + struct ib_sa_guidinfo_query *query = + container_of(sa_query, struct ib_sa_guidinfo_query, sa_query); + + if (mad) { + struct ib_sa_guidinfo_rec rec; + + ib_unpack(guidinfo_rec_table, ARRAY_SIZE(guidinfo_rec_table), + mad->data, &rec); + query->callback(status, &rec, query->context); + } else + query->callback(status, NULL, query->context); +} + +static void ib_sa_guidinfo_rec_release(struct ib_sa_query *sa_query) +{ + kfree(container_of(sa_query, struct ib_sa_guidinfo_query, sa_query)); +} + +int ib_sa_guid_info_rec_query(struct ib_sa_client *client, + struct ib_device *device, u8 port_num, + struct ib_sa_guidinfo_rec *rec, + ib_sa_comp_mask comp_mask, u8 method, + int timeout_ms, gfp_t gfp_mask, + void (*callback)(int status, + struct ib_sa_guidinfo_rec *resp, + void *context), + void *context, + struct ib_sa_query **sa_query) +{ + struct ib_sa_guidinfo_query *query; + struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client); + struct ib_sa_port *port; + struct ib_mad_agent *agent; + struct ib_sa_mad *mad; + int ret; + + if (!sa_dev) + return -ENODEV; + + if (method != IB_MGMT_METHOD_GET && + method != IB_MGMT_METHOD_SET && + method != IB_SA_METHOD_DELETE) { + return -EINVAL; + } + + port = &sa_dev->port[port_num - sa_dev->start_port]; + agent = port->agent; + + query = kmalloc(sizeof *query, gfp_mask); + if (!query) + return -ENOMEM; + + query->sa_query.port = port; + ret = alloc_mad(&query->sa_query, gfp_mask); + if (ret) + goto err1; + + ib_sa_client_get(client); + query->sa_query.client = client; + query->callback = callback; + query->context = context; + + mad = query->sa_query.mad_buf->mad; + init_mad(mad, agent); + + query->sa_query.callback = callback ? ib_sa_guidinfo_rec_callback : NULL; + query->sa_query.release = ib_sa_guidinfo_rec_release; + + mad->mad_hdr.method = method; + mad->mad_hdr.attr_id = cpu_to_be16(IB_SA_ATTR_GUID_INFO_REC); + mad->sa_hdr.comp_mask = comp_mask; + + ib_pack(guidinfo_rec_table, ARRAY_SIZE(guidinfo_rec_table), rec, + mad->data); + + *sa_query = &query->sa_query; + + ret = send_mad(&query->sa_query, timeout_ms, gfp_mask); + if (ret < 0) + goto err2; + + return ret; + +err2: + *sa_query = NULL; + ib_sa_client_put(query->sa_query.client); + free_mad(&query->sa_query); + +err1: + kfree(query); + return ret; +} +EXPORT_SYMBOL(ib_sa_guid_info_rec_query); + +static void send_handler(struct ib_mad_agent *agent, + struct ib_mad_send_wc *mad_send_wc) +{ + struct ib_sa_query *query = mad_send_wc->send_buf->context[0]; + unsigned long flags; + + if (query->callback) + switch (mad_send_wc->status) { + case IB_WC_SUCCESS: + /* No callback -- already got recv */ + break; + case IB_WC_RESP_TIMEOUT_ERR: + query->callback(query, -ETIMEDOUT, NULL); + break; + case IB_WC_WR_FLUSH_ERR: + query->callback(query, -EINTR, NULL); + break; + default: + query->callback(query, -EIO, NULL); + break; + } + + spin_lock_irqsave(&idr_lock, flags); + idr_remove(&query_idr, query->id); + spin_unlock_irqrestore(&idr_lock, flags); + + free_mad(query); + ib_sa_client_put(query->client); + query->release(query); +} + +static void recv_handler(struct ib_mad_agent *mad_agent, + struct ib_mad_recv_wc *mad_recv_wc) +{ + struct ib_sa_query *query; + struct ib_mad_send_buf *mad_buf; + + mad_buf = (void *) (unsigned long) mad_recv_wc->wc->wr_id; + query = mad_buf->context[0]; + + if (query->callback) { + if (mad_recv_wc->wc->status == IB_WC_SUCCESS) + query->callback(query, + mad_recv_wc->recv_buf.mad->mad_hdr.status ? + -EINVAL : 0, + (struct ib_sa_mad *) mad_recv_wc->recv_buf.mad); + else + query->callback(query, -EIO, NULL); + } + + ib_free_recv_mad(mad_recv_wc); +} + +static void ib_sa_add_one(struct ib_device *device) +{ + struct ib_sa_device *sa_dev; + int s, e, i; + + if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB) + return; + + if (device->node_type == RDMA_NODE_IB_SWITCH) + s = e = 0; + else { + s = 1; + e = device->phys_port_cnt; + } + + sa_dev = kzalloc(sizeof *sa_dev + + (e - s + 1) * sizeof (struct ib_sa_port), + GFP_KERNEL); + if (!sa_dev) + return; + + sa_dev->start_port = s; + sa_dev->end_port = e; + + for (i = 0; i <= e - s; ++i) { + spin_lock_init(&sa_dev->port[i].ah_lock); + if (rdma_port_get_link_layer(device, i + 1) != IB_LINK_LAYER_INFINIBAND) + continue; + + sa_dev->port[i].sm_ah = NULL; + sa_dev->port[i].port_num = i + s; + + sa_dev->port[i].agent = + ib_register_mad_agent(device, i + s, IB_QPT_GSI, + NULL, 0, send_handler, + recv_handler, sa_dev, 0); + if (IS_ERR(sa_dev->port[i].agent)) + goto err; + + INIT_WORK(&sa_dev->port[i].update_task, update_sm_ah); + } + + ib_set_client_data(device, &sa_client, sa_dev); + + /* + * We register our event handler after everything is set up, + * and then update our cached info after the event handler is + * registered to avoid any problems if a port changes state + * during our initialization. + */ + + INIT_IB_EVENT_HANDLER(&sa_dev->event_handler, device, ib_sa_event); + if (ib_register_event_handler(&sa_dev->event_handler)) + goto err; + + for (i = 0; i <= e - s; ++i) + if (rdma_port_get_link_layer(device, i + 1) == IB_LINK_LAYER_INFINIBAND) + update_sm_ah(&sa_dev->port[i].update_task); + + return; + +err: + while (--i >= 0) + if (rdma_port_get_link_layer(device, i + 1) == IB_LINK_LAYER_INFINIBAND) + ib_unregister_mad_agent(sa_dev->port[i].agent); + + kfree(sa_dev); + + return; +} + +static void ib_sa_remove_one(struct ib_device *device) +{ + struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client); + int i; + + if (!sa_dev) + return; + + ib_unregister_event_handler(&sa_dev->event_handler); + + flush_workqueue(ib_wq); + + for (i = 0; i <= sa_dev->end_port - sa_dev->start_port; ++i) { + if (rdma_port_get_link_layer(device, i + 1) == IB_LINK_LAYER_INFINIBAND) { + ib_unregister_mad_agent(sa_dev->port[i].agent); + if (sa_dev->port[i].sm_ah) + kref_put(&sa_dev->port[i].sm_ah->ref, free_sm_ah); + } + + } + + kfree(sa_dev); +} + +static int __init ib_sa_init(void) +{ + int ret; + + get_random_bytes(&tid, sizeof tid); + + ret = ib_register_client(&sa_client); + if (ret) { + printk(KERN_ERR "Couldn't register ib_sa client\n"); + goto err1; + } + + ret = mcast_init(); + if (ret) { + printk(KERN_ERR "Couldn't initialize multicast handling\n"); + goto err2; + } + + return 0; +err2: + ib_unregister_client(&sa_client); +err1: + return ret; +} + +static void __exit ib_sa_cleanup(void) +{ + mcast_cleanup(); + ib_unregister_client(&sa_client); + idr_destroy(&query_idr); +} + +module_init(ib_sa_init); +module_exit(ib_sa_cleanup); diff --git a/drivers/infiniband/core/smi.c b/drivers/infiniband/core/smi.c new file mode 100644 index 0000000..5855e44 --- /dev/null +++ b/drivers/infiniband/core/smi.c @@ -0,0 +1,253 @@ +/* + * Copyright (c) 2004, 2005 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2004, 2005 Infinicon Corporation. All rights reserved. + * Copyright (c) 2004, 2005 Intel Corporation. All rights reserved. + * Copyright (c) 2004, 2005 Topspin Corporation. All rights reserved. + * Copyright (c) 2004-2007 Voltaire Corporation. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include +#include "smi.h" + +/* + * Fixup a directed route SMP for sending + * Return 0 if the SMP should be discarded + */ +enum smi_action smi_handle_dr_smp_send(struct ib_smp *smp, + u8 node_type, int port_num) +{ + u8 hop_ptr, hop_cnt; + + hop_ptr = smp->hop_ptr; + hop_cnt = smp->hop_cnt; + + /* See section 14.2.2.2, Vol 1 IB spec */ + /* C14-6 -- valid hop_cnt values are from 0 to 63 */ + if (hop_cnt >= IB_SMP_MAX_PATH_HOPS) + return IB_SMI_DISCARD; + + if (!ib_get_smp_direction(smp)) { + /* C14-9:1 */ + if (hop_cnt && hop_ptr == 0) { + smp->hop_ptr++; + return (smp->initial_path[smp->hop_ptr] == + port_num ? IB_SMI_HANDLE : IB_SMI_DISCARD); + } + + /* C14-9:2 */ + if (hop_ptr && hop_ptr < hop_cnt) { + if (node_type != RDMA_NODE_IB_SWITCH) + return IB_SMI_DISCARD; + + /* smp->return_path set when received */ + smp->hop_ptr++; + return (smp->initial_path[smp->hop_ptr] == + port_num ? IB_SMI_HANDLE : IB_SMI_DISCARD); + } + + /* C14-9:3 -- We're at the end of the DR segment of path */ + if (hop_ptr == hop_cnt) { + /* smp->return_path set when received */ + smp->hop_ptr++; + return (node_type == RDMA_NODE_IB_SWITCH || + smp->dr_dlid == IB_LID_PERMISSIVE ? + IB_SMI_HANDLE : IB_SMI_DISCARD); + } + + /* C14-9:4 -- hop_ptr = hop_cnt + 1 -> give to SMA/SM */ + /* C14-9:5 -- Fail unreasonable hop pointer */ + return (hop_ptr == hop_cnt + 1 ? IB_SMI_HANDLE : IB_SMI_DISCARD); + + } else { + /* C14-13:1 */ + if (hop_cnt && hop_ptr == hop_cnt + 1) { + smp->hop_ptr--; + return (smp->return_path[smp->hop_ptr] == + port_num ? IB_SMI_HANDLE : IB_SMI_DISCARD); + } + + /* C14-13:2 */ + if (2 <= hop_ptr && hop_ptr <= hop_cnt) { + if (node_type != RDMA_NODE_IB_SWITCH) + return IB_SMI_DISCARD; + + smp->hop_ptr--; + return (smp->return_path[smp->hop_ptr] == + port_num ? IB_SMI_HANDLE : IB_SMI_DISCARD); + } + + /* C14-13:3 -- at the end of the DR segment of path */ + if (hop_ptr == 1) { + smp->hop_ptr--; + /* C14-13:3 -- SMPs destined for SM shouldn't be here */ + return (node_type == RDMA_NODE_IB_SWITCH || + smp->dr_slid == IB_LID_PERMISSIVE ? + IB_SMI_HANDLE : IB_SMI_DISCARD); + } + + /* C14-13:4 -- hop_ptr = 0 -> should have gone to SM */ + if (hop_ptr == 0) + return IB_SMI_HANDLE; + + /* C14-13:5 -- Check for unreasonable hop pointer */ + return IB_SMI_DISCARD; + } +} + +/* + * Adjust information for a received SMP + * Return 0 if the SMP should be dropped + */ +enum smi_action smi_handle_dr_smp_recv(struct ib_smp *smp, u8 node_type, + int port_num, int phys_port_cnt) +{ + u8 hop_ptr, hop_cnt; + + hop_ptr = smp->hop_ptr; + hop_cnt = smp->hop_cnt; + + /* See section 14.2.2.2, Vol 1 IB spec */ + /* C14-6 -- valid hop_cnt values are from 0 to 63 */ + if (hop_cnt >= IB_SMP_MAX_PATH_HOPS) + return IB_SMI_DISCARD; + + if (!ib_get_smp_direction(smp)) { + /* C14-9:1 -- sender should have incremented hop_ptr */ + if (hop_cnt && hop_ptr == 0) + return IB_SMI_DISCARD; + + /* C14-9:2 -- intermediate hop */ + if (hop_ptr && hop_ptr < hop_cnt) { + if (node_type != RDMA_NODE_IB_SWITCH) + return IB_SMI_DISCARD; + + smp->return_path[hop_ptr] = port_num; + /* smp->hop_ptr updated when sending */ + return (smp->initial_path[hop_ptr+1] <= phys_port_cnt ? + IB_SMI_HANDLE : IB_SMI_DISCARD); + } + + /* C14-9:3 -- We're at the end of the DR segment of path */ + if (hop_ptr == hop_cnt) { + if (hop_cnt) + smp->return_path[hop_ptr] = port_num; + /* smp->hop_ptr updated when sending */ + + return (node_type == RDMA_NODE_IB_SWITCH || + smp->dr_dlid == IB_LID_PERMISSIVE ? + IB_SMI_HANDLE : IB_SMI_DISCARD); + } + + /* C14-9:4 -- hop_ptr = hop_cnt + 1 -> give to SMA/SM */ + /* C14-9:5 -- fail unreasonable hop pointer */ + return (hop_ptr == hop_cnt + 1 ? IB_SMI_HANDLE : IB_SMI_DISCARD); + + } else { + + /* C14-13:1 */ + if (hop_cnt && hop_ptr == hop_cnt + 1) { + smp->hop_ptr--; + return (smp->return_path[smp->hop_ptr] == + port_num ? IB_SMI_HANDLE : IB_SMI_DISCARD); + } + + /* C14-13:2 */ + if (2 <= hop_ptr && hop_ptr <= hop_cnt) { + if (node_type != RDMA_NODE_IB_SWITCH) + return IB_SMI_DISCARD; + + /* smp->hop_ptr updated when sending */ + return (smp->return_path[hop_ptr-1] <= phys_port_cnt ? + IB_SMI_HANDLE : IB_SMI_DISCARD); + } + + /* C14-13:3 -- We're at the end of the DR segment of path */ + if (hop_ptr == 1) { + if (smp->dr_slid == IB_LID_PERMISSIVE) { + /* giving SMP to SM - update hop_ptr */ + smp->hop_ptr--; + return IB_SMI_HANDLE; + } + /* smp->hop_ptr updated when sending */ + return (node_type == RDMA_NODE_IB_SWITCH ? + IB_SMI_HANDLE : IB_SMI_DISCARD); + } + + /* C14-13:4 -- hop_ptr = 0 -> give to SM */ + /* C14-13:5 -- Check for unreasonable hop pointer */ + return (hop_ptr == 0 ? IB_SMI_HANDLE : IB_SMI_DISCARD); + } +} + +enum smi_forward_action smi_check_forward_dr_smp(struct ib_smp *smp) +{ + u8 hop_ptr, hop_cnt; + + hop_ptr = smp->hop_ptr; + hop_cnt = smp->hop_cnt; + + if (!ib_get_smp_direction(smp)) { + /* C14-9:2 -- intermediate hop */ + if (hop_ptr && hop_ptr < hop_cnt) + return IB_SMI_FORWARD; + + /* C14-9:3 -- at the end of the DR segment of path */ + if (hop_ptr == hop_cnt) + return (smp->dr_dlid == IB_LID_PERMISSIVE ? + IB_SMI_SEND : IB_SMI_LOCAL); + + /* C14-9:4 -- hop_ptr = hop_cnt + 1 -> give to SMA/SM */ + if (hop_ptr == hop_cnt + 1) + return IB_SMI_SEND; + } else { + /* C14-13:2 -- intermediate hop */ + if (2 <= hop_ptr && hop_ptr <= hop_cnt) + return IB_SMI_FORWARD; + + /* C14-13:3 -- at the end of the DR segment of path */ + if (hop_ptr == 1) + return (smp->dr_slid != IB_LID_PERMISSIVE ? + IB_SMI_SEND : IB_SMI_LOCAL); + } + return IB_SMI_LOCAL; +} + +/* + * Return the forwarding port number from initial_path for outgoing SMP and + * from return_path for returning SMP + */ +int smi_get_fwd_port(struct ib_smp *smp) +{ + return (!ib_get_smp_direction(smp) ? smp->initial_path[smp->hop_ptr+1] : + smp->return_path[smp->hop_ptr-1]); +} diff --git a/drivers/infiniband/core/smi.h b/drivers/infiniband/core/smi.h new file mode 100644 index 0000000..aff96ba --- /dev/null +++ b/drivers/infiniband/core/smi.h @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2004 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2004 Infinicon Corporation. All rights reserved. + * Copyright (c) 2004 Intel Corporation. All rights reserved. + * Copyright (c) 2004 Topspin Corporation. All rights reserved. + * Copyright (c) 2004-2007 Voltaire Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef __SMI_H_ +#define __SMI_H_ + +#include + +enum smi_action { + IB_SMI_DISCARD, + IB_SMI_HANDLE +}; + +enum smi_forward_action { + IB_SMI_LOCAL, /* SMP should be completed up the stack */ + IB_SMI_SEND, /* received DR SMP should be forwarded to the send queue */ + IB_SMI_FORWARD /* SMP should be forwarded (for switches only) */ +}; + +enum smi_action smi_handle_dr_smp_recv(struct ib_smp *smp, u8 node_type, + int port_num, int phys_port_cnt); +int smi_get_fwd_port(struct ib_smp *smp); +extern enum smi_forward_action smi_check_forward_dr_smp(struct ib_smp *smp); +extern enum smi_action smi_handle_dr_smp_send(struct ib_smp *smp, + u8 node_type, int port_num); + +/* + * Return IB_SMI_HANDLE if the SMP should be handled by the local SMA/SM + * via process_mad + */ +static inline enum smi_action smi_check_local_smp(struct ib_smp *smp, + struct ib_device *device) +{ + /* C14-9:3 -- We're at the end of the DR segment of path */ + /* C14-9:4 -- Hop Pointer = Hop Count + 1 -> give to SMA/SM */ + return ((device->process_mad && + !ib_get_smp_direction(smp) && + (smp->hop_ptr == smp->hop_cnt + 1)) ? + IB_SMI_HANDLE : IB_SMI_DISCARD); +} + +/* + * Return IB_SMI_HANDLE if the SMP should be handled by the local SMA/SM + * via process_mad + */ +static inline enum smi_action smi_check_local_returning_smp(struct ib_smp *smp, + struct ib_device *device) +{ + /* C14-13:3 -- We're at the end of the DR segment of path */ + /* C14-13:4 -- Hop Pointer == 0 -> give to SM */ + return ((device->process_mad && + ib_get_smp_direction(smp) && + !smp->hop_ptr) ? IB_SMI_HANDLE : IB_SMI_DISCARD); +} + +#endif /* __SMI_H_ */ diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c new file mode 100644 index 0000000..cbd0383 --- /dev/null +++ b/drivers/infiniband/core/sysfs.c @@ -0,0 +1,922 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "core_priv.h" + +#include +#include +#include + +#include + +struct ib_port { + struct kobject kobj; + struct ib_device *ibdev; + struct attribute_group gid_group; + struct attribute_group pkey_group; + u8 port_num; +}; + +struct port_attribute { + struct attribute attr; + ssize_t (*show)(struct ib_port *, struct port_attribute *, char *buf); + ssize_t (*store)(struct ib_port *, struct port_attribute *, + const char *buf, size_t count); +}; + +#define PORT_ATTR(_name, _mode, _show, _store) \ +struct port_attribute port_attr_##_name = __ATTR(_name, _mode, _show, _store) + +#define PORT_ATTR_RO(_name) \ +struct port_attribute port_attr_##_name = __ATTR_RO(_name) + +struct port_table_attribute { + struct port_attribute attr; + char name[8]; + int index; +}; + +static ssize_t port_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct port_attribute *port_attr = + container_of(attr, struct port_attribute, attr); + struct ib_port *p = container_of(kobj, struct ib_port, kobj); + + if (!port_attr->show) + return -EIO; + + return port_attr->show(p, port_attr, buf); +} + +static const struct sysfs_ops port_sysfs_ops = { + .show = port_attr_show +}; + +static ssize_t state_show(struct ib_port *p, struct port_attribute *unused, + char *buf) +{ + struct ib_port_attr attr; + ssize_t ret; + + static const char *state_name[] = { + [IB_PORT_NOP] = "NOP", + [IB_PORT_DOWN] = "DOWN", + [IB_PORT_INIT] = "INIT", + [IB_PORT_ARMED] = "ARMED", + [IB_PORT_ACTIVE] = "ACTIVE", + [IB_PORT_ACTIVE_DEFER] = "ACTIVE_DEFER" + }; + + ret = ib_query_port(p->ibdev, p->port_num, &attr); + if (ret) + return ret; + + return sprintf(buf, "%d: %s\n", attr.state, + attr.state >= 0 && attr.state < ARRAY_SIZE(state_name) ? + state_name[attr.state] : "UNKNOWN"); +} + +static ssize_t lid_show(struct ib_port *p, struct port_attribute *unused, + char *buf) +{ + struct ib_port_attr attr; + ssize_t ret; + + ret = ib_query_port(p->ibdev, p->port_num, &attr); + if (ret) + return ret; + + return sprintf(buf, "0x%x\n", attr.lid); +} + +static ssize_t lid_mask_count_show(struct ib_port *p, + struct port_attribute *unused, + char *buf) +{ + struct ib_port_attr attr; + ssize_t ret; + + ret = ib_query_port(p->ibdev, p->port_num, &attr); + if (ret) + return ret; + + return sprintf(buf, "%d\n", attr.lmc); +} + +static ssize_t sm_lid_show(struct ib_port *p, struct port_attribute *unused, + char *buf) +{ + struct ib_port_attr attr; + ssize_t ret; + + ret = ib_query_port(p->ibdev, p->port_num, &attr); + if (ret) + return ret; + + return sprintf(buf, "0x%x\n", attr.sm_lid); +} + +static ssize_t sm_sl_show(struct ib_port *p, struct port_attribute *unused, + char *buf) +{ + struct ib_port_attr attr; + ssize_t ret; + + ret = ib_query_port(p->ibdev, p->port_num, &attr); + if (ret) + return ret; + + return sprintf(buf, "%d\n", attr.sm_sl); +} + +static ssize_t cap_mask_show(struct ib_port *p, struct port_attribute *unused, + char *buf) +{ + struct ib_port_attr attr; + ssize_t ret; + + ret = ib_query_port(p->ibdev, p->port_num, &attr); + if (ret) + return ret; + + return sprintf(buf, "0x%08x\n", attr.port_cap_flags); +} + +static ssize_t rate_show(struct ib_port *p, struct port_attribute *unused, + char *buf) +{ + struct ib_port_attr attr; + char *speed = ""; + int rate; /* in deci-Gb/sec */ + ssize_t ret; + + ret = ib_query_port(p->ibdev, p->port_num, &attr); + if (ret) + return ret; + + switch (attr.active_speed) { + case IB_SPEED_DDR: + speed = " DDR"; + rate = 50; + break; + case IB_SPEED_QDR: + speed = " QDR"; + rate = 100; + break; + case IB_SPEED_FDR10: + speed = " FDR10"; + rate = 100; + break; + case IB_SPEED_FDR: + speed = " FDR"; + rate = 140; + break; + case IB_SPEED_EDR: + speed = " EDR"; + rate = 250; + break; + case IB_SPEED_SDR: + default: /* default to SDR for invalid rates */ + rate = 25; + break; + } + + rate *= ib_width_enum_to_int(attr.active_width); + if (rate < 0) + return -EINVAL; + + return sprintf(buf, "%d%s Gb/sec (%dX%s)\n", + rate / 10, rate % 10 ? ".5" : "", + ib_width_enum_to_int(attr.active_width), speed); +} + +static ssize_t phys_state_show(struct ib_port *p, struct port_attribute *unused, + char *buf) +{ + struct ib_port_attr attr; + + ssize_t ret; + + ret = ib_query_port(p->ibdev, p->port_num, &attr); + if (ret) + return ret; + + switch (attr.phys_state) { + case 1: return sprintf(buf, "1: Sleep\n"); + case 2: return sprintf(buf, "2: Polling\n"); + case 3: return sprintf(buf, "3: Disabled\n"); + case 4: return sprintf(buf, "4: PortConfigurationTraining\n"); + case 5: return sprintf(buf, "5: LinkUp\n"); + case 6: return sprintf(buf, "6: LinkErrorRecovery\n"); + case 7: return sprintf(buf, "7: Phy Test\n"); + default: return sprintf(buf, "%d: \n", attr.phys_state); + } +} + +static ssize_t link_layer_show(struct ib_port *p, struct port_attribute *unused, + char *buf) +{ + switch (rdma_port_get_link_layer(p->ibdev, p->port_num)) { + case IB_LINK_LAYER_INFINIBAND: + return sprintf(buf, "%s\n", "InfiniBand"); + case IB_LINK_LAYER_ETHERNET: + return sprintf(buf, "%s\n", "Ethernet"); + default: + return sprintf(buf, "%s\n", "Unknown"); + } +} + +static PORT_ATTR_RO(state); +static PORT_ATTR_RO(lid); +static PORT_ATTR_RO(lid_mask_count); +static PORT_ATTR_RO(sm_lid); +static PORT_ATTR_RO(sm_sl); +static PORT_ATTR_RO(cap_mask); +static PORT_ATTR_RO(rate); +static PORT_ATTR_RO(phys_state); +static PORT_ATTR_RO(link_layer); + +static struct attribute *port_default_attrs[] = { + &port_attr_state.attr, + &port_attr_lid.attr, + &port_attr_lid_mask_count.attr, + &port_attr_sm_lid.attr, + &port_attr_sm_sl.attr, + &port_attr_cap_mask.attr, + &port_attr_rate.attr, + &port_attr_phys_state.attr, + &port_attr_link_layer.attr, + NULL +}; + +static ssize_t show_port_gid(struct ib_port *p, struct port_attribute *attr, + char *buf) +{ + struct port_table_attribute *tab_attr = + container_of(attr, struct port_table_attribute, attr); + union ib_gid gid; + ssize_t ret; + + ret = ib_query_gid(p->ibdev, p->port_num, tab_attr->index, &gid); + if (ret) + return ret; + + return sprintf(buf, "%pI6\n", gid.raw); +} + +static ssize_t show_port_pkey(struct ib_port *p, struct port_attribute *attr, + char *buf) +{ + struct port_table_attribute *tab_attr = + container_of(attr, struct port_table_attribute, attr); + u16 pkey; + ssize_t ret; + + ret = ib_query_pkey(p->ibdev, p->port_num, tab_attr->index, &pkey); + if (ret) + return ret; + + return sprintf(buf, "0x%04x\n", pkey); +} + +#define PORT_PMA_ATTR(_name, _counter, _width, _offset) \ +struct port_table_attribute port_pma_attr_##_name = { \ + .attr = __ATTR(_name, S_IRUGO, show_pma_counter, NULL), \ + .index = (_offset) | ((_width) << 16) | ((_counter) << 24) \ +} + +static ssize_t show_pma_counter(struct ib_port *p, struct port_attribute *attr, + char *buf) +{ + struct port_table_attribute *tab_attr = + container_of(attr, struct port_table_attribute, attr); + int offset = tab_attr->index & 0xffff; + int width = (tab_attr->index >> 16) & 0xff; + struct ib_mad *in_mad = NULL; + struct ib_mad *out_mad = NULL; + ssize_t ret; + + if (!p->ibdev->process_mad) + return sprintf(buf, "N/A (no PMA)\n"); + + in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); + out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); + if (!in_mad || !out_mad) { + ret = -ENOMEM; + goto out; + } + + in_mad->mad_hdr.base_version = 1; + in_mad->mad_hdr.mgmt_class = IB_MGMT_CLASS_PERF_MGMT; + in_mad->mad_hdr.class_version = 1; + in_mad->mad_hdr.method = IB_MGMT_METHOD_GET; + in_mad->mad_hdr.attr_id = cpu_to_be16(0x12); /* PortCounters */ + + in_mad->data[41] = p->port_num; /* PortSelect field */ + + if ((p->ibdev->process_mad(p->ibdev, IB_MAD_IGNORE_MKEY, + p->port_num, NULL, NULL, in_mad, out_mad) & + (IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY)) != + (IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY)) { + ret = -EINVAL; + goto out; + } + + switch (width) { + case 4: + ret = sprintf(buf, "%u\n", (out_mad->data[40 + offset / 8] >> + (4 - (offset % 8))) & 0xf); + break; + case 8: + ret = sprintf(buf, "%u\n", out_mad->data[40 + offset / 8]); + break; + case 16: + ret = sprintf(buf, "%u\n", + be16_to_cpup((__be16 *)(out_mad->data + 40 + offset / 8))); + break; + case 32: + ret = sprintf(buf, "%u\n", + be32_to_cpup((__be32 *)(out_mad->data + 40 + offset / 8))); + break; + default: + ret = 0; + } + +out: + kfree(in_mad); + kfree(out_mad); + + return ret; +} + +static PORT_PMA_ATTR(symbol_error , 0, 16, 32); +static PORT_PMA_ATTR(link_error_recovery , 1, 8, 48); +static PORT_PMA_ATTR(link_downed , 2, 8, 56); +static PORT_PMA_ATTR(port_rcv_errors , 3, 16, 64); +static PORT_PMA_ATTR(port_rcv_remote_physical_errors, 4, 16, 80); +static PORT_PMA_ATTR(port_rcv_switch_relay_errors , 5, 16, 96); +static PORT_PMA_ATTR(port_xmit_discards , 6, 16, 112); +static PORT_PMA_ATTR(port_xmit_constraint_errors , 7, 8, 128); +static PORT_PMA_ATTR(port_rcv_constraint_errors , 8, 8, 136); +static PORT_PMA_ATTR(local_link_integrity_errors , 9, 4, 152); +static PORT_PMA_ATTR(excessive_buffer_overrun_errors, 10, 4, 156); +static PORT_PMA_ATTR(VL15_dropped , 11, 16, 176); +static PORT_PMA_ATTR(port_xmit_data , 12, 32, 192); +static PORT_PMA_ATTR(port_rcv_data , 13, 32, 224); +static PORT_PMA_ATTR(port_xmit_packets , 14, 32, 256); +static PORT_PMA_ATTR(port_rcv_packets , 15, 32, 288); + +static struct attribute *pma_attrs[] = { + &port_pma_attr_symbol_error.attr.attr, + &port_pma_attr_link_error_recovery.attr.attr, + &port_pma_attr_link_downed.attr.attr, + &port_pma_attr_port_rcv_errors.attr.attr, + &port_pma_attr_port_rcv_remote_physical_errors.attr.attr, + &port_pma_attr_port_rcv_switch_relay_errors.attr.attr, + &port_pma_attr_port_xmit_discards.attr.attr, + &port_pma_attr_port_xmit_constraint_errors.attr.attr, + &port_pma_attr_port_rcv_constraint_errors.attr.attr, + &port_pma_attr_local_link_integrity_errors.attr.attr, + &port_pma_attr_excessive_buffer_overrun_errors.attr.attr, + &port_pma_attr_VL15_dropped.attr.attr, + &port_pma_attr_port_xmit_data.attr.attr, + &port_pma_attr_port_rcv_data.attr.attr, + &port_pma_attr_port_xmit_packets.attr.attr, + &port_pma_attr_port_rcv_packets.attr.attr, + NULL +}; + +static struct attribute_group pma_group = { + .name = "counters", + .attrs = pma_attrs +}; + +static void ib_port_release(struct kobject *kobj) +{ + struct ib_port *p = container_of(kobj, struct ib_port, kobj); + struct attribute *a; + int i; + + if (p->gid_group.attrs) { + for (i = 0; (a = p->gid_group.attrs[i]); ++i) + kfree(a); + + kfree(p->gid_group.attrs); + } + + if (p->pkey_group.attrs) { + for (i = 0; (a = p->pkey_group.attrs[i]); ++i) + kfree(a); + + kfree(p->pkey_group.attrs); + } + + kfree(p); +} + +static struct kobj_type port_type = { + .release = ib_port_release, + .sysfs_ops = &port_sysfs_ops, + .default_attrs = port_default_attrs +}; + +static void ib_device_release(struct device *device) +{ + struct ib_device *dev = container_of(device, struct ib_device, dev); + + kfree(dev); +} + +static int ib_device_uevent(struct device *device, + struct kobj_uevent_env *env) +{ + struct ib_device *dev = container_of(device, struct ib_device, dev); + + if (add_uevent_var(env, "NAME=%s", dev->name)) + return -ENOMEM; + + /* + * It would be nice to pass the node GUID with the event... + */ + + return 0; +} + +static struct attribute ** +alloc_group_attrs(ssize_t (*show)(struct ib_port *, + struct port_attribute *, char *buf), + int len) +{ + struct attribute **tab_attr; + struct port_table_attribute *element; + int i; + + tab_attr = kcalloc(1 + len, sizeof(struct attribute *), GFP_KERNEL); + if (!tab_attr) + return NULL; + + for (i = 0; i < len; i++) { + element = kzalloc(sizeof(struct port_table_attribute), + GFP_KERNEL); + if (!element) + goto err; + + if (snprintf(element->name, sizeof(element->name), + "%d", i) >= sizeof(element->name)) { + kfree(element); + goto err; + } + + element->attr.attr.name = element->name; + element->attr.attr.mode = S_IRUGO; + element->attr.show = show; + element->index = i; + sysfs_attr_init(&element->attr.attr); + + tab_attr[i] = &element->attr.attr; + } + + return tab_attr; + +err: + while (--i >= 0) + kfree(tab_attr[i]); + kfree(tab_attr); + return NULL; +} + +static int add_port(struct ib_device *device, int port_num, + int (*port_callback)(struct ib_device *, + u8, struct kobject *)) +{ + struct ib_port *p; + struct ib_port_attr attr; + int i; + int ret; + + ret = ib_query_port(device, port_num, &attr); + if (ret) + return ret; + + p = kzalloc(sizeof *p, GFP_KERNEL); + if (!p) + return -ENOMEM; + + p->ibdev = device; + p->port_num = port_num; + + ret = kobject_init_and_add(&p->kobj, &port_type, + device->ports_parent, + "%d", port_num); + if (ret) { + kfree(p); + return ret; + } + + ret = sysfs_create_group(&p->kobj, &pma_group); + if (ret) + goto err_put; + + p->gid_group.name = "gids"; + p->gid_group.attrs = alloc_group_attrs(show_port_gid, attr.gid_tbl_len); + if (!p->gid_group.attrs) { + ret = -ENOMEM; + goto err_remove_pma; + } + + ret = sysfs_create_group(&p->kobj, &p->gid_group); + if (ret) + goto err_free_gid; + + p->pkey_group.name = "pkeys"; + p->pkey_group.attrs = alloc_group_attrs(show_port_pkey, + attr.pkey_tbl_len); + if (!p->pkey_group.attrs) { + ret = -ENOMEM; + goto err_remove_gid; + } + + ret = sysfs_create_group(&p->kobj, &p->pkey_group); + if (ret) + goto err_free_pkey; + + if (port_callback) { + ret = port_callback(device, port_num, &p->kobj); + if (ret) + goto err_remove_pkey; + } + + list_add_tail(&p->kobj.entry, &device->port_list); + + kobject_uevent(&p->kobj, KOBJ_ADD); + return 0; + +err_remove_pkey: + sysfs_remove_group(&p->kobj, &p->pkey_group); + +err_free_pkey: + for (i = 0; i < attr.pkey_tbl_len; ++i) + kfree(p->pkey_group.attrs[i]); + + kfree(p->pkey_group.attrs); + p->pkey_group.attrs = NULL; + +err_remove_gid: + sysfs_remove_group(&p->kobj, &p->gid_group); + +err_free_gid: + for (i = 0; i < attr.gid_tbl_len; ++i) + kfree(p->gid_group.attrs[i]); + + kfree(p->gid_group.attrs); + p->gid_group.attrs = NULL; + +err_remove_pma: + sysfs_remove_group(&p->kobj, &pma_group); + +err_put: + kobject_put(&p->kobj); + return ret; +} + +static ssize_t show_node_type(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct ib_device *dev = container_of(device, struct ib_device, dev); + + switch (dev->node_type) { + case RDMA_NODE_IB_CA: return sprintf(buf, "%d: CA\n", dev->node_type); + case RDMA_NODE_RNIC: return sprintf(buf, "%d: RNIC\n", dev->node_type); + case RDMA_NODE_USNIC: return sprintf(buf, "%d: usNIC\n", dev->node_type); + case RDMA_NODE_USNIC_UDP: return sprintf(buf, "%d: usNIC UDP\n", dev->node_type); + case RDMA_NODE_IB_SWITCH: return sprintf(buf, "%d: switch\n", dev->node_type); + case RDMA_NODE_IB_ROUTER: return sprintf(buf, "%d: router\n", dev->node_type); + default: return sprintf(buf, "%d: \n", dev->node_type); + } +} + +static ssize_t show_sys_image_guid(struct device *device, + struct device_attribute *dev_attr, char *buf) +{ + struct ib_device *dev = container_of(device, struct ib_device, dev); + struct ib_device_attr attr; + ssize_t ret; + + ret = ib_query_device(dev, &attr); + if (ret) + return ret; + + return sprintf(buf, "%04x:%04x:%04x:%04x\n", + be16_to_cpu(((__be16 *) &attr.sys_image_guid)[0]), + be16_to_cpu(((__be16 *) &attr.sys_image_guid)[1]), + be16_to_cpu(((__be16 *) &attr.sys_image_guid)[2]), + be16_to_cpu(((__be16 *) &attr.sys_image_guid)[3])); +} + +static ssize_t show_node_guid(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct ib_device *dev = container_of(device, struct ib_device, dev); + + return sprintf(buf, "%04x:%04x:%04x:%04x\n", + be16_to_cpu(((__be16 *) &dev->node_guid)[0]), + be16_to_cpu(((__be16 *) &dev->node_guid)[1]), + be16_to_cpu(((__be16 *) &dev->node_guid)[2]), + be16_to_cpu(((__be16 *) &dev->node_guid)[3])); +} + +static ssize_t show_node_desc(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct ib_device *dev = container_of(device, struct ib_device, dev); + + return sprintf(buf, "%.64s\n", dev->node_desc); +} + +static ssize_t set_node_desc(struct device *device, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct ib_device *dev = container_of(device, struct ib_device, dev); + struct ib_device_modify desc = {}; + int ret; + + if (!dev->modify_device) + return -EIO; + + memcpy(desc.node_desc, buf, min_t(int, count, 64)); + ret = ib_modify_device(dev, IB_DEVICE_MODIFY_NODE_DESC, &desc); + if (ret) + return ret; + + return count; +} + +static DEVICE_ATTR(node_type, S_IRUGO, show_node_type, NULL); +static DEVICE_ATTR(sys_image_guid, S_IRUGO, show_sys_image_guid, NULL); +static DEVICE_ATTR(node_guid, S_IRUGO, show_node_guid, NULL); +static DEVICE_ATTR(node_desc, S_IRUGO | S_IWUSR, show_node_desc, set_node_desc); + +static struct device_attribute *ib_class_attributes[] = { + &dev_attr_node_type, + &dev_attr_sys_image_guid, + &dev_attr_node_guid, + &dev_attr_node_desc +}; + +static struct class ib_class = { + .name = "infiniband", + .dev_release = ib_device_release, + .dev_uevent = ib_device_uevent, +}; + +/* Show a given an attribute in the statistics group */ +static ssize_t show_protocol_stat(const struct device *device, + struct device_attribute *attr, char *buf, + unsigned offset) +{ + struct ib_device *dev = container_of(device, struct ib_device, dev); + union rdma_protocol_stats stats; + ssize_t ret; + + ret = dev->get_protocol_stats(dev, &stats); + if (ret) + return ret; + + return sprintf(buf, "%llu\n", + (unsigned long long) ((u64 *) &stats)[offset]); +} + +/* generate a read-only iwarp statistics attribute */ +#define IW_STATS_ENTRY(name) \ +static ssize_t show_##name(struct device *device, \ + struct device_attribute *attr, char *buf) \ +{ \ + return show_protocol_stat(device, attr, buf, \ + offsetof(struct iw_protocol_stats, name) / \ + sizeof (u64)); \ +} \ +static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL) + +IW_STATS_ENTRY(ipInReceives); +IW_STATS_ENTRY(ipInHdrErrors); +IW_STATS_ENTRY(ipInTooBigErrors); +IW_STATS_ENTRY(ipInNoRoutes); +IW_STATS_ENTRY(ipInAddrErrors); +IW_STATS_ENTRY(ipInUnknownProtos); +IW_STATS_ENTRY(ipInTruncatedPkts); +IW_STATS_ENTRY(ipInDiscards); +IW_STATS_ENTRY(ipInDelivers); +IW_STATS_ENTRY(ipOutForwDatagrams); +IW_STATS_ENTRY(ipOutRequests); +IW_STATS_ENTRY(ipOutDiscards); +IW_STATS_ENTRY(ipOutNoRoutes); +IW_STATS_ENTRY(ipReasmTimeout); +IW_STATS_ENTRY(ipReasmReqds); +IW_STATS_ENTRY(ipReasmOKs); +IW_STATS_ENTRY(ipReasmFails); +IW_STATS_ENTRY(ipFragOKs); +IW_STATS_ENTRY(ipFragFails); +IW_STATS_ENTRY(ipFragCreates); +IW_STATS_ENTRY(ipInMcastPkts); +IW_STATS_ENTRY(ipOutMcastPkts); +IW_STATS_ENTRY(ipInBcastPkts); +IW_STATS_ENTRY(ipOutBcastPkts); +IW_STATS_ENTRY(tcpRtoAlgorithm); +IW_STATS_ENTRY(tcpRtoMin); +IW_STATS_ENTRY(tcpRtoMax); +IW_STATS_ENTRY(tcpMaxConn); +IW_STATS_ENTRY(tcpActiveOpens); +IW_STATS_ENTRY(tcpPassiveOpens); +IW_STATS_ENTRY(tcpAttemptFails); +IW_STATS_ENTRY(tcpEstabResets); +IW_STATS_ENTRY(tcpCurrEstab); +IW_STATS_ENTRY(tcpInSegs); +IW_STATS_ENTRY(tcpOutSegs); +IW_STATS_ENTRY(tcpRetransSegs); +IW_STATS_ENTRY(tcpInErrs); +IW_STATS_ENTRY(tcpOutRsts); + +static struct attribute *iw_proto_stats_attrs[] = { + &dev_attr_ipInReceives.attr, + &dev_attr_ipInHdrErrors.attr, + &dev_attr_ipInTooBigErrors.attr, + &dev_attr_ipInNoRoutes.attr, + &dev_attr_ipInAddrErrors.attr, + &dev_attr_ipInUnknownProtos.attr, + &dev_attr_ipInTruncatedPkts.attr, + &dev_attr_ipInDiscards.attr, + &dev_attr_ipInDelivers.attr, + &dev_attr_ipOutForwDatagrams.attr, + &dev_attr_ipOutRequests.attr, + &dev_attr_ipOutDiscards.attr, + &dev_attr_ipOutNoRoutes.attr, + &dev_attr_ipReasmTimeout.attr, + &dev_attr_ipReasmReqds.attr, + &dev_attr_ipReasmOKs.attr, + &dev_attr_ipReasmFails.attr, + &dev_attr_ipFragOKs.attr, + &dev_attr_ipFragFails.attr, + &dev_attr_ipFragCreates.attr, + &dev_attr_ipInMcastPkts.attr, + &dev_attr_ipOutMcastPkts.attr, + &dev_attr_ipInBcastPkts.attr, + &dev_attr_ipOutBcastPkts.attr, + &dev_attr_tcpRtoAlgorithm.attr, + &dev_attr_tcpRtoMin.attr, + &dev_attr_tcpRtoMax.attr, + &dev_attr_tcpMaxConn.attr, + &dev_attr_tcpActiveOpens.attr, + &dev_attr_tcpPassiveOpens.attr, + &dev_attr_tcpAttemptFails.attr, + &dev_attr_tcpEstabResets.attr, + &dev_attr_tcpCurrEstab.attr, + &dev_attr_tcpInSegs.attr, + &dev_attr_tcpOutSegs.attr, + &dev_attr_tcpRetransSegs.attr, + &dev_attr_tcpInErrs.attr, + &dev_attr_tcpOutRsts.attr, + NULL +}; + +static struct attribute_group iw_stats_group = { + .name = "proto_stats", + .attrs = iw_proto_stats_attrs, +}; + +static void free_port_list_attributes(struct ib_device *device) +{ + struct kobject *p, *t; + + list_for_each_entry_safe(p, t, &device->port_list, entry) { + struct ib_port *port = container_of(p, struct ib_port, kobj); + list_del(&p->entry); + sysfs_remove_group(p, &pma_group); + sysfs_remove_group(p, &port->pkey_group); + sysfs_remove_group(p, &port->gid_group); + kobject_put(p); + } + + kobject_put(device->ports_parent); +} + +int ib_device_register_sysfs(struct ib_device *device, + int (*port_callback)(struct ib_device *, + u8, struct kobject *)) +{ + struct device *class_dev = &device->dev; + int ret; + int i; + + class_dev->class = &ib_class; + class_dev->parent = device->dma_device; + dev_set_name(class_dev, "%s", device->name); + dev_set_drvdata(class_dev, device); + + INIT_LIST_HEAD(&device->port_list); + + ret = device_register(class_dev); + if (ret) + goto err; + + for (i = 0; i < ARRAY_SIZE(ib_class_attributes); ++i) { + ret = device_create_file(class_dev, ib_class_attributes[i]); + if (ret) + goto err_unregister; + } + + device->ports_parent = kobject_create_and_add("ports", + &class_dev->kobj); + if (!device->ports_parent) { + ret = -ENOMEM; + goto err_put; + } + + if (device->node_type == RDMA_NODE_IB_SWITCH) { + ret = add_port(device, 0, port_callback); + if (ret) + goto err_put; + } else { + for (i = 1; i <= device->phys_port_cnt; ++i) { + ret = add_port(device, i, port_callback); + if (ret) + goto err_put; + } + } + + if (device->node_type == RDMA_NODE_RNIC && device->get_protocol_stats) { + ret = sysfs_create_group(&class_dev->kobj, &iw_stats_group); + if (ret) + goto err_put; + } + + return 0; + +err_put: + free_port_list_attributes(device); + +err_unregister: + device_unregister(class_dev); + +err: + return ret; +} + +void ib_device_unregister_sysfs(struct ib_device *device) +{ + /* Hold kobject until ib_dealloc_device() */ + struct kobject *kobj_dev = kobject_get(&device->dev.kobj); + int i; + + if (device->node_type == RDMA_NODE_RNIC && device->get_protocol_stats) + sysfs_remove_group(kobj_dev, &iw_stats_group); + + free_port_list_attributes(device); + + for (i = 0; i < ARRAY_SIZE(ib_class_attributes); ++i) + device_remove_file(&device->dev, ib_class_attributes[i]); + + device_unregister(&device->dev); +} + +int ib_sysfs_setup(void) +{ + return class_register(&ib_class); +} + +void ib_sysfs_cleanup(void) +{ + class_unregister(&ib_class); +} diff --git a/drivers/infiniband/core/ucm.c b/drivers/infiniband/core/ucm.c new file mode 100644 index 0000000..f2f6393 --- /dev/null +++ b/drivers/infiniband/core/ucm.c @@ -0,0 +1,1370 @@ +/* + * Copyright (c) 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +MODULE_AUTHOR("Libor Michalek"); +MODULE_DESCRIPTION("InfiniBand userspace Connection Manager access"); +MODULE_LICENSE("Dual BSD/GPL"); + +struct ib_ucm_device { + int devnum; + struct cdev cdev; + struct device dev; + struct ib_device *ib_dev; +}; + +struct ib_ucm_file { + struct mutex file_mutex; + struct file *filp; + struct ib_ucm_device *device; + + struct list_head ctxs; + struct list_head events; + wait_queue_head_t poll_wait; +}; + +struct ib_ucm_context { + int id; + struct completion comp; + atomic_t ref; + int events_reported; + + struct ib_ucm_file *file; + struct ib_cm_id *cm_id; + __u64 uid; + + struct list_head events; /* list of pending events. */ + struct list_head file_list; /* member in file ctx list */ +}; + +struct ib_ucm_event { + struct ib_ucm_context *ctx; + struct list_head file_list; /* member in file event list */ + struct list_head ctx_list; /* member in ctx event list */ + + struct ib_cm_id *cm_id; + struct ib_ucm_event_resp resp; + void *data; + void *info; + int data_len; + int info_len; +}; + +enum { + IB_UCM_MAJOR = 231, + IB_UCM_BASE_MINOR = 224, + IB_UCM_MAX_DEVICES = 32 +}; + +#define IB_UCM_BASE_DEV MKDEV(IB_UCM_MAJOR, IB_UCM_BASE_MINOR) + +static void ib_ucm_add_one(struct ib_device *device); +static void ib_ucm_remove_one(struct ib_device *device); + +static struct ib_client ucm_client = { + .name = "ucm", + .add = ib_ucm_add_one, + .remove = ib_ucm_remove_one +}; + +static DEFINE_MUTEX(ctx_id_mutex); +static DEFINE_IDR(ctx_id_table); +static DECLARE_BITMAP(dev_map, IB_UCM_MAX_DEVICES); + +static struct ib_ucm_context *ib_ucm_ctx_get(struct ib_ucm_file *file, int id) +{ + struct ib_ucm_context *ctx; + + mutex_lock(&ctx_id_mutex); + ctx = idr_find(&ctx_id_table, id); + if (!ctx) + ctx = ERR_PTR(-ENOENT); + else if (ctx->file != file) + ctx = ERR_PTR(-EINVAL); + else + atomic_inc(&ctx->ref); + mutex_unlock(&ctx_id_mutex); + + return ctx; +} + +static void ib_ucm_ctx_put(struct ib_ucm_context *ctx) +{ + if (atomic_dec_and_test(&ctx->ref)) + complete(&ctx->comp); +} + +static inline int ib_ucm_new_cm_id(int event) +{ + return event == IB_CM_REQ_RECEIVED || event == IB_CM_SIDR_REQ_RECEIVED; +} + +static void ib_ucm_cleanup_events(struct ib_ucm_context *ctx) +{ + struct ib_ucm_event *uevent; + + mutex_lock(&ctx->file->file_mutex); + list_del(&ctx->file_list); + while (!list_empty(&ctx->events)) { + + uevent = list_entry(ctx->events.next, + struct ib_ucm_event, ctx_list); + list_del(&uevent->file_list); + list_del(&uevent->ctx_list); + mutex_unlock(&ctx->file->file_mutex); + + /* clear incoming connections. */ + if (ib_ucm_new_cm_id(uevent->resp.event)) + ib_destroy_cm_id(uevent->cm_id); + + kfree(uevent); + mutex_lock(&ctx->file->file_mutex); + } + mutex_unlock(&ctx->file->file_mutex); +} + +static struct ib_ucm_context *ib_ucm_ctx_alloc(struct ib_ucm_file *file) +{ + struct ib_ucm_context *ctx; + + ctx = kzalloc(sizeof *ctx, GFP_KERNEL); + if (!ctx) + return NULL; + + atomic_set(&ctx->ref, 1); + init_completion(&ctx->comp); + ctx->file = file; + INIT_LIST_HEAD(&ctx->events); + + mutex_lock(&ctx_id_mutex); + ctx->id = idr_alloc(&ctx_id_table, ctx, 0, 0, GFP_KERNEL); + mutex_unlock(&ctx_id_mutex); + if (ctx->id < 0) + goto error; + + list_add_tail(&ctx->file_list, &file->ctxs); + return ctx; + +error: + kfree(ctx); + return NULL; +} + +static void ib_ucm_event_req_get(struct ib_ucm_req_event_resp *ureq, + struct ib_cm_req_event_param *kreq) +{ + ureq->remote_ca_guid = kreq->remote_ca_guid; + ureq->remote_qkey = kreq->remote_qkey; + ureq->remote_qpn = kreq->remote_qpn; + ureq->qp_type = kreq->qp_type; + ureq->starting_psn = kreq->starting_psn; + ureq->responder_resources = kreq->responder_resources; + ureq->initiator_depth = kreq->initiator_depth; + ureq->local_cm_response_timeout = kreq->local_cm_response_timeout; + ureq->flow_control = kreq->flow_control; + ureq->remote_cm_response_timeout = kreq->remote_cm_response_timeout; + ureq->retry_count = kreq->retry_count; + ureq->rnr_retry_count = kreq->rnr_retry_count; + ureq->srq = kreq->srq; + ureq->port = kreq->port; + + ib_copy_path_rec_to_user(&ureq->primary_path, kreq->primary_path); + if (kreq->alternate_path) + ib_copy_path_rec_to_user(&ureq->alternate_path, + kreq->alternate_path); +} + +static void ib_ucm_event_rep_get(struct ib_ucm_rep_event_resp *urep, + struct ib_cm_rep_event_param *krep) +{ + urep->remote_ca_guid = krep->remote_ca_guid; + urep->remote_qkey = krep->remote_qkey; + urep->remote_qpn = krep->remote_qpn; + urep->starting_psn = krep->starting_psn; + urep->responder_resources = krep->responder_resources; + urep->initiator_depth = krep->initiator_depth; + urep->target_ack_delay = krep->target_ack_delay; + urep->failover_accepted = krep->failover_accepted; + urep->flow_control = krep->flow_control; + urep->rnr_retry_count = krep->rnr_retry_count; + urep->srq = krep->srq; +} + +static void ib_ucm_event_sidr_rep_get(struct ib_ucm_sidr_rep_event_resp *urep, + struct ib_cm_sidr_rep_event_param *krep) +{ + urep->status = krep->status; + urep->qkey = krep->qkey; + urep->qpn = krep->qpn; +}; + +static int ib_ucm_event_process(struct ib_cm_event *evt, + struct ib_ucm_event *uvt) +{ + void *info = NULL; + + switch (evt->event) { + case IB_CM_REQ_RECEIVED: + ib_ucm_event_req_get(&uvt->resp.u.req_resp, + &evt->param.req_rcvd); + uvt->data_len = IB_CM_REQ_PRIVATE_DATA_SIZE; + uvt->resp.present = IB_UCM_PRES_PRIMARY; + uvt->resp.present |= (evt->param.req_rcvd.alternate_path ? + IB_UCM_PRES_ALTERNATE : 0); + break; + case IB_CM_REP_RECEIVED: + ib_ucm_event_rep_get(&uvt->resp.u.rep_resp, + &evt->param.rep_rcvd); + uvt->data_len = IB_CM_REP_PRIVATE_DATA_SIZE; + break; + case IB_CM_RTU_RECEIVED: + uvt->data_len = IB_CM_RTU_PRIVATE_DATA_SIZE; + uvt->resp.u.send_status = evt->param.send_status; + break; + case IB_CM_DREQ_RECEIVED: + uvt->data_len = IB_CM_DREQ_PRIVATE_DATA_SIZE; + uvt->resp.u.send_status = evt->param.send_status; + break; + case IB_CM_DREP_RECEIVED: + uvt->data_len = IB_CM_DREP_PRIVATE_DATA_SIZE; + uvt->resp.u.send_status = evt->param.send_status; + break; + case IB_CM_MRA_RECEIVED: + uvt->resp.u.mra_resp.timeout = + evt->param.mra_rcvd.service_timeout; + uvt->data_len = IB_CM_MRA_PRIVATE_DATA_SIZE; + break; + case IB_CM_REJ_RECEIVED: + uvt->resp.u.rej_resp.reason = evt->param.rej_rcvd.reason; + uvt->data_len = IB_CM_REJ_PRIVATE_DATA_SIZE; + uvt->info_len = evt->param.rej_rcvd.ari_length; + info = evt->param.rej_rcvd.ari; + break; + case IB_CM_LAP_RECEIVED: + ib_copy_path_rec_to_user(&uvt->resp.u.lap_resp.path, + evt->param.lap_rcvd.alternate_path); + uvt->data_len = IB_CM_LAP_PRIVATE_DATA_SIZE; + uvt->resp.present = IB_UCM_PRES_ALTERNATE; + break; + case IB_CM_APR_RECEIVED: + uvt->resp.u.apr_resp.status = evt->param.apr_rcvd.ap_status; + uvt->data_len = IB_CM_APR_PRIVATE_DATA_SIZE; + uvt->info_len = evt->param.apr_rcvd.info_len; + info = evt->param.apr_rcvd.apr_info; + break; + case IB_CM_SIDR_REQ_RECEIVED: + uvt->resp.u.sidr_req_resp.pkey = + evt->param.sidr_req_rcvd.pkey; + uvt->resp.u.sidr_req_resp.port = + evt->param.sidr_req_rcvd.port; + uvt->data_len = IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE; + break; + case IB_CM_SIDR_REP_RECEIVED: + ib_ucm_event_sidr_rep_get(&uvt->resp.u.sidr_rep_resp, + &evt->param.sidr_rep_rcvd); + uvt->data_len = IB_CM_SIDR_REP_PRIVATE_DATA_SIZE; + uvt->info_len = evt->param.sidr_rep_rcvd.info_len; + info = evt->param.sidr_rep_rcvd.info; + break; + default: + uvt->resp.u.send_status = evt->param.send_status; + break; + } + + if (uvt->data_len) { + uvt->data = kmemdup(evt->private_data, uvt->data_len, GFP_KERNEL); + if (!uvt->data) + goto err1; + + uvt->resp.present |= IB_UCM_PRES_DATA; + } + + if (uvt->info_len) { + uvt->info = kmemdup(info, uvt->info_len, GFP_KERNEL); + if (!uvt->info) + goto err2; + + uvt->resp.present |= IB_UCM_PRES_INFO; + } + return 0; + +err2: + kfree(uvt->data); +err1: + return -ENOMEM; +} + +static int ib_ucm_event_handler(struct ib_cm_id *cm_id, + struct ib_cm_event *event) +{ + struct ib_ucm_event *uevent; + struct ib_ucm_context *ctx; + int result = 0; + + ctx = cm_id->context; + + uevent = kzalloc(sizeof *uevent, GFP_KERNEL); + if (!uevent) + goto err1; + + uevent->ctx = ctx; + uevent->cm_id = cm_id; + uevent->resp.uid = ctx->uid; + uevent->resp.id = ctx->id; + uevent->resp.event = event->event; + + result = ib_ucm_event_process(event, uevent); + if (result) + goto err2; + + mutex_lock(&ctx->file->file_mutex); + list_add_tail(&uevent->file_list, &ctx->file->events); + list_add_tail(&uevent->ctx_list, &ctx->events); + wake_up_interruptible(&ctx->file->poll_wait); + mutex_unlock(&ctx->file->file_mutex); + return 0; + +err2: + kfree(uevent); +err1: + /* Destroy new cm_id's */ + return ib_ucm_new_cm_id(event->event); +} + +static ssize_t ib_ucm_event(struct ib_ucm_file *file, + const char __user *inbuf, + int in_len, int out_len) +{ + struct ib_ucm_context *ctx; + struct ib_ucm_event_get cmd; + struct ib_ucm_event *uevent; + int result = 0; + + if (out_len < sizeof(struct ib_ucm_event_resp)) + return -ENOSPC; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + mutex_lock(&file->file_mutex); + while (list_empty(&file->events)) { + mutex_unlock(&file->file_mutex); + + if (file->filp->f_flags & O_NONBLOCK) + return -EAGAIN; + + if (wait_event_interruptible(file->poll_wait, + !list_empty(&file->events))) + return -ERESTARTSYS; + + mutex_lock(&file->file_mutex); + } + + uevent = list_entry(file->events.next, struct ib_ucm_event, file_list); + + if (ib_ucm_new_cm_id(uevent->resp.event)) { + ctx = ib_ucm_ctx_alloc(file); + if (!ctx) { + result = -ENOMEM; + goto done; + } + + ctx->cm_id = uevent->cm_id; + ctx->cm_id->context = ctx; + uevent->resp.id = ctx->id; + } + + if (copy_to_user((void __user *)(unsigned long)cmd.response, + &uevent->resp, sizeof(uevent->resp))) { + result = -EFAULT; + goto done; + } + + if (uevent->data) { + if (cmd.data_len < uevent->data_len) { + result = -ENOMEM; + goto done; + } + if (copy_to_user((void __user *)(unsigned long)cmd.data, + uevent->data, uevent->data_len)) { + result = -EFAULT; + goto done; + } + } + + if (uevent->info) { + if (cmd.info_len < uevent->info_len) { + result = -ENOMEM; + goto done; + } + if (copy_to_user((void __user *)(unsigned long)cmd.info, + uevent->info, uevent->info_len)) { + result = -EFAULT; + goto done; + } + } + + list_del(&uevent->file_list); + list_del(&uevent->ctx_list); + uevent->ctx->events_reported++; + + kfree(uevent->data); + kfree(uevent->info); + kfree(uevent); +done: + mutex_unlock(&file->file_mutex); + return result; +} + +static ssize_t ib_ucm_create_id(struct ib_ucm_file *file, + const char __user *inbuf, + int in_len, int out_len) +{ + struct ib_ucm_create_id cmd; + struct ib_ucm_create_id_resp resp; + struct ib_ucm_context *ctx; + int result; + + if (out_len < sizeof(resp)) + return -ENOSPC; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + mutex_lock(&file->file_mutex); + ctx = ib_ucm_ctx_alloc(file); + mutex_unlock(&file->file_mutex); + if (!ctx) + return -ENOMEM; + + ctx->uid = cmd.uid; + ctx->cm_id = ib_create_cm_id(file->device->ib_dev, + ib_ucm_event_handler, ctx); + if (IS_ERR(ctx->cm_id)) { + result = PTR_ERR(ctx->cm_id); + goto err1; + } + + resp.id = ctx->id; + if (copy_to_user((void __user *)(unsigned long)cmd.response, + &resp, sizeof(resp))) { + result = -EFAULT; + goto err2; + } + return 0; + +err2: + ib_destroy_cm_id(ctx->cm_id); +err1: + mutex_lock(&ctx_id_mutex); + idr_remove(&ctx_id_table, ctx->id); + mutex_unlock(&ctx_id_mutex); + kfree(ctx); + return result; +} + +static ssize_t ib_ucm_destroy_id(struct ib_ucm_file *file, + const char __user *inbuf, + int in_len, int out_len) +{ + struct ib_ucm_destroy_id cmd; + struct ib_ucm_destroy_id_resp resp; + struct ib_ucm_context *ctx; + int result = 0; + + if (out_len < sizeof(resp)) + return -ENOSPC; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + mutex_lock(&ctx_id_mutex); + ctx = idr_find(&ctx_id_table, cmd.id); + if (!ctx) + ctx = ERR_PTR(-ENOENT); + else if (ctx->file != file) + ctx = ERR_PTR(-EINVAL); + else + idr_remove(&ctx_id_table, ctx->id); + mutex_unlock(&ctx_id_mutex); + + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + ib_ucm_ctx_put(ctx); + wait_for_completion(&ctx->comp); + + /* No new events will be generated after destroying the cm_id. */ + ib_destroy_cm_id(ctx->cm_id); + /* Cleanup events not yet reported to the user. */ + ib_ucm_cleanup_events(ctx); + + resp.events_reported = ctx->events_reported; + if (copy_to_user((void __user *)(unsigned long)cmd.response, + &resp, sizeof(resp))) + result = -EFAULT; + + kfree(ctx); + return result; +} + +static ssize_t ib_ucm_attr_id(struct ib_ucm_file *file, + const char __user *inbuf, + int in_len, int out_len) +{ + struct ib_ucm_attr_id_resp resp; + struct ib_ucm_attr_id cmd; + struct ib_ucm_context *ctx; + int result = 0; + + if (out_len < sizeof(resp)) + return -ENOSPC; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + ctx = ib_ucm_ctx_get(file, cmd.id); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + resp.service_id = ctx->cm_id->service_id; + resp.service_mask = ctx->cm_id->service_mask; + resp.local_id = ctx->cm_id->local_id; + resp.remote_id = ctx->cm_id->remote_id; + + if (copy_to_user((void __user *)(unsigned long)cmd.response, + &resp, sizeof(resp))) + result = -EFAULT; + + ib_ucm_ctx_put(ctx); + return result; +} + +static ssize_t ib_ucm_init_qp_attr(struct ib_ucm_file *file, + const char __user *inbuf, + int in_len, int out_len) +{ + struct ib_uverbs_qp_attr resp; + struct ib_ucm_init_qp_attr cmd; + struct ib_ucm_context *ctx; + struct ib_qp_attr qp_attr; + int result = 0; + + if (out_len < sizeof(resp)) + return -ENOSPC; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + ctx = ib_ucm_ctx_get(file, cmd.id); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + resp.qp_attr_mask = 0; + memset(&qp_attr, 0, sizeof qp_attr); + qp_attr.qp_state = cmd.qp_state; + result = ib_cm_init_qp_attr(ctx->cm_id, &qp_attr, &resp.qp_attr_mask); + if (result) + goto out; + + ib_copy_qp_attr_to_user(&resp, &qp_attr); + + if (copy_to_user((void __user *)(unsigned long)cmd.response, + &resp, sizeof(resp))) + result = -EFAULT; + +out: + ib_ucm_ctx_put(ctx); + return result; +} + +static int ucm_validate_listen(__be64 service_id, __be64 service_mask) +{ + service_id &= service_mask; + + if (((service_id & IB_CMA_SERVICE_ID_MASK) == IB_CMA_SERVICE_ID) || + ((service_id & IB_SDP_SERVICE_ID_MASK) == IB_SDP_SERVICE_ID)) + return -EINVAL; + + return 0; +} + +static ssize_t ib_ucm_listen(struct ib_ucm_file *file, + const char __user *inbuf, + int in_len, int out_len) +{ + struct ib_ucm_listen cmd; + struct ib_ucm_context *ctx; + int result; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + ctx = ib_ucm_ctx_get(file, cmd.id); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + result = ucm_validate_listen(cmd.service_id, cmd.service_mask); + if (result) + goto out; + + result = ib_cm_listen(ctx->cm_id, cmd.service_id, cmd.service_mask, + NULL); +out: + ib_ucm_ctx_put(ctx); + return result; +} + +static ssize_t ib_ucm_notify(struct ib_ucm_file *file, + const char __user *inbuf, + int in_len, int out_len) +{ + struct ib_ucm_notify cmd; + struct ib_ucm_context *ctx; + int result; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + ctx = ib_ucm_ctx_get(file, cmd.id); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + result = ib_cm_notify(ctx->cm_id, (enum ib_event_type) cmd.event); + ib_ucm_ctx_put(ctx); + return result; +} + +static int ib_ucm_alloc_data(const void **dest, u64 src, u32 len) +{ + void *data; + + *dest = NULL; + + if (!len) + return 0; + + data = memdup_user((void __user *)(unsigned long)src, len); + if (IS_ERR(data)) + return PTR_ERR(data); + + *dest = data; + return 0; +} + +static int ib_ucm_path_get(struct ib_sa_path_rec **path, u64 src) +{ + struct ib_user_path_rec upath; + struct ib_sa_path_rec *sa_path; + + *path = NULL; + + if (!src) + return 0; + + sa_path = kmalloc(sizeof(*sa_path), GFP_KERNEL); + if (!sa_path) + return -ENOMEM; + + if (copy_from_user(&upath, (void __user *)(unsigned long)src, + sizeof(upath))) { + + kfree(sa_path); + return -EFAULT; + } + + ib_copy_path_rec_from_user(sa_path, &upath); + *path = sa_path; + return 0; +} + +static ssize_t ib_ucm_send_req(struct ib_ucm_file *file, + const char __user *inbuf, + int in_len, int out_len) +{ + struct ib_cm_req_param param; + struct ib_ucm_context *ctx; + struct ib_ucm_req cmd; + int result; + + param.private_data = NULL; + param.primary_path = NULL; + param.alternate_path = NULL; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + result = ib_ucm_alloc_data(¶m.private_data, cmd.data, cmd.len); + if (result) + goto done; + + result = ib_ucm_path_get(¶m.primary_path, cmd.primary_path); + if (result) + goto done; + + result = ib_ucm_path_get(¶m.alternate_path, cmd.alternate_path); + if (result) + goto done; + + param.private_data_len = cmd.len; + param.service_id = cmd.sid; + param.qp_num = cmd.qpn; + param.qp_type = cmd.qp_type; + param.starting_psn = cmd.psn; + param.peer_to_peer = cmd.peer_to_peer; + param.responder_resources = cmd.responder_resources; + param.initiator_depth = cmd.initiator_depth; + param.remote_cm_response_timeout = cmd.remote_cm_response_timeout; + param.flow_control = cmd.flow_control; + param.local_cm_response_timeout = cmd.local_cm_response_timeout; + param.retry_count = cmd.retry_count; + param.rnr_retry_count = cmd.rnr_retry_count; + param.max_cm_retries = cmd.max_cm_retries; + param.srq = cmd.srq; + + ctx = ib_ucm_ctx_get(file, cmd.id); + if (!IS_ERR(ctx)) { + result = ib_send_cm_req(ctx->cm_id, ¶m); + ib_ucm_ctx_put(ctx); + } else + result = PTR_ERR(ctx); + +done: + kfree(param.private_data); + kfree(param.primary_path); + kfree(param.alternate_path); + return result; +} + +static ssize_t ib_ucm_send_rep(struct ib_ucm_file *file, + const char __user *inbuf, + int in_len, int out_len) +{ + struct ib_cm_rep_param param; + struct ib_ucm_context *ctx; + struct ib_ucm_rep cmd; + int result; + + param.private_data = NULL; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + result = ib_ucm_alloc_data(¶m.private_data, cmd.data, cmd.len); + if (result) + return result; + + param.qp_num = cmd.qpn; + param.starting_psn = cmd.psn; + param.private_data_len = cmd.len; + param.responder_resources = cmd.responder_resources; + param.initiator_depth = cmd.initiator_depth; + param.failover_accepted = cmd.failover_accepted; + param.flow_control = cmd.flow_control; + param.rnr_retry_count = cmd.rnr_retry_count; + param.srq = cmd.srq; + + ctx = ib_ucm_ctx_get(file, cmd.id); + if (!IS_ERR(ctx)) { + ctx->uid = cmd.uid; + result = ib_send_cm_rep(ctx->cm_id, ¶m); + ib_ucm_ctx_put(ctx); + } else + result = PTR_ERR(ctx); + + kfree(param.private_data); + return result; +} + +static ssize_t ib_ucm_send_private_data(struct ib_ucm_file *file, + const char __user *inbuf, int in_len, + int (*func)(struct ib_cm_id *cm_id, + const void *private_data, + u8 private_data_len)) +{ + struct ib_ucm_private_data cmd; + struct ib_ucm_context *ctx; + const void *private_data = NULL; + int result; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + result = ib_ucm_alloc_data(&private_data, cmd.data, cmd.len); + if (result) + return result; + + ctx = ib_ucm_ctx_get(file, cmd.id); + if (!IS_ERR(ctx)) { + result = func(ctx->cm_id, private_data, cmd.len); + ib_ucm_ctx_put(ctx); + } else + result = PTR_ERR(ctx); + + kfree(private_data); + return result; +} + +static ssize_t ib_ucm_send_rtu(struct ib_ucm_file *file, + const char __user *inbuf, + int in_len, int out_len) +{ + return ib_ucm_send_private_data(file, inbuf, in_len, ib_send_cm_rtu); +} + +static ssize_t ib_ucm_send_dreq(struct ib_ucm_file *file, + const char __user *inbuf, + int in_len, int out_len) +{ + return ib_ucm_send_private_data(file, inbuf, in_len, ib_send_cm_dreq); +} + +static ssize_t ib_ucm_send_drep(struct ib_ucm_file *file, + const char __user *inbuf, + int in_len, int out_len) +{ + return ib_ucm_send_private_data(file, inbuf, in_len, ib_send_cm_drep); +} + +static ssize_t ib_ucm_send_info(struct ib_ucm_file *file, + const char __user *inbuf, int in_len, + int (*func)(struct ib_cm_id *cm_id, + int status, + const void *info, + u8 info_len, + const void *data, + u8 data_len)) +{ + struct ib_ucm_context *ctx; + struct ib_ucm_info cmd; + const void *data = NULL; + const void *info = NULL; + int result; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + result = ib_ucm_alloc_data(&data, cmd.data, cmd.data_len); + if (result) + goto done; + + result = ib_ucm_alloc_data(&info, cmd.info, cmd.info_len); + if (result) + goto done; + + ctx = ib_ucm_ctx_get(file, cmd.id); + if (!IS_ERR(ctx)) { + result = func(ctx->cm_id, cmd.status, info, cmd.info_len, + data, cmd.data_len); + ib_ucm_ctx_put(ctx); + } else + result = PTR_ERR(ctx); + +done: + kfree(data); + kfree(info); + return result; +} + +static ssize_t ib_ucm_send_rej(struct ib_ucm_file *file, + const char __user *inbuf, + int in_len, int out_len) +{ + return ib_ucm_send_info(file, inbuf, in_len, (void *)ib_send_cm_rej); +} + +static ssize_t ib_ucm_send_apr(struct ib_ucm_file *file, + const char __user *inbuf, + int in_len, int out_len) +{ + return ib_ucm_send_info(file, inbuf, in_len, (void *)ib_send_cm_apr); +} + +static ssize_t ib_ucm_send_mra(struct ib_ucm_file *file, + const char __user *inbuf, + int in_len, int out_len) +{ + struct ib_ucm_context *ctx; + struct ib_ucm_mra cmd; + const void *data = NULL; + int result; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + result = ib_ucm_alloc_data(&data, cmd.data, cmd.len); + if (result) + return result; + + ctx = ib_ucm_ctx_get(file, cmd.id); + if (!IS_ERR(ctx)) { + result = ib_send_cm_mra(ctx->cm_id, cmd.timeout, data, cmd.len); + ib_ucm_ctx_put(ctx); + } else + result = PTR_ERR(ctx); + + kfree(data); + return result; +} + +static ssize_t ib_ucm_send_lap(struct ib_ucm_file *file, + const char __user *inbuf, + int in_len, int out_len) +{ + struct ib_ucm_context *ctx; + struct ib_sa_path_rec *path = NULL; + struct ib_ucm_lap cmd; + const void *data = NULL; + int result; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + result = ib_ucm_alloc_data(&data, cmd.data, cmd.len); + if (result) + goto done; + + result = ib_ucm_path_get(&path, cmd.path); + if (result) + goto done; + + ctx = ib_ucm_ctx_get(file, cmd.id); + if (!IS_ERR(ctx)) { + result = ib_send_cm_lap(ctx->cm_id, path, data, cmd.len); + ib_ucm_ctx_put(ctx); + } else + result = PTR_ERR(ctx); + +done: + kfree(data); + kfree(path); + return result; +} + +static ssize_t ib_ucm_send_sidr_req(struct ib_ucm_file *file, + const char __user *inbuf, + int in_len, int out_len) +{ + struct ib_cm_sidr_req_param param; + struct ib_ucm_context *ctx; + struct ib_ucm_sidr_req cmd; + int result; + + param.private_data = NULL; + param.path = NULL; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + result = ib_ucm_alloc_data(¶m.private_data, cmd.data, cmd.len); + if (result) + goto done; + + result = ib_ucm_path_get(¶m.path, cmd.path); + if (result) + goto done; + + param.private_data_len = cmd.len; + param.service_id = cmd.sid; + param.timeout_ms = cmd.timeout; + param.max_cm_retries = cmd.max_cm_retries; + + ctx = ib_ucm_ctx_get(file, cmd.id); + if (!IS_ERR(ctx)) { + result = ib_send_cm_sidr_req(ctx->cm_id, ¶m); + ib_ucm_ctx_put(ctx); + } else + result = PTR_ERR(ctx); + +done: + kfree(param.private_data); + kfree(param.path); + return result; +} + +static ssize_t ib_ucm_send_sidr_rep(struct ib_ucm_file *file, + const char __user *inbuf, + int in_len, int out_len) +{ + struct ib_cm_sidr_rep_param param; + struct ib_ucm_sidr_rep cmd; + struct ib_ucm_context *ctx; + int result; + + param.info = NULL; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + result = ib_ucm_alloc_data(¶m.private_data, + cmd.data, cmd.data_len); + if (result) + goto done; + + result = ib_ucm_alloc_data(¶m.info, cmd.info, cmd.info_len); + if (result) + goto done; + + param.qp_num = cmd.qpn; + param.qkey = cmd.qkey; + param.status = cmd.status; + param.info_length = cmd.info_len; + param.private_data_len = cmd.data_len; + + ctx = ib_ucm_ctx_get(file, cmd.id); + if (!IS_ERR(ctx)) { + result = ib_send_cm_sidr_rep(ctx->cm_id, ¶m); + ib_ucm_ctx_put(ctx); + } else + result = PTR_ERR(ctx); + +done: + kfree(param.private_data); + kfree(param.info); + return result; +} + +static ssize_t (*ucm_cmd_table[])(struct ib_ucm_file *file, + const char __user *inbuf, + int in_len, int out_len) = { + [IB_USER_CM_CMD_CREATE_ID] = ib_ucm_create_id, + [IB_USER_CM_CMD_DESTROY_ID] = ib_ucm_destroy_id, + [IB_USER_CM_CMD_ATTR_ID] = ib_ucm_attr_id, + [IB_USER_CM_CMD_LISTEN] = ib_ucm_listen, + [IB_USER_CM_CMD_NOTIFY] = ib_ucm_notify, + [IB_USER_CM_CMD_SEND_REQ] = ib_ucm_send_req, + [IB_USER_CM_CMD_SEND_REP] = ib_ucm_send_rep, + [IB_USER_CM_CMD_SEND_RTU] = ib_ucm_send_rtu, + [IB_USER_CM_CMD_SEND_DREQ] = ib_ucm_send_dreq, + [IB_USER_CM_CMD_SEND_DREP] = ib_ucm_send_drep, + [IB_USER_CM_CMD_SEND_REJ] = ib_ucm_send_rej, + [IB_USER_CM_CMD_SEND_MRA] = ib_ucm_send_mra, + [IB_USER_CM_CMD_SEND_LAP] = ib_ucm_send_lap, + [IB_USER_CM_CMD_SEND_APR] = ib_ucm_send_apr, + [IB_USER_CM_CMD_SEND_SIDR_REQ] = ib_ucm_send_sidr_req, + [IB_USER_CM_CMD_SEND_SIDR_REP] = ib_ucm_send_sidr_rep, + [IB_USER_CM_CMD_EVENT] = ib_ucm_event, + [IB_USER_CM_CMD_INIT_QP_ATTR] = ib_ucm_init_qp_attr, +}; + +static ssize_t ib_ucm_write(struct file *filp, const char __user *buf, + size_t len, loff_t *pos) +{ + struct ib_ucm_file *file = filp->private_data; + struct ib_ucm_cmd_hdr hdr; + ssize_t result; + + if (len < sizeof(hdr)) + return -EINVAL; + + if (copy_from_user(&hdr, buf, sizeof(hdr))) + return -EFAULT; + + if (hdr.cmd >= ARRAY_SIZE(ucm_cmd_table)) + return -EINVAL; + + if (hdr.in + sizeof(hdr) > len) + return -EINVAL; + + result = ucm_cmd_table[hdr.cmd](file, buf + sizeof(hdr), + hdr.in, hdr.out); + if (!result) + result = len; + + return result; +} + +static unsigned int ib_ucm_poll(struct file *filp, + struct poll_table_struct *wait) +{ + struct ib_ucm_file *file = filp->private_data; + unsigned int mask = 0; + + poll_wait(filp, &file->poll_wait, wait); + + if (!list_empty(&file->events)) + mask = POLLIN | POLLRDNORM; + + return mask; +} + +/* + * ib_ucm_open() does not need the BKL: + * + * - no global state is referred to; + * - there is no ioctl method to race against; + * - no further module initialization is required for open to work + * after the device is registered. + */ +static int ib_ucm_open(struct inode *inode, struct file *filp) +{ + struct ib_ucm_file *file; + + file = kmalloc(sizeof(*file), GFP_KERNEL); + if (!file) + return -ENOMEM; + + INIT_LIST_HEAD(&file->events); + INIT_LIST_HEAD(&file->ctxs); + init_waitqueue_head(&file->poll_wait); + + mutex_init(&file->file_mutex); + + filp->private_data = file; + file->filp = filp; + file->device = container_of(inode->i_cdev, struct ib_ucm_device, cdev); + + return nonseekable_open(inode, filp); +} + +static int ib_ucm_close(struct inode *inode, struct file *filp) +{ + struct ib_ucm_file *file = filp->private_data; + struct ib_ucm_context *ctx; + + mutex_lock(&file->file_mutex); + while (!list_empty(&file->ctxs)) { + ctx = list_entry(file->ctxs.next, + struct ib_ucm_context, file_list); + mutex_unlock(&file->file_mutex); + + mutex_lock(&ctx_id_mutex); + idr_remove(&ctx_id_table, ctx->id); + mutex_unlock(&ctx_id_mutex); + + ib_destroy_cm_id(ctx->cm_id); + ib_ucm_cleanup_events(ctx); + kfree(ctx); + + mutex_lock(&file->file_mutex); + } + mutex_unlock(&file->file_mutex); + kfree(file); + return 0; +} + +static void ib_ucm_release_dev(struct device *dev) +{ + struct ib_ucm_device *ucm_dev; + + ucm_dev = container_of(dev, struct ib_ucm_device, dev); + cdev_del(&ucm_dev->cdev); + if (ucm_dev->devnum < IB_UCM_MAX_DEVICES) + clear_bit(ucm_dev->devnum, dev_map); + else + clear_bit(ucm_dev->devnum - IB_UCM_MAX_DEVICES, dev_map); + kfree(ucm_dev); +} + +static const struct file_operations ucm_fops = { + .owner = THIS_MODULE, + .open = ib_ucm_open, + .release = ib_ucm_close, + .write = ib_ucm_write, + .poll = ib_ucm_poll, + .llseek = no_llseek, +}; + +static ssize_t show_ibdev(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct ib_ucm_device *ucm_dev; + + ucm_dev = container_of(dev, struct ib_ucm_device, dev); + return sprintf(buf, "%s\n", ucm_dev->ib_dev->name); +} +static DEVICE_ATTR(ibdev, S_IRUGO, show_ibdev, NULL); + +static dev_t overflow_maj; +static DECLARE_BITMAP(overflow_map, IB_UCM_MAX_DEVICES); +static int find_overflow_devnum(void) +{ + int ret; + + if (!overflow_maj) { + ret = alloc_chrdev_region(&overflow_maj, 0, IB_UCM_MAX_DEVICES, + "infiniband_cm"); + if (ret) { + printk(KERN_ERR "ucm: couldn't register dynamic device number\n"); + return ret; + } + } + + ret = find_first_zero_bit(overflow_map, IB_UCM_MAX_DEVICES); + if (ret >= IB_UCM_MAX_DEVICES) + return -1; + + return ret; +} + +static void ib_ucm_add_one(struct ib_device *device) +{ + int devnum; + dev_t base; + struct ib_ucm_device *ucm_dev; + + if (!device->alloc_ucontext || + rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB) + return; + + ucm_dev = kzalloc(sizeof *ucm_dev, GFP_KERNEL); + if (!ucm_dev) + return; + + ucm_dev->ib_dev = device; + + devnum = find_first_zero_bit(dev_map, IB_UCM_MAX_DEVICES); + if (devnum >= IB_UCM_MAX_DEVICES) { + devnum = find_overflow_devnum(); + if (devnum < 0) + goto err; + + ucm_dev->devnum = devnum + IB_UCM_MAX_DEVICES; + base = devnum + overflow_maj; + set_bit(devnum, overflow_map); + } else { + ucm_dev->devnum = devnum; + base = devnum + IB_UCM_BASE_DEV; + set_bit(devnum, dev_map); + } + + cdev_init(&ucm_dev->cdev, &ucm_fops); + ucm_dev->cdev.owner = THIS_MODULE; + kobject_set_name(&ucm_dev->cdev.kobj, "ucm%d", ucm_dev->devnum); + if (cdev_add(&ucm_dev->cdev, base, 1)) + goto err; + + ucm_dev->dev.class = &cm_class; + ucm_dev->dev.parent = device->dma_device; + ucm_dev->dev.devt = ucm_dev->cdev.dev; + ucm_dev->dev.release = ib_ucm_release_dev; + dev_set_name(&ucm_dev->dev, "ucm%d", ucm_dev->devnum); + if (device_register(&ucm_dev->dev)) + goto err_cdev; + + if (device_create_file(&ucm_dev->dev, &dev_attr_ibdev)) + goto err_dev; + + ib_set_client_data(device, &ucm_client, ucm_dev); + return; + +err_dev: + device_unregister(&ucm_dev->dev); +err_cdev: + cdev_del(&ucm_dev->cdev); + if (ucm_dev->devnum < IB_UCM_MAX_DEVICES) + clear_bit(devnum, dev_map); + else + clear_bit(devnum, overflow_map); +err: + kfree(ucm_dev); + return; +} + +static void ib_ucm_remove_one(struct ib_device *device) +{ + struct ib_ucm_device *ucm_dev = ib_get_client_data(device, &ucm_client); + + if (!ucm_dev) + return; + + device_unregister(&ucm_dev->dev); +} + +static CLASS_ATTR_STRING(abi_version, S_IRUGO, + __stringify(IB_USER_CM_ABI_VERSION)); + +static int __init ib_ucm_init(void) +{ + int ret; + + ret = register_chrdev_region(IB_UCM_BASE_DEV, IB_UCM_MAX_DEVICES, + "infiniband_cm"); + if (ret) { + printk(KERN_ERR "ucm: couldn't register device number\n"); + goto error1; + } + + ret = class_create_file(&cm_class, &class_attr_abi_version.attr); + if (ret) { + printk(KERN_ERR "ucm: couldn't create abi_version attribute\n"); + goto error2; + } + + ret = ib_register_client(&ucm_client); + if (ret) { + printk(KERN_ERR "ucm: couldn't register client\n"); + goto error3; + } + return 0; + +error3: + class_remove_file(&cm_class, &class_attr_abi_version.attr); +error2: + unregister_chrdev_region(IB_UCM_BASE_DEV, IB_UCM_MAX_DEVICES); +error1: + return ret; +} + +static void __exit ib_ucm_cleanup(void) +{ + ib_unregister_client(&ucm_client); + class_remove_file(&cm_class, &class_attr_abi_version.attr); + unregister_chrdev_region(IB_UCM_BASE_DEV, IB_UCM_MAX_DEVICES); + if (overflow_maj) + unregister_chrdev_region(overflow_maj, IB_UCM_MAX_DEVICES); + idr_destroy(&ctx_id_table); +} + +module_init(ib_ucm_init); +module_exit(ib_ucm_cleanup); diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c new file mode 100644 index 0000000..56a4b7c --- /dev/null +++ b/drivers/infiniband/core/ucma.c @@ -0,0 +1,1632 @@ +/* + * Copyright (c) 2005-2006 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +MODULE_AUTHOR("Sean Hefty"); +MODULE_DESCRIPTION("RDMA Userspace Connection Manager Access"); +MODULE_LICENSE("Dual BSD/GPL"); + +static unsigned int max_backlog = 1024; + +static struct ctl_table_header *ucma_ctl_table_hdr; +static struct ctl_table ucma_ctl_table[] = { + { + .procname = "max_backlog", + .data = &max_backlog, + .maxlen = sizeof max_backlog, + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { } +}; + +struct ucma_file { + struct mutex mut; + struct file *filp; + struct list_head ctx_list; + struct list_head event_list; + wait_queue_head_t poll_wait; +}; + +struct ucma_context { + int id; + struct completion comp; + atomic_t ref; + int events_reported; + int backlog; + + struct ucma_file *file; + struct rdma_cm_id *cm_id; + u64 uid; + + struct list_head list; + struct list_head mc_list; +}; + +struct ucma_multicast { + struct ucma_context *ctx; + int id; + int events_reported; + + u64 uid; + struct list_head list; + struct sockaddr_storage addr; +}; + +struct ucma_event { + struct ucma_context *ctx; + struct ucma_multicast *mc; + struct list_head list; + struct rdma_cm_id *cm_id; + struct rdma_ucm_event_resp resp; +}; + +static DEFINE_MUTEX(mut); +static DEFINE_IDR(ctx_idr); +static DEFINE_IDR(multicast_idr); + +static inline struct ucma_context *_ucma_find_context(int id, + struct ucma_file *file) +{ + struct ucma_context *ctx; + + ctx = idr_find(&ctx_idr, id); + if (!ctx) + ctx = ERR_PTR(-ENOENT); + else if (ctx->file != file) + ctx = ERR_PTR(-EINVAL); + return ctx; +} + +static struct ucma_context *ucma_get_ctx(struct ucma_file *file, int id) +{ + struct ucma_context *ctx; + + mutex_lock(&mut); + ctx = _ucma_find_context(id, file); + if (!IS_ERR(ctx)) + atomic_inc(&ctx->ref); + mutex_unlock(&mut); + return ctx; +} + +static void ucma_put_ctx(struct ucma_context *ctx) +{ + if (atomic_dec_and_test(&ctx->ref)) + complete(&ctx->comp); +} + +static struct ucma_context *ucma_alloc_ctx(struct ucma_file *file) +{ + struct ucma_context *ctx; + + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return NULL; + + atomic_set(&ctx->ref, 1); + init_completion(&ctx->comp); + INIT_LIST_HEAD(&ctx->mc_list); + ctx->file = file; + + mutex_lock(&mut); + ctx->id = idr_alloc(&ctx_idr, ctx, 0, 0, GFP_KERNEL); + mutex_unlock(&mut); + if (ctx->id < 0) + goto error; + + list_add_tail(&ctx->list, &file->ctx_list); + return ctx; + +error: + kfree(ctx); + return NULL; +} + +static struct ucma_multicast* ucma_alloc_multicast(struct ucma_context *ctx) +{ + struct ucma_multicast *mc; + + mc = kzalloc(sizeof(*mc), GFP_KERNEL); + if (!mc) + return NULL; + + mutex_lock(&mut); + mc->id = idr_alloc(&multicast_idr, mc, 0, 0, GFP_KERNEL); + mutex_unlock(&mut); + if (mc->id < 0) + goto error; + + mc->ctx = ctx; + list_add_tail(&mc->list, &ctx->mc_list); + return mc; + +error: + kfree(mc); + return NULL; +} + +static void ucma_copy_conn_event(struct rdma_ucm_conn_param *dst, + struct rdma_conn_param *src) +{ + if (src->private_data_len) + memcpy(dst->private_data, src->private_data, + src->private_data_len); + dst->private_data_len = src->private_data_len; + dst->responder_resources =src->responder_resources; + dst->initiator_depth = src->initiator_depth; + dst->flow_control = src->flow_control; + dst->retry_count = src->retry_count; + dst->rnr_retry_count = src->rnr_retry_count; + dst->srq = src->srq; + dst->qp_num = src->qp_num; +} + +static void ucma_copy_ud_event(struct rdma_ucm_ud_param *dst, + struct rdma_ud_param *src) +{ + if (src->private_data_len) + memcpy(dst->private_data, src->private_data, + src->private_data_len); + dst->private_data_len = src->private_data_len; + ib_copy_ah_attr_to_user(&dst->ah_attr, &src->ah_attr); + dst->qp_num = src->qp_num; + dst->qkey = src->qkey; +} + +static void ucma_set_event_context(struct ucma_context *ctx, + struct rdma_cm_event *event, + struct ucma_event *uevent) +{ + uevent->ctx = ctx; + switch (event->event) { + case RDMA_CM_EVENT_MULTICAST_JOIN: + case RDMA_CM_EVENT_MULTICAST_ERROR: + uevent->mc = (struct ucma_multicast *) + event->param.ud.private_data; + uevent->resp.uid = uevent->mc->uid; + uevent->resp.id = uevent->mc->id; + break; + default: + uevent->resp.uid = ctx->uid; + uevent->resp.id = ctx->id; + break; + } +} + +static int ucma_event_handler(struct rdma_cm_id *cm_id, + struct rdma_cm_event *event) +{ + struct ucma_event *uevent; + struct ucma_context *ctx = cm_id->context; + int ret = 0; + + uevent = kzalloc(sizeof(*uevent), GFP_KERNEL); + if (!uevent) + return event->event == RDMA_CM_EVENT_CONNECT_REQUEST; + + mutex_lock(&ctx->file->mut); + uevent->cm_id = cm_id; + ucma_set_event_context(ctx, event, uevent); + uevent->resp.event = event->event; + uevent->resp.status = event->status; + if (cm_id->qp_type == IB_QPT_UD) + ucma_copy_ud_event(&uevent->resp.param.ud, &event->param.ud); + else + ucma_copy_conn_event(&uevent->resp.param.conn, + &event->param.conn); + + if (event->event == RDMA_CM_EVENT_CONNECT_REQUEST) { + if (!ctx->backlog) { + ret = -ENOMEM; + kfree(uevent); + goto out; + } + ctx->backlog--; + } else if (!ctx->uid || ctx->cm_id != cm_id) { + /* + * We ignore events for new connections until userspace has set + * their context. This can only happen if an error occurs on a + * new connection before the user accepts it. This is okay, + * since the accept will just fail later. + */ + kfree(uevent); + goto out; + } + + list_add_tail(&uevent->list, &ctx->file->event_list); + wake_up_interruptible(&ctx->file->poll_wait); +out: + mutex_unlock(&ctx->file->mut); + return ret; +} + +static ssize_t ucma_get_event(struct ucma_file *file, const char __user *inbuf, + int in_len, int out_len) +{ + struct ucma_context *ctx; + struct rdma_ucm_get_event cmd; + struct ucma_event *uevent; + int ret = 0; + + if (out_len < sizeof uevent->resp) + return -ENOSPC; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + mutex_lock(&file->mut); + while (list_empty(&file->event_list)) { + mutex_unlock(&file->mut); + + if (file->filp->f_flags & O_NONBLOCK) + return -EAGAIN; + + if (wait_event_interruptible(file->poll_wait, + !list_empty(&file->event_list))) + return -ERESTARTSYS; + + mutex_lock(&file->mut); + } + + uevent = list_entry(file->event_list.next, struct ucma_event, list); + + if (uevent->resp.event == RDMA_CM_EVENT_CONNECT_REQUEST) { + ctx = ucma_alloc_ctx(file); + if (!ctx) { + ret = -ENOMEM; + goto done; + } + uevent->ctx->backlog++; + ctx->cm_id = uevent->cm_id; + ctx->cm_id->context = ctx; + uevent->resp.id = ctx->id; + } + + if (copy_to_user((void __user *)(unsigned long)cmd.response, + &uevent->resp, sizeof uevent->resp)) { + ret = -EFAULT; + goto done; + } + + list_del(&uevent->list); + uevent->ctx->events_reported++; + if (uevent->mc) + uevent->mc->events_reported++; + kfree(uevent); +done: + mutex_unlock(&file->mut); + return ret; +} + +static int ucma_get_qp_type(struct rdma_ucm_create_id *cmd, enum ib_qp_type *qp_type) +{ + switch (cmd->ps) { + case RDMA_PS_TCP: + *qp_type = IB_QPT_RC; + return 0; + case RDMA_PS_UDP: + case RDMA_PS_IPOIB: + *qp_type = IB_QPT_UD; + return 0; + case RDMA_PS_IB: + *qp_type = cmd->qp_type; + return 0; + default: + return -EINVAL; + } +} + +static ssize_t ucma_create_id(struct ucma_file *file, const char __user *inbuf, + int in_len, int out_len) +{ + struct rdma_ucm_create_id cmd; + struct rdma_ucm_create_id_resp resp; + struct ucma_context *ctx; + enum ib_qp_type qp_type; + int ret; + + if (out_len < sizeof(resp)) + return -ENOSPC; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + ret = ucma_get_qp_type(&cmd, &qp_type); + if (ret) + return ret; + + mutex_lock(&file->mut); + ctx = ucma_alloc_ctx(file); + mutex_unlock(&file->mut); + if (!ctx) + return -ENOMEM; + + ctx->uid = cmd.uid; + ctx->cm_id = rdma_create_id(ucma_event_handler, ctx, cmd.ps, qp_type); + if (IS_ERR(ctx->cm_id)) { + ret = PTR_ERR(ctx->cm_id); + goto err1; + } + + resp.id = ctx->id; + if (copy_to_user((void __user *)(unsigned long)cmd.response, + &resp, sizeof(resp))) { + ret = -EFAULT; + goto err2; + } + return 0; + +err2: + rdma_destroy_id(ctx->cm_id); +err1: + mutex_lock(&mut); + idr_remove(&ctx_idr, ctx->id); + mutex_unlock(&mut); + kfree(ctx); + return ret; +} + +static void ucma_cleanup_multicast(struct ucma_context *ctx) +{ + struct ucma_multicast *mc, *tmp; + + mutex_lock(&mut); + list_for_each_entry_safe(mc, tmp, &ctx->mc_list, list) { + list_del(&mc->list); + idr_remove(&multicast_idr, mc->id); + kfree(mc); + } + mutex_unlock(&mut); +} + +static void ucma_cleanup_mc_events(struct ucma_multicast *mc) +{ + struct ucma_event *uevent, *tmp; + + list_for_each_entry_safe(uevent, tmp, &mc->ctx->file->event_list, list) { + if (uevent->mc != mc) + continue; + + list_del(&uevent->list); + kfree(uevent); + } +} + +/* + * We cannot hold file->mut when calling rdma_destroy_id() or we can + * deadlock. We also acquire file->mut in ucma_event_handler(), and + * rdma_destroy_id() will wait until all callbacks have completed. + */ +static int ucma_free_ctx(struct ucma_context *ctx) +{ + int events_reported; + struct ucma_event *uevent, *tmp; + LIST_HEAD(list); + + /* No new events will be generated after destroying the id. */ + rdma_destroy_id(ctx->cm_id); + + ucma_cleanup_multicast(ctx); + + /* Cleanup events not yet reported to the user. */ + mutex_lock(&ctx->file->mut); + list_for_each_entry_safe(uevent, tmp, &ctx->file->event_list, list) { + if (uevent->ctx == ctx) + list_move_tail(&uevent->list, &list); + } + list_del(&ctx->list); + mutex_unlock(&ctx->file->mut); + + list_for_each_entry_safe(uevent, tmp, &list, list) { + list_del(&uevent->list); + if (uevent->resp.event == RDMA_CM_EVENT_CONNECT_REQUEST) + rdma_destroy_id(uevent->cm_id); + kfree(uevent); + } + + events_reported = ctx->events_reported; + kfree(ctx); + return events_reported; +} + +static ssize_t ucma_destroy_id(struct ucma_file *file, const char __user *inbuf, + int in_len, int out_len) +{ + struct rdma_ucm_destroy_id cmd; + struct rdma_ucm_destroy_id_resp resp; + struct ucma_context *ctx; + int ret = 0; + + if (out_len < sizeof(resp)) + return -ENOSPC; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + mutex_lock(&mut); + ctx = _ucma_find_context(cmd.id, file); + if (!IS_ERR(ctx)) + idr_remove(&ctx_idr, ctx->id); + mutex_unlock(&mut); + + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + ucma_put_ctx(ctx); + wait_for_completion(&ctx->comp); + resp.events_reported = ucma_free_ctx(ctx); + + if (copy_to_user((void __user *)(unsigned long)cmd.response, + &resp, sizeof(resp))) + ret = -EFAULT; + + return ret; +} + +static ssize_t ucma_bind_ip(struct ucma_file *file, const char __user *inbuf, + int in_len, int out_len) +{ + struct rdma_ucm_bind_ip cmd; + struct ucma_context *ctx; + int ret; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + ctx = ucma_get_ctx(file, cmd.id); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + ret = rdma_bind_addr(ctx->cm_id, (struct sockaddr *) &cmd.addr); + ucma_put_ctx(ctx); + return ret; +} + +static ssize_t ucma_bind(struct ucma_file *file, const char __user *inbuf, + int in_len, int out_len) +{ + struct rdma_ucm_bind cmd; + struct sockaddr *addr; + struct ucma_context *ctx; + int ret; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + addr = (struct sockaddr *) &cmd.addr; + if (cmd.reserved || !cmd.addr_size || (cmd.addr_size != rdma_addr_size(addr))) + return -EINVAL; + + ctx = ucma_get_ctx(file, cmd.id); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + ret = rdma_bind_addr(ctx->cm_id, addr); + ucma_put_ctx(ctx); + return ret; +} + +static ssize_t ucma_resolve_ip(struct ucma_file *file, + const char __user *inbuf, + int in_len, int out_len) +{ + struct rdma_ucm_resolve_ip cmd; + struct ucma_context *ctx; + int ret; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + ctx = ucma_get_ctx(file, cmd.id); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + ret = rdma_resolve_addr(ctx->cm_id, (struct sockaddr *) &cmd.src_addr, + (struct sockaddr *) &cmd.dst_addr, + cmd.timeout_ms); + ucma_put_ctx(ctx); + return ret; +} + +static ssize_t ucma_resolve_addr(struct ucma_file *file, + const char __user *inbuf, + int in_len, int out_len) +{ + struct rdma_ucm_resolve_addr cmd; + struct sockaddr *src, *dst; + struct ucma_context *ctx; + int ret; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + src = (struct sockaddr *) &cmd.src_addr; + dst = (struct sockaddr *) &cmd.dst_addr; + if (cmd.reserved || (cmd.src_size && (cmd.src_size != rdma_addr_size(src))) || + !cmd.dst_size || (cmd.dst_size != rdma_addr_size(dst))) + return -EINVAL; + + ctx = ucma_get_ctx(file, cmd.id); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + ret = rdma_resolve_addr(ctx->cm_id, src, dst, cmd.timeout_ms); + ucma_put_ctx(ctx); + return ret; +} + +static ssize_t ucma_resolve_route(struct ucma_file *file, + const char __user *inbuf, + int in_len, int out_len) +{ + struct rdma_ucm_resolve_route cmd; + struct ucma_context *ctx; + int ret; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + ctx = ucma_get_ctx(file, cmd.id); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + ret = rdma_resolve_route(ctx->cm_id, cmd.timeout_ms); + ucma_put_ctx(ctx); + return ret; +} + +static void ucma_copy_ib_route(struct rdma_ucm_query_route_resp *resp, + struct rdma_route *route) +{ + struct rdma_dev_addr *dev_addr; + + resp->num_paths = route->num_paths; + switch (route->num_paths) { + case 0: + dev_addr = &route->addr.dev_addr; + rdma_addr_get_dgid(dev_addr, + (union ib_gid *) &resp->ib_route[0].dgid); + rdma_addr_get_sgid(dev_addr, + (union ib_gid *) &resp->ib_route[0].sgid); + resp->ib_route[0].pkey = cpu_to_be16(ib_addr_get_pkey(dev_addr)); + break; + case 2: + ib_copy_path_rec_to_user(&resp->ib_route[1], + &route->path_rec[1]); + /* fall through */ + case 1: + ib_copy_path_rec_to_user(&resp->ib_route[0], + &route->path_rec[0]); + break; + default: + break; + } +} + +static void ucma_copy_iboe_route(struct rdma_ucm_query_route_resp *resp, + struct rdma_route *route) +{ + + resp->num_paths = route->num_paths; + switch (route->num_paths) { + case 0: + rdma_ip2gid((struct sockaddr *)&route->addr.dst_addr, + (union ib_gid *)&resp->ib_route[0].dgid); + rdma_ip2gid((struct sockaddr *)&route->addr.src_addr, + (union ib_gid *)&resp->ib_route[0].sgid); + resp->ib_route[0].pkey = cpu_to_be16(0xffff); + break; + case 2: + ib_copy_path_rec_to_user(&resp->ib_route[1], + &route->path_rec[1]); + /* fall through */ + case 1: + ib_copy_path_rec_to_user(&resp->ib_route[0], + &route->path_rec[0]); + break; + default: + break; + } +} + +static void ucma_copy_iw_route(struct rdma_ucm_query_route_resp *resp, + struct rdma_route *route) +{ + struct rdma_dev_addr *dev_addr; + + dev_addr = &route->addr.dev_addr; + rdma_addr_get_dgid(dev_addr, (union ib_gid *) &resp->ib_route[0].dgid); + rdma_addr_get_sgid(dev_addr, (union ib_gid *) &resp->ib_route[0].sgid); +} + +static ssize_t ucma_query_route(struct ucma_file *file, + const char __user *inbuf, + int in_len, int out_len) +{ + struct rdma_ucm_query cmd; + struct rdma_ucm_query_route_resp resp; + struct ucma_context *ctx; + struct sockaddr *addr; + int ret = 0; + + if (out_len < sizeof(resp)) + return -ENOSPC; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + ctx = ucma_get_ctx(file, cmd.id); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + memset(&resp, 0, sizeof resp); + addr = (struct sockaddr *) &ctx->cm_id->route.addr.src_addr; + memcpy(&resp.src_addr, addr, addr->sa_family == AF_INET ? + sizeof(struct sockaddr_in) : + sizeof(struct sockaddr_in6)); + addr = (struct sockaddr *) &ctx->cm_id->route.addr.dst_addr; + memcpy(&resp.dst_addr, addr, addr->sa_family == AF_INET ? + sizeof(struct sockaddr_in) : + sizeof(struct sockaddr_in6)); + if (!ctx->cm_id->device) + goto out; + + resp.node_guid = (__force __u64) ctx->cm_id->device->node_guid; + resp.port_num = ctx->cm_id->port_num; + switch (rdma_node_get_transport(ctx->cm_id->device->node_type)) { + case RDMA_TRANSPORT_IB: + switch (rdma_port_get_link_layer(ctx->cm_id->device, + ctx->cm_id->port_num)) { + case IB_LINK_LAYER_INFINIBAND: + ucma_copy_ib_route(&resp, &ctx->cm_id->route); + break; + case IB_LINK_LAYER_ETHERNET: + ucma_copy_iboe_route(&resp, &ctx->cm_id->route); + break; + default: + break; + } + break; + case RDMA_TRANSPORT_IWARP: + ucma_copy_iw_route(&resp, &ctx->cm_id->route); + break; + default: + break; + } + +out: + if (copy_to_user((void __user *)(unsigned long)cmd.response, + &resp, sizeof(resp))) + ret = -EFAULT; + + ucma_put_ctx(ctx); + return ret; +} + +static void ucma_query_device_addr(struct rdma_cm_id *cm_id, + struct rdma_ucm_query_addr_resp *resp) +{ + if (!cm_id->device) + return; + + resp->node_guid = (__force __u64) cm_id->device->node_guid; + resp->port_num = cm_id->port_num; + resp->pkey = (__force __u16) cpu_to_be16( + ib_addr_get_pkey(&cm_id->route.addr.dev_addr)); +} + +static ssize_t ucma_query_addr(struct ucma_context *ctx, + void __user *response, int out_len) +{ + struct rdma_ucm_query_addr_resp resp; + struct sockaddr *addr; + int ret = 0; + + if (out_len < sizeof(resp)) + return -ENOSPC; + + memset(&resp, 0, sizeof resp); + + addr = (struct sockaddr *) &ctx->cm_id->route.addr.src_addr; + resp.src_size = rdma_addr_size(addr); + memcpy(&resp.src_addr, addr, resp.src_size); + + addr = (struct sockaddr *) &ctx->cm_id->route.addr.dst_addr; + resp.dst_size = rdma_addr_size(addr); + memcpy(&resp.dst_addr, addr, resp.dst_size); + + ucma_query_device_addr(ctx->cm_id, &resp); + + if (copy_to_user(response, &resp, sizeof(resp))) + ret = -EFAULT; + + return ret; +} + +static ssize_t ucma_query_path(struct ucma_context *ctx, + void __user *response, int out_len) +{ + struct rdma_ucm_query_path_resp *resp; + int i, ret = 0; + + if (out_len < sizeof(*resp)) + return -ENOSPC; + + resp = kzalloc(out_len, GFP_KERNEL); + if (!resp) + return -ENOMEM; + + resp->num_paths = ctx->cm_id->route.num_paths; + for (i = 0, out_len -= sizeof(*resp); + i < resp->num_paths && out_len > sizeof(struct ib_path_rec_data); + i++, out_len -= sizeof(struct ib_path_rec_data)) { + + resp->path_data[i].flags = IB_PATH_GMP | IB_PATH_PRIMARY | + IB_PATH_BIDIRECTIONAL; + ib_sa_pack_path(&ctx->cm_id->route.path_rec[i], + &resp->path_data[i].path_rec); + } + + if (copy_to_user(response, resp, + sizeof(*resp) + (i * sizeof(struct ib_path_rec_data)))) + ret = -EFAULT; + + kfree(resp); + return ret; +} + +static ssize_t ucma_query_gid(struct ucma_context *ctx, + void __user *response, int out_len) +{ + struct rdma_ucm_query_addr_resp resp; + struct sockaddr_ib *addr; + int ret = 0; + + if (out_len < sizeof(resp)) + return -ENOSPC; + + memset(&resp, 0, sizeof resp); + + ucma_query_device_addr(ctx->cm_id, &resp); + + addr = (struct sockaddr_ib *) &resp.src_addr; + resp.src_size = sizeof(*addr); + if (ctx->cm_id->route.addr.src_addr.ss_family == AF_IB) { + memcpy(addr, &ctx->cm_id->route.addr.src_addr, resp.src_size); + } else { + addr->sib_family = AF_IB; + addr->sib_pkey = (__force __be16) resp.pkey; + rdma_addr_get_sgid(&ctx->cm_id->route.addr.dev_addr, + (union ib_gid *) &addr->sib_addr); + addr->sib_sid = rdma_get_service_id(ctx->cm_id, (struct sockaddr *) + &ctx->cm_id->route.addr.src_addr); + } + + addr = (struct sockaddr_ib *) &resp.dst_addr; + resp.dst_size = sizeof(*addr); + if (ctx->cm_id->route.addr.dst_addr.ss_family == AF_IB) { + memcpy(addr, &ctx->cm_id->route.addr.dst_addr, resp.dst_size); + } else { + addr->sib_family = AF_IB; + addr->sib_pkey = (__force __be16) resp.pkey; + rdma_addr_get_dgid(&ctx->cm_id->route.addr.dev_addr, + (union ib_gid *) &addr->sib_addr); + addr->sib_sid = rdma_get_service_id(ctx->cm_id, (struct sockaddr *) + &ctx->cm_id->route.addr.dst_addr); + } + + if (copy_to_user(response, &resp, sizeof(resp))) + ret = -EFAULT; + + return ret; +} + +static ssize_t ucma_query(struct ucma_file *file, + const char __user *inbuf, + int in_len, int out_len) +{ + struct rdma_ucm_query cmd; + struct ucma_context *ctx; + void __user *response; + int ret; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + response = (void __user *)(unsigned long) cmd.response; + ctx = ucma_get_ctx(file, cmd.id); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + switch (cmd.option) { + case RDMA_USER_CM_QUERY_ADDR: + ret = ucma_query_addr(ctx, response, out_len); + break; + case RDMA_USER_CM_QUERY_PATH: + ret = ucma_query_path(ctx, response, out_len); + break; + case RDMA_USER_CM_QUERY_GID: + ret = ucma_query_gid(ctx, response, out_len); + break; + default: + ret = -ENOSYS; + break; + } + + ucma_put_ctx(ctx); + return ret; +} + +static void ucma_copy_conn_param(struct rdma_cm_id *id, + struct rdma_conn_param *dst, + struct rdma_ucm_conn_param *src) +{ + dst->private_data = src->private_data; + dst->private_data_len = src->private_data_len; + dst->responder_resources =src->responder_resources; + dst->initiator_depth = src->initiator_depth; + dst->flow_control = src->flow_control; + dst->retry_count = src->retry_count; + dst->rnr_retry_count = src->rnr_retry_count; + dst->srq = src->srq; + dst->qp_num = src->qp_num; + dst->qkey = (id->route.addr.src_addr.ss_family == AF_IB) ? src->qkey : 0; +} + +static ssize_t ucma_connect(struct ucma_file *file, const char __user *inbuf, + int in_len, int out_len) +{ + struct rdma_ucm_connect cmd; + struct rdma_conn_param conn_param; + struct ucma_context *ctx; + int ret; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + if (!cmd.conn_param.valid) + return -EINVAL; + + ctx = ucma_get_ctx(file, cmd.id); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + ucma_copy_conn_param(ctx->cm_id, &conn_param, &cmd.conn_param); + ret = rdma_connect(ctx->cm_id, &conn_param); + ucma_put_ctx(ctx); + return ret; +} + +static ssize_t ucma_listen(struct ucma_file *file, const char __user *inbuf, + int in_len, int out_len) +{ + struct rdma_ucm_listen cmd; + struct ucma_context *ctx; + int ret; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + ctx = ucma_get_ctx(file, cmd.id); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + ctx->backlog = cmd.backlog > 0 && cmd.backlog < max_backlog ? + cmd.backlog : max_backlog; + ret = rdma_listen(ctx->cm_id, ctx->backlog); + ucma_put_ctx(ctx); + return ret; +} + +static ssize_t ucma_accept(struct ucma_file *file, const char __user *inbuf, + int in_len, int out_len) +{ + struct rdma_ucm_accept cmd; + struct rdma_conn_param conn_param; + struct ucma_context *ctx; + int ret; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + ctx = ucma_get_ctx(file, cmd.id); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + if (cmd.conn_param.valid) { + ucma_copy_conn_param(ctx->cm_id, &conn_param, &cmd.conn_param); + mutex_lock(&file->mut); + ret = rdma_accept(ctx->cm_id, &conn_param); + if (!ret) + ctx->uid = cmd.uid; + mutex_unlock(&file->mut); + } else + ret = rdma_accept(ctx->cm_id, NULL); + + ucma_put_ctx(ctx); + return ret; +} + +static ssize_t ucma_reject(struct ucma_file *file, const char __user *inbuf, + int in_len, int out_len) +{ + struct rdma_ucm_reject cmd; + struct ucma_context *ctx; + int ret; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + ctx = ucma_get_ctx(file, cmd.id); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + ret = rdma_reject(ctx->cm_id, cmd.private_data, cmd.private_data_len); + ucma_put_ctx(ctx); + return ret; +} + +static ssize_t ucma_disconnect(struct ucma_file *file, const char __user *inbuf, + int in_len, int out_len) +{ + struct rdma_ucm_disconnect cmd; + struct ucma_context *ctx; + int ret; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + ctx = ucma_get_ctx(file, cmd.id); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + ret = rdma_disconnect(ctx->cm_id); + ucma_put_ctx(ctx); + return ret; +} + +static ssize_t ucma_init_qp_attr(struct ucma_file *file, + const char __user *inbuf, + int in_len, int out_len) +{ + struct rdma_ucm_init_qp_attr cmd; + struct ib_uverbs_qp_attr resp; + struct ucma_context *ctx; + struct ib_qp_attr qp_attr; + int ret; + + if (out_len < sizeof(resp)) + return -ENOSPC; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + ctx = ucma_get_ctx(file, cmd.id); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + resp.qp_attr_mask = 0; + memset(&qp_attr, 0, sizeof qp_attr); + qp_attr.qp_state = cmd.qp_state; + ret = rdma_init_qp_attr(ctx->cm_id, &qp_attr, &resp.qp_attr_mask); + if (ret) + goto out; + + ib_copy_qp_attr_to_user(&resp, &qp_attr); + if (copy_to_user((void __user *)(unsigned long)cmd.response, + &resp, sizeof(resp))) + ret = -EFAULT; + +out: + ucma_put_ctx(ctx); + return ret; +} + +static int ucma_set_option_id(struct ucma_context *ctx, int optname, + void *optval, size_t optlen) +{ + int ret = 0; + + switch (optname) { + case RDMA_OPTION_ID_TOS: + if (optlen != sizeof(u8)) { + ret = -EINVAL; + break; + } + rdma_set_service_type(ctx->cm_id, *((u8 *) optval)); + break; + case RDMA_OPTION_ID_REUSEADDR: + if (optlen != sizeof(int)) { + ret = -EINVAL; + break; + } + ret = rdma_set_reuseaddr(ctx->cm_id, *((int *) optval) ? 1 : 0); + break; + case RDMA_OPTION_ID_AFONLY: + if (optlen != sizeof(int)) { + ret = -EINVAL; + break; + } + ret = rdma_set_afonly(ctx->cm_id, *((int *) optval) ? 1 : 0); + break; + default: + ret = -ENOSYS; + } + + return ret; +} + +static int ucma_set_ib_path(struct ucma_context *ctx, + struct ib_path_rec_data *path_data, size_t optlen) +{ + struct ib_sa_path_rec sa_path; + struct rdma_cm_event event; + int ret; + + if (optlen % sizeof(*path_data)) + return -EINVAL; + + for (; optlen; optlen -= sizeof(*path_data), path_data++) { + if (path_data->flags == (IB_PATH_GMP | IB_PATH_PRIMARY | + IB_PATH_BIDIRECTIONAL)) + break; + } + + if (!optlen) + return -EINVAL; + + ib_sa_unpack_path(path_data->path_rec, &sa_path); + ret = rdma_set_ib_paths(ctx->cm_id, &sa_path, 1); + if (ret) + return ret; + + memset(&event, 0, sizeof event); + event.event = RDMA_CM_EVENT_ROUTE_RESOLVED; + return ucma_event_handler(ctx->cm_id, &event); +} + +static int ucma_set_option_ib(struct ucma_context *ctx, int optname, + void *optval, size_t optlen) +{ + int ret; + + switch (optname) { + case RDMA_OPTION_IB_PATH: + ret = ucma_set_ib_path(ctx, optval, optlen); + break; + default: + ret = -ENOSYS; + } + + return ret; +} + +static int ucma_set_option_level(struct ucma_context *ctx, int level, + int optname, void *optval, size_t optlen) +{ + int ret; + + switch (level) { + case RDMA_OPTION_ID: + ret = ucma_set_option_id(ctx, optname, optval, optlen); + break; + case RDMA_OPTION_IB: + ret = ucma_set_option_ib(ctx, optname, optval, optlen); + break; + default: + ret = -ENOSYS; + } + + return ret; +} + +static ssize_t ucma_set_option(struct ucma_file *file, const char __user *inbuf, + int in_len, int out_len) +{ + struct rdma_ucm_set_option cmd; + struct ucma_context *ctx; + void *optval; + int ret; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + ctx = ucma_get_ctx(file, cmd.id); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + optval = memdup_user((void __user *) (unsigned long) cmd.optval, + cmd.optlen); + if (IS_ERR(optval)) { + ret = PTR_ERR(optval); + goto out; + } + + ret = ucma_set_option_level(ctx, cmd.level, cmd.optname, optval, + cmd.optlen); + kfree(optval); + +out: + ucma_put_ctx(ctx); + return ret; +} + +static ssize_t ucma_notify(struct ucma_file *file, const char __user *inbuf, + int in_len, int out_len) +{ + struct rdma_ucm_notify cmd; + struct ucma_context *ctx; + int ret; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + ctx = ucma_get_ctx(file, cmd.id); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + ret = rdma_notify(ctx->cm_id, (enum ib_event_type) cmd.event); + ucma_put_ctx(ctx); + return ret; +} + +static ssize_t ucma_process_join(struct ucma_file *file, + struct rdma_ucm_join_mcast *cmd, int out_len) +{ + struct rdma_ucm_create_id_resp resp; + struct ucma_context *ctx; + struct ucma_multicast *mc; + struct sockaddr *addr; + int ret; + + if (out_len < sizeof(resp)) + return -ENOSPC; + + addr = (struct sockaddr *) &cmd->addr; + if (cmd->reserved || !cmd->addr_size || (cmd->addr_size != rdma_addr_size(addr))) + return -EINVAL; + + ctx = ucma_get_ctx(file, cmd->id); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + mutex_lock(&file->mut); + mc = ucma_alloc_multicast(ctx); + if (!mc) { + ret = -ENOMEM; + goto err1; + } + + mc->uid = cmd->uid; + memcpy(&mc->addr, addr, cmd->addr_size); + ret = rdma_join_multicast(ctx->cm_id, (struct sockaddr *) &mc->addr, mc); + if (ret) + goto err2; + + resp.id = mc->id; + if (copy_to_user((void __user *)(unsigned long) cmd->response, + &resp, sizeof(resp))) { + ret = -EFAULT; + goto err3; + } + + mutex_unlock(&file->mut); + ucma_put_ctx(ctx); + return 0; + +err3: + rdma_leave_multicast(ctx->cm_id, (struct sockaddr *) &mc->addr); + ucma_cleanup_mc_events(mc); +err2: + mutex_lock(&mut); + idr_remove(&multicast_idr, mc->id); + mutex_unlock(&mut); + list_del(&mc->list); + kfree(mc); +err1: + mutex_unlock(&file->mut); + ucma_put_ctx(ctx); + return ret; +} + +static ssize_t ucma_join_ip_multicast(struct ucma_file *file, + const char __user *inbuf, + int in_len, int out_len) +{ + struct rdma_ucm_join_ip_mcast cmd; + struct rdma_ucm_join_mcast join_cmd; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + join_cmd.response = cmd.response; + join_cmd.uid = cmd.uid; + join_cmd.id = cmd.id; + join_cmd.addr_size = rdma_addr_size((struct sockaddr *) &cmd.addr); + join_cmd.reserved = 0; + memcpy(&join_cmd.addr, &cmd.addr, join_cmd.addr_size); + + return ucma_process_join(file, &join_cmd, out_len); +} + +static ssize_t ucma_join_multicast(struct ucma_file *file, + const char __user *inbuf, + int in_len, int out_len) +{ + struct rdma_ucm_join_mcast cmd; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + return ucma_process_join(file, &cmd, out_len); +} + +static ssize_t ucma_leave_multicast(struct ucma_file *file, + const char __user *inbuf, + int in_len, int out_len) +{ + struct rdma_ucm_destroy_id cmd; + struct rdma_ucm_destroy_id_resp resp; + struct ucma_multicast *mc; + int ret = 0; + + if (out_len < sizeof(resp)) + return -ENOSPC; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + mutex_lock(&mut); + mc = idr_find(&multicast_idr, cmd.id); + if (!mc) + mc = ERR_PTR(-ENOENT); + else if (mc->ctx->file != file) + mc = ERR_PTR(-EINVAL); + else { + idr_remove(&multicast_idr, mc->id); + atomic_inc(&mc->ctx->ref); + } + mutex_unlock(&mut); + + if (IS_ERR(mc)) { + ret = PTR_ERR(mc); + goto out; + } + + rdma_leave_multicast(mc->ctx->cm_id, (struct sockaddr *) &mc->addr); + mutex_lock(&mc->ctx->file->mut); + ucma_cleanup_mc_events(mc); + list_del(&mc->list); + mutex_unlock(&mc->ctx->file->mut); + + ucma_put_ctx(mc->ctx); + resp.events_reported = mc->events_reported; + kfree(mc); + + if (copy_to_user((void __user *)(unsigned long)cmd.response, + &resp, sizeof(resp))) + ret = -EFAULT; +out: + return ret; +} + +static void ucma_lock_files(struct ucma_file *file1, struct ucma_file *file2) +{ + /* Acquire mutex's based on pointer comparison to prevent deadlock. */ + if (file1 < file2) { + mutex_lock(&file1->mut); + mutex_lock(&file2->mut); + } else { + mutex_lock(&file2->mut); + mutex_lock(&file1->mut); + } +} + +static void ucma_unlock_files(struct ucma_file *file1, struct ucma_file *file2) +{ + if (file1 < file2) { + mutex_unlock(&file2->mut); + mutex_unlock(&file1->mut); + } else { + mutex_unlock(&file1->mut); + mutex_unlock(&file2->mut); + } +} + +static void ucma_move_events(struct ucma_context *ctx, struct ucma_file *file) +{ + struct ucma_event *uevent, *tmp; + + list_for_each_entry_safe(uevent, tmp, &ctx->file->event_list, list) + if (uevent->ctx == ctx) + list_move_tail(&uevent->list, &file->event_list); +} + +static ssize_t ucma_migrate_id(struct ucma_file *new_file, + const char __user *inbuf, + int in_len, int out_len) +{ + struct rdma_ucm_migrate_id cmd; + struct rdma_ucm_migrate_resp resp; + struct ucma_context *ctx; + struct fd f; + struct ucma_file *cur_file; + int ret = 0; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + /* Get current fd to protect against it being closed */ + f = fdget(cmd.fd); + if (!f.file) + return -ENOENT; + + /* Validate current fd and prevent destruction of id. */ + ctx = ucma_get_ctx(f.file->private_data, cmd.id); + if (IS_ERR(ctx)) { + ret = PTR_ERR(ctx); + goto file_put; + } + + cur_file = ctx->file; + if (cur_file == new_file) { + resp.events_reported = ctx->events_reported; + goto response; + } + + /* + * Migrate events between fd's, maintaining order, and avoiding new + * events being added before existing events. + */ + ucma_lock_files(cur_file, new_file); + mutex_lock(&mut); + + list_move_tail(&ctx->list, &new_file->ctx_list); + ucma_move_events(ctx, new_file); + ctx->file = new_file; + resp.events_reported = ctx->events_reported; + + mutex_unlock(&mut); + ucma_unlock_files(cur_file, new_file); + +response: + if (copy_to_user((void __user *)(unsigned long)cmd.response, + &resp, sizeof(resp))) + ret = -EFAULT; + + ucma_put_ctx(ctx); +file_put: + fdput(f); + return ret; +} + +static ssize_t (*ucma_cmd_table[])(struct ucma_file *file, + const char __user *inbuf, + int in_len, int out_len) = { + [RDMA_USER_CM_CMD_CREATE_ID] = ucma_create_id, + [RDMA_USER_CM_CMD_DESTROY_ID] = ucma_destroy_id, + [RDMA_USER_CM_CMD_BIND_IP] = ucma_bind_ip, + [RDMA_USER_CM_CMD_RESOLVE_IP] = ucma_resolve_ip, + [RDMA_USER_CM_CMD_RESOLVE_ROUTE] = ucma_resolve_route, + [RDMA_USER_CM_CMD_QUERY_ROUTE] = ucma_query_route, + [RDMA_USER_CM_CMD_CONNECT] = ucma_connect, + [RDMA_USER_CM_CMD_LISTEN] = ucma_listen, + [RDMA_USER_CM_CMD_ACCEPT] = ucma_accept, + [RDMA_USER_CM_CMD_REJECT] = ucma_reject, + [RDMA_USER_CM_CMD_DISCONNECT] = ucma_disconnect, + [RDMA_USER_CM_CMD_INIT_QP_ATTR] = ucma_init_qp_attr, + [RDMA_USER_CM_CMD_GET_EVENT] = ucma_get_event, + [RDMA_USER_CM_CMD_GET_OPTION] = NULL, + [RDMA_USER_CM_CMD_SET_OPTION] = ucma_set_option, + [RDMA_USER_CM_CMD_NOTIFY] = ucma_notify, + [RDMA_USER_CM_CMD_JOIN_IP_MCAST] = ucma_join_ip_multicast, + [RDMA_USER_CM_CMD_LEAVE_MCAST] = ucma_leave_multicast, + [RDMA_USER_CM_CMD_MIGRATE_ID] = ucma_migrate_id, + [RDMA_USER_CM_CMD_QUERY] = ucma_query, + [RDMA_USER_CM_CMD_BIND] = ucma_bind, + [RDMA_USER_CM_CMD_RESOLVE_ADDR] = ucma_resolve_addr, + [RDMA_USER_CM_CMD_JOIN_MCAST] = ucma_join_multicast +}; + +static ssize_t ucma_write(struct file *filp, const char __user *buf, + size_t len, loff_t *pos) +{ + struct ucma_file *file = filp->private_data; + struct rdma_ucm_cmd_hdr hdr; + ssize_t ret; + + if (len < sizeof(hdr)) + return -EINVAL; + + if (copy_from_user(&hdr, buf, sizeof(hdr))) + return -EFAULT; + + if (hdr.cmd >= ARRAY_SIZE(ucma_cmd_table)) + return -EINVAL; + + if (hdr.in + sizeof(hdr) > len) + return -EINVAL; + + if (!ucma_cmd_table[hdr.cmd]) + return -ENOSYS; + + ret = ucma_cmd_table[hdr.cmd](file, buf + sizeof(hdr), hdr.in, hdr.out); + if (!ret) + ret = len; + + return ret; +} + +static unsigned int ucma_poll(struct file *filp, struct poll_table_struct *wait) +{ + struct ucma_file *file = filp->private_data; + unsigned int mask = 0; + + poll_wait(filp, &file->poll_wait, wait); + + if (!list_empty(&file->event_list)) + mask = POLLIN | POLLRDNORM; + + return mask; +} + +/* + * ucma_open() does not need the BKL: + * + * - no global state is referred to; + * - there is no ioctl method to race against; + * - no further module initialization is required for open to work + * after the device is registered. + */ +static int ucma_open(struct inode *inode, struct file *filp) +{ + struct ucma_file *file; + + file = kmalloc(sizeof *file, GFP_KERNEL); + if (!file) + return -ENOMEM; + + INIT_LIST_HEAD(&file->event_list); + INIT_LIST_HEAD(&file->ctx_list); + init_waitqueue_head(&file->poll_wait); + mutex_init(&file->mut); + + filp->private_data = file; + file->filp = filp; + + return nonseekable_open(inode, filp); +} + +static int ucma_close(struct inode *inode, struct file *filp) +{ + struct ucma_file *file = filp->private_data; + struct ucma_context *ctx, *tmp; + + mutex_lock(&file->mut); + list_for_each_entry_safe(ctx, tmp, &file->ctx_list, list) { + mutex_unlock(&file->mut); + + mutex_lock(&mut); + idr_remove(&ctx_idr, ctx->id); + mutex_unlock(&mut); + + ucma_free_ctx(ctx); + mutex_lock(&file->mut); + } + mutex_unlock(&file->mut); + kfree(file); + return 0; +} + +static const struct file_operations ucma_fops = { + .owner = THIS_MODULE, + .open = ucma_open, + .release = ucma_close, + .write = ucma_write, + .poll = ucma_poll, + .llseek = no_llseek, +}; + +static struct miscdevice ucma_misc = { + .minor = MISC_DYNAMIC_MINOR, + .name = "rdma_cm", + .nodename = "infiniband/rdma_cm", + .mode = 0666, + .fops = &ucma_fops, +}; + +static ssize_t show_abi_version(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + return sprintf(buf, "%d\n", RDMA_USER_CM_ABI_VERSION); +} +static DEVICE_ATTR(abi_version, S_IRUGO, show_abi_version, NULL); + +static int __init ucma_init(void) +{ + int ret; + + ret = misc_register(&ucma_misc); + if (ret) + return ret; + + ret = device_create_file(ucma_misc.this_device, &dev_attr_abi_version); + if (ret) { + printk(KERN_ERR "rdma_ucm: couldn't create abi_version attr\n"); + goto err1; + } + + ucma_ctl_table_hdr = register_net_sysctl(&init_net, "net/rdma_ucm", ucma_ctl_table); + if (!ucma_ctl_table_hdr) { + printk(KERN_ERR "rdma_ucm: couldn't register sysctl paths\n"); + ret = -ENOMEM; + goto err2; + } + return 0; +err2: + device_remove_file(ucma_misc.this_device, &dev_attr_abi_version); +err1: + misc_deregister(&ucma_misc); + return ret; +} + +static void __exit ucma_cleanup(void) +{ + unregister_net_sysctl_table(ucma_ctl_table_hdr); + device_remove_file(ucma_misc.this_device, &dev_attr_abi_version); + misc_deregister(&ucma_misc); + idr_destroy(&ctx_idr); +} + +module_init(ucma_init); +module_exit(ucma_cleanup); diff --git a/drivers/infiniband/core/ud_header.c b/drivers/infiniband/core/ud_header.c new file mode 100644 index 0000000..72feee6 --- /dev/null +++ b/drivers/infiniband/core/ud_header.c @@ -0,0 +1,414 @@ +/* + * Copyright (c) 2004 Topspin Corporation. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include + +#include + +#define STRUCT_FIELD(header, field) \ + .struct_offset_bytes = offsetof(struct ib_unpacked_ ## header, field), \ + .struct_size_bytes = sizeof ((struct ib_unpacked_ ## header *) 0)->field, \ + .field_name = #header ":" #field + +static const struct ib_field lrh_table[] = { + { STRUCT_FIELD(lrh, virtual_lane), + .offset_words = 0, + .offset_bits = 0, + .size_bits = 4 }, + { STRUCT_FIELD(lrh, link_version), + .offset_words = 0, + .offset_bits = 4, + .size_bits = 4 }, + { STRUCT_FIELD(lrh, service_level), + .offset_words = 0, + .offset_bits = 8, + .size_bits = 4 }, + { RESERVED, + .offset_words = 0, + .offset_bits = 12, + .size_bits = 2 }, + { STRUCT_FIELD(lrh, link_next_header), + .offset_words = 0, + .offset_bits = 14, + .size_bits = 2 }, + { STRUCT_FIELD(lrh, destination_lid), + .offset_words = 0, + .offset_bits = 16, + .size_bits = 16 }, + { RESERVED, + .offset_words = 1, + .offset_bits = 0, + .size_bits = 5 }, + { STRUCT_FIELD(lrh, packet_length), + .offset_words = 1, + .offset_bits = 5, + .size_bits = 11 }, + { STRUCT_FIELD(lrh, source_lid), + .offset_words = 1, + .offset_bits = 16, + .size_bits = 16 } +}; + +static const struct ib_field eth_table[] = { + { STRUCT_FIELD(eth, dmac_h), + .offset_words = 0, + .offset_bits = 0, + .size_bits = 32 }, + { STRUCT_FIELD(eth, dmac_l), + .offset_words = 1, + .offset_bits = 0, + .size_bits = 16 }, + { STRUCT_FIELD(eth, smac_h), + .offset_words = 1, + .offset_bits = 16, + .size_bits = 16 }, + { STRUCT_FIELD(eth, smac_l), + .offset_words = 2, + .offset_bits = 0, + .size_bits = 32 }, + { STRUCT_FIELD(eth, type), + .offset_words = 3, + .offset_bits = 0, + .size_bits = 16 } +}; + +static const struct ib_field vlan_table[] = { + { STRUCT_FIELD(vlan, tag), + .offset_words = 0, + .offset_bits = 0, + .size_bits = 16 }, + { STRUCT_FIELD(vlan, type), + .offset_words = 0, + .offset_bits = 16, + .size_bits = 16 } +}; + +static const struct ib_field grh_table[] = { + { STRUCT_FIELD(grh, ip_version), + .offset_words = 0, + .offset_bits = 0, + .size_bits = 4 }, + { STRUCT_FIELD(grh, traffic_class), + .offset_words = 0, + .offset_bits = 4, + .size_bits = 8 }, + { STRUCT_FIELD(grh, flow_label), + .offset_words = 0, + .offset_bits = 12, + .size_bits = 20 }, + { STRUCT_FIELD(grh, payload_length), + .offset_words = 1, + .offset_bits = 0, + .size_bits = 16 }, + { STRUCT_FIELD(grh, next_header), + .offset_words = 1, + .offset_bits = 16, + .size_bits = 8 }, + { STRUCT_FIELD(grh, hop_limit), + .offset_words = 1, + .offset_bits = 24, + .size_bits = 8 }, + { STRUCT_FIELD(grh, source_gid), + .offset_words = 2, + .offset_bits = 0, + .size_bits = 128 }, + { STRUCT_FIELD(grh, destination_gid), + .offset_words = 6, + .offset_bits = 0, + .size_bits = 128 } +}; + +static const struct ib_field bth_table[] = { + { STRUCT_FIELD(bth, opcode), + .offset_words = 0, + .offset_bits = 0, + .size_bits = 8 }, + { STRUCT_FIELD(bth, solicited_event), + .offset_words = 0, + .offset_bits = 8, + .size_bits = 1 }, + { STRUCT_FIELD(bth, mig_req), + .offset_words = 0, + .offset_bits = 9, + .size_bits = 1 }, + { STRUCT_FIELD(bth, pad_count), + .offset_words = 0, + .offset_bits = 10, + .size_bits = 2 }, + { STRUCT_FIELD(bth, transport_header_version), + .offset_words = 0, + .offset_bits = 12, + .size_bits = 4 }, + { STRUCT_FIELD(bth, pkey), + .offset_words = 0, + .offset_bits = 16, + .size_bits = 16 }, + { RESERVED, + .offset_words = 1, + .offset_bits = 0, + .size_bits = 8 }, + { STRUCT_FIELD(bth, destination_qpn), + .offset_words = 1, + .offset_bits = 8, + .size_bits = 24 }, + { STRUCT_FIELD(bth, ack_req), + .offset_words = 2, + .offset_bits = 0, + .size_bits = 1 }, + { RESERVED, + .offset_words = 2, + .offset_bits = 1, + .size_bits = 7 }, + { STRUCT_FIELD(bth, psn), + .offset_words = 2, + .offset_bits = 8, + .size_bits = 24 } +}; + +static const struct ib_field deth_table[] = { + { STRUCT_FIELD(deth, qkey), + .offset_words = 0, + .offset_bits = 0, + .size_bits = 32 }, + { RESERVED, + .offset_words = 1, + .offset_bits = 0, + .size_bits = 8 }, + { STRUCT_FIELD(deth, source_qpn), + .offset_words = 1, + .offset_bits = 8, + .size_bits = 24 } +}; + +/** + * ib_ud_header_init - Initialize UD header structure + * @payload_bytes:Length of packet payload + * @lrh_present: specify if LRH is present + * @eth_present: specify if Eth header is present + * @vlan_present: packet is tagged vlan + * @grh_present:GRH flag (if non-zero, GRH will be included) + * @immediate_present: specify if immediate data is present + * @header:Structure to initialize + */ +void ib_ud_header_init(int payload_bytes, + int lrh_present, + int eth_present, + int vlan_present, + int grh_present, + int immediate_present, + struct ib_ud_header *header) +{ + memset(header, 0, sizeof *header); + + if (lrh_present) { + u16 packet_length; + + header->lrh.link_version = 0; + header->lrh.link_next_header = + grh_present ? IB_LNH_IBA_GLOBAL : IB_LNH_IBA_LOCAL; + packet_length = (IB_LRH_BYTES + + IB_BTH_BYTES + + IB_DETH_BYTES + + (grh_present ? IB_GRH_BYTES : 0) + + payload_bytes + + 4 + /* ICRC */ + 3) / 4; /* round up */ + header->lrh.packet_length = cpu_to_be16(packet_length); + } + + if (vlan_present) + header->eth.type = cpu_to_be16(ETH_P_8021Q); + + if (grh_present) { + header->grh.ip_version = 6; + header->grh.payload_length = + cpu_to_be16((IB_BTH_BYTES + + IB_DETH_BYTES + + payload_bytes + + 4 + /* ICRC */ + 3) & ~3); /* round up */ + header->grh.next_header = 0x1b; + } + + if (immediate_present) + header->bth.opcode = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE; + else + header->bth.opcode = IB_OPCODE_UD_SEND_ONLY; + header->bth.pad_count = (4 - payload_bytes) & 3; + header->bth.transport_header_version = 0; + + header->lrh_present = lrh_present; + header->eth_present = eth_present; + header->vlan_present = vlan_present; + header->grh_present = grh_present; + header->immediate_present = immediate_present; +} +EXPORT_SYMBOL(ib_ud_header_init); + +/** + * ib_ud_header_pack - Pack UD header struct into wire format + * @header:UD header struct + * @buf:Buffer to pack into + * + * ib_ud_header_pack() packs the UD header structure @header into wire + * format in the buffer @buf. + */ +int ib_ud_header_pack(struct ib_ud_header *header, + void *buf) +{ + int len = 0; + + if (header->lrh_present) { + ib_pack(lrh_table, ARRAY_SIZE(lrh_table), + &header->lrh, buf + len); + len += IB_LRH_BYTES; + } + if (header->eth_present) { + ib_pack(eth_table, ARRAY_SIZE(eth_table), + &header->eth, buf + len); + len += IB_ETH_BYTES; + } + if (header->vlan_present) { + ib_pack(vlan_table, ARRAY_SIZE(vlan_table), + &header->vlan, buf + len); + len += IB_VLAN_BYTES; + } + if (header->grh_present) { + ib_pack(grh_table, ARRAY_SIZE(grh_table), + &header->grh, buf + len); + len += IB_GRH_BYTES; + } + + ib_pack(bth_table, ARRAY_SIZE(bth_table), + &header->bth, buf + len); + len += IB_BTH_BYTES; + + ib_pack(deth_table, ARRAY_SIZE(deth_table), + &header->deth, buf + len); + len += IB_DETH_BYTES; + + if (header->immediate_present) { + memcpy(buf + len, &header->immediate_data, sizeof header->immediate_data); + len += sizeof header->immediate_data; + } + + return len; +} +EXPORT_SYMBOL(ib_ud_header_pack); + +/** + * ib_ud_header_unpack - Unpack UD header struct from wire format + * @header:UD header struct + * @buf:Buffer to pack into + * + * ib_ud_header_pack() unpacks the UD header structure @header from wire + * format in the buffer @buf. + */ +int ib_ud_header_unpack(void *buf, + struct ib_ud_header *header) +{ + ib_unpack(lrh_table, ARRAY_SIZE(lrh_table), + buf, &header->lrh); + buf += IB_LRH_BYTES; + + if (header->lrh.link_version != 0) { + printk(KERN_WARNING "Invalid LRH.link_version %d\n", + header->lrh.link_version); + return -EINVAL; + } + + switch (header->lrh.link_next_header) { + case IB_LNH_IBA_LOCAL: + header->grh_present = 0; + break; + + case IB_LNH_IBA_GLOBAL: + header->grh_present = 1; + ib_unpack(grh_table, ARRAY_SIZE(grh_table), + buf, &header->grh); + buf += IB_GRH_BYTES; + + if (header->grh.ip_version != 6) { + printk(KERN_WARNING "Invalid GRH.ip_version %d\n", + header->grh.ip_version); + return -EINVAL; + } + if (header->grh.next_header != 0x1b) { + printk(KERN_WARNING "Invalid GRH.next_header 0x%02x\n", + header->grh.next_header); + return -EINVAL; + } + break; + + default: + printk(KERN_WARNING "Invalid LRH.link_next_header %d\n", + header->lrh.link_next_header); + return -EINVAL; + } + + ib_unpack(bth_table, ARRAY_SIZE(bth_table), + buf, &header->bth); + buf += IB_BTH_BYTES; + + switch (header->bth.opcode) { + case IB_OPCODE_UD_SEND_ONLY: + header->immediate_present = 0; + break; + case IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE: + header->immediate_present = 1; + break; + default: + printk(KERN_WARNING "Invalid BTH.opcode 0x%02x\n", + header->bth.opcode); + return -EINVAL; + } + + if (header->bth.transport_header_version != 0) { + printk(KERN_WARNING "Invalid BTH.transport_header_version %d\n", + header->bth.transport_header_version); + return -EINVAL; + } + + ib_unpack(deth_table, ARRAY_SIZE(deth_table), + buf, &header->deth); + buf += IB_DETH_BYTES; + + if (header->immediate_present) + memcpy(&header->immediate_data, buf, sizeof header->immediate_data); + + return 0; +} +EXPORT_SYMBOL(ib_ud_header_unpack); diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c new file mode 100644 index 0000000..df0c4f6 --- /dev/null +++ b/drivers/infiniband/core/umem.c @@ -0,0 +1,294 @@ +/* + * Copyright (c) 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Cisco Systems. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "uverbs.h" + + +static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int dirty) +{ + struct scatterlist *sg; + struct page *page; + int i; + + if (umem->nmap > 0) + ib_dma_unmap_sg(dev, umem->sg_head.sgl, + umem->nmap, + DMA_BIDIRECTIONAL); + + for_each_sg(umem->sg_head.sgl, sg, umem->npages, i) { + + page = sg_page(sg); + if (umem->writable && dirty) + set_page_dirty_lock(page); + put_page(page); + } + + sg_free_table(&umem->sg_head); + return; + +} + +/** + * ib_umem_get - Pin and DMA map userspace memory. + * @context: userspace context to pin memory for + * @addr: userspace virtual address to start at + * @size: length of region to pin + * @access: IB_ACCESS_xxx flags for memory being pinned + * @dmasync: flush in-flight DMA when the memory region is written + */ +struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, + size_t size, int access, int dmasync) +{ + struct ib_umem *umem; + struct page **page_list; + struct vm_area_struct **vma_list; + unsigned long locked; + unsigned long lock_limit; + unsigned long cur_base; + unsigned long npages; + int ret; + int i; + DEFINE_DMA_ATTRS(attrs); + struct scatterlist *sg, *sg_list_start; + int need_release = 0; + + if (dmasync) + dma_set_attr(DMA_ATTR_WRITE_BARRIER, &attrs); + + if (!can_do_mlock()) + return ERR_PTR(-EPERM); + + umem = kzalloc(sizeof *umem, GFP_KERNEL); + if (!umem) + return ERR_PTR(-ENOMEM); + + umem->context = context; + umem->length = size; + umem->offset = addr & ~PAGE_MASK; + umem->page_size = PAGE_SIZE; + umem->pid = get_task_pid(current, PIDTYPE_PID); + /* + * We ask for writable memory if any access flags other than + * "remote read" are set. "Local write" and "remote write" + * obviously require write access. "Remote atomic" can do + * things like fetch and add, which will modify memory, and + * "MW bind" can change permissions by binding a window. + */ + umem->writable = !!(access & ~IB_ACCESS_REMOTE_READ); + + /* We assume the memory is from hugetlb until proved otherwise */ + umem->hugetlb = 1; + + page_list = (struct page **) __get_free_page(GFP_KERNEL); + if (!page_list) { + kfree(umem); + return ERR_PTR(-ENOMEM); + } + + /* + * if we can't alloc the vma_list, it's not so bad; + * just assume the memory is not hugetlb memory + */ + vma_list = (struct vm_area_struct **) __get_free_page(GFP_KERNEL); + if (!vma_list) + umem->hugetlb = 0; + + npages = PAGE_ALIGN(size + umem->offset) >> PAGE_SHIFT; + + down_write(¤t->mm->mmap_sem); + + locked = npages + current->mm->pinned_vm; + lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + + if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) { + ret = -ENOMEM; + goto out; + } + + cur_base = addr & PAGE_MASK; + + if (npages == 0) { + ret = -EINVAL; + goto out; + } + + ret = sg_alloc_table(&umem->sg_head, npages, GFP_KERNEL); + if (ret) + goto out; + + need_release = 1; + sg_list_start = umem->sg_head.sgl; + + while (npages) { + ret = get_user_pages(current, current->mm, cur_base, + min_t(unsigned long, npages, + PAGE_SIZE / sizeof (struct page *)), + 1, !umem->writable, page_list, vma_list); + + if (ret < 0) + goto out; + + umem->npages += ret; + cur_base += ret * PAGE_SIZE; + npages -= ret; + + for_each_sg(sg_list_start, sg, ret, i) { + if (vma_list && !is_vm_hugetlb_page(vma_list[i])) + umem->hugetlb = 0; + + sg_set_page(sg, page_list[i], PAGE_SIZE, 0); + } + + /* preparing for next loop */ + sg_list_start = sg; + } + + umem->nmap = ib_dma_map_sg_attrs(context->device, + umem->sg_head.sgl, + umem->npages, + DMA_BIDIRECTIONAL, + &attrs); + + if (umem->nmap <= 0) { + ret = -ENOMEM; + goto out; + } + + ret = 0; + +out: + if (ret < 0) { + if (need_release) + __ib_umem_release(context->device, umem, 0); + put_pid(umem->pid); + kfree(umem); + } else + current->mm->pinned_vm = locked; + + up_write(¤t->mm->mmap_sem); + if (vma_list) + free_page((unsigned long) vma_list); + free_page((unsigned long) page_list); + + return ret < 0 ? ERR_PTR(ret) : umem; +} +EXPORT_SYMBOL(ib_umem_get); + +static void ib_umem_account(struct work_struct *work) +{ + struct ib_umem *umem = container_of(work, struct ib_umem, work); + + down_write(&umem->mm->mmap_sem); + umem->mm->pinned_vm -= umem->diff; + up_write(&umem->mm->mmap_sem); + mmput(umem->mm); + kfree(umem); +} + +/** + * ib_umem_release - release memory pinned with ib_umem_get + * @umem: umem struct to release + */ +void ib_umem_release(struct ib_umem *umem) +{ + struct ib_ucontext *context = umem->context; + struct mm_struct *mm; + struct task_struct *task; + unsigned long diff; + + __ib_umem_release(umem->context->device, umem, 1); + + task = get_pid_task(umem->pid, PIDTYPE_PID); + put_pid(umem->pid); + if (!task) + goto out; + mm = get_task_mm(task); + put_task_struct(task); + if (!mm) + goto out; + + diff = PAGE_ALIGN(umem->length + umem->offset) >> PAGE_SHIFT; + + /* + * We may be called with the mm's mmap_sem already held. This + * can happen when a userspace munmap() is the call that drops + * the last reference to our file and calls our release + * method. If there are memory regions to destroy, we'll end + * up here and not be able to take the mmap_sem. In that case + * we defer the vm_locked accounting to the system workqueue. + */ + if (context->closing) { + if (!down_write_trylock(&mm->mmap_sem)) { + INIT_WORK(&umem->work, ib_umem_account); + umem->mm = mm; + umem->diff = diff; + + queue_work(ib_wq, &umem->work); + return; + } + } else + down_write(&mm->mmap_sem); + + mm->pinned_vm -= diff; + up_write(&mm->mmap_sem); + mmput(mm); +out: + kfree(umem); +} +EXPORT_SYMBOL(ib_umem_release); + +int ib_umem_page_count(struct ib_umem *umem) +{ + int shift; + int i; + int n; + struct scatterlist *sg; + + shift = ilog2(umem->page_size); + + n = 0; + for_each_sg(umem->sg_head.sgl, sg, umem->nmap, i) + n += sg_dma_len(sg) >> shift; + + return n; +} +EXPORT_SYMBOL(ib_umem_page_count); diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c new file mode 100644 index 0000000..928cdd2 --- /dev/null +++ b/drivers/infiniband/core/user_mad.c @@ -0,0 +1,1390 @@ +/* + * Copyright (c) 2004 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Voltaire, Inc. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2008 Cisco. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#define pr_fmt(fmt) "user_mad: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +MODULE_AUTHOR("Roland Dreier"); +MODULE_DESCRIPTION("InfiniBand userspace MAD packet access"); +MODULE_LICENSE("Dual BSD/GPL"); + +enum { + IB_UMAD_MAX_PORTS = 64, + IB_UMAD_MAX_AGENTS = 32, + + IB_UMAD_MAJOR = 231, + IB_UMAD_MINOR_BASE = 0 +}; + +/* + * Our lifetime rules for these structs are the following: + * device special file is opened, we take a reference on the + * ib_umad_port's struct ib_umad_device. We drop these + * references in the corresponding close(). + * + * In addition to references coming from open character devices, there + * is one more reference to each ib_umad_device representing the + * module's reference taken when allocating the ib_umad_device in + * ib_umad_add_one(). + * + * When destroying an ib_umad_device, we drop the module's reference. + */ + +struct ib_umad_port { + struct cdev cdev; + struct device *dev; + + struct cdev sm_cdev; + struct device *sm_dev; + struct semaphore sm_sem; + + struct mutex file_mutex; + struct list_head file_list; + + struct ib_device *ib_dev; + struct ib_umad_device *umad_dev; + int dev_num; + u8 port_num; +}; + +struct ib_umad_device { + int start_port, end_port; + struct kobject kobj; + struct ib_umad_port port[0]; +}; + +struct ib_umad_file { + struct mutex mutex; + struct ib_umad_port *port; + struct list_head recv_list; + struct list_head send_list; + struct list_head port_list; + spinlock_t send_lock; + wait_queue_head_t recv_wait; + struct ib_mad_agent *agent[IB_UMAD_MAX_AGENTS]; + int agents_dead; + u8 use_pkey_index; + u8 already_used; +}; + +struct ib_umad_packet { + struct ib_mad_send_buf *msg; + struct ib_mad_recv_wc *recv_wc; + struct list_head list; + int length; + struct ib_user_mad mad; +}; + +static struct class *umad_class; + +static const dev_t base_dev = MKDEV(IB_UMAD_MAJOR, IB_UMAD_MINOR_BASE); + +static DEFINE_SPINLOCK(port_lock); +static DECLARE_BITMAP(dev_map, IB_UMAD_MAX_PORTS); + +static void ib_umad_add_one(struct ib_device *device); +static void ib_umad_remove_one(struct ib_device *device); + +static void ib_umad_release_dev(struct kobject *kobj) +{ + struct ib_umad_device *dev = + container_of(kobj, struct ib_umad_device, kobj); + + kfree(dev); +} + +static struct kobj_type ib_umad_dev_ktype = { + .release = ib_umad_release_dev, +}; + +static int hdr_size(struct ib_umad_file *file) +{ + return file->use_pkey_index ? sizeof (struct ib_user_mad_hdr) : + sizeof (struct ib_user_mad_hdr_old); +} + +/* caller must hold file->mutex */ +static struct ib_mad_agent *__get_agent(struct ib_umad_file *file, int id) +{ + return file->agents_dead ? NULL : file->agent[id]; +} + +static int queue_packet(struct ib_umad_file *file, + struct ib_mad_agent *agent, + struct ib_umad_packet *packet) +{ + int ret = 1; + + mutex_lock(&file->mutex); + + for (packet->mad.hdr.id = 0; + packet->mad.hdr.id < IB_UMAD_MAX_AGENTS; + packet->mad.hdr.id++) + if (agent == __get_agent(file, packet->mad.hdr.id)) { + list_add_tail(&packet->list, &file->recv_list); + wake_up_interruptible(&file->recv_wait); + ret = 0; + break; + } + + mutex_unlock(&file->mutex); + + return ret; +} + +static void dequeue_send(struct ib_umad_file *file, + struct ib_umad_packet *packet) +{ + spin_lock_irq(&file->send_lock); + list_del(&packet->list); + spin_unlock_irq(&file->send_lock); +} + +static void send_handler(struct ib_mad_agent *agent, + struct ib_mad_send_wc *send_wc) +{ + struct ib_umad_file *file = agent->context; + struct ib_umad_packet *packet = send_wc->send_buf->context[0]; + + dequeue_send(file, packet); + ib_destroy_ah(packet->msg->ah); + ib_free_send_mad(packet->msg); + + if (send_wc->status == IB_WC_RESP_TIMEOUT_ERR) { + packet->length = IB_MGMT_MAD_HDR; + packet->mad.hdr.status = ETIMEDOUT; + if (!queue_packet(file, agent, packet)) + return; + } + kfree(packet); +} + +static void recv_handler(struct ib_mad_agent *agent, + struct ib_mad_recv_wc *mad_recv_wc) +{ + struct ib_umad_file *file = agent->context; + struct ib_umad_packet *packet; + + if (mad_recv_wc->wc->status != IB_WC_SUCCESS) + goto err1; + + packet = kzalloc(sizeof *packet, GFP_KERNEL); + if (!packet) + goto err1; + + packet->length = mad_recv_wc->mad_len; + packet->recv_wc = mad_recv_wc; + + packet->mad.hdr.status = 0; + packet->mad.hdr.length = hdr_size(file) + mad_recv_wc->mad_len; + packet->mad.hdr.qpn = cpu_to_be32(mad_recv_wc->wc->src_qp); + packet->mad.hdr.lid = cpu_to_be16(mad_recv_wc->wc->slid); + packet->mad.hdr.sl = mad_recv_wc->wc->sl; + packet->mad.hdr.path_bits = mad_recv_wc->wc->dlid_path_bits; + packet->mad.hdr.pkey_index = mad_recv_wc->wc->pkey_index; + packet->mad.hdr.grh_present = !!(mad_recv_wc->wc->wc_flags & IB_WC_GRH); + if (packet->mad.hdr.grh_present) { + struct ib_ah_attr ah_attr; + + ib_init_ah_from_wc(agent->device, agent->port_num, + mad_recv_wc->wc, mad_recv_wc->recv_buf.grh, + &ah_attr); + + packet->mad.hdr.gid_index = ah_attr.grh.sgid_index; + packet->mad.hdr.hop_limit = ah_attr.grh.hop_limit; + packet->mad.hdr.traffic_class = ah_attr.grh.traffic_class; + memcpy(packet->mad.hdr.gid, &ah_attr.grh.dgid, 16); + packet->mad.hdr.flow_label = cpu_to_be32(ah_attr.grh.flow_label); + } + + if (queue_packet(file, agent, packet)) + goto err2; + return; + +err2: + kfree(packet); +err1: + ib_free_recv_mad(mad_recv_wc); +} + +static ssize_t copy_recv_mad(struct ib_umad_file *file, char __user *buf, + struct ib_umad_packet *packet, size_t count) +{ + struct ib_mad_recv_buf *recv_buf; + int left, seg_payload, offset, max_seg_payload; + + /* We need enough room to copy the first (or only) MAD segment. */ + recv_buf = &packet->recv_wc->recv_buf; + if ((packet->length <= sizeof (*recv_buf->mad) && + count < hdr_size(file) + packet->length) || + (packet->length > sizeof (*recv_buf->mad) && + count < hdr_size(file) + sizeof (*recv_buf->mad))) + return -EINVAL; + + if (copy_to_user(buf, &packet->mad, hdr_size(file))) + return -EFAULT; + + buf += hdr_size(file); + seg_payload = min_t(int, packet->length, sizeof (*recv_buf->mad)); + if (copy_to_user(buf, recv_buf->mad, seg_payload)) + return -EFAULT; + + if (seg_payload < packet->length) { + /* + * Multipacket RMPP MAD message. Copy remainder of message. + * Note that last segment may have a shorter payload. + */ + if (count < hdr_size(file) + packet->length) { + /* + * The buffer is too small, return the first RMPP segment, + * which includes the RMPP message length. + */ + return -ENOSPC; + } + offset = ib_get_mad_data_offset(recv_buf->mad->mad_hdr.mgmt_class); + max_seg_payload = sizeof (struct ib_mad) - offset; + + for (left = packet->length - seg_payload, buf += seg_payload; + left; left -= seg_payload, buf += seg_payload) { + recv_buf = container_of(recv_buf->list.next, + struct ib_mad_recv_buf, list); + seg_payload = min(left, max_seg_payload); + if (copy_to_user(buf, ((void *) recv_buf->mad) + offset, + seg_payload)) + return -EFAULT; + } + } + return hdr_size(file) + packet->length; +} + +static ssize_t copy_send_mad(struct ib_umad_file *file, char __user *buf, + struct ib_umad_packet *packet, size_t count) +{ + ssize_t size = hdr_size(file) + packet->length; + + if (count < size) + return -EINVAL; + + if (copy_to_user(buf, &packet->mad, hdr_size(file))) + return -EFAULT; + + buf += hdr_size(file); + + if (copy_to_user(buf, packet->mad.data, packet->length)) + return -EFAULT; + + return size; +} + +static ssize_t ib_umad_read(struct file *filp, char __user *buf, + size_t count, loff_t *pos) +{ + struct ib_umad_file *file = filp->private_data; + struct ib_umad_packet *packet; + ssize_t ret; + + if (count < hdr_size(file)) + return -EINVAL; + + mutex_lock(&file->mutex); + + while (list_empty(&file->recv_list)) { + mutex_unlock(&file->mutex); + + if (filp->f_flags & O_NONBLOCK) + return -EAGAIN; + + if (wait_event_interruptible(file->recv_wait, + !list_empty(&file->recv_list))) + return -ERESTARTSYS; + + mutex_lock(&file->mutex); + } + + packet = list_entry(file->recv_list.next, struct ib_umad_packet, list); + list_del(&packet->list); + + mutex_unlock(&file->mutex); + + if (packet->recv_wc) + ret = copy_recv_mad(file, buf, packet, count); + else + ret = copy_send_mad(file, buf, packet, count); + + if (ret < 0) { + /* Requeue packet */ + mutex_lock(&file->mutex); + list_add(&packet->list, &file->recv_list); + mutex_unlock(&file->mutex); + } else { + if (packet->recv_wc) + ib_free_recv_mad(packet->recv_wc); + kfree(packet); + } + return ret; +} + +static int copy_rmpp_mad(struct ib_mad_send_buf *msg, const char __user *buf) +{ + int left, seg; + + /* Copy class specific header */ + if ((msg->hdr_len > IB_MGMT_RMPP_HDR) && + copy_from_user(msg->mad + IB_MGMT_RMPP_HDR, buf + IB_MGMT_RMPP_HDR, + msg->hdr_len - IB_MGMT_RMPP_HDR)) + return -EFAULT; + + /* All headers are in place. Copy data segments. */ + for (seg = 1, left = msg->data_len, buf += msg->hdr_len; left > 0; + seg++, left -= msg->seg_size, buf += msg->seg_size) { + if (copy_from_user(ib_get_rmpp_segment(msg, seg), buf, + min(left, msg->seg_size))) + return -EFAULT; + } + return 0; +} + +static int same_destination(struct ib_user_mad_hdr *hdr1, + struct ib_user_mad_hdr *hdr2) +{ + if (!hdr1->grh_present && !hdr2->grh_present) + return (hdr1->lid == hdr2->lid); + + if (hdr1->grh_present && hdr2->grh_present) + return !memcmp(hdr1->gid, hdr2->gid, 16); + + return 0; +} + +static int is_duplicate(struct ib_umad_file *file, + struct ib_umad_packet *packet) +{ + struct ib_umad_packet *sent_packet; + struct ib_mad_hdr *sent_hdr, *hdr; + + hdr = (struct ib_mad_hdr *) packet->mad.data; + list_for_each_entry(sent_packet, &file->send_list, list) { + sent_hdr = (struct ib_mad_hdr *) sent_packet->mad.data; + + if ((hdr->tid != sent_hdr->tid) || + (hdr->mgmt_class != sent_hdr->mgmt_class)) + continue; + + /* + * No need to be overly clever here. If two new operations have + * the same TID, reject the second as a duplicate. This is more + * restrictive than required by the spec. + */ + if (!ib_response_mad((struct ib_mad *) hdr)) { + if (!ib_response_mad((struct ib_mad *) sent_hdr)) + return 1; + continue; + } else if (!ib_response_mad((struct ib_mad *) sent_hdr)) + continue; + + if (same_destination(&packet->mad.hdr, &sent_packet->mad.hdr)) + return 1; + } + + return 0; +} + +static ssize_t ib_umad_write(struct file *filp, const char __user *buf, + size_t count, loff_t *pos) +{ + struct ib_umad_file *file = filp->private_data; + struct ib_umad_packet *packet; + struct ib_mad_agent *agent; + struct ib_ah_attr ah_attr; + struct ib_ah *ah; + struct ib_rmpp_mad *rmpp_mad; + __be64 *tid; + int ret, data_len, hdr_len, copy_offset, rmpp_active; + + if (count < hdr_size(file) + IB_MGMT_RMPP_HDR) + return -EINVAL; + + packet = kzalloc(sizeof *packet + IB_MGMT_RMPP_HDR, GFP_KERNEL); + if (!packet) + return -ENOMEM; + + if (copy_from_user(&packet->mad, buf, hdr_size(file))) { + ret = -EFAULT; + goto err; + } + + if (packet->mad.hdr.id >= IB_UMAD_MAX_AGENTS) { + ret = -EINVAL; + goto err; + } + + buf += hdr_size(file); + + if (copy_from_user(packet->mad.data, buf, IB_MGMT_RMPP_HDR)) { + ret = -EFAULT; + goto err; + } + + mutex_lock(&file->mutex); + + agent = __get_agent(file, packet->mad.hdr.id); + if (!agent) { + ret = -EINVAL; + goto err_up; + } + + memset(&ah_attr, 0, sizeof ah_attr); + ah_attr.dlid = be16_to_cpu(packet->mad.hdr.lid); + ah_attr.sl = packet->mad.hdr.sl; + ah_attr.src_path_bits = packet->mad.hdr.path_bits; + ah_attr.port_num = file->port->port_num; + if (packet->mad.hdr.grh_present) { + ah_attr.ah_flags = IB_AH_GRH; + memcpy(ah_attr.grh.dgid.raw, packet->mad.hdr.gid, 16); + ah_attr.grh.sgid_index = packet->mad.hdr.gid_index; + ah_attr.grh.flow_label = be32_to_cpu(packet->mad.hdr.flow_label); + ah_attr.grh.hop_limit = packet->mad.hdr.hop_limit; + ah_attr.grh.traffic_class = packet->mad.hdr.traffic_class; + } + + ah = ib_create_ah(agent->qp->pd, &ah_attr); + if (IS_ERR(ah)) { + ret = PTR_ERR(ah); + goto err_up; + } + + rmpp_mad = (struct ib_rmpp_mad *) packet->mad.data; + hdr_len = ib_get_mad_data_offset(rmpp_mad->mad_hdr.mgmt_class); + + if (ib_is_mad_class_rmpp(rmpp_mad->mad_hdr.mgmt_class) + && ib_mad_kernel_rmpp_agent(agent)) { + copy_offset = IB_MGMT_RMPP_HDR; + rmpp_active = ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) & + IB_MGMT_RMPP_FLAG_ACTIVE; + } else { + copy_offset = IB_MGMT_MAD_HDR; + rmpp_active = 0; + } + + data_len = count - hdr_size(file) - hdr_len; + packet->msg = ib_create_send_mad(agent, + be32_to_cpu(packet->mad.hdr.qpn), + packet->mad.hdr.pkey_index, rmpp_active, + hdr_len, data_len, GFP_KERNEL); + if (IS_ERR(packet->msg)) { + ret = PTR_ERR(packet->msg); + goto err_ah; + } + + packet->msg->ah = ah; + packet->msg->timeout_ms = packet->mad.hdr.timeout_ms; + packet->msg->retries = packet->mad.hdr.retries; + packet->msg->context[0] = packet; + + /* Copy MAD header. Any RMPP header is already in place. */ + memcpy(packet->msg->mad, packet->mad.data, IB_MGMT_MAD_HDR); + + if (!rmpp_active) { + if (copy_from_user(packet->msg->mad + copy_offset, + buf + copy_offset, + hdr_len + data_len - copy_offset)) { + ret = -EFAULT; + goto err_msg; + } + } else { + ret = copy_rmpp_mad(packet->msg, buf); + if (ret) + goto err_msg; + } + + /* + * Set the high-order part of the transaction ID to make MADs from + * different agents unique, and allow routing responses back to the + * original requestor. + */ + if (!ib_response_mad(packet->msg->mad)) { + tid = &((struct ib_mad_hdr *) packet->msg->mad)->tid; + *tid = cpu_to_be64(((u64) agent->hi_tid) << 32 | + (be64_to_cpup(tid) & 0xffffffff)); + rmpp_mad->mad_hdr.tid = *tid; + } + + if (!ib_mad_kernel_rmpp_agent(agent) + && ib_is_mad_class_rmpp(rmpp_mad->mad_hdr.mgmt_class) + && (ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) & IB_MGMT_RMPP_FLAG_ACTIVE)) { + spin_lock_irq(&file->send_lock); + list_add_tail(&packet->list, &file->send_list); + spin_unlock_irq(&file->send_lock); + } else { + spin_lock_irq(&file->send_lock); + ret = is_duplicate(file, packet); + if (!ret) + list_add_tail(&packet->list, &file->send_list); + spin_unlock_irq(&file->send_lock); + if (ret) { + ret = -EINVAL; + goto err_msg; + } + } + + ret = ib_post_send_mad(packet->msg, NULL); + if (ret) + goto err_send; + + mutex_unlock(&file->mutex); + return count; + +err_send: + dequeue_send(file, packet); +err_msg: + ib_free_send_mad(packet->msg); +err_ah: + ib_destroy_ah(ah); +err_up: + mutex_unlock(&file->mutex); +err: + kfree(packet); + return ret; +} + +static unsigned int ib_umad_poll(struct file *filp, struct poll_table_struct *wait) +{ + struct ib_umad_file *file = filp->private_data; + + /* we will always be able to post a MAD send */ + unsigned int mask = POLLOUT | POLLWRNORM; + + poll_wait(filp, &file->recv_wait, wait); + + if (!list_empty(&file->recv_list)) + mask |= POLLIN | POLLRDNORM; + + return mask; +} + +static int ib_umad_reg_agent(struct ib_umad_file *file, void __user *arg, + int compat_method_mask) +{ + struct ib_user_mad_reg_req ureq; + struct ib_mad_reg_req req; + struct ib_mad_agent *agent = NULL; + int agent_id; + int ret; + + mutex_lock(&file->port->file_mutex); + mutex_lock(&file->mutex); + + if (!file->port->ib_dev) { + dev_notice(file->port->dev, + "ib_umad_reg_agent: invalid device\n"); + ret = -EPIPE; + goto out; + } + + if (copy_from_user(&ureq, arg, sizeof ureq)) { + ret = -EFAULT; + goto out; + } + + if (ureq.qpn != 0 && ureq.qpn != 1) { + dev_notice(file->port->dev, + "ib_umad_reg_agent: invalid QPN %d specified\n", + ureq.qpn); + ret = -EINVAL; + goto out; + } + + for (agent_id = 0; agent_id < IB_UMAD_MAX_AGENTS; ++agent_id) + if (!__get_agent(file, agent_id)) + goto found; + + dev_notice(file->port->dev, + "ib_umad_reg_agent: Max Agents (%u) reached\n", + IB_UMAD_MAX_AGENTS); + ret = -ENOMEM; + goto out; + +found: + if (ureq.mgmt_class) { + memset(&req, 0, sizeof(req)); + req.mgmt_class = ureq.mgmt_class; + req.mgmt_class_version = ureq.mgmt_class_version; + memcpy(req.oui, ureq.oui, sizeof req.oui); + + if (compat_method_mask) { + u32 *umm = (u32 *) ureq.method_mask; + int i; + + for (i = 0; i < BITS_TO_LONGS(IB_MGMT_MAX_METHODS); ++i) + req.method_mask[i] = + umm[i * 2] | ((u64) umm[i * 2 + 1] << 32); + } else + memcpy(req.method_mask, ureq.method_mask, + sizeof req.method_mask); + } + + agent = ib_register_mad_agent(file->port->ib_dev, file->port->port_num, + ureq.qpn ? IB_QPT_GSI : IB_QPT_SMI, + ureq.mgmt_class ? &req : NULL, + ureq.rmpp_version, + send_handler, recv_handler, file, 0); + if (IS_ERR(agent)) { + ret = PTR_ERR(agent); + agent = NULL; + goto out; + } + + if (put_user(agent_id, + (u32 __user *) (arg + offsetof(struct ib_user_mad_reg_req, id)))) { + ret = -EFAULT; + goto out; + } + + if (!file->already_used) { + file->already_used = 1; + if (!file->use_pkey_index) { + dev_warn(file->port->dev, + "process %s did not enable P_Key index support.\n", + current->comm); + dev_warn(file->port->dev, + " Documentation/infiniband/user_mad.txt has info on the new ABI.\n"); + } + } + + file->agent[agent_id] = agent; + ret = 0; + +out: + mutex_unlock(&file->mutex); + + if (ret && agent) + ib_unregister_mad_agent(agent); + + mutex_unlock(&file->port->file_mutex); + + return ret; +} + +static int ib_umad_reg_agent2(struct ib_umad_file *file, void __user *arg) +{ + struct ib_user_mad_reg_req2 ureq; + struct ib_mad_reg_req req; + struct ib_mad_agent *agent = NULL; + int agent_id; + int ret; + + mutex_lock(&file->port->file_mutex); + mutex_lock(&file->mutex); + + if (!file->port->ib_dev) { + dev_notice(file->port->dev, + "ib_umad_reg_agent2: invalid device\n"); + ret = -EPIPE; + goto out; + } + + if (copy_from_user(&ureq, arg, sizeof(ureq))) { + ret = -EFAULT; + goto out; + } + + if (ureq.qpn != 0 && ureq.qpn != 1) { + dev_notice(file->port->dev, + "ib_umad_reg_agent2: invalid QPN %d specified\n", + ureq.qpn); + ret = -EINVAL; + goto out; + } + + if (ureq.flags & ~IB_USER_MAD_REG_FLAGS_CAP) { + dev_notice(file->port->dev, + "ib_umad_reg_agent2 failed: invalid registration flags specified 0x%x; supported 0x%x\n", + ureq.flags, IB_USER_MAD_REG_FLAGS_CAP); + ret = -EINVAL; + + if (put_user((u32)IB_USER_MAD_REG_FLAGS_CAP, + (u32 __user *) (arg + offsetof(struct + ib_user_mad_reg_req2, flags)))) + ret = -EFAULT; + + goto out; + } + + for (agent_id = 0; agent_id < IB_UMAD_MAX_AGENTS; ++agent_id) + if (!__get_agent(file, agent_id)) + goto found; + + dev_notice(file->port->dev, + "ib_umad_reg_agent2: Max Agents (%u) reached\n", + IB_UMAD_MAX_AGENTS); + ret = -ENOMEM; + goto out; + +found: + if (ureq.mgmt_class) { + memset(&req, 0, sizeof(req)); + req.mgmt_class = ureq.mgmt_class; + req.mgmt_class_version = ureq.mgmt_class_version; + if (ureq.oui & 0xff000000) { + dev_notice(file->port->dev, + "ib_umad_reg_agent2 failed: oui invalid 0x%08x\n", + ureq.oui); + ret = -EINVAL; + goto out; + } + req.oui[2] = ureq.oui & 0x0000ff; + req.oui[1] = (ureq.oui & 0x00ff00) >> 8; + req.oui[0] = (ureq.oui & 0xff0000) >> 16; + memcpy(req.method_mask, ureq.method_mask, + sizeof(req.method_mask)); + } + + agent = ib_register_mad_agent(file->port->ib_dev, file->port->port_num, + ureq.qpn ? IB_QPT_GSI : IB_QPT_SMI, + ureq.mgmt_class ? &req : NULL, + ureq.rmpp_version, + send_handler, recv_handler, file, + ureq.flags); + if (IS_ERR(agent)) { + ret = PTR_ERR(agent); + agent = NULL; + goto out; + } + + if (put_user(agent_id, + (u32 __user *)(arg + + offsetof(struct ib_user_mad_reg_req2, id)))) { + ret = -EFAULT; + goto out; + } + + if (!file->already_used) { + file->already_used = 1; + file->use_pkey_index = 1; + } + + file->agent[agent_id] = agent; + ret = 0; + +out: + mutex_unlock(&file->mutex); + + if (ret && agent) + ib_unregister_mad_agent(agent); + + mutex_unlock(&file->port->file_mutex); + + return ret; +} + + +static int ib_umad_unreg_agent(struct ib_umad_file *file, u32 __user *arg) +{ + struct ib_mad_agent *agent = NULL; + u32 id; + int ret = 0; + + if (get_user(id, arg)) + return -EFAULT; + + mutex_lock(&file->port->file_mutex); + mutex_lock(&file->mutex); + + if (id >= IB_UMAD_MAX_AGENTS || !__get_agent(file, id)) { + ret = -EINVAL; + goto out; + } + + agent = file->agent[id]; + file->agent[id] = NULL; + +out: + mutex_unlock(&file->mutex); + + if (agent) + ib_unregister_mad_agent(agent); + + mutex_unlock(&file->port->file_mutex); + + return ret; +} + +static long ib_umad_enable_pkey(struct ib_umad_file *file) +{ + int ret = 0; + + mutex_lock(&file->mutex); + if (file->already_used) + ret = -EINVAL; + else + file->use_pkey_index = 1; + mutex_unlock(&file->mutex); + + return ret; +} + +static long ib_umad_ioctl(struct file *filp, unsigned int cmd, + unsigned long arg) +{ + switch (cmd) { + case IB_USER_MAD_REGISTER_AGENT: + return ib_umad_reg_agent(filp->private_data, (void __user *) arg, 0); + case IB_USER_MAD_UNREGISTER_AGENT: + return ib_umad_unreg_agent(filp->private_data, (__u32 __user *) arg); + case IB_USER_MAD_ENABLE_PKEY: + return ib_umad_enable_pkey(filp->private_data); + case IB_USER_MAD_REGISTER_AGENT2: + return ib_umad_reg_agent2(filp->private_data, (void __user *) arg); + default: + return -ENOIOCTLCMD; + } +} + +#ifdef CONFIG_COMPAT +static long ib_umad_compat_ioctl(struct file *filp, unsigned int cmd, + unsigned long arg) +{ + switch (cmd) { + case IB_USER_MAD_REGISTER_AGENT: + return ib_umad_reg_agent(filp->private_data, compat_ptr(arg), 1); + case IB_USER_MAD_UNREGISTER_AGENT: + return ib_umad_unreg_agent(filp->private_data, compat_ptr(arg)); + case IB_USER_MAD_ENABLE_PKEY: + return ib_umad_enable_pkey(filp->private_data); + case IB_USER_MAD_REGISTER_AGENT2: + return ib_umad_reg_agent2(filp->private_data, compat_ptr(arg)); + default: + return -ENOIOCTLCMD; + } +} +#endif + +/* + * ib_umad_open() does not need the BKL: + * + * - the ib_umad_port structures are properly reference counted, and + * everything else is purely local to the file being created, so + * races against other open calls are not a problem; + * - the ioctl method does not affect any global state outside of the + * file structure being operated on; + */ +static int ib_umad_open(struct inode *inode, struct file *filp) +{ + struct ib_umad_port *port; + struct ib_umad_file *file; + int ret = -ENXIO; + + port = container_of(inode->i_cdev, struct ib_umad_port, cdev); + + mutex_lock(&port->file_mutex); + + if (!port->ib_dev) + goto out; + + ret = -ENOMEM; + file = kzalloc(sizeof *file, GFP_KERNEL); + if (!file) + goto out; + + mutex_init(&file->mutex); + spin_lock_init(&file->send_lock); + INIT_LIST_HEAD(&file->recv_list); + INIT_LIST_HEAD(&file->send_list); + init_waitqueue_head(&file->recv_wait); + + file->port = port; + filp->private_data = file; + + list_add_tail(&file->port_list, &port->file_list); + + ret = nonseekable_open(inode, filp); + if (ret) { + list_del(&file->port_list); + kfree(file); + goto out; + } + + kobject_get(&port->umad_dev->kobj); + +out: + mutex_unlock(&port->file_mutex); + return ret; +} + +static int ib_umad_close(struct inode *inode, struct file *filp) +{ + struct ib_umad_file *file = filp->private_data; + struct ib_umad_device *dev = file->port->umad_dev; + struct ib_umad_packet *packet, *tmp; + int already_dead; + int i; + + mutex_lock(&file->port->file_mutex); + mutex_lock(&file->mutex); + + already_dead = file->agents_dead; + file->agents_dead = 1; + + list_for_each_entry_safe(packet, tmp, &file->recv_list, list) { + if (packet->recv_wc) + ib_free_recv_mad(packet->recv_wc); + kfree(packet); + } + + list_del(&file->port_list); + + mutex_unlock(&file->mutex); + + if (!already_dead) + for (i = 0; i < IB_UMAD_MAX_AGENTS; ++i) + if (file->agent[i]) + ib_unregister_mad_agent(file->agent[i]); + + mutex_unlock(&file->port->file_mutex); + + kfree(file); + kobject_put(&dev->kobj); + + return 0; +} + +static const struct file_operations umad_fops = { + .owner = THIS_MODULE, + .read = ib_umad_read, + .write = ib_umad_write, + .poll = ib_umad_poll, + .unlocked_ioctl = ib_umad_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = ib_umad_compat_ioctl, +#endif + .open = ib_umad_open, + .release = ib_umad_close, + .llseek = no_llseek, +}; + +static int ib_umad_sm_open(struct inode *inode, struct file *filp) +{ + struct ib_umad_port *port; + struct ib_port_modify props = { + .set_port_cap_mask = IB_PORT_SM + }; + int ret; + + port = container_of(inode->i_cdev, struct ib_umad_port, sm_cdev); + + if (filp->f_flags & O_NONBLOCK) { + if (down_trylock(&port->sm_sem)) { + ret = -EAGAIN; + goto fail; + } + } else { + if (down_interruptible(&port->sm_sem)) { + ret = -ERESTARTSYS; + goto fail; + } + } + + ret = ib_modify_port(port->ib_dev, port->port_num, 0, &props); + if (ret) + goto err_up_sem; + + filp->private_data = port; + + ret = nonseekable_open(inode, filp); + if (ret) + goto err_clr_sm_cap; + + kobject_get(&port->umad_dev->kobj); + + return 0; + +err_clr_sm_cap: + swap(props.set_port_cap_mask, props.clr_port_cap_mask); + ib_modify_port(port->ib_dev, port->port_num, 0, &props); + +err_up_sem: + up(&port->sm_sem); + +fail: + return ret; +} + +static int ib_umad_sm_close(struct inode *inode, struct file *filp) +{ + struct ib_umad_port *port = filp->private_data; + struct ib_port_modify props = { + .clr_port_cap_mask = IB_PORT_SM + }; + int ret = 0; + + mutex_lock(&port->file_mutex); + if (port->ib_dev) + ret = ib_modify_port(port->ib_dev, port->port_num, 0, &props); + mutex_unlock(&port->file_mutex); + + up(&port->sm_sem); + + kobject_put(&port->umad_dev->kobj); + + return ret; +} + +static const struct file_operations umad_sm_fops = { + .owner = THIS_MODULE, + .open = ib_umad_sm_open, + .release = ib_umad_sm_close, + .llseek = no_llseek, +}; + +static struct ib_client umad_client = { + .name = "umad", + .add = ib_umad_add_one, + .remove = ib_umad_remove_one +}; + +static ssize_t show_ibdev(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct ib_umad_port *port = dev_get_drvdata(dev); + + if (!port) + return -ENODEV; + + return sprintf(buf, "%s\n", port->ib_dev->name); +} +static DEVICE_ATTR(ibdev, S_IRUGO, show_ibdev, NULL); + +static ssize_t show_port(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct ib_umad_port *port = dev_get_drvdata(dev); + + if (!port) + return -ENODEV; + + return sprintf(buf, "%d\n", port->port_num); +} +static DEVICE_ATTR(port, S_IRUGO, show_port, NULL); + +static CLASS_ATTR_STRING(abi_version, S_IRUGO, + __stringify(IB_USER_MAD_ABI_VERSION)); + +static dev_t overflow_maj; +static DECLARE_BITMAP(overflow_map, IB_UMAD_MAX_PORTS); +static int find_overflow_devnum(struct ib_device *device) +{ + int ret; + + if (!overflow_maj) { + ret = alloc_chrdev_region(&overflow_maj, 0, IB_UMAD_MAX_PORTS * 2, + "infiniband_mad"); + if (ret) { + dev_err(&device->dev, + "couldn't register dynamic device number\n"); + return ret; + } + } + + ret = find_first_zero_bit(overflow_map, IB_UMAD_MAX_PORTS); + if (ret >= IB_UMAD_MAX_PORTS) + return -1; + + return ret; +} + +static int ib_umad_init_port(struct ib_device *device, int port_num, + struct ib_umad_device *umad_dev, + struct ib_umad_port *port) +{ + int devnum; + dev_t base; + + spin_lock(&port_lock); + devnum = find_first_zero_bit(dev_map, IB_UMAD_MAX_PORTS); + if (devnum >= IB_UMAD_MAX_PORTS) { + spin_unlock(&port_lock); + devnum = find_overflow_devnum(device); + if (devnum < 0) + return -1; + + spin_lock(&port_lock); + port->dev_num = devnum + IB_UMAD_MAX_PORTS; + base = devnum + overflow_maj; + set_bit(devnum, overflow_map); + } else { + port->dev_num = devnum; + base = devnum + base_dev; + set_bit(devnum, dev_map); + } + spin_unlock(&port_lock); + + port->ib_dev = device; + port->port_num = port_num; + sema_init(&port->sm_sem, 1); + mutex_init(&port->file_mutex); + INIT_LIST_HEAD(&port->file_list); + + cdev_init(&port->cdev, &umad_fops); + port->cdev.owner = THIS_MODULE; + port->cdev.kobj.parent = &umad_dev->kobj; + kobject_set_name(&port->cdev.kobj, "umad%d", port->dev_num); + if (cdev_add(&port->cdev, base, 1)) + goto err_cdev; + + port->dev = device_create(umad_class, device->dma_device, + port->cdev.dev, port, + "umad%d", port->dev_num); + if (IS_ERR(port->dev)) + goto err_cdev; + + if (device_create_file(port->dev, &dev_attr_ibdev)) + goto err_dev; + if (device_create_file(port->dev, &dev_attr_port)) + goto err_dev; + + base += IB_UMAD_MAX_PORTS; + cdev_init(&port->sm_cdev, &umad_sm_fops); + port->sm_cdev.owner = THIS_MODULE; + port->sm_cdev.kobj.parent = &umad_dev->kobj; + kobject_set_name(&port->sm_cdev.kobj, "issm%d", port->dev_num); + if (cdev_add(&port->sm_cdev, base, 1)) + goto err_sm_cdev; + + port->sm_dev = device_create(umad_class, device->dma_device, + port->sm_cdev.dev, port, + "issm%d", port->dev_num); + if (IS_ERR(port->sm_dev)) + goto err_sm_cdev; + + if (device_create_file(port->sm_dev, &dev_attr_ibdev)) + goto err_sm_dev; + if (device_create_file(port->sm_dev, &dev_attr_port)) + goto err_sm_dev; + + return 0; + +err_sm_dev: + device_destroy(umad_class, port->sm_cdev.dev); + +err_sm_cdev: + cdev_del(&port->sm_cdev); + +err_dev: + device_destroy(umad_class, port->cdev.dev); + +err_cdev: + cdev_del(&port->cdev); + if (port->dev_num < IB_UMAD_MAX_PORTS) + clear_bit(devnum, dev_map); + else + clear_bit(devnum, overflow_map); + + return -1; +} + +static void ib_umad_kill_port(struct ib_umad_port *port) +{ + struct ib_umad_file *file; + int id; + + dev_set_drvdata(port->dev, NULL); + dev_set_drvdata(port->sm_dev, NULL); + + device_destroy(umad_class, port->cdev.dev); + device_destroy(umad_class, port->sm_cdev.dev); + + cdev_del(&port->cdev); + cdev_del(&port->sm_cdev); + + mutex_lock(&port->file_mutex); + + port->ib_dev = NULL; + + list_for_each_entry(file, &port->file_list, port_list) { + mutex_lock(&file->mutex); + file->agents_dead = 1; + mutex_unlock(&file->mutex); + + for (id = 0; id < IB_UMAD_MAX_AGENTS; ++id) + if (file->agent[id]) + ib_unregister_mad_agent(file->agent[id]); + } + + mutex_unlock(&port->file_mutex); + + if (port->dev_num < IB_UMAD_MAX_PORTS) + clear_bit(port->dev_num, dev_map); + else + clear_bit(port->dev_num - IB_UMAD_MAX_PORTS, overflow_map); +} + +static void ib_umad_add_one(struct ib_device *device) +{ + struct ib_umad_device *umad_dev; + int s, e, i; + + if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB) + return; + + if (device->node_type == RDMA_NODE_IB_SWITCH) + s = e = 0; + else { + s = 1; + e = device->phys_port_cnt; + } + + umad_dev = kzalloc(sizeof *umad_dev + + (e - s + 1) * sizeof (struct ib_umad_port), + GFP_KERNEL); + if (!umad_dev) + return; + + kobject_init(&umad_dev->kobj, &ib_umad_dev_ktype); + + umad_dev->start_port = s; + umad_dev->end_port = e; + + for (i = s; i <= e; ++i) { + umad_dev->port[i - s].umad_dev = umad_dev; + + if (ib_umad_init_port(device, i, umad_dev, + &umad_dev->port[i - s])) + goto err; + } + + ib_set_client_data(device, &umad_client, umad_dev); + + return; + +err: + while (--i >= s) + ib_umad_kill_port(&umad_dev->port[i - s]); + + kobject_put(&umad_dev->kobj); +} + +static void ib_umad_remove_one(struct ib_device *device) +{ + struct ib_umad_device *umad_dev = ib_get_client_data(device, &umad_client); + int i; + + if (!umad_dev) + return; + + for (i = 0; i <= umad_dev->end_port - umad_dev->start_port; ++i) + ib_umad_kill_port(&umad_dev->port[i]); + + kobject_put(&umad_dev->kobj); +} + +static char *umad_devnode(struct device *dev, umode_t *mode) +{ + return kasprintf(GFP_KERNEL, "infiniband/%s", dev_name(dev)); +} + +static int __init ib_umad_init(void) +{ + int ret; + + ret = register_chrdev_region(base_dev, IB_UMAD_MAX_PORTS * 2, + "infiniband_mad"); + if (ret) { + pr_err("couldn't register device number\n"); + goto out; + } + + umad_class = class_create(THIS_MODULE, "infiniband_mad"); + if (IS_ERR(umad_class)) { + ret = PTR_ERR(umad_class); + pr_err("couldn't create class infiniband_mad\n"); + goto out_chrdev; + } + + umad_class->devnode = umad_devnode; + + ret = class_create_file(umad_class, &class_attr_abi_version.attr); + if (ret) { + pr_err("couldn't create abi_version attribute\n"); + goto out_class; + } + + ret = ib_register_client(&umad_client); + if (ret) { + pr_err("couldn't register ib_umad client\n"); + goto out_class; + } + + return 0; + +out_class: + class_destroy(umad_class); + +out_chrdev: + unregister_chrdev_region(base_dev, IB_UMAD_MAX_PORTS * 2); + +out: + return ret; +} + +static void __exit ib_umad_cleanup(void) +{ + ib_unregister_client(&umad_client); + class_destroy(umad_class); + unregister_chrdev_region(base_dev, IB_UMAD_MAX_PORTS * 2); + if (overflow_maj) + unregister_chrdev_region(overflow_maj, IB_UMAD_MAX_PORTS * 2); +} + +module_init(ib_umad_init); +module_exit(ib_umad_cleanup); diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h new file mode 100644 index 0000000..643c08a --- /dev/null +++ b/drivers/infiniband/core/uverbs.h @@ -0,0 +1,262 @@ +/* + * Copyright (c) 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies. All rights reserved. + * Copyright (c) 2005 Voltaire, Inc. All rights reserved. + * Copyright (c) 2005 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef UVERBS_H +#define UVERBS_H + +#include +#include +#include +#include +#include + +#include +#include +#include + +#define INIT_UDATA(udata, ibuf, obuf, ilen, olen) \ + do { \ + (udata)->inbuf = (const void __user *) (ibuf); \ + (udata)->outbuf = (void __user *) (obuf); \ + (udata)->inlen = (ilen); \ + (udata)->outlen = (olen); \ + } while (0) + +#define INIT_UDATA_BUF_OR_NULL(udata, ibuf, obuf, ilen, olen) \ + do { \ + (udata)->inbuf = (ilen) ? (const void __user *) (ibuf) : NULL; \ + (udata)->outbuf = (olen) ? (void __user *) (obuf) : NULL; \ + (udata)->inlen = (ilen); \ + (udata)->outlen = (olen); \ + } while (0) + +/* + * Our lifetime rules for these structs are the following: + * + * struct ib_uverbs_device: One reference is held by the module and + * released in ib_uverbs_remove_one(). Another reference is taken by + * ib_uverbs_open() each time the character special file is opened, + * and released in ib_uverbs_release_file() when the file is released. + * + * struct ib_uverbs_file: One reference is held by the VFS and + * released when the file is closed. Another reference is taken when + * an asynchronous event queue file is created and released when the + * event file is closed. + * + * struct ib_uverbs_event_file: One reference is held by the VFS and + * released when the file is closed. For asynchronous event files, + * another reference is held by the corresponding main context file + * and released when that file is closed. For completion event files, + * a reference is taken when a CQ is created that uses the file, and + * released when the CQ is destroyed. + */ + +struct ib_uverbs_device { + struct kref ref; + int num_comp_vectors; + struct completion comp; + struct device *dev; + struct ib_device *ib_dev; + int devnum; + struct cdev cdev; + struct rb_root xrcd_tree; + struct mutex xrcd_tree_mutex; +}; + +struct ib_uverbs_event_file { + struct kref ref; + int is_async; + struct ib_uverbs_file *uverbs_file; + spinlock_t lock; + int is_closed; + wait_queue_head_t poll_wait; + struct fasync_struct *async_queue; + struct list_head event_list; +}; + +struct ib_uverbs_file { + struct kref ref; + struct mutex mutex; + struct ib_uverbs_device *device; + struct ib_ucontext *ucontext; + struct ib_event_handler event_handler; + struct ib_uverbs_event_file *async_file; +}; + +struct ib_uverbs_event { + union { + struct ib_uverbs_async_event_desc async; + struct ib_uverbs_comp_event_desc comp; + } desc; + struct list_head list; + struct list_head obj_list; + u32 *counter; +}; + +struct ib_uverbs_mcast_entry { + struct list_head list; + union ib_gid gid; + u16 lid; +}; + +struct ib_uevent_object { + struct ib_uobject uobject; + struct list_head event_list; + u32 events_reported; +}; + +struct ib_uxrcd_object { + struct ib_uobject uobject; + atomic_t refcnt; +}; + +struct ib_usrq_object { + struct ib_uevent_object uevent; + struct ib_uxrcd_object *uxrcd; +}; + +struct ib_uqp_object { + struct ib_uevent_object uevent; + struct list_head mcast_list; + struct ib_uxrcd_object *uxrcd; +}; + +struct ib_ucq_object { + struct ib_uobject uobject; + struct ib_uverbs_file *uverbs_file; + struct list_head comp_list; + struct list_head async_list; + u32 comp_events_reported; + u32 async_events_reported; +}; + +extern spinlock_t ib_uverbs_idr_lock; +extern struct idr ib_uverbs_pd_idr; +extern struct idr ib_uverbs_mr_idr; +extern struct idr ib_uverbs_mw_idr; +extern struct idr ib_uverbs_ah_idr; +extern struct idr ib_uverbs_cq_idr; +extern struct idr ib_uverbs_qp_idr; +extern struct idr ib_uverbs_srq_idr; +extern struct idr ib_uverbs_xrcd_idr; +extern struct idr ib_uverbs_rule_idr; + +void idr_remove_uobj(struct idr *idp, struct ib_uobject *uobj); + +struct file *ib_uverbs_alloc_event_file(struct ib_uverbs_file *uverbs_file, + int is_async); +struct ib_uverbs_event_file *ib_uverbs_lookup_comp_file(int fd); + +void ib_uverbs_release_ucq(struct ib_uverbs_file *file, + struct ib_uverbs_event_file *ev_file, + struct ib_ucq_object *uobj); +void ib_uverbs_release_uevent(struct ib_uverbs_file *file, + struct ib_uevent_object *uobj); + +void ib_uverbs_comp_handler(struct ib_cq *cq, void *cq_context); +void ib_uverbs_cq_event_handler(struct ib_event *event, void *context_ptr); +void ib_uverbs_qp_event_handler(struct ib_event *event, void *context_ptr); +void ib_uverbs_srq_event_handler(struct ib_event *event, void *context_ptr); +void ib_uverbs_event_handler(struct ib_event_handler *handler, + struct ib_event *event); +void ib_uverbs_dealloc_xrcd(struct ib_uverbs_device *dev, struct ib_xrcd *xrcd); + +struct ib_uverbs_flow_spec { + union { + union { + struct ib_uverbs_flow_spec_hdr hdr; + struct { + __u32 type; + __u16 size; + __u16 reserved; + }; + }; + struct ib_uverbs_flow_spec_eth eth; + struct ib_uverbs_flow_spec_ipv4 ipv4; + struct ib_uverbs_flow_spec_tcp_udp tcp_udp; + }; +}; + +#define IB_UVERBS_DECLARE_CMD(name) \ + ssize_t ib_uverbs_##name(struct ib_uverbs_file *file, \ + const char __user *buf, int in_len, \ + int out_len) + +IB_UVERBS_DECLARE_CMD(get_context); +IB_UVERBS_DECLARE_CMD(query_device); +IB_UVERBS_DECLARE_CMD(query_port); +IB_UVERBS_DECLARE_CMD(alloc_pd); +IB_UVERBS_DECLARE_CMD(dealloc_pd); +IB_UVERBS_DECLARE_CMD(reg_mr); +IB_UVERBS_DECLARE_CMD(rereg_mr); +IB_UVERBS_DECLARE_CMD(dereg_mr); +IB_UVERBS_DECLARE_CMD(alloc_mw); +IB_UVERBS_DECLARE_CMD(dealloc_mw); +IB_UVERBS_DECLARE_CMD(create_comp_channel); +IB_UVERBS_DECLARE_CMD(create_cq); +IB_UVERBS_DECLARE_CMD(resize_cq); +IB_UVERBS_DECLARE_CMD(poll_cq); +IB_UVERBS_DECLARE_CMD(req_notify_cq); +IB_UVERBS_DECLARE_CMD(destroy_cq); +IB_UVERBS_DECLARE_CMD(create_qp); +IB_UVERBS_DECLARE_CMD(open_qp); +IB_UVERBS_DECLARE_CMD(query_qp); +IB_UVERBS_DECLARE_CMD(modify_qp); +IB_UVERBS_DECLARE_CMD(destroy_qp); +IB_UVERBS_DECLARE_CMD(post_send); +IB_UVERBS_DECLARE_CMD(post_recv); +IB_UVERBS_DECLARE_CMD(post_srq_recv); +IB_UVERBS_DECLARE_CMD(create_ah); +IB_UVERBS_DECLARE_CMD(destroy_ah); +IB_UVERBS_DECLARE_CMD(attach_mcast); +IB_UVERBS_DECLARE_CMD(detach_mcast); +IB_UVERBS_DECLARE_CMD(create_srq); +IB_UVERBS_DECLARE_CMD(modify_srq); +IB_UVERBS_DECLARE_CMD(query_srq); +IB_UVERBS_DECLARE_CMD(destroy_srq); +IB_UVERBS_DECLARE_CMD(create_xsrq); +IB_UVERBS_DECLARE_CMD(open_xrcd); +IB_UVERBS_DECLARE_CMD(close_xrcd); + +#define IB_UVERBS_DECLARE_EX_CMD(name) \ + int ib_uverbs_ex_##name(struct ib_uverbs_file *file, \ + struct ib_udata *ucore, \ + struct ib_udata *uhw) + +IB_UVERBS_DECLARE_EX_CMD(create_flow); +IB_UVERBS_DECLARE_EX_CMD(destroy_flow); + +#endif /* UVERBS_H */ diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c new file mode 100644 index 0000000..5ba2a86 --- /dev/null +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -0,0 +1,3255 @@ +/* + * Copyright (c) 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005, 2006, 2007 Cisco Systems. All rights reserved. + * Copyright (c) 2005 PathScale, Inc. All rights reserved. + * Copyright (c) 2006 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include + +#include + +#include "uverbs.h" +#include "core_priv.h" + +struct uverbs_lock_class { + struct lock_class_key key; + char name[16]; +}; + +static struct uverbs_lock_class pd_lock_class = { .name = "PD-uobj" }; +static struct uverbs_lock_class mr_lock_class = { .name = "MR-uobj" }; +static struct uverbs_lock_class mw_lock_class = { .name = "MW-uobj" }; +static struct uverbs_lock_class cq_lock_class = { .name = "CQ-uobj" }; +static struct uverbs_lock_class qp_lock_class = { .name = "QP-uobj" }; +static struct uverbs_lock_class ah_lock_class = { .name = "AH-uobj" }; +static struct uverbs_lock_class srq_lock_class = { .name = "SRQ-uobj" }; +static struct uverbs_lock_class xrcd_lock_class = { .name = "XRCD-uobj" }; +static struct uverbs_lock_class rule_lock_class = { .name = "RULE-uobj" }; + +/* + * The ib_uobject locking scheme is as follows: + * + * - ib_uverbs_idr_lock protects the uverbs idrs themselves, so it + * needs to be held during all idr operations. When an object is + * looked up, a reference must be taken on the object's kref before + * dropping this lock. + * + * - Each object also has an rwsem. This rwsem must be held for + * reading while an operation that uses the object is performed. + * For example, while registering an MR, the associated PD's + * uobject.mutex must be held for reading. The rwsem must be held + * for writing while initializing or destroying an object. + * + * - In addition, each object has a "live" flag. If this flag is not + * set, then lookups of the object will fail even if it is found in + * the idr. This handles a reader that blocks and does not acquire + * the rwsem until after the object is destroyed. The destroy + * operation will set the live flag to 0 and then drop the rwsem; + * this will allow the reader to acquire the rwsem, see that the + * live flag is 0, and then drop the rwsem and its reference to + * object. The underlying storage will not be freed until the last + * reference to the object is dropped. + */ + +static void init_uobj(struct ib_uobject *uobj, u64 user_handle, + struct ib_ucontext *context, struct uverbs_lock_class *c) +{ + uobj->user_handle = user_handle; + uobj->context = context; + kref_init(&uobj->ref); + init_rwsem(&uobj->mutex); + lockdep_set_class_and_name(&uobj->mutex, &c->key, c->name); + uobj->live = 0; +} + +static void release_uobj(struct kref *kref) +{ + kfree(container_of(kref, struct ib_uobject, ref)); +} + +static void put_uobj(struct ib_uobject *uobj) +{ + kref_put(&uobj->ref, release_uobj); +} + +static void put_uobj_read(struct ib_uobject *uobj) +{ + up_read(&uobj->mutex); + put_uobj(uobj); +} + +static void put_uobj_write(struct ib_uobject *uobj) +{ + up_write(&uobj->mutex); + put_uobj(uobj); +} + +static int idr_add_uobj(struct idr *idr, struct ib_uobject *uobj) +{ + int ret; + + idr_preload(GFP_KERNEL); + spin_lock(&ib_uverbs_idr_lock); + + ret = idr_alloc(idr, uobj, 0, 0, GFP_NOWAIT); + if (ret >= 0) + uobj->id = ret; + + spin_unlock(&ib_uverbs_idr_lock); + idr_preload_end(); + + return ret < 0 ? ret : 0; +} + +void idr_remove_uobj(struct idr *idr, struct ib_uobject *uobj) +{ + spin_lock(&ib_uverbs_idr_lock); + idr_remove(idr, uobj->id); + spin_unlock(&ib_uverbs_idr_lock); +} + +static struct ib_uobject *__idr_get_uobj(struct idr *idr, int id, + struct ib_ucontext *context) +{ + struct ib_uobject *uobj; + + spin_lock(&ib_uverbs_idr_lock); + uobj = idr_find(idr, id); + if (uobj) { + if (uobj->context == context) + kref_get(&uobj->ref); + else + uobj = NULL; + } + spin_unlock(&ib_uverbs_idr_lock); + + return uobj; +} + +static struct ib_uobject *idr_read_uobj(struct idr *idr, int id, + struct ib_ucontext *context, int nested) +{ + struct ib_uobject *uobj; + + uobj = __idr_get_uobj(idr, id, context); + if (!uobj) + return NULL; + + if (nested) + down_read_nested(&uobj->mutex, SINGLE_DEPTH_NESTING); + else + down_read(&uobj->mutex); + if (!uobj->live) { + put_uobj_read(uobj); + return NULL; + } + + return uobj; +} + +static struct ib_uobject *idr_write_uobj(struct idr *idr, int id, + struct ib_ucontext *context) +{ + struct ib_uobject *uobj; + + uobj = __idr_get_uobj(idr, id, context); + if (!uobj) + return NULL; + + down_write(&uobj->mutex); + if (!uobj->live) { + put_uobj_write(uobj); + return NULL; + } + + return uobj; +} + +static void *idr_read_obj(struct idr *idr, int id, struct ib_ucontext *context, + int nested) +{ + struct ib_uobject *uobj; + + uobj = idr_read_uobj(idr, id, context, nested); + return uobj ? uobj->object : NULL; +} + +static struct ib_pd *idr_read_pd(int pd_handle, struct ib_ucontext *context) +{ + return idr_read_obj(&ib_uverbs_pd_idr, pd_handle, context, 0); +} + +static void put_pd_read(struct ib_pd *pd) +{ + put_uobj_read(pd->uobject); +} + +static struct ib_cq *idr_read_cq(int cq_handle, struct ib_ucontext *context, int nested) +{ + return idr_read_obj(&ib_uverbs_cq_idr, cq_handle, context, nested); +} + +static void put_cq_read(struct ib_cq *cq) +{ + put_uobj_read(cq->uobject); +} + +static struct ib_ah *idr_read_ah(int ah_handle, struct ib_ucontext *context) +{ + return idr_read_obj(&ib_uverbs_ah_idr, ah_handle, context, 0); +} + +static void put_ah_read(struct ib_ah *ah) +{ + put_uobj_read(ah->uobject); +} + +static struct ib_qp *idr_read_qp(int qp_handle, struct ib_ucontext *context) +{ + return idr_read_obj(&ib_uverbs_qp_idr, qp_handle, context, 0); +} + +static struct ib_qp *idr_write_qp(int qp_handle, struct ib_ucontext *context) +{ + struct ib_uobject *uobj; + + uobj = idr_write_uobj(&ib_uverbs_qp_idr, qp_handle, context); + return uobj ? uobj->object : NULL; +} + +static void put_qp_read(struct ib_qp *qp) +{ + put_uobj_read(qp->uobject); +} + +static void put_qp_write(struct ib_qp *qp) +{ + put_uobj_write(qp->uobject); +} + +static struct ib_srq *idr_read_srq(int srq_handle, struct ib_ucontext *context) +{ + return idr_read_obj(&ib_uverbs_srq_idr, srq_handle, context, 0); +} + +static void put_srq_read(struct ib_srq *srq) +{ + put_uobj_read(srq->uobject); +} + +static struct ib_xrcd *idr_read_xrcd(int xrcd_handle, struct ib_ucontext *context, + struct ib_uobject **uobj) +{ + *uobj = idr_read_uobj(&ib_uverbs_xrcd_idr, xrcd_handle, context, 0); + return *uobj ? (*uobj)->object : NULL; +} + +static void put_xrcd_read(struct ib_uobject *uobj) +{ + put_uobj_read(uobj); +} + +ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file, + const char __user *buf, + int in_len, int out_len) +{ + struct ib_uverbs_get_context cmd; + struct ib_uverbs_get_context_resp resp; + struct ib_udata udata; + struct ib_device *ibdev = file->device->ib_dev; + struct ib_ucontext *ucontext; + struct file *filp; + int ret; + + if (out_len < sizeof resp) + return -ENOSPC; + + if (copy_from_user(&cmd, buf, sizeof cmd)) + return -EFAULT; + + mutex_lock(&file->mutex); + + if (file->ucontext) { + ret = -EINVAL; + goto err; + } + + INIT_UDATA(&udata, buf + sizeof cmd, + (unsigned long) cmd.response + sizeof resp, + in_len - sizeof cmd, out_len - sizeof resp); + + ucontext = ibdev->alloc_ucontext(ibdev, &udata); + if (IS_ERR(ucontext)) { + ret = PTR_ERR(ucontext); + goto err; + } + + ucontext->device = ibdev; + INIT_LIST_HEAD(&ucontext->pd_list); + INIT_LIST_HEAD(&ucontext->mr_list); + INIT_LIST_HEAD(&ucontext->mw_list); + INIT_LIST_HEAD(&ucontext->cq_list); + INIT_LIST_HEAD(&ucontext->qp_list); + INIT_LIST_HEAD(&ucontext->srq_list); + INIT_LIST_HEAD(&ucontext->ah_list); + INIT_LIST_HEAD(&ucontext->xrcd_list); + INIT_LIST_HEAD(&ucontext->rule_list); + ucontext->closing = 0; + + resp.num_comp_vectors = file->device->num_comp_vectors; + + ret = get_unused_fd_flags(O_CLOEXEC); + if (ret < 0) + goto err_free; + resp.async_fd = ret; + + filp = ib_uverbs_alloc_event_file(file, 1); + if (IS_ERR(filp)) { + ret = PTR_ERR(filp); + goto err_fd; + } + + if (copy_to_user((void __user *) (unsigned long) cmd.response, + &resp, sizeof resp)) { + ret = -EFAULT; + goto err_file; + } + + file->async_file = filp->private_data; + + INIT_IB_EVENT_HANDLER(&file->event_handler, file->device->ib_dev, + ib_uverbs_event_handler); + ret = ib_register_event_handler(&file->event_handler); + if (ret) + goto err_file; + + kref_get(&file->async_file->ref); + kref_get(&file->ref); + file->ucontext = ucontext; + + fd_install(resp.async_fd, filp); + + mutex_unlock(&file->mutex); + + return in_len; + +err_file: + fput(filp); + +err_fd: + put_unused_fd(resp.async_fd); + +err_free: + ibdev->dealloc_ucontext(ucontext); + +err: + mutex_unlock(&file->mutex); + return ret; +} + +ssize_t ib_uverbs_query_device(struct ib_uverbs_file *file, + const char __user *buf, + int in_len, int out_len) +{ + struct ib_uverbs_query_device cmd; + struct ib_uverbs_query_device_resp resp; + struct ib_device_attr attr; + int ret; + + if (out_len < sizeof resp) + return -ENOSPC; + + if (copy_from_user(&cmd, buf, sizeof cmd)) + return -EFAULT; + + ret = ib_query_device(file->device->ib_dev, &attr); + if (ret) + return ret; + + memset(&resp, 0, sizeof resp); + + resp.fw_ver = attr.fw_ver; + resp.node_guid = file->device->ib_dev->node_guid; + resp.sys_image_guid = attr.sys_image_guid; + resp.max_mr_size = attr.max_mr_size; + resp.page_size_cap = attr.page_size_cap; + resp.vendor_id = attr.vendor_id; + resp.vendor_part_id = attr.vendor_part_id; + resp.hw_ver = attr.hw_ver; + resp.max_qp = attr.max_qp; + resp.max_qp_wr = attr.max_qp_wr; + resp.device_cap_flags = attr.device_cap_flags; + resp.max_sge = attr.max_sge; + resp.max_sge_rd = attr.max_sge_rd; + resp.max_cq = attr.max_cq; + resp.max_cqe = attr.max_cqe; + resp.max_mr = attr.max_mr; + resp.max_pd = attr.max_pd; + resp.max_qp_rd_atom = attr.max_qp_rd_atom; + resp.max_ee_rd_atom = attr.max_ee_rd_atom; + resp.max_res_rd_atom = attr.max_res_rd_atom; + resp.max_qp_init_rd_atom = attr.max_qp_init_rd_atom; + resp.max_ee_init_rd_atom = attr.max_ee_init_rd_atom; + resp.atomic_cap = attr.atomic_cap; + resp.max_ee = attr.max_ee; + resp.max_rdd = attr.max_rdd; + resp.max_mw = attr.max_mw; + resp.max_raw_ipv6_qp = attr.max_raw_ipv6_qp; + resp.max_raw_ethy_qp = attr.max_raw_ethy_qp; + resp.max_mcast_grp = attr.max_mcast_grp; + resp.max_mcast_qp_attach = attr.max_mcast_qp_attach; + resp.max_total_mcast_qp_attach = attr.max_total_mcast_qp_attach; + resp.max_ah = attr.max_ah; + resp.max_fmr = attr.max_fmr; + resp.max_map_per_fmr = attr.max_map_per_fmr; + resp.max_srq = attr.max_srq; + resp.max_srq_wr = attr.max_srq_wr; + resp.max_srq_sge = attr.max_srq_sge; + resp.max_pkeys = attr.max_pkeys; + resp.local_ca_ack_delay = attr.local_ca_ack_delay; + resp.phys_port_cnt = file->device->ib_dev->phys_port_cnt; + + if (copy_to_user((void __user *) (unsigned long) cmd.response, + &resp, sizeof resp)) + return -EFAULT; + + return in_len; +} + +ssize_t ib_uverbs_query_port(struct ib_uverbs_file *file, + const char __user *buf, + int in_len, int out_len) +{ + struct ib_uverbs_query_port cmd; + struct ib_uverbs_query_port_resp resp; + struct ib_port_attr attr; + int ret; + + if (out_len < sizeof resp) + return -ENOSPC; + + if (copy_from_user(&cmd, buf, sizeof cmd)) + return -EFAULT; + + ret = ib_query_port(file->device->ib_dev, cmd.port_num, &attr); + if (ret) + return ret; + + memset(&resp, 0, sizeof resp); + + resp.state = attr.state; + resp.max_mtu = attr.max_mtu; + resp.active_mtu = attr.active_mtu; + resp.gid_tbl_len = attr.gid_tbl_len; + resp.port_cap_flags = attr.port_cap_flags; + resp.max_msg_sz = attr.max_msg_sz; + resp.bad_pkey_cntr = attr.bad_pkey_cntr; + resp.qkey_viol_cntr = attr.qkey_viol_cntr; + resp.pkey_tbl_len = attr.pkey_tbl_len; + resp.lid = attr.lid; + resp.sm_lid = attr.sm_lid; + resp.lmc = attr.lmc; + resp.max_vl_num = attr.max_vl_num; + resp.sm_sl = attr.sm_sl; + resp.subnet_timeout = attr.subnet_timeout; + resp.init_type_reply = attr.init_type_reply; + resp.active_width = attr.active_width; + resp.active_speed = attr.active_speed; + resp.phys_state = attr.phys_state; + resp.link_layer = rdma_port_get_link_layer(file->device->ib_dev, + cmd.port_num); + + if (copy_to_user((void __user *) (unsigned long) cmd.response, + &resp, sizeof resp)) + return -EFAULT; + + return in_len; +} + +ssize_t ib_uverbs_alloc_pd(struct ib_uverbs_file *file, + const char __user *buf, + int in_len, int out_len) +{ + struct ib_uverbs_alloc_pd cmd; + struct ib_uverbs_alloc_pd_resp resp; + struct ib_udata udata; + struct ib_uobject *uobj; + struct ib_pd *pd; + int ret; + + if (out_len < sizeof resp) + return -ENOSPC; + + if (copy_from_user(&cmd, buf, sizeof cmd)) + return -EFAULT; + + INIT_UDATA(&udata, buf + sizeof cmd, + (unsigned long) cmd.response + sizeof resp, + in_len - sizeof cmd, out_len - sizeof resp); + + uobj = kmalloc(sizeof *uobj, GFP_KERNEL); + if (!uobj) + return -ENOMEM; + + init_uobj(uobj, 0, file->ucontext, &pd_lock_class); + down_write(&uobj->mutex); + + pd = file->device->ib_dev->alloc_pd(file->device->ib_dev, + file->ucontext, &udata); + if (IS_ERR(pd)) { + ret = PTR_ERR(pd); + goto err; + } + + pd->device = file->device->ib_dev; + pd->uobject = uobj; + atomic_set(&pd->usecnt, 0); + + uobj->object = pd; + ret = idr_add_uobj(&ib_uverbs_pd_idr, uobj); + if (ret) + goto err_idr; + + memset(&resp, 0, sizeof resp); + resp.pd_handle = uobj->id; + + if (copy_to_user((void __user *) (unsigned long) cmd.response, + &resp, sizeof resp)) { + ret = -EFAULT; + goto err_copy; + } + + mutex_lock(&file->mutex); + list_add_tail(&uobj->list, &file->ucontext->pd_list); + mutex_unlock(&file->mutex); + + uobj->live = 1; + + up_write(&uobj->mutex); + + return in_len; + +err_copy: + idr_remove_uobj(&ib_uverbs_pd_idr, uobj); + +err_idr: + ib_dealloc_pd(pd); + +err: + put_uobj_write(uobj); + return ret; +} + +ssize_t ib_uverbs_dealloc_pd(struct ib_uverbs_file *file, + const char __user *buf, + int in_len, int out_len) +{ + struct ib_uverbs_dealloc_pd cmd; + struct ib_uobject *uobj; + int ret; + + if (copy_from_user(&cmd, buf, sizeof cmd)) + return -EFAULT; + + uobj = idr_write_uobj(&ib_uverbs_pd_idr, cmd.pd_handle, file->ucontext); + if (!uobj) + return -EINVAL; + + ret = ib_dealloc_pd(uobj->object); + if (!ret) + uobj->live = 0; + + put_uobj_write(uobj); + + if (ret) + return ret; + + idr_remove_uobj(&ib_uverbs_pd_idr, uobj); + + mutex_lock(&file->mutex); + list_del(&uobj->list); + mutex_unlock(&file->mutex); + + put_uobj(uobj); + + return in_len; +} + +struct xrcd_table_entry { + struct rb_node node; + struct ib_xrcd *xrcd; + struct inode *inode; +}; + +static int xrcd_table_insert(struct ib_uverbs_device *dev, + struct inode *inode, + struct ib_xrcd *xrcd) +{ + struct xrcd_table_entry *entry, *scan; + struct rb_node **p = &dev->xrcd_tree.rb_node; + struct rb_node *parent = NULL; + + entry = kmalloc(sizeof *entry, GFP_KERNEL); + if (!entry) + return -ENOMEM; + + entry->xrcd = xrcd; + entry->inode = inode; + + while (*p) { + parent = *p; + scan = rb_entry(parent, struct xrcd_table_entry, node); + + if (inode < scan->inode) { + p = &(*p)->rb_left; + } else if (inode > scan->inode) { + p = &(*p)->rb_right; + } else { + kfree(entry); + return -EEXIST; + } + } + + rb_link_node(&entry->node, parent, p); + rb_insert_color(&entry->node, &dev->xrcd_tree); + igrab(inode); + return 0; +} + +static struct xrcd_table_entry *xrcd_table_search(struct ib_uverbs_device *dev, + struct inode *inode) +{ + struct xrcd_table_entry *entry; + struct rb_node *p = dev->xrcd_tree.rb_node; + + while (p) { + entry = rb_entry(p, struct xrcd_table_entry, node); + + if (inode < entry->inode) + p = p->rb_left; + else if (inode > entry->inode) + p = p->rb_right; + else + return entry; + } + + return NULL; +} + +static struct ib_xrcd *find_xrcd(struct ib_uverbs_device *dev, struct inode *inode) +{ + struct xrcd_table_entry *entry; + + entry = xrcd_table_search(dev, inode); + if (!entry) + return NULL; + + return entry->xrcd; +} + +static void xrcd_table_delete(struct ib_uverbs_device *dev, + struct inode *inode) +{ + struct xrcd_table_entry *entry; + + entry = xrcd_table_search(dev, inode); + if (entry) { + iput(inode); + rb_erase(&entry->node, &dev->xrcd_tree); + kfree(entry); + } +} + +ssize_t ib_uverbs_open_xrcd(struct ib_uverbs_file *file, + const char __user *buf, int in_len, + int out_len) +{ + struct ib_uverbs_open_xrcd cmd; + struct ib_uverbs_open_xrcd_resp resp; + struct ib_udata udata; + struct ib_uxrcd_object *obj; + struct ib_xrcd *xrcd = NULL; + struct fd f = {NULL, 0}; + struct inode *inode = NULL; + int ret = 0; + int new_xrcd = 0; + + if (out_len < sizeof resp) + return -ENOSPC; + + if (copy_from_user(&cmd, buf, sizeof cmd)) + return -EFAULT; + + INIT_UDATA(&udata, buf + sizeof cmd, + (unsigned long) cmd.response + sizeof resp, + in_len - sizeof cmd, out_len - sizeof resp); + + mutex_lock(&file->device->xrcd_tree_mutex); + + if (cmd.fd != -1) { + /* search for file descriptor */ + f = fdget(cmd.fd); + if (!f.file) { + ret = -EBADF; + goto err_tree_mutex_unlock; + } + + inode = file_inode(f.file); + xrcd = find_xrcd(file->device, inode); + if (!xrcd && !(cmd.oflags & O_CREAT)) { + /* no file descriptor. Need CREATE flag */ + ret = -EAGAIN; + goto err_tree_mutex_unlock; + } + + if (xrcd && cmd.oflags & O_EXCL) { + ret = -EINVAL; + goto err_tree_mutex_unlock; + } + } + + obj = kmalloc(sizeof *obj, GFP_KERNEL); + if (!obj) { + ret = -ENOMEM; + goto err_tree_mutex_unlock; + } + + init_uobj(&obj->uobject, 0, file->ucontext, &xrcd_lock_class); + + down_write(&obj->uobject.mutex); + + if (!xrcd) { + xrcd = file->device->ib_dev->alloc_xrcd(file->device->ib_dev, + file->ucontext, &udata); + if (IS_ERR(xrcd)) { + ret = PTR_ERR(xrcd); + goto err; + } + + xrcd->inode = inode; + xrcd->device = file->device->ib_dev; + atomic_set(&xrcd->usecnt, 0); + mutex_init(&xrcd->tgt_qp_mutex); + INIT_LIST_HEAD(&xrcd->tgt_qp_list); + new_xrcd = 1; + } + + atomic_set(&obj->refcnt, 0); + obj->uobject.object = xrcd; + ret = idr_add_uobj(&ib_uverbs_xrcd_idr, &obj->uobject); + if (ret) + goto err_idr; + + memset(&resp, 0, sizeof resp); + resp.xrcd_handle = obj->uobject.id; + + if (inode) { + if (new_xrcd) { + /* create new inode/xrcd table entry */ + ret = xrcd_table_insert(file->device, inode, xrcd); + if (ret) + goto err_insert_xrcd; + } + atomic_inc(&xrcd->usecnt); + } + + if (copy_to_user((void __user *) (unsigned long) cmd.response, + &resp, sizeof resp)) { + ret = -EFAULT; + goto err_copy; + } + + if (f.file) + fdput(f); + + mutex_lock(&file->mutex); + list_add_tail(&obj->uobject.list, &file->ucontext->xrcd_list); + mutex_unlock(&file->mutex); + + obj->uobject.live = 1; + up_write(&obj->uobject.mutex); + + mutex_unlock(&file->device->xrcd_tree_mutex); + return in_len; + +err_copy: + if (inode) { + if (new_xrcd) + xrcd_table_delete(file->device, inode); + atomic_dec(&xrcd->usecnt); + } + +err_insert_xrcd: + idr_remove_uobj(&ib_uverbs_xrcd_idr, &obj->uobject); + +err_idr: + ib_dealloc_xrcd(xrcd); + +err: + put_uobj_write(&obj->uobject); + +err_tree_mutex_unlock: + if (f.file) + fdput(f); + + mutex_unlock(&file->device->xrcd_tree_mutex); + + return ret; +} + +ssize_t ib_uverbs_close_xrcd(struct ib_uverbs_file *file, + const char __user *buf, int in_len, + int out_len) +{ + struct ib_uverbs_close_xrcd cmd; + struct ib_uobject *uobj; + struct ib_xrcd *xrcd = NULL; + struct inode *inode = NULL; + struct ib_uxrcd_object *obj; + int live; + int ret = 0; + + if (copy_from_user(&cmd, buf, sizeof cmd)) + return -EFAULT; + + mutex_lock(&file->device->xrcd_tree_mutex); + uobj = idr_write_uobj(&ib_uverbs_xrcd_idr, cmd.xrcd_handle, file->ucontext); + if (!uobj) { + ret = -EINVAL; + goto out; + } + + xrcd = uobj->object; + inode = xrcd->inode; + obj = container_of(uobj, struct ib_uxrcd_object, uobject); + if (atomic_read(&obj->refcnt)) { + put_uobj_write(uobj); + ret = -EBUSY; + goto out; + } + + if (!inode || atomic_dec_and_test(&xrcd->usecnt)) { + ret = ib_dealloc_xrcd(uobj->object); + if (!ret) + uobj->live = 0; + } + + live = uobj->live; + if (inode && ret) + atomic_inc(&xrcd->usecnt); + + put_uobj_write(uobj); + + if (ret) + goto out; + + if (inode && !live) + xrcd_table_delete(file->device, inode); + + idr_remove_uobj(&ib_uverbs_xrcd_idr, uobj); + mutex_lock(&file->mutex); + list_del(&uobj->list); + mutex_unlock(&file->mutex); + + put_uobj(uobj); + ret = in_len; + +out: + mutex_unlock(&file->device->xrcd_tree_mutex); + return ret; +} + +void ib_uverbs_dealloc_xrcd(struct ib_uverbs_device *dev, + struct ib_xrcd *xrcd) +{ + struct inode *inode; + + inode = xrcd->inode; + if (inode && !atomic_dec_and_test(&xrcd->usecnt)) + return; + + ib_dealloc_xrcd(xrcd); + + if (inode) + xrcd_table_delete(dev, inode); +} + +ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file, + const char __user *buf, int in_len, + int out_len) +{ + struct ib_uverbs_reg_mr cmd; + struct ib_uverbs_reg_mr_resp resp; + struct ib_udata udata; + struct ib_uobject *uobj; + struct ib_pd *pd; + struct ib_mr *mr; + int ret; + + if (out_len < sizeof resp) + return -ENOSPC; + + if (copy_from_user(&cmd, buf, sizeof cmd)) + return -EFAULT; + + INIT_UDATA(&udata, buf + sizeof cmd, + (unsigned long) cmd.response + sizeof resp, + in_len - sizeof cmd, out_len - sizeof resp); + + if ((cmd.start & ~PAGE_MASK) != (cmd.hca_va & ~PAGE_MASK)) + return -EINVAL; + + ret = ib_check_mr_access(cmd.access_flags); + if (ret) + return ret; + + uobj = kmalloc(sizeof *uobj, GFP_KERNEL); + if (!uobj) + return -ENOMEM; + + init_uobj(uobj, 0, file->ucontext, &mr_lock_class); + down_write(&uobj->mutex); + + pd = idr_read_pd(cmd.pd_handle, file->ucontext); + if (!pd) { + ret = -EINVAL; + goto err_free; + } + + mr = pd->device->reg_user_mr(pd, cmd.start, cmd.length, cmd.hca_va, + cmd.access_flags, &udata); + if (IS_ERR(mr)) { + ret = PTR_ERR(mr); + goto err_put; + } + + mr->device = pd->device; + mr->pd = pd; + mr->uobject = uobj; + atomic_inc(&pd->usecnt); + atomic_set(&mr->usecnt, 0); + + uobj->object = mr; + ret = idr_add_uobj(&ib_uverbs_mr_idr, uobj); + if (ret) + goto err_unreg; + + memset(&resp, 0, sizeof resp); + resp.lkey = mr->lkey; + resp.rkey = mr->rkey; + resp.mr_handle = uobj->id; + + if (copy_to_user((void __user *) (unsigned long) cmd.response, + &resp, sizeof resp)) { + ret = -EFAULT; + goto err_copy; + } + + put_pd_read(pd); + + mutex_lock(&file->mutex); + list_add_tail(&uobj->list, &file->ucontext->mr_list); + mutex_unlock(&file->mutex); + + uobj->live = 1; + + up_write(&uobj->mutex); + + return in_len; + +err_copy: + idr_remove_uobj(&ib_uverbs_mr_idr, uobj); + +err_unreg: + ib_dereg_mr(mr); + +err_put: + put_pd_read(pd); + +err_free: + put_uobj_write(uobj); + return ret; +} + +ssize_t ib_uverbs_rereg_mr(struct ib_uverbs_file *file, + const char __user *buf, int in_len, + int out_len) +{ + struct ib_uverbs_rereg_mr cmd; + struct ib_uverbs_rereg_mr_resp resp; + struct ib_udata udata; + struct ib_pd *pd = NULL; + struct ib_mr *mr; + struct ib_pd *old_pd; + int ret; + struct ib_uobject *uobj; + + if (out_len < sizeof(resp)) + return -ENOSPC; + + if (copy_from_user(&cmd, buf, sizeof(cmd))) + return -EFAULT; + + INIT_UDATA(&udata, buf + sizeof(cmd), + (unsigned long) cmd.response + sizeof(resp), + in_len - sizeof(cmd), out_len - sizeof(resp)); + + if (cmd.flags & ~IB_MR_REREG_SUPPORTED || !cmd.flags) + return -EINVAL; + + if ((cmd.flags & IB_MR_REREG_TRANS) && + (!cmd.start || !cmd.hca_va || 0 >= cmd.length || + (cmd.start & ~PAGE_MASK) != (cmd.hca_va & ~PAGE_MASK))) + return -EINVAL; + + uobj = idr_write_uobj(&ib_uverbs_mr_idr, cmd.mr_handle, + file->ucontext); + + if (!uobj) + return -EINVAL; + + mr = uobj->object; + + if (cmd.flags & IB_MR_REREG_ACCESS) { + ret = ib_check_mr_access(cmd.access_flags); + if (ret) + goto put_uobjs; + } + + if (cmd.flags & IB_MR_REREG_PD) { + pd = idr_read_pd(cmd.pd_handle, file->ucontext); + if (!pd) { + ret = -EINVAL; + goto put_uobjs; + } + } + + if (atomic_read(&mr->usecnt)) { + ret = -EBUSY; + goto put_uobj_pd; + } + + old_pd = mr->pd; + ret = mr->device->rereg_user_mr(mr, cmd.flags, cmd.start, + cmd.length, cmd.hca_va, + cmd.access_flags, pd, &udata); + if (!ret) { + if (cmd.flags & IB_MR_REREG_PD) { + atomic_inc(&pd->usecnt); + mr->pd = pd; + atomic_dec(&old_pd->usecnt); + } + } else { + goto put_uobj_pd; + } + + memset(&resp, 0, sizeof(resp)); + resp.lkey = mr->lkey; + resp.rkey = mr->rkey; + + if (copy_to_user((void __user *)(unsigned long)cmd.response, + &resp, sizeof(resp))) + ret = -EFAULT; + else + ret = in_len; + +put_uobj_pd: + if (cmd.flags & IB_MR_REREG_PD) + put_pd_read(pd); + +put_uobjs: + + put_uobj_write(mr->uobject); + + return ret; +} + +ssize_t ib_uverbs_dereg_mr(struct ib_uverbs_file *file, + const char __user *buf, int in_len, + int out_len) +{ + struct ib_uverbs_dereg_mr cmd; + struct ib_mr *mr; + struct ib_uobject *uobj; + int ret = -EINVAL; + + if (copy_from_user(&cmd, buf, sizeof cmd)) + return -EFAULT; + + uobj = idr_write_uobj(&ib_uverbs_mr_idr, cmd.mr_handle, file->ucontext); + if (!uobj) + return -EINVAL; + + mr = uobj->object; + + ret = ib_dereg_mr(mr); + if (!ret) + uobj->live = 0; + + put_uobj_write(uobj); + + if (ret) + return ret; + + idr_remove_uobj(&ib_uverbs_mr_idr, uobj); + + mutex_lock(&file->mutex); + list_del(&uobj->list); + mutex_unlock(&file->mutex); + + put_uobj(uobj); + + return in_len; +} + +ssize_t ib_uverbs_alloc_mw(struct ib_uverbs_file *file, + const char __user *buf, int in_len, + int out_len) +{ + struct ib_uverbs_alloc_mw cmd; + struct ib_uverbs_alloc_mw_resp resp; + struct ib_uobject *uobj; + struct ib_pd *pd; + struct ib_mw *mw; + int ret; + + if (out_len < sizeof(resp)) + return -ENOSPC; + + if (copy_from_user(&cmd, buf, sizeof(cmd))) + return -EFAULT; + + uobj = kmalloc(sizeof(*uobj), GFP_KERNEL); + if (!uobj) + return -ENOMEM; + + init_uobj(uobj, 0, file->ucontext, &mw_lock_class); + down_write(&uobj->mutex); + + pd = idr_read_pd(cmd.pd_handle, file->ucontext); + if (!pd) { + ret = -EINVAL; + goto err_free; + } + + mw = pd->device->alloc_mw(pd, cmd.mw_type); + if (IS_ERR(mw)) { + ret = PTR_ERR(mw); + goto err_put; + } + + mw->device = pd->device; + mw->pd = pd; + mw->uobject = uobj; + atomic_inc(&pd->usecnt); + + uobj->object = mw; + ret = idr_add_uobj(&ib_uverbs_mw_idr, uobj); + if (ret) + goto err_unalloc; + + memset(&resp, 0, sizeof(resp)); + resp.rkey = mw->rkey; + resp.mw_handle = uobj->id; + + if (copy_to_user((void __user *)(unsigned long)cmd.response, + &resp, sizeof(resp))) { + ret = -EFAULT; + goto err_copy; + } + + put_pd_read(pd); + + mutex_lock(&file->mutex); + list_add_tail(&uobj->list, &file->ucontext->mw_list); + mutex_unlock(&file->mutex); + + uobj->live = 1; + + up_write(&uobj->mutex); + + return in_len; + +err_copy: + idr_remove_uobj(&ib_uverbs_mw_idr, uobj); + +err_unalloc: + ib_dealloc_mw(mw); + +err_put: + put_pd_read(pd); + +err_free: + put_uobj_write(uobj); + return ret; +} + +ssize_t ib_uverbs_dealloc_mw(struct ib_uverbs_file *file, + const char __user *buf, int in_len, + int out_len) +{ + struct ib_uverbs_dealloc_mw cmd; + struct ib_mw *mw; + struct ib_uobject *uobj; + int ret = -EINVAL; + + if (copy_from_user(&cmd, buf, sizeof(cmd))) + return -EFAULT; + + uobj = idr_write_uobj(&ib_uverbs_mw_idr, cmd.mw_handle, file->ucontext); + if (!uobj) + return -EINVAL; + + mw = uobj->object; + + ret = ib_dealloc_mw(mw); + if (!ret) + uobj->live = 0; + + put_uobj_write(uobj); + + if (ret) + return ret; + + idr_remove_uobj(&ib_uverbs_mw_idr, uobj); + + mutex_lock(&file->mutex); + list_del(&uobj->list); + mutex_unlock(&file->mutex); + + put_uobj(uobj); + + return in_len; +} + +ssize_t ib_uverbs_create_comp_channel(struct ib_uverbs_file *file, + const char __user *buf, int in_len, + int out_len) +{ + struct ib_uverbs_create_comp_channel cmd; + struct ib_uverbs_create_comp_channel_resp resp; + struct file *filp; + int ret; + + if (out_len < sizeof resp) + return -ENOSPC; + + if (copy_from_user(&cmd, buf, sizeof cmd)) + return -EFAULT; + + ret = get_unused_fd_flags(O_CLOEXEC); + if (ret < 0) + return ret; + resp.fd = ret; + + filp = ib_uverbs_alloc_event_file(file, 0); + if (IS_ERR(filp)) { + put_unused_fd(resp.fd); + return PTR_ERR(filp); + } + + if (copy_to_user((void __user *) (unsigned long) cmd.response, + &resp, sizeof resp)) { + put_unused_fd(resp.fd); + fput(filp); + return -EFAULT; + } + + fd_install(resp.fd, filp); + return in_len; +} + +ssize_t ib_uverbs_create_cq(struct ib_uverbs_file *file, + const char __user *buf, int in_len, + int out_len) +{ + struct ib_uverbs_create_cq cmd; + struct ib_uverbs_create_cq_resp resp; + struct ib_udata udata; + struct ib_ucq_object *obj; + struct ib_uverbs_event_file *ev_file = NULL; + struct ib_cq *cq; + int ret; + + if (out_len < sizeof resp) + return -ENOSPC; + + if (copy_from_user(&cmd, buf, sizeof cmd)) + return -EFAULT; + + INIT_UDATA(&udata, buf + sizeof cmd, + (unsigned long) cmd.response + sizeof resp, + in_len - sizeof cmd, out_len - sizeof resp); + + if (cmd.comp_vector >= file->device->num_comp_vectors) + return -EINVAL; + + obj = kmalloc(sizeof *obj, GFP_KERNEL); + if (!obj) + return -ENOMEM; + + init_uobj(&obj->uobject, cmd.user_handle, file->ucontext, &cq_lock_class); + down_write(&obj->uobject.mutex); + + if (cmd.comp_channel >= 0) { + ev_file = ib_uverbs_lookup_comp_file(cmd.comp_channel); + if (!ev_file) { + ret = -EINVAL; + goto err; + } + } + + obj->uverbs_file = file; + obj->comp_events_reported = 0; + obj->async_events_reported = 0; + INIT_LIST_HEAD(&obj->comp_list); + INIT_LIST_HEAD(&obj->async_list); + + cq = file->device->ib_dev->create_cq(file->device->ib_dev, cmd.cqe, + cmd.comp_vector, + file->ucontext, &udata); + if (IS_ERR(cq)) { + ret = PTR_ERR(cq); + goto err_file; + } + + cq->device = file->device->ib_dev; + cq->uobject = &obj->uobject; + cq->comp_handler = ib_uverbs_comp_handler; + cq->event_handler = ib_uverbs_cq_event_handler; + cq->cq_context = ev_file; + atomic_set(&cq->usecnt, 0); + + obj->uobject.object = cq; + ret = idr_add_uobj(&ib_uverbs_cq_idr, &obj->uobject); + if (ret) + goto err_free; + + memset(&resp, 0, sizeof resp); + resp.cq_handle = obj->uobject.id; + resp.cqe = cq->cqe; + + if (copy_to_user((void __user *) (unsigned long) cmd.response, + &resp, sizeof resp)) { + ret = -EFAULT; + goto err_copy; + } + + mutex_lock(&file->mutex); + list_add_tail(&obj->uobject.list, &file->ucontext->cq_list); + mutex_unlock(&file->mutex); + + obj->uobject.live = 1; + + up_write(&obj->uobject.mutex); + + return in_len; + +err_copy: + idr_remove_uobj(&ib_uverbs_cq_idr, &obj->uobject); + +err_free: + ib_destroy_cq(cq); + +err_file: + if (ev_file) + ib_uverbs_release_ucq(file, ev_file, obj); + +err: + put_uobj_write(&obj->uobject); + return ret; +} + +ssize_t ib_uverbs_resize_cq(struct ib_uverbs_file *file, + const char __user *buf, int in_len, + int out_len) +{ + struct ib_uverbs_resize_cq cmd; + struct ib_uverbs_resize_cq_resp resp; + struct ib_udata udata; + struct ib_cq *cq; + int ret = -EINVAL; + + if (copy_from_user(&cmd, buf, sizeof cmd)) + return -EFAULT; + + INIT_UDATA(&udata, buf + sizeof cmd, + (unsigned long) cmd.response + sizeof resp, + in_len - sizeof cmd, out_len - sizeof resp); + + cq = idr_read_cq(cmd.cq_handle, file->ucontext, 0); + if (!cq) + return -EINVAL; + + ret = cq->device->resize_cq(cq, cmd.cqe, &udata); + if (ret) + goto out; + + resp.cqe = cq->cqe; + + if (copy_to_user((void __user *) (unsigned long) cmd.response, + &resp, sizeof resp.cqe)) + ret = -EFAULT; + +out: + put_cq_read(cq); + + return ret ? ret : in_len; +} + +static int copy_wc_to_user(void __user *dest, struct ib_wc *wc) +{ + struct ib_uverbs_wc tmp; + + tmp.wr_id = wc->wr_id; + tmp.status = wc->status; + tmp.opcode = wc->opcode; + tmp.vendor_err = wc->vendor_err; + tmp.byte_len = wc->byte_len; + tmp.ex.imm_data = (__u32 __force) wc->ex.imm_data; + tmp.qp_num = wc->qp->qp_num; + tmp.src_qp = wc->src_qp; + tmp.wc_flags = wc->wc_flags; + tmp.pkey_index = wc->pkey_index; + tmp.slid = wc->slid; + tmp.sl = wc->sl; + tmp.dlid_path_bits = wc->dlid_path_bits; + tmp.port_num = wc->port_num; + tmp.reserved = 0; + + if (copy_to_user(dest, &tmp, sizeof tmp)) + return -EFAULT; + + return 0; +} + +ssize_t ib_uverbs_poll_cq(struct ib_uverbs_file *file, + const char __user *buf, int in_len, + int out_len) +{ + struct ib_uverbs_poll_cq cmd; + struct ib_uverbs_poll_cq_resp resp; + u8 __user *header_ptr; + u8 __user *data_ptr; + struct ib_cq *cq; + struct ib_wc wc; + int ret; + + if (copy_from_user(&cmd, buf, sizeof cmd)) + return -EFAULT; + + cq = idr_read_cq(cmd.cq_handle, file->ucontext, 0); + if (!cq) + return -EINVAL; + + /* we copy a struct ib_uverbs_poll_cq_resp to user space */ + header_ptr = (void __user *)(unsigned long) cmd.response; + data_ptr = header_ptr + sizeof resp; + + memset(&resp, 0, sizeof resp); + while (resp.count < cmd.ne) { + ret = ib_poll_cq(cq, 1, &wc); + if (ret < 0) + goto out_put; + if (!ret) + break; + + ret = copy_wc_to_user(data_ptr, &wc); + if (ret) + goto out_put; + + data_ptr += sizeof(struct ib_uverbs_wc); + ++resp.count; + } + + if (copy_to_user(header_ptr, &resp, sizeof resp)) { + ret = -EFAULT; + goto out_put; + } + + ret = in_len; + +out_put: + put_cq_read(cq); + return ret; +} + +ssize_t ib_uverbs_req_notify_cq(struct ib_uverbs_file *file, + const char __user *buf, int in_len, + int out_len) +{ + struct ib_uverbs_req_notify_cq cmd; + struct ib_cq *cq; + + if (copy_from_user(&cmd, buf, sizeof cmd)) + return -EFAULT; + + cq = idr_read_cq(cmd.cq_handle, file->ucontext, 0); + if (!cq) + return -EINVAL; + + ib_req_notify_cq(cq, cmd.solicited_only ? + IB_CQ_SOLICITED : IB_CQ_NEXT_COMP); + + put_cq_read(cq); + + return in_len; +} + +ssize_t ib_uverbs_destroy_cq(struct ib_uverbs_file *file, + const char __user *buf, int in_len, + int out_len) +{ + struct ib_uverbs_destroy_cq cmd; + struct ib_uverbs_destroy_cq_resp resp; + struct ib_uobject *uobj; + struct ib_cq *cq; + struct ib_ucq_object *obj; + struct ib_uverbs_event_file *ev_file; + int ret = -EINVAL; + + if (copy_from_user(&cmd, buf, sizeof cmd)) + return -EFAULT; + + uobj = idr_write_uobj(&ib_uverbs_cq_idr, cmd.cq_handle, file->ucontext); + if (!uobj) + return -EINVAL; + cq = uobj->object; + ev_file = cq->cq_context; + obj = container_of(cq->uobject, struct ib_ucq_object, uobject); + + ret = ib_destroy_cq(cq); + if (!ret) + uobj->live = 0; + + put_uobj_write(uobj); + + if (ret) + return ret; + + idr_remove_uobj(&ib_uverbs_cq_idr, uobj); + + mutex_lock(&file->mutex); + list_del(&uobj->list); + mutex_unlock(&file->mutex); + + ib_uverbs_release_ucq(file, ev_file, obj); + + memset(&resp, 0, sizeof resp); + resp.comp_events_reported = obj->comp_events_reported; + resp.async_events_reported = obj->async_events_reported; + + put_uobj(uobj); + + if (copy_to_user((void __user *) (unsigned long) cmd.response, + &resp, sizeof resp)) + return -EFAULT; + + return in_len; +} + +ssize_t ib_uverbs_create_qp(struct ib_uverbs_file *file, + const char __user *buf, int in_len, + int out_len) +{ + struct ib_uverbs_create_qp cmd; + struct ib_uverbs_create_qp_resp resp; + struct ib_udata udata; + struct ib_uqp_object *obj; + struct ib_device *device; + struct ib_pd *pd = NULL; + struct ib_xrcd *xrcd = NULL; + struct ib_uobject *uninitialized_var(xrcd_uobj); + struct ib_cq *scq = NULL, *rcq = NULL; + struct ib_srq *srq = NULL; + struct ib_qp *qp; + struct ib_qp_init_attr attr; + int ret; + + if (out_len < sizeof resp) + return -ENOSPC; + + if (copy_from_user(&cmd, buf, sizeof cmd)) + return -EFAULT; + + if (cmd.qp_type == IB_QPT_RAW_PACKET && !capable(CAP_NET_RAW)) + return -EPERM; + + INIT_UDATA(&udata, buf + sizeof cmd, + (unsigned long) cmd.response + sizeof resp, + in_len - sizeof cmd, out_len - sizeof resp); + + obj = kzalloc(sizeof *obj, GFP_KERNEL); + if (!obj) + return -ENOMEM; + + init_uobj(&obj->uevent.uobject, cmd.user_handle, file->ucontext, &qp_lock_class); + down_write(&obj->uevent.uobject.mutex); + + if (cmd.qp_type == IB_QPT_XRC_TGT) { + xrcd = idr_read_xrcd(cmd.pd_handle, file->ucontext, &xrcd_uobj); + if (!xrcd) { + ret = -EINVAL; + goto err_put; + } + device = xrcd->device; + } else { + if (cmd.qp_type == IB_QPT_XRC_INI) { + cmd.max_recv_wr = cmd.max_recv_sge = 0; + } else { + if (cmd.is_srq) { + srq = idr_read_srq(cmd.srq_handle, file->ucontext); + if (!srq || srq->srq_type != IB_SRQT_BASIC) { + ret = -EINVAL; + goto err_put; + } + } + + if (cmd.recv_cq_handle != cmd.send_cq_handle) { + rcq = idr_read_cq(cmd.recv_cq_handle, file->ucontext, 0); + if (!rcq) { + ret = -EINVAL; + goto err_put; + } + } + } + + scq = idr_read_cq(cmd.send_cq_handle, file->ucontext, !!rcq); + rcq = rcq ?: scq; + pd = idr_read_pd(cmd.pd_handle, file->ucontext); + if (!pd || !scq) { + ret = -EINVAL; + goto err_put; + } + + device = pd->device; + } + + attr.event_handler = ib_uverbs_qp_event_handler; + attr.qp_context = file; + attr.send_cq = scq; + attr.recv_cq = rcq; + attr.srq = srq; + attr.xrcd = xrcd; + attr.sq_sig_type = cmd.sq_sig_all ? IB_SIGNAL_ALL_WR : IB_SIGNAL_REQ_WR; + attr.qp_type = cmd.qp_type; + attr.create_flags = 0; + + attr.cap.max_send_wr = cmd.max_send_wr; + attr.cap.max_recv_wr = cmd.max_recv_wr; + attr.cap.max_send_sge = cmd.max_send_sge; + attr.cap.max_recv_sge = cmd.max_recv_sge; + attr.cap.max_inline_data = cmd.max_inline_data; + + obj->uevent.events_reported = 0; + INIT_LIST_HEAD(&obj->uevent.event_list); + INIT_LIST_HEAD(&obj->mcast_list); + + if (cmd.qp_type == IB_QPT_XRC_TGT) + qp = ib_create_qp(pd, &attr); + else + qp = device->create_qp(pd, &attr, &udata); + + if (IS_ERR(qp)) { + ret = PTR_ERR(qp); + goto err_put; + } + + if (cmd.qp_type != IB_QPT_XRC_TGT) { + qp->real_qp = qp; + qp->device = device; + qp->pd = pd; + qp->send_cq = attr.send_cq; + qp->recv_cq = attr.recv_cq; + qp->srq = attr.srq; + qp->event_handler = attr.event_handler; + qp->qp_context = attr.qp_context; + qp->qp_type = attr.qp_type; + atomic_set(&qp->usecnt, 0); + atomic_inc(&pd->usecnt); + atomic_inc(&attr.send_cq->usecnt); + if (attr.recv_cq) + atomic_inc(&attr.recv_cq->usecnt); + if (attr.srq) + atomic_inc(&attr.srq->usecnt); + } + qp->uobject = &obj->uevent.uobject; + + obj->uevent.uobject.object = qp; + ret = idr_add_uobj(&ib_uverbs_qp_idr, &obj->uevent.uobject); + if (ret) + goto err_destroy; + + memset(&resp, 0, sizeof resp); + resp.qpn = qp->qp_num; + resp.qp_handle = obj->uevent.uobject.id; + resp.max_recv_sge = attr.cap.max_recv_sge; + resp.max_send_sge = attr.cap.max_send_sge; + resp.max_recv_wr = attr.cap.max_recv_wr; + resp.max_send_wr = attr.cap.max_send_wr; + resp.max_inline_data = attr.cap.max_inline_data; + + if (copy_to_user((void __user *) (unsigned long) cmd.response, + &resp, sizeof resp)) { + ret = -EFAULT; + goto err_copy; + } + + if (xrcd) { + obj->uxrcd = container_of(xrcd_uobj, struct ib_uxrcd_object, + uobject); + atomic_inc(&obj->uxrcd->refcnt); + put_xrcd_read(xrcd_uobj); + } + + if (pd) + put_pd_read(pd); + if (scq) + put_cq_read(scq); + if (rcq && rcq != scq) + put_cq_read(rcq); + if (srq) + put_srq_read(srq); + + mutex_lock(&file->mutex); + list_add_tail(&obj->uevent.uobject.list, &file->ucontext->qp_list); + mutex_unlock(&file->mutex); + + obj->uevent.uobject.live = 1; + + up_write(&obj->uevent.uobject.mutex); + + return in_len; + +err_copy: + idr_remove_uobj(&ib_uverbs_qp_idr, &obj->uevent.uobject); + +err_destroy: + ib_destroy_qp(qp); + +err_put: + if (xrcd) + put_xrcd_read(xrcd_uobj); + if (pd) + put_pd_read(pd); + if (scq) + put_cq_read(scq); + if (rcq && rcq != scq) + put_cq_read(rcq); + if (srq) + put_srq_read(srq); + + put_uobj_write(&obj->uevent.uobject); + return ret; +} + +ssize_t ib_uverbs_open_qp(struct ib_uverbs_file *file, + const char __user *buf, int in_len, int out_len) +{ + struct ib_uverbs_open_qp cmd; + struct ib_uverbs_create_qp_resp resp; + struct ib_udata udata; + struct ib_uqp_object *obj; + struct ib_xrcd *xrcd; + struct ib_uobject *uninitialized_var(xrcd_uobj); + struct ib_qp *qp; + struct ib_qp_open_attr attr; + int ret; + + if (out_len < sizeof resp) + return -ENOSPC; + + if (copy_from_user(&cmd, buf, sizeof cmd)) + return -EFAULT; + + INIT_UDATA(&udata, buf + sizeof cmd, + (unsigned long) cmd.response + sizeof resp, + in_len - sizeof cmd, out_len - sizeof resp); + + obj = kmalloc(sizeof *obj, GFP_KERNEL); + if (!obj) + return -ENOMEM; + + init_uobj(&obj->uevent.uobject, cmd.user_handle, file->ucontext, &qp_lock_class); + down_write(&obj->uevent.uobject.mutex); + + xrcd = idr_read_xrcd(cmd.pd_handle, file->ucontext, &xrcd_uobj); + if (!xrcd) { + ret = -EINVAL; + goto err_put; + } + + attr.event_handler = ib_uverbs_qp_event_handler; + attr.qp_context = file; + attr.qp_num = cmd.qpn; + attr.qp_type = cmd.qp_type; + + obj->uevent.events_reported = 0; + INIT_LIST_HEAD(&obj->uevent.event_list); + INIT_LIST_HEAD(&obj->mcast_list); + + qp = ib_open_qp(xrcd, &attr); + if (IS_ERR(qp)) { + ret = PTR_ERR(qp); + goto err_put; + } + + qp->uobject = &obj->uevent.uobject; + + obj->uevent.uobject.object = qp; + ret = idr_add_uobj(&ib_uverbs_qp_idr, &obj->uevent.uobject); + if (ret) + goto err_destroy; + + memset(&resp, 0, sizeof resp); + resp.qpn = qp->qp_num; + resp.qp_handle = obj->uevent.uobject.id; + + if (copy_to_user((void __user *) (unsigned long) cmd.response, + &resp, sizeof resp)) { + ret = -EFAULT; + goto err_remove; + } + + obj->uxrcd = container_of(xrcd_uobj, struct ib_uxrcd_object, uobject); + atomic_inc(&obj->uxrcd->refcnt); + put_xrcd_read(xrcd_uobj); + + mutex_lock(&file->mutex); + list_add_tail(&obj->uevent.uobject.list, &file->ucontext->qp_list); + mutex_unlock(&file->mutex); + + obj->uevent.uobject.live = 1; + + up_write(&obj->uevent.uobject.mutex); + + return in_len; + +err_remove: + idr_remove_uobj(&ib_uverbs_qp_idr, &obj->uevent.uobject); + +err_destroy: + ib_destroy_qp(qp); + +err_put: + put_xrcd_read(xrcd_uobj); + put_uobj_write(&obj->uevent.uobject); + return ret; +} + +ssize_t ib_uverbs_query_qp(struct ib_uverbs_file *file, + const char __user *buf, int in_len, + int out_len) +{ + struct ib_uverbs_query_qp cmd; + struct ib_uverbs_query_qp_resp resp; + struct ib_qp *qp; + struct ib_qp_attr *attr; + struct ib_qp_init_attr *init_attr; + int ret; + + if (copy_from_user(&cmd, buf, sizeof cmd)) + return -EFAULT; + + attr = kmalloc(sizeof *attr, GFP_KERNEL); + init_attr = kmalloc(sizeof *init_attr, GFP_KERNEL); + if (!attr || !init_attr) { + ret = -ENOMEM; + goto out; + } + + qp = idr_read_qp(cmd.qp_handle, file->ucontext); + if (!qp) { + ret = -EINVAL; + goto out; + } + + ret = ib_query_qp(qp, attr, cmd.attr_mask, init_attr); + + put_qp_read(qp); + + if (ret) + goto out; + + memset(&resp, 0, sizeof resp); + + resp.qp_state = attr->qp_state; + resp.cur_qp_state = attr->cur_qp_state; + resp.path_mtu = attr->path_mtu; + resp.path_mig_state = attr->path_mig_state; + resp.qkey = attr->qkey; + resp.rq_psn = attr->rq_psn; + resp.sq_psn = attr->sq_psn; + resp.dest_qp_num = attr->dest_qp_num; + resp.qp_access_flags = attr->qp_access_flags; + resp.pkey_index = attr->pkey_index; + resp.alt_pkey_index = attr->alt_pkey_index; + resp.sq_draining = attr->sq_draining; + resp.max_rd_atomic = attr->max_rd_atomic; + resp.max_dest_rd_atomic = attr->max_dest_rd_atomic; + resp.min_rnr_timer = attr->min_rnr_timer; + resp.port_num = attr->port_num; + resp.timeout = attr->timeout; + resp.retry_cnt = attr->retry_cnt; + resp.rnr_retry = attr->rnr_retry; + resp.alt_port_num = attr->alt_port_num; + resp.alt_timeout = attr->alt_timeout; + + memcpy(resp.dest.dgid, attr->ah_attr.grh.dgid.raw, 16); + resp.dest.flow_label = attr->ah_attr.grh.flow_label; + resp.dest.sgid_index = attr->ah_attr.grh.sgid_index; + resp.dest.hop_limit = attr->ah_attr.grh.hop_limit; + resp.dest.traffic_class = attr->ah_attr.grh.traffic_class; + resp.dest.dlid = attr->ah_attr.dlid; + resp.dest.sl = attr->ah_attr.sl; + resp.dest.src_path_bits = attr->ah_attr.src_path_bits; + resp.dest.static_rate = attr->ah_attr.static_rate; + resp.dest.is_global = !!(attr->ah_attr.ah_flags & IB_AH_GRH); + resp.dest.port_num = attr->ah_attr.port_num; + + memcpy(resp.alt_dest.dgid, attr->alt_ah_attr.grh.dgid.raw, 16); + resp.alt_dest.flow_label = attr->alt_ah_attr.grh.flow_label; + resp.alt_dest.sgid_index = attr->alt_ah_attr.grh.sgid_index; + resp.alt_dest.hop_limit = attr->alt_ah_attr.grh.hop_limit; + resp.alt_dest.traffic_class = attr->alt_ah_attr.grh.traffic_class; + resp.alt_dest.dlid = attr->alt_ah_attr.dlid; + resp.alt_dest.sl = attr->alt_ah_attr.sl; + resp.alt_dest.src_path_bits = attr->alt_ah_attr.src_path_bits; + resp.alt_dest.static_rate = attr->alt_ah_attr.static_rate; + resp.alt_dest.is_global = !!(attr->alt_ah_attr.ah_flags & IB_AH_GRH); + resp.alt_dest.port_num = attr->alt_ah_attr.port_num; + + resp.max_send_wr = init_attr->cap.max_send_wr; + resp.max_recv_wr = init_attr->cap.max_recv_wr; + resp.max_send_sge = init_attr->cap.max_send_sge; + resp.max_recv_sge = init_attr->cap.max_recv_sge; + resp.max_inline_data = init_attr->cap.max_inline_data; + resp.sq_sig_all = init_attr->sq_sig_type == IB_SIGNAL_ALL_WR; + + if (copy_to_user((void __user *) (unsigned long) cmd.response, + &resp, sizeof resp)) + ret = -EFAULT; + +out: + kfree(attr); + kfree(init_attr); + + return ret ? ret : in_len; +} + +/* Remove ignored fields set in the attribute mask */ +static int modify_qp_mask(enum ib_qp_type qp_type, int mask) +{ + switch (qp_type) { + case IB_QPT_XRC_INI: + return mask & ~(IB_QP_MAX_DEST_RD_ATOMIC | IB_QP_MIN_RNR_TIMER); + case IB_QPT_XRC_TGT: + return mask & ~(IB_QP_MAX_QP_RD_ATOMIC | IB_QP_RETRY_CNT | + IB_QP_RNR_RETRY); + default: + return mask; + } +} + +ssize_t ib_uverbs_modify_qp(struct ib_uverbs_file *file, + const char __user *buf, int in_len, + int out_len) +{ + struct ib_uverbs_modify_qp cmd; + struct ib_udata udata; + struct ib_qp *qp; + struct ib_qp_attr *attr; + int ret; + + if (copy_from_user(&cmd, buf, sizeof cmd)) + return -EFAULT; + + INIT_UDATA(&udata, buf + sizeof cmd, NULL, in_len - sizeof cmd, + out_len); + + attr = kmalloc(sizeof *attr, GFP_KERNEL); + if (!attr) + return -ENOMEM; + + qp = idr_read_qp(cmd.qp_handle, file->ucontext); + if (!qp) { + ret = -EINVAL; + goto out; + } + + attr->qp_state = cmd.qp_state; + attr->cur_qp_state = cmd.cur_qp_state; + attr->path_mtu = cmd.path_mtu; + attr->path_mig_state = cmd.path_mig_state; + attr->qkey = cmd.qkey; + attr->rq_psn = cmd.rq_psn; + attr->sq_psn = cmd.sq_psn; + attr->dest_qp_num = cmd.dest_qp_num; + attr->qp_access_flags = cmd.qp_access_flags; + attr->pkey_index = cmd.pkey_index; + attr->alt_pkey_index = cmd.alt_pkey_index; + attr->en_sqd_async_notify = cmd.en_sqd_async_notify; + attr->max_rd_atomic = cmd.max_rd_atomic; + attr->max_dest_rd_atomic = cmd.max_dest_rd_atomic; + attr->min_rnr_timer = cmd.min_rnr_timer; + attr->port_num = cmd.port_num; + attr->timeout = cmd.timeout; + attr->retry_cnt = cmd.retry_cnt; + attr->rnr_retry = cmd.rnr_retry; + attr->alt_port_num = cmd.alt_port_num; + attr->alt_timeout = cmd.alt_timeout; + + memcpy(attr->ah_attr.grh.dgid.raw, cmd.dest.dgid, 16); + attr->ah_attr.grh.flow_label = cmd.dest.flow_label; + attr->ah_attr.grh.sgid_index = cmd.dest.sgid_index; + attr->ah_attr.grh.hop_limit = cmd.dest.hop_limit; + attr->ah_attr.grh.traffic_class = cmd.dest.traffic_class; + attr->ah_attr.dlid = cmd.dest.dlid; + attr->ah_attr.sl = cmd.dest.sl; + attr->ah_attr.src_path_bits = cmd.dest.src_path_bits; + attr->ah_attr.static_rate = cmd.dest.static_rate; + attr->ah_attr.ah_flags = cmd.dest.is_global ? IB_AH_GRH : 0; + attr->ah_attr.port_num = cmd.dest.port_num; + + memcpy(attr->alt_ah_attr.grh.dgid.raw, cmd.alt_dest.dgid, 16); + attr->alt_ah_attr.grh.flow_label = cmd.alt_dest.flow_label; + attr->alt_ah_attr.grh.sgid_index = cmd.alt_dest.sgid_index; + attr->alt_ah_attr.grh.hop_limit = cmd.alt_dest.hop_limit; + attr->alt_ah_attr.grh.traffic_class = cmd.alt_dest.traffic_class; + attr->alt_ah_attr.dlid = cmd.alt_dest.dlid; + attr->alt_ah_attr.sl = cmd.alt_dest.sl; + attr->alt_ah_attr.src_path_bits = cmd.alt_dest.src_path_bits; + attr->alt_ah_attr.static_rate = cmd.alt_dest.static_rate; + attr->alt_ah_attr.ah_flags = cmd.alt_dest.is_global ? IB_AH_GRH : 0; + attr->alt_ah_attr.port_num = cmd.alt_dest.port_num; + + if (qp->real_qp == qp) { + ret = ib_resolve_eth_l2_attrs(qp, attr, &cmd.attr_mask); + if (ret) + goto out; + ret = qp->device->modify_qp(qp, attr, + modify_qp_mask(qp->qp_type, cmd.attr_mask), &udata); + } else { + ret = ib_modify_qp(qp, attr, modify_qp_mask(qp->qp_type, cmd.attr_mask)); + } + + put_qp_read(qp); + + if (ret) + goto out; + + ret = in_len; + +out: + kfree(attr); + + return ret; +} + +ssize_t ib_uverbs_destroy_qp(struct ib_uverbs_file *file, + const char __user *buf, int in_len, + int out_len) +{ + struct ib_uverbs_destroy_qp cmd; + struct ib_uverbs_destroy_qp_resp resp; + struct ib_uobject *uobj; + struct ib_qp *qp; + struct ib_uqp_object *obj; + int ret = -EINVAL; + + if (copy_from_user(&cmd, buf, sizeof cmd)) + return -EFAULT; + + memset(&resp, 0, sizeof resp); + + uobj = idr_write_uobj(&ib_uverbs_qp_idr, cmd.qp_handle, file->ucontext); + if (!uobj) + return -EINVAL; + qp = uobj->object; + obj = container_of(uobj, struct ib_uqp_object, uevent.uobject); + + if (!list_empty(&obj->mcast_list)) { + put_uobj_write(uobj); + return -EBUSY; + } + + ret = ib_destroy_qp(qp); + if (!ret) + uobj->live = 0; + + put_uobj_write(uobj); + + if (ret) + return ret; + + if (obj->uxrcd) + atomic_dec(&obj->uxrcd->refcnt); + + idr_remove_uobj(&ib_uverbs_qp_idr, uobj); + + mutex_lock(&file->mutex); + list_del(&uobj->list); + mutex_unlock(&file->mutex); + + ib_uverbs_release_uevent(file, &obj->uevent); + + resp.events_reported = obj->uevent.events_reported; + + put_uobj(uobj); + + if (copy_to_user((void __user *) (unsigned long) cmd.response, + &resp, sizeof resp)) + return -EFAULT; + + return in_len; +} + +ssize_t ib_uverbs_post_send(struct ib_uverbs_file *file, + const char __user *buf, int in_len, + int out_len) +{ + struct ib_uverbs_post_send cmd; + struct ib_uverbs_post_send_resp resp; + struct ib_uverbs_send_wr *user_wr; + struct ib_send_wr *wr = NULL, *last, *next, *bad_wr; + struct ib_qp *qp; + int i, sg_ind; + int is_ud; + ssize_t ret = -EINVAL; + + if (copy_from_user(&cmd, buf, sizeof cmd)) + return -EFAULT; + + if (in_len < sizeof cmd + cmd.wqe_size * cmd.wr_count + + cmd.sge_count * sizeof (struct ib_uverbs_sge)) + return -EINVAL; + + if (cmd.wqe_size < sizeof (struct ib_uverbs_send_wr)) + return -EINVAL; + + user_wr = kmalloc(cmd.wqe_size, GFP_KERNEL); + if (!user_wr) + return -ENOMEM; + + qp = idr_read_qp(cmd.qp_handle, file->ucontext); + if (!qp) + goto out; + + is_ud = qp->qp_type == IB_QPT_UD; + sg_ind = 0; + last = NULL; + for (i = 0; i < cmd.wr_count; ++i) { + if (copy_from_user(user_wr, + buf + sizeof cmd + i * cmd.wqe_size, + cmd.wqe_size)) { + ret = -EFAULT; + goto out_put; + } + + if (user_wr->num_sge + sg_ind > cmd.sge_count) { + ret = -EINVAL; + goto out_put; + } + + next = kmalloc(ALIGN(sizeof *next, sizeof (struct ib_sge)) + + user_wr->num_sge * sizeof (struct ib_sge), + GFP_KERNEL); + if (!next) { + ret = -ENOMEM; + goto out_put; + } + + if (!last) + wr = next; + else + last->next = next; + last = next; + + next->next = NULL; + next->wr_id = user_wr->wr_id; + next->num_sge = user_wr->num_sge; + next->opcode = user_wr->opcode; + next->send_flags = user_wr->send_flags; + + if (is_ud) { + next->wr.ud.ah = idr_read_ah(user_wr->wr.ud.ah, + file->ucontext); + if (!next->wr.ud.ah) { + ret = -EINVAL; + goto out_put; + } + next->wr.ud.remote_qpn = user_wr->wr.ud.remote_qpn; + next->wr.ud.remote_qkey = user_wr->wr.ud.remote_qkey; + if (next->opcode == IB_WR_SEND_WITH_IMM) + next->ex.imm_data = + (__be32 __force) user_wr->ex.imm_data; + } else { + switch (next->opcode) { + case IB_WR_RDMA_WRITE_WITH_IMM: + next->ex.imm_data = + (__be32 __force) user_wr->ex.imm_data; + case IB_WR_RDMA_WRITE: + case IB_WR_RDMA_READ: + next->wr.rdma.remote_addr = + user_wr->wr.rdma.remote_addr; + next->wr.rdma.rkey = + user_wr->wr.rdma.rkey; + break; + case IB_WR_SEND_WITH_IMM: + next->ex.imm_data = + (__be32 __force) user_wr->ex.imm_data; + break; + case IB_WR_SEND_WITH_INV: + next->ex.invalidate_rkey = + user_wr->ex.invalidate_rkey; + break; + case IB_WR_ATOMIC_CMP_AND_SWP: + case IB_WR_ATOMIC_FETCH_AND_ADD: + next->wr.atomic.remote_addr = + user_wr->wr.atomic.remote_addr; + next->wr.atomic.compare_add = + user_wr->wr.atomic.compare_add; + next->wr.atomic.swap = user_wr->wr.atomic.swap; + next->wr.atomic.rkey = user_wr->wr.atomic.rkey; + break; + default: + break; + } + } + + if (next->num_sge) { + next->sg_list = (void *) next + + ALIGN(sizeof *next, sizeof (struct ib_sge)); + if (copy_from_user(next->sg_list, + buf + sizeof cmd + + cmd.wr_count * cmd.wqe_size + + sg_ind * sizeof (struct ib_sge), + next->num_sge * sizeof (struct ib_sge))) { + ret = -EFAULT; + goto out_put; + } + sg_ind += next->num_sge; + } else + next->sg_list = NULL; + } + + resp.bad_wr = 0; + ret = qp->device->post_send(qp->real_qp, wr, &bad_wr); + if (ret) + for (next = wr; next; next = next->next) { + ++resp.bad_wr; + if (next == bad_wr) + break; + } + + if (copy_to_user((void __user *) (unsigned long) cmd.response, + &resp, sizeof resp)) + ret = -EFAULT; + +out_put: + put_qp_read(qp); + + while (wr) { + if (is_ud && wr->wr.ud.ah) + put_ah_read(wr->wr.ud.ah); + next = wr->next; + kfree(wr); + wr = next; + } + +out: + kfree(user_wr); + + return ret ? ret : in_len; +} + +static struct ib_recv_wr *ib_uverbs_unmarshall_recv(const char __user *buf, + int in_len, + u32 wr_count, + u32 sge_count, + u32 wqe_size) +{ + struct ib_uverbs_recv_wr *user_wr; + struct ib_recv_wr *wr = NULL, *last, *next; + int sg_ind; + int i; + int ret; + + if (in_len < wqe_size * wr_count + + sge_count * sizeof (struct ib_uverbs_sge)) + return ERR_PTR(-EINVAL); + + if (wqe_size < sizeof (struct ib_uverbs_recv_wr)) + return ERR_PTR(-EINVAL); + + user_wr = kmalloc(wqe_size, GFP_KERNEL); + if (!user_wr) + return ERR_PTR(-ENOMEM); + + sg_ind = 0; + last = NULL; + for (i = 0; i < wr_count; ++i) { + if (copy_from_user(user_wr, buf + i * wqe_size, + wqe_size)) { + ret = -EFAULT; + goto err; + } + + if (user_wr->num_sge + sg_ind > sge_count) { + ret = -EINVAL; + goto err; + } + + next = kmalloc(ALIGN(sizeof *next, sizeof (struct ib_sge)) + + user_wr->num_sge * sizeof (struct ib_sge), + GFP_KERNEL); + if (!next) { + ret = -ENOMEM; + goto err; + } + + if (!last) + wr = next; + else + last->next = next; + last = next; + + next->next = NULL; + next->wr_id = user_wr->wr_id; + next->num_sge = user_wr->num_sge; + + if (next->num_sge) { + next->sg_list = (void *) next + + ALIGN(sizeof *next, sizeof (struct ib_sge)); + if (copy_from_user(next->sg_list, + buf + wr_count * wqe_size + + sg_ind * sizeof (struct ib_sge), + next->num_sge * sizeof (struct ib_sge))) { + ret = -EFAULT; + goto err; + } + sg_ind += next->num_sge; + } else + next->sg_list = NULL; + } + + kfree(user_wr); + return wr; + +err: + kfree(user_wr); + + while (wr) { + next = wr->next; + kfree(wr); + wr = next; + } + + return ERR_PTR(ret); +} + +ssize_t ib_uverbs_post_recv(struct ib_uverbs_file *file, + const char __user *buf, int in_len, + int out_len) +{ + struct ib_uverbs_post_recv cmd; + struct ib_uverbs_post_recv_resp resp; + struct ib_recv_wr *wr, *next, *bad_wr; + struct ib_qp *qp; + ssize_t ret = -EINVAL; + + if (copy_from_user(&cmd, buf, sizeof cmd)) + return -EFAULT; + + wr = ib_uverbs_unmarshall_recv(buf + sizeof cmd, + in_len - sizeof cmd, cmd.wr_count, + cmd.sge_count, cmd.wqe_size); + if (IS_ERR(wr)) + return PTR_ERR(wr); + + qp = idr_read_qp(cmd.qp_handle, file->ucontext); + if (!qp) + goto out; + + resp.bad_wr = 0; + ret = qp->device->post_recv(qp->real_qp, wr, &bad_wr); + + put_qp_read(qp); + + if (ret) + for (next = wr; next; next = next->next) { + ++resp.bad_wr; + if (next == bad_wr) + break; + } + + if (copy_to_user((void __user *) (unsigned long) cmd.response, + &resp, sizeof resp)) + ret = -EFAULT; + +out: + while (wr) { + next = wr->next; + kfree(wr); + wr = next; + } + + return ret ? ret : in_len; +} + +ssize_t ib_uverbs_post_srq_recv(struct ib_uverbs_file *file, + const char __user *buf, int in_len, + int out_len) +{ + struct ib_uverbs_post_srq_recv cmd; + struct ib_uverbs_post_srq_recv_resp resp; + struct ib_recv_wr *wr, *next, *bad_wr; + struct ib_srq *srq; + ssize_t ret = -EINVAL; + + if (copy_from_user(&cmd, buf, sizeof cmd)) + return -EFAULT; + + wr = ib_uverbs_unmarshall_recv(buf + sizeof cmd, + in_len - sizeof cmd, cmd.wr_count, + cmd.sge_count, cmd.wqe_size); + if (IS_ERR(wr)) + return PTR_ERR(wr); + + srq = idr_read_srq(cmd.srq_handle, file->ucontext); + if (!srq) + goto out; + + resp.bad_wr = 0; + ret = srq->device->post_srq_recv(srq, wr, &bad_wr); + + put_srq_read(srq); + + if (ret) + for (next = wr; next; next = next->next) { + ++resp.bad_wr; + if (next == bad_wr) + break; + } + + if (copy_to_user((void __user *) (unsigned long) cmd.response, + &resp, sizeof resp)) + ret = -EFAULT; + +out: + while (wr) { + next = wr->next; + kfree(wr); + wr = next; + } + + return ret ? ret : in_len; +} + +ssize_t ib_uverbs_create_ah(struct ib_uverbs_file *file, + const char __user *buf, int in_len, + int out_len) +{ + struct ib_uverbs_create_ah cmd; + struct ib_uverbs_create_ah_resp resp; + struct ib_uobject *uobj; + struct ib_pd *pd; + struct ib_ah *ah; + struct ib_ah_attr attr; + int ret; + + if (out_len < sizeof resp) + return -ENOSPC; + + if (copy_from_user(&cmd, buf, sizeof cmd)) + return -EFAULT; + + uobj = kmalloc(sizeof *uobj, GFP_KERNEL); + if (!uobj) + return -ENOMEM; + + init_uobj(uobj, cmd.user_handle, file->ucontext, &ah_lock_class); + down_write(&uobj->mutex); + + pd = idr_read_pd(cmd.pd_handle, file->ucontext); + if (!pd) { + ret = -EINVAL; + goto err; + } + + attr.dlid = cmd.attr.dlid; + attr.sl = cmd.attr.sl; + attr.src_path_bits = cmd.attr.src_path_bits; + attr.static_rate = cmd.attr.static_rate; + attr.ah_flags = cmd.attr.is_global ? IB_AH_GRH : 0; + attr.port_num = cmd.attr.port_num; + attr.grh.flow_label = cmd.attr.grh.flow_label; + attr.grh.sgid_index = cmd.attr.grh.sgid_index; + attr.grh.hop_limit = cmd.attr.grh.hop_limit; + attr.grh.traffic_class = cmd.attr.grh.traffic_class; + attr.vlan_id = 0; + memset(&attr.dmac, 0, sizeof(attr.dmac)); + memcpy(attr.grh.dgid.raw, cmd.attr.grh.dgid, 16); + + ah = ib_create_ah(pd, &attr); + if (IS_ERR(ah)) { + ret = PTR_ERR(ah); + goto err_put; + } + + ah->uobject = uobj; + uobj->object = ah; + + ret = idr_add_uobj(&ib_uverbs_ah_idr, uobj); + if (ret) + goto err_destroy; + + resp.ah_handle = uobj->id; + + if (copy_to_user((void __user *) (unsigned long) cmd.response, + &resp, sizeof resp)) { + ret = -EFAULT; + goto err_copy; + } + + put_pd_read(pd); + + mutex_lock(&file->mutex); + list_add_tail(&uobj->list, &file->ucontext->ah_list); + mutex_unlock(&file->mutex); + + uobj->live = 1; + + up_write(&uobj->mutex); + + return in_len; + +err_copy: + idr_remove_uobj(&ib_uverbs_ah_idr, uobj); + +err_destroy: + ib_destroy_ah(ah); + +err_put: + put_pd_read(pd); + +err: + put_uobj_write(uobj); + return ret; +} + +ssize_t ib_uverbs_destroy_ah(struct ib_uverbs_file *file, + const char __user *buf, int in_len, int out_len) +{ + struct ib_uverbs_destroy_ah cmd; + struct ib_ah *ah; + struct ib_uobject *uobj; + int ret; + + if (copy_from_user(&cmd, buf, sizeof cmd)) + return -EFAULT; + + uobj = idr_write_uobj(&ib_uverbs_ah_idr, cmd.ah_handle, file->ucontext); + if (!uobj) + return -EINVAL; + ah = uobj->object; + + ret = ib_destroy_ah(ah); + if (!ret) + uobj->live = 0; + + put_uobj_write(uobj); + + if (ret) + return ret; + + idr_remove_uobj(&ib_uverbs_ah_idr, uobj); + + mutex_lock(&file->mutex); + list_del(&uobj->list); + mutex_unlock(&file->mutex); + + put_uobj(uobj); + + return in_len; +} + +ssize_t ib_uverbs_attach_mcast(struct ib_uverbs_file *file, + const char __user *buf, int in_len, + int out_len) +{ + struct ib_uverbs_attach_mcast cmd; + struct ib_qp *qp; + struct ib_uqp_object *obj; + struct ib_uverbs_mcast_entry *mcast; + int ret; + + if (copy_from_user(&cmd, buf, sizeof cmd)) + return -EFAULT; + + qp = idr_write_qp(cmd.qp_handle, file->ucontext); + if (!qp) + return -EINVAL; + + obj = container_of(qp->uobject, struct ib_uqp_object, uevent.uobject); + + list_for_each_entry(mcast, &obj->mcast_list, list) + if (cmd.mlid == mcast->lid && + !memcmp(cmd.gid, mcast->gid.raw, sizeof mcast->gid.raw)) { + ret = 0; + goto out_put; + } + + mcast = kmalloc(sizeof *mcast, GFP_KERNEL); + if (!mcast) { + ret = -ENOMEM; + goto out_put; + } + + mcast->lid = cmd.mlid; + memcpy(mcast->gid.raw, cmd.gid, sizeof mcast->gid.raw); + + ret = ib_attach_mcast(qp, &mcast->gid, cmd.mlid); + if (!ret) + list_add_tail(&mcast->list, &obj->mcast_list); + else + kfree(mcast); + +out_put: + put_qp_write(qp); + + return ret ? ret : in_len; +} + +ssize_t ib_uverbs_detach_mcast(struct ib_uverbs_file *file, + const char __user *buf, int in_len, + int out_len) +{ + struct ib_uverbs_detach_mcast cmd; + struct ib_uqp_object *obj; + struct ib_qp *qp; + struct ib_uverbs_mcast_entry *mcast; + int ret = -EINVAL; + + if (copy_from_user(&cmd, buf, sizeof cmd)) + return -EFAULT; + + qp = idr_write_qp(cmd.qp_handle, file->ucontext); + if (!qp) + return -EINVAL; + + ret = ib_detach_mcast(qp, (union ib_gid *) cmd.gid, cmd.mlid); + if (ret) + goto out_put; + + obj = container_of(qp->uobject, struct ib_uqp_object, uevent.uobject); + + list_for_each_entry(mcast, &obj->mcast_list, list) + if (cmd.mlid == mcast->lid && + !memcmp(cmd.gid, mcast->gid.raw, sizeof mcast->gid.raw)) { + list_del(&mcast->list); + kfree(mcast); + break; + } + +out_put: + put_qp_write(qp); + + return ret ? ret : in_len; +} + +static int kern_spec_to_ib_spec(struct ib_uverbs_flow_spec *kern_spec, + union ib_flow_spec *ib_spec) +{ + if (kern_spec->reserved) + return -EINVAL; + + ib_spec->type = kern_spec->type; + + switch (ib_spec->type) { + case IB_FLOW_SPEC_ETH: + ib_spec->eth.size = sizeof(struct ib_flow_spec_eth); + if (ib_spec->eth.size != kern_spec->eth.size) + return -EINVAL; + memcpy(&ib_spec->eth.val, &kern_spec->eth.val, + sizeof(struct ib_flow_eth_filter)); + memcpy(&ib_spec->eth.mask, &kern_spec->eth.mask, + sizeof(struct ib_flow_eth_filter)); + break; + case IB_FLOW_SPEC_IPV4: + ib_spec->ipv4.size = sizeof(struct ib_flow_spec_ipv4); + if (ib_spec->ipv4.size != kern_spec->ipv4.size) + return -EINVAL; + memcpy(&ib_spec->ipv4.val, &kern_spec->ipv4.val, + sizeof(struct ib_flow_ipv4_filter)); + memcpy(&ib_spec->ipv4.mask, &kern_spec->ipv4.mask, + sizeof(struct ib_flow_ipv4_filter)); + break; + case IB_FLOW_SPEC_TCP: + case IB_FLOW_SPEC_UDP: + ib_spec->tcp_udp.size = sizeof(struct ib_flow_spec_tcp_udp); + if (ib_spec->tcp_udp.size != kern_spec->tcp_udp.size) + return -EINVAL; + memcpy(&ib_spec->tcp_udp.val, &kern_spec->tcp_udp.val, + sizeof(struct ib_flow_tcp_udp_filter)); + memcpy(&ib_spec->tcp_udp.mask, &kern_spec->tcp_udp.mask, + sizeof(struct ib_flow_tcp_udp_filter)); + break; + default: + return -EINVAL; + } + return 0; +} + +int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file, + struct ib_udata *ucore, + struct ib_udata *uhw) +{ + struct ib_uverbs_create_flow cmd; + struct ib_uverbs_create_flow_resp resp; + struct ib_uobject *uobj; + struct ib_flow *flow_id; + struct ib_uverbs_flow_attr *kern_flow_attr; + struct ib_flow_attr *flow_attr; + struct ib_qp *qp; + int err = 0; + void *kern_spec; + void *ib_spec; + int i; + + if (ucore->inlen < sizeof(cmd)) + return -EINVAL; + + if (ucore->outlen < sizeof(resp)) + return -ENOSPC; + + err = ib_copy_from_udata(&cmd, ucore, sizeof(cmd)); + if (err) + return err; + + ucore->inbuf += sizeof(cmd); + ucore->inlen -= sizeof(cmd); + + if (cmd.comp_mask) + return -EINVAL; + + if ((cmd.flow_attr.type == IB_FLOW_ATTR_SNIFFER && + !capable(CAP_NET_ADMIN)) || !capable(CAP_NET_RAW)) + return -EPERM; + + if (cmd.flow_attr.num_of_specs > IB_FLOW_SPEC_SUPPORT_LAYERS) + return -EINVAL; + + if (cmd.flow_attr.size > ucore->inlen || + cmd.flow_attr.size > + (cmd.flow_attr.num_of_specs * sizeof(struct ib_uverbs_flow_spec))) + return -EINVAL; + + if (cmd.flow_attr.reserved[0] || + cmd.flow_attr.reserved[1]) + return -EINVAL; + + if (cmd.flow_attr.num_of_specs) { + kern_flow_attr = kmalloc(sizeof(*kern_flow_attr) + cmd.flow_attr.size, + GFP_KERNEL); + if (!kern_flow_attr) + return -ENOMEM; + + memcpy(kern_flow_attr, &cmd.flow_attr, sizeof(*kern_flow_attr)); + err = ib_copy_from_udata(kern_flow_attr + 1, ucore, + cmd.flow_attr.size); + if (err) + goto err_free_attr; + } else { + kern_flow_attr = &cmd.flow_attr; + } + + uobj = kmalloc(sizeof(*uobj), GFP_KERNEL); + if (!uobj) { + err = -ENOMEM; + goto err_free_attr; + } + init_uobj(uobj, 0, file->ucontext, &rule_lock_class); + down_write(&uobj->mutex); + + qp = idr_read_qp(cmd.qp_handle, file->ucontext); + if (!qp) { + err = -EINVAL; + goto err_uobj; + } + + flow_attr = kmalloc(sizeof(*flow_attr) + cmd.flow_attr.size, GFP_KERNEL); + if (!flow_attr) { + err = -ENOMEM; + goto err_put; + } + + flow_attr->type = kern_flow_attr->type; + flow_attr->priority = kern_flow_attr->priority; + flow_attr->num_of_specs = kern_flow_attr->num_of_specs; + flow_attr->port = kern_flow_attr->port; + flow_attr->flags = kern_flow_attr->flags; + flow_attr->size = sizeof(*flow_attr); + + kern_spec = kern_flow_attr + 1; + ib_spec = flow_attr + 1; + for (i = 0; i < flow_attr->num_of_specs && + cmd.flow_attr.size > offsetof(struct ib_uverbs_flow_spec, reserved) && + cmd.flow_attr.size >= + ((struct ib_uverbs_flow_spec *)kern_spec)->size; i++) { + err = kern_spec_to_ib_spec(kern_spec, ib_spec); + if (err) + goto err_free; + flow_attr->size += + ((union ib_flow_spec *) ib_spec)->size; + cmd.flow_attr.size -= ((struct ib_uverbs_flow_spec *)kern_spec)->size; + kern_spec += ((struct ib_uverbs_flow_spec *) kern_spec)->size; + ib_spec += ((union ib_flow_spec *) ib_spec)->size; + } + if (cmd.flow_attr.size || (i != flow_attr->num_of_specs)) { + pr_warn("create flow failed, flow %d: %d bytes left from uverb cmd\n", + i, cmd.flow_attr.size); + err = -EINVAL; + goto err_free; + } + flow_id = ib_create_flow(qp, flow_attr, IB_FLOW_DOMAIN_USER); + if (IS_ERR(flow_id)) { + err = PTR_ERR(flow_id); + goto err_free; + } + flow_id->qp = qp; + flow_id->uobject = uobj; + uobj->object = flow_id; + + err = idr_add_uobj(&ib_uverbs_rule_idr, uobj); + if (err) + goto destroy_flow; + + memset(&resp, 0, sizeof(resp)); + resp.flow_handle = uobj->id; + + err = ib_copy_to_udata(ucore, + &resp, sizeof(resp)); + if (err) + goto err_copy; + + put_qp_read(qp); + mutex_lock(&file->mutex); + list_add_tail(&uobj->list, &file->ucontext->rule_list); + mutex_unlock(&file->mutex); + + uobj->live = 1; + + up_write(&uobj->mutex); + kfree(flow_attr); + if (cmd.flow_attr.num_of_specs) + kfree(kern_flow_attr); + return 0; +err_copy: + idr_remove_uobj(&ib_uverbs_rule_idr, uobj); +destroy_flow: + ib_destroy_flow(flow_id); +err_free: + kfree(flow_attr); +err_put: + put_qp_read(qp); +err_uobj: + put_uobj_write(uobj); +err_free_attr: + if (cmd.flow_attr.num_of_specs) + kfree(kern_flow_attr); + return err; +} + +int ib_uverbs_ex_destroy_flow(struct ib_uverbs_file *file, + struct ib_udata *ucore, + struct ib_udata *uhw) +{ + struct ib_uverbs_destroy_flow cmd; + struct ib_flow *flow_id; + struct ib_uobject *uobj; + int ret; + + if (ucore->inlen < sizeof(cmd)) + return -EINVAL; + + ret = ib_copy_from_udata(&cmd, ucore, sizeof(cmd)); + if (ret) + return ret; + + if (cmd.comp_mask) + return -EINVAL; + + uobj = idr_write_uobj(&ib_uverbs_rule_idr, cmd.flow_handle, + file->ucontext); + if (!uobj) + return -EINVAL; + flow_id = uobj->object; + + ret = ib_destroy_flow(flow_id); + if (!ret) + uobj->live = 0; + + put_uobj_write(uobj); + + idr_remove_uobj(&ib_uverbs_rule_idr, uobj); + + mutex_lock(&file->mutex); + list_del(&uobj->list); + mutex_unlock(&file->mutex); + + put_uobj(uobj); + + return ret; +} + +static int __uverbs_create_xsrq(struct ib_uverbs_file *file, + struct ib_uverbs_create_xsrq *cmd, + struct ib_udata *udata) +{ + struct ib_uverbs_create_srq_resp resp; + struct ib_usrq_object *obj; + struct ib_pd *pd; + struct ib_srq *srq; + struct ib_uobject *uninitialized_var(xrcd_uobj); + struct ib_srq_init_attr attr; + int ret; + + obj = kmalloc(sizeof *obj, GFP_KERNEL); + if (!obj) + return -ENOMEM; + + init_uobj(&obj->uevent.uobject, cmd->user_handle, file->ucontext, &srq_lock_class); + down_write(&obj->uevent.uobject.mutex); + + if (cmd->srq_type == IB_SRQT_XRC) { + attr.ext.xrc.xrcd = idr_read_xrcd(cmd->xrcd_handle, file->ucontext, &xrcd_uobj); + if (!attr.ext.xrc.xrcd) { + ret = -EINVAL; + goto err; + } + + obj->uxrcd = container_of(xrcd_uobj, struct ib_uxrcd_object, uobject); + atomic_inc(&obj->uxrcd->refcnt); + + attr.ext.xrc.cq = idr_read_cq(cmd->cq_handle, file->ucontext, 0); + if (!attr.ext.xrc.cq) { + ret = -EINVAL; + goto err_put_xrcd; + } + } + + pd = idr_read_pd(cmd->pd_handle, file->ucontext); + if (!pd) { + ret = -EINVAL; + goto err_put_cq; + } + + attr.event_handler = ib_uverbs_srq_event_handler; + attr.srq_context = file; + attr.srq_type = cmd->srq_type; + attr.attr.max_wr = cmd->max_wr; + attr.attr.max_sge = cmd->max_sge; + attr.attr.srq_limit = cmd->srq_limit; + + obj->uevent.events_reported = 0; + INIT_LIST_HEAD(&obj->uevent.event_list); + + srq = pd->device->create_srq(pd, &attr, udata); + if (IS_ERR(srq)) { + ret = PTR_ERR(srq); + goto err_put; + } + + srq->device = pd->device; + srq->pd = pd; + srq->srq_type = cmd->srq_type; + srq->uobject = &obj->uevent.uobject; + srq->event_handler = attr.event_handler; + srq->srq_context = attr.srq_context; + + if (cmd->srq_type == IB_SRQT_XRC) { + srq->ext.xrc.cq = attr.ext.xrc.cq; + srq->ext.xrc.xrcd = attr.ext.xrc.xrcd; + atomic_inc(&attr.ext.xrc.cq->usecnt); + atomic_inc(&attr.ext.xrc.xrcd->usecnt); + } + + atomic_inc(&pd->usecnt); + atomic_set(&srq->usecnt, 0); + + obj->uevent.uobject.object = srq; + ret = idr_add_uobj(&ib_uverbs_srq_idr, &obj->uevent.uobject); + if (ret) + goto err_destroy; + + memset(&resp, 0, sizeof resp); + resp.srq_handle = obj->uevent.uobject.id; + resp.max_wr = attr.attr.max_wr; + resp.max_sge = attr.attr.max_sge; + if (cmd->srq_type == IB_SRQT_XRC) + resp.srqn = srq->ext.xrc.srq_num; + + if (copy_to_user((void __user *) (unsigned long) cmd->response, + &resp, sizeof resp)) { + ret = -EFAULT; + goto err_copy; + } + + if (cmd->srq_type == IB_SRQT_XRC) { + put_uobj_read(xrcd_uobj); + put_cq_read(attr.ext.xrc.cq); + } + put_pd_read(pd); + + mutex_lock(&file->mutex); + list_add_tail(&obj->uevent.uobject.list, &file->ucontext->srq_list); + mutex_unlock(&file->mutex); + + obj->uevent.uobject.live = 1; + + up_write(&obj->uevent.uobject.mutex); + + return 0; + +err_copy: + idr_remove_uobj(&ib_uverbs_srq_idr, &obj->uevent.uobject); + +err_destroy: + ib_destroy_srq(srq); + +err_put: + put_pd_read(pd); + +err_put_cq: + if (cmd->srq_type == IB_SRQT_XRC) + put_cq_read(attr.ext.xrc.cq); + +err_put_xrcd: + if (cmd->srq_type == IB_SRQT_XRC) { + atomic_dec(&obj->uxrcd->refcnt); + put_uobj_read(xrcd_uobj); + } + +err: + put_uobj_write(&obj->uevent.uobject); + return ret; +} + +ssize_t ib_uverbs_create_srq(struct ib_uverbs_file *file, + const char __user *buf, int in_len, + int out_len) +{ + struct ib_uverbs_create_srq cmd; + struct ib_uverbs_create_xsrq xcmd; + struct ib_uverbs_create_srq_resp resp; + struct ib_udata udata; + int ret; + + if (out_len < sizeof resp) + return -ENOSPC; + + if (copy_from_user(&cmd, buf, sizeof cmd)) + return -EFAULT; + + xcmd.response = cmd.response; + xcmd.user_handle = cmd.user_handle; + xcmd.srq_type = IB_SRQT_BASIC; + xcmd.pd_handle = cmd.pd_handle; + xcmd.max_wr = cmd.max_wr; + xcmd.max_sge = cmd.max_sge; + xcmd.srq_limit = cmd.srq_limit; + + INIT_UDATA(&udata, buf + sizeof cmd, + (unsigned long) cmd.response + sizeof resp, + in_len - sizeof cmd, out_len - sizeof resp); + + ret = __uverbs_create_xsrq(file, &xcmd, &udata); + if (ret) + return ret; + + return in_len; +} + +ssize_t ib_uverbs_create_xsrq(struct ib_uverbs_file *file, + const char __user *buf, int in_len, int out_len) +{ + struct ib_uverbs_create_xsrq cmd; + struct ib_uverbs_create_srq_resp resp; + struct ib_udata udata; + int ret; + + if (out_len < sizeof resp) + return -ENOSPC; + + if (copy_from_user(&cmd, buf, sizeof cmd)) + return -EFAULT; + + INIT_UDATA(&udata, buf + sizeof cmd, + (unsigned long) cmd.response + sizeof resp, + in_len - sizeof cmd, out_len - sizeof resp); + + ret = __uverbs_create_xsrq(file, &cmd, &udata); + if (ret) + return ret; + + return in_len; +} + +ssize_t ib_uverbs_modify_srq(struct ib_uverbs_file *file, + const char __user *buf, int in_len, + int out_len) +{ + struct ib_uverbs_modify_srq cmd; + struct ib_udata udata; + struct ib_srq *srq; + struct ib_srq_attr attr; + int ret; + + if (copy_from_user(&cmd, buf, sizeof cmd)) + return -EFAULT; + + INIT_UDATA(&udata, buf + sizeof cmd, NULL, in_len - sizeof cmd, + out_len); + + srq = idr_read_srq(cmd.srq_handle, file->ucontext); + if (!srq) + return -EINVAL; + + attr.max_wr = cmd.max_wr; + attr.srq_limit = cmd.srq_limit; + + ret = srq->device->modify_srq(srq, &attr, cmd.attr_mask, &udata); + + put_srq_read(srq); + + return ret ? ret : in_len; +} + +ssize_t ib_uverbs_query_srq(struct ib_uverbs_file *file, + const char __user *buf, + int in_len, int out_len) +{ + struct ib_uverbs_query_srq cmd; + struct ib_uverbs_query_srq_resp resp; + struct ib_srq_attr attr; + struct ib_srq *srq; + int ret; + + if (out_len < sizeof resp) + return -ENOSPC; + + if (copy_from_user(&cmd, buf, sizeof cmd)) + return -EFAULT; + + srq = idr_read_srq(cmd.srq_handle, file->ucontext); + if (!srq) + return -EINVAL; + + ret = ib_query_srq(srq, &attr); + + put_srq_read(srq); + + if (ret) + return ret; + + memset(&resp, 0, sizeof resp); + + resp.max_wr = attr.max_wr; + resp.max_sge = attr.max_sge; + resp.srq_limit = attr.srq_limit; + + if (copy_to_user((void __user *) (unsigned long) cmd.response, + &resp, sizeof resp)) + return -EFAULT; + + return in_len; +} + +ssize_t ib_uverbs_destroy_srq(struct ib_uverbs_file *file, + const char __user *buf, int in_len, + int out_len) +{ + struct ib_uverbs_destroy_srq cmd; + struct ib_uverbs_destroy_srq_resp resp; + struct ib_uobject *uobj; + struct ib_srq *srq; + struct ib_uevent_object *obj; + int ret = -EINVAL; + struct ib_usrq_object *us; + enum ib_srq_type srq_type; + + if (copy_from_user(&cmd, buf, sizeof cmd)) + return -EFAULT; + + uobj = idr_write_uobj(&ib_uverbs_srq_idr, cmd.srq_handle, file->ucontext); + if (!uobj) + return -EINVAL; + srq = uobj->object; + obj = container_of(uobj, struct ib_uevent_object, uobject); + srq_type = srq->srq_type; + + ret = ib_destroy_srq(srq); + if (!ret) + uobj->live = 0; + + put_uobj_write(uobj); + + if (ret) + return ret; + + if (srq_type == IB_SRQT_XRC) { + us = container_of(obj, struct ib_usrq_object, uevent); + atomic_dec(&us->uxrcd->refcnt); + } + + idr_remove_uobj(&ib_uverbs_srq_idr, uobj); + + mutex_lock(&file->mutex); + list_del(&uobj->list); + mutex_unlock(&file->mutex); + + ib_uverbs_release_uevent(file, obj); + + memset(&resp, 0, sizeof resp); + resp.events_reported = obj->events_reported; + + put_uobj(uobj); + + if (copy_to_user((void __user *) (unsigned long) cmd.response, + &resp, sizeof resp)) + ret = -EFAULT; + + return ret ? ret : in_len; +} diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c new file mode 100644 index 0000000..71ab83f --- /dev/null +++ b/drivers/infiniband/core/uverbs_main.c @@ -0,0 +1,1036 @@ +/* + * Copyright (c) 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies. All rights reserved. + * Copyright (c) 2005 Voltaire, Inc. All rights reserved. + * Copyright (c) 2005 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "uverbs.h" + +MODULE_AUTHOR("Roland Dreier"); +MODULE_DESCRIPTION("InfiniBand userspace verbs access"); +MODULE_LICENSE("Dual BSD/GPL"); + +enum { + IB_UVERBS_MAJOR = 231, + IB_UVERBS_BASE_MINOR = 192, + IB_UVERBS_MAX_DEVICES = 32 +}; + +#define IB_UVERBS_BASE_DEV MKDEV(IB_UVERBS_MAJOR, IB_UVERBS_BASE_MINOR) + +static struct class *uverbs_class; + +DEFINE_SPINLOCK(ib_uverbs_idr_lock); +DEFINE_IDR(ib_uverbs_pd_idr); +DEFINE_IDR(ib_uverbs_mr_idr); +DEFINE_IDR(ib_uverbs_mw_idr); +DEFINE_IDR(ib_uverbs_ah_idr); +DEFINE_IDR(ib_uverbs_cq_idr); +DEFINE_IDR(ib_uverbs_qp_idr); +DEFINE_IDR(ib_uverbs_srq_idr); +DEFINE_IDR(ib_uverbs_xrcd_idr); +DEFINE_IDR(ib_uverbs_rule_idr); + +static DEFINE_SPINLOCK(map_lock); +static DECLARE_BITMAP(dev_map, IB_UVERBS_MAX_DEVICES); + +static ssize_t (*uverbs_cmd_table[])(struct ib_uverbs_file *file, + const char __user *buf, int in_len, + int out_len) = { + [IB_USER_VERBS_CMD_GET_CONTEXT] = ib_uverbs_get_context, + [IB_USER_VERBS_CMD_QUERY_DEVICE] = ib_uverbs_query_device, + [IB_USER_VERBS_CMD_QUERY_PORT] = ib_uverbs_query_port, + [IB_USER_VERBS_CMD_ALLOC_PD] = ib_uverbs_alloc_pd, + [IB_USER_VERBS_CMD_DEALLOC_PD] = ib_uverbs_dealloc_pd, + [IB_USER_VERBS_CMD_REG_MR] = ib_uverbs_reg_mr, + [IB_USER_VERBS_CMD_REREG_MR] = ib_uverbs_rereg_mr, + [IB_USER_VERBS_CMD_DEREG_MR] = ib_uverbs_dereg_mr, + [IB_USER_VERBS_CMD_ALLOC_MW] = ib_uverbs_alloc_mw, + [IB_USER_VERBS_CMD_DEALLOC_MW] = ib_uverbs_dealloc_mw, + [IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL] = ib_uverbs_create_comp_channel, + [IB_USER_VERBS_CMD_CREATE_CQ] = ib_uverbs_create_cq, + [IB_USER_VERBS_CMD_RESIZE_CQ] = ib_uverbs_resize_cq, + [IB_USER_VERBS_CMD_POLL_CQ] = ib_uverbs_poll_cq, + [IB_USER_VERBS_CMD_REQ_NOTIFY_CQ] = ib_uverbs_req_notify_cq, + [IB_USER_VERBS_CMD_DESTROY_CQ] = ib_uverbs_destroy_cq, + [IB_USER_VERBS_CMD_CREATE_QP] = ib_uverbs_create_qp, + [IB_USER_VERBS_CMD_QUERY_QP] = ib_uverbs_query_qp, + [IB_USER_VERBS_CMD_MODIFY_QP] = ib_uverbs_modify_qp, + [IB_USER_VERBS_CMD_DESTROY_QP] = ib_uverbs_destroy_qp, + [IB_USER_VERBS_CMD_POST_SEND] = ib_uverbs_post_send, + [IB_USER_VERBS_CMD_POST_RECV] = ib_uverbs_post_recv, + [IB_USER_VERBS_CMD_POST_SRQ_RECV] = ib_uverbs_post_srq_recv, + [IB_USER_VERBS_CMD_CREATE_AH] = ib_uverbs_create_ah, + [IB_USER_VERBS_CMD_DESTROY_AH] = ib_uverbs_destroy_ah, + [IB_USER_VERBS_CMD_ATTACH_MCAST] = ib_uverbs_attach_mcast, + [IB_USER_VERBS_CMD_DETACH_MCAST] = ib_uverbs_detach_mcast, + [IB_USER_VERBS_CMD_CREATE_SRQ] = ib_uverbs_create_srq, + [IB_USER_VERBS_CMD_MODIFY_SRQ] = ib_uverbs_modify_srq, + [IB_USER_VERBS_CMD_QUERY_SRQ] = ib_uverbs_query_srq, + [IB_USER_VERBS_CMD_DESTROY_SRQ] = ib_uverbs_destroy_srq, + [IB_USER_VERBS_CMD_OPEN_XRCD] = ib_uverbs_open_xrcd, + [IB_USER_VERBS_CMD_CLOSE_XRCD] = ib_uverbs_close_xrcd, + [IB_USER_VERBS_CMD_CREATE_XSRQ] = ib_uverbs_create_xsrq, + [IB_USER_VERBS_CMD_OPEN_QP] = ib_uverbs_open_qp, +}; + +static int (*uverbs_ex_cmd_table[])(struct ib_uverbs_file *file, + struct ib_udata *ucore, + struct ib_udata *uhw) = { + [IB_USER_VERBS_EX_CMD_CREATE_FLOW] = ib_uverbs_ex_create_flow, + [IB_USER_VERBS_EX_CMD_DESTROY_FLOW] = ib_uverbs_ex_destroy_flow +}; + +static void ib_uverbs_add_one(struct ib_device *device); +static void ib_uverbs_remove_one(struct ib_device *device); + +static void ib_uverbs_release_dev(struct kref *ref) +{ + struct ib_uverbs_device *dev = + container_of(ref, struct ib_uverbs_device, ref); + + complete(&dev->comp); +} + +static void ib_uverbs_release_event_file(struct kref *ref) +{ + struct ib_uverbs_event_file *file = + container_of(ref, struct ib_uverbs_event_file, ref); + + kfree(file); +} + +void ib_uverbs_release_ucq(struct ib_uverbs_file *file, + struct ib_uverbs_event_file *ev_file, + struct ib_ucq_object *uobj) +{ + struct ib_uverbs_event *evt, *tmp; + + if (ev_file) { + spin_lock_irq(&ev_file->lock); + list_for_each_entry_safe(evt, tmp, &uobj->comp_list, obj_list) { + list_del(&evt->list); + kfree(evt); + } + spin_unlock_irq(&ev_file->lock); + + kref_put(&ev_file->ref, ib_uverbs_release_event_file); + } + + spin_lock_irq(&file->async_file->lock); + list_for_each_entry_safe(evt, tmp, &uobj->async_list, obj_list) { + list_del(&evt->list); + kfree(evt); + } + spin_unlock_irq(&file->async_file->lock); +} + +void ib_uverbs_release_uevent(struct ib_uverbs_file *file, + struct ib_uevent_object *uobj) +{ + struct ib_uverbs_event *evt, *tmp; + + spin_lock_irq(&file->async_file->lock); + list_for_each_entry_safe(evt, tmp, &uobj->event_list, obj_list) { + list_del(&evt->list); + kfree(evt); + } + spin_unlock_irq(&file->async_file->lock); +} + +static void ib_uverbs_detach_umcast(struct ib_qp *qp, + struct ib_uqp_object *uobj) +{ + struct ib_uverbs_mcast_entry *mcast, *tmp; + + list_for_each_entry_safe(mcast, tmp, &uobj->mcast_list, list) { + ib_detach_mcast(qp, &mcast->gid, mcast->lid); + list_del(&mcast->list); + kfree(mcast); + } +} + +static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file, + struct ib_ucontext *context) +{ + struct ib_uobject *uobj, *tmp; + + if (!context) + return 0; + + context->closing = 1; + + list_for_each_entry_safe(uobj, tmp, &context->ah_list, list) { + struct ib_ah *ah = uobj->object; + + idr_remove_uobj(&ib_uverbs_ah_idr, uobj); + ib_destroy_ah(ah); + kfree(uobj); + } + + /* Remove MWs before QPs, in order to support type 2A MWs. */ + list_for_each_entry_safe(uobj, tmp, &context->mw_list, list) { + struct ib_mw *mw = uobj->object; + + idr_remove_uobj(&ib_uverbs_mw_idr, uobj); + ib_dealloc_mw(mw); + kfree(uobj); + } + + list_for_each_entry_safe(uobj, tmp, &context->rule_list, list) { + struct ib_flow *flow_id = uobj->object; + + idr_remove_uobj(&ib_uverbs_rule_idr, uobj); + ib_destroy_flow(flow_id); + kfree(uobj); + } + + list_for_each_entry_safe(uobj, tmp, &context->qp_list, list) { + struct ib_qp *qp = uobj->object; + struct ib_uqp_object *uqp = + container_of(uobj, struct ib_uqp_object, uevent.uobject); + + idr_remove_uobj(&ib_uverbs_qp_idr, uobj); + if (qp != qp->real_qp) { + ib_close_qp(qp); + } else { + ib_uverbs_detach_umcast(qp, uqp); + ib_destroy_qp(qp); + } + ib_uverbs_release_uevent(file, &uqp->uevent); + kfree(uqp); + } + + list_for_each_entry_safe(uobj, tmp, &context->cq_list, list) { + struct ib_cq *cq = uobj->object; + struct ib_uverbs_event_file *ev_file = cq->cq_context; + struct ib_ucq_object *ucq = + container_of(uobj, struct ib_ucq_object, uobject); + + idr_remove_uobj(&ib_uverbs_cq_idr, uobj); + ib_destroy_cq(cq); + ib_uverbs_release_ucq(file, ev_file, ucq); + kfree(ucq); + } + + list_for_each_entry_safe(uobj, tmp, &context->srq_list, list) { + struct ib_srq *srq = uobj->object; + struct ib_uevent_object *uevent = + container_of(uobj, struct ib_uevent_object, uobject); + + idr_remove_uobj(&ib_uverbs_srq_idr, uobj); + ib_destroy_srq(srq); + ib_uverbs_release_uevent(file, uevent); + kfree(uevent); + } + + list_for_each_entry_safe(uobj, tmp, &context->mr_list, list) { + struct ib_mr *mr = uobj->object; + + idr_remove_uobj(&ib_uverbs_mr_idr, uobj); + ib_dereg_mr(mr); + kfree(uobj); + } + + mutex_lock(&file->device->xrcd_tree_mutex); + list_for_each_entry_safe(uobj, tmp, &context->xrcd_list, list) { + struct ib_xrcd *xrcd = uobj->object; + struct ib_uxrcd_object *uxrcd = + container_of(uobj, struct ib_uxrcd_object, uobject); + + idr_remove_uobj(&ib_uverbs_xrcd_idr, uobj); + ib_uverbs_dealloc_xrcd(file->device, xrcd); + kfree(uxrcd); + } + mutex_unlock(&file->device->xrcd_tree_mutex); + + list_for_each_entry_safe(uobj, tmp, &context->pd_list, list) { + struct ib_pd *pd = uobj->object; + + idr_remove_uobj(&ib_uverbs_pd_idr, uobj); + ib_dealloc_pd(pd); + kfree(uobj); + } + + return context->device->dealloc_ucontext(context); +} + +static void ib_uverbs_release_file(struct kref *ref) +{ + struct ib_uverbs_file *file = + container_of(ref, struct ib_uverbs_file, ref); + + module_put(file->device->ib_dev->owner); + kref_put(&file->device->ref, ib_uverbs_release_dev); + + kfree(file); +} + +static ssize_t ib_uverbs_event_read(struct file *filp, char __user *buf, + size_t count, loff_t *pos) +{ + struct ib_uverbs_event_file *file = filp->private_data; + struct ib_uverbs_event *event; + int eventsz; + int ret = 0; + + spin_lock_irq(&file->lock); + + while (list_empty(&file->event_list)) { + spin_unlock_irq(&file->lock); + + if (filp->f_flags & O_NONBLOCK) + return -EAGAIN; + + if (wait_event_interruptible(file->poll_wait, + !list_empty(&file->event_list))) + return -ERESTARTSYS; + + spin_lock_irq(&file->lock); + } + + event = list_entry(file->event_list.next, struct ib_uverbs_event, list); + + if (file->is_async) + eventsz = sizeof (struct ib_uverbs_async_event_desc); + else + eventsz = sizeof (struct ib_uverbs_comp_event_desc); + + if (eventsz > count) { + ret = -EINVAL; + event = NULL; + } else { + list_del(file->event_list.next); + if (event->counter) { + ++(*event->counter); + list_del(&event->obj_list); + } + } + + spin_unlock_irq(&file->lock); + + if (event) { + if (copy_to_user(buf, event, eventsz)) + ret = -EFAULT; + else + ret = eventsz; + } + + kfree(event); + + return ret; +} + +static unsigned int ib_uverbs_event_poll(struct file *filp, + struct poll_table_struct *wait) +{ + unsigned int pollflags = 0; + struct ib_uverbs_event_file *file = filp->private_data; + + poll_wait(filp, &file->poll_wait, wait); + + spin_lock_irq(&file->lock); + if (!list_empty(&file->event_list)) + pollflags = POLLIN | POLLRDNORM; + spin_unlock_irq(&file->lock); + + return pollflags; +} + +static int ib_uverbs_event_fasync(int fd, struct file *filp, int on) +{ + struct ib_uverbs_event_file *file = filp->private_data; + + return fasync_helper(fd, filp, on, &file->async_queue); +} + +static int ib_uverbs_event_close(struct inode *inode, struct file *filp) +{ + struct ib_uverbs_event_file *file = filp->private_data; + struct ib_uverbs_event *entry, *tmp; + + spin_lock_irq(&file->lock); + file->is_closed = 1; + list_for_each_entry_safe(entry, tmp, &file->event_list, list) { + if (entry->counter) + list_del(&entry->obj_list); + kfree(entry); + } + spin_unlock_irq(&file->lock); + + if (file->is_async) { + ib_unregister_event_handler(&file->uverbs_file->event_handler); + kref_put(&file->uverbs_file->ref, ib_uverbs_release_file); + } + kref_put(&file->ref, ib_uverbs_release_event_file); + + return 0; +} + +static const struct file_operations uverbs_event_fops = { + .owner = THIS_MODULE, + .read = ib_uverbs_event_read, + .poll = ib_uverbs_event_poll, + .release = ib_uverbs_event_close, + .fasync = ib_uverbs_event_fasync, + .llseek = no_llseek, +}; + +void ib_uverbs_comp_handler(struct ib_cq *cq, void *cq_context) +{ + struct ib_uverbs_event_file *file = cq_context; + struct ib_ucq_object *uobj; + struct ib_uverbs_event *entry; + unsigned long flags; + + if (!file) + return; + + spin_lock_irqsave(&file->lock, flags); + if (file->is_closed) { + spin_unlock_irqrestore(&file->lock, flags); + return; + } + + entry = kmalloc(sizeof *entry, GFP_ATOMIC); + if (!entry) { + spin_unlock_irqrestore(&file->lock, flags); + return; + } + + uobj = container_of(cq->uobject, struct ib_ucq_object, uobject); + + entry->desc.comp.cq_handle = cq->uobject->user_handle; + entry->counter = &uobj->comp_events_reported; + + list_add_tail(&entry->list, &file->event_list); + list_add_tail(&entry->obj_list, &uobj->comp_list); + spin_unlock_irqrestore(&file->lock, flags); + + wake_up_interruptible(&file->poll_wait); + kill_fasync(&file->async_queue, SIGIO, POLL_IN); +} + +static void ib_uverbs_async_handler(struct ib_uverbs_file *file, + __u64 element, __u64 event, + struct list_head *obj_list, + u32 *counter) +{ + struct ib_uverbs_event *entry; + unsigned long flags; + + spin_lock_irqsave(&file->async_file->lock, flags); + if (file->async_file->is_closed) { + spin_unlock_irqrestore(&file->async_file->lock, flags); + return; + } + + entry = kmalloc(sizeof *entry, GFP_ATOMIC); + if (!entry) { + spin_unlock_irqrestore(&file->async_file->lock, flags); + return; + } + + entry->desc.async.element = element; + entry->desc.async.event_type = event; + entry->desc.async.reserved = 0; + entry->counter = counter; + + list_add_tail(&entry->list, &file->async_file->event_list); + if (obj_list) + list_add_tail(&entry->obj_list, obj_list); + spin_unlock_irqrestore(&file->async_file->lock, flags); + + wake_up_interruptible(&file->async_file->poll_wait); + kill_fasync(&file->async_file->async_queue, SIGIO, POLL_IN); +} + +void ib_uverbs_cq_event_handler(struct ib_event *event, void *context_ptr) +{ + struct ib_ucq_object *uobj = container_of(event->element.cq->uobject, + struct ib_ucq_object, uobject); + + ib_uverbs_async_handler(uobj->uverbs_file, uobj->uobject.user_handle, + event->event, &uobj->async_list, + &uobj->async_events_reported); +} + +void ib_uverbs_qp_event_handler(struct ib_event *event, void *context_ptr) +{ + struct ib_uevent_object *uobj; + + /* for XRC target qp's, check that qp is live */ + if (!event->element.qp->uobject || !event->element.qp->uobject->live) + return; + + uobj = container_of(event->element.qp->uobject, + struct ib_uevent_object, uobject); + + ib_uverbs_async_handler(context_ptr, uobj->uobject.user_handle, + event->event, &uobj->event_list, + &uobj->events_reported); +} + +void ib_uverbs_srq_event_handler(struct ib_event *event, void *context_ptr) +{ + struct ib_uevent_object *uobj; + + uobj = container_of(event->element.srq->uobject, + struct ib_uevent_object, uobject); + + ib_uverbs_async_handler(context_ptr, uobj->uobject.user_handle, + event->event, &uobj->event_list, + &uobj->events_reported); +} + +void ib_uverbs_event_handler(struct ib_event_handler *handler, + struct ib_event *event) +{ + struct ib_uverbs_file *file = + container_of(handler, struct ib_uverbs_file, event_handler); + + ib_uverbs_async_handler(file, event->element.port_num, event->event, + NULL, NULL); +} + +struct file *ib_uverbs_alloc_event_file(struct ib_uverbs_file *uverbs_file, + int is_async) +{ + struct ib_uverbs_event_file *ev_file; + struct file *filp; + + ev_file = kmalloc(sizeof *ev_file, GFP_KERNEL); + if (!ev_file) + return ERR_PTR(-ENOMEM); + + kref_init(&ev_file->ref); + spin_lock_init(&ev_file->lock); + INIT_LIST_HEAD(&ev_file->event_list); + init_waitqueue_head(&ev_file->poll_wait); + ev_file->uverbs_file = uverbs_file; + ev_file->async_queue = NULL; + ev_file->is_async = is_async; + ev_file->is_closed = 0; + + filp = anon_inode_getfile("[infinibandevent]", &uverbs_event_fops, + ev_file, O_RDONLY); + if (IS_ERR(filp)) + kfree(ev_file); + + return filp; +} + +/* + * Look up a completion event file by FD. If lookup is successful, + * takes a ref to the event file struct that it returns; if + * unsuccessful, returns NULL. + */ +struct ib_uverbs_event_file *ib_uverbs_lookup_comp_file(int fd) +{ + struct ib_uverbs_event_file *ev_file = NULL; + struct fd f = fdget(fd); + + if (!f.file) + return NULL; + + if (f.file->f_op != &uverbs_event_fops) + goto out; + + ev_file = f.file->private_data; + if (ev_file->is_async) { + ev_file = NULL; + goto out; + } + + kref_get(&ev_file->ref); + +out: + fdput(f); + return ev_file; +} + +static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf, + size_t count, loff_t *pos) +{ + struct ib_uverbs_file *file = filp->private_data; + struct ib_uverbs_cmd_hdr hdr; + __u32 flags; + + if (count < sizeof hdr) + return -EINVAL; + + if (copy_from_user(&hdr, buf, sizeof hdr)) + return -EFAULT; + + flags = (hdr.command & + IB_USER_VERBS_CMD_FLAGS_MASK) >> IB_USER_VERBS_CMD_FLAGS_SHIFT; + + if (!flags) { + __u32 command; + + if (hdr.command & ~(__u32)(IB_USER_VERBS_CMD_FLAGS_MASK | + IB_USER_VERBS_CMD_COMMAND_MASK)) + return -EINVAL; + + command = hdr.command & IB_USER_VERBS_CMD_COMMAND_MASK; + + if (command >= ARRAY_SIZE(uverbs_cmd_table) || + !uverbs_cmd_table[command]) + return -EINVAL; + + if (!file->ucontext && + command != IB_USER_VERBS_CMD_GET_CONTEXT) + return -EINVAL; + + if (!(file->device->ib_dev->uverbs_cmd_mask & (1ull << command))) + return -ENOSYS; + + if (hdr.in_words * 4 != count) + return -EINVAL; + + return uverbs_cmd_table[command](file, + buf + sizeof(hdr), + hdr.in_words * 4, + hdr.out_words * 4); + + } else if (flags == IB_USER_VERBS_CMD_FLAG_EXTENDED) { + __u32 command; + + struct ib_uverbs_ex_cmd_hdr ex_hdr; + struct ib_udata ucore; + struct ib_udata uhw; + int err; + size_t written_count = count; + + if (hdr.command & ~(__u32)(IB_USER_VERBS_CMD_FLAGS_MASK | + IB_USER_VERBS_CMD_COMMAND_MASK)) + return -EINVAL; + + command = hdr.command & IB_USER_VERBS_CMD_COMMAND_MASK; + + if (command >= ARRAY_SIZE(uverbs_ex_cmd_table) || + !uverbs_ex_cmd_table[command]) + return -ENOSYS; + + if (!file->ucontext) + return -EINVAL; + + if (!(file->device->ib_dev->uverbs_ex_cmd_mask & (1ull << command))) + return -ENOSYS; + + if (count < (sizeof(hdr) + sizeof(ex_hdr))) + return -EINVAL; + + if (copy_from_user(&ex_hdr, buf + sizeof(hdr), sizeof(ex_hdr))) + return -EFAULT; + + count -= sizeof(hdr) + sizeof(ex_hdr); + buf += sizeof(hdr) + sizeof(ex_hdr); + + if ((hdr.in_words + ex_hdr.provider_in_words) * 8 != count) + return -EINVAL; + + if (ex_hdr.cmd_hdr_reserved) + return -EINVAL; + + if (ex_hdr.response) { + if (!hdr.out_words && !ex_hdr.provider_out_words) + return -EINVAL; + + if (!access_ok(VERIFY_WRITE, + (void __user *) (unsigned long) ex_hdr.response, + (hdr.out_words + ex_hdr.provider_out_words) * 8)) + return -EFAULT; + } else { + if (hdr.out_words || ex_hdr.provider_out_words) + return -EINVAL; + } + + INIT_UDATA_BUF_OR_NULL(&ucore, buf, (unsigned long) ex_hdr.response, + hdr.in_words * 8, hdr.out_words * 8); + + INIT_UDATA_BUF_OR_NULL(&uhw, + buf + ucore.inlen, + (unsigned long) ex_hdr.response + ucore.outlen, + ex_hdr.provider_in_words * 8, + ex_hdr.provider_out_words * 8); + + err = uverbs_ex_cmd_table[command](file, + &ucore, + &uhw); + + if (err) + return err; + + return written_count; + } + + return -ENOSYS; +} + +static int ib_uverbs_mmap(struct file *filp, struct vm_area_struct *vma) +{ + struct ib_uverbs_file *file = filp->private_data; + + if (!file->ucontext) + return -ENODEV; + else + return file->device->ib_dev->mmap(file->ucontext, vma); +} + +/* + * ib_uverbs_open() does not need the BKL: + * + * - the ib_uverbs_device structures are properly reference counted and + * everything else is purely local to the file being created, so + * races against other open calls are not a problem; + * - there is no ioctl method to race against; + * - the open method will either immediately run -ENXIO, or all + * required initialization will be done. + */ +static int ib_uverbs_open(struct inode *inode, struct file *filp) +{ + struct ib_uverbs_device *dev; + struct ib_uverbs_file *file; + int ret; + + dev = container_of(inode->i_cdev, struct ib_uverbs_device, cdev); + if (dev) + kref_get(&dev->ref); + else + return -ENXIO; + + if (!try_module_get(dev->ib_dev->owner)) { + ret = -ENODEV; + goto err; + } + + file = kmalloc(sizeof *file, GFP_KERNEL); + if (!file) { + ret = -ENOMEM; + goto err_module; + } + + file->device = dev; + file->ucontext = NULL; + file->async_file = NULL; + kref_init(&file->ref); + mutex_init(&file->mutex); + + filp->private_data = file; + + return nonseekable_open(inode, filp); + +err_module: + module_put(dev->ib_dev->owner); + +err: + kref_put(&dev->ref, ib_uverbs_release_dev); + return ret; +} + +static int ib_uverbs_close(struct inode *inode, struct file *filp) +{ + struct ib_uverbs_file *file = filp->private_data; + + ib_uverbs_cleanup_ucontext(file, file->ucontext); + + if (file->async_file) + kref_put(&file->async_file->ref, ib_uverbs_release_event_file); + + kref_put(&file->ref, ib_uverbs_release_file); + + return 0; +} + +static const struct file_operations uverbs_fops = { + .owner = THIS_MODULE, + .write = ib_uverbs_write, + .open = ib_uverbs_open, + .release = ib_uverbs_close, + .llseek = no_llseek, +}; + +static const struct file_operations uverbs_mmap_fops = { + .owner = THIS_MODULE, + .write = ib_uverbs_write, + .mmap = ib_uverbs_mmap, + .open = ib_uverbs_open, + .release = ib_uverbs_close, + .llseek = no_llseek, +}; + +static struct ib_client uverbs_client = { + .name = "uverbs", + .add = ib_uverbs_add_one, + .remove = ib_uverbs_remove_one +}; + +static ssize_t show_ibdev(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct ib_uverbs_device *dev = dev_get_drvdata(device); + + if (!dev) + return -ENODEV; + + return sprintf(buf, "%s\n", dev->ib_dev->name); +} +static DEVICE_ATTR(ibdev, S_IRUGO, show_ibdev, NULL); + +static ssize_t show_dev_abi_version(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct ib_uverbs_device *dev = dev_get_drvdata(device); + + if (!dev) + return -ENODEV; + + return sprintf(buf, "%d\n", dev->ib_dev->uverbs_abi_ver); +} +static DEVICE_ATTR(abi_version, S_IRUGO, show_dev_abi_version, NULL); + +static CLASS_ATTR_STRING(abi_version, S_IRUGO, + __stringify(IB_USER_VERBS_ABI_VERSION)); + +static dev_t overflow_maj; +static DECLARE_BITMAP(overflow_map, IB_UVERBS_MAX_DEVICES); + +/* + * If we have more than IB_UVERBS_MAX_DEVICES, dynamically overflow by + * requesting a new major number and doubling the number of max devices we + * support. It's stupid, but simple. + */ +static int find_overflow_devnum(void) +{ + int ret; + + if (!overflow_maj) { + ret = alloc_chrdev_region(&overflow_maj, 0, IB_UVERBS_MAX_DEVICES, + "infiniband_verbs"); + if (ret) { + printk(KERN_ERR "user_verbs: couldn't register dynamic device number\n"); + return ret; + } + } + + ret = find_first_zero_bit(overflow_map, IB_UVERBS_MAX_DEVICES); + if (ret >= IB_UVERBS_MAX_DEVICES) + return -1; + + return ret; +} + +static void ib_uverbs_add_one(struct ib_device *device) +{ + int devnum; + dev_t base; + struct ib_uverbs_device *uverbs_dev; + + if (!device->alloc_ucontext) + return; + + uverbs_dev = kzalloc(sizeof *uverbs_dev, GFP_KERNEL); + if (!uverbs_dev) + return; + + kref_init(&uverbs_dev->ref); + init_completion(&uverbs_dev->comp); + uverbs_dev->xrcd_tree = RB_ROOT; + mutex_init(&uverbs_dev->xrcd_tree_mutex); + + spin_lock(&map_lock); + devnum = find_first_zero_bit(dev_map, IB_UVERBS_MAX_DEVICES); + if (devnum >= IB_UVERBS_MAX_DEVICES) { + spin_unlock(&map_lock); + devnum = find_overflow_devnum(); + if (devnum < 0) + goto err; + + spin_lock(&map_lock); + uverbs_dev->devnum = devnum + IB_UVERBS_MAX_DEVICES; + base = devnum + overflow_maj; + set_bit(devnum, overflow_map); + } else { + uverbs_dev->devnum = devnum; + base = devnum + IB_UVERBS_BASE_DEV; + set_bit(devnum, dev_map); + } + spin_unlock(&map_lock); + + uverbs_dev->ib_dev = device; + uverbs_dev->num_comp_vectors = device->num_comp_vectors; + + cdev_init(&uverbs_dev->cdev, NULL); + uverbs_dev->cdev.owner = THIS_MODULE; + uverbs_dev->cdev.ops = device->mmap ? &uverbs_mmap_fops : &uverbs_fops; + kobject_set_name(&uverbs_dev->cdev.kobj, "uverbs%d", uverbs_dev->devnum); + if (cdev_add(&uverbs_dev->cdev, base, 1)) + goto err_cdev; + + uverbs_dev->dev = device_create(uverbs_class, device->dma_device, + uverbs_dev->cdev.dev, uverbs_dev, + "uverbs%d", uverbs_dev->devnum); + if (IS_ERR(uverbs_dev->dev)) + goto err_cdev; + + if (device_create_file(uverbs_dev->dev, &dev_attr_ibdev)) + goto err_class; + if (device_create_file(uverbs_dev->dev, &dev_attr_abi_version)) + goto err_class; + + ib_set_client_data(device, &uverbs_client, uverbs_dev); + + return; + +err_class: + device_destroy(uverbs_class, uverbs_dev->cdev.dev); + +err_cdev: + cdev_del(&uverbs_dev->cdev); + if (uverbs_dev->devnum < IB_UVERBS_MAX_DEVICES) + clear_bit(devnum, dev_map); + else + clear_bit(devnum, overflow_map); + +err: + kref_put(&uverbs_dev->ref, ib_uverbs_release_dev); + wait_for_completion(&uverbs_dev->comp); + kfree(uverbs_dev); + return; +} + +static void ib_uverbs_remove_one(struct ib_device *device) +{ + struct ib_uverbs_device *uverbs_dev = ib_get_client_data(device, &uverbs_client); + + if (!uverbs_dev) + return; + + dev_set_drvdata(uverbs_dev->dev, NULL); + device_destroy(uverbs_class, uverbs_dev->cdev.dev); + cdev_del(&uverbs_dev->cdev); + + if (uverbs_dev->devnum < IB_UVERBS_MAX_DEVICES) + clear_bit(uverbs_dev->devnum, dev_map); + else + clear_bit(uverbs_dev->devnum - IB_UVERBS_MAX_DEVICES, overflow_map); + + kref_put(&uverbs_dev->ref, ib_uverbs_release_dev); + wait_for_completion(&uverbs_dev->comp); + kfree(uverbs_dev); +} + +static char *uverbs_devnode(struct device *dev, umode_t *mode) +{ + if (mode) + *mode = 0666; + return kasprintf(GFP_KERNEL, "infiniband/%s", dev_name(dev)); +} + +static int __init ib_uverbs_init(void) +{ + int ret; + + ret = register_chrdev_region(IB_UVERBS_BASE_DEV, IB_UVERBS_MAX_DEVICES, + "infiniband_verbs"); + if (ret) { + printk(KERN_ERR "user_verbs: couldn't register device number\n"); + goto out; + } + + uverbs_class = class_create(THIS_MODULE, "infiniband_verbs"); + if (IS_ERR(uverbs_class)) { + ret = PTR_ERR(uverbs_class); + printk(KERN_ERR "user_verbs: couldn't create class infiniband_verbs\n"); + goto out_chrdev; + } + + uverbs_class->devnode = uverbs_devnode; + + ret = class_create_file(uverbs_class, &class_attr_abi_version.attr); + if (ret) { + printk(KERN_ERR "user_verbs: couldn't create abi_version attribute\n"); + goto out_class; + } + + ret = ib_register_client(&uverbs_client); + if (ret) { + printk(KERN_ERR "user_verbs: couldn't register client\n"); + goto out_class; + } + + return 0; + +out_class: + class_destroy(uverbs_class); + +out_chrdev: + unregister_chrdev_region(IB_UVERBS_BASE_DEV, IB_UVERBS_MAX_DEVICES); + +out: + return ret; +} + +static void __exit ib_uverbs_cleanup(void) +{ + ib_unregister_client(&uverbs_client); + class_destroy(uverbs_class); + unregister_chrdev_region(IB_UVERBS_BASE_DEV, IB_UVERBS_MAX_DEVICES); + if (overflow_maj) + unregister_chrdev_region(overflow_maj, IB_UVERBS_MAX_DEVICES); + idr_destroy(&ib_uverbs_pd_idr); + idr_destroy(&ib_uverbs_mr_idr); + idr_destroy(&ib_uverbs_mw_idr); + idr_destroy(&ib_uverbs_ah_idr); + idr_destroy(&ib_uverbs_cq_idr); + idr_destroy(&ib_uverbs_qp_idr); + idr_destroy(&ib_uverbs_srq_idr); +} + +module_init(ib_uverbs_init); +module_exit(ib_uverbs_cleanup); diff --git a/drivers/infiniband/core/uverbs_marshall.c b/drivers/infiniband/core/uverbs_marshall.c new file mode 100644 index 0000000..abd9724 --- /dev/null +++ b/drivers/infiniband/core/uverbs_marshall.c @@ -0,0 +1,148 @@ +/* + * Copyright (c) 2005 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include + +void ib_copy_ah_attr_to_user(struct ib_uverbs_ah_attr *dst, + struct ib_ah_attr *src) +{ + memcpy(dst->grh.dgid, src->grh.dgid.raw, sizeof src->grh.dgid); + dst->grh.flow_label = src->grh.flow_label; + dst->grh.sgid_index = src->grh.sgid_index; + dst->grh.hop_limit = src->grh.hop_limit; + dst->grh.traffic_class = src->grh.traffic_class; + memset(&dst->grh.reserved, 0, sizeof(dst->grh.reserved)); + dst->dlid = src->dlid; + dst->sl = src->sl; + dst->src_path_bits = src->src_path_bits; + dst->static_rate = src->static_rate; + dst->is_global = src->ah_flags & IB_AH_GRH ? 1 : 0; + dst->port_num = src->port_num; + dst->reserved = 0; +} +EXPORT_SYMBOL(ib_copy_ah_attr_to_user); + +void ib_copy_qp_attr_to_user(struct ib_uverbs_qp_attr *dst, + struct ib_qp_attr *src) +{ + dst->qp_state = src->qp_state; + dst->cur_qp_state = src->cur_qp_state; + dst->path_mtu = src->path_mtu; + dst->path_mig_state = src->path_mig_state; + dst->qkey = src->qkey; + dst->rq_psn = src->rq_psn; + dst->sq_psn = src->sq_psn; + dst->dest_qp_num = src->dest_qp_num; + dst->qp_access_flags = src->qp_access_flags; + + dst->max_send_wr = src->cap.max_send_wr; + dst->max_recv_wr = src->cap.max_recv_wr; + dst->max_send_sge = src->cap.max_send_sge; + dst->max_recv_sge = src->cap.max_recv_sge; + dst->max_inline_data = src->cap.max_inline_data; + + ib_copy_ah_attr_to_user(&dst->ah_attr, &src->ah_attr); + ib_copy_ah_attr_to_user(&dst->alt_ah_attr, &src->alt_ah_attr); + + dst->pkey_index = src->pkey_index; + dst->alt_pkey_index = src->alt_pkey_index; + dst->en_sqd_async_notify = src->en_sqd_async_notify; + dst->sq_draining = src->sq_draining; + dst->max_rd_atomic = src->max_rd_atomic; + dst->max_dest_rd_atomic = src->max_dest_rd_atomic; + dst->min_rnr_timer = src->min_rnr_timer; + dst->port_num = src->port_num; + dst->timeout = src->timeout; + dst->retry_cnt = src->retry_cnt; + dst->rnr_retry = src->rnr_retry; + dst->alt_port_num = src->alt_port_num; + dst->alt_timeout = src->alt_timeout; + memset(dst->reserved, 0, sizeof(dst->reserved)); +} +EXPORT_SYMBOL(ib_copy_qp_attr_to_user); + +void ib_copy_path_rec_to_user(struct ib_user_path_rec *dst, + struct ib_sa_path_rec *src) +{ + memcpy(dst->dgid, src->dgid.raw, sizeof src->dgid); + memcpy(dst->sgid, src->sgid.raw, sizeof src->sgid); + + dst->dlid = src->dlid; + dst->slid = src->slid; + dst->raw_traffic = src->raw_traffic; + dst->flow_label = src->flow_label; + dst->hop_limit = src->hop_limit; + dst->traffic_class = src->traffic_class; + dst->reversible = src->reversible; + dst->numb_path = src->numb_path; + dst->pkey = src->pkey; + dst->sl = src->sl; + dst->mtu_selector = src->mtu_selector; + dst->mtu = src->mtu; + dst->rate_selector = src->rate_selector; + dst->rate = src->rate; + dst->packet_life_time = src->packet_life_time; + dst->preference = src->preference; + dst->packet_life_time_selector = src->packet_life_time_selector; +} +EXPORT_SYMBOL(ib_copy_path_rec_to_user); + +void ib_copy_path_rec_from_user(struct ib_sa_path_rec *dst, + struct ib_user_path_rec *src) +{ + memcpy(dst->dgid.raw, src->dgid, sizeof dst->dgid); + memcpy(dst->sgid.raw, src->sgid, sizeof dst->sgid); + + dst->dlid = src->dlid; + dst->slid = src->slid; + dst->raw_traffic = src->raw_traffic; + dst->flow_label = src->flow_label; + dst->hop_limit = src->hop_limit; + dst->traffic_class = src->traffic_class; + dst->reversible = src->reversible; + dst->numb_path = src->numb_path; + dst->pkey = src->pkey; + dst->sl = src->sl; + dst->mtu_selector = src->mtu_selector; + dst->mtu = src->mtu; + dst->rate_selector = src->rate_selector; + dst->rate = src->rate; + dst->packet_life_time = src->packet_life_time; + dst->preference = src->preference; + dst->packet_life_time_selector = src->packet_life_time_selector; + + memset(dst->smac, 0, sizeof(dst->smac)); + memset(dst->dmac, 0, sizeof(dst->dmac)); + dst->vlan_id = 0xffff; +} +EXPORT_SYMBOL(ib_copy_path_rec_from_user); diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c new file mode 100644 index 0000000..c2b89cc --- /dev/null +++ b/drivers/infiniband/core/verbs.c @@ -0,0 +1,1447 @@ +/* + * Copyright (c) 2004 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2004 Infinicon Corporation. All rights reserved. + * Copyright (c) 2004 Intel Corporation. All rights reserved. + * Copyright (c) 2004 Topspin Corporation. All rights reserved. + * Copyright (c) 2004 Voltaire Corporation. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "core_priv.h" + +__attribute_const__ int ib_rate_to_mult(enum ib_rate rate) +{ + switch (rate) { + case IB_RATE_2_5_GBPS: return 1; + case IB_RATE_5_GBPS: return 2; + case IB_RATE_10_GBPS: return 4; + case IB_RATE_20_GBPS: return 8; + case IB_RATE_30_GBPS: return 12; + case IB_RATE_40_GBPS: return 16; + case IB_RATE_60_GBPS: return 24; + case IB_RATE_80_GBPS: return 32; + case IB_RATE_120_GBPS: return 48; + default: return -1; + } +} +EXPORT_SYMBOL(ib_rate_to_mult); + +__attribute_const__ enum ib_rate mult_to_ib_rate(int mult) +{ + switch (mult) { + case 1: return IB_RATE_2_5_GBPS; + case 2: return IB_RATE_5_GBPS; + case 4: return IB_RATE_10_GBPS; + case 8: return IB_RATE_20_GBPS; + case 12: return IB_RATE_30_GBPS; + case 16: return IB_RATE_40_GBPS; + case 24: return IB_RATE_60_GBPS; + case 32: return IB_RATE_80_GBPS; + case 48: return IB_RATE_120_GBPS; + default: return IB_RATE_PORT_CURRENT; + } +} +EXPORT_SYMBOL(mult_to_ib_rate); + +__attribute_const__ int ib_rate_to_mbps(enum ib_rate rate) +{ + switch (rate) { + case IB_RATE_2_5_GBPS: return 2500; + case IB_RATE_5_GBPS: return 5000; + case IB_RATE_10_GBPS: return 10000; + case IB_RATE_20_GBPS: return 20000; + case IB_RATE_30_GBPS: return 30000; + case IB_RATE_40_GBPS: return 40000; + case IB_RATE_60_GBPS: return 60000; + case IB_RATE_80_GBPS: return 80000; + case IB_RATE_120_GBPS: return 120000; + case IB_RATE_14_GBPS: return 14062; + case IB_RATE_56_GBPS: return 56250; + case IB_RATE_112_GBPS: return 112500; + case IB_RATE_168_GBPS: return 168750; + case IB_RATE_25_GBPS: return 25781; + case IB_RATE_100_GBPS: return 103125; + case IB_RATE_200_GBPS: return 206250; + case IB_RATE_300_GBPS: return 309375; + default: return -1; + } +} +EXPORT_SYMBOL(ib_rate_to_mbps); + +__attribute_const__ enum rdma_transport_type +rdma_node_get_transport(enum rdma_node_type node_type) +{ + switch (node_type) { + case RDMA_NODE_IB_CA: + case RDMA_NODE_IB_SWITCH: + case RDMA_NODE_IB_ROUTER: + return RDMA_TRANSPORT_IB; + case RDMA_NODE_RNIC: + return RDMA_TRANSPORT_IWARP; + case RDMA_NODE_USNIC: + return RDMA_TRANSPORT_USNIC; + case RDMA_NODE_USNIC_UDP: + return RDMA_TRANSPORT_USNIC_UDP; + default: + BUG(); + return 0; + } +} +EXPORT_SYMBOL(rdma_node_get_transport); + +enum rdma_link_layer rdma_port_get_link_layer(struct ib_device *device, u8 port_num) +{ + if (device->get_link_layer) + return device->get_link_layer(device, port_num); + + switch (rdma_node_get_transport(device->node_type)) { + case RDMA_TRANSPORT_IB: + return IB_LINK_LAYER_INFINIBAND; + case RDMA_TRANSPORT_IWARP: + case RDMA_TRANSPORT_USNIC: + case RDMA_TRANSPORT_USNIC_UDP: + return IB_LINK_LAYER_ETHERNET; + default: + return IB_LINK_LAYER_UNSPECIFIED; + } +} +EXPORT_SYMBOL(rdma_port_get_link_layer); + +/* Protection domains */ + +struct ib_pd *ib_alloc_pd(struct ib_device *device) +{ + struct ib_pd *pd; + + pd = device->alloc_pd(device, NULL, NULL); + + if (!IS_ERR(pd)) { + pd->device = device; + pd->uobject = NULL; + atomic_set(&pd->usecnt, 0); + } + + return pd; +} +EXPORT_SYMBOL(ib_alloc_pd); + +int ib_dealloc_pd(struct ib_pd *pd) +{ + if (atomic_read(&pd->usecnt)) + return -EBUSY; + + return pd->device->dealloc_pd(pd); +} +EXPORT_SYMBOL(ib_dealloc_pd); + +/* Address handles */ + +struct ib_ah *ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr) +{ + struct ib_ah *ah; + + ah = pd->device->create_ah(pd, ah_attr); + + if (!IS_ERR(ah)) { + ah->device = pd->device; + ah->pd = pd; + ah->uobject = NULL; + atomic_inc(&pd->usecnt); + } + + return ah; +} +EXPORT_SYMBOL(ib_create_ah); + +int ib_init_ah_from_wc(struct ib_device *device, u8 port_num, struct ib_wc *wc, + struct ib_grh *grh, struct ib_ah_attr *ah_attr) +{ + u32 flow_class; + u16 gid_index; + int ret; + int is_eth = (rdma_port_get_link_layer(device, port_num) == + IB_LINK_LAYER_ETHERNET); + + memset(ah_attr, 0, sizeof *ah_attr); + if (is_eth) { + if (!(wc->wc_flags & IB_WC_GRH)) + return -EPROTOTYPE; + + if (wc->wc_flags & IB_WC_WITH_SMAC && + wc->wc_flags & IB_WC_WITH_VLAN) { + memcpy(ah_attr->dmac, wc->smac, ETH_ALEN); + ah_attr->vlan_id = wc->vlan_id; + } else { + ret = rdma_addr_find_dmac_by_grh(&grh->dgid, &grh->sgid, + ah_attr->dmac, &ah_attr->vlan_id); + if (ret) + return ret; + } + } else { + ah_attr->vlan_id = 0xffff; + } + + ah_attr->dlid = wc->slid; + ah_attr->sl = wc->sl; + ah_attr->src_path_bits = wc->dlid_path_bits; + ah_attr->port_num = port_num; + + if (wc->wc_flags & IB_WC_GRH) { + ah_attr->ah_flags = IB_AH_GRH; + ah_attr->grh.dgid = grh->sgid; + + ret = ib_find_cached_gid(device, &grh->dgid, &port_num, + &gid_index); + if (ret) + return ret; + + ah_attr->grh.sgid_index = (u8) gid_index; + flow_class = be32_to_cpu(grh->version_tclass_flow); + ah_attr->grh.flow_label = flow_class & 0xFFFFF; + ah_attr->grh.hop_limit = 0xFF; + ah_attr->grh.traffic_class = (flow_class >> 20) & 0xFF; + } + return 0; +} +EXPORT_SYMBOL(ib_init_ah_from_wc); + +struct ib_ah *ib_create_ah_from_wc(struct ib_pd *pd, struct ib_wc *wc, + struct ib_grh *grh, u8 port_num) +{ + struct ib_ah_attr ah_attr; + int ret; + + ret = ib_init_ah_from_wc(pd->device, port_num, wc, grh, &ah_attr); + if (ret) + return ERR_PTR(ret); + + return ib_create_ah(pd, &ah_attr); +} +EXPORT_SYMBOL(ib_create_ah_from_wc); + +int ib_modify_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr) +{ + return ah->device->modify_ah ? + ah->device->modify_ah(ah, ah_attr) : + -ENOSYS; +} +EXPORT_SYMBOL(ib_modify_ah); + +int ib_query_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr) +{ + return ah->device->query_ah ? + ah->device->query_ah(ah, ah_attr) : + -ENOSYS; +} +EXPORT_SYMBOL(ib_query_ah); + +int ib_destroy_ah(struct ib_ah *ah) +{ + struct ib_pd *pd; + int ret; + + pd = ah->pd; + ret = ah->device->destroy_ah(ah); + if (!ret) + atomic_dec(&pd->usecnt); + + return ret; +} +EXPORT_SYMBOL(ib_destroy_ah); + +/* Shared receive queues */ + +struct ib_srq *ib_create_srq(struct ib_pd *pd, + struct ib_srq_init_attr *srq_init_attr) +{ + struct ib_srq *srq; + + if (!pd->device->create_srq) + return ERR_PTR(-ENOSYS); + + srq = pd->device->create_srq(pd, srq_init_attr, NULL); + + if (!IS_ERR(srq)) { + srq->device = pd->device; + srq->pd = pd; + srq->uobject = NULL; + srq->event_handler = srq_init_attr->event_handler; + srq->srq_context = srq_init_attr->srq_context; + srq->srq_type = srq_init_attr->srq_type; + if (srq->srq_type == IB_SRQT_XRC) { + srq->ext.xrc.xrcd = srq_init_attr->ext.xrc.xrcd; + srq->ext.xrc.cq = srq_init_attr->ext.xrc.cq; + atomic_inc(&srq->ext.xrc.xrcd->usecnt); + atomic_inc(&srq->ext.xrc.cq->usecnt); + } + atomic_inc(&pd->usecnt); + atomic_set(&srq->usecnt, 0); + } + + return srq; +} +EXPORT_SYMBOL(ib_create_srq); + +int ib_modify_srq(struct ib_srq *srq, + struct ib_srq_attr *srq_attr, + enum ib_srq_attr_mask srq_attr_mask) +{ + return srq->device->modify_srq ? + srq->device->modify_srq(srq, srq_attr, srq_attr_mask, NULL) : + -ENOSYS; +} +EXPORT_SYMBOL(ib_modify_srq); + +int ib_query_srq(struct ib_srq *srq, + struct ib_srq_attr *srq_attr) +{ + return srq->device->query_srq ? + srq->device->query_srq(srq, srq_attr) : -ENOSYS; +} +EXPORT_SYMBOL(ib_query_srq); + +int ib_destroy_srq(struct ib_srq *srq) +{ + struct ib_pd *pd; + enum ib_srq_type srq_type; + struct ib_xrcd *uninitialized_var(xrcd); + struct ib_cq *uninitialized_var(cq); + int ret; + + if (atomic_read(&srq->usecnt)) + return -EBUSY; + + pd = srq->pd; + srq_type = srq->srq_type; + if (srq_type == IB_SRQT_XRC) { + xrcd = srq->ext.xrc.xrcd; + cq = srq->ext.xrc.cq; + } + + ret = srq->device->destroy_srq(srq); + if (!ret) { + atomic_dec(&pd->usecnt); + if (srq_type == IB_SRQT_XRC) { + atomic_dec(&xrcd->usecnt); + atomic_dec(&cq->usecnt); + } + } + + return ret; +} +EXPORT_SYMBOL(ib_destroy_srq); + +/* Queue pairs */ + +static void __ib_shared_qp_event_handler(struct ib_event *event, void *context) +{ + struct ib_qp *qp = context; + unsigned long flags; + + spin_lock_irqsave(&qp->device->event_handler_lock, flags); + list_for_each_entry(event->element.qp, &qp->open_list, open_list) + if (event->element.qp->event_handler) + event->element.qp->event_handler(event, event->element.qp->qp_context); + spin_unlock_irqrestore(&qp->device->event_handler_lock, flags); +} + +static void __ib_insert_xrcd_qp(struct ib_xrcd *xrcd, struct ib_qp *qp) +{ + mutex_lock(&xrcd->tgt_qp_mutex); + list_add(&qp->xrcd_list, &xrcd->tgt_qp_list); + mutex_unlock(&xrcd->tgt_qp_mutex); +} + +static struct ib_qp *__ib_open_qp(struct ib_qp *real_qp, + void (*event_handler)(struct ib_event *, void *), + void *qp_context) +{ + struct ib_qp *qp; + unsigned long flags; + + qp = kzalloc(sizeof *qp, GFP_KERNEL); + if (!qp) + return ERR_PTR(-ENOMEM); + + qp->real_qp = real_qp; + atomic_inc(&real_qp->usecnt); + qp->device = real_qp->device; + qp->event_handler = event_handler; + qp->qp_context = qp_context; + qp->qp_num = real_qp->qp_num; + qp->qp_type = real_qp->qp_type; + + spin_lock_irqsave(&real_qp->device->event_handler_lock, flags); + list_add(&qp->open_list, &real_qp->open_list); + spin_unlock_irqrestore(&real_qp->device->event_handler_lock, flags); + + return qp; +} + +struct ib_qp *ib_open_qp(struct ib_xrcd *xrcd, + struct ib_qp_open_attr *qp_open_attr) +{ + struct ib_qp *qp, *real_qp; + + if (qp_open_attr->qp_type != IB_QPT_XRC_TGT) + return ERR_PTR(-EINVAL); + + qp = ERR_PTR(-EINVAL); + mutex_lock(&xrcd->tgt_qp_mutex); + list_for_each_entry(real_qp, &xrcd->tgt_qp_list, xrcd_list) { + if (real_qp->qp_num == qp_open_attr->qp_num) { + qp = __ib_open_qp(real_qp, qp_open_attr->event_handler, + qp_open_attr->qp_context); + break; + } + } + mutex_unlock(&xrcd->tgt_qp_mutex); + return qp; +} +EXPORT_SYMBOL(ib_open_qp); + +struct ib_qp *ib_create_qp(struct ib_pd *pd, + struct ib_qp_init_attr *qp_init_attr) +{ + struct ib_qp *qp, *real_qp; + struct ib_device *device; + + device = pd ? pd->device : qp_init_attr->xrcd->device; + qp = device->create_qp(pd, qp_init_attr, NULL); + + if (!IS_ERR(qp)) { + qp->device = device; + qp->real_qp = qp; + qp->uobject = NULL; + qp->qp_type = qp_init_attr->qp_type; + + atomic_set(&qp->usecnt, 0); + if (qp_init_attr->qp_type == IB_QPT_XRC_TGT) { + qp->event_handler = __ib_shared_qp_event_handler; + qp->qp_context = qp; + qp->pd = NULL; + qp->send_cq = qp->recv_cq = NULL; + qp->srq = NULL; + qp->xrcd = qp_init_attr->xrcd; + atomic_inc(&qp_init_attr->xrcd->usecnt); + INIT_LIST_HEAD(&qp->open_list); + + real_qp = qp; + qp = __ib_open_qp(real_qp, qp_init_attr->event_handler, + qp_init_attr->qp_context); + if (!IS_ERR(qp)) + __ib_insert_xrcd_qp(qp_init_attr->xrcd, real_qp); + else + real_qp->device->destroy_qp(real_qp); + } else { + qp->event_handler = qp_init_attr->event_handler; + qp->qp_context = qp_init_attr->qp_context; + if (qp_init_attr->qp_type == IB_QPT_XRC_INI) { + qp->recv_cq = NULL; + qp->srq = NULL; + } else { + qp->recv_cq = qp_init_attr->recv_cq; + atomic_inc(&qp_init_attr->recv_cq->usecnt); + qp->srq = qp_init_attr->srq; + if (qp->srq) + atomic_inc(&qp_init_attr->srq->usecnt); + } + + qp->pd = pd; + qp->send_cq = qp_init_attr->send_cq; + qp->xrcd = NULL; + + atomic_inc(&pd->usecnt); + atomic_inc(&qp_init_attr->send_cq->usecnt); + } + } + + return qp; +} +EXPORT_SYMBOL(ib_create_qp); + +static const struct { + int valid; + enum ib_qp_attr_mask req_param[IB_QPT_MAX]; + enum ib_qp_attr_mask req_param_add_eth[IB_QPT_MAX]; + enum ib_qp_attr_mask opt_param[IB_QPT_MAX]; + enum ib_qp_attr_mask opt_param_add_eth[IB_QPT_MAX]; +} qp_state_table[IB_QPS_ERR + 1][IB_QPS_ERR + 1] = { + [IB_QPS_RESET] = { + [IB_QPS_RESET] = { .valid = 1 }, + [IB_QPS_INIT] = { + .valid = 1, + .req_param = { + [IB_QPT_UD] = (IB_QP_PKEY_INDEX | + IB_QP_PORT | + IB_QP_QKEY), + [IB_QPT_RAW_PACKET] = IB_QP_PORT, + [IB_QPT_UC] = (IB_QP_PKEY_INDEX | + IB_QP_PORT | + IB_QP_ACCESS_FLAGS), + [IB_QPT_RC] = (IB_QP_PKEY_INDEX | + IB_QP_PORT | + IB_QP_ACCESS_FLAGS), + [IB_QPT_XRC_INI] = (IB_QP_PKEY_INDEX | + IB_QP_PORT | + IB_QP_ACCESS_FLAGS), + [IB_QPT_XRC_TGT] = (IB_QP_PKEY_INDEX | + IB_QP_PORT | + IB_QP_ACCESS_FLAGS), + [IB_QPT_SMI] = (IB_QP_PKEY_INDEX | + IB_QP_QKEY), + [IB_QPT_GSI] = (IB_QP_PKEY_INDEX | + IB_QP_QKEY), + } + }, + }, + [IB_QPS_INIT] = { + [IB_QPS_RESET] = { .valid = 1 }, + [IB_QPS_ERR] = { .valid = 1 }, + [IB_QPS_INIT] = { + .valid = 1, + .opt_param = { + [IB_QPT_UD] = (IB_QP_PKEY_INDEX | + IB_QP_PORT | + IB_QP_QKEY), + [IB_QPT_UC] = (IB_QP_PKEY_INDEX | + IB_QP_PORT | + IB_QP_ACCESS_FLAGS), + [IB_QPT_RC] = (IB_QP_PKEY_INDEX | + IB_QP_PORT | + IB_QP_ACCESS_FLAGS), + [IB_QPT_XRC_INI] = (IB_QP_PKEY_INDEX | + IB_QP_PORT | + IB_QP_ACCESS_FLAGS), + [IB_QPT_XRC_TGT] = (IB_QP_PKEY_INDEX | + IB_QP_PORT | + IB_QP_ACCESS_FLAGS), + [IB_QPT_SMI] = (IB_QP_PKEY_INDEX | + IB_QP_QKEY), + [IB_QPT_GSI] = (IB_QP_PKEY_INDEX | + IB_QP_QKEY), + } + }, + [IB_QPS_RTR] = { + .valid = 1, + .req_param = { + [IB_QPT_UC] = (IB_QP_AV | + IB_QP_PATH_MTU | + IB_QP_DEST_QPN | + IB_QP_RQ_PSN), + [IB_QPT_RC] = (IB_QP_AV | + IB_QP_PATH_MTU | + IB_QP_DEST_QPN | + IB_QP_RQ_PSN | + IB_QP_MAX_DEST_RD_ATOMIC | + IB_QP_MIN_RNR_TIMER), + [IB_QPT_XRC_INI] = (IB_QP_AV | + IB_QP_PATH_MTU | + IB_QP_DEST_QPN | + IB_QP_RQ_PSN), + [IB_QPT_XRC_TGT] = (IB_QP_AV | + IB_QP_PATH_MTU | + IB_QP_DEST_QPN | + IB_QP_RQ_PSN | + IB_QP_MAX_DEST_RD_ATOMIC | + IB_QP_MIN_RNR_TIMER), + }, + .req_param_add_eth = { + [IB_QPT_RC] = (IB_QP_SMAC), + [IB_QPT_UC] = (IB_QP_SMAC), + [IB_QPT_XRC_INI] = (IB_QP_SMAC), + [IB_QPT_XRC_TGT] = (IB_QP_SMAC) + }, + .opt_param = { + [IB_QPT_UD] = (IB_QP_PKEY_INDEX | + IB_QP_QKEY), + [IB_QPT_UC] = (IB_QP_ALT_PATH | + IB_QP_ACCESS_FLAGS | + IB_QP_PKEY_INDEX), + [IB_QPT_RC] = (IB_QP_ALT_PATH | + IB_QP_ACCESS_FLAGS | + IB_QP_PKEY_INDEX), + [IB_QPT_XRC_INI] = (IB_QP_ALT_PATH | + IB_QP_ACCESS_FLAGS | + IB_QP_PKEY_INDEX), + [IB_QPT_XRC_TGT] = (IB_QP_ALT_PATH | + IB_QP_ACCESS_FLAGS | + IB_QP_PKEY_INDEX), + [IB_QPT_SMI] = (IB_QP_PKEY_INDEX | + IB_QP_QKEY), + [IB_QPT_GSI] = (IB_QP_PKEY_INDEX | + IB_QP_QKEY), + }, + .opt_param_add_eth = { + [IB_QPT_RC] = (IB_QP_ALT_SMAC | + IB_QP_VID | + IB_QP_ALT_VID), + [IB_QPT_UC] = (IB_QP_ALT_SMAC | + IB_QP_VID | + IB_QP_ALT_VID), + [IB_QPT_XRC_INI] = (IB_QP_ALT_SMAC | + IB_QP_VID | + IB_QP_ALT_VID), + [IB_QPT_XRC_TGT] = (IB_QP_ALT_SMAC | + IB_QP_VID | + IB_QP_ALT_VID) + } + } + }, + [IB_QPS_RTR] = { + [IB_QPS_RESET] = { .valid = 1 }, + [IB_QPS_ERR] = { .valid = 1 }, + [IB_QPS_RTS] = { + .valid = 1, + .req_param = { + [IB_QPT_UD] = IB_QP_SQ_PSN, + [IB_QPT_UC] = IB_QP_SQ_PSN, + [IB_QPT_RC] = (IB_QP_TIMEOUT | + IB_QP_RETRY_CNT | + IB_QP_RNR_RETRY | + IB_QP_SQ_PSN | + IB_QP_MAX_QP_RD_ATOMIC), + [IB_QPT_XRC_INI] = (IB_QP_TIMEOUT | + IB_QP_RETRY_CNT | + IB_QP_RNR_RETRY | + IB_QP_SQ_PSN | + IB_QP_MAX_QP_RD_ATOMIC), + [IB_QPT_XRC_TGT] = (IB_QP_TIMEOUT | + IB_QP_SQ_PSN), + [IB_QPT_SMI] = IB_QP_SQ_PSN, + [IB_QPT_GSI] = IB_QP_SQ_PSN, + }, + .opt_param = { + [IB_QPT_UD] = (IB_QP_CUR_STATE | + IB_QP_QKEY), + [IB_QPT_UC] = (IB_QP_CUR_STATE | + IB_QP_ALT_PATH | + IB_QP_ACCESS_FLAGS | + IB_QP_PATH_MIG_STATE), + [IB_QPT_RC] = (IB_QP_CUR_STATE | + IB_QP_ALT_PATH | + IB_QP_ACCESS_FLAGS | + IB_QP_MIN_RNR_TIMER | + IB_QP_PATH_MIG_STATE), + [IB_QPT_XRC_INI] = (IB_QP_CUR_STATE | + IB_QP_ALT_PATH | + IB_QP_ACCESS_FLAGS | + IB_QP_PATH_MIG_STATE), + [IB_QPT_XRC_TGT] = (IB_QP_CUR_STATE | + IB_QP_ALT_PATH | + IB_QP_ACCESS_FLAGS | + IB_QP_MIN_RNR_TIMER | + IB_QP_PATH_MIG_STATE), + [IB_QPT_SMI] = (IB_QP_CUR_STATE | + IB_QP_QKEY), + [IB_QPT_GSI] = (IB_QP_CUR_STATE | + IB_QP_QKEY), + } + } + }, + [IB_QPS_RTS] = { + [IB_QPS_RESET] = { .valid = 1 }, + [IB_QPS_ERR] = { .valid = 1 }, + [IB_QPS_RTS] = { + .valid = 1, + .opt_param = { + [IB_QPT_UD] = (IB_QP_CUR_STATE | + IB_QP_QKEY), + [IB_QPT_UC] = (IB_QP_CUR_STATE | + IB_QP_ACCESS_FLAGS | + IB_QP_ALT_PATH | + IB_QP_PATH_MIG_STATE), + [IB_QPT_RC] = (IB_QP_CUR_STATE | + IB_QP_ACCESS_FLAGS | + IB_QP_ALT_PATH | + IB_QP_PATH_MIG_STATE | + IB_QP_MIN_RNR_TIMER), + [IB_QPT_XRC_INI] = (IB_QP_CUR_STATE | + IB_QP_ACCESS_FLAGS | + IB_QP_ALT_PATH | + IB_QP_PATH_MIG_STATE), + [IB_QPT_XRC_TGT] = (IB_QP_CUR_STATE | + IB_QP_ACCESS_FLAGS | + IB_QP_ALT_PATH | + IB_QP_PATH_MIG_STATE | + IB_QP_MIN_RNR_TIMER), + [IB_QPT_SMI] = (IB_QP_CUR_STATE | + IB_QP_QKEY), + [IB_QPT_GSI] = (IB_QP_CUR_STATE | + IB_QP_QKEY), + } + }, + [IB_QPS_SQD] = { + .valid = 1, + .opt_param = { + [IB_QPT_UD] = IB_QP_EN_SQD_ASYNC_NOTIFY, + [IB_QPT_UC] = IB_QP_EN_SQD_ASYNC_NOTIFY, + [IB_QPT_RC] = IB_QP_EN_SQD_ASYNC_NOTIFY, + [IB_QPT_XRC_INI] = IB_QP_EN_SQD_ASYNC_NOTIFY, + [IB_QPT_XRC_TGT] = IB_QP_EN_SQD_ASYNC_NOTIFY, /* ??? */ + [IB_QPT_SMI] = IB_QP_EN_SQD_ASYNC_NOTIFY, + [IB_QPT_GSI] = IB_QP_EN_SQD_ASYNC_NOTIFY + } + }, + }, + [IB_QPS_SQD] = { + [IB_QPS_RESET] = { .valid = 1 }, + [IB_QPS_ERR] = { .valid = 1 }, + [IB_QPS_RTS] = { + .valid = 1, + .opt_param = { + [IB_QPT_UD] = (IB_QP_CUR_STATE | + IB_QP_QKEY), + [IB_QPT_UC] = (IB_QP_CUR_STATE | + IB_QP_ALT_PATH | + IB_QP_ACCESS_FLAGS | + IB_QP_PATH_MIG_STATE), + [IB_QPT_RC] = (IB_QP_CUR_STATE | + IB_QP_ALT_PATH | + IB_QP_ACCESS_FLAGS | + IB_QP_MIN_RNR_TIMER | + IB_QP_PATH_MIG_STATE), + [IB_QPT_XRC_INI] = (IB_QP_CUR_STATE | + IB_QP_ALT_PATH | + IB_QP_ACCESS_FLAGS | + IB_QP_PATH_MIG_STATE), + [IB_QPT_XRC_TGT] = (IB_QP_CUR_STATE | + IB_QP_ALT_PATH | + IB_QP_ACCESS_FLAGS | + IB_QP_MIN_RNR_TIMER | + IB_QP_PATH_MIG_STATE), + [IB_QPT_SMI] = (IB_QP_CUR_STATE | + IB_QP_QKEY), + [IB_QPT_GSI] = (IB_QP_CUR_STATE | + IB_QP_QKEY), + } + }, + [IB_QPS_SQD] = { + .valid = 1, + .opt_param = { + [IB_QPT_UD] = (IB_QP_PKEY_INDEX | + IB_QP_QKEY), + [IB_QPT_UC] = (IB_QP_AV | + IB_QP_ALT_PATH | + IB_QP_ACCESS_FLAGS | + IB_QP_PKEY_INDEX | + IB_QP_PATH_MIG_STATE), + [IB_QPT_RC] = (IB_QP_PORT | + IB_QP_AV | + IB_QP_TIMEOUT | + IB_QP_RETRY_CNT | + IB_QP_RNR_RETRY | + IB_QP_MAX_QP_RD_ATOMIC | + IB_QP_MAX_DEST_RD_ATOMIC | + IB_QP_ALT_PATH | + IB_QP_ACCESS_FLAGS | + IB_QP_PKEY_INDEX | + IB_QP_MIN_RNR_TIMER | + IB_QP_PATH_MIG_STATE), + [IB_QPT_XRC_INI] = (IB_QP_PORT | + IB_QP_AV | + IB_QP_TIMEOUT | + IB_QP_RETRY_CNT | + IB_QP_RNR_RETRY | + IB_QP_MAX_QP_RD_ATOMIC | + IB_QP_ALT_PATH | + IB_QP_ACCESS_FLAGS | + IB_QP_PKEY_INDEX | + IB_QP_PATH_MIG_STATE), + [IB_QPT_XRC_TGT] = (IB_QP_PORT | + IB_QP_AV | + IB_QP_TIMEOUT | + IB_QP_MAX_DEST_RD_ATOMIC | + IB_QP_ALT_PATH | + IB_QP_ACCESS_FLAGS | + IB_QP_PKEY_INDEX | + IB_QP_MIN_RNR_TIMER | + IB_QP_PATH_MIG_STATE), + [IB_QPT_SMI] = (IB_QP_PKEY_INDEX | + IB_QP_QKEY), + [IB_QPT_GSI] = (IB_QP_PKEY_INDEX | + IB_QP_QKEY), + } + } + }, + [IB_QPS_SQE] = { + [IB_QPS_RESET] = { .valid = 1 }, + [IB_QPS_ERR] = { .valid = 1 }, + [IB_QPS_RTS] = { + .valid = 1, + .opt_param = { + [IB_QPT_UD] = (IB_QP_CUR_STATE | + IB_QP_QKEY), + [IB_QPT_UC] = (IB_QP_CUR_STATE | + IB_QP_ACCESS_FLAGS), + [IB_QPT_SMI] = (IB_QP_CUR_STATE | + IB_QP_QKEY), + [IB_QPT_GSI] = (IB_QP_CUR_STATE | + IB_QP_QKEY), + } + } + }, + [IB_QPS_ERR] = { + [IB_QPS_RESET] = { .valid = 1 }, + [IB_QPS_ERR] = { .valid = 1 } + } +}; + +int ib_modify_qp_is_ok(enum ib_qp_state cur_state, enum ib_qp_state next_state, + enum ib_qp_type type, enum ib_qp_attr_mask mask, + enum rdma_link_layer ll) +{ + enum ib_qp_attr_mask req_param, opt_param; + + if (cur_state < 0 || cur_state > IB_QPS_ERR || + next_state < 0 || next_state > IB_QPS_ERR) + return 0; + + if (mask & IB_QP_CUR_STATE && + cur_state != IB_QPS_RTR && cur_state != IB_QPS_RTS && + cur_state != IB_QPS_SQD && cur_state != IB_QPS_SQE) + return 0; + + if (!qp_state_table[cur_state][next_state].valid) + return 0; + + req_param = qp_state_table[cur_state][next_state].req_param[type]; + opt_param = qp_state_table[cur_state][next_state].opt_param[type]; + + if (ll == IB_LINK_LAYER_ETHERNET) { + req_param |= qp_state_table[cur_state][next_state]. + req_param_add_eth[type]; + opt_param |= qp_state_table[cur_state][next_state]. + opt_param_add_eth[type]; + } + + if ((mask & req_param) != req_param) + return 0; + + if (mask & ~(req_param | opt_param | IB_QP_STATE)) + return 0; + + return 1; +} +EXPORT_SYMBOL(ib_modify_qp_is_ok); + +int ib_resolve_eth_l2_attrs(struct ib_qp *qp, + struct ib_qp_attr *qp_attr, int *qp_attr_mask) +{ + int ret = 0; + union ib_gid sgid; + + if ((*qp_attr_mask & IB_QP_AV) && + (rdma_port_get_link_layer(qp->device, qp_attr->ah_attr.port_num) == IB_LINK_LAYER_ETHERNET)) { + ret = ib_query_gid(qp->device, qp_attr->ah_attr.port_num, + qp_attr->ah_attr.grh.sgid_index, &sgid); + if (ret) + goto out; + if (rdma_link_local_addr((struct in6_addr *)qp_attr->ah_attr.grh.dgid.raw)) { + rdma_get_ll_mac((struct in6_addr *)qp_attr->ah_attr.grh.dgid.raw, qp_attr->ah_attr.dmac); + rdma_get_ll_mac((struct in6_addr *)sgid.raw, qp_attr->smac); + qp_attr->vlan_id = rdma_get_vlan_id(&sgid); + } else { + ret = rdma_addr_find_dmac_by_grh(&sgid, &qp_attr->ah_attr.grh.dgid, + qp_attr->ah_attr.dmac, &qp_attr->vlan_id); + if (ret) + goto out; + ret = rdma_addr_find_smac_by_sgid(&sgid, qp_attr->smac, NULL); + if (ret) + goto out; + } + *qp_attr_mask |= IB_QP_SMAC; + if (qp_attr->vlan_id < 0xFFFF) + *qp_attr_mask |= IB_QP_VID; + } +out: + return ret; +} +EXPORT_SYMBOL(ib_resolve_eth_l2_attrs); + + +int ib_modify_qp(struct ib_qp *qp, + struct ib_qp_attr *qp_attr, + int qp_attr_mask) +{ + int ret; + + ret = ib_resolve_eth_l2_attrs(qp, qp_attr, &qp_attr_mask); + if (ret) + return ret; + + return qp->device->modify_qp(qp->real_qp, qp_attr, qp_attr_mask, NULL); +} +EXPORT_SYMBOL(ib_modify_qp); + +int ib_query_qp(struct ib_qp *qp, + struct ib_qp_attr *qp_attr, + int qp_attr_mask, + struct ib_qp_init_attr *qp_init_attr) +{ + return qp->device->query_qp ? + qp->device->query_qp(qp->real_qp, qp_attr, qp_attr_mask, qp_init_attr) : + -ENOSYS; +} +EXPORT_SYMBOL(ib_query_qp); + +int ib_close_qp(struct ib_qp *qp) +{ + struct ib_qp *real_qp; + unsigned long flags; + + real_qp = qp->real_qp; + if (real_qp == qp) + return -EINVAL; + + spin_lock_irqsave(&real_qp->device->event_handler_lock, flags); + list_del(&qp->open_list); + spin_unlock_irqrestore(&real_qp->device->event_handler_lock, flags); + + atomic_dec(&real_qp->usecnt); + kfree(qp); + + return 0; +} +EXPORT_SYMBOL(ib_close_qp); + +static int __ib_destroy_shared_qp(struct ib_qp *qp) +{ + struct ib_xrcd *xrcd; + struct ib_qp *real_qp; + int ret; + + real_qp = qp->real_qp; + xrcd = real_qp->xrcd; + + mutex_lock(&xrcd->tgt_qp_mutex); + ib_close_qp(qp); + if (atomic_read(&real_qp->usecnt) == 0) + list_del(&real_qp->xrcd_list); + else + real_qp = NULL; + mutex_unlock(&xrcd->tgt_qp_mutex); + + if (real_qp) { + ret = ib_destroy_qp(real_qp); + if (!ret) + atomic_dec(&xrcd->usecnt); + else + __ib_insert_xrcd_qp(xrcd, real_qp); + } + + return 0; +} + +int ib_destroy_qp(struct ib_qp *qp) +{ + struct ib_pd *pd; + struct ib_cq *scq, *rcq; + struct ib_srq *srq; + int ret; + + if (atomic_read(&qp->usecnt)) + return -EBUSY; + + if (qp->real_qp != qp) + return __ib_destroy_shared_qp(qp); + + pd = qp->pd; + scq = qp->send_cq; + rcq = qp->recv_cq; + srq = qp->srq; + + ret = qp->device->destroy_qp(qp); + if (!ret) { + if (pd) + atomic_dec(&pd->usecnt); + if (scq) + atomic_dec(&scq->usecnt); + if (rcq) + atomic_dec(&rcq->usecnt); + if (srq) + atomic_dec(&srq->usecnt); + } + + return ret; +} +EXPORT_SYMBOL(ib_destroy_qp); + +/* Completion queues */ + +struct ib_cq *ib_create_cq(struct ib_device *device, + ib_comp_handler comp_handler, + void (*event_handler)(struct ib_event *, void *), + void *cq_context, int cqe, int comp_vector) +{ + struct ib_cq *cq; + + cq = device->create_cq(device, cqe, comp_vector, NULL, NULL); + + if (!IS_ERR(cq)) { + cq->device = device; + cq->uobject = NULL; + cq->comp_handler = comp_handler; + cq->event_handler = event_handler; + cq->cq_context = cq_context; + atomic_set(&cq->usecnt, 0); + } + + return cq; +} +EXPORT_SYMBOL(ib_create_cq); + +int ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period) +{ + return cq->device->modify_cq ? + cq->device->modify_cq(cq, cq_count, cq_period) : -ENOSYS; +} +EXPORT_SYMBOL(ib_modify_cq); + +int ib_destroy_cq(struct ib_cq *cq) +{ + if (atomic_read(&cq->usecnt)) + return -EBUSY; + + return cq->device->destroy_cq(cq); +} +EXPORT_SYMBOL(ib_destroy_cq); + +int ib_resize_cq(struct ib_cq *cq, int cqe) +{ + return cq->device->resize_cq ? + cq->device->resize_cq(cq, cqe, NULL) : -ENOSYS; +} +EXPORT_SYMBOL(ib_resize_cq); + +/* Memory regions */ + +struct ib_mr *ib_get_dma_mr(struct ib_pd *pd, int mr_access_flags) +{ + struct ib_mr *mr; + int err; + + err = ib_check_mr_access(mr_access_flags); + if (err) + return ERR_PTR(err); + + mr = pd->device->get_dma_mr(pd, mr_access_flags); + + if (!IS_ERR(mr)) { + mr->device = pd->device; + mr->pd = pd; + mr->uobject = NULL; + atomic_inc(&pd->usecnt); + atomic_set(&mr->usecnt, 0); + } + + return mr; +} +EXPORT_SYMBOL(ib_get_dma_mr); + +struct ib_mr *ib_reg_phys_mr(struct ib_pd *pd, + struct ib_phys_buf *phys_buf_array, + int num_phys_buf, + int mr_access_flags, + u64 *iova_start) +{ + struct ib_mr *mr; + int err; + + err = ib_check_mr_access(mr_access_flags); + if (err) + return ERR_PTR(err); + + if (!pd->device->reg_phys_mr) + return ERR_PTR(-ENOSYS); + + mr = pd->device->reg_phys_mr(pd, phys_buf_array, num_phys_buf, + mr_access_flags, iova_start); + + if (!IS_ERR(mr)) { + mr->device = pd->device; + mr->pd = pd; + mr->uobject = NULL; + atomic_inc(&pd->usecnt); + atomic_set(&mr->usecnt, 0); + } + + return mr; +} +EXPORT_SYMBOL(ib_reg_phys_mr); + +int ib_rereg_phys_mr(struct ib_mr *mr, + int mr_rereg_mask, + struct ib_pd *pd, + struct ib_phys_buf *phys_buf_array, + int num_phys_buf, + int mr_access_flags, + u64 *iova_start) +{ + struct ib_pd *old_pd; + int ret; + + ret = ib_check_mr_access(mr_access_flags); + if (ret) + return ret; + + if (!mr->device->rereg_phys_mr) + return -ENOSYS; + + if (atomic_read(&mr->usecnt)) + return -EBUSY; + + old_pd = mr->pd; + + ret = mr->device->rereg_phys_mr(mr, mr_rereg_mask, pd, + phys_buf_array, num_phys_buf, + mr_access_flags, iova_start); + + if (!ret && (mr_rereg_mask & IB_MR_REREG_PD)) { + atomic_dec(&old_pd->usecnt); + atomic_inc(&pd->usecnt); + } + + return ret; +} +EXPORT_SYMBOL(ib_rereg_phys_mr); + +int ib_query_mr(struct ib_mr *mr, struct ib_mr_attr *mr_attr) +{ + return mr->device->query_mr ? + mr->device->query_mr(mr, mr_attr) : -ENOSYS; +} +EXPORT_SYMBOL(ib_query_mr); + +int ib_dereg_mr(struct ib_mr *mr) +{ + struct ib_pd *pd; + int ret; + + if (atomic_read(&mr->usecnt)) + return -EBUSY; + + pd = mr->pd; + ret = mr->device->dereg_mr(mr); + if (!ret) + atomic_dec(&pd->usecnt); + + return ret; +} +EXPORT_SYMBOL(ib_dereg_mr); + +struct ib_mr *ib_create_mr(struct ib_pd *pd, + struct ib_mr_init_attr *mr_init_attr) +{ + struct ib_mr *mr; + + if (!pd->device->create_mr) + return ERR_PTR(-ENOSYS); + + mr = pd->device->create_mr(pd, mr_init_attr); + + if (!IS_ERR(mr)) { + mr->device = pd->device; + mr->pd = pd; + mr->uobject = NULL; + atomic_inc(&pd->usecnt); + atomic_set(&mr->usecnt, 0); + } + + return mr; +} +EXPORT_SYMBOL(ib_create_mr); + +int ib_destroy_mr(struct ib_mr *mr) +{ + struct ib_pd *pd; + int ret; + + if (atomic_read(&mr->usecnt)) + return -EBUSY; + + pd = mr->pd; + ret = mr->device->destroy_mr(mr); + if (!ret) + atomic_dec(&pd->usecnt); + + return ret; +} +EXPORT_SYMBOL(ib_destroy_mr); + +struct ib_mr *ib_alloc_fast_reg_mr(struct ib_pd *pd, int max_page_list_len) +{ + struct ib_mr *mr; + + if (!pd->device->alloc_fast_reg_mr) + return ERR_PTR(-ENOSYS); + + mr = pd->device->alloc_fast_reg_mr(pd, max_page_list_len); + + if (!IS_ERR(mr)) { + mr->device = pd->device; + mr->pd = pd; + mr->uobject = NULL; + atomic_inc(&pd->usecnt); + atomic_set(&mr->usecnt, 0); + } + + return mr; +} +EXPORT_SYMBOL(ib_alloc_fast_reg_mr); + +struct ib_fast_reg_page_list *ib_alloc_fast_reg_page_list(struct ib_device *device, + int max_page_list_len) +{ + struct ib_fast_reg_page_list *page_list; + + if (!device->alloc_fast_reg_page_list) + return ERR_PTR(-ENOSYS); + + page_list = device->alloc_fast_reg_page_list(device, max_page_list_len); + + if (!IS_ERR(page_list)) { + page_list->device = device; + page_list->max_page_list_len = max_page_list_len; + } + + return page_list; +} +EXPORT_SYMBOL(ib_alloc_fast_reg_page_list); + +void ib_free_fast_reg_page_list(struct ib_fast_reg_page_list *page_list) +{ + page_list->device->free_fast_reg_page_list(page_list); +} +EXPORT_SYMBOL(ib_free_fast_reg_page_list); + +/* Memory windows */ + +struct ib_mw *ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type) +{ + struct ib_mw *mw; + + if (!pd->device->alloc_mw) + return ERR_PTR(-ENOSYS); + + mw = pd->device->alloc_mw(pd, type); + if (!IS_ERR(mw)) { + mw->device = pd->device; + mw->pd = pd; + mw->uobject = NULL; + mw->type = type; + atomic_inc(&pd->usecnt); + } + + return mw; +} +EXPORT_SYMBOL(ib_alloc_mw); + +int ib_dealloc_mw(struct ib_mw *mw) +{ + struct ib_pd *pd; + int ret; + + pd = mw->pd; + ret = mw->device->dealloc_mw(mw); + if (!ret) + atomic_dec(&pd->usecnt); + + return ret; +} +EXPORT_SYMBOL(ib_dealloc_mw); + +/* "Fast" memory regions */ + +struct ib_fmr *ib_alloc_fmr(struct ib_pd *pd, + int mr_access_flags, + struct ib_fmr_attr *fmr_attr) +{ + struct ib_fmr *fmr; + + if (!pd->device->alloc_fmr) + return ERR_PTR(-ENOSYS); + + fmr = pd->device->alloc_fmr(pd, mr_access_flags, fmr_attr); + if (!IS_ERR(fmr)) { + fmr->device = pd->device; + fmr->pd = pd; + atomic_inc(&pd->usecnt); + } + + return fmr; +} +EXPORT_SYMBOL(ib_alloc_fmr); + +int ib_unmap_fmr(struct list_head *fmr_list) +{ + struct ib_fmr *fmr; + + if (list_empty(fmr_list)) + return 0; + + fmr = list_entry(fmr_list->next, struct ib_fmr, list); + return fmr->device->unmap_fmr(fmr_list); +} +EXPORT_SYMBOL(ib_unmap_fmr); + +int ib_dealloc_fmr(struct ib_fmr *fmr) +{ + struct ib_pd *pd; + int ret; + + pd = fmr->pd; + ret = fmr->device->dealloc_fmr(fmr); + if (!ret) + atomic_dec(&pd->usecnt); + + return ret; +} +EXPORT_SYMBOL(ib_dealloc_fmr); + +/* Multicast groups */ + +int ib_attach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid) +{ + int ret; + + if (!qp->device->attach_mcast) + return -ENOSYS; + if (gid->raw[0] != 0xff || qp->qp_type != IB_QPT_UD) + return -EINVAL; + + ret = qp->device->attach_mcast(qp, gid, lid); + if (!ret) + atomic_inc(&qp->usecnt); + return ret; +} +EXPORT_SYMBOL(ib_attach_mcast); + +int ib_detach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid) +{ + int ret; + + if (!qp->device->detach_mcast) + return -ENOSYS; + if (gid->raw[0] != 0xff || qp->qp_type != IB_QPT_UD) + return -EINVAL; + + ret = qp->device->detach_mcast(qp, gid, lid); + if (!ret) + atomic_dec(&qp->usecnt); + return ret; +} +EXPORT_SYMBOL(ib_detach_mcast); + +struct ib_xrcd *ib_alloc_xrcd(struct ib_device *device) +{ + struct ib_xrcd *xrcd; + + if (!device->alloc_xrcd) + return ERR_PTR(-ENOSYS); + + xrcd = device->alloc_xrcd(device, NULL, NULL); + if (!IS_ERR(xrcd)) { + xrcd->device = device; + xrcd->inode = NULL; + atomic_set(&xrcd->usecnt, 0); + mutex_init(&xrcd->tgt_qp_mutex); + INIT_LIST_HEAD(&xrcd->tgt_qp_list); + } + + return xrcd; +} +EXPORT_SYMBOL(ib_alloc_xrcd); + +int ib_dealloc_xrcd(struct ib_xrcd *xrcd) +{ + struct ib_qp *qp; + int ret; + + if (atomic_read(&xrcd->usecnt)) + return -EBUSY; + + while (!list_empty(&xrcd->tgt_qp_list)) { + qp = list_entry(xrcd->tgt_qp_list.next, struct ib_qp, xrcd_list); + ret = ib_destroy_qp(qp); + if (ret) + return ret; + } + + return xrcd->device->dealloc_xrcd(xrcd); +} +EXPORT_SYMBOL(ib_dealloc_xrcd); + +struct ib_flow *ib_create_flow(struct ib_qp *qp, + struct ib_flow_attr *flow_attr, + int domain) +{ + struct ib_flow *flow_id; + if (!qp->device->create_flow) + return ERR_PTR(-ENOSYS); + + flow_id = qp->device->create_flow(qp, flow_attr, domain); + if (!IS_ERR(flow_id)) + atomic_inc(&qp->usecnt); + return flow_id; +} +EXPORT_SYMBOL(ib_create_flow); + +int ib_destroy_flow(struct ib_flow *flow_id) +{ + int err; + struct ib_qp *qp = flow_id->qp; + + err = qp->device->destroy_flow(flow_id); + if (!err) + atomic_dec(&qp->usecnt); + return err; +} +EXPORT_SYMBOL(ib_destroy_flow); + +int ib_check_mr_status(struct ib_mr *mr, u32 check_mask, + struct ib_mr_status *mr_status) +{ + return mr->device->check_mr_status ? + mr->device->check_mr_status(mr, check_mask, mr_status) : -ENOSYS; +} +EXPORT_SYMBOL(ib_check_mr_status); diff --git a/drivers/infiniband/hw/Makefile b/drivers/infiniband/hw/Makefile new file mode 100644 index 0000000..e900b03 --- /dev/null +++ b/drivers/infiniband/hw/Makefile @@ -0,0 +1,12 @@ +obj-$(CONFIG_INFINIBAND_MTHCA) += mthca/ +obj-$(CONFIG_INFINIBAND_IPATH) += ipath/ +obj-$(CONFIG_INFINIBAND_QIB) += qib/ +obj-$(CONFIG_INFINIBAND_EHCA) += ehca/ +obj-$(CONFIG_INFINIBAND_AMSO1100) += amso1100/ +obj-$(CONFIG_INFINIBAND_CXGB3) += cxgb3/ +obj-$(CONFIG_INFINIBAND_CXGB4) += cxgb4/ +obj-$(CONFIG_MLX4_INFINIBAND) += mlx4/ +obj-$(CONFIG_MLX5_INFINIBAND) += mlx5/ +obj-$(CONFIG_INFINIBAND_NES) += nes/ +obj-$(CONFIG_INFINIBAND_OCRDMA) += ocrdma/ +obj-$(CONFIG_INFINIBAND_USNIC) += usnic/ diff --git a/drivers/infiniband/hw/amso1100/Kbuild b/drivers/infiniband/hw/amso1100/Kbuild new file mode 100644 index 0000000..950dfab --- /dev/null +++ b/drivers/infiniband/hw/amso1100/Kbuild @@ -0,0 +1,6 @@ +ccflags-$(CONFIG_INFINIBAND_AMSO1100_DEBUG) := -DDEBUG + +obj-$(CONFIG_INFINIBAND_AMSO1100) += iw_c2.o + +iw_c2-y := c2.o c2_provider.o c2_rnic.o c2_alloc.o c2_mq.o c2_ae.o c2_vq.o \ + c2_intr.o c2_cq.o c2_qp.o c2_cm.o c2_mm.o c2_pd.o diff --git a/drivers/infiniband/hw/amso1100/Kconfig b/drivers/infiniband/hw/amso1100/Kconfig new file mode 100644 index 0000000..e6ce5f2 --- /dev/null +++ b/drivers/infiniband/hw/amso1100/Kconfig @@ -0,0 +1,15 @@ +config INFINIBAND_AMSO1100 + tristate "Ammasso 1100 HCA support" + depends on PCI && INET + ---help--- + This is a low-level driver for the Ammasso 1100 host + channel adapter (HCA). + +config INFINIBAND_AMSO1100_DEBUG + bool "Verbose debugging output" + depends on INFINIBAND_AMSO1100 + default n + ---help--- + This option causes the amso1100 driver to produce a bunch of + debug messages. Select this if you are developing the driver + or trying to diagnose a problem. diff --git a/drivers/infiniband/hw/amso1100/c2.c b/drivers/infiniband/hw/amso1100/c2.c new file mode 100644 index 0000000..766a71c --- /dev/null +++ b/drivers/infiniband/hw/amso1100/c2.c @@ -0,0 +1,1241 @@ +/* + * Copyright (c) 2005 Ammasso, Inc. All rights reserved. + * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include "c2.h" +#include "c2_provider.h" + +MODULE_AUTHOR("Tom Tucker "); +MODULE_DESCRIPTION("Ammasso AMSO1100 Low-level iWARP Driver"); +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_VERSION(DRV_VERSION); + +static const u32 default_msg = NETIF_MSG_DRV | NETIF_MSG_PROBE | NETIF_MSG_LINK + | NETIF_MSG_IFUP | NETIF_MSG_IFDOWN; + +static int debug = -1; /* defaults above */ +module_param(debug, int, 0); +MODULE_PARM_DESC(debug, "Debug level (0=none,...,16=all)"); + +static int c2_up(struct net_device *netdev); +static int c2_down(struct net_device *netdev); +static int c2_xmit_frame(struct sk_buff *skb, struct net_device *netdev); +static void c2_tx_interrupt(struct net_device *netdev); +static void c2_rx_interrupt(struct net_device *netdev); +static irqreturn_t c2_interrupt(int irq, void *dev_id); +static void c2_tx_timeout(struct net_device *netdev); +static int c2_change_mtu(struct net_device *netdev, int new_mtu); +static void c2_reset(struct c2_port *c2_port); + +static struct pci_device_id c2_pci_table[] = { + { PCI_DEVICE(0x18b8, 0xb001) }, + { 0 } +}; + +MODULE_DEVICE_TABLE(pci, c2_pci_table); + +static void c2_print_macaddr(struct net_device *netdev) +{ + pr_debug("%s: MAC %pM, IRQ %u\n", netdev->name, netdev->dev_addr, netdev->irq); +} + +static void c2_set_rxbufsize(struct c2_port *c2_port) +{ + struct net_device *netdev = c2_port->netdev; + + if (netdev->mtu > RX_BUF_SIZE) + c2_port->rx_buf_size = + netdev->mtu + ETH_HLEN + sizeof(struct c2_rxp_hdr) + + NET_IP_ALIGN; + else + c2_port->rx_buf_size = sizeof(struct c2_rxp_hdr) + RX_BUF_SIZE; +} + +/* + * Allocate TX ring elements and chain them together. + * One-to-one association of adapter descriptors with ring elements. + */ +static int c2_tx_ring_alloc(struct c2_ring *tx_ring, void *vaddr, + dma_addr_t base, void __iomem * mmio_txp_ring) +{ + struct c2_tx_desc *tx_desc; + struct c2_txp_desc __iomem *txp_desc; + struct c2_element *elem; + int i; + + tx_ring->start = kmalloc(sizeof(*elem) * tx_ring->count, GFP_KERNEL); + if (!tx_ring->start) + return -ENOMEM; + + elem = tx_ring->start; + tx_desc = vaddr; + txp_desc = mmio_txp_ring; + for (i = 0; i < tx_ring->count; i++, elem++, tx_desc++, txp_desc++) { + tx_desc->len = 0; + tx_desc->status = 0; + + /* Set TXP_HTXD_UNINIT */ + __raw_writeq((__force u64) cpu_to_be64(0x1122334455667788ULL), + (void __iomem *) txp_desc + C2_TXP_ADDR); + __raw_writew(0, (void __iomem *) txp_desc + C2_TXP_LEN); + __raw_writew((__force u16) cpu_to_be16(TXP_HTXD_UNINIT), + (void __iomem *) txp_desc + C2_TXP_FLAGS); + + elem->skb = NULL; + elem->ht_desc = tx_desc; + elem->hw_desc = txp_desc; + + if (i == tx_ring->count - 1) { + elem->next = tx_ring->start; + tx_desc->next_offset = base; + } else { + elem->next = elem + 1; + tx_desc->next_offset = + base + (i + 1) * sizeof(*tx_desc); + } + } + + tx_ring->to_use = tx_ring->to_clean = tx_ring->start; + + return 0; +} + +/* + * Allocate RX ring elements and chain them together. + * One-to-one association of adapter descriptors with ring elements. + */ +static int c2_rx_ring_alloc(struct c2_ring *rx_ring, void *vaddr, + dma_addr_t base, void __iomem * mmio_rxp_ring) +{ + struct c2_rx_desc *rx_desc; + struct c2_rxp_desc __iomem *rxp_desc; + struct c2_element *elem; + int i; + + rx_ring->start = kmalloc(sizeof(*elem) * rx_ring->count, GFP_KERNEL); + if (!rx_ring->start) + return -ENOMEM; + + elem = rx_ring->start; + rx_desc = vaddr; + rxp_desc = mmio_rxp_ring; + for (i = 0; i < rx_ring->count; i++, elem++, rx_desc++, rxp_desc++) { + rx_desc->len = 0; + rx_desc->status = 0; + + /* Set RXP_HRXD_UNINIT */ + __raw_writew((__force u16) cpu_to_be16(RXP_HRXD_OK), + (void __iomem *) rxp_desc + C2_RXP_STATUS); + __raw_writew(0, (void __iomem *) rxp_desc + C2_RXP_COUNT); + __raw_writew(0, (void __iomem *) rxp_desc + C2_RXP_LEN); + __raw_writeq((__force u64) cpu_to_be64(0x99aabbccddeeffULL), + (void __iomem *) rxp_desc + C2_RXP_ADDR); + __raw_writew((__force u16) cpu_to_be16(RXP_HRXD_UNINIT), + (void __iomem *) rxp_desc + C2_RXP_FLAGS); + + elem->skb = NULL; + elem->ht_desc = rx_desc; + elem->hw_desc = rxp_desc; + + if (i == rx_ring->count - 1) { + elem->next = rx_ring->start; + rx_desc->next_offset = base; + } else { + elem->next = elem + 1; + rx_desc->next_offset = + base + (i + 1) * sizeof(*rx_desc); + } + } + + rx_ring->to_use = rx_ring->to_clean = rx_ring->start; + + return 0; +} + +/* Setup buffer for receiving */ +static inline int c2_rx_alloc(struct c2_port *c2_port, struct c2_element *elem) +{ + struct c2_dev *c2dev = c2_port->c2dev; + struct c2_rx_desc *rx_desc = elem->ht_desc; + struct sk_buff *skb; + dma_addr_t mapaddr; + u32 maplen; + struct c2_rxp_hdr *rxp_hdr; + + skb = dev_alloc_skb(c2_port->rx_buf_size); + if (unlikely(!skb)) { + pr_debug("%s: out of memory for receive\n", + c2_port->netdev->name); + return -ENOMEM; + } + + /* Zero out the rxp hdr in the sk_buff */ + memset(skb->data, 0, sizeof(*rxp_hdr)); + + skb->dev = c2_port->netdev; + + maplen = c2_port->rx_buf_size; + mapaddr = + pci_map_single(c2dev->pcidev, skb->data, maplen, + PCI_DMA_FROMDEVICE); + + /* Set the sk_buff RXP_header to RXP_HRXD_READY */ + rxp_hdr = (struct c2_rxp_hdr *) skb->data; + rxp_hdr->flags = RXP_HRXD_READY; + + __raw_writew(0, elem->hw_desc + C2_RXP_STATUS); + __raw_writew((__force u16) cpu_to_be16((u16) maplen - sizeof(*rxp_hdr)), + elem->hw_desc + C2_RXP_LEN); + __raw_writeq((__force u64) cpu_to_be64(mapaddr), elem->hw_desc + C2_RXP_ADDR); + __raw_writew((__force u16) cpu_to_be16(RXP_HRXD_READY), + elem->hw_desc + C2_RXP_FLAGS); + + elem->skb = skb; + elem->mapaddr = mapaddr; + elem->maplen = maplen; + rx_desc->len = maplen; + + return 0; +} + +/* + * Allocate buffers for the Rx ring + * For receive: rx_ring.to_clean is next received frame + */ +static int c2_rx_fill(struct c2_port *c2_port) +{ + struct c2_ring *rx_ring = &c2_port->rx_ring; + struct c2_element *elem; + int ret = 0; + + elem = rx_ring->start; + do { + if (c2_rx_alloc(c2_port, elem)) { + ret = 1; + break; + } + } while ((elem = elem->next) != rx_ring->start); + + rx_ring->to_clean = rx_ring->start; + return ret; +} + +/* Free all buffers in RX ring, assumes receiver stopped */ +static void c2_rx_clean(struct c2_port *c2_port) +{ + struct c2_dev *c2dev = c2_port->c2dev; + struct c2_ring *rx_ring = &c2_port->rx_ring; + struct c2_element *elem; + struct c2_rx_desc *rx_desc; + + elem = rx_ring->start; + do { + rx_desc = elem->ht_desc; + rx_desc->len = 0; + + __raw_writew(0, elem->hw_desc + C2_RXP_STATUS); + __raw_writew(0, elem->hw_desc + C2_RXP_COUNT); + __raw_writew(0, elem->hw_desc + C2_RXP_LEN); + __raw_writeq((__force u64) cpu_to_be64(0x99aabbccddeeffULL), + elem->hw_desc + C2_RXP_ADDR); + __raw_writew((__force u16) cpu_to_be16(RXP_HRXD_UNINIT), + elem->hw_desc + C2_RXP_FLAGS); + + if (elem->skb) { + pci_unmap_single(c2dev->pcidev, elem->mapaddr, + elem->maplen, PCI_DMA_FROMDEVICE); + dev_kfree_skb(elem->skb); + elem->skb = NULL; + } + } while ((elem = elem->next) != rx_ring->start); +} + +static inline int c2_tx_free(struct c2_dev *c2dev, struct c2_element *elem) +{ + struct c2_tx_desc *tx_desc = elem->ht_desc; + + tx_desc->len = 0; + + pci_unmap_single(c2dev->pcidev, elem->mapaddr, elem->maplen, + PCI_DMA_TODEVICE); + + if (elem->skb) { + dev_kfree_skb_any(elem->skb); + elem->skb = NULL; + } + + return 0; +} + +/* Free all buffers in TX ring, assumes transmitter stopped */ +static void c2_tx_clean(struct c2_port *c2_port) +{ + struct c2_ring *tx_ring = &c2_port->tx_ring; + struct c2_element *elem; + struct c2_txp_desc txp_htxd; + int retry; + unsigned long flags; + + spin_lock_irqsave(&c2_port->tx_lock, flags); + + elem = tx_ring->start; + + do { + retry = 0; + do { + txp_htxd.flags = + readw(elem->hw_desc + C2_TXP_FLAGS); + + if (txp_htxd.flags == TXP_HTXD_READY) { + retry = 1; + __raw_writew(0, + elem->hw_desc + C2_TXP_LEN); + __raw_writeq(0, + elem->hw_desc + C2_TXP_ADDR); + __raw_writew((__force u16) cpu_to_be16(TXP_HTXD_DONE), + elem->hw_desc + C2_TXP_FLAGS); + c2_port->netdev->stats.tx_dropped++; + break; + } else { + __raw_writew(0, + elem->hw_desc + C2_TXP_LEN); + __raw_writeq((__force u64) cpu_to_be64(0x1122334455667788ULL), + elem->hw_desc + C2_TXP_ADDR); + __raw_writew((__force u16) cpu_to_be16(TXP_HTXD_UNINIT), + elem->hw_desc + C2_TXP_FLAGS); + } + + c2_tx_free(c2_port->c2dev, elem); + + } while ((elem = elem->next) != tx_ring->start); + } while (retry); + + c2_port->tx_avail = c2_port->tx_ring.count - 1; + c2_port->c2dev->cur_tx = tx_ring->to_use - tx_ring->start; + + if (c2_port->tx_avail > MAX_SKB_FRAGS + 1) + netif_wake_queue(c2_port->netdev); + + spin_unlock_irqrestore(&c2_port->tx_lock, flags); +} + +/* + * Process transmit descriptors marked 'DONE' by the firmware, + * freeing up their unneeded sk_buffs. + */ +static void c2_tx_interrupt(struct net_device *netdev) +{ + struct c2_port *c2_port = netdev_priv(netdev); + struct c2_dev *c2dev = c2_port->c2dev; + struct c2_ring *tx_ring = &c2_port->tx_ring; + struct c2_element *elem; + struct c2_txp_desc txp_htxd; + + spin_lock(&c2_port->tx_lock); + + for (elem = tx_ring->to_clean; elem != tx_ring->to_use; + elem = elem->next) { + txp_htxd.flags = + be16_to_cpu((__force __be16) readw(elem->hw_desc + C2_TXP_FLAGS)); + + if (txp_htxd.flags != TXP_HTXD_DONE) + break; + + if (netif_msg_tx_done(c2_port)) { + /* PCI reads are expensive in fast path */ + txp_htxd.len = + be16_to_cpu((__force __be16) readw(elem->hw_desc + C2_TXP_LEN)); + pr_debug("%s: tx done slot %3Zu status 0x%x len " + "%5u bytes\n", + netdev->name, elem - tx_ring->start, + txp_htxd.flags, txp_htxd.len); + } + + c2_tx_free(c2dev, elem); + ++(c2_port->tx_avail); + } + + tx_ring->to_clean = elem; + + if (netif_queue_stopped(netdev) + && c2_port->tx_avail > MAX_SKB_FRAGS + 1) + netif_wake_queue(netdev); + + spin_unlock(&c2_port->tx_lock); +} + +static void c2_rx_error(struct c2_port *c2_port, struct c2_element *elem) +{ + struct c2_rx_desc *rx_desc = elem->ht_desc; + struct c2_rxp_hdr *rxp_hdr = (struct c2_rxp_hdr *) elem->skb->data; + + if (rxp_hdr->status != RXP_HRXD_OK || + rxp_hdr->len > (rx_desc->len - sizeof(*rxp_hdr))) { + pr_debug("BAD RXP_HRXD\n"); + pr_debug(" rx_desc : %p\n", rx_desc); + pr_debug(" index : %Zu\n", + elem - c2_port->rx_ring.start); + pr_debug(" len : %u\n", rx_desc->len); + pr_debug(" rxp_hdr : %p [PA %p]\n", rxp_hdr, + (void *) __pa((unsigned long) rxp_hdr)); + pr_debug(" flags : 0x%x\n", rxp_hdr->flags); + pr_debug(" status: 0x%x\n", rxp_hdr->status); + pr_debug(" len : %u\n", rxp_hdr->len); + pr_debug(" rsvd : 0x%x\n", rxp_hdr->rsvd); + } + + /* Setup the skb for reuse since we're dropping this pkt */ + elem->skb->data = elem->skb->head; + skb_reset_tail_pointer(elem->skb); + + /* Zero out the rxp hdr in the sk_buff */ + memset(elem->skb->data, 0, sizeof(*rxp_hdr)); + + /* Write the descriptor to the adapter's rx ring */ + __raw_writew(0, elem->hw_desc + C2_RXP_STATUS); + __raw_writew(0, elem->hw_desc + C2_RXP_COUNT); + __raw_writew((__force u16) cpu_to_be16((u16) elem->maplen - sizeof(*rxp_hdr)), + elem->hw_desc + C2_RXP_LEN); + __raw_writeq((__force u64) cpu_to_be64(elem->mapaddr), + elem->hw_desc + C2_RXP_ADDR); + __raw_writew((__force u16) cpu_to_be16(RXP_HRXD_READY), + elem->hw_desc + C2_RXP_FLAGS); + + pr_debug("packet dropped\n"); + c2_port->netdev->stats.rx_dropped++; +} + +static void c2_rx_interrupt(struct net_device *netdev) +{ + struct c2_port *c2_port = netdev_priv(netdev); + struct c2_dev *c2dev = c2_port->c2dev; + struct c2_ring *rx_ring = &c2_port->rx_ring; + struct c2_element *elem; + struct c2_rx_desc *rx_desc; + struct c2_rxp_hdr *rxp_hdr; + struct sk_buff *skb; + dma_addr_t mapaddr; + u32 maplen, buflen; + unsigned long flags; + + spin_lock_irqsave(&c2dev->lock, flags); + + /* Begin where we left off */ + rx_ring->to_clean = rx_ring->start + c2dev->cur_rx; + + for (elem = rx_ring->to_clean; elem->next != rx_ring->to_clean; + elem = elem->next) { + rx_desc = elem->ht_desc; + mapaddr = elem->mapaddr; + maplen = elem->maplen; + skb = elem->skb; + rxp_hdr = (struct c2_rxp_hdr *) skb->data; + + if (rxp_hdr->flags != RXP_HRXD_DONE) + break; + buflen = rxp_hdr->len; + + /* Sanity check the RXP header */ + if (rxp_hdr->status != RXP_HRXD_OK || + buflen > (rx_desc->len - sizeof(*rxp_hdr))) { + c2_rx_error(c2_port, elem); + continue; + } + + /* + * Allocate and map a new skb for replenishing the host + * RX desc + */ + if (c2_rx_alloc(c2_port, elem)) { + c2_rx_error(c2_port, elem); + continue; + } + + /* Unmap the old skb */ + pci_unmap_single(c2dev->pcidev, mapaddr, maplen, + PCI_DMA_FROMDEVICE); + + prefetch(skb->data); + + /* + * Skip past the leading 8 bytes comprising of the + * "struct c2_rxp_hdr", prepended by the adapter + * to the usual Ethernet header ("struct ethhdr"), + * to the start of the raw Ethernet packet. + * + * Fix up the various fields in the sk_buff before + * passing it up to netif_rx(). The transfer size + * (in bytes) specified by the adapter len field of + * the "struct rxp_hdr_t" does NOT include the + * "sizeof(struct c2_rxp_hdr)". + */ + skb->data += sizeof(*rxp_hdr); + skb_set_tail_pointer(skb, buflen); + skb->len = buflen; + skb->protocol = eth_type_trans(skb, netdev); + + netif_rx(skb); + + netdev->stats.rx_packets++; + netdev->stats.rx_bytes += buflen; + } + + /* Save where we left off */ + rx_ring->to_clean = elem; + c2dev->cur_rx = elem - rx_ring->start; + C2_SET_CUR_RX(c2dev, c2dev->cur_rx); + + spin_unlock_irqrestore(&c2dev->lock, flags); +} + +/* + * Handle netisr0 TX & RX interrupts. + */ +static irqreturn_t c2_interrupt(int irq, void *dev_id) +{ + unsigned int netisr0, dmaisr; + int handled = 0; + struct c2_dev *c2dev = (struct c2_dev *) dev_id; + + /* Process CCILNET interrupts */ + netisr0 = readl(c2dev->regs + C2_NISR0); + if (netisr0) { + + /* + * There is an issue with the firmware that always + * provides the status of RX for both TX & RX + * interrupts. So process both queues here. + */ + c2_rx_interrupt(c2dev->netdev); + c2_tx_interrupt(c2dev->netdev); + + /* Clear the interrupt */ + writel(netisr0, c2dev->regs + C2_NISR0); + handled++; + } + + /* Process RNIC interrupts */ + dmaisr = readl(c2dev->regs + C2_DISR); + if (dmaisr) { + writel(dmaisr, c2dev->regs + C2_DISR); + c2_rnic_interrupt(c2dev); + handled++; + } + + if (handled) { + return IRQ_HANDLED; + } else { + return IRQ_NONE; + } +} + +static int c2_up(struct net_device *netdev) +{ + struct c2_port *c2_port = netdev_priv(netdev); + struct c2_dev *c2dev = c2_port->c2dev; + struct c2_element *elem; + struct c2_rxp_hdr *rxp_hdr; + struct in_device *in_dev; + size_t rx_size, tx_size; + int ret, i; + unsigned int netimr0; + + if (netif_msg_ifup(c2_port)) + pr_debug("%s: enabling interface\n", netdev->name); + + /* Set the Rx buffer size based on MTU */ + c2_set_rxbufsize(c2_port); + + /* Allocate DMA'able memory for Tx/Rx host descriptor rings */ + rx_size = c2_port->rx_ring.count * sizeof(struct c2_rx_desc); + tx_size = c2_port->tx_ring.count * sizeof(struct c2_tx_desc); + + c2_port->mem_size = tx_size + rx_size; + c2_port->mem = pci_zalloc_consistent(c2dev->pcidev, c2_port->mem_size, + &c2_port->dma); + if (c2_port->mem == NULL) { + pr_debug("Unable to allocate memory for " + "host descriptor rings\n"); + return -ENOMEM; + } + + /* Create the Rx host descriptor ring */ + if ((ret = + c2_rx_ring_alloc(&c2_port->rx_ring, c2_port->mem, c2_port->dma, + c2dev->mmio_rxp_ring))) { + pr_debug("Unable to create RX ring\n"); + goto bail0; + } + + /* Allocate Rx buffers for the host descriptor ring */ + if (c2_rx_fill(c2_port)) { + pr_debug("Unable to fill RX ring\n"); + goto bail1; + } + + /* Create the Tx host descriptor ring */ + if ((ret = c2_tx_ring_alloc(&c2_port->tx_ring, c2_port->mem + rx_size, + c2_port->dma + rx_size, + c2dev->mmio_txp_ring))) { + pr_debug("Unable to create TX ring\n"); + goto bail1; + } + + /* Set the TX pointer to where we left off */ + c2_port->tx_avail = c2_port->tx_ring.count - 1; + c2_port->tx_ring.to_use = c2_port->tx_ring.to_clean = + c2_port->tx_ring.start + c2dev->cur_tx; + + /* missing: Initialize MAC */ + + BUG_ON(c2_port->tx_ring.to_use != c2_port->tx_ring.to_clean); + + /* Reset the adapter, ensures the driver is in sync with the RXP */ + c2_reset(c2_port); + + /* Reset the READY bit in the sk_buff RXP headers & adapter HRXDQ */ + for (i = 0, elem = c2_port->rx_ring.start; i < c2_port->rx_ring.count; + i++, elem++) { + rxp_hdr = (struct c2_rxp_hdr *) elem->skb->data; + rxp_hdr->flags = 0; + __raw_writew((__force u16) cpu_to_be16(RXP_HRXD_READY), + elem->hw_desc + C2_RXP_FLAGS); + } + + /* Enable network packets */ + netif_start_queue(netdev); + + /* Enable IRQ */ + writel(0, c2dev->regs + C2_IDIS); + netimr0 = readl(c2dev->regs + C2_NIMR0); + netimr0 &= ~(C2_PCI_HTX_INT | C2_PCI_HRX_INT); + writel(netimr0, c2dev->regs + C2_NIMR0); + + /* Tell the stack to ignore arp requests for ipaddrs bound to + * other interfaces. This is needed to prevent the host stack + * from responding to arp requests to the ipaddr bound on the + * rdma interface. + */ + in_dev = in_dev_get(netdev); + IN_DEV_CONF_SET(in_dev, ARP_IGNORE, 1); + in_dev_put(in_dev); + + return 0; + + bail1: + c2_rx_clean(c2_port); + kfree(c2_port->rx_ring.start); + + bail0: + pci_free_consistent(c2dev->pcidev, c2_port->mem_size, c2_port->mem, + c2_port->dma); + + return ret; +} + +static int c2_down(struct net_device *netdev) +{ + struct c2_port *c2_port = netdev_priv(netdev); + struct c2_dev *c2dev = c2_port->c2dev; + + if (netif_msg_ifdown(c2_port)) + pr_debug("%s: disabling interface\n", + netdev->name); + + /* Wait for all the queued packets to get sent */ + c2_tx_interrupt(netdev); + + /* Disable network packets */ + netif_stop_queue(netdev); + + /* Disable IRQs by clearing the interrupt mask */ + writel(1, c2dev->regs + C2_IDIS); + writel(0, c2dev->regs + C2_NIMR0); + + /* missing: Stop transmitter */ + + /* missing: Stop receiver */ + + /* Reset the adapter, ensures the driver is in sync with the RXP */ + c2_reset(c2_port); + + /* missing: Turn off LEDs here */ + + /* Free all buffers in the host descriptor rings */ + c2_tx_clean(c2_port); + c2_rx_clean(c2_port); + + /* Free the host descriptor rings */ + kfree(c2_port->rx_ring.start); + kfree(c2_port->tx_ring.start); + pci_free_consistent(c2dev->pcidev, c2_port->mem_size, c2_port->mem, + c2_port->dma); + + return 0; +} + +static void c2_reset(struct c2_port *c2_port) +{ + struct c2_dev *c2dev = c2_port->c2dev; + unsigned int cur_rx = c2dev->cur_rx; + + /* Tell the hardware to quiesce */ + C2_SET_CUR_RX(c2dev, cur_rx | C2_PCI_HRX_QUI); + + /* + * The hardware will reset the C2_PCI_HRX_QUI bit once + * the RXP is quiesced. Wait 2 seconds for this. + */ + ssleep(2); + + cur_rx = C2_GET_CUR_RX(c2dev); + + if (cur_rx & C2_PCI_HRX_QUI) + pr_debug("c2_reset: failed to quiesce the hardware!\n"); + + cur_rx &= ~C2_PCI_HRX_QUI; + + c2dev->cur_rx = cur_rx; + + pr_debug("Current RX: %u\n", c2dev->cur_rx); +} + +static int c2_xmit_frame(struct sk_buff *skb, struct net_device *netdev) +{ + struct c2_port *c2_port = netdev_priv(netdev); + struct c2_dev *c2dev = c2_port->c2dev; + struct c2_ring *tx_ring = &c2_port->tx_ring; + struct c2_element *elem; + dma_addr_t mapaddr; + u32 maplen; + unsigned long flags; + unsigned int i; + + spin_lock_irqsave(&c2_port->tx_lock, flags); + + if (unlikely(c2_port->tx_avail < (skb_shinfo(skb)->nr_frags + 1))) { + netif_stop_queue(netdev); + spin_unlock_irqrestore(&c2_port->tx_lock, flags); + + pr_debug("%s: Tx ring full when queue awake!\n", + netdev->name); + return NETDEV_TX_BUSY; + } + + maplen = skb_headlen(skb); + mapaddr = + pci_map_single(c2dev->pcidev, skb->data, maplen, PCI_DMA_TODEVICE); + + elem = tx_ring->to_use; + elem->skb = skb; + elem->mapaddr = mapaddr; + elem->maplen = maplen; + + /* Tell HW to xmit */ + __raw_writeq((__force u64) cpu_to_be64(mapaddr), + elem->hw_desc + C2_TXP_ADDR); + __raw_writew((__force u16) cpu_to_be16(maplen), + elem->hw_desc + C2_TXP_LEN); + __raw_writew((__force u16) cpu_to_be16(TXP_HTXD_READY), + elem->hw_desc + C2_TXP_FLAGS); + + netdev->stats.tx_packets++; + netdev->stats.tx_bytes += maplen; + + /* Loop thru additional data fragments and queue them */ + if (skb_shinfo(skb)->nr_frags) { + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + const skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + maplen = skb_frag_size(frag); + mapaddr = skb_frag_dma_map(&c2dev->pcidev->dev, frag, + 0, maplen, DMA_TO_DEVICE); + elem = elem->next; + elem->skb = NULL; + elem->mapaddr = mapaddr; + elem->maplen = maplen; + + /* Tell HW to xmit */ + __raw_writeq((__force u64) cpu_to_be64(mapaddr), + elem->hw_desc + C2_TXP_ADDR); + __raw_writew((__force u16) cpu_to_be16(maplen), + elem->hw_desc + C2_TXP_LEN); + __raw_writew((__force u16) cpu_to_be16(TXP_HTXD_READY), + elem->hw_desc + C2_TXP_FLAGS); + + netdev->stats.tx_packets++; + netdev->stats.tx_bytes += maplen; + } + } + + tx_ring->to_use = elem->next; + c2_port->tx_avail -= (skb_shinfo(skb)->nr_frags + 1); + + if (c2_port->tx_avail <= MAX_SKB_FRAGS + 1) { + netif_stop_queue(netdev); + if (netif_msg_tx_queued(c2_port)) + pr_debug("%s: transmit queue full\n", + netdev->name); + } + + spin_unlock_irqrestore(&c2_port->tx_lock, flags); + + netdev->trans_start = jiffies; + + return NETDEV_TX_OK; +} + +static void c2_tx_timeout(struct net_device *netdev) +{ + struct c2_port *c2_port = netdev_priv(netdev); + + if (netif_msg_timer(c2_port)) + pr_debug("%s: tx timeout\n", netdev->name); + + c2_tx_clean(c2_port); +} + +static int c2_change_mtu(struct net_device *netdev, int new_mtu) +{ + int ret = 0; + + if (new_mtu < ETH_ZLEN || new_mtu > ETH_JUMBO_MTU) + return -EINVAL; + + netdev->mtu = new_mtu; + + if (netif_running(netdev)) { + c2_down(netdev); + + c2_up(netdev); + } + + return ret; +} + +static const struct net_device_ops c2_netdev = { + .ndo_open = c2_up, + .ndo_stop = c2_down, + .ndo_start_xmit = c2_xmit_frame, + .ndo_tx_timeout = c2_tx_timeout, + .ndo_change_mtu = c2_change_mtu, + .ndo_set_mac_address = eth_mac_addr, + .ndo_validate_addr = eth_validate_addr, +}; + +/* Initialize network device */ +static struct net_device *c2_devinit(struct c2_dev *c2dev, + void __iomem * mmio_addr) +{ + struct c2_port *c2_port = NULL; + struct net_device *netdev = alloc_etherdev(sizeof(*c2_port)); + + if (!netdev) { + pr_debug("c2_port etherdev alloc failed"); + return NULL; + } + + SET_NETDEV_DEV(netdev, &c2dev->pcidev->dev); + + netdev->netdev_ops = &c2_netdev; + netdev->watchdog_timeo = C2_TX_TIMEOUT; + netdev->irq = c2dev->pcidev->irq; + + c2_port = netdev_priv(netdev); + c2_port->netdev = netdev; + c2_port->c2dev = c2dev; + c2_port->msg_enable = netif_msg_init(debug, default_msg); + c2_port->tx_ring.count = C2_NUM_TX_DESC; + c2_port->rx_ring.count = C2_NUM_RX_DESC; + + spin_lock_init(&c2_port->tx_lock); + + /* Copy our 48-bit ethernet hardware address */ + memcpy_fromio(netdev->dev_addr, mmio_addr + C2_REGS_ENADDR, 6); + + /* Validate the MAC address */ + if (!is_valid_ether_addr(netdev->dev_addr)) { + pr_debug("Invalid MAC Address\n"); + c2_print_macaddr(netdev); + free_netdev(netdev); + return NULL; + } + + c2dev->netdev = netdev; + + return netdev; +} + +static int c2_probe(struct pci_dev *pcidev, const struct pci_device_id *ent) +{ + int ret = 0, i; + unsigned long reg0_start, reg0_flags, reg0_len; + unsigned long reg2_start, reg2_flags, reg2_len; + unsigned long reg4_start, reg4_flags, reg4_len; + unsigned kva_map_size; + struct net_device *netdev = NULL; + struct c2_dev *c2dev = NULL; + void __iomem *mmio_regs = NULL; + + printk(KERN_INFO PFX "AMSO1100 Gigabit Ethernet driver v%s loaded\n", + DRV_VERSION); + + /* Enable PCI device */ + ret = pci_enable_device(pcidev); + if (ret) { + printk(KERN_ERR PFX "%s: Unable to enable PCI device\n", + pci_name(pcidev)); + goto bail0; + } + + reg0_start = pci_resource_start(pcidev, BAR_0); + reg0_len = pci_resource_len(pcidev, BAR_0); + reg0_flags = pci_resource_flags(pcidev, BAR_0); + + reg2_start = pci_resource_start(pcidev, BAR_2); + reg2_len = pci_resource_len(pcidev, BAR_2); + reg2_flags = pci_resource_flags(pcidev, BAR_2); + + reg4_start = pci_resource_start(pcidev, BAR_4); + reg4_len = pci_resource_len(pcidev, BAR_4); + reg4_flags = pci_resource_flags(pcidev, BAR_4); + + pr_debug("BAR0 size = 0x%lX bytes\n", reg0_len); + pr_debug("BAR2 size = 0x%lX bytes\n", reg2_len); + pr_debug("BAR4 size = 0x%lX bytes\n", reg4_len); + + /* Make sure PCI base addr are MMIO */ + if (!(reg0_flags & IORESOURCE_MEM) || + !(reg2_flags & IORESOURCE_MEM) || !(reg4_flags & IORESOURCE_MEM)) { + printk(KERN_ERR PFX "PCI regions not an MMIO resource\n"); + ret = -ENODEV; + goto bail1; + } + + /* Check for weird/broken PCI region reporting */ + if ((reg0_len < C2_REG0_SIZE) || + (reg2_len < C2_REG2_SIZE) || (reg4_len < C2_REG4_SIZE)) { + printk(KERN_ERR PFX "Invalid PCI region sizes\n"); + ret = -ENODEV; + goto bail1; + } + + /* Reserve PCI I/O and memory resources */ + ret = pci_request_regions(pcidev, DRV_NAME); + if (ret) { + printk(KERN_ERR PFX "%s: Unable to request regions\n", + pci_name(pcidev)); + goto bail1; + } + + if ((sizeof(dma_addr_t) > 4)) { + ret = pci_set_dma_mask(pcidev, DMA_BIT_MASK(64)); + if (ret < 0) { + printk(KERN_ERR PFX "64b DMA configuration failed\n"); + goto bail2; + } + } else { + ret = pci_set_dma_mask(pcidev, DMA_BIT_MASK(32)); + if (ret < 0) { + printk(KERN_ERR PFX "32b DMA configuration failed\n"); + goto bail2; + } + } + + /* Enables bus-mastering on the device */ + pci_set_master(pcidev); + + /* Remap the adapter PCI registers in BAR4 */ + mmio_regs = ioremap_nocache(reg4_start + C2_PCI_REGS_OFFSET, + sizeof(struct c2_adapter_pci_regs)); + if (!mmio_regs) { + printk(KERN_ERR PFX + "Unable to remap adapter PCI registers in BAR4\n"); + ret = -EIO; + goto bail2; + } + + /* Validate PCI regs magic */ + for (i = 0; i < sizeof(c2_magic); i++) { + if (c2_magic[i] != readb(mmio_regs + C2_REGS_MAGIC + i)) { + printk(KERN_ERR PFX "Downlevel Firmware boot loader " + "[%d/%Zd: got 0x%x, exp 0x%x]. Use the cc_flash " + "utility to update your boot loader\n", + i + 1, sizeof(c2_magic), + readb(mmio_regs + C2_REGS_MAGIC + i), + c2_magic[i]); + printk(KERN_ERR PFX "Adapter not claimed\n"); + iounmap(mmio_regs); + ret = -EIO; + goto bail2; + } + } + + /* Validate the adapter version */ + if (be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_VERS)) != C2_VERSION) { + printk(KERN_ERR PFX "Version mismatch " + "[fw=%u, c2=%u], Adapter not claimed\n", + be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_VERS)), + C2_VERSION); + ret = -EINVAL; + iounmap(mmio_regs); + goto bail2; + } + + /* Validate the adapter IVN */ + if (be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_IVN)) != C2_IVN) { + printk(KERN_ERR PFX "Downlevel FIrmware level. You should be using " + "the OpenIB device support kit. " + "[fw=0x%x, c2=0x%x], Adapter not claimed\n", + be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_IVN)), + C2_IVN); + ret = -EINVAL; + iounmap(mmio_regs); + goto bail2; + } + + /* Allocate hardware structure */ + c2dev = (struct c2_dev *) ib_alloc_device(sizeof(*c2dev)); + if (!c2dev) { + printk(KERN_ERR PFX "%s: Unable to alloc hardware struct\n", + pci_name(pcidev)); + ret = -ENOMEM; + iounmap(mmio_regs); + goto bail2; + } + + memset(c2dev, 0, sizeof(*c2dev)); + spin_lock_init(&c2dev->lock); + c2dev->pcidev = pcidev; + c2dev->cur_tx = 0; + + /* Get the last RX index */ + c2dev->cur_rx = + (be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_HRX_CUR)) - + 0xffffc000) / sizeof(struct c2_rxp_desc); + + /* Request an interrupt line for the driver */ + ret = request_irq(pcidev->irq, c2_interrupt, IRQF_SHARED, DRV_NAME, c2dev); + if (ret) { + printk(KERN_ERR PFX "%s: requested IRQ %u is busy\n", + pci_name(pcidev), pcidev->irq); + iounmap(mmio_regs); + goto bail3; + } + + /* Set driver specific data */ + pci_set_drvdata(pcidev, c2dev); + + /* Initialize network device */ + if ((netdev = c2_devinit(c2dev, mmio_regs)) == NULL) { + ret = -ENOMEM; + iounmap(mmio_regs); + goto bail4; + } + + /* Save off the actual size prior to unmapping mmio_regs */ + kva_map_size = be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_PCI_WINSIZE)); + + /* Unmap the adapter PCI registers in BAR4 */ + iounmap(mmio_regs); + + /* Register network device */ + ret = register_netdev(netdev); + if (ret) { + printk(KERN_ERR PFX "Unable to register netdev, ret = %d\n", + ret); + goto bail5; + } + + /* Disable network packets */ + netif_stop_queue(netdev); + + /* Remap the adapter HRXDQ PA space to kernel VA space */ + c2dev->mmio_rxp_ring = ioremap_nocache(reg4_start + C2_RXP_HRXDQ_OFFSET, + C2_RXP_HRXDQ_SIZE); + if (!c2dev->mmio_rxp_ring) { + printk(KERN_ERR PFX "Unable to remap MMIO HRXDQ region\n"); + ret = -EIO; + goto bail6; + } + + /* Remap the adapter HTXDQ PA space to kernel VA space */ + c2dev->mmio_txp_ring = ioremap_nocache(reg4_start + C2_TXP_HTXDQ_OFFSET, + C2_TXP_HTXDQ_SIZE); + if (!c2dev->mmio_txp_ring) { + printk(KERN_ERR PFX "Unable to remap MMIO HTXDQ region\n"); + ret = -EIO; + goto bail7; + } + + /* Save off the current RX index in the last 4 bytes of the TXP Ring */ + C2_SET_CUR_RX(c2dev, c2dev->cur_rx); + + /* Remap the PCI registers in adapter BAR0 to kernel VA space */ + c2dev->regs = ioremap_nocache(reg0_start, reg0_len); + if (!c2dev->regs) { + printk(KERN_ERR PFX "Unable to remap BAR0\n"); + ret = -EIO; + goto bail8; + } + + /* Remap the PCI registers in adapter BAR4 to kernel VA space */ + c2dev->pa = reg4_start + C2_PCI_REGS_OFFSET; + c2dev->kva = ioremap_nocache(reg4_start + C2_PCI_REGS_OFFSET, + kva_map_size); + if (!c2dev->kva) { + printk(KERN_ERR PFX "Unable to remap BAR4\n"); + ret = -EIO; + goto bail9; + } + + /* Print out the MAC address */ + c2_print_macaddr(netdev); + + ret = c2_rnic_init(c2dev); + if (ret) { + printk(KERN_ERR PFX "c2_rnic_init failed: %d\n", ret); + goto bail10; + } + + ret = c2_register_device(c2dev); + if (ret) + goto bail10; + + return 0; + + bail10: + iounmap(c2dev->kva); + + bail9: + iounmap(c2dev->regs); + + bail8: + iounmap(c2dev->mmio_txp_ring); + + bail7: + iounmap(c2dev->mmio_rxp_ring); + + bail6: + unregister_netdev(netdev); + + bail5: + free_netdev(netdev); + + bail4: + free_irq(pcidev->irq, c2dev); + + bail3: + ib_dealloc_device(&c2dev->ibdev); + + bail2: + pci_release_regions(pcidev); + + bail1: + pci_disable_device(pcidev); + + bail0: + return ret; +} + +static void c2_remove(struct pci_dev *pcidev) +{ + struct c2_dev *c2dev = pci_get_drvdata(pcidev); + struct net_device *netdev = c2dev->netdev; + + /* Unregister with OpenIB */ + c2_unregister_device(c2dev); + + /* Clean up the RNIC resources */ + c2_rnic_term(c2dev); + + /* Remove network device from the kernel */ + unregister_netdev(netdev); + + /* Free network device */ + free_netdev(netdev); + + /* Free the interrupt line */ + free_irq(pcidev->irq, c2dev); + + /* missing: Turn LEDs off here */ + + /* Unmap adapter PA space */ + iounmap(c2dev->kva); + iounmap(c2dev->regs); + iounmap(c2dev->mmio_txp_ring); + iounmap(c2dev->mmio_rxp_ring); + + /* Free the hardware structure */ + ib_dealloc_device(&c2dev->ibdev); + + /* Release reserved PCI I/O and memory resources */ + pci_release_regions(pcidev); + + /* Disable PCI device */ + pci_disable_device(pcidev); + + /* Clear driver specific data */ + pci_set_drvdata(pcidev, NULL); +} + +static struct pci_driver c2_pci_driver = { + .name = DRV_NAME, + .id_table = c2_pci_table, + .probe = c2_probe, + .remove = c2_remove, +}; + +module_pci_driver(c2_pci_driver); diff --git a/drivers/infiniband/hw/amso1100/c2.h b/drivers/infiniband/hw/amso1100/c2.h new file mode 100644 index 0000000..d619d73 --- /dev/null +++ b/drivers/infiniband/hw/amso1100/c2.h @@ -0,0 +1,547 @@ +/* + * Copyright (c) 2005 Ammasso, Inc. All rights reserved. + * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __C2_H +#define __C2_H + +#include +#include +#include +#include +#include +#include + +#include "c2_provider.h" +#include "c2_mq.h" +#include "c2_status.h" + +#define DRV_NAME "c2" +#define DRV_VERSION "1.1" +#define PFX DRV_NAME ": " + +#define BAR_0 0 +#define BAR_2 2 +#define BAR_4 4 + +#define RX_BUF_SIZE (1536 + 8) +#define ETH_JUMBO_MTU 9000 +#define C2_MAGIC "CEPHEUS" +#define C2_VERSION 4 +#define C2_IVN (18 & 0x7fffffff) + +#define C2_REG0_SIZE (16 * 1024) +#define C2_REG2_SIZE (2 * 1024 * 1024) +#define C2_REG4_SIZE (256 * 1024 * 1024) +#define C2_NUM_TX_DESC 341 +#define C2_NUM_RX_DESC 256 +#define C2_PCI_REGS_OFFSET (0x10000) +#define C2_RXP_HRXDQ_OFFSET (((C2_REG4_SIZE)/2)) +#define C2_RXP_HRXDQ_SIZE (4096) +#define C2_TXP_HTXDQ_OFFSET (((C2_REG4_SIZE)/2) + C2_RXP_HRXDQ_SIZE) +#define C2_TXP_HTXDQ_SIZE (4096) +#define C2_TX_TIMEOUT (6*HZ) + +/* CEPHEUS */ +static const u8 c2_magic[] = { + 0x43, 0x45, 0x50, 0x48, 0x45, 0x55, 0x53 +}; + +enum adapter_pci_regs { + C2_REGS_MAGIC = 0x0000, + C2_REGS_VERS = 0x0008, + C2_REGS_IVN = 0x000C, + C2_REGS_PCI_WINSIZE = 0x0010, + C2_REGS_Q0_QSIZE = 0x0014, + C2_REGS_Q0_MSGSIZE = 0x0018, + C2_REGS_Q0_POOLSTART = 0x001C, + C2_REGS_Q0_SHARED = 0x0020, + C2_REGS_Q1_QSIZE = 0x0024, + C2_REGS_Q1_MSGSIZE = 0x0028, + C2_REGS_Q1_SHARED = 0x0030, + C2_REGS_Q2_QSIZE = 0x0034, + C2_REGS_Q2_MSGSIZE = 0x0038, + C2_REGS_Q2_SHARED = 0x0040, + C2_REGS_ENADDR = 0x004C, + C2_REGS_RDMA_ENADDR = 0x0054, + C2_REGS_HRX_CUR = 0x006C, +}; + +struct c2_adapter_pci_regs { + char reg_magic[8]; + u32 version; + u32 ivn; + u32 pci_window_size; + u32 q0_q_size; + u32 q0_msg_size; + u32 q0_pool_start; + u32 q0_shared; + u32 q1_q_size; + u32 q1_msg_size; + u32 q1_pool_start; + u32 q1_shared; + u32 q2_q_size; + u32 q2_msg_size; + u32 q2_pool_start; + u32 q2_shared; + u32 log_start; + u32 log_size; + u8 host_enaddr[8]; + u8 rdma_enaddr[8]; + u32 crash_entry; + u32 crash_ready[2]; + u32 fw_txd_cur; + u32 fw_hrxd_cur; + u32 fw_rxd_cur; +}; + +enum pci_regs { + C2_HISR = 0x0000, + C2_DISR = 0x0004, + C2_HIMR = 0x0008, + C2_DIMR = 0x000C, + C2_NISR0 = 0x0010, + C2_NISR1 = 0x0014, + C2_NIMR0 = 0x0018, + C2_NIMR1 = 0x001C, + C2_IDIS = 0x0020, +}; + +enum { + C2_PCI_HRX_INT = 1 << 8, + C2_PCI_HTX_INT = 1 << 17, + C2_PCI_HRX_QUI = 1 << 31, +}; + +/* + * Cepheus registers in BAR0. + */ +struct c2_pci_regs { + u32 hostisr; + u32 dmaisr; + u32 hostimr; + u32 dmaimr; + u32 netisr0; + u32 netisr1; + u32 netimr0; + u32 netimr1; + u32 int_disable; +}; + +/* TXP flags */ +enum c2_txp_flags { + TXP_HTXD_DONE = 0, + TXP_HTXD_READY = 1 << 0, + TXP_HTXD_UNINIT = 1 << 1, +}; + +/* RXP flags */ +enum c2_rxp_flags { + RXP_HRXD_UNINIT = 0, + RXP_HRXD_READY = 1 << 0, + RXP_HRXD_DONE = 1 << 1, +}; + +/* RXP status */ +enum c2_rxp_status { + RXP_HRXD_ZERO = 0, + RXP_HRXD_OK = 1 << 0, + RXP_HRXD_BUF_OV = 1 << 1, +}; + +/* TXP descriptor fields */ +enum txp_desc { + C2_TXP_FLAGS = 0x0000, + C2_TXP_LEN = 0x0002, + C2_TXP_ADDR = 0x0004, +}; + +/* RXP descriptor fields */ +enum rxp_desc { + C2_RXP_FLAGS = 0x0000, + C2_RXP_STATUS = 0x0002, + C2_RXP_COUNT = 0x0004, + C2_RXP_LEN = 0x0006, + C2_RXP_ADDR = 0x0008, +}; + +struct c2_txp_desc { + u16 flags; + u16 len; + u64 addr; +} __attribute__ ((packed)); + +struct c2_rxp_desc { + u16 flags; + u16 status; + u16 count; + u16 len; + u64 addr; +} __attribute__ ((packed)); + +struct c2_rxp_hdr { + u16 flags; + u16 status; + u16 len; + u16 rsvd; +} __attribute__ ((packed)); + +struct c2_tx_desc { + u32 len; + u32 status; + dma_addr_t next_offset; +}; + +struct c2_rx_desc { + u32 len; + u32 status; + dma_addr_t next_offset; +}; + +struct c2_alloc { + u32 last; + u32 max; + spinlock_t lock; + unsigned long *table; +}; + +struct c2_array { + struct { + void **page; + int used; + } *page_list; +}; + +/* + * The MQ shared pointer pool is organized as a linked list of + * chunks. Each chunk contains a linked list of free shared pointers + * that can be allocated to a given user mode client. + * + */ +struct sp_chunk { + struct sp_chunk *next; + dma_addr_t dma_addr; + DEFINE_DMA_UNMAP_ADDR(mapping); + u16 head; + u16 shared_ptr[0]; +}; + +struct c2_pd_table { + u32 last; + u32 max; + spinlock_t lock; + unsigned long *table; +}; + +struct c2_qp_table { + struct idr idr; + spinlock_t lock; +}; + +struct c2_element { + struct c2_element *next; + void *ht_desc; /* host descriptor */ + void __iomem *hw_desc; /* hardware descriptor */ + struct sk_buff *skb; + dma_addr_t mapaddr; + u32 maplen; +}; + +struct c2_ring { + struct c2_element *to_clean; + struct c2_element *to_use; + struct c2_element *start; + unsigned long count; +}; + +struct c2_dev { + struct ib_device ibdev; + void __iomem *regs; + void __iomem *mmio_txp_ring; /* remapped adapter memory for hw rings */ + void __iomem *mmio_rxp_ring; + spinlock_t lock; + struct pci_dev *pcidev; + struct net_device *netdev; + struct net_device *pseudo_netdev; + unsigned int cur_tx; + unsigned int cur_rx; + u32 adapter_handle; + int device_cap_flags; + void __iomem *kva; /* KVA device memory */ + unsigned long pa; /* PA device memory */ + void **qptr_array; + + struct kmem_cache *host_msg_cache; + + struct list_head cca_link; /* adapter list */ + struct list_head eh_wakeup_list; /* event wakeup list */ + wait_queue_head_t req_vq_wo; + + /* Cached RNIC properties */ + struct ib_device_attr props; + + struct c2_pd_table pd_table; + struct c2_qp_table qp_table; + int ports; /* num of GigE ports */ + int devnum; + spinlock_t vqlock; /* sync vbs req MQ */ + + /* Verbs Queues */ + struct c2_mq req_vq; /* Verbs Request MQ */ + struct c2_mq rep_vq; /* Verbs Reply MQ */ + struct c2_mq aeq; /* Async Events MQ */ + + /* Kernel client MQs */ + struct sp_chunk *kern_mqsp_pool; + + /* Device updates these values when posting messages to a host + * target queue */ + u16 req_vq_shared; + u16 rep_vq_shared; + u16 aeq_shared; + u16 irq_claimed; + + /* + * Shared host target pages for user-accessible MQs. + */ + int hthead; /* index of first free entry */ + void *htpages; /* kernel vaddr */ + int htlen; /* length of htpages memory */ + void *htuva; /* user mapped vaddr */ + spinlock_t htlock; /* serialize allocation */ + + u64 adapter_hint_uva; /* access to the activity FIFO */ + + // spinlock_t aeq_lock; + // spinlock_t rnic_lock; + + __be16 *hint_count; + dma_addr_t hint_count_dma; + u16 hints_read; + + int init; /* TRUE if it's ready */ + char ae_cache_name[16]; + char vq_cache_name[16]; +}; + +struct c2_port { + u32 msg_enable; + struct c2_dev *c2dev; + struct net_device *netdev; + + spinlock_t tx_lock; + u32 tx_avail; + struct c2_ring tx_ring; + struct c2_ring rx_ring; + + void *mem; /* PCI memory for host rings */ + dma_addr_t dma; + unsigned long mem_size; + + u32 rx_buf_size; +}; + +/* + * Activity FIFO registers in BAR0. + */ +#define PCI_BAR0_HOST_HINT 0x100 +#define PCI_BAR0_ADAPTER_HINT 0x2000 + +/* + * Ammasso PCI vendor id and Cepheus PCI device id. + */ +#define CQ_ARMED 0x01 +#define CQ_WAIT_FOR_DMA 0x80 + +/* + * The format of a hint is as follows: + * Lower 16 bits are the count of hints for the queue. + * Next 15 bits are the qp_index + * Upper most bit depends on who reads it: + * If read by producer, then it means Full (1) or Not-Full (0) + * If read by consumer, then it means Empty (1) or Not-Empty (0) + */ +#define C2_HINT_MAKE(q_index, hint_count) (((q_index) << 16) | hint_count) +#define C2_HINT_GET_INDEX(hint) (((hint) & 0x7FFF0000) >> 16) +#define C2_HINT_GET_COUNT(hint) ((hint) & 0x0000FFFF) + + +/* + * The following defines the offset in SDRAM for the c2_adapter_pci_regs_t + * struct. + */ +#define C2_ADAPTER_PCI_REGS_OFFSET 0x10000 + +#ifndef readq +static inline u64 readq(const void __iomem * addr) +{ + u64 ret = readl(addr + 4); + ret <<= 32; + ret |= readl(addr); + + return ret; +} +#endif + +#ifndef writeq +static inline void __raw_writeq(u64 val, void __iomem * addr) +{ + __raw_writel((u32) (val), addr); + __raw_writel((u32) (val >> 32), (addr + 4)); +} +#endif + +#define C2_SET_CUR_RX(c2dev, cur_rx) \ + __raw_writel((__force u32) cpu_to_be32(cur_rx), c2dev->mmio_txp_ring + 4092) + +#define C2_GET_CUR_RX(c2dev) \ + be32_to_cpu((__force __be32) readl(c2dev->mmio_txp_ring + 4092)) + +static inline struct c2_dev *to_c2dev(struct ib_device *ibdev) +{ + return container_of(ibdev, struct c2_dev, ibdev); +} + +static inline int c2_errno(void *reply) +{ + switch (c2_wr_get_result(reply)) { + case C2_OK: + return 0; + case CCERR_NO_BUFS: + case CCERR_INSUFFICIENT_RESOURCES: + case CCERR_ZERO_RDMA_READ_RESOURCES: + return -ENOMEM; + case CCERR_MR_IN_USE: + case CCERR_QP_IN_USE: + return -EBUSY; + case CCERR_ADDR_IN_USE: + return -EADDRINUSE; + case CCERR_ADDR_NOT_AVAIL: + return -EADDRNOTAVAIL; + case CCERR_CONN_RESET: + return -ECONNRESET; + case CCERR_NOT_IMPLEMENTED: + case CCERR_INVALID_WQE: + return -ENOSYS; + case CCERR_QP_NOT_PRIVILEGED: + return -EPERM; + case CCERR_STACK_ERROR: + return -EPROTO; + case CCERR_ACCESS_VIOLATION: + case CCERR_BASE_AND_BOUNDS_VIOLATION: + return -EFAULT; + case CCERR_STAG_STATE_NOT_INVALID: + case CCERR_INVALID_ADDRESS: + case CCERR_INVALID_CQ: + case CCERR_INVALID_EP: + case CCERR_INVALID_MODIFIER: + case CCERR_INVALID_MTU: + case CCERR_INVALID_PD_ID: + case CCERR_INVALID_QP: + case CCERR_INVALID_RNIC: + case CCERR_INVALID_STAG: + return -EINVAL; + default: + return -EAGAIN; + } +} + +/* Device */ +extern int c2_register_device(struct c2_dev *c2dev); +extern void c2_unregister_device(struct c2_dev *c2dev); +extern int c2_rnic_init(struct c2_dev *c2dev); +extern void c2_rnic_term(struct c2_dev *c2dev); +extern void c2_rnic_interrupt(struct c2_dev *c2dev); +extern int c2_del_addr(struct c2_dev *c2dev, __be32 inaddr, __be32 inmask); +extern int c2_add_addr(struct c2_dev *c2dev, __be32 inaddr, __be32 inmask); + +/* QPs */ +extern int c2_alloc_qp(struct c2_dev *c2dev, struct c2_pd *pd, + struct ib_qp_init_attr *qp_attrs, struct c2_qp *qp); +extern void c2_free_qp(struct c2_dev *c2dev, struct c2_qp *qp); +extern struct ib_qp *c2_get_qp(struct ib_device *device, int qpn); +extern int c2_qp_modify(struct c2_dev *c2dev, struct c2_qp *qp, + struct ib_qp_attr *attr, int attr_mask); +extern int c2_qp_set_read_limits(struct c2_dev *c2dev, struct c2_qp *qp, + int ord, int ird); +extern int c2_post_send(struct ib_qp *ibqp, struct ib_send_wr *ib_wr, + struct ib_send_wr **bad_wr); +extern int c2_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *ib_wr, + struct ib_recv_wr **bad_wr); +extern void c2_init_qp_table(struct c2_dev *c2dev); +extern void c2_cleanup_qp_table(struct c2_dev *c2dev); +extern void c2_set_qp_state(struct c2_qp *, int); +extern struct c2_qp *c2_find_qpn(struct c2_dev *c2dev, int qpn); + +/* PDs */ +extern int c2_pd_alloc(struct c2_dev *c2dev, int privileged, struct c2_pd *pd); +extern void c2_pd_free(struct c2_dev *c2dev, struct c2_pd *pd); +extern int c2_init_pd_table(struct c2_dev *c2dev); +extern void c2_cleanup_pd_table(struct c2_dev *c2dev); + +/* CQs */ +extern int c2_init_cq(struct c2_dev *c2dev, int entries, + struct c2_ucontext *ctx, struct c2_cq *cq); +extern void c2_free_cq(struct c2_dev *c2dev, struct c2_cq *cq); +extern void c2_cq_event(struct c2_dev *c2dev, u32 mq_index); +extern void c2_cq_clean(struct c2_dev *c2dev, struct c2_qp *qp, u32 mq_index); +extern int c2_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry); +extern int c2_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags); + +/* CM */ +extern int c2_llp_connect(struct iw_cm_id *cm_id, + struct iw_cm_conn_param *iw_param); +extern int c2_llp_accept(struct iw_cm_id *cm_id, + struct iw_cm_conn_param *iw_param); +extern int c2_llp_reject(struct iw_cm_id *cm_id, const void *pdata, + u8 pdata_len); +extern int c2_llp_service_create(struct iw_cm_id *cm_id, int backlog); +extern int c2_llp_service_destroy(struct iw_cm_id *cm_id); + +/* MM */ +extern int c2_nsmr_register_phys_kern(struct c2_dev *c2dev, u64 *addr_list, + int page_size, int pbl_depth, u32 length, + u32 off, u64 *va, enum c2_acf acf, + struct c2_mr *mr); +extern int c2_stag_dealloc(struct c2_dev *c2dev, u32 stag_index); + +/* AE */ +extern void c2_ae_event(struct c2_dev *c2dev, u32 mq_index); + +/* MQSP Allocator */ +extern int c2_init_mqsp_pool(struct c2_dev *c2dev, gfp_t gfp_mask, + struct sp_chunk **root); +extern void c2_free_mqsp_pool(struct c2_dev *c2dev, struct sp_chunk *root); +extern __be16 *c2_alloc_mqsp(struct c2_dev *c2dev, struct sp_chunk *head, + dma_addr_t *dma_addr, gfp_t gfp_mask); +extern void c2_free_mqsp(__be16* mqsp); +#endif diff --git a/drivers/infiniband/hw/amso1100/c2_ae.c b/drivers/infiniband/hw/amso1100/c2_ae.c new file mode 100644 index 0000000..cedda25 --- /dev/null +++ b/drivers/infiniband/hw/amso1100/c2_ae.c @@ -0,0 +1,327 @@ +/* + * Copyright (c) 2005 Ammasso, Inc. All rights reserved. + * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "c2.h" +#include +#include "c2_status.h" +#include "c2_ae.h" + +static int c2_convert_cm_status(u32 c2_status) +{ + switch (c2_status) { + case C2_CONN_STATUS_SUCCESS: + return 0; + case C2_CONN_STATUS_REJECTED: + return -ENETRESET; + case C2_CONN_STATUS_REFUSED: + return -ECONNREFUSED; + case C2_CONN_STATUS_TIMEDOUT: + return -ETIMEDOUT; + case C2_CONN_STATUS_NETUNREACH: + return -ENETUNREACH; + case C2_CONN_STATUS_HOSTUNREACH: + return -EHOSTUNREACH; + case C2_CONN_STATUS_INVALID_RNIC: + return -EINVAL; + case C2_CONN_STATUS_INVALID_QP: + return -EINVAL; + case C2_CONN_STATUS_INVALID_QP_STATE: + return -EINVAL; + case C2_CONN_STATUS_ADDR_NOT_AVAIL: + return -EADDRNOTAVAIL; + default: + printk(KERN_ERR PFX + "%s - Unable to convert CM status: %d\n", + __func__, c2_status); + return -EIO; + } +} + +static const char* to_event_str(int event) +{ + static const char* event_str[] = { + "CCAE_REMOTE_SHUTDOWN", + "CCAE_ACTIVE_CONNECT_RESULTS", + "CCAE_CONNECTION_REQUEST", + "CCAE_LLP_CLOSE_COMPLETE", + "CCAE_TERMINATE_MESSAGE_RECEIVED", + "CCAE_LLP_CONNECTION_RESET", + "CCAE_LLP_CONNECTION_LOST", + "CCAE_LLP_SEGMENT_SIZE_INVALID", + "CCAE_LLP_INVALID_CRC", + "CCAE_LLP_BAD_FPDU", + "CCAE_INVALID_DDP_VERSION", + "CCAE_INVALID_RDMA_VERSION", + "CCAE_UNEXPECTED_OPCODE", + "CCAE_INVALID_DDP_QUEUE_NUMBER", + "CCAE_RDMA_READ_NOT_ENABLED", + "CCAE_RDMA_WRITE_NOT_ENABLED", + "CCAE_RDMA_READ_TOO_SMALL", + "CCAE_NO_L_BIT", + "CCAE_TAGGED_INVALID_STAG", + "CCAE_TAGGED_BASE_BOUNDS_VIOLATION", + "CCAE_TAGGED_ACCESS_RIGHTS_VIOLATION", + "CCAE_TAGGED_INVALID_PD", + "CCAE_WRAP_ERROR", + "CCAE_BAD_CLOSE", + "CCAE_BAD_LLP_CLOSE", + "CCAE_INVALID_MSN_RANGE", + "CCAE_INVALID_MSN_GAP", + "CCAE_IRRQ_OVERFLOW", + "CCAE_IRRQ_MSN_GAP", + "CCAE_IRRQ_MSN_RANGE", + "CCAE_IRRQ_INVALID_STAG", + "CCAE_IRRQ_BASE_BOUNDS_VIOLATION", + "CCAE_IRRQ_ACCESS_RIGHTS_VIOLATION", + "CCAE_IRRQ_INVALID_PD", + "CCAE_IRRQ_WRAP_ERROR", + "CCAE_CQ_SQ_COMPLETION_OVERFLOW", + "CCAE_CQ_RQ_COMPLETION_ERROR", + "CCAE_QP_SRQ_WQE_ERROR", + "CCAE_QP_LOCAL_CATASTROPHIC_ERROR", + "CCAE_CQ_OVERFLOW", + "CCAE_CQ_OPERATION_ERROR", + "CCAE_SRQ_LIMIT_REACHED", + "CCAE_QP_RQ_LIMIT_REACHED", + "CCAE_SRQ_CATASTROPHIC_ERROR", + "CCAE_RNIC_CATASTROPHIC_ERROR" + }; + + if (event < CCAE_REMOTE_SHUTDOWN || + event > CCAE_RNIC_CATASTROPHIC_ERROR) + return ""; + + event -= CCAE_REMOTE_SHUTDOWN; + return event_str[event]; +} + +static const char *to_qp_state_str(int state) +{ + switch (state) { + case C2_QP_STATE_IDLE: + return "C2_QP_STATE_IDLE"; + case C2_QP_STATE_CONNECTING: + return "C2_QP_STATE_CONNECTING"; + case C2_QP_STATE_RTS: + return "C2_QP_STATE_RTS"; + case C2_QP_STATE_CLOSING: + return "C2_QP_STATE_CLOSING"; + case C2_QP_STATE_TERMINATE: + return "C2_QP_STATE_TERMINATE"; + case C2_QP_STATE_ERROR: + return "C2_QP_STATE_ERROR"; + default: + return ""; + } +} + +void c2_ae_event(struct c2_dev *c2dev, u32 mq_index) +{ + struct c2_mq *mq = c2dev->qptr_array[mq_index]; + union c2wr *wr; + void *resource_user_context; + struct iw_cm_event cm_event; + struct ib_event ib_event; + enum c2_resource_indicator resource_indicator; + enum c2_event_id event_id; + unsigned long flags; + int status; + struct sockaddr_in *laddr = (struct sockaddr_in *)&cm_event.local_addr; + struct sockaddr_in *raddr = (struct sockaddr_in *)&cm_event.remote_addr; + + /* + * retrieve the message + */ + wr = c2_mq_consume(mq); + if (!wr) + return; + + memset(&ib_event, 0, sizeof(ib_event)); + memset(&cm_event, 0, sizeof(cm_event)); + + event_id = c2_wr_get_id(wr); + resource_indicator = be32_to_cpu(wr->ae.ae_generic.resource_type); + resource_user_context = + (void *) (unsigned long) wr->ae.ae_generic.user_context; + + status = cm_event.status = c2_convert_cm_status(c2_wr_get_result(wr)); + + pr_debug("event received c2_dev=%p, event_id=%d, " + "resource_indicator=%d, user_context=%p, status = %d\n", + c2dev, event_id, resource_indicator, resource_user_context, + status); + + switch (resource_indicator) { + case C2_RES_IND_QP:{ + + struct c2_qp *qp = (struct c2_qp *)resource_user_context; + struct iw_cm_id *cm_id = qp->cm_id; + struct c2wr_ae_active_connect_results *res; + + if (!cm_id) { + pr_debug("event received, but cm_id is , qp=%p!\n", + qp); + goto ignore_it; + } + pr_debug("%s: event = %s, user_context=%llx, " + "resource_type=%x, " + "resource=%x, qp_state=%s\n", + __func__, + to_event_str(event_id), + (unsigned long long) wr->ae.ae_generic.user_context, + be32_to_cpu(wr->ae.ae_generic.resource_type), + be32_to_cpu(wr->ae.ae_generic.resource), + to_qp_state_str(be32_to_cpu(wr->ae.ae_generic.qp_state))); + + c2_set_qp_state(qp, be32_to_cpu(wr->ae.ae_generic.qp_state)); + + switch (event_id) { + case CCAE_ACTIVE_CONNECT_RESULTS: + res = &wr->ae.ae_active_connect_results; + cm_event.event = IW_CM_EVENT_CONNECT_REPLY; + laddr->sin_addr.s_addr = res->laddr; + raddr->sin_addr.s_addr = res->raddr; + laddr->sin_port = res->lport; + raddr->sin_port = res->rport; + if (status == 0) { + cm_event.private_data_len = + be32_to_cpu(res->private_data_length); + cm_event.private_data = res->private_data; + } else { + spin_lock_irqsave(&qp->lock, flags); + if (qp->cm_id) { + qp->cm_id->rem_ref(qp->cm_id); + qp->cm_id = NULL; + } + spin_unlock_irqrestore(&qp->lock, flags); + cm_event.private_data_len = 0; + cm_event.private_data = NULL; + } + if (cm_id->event_handler) + cm_id->event_handler(cm_id, &cm_event); + break; + case CCAE_TERMINATE_MESSAGE_RECEIVED: + case CCAE_CQ_SQ_COMPLETION_OVERFLOW: + ib_event.device = &c2dev->ibdev; + ib_event.element.qp = &qp->ibqp; + ib_event.event = IB_EVENT_QP_REQ_ERR; + + if (qp->ibqp.event_handler) + qp->ibqp.event_handler(&ib_event, + qp->ibqp. + qp_context); + break; + case CCAE_BAD_CLOSE: + case CCAE_LLP_CLOSE_COMPLETE: + case CCAE_LLP_CONNECTION_RESET: + case CCAE_LLP_CONNECTION_LOST: + BUG_ON(cm_id->event_handler==(void*)0x6b6b6b6b); + + spin_lock_irqsave(&qp->lock, flags); + if (qp->cm_id) { + qp->cm_id->rem_ref(qp->cm_id); + qp->cm_id = NULL; + } + spin_unlock_irqrestore(&qp->lock, flags); + cm_event.event = IW_CM_EVENT_CLOSE; + cm_event.status = 0; + if (cm_id->event_handler) + cm_id->event_handler(cm_id, &cm_event); + break; + default: + BUG_ON(1); + pr_debug("%s:%d Unexpected event_id=%d on QP=%p, " + "CM_ID=%p\n", + __func__, __LINE__, + event_id, qp, cm_id); + break; + } + break; + } + + case C2_RES_IND_EP:{ + + struct c2wr_ae_connection_request *req = + &wr->ae.ae_connection_request; + struct iw_cm_id *cm_id = + (struct iw_cm_id *)resource_user_context; + + pr_debug("C2_RES_IND_EP event_id=%d\n", event_id); + if (event_id != CCAE_CONNECTION_REQUEST) { + pr_debug("%s: Invalid event_id: %d\n", + __func__, event_id); + break; + } + cm_event.event = IW_CM_EVENT_CONNECT_REQUEST; + cm_event.provider_data = (void*)(unsigned long)req->cr_handle; + laddr->sin_addr.s_addr = req->laddr; + raddr->sin_addr.s_addr = req->raddr; + laddr->sin_port = req->lport; + raddr->sin_port = req->rport; + cm_event.private_data_len = + be32_to_cpu(req->private_data_length); + cm_event.private_data = req->private_data; + /* + * Until ird/ord negotiation via MPAv2 support is added, send + * max supported values + */ + cm_event.ird = cm_event.ord = 128; + + if (cm_id->event_handler) + cm_id->event_handler(cm_id, &cm_event); + break; + } + + case C2_RES_IND_CQ:{ + struct c2_cq *cq = + (struct c2_cq *) resource_user_context; + + pr_debug("IB_EVENT_CQ_ERR\n"); + ib_event.device = &c2dev->ibdev; + ib_event.element.cq = &cq->ibcq; + ib_event.event = IB_EVENT_CQ_ERR; + + if (cq->ibcq.event_handler) + cq->ibcq.event_handler(&ib_event, + cq->ibcq.cq_context); + break; + } + + default: + printk("Bad resource indicator = %d\n", + resource_indicator); + break; + } + + ignore_it: + c2_mq_free(mq); +} diff --git a/drivers/infiniband/hw/amso1100/c2_ae.h b/drivers/infiniband/hw/amso1100/c2_ae.h new file mode 100644 index 0000000..3a065c3 --- /dev/null +++ b/drivers/infiniband/hw/amso1100/c2_ae.h @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2005 Ammasso, Inc. All rights reserved. + * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef _C2_AE_H_ +#define _C2_AE_H_ + +/* + * WARNING: If you change this file, also bump C2_IVN_BASE + * in common/include/clustercore/c2_ivn.h. + */ + +/* + * Asynchronous Event Identifiers + * + * These start at 0x80 only so it's obvious from inspection that + * they are not work-request statuses. This isn't critical. + * + * NOTE: these event id's must fit in eight bits. + */ +enum c2_event_id { + CCAE_REMOTE_SHUTDOWN = 0x80, + CCAE_ACTIVE_CONNECT_RESULTS, + CCAE_CONNECTION_REQUEST, + CCAE_LLP_CLOSE_COMPLETE, + CCAE_TERMINATE_MESSAGE_RECEIVED, + CCAE_LLP_CONNECTION_RESET, + CCAE_LLP_CONNECTION_LOST, + CCAE_LLP_SEGMENT_SIZE_INVALID, + CCAE_LLP_INVALID_CRC, + CCAE_LLP_BAD_FPDU, + CCAE_INVALID_DDP_VERSION, + CCAE_INVALID_RDMA_VERSION, + CCAE_UNEXPECTED_OPCODE, + CCAE_INVALID_DDP_QUEUE_NUMBER, + CCAE_RDMA_READ_NOT_ENABLED, + CCAE_RDMA_WRITE_NOT_ENABLED, + CCAE_RDMA_READ_TOO_SMALL, + CCAE_NO_L_BIT, + CCAE_TAGGED_INVALID_STAG, + CCAE_TAGGED_BASE_BOUNDS_VIOLATION, + CCAE_TAGGED_ACCESS_RIGHTS_VIOLATION, + CCAE_TAGGED_INVALID_PD, + CCAE_WRAP_ERROR, + CCAE_BAD_CLOSE, + CCAE_BAD_LLP_CLOSE, + CCAE_INVALID_MSN_RANGE, + CCAE_INVALID_MSN_GAP, + CCAE_IRRQ_OVERFLOW, + CCAE_IRRQ_MSN_GAP, + CCAE_IRRQ_MSN_RANGE, + CCAE_IRRQ_INVALID_STAG, + CCAE_IRRQ_BASE_BOUNDS_VIOLATION, + CCAE_IRRQ_ACCESS_RIGHTS_VIOLATION, + CCAE_IRRQ_INVALID_PD, + CCAE_IRRQ_WRAP_ERROR, + CCAE_CQ_SQ_COMPLETION_OVERFLOW, + CCAE_CQ_RQ_COMPLETION_ERROR, + CCAE_QP_SRQ_WQE_ERROR, + CCAE_QP_LOCAL_CATASTROPHIC_ERROR, + CCAE_CQ_OVERFLOW, + CCAE_CQ_OPERATION_ERROR, + CCAE_SRQ_LIMIT_REACHED, + CCAE_QP_RQ_LIMIT_REACHED, + CCAE_SRQ_CATASTROPHIC_ERROR, + CCAE_RNIC_CATASTROPHIC_ERROR +/* WARNING If you add more id's, make sure their values fit in eight bits. */ +}; + +/* + * Resource Indicators and Identifiers + */ +enum c2_resource_indicator { + C2_RES_IND_QP = 1, + C2_RES_IND_EP, + C2_RES_IND_CQ, + C2_RES_IND_SRQ, +}; + +#endif /* _C2_AE_H_ */ diff --git a/drivers/infiniband/hw/amso1100/c2_alloc.c b/drivers/infiniband/hw/amso1100/c2_alloc.c new file mode 100644 index 0000000..78d247e --- /dev/null +++ b/drivers/infiniband/hw/amso1100/c2_alloc.c @@ -0,0 +1,142 @@ +/* + * Copyright (c) 2004 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include + +#include "c2.h" + +static int c2_alloc_mqsp_chunk(struct c2_dev *c2dev, gfp_t gfp_mask, + struct sp_chunk **head) +{ + int i; + struct sp_chunk *new_head; + dma_addr_t dma_addr; + + new_head = dma_alloc_coherent(&c2dev->pcidev->dev, PAGE_SIZE, + &dma_addr, gfp_mask); + if (new_head == NULL) + return -ENOMEM; + + new_head->dma_addr = dma_addr; + dma_unmap_addr_set(new_head, mapping, new_head->dma_addr); + + new_head->next = NULL; + new_head->head = 0; + + /* build list where each index is the next free slot */ + for (i = 0; + i < (PAGE_SIZE - sizeof(struct sp_chunk) - + sizeof(u16)) / sizeof(u16) - 1; + i++) { + new_head->shared_ptr[i] = i + 1; + } + /* terminate list */ + new_head->shared_ptr[i] = 0xFFFF; + + *head = new_head; + return 0; +} + +int c2_init_mqsp_pool(struct c2_dev *c2dev, gfp_t gfp_mask, + struct sp_chunk **root) +{ + return c2_alloc_mqsp_chunk(c2dev, gfp_mask, root); +} + +void c2_free_mqsp_pool(struct c2_dev *c2dev, struct sp_chunk *root) +{ + struct sp_chunk *next; + + while (root) { + next = root->next; + dma_free_coherent(&c2dev->pcidev->dev, PAGE_SIZE, root, + dma_unmap_addr(root, mapping)); + root = next; + } +} + +__be16 *c2_alloc_mqsp(struct c2_dev *c2dev, struct sp_chunk *head, + dma_addr_t *dma_addr, gfp_t gfp_mask) +{ + u16 mqsp; + + while (head) { + mqsp = head->head; + if (mqsp != 0xFFFF) { + head->head = head->shared_ptr[mqsp]; + break; + } else if (head->next == NULL) { + if (c2_alloc_mqsp_chunk(c2dev, gfp_mask, &head->next) == + 0) { + head = head->next; + mqsp = head->head; + head->head = head->shared_ptr[mqsp]; + break; + } else + return NULL; + } else + head = head->next; + } + if (head) { + *dma_addr = head->dma_addr + + ((unsigned long) &(head->shared_ptr[mqsp]) - + (unsigned long) head); + pr_debug("%s addr %p dma_addr %llx\n", __func__, + &(head->shared_ptr[mqsp]), (unsigned long long) *dma_addr); + return (__force __be16 *) &(head->shared_ptr[mqsp]); + } + return NULL; +} + +void c2_free_mqsp(__be16 *mqsp) +{ + struct sp_chunk *head; + u16 idx; + + /* The chunk containing this ptr begins at the page boundary */ + head = (struct sp_chunk *) ((unsigned long) mqsp & PAGE_MASK); + + /* Link head to new mqsp */ + *mqsp = (__force __be16) head->head; + + /* Compute the shared_ptr index */ + idx = ((unsigned long) mqsp & ~PAGE_MASK) >> 1; + idx -= (unsigned long) &(((struct sp_chunk *) 0)->shared_ptr[0]) >> 1; + + /* Point this index at the head */ + head->shared_ptr[idx] = head->head; + + /* Point head at this index */ + head->head = idx; +} diff --git a/drivers/infiniband/hw/amso1100/c2_cm.c b/drivers/infiniband/hw/amso1100/c2_cm.c new file mode 100644 index 0000000..23bfa94 --- /dev/null +++ b/drivers/infiniband/hw/amso1100/c2_cm.c @@ -0,0 +1,461 @@ +/* + * Copyright (c) 2005 Ammasso, Inc. All rights reserved. + * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include + +#include "c2.h" +#include "c2_wr.h" +#include "c2_vq.h" +#include + +int c2_llp_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param) +{ + struct c2_dev *c2dev = to_c2dev(cm_id->device); + struct ib_qp *ibqp; + struct c2_qp *qp; + struct c2wr_qp_connect_req *wr; /* variable size needs a malloc. */ + struct c2_vq_req *vq_req; + int err; + struct sockaddr_in *raddr = (struct sockaddr_in *)&cm_id->remote_addr; + + if (cm_id->remote_addr.ss_family != AF_INET) + return -ENOSYS; + + ibqp = c2_get_qp(cm_id->device, iw_param->qpn); + if (!ibqp) + return -EINVAL; + qp = to_c2qp(ibqp); + + /* Associate QP <--> CM_ID */ + cm_id->provider_data = qp; + cm_id->add_ref(cm_id); + qp->cm_id = cm_id; + + /* + * only support the max private_data length + */ + if (iw_param->private_data_len > C2_MAX_PRIVATE_DATA_SIZE) { + err = -EINVAL; + goto bail0; + } + /* + * Set the rdma read limits + */ + err = c2_qp_set_read_limits(c2dev, qp, iw_param->ord, iw_param->ird); + if (err) + goto bail0; + + /* + * Create and send a WR_QP_CONNECT... + */ + wr = kmalloc(c2dev->req_vq.msg_size, GFP_KERNEL); + if (!wr) { + err = -ENOMEM; + goto bail0; + } + + vq_req = vq_req_alloc(c2dev); + if (!vq_req) { + err = -ENOMEM; + goto bail1; + } + + c2_wr_set_id(wr, CCWR_QP_CONNECT); + wr->hdr.context = 0; + wr->rnic_handle = c2dev->adapter_handle; + wr->qp_handle = qp->adapter_handle; + + wr->remote_addr = raddr->sin_addr.s_addr; + wr->remote_port = raddr->sin_port; + + /* + * Move any private data from the callers's buf into + * the WR. + */ + if (iw_param->private_data) { + wr->private_data_length = + cpu_to_be32(iw_param->private_data_len); + memcpy(&wr->private_data[0], iw_param->private_data, + iw_param->private_data_len); + } else + wr->private_data_length = 0; + + /* + * Send WR to adapter. NOTE: There is no synch reply from + * the adapter. + */ + err = vq_send_wr(c2dev, (union c2wr *) wr); + vq_req_free(c2dev, vq_req); + + bail1: + kfree(wr); + bail0: + if (err) { + /* + * If we fail, release reference on QP and + * disassociate QP from CM_ID + */ + cm_id->provider_data = NULL; + qp->cm_id = NULL; + cm_id->rem_ref(cm_id); + } + return err; +} + +int c2_llp_service_create(struct iw_cm_id *cm_id, int backlog) +{ + struct c2_dev *c2dev; + struct c2wr_ep_listen_create_req wr; + struct c2wr_ep_listen_create_rep *reply; + struct c2_vq_req *vq_req; + int err; + struct sockaddr_in *laddr = (struct sockaddr_in *)&cm_id->local_addr; + + if (cm_id->local_addr.ss_family != AF_INET) + return -ENOSYS; + + c2dev = to_c2dev(cm_id->device); + if (c2dev == NULL) + return -EINVAL; + + /* + * Allocate verbs request. + */ + vq_req = vq_req_alloc(c2dev); + if (!vq_req) + return -ENOMEM; + + /* + * Build the WR + */ + c2_wr_set_id(&wr, CCWR_EP_LISTEN_CREATE); + wr.hdr.context = (u64) (unsigned long) vq_req; + wr.rnic_handle = c2dev->adapter_handle; + wr.local_addr = laddr->sin_addr.s_addr; + wr.local_port = laddr->sin_port; + wr.backlog = cpu_to_be32(backlog); + wr.user_context = (u64) (unsigned long) cm_id; + + /* + * Reference the request struct. Dereferenced in the int handler. + */ + vq_req_get(c2dev, vq_req); + + /* + * Send WR to adapter + */ + err = vq_send_wr(c2dev, (union c2wr *) & wr); + if (err) { + vq_req_put(c2dev, vq_req); + goto bail0; + } + + /* + * Wait for reply from adapter + */ + err = vq_wait_for_reply(c2dev, vq_req); + if (err) + goto bail0; + + /* + * Process reply + */ + reply = + (struct c2wr_ep_listen_create_rep *) (unsigned long) vq_req->reply_msg; + if (!reply) { + err = -ENOMEM; + goto bail1; + } + + if ((err = c2_errno(reply)) != 0) + goto bail1; + + /* + * Keep the adapter handle. Used in subsequent destroy + */ + cm_id->provider_data = (void*)(unsigned long) reply->ep_handle; + + /* + * free vq stuff + */ + vq_repbuf_free(c2dev, reply); + vq_req_free(c2dev, vq_req); + + return 0; + + bail1: + vq_repbuf_free(c2dev, reply); + bail0: + vq_req_free(c2dev, vq_req); + return err; +} + + +int c2_llp_service_destroy(struct iw_cm_id *cm_id) +{ + + struct c2_dev *c2dev; + struct c2wr_ep_listen_destroy_req wr; + struct c2wr_ep_listen_destroy_rep *reply; + struct c2_vq_req *vq_req; + int err; + + c2dev = to_c2dev(cm_id->device); + if (c2dev == NULL) + return -EINVAL; + + /* + * Allocate verbs request. + */ + vq_req = vq_req_alloc(c2dev); + if (!vq_req) + return -ENOMEM; + + /* + * Build the WR + */ + c2_wr_set_id(&wr, CCWR_EP_LISTEN_DESTROY); + wr.hdr.context = (unsigned long) vq_req; + wr.rnic_handle = c2dev->adapter_handle; + wr.ep_handle = (u32)(unsigned long)cm_id->provider_data; + + /* + * reference the request struct. dereferenced in the int handler. + */ + vq_req_get(c2dev, vq_req); + + /* + * Send WR to adapter + */ + err = vq_send_wr(c2dev, (union c2wr *) & wr); + if (err) { + vq_req_put(c2dev, vq_req); + goto bail0; + } + + /* + * Wait for reply from adapter + */ + err = vq_wait_for_reply(c2dev, vq_req); + if (err) + goto bail0; + + /* + * Process reply + */ + reply=(struct c2wr_ep_listen_destroy_rep *)(unsigned long)vq_req->reply_msg; + if (!reply) { + err = -ENOMEM; + goto bail0; + } + if ((err = c2_errno(reply)) != 0) + goto bail1; + + bail1: + vq_repbuf_free(c2dev, reply); + bail0: + vq_req_free(c2dev, vq_req); + return err; +} + +int c2_llp_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param) +{ + struct c2_dev *c2dev = to_c2dev(cm_id->device); + struct c2_qp *qp; + struct ib_qp *ibqp; + struct c2wr_cr_accept_req *wr; /* variable length WR */ + struct c2_vq_req *vq_req; + struct c2wr_cr_accept_rep *reply; /* VQ Reply msg ptr. */ + int err; + + ibqp = c2_get_qp(cm_id->device, iw_param->qpn); + if (!ibqp) + return -EINVAL; + qp = to_c2qp(ibqp); + + /* Set the RDMA read limits */ + err = c2_qp_set_read_limits(c2dev, qp, iw_param->ord, iw_param->ird); + if (err) + goto bail0; + + /* Allocate verbs request. */ + vq_req = vq_req_alloc(c2dev); + if (!vq_req) { + err = -ENOMEM; + goto bail0; + } + vq_req->qp = qp; + vq_req->cm_id = cm_id; + vq_req->event = IW_CM_EVENT_ESTABLISHED; + + wr = kmalloc(c2dev->req_vq.msg_size, GFP_KERNEL); + if (!wr) { + err = -ENOMEM; + goto bail1; + } + + /* Build the WR */ + c2_wr_set_id(wr, CCWR_CR_ACCEPT); + wr->hdr.context = (unsigned long) vq_req; + wr->rnic_handle = c2dev->adapter_handle; + wr->ep_handle = (u32) (unsigned long) cm_id->provider_data; + wr->qp_handle = qp->adapter_handle; + + /* Replace the cr_handle with the QP after accept */ + cm_id->provider_data = qp; + cm_id->add_ref(cm_id); + qp->cm_id = cm_id; + + cm_id->provider_data = qp; + + /* Validate private_data length */ + if (iw_param->private_data_len > C2_MAX_PRIVATE_DATA_SIZE) { + err = -EINVAL; + goto bail1; + } + + if (iw_param->private_data) { + wr->private_data_length = cpu_to_be32(iw_param->private_data_len); + memcpy(&wr->private_data[0], + iw_param->private_data, iw_param->private_data_len); + } else + wr->private_data_length = 0; + + /* Reference the request struct. Dereferenced in the int handler. */ + vq_req_get(c2dev, vq_req); + + /* Send WR to adapter */ + err = vq_send_wr(c2dev, (union c2wr *) wr); + if (err) { + vq_req_put(c2dev, vq_req); + goto bail1; + } + + /* Wait for reply from adapter */ + err = vq_wait_for_reply(c2dev, vq_req); + if (err) + goto bail1; + + /* Check that reply is present */ + reply = (struct c2wr_cr_accept_rep *) (unsigned long) vq_req->reply_msg; + if (!reply) { + err = -ENOMEM; + goto bail1; + } + + err = c2_errno(reply); + vq_repbuf_free(c2dev, reply); + + if (!err) + c2_set_qp_state(qp, C2_QP_STATE_RTS); + bail1: + kfree(wr); + vq_req_free(c2dev, vq_req); + bail0: + if (err) { + /* + * If we fail, release reference on QP and + * disassociate QP from CM_ID + */ + cm_id->provider_data = NULL; + qp->cm_id = NULL; + cm_id->rem_ref(cm_id); + } + return err; +} + +int c2_llp_reject(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len) +{ + struct c2_dev *c2dev; + struct c2wr_cr_reject_req wr; + struct c2_vq_req *vq_req; + struct c2wr_cr_reject_rep *reply; + int err; + + c2dev = to_c2dev(cm_id->device); + + /* + * Allocate verbs request. + */ + vq_req = vq_req_alloc(c2dev); + if (!vq_req) + return -ENOMEM; + + /* + * Build the WR + */ + c2_wr_set_id(&wr, CCWR_CR_REJECT); + wr.hdr.context = (unsigned long) vq_req; + wr.rnic_handle = c2dev->adapter_handle; + wr.ep_handle = (u32) (unsigned long) cm_id->provider_data; + + /* + * reference the request struct. dereferenced in the int handler. + */ + vq_req_get(c2dev, vq_req); + + /* + * Send WR to adapter + */ + err = vq_send_wr(c2dev, (union c2wr *) & wr); + if (err) { + vq_req_put(c2dev, vq_req); + goto bail0; + } + + /* + * Wait for reply from adapter + */ + err = vq_wait_for_reply(c2dev, vq_req); + if (err) + goto bail0; + + /* + * Process reply + */ + reply = (struct c2wr_cr_reject_rep *) (unsigned long) + vq_req->reply_msg; + if (!reply) { + err = -ENOMEM; + goto bail0; + } + err = c2_errno(reply); + /* + * free vq stuff + */ + vq_repbuf_free(c2dev, reply); + + bail0: + vq_req_free(c2dev, vq_req); + return err; +} diff --git a/drivers/infiniband/hw/amso1100/c2_cq.c b/drivers/infiniband/hw/amso1100/c2_cq.c new file mode 100644 index 0000000..1b63185 --- /dev/null +++ b/drivers/infiniband/hw/amso1100/c2_cq.c @@ -0,0 +1,440 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2005 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies. All rights reserved. + * Copyright (c) 2004 Voltaire, Inc. All rights reserved. + * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include + +#include "c2.h" +#include "c2_vq.h" +#include "c2_status.h" + +#define C2_CQ_MSG_SIZE ((sizeof(struct c2wr_ce) + 32-1) & ~(32-1)) + +static struct c2_cq *c2_cq_get(struct c2_dev *c2dev, int cqn) +{ + struct c2_cq *cq; + unsigned long flags; + + spin_lock_irqsave(&c2dev->lock, flags); + cq = c2dev->qptr_array[cqn]; + if (!cq) { + spin_unlock_irqrestore(&c2dev->lock, flags); + return NULL; + } + atomic_inc(&cq->refcount); + spin_unlock_irqrestore(&c2dev->lock, flags); + return cq; +} + +static void c2_cq_put(struct c2_cq *cq) +{ + if (atomic_dec_and_test(&cq->refcount)) + wake_up(&cq->wait); +} + +void c2_cq_event(struct c2_dev *c2dev, u32 mq_index) +{ + struct c2_cq *cq; + + cq = c2_cq_get(c2dev, mq_index); + if (!cq) { + printk("discarding events on destroyed CQN=%d\n", mq_index); + return; + } + + (*cq->ibcq.comp_handler) (&cq->ibcq, cq->ibcq.cq_context); + c2_cq_put(cq); +} + +void c2_cq_clean(struct c2_dev *c2dev, struct c2_qp *qp, u32 mq_index) +{ + struct c2_cq *cq; + struct c2_mq *q; + + cq = c2_cq_get(c2dev, mq_index); + if (!cq) + return; + + spin_lock_irq(&cq->lock); + q = &cq->mq; + if (q && !c2_mq_empty(q)) { + u16 priv = q->priv; + struct c2wr_ce *msg; + + while (priv != be16_to_cpu(*q->shared)) { + msg = (struct c2wr_ce *) + (q->msg_pool.host + priv * q->msg_size); + if (msg->qp_user_context == (u64) (unsigned long) qp) { + msg->qp_user_context = (u64) 0; + } + priv = (priv + 1) % q->q_size; + } + } + spin_unlock_irq(&cq->lock); + c2_cq_put(cq); +} + +static inline enum ib_wc_status c2_cqe_status_to_openib(u8 status) +{ + switch (status) { + case C2_OK: + return IB_WC_SUCCESS; + case CCERR_FLUSHED: + return IB_WC_WR_FLUSH_ERR; + case CCERR_BASE_AND_BOUNDS_VIOLATION: + return IB_WC_LOC_PROT_ERR; + case CCERR_ACCESS_VIOLATION: + return IB_WC_LOC_ACCESS_ERR; + case CCERR_TOTAL_LENGTH_TOO_BIG: + return IB_WC_LOC_LEN_ERR; + case CCERR_INVALID_WINDOW: + return IB_WC_MW_BIND_ERR; + default: + return IB_WC_GENERAL_ERR; + } +} + + +static inline int c2_poll_one(struct c2_dev *c2dev, + struct c2_cq *cq, struct ib_wc *entry) +{ + struct c2wr_ce *ce; + struct c2_qp *qp; + int is_recv = 0; + + ce = c2_mq_consume(&cq->mq); + if (!ce) { + return -EAGAIN; + } + + /* + * if the qp returned is null then this qp has already + * been freed and we are unable process the completion. + * try pulling the next message + */ + while ((qp = + (struct c2_qp *) (unsigned long) ce->qp_user_context) == NULL) { + c2_mq_free(&cq->mq); + ce = c2_mq_consume(&cq->mq); + if (!ce) + return -EAGAIN; + } + + entry->status = c2_cqe_status_to_openib(c2_wr_get_result(ce)); + entry->wr_id = ce->hdr.context; + entry->qp = &qp->ibqp; + entry->wc_flags = 0; + entry->slid = 0; + entry->sl = 0; + entry->src_qp = 0; + entry->dlid_path_bits = 0; + entry->pkey_index = 0; + + switch (c2_wr_get_id(ce)) { + case C2_WR_TYPE_SEND: + entry->opcode = IB_WC_SEND; + break; + case C2_WR_TYPE_RDMA_WRITE: + entry->opcode = IB_WC_RDMA_WRITE; + break; + case C2_WR_TYPE_RDMA_READ: + entry->opcode = IB_WC_RDMA_READ; + break; + case C2_WR_TYPE_BIND_MW: + entry->opcode = IB_WC_BIND_MW; + break; + case C2_WR_TYPE_RECV: + entry->byte_len = be32_to_cpu(ce->bytes_rcvd); + entry->opcode = IB_WC_RECV; + is_recv = 1; + break; + default: + break; + } + + /* consume the WQEs */ + if (is_recv) + c2_mq_lconsume(&qp->rq_mq, 1); + else + c2_mq_lconsume(&qp->sq_mq, + be32_to_cpu(c2_wr_get_wqe_count(ce)) + 1); + + /* free the message */ + c2_mq_free(&cq->mq); + + return 0; +} + +int c2_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry) +{ + struct c2_dev *c2dev = to_c2dev(ibcq->device); + struct c2_cq *cq = to_c2cq(ibcq); + unsigned long flags; + int npolled, err; + + spin_lock_irqsave(&cq->lock, flags); + + for (npolled = 0; npolled < num_entries; ++npolled) { + + err = c2_poll_one(c2dev, cq, entry + npolled); + if (err) + break; + } + + spin_unlock_irqrestore(&cq->lock, flags); + + return npolled; +} + +int c2_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags notify_flags) +{ + struct c2_mq_shared __iomem *shared; + struct c2_cq *cq; + unsigned long flags; + int ret = 0; + + cq = to_c2cq(ibcq); + shared = cq->mq.peer; + + if ((notify_flags & IB_CQ_SOLICITED_MASK) == IB_CQ_NEXT_COMP) + writeb(C2_CQ_NOTIFICATION_TYPE_NEXT, &shared->notification_type); + else if ((notify_flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED) + writeb(C2_CQ_NOTIFICATION_TYPE_NEXT_SE, &shared->notification_type); + else + return -EINVAL; + + writeb(CQ_WAIT_FOR_DMA | CQ_ARMED, &shared->armed); + + /* + * Now read back shared->armed to make the PCI + * write synchronous. This is necessary for + * correct cq notification semantics. + */ + readb(&shared->armed); + + if (notify_flags & IB_CQ_REPORT_MISSED_EVENTS) { + spin_lock_irqsave(&cq->lock, flags); + ret = !c2_mq_empty(&cq->mq); + spin_unlock_irqrestore(&cq->lock, flags); + } + + return ret; +} + +static void c2_free_cq_buf(struct c2_dev *c2dev, struct c2_mq *mq) +{ + dma_free_coherent(&c2dev->pcidev->dev, mq->q_size * mq->msg_size, + mq->msg_pool.host, dma_unmap_addr(mq, mapping)); +} + +static int c2_alloc_cq_buf(struct c2_dev *c2dev, struct c2_mq *mq, + size_t q_size, size_t msg_size) +{ + u8 *pool_start; + + if (q_size > SIZE_MAX / msg_size) + return -EINVAL; + + pool_start = dma_alloc_coherent(&c2dev->pcidev->dev, q_size * msg_size, + &mq->host_dma, GFP_KERNEL); + if (!pool_start) + return -ENOMEM; + + c2_mq_rep_init(mq, + 0, /* index (currently unknown) */ + q_size, + msg_size, + pool_start, + NULL, /* peer (currently unknown) */ + C2_MQ_HOST_TARGET); + + dma_unmap_addr_set(mq, mapping, mq->host_dma); + + return 0; +} + +int c2_init_cq(struct c2_dev *c2dev, int entries, + struct c2_ucontext *ctx, struct c2_cq *cq) +{ + struct c2wr_cq_create_req wr; + struct c2wr_cq_create_rep *reply; + unsigned long peer_pa; + struct c2_vq_req *vq_req; + int err; + + might_sleep(); + + cq->ibcq.cqe = entries - 1; + cq->is_kernel = !ctx; + + /* Allocate a shared pointer */ + cq->mq.shared = c2_alloc_mqsp(c2dev, c2dev->kern_mqsp_pool, + &cq->mq.shared_dma, GFP_KERNEL); + if (!cq->mq.shared) + return -ENOMEM; + + /* Allocate pages for the message pool */ + err = c2_alloc_cq_buf(c2dev, &cq->mq, entries + 1, C2_CQ_MSG_SIZE); + if (err) + goto bail0; + + vq_req = vq_req_alloc(c2dev); + if (!vq_req) { + err = -ENOMEM; + goto bail1; + } + + memset(&wr, 0, sizeof(wr)); + c2_wr_set_id(&wr, CCWR_CQ_CREATE); + wr.hdr.context = (unsigned long) vq_req; + wr.rnic_handle = c2dev->adapter_handle; + wr.msg_size = cpu_to_be32(cq->mq.msg_size); + wr.depth = cpu_to_be32(cq->mq.q_size); + wr.shared_ht = cpu_to_be64(cq->mq.shared_dma); + wr.msg_pool = cpu_to_be64(cq->mq.host_dma); + wr.user_context = (u64) (unsigned long) (cq); + + vq_req_get(c2dev, vq_req); + + err = vq_send_wr(c2dev, (union c2wr *) & wr); + if (err) { + vq_req_put(c2dev, vq_req); + goto bail2; + } + + err = vq_wait_for_reply(c2dev, vq_req); + if (err) + goto bail2; + + reply = (struct c2wr_cq_create_rep *) (unsigned long) (vq_req->reply_msg); + if (!reply) { + err = -ENOMEM; + goto bail2; + } + + if ((err = c2_errno(reply)) != 0) + goto bail3; + + cq->adapter_handle = reply->cq_handle; + cq->mq.index = be32_to_cpu(reply->mq_index); + + peer_pa = c2dev->pa + be32_to_cpu(reply->adapter_shared); + cq->mq.peer = ioremap_nocache(peer_pa, PAGE_SIZE); + if (!cq->mq.peer) { + err = -ENOMEM; + goto bail3; + } + + vq_repbuf_free(c2dev, reply); + vq_req_free(c2dev, vq_req); + + spin_lock_init(&cq->lock); + atomic_set(&cq->refcount, 1); + init_waitqueue_head(&cq->wait); + + /* + * Use the MQ index allocated by the adapter to + * store the CQ in the qptr_array + */ + cq->cqn = cq->mq.index; + c2dev->qptr_array[cq->cqn] = cq; + + return 0; + + bail3: + vq_repbuf_free(c2dev, reply); + bail2: + vq_req_free(c2dev, vq_req); + bail1: + c2_free_cq_buf(c2dev, &cq->mq); + bail0: + c2_free_mqsp(cq->mq.shared); + + return err; +} + +void c2_free_cq(struct c2_dev *c2dev, struct c2_cq *cq) +{ + int err; + struct c2_vq_req *vq_req; + struct c2wr_cq_destroy_req wr; + struct c2wr_cq_destroy_rep *reply; + + might_sleep(); + + /* Clear CQ from the qptr array */ + spin_lock_irq(&c2dev->lock); + c2dev->qptr_array[cq->mq.index] = NULL; + atomic_dec(&cq->refcount); + spin_unlock_irq(&c2dev->lock); + + wait_event(cq->wait, !atomic_read(&cq->refcount)); + + vq_req = vq_req_alloc(c2dev); + if (!vq_req) { + goto bail0; + } + + memset(&wr, 0, sizeof(wr)); + c2_wr_set_id(&wr, CCWR_CQ_DESTROY); + wr.hdr.context = (unsigned long) vq_req; + wr.rnic_handle = c2dev->adapter_handle; + wr.cq_handle = cq->adapter_handle; + + vq_req_get(c2dev, vq_req); + + err = vq_send_wr(c2dev, (union c2wr *) & wr); + if (err) { + vq_req_put(c2dev, vq_req); + goto bail1; + } + + err = vq_wait_for_reply(c2dev, vq_req); + if (err) + goto bail1; + + reply = (struct c2wr_cq_destroy_rep *) (unsigned long) (vq_req->reply_msg); + if (reply) + vq_repbuf_free(c2dev, reply); + bail1: + vq_req_free(c2dev, vq_req); + bail0: + if (cq->is_kernel) { + c2_free_cq_buf(c2dev, &cq->mq); + } + + return; +} diff --git a/drivers/infiniband/hw/amso1100/c2_intr.c b/drivers/infiniband/hw/amso1100/c2_intr.c new file mode 100644 index 0000000..3a17d9b --- /dev/null +++ b/drivers/infiniband/hw/amso1100/c2_intr.c @@ -0,0 +1,219 @@ +/* + * Copyright (c) 2005 Ammasso, Inc. All rights reserved. + * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "c2.h" +#include +#include "c2_vq.h" + +static void handle_mq(struct c2_dev *c2dev, u32 index); +static void handle_vq(struct c2_dev *c2dev, u32 mq_index); + +/* + * Handle RNIC interrupts + */ +void c2_rnic_interrupt(struct c2_dev *c2dev) +{ + unsigned int mq_index; + + while (c2dev->hints_read != be16_to_cpu(*c2dev->hint_count)) { + mq_index = readl(c2dev->regs + PCI_BAR0_HOST_HINT); + if (mq_index & 0x80000000) { + break; + } + + c2dev->hints_read++; + handle_mq(c2dev, mq_index); + } + +} + +/* + * Top level MQ handler + */ +static void handle_mq(struct c2_dev *c2dev, u32 mq_index) +{ + if (c2dev->qptr_array[mq_index] == NULL) { + pr_debug("handle_mq: stray activity for mq_index=%d\n", + mq_index); + return; + } + + switch (mq_index) { + case (0): + /* + * An index of 0 in the activity queue + * indicates the req vq now has messages + * available... + * + * Wake up any waiters waiting on req VQ + * message availability. + */ + wake_up(&c2dev->req_vq_wo); + break; + case (1): + handle_vq(c2dev, mq_index); + break; + case (2): + /* We have to purge the VQ in case there are pending + * accept reply requests that would result in the + * generation of an ESTABLISHED event. If we don't + * generate these first, a CLOSE event could end up + * being delivered before the ESTABLISHED event. + */ + handle_vq(c2dev, 1); + + c2_ae_event(c2dev, mq_index); + break; + default: + /* There is no event synchronization between CQ events + * and AE or CM events. In fact, CQE could be + * delivered for all of the I/O up to and including the + * FLUSH for a peer disconenct prior to the ESTABLISHED + * event being delivered to the app. The reason for this + * is that CM events are delivered on a thread, while AE + * and CM events are delivered on interrupt context. + */ + c2_cq_event(c2dev, mq_index); + break; + } + + return; +} + +/* + * Handles verbs WR replies. + */ +static void handle_vq(struct c2_dev *c2dev, u32 mq_index) +{ + void *adapter_msg, *reply_msg; + struct c2wr_hdr *host_msg; + struct c2wr_hdr tmp; + struct c2_mq *reply_vq; + struct c2_vq_req *req; + struct iw_cm_event cm_event; + int err; + + reply_vq = (struct c2_mq *) c2dev->qptr_array[mq_index]; + + /* + * get next msg from mq_index into adapter_msg. + * don't free it yet. + */ + adapter_msg = c2_mq_consume(reply_vq); + if (adapter_msg == NULL) { + return; + } + + host_msg = vq_repbuf_alloc(c2dev); + + /* + * If we can't get a host buffer, then we'll still + * wakeup the waiter, we just won't give him the msg. + * It is assumed the waiter will deal with this... + */ + if (!host_msg) { + pr_debug("handle_vq: no repbufs!\n"); + + /* + * just copy the WR header into a local variable. + * this allows us to still demux on the context + */ + host_msg = &tmp; + memcpy(host_msg, adapter_msg, sizeof(tmp)); + reply_msg = NULL; + } else { + memcpy(host_msg, adapter_msg, reply_vq->msg_size); + reply_msg = host_msg; + } + + /* + * consume the msg from the MQ + */ + c2_mq_free(reply_vq); + + /* + * wakeup the waiter. + */ + req = (struct c2_vq_req *) (unsigned long) host_msg->context; + if (req == NULL) { + /* + * We should never get here, as the adapter should + * never send us a reply that we're not expecting. + */ + if (reply_msg != NULL) + vq_repbuf_free(c2dev, host_msg); + pr_debug("handle_vq: UNEXPECTEDLY got NULL req\n"); + return; + } + + if (reply_msg) + err = c2_errno(reply_msg); + else + err = -ENOMEM; + + if (!err) switch (req->event) { + case IW_CM_EVENT_ESTABLISHED: + c2_set_qp_state(req->qp, + C2_QP_STATE_RTS); + /* + * Until ird/ord negotiation via MPAv2 support is added, send + * max supported values + */ + cm_event.ird = cm_event.ord = 128; + case IW_CM_EVENT_CLOSE: + + /* + * Move the QP to RTS if this is + * the established event + */ + cm_event.event = req->event; + cm_event.status = 0; + cm_event.local_addr = req->cm_id->local_addr; + cm_event.remote_addr = req->cm_id->remote_addr; + cm_event.private_data = NULL; + cm_event.private_data_len = 0; + req->cm_id->event_handler(req->cm_id, &cm_event); + break; + default: + break; + } + + req->reply_msg = (u64) (unsigned long) (reply_msg); + atomic_set(&req->reply_ready, 1); + wake_up(&req->wait_object); + + /* + * If the request was cancelled, then this put will + * free the vq_req memory...and reply_msg!!! + */ + vq_req_put(c2dev, req); +} diff --git a/drivers/infiniband/hw/amso1100/c2_mm.c b/drivers/infiniband/hw/amso1100/c2_mm.c new file mode 100644 index 0000000..119c4f3 --- /dev/null +++ b/drivers/infiniband/hw/amso1100/c2_mm.c @@ -0,0 +1,377 @@ +/* + * Copyright (c) 2005 Ammasso, Inc. All rights reserved. + * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include + +#include "c2.h" +#include "c2_vq.h" + +#define PBL_VIRT 1 +#define PBL_PHYS 2 + +/* + * Send all the PBL messages to convey the remainder of the PBL + * Wait for the adapter's reply on the last one. + * This is indicated by setting the MEM_PBL_COMPLETE in the flags. + * + * NOTE: vq_req is _not_ freed by this function. The VQ Host + * Reply buffer _is_ freed by this function. + */ +static int +send_pbl_messages(struct c2_dev *c2dev, __be32 stag_index, + unsigned long va, u32 pbl_depth, + struct c2_vq_req *vq_req, int pbl_type) +{ + u32 pbe_count; /* amt that fits in a PBL msg */ + u32 count; /* amt in this PBL MSG. */ + struct c2wr_nsmr_pbl_req *wr; /* PBL WR ptr */ + struct c2wr_nsmr_pbl_rep *reply; /* reply ptr */ + int err, pbl_virt, pbl_index, i; + + switch (pbl_type) { + case PBL_VIRT: + pbl_virt = 1; + break; + case PBL_PHYS: + pbl_virt = 0; + break; + default: + return -EINVAL; + break; + } + + pbe_count = (c2dev->req_vq.msg_size - + sizeof(struct c2wr_nsmr_pbl_req)) / sizeof(u64); + wr = kmalloc(c2dev->req_vq.msg_size, GFP_KERNEL); + if (!wr) { + return -ENOMEM; + } + c2_wr_set_id(wr, CCWR_NSMR_PBL); + + /* + * Only the last PBL message will generate a reply from the verbs, + * so we set the context to 0 indicating there is no kernel verbs + * handler blocked awaiting this reply. + */ + wr->hdr.context = 0; + wr->rnic_handle = c2dev->adapter_handle; + wr->stag_index = stag_index; /* already swapped */ + wr->flags = 0; + pbl_index = 0; + while (pbl_depth) { + count = min(pbe_count, pbl_depth); + wr->addrs_length = cpu_to_be32(count); + + /* + * If this is the last message, then reference the + * vq request struct cuz we're gonna wait for a reply. + * also make this PBL msg as the last one. + */ + if (count == pbl_depth) { + /* + * reference the request struct. dereferenced in the + * int handler. + */ + vq_req_get(c2dev, vq_req); + wr->flags = cpu_to_be32(MEM_PBL_COMPLETE); + + /* + * This is the last PBL message. + * Set the context to our VQ Request Object so we can + * wait for the reply. + */ + wr->hdr.context = (unsigned long) vq_req; + } + + /* + * If pbl_virt is set then va is a virtual address + * that describes a virtually contiguous memory + * allocation. The wr needs the start of each virtual page + * to be converted to the corresponding physical address + * of the page. If pbl_virt is not set then va is an array + * of physical addresses and there is no conversion to do. + * Just fill in the wr with what is in the array. + */ + for (i = 0; i < count; i++) { + if (pbl_virt) { + va += PAGE_SIZE; + } else { + wr->paddrs[i] = + cpu_to_be64(((u64 *)va)[pbl_index + i]); + } + } + + /* + * Send WR to adapter + */ + err = vq_send_wr(c2dev, (union c2wr *) wr); + if (err) { + if (count <= pbe_count) { + vq_req_put(c2dev, vq_req); + } + goto bail0; + } + pbl_depth -= count; + pbl_index += count; + } + + /* + * Now wait for the reply... + */ + err = vq_wait_for_reply(c2dev, vq_req); + if (err) { + goto bail0; + } + + /* + * Process reply + */ + reply = (struct c2wr_nsmr_pbl_rep *) (unsigned long) vq_req->reply_msg; + if (!reply) { + err = -ENOMEM; + goto bail0; + } + + err = c2_errno(reply); + + vq_repbuf_free(c2dev, reply); + bail0: + kfree(wr); + return err; +} + +#define C2_PBL_MAX_DEPTH 131072 +int +c2_nsmr_register_phys_kern(struct c2_dev *c2dev, u64 *addr_list, + int page_size, int pbl_depth, u32 length, + u32 offset, u64 *va, enum c2_acf acf, + struct c2_mr *mr) +{ + struct c2_vq_req *vq_req; + struct c2wr_nsmr_register_req *wr; + struct c2wr_nsmr_register_rep *reply; + u16 flags; + int i, pbe_count, count; + int err; + + if (!va || !length || !addr_list || !pbl_depth) + return -EINTR; + + /* + * Verify PBL depth is within rnic max + */ + if (pbl_depth > C2_PBL_MAX_DEPTH) { + return -EINTR; + } + + /* + * allocate verbs request object + */ + vq_req = vq_req_alloc(c2dev); + if (!vq_req) + return -ENOMEM; + + wr = kmalloc(c2dev->req_vq.msg_size, GFP_KERNEL); + if (!wr) { + err = -ENOMEM; + goto bail0; + } + + /* + * build the WR + */ + c2_wr_set_id(wr, CCWR_NSMR_REGISTER); + wr->hdr.context = (unsigned long) vq_req; + wr->rnic_handle = c2dev->adapter_handle; + + flags = (acf | MEM_VA_BASED | MEM_REMOTE); + + /* + * compute how many pbes can fit in the message + */ + pbe_count = (c2dev->req_vq.msg_size - + sizeof(struct c2wr_nsmr_register_req)) / sizeof(u64); + + if (pbl_depth <= pbe_count) { + flags |= MEM_PBL_COMPLETE; + } + wr->flags = cpu_to_be16(flags); + wr->stag_key = 0; //stag_key; + wr->va = cpu_to_be64(*va); + wr->pd_id = mr->pd->pd_id; + wr->pbe_size = cpu_to_be32(page_size); + wr->length = cpu_to_be32(length); + wr->pbl_depth = cpu_to_be32(pbl_depth); + wr->fbo = cpu_to_be32(offset); + count = min(pbl_depth, pbe_count); + wr->addrs_length = cpu_to_be32(count); + + /* + * fill out the PBL for this message + */ + for (i = 0; i < count; i++) { + wr->paddrs[i] = cpu_to_be64(addr_list[i]); + } + + /* + * regerence the request struct + */ + vq_req_get(c2dev, vq_req); + + /* + * send the WR to the adapter + */ + err = vq_send_wr(c2dev, (union c2wr *) wr); + if (err) { + vq_req_put(c2dev, vq_req); + goto bail1; + } + + /* + * wait for reply from adapter + */ + err = vq_wait_for_reply(c2dev, vq_req); + if (err) { + goto bail1; + } + + /* + * process reply + */ + reply = + (struct c2wr_nsmr_register_rep *) (unsigned long) (vq_req->reply_msg); + if (!reply) { + err = -ENOMEM; + goto bail1; + } + if ((err = c2_errno(reply))) { + goto bail2; + } + //*p_pb_entries = be32_to_cpu(reply->pbl_depth); + mr->ibmr.lkey = mr->ibmr.rkey = be32_to_cpu(reply->stag_index); + vq_repbuf_free(c2dev, reply); + + /* + * if there are still more PBEs we need to send them to + * the adapter and wait for a reply on the final one. + * reuse vq_req for this purpose. + */ + pbl_depth -= count; + if (pbl_depth) { + + vq_req->reply_msg = (unsigned long) NULL; + atomic_set(&vq_req->reply_ready, 0); + err = send_pbl_messages(c2dev, + cpu_to_be32(mr->ibmr.lkey), + (unsigned long) &addr_list[i], + pbl_depth, vq_req, PBL_PHYS); + if (err) { + goto bail1; + } + } + + vq_req_free(c2dev, vq_req); + kfree(wr); + + return err; + + bail2: + vq_repbuf_free(c2dev, reply); + bail1: + kfree(wr); + bail0: + vq_req_free(c2dev, vq_req); + return err; +} + +int c2_stag_dealloc(struct c2_dev *c2dev, u32 stag_index) +{ + struct c2_vq_req *vq_req; /* verbs request object */ + struct c2wr_stag_dealloc_req wr; /* work request */ + struct c2wr_stag_dealloc_rep *reply; /* WR reply */ + int err; + + + /* + * allocate verbs request object + */ + vq_req = vq_req_alloc(c2dev); + if (!vq_req) { + return -ENOMEM; + } + + /* + * Build the WR + */ + c2_wr_set_id(&wr, CCWR_STAG_DEALLOC); + wr.hdr.context = (u64) (unsigned long) vq_req; + wr.rnic_handle = c2dev->adapter_handle; + wr.stag_index = cpu_to_be32(stag_index); + + /* + * reference the request struct. dereferenced in the int handler. + */ + vq_req_get(c2dev, vq_req); + + /* + * Send WR to adapter + */ + err = vq_send_wr(c2dev, (union c2wr *) & wr); + if (err) { + vq_req_put(c2dev, vq_req); + goto bail0; + } + + /* + * Wait for reply from adapter + */ + err = vq_wait_for_reply(c2dev, vq_req); + if (err) { + goto bail0; + } + + /* + * Process reply + */ + reply = (struct c2wr_stag_dealloc_rep *) (unsigned long) vq_req->reply_msg; + if (!reply) { + err = -ENOMEM; + goto bail0; + } + + err = c2_errno(reply); + + vq_repbuf_free(c2dev, reply); + bail0: + vq_req_free(c2dev, vq_req); + return err; +} diff --git a/drivers/infiniband/hw/amso1100/c2_mq.c b/drivers/infiniband/hw/amso1100/c2_mq.c new file mode 100644 index 0000000..0cddc49 --- /dev/null +++ b/drivers/infiniband/hw/amso1100/c2_mq.c @@ -0,0 +1,174 @@ +/* + * Copyright (c) 2005 Ammasso, Inc. All rights reserved. + * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "c2.h" +#include "c2_mq.h" + +void *c2_mq_alloc(struct c2_mq *q) +{ + BUG_ON(q->magic != C2_MQ_MAGIC); + BUG_ON(q->type != C2_MQ_ADAPTER_TARGET); + + if (c2_mq_full(q)) { + return NULL; + } else { +#ifdef DEBUG + struct c2wr_hdr *m = + (struct c2wr_hdr *) (q->msg_pool.host + q->priv * q->msg_size); +#ifdef CCMSGMAGIC + BUG_ON(m->magic != be32_to_cpu(~CCWR_MAGIC)); + m->magic = cpu_to_be32(CCWR_MAGIC); +#endif + return m; +#else + return q->msg_pool.host + q->priv * q->msg_size; +#endif + } +} + +void c2_mq_produce(struct c2_mq *q) +{ + BUG_ON(q->magic != C2_MQ_MAGIC); + BUG_ON(q->type != C2_MQ_ADAPTER_TARGET); + + if (!c2_mq_full(q)) { + q->priv = (q->priv + 1) % q->q_size; + q->hint_count++; + /* Update peer's offset. */ + __raw_writew((__force u16) cpu_to_be16(q->priv), &q->peer->shared); + } +} + +void *c2_mq_consume(struct c2_mq *q) +{ + BUG_ON(q->magic != C2_MQ_MAGIC); + BUG_ON(q->type != C2_MQ_HOST_TARGET); + + if (c2_mq_empty(q)) { + return NULL; + } else { +#ifdef DEBUG + struct c2wr_hdr *m = (struct c2wr_hdr *) + (q->msg_pool.host + q->priv * q->msg_size); +#ifdef CCMSGMAGIC + BUG_ON(m->magic != be32_to_cpu(CCWR_MAGIC)); +#endif + return m; +#else + return q->msg_pool.host + q->priv * q->msg_size; +#endif + } +} + +void c2_mq_free(struct c2_mq *q) +{ + BUG_ON(q->magic != C2_MQ_MAGIC); + BUG_ON(q->type != C2_MQ_HOST_TARGET); + + if (!c2_mq_empty(q)) { + +#ifdef CCMSGMAGIC + { + struct c2wr_hdr __iomem *m = (struct c2wr_hdr __iomem *) + (q->msg_pool.adapter + q->priv * q->msg_size); + __raw_writel(cpu_to_be32(~CCWR_MAGIC), &m->magic); + } +#endif + q->priv = (q->priv + 1) % q->q_size; + /* Update peer's offset. */ + __raw_writew((__force u16) cpu_to_be16(q->priv), &q->peer->shared); + } +} + + +void c2_mq_lconsume(struct c2_mq *q, u32 wqe_count) +{ + BUG_ON(q->magic != C2_MQ_MAGIC); + BUG_ON(q->type != C2_MQ_ADAPTER_TARGET); + + while (wqe_count--) { + BUG_ON(c2_mq_empty(q)); + *q->shared = cpu_to_be16((be16_to_cpu(*q->shared)+1) % q->q_size); + } +} + +#if 0 +u32 c2_mq_count(struct c2_mq *q) +{ + s32 count; + + if (q->type == C2_MQ_HOST_TARGET) + count = be16_to_cpu(*q->shared) - q->priv; + else + count = q->priv - be16_to_cpu(*q->shared); + + if (count < 0) + count += q->q_size; + + return (u32) count; +} +#endif /* 0 */ + +void c2_mq_req_init(struct c2_mq *q, u32 index, u32 q_size, u32 msg_size, + u8 __iomem *pool_start, u16 __iomem *peer, u32 type) +{ + BUG_ON(!q->shared); + + /* This code assumes the byte swapping has already been done! */ + q->index = index; + q->q_size = q_size; + q->msg_size = msg_size; + q->msg_pool.adapter = pool_start; + q->peer = (struct c2_mq_shared __iomem *) peer; + q->magic = C2_MQ_MAGIC; + q->type = type; + q->priv = 0; + q->hint_count = 0; + return; +} +void c2_mq_rep_init(struct c2_mq *q, u32 index, u32 q_size, u32 msg_size, + u8 *pool_start, u16 __iomem *peer, u32 type) +{ + BUG_ON(!q->shared); + + /* This code assumes the byte swapping has already been done! */ + q->index = index; + q->q_size = q_size; + q->msg_size = msg_size; + q->msg_pool.host = pool_start; + q->peer = (struct c2_mq_shared __iomem *) peer; + q->magic = C2_MQ_MAGIC; + q->type = type; + q->priv = 0; + q->hint_count = 0; + return; +} diff --git a/drivers/infiniband/hw/amso1100/c2_mq.h b/drivers/infiniband/hw/amso1100/c2_mq.h new file mode 100644 index 0000000..fc1b9a7 --- /dev/null +++ b/drivers/infiniband/hw/amso1100/c2_mq.h @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2005 Ammasso, Inc. All rights reserved. + * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _C2_MQ_H_ +#define _C2_MQ_H_ +#include +#include +#include "c2_wr.h" + +enum c2_shared_regs { + + C2_SHARED_ARMED = 0x10, + C2_SHARED_NOTIFY = 0x18, + C2_SHARED_SHARED = 0x40, +}; + +struct c2_mq_shared { + u16 unused1; + u8 armed; + u8 notification_type; + u32 unused2; + u16 shared; + /* Pad to 64 bytes. */ + u8 pad[64 - sizeof(u16) - 2 * sizeof(u8) - sizeof(u32) - sizeof(u16)]; +}; + +enum c2_mq_type { + C2_MQ_HOST_TARGET = 1, + C2_MQ_ADAPTER_TARGET = 2, +}; + +/* + * c2_mq_t is for kernel-mode MQs like the VQs Cand the AEQ. + * c2_user_mq_t (which is the same format) is for user-mode MQs... + */ +#define C2_MQ_MAGIC 0x4d512020 /* 'MQ ' */ +struct c2_mq { + u32 magic; + union { + u8 *host; + u8 __iomem *adapter; + } msg_pool; + dma_addr_t host_dma; + DEFINE_DMA_UNMAP_ADDR(mapping); + u16 hint_count; + u16 priv; + struct c2_mq_shared __iomem *peer; + __be16 *shared; + dma_addr_t shared_dma; + u32 q_size; + u32 msg_size; + u32 index; + enum c2_mq_type type; +}; + +static __inline__ int c2_mq_empty(struct c2_mq *q) +{ + return q->priv == be16_to_cpu(*q->shared); +} + +static __inline__ int c2_mq_full(struct c2_mq *q) +{ + return q->priv == (be16_to_cpu(*q->shared) + q->q_size - 1) % q->q_size; +} + +extern void c2_mq_lconsume(struct c2_mq *q, u32 wqe_count); +extern void *c2_mq_alloc(struct c2_mq *q); +extern void c2_mq_produce(struct c2_mq *q); +extern void *c2_mq_consume(struct c2_mq *q); +extern void c2_mq_free(struct c2_mq *q); +extern void c2_mq_req_init(struct c2_mq *q, u32 index, u32 q_size, u32 msg_size, + u8 __iomem *pool_start, u16 __iomem *peer, u32 type); +extern void c2_mq_rep_init(struct c2_mq *q, u32 index, u32 q_size, u32 msg_size, + u8 *pool_start, u16 __iomem *peer, u32 type); + +#endif /* _C2_MQ_H_ */ diff --git a/drivers/infiniband/hw/amso1100/c2_pd.c b/drivers/infiniband/hw/amso1100/c2_pd.c new file mode 100644 index 0000000..f3e81dc --- /dev/null +++ b/drivers/infiniband/hw/amso1100/c2_pd.c @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2004 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Cisco Systems. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies. All rights reserved. + * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include + +#include "c2.h" +#include "c2_provider.h" + +int c2_pd_alloc(struct c2_dev *c2dev, int privileged, struct c2_pd *pd) +{ + u32 obj; + int ret = 0; + + spin_lock(&c2dev->pd_table.lock); + obj = find_next_zero_bit(c2dev->pd_table.table, c2dev->pd_table.max, + c2dev->pd_table.last); + if (obj >= c2dev->pd_table.max) + obj = find_first_zero_bit(c2dev->pd_table.table, + c2dev->pd_table.max); + if (obj < c2dev->pd_table.max) { + pd->pd_id = obj; + __set_bit(obj, c2dev->pd_table.table); + c2dev->pd_table.last = obj+1; + if (c2dev->pd_table.last >= c2dev->pd_table.max) + c2dev->pd_table.last = 0; + } else + ret = -ENOMEM; + spin_unlock(&c2dev->pd_table.lock); + return ret; +} + +void c2_pd_free(struct c2_dev *c2dev, struct c2_pd *pd) +{ + spin_lock(&c2dev->pd_table.lock); + __clear_bit(pd->pd_id, c2dev->pd_table.table); + spin_unlock(&c2dev->pd_table.lock); +} + +int c2_init_pd_table(struct c2_dev *c2dev) +{ + + c2dev->pd_table.last = 0; + c2dev->pd_table.max = c2dev->props.max_pd; + spin_lock_init(&c2dev->pd_table.lock); + c2dev->pd_table.table = kmalloc(BITS_TO_LONGS(c2dev->props.max_pd) * + sizeof(long), GFP_KERNEL); + if (!c2dev->pd_table.table) + return -ENOMEM; + bitmap_zero(c2dev->pd_table.table, c2dev->props.max_pd); + return 0; +} + +void c2_cleanup_pd_table(struct c2_dev *c2dev) +{ + kfree(c2dev->pd_table.table); +} diff --git a/drivers/infiniband/hw/amso1100/c2_provider.c b/drivers/infiniband/hw/amso1100/c2_provider.c new file mode 100644 index 0000000..2d5cbf4 --- /dev/null +++ b/drivers/infiniband/hw/amso1100/c2_provider.c @@ -0,0 +1,882 @@ +/* + * Copyright (c) 2005 Ammasso, Inc. All rights reserved. + * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include "c2.h" +#include "c2_provider.h" +#include "c2_user.h" + +static int c2_query_device(struct ib_device *ibdev, + struct ib_device_attr *props) +{ + struct c2_dev *c2dev = to_c2dev(ibdev); + + pr_debug("%s:%u\n", __func__, __LINE__); + + *props = c2dev->props; + return 0; +} + +static int c2_query_port(struct ib_device *ibdev, + u8 port, struct ib_port_attr *props) +{ + pr_debug("%s:%u\n", __func__, __LINE__); + + props->max_mtu = IB_MTU_4096; + props->lid = 0; + props->lmc = 0; + props->sm_lid = 0; + props->sm_sl = 0; + props->state = IB_PORT_ACTIVE; + props->phys_state = 0; + props->port_cap_flags = + IB_PORT_CM_SUP | + IB_PORT_REINIT_SUP | + IB_PORT_VENDOR_CLASS_SUP | IB_PORT_BOOT_MGMT_SUP; + props->gid_tbl_len = 1; + props->pkey_tbl_len = 1; + props->qkey_viol_cntr = 0; + props->active_width = 1; + props->active_speed = IB_SPEED_SDR; + + return 0; +} + +static int c2_query_pkey(struct ib_device *ibdev, + u8 port, u16 index, u16 * pkey) +{ + pr_debug("%s:%u\n", __func__, __LINE__); + *pkey = 0; + return 0; +} + +static int c2_query_gid(struct ib_device *ibdev, u8 port, + int index, union ib_gid *gid) +{ + struct c2_dev *c2dev = to_c2dev(ibdev); + + pr_debug("%s:%u\n", __func__, __LINE__); + memset(&(gid->raw[0]), 0, sizeof(gid->raw)); + memcpy(&(gid->raw[0]), c2dev->pseudo_netdev->dev_addr, 6); + + return 0; +} + +/* Allocate the user context data structure. This keeps track + * of all objects associated with a particular user-mode client. + */ +static struct ib_ucontext *c2_alloc_ucontext(struct ib_device *ibdev, + struct ib_udata *udata) +{ + struct c2_ucontext *context; + + pr_debug("%s:%u\n", __func__, __LINE__); + context = kmalloc(sizeof(*context), GFP_KERNEL); + if (!context) + return ERR_PTR(-ENOMEM); + + return &context->ibucontext; +} + +static int c2_dealloc_ucontext(struct ib_ucontext *context) +{ + pr_debug("%s:%u\n", __func__, __LINE__); + kfree(context); + return 0; +} + +static int c2_mmap_uar(struct ib_ucontext *context, struct vm_area_struct *vma) +{ + pr_debug("%s:%u\n", __func__, __LINE__); + return -ENOSYS; +} + +static struct ib_pd *c2_alloc_pd(struct ib_device *ibdev, + struct ib_ucontext *context, + struct ib_udata *udata) +{ + struct c2_pd *pd; + int err; + + pr_debug("%s:%u\n", __func__, __LINE__); + + pd = kmalloc(sizeof(*pd), GFP_KERNEL); + if (!pd) + return ERR_PTR(-ENOMEM); + + err = c2_pd_alloc(to_c2dev(ibdev), !context, pd); + if (err) { + kfree(pd); + return ERR_PTR(err); + } + + if (context) { + if (ib_copy_to_udata(udata, &pd->pd_id, sizeof(__u32))) { + c2_pd_free(to_c2dev(ibdev), pd); + kfree(pd); + return ERR_PTR(-EFAULT); + } + } + + return &pd->ibpd; +} + +static int c2_dealloc_pd(struct ib_pd *pd) +{ + pr_debug("%s:%u\n", __func__, __LINE__); + c2_pd_free(to_c2dev(pd->device), to_c2pd(pd)); + kfree(pd); + + return 0; +} + +static struct ib_ah *c2_ah_create(struct ib_pd *pd, struct ib_ah_attr *ah_attr) +{ + pr_debug("%s:%u\n", __func__, __LINE__); + return ERR_PTR(-ENOSYS); +} + +static int c2_ah_destroy(struct ib_ah *ah) +{ + pr_debug("%s:%u\n", __func__, __LINE__); + return -ENOSYS; +} + +static void c2_add_ref(struct ib_qp *ibqp) +{ + struct c2_qp *qp; + BUG_ON(!ibqp); + qp = to_c2qp(ibqp); + atomic_inc(&qp->refcount); +} + +static void c2_rem_ref(struct ib_qp *ibqp) +{ + struct c2_qp *qp; + BUG_ON(!ibqp); + qp = to_c2qp(ibqp); + if (atomic_dec_and_test(&qp->refcount)) + wake_up(&qp->wait); +} + +struct ib_qp *c2_get_qp(struct ib_device *device, int qpn) +{ + struct c2_dev* c2dev = to_c2dev(device); + struct c2_qp *qp; + + qp = c2_find_qpn(c2dev, qpn); + pr_debug("%s Returning QP=%p for QPN=%d, device=%p, refcount=%d\n", + __func__, qp, qpn, device, + (qp?atomic_read(&qp->refcount):0)); + + return (qp?&qp->ibqp:NULL); +} + +static struct ib_qp *c2_create_qp(struct ib_pd *pd, + struct ib_qp_init_attr *init_attr, + struct ib_udata *udata) +{ + struct c2_qp *qp; + int err; + + pr_debug("%s:%u\n", __func__, __LINE__); + + if (init_attr->create_flags) + return ERR_PTR(-EINVAL); + + switch (init_attr->qp_type) { + case IB_QPT_RC: + qp = kzalloc(sizeof(*qp), GFP_KERNEL); + if (!qp) { + pr_debug("%s: Unable to allocate QP\n", __func__); + return ERR_PTR(-ENOMEM); + } + spin_lock_init(&qp->lock); + if (pd->uobject) { + /* userspace specific */ + } + + err = c2_alloc_qp(to_c2dev(pd->device), + to_c2pd(pd), init_attr, qp); + + if (err && pd->uobject) { + /* userspace specific */ + } + + break; + default: + pr_debug("%s: Invalid QP type: %d\n", __func__, + init_attr->qp_type); + return ERR_PTR(-EINVAL); + } + + if (err) { + kfree(qp); + return ERR_PTR(err); + } + + return &qp->ibqp; +} + +static int c2_destroy_qp(struct ib_qp *ib_qp) +{ + struct c2_qp *qp = to_c2qp(ib_qp); + + pr_debug("%s:%u qp=%p,qp->state=%d\n", + __func__, __LINE__, ib_qp, qp->state); + c2_free_qp(to_c2dev(ib_qp->device), qp); + kfree(qp); + return 0; +} + +static struct ib_cq *c2_create_cq(struct ib_device *ibdev, int entries, int vector, + struct ib_ucontext *context, + struct ib_udata *udata) +{ + struct c2_cq *cq; + int err; + + cq = kmalloc(sizeof(*cq), GFP_KERNEL); + if (!cq) { + pr_debug("%s: Unable to allocate CQ\n", __func__); + return ERR_PTR(-ENOMEM); + } + + err = c2_init_cq(to_c2dev(ibdev), entries, NULL, cq); + if (err) { + pr_debug("%s: error initializing CQ\n", __func__); + kfree(cq); + return ERR_PTR(err); + } + + return &cq->ibcq; +} + +static int c2_destroy_cq(struct ib_cq *ib_cq) +{ + struct c2_cq *cq = to_c2cq(ib_cq); + + pr_debug("%s:%u\n", __func__, __LINE__); + + c2_free_cq(to_c2dev(ib_cq->device), cq); + kfree(cq); + + return 0; +} + +static inline u32 c2_convert_access(int acc) +{ + return (acc & IB_ACCESS_REMOTE_WRITE ? C2_ACF_REMOTE_WRITE : 0) | + (acc & IB_ACCESS_REMOTE_READ ? C2_ACF_REMOTE_READ : 0) | + (acc & IB_ACCESS_LOCAL_WRITE ? C2_ACF_LOCAL_WRITE : 0) | + C2_ACF_LOCAL_READ | C2_ACF_WINDOW_BIND; +} + +static struct ib_mr *c2_reg_phys_mr(struct ib_pd *ib_pd, + struct ib_phys_buf *buffer_list, + int num_phys_buf, int acc, u64 * iova_start) +{ + struct c2_mr *mr; + u64 *page_list; + u32 total_len; + int err, i, j, k, page_shift, pbl_depth; + + pbl_depth = 0; + total_len = 0; + + page_shift = PAGE_SHIFT; + /* + * If there is only 1 buffer we assume this could + * be a map of all phy mem...use a 32k page_shift. + */ + if (num_phys_buf == 1) + page_shift += 3; + + for (i = 0; i < num_phys_buf; i++) { + + if (buffer_list[i].addr & ~PAGE_MASK) { + pr_debug("Unaligned Memory Buffer: 0x%x\n", + (unsigned int) buffer_list[i].addr); + return ERR_PTR(-EINVAL); + } + + if (!buffer_list[i].size) { + pr_debug("Invalid Buffer Size\n"); + return ERR_PTR(-EINVAL); + } + + total_len += buffer_list[i].size; + pbl_depth += ALIGN(buffer_list[i].size, + (1 << page_shift)) >> page_shift; + } + + page_list = vmalloc(sizeof(u64) * pbl_depth); + if (!page_list) { + pr_debug("couldn't vmalloc page_list of size %zd\n", + (sizeof(u64) * pbl_depth)); + return ERR_PTR(-ENOMEM); + } + + for (i = 0, j = 0; i < num_phys_buf; i++) { + + int naddrs; + + naddrs = ALIGN(buffer_list[i].size, + (1 << page_shift)) >> page_shift; + for (k = 0; k < naddrs; k++) + page_list[j++] = (buffer_list[i].addr + + (k << page_shift)); + } + + mr = kmalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) { + vfree(page_list); + return ERR_PTR(-ENOMEM); + } + + mr->pd = to_c2pd(ib_pd); + mr->umem = NULL; + pr_debug("%s - page shift %d, pbl_depth %d, total_len %u, " + "*iova_start %llx, first pa %llx, last pa %llx\n", + __func__, page_shift, pbl_depth, total_len, + (unsigned long long) *iova_start, + (unsigned long long) page_list[0], + (unsigned long long) page_list[pbl_depth-1]); + err = c2_nsmr_register_phys_kern(to_c2dev(ib_pd->device), page_list, + (1 << page_shift), pbl_depth, + total_len, 0, iova_start, + c2_convert_access(acc), mr); + vfree(page_list); + if (err) { + kfree(mr); + return ERR_PTR(err); + } + + return &mr->ibmr; +} + +static struct ib_mr *c2_get_dma_mr(struct ib_pd *pd, int acc) +{ + struct ib_phys_buf bl; + u64 kva = 0; + + pr_debug("%s:%u\n", __func__, __LINE__); + + /* AMSO1100 limit */ + bl.size = 0xffffffff; + bl.addr = 0; + return c2_reg_phys_mr(pd, &bl, 1, acc, &kva); +} + +static struct ib_mr *c2_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, + u64 virt, int acc, struct ib_udata *udata) +{ + u64 *pages; + u64 kva = 0; + int shift, n, len; + int i, k, entry; + int err = 0; + struct scatterlist *sg; + struct c2_pd *c2pd = to_c2pd(pd); + struct c2_mr *c2mr; + + pr_debug("%s:%u\n", __func__, __LINE__); + + c2mr = kmalloc(sizeof(*c2mr), GFP_KERNEL); + if (!c2mr) + return ERR_PTR(-ENOMEM); + c2mr->pd = c2pd; + + c2mr->umem = ib_umem_get(pd->uobject->context, start, length, acc, 0); + if (IS_ERR(c2mr->umem)) { + err = PTR_ERR(c2mr->umem); + kfree(c2mr); + return ERR_PTR(err); + } + + shift = ffs(c2mr->umem->page_size) - 1; + n = c2mr->umem->nmap; + + pages = kmalloc(n * sizeof(u64), GFP_KERNEL); + if (!pages) { + err = -ENOMEM; + goto err; + } + + i = 0; + for_each_sg(c2mr->umem->sg_head.sgl, sg, c2mr->umem->nmap, entry) { + len = sg_dma_len(sg) >> shift; + for (k = 0; k < len; ++k) { + pages[i++] = + sg_dma_address(sg) + + (c2mr->umem->page_size * k); + } + } + + kva = virt; + err = c2_nsmr_register_phys_kern(to_c2dev(pd->device), + pages, + c2mr->umem->page_size, + i, + length, + c2mr->umem->offset, + &kva, + c2_convert_access(acc), + c2mr); + kfree(pages); + if (err) + goto err; + return &c2mr->ibmr; + +err: + ib_umem_release(c2mr->umem); + kfree(c2mr); + return ERR_PTR(err); +} + +static int c2_dereg_mr(struct ib_mr *ib_mr) +{ + struct c2_mr *mr = to_c2mr(ib_mr); + int err; + + pr_debug("%s:%u\n", __func__, __LINE__); + + err = c2_stag_dealloc(to_c2dev(ib_mr->device), ib_mr->lkey); + if (err) + pr_debug("c2_stag_dealloc failed: %d\n", err); + else { + if (mr->umem) + ib_umem_release(mr->umem); + kfree(mr); + } + + return err; +} + +static ssize_t show_rev(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct c2_dev *c2dev = container_of(dev, struct c2_dev, ibdev.dev); + pr_debug("%s:%u\n", __func__, __LINE__); + return sprintf(buf, "%x\n", c2dev->props.hw_ver); +} + +static ssize_t show_fw_ver(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct c2_dev *c2dev = container_of(dev, struct c2_dev, ibdev.dev); + pr_debug("%s:%u\n", __func__, __LINE__); + return sprintf(buf, "%x.%x.%x\n", + (int) (c2dev->props.fw_ver >> 32), + (int) (c2dev->props.fw_ver >> 16) & 0xffff, + (int) (c2dev->props.fw_ver & 0xffff)); +} + +static ssize_t show_hca(struct device *dev, struct device_attribute *attr, + char *buf) +{ + pr_debug("%s:%u\n", __func__, __LINE__); + return sprintf(buf, "AMSO1100\n"); +} + +static ssize_t show_board(struct device *dev, struct device_attribute *attr, + char *buf) +{ + pr_debug("%s:%u\n", __func__, __LINE__); + return sprintf(buf, "%.*s\n", 32, "AMSO1100 Board ID"); +} + +static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL); +static DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL); +static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL); +static DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL); + +static struct device_attribute *c2_dev_attributes[] = { + &dev_attr_hw_rev, + &dev_attr_fw_ver, + &dev_attr_hca_type, + &dev_attr_board_id +}; + +static int c2_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata) +{ + int err; + + err = + c2_qp_modify(to_c2dev(ibqp->device), to_c2qp(ibqp), attr, + attr_mask); + + return err; +} + +static int c2_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) +{ + pr_debug("%s:%u\n", __func__, __LINE__); + return -ENOSYS; +} + +static int c2_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) +{ + pr_debug("%s:%u\n", __func__, __LINE__); + return -ENOSYS; +} + +static int c2_process_mad(struct ib_device *ibdev, + int mad_flags, + u8 port_num, + struct ib_wc *in_wc, + struct ib_grh *in_grh, + struct ib_mad *in_mad, struct ib_mad *out_mad) +{ + pr_debug("%s:%u\n", __func__, __LINE__); + return -ENOSYS; +} + +static int c2_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param) +{ + pr_debug("%s:%u\n", __func__, __LINE__); + + /* Request a connection */ + return c2_llp_connect(cm_id, iw_param); +} + +static int c2_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param) +{ + pr_debug("%s:%u\n", __func__, __LINE__); + + /* Accept the new connection */ + return c2_llp_accept(cm_id, iw_param); +} + +static int c2_reject(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len) +{ + int err; + + pr_debug("%s:%u\n", __func__, __LINE__); + + err = c2_llp_reject(cm_id, pdata, pdata_len); + return err; +} + +static int c2_service_create(struct iw_cm_id *cm_id, int backlog) +{ + int err; + + pr_debug("%s:%u\n", __func__, __LINE__); + err = c2_llp_service_create(cm_id, backlog); + pr_debug("%s:%u err=%d\n", + __func__, __LINE__, + err); + return err; +} + +static int c2_service_destroy(struct iw_cm_id *cm_id) +{ + int err; + pr_debug("%s:%u\n", __func__, __LINE__); + + err = c2_llp_service_destroy(cm_id); + + return err; +} + +static int c2_pseudo_up(struct net_device *netdev) +{ + struct in_device *ind; + struct c2_dev *c2dev = netdev->ml_priv; + + ind = in_dev_get(netdev); + if (!ind) + return 0; + + pr_debug("adding...\n"); + for_ifa(ind) { +#ifdef DEBUG + u8 *ip = (u8 *) & ifa->ifa_address; + + pr_debug("%s: %d.%d.%d.%d\n", + ifa->ifa_label, ip[0], ip[1], ip[2], ip[3]); +#endif + c2_add_addr(c2dev, ifa->ifa_address, ifa->ifa_mask); + } + endfor_ifa(ind); + in_dev_put(ind); + + return 0; +} + +static int c2_pseudo_down(struct net_device *netdev) +{ + struct in_device *ind; + struct c2_dev *c2dev = netdev->ml_priv; + + ind = in_dev_get(netdev); + if (!ind) + return 0; + + pr_debug("deleting...\n"); + for_ifa(ind) { +#ifdef DEBUG + u8 *ip = (u8 *) & ifa->ifa_address; + + pr_debug("%s: %d.%d.%d.%d\n", + ifa->ifa_label, ip[0], ip[1], ip[2], ip[3]); +#endif + c2_del_addr(c2dev, ifa->ifa_address, ifa->ifa_mask); + } + endfor_ifa(ind); + in_dev_put(ind); + + return 0; +} + +static int c2_pseudo_xmit_frame(struct sk_buff *skb, struct net_device *netdev) +{ + kfree_skb(skb); + return NETDEV_TX_OK; +} + +static int c2_pseudo_change_mtu(struct net_device *netdev, int new_mtu) +{ + if (new_mtu < ETH_ZLEN || new_mtu > ETH_JUMBO_MTU) + return -EINVAL; + + netdev->mtu = new_mtu; + + /* TODO: Tell rnic about new rmda interface mtu */ + return 0; +} + +static const struct net_device_ops c2_pseudo_netdev_ops = { + .ndo_open = c2_pseudo_up, + .ndo_stop = c2_pseudo_down, + .ndo_start_xmit = c2_pseudo_xmit_frame, + .ndo_change_mtu = c2_pseudo_change_mtu, + .ndo_validate_addr = eth_validate_addr, +}; + +static void setup(struct net_device *netdev) +{ + netdev->netdev_ops = &c2_pseudo_netdev_ops; + + netdev->watchdog_timeo = 0; + netdev->type = ARPHRD_ETHER; + netdev->mtu = 1500; + netdev->hard_header_len = ETH_HLEN; + netdev->addr_len = ETH_ALEN; + netdev->tx_queue_len = 0; + netdev->flags |= IFF_NOARP; +} + +static struct net_device *c2_pseudo_netdev_init(struct c2_dev *c2dev) +{ + char name[IFNAMSIZ]; + struct net_device *netdev; + + /* change ethxxx to iwxxx */ + strcpy(name, "iw"); + strcat(name, &c2dev->netdev->name[3]); + netdev = alloc_netdev(0, name, NET_NAME_UNKNOWN, setup); + if (!netdev) { + printk(KERN_ERR PFX "%s - etherdev alloc failed", + __func__); + return NULL; + } + + netdev->ml_priv = c2dev; + + SET_NETDEV_DEV(netdev, &c2dev->pcidev->dev); + + memcpy_fromio(netdev->dev_addr, c2dev->kva + C2_REGS_RDMA_ENADDR, 6); + + /* Print out the MAC address */ + pr_debug("%s: MAC %pM\n", netdev->name, netdev->dev_addr); + +#if 0 + /* Disable network packets */ + netif_stop_queue(netdev); +#endif + return netdev; +} + +int c2_register_device(struct c2_dev *dev) +{ + int ret = -ENOMEM; + int i; + + /* Register pseudo network device */ + dev->pseudo_netdev = c2_pseudo_netdev_init(dev); + if (!dev->pseudo_netdev) + goto out; + + ret = register_netdev(dev->pseudo_netdev); + if (ret) + goto out_free_netdev; + + pr_debug("%s:%u\n", __func__, __LINE__); + strlcpy(dev->ibdev.name, "amso%d", IB_DEVICE_NAME_MAX); + dev->ibdev.owner = THIS_MODULE; + dev->ibdev.uverbs_cmd_mask = + (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | + (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) | + (1ull << IB_USER_VERBS_CMD_QUERY_PORT) | + (1ull << IB_USER_VERBS_CMD_ALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_REG_MR) | + (1ull << IB_USER_VERBS_CMD_DEREG_MR) | + (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | + (1ull << IB_USER_VERBS_CMD_CREATE_CQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) | + (1ull << IB_USER_VERBS_CMD_REQ_NOTIFY_CQ) | + (1ull << IB_USER_VERBS_CMD_CREATE_QP) | + (1ull << IB_USER_VERBS_CMD_MODIFY_QP) | + (1ull << IB_USER_VERBS_CMD_POLL_CQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_QP) | + (1ull << IB_USER_VERBS_CMD_POST_SEND) | + (1ull << IB_USER_VERBS_CMD_POST_RECV); + + dev->ibdev.node_type = RDMA_NODE_RNIC; + memset(&dev->ibdev.node_guid, 0, sizeof(dev->ibdev.node_guid)); + memcpy(&dev->ibdev.node_guid, dev->pseudo_netdev->dev_addr, 6); + dev->ibdev.phys_port_cnt = 1; + dev->ibdev.num_comp_vectors = 1; + dev->ibdev.dma_device = &dev->pcidev->dev; + dev->ibdev.query_device = c2_query_device; + dev->ibdev.query_port = c2_query_port; + dev->ibdev.query_pkey = c2_query_pkey; + dev->ibdev.query_gid = c2_query_gid; + dev->ibdev.alloc_ucontext = c2_alloc_ucontext; + dev->ibdev.dealloc_ucontext = c2_dealloc_ucontext; + dev->ibdev.mmap = c2_mmap_uar; + dev->ibdev.alloc_pd = c2_alloc_pd; + dev->ibdev.dealloc_pd = c2_dealloc_pd; + dev->ibdev.create_ah = c2_ah_create; + dev->ibdev.destroy_ah = c2_ah_destroy; + dev->ibdev.create_qp = c2_create_qp; + dev->ibdev.modify_qp = c2_modify_qp; + dev->ibdev.destroy_qp = c2_destroy_qp; + dev->ibdev.create_cq = c2_create_cq; + dev->ibdev.destroy_cq = c2_destroy_cq; + dev->ibdev.poll_cq = c2_poll_cq; + dev->ibdev.get_dma_mr = c2_get_dma_mr; + dev->ibdev.reg_phys_mr = c2_reg_phys_mr; + dev->ibdev.reg_user_mr = c2_reg_user_mr; + dev->ibdev.dereg_mr = c2_dereg_mr; + + dev->ibdev.alloc_fmr = NULL; + dev->ibdev.unmap_fmr = NULL; + dev->ibdev.dealloc_fmr = NULL; + dev->ibdev.map_phys_fmr = NULL; + + dev->ibdev.attach_mcast = c2_multicast_attach; + dev->ibdev.detach_mcast = c2_multicast_detach; + dev->ibdev.process_mad = c2_process_mad; + + dev->ibdev.req_notify_cq = c2_arm_cq; + dev->ibdev.post_send = c2_post_send; + dev->ibdev.post_recv = c2_post_receive; + + dev->ibdev.iwcm = kmalloc(sizeof(*dev->ibdev.iwcm), GFP_KERNEL); + if (dev->ibdev.iwcm == NULL) { + ret = -ENOMEM; + goto out_unregister_netdev; + } + dev->ibdev.iwcm->add_ref = c2_add_ref; + dev->ibdev.iwcm->rem_ref = c2_rem_ref; + dev->ibdev.iwcm->get_qp = c2_get_qp; + dev->ibdev.iwcm->connect = c2_connect; + dev->ibdev.iwcm->accept = c2_accept; + dev->ibdev.iwcm->reject = c2_reject; + dev->ibdev.iwcm->create_listen = c2_service_create; + dev->ibdev.iwcm->destroy_listen = c2_service_destroy; + + ret = ib_register_device(&dev->ibdev, NULL); + if (ret) + goto out_free_iwcm; + + for (i = 0; i < ARRAY_SIZE(c2_dev_attributes); ++i) { + ret = device_create_file(&dev->ibdev.dev, + c2_dev_attributes[i]); + if (ret) + goto out_unregister_ibdev; + } + goto out; + +out_unregister_ibdev: + ib_unregister_device(&dev->ibdev); +out_free_iwcm: + kfree(dev->ibdev.iwcm); +out_unregister_netdev: + unregister_netdev(dev->pseudo_netdev); +out_free_netdev: + free_netdev(dev->pseudo_netdev); +out: + pr_debug("%s:%u ret=%d\n", __func__, __LINE__, ret); + return ret; +} + +void c2_unregister_device(struct c2_dev *dev) +{ + pr_debug("%s:%u\n", __func__, __LINE__); + unregister_netdev(dev->pseudo_netdev); + free_netdev(dev->pseudo_netdev); + ib_unregister_device(&dev->ibdev); +} diff --git a/drivers/infiniband/hw/amso1100/c2_provider.h b/drivers/infiniband/hw/amso1100/c2_provider.h new file mode 100644 index 0000000..bf18998 --- /dev/null +++ b/drivers/infiniband/hw/amso1100/c2_provider.h @@ -0,0 +1,182 @@ +/* + * Copyright (c) 2005 Ammasso, Inc. All rights reserved. + * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef C2_PROVIDER_H +#define C2_PROVIDER_H +#include + +#include +#include + +#include "c2_mq.h" +#include + +#define C2_MPT_FLAG_ATOMIC (1 << 14) +#define C2_MPT_FLAG_REMOTE_WRITE (1 << 13) +#define C2_MPT_FLAG_REMOTE_READ (1 << 12) +#define C2_MPT_FLAG_LOCAL_WRITE (1 << 11) +#define C2_MPT_FLAG_LOCAL_READ (1 << 10) + +struct c2_buf_list { + void *buf; + DEFINE_DMA_UNMAP_ADDR(mapping); +}; + + +/* The user context keeps track of objects allocated for a + * particular user-mode client. */ +struct c2_ucontext { + struct ib_ucontext ibucontext; +}; + +struct c2_mtt; + +/* All objects associated with a PD are kept in the + * associated user context if present. + */ +struct c2_pd { + struct ib_pd ibpd; + u32 pd_id; +}; + +struct c2_mr { + struct ib_mr ibmr; + struct c2_pd *pd; + struct ib_umem *umem; +}; + +struct c2_av; + +enum c2_ah_type { + C2_AH_ON_HCA, + C2_AH_PCI_POOL, + C2_AH_KMALLOC +}; + +struct c2_ah { + struct ib_ah ibah; +}; + +struct c2_cq { + struct ib_cq ibcq; + spinlock_t lock; + atomic_t refcount; + int cqn; + int is_kernel; + wait_queue_head_t wait; + + u32 adapter_handle; + struct c2_mq mq; +}; + +struct c2_wq { + spinlock_t lock; +}; +struct iw_cm_id; +struct c2_qp { + struct ib_qp ibqp; + struct iw_cm_id *cm_id; + spinlock_t lock; + atomic_t refcount; + wait_queue_head_t wait; + int qpn; + + u32 adapter_handle; + u32 send_sgl_depth; + u32 recv_sgl_depth; + u32 rdma_write_sgl_depth; + u8 state; + + struct c2_mq sq_mq; + struct c2_mq rq_mq; +}; + +struct c2_cr_query_attrs { + u32 local_addr; + u32 remote_addr; + u16 local_port; + u16 remote_port; +}; + +static inline struct c2_pd *to_c2pd(struct ib_pd *ibpd) +{ + return container_of(ibpd, struct c2_pd, ibpd); +} + +static inline struct c2_ucontext *to_c2ucontext(struct ib_ucontext *ibucontext) +{ + return container_of(ibucontext, struct c2_ucontext, ibucontext); +} + +static inline struct c2_mr *to_c2mr(struct ib_mr *ibmr) +{ + return container_of(ibmr, struct c2_mr, ibmr); +} + + +static inline struct c2_ah *to_c2ah(struct ib_ah *ibah) +{ + return container_of(ibah, struct c2_ah, ibah); +} + +static inline struct c2_cq *to_c2cq(struct ib_cq *ibcq) +{ + return container_of(ibcq, struct c2_cq, ibcq); +} + +static inline struct c2_qp *to_c2qp(struct ib_qp *ibqp) +{ + return container_of(ibqp, struct c2_qp, ibqp); +} + +static inline int is_rnic_addr(struct net_device *netdev, u32 addr) +{ + struct in_device *ind; + int ret = 0; + + ind = in_dev_get(netdev); + if (!ind) + return 0; + + for_ifa(ind) { + if (ifa->ifa_address == addr) { + ret = 1; + break; + } + } + endfor_ifa(ind); + in_dev_put(ind); + return ret; +} +#endif /* C2_PROVIDER_H */ diff --git a/drivers/infiniband/hw/amso1100/c2_qp.c b/drivers/infiniband/hw/amso1100/c2_qp.c new file mode 100644 index 0000000..86708de --- /dev/null +++ b/drivers/infiniband/hw/amso1100/c2_qp.c @@ -0,0 +1,1024 @@ +/* + * Copyright (c) 2004 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Cisco Systems. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies. All rights reserved. + * Copyright (c) 2004 Voltaire, Inc. All rights reserved. + * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include +#include + +#include "c2.h" +#include "c2_vq.h" +#include "c2_status.h" + +#define C2_MAX_ORD_PER_QP 128 +#define C2_MAX_IRD_PER_QP 128 + +#define C2_HINT_MAKE(q_index, hint_count) (((q_index) << 16) | hint_count) +#define C2_HINT_GET_INDEX(hint) (((hint) & 0x7FFF0000) >> 16) +#define C2_HINT_GET_COUNT(hint) ((hint) & 0x0000FFFF) + +#define NO_SUPPORT -1 +static const u8 c2_opcode[] = { + [IB_WR_SEND] = C2_WR_TYPE_SEND, + [IB_WR_SEND_WITH_IMM] = NO_SUPPORT, + [IB_WR_RDMA_WRITE] = C2_WR_TYPE_RDMA_WRITE, + [IB_WR_RDMA_WRITE_WITH_IMM] = NO_SUPPORT, + [IB_WR_RDMA_READ] = C2_WR_TYPE_RDMA_READ, + [IB_WR_ATOMIC_CMP_AND_SWP] = NO_SUPPORT, + [IB_WR_ATOMIC_FETCH_AND_ADD] = NO_SUPPORT, +}; + +static int to_c2_state(enum ib_qp_state ib_state) +{ + switch (ib_state) { + case IB_QPS_RESET: + return C2_QP_STATE_IDLE; + case IB_QPS_RTS: + return C2_QP_STATE_RTS; + case IB_QPS_SQD: + return C2_QP_STATE_CLOSING; + case IB_QPS_SQE: + return C2_QP_STATE_CLOSING; + case IB_QPS_ERR: + return C2_QP_STATE_ERROR; + default: + return -1; + } +} + +static int to_ib_state(enum c2_qp_state c2_state) +{ + switch (c2_state) { + case C2_QP_STATE_IDLE: + return IB_QPS_RESET; + case C2_QP_STATE_CONNECTING: + return IB_QPS_RTR; + case C2_QP_STATE_RTS: + return IB_QPS_RTS; + case C2_QP_STATE_CLOSING: + return IB_QPS_SQD; + case C2_QP_STATE_ERROR: + return IB_QPS_ERR; + case C2_QP_STATE_TERMINATE: + return IB_QPS_SQE; + default: + return -1; + } +} + +static const char *to_ib_state_str(int ib_state) +{ + static const char *state_str[] = { + "IB_QPS_RESET", + "IB_QPS_INIT", + "IB_QPS_RTR", + "IB_QPS_RTS", + "IB_QPS_SQD", + "IB_QPS_SQE", + "IB_QPS_ERR" + }; + if (ib_state < IB_QPS_RESET || + ib_state > IB_QPS_ERR) + return ""; + + ib_state -= IB_QPS_RESET; + return state_str[ib_state]; +} + +void c2_set_qp_state(struct c2_qp *qp, int c2_state) +{ + int new_state = to_ib_state(c2_state); + + pr_debug("%s: qp[%p] state modify %s --> %s\n", + __func__, + qp, + to_ib_state_str(qp->state), + to_ib_state_str(new_state)); + qp->state = new_state; +} + +#define C2_QP_NO_ATTR_CHANGE 0xFFFFFFFF + +int c2_qp_modify(struct c2_dev *c2dev, struct c2_qp *qp, + struct ib_qp_attr *attr, int attr_mask) +{ + struct c2wr_qp_modify_req wr; + struct c2wr_qp_modify_rep *reply; + struct c2_vq_req *vq_req; + unsigned long flags; + u8 next_state; + int err; + + pr_debug("%s:%d qp=%p, %s --> %s\n", + __func__, __LINE__, + qp, + to_ib_state_str(qp->state), + to_ib_state_str(attr->qp_state)); + + vq_req = vq_req_alloc(c2dev); + if (!vq_req) + return -ENOMEM; + + c2_wr_set_id(&wr, CCWR_QP_MODIFY); + wr.hdr.context = (unsigned long) vq_req; + wr.rnic_handle = c2dev->adapter_handle; + wr.qp_handle = qp->adapter_handle; + wr.ord = cpu_to_be32(C2_QP_NO_ATTR_CHANGE); + wr.ird = cpu_to_be32(C2_QP_NO_ATTR_CHANGE); + wr.sq_depth = cpu_to_be32(C2_QP_NO_ATTR_CHANGE); + wr.rq_depth = cpu_to_be32(C2_QP_NO_ATTR_CHANGE); + + if (attr_mask & IB_QP_STATE) { + /* Ensure the state is valid */ + if (attr->qp_state < 0 || attr->qp_state > IB_QPS_ERR) { + err = -EINVAL; + goto bail0; + } + + wr.next_qp_state = cpu_to_be32(to_c2_state(attr->qp_state)); + + if (attr->qp_state == IB_QPS_ERR) { + spin_lock_irqsave(&qp->lock, flags); + if (qp->cm_id && qp->state == IB_QPS_RTS) { + pr_debug("Generating CLOSE event for QP-->ERR, " + "qp=%p, cm_id=%p\n",qp,qp->cm_id); + /* Generate an CLOSE event */ + vq_req->cm_id = qp->cm_id; + vq_req->event = IW_CM_EVENT_CLOSE; + } + spin_unlock_irqrestore(&qp->lock, flags); + } + next_state = attr->qp_state; + + } else if (attr_mask & IB_QP_CUR_STATE) { + + if (attr->cur_qp_state != IB_QPS_RTR && + attr->cur_qp_state != IB_QPS_RTS && + attr->cur_qp_state != IB_QPS_SQD && + attr->cur_qp_state != IB_QPS_SQE) { + err = -EINVAL; + goto bail0; + } else + wr.next_qp_state = + cpu_to_be32(to_c2_state(attr->cur_qp_state)); + + next_state = attr->cur_qp_state; + + } else { + err = 0; + goto bail0; + } + + /* reference the request struct */ + vq_req_get(c2dev, vq_req); + + err = vq_send_wr(c2dev, (union c2wr *) & wr); + if (err) { + vq_req_put(c2dev, vq_req); + goto bail0; + } + + err = vq_wait_for_reply(c2dev, vq_req); + if (err) + goto bail0; + + reply = (struct c2wr_qp_modify_rep *) (unsigned long) vq_req->reply_msg; + if (!reply) { + err = -ENOMEM; + goto bail0; + } + + err = c2_errno(reply); + if (!err) + qp->state = next_state; +#ifdef DEBUG + else + pr_debug("%s: c2_errno=%d\n", __func__, err); +#endif + /* + * If we're going to error and generating the event here, then + * we need to remove the reference because there will be no + * close event generated by the adapter + */ + spin_lock_irqsave(&qp->lock, flags); + if (vq_req->event==IW_CM_EVENT_CLOSE && qp->cm_id) { + qp->cm_id->rem_ref(qp->cm_id); + qp->cm_id = NULL; + } + spin_unlock_irqrestore(&qp->lock, flags); + + vq_repbuf_free(c2dev, reply); + bail0: + vq_req_free(c2dev, vq_req); + + pr_debug("%s:%d qp=%p, cur_state=%s\n", + __func__, __LINE__, + qp, + to_ib_state_str(qp->state)); + return err; +} + +int c2_qp_set_read_limits(struct c2_dev *c2dev, struct c2_qp *qp, + int ord, int ird) +{ + struct c2wr_qp_modify_req wr; + struct c2wr_qp_modify_rep *reply; + struct c2_vq_req *vq_req; + int err; + + vq_req = vq_req_alloc(c2dev); + if (!vq_req) + return -ENOMEM; + + c2_wr_set_id(&wr, CCWR_QP_MODIFY); + wr.hdr.context = (unsigned long) vq_req; + wr.rnic_handle = c2dev->adapter_handle; + wr.qp_handle = qp->adapter_handle; + wr.ord = cpu_to_be32(ord); + wr.ird = cpu_to_be32(ird); + wr.sq_depth = cpu_to_be32(C2_QP_NO_ATTR_CHANGE); + wr.rq_depth = cpu_to_be32(C2_QP_NO_ATTR_CHANGE); + wr.next_qp_state = cpu_to_be32(C2_QP_NO_ATTR_CHANGE); + + /* reference the request struct */ + vq_req_get(c2dev, vq_req); + + err = vq_send_wr(c2dev, (union c2wr *) & wr); + if (err) { + vq_req_put(c2dev, vq_req); + goto bail0; + } + + err = vq_wait_for_reply(c2dev, vq_req); + if (err) + goto bail0; + + reply = (struct c2wr_qp_modify_rep *) (unsigned long) + vq_req->reply_msg; + if (!reply) { + err = -ENOMEM; + goto bail0; + } + + err = c2_errno(reply); + vq_repbuf_free(c2dev, reply); + bail0: + vq_req_free(c2dev, vq_req); + return err; +} + +static int destroy_qp(struct c2_dev *c2dev, struct c2_qp *qp) +{ + struct c2_vq_req *vq_req; + struct c2wr_qp_destroy_req wr; + struct c2wr_qp_destroy_rep *reply; + unsigned long flags; + int err; + + /* + * Allocate a verb request message + */ + vq_req = vq_req_alloc(c2dev); + if (!vq_req) { + return -ENOMEM; + } + + /* + * Initialize the WR + */ + c2_wr_set_id(&wr, CCWR_QP_DESTROY); + wr.hdr.context = (unsigned long) vq_req; + wr.rnic_handle = c2dev->adapter_handle; + wr.qp_handle = qp->adapter_handle; + + /* + * reference the request struct. dereferenced in the int handler. + */ + vq_req_get(c2dev, vq_req); + + spin_lock_irqsave(&qp->lock, flags); + if (qp->cm_id && qp->state == IB_QPS_RTS) { + pr_debug("destroy_qp: generating CLOSE event for QP-->ERR, " + "qp=%p, cm_id=%p\n",qp,qp->cm_id); + /* Generate an CLOSE event */ + vq_req->qp = qp; + vq_req->cm_id = qp->cm_id; + vq_req->event = IW_CM_EVENT_CLOSE; + } + spin_unlock_irqrestore(&qp->lock, flags); + + /* + * Send WR to adapter + */ + err = vq_send_wr(c2dev, (union c2wr *) & wr); + if (err) { + vq_req_put(c2dev, vq_req); + goto bail0; + } + + /* + * Wait for reply from adapter + */ + err = vq_wait_for_reply(c2dev, vq_req); + if (err) { + goto bail0; + } + + /* + * Process reply + */ + reply = (struct c2wr_qp_destroy_rep *) (unsigned long) (vq_req->reply_msg); + if (!reply) { + err = -ENOMEM; + goto bail0; + } + + spin_lock_irqsave(&qp->lock, flags); + if (qp->cm_id) { + qp->cm_id->rem_ref(qp->cm_id); + qp->cm_id = NULL; + } + spin_unlock_irqrestore(&qp->lock, flags); + + vq_repbuf_free(c2dev, reply); + bail0: + vq_req_free(c2dev, vq_req); + return err; +} + +static int c2_alloc_qpn(struct c2_dev *c2dev, struct c2_qp *qp) +{ + int ret; + + idr_preload(GFP_KERNEL); + spin_lock_irq(&c2dev->qp_table.lock); + + ret = idr_alloc_cyclic(&c2dev->qp_table.idr, qp, 0, 0, GFP_NOWAIT); + if (ret >= 0) + qp->qpn = ret; + + spin_unlock_irq(&c2dev->qp_table.lock); + idr_preload_end(); + return ret < 0 ? ret : 0; +} + +static void c2_free_qpn(struct c2_dev *c2dev, int qpn) +{ + spin_lock_irq(&c2dev->qp_table.lock); + idr_remove(&c2dev->qp_table.idr, qpn); + spin_unlock_irq(&c2dev->qp_table.lock); +} + +struct c2_qp *c2_find_qpn(struct c2_dev *c2dev, int qpn) +{ + unsigned long flags; + struct c2_qp *qp; + + spin_lock_irqsave(&c2dev->qp_table.lock, flags); + qp = idr_find(&c2dev->qp_table.idr, qpn); + spin_unlock_irqrestore(&c2dev->qp_table.lock, flags); + return qp; +} + +int c2_alloc_qp(struct c2_dev *c2dev, + struct c2_pd *pd, + struct ib_qp_init_attr *qp_attrs, struct c2_qp *qp) +{ + struct c2wr_qp_create_req wr; + struct c2wr_qp_create_rep *reply; + struct c2_vq_req *vq_req; + struct c2_cq *send_cq = to_c2cq(qp_attrs->send_cq); + struct c2_cq *recv_cq = to_c2cq(qp_attrs->recv_cq); + unsigned long peer_pa; + u32 q_size, msg_size, mmap_size; + void __iomem *mmap; + int err; + + err = c2_alloc_qpn(c2dev, qp); + if (err) + return err; + qp->ibqp.qp_num = qp->qpn; + qp->ibqp.qp_type = IB_QPT_RC; + + /* Allocate the SQ and RQ shared pointers */ + qp->sq_mq.shared = c2_alloc_mqsp(c2dev, c2dev->kern_mqsp_pool, + &qp->sq_mq.shared_dma, GFP_KERNEL); + if (!qp->sq_mq.shared) { + err = -ENOMEM; + goto bail0; + } + + qp->rq_mq.shared = c2_alloc_mqsp(c2dev, c2dev->kern_mqsp_pool, + &qp->rq_mq.shared_dma, GFP_KERNEL); + if (!qp->rq_mq.shared) { + err = -ENOMEM; + goto bail1; + } + + /* Allocate the verbs request */ + vq_req = vq_req_alloc(c2dev); + if (vq_req == NULL) { + err = -ENOMEM; + goto bail2; + } + + /* Initialize the work request */ + memset(&wr, 0, sizeof(wr)); + c2_wr_set_id(&wr, CCWR_QP_CREATE); + wr.hdr.context = (unsigned long) vq_req; + wr.rnic_handle = c2dev->adapter_handle; + wr.sq_cq_handle = send_cq->adapter_handle; + wr.rq_cq_handle = recv_cq->adapter_handle; + wr.sq_depth = cpu_to_be32(qp_attrs->cap.max_send_wr + 1); + wr.rq_depth = cpu_to_be32(qp_attrs->cap.max_recv_wr + 1); + wr.srq_handle = 0; + wr.flags = cpu_to_be32(QP_RDMA_READ | QP_RDMA_WRITE | QP_MW_BIND | + QP_ZERO_STAG | QP_RDMA_READ_RESPONSE); + wr.send_sgl_depth = cpu_to_be32(qp_attrs->cap.max_send_sge); + wr.recv_sgl_depth = cpu_to_be32(qp_attrs->cap.max_recv_sge); + wr.rdma_write_sgl_depth = cpu_to_be32(qp_attrs->cap.max_send_sge); + wr.shared_sq_ht = cpu_to_be64(qp->sq_mq.shared_dma); + wr.shared_rq_ht = cpu_to_be64(qp->rq_mq.shared_dma); + wr.ord = cpu_to_be32(C2_MAX_ORD_PER_QP); + wr.ird = cpu_to_be32(C2_MAX_IRD_PER_QP); + wr.pd_id = pd->pd_id; + wr.user_context = (unsigned long) qp; + + vq_req_get(c2dev, vq_req); + + /* Send the WR to the adapter */ + err = vq_send_wr(c2dev, (union c2wr *) & wr); + if (err) { + vq_req_put(c2dev, vq_req); + goto bail3; + } + + /* Wait for the verb reply */ + err = vq_wait_for_reply(c2dev, vq_req); + if (err) { + goto bail3; + } + + /* Process the reply */ + reply = (struct c2wr_qp_create_rep *) (unsigned long) (vq_req->reply_msg); + if (!reply) { + err = -ENOMEM; + goto bail3; + } + + if ((err = c2_wr_get_result(reply)) != 0) { + goto bail4; + } + + /* Fill in the kernel QP struct */ + atomic_set(&qp->refcount, 1); + qp->adapter_handle = reply->qp_handle; + qp->state = IB_QPS_RESET; + qp->send_sgl_depth = qp_attrs->cap.max_send_sge; + qp->rdma_write_sgl_depth = qp_attrs->cap.max_send_sge; + qp->recv_sgl_depth = qp_attrs->cap.max_recv_sge; + init_waitqueue_head(&qp->wait); + + /* Initialize the SQ MQ */ + q_size = be32_to_cpu(reply->sq_depth); + msg_size = be32_to_cpu(reply->sq_msg_size); + peer_pa = c2dev->pa + be32_to_cpu(reply->sq_mq_start); + mmap_size = PAGE_ALIGN(sizeof(struct c2_mq_shared) + msg_size * q_size); + mmap = ioremap_nocache(peer_pa, mmap_size); + if (!mmap) { + err = -ENOMEM; + goto bail5; + } + + c2_mq_req_init(&qp->sq_mq, + be32_to_cpu(reply->sq_mq_index), + q_size, + msg_size, + mmap + sizeof(struct c2_mq_shared), /* pool start */ + mmap, /* peer */ + C2_MQ_ADAPTER_TARGET); + + /* Initialize the RQ mq */ + q_size = be32_to_cpu(reply->rq_depth); + msg_size = be32_to_cpu(reply->rq_msg_size); + peer_pa = c2dev->pa + be32_to_cpu(reply->rq_mq_start); + mmap_size = PAGE_ALIGN(sizeof(struct c2_mq_shared) + msg_size * q_size); + mmap = ioremap_nocache(peer_pa, mmap_size); + if (!mmap) { + err = -ENOMEM; + goto bail6; + } + + c2_mq_req_init(&qp->rq_mq, + be32_to_cpu(reply->rq_mq_index), + q_size, + msg_size, + mmap + sizeof(struct c2_mq_shared), /* pool start */ + mmap, /* peer */ + C2_MQ_ADAPTER_TARGET); + + vq_repbuf_free(c2dev, reply); + vq_req_free(c2dev, vq_req); + + return 0; + + bail6: + iounmap(qp->sq_mq.peer); + bail5: + destroy_qp(c2dev, qp); + bail4: + vq_repbuf_free(c2dev, reply); + bail3: + vq_req_free(c2dev, vq_req); + bail2: + c2_free_mqsp(qp->rq_mq.shared); + bail1: + c2_free_mqsp(qp->sq_mq.shared); + bail0: + c2_free_qpn(c2dev, qp->qpn); + return err; +} + +static inline void c2_lock_cqs(struct c2_cq *send_cq, struct c2_cq *recv_cq) +{ + if (send_cq == recv_cq) + spin_lock_irq(&send_cq->lock); + else if (send_cq > recv_cq) { + spin_lock_irq(&send_cq->lock); + spin_lock_nested(&recv_cq->lock, SINGLE_DEPTH_NESTING); + } else { + spin_lock_irq(&recv_cq->lock); + spin_lock_nested(&send_cq->lock, SINGLE_DEPTH_NESTING); + } +} + +static inline void c2_unlock_cqs(struct c2_cq *send_cq, struct c2_cq *recv_cq) +{ + if (send_cq == recv_cq) + spin_unlock_irq(&send_cq->lock); + else if (send_cq > recv_cq) { + spin_unlock(&recv_cq->lock); + spin_unlock_irq(&send_cq->lock); + } else { + spin_unlock(&send_cq->lock); + spin_unlock_irq(&recv_cq->lock); + } +} + +void c2_free_qp(struct c2_dev *c2dev, struct c2_qp *qp) +{ + struct c2_cq *send_cq; + struct c2_cq *recv_cq; + + send_cq = to_c2cq(qp->ibqp.send_cq); + recv_cq = to_c2cq(qp->ibqp.recv_cq); + + /* + * Lock CQs here, so that CQ polling code can do QP lookup + * without taking a lock. + */ + c2_lock_cqs(send_cq, recv_cq); + c2_free_qpn(c2dev, qp->qpn); + c2_unlock_cqs(send_cq, recv_cq); + + /* + * Destroy qp in the rnic... + */ + destroy_qp(c2dev, qp); + + /* + * Mark any unreaped CQEs as null and void. + */ + c2_cq_clean(c2dev, qp, send_cq->cqn); + if (send_cq != recv_cq) + c2_cq_clean(c2dev, qp, recv_cq->cqn); + /* + * Unmap the MQs and return the shared pointers + * to the message pool. + */ + iounmap(qp->sq_mq.peer); + iounmap(qp->rq_mq.peer); + c2_free_mqsp(qp->sq_mq.shared); + c2_free_mqsp(qp->rq_mq.shared); + + atomic_dec(&qp->refcount); + wait_event(qp->wait, !atomic_read(&qp->refcount)); +} + +/* + * Function: move_sgl + * + * Description: + * Move an SGL from the user's work request struct into a CCIL Work Request + * message, swapping to WR byte order and ensure the total length doesn't + * overflow. + * + * IN: + * dst - ptr to CCIL Work Request message SGL memory. + * src - ptr to the consumers SGL memory. + * + * OUT: none + * + * Return: + * CCIL status codes. + */ +static int +move_sgl(struct c2_data_addr * dst, struct ib_sge *src, int count, u32 * p_len, + u8 * actual_count) +{ + u32 tot = 0; /* running total */ + u8 acount = 0; /* running total non-0 len sge's */ + + while (count > 0) { + /* + * If the addition of this SGE causes the + * total SGL length to exceed 2^32-1, then + * fail-n-bail. + * + * If the current total plus the next element length + * wraps, then it will go negative and be less than the + * current total... + */ + if ((tot + src->length) < tot) { + return -EINVAL; + } + /* + * Bug: 1456 (as well as 1498 & 1643) + * Skip over any sge's supplied with len=0 + */ + if (src->length) { + tot += src->length; + dst->stag = cpu_to_be32(src->lkey); + dst->to = cpu_to_be64(src->addr); + dst->length = cpu_to_be32(src->length); + dst++; + acount++; + } + src++; + count--; + } + + if (acount == 0) { + /* + * Bug: 1476 (as well as 1498, 1456 and 1643) + * Setup the SGL in the WR to make it easier for the RNIC. + * This way, the FW doesn't have to deal with special cases. + * Setting length=0 should be sufficient. + */ + dst->stag = 0; + dst->to = 0; + dst->length = 0; + } + + *p_len = tot; + *actual_count = acount; + return 0; +} + +/* + * Function: c2_activity (private function) + * + * Description: + * Post an mq index to the host->adapter activity fifo. + * + * IN: + * c2dev - ptr to c2dev structure + * mq_index - mq index to post + * shared - value most recently written to shared + * + * OUT: + * + * Return: + * none + */ +static inline void c2_activity(struct c2_dev *c2dev, u32 mq_index, u16 shared) +{ + /* + * First read the register to see if the FIFO is full, and if so, + * spin until it's not. This isn't perfect -- there is no + * synchronization among the clients of the register, but in + * practice it prevents multiple CPU from hammering the bus + * with PCI RETRY. Note that when this does happen, the card + * cannot get on the bus and the card and system hang in a + * deadlock -- thus the need for this code. [TOT] + */ + while (readl(c2dev->regs + PCI_BAR0_ADAPTER_HINT) & 0x80000000) + udelay(10); + + __raw_writel(C2_HINT_MAKE(mq_index, shared), + c2dev->regs + PCI_BAR0_ADAPTER_HINT); +} + +/* + * Function: qp_wr_post + * + * Description: + * This in-line function allocates a MQ msg, then moves the host-copy of + * the completed WR into msg. Then it posts the message. + * + * IN: + * q - ptr to user MQ. + * wr - ptr to host-copy of the WR. + * qp - ptr to user qp + * size - Number of bytes to post. Assumed to be divisible by 4. + * + * OUT: none + * + * Return: + * CCIL status codes. + */ +static int qp_wr_post(struct c2_mq *q, union c2wr * wr, struct c2_qp *qp, u32 size) +{ + union c2wr *msg; + + msg = c2_mq_alloc(q); + if (msg == NULL) { + return -EINVAL; + } +#ifdef CCMSGMAGIC + ((c2wr_hdr_t *) wr)->magic = cpu_to_be32(CCWR_MAGIC); +#endif + + /* + * Since all header fields in the WR are the same as the + * CQE, set the following so the adapter need not. + */ + c2_wr_set_result(wr, CCERR_PENDING); + + /* + * Copy the wr down to the adapter + */ + memcpy((void *) msg, (void *) wr, size); + + c2_mq_produce(q); + return 0; +} + + +int c2_post_send(struct ib_qp *ibqp, struct ib_send_wr *ib_wr, + struct ib_send_wr **bad_wr) +{ + struct c2_dev *c2dev = to_c2dev(ibqp->device); + struct c2_qp *qp = to_c2qp(ibqp); + union c2wr wr; + unsigned long lock_flags; + int err = 0; + + u32 flags; + u32 tot_len; + u8 actual_sge_count; + u32 msg_size; + + if (qp->state > IB_QPS_RTS) { + err = -EINVAL; + goto out; + } + + while (ib_wr) { + + flags = 0; + wr.sqwr.sq_hdr.user_hdr.hdr.context = ib_wr->wr_id; + if (ib_wr->send_flags & IB_SEND_SIGNALED) { + flags |= SQ_SIGNALED; + } + + switch (ib_wr->opcode) { + case IB_WR_SEND: + case IB_WR_SEND_WITH_INV: + if (ib_wr->opcode == IB_WR_SEND) { + if (ib_wr->send_flags & IB_SEND_SOLICITED) + c2_wr_set_id(&wr, C2_WR_TYPE_SEND_SE); + else + c2_wr_set_id(&wr, C2_WR_TYPE_SEND); + wr.sqwr.send.remote_stag = 0; + } else { + if (ib_wr->send_flags & IB_SEND_SOLICITED) + c2_wr_set_id(&wr, C2_WR_TYPE_SEND_SE_INV); + else + c2_wr_set_id(&wr, C2_WR_TYPE_SEND_INV); + wr.sqwr.send.remote_stag = + cpu_to_be32(ib_wr->ex.invalidate_rkey); + } + + msg_size = sizeof(struct c2wr_send_req) + + sizeof(struct c2_data_addr) * ib_wr->num_sge; + if (ib_wr->num_sge > qp->send_sgl_depth) { + err = -EINVAL; + break; + } + if (ib_wr->send_flags & IB_SEND_FENCE) { + flags |= SQ_READ_FENCE; + } + err = move_sgl((struct c2_data_addr *) & (wr.sqwr.send.data), + ib_wr->sg_list, + ib_wr->num_sge, + &tot_len, &actual_sge_count); + wr.sqwr.send.sge_len = cpu_to_be32(tot_len); + c2_wr_set_sge_count(&wr, actual_sge_count); + break; + case IB_WR_RDMA_WRITE: + c2_wr_set_id(&wr, C2_WR_TYPE_RDMA_WRITE); + msg_size = sizeof(struct c2wr_rdma_write_req) + + (sizeof(struct c2_data_addr) * ib_wr->num_sge); + if (ib_wr->num_sge > qp->rdma_write_sgl_depth) { + err = -EINVAL; + break; + } + if (ib_wr->send_flags & IB_SEND_FENCE) { + flags |= SQ_READ_FENCE; + } + wr.sqwr.rdma_write.remote_stag = + cpu_to_be32(ib_wr->wr.rdma.rkey); + wr.sqwr.rdma_write.remote_to = + cpu_to_be64(ib_wr->wr.rdma.remote_addr); + err = move_sgl((struct c2_data_addr *) + & (wr.sqwr.rdma_write.data), + ib_wr->sg_list, + ib_wr->num_sge, + &tot_len, &actual_sge_count); + wr.sqwr.rdma_write.sge_len = cpu_to_be32(tot_len); + c2_wr_set_sge_count(&wr, actual_sge_count); + break; + case IB_WR_RDMA_READ: + c2_wr_set_id(&wr, C2_WR_TYPE_RDMA_READ); + msg_size = sizeof(struct c2wr_rdma_read_req); + + /* IWarp only suppots 1 sge for RDMA reads */ + if (ib_wr->num_sge > 1) { + err = -EINVAL; + break; + } + + /* + * Move the local and remote stag/to/len into the WR. + */ + wr.sqwr.rdma_read.local_stag = + cpu_to_be32(ib_wr->sg_list->lkey); + wr.sqwr.rdma_read.local_to = + cpu_to_be64(ib_wr->sg_list->addr); + wr.sqwr.rdma_read.remote_stag = + cpu_to_be32(ib_wr->wr.rdma.rkey); + wr.sqwr.rdma_read.remote_to = + cpu_to_be64(ib_wr->wr.rdma.remote_addr); + wr.sqwr.rdma_read.length = + cpu_to_be32(ib_wr->sg_list->length); + break; + default: + /* error */ + msg_size = 0; + err = -EINVAL; + break; + } + + /* + * If we had an error on the last wr build, then + * break out. Possible errors include bogus WR + * type, and a bogus SGL length... + */ + if (err) { + break; + } + + /* + * Store flags + */ + c2_wr_set_flags(&wr, flags); + + /* + * Post the puppy! + */ + spin_lock_irqsave(&qp->lock, lock_flags); + err = qp_wr_post(&qp->sq_mq, &wr, qp, msg_size); + if (err) { + spin_unlock_irqrestore(&qp->lock, lock_flags); + break; + } + + /* + * Enqueue mq index to activity FIFO. + */ + c2_activity(c2dev, qp->sq_mq.index, qp->sq_mq.hint_count); + spin_unlock_irqrestore(&qp->lock, lock_flags); + + ib_wr = ib_wr->next; + } + +out: + if (err) + *bad_wr = ib_wr; + return err; +} + +int c2_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *ib_wr, + struct ib_recv_wr **bad_wr) +{ + struct c2_dev *c2dev = to_c2dev(ibqp->device); + struct c2_qp *qp = to_c2qp(ibqp); + union c2wr wr; + unsigned long lock_flags; + int err = 0; + + if (qp->state > IB_QPS_RTS) { + err = -EINVAL; + goto out; + } + + /* + * Try and post each work request + */ + while (ib_wr) { + u32 tot_len; + u8 actual_sge_count; + + if (ib_wr->num_sge > qp->recv_sgl_depth) { + err = -EINVAL; + break; + } + + /* + * Create local host-copy of the WR + */ + wr.rqwr.rq_hdr.user_hdr.hdr.context = ib_wr->wr_id; + c2_wr_set_id(&wr, CCWR_RECV); + c2_wr_set_flags(&wr, 0); + + /* sge_count is limited to eight bits. */ + BUG_ON(ib_wr->num_sge >= 256); + err = move_sgl((struct c2_data_addr *) & (wr.rqwr.data), + ib_wr->sg_list, + ib_wr->num_sge, &tot_len, &actual_sge_count); + c2_wr_set_sge_count(&wr, actual_sge_count); + + /* + * If we had an error on the last wr build, then + * break out. Possible errors include bogus WR + * type, and a bogus SGL length... + */ + if (err) { + break; + } + + spin_lock_irqsave(&qp->lock, lock_flags); + err = qp_wr_post(&qp->rq_mq, &wr, qp, qp->rq_mq.msg_size); + if (err) { + spin_unlock_irqrestore(&qp->lock, lock_flags); + break; + } + + /* + * Enqueue mq index to activity FIFO + */ + c2_activity(c2dev, qp->rq_mq.index, qp->rq_mq.hint_count); + spin_unlock_irqrestore(&qp->lock, lock_flags); + + ib_wr = ib_wr->next; + } + +out: + if (err) + *bad_wr = ib_wr; + return err; +} + +void c2_init_qp_table(struct c2_dev *c2dev) +{ + spin_lock_init(&c2dev->qp_table.lock); + idr_init(&c2dev->qp_table.idr); +} + +void c2_cleanup_qp_table(struct c2_dev *c2dev) +{ + idr_destroy(&c2dev->qp_table.idr); +} diff --git a/drivers/infiniband/hw/amso1100/c2_rnic.c b/drivers/infiniband/hw/amso1100/c2_rnic.c new file mode 100644 index 0000000..d2a6d96 --- /dev/null +++ b/drivers/infiniband/hw/amso1100/c2_rnic.c @@ -0,0 +1,655 @@ +/* + * Copyright (c) 2005 Ammasso, Inc. All rights reserved. + * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include "c2.h" +#include "c2_vq.h" + +/* Device capabilities */ +#define C2_MIN_PAGESIZE 1024 + +#define C2_MAX_MRS 32768 +#define C2_MAX_QPS 16000 +#define C2_MAX_WQE_SZ 256 +#define C2_MAX_QP_WR ((128*1024)/C2_MAX_WQE_SZ) +#define C2_MAX_SGES 4 +#define C2_MAX_SGE_RD 1 +#define C2_MAX_CQS 32768 +#define C2_MAX_CQES 4096 +#define C2_MAX_PDS 16384 + +/* + * Send the adapter INIT message to the amso1100 + */ +static int c2_adapter_init(struct c2_dev *c2dev) +{ + struct c2wr_init_req wr; + int err; + + memset(&wr, 0, sizeof(wr)); + c2_wr_set_id(&wr, CCWR_INIT); + wr.hdr.context = 0; + wr.hint_count = cpu_to_be64(c2dev->hint_count_dma); + wr.q0_host_shared = cpu_to_be64(c2dev->req_vq.shared_dma); + wr.q1_host_shared = cpu_to_be64(c2dev->rep_vq.shared_dma); + wr.q1_host_msg_pool = cpu_to_be64(c2dev->rep_vq.host_dma); + wr.q2_host_shared = cpu_to_be64(c2dev->aeq.shared_dma); + wr.q2_host_msg_pool = cpu_to_be64(c2dev->aeq.host_dma); + + /* Post the init message */ + err = vq_send_wr(c2dev, (union c2wr *) & wr); + + return err; +} + +/* + * Send the adapter TERM message to the amso1100 + */ +static void c2_adapter_term(struct c2_dev *c2dev) +{ + struct c2wr_init_req wr; + + memset(&wr, 0, sizeof(wr)); + c2_wr_set_id(&wr, CCWR_TERM); + wr.hdr.context = 0; + + /* Post the init message */ + vq_send_wr(c2dev, (union c2wr *) & wr); + c2dev->init = 0; + + return; +} + +/* + * Query the adapter + */ +static int c2_rnic_query(struct c2_dev *c2dev, struct ib_device_attr *props) +{ + struct c2_vq_req *vq_req; + struct c2wr_rnic_query_req wr; + struct c2wr_rnic_query_rep *reply; + int err; + + vq_req = vq_req_alloc(c2dev); + if (!vq_req) + return -ENOMEM; + + c2_wr_set_id(&wr, CCWR_RNIC_QUERY); + wr.hdr.context = (unsigned long) vq_req; + wr.rnic_handle = c2dev->adapter_handle; + + vq_req_get(c2dev, vq_req); + + err = vq_send_wr(c2dev, (union c2wr *) &wr); + if (err) { + vq_req_put(c2dev, vq_req); + goto bail1; + } + + err = vq_wait_for_reply(c2dev, vq_req); + if (err) + goto bail1; + + reply = + (struct c2wr_rnic_query_rep *) (unsigned long) (vq_req->reply_msg); + if (!reply) + err = -ENOMEM; + else + err = c2_errno(reply); + if (err) + goto bail2; + + props->fw_ver = + ((u64)be32_to_cpu(reply->fw_ver_major) << 32) | + ((be32_to_cpu(reply->fw_ver_minor) & 0xFFFF) << 16) | + (be32_to_cpu(reply->fw_ver_patch) & 0xFFFF); + memcpy(&props->sys_image_guid, c2dev->netdev->dev_addr, 6); + props->max_mr_size = 0xFFFFFFFF; + props->page_size_cap = ~(C2_MIN_PAGESIZE-1); + props->vendor_id = be32_to_cpu(reply->vendor_id); + props->vendor_part_id = be32_to_cpu(reply->part_number); + props->hw_ver = be32_to_cpu(reply->hw_version); + props->max_qp = be32_to_cpu(reply->max_qps); + props->max_qp_wr = be32_to_cpu(reply->max_qp_depth); + props->device_cap_flags = c2dev->device_cap_flags; + props->max_sge = C2_MAX_SGES; + props->max_sge_rd = C2_MAX_SGE_RD; + props->max_cq = be32_to_cpu(reply->max_cqs); + props->max_cqe = be32_to_cpu(reply->max_cq_depth); + props->max_mr = be32_to_cpu(reply->max_mrs); + props->max_pd = be32_to_cpu(reply->max_pds); + props->max_qp_rd_atom = be32_to_cpu(reply->max_qp_ird); + props->max_ee_rd_atom = 0; + props->max_res_rd_atom = be32_to_cpu(reply->max_global_ird); + props->max_qp_init_rd_atom = be32_to_cpu(reply->max_qp_ord); + props->max_ee_init_rd_atom = 0; + props->atomic_cap = IB_ATOMIC_NONE; + props->max_ee = 0; + props->max_rdd = 0; + props->max_mw = be32_to_cpu(reply->max_mws); + props->max_raw_ipv6_qp = 0; + props->max_raw_ethy_qp = 0; + props->max_mcast_grp = 0; + props->max_mcast_qp_attach = 0; + props->max_total_mcast_qp_attach = 0; + props->max_ah = 0; + props->max_fmr = 0; + props->max_map_per_fmr = 0; + props->max_srq = 0; + props->max_srq_wr = 0; + props->max_srq_sge = 0; + props->max_pkeys = 0; + props->local_ca_ack_delay = 0; + + bail2: + vq_repbuf_free(c2dev, reply); + + bail1: + vq_req_free(c2dev, vq_req); + return err; +} + +/* + * Add an IP address to the RNIC interface + */ +int c2_add_addr(struct c2_dev *c2dev, __be32 inaddr, __be32 inmask) +{ + struct c2_vq_req *vq_req; + struct c2wr_rnic_setconfig_req *wr; + struct c2wr_rnic_setconfig_rep *reply; + struct c2_netaddr netaddr; + int err, len; + + vq_req = vq_req_alloc(c2dev); + if (!vq_req) + return -ENOMEM; + + len = sizeof(struct c2_netaddr); + wr = kmalloc(c2dev->req_vq.msg_size, GFP_KERNEL); + if (!wr) { + err = -ENOMEM; + goto bail0; + } + + c2_wr_set_id(wr, CCWR_RNIC_SETCONFIG); + wr->hdr.context = (unsigned long) vq_req; + wr->rnic_handle = c2dev->adapter_handle; + wr->option = cpu_to_be32(C2_CFG_ADD_ADDR); + + netaddr.ip_addr = inaddr; + netaddr.netmask = inmask; + netaddr.mtu = 0; + + memcpy(wr->data, &netaddr, len); + + vq_req_get(c2dev, vq_req); + + err = vq_send_wr(c2dev, (union c2wr *) wr); + if (err) { + vq_req_put(c2dev, vq_req); + goto bail1; + } + + err = vq_wait_for_reply(c2dev, vq_req); + if (err) + goto bail1; + + reply = + (struct c2wr_rnic_setconfig_rep *) (unsigned long) (vq_req->reply_msg); + if (!reply) { + err = -ENOMEM; + goto bail1; + } + + err = c2_errno(reply); + vq_repbuf_free(c2dev, reply); + + bail1: + kfree(wr); + bail0: + vq_req_free(c2dev, vq_req); + return err; +} + +/* + * Delete an IP address from the RNIC interface + */ +int c2_del_addr(struct c2_dev *c2dev, __be32 inaddr, __be32 inmask) +{ + struct c2_vq_req *vq_req; + struct c2wr_rnic_setconfig_req *wr; + struct c2wr_rnic_setconfig_rep *reply; + struct c2_netaddr netaddr; + int err, len; + + vq_req = vq_req_alloc(c2dev); + if (!vq_req) + return -ENOMEM; + + len = sizeof(struct c2_netaddr); + wr = kmalloc(c2dev->req_vq.msg_size, GFP_KERNEL); + if (!wr) { + err = -ENOMEM; + goto bail0; + } + + c2_wr_set_id(wr, CCWR_RNIC_SETCONFIG); + wr->hdr.context = (unsigned long) vq_req; + wr->rnic_handle = c2dev->adapter_handle; + wr->option = cpu_to_be32(C2_CFG_DEL_ADDR); + + netaddr.ip_addr = inaddr; + netaddr.netmask = inmask; + netaddr.mtu = 0; + + memcpy(wr->data, &netaddr, len); + + vq_req_get(c2dev, vq_req); + + err = vq_send_wr(c2dev, (union c2wr *) wr); + if (err) { + vq_req_put(c2dev, vq_req); + goto bail1; + } + + err = vq_wait_for_reply(c2dev, vq_req); + if (err) + goto bail1; + + reply = + (struct c2wr_rnic_setconfig_rep *) (unsigned long) (vq_req->reply_msg); + if (!reply) { + err = -ENOMEM; + goto bail1; + } + + err = c2_errno(reply); + vq_repbuf_free(c2dev, reply); + + bail1: + kfree(wr); + bail0: + vq_req_free(c2dev, vq_req); + return err; +} + +/* + * Open a single RNIC instance to use with all + * low level openib calls + */ +static int c2_rnic_open(struct c2_dev *c2dev) +{ + struct c2_vq_req *vq_req; + union c2wr wr; + struct c2wr_rnic_open_rep *reply; + int err; + + vq_req = vq_req_alloc(c2dev); + if (vq_req == NULL) { + return -ENOMEM; + } + + memset(&wr, 0, sizeof(wr)); + c2_wr_set_id(&wr, CCWR_RNIC_OPEN); + wr.rnic_open.req.hdr.context = (unsigned long) (vq_req); + wr.rnic_open.req.flags = cpu_to_be16(RNIC_PRIV_MODE); + wr.rnic_open.req.port_num = cpu_to_be16(0); + wr.rnic_open.req.user_context = (unsigned long) c2dev; + + vq_req_get(c2dev, vq_req); + + err = vq_send_wr(c2dev, &wr); + if (err) { + vq_req_put(c2dev, vq_req); + goto bail0; + } + + err = vq_wait_for_reply(c2dev, vq_req); + if (err) { + goto bail0; + } + + reply = (struct c2wr_rnic_open_rep *) (unsigned long) (vq_req->reply_msg); + if (!reply) { + err = -ENOMEM; + goto bail0; + } + + if ((err = c2_errno(reply)) != 0) { + goto bail1; + } + + c2dev->adapter_handle = reply->rnic_handle; + + bail1: + vq_repbuf_free(c2dev, reply); + bail0: + vq_req_free(c2dev, vq_req); + return err; +} + +/* + * Close the RNIC instance + */ +static int c2_rnic_close(struct c2_dev *c2dev) +{ + struct c2_vq_req *vq_req; + union c2wr wr; + struct c2wr_rnic_close_rep *reply; + int err; + + vq_req = vq_req_alloc(c2dev); + if (vq_req == NULL) { + return -ENOMEM; + } + + memset(&wr, 0, sizeof(wr)); + c2_wr_set_id(&wr, CCWR_RNIC_CLOSE); + wr.rnic_close.req.hdr.context = (unsigned long) vq_req; + wr.rnic_close.req.rnic_handle = c2dev->adapter_handle; + + vq_req_get(c2dev, vq_req); + + err = vq_send_wr(c2dev, &wr); + if (err) { + vq_req_put(c2dev, vq_req); + goto bail0; + } + + err = vq_wait_for_reply(c2dev, vq_req); + if (err) { + goto bail0; + } + + reply = (struct c2wr_rnic_close_rep *) (unsigned long) (vq_req->reply_msg); + if (!reply) { + err = -ENOMEM; + goto bail0; + } + + if ((err = c2_errno(reply)) != 0) { + goto bail1; + } + + c2dev->adapter_handle = 0; + + bail1: + vq_repbuf_free(c2dev, reply); + bail0: + vq_req_free(c2dev, vq_req); + return err; +} + +/* + * Called by c2_probe to initialize the RNIC. This principally + * involves initializing the various limits and resource pools that + * comprise the RNIC instance. + */ +int c2_rnic_init(struct c2_dev *c2dev) +{ + int err; + u32 qsize, msgsize; + void *q1_pages; + void *q2_pages; + void __iomem *mmio_regs; + + /* Device capabilities */ + c2dev->device_cap_flags = + (IB_DEVICE_RESIZE_MAX_WR | + IB_DEVICE_CURR_QP_STATE_MOD | + IB_DEVICE_SYS_IMAGE_GUID | + IB_DEVICE_LOCAL_DMA_LKEY | + IB_DEVICE_MEM_WINDOW); + + /* Allocate the qptr_array */ + c2dev->qptr_array = vzalloc(C2_MAX_CQS * sizeof(void *)); + if (!c2dev->qptr_array) { + return -ENOMEM; + } + + /* Initialize the qptr_array */ + c2dev->qptr_array[0] = (void *) &c2dev->req_vq; + c2dev->qptr_array[1] = (void *) &c2dev->rep_vq; + c2dev->qptr_array[2] = (void *) &c2dev->aeq; + + /* Initialize data structures */ + init_waitqueue_head(&c2dev->req_vq_wo); + spin_lock_init(&c2dev->vqlock); + spin_lock_init(&c2dev->lock); + + /* Allocate MQ shared pointer pool for kernel clients. User + * mode client pools are hung off the user context + */ + err = c2_init_mqsp_pool(c2dev, GFP_KERNEL, &c2dev->kern_mqsp_pool); + if (err) { + goto bail0; + } + + /* Allocate shared pointers for Q0, Q1, and Q2 from + * the shared pointer pool. + */ + + c2dev->hint_count = c2_alloc_mqsp(c2dev, c2dev->kern_mqsp_pool, + &c2dev->hint_count_dma, + GFP_KERNEL); + c2dev->req_vq.shared = c2_alloc_mqsp(c2dev, c2dev->kern_mqsp_pool, + &c2dev->req_vq.shared_dma, + GFP_KERNEL); + c2dev->rep_vq.shared = c2_alloc_mqsp(c2dev, c2dev->kern_mqsp_pool, + &c2dev->rep_vq.shared_dma, + GFP_KERNEL); + c2dev->aeq.shared = c2_alloc_mqsp(c2dev, c2dev->kern_mqsp_pool, + &c2dev->aeq.shared_dma, GFP_KERNEL); + if (!c2dev->hint_count || !c2dev->req_vq.shared || + !c2dev->rep_vq.shared || !c2dev->aeq.shared) { + err = -ENOMEM; + goto bail1; + } + + mmio_regs = c2dev->kva; + /* Initialize the Verbs Request Queue */ + c2_mq_req_init(&c2dev->req_vq, 0, + be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_Q0_QSIZE)), + be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_Q0_MSGSIZE)), + mmio_regs + + be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_Q0_POOLSTART)), + mmio_regs + + be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_Q0_SHARED)), + C2_MQ_ADAPTER_TARGET); + + /* Initialize the Verbs Reply Queue */ + qsize = be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_Q1_QSIZE)); + msgsize = be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_Q1_MSGSIZE)); + q1_pages = dma_alloc_coherent(&c2dev->pcidev->dev, qsize * msgsize, + &c2dev->rep_vq.host_dma, GFP_KERNEL); + if (!q1_pages) { + err = -ENOMEM; + goto bail1; + } + dma_unmap_addr_set(&c2dev->rep_vq, mapping, c2dev->rep_vq.host_dma); + pr_debug("%s rep_vq va %p dma %llx\n", __func__, q1_pages, + (unsigned long long) c2dev->rep_vq.host_dma); + c2_mq_rep_init(&c2dev->rep_vq, + 1, + qsize, + msgsize, + q1_pages, + mmio_regs + + be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_Q1_SHARED)), + C2_MQ_HOST_TARGET); + + /* Initialize the Asynchronus Event Queue */ + qsize = be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_Q2_QSIZE)); + msgsize = be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_Q2_MSGSIZE)); + q2_pages = dma_alloc_coherent(&c2dev->pcidev->dev, qsize * msgsize, + &c2dev->aeq.host_dma, GFP_KERNEL); + if (!q2_pages) { + err = -ENOMEM; + goto bail2; + } + dma_unmap_addr_set(&c2dev->aeq, mapping, c2dev->aeq.host_dma); + pr_debug("%s aeq va %p dma %llx\n", __func__, q2_pages, + (unsigned long long) c2dev->aeq.host_dma); + c2_mq_rep_init(&c2dev->aeq, + 2, + qsize, + msgsize, + q2_pages, + mmio_regs + + be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_Q2_SHARED)), + C2_MQ_HOST_TARGET); + + /* Initialize the verbs request allocator */ + err = vq_init(c2dev); + if (err) + goto bail3; + + /* Enable interrupts on the adapter */ + writel(0, c2dev->regs + C2_IDIS); + + /* create the WR init message */ + err = c2_adapter_init(c2dev); + if (err) + goto bail4; + c2dev->init++; + + /* open an adapter instance */ + err = c2_rnic_open(c2dev); + if (err) + goto bail4; + + /* Initialize cached the adapter limits */ + err = c2_rnic_query(c2dev, &c2dev->props); + if (err) + goto bail5; + + /* Initialize the PD pool */ + err = c2_init_pd_table(c2dev); + if (err) + goto bail5; + + /* Initialize the QP pool */ + c2_init_qp_table(c2dev); + return 0; + + bail5: + c2_rnic_close(c2dev); + bail4: + vq_term(c2dev); + bail3: + dma_free_coherent(&c2dev->pcidev->dev, + c2dev->aeq.q_size * c2dev->aeq.msg_size, + q2_pages, dma_unmap_addr(&c2dev->aeq, mapping)); + bail2: + dma_free_coherent(&c2dev->pcidev->dev, + c2dev->rep_vq.q_size * c2dev->rep_vq.msg_size, + q1_pages, dma_unmap_addr(&c2dev->rep_vq, mapping)); + bail1: + c2_free_mqsp_pool(c2dev, c2dev->kern_mqsp_pool); + bail0: + vfree(c2dev->qptr_array); + + return err; +} + +/* + * Called by c2_remove to cleanup the RNIC resources. + */ +void c2_rnic_term(struct c2_dev *c2dev) +{ + + /* Close the open adapter instance */ + c2_rnic_close(c2dev); + + /* Send the TERM message to the adapter */ + c2_adapter_term(c2dev); + + /* Disable interrupts on the adapter */ + writel(1, c2dev->regs + C2_IDIS); + + /* Free the QP pool */ + c2_cleanup_qp_table(c2dev); + + /* Free the PD pool */ + c2_cleanup_pd_table(c2dev); + + /* Free the verbs request allocator */ + vq_term(c2dev); + + /* Free the asynchronus event queue */ + dma_free_coherent(&c2dev->pcidev->dev, + c2dev->aeq.q_size * c2dev->aeq.msg_size, + c2dev->aeq.msg_pool.host, + dma_unmap_addr(&c2dev->aeq, mapping)); + + /* Free the verbs reply queue */ + dma_free_coherent(&c2dev->pcidev->dev, + c2dev->rep_vq.q_size * c2dev->rep_vq.msg_size, + c2dev->rep_vq.msg_pool.host, + dma_unmap_addr(&c2dev->rep_vq, mapping)); + + /* Free the MQ shared pointer pool */ + c2_free_mqsp_pool(c2dev, c2dev->kern_mqsp_pool); + + /* Free the qptr_array */ + vfree(c2dev->qptr_array); + + return; +} diff --git a/drivers/infiniband/hw/amso1100/c2_status.h b/drivers/infiniband/hw/amso1100/c2_status.h new file mode 100644 index 0000000..6ee4aa9 --- /dev/null +++ b/drivers/infiniband/hw/amso1100/c2_status.h @@ -0,0 +1,158 @@ +/* + * Copyright (c) 2005 Ammasso, Inc. All rights reserved. + * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef _C2_STATUS_H_ +#define _C2_STATUS_H_ + +/* + * Verbs Status Codes + */ +enum c2_status { + C2_OK = 0, /* This must be zero */ + CCERR_INSUFFICIENT_RESOURCES = 1, + CCERR_INVALID_MODIFIER = 2, + CCERR_INVALID_MODE = 3, + CCERR_IN_USE = 4, + CCERR_INVALID_RNIC = 5, + CCERR_INTERRUPTED_OPERATION = 6, + CCERR_INVALID_EH = 7, + CCERR_INVALID_CQ = 8, + CCERR_CQ_EMPTY = 9, + CCERR_NOT_IMPLEMENTED = 10, + CCERR_CQ_DEPTH_TOO_SMALL = 11, + CCERR_PD_IN_USE = 12, + CCERR_INVALID_PD = 13, + CCERR_INVALID_SRQ = 14, + CCERR_INVALID_ADDRESS = 15, + CCERR_INVALID_NETMASK = 16, + CCERR_INVALID_QP = 17, + CCERR_INVALID_QP_STATE = 18, + CCERR_TOO_MANY_WRS_POSTED = 19, + CCERR_INVALID_WR_TYPE = 20, + CCERR_INVALID_SGL_LENGTH = 21, + CCERR_INVALID_SQ_DEPTH = 22, + CCERR_INVALID_RQ_DEPTH = 23, + CCERR_INVALID_ORD = 24, + CCERR_INVALID_IRD = 25, + CCERR_QP_ATTR_CANNOT_CHANGE = 26, + CCERR_INVALID_STAG = 27, + CCERR_QP_IN_USE = 28, + CCERR_OUTSTANDING_WRS = 29, + CCERR_STAG_IN_USE = 30, + CCERR_INVALID_STAG_INDEX = 31, + CCERR_INVALID_SGL_FORMAT = 32, + CCERR_ADAPTER_TIMEOUT = 33, + CCERR_INVALID_CQ_DEPTH = 34, + CCERR_INVALID_PRIVATE_DATA_LENGTH = 35, + CCERR_INVALID_EP = 36, + CCERR_MR_IN_USE = CCERR_STAG_IN_USE, + CCERR_FLUSHED = 38, + CCERR_INVALID_WQE = 39, + CCERR_LOCAL_QP_CATASTROPHIC_ERROR = 40, + CCERR_REMOTE_TERMINATION_ERROR = 41, + CCERR_BASE_AND_BOUNDS_VIOLATION = 42, + CCERR_ACCESS_VIOLATION = 43, + CCERR_INVALID_PD_ID = 44, + CCERR_WRAP_ERROR = 45, + CCERR_INV_STAG_ACCESS_ERROR = 46, + CCERR_ZERO_RDMA_READ_RESOURCES = 47, + CCERR_QP_NOT_PRIVILEGED = 48, + CCERR_STAG_STATE_NOT_INVALID = 49, + CCERR_INVALID_PAGE_SIZE = 50, + CCERR_INVALID_BUFFER_SIZE = 51, + CCERR_INVALID_PBE = 52, + CCERR_INVALID_FBO = 53, + CCERR_INVALID_LENGTH = 54, + CCERR_INVALID_ACCESS_RIGHTS = 55, + CCERR_PBL_TOO_BIG = 56, + CCERR_INVALID_VA = 57, + CCERR_INVALID_REGION = 58, + CCERR_INVALID_WINDOW = 59, + CCERR_TOTAL_LENGTH_TOO_BIG = 60, + CCERR_INVALID_QP_ID = 61, + CCERR_ADDR_IN_USE = 62, + CCERR_ADDR_NOT_AVAIL = 63, + CCERR_NET_DOWN = 64, + CCERR_NET_UNREACHABLE = 65, + CCERR_CONN_ABORTED = 66, + CCERR_CONN_RESET = 67, + CCERR_NO_BUFS = 68, + CCERR_CONN_TIMEDOUT = 69, + CCERR_CONN_REFUSED = 70, + CCERR_HOST_UNREACHABLE = 71, + CCERR_INVALID_SEND_SGL_DEPTH = 72, + CCERR_INVALID_RECV_SGL_DEPTH = 73, + CCERR_INVALID_RDMA_WRITE_SGL_DEPTH = 74, + CCERR_INSUFFICIENT_PRIVILEGES = 75, + CCERR_STACK_ERROR = 76, + CCERR_INVALID_VERSION = 77, + CCERR_INVALID_MTU = 78, + CCERR_INVALID_IMAGE = 79, + CCERR_PENDING = 98, /* not an error; user internally by adapter */ + CCERR_DEFER = 99, /* not an error; used internally by adapter */ + CCERR_FAILED_WRITE = 100, + CCERR_FAILED_ERASE = 101, + CCERR_FAILED_VERIFICATION = 102, + CCERR_NOT_FOUND = 103, + +}; + +/* + * CCAE_ACTIVE_CONNECT_RESULTS status result codes. + */ +enum c2_connect_status { + C2_CONN_STATUS_SUCCESS = C2_OK, + C2_CONN_STATUS_NO_MEM = CCERR_INSUFFICIENT_RESOURCES, + C2_CONN_STATUS_TIMEDOUT = CCERR_CONN_TIMEDOUT, + C2_CONN_STATUS_REFUSED = CCERR_CONN_REFUSED, + C2_CONN_STATUS_NETUNREACH = CCERR_NET_UNREACHABLE, + C2_CONN_STATUS_HOSTUNREACH = CCERR_HOST_UNREACHABLE, + C2_CONN_STATUS_INVALID_RNIC = CCERR_INVALID_RNIC, + C2_CONN_STATUS_INVALID_QP = CCERR_INVALID_QP, + C2_CONN_STATUS_INVALID_QP_STATE = CCERR_INVALID_QP_STATE, + C2_CONN_STATUS_REJECTED = CCERR_CONN_RESET, + C2_CONN_STATUS_ADDR_NOT_AVAIL = CCERR_ADDR_NOT_AVAIL, +}; + +/* + * Flash programming status codes. + */ +enum c2_flash_status { + C2_FLASH_STATUS_SUCCESS = 0x0000, + C2_FLASH_STATUS_VERIFY_ERR = 0x0002, + C2_FLASH_STATUS_IMAGE_ERR = 0x0004, + C2_FLASH_STATUS_ECLBS = 0x0400, + C2_FLASH_STATUS_PSLBS = 0x0800, + C2_FLASH_STATUS_VPENS = 0x1000, +}; + +#endif /* _C2_STATUS_H_ */ diff --git a/drivers/infiniband/hw/amso1100/c2_user.h b/drivers/infiniband/hw/amso1100/c2_user.h new file mode 100644 index 0000000..7e9e7ad --- /dev/null +++ b/drivers/infiniband/hw/amso1100/c2_user.h @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Cisco Systems. All rights reserved. + * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef C2_USER_H +#define C2_USER_H + +#include + +/* + * Make sure that all structs defined in this file remain laid out so + * that they pack the same way on 32-bit and 64-bit architectures (to + * avoid incompatibility between 32-bit userspace and 64-bit kernels). + * In particular do not use pointer types -- pass pointers in __u64 + * instead. + */ + +struct c2_alloc_ucontext_resp { + __u32 qp_tab_size; + __u32 uarc_size; +}; + +struct c2_alloc_pd_resp { + __u32 pdn; + __u32 reserved; +}; + +struct c2_create_cq { + __u32 lkey; + __u32 pdn; + __u64 arm_db_page; + __u64 set_db_page; + __u32 arm_db_index; + __u32 set_db_index; +}; + +struct c2_create_cq_resp { + __u32 cqn; + __u32 reserved; +}; + +struct c2_create_qp { + __u32 lkey; + __u32 reserved; + __u64 sq_db_page; + __u64 rq_db_page; + __u32 sq_db_index; + __u32 rq_db_index; +}; + +#endif /* C2_USER_H */ diff --git a/drivers/infiniband/hw/amso1100/c2_vq.c b/drivers/infiniband/hw/amso1100/c2_vq.c new file mode 100644 index 0000000..2ec716f --- /dev/null +++ b/drivers/infiniband/hw/amso1100/c2_vq.c @@ -0,0 +1,260 @@ +/* + * Copyright (c) 2005 Ammasso, Inc. All rights reserved. + * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include +#include + +#include "c2_vq.h" +#include "c2_provider.h" + +/* + * Verbs Request Objects: + * + * VQ Request Objects are allocated by the kernel verbs handlers. + * They contain a wait object, a refcnt, an atomic bool indicating that the + * adapter has replied, and a copy of the verb reply work request. + * A pointer to the VQ Request Object is passed down in the context + * field of the work request message, and reflected back by the adapter + * in the verbs reply message. The function handle_vq() in the interrupt + * path will use this pointer to: + * 1) append a copy of the verbs reply message + * 2) mark that the reply is ready + * 3) wake up the kernel verbs handler blocked awaiting the reply. + * + * + * The kernel verbs handlers do a "get" to put a 2nd reference on the + * VQ Request object. If the kernel verbs handler exits before the adapter + * can respond, this extra reference will keep the VQ Request object around + * until the adapter's reply can be processed. The reason we need this is + * because a pointer to this object is stuffed into the context field of + * the verbs work request message, and reflected back in the reply message. + * It is used in the interrupt handler (handle_vq()) to wake up the appropriate + * kernel verb handler that is blocked awaiting the verb reply. + * So handle_vq() will do a "put" on the object when it's done accessing it. + * NOTE: If we guarantee that the kernel verb handler will never bail before + * getting the reply, then we don't need these refcnts. + * + * + * VQ Request objects are freed by the kernel verbs handlers only + * after the verb has been processed, or when the adapter fails and + * does not reply. + * + * + * Verbs Reply Buffers: + * + * VQ Reply bufs are local host memory copies of a + * outstanding Verb Request reply + * message. The are always allocated by the kernel verbs handlers, and _may_ be + * freed by either the kernel verbs handler -or- the interrupt handler. The + * kernel verbs handler _must_ free the repbuf, then free the vq request object + * in that order. + */ + +int vq_init(struct c2_dev *c2dev) +{ + sprintf(c2dev->vq_cache_name, "c2-vq:dev%c", + (char) ('0' + c2dev->devnum)); + c2dev->host_msg_cache = + kmem_cache_create(c2dev->vq_cache_name, c2dev->rep_vq.msg_size, 0, + SLAB_HWCACHE_ALIGN, NULL); + if (c2dev->host_msg_cache == NULL) { + return -ENOMEM; + } + return 0; +} + +void vq_term(struct c2_dev *c2dev) +{ + kmem_cache_destroy(c2dev->host_msg_cache); +} + +/* vq_req_alloc - allocate a VQ Request Object and initialize it. + * The refcnt is set to 1. + */ +struct c2_vq_req *vq_req_alloc(struct c2_dev *c2dev) +{ + struct c2_vq_req *r; + + r = kmalloc(sizeof(struct c2_vq_req), GFP_KERNEL); + if (r) { + init_waitqueue_head(&r->wait_object); + r->reply_msg = 0; + r->event = 0; + r->cm_id = NULL; + r->qp = NULL; + atomic_set(&r->refcnt, 1); + atomic_set(&r->reply_ready, 0); + } + return r; +} + + +/* vq_req_free - free the VQ Request Object. It is assumed the verbs handler + * has already free the VQ Reply Buffer if it existed. + */ +void vq_req_free(struct c2_dev *c2dev, struct c2_vq_req *r) +{ + r->reply_msg = 0; + if (atomic_dec_and_test(&r->refcnt)) { + kfree(r); + } +} + +/* vq_req_get - reference a VQ Request Object. Done + * only in the kernel verbs handlers. + */ +void vq_req_get(struct c2_dev *c2dev, struct c2_vq_req *r) +{ + atomic_inc(&r->refcnt); +} + + +/* vq_req_put - dereference and potentially free a VQ Request Object. + * + * This is only called by handle_vq() on the + * interrupt when it is done processing + * a verb reply message. If the associated + * kernel verbs handler has already bailed, + * then this put will actually free the VQ + * Request object _and_ the VQ Reply Buffer + * if it exists. + */ +void vq_req_put(struct c2_dev *c2dev, struct c2_vq_req *r) +{ + if (atomic_dec_and_test(&r->refcnt)) { + if (r->reply_msg != 0) + vq_repbuf_free(c2dev, + (void *) (unsigned long) r->reply_msg); + kfree(r); + } +} + + +/* + * vq_repbuf_alloc - allocate a VQ Reply Buffer. + */ +void *vq_repbuf_alloc(struct c2_dev *c2dev) +{ + return kmem_cache_alloc(c2dev->host_msg_cache, GFP_ATOMIC); +} + +/* + * vq_send_wr - post a verbs request message to the Verbs Request Queue. + * If a message is not available in the MQ, then block until one is available. + * NOTE: handle_mq() on the interrupt context will wake up threads blocked here. + * When the adapter drains the Verbs Request Queue, + * it inserts MQ index 0 in to the + * adapter->host activity fifo and interrupts the host. + */ +int vq_send_wr(struct c2_dev *c2dev, union c2wr *wr) +{ + void *msg; + wait_queue_t __wait; + + /* + * grab adapter vq lock + */ + spin_lock(&c2dev->vqlock); + + /* + * allocate msg + */ + msg = c2_mq_alloc(&c2dev->req_vq); + + /* + * If we cannot get a msg, then we'll wait + * When a messages are available, the int handler will wake_up() + * any waiters. + */ + while (msg == NULL) { + pr_debug("%s:%d no available msg in VQ, waiting...\n", + __func__, __LINE__); + init_waitqueue_entry(&__wait, current); + add_wait_queue(&c2dev->req_vq_wo, &__wait); + spin_unlock(&c2dev->vqlock); + for (;;) { + set_current_state(TASK_INTERRUPTIBLE); + if (!c2_mq_full(&c2dev->req_vq)) { + break; + } + if (!signal_pending(current)) { + schedule_timeout(1 * HZ); /* 1 second... */ + continue; + } + set_current_state(TASK_RUNNING); + remove_wait_queue(&c2dev->req_vq_wo, &__wait); + return -EINTR; + } + set_current_state(TASK_RUNNING); + remove_wait_queue(&c2dev->req_vq_wo, &__wait); + spin_lock(&c2dev->vqlock); + msg = c2_mq_alloc(&c2dev->req_vq); + } + + /* + * copy wr into adapter msg + */ + memcpy(msg, wr, c2dev->req_vq.msg_size); + + /* + * post msg + */ + c2_mq_produce(&c2dev->req_vq); + + /* + * release adapter vq lock + */ + spin_unlock(&c2dev->vqlock); + return 0; +} + + +/* + * vq_wait_for_reply - block until the adapter posts a Verb Reply Message. + */ +int vq_wait_for_reply(struct c2_dev *c2dev, struct c2_vq_req *req) +{ + if (!wait_event_timeout(req->wait_object, + atomic_read(&req->reply_ready), + 60*HZ)) + return -ETIMEDOUT; + + return 0; +} + +/* + * vq_repbuf_free - Free a Verbs Reply Buffer. + */ +void vq_repbuf_free(struct c2_dev *c2dev, void *reply) +{ + kmem_cache_free(c2dev->host_msg_cache, reply); +} diff --git a/drivers/infiniband/hw/amso1100/c2_vq.h b/drivers/infiniband/hw/amso1100/c2_vq.h new file mode 100644 index 0000000..3380562 --- /dev/null +++ b/drivers/infiniband/hw/amso1100/c2_vq.h @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2005 Ammasso, Inc. All rights reserved. + * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef _C2_VQ_H_ +#define _C2_VQ_H_ +#include +#include "c2.h" +#include "c2_wr.h" +#include "c2_provider.h" + +struct c2_vq_req { + u64 reply_msg; /* ptr to reply msg */ + wait_queue_head_t wait_object; /* wait object for vq reqs */ + atomic_t reply_ready; /* set when reply is ready */ + atomic_t refcnt; /* used to cancel WRs... */ + int event; + struct iw_cm_id *cm_id; + struct c2_qp *qp; +}; + +extern int vq_init(struct c2_dev *c2dev); +extern void vq_term(struct c2_dev *c2dev); + +extern struct c2_vq_req *vq_req_alloc(struct c2_dev *c2dev); +extern void vq_req_free(struct c2_dev *c2dev, struct c2_vq_req *req); +extern void vq_req_get(struct c2_dev *c2dev, struct c2_vq_req *req); +extern void vq_req_put(struct c2_dev *c2dev, struct c2_vq_req *req); +extern int vq_send_wr(struct c2_dev *c2dev, union c2wr * wr); + +extern void *vq_repbuf_alloc(struct c2_dev *c2dev); +extern void vq_repbuf_free(struct c2_dev *c2dev, void *reply); + +extern int vq_wait_for_reply(struct c2_dev *c2dev, struct c2_vq_req *req); +#endif /* _C2_VQ_H_ */ diff --git a/drivers/infiniband/hw/amso1100/c2_wr.h b/drivers/infiniband/hw/amso1100/c2_wr.h new file mode 100644 index 0000000..8d4b4ca --- /dev/null +++ b/drivers/infiniband/hw/amso1100/c2_wr.h @@ -0,0 +1,1520 @@ +/* + * Copyright (c) 2005 Ammasso, Inc. All rights reserved. + * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef _C2_WR_H_ +#define _C2_WR_H_ + +#ifdef CCDEBUG +#define CCWR_MAGIC 0xb07700b0 +#endif + +#define C2_QP_NO_ATTR_CHANGE 0xFFFFFFFF + +/* Maximum allowed size in bytes of private_data exchange + * on connect. + */ +#define C2_MAX_PRIVATE_DATA_SIZE 200 + +/* + * These types are shared among the adapter, host, and CCIL consumer. + */ +enum c2_cq_notification_type { + C2_CQ_NOTIFICATION_TYPE_NONE = 1, + C2_CQ_NOTIFICATION_TYPE_NEXT, + C2_CQ_NOTIFICATION_TYPE_NEXT_SE +}; + +enum c2_setconfig_cmd { + C2_CFG_ADD_ADDR = 1, + C2_CFG_DEL_ADDR = 2, + C2_CFG_ADD_ROUTE = 3, + C2_CFG_DEL_ROUTE = 4 +}; + +enum c2_getconfig_cmd { + C2_GETCONFIG_ROUTES = 1, + C2_GETCONFIG_ADDRS +}; + +/* + * CCIL Work Request Identifiers + */ +enum c2wr_ids { + CCWR_RNIC_OPEN = 1, + CCWR_RNIC_QUERY, + CCWR_RNIC_SETCONFIG, + CCWR_RNIC_GETCONFIG, + CCWR_RNIC_CLOSE, + CCWR_CQ_CREATE, + CCWR_CQ_QUERY, + CCWR_CQ_MODIFY, + CCWR_CQ_DESTROY, + CCWR_QP_CONNECT, + CCWR_PD_ALLOC, + CCWR_PD_DEALLOC, + CCWR_SRQ_CREATE, + CCWR_SRQ_QUERY, + CCWR_SRQ_MODIFY, + CCWR_SRQ_DESTROY, + CCWR_QP_CREATE, + CCWR_QP_QUERY, + CCWR_QP_MODIFY, + CCWR_QP_DESTROY, + CCWR_NSMR_STAG_ALLOC, + CCWR_NSMR_REGISTER, + CCWR_NSMR_PBL, + CCWR_STAG_DEALLOC, + CCWR_NSMR_REREGISTER, + CCWR_SMR_REGISTER, + CCWR_MR_QUERY, + CCWR_MW_ALLOC, + CCWR_MW_QUERY, + CCWR_EP_CREATE, + CCWR_EP_GETOPT, + CCWR_EP_SETOPT, + CCWR_EP_DESTROY, + CCWR_EP_BIND, + CCWR_EP_CONNECT, + CCWR_EP_LISTEN, + CCWR_EP_SHUTDOWN, + CCWR_EP_LISTEN_CREATE, + CCWR_EP_LISTEN_DESTROY, + CCWR_EP_QUERY, + CCWR_CR_ACCEPT, + CCWR_CR_REJECT, + CCWR_CONSOLE, + CCWR_TERM, + CCWR_FLASH_INIT, + CCWR_FLASH, + CCWR_BUF_ALLOC, + CCWR_BUF_FREE, + CCWR_FLASH_WRITE, + CCWR_INIT, /* WARNING: Don't move this ever again! */ + + + + /* Add new IDs here */ + + + + /* + * WARNING: CCWR_LAST must always be the last verbs id defined! + * All the preceding IDs are fixed, and must not change. + * You can add new IDs, but must not remove or reorder + * any IDs. If you do, YOU will ruin any hope of + * compatibility between versions. + */ + CCWR_LAST, + + /* + * Start over at 1 so that arrays indexed by user wr id's + * begin at 1. This is OK since the verbs and user wr id's + * are always used on disjoint sets of queues. + */ + /* + * The order of the CCWR_SEND_XX verbs must + * match the order of the RDMA_OPs + */ + CCWR_SEND = 1, + CCWR_SEND_INV, + CCWR_SEND_SE, + CCWR_SEND_SE_INV, + CCWR_RDMA_WRITE, + CCWR_RDMA_READ, + CCWR_RDMA_READ_INV, + CCWR_MW_BIND, + CCWR_NSMR_FASTREG, + CCWR_STAG_INVALIDATE, + CCWR_RECV, + CCWR_NOP, + CCWR_UNIMPL, +/* WARNING: This must always be the last user wr id defined! */ +}; +#define RDMA_SEND_OPCODE_FROM_WR_ID(x) (x+2) + +/* + * SQ/RQ Work Request Types + */ +enum c2_wr_type { + C2_WR_TYPE_SEND = CCWR_SEND, + C2_WR_TYPE_SEND_SE = CCWR_SEND_SE, + C2_WR_TYPE_SEND_INV = CCWR_SEND_INV, + C2_WR_TYPE_SEND_SE_INV = CCWR_SEND_SE_INV, + C2_WR_TYPE_RDMA_WRITE = CCWR_RDMA_WRITE, + C2_WR_TYPE_RDMA_READ = CCWR_RDMA_READ, + C2_WR_TYPE_RDMA_READ_INV_STAG = CCWR_RDMA_READ_INV, + C2_WR_TYPE_BIND_MW = CCWR_MW_BIND, + C2_WR_TYPE_FASTREG_NSMR = CCWR_NSMR_FASTREG, + C2_WR_TYPE_INV_STAG = CCWR_STAG_INVALIDATE, + C2_WR_TYPE_RECV = CCWR_RECV, + C2_WR_TYPE_NOP = CCWR_NOP, +}; + +struct c2_netaddr { + __be32 ip_addr; + __be32 netmask; + u32 mtu; +}; + +struct c2_route { + u32 ip_addr; /* 0 indicates the default route */ + u32 netmask; /* netmask associated with dst */ + u32 flags; + union { + u32 ipaddr; /* address of the nexthop interface */ + u8 enaddr[6]; + } nexthop; +}; + +/* + * A Scatter Gather Entry. + */ +struct c2_data_addr { + __be32 stag; + __be32 length; + __be64 to; +}; + +/* + * MR and MW flags used by the consumer, RI, and RNIC. + */ +enum c2_mm_flags { + MEM_REMOTE = 0x0001, /* allow mw binds with remote access. */ + MEM_VA_BASED = 0x0002, /* Not Zero-based */ + MEM_PBL_COMPLETE = 0x0004, /* PBL array is complete in this msg */ + MEM_LOCAL_READ = 0x0008, /* allow local reads */ + MEM_LOCAL_WRITE = 0x0010, /* allow local writes */ + MEM_REMOTE_READ = 0x0020, /* allow remote reads */ + MEM_REMOTE_WRITE = 0x0040, /* allow remote writes */ + MEM_WINDOW_BIND = 0x0080, /* binds allowed */ + MEM_SHARED = 0x0100, /* set if MR is shared */ + MEM_STAG_VALID = 0x0200 /* set if STAG is in valid state */ +}; + +/* + * CCIL API ACF flags defined in terms of the low level mem flags. + * This minimizes translation needed in the user API + */ +enum c2_acf { + C2_ACF_LOCAL_READ = MEM_LOCAL_READ, + C2_ACF_LOCAL_WRITE = MEM_LOCAL_WRITE, + C2_ACF_REMOTE_READ = MEM_REMOTE_READ, + C2_ACF_REMOTE_WRITE = MEM_REMOTE_WRITE, + C2_ACF_WINDOW_BIND = MEM_WINDOW_BIND +}; + +/* + * Image types of objects written to flash + */ +#define C2_FLASH_IMG_BITFILE 1 +#define C2_FLASH_IMG_OPTION_ROM 2 +#define C2_FLASH_IMG_VPD 3 + +/* + * to fix bug 1815 we define the max size allowable of the + * terminate message (per the IETF spec).Refer to the IETF + * protocol specification, section 12.1.6, page 64) + * The message is prefixed by 20 types of DDP info. + * + * Then the message has 6 bytes for the terminate control + * and DDP segment length info plus a DDP header (either + * 14 or 18 byts) plus 28 bytes for the RDMA header. + * Thus the max size in: + * 20 + (6 + 18 + 28) = 72 + */ +#define C2_MAX_TERMINATE_MESSAGE_SIZE (72) + +/* + * Build String Length. It must be the same as C2_BUILD_STR_LEN in ccil_api.h + */ +#define WR_BUILD_STR_LEN 64 + +/* + * WARNING: All of these structs need to align any 64bit types on + * 64 bit boundaries! 64bit types include u64 and u64. + */ + +/* + * Clustercore Work Request Header. Be sensitive to field layout + * and alignment. + */ +struct c2wr_hdr { + /* wqe_count is part of the cqe. It is put here so the + * adapter can write to it while the wr is pending without + * clobbering part of the wr. This word need not be dma'd + * from the host to adapter by libccil, but we copy it anyway + * to make the memcpy to the adapter better aligned. + */ + __be32 wqe_count; + + /* Put these fields next so that later 32- and 64-bit + * quantities are naturally aligned. + */ + u8 id; + u8 result; /* adapter -> host */ + u8 sge_count; /* host -> adapter */ + u8 flags; /* host -> adapter */ + + u64 context; +#ifdef CCMSGMAGIC + u32 magic; + u32 pad; +#endif +} __attribute__((packed)); + +/* + *------------------------ RNIC ------------------------ + */ + +/* + * WR_RNIC_OPEN + */ + +/* + * Flags for the RNIC WRs + */ +enum c2_rnic_flags { + RNIC_IRD_STATIC = 0x0001, + RNIC_ORD_STATIC = 0x0002, + RNIC_QP_STATIC = 0x0004, + RNIC_SRQ_SUPPORTED = 0x0008, + RNIC_PBL_BLOCK_MODE = 0x0010, + RNIC_SRQ_MODEL_ARRIVAL = 0x0020, + RNIC_CQ_OVF_DETECTED = 0x0040, + RNIC_PRIV_MODE = 0x0080 +}; + +struct c2wr_rnic_open_req { + struct c2wr_hdr hdr; + u64 user_context; + __be16 flags; /* See enum c2_rnic_flags */ + __be16 port_num; +} __attribute__((packed)); + +struct c2wr_rnic_open_rep { + struct c2wr_hdr hdr; + u32 rnic_handle; +} __attribute__((packed)); + +union c2wr_rnic_open { + struct c2wr_rnic_open_req req; + struct c2wr_rnic_open_rep rep; +} __attribute__((packed)); + +struct c2wr_rnic_query_req { + struct c2wr_hdr hdr; + u32 rnic_handle; +} __attribute__((packed)); + +/* + * WR_RNIC_QUERY + */ +struct c2wr_rnic_query_rep { + struct c2wr_hdr hdr; + u64 user_context; + __be32 vendor_id; + __be32 part_number; + __be32 hw_version; + __be32 fw_ver_major; + __be32 fw_ver_minor; + __be32 fw_ver_patch; + char fw_ver_build_str[WR_BUILD_STR_LEN]; + __be32 max_qps; + __be32 max_qp_depth; + u32 max_srq_depth; + u32 max_send_sgl_depth; + u32 max_rdma_sgl_depth; + __be32 max_cqs; + __be32 max_cq_depth; + u32 max_cq_event_handlers; + __be32 max_mrs; + u32 max_pbl_depth; + __be32 max_pds; + __be32 max_global_ird; + u32 max_global_ord; + __be32 max_qp_ird; + __be32 max_qp_ord; + u32 flags; + __be32 max_mws; + u32 pbe_range_low; + u32 pbe_range_high; + u32 max_srqs; + u32 page_size; +} __attribute__((packed)); + +union c2wr_rnic_query { + struct c2wr_rnic_query_req req; + struct c2wr_rnic_query_rep rep; +} __attribute__((packed)); + +/* + * WR_RNIC_GETCONFIG + */ + +struct c2wr_rnic_getconfig_req { + struct c2wr_hdr hdr; + u32 rnic_handle; + u32 option; /* see c2_getconfig_cmd_t */ + u64 reply_buf; + u32 reply_buf_len; +} __attribute__((packed)) ; + +struct c2wr_rnic_getconfig_rep { + struct c2wr_hdr hdr; + u32 option; /* see c2_getconfig_cmd_t */ + u32 count_len; /* length of the number of addresses configured */ +} __attribute__((packed)) ; + +union c2wr_rnic_getconfig { + struct c2wr_rnic_getconfig_req req; + struct c2wr_rnic_getconfig_rep rep; +} __attribute__((packed)) ; + +/* + * WR_RNIC_SETCONFIG + */ +struct c2wr_rnic_setconfig_req { + struct c2wr_hdr hdr; + u32 rnic_handle; + __be32 option; /* See c2_setconfig_cmd_t */ + /* variable data and pad. See c2_netaddr and c2_route */ + u8 data[0]; +} __attribute__((packed)) ; + +struct c2wr_rnic_setconfig_rep { + struct c2wr_hdr hdr; +} __attribute__((packed)) ; + +union c2wr_rnic_setconfig { + struct c2wr_rnic_setconfig_req req; + struct c2wr_rnic_setconfig_rep rep; +} __attribute__((packed)) ; + +/* + * WR_RNIC_CLOSE + */ +struct c2wr_rnic_close_req { + struct c2wr_hdr hdr; + u32 rnic_handle; +} __attribute__((packed)) ; + +struct c2wr_rnic_close_rep { + struct c2wr_hdr hdr; +} __attribute__((packed)) ; + +union c2wr_rnic_close { + struct c2wr_rnic_close_req req; + struct c2wr_rnic_close_rep rep; +} __attribute__((packed)) ; + +/* + *------------------------ CQ ------------------------ + */ +struct c2wr_cq_create_req { + struct c2wr_hdr hdr; + __be64 shared_ht; + u64 user_context; + __be64 msg_pool; + u32 rnic_handle; + __be32 msg_size; + __be32 depth; +} __attribute__((packed)) ; + +struct c2wr_cq_create_rep { + struct c2wr_hdr hdr; + __be32 mq_index; + __be32 adapter_shared; + u32 cq_handle; +} __attribute__((packed)) ; + +union c2wr_cq_create { + struct c2wr_cq_create_req req; + struct c2wr_cq_create_rep rep; +} __attribute__((packed)) ; + +struct c2wr_cq_modify_req { + struct c2wr_hdr hdr; + u32 rnic_handle; + u32 cq_handle; + u32 new_depth; + u64 new_msg_pool; +} __attribute__((packed)) ; + +struct c2wr_cq_modify_rep { + struct c2wr_hdr hdr; +} __attribute__((packed)) ; + +union c2wr_cq_modify { + struct c2wr_cq_modify_req req; + struct c2wr_cq_modify_rep rep; +} __attribute__((packed)) ; + +struct c2wr_cq_destroy_req { + struct c2wr_hdr hdr; + u32 rnic_handle; + u32 cq_handle; +} __attribute__((packed)) ; + +struct c2wr_cq_destroy_rep { + struct c2wr_hdr hdr; +} __attribute__((packed)) ; + +union c2wr_cq_destroy { + struct c2wr_cq_destroy_req req; + struct c2wr_cq_destroy_rep rep; +} __attribute__((packed)) ; + +/* + *------------------------ PD ------------------------ + */ +struct c2wr_pd_alloc_req { + struct c2wr_hdr hdr; + u32 rnic_handle; + u32 pd_id; +} __attribute__((packed)) ; + +struct c2wr_pd_alloc_rep { + struct c2wr_hdr hdr; +} __attribute__((packed)) ; + +union c2wr_pd_alloc { + struct c2wr_pd_alloc_req req; + struct c2wr_pd_alloc_rep rep; +} __attribute__((packed)) ; + +struct c2wr_pd_dealloc_req { + struct c2wr_hdr hdr; + u32 rnic_handle; + u32 pd_id; +} __attribute__((packed)) ; + +struct c2wr_pd_dealloc_rep { + struct c2wr_hdr hdr; +} __attribute__((packed)) ; + +union c2wr_pd_dealloc { + struct c2wr_pd_dealloc_req req; + struct c2wr_pd_dealloc_rep rep; +} __attribute__((packed)) ; + +/* + *------------------------ SRQ ------------------------ + */ +struct c2wr_srq_create_req { + struct c2wr_hdr hdr; + u64 shared_ht; + u64 user_context; + u32 rnic_handle; + u32 srq_depth; + u32 srq_limit; + u32 sgl_depth; + u32 pd_id; +} __attribute__((packed)) ; + +struct c2wr_srq_create_rep { + struct c2wr_hdr hdr; + u32 srq_depth; + u32 sgl_depth; + u32 msg_size; + u32 mq_index; + u32 mq_start; + u32 srq_handle; +} __attribute__((packed)) ; + +union c2wr_srq_create { + struct c2wr_srq_create_req req; + struct c2wr_srq_create_rep rep; +} __attribute__((packed)) ; + +struct c2wr_srq_destroy_req { + struct c2wr_hdr hdr; + u32 rnic_handle; + u32 srq_handle; +} __attribute__((packed)) ; + +struct c2wr_srq_destroy_rep { + struct c2wr_hdr hdr; +} __attribute__((packed)) ; + +union c2wr_srq_destroy { + struct c2wr_srq_destroy_req req; + struct c2wr_srq_destroy_rep rep; +} __attribute__((packed)) ; + +/* + *------------------------ QP ------------------------ + */ +enum c2wr_qp_flags { + QP_RDMA_READ = 0x00000001, /* RDMA read enabled? */ + QP_RDMA_WRITE = 0x00000002, /* RDMA write enabled? */ + QP_MW_BIND = 0x00000004, /* MWs enabled */ + QP_ZERO_STAG = 0x00000008, /* enabled? */ + QP_REMOTE_TERMINATION = 0x00000010, /* remote end terminated */ + QP_RDMA_READ_RESPONSE = 0x00000020 /* Remote RDMA read */ + /* enabled? */ +}; + +struct c2wr_qp_create_req { + struct c2wr_hdr hdr; + __be64 shared_sq_ht; + __be64 shared_rq_ht; + u64 user_context; + u32 rnic_handle; + u32 sq_cq_handle; + u32 rq_cq_handle; + __be32 sq_depth; + __be32 rq_depth; + u32 srq_handle; + u32 srq_limit; + __be32 flags; /* see enum c2wr_qp_flags */ + __be32 send_sgl_depth; + __be32 recv_sgl_depth; + __be32 rdma_write_sgl_depth; + __be32 ord; + __be32 ird; + u32 pd_id; +} __attribute__((packed)) ; + +struct c2wr_qp_create_rep { + struct c2wr_hdr hdr; + __be32 sq_depth; + __be32 rq_depth; + u32 send_sgl_depth; + u32 recv_sgl_depth; + u32 rdma_write_sgl_depth; + u32 ord; + u32 ird; + __be32 sq_msg_size; + __be32 sq_mq_index; + __be32 sq_mq_start; + __be32 rq_msg_size; + __be32 rq_mq_index; + __be32 rq_mq_start; + u32 qp_handle; +} __attribute__((packed)) ; + +union c2wr_qp_create { + struct c2wr_qp_create_req req; + struct c2wr_qp_create_rep rep; +} __attribute__((packed)) ; + +struct c2wr_qp_query_req { + struct c2wr_hdr hdr; + u32 rnic_handle; + u32 qp_handle; +} __attribute__((packed)) ; + +struct c2wr_qp_query_rep { + struct c2wr_hdr hdr; + u64 user_context; + u32 rnic_handle; + u32 sq_depth; + u32 rq_depth; + u32 send_sgl_depth; + u32 rdma_write_sgl_depth; + u32 recv_sgl_depth; + u32 ord; + u32 ird; + u16 qp_state; + u16 flags; /* see c2wr_qp_flags_t */ + u32 qp_id; + u32 local_addr; + u32 remote_addr; + u16 local_port; + u16 remote_port; + u32 terminate_msg_length; /* 0 if not present */ + u8 data[0]; + /* Terminate Message in-line here. */ +} __attribute__((packed)) ; + +union c2wr_qp_query { + struct c2wr_qp_query_req req; + struct c2wr_qp_query_rep rep; +} __attribute__((packed)) ; + +struct c2wr_qp_modify_req { + struct c2wr_hdr hdr; + u64 stream_msg; + u32 stream_msg_length; + u32 rnic_handle; + u32 qp_handle; + __be32 next_qp_state; + __be32 ord; + __be32 ird; + __be32 sq_depth; + __be32 rq_depth; + u32 llp_ep_handle; +} __attribute__((packed)) ; + +struct c2wr_qp_modify_rep { + struct c2wr_hdr hdr; + u32 ord; + u32 ird; + u32 sq_depth; + u32 rq_depth; + u32 sq_msg_size; + u32 sq_mq_index; + u32 sq_mq_start; + u32 rq_msg_size; + u32 rq_mq_index; + u32 rq_mq_start; +} __attribute__((packed)) ; + +union c2wr_qp_modify { + struct c2wr_qp_modify_req req; + struct c2wr_qp_modify_rep rep; +} __attribute__((packed)) ; + +struct c2wr_qp_destroy_req { + struct c2wr_hdr hdr; + u32 rnic_handle; + u32 qp_handle; +} __attribute__((packed)) ; + +struct c2wr_qp_destroy_rep { + struct c2wr_hdr hdr; +} __attribute__((packed)) ; + +union c2wr_qp_destroy { + struct c2wr_qp_destroy_req req; + struct c2wr_qp_destroy_rep rep; +} __attribute__((packed)) ; + +/* + * The CCWR_QP_CONNECT msg is posted on the verbs request queue. It can + * only be posted when a QP is in IDLE state. After the connect request is + * submitted to the LLP, the adapter moves the QP to CONNECT_PENDING state. + * No synchronous reply from adapter to this WR. The results of + * connection are passed back in an async event CCAE_ACTIVE_CONNECT_RESULTS + * See c2wr_ae_active_connect_results_t + */ +struct c2wr_qp_connect_req { + struct c2wr_hdr hdr; + u32 rnic_handle; + u32 qp_handle; + __be32 remote_addr; + __be16 remote_port; + u16 pad; + __be32 private_data_length; + u8 private_data[0]; /* Private data in-line. */ +} __attribute__((packed)) ; + +struct c2wr_qp_connect { + struct c2wr_qp_connect_req req; + /* no synchronous reply. */ +} __attribute__((packed)) ; + + +/* + *------------------------ MM ------------------------ + */ + +struct c2wr_nsmr_stag_alloc_req { + struct c2wr_hdr hdr; + u32 rnic_handle; + u32 pbl_depth; + u32 pd_id; + u32 flags; +} __attribute__((packed)) ; + +struct c2wr_nsmr_stag_alloc_rep { + struct c2wr_hdr hdr; + u32 pbl_depth; + u32 stag_index; +} __attribute__((packed)) ; + +union c2wr_nsmr_stag_alloc { + struct c2wr_nsmr_stag_alloc_req req; + struct c2wr_nsmr_stag_alloc_rep rep; +} __attribute__((packed)) ; + +struct c2wr_nsmr_register_req { + struct c2wr_hdr hdr; + __be64 va; + u32 rnic_handle; + __be16 flags; + u8 stag_key; + u8 pad; + u32 pd_id; + __be32 pbl_depth; + __be32 pbe_size; + __be32 fbo; + __be32 length; + __be32 addrs_length; + /* array of paddrs (must be aligned on a 64bit boundary) */ + __be64 paddrs[0]; +} __attribute__((packed)) ; + +struct c2wr_nsmr_register_rep { + struct c2wr_hdr hdr; + u32 pbl_depth; + __be32 stag_index; +} __attribute__((packed)) ; + +union c2wr_nsmr_register { + struct c2wr_nsmr_register_req req; + struct c2wr_nsmr_register_rep rep; +} __attribute__((packed)) ; + +struct c2wr_nsmr_pbl_req { + struct c2wr_hdr hdr; + u32 rnic_handle; + __be32 flags; + __be32 stag_index; + __be32 addrs_length; + /* array of paddrs (must be aligned on a 64bit boundary) */ + __be64 paddrs[0]; +} __attribute__((packed)) ; + +struct c2wr_nsmr_pbl_rep { + struct c2wr_hdr hdr; +} __attribute__((packed)) ; + +union c2wr_nsmr_pbl { + struct c2wr_nsmr_pbl_req req; + struct c2wr_nsmr_pbl_rep rep; +} __attribute__((packed)) ; + +struct c2wr_mr_query_req { + struct c2wr_hdr hdr; + u32 rnic_handle; + u32 stag_index; +} __attribute__((packed)) ; + +struct c2wr_mr_query_rep { + struct c2wr_hdr hdr; + u8 stag_key; + u8 pad[3]; + u32 pd_id; + u32 flags; + u32 pbl_depth; +} __attribute__((packed)) ; + +union c2wr_mr_query { + struct c2wr_mr_query_req req; + struct c2wr_mr_query_rep rep; +} __attribute__((packed)) ; + +struct c2wr_mw_query_req { + struct c2wr_hdr hdr; + u32 rnic_handle; + u32 stag_index; +} __attribute__((packed)) ; + +struct c2wr_mw_query_rep { + struct c2wr_hdr hdr; + u8 stag_key; + u8 pad[3]; + u32 pd_id; + u32 flags; +} __attribute__((packed)) ; + +union c2wr_mw_query { + struct c2wr_mw_query_req req; + struct c2wr_mw_query_rep rep; +} __attribute__((packed)) ; + + +struct c2wr_stag_dealloc_req { + struct c2wr_hdr hdr; + u32 rnic_handle; + __be32 stag_index; +} __attribute__((packed)) ; + +struct c2wr_stag_dealloc_rep { + struct c2wr_hdr hdr; +} __attribute__((packed)) ; + +union c2wr_stag_dealloc { + struct c2wr_stag_dealloc_req req; + struct c2wr_stag_dealloc_rep rep; +} __attribute__((packed)) ; + +struct c2wr_nsmr_reregister_req { + struct c2wr_hdr hdr; + u64 va; + u32 rnic_handle; + u16 flags; + u8 stag_key; + u8 pad; + u32 stag_index; + u32 pd_id; + u32 pbl_depth; + u32 pbe_size; + u32 fbo; + u32 length; + u32 addrs_length; + u32 pad1; + /* array of paddrs (must be aligned on a 64bit boundary) */ + u64 paddrs[0]; +} __attribute__((packed)) ; + +struct c2wr_nsmr_reregister_rep { + struct c2wr_hdr hdr; + u32 pbl_depth; + u32 stag_index; +} __attribute__((packed)) ; + +union c2wr_nsmr_reregister { + struct c2wr_nsmr_reregister_req req; + struct c2wr_nsmr_reregister_rep rep; +} __attribute__((packed)) ; + +struct c2wr_smr_register_req { + struct c2wr_hdr hdr; + u64 va; + u32 rnic_handle; + u16 flags; + u8 stag_key; + u8 pad; + u32 stag_index; + u32 pd_id; +} __attribute__((packed)) ; + +struct c2wr_smr_register_rep { + struct c2wr_hdr hdr; + u32 stag_index; +} __attribute__((packed)) ; + +union c2wr_smr_register { + struct c2wr_smr_register_req req; + struct c2wr_smr_register_rep rep; +} __attribute__((packed)) ; + +struct c2wr_mw_alloc_req { + struct c2wr_hdr hdr; + u32 rnic_handle; + u32 pd_id; +} __attribute__((packed)) ; + +struct c2wr_mw_alloc_rep { + struct c2wr_hdr hdr; + u32 stag_index; +} __attribute__((packed)) ; + +union c2wr_mw_alloc { + struct c2wr_mw_alloc_req req; + struct c2wr_mw_alloc_rep rep; +} __attribute__((packed)) ; + +/* + *------------------------ WRs ----------------------- + */ + +struct c2wr_user_hdr { + struct c2wr_hdr hdr; /* Has status and WR Type */ +} __attribute__((packed)) ; + +enum c2_qp_state { + C2_QP_STATE_IDLE = 0x01, + C2_QP_STATE_CONNECTING = 0x02, + C2_QP_STATE_RTS = 0x04, + C2_QP_STATE_CLOSING = 0x08, + C2_QP_STATE_TERMINATE = 0x10, + C2_QP_STATE_ERROR = 0x20, +}; + +/* Completion queue entry. */ +struct c2wr_ce { + struct c2wr_hdr hdr; /* Has status and WR Type */ + u64 qp_user_context; /* c2_user_qp_t * */ + u32 qp_state; /* Current QP State */ + u32 handle; /* QPID or EP Handle */ + __be32 bytes_rcvd; /* valid for RECV WCs */ + u32 stag; +} __attribute__((packed)) ; + + +/* + * Flags used for all post-sq WRs. These must fit in the flags + * field of the struct c2wr_hdr (eight bits). + */ +enum { + SQ_SIGNALED = 0x01, + SQ_READ_FENCE = 0x02, + SQ_FENCE = 0x04, +}; + +/* + * Common fields for all post-sq WRs. Namely the standard header and a + * secondary header with fields common to all post-sq WRs. + */ +struct c2_sq_hdr { + struct c2wr_user_hdr user_hdr; +} __attribute__((packed)); + +/* + * Same as above but for post-rq WRs. + */ +struct c2_rq_hdr { + struct c2wr_user_hdr user_hdr; +} __attribute__((packed)); + +/* + * use the same struct for all sends. + */ +struct c2wr_send_req { + struct c2_sq_hdr sq_hdr; + __be32 sge_len; + __be32 remote_stag; + u8 data[0]; /* SGE array */ +} __attribute__((packed)); + +union c2wr_send { + struct c2wr_send_req req; + struct c2wr_ce rep; +} __attribute__((packed)); + +struct c2wr_rdma_write_req { + struct c2_sq_hdr sq_hdr; + __be64 remote_to; + __be32 remote_stag; + __be32 sge_len; + u8 data[0]; /* SGE array */ +} __attribute__((packed)); + +union c2wr_rdma_write { + struct c2wr_rdma_write_req req; + struct c2wr_ce rep; +} __attribute__((packed)); + +struct c2wr_rdma_read_req { + struct c2_sq_hdr sq_hdr; + __be64 local_to; + __be64 remote_to; + __be32 local_stag; + __be32 remote_stag; + __be32 length; +} __attribute__((packed)); + +union c2wr_rdma_read { + struct c2wr_rdma_read_req req; + struct c2wr_ce rep; +} __attribute__((packed)); + +struct c2wr_mw_bind_req { + struct c2_sq_hdr sq_hdr; + u64 va; + u8 stag_key; + u8 pad[3]; + u32 mw_stag_index; + u32 mr_stag_index; + u32 length; + u32 flags; +} __attribute__((packed)); + +union c2wr_mw_bind { + struct c2wr_mw_bind_req req; + struct c2wr_ce rep; +} __attribute__((packed)); + +struct c2wr_nsmr_fastreg_req { + struct c2_sq_hdr sq_hdr; + u64 va; + u8 stag_key; + u8 pad[3]; + u32 stag_index; + u32 pbe_size; + u32 fbo; + u32 length; + u32 addrs_length; + /* array of paddrs (must be aligned on a 64bit boundary) */ + u64 paddrs[0]; +} __attribute__((packed)); + +union c2wr_nsmr_fastreg { + struct c2wr_nsmr_fastreg_req req; + struct c2wr_ce rep; +} __attribute__((packed)); + +struct c2wr_stag_invalidate_req { + struct c2_sq_hdr sq_hdr; + u8 stag_key; + u8 pad[3]; + u32 stag_index; +} __attribute__((packed)); + +union c2wr_stag_invalidate { + struct c2wr_stag_invalidate_req req; + struct c2wr_ce rep; +} __attribute__((packed)); + +union c2wr_sqwr { + struct c2_sq_hdr sq_hdr; + struct c2wr_send_req send; + struct c2wr_send_req send_se; + struct c2wr_send_req send_inv; + struct c2wr_send_req send_se_inv; + struct c2wr_rdma_write_req rdma_write; + struct c2wr_rdma_read_req rdma_read; + struct c2wr_mw_bind_req mw_bind; + struct c2wr_nsmr_fastreg_req nsmr_fastreg; + struct c2wr_stag_invalidate_req stag_inv; +} __attribute__((packed)); + + +/* + * RQ WRs + */ +struct c2wr_rqwr { + struct c2_rq_hdr rq_hdr; + u8 data[0]; /* array of SGEs */ +} __attribute__((packed)); + +union c2wr_recv { + struct c2wr_rqwr req; + struct c2wr_ce rep; +} __attribute__((packed)); + +/* + * All AEs start with this header. Most AEs only need to convey the + * information in the header. Some, like LLP connection events, need + * more info. The union typdef c2wr_ae_t has all the possible AEs. + * + * hdr.context is the user_context from the rnic_open WR. NULL If this + * is not affiliated with an rnic + * + * hdr.id is the AE identifier (eg; CCAE_REMOTE_SHUTDOWN, + * CCAE_LLP_CLOSE_COMPLETE) + * + * resource_type is one of: C2_RES_IND_QP, C2_RES_IND_CQ, C2_RES_IND_SRQ + * + * user_context is the context passed down when the host created the resource. + */ +struct c2wr_ae_hdr { + struct c2wr_hdr hdr; + u64 user_context; /* user context for this res. */ + __be32 resource_type; /* see enum c2_resource_indicator */ + __be32 resource; /* handle for resource */ + __be32 qp_state; /* current QP State */ +} __attribute__((packed)); + +/* + * After submitting the CCAE_ACTIVE_CONNECT_RESULTS message on the AEQ, + * the adapter moves the QP into RTS state + */ +struct c2wr_ae_active_connect_results { + struct c2wr_ae_hdr ae_hdr; + __be32 laddr; + __be32 raddr; + __be16 lport; + __be16 rport; + __be32 private_data_length; + u8 private_data[0]; /* data is in-line in the msg. */ +} __attribute__((packed)); + +/* + * When connections are established by the stack (and the private data + * MPA frame is received), the adapter will generate an event to the host. + * The details of the connection, any private data, and the new connection + * request handle is passed up via the CCAE_CONNECTION_REQUEST msg on the + * AE queue: + */ +struct c2wr_ae_connection_request { + struct c2wr_ae_hdr ae_hdr; + u32 cr_handle; /* connreq handle (sock ptr) */ + __be32 laddr; + __be32 raddr; + __be16 lport; + __be16 rport; + __be32 private_data_length; + u8 private_data[0]; /* data is in-line in the msg. */ +} __attribute__((packed)); + +union c2wr_ae { + struct c2wr_ae_hdr ae_generic; + struct c2wr_ae_active_connect_results ae_active_connect_results; + struct c2wr_ae_connection_request ae_connection_request; +} __attribute__((packed)); + +struct c2wr_init_req { + struct c2wr_hdr hdr; + __be64 hint_count; + __be64 q0_host_shared; + __be64 q1_host_shared; + __be64 q1_host_msg_pool; + __be64 q2_host_shared; + __be64 q2_host_msg_pool; +} __attribute__((packed)); + +struct c2wr_init_rep { + struct c2wr_hdr hdr; +} __attribute__((packed)); + +union c2wr_init { + struct c2wr_init_req req; + struct c2wr_init_rep rep; +} __attribute__((packed)); + +/* + * For upgrading flash. + */ + +struct c2wr_flash_init_req { + struct c2wr_hdr hdr; + u32 rnic_handle; +} __attribute__((packed)); + +struct c2wr_flash_init_rep { + struct c2wr_hdr hdr; + u32 adapter_flash_buf_offset; + u32 adapter_flash_len; +} __attribute__((packed)); + +union c2wr_flash_init { + struct c2wr_flash_init_req req; + struct c2wr_flash_init_rep rep; +} __attribute__((packed)); + +struct c2wr_flash_req { + struct c2wr_hdr hdr; + u32 rnic_handle; + u32 len; +} __attribute__((packed)); + +struct c2wr_flash_rep { + struct c2wr_hdr hdr; + u32 status; +} __attribute__((packed)); + +union c2wr_flash { + struct c2wr_flash_req req; + struct c2wr_flash_rep rep; +} __attribute__((packed)); + +struct c2wr_buf_alloc_req { + struct c2wr_hdr hdr; + u32 rnic_handle; + u32 size; +} __attribute__((packed)); + +struct c2wr_buf_alloc_rep { + struct c2wr_hdr hdr; + u32 offset; /* 0 if mem not available */ + u32 size; /* 0 if mem not available */ +} __attribute__((packed)); + +union c2wr_buf_alloc { + struct c2wr_buf_alloc_req req; + struct c2wr_buf_alloc_rep rep; +} __attribute__((packed)); + +struct c2wr_buf_free_req { + struct c2wr_hdr hdr; + u32 rnic_handle; + u32 offset; /* Must match value from alloc */ + u32 size; /* Must match value from alloc */ +} __attribute__((packed)); + +struct c2wr_buf_free_rep { + struct c2wr_hdr hdr; +} __attribute__((packed)); + +union c2wr_buf_free { + struct c2wr_buf_free_req req; + struct c2wr_ce rep; +} __attribute__((packed)); + +struct c2wr_flash_write_req { + struct c2wr_hdr hdr; + u32 rnic_handle; + u32 offset; + u32 size; + u32 type; + u32 flags; +} __attribute__((packed)); + +struct c2wr_flash_write_rep { + struct c2wr_hdr hdr; + u32 status; +} __attribute__((packed)); + +union c2wr_flash_write { + struct c2wr_flash_write_req req; + struct c2wr_flash_write_rep rep; +} __attribute__((packed)); + +/* + * Messages for LLP connection setup. + */ + +/* + * Listen Request. This allocates a listening endpoint to allow passive + * connection setup. Newly established LLP connections are passed up + * via an AE. See c2wr_ae_connection_request_t + */ +struct c2wr_ep_listen_create_req { + struct c2wr_hdr hdr; + u64 user_context; /* returned in AEs. */ + u32 rnic_handle; + __be32 local_addr; /* local addr, or 0 */ + __be16 local_port; /* 0 means "pick one" */ + u16 pad; + __be32 backlog; /* tradional tcp listen bl */ +} __attribute__((packed)); + +struct c2wr_ep_listen_create_rep { + struct c2wr_hdr hdr; + u32 ep_handle; /* handle to new listening ep */ + u16 local_port; /* resulting port... */ + u16 pad; +} __attribute__((packed)); + +union c2wr_ep_listen_create { + struct c2wr_ep_listen_create_req req; + struct c2wr_ep_listen_create_rep rep; +} __attribute__((packed)); + +struct c2wr_ep_listen_destroy_req { + struct c2wr_hdr hdr; + u32 rnic_handle; + u32 ep_handle; +} __attribute__((packed)); + +struct c2wr_ep_listen_destroy_rep { + struct c2wr_hdr hdr; +} __attribute__((packed)); + +union c2wr_ep_listen_destroy { + struct c2wr_ep_listen_destroy_req req; + struct c2wr_ep_listen_destroy_rep rep; +} __attribute__((packed)); + +struct c2wr_ep_query_req { + struct c2wr_hdr hdr; + u32 rnic_handle; + u32 ep_handle; +} __attribute__((packed)); + +struct c2wr_ep_query_rep { + struct c2wr_hdr hdr; + u32 rnic_handle; + u32 local_addr; + u32 remote_addr; + u16 local_port; + u16 remote_port; +} __attribute__((packed)); + +union c2wr_ep_query { + struct c2wr_ep_query_req req; + struct c2wr_ep_query_rep rep; +} __attribute__((packed)); + + +/* + * The host passes this down to indicate acceptance of a pending iWARP + * connection. The cr_handle was obtained from the CONNECTION_REQUEST + * AE passed up by the adapter. See c2wr_ae_connection_request_t. + */ +struct c2wr_cr_accept_req { + struct c2wr_hdr hdr; + u32 rnic_handle; + u32 qp_handle; /* QP to bind to this LLP conn */ + u32 ep_handle; /* LLP handle to accept */ + __be32 private_data_length; + u8 private_data[0]; /* data in-line in msg. */ +} __attribute__((packed)); + +/* + * adapter sends reply when private data is successfully submitted to + * the LLP. + */ +struct c2wr_cr_accept_rep { + struct c2wr_hdr hdr; +} __attribute__((packed)); + +union c2wr_cr_accept { + struct c2wr_cr_accept_req req; + struct c2wr_cr_accept_rep rep; +} __attribute__((packed)); + +/* + * The host sends this down if a given iWARP connection request was + * rejected by the consumer. The cr_handle was obtained from a + * previous c2wr_ae_connection_request_t AE sent by the adapter. + */ +struct c2wr_cr_reject_req { + struct c2wr_hdr hdr; + u32 rnic_handle; + u32 ep_handle; /* LLP handle to reject */ +} __attribute__((packed)); + +/* + * Dunno if this is needed, but we'll add it for now. The adapter will + * send the reject_reply after the LLP endpoint has been destroyed. + */ +struct c2wr_cr_reject_rep { + struct c2wr_hdr hdr; +} __attribute__((packed)); + +union c2wr_cr_reject { + struct c2wr_cr_reject_req req; + struct c2wr_cr_reject_rep rep; +} __attribute__((packed)); + +/* + * console command. Used to implement a debug console over the verbs + * request and reply queues. + */ + +/* + * Console request message. It contains: + * - message hdr with id = CCWR_CONSOLE + * - the physaddr/len of host memory to be used for the reply. + * - the command string. eg: "netstat -s" or "zoneinfo" + */ +struct c2wr_console_req { + struct c2wr_hdr hdr; /* id = CCWR_CONSOLE */ + u64 reply_buf; /* pinned host buf for reply */ + u32 reply_buf_len; /* length of reply buffer */ + u8 command[0]; /* NUL terminated ascii string */ + /* containing the command req */ +} __attribute__((packed)); + +/* + * flags used in the console reply. + */ +enum c2_console_flags { + CONS_REPLY_TRUNCATED = 0x00000001 /* reply was truncated */ +} __attribute__((packed)); + +/* + * Console reply message. + * hdr.result contains the c2_status_t error if the reply was _not_ generated, + * or C2_OK if the reply was generated. + */ +struct c2wr_console_rep { + struct c2wr_hdr hdr; /* id = CCWR_CONSOLE */ + u32 flags; +} __attribute__((packed)); + +union c2wr_console { + struct c2wr_console_req req; + struct c2wr_console_rep rep; +} __attribute__((packed)); + + +/* + * Giant union with all WRs. Makes life easier... + */ +union c2wr { + struct c2wr_hdr hdr; + struct c2wr_user_hdr user_hdr; + union c2wr_rnic_open rnic_open; + union c2wr_rnic_query rnic_query; + union c2wr_rnic_getconfig rnic_getconfig; + union c2wr_rnic_setconfig rnic_setconfig; + union c2wr_rnic_close rnic_close; + union c2wr_cq_create cq_create; + union c2wr_cq_modify cq_modify; + union c2wr_cq_destroy cq_destroy; + union c2wr_pd_alloc pd_alloc; + union c2wr_pd_dealloc pd_dealloc; + union c2wr_srq_create srq_create; + union c2wr_srq_destroy srq_destroy; + union c2wr_qp_create qp_create; + union c2wr_qp_query qp_query; + union c2wr_qp_modify qp_modify; + union c2wr_qp_destroy qp_destroy; + struct c2wr_qp_connect qp_connect; + union c2wr_nsmr_stag_alloc nsmr_stag_alloc; + union c2wr_nsmr_register nsmr_register; + union c2wr_nsmr_pbl nsmr_pbl; + union c2wr_mr_query mr_query; + union c2wr_mw_query mw_query; + union c2wr_stag_dealloc stag_dealloc; + union c2wr_sqwr sqwr; + struct c2wr_rqwr rqwr; + struct c2wr_ce ce; + union c2wr_ae ae; + union c2wr_init init; + union c2wr_ep_listen_create ep_listen_create; + union c2wr_ep_listen_destroy ep_listen_destroy; + union c2wr_cr_accept cr_accept; + union c2wr_cr_reject cr_reject; + union c2wr_console console; + union c2wr_flash_init flash_init; + union c2wr_flash flash; + union c2wr_buf_alloc buf_alloc; + union c2wr_buf_free buf_free; + union c2wr_flash_write flash_write; +} __attribute__((packed)); + + +/* + * Accessors for the wr fields that are packed together tightly to + * reduce the wr message size. The wr arguments are void* so that + * either a struct c2wr*, a struct c2wr_hdr*, or a pointer to any of the types + * in the struct c2wr union can be passed in. + */ +static __inline__ u8 c2_wr_get_id(void *wr) +{ + return ((struct c2wr_hdr *) wr)->id; +} +static __inline__ void c2_wr_set_id(void *wr, u8 id) +{ + ((struct c2wr_hdr *) wr)->id = id; +} +static __inline__ u8 c2_wr_get_result(void *wr) +{ + return ((struct c2wr_hdr *) wr)->result; +} +static __inline__ void c2_wr_set_result(void *wr, u8 result) +{ + ((struct c2wr_hdr *) wr)->result = result; +} +static __inline__ u8 c2_wr_get_flags(void *wr) +{ + return ((struct c2wr_hdr *) wr)->flags; +} +static __inline__ void c2_wr_set_flags(void *wr, u8 flags) +{ + ((struct c2wr_hdr *) wr)->flags = flags; +} +static __inline__ u8 c2_wr_get_sge_count(void *wr) +{ + return ((struct c2wr_hdr *) wr)->sge_count; +} +static __inline__ void c2_wr_set_sge_count(void *wr, u8 sge_count) +{ + ((struct c2wr_hdr *) wr)->sge_count = sge_count; +} +static __inline__ __be32 c2_wr_get_wqe_count(void *wr) +{ + return ((struct c2wr_hdr *) wr)->wqe_count; +} +static __inline__ void c2_wr_set_wqe_count(void *wr, u32 wqe_count) +{ + ((struct c2wr_hdr *) wr)->wqe_count = wqe_count; +} + +#endif /* _C2_WR_H_ */ diff --git a/drivers/infiniband/hw/cxgb3/Kconfig b/drivers/infiniband/hw/cxgb3/Kconfig new file mode 100644 index 0000000..2b6352b --- /dev/null +++ b/drivers/infiniband/hw/cxgb3/Kconfig @@ -0,0 +1,27 @@ +config INFINIBAND_CXGB3 + tristate "Chelsio RDMA Driver" + depends on CHELSIO_T3 && INET + select GENERIC_ALLOCATOR + ---help--- + This is an iWARP/RDMA driver for the Chelsio T3 1GbE and + 10GbE adapters. + + For general information about Chelsio and our products, visit + our website at . + + For customer support, please visit our customer support page at + . + + Please send feedback to . + + To compile this driver as a module, choose M here: the module + will be called iw_cxgb3. + +config INFINIBAND_CXGB3_DEBUG + bool "Verbose debugging output" + depends on INFINIBAND_CXGB3 + default n + ---help--- + This option causes the Chelsio RDMA driver to produce copious + amounts of debug messages. Select this if you are developing + the driver or trying to diagnose a problem. diff --git a/drivers/infiniband/hw/cxgb3/Makefile b/drivers/infiniband/hw/cxgb3/Makefile new file mode 100644 index 0000000..2761364 --- /dev/null +++ b/drivers/infiniband/hw/cxgb3/Makefile @@ -0,0 +1,8 @@ +ccflags-y := -Idrivers/net/ethernet/chelsio/cxgb3 + +obj-$(CONFIG_INFINIBAND_CXGB3) += iw_cxgb3.o + +iw_cxgb3-y := iwch_cm.o iwch_ev.o iwch_cq.o iwch_qp.o iwch_mem.o \ + iwch_provider.o iwch.o cxio_hal.o cxio_resource.o + +ccflags-$(CONFIG_INFINIBAND_CXGB3_DEBUG) += -DDEBUG diff --git a/drivers/infiniband/hw/cxgb3/cxio_dbg.c b/drivers/infiniband/hw/cxgb3/cxio_dbg.c new file mode 100644 index 0000000..8bca6b4 --- /dev/null +++ b/drivers/infiniband/hw/cxgb3/cxio_dbg.c @@ -0,0 +1,207 @@ +/* + * Copyright (c) 2006 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifdef DEBUG +#include +#include +#include "common.h" +#include "cxgb3_ioctl.h" +#include "cxio_hal.h" +#include "cxio_wr.h" + +void cxio_dump_tpt(struct cxio_rdev *rdev, u32 stag) +{ + struct ch_mem_range *m; + u64 *data; + int rc; + int size = 32; + + m = kmalloc(sizeof(*m) + size, GFP_ATOMIC); + if (!m) { + PDBG("%s couldn't allocate memory.\n", __func__); + return; + } + m->mem_id = MEM_PMRX; + m->addr = (stag>>8) * 32 + rdev->rnic_info.tpt_base; + m->len = size; + PDBG("%s TPT addr 0x%x len %d\n", __func__, m->addr, m->len); + rc = rdev->t3cdev_p->ctl(rdev->t3cdev_p, RDMA_GET_MEM, m); + if (rc) { + PDBG("%s toectl returned error %d\n", __func__, rc); + kfree(m); + return; + } + + data = (u64 *)m->buf; + while (size > 0) { + PDBG("TPT %08x: %016llx\n", m->addr, (unsigned long long) *data); + size -= 8; + data++; + m->addr += 8; + } + kfree(m); +} + +void cxio_dump_pbl(struct cxio_rdev *rdev, u32 pbl_addr, uint len, u8 shift) +{ + struct ch_mem_range *m; + u64 *data; + int rc; + int size, npages; + + shift += 12; + npages = (len + (1ULL << shift) - 1) >> shift; + size = npages * sizeof(u64); + + m = kmalloc(sizeof(*m) + size, GFP_ATOMIC); + if (!m) { + PDBG("%s couldn't allocate memory.\n", __func__); + return; + } + m->mem_id = MEM_PMRX; + m->addr = pbl_addr; + m->len = size; + PDBG("%s PBL addr 0x%x len %d depth %d\n", + __func__, m->addr, m->len, npages); + rc = rdev->t3cdev_p->ctl(rdev->t3cdev_p, RDMA_GET_MEM, m); + if (rc) { + PDBG("%s toectl returned error %d\n", __func__, rc); + kfree(m); + return; + } + + data = (u64 *)m->buf; + while (size > 0) { + PDBG("PBL %08x: %016llx\n", m->addr, (unsigned long long) *data); + size -= 8; + data++; + m->addr += 8; + } + kfree(m); +} + +void cxio_dump_wqe(union t3_wr *wqe) +{ + __be64 *data = (__be64 *)wqe; + uint size = (uint)(be64_to_cpu(*data) & 0xff); + + if (size == 0) + size = 8; + while (size > 0) { + PDBG("WQE %p: %016llx\n", data, + (unsigned long long) be64_to_cpu(*data)); + size--; + data++; + } +} + +void cxio_dump_wce(struct t3_cqe *wce) +{ + __be64 *data = (__be64 *)wce; + int size = sizeof(*wce); + + while (size > 0) { + PDBG("WCE %p: %016llx\n", data, + (unsigned long long) be64_to_cpu(*data)); + size -= 8; + data++; + } +} + +void cxio_dump_rqt(struct cxio_rdev *rdev, u32 hwtid, int nents) +{ + struct ch_mem_range *m; + int size = nents * 64; + u64 *data; + int rc; + + m = kmalloc(sizeof(*m) + size, GFP_ATOMIC); + if (!m) { + PDBG("%s couldn't allocate memory.\n", __func__); + return; + } + m->mem_id = MEM_PMRX; + m->addr = ((hwtid)<<10) + rdev->rnic_info.rqt_base; + m->len = size; + PDBG("%s RQT addr 0x%x len %d\n", __func__, m->addr, m->len); + rc = rdev->t3cdev_p->ctl(rdev->t3cdev_p, RDMA_GET_MEM, m); + if (rc) { + PDBG("%s toectl returned error %d\n", __func__, rc); + kfree(m); + return; + } + + data = (u64 *)m->buf; + while (size > 0) { + PDBG("RQT %08x: %016llx\n", m->addr, (unsigned long long) *data); + size -= 8; + data++; + m->addr += 8; + } + kfree(m); +} + +void cxio_dump_tcb(struct cxio_rdev *rdev, u32 hwtid) +{ + struct ch_mem_range *m; + int size = TCB_SIZE; + u32 *data; + int rc; + + m = kmalloc(sizeof(*m) + size, GFP_ATOMIC); + if (!m) { + PDBG("%s couldn't allocate memory.\n", __func__); + return; + } + m->mem_id = MEM_CM; + m->addr = hwtid * size; + m->len = size; + PDBG("%s TCB %d len %d\n", __func__, m->addr, m->len); + rc = rdev->t3cdev_p->ctl(rdev->t3cdev_p, RDMA_GET_MEM, m); + if (rc) { + PDBG("%s toectl returned error %d\n", __func__, rc); + kfree(m); + return; + } + + data = (u32 *)m->buf; + while (size > 0) { + printk("%2u: %08x %08x %08x %08x %08x %08x %08x %08x\n", + m->addr, + *(data+2), *(data+3), *(data),*(data+1), + *(data+6), *(data+7), *(data+4), *(data+5)); + size -= 32; + data += 8; + m->addr += 32; + } + kfree(m); +} +#endif diff --git a/drivers/infiniband/hw/cxgb3/cxio_hal.c b/drivers/infiniband/hw/cxgb3/cxio_hal.c new file mode 100644 index 0000000..de1c61b --- /dev/null +++ b/drivers/infiniband/hw/cxgb3/cxio_hal.c @@ -0,0 +1,1343 @@ +/* + * Copyright (c) 2006 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cxio_resource.h" +#include "cxio_hal.h" +#include "cxgb3_offload.h" +#include "sge_defs.h" + +static LIST_HEAD(rdev_list); +static cxio_hal_ev_callback_func_t cxio_ev_cb = NULL; + +static struct cxio_rdev *cxio_hal_find_rdev_by_name(char *dev_name) +{ + struct cxio_rdev *rdev; + + list_for_each_entry(rdev, &rdev_list, entry) + if (!strcmp(rdev->dev_name, dev_name)) + return rdev; + return NULL; +} + +static struct cxio_rdev *cxio_hal_find_rdev_by_t3cdev(struct t3cdev *tdev) +{ + struct cxio_rdev *rdev; + + list_for_each_entry(rdev, &rdev_list, entry) + if (rdev->t3cdev_p == tdev) + return rdev; + return NULL; +} + +int cxio_hal_cq_op(struct cxio_rdev *rdev_p, struct t3_cq *cq, + enum t3_cq_opcode op, u32 credit) +{ + int ret; + struct t3_cqe *cqe; + u32 rptr; + + struct rdma_cq_op setup; + setup.id = cq->cqid; + setup.credits = (op == CQ_CREDIT_UPDATE) ? credit : 0; + setup.op = op; + ret = rdev_p->t3cdev_p->ctl(rdev_p->t3cdev_p, RDMA_CQ_OP, &setup); + + if ((ret < 0) || (op == CQ_CREDIT_UPDATE)) + return ret; + + /* + * If the rearm returned an index other than our current index, + * then there might be CQE's in flight (being DMA'd). We must wait + * here for them to complete or the consumer can miss a notification. + */ + if (Q_PTR2IDX((cq->rptr), cq->size_log2) != ret) { + int i=0; + + rptr = cq->rptr; + + /* + * Keep the generation correct by bumping rptr until it + * matches the index returned by the rearm - 1. + */ + while (Q_PTR2IDX((rptr+1), cq->size_log2) != ret) + rptr++; + + /* + * Now rptr is the index for the (last) cqe that was + * in-flight at the time the HW rearmed the CQ. We + * spin until that CQE is valid. + */ + cqe = cq->queue + Q_PTR2IDX(rptr, cq->size_log2); + while (!CQ_VLD_ENTRY(rptr, cq->size_log2, cqe)) { + udelay(1); + if (i++ > 1000000) { + printk(KERN_ERR "%s: stalled rnic\n", + rdev_p->dev_name); + return -EIO; + } + } + + return 1; + } + + return 0; +} + +static int cxio_hal_clear_cq_ctx(struct cxio_rdev *rdev_p, u32 cqid) +{ + struct rdma_cq_setup setup; + setup.id = cqid; + setup.base_addr = 0; /* NULL address */ + setup.size = 0; /* disaable the CQ */ + setup.credits = 0; + setup.credit_thres = 0; + setup.ovfl_mode = 0; + return (rdev_p->t3cdev_p->ctl(rdev_p->t3cdev_p, RDMA_CQ_SETUP, &setup)); +} + +static int cxio_hal_clear_qp_ctx(struct cxio_rdev *rdev_p, u32 qpid) +{ + u64 sge_cmd; + struct t3_modify_qp_wr *wqe; + struct sk_buff *skb = alloc_skb(sizeof(*wqe), GFP_KERNEL); + if (!skb) { + PDBG("%s alloc_skb failed\n", __func__); + return -ENOMEM; + } + wqe = (struct t3_modify_qp_wr *) skb_put(skb, sizeof(*wqe)); + memset(wqe, 0, sizeof(*wqe)); + build_fw_riwrh((struct fw_riwrh *) wqe, T3_WR_QP_MOD, + T3_COMPLETION_FLAG | T3_NOTIFY_FLAG, 0, qpid, 7, + T3_SOPEOP); + wqe->flags = cpu_to_be32(MODQP_WRITE_EC); + sge_cmd = qpid << 8 | 3; + wqe->sge_cmd = cpu_to_be64(sge_cmd); + skb->priority = CPL_PRIORITY_CONTROL; + return iwch_cxgb3_ofld_send(rdev_p->t3cdev_p, skb); +} + +int cxio_create_cq(struct cxio_rdev *rdev_p, struct t3_cq *cq, int kernel) +{ + struct rdma_cq_setup setup; + int size = (1UL << (cq->size_log2)) * sizeof(struct t3_cqe); + + size += 1; /* one extra page for storing cq-in-err state */ + cq->cqid = cxio_hal_get_cqid(rdev_p->rscp); + if (!cq->cqid) + return -ENOMEM; + if (kernel) { + cq->sw_queue = kzalloc(size, GFP_KERNEL); + if (!cq->sw_queue) + return -ENOMEM; + } + cq->queue = dma_alloc_coherent(&(rdev_p->rnic_info.pdev->dev), size, + &(cq->dma_addr), GFP_KERNEL); + if (!cq->queue) { + kfree(cq->sw_queue); + return -ENOMEM; + } + dma_unmap_addr_set(cq, mapping, cq->dma_addr); + memset(cq->queue, 0, size); + setup.id = cq->cqid; + setup.base_addr = (u64) (cq->dma_addr); + setup.size = 1UL << cq->size_log2; + setup.credits = 65535; + setup.credit_thres = 1; + if (rdev_p->t3cdev_p->type != T3A) + setup.ovfl_mode = 0; + else + setup.ovfl_mode = 1; + return (rdev_p->t3cdev_p->ctl(rdev_p->t3cdev_p, RDMA_CQ_SETUP, &setup)); +} + +#ifdef notyet +int cxio_resize_cq(struct cxio_rdev *rdev_p, struct t3_cq *cq) +{ + struct rdma_cq_setup setup; + setup.id = cq->cqid; + setup.base_addr = (u64) (cq->dma_addr); + setup.size = 1UL << cq->size_log2; + setup.credits = setup.size; + setup.credit_thres = setup.size; /* TBD: overflow recovery */ + setup.ovfl_mode = 1; + return (rdev_p->t3cdev_p->ctl(rdev_p->t3cdev_p, RDMA_CQ_SETUP, &setup)); +} +#endif + +static u32 get_qpid(struct cxio_rdev *rdev_p, struct cxio_ucontext *uctx) +{ + struct cxio_qpid_list *entry; + u32 qpid; + int i; + + mutex_lock(&uctx->lock); + if (!list_empty(&uctx->qpids)) { + entry = list_entry(uctx->qpids.next, struct cxio_qpid_list, + entry); + list_del(&entry->entry); + qpid = entry->qpid; + kfree(entry); + } else { + qpid = cxio_hal_get_qpid(rdev_p->rscp); + if (!qpid) + goto out; + for (i = qpid+1; i & rdev_p->qpmask; i++) { + entry = kmalloc(sizeof *entry, GFP_KERNEL); + if (!entry) + break; + entry->qpid = i; + list_add_tail(&entry->entry, &uctx->qpids); + } + } +out: + mutex_unlock(&uctx->lock); + PDBG("%s qpid 0x%x\n", __func__, qpid); + return qpid; +} + +static void put_qpid(struct cxio_rdev *rdev_p, u32 qpid, + struct cxio_ucontext *uctx) +{ + struct cxio_qpid_list *entry; + + entry = kmalloc(sizeof *entry, GFP_KERNEL); + if (!entry) + return; + PDBG("%s qpid 0x%x\n", __func__, qpid); + entry->qpid = qpid; + mutex_lock(&uctx->lock); + list_add_tail(&entry->entry, &uctx->qpids); + mutex_unlock(&uctx->lock); +} + +void cxio_release_ucontext(struct cxio_rdev *rdev_p, struct cxio_ucontext *uctx) +{ + struct list_head *pos, *nxt; + struct cxio_qpid_list *entry; + + mutex_lock(&uctx->lock); + list_for_each_safe(pos, nxt, &uctx->qpids) { + entry = list_entry(pos, struct cxio_qpid_list, entry); + list_del_init(&entry->entry); + if (!(entry->qpid & rdev_p->qpmask)) + cxio_hal_put_qpid(rdev_p->rscp, entry->qpid); + kfree(entry); + } + mutex_unlock(&uctx->lock); +} + +void cxio_init_ucontext(struct cxio_rdev *rdev_p, struct cxio_ucontext *uctx) +{ + INIT_LIST_HEAD(&uctx->qpids); + mutex_init(&uctx->lock); +} + +int cxio_create_qp(struct cxio_rdev *rdev_p, u32 kernel_domain, + struct t3_wq *wq, struct cxio_ucontext *uctx) +{ + int depth = 1UL << wq->size_log2; + int rqsize = 1UL << wq->rq_size_log2; + + wq->qpid = get_qpid(rdev_p, uctx); + if (!wq->qpid) + return -ENOMEM; + + wq->rq = kzalloc(depth * sizeof(struct t3_swrq), GFP_KERNEL); + if (!wq->rq) + goto err1; + + wq->rq_addr = cxio_hal_rqtpool_alloc(rdev_p, rqsize); + if (!wq->rq_addr) + goto err2; + + wq->sq = kzalloc(depth * sizeof(struct t3_swsq), GFP_KERNEL); + if (!wq->sq) + goto err3; + + wq->queue = dma_alloc_coherent(&(rdev_p->rnic_info.pdev->dev), + depth * sizeof(union t3_wr), + &(wq->dma_addr), GFP_KERNEL); + if (!wq->queue) + goto err4; + + memset(wq->queue, 0, depth * sizeof(union t3_wr)); + dma_unmap_addr_set(wq, mapping, wq->dma_addr); + wq->doorbell = (void __iomem *)rdev_p->rnic_info.kdb_addr; + if (!kernel_domain) + wq->udb = (u64)rdev_p->rnic_info.udbell_physbase + + (wq->qpid << rdev_p->qpshift); + wq->rdev = rdev_p; + PDBG("%s qpid 0x%x doorbell 0x%p udb 0x%llx\n", __func__, + wq->qpid, wq->doorbell, (unsigned long long) wq->udb); + return 0; +err4: + kfree(wq->sq); +err3: + cxio_hal_rqtpool_free(rdev_p, wq->rq_addr, rqsize); +err2: + kfree(wq->rq); +err1: + put_qpid(rdev_p, wq->qpid, uctx); + return -ENOMEM; +} + +int cxio_destroy_cq(struct cxio_rdev *rdev_p, struct t3_cq *cq) +{ + int err; + err = cxio_hal_clear_cq_ctx(rdev_p, cq->cqid); + kfree(cq->sw_queue); + dma_free_coherent(&(rdev_p->rnic_info.pdev->dev), + (1UL << (cq->size_log2)) + * sizeof(struct t3_cqe), cq->queue, + dma_unmap_addr(cq, mapping)); + cxio_hal_put_cqid(rdev_p->rscp, cq->cqid); + return err; +} + +int cxio_destroy_qp(struct cxio_rdev *rdev_p, struct t3_wq *wq, + struct cxio_ucontext *uctx) +{ + dma_free_coherent(&(rdev_p->rnic_info.pdev->dev), + (1UL << (wq->size_log2)) + * sizeof(union t3_wr), wq->queue, + dma_unmap_addr(wq, mapping)); + kfree(wq->sq); + cxio_hal_rqtpool_free(rdev_p, wq->rq_addr, (1UL << wq->rq_size_log2)); + kfree(wq->rq); + put_qpid(rdev_p, wq->qpid, uctx); + return 0; +} + +static void insert_recv_cqe(struct t3_wq *wq, struct t3_cq *cq) +{ + struct t3_cqe cqe; + + PDBG("%s wq %p cq %p sw_rptr 0x%x sw_wptr 0x%x\n", __func__, + wq, cq, cq->sw_rptr, cq->sw_wptr); + memset(&cqe, 0, sizeof(cqe)); + cqe.header = cpu_to_be32(V_CQE_STATUS(TPT_ERR_SWFLUSH) | + V_CQE_OPCODE(T3_SEND) | + V_CQE_TYPE(0) | + V_CQE_SWCQE(1) | + V_CQE_QPID(wq->qpid) | + V_CQE_GENBIT(Q_GENBIT(cq->sw_wptr, + cq->size_log2))); + *(cq->sw_queue + Q_PTR2IDX(cq->sw_wptr, cq->size_log2)) = cqe; + cq->sw_wptr++; +} + +int cxio_flush_rq(struct t3_wq *wq, struct t3_cq *cq, int count) +{ + u32 ptr; + int flushed = 0; + + PDBG("%s wq %p cq %p\n", __func__, wq, cq); + + /* flush RQ */ + PDBG("%s rq_rptr %u rq_wptr %u skip count %u\n", __func__, + wq->rq_rptr, wq->rq_wptr, count); + ptr = wq->rq_rptr + count; + while (ptr++ != wq->rq_wptr) { + insert_recv_cqe(wq, cq); + flushed++; + } + return flushed; +} + +static void insert_sq_cqe(struct t3_wq *wq, struct t3_cq *cq, + struct t3_swsq *sqp) +{ + struct t3_cqe cqe; + + PDBG("%s wq %p cq %p sw_rptr 0x%x sw_wptr 0x%x\n", __func__, + wq, cq, cq->sw_rptr, cq->sw_wptr); + memset(&cqe, 0, sizeof(cqe)); + cqe.header = cpu_to_be32(V_CQE_STATUS(TPT_ERR_SWFLUSH) | + V_CQE_OPCODE(sqp->opcode) | + V_CQE_TYPE(1) | + V_CQE_SWCQE(1) | + V_CQE_QPID(wq->qpid) | + V_CQE_GENBIT(Q_GENBIT(cq->sw_wptr, + cq->size_log2))); + cqe.u.scqe.wrid_hi = sqp->sq_wptr; + + *(cq->sw_queue + Q_PTR2IDX(cq->sw_wptr, cq->size_log2)) = cqe; + cq->sw_wptr++; +} + +int cxio_flush_sq(struct t3_wq *wq, struct t3_cq *cq, int count) +{ + __u32 ptr; + int flushed = 0; + struct t3_swsq *sqp = wq->sq + Q_PTR2IDX(wq->sq_rptr, wq->sq_size_log2); + + ptr = wq->sq_rptr + count; + sqp = wq->sq + Q_PTR2IDX(ptr, wq->sq_size_log2); + while (ptr != wq->sq_wptr) { + sqp->signaled = 0; + insert_sq_cqe(wq, cq, sqp); + ptr++; + sqp = wq->sq + Q_PTR2IDX(ptr, wq->sq_size_log2); + flushed++; + } + return flushed; +} + +/* + * Move all CQEs from the HWCQ into the SWCQ. + */ +void cxio_flush_hw_cq(struct t3_cq *cq) +{ + struct t3_cqe *cqe, *swcqe; + + PDBG("%s cq %p cqid 0x%x\n", __func__, cq, cq->cqid); + cqe = cxio_next_hw_cqe(cq); + while (cqe) { + PDBG("%s flushing hwcq rptr 0x%x to swcq wptr 0x%x\n", + __func__, cq->rptr, cq->sw_wptr); + swcqe = cq->sw_queue + Q_PTR2IDX(cq->sw_wptr, cq->size_log2); + *swcqe = *cqe; + swcqe->header |= cpu_to_be32(V_CQE_SWCQE(1)); + cq->sw_wptr++; + cq->rptr++; + cqe = cxio_next_hw_cqe(cq); + } +} + +static int cqe_completes_wr(struct t3_cqe *cqe, struct t3_wq *wq) +{ + if (CQE_OPCODE(*cqe) == T3_TERMINATE) + return 0; + + if ((CQE_OPCODE(*cqe) == T3_RDMA_WRITE) && RQ_TYPE(*cqe)) + return 0; + + if ((CQE_OPCODE(*cqe) == T3_READ_RESP) && SQ_TYPE(*cqe)) + return 0; + + if (CQE_SEND_OPCODE(*cqe) && RQ_TYPE(*cqe) && + Q_EMPTY(wq->rq_rptr, wq->rq_wptr)) + return 0; + + return 1; +} + +void cxio_count_scqes(struct t3_cq *cq, struct t3_wq *wq, int *count) +{ + struct t3_cqe *cqe; + u32 ptr; + + *count = 0; + ptr = cq->sw_rptr; + while (!Q_EMPTY(ptr, cq->sw_wptr)) { + cqe = cq->sw_queue + (Q_PTR2IDX(ptr, cq->size_log2)); + if ((SQ_TYPE(*cqe) || + ((CQE_OPCODE(*cqe) == T3_READ_RESP) && wq->oldest_read)) && + (CQE_QPID(*cqe) == wq->qpid)) + (*count)++; + ptr++; + } + PDBG("%s cq %p count %d\n", __func__, cq, *count); +} + +void cxio_count_rcqes(struct t3_cq *cq, struct t3_wq *wq, int *count) +{ + struct t3_cqe *cqe; + u32 ptr; + + *count = 0; + PDBG("%s count zero %d\n", __func__, *count); + ptr = cq->sw_rptr; + while (!Q_EMPTY(ptr, cq->sw_wptr)) { + cqe = cq->sw_queue + (Q_PTR2IDX(ptr, cq->size_log2)); + if (RQ_TYPE(*cqe) && (CQE_OPCODE(*cqe) != T3_READ_RESP) && + (CQE_QPID(*cqe) == wq->qpid) && cqe_completes_wr(cqe, wq)) + (*count)++; + ptr++; + } + PDBG("%s cq %p count %d\n", __func__, cq, *count); +} + +static int cxio_hal_init_ctrl_cq(struct cxio_rdev *rdev_p) +{ + struct rdma_cq_setup setup; + setup.id = 0; + setup.base_addr = 0; /* NULL address */ + setup.size = 1; /* enable the CQ */ + setup.credits = 0; + + /* force SGE to redirect to RspQ and interrupt */ + setup.credit_thres = 0; + setup.ovfl_mode = 1; + return (rdev_p->t3cdev_p->ctl(rdev_p->t3cdev_p, RDMA_CQ_SETUP, &setup)); +} + +static int cxio_hal_init_ctrl_qp(struct cxio_rdev *rdev_p) +{ + int err; + u64 sge_cmd, ctx0, ctx1; + u64 base_addr; + struct t3_modify_qp_wr *wqe; + struct sk_buff *skb; + + skb = alloc_skb(sizeof(*wqe), GFP_KERNEL); + if (!skb) { + PDBG("%s alloc_skb failed\n", __func__); + return -ENOMEM; + } + err = cxio_hal_init_ctrl_cq(rdev_p); + if (err) { + PDBG("%s err %d initializing ctrl_cq\n", __func__, err); + goto err; + } + rdev_p->ctrl_qp.workq = dma_alloc_coherent( + &(rdev_p->rnic_info.pdev->dev), + (1 << T3_CTRL_QP_SIZE_LOG2) * + sizeof(union t3_wr), + &(rdev_p->ctrl_qp.dma_addr), + GFP_KERNEL); + if (!rdev_p->ctrl_qp.workq) { + PDBG("%s dma_alloc_coherent failed\n", __func__); + err = -ENOMEM; + goto err; + } + dma_unmap_addr_set(&rdev_p->ctrl_qp, mapping, + rdev_p->ctrl_qp.dma_addr); + rdev_p->ctrl_qp.doorbell = (void __iomem *)rdev_p->rnic_info.kdb_addr; + memset(rdev_p->ctrl_qp.workq, 0, + (1 << T3_CTRL_QP_SIZE_LOG2) * sizeof(union t3_wr)); + + mutex_init(&rdev_p->ctrl_qp.lock); + init_waitqueue_head(&rdev_p->ctrl_qp.waitq); + + /* update HW Ctrl QP context */ + base_addr = rdev_p->ctrl_qp.dma_addr; + base_addr >>= 12; + ctx0 = (V_EC_SIZE((1 << T3_CTRL_QP_SIZE_LOG2)) | + V_EC_BASE_LO((u32) base_addr & 0xffff)); + ctx0 <<= 32; + ctx0 |= V_EC_CREDITS(FW_WR_NUM); + base_addr >>= 16; + ctx1 = (u32) base_addr; + base_addr >>= 32; + ctx1 |= ((u64) (V_EC_BASE_HI((u32) base_addr & 0xf) | V_EC_RESPQ(0) | + V_EC_TYPE(0) | V_EC_GEN(1) | + V_EC_UP_TOKEN(T3_CTL_QP_TID) | F_EC_VALID)) << 32; + wqe = (struct t3_modify_qp_wr *) skb_put(skb, sizeof(*wqe)); + memset(wqe, 0, sizeof(*wqe)); + build_fw_riwrh((struct fw_riwrh *) wqe, T3_WR_QP_MOD, 0, 0, + T3_CTL_QP_TID, 7, T3_SOPEOP); + wqe->flags = cpu_to_be32(MODQP_WRITE_EC); + sge_cmd = (3ULL << 56) | FW_RI_SGEEC_START << 8 | 3; + wqe->sge_cmd = cpu_to_be64(sge_cmd); + wqe->ctx1 = cpu_to_be64(ctx1); + wqe->ctx0 = cpu_to_be64(ctx0); + PDBG("CtrlQP dma_addr 0x%llx workq %p size %d\n", + (unsigned long long) rdev_p->ctrl_qp.dma_addr, + rdev_p->ctrl_qp.workq, 1 << T3_CTRL_QP_SIZE_LOG2); + skb->priority = CPL_PRIORITY_CONTROL; + return iwch_cxgb3_ofld_send(rdev_p->t3cdev_p, skb); +err: + kfree_skb(skb); + return err; +} + +static int cxio_hal_destroy_ctrl_qp(struct cxio_rdev *rdev_p) +{ + dma_free_coherent(&(rdev_p->rnic_info.pdev->dev), + (1UL << T3_CTRL_QP_SIZE_LOG2) + * sizeof(union t3_wr), rdev_p->ctrl_qp.workq, + dma_unmap_addr(&rdev_p->ctrl_qp, mapping)); + return cxio_hal_clear_qp_ctx(rdev_p, T3_CTRL_QP_ID); +} + +/* write len bytes of data into addr (32B aligned address) + * If data is NULL, clear len byte of memory to zero. + * caller acquires the ctrl_qp lock before the call + */ +static int cxio_hal_ctrl_qp_write_mem(struct cxio_rdev *rdev_p, u32 addr, + u32 len, void *data) +{ + u32 i, nr_wqe, copy_len; + u8 *copy_data; + u8 wr_len, utx_len; /* length in 8 byte flit */ + enum t3_wr_flags flag; + __be64 *wqe; + u64 utx_cmd; + addr &= 0x7FFFFFF; + nr_wqe = len % 96 ? len / 96 + 1 : len / 96; /* 96B max per WQE */ + PDBG("%s wptr 0x%x rptr 0x%x len %d, nr_wqe %d data %p addr 0x%0x\n", + __func__, rdev_p->ctrl_qp.wptr, rdev_p->ctrl_qp.rptr, len, + nr_wqe, data, addr); + utx_len = 3; /* in 32B unit */ + for (i = 0; i < nr_wqe; i++) { + if (Q_FULL(rdev_p->ctrl_qp.rptr, rdev_p->ctrl_qp.wptr, + T3_CTRL_QP_SIZE_LOG2)) { + PDBG("%s ctrl_qp full wtpr 0x%0x rptr 0x%0x, " + "wait for more space i %d\n", __func__, + rdev_p->ctrl_qp.wptr, rdev_p->ctrl_qp.rptr, i); + if (wait_event_interruptible(rdev_p->ctrl_qp.waitq, + !Q_FULL(rdev_p->ctrl_qp.rptr, + rdev_p->ctrl_qp.wptr, + T3_CTRL_QP_SIZE_LOG2))) { + PDBG("%s ctrl_qp workq interrupted\n", + __func__); + return -ERESTARTSYS; + } + PDBG("%s ctrl_qp wakeup, continue posting work request " + "i %d\n", __func__, i); + } + wqe = (__be64 *)(rdev_p->ctrl_qp.workq + (rdev_p->ctrl_qp.wptr % + (1 << T3_CTRL_QP_SIZE_LOG2))); + flag = 0; + if (i == (nr_wqe - 1)) { + /* last WQE */ + flag = T3_COMPLETION_FLAG; + if (len % 32) + utx_len = len / 32 + 1; + else + utx_len = len / 32; + } + + /* + * Force a CQE to return the credit to the workq in case + * we posted more than half the max QP size of WRs + */ + if ((i != 0) && + (i % (((1 << T3_CTRL_QP_SIZE_LOG2)) >> 1) == 0)) { + flag = T3_COMPLETION_FLAG; + PDBG("%s force completion at i %d\n", __func__, i); + } + + /* build the utx mem command */ + wqe += (sizeof(struct t3_bypass_wr) >> 3); + utx_cmd = (T3_UTX_MEM_WRITE << 28) | (addr + i * 3); + utx_cmd <<= 32; + utx_cmd |= (utx_len << 28) | ((utx_len << 2) + 1); + *wqe = cpu_to_be64(utx_cmd); + wqe++; + copy_data = (u8 *) data + i * 96; + copy_len = len > 96 ? 96 : len; + + /* clear memory content if data is NULL */ + if (data) + memcpy(wqe, copy_data, copy_len); + else + memset(wqe, 0, copy_len); + if (copy_len % 32) + memset(((u8 *) wqe) + copy_len, 0, + 32 - (copy_len % 32)); + wr_len = ((sizeof(struct t3_bypass_wr)) >> 3) + 1 + + (utx_len << 2); + wqe = (__be64 *)(rdev_p->ctrl_qp.workq + (rdev_p->ctrl_qp.wptr % + (1 << T3_CTRL_QP_SIZE_LOG2))); + + /* wptr in the WRID[31:0] */ + ((union t3_wrid *)(wqe+1))->id0.low = rdev_p->ctrl_qp.wptr; + + /* + * This must be the last write with a memory barrier + * for the genbit + */ + build_fw_riwrh((struct fw_riwrh *) wqe, T3_WR_BP, flag, + Q_GENBIT(rdev_p->ctrl_qp.wptr, + T3_CTRL_QP_SIZE_LOG2), T3_CTRL_QP_ID, + wr_len, T3_SOPEOP); + if (flag == T3_COMPLETION_FLAG) + ring_doorbell(rdev_p->ctrl_qp.doorbell, T3_CTRL_QP_ID); + len -= 96; + rdev_p->ctrl_qp.wptr++; + } + return 0; +} + +/* IN: stag key, pdid, perm, zbva, to, len, page_size, pbl_size and pbl_addr + * OUT: stag index + * TBD: shared memory region support + */ +static int __cxio_tpt_op(struct cxio_rdev *rdev_p, u32 reset_tpt_entry, + u32 *stag, u8 stag_state, u32 pdid, + enum tpt_mem_type type, enum tpt_mem_perm perm, + u32 zbva, u64 to, u32 len, u8 page_size, + u32 pbl_size, u32 pbl_addr) +{ + int err; + struct tpt_entry tpt; + u32 stag_idx; + u32 wptr; + + if (cxio_fatal_error(rdev_p)) + return -EIO; + + stag_state = stag_state > 0; + stag_idx = (*stag) >> 8; + + if ((!reset_tpt_entry) && !(*stag != T3_STAG_UNSET)) { + stag_idx = cxio_hal_get_stag(rdev_p->rscp); + if (!stag_idx) + return -ENOMEM; + *stag = (stag_idx << 8) | ((*stag) & 0xFF); + } + PDBG("%s stag_state 0x%0x type 0x%0x pdid 0x%0x, stag_idx 0x%x\n", + __func__, stag_state, type, pdid, stag_idx); + + mutex_lock(&rdev_p->ctrl_qp.lock); + + /* write TPT entry */ + if (reset_tpt_entry) + memset(&tpt, 0, sizeof(tpt)); + else { + tpt.valid_stag_pdid = cpu_to_be32(F_TPT_VALID | + V_TPT_STAG_KEY((*stag) & M_TPT_STAG_KEY) | + V_TPT_STAG_STATE(stag_state) | + V_TPT_STAG_TYPE(type) | V_TPT_PDID(pdid)); + BUG_ON(page_size >= 28); + tpt.flags_pagesize_qpid = cpu_to_be32(V_TPT_PERM(perm) | + ((perm & TPT_MW_BIND) ? F_TPT_MW_BIND_ENABLE : 0) | + V_TPT_ADDR_TYPE((zbva ? TPT_ZBTO : TPT_VATO)) | + V_TPT_PAGE_SIZE(page_size)); + tpt.rsvd_pbl_addr = cpu_to_be32(V_TPT_PBL_ADDR(PBL_OFF(rdev_p, pbl_addr)>>3)); + tpt.len = cpu_to_be32(len); + tpt.va_hi = cpu_to_be32((u32) (to >> 32)); + tpt.va_low_or_fbo = cpu_to_be32((u32) (to & 0xFFFFFFFFULL)); + tpt.rsvd_bind_cnt_or_pstag = 0; + tpt.rsvd_pbl_size = cpu_to_be32(V_TPT_PBL_SIZE(pbl_size >> 2)); + } + err = cxio_hal_ctrl_qp_write_mem(rdev_p, + stag_idx + + (rdev_p->rnic_info.tpt_base >> 5), + sizeof(tpt), &tpt); + + /* release the stag index to free pool */ + if (reset_tpt_entry) + cxio_hal_put_stag(rdev_p->rscp, stag_idx); + + wptr = rdev_p->ctrl_qp.wptr; + mutex_unlock(&rdev_p->ctrl_qp.lock); + if (!err) + if (wait_event_interruptible(rdev_p->ctrl_qp.waitq, + SEQ32_GE(rdev_p->ctrl_qp.rptr, + wptr))) + return -ERESTARTSYS; + return err; +} + +int cxio_write_pbl(struct cxio_rdev *rdev_p, __be64 *pbl, + u32 pbl_addr, u32 pbl_size) +{ + u32 wptr; + int err; + + PDBG("%s *pdb_addr 0x%x, pbl_base 0x%x, pbl_size %d\n", + __func__, pbl_addr, rdev_p->rnic_info.pbl_base, + pbl_size); + + mutex_lock(&rdev_p->ctrl_qp.lock); + err = cxio_hal_ctrl_qp_write_mem(rdev_p, pbl_addr >> 5, pbl_size << 3, + pbl); + wptr = rdev_p->ctrl_qp.wptr; + mutex_unlock(&rdev_p->ctrl_qp.lock); + if (err) + return err; + + if (wait_event_interruptible(rdev_p->ctrl_qp.waitq, + SEQ32_GE(rdev_p->ctrl_qp.rptr, + wptr))) + return -ERESTARTSYS; + + return 0; +} + +int cxio_register_phys_mem(struct cxio_rdev *rdev_p, u32 *stag, u32 pdid, + enum tpt_mem_perm perm, u32 zbva, u64 to, u32 len, + u8 page_size, u32 pbl_size, u32 pbl_addr) +{ + *stag = T3_STAG_UNSET; + return __cxio_tpt_op(rdev_p, 0, stag, 1, pdid, TPT_NON_SHARED_MR, perm, + zbva, to, len, page_size, pbl_size, pbl_addr); +} + +int cxio_reregister_phys_mem(struct cxio_rdev *rdev_p, u32 *stag, u32 pdid, + enum tpt_mem_perm perm, u32 zbva, u64 to, u32 len, + u8 page_size, u32 pbl_size, u32 pbl_addr) +{ + return __cxio_tpt_op(rdev_p, 0, stag, 1, pdid, TPT_NON_SHARED_MR, perm, + zbva, to, len, page_size, pbl_size, pbl_addr); +} + +int cxio_dereg_mem(struct cxio_rdev *rdev_p, u32 stag, u32 pbl_size, + u32 pbl_addr) +{ + return __cxio_tpt_op(rdev_p, 1, &stag, 0, 0, 0, 0, 0, 0ULL, 0, 0, + pbl_size, pbl_addr); +} + +int cxio_allocate_window(struct cxio_rdev *rdev_p, u32 * stag, u32 pdid) +{ + *stag = T3_STAG_UNSET; + return __cxio_tpt_op(rdev_p, 0, stag, 0, pdid, TPT_MW, 0, 0, 0ULL, 0, 0, + 0, 0); +} + +int cxio_deallocate_window(struct cxio_rdev *rdev_p, u32 stag) +{ + return __cxio_tpt_op(rdev_p, 1, &stag, 0, 0, 0, 0, 0, 0ULL, 0, 0, + 0, 0); +} + +int cxio_allocate_stag(struct cxio_rdev *rdev_p, u32 *stag, u32 pdid, u32 pbl_size, u32 pbl_addr) +{ + *stag = T3_STAG_UNSET; + return __cxio_tpt_op(rdev_p, 0, stag, 0, pdid, TPT_NON_SHARED_MR, + 0, 0, 0ULL, 0, 0, pbl_size, pbl_addr); +} + +int cxio_rdma_init(struct cxio_rdev *rdev_p, struct t3_rdma_init_attr *attr) +{ + struct t3_rdma_init_wr *wqe; + struct sk_buff *skb = alloc_skb(sizeof(*wqe), GFP_ATOMIC); + if (!skb) + return -ENOMEM; + PDBG("%s rdev_p %p\n", __func__, rdev_p); + wqe = (struct t3_rdma_init_wr *) __skb_put(skb, sizeof(*wqe)); + wqe->wrh.op_seop_flags = cpu_to_be32(V_FW_RIWR_OP(T3_WR_INIT)); + wqe->wrh.gen_tid_len = cpu_to_be32(V_FW_RIWR_TID(attr->tid) | + V_FW_RIWR_LEN(sizeof(*wqe) >> 3)); + wqe->wrid.id1 = 0; + wqe->qpid = cpu_to_be32(attr->qpid); + wqe->pdid = cpu_to_be32(attr->pdid); + wqe->scqid = cpu_to_be32(attr->scqid); + wqe->rcqid = cpu_to_be32(attr->rcqid); + wqe->rq_addr = cpu_to_be32(attr->rq_addr - rdev_p->rnic_info.rqt_base); + wqe->rq_size = cpu_to_be32(attr->rq_size); + wqe->mpaattrs = attr->mpaattrs; + wqe->qpcaps = attr->qpcaps; + wqe->ulpdu_size = cpu_to_be16(attr->tcp_emss); + wqe->rqe_count = cpu_to_be16(attr->rqe_count); + wqe->flags_rtr_type = cpu_to_be16(attr->flags | + V_RTR_TYPE(attr->rtr_type) | + V_CHAN(attr->chan)); + wqe->ord = cpu_to_be32(attr->ord); + wqe->ird = cpu_to_be32(attr->ird); + wqe->qp_dma_addr = cpu_to_be64(attr->qp_dma_addr); + wqe->qp_dma_size = cpu_to_be32(attr->qp_dma_size); + wqe->irs = cpu_to_be32(attr->irs); + skb->priority = 0; /* 0=>ToeQ; 1=>CtrlQ */ + return iwch_cxgb3_ofld_send(rdev_p->t3cdev_p, skb); +} + +void cxio_register_ev_cb(cxio_hal_ev_callback_func_t ev_cb) +{ + cxio_ev_cb = ev_cb; +} + +void cxio_unregister_ev_cb(cxio_hal_ev_callback_func_t ev_cb) +{ + cxio_ev_cb = NULL; +} + +static int cxio_hal_ev_handler(struct t3cdev *t3cdev_p, struct sk_buff *skb) +{ + static int cnt; + struct cxio_rdev *rdev_p = NULL; + struct respQ_msg_t *rsp_msg = (struct respQ_msg_t *) skb->data; + PDBG("%d: %s cq_id 0x%x cq_ptr 0x%x genbit %0x overflow %0x an %0x" + " se %0x notify %0x cqbranch %0x creditth %0x\n", + cnt, __func__, RSPQ_CQID(rsp_msg), RSPQ_CQPTR(rsp_msg), + RSPQ_GENBIT(rsp_msg), RSPQ_OVERFLOW(rsp_msg), RSPQ_AN(rsp_msg), + RSPQ_SE(rsp_msg), RSPQ_NOTIFY(rsp_msg), RSPQ_CQBRANCH(rsp_msg), + RSPQ_CREDIT_THRESH(rsp_msg)); + PDBG("CQE: QPID 0x%0x genbit %0x type 0x%0x status 0x%0x opcode %d " + "len 0x%0x wrid_hi_stag 0x%x wrid_low_msn 0x%x\n", + CQE_QPID(rsp_msg->cqe), CQE_GENBIT(rsp_msg->cqe), + CQE_TYPE(rsp_msg->cqe), CQE_STATUS(rsp_msg->cqe), + CQE_OPCODE(rsp_msg->cqe), CQE_LEN(rsp_msg->cqe), + CQE_WRID_HI(rsp_msg->cqe), CQE_WRID_LOW(rsp_msg->cqe)); + rdev_p = (struct cxio_rdev *)t3cdev_p->ulp; + if (!rdev_p) { + PDBG("%s called by t3cdev %p with null ulp\n", __func__, + t3cdev_p); + return 0; + } + if (CQE_QPID(rsp_msg->cqe) == T3_CTRL_QP_ID) { + rdev_p->ctrl_qp.rptr = CQE_WRID_LOW(rsp_msg->cqe) + 1; + wake_up_interruptible(&rdev_p->ctrl_qp.waitq); + dev_kfree_skb_irq(skb); + } else if (CQE_QPID(rsp_msg->cqe) == 0xfff8) + dev_kfree_skb_irq(skb); + else if (cxio_ev_cb) + (*cxio_ev_cb) (rdev_p, skb); + else + dev_kfree_skb_irq(skb); + cnt++; + return 0; +} + +/* Caller takes care of locking if needed */ +int cxio_rdev_open(struct cxio_rdev *rdev_p) +{ + struct net_device *netdev_p = NULL; + int err = 0; + if (strlen(rdev_p->dev_name)) { + if (cxio_hal_find_rdev_by_name(rdev_p->dev_name)) { + return -EBUSY; + } + netdev_p = dev_get_by_name(&init_net, rdev_p->dev_name); + if (!netdev_p) { + return -EINVAL; + } + dev_put(netdev_p); + } else if (rdev_p->t3cdev_p) { + if (cxio_hal_find_rdev_by_t3cdev(rdev_p->t3cdev_p)) { + return -EBUSY; + } + netdev_p = rdev_p->t3cdev_p->lldev; + strncpy(rdev_p->dev_name, rdev_p->t3cdev_p->name, + T3_MAX_DEV_NAME_LEN); + } else { + PDBG("%s t3cdev_p or dev_name must be set\n", __func__); + return -EINVAL; + } + + list_add_tail(&rdev_p->entry, &rdev_list); + + PDBG("%s opening rnic dev %s\n", __func__, rdev_p->dev_name); + memset(&rdev_p->ctrl_qp, 0, sizeof(rdev_p->ctrl_qp)); + if (!rdev_p->t3cdev_p) + rdev_p->t3cdev_p = dev2t3cdev(netdev_p); + rdev_p->t3cdev_p->ulp = (void *) rdev_p; + + err = rdev_p->t3cdev_p->ctl(rdev_p->t3cdev_p, GET_EMBEDDED_INFO, + &(rdev_p->fw_info)); + if (err) { + printk(KERN_ERR "%s t3cdev_p(%p)->ctl returned error %d.\n", + __func__, rdev_p->t3cdev_p, err); + goto err1; + } + if (G_FW_VERSION_MAJOR(rdev_p->fw_info.fw_vers) != CXIO_FW_MAJ) { + printk(KERN_ERR MOD "fatal firmware version mismatch: " + "need version %u but adapter has version %u\n", + CXIO_FW_MAJ, + G_FW_VERSION_MAJOR(rdev_p->fw_info.fw_vers)); + err = -EINVAL; + goto err1; + } + + err = rdev_p->t3cdev_p->ctl(rdev_p->t3cdev_p, RDMA_GET_PARAMS, + &(rdev_p->rnic_info)); + if (err) { + printk(KERN_ERR "%s t3cdev_p(%p)->ctl returned error %d.\n", + __func__, rdev_p->t3cdev_p, err); + goto err1; + } + err = rdev_p->t3cdev_p->ctl(rdev_p->t3cdev_p, GET_PORTS, + &(rdev_p->port_info)); + if (err) { + printk(KERN_ERR "%s t3cdev_p(%p)->ctl returned error %d.\n", + __func__, rdev_p->t3cdev_p, err); + goto err1; + } + + /* + * qpshift is the number of bits to shift the qpid left in order + * to get the correct address of the doorbell for that qp. + */ + cxio_init_ucontext(rdev_p, &rdev_p->uctx); + rdev_p->qpshift = PAGE_SHIFT - + ilog2(65536 >> + ilog2(rdev_p->rnic_info.udbell_len >> + PAGE_SHIFT)); + rdev_p->qpnr = rdev_p->rnic_info.udbell_len >> PAGE_SHIFT; + rdev_p->qpmask = (65536 >> ilog2(rdev_p->qpnr)) - 1; + PDBG("%s rnic %s info: tpt_base 0x%0x tpt_top 0x%0x num stags %d " + "pbl_base 0x%0x pbl_top 0x%0x rqt_base 0x%0x, rqt_top 0x%0x\n", + __func__, rdev_p->dev_name, rdev_p->rnic_info.tpt_base, + rdev_p->rnic_info.tpt_top, cxio_num_stags(rdev_p), + rdev_p->rnic_info.pbl_base, + rdev_p->rnic_info.pbl_top, rdev_p->rnic_info.rqt_base, + rdev_p->rnic_info.rqt_top); + PDBG("udbell_len 0x%0x udbell_physbase 0x%lx kdb_addr %p qpshift %lu " + "qpnr %d qpmask 0x%x\n", + rdev_p->rnic_info.udbell_len, + rdev_p->rnic_info.udbell_physbase, rdev_p->rnic_info.kdb_addr, + rdev_p->qpshift, rdev_p->qpnr, rdev_p->qpmask); + + err = cxio_hal_init_ctrl_qp(rdev_p); + if (err) { + printk(KERN_ERR "%s error %d initializing ctrl_qp.\n", + __func__, err); + goto err1; + } + err = cxio_hal_init_resource(rdev_p, cxio_num_stags(rdev_p), 0, + 0, T3_MAX_NUM_QP, T3_MAX_NUM_CQ, + T3_MAX_NUM_PD); + if (err) { + printk(KERN_ERR "%s error %d initializing hal resources.\n", + __func__, err); + goto err2; + } + err = cxio_hal_pblpool_create(rdev_p); + if (err) { + printk(KERN_ERR "%s error %d initializing pbl mem pool.\n", + __func__, err); + goto err3; + } + err = cxio_hal_rqtpool_create(rdev_p); + if (err) { + printk(KERN_ERR "%s error %d initializing rqt mem pool.\n", + __func__, err); + goto err4; + } + return 0; +err4: + cxio_hal_pblpool_destroy(rdev_p); +err3: + cxio_hal_destroy_resource(rdev_p->rscp); +err2: + cxio_hal_destroy_ctrl_qp(rdev_p); +err1: + rdev_p->t3cdev_p->ulp = NULL; + list_del(&rdev_p->entry); + return err; +} + +void cxio_rdev_close(struct cxio_rdev *rdev_p) +{ + if (rdev_p) { + cxio_hal_pblpool_destroy(rdev_p); + cxio_hal_rqtpool_destroy(rdev_p); + list_del(&rdev_p->entry); + cxio_hal_destroy_ctrl_qp(rdev_p); + cxio_hal_destroy_resource(rdev_p->rscp); + rdev_p->t3cdev_p->ulp = NULL; + } +} + +int __init cxio_hal_init(void) +{ + if (cxio_hal_init_rhdl_resource(T3_MAX_NUM_RI)) + return -ENOMEM; + t3_register_cpl_handler(CPL_ASYNC_NOTIF, cxio_hal_ev_handler); + return 0; +} + +void __exit cxio_hal_exit(void) +{ + struct cxio_rdev *rdev, *tmp; + + t3_register_cpl_handler(CPL_ASYNC_NOTIF, NULL); + list_for_each_entry_safe(rdev, tmp, &rdev_list, entry) + cxio_rdev_close(rdev); + cxio_hal_destroy_rhdl_resource(); +} + +static void flush_completed_wrs(struct t3_wq *wq, struct t3_cq *cq) +{ + struct t3_swsq *sqp; + __u32 ptr = wq->sq_rptr; + int count = Q_COUNT(wq->sq_rptr, wq->sq_wptr); + + sqp = wq->sq + Q_PTR2IDX(ptr, wq->sq_size_log2); + while (count--) + if (!sqp->signaled) { + ptr++; + sqp = wq->sq + Q_PTR2IDX(ptr, wq->sq_size_log2); + } else if (sqp->complete) { + + /* + * Insert this completed cqe into the swcq. + */ + PDBG("%s moving cqe into swcq sq idx %ld cq idx %ld\n", + __func__, Q_PTR2IDX(ptr, wq->sq_size_log2), + Q_PTR2IDX(cq->sw_wptr, cq->size_log2)); + sqp->cqe.header |= htonl(V_CQE_SWCQE(1)); + *(cq->sw_queue + Q_PTR2IDX(cq->sw_wptr, cq->size_log2)) + = sqp->cqe; + cq->sw_wptr++; + sqp->signaled = 0; + break; + } else + break; +} + +static void create_read_req_cqe(struct t3_wq *wq, struct t3_cqe *hw_cqe, + struct t3_cqe *read_cqe) +{ + read_cqe->u.scqe.wrid_hi = wq->oldest_read->sq_wptr; + read_cqe->len = wq->oldest_read->read_len; + read_cqe->header = htonl(V_CQE_QPID(CQE_QPID(*hw_cqe)) | + V_CQE_SWCQE(SW_CQE(*hw_cqe)) | + V_CQE_OPCODE(T3_READ_REQ) | + V_CQE_TYPE(1)); +} + +/* + * Return a ptr to the next read wr in the SWSQ or NULL. + */ +static void advance_oldest_read(struct t3_wq *wq) +{ + + u32 rptr = wq->oldest_read - wq->sq + 1; + u32 wptr = Q_PTR2IDX(wq->sq_wptr, wq->sq_size_log2); + + while (Q_PTR2IDX(rptr, wq->sq_size_log2) != wptr) { + wq->oldest_read = wq->sq + Q_PTR2IDX(rptr, wq->sq_size_log2); + + if (wq->oldest_read->opcode == T3_READ_REQ) + return; + rptr++; + } + wq->oldest_read = NULL; +} + +/* + * cxio_poll_cq + * + * Caller must: + * check the validity of the first CQE, + * supply the wq assicated with the qpid. + * + * credit: cq credit to return to sge. + * cqe_flushed: 1 iff the CQE is flushed. + * cqe: copy of the polled CQE. + * + * return value: + * 0 CQE returned, + * -1 CQE skipped, try again. + */ +int cxio_poll_cq(struct t3_wq *wq, struct t3_cq *cq, struct t3_cqe *cqe, + u8 *cqe_flushed, u64 *cookie, u32 *credit) +{ + int ret = 0; + struct t3_cqe *hw_cqe, read_cqe; + + *cqe_flushed = 0; + *credit = 0; + hw_cqe = cxio_next_cqe(cq); + + PDBG("%s CQE OOO %d qpid 0x%0x genbit %d type %d status 0x%0x" + " opcode 0x%0x len 0x%0x wrid_hi_stag 0x%x wrid_low_msn 0x%x\n", + __func__, CQE_OOO(*hw_cqe), CQE_QPID(*hw_cqe), + CQE_GENBIT(*hw_cqe), CQE_TYPE(*hw_cqe), CQE_STATUS(*hw_cqe), + CQE_OPCODE(*hw_cqe), CQE_LEN(*hw_cqe), CQE_WRID_HI(*hw_cqe), + CQE_WRID_LOW(*hw_cqe)); + + /* + * skip cqe's not affiliated with a QP. + */ + if (wq == NULL) { + ret = -1; + goto skip_cqe; + } + + /* + * Gotta tweak READ completions: + * 1) the cqe doesn't contain the sq_wptr from the wr. + * 2) opcode not reflected from the wr. + * 3) read_len not reflected from the wr. + * 4) cq_type is RQ_TYPE not SQ_TYPE. + */ + if (RQ_TYPE(*hw_cqe) && (CQE_OPCODE(*hw_cqe) == T3_READ_RESP)) { + + /* + * If this is an unsolicited read response, then the read + * was generated by the kernel driver as part of peer-2-peer + * connection setup. So ignore the completion. + */ + if (!wq->oldest_read) { + if (CQE_STATUS(*hw_cqe)) + wq->error = 1; + ret = -1; + goto skip_cqe; + } + + /* + * Don't write to the HWCQ, so create a new read req CQE + * in local memory. + */ + create_read_req_cqe(wq, hw_cqe, &read_cqe); + hw_cqe = &read_cqe; + advance_oldest_read(wq); + } + + /* + * T3A: Discard TERMINATE CQEs. + */ + if (CQE_OPCODE(*hw_cqe) == T3_TERMINATE) { + ret = -1; + wq->error = 1; + goto skip_cqe; + } + + if (CQE_STATUS(*hw_cqe) || wq->error) { + *cqe_flushed = wq->error; + wq->error = 1; + + /* + * T3A inserts errors into the CQE. We cannot return + * these as work completions. + */ + /* incoming write failures */ + if ((CQE_OPCODE(*hw_cqe) == T3_RDMA_WRITE) + && RQ_TYPE(*hw_cqe)) { + ret = -1; + goto skip_cqe; + } + /* incoming read request failures */ + if ((CQE_OPCODE(*hw_cqe) == T3_READ_RESP) && SQ_TYPE(*hw_cqe)) { + ret = -1; + goto skip_cqe; + } + + /* incoming SEND with no receive posted failures */ + if (CQE_SEND_OPCODE(*hw_cqe) && RQ_TYPE(*hw_cqe) && + Q_EMPTY(wq->rq_rptr, wq->rq_wptr)) { + ret = -1; + goto skip_cqe; + } + BUG_ON((*cqe_flushed == 0) && !SW_CQE(*hw_cqe)); + goto proc_cqe; + } + + /* + * RECV completion. + */ + if (RQ_TYPE(*hw_cqe)) { + + /* + * HW only validates 4 bits of MSN. So we must validate that + * the MSN in the SEND is the next expected MSN. If its not, + * then we complete this with TPT_ERR_MSN and mark the wq in + * error. + */ + + if (Q_EMPTY(wq->rq_rptr, wq->rq_wptr)) { + wq->error = 1; + ret = -1; + goto skip_cqe; + } + + if (unlikely((CQE_WRID_MSN(*hw_cqe) != (wq->rq_rptr + 1)))) { + wq->error = 1; + hw_cqe->header |= htonl(V_CQE_STATUS(TPT_ERR_MSN)); + goto proc_cqe; + } + goto proc_cqe; + } + + /* + * If we get here its a send completion. + * + * Handle out of order completion. These get stuffed + * in the SW SQ. Then the SW SQ is walked to move any + * now in-order completions into the SW CQ. This handles + * 2 cases: + * 1) reaping unsignaled WRs when the first subsequent + * signaled WR is completed. + * 2) out of order read completions. + */ + if (!SW_CQE(*hw_cqe) && (CQE_WRID_SQ_WPTR(*hw_cqe) != wq->sq_rptr)) { + struct t3_swsq *sqp; + + PDBG("%s out of order completion going in swsq at idx %ld\n", + __func__, + Q_PTR2IDX(CQE_WRID_SQ_WPTR(*hw_cqe), wq->sq_size_log2)); + sqp = wq->sq + + Q_PTR2IDX(CQE_WRID_SQ_WPTR(*hw_cqe), wq->sq_size_log2); + sqp->cqe = *hw_cqe; + sqp->complete = 1; + ret = -1; + goto flush_wq; + } + +proc_cqe: + *cqe = *hw_cqe; + + /* + * Reap the associated WR(s) that are freed up with this + * completion. + */ + if (SQ_TYPE(*hw_cqe)) { + wq->sq_rptr = CQE_WRID_SQ_WPTR(*hw_cqe); + PDBG("%s completing sq idx %ld\n", __func__, + Q_PTR2IDX(wq->sq_rptr, wq->sq_size_log2)); + *cookie = wq->sq[Q_PTR2IDX(wq->sq_rptr, wq->sq_size_log2)].wr_id; + wq->sq_rptr++; + } else { + PDBG("%s completing rq idx %ld\n", __func__, + Q_PTR2IDX(wq->rq_rptr, wq->rq_size_log2)); + *cookie = wq->rq[Q_PTR2IDX(wq->rq_rptr, wq->rq_size_log2)].wr_id; + if (wq->rq[Q_PTR2IDX(wq->rq_rptr, wq->rq_size_log2)].pbl_addr) + cxio_hal_pblpool_free(wq->rdev, + wq->rq[Q_PTR2IDX(wq->rq_rptr, + wq->rq_size_log2)].pbl_addr, T3_STAG0_PBL_SIZE); + BUG_ON(Q_EMPTY(wq->rq_rptr, wq->rq_wptr)); + wq->rq_rptr++; + } + +flush_wq: + /* + * Flush any completed cqes that are now in-order. + */ + flush_completed_wrs(wq, cq); + +skip_cqe: + if (SW_CQE(*hw_cqe)) { + PDBG("%s cq %p cqid 0x%x skip sw cqe sw_rptr 0x%x\n", + __func__, cq, cq->cqid, cq->sw_rptr); + ++cq->sw_rptr; + } else { + PDBG("%s cq %p cqid 0x%x skip hw cqe rptr 0x%x\n", + __func__, cq, cq->cqid, cq->rptr); + ++cq->rptr; + + /* + * T3A: compute credits. + */ + if (((cq->rptr - cq->wptr) > (1 << (cq->size_log2 - 1))) + || ((cq->rptr - cq->wptr) >= 128)) { + *credit = cq->rptr - cq->wptr; + cq->wptr = cq->rptr; + } + } + return ret; +} diff --git a/drivers/infiniband/hw/cxgb3/cxio_hal.h b/drivers/infiniband/hw/cxgb3/cxio_hal.h new file mode 100644 index 0000000..78fbe9f --- /dev/null +++ b/drivers/infiniband/hw/cxgb3/cxio_hal.h @@ -0,0 +1,211 @@ +/* + * Copyright (c) 2006 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __CXIO_HAL_H__ +#define __CXIO_HAL_H__ + +#include +#include +#include + +#include "t3_cpl.h" +#include "t3cdev.h" +#include "cxgb3_ctl_defs.h" +#include "cxio_wr.h" + +#define T3_CTRL_QP_ID FW_RI_SGEEC_START +#define T3_CTL_QP_TID FW_RI_TID_START +#define T3_CTRL_QP_SIZE_LOG2 8 +#define T3_CTRL_CQ_ID 0 + +#define T3_MAX_NUM_RI (1<<15) +#define T3_MAX_NUM_QP (1<<15) +#define T3_MAX_NUM_CQ (1<<15) +#define T3_MAX_NUM_PD (1<<15) +#define T3_MAX_PBL_SIZE 256 +#define T3_MAX_RQ_SIZE 1024 +#define T3_MAX_QP_DEPTH (T3_MAX_RQ_SIZE-1) +#define T3_MAX_CQ_DEPTH 65536 +#define T3_MAX_NUM_STAG (1<<15) +#define T3_MAX_MR_SIZE 0x100000000ULL +#define T3_PAGESIZE_MASK 0xffff000 /* 4KB-128MB */ + +#define T3_STAG_UNSET 0xffffffff + +#define T3_MAX_DEV_NAME_LEN 32 + +#define CXIO_FW_MAJ 7 + +struct cxio_hal_ctrl_qp { + u32 wptr; + u32 rptr; + struct mutex lock; /* for the wtpr, can sleep */ + wait_queue_head_t waitq;/* wait for RspQ/CQE msg */ + union t3_wr *workq; /* the work request queue */ + dma_addr_t dma_addr; /* pci bus address of the workq */ + DEFINE_DMA_UNMAP_ADDR(mapping); + void __iomem *doorbell; +}; + +struct cxio_hal_resource { + struct kfifo tpt_fifo; + spinlock_t tpt_fifo_lock; + struct kfifo qpid_fifo; + spinlock_t qpid_fifo_lock; + struct kfifo cqid_fifo; + spinlock_t cqid_fifo_lock; + struct kfifo pdid_fifo; + spinlock_t pdid_fifo_lock; +}; + +struct cxio_qpid_list { + struct list_head entry; + u32 qpid; +}; + +struct cxio_ucontext { + struct list_head qpids; + struct mutex lock; +}; + +struct cxio_rdev { + char dev_name[T3_MAX_DEV_NAME_LEN]; + struct t3cdev *t3cdev_p; + struct rdma_info rnic_info; + struct adap_ports port_info; + struct cxio_hal_resource *rscp; + struct cxio_hal_ctrl_qp ctrl_qp; + void *ulp; + unsigned long qpshift; + u32 qpnr; + u32 qpmask; + struct cxio_ucontext uctx; + struct gen_pool *pbl_pool; + struct gen_pool *rqt_pool; + struct list_head entry; + struct ch_embedded_info fw_info; + u32 flags; +#define CXIO_ERROR_FATAL 1 +}; + +static inline int cxio_fatal_error(struct cxio_rdev *rdev_p) +{ + return rdev_p->flags & CXIO_ERROR_FATAL; +} + +static inline int cxio_num_stags(struct cxio_rdev *rdev_p) +{ + return min((int)T3_MAX_NUM_STAG, (int)((rdev_p->rnic_info.tpt_top - rdev_p->rnic_info.tpt_base) >> 5)); +} + +typedef void (*cxio_hal_ev_callback_func_t) (struct cxio_rdev * rdev_p, + struct sk_buff * skb); + +#define RSPQ_CQID(rsp) (be32_to_cpu(rsp->cq_ptrid) & 0xffff) +#define RSPQ_CQPTR(rsp) ((be32_to_cpu(rsp->cq_ptrid) >> 16) & 0xffff) +#define RSPQ_GENBIT(rsp) ((be32_to_cpu(rsp->flags) >> 16) & 1) +#define RSPQ_OVERFLOW(rsp) ((be32_to_cpu(rsp->flags) >> 17) & 1) +#define RSPQ_AN(rsp) ((be32_to_cpu(rsp->flags) >> 18) & 1) +#define RSPQ_SE(rsp) ((be32_to_cpu(rsp->flags) >> 19) & 1) +#define RSPQ_NOTIFY(rsp) ((be32_to_cpu(rsp->flags) >> 20) & 1) +#define RSPQ_CQBRANCH(rsp) ((be32_to_cpu(rsp->flags) >> 21) & 1) +#define RSPQ_CREDIT_THRESH(rsp) ((be32_to_cpu(rsp->flags) >> 22) & 1) + +struct respQ_msg_t { + __be32 flags; /* flit 0 */ + __be32 cq_ptrid; + __be64 rsvd; /* flit 1 */ + struct t3_cqe cqe; /* flits 2-3 */ +}; + +enum t3_cq_opcode { + CQ_ARM_AN = 0x2, + CQ_ARM_SE = 0x6, + CQ_FORCE_AN = 0x3, + CQ_CREDIT_UPDATE = 0x7 +}; + +int cxio_rdev_open(struct cxio_rdev *rdev); +void cxio_rdev_close(struct cxio_rdev *rdev); +int cxio_hal_cq_op(struct cxio_rdev *rdev, struct t3_cq *cq, + enum t3_cq_opcode op, u32 credit); +int cxio_create_cq(struct cxio_rdev *rdev, struct t3_cq *cq, int kernel); +int cxio_destroy_cq(struct cxio_rdev *rdev, struct t3_cq *cq); +int cxio_resize_cq(struct cxio_rdev *rdev, struct t3_cq *cq); +void cxio_release_ucontext(struct cxio_rdev *rdev, struct cxio_ucontext *uctx); +void cxio_init_ucontext(struct cxio_rdev *rdev, struct cxio_ucontext *uctx); +int cxio_create_qp(struct cxio_rdev *rdev, u32 kernel_domain, struct t3_wq *wq, + struct cxio_ucontext *uctx); +int cxio_destroy_qp(struct cxio_rdev *rdev, struct t3_wq *wq, + struct cxio_ucontext *uctx); +int cxio_peek_cq(struct t3_wq *wr, struct t3_cq *cq, int opcode); +int cxio_write_pbl(struct cxio_rdev *rdev_p, __be64 *pbl, + u32 pbl_addr, u32 pbl_size); +int cxio_register_phys_mem(struct cxio_rdev *rdev, u32 * stag, u32 pdid, + enum tpt_mem_perm perm, u32 zbva, u64 to, u32 len, + u8 page_size, u32 pbl_size, u32 pbl_addr); +int cxio_reregister_phys_mem(struct cxio_rdev *rdev, u32 * stag, u32 pdid, + enum tpt_mem_perm perm, u32 zbva, u64 to, u32 len, + u8 page_size, u32 pbl_size, u32 pbl_addr); +int cxio_dereg_mem(struct cxio_rdev *rdev, u32 stag, u32 pbl_size, + u32 pbl_addr); +int cxio_allocate_window(struct cxio_rdev *rdev, u32 * stag, u32 pdid); +int cxio_allocate_stag(struct cxio_rdev *rdev, u32 *stag, u32 pdid, u32 pbl_size, u32 pbl_addr); +int cxio_deallocate_window(struct cxio_rdev *rdev, u32 stag); +int cxio_rdma_init(struct cxio_rdev *rdev, struct t3_rdma_init_attr *attr); +void cxio_register_ev_cb(cxio_hal_ev_callback_func_t ev_cb); +void cxio_unregister_ev_cb(cxio_hal_ev_callback_func_t ev_cb); +u32 cxio_hal_get_pdid(struct cxio_hal_resource *rscp); +void cxio_hal_put_pdid(struct cxio_hal_resource *rscp, u32 pdid); +int __init cxio_hal_init(void); +void __exit cxio_hal_exit(void); +int cxio_flush_rq(struct t3_wq *wq, struct t3_cq *cq, int count); +int cxio_flush_sq(struct t3_wq *wq, struct t3_cq *cq, int count); +void cxio_count_rcqes(struct t3_cq *cq, struct t3_wq *wq, int *count); +void cxio_count_scqes(struct t3_cq *cq, struct t3_wq *wq, int *count); +void cxio_flush_hw_cq(struct t3_cq *cq); +int cxio_poll_cq(struct t3_wq *wq, struct t3_cq *cq, struct t3_cqe *cqe, + u8 *cqe_flushed, u64 *cookie, u32 *credit); +int iwch_cxgb3_ofld_send(struct t3cdev *tdev, struct sk_buff *skb); + +#define MOD "iw_cxgb3: " +#define PDBG(fmt, args...) pr_debug(MOD fmt, ## args) + +#ifdef DEBUG +void cxio_dump_tpt(struct cxio_rdev *rev, u32 stag); +void cxio_dump_pbl(struct cxio_rdev *rev, u32 pbl_addr, uint len, u8 shift); +void cxio_dump_wqe(union t3_wr *wqe); +void cxio_dump_wce(struct t3_cqe *wce); +void cxio_dump_rqt(struct cxio_rdev *rdev, u32 hwtid, int nents); +void cxio_dump_tcb(struct cxio_rdev *rdev, u32 hwtid); +#endif + +#endif diff --git a/drivers/infiniband/hw/cxgb3/cxio_resource.c b/drivers/infiniband/hw/cxgb3/cxio_resource.c new file mode 100644 index 0000000..c40088e --- /dev/null +++ b/drivers/infiniband/hw/cxgb3/cxio_resource.c @@ -0,0 +1,343 @@ +/* + * Copyright (c) 2006 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +/* Crude resource management */ +#include +#include +#include +#include +#include +#include +#include "cxio_resource.h" +#include "cxio_hal.h" + +static struct kfifo rhdl_fifo; +static spinlock_t rhdl_fifo_lock; + +#define RANDOM_SIZE 16 + +static int __cxio_init_resource_fifo(struct kfifo *fifo, + spinlock_t *fifo_lock, + u32 nr, u32 skip_low, + u32 skip_high, + int random) +{ + u32 i, j, entry = 0, idx; + u32 random_bytes; + u32 rarray[16]; + spin_lock_init(fifo_lock); + + if (kfifo_alloc(fifo, nr * sizeof(u32), GFP_KERNEL)) + return -ENOMEM; + + for (i = 0; i < skip_low + skip_high; i++) + kfifo_in(fifo, (unsigned char *) &entry, sizeof(u32)); + if (random) { + j = 0; + random_bytes = prandom_u32(); + for (i = 0; i < RANDOM_SIZE; i++) + rarray[i] = i + skip_low; + for (i = skip_low + RANDOM_SIZE; i < nr - skip_high; i++) { + if (j >= RANDOM_SIZE) { + j = 0; + random_bytes = prandom_u32(); + } + idx = (random_bytes >> (j * 2)) & 0xF; + kfifo_in(fifo, + (unsigned char *) &rarray[idx], + sizeof(u32)); + rarray[idx] = i; + j++; + } + for (i = 0; i < RANDOM_SIZE; i++) + kfifo_in(fifo, + (unsigned char *) &rarray[i], + sizeof(u32)); + } else + for (i = skip_low; i < nr - skip_high; i++) + kfifo_in(fifo, (unsigned char *) &i, sizeof(u32)); + + for (i = 0; i < skip_low + skip_high; i++) + if (kfifo_out_locked(fifo, (unsigned char *) &entry, + sizeof(u32), fifo_lock) != sizeof(u32)) + break; + return 0; +} + +static int cxio_init_resource_fifo(struct kfifo *fifo, spinlock_t * fifo_lock, + u32 nr, u32 skip_low, u32 skip_high) +{ + return (__cxio_init_resource_fifo(fifo, fifo_lock, nr, skip_low, + skip_high, 0)); +} + +static int cxio_init_resource_fifo_random(struct kfifo *fifo, + spinlock_t * fifo_lock, + u32 nr, u32 skip_low, u32 skip_high) +{ + + return (__cxio_init_resource_fifo(fifo, fifo_lock, nr, skip_low, + skip_high, 1)); +} + +static int cxio_init_qpid_fifo(struct cxio_rdev *rdev_p) +{ + u32 i; + + spin_lock_init(&rdev_p->rscp->qpid_fifo_lock); + + if (kfifo_alloc(&rdev_p->rscp->qpid_fifo, T3_MAX_NUM_QP * sizeof(u32), + GFP_KERNEL)) + return -ENOMEM; + + for (i = 16; i < T3_MAX_NUM_QP; i++) + if (!(i & rdev_p->qpmask)) + kfifo_in(&rdev_p->rscp->qpid_fifo, + (unsigned char *) &i, sizeof(u32)); + return 0; +} + +int cxio_hal_init_rhdl_resource(u32 nr_rhdl) +{ + return cxio_init_resource_fifo(&rhdl_fifo, &rhdl_fifo_lock, nr_rhdl, 1, + 0); +} + +void cxio_hal_destroy_rhdl_resource(void) +{ + kfifo_free(&rhdl_fifo); +} + +/* nr_* must be power of 2 */ +int cxio_hal_init_resource(struct cxio_rdev *rdev_p, + u32 nr_tpt, u32 nr_pbl, + u32 nr_rqt, u32 nr_qpid, u32 nr_cqid, u32 nr_pdid) +{ + int err = 0; + struct cxio_hal_resource *rscp; + + rscp = kmalloc(sizeof(*rscp), GFP_KERNEL); + if (!rscp) + return -ENOMEM; + rdev_p->rscp = rscp; + err = cxio_init_resource_fifo_random(&rscp->tpt_fifo, + &rscp->tpt_fifo_lock, + nr_tpt, 1, 0); + if (err) + goto tpt_err; + err = cxio_init_qpid_fifo(rdev_p); + if (err) + goto qpid_err; + err = cxio_init_resource_fifo(&rscp->cqid_fifo, &rscp->cqid_fifo_lock, + nr_cqid, 1, 0); + if (err) + goto cqid_err; + err = cxio_init_resource_fifo(&rscp->pdid_fifo, &rscp->pdid_fifo_lock, + nr_pdid, 1, 0); + if (err) + goto pdid_err; + return 0; +pdid_err: + kfifo_free(&rscp->cqid_fifo); +cqid_err: + kfifo_free(&rscp->qpid_fifo); +qpid_err: + kfifo_free(&rscp->tpt_fifo); +tpt_err: + return -ENOMEM; +} + +/* + * returns 0 if no resource available + */ +static u32 cxio_hal_get_resource(struct kfifo *fifo, spinlock_t * lock) +{ + u32 entry; + if (kfifo_out_locked(fifo, (unsigned char *) &entry, sizeof(u32), lock)) + return entry; + else + return 0; /* fifo emptry */ +} + +static void cxio_hal_put_resource(struct kfifo *fifo, spinlock_t * lock, + u32 entry) +{ + BUG_ON( + kfifo_in_locked(fifo, (unsigned char *) &entry, sizeof(u32), lock) + == 0); +} + +u32 cxio_hal_get_stag(struct cxio_hal_resource *rscp) +{ + return cxio_hal_get_resource(&rscp->tpt_fifo, &rscp->tpt_fifo_lock); +} + +void cxio_hal_put_stag(struct cxio_hal_resource *rscp, u32 stag) +{ + cxio_hal_put_resource(&rscp->tpt_fifo, &rscp->tpt_fifo_lock, stag); +} + +u32 cxio_hal_get_qpid(struct cxio_hal_resource *rscp) +{ + u32 qpid = cxio_hal_get_resource(&rscp->qpid_fifo, + &rscp->qpid_fifo_lock); + PDBG("%s qpid 0x%x\n", __func__, qpid); + return qpid; +} + +void cxio_hal_put_qpid(struct cxio_hal_resource *rscp, u32 qpid) +{ + PDBG("%s qpid 0x%x\n", __func__, qpid); + cxio_hal_put_resource(&rscp->qpid_fifo, &rscp->qpid_fifo_lock, qpid); +} + +u32 cxio_hal_get_cqid(struct cxio_hal_resource *rscp) +{ + return cxio_hal_get_resource(&rscp->cqid_fifo, &rscp->cqid_fifo_lock); +} + +void cxio_hal_put_cqid(struct cxio_hal_resource *rscp, u32 cqid) +{ + cxio_hal_put_resource(&rscp->cqid_fifo, &rscp->cqid_fifo_lock, cqid); +} + +u32 cxio_hal_get_pdid(struct cxio_hal_resource *rscp) +{ + return cxio_hal_get_resource(&rscp->pdid_fifo, &rscp->pdid_fifo_lock); +} + +void cxio_hal_put_pdid(struct cxio_hal_resource *rscp, u32 pdid) +{ + cxio_hal_put_resource(&rscp->pdid_fifo, &rscp->pdid_fifo_lock, pdid); +} + +void cxio_hal_destroy_resource(struct cxio_hal_resource *rscp) +{ + kfifo_free(&rscp->tpt_fifo); + kfifo_free(&rscp->cqid_fifo); + kfifo_free(&rscp->qpid_fifo); + kfifo_free(&rscp->pdid_fifo); + kfree(rscp); +} + +/* + * PBL Memory Manager. Uses Linux generic allocator. + */ + +#define MIN_PBL_SHIFT 8 /* 256B == min PBL size (32 entries) */ + +u32 cxio_hal_pblpool_alloc(struct cxio_rdev *rdev_p, int size) +{ + unsigned long addr = gen_pool_alloc(rdev_p->pbl_pool, size); + PDBG("%s addr 0x%x size %d\n", __func__, (u32)addr, size); + return (u32)addr; +} + +void cxio_hal_pblpool_free(struct cxio_rdev *rdev_p, u32 addr, int size) +{ + PDBG("%s addr 0x%x size %d\n", __func__, addr, size); + gen_pool_free(rdev_p->pbl_pool, (unsigned long)addr, size); +} + +int cxio_hal_pblpool_create(struct cxio_rdev *rdev_p) +{ + unsigned pbl_start, pbl_chunk; + + rdev_p->pbl_pool = gen_pool_create(MIN_PBL_SHIFT, -1); + if (!rdev_p->pbl_pool) + return -ENOMEM; + + pbl_start = rdev_p->rnic_info.pbl_base; + pbl_chunk = rdev_p->rnic_info.pbl_top - pbl_start + 1; + + while (pbl_start < rdev_p->rnic_info.pbl_top) { + pbl_chunk = min(rdev_p->rnic_info.pbl_top - pbl_start + 1, + pbl_chunk); + if (gen_pool_add(rdev_p->pbl_pool, pbl_start, pbl_chunk, -1)) { + PDBG("%s failed to add PBL chunk (%x/%x)\n", + __func__, pbl_start, pbl_chunk); + if (pbl_chunk <= 1024 << MIN_PBL_SHIFT) { + printk(KERN_WARNING MOD "%s: Failed to add all PBL chunks (%x/%x)\n", + __func__, pbl_start, rdev_p->rnic_info.pbl_top - pbl_start); + return 0; + } + pbl_chunk >>= 1; + } else { + PDBG("%s added PBL chunk (%x/%x)\n", + __func__, pbl_start, pbl_chunk); + pbl_start += pbl_chunk; + } + } + + return 0; +} + +void cxio_hal_pblpool_destroy(struct cxio_rdev *rdev_p) +{ + gen_pool_destroy(rdev_p->pbl_pool); +} + +/* + * RQT Memory Manager. Uses Linux generic allocator. + */ + +#define MIN_RQT_SHIFT 10 /* 1KB == mini RQT size (16 entries) */ +#define RQT_CHUNK 2*1024*1024 + +u32 cxio_hal_rqtpool_alloc(struct cxio_rdev *rdev_p, int size) +{ + unsigned long addr = gen_pool_alloc(rdev_p->rqt_pool, size << 6); + PDBG("%s addr 0x%x size %d\n", __func__, (u32)addr, size << 6); + return (u32)addr; +} + +void cxio_hal_rqtpool_free(struct cxio_rdev *rdev_p, u32 addr, int size) +{ + PDBG("%s addr 0x%x size %d\n", __func__, addr, size << 6); + gen_pool_free(rdev_p->rqt_pool, (unsigned long)addr, size << 6); +} + +int cxio_hal_rqtpool_create(struct cxio_rdev *rdev_p) +{ + unsigned long i; + rdev_p->rqt_pool = gen_pool_create(MIN_RQT_SHIFT, -1); + if (rdev_p->rqt_pool) + for (i = rdev_p->rnic_info.rqt_base; + i <= rdev_p->rnic_info.rqt_top - RQT_CHUNK + 1; + i += RQT_CHUNK) + gen_pool_add(rdev_p->rqt_pool, i, RQT_CHUNK, -1); + return rdev_p->rqt_pool ? 0 : -ENOMEM; +} + +void cxio_hal_rqtpool_destroy(struct cxio_rdev *rdev_p) +{ + gen_pool_destroy(rdev_p->rqt_pool); +} diff --git a/drivers/infiniband/hw/cxgb3/cxio_resource.h b/drivers/infiniband/hw/cxgb3/cxio_resource.h new file mode 100644 index 0000000..a2703a3 --- /dev/null +++ b/drivers/infiniband/hw/cxgb3/cxio_resource.h @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2006 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __CXIO_RESOURCE_H__ +#define __CXIO_RESOURCE_H__ + +#include +#include +#include +#include +#include +#include +#include +#include "cxio_hal.h" + +extern int cxio_hal_init_rhdl_resource(u32 nr_rhdl); +extern void cxio_hal_destroy_rhdl_resource(void); +extern int cxio_hal_init_resource(struct cxio_rdev *rdev_p, + u32 nr_tpt, u32 nr_pbl, + u32 nr_rqt, u32 nr_qpid, u32 nr_cqid, + u32 nr_pdid); +extern u32 cxio_hal_get_stag(struct cxio_hal_resource *rscp); +extern void cxio_hal_put_stag(struct cxio_hal_resource *rscp, u32 stag); +extern u32 cxio_hal_get_qpid(struct cxio_hal_resource *rscp); +extern void cxio_hal_put_qpid(struct cxio_hal_resource *rscp, u32 qpid); +extern u32 cxio_hal_get_cqid(struct cxio_hal_resource *rscp); +extern void cxio_hal_put_cqid(struct cxio_hal_resource *rscp, u32 cqid); +extern void cxio_hal_destroy_resource(struct cxio_hal_resource *rscp); + +#define PBL_OFF(rdev_p, a) ( (a) - (rdev_p)->rnic_info.pbl_base ) +extern int cxio_hal_pblpool_create(struct cxio_rdev *rdev_p); +extern void cxio_hal_pblpool_destroy(struct cxio_rdev *rdev_p); +extern u32 cxio_hal_pblpool_alloc(struct cxio_rdev *rdev_p, int size); +extern void cxio_hal_pblpool_free(struct cxio_rdev *rdev_p, u32 addr, int size); + +#define RQT_OFF(rdev_p, a) ( (a) - (rdev_p)->rnic_info.rqt_base ) +extern int cxio_hal_rqtpool_create(struct cxio_rdev *rdev_p); +extern void cxio_hal_rqtpool_destroy(struct cxio_rdev *rdev_p); +extern u32 cxio_hal_rqtpool_alloc(struct cxio_rdev *rdev_p, int size); +extern void cxio_hal_rqtpool_free(struct cxio_rdev *rdev_p, u32 addr, int size); +#endif diff --git a/drivers/infiniband/hw/cxgb3/cxio_wr.h b/drivers/infiniband/hw/cxgb3/cxio_wr.h new file mode 100644 index 0000000..83d2e19 --- /dev/null +++ b/drivers/infiniband/hw/cxgb3/cxio_wr.h @@ -0,0 +1,802 @@ +/* + * Copyright (c) 2006 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __CXIO_WR_H__ +#define __CXIO_WR_H__ + +#include +#include +#include +#include "firmware_exports.h" + +#define T3_MAX_SGE 4 +#define T3_MAX_INLINE 64 +#define T3_STAG0_PBL_SIZE (2 * T3_MAX_SGE << 3) +#define T3_STAG0_MAX_PBE_LEN (128 * 1024 * 1024) +#define T3_STAG0_PAGE_SHIFT 15 + +#define Q_EMPTY(rptr,wptr) ((rptr)==(wptr)) +#define Q_FULL(rptr,wptr,size_log2) ( (((wptr)-(rptr))>>(size_log2)) && \ + ((rptr)!=(wptr)) ) +#define Q_GENBIT(ptr,size_log2) (!(((ptr)>>size_log2)&0x1)) +#define Q_FREECNT(rptr,wptr,size_log2) ((1UL<> S_FW_RIWR_OP)) & M_FW_RIWR_OP) + +#define S_FW_RIWR_SOPEOP 22 +#define M_FW_RIWR_SOPEOP 0x3 +#define V_FW_RIWR_SOPEOP(x) ((x) << S_FW_RIWR_SOPEOP) + +#define S_FW_RIWR_FLAGS 8 +#define M_FW_RIWR_FLAGS 0x3fffff +#define V_FW_RIWR_FLAGS(x) ((x) << S_FW_RIWR_FLAGS) +#define G_FW_RIWR_FLAGS(x) ((((x) >> S_FW_RIWR_FLAGS)) & M_FW_RIWR_FLAGS) + +#define S_FW_RIWR_TID 8 +#define V_FW_RIWR_TID(x) ((x) << S_FW_RIWR_TID) + +#define S_FW_RIWR_LEN 0 +#define V_FW_RIWR_LEN(x) ((x) << S_FW_RIWR_LEN) + +#define S_FW_RIWR_GEN 31 +#define V_FW_RIWR_GEN(x) ((x) << S_FW_RIWR_GEN) + +struct t3_sge { + __be32 stag; + __be32 len; + __be64 to; +}; + +/* If num_sgle is zero, flit 5+ contains immediate data.*/ +struct t3_send_wr { + struct fw_riwrh wrh; /* 0 */ + union t3_wrid wrid; /* 1 */ + + u8 rdmaop; /* 2 */ + u8 reserved[3]; + __be32 rem_stag; + __be32 plen; /* 3 */ + __be32 num_sgle; + struct t3_sge sgl[T3_MAX_SGE]; /* 4+ */ +}; + +#define T3_MAX_FASTREG_DEPTH 10 +#define T3_MAX_FASTREG_FRAG 10 + +struct t3_fastreg_wr { + struct fw_riwrh wrh; /* 0 */ + union t3_wrid wrid; /* 1 */ + __be32 stag; /* 2 */ + __be32 len; + __be32 va_base_hi; /* 3 */ + __be32 va_base_lo_fbo; + __be32 page_type_perms; /* 4 */ + __be32 reserved1; + __be64 pbl_addrs[0]; /* 5+ */ +}; + +/* + * If a fastreg wr spans multiple wqes, then the 2nd fragment look like this. + */ +struct t3_pbl_frag { + struct fw_riwrh wrh; /* 0 */ + __be64 pbl_addrs[14]; /* 1..14 */ +}; + +#define S_FR_PAGE_COUNT 24 +#define M_FR_PAGE_COUNT 0xff +#define V_FR_PAGE_COUNT(x) ((x) << S_FR_PAGE_COUNT) +#define G_FR_PAGE_COUNT(x) ((((x) >> S_FR_PAGE_COUNT)) & M_FR_PAGE_COUNT) + +#define S_FR_PAGE_SIZE 16 +#define M_FR_PAGE_SIZE 0x1f +#define V_FR_PAGE_SIZE(x) ((x) << S_FR_PAGE_SIZE) +#define G_FR_PAGE_SIZE(x) ((((x) >> S_FR_PAGE_SIZE)) & M_FR_PAGE_SIZE) + +#define S_FR_TYPE 8 +#define M_FR_TYPE 0x1 +#define V_FR_TYPE(x) ((x) << S_FR_TYPE) +#define G_FR_TYPE(x) ((((x) >> S_FR_TYPE)) & M_FR_TYPE) + +#define S_FR_PERMS 0 +#define M_FR_PERMS 0xff +#define V_FR_PERMS(x) ((x) << S_FR_PERMS) +#define G_FR_PERMS(x) ((((x) >> S_FR_PERMS)) & M_FR_PERMS) + +struct t3_local_inv_wr { + struct fw_riwrh wrh; /* 0 */ + union t3_wrid wrid; /* 1 */ + __be32 stag; /* 2 */ + __be32 reserved; +}; + +struct t3_rdma_write_wr { + struct fw_riwrh wrh; /* 0 */ + union t3_wrid wrid; /* 1 */ + u8 rdmaop; /* 2 */ + u8 reserved[3]; + __be32 stag_sink; + __be64 to_sink; /* 3 */ + __be32 plen; /* 4 */ + __be32 num_sgle; + struct t3_sge sgl[T3_MAX_SGE]; /* 5+ */ +}; + +struct t3_rdma_read_wr { + struct fw_riwrh wrh; /* 0 */ + union t3_wrid wrid; /* 1 */ + u8 rdmaop; /* 2 */ + u8 local_inv; + u8 reserved[2]; + __be32 rem_stag; + __be64 rem_to; /* 3 */ + __be32 local_stag; /* 4 */ + __be32 local_len; + __be64 local_to; /* 5 */ +}; + +struct t3_bind_mw_wr { + struct fw_riwrh wrh; /* 0 */ + union t3_wrid wrid; /* 1 */ + u16 reserved; /* 2 */ + u8 type; + u8 perms; + __be32 mr_stag; + __be32 mw_stag; /* 3 */ + __be32 mw_len; + __be64 mw_va; /* 4 */ + __be32 mr_pbl_addr; /* 5 */ + u8 reserved2[3]; + u8 mr_pagesz; +}; + +struct t3_receive_wr { + struct fw_riwrh wrh; /* 0 */ + union t3_wrid wrid; /* 1 */ + u8 pagesz[T3_MAX_SGE]; + __be32 num_sgle; /* 2 */ + struct t3_sge sgl[T3_MAX_SGE]; /* 3+ */ + __be32 pbl_addr[T3_MAX_SGE]; +}; + +struct t3_bypass_wr { + struct fw_riwrh wrh; + union t3_wrid wrid; /* 1 */ +}; + +struct t3_modify_qp_wr { + struct fw_riwrh wrh; /* 0 */ + union t3_wrid wrid; /* 1 */ + __be32 flags; /* 2 */ + __be32 quiesce; /* 2 */ + __be32 max_ird; /* 3 */ + __be32 max_ord; /* 3 */ + __be64 sge_cmd; /* 4 */ + __be64 ctx1; /* 5 */ + __be64 ctx0; /* 6 */ +}; + +enum t3_modify_qp_flags { + MODQP_QUIESCE = 0x01, + MODQP_MAX_IRD = 0x02, + MODQP_MAX_ORD = 0x04, + MODQP_WRITE_EC = 0x08, + MODQP_READ_EC = 0x10, +}; + + +enum t3_mpa_attrs { + uP_RI_MPA_RX_MARKER_ENABLE = 0x1, + uP_RI_MPA_TX_MARKER_ENABLE = 0x2, + uP_RI_MPA_CRC_ENABLE = 0x4, + uP_RI_MPA_IETF_ENABLE = 0x8 +} __attribute__ ((packed)); + +enum t3_qp_caps { + uP_RI_QP_RDMA_READ_ENABLE = 0x01, + uP_RI_QP_RDMA_WRITE_ENABLE = 0x02, + uP_RI_QP_BIND_ENABLE = 0x04, + uP_RI_QP_FAST_REGISTER_ENABLE = 0x08, + uP_RI_QP_STAG0_ENABLE = 0x10 +} __attribute__ ((packed)); + +enum rdma_init_rtr_types { + RTR_READ = 1, + RTR_WRITE = 2, + RTR_SEND = 3, +}; + +#define S_RTR_TYPE 2 +#define M_RTR_TYPE 0x3 +#define V_RTR_TYPE(x) ((x) << S_RTR_TYPE) +#define G_RTR_TYPE(x) ((((x) >> S_RTR_TYPE)) & M_RTR_TYPE) + +#define S_CHAN 4 +#define M_CHAN 0x3 +#define V_CHAN(x) ((x) << S_CHAN) +#define G_CHAN(x) ((((x) >> S_CHAN)) & M_CHAN) + +struct t3_rdma_init_attr { + u32 tid; + u32 qpid; + u32 pdid; + u32 scqid; + u32 rcqid; + u32 rq_addr; + u32 rq_size; + enum t3_mpa_attrs mpaattrs; + enum t3_qp_caps qpcaps; + u16 tcp_emss; + u32 ord; + u32 ird; + u64 qp_dma_addr; + u32 qp_dma_size; + enum rdma_init_rtr_types rtr_type; + u16 flags; + u16 rqe_count; + u32 irs; + u32 chan; +}; + +struct t3_rdma_init_wr { + struct fw_riwrh wrh; /* 0 */ + union t3_wrid wrid; /* 1 */ + __be32 qpid; /* 2 */ + __be32 pdid; + __be32 scqid; /* 3 */ + __be32 rcqid; + __be32 rq_addr; /* 4 */ + __be32 rq_size; + u8 mpaattrs; /* 5 */ + u8 qpcaps; + __be16 ulpdu_size; + __be16 flags_rtr_type; + __be16 rqe_count; + __be32 ord; /* 6 */ + __be32 ird; + __be64 qp_dma_addr; /* 7 */ + __be32 qp_dma_size; /* 8 */ + __be32 irs; +}; + +struct t3_genbit { + u64 flit[15]; + __be64 genbit; +}; + +struct t3_wq_in_err { + u64 flit[13]; + u64 err; +}; + +enum rdma_init_wr_flags { + MPA_INITIATOR = (1<<0), + PRIV_QP = (1<<1), +}; + +union t3_wr { + struct t3_send_wr send; + struct t3_rdma_write_wr write; + struct t3_rdma_read_wr read; + struct t3_receive_wr recv; + struct t3_fastreg_wr fastreg; + struct t3_pbl_frag pbl_frag; + struct t3_local_inv_wr local_inv; + struct t3_bind_mw_wr bind; + struct t3_bypass_wr bypass; + struct t3_rdma_init_wr init; + struct t3_modify_qp_wr qp_mod; + struct t3_genbit genbit; + struct t3_wq_in_err wq_in_err; + __be64 flit[16]; +}; + +#define T3_SQ_CQE_FLIT 13 +#define T3_SQ_COOKIE_FLIT 14 + +#define T3_RQ_COOKIE_FLIT 13 +#define T3_RQ_CQE_FLIT 14 + +static inline enum t3_wr_opcode fw_riwrh_opcode(struct fw_riwrh *wqe) +{ + return G_FW_RIWR_OP(be32_to_cpu(wqe->op_seop_flags)); +} + +enum t3_wr_hdr_bits { + T3_EOP = 1, + T3_SOP = 2, + T3_SOPEOP = T3_EOP|T3_SOP, +}; + +static inline void build_fw_riwrh(struct fw_riwrh *wqe, enum t3_wr_opcode op, + enum t3_wr_flags flags, u8 genbit, u32 tid, + u8 len, u8 sopeop) +{ + wqe->op_seop_flags = cpu_to_be32(V_FW_RIWR_OP(op) | + V_FW_RIWR_SOPEOP(sopeop) | + V_FW_RIWR_FLAGS(flags)); + wmb(); + wqe->gen_tid_len = cpu_to_be32(V_FW_RIWR_GEN(genbit) | + V_FW_RIWR_TID(tid) | + V_FW_RIWR_LEN(len)); + /* 2nd gen bit... */ + ((union t3_wr *)wqe)->genbit.genbit = cpu_to_be64(genbit); +} + +/* + * T3 ULP2_TX commands + */ +enum t3_utx_mem_op { + T3_UTX_MEM_READ = 2, + T3_UTX_MEM_WRITE = 3 +}; + +/* T3 MC7 RDMA TPT entry format */ + +enum tpt_mem_type { + TPT_NON_SHARED_MR = 0x0, + TPT_SHARED_MR = 0x1, + TPT_MW = 0x2, + TPT_MW_RELAXED_PROTECTION = 0x3 +}; + +enum tpt_addr_type { + TPT_ZBTO = 0, + TPT_VATO = 1 +}; + +enum tpt_mem_perm { + TPT_MW_BIND = 0x10, + TPT_LOCAL_READ = 0x8, + TPT_LOCAL_WRITE = 0x4, + TPT_REMOTE_READ = 0x2, + TPT_REMOTE_WRITE = 0x1 +}; + +struct tpt_entry { + __be32 valid_stag_pdid; + __be32 flags_pagesize_qpid; + + __be32 rsvd_pbl_addr; + __be32 len; + __be32 va_hi; + __be32 va_low_or_fbo; + + __be32 rsvd_bind_cnt_or_pstag; + __be32 rsvd_pbl_size; +}; + +#define S_TPT_VALID 31 +#define V_TPT_VALID(x) ((x) << S_TPT_VALID) +#define F_TPT_VALID V_TPT_VALID(1U) + +#define S_TPT_STAG_KEY 23 +#define M_TPT_STAG_KEY 0xFF +#define V_TPT_STAG_KEY(x) ((x) << S_TPT_STAG_KEY) +#define G_TPT_STAG_KEY(x) (((x) >> S_TPT_STAG_KEY) & M_TPT_STAG_KEY) + +#define S_TPT_STAG_STATE 22 +#define V_TPT_STAG_STATE(x) ((x) << S_TPT_STAG_STATE) +#define F_TPT_STAG_STATE V_TPT_STAG_STATE(1U) + +#define S_TPT_STAG_TYPE 20 +#define M_TPT_STAG_TYPE 0x3 +#define V_TPT_STAG_TYPE(x) ((x) << S_TPT_STAG_TYPE) +#define G_TPT_STAG_TYPE(x) (((x) >> S_TPT_STAG_TYPE) & M_TPT_STAG_TYPE) + +#define S_TPT_PDID 0 +#define M_TPT_PDID 0xFFFFF +#define V_TPT_PDID(x) ((x) << S_TPT_PDID) +#define G_TPT_PDID(x) (((x) >> S_TPT_PDID) & M_TPT_PDID) + +#define S_TPT_PERM 28 +#define M_TPT_PERM 0xF +#define V_TPT_PERM(x) ((x) << S_TPT_PERM) +#define G_TPT_PERM(x) (((x) >> S_TPT_PERM) & M_TPT_PERM) + +#define S_TPT_REM_INV_DIS 27 +#define V_TPT_REM_INV_DIS(x) ((x) << S_TPT_REM_INV_DIS) +#define F_TPT_REM_INV_DIS V_TPT_REM_INV_DIS(1U) + +#define S_TPT_ADDR_TYPE 26 +#define V_TPT_ADDR_TYPE(x) ((x) << S_TPT_ADDR_TYPE) +#define F_TPT_ADDR_TYPE V_TPT_ADDR_TYPE(1U) + +#define S_TPT_MW_BIND_ENABLE 25 +#define V_TPT_MW_BIND_ENABLE(x) ((x) << S_TPT_MW_BIND_ENABLE) +#define F_TPT_MW_BIND_ENABLE V_TPT_MW_BIND_ENABLE(1U) + +#define S_TPT_PAGE_SIZE 20 +#define M_TPT_PAGE_SIZE 0x1F +#define V_TPT_PAGE_SIZE(x) ((x) << S_TPT_PAGE_SIZE) +#define G_TPT_PAGE_SIZE(x) (((x) >> S_TPT_PAGE_SIZE) & M_TPT_PAGE_SIZE) + +#define S_TPT_PBL_ADDR 0 +#define M_TPT_PBL_ADDR 0x1FFFFFFF +#define V_TPT_PBL_ADDR(x) ((x) << S_TPT_PBL_ADDR) +#define G_TPT_PBL_ADDR(x) (((x) >> S_TPT_PBL_ADDR) & M_TPT_PBL_ADDR) + +#define S_TPT_QPID 0 +#define M_TPT_QPID 0xFFFFF +#define V_TPT_QPID(x) ((x) << S_TPT_QPID) +#define G_TPT_QPID(x) (((x) >> S_TPT_QPID) & M_TPT_QPID) + +#define S_TPT_PSTAG 0 +#define M_TPT_PSTAG 0xFFFFFF +#define V_TPT_PSTAG(x) ((x) << S_TPT_PSTAG) +#define G_TPT_PSTAG(x) (((x) >> S_TPT_PSTAG) & M_TPT_PSTAG) + +#define S_TPT_PBL_SIZE 0 +#define M_TPT_PBL_SIZE 0xFFFFF +#define V_TPT_PBL_SIZE(x) ((x) << S_TPT_PBL_SIZE) +#define G_TPT_PBL_SIZE(x) (((x) >> S_TPT_PBL_SIZE) & M_TPT_PBL_SIZE) + +/* + * CQE defs + */ +struct t3_cqe { + __be32 header; + __be32 len; + union { + struct { + __be32 stag; + __be32 msn; + } rcqe; + struct { + u32 wrid_hi; + u32 wrid_low; + } scqe; + } u; +}; + +#define S_CQE_OOO 31 +#define M_CQE_OOO 0x1 +#define G_CQE_OOO(x) ((((x) >> S_CQE_OOO)) & M_CQE_OOO) +#define V_CEQ_OOO(x) ((x)<> S_CQE_QPID)) & M_CQE_QPID) +#define V_CQE_QPID(x) ((x)<> S_CQE_SWCQE)) & M_CQE_SWCQE) +#define V_CQE_SWCQE(x) ((x)<> S_CQE_GENBIT) & M_CQE_GENBIT) +#define V_CQE_GENBIT(x) ((x)<> S_CQE_STATUS)) & M_CQE_STATUS) +#define V_CQE_STATUS(x) ((x)<> S_CQE_TYPE)) & M_CQE_TYPE) +#define V_CQE_TYPE(x) ((x)<> S_CQE_OPCODE)) & M_CQE_OPCODE) +#define V_CQE_OPCODE(x) ((x)<queue[1 << cq->size_log2])->cq_err; +} + +static inline void cxio_set_cq_in_error(struct t3_cq *cq) +{ + ((struct t3_cq_status_page *) + &cq->queue[1 << cq->size_log2])->cq_err = 1; +} + +static inline void cxio_set_wq_in_error(struct t3_wq *wq) +{ + wq->queue->wq_in_err.err |= 1; +} + +static inline void cxio_disable_wq_db(struct t3_wq *wq) +{ + wq->queue->wq_in_err.err |= 2; +} + +static inline void cxio_enable_wq_db(struct t3_wq *wq) +{ + wq->queue->wq_in_err.err &= ~2; +} + +static inline int cxio_wq_db_enabled(struct t3_wq *wq) +{ + return !(wq->queue->wq_in_err.err & 2); +} + +static inline struct t3_cqe *cxio_next_hw_cqe(struct t3_cq *cq) +{ + struct t3_cqe *cqe; + + cqe = cq->queue + (Q_PTR2IDX(cq->rptr, cq->size_log2)); + if (CQ_VLD_ENTRY(cq->rptr, cq->size_log2, cqe)) + return cqe; + return NULL; +} + +static inline struct t3_cqe *cxio_next_sw_cqe(struct t3_cq *cq) +{ + struct t3_cqe *cqe; + + if (!Q_EMPTY(cq->sw_rptr, cq->sw_wptr)) { + cqe = cq->sw_queue + (Q_PTR2IDX(cq->sw_rptr, cq->size_log2)); + return cqe; + } + return NULL; +} + +static inline struct t3_cqe *cxio_next_cqe(struct t3_cq *cq) +{ + struct t3_cqe *cqe; + + if (!Q_EMPTY(cq->sw_rptr, cq->sw_wptr)) { + cqe = cq->sw_queue + (Q_PTR2IDX(cq->sw_rptr, cq->size_log2)); + return cqe; + } + cqe = cq->queue + (Q_PTR2IDX(cq->rptr, cq->size_log2)); + if (CQ_VLD_ENTRY(cq->rptr, cq->size_log2, cqe)) + return cqe; + return NULL; +} + +#endif diff --git a/drivers/infiniband/hw/cxgb3/iwch.c b/drivers/infiniband/hw/cxgb3/iwch.c new file mode 100644 index 0000000..8e77dc5 --- /dev/null +++ b/drivers/infiniband/hw/cxgb3/iwch.c @@ -0,0 +1,292 @@ +/* + * Copyright (c) 2006 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include +#include + +#include + +#include "cxgb3_offload.h" +#include "iwch_provider.h" +#include "iwch_user.h" +#include "iwch.h" +#include "iwch_cm.h" + +#define DRV_VERSION "1.1" + +MODULE_AUTHOR("Boyd Faulkner, Steve Wise"); +MODULE_DESCRIPTION("Chelsio T3 RDMA Driver"); +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_VERSION(DRV_VERSION); + +static void open_rnic_dev(struct t3cdev *); +static void close_rnic_dev(struct t3cdev *); +static void iwch_event_handler(struct t3cdev *, u32, u32); + +struct cxgb3_client t3c_client = { + .name = "iw_cxgb3", + .add = open_rnic_dev, + .remove = close_rnic_dev, + .handlers = t3c_handlers, + .redirect = iwch_ep_redirect, + .event_handler = iwch_event_handler +}; + +static LIST_HEAD(dev_list); +static DEFINE_MUTEX(dev_mutex); + +static int disable_qp_db(int id, void *p, void *data) +{ + struct iwch_qp *qhp = p; + + cxio_disable_wq_db(&qhp->wq); + return 0; +} + +static int enable_qp_db(int id, void *p, void *data) +{ + struct iwch_qp *qhp = p; + + if (data) + ring_doorbell(qhp->rhp->rdev.ctrl_qp.doorbell, qhp->wq.qpid); + cxio_enable_wq_db(&qhp->wq); + return 0; +} + +static void disable_dbs(struct iwch_dev *rnicp) +{ + spin_lock_irq(&rnicp->lock); + idr_for_each(&rnicp->qpidr, disable_qp_db, NULL); + spin_unlock_irq(&rnicp->lock); +} + +static void enable_dbs(struct iwch_dev *rnicp, int ring_db) +{ + spin_lock_irq(&rnicp->lock); + idr_for_each(&rnicp->qpidr, enable_qp_db, + (void *)(unsigned long)ring_db); + spin_unlock_irq(&rnicp->lock); +} + +static void iwch_db_drop_task(struct work_struct *work) +{ + struct iwch_dev *rnicp = container_of(work, struct iwch_dev, + db_drop_task.work); + enable_dbs(rnicp, 1); +} + +static void rnic_init(struct iwch_dev *rnicp) +{ + PDBG("%s iwch_dev %p\n", __func__, rnicp); + idr_init(&rnicp->cqidr); + idr_init(&rnicp->qpidr); + idr_init(&rnicp->mmidr); + spin_lock_init(&rnicp->lock); + INIT_DELAYED_WORK(&rnicp->db_drop_task, iwch_db_drop_task); + + rnicp->attr.max_qps = T3_MAX_NUM_QP - 32; + rnicp->attr.max_wrs = T3_MAX_QP_DEPTH; + rnicp->attr.max_sge_per_wr = T3_MAX_SGE; + rnicp->attr.max_sge_per_rdma_write_wr = T3_MAX_SGE; + rnicp->attr.max_cqs = T3_MAX_NUM_CQ - 1; + rnicp->attr.max_cqes_per_cq = T3_MAX_CQ_DEPTH; + rnicp->attr.max_mem_regs = cxio_num_stags(&rnicp->rdev); + rnicp->attr.max_phys_buf_entries = T3_MAX_PBL_SIZE; + rnicp->attr.max_pds = T3_MAX_NUM_PD - 1; + rnicp->attr.mem_pgsizes_bitmask = T3_PAGESIZE_MASK; + rnicp->attr.max_mr_size = T3_MAX_MR_SIZE; + rnicp->attr.can_resize_wq = 0; + rnicp->attr.max_rdma_reads_per_qp = 8; + rnicp->attr.max_rdma_read_resources = + rnicp->attr.max_rdma_reads_per_qp * rnicp->attr.max_qps; + rnicp->attr.max_rdma_read_qp_depth = 8; /* IRD */ + rnicp->attr.max_rdma_read_depth = + rnicp->attr.max_rdma_read_qp_depth * rnicp->attr.max_qps; + rnicp->attr.rq_overflow_handled = 0; + rnicp->attr.can_modify_ird = 0; + rnicp->attr.can_modify_ord = 0; + rnicp->attr.max_mem_windows = rnicp->attr.max_mem_regs - 1; + rnicp->attr.stag0_value = 1; + rnicp->attr.zbva_support = 1; + rnicp->attr.local_invalidate_fence = 1; + rnicp->attr.cq_overflow_detection = 1; + return; +} + +static void open_rnic_dev(struct t3cdev *tdev) +{ + struct iwch_dev *rnicp; + + PDBG("%s t3cdev %p\n", __func__, tdev); + printk_once(KERN_INFO MOD "Chelsio T3 RDMA Driver - version %s\n", + DRV_VERSION); + rnicp = (struct iwch_dev *)ib_alloc_device(sizeof(*rnicp)); + if (!rnicp) { + printk(KERN_ERR MOD "Cannot allocate ib device\n"); + return; + } + rnicp->rdev.ulp = rnicp; + rnicp->rdev.t3cdev_p = tdev; + + mutex_lock(&dev_mutex); + + if (cxio_rdev_open(&rnicp->rdev)) { + mutex_unlock(&dev_mutex); + printk(KERN_ERR MOD "Unable to open CXIO rdev\n"); + ib_dealloc_device(&rnicp->ibdev); + return; + } + + rnic_init(rnicp); + + list_add_tail(&rnicp->entry, &dev_list); + mutex_unlock(&dev_mutex); + + if (iwch_register_device(rnicp)) { + printk(KERN_ERR MOD "Unable to register device\n"); + close_rnic_dev(tdev); + } + printk(KERN_INFO MOD "Initialized device %s\n", + pci_name(rnicp->rdev.rnic_info.pdev)); + return; +} + +static void close_rnic_dev(struct t3cdev *tdev) +{ + struct iwch_dev *dev, *tmp; + PDBG("%s t3cdev %p\n", __func__, tdev); + mutex_lock(&dev_mutex); + list_for_each_entry_safe(dev, tmp, &dev_list, entry) { + if (dev->rdev.t3cdev_p == tdev) { + dev->rdev.flags = CXIO_ERROR_FATAL; + synchronize_net(); + cancel_delayed_work_sync(&dev->db_drop_task); + list_del(&dev->entry); + iwch_unregister_device(dev); + cxio_rdev_close(&dev->rdev); + idr_destroy(&dev->cqidr); + idr_destroy(&dev->qpidr); + idr_destroy(&dev->mmidr); + ib_dealloc_device(&dev->ibdev); + break; + } + } + mutex_unlock(&dev_mutex); +} + +static void iwch_event_handler(struct t3cdev *tdev, u32 evt, u32 port_id) +{ + struct cxio_rdev *rdev = tdev->ulp; + struct iwch_dev *rnicp; + struct ib_event event; + u32 portnum = port_id + 1; + int dispatch = 0; + + if (!rdev) + return; + rnicp = rdev_to_iwch_dev(rdev); + switch (evt) { + case OFFLOAD_STATUS_DOWN: { + rdev->flags = CXIO_ERROR_FATAL; + synchronize_net(); + event.event = IB_EVENT_DEVICE_FATAL; + dispatch = 1; + break; + } + case OFFLOAD_PORT_DOWN: { + event.event = IB_EVENT_PORT_ERR; + dispatch = 1; + break; + } + case OFFLOAD_PORT_UP: { + event.event = IB_EVENT_PORT_ACTIVE; + dispatch = 1; + break; + } + case OFFLOAD_DB_FULL: { + disable_dbs(rnicp); + break; + } + case OFFLOAD_DB_EMPTY: { + enable_dbs(rnicp, 1); + break; + } + case OFFLOAD_DB_DROP: { + unsigned long delay = 1000; + unsigned short r; + + disable_dbs(rnicp); + get_random_bytes(&r, 2); + delay += r & 1023; + + /* + * delay is between 1000-2023 usecs. + */ + schedule_delayed_work(&rnicp->db_drop_task, + usecs_to_jiffies(delay)); + break; + } + } + + if (dispatch) { + event.device = &rnicp->ibdev; + event.element.port_num = portnum; + ib_dispatch_event(&event); + } + + return; +} + +static int __init iwch_init_module(void) +{ + int err; + + err = cxio_hal_init(); + if (err) + return err; + err = iwch_cm_init(); + if (err) + return err; + cxio_register_ev_cb(iwch_ev_dispatch); + cxgb3_register_client(&t3c_client); + return 0; +} + +static void __exit iwch_exit_module(void) +{ + cxgb3_unregister_client(&t3c_client); + cxio_unregister_ev_cb(iwch_ev_dispatch); + iwch_cm_term(); + cxio_hal_exit(); +} + +module_init(iwch_init_module); +module_exit(iwch_exit_module); diff --git a/drivers/infiniband/hw/cxgb3/iwch.h b/drivers/infiniband/hw/cxgb3/iwch.h new file mode 100644 index 0000000..8378622 --- /dev/null +++ b/drivers/infiniband/hw/cxgb3/iwch.h @@ -0,0 +1,180 @@ +/* + * Copyright (c) 2006 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __IWCH_H__ +#define __IWCH_H__ + +#include +#include +#include +#include +#include + +#include + +#include "cxio_hal.h" +#include "cxgb3_offload.h" + +struct iwch_pd; +struct iwch_cq; +struct iwch_qp; +struct iwch_mr; + +struct iwch_rnic_attributes { + u32 max_qps; + u32 max_wrs; /* Max for any SQ/RQ */ + u32 max_sge_per_wr; + u32 max_sge_per_rdma_write_wr; /* for RDMA Write WR */ + u32 max_cqs; + u32 max_cqes_per_cq; + u32 max_mem_regs; + u32 max_phys_buf_entries; /* for phys buf list */ + u32 max_pds; + + /* + * The memory page sizes supported by this RNIC. + * Bit position i in bitmap indicates page of + * size (4k)^i. Phys block list mode unsupported. + */ + u32 mem_pgsizes_bitmask; + u64 max_mr_size; + u8 can_resize_wq; + + /* + * The maximum number of RDMA Reads that can be outstanding + * per QP with this RNIC as the target. + */ + u32 max_rdma_reads_per_qp; + + /* + * The maximum number of resources used for RDMA Reads + * by this RNIC with this RNIC as the target. + */ + u32 max_rdma_read_resources; + + /* + * The max depth per QP for initiation of RDMA Read + * by this RNIC. + */ + u32 max_rdma_read_qp_depth; + + /* + * The maximum depth for initiation of RDMA Read + * operations by this RNIC on all QPs + */ + u32 max_rdma_read_depth; + u8 rq_overflow_handled; + u32 can_modify_ird; + u32 can_modify_ord; + u32 max_mem_windows; + u32 stag0_value; + u8 zbva_support; + u8 local_invalidate_fence; + u32 cq_overflow_detection; +}; + +struct iwch_dev { + struct ib_device ibdev; + struct cxio_rdev rdev; + u32 device_cap_flags; + struct iwch_rnic_attributes attr; + struct idr cqidr; + struct idr qpidr; + struct idr mmidr; + spinlock_t lock; + struct list_head entry; + struct delayed_work db_drop_task; +}; + +static inline struct iwch_dev *to_iwch_dev(struct ib_device *ibdev) +{ + return container_of(ibdev, struct iwch_dev, ibdev); +} + +static inline struct iwch_dev *rdev_to_iwch_dev(struct cxio_rdev *rdev) +{ + return container_of(rdev, struct iwch_dev, rdev); +} + +static inline int t3b_device(const struct iwch_dev *rhp) +{ + return rhp->rdev.t3cdev_p->type == T3B; +} + +static inline int t3a_device(const struct iwch_dev *rhp) +{ + return rhp->rdev.t3cdev_p->type == T3A; +} + +static inline struct iwch_cq *get_chp(struct iwch_dev *rhp, u32 cqid) +{ + return idr_find(&rhp->cqidr, cqid); +} + +static inline struct iwch_qp *get_qhp(struct iwch_dev *rhp, u32 qpid) +{ + return idr_find(&rhp->qpidr, qpid); +} + +static inline struct iwch_mr *get_mhp(struct iwch_dev *rhp, u32 mmid) +{ + return idr_find(&rhp->mmidr, mmid); +} + +static inline int insert_handle(struct iwch_dev *rhp, struct idr *idr, + void *handle, u32 id) +{ + int ret; + + idr_preload(GFP_KERNEL); + spin_lock_irq(&rhp->lock); + + ret = idr_alloc(idr, handle, id, id + 1, GFP_NOWAIT); + + spin_unlock_irq(&rhp->lock); + idr_preload_end(); + + BUG_ON(ret == -ENOSPC); + return ret < 0 ? ret : 0; +} + +static inline void remove_handle(struct iwch_dev *rhp, struct idr *idr, u32 id) +{ + spin_lock_irq(&rhp->lock); + idr_remove(idr, id); + spin_unlock_irq(&rhp->lock); +} + +extern struct cxgb3_client t3c_client; +extern cxgb3_cpl_handler_func t3c_handlers[NUM_CPL_CMDS]; +extern void iwch_ev_dispatch(struct cxio_rdev *rdev_p, struct sk_buff *skb); + +#endif diff --git a/drivers/infiniband/hw/cxgb3/iwch_cm.c b/drivers/infiniband/hw/cxgb3/iwch_cm.c new file mode 100644 index 0000000..cb78b1e --- /dev/null +++ b/drivers/infiniband/hw/cxgb3/iwch_cm.c @@ -0,0 +1,2272 @@ +/* + * Copyright (c) 2006 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "tcb.h" +#include "cxgb3_offload.h" +#include "iwch.h" +#include "iwch_provider.h" +#include "iwch_cm.h" + +static char *states[] = { + "idle", + "listen", + "connecting", + "mpa_wait_req", + "mpa_req_sent", + "mpa_req_rcvd", + "mpa_rep_sent", + "fpdu_mode", + "aborting", + "closing", + "moribund", + "dead", + NULL, +}; + +int peer2peer = 0; +module_param(peer2peer, int, 0644); +MODULE_PARM_DESC(peer2peer, "Support peer2peer ULPs (default=0)"); + +static int ep_timeout_secs = 60; +module_param(ep_timeout_secs, int, 0644); +MODULE_PARM_DESC(ep_timeout_secs, "CM Endpoint operation timeout " + "in seconds (default=60)"); + +static int mpa_rev = 1; +module_param(mpa_rev, int, 0644); +MODULE_PARM_DESC(mpa_rev, "MPA Revision, 0 supports amso1100, " + "1 is spec compliant. (default=1)"); + +static int markers_enabled = 0; +module_param(markers_enabled, int, 0644); +MODULE_PARM_DESC(markers_enabled, "Enable MPA MARKERS (default(0)=disabled)"); + +static int crc_enabled = 1; +module_param(crc_enabled, int, 0644); +MODULE_PARM_DESC(crc_enabled, "Enable MPA CRC (default(1)=enabled)"); + +static int rcv_win = 256 * 1024; +module_param(rcv_win, int, 0644); +MODULE_PARM_DESC(rcv_win, "TCP receive window in bytes (default=256)"); + +static int snd_win = 32 * 1024; +module_param(snd_win, int, 0644); +MODULE_PARM_DESC(snd_win, "TCP send window in bytes (default=32KB)"); + +static unsigned int nocong = 0; +module_param(nocong, uint, 0644); +MODULE_PARM_DESC(nocong, "Turn off congestion control (default=0)"); + +static unsigned int cong_flavor = 1; +module_param(cong_flavor, uint, 0644); +MODULE_PARM_DESC(cong_flavor, "TCP Congestion control flavor (default=1)"); + +static struct workqueue_struct *workq; + +static struct sk_buff_head rxq; + +static struct sk_buff *get_skb(struct sk_buff *skb, int len, gfp_t gfp); +static void ep_timeout(unsigned long arg); +static void connect_reply_upcall(struct iwch_ep *ep, int status); + +static void start_ep_timer(struct iwch_ep *ep) +{ + PDBG("%s ep %p\n", __func__, ep); + if (timer_pending(&ep->timer)) { + PDBG("%s stopped / restarted timer ep %p\n", __func__, ep); + del_timer_sync(&ep->timer); + } else + get_ep(&ep->com); + ep->timer.expires = jiffies + ep_timeout_secs * HZ; + ep->timer.data = (unsigned long)ep; + ep->timer.function = ep_timeout; + add_timer(&ep->timer); +} + +static void stop_ep_timer(struct iwch_ep *ep) +{ + PDBG("%s ep %p\n", __func__, ep); + if (!timer_pending(&ep->timer)) { + WARN(1, "%s timer stopped when its not running! ep %p state %u\n", + __func__, ep, ep->com.state); + return; + } + del_timer_sync(&ep->timer); + put_ep(&ep->com); +} + +static int iwch_l2t_send(struct t3cdev *tdev, struct sk_buff *skb, struct l2t_entry *l2e) +{ + int error = 0; + struct cxio_rdev *rdev; + + rdev = (struct cxio_rdev *)tdev->ulp; + if (cxio_fatal_error(rdev)) { + kfree_skb(skb); + return -EIO; + } + error = l2t_send(tdev, skb, l2e); + if (error < 0) + kfree_skb(skb); + return error; +} + +int iwch_cxgb3_ofld_send(struct t3cdev *tdev, struct sk_buff *skb) +{ + int error = 0; + struct cxio_rdev *rdev; + + rdev = (struct cxio_rdev *)tdev->ulp; + if (cxio_fatal_error(rdev)) { + kfree_skb(skb); + return -EIO; + } + error = cxgb3_ofld_send(tdev, skb); + if (error < 0) + kfree_skb(skb); + return error; +} + +static void release_tid(struct t3cdev *tdev, u32 hwtid, struct sk_buff *skb) +{ + struct cpl_tid_release *req; + + skb = get_skb(skb, sizeof *req, GFP_KERNEL); + if (!skb) + return; + req = (struct cpl_tid_release *) skb_put(skb, sizeof(*req)); + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_TID_RELEASE, hwtid)); + skb->priority = CPL_PRIORITY_SETUP; + iwch_cxgb3_ofld_send(tdev, skb); + return; +} + +int iwch_quiesce_tid(struct iwch_ep *ep) +{ + struct cpl_set_tcb_field *req; + struct sk_buff *skb = get_skb(NULL, sizeof(*req), GFP_KERNEL); + + if (!skb) + return -ENOMEM; + req = (struct cpl_set_tcb_field *) skb_put(skb, sizeof(*req)); + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + req->wr.wr_lo = htonl(V_WR_TID(ep->hwtid)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, ep->hwtid)); + req->reply = 0; + req->cpu_idx = 0; + req->word = htons(W_TCB_RX_QUIESCE); + req->mask = cpu_to_be64(1ULL << S_TCB_RX_QUIESCE); + req->val = cpu_to_be64(1 << S_TCB_RX_QUIESCE); + + skb->priority = CPL_PRIORITY_DATA; + return iwch_cxgb3_ofld_send(ep->com.tdev, skb); +} + +int iwch_resume_tid(struct iwch_ep *ep) +{ + struct cpl_set_tcb_field *req; + struct sk_buff *skb = get_skb(NULL, sizeof(*req), GFP_KERNEL); + + if (!skb) + return -ENOMEM; + req = (struct cpl_set_tcb_field *) skb_put(skb, sizeof(*req)); + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + req->wr.wr_lo = htonl(V_WR_TID(ep->hwtid)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, ep->hwtid)); + req->reply = 0; + req->cpu_idx = 0; + req->word = htons(W_TCB_RX_QUIESCE); + req->mask = cpu_to_be64(1ULL << S_TCB_RX_QUIESCE); + req->val = 0; + + skb->priority = CPL_PRIORITY_DATA; + return iwch_cxgb3_ofld_send(ep->com.tdev, skb); +} + +static void set_emss(struct iwch_ep *ep, u16 opt) +{ + PDBG("%s ep %p opt %u\n", __func__, ep, opt); + ep->emss = T3C_DATA(ep->com.tdev)->mtus[G_TCPOPT_MSS(opt)] - 40; + if (G_TCPOPT_TSTAMP(opt)) + ep->emss -= 12; + if (ep->emss < 128) + ep->emss = 128; + PDBG("emss=%d\n", ep->emss); +} + +static enum iwch_ep_state state_read(struct iwch_ep_common *epc) +{ + unsigned long flags; + enum iwch_ep_state state; + + spin_lock_irqsave(&epc->lock, flags); + state = epc->state; + spin_unlock_irqrestore(&epc->lock, flags); + return state; +} + +static void __state_set(struct iwch_ep_common *epc, enum iwch_ep_state new) +{ + epc->state = new; +} + +static void state_set(struct iwch_ep_common *epc, enum iwch_ep_state new) +{ + unsigned long flags; + + spin_lock_irqsave(&epc->lock, flags); + PDBG("%s - %s -> %s\n", __func__, states[epc->state], states[new]); + __state_set(epc, new); + spin_unlock_irqrestore(&epc->lock, flags); + return; +} + +static void *alloc_ep(int size, gfp_t gfp) +{ + struct iwch_ep_common *epc; + + epc = kzalloc(size, gfp); + if (epc) { + kref_init(&epc->kref); + spin_lock_init(&epc->lock); + init_waitqueue_head(&epc->waitq); + } + PDBG("%s alloc ep %p\n", __func__, epc); + return epc; +} + +void __free_ep(struct kref *kref) +{ + struct iwch_ep *ep; + ep = container_of(container_of(kref, struct iwch_ep_common, kref), + struct iwch_ep, com); + PDBG("%s ep %p state %s\n", __func__, ep, states[state_read(&ep->com)]); + if (test_bit(RELEASE_RESOURCES, &ep->com.flags)) { + cxgb3_remove_tid(ep->com.tdev, (void *)ep, ep->hwtid); + dst_release(ep->dst); + l2t_release(ep->com.tdev, ep->l2t); + } + kfree(ep); +} + +static void release_ep_resources(struct iwch_ep *ep) +{ + PDBG("%s ep %p tid %d\n", __func__, ep, ep->hwtid); + set_bit(RELEASE_RESOURCES, &ep->com.flags); + put_ep(&ep->com); +} + +static int status2errno(int status) +{ + switch (status) { + case CPL_ERR_NONE: + return 0; + case CPL_ERR_CONN_RESET: + return -ECONNRESET; + case CPL_ERR_ARP_MISS: + return -EHOSTUNREACH; + case CPL_ERR_CONN_TIMEDOUT: + return -ETIMEDOUT; + case CPL_ERR_TCAM_FULL: + return -ENOMEM; + case CPL_ERR_CONN_EXIST: + return -EADDRINUSE; + default: + return -EIO; + } +} + +/* + * Try and reuse skbs already allocated... + */ +static struct sk_buff *get_skb(struct sk_buff *skb, int len, gfp_t gfp) +{ + if (skb && !skb_is_nonlinear(skb) && !skb_cloned(skb)) { + skb_trim(skb, 0); + skb_get(skb); + } else { + skb = alloc_skb(len, gfp); + } + return skb; +} + +static struct rtable *find_route(struct t3cdev *dev, __be32 local_ip, + __be32 peer_ip, __be16 local_port, + __be16 peer_port, u8 tos) +{ + struct rtable *rt; + struct flowi4 fl4; + + rt = ip_route_output_ports(&init_net, &fl4, NULL, peer_ip, local_ip, + peer_port, local_port, IPPROTO_TCP, + tos, 0); + if (IS_ERR(rt)) + return NULL; + return rt; +} + +static unsigned int find_best_mtu(const struct t3c_data *d, unsigned short mtu) +{ + int i = 0; + + while (i < d->nmtus - 1 && d->mtus[i + 1] <= mtu) + ++i; + return i; +} + +static void arp_failure_discard(struct t3cdev *dev, struct sk_buff *skb) +{ + PDBG("%s t3cdev %p\n", __func__, dev); + kfree_skb(skb); +} + +/* + * Handle an ARP failure for an active open. + */ +static void act_open_req_arp_failure(struct t3cdev *dev, struct sk_buff *skb) +{ + printk(KERN_ERR MOD "ARP failure duing connect\n"); + kfree_skb(skb); +} + +/* + * Handle an ARP failure for a CPL_ABORT_REQ. Change it into a no RST variant + * and send it along. + */ +static void abort_arp_failure(struct t3cdev *dev, struct sk_buff *skb) +{ + struct cpl_abort_req *req = cplhdr(skb); + + PDBG("%s t3cdev %p\n", __func__, dev); + req->cmd = CPL_ABORT_NO_RST; + iwch_cxgb3_ofld_send(dev, skb); +} + +static int send_halfclose(struct iwch_ep *ep, gfp_t gfp) +{ + struct cpl_close_con_req *req; + struct sk_buff *skb; + + PDBG("%s ep %p\n", __func__, ep); + skb = get_skb(NULL, sizeof(*req), gfp); + if (!skb) { + printk(KERN_ERR MOD "%s - failed to alloc skb\n", __func__); + return -ENOMEM; + } + skb->priority = CPL_PRIORITY_DATA; + set_arp_failure_handler(skb, arp_failure_discard); + req = (struct cpl_close_con_req *) skb_put(skb, sizeof(*req)); + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_CLOSE_CON)); + req->wr.wr_lo = htonl(V_WR_TID(ep->hwtid)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, ep->hwtid)); + return iwch_l2t_send(ep->com.tdev, skb, ep->l2t); +} + +static int send_abort(struct iwch_ep *ep, struct sk_buff *skb, gfp_t gfp) +{ + struct cpl_abort_req *req; + + PDBG("%s ep %p\n", __func__, ep); + skb = get_skb(skb, sizeof(*req), gfp); + if (!skb) { + printk(KERN_ERR MOD "%s - failed to alloc skb.\n", + __func__); + return -ENOMEM; + } + skb->priority = CPL_PRIORITY_DATA; + set_arp_failure_handler(skb, abort_arp_failure); + req = (struct cpl_abort_req *) skb_put(skb, sizeof(*req)); + memset(req, 0, sizeof(*req)); + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_REQ)); + req->wr.wr_lo = htonl(V_WR_TID(ep->hwtid)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ABORT_REQ, ep->hwtid)); + req->cmd = CPL_ABORT_SEND_RST; + return iwch_l2t_send(ep->com.tdev, skb, ep->l2t); +} + +static int send_connect(struct iwch_ep *ep) +{ + struct cpl_act_open_req *req; + struct sk_buff *skb; + u32 opt0h, opt0l, opt2; + unsigned int mtu_idx; + int wscale; + + PDBG("%s ep %p\n", __func__, ep); + + skb = get_skb(NULL, sizeof(*req), GFP_KERNEL); + if (!skb) { + printk(KERN_ERR MOD "%s - failed to alloc skb.\n", + __func__); + return -ENOMEM; + } + mtu_idx = find_best_mtu(T3C_DATA(ep->com.tdev), dst_mtu(ep->dst)); + wscale = compute_wscale(rcv_win); + opt0h = V_NAGLE(0) | + V_NO_CONG(nocong) | + V_KEEP_ALIVE(1) | + F_TCAM_BYPASS | + V_WND_SCALE(wscale) | + V_MSS_IDX(mtu_idx) | + V_L2T_IDX(ep->l2t->idx) | V_TX_CHANNEL(ep->l2t->smt_idx); + opt0l = V_TOS((ep->tos >> 2) & M_TOS) | V_RCV_BUFSIZ(rcv_win>>10); + opt2 = F_RX_COALESCE_VALID | V_RX_COALESCE(0) | V_FLAVORS_VALID(1) | + V_CONG_CONTROL_FLAVOR(cong_flavor); + skb->priority = CPL_PRIORITY_SETUP; + set_arp_failure_handler(skb, act_open_req_arp_failure); + + req = (struct cpl_act_open_req *) skb_put(skb, sizeof(*req)); + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, ep->atid)); + req->local_port = ep->com.local_addr.sin_port; + req->peer_port = ep->com.remote_addr.sin_port; + req->local_ip = ep->com.local_addr.sin_addr.s_addr; + req->peer_ip = ep->com.remote_addr.sin_addr.s_addr; + req->opt0h = htonl(opt0h); + req->opt0l = htonl(opt0l); + req->params = 0; + req->opt2 = htonl(opt2); + return iwch_l2t_send(ep->com.tdev, skb, ep->l2t); +} + +static void send_mpa_req(struct iwch_ep *ep, struct sk_buff *skb) +{ + int mpalen; + struct tx_data_wr *req; + struct mpa_message *mpa; + int len; + + PDBG("%s ep %p pd_len %d\n", __func__, ep, ep->plen); + + BUG_ON(skb_cloned(skb)); + + mpalen = sizeof(*mpa) + ep->plen; + if (skb->data + mpalen + sizeof(*req) > skb_end_pointer(skb)) { + kfree_skb(skb); + skb=alloc_skb(mpalen + sizeof(*req), GFP_KERNEL); + if (!skb) { + connect_reply_upcall(ep, -ENOMEM); + return; + } + } + skb_trim(skb, 0); + skb_reserve(skb, sizeof(*req)); + skb_put(skb, mpalen); + skb->priority = CPL_PRIORITY_DATA; + mpa = (struct mpa_message *) skb->data; + memset(mpa, 0, sizeof(*mpa)); + memcpy(mpa->key, MPA_KEY_REQ, sizeof(mpa->key)); + mpa->flags = (crc_enabled ? MPA_CRC : 0) | + (markers_enabled ? MPA_MARKERS : 0); + mpa->private_data_size = htons(ep->plen); + mpa->revision = mpa_rev; + + if (ep->plen) + memcpy(mpa->private_data, ep->mpa_pkt + sizeof(*mpa), ep->plen); + + /* + * Reference the mpa skb. This ensures the data area + * will remain in memory until the hw acks the tx. + * Function tx_ack() will deref it. + */ + skb_get(skb); + set_arp_failure_handler(skb, arp_failure_discard); + skb_reset_transport_header(skb); + len = skb->len; + req = (struct tx_data_wr *) skb_push(skb, sizeof(*req)); + req->wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA)|F_WR_COMPL); + req->wr_lo = htonl(V_WR_TID(ep->hwtid)); + req->len = htonl(len); + req->param = htonl(V_TX_PORT(ep->l2t->smt_idx) | + V_TX_SNDBUF(snd_win>>15)); + req->flags = htonl(F_TX_INIT); + req->sndseq = htonl(ep->snd_seq); + BUG_ON(ep->mpa_skb); + ep->mpa_skb = skb; + iwch_l2t_send(ep->com.tdev, skb, ep->l2t); + start_ep_timer(ep); + state_set(&ep->com, MPA_REQ_SENT); + return; +} + +static int send_mpa_reject(struct iwch_ep *ep, const void *pdata, u8 plen) +{ + int mpalen; + struct tx_data_wr *req; + struct mpa_message *mpa; + struct sk_buff *skb; + + PDBG("%s ep %p plen %d\n", __func__, ep, plen); + + mpalen = sizeof(*mpa) + plen; + + skb = get_skb(NULL, mpalen + sizeof(*req), GFP_KERNEL); + if (!skb) { + printk(KERN_ERR MOD "%s - cannot alloc skb!\n", __func__); + return -ENOMEM; + } + skb_reserve(skb, sizeof(*req)); + mpa = (struct mpa_message *) skb_put(skb, mpalen); + memset(mpa, 0, sizeof(*mpa)); + memcpy(mpa->key, MPA_KEY_REP, sizeof(mpa->key)); + mpa->flags = MPA_REJECT; + mpa->revision = mpa_rev; + mpa->private_data_size = htons(plen); + if (plen) + memcpy(mpa->private_data, pdata, plen); + + /* + * Reference the mpa skb again. This ensures the data area + * will remain in memory until the hw acks the tx. + * Function tx_ack() will deref it. + */ + skb_get(skb); + skb->priority = CPL_PRIORITY_DATA; + set_arp_failure_handler(skb, arp_failure_discard); + skb_reset_transport_header(skb); + req = (struct tx_data_wr *) skb_push(skb, sizeof(*req)); + req->wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA)|F_WR_COMPL); + req->wr_lo = htonl(V_WR_TID(ep->hwtid)); + req->len = htonl(mpalen); + req->param = htonl(V_TX_PORT(ep->l2t->smt_idx) | + V_TX_SNDBUF(snd_win>>15)); + req->flags = htonl(F_TX_INIT); + req->sndseq = htonl(ep->snd_seq); + BUG_ON(ep->mpa_skb); + ep->mpa_skb = skb; + return iwch_l2t_send(ep->com.tdev, skb, ep->l2t); +} + +static int send_mpa_reply(struct iwch_ep *ep, const void *pdata, u8 plen) +{ + int mpalen; + struct tx_data_wr *req; + struct mpa_message *mpa; + int len; + struct sk_buff *skb; + + PDBG("%s ep %p plen %d\n", __func__, ep, plen); + + mpalen = sizeof(*mpa) + plen; + + skb = get_skb(NULL, mpalen + sizeof(*req), GFP_KERNEL); + if (!skb) { + printk(KERN_ERR MOD "%s - cannot alloc skb!\n", __func__); + return -ENOMEM; + } + skb->priority = CPL_PRIORITY_DATA; + skb_reserve(skb, sizeof(*req)); + mpa = (struct mpa_message *) skb_put(skb, mpalen); + memset(mpa, 0, sizeof(*mpa)); + memcpy(mpa->key, MPA_KEY_REP, sizeof(mpa->key)); + mpa->flags = (ep->mpa_attr.crc_enabled ? MPA_CRC : 0) | + (markers_enabled ? MPA_MARKERS : 0); + mpa->revision = mpa_rev; + mpa->private_data_size = htons(plen); + if (plen) + memcpy(mpa->private_data, pdata, plen); + + /* + * Reference the mpa skb. This ensures the data area + * will remain in memory until the hw acks the tx. + * Function tx_ack() will deref it. + */ + skb_get(skb); + set_arp_failure_handler(skb, arp_failure_discard); + skb_reset_transport_header(skb); + len = skb->len; + req = (struct tx_data_wr *) skb_push(skb, sizeof(*req)); + req->wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA)|F_WR_COMPL); + req->wr_lo = htonl(V_WR_TID(ep->hwtid)); + req->len = htonl(len); + req->param = htonl(V_TX_PORT(ep->l2t->smt_idx) | + V_TX_SNDBUF(snd_win>>15)); + req->flags = htonl(F_TX_INIT); + req->sndseq = htonl(ep->snd_seq); + ep->mpa_skb = skb; + state_set(&ep->com, MPA_REP_SENT); + return iwch_l2t_send(ep->com.tdev, skb, ep->l2t); +} + +static int act_establish(struct t3cdev *tdev, struct sk_buff *skb, void *ctx) +{ + struct iwch_ep *ep = ctx; + struct cpl_act_establish *req = cplhdr(skb); + unsigned int tid = GET_TID(req); + + PDBG("%s ep %p tid %d\n", __func__, ep, tid); + + dst_confirm(ep->dst); + + /* setup the hwtid for this connection */ + ep->hwtid = tid; + cxgb3_insert_tid(ep->com.tdev, &t3c_client, ep, tid); + + ep->snd_seq = ntohl(req->snd_isn); + ep->rcv_seq = ntohl(req->rcv_isn); + + set_emss(ep, ntohs(req->tcp_opt)); + + /* dealloc the atid */ + cxgb3_free_atid(ep->com.tdev, ep->atid); + + /* start MPA negotiation */ + send_mpa_req(ep, skb); + + return 0; +} + +static void abort_connection(struct iwch_ep *ep, struct sk_buff *skb, gfp_t gfp) +{ + PDBG("%s ep %p\n", __FILE__, ep); + state_set(&ep->com, ABORTING); + send_abort(ep, skb, gfp); +} + +static void close_complete_upcall(struct iwch_ep *ep) +{ + struct iw_cm_event event; + + PDBG("%s ep %p\n", __func__, ep); + memset(&event, 0, sizeof(event)); + event.event = IW_CM_EVENT_CLOSE; + if (ep->com.cm_id) { + PDBG("close complete delivered ep %p cm_id %p tid %d\n", + ep, ep->com.cm_id, ep->hwtid); + ep->com.cm_id->event_handler(ep->com.cm_id, &event); + ep->com.cm_id->rem_ref(ep->com.cm_id); + ep->com.cm_id = NULL; + ep->com.qp = NULL; + } +} + +static void peer_close_upcall(struct iwch_ep *ep) +{ + struct iw_cm_event event; + + PDBG("%s ep %p\n", __func__, ep); + memset(&event, 0, sizeof(event)); + event.event = IW_CM_EVENT_DISCONNECT; + if (ep->com.cm_id) { + PDBG("peer close delivered ep %p cm_id %p tid %d\n", + ep, ep->com.cm_id, ep->hwtid); + ep->com.cm_id->event_handler(ep->com.cm_id, &event); + } +} + +static void peer_abort_upcall(struct iwch_ep *ep) +{ + struct iw_cm_event event; + + PDBG("%s ep %p\n", __func__, ep); + memset(&event, 0, sizeof(event)); + event.event = IW_CM_EVENT_CLOSE; + event.status = -ECONNRESET; + if (ep->com.cm_id) { + PDBG("abort delivered ep %p cm_id %p tid %d\n", ep, + ep->com.cm_id, ep->hwtid); + ep->com.cm_id->event_handler(ep->com.cm_id, &event); + ep->com.cm_id->rem_ref(ep->com.cm_id); + ep->com.cm_id = NULL; + ep->com.qp = NULL; + } +} + +static void connect_reply_upcall(struct iwch_ep *ep, int status) +{ + struct iw_cm_event event; + + PDBG("%s ep %p status %d\n", __func__, ep, status); + memset(&event, 0, sizeof(event)); + event.event = IW_CM_EVENT_CONNECT_REPLY; + event.status = status; + memcpy(&event.local_addr, &ep->com.local_addr, + sizeof(ep->com.local_addr)); + memcpy(&event.remote_addr, &ep->com.remote_addr, + sizeof(ep->com.remote_addr)); + + if ((status == 0) || (status == -ECONNREFUSED)) { + event.private_data_len = ep->plen; + event.private_data = ep->mpa_pkt + sizeof(struct mpa_message); + } + if (ep->com.cm_id) { + PDBG("%s ep %p tid %d status %d\n", __func__, ep, + ep->hwtid, status); + ep->com.cm_id->event_handler(ep->com.cm_id, &event); + } + if (status < 0) { + ep->com.cm_id->rem_ref(ep->com.cm_id); + ep->com.cm_id = NULL; + ep->com.qp = NULL; + } +} + +static void connect_request_upcall(struct iwch_ep *ep) +{ + struct iw_cm_event event; + + PDBG("%s ep %p tid %d\n", __func__, ep, ep->hwtid); + memset(&event, 0, sizeof(event)); + event.event = IW_CM_EVENT_CONNECT_REQUEST; + memcpy(&event.local_addr, &ep->com.local_addr, + sizeof(ep->com.local_addr)); + memcpy(&event.remote_addr, &ep->com.remote_addr, + sizeof(ep->com.local_addr)); + event.private_data_len = ep->plen; + event.private_data = ep->mpa_pkt + sizeof(struct mpa_message); + event.provider_data = ep; + /* + * Until ird/ord negotiation via MPAv2 support is added, send max + * supported values + */ + event.ird = event.ord = 8; + if (state_read(&ep->parent_ep->com) != DEAD) { + get_ep(&ep->com); + ep->parent_ep->com.cm_id->event_handler( + ep->parent_ep->com.cm_id, + &event); + } + put_ep(&ep->parent_ep->com); + ep->parent_ep = NULL; +} + +static void established_upcall(struct iwch_ep *ep) +{ + struct iw_cm_event event; + + PDBG("%s ep %p\n", __func__, ep); + memset(&event, 0, sizeof(event)); + event.event = IW_CM_EVENT_ESTABLISHED; + /* + * Until ird/ord negotiation via MPAv2 support is added, send max + * supported values + */ + event.ird = event.ord = 8; + if (ep->com.cm_id) { + PDBG("%s ep %p tid %d\n", __func__, ep, ep->hwtid); + ep->com.cm_id->event_handler(ep->com.cm_id, &event); + } +} + +static int update_rx_credits(struct iwch_ep *ep, u32 credits) +{ + struct cpl_rx_data_ack *req; + struct sk_buff *skb; + + PDBG("%s ep %p credits %u\n", __func__, ep, credits); + skb = get_skb(NULL, sizeof(*req), GFP_KERNEL); + if (!skb) { + printk(KERN_ERR MOD "update_rx_credits - cannot alloc skb!\n"); + return 0; + } + + req = (struct cpl_rx_data_ack *) skb_put(skb, sizeof(*req)); + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, ep->hwtid)); + req->credit_dack = htonl(V_RX_CREDITS(credits) | V_RX_FORCE_ACK(1)); + skb->priority = CPL_PRIORITY_ACK; + iwch_cxgb3_ofld_send(ep->com.tdev, skb); + return credits; +} + +static void process_mpa_reply(struct iwch_ep *ep, struct sk_buff *skb) +{ + struct mpa_message *mpa; + u16 plen; + struct iwch_qp_attributes attrs; + enum iwch_qp_attr_mask mask; + int err; + + PDBG("%s ep %p\n", __func__, ep); + + /* + * Stop mpa timer. If it expired, then the state has + * changed and we bail since ep_timeout already aborted + * the connection. + */ + stop_ep_timer(ep); + if (state_read(&ep->com) != MPA_REQ_SENT) + return; + + /* + * If we get more than the supported amount of private data + * then we must fail this connection. + */ + if (ep->mpa_pkt_len + skb->len > sizeof(ep->mpa_pkt)) { + err = -EINVAL; + goto err; + } + + /* + * copy the new data into our accumulation buffer. + */ + skb_copy_from_linear_data(skb, &(ep->mpa_pkt[ep->mpa_pkt_len]), + skb->len); + ep->mpa_pkt_len += skb->len; + + /* + * if we don't even have the mpa message, then bail. + */ + if (ep->mpa_pkt_len < sizeof(*mpa)) + return; + mpa = (struct mpa_message *) ep->mpa_pkt; + + /* Validate MPA header. */ + if (mpa->revision != mpa_rev) { + err = -EPROTO; + goto err; + } + if (memcmp(mpa->key, MPA_KEY_REP, sizeof(mpa->key))) { + err = -EPROTO; + goto err; + } + + plen = ntohs(mpa->private_data_size); + + /* + * Fail if there's too much private data. + */ + if (plen > MPA_MAX_PRIVATE_DATA) { + err = -EPROTO; + goto err; + } + + /* + * If plen does not account for pkt size + */ + if (ep->mpa_pkt_len > (sizeof(*mpa) + plen)) { + err = -EPROTO; + goto err; + } + + ep->plen = (u8) plen; + + /* + * If we don't have all the pdata yet, then bail. + * We'll continue process when more data arrives. + */ + if (ep->mpa_pkt_len < (sizeof(*mpa) + plen)) + return; + + if (mpa->flags & MPA_REJECT) { + err = -ECONNREFUSED; + goto err; + } + + /* + * If we get here we have accumulated the entire mpa + * start reply message including private data. And + * the MPA header is valid. + */ + state_set(&ep->com, FPDU_MODE); + ep->mpa_attr.initiator = 1; + ep->mpa_attr.crc_enabled = (mpa->flags & MPA_CRC) | crc_enabled ? 1 : 0; + ep->mpa_attr.recv_marker_enabled = markers_enabled; + ep->mpa_attr.xmit_marker_enabled = mpa->flags & MPA_MARKERS ? 1 : 0; + ep->mpa_attr.version = mpa_rev; + PDBG("%s - crc_enabled=%d, recv_marker_enabled=%d, " + "xmit_marker_enabled=%d, version=%d\n", __func__, + ep->mpa_attr.crc_enabled, ep->mpa_attr.recv_marker_enabled, + ep->mpa_attr.xmit_marker_enabled, ep->mpa_attr.version); + + attrs.mpa_attr = ep->mpa_attr; + attrs.max_ird = ep->ird; + attrs.max_ord = ep->ord; + attrs.llp_stream_handle = ep; + attrs.next_state = IWCH_QP_STATE_RTS; + + mask = IWCH_QP_ATTR_NEXT_STATE | + IWCH_QP_ATTR_LLP_STREAM_HANDLE | IWCH_QP_ATTR_MPA_ATTR | + IWCH_QP_ATTR_MAX_IRD | IWCH_QP_ATTR_MAX_ORD; + + /* bind QP and TID with INIT_WR */ + err = iwch_modify_qp(ep->com.qp->rhp, + ep->com.qp, mask, &attrs, 1); + if (err) + goto err; + + if (peer2peer && iwch_rqes_posted(ep->com.qp) == 0) { + iwch_post_zb_read(ep); + } + + goto out; +err: + abort_connection(ep, skb, GFP_KERNEL); +out: + connect_reply_upcall(ep, err); + return; +} + +static void process_mpa_request(struct iwch_ep *ep, struct sk_buff *skb) +{ + struct mpa_message *mpa; + u16 plen; + + PDBG("%s ep %p\n", __func__, ep); + + /* + * Stop mpa timer. If it expired, then the state has + * changed and we bail since ep_timeout already aborted + * the connection. + */ + stop_ep_timer(ep); + if (state_read(&ep->com) != MPA_REQ_WAIT) + return; + + /* + * If we get more than the supported amount of private data + * then we must fail this connection. + */ + if (ep->mpa_pkt_len + skb->len > sizeof(ep->mpa_pkt)) { + abort_connection(ep, skb, GFP_KERNEL); + return; + } + + PDBG("%s enter (%s line %u)\n", __func__, __FILE__, __LINE__); + + /* + * Copy the new data into our accumulation buffer. + */ + skb_copy_from_linear_data(skb, &(ep->mpa_pkt[ep->mpa_pkt_len]), + skb->len); + ep->mpa_pkt_len += skb->len; + + /* + * If we don't even have the mpa message, then bail. + * We'll continue process when more data arrives. + */ + if (ep->mpa_pkt_len < sizeof(*mpa)) + return; + PDBG("%s enter (%s line %u)\n", __func__, __FILE__, __LINE__); + mpa = (struct mpa_message *) ep->mpa_pkt; + + /* + * Validate MPA Header. + */ + if (mpa->revision != mpa_rev) { + abort_connection(ep, skb, GFP_KERNEL); + return; + } + + if (memcmp(mpa->key, MPA_KEY_REQ, sizeof(mpa->key))) { + abort_connection(ep, skb, GFP_KERNEL); + return; + } + + plen = ntohs(mpa->private_data_size); + + /* + * Fail if there's too much private data. + */ + if (plen > MPA_MAX_PRIVATE_DATA) { + abort_connection(ep, skb, GFP_KERNEL); + return; + } + + /* + * If plen does not account for pkt size + */ + if (ep->mpa_pkt_len > (sizeof(*mpa) + plen)) { + abort_connection(ep, skb, GFP_KERNEL); + return; + } + ep->plen = (u8) plen; + + /* + * If we don't have all the pdata yet, then bail. + */ + if (ep->mpa_pkt_len < (sizeof(*mpa) + plen)) + return; + + /* + * If we get here we have accumulated the entire mpa + * start reply message including private data. + */ + ep->mpa_attr.initiator = 0; + ep->mpa_attr.crc_enabled = (mpa->flags & MPA_CRC) | crc_enabled ? 1 : 0; + ep->mpa_attr.recv_marker_enabled = markers_enabled; + ep->mpa_attr.xmit_marker_enabled = mpa->flags & MPA_MARKERS ? 1 : 0; + ep->mpa_attr.version = mpa_rev; + PDBG("%s - crc_enabled=%d, recv_marker_enabled=%d, " + "xmit_marker_enabled=%d, version=%d\n", __func__, + ep->mpa_attr.crc_enabled, ep->mpa_attr.recv_marker_enabled, + ep->mpa_attr.xmit_marker_enabled, ep->mpa_attr.version); + + state_set(&ep->com, MPA_REQ_RCVD); + + /* drive upcall */ + connect_request_upcall(ep); + return; +} + +static int rx_data(struct t3cdev *tdev, struct sk_buff *skb, void *ctx) +{ + struct iwch_ep *ep = ctx; + struct cpl_rx_data *hdr = cplhdr(skb); + unsigned int dlen = ntohs(hdr->len); + + PDBG("%s ep %p dlen %u\n", __func__, ep, dlen); + + skb_pull(skb, sizeof(*hdr)); + skb_trim(skb, dlen); + + ep->rcv_seq += dlen; + BUG_ON(ep->rcv_seq != (ntohl(hdr->seq) + dlen)); + + switch (state_read(&ep->com)) { + case MPA_REQ_SENT: + process_mpa_reply(ep, skb); + break; + case MPA_REQ_WAIT: + process_mpa_request(ep, skb); + break; + case MPA_REP_SENT: + break; + default: + printk(KERN_ERR MOD "%s Unexpected streaming data." + " ep %p state %d tid %d\n", + __func__, ep, state_read(&ep->com), ep->hwtid); + + /* + * The ep will timeout and inform the ULP of the failure. + * See ep_timeout(). + */ + break; + } + + /* update RX credits */ + update_rx_credits(ep, dlen); + + return CPL_RET_BUF_DONE; +} + +/* + * Upcall from the adapter indicating data has been transmitted. + * For us its just the single MPA request or reply. We can now free + * the skb holding the mpa message. + */ +static int tx_ack(struct t3cdev *tdev, struct sk_buff *skb, void *ctx) +{ + struct iwch_ep *ep = ctx; + struct cpl_wr_ack *hdr = cplhdr(skb); + unsigned int credits = ntohs(hdr->credits); + unsigned long flags; + int post_zb = 0; + + PDBG("%s ep %p credits %u\n", __func__, ep, credits); + + if (credits == 0) { + PDBG("%s 0 credit ack ep %p state %u\n", + __func__, ep, state_read(&ep->com)); + return CPL_RET_BUF_DONE; + } + + spin_lock_irqsave(&ep->com.lock, flags); + BUG_ON(credits != 1); + dst_confirm(ep->dst); + if (!ep->mpa_skb) { + PDBG("%s rdma_init wr_ack ep %p state %u\n", + __func__, ep, ep->com.state); + if (ep->mpa_attr.initiator) { + PDBG("%s initiator ep %p state %u\n", + __func__, ep, ep->com.state); + if (peer2peer && ep->com.state == FPDU_MODE) + post_zb = 1; + } else { + PDBG("%s responder ep %p state %u\n", + __func__, ep, ep->com.state); + if (ep->com.state == MPA_REQ_RCVD) { + ep->com.rpl_done = 1; + wake_up(&ep->com.waitq); + } + } + } else { + PDBG("%s lsm ack ep %p state %u freeing skb\n", + __func__, ep, ep->com.state); + kfree_skb(ep->mpa_skb); + ep->mpa_skb = NULL; + } + spin_unlock_irqrestore(&ep->com.lock, flags); + if (post_zb) + iwch_post_zb_read(ep); + return CPL_RET_BUF_DONE; +} + +static int abort_rpl(struct t3cdev *tdev, struct sk_buff *skb, void *ctx) +{ + struct iwch_ep *ep = ctx; + unsigned long flags; + int release = 0; + + PDBG("%s ep %p\n", __func__, ep); + BUG_ON(!ep); + + /* + * We get 2 abort replies from the HW. The first one must + * be ignored except for scribbling that we need one more. + */ + if (!test_and_set_bit(ABORT_REQ_IN_PROGRESS, &ep->com.flags)) { + return CPL_RET_BUF_DONE; + } + + spin_lock_irqsave(&ep->com.lock, flags); + switch (ep->com.state) { + case ABORTING: + close_complete_upcall(ep); + __state_set(&ep->com, DEAD); + release = 1; + break; + default: + printk(KERN_ERR "%s ep %p state %d\n", + __func__, ep, ep->com.state); + break; + } + spin_unlock_irqrestore(&ep->com.lock, flags); + + if (release) + release_ep_resources(ep); + return CPL_RET_BUF_DONE; +} + +/* + * Return whether a failed active open has allocated a TID + */ +static inline int act_open_has_tid(int status) +{ + return status != CPL_ERR_TCAM_FULL && status != CPL_ERR_CONN_EXIST && + status != CPL_ERR_ARP_MISS; +} + +static int act_open_rpl(struct t3cdev *tdev, struct sk_buff *skb, void *ctx) +{ + struct iwch_ep *ep = ctx; + struct cpl_act_open_rpl *rpl = cplhdr(skb); + + PDBG("%s ep %p status %u errno %d\n", __func__, ep, rpl->status, + status2errno(rpl->status)); + connect_reply_upcall(ep, status2errno(rpl->status)); + state_set(&ep->com, DEAD); + if (ep->com.tdev->type != T3A && act_open_has_tid(rpl->status)) + release_tid(ep->com.tdev, GET_TID(rpl), NULL); + cxgb3_free_atid(ep->com.tdev, ep->atid); + dst_release(ep->dst); + l2t_release(ep->com.tdev, ep->l2t); + put_ep(&ep->com); + return CPL_RET_BUF_DONE; +} + +static int listen_start(struct iwch_listen_ep *ep) +{ + struct sk_buff *skb; + struct cpl_pass_open_req *req; + + PDBG("%s ep %p\n", __func__, ep); + skb = get_skb(NULL, sizeof(*req), GFP_KERNEL); + if (!skb) { + printk(KERN_ERR MOD "t3c_listen_start failed to alloc skb!\n"); + return -ENOMEM; + } + + req = (struct cpl_pass_open_req *) skb_put(skb, sizeof(*req)); + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_PASS_OPEN_REQ, ep->stid)); + req->local_port = ep->com.local_addr.sin_port; + req->local_ip = ep->com.local_addr.sin_addr.s_addr; + req->peer_port = 0; + req->peer_ip = 0; + req->peer_netmask = 0; + req->opt0h = htonl(F_DELACK | F_TCAM_BYPASS); + req->opt0l = htonl(V_RCV_BUFSIZ(rcv_win>>10)); + req->opt1 = htonl(V_CONN_POLICY(CPL_CONN_POLICY_ASK)); + + skb->priority = 1; + return iwch_cxgb3_ofld_send(ep->com.tdev, skb); +} + +static int pass_open_rpl(struct t3cdev *tdev, struct sk_buff *skb, void *ctx) +{ + struct iwch_listen_ep *ep = ctx; + struct cpl_pass_open_rpl *rpl = cplhdr(skb); + + PDBG("%s ep %p status %d error %d\n", __func__, ep, + rpl->status, status2errno(rpl->status)); + ep->com.rpl_err = status2errno(rpl->status); + ep->com.rpl_done = 1; + wake_up(&ep->com.waitq); + + return CPL_RET_BUF_DONE; +} + +static int listen_stop(struct iwch_listen_ep *ep) +{ + struct sk_buff *skb; + struct cpl_close_listserv_req *req; + + PDBG("%s ep %p\n", __func__, ep); + skb = get_skb(NULL, sizeof(*req), GFP_KERNEL); + if (!skb) { + printk(KERN_ERR MOD "%s - failed to alloc skb\n", __func__); + return -ENOMEM; + } + req = (struct cpl_close_listserv_req *) skb_put(skb, sizeof(*req)); + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + req->cpu_idx = 0; + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_LISTSRV_REQ, ep->stid)); + skb->priority = 1; + return iwch_cxgb3_ofld_send(ep->com.tdev, skb); +} + +static int close_listsrv_rpl(struct t3cdev *tdev, struct sk_buff *skb, + void *ctx) +{ + struct iwch_listen_ep *ep = ctx; + struct cpl_close_listserv_rpl *rpl = cplhdr(skb); + + PDBG("%s ep %p\n", __func__, ep); + ep->com.rpl_err = status2errno(rpl->status); + ep->com.rpl_done = 1; + wake_up(&ep->com.waitq); + return CPL_RET_BUF_DONE; +} + +static void accept_cr(struct iwch_ep *ep, __be32 peer_ip, struct sk_buff *skb) +{ + struct cpl_pass_accept_rpl *rpl; + unsigned int mtu_idx; + u32 opt0h, opt0l, opt2; + int wscale; + + PDBG("%s ep %p\n", __func__, ep); + BUG_ON(skb_cloned(skb)); + skb_trim(skb, sizeof(*rpl)); + skb_get(skb); + mtu_idx = find_best_mtu(T3C_DATA(ep->com.tdev), dst_mtu(ep->dst)); + wscale = compute_wscale(rcv_win); + opt0h = V_NAGLE(0) | + V_NO_CONG(nocong) | + V_KEEP_ALIVE(1) | + F_TCAM_BYPASS | + V_WND_SCALE(wscale) | + V_MSS_IDX(mtu_idx) | + V_L2T_IDX(ep->l2t->idx) | V_TX_CHANNEL(ep->l2t->smt_idx); + opt0l = V_TOS((ep->tos >> 2) & M_TOS) | V_RCV_BUFSIZ(rcv_win>>10); + opt2 = F_RX_COALESCE_VALID | V_RX_COALESCE(0) | V_FLAVORS_VALID(1) | + V_CONG_CONTROL_FLAVOR(cong_flavor); + + rpl = cplhdr(skb); + rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, ep->hwtid)); + rpl->peer_ip = peer_ip; + rpl->opt0h = htonl(opt0h); + rpl->opt0l_status = htonl(opt0l | CPL_PASS_OPEN_ACCEPT); + rpl->opt2 = htonl(opt2); + rpl->rsvd = rpl->opt2; /* workaround for HW bug */ + skb->priority = CPL_PRIORITY_SETUP; + iwch_l2t_send(ep->com.tdev, skb, ep->l2t); + + return; +} + +static void reject_cr(struct t3cdev *tdev, u32 hwtid, __be32 peer_ip, + struct sk_buff *skb) +{ + PDBG("%s t3cdev %p tid %u peer_ip %x\n", __func__, tdev, hwtid, + peer_ip); + BUG_ON(skb_cloned(skb)); + skb_trim(skb, sizeof(struct cpl_tid_release)); + skb_get(skb); + + if (tdev->type != T3A) + release_tid(tdev, hwtid, skb); + else { + struct cpl_pass_accept_rpl *rpl; + + rpl = cplhdr(skb); + skb->priority = CPL_PRIORITY_SETUP; + rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, + hwtid)); + rpl->peer_ip = peer_ip; + rpl->opt0h = htonl(F_TCAM_BYPASS); + rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT); + rpl->opt2 = 0; + rpl->rsvd = rpl->opt2; + iwch_cxgb3_ofld_send(tdev, skb); + } +} + +static int pass_accept_req(struct t3cdev *tdev, struct sk_buff *skb, void *ctx) +{ + struct iwch_ep *child_ep, *parent_ep = ctx; + struct cpl_pass_accept_req *req = cplhdr(skb); + unsigned int hwtid = GET_TID(req); + struct dst_entry *dst; + struct l2t_entry *l2t; + struct rtable *rt; + struct iff_mac tim; + + PDBG("%s parent ep %p tid %u\n", __func__, parent_ep, hwtid); + + if (state_read(&parent_ep->com) != LISTEN) { + printk(KERN_ERR "%s - listening ep not in LISTEN\n", + __func__); + goto reject; + } + + /* + * Find the netdev for this connection request. + */ + tim.mac_addr = req->dst_mac; + tim.vlan_tag = ntohs(req->vlan_tag); + if (tdev->ctl(tdev, GET_IFF_FROM_MAC, &tim) < 0 || !tim.dev) { + printk(KERN_ERR "%s bad dst mac %pM\n", + __func__, req->dst_mac); + goto reject; + } + + /* Find output route */ + rt = find_route(tdev, + req->local_ip, + req->peer_ip, + req->local_port, + req->peer_port, G_PASS_OPEN_TOS(ntohl(req->tos_tid))); + if (!rt) { + printk(KERN_ERR MOD "%s - failed to find dst entry!\n", + __func__); + goto reject; + } + dst = &rt->dst; + l2t = t3_l2t_get(tdev, dst, NULL, &req->peer_ip); + if (!l2t) { + printk(KERN_ERR MOD "%s - failed to allocate l2t entry!\n", + __func__); + dst_release(dst); + goto reject; + } + child_ep = alloc_ep(sizeof(*child_ep), GFP_KERNEL); + if (!child_ep) { + printk(KERN_ERR MOD "%s - failed to allocate ep entry!\n", + __func__); + l2t_release(tdev, l2t); + dst_release(dst); + goto reject; + } + state_set(&child_ep->com, CONNECTING); + child_ep->com.tdev = tdev; + child_ep->com.cm_id = NULL; + child_ep->com.local_addr.sin_family = PF_INET; + child_ep->com.local_addr.sin_port = req->local_port; + child_ep->com.local_addr.sin_addr.s_addr = req->local_ip; + child_ep->com.remote_addr.sin_family = PF_INET; + child_ep->com.remote_addr.sin_port = req->peer_port; + child_ep->com.remote_addr.sin_addr.s_addr = req->peer_ip; + get_ep(&parent_ep->com); + child_ep->parent_ep = parent_ep; + child_ep->tos = G_PASS_OPEN_TOS(ntohl(req->tos_tid)); + child_ep->l2t = l2t; + child_ep->dst = dst; + child_ep->hwtid = hwtid; + init_timer(&child_ep->timer); + cxgb3_insert_tid(tdev, &t3c_client, child_ep, hwtid); + accept_cr(child_ep, req->peer_ip, skb); + goto out; +reject: + reject_cr(tdev, hwtid, req->peer_ip, skb); +out: + return CPL_RET_BUF_DONE; +} + +static int pass_establish(struct t3cdev *tdev, struct sk_buff *skb, void *ctx) +{ + struct iwch_ep *ep = ctx; + struct cpl_pass_establish *req = cplhdr(skb); + + PDBG("%s ep %p\n", __func__, ep); + ep->snd_seq = ntohl(req->snd_isn); + ep->rcv_seq = ntohl(req->rcv_isn); + + set_emss(ep, ntohs(req->tcp_opt)); + + dst_confirm(ep->dst); + state_set(&ep->com, MPA_REQ_WAIT); + start_ep_timer(ep); + + return CPL_RET_BUF_DONE; +} + +static int peer_close(struct t3cdev *tdev, struct sk_buff *skb, void *ctx) +{ + struct iwch_ep *ep = ctx; + struct iwch_qp_attributes attrs; + unsigned long flags; + int disconnect = 1; + int release = 0; + + PDBG("%s ep %p\n", __func__, ep); + dst_confirm(ep->dst); + + spin_lock_irqsave(&ep->com.lock, flags); + switch (ep->com.state) { + case MPA_REQ_WAIT: + __state_set(&ep->com, CLOSING); + break; + case MPA_REQ_SENT: + __state_set(&ep->com, CLOSING); + connect_reply_upcall(ep, -ECONNRESET); + break; + case MPA_REQ_RCVD: + + /* + * We're gonna mark this puppy DEAD, but keep + * the reference on it until the ULP accepts or + * rejects the CR. Also wake up anyone waiting + * in rdma connection migration (see iwch_accept_cr()). + */ + __state_set(&ep->com, CLOSING); + ep->com.rpl_done = 1; + ep->com.rpl_err = -ECONNRESET; + PDBG("waking up ep %p\n", ep); + wake_up(&ep->com.waitq); + break; + case MPA_REP_SENT: + __state_set(&ep->com, CLOSING); + ep->com.rpl_done = 1; + ep->com.rpl_err = -ECONNRESET; + PDBG("waking up ep %p\n", ep); + wake_up(&ep->com.waitq); + break; + case FPDU_MODE: + start_ep_timer(ep); + __state_set(&ep->com, CLOSING); + attrs.next_state = IWCH_QP_STATE_CLOSING; + iwch_modify_qp(ep->com.qp->rhp, ep->com.qp, + IWCH_QP_ATTR_NEXT_STATE, &attrs, 1); + peer_close_upcall(ep); + break; + case ABORTING: + disconnect = 0; + break; + case CLOSING: + __state_set(&ep->com, MORIBUND); + disconnect = 0; + break; + case MORIBUND: + stop_ep_timer(ep); + if (ep->com.cm_id && ep->com.qp) { + attrs.next_state = IWCH_QP_STATE_IDLE; + iwch_modify_qp(ep->com.qp->rhp, ep->com.qp, + IWCH_QP_ATTR_NEXT_STATE, &attrs, 1); + } + close_complete_upcall(ep); + __state_set(&ep->com, DEAD); + release = 1; + disconnect = 0; + break; + case DEAD: + disconnect = 0; + break; + default: + BUG_ON(1); + } + spin_unlock_irqrestore(&ep->com.lock, flags); + if (disconnect) + iwch_ep_disconnect(ep, 0, GFP_KERNEL); + if (release) + release_ep_resources(ep); + return CPL_RET_BUF_DONE; +} + +/* + * Returns whether an ABORT_REQ_RSS message is a negative advice. + */ +static int is_neg_adv_abort(unsigned int status) +{ + return status == CPL_ERR_RTX_NEG_ADVICE || + status == CPL_ERR_PERSIST_NEG_ADVICE; +} + +static int peer_abort(struct t3cdev *tdev, struct sk_buff *skb, void *ctx) +{ + struct cpl_abort_req_rss *req = cplhdr(skb); + struct iwch_ep *ep = ctx; + struct cpl_abort_rpl *rpl; + struct sk_buff *rpl_skb; + struct iwch_qp_attributes attrs; + int ret; + int release = 0; + unsigned long flags; + + if (is_neg_adv_abort(req->status)) { + PDBG("%s neg_adv_abort ep %p tid %d\n", __func__, ep, + ep->hwtid); + t3_l2t_send_event(ep->com.tdev, ep->l2t); + return CPL_RET_BUF_DONE; + } + + /* + * We get 2 peer aborts from the HW. The first one must + * be ignored except for scribbling that we need one more. + */ + if (!test_and_set_bit(PEER_ABORT_IN_PROGRESS, &ep->com.flags)) { + return CPL_RET_BUF_DONE; + } + + spin_lock_irqsave(&ep->com.lock, flags); + PDBG("%s ep %p state %u\n", __func__, ep, ep->com.state); + switch (ep->com.state) { + case CONNECTING: + break; + case MPA_REQ_WAIT: + stop_ep_timer(ep); + break; + case MPA_REQ_SENT: + stop_ep_timer(ep); + connect_reply_upcall(ep, -ECONNRESET); + break; + case MPA_REP_SENT: + ep->com.rpl_done = 1; + ep->com.rpl_err = -ECONNRESET; + PDBG("waking up ep %p\n", ep); + wake_up(&ep->com.waitq); + break; + case MPA_REQ_RCVD: + + /* + * We're gonna mark this puppy DEAD, but keep + * the reference on it until the ULP accepts or + * rejects the CR. Also wake up anyone waiting + * in rdma connection migration (see iwch_accept_cr()). + */ + ep->com.rpl_done = 1; + ep->com.rpl_err = -ECONNRESET; + PDBG("waking up ep %p\n", ep); + wake_up(&ep->com.waitq); + break; + case MORIBUND: + case CLOSING: + stop_ep_timer(ep); + /*FALLTHROUGH*/ + case FPDU_MODE: + if (ep->com.cm_id && ep->com.qp) { + attrs.next_state = IWCH_QP_STATE_ERROR; + ret = iwch_modify_qp(ep->com.qp->rhp, + ep->com.qp, IWCH_QP_ATTR_NEXT_STATE, + &attrs, 1); + if (ret) + printk(KERN_ERR MOD + "%s - qp <- error failed!\n", + __func__); + } + peer_abort_upcall(ep); + break; + case ABORTING: + break; + case DEAD: + PDBG("%s PEER_ABORT IN DEAD STATE!!!!\n", __func__); + spin_unlock_irqrestore(&ep->com.lock, flags); + return CPL_RET_BUF_DONE; + default: + BUG_ON(1); + break; + } + dst_confirm(ep->dst); + if (ep->com.state != ABORTING) { + __state_set(&ep->com, DEAD); + release = 1; + } + spin_unlock_irqrestore(&ep->com.lock, flags); + + rpl_skb = get_skb(skb, sizeof(*rpl), GFP_KERNEL); + if (!rpl_skb) { + printk(KERN_ERR MOD "%s - cannot allocate skb!\n", + __func__); + release = 1; + goto out; + } + rpl_skb->priority = CPL_PRIORITY_DATA; + rpl = (struct cpl_abort_rpl *) skb_put(rpl_skb, sizeof(*rpl)); + rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_RPL)); + rpl->wr.wr_lo = htonl(V_WR_TID(ep->hwtid)); + OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_ABORT_RPL, ep->hwtid)); + rpl->cmd = CPL_ABORT_NO_RST; + iwch_cxgb3_ofld_send(ep->com.tdev, rpl_skb); +out: + if (release) + release_ep_resources(ep); + return CPL_RET_BUF_DONE; +} + +static int close_con_rpl(struct t3cdev *tdev, struct sk_buff *skb, void *ctx) +{ + struct iwch_ep *ep = ctx; + struct iwch_qp_attributes attrs; + unsigned long flags; + int release = 0; + + PDBG("%s ep %p\n", __func__, ep); + BUG_ON(!ep); + + /* The cm_id may be null if we failed to connect */ + spin_lock_irqsave(&ep->com.lock, flags); + switch (ep->com.state) { + case CLOSING: + __state_set(&ep->com, MORIBUND); + break; + case MORIBUND: + stop_ep_timer(ep); + if ((ep->com.cm_id) && (ep->com.qp)) { + attrs.next_state = IWCH_QP_STATE_IDLE; + iwch_modify_qp(ep->com.qp->rhp, + ep->com.qp, + IWCH_QP_ATTR_NEXT_STATE, + &attrs, 1); + } + close_complete_upcall(ep); + __state_set(&ep->com, DEAD); + release = 1; + break; + case ABORTING: + case DEAD: + break; + default: + BUG_ON(1); + break; + } + spin_unlock_irqrestore(&ep->com.lock, flags); + if (release) + release_ep_resources(ep); + return CPL_RET_BUF_DONE; +} + +/* + * T3A does 3 things when a TERM is received: + * 1) send up a CPL_RDMA_TERMINATE message with the TERM packet + * 2) generate an async event on the QP with the TERMINATE opcode + * 3) post a TERMINATE opcode cqe into the associated CQ. + * + * For (1), we save the message in the qp for later consumer consumption. + * For (2), we move the QP into TERMINATE, post a QP event and disconnect. + * For (3), we toss the CQE in cxio_poll_cq(). + * + * terminate() handles case (1)... + */ +static int terminate(struct t3cdev *tdev, struct sk_buff *skb, void *ctx) +{ + struct iwch_ep *ep = ctx; + + if (state_read(&ep->com) != FPDU_MODE) + return CPL_RET_BUF_DONE; + + PDBG("%s ep %p\n", __func__, ep); + skb_pull(skb, sizeof(struct cpl_rdma_terminate)); + PDBG("%s saving %d bytes of term msg\n", __func__, skb->len); + skb_copy_from_linear_data(skb, ep->com.qp->attr.terminate_buffer, + skb->len); + ep->com.qp->attr.terminate_msg_len = skb->len; + ep->com.qp->attr.is_terminate_local = 0; + return CPL_RET_BUF_DONE; +} + +static int ec_status(struct t3cdev *tdev, struct sk_buff *skb, void *ctx) +{ + struct cpl_rdma_ec_status *rep = cplhdr(skb); + struct iwch_ep *ep = ctx; + + PDBG("%s ep %p tid %u status %d\n", __func__, ep, ep->hwtid, + rep->status); + if (rep->status) { + struct iwch_qp_attributes attrs; + + printk(KERN_ERR MOD "%s BAD CLOSE - Aborting tid %u\n", + __func__, ep->hwtid); + stop_ep_timer(ep); + attrs.next_state = IWCH_QP_STATE_ERROR; + iwch_modify_qp(ep->com.qp->rhp, + ep->com.qp, IWCH_QP_ATTR_NEXT_STATE, + &attrs, 1); + abort_connection(ep, NULL, GFP_KERNEL); + } + return CPL_RET_BUF_DONE; +} + +static void ep_timeout(unsigned long arg) +{ + struct iwch_ep *ep = (struct iwch_ep *)arg; + struct iwch_qp_attributes attrs; + unsigned long flags; + int abort = 1; + + spin_lock_irqsave(&ep->com.lock, flags); + PDBG("%s ep %p tid %u state %d\n", __func__, ep, ep->hwtid, + ep->com.state); + switch (ep->com.state) { + case MPA_REQ_SENT: + __state_set(&ep->com, ABORTING); + connect_reply_upcall(ep, -ETIMEDOUT); + break; + case MPA_REQ_WAIT: + __state_set(&ep->com, ABORTING); + break; + case CLOSING: + case MORIBUND: + if (ep->com.cm_id && ep->com.qp) { + attrs.next_state = IWCH_QP_STATE_ERROR; + iwch_modify_qp(ep->com.qp->rhp, + ep->com.qp, IWCH_QP_ATTR_NEXT_STATE, + &attrs, 1); + } + __state_set(&ep->com, ABORTING); + break; + default: + WARN(1, "%s unexpected state ep %p state %u\n", + __func__, ep, ep->com.state); + abort = 0; + } + spin_unlock_irqrestore(&ep->com.lock, flags); + if (abort) + abort_connection(ep, NULL, GFP_ATOMIC); + put_ep(&ep->com); +} + +int iwch_reject_cr(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len) +{ + int err; + struct iwch_ep *ep = to_ep(cm_id); + PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid); + + if (state_read(&ep->com) == DEAD) { + put_ep(&ep->com); + return -ECONNRESET; + } + BUG_ON(state_read(&ep->com) != MPA_REQ_RCVD); + if (mpa_rev == 0) + abort_connection(ep, NULL, GFP_KERNEL); + else { + err = send_mpa_reject(ep, pdata, pdata_len); + err = iwch_ep_disconnect(ep, 0, GFP_KERNEL); + } + put_ep(&ep->com); + return 0; +} + +int iwch_accept_cr(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) +{ + int err; + struct iwch_qp_attributes attrs; + enum iwch_qp_attr_mask mask; + struct iwch_ep *ep = to_ep(cm_id); + struct iwch_dev *h = to_iwch_dev(cm_id->device); + struct iwch_qp *qp = get_qhp(h, conn_param->qpn); + + PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid); + if (state_read(&ep->com) == DEAD) { + err = -ECONNRESET; + goto err; + } + + BUG_ON(state_read(&ep->com) != MPA_REQ_RCVD); + BUG_ON(!qp); + + if ((conn_param->ord > qp->rhp->attr.max_rdma_read_qp_depth) || + (conn_param->ird > qp->rhp->attr.max_rdma_reads_per_qp)) { + abort_connection(ep, NULL, GFP_KERNEL); + err = -EINVAL; + goto err; + } + + cm_id->add_ref(cm_id); + ep->com.cm_id = cm_id; + ep->com.qp = qp; + + ep->ird = conn_param->ird; + ep->ord = conn_param->ord; + + if (peer2peer && ep->ird == 0) + ep->ird = 1; + + PDBG("%s %d ird %d ord %d\n", __func__, __LINE__, ep->ird, ep->ord); + + /* bind QP to EP and move to RTS */ + attrs.mpa_attr = ep->mpa_attr; + attrs.max_ird = ep->ird; + attrs.max_ord = ep->ord; + attrs.llp_stream_handle = ep; + attrs.next_state = IWCH_QP_STATE_RTS; + + /* bind QP and TID with INIT_WR */ + mask = IWCH_QP_ATTR_NEXT_STATE | + IWCH_QP_ATTR_LLP_STREAM_HANDLE | + IWCH_QP_ATTR_MPA_ATTR | + IWCH_QP_ATTR_MAX_IRD | + IWCH_QP_ATTR_MAX_ORD; + + err = iwch_modify_qp(ep->com.qp->rhp, + ep->com.qp, mask, &attrs, 1); + if (err) + goto err1; + + /* if needed, wait for wr_ack */ + if (iwch_rqes_posted(qp)) { + wait_event(ep->com.waitq, ep->com.rpl_done); + err = ep->com.rpl_err; + if (err) + goto err1; + } + + err = send_mpa_reply(ep, conn_param->private_data, + conn_param->private_data_len); + if (err) + goto err1; + + + state_set(&ep->com, FPDU_MODE); + established_upcall(ep); + put_ep(&ep->com); + return 0; +err1: + ep->com.cm_id = NULL; + ep->com.qp = NULL; + cm_id->rem_ref(cm_id); +err: + put_ep(&ep->com); + return err; +} + +static int is_loopback_dst(struct iw_cm_id *cm_id) +{ + struct net_device *dev; + struct sockaddr_in *raddr = (struct sockaddr_in *)&cm_id->remote_addr; + + dev = ip_dev_find(&init_net, raddr->sin_addr.s_addr); + if (!dev) + return 0; + dev_put(dev); + return 1; +} + +int iwch_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) +{ + struct iwch_dev *h = to_iwch_dev(cm_id->device); + struct iwch_ep *ep; + struct rtable *rt; + int err = 0; + struct sockaddr_in *laddr = (struct sockaddr_in *)&cm_id->local_addr; + struct sockaddr_in *raddr = (struct sockaddr_in *)&cm_id->remote_addr; + + if (cm_id->remote_addr.ss_family != PF_INET) { + err = -ENOSYS; + goto out; + } + + if (is_loopback_dst(cm_id)) { + err = -ENOSYS; + goto out; + } + + ep = alloc_ep(sizeof(*ep), GFP_KERNEL); + if (!ep) { + printk(KERN_ERR MOD "%s - cannot alloc ep.\n", __func__); + err = -ENOMEM; + goto out; + } + init_timer(&ep->timer); + ep->plen = conn_param->private_data_len; + if (ep->plen) + memcpy(ep->mpa_pkt + sizeof(struct mpa_message), + conn_param->private_data, ep->plen); + ep->ird = conn_param->ird; + ep->ord = conn_param->ord; + + if (peer2peer && ep->ord == 0) + ep->ord = 1; + + ep->com.tdev = h->rdev.t3cdev_p; + + cm_id->add_ref(cm_id); + ep->com.cm_id = cm_id; + ep->com.qp = get_qhp(h, conn_param->qpn); + BUG_ON(!ep->com.qp); + PDBG("%s qpn 0x%x qp %p cm_id %p\n", __func__, conn_param->qpn, + ep->com.qp, cm_id); + + /* + * Allocate an active TID to initiate a TCP connection. + */ + ep->atid = cxgb3_alloc_atid(h->rdev.t3cdev_p, &t3c_client, ep); + if (ep->atid == -1) { + printk(KERN_ERR MOD "%s - cannot alloc atid.\n", __func__); + err = -ENOMEM; + goto fail2; + } + + /* find a route */ + rt = find_route(h->rdev.t3cdev_p, laddr->sin_addr.s_addr, + raddr->sin_addr.s_addr, laddr->sin_port, + raddr->sin_port, IPTOS_LOWDELAY); + if (!rt) { + printk(KERN_ERR MOD "%s - cannot find route.\n", __func__); + err = -EHOSTUNREACH; + goto fail3; + } + ep->dst = &rt->dst; + ep->l2t = t3_l2t_get(ep->com.tdev, ep->dst, NULL, + &raddr->sin_addr.s_addr); + if (!ep->l2t) { + printk(KERN_ERR MOD "%s - cannot alloc l2e.\n", __func__); + err = -ENOMEM; + goto fail4; + } + + state_set(&ep->com, CONNECTING); + ep->tos = IPTOS_LOWDELAY; + memcpy(&ep->com.local_addr, &cm_id->local_addr, + sizeof(ep->com.local_addr)); + memcpy(&ep->com.remote_addr, &cm_id->remote_addr, + sizeof(ep->com.remote_addr)); + + /* send connect request to rnic */ + err = send_connect(ep); + if (!err) + goto out; + + l2t_release(h->rdev.t3cdev_p, ep->l2t); +fail4: + dst_release(ep->dst); +fail3: + cxgb3_free_atid(ep->com.tdev, ep->atid); +fail2: + cm_id->rem_ref(cm_id); + put_ep(&ep->com); +out: + return err; +} + +int iwch_create_listen(struct iw_cm_id *cm_id, int backlog) +{ + int err = 0; + struct iwch_dev *h = to_iwch_dev(cm_id->device); + struct iwch_listen_ep *ep; + + + might_sleep(); + + if (cm_id->local_addr.ss_family != PF_INET) { + err = -ENOSYS; + goto fail1; + } + + ep = alloc_ep(sizeof(*ep), GFP_KERNEL); + if (!ep) { + printk(KERN_ERR MOD "%s - cannot alloc ep.\n", __func__); + err = -ENOMEM; + goto fail1; + } + PDBG("%s ep %p\n", __func__, ep); + ep->com.tdev = h->rdev.t3cdev_p; + cm_id->add_ref(cm_id); + ep->com.cm_id = cm_id; + ep->backlog = backlog; + memcpy(&ep->com.local_addr, &cm_id->local_addr, + sizeof(ep->com.local_addr)); + + /* + * Allocate a server TID. + */ + ep->stid = cxgb3_alloc_stid(h->rdev.t3cdev_p, &t3c_client, ep); + if (ep->stid == -1) { + printk(KERN_ERR MOD "%s - cannot alloc atid.\n", __func__); + err = -ENOMEM; + goto fail2; + } + + state_set(&ep->com, LISTEN); + err = listen_start(ep); + if (err) + goto fail3; + + /* wait for pass_open_rpl */ + wait_event(ep->com.waitq, ep->com.rpl_done); + err = ep->com.rpl_err; + if (!err) { + cm_id->provider_data = ep; + goto out; + } +fail3: + cxgb3_free_stid(ep->com.tdev, ep->stid); +fail2: + cm_id->rem_ref(cm_id); + put_ep(&ep->com); +fail1: +out: + return err; +} + +int iwch_destroy_listen(struct iw_cm_id *cm_id) +{ + int err; + struct iwch_listen_ep *ep = to_listen_ep(cm_id); + + PDBG("%s ep %p\n", __func__, ep); + + might_sleep(); + state_set(&ep->com, DEAD); + ep->com.rpl_done = 0; + ep->com.rpl_err = 0; + err = listen_stop(ep); + if (err) + goto done; + wait_event(ep->com.waitq, ep->com.rpl_done); + cxgb3_free_stid(ep->com.tdev, ep->stid); +done: + err = ep->com.rpl_err; + cm_id->rem_ref(cm_id); + put_ep(&ep->com); + return err; +} + +int iwch_ep_disconnect(struct iwch_ep *ep, int abrupt, gfp_t gfp) +{ + int ret=0; + unsigned long flags; + int close = 0; + int fatal = 0; + struct t3cdev *tdev; + struct cxio_rdev *rdev; + + spin_lock_irqsave(&ep->com.lock, flags); + + PDBG("%s ep %p state %s, abrupt %d\n", __func__, ep, + states[ep->com.state], abrupt); + + tdev = (struct t3cdev *)ep->com.tdev; + rdev = (struct cxio_rdev *)tdev->ulp; + if (cxio_fatal_error(rdev)) { + fatal = 1; + close_complete_upcall(ep); + ep->com.state = DEAD; + } + switch (ep->com.state) { + case MPA_REQ_WAIT: + case MPA_REQ_SENT: + case MPA_REQ_RCVD: + case MPA_REP_SENT: + case FPDU_MODE: + close = 1; + if (abrupt) + ep->com.state = ABORTING; + else { + ep->com.state = CLOSING; + start_ep_timer(ep); + } + set_bit(CLOSE_SENT, &ep->com.flags); + break; + case CLOSING: + if (!test_and_set_bit(CLOSE_SENT, &ep->com.flags)) { + close = 1; + if (abrupt) { + stop_ep_timer(ep); + ep->com.state = ABORTING; + } else + ep->com.state = MORIBUND; + } + break; + case MORIBUND: + case ABORTING: + case DEAD: + PDBG("%s ignoring disconnect ep %p state %u\n", + __func__, ep, ep->com.state); + break; + default: + BUG(); + break; + } + + spin_unlock_irqrestore(&ep->com.lock, flags); + if (close) { + if (abrupt) + ret = send_abort(ep, NULL, gfp); + else + ret = send_halfclose(ep, gfp); + if (ret) + fatal = 1; + } + if (fatal) + release_ep_resources(ep); + return ret; +} + +int iwch_ep_redirect(void *ctx, struct dst_entry *old, struct dst_entry *new, + struct l2t_entry *l2t) +{ + struct iwch_ep *ep = ctx; + + if (ep->dst != old) + return 0; + + PDBG("%s ep %p redirect to dst %p l2t %p\n", __func__, ep, new, + l2t); + dst_hold(new); + l2t_release(ep->com.tdev, ep->l2t); + ep->l2t = l2t; + dst_release(old); + ep->dst = new; + return 1; +} + +/* + * All the CM events are handled on a work queue to have a safe context. + * These are the real handlers that are called from the work queue. + */ +static const cxgb3_cpl_handler_func work_handlers[NUM_CPL_CMDS] = { + [CPL_ACT_ESTABLISH] = act_establish, + [CPL_ACT_OPEN_RPL] = act_open_rpl, + [CPL_RX_DATA] = rx_data, + [CPL_TX_DMA_ACK] = tx_ack, + [CPL_ABORT_RPL_RSS] = abort_rpl, + [CPL_ABORT_RPL] = abort_rpl, + [CPL_PASS_OPEN_RPL] = pass_open_rpl, + [CPL_CLOSE_LISTSRV_RPL] = close_listsrv_rpl, + [CPL_PASS_ACCEPT_REQ] = pass_accept_req, + [CPL_PASS_ESTABLISH] = pass_establish, + [CPL_PEER_CLOSE] = peer_close, + [CPL_ABORT_REQ_RSS] = peer_abort, + [CPL_CLOSE_CON_RPL] = close_con_rpl, + [CPL_RDMA_TERMINATE] = terminate, + [CPL_RDMA_EC_STATUS] = ec_status, +}; + +static void process_work(struct work_struct *work) +{ + struct sk_buff *skb = NULL; + void *ep; + struct t3cdev *tdev; + int ret; + + while ((skb = skb_dequeue(&rxq))) { + ep = *((void **) (skb->cb)); + tdev = *((struct t3cdev **) (skb->cb + sizeof(void *))); + ret = work_handlers[G_OPCODE(ntohl((__force __be32)skb->csum))](tdev, skb, ep); + if (ret & CPL_RET_BUF_DONE) + kfree_skb(skb); + + /* + * ep was referenced in sched(), and is freed here. + */ + put_ep((struct iwch_ep_common *)ep); + } +} + +static DECLARE_WORK(skb_work, process_work); + +static int sched(struct t3cdev *tdev, struct sk_buff *skb, void *ctx) +{ + struct iwch_ep_common *epc = ctx; + + get_ep(epc); + + /* + * Save ctx and tdev in the skb->cb area. + */ + *((void **) skb->cb) = ctx; + *((struct t3cdev **) (skb->cb + sizeof(void *))) = tdev; + + /* + * Queue the skb and schedule the worker thread. + */ + skb_queue_tail(&rxq, skb); + queue_work(workq, &skb_work); + return 0; +} + +static int set_tcb_rpl(struct t3cdev *tdev, struct sk_buff *skb, void *ctx) +{ + struct cpl_set_tcb_rpl *rpl = cplhdr(skb); + + if (rpl->status != CPL_ERR_NONE) { + printk(KERN_ERR MOD "Unexpected SET_TCB_RPL status %u " + "for tid %u\n", rpl->status, GET_TID(rpl)); + } + return CPL_RET_BUF_DONE; +} + +/* + * All upcalls from the T3 Core go to sched() to schedule the + * processing on a work queue. + */ +cxgb3_cpl_handler_func t3c_handlers[NUM_CPL_CMDS] = { + [CPL_ACT_ESTABLISH] = sched, + [CPL_ACT_OPEN_RPL] = sched, + [CPL_RX_DATA] = sched, + [CPL_TX_DMA_ACK] = sched, + [CPL_ABORT_RPL_RSS] = sched, + [CPL_ABORT_RPL] = sched, + [CPL_PASS_OPEN_RPL] = sched, + [CPL_CLOSE_LISTSRV_RPL] = sched, + [CPL_PASS_ACCEPT_REQ] = sched, + [CPL_PASS_ESTABLISH] = sched, + [CPL_PEER_CLOSE] = sched, + [CPL_CLOSE_CON_RPL] = sched, + [CPL_ABORT_REQ_RSS] = sched, + [CPL_RDMA_TERMINATE] = sched, + [CPL_RDMA_EC_STATUS] = sched, + [CPL_SET_TCB_RPL] = set_tcb_rpl, +}; + +int __init iwch_cm_init(void) +{ + skb_queue_head_init(&rxq); + + workq = create_singlethread_workqueue("iw_cxgb3"); + if (!workq) + return -ENOMEM; + + return 0; +} + +void __exit iwch_cm_term(void) +{ + flush_workqueue(workq); + destroy_workqueue(workq); +} diff --git a/drivers/infiniband/hw/cxgb3/iwch_cm.h b/drivers/infiniband/hw/cxgb3/iwch_cm.h new file mode 100644 index 0000000..b9efadf --- /dev/null +++ b/drivers/infiniband/hw/cxgb3/iwch_cm.h @@ -0,0 +1,233 @@ +/* + * Copyright (c) 2006 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef _IWCH_CM_H_ +#define _IWCH_CM_H_ + +#include +#include +#include +#include + +#include +#include + +#include "cxgb3_offload.h" +#include "iwch_provider.h" + +#define MPA_KEY_REQ "MPA ID Req Frame" +#define MPA_KEY_REP "MPA ID Rep Frame" + +#define MPA_MAX_PRIVATE_DATA 256 +#define MPA_REV 0 /* XXX - amso1100 uses rev 0 ! */ +#define MPA_REJECT 0x20 +#define MPA_CRC 0x40 +#define MPA_MARKERS 0x80 +#define MPA_FLAGS_MASK 0xE0 + +#define put_ep(ep) { \ + PDBG("put_ep (via %s:%u) ep %p refcnt %d\n", __func__, __LINE__, \ + ep, atomic_read(&((ep)->kref.refcount))); \ + WARN_ON(atomic_read(&((ep)->kref.refcount)) < 1); \ + kref_put(&((ep)->kref), __free_ep); \ +} + +#define get_ep(ep) { \ + PDBG("get_ep (via %s:%u) ep %p, refcnt %d\n", __func__, __LINE__, \ + ep, atomic_read(&((ep)->kref.refcount))); \ + kref_get(&((ep)->kref)); \ +} + +struct mpa_message { + u8 key[16]; + u8 flags; + u8 revision; + __be16 private_data_size; + u8 private_data[0]; +}; + +struct terminate_message { + u8 layer_etype; + u8 ecode; + __be16 hdrct_rsvd; + u8 len_hdrs[0]; +}; + +#define TERM_MAX_LENGTH (sizeof(struct terminate_message) + 2 + 18 + 28) + +enum iwch_layers_types { + LAYER_RDMAP = 0x00, + LAYER_DDP = 0x10, + LAYER_MPA = 0x20, + RDMAP_LOCAL_CATA = 0x00, + RDMAP_REMOTE_PROT = 0x01, + RDMAP_REMOTE_OP = 0x02, + DDP_LOCAL_CATA = 0x00, + DDP_TAGGED_ERR = 0x01, + DDP_UNTAGGED_ERR = 0x02, + DDP_LLP = 0x03 +}; + +enum iwch_rdma_ecodes { + RDMAP_INV_STAG = 0x00, + RDMAP_BASE_BOUNDS = 0x01, + RDMAP_ACC_VIOL = 0x02, + RDMAP_STAG_NOT_ASSOC = 0x03, + RDMAP_TO_WRAP = 0x04, + RDMAP_INV_VERS = 0x05, + RDMAP_INV_OPCODE = 0x06, + RDMAP_STREAM_CATA = 0x07, + RDMAP_GLOBAL_CATA = 0x08, + RDMAP_CANT_INV_STAG = 0x09, + RDMAP_UNSPECIFIED = 0xff +}; + +enum iwch_ddp_ecodes { + DDPT_INV_STAG = 0x00, + DDPT_BASE_BOUNDS = 0x01, + DDPT_STAG_NOT_ASSOC = 0x02, + DDPT_TO_WRAP = 0x03, + DDPT_INV_VERS = 0x04, + DDPU_INV_QN = 0x01, + DDPU_INV_MSN_NOBUF = 0x02, + DDPU_INV_MSN_RANGE = 0x03, + DDPU_INV_MO = 0x04, + DDPU_MSG_TOOBIG = 0x05, + DDPU_INV_VERS = 0x06 +}; + +enum iwch_mpa_ecodes { + MPA_CRC_ERR = 0x02, + MPA_MARKER_ERR = 0x03 +}; + +enum iwch_ep_state { + IDLE = 0, + LISTEN, + CONNECTING, + MPA_REQ_WAIT, + MPA_REQ_SENT, + MPA_REQ_RCVD, + MPA_REP_SENT, + FPDU_MODE, + ABORTING, + CLOSING, + MORIBUND, + DEAD, +}; + +enum iwch_ep_flags { + PEER_ABORT_IN_PROGRESS = 0, + ABORT_REQ_IN_PROGRESS = 1, + RELEASE_RESOURCES = 2, + CLOSE_SENT = 3, +}; + +struct iwch_ep_common { + struct iw_cm_id *cm_id; + struct iwch_qp *qp; + struct t3cdev *tdev; + enum iwch_ep_state state; + struct kref kref; + spinlock_t lock; + struct sockaddr_in local_addr; + struct sockaddr_in remote_addr; + wait_queue_head_t waitq; + int rpl_done; + int rpl_err; + unsigned long flags; +}; + +struct iwch_listen_ep { + struct iwch_ep_common com; + unsigned int stid; + int backlog; +}; + +struct iwch_ep { + struct iwch_ep_common com; + struct iwch_ep *parent_ep; + struct timer_list timer; + unsigned int atid; + u32 hwtid; + u32 snd_seq; + u32 rcv_seq; + struct l2t_entry *l2t; + struct dst_entry *dst; + struct sk_buff *mpa_skb; + struct iwch_mpa_attributes mpa_attr; + unsigned int mpa_pkt_len; + u8 mpa_pkt[sizeof(struct mpa_message) + MPA_MAX_PRIVATE_DATA]; + u8 tos; + u16 emss; + u16 plen; + u32 ird; + u32 ord; +}; + +static inline struct iwch_ep *to_ep(struct iw_cm_id *cm_id) +{ + return cm_id->provider_data; +} + +static inline struct iwch_listen_ep *to_listen_ep(struct iw_cm_id *cm_id) +{ + return cm_id->provider_data; +} + +static inline int compute_wscale(int win) +{ + int wscale = 0; + + while (wscale < 14 && (65535<cq); + + if (!rd_cqe) + return 0; + + qhp = get_qhp(rhp, CQE_QPID(*rd_cqe)); + if (!qhp) + wq = NULL; + else { + spin_lock(&qhp->lock); + wq = &(qhp->wq); + } + ret = cxio_poll_cq(wq, &(chp->cq), &cqe, &cqe_flushed, &cookie, + &credit); + if (t3a_device(chp->rhp) && credit) { + PDBG("%s updating %d cq credits on id %d\n", __func__, + credit, chp->cq.cqid); + cxio_hal_cq_op(&rhp->rdev, &chp->cq, CQ_CREDIT_UPDATE, credit); + } + + if (ret) { + ret = -EAGAIN; + goto out; + } + ret = 1; + + wc->wr_id = cookie; + wc->qp = &qhp->ibqp; + wc->vendor_err = CQE_STATUS(cqe); + wc->wc_flags = 0; + + PDBG("%s qpid 0x%x type %d opcode %d status 0x%x wrid hi 0x%x " + "lo 0x%x cookie 0x%llx\n", __func__, + CQE_QPID(cqe), CQE_TYPE(cqe), + CQE_OPCODE(cqe), CQE_STATUS(cqe), CQE_WRID_HI(cqe), + CQE_WRID_LOW(cqe), (unsigned long long) cookie); + + if (CQE_TYPE(cqe) == 0) { + if (!CQE_STATUS(cqe)) + wc->byte_len = CQE_LEN(cqe); + else + wc->byte_len = 0; + wc->opcode = IB_WC_RECV; + if (CQE_OPCODE(cqe) == T3_SEND_WITH_INV || + CQE_OPCODE(cqe) == T3_SEND_WITH_SE_INV) { + wc->ex.invalidate_rkey = CQE_WRID_STAG(cqe); + wc->wc_flags |= IB_WC_WITH_INVALIDATE; + } + } else { + switch (CQE_OPCODE(cqe)) { + case T3_RDMA_WRITE: + wc->opcode = IB_WC_RDMA_WRITE; + break; + case T3_READ_REQ: + wc->opcode = IB_WC_RDMA_READ; + wc->byte_len = CQE_LEN(cqe); + break; + case T3_SEND: + case T3_SEND_WITH_SE: + case T3_SEND_WITH_INV: + case T3_SEND_WITH_SE_INV: + wc->opcode = IB_WC_SEND; + break; + case T3_BIND_MW: + wc->opcode = IB_WC_BIND_MW; + break; + + case T3_LOCAL_INV: + wc->opcode = IB_WC_LOCAL_INV; + break; + case T3_FAST_REGISTER: + wc->opcode = IB_WC_FAST_REG_MR; + break; + default: + printk(KERN_ERR MOD "Unexpected opcode %d " + "in the CQE received for QPID=0x%0x\n", + CQE_OPCODE(cqe), CQE_QPID(cqe)); + ret = -EINVAL; + goto out; + } + } + + if (cqe_flushed) + wc->status = IB_WC_WR_FLUSH_ERR; + else { + + switch (CQE_STATUS(cqe)) { + case TPT_ERR_SUCCESS: + wc->status = IB_WC_SUCCESS; + break; + case TPT_ERR_STAG: + wc->status = IB_WC_LOC_ACCESS_ERR; + break; + case TPT_ERR_PDID: + wc->status = IB_WC_LOC_PROT_ERR; + break; + case TPT_ERR_QPID: + case TPT_ERR_ACCESS: + wc->status = IB_WC_LOC_ACCESS_ERR; + break; + case TPT_ERR_WRAP: + wc->status = IB_WC_GENERAL_ERR; + break; + case TPT_ERR_BOUND: + wc->status = IB_WC_LOC_LEN_ERR; + break; + case TPT_ERR_INVALIDATE_SHARED_MR: + case TPT_ERR_INVALIDATE_MR_WITH_MW_BOUND: + wc->status = IB_WC_MW_BIND_ERR; + break; + case TPT_ERR_CRC: + case TPT_ERR_MARKER: + case TPT_ERR_PDU_LEN_ERR: + case TPT_ERR_OUT_OF_RQE: + case TPT_ERR_DDP_VERSION: + case TPT_ERR_RDMA_VERSION: + case TPT_ERR_DDP_QUEUE_NUM: + case TPT_ERR_MSN: + case TPT_ERR_TBIT: + case TPT_ERR_MO: + case TPT_ERR_MSN_RANGE: + case TPT_ERR_IRD_OVERFLOW: + case TPT_ERR_OPCODE: + wc->status = IB_WC_FATAL_ERR; + break; + case TPT_ERR_SWFLUSH: + wc->status = IB_WC_WR_FLUSH_ERR; + break; + default: + printk(KERN_ERR MOD "Unexpected cqe_status 0x%x for " + "QPID=0x%0x\n", CQE_STATUS(cqe), CQE_QPID(cqe)); + ret = -EINVAL; + } + } +out: + if (wq) + spin_unlock(&qhp->lock); + return ret; +} + +int iwch_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc) +{ + struct iwch_dev *rhp; + struct iwch_cq *chp; + unsigned long flags; + int npolled; + int err = 0; + + chp = to_iwch_cq(ibcq); + rhp = chp->rhp; + + spin_lock_irqsave(&chp->lock, flags); + for (npolled = 0; npolled < num_entries; ++npolled) { +#ifdef DEBUG + int i=0; +#endif + + /* + * Because T3 can post CQEs that are _not_ associated + * with a WR, we might have to poll again after removing + * one of these. + */ + do { + err = iwch_poll_cq_one(rhp, chp, wc + npolled); +#ifdef DEBUG + BUG_ON(++i > 1000); +#endif + } while (err == -EAGAIN); + if (err <= 0) + break; + } + spin_unlock_irqrestore(&chp->lock, flags); + + if (err < 0) + return err; + else { + return npolled; + } +} diff --git a/drivers/infiniband/hw/cxgb3/iwch_ev.c b/drivers/infiniband/hw/cxgb3/iwch_ev.c new file mode 100644 index 0000000..abcc9e7 --- /dev/null +++ b/drivers/infiniband/hw/cxgb3/iwch_ev.c @@ -0,0 +1,232 @@ +/* + * Copyright (c) 2006 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include +#include +#include +#include "iwch_provider.h" +#include "iwch.h" +#include "iwch_cm.h" +#include "cxio_hal.h" +#include "cxio_wr.h" + +static void post_qp_event(struct iwch_dev *rnicp, struct iwch_cq *chp, + struct respQ_msg_t *rsp_msg, + enum ib_event_type ib_event, + int send_term) +{ + struct ib_event event; + struct iwch_qp_attributes attrs; + struct iwch_qp *qhp; + unsigned long flag; + + spin_lock(&rnicp->lock); + qhp = get_qhp(rnicp, CQE_QPID(rsp_msg->cqe)); + + if (!qhp) { + printk(KERN_ERR "%s unaffiliated error 0x%x qpid 0x%x\n", + __func__, CQE_STATUS(rsp_msg->cqe), + CQE_QPID(rsp_msg->cqe)); + spin_unlock(&rnicp->lock); + return; + } + + if ((qhp->attr.state == IWCH_QP_STATE_ERROR) || + (qhp->attr.state == IWCH_QP_STATE_TERMINATE)) { + PDBG("%s AE received after RTS - " + "qp state %d qpid 0x%x status 0x%x\n", __func__, + qhp->attr.state, qhp->wq.qpid, CQE_STATUS(rsp_msg->cqe)); + spin_unlock(&rnicp->lock); + return; + } + + printk(KERN_ERR "%s - AE qpid 0x%x opcode %d status 0x%x " + "type %d wrid.hi 0x%x wrid.lo 0x%x \n", __func__, + CQE_QPID(rsp_msg->cqe), CQE_OPCODE(rsp_msg->cqe), + CQE_STATUS(rsp_msg->cqe), CQE_TYPE(rsp_msg->cqe), + CQE_WRID_HI(rsp_msg->cqe), CQE_WRID_LOW(rsp_msg->cqe)); + + atomic_inc(&qhp->refcnt); + spin_unlock(&rnicp->lock); + + if (qhp->attr.state == IWCH_QP_STATE_RTS) { + attrs.next_state = IWCH_QP_STATE_TERMINATE; + iwch_modify_qp(qhp->rhp, qhp, IWCH_QP_ATTR_NEXT_STATE, + &attrs, 1); + if (send_term) + iwch_post_terminate(qhp, rsp_msg); + } + + event.event = ib_event; + event.device = chp->ibcq.device; + if (ib_event == IB_EVENT_CQ_ERR) + event.element.cq = &chp->ibcq; + else + event.element.qp = &qhp->ibqp; + + if (qhp->ibqp.event_handler) + (*qhp->ibqp.event_handler)(&event, qhp->ibqp.qp_context); + + spin_lock_irqsave(&chp->comp_handler_lock, flag); + (*chp->ibcq.comp_handler)(&chp->ibcq, chp->ibcq.cq_context); + spin_unlock_irqrestore(&chp->comp_handler_lock, flag); + + if (atomic_dec_and_test(&qhp->refcnt)) + wake_up(&qhp->wait); +} + +void iwch_ev_dispatch(struct cxio_rdev *rdev_p, struct sk_buff *skb) +{ + struct iwch_dev *rnicp; + struct respQ_msg_t *rsp_msg = (struct respQ_msg_t *) skb->data; + struct iwch_cq *chp; + struct iwch_qp *qhp; + u32 cqid = RSPQ_CQID(rsp_msg); + unsigned long flag; + + rnicp = (struct iwch_dev *) rdev_p->ulp; + spin_lock(&rnicp->lock); + chp = get_chp(rnicp, cqid); + qhp = get_qhp(rnicp, CQE_QPID(rsp_msg->cqe)); + if (!chp || !qhp) { + printk(KERN_ERR MOD "BAD AE cqid 0x%x qpid 0x%x opcode %d " + "status 0x%x type %d wrid.hi 0x%x wrid.lo 0x%x \n", + cqid, CQE_QPID(rsp_msg->cqe), + CQE_OPCODE(rsp_msg->cqe), CQE_STATUS(rsp_msg->cqe), + CQE_TYPE(rsp_msg->cqe), CQE_WRID_HI(rsp_msg->cqe), + CQE_WRID_LOW(rsp_msg->cqe)); + spin_unlock(&rnicp->lock); + goto out; + } + iwch_qp_add_ref(&qhp->ibqp); + atomic_inc(&chp->refcnt); + spin_unlock(&rnicp->lock); + + /* + * 1) completion of our sending a TERMINATE. + * 2) incoming TERMINATE message. + */ + if ((CQE_OPCODE(rsp_msg->cqe) == T3_TERMINATE) && + (CQE_STATUS(rsp_msg->cqe) == 0)) { + if (SQ_TYPE(rsp_msg->cqe)) { + PDBG("%s QPID 0x%x ep %p disconnecting\n", + __func__, qhp->wq.qpid, qhp->ep); + iwch_ep_disconnect(qhp->ep, 0, GFP_ATOMIC); + } else { + PDBG("%s post REQ_ERR AE QPID 0x%x\n", __func__, + qhp->wq.qpid); + post_qp_event(rnicp, chp, rsp_msg, + IB_EVENT_QP_REQ_ERR, 0); + iwch_ep_disconnect(qhp->ep, 0, GFP_ATOMIC); + } + goto done; + } + + /* Bad incoming Read request */ + if (SQ_TYPE(rsp_msg->cqe) && + (CQE_OPCODE(rsp_msg->cqe) == T3_READ_RESP)) { + post_qp_event(rnicp, chp, rsp_msg, IB_EVENT_QP_REQ_ERR, 1); + goto done; + } + + /* Bad incoming write */ + if (RQ_TYPE(rsp_msg->cqe) && + (CQE_OPCODE(rsp_msg->cqe) == T3_RDMA_WRITE)) { + post_qp_event(rnicp, chp, rsp_msg, IB_EVENT_QP_REQ_ERR, 1); + goto done; + } + + switch (CQE_STATUS(rsp_msg->cqe)) { + + /* Completion Events */ + case TPT_ERR_SUCCESS: + + /* + * Confirm the destination entry if this is a RECV completion. + */ + if (qhp->ep && SQ_TYPE(rsp_msg->cqe)) + dst_confirm(qhp->ep->dst); + spin_lock_irqsave(&chp->comp_handler_lock, flag); + (*chp->ibcq.comp_handler)(&chp->ibcq, chp->ibcq.cq_context); + spin_unlock_irqrestore(&chp->comp_handler_lock, flag); + break; + + case TPT_ERR_STAG: + case TPT_ERR_PDID: + case TPT_ERR_QPID: + case TPT_ERR_ACCESS: + case TPT_ERR_WRAP: + case TPT_ERR_BOUND: + case TPT_ERR_INVALIDATE_SHARED_MR: + case TPT_ERR_INVALIDATE_MR_WITH_MW_BOUND: + post_qp_event(rnicp, chp, rsp_msg, IB_EVENT_QP_ACCESS_ERR, 1); + break; + + /* Device Fatal Errors */ + case TPT_ERR_ECC: + case TPT_ERR_ECC_PSTAG: + case TPT_ERR_INTERNAL_ERR: + post_qp_event(rnicp, chp, rsp_msg, IB_EVENT_DEVICE_FATAL, 1); + break; + + /* QP Fatal Errors */ + case TPT_ERR_OUT_OF_RQE: + case TPT_ERR_PBL_ADDR_BOUND: + case TPT_ERR_CRC: + case TPT_ERR_MARKER: + case TPT_ERR_PDU_LEN_ERR: + case TPT_ERR_DDP_VERSION: + case TPT_ERR_RDMA_VERSION: + case TPT_ERR_OPCODE: + case TPT_ERR_DDP_QUEUE_NUM: + case TPT_ERR_MSN: + case TPT_ERR_TBIT: + case TPT_ERR_MO: + case TPT_ERR_MSN_GAP: + case TPT_ERR_MSN_RANGE: + case TPT_ERR_RQE_ADDR_BOUND: + case TPT_ERR_IRD_OVERFLOW: + post_qp_event(rnicp, chp, rsp_msg, IB_EVENT_QP_FATAL, 1); + break; + + default: + printk(KERN_ERR MOD "Unknown T3 status 0x%x QPID 0x%x\n", + CQE_STATUS(rsp_msg->cqe), qhp->wq.qpid); + post_qp_event(rnicp, chp, rsp_msg, IB_EVENT_QP_FATAL, 1); + break; + } +done: + if (atomic_dec_and_test(&chp->refcnt)) + wake_up(&chp->wait); + iwch_qp_rem_ref(&qhp->ibqp); +out: + dev_kfree_skb_irq(skb); +} diff --git a/drivers/infiniband/hw/cxgb3/iwch_mem.c b/drivers/infiniband/hw/cxgb3/iwch_mem.c new file mode 100644 index 0000000..5c36ee2 --- /dev/null +++ b/drivers/infiniband/hw/cxgb3/iwch_mem.c @@ -0,0 +1,203 @@ +/* + * Copyright (c) 2006 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include +#include + +#include +#include + +#include "cxio_hal.h" +#include "cxio_resource.h" +#include "iwch.h" +#include "iwch_provider.h" + +static int iwch_finish_mem_reg(struct iwch_mr *mhp, u32 stag) +{ + u32 mmid; + + mhp->attr.state = 1; + mhp->attr.stag = stag; + mmid = stag >> 8; + mhp->ibmr.rkey = mhp->ibmr.lkey = stag; + PDBG("%s mmid 0x%x mhp %p\n", __func__, mmid, mhp); + return insert_handle(mhp->rhp, &mhp->rhp->mmidr, mhp, mmid); +} + +int iwch_register_mem(struct iwch_dev *rhp, struct iwch_pd *php, + struct iwch_mr *mhp, int shift) +{ + u32 stag; + int ret; + + if (cxio_register_phys_mem(&rhp->rdev, + &stag, mhp->attr.pdid, + mhp->attr.perms, + mhp->attr.zbva, + mhp->attr.va_fbo, + mhp->attr.len, + shift - 12, + mhp->attr.pbl_size, mhp->attr.pbl_addr)) + return -ENOMEM; + + ret = iwch_finish_mem_reg(mhp, stag); + if (ret) + cxio_dereg_mem(&rhp->rdev, mhp->attr.stag, mhp->attr.pbl_size, + mhp->attr.pbl_addr); + return ret; +} + +int iwch_reregister_mem(struct iwch_dev *rhp, struct iwch_pd *php, + struct iwch_mr *mhp, + int shift, + int npages) +{ + u32 stag; + int ret; + + /* We could support this... */ + if (npages > mhp->attr.pbl_size) + return -ENOMEM; + + stag = mhp->attr.stag; + if (cxio_reregister_phys_mem(&rhp->rdev, + &stag, mhp->attr.pdid, + mhp->attr.perms, + mhp->attr.zbva, + mhp->attr.va_fbo, + mhp->attr.len, + shift - 12, + mhp->attr.pbl_size, mhp->attr.pbl_addr)) + return -ENOMEM; + + ret = iwch_finish_mem_reg(mhp, stag); + if (ret) + cxio_dereg_mem(&rhp->rdev, mhp->attr.stag, mhp->attr.pbl_size, + mhp->attr.pbl_addr); + + return ret; +} + +int iwch_alloc_pbl(struct iwch_mr *mhp, int npages) +{ + mhp->attr.pbl_addr = cxio_hal_pblpool_alloc(&mhp->rhp->rdev, + npages << 3); + + if (!mhp->attr.pbl_addr) + return -ENOMEM; + + mhp->attr.pbl_size = npages; + + return 0; +} + +void iwch_free_pbl(struct iwch_mr *mhp) +{ + cxio_hal_pblpool_free(&mhp->rhp->rdev, mhp->attr.pbl_addr, + mhp->attr.pbl_size << 3); +} + +int iwch_write_pbl(struct iwch_mr *mhp, __be64 *pages, int npages, int offset) +{ + return cxio_write_pbl(&mhp->rhp->rdev, pages, + mhp->attr.pbl_addr + (offset << 3), npages); +} + +int build_phys_page_list(struct ib_phys_buf *buffer_list, + int num_phys_buf, + u64 *iova_start, + u64 *total_size, + int *npages, + int *shift, + __be64 **page_list) +{ + u64 mask; + int i, j, n; + + mask = 0; + *total_size = 0; + for (i = 0; i < num_phys_buf; ++i) { + if (i != 0 && buffer_list[i].addr & ~PAGE_MASK) + return -EINVAL; + if (i != 0 && i != num_phys_buf - 1 && + (buffer_list[i].size & ~PAGE_MASK)) + return -EINVAL; + *total_size += buffer_list[i].size; + if (i > 0) + mask |= buffer_list[i].addr; + else + mask |= buffer_list[i].addr & PAGE_MASK; + if (i != num_phys_buf - 1) + mask |= buffer_list[i].addr + buffer_list[i].size; + else + mask |= (buffer_list[i].addr + buffer_list[i].size + + PAGE_SIZE - 1) & PAGE_MASK; + } + + if (*total_size > 0xFFFFFFFFULL) + return -ENOMEM; + + /* Find largest page shift we can use to cover buffers */ + for (*shift = PAGE_SHIFT; *shift < 27; ++(*shift)) + if ((1ULL << *shift) & mask) + break; + + buffer_list[0].size += buffer_list[0].addr & ((1ULL << *shift) - 1); + buffer_list[0].addr &= ~0ull << *shift; + + *npages = 0; + for (i = 0; i < num_phys_buf; ++i) + *npages += (buffer_list[i].size + + (1ULL << *shift) - 1) >> *shift; + + if (!*npages) + return -EINVAL; + + *page_list = kmalloc(sizeof(u64) * *npages, GFP_KERNEL); + if (!*page_list) + return -ENOMEM; + + n = 0; + for (i = 0; i < num_phys_buf; ++i) + for (j = 0; + j < (buffer_list[i].size + (1ULL << *shift) - 1) >> *shift; + ++j) + (*page_list)[n++] = cpu_to_be64(buffer_list[i].addr + + ((u64) j << *shift)); + + PDBG("%s va 0x%llx mask 0x%llx shift %d len %lld pbl_size %d\n", + __func__, (unsigned long long) *iova_start, + (unsigned long long) mask, *shift, (unsigned long long) *total_size, + *npages); + + return 0; + +} diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.c b/drivers/infiniband/hw/cxgb3/iwch_provider.c new file mode 100644 index 0000000..811b24a --- /dev/null +++ b/drivers/infiniband/hw/cxgb3/iwch_provider.c @@ -0,0 +1,1467 @@ +/* + * Copyright (c) 2006 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "cxio_hal.h" +#include "iwch.h" +#include "iwch_provider.h" +#include "iwch_cm.h" +#include "iwch_user.h" +#include "common.h" + +static struct ib_ah *iwch_ah_create(struct ib_pd *pd, + struct ib_ah_attr *ah_attr) +{ + return ERR_PTR(-ENOSYS); +} + +static int iwch_ah_destroy(struct ib_ah *ah) +{ + return -ENOSYS; +} + +static int iwch_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) +{ + return -ENOSYS; +} + +static int iwch_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) +{ + return -ENOSYS; +} + +static int iwch_process_mad(struct ib_device *ibdev, + int mad_flags, + u8 port_num, + struct ib_wc *in_wc, + struct ib_grh *in_grh, + struct ib_mad *in_mad, struct ib_mad *out_mad) +{ + return -ENOSYS; +} + +static int iwch_dealloc_ucontext(struct ib_ucontext *context) +{ + struct iwch_dev *rhp = to_iwch_dev(context->device); + struct iwch_ucontext *ucontext = to_iwch_ucontext(context); + struct iwch_mm_entry *mm, *tmp; + + PDBG("%s context %p\n", __func__, context); + list_for_each_entry_safe(mm, tmp, &ucontext->mmaps, entry) + kfree(mm); + cxio_release_ucontext(&rhp->rdev, &ucontext->uctx); + kfree(ucontext); + return 0; +} + +static struct ib_ucontext *iwch_alloc_ucontext(struct ib_device *ibdev, + struct ib_udata *udata) +{ + struct iwch_ucontext *context; + struct iwch_dev *rhp = to_iwch_dev(ibdev); + + PDBG("%s ibdev %p\n", __func__, ibdev); + context = kzalloc(sizeof(*context), GFP_KERNEL); + if (!context) + return ERR_PTR(-ENOMEM); + cxio_init_ucontext(&rhp->rdev, &context->uctx); + INIT_LIST_HEAD(&context->mmaps); + spin_lock_init(&context->mmap_lock); + return &context->ibucontext; +} + +static int iwch_destroy_cq(struct ib_cq *ib_cq) +{ + struct iwch_cq *chp; + + PDBG("%s ib_cq %p\n", __func__, ib_cq); + chp = to_iwch_cq(ib_cq); + + remove_handle(chp->rhp, &chp->rhp->cqidr, chp->cq.cqid); + atomic_dec(&chp->refcnt); + wait_event(chp->wait, !atomic_read(&chp->refcnt)); + + cxio_destroy_cq(&chp->rhp->rdev, &chp->cq); + kfree(chp); + return 0; +} + +static struct ib_cq *iwch_create_cq(struct ib_device *ibdev, int entries, int vector, + struct ib_ucontext *ib_context, + struct ib_udata *udata) +{ + struct iwch_dev *rhp; + struct iwch_cq *chp; + struct iwch_create_cq_resp uresp; + struct iwch_create_cq_req ureq; + struct iwch_ucontext *ucontext = NULL; + static int warned; + size_t resplen; + + PDBG("%s ib_dev %p entries %d\n", __func__, ibdev, entries); + rhp = to_iwch_dev(ibdev); + chp = kzalloc(sizeof(*chp), GFP_KERNEL); + if (!chp) + return ERR_PTR(-ENOMEM); + + if (ib_context) { + ucontext = to_iwch_ucontext(ib_context); + if (!t3a_device(rhp)) { + if (ib_copy_from_udata(&ureq, udata, sizeof (ureq))) { + kfree(chp); + return ERR_PTR(-EFAULT); + } + chp->user_rptr_addr = (u32 __user *)(unsigned long)ureq.user_rptr_addr; + } + } + + if (t3a_device(rhp)) { + + /* + * T3A: Add some fluff to handle extra CQEs inserted + * for various errors. + * Additional CQE possibilities: + * TERMINATE, + * incoming RDMA WRITE Failures + * incoming RDMA READ REQUEST FAILUREs + * NOTE: We cannot ensure the CQ won't overflow. + */ + entries += 16; + } + entries = roundup_pow_of_two(entries); + chp->cq.size_log2 = ilog2(entries); + + if (cxio_create_cq(&rhp->rdev, &chp->cq, !ucontext)) { + kfree(chp); + return ERR_PTR(-ENOMEM); + } + chp->rhp = rhp; + chp->ibcq.cqe = 1 << chp->cq.size_log2; + spin_lock_init(&chp->lock); + spin_lock_init(&chp->comp_handler_lock); + atomic_set(&chp->refcnt, 1); + init_waitqueue_head(&chp->wait); + if (insert_handle(rhp, &rhp->cqidr, chp, chp->cq.cqid)) { + cxio_destroy_cq(&chp->rhp->rdev, &chp->cq); + kfree(chp); + return ERR_PTR(-ENOMEM); + } + + if (ucontext) { + struct iwch_mm_entry *mm; + + mm = kmalloc(sizeof *mm, GFP_KERNEL); + if (!mm) { + iwch_destroy_cq(&chp->ibcq); + return ERR_PTR(-ENOMEM); + } + uresp.cqid = chp->cq.cqid; + uresp.size_log2 = chp->cq.size_log2; + spin_lock(&ucontext->mmap_lock); + uresp.key = ucontext->key; + ucontext->key += PAGE_SIZE; + spin_unlock(&ucontext->mmap_lock); + mm->key = uresp.key; + mm->addr = virt_to_phys(chp->cq.queue); + if (udata->outlen < sizeof uresp) { + if (!warned++) + printk(KERN_WARNING MOD "Warning - " + "downlevel libcxgb3 (non-fatal).\n"); + mm->len = PAGE_ALIGN((1UL << uresp.size_log2) * + sizeof(struct t3_cqe)); + resplen = sizeof(struct iwch_create_cq_resp_v0); + } else { + mm->len = PAGE_ALIGN(((1UL << uresp.size_log2) + 1) * + sizeof(struct t3_cqe)); + uresp.memsize = mm->len; + uresp.reserved = 0; + resplen = sizeof uresp; + } + if (ib_copy_to_udata(udata, &uresp, resplen)) { + kfree(mm); + iwch_destroy_cq(&chp->ibcq); + return ERR_PTR(-EFAULT); + } + insert_mmap(ucontext, mm); + } + PDBG("created cqid 0x%0x chp %p size 0x%0x, dma_addr 0x%0llx\n", + chp->cq.cqid, chp, (1 << chp->cq.size_log2), + (unsigned long long) chp->cq.dma_addr); + return &chp->ibcq; +} + +static int iwch_resize_cq(struct ib_cq *cq, int cqe, struct ib_udata *udata) +{ +#ifdef notyet + struct iwch_cq *chp = to_iwch_cq(cq); + struct t3_cq oldcq, newcq; + int ret; + + PDBG("%s ib_cq %p cqe %d\n", __func__, cq, cqe); + + /* We don't downsize... */ + if (cqe <= cq->cqe) + return 0; + + /* create new t3_cq with new size */ + cqe = roundup_pow_of_two(cqe+1); + newcq.size_log2 = ilog2(cqe); + + /* Dont allow resize to less than the current wce count */ + if (cqe < Q_COUNT(chp->cq.rptr, chp->cq.wptr)) { + return -ENOMEM; + } + + /* Quiesce all QPs using this CQ */ + ret = iwch_quiesce_qps(chp); + if (ret) { + return ret; + } + + ret = cxio_create_cq(&chp->rhp->rdev, &newcq); + if (ret) { + return ret; + } + + /* copy CQEs */ + memcpy(newcq.queue, chp->cq.queue, (1 << chp->cq.size_log2) * + sizeof(struct t3_cqe)); + + /* old iwch_qp gets new t3_cq but keeps old cqid */ + oldcq = chp->cq; + chp->cq = newcq; + chp->cq.cqid = oldcq.cqid; + + /* resize new t3_cq to update the HW context */ + ret = cxio_resize_cq(&chp->rhp->rdev, &chp->cq); + if (ret) { + chp->cq = oldcq; + return ret; + } + chp->ibcq.cqe = (1<cq.size_log2) - 1; + + /* destroy old t3_cq */ + oldcq.cqid = newcq.cqid; + ret = cxio_destroy_cq(&chp->rhp->rdev, &oldcq); + if (ret) { + printk(KERN_ERR MOD "%s - cxio_destroy_cq failed %d\n", + __func__, ret); + } + + /* add user hooks here */ + + /* resume qps */ + ret = iwch_resume_qps(chp); + return ret; +#else + return -ENOSYS; +#endif +} + +static int iwch_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags) +{ + struct iwch_dev *rhp; + struct iwch_cq *chp; + enum t3_cq_opcode cq_op; + int err; + unsigned long flag; + u32 rptr; + + chp = to_iwch_cq(ibcq); + rhp = chp->rhp; + if ((flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED) + cq_op = CQ_ARM_SE; + else + cq_op = CQ_ARM_AN; + if (chp->user_rptr_addr) { + if (get_user(rptr, chp->user_rptr_addr)) + return -EFAULT; + spin_lock_irqsave(&chp->lock, flag); + chp->cq.rptr = rptr; + } else + spin_lock_irqsave(&chp->lock, flag); + PDBG("%s rptr 0x%x\n", __func__, chp->cq.rptr); + err = cxio_hal_cq_op(&rhp->rdev, &chp->cq, cq_op, 0); + spin_unlock_irqrestore(&chp->lock, flag); + if (err < 0) + printk(KERN_ERR MOD "Error %d rearming CQID 0x%x\n", err, + chp->cq.cqid); + if (err > 0 && !(flags & IB_CQ_REPORT_MISSED_EVENTS)) + err = 0; + return err; +} + +static int iwch_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) +{ + int len = vma->vm_end - vma->vm_start; + u32 key = vma->vm_pgoff << PAGE_SHIFT; + struct cxio_rdev *rdev_p; + int ret = 0; + struct iwch_mm_entry *mm; + struct iwch_ucontext *ucontext; + u64 addr; + + PDBG("%s pgoff 0x%lx key 0x%x len %d\n", __func__, vma->vm_pgoff, + key, len); + + if (vma->vm_start & (PAGE_SIZE-1)) { + return -EINVAL; + } + + rdev_p = &(to_iwch_dev(context->device)->rdev); + ucontext = to_iwch_ucontext(context); + + mm = remove_mmap(ucontext, key, len); + if (!mm) + return -EINVAL; + addr = mm->addr; + kfree(mm); + + if ((addr >= rdev_p->rnic_info.udbell_physbase) && + (addr < (rdev_p->rnic_info.udbell_physbase + + rdev_p->rnic_info.udbell_len))) { + + /* + * Map T3 DB register. + */ + if (vma->vm_flags & VM_READ) { + return -EPERM; + } + + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND; + vma->vm_flags &= ~VM_MAYREAD; + ret = io_remap_pfn_range(vma, vma->vm_start, + addr >> PAGE_SHIFT, + len, vma->vm_page_prot); + } else { + + /* + * Map WQ or CQ contig dma memory... + */ + ret = remap_pfn_range(vma, vma->vm_start, + addr >> PAGE_SHIFT, + len, vma->vm_page_prot); + } + + return ret; +} + +static int iwch_deallocate_pd(struct ib_pd *pd) +{ + struct iwch_dev *rhp; + struct iwch_pd *php; + + php = to_iwch_pd(pd); + rhp = php->rhp; + PDBG("%s ibpd %p pdid 0x%x\n", __func__, pd, php->pdid); + cxio_hal_put_pdid(rhp->rdev.rscp, php->pdid); + kfree(php); + return 0; +} + +static struct ib_pd *iwch_allocate_pd(struct ib_device *ibdev, + struct ib_ucontext *context, + struct ib_udata *udata) +{ + struct iwch_pd *php; + u32 pdid; + struct iwch_dev *rhp; + + PDBG("%s ibdev %p\n", __func__, ibdev); + rhp = (struct iwch_dev *) ibdev; + pdid = cxio_hal_get_pdid(rhp->rdev.rscp); + if (!pdid) + return ERR_PTR(-EINVAL); + php = kzalloc(sizeof(*php), GFP_KERNEL); + if (!php) { + cxio_hal_put_pdid(rhp->rdev.rscp, pdid); + return ERR_PTR(-ENOMEM); + } + php->pdid = pdid; + php->rhp = rhp; + if (context) { + if (ib_copy_to_udata(udata, &php->pdid, sizeof (__u32))) { + iwch_deallocate_pd(&php->ibpd); + return ERR_PTR(-EFAULT); + } + } + PDBG("%s pdid 0x%0x ptr 0x%p\n", __func__, pdid, php); + return &php->ibpd; +} + +static int iwch_dereg_mr(struct ib_mr *ib_mr) +{ + struct iwch_dev *rhp; + struct iwch_mr *mhp; + u32 mmid; + + PDBG("%s ib_mr %p\n", __func__, ib_mr); + /* There can be no memory windows */ + if (atomic_read(&ib_mr->usecnt)) + return -EINVAL; + + mhp = to_iwch_mr(ib_mr); + rhp = mhp->rhp; + mmid = mhp->attr.stag >> 8; + cxio_dereg_mem(&rhp->rdev, mhp->attr.stag, mhp->attr.pbl_size, + mhp->attr.pbl_addr); + iwch_free_pbl(mhp); + remove_handle(rhp, &rhp->mmidr, mmid); + if (mhp->kva) + kfree((void *) (unsigned long) mhp->kva); + if (mhp->umem) + ib_umem_release(mhp->umem); + PDBG("%s mmid 0x%x ptr %p\n", __func__, mmid, mhp); + kfree(mhp); + return 0; +} + +static struct ib_mr *iwch_register_phys_mem(struct ib_pd *pd, + struct ib_phys_buf *buffer_list, + int num_phys_buf, + int acc, + u64 *iova_start) +{ + __be64 *page_list; + int shift; + u64 total_size; + int npages; + struct iwch_dev *rhp; + struct iwch_pd *php; + struct iwch_mr *mhp; + int ret; + + PDBG("%s ib_pd %p\n", __func__, pd); + php = to_iwch_pd(pd); + rhp = php->rhp; + + mhp = kzalloc(sizeof(*mhp), GFP_KERNEL); + if (!mhp) + return ERR_PTR(-ENOMEM); + + mhp->rhp = rhp; + + /* First check that we have enough alignment */ + if ((*iova_start & ~PAGE_MASK) != (buffer_list[0].addr & ~PAGE_MASK)) { + ret = -EINVAL; + goto err; + } + + if (num_phys_buf > 1 && + ((buffer_list[0].addr + buffer_list[0].size) & ~PAGE_MASK)) { + ret = -EINVAL; + goto err; + } + + ret = build_phys_page_list(buffer_list, num_phys_buf, iova_start, + &total_size, &npages, &shift, &page_list); + if (ret) + goto err; + + ret = iwch_alloc_pbl(mhp, npages); + if (ret) { + kfree(page_list); + goto err_pbl; + } + + ret = iwch_write_pbl(mhp, page_list, npages, 0); + kfree(page_list); + if (ret) + goto err_pbl; + + mhp->attr.pdid = php->pdid; + mhp->attr.zbva = 0; + + mhp->attr.perms = iwch_ib_to_tpt_access(acc); + mhp->attr.va_fbo = *iova_start; + mhp->attr.page_size = shift - 12; + + mhp->attr.len = (u32) total_size; + mhp->attr.pbl_size = npages; + ret = iwch_register_mem(rhp, php, mhp, shift); + if (ret) + goto err_pbl; + + return &mhp->ibmr; + +err_pbl: + iwch_free_pbl(mhp); + +err: + kfree(mhp); + return ERR_PTR(ret); + +} + +static int iwch_reregister_phys_mem(struct ib_mr *mr, + int mr_rereg_mask, + struct ib_pd *pd, + struct ib_phys_buf *buffer_list, + int num_phys_buf, + int acc, u64 * iova_start) +{ + + struct iwch_mr mh, *mhp; + struct iwch_pd *php; + struct iwch_dev *rhp; + __be64 *page_list = NULL; + int shift = 0; + u64 total_size; + int npages = 0; + int ret; + + PDBG("%s ib_mr %p ib_pd %p\n", __func__, mr, pd); + + /* There can be no memory windows */ + if (atomic_read(&mr->usecnt)) + return -EINVAL; + + mhp = to_iwch_mr(mr); + rhp = mhp->rhp; + php = to_iwch_pd(mr->pd); + + /* make sure we are on the same adapter */ + if (rhp != php->rhp) + return -EINVAL; + + memcpy(&mh, mhp, sizeof *mhp); + + if (mr_rereg_mask & IB_MR_REREG_PD) + php = to_iwch_pd(pd); + if (mr_rereg_mask & IB_MR_REREG_ACCESS) + mh.attr.perms = iwch_ib_to_tpt_access(acc); + if (mr_rereg_mask & IB_MR_REREG_TRANS) { + ret = build_phys_page_list(buffer_list, num_phys_buf, + iova_start, + &total_size, &npages, + &shift, &page_list); + if (ret) + return ret; + } + + ret = iwch_reregister_mem(rhp, php, &mh, shift, npages); + kfree(page_list); + if (ret) { + return ret; + } + if (mr_rereg_mask & IB_MR_REREG_PD) + mhp->attr.pdid = php->pdid; + if (mr_rereg_mask & IB_MR_REREG_ACCESS) + mhp->attr.perms = iwch_ib_to_tpt_access(acc); + if (mr_rereg_mask & IB_MR_REREG_TRANS) { + mhp->attr.zbva = 0; + mhp->attr.va_fbo = *iova_start; + mhp->attr.page_size = shift - 12; + mhp->attr.len = (u32) total_size; + mhp->attr.pbl_size = npages; + } + + return 0; +} + + +static struct ib_mr *iwch_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, + u64 virt, int acc, struct ib_udata *udata) +{ + __be64 *pages; + int shift, n, len; + int i, k, entry; + int err = 0; + struct iwch_dev *rhp; + struct iwch_pd *php; + struct iwch_mr *mhp; + struct iwch_reg_user_mr_resp uresp; + struct scatterlist *sg; + PDBG("%s ib_pd %p\n", __func__, pd); + + php = to_iwch_pd(pd); + rhp = php->rhp; + mhp = kzalloc(sizeof(*mhp), GFP_KERNEL); + if (!mhp) + return ERR_PTR(-ENOMEM); + + mhp->rhp = rhp; + + mhp->umem = ib_umem_get(pd->uobject->context, start, length, acc, 0); + if (IS_ERR(mhp->umem)) { + err = PTR_ERR(mhp->umem); + kfree(mhp); + return ERR_PTR(err); + } + + shift = ffs(mhp->umem->page_size) - 1; + + n = mhp->umem->nmap; + + err = iwch_alloc_pbl(mhp, n); + if (err) + goto err; + + pages = (__be64 *) __get_free_page(GFP_KERNEL); + if (!pages) { + err = -ENOMEM; + goto err_pbl; + } + + i = n = 0; + + for_each_sg(mhp->umem->sg_head.sgl, sg, mhp->umem->nmap, entry) { + len = sg_dma_len(sg) >> shift; + for (k = 0; k < len; ++k) { + pages[i++] = cpu_to_be64(sg_dma_address(sg) + + mhp->umem->page_size * k); + if (i == PAGE_SIZE / sizeof *pages) { + err = iwch_write_pbl(mhp, pages, i, n); + if (err) + goto pbl_done; + n += i; + i = 0; + } + } + } + + if (i) + err = iwch_write_pbl(mhp, pages, i, n); + +pbl_done: + free_page((unsigned long) pages); + if (err) + goto err_pbl; + + mhp->attr.pdid = php->pdid; + mhp->attr.zbva = 0; + mhp->attr.perms = iwch_ib_to_tpt_access(acc); + mhp->attr.va_fbo = virt; + mhp->attr.page_size = shift - 12; + mhp->attr.len = (u32) length; + + err = iwch_register_mem(rhp, php, mhp, shift); + if (err) + goto err_pbl; + + if (udata && !t3a_device(rhp)) { + uresp.pbl_addr = (mhp->attr.pbl_addr - + rhp->rdev.rnic_info.pbl_base) >> 3; + PDBG("%s user resp pbl_addr 0x%x\n", __func__, + uresp.pbl_addr); + + if (ib_copy_to_udata(udata, &uresp, sizeof (uresp))) { + iwch_dereg_mr(&mhp->ibmr); + err = -EFAULT; + goto err; + } + } + + return &mhp->ibmr; + +err_pbl: + iwch_free_pbl(mhp); + +err: + ib_umem_release(mhp->umem); + kfree(mhp); + return ERR_PTR(err); +} + +static struct ib_mr *iwch_get_dma_mr(struct ib_pd *pd, int acc) +{ + struct ib_phys_buf bl; + u64 kva; + struct ib_mr *ibmr; + + PDBG("%s ib_pd %p\n", __func__, pd); + + /* + * T3 only supports 32 bits of size. + */ + bl.size = 0xffffffff; + bl.addr = 0; + kva = 0; + ibmr = iwch_register_phys_mem(pd, &bl, 1, acc, &kva); + return ibmr; +} + +static struct ib_mw *iwch_alloc_mw(struct ib_pd *pd, enum ib_mw_type type) +{ + struct iwch_dev *rhp; + struct iwch_pd *php; + struct iwch_mw *mhp; + u32 mmid; + u32 stag = 0; + int ret; + + if (type != IB_MW_TYPE_1) + return ERR_PTR(-EINVAL); + + php = to_iwch_pd(pd); + rhp = php->rhp; + mhp = kzalloc(sizeof(*mhp), GFP_KERNEL); + if (!mhp) + return ERR_PTR(-ENOMEM); + ret = cxio_allocate_window(&rhp->rdev, &stag, php->pdid); + if (ret) { + kfree(mhp); + return ERR_PTR(ret); + } + mhp->rhp = rhp; + mhp->attr.pdid = php->pdid; + mhp->attr.type = TPT_MW; + mhp->attr.stag = stag; + mmid = (stag) >> 8; + mhp->ibmw.rkey = stag; + if (insert_handle(rhp, &rhp->mmidr, mhp, mmid)) { + cxio_deallocate_window(&rhp->rdev, mhp->attr.stag); + kfree(mhp); + return ERR_PTR(-ENOMEM); + } + PDBG("%s mmid 0x%x mhp %p stag 0x%x\n", __func__, mmid, mhp, stag); + return &(mhp->ibmw); +} + +static int iwch_dealloc_mw(struct ib_mw *mw) +{ + struct iwch_dev *rhp; + struct iwch_mw *mhp; + u32 mmid; + + mhp = to_iwch_mw(mw); + rhp = mhp->rhp; + mmid = (mw->rkey) >> 8; + cxio_deallocate_window(&rhp->rdev, mhp->attr.stag); + remove_handle(rhp, &rhp->mmidr, mmid); + PDBG("%s ib_mw %p mmid 0x%x ptr %p\n", __func__, mw, mmid, mhp); + kfree(mhp); + return 0; +} + +static struct ib_mr *iwch_alloc_fast_reg_mr(struct ib_pd *pd, int pbl_depth) +{ + struct iwch_dev *rhp; + struct iwch_pd *php; + struct iwch_mr *mhp; + u32 mmid; + u32 stag = 0; + int ret = 0; + + php = to_iwch_pd(pd); + rhp = php->rhp; + mhp = kzalloc(sizeof(*mhp), GFP_KERNEL); + if (!mhp) + goto err; + + mhp->rhp = rhp; + ret = iwch_alloc_pbl(mhp, pbl_depth); + if (ret) + goto err1; + mhp->attr.pbl_size = pbl_depth; + ret = cxio_allocate_stag(&rhp->rdev, &stag, php->pdid, + mhp->attr.pbl_size, mhp->attr.pbl_addr); + if (ret) + goto err2; + mhp->attr.pdid = php->pdid; + mhp->attr.type = TPT_NON_SHARED_MR; + mhp->attr.stag = stag; + mhp->attr.state = 1; + mmid = (stag) >> 8; + mhp->ibmr.rkey = mhp->ibmr.lkey = stag; + if (insert_handle(rhp, &rhp->mmidr, mhp, mmid)) + goto err3; + + PDBG("%s mmid 0x%x mhp %p stag 0x%x\n", __func__, mmid, mhp, stag); + return &(mhp->ibmr); +err3: + cxio_dereg_mem(&rhp->rdev, stag, mhp->attr.pbl_size, + mhp->attr.pbl_addr); +err2: + iwch_free_pbl(mhp); +err1: + kfree(mhp); +err: + return ERR_PTR(ret); +} + +static struct ib_fast_reg_page_list *iwch_alloc_fastreg_pbl( + struct ib_device *device, + int page_list_len) +{ + struct ib_fast_reg_page_list *page_list; + + page_list = kmalloc(sizeof *page_list + page_list_len * sizeof(u64), + GFP_KERNEL); + if (!page_list) + return ERR_PTR(-ENOMEM); + + page_list->page_list = (u64 *)(page_list + 1); + page_list->max_page_list_len = page_list_len; + + return page_list; +} + +static void iwch_free_fastreg_pbl(struct ib_fast_reg_page_list *page_list) +{ + kfree(page_list); +} + +static int iwch_destroy_qp(struct ib_qp *ib_qp) +{ + struct iwch_dev *rhp; + struct iwch_qp *qhp; + struct iwch_qp_attributes attrs; + struct iwch_ucontext *ucontext; + + qhp = to_iwch_qp(ib_qp); + rhp = qhp->rhp; + + attrs.next_state = IWCH_QP_STATE_ERROR; + iwch_modify_qp(rhp, qhp, IWCH_QP_ATTR_NEXT_STATE, &attrs, 0); + wait_event(qhp->wait, !qhp->ep); + + remove_handle(rhp, &rhp->qpidr, qhp->wq.qpid); + + atomic_dec(&qhp->refcnt); + wait_event(qhp->wait, !atomic_read(&qhp->refcnt)); + + ucontext = ib_qp->uobject ? to_iwch_ucontext(ib_qp->uobject->context) + : NULL; + cxio_destroy_qp(&rhp->rdev, &qhp->wq, + ucontext ? &ucontext->uctx : &rhp->rdev.uctx); + + PDBG("%s ib_qp %p qpid 0x%0x qhp %p\n", __func__, + ib_qp, qhp->wq.qpid, qhp); + kfree(qhp); + return 0; +} + +static struct ib_qp *iwch_create_qp(struct ib_pd *pd, + struct ib_qp_init_attr *attrs, + struct ib_udata *udata) +{ + struct iwch_dev *rhp; + struct iwch_qp *qhp; + struct iwch_pd *php; + struct iwch_cq *schp; + struct iwch_cq *rchp; + struct iwch_create_qp_resp uresp; + int wqsize, sqsize, rqsize; + struct iwch_ucontext *ucontext; + + PDBG("%s ib_pd %p\n", __func__, pd); + if (attrs->qp_type != IB_QPT_RC) + return ERR_PTR(-EINVAL); + php = to_iwch_pd(pd); + rhp = php->rhp; + schp = get_chp(rhp, ((struct iwch_cq *) attrs->send_cq)->cq.cqid); + rchp = get_chp(rhp, ((struct iwch_cq *) attrs->recv_cq)->cq.cqid); + if (!schp || !rchp) + return ERR_PTR(-EINVAL); + + /* The RQT size must be # of entries + 1 rounded up to a power of two */ + rqsize = roundup_pow_of_two(attrs->cap.max_recv_wr); + if (rqsize == attrs->cap.max_recv_wr) + rqsize = roundup_pow_of_two(attrs->cap.max_recv_wr+1); + + /* T3 doesn't support RQT depth < 16 */ + if (rqsize < 16) + rqsize = 16; + + if (rqsize > T3_MAX_RQ_SIZE) + return ERR_PTR(-EINVAL); + + if (attrs->cap.max_inline_data > T3_MAX_INLINE) + return ERR_PTR(-EINVAL); + + /* + * NOTE: The SQ and total WQ sizes don't need to be + * a power of two. However, all the code assumes + * they are. EG: Q_FREECNT() and friends. + */ + sqsize = roundup_pow_of_two(attrs->cap.max_send_wr); + wqsize = roundup_pow_of_two(rqsize + sqsize); + + /* + * Kernel users need more wq space for fastreg WRs which can take + * 2 WR fragments. + */ + ucontext = pd->uobject ? to_iwch_ucontext(pd->uobject->context) : NULL; + if (!ucontext && wqsize < (rqsize + (2 * sqsize))) + wqsize = roundup_pow_of_two(rqsize + + roundup_pow_of_two(attrs->cap.max_send_wr * 2)); + PDBG("%s wqsize %d sqsize %d rqsize %d\n", __func__, + wqsize, sqsize, rqsize); + qhp = kzalloc(sizeof(*qhp), GFP_KERNEL); + if (!qhp) + return ERR_PTR(-ENOMEM); + qhp->wq.size_log2 = ilog2(wqsize); + qhp->wq.rq_size_log2 = ilog2(rqsize); + qhp->wq.sq_size_log2 = ilog2(sqsize); + if (cxio_create_qp(&rhp->rdev, !udata, &qhp->wq, + ucontext ? &ucontext->uctx : &rhp->rdev.uctx)) { + kfree(qhp); + return ERR_PTR(-ENOMEM); + } + + attrs->cap.max_recv_wr = rqsize - 1; + attrs->cap.max_send_wr = sqsize; + attrs->cap.max_inline_data = T3_MAX_INLINE; + + qhp->rhp = rhp; + qhp->attr.pd = php->pdid; + qhp->attr.scq = ((struct iwch_cq *) attrs->send_cq)->cq.cqid; + qhp->attr.rcq = ((struct iwch_cq *) attrs->recv_cq)->cq.cqid; + qhp->attr.sq_num_entries = attrs->cap.max_send_wr; + qhp->attr.rq_num_entries = attrs->cap.max_recv_wr; + qhp->attr.sq_max_sges = attrs->cap.max_send_sge; + qhp->attr.sq_max_sges_rdma_write = attrs->cap.max_send_sge; + qhp->attr.rq_max_sges = attrs->cap.max_recv_sge; + qhp->attr.state = IWCH_QP_STATE_IDLE; + qhp->attr.next_state = IWCH_QP_STATE_IDLE; + + /* + * XXX - These don't get passed in from the openib user + * at create time. The CM sets them via a QP modify. + * Need to fix... I think the CM should + */ + qhp->attr.enable_rdma_read = 1; + qhp->attr.enable_rdma_write = 1; + qhp->attr.enable_bind = 1; + qhp->attr.max_ord = 1; + qhp->attr.max_ird = 1; + + spin_lock_init(&qhp->lock); + init_waitqueue_head(&qhp->wait); + atomic_set(&qhp->refcnt, 1); + + if (insert_handle(rhp, &rhp->qpidr, qhp, qhp->wq.qpid)) { + cxio_destroy_qp(&rhp->rdev, &qhp->wq, + ucontext ? &ucontext->uctx : &rhp->rdev.uctx); + kfree(qhp); + return ERR_PTR(-ENOMEM); + } + + if (udata) { + + struct iwch_mm_entry *mm1, *mm2; + + mm1 = kmalloc(sizeof *mm1, GFP_KERNEL); + if (!mm1) { + iwch_destroy_qp(&qhp->ibqp); + return ERR_PTR(-ENOMEM); + } + + mm2 = kmalloc(sizeof *mm2, GFP_KERNEL); + if (!mm2) { + kfree(mm1); + iwch_destroy_qp(&qhp->ibqp); + return ERR_PTR(-ENOMEM); + } + + uresp.qpid = qhp->wq.qpid; + uresp.size_log2 = qhp->wq.size_log2; + uresp.sq_size_log2 = qhp->wq.sq_size_log2; + uresp.rq_size_log2 = qhp->wq.rq_size_log2; + spin_lock(&ucontext->mmap_lock); + uresp.key = ucontext->key; + ucontext->key += PAGE_SIZE; + uresp.db_key = ucontext->key; + ucontext->key += PAGE_SIZE; + spin_unlock(&ucontext->mmap_lock); + if (ib_copy_to_udata(udata, &uresp, sizeof (uresp))) { + kfree(mm1); + kfree(mm2); + iwch_destroy_qp(&qhp->ibqp); + return ERR_PTR(-EFAULT); + } + mm1->key = uresp.key; + mm1->addr = virt_to_phys(qhp->wq.queue); + mm1->len = PAGE_ALIGN(wqsize * sizeof (union t3_wr)); + insert_mmap(ucontext, mm1); + mm2->key = uresp.db_key; + mm2->addr = qhp->wq.udb & PAGE_MASK; + mm2->len = PAGE_SIZE; + insert_mmap(ucontext, mm2); + } + qhp->ibqp.qp_num = qhp->wq.qpid; + init_timer(&(qhp->timer)); + PDBG("%s sq_num_entries %d, rq_num_entries %d " + "qpid 0x%0x qhp %p dma_addr 0x%llx size %d rq_addr 0x%x\n", + __func__, qhp->attr.sq_num_entries, qhp->attr.rq_num_entries, + qhp->wq.qpid, qhp, (unsigned long long) qhp->wq.dma_addr, + 1 << qhp->wq.size_log2, qhp->wq.rq_addr); + return &qhp->ibqp; +} + +static int iwch_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata) +{ + struct iwch_dev *rhp; + struct iwch_qp *qhp; + enum iwch_qp_attr_mask mask = 0; + struct iwch_qp_attributes attrs; + + PDBG("%s ib_qp %p\n", __func__, ibqp); + + /* iwarp does not support the RTR state */ + if ((attr_mask & IB_QP_STATE) && (attr->qp_state == IB_QPS_RTR)) + attr_mask &= ~IB_QP_STATE; + + /* Make sure we still have something left to do */ + if (!attr_mask) + return 0; + + memset(&attrs, 0, sizeof attrs); + qhp = to_iwch_qp(ibqp); + rhp = qhp->rhp; + + attrs.next_state = iwch_convert_state(attr->qp_state); + attrs.enable_rdma_read = (attr->qp_access_flags & + IB_ACCESS_REMOTE_READ) ? 1 : 0; + attrs.enable_rdma_write = (attr->qp_access_flags & + IB_ACCESS_REMOTE_WRITE) ? 1 : 0; + attrs.enable_bind = (attr->qp_access_flags & IB_ACCESS_MW_BIND) ? 1 : 0; + + + mask |= (attr_mask & IB_QP_STATE) ? IWCH_QP_ATTR_NEXT_STATE : 0; + mask |= (attr_mask & IB_QP_ACCESS_FLAGS) ? + (IWCH_QP_ATTR_ENABLE_RDMA_READ | + IWCH_QP_ATTR_ENABLE_RDMA_WRITE | + IWCH_QP_ATTR_ENABLE_RDMA_BIND) : 0; + + return iwch_modify_qp(rhp, qhp, mask, &attrs, 0); +} + +void iwch_qp_add_ref(struct ib_qp *qp) +{ + PDBG("%s ib_qp %p\n", __func__, qp); + atomic_inc(&(to_iwch_qp(qp)->refcnt)); +} + +void iwch_qp_rem_ref(struct ib_qp *qp) +{ + PDBG("%s ib_qp %p\n", __func__, qp); + if (atomic_dec_and_test(&(to_iwch_qp(qp)->refcnt))) + wake_up(&(to_iwch_qp(qp)->wait)); +} + +static struct ib_qp *iwch_get_qp(struct ib_device *dev, int qpn) +{ + PDBG("%s ib_dev %p qpn 0x%x\n", __func__, dev, qpn); + return (struct ib_qp *)get_qhp(to_iwch_dev(dev), qpn); +} + + +static int iwch_query_pkey(struct ib_device *ibdev, + u8 port, u16 index, u16 * pkey) +{ + PDBG("%s ibdev %p\n", __func__, ibdev); + *pkey = 0; + return 0; +} + +static int iwch_query_gid(struct ib_device *ibdev, u8 port, + int index, union ib_gid *gid) +{ + struct iwch_dev *dev; + + PDBG("%s ibdev %p, port %d, index %d, gid %p\n", + __func__, ibdev, port, index, gid); + dev = to_iwch_dev(ibdev); + BUG_ON(port == 0 || port > 2); + memset(&(gid->raw[0]), 0, sizeof(gid->raw)); + memcpy(&(gid->raw[0]), dev->rdev.port_info.lldevs[port-1]->dev_addr, 6); + return 0; +} + +static u64 fw_vers_string_to_u64(struct iwch_dev *iwch_dev) +{ + struct ethtool_drvinfo info; + struct net_device *lldev = iwch_dev->rdev.t3cdev_p->lldev; + char *cp, *next; + unsigned fw_maj, fw_min, fw_mic; + + lldev->ethtool_ops->get_drvinfo(lldev, &info); + + next = info.fw_version + 1; + cp = strsep(&next, "."); + sscanf(cp, "%i", &fw_maj); + cp = strsep(&next, "."); + sscanf(cp, "%i", &fw_min); + cp = strsep(&next, "."); + sscanf(cp, "%i", &fw_mic); + + return (((u64)fw_maj & 0xffff) << 32) | ((fw_min & 0xffff) << 16) | + (fw_mic & 0xffff); +} + +static int iwch_query_device(struct ib_device *ibdev, + struct ib_device_attr *props) +{ + + struct iwch_dev *dev; + PDBG("%s ibdev %p\n", __func__, ibdev); + + dev = to_iwch_dev(ibdev); + memset(props, 0, sizeof *props); + memcpy(&props->sys_image_guid, dev->rdev.t3cdev_p->lldev->dev_addr, 6); + props->hw_ver = dev->rdev.t3cdev_p->type; + props->fw_ver = fw_vers_string_to_u64(dev); + props->device_cap_flags = dev->device_cap_flags; + props->page_size_cap = dev->attr.mem_pgsizes_bitmask; + props->vendor_id = (u32)dev->rdev.rnic_info.pdev->vendor; + props->vendor_part_id = (u32)dev->rdev.rnic_info.pdev->device; + props->max_mr_size = dev->attr.max_mr_size; + props->max_qp = dev->attr.max_qps; + props->max_qp_wr = dev->attr.max_wrs; + props->max_sge = dev->attr.max_sge_per_wr; + props->max_sge_rd = 1; + props->max_qp_rd_atom = dev->attr.max_rdma_reads_per_qp; + props->max_qp_init_rd_atom = dev->attr.max_rdma_reads_per_qp; + props->max_cq = dev->attr.max_cqs; + props->max_cqe = dev->attr.max_cqes_per_cq; + props->max_mr = dev->attr.max_mem_regs; + props->max_pd = dev->attr.max_pds; + props->local_ca_ack_delay = 0; + props->max_fast_reg_page_list_len = T3_MAX_FASTREG_DEPTH; + + return 0; +} + +static int iwch_query_port(struct ib_device *ibdev, + u8 port, struct ib_port_attr *props) +{ + struct iwch_dev *dev; + struct net_device *netdev; + struct in_device *inetdev; + + PDBG("%s ibdev %p\n", __func__, ibdev); + + dev = to_iwch_dev(ibdev); + netdev = dev->rdev.port_info.lldevs[port-1]; + + memset(props, 0, sizeof(struct ib_port_attr)); + props->max_mtu = IB_MTU_4096; + if (netdev->mtu >= 4096) + props->active_mtu = IB_MTU_4096; + else if (netdev->mtu >= 2048) + props->active_mtu = IB_MTU_2048; + else if (netdev->mtu >= 1024) + props->active_mtu = IB_MTU_1024; + else if (netdev->mtu >= 512) + props->active_mtu = IB_MTU_512; + else + props->active_mtu = IB_MTU_256; + + if (!netif_carrier_ok(netdev)) + props->state = IB_PORT_DOWN; + else { + inetdev = in_dev_get(netdev); + if (inetdev) { + if (inetdev->ifa_list) + props->state = IB_PORT_ACTIVE; + else + props->state = IB_PORT_INIT; + in_dev_put(inetdev); + } else + props->state = IB_PORT_INIT; + } + + props->port_cap_flags = + IB_PORT_CM_SUP | + IB_PORT_SNMP_TUNNEL_SUP | + IB_PORT_REINIT_SUP | + IB_PORT_DEVICE_MGMT_SUP | + IB_PORT_VENDOR_CLASS_SUP | IB_PORT_BOOT_MGMT_SUP; + props->gid_tbl_len = 1; + props->pkey_tbl_len = 1; + props->active_width = 2; + props->active_speed = IB_SPEED_DDR; + props->max_msg_sz = -1; + + return 0; +} + +static ssize_t show_rev(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct iwch_dev *iwch_dev = container_of(dev, struct iwch_dev, + ibdev.dev); + PDBG("%s dev 0x%p\n", __func__, dev); + return sprintf(buf, "%d\n", iwch_dev->rdev.t3cdev_p->type); +} + +static ssize_t show_fw_ver(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct iwch_dev *iwch_dev = container_of(dev, struct iwch_dev, + ibdev.dev); + struct ethtool_drvinfo info; + struct net_device *lldev = iwch_dev->rdev.t3cdev_p->lldev; + + PDBG("%s dev 0x%p\n", __func__, dev); + lldev->ethtool_ops->get_drvinfo(lldev, &info); + return sprintf(buf, "%s\n", info.fw_version); +} + +static ssize_t show_hca(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct iwch_dev *iwch_dev = container_of(dev, struct iwch_dev, + ibdev.dev); + struct ethtool_drvinfo info; + struct net_device *lldev = iwch_dev->rdev.t3cdev_p->lldev; + + PDBG("%s dev 0x%p\n", __func__, dev); + lldev->ethtool_ops->get_drvinfo(lldev, &info); + return sprintf(buf, "%s\n", info.driver); +} + +static ssize_t show_board(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct iwch_dev *iwch_dev = container_of(dev, struct iwch_dev, + ibdev.dev); + PDBG("%s dev 0x%p\n", __func__, dev); + return sprintf(buf, "%x.%x\n", iwch_dev->rdev.rnic_info.pdev->vendor, + iwch_dev->rdev.rnic_info.pdev->device); +} + +static int iwch_get_mib(struct ib_device *ibdev, + union rdma_protocol_stats *stats) +{ + struct iwch_dev *dev; + struct tp_mib_stats m; + int ret; + + PDBG("%s ibdev %p\n", __func__, ibdev); + dev = to_iwch_dev(ibdev); + ret = dev->rdev.t3cdev_p->ctl(dev->rdev.t3cdev_p, RDMA_GET_MIB, &m); + if (ret) + return -ENOSYS; + + memset(stats, 0, sizeof *stats); + stats->iw.ipInReceives = ((u64) m.ipInReceive_hi << 32) + + m.ipInReceive_lo; + stats->iw.ipInHdrErrors = ((u64) m.ipInHdrErrors_hi << 32) + + m.ipInHdrErrors_lo; + stats->iw.ipInAddrErrors = ((u64) m.ipInAddrErrors_hi << 32) + + m.ipInAddrErrors_lo; + stats->iw.ipInUnknownProtos = ((u64) m.ipInUnknownProtos_hi << 32) + + m.ipInUnknownProtos_lo; + stats->iw.ipInDiscards = ((u64) m.ipInDiscards_hi << 32) + + m.ipInDiscards_lo; + stats->iw.ipInDelivers = ((u64) m.ipInDelivers_hi << 32) + + m.ipInDelivers_lo; + stats->iw.ipOutRequests = ((u64) m.ipOutRequests_hi << 32) + + m.ipOutRequests_lo; + stats->iw.ipOutDiscards = ((u64) m.ipOutDiscards_hi << 32) + + m.ipOutDiscards_lo; + stats->iw.ipOutNoRoutes = ((u64) m.ipOutNoRoutes_hi << 32) + + m.ipOutNoRoutes_lo; + stats->iw.ipReasmTimeout = (u64) m.ipReasmTimeout; + stats->iw.ipReasmReqds = (u64) m.ipReasmReqds; + stats->iw.ipReasmOKs = (u64) m.ipReasmOKs; + stats->iw.ipReasmFails = (u64) m.ipReasmFails; + stats->iw.tcpActiveOpens = (u64) m.tcpActiveOpens; + stats->iw.tcpPassiveOpens = (u64) m.tcpPassiveOpens; + stats->iw.tcpAttemptFails = (u64) m.tcpAttemptFails; + stats->iw.tcpEstabResets = (u64) m.tcpEstabResets; + stats->iw.tcpOutRsts = (u64) m.tcpOutRsts; + stats->iw.tcpCurrEstab = (u64) m.tcpCurrEstab; + stats->iw.tcpInSegs = ((u64) m.tcpInSegs_hi << 32) + + m.tcpInSegs_lo; + stats->iw.tcpOutSegs = ((u64) m.tcpOutSegs_hi << 32) + + m.tcpOutSegs_lo; + stats->iw.tcpRetransSegs = ((u64) m.tcpRetransSeg_hi << 32) + + m.tcpRetransSeg_lo; + stats->iw.tcpInErrs = ((u64) m.tcpInErrs_hi << 32) + + m.tcpInErrs_lo; + stats->iw.tcpRtoMin = (u64) m.tcpRtoMin; + stats->iw.tcpRtoMax = (u64) m.tcpRtoMax; + return 0; +} + +static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL); +static DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL); +static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL); +static DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL); + +static struct device_attribute *iwch_class_attributes[] = { + &dev_attr_hw_rev, + &dev_attr_fw_ver, + &dev_attr_hca_type, + &dev_attr_board_id, +}; + +int iwch_register_device(struct iwch_dev *dev) +{ + int ret; + int i; + + PDBG("%s iwch_dev %p\n", __func__, dev); + strlcpy(dev->ibdev.name, "cxgb3_%d", IB_DEVICE_NAME_MAX); + memset(&dev->ibdev.node_guid, 0, sizeof(dev->ibdev.node_guid)); + memcpy(&dev->ibdev.node_guid, dev->rdev.t3cdev_p->lldev->dev_addr, 6); + dev->ibdev.owner = THIS_MODULE; + dev->device_cap_flags = IB_DEVICE_LOCAL_DMA_LKEY | + IB_DEVICE_MEM_WINDOW | + IB_DEVICE_MEM_MGT_EXTENSIONS; + + /* cxgb3 supports STag 0. */ + dev->ibdev.local_dma_lkey = 0; + + dev->ibdev.uverbs_cmd_mask = + (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | + (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) | + (1ull << IB_USER_VERBS_CMD_QUERY_PORT) | + (1ull << IB_USER_VERBS_CMD_ALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_REG_MR) | + (1ull << IB_USER_VERBS_CMD_DEREG_MR) | + (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | + (1ull << IB_USER_VERBS_CMD_CREATE_CQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) | + (1ull << IB_USER_VERBS_CMD_REQ_NOTIFY_CQ) | + (1ull << IB_USER_VERBS_CMD_CREATE_QP) | + (1ull << IB_USER_VERBS_CMD_MODIFY_QP) | + (1ull << IB_USER_VERBS_CMD_POLL_CQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_QP) | + (1ull << IB_USER_VERBS_CMD_POST_SEND) | + (1ull << IB_USER_VERBS_CMD_POST_RECV); + dev->ibdev.node_type = RDMA_NODE_RNIC; + memcpy(dev->ibdev.node_desc, IWCH_NODE_DESC, sizeof(IWCH_NODE_DESC)); + dev->ibdev.phys_port_cnt = dev->rdev.port_info.nports; + dev->ibdev.num_comp_vectors = 1; + dev->ibdev.dma_device = &(dev->rdev.rnic_info.pdev->dev); + dev->ibdev.query_device = iwch_query_device; + dev->ibdev.query_port = iwch_query_port; + dev->ibdev.query_pkey = iwch_query_pkey; + dev->ibdev.query_gid = iwch_query_gid; + dev->ibdev.alloc_ucontext = iwch_alloc_ucontext; + dev->ibdev.dealloc_ucontext = iwch_dealloc_ucontext; + dev->ibdev.mmap = iwch_mmap; + dev->ibdev.alloc_pd = iwch_allocate_pd; + dev->ibdev.dealloc_pd = iwch_deallocate_pd; + dev->ibdev.create_ah = iwch_ah_create; + dev->ibdev.destroy_ah = iwch_ah_destroy; + dev->ibdev.create_qp = iwch_create_qp; + dev->ibdev.modify_qp = iwch_ib_modify_qp; + dev->ibdev.destroy_qp = iwch_destroy_qp; + dev->ibdev.create_cq = iwch_create_cq; + dev->ibdev.destroy_cq = iwch_destroy_cq; + dev->ibdev.resize_cq = iwch_resize_cq; + dev->ibdev.poll_cq = iwch_poll_cq; + dev->ibdev.get_dma_mr = iwch_get_dma_mr; + dev->ibdev.reg_phys_mr = iwch_register_phys_mem; + dev->ibdev.rereg_phys_mr = iwch_reregister_phys_mem; + dev->ibdev.reg_user_mr = iwch_reg_user_mr; + dev->ibdev.dereg_mr = iwch_dereg_mr; + dev->ibdev.alloc_mw = iwch_alloc_mw; + dev->ibdev.bind_mw = iwch_bind_mw; + dev->ibdev.dealloc_mw = iwch_dealloc_mw; + dev->ibdev.alloc_fast_reg_mr = iwch_alloc_fast_reg_mr; + dev->ibdev.alloc_fast_reg_page_list = iwch_alloc_fastreg_pbl; + dev->ibdev.free_fast_reg_page_list = iwch_free_fastreg_pbl; + dev->ibdev.attach_mcast = iwch_multicast_attach; + dev->ibdev.detach_mcast = iwch_multicast_detach; + dev->ibdev.process_mad = iwch_process_mad; + dev->ibdev.req_notify_cq = iwch_arm_cq; + dev->ibdev.post_send = iwch_post_send; + dev->ibdev.post_recv = iwch_post_receive; + dev->ibdev.get_protocol_stats = iwch_get_mib; + dev->ibdev.uverbs_abi_ver = IWCH_UVERBS_ABI_VERSION; + + dev->ibdev.iwcm = kmalloc(sizeof(struct iw_cm_verbs), GFP_KERNEL); + if (!dev->ibdev.iwcm) + return -ENOMEM; + + dev->ibdev.iwcm->connect = iwch_connect; + dev->ibdev.iwcm->accept = iwch_accept_cr; + dev->ibdev.iwcm->reject = iwch_reject_cr; + dev->ibdev.iwcm->create_listen = iwch_create_listen; + dev->ibdev.iwcm->destroy_listen = iwch_destroy_listen; + dev->ibdev.iwcm->add_ref = iwch_qp_add_ref; + dev->ibdev.iwcm->rem_ref = iwch_qp_rem_ref; + dev->ibdev.iwcm->get_qp = iwch_get_qp; + + ret = ib_register_device(&dev->ibdev, NULL); + if (ret) + goto bail1; + + for (i = 0; i < ARRAY_SIZE(iwch_class_attributes); ++i) { + ret = device_create_file(&dev->ibdev.dev, + iwch_class_attributes[i]); + if (ret) { + goto bail2; + } + } + return 0; +bail2: + ib_unregister_device(&dev->ibdev); +bail1: + kfree(dev->ibdev.iwcm); + return ret; +} + +void iwch_unregister_device(struct iwch_dev *dev) +{ + int i; + + PDBG("%s iwch_dev %p\n", __func__, dev); + for (i = 0; i < ARRAY_SIZE(iwch_class_attributes); ++i) + device_remove_file(&dev->ibdev.dev, + iwch_class_attributes[i]); + ib_unregister_device(&dev->ibdev); + kfree(dev->ibdev.iwcm); + return; +} diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.h b/drivers/infiniband/hw/cxgb3/iwch_provider.h new file mode 100644 index 0000000..87c14b0 --- /dev/null +++ b/drivers/infiniband/hw/cxgb3/iwch_provider.h @@ -0,0 +1,360 @@ +/* + * Copyright (c) 2006 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __IWCH_PROVIDER_H__ +#define __IWCH_PROVIDER_H__ + +#include +#include +#include +#include +#include "t3cdev.h" +#include "iwch.h" +#include "cxio_wr.h" +#include "cxio_hal.h" + +struct iwch_pd { + struct ib_pd ibpd; + u32 pdid; + struct iwch_dev *rhp; +}; + +static inline struct iwch_pd *to_iwch_pd(struct ib_pd *ibpd) +{ + return container_of(ibpd, struct iwch_pd, ibpd); +} + +struct tpt_attributes { + u32 stag; + u32 state:1; + u32 type:2; + u32 rsvd:1; + enum tpt_mem_perm perms; + u32 remote_invaliate_disable:1; + u32 zbva:1; + u32 mw_bind_enable:1; + u32 page_size:5; + + u32 pdid; + u32 qpid; + u32 pbl_addr; + u32 len; + u64 va_fbo; + u32 pbl_size; +}; + +struct iwch_mr { + struct ib_mr ibmr; + struct ib_umem *umem; + struct iwch_dev *rhp; + u64 kva; + struct tpt_attributes attr; +}; + +typedef struct iwch_mw iwch_mw_handle; + +static inline struct iwch_mr *to_iwch_mr(struct ib_mr *ibmr) +{ + return container_of(ibmr, struct iwch_mr, ibmr); +} + +struct iwch_mw { + struct ib_mw ibmw; + struct iwch_dev *rhp; + u64 kva; + struct tpt_attributes attr; +}; + +static inline struct iwch_mw *to_iwch_mw(struct ib_mw *ibmw) +{ + return container_of(ibmw, struct iwch_mw, ibmw); +} + +struct iwch_cq { + struct ib_cq ibcq; + struct iwch_dev *rhp; + struct t3_cq cq; + spinlock_t lock; + spinlock_t comp_handler_lock; + atomic_t refcnt; + wait_queue_head_t wait; + u32 __user *user_rptr_addr; +}; + +static inline struct iwch_cq *to_iwch_cq(struct ib_cq *ibcq) +{ + return container_of(ibcq, struct iwch_cq, ibcq); +} + +enum IWCH_QP_FLAGS { + QP_QUIESCED = 0x01 +}; + +struct iwch_mpa_attributes { + u8 initiator; + u8 recv_marker_enabled; + u8 xmit_marker_enabled; /* iWARP: enable inbound Read Resp. */ + u8 crc_enabled; + u8 version; /* 0 or 1 */ +}; + +struct iwch_qp_attributes { + u32 scq; + u32 rcq; + u32 sq_num_entries; + u32 rq_num_entries; + u32 sq_max_sges; + u32 sq_max_sges_rdma_write; + u32 rq_max_sges; + u32 state; + u8 enable_rdma_read; + u8 enable_rdma_write; /* enable inbound Read Resp. */ + u8 enable_bind; + u8 enable_mmid0_fastreg; /* Enable STAG0 + Fast-register */ + /* + * Next QP state. If specify the current state, only the + * QP attributes will be modified. + */ + u32 max_ord; + u32 max_ird; + u32 pd; /* IN */ + u32 next_state; + char terminate_buffer[52]; + u32 terminate_msg_len; + u8 is_terminate_local; + struct iwch_mpa_attributes mpa_attr; /* IN-OUT */ + struct iwch_ep *llp_stream_handle; + char *stream_msg_buf; /* Last stream msg. before Idle -> RTS */ + u32 stream_msg_buf_len; /* Only on Idle -> RTS */ +}; + +struct iwch_qp { + struct ib_qp ibqp; + struct iwch_dev *rhp; + struct iwch_ep *ep; + struct iwch_qp_attributes attr; + struct t3_wq wq; + spinlock_t lock; + atomic_t refcnt; + wait_queue_head_t wait; + enum IWCH_QP_FLAGS flags; + struct timer_list timer; +}; + +static inline int qp_quiesced(struct iwch_qp *qhp) +{ + return qhp->flags & QP_QUIESCED; +} + +static inline struct iwch_qp *to_iwch_qp(struct ib_qp *ibqp) +{ + return container_of(ibqp, struct iwch_qp, ibqp); +} + +void iwch_qp_add_ref(struct ib_qp *qp); +void iwch_qp_rem_ref(struct ib_qp *qp); + +struct iwch_ucontext { + struct ib_ucontext ibucontext; + struct cxio_ucontext uctx; + u32 key; + spinlock_t mmap_lock; + struct list_head mmaps; +}; + +static inline struct iwch_ucontext *to_iwch_ucontext(struct ib_ucontext *c) +{ + return container_of(c, struct iwch_ucontext, ibucontext); +} + +struct iwch_mm_entry { + struct list_head entry; + u64 addr; + u32 key; + unsigned len; +}; + +static inline struct iwch_mm_entry *remove_mmap(struct iwch_ucontext *ucontext, + u32 key, unsigned len) +{ + struct list_head *pos, *nxt; + struct iwch_mm_entry *mm; + + spin_lock(&ucontext->mmap_lock); + list_for_each_safe(pos, nxt, &ucontext->mmaps) { + + mm = list_entry(pos, struct iwch_mm_entry, entry); + if (mm->key == key && mm->len == len) { + list_del_init(&mm->entry); + spin_unlock(&ucontext->mmap_lock); + PDBG("%s key 0x%x addr 0x%llx len %d\n", __func__, + key, (unsigned long long) mm->addr, mm->len); + return mm; + } + } + spin_unlock(&ucontext->mmap_lock); + return NULL; +} + +static inline void insert_mmap(struct iwch_ucontext *ucontext, + struct iwch_mm_entry *mm) +{ + spin_lock(&ucontext->mmap_lock); + PDBG("%s key 0x%x addr 0x%llx len %d\n", __func__, + mm->key, (unsigned long long) mm->addr, mm->len); + list_add_tail(&mm->entry, &ucontext->mmaps); + spin_unlock(&ucontext->mmap_lock); +} + +enum iwch_qp_attr_mask { + IWCH_QP_ATTR_NEXT_STATE = 1 << 0, + IWCH_QP_ATTR_ENABLE_RDMA_READ = 1 << 7, + IWCH_QP_ATTR_ENABLE_RDMA_WRITE = 1 << 8, + IWCH_QP_ATTR_ENABLE_RDMA_BIND = 1 << 9, + IWCH_QP_ATTR_MAX_ORD = 1 << 11, + IWCH_QP_ATTR_MAX_IRD = 1 << 12, + IWCH_QP_ATTR_LLP_STREAM_HANDLE = 1 << 22, + IWCH_QP_ATTR_STREAM_MSG_BUFFER = 1 << 23, + IWCH_QP_ATTR_MPA_ATTR = 1 << 24, + IWCH_QP_ATTR_QP_CONTEXT_ACTIVATE = 1 << 25, + IWCH_QP_ATTR_VALID_MODIFY = (IWCH_QP_ATTR_ENABLE_RDMA_READ | + IWCH_QP_ATTR_ENABLE_RDMA_WRITE | + IWCH_QP_ATTR_MAX_ORD | + IWCH_QP_ATTR_MAX_IRD | + IWCH_QP_ATTR_LLP_STREAM_HANDLE | + IWCH_QP_ATTR_STREAM_MSG_BUFFER | + IWCH_QP_ATTR_MPA_ATTR | + IWCH_QP_ATTR_QP_CONTEXT_ACTIVATE) +}; + +int iwch_modify_qp(struct iwch_dev *rhp, + struct iwch_qp *qhp, + enum iwch_qp_attr_mask mask, + struct iwch_qp_attributes *attrs, + int internal); + +enum iwch_qp_state { + IWCH_QP_STATE_IDLE, + IWCH_QP_STATE_RTS, + IWCH_QP_STATE_ERROR, + IWCH_QP_STATE_TERMINATE, + IWCH_QP_STATE_CLOSING, + IWCH_QP_STATE_TOT +}; + +static inline int iwch_convert_state(enum ib_qp_state ib_state) +{ + switch (ib_state) { + case IB_QPS_RESET: + case IB_QPS_INIT: + return IWCH_QP_STATE_IDLE; + case IB_QPS_RTS: + return IWCH_QP_STATE_RTS; + case IB_QPS_SQD: + return IWCH_QP_STATE_CLOSING; + case IB_QPS_SQE: + return IWCH_QP_STATE_TERMINATE; + case IB_QPS_ERR: + return IWCH_QP_STATE_ERROR; + default: + return -1; + } +} + +static inline u32 iwch_ib_to_tpt_access(int acc) +{ + return (acc & IB_ACCESS_REMOTE_WRITE ? TPT_REMOTE_WRITE : 0) | + (acc & IB_ACCESS_REMOTE_READ ? TPT_REMOTE_READ : 0) | + (acc & IB_ACCESS_LOCAL_WRITE ? TPT_LOCAL_WRITE : 0) | + (acc & IB_ACCESS_MW_BIND ? TPT_MW_BIND : 0) | + TPT_LOCAL_READ; +} + +static inline u32 iwch_ib_to_tpt_bind_access(int acc) +{ + return (acc & IB_ACCESS_REMOTE_WRITE ? TPT_REMOTE_WRITE : 0) | + (acc & IB_ACCESS_REMOTE_READ ? TPT_REMOTE_READ : 0); +} + +enum iwch_mmid_state { + IWCH_STAG_STATE_VALID, + IWCH_STAG_STATE_INVALID +}; + +enum iwch_qp_query_flags { + IWCH_QP_QUERY_CONTEXT_NONE = 0x0, /* No ctx; Only attrs */ + IWCH_QP_QUERY_CONTEXT_GET = 0x1, /* Get ctx + attrs */ + IWCH_QP_QUERY_CONTEXT_SUSPEND = 0x2, /* Not Supported */ + + /* + * Quiesce QP context; Consumer + * will NOT replay outstanding WR + */ + IWCH_QP_QUERY_CONTEXT_QUIESCE = 0x4, + IWCH_QP_QUERY_CONTEXT_REMOVE = 0x8, + IWCH_QP_QUERY_TEST_USERWRITE = 0x32 /* Test special */ +}; + +u16 iwch_rqes_posted(struct iwch_qp *qhp); +int iwch_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, + struct ib_send_wr **bad_wr); +int iwch_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr); +int iwch_bind_mw(struct ib_qp *qp, + struct ib_mw *mw, + struct ib_mw_bind *mw_bind); +int iwch_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc); +int iwch_post_terminate(struct iwch_qp *qhp, struct respQ_msg_t *rsp_msg); +int iwch_post_zb_read(struct iwch_ep *ep); +int iwch_register_device(struct iwch_dev *dev); +void iwch_unregister_device(struct iwch_dev *dev); +void stop_read_rep_timer(struct iwch_qp *qhp); +int iwch_register_mem(struct iwch_dev *rhp, struct iwch_pd *php, + struct iwch_mr *mhp, int shift); +int iwch_reregister_mem(struct iwch_dev *rhp, struct iwch_pd *php, + struct iwch_mr *mhp, + int shift, + int npages); +int iwch_alloc_pbl(struct iwch_mr *mhp, int npages); +void iwch_free_pbl(struct iwch_mr *mhp); +int iwch_write_pbl(struct iwch_mr *mhp, __be64 *pages, int npages, int offset); +int build_phys_page_list(struct ib_phys_buf *buffer_list, + int num_phys_buf, + u64 *iova_start, + u64 *total_size, + int *npages, + int *shift, + __be64 **page_list); + + +#define IWCH_NODE_DESC "cxgb3 Chelsio Communications" + +#endif diff --git a/drivers/infiniband/hw/cxgb3/iwch_qp.c b/drivers/infiniband/hw/cxgb3/iwch_qp.c new file mode 100644 index 0000000..b57c0be --- /dev/null +++ b/drivers/infiniband/hw/cxgb3/iwch_qp.c @@ -0,0 +1,1163 @@ +/* + * Copyright (c) 2006 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include +#include +#include "iwch_provider.h" +#include "iwch.h" +#include "iwch_cm.h" +#include "cxio_hal.h" +#include "cxio_resource.h" + +#define NO_SUPPORT -1 + +static int build_rdma_send(union t3_wr *wqe, struct ib_send_wr *wr, + u8 * flit_cnt) +{ + int i; + u32 plen; + + switch (wr->opcode) { + case IB_WR_SEND: + if (wr->send_flags & IB_SEND_SOLICITED) + wqe->send.rdmaop = T3_SEND_WITH_SE; + else + wqe->send.rdmaop = T3_SEND; + wqe->send.rem_stag = 0; + break; + case IB_WR_SEND_WITH_INV: + if (wr->send_flags & IB_SEND_SOLICITED) + wqe->send.rdmaop = T3_SEND_WITH_SE_INV; + else + wqe->send.rdmaop = T3_SEND_WITH_INV; + wqe->send.rem_stag = cpu_to_be32(wr->ex.invalidate_rkey); + break; + default: + return -EINVAL; + } + if (wr->num_sge > T3_MAX_SGE) + return -EINVAL; + wqe->send.reserved[0] = 0; + wqe->send.reserved[1] = 0; + wqe->send.reserved[2] = 0; + plen = 0; + for (i = 0; i < wr->num_sge; i++) { + if ((plen + wr->sg_list[i].length) < plen) + return -EMSGSIZE; + + plen += wr->sg_list[i].length; + wqe->send.sgl[i].stag = cpu_to_be32(wr->sg_list[i].lkey); + wqe->send.sgl[i].len = cpu_to_be32(wr->sg_list[i].length); + wqe->send.sgl[i].to = cpu_to_be64(wr->sg_list[i].addr); + } + wqe->send.num_sgle = cpu_to_be32(wr->num_sge); + *flit_cnt = 4 + ((wr->num_sge) << 1); + wqe->send.plen = cpu_to_be32(plen); + return 0; +} + +static int build_rdma_write(union t3_wr *wqe, struct ib_send_wr *wr, + u8 *flit_cnt) +{ + int i; + u32 plen; + if (wr->num_sge > T3_MAX_SGE) + return -EINVAL; + wqe->write.rdmaop = T3_RDMA_WRITE; + wqe->write.reserved[0] = 0; + wqe->write.reserved[1] = 0; + wqe->write.reserved[2] = 0; + wqe->write.stag_sink = cpu_to_be32(wr->wr.rdma.rkey); + wqe->write.to_sink = cpu_to_be64(wr->wr.rdma.remote_addr); + + if (wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM) { + plen = 4; + wqe->write.sgl[0].stag = wr->ex.imm_data; + wqe->write.sgl[0].len = cpu_to_be32(0); + wqe->write.num_sgle = cpu_to_be32(0); + *flit_cnt = 6; + } else { + plen = 0; + for (i = 0; i < wr->num_sge; i++) { + if ((plen + wr->sg_list[i].length) < plen) { + return -EMSGSIZE; + } + plen += wr->sg_list[i].length; + wqe->write.sgl[i].stag = + cpu_to_be32(wr->sg_list[i].lkey); + wqe->write.sgl[i].len = + cpu_to_be32(wr->sg_list[i].length); + wqe->write.sgl[i].to = + cpu_to_be64(wr->sg_list[i].addr); + } + wqe->write.num_sgle = cpu_to_be32(wr->num_sge); + *flit_cnt = 5 + ((wr->num_sge) << 1); + } + wqe->write.plen = cpu_to_be32(plen); + return 0; +} + +static int build_rdma_read(union t3_wr *wqe, struct ib_send_wr *wr, + u8 *flit_cnt) +{ + if (wr->num_sge > 1) + return -EINVAL; + wqe->read.rdmaop = T3_READ_REQ; + if (wr->opcode == IB_WR_RDMA_READ_WITH_INV) + wqe->read.local_inv = 1; + else + wqe->read.local_inv = 0; + wqe->read.reserved[0] = 0; + wqe->read.reserved[1] = 0; + wqe->read.rem_stag = cpu_to_be32(wr->wr.rdma.rkey); + wqe->read.rem_to = cpu_to_be64(wr->wr.rdma.remote_addr); + wqe->read.local_stag = cpu_to_be32(wr->sg_list[0].lkey); + wqe->read.local_len = cpu_to_be32(wr->sg_list[0].length); + wqe->read.local_to = cpu_to_be64(wr->sg_list[0].addr); + *flit_cnt = sizeof(struct t3_rdma_read_wr) >> 3; + return 0; +} + +static int build_fastreg(union t3_wr *wqe, struct ib_send_wr *wr, + u8 *flit_cnt, int *wr_cnt, struct t3_wq *wq) +{ + int i; + __be64 *p; + + if (wr->wr.fast_reg.page_list_len > T3_MAX_FASTREG_DEPTH) + return -EINVAL; + *wr_cnt = 1; + wqe->fastreg.stag = cpu_to_be32(wr->wr.fast_reg.rkey); + wqe->fastreg.len = cpu_to_be32(wr->wr.fast_reg.length); + wqe->fastreg.va_base_hi = cpu_to_be32(wr->wr.fast_reg.iova_start >> 32); + wqe->fastreg.va_base_lo_fbo = + cpu_to_be32(wr->wr.fast_reg.iova_start & 0xffffffff); + wqe->fastreg.page_type_perms = cpu_to_be32( + V_FR_PAGE_COUNT(wr->wr.fast_reg.page_list_len) | + V_FR_PAGE_SIZE(wr->wr.fast_reg.page_shift-12) | + V_FR_TYPE(TPT_VATO) | + V_FR_PERMS(iwch_ib_to_tpt_access(wr->wr.fast_reg.access_flags))); + p = &wqe->fastreg.pbl_addrs[0]; + for (i = 0; i < wr->wr.fast_reg.page_list_len; i++, p++) { + + /* If we need a 2nd WR, then set it up */ + if (i == T3_MAX_FASTREG_FRAG) { + *wr_cnt = 2; + wqe = (union t3_wr *)(wq->queue + + Q_PTR2IDX((wq->wptr+1), wq->size_log2)); + build_fw_riwrh((void *)wqe, T3_WR_FASTREG, 0, + Q_GENBIT(wq->wptr + 1, wq->size_log2), + 0, 1 + wr->wr.fast_reg.page_list_len - T3_MAX_FASTREG_FRAG, + T3_EOP); + + p = &wqe->pbl_frag.pbl_addrs[0]; + } + *p = cpu_to_be64((u64)wr->wr.fast_reg.page_list->page_list[i]); + } + *flit_cnt = 5 + wr->wr.fast_reg.page_list_len; + if (*flit_cnt > 15) + *flit_cnt = 15; + return 0; +} + +static int build_inv_stag(union t3_wr *wqe, struct ib_send_wr *wr, + u8 *flit_cnt) +{ + wqe->local_inv.stag = cpu_to_be32(wr->ex.invalidate_rkey); + wqe->local_inv.reserved = 0; + *flit_cnt = sizeof(struct t3_local_inv_wr) >> 3; + return 0; +} + +static int iwch_sgl2pbl_map(struct iwch_dev *rhp, struct ib_sge *sg_list, + u32 num_sgle, u32 * pbl_addr, u8 * page_size) +{ + int i; + struct iwch_mr *mhp; + u64 offset; + for (i = 0; i < num_sgle; i++) { + + mhp = get_mhp(rhp, (sg_list[i].lkey) >> 8); + if (!mhp) { + PDBG("%s %d\n", __func__, __LINE__); + return -EIO; + } + if (!mhp->attr.state) { + PDBG("%s %d\n", __func__, __LINE__); + return -EIO; + } + if (mhp->attr.zbva) { + PDBG("%s %d\n", __func__, __LINE__); + return -EIO; + } + + if (sg_list[i].addr < mhp->attr.va_fbo) { + PDBG("%s %d\n", __func__, __LINE__); + return -EINVAL; + } + if (sg_list[i].addr + ((u64) sg_list[i].length) < + sg_list[i].addr) { + PDBG("%s %d\n", __func__, __LINE__); + return -EINVAL; + } + if (sg_list[i].addr + ((u64) sg_list[i].length) > + mhp->attr.va_fbo + ((u64) mhp->attr.len)) { + PDBG("%s %d\n", __func__, __LINE__); + return -EINVAL; + } + offset = sg_list[i].addr - mhp->attr.va_fbo; + offset += mhp->attr.va_fbo & + ((1UL << (12 + mhp->attr.page_size)) - 1); + pbl_addr[i] = ((mhp->attr.pbl_addr - + rhp->rdev.rnic_info.pbl_base) >> 3) + + (offset >> (12 + mhp->attr.page_size)); + page_size[i] = mhp->attr.page_size; + } + return 0; +} + +static int build_rdma_recv(struct iwch_qp *qhp, union t3_wr *wqe, + struct ib_recv_wr *wr) +{ + int i, err = 0; + u32 pbl_addr[T3_MAX_SGE]; + u8 page_size[T3_MAX_SGE]; + + err = iwch_sgl2pbl_map(qhp->rhp, wr->sg_list, wr->num_sge, pbl_addr, + page_size); + if (err) + return err; + wqe->recv.pagesz[0] = page_size[0]; + wqe->recv.pagesz[1] = page_size[1]; + wqe->recv.pagesz[2] = page_size[2]; + wqe->recv.pagesz[3] = page_size[3]; + wqe->recv.num_sgle = cpu_to_be32(wr->num_sge); + for (i = 0; i < wr->num_sge; i++) { + wqe->recv.sgl[i].stag = cpu_to_be32(wr->sg_list[i].lkey); + wqe->recv.sgl[i].len = cpu_to_be32(wr->sg_list[i].length); + + /* to in the WQE == the offset into the page */ + wqe->recv.sgl[i].to = cpu_to_be64(((u32)wr->sg_list[i].addr) & + ((1UL << (12 + page_size[i])) - 1)); + + /* pbl_addr is the adapters address in the PBL */ + wqe->recv.pbl_addr[i] = cpu_to_be32(pbl_addr[i]); + } + for (; i < T3_MAX_SGE; i++) { + wqe->recv.sgl[i].stag = 0; + wqe->recv.sgl[i].len = 0; + wqe->recv.sgl[i].to = 0; + wqe->recv.pbl_addr[i] = 0; + } + qhp->wq.rq[Q_PTR2IDX(qhp->wq.rq_wptr, + qhp->wq.rq_size_log2)].wr_id = wr->wr_id; + qhp->wq.rq[Q_PTR2IDX(qhp->wq.rq_wptr, + qhp->wq.rq_size_log2)].pbl_addr = 0; + return 0; +} + +static int build_zero_stag_recv(struct iwch_qp *qhp, union t3_wr *wqe, + struct ib_recv_wr *wr) +{ + int i; + u32 pbl_addr; + u32 pbl_offset; + + + /* + * The T3 HW requires the PBL in the HW recv descriptor to reference + * a PBL entry. So we allocate the max needed PBL memory here and pass + * it to the uP in the recv WR. The uP will build the PBL and setup + * the HW recv descriptor. + */ + pbl_addr = cxio_hal_pblpool_alloc(&qhp->rhp->rdev, T3_STAG0_PBL_SIZE); + if (!pbl_addr) + return -ENOMEM; + + /* + * Compute the 8B aligned offset. + */ + pbl_offset = (pbl_addr - qhp->rhp->rdev.rnic_info.pbl_base) >> 3; + + wqe->recv.num_sgle = cpu_to_be32(wr->num_sge); + + for (i = 0; i < wr->num_sge; i++) { + + /* + * Use a 128MB page size. This and an imposed 128MB + * sge length limit allows us to require only a 2-entry HW + * PBL for each SGE. This restriction is acceptable since + * since it is not possible to allocate 128MB of contiguous + * DMA coherent memory! + */ + if (wr->sg_list[i].length > T3_STAG0_MAX_PBE_LEN) + return -EINVAL; + wqe->recv.pagesz[i] = T3_STAG0_PAGE_SHIFT; + + /* + * T3 restricts a recv to all zero-stag or all non-zero-stag. + */ + if (wr->sg_list[i].lkey != 0) + return -EINVAL; + wqe->recv.sgl[i].stag = 0; + wqe->recv.sgl[i].len = cpu_to_be32(wr->sg_list[i].length); + wqe->recv.sgl[i].to = cpu_to_be64(wr->sg_list[i].addr); + wqe->recv.pbl_addr[i] = cpu_to_be32(pbl_offset); + pbl_offset += 2; + } + for (; i < T3_MAX_SGE; i++) { + wqe->recv.pagesz[i] = 0; + wqe->recv.sgl[i].stag = 0; + wqe->recv.sgl[i].len = 0; + wqe->recv.sgl[i].to = 0; + wqe->recv.pbl_addr[i] = 0; + } + qhp->wq.rq[Q_PTR2IDX(qhp->wq.rq_wptr, + qhp->wq.rq_size_log2)].wr_id = wr->wr_id; + qhp->wq.rq[Q_PTR2IDX(qhp->wq.rq_wptr, + qhp->wq.rq_size_log2)].pbl_addr = pbl_addr; + return 0; +} + +int iwch_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, + struct ib_send_wr **bad_wr) +{ + int err = 0; + u8 uninitialized_var(t3_wr_flit_cnt); + enum t3_wr_opcode t3_wr_opcode = 0; + enum t3_wr_flags t3_wr_flags; + struct iwch_qp *qhp; + u32 idx; + union t3_wr *wqe; + u32 num_wrs; + unsigned long flag; + struct t3_swsq *sqp; + int wr_cnt = 1; + + qhp = to_iwch_qp(ibqp); + spin_lock_irqsave(&qhp->lock, flag); + if (qhp->attr.state > IWCH_QP_STATE_RTS) { + spin_unlock_irqrestore(&qhp->lock, flag); + err = -EINVAL; + goto out; + } + num_wrs = Q_FREECNT(qhp->wq.sq_rptr, qhp->wq.sq_wptr, + qhp->wq.sq_size_log2); + if (num_wrs == 0) { + spin_unlock_irqrestore(&qhp->lock, flag); + err = -ENOMEM; + goto out; + } + while (wr) { + if (num_wrs == 0) { + err = -ENOMEM; + break; + } + idx = Q_PTR2IDX(qhp->wq.wptr, qhp->wq.size_log2); + wqe = (union t3_wr *) (qhp->wq.queue + idx); + t3_wr_flags = 0; + if (wr->send_flags & IB_SEND_SOLICITED) + t3_wr_flags |= T3_SOLICITED_EVENT_FLAG; + if (wr->send_flags & IB_SEND_SIGNALED) + t3_wr_flags |= T3_COMPLETION_FLAG; + sqp = qhp->wq.sq + + Q_PTR2IDX(qhp->wq.sq_wptr, qhp->wq.sq_size_log2); + switch (wr->opcode) { + case IB_WR_SEND: + case IB_WR_SEND_WITH_INV: + if (wr->send_flags & IB_SEND_FENCE) + t3_wr_flags |= T3_READ_FENCE_FLAG; + t3_wr_opcode = T3_WR_SEND; + err = build_rdma_send(wqe, wr, &t3_wr_flit_cnt); + break; + case IB_WR_RDMA_WRITE: + case IB_WR_RDMA_WRITE_WITH_IMM: + t3_wr_opcode = T3_WR_WRITE; + err = build_rdma_write(wqe, wr, &t3_wr_flit_cnt); + break; + case IB_WR_RDMA_READ: + case IB_WR_RDMA_READ_WITH_INV: + t3_wr_opcode = T3_WR_READ; + t3_wr_flags = 0; /* T3 reads are always signaled */ + err = build_rdma_read(wqe, wr, &t3_wr_flit_cnt); + if (err) + break; + sqp->read_len = wqe->read.local_len; + if (!qhp->wq.oldest_read) + qhp->wq.oldest_read = sqp; + break; + case IB_WR_FAST_REG_MR: + t3_wr_opcode = T3_WR_FASTREG; + err = build_fastreg(wqe, wr, &t3_wr_flit_cnt, + &wr_cnt, &qhp->wq); + break; + case IB_WR_LOCAL_INV: + if (wr->send_flags & IB_SEND_FENCE) + t3_wr_flags |= T3_LOCAL_FENCE_FLAG; + t3_wr_opcode = T3_WR_INV_STAG; + err = build_inv_stag(wqe, wr, &t3_wr_flit_cnt); + break; + default: + PDBG("%s post of type=%d TBD!\n", __func__, + wr->opcode); + err = -EINVAL; + } + if (err) + break; + wqe->send.wrid.id0.hi = qhp->wq.sq_wptr; + sqp->wr_id = wr->wr_id; + sqp->opcode = wr2opcode(t3_wr_opcode); + sqp->sq_wptr = qhp->wq.sq_wptr; + sqp->complete = 0; + sqp->signaled = (wr->send_flags & IB_SEND_SIGNALED); + + build_fw_riwrh((void *) wqe, t3_wr_opcode, t3_wr_flags, + Q_GENBIT(qhp->wq.wptr, qhp->wq.size_log2), + 0, t3_wr_flit_cnt, + (wr_cnt == 1) ? T3_SOPEOP : T3_SOP); + PDBG("%s cookie 0x%llx wq idx 0x%x swsq idx %ld opcode %d\n", + __func__, (unsigned long long) wr->wr_id, idx, + Q_PTR2IDX(qhp->wq.sq_wptr, qhp->wq.sq_size_log2), + sqp->opcode); + wr = wr->next; + num_wrs--; + qhp->wq.wptr += wr_cnt; + ++(qhp->wq.sq_wptr); + } + spin_unlock_irqrestore(&qhp->lock, flag); + if (cxio_wq_db_enabled(&qhp->wq)) + ring_doorbell(qhp->wq.doorbell, qhp->wq.qpid); + +out: + if (err) + *bad_wr = wr; + return err; +} + +int iwch_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr) +{ + int err = 0; + struct iwch_qp *qhp; + u32 idx; + union t3_wr *wqe; + u32 num_wrs; + unsigned long flag; + + qhp = to_iwch_qp(ibqp); + spin_lock_irqsave(&qhp->lock, flag); + if (qhp->attr.state > IWCH_QP_STATE_RTS) { + spin_unlock_irqrestore(&qhp->lock, flag); + err = -EINVAL; + goto out; + } + num_wrs = Q_FREECNT(qhp->wq.rq_rptr, qhp->wq.rq_wptr, + qhp->wq.rq_size_log2) - 1; + if (!wr) { + spin_unlock_irqrestore(&qhp->lock, flag); + err = -ENOMEM; + goto out; + } + while (wr) { + if (wr->num_sge > T3_MAX_SGE) { + err = -EINVAL; + break; + } + idx = Q_PTR2IDX(qhp->wq.wptr, qhp->wq.size_log2); + wqe = (union t3_wr *) (qhp->wq.queue + idx); + if (num_wrs) + if (wr->sg_list[0].lkey) + err = build_rdma_recv(qhp, wqe, wr); + else + err = build_zero_stag_recv(qhp, wqe, wr); + else + err = -ENOMEM; + + if (err) + break; + + build_fw_riwrh((void *) wqe, T3_WR_RCV, T3_COMPLETION_FLAG, + Q_GENBIT(qhp->wq.wptr, qhp->wq.size_log2), + 0, sizeof(struct t3_receive_wr) >> 3, T3_SOPEOP); + PDBG("%s cookie 0x%llx idx 0x%x rq_wptr 0x%x rw_rptr 0x%x " + "wqe %p \n", __func__, (unsigned long long) wr->wr_id, + idx, qhp->wq.rq_wptr, qhp->wq.rq_rptr, wqe); + ++(qhp->wq.rq_wptr); + ++(qhp->wq.wptr); + wr = wr->next; + num_wrs--; + } + spin_unlock_irqrestore(&qhp->lock, flag); + if (cxio_wq_db_enabled(&qhp->wq)) + ring_doorbell(qhp->wq.doorbell, qhp->wq.qpid); + +out: + if (err) + *bad_wr = wr; + return err; +} + +int iwch_bind_mw(struct ib_qp *qp, + struct ib_mw *mw, + struct ib_mw_bind *mw_bind) +{ + struct iwch_dev *rhp; + struct iwch_mw *mhp; + struct iwch_qp *qhp; + union t3_wr *wqe; + u32 pbl_addr; + u8 page_size; + u32 num_wrs; + unsigned long flag; + struct ib_sge sgl; + int err=0; + enum t3_wr_flags t3_wr_flags; + u32 idx; + struct t3_swsq *sqp; + + qhp = to_iwch_qp(qp); + mhp = to_iwch_mw(mw); + rhp = qhp->rhp; + + spin_lock_irqsave(&qhp->lock, flag); + if (qhp->attr.state > IWCH_QP_STATE_RTS) { + spin_unlock_irqrestore(&qhp->lock, flag); + return -EINVAL; + } + num_wrs = Q_FREECNT(qhp->wq.sq_rptr, qhp->wq.sq_wptr, + qhp->wq.sq_size_log2); + if (num_wrs == 0) { + spin_unlock_irqrestore(&qhp->lock, flag); + return -ENOMEM; + } + idx = Q_PTR2IDX(qhp->wq.wptr, qhp->wq.size_log2); + PDBG("%s: idx 0x%0x, mw 0x%p, mw_bind 0x%p\n", __func__, idx, + mw, mw_bind); + wqe = (union t3_wr *) (qhp->wq.queue + idx); + + t3_wr_flags = 0; + if (mw_bind->send_flags & IB_SEND_SIGNALED) + t3_wr_flags = T3_COMPLETION_FLAG; + + sgl.addr = mw_bind->bind_info.addr; + sgl.lkey = mw_bind->bind_info.mr->lkey; + sgl.length = mw_bind->bind_info.length; + wqe->bind.reserved = 0; + wqe->bind.type = TPT_VATO; + + /* TBD: check perms */ + wqe->bind.perms = iwch_ib_to_tpt_bind_access( + mw_bind->bind_info.mw_access_flags); + wqe->bind.mr_stag = cpu_to_be32(mw_bind->bind_info.mr->lkey); + wqe->bind.mw_stag = cpu_to_be32(mw->rkey); + wqe->bind.mw_len = cpu_to_be32(mw_bind->bind_info.length); + wqe->bind.mw_va = cpu_to_be64(mw_bind->bind_info.addr); + err = iwch_sgl2pbl_map(rhp, &sgl, 1, &pbl_addr, &page_size); + if (err) { + spin_unlock_irqrestore(&qhp->lock, flag); + return err; + } + wqe->send.wrid.id0.hi = qhp->wq.sq_wptr; + sqp = qhp->wq.sq + Q_PTR2IDX(qhp->wq.sq_wptr, qhp->wq.sq_size_log2); + sqp->wr_id = mw_bind->wr_id; + sqp->opcode = T3_BIND_MW; + sqp->sq_wptr = qhp->wq.sq_wptr; + sqp->complete = 0; + sqp->signaled = (mw_bind->send_flags & IB_SEND_SIGNALED); + wqe->bind.mr_pbl_addr = cpu_to_be32(pbl_addr); + wqe->bind.mr_pagesz = page_size; + build_fw_riwrh((void *)wqe, T3_WR_BIND, t3_wr_flags, + Q_GENBIT(qhp->wq.wptr, qhp->wq.size_log2), 0, + sizeof(struct t3_bind_mw_wr) >> 3, T3_SOPEOP); + ++(qhp->wq.wptr); + ++(qhp->wq.sq_wptr); + spin_unlock_irqrestore(&qhp->lock, flag); + + if (cxio_wq_db_enabled(&qhp->wq)) + ring_doorbell(qhp->wq.doorbell, qhp->wq.qpid); + + return err; +} + +static inline void build_term_codes(struct respQ_msg_t *rsp_msg, + u8 *layer_type, u8 *ecode) +{ + int status = TPT_ERR_INTERNAL_ERR; + int tagged = 0; + int opcode = -1; + int rqtype = 0; + int send_inv = 0; + + if (rsp_msg) { + status = CQE_STATUS(rsp_msg->cqe); + opcode = CQE_OPCODE(rsp_msg->cqe); + rqtype = RQ_TYPE(rsp_msg->cqe); + send_inv = (opcode == T3_SEND_WITH_INV) || + (opcode == T3_SEND_WITH_SE_INV); + tagged = (opcode == T3_RDMA_WRITE) || + (rqtype && (opcode == T3_READ_RESP)); + } + + switch (status) { + case TPT_ERR_STAG: + if (send_inv) { + *layer_type = LAYER_RDMAP|RDMAP_REMOTE_OP; + *ecode = RDMAP_CANT_INV_STAG; + } else { + *layer_type = LAYER_RDMAP|RDMAP_REMOTE_PROT; + *ecode = RDMAP_INV_STAG; + } + break; + case TPT_ERR_PDID: + *layer_type = LAYER_RDMAP|RDMAP_REMOTE_PROT; + if ((opcode == T3_SEND_WITH_INV) || + (opcode == T3_SEND_WITH_SE_INV)) + *ecode = RDMAP_CANT_INV_STAG; + else + *ecode = RDMAP_STAG_NOT_ASSOC; + break; + case TPT_ERR_QPID: + *layer_type = LAYER_RDMAP|RDMAP_REMOTE_PROT; + *ecode = RDMAP_STAG_NOT_ASSOC; + break; + case TPT_ERR_ACCESS: + *layer_type = LAYER_RDMAP|RDMAP_REMOTE_PROT; + *ecode = RDMAP_ACC_VIOL; + break; + case TPT_ERR_WRAP: + *layer_type = LAYER_RDMAP|RDMAP_REMOTE_PROT; + *ecode = RDMAP_TO_WRAP; + break; + case TPT_ERR_BOUND: + if (tagged) { + *layer_type = LAYER_DDP|DDP_TAGGED_ERR; + *ecode = DDPT_BASE_BOUNDS; + } else { + *layer_type = LAYER_RDMAP|RDMAP_REMOTE_PROT; + *ecode = RDMAP_BASE_BOUNDS; + } + break; + case TPT_ERR_INVALIDATE_SHARED_MR: + case TPT_ERR_INVALIDATE_MR_WITH_MW_BOUND: + *layer_type = LAYER_RDMAP|RDMAP_REMOTE_OP; + *ecode = RDMAP_CANT_INV_STAG; + break; + case TPT_ERR_ECC: + case TPT_ERR_ECC_PSTAG: + case TPT_ERR_INTERNAL_ERR: + *layer_type = LAYER_RDMAP|RDMAP_LOCAL_CATA; + *ecode = 0; + break; + case TPT_ERR_OUT_OF_RQE: + *layer_type = LAYER_DDP|DDP_UNTAGGED_ERR; + *ecode = DDPU_INV_MSN_NOBUF; + break; + case TPT_ERR_PBL_ADDR_BOUND: + *layer_type = LAYER_DDP|DDP_TAGGED_ERR; + *ecode = DDPT_BASE_BOUNDS; + break; + case TPT_ERR_CRC: + *layer_type = LAYER_MPA|DDP_LLP; + *ecode = MPA_CRC_ERR; + break; + case TPT_ERR_MARKER: + *layer_type = LAYER_MPA|DDP_LLP; + *ecode = MPA_MARKER_ERR; + break; + case TPT_ERR_PDU_LEN_ERR: + *layer_type = LAYER_DDP|DDP_UNTAGGED_ERR; + *ecode = DDPU_MSG_TOOBIG; + break; + case TPT_ERR_DDP_VERSION: + if (tagged) { + *layer_type = LAYER_DDP|DDP_TAGGED_ERR; + *ecode = DDPT_INV_VERS; + } else { + *layer_type = LAYER_DDP|DDP_UNTAGGED_ERR; + *ecode = DDPU_INV_VERS; + } + break; + case TPT_ERR_RDMA_VERSION: + *layer_type = LAYER_RDMAP|RDMAP_REMOTE_OP; + *ecode = RDMAP_INV_VERS; + break; + case TPT_ERR_OPCODE: + *layer_type = LAYER_RDMAP|RDMAP_REMOTE_OP; + *ecode = RDMAP_INV_OPCODE; + break; + case TPT_ERR_DDP_QUEUE_NUM: + *layer_type = LAYER_DDP|DDP_UNTAGGED_ERR; + *ecode = DDPU_INV_QN; + break; + case TPT_ERR_MSN: + case TPT_ERR_MSN_GAP: + case TPT_ERR_MSN_RANGE: + case TPT_ERR_IRD_OVERFLOW: + *layer_type = LAYER_DDP|DDP_UNTAGGED_ERR; + *ecode = DDPU_INV_MSN_RANGE; + break; + case TPT_ERR_TBIT: + *layer_type = LAYER_DDP|DDP_LOCAL_CATA; + *ecode = 0; + break; + case TPT_ERR_MO: + *layer_type = LAYER_DDP|DDP_UNTAGGED_ERR; + *ecode = DDPU_INV_MO; + break; + default: + *layer_type = LAYER_RDMAP|DDP_LOCAL_CATA; + *ecode = 0; + break; + } +} + +int iwch_post_zb_read(struct iwch_ep *ep) +{ + union t3_wr *wqe; + struct sk_buff *skb; + u8 flit_cnt = sizeof(struct t3_rdma_read_wr) >> 3; + + PDBG("%s enter\n", __func__); + skb = alloc_skb(40, GFP_KERNEL); + if (!skb) { + printk(KERN_ERR "%s cannot send zb_read!!\n", __func__); + return -ENOMEM; + } + wqe = (union t3_wr *)skb_put(skb, sizeof(struct t3_rdma_read_wr)); + memset(wqe, 0, sizeof(struct t3_rdma_read_wr)); + wqe->read.rdmaop = T3_READ_REQ; + wqe->read.reserved[0] = 0; + wqe->read.reserved[1] = 0; + wqe->read.rem_stag = cpu_to_be32(1); + wqe->read.rem_to = cpu_to_be64(1); + wqe->read.local_stag = cpu_to_be32(1); + wqe->read.local_len = cpu_to_be32(0); + wqe->read.local_to = cpu_to_be64(1); + wqe->send.wrh.op_seop_flags = cpu_to_be32(V_FW_RIWR_OP(T3_WR_READ)); + wqe->send.wrh.gen_tid_len = cpu_to_be32(V_FW_RIWR_TID(ep->hwtid)| + V_FW_RIWR_LEN(flit_cnt)); + skb->priority = CPL_PRIORITY_DATA; + return iwch_cxgb3_ofld_send(ep->com.qp->rhp->rdev.t3cdev_p, skb); +} + +/* + * This posts a TERMINATE with layer=RDMA, type=catastrophic. + */ +int iwch_post_terminate(struct iwch_qp *qhp, struct respQ_msg_t *rsp_msg) +{ + union t3_wr *wqe; + struct terminate_message *term; + struct sk_buff *skb; + + PDBG("%s %d\n", __func__, __LINE__); + skb = alloc_skb(40, GFP_ATOMIC); + if (!skb) { + printk(KERN_ERR "%s cannot send TERMINATE!\n", __func__); + return -ENOMEM; + } + wqe = (union t3_wr *)skb_put(skb, 40); + memset(wqe, 0, 40); + wqe->send.rdmaop = T3_TERMINATE; + + /* immediate data length */ + wqe->send.plen = htonl(4); + + /* immediate data starts here. */ + term = (struct terminate_message *)wqe->send.sgl; + build_term_codes(rsp_msg, &term->layer_etype, &term->ecode); + wqe->send.wrh.op_seop_flags = cpu_to_be32(V_FW_RIWR_OP(T3_WR_SEND) | + V_FW_RIWR_FLAGS(T3_COMPLETION_FLAG | T3_NOTIFY_FLAG)); + wqe->send.wrh.gen_tid_len = cpu_to_be32(V_FW_RIWR_TID(qhp->ep->hwtid)); + skb->priority = CPL_PRIORITY_DATA; + return iwch_cxgb3_ofld_send(qhp->rhp->rdev.t3cdev_p, skb); +} + +/* + * Assumes qhp lock is held. + */ +static void __flush_qp(struct iwch_qp *qhp, struct iwch_cq *rchp, + struct iwch_cq *schp) +{ + int count; + int flushed; + + + PDBG("%s qhp %p rchp %p schp %p\n", __func__, qhp, rchp, schp); + /* take a ref on the qhp since we must release the lock */ + atomic_inc(&qhp->refcnt); + spin_unlock(&qhp->lock); + + /* locking hierarchy: cq lock first, then qp lock. */ + spin_lock(&rchp->lock); + spin_lock(&qhp->lock); + cxio_flush_hw_cq(&rchp->cq); + cxio_count_rcqes(&rchp->cq, &qhp->wq, &count); + flushed = cxio_flush_rq(&qhp->wq, &rchp->cq, count); + spin_unlock(&qhp->lock); + spin_unlock(&rchp->lock); + if (flushed) { + spin_lock(&rchp->comp_handler_lock); + (*rchp->ibcq.comp_handler)(&rchp->ibcq, rchp->ibcq.cq_context); + spin_unlock(&rchp->comp_handler_lock); + } + + /* locking hierarchy: cq lock first, then qp lock. */ + spin_lock(&schp->lock); + spin_lock(&qhp->lock); + cxio_flush_hw_cq(&schp->cq); + cxio_count_scqes(&schp->cq, &qhp->wq, &count); + flushed = cxio_flush_sq(&qhp->wq, &schp->cq, count); + spin_unlock(&qhp->lock); + spin_unlock(&schp->lock); + if (flushed) { + spin_lock(&schp->comp_handler_lock); + (*schp->ibcq.comp_handler)(&schp->ibcq, schp->ibcq.cq_context); + spin_unlock(&schp->comp_handler_lock); + } + + /* deref */ + if (atomic_dec_and_test(&qhp->refcnt)) + wake_up(&qhp->wait); + + spin_lock(&qhp->lock); +} + +static void flush_qp(struct iwch_qp *qhp) +{ + struct iwch_cq *rchp, *schp; + + rchp = get_chp(qhp->rhp, qhp->attr.rcq); + schp = get_chp(qhp->rhp, qhp->attr.scq); + + if (qhp->ibqp.uobject) { + cxio_set_wq_in_error(&qhp->wq); + cxio_set_cq_in_error(&rchp->cq); + spin_lock(&rchp->comp_handler_lock); + (*rchp->ibcq.comp_handler)(&rchp->ibcq, rchp->ibcq.cq_context); + spin_unlock(&rchp->comp_handler_lock); + if (schp != rchp) { + cxio_set_cq_in_error(&schp->cq); + spin_lock(&schp->comp_handler_lock); + (*schp->ibcq.comp_handler)(&schp->ibcq, + schp->ibcq.cq_context); + spin_unlock(&schp->comp_handler_lock); + } + return; + } + __flush_qp(qhp, rchp, schp); +} + + +/* + * Return count of RECV WRs posted + */ +u16 iwch_rqes_posted(struct iwch_qp *qhp) +{ + union t3_wr *wqe = qhp->wq.queue; + u16 count = 0; + + while (count < USHRT_MAX && fw_riwrh_opcode((struct fw_riwrh *)wqe) == T3_WR_RCV) { + count++; + wqe++; + } + PDBG("%s qhp %p count %u\n", __func__, qhp, count); + return count; +} + +static int rdma_init(struct iwch_dev *rhp, struct iwch_qp *qhp, + enum iwch_qp_attr_mask mask, + struct iwch_qp_attributes *attrs) +{ + struct t3_rdma_init_attr init_attr; + int ret; + + init_attr.tid = qhp->ep->hwtid; + init_attr.qpid = qhp->wq.qpid; + init_attr.pdid = qhp->attr.pd; + init_attr.scqid = qhp->attr.scq; + init_attr.rcqid = qhp->attr.rcq; + init_attr.rq_addr = qhp->wq.rq_addr; + init_attr.rq_size = 1 << qhp->wq.rq_size_log2; + init_attr.mpaattrs = uP_RI_MPA_IETF_ENABLE | + qhp->attr.mpa_attr.recv_marker_enabled | + (qhp->attr.mpa_attr.xmit_marker_enabled << 1) | + (qhp->attr.mpa_attr.crc_enabled << 2); + + init_attr.qpcaps = uP_RI_QP_RDMA_READ_ENABLE | + uP_RI_QP_RDMA_WRITE_ENABLE | + uP_RI_QP_BIND_ENABLE; + if (!qhp->ibqp.uobject) + init_attr.qpcaps |= uP_RI_QP_STAG0_ENABLE | + uP_RI_QP_FAST_REGISTER_ENABLE; + + init_attr.tcp_emss = qhp->ep->emss; + init_attr.ord = qhp->attr.max_ord; + init_attr.ird = qhp->attr.max_ird; + init_attr.qp_dma_addr = qhp->wq.dma_addr; + init_attr.qp_dma_size = (1UL << qhp->wq.size_log2); + init_attr.rqe_count = iwch_rqes_posted(qhp); + init_attr.flags = qhp->attr.mpa_attr.initiator ? MPA_INITIATOR : 0; + init_attr.chan = qhp->ep->l2t->smt_idx; + if (peer2peer) { + init_attr.rtr_type = RTR_READ; + if (init_attr.ord == 0 && qhp->attr.mpa_attr.initiator) + init_attr.ord = 1; + if (init_attr.ird == 0 && !qhp->attr.mpa_attr.initiator) + init_attr.ird = 1; + } else + init_attr.rtr_type = 0; + init_attr.irs = qhp->ep->rcv_seq; + PDBG("%s init_attr.rq_addr 0x%x init_attr.rq_size = %d " + "flags 0x%x qpcaps 0x%x\n", __func__, + init_attr.rq_addr, init_attr.rq_size, + init_attr.flags, init_attr.qpcaps); + ret = cxio_rdma_init(&rhp->rdev, &init_attr); + PDBG("%s ret %d\n", __func__, ret); + return ret; +} + +int iwch_modify_qp(struct iwch_dev *rhp, struct iwch_qp *qhp, + enum iwch_qp_attr_mask mask, + struct iwch_qp_attributes *attrs, + int internal) +{ + int ret = 0; + struct iwch_qp_attributes newattr = qhp->attr; + unsigned long flag; + int disconnect = 0; + int terminate = 0; + int abort = 0; + int free = 0; + struct iwch_ep *ep = NULL; + + PDBG("%s qhp %p qpid 0x%x ep %p state %d -> %d\n", __func__, + qhp, qhp->wq.qpid, qhp->ep, qhp->attr.state, + (mask & IWCH_QP_ATTR_NEXT_STATE) ? attrs->next_state : -1); + + spin_lock_irqsave(&qhp->lock, flag); + + /* Process attr changes if in IDLE */ + if (mask & IWCH_QP_ATTR_VALID_MODIFY) { + if (qhp->attr.state != IWCH_QP_STATE_IDLE) { + ret = -EIO; + goto out; + } + if (mask & IWCH_QP_ATTR_ENABLE_RDMA_READ) + newattr.enable_rdma_read = attrs->enable_rdma_read; + if (mask & IWCH_QP_ATTR_ENABLE_RDMA_WRITE) + newattr.enable_rdma_write = attrs->enable_rdma_write; + if (mask & IWCH_QP_ATTR_ENABLE_RDMA_BIND) + newattr.enable_bind = attrs->enable_bind; + if (mask & IWCH_QP_ATTR_MAX_ORD) { + if (attrs->max_ord > + rhp->attr.max_rdma_read_qp_depth) { + ret = -EINVAL; + goto out; + } + newattr.max_ord = attrs->max_ord; + } + if (mask & IWCH_QP_ATTR_MAX_IRD) { + if (attrs->max_ird > + rhp->attr.max_rdma_reads_per_qp) { + ret = -EINVAL; + goto out; + } + newattr.max_ird = attrs->max_ird; + } + qhp->attr = newattr; + } + + if (!(mask & IWCH_QP_ATTR_NEXT_STATE)) + goto out; + if (qhp->attr.state == attrs->next_state) + goto out; + + switch (qhp->attr.state) { + case IWCH_QP_STATE_IDLE: + switch (attrs->next_state) { + case IWCH_QP_STATE_RTS: + if (!(mask & IWCH_QP_ATTR_LLP_STREAM_HANDLE)) { + ret = -EINVAL; + goto out; + } + if (!(mask & IWCH_QP_ATTR_MPA_ATTR)) { + ret = -EINVAL; + goto out; + } + qhp->attr.mpa_attr = attrs->mpa_attr; + qhp->attr.llp_stream_handle = attrs->llp_stream_handle; + qhp->ep = qhp->attr.llp_stream_handle; + qhp->attr.state = IWCH_QP_STATE_RTS; + + /* + * Ref the endpoint here and deref when we + * disassociate the endpoint from the QP. This + * happens in CLOSING->IDLE transition or *->ERROR + * transition. + */ + get_ep(&qhp->ep->com); + spin_unlock_irqrestore(&qhp->lock, flag); + ret = rdma_init(rhp, qhp, mask, attrs); + spin_lock_irqsave(&qhp->lock, flag); + if (ret) + goto err; + break; + case IWCH_QP_STATE_ERROR: + qhp->attr.state = IWCH_QP_STATE_ERROR; + flush_qp(qhp); + break; + default: + ret = -EINVAL; + goto out; + } + break; + case IWCH_QP_STATE_RTS: + switch (attrs->next_state) { + case IWCH_QP_STATE_CLOSING: + BUG_ON(atomic_read(&qhp->ep->com.kref.refcount) < 2); + qhp->attr.state = IWCH_QP_STATE_CLOSING; + if (!internal) { + abort=0; + disconnect = 1; + ep = qhp->ep; + get_ep(&ep->com); + } + break; + case IWCH_QP_STATE_TERMINATE: + qhp->attr.state = IWCH_QP_STATE_TERMINATE; + if (qhp->ibqp.uobject) + cxio_set_wq_in_error(&qhp->wq); + if (!internal) + terminate = 1; + break; + case IWCH_QP_STATE_ERROR: + qhp->attr.state = IWCH_QP_STATE_ERROR; + if (!internal) { + abort=1; + disconnect = 1; + ep = qhp->ep; + get_ep(&ep->com); + } + goto err; + break; + default: + ret = -EINVAL; + goto out; + } + break; + case IWCH_QP_STATE_CLOSING: + if (!internal) { + ret = -EINVAL; + goto out; + } + switch (attrs->next_state) { + case IWCH_QP_STATE_IDLE: + flush_qp(qhp); + qhp->attr.state = IWCH_QP_STATE_IDLE; + qhp->attr.llp_stream_handle = NULL; + put_ep(&qhp->ep->com); + qhp->ep = NULL; + wake_up(&qhp->wait); + break; + case IWCH_QP_STATE_ERROR: + goto err; + default: + ret = -EINVAL; + goto err; + } + break; + case IWCH_QP_STATE_ERROR: + if (attrs->next_state != IWCH_QP_STATE_IDLE) { + ret = -EINVAL; + goto out; + } + + if (!Q_EMPTY(qhp->wq.sq_rptr, qhp->wq.sq_wptr) || + !Q_EMPTY(qhp->wq.rq_rptr, qhp->wq.rq_wptr)) { + ret = -EINVAL; + goto out; + } + qhp->attr.state = IWCH_QP_STATE_IDLE; + break; + case IWCH_QP_STATE_TERMINATE: + if (!internal) { + ret = -EINVAL; + goto out; + } + goto err; + break; + default: + printk(KERN_ERR "%s in a bad state %d\n", + __func__, qhp->attr.state); + ret = -EINVAL; + goto err; + break; + } + goto out; +err: + PDBG("%s disassociating ep %p qpid 0x%x\n", __func__, qhp->ep, + qhp->wq.qpid); + + /* disassociate the LLP connection */ + qhp->attr.llp_stream_handle = NULL; + ep = qhp->ep; + qhp->ep = NULL; + qhp->attr.state = IWCH_QP_STATE_ERROR; + free=1; + wake_up(&qhp->wait); + BUG_ON(!ep); + flush_qp(qhp); +out: + spin_unlock_irqrestore(&qhp->lock, flag); + + if (terminate) + iwch_post_terminate(qhp, NULL); + + /* + * If disconnect is 1, then we need to initiate a disconnect + * on the EP. This can be a normal close (RTS->CLOSING) or + * an abnormal close (RTS/CLOSING->ERROR). + */ + if (disconnect) { + iwch_ep_disconnect(ep, abort, GFP_KERNEL); + put_ep(&ep->com); + } + + /* + * If free is 1, then we've disassociated the EP from the QP + * and we need to dereference the EP. + */ + if (free) + put_ep(&ep->com); + + PDBG("%s exit state %d\n", __func__, qhp->attr.state); + return ret; +} diff --git a/drivers/infiniband/hw/cxgb3/iwch_user.h b/drivers/infiniband/hw/cxgb3/iwch_user.h new file mode 100644 index 0000000..a277c31 --- /dev/null +++ b/drivers/infiniband/hw/cxgb3/iwch_user.h @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2006 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __IWCH_USER_H__ +#define __IWCH_USER_H__ + +#define IWCH_UVERBS_ABI_VERSION 1 + +/* + * Make sure that all structs defined in this file remain laid out so + * that they pack the same way on 32-bit and 64-bit architectures (to + * avoid incompatibility between 32-bit userspace and 64-bit kernels). + * In particular do not use pointer types -- pass pointers in __u64 + * instead. + */ +struct iwch_create_cq_req { + __u64 user_rptr_addr; +}; + +struct iwch_create_cq_resp_v0 { + __u64 key; + __u32 cqid; + __u32 size_log2; +}; + +struct iwch_create_cq_resp { + __u64 key; + __u32 cqid; + __u32 size_log2; + __u32 memsize; + __u32 reserved; +}; + +struct iwch_create_qp_resp { + __u64 key; + __u64 db_key; + __u32 qpid; + __u32 size_log2; + __u32 sq_size_log2; + __u32 rq_size_log2; +}; + +struct iwch_reg_user_mr_resp { + __u32 pbl_addr; +}; +#endif diff --git a/drivers/infiniband/hw/cxgb3/tcb.h b/drivers/infiniband/hw/cxgb3/tcb.h new file mode 100644 index 0000000..c702dc1 --- /dev/null +++ b/drivers/infiniband/hw/cxgb3/tcb.h @@ -0,0 +1,632 @@ +/* + * Copyright (c) 2007 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef _TCB_DEFS_H +#define _TCB_DEFS_H + +#define W_TCB_T_STATE 0 +#define S_TCB_T_STATE 0 +#define M_TCB_T_STATE 0xfULL +#define V_TCB_T_STATE(x) ((x) << S_TCB_T_STATE) + +#define W_TCB_TIMER 0 +#define S_TCB_TIMER 4 +#define M_TCB_TIMER 0x1ULL +#define V_TCB_TIMER(x) ((x) << S_TCB_TIMER) + +#define W_TCB_DACK_TIMER 0 +#define S_TCB_DACK_TIMER 5 +#define M_TCB_DACK_TIMER 0x1ULL +#define V_TCB_DACK_TIMER(x) ((x) << S_TCB_DACK_TIMER) + +#define W_TCB_DEL_FLAG 0 +#define S_TCB_DEL_FLAG 6 +#define M_TCB_DEL_FLAG 0x1ULL +#define V_TCB_DEL_FLAG(x) ((x) << S_TCB_DEL_FLAG) + +#define W_TCB_L2T_IX 0 +#define S_TCB_L2T_IX 7 +#define M_TCB_L2T_IX 0x7ffULL +#define V_TCB_L2T_IX(x) ((x) << S_TCB_L2T_IX) + +#define W_TCB_SMAC_SEL 0 +#define S_TCB_SMAC_SEL 18 +#define M_TCB_SMAC_SEL 0x3ULL +#define V_TCB_SMAC_SEL(x) ((x) << S_TCB_SMAC_SEL) + +#define W_TCB_TOS 0 +#define S_TCB_TOS 20 +#define M_TCB_TOS 0x3fULL +#define V_TCB_TOS(x) ((x) << S_TCB_TOS) + +#define W_TCB_MAX_RT 0 +#define S_TCB_MAX_RT 26 +#define M_TCB_MAX_RT 0xfULL +#define V_TCB_MAX_RT(x) ((x) << S_TCB_MAX_RT) + +#define W_TCB_T_RXTSHIFT 0 +#define S_TCB_T_RXTSHIFT 30 +#define M_TCB_T_RXTSHIFT 0xfULL +#define V_TCB_T_RXTSHIFT(x) ((x) << S_TCB_T_RXTSHIFT) + +#define W_TCB_T_DUPACKS 1 +#define S_TCB_T_DUPACKS 2 +#define M_TCB_T_DUPACKS 0xfULL +#define V_TCB_T_DUPACKS(x) ((x) << S_TCB_T_DUPACKS) + +#define W_TCB_T_MAXSEG 1 +#define S_TCB_T_MAXSEG 6 +#define M_TCB_T_MAXSEG 0xfULL +#define V_TCB_T_MAXSEG(x) ((x) << S_TCB_T_MAXSEG) + +#define W_TCB_T_FLAGS1 1 +#define S_TCB_T_FLAGS1 10 +#define M_TCB_T_FLAGS1 0xffffffffULL +#define V_TCB_T_FLAGS1(x) ((x) << S_TCB_T_FLAGS1) + +#define W_TCB_T_MIGRATION 1 +#define S_TCB_T_MIGRATION 20 +#define M_TCB_T_MIGRATION 0x1ULL +#define V_TCB_T_MIGRATION(x) ((x) << S_TCB_T_MIGRATION) + +#define W_TCB_T_FLAGS2 2 +#define S_TCB_T_FLAGS2 10 +#define M_TCB_T_FLAGS2 0x7fULL +#define V_TCB_T_FLAGS2(x) ((x) << S_TCB_T_FLAGS2) + +#define W_TCB_SND_SCALE 2 +#define S_TCB_SND_SCALE 17 +#define M_TCB_SND_SCALE 0xfULL +#define V_TCB_SND_SCALE(x) ((x) << S_TCB_SND_SCALE) + +#define W_TCB_RCV_SCALE 2 +#define S_TCB_RCV_SCALE 21 +#define M_TCB_RCV_SCALE 0xfULL +#define V_TCB_RCV_SCALE(x) ((x) << S_TCB_RCV_SCALE) + +#define W_TCB_SND_UNA_RAW 2 +#define S_TCB_SND_UNA_RAW 25 +#define M_TCB_SND_UNA_RAW 0x7ffffffULL +#define V_TCB_SND_UNA_RAW(x) ((x) << S_TCB_SND_UNA_RAW) + +#define W_TCB_SND_NXT_RAW 3 +#define S_TCB_SND_NXT_RAW 20 +#define M_TCB_SND_NXT_RAW 0x7ffffffULL +#define V_TCB_SND_NXT_RAW(x) ((x) << S_TCB_SND_NXT_RAW) + +#define W_TCB_RCV_NXT 4 +#define S_TCB_RCV_NXT 15 +#define M_TCB_RCV_NXT 0xffffffffULL +#define V_TCB_RCV_NXT(x) ((x) << S_TCB_RCV_NXT) + +#define W_TCB_RCV_ADV 5 +#define S_TCB_RCV_ADV 15 +#define M_TCB_RCV_ADV 0xffffULL +#define V_TCB_RCV_ADV(x) ((x) << S_TCB_RCV_ADV) + +#define W_TCB_SND_MAX_RAW 5 +#define S_TCB_SND_MAX_RAW 31 +#define M_TCB_SND_MAX_RAW 0x7ffffffULL +#define V_TCB_SND_MAX_RAW(x) ((x) << S_TCB_SND_MAX_RAW) + +#define W_TCB_SND_CWND 6 +#define S_TCB_SND_CWND 26 +#define M_TCB_SND_CWND 0x7ffffffULL +#define V_TCB_SND_CWND(x) ((x) << S_TCB_SND_CWND) + +#define W_TCB_SND_SSTHRESH 7 +#define S_TCB_SND_SSTHRESH 21 +#define M_TCB_SND_SSTHRESH 0x7ffffffULL +#define V_TCB_SND_SSTHRESH(x) ((x) << S_TCB_SND_SSTHRESH) + +#define W_TCB_T_RTT_TS_RECENT_AGE 8 +#define S_TCB_T_RTT_TS_RECENT_AGE 16 +#define M_TCB_T_RTT_TS_RECENT_AGE 0xffffffffULL +#define V_TCB_T_RTT_TS_RECENT_AGE(x) ((x) << S_TCB_T_RTT_TS_RECENT_AGE) + +#define W_TCB_T_RTSEQ_RECENT 9 +#define S_TCB_T_RTSEQ_RECENT 16 +#define M_TCB_T_RTSEQ_RECENT 0xffffffffULL +#define V_TCB_T_RTSEQ_RECENT(x) ((x) << S_TCB_T_RTSEQ_RECENT) + +#define W_TCB_T_SRTT 10 +#define S_TCB_T_SRTT 16 +#define M_TCB_T_SRTT 0xffffULL +#define V_TCB_T_SRTT(x) ((x) << S_TCB_T_SRTT) + +#define W_TCB_T_RTTVAR 11 +#define S_TCB_T_RTTVAR 0 +#define M_TCB_T_RTTVAR 0xffffULL +#define V_TCB_T_RTTVAR(x) ((x) << S_TCB_T_RTTVAR) + +#define W_TCB_TS_LAST_ACK_SENT_RAW 11 +#define S_TCB_TS_LAST_ACK_SENT_RAW 16 +#define M_TCB_TS_LAST_ACK_SENT_RAW 0x7ffffffULL +#define V_TCB_TS_LAST_ACK_SENT_RAW(x) ((x) << S_TCB_TS_LAST_ACK_SENT_RAW) + +#define W_TCB_DIP 12 +#define S_TCB_DIP 11 +#define M_TCB_DIP 0xffffffffULL +#define V_TCB_DIP(x) ((x) << S_TCB_DIP) + +#define W_TCB_SIP 13 +#define S_TCB_SIP 11 +#define M_TCB_SIP 0xffffffffULL +#define V_TCB_SIP(x) ((x) << S_TCB_SIP) + +#define W_TCB_DP 14 +#define S_TCB_DP 11 +#define M_TCB_DP 0xffffULL +#define V_TCB_DP(x) ((x) << S_TCB_DP) + +#define W_TCB_SP 14 +#define S_TCB_SP 27 +#define M_TCB_SP 0xffffULL +#define V_TCB_SP(x) ((x) << S_TCB_SP) + +#define W_TCB_TIMESTAMP 15 +#define S_TCB_TIMESTAMP 11 +#define M_TCB_TIMESTAMP 0xffffffffULL +#define V_TCB_TIMESTAMP(x) ((x) << S_TCB_TIMESTAMP) + +#define W_TCB_TIMESTAMP_OFFSET 16 +#define S_TCB_TIMESTAMP_OFFSET 11 +#define M_TCB_TIMESTAMP_OFFSET 0xfULL +#define V_TCB_TIMESTAMP_OFFSET(x) ((x) << S_TCB_TIMESTAMP_OFFSET) + +#define W_TCB_TX_MAX 16 +#define S_TCB_TX_MAX 15 +#define M_TCB_TX_MAX 0xffffffffULL +#define V_TCB_TX_MAX(x) ((x) << S_TCB_TX_MAX) + +#define W_TCB_TX_HDR_PTR_RAW 17 +#define S_TCB_TX_HDR_PTR_RAW 15 +#define M_TCB_TX_HDR_PTR_RAW 0x1ffffULL +#define V_TCB_TX_HDR_PTR_RAW(x) ((x) << S_TCB_TX_HDR_PTR_RAW) + +#define W_TCB_TX_LAST_PTR_RAW 18 +#define S_TCB_TX_LAST_PTR_RAW 0 +#define M_TCB_TX_LAST_PTR_RAW 0x1ffffULL +#define V_TCB_TX_LAST_PTR_RAW(x) ((x) << S_TCB_TX_LAST_PTR_RAW) + +#define W_TCB_TX_COMPACT 18 +#define S_TCB_TX_COMPACT 17 +#define M_TCB_TX_COMPACT 0x1ULL +#define V_TCB_TX_COMPACT(x) ((x) << S_TCB_TX_COMPACT) + +#define W_TCB_RX_COMPACT 18 +#define S_TCB_RX_COMPACT 18 +#define M_TCB_RX_COMPACT 0x1ULL +#define V_TCB_RX_COMPACT(x) ((x) << S_TCB_RX_COMPACT) + +#define W_TCB_RCV_WND 18 +#define S_TCB_RCV_WND 19 +#define M_TCB_RCV_WND 0x7ffffffULL +#define V_TCB_RCV_WND(x) ((x) << S_TCB_RCV_WND) + +#define W_TCB_RX_HDR_OFFSET 19 +#define S_TCB_RX_HDR_OFFSET 14 +#define M_TCB_RX_HDR_OFFSET 0x7ffffffULL +#define V_TCB_RX_HDR_OFFSET(x) ((x) << S_TCB_RX_HDR_OFFSET) + +#define W_TCB_RX_FRAG0_START_IDX_RAW 20 +#define S_TCB_RX_FRAG0_START_IDX_RAW 9 +#define M_TCB_RX_FRAG0_START_IDX_RAW 0x7ffffffULL +#define V_TCB_RX_FRAG0_START_IDX_RAW(x) ((x) << S_TCB_RX_FRAG0_START_IDX_RAW) + +#define W_TCB_RX_FRAG1_START_IDX_OFFSET 21 +#define S_TCB_RX_FRAG1_START_IDX_OFFSET 4 +#define M_TCB_RX_FRAG1_START_IDX_OFFSET 0x7ffffffULL +#define V_TCB_RX_FRAG1_START_IDX_OFFSET(x) ((x) << S_TCB_RX_FRAG1_START_IDX_OFFSET) + +#define W_TCB_RX_FRAG0_LEN 21 +#define S_TCB_RX_FRAG0_LEN 31 +#define M_TCB_RX_FRAG0_LEN 0x7ffffffULL +#define V_TCB_RX_FRAG0_LEN(x) ((x) << S_TCB_RX_FRAG0_LEN) + +#define W_TCB_RX_FRAG1_LEN 22 +#define S_TCB_RX_FRAG1_LEN 26 +#define M_TCB_RX_FRAG1_LEN 0x7ffffffULL +#define V_TCB_RX_FRAG1_LEN(x) ((x) << S_TCB_RX_FRAG1_LEN) + +#define W_TCB_NEWRENO_RECOVER 23 +#define S_TCB_NEWRENO_RECOVER 21 +#define M_TCB_NEWRENO_RECOVER 0x7ffffffULL +#define V_TCB_NEWRENO_RECOVER(x) ((x) << S_TCB_NEWRENO_RECOVER) + +#define W_TCB_PDU_HAVE_LEN 24 +#define S_TCB_PDU_HAVE_LEN 16 +#define M_TCB_PDU_HAVE_LEN 0x1ULL +#define V_TCB_PDU_HAVE_LEN(x) ((x) << S_TCB_PDU_HAVE_LEN) + +#define W_TCB_PDU_LEN 24 +#define S_TCB_PDU_LEN 17 +#define M_TCB_PDU_LEN 0xffffULL +#define V_TCB_PDU_LEN(x) ((x) << S_TCB_PDU_LEN) + +#define W_TCB_RX_QUIESCE 25 +#define S_TCB_RX_QUIESCE 1 +#define M_TCB_RX_QUIESCE 0x1ULL +#define V_TCB_RX_QUIESCE(x) ((x) << S_TCB_RX_QUIESCE) + +#define W_TCB_RX_PTR_RAW 25 +#define S_TCB_RX_PTR_RAW 2 +#define M_TCB_RX_PTR_RAW 0x1ffffULL +#define V_TCB_RX_PTR_RAW(x) ((x) << S_TCB_RX_PTR_RAW) + +#define W_TCB_CPU_NO 25 +#define S_TCB_CPU_NO 19 +#define M_TCB_CPU_NO 0x7fULL +#define V_TCB_CPU_NO(x) ((x) << S_TCB_CPU_NO) + +#define W_TCB_ULP_TYPE 25 +#define S_TCB_ULP_TYPE 26 +#define M_TCB_ULP_TYPE 0xfULL +#define V_TCB_ULP_TYPE(x) ((x) << S_TCB_ULP_TYPE) + +#define W_TCB_RX_FRAG1_PTR_RAW 25 +#define S_TCB_RX_FRAG1_PTR_RAW 30 +#define M_TCB_RX_FRAG1_PTR_RAW 0x1ffffULL +#define V_TCB_RX_FRAG1_PTR_RAW(x) ((x) << S_TCB_RX_FRAG1_PTR_RAW) + +#define W_TCB_RX_FRAG2_START_IDX_OFFSET_RAW 26 +#define S_TCB_RX_FRAG2_START_IDX_OFFSET_RAW 15 +#define M_TCB_RX_FRAG2_START_IDX_OFFSET_RAW 0x7ffffffULL +#define V_TCB_RX_FRAG2_START_IDX_OFFSET_RAW(x) ((x) << S_TCB_RX_FRAG2_START_IDX_OFFSET_RAW) + +#define W_TCB_RX_FRAG2_PTR_RAW 27 +#define S_TCB_RX_FRAG2_PTR_RAW 10 +#define M_TCB_RX_FRAG2_PTR_RAW 0x1ffffULL +#define V_TCB_RX_FRAG2_PTR_RAW(x) ((x) << S_TCB_RX_FRAG2_PTR_RAW) + +#define W_TCB_RX_FRAG2_LEN_RAW 27 +#define S_TCB_RX_FRAG2_LEN_RAW 27 +#define M_TCB_RX_FRAG2_LEN_RAW 0x7ffffffULL +#define V_TCB_RX_FRAG2_LEN_RAW(x) ((x) << S_TCB_RX_FRAG2_LEN_RAW) + +#define W_TCB_RX_FRAG3_PTR_RAW 28 +#define S_TCB_RX_FRAG3_PTR_RAW 22 +#define M_TCB_RX_FRAG3_PTR_RAW 0x1ffffULL +#define V_TCB_RX_FRAG3_PTR_RAW(x) ((x) << S_TCB_RX_FRAG3_PTR_RAW) + +#define W_TCB_RX_FRAG3_LEN_RAW 29 +#define S_TCB_RX_FRAG3_LEN_RAW 7 +#define M_TCB_RX_FRAG3_LEN_RAW 0x7ffffffULL +#define V_TCB_RX_FRAG3_LEN_RAW(x) ((x) << S_TCB_RX_FRAG3_LEN_RAW) + +#define W_TCB_RX_FRAG3_START_IDX_OFFSET_RAW 30 +#define S_TCB_RX_FRAG3_START_IDX_OFFSET_RAW 2 +#define M_TCB_RX_FRAG3_START_IDX_OFFSET_RAW 0x7ffffffULL +#define V_TCB_RX_FRAG3_START_IDX_OFFSET_RAW(x) ((x) << S_TCB_RX_FRAG3_START_IDX_OFFSET_RAW) + +#define W_TCB_PDU_HDR_LEN 30 +#define S_TCB_PDU_HDR_LEN 29 +#define M_TCB_PDU_HDR_LEN 0xffULL +#define V_TCB_PDU_HDR_LEN(x) ((x) << S_TCB_PDU_HDR_LEN) + +#define W_TCB_SLUSH1 31 +#define S_TCB_SLUSH1 5 +#define M_TCB_SLUSH1 0x7ffffULL +#define V_TCB_SLUSH1(x) ((x) << S_TCB_SLUSH1) + +#define W_TCB_ULP_RAW 31 +#define S_TCB_ULP_RAW 24 +#define M_TCB_ULP_RAW 0xffULL +#define V_TCB_ULP_RAW(x) ((x) << S_TCB_ULP_RAW) + +#define W_TCB_DDP_RDMAP_VERSION 25 +#define S_TCB_DDP_RDMAP_VERSION 30 +#define M_TCB_DDP_RDMAP_VERSION 0x1ULL +#define V_TCB_DDP_RDMAP_VERSION(x) ((x) << S_TCB_DDP_RDMAP_VERSION) + +#define W_TCB_MARKER_ENABLE_RX 25 +#define S_TCB_MARKER_ENABLE_RX 31 +#define M_TCB_MARKER_ENABLE_RX 0x1ULL +#define V_TCB_MARKER_ENABLE_RX(x) ((x) << S_TCB_MARKER_ENABLE_RX) + +#define W_TCB_MARKER_ENABLE_TX 26 +#define S_TCB_MARKER_ENABLE_TX 0 +#define M_TCB_MARKER_ENABLE_TX 0x1ULL +#define V_TCB_MARKER_ENABLE_TX(x) ((x) << S_TCB_MARKER_ENABLE_TX) + +#define W_TCB_CRC_ENABLE 26 +#define S_TCB_CRC_ENABLE 1 +#define M_TCB_CRC_ENABLE 0x1ULL +#define V_TCB_CRC_ENABLE(x) ((x) << S_TCB_CRC_ENABLE) + +#define W_TCB_IRS_ULP 26 +#define S_TCB_IRS_ULP 2 +#define M_TCB_IRS_ULP 0x1ffULL +#define V_TCB_IRS_ULP(x) ((x) << S_TCB_IRS_ULP) + +#define W_TCB_ISS_ULP 26 +#define S_TCB_ISS_ULP 11 +#define M_TCB_ISS_ULP 0x1ffULL +#define V_TCB_ISS_ULP(x) ((x) << S_TCB_ISS_ULP) + +#define W_TCB_TX_PDU_LEN 26 +#define S_TCB_TX_PDU_LEN 20 +#define M_TCB_TX_PDU_LEN 0x3fffULL +#define V_TCB_TX_PDU_LEN(x) ((x) << S_TCB_TX_PDU_LEN) + +#define W_TCB_TX_PDU_OUT 27 +#define S_TCB_TX_PDU_OUT 2 +#define M_TCB_TX_PDU_OUT 0x1ULL +#define V_TCB_TX_PDU_OUT(x) ((x) << S_TCB_TX_PDU_OUT) + +#define W_TCB_CQ_IDX_SQ 27 +#define S_TCB_CQ_IDX_SQ 3 +#define M_TCB_CQ_IDX_SQ 0xffffULL +#define V_TCB_CQ_IDX_SQ(x) ((x) << S_TCB_CQ_IDX_SQ) + +#define W_TCB_CQ_IDX_RQ 27 +#define S_TCB_CQ_IDX_RQ 19 +#define M_TCB_CQ_IDX_RQ 0xffffULL +#define V_TCB_CQ_IDX_RQ(x) ((x) << S_TCB_CQ_IDX_RQ) + +#define W_TCB_QP_ID 28 +#define S_TCB_QP_ID 3 +#define M_TCB_QP_ID 0xffffULL +#define V_TCB_QP_ID(x) ((x) << S_TCB_QP_ID) + +#define W_TCB_PD_ID 28 +#define S_TCB_PD_ID 19 +#define M_TCB_PD_ID 0xffffULL +#define V_TCB_PD_ID(x) ((x) << S_TCB_PD_ID) + +#define W_TCB_STAG 29 +#define S_TCB_STAG 3 +#define M_TCB_STAG 0xffffffffULL +#define V_TCB_STAG(x) ((x) << S_TCB_STAG) + +#define W_TCB_RQ_START 30 +#define S_TCB_RQ_START 3 +#define M_TCB_RQ_START 0x3ffffffULL +#define V_TCB_RQ_START(x) ((x) << S_TCB_RQ_START) + +#define W_TCB_RQ_MSN 30 +#define S_TCB_RQ_MSN 29 +#define M_TCB_RQ_MSN 0x3ffULL +#define V_TCB_RQ_MSN(x) ((x) << S_TCB_RQ_MSN) + +#define W_TCB_RQ_MAX_OFFSET 31 +#define S_TCB_RQ_MAX_OFFSET 7 +#define M_TCB_RQ_MAX_OFFSET 0xfULL +#define V_TCB_RQ_MAX_OFFSET(x) ((x) << S_TCB_RQ_MAX_OFFSET) + +#define W_TCB_RQ_WRITE_PTR 31 +#define S_TCB_RQ_WRITE_PTR 11 +#define M_TCB_RQ_WRITE_PTR 0x3ffULL +#define V_TCB_RQ_WRITE_PTR(x) ((x) << S_TCB_RQ_WRITE_PTR) + +#define W_TCB_INB_WRITE_PERM 31 +#define S_TCB_INB_WRITE_PERM 21 +#define M_TCB_INB_WRITE_PERM 0x1ULL +#define V_TCB_INB_WRITE_PERM(x) ((x) << S_TCB_INB_WRITE_PERM) + +#define W_TCB_INB_READ_PERM 31 +#define S_TCB_INB_READ_PERM 22 +#define M_TCB_INB_READ_PERM 0x1ULL +#define V_TCB_INB_READ_PERM(x) ((x) << S_TCB_INB_READ_PERM) + +#define W_TCB_ORD_L_BIT_VLD 31 +#define S_TCB_ORD_L_BIT_VLD 23 +#define M_TCB_ORD_L_BIT_VLD 0x1ULL +#define V_TCB_ORD_L_BIT_VLD(x) ((x) << S_TCB_ORD_L_BIT_VLD) + +#define W_TCB_RDMAP_OPCODE 31 +#define S_TCB_RDMAP_OPCODE 24 +#define M_TCB_RDMAP_OPCODE 0xfULL +#define V_TCB_RDMAP_OPCODE(x) ((x) << S_TCB_RDMAP_OPCODE) + +#define W_TCB_TX_FLUSH 31 +#define S_TCB_TX_FLUSH 28 +#define M_TCB_TX_FLUSH 0x1ULL +#define V_TCB_TX_FLUSH(x) ((x) << S_TCB_TX_FLUSH) + +#define W_TCB_TX_OOS_RXMT 31 +#define S_TCB_TX_OOS_RXMT 29 +#define M_TCB_TX_OOS_RXMT 0x1ULL +#define V_TCB_TX_OOS_RXMT(x) ((x) << S_TCB_TX_OOS_RXMT) + +#define W_TCB_TX_OOS_TXMT 31 +#define S_TCB_TX_OOS_TXMT 30 +#define M_TCB_TX_OOS_TXMT 0x1ULL +#define V_TCB_TX_OOS_TXMT(x) ((x) << S_TCB_TX_OOS_TXMT) + +#define W_TCB_SLUSH_AUX2 31 +#define S_TCB_SLUSH_AUX2 31 +#define M_TCB_SLUSH_AUX2 0x1ULL +#define V_TCB_SLUSH_AUX2(x) ((x) << S_TCB_SLUSH_AUX2) + +#define W_TCB_RX_FRAG1_PTR_RAW2 25 +#define S_TCB_RX_FRAG1_PTR_RAW2 30 +#define M_TCB_RX_FRAG1_PTR_RAW2 0x1ffffULL +#define V_TCB_RX_FRAG1_PTR_RAW2(x) ((x) << S_TCB_RX_FRAG1_PTR_RAW2) + +#define W_TCB_RX_DDP_FLAGS 26 +#define S_TCB_RX_DDP_FLAGS 15 +#define M_TCB_RX_DDP_FLAGS 0x3ffULL +#define V_TCB_RX_DDP_FLAGS(x) ((x) << S_TCB_RX_DDP_FLAGS) + +#define W_TCB_SLUSH_AUX3 26 +#define S_TCB_SLUSH_AUX3 31 +#define M_TCB_SLUSH_AUX3 0x1ffULL +#define V_TCB_SLUSH_AUX3(x) ((x) << S_TCB_SLUSH_AUX3) + +#define W_TCB_RX_DDP_BUF0_OFFSET 27 +#define S_TCB_RX_DDP_BUF0_OFFSET 8 +#define M_TCB_RX_DDP_BUF0_OFFSET 0x3fffffULL +#define V_TCB_RX_DDP_BUF0_OFFSET(x) ((x) << S_TCB_RX_DDP_BUF0_OFFSET) + +#define W_TCB_RX_DDP_BUF0_LEN 27 +#define S_TCB_RX_DDP_BUF0_LEN 30 +#define M_TCB_RX_DDP_BUF0_LEN 0x3fffffULL +#define V_TCB_RX_DDP_BUF0_LEN(x) ((x) << S_TCB_RX_DDP_BUF0_LEN) + +#define W_TCB_RX_DDP_BUF1_OFFSET 28 +#define S_TCB_RX_DDP_BUF1_OFFSET 20 +#define M_TCB_RX_DDP_BUF1_OFFSET 0x3fffffULL +#define V_TCB_RX_DDP_BUF1_OFFSET(x) ((x) << S_TCB_RX_DDP_BUF1_OFFSET) + +#define W_TCB_RX_DDP_BUF1_LEN 29 +#define S_TCB_RX_DDP_BUF1_LEN 10 +#define M_TCB_RX_DDP_BUF1_LEN 0x3fffffULL +#define V_TCB_RX_DDP_BUF1_LEN(x) ((x) << S_TCB_RX_DDP_BUF1_LEN) + +#define W_TCB_RX_DDP_BUF0_TAG 30 +#define S_TCB_RX_DDP_BUF0_TAG 0 +#define M_TCB_RX_DDP_BUF0_TAG 0xffffffffULL +#define V_TCB_RX_DDP_BUF0_TAG(x) ((x) << S_TCB_RX_DDP_BUF0_TAG) + +#define W_TCB_RX_DDP_BUF1_TAG 31 +#define S_TCB_RX_DDP_BUF1_TAG 0 +#define M_TCB_RX_DDP_BUF1_TAG 0xffffffffULL +#define V_TCB_RX_DDP_BUF1_TAG(x) ((x) << S_TCB_RX_DDP_BUF1_TAG) + +#define S_TF_DACK 10 +#define V_TF_DACK(x) ((x) << S_TF_DACK) + +#define S_TF_NAGLE 11 +#define V_TF_NAGLE(x) ((x) << S_TF_NAGLE) + +#define S_TF_RECV_SCALE 12 +#define V_TF_RECV_SCALE(x) ((x) << S_TF_RECV_SCALE) + +#define S_TF_RECV_TSTMP 13 +#define V_TF_RECV_TSTMP(x) ((x) << S_TF_RECV_TSTMP) + +#define S_TF_RECV_SACK 14 +#define V_TF_RECV_SACK(x) ((x) << S_TF_RECV_SACK) + +#define S_TF_TURBO 15 +#define V_TF_TURBO(x) ((x) << S_TF_TURBO) + +#define S_TF_KEEPALIVE 16 +#define V_TF_KEEPALIVE(x) ((x) << S_TF_KEEPALIVE) + +#define S_TF_TCAM_BYPASS 17 +#define V_TF_TCAM_BYPASS(x) ((x) << S_TF_TCAM_BYPASS) + +#define S_TF_CORE_FIN 18 +#define V_TF_CORE_FIN(x) ((x) << S_TF_CORE_FIN) + +#define S_TF_CORE_MORE 19 +#define V_TF_CORE_MORE(x) ((x) << S_TF_CORE_MORE) + +#define S_TF_MIGRATING 20 +#define V_TF_MIGRATING(x) ((x) << S_TF_MIGRATING) + +#define S_TF_ACTIVE_OPEN 21 +#define V_TF_ACTIVE_OPEN(x) ((x) << S_TF_ACTIVE_OPEN) + +#define S_TF_ASK_MODE 22 +#define V_TF_ASK_MODE(x) ((x) << S_TF_ASK_MODE) + +#define S_TF_NON_OFFLOAD 23 +#define V_TF_NON_OFFLOAD(x) ((x) << S_TF_NON_OFFLOAD) + +#define S_TF_MOD_SCHD 24 +#define V_TF_MOD_SCHD(x) ((x) << S_TF_MOD_SCHD) + +#define S_TF_MOD_SCHD_REASON0 25 +#define V_TF_MOD_SCHD_REASON0(x) ((x) << S_TF_MOD_SCHD_REASON0) + +#define S_TF_MOD_SCHD_REASON1 26 +#define V_TF_MOD_SCHD_REASON1(x) ((x) << S_TF_MOD_SCHD_REASON1) + +#define S_TF_MOD_SCHD_RX 27 +#define V_TF_MOD_SCHD_RX(x) ((x) << S_TF_MOD_SCHD_RX) + +#define S_TF_CORE_PUSH 28 +#define V_TF_CORE_PUSH(x) ((x) << S_TF_CORE_PUSH) + +#define S_TF_RCV_COALESCE_ENABLE 29 +#define V_TF_RCV_COALESCE_ENABLE(x) ((x) << S_TF_RCV_COALESCE_ENABLE) + +#define S_TF_RCV_COALESCE_PUSH 30 +#define V_TF_RCV_COALESCE_PUSH(x) ((x) << S_TF_RCV_COALESCE_PUSH) + +#define S_TF_RCV_COALESCE_LAST_PSH 31 +#define V_TF_RCV_COALESCE_LAST_PSH(x) ((x) << S_TF_RCV_COALESCE_LAST_PSH) + +#define S_TF_RCV_COALESCE_HEARTBEAT 32 +#define V_TF_RCV_COALESCE_HEARTBEAT(x) ((x) << S_TF_RCV_COALESCE_HEARTBEAT) + +#define S_TF_HALF_CLOSE 33 +#define V_TF_HALF_CLOSE(x) ((x) << S_TF_HALF_CLOSE) + +#define S_TF_DACK_MSS 34 +#define V_TF_DACK_MSS(x) ((x) << S_TF_DACK_MSS) + +#define S_TF_CCTRL_SEL0 35 +#define V_TF_CCTRL_SEL0(x) ((x) << S_TF_CCTRL_SEL0) + +#define S_TF_CCTRL_SEL1 36 +#define V_TF_CCTRL_SEL1(x) ((x) << S_TF_CCTRL_SEL1) + +#define S_TF_TCP_NEWRENO_FAST_RECOVERY 37 +#define V_TF_TCP_NEWRENO_FAST_RECOVERY(x) ((x) << S_TF_TCP_NEWRENO_FAST_RECOVERY) + +#define S_TF_TX_PACE_AUTO 38 +#define V_TF_TX_PACE_AUTO(x) ((x) << S_TF_TX_PACE_AUTO) + +#define S_TF_PEER_FIN_HELD 39 +#define V_TF_PEER_FIN_HELD(x) ((x) << S_TF_PEER_FIN_HELD) + +#define S_TF_CORE_URG 40 +#define V_TF_CORE_URG(x) ((x) << S_TF_CORE_URG) + +#define S_TF_RDMA_ERROR 41 +#define V_TF_RDMA_ERROR(x) ((x) << S_TF_RDMA_ERROR) + +#define S_TF_SSWS_DISABLED 42 +#define V_TF_SSWS_DISABLED(x) ((x) << S_TF_SSWS_DISABLED) + +#define S_TF_DUPACK_COUNT_ODD 43 +#define V_TF_DUPACK_COUNT_ODD(x) ((x) << S_TF_DUPACK_COUNT_ODD) + +#define S_TF_TX_CHANNEL 44 +#define V_TF_TX_CHANNEL(x) ((x) << S_TF_TX_CHANNEL) + +#define S_TF_RX_CHANNEL 45 +#define V_TF_RX_CHANNEL(x) ((x) << S_TF_RX_CHANNEL) + +#define S_TF_TX_PACE_FIXED 46 +#define V_TF_TX_PACE_FIXED(x) ((x) << S_TF_TX_PACE_FIXED) + +#define S_TF_RDMA_FLM_ERROR 47 +#define V_TF_RDMA_FLM_ERROR(x) ((x) << S_TF_RDMA_FLM_ERROR) + +#define S_TF_RX_FLOW_CONTROL_DISABLE 48 +#define V_TF_RX_FLOW_CONTROL_DISABLE(x) ((x) << S_TF_RX_FLOW_CONTROL_DISABLE) + +#endif /* _TCB_DEFS_H */ diff --git a/drivers/infiniband/hw/cxgb4/Kconfig b/drivers/infiniband/hw/cxgb4/Kconfig new file mode 100644 index 0000000..23f38cf --- /dev/null +++ b/drivers/infiniband/hw/cxgb4/Kconfig @@ -0,0 +1,18 @@ +config INFINIBAND_CXGB4 + tristate "Chelsio T4/T5 RDMA Driver" + depends on CHELSIO_T4 && INET && (IPV6 || IPV6=n) + select GENERIC_ALLOCATOR + ---help--- + This is an iWARP/RDMA driver for the Chelsio T4 and T5 + 1GbE, 10GbE adapters and T5 40GbE adapter. + + For general information about Chelsio and our products, visit + our website at . + + For customer support, please visit our customer support page at + . + + Please send feedback to . + + To compile this driver as a module, choose M here: the module + will be called iw_cxgb4. diff --git a/drivers/infiniband/hw/cxgb4/Makefile b/drivers/infiniband/hw/cxgb4/Makefile new file mode 100644 index 0000000..e11cf72 --- /dev/null +++ b/drivers/infiniband/hw/cxgb4/Makefile @@ -0,0 +1,5 @@ +ccflags-y := -Idrivers/net/ethernet/chelsio/cxgb4 + +obj-$(CONFIG_INFINIBAND_CXGB4) += iw_cxgb4.o + +iw_cxgb4-y := device.o cm.o provider.o mem.o cq.o qp.o resource.o ev.o id_table.o diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c new file mode 100644 index 0000000..fb61f66 --- /dev/null +++ b/drivers/infiniband/hw/cxgb4/cm.c @@ -0,0 +1,3993 @@ +/* + * Copyright (c) 2009-2014 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include + +#include "iw_cxgb4.h" + +static char *states[] = { + "idle", + "listen", + "connecting", + "mpa_wait_req", + "mpa_req_sent", + "mpa_req_rcvd", + "mpa_rep_sent", + "fpdu_mode", + "aborting", + "closing", + "moribund", + "dead", + NULL, +}; + +static int nocong; +module_param(nocong, int, 0644); +MODULE_PARM_DESC(nocong, "Turn of congestion control (default=0)"); + +static int enable_ecn; +module_param(enable_ecn, int, 0644); +MODULE_PARM_DESC(enable_ecn, "Enable ECN (default=0/disabled)"); + +static int dack_mode = 1; +module_param(dack_mode, int, 0644); +MODULE_PARM_DESC(dack_mode, "Delayed ack mode (default=1)"); + +uint c4iw_max_read_depth = 32; +module_param(c4iw_max_read_depth, int, 0644); +MODULE_PARM_DESC(c4iw_max_read_depth, + "Per-connection max ORD/IRD (default=32)"); + +static int enable_tcp_timestamps; +module_param(enable_tcp_timestamps, int, 0644); +MODULE_PARM_DESC(enable_tcp_timestamps, "Enable tcp timestamps (default=0)"); + +static int enable_tcp_sack; +module_param(enable_tcp_sack, int, 0644); +MODULE_PARM_DESC(enable_tcp_sack, "Enable tcp SACK (default=0)"); + +static int enable_tcp_window_scaling = 1; +module_param(enable_tcp_window_scaling, int, 0644); +MODULE_PARM_DESC(enable_tcp_window_scaling, + "Enable tcp window scaling (default=1)"); + +int c4iw_debug; +module_param(c4iw_debug, int, 0644); +MODULE_PARM_DESC(c4iw_debug, "Enable debug logging (default=0)"); + +static int peer2peer = 1; +module_param(peer2peer, int, 0644); +MODULE_PARM_DESC(peer2peer, "Support peer2peer ULPs (default=1)"); + +static int p2p_type = FW_RI_INIT_P2PTYPE_READ_REQ; +module_param(p2p_type, int, 0644); +MODULE_PARM_DESC(p2p_type, "RDMAP opcode to use for the RTR message: " + "1=RDMA_READ 0=RDMA_WRITE (default 1)"); + +static int ep_timeout_secs = 60; +module_param(ep_timeout_secs, int, 0644); +MODULE_PARM_DESC(ep_timeout_secs, "CM Endpoint operation timeout " + "in seconds (default=60)"); + +static int mpa_rev = 1; +module_param(mpa_rev, int, 0644); +MODULE_PARM_DESC(mpa_rev, "MPA Revision, 0 supports amso1100, " + "1 is RFC0544 spec compliant, 2 is IETF MPA Peer Connect Draft" + " compliant (default=1)"); + +static int markers_enabled; +module_param(markers_enabled, int, 0644); +MODULE_PARM_DESC(markers_enabled, "Enable MPA MARKERS (default(0)=disabled)"); + +static int crc_enabled = 1; +module_param(crc_enabled, int, 0644); +MODULE_PARM_DESC(crc_enabled, "Enable MPA CRC (default(1)=enabled)"); + +static int rcv_win = 256 * 1024; +module_param(rcv_win, int, 0644); +MODULE_PARM_DESC(rcv_win, "TCP receive window in bytes (default=256KB)"); + +static int snd_win = 128 * 1024; +module_param(snd_win, int, 0644); +MODULE_PARM_DESC(snd_win, "TCP send window in bytes (default=128KB)"); + +static struct workqueue_struct *workq; + +static struct sk_buff_head rxq; + +static struct sk_buff *get_skb(struct sk_buff *skb, int len, gfp_t gfp); +static void ep_timeout(unsigned long arg); +static void connect_reply_upcall(struct c4iw_ep *ep, int status); + +static LIST_HEAD(timeout_list); +static spinlock_t timeout_lock; + +static void deref_qp(struct c4iw_ep *ep) +{ + c4iw_qp_rem_ref(&ep->com.qp->ibqp); + clear_bit(QP_REFERENCED, &ep->com.flags); +} + +static void ref_qp(struct c4iw_ep *ep) +{ + set_bit(QP_REFERENCED, &ep->com.flags); + c4iw_qp_add_ref(&ep->com.qp->ibqp); +} + +static void start_ep_timer(struct c4iw_ep *ep) +{ + PDBG("%s ep %p\n", __func__, ep); + if (timer_pending(&ep->timer)) { + pr_err("%s timer already started! ep %p\n", + __func__, ep); + return; + } + clear_bit(TIMEOUT, &ep->com.flags); + c4iw_get_ep(&ep->com); + ep->timer.expires = jiffies + ep_timeout_secs * HZ; + ep->timer.data = (unsigned long)ep; + ep->timer.function = ep_timeout; + add_timer(&ep->timer); +} + +static int stop_ep_timer(struct c4iw_ep *ep) +{ + PDBG("%s ep %p stopping\n", __func__, ep); + del_timer_sync(&ep->timer); + if (!test_and_set_bit(TIMEOUT, &ep->com.flags)) { + c4iw_put_ep(&ep->com); + return 0; + } + return 1; +} + +static int c4iw_l2t_send(struct c4iw_rdev *rdev, struct sk_buff *skb, + struct l2t_entry *l2e) +{ + int error = 0; + + if (c4iw_fatal_error(rdev)) { + kfree_skb(skb); + PDBG("%s - device in error state - dropping\n", __func__); + return -EIO; + } + error = cxgb4_l2t_send(rdev->lldi.ports[0], skb, l2e); + if (error < 0) + kfree_skb(skb); + return error < 0 ? error : 0; +} + +int c4iw_ofld_send(struct c4iw_rdev *rdev, struct sk_buff *skb) +{ + int error = 0; + + if (c4iw_fatal_error(rdev)) { + kfree_skb(skb); + PDBG("%s - device in error state - dropping\n", __func__); + return -EIO; + } + error = cxgb4_ofld_send(rdev->lldi.ports[0], skb); + if (error < 0) + kfree_skb(skb); + return error < 0 ? error : 0; +} + +static void release_tid(struct c4iw_rdev *rdev, u32 hwtid, struct sk_buff *skb) +{ + struct cpl_tid_release *req; + + skb = get_skb(skb, sizeof *req, GFP_KERNEL); + if (!skb) + return; + req = (struct cpl_tid_release *) skb_put(skb, sizeof(*req)); + INIT_TP_WR(req, hwtid); + OPCODE_TID(req) = cpu_to_be32(MK_OPCODE_TID(CPL_TID_RELEASE, hwtid)); + set_wr_txq(skb, CPL_PRIORITY_SETUP, 0); + c4iw_ofld_send(rdev, skb); + return; +} + +static void set_emss(struct c4iw_ep *ep, u16 opt) +{ + ep->emss = ep->com.dev->rdev.lldi.mtus[GET_TCPOPT_MSS(opt)] - + ((AF_INET == ep->com.remote_addr.ss_family) ? + sizeof(struct iphdr) : sizeof(struct ipv6hdr)) - + sizeof(struct tcphdr); + ep->mss = ep->emss; + if (GET_TCPOPT_TSTAMP(opt)) + ep->emss -= round_up(TCPOLEN_TIMESTAMP, 4); + if (ep->emss < 128) + ep->emss = 128; + if (ep->emss & 7) + PDBG("Warning: misaligned mtu idx %u mss %u emss=%u\n", + GET_TCPOPT_MSS(opt), ep->mss, ep->emss); + PDBG("%s mss_idx %u mss %u emss=%u\n", __func__, GET_TCPOPT_MSS(opt), + ep->mss, ep->emss); +} + +static enum c4iw_ep_state state_read(struct c4iw_ep_common *epc) +{ + enum c4iw_ep_state state; + + mutex_lock(&epc->mutex); + state = epc->state; + mutex_unlock(&epc->mutex); + return state; +} + +static void __state_set(struct c4iw_ep_common *epc, enum c4iw_ep_state new) +{ + epc->state = new; +} + +static void state_set(struct c4iw_ep_common *epc, enum c4iw_ep_state new) +{ + mutex_lock(&epc->mutex); + PDBG("%s - %s -> %s\n", __func__, states[epc->state], states[new]); + __state_set(epc, new); + mutex_unlock(&epc->mutex); + return; +} + +static void *alloc_ep(int size, gfp_t gfp) +{ + struct c4iw_ep_common *epc; + + epc = kzalloc(size, gfp); + if (epc) { + kref_init(&epc->kref); + mutex_init(&epc->mutex); + c4iw_init_wr_wait(&epc->wr_wait); + } + PDBG("%s alloc ep %p\n", __func__, epc); + return epc; +} + +void _c4iw_free_ep(struct kref *kref) +{ + struct c4iw_ep *ep; + + ep = container_of(kref, struct c4iw_ep, com.kref); + PDBG("%s ep %p state %s\n", __func__, ep, states[state_read(&ep->com)]); + if (test_bit(QP_REFERENCED, &ep->com.flags)) + deref_qp(ep); + if (test_bit(RELEASE_RESOURCES, &ep->com.flags)) { + remove_handle(ep->com.dev, &ep->com.dev->hwtid_idr, ep->hwtid); + cxgb4_remove_tid(ep->com.dev->rdev.lldi.tids, 0, ep->hwtid); + dst_release(ep->dst); + cxgb4_l2t_release(ep->l2t); + } + if (test_bit(RELEASE_MAPINFO, &ep->com.flags)) { + print_addr(&ep->com, __func__, "remove_mapinfo/mapping"); + iwpm_remove_mapinfo(&ep->com.local_addr, + &ep->com.mapped_local_addr); + iwpm_remove_mapping(&ep->com.local_addr, RDMA_NL_C4IW); + } + kfree(ep); +} + +static void release_ep_resources(struct c4iw_ep *ep) +{ + set_bit(RELEASE_RESOURCES, &ep->com.flags); + c4iw_put_ep(&ep->com); +} + +static int status2errno(int status) +{ + switch (status) { + case CPL_ERR_NONE: + return 0; + case CPL_ERR_CONN_RESET: + return -ECONNRESET; + case CPL_ERR_ARP_MISS: + return -EHOSTUNREACH; + case CPL_ERR_CONN_TIMEDOUT: + return -ETIMEDOUT; + case CPL_ERR_TCAM_FULL: + return -ENOMEM; + case CPL_ERR_CONN_EXIST: + return -EADDRINUSE; + default: + return -EIO; + } +} + +/* + * Try and reuse skbs already allocated... + */ +static struct sk_buff *get_skb(struct sk_buff *skb, int len, gfp_t gfp) +{ + if (skb && !skb_is_nonlinear(skb) && !skb_cloned(skb)) { + skb_trim(skb, 0); + skb_get(skb); + skb_reset_transport_header(skb); + } else { + skb = alloc_skb(len, gfp); + } + t4_set_arp_err_handler(skb, NULL, NULL); + return skb; +} + +static struct net_device *get_real_dev(struct net_device *egress_dev) +{ + return rdma_vlan_dev_real_dev(egress_dev) ? : egress_dev; +} + +static int our_interface(struct c4iw_dev *dev, struct net_device *egress_dev) +{ + int i; + + egress_dev = get_real_dev(egress_dev); + for (i = 0; i < dev->rdev.lldi.nports; i++) + if (dev->rdev.lldi.ports[i] == egress_dev) + return 1; + return 0; +} + +static struct dst_entry *find_route6(struct c4iw_dev *dev, __u8 *local_ip, + __u8 *peer_ip, __be16 local_port, + __be16 peer_port, u8 tos, + __u32 sin6_scope_id) +{ + struct dst_entry *dst = NULL; + + if (IS_ENABLED(CONFIG_IPV6)) { + struct flowi6 fl6; + + memset(&fl6, 0, sizeof(fl6)); + memcpy(&fl6.daddr, peer_ip, 16); + memcpy(&fl6.saddr, local_ip, 16); + if (ipv6_addr_type(&fl6.daddr) & IPV6_ADDR_LINKLOCAL) + fl6.flowi6_oif = sin6_scope_id; + dst = ip6_route_output(&init_net, NULL, &fl6); + if (!dst) + goto out; + if (!our_interface(dev, ip6_dst_idev(dst)->dev) && + !(ip6_dst_idev(dst)->dev->flags & IFF_LOOPBACK)) { + dst_release(dst); + dst = NULL; + } + } + +out: + return dst; +} + +static struct dst_entry *find_route(struct c4iw_dev *dev, __be32 local_ip, + __be32 peer_ip, __be16 local_port, + __be16 peer_port, u8 tos) +{ + struct rtable *rt; + struct flowi4 fl4; + struct neighbour *n; + + rt = ip_route_output_ports(&init_net, &fl4, NULL, peer_ip, local_ip, + peer_port, local_port, IPPROTO_TCP, + tos, 0); + if (IS_ERR(rt)) + return NULL; + n = dst_neigh_lookup(&rt->dst, &peer_ip); + if (!n) + return NULL; + if (!our_interface(dev, n->dev) && + !(n->dev->flags & IFF_LOOPBACK)) { + neigh_release(n); + dst_release(&rt->dst); + return NULL; + } + neigh_release(n); + return &rt->dst; +} + +static void arp_failure_discard(void *handle, struct sk_buff *skb) +{ + PDBG("%s c4iw_dev %p\n", __func__, handle); + kfree_skb(skb); +} + +/* + * Handle an ARP failure for an active open. + */ +static void act_open_req_arp_failure(void *handle, struct sk_buff *skb) +{ + struct c4iw_ep *ep = handle; + + printk(KERN_ERR MOD "ARP failure duing connect\n"); + kfree_skb(skb); + connect_reply_upcall(ep, -EHOSTUNREACH); + state_set(&ep->com, DEAD); + remove_handle(ep->com.dev, &ep->com.dev->atid_idr, ep->atid); + cxgb4_free_atid(ep->com.dev->rdev.lldi.tids, ep->atid); + dst_release(ep->dst); + cxgb4_l2t_release(ep->l2t); + c4iw_put_ep(&ep->com); +} + +/* + * Handle an ARP failure for a CPL_ABORT_REQ. Change it into a no RST variant + * and send it along. + */ +static void abort_arp_failure(void *handle, struct sk_buff *skb) +{ + struct c4iw_rdev *rdev = handle; + struct cpl_abort_req *req = cplhdr(skb); + + PDBG("%s rdev %p\n", __func__, rdev); + req->cmd = CPL_ABORT_NO_RST; + c4iw_ofld_send(rdev, skb); +} + +static void send_flowc(struct c4iw_ep *ep, struct sk_buff *skb) +{ + unsigned int flowclen = 80; + struct fw_flowc_wr *flowc; + int i; + + skb = get_skb(skb, flowclen, GFP_KERNEL); + flowc = (struct fw_flowc_wr *)__skb_put(skb, flowclen); + + flowc->op_to_nparams = cpu_to_be32(FW_WR_OP(FW_FLOWC_WR) | + FW_FLOWC_WR_NPARAMS(8)); + flowc->flowid_len16 = cpu_to_be32(FW_WR_LEN16(DIV_ROUND_UP(flowclen, + 16)) | FW_WR_FLOWID(ep->hwtid)); + + flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_PFNVFN; + flowc->mnemval[0].val = cpu_to_be32(FW_PFVF_CMD_PFN + (ep->com.dev->rdev.lldi.pf)); + flowc->mnemval[1].mnemonic = FW_FLOWC_MNEM_CH; + flowc->mnemval[1].val = cpu_to_be32(ep->tx_chan); + flowc->mnemval[2].mnemonic = FW_FLOWC_MNEM_PORT; + flowc->mnemval[2].val = cpu_to_be32(ep->tx_chan); + flowc->mnemval[3].mnemonic = FW_FLOWC_MNEM_IQID; + flowc->mnemval[3].val = cpu_to_be32(ep->rss_qid); + flowc->mnemval[4].mnemonic = FW_FLOWC_MNEM_SNDNXT; + flowc->mnemval[4].val = cpu_to_be32(ep->snd_seq); + flowc->mnemval[5].mnemonic = FW_FLOWC_MNEM_RCVNXT; + flowc->mnemval[5].val = cpu_to_be32(ep->rcv_seq); + flowc->mnemval[6].mnemonic = FW_FLOWC_MNEM_SNDBUF; + flowc->mnemval[6].val = cpu_to_be32(ep->snd_win); + flowc->mnemval[7].mnemonic = FW_FLOWC_MNEM_MSS; + flowc->mnemval[7].val = cpu_to_be32(ep->emss); + /* Pad WR to 16 byte boundary */ + flowc->mnemval[8].mnemonic = 0; + flowc->mnemval[8].val = 0; + for (i = 0; i < 9; i++) { + flowc->mnemval[i].r4[0] = 0; + flowc->mnemval[i].r4[1] = 0; + flowc->mnemval[i].r4[2] = 0; + } + + set_wr_txq(skb, CPL_PRIORITY_DATA, ep->txq_idx); + c4iw_ofld_send(&ep->com.dev->rdev, skb); +} + +static int send_halfclose(struct c4iw_ep *ep, gfp_t gfp) +{ + struct cpl_close_con_req *req; + struct sk_buff *skb; + int wrlen = roundup(sizeof *req, 16); + + PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid); + skb = get_skb(NULL, wrlen, gfp); + if (!skb) { + printk(KERN_ERR MOD "%s - failed to alloc skb\n", __func__); + return -ENOMEM; + } + set_wr_txq(skb, CPL_PRIORITY_DATA, ep->txq_idx); + t4_set_arp_err_handler(skb, NULL, arp_failure_discard); + req = (struct cpl_close_con_req *) skb_put(skb, wrlen); + memset(req, 0, wrlen); + INIT_TP_WR(req, ep->hwtid); + OPCODE_TID(req) = cpu_to_be32(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, + ep->hwtid)); + return c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t); +} + +static int send_abort(struct c4iw_ep *ep, struct sk_buff *skb, gfp_t gfp) +{ + struct cpl_abort_req *req; + int wrlen = roundup(sizeof *req, 16); + + PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid); + skb = get_skb(skb, wrlen, gfp); + if (!skb) { + printk(KERN_ERR MOD "%s - failed to alloc skb.\n", + __func__); + return -ENOMEM; + } + set_wr_txq(skb, CPL_PRIORITY_DATA, ep->txq_idx); + t4_set_arp_err_handler(skb, &ep->com.dev->rdev, abort_arp_failure); + req = (struct cpl_abort_req *) skb_put(skb, wrlen); + memset(req, 0, wrlen); + INIT_TP_WR(req, ep->hwtid); + OPCODE_TID(req) = cpu_to_be32(MK_OPCODE_TID(CPL_ABORT_REQ, ep->hwtid)); + req->cmd = CPL_ABORT_SEND_RST; + return c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t); +} + +/* + * c4iw_form_pm_msg - Form a port mapper message with mapping info + */ +static void c4iw_form_pm_msg(struct c4iw_ep *ep, + struct iwpm_sa_data *pm_msg) +{ + memcpy(&pm_msg->loc_addr, &ep->com.local_addr, + sizeof(ep->com.local_addr)); + memcpy(&pm_msg->rem_addr, &ep->com.remote_addr, + sizeof(ep->com.remote_addr)); +} + +/* + * c4iw_form_reg_msg - Form a port mapper message with dev info + */ +static void c4iw_form_reg_msg(struct c4iw_dev *dev, + struct iwpm_dev_data *pm_msg) +{ + memcpy(pm_msg->dev_name, dev->ibdev.name, IWPM_DEVNAME_SIZE); + memcpy(pm_msg->if_name, dev->rdev.lldi.ports[0]->name, + IWPM_IFNAME_SIZE); +} + +static void c4iw_record_pm_msg(struct c4iw_ep *ep, + struct iwpm_sa_data *pm_msg) +{ + memcpy(&ep->com.mapped_local_addr, &pm_msg->mapped_loc_addr, + sizeof(ep->com.mapped_local_addr)); + memcpy(&ep->com.mapped_remote_addr, &pm_msg->mapped_rem_addr, + sizeof(ep->com.mapped_remote_addr)); +} + +static void best_mtu(const unsigned short *mtus, unsigned short mtu, + unsigned int *idx, int use_ts, int ipv6) +{ + unsigned short hdr_size = (ipv6 ? + sizeof(struct ipv6hdr) : + sizeof(struct iphdr)) + + sizeof(struct tcphdr) + + (use_ts ? + round_up(TCPOLEN_TIMESTAMP, 4) : 0); + unsigned short data_size = mtu - hdr_size; + + cxgb4_best_aligned_mtu(mtus, hdr_size, data_size, 8, idx); +} + +static int send_connect(struct c4iw_ep *ep) +{ + struct cpl_act_open_req *req; + struct cpl_t5_act_open_req *t5_req; + struct cpl_act_open_req6 *req6; + struct cpl_t5_act_open_req6 *t5_req6; + struct sk_buff *skb; + u64 opt0; + u32 opt2; + unsigned int mtu_idx; + int wscale; + int wrlen; + int sizev4 = is_t4(ep->com.dev->rdev.lldi.adapter_type) ? + sizeof(struct cpl_act_open_req) : + sizeof(struct cpl_t5_act_open_req); + int sizev6 = is_t4(ep->com.dev->rdev.lldi.adapter_type) ? + sizeof(struct cpl_act_open_req6) : + sizeof(struct cpl_t5_act_open_req6); + struct sockaddr_in *la = (struct sockaddr_in *) + &ep->com.mapped_local_addr; + struct sockaddr_in *ra = (struct sockaddr_in *) + &ep->com.mapped_remote_addr; + struct sockaddr_in6 *la6 = (struct sockaddr_in6 *) + &ep->com.mapped_local_addr; + struct sockaddr_in6 *ra6 = (struct sockaddr_in6 *) + &ep->com.mapped_remote_addr; + int win; + + wrlen = (ep->com.remote_addr.ss_family == AF_INET) ? + roundup(sizev4, 16) : + roundup(sizev6, 16); + + PDBG("%s ep %p atid %u\n", __func__, ep, ep->atid); + + skb = get_skb(NULL, wrlen, GFP_KERNEL); + if (!skb) { + printk(KERN_ERR MOD "%s - failed to alloc skb.\n", + __func__); + return -ENOMEM; + } + set_wr_txq(skb, CPL_PRIORITY_SETUP, ep->ctrlq_idx); + + best_mtu(ep->com.dev->rdev.lldi.mtus, ep->mtu, &mtu_idx, + enable_tcp_timestamps, + (AF_INET == ep->com.remote_addr.ss_family) ? 0 : 1); + wscale = compute_wscale(rcv_win); + + /* + * Specify the largest window that will fit in opt0. The + * remainder will be specified in the rx_data_ack. + */ + win = ep->rcv_win >> 10; + if (win > RCV_BUFSIZ_MASK) + win = RCV_BUFSIZ_MASK; + + opt0 = (nocong ? NO_CONG(1) : 0) | + KEEP_ALIVE(1) | + DELACK(1) | + WND_SCALE(wscale) | + MSS_IDX(mtu_idx) | + L2T_IDX(ep->l2t->idx) | + TX_CHAN(ep->tx_chan) | + SMAC_SEL(ep->smac_idx) | + DSCP(ep->tos) | + ULP_MODE(ULP_MODE_TCPDDP) | + RCV_BUFSIZ(win); + opt2 = RX_CHANNEL(0) | + CCTRL_ECN(enable_ecn) | + RSS_QUEUE_VALID | RSS_QUEUE(ep->rss_qid); + if (enable_tcp_timestamps) + opt2 |= TSTAMPS_EN(1); + if (enable_tcp_sack) + opt2 |= SACK_EN(1); + if (wscale && enable_tcp_window_scaling) + opt2 |= WND_SCALE_EN(1); + if (is_t5(ep->com.dev->rdev.lldi.adapter_type)) { + opt2 |= T5_OPT_2_VALID; + opt2 |= V_CONG_CNTRL(CONG_ALG_TAHOE); + opt2 |= CONG_CNTRL_VALID; /* OPT_2_ISS for T5 */ + } + t4_set_arp_err_handler(skb, ep, act_open_req_arp_failure); + + if (is_t4(ep->com.dev->rdev.lldi.adapter_type)) { + if (ep->com.remote_addr.ss_family == AF_INET) { + req = (struct cpl_act_open_req *) skb_put(skb, wrlen); + INIT_TP_WR(req, 0); + OPCODE_TID(req) = cpu_to_be32( + MK_OPCODE_TID(CPL_ACT_OPEN_REQ, + ((ep->rss_qid << 14) | ep->atid))); + req->local_port = la->sin_port; + req->peer_port = ra->sin_port; + req->local_ip = la->sin_addr.s_addr; + req->peer_ip = ra->sin_addr.s_addr; + req->opt0 = cpu_to_be64(opt0); + req->params = cpu_to_be32(cxgb4_select_ntuple( + ep->com.dev->rdev.lldi.ports[0], + ep->l2t)); + req->opt2 = cpu_to_be32(opt2); + } else { + req6 = (struct cpl_act_open_req6 *)skb_put(skb, wrlen); + + INIT_TP_WR(req6, 0); + OPCODE_TID(req6) = cpu_to_be32( + MK_OPCODE_TID(CPL_ACT_OPEN_REQ6, + ((ep->rss_qid<<14)|ep->atid))); + req6->local_port = la6->sin6_port; + req6->peer_port = ra6->sin6_port; + req6->local_ip_hi = *((__be64 *) + (la6->sin6_addr.s6_addr)); + req6->local_ip_lo = *((__be64 *) + (la6->sin6_addr.s6_addr + 8)); + req6->peer_ip_hi = *((__be64 *) + (ra6->sin6_addr.s6_addr)); + req6->peer_ip_lo = *((__be64 *) + (ra6->sin6_addr.s6_addr + 8)); + req6->opt0 = cpu_to_be64(opt0); + req6->params = cpu_to_be32(cxgb4_select_ntuple( + ep->com.dev->rdev.lldi.ports[0], + ep->l2t)); + req6->opt2 = cpu_to_be32(opt2); + } + } else { + u32 isn = (prandom_u32() & ~7UL) - 1; + + if (peer2peer) + isn += 4; + + if (ep->com.remote_addr.ss_family == AF_INET) { + t5_req = (struct cpl_t5_act_open_req *) + skb_put(skb, wrlen); + INIT_TP_WR(t5_req, 0); + OPCODE_TID(t5_req) = cpu_to_be32( + MK_OPCODE_TID(CPL_ACT_OPEN_REQ, + ((ep->rss_qid << 14) | ep->atid))); + t5_req->local_port = la->sin_port; + t5_req->peer_port = ra->sin_port; + t5_req->local_ip = la->sin_addr.s_addr; + t5_req->peer_ip = ra->sin_addr.s_addr; + t5_req->opt0 = cpu_to_be64(opt0); + t5_req->params = cpu_to_be64(V_FILTER_TUPLE( + cxgb4_select_ntuple( + ep->com.dev->rdev.lldi.ports[0], + ep->l2t))); + t5_req->rsvd = cpu_to_be32(isn); + PDBG("%s snd_isn %u\n", __func__, + be32_to_cpu(t5_req->rsvd)); + t5_req->opt2 = cpu_to_be32(opt2); + } else { + t5_req6 = (struct cpl_t5_act_open_req6 *) + skb_put(skb, wrlen); + INIT_TP_WR(t5_req6, 0); + OPCODE_TID(t5_req6) = cpu_to_be32( + MK_OPCODE_TID(CPL_ACT_OPEN_REQ6, + ((ep->rss_qid<<14)|ep->atid))); + t5_req6->local_port = la6->sin6_port; + t5_req6->peer_port = ra6->sin6_port; + t5_req6->local_ip_hi = *((__be64 *) + (la6->sin6_addr.s6_addr)); + t5_req6->local_ip_lo = *((__be64 *) + (la6->sin6_addr.s6_addr + 8)); + t5_req6->peer_ip_hi = *((__be64 *) + (ra6->sin6_addr.s6_addr)); + t5_req6->peer_ip_lo = *((__be64 *) + (ra6->sin6_addr.s6_addr + 8)); + t5_req6->opt0 = cpu_to_be64(opt0); + t5_req6->params = cpu_to_be64(V_FILTER_TUPLE( + cxgb4_select_ntuple( + ep->com.dev->rdev.lldi.ports[0], + ep->l2t))); + t5_req6->rsvd = cpu_to_be32(isn); + PDBG("%s snd_isn %u\n", __func__, + be32_to_cpu(t5_req6->rsvd)); + t5_req6->opt2 = cpu_to_be32(opt2); + } + } + + set_bit(ACT_OPEN_REQ, &ep->com.history); + return c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t); +} + +static void send_mpa_req(struct c4iw_ep *ep, struct sk_buff *skb, + u8 mpa_rev_to_use) +{ + int mpalen, wrlen; + struct fw_ofld_tx_data_wr *req; + struct mpa_message *mpa; + struct mpa_v2_conn_params mpa_v2_params; + + PDBG("%s ep %p tid %u pd_len %d\n", __func__, ep, ep->hwtid, ep->plen); + + BUG_ON(skb_cloned(skb)); + + mpalen = sizeof(*mpa) + ep->plen; + if (mpa_rev_to_use == 2) + mpalen += sizeof(struct mpa_v2_conn_params); + wrlen = roundup(mpalen + sizeof *req, 16); + skb = get_skb(skb, wrlen, GFP_KERNEL); + if (!skb) { + connect_reply_upcall(ep, -ENOMEM); + return; + } + set_wr_txq(skb, CPL_PRIORITY_DATA, ep->txq_idx); + + req = (struct fw_ofld_tx_data_wr *)skb_put(skb, wrlen); + memset(req, 0, wrlen); + req->op_to_immdlen = cpu_to_be32( + FW_WR_OP(FW_OFLD_TX_DATA_WR) | + FW_WR_COMPL(1) | + FW_WR_IMMDLEN(mpalen)); + req->flowid_len16 = cpu_to_be32( + FW_WR_FLOWID(ep->hwtid) | + FW_WR_LEN16(wrlen >> 4)); + req->plen = cpu_to_be32(mpalen); + req->tunnel_to_proxy = cpu_to_be32( + FW_OFLD_TX_DATA_WR_FLUSH(1) | + FW_OFLD_TX_DATA_WR_SHOVE(1)); + + mpa = (struct mpa_message *)(req + 1); + memcpy(mpa->key, MPA_KEY_REQ, sizeof(mpa->key)); + mpa->flags = (crc_enabled ? MPA_CRC : 0) | + (markers_enabled ? MPA_MARKERS : 0) | + (mpa_rev_to_use == 2 ? MPA_ENHANCED_RDMA_CONN : 0); + mpa->private_data_size = htons(ep->plen); + mpa->revision = mpa_rev_to_use; + if (mpa_rev_to_use == 1) { + ep->tried_with_mpa_v1 = 1; + ep->retry_with_mpa_v1 = 0; + } + + if (mpa_rev_to_use == 2) { + mpa->private_data_size = htons(ntohs(mpa->private_data_size) + + sizeof (struct mpa_v2_conn_params)); + PDBG("%s initiator ird %u ord %u\n", __func__, ep->ird, + ep->ord); + mpa_v2_params.ird = htons((u16)ep->ird); + mpa_v2_params.ord = htons((u16)ep->ord); + + if (peer2peer) { + mpa_v2_params.ird |= htons(MPA_V2_PEER2PEER_MODEL); + if (p2p_type == FW_RI_INIT_P2PTYPE_RDMA_WRITE) + mpa_v2_params.ord |= + htons(MPA_V2_RDMA_WRITE_RTR); + else if (p2p_type == FW_RI_INIT_P2PTYPE_READ_REQ) + mpa_v2_params.ord |= + htons(MPA_V2_RDMA_READ_RTR); + } + memcpy(mpa->private_data, &mpa_v2_params, + sizeof(struct mpa_v2_conn_params)); + + if (ep->plen) + memcpy(mpa->private_data + + sizeof(struct mpa_v2_conn_params), + ep->mpa_pkt + sizeof(*mpa), ep->plen); + } else + if (ep->plen) + memcpy(mpa->private_data, + ep->mpa_pkt + sizeof(*mpa), ep->plen); + + /* + * Reference the mpa skb. This ensures the data area + * will remain in memory until the hw acks the tx. + * Function fw4_ack() will deref it. + */ + skb_get(skb); + t4_set_arp_err_handler(skb, NULL, arp_failure_discard); + BUG_ON(ep->mpa_skb); + ep->mpa_skb = skb; + c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t); + start_ep_timer(ep); + __state_set(&ep->com, MPA_REQ_SENT); + ep->mpa_attr.initiator = 1; + ep->snd_seq += mpalen; + return; +} + +static int send_mpa_reject(struct c4iw_ep *ep, const void *pdata, u8 plen) +{ + int mpalen, wrlen; + struct fw_ofld_tx_data_wr *req; + struct mpa_message *mpa; + struct sk_buff *skb; + struct mpa_v2_conn_params mpa_v2_params; + + PDBG("%s ep %p tid %u pd_len %d\n", __func__, ep, ep->hwtid, ep->plen); + + mpalen = sizeof(*mpa) + plen; + if (ep->mpa_attr.version == 2 && ep->mpa_attr.enhanced_rdma_conn) + mpalen += sizeof(struct mpa_v2_conn_params); + wrlen = roundup(mpalen + sizeof *req, 16); + + skb = get_skb(NULL, wrlen, GFP_KERNEL); + if (!skb) { + printk(KERN_ERR MOD "%s - cannot alloc skb!\n", __func__); + return -ENOMEM; + } + set_wr_txq(skb, CPL_PRIORITY_DATA, ep->txq_idx); + + req = (struct fw_ofld_tx_data_wr *)skb_put(skb, wrlen); + memset(req, 0, wrlen); + req->op_to_immdlen = cpu_to_be32( + FW_WR_OP(FW_OFLD_TX_DATA_WR) | + FW_WR_COMPL(1) | + FW_WR_IMMDLEN(mpalen)); + req->flowid_len16 = cpu_to_be32( + FW_WR_FLOWID(ep->hwtid) | + FW_WR_LEN16(wrlen >> 4)); + req->plen = cpu_to_be32(mpalen); + req->tunnel_to_proxy = cpu_to_be32( + FW_OFLD_TX_DATA_WR_FLUSH(1) | + FW_OFLD_TX_DATA_WR_SHOVE(1)); + + mpa = (struct mpa_message *)(req + 1); + memset(mpa, 0, sizeof(*mpa)); + memcpy(mpa->key, MPA_KEY_REP, sizeof(mpa->key)); + mpa->flags = MPA_REJECT; + mpa->revision = ep->mpa_attr.version; + mpa->private_data_size = htons(plen); + + if (ep->mpa_attr.version == 2 && ep->mpa_attr.enhanced_rdma_conn) { + mpa->flags |= MPA_ENHANCED_RDMA_CONN; + mpa->private_data_size = htons(ntohs(mpa->private_data_size) + + sizeof (struct mpa_v2_conn_params)); + mpa_v2_params.ird = htons(((u16)ep->ird) | + (peer2peer ? MPA_V2_PEER2PEER_MODEL : + 0)); + mpa_v2_params.ord = htons(((u16)ep->ord) | (peer2peer ? + (p2p_type == + FW_RI_INIT_P2PTYPE_RDMA_WRITE ? + MPA_V2_RDMA_WRITE_RTR : p2p_type == + FW_RI_INIT_P2PTYPE_READ_REQ ? + MPA_V2_RDMA_READ_RTR : 0) : 0)); + memcpy(mpa->private_data, &mpa_v2_params, + sizeof(struct mpa_v2_conn_params)); + + if (ep->plen) + memcpy(mpa->private_data + + sizeof(struct mpa_v2_conn_params), pdata, plen); + } else + if (plen) + memcpy(mpa->private_data, pdata, plen); + + /* + * Reference the mpa skb again. This ensures the data area + * will remain in memory until the hw acks the tx. + * Function fw4_ack() will deref it. + */ + skb_get(skb); + set_wr_txq(skb, CPL_PRIORITY_DATA, ep->txq_idx); + t4_set_arp_err_handler(skb, NULL, arp_failure_discard); + BUG_ON(ep->mpa_skb); + ep->mpa_skb = skb; + ep->snd_seq += mpalen; + return c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t); +} + +static int send_mpa_reply(struct c4iw_ep *ep, const void *pdata, u8 plen) +{ + int mpalen, wrlen; + struct fw_ofld_tx_data_wr *req; + struct mpa_message *mpa; + struct sk_buff *skb; + struct mpa_v2_conn_params mpa_v2_params; + + PDBG("%s ep %p tid %u pd_len %d\n", __func__, ep, ep->hwtid, ep->plen); + + mpalen = sizeof(*mpa) + plen; + if (ep->mpa_attr.version == 2 && ep->mpa_attr.enhanced_rdma_conn) + mpalen += sizeof(struct mpa_v2_conn_params); + wrlen = roundup(mpalen + sizeof *req, 16); + + skb = get_skb(NULL, wrlen, GFP_KERNEL); + if (!skb) { + printk(KERN_ERR MOD "%s - cannot alloc skb!\n", __func__); + return -ENOMEM; + } + set_wr_txq(skb, CPL_PRIORITY_DATA, ep->txq_idx); + + req = (struct fw_ofld_tx_data_wr *) skb_put(skb, wrlen); + memset(req, 0, wrlen); + req->op_to_immdlen = cpu_to_be32( + FW_WR_OP(FW_OFLD_TX_DATA_WR) | + FW_WR_COMPL(1) | + FW_WR_IMMDLEN(mpalen)); + req->flowid_len16 = cpu_to_be32( + FW_WR_FLOWID(ep->hwtid) | + FW_WR_LEN16(wrlen >> 4)); + req->plen = cpu_to_be32(mpalen); + req->tunnel_to_proxy = cpu_to_be32( + FW_OFLD_TX_DATA_WR_FLUSH(1) | + FW_OFLD_TX_DATA_WR_SHOVE(1)); + + mpa = (struct mpa_message *)(req + 1); + memset(mpa, 0, sizeof(*mpa)); + memcpy(mpa->key, MPA_KEY_REP, sizeof(mpa->key)); + mpa->flags = (ep->mpa_attr.crc_enabled ? MPA_CRC : 0) | + (markers_enabled ? MPA_MARKERS : 0); + mpa->revision = ep->mpa_attr.version; + mpa->private_data_size = htons(plen); + + if (ep->mpa_attr.version == 2 && ep->mpa_attr.enhanced_rdma_conn) { + mpa->flags |= MPA_ENHANCED_RDMA_CONN; + mpa->private_data_size = htons(ntohs(mpa->private_data_size) + + sizeof (struct mpa_v2_conn_params)); + mpa_v2_params.ird = htons((u16)ep->ird); + mpa_v2_params.ord = htons((u16)ep->ord); + if (peer2peer && (ep->mpa_attr.p2p_type != + FW_RI_INIT_P2PTYPE_DISABLED)) { + mpa_v2_params.ird |= htons(MPA_V2_PEER2PEER_MODEL); + + if (p2p_type == FW_RI_INIT_P2PTYPE_RDMA_WRITE) + mpa_v2_params.ord |= + htons(MPA_V2_RDMA_WRITE_RTR); + else if (p2p_type == FW_RI_INIT_P2PTYPE_READ_REQ) + mpa_v2_params.ord |= + htons(MPA_V2_RDMA_READ_RTR); + } + + memcpy(mpa->private_data, &mpa_v2_params, + sizeof(struct mpa_v2_conn_params)); + + if (ep->plen) + memcpy(mpa->private_data + + sizeof(struct mpa_v2_conn_params), pdata, plen); + } else + if (plen) + memcpy(mpa->private_data, pdata, plen); + + /* + * Reference the mpa skb. This ensures the data area + * will remain in memory until the hw acks the tx. + * Function fw4_ack() will deref it. + */ + skb_get(skb); + t4_set_arp_err_handler(skb, NULL, arp_failure_discard); + ep->mpa_skb = skb; + __state_set(&ep->com, MPA_REP_SENT); + ep->snd_seq += mpalen; + return c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t); +} + +static int act_establish(struct c4iw_dev *dev, struct sk_buff *skb) +{ + struct c4iw_ep *ep; + struct cpl_act_establish *req = cplhdr(skb); + unsigned int tid = GET_TID(req); + unsigned int atid = GET_TID_TID(ntohl(req->tos_atid)); + struct tid_info *t = dev->rdev.lldi.tids; + + ep = lookup_atid(t, atid); + + PDBG("%s ep %p tid %u snd_isn %u rcv_isn %u\n", __func__, ep, tid, + be32_to_cpu(req->snd_isn), be32_to_cpu(req->rcv_isn)); + + mutex_lock(&ep->com.mutex); + dst_confirm(ep->dst); + + /* setup the hwtid for this connection */ + ep->hwtid = tid; + cxgb4_insert_tid(t, ep, tid); + insert_handle(dev, &dev->hwtid_idr, ep, ep->hwtid); + + ep->snd_seq = be32_to_cpu(req->snd_isn); + ep->rcv_seq = be32_to_cpu(req->rcv_isn); + + set_emss(ep, ntohs(req->tcp_opt)); + + /* dealloc the atid */ + remove_handle(ep->com.dev, &ep->com.dev->atid_idr, atid); + cxgb4_free_atid(t, atid); + set_bit(ACT_ESTAB, &ep->com.history); + + /* start MPA negotiation */ + send_flowc(ep, NULL); + if (ep->retry_with_mpa_v1) + send_mpa_req(ep, skb, 1); + else + send_mpa_req(ep, skb, mpa_rev); + mutex_unlock(&ep->com.mutex); + return 0; +} + +static void close_complete_upcall(struct c4iw_ep *ep, int status) +{ + struct iw_cm_event event; + + PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid); + memset(&event, 0, sizeof(event)); + event.event = IW_CM_EVENT_CLOSE; + event.status = status; + if (ep->com.cm_id) { + PDBG("close complete delivered ep %p cm_id %p tid %u\n", + ep, ep->com.cm_id, ep->hwtid); + ep->com.cm_id->event_handler(ep->com.cm_id, &event); + ep->com.cm_id->rem_ref(ep->com.cm_id); + ep->com.cm_id = NULL; + set_bit(CLOSE_UPCALL, &ep->com.history); + } +} + +static int abort_connection(struct c4iw_ep *ep, struct sk_buff *skb, gfp_t gfp) +{ + PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid); + __state_set(&ep->com, ABORTING); + set_bit(ABORT_CONN, &ep->com.history); + return send_abort(ep, skb, gfp); +} + +static void peer_close_upcall(struct c4iw_ep *ep) +{ + struct iw_cm_event event; + + PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid); + memset(&event, 0, sizeof(event)); + event.event = IW_CM_EVENT_DISCONNECT; + if (ep->com.cm_id) { + PDBG("peer close delivered ep %p cm_id %p tid %u\n", + ep, ep->com.cm_id, ep->hwtid); + ep->com.cm_id->event_handler(ep->com.cm_id, &event); + set_bit(DISCONN_UPCALL, &ep->com.history); + } +} + +static void peer_abort_upcall(struct c4iw_ep *ep) +{ + struct iw_cm_event event; + + PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid); + memset(&event, 0, sizeof(event)); + event.event = IW_CM_EVENT_CLOSE; + event.status = -ECONNRESET; + if (ep->com.cm_id) { + PDBG("abort delivered ep %p cm_id %p tid %u\n", ep, + ep->com.cm_id, ep->hwtid); + ep->com.cm_id->event_handler(ep->com.cm_id, &event); + ep->com.cm_id->rem_ref(ep->com.cm_id); + ep->com.cm_id = NULL; + set_bit(ABORT_UPCALL, &ep->com.history); + } +} + +static void connect_reply_upcall(struct c4iw_ep *ep, int status) +{ + struct iw_cm_event event; + + PDBG("%s ep %p tid %u status %d\n", __func__, ep, ep->hwtid, status); + memset(&event, 0, sizeof(event)); + event.event = IW_CM_EVENT_CONNECT_REPLY; + event.status = status; + memcpy(&event.local_addr, &ep->com.local_addr, + sizeof(ep->com.local_addr)); + memcpy(&event.remote_addr, &ep->com.remote_addr, + sizeof(ep->com.remote_addr)); + + if ((status == 0) || (status == -ECONNREFUSED)) { + if (!ep->tried_with_mpa_v1) { + /* this means MPA_v2 is used */ + event.private_data_len = ep->plen - + sizeof(struct mpa_v2_conn_params); + event.private_data = ep->mpa_pkt + + sizeof(struct mpa_message) + + sizeof(struct mpa_v2_conn_params); + } else { + /* this means MPA_v1 is used */ + event.private_data_len = ep->plen; + event.private_data = ep->mpa_pkt + + sizeof(struct mpa_message); + } + } + + PDBG("%s ep %p tid %u status %d\n", __func__, ep, + ep->hwtid, status); + set_bit(CONN_RPL_UPCALL, &ep->com.history); + ep->com.cm_id->event_handler(ep->com.cm_id, &event); + + if (status < 0) { + ep->com.cm_id->rem_ref(ep->com.cm_id); + ep->com.cm_id = NULL; + } +} + +static int connect_request_upcall(struct c4iw_ep *ep) +{ + struct iw_cm_event event; + int ret; + + PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid); + memset(&event, 0, sizeof(event)); + event.event = IW_CM_EVENT_CONNECT_REQUEST; + memcpy(&event.local_addr, &ep->com.local_addr, + sizeof(ep->com.local_addr)); + memcpy(&event.remote_addr, &ep->com.remote_addr, + sizeof(ep->com.remote_addr)); + event.provider_data = ep; + if (!ep->tried_with_mpa_v1) { + /* this means MPA_v2 is used */ + event.ord = ep->ord; + event.ird = ep->ird; + event.private_data_len = ep->plen - + sizeof(struct mpa_v2_conn_params); + event.private_data = ep->mpa_pkt + sizeof(struct mpa_message) + + sizeof(struct mpa_v2_conn_params); + } else { + /* this means MPA_v1 is used. Send max supported */ + event.ord = cur_max_read_depth(ep->com.dev); + event.ird = cur_max_read_depth(ep->com.dev); + event.private_data_len = ep->plen; + event.private_data = ep->mpa_pkt + sizeof(struct mpa_message); + } + c4iw_get_ep(&ep->com); + ret = ep->parent_ep->com.cm_id->event_handler(ep->parent_ep->com.cm_id, + &event); + if (ret) + c4iw_put_ep(&ep->com); + set_bit(CONNREQ_UPCALL, &ep->com.history); + c4iw_put_ep(&ep->parent_ep->com); + return ret; +} + +static void established_upcall(struct c4iw_ep *ep) +{ + struct iw_cm_event event; + + PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid); + memset(&event, 0, sizeof(event)); + event.event = IW_CM_EVENT_ESTABLISHED; + event.ird = ep->ird; + event.ord = ep->ord; + if (ep->com.cm_id) { + PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid); + ep->com.cm_id->event_handler(ep->com.cm_id, &event); + set_bit(ESTAB_UPCALL, &ep->com.history); + } +} + +static int update_rx_credits(struct c4iw_ep *ep, u32 credits) +{ + struct cpl_rx_data_ack *req; + struct sk_buff *skb; + int wrlen = roundup(sizeof *req, 16); + + PDBG("%s ep %p tid %u credits %u\n", __func__, ep, ep->hwtid, credits); + skb = get_skb(NULL, wrlen, GFP_KERNEL); + if (!skb) { + printk(KERN_ERR MOD "update_rx_credits - cannot alloc skb!\n"); + return 0; + } + + /* + * If we couldn't specify the entire rcv window at connection setup + * due to the limit in the number of bits in the RCV_BUFSIZ field, + * then add the overage in to the credits returned. + */ + if (ep->rcv_win > RCV_BUFSIZ_MASK * 1024) + credits += ep->rcv_win - RCV_BUFSIZ_MASK * 1024; + + req = (struct cpl_rx_data_ack *) skb_put(skb, wrlen); + memset(req, 0, wrlen); + INIT_TP_WR(req, ep->hwtid); + OPCODE_TID(req) = cpu_to_be32(MK_OPCODE_TID(CPL_RX_DATA_ACK, + ep->hwtid)); + req->credit_dack = cpu_to_be32(credits | RX_FORCE_ACK(1) | + F_RX_DACK_CHANGE | + V_RX_DACK_MODE(dack_mode)); + set_wr_txq(skb, CPL_PRIORITY_ACK, ep->ctrlq_idx); + c4iw_ofld_send(&ep->com.dev->rdev, skb); + return credits; +} + +#define RELAXED_IRD_NEGOTIATION 1 + +static int process_mpa_reply(struct c4iw_ep *ep, struct sk_buff *skb) +{ + struct mpa_message *mpa; + struct mpa_v2_conn_params *mpa_v2_params; + u16 plen; + u16 resp_ird, resp_ord; + u8 rtr_mismatch = 0, insuff_ird = 0; + struct c4iw_qp_attributes attrs; + enum c4iw_qp_attr_mask mask; + int err; + int disconnect = 0; + + PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid); + + /* + * Stop mpa timer. If it expired, then + * we ignore the MPA reply. process_timeout() + * will abort the connection. + */ + if (stop_ep_timer(ep)) + return 0; + + /* + * If we get more than the supported amount of private data + * then we must fail this connection. + */ + if (ep->mpa_pkt_len + skb->len > sizeof(ep->mpa_pkt)) { + err = -EINVAL; + goto err; + } + + /* + * copy the new data into our accumulation buffer. + */ + skb_copy_from_linear_data(skb, &(ep->mpa_pkt[ep->mpa_pkt_len]), + skb->len); + ep->mpa_pkt_len += skb->len; + + /* + * if we don't even have the mpa message, then bail. + */ + if (ep->mpa_pkt_len < sizeof(*mpa)) + return 0; + mpa = (struct mpa_message *) ep->mpa_pkt; + + /* Validate MPA header. */ + if (mpa->revision > mpa_rev) { + printk(KERN_ERR MOD "%s MPA version mismatch. Local = %d," + " Received = %d\n", __func__, mpa_rev, mpa->revision); + err = -EPROTO; + goto err; + } + if (memcmp(mpa->key, MPA_KEY_REP, sizeof(mpa->key))) { + err = -EPROTO; + goto err; + } + + plen = ntohs(mpa->private_data_size); + + /* + * Fail if there's too much private data. + */ + if (plen > MPA_MAX_PRIVATE_DATA) { + err = -EPROTO; + goto err; + } + + /* + * If plen does not account for pkt size + */ + if (ep->mpa_pkt_len > (sizeof(*mpa) + plen)) { + err = -EPROTO; + goto err; + } + + ep->plen = (u8) plen; + + /* + * If we don't have all the pdata yet, then bail. + * We'll continue process when more data arrives. + */ + if (ep->mpa_pkt_len < (sizeof(*mpa) + plen)) + return 0; + + if (mpa->flags & MPA_REJECT) { + err = -ECONNREFUSED; + goto err; + } + + /* + * If we get here we have accumulated the entire mpa + * start reply message including private data. And + * the MPA header is valid. + */ + __state_set(&ep->com, FPDU_MODE); + ep->mpa_attr.crc_enabled = (mpa->flags & MPA_CRC) | crc_enabled ? 1 : 0; + ep->mpa_attr.recv_marker_enabled = markers_enabled; + ep->mpa_attr.xmit_marker_enabled = mpa->flags & MPA_MARKERS ? 1 : 0; + ep->mpa_attr.version = mpa->revision; + ep->mpa_attr.p2p_type = FW_RI_INIT_P2PTYPE_DISABLED; + + if (mpa->revision == 2) { + ep->mpa_attr.enhanced_rdma_conn = + mpa->flags & MPA_ENHANCED_RDMA_CONN ? 1 : 0; + if (ep->mpa_attr.enhanced_rdma_conn) { + mpa_v2_params = (struct mpa_v2_conn_params *) + (ep->mpa_pkt + sizeof(*mpa)); + resp_ird = ntohs(mpa_v2_params->ird) & + MPA_V2_IRD_ORD_MASK; + resp_ord = ntohs(mpa_v2_params->ord) & + MPA_V2_IRD_ORD_MASK; + PDBG("%s responder ird %u ord %u ep ird %u ord %u\n", + __func__, resp_ird, resp_ord, ep->ird, ep->ord); + + /* + * This is a double-check. Ideally, below checks are + * not required since ird/ord stuff has been taken + * care of in c4iw_accept_cr + */ + if (ep->ird < resp_ord) { + if (RELAXED_IRD_NEGOTIATION && resp_ord <= + ep->com.dev->rdev.lldi.max_ordird_qp) + ep->ird = resp_ord; + else + insuff_ird = 1; + } else if (ep->ird > resp_ord) { + ep->ird = resp_ord; + } + if (ep->ord > resp_ird) { + if (RELAXED_IRD_NEGOTIATION) + ep->ord = resp_ird; + else + insuff_ird = 1; + } + if (insuff_ird) { + err = -ENOMEM; + ep->ird = resp_ord; + ep->ord = resp_ird; + } + + if (ntohs(mpa_v2_params->ird) & + MPA_V2_PEER2PEER_MODEL) { + if (ntohs(mpa_v2_params->ord) & + MPA_V2_RDMA_WRITE_RTR) + ep->mpa_attr.p2p_type = + FW_RI_INIT_P2PTYPE_RDMA_WRITE; + else if (ntohs(mpa_v2_params->ord) & + MPA_V2_RDMA_READ_RTR) + ep->mpa_attr.p2p_type = + FW_RI_INIT_P2PTYPE_READ_REQ; + } + } + } else if (mpa->revision == 1) + if (peer2peer) + ep->mpa_attr.p2p_type = p2p_type; + + PDBG("%s - crc_enabled=%d, recv_marker_enabled=%d, " + "xmit_marker_enabled=%d, version=%d p2p_type=%d local-p2p_type = " + "%d\n", __func__, ep->mpa_attr.crc_enabled, + ep->mpa_attr.recv_marker_enabled, + ep->mpa_attr.xmit_marker_enabled, ep->mpa_attr.version, + ep->mpa_attr.p2p_type, p2p_type); + + /* + * If responder's RTR does not match with that of initiator, assign + * FW_RI_INIT_P2PTYPE_DISABLED in mpa attributes so that RTR is not + * generated when moving QP to RTS state. + * A TERM message will be sent after QP has moved to RTS state + */ + if ((ep->mpa_attr.version == 2) && peer2peer && + (ep->mpa_attr.p2p_type != p2p_type)) { + ep->mpa_attr.p2p_type = FW_RI_INIT_P2PTYPE_DISABLED; + rtr_mismatch = 1; + } + + attrs.mpa_attr = ep->mpa_attr; + attrs.max_ird = ep->ird; + attrs.max_ord = ep->ord; + attrs.llp_stream_handle = ep; + attrs.next_state = C4IW_QP_STATE_RTS; + + mask = C4IW_QP_ATTR_NEXT_STATE | + C4IW_QP_ATTR_LLP_STREAM_HANDLE | C4IW_QP_ATTR_MPA_ATTR | + C4IW_QP_ATTR_MAX_IRD | C4IW_QP_ATTR_MAX_ORD; + + /* bind QP and TID with INIT_WR */ + err = c4iw_modify_qp(ep->com.qp->rhp, + ep->com.qp, mask, &attrs, 1); + if (err) + goto err; + + /* + * If responder's RTR requirement did not match with what initiator + * supports, generate TERM message + */ + if (rtr_mismatch) { + printk(KERN_ERR "%s: RTR mismatch, sending TERM\n", __func__); + attrs.layer_etype = LAYER_MPA | DDP_LLP; + attrs.ecode = MPA_NOMATCH_RTR; + attrs.next_state = C4IW_QP_STATE_TERMINATE; + attrs.send_term = 1; + err = c4iw_modify_qp(ep->com.qp->rhp, ep->com.qp, + C4IW_QP_ATTR_NEXT_STATE, &attrs, 1); + err = -ENOMEM; + disconnect = 1; + goto out; + } + + /* + * Generate TERM if initiator IRD is not sufficient for responder + * provided ORD. Currently, we do the same behaviour even when + * responder provided IRD is also not sufficient as regards to + * initiator ORD. + */ + if (insuff_ird) { + printk(KERN_ERR "%s: Insufficient IRD, sending TERM\n", + __func__); + attrs.layer_etype = LAYER_MPA | DDP_LLP; + attrs.ecode = MPA_INSUFF_IRD; + attrs.next_state = C4IW_QP_STATE_TERMINATE; + attrs.send_term = 1; + err = c4iw_modify_qp(ep->com.qp->rhp, ep->com.qp, + C4IW_QP_ATTR_NEXT_STATE, &attrs, 1); + err = -ENOMEM; + disconnect = 1; + goto out; + } + goto out; +err: + __state_set(&ep->com, ABORTING); + send_abort(ep, skb, GFP_KERNEL); +out: + connect_reply_upcall(ep, err); + return disconnect; +} + +static void process_mpa_request(struct c4iw_ep *ep, struct sk_buff *skb) +{ + struct mpa_message *mpa; + struct mpa_v2_conn_params *mpa_v2_params; + u16 plen; + + PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid); + + /* + * If we get more than the supported amount of private data + * then we must fail this connection. + */ + if (ep->mpa_pkt_len + skb->len > sizeof(ep->mpa_pkt)) { + (void)stop_ep_timer(ep); + abort_connection(ep, skb, GFP_KERNEL); + return; + } + + PDBG("%s enter (%s line %u)\n", __func__, __FILE__, __LINE__); + + /* + * Copy the new data into our accumulation buffer. + */ + skb_copy_from_linear_data(skb, &(ep->mpa_pkt[ep->mpa_pkt_len]), + skb->len); + ep->mpa_pkt_len += skb->len; + + /* + * If we don't even have the mpa message, then bail. + * We'll continue process when more data arrives. + */ + if (ep->mpa_pkt_len < sizeof(*mpa)) + return; + + PDBG("%s enter (%s line %u)\n", __func__, __FILE__, __LINE__); + mpa = (struct mpa_message *) ep->mpa_pkt; + + /* + * Validate MPA Header. + */ + if (mpa->revision > mpa_rev) { + printk(KERN_ERR MOD "%s MPA version mismatch. Local = %d," + " Received = %d\n", __func__, mpa_rev, mpa->revision); + (void)stop_ep_timer(ep); + abort_connection(ep, skb, GFP_KERNEL); + return; + } + + if (memcmp(mpa->key, MPA_KEY_REQ, sizeof(mpa->key))) { + (void)stop_ep_timer(ep); + abort_connection(ep, skb, GFP_KERNEL); + return; + } + + plen = ntohs(mpa->private_data_size); + + /* + * Fail if there's too much private data. + */ + if (plen > MPA_MAX_PRIVATE_DATA) { + (void)stop_ep_timer(ep); + abort_connection(ep, skb, GFP_KERNEL); + return; + } + + /* + * If plen does not account for pkt size + */ + if (ep->mpa_pkt_len > (sizeof(*mpa) + plen)) { + (void)stop_ep_timer(ep); + abort_connection(ep, skb, GFP_KERNEL); + return; + } + ep->plen = (u8) plen; + + /* + * If we don't have all the pdata yet, then bail. + */ + if (ep->mpa_pkt_len < (sizeof(*mpa) + plen)) + return; + + /* + * If we get here we have accumulated the entire mpa + * start reply message including private data. + */ + ep->mpa_attr.initiator = 0; + ep->mpa_attr.crc_enabled = (mpa->flags & MPA_CRC) | crc_enabled ? 1 : 0; + ep->mpa_attr.recv_marker_enabled = markers_enabled; + ep->mpa_attr.xmit_marker_enabled = mpa->flags & MPA_MARKERS ? 1 : 0; + ep->mpa_attr.version = mpa->revision; + if (mpa->revision == 1) + ep->tried_with_mpa_v1 = 1; + ep->mpa_attr.p2p_type = FW_RI_INIT_P2PTYPE_DISABLED; + + if (mpa->revision == 2) { + ep->mpa_attr.enhanced_rdma_conn = + mpa->flags & MPA_ENHANCED_RDMA_CONN ? 1 : 0; + if (ep->mpa_attr.enhanced_rdma_conn) { + mpa_v2_params = (struct mpa_v2_conn_params *) + (ep->mpa_pkt + sizeof(*mpa)); + ep->ird = ntohs(mpa_v2_params->ird) & + MPA_V2_IRD_ORD_MASK; + ep->ord = ntohs(mpa_v2_params->ord) & + MPA_V2_IRD_ORD_MASK; + PDBG("%s initiator ird %u ord %u\n", __func__, ep->ird, + ep->ord); + if (ntohs(mpa_v2_params->ird) & MPA_V2_PEER2PEER_MODEL) + if (peer2peer) { + if (ntohs(mpa_v2_params->ord) & + MPA_V2_RDMA_WRITE_RTR) + ep->mpa_attr.p2p_type = + FW_RI_INIT_P2PTYPE_RDMA_WRITE; + else if (ntohs(mpa_v2_params->ord) & + MPA_V2_RDMA_READ_RTR) + ep->mpa_attr.p2p_type = + FW_RI_INIT_P2PTYPE_READ_REQ; + } + } + } else if (mpa->revision == 1) + if (peer2peer) + ep->mpa_attr.p2p_type = p2p_type; + + PDBG("%s - crc_enabled=%d, recv_marker_enabled=%d, " + "xmit_marker_enabled=%d, version=%d p2p_type=%d\n", __func__, + ep->mpa_attr.crc_enabled, ep->mpa_attr.recv_marker_enabled, + ep->mpa_attr.xmit_marker_enabled, ep->mpa_attr.version, + ep->mpa_attr.p2p_type); + + /* + * If the endpoint timer already expired, then we ignore + * the start request. process_timeout() will abort + * the connection. + */ + if (!stop_ep_timer(ep)) { + __state_set(&ep->com, MPA_REQ_RCVD); + + /* drive upcall */ + mutex_lock(&ep->parent_ep->com.mutex); + if (ep->parent_ep->com.state != DEAD) { + if (connect_request_upcall(ep)) + abort_connection(ep, skb, GFP_KERNEL); + } else { + abort_connection(ep, skb, GFP_KERNEL); + } + mutex_unlock(&ep->parent_ep->com.mutex); + } + return; +} + +static int rx_data(struct c4iw_dev *dev, struct sk_buff *skb) +{ + struct c4iw_ep *ep; + struct cpl_rx_data *hdr = cplhdr(skb); + unsigned int dlen = ntohs(hdr->len); + unsigned int tid = GET_TID(hdr); + struct tid_info *t = dev->rdev.lldi.tids; + __u8 status = hdr->status; + int disconnect = 0; + + ep = lookup_tid(t, tid); + if (!ep) + return 0; + PDBG("%s ep %p tid %u dlen %u\n", __func__, ep, ep->hwtid, dlen); + skb_pull(skb, sizeof(*hdr)); + skb_trim(skb, dlen); + mutex_lock(&ep->com.mutex); + + /* update RX credits */ + update_rx_credits(ep, dlen); + + switch (ep->com.state) { + case MPA_REQ_SENT: + ep->rcv_seq += dlen; + disconnect = process_mpa_reply(ep, skb); + break; + case MPA_REQ_WAIT: + ep->rcv_seq += dlen; + process_mpa_request(ep, skb); + break; + case FPDU_MODE: { + struct c4iw_qp_attributes attrs; + BUG_ON(!ep->com.qp); + if (status) + pr_err("%s Unexpected streaming data." \ + " qpid %u ep %p state %d tid %u status %d\n", + __func__, ep->com.qp->wq.sq.qid, ep, + ep->com.state, ep->hwtid, status); + attrs.next_state = C4IW_QP_STATE_TERMINATE; + c4iw_modify_qp(ep->com.qp->rhp, ep->com.qp, + C4IW_QP_ATTR_NEXT_STATE, &attrs, 1); + disconnect = 1; + break; + } + default: + break; + } + mutex_unlock(&ep->com.mutex); + if (disconnect) + c4iw_ep_disconnect(ep, 0, GFP_KERNEL); + return 0; +} + +static int abort_rpl(struct c4iw_dev *dev, struct sk_buff *skb) +{ + struct c4iw_ep *ep; + struct cpl_abort_rpl_rss *rpl = cplhdr(skb); + int release = 0; + unsigned int tid = GET_TID(rpl); + struct tid_info *t = dev->rdev.lldi.tids; + + ep = lookup_tid(t, tid); + if (!ep) { + printk(KERN_WARNING MOD "Abort rpl to freed endpoint\n"); + return 0; + } + PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid); + mutex_lock(&ep->com.mutex); + switch (ep->com.state) { + case ABORTING: + c4iw_wake_up(&ep->com.wr_wait, -ECONNRESET); + __state_set(&ep->com, DEAD); + release = 1; + break; + default: + printk(KERN_ERR "%s ep %p state %d\n", + __func__, ep, ep->com.state); + break; + } + mutex_unlock(&ep->com.mutex); + + if (release) + release_ep_resources(ep); + return 0; +} + +static void send_fw_act_open_req(struct c4iw_ep *ep, unsigned int atid) +{ + struct sk_buff *skb; + struct fw_ofld_connection_wr *req; + unsigned int mtu_idx; + int wscale; + struct sockaddr_in *sin; + int win; + + skb = get_skb(NULL, sizeof(*req), GFP_KERNEL); + req = (struct fw_ofld_connection_wr *)__skb_put(skb, sizeof(*req)); + memset(req, 0, sizeof(*req)); + req->op_compl = htonl(V_WR_OP(FW_OFLD_CONNECTION_WR)); + req->len16_pkd = htonl(FW_WR_LEN16(DIV_ROUND_UP(sizeof(*req), 16))); + req->le.filter = cpu_to_be32(cxgb4_select_ntuple( + ep->com.dev->rdev.lldi.ports[0], + ep->l2t)); + sin = (struct sockaddr_in *)&ep->com.mapped_local_addr; + req->le.lport = sin->sin_port; + req->le.u.ipv4.lip = sin->sin_addr.s_addr; + sin = (struct sockaddr_in *)&ep->com.mapped_remote_addr; + req->le.pport = sin->sin_port; + req->le.u.ipv4.pip = sin->sin_addr.s_addr; + req->tcb.t_state_to_astid = + htonl(V_FW_OFLD_CONNECTION_WR_T_STATE(TCP_SYN_SENT) | + V_FW_OFLD_CONNECTION_WR_ASTID(atid)); + req->tcb.cplrxdataack_cplpassacceptrpl = + htons(F_FW_OFLD_CONNECTION_WR_CPLRXDATAACK); + req->tcb.tx_max = (__force __be32) jiffies; + req->tcb.rcv_adv = htons(1); + best_mtu(ep->com.dev->rdev.lldi.mtus, ep->mtu, &mtu_idx, + enable_tcp_timestamps, + (AF_INET == ep->com.remote_addr.ss_family) ? 0 : 1); + wscale = compute_wscale(rcv_win); + + /* + * Specify the largest window that will fit in opt0. The + * remainder will be specified in the rx_data_ack. + */ + win = ep->rcv_win >> 10; + if (win > RCV_BUFSIZ_MASK) + win = RCV_BUFSIZ_MASK; + + req->tcb.opt0 = (__force __be64) (TCAM_BYPASS(1) | + (nocong ? NO_CONG(1) : 0) | + KEEP_ALIVE(1) | + DELACK(1) | + WND_SCALE(wscale) | + MSS_IDX(mtu_idx) | + L2T_IDX(ep->l2t->idx) | + TX_CHAN(ep->tx_chan) | + SMAC_SEL(ep->smac_idx) | + DSCP(ep->tos) | + ULP_MODE(ULP_MODE_TCPDDP) | + RCV_BUFSIZ(win)); + req->tcb.opt2 = (__force __be32) (PACE(1) | + TX_QUEUE(ep->com.dev->rdev.lldi.tx_modq[ep->tx_chan]) | + RX_CHANNEL(0) | + CCTRL_ECN(enable_ecn) | + RSS_QUEUE_VALID | RSS_QUEUE(ep->rss_qid)); + if (enable_tcp_timestamps) + req->tcb.opt2 |= (__force __be32) TSTAMPS_EN(1); + if (enable_tcp_sack) + req->tcb.opt2 |= (__force __be32) SACK_EN(1); + if (wscale && enable_tcp_window_scaling) + req->tcb.opt2 |= (__force __be32) WND_SCALE_EN(1); + req->tcb.opt0 = cpu_to_be64((__force u64) req->tcb.opt0); + req->tcb.opt2 = cpu_to_be32((__force u32) req->tcb.opt2); + set_wr_txq(skb, CPL_PRIORITY_CONTROL, ep->ctrlq_idx); + set_bit(ACT_OFLD_CONN, &ep->com.history); + c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t); +} + +/* + * Return whether a failed active open has allocated a TID + */ +static inline int act_open_has_tid(int status) +{ + return status != CPL_ERR_TCAM_FULL && status != CPL_ERR_CONN_EXIST && + status != CPL_ERR_ARP_MISS; +} + +/* Returns whether a CPL status conveys negative advice. + */ +static int is_neg_adv(unsigned int status) +{ + return status == CPL_ERR_RTX_NEG_ADVICE || + status == CPL_ERR_PERSIST_NEG_ADVICE || + status == CPL_ERR_KEEPALV_NEG_ADVICE; +} + +static char *neg_adv_str(unsigned int status) +{ + switch (status) { + case CPL_ERR_RTX_NEG_ADVICE: + return "Retransmit timeout"; + case CPL_ERR_PERSIST_NEG_ADVICE: + return "Persist timeout"; + case CPL_ERR_KEEPALV_NEG_ADVICE: + return "Keepalive timeout"; + default: + return "Unknown"; + } +} + +static void set_tcp_window(struct c4iw_ep *ep, struct port_info *pi) +{ + ep->snd_win = snd_win; + ep->rcv_win = rcv_win; + PDBG("%s snd_win %d rcv_win %d\n", __func__, ep->snd_win, ep->rcv_win); +} + +#define ACT_OPEN_RETRY_COUNT 2 + +static int import_ep(struct c4iw_ep *ep, int iptype, __u8 *peer_ip, + struct dst_entry *dst, struct c4iw_dev *cdev, + bool clear_mpa_v1) +{ + struct neighbour *n; + int err, step; + struct net_device *pdev; + + n = dst_neigh_lookup(dst, peer_ip); + if (!n) + return -ENODEV; + + rcu_read_lock(); + err = -ENOMEM; + if (n->dev->flags & IFF_LOOPBACK) { + if (iptype == 4) + pdev = ip_dev_find(&init_net, *(__be32 *)peer_ip); + else if (IS_ENABLED(CONFIG_IPV6)) + for_each_netdev(&init_net, pdev) { + if (ipv6_chk_addr(&init_net, + (struct in6_addr *)peer_ip, + pdev, 1)) + break; + } + else + pdev = NULL; + + if (!pdev) { + err = -ENODEV; + goto out; + } + ep->l2t = cxgb4_l2t_get(cdev->rdev.lldi.l2t, + n, pdev, 0); + if (!ep->l2t) + goto out; + ep->mtu = pdev->mtu; + ep->tx_chan = cxgb4_port_chan(pdev); + ep->smac_idx = (cxgb4_port_viid(pdev) & 0x7F) << 1; + step = cdev->rdev.lldi.ntxq / + cdev->rdev.lldi.nchan; + ep->txq_idx = cxgb4_port_idx(pdev) * step; + step = cdev->rdev.lldi.nrxq / + cdev->rdev.lldi.nchan; + ep->ctrlq_idx = cxgb4_port_idx(pdev); + ep->rss_qid = cdev->rdev.lldi.rxq_ids[ + cxgb4_port_idx(pdev) * step]; + set_tcp_window(ep, (struct port_info *)netdev_priv(pdev)); + dev_put(pdev); + } else { + pdev = get_real_dev(n->dev); + ep->l2t = cxgb4_l2t_get(cdev->rdev.lldi.l2t, + n, pdev, 0); + if (!ep->l2t) + goto out; + ep->mtu = dst_mtu(dst); + ep->tx_chan = cxgb4_port_chan(pdev); + ep->smac_idx = (cxgb4_port_viid(pdev) & 0x7F) << 1; + step = cdev->rdev.lldi.ntxq / + cdev->rdev.lldi.nchan; + ep->txq_idx = cxgb4_port_idx(pdev) * step; + ep->ctrlq_idx = cxgb4_port_idx(pdev); + step = cdev->rdev.lldi.nrxq / + cdev->rdev.lldi.nchan; + ep->rss_qid = cdev->rdev.lldi.rxq_ids[ + cxgb4_port_idx(pdev) * step]; + set_tcp_window(ep, (struct port_info *)netdev_priv(pdev)); + + if (clear_mpa_v1) { + ep->retry_with_mpa_v1 = 0; + ep->tried_with_mpa_v1 = 0; + } + } + err = 0; +out: + rcu_read_unlock(); + + neigh_release(n); + + return err; +} + +static int c4iw_reconnect(struct c4iw_ep *ep) +{ + int err = 0; + struct sockaddr_in *laddr = (struct sockaddr_in *) + &ep->com.cm_id->local_addr; + struct sockaddr_in *raddr = (struct sockaddr_in *) + &ep->com.cm_id->remote_addr; + struct sockaddr_in6 *laddr6 = (struct sockaddr_in6 *) + &ep->com.cm_id->local_addr; + struct sockaddr_in6 *raddr6 = (struct sockaddr_in6 *) + &ep->com.cm_id->remote_addr; + int iptype; + __u8 *ra; + + PDBG("%s qp %p cm_id %p\n", __func__, ep->com.qp, ep->com.cm_id); + init_timer(&ep->timer); + + /* + * Allocate an active TID to initiate a TCP connection. + */ + ep->atid = cxgb4_alloc_atid(ep->com.dev->rdev.lldi.tids, ep); + if (ep->atid == -1) { + pr_err("%s - cannot alloc atid.\n", __func__); + err = -ENOMEM; + goto fail2; + } + insert_handle(ep->com.dev, &ep->com.dev->atid_idr, ep, ep->atid); + + /* find a route */ + if (ep->com.cm_id->local_addr.ss_family == AF_INET) { + ep->dst = find_route(ep->com.dev, laddr->sin_addr.s_addr, + raddr->sin_addr.s_addr, laddr->sin_port, + raddr->sin_port, 0); + iptype = 4; + ra = (__u8 *)&raddr->sin_addr; + } else { + ep->dst = find_route6(ep->com.dev, laddr6->sin6_addr.s6_addr, + raddr6->sin6_addr.s6_addr, + laddr6->sin6_port, raddr6->sin6_port, 0, + raddr6->sin6_scope_id); + iptype = 6; + ra = (__u8 *)&raddr6->sin6_addr; + } + if (!ep->dst) { + pr_err("%s - cannot find route.\n", __func__); + err = -EHOSTUNREACH; + goto fail3; + } + err = import_ep(ep, iptype, ra, ep->dst, ep->com.dev, false); + if (err) { + pr_err("%s - cannot alloc l2e.\n", __func__); + goto fail4; + } + + PDBG("%s txq_idx %u tx_chan %u smac_idx %u rss_qid %u l2t_idx %u\n", + __func__, ep->txq_idx, ep->tx_chan, ep->smac_idx, ep->rss_qid, + ep->l2t->idx); + + state_set(&ep->com, CONNECTING); + ep->tos = 0; + + /* send connect request to rnic */ + err = send_connect(ep); + if (!err) + goto out; + + cxgb4_l2t_release(ep->l2t); +fail4: + dst_release(ep->dst); +fail3: + remove_handle(ep->com.dev, &ep->com.dev->atid_idr, ep->atid); + cxgb4_free_atid(ep->com.dev->rdev.lldi.tids, ep->atid); +fail2: + /* + * remember to send notification to upper layer. + * We are in here so the upper layer is not aware that this is + * re-connect attempt and so, upper layer is still waiting for + * response of 1st connect request. + */ + connect_reply_upcall(ep, -ECONNRESET); + c4iw_put_ep(&ep->com); +out: + return err; +} + +static int act_open_rpl(struct c4iw_dev *dev, struct sk_buff *skb) +{ + struct c4iw_ep *ep; + struct cpl_act_open_rpl *rpl = cplhdr(skb); + unsigned int atid = GET_TID_TID(GET_AOPEN_ATID( + ntohl(rpl->atid_status))); + struct tid_info *t = dev->rdev.lldi.tids; + int status = GET_AOPEN_STATUS(ntohl(rpl->atid_status)); + struct sockaddr_in *la; + struct sockaddr_in *ra; + struct sockaddr_in6 *la6; + struct sockaddr_in6 *ra6; + + ep = lookup_atid(t, atid); + la = (struct sockaddr_in *)&ep->com.mapped_local_addr; + ra = (struct sockaddr_in *)&ep->com.mapped_remote_addr; + la6 = (struct sockaddr_in6 *)&ep->com.mapped_local_addr; + ra6 = (struct sockaddr_in6 *)&ep->com.mapped_remote_addr; + + PDBG("%s ep %p atid %u status %u errno %d\n", __func__, ep, atid, + status, status2errno(status)); + + if (is_neg_adv(status)) { + dev_warn(&dev->rdev.lldi.pdev->dev, + "Connection problems for atid %u status %u (%s)\n", + atid, status, neg_adv_str(status)); + return 0; + } + + set_bit(ACT_OPEN_RPL, &ep->com.history); + + /* + * Log interesting failures. + */ + switch (status) { + case CPL_ERR_CONN_RESET: + case CPL_ERR_CONN_TIMEDOUT: + break; + case CPL_ERR_TCAM_FULL: + mutex_lock(&dev->rdev.stats.lock); + dev->rdev.stats.tcam_full++; + mutex_unlock(&dev->rdev.stats.lock); + if (ep->com.local_addr.ss_family == AF_INET && + dev->rdev.lldi.enable_fw_ofld_conn) { + send_fw_act_open_req(ep, + GET_TID_TID(GET_AOPEN_ATID( + ntohl(rpl->atid_status)))); + return 0; + } + break; + case CPL_ERR_CONN_EXIST: + if (ep->retry_count++ < ACT_OPEN_RETRY_COUNT) { + set_bit(ACT_RETRY_INUSE, &ep->com.history); + remove_handle(ep->com.dev, &ep->com.dev->atid_idr, + atid); + cxgb4_free_atid(t, atid); + dst_release(ep->dst); + cxgb4_l2t_release(ep->l2t); + c4iw_reconnect(ep); + return 0; + } + break; + default: + if (ep->com.local_addr.ss_family == AF_INET) { + pr_info("Active open failure - atid %u status %u errno %d %pI4:%u->%pI4:%u\n", + atid, status, status2errno(status), + &la->sin_addr.s_addr, ntohs(la->sin_port), + &ra->sin_addr.s_addr, ntohs(ra->sin_port)); + } else { + pr_info("Active open failure - atid %u status %u errno %d %pI6:%u->%pI6:%u\n", + atid, status, status2errno(status), + la6->sin6_addr.s6_addr, ntohs(la6->sin6_port), + ra6->sin6_addr.s6_addr, ntohs(ra6->sin6_port)); + } + break; + } + + connect_reply_upcall(ep, status2errno(status)); + state_set(&ep->com, DEAD); + + if (status && act_open_has_tid(status)) + cxgb4_remove_tid(ep->com.dev->rdev.lldi.tids, 0, GET_TID(rpl)); + + remove_handle(ep->com.dev, &ep->com.dev->atid_idr, atid); + cxgb4_free_atid(t, atid); + dst_release(ep->dst); + cxgb4_l2t_release(ep->l2t); + c4iw_put_ep(&ep->com); + + return 0; +} + +static int pass_open_rpl(struct c4iw_dev *dev, struct sk_buff *skb) +{ + struct cpl_pass_open_rpl *rpl = cplhdr(skb); + struct tid_info *t = dev->rdev.lldi.tids; + unsigned int stid = GET_TID(rpl); + struct c4iw_listen_ep *ep = lookup_stid(t, stid); + + if (!ep) { + PDBG("%s stid %d lookup failure!\n", __func__, stid); + goto out; + } + PDBG("%s ep %p status %d error %d\n", __func__, ep, + rpl->status, status2errno(rpl->status)); + c4iw_wake_up(&ep->com.wr_wait, status2errno(rpl->status)); + +out: + return 0; +} + +static int close_listsrv_rpl(struct c4iw_dev *dev, struct sk_buff *skb) +{ + struct cpl_close_listsvr_rpl *rpl = cplhdr(skb); + struct tid_info *t = dev->rdev.lldi.tids; + unsigned int stid = GET_TID(rpl); + struct c4iw_listen_ep *ep = lookup_stid(t, stid); + + PDBG("%s ep %p\n", __func__, ep); + c4iw_wake_up(&ep->com.wr_wait, status2errno(rpl->status)); + return 0; +} + +static void accept_cr(struct c4iw_ep *ep, struct sk_buff *skb, + struct cpl_pass_accept_req *req) +{ + struct cpl_pass_accept_rpl *rpl; + unsigned int mtu_idx; + u64 opt0; + u32 opt2; + int wscale; + struct cpl_t5_pass_accept_rpl *rpl5 = NULL; + int win; + + PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid); + BUG_ON(skb_cloned(skb)); + + skb_get(skb); + rpl = cplhdr(skb); + if (is_t5(ep->com.dev->rdev.lldi.adapter_type)) { + skb_trim(skb, roundup(sizeof(*rpl5), 16)); + rpl5 = (void *)rpl; + INIT_TP_WR(rpl5, ep->hwtid); + } else { + skb_trim(skb, sizeof(*rpl)); + INIT_TP_WR(rpl, ep->hwtid); + } + OPCODE_TID(rpl) = cpu_to_be32(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, + ep->hwtid)); + + best_mtu(ep->com.dev->rdev.lldi.mtus, ep->mtu, &mtu_idx, + enable_tcp_timestamps && req->tcpopt.tstamp, + (AF_INET == ep->com.remote_addr.ss_family) ? 0 : 1); + wscale = compute_wscale(rcv_win); + + /* + * Specify the largest window that will fit in opt0. The + * remainder will be specified in the rx_data_ack. + */ + win = ep->rcv_win >> 10; + if (win > RCV_BUFSIZ_MASK) + win = RCV_BUFSIZ_MASK; + opt0 = (nocong ? NO_CONG(1) : 0) | + KEEP_ALIVE(1) | + DELACK(1) | + WND_SCALE(wscale) | + MSS_IDX(mtu_idx) | + L2T_IDX(ep->l2t->idx) | + TX_CHAN(ep->tx_chan) | + SMAC_SEL(ep->smac_idx) | + DSCP(ep->tos >> 2) | + ULP_MODE(ULP_MODE_TCPDDP) | + RCV_BUFSIZ(win); + opt2 = RX_CHANNEL(0) | + RSS_QUEUE_VALID | RSS_QUEUE(ep->rss_qid); + + if (enable_tcp_timestamps && req->tcpopt.tstamp) + opt2 |= TSTAMPS_EN(1); + if (enable_tcp_sack && req->tcpopt.sack) + opt2 |= SACK_EN(1); + if (wscale && enable_tcp_window_scaling) + opt2 |= WND_SCALE_EN(1); + if (enable_ecn) { + const struct tcphdr *tcph; + u32 hlen = ntohl(req->hdr_len); + + tcph = (const void *)(req + 1) + G_ETH_HDR_LEN(hlen) + + G_IP_HDR_LEN(hlen); + if (tcph->ece && tcph->cwr) + opt2 |= CCTRL_ECN(1); + } + if (is_t5(ep->com.dev->rdev.lldi.adapter_type)) { + u32 isn = (prandom_u32() & ~7UL) - 1; + opt2 |= T5_OPT_2_VALID; + opt2 |= V_CONG_CNTRL(CONG_ALG_TAHOE); + opt2 |= CONG_CNTRL_VALID; /* OPT_2_ISS for T5 */ + rpl5 = (void *)rpl; + memset(&rpl5->iss, 0, roundup(sizeof(*rpl5)-sizeof(*rpl), 16)); + if (peer2peer) + isn += 4; + rpl5->iss = cpu_to_be32(isn); + PDBG("%s iss %u\n", __func__, be32_to_cpu(rpl5->iss)); + } + + rpl->opt0 = cpu_to_be64(opt0); + rpl->opt2 = cpu_to_be32(opt2); + set_wr_txq(skb, CPL_PRIORITY_SETUP, ep->ctrlq_idx); + t4_set_arp_err_handler(skb, NULL, arp_failure_discard); + c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t); + + return; +} + +static void reject_cr(struct c4iw_dev *dev, u32 hwtid, struct sk_buff *skb) +{ + PDBG("%s c4iw_dev %p tid %u\n", __func__, dev, hwtid); + BUG_ON(skb_cloned(skb)); + skb_trim(skb, sizeof(struct cpl_tid_release)); + release_tid(&dev->rdev, hwtid, skb); + return; +} + +static void get_4tuple(struct cpl_pass_accept_req *req, int *iptype, + __u8 *local_ip, __u8 *peer_ip, + __be16 *local_port, __be16 *peer_port) +{ + int eth_len = G_ETH_HDR_LEN(be32_to_cpu(req->hdr_len)); + int ip_len = G_IP_HDR_LEN(be32_to_cpu(req->hdr_len)); + struct iphdr *ip = (struct iphdr *)((u8 *)(req + 1) + eth_len); + struct ipv6hdr *ip6 = (struct ipv6hdr *)((u8 *)(req + 1) + eth_len); + struct tcphdr *tcp = (struct tcphdr *) + ((u8 *)(req + 1) + eth_len + ip_len); + + if (ip->version == 4) { + PDBG("%s saddr 0x%x daddr 0x%x sport %u dport %u\n", __func__, + ntohl(ip->saddr), ntohl(ip->daddr), ntohs(tcp->source), + ntohs(tcp->dest)); + *iptype = 4; + memcpy(peer_ip, &ip->saddr, 4); + memcpy(local_ip, &ip->daddr, 4); + } else { + PDBG("%s saddr %pI6 daddr %pI6 sport %u dport %u\n", __func__, + ip6->saddr.s6_addr, ip6->daddr.s6_addr, ntohs(tcp->source), + ntohs(tcp->dest)); + *iptype = 6; + memcpy(peer_ip, ip6->saddr.s6_addr, 16); + memcpy(local_ip, ip6->daddr.s6_addr, 16); + } + *peer_port = tcp->source; + *local_port = tcp->dest; + + return; +} + +static int pass_accept_req(struct c4iw_dev *dev, struct sk_buff *skb) +{ + struct c4iw_ep *child_ep = NULL, *parent_ep; + struct cpl_pass_accept_req *req = cplhdr(skb); + unsigned int stid = GET_POPEN_TID(ntohl(req->tos_stid)); + struct tid_info *t = dev->rdev.lldi.tids; + unsigned int hwtid = GET_TID(req); + struct dst_entry *dst; + __u8 local_ip[16], peer_ip[16]; + __be16 local_port, peer_port; + int err; + u16 peer_mss = ntohs(req->tcpopt.mss); + int iptype; + unsigned short hdrs; + + parent_ep = lookup_stid(t, stid); + if (!parent_ep) { + PDBG("%s connect request on invalid stid %d\n", __func__, stid); + goto reject; + } + + if (state_read(&parent_ep->com) != LISTEN) { + printk(KERN_ERR "%s - listening ep not in LISTEN\n", + __func__); + goto reject; + } + + get_4tuple(req, &iptype, local_ip, peer_ip, &local_port, &peer_port); + + /* Find output route */ + if (iptype == 4) { + PDBG("%s parent ep %p hwtid %u laddr %pI4 raddr %pI4 lport %d rport %d peer_mss %d\n" + , __func__, parent_ep, hwtid, + local_ip, peer_ip, ntohs(local_port), + ntohs(peer_port), peer_mss); + dst = find_route(dev, *(__be32 *)local_ip, *(__be32 *)peer_ip, + local_port, peer_port, + GET_POPEN_TOS(ntohl(req->tos_stid))); + } else { + PDBG("%s parent ep %p hwtid %u laddr %pI6 raddr %pI6 lport %d rport %d peer_mss %d\n" + , __func__, parent_ep, hwtid, + local_ip, peer_ip, ntohs(local_port), + ntohs(peer_port), peer_mss); + dst = find_route6(dev, local_ip, peer_ip, local_port, peer_port, + PASS_OPEN_TOS(ntohl(req->tos_stid)), + ((struct sockaddr_in6 *) + &parent_ep->com.local_addr)->sin6_scope_id); + } + if (!dst) { + printk(KERN_ERR MOD "%s - failed to find dst entry!\n", + __func__); + goto reject; + } + + child_ep = alloc_ep(sizeof(*child_ep), GFP_KERNEL); + if (!child_ep) { + printk(KERN_ERR MOD "%s - failed to allocate ep entry!\n", + __func__); + dst_release(dst); + goto reject; + } + + err = import_ep(child_ep, iptype, peer_ip, dst, dev, false); + if (err) { + printk(KERN_ERR MOD "%s - failed to allocate l2t entry!\n", + __func__); + dst_release(dst); + kfree(child_ep); + goto reject; + } + + hdrs = sizeof(struct iphdr) + sizeof(struct tcphdr) + + ((enable_tcp_timestamps && req->tcpopt.tstamp) ? 12 : 0); + if (peer_mss && child_ep->mtu > (peer_mss + hdrs)) + child_ep->mtu = peer_mss + hdrs; + + state_set(&child_ep->com, CONNECTING); + child_ep->com.dev = dev; + child_ep->com.cm_id = NULL; + if (iptype == 4) { + struct sockaddr_in *sin = (struct sockaddr_in *) + &child_ep->com.local_addr; + sin->sin_family = PF_INET; + sin->sin_port = local_port; + sin->sin_addr.s_addr = *(__be32 *)local_ip; + sin = (struct sockaddr_in *)&child_ep->com.remote_addr; + sin->sin_family = PF_INET; + sin->sin_port = peer_port; + sin->sin_addr.s_addr = *(__be32 *)peer_ip; + } else { + struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) + &child_ep->com.local_addr; + sin6->sin6_family = PF_INET6; + sin6->sin6_port = local_port; + memcpy(sin6->sin6_addr.s6_addr, local_ip, 16); + sin6 = (struct sockaddr_in6 *)&child_ep->com.remote_addr; + sin6->sin6_family = PF_INET6; + sin6->sin6_port = peer_port; + memcpy(sin6->sin6_addr.s6_addr, peer_ip, 16); + } + c4iw_get_ep(&parent_ep->com); + child_ep->parent_ep = parent_ep; + child_ep->tos = GET_POPEN_TOS(ntohl(req->tos_stid)); + child_ep->dst = dst; + child_ep->hwtid = hwtid; + + PDBG("%s tx_chan %u smac_idx %u rss_qid %u\n", __func__, + child_ep->tx_chan, child_ep->smac_idx, child_ep->rss_qid); + + init_timer(&child_ep->timer); + cxgb4_insert_tid(t, child_ep, hwtid); + insert_handle(dev, &dev->hwtid_idr, child_ep, child_ep->hwtid); + accept_cr(child_ep, skb, req); + set_bit(PASS_ACCEPT_REQ, &child_ep->com.history); + goto out; +reject: + reject_cr(dev, hwtid, skb); +out: + return 0; +} + +static int pass_establish(struct c4iw_dev *dev, struct sk_buff *skb) +{ + struct c4iw_ep *ep; + struct cpl_pass_establish *req = cplhdr(skb); + struct tid_info *t = dev->rdev.lldi.tids; + unsigned int tid = GET_TID(req); + + ep = lookup_tid(t, tid); + PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid); + ep->snd_seq = be32_to_cpu(req->snd_isn); + ep->rcv_seq = be32_to_cpu(req->rcv_isn); + + PDBG("%s ep %p hwtid %u tcp_opt 0x%02x\n", __func__, ep, tid, + ntohs(req->tcp_opt)); + + set_emss(ep, ntohs(req->tcp_opt)); + + dst_confirm(ep->dst); + state_set(&ep->com, MPA_REQ_WAIT); + start_ep_timer(ep); + send_flowc(ep, skb); + set_bit(PASS_ESTAB, &ep->com.history); + + return 0; +} + +static int peer_close(struct c4iw_dev *dev, struct sk_buff *skb) +{ + struct cpl_peer_close *hdr = cplhdr(skb); + struct c4iw_ep *ep; + struct c4iw_qp_attributes attrs; + int disconnect = 1; + int release = 0; + struct tid_info *t = dev->rdev.lldi.tids; + unsigned int tid = GET_TID(hdr); + int ret; + + ep = lookup_tid(t, tid); + PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid); + dst_confirm(ep->dst); + + set_bit(PEER_CLOSE, &ep->com.history); + mutex_lock(&ep->com.mutex); + switch (ep->com.state) { + case MPA_REQ_WAIT: + __state_set(&ep->com, CLOSING); + break; + case MPA_REQ_SENT: + __state_set(&ep->com, CLOSING); + connect_reply_upcall(ep, -ECONNRESET); + break; + case MPA_REQ_RCVD: + + /* + * We're gonna mark this puppy DEAD, but keep + * the reference on it until the ULP accepts or + * rejects the CR. Also wake up anyone waiting + * in rdma connection migration (see c4iw_accept_cr()). + */ + __state_set(&ep->com, CLOSING); + PDBG("waking up ep %p tid %u\n", ep, ep->hwtid); + c4iw_wake_up(&ep->com.wr_wait, -ECONNRESET); + break; + case MPA_REP_SENT: + __state_set(&ep->com, CLOSING); + PDBG("waking up ep %p tid %u\n", ep, ep->hwtid); + c4iw_wake_up(&ep->com.wr_wait, -ECONNRESET); + break; + case FPDU_MODE: + start_ep_timer(ep); + __state_set(&ep->com, CLOSING); + attrs.next_state = C4IW_QP_STATE_CLOSING; + ret = c4iw_modify_qp(ep->com.qp->rhp, ep->com.qp, + C4IW_QP_ATTR_NEXT_STATE, &attrs, 1); + if (ret != -ECONNRESET) { + peer_close_upcall(ep); + disconnect = 1; + } + break; + case ABORTING: + disconnect = 0; + break; + case CLOSING: + __state_set(&ep->com, MORIBUND); + disconnect = 0; + break; + case MORIBUND: + (void)stop_ep_timer(ep); + if (ep->com.cm_id && ep->com.qp) { + attrs.next_state = C4IW_QP_STATE_IDLE; + c4iw_modify_qp(ep->com.qp->rhp, ep->com.qp, + C4IW_QP_ATTR_NEXT_STATE, &attrs, 1); + } + close_complete_upcall(ep, 0); + __state_set(&ep->com, DEAD); + release = 1; + disconnect = 0; + break; + case DEAD: + disconnect = 0; + break; + default: + BUG_ON(1); + } + mutex_unlock(&ep->com.mutex); + if (disconnect) + c4iw_ep_disconnect(ep, 0, GFP_KERNEL); + if (release) + release_ep_resources(ep); + return 0; +} + +static int peer_abort(struct c4iw_dev *dev, struct sk_buff *skb) +{ + struct cpl_abort_req_rss *req = cplhdr(skb); + struct c4iw_ep *ep; + struct cpl_abort_rpl *rpl; + struct sk_buff *rpl_skb; + struct c4iw_qp_attributes attrs; + int ret; + int release = 0; + struct tid_info *t = dev->rdev.lldi.tids; + unsigned int tid = GET_TID(req); + + ep = lookup_tid(t, tid); + if (is_neg_adv(req->status)) { + dev_warn(&dev->rdev.lldi.pdev->dev, + "Negative advice on abort - tid %u status %d (%s)\n", + ep->hwtid, req->status, neg_adv_str(req->status)); + return 0; + } + PDBG("%s ep %p tid %u state %u\n", __func__, ep, ep->hwtid, + ep->com.state); + set_bit(PEER_ABORT, &ep->com.history); + + /* + * Wake up any threads in rdma_init() or rdma_fini(). + * However, this is not needed if com state is just + * MPA_REQ_SENT + */ + if (ep->com.state != MPA_REQ_SENT) + c4iw_wake_up(&ep->com.wr_wait, -ECONNRESET); + + mutex_lock(&ep->com.mutex); + switch (ep->com.state) { + case CONNECTING: + break; + case MPA_REQ_WAIT: + (void)stop_ep_timer(ep); + break; + case MPA_REQ_SENT: + (void)stop_ep_timer(ep); + if (mpa_rev == 1 || (mpa_rev == 2 && ep->tried_with_mpa_v1)) + connect_reply_upcall(ep, -ECONNRESET); + else { + /* + * we just don't send notification upwards because we + * want to retry with mpa_v1 without upper layers even + * knowing it. + * + * do some housekeeping so as to re-initiate the + * connection + */ + PDBG("%s: mpa_rev=%d. Retrying with mpav1\n", __func__, + mpa_rev); + ep->retry_with_mpa_v1 = 1; + } + break; + case MPA_REP_SENT: + break; + case MPA_REQ_RCVD: + break; + case MORIBUND: + case CLOSING: + stop_ep_timer(ep); + /*FALLTHROUGH*/ + case FPDU_MODE: + if (ep->com.cm_id && ep->com.qp) { + attrs.next_state = C4IW_QP_STATE_ERROR; + ret = c4iw_modify_qp(ep->com.qp->rhp, + ep->com.qp, C4IW_QP_ATTR_NEXT_STATE, + &attrs, 1); + if (ret) + printk(KERN_ERR MOD + "%s - qp <- error failed!\n", + __func__); + } + peer_abort_upcall(ep); + break; + case ABORTING: + break; + case DEAD: + PDBG("%s PEER_ABORT IN DEAD STATE!!!!\n", __func__); + mutex_unlock(&ep->com.mutex); + return 0; + default: + BUG_ON(1); + break; + } + dst_confirm(ep->dst); + if (ep->com.state != ABORTING) { + __state_set(&ep->com, DEAD); + /* we don't release if we want to retry with mpa_v1 */ + if (!ep->retry_with_mpa_v1) + release = 1; + } + mutex_unlock(&ep->com.mutex); + + rpl_skb = get_skb(skb, sizeof(*rpl), GFP_KERNEL); + if (!rpl_skb) { + printk(KERN_ERR MOD "%s - cannot allocate skb!\n", + __func__); + release = 1; + goto out; + } + set_wr_txq(skb, CPL_PRIORITY_DATA, ep->txq_idx); + rpl = (struct cpl_abort_rpl *) skb_put(rpl_skb, sizeof(*rpl)); + INIT_TP_WR(rpl, ep->hwtid); + OPCODE_TID(rpl) = cpu_to_be32(MK_OPCODE_TID(CPL_ABORT_RPL, ep->hwtid)); + rpl->cmd = CPL_ABORT_NO_RST; + c4iw_ofld_send(&ep->com.dev->rdev, rpl_skb); +out: + if (release) + release_ep_resources(ep); + else if (ep->retry_with_mpa_v1) { + remove_handle(ep->com.dev, &ep->com.dev->hwtid_idr, ep->hwtid); + cxgb4_remove_tid(ep->com.dev->rdev.lldi.tids, 0, ep->hwtid); + dst_release(ep->dst); + cxgb4_l2t_release(ep->l2t); + c4iw_reconnect(ep); + } + + return 0; +} + +static int close_con_rpl(struct c4iw_dev *dev, struct sk_buff *skb) +{ + struct c4iw_ep *ep; + struct c4iw_qp_attributes attrs; + struct cpl_close_con_rpl *rpl = cplhdr(skb); + int release = 0; + struct tid_info *t = dev->rdev.lldi.tids; + unsigned int tid = GET_TID(rpl); + + ep = lookup_tid(t, tid); + + PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid); + BUG_ON(!ep); + + /* The cm_id may be null if we failed to connect */ + mutex_lock(&ep->com.mutex); + switch (ep->com.state) { + case CLOSING: + __state_set(&ep->com, MORIBUND); + break; + case MORIBUND: + (void)stop_ep_timer(ep); + if ((ep->com.cm_id) && (ep->com.qp)) { + attrs.next_state = C4IW_QP_STATE_IDLE; + c4iw_modify_qp(ep->com.qp->rhp, + ep->com.qp, + C4IW_QP_ATTR_NEXT_STATE, + &attrs, 1); + } + close_complete_upcall(ep, 0); + __state_set(&ep->com, DEAD); + release = 1; + break; + case ABORTING: + case DEAD: + break; + default: + BUG_ON(1); + break; + } + mutex_unlock(&ep->com.mutex); + if (release) + release_ep_resources(ep); + return 0; +} + +static int terminate(struct c4iw_dev *dev, struct sk_buff *skb) +{ + struct cpl_rdma_terminate *rpl = cplhdr(skb); + struct tid_info *t = dev->rdev.lldi.tids; + unsigned int tid = GET_TID(rpl); + struct c4iw_ep *ep; + struct c4iw_qp_attributes attrs; + + ep = lookup_tid(t, tid); + BUG_ON(!ep); + + if (ep && ep->com.qp) { + printk(KERN_WARNING MOD "TERM received tid %u qpid %u\n", tid, + ep->com.qp->wq.sq.qid); + attrs.next_state = C4IW_QP_STATE_TERMINATE; + c4iw_modify_qp(ep->com.qp->rhp, ep->com.qp, + C4IW_QP_ATTR_NEXT_STATE, &attrs, 1); + } else + printk(KERN_WARNING MOD "TERM received tid %u no ep/qp\n", tid); + + return 0; +} + +/* + * Upcall from the adapter indicating data has been transmitted. + * For us its just the single MPA request or reply. We can now free + * the skb holding the mpa message. + */ +static int fw4_ack(struct c4iw_dev *dev, struct sk_buff *skb) +{ + struct c4iw_ep *ep; + struct cpl_fw4_ack *hdr = cplhdr(skb); + u8 credits = hdr->credits; + unsigned int tid = GET_TID(hdr); + struct tid_info *t = dev->rdev.lldi.tids; + + + ep = lookup_tid(t, tid); + PDBG("%s ep %p tid %u credits %u\n", __func__, ep, ep->hwtid, credits); + if (credits == 0) { + PDBG("%s 0 credit ack ep %p tid %u state %u\n", + __func__, ep, ep->hwtid, state_read(&ep->com)); + return 0; + } + + dst_confirm(ep->dst); + if (ep->mpa_skb) { + PDBG("%s last streaming msg ack ep %p tid %u state %u " + "initiator %u freeing skb\n", __func__, ep, ep->hwtid, + state_read(&ep->com), ep->mpa_attr.initiator ? 1 : 0); + kfree_skb(ep->mpa_skb); + ep->mpa_skb = NULL; + } + return 0; +} + +int c4iw_reject_cr(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len) +{ + int err = 0; + int disconnect = 0; + struct c4iw_ep *ep = to_ep(cm_id); + PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid); + + mutex_lock(&ep->com.mutex); + if (ep->com.state == DEAD) { + mutex_unlock(&ep->com.mutex); + c4iw_put_ep(&ep->com); + return -ECONNRESET; + } + set_bit(ULP_REJECT, &ep->com.history); + BUG_ON(ep->com.state != MPA_REQ_RCVD); + if (mpa_rev == 0) + abort_connection(ep, NULL, GFP_KERNEL); + else { + err = send_mpa_reject(ep, pdata, pdata_len); + disconnect = 1; + } + mutex_unlock(&ep->com.mutex); + if (disconnect) + err = c4iw_ep_disconnect(ep, 0, GFP_KERNEL); + c4iw_put_ep(&ep->com); + return 0; +} + +int c4iw_accept_cr(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) +{ + int err; + struct c4iw_qp_attributes attrs; + enum c4iw_qp_attr_mask mask; + struct c4iw_ep *ep = to_ep(cm_id); + struct c4iw_dev *h = to_c4iw_dev(cm_id->device); + struct c4iw_qp *qp = get_qhp(h, conn_param->qpn); + + PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid); + + mutex_lock(&ep->com.mutex); + if (ep->com.state == DEAD) { + err = -ECONNRESET; + goto err; + } + + BUG_ON(ep->com.state != MPA_REQ_RCVD); + BUG_ON(!qp); + + set_bit(ULP_ACCEPT, &ep->com.history); + if ((conn_param->ord > cur_max_read_depth(ep->com.dev)) || + (conn_param->ird > cur_max_read_depth(ep->com.dev))) { + abort_connection(ep, NULL, GFP_KERNEL); + err = -EINVAL; + goto err; + } + + if (ep->mpa_attr.version == 2 && ep->mpa_attr.enhanced_rdma_conn) { + if (conn_param->ord > ep->ird) { + if (RELAXED_IRD_NEGOTIATION) { + ep->ord = ep->ird; + } else { + ep->ird = conn_param->ird; + ep->ord = conn_param->ord; + send_mpa_reject(ep, conn_param->private_data, + conn_param->private_data_len); + abort_connection(ep, NULL, GFP_KERNEL); + err = -ENOMEM; + goto err; + } + } + if (conn_param->ird < ep->ord) { + if (RELAXED_IRD_NEGOTIATION && + ep->ord <= h->rdev.lldi.max_ordird_qp) { + conn_param->ird = ep->ord; + } else { + abort_connection(ep, NULL, GFP_KERNEL); + err = -ENOMEM; + goto err; + } + } + } + ep->ird = conn_param->ird; + ep->ord = conn_param->ord; + + if (ep->mpa_attr.version == 1) { + if (peer2peer && ep->ird == 0) + ep->ird = 1; + } else { + if (peer2peer && + (ep->mpa_attr.p2p_type != FW_RI_INIT_P2PTYPE_DISABLED) && + (p2p_type == FW_RI_INIT_P2PTYPE_READ_REQ) && ep->ord == 0) + ep->ird = 1; + } + + PDBG("%s %d ird %d ord %d\n", __func__, __LINE__, ep->ird, ep->ord); + + cm_id->add_ref(cm_id); + ep->com.cm_id = cm_id; + ep->com.qp = qp; + ref_qp(ep); + + /* bind QP to EP and move to RTS */ + attrs.mpa_attr = ep->mpa_attr; + attrs.max_ird = ep->ird; + attrs.max_ord = ep->ord; + attrs.llp_stream_handle = ep; + attrs.next_state = C4IW_QP_STATE_RTS; + + /* bind QP and TID with INIT_WR */ + mask = C4IW_QP_ATTR_NEXT_STATE | + C4IW_QP_ATTR_LLP_STREAM_HANDLE | + C4IW_QP_ATTR_MPA_ATTR | + C4IW_QP_ATTR_MAX_IRD | + C4IW_QP_ATTR_MAX_ORD; + + err = c4iw_modify_qp(ep->com.qp->rhp, + ep->com.qp, mask, &attrs, 1); + if (err) + goto err1; + err = send_mpa_reply(ep, conn_param->private_data, + conn_param->private_data_len); + if (err) + goto err1; + + __state_set(&ep->com, FPDU_MODE); + established_upcall(ep); + mutex_unlock(&ep->com.mutex); + c4iw_put_ep(&ep->com); + return 0; +err1: + ep->com.cm_id = NULL; + abort_connection(ep, NULL, GFP_KERNEL); + cm_id->rem_ref(cm_id); +err: + mutex_unlock(&ep->com.mutex); + c4iw_put_ep(&ep->com); + return err; +} + +static int pick_local_ipaddrs(struct c4iw_dev *dev, struct iw_cm_id *cm_id) +{ + struct in_device *ind; + int found = 0; + struct sockaddr_in *laddr = (struct sockaddr_in *)&cm_id->local_addr; + struct sockaddr_in *raddr = (struct sockaddr_in *)&cm_id->remote_addr; + + ind = in_dev_get(dev->rdev.lldi.ports[0]); + if (!ind) + return -EADDRNOTAVAIL; + for_primary_ifa(ind) { + laddr->sin_addr.s_addr = ifa->ifa_address; + raddr->sin_addr.s_addr = ifa->ifa_address; + found = 1; + break; + } + endfor_ifa(ind); + in_dev_put(ind); + return found ? 0 : -EADDRNOTAVAIL; +} + +static int get_lladdr(struct net_device *dev, struct in6_addr *addr, + unsigned char banned_flags) +{ + struct inet6_dev *idev; + int err = -EADDRNOTAVAIL; + + rcu_read_lock(); + idev = __in6_dev_get(dev); + if (idev != NULL) { + struct inet6_ifaddr *ifp; + + read_lock_bh(&idev->lock); + list_for_each_entry(ifp, &idev->addr_list, if_list) { + if (ifp->scope == IFA_LINK && + !(ifp->flags & banned_flags)) { + memcpy(addr, &ifp->addr, 16); + err = 0; + break; + } + } + read_unlock_bh(&idev->lock); + } + rcu_read_unlock(); + return err; +} + +static int pick_local_ip6addrs(struct c4iw_dev *dev, struct iw_cm_id *cm_id) +{ + struct in6_addr uninitialized_var(addr); + struct sockaddr_in6 *la6 = (struct sockaddr_in6 *)&cm_id->local_addr; + struct sockaddr_in6 *ra6 = (struct sockaddr_in6 *)&cm_id->remote_addr; + + if (get_lladdr(dev->rdev.lldi.ports[0], &addr, IFA_F_TENTATIVE)) { + memcpy(la6->sin6_addr.s6_addr, &addr, 16); + memcpy(ra6->sin6_addr.s6_addr, &addr, 16); + return 0; + } + return -EADDRNOTAVAIL; +} + +int c4iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) +{ + struct c4iw_dev *dev = to_c4iw_dev(cm_id->device); + struct c4iw_ep *ep; + int err = 0; + struct sockaddr_in *laddr; + struct sockaddr_in *raddr; + struct sockaddr_in6 *laddr6; + struct sockaddr_in6 *raddr6; + struct iwpm_dev_data pm_reg_msg; + struct iwpm_sa_data pm_msg; + __u8 *ra; + int iptype; + int iwpm_err = 0; + + if ((conn_param->ord > cur_max_read_depth(dev)) || + (conn_param->ird > cur_max_read_depth(dev))) { + err = -EINVAL; + goto out; + } + ep = alloc_ep(sizeof(*ep), GFP_KERNEL); + if (!ep) { + printk(KERN_ERR MOD "%s - cannot alloc ep.\n", __func__); + err = -ENOMEM; + goto out; + } + init_timer(&ep->timer); + ep->plen = conn_param->private_data_len; + if (ep->plen) + memcpy(ep->mpa_pkt + sizeof(struct mpa_message), + conn_param->private_data, ep->plen); + ep->ird = conn_param->ird; + ep->ord = conn_param->ord; + + if (peer2peer && ep->ord == 0) + ep->ord = 1; + + cm_id->add_ref(cm_id); + ep->com.dev = dev; + ep->com.cm_id = cm_id; + ep->com.qp = get_qhp(dev, conn_param->qpn); + if (!ep->com.qp) { + PDBG("%s qpn 0x%x not found!\n", __func__, conn_param->qpn); + err = -EINVAL; + goto fail1; + } + ref_qp(ep); + PDBG("%s qpn 0x%x qp %p cm_id %p\n", __func__, conn_param->qpn, + ep->com.qp, cm_id); + + /* + * Allocate an active TID to initiate a TCP connection. + */ + ep->atid = cxgb4_alloc_atid(dev->rdev.lldi.tids, ep); + if (ep->atid == -1) { + printk(KERN_ERR MOD "%s - cannot alloc atid.\n", __func__); + err = -ENOMEM; + goto fail1; + } + insert_handle(dev, &dev->atid_idr, ep, ep->atid); + + memcpy(&ep->com.local_addr, &cm_id->local_addr, + sizeof(ep->com.local_addr)); + memcpy(&ep->com.remote_addr, &cm_id->remote_addr, + sizeof(ep->com.remote_addr)); + + /* No port mapper available, go with the specified peer information */ + memcpy(&ep->com.mapped_local_addr, &cm_id->local_addr, + sizeof(ep->com.mapped_local_addr)); + memcpy(&ep->com.mapped_remote_addr, &cm_id->remote_addr, + sizeof(ep->com.mapped_remote_addr)); + + c4iw_form_reg_msg(dev, &pm_reg_msg); + iwpm_err = iwpm_register_pid(&pm_reg_msg, RDMA_NL_C4IW); + if (iwpm_err) { + PDBG("%s: Port Mapper reg pid fail (err = %d).\n", + __func__, iwpm_err); + } + if (iwpm_valid_pid() && !iwpm_err) { + c4iw_form_pm_msg(ep, &pm_msg); + iwpm_err = iwpm_add_and_query_mapping(&pm_msg, RDMA_NL_C4IW); + if (iwpm_err) + PDBG("%s: Port Mapper query fail (err = %d).\n", + __func__, iwpm_err); + else + c4iw_record_pm_msg(ep, &pm_msg); + } + if (iwpm_create_mapinfo(&ep->com.local_addr, + &ep->com.mapped_local_addr, RDMA_NL_C4IW)) { + iwpm_remove_mapping(&ep->com.local_addr, RDMA_NL_C4IW); + err = -ENOMEM; + goto fail1; + } + print_addr(&ep->com, __func__, "add_query/create_mapinfo"); + set_bit(RELEASE_MAPINFO, &ep->com.flags); + + laddr = (struct sockaddr_in *)&ep->com.mapped_local_addr; + raddr = (struct sockaddr_in *)&ep->com.mapped_remote_addr; + laddr6 = (struct sockaddr_in6 *)&ep->com.mapped_local_addr; + raddr6 = (struct sockaddr_in6 *) &ep->com.mapped_remote_addr; + + if (cm_id->remote_addr.ss_family == AF_INET) { + iptype = 4; + ra = (__u8 *)&raddr->sin_addr; + + /* + * Handle loopback requests to INADDR_ANY. + */ + if ((__force int)raddr->sin_addr.s_addr == INADDR_ANY) { + err = pick_local_ipaddrs(dev, cm_id); + if (err) + goto fail1; + } + + /* find a route */ + PDBG("%s saddr %pI4 sport 0x%x raddr %pI4 rport 0x%x\n", + __func__, &laddr->sin_addr, ntohs(laddr->sin_port), + ra, ntohs(raddr->sin_port)); + ep->dst = find_route(dev, laddr->sin_addr.s_addr, + raddr->sin_addr.s_addr, laddr->sin_port, + raddr->sin_port, 0); + } else { + iptype = 6; + ra = (__u8 *)&raddr6->sin6_addr; + + /* + * Handle loopback requests to INADDR_ANY. + */ + if (ipv6_addr_type(&raddr6->sin6_addr) == IPV6_ADDR_ANY) { + err = pick_local_ip6addrs(dev, cm_id); + if (err) + goto fail1; + } + + /* find a route */ + PDBG("%s saddr %pI6 sport 0x%x raddr %pI6 rport 0x%x\n", + __func__, laddr6->sin6_addr.s6_addr, + ntohs(laddr6->sin6_port), + raddr6->sin6_addr.s6_addr, ntohs(raddr6->sin6_port)); + ep->dst = find_route6(dev, laddr6->sin6_addr.s6_addr, + raddr6->sin6_addr.s6_addr, + laddr6->sin6_port, raddr6->sin6_port, 0, + raddr6->sin6_scope_id); + } + if (!ep->dst) { + printk(KERN_ERR MOD "%s - cannot find route.\n", __func__); + err = -EHOSTUNREACH; + goto fail2; + } + + err = import_ep(ep, iptype, ra, ep->dst, ep->com.dev, true); + if (err) { + printk(KERN_ERR MOD "%s - cannot alloc l2e.\n", __func__); + goto fail3; + } + + PDBG("%s txq_idx %u tx_chan %u smac_idx %u rss_qid %u l2t_idx %u\n", + __func__, ep->txq_idx, ep->tx_chan, ep->smac_idx, ep->rss_qid, + ep->l2t->idx); + + state_set(&ep->com, CONNECTING); + ep->tos = 0; + + /* send connect request to rnic */ + err = send_connect(ep); + if (!err) + goto out; + + cxgb4_l2t_release(ep->l2t); +fail3: + dst_release(ep->dst); +fail2: + remove_handle(ep->com.dev, &ep->com.dev->atid_idr, ep->atid); + cxgb4_free_atid(ep->com.dev->rdev.lldi.tids, ep->atid); +fail1: + cm_id->rem_ref(cm_id); + c4iw_put_ep(&ep->com); +out: + return err; +} + +static int create_server6(struct c4iw_dev *dev, struct c4iw_listen_ep *ep) +{ + int err; + struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) + &ep->com.mapped_local_addr; + + c4iw_init_wr_wait(&ep->com.wr_wait); + err = cxgb4_create_server6(ep->com.dev->rdev.lldi.ports[0], + ep->stid, &sin6->sin6_addr, + sin6->sin6_port, + ep->com.dev->rdev.lldi.rxq_ids[0]); + if (!err) + err = c4iw_wait_for_reply(&ep->com.dev->rdev, + &ep->com.wr_wait, + 0, 0, __func__); + if (err) + pr_err("cxgb4_create_server6/filter failed err %d stid %d laddr %pI6 lport %d\n", + err, ep->stid, + sin6->sin6_addr.s6_addr, ntohs(sin6->sin6_port)); + return err; +} + +static int create_server4(struct c4iw_dev *dev, struct c4iw_listen_ep *ep) +{ + int err; + struct sockaddr_in *sin = (struct sockaddr_in *) + &ep->com.mapped_local_addr; + + if (dev->rdev.lldi.enable_fw_ofld_conn) { + do { + err = cxgb4_create_server_filter( + ep->com.dev->rdev.lldi.ports[0], ep->stid, + sin->sin_addr.s_addr, sin->sin_port, 0, + ep->com.dev->rdev.lldi.rxq_ids[0], 0, 0); + if (err == -EBUSY) { + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(usecs_to_jiffies(100)); + } + } while (err == -EBUSY); + } else { + c4iw_init_wr_wait(&ep->com.wr_wait); + err = cxgb4_create_server(ep->com.dev->rdev.lldi.ports[0], + ep->stid, sin->sin_addr.s_addr, sin->sin_port, + 0, ep->com.dev->rdev.lldi.rxq_ids[0]); + if (!err) + err = c4iw_wait_for_reply(&ep->com.dev->rdev, + &ep->com.wr_wait, + 0, 0, __func__); + } + if (err) + pr_err("cxgb4_create_server/filter failed err %d stid %d laddr %pI4 lport %d\n" + , err, ep->stid, + &sin->sin_addr, ntohs(sin->sin_port)); + return err; +} + +int c4iw_create_listen(struct iw_cm_id *cm_id, int backlog) +{ + int err = 0; + struct c4iw_dev *dev = to_c4iw_dev(cm_id->device); + struct c4iw_listen_ep *ep; + struct iwpm_dev_data pm_reg_msg; + struct iwpm_sa_data pm_msg; + int iwpm_err = 0; + + might_sleep(); + + ep = alloc_ep(sizeof(*ep), GFP_KERNEL); + if (!ep) { + printk(KERN_ERR MOD "%s - cannot alloc ep.\n", __func__); + err = -ENOMEM; + goto fail1; + } + PDBG("%s ep %p\n", __func__, ep); + cm_id->add_ref(cm_id); + ep->com.cm_id = cm_id; + ep->com.dev = dev; + ep->backlog = backlog; + memcpy(&ep->com.local_addr, &cm_id->local_addr, + sizeof(ep->com.local_addr)); + + /* + * Allocate a server TID. + */ + if (dev->rdev.lldi.enable_fw_ofld_conn && + ep->com.local_addr.ss_family == AF_INET) + ep->stid = cxgb4_alloc_sftid(dev->rdev.lldi.tids, + cm_id->local_addr.ss_family, ep); + else + ep->stid = cxgb4_alloc_stid(dev->rdev.lldi.tids, + cm_id->local_addr.ss_family, ep); + + if (ep->stid == -1) { + printk(KERN_ERR MOD "%s - cannot alloc stid.\n", __func__); + err = -ENOMEM; + goto fail2; + } + insert_handle(dev, &dev->stid_idr, ep, ep->stid); + + /* No port mapper available, go with the specified info */ + memcpy(&ep->com.mapped_local_addr, &cm_id->local_addr, + sizeof(ep->com.mapped_local_addr)); + + c4iw_form_reg_msg(dev, &pm_reg_msg); + iwpm_err = iwpm_register_pid(&pm_reg_msg, RDMA_NL_C4IW); + if (iwpm_err) { + PDBG("%s: Port Mapper reg pid fail (err = %d).\n", + __func__, iwpm_err); + } + if (iwpm_valid_pid() && !iwpm_err) { + memcpy(&pm_msg.loc_addr, &ep->com.local_addr, + sizeof(ep->com.local_addr)); + iwpm_err = iwpm_add_mapping(&pm_msg, RDMA_NL_C4IW); + if (iwpm_err) + PDBG("%s: Port Mapper query fail (err = %d).\n", + __func__, iwpm_err); + else + memcpy(&ep->com.mapped_local_addr, + &pm_msg.mapped_loc_addr, + sizeof(ep->com.mapped_local_addr)); + } + if (iwpm_create_mapinfo(&ep->com.local_addr, + &ep->com.mapped_local_addr, RDMA_NL_C4IW)) { + err = -ENOMEM; + goto fail3; + } + print_addr(&ep->com, __func__, "add_mapping/create_mapinfo"); + + set_bit(RELEASE_MAPINFO, &ep->com.flags); + state_set(&ep->com, LISTEN); + if (ep->com.local_addr.ss_family == AF_INET) + err = create_server4(dev, ep); + else + err = create_server6(dev, ep); + if (!err) { + cm_id->provider_data = ep; + goto out; + } + +fail3: + cxgb4_free_stid(ep->com.dev->rdev.lldi.tids, ep->stid, + ep->com.local_addr.ss_family); +fail2: + cm_id->rem_ref(cm_id); + c4iw_put_ep(&ep->com); +fail1: +out: + return err; +} + +int c4iw_destroy_listen(struct iw_cm_id *cm_id) +{ + int err; + struct c4iw_listen_ep *ep = to_listen_ep(cm_id); + + PDBG("%s ep %p\n", __func__, ep); + + might_sleep(); + state_set(&ep->com, DEAD); + if (ep->com.dev->rdev.lldi.enable_fw_ofld_conn && + ep->com.local_addr.ss_family == AF_INET) { + err = cxgb4_remove_server_filter( + ep->com.dev->rdev.lldi.ports[0], ep->stid, + ep->com.dev->rdev.lldi.rxq_ids[0], 0); + } else { + c4iw_init_wr_wait(&ep->com.wr_wait); + err = cxgb4_remove_server( + ep->com.dev->rdev.lldi.ports[0], ep->stid, + ep->com.dev->rdev.lldi.rxq_ids[0], 0); + if (err) + goto done; + err = c4iw_wait_for_reply(&ep->com.dev->rdev, &ep->com.wr_wait, + 0, 0, __func__); + } + remove_handle(ep->com.dev, &ep->com.dev->stid_idr, ep->stid); + cxgb4_free_stid(ep->com.dev->rdev.lldi.tids, ep->stid, + ep->com.local_addr.ss_family); +done: + cm_id->rem_ref(cm_id); + c4iw_put_ep(&ep->com); + return err; +} + +int c4iw_ep_disconnect(struct c4iw_ep *ep, int abrupt, gfp_t gfp) +{ + int ret = 0; + int close = 0; + int fatal = 0; + struct c4iw_rdev *rdev; + + mutex_lock(&ep->com.mutex); + + PDBG("%s ep %p state %s, abrupt %d\n", __func__, ep, + states[ep->com.state], abrupt); + + rdev = &ep->com.dev->rdev; + if (c4iw_fatal_error(rdev)) { + fatal = 1; + close_complete_upcall(ep, -EIO); + ep->com.state = DEAD; + } + switch (ep->com.state) { + case MPA_REQ_WAIT: + case MPA_REQ_SENT: + case MPA_REQ_RCVD: + case MPA_REP_SENT: + case FPDU_MODE: + close = 1; + if (abrupt) + ep->com.state = ABORTING; + else { + ep->com.state = CLOSING; + start_ep_timer(ep); + } + set_bit(CLOSE_SENT, &ep->com.flags); + break; + case CLOSING: + if (!test_and_set_bit(CLOSE_SENT, &ep->com.flags)) { + close = 1; + if (abrupt) { + (void)stop_ep_timer(ep); + ep->com.state = ABORTING; + } else + ep->com.state = MORIBUND; + } + break; + case MORIBUND: + case ABORTING: + case DEAD: + PDBG("%s ignoring disconnect ep %p state %u\n", + __func__, ep, ep->com.state); + break; + default: + BUG(); + break; + } + + if (close) { + if (abrupt) { + set_bit(EP_DISC_ABORT, &ep->com.history); + close_complete_upcall(ep, -ECONNRESET); + ret = send_abort(ep, NULL, gfp); + } else { + set_bit(EP_DISC_CLOSE, &ep->com.history); + ret = send_halfclose(ep, gfp); + } + if (ret) + fatal = 1; + } + mutex_unlock(&ep->com.mutex); + if (fatal) + release_ep_resources(ep); + return ret; +} + +static void active_ofld_conn_reply(struct c4iw_dev *dev, struct sk_buff *skb, + struct cpl_fw6_msg_ofld_connection_wr_rpl *req) +{ + struct c4iw_ep *ep; + int atid = be32_to_cpu(req->tid); + + ep = (struct c4iw_ep *)lookup_atid(dev->rdev.lldi.tids, + (__force u32) req->tid); + if (!ep) + return; + + switch (req->retval) { + case FW_ENOMEM: + set_bit(ACT_RETRY_NOMEM, &ep->com.history); + if (ep->retry_count++ < ACT_OPEN_RETRY_COUNT) { + send_fw_act_open_req(ep, atid); + return; + } + case FW_EADDRINUSE: + set_bit(ACT_RETRY_INUSE, &ep->com.history); + if (ep->retry_count++ < ACT_OPEN_RETRY_COUNT) { + send_fw_act_open_req(ep, atid); + return; + } + break; + default: + pr_info("%s unexpected ofld conn wr retval %d\n", + __func__, req->retval); + break; + } + pr_err("active ofld_connect_wr failure %d atid %d\n", + req->retval, atid); + mutex_lock(&dev->rdev.stats.lock); + dev->rdev.stats.act_ofld_conn_fails++; + mutex_unlock(&dev->rdev.stats.lock); + connect_reply_upcall(ep, status2errno(req->retval)); + state_set(&ep->com, DEAD); + remove_handle(dev, &dev->atid_idr, atid); + cxgb4_free_atid(dev->rdev.lldi.tids, atid); + dst_release(ep->dst); + cxgb4_l2t_release(ep->l2t); + c4iw_put_ep(&ep->com); +} + +static void passive_ofld_conn_reply(struct c4iw_dev *dev, struct sk_buff *skb, + struct cpl_fw6_msg_ofld_connection_wr_rpl *req) +{ + struct sk_buff *rpl_skb; + struct cpl_pass_accept_req *cpl; + int ret; + + rpl_skb = (struct sk_buff *)(unsigned long)req->cookie; + BUG_ON(!rpl_skb); + if (req->retval) { + PDBG("%s passive open failure %d\n", __func__, req->retval); + mutex_lock(&dev->rdev.stats.lock); + dev->rdev.stats.pas_ofld_conn_fails++; + mutex_unlock(&dev->rdev.stats.lock); + kfree_skb(rpl_skb); + } else { + cpl = (struct cpl_pass_accept_req *)cplhdr(rpl_skb); + OPCODE_TID(cpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_REQ, + (__force u32) htonl( + (__force u32) req->tid))); + ret = pass_accept_req(dev, rpl_skb); + if (!ret) + kfree_skb(rpl_skb); + } + return; +} + +static int deferred_fw6_msg(struct c4iw_dev *dev, struct sk_buff *skb) +{ + struct cpl_fw6_msg *rpl = cplhdr(skb); + struct cpl_fw6_msg_ofld_connection_wr_rpl *req; + + switch (rpl->type) { + case FW6_TYPE_CQE: + c4iw_ev_dispatch(dev, (struct t4_cqe *)&rpl->data[0]); + break; + case FW6_TYPE_OFLD_CONNECTION_WR_RPL: + req = (struct cpl_fw6_msg_ofld_connection_wr_rpl *)rpl->data; + switch (req->t_state) { + case TCP_SYN_SENT: + active_ofld_conn_reply(dev, skb, req); + break; + case TCP_SYN_RECV: + passive_ofld_conn_reply(dev, skb, req); + break; + default: + pr_err("%s unexpected ofld conn wr state %d\n", + __func__, req->t_state); + break; + } + break; + } + return 0; +} + +static void build_cpl_pass_accept_req(struct sk_buff *skb, int stid , u8 tos) +{ + u32 l2info; + u16 vlantag, len, hdr_len, eth_hdr_len; + u8 intf; + struct cpl_rx_pkt *cpl = cplhdr(skb); + struct cpl_pass_accept_req *req; + struct tcp_options_received tmp_opt; + struct c4iw_dev *dev; + + dev = *((struct c4iw_dev **) (skb->cb + sizeof(void *))); + /* Store values from cpl_rx_pkt in temporary location. */ + vlantag = (__force u16) cpl->vlan; + len = (__force u16) cpl->len; + l2info = (__force u32) cpl->l2info; + hdr_len = (__force u16) cpl->hdr_len; + intf = cpl->iff; + + __skb_pull(skb, sizeof(*req) + sizeof(struct rss_header)); + + /* + * We need to parse the TCP options from SYN packet. + * to generate cpl_pass_accept_req. + */ + memset(&tmp_opt, 0, sizeof(tmp_opt)); + tcp_clear_options(&tmp_opt); + tcp_parse_options(skb, &tmp_opt, 0, NULL); + + req = (struct cpl_pass_accept_req *)__skb_push(skb, sizeof(*req)); + memset(req, 0, sizeof(*req)); + req->l2info = cpu_to_be16(V_SYN_INTF(intf) | + V_SYN_MAC_IDX(G_RX_MACIDX( + (__force int) htonl(l2info))) | + F_SYN_XACT_MATCH); + eth_hdr_len = is_t4(dev->rdev.lldi.adapter_type) ? + G_RX_ETHHDR_LEN((__force int) htonl(l2info)) : + G_RX_T5_ETHHDR_LEN((__force int) htonl(l2info)); + req->hdr_len = cpu_to_be32(V_SYN_RX_CHAN(G_RX_CHAN( + (__force int) htonl(l2info))) | + V_TCP_HDR_LEN(G_RX_TCPHDR_LEN( + (__force int) htons(hdr_len))) | + V_IP_HDR_LEN(G_RX_IPHDR_LEN( + (__force int) htons(hdr_len))) | + V_ETH_HDR_LEN(G_RX_ETHHDR_LEN(eth_hdr_len))); + req->vlan = (__force __be16) vlantag; + req->len = (__force __be16) len; + req->tos_stid = cpu_to_be32(PASS_OPEN_TID(stid) | + PASS_OPEN_TOS(tos)); + req->tcpopt.mss = htons(tmp_opt.mss_clamp); + if (tmp_opt.wscale_ok) + req->tcpopt.wsf = tmp_opt.snd_wscale; + req->tcpopt.tstamp = tmp_opt.saw_tstamp; + if (tmp_opt.sack_ok) + req->tcpopt.sack = 1; + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_REQ, 0)); + return; +} + +static void send_fw_pass_open_req(struct c4iw_dev *dev, struct sk_buff *skb, + __be32 laddr, __be16 lport, + __be32 raddr, __be16 rport, + u32 rcv_isn, u32 filter, u16 window, + u32 rss_qid, u8 port_id) +{ + struct sk_buff *req_skb; + struct fw_ofld_connection_wr *req; + struct cpl_pass_accept_req *cpl = cplhdr(skb); + int ret; + + req_skb = alloc_skb(sizeof(struct fw_ofld_connection_wr), GFP_KERNEL); + req = (struct fw_ofld_connection_wr *)__skb_put(req_skb, sizeof(*req)); + memset(req, 0, sizeof(*req)); + req->op_compl = htonl(V_WR_OP(FW_OFLD_CONNECTION_WR) | FW_WR_COMPL(1)); + req->len16_pkd = htonl(FW_WR_LEN16(DIV_ROUND_UP(sizeof(*req), 16))); + req->le.version_cpl = htonl(F_FW_OFLD_CONNECTION_WR_CPL); + req->le.filter = (__force __be32) filter; + req->le.lport = lport; + req->le.pport = rport; + req->le.u.ipv4.lip = laddr; + req->le.u.ipv4.pip = raddr; + req->tcb.rcv_nxt = htonl(rcv_isn + 1); + req->tcb.rcv_adv = htons(window); + req->tcb.t_state_to_astid = + htonl(V_FW_OFLD_CONNECTION_WR_T_STATE(TCP_SYN_RECV) | + V_FW_OFLD_CONNECTION_WR_RCV_SCALE(cpl->tcpopt.wsf) | + V_FW_OFLD_CONNECTION_WR_ASTID( + GET_PASS_OPEN_TID(ntohl(cpl->tos_stid)))); + + /* + * We store the qid in opt2 which will be used by the firmware + * to send us the wr response. + */ + req->tcb.opt2 = htonl(V_RSS_QUEUE(rss_qid)); + + /* + * We initialize the MSS index in TCB to 0xF. + * So that when driver sends cpl_pass_accept_rpl + * TCB picks up the correct value. If this was 0 + * TP will ignore any value > 0 for MSS index. + */ + req->tcb.opt0 = cpu_to_be64(V_MSS_IDX(0xF)); + req->cookie = (unsigned long)skb; + + set_wr_txq(req_skb, CPL_PRIORITY_CONTROL, port_id); + ret = cxgb4_ofld_send(dev->rdev.lldi.ports[0], req_skb); + if (ret < 0) { + pr_err("%s - cxgb4_ofld_send error %d - dropping\n", __func__, + ret); + kfree_skb(skb); + kfree_skb(req_skb); + } +} + +/* + * Handler for CPL_RX_PKT message. Need to handle cpl_rx_pkt + * messages when a filter is being used instead of server to + * redirect a syn packet. When packets hit filter they are redirected + * to the offload queue and driver tries to establish the connection + * using firmware work request. + */ +static int rx_pkt(struct c4iw_dev *dev, struct sk_buff *skb) +{ + int stid; + unsigned int filter; + struct ethhdr *eh = NULL; + struct vlan_ethhdr *vlan_eh = NULL; + struct iphdr *iph; + struct tcphdr *tcph; + struct rss_header *rss = (void *)skb->data; + struct cpl_rx_pkt *cpl = (void *)skb->data; + struct cpl_pass_accept_req *req = (void *)(rss + 1); + struct l2t_entry *e; + struct dst_entry *dst; + struct c4iw_ep *lep; + u16 window; + struct port_info *pi; + struct net_device *pdev; + u16 rss_qid, eth_hdr_len; + int step; + u32 tx_chan; + struct neighbour *neigh; + + /* Drop all non-SYN packets */ + if (!(cpl->l2info & cpu_to_be32(F_RXF_SYN))) + goto reject; + + /* + * Drop all packets which did not hit the filter. + * Unlikely to happen. + */ + if (!(rss->filter_hit && rss->filter_tid)) + goto reject; + + /* + * Calculate the server tid from filter hit index from cpl_rx_pkt. + */ + stid = (__force int) cpu_to_be32((__force u32) rss->hash_val); + + lep = (struct c4iw_ep *)lookup_stid(dev->rdev.lldi.tids, stid); + if (!lep) { + PDBG("%s connect request on invalid stid %d\n", __func__, stid); + goto reject; + } + + eth_hdr_len = is_t4(dev->rdev.lldi.adapter_type) ? + G_RX_ETHHDR_LEN(htonl(cpl->l2info)) : + G_RX_T5_ETHHDR_LEN(htonl(cpl->l2info)); + if (eth_hdr_len == ETH_HLEN) { + eh = (struct ethhdr *)(req + 1); + iph = (struct iphdr *)(eh + 1); + } else { + vlan_eh = (struct vlan_ethhdr *)(req + 1); + iph = (struct iphdr *)(vlan_eh + 1); + skb->vlan_tci = ntohs(cpl->vlan); + } + + if (iph->version != 0x4) + goto reject; + + tcph = (struct tcphdr *)(iph + 1); + skb_set_network_header(skb, (void *)iph - (void *)rss); + skb_set_transport_header(skb, (void *)tcph - (void *)rss); + skb_get(skb); + + PDBG("%s lip 0x%x lport %u pip 0x%x pport %u tos %d\n", __func__, + ntohl(iph->daddr), ntohs(tcph->dest), ntohl(iph->saddr), + ntohs(tcph->source), iph->tos); + + dst = find_route(dev, iph->daddr, iph->saddr, tcph->dest, tcph->source, + iph->tos); + if (!dst) { + pr_err("%s - failed to find dst entry!\n", + __func__); + goto reject; + } + neigh = dst_neigh_lookup_skb(dst, skb); + + if (!neigh) { + pr_err("%s - failed to allocate neigh!\n", + __func__); + goto free_dst; + } + + if (neigh->dev->flags & IFF_LOOPBACK) { + pdev = ip_dev_find(&init_net, iph->daddr); + e = cxgb4_l2t_get(dev->rdev.lldi.l2t, neigh, + pdev, 0); + pi = (struct port_info *)netdev_priv(pdev); + tx_chan = cxgb4_port_chan(pdev); + dev_put(pdev); + } else { + pdev = get_real_dev(neigh->dev); + e = cxgb4_l2t_get(dev->rdev.lldi.l2t, neigh, + pdev, 0); + pi = (struct port_info *)netdev_priv(pdev); + tx_chan = cxgb4_port_chan(pdev); + } + neigh_release(neigh); + if (!e) { + pr_err("%s - failed to allocate l2t entry!\n", + __func__); + goto free_dst; + } + + step = dev->rdev.lldi.nrxq / dev->rdev.lldi.nchan; + rss_qid = dev->rdev.lldi.rxq_ids[pi->port_id * step]; + window = (__force u16) htons((__force u16)tcph->window); + + /* Calcuate filter portion for LE region. */ + filter = (__force unsigned int) cpu_to_be32(cxgb4_select_ntuple( + dev->rdev.lldi.ports[0], + e)); + + /* + * Synthesize the cpl_pass_accept_req. We have everything except the + * TID. Once firmware sends a reply with TID we update the TID field + * in cpl and pass it through the regular cpl_pass_accept_req path. + */ + build_cpl_pass_accept_req(skb, stid, iph->tos); + send_fw_pass_open_req(dev, skb, iph->daddr, tcph->dest, iph->saddr, + tcph->source, ntohl(tcph->seq), filter, window, + rss_qid, pi->port_id); + cxgb4_l2t_release(e); +free_dst: + dst_release(dst); +reject: + return 0; +} + +/* + * These are the real handlers that are called from a + * work queue. + */ +static c4iw_handler_func work_handlers[NUM_CPL_CMDS] = { + [CPL_ACT_ESTABLISH] = act_establish, + [CPL_ACT_OPEN_RPL] = act_open_rpl, + [CPL_RX_DATA] = rx_data, + [CPL_ABORT_RPL_RSS] = abort_rpl, + [CPL_ABORT_RPL] = abort_rpl, + [CPL_PASS_OPEN_RPL] = pass_open_rpl, + [CPL_CLOSE_LISTSRV_RPL] = close_listsrv_rpl, + [CPL_PASS_ACCEPT_REQ] = pass_accept_req, + [CPL_PASS_ESTABLISH] = pass_establish, + [CPL_PEER_CLOSE] = peer_close, + [CPL_ABORT_REQ_RSS] = peer_abort, + [CPL_CLOSE_CON_RPL] = close_con_rpl, + [CPL_RDMA_TERMINATE] = terminate, + [CPL_FW4_ACK] = fw4_ack, + [CPL_FW6_MSG] = deferred_fw6_msg, + [CPL_RX_PKT] = rx_pkt +}; + +static void process_timeout(struct c4iw_ep *ep) +{ + struct c4iw_qp_attributes attrs; + int abort = 1; + + mutex_lock(&ep->com.mutex); + PDBG("%s ep %p tid %u state %d\n", __func__, ep, ep->hwtid, + ep->com.state); + set_bit(TIMEDOUT, &ep->com.history); + switch (ep->com.state) { + case MPA_REQ_SENT: + __state_set(&ep->com, ABORTING); + connect_reply_upcall(ep, -ETIMEDOUT); + break; + case MPA_REQ_WAIT: + __state_set(&ep->com, ABORTING); + break; + case CLOSING: + case MORIBUND: + if (ep->com.cm_id && ep->com.qp) { + attrs.next_state = C4IW_QP_STATE_ERROR; + c4iw_modify_qp(ep->com.qp->rhp, + ep->com.qp, C4IW_QP_ATTR_NEXT_STATE, + &attrs, 1); + } + __state_set(&ep->com, ABORTING); + close_complete_upcall(ep, -ETIMEDOUT); + break; + case ABORTING: + case DEAD: + + /* + * These states are expected if the ep timed out at the same + * time as another thread was calling stop_ep_timer(). + * So we silently do nothing for these states. + */ + abort = 0; + break; + default: + WARN(1, "%s unexpected state ep %p tid %u state %u\n", + __func__, ep, ep->hwtid, ep->com.state); + abort = 0; + } + if (abort) + abort_connection(ep, NULL, GFP_KERNEL); + mutex_unlock(&ep->com.mutex); + c4iw_put_ep(&ep->com); +} + +static void process_timedout_eps(void) +{ + struct c4iw_ep *ep; + + spin_lock_irq(&timeout_lock); + while (!list_empty(&timeout_list)) { + struct list_head *tmp; + + tmp = timeout_list.next; + list_del(tmp); + tmp->next = NULL; + tmp->prev = NULL; + spin_unlock_irq(&timeout_lock); + ep = list_entry(tmp, struct c4iw_ep, entry); + process_timeout(ep); + spin_lock_irq(&timeout_lock); + } + spin_unlock_irq(&timeout_lock); +} + +static void process_work(struct work_struct *work) +{ + struct sk_buff *skb = NULL; + struct c4iw_dev *dev; + struct cpl_act_establish *rpl; + unsigned int opcode; + int ret; + + process_timedout_eps(); + while ((skb = skb_dequeue(&rxq))) { + rpl = cplhdr(skb); + dev = *((struct c4iw_dev **) (skb->cb + sizeof(void *))); + opcode = rpl->ot.opcode; + + BUG_ON(!work_handlers[opcode]); + ret = work_handlers[opcode](dev, skb); + if (!ret) + kfree_skb(skb); + process_timedout_eps(); + } +} + +static DECLARE_WORK(skb_work, process_work); + +static void ep_timeout(unsigned long arg) +{ + struct c4iw_ep *ep = (struct c4iw_ep *)arg; + int kickit = 0; + + spin_lock(&timeout_lock); + if (!test_and_set_bit(TIMEOUT, &ep->com.flags)) { + /* + * Only insert if it is not already on the list. + */ + if (!ep->entry.next) { + list_add_tail(&ep->entry, &timeout_list); + kickit = 1; + } + } + spin_unlock(&timeout_lock); + if (kickit) + queue_work(workq, &skb_work); +} + +/* + * All the CM events are handled on a work queue to have a safe context. + */ +static int sched(struct c4iw_dev *dev, struct sk_buff *skb) +{ + + /* + * Save dev in the skb->cb area. + */ + *((struct c4iw_dev **) (skb->cb + sizeof(void *))) = dev; + + /* + * Queue the skb and schedule the worker thread. + */ + skb_queue_tail(&rxq, skb); + queue_work(workq, &skb_work); + return 0; +} + +static int set_tcb_rpl(struct c4iw_dev *dev, struct sk_buff *skb) +{ + struct cpl_set_tcb_rpl *rpl = cplhdr(skb); + + if (rpl->status != CPL_ERR_NONE) { + printk(KERN_ERR MOD "Unexpected SET_TCB_RPL status %u " + "for tid %u\n", rpl->status, GET_TID(rpl)); + } + kfree_skb(skb); + return 0; +} + +static int fw6_msg(struct c4iw_dev *dev, struct sk_buff *skb) +{ + struct cpl_fw6_msg *rpl = cplhdr(skb); + struct c4iw_wr_wait *wr_waitp; + int ret; + + PDBG("%s type %u\n", __func__, rpl->type); + + switch (rpl->type) { + case FW6_TYPE_WR_RPL: + ret = (int)((be64_to_cpu(rpl->data[0]) >> 8) & 0xff); + wr_waitp = (struct c4iw_wr_wait *)(__force unsigned long) rpl->data[1]; + PDBG("%s wr_waitp %p ret %u\n", __func__, wr_waitp, ret); + if (wr_waitp) + c4iw_wake_up(wr_waitp, ret ? -ret : 0); + kfree_skb(skb); + break; + case FW6_TYPE_CQE: + case FW6_TYPE_OFLD_CONNECTION_WR_RPL: + sched(dev, skb); + break; + default: + printk(KERN_ERR MOD "%s unexpected fw6 msg type %u\n", __func__, + rpl->type); + kfree_skb(skb); + break; + } + return 0; +} + +static int peer_abort_intr(struct c4iw_dev *dev, struct sk_buff *skb) +{ + struct cpl_abort_req_rss *req = cplhdr(skb); + struct c4iw_ep *ep; + struct tid_info *t = dev->rdev.lldi.tids; + unsigned int tid = GET_TID(req); + + ep = lookup_tid(t, tid); + if (!ep) { + printk(KERN_WARNING MOD + "Abort on non-existent endpoint, tid %d\n", tid); + kfree_skb(skb); + return 0; + } + if (is_neg_adv(req->status)) { + dev_warn(&dev->rdev.lldi.pdev->dev, + "Negative advice on abort - tid %u status %d (%s)\n", + ep->hwtid, req->status, neg_adv_str(req->status)); + kfree_skb(skb); + return 0; + } + PDBG("%s ep %p tid %u state %u\n", __func__, ep, ep->hwtid, + ep->com.state); + + /* + * Wake up any threads in rdma_init() or rdma_fini(). + * However, if we are on MPAv2 and want to retry with MPAv1 + * then, don't wake up yet. + */ + if (mpa_rev == 2 && !ep->tried_with_mpa_v1) { + if (ep->com.state != MPA_REQ_SENT) + c4iw_wake_up(&ep->com.wr_wait, -ECONNRESET); + } else + c4iw_wake_up(&ep->com.wr_wait, -ECONNRESET); + sched(dev, skb); + return 0; +} + +/* + * Most upcalls from the T4 Core go to sched() to + * schedule the processing on a work queue. + */ +c4iw_handler_func c4iw_handlers[NUM_CPL_CMDS] = { + [CPL_ACT_ESTABLISH] = sched, + [CPL_ACT_OPEN_RPL] = sched, + [CPL_RX_DATA] = sched, + [CPL_ABORT_RPL_RSS] = sched, + [CPL_ABORT_RPL] = sched, + [CPL_PASS_OPEN_RPL] = sched, + [CPL_CLOSE_LISTSRV_RPL] = sched, + [CPL_PASS_ACCEPT_REQ] = sched, + [CPL_PASS_ESTABLISH] = sched, + [CPL_PEER_CLOSE] = sched, + [CPL_CLOSE_CON_RPL] = sched, + [CPL_ABORT_REQ_RSS] = peer_abort_intr, + [CPL_RDMA_TERMINATE] = sched, + [CPL_FW4_ACK] = sched, + [CPL_SET_TCB_RPL] = set_tcb_rpl, + [CPL_FW6_MSG] = fw6_msg, + [CPL_RX_PKT] = sched +}; + +int __init c4iw_cm_init(void) +{ + spin_lock_init(&timeout_lock); + skb_queue_head_init(&rxq); + + workq = create_singlethread_workqueue("iw_cxgb4"); + if (!workq) + return -ENOMEM; + + return 0; +} + +void c4iw_cm_term(void) +{ + WARN_ON(!list_empty(&timeout_list)); + flush_workqueue(workq); + destroy_workqueue(workq); +} diff --git a/drivers/infiniband/hw/cxgb4/cq.c b/drivers/infiniband/hw/cxgb4/cq.c new file mode 100644 index 0000000..0f773e7 --- /dev/null +++ b/drivers/infiniband/hw/cxgb4/cq.c @@ -0,0 +1,1009 @@ +/* + * Copyright (c) 2009-2010 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "iw_cxgb4.h" + +static int destroy_cq(struct c4iw_rdev *rdev, struct t4_cq *cq, + struct c4iw_dev_ucontext *uctx) +{ + struct fw_ri_res_wr *res_wr; + struct fw_ri_res *res; + int wr_len; + struct c4iw_wr_wait wr_wait; + struct sk_buff *skb; + int ret; + + wr_len = sizeof *res_wr + sizeof *res; + skb = alloc_skb(wr_len, GFP_KERNEL); + if (!skb) + return -ENOMEM; + set_wr_txq(skb, CPL_PRIORITY_CONTROL, 0); + + res_wr = (struct fw_ri_res_wr *)__skb_put(skb, wr_len); + memset(res_wr, 0, wr_len); + res_wr->op_nres = cpu_to_be32( + FW_WR_OP(FW_RI_RES_WR) | + V_FW_RI_RES_WR_NRES(1) | + FW_WR_COMPL(1)); + res_wr->len16_pkd = cpu_to_be32(DIV_ROUND_UP(wr_len, 16)); + res_wr->cookie = (unsigned long) &wr_wait; + res = res_wr->res; + res->u.cq.restype = FW_RI_RES_TYPE_CQ; + res->u.cq.op = FW_RI_RES_OP_RESET; + res->u.cq.iqid = cpu_to_be32(cq->cqid); + + c4iw_init_wr_wait(&wr_wait); + ret = c4iw_ofld_send(rdev, skb); + if (!ret) { + ret = c4iw_wait_for_reply(rdev, &wr_wait, 0, 0, __func__); + } + + kfree(cq->sw_queue); + dma_free_coherent(&(rdev->lldi.pdev->dev), + cq->memsize, cq->queue, + dma_unmap_addr(cq, mapping)); + c4iw_put_cqid(rdev, cq->cqid, uctx); + return ret; +} + +static int create_cq(struct c4iw_rdev *rdev, struct t4_cq *cq, + struct c4iw_dev_ucontext *uctx) +{ + struct fw_ri_res_wr *res_wr; + struct fw_ri_res *res; + int wr_len; + int user = (uctx != &rdev->uctx); + struct c4iw_wr_wait wr_wait; + int ret; + struct sk_buff *skb; + + cq->cqid = c4iw_get_cqid(rdev, uctx); + if (!cq->cqid) { + ret = -ENOMEM; + goto err1; + } + + if (!user) { + cq->sw_queue = kzalloc(cq->memsize, GFP_KERNEL); + if (!cq->sw_queue) { + ret = -ENOMEM; + goto err2; + } + } + cq->queue = dma_alloc_coherent(&rdev->lldi.pdev->dev, cq->memsize, + &cq->dma_addr, GFP_KERNEL); + if (!cq->queue) { + ret = -ENOMEM; + goto err3; + } + dma_unmap_addr_set(cq, mapping, cq->dma_addr); + memset(cq->queue, 0, cq->memsize); + + /* build fw_ri_res_wr */ + wr_len = sizeof *res_wr + sizeof *res; + + skb = alloc_skb(wr_len, GFP_KERNEL); + if (!skb) { + ret = -ENOMEM; + goto err4; + } + set_wr_txq(skb, CPL_PRIORITY_CONTROL, 0); + + res_wr = (struct fw_ri_res_wr *)__skb_put(skb, wr_len); + memset(res_wr, 0, wr_len); + res_wr->op_nres = cpu_to_be32( + FW_WR_OP(FW_RI_RES_WR) | + V_FW_RI_RES_WR_NRES(1) | + FW_WR_COMPL(1)); + res_wr->len16_pkd = cpu_to_be32(DIV_ROUND_UP(wr_len, 16)); + res_wr->cookie = (unsigned long) &wr_wait; + res = res_wr->res; + res->u.cq.restype = FW_RI_RES_TYPE_CQ; + res->u.cq.op = FW_RI_RES_OP_WRITE; + res->u.cq.iqid = cpu_to_be32(cq->cqid); + res->u.cq.iqandst_to_iqandstindex = cpu_to_be32( + V_FW_RI_RES_WR_IQANUS(0) | + V_FW_RI_RES_WR_IQANUD(1) | + F_FW_RI_RES_WR_IQANDST | + V_FW_RI_RES_WR_IQANDSTINDEX( + rdev->lldi.ciq_ids[cq->vector])); + res->u.cq.iqdroprss_to_iqesize = cpu_to_be16( + F_FW_RI_RES_WR_IQDROPRSS | + V_FW_RI_RES_WR_IQPCIECH(2) | + V_FW_RI_RES_WR_IQINTCNTTHRESH(0) | + F_FW_RI_RES_WR_IQO | + V_FW_RI_RES_WR_IQESIZE(1)); + res->u.cq.iqsize = cpu_to_be16(cq->size); + res->u.cq.iqaddr = cpu_to_be64(cq->dma_addr); + + c4iw_init_wr_wait(&wr_wait); + + ret = c4iw_ofld_send(rdev, skb); + if (ret) + goto err4; + PDBG("%s wait_event wr_wait %p\n", __func__, &wr_wait); + ret = c4iw_wait_for_reply(rdev, &wr_wait, 0, 0, __func__); + if (ret) + goto err4; + + cq->gen = 1; + cq->gts = rdev->lldi.gts_reg; + cq->rdev = rdev; + if (user) { + cq->ugts = (u64)pci_resource_start(rdev->lldi.pdev, 2) + + (cq->cqid << rdev->cqshift); + cq->ugts &= PAGE_MASK; + } + return 0; +err4: + dma_free_coherent(&rdev->lldi.pdev->dev, cq->memsize, cq->queue, + dma_unmap_addr(cq, mapping)); +err3: + kfree(cq->sw_queue); +err2: + c4iw_put_cqid(rdev, cq->cqid, uctx); +err1: + return ret; +} + +static void insert_recv_cqe(struct t4_wq *wq, struct t4_cq *cq) +{ + struct t4_cqe cqe; + + PDBG("%s wq %p cq %p sw_cidx %u sw_pidx %u\n", __func__, + wq, cq, cq->sw_cidx, cq->sw_pidx); + memset(&cqe, 0, sizeof(cqe)); + cqe.header = cpu_to_be32(V_CQE_STATUS(T4_ERR_SWFLUSH) | + V_CQE_OPCODE(FW_RI_SEND) | + V_CQE_TYPE(0) | + V_CQE_SWCQE(1) | + V_CQE_QPID(wq->sq.qid)); + cqe.bits_type_ts = cpu_to_be64(V_CQE_GENBIT((u64)cq->gen)); + cq->sw_queue[cq->sw_pidx] = cqe; + t4_swcq_produce(cq); +} + +int c4iw_flush_rq(struct t4_wq *wq, struct t4_cq *cq, int count) +{ + int flushed = 0; + int in_use = wq->rq.in_use - count; + + BUG_ON(in_use < 0); + PDBG("%s wq %p cq %p rq.in_use %u skip count %u\n", __func__, + wq, cq, wq->rq.in_use, count); + while (in_use--) { + insert_recv_cqe(wq, cq); + flushed++; + } + return flushed; +} + +static void insert_sq_cqe(struct t4_wq *wq, struct t4_cq *cq, + struct t4_swsqe *swcqe) +{ + struct t4_cqe cqe; + + PDBG("%s wq %p cq %p sw_cidx %u sw_pidx %u\n", __func__, + wq, cq, cq->sw_cidx, cq->sw_pidx); + memset(&cqe, 0, sizeof(cqe)); + cqe.header = cpu_to_be32(V_CQE_STATUS(T4_ERR_SWFLUSH) | + V_CQE_OPCODE(swcqe->opcode) | + V_CQE_TYPE(1) | + V_CQE_SWCQE(1) | + V_CQE_QPID(wq->sq.qid)); + CQE_WRID_SQ_IDX(&cqe) = swcqe->idx; + cqe.bits_type_ts = cpu_to_be64(V_CQE_GENBIT((u64)cq->gen)); + cq->sw_queue[cq->sw_pidx] = cqe; + t4_swcq_produce(cq); +} + +static void advance_oldest_read(struct t4_wq *wq); + +int c4iw_flush_sq(struct c4iw_qp *qhp) +{ + int flushed = 0; + struct t4_wq *wq = &qhp->wq; + struct c4iw_cq *chp = to_c4iw_cq(qhp->ibqp.send_cq); + struct t4_cq *cq = &chp->cq; + int idx; + struct t4_swsqe *swsqe; + + if (wq->sq.flush_cidx == -1) + wq->sq.flush_cidx = wq->sq.cidx; + idx = wq->sq.flush_cidx; + BUG_ON(idx >= wq->sq.size); + while (idx != wq->sq.pidx) { + swsqe = &wq->sq.sw_sq[idx]; + BUG_ON(swsqe->flushed); + swsqe->flushed = 1; + insert_sq_cqe(wq, cq, swsqe); + if (wq->sq.oldest_read == swsqe) { + BUG_ON(swsqe->opcode != FW_RI_READ_REQ); + advance_oldest_read(wq); + } + flushed++; + if (++idx == wq->sq.size) + idx = 0; + } + wq->sq.flush_cidx += flushed; + if (wq->sq.flush_cidx >= wq->sq.size) + wq->sq.flush_cidx -= wq->sq.size; + return flushed; +} + +static void flush_completed_wrs(struct t4_wq *wq, struct t4_cq *cq) +{ + struct t4_swsqe *swsqe; + int cidx; + + if (wq->sq.flush_cidx == -1) + wq->sq.flush_cidx = wq->sq.cidx; + cidx = wq->sq.flush_cidx; + BUG_ON(cidx > wq->sq.size); + + while (cidx != wq->sq.pidx) { + swsqe = &wq->sq.sw_sq[cidx]; + if (!swsqe->signaled) { + if (++cidx == wq->sq.size) + cidx = 0; + } else if (swsqe->complete) { + + BUG_ON(swsqe->flushed); + + /* + * Insert this completed cqe into the swcq. + */ + PDBG("%s moving cqe into swcq sq idx %u cq idx %u\n", + __func__, cidx, cq->sw_pidx); + swsqe->cqe.header |= htonl(V_CQE_SWCQE(1)); + cq->sw_queue[cq->sw_pidx] = swsqe->cqe; + t4_swcq_produce(cq); + swsqe->flushed = 1; + if (++cidx == wq->sq.size) + cidx = 0; + wq->sq.flush_cidx = cidx; + } else + break; + } +} + +static void create_read_req_cqe(struct t4_wq *wq, struct t4_cqe *hw_cqe, + struct t4_cqe *read_cqe) +{ + read_cqe->u.scqe.cidx = wq->sq.oldest_read->idx; + read_cqe->len = htonl(wq->sq.oldest_read->read_len); + read_cqe->header = htonl(V_CQE_QPID(CQE_QPID(hw_cqe)) | + V_CQE_SWCQE(SW_CQE(hw_cqe)) | + V_CQE_OPCODE(FW_RI_READ_REQ) | + V_CQE_TYPE(1)); + read_cqe->bits_type_ts = hw_cqe->bits_type_ts; +} + +static void advance_oldest_read(struct t4_wq *wq) +{ + + u32 rptr = wq->sq.oldest_read - wq->sq.sw_sq + 1; + + if (rptr == wq->sq.size) + rptr = 0; + while (rptr != wq->sq.pidx) { + wq->sq.oldest_read = &wq->sq.sw_sq[rptr]; + + if (wq->sq.oldest_read->opcode == FW_RI_READ_REQ) + return; + if (++rptr == wq->sq.size) + rptr = 0; + } + wq->sq.oldest_read = NULL; +} + +/* + * Move all CQEs from the HWCQ into the SWCQ. + * Deal with out-of-order and/or completions that complete + * prior unsignalled WRs. + */ +void c4iw_flush_hw_cq(struct c4iw_cq *chp) +{ + struct t4_cqe *hw_cqe, *swcqe, read_cqe; + struct c4iw_qp *qhp; + struct t4_swsqe *swsqe; + int ret; + + PDBG("%s cqid 0x%x\n", __func__, chp->cq.cqid); + ret = t4_next_hw_cqe(&chp->cq, &hw_cqe); + + /* + * This logic is similar to poll_cq(), but not quite the same + * unfortunately. Need to move pertinent HW CQEs to the SW CQ but + * also do any translation magic that poll_cq() normally does. + */ + while (!ret) { + qhp = get_qhp(chp->rhp, CQE_QPID(hw_cqe)); + + /* + * drop CQEs with no associated QP + */ + if (qhp == NULL) + goto next_cqe; + + if (CQE_OPCODE(hw_cqe) == FW_RI_TERMINATE) + goto next_cqe; + + if (CQE_OPCODE(hw_cqe) == FW_RI_READ_RESP) { + + /* If we have reached here because of async + * event or other error, and have egress error + * then drop + */ + if (CQE_TYPE(hw_cqe) == 1) + goto next_cqe; + + /* drop peer2peer RTR reads. + */ + if (CQE_WRID_STAG(hw_cqe) == 1) + goto next_cqe; + + /* + * Eat completions for unsignaled read WRs. + */ + if (!qhp->wq.sq.oldest_read->signaled) { + advance_oldest_read(&qhp->wq); + goto next_cqe; + } + + /* + * Don't write to the HWCQ, create a new read req CQE + * in local memory and move it into the swcq. + */ + create_read_req_cqe(&qhp->wq, hw_cqe, &read_cqe); + hw_cqe = &read_cqe; + advance_oldest_read(&qhp->wq); + } + + /* if its a SQ completion, then do the magic to move all the + * unsignaled and now in-order completions into the swcq. + */ + if (SQ_TYPE(hw_cqe)) { + swsqe = &qhp->wq.sq.sw_sq[CQE_WRID_SQ_IDX(hw_cqe)]; + swsqe->cqe = *hw_cqe; + swsqe->complete = 1; + flush_completed_wrs(&qhp->wq, &chp->cq); + } else { + swcqe = &chp->cq.sw_queue[chp->cq.sw_pidx]; + *swcqe = *hw_cqe; + swcqe->header |= cpu_to_be32(V_CQE_SWCQE(1)); + t4_swcq_produce(&chp->cq); + } +next_cqe: + t4_hwcq_consume(&chp->cq); + ret = t4_next_hw_cqe(&chp->cq, &hw_cqe); + } +} + +static int cqe_completes_wr(struct t4_cqe *cqe, struct t4_wq *wq) +{ + if (CQE_OPCODE(cqe) == FW_RI_TERMINATE) + return 0; + + if ((CQE_OPCODE(cqe) == FW_RI_RDMA_WRITE) && RQ_TYPE(cqe)) + return 0; + + if ((CQE_OPCODE(cqe) == FW_RI_READ_RESP) && SQ_TYPE(cqe)) + return 0; + + if (CQE_SEND_OPCODE(cqe) && RQ_TYPE(cqe) && t4_rq_empty(wq)) + return 0; + return 1; +} + +void c4iw_count_rcqes(struct t4_cq *cq, struct t4_wq *wq, int *count) +{ + struct t4_cqe *cqe; + u32 ptr; + + *count = 0; + PDBG("%s count zero %d\n", __func__, *count); + ptr = cq->sw_cidx; + while (ptr != cq->sw_pidx) { + cqe = &cq->sw_queue[ptr]; + if (RQ_TYPE(cqe) && (CQE_OPCODE(cqe) != FW_RI_READ_RESP) && + (CQE_QPID(cqe) == wq->sq.qid) && cqe_completes_wr(cqe, wq)) + (*count)++; + if (++ptr == cq->size) + ptr = 0; + } + PDBG("%s cq %p count %d\n", __func__, cq, *count); +} + +/* + * poll_cq + * + * Caller must: + * check the validity of the first CQE, + * supply the wq assicated with the qpid. + * + * credit: cq credit to return to sge. + * cqe_flushed: 1 iff the CQE is flushed. + * cqe: copy of the polled CQE. + * + * return value: + * 0 CQE returned ok. + * -EAGAIN CQE skipped, try again. + * -EOVERFLOW CQ overflow detected. + */ +static int poll_cq(struct t4_wq *wq, struct t4_cq *cq, struct t4_cqe *cqe, + u8 *cqe_flushed, u64 *cookie, u32 *credit) +{ + int ret = 0; + struct t4_cqe *hw_cqe, read_cqe; + + *cqe_flushed = 0; + *credit = 0; + ret = t4_next_cqe(cq, &hw_cqe); + if (ret) + return ret; + + PDBG("%s CQE OVF %u qpid 0x%0x genbit %u type %u status 0x%0x" + " opcode 0x%0x len 0x%0x wrid_hi_stag 0x%x wrid_low_msn 0x%x\n", + __func__, CQE_OVFBIT(hw_cqe), CQE_QPID(hw_cqe), + CQE_GENBIT(hw_cqe), CQE_TYPE(hw_cqe), CQE_STATUS(hw_cqe), + CQE_OPCODE(hw_cqe), CQE_LEN(hw_cqe), CQE_WRID_HI(hw_cqe), + CQE_WRID_LOW(hw_cqe)); + + /* + * skip cqe's not affiliated with a QP. + */ + if (wq == NULL) { + ret = -EAGAIN; + goto skip_cqe; + } + + /* + * skip hw cqe's if the wq is flushed. + */ + if (wq->flushed && !SW_CQE(hw_cqe)) { + ret = -EAGAIN; + goto skip_cqe; + } + + /* + * skip TERMINATE cqes... + */ + if (CQE_OPCODE(hw_cqe) == FW_RI_TERMINATE) { + ret = -EAGAIN; + goto skip_cqe; + } + + /* + * Gotta tweak READ completions: + * 1) the cqe doesn't contain the sq_wptr from the wr. + * 2) opcode not reflected from the wr. + * 3) read_len not reflected from the wr. + * 4) cq_type is RQ_TYPE not SQ_TYPE. + */ + if (RQ_TYPE(hw_cqe) && (CQE_OPCODE(hw_cqe) == FW_RI_READ_RESP)) { + + /* If we have reached here because of async + * event or other error, and have egress error + * then drop + */ + if (CQE_TYPE(hw_cqe) == 1) { + if (CQE_STATUS(hw_cqe)) + t4_set_wq_in_error(wq); + ret = -EAGAIN; + goto skip_cqe; + } + + /* If this is an unsolicited read response, then the read + * was generated by the kernel driver as part of peer-2-peer + * connection setup. So ignore the completion. + */ + if (CQE_WRID_STAG(hw_cqe) == 1) { + if (CQE_STATUS(hw_cqe)) + t4_set_wq_in_error(wq); + ret = -EAGAIN; + goto skip_cqe; + } + + /* + * Eat completions for unsignaled read WRs. + */ + if (!wq->sq.oldest_read->signaled) { + advance_oldest_read(wq); + ret = -EAGAIN; + goto skip_cqe; + } + + /* + * Don't write to the HWCQ, so create a new read req CQE + * in local memory. + */ + create_read_req_cqe(wq, hw_cqe, &read_cqe); + hw_cqe = &read_cqe; + advance_oldest_read(wq); + } + + if (CQE_STATUS(hw_cqe) || t4_wq_in_error(wq)) { + *cqe_flushed = (CQE_STATUS(hw_cqe) == T4_ERR_SWFLUSH); + t4_set_wq_in_error(wq); + } + + /* + * RECV completion. + */ + if (RQ_TYPE(hw_cqe)) { + + /* + * HW only validates 4 bits of MSN. So we must validate that + * the MSN in the SEND is the next expected MSN. If its not, + * then we complete this with T4_ERR_MSN and mark the wq in + * error. + */ + + if (t4_rq_empty(wq)) { + t4_set_wq_in_error(wq); + ret = -EAGAIN; + goto skip_cqe; + } + if (unlikely((CQE_WRID_MSN(hw_cqe) != (wq->rq.msn)))) { + t4_set_wq_in_error(wq); + hw_cqe->header |= htonl(V_CQE_STATUS(T4_ERR_MSN)); + goto proc_cqe; + } + goto proc_cqe; + } + + /* + * If we get here its a send completion. + * + * Handle out of order completion. These get stuffed + * in the SW SQ. Then the SW SQ is walked to move any + * now in-order completions into the SW CQ. This handles + * 2 cases: + * 1) reaping unsignaled WRs when the first subsequent + * signaled WR is completed. + * 2) out of order read completions. + */ + if (!SW_CQE(hw_cqe) && (CQE_WRID_SQ_IDX(hw_cqe) != wq->sq.cidx)) { + struct t4_swsqe *swsqe; + + PDBG("%s out of order completion going in sw_sq at idx %u\n", + __func__, CQE_WRID_SQ_IDX(hw_cqe)); + swsqe = &wq->sq.sw_sq[CQE_WRID_SQ_IDX(hw_cqe)]; + swsqe->cqe = *hw_cqe; + swsqe->complete = 1; + ret = -EAGAIN; + goto flush_wq; + } + +proc_cqe: + *cqe = *hw_cqe; + + /* + * Reap the associated WR(s) that are freed up with this + * completion. + */ + if (SQ_TYPE(hw_cqe)) { + int idx = CQE_WRID_SQ_IDX(hw_cqe); + BUG_ON(idx >= wq->sq.size); + + /* + * Account for any unsignaled completions completed by + * this signaled completion. In this case, cidx points + * to the first unsignaled one, and idx points to the + * signaled one. So adjust in_use based on this delta. + * if this is not completing any unsigned wrs, then the + * delta will be 0. Handle wrapping also! + */ + if (idx < wq->sq.cidx) + wq->sq.in_use -= wq->sq.size + idx - wq->sq.cidx; + else + wq->sq.in_use -= idx - wq->sq.cidx; + BUG_ON(wq->sq.in_use <= 0 && wq->sq.in_use >= wq->sq.size); + + wq->sq.cidx = (uint16_t)idx; + PDBG("%s completing sq idx %u\n", __func__, wq->sq.cidx); + *cookie = wq->sq.sw_sq[wq->sq.cidx].wr_id; + if (c4iw_wr_log) + c4iw_log_wr_stats(wq, hw_cqe); + t4_sq_consume(wq); + } else { + PDBG("%s completing rq idx %u\n", __func__, wq->rq.cidx); + *cookie = wq->rq.sw_rq[wq->rq.cidx].wr_id; + BUG_ON(t4_rq_empty(wq)); + if (c4iw_wr_log) + c4iw_log_wr_stats(wq, hw_cqe); + t4_rq_consume(wq); + goto skip_cqe; + } + +flush_wq: + /* + * Flush any completed cqes that are now in-order. + */ + flush_completed_wrs(wq, cq); + +skip_cqe: + if (SW_CQE(hw_cqe)) { + PDBG("%s cq %p cqid 0x%x skip sw cqe cidx %u\n", + __func__, cq, cq->cqid, cq->sw_cidx); + t4_swcq_consume(cq); + } else { + PDBG("%s cq %p cqid 0x%x skip hw cqe cidx %u\n", + __func__, cq, cq->cqid, cq->cidx); + t4_hwcq_consume(cq); + } + return ret; +} + +/* + * Get one cq entry from c4iw and map it to openib. + * + * Returns: + * 0 cqe returned + * -ENODATA EMPTY; + * -EAGAIN caller must try again + * any other -errno fatal error + */ +static int c4iw_poll_cq_one(struct c4iw_cq *chp, struct ib_wc *wc) +{ + struct c4iw_qp *qhp = NULL; + struct t4_cqe uninitialized_var(cqe), *rd_cqe; + struct t4_wq *wq; + u32 credit = 0; + u8 cqe_flushed; + u64 cookie = 0; + int ret; + + ret = t4_next_cqe(&chp->cq, &rd_cqe); + + if (ret) + return ret; + + qhp = get_qhp(chp->rhp, CQE_QPID(rd_cqe)); + if (!qhp) + wq = NULL; + else { + spin_lock(&qhp->lock); + wq = &(qhp->wq); + } + ret = poll_cq(wq, &(chp->cq), &cqe, &cqe_flushed, &cookie, &credit); + if (ret) + goto out; + + wc->wr_id = cookie; + wc->qp = &qhp->ibqp; + wc->vendor_err = CQE_STATUS(&cqe); + wc->wc_flags = 0; + + PDBG("%s qpid 0x%x type %d opcode %d status 0x%x len %u wrid hi 0x%x " + "lo 0x%x cookie 0x%llx\n", __func__, CQE_QPID(&cqe), + CQE_TYPE(&cqe), CQE_OPCODE(&cqe), CQE_STATUS(&cqe), CQE_LEN(&cqe), + CQE_WRID_HI(&cqe), CQE_WRID_LOW(&cqe), (unsigned long long)cookie); + + if (CQE_TYPE(&cqe) == 0) { + if (!CQE_STATUS(&cqe)) + wc->byte_len = CQE_LEN(&cqe); + else + wc->byte_len = 0; + wc->opcode = IB_WC_RECV; + if (CQE_OPCODE(&cqe) == FW_RI_SEND_WITH_INV || + CQE_OPCODE(&cqe) == FW_RI_SEND_WITH_SE_INV) { + wc->ex.invalidate_rkey = CQE_WRID_STAG(&cqe); + wc->wc_flags |= IB_WC_WITH_INVALIDATE; + } + } else { + switch (CQE_OPCODE(&cqe)) { + case FW_RI_RDMA_WRITE: + wc->opcode = IB_WC_RDMA_WRITE; + break; + case FW_RI_READ_REQ: + wc->opcode = IB_WC_RDMA_READ; + wc->byte_len = CQE_LEN(&cqe); + break; + case FW_RI_SEND_WITH_INV: + case FW_RI_SEND_WITH_SE_INV: + wc->opcode = IB_WC_SEND; + wc->wc_flags |= IB_WC_WITH_INVALIDATE; + break; + case FW_RI_SEND: + case FW_RI_SEND_WITH_SE: + wc->opcode = IB_WC_SEND; + break; + case FW_RI_BIND_MW: + wc->opcode = IB_WC_BIND_MW; + break; + + case FW_RI_LOCAL_INV: + wc->opcode = IB_WC_LOCAL_INV; + break; + case FW_RI_FAST_REGISTER: + wc->opcode = IB_WC_FAST_REG_MR; + break; + default: + printk(KERN_ERR MOD "Unexpected opcode %d " + "in the CQE received for QPID=0x%0x\n", + CQE_OPCODE(&cqe), CQE_QPID(&cqe)); + ret = -EINVAL; + goto out; + } + } + + if (cqe_flushed) + wc->status = IB_WC_WR_FLUSH_ERR; + else { + + switch (CQE_STATUS(&cqe)) { + case T4_ERR_SUCCESS: + wc->status = IB_WC_SUCCESS; + break; + case T4_ERR_STAG: + wc->status = IB_WC_LOC_ACCESS_ERR; + break; + case T4_ERR_PDID: + wc->status = IB_WC_LOC_PROT_ERR; + break; + case T4_ERR_QPID: + case T4_ERR_ACCESS: + wc->status = IB_WC_LOC_ACCESS_ERR; + break; + case T4_ERR_WRAP: + wc->status = IB_WC_GENERAL_ERR; + break; + case T4_ERR_BOUND: + wc->status = IB_WC_LOC_LEN_ERR; + break; + case T4_ERR_INVALIDATE_SHARED_MR: + case T4_ERR_INVALIDATE_MR_WITH_MW_BOUND: + wc->status = IB_WC_MW_BIND_ERR; + break; + case T4_ERR_CRC: + case T4_ERR_MARKER: + case T4_ERR_PDU_LEN_ERR: + case T4_ERR_OUT_OF_RQE: + case T4_ERR_DDP_VERSION: + case T4_ERR_RDMA_VERSION: + case T4_ERR_DDP_QUEUE_NUM: + case T4_ERR_MSN: + case T4_ERR_TBIT: + case T4_ERR_MO: + case T4_ERR_MSN_RANGE: + case T4_ERR_IRD_OVERFLOW: + case T4_ERR_OPCODE: + case T4_ERR_INTERNAL_ERR: + wc->status = IB_WC_FATAL_ERR; + break; + case T4_ERR_SWFLUSH: + wc->status = IB_WC_WR_FLUSH_ERR; + break; + default: + printk(KERN_ERR MOD + "Unexpected cqe_status 0x%x for QPID=0x%0x\n", + CQE_STATUS(&cqe), CQE_QPID(&cqe)); + ret = -EINVAL; + } + } +out: + if (wq) + spin_unlock(&qhp->lock); + return ret; +} + +int c4iw_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc) +{ + struct c4iw_cq *chp; + unsigned long flags; + int npolled; + int err = 0; + + chp = to_c4iw_cq(ibcq); + + spin_lock_irqsave(&chp->lock, flags); + for (npolled = 0; npolled < num_entries; ++npolled) { + do { + err = c4iw_poll_cq_one(chp, wc + npolled); + } while (err == -EAGAIN); + if (err) + break; + } + spin_unlock_irqrestore(&chp->lock, flags); + return !err || err == -ENODATA ? npolled : err; +} + +int c4iw_destroy_cq(struct ib_cq *ib_cq) +{ + struct c4iw_cq *chp; + struct c4iw_ucontext *ucontext; + + PDBG("%s ib_cq %p\n", __func__, ib_cq); + chp = to_c4iw_cq(ib_cq); + + remove_handle(chp->rhp, &chp->rhp->cqidr, chp->cq.cqid); + atomic_dec(&chp->refcnt); + wait_event(chp->wait, !atomic_read(&chp->refcnt)); + + ucontext = ib_cq->uobject ? to_c4iw_ucontext(ib_cq->uobject->context) + : NULL; + destroy_cq(&chp->rhp->rdev, &chp->cq, + ucontext ? &ucontext->uctx : &chp->cq.rdev->uctx); + kfree(chp); + return 0; +} + +struct ib_cq *c4iw_create_cq(struct ib_device *ibdev, int entries, + int vector, struct ib_ucontext *ib_context, + struct ib_udata *udata) +{ + struct c4iw_dev *rhp; + struct c4iw_cq *chp; + struct c4iw_create_cq_resp uresp; + struct c4iw_ucontext *ucontext = NULL; + int ret; + size_t memsize, hwentries; + struct c4iw_mm_entry *mm, *mm2; + + PDBG("%s ib_dev %p entries %d\n", __func__, ibdev, entries); + + rhp = to_c4iw_dev(ibdev); + + if (vector >= rhp->rdev.lldi.nciq) + return ERR_PTR(-EINVAL); + + chp = kzalloc(sizeof(*chp), GFP_KERNEL); + if (!chp) + return ERR_PTR(-ENOMEM); + + if (ib_context) + ucontext = to_c4iw_ucontext(ib_context); + + /* account for the status page. */ + entries++; + + /* IQ needs one extra entry to differentiate full vs empty. */ + entries++; + + /* + * entries must be multiple of 16 for HW. + */ + entries = roundup(entries, 16); + + /* + * Make actual HW queue 2x to avoid cdix_inc overflows. + */ + hwentries = min(entries * 2, rhp->rdev.hw_queue.t4_max_iq_size); + + /* + * Make HW queue at least 64 entries so GTS updates aren't too + * frequent. + */ + if (hwentries < 64) + hwentries = 64; + + memsize = hwentries * sizeof *chp->cq.queue; + + /* + * memsize must be a multiple of the page size if its a user cq. + */ + if (ucontext) + memsize = roundup(memsize, PAGE_SIZE); + chp->cq.size = hwentries; + chp->cq.memsize = memsize; + chp->cq.vector = vector; + + ret = create_cq(&rhp->rdev, &chp->cq, + ucontext ? &ucontext->uctx : &rhp->rdev.uctx); + if (ret) + goto err1; + + chp->rhp = rhp; + chp->cq.size--; /* status page */ + chp->ibcq.cqe = entries - 2; + spin_lock_init(&chp->lock); + spin_lock_init(&chp->comp_handler_lock); + atomic_set(&chp->refcnt, 1); + init_waitqueue_head(&chp->wait); + ret = insert_handle(rhp, &rhp->cqidr, chp, chp->cq.cqid); + if (ret) + goto err2; + + if (ucontext) { + mm = kmalloc(sizeof *mm, GFP_KERNEL); + if (!mm) + goto err3; + mm2 = kmalloc(sizeof *mm2, GFP_KERNEL); + if (!mm2) + goto err4; + + uresp.qid_mask = rhp->rdev.cqmask; + uresp.cqid = chp->cq.cqid; + uresp.size = chp->cq.size; + uresp.memsize = chp->cq.memsize; + spin_lock(&ucontext->mmap_lock); + uresp.key = ucontext->key; + ucontext->key += PAGE_SIZE; + uresp.gts_key = ucontext->key; + ucontext->key += PAGE_SIZE; + spin_unlock(&ucontext->mmap_lock); + ret = ib_copy_to_udata(udata, &uresp, + sizeof(uresp) - sizeof(uresp.reserved)); + if (ret) + goto err5; + + mm->key = uresp.key; + mm->addr = virt_to_phys(chp->cq.queue); + mm->len = chp->cq.memsize; + insert_mmap(ucontext, mm); + + mm2->key = uresp.gts_key; + mm2->addr = chp->cq.ugts; + mm2->len = PAGE_SIZE; + insert_mmap(ucontext, mm2); + } + PDBG("%s cqid 0x%0x chp %p size %u memsize %zu, dma_addr 0x%0llx\n", + __func__, chp->cq.cqid, chp, chp->cq.size, + chp->cq.memsize, + (unsigned long long) chp->cq.dma_addr); + return &chp->ibcq; +err5: + kfree(mm2); +err4: + kfree(mm); +err3: + remove_handle(rhp, &rhp->cqidr, chp->cq.cqid); +err2: + destroy_cq(&chp->rhp->rdev, &chp->cq, + ucontext ? &ucontext->uctx : &rhp->rdev.uctx); +err1: + kfree(chp); + return ERR_PTR(ret); +} + +int c4iw_resize_cq(struct ib_cq *cq, int cqe, struct ib_udata *udata) +{ + return -ENOSYS; +} + +int c4iw_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags) +{ + struct c4iw_cq *chp; + int ret; + unsigned long flag; + + chp = to_c4iw_cq(ibcq); + spin_lock_irqsave(&chp->lock, flag); + ret = t4_arm_cq(&chp->cq, + (flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED); + spin_unlock_irqrestore(&chp->lock, flag); + if (ret && !(flags & IB_CQ_REPORT_MISSED_EVENTS)) + ret = 0; + return ret; +} diff --git a/drivers/infiniband/hw/cxgb4/device.c b/drivers/infiniband/hw/cxgb4/device.c new file mode 100644 index 0000000..72f1f05 --- /dev/null +++ b/drivers/infiniband/hw/cxgb4/device.c @@ -0,0 +1,1546 @@ +/* + * Copyright (c) 2009-2010 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include +#include +#include +#include +#include + +#include + +#include "iw_cxgb4.h" + +#define DRV_VERSION "0.1" + +MODULE_AUTHOR("Steve Wise"); +MODULE_DESCRIPTION("Chelsio T4/T5 RDMA Driver"); +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_VERSION(DRV_VERSION); + +static int allow_db_fc_on_t5; +module_param(allow_db_fc_on_t5, int, 0644); +MODULE_PARM_DESC(allow_db_fc_on_t5, + "Allow DB Flow Control on T5 (default = 0)"); + +static int allow_db_coalescing_on_t5; +module_param(allow_db_coalescing_on_t5, int, 0644); +MODULE_PARM_DESC(allow_db_coalescing_on_t5, + "Allow DB Coalescing on T5 (default = 0)"); + +int c4iw_wr_log = 0; +module_param(c4iw_wr_log, int, 0444); +MODULE_PARM_DESC(c4iw_wr_log, "Enables logging of work request timing data."); + +static int c4iw_wr_log_size_order = 12; +module_param(c4iw_wr_log_size_order, int, 0444); +MODULE_PARM_DESC(c4iw_wr_log_size_order, + "Number of entries (log2) in the work request timing log."); + +struct uld_ctx { + struct list_head entry; + struct cxgb4_lld_info lldi; + struct c4iw_dev *dev; +}; + +static LIST_HEAD(uld_ctx_list); +static DEFINE_MUTEX(dev_mutex); + +#define DB_FC_RESUME_SIZE 64 +#define DB_FC_RESUME_DELAY 1 +#define DB_FC_DRAIN_THRESH 0 + +static struct dentry *c4iw_debugfs_root; + +struct c4iw_debugfs_data { + struct c4iw_dev *devp; + char *buf; + int bufsize; + int pos; +}; + +/* registered cxgb4 netlink callbacks */ +static struct ibnl_client_cbs c4iw_nl_cb_table[] = { + [RDMA_NL_IWPM_REG_PID] = {.dump = iwpm_register_pid_cb}, + [RDMA_NL_IWPM_ADD_MAPPING] = {.dump = iwpm_add_mapping_cb}, + [RDMA_NL_IWPM_QUERY_MAPPING] = {.dump = iwpm_add_and_query_mapping_cb}, + [RDMA_NL_IWPM_HANDLE_ERR] = {.dump = iwpm_mapping_error_cb}, + [RDMA_NL_IWPM_MAPINFO] = {.dump = iwpm_mapping_info_cb}, + [RDMA_NL_IWPM_MAPINFO_NUM] = {.dump = iwpm_ack_mapping_info_cb} +}; + +static int count_idrs(int id, void *p, void *data) +{ + int *countp = data; + + *countp = *countp + 1; + return 0; +} + +static ssize_t debugfs_read(struct file *file, char __user *buf, size_t count, + loff_t *ppos) +{ + struct c4iw_debugfs_data *d = file->private_data; + + return simple_read_from_buffer(buf, count, ppos, d->buf, d->pos); +} + +void c4iw_log_wr_stats(struct t4_wq *wq, struct t4_cqe *cqe) +{ + struct wr_log_entry le; + int idx; + + if (!wq->rdev->wr_log) + return; + + idx = (atomic_inc_return(&wq->rdev->wr_log_idx) - 1) & + (wq->rdev->wr_log_size - 1); + le.poll_sge_ts = cxgb4_read_sge_timestamp(wq->rdev->lldi.ports[0]); + getnstimeofday(&le.poll_host_ts); + le.valid = 1; + le.cqe_sge_ts = CQE_TS(cqe); + if (SQ_TYPE(cqe)) { + le.qid = wq->sq.qid; + le.opcode = CQE_OPCODE(cqe); + le.post_host_ts = wq->sq.sw_sq[wq->sq.cidx].host_ts; + le.post_sge_ts = wq->sq.sw_sq[wq->sq.cidx].sge_ts; + le.wr_id = CQE_WRID_SQ_IDX(cqe); + } else { + le.qid = wq->rq.qid; + le.opcode = FW_RI_RECEIVE; + le.post_host_ts = wq->rq.sw_rq[wq->rq.cidx].host_ts; + le.post_sge_ts = wq->rq.sw_rq[wq->rq.cidx].sge_ts; + le.wr_id = CQE_WRID_MSN(cqe); + } + wq->rdev->wr_log[idx] = le; +} + +static int wr_log_show(struct seq_file *seq, void *v) +{ + struct c4iw_dev *dev = seq->private; + struct timespec prev_ts = {0, 0}; + struct wr_log_entry *lep; + int prev_ts_set = 0; + int idx, end; + +#define ts2ns(ts) div64_ul((ts) * dev->rdev.lldi.cclk_ps, 1000) + + idx = atomic_read(&dev->rdev.wr_log_idx) & + (dev->rdev.wr_log_size - 1); + end = idx - 1; + if (end < 0) + end = dev->rdev.wr_log_size - 1; + lep = &dev->rdev.wr_log[idx]; + while (idx != end) { + if (lep->valid) { + if (!prev_ts_set) { + prev_ts_set = 1; + prev_ts = lep->poll_host_ts; + } + seq_printf(seq, "%04u: sec %lu nsec %lu qid %u opcode " + "%u %s 0x%x host_wr_delta sec %lu nsec %lu " + "post_sge_ts 0x%llx cqe_sge_ts 0x%llx " + "poll_sge_ts 0x%llx post_poll_delta_ns %llu " + "cqe_poll_delta_ns %llu\n", + idx, + timespec_sub(lep->poll_host_ts, + prev_ts).tv_sec, + timespec_sub(lep->poll_host_ts, + prev_ts).tv_nsec, + lep->qid, lep->opcode, + lep->opcode == FW_RI_RECEIVE ? + "msn" : "wrid", + lep->wr_id, + timespec_sub(lep->poll_host_ts, + lep->post_host_ts).tv_sec, + timespec_sub(lep->poll_host_ts, + lep->post_host_ts).tv_nsec, + lep->post_sge_ts, lep->cqe_sge_ts, + lep->poll_sge_ts, + ts2ns(lep->poll_sge_ts - lep->post_sge_ts), + ts2ns(lep->poll_sge_ts - lep->cqe_sge_ts)); + prev_ts = lep->poll_host_ts; + } + idx++; + if (idx > (dev->rdev.wr_log_size - 1)) + idx = 0; + lep = &dev->rdev.wr_log[idx]; + } +#undef ts2ns + return 0; +} + +static int wr_log_open(struct inode *inode, struct file *file) +{ + return single_open(file, wr_log_show, inode->i_private); +} + +static ssize_t wr_log_clear(struct file *file, const char __user *buf, + size_t count, loff_t *pos) +{ + struct c4iw_dev *dev = ((struct seq_file *)file->private_data)->private; + int i; + + if (dev->rdev.wr_log) + for (i = 0; i < dev->rdev.wr_log_size; i++) + dev->rdev.wr_log[i].valid = 0; + return count; +} + +static const struct file_operations wr_log_debugfs_fops = { + .owner = THIS_MODULE, + .open = wr_log_open, + .release = single_release, + .read = seq_read, + .llseek = seq_lseek, + .write = wr_log_clear, +}; + +static int dump_qp(int id, void *p, void *data) +{ + struct c4iw_qp *qp = p; + struct c4iw_debugfs_data *qpd = data; + int space; + int cc; + + if (id != qp->wq.sq.qid) + return 0; + + space = qpd->bufsize - qpd->pos - 1; + if (space == 0) + return 1; + + if (qp->ep) { + if (qp->ep->com.local_addr.ss_family == AF_INET) { + struct sockaddr_in *lsin = (struct sockaddr_in *) + &qp->ep->com.local_addr; + struct sockaddr_in *rsin = (struct sockaddr_in *) + &qp->ep->com.remote_addr; + struct sockaddr_in *mapped_lsin = (struct sockaddr_in *) + &qp->ep->com.mapped_local_addr; + struct sockaddr_in *mapped_rsin = (struct sockaddr_in *) + &qp->ep->com.mapped_remote_addr; + + cc = snprintf(qpd->buf + qpd->pos, space, + "rc qp sq id %u rq id %u state %u " + "onchip %u ep tid %u state %u " + "%pI4:%u/%u->%pI4:%u/%u\n", + qp->wq.sq.qid, qp->wq.rq.qid, + (int)qp->attr.state, + qp->wq.sq.flags & T4_SQ_ONCHIP, + qp->ep->hwtid, (int)qp->ep->com.state, + &lsin->sin_addr, ntohs(lsin->sin_port), + ntohs(mapped_lsin->sin_port), + &rsin->sin_addr, ntohs(rsin->sin_port), + ntohs(mapped_rsin->sin_port)); + } else { + struct sockaddr_in6 *lsin6 = (struct sockaddr_in6 *) + &qp->ep->com.local_addr; + struct sockaddr_in6 *rsin6 = (struct sockaddr_in6 *) + &qp->ep->com.remote_addr; + struct sockaddr_in6 *mapped_lsin6 = + (struct sockaddr_in6 *) + &qp->ep->com.mapped_local_addr; + struct sockaddr_in6 *mapped_rsin6 = + (struct sockaddr_in6 *) + &qp->ep->com.mapped_remote_addr; + + cc = snprintf(qpd->buf + qpd->pos, space, + "rc qp sq id %u rq id %u state %u " + "onchip %u ep tid %u state %u " + "%pI6:%u/%u->%pI6:%u/%u\n", + qp->wq.sq.qid, qp->wq.rq.qid, + (int)qp->attr.state, + qp->wq.sq.flags & T4_SQ_ONCHIP, + qp->ep->hwtid, (int)qp->ep->com.state, + &lsin6->sin6_addr, + ntohs(lsin6->sin6_port), + ntohs(mapped_lsin6->sin6_port), + &rsin6->sin6_addr, + ntohs(rsin6->sin6_port), + ntohs(mapped_rsin6->sin6_port)); + } + } else + cc = snprintf(qpd->buf + qpd->pos, space, + "qp sq id %u rq id %u state %u onchip %u\n", + qp->wq.sq.qid, qp->wq.rq.qid, + (int)qp->attr.state, + qp->wq.sq.flags & T4_SQ_ONCHIP); + if (cc < space) + qpd->pos += cc; + return 0; +} + +static int qp_release(struct inode *inode, struct file *file) +{ + struct c4iw_debugfs_data *qpd = file->private_data; + if (!qpd) { + printk(KERN_INFO "%s null qpd?\n", __func__); + return 0; + } + vfree(qpd->buf); + kfree(qpd); + return 0; +} + +static int qp_open(struct inode *inode, struct file *file) +{ + struct c4iw_debugfs_data *qpd; + int ret = 0; + int count = 1; + + qpd = kmalloc(sizeof *qpd, GFP_KERNEL); + if (!qpd) { + ret = -ENOMEM; + goto out; + } + qpd->devp = inode->i_private; + qpd->pos = 0; + + spin_lock_irq(&qpd->devp->lock); + idr_for_each(&qpd->devp->qpidr, count_idrs, &count); + spin_unlock_irq(&qpd->devp->lock); + + qpd->bufsize = count * 128; + qpd->buf = vmalloc(qpd->bufsize); + if (!qpd->buf) { + ret = -ENOMEM; + goto err1; + } + + spin_lock_irq(&qpd->devp->lock); + idr_for_each(&qpd->devp->qpidr, dump_qp, qpd); + spin_unlock_irq(&qpd->devp->lock); + + qpd->buf[qpd->pos++] = 0; + file->private_data = qpd; + goto out; +err1: + kfree(qpd); +out: + return ret; +} + +static const struct file_operations qp_debugfs_fops = { + .owner = THIS_MODULE, + .open = qp_open, + .release = qp_release, + .read = debugfs_read, + .llseek = default_llseek, +}; + +static int dump_stag(int id, void *p, void *data) +{ + struct c4iw_debugfs_data *stagd = data; + int space; + int cc; + struct fw_ri_tpte tpte; + int ret; + + space = stagd->bufsize - stagd->pos - 1; + if (space == 0) + return 1; + + ret = cxgb4_read_tpte(stagd->devp->rdev.lldi.ports[0], (u32)id<<8, + (__be32 *)&tpte); + if (ret) { + dev_err(&stagd->devp->rdev.lldi.pdev->dev, + "%s cxgb4_read_tpte err %d\n", __func__, ret); + return ret; + } + cc = snprintf(stagd->buf + stagd->pos, space, + "stag: idx 0x%x valid %d key 0x%x state %d pdid %d " + "perm 0x%x ps %d len 0x%llx va 0x%llx\n", + (u32)id<<8, + G_FW_RI_TPTE_VALID(ntohl(tpte.valid_to_pdid)), + G_FW_RI_TPTE_STAGKEY(ntohl(tpte.valid_to_pdid)), + G_FW_RI_TPTE_STAGSTATE(ntohl(tpte.valid_to_pdid)), + G_FW_RI_TPTE_PDID(ntohl(tpte.valid_to_pdid)), + G_FW_RI_TPTE_PERM(ntohl(tpte.locread_to_qpid)), + G_FW_RI_TPTE_PS(ntohl(tpte.locread_to_qpid)), + ((u64)ntohl(tpte.len_hi) << 32) | ntohl(tpte.len_lo), + ((u64)ntohl(tpte.va_hi) << 32) | ntohl(tpte.va_lo_fbo)); + if (cc < space) + stagd->pos += cc; + return 0; +} + +static int stag_release(struct inode *inode, struct file *file) +{ + struct c4iw_debugfs_data *stagd = file->private_data; + if (!stagd) { + printk(KERN_INFO "%s null stagd?\n", __func__); + return 0; + } + vfree(stagd->buf); + kfree(stagd); + return 0; +} + +static int stag_open(struct inode *inode, struct file *file) +{ + struct c4iw_debugfs_data *stagd; + int ret = 0; + int count = 1; + + stagd = kmalloc(sizeof *stagd, GFP_KERNEL); + if (!stagd) { + ret = -ENOMEM; + goto out; + } + stagd->devp = inode->i_private; + stagd->pos = 0; + + spin_lock_irq(&stagd->devp->lock); + idr_for_each(&stagd->devp->mmidr, count_idrs, &count); + spin_unlock_irq(&stagd->devp->lock); + + stagd->bufsize = count * 256; + stagd->buf = vmalloc(stagd->bufsize); + if (!stagd->buf) { + ret = -ENOMEM; + goto err1; + } + + spin_lock_irq(&stagd->devp->lock); + idr_for_each(&stagd->devp->mmidr, dump_stag, stagd); + spin_unlock_irq(&stagd->devp->lock); + + stagd->buf[stagd->pos++] = 0; + file->private_data = stagd; + goto out; +err1: + kfree(stagd); +out: + return ret; +} + +static const struct file_operations stag_debugfs_fops = { + .owner = THIS_MODULE, + .open = stag_open, + .release = stag_release, + .read = debugfs_read, + .llseek = default_llseek, +}; + +static char *db_state_str[] = {"NORMAL", "FLOW_CONTROL", "RECOVERY", "STOPPED"}; + +static int stats_show(struct seq_file *seq, void *v) +{ + struct c4iw_dev *dev = seq->private; + + seq_printf(seq, " Object: %10s %10s %10s %10s\n", "Total", "Current", + "Max", "Fail"); + seq_printf(seq, " PDID: %10llu %10llu %10llu %10llu\n", + dev->rdev.stats.pd.total, dev->rdev.stats.pd.cur, + dev->rdev.stats.pd.max, dev->rdev.stats.pd.fail); + seq_printf(seq, " QID: %10llu %10llu %10llu %10llu\n", + dev->rdev.stats.qid.total, dev->rdev.stats.qid.cur, + dev->rdev.stats.qid.max, dev->rdev.stats.qid.fail); + seq_printf(seq, " TPTMEM: %10llu %10llu %10llu %10llu\n", + dev->rdev.stats.stag.total, dev->rdev.stats.stag.cur, + dev->rdev.stats.stag.max, dev->rdev.stats.stag.fail); + seq_printf(seq, " PBLMEM: %10llu %10llu %10llu %10llu\n", + dev->rdev.stats.pbl.total, dev->rdev.stats.pbl.cur, + dev->rdev.stats.pbl.max, dev->rdev.stats.pbl.fail); + seq_printf(seq, " RQTMEM: %10llu %10llu %10llu %10llu\n", + dev->rdev.stats.rqt.total, dev->rdev.stats.rqt.cur, + dev->rdev.stats.rqt.max, dev->rdev.stats.rqt.fail); + seq_printf(seq, " OCQPMEM: %10llu %10llu %10llu %10llu\n", + dev->rdev.stats.ocqp.total, dev->rdev.stats.ocqp.cur, + dev->rdev.stats.ocqp.max, dev->rdev.stats.ocqp.fail); + seq_printf(seq, " DB FULL: %10llu\n", dev->rdev.stats.db_full); + seq_printf(seq, " DB EMPTY: %10llu\n", dev->rdev.stats.db_empty); + seq_printf(seq, " DB DROP: %10llu\n", dev->rdev.stats.db_drop); + seq_printf(seq, " DB State: %s Transitions %llu FC Interruptions %llu\n", + db_state_str[dev->db_state], + dev->rdev.stats.db_state_transitions, + dev->rdev.stats.db_fc_interruptions); + seq_printf(seq, "TCAM_FULL: %10llu\n", dev->rdev.stats.tcam_full); + seq_printf(seq, "ACT_OFLD_CONN_FAILS: %10llu\n", + dev->rdev.stats.act_ofld_conn_fails); + seq_printf(seq, "PAS_OFLD_CONN_FAILS: %10llu\n", + dev->rdev.stats.pas_ofld_conn_fails); + seq_printf(seq, "AVAILABLE IRD: %10u\n", dev->avail_ird); + return 0; +} + +static int stats_open(struct inode *inode, struct file *file) +{ + return single_open(file, stats_show, inode->i_private); +} + +static ssize_t stats_clear(struct file *file, const char __user *buf, + size_t count, loff_t *pos) +{ + struct c4iw_dev *dev = ((struct seq_file *)file->private_data)->private; + + mutex_lock(&dev->rdev.stats.lock); + dev->rdev.stats.pd.max = 0; + dev->rdev.stats.pd.fail = 0; + dev->rdev.stats.qid.max = 0; + dev->rdev.stats.qid.fail = 0; + dev->rdev.stats.stag.max = 0; + dev->rdev.stats.stag.fail = 0; + dev->rdev.stats.pbl.max = 0; + dev->rdev.stats.pbl.fail = 0; + dev->rdev.stats.rqt.max = 0; + dev->rdev.stats.rqt.fail = 0; + dev->rdev.stats.ocqp.max = 0; + dev->rdev.stats.ocqp.fail = 0; + dev->rdev.stats.db_full = 0; + dev->rdev.stats.db_empty = 0; + dev->rdev.stats.db_drop = 0; + dev->rdev.stats.db_state_transitions = 0; + dev->rdev.stats.tcam_full = 0; + dev->rdev.stats.act_ofld_conn_fails = 0; + dev->rdev.stats.pas_ofld_conn_fails = 0; + mutex_unlock(&dev->rdev.stats.lock); + return count; +} + +static const struct file_operations stats_debugfs_fops = { + .owner = THIS_MODULE, + .open = stats_open, + .release = single_release, + .read = seq_read, + .llseek = seq_lseek, + .write = stats_clear, +}; + +static int dump_ep(int id, void *p, void *data) +{ + struct c4iw_ep *ep = p; + struct c4iw_debugfs_data *epd = data; + int space; + int cc; + + space = epd->bufsize - epd->pos - 1; + if (space == 0) + return 1; + + if (ep->com.local_addr.ss_family == AF_INET) { + struct sockaddr_in *lsin = (struct sockaddr_in *) + &ep->com.local_addr; + struct sockaddr_in *rsin = (struct sockaddr_in *) + &ep->com.remote_addr; + struct sockaddr_in *mapped_lsin = (struct sockaddr_in *) + &ep->com.mapped_local_addr; + struct sockaddr_in *mapped_rsin = (struct sockaddr_in *) + &ep->com.mapped_remote_addr; + + cc = snprintf(epd->buf + epd->pos, space, + "ep %p cm_id %p qp %p state %d flags 0x%lx " + "history 0x%lx hwtid %d atid %d " + "%pI4:%d/%d <-> %pI4:%d/%d\n", + ep, ep->com.cm_id, ep->com.qp, + (int)ep->com.state, ep->com.flags, + ep->com.history, ep->hwtid, ep->atid, + &lsin->sin_addr, ntohs(lsin->sin_port), + ntohs(mapped_lsin->sin_port), + &rsin->sin_addr, ntohs(rsin->sin_port), + ntohs(mapped_rsin->sin_port)); + } else { + struct sockaddr_in6 *lsin6 = (struct sockaddr_in6 *) + &ep->com.local_addr; + struct sockaddr_in6 *rsin6 = (struct sockaddr_in6 *) + &ep->com.remote_addr; + struct sockaddr_in6 *mapped_lsin6 = (struct sockaddr_in6 *) + &ep->com.mapped_local_addr; + struct sockaddr_in6 *mapped_rsin6 = (struct sockaddr_in6 *) + &ep->com.mapped_remote_addr; + + cc = snprintf(epd->buf + epd->pos, space, + "ep %p cm_id %p qp %p state %d flags 0x%lx " + "history 0x%lx hwtid %d atid %d " + "%pI6:%d/%d <-> %pI6:%d/%d\n", + ep, ep->com.cm_id, ep->com.qp, + (int)ep->com.state, ep->com.flags, + ep->com.history, ep->hwtid, ep->atid, + &lsin6->sin6_addr, ntohs(lsin6->sin6_port), + ntohs(mapped_lsin6->sin6_port), + &rsin6->sin6_addr, ntohs(rsin6->sin6_port), + ntohs(mapped_rsin6->sin6_port)); + } + if (cc < space) + epd->pos += cc; + return 0; +} + +static int dump_listen_ep(int id, void *p, void *data) +{ + struct c4iw_listen_ep *ep = p; + struct c4iw_debugfs_data *epd = data; + int space; + int cc; + + space = epd->bufsize - epd->pos - 1; + if (space == 0) + return 1; + + if (ep->com.local_addr.ss_family == AF_INET) { + struct sockaddr_in *lsin = (struct sockaddr_in *) + &ep->com.local_addr; + struct sockaddr_in *mapped_lsin = (struct sockaddr_in *) + &ep->com.mapped_local_addr; + + cc = snprintf(epd->buf + epd->pos, space, + "ep %p cm_id %p state %d flags 0x%lx stid %d " + "backlog %d %pI4:%d/%d\n", + ep, ep->com.cm_id, (int)ep->com.state, + ep->com.flags, ep->stid, ep->backlog, + &lsin->sin_addr, ntohs(lsin->sin_port), + ntohs(mapped_lsin->sin_port)); + } else { + struct sockaddr_in6 *lsin6 = (struct sockaddr_in6 *) + &ep->com.local_addr; + struct sockaddr_in6 *mapped_lsin6 = (struct sockaddr_in6 *) + &ep->com.mapped_local_addr; + + cc = snprintf(epd->buf + epd->pos, space, + "ep %p cm_id %p state %d flags 0x%lx stid %d " + "backlog %d %pI6:%d/%d\n", + ep, ep->com.cm_id, (int)ep->com.state, + ep->com.flags, ep->stid, ep->backlog, + &lsin6->sin6_addr, ntohs(lsin6->sin6_port), + ntohs(mapped_lsin6->sin6_port)); + } + if (cc < space) + epd->pos += cc; + return 0; +} + +static int ep_release(struct inode *inode, struct file *file) +{ + struct c4iw_debugfs_data *epd = file->private_data; + if (!epd) { + pr_info("%s null qpd?\n", __func__); + return 0; + } + vfree(epd->buf); + kfree(epd); + return 0; +} + +static int ep_open(struct inode *inode, struct file *file) +{ + struct c4iw_debugfs_data *epd; + int ret = 0; + int count = 1; + + epd = kmalloc(sizeof(*epd), GFP_KERNEL); + if (!epd) { + ret = -ENOMEM; + goto out; + } + epd->devp = inode->i_private; + epd->pos = 0; + + spin_lock_irq(&epd->devp->lock); + idr_for_each(&epd->devp->hwtid_idr, count_idrs, &count); + idr_for_each(&epd->devp->atid_idr, count_idrs, &count); + idr_for_each(&epd->devp->stid_idr, count_idrs, &count); + spin_unlock_irq(&epd->devp->lock); + + epd->bufsize = count * 160; + epd->buf = vmalloc(epd->bufsize); + if (!epd->buf) { + ret = -ENOMEM; + goto err1; + } + + spin_lock_irq(&epd->devp->lock); + idr_for_each(&epd->devp->hwtid_idr, dump_ep, epd); + idr_for_each(&epd->devp->atid_idr, dump_ep, epd); + idr_for_each(&epd->devp->stid_idr, dump_listen_ep, epd); + spin_unlock_irq(&epd->devp->lock); + + file->private_data = epd; + goto out; +err1: + kfree(epd); +out: + return ret; +} + +static const struct file_operations ep_debugfs_fops = { + .owner = THIS_MODULE, + .open = ep_open, + .release = ep_release, + .read = debugfs_read, +}; + +static int setup_debugfs(struct c4iw_dev *devp) +{ + struct dentry *de; + + if (!devp->debugfs_root) + return -1; + + de = debugfs_create_file("qps", S_IWUSR, devp->debugfs_root, + (void *)devp, &qp_debugfs_fops); + if (de && de->d_inode) + de->d_inode->i_size = 4096; + + de = debugfs_create_file("stags", S_IWUSR, devp->debugfs_root, + (void *)devp, &stag_debugfs_fops); + if (de && de->d_inode) + de->d_inode->i_size = 4096; + + de = debugfs_create_file("stats", S_IWUSR, devp->debugfs_root, + (void *)devp, &stats_debugfs_fops); + if (de && de->d_inode) + de->d_inode->i_size = 4096; + + de = debugfs_create_file("eps", S_IWUSR, devp->debugfs_root, + (void *)devp, &ep_debugfs_fops); + if (de && de->d_inode) + de->d_inode->i_size = 4096; + + if (c4iw_wr_log) { + de = debugfs_create_file("wr_log", S_IWUSR, devp->debugfs_root, + (void *)devp, &wr_log_debugfs_fops); + if (de && de->d_inode) + de->d_inode->i_size = 4096; + } + return 0; +} + +void c4iw_release_dev_ucontext(struct c4iw_rdev *rdev, + struct c4iw_dev_ucontext *uctx) +{ + struct list_head *pos, *nxt; + struct c4iw_qid_list *entry; + + mutex_lock(&uctx->lock); + list_for_each_safe(pos, nxt, &uctx->qpids) { + entry = list_entry(pos, struct c4iw_qid_list, entry); + list_del_init(&entry->entry); + if (!(entry->qid & rdev->qpmask)) { + c4iw_put_resource(&rdev->resource.qid_table, + entry->qid); + mutex_lock(&rdev->stats.lock); + rdev->stats.qid.cur -= rdev->qpmask + 1; + mutex_unlock(&rdev->stats.lock); + } + kfree(entry); + } + + list_for_each_safe(pos, nxt, &uctx->qpids) { + entry = list_entry(pos, struct c4iw_qid_list, entry); + list_del_init(&entry->entry); + kfree(entry); + } + mutex_unlock(&uctx->lock); +} + +void c4iw_init_dev_ucontext(struct c4iw_rdev *rdev, + struct c4iw_dev_ucontext *uctx) +{ + INIT_LIST_HEAD(&uctx->qpids); + INIT_LIST_HEAD(&uctx->cqids); + mutex_init(&uctx->lock); +} + +/* Caller takes care of locking if needed */ +static int c4iw_rdev_open(struct c4iw_rdev *rdev) +{ + int err; + + c4iw_init_dev_ucontext(rdev, &rdev->uctx); + + /* + * qpshift is the number of bits to shift the qpid left in order + * to get the correct address of the doorbell for that qp. + */ + rdev->qpshift = PAGE_SHIFT - ilog2(rdev->lldi.udb_density); + rdev->qpmask = rdev->lldi.udb_density - 1; + rdev->cqshift = PAGE_SHIFT - ilog2(rdev->lldi.ucq_density); + rdev->cqmask = rdev->lldi.ucq_density - 1; + PDBG("%s dev %s stag start 0x%0x size 0x%0x num stags %d " + "pbl start 0x%0x size 0x%0x rq start 0x%0x size 0x%0x " + "qp qid start %u size %u cq qid start %u size %u\n", + __func__, pci_name(rdev->lldi.pdev), rdev->lldi.vr->stag.start, + rdev->lldi.vr->stag.size, c4iw_num_stags(rdev), + rdev->lldi.vr->pbl.start, + rdev->lldi.vr->pbl.size, rdev->lldi.vr->rq.start, + rdev->lldi.vr->rq.size, + rdev->lldi.vr->qp.start, + rdev->lldi.vr->qp.size, + rdev->lldi.vr->cq.start, + rdev->lldi.vr->cq.size); + PDBG("udb len 0x%x udb base %llx db_reg %p gts_reg %p qpshift %lu " + "qpmask 0x%x cqshift %lu cqmask 0x%x\n", + (unsigned)pci_resource_len(rdev->lldi.pdev, 2), + (u64)pci_resource_start(rdev->lldi.pdev, 2), + rdev->lldi.db_reg, + rdev->lldi.gts_reg, + rdev->qpshift, rdev->qpmask, + rdev->cqshift, rdev->cqmask); + + if (c4iw_num_stags(rdev) == 0) { + err = -EINVAL; + goto err1; + } + + rdev->stats.pd.total = T4_MAX_NUM_PD; + rdev->stats.stag.total = rdev->lldi.vr->stag.size; + rdev->stats.pbl.total = rdev->lldi.vr->pbl.size; + rdev->stats.rqt.total = rdev->lldi.vr->rq.size; + rdev->stats.ocqp.total = rdev->lldi.vr->ocq.size; + rdev->stats.qid.total = rdev->lldi.vr->qp.size; + + err = c4iw_init_resource(rdev, c4iw_num_stags(rdev), T4_MAX_NUM_PD); + if (err) { + printk(KERN_ERR MOD "error %d initializing resources\n", err); + goto err1; + } + err = c4iw_pblpool_create(rdev); + if (err) { + printk(KERN_ERR MOD "error %d initializing pbl pool\n", err); + goto err2; + } + err = c4iw_rqtpool_create(rdev); + if (err) { + printk(KERN_ERR MOD "error %d initializing rqt pool\n", err); + goto err3; + } + err = c4iw_ocqp_pool_create(rdev); + if (err) { + printk(KERN_ERR MOD "error %d initializing ocqp pool\n", err); + goto err4; + } + rdev->status_page = (struct t4_dev_status_page *) + __get_free_page(GFP_KERNEL); + if (!rdev->status_page) { + pr_err(MOD "error allocating status page\n"); + goto err4; + } + + if (c4iw_wr_log) { + rdev->wr_log = kzalloc((1 << c4iw_wr_log_size_order) * + sizeof(*rdev->wr_log), GFP_KERNEL); + if (rdev->wr_log) { + rdev->wr_log_size = 1 << c4iw_wr_log_size_order; + atomic_set(&rdev->wr_log_idx, 0); + } else { + pr_err(MOD "error allocating wr_log. Logging disabled\n"); + } + } + + rdev->status_page->db_off = 0; + + return 0; +err4: + c4iw_rqtpool_destroy(rdev); +err3: + c4iw_pblpool_destroy(rdev); +err2: + c4iw_destroy_resource(&rdev->resource); +err1: + return err; +} + +static void c4iw_rdev_close(struct c4iw_rdev *rdev) +{ + kfree(rdev->wr_log); + free_page((unsigned long)rdev->status_page); + c4iw_pblpool_destroy(rdev); + c4iw_rqtpool_destroy(rdev); + c4iw_destroy_resource(&rdev->resource); +} + +static void c4iw_dealloc(struct uld_ctx *ctx) +{ + c4iw_rdev_close(&ctx->dev->rdev); + idr_destroy(&ctx->dev->cqidr); + idr_destroy(&ctx->dev->qpidr); + idr_destroy(&ctx->dev->mmidr); + idr_destroy(&ctx->dev->hwtid_idr); + idr_destroy(&ctx->dev->stid_idr); + idr_destroy(&ctx->dev->atid_idr); + if (ctx->dev->rdev.bar2_kva) + iounmap(ctx->dev->rdev.bar2_kva); + if (ctx->dev->rdev.oc_mw_kva) + iounmap(ctx->dev->rdev.oc_mw_kva); + ib_dealloc_device(&ctx->dev->ibdev); + ctx->dev = NULL; +} + +static void c4iw_remove(struct uld_ctx *ctx) +{ + PDBG("%s c4iw_dev %p\n", __func__, ctx->dev); + c4iw_unregister_device(ctx->dev); + c4iw_dealloc(ctx); +} + +static int rdma_supported(const struct cxgb4_lld_info *infop) +{ + return infop->vr->stag.size > 0 && infop->vr->pbl.size > 0 && + infop->vr->rq.size > 0 && infop->vr->qp.size > 0 && + infop->vr->cq.size > 0; +} + +static struct c4iw_dev *c4iw_alloc(const struct cxgb4_lld_info *infop) +{ + struct c4iw_dev *devp; + int ret; + + if (!rdma_supported(infop)) { + printk(KERN_INFO MOD "%s: RDMA not supported on this device.\n", + pci_name(infop->pdev)); + return ERR_PTR(-ENOSYS); + } + if (!ocqp_supported(infop)) + pr_info("%s: On-Chip Queues not supported on this device.\n", + pci_name(infop->pdev)); + + devp = (struct c4iw_dev *)ib_alloc_device(sizeof(*devp)); + if (!devp) { + printk(KERN_ERR MOD "Cannot allocate ib device\n"); + return ERR_PTR(-ENOMEM); + } + devp->rdev.lldi = *infop; + + /* init various hw-queue params based on lld info */ + PDBG("%s: Ing. padding boundary is %d, egrsstatuspagesize = %d\n", + __func__, devp->rdev.lldi.sge_ingpadboundary, + devp->rdev.lldi.sge_egrstatuspagesize); + + devp->rdev.hw_queue.t4_eq_status_entries = + devp->rdev.lldi.sge_ingpadboundary > 64 ? 2 : 1; + devp->rdev.hw_queue.t4_max_eq_size = 65520; + devp->rdev.hw_queue.t4_max_iq_size = 65520; + devp->rdev.hw_queue.t4_max_rq_size = 8192 - + devp->rdev.hw_queue.t4_eq_status_entries - 1; + devp->rdev.hw_queue.t4_max_sq_size = + devp->rdev.hw_queue.t4_max_eq_size - + devp->rdev.hw_queue.t4_eq_status_entries - 1; + devp->rdev.hw_queue.t4_max_qp_depth = + devp->rdev.hw_queue.t4_max_rq_size; + devp->rdev.hw_queue.t4_max_cq_depth = + devp->rdev.hw_queue.t4_max_iq_size - 2; + devp->rdev.hw_queue.t4_stat_len = + devp->rdev.lldi.sge_egrstatuspagesize; + + /* + * For T5 devices, we map all of BAR2 with WC. + * For T4 devices with onchip qp mem, we map only that part + * of BAR2 with WC. + */ + devp->rdev.bar2_pa = pci_resource_start(devp->rdev.lldi.pdev, 2); + if (is_t5(devp->rdev.lldi.adapter_type)) { + devp->rdev.bar2_kva = ioremap_wc(devp->rdev.bar2_pa, + pci_resource_len(devp->rdev.lldi.pdev, 2)); + if (!devp->rdev.bar2_kva) { + pr_err(MOD "Unable to ioremap BAR2\n"); + ib_dealloc_device(&devp->ibdev); + return ERR_PTR(-EINVAL); + } + } else if (ocqp_supported(infop)) { + devp->rdev.oc_mw_pa = + pci_resource_start(devp->rdev.lldi.pdev, 2) + + pci_resource_len(devp->rdev.lldi.pdev, 2) - + roundup_pow_of_two(devp->rdev.lldi.vr->ocq.size); + devp->rdev.oc_mw_kva = ioremap_wc(devp->rdev.oc_mw_pa, + devp->rdev.lldi.vr->ocq.size); + if (!devp->rdev.oc_mw_kva) { + pr_err(MOD "Unable to ioremap onchip mem\n"); + ib_dealloc_device(&devp->ibdev); + return ERR_PTR(-EINVAL); + } + } + + PDBG(KERN_INFO MOD "ocq memory: " + "hw_start 0x%x size %u mw_pa 0x%lx mw_kva %p\n", + devp->rdev.lldi.vr->ocq.start, devp->rdev.lldi.vr->ocq.size, + devp->rdev.oc_mw_pa, devp->rdev.oc_mw_kva); + + ret = c4iw_rdev_open(&devp->rdev); + if (ret) { + printk(KERN_ERR MOD "Unable to open CXIO rdev err %d\n", ret); + ib_dealloc_device(&devp->ibdev); + return ERR_PTR(ret); + } + + idr_init(&devp->cqidr); + idr_init(&devp->qpidr); + idr_init(&devp->mmidr); + idr_init(&devp->hwtid_idr); + idr_init(&devp->stid_idr); + idr_init(&devp->atid_idr); + spin_lock_init(&devp->lock); + mutex_init(&devp->rdev.stats.lock); + mutex_init(&devp->db_mutex); + INIT_LIST_HEAD(&devp->db_fc_list); + devp->avail_ird = devp->rdev.lldi.max_ird_adapter; + + if (c4iw_debugfs_root) { + devp->debugfs_root = debugfs_create_dir( + pci_name(devp->rdev.lldi.pdev), + c4iw_debugfs_root); + setup_debugfs(devp); + } + + + return devp; +} + +static void *c4iw_uld_add(const struct cxgb4_lld_info *infop) +{ + struct uld_ctx *ctx; + static int vers_printed; + int i; + + if (!vers_printed++) + pr_info("Chelsio T4/T5 RDMA Driver - version %s\n", + DRV_VERSION); + + ctx = kzalloc(sizeof *ctx, GFP_KERNEL); + if (!ctx) { + ctx = ERR_PTR(-ENOMEM); + goto out; + } + ctx->lldi = *infop; + + PDBG("%s found device %s nchan %u nrxq %u ntxq %u nports %u\n", + __func__, pci_name(ctx->lldi.pdev), + ctx->lldi.nchan, ctx->lldi.nrxq, + ctx->lldi.ntxq, ctx->lldi.nports); + + mutex_lock(&dev_mutex); + list_add_tail(&ctx->entry, &uld_ctx_list); + mutex_unlock(&dev_mutex); + + for (i = 0; i < ctx->lldi.nrxq; i++) + PDBG("rxqid[%u] %u\n", i, ctx->lldi.rxq_ids[i]); +out: + return ctx; +} + +static inline struct sk_buff *copy_gl_to_skb_pkt(const struct pkt_gl *gl, + const __be64 *rsp, + u32 pktshift) +{ + struct sk_buff *skb; + + /* + * Allocate space for cpl_pass_accept_req which will be synthesized by + * driver. Once the driver synthesizes the request the skb will go + * through the regular cpl_pass_accept_req processing. + * The math here assumes sizeof cpl_pass_accept_req >= sizeof + * cpl_rx_pkt. + */ + skb = alloc_skb(gl->tot_len + sizeof(struct cpl_pass_accept_req) + + sizeof(struct rss_header) - pktshift, GFP_ATOMIC); + if (unlikely(!skb)) + return NULL; + + __skb_put(skb, gl->tot_len + sizeof(struct cpl_pass_accept_req) + + sizeof(struct rss_header) - pktshift); + + /* + * This skb will contain: + * rss_header from the rspq descriptor (1 flit) + * cpl_rx_pkt struct from the rspq descriptor (2 flits) + * space for the difference between the size of an + * rx_pkt and pass_accept_req cpl (1 flit) + * the packet data from the gl + */ + skb_copy_to_linear_data(skb, rsp, sizeof(struct cpl_pass_accept_req) + + sizeof(struct rss_header)); + skb_copy_to_linear_data_offset(skb, sizeof(struct rss_header) + + sizeof(struct cpl_pass_accept_req), + gl->va + pktshift, + gl->tot_len - pktshift); + return skb; +} + +static inline int recv_rx_pkt(struct c4iw_dev *dev, const struct pkt_gl *gl, + const __be64 *rsp) +{ + unsigned int opcode = *(u8 *)rsp; + struct sk_buff *skb; + + if (opcode != CPL_RX_PKT) + goto out; + + skb = copy_gl_to_skb_pkt(gl , rsp, dev->rdev.lldi.sge_pktshift); + if (skb == NULL) + goto out; + + if (c4iw_handlers[opcode] == NULL) { + pr_info("%s no handler opcode 0x%x...\n", __func__, + opcode); + kfree_skb(skb); + goto out; + } + c4iw_handlers[opcode](dev, skb); + return 1; +out: + return 0; +} + +static int c4iw_uld_rx_handler(void *handle, const __be64 *rsp, + const struct pkt_gl *gl) +{ + struct uld_ctx *ctx = handle; + struct c4iw_dev *dev = ctx->dev; + struct sk_buff *skb; + u8 opcode; + + if (gl == NULL) { + /* omit RSS and rsp_ctrl at end of descriptor */ + unsigned int len = 64 - sizeof(struct rsp_ctrl) - 8; + + skb = alloc_skb(256, GFP_ATOMIC); + if (!skb) + goto nomem; + __skb_put(skb, len); + skb_copy_to_linear_data(skb, &rsp[1], len); + } else if (gl == CXGB4_MSG_AN) { + const struct rsp_ctrl *rc = (void *)rsp; + + u32 qid = be32_to_cpu(rc->pldbuflen_qid); + c4iw_ev_handler(dev, qid); + return 0; + } else if (unlikely(*(u8 *)rsp != *(u8 *)gl->va)) { + if (recv_rx_pkt(dev, gl, rsp)) + return 0; + + pr_info("%s: unexpected FL contents at %p, " \ + "RSS %#llx, FL %#llx, len %u\n", + pci_name(ctx->lldi.pdev), gl->va, + (unsigned long long)be64_to_cpu(*rsp), + (unsigned long long)be64_to_cpu( + *(__force __be64 *)gl->va), + gl->tot_len); + + return 0; + } else { + skb = cxgb4_pktgl_to_skb(gl, 128, 128); + if (unlikely(!skb)) + goto nomem; + } + + opcode = *(u8 *)rsp; + if (c4iw_handlers[opcode]) { + c4iw_handlers[opcode](dev, skb); + } else { + pr_info("%s no handler opcode 0x%x...\n", __func__, + opcode); + kfree_skb(skb); + } + + return 0; +nomem: + return -1; +} + +static int c4iw_uld_state_change(void *handle, enum cxgb4_state new_state) +{ + struct uld_ctx *ctx = handle; + + PDBG("%s new_state %u\n", __func__, new_state); + switch (new_state) { + case CXGB4_STATE_UP: + printk(KERN_INFO MOD "%s: Up\n", pci_name(ctx->lldi.pdev)); + if (!ctx->dev) { + int ret; + + ctx->dev = c4iw_alloc(&ctx->lldi); + if (IS_ERR(ctx->dev)) { + printk(KERN_ERR MOD + "%s: initialization failed: %ld\n", + pci_name(ctx->lldi.pdev), + PTR_ERR(ctx->dev)); + ctx->dev = NULL; + break; + } + ret = c4iw_register_device(ctx->dev); + if (ret) { + printk(KERN_ERR MOD + "%s: RDMA registration failed: %d\n", + pci_name(ctx->lldi.pdev), ret); + c4iw_dealloc(ctx); + } + } + break; + case CXGB4_STATE_DOWN: + printk(KERN_INFO MOD "%s: Down\n", + pci_name(ctx->lldi.pdev)); + if (ctx->dev) + c4iw_remove(ctx); + break; + case CXGB4_STATE_START_RECOVERY: + printk(KERN_INFO MOD "%s: Fatal Error\n", + pci_name(ctx->lldi.pdev)); + if (ctx->dev) { + struct ib_event event; + + ctx->dev->rdev.flags |= T4_FATAL_ERROR; + memset(&event, 0, sizeof event); + event.event = IB_EVENT_DEVICE_FATAL; + event.device = &ctx->dev->ibdev; + ib_dispatch_event(&event); + c4iw_remove(ctx); + } + break; + case CXGB4_STATE_DETACH: + printk(KERN_INFO MOD "%s: Detach\n", + pci_name(ctx->lldi.pdev)); + if (ctx->dev) + c4iw_remove(ctx); + break; + } + return 0; +} + +static int disable_qp_db(int id, void *p, void *data) +{ + struct c4iw_qp *qp = p; + + t4_disable_wq_db(&qp->wq); + return 0; +} + +static void stop_queues(struct uld_ctx *ctx) +{ + unsigned long flags; + + spin_lock_irqsave(&ctx->dev->lock, flags); + ctx->dev->rdev.stats.db_state_transitions++; + ctx->dev->db_state = STOPPED; + if (ctx->dev->rdev.flags & T4_STATUS_PAGE_DISABLED) + idr_for_each(&ctx->dev->qpidr, disable_qp_db, NULL); + else + ctx->dev->rdev.status_page->db_off = 1; + spin_unlock_irqrestore(&ctx->dev->lock, flags); +} + +static int enable_qp_db(int id, void *p, void *data) +{ + struct c4iw_qp *qp = p; + + t4_enable_wq_db(&qp->wq); + return 0; +} + +static void resume_rc_qp(struct c4iw_qp *qp) +{ + spin_lock(&qp->lock); + t4_ring_sq_db(&qp->wq, qp->wq.sq.wq_pidx_inc, + is_t5(qp->rhp->rdev.lldi.adapter_type), NULL); + qp->wq.sq.wq_pidx_inc = 0; + t4_ring_rq_db(&qp->wq, qp->wq.rq.wq_pidx_inc, + is_t5(qp->rhp->rdev.lldi.adapter_type), NULL); + qp->wq.rq.wq_pidx_inc = 0; + spin_unlock(&qp->lock); +} + +static void resume_a_chunk(struct uld_ctx *ctx) +{ + int i; + struct c4iw_qp *qp; + + for (i = 0; i < DB_FC_RESUME_SIZE; i++) { + qp = list_first_entry(&ctx->dev->db_fc_list, struct c4iw_qp, + db_fc_entry); + list_del_init(&qp->db_fc_entry); + resume_rc_qp(qp); + if (list_empty(&ctx->dev->db_fc_list)) + break; + } +} + +static void resume_queues(struct uld_ctx *ctx) +{ + spin_lock_irq(&ctx->dev->lock); + if (ctx->dev->db_state != STOPPED) + goto out; + ctx->dev->db_state = FLOW_CONTROL; + while (1) { + if (list_empty(&ctx->dev->db_fc_list)) { + WARN_ON(ctx->dev->db_state != FLOW_CONTROL); + ctx->dev->db_state = NORMAL; + ctx->dev->rdev.stats.db_state_transitions++; + if (ctx->dev->rdev.flags & T4_STATUS_PAGE_DISABLED) { + idr_for_each(&ctx->dev->qpidr, enable_qp_db, + NULL); + } else { + ctx->dev->rdev.status_page->db_off = 0; + } + break; + } else { + if (cxgb4_dbfifo_count(ctx->dev->rdev.lldi.ports[0], 1) + < (ctx->dev->rdev.lldi.dbfifo_int_thresh << + DB_FC_DRAIN_THRESH)) { + resume_a_chunk(ctx); + } + if (!list_empty(&ctx->dev->db_fc_list)) { + spin_unlock_irq(&ctx->dev->lock); + if (DB_FC_RESUME_DELAY) { + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(DB_FC_RESUME_DELAY); + } + spin_lock_irq(&ctx->dev->lock); + if (ctx->dev->db_state != FLOW_CONTROL) + break; + } + } + } +out: + if (ctx->dev->db_state != NORMAL) + ctx->dev->rdev.stats.db_fc_interruptions++; + spin_unlock_irq(&ctx->dev->lock); +} + +struct qp_list { + unsigned idx; + struct c4iw_qp **qps; +}; + +static int add_and_ref_qp(int id, void *p, void *data) +{ + struct qp_list *qp_listp = data; + struct c4iw_qp *qp = p; + + c4iw_qp_add_ref(&qp->ibqp); + qp_listp->qps[qp_listp->idx++] = qp; + return 0; +} + +static int count_qps(int id, void *p, void *data) +{ + unsigned *countp = data; + (*countp)++; + return 0; +} + +static void deref_qps(struct qp_list *qp_list) +{ + int idx; + + for (idx = 0; idx < qp_list->idx; idx++) + c4iw_qp_rem_ref(&qp_list->qps[idx]->ibqp); +} + +static void recover_lost_dbs(struct uld_ctx *ctx, struct qp_list *qp_list) +{ + int idx; + int ret; + + for (idx = 0; idx < qp_list->idx; idx++) { + struct c4iw_qp *qp = qp_list->qps[idx]; + + spin_lock_irq(&qp->rhp->lock); + spin_lock(&qp->lock); + ret = cxgb4_sync_txq_pidx(qp->rhp->rdev.lldi.ports[0], + qp->wq.sq.qid, + t4_sq_host_wq_pidx(&qp->wq), + t4_sq_wq_size(&qp->wq)); + if (ret) { + pr_err(KERN_ERR MOD "%s: Fatal error - " + "DB overflow recovery failed - " + "error syncing SQ qid %u\n", + pci_name(ctx->lldi.pdev), qp->wq.sq.qid); + spin_unlock(&qp->lock); + spin_unlock_irq(&qp->rhp->lock); + return; + } + qp->wq.sq.wq_pidx_inc = 0; + + ret = cxgb4_sync_txq_pidx(qp->rhp->rdev.lldi.ports[0], + qp->wq.rq.qid, + t4_rq_host_wq_pidx(&qp->wq), + t4_rq_wq_size(&qp->wq)); + + if (ret) { + pr_err(KERN_ERR MOD "%s: Fatal error - " + "DB overflow recovery failed - " + "error syncing RQ qid %u\n", + pci_name(ctx->lldi.pdev), qp->wq.rq.qid); + spin_unlock(&qp->lock); + spin_unlock_irq(&qp->rhp->lock); + return; + } + qp->wq.rq.wq_pidx_inc = 0; + spin_unlock(&qp->lock); + spin_unlock_irq(&qp->rhp->lock); + + /* Wait for the dbfifo to drain */ + while (cxgb4_dbfifo_count(qp->rhp->rdev.lldi.ports[0], 1) > 0) { + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(usecs_to_jiffies(10)); + } + } +} + +static void recover_queues(struct uld_ctx *ctx) +{ + int count = 0; + struct qp_list qp_list; + int ret; + + /* slow everybody down */ + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(usecs_to_jiffies(1000)); + + /* flush the SGE contexts */ + ret = cxgb4_flush_eq_cache(ctx->dev->rdev.lldi.ports[0]); + if (ret) { + printk(KERN_ERR MOD "%s: Fatal error - DB overflow recovery failed\n", + pci_name(ctx->lldi.pdev)); + return; + } + + /* Count active queues so we can build a list of queues to recover */ + spin_lock_irq(&ctx->dev->lock); + WARN_ON(ctx->dev->db_state != STOPPED); + ctx->dev->db_state = RECOVERY; + idr_for_each(&ctx->dev->qpidr, count_qps, &count); + + qp_list.qps = kzalloc(count * sizeof *qp_list.qps, GFP_ATOMIC); + if (!qp_list.qps) { + printk(KERN_ERR MOD "%s: Fatal error - DB overflow recovery failed\n", + pci_name(ctx->lldi.pdev)); + spin_unlock_irq(&ctx->dev->lock); + return; + } + qp_list.idx = 0; + + /* add and ref each qp so it doesn't get freed */ + idr_for_each(&ctx->dev->qpidr, add_and_ref_qp, &qp_list); + + spin_unlock_irq(&ctx->dev->lock); + + /* now traverse the list in a safe context to recover the db state*/ + recover_lost_dbs(ctx, &qp_list); + + /* we're almost done! deref the qps and clean up */ + deref_qps(&qp_list); + kfree(qp_list.qps); + + spin_lock_irq(&ctx->dev->lock); + WARN_ON(ctx->dev->db_state != RECOVERY); + ctx->dev->db_state = STOPPED; + spin_unlock_irq(&ctx->dev->lock); +} + +static int c4iw_uld_control(void *handle, enum cxgb4_control control, ...) +{ + struct uld_ctx *ctx = handle; + + switch (control) { + case CXGB4_CONTROL_DB_FULL: + stop_queues(ctx); + ctx->dev->rdev.stats.db_full++; + break; + case CXGB4_CONTROL_DB_EMPTY: + resume_queues(ctx); + mutex_lock(&ctx->dev->rdev.stats.lock); + ctx->dev->rdev.stats.db_empty++; + mutex_unlock(&ctx->dev->rdev.stats.lock); + break; + case CXGB4_CONTROL_DB_DROP: + recover_queues(ctx); + mutex_lock(&ctx->dev->rdev.stats.lock); + ctx->dev->rdev.stats.db_drop++; + mutex_unlock(&ctx->dev->rdev.stats.lock); + break; + default: + printk(KERN_WARNING MOD "%s: unknown control cmd %u\n", + pci_name(ctx->lldi.pdev), control); + break; + } + return 0; +} + +static struct cxgb4_uld_info c4iw_uld_info = { + .name = DRV_NAME, + .add = c4iw_uld_add, + .rx_handler = c4iw_uld_rx_handler, + .state_change = c4iw_uld_state_change, + .control = c4iw_uld_control, +}; + +static int __init c4iw_init_module(void) +{ + int err; + + err = c4iw_cm_init(); + if (err) + return err; + + c4iw_debugfs_root = debugfs_create_dir(DRV_NAME, NULL); + if (!c4iw_debugfs_root) + printk(KERN_WARNING MOD + "could not create debugfs entry, continuing\n"); + + if (ibnl_add_client(RDMA_NL_C4IW, RDMA_NL_IWPM_NUM_OPS, + c4iw_nl_cb_table)) + pr_err("%s[%u]: Failed to add netlink callback\n" + , __func__, __LINE__); + + err = iwpm_init(RDMA_NL_C4IW); + if (err) { + pr_err("port mapper initialization failed with %d\n", err); + ibnl_remove_client(RDMA_NL_C4IW); + c4iw_cm_term(); + debugfs_remove_recursive(c4iw_debugfs_root); + return err; + } + + cxgb4_register_uld(CXGB4_ULD_RDMA, &c4iw_uld_info); + + return 0; +} + +static void __exit c4iw_exit_module(void) +{ + struct uld_ctx *ctx, *tmp; + + mutex_lock(&dev_mutex); + list_for_each_entry_safe(ctx, tmp, &uld_ctx_list, entry) { + if (ctx->dev) + c4iw_remove(ctx); + kfree(ctx); + } + mutex_unlock(&dev_mutex); + cxgb4_unregister_uld(CXGB4_ULD_RDMA); + iwpm_exit(RDMA_NL_C4IW); + ibnl_remove_client(RDMA_NL_C4IW); + c4iw_cm_term(); + debugfs_remove_recursive(c4iw_debugfs_root); +} + +module_init(c4iw_init_module); +module_exit(c4iw_exit_module); diff --git a/drivers/infiniband/hw/cxgb4/ev.c b/drivers/infiniband/hw/cxgb4/ev.c new file mode 100644 index 0000000..c9df054 --- /dev/null +++ b/drivers/infiniband/hw/cxgb4/ev.c @@ -0,0 +1,237 @@ +/* + * Copyright (c) 2009-2010 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include +#include +#include + +#include "iw_cxgb4.h" + +static void print_tpte(struct c4iw_dev *dev, u32 stag) +{ + int ret; + struct fw_ri_tpte tpte; + + ret = cxgb4_read_tpte(dev->rdev.lldi.ports[0], stag, + (__be32 *)&tpte); + if (ret) { + dev_err(&dev->rdev.lldi.pdev->dev, + "%s cxgb4_read_tpte err %d\n", __func__, ret); + return; + } + PDBG("stag idx 0x%x valid %d key 0x%x state %d pdid %d " + "perm 0x%x ps %d len 0x%llx va 0x%llx\n", + stag & 0xffffff00, + G_FW_RI_TPTE_VALID(ntohl(tpte.valid_to_pdid)), + G_FW_RI_TPTE_STAGKEY(ntohl(tpte.valid_to_pdid)), + G_FW_RI_TPTE_STAGSTATE(ntohl(tpte.valid_to_pdid)), + G_FW_RI_TPTE_PDID(ntohl(tpte.valid_to_pdid)), + G_FW_RI_TPTE_PERM(ntohl(tpte.locread_to_qpid)), + G_FW_RI_TPTE_PS(ntohl(tpte.locread_to_qpid)), + ((u64)ntohl(tpte.len_hi) << 32) | ntohl(tpte.len_lo), + ((u64)ntohl(tpte.va_hi) << 32) | ntohl(tpte.va_lo_fbo)); +} + +static void dump_err_cqe(struct c4iw_dev *dev, struct t4_cqe *err_cqe) +{ + __be64 *p = (void *)err_cqe; + + dev_err(&dev->rdev.lldi.pdev->dev, + "AE qpid %d opcode %d status 0x%x " + "type %d len 0x%x wrid.hi 0x%x wrid.lo 0x%x\n", + CQE_QPID(err_cqe), CQE_OPCODE(err_cqe), + CQE_STATUS(err_cqe), CQE_TYPE(err_cqe), ntohl(err_cqe->len), + CQE_WRID_HI(err_cqe), CQE_WRID_LOW(err_cqe)); + + PDBG("%016llx %016llx %016llx %016llx\n", + be64_to_cpu(p[0]), be64_to_cpu(p[1]), be64_to_cpu(p[2]), + be64_to_cpu(p[3])); + + /* + * Ingress WRITE and READ_RESP errors provide + * the offending stag, so parse and log it. + */ + if (RQ_TYPE(err_cqe) && (CQE_OPCODE(err_cqe) == FW_RI_RDMA_WRITE || + CQE_OPCODE(err_cqe) == FW_RI_READ_RESP)) + print_tpte(dev, CQE_WRID_STAG(err_cqe)); +} + +static void post_qp_event(struct c4iw_dev *dev, struct c4iw_cq *chp, + struct c4iw_qp *qhp, + struct t4_cqe *err_cqe, + enum ib_event_type ib_event) +{ + struct ib_event event; + struct c4iw_qp_attributes attrs; + unsigned long flag; + + dump_err_cqe(dev, err_cqe); + + if (qhp->attr.state == C4IW_QP_STATE_RTS) { + attrs.next_state = C4IW_QP_STATE_TERMINATE; + c4iw_modify_qp(qhp->rhp, qhp, C4IW_QP_ATTR_NEXT_STATE, + &attrs, 0); + } + + event.event = ib_event; + event.device = chp->ibcq.device; + if (ib_event == IB_EVENT_CQ_ERR) + event.element.cq = &chp->ibcq; + else + event.element.qp = &qhp->ibqp; + if (qhp->ibqp.event_handler) + (*qhp->ibqp.event_handler)(&event, qhp->ibqp.qp_context); + + spin_lock_irqsave(&chp->comp_handler_lock, flag); + (*chp->ibcq.comp_handler)(&chp->ibcq, chp->ibcq.cq_context); + spin_unlock_irqrestore(&chp->comp_handler_lock, flag); +} + +void c4iw_ev_dispatch(struct c4iw_dev *dev, struct t4_cqe *err_cqe) +{ + struct c4iw_cq *chp; + struct c4iw_qp *qhp; + u32 cqid; + + spin_lock_irq(&dev->lock); + qhp = get_qhp(dev, CQE_QPID(err_cqe)); + if (!qhp) { + printk(KERN_ERR MOD "BAD AE qpid 0x%x opcode %d " + "status 0x%x type %d wrid.hi 0x%x wrid.lo 0x%x\n", + CQE_QPID(err_cqe), + CQE_OPCODE(err_cqe), CQE_STATUS(err_cqe), + CQE_TYPE(err_cqe), CQE_WRID_HI(err_cqe), + CQE_WRID_LOW(err_cqe)); + spin_unlock_irq(&dev->lock); + goto out; + } + + if (SQ_TYPE(err_cqe)) + cqid = qhp->attr.scq; + else + cqid = qhp->attr.rcq; + chp = get_chp(dev, cqid); + if (!chp) { + printk(KERN_ERR MOD "BAD AE cqid 0x%x qpid 0x%x opcode %d " + "status 0x%x type %d wrid.hi 0x%x wrid.lo 0x%x\n", + cqid, CQE_QPID(err_cqe), + CQE_OPCODE(err_cqe), CQE_STATUS(err_cqe), + CQE_TYPE(err_cqe), CQE_WRID_HI(err_cqe), + CQE_WRID_LOW(err_cqe)); + spin_unlock_irq(&dev->lock); + goto out; + } + + c4iw_qp_add_ref(&qhp->ibqp); + atomic_inc(&chp->refcnt); + spin_unlock_irq(&dev->lock); + + /* Bad incoming write */ + if (RQ_TYPE(err_cqe) && + (CQE_OPCODE(err_cqe) == FW_RI_RDMA_WRITE)) { + post_qp_event(dev, chp, qhp, err_cqe, IB_EVENT_QP_REQ_ERR); + goto done; + } + + switch (CQE_STATUS(err_cqe)) { + + /* Completion Events */ + case T4_ERR_SUCCESS: + printk(KERN_ERR MOD "AE with status 0!\n"); + break; + + case T4_ERR_STAG: + case T4_ERR_PDID: + case T4_ERR_QPID: + case T4_ERR_ACCESS: + case T4_ERR_WRAP: + case T4_ERR_BOUND: + case T4_ERR_INVALIDATE_SHARED_MR: + case T4_ERR_INVALIDATE_MR_WITH_MW_BOUND: + post_qp_event(dev, chp, qhp, err_cqe, IB_EVENT_QP_ACCESS_ERR); + break; + + /* Device Fatal Errors */ + case T4_ERR_ECC: + case T4_ERR_ECC_PSTAG: + case T4_ERR_INTERNAL_ERR: + post_qp_event(dev, chp, qhp, err_cqe, IB_EVENT_DEVICE_FATAL); + break; + + /* QP Fatal Errors */ + case T4_ERR_OUT_OF_RQE: + case T4_ERR_PBL_ADDR_BOUND: + case T4_ERR_CRC: + case T4_ERR_MARKER: + case T4_ERR_PDU_LEN_ERR: + case T4_ERR_DDP_VERSION: + case T4_ERR_RDMA_VERSION: + case T4_ERR_OPCODE: + case T4_ERR_DDP_QUEUE_NUM: + case T4_ERR_MSN: + case T4_ERR_TBIT: + case T4_ERR_MO: + case T4_ERR_MSN_GAP: + case T4_ERR_MSN_RANGE: + case T4_ERR_RQE_ADDR_BOUND: + case T4_ERR_IRD_OVERFLOW: + post_qp_event(dev, chp, qhp, err_cqe, IB_EVENT_QP_FATAL); + break; + + default: + printk(KERN_ERR MOD "Unknown T4 status 0x%x QPID 0x%x\n", + CQE_STATUS(err_cqe), qhp->wq.sq.qid); + post_qp_event(dev, chp, qhp, err_cqe, IB_EVENT_QP_FATAL); + break; + } +done: + if (atomic_dec_and_test(&chp->refcnt)) + wake_up(&chp->wait); + c4iw_qp_rem_ref(&qhp->ibqp); +out: + return; +} + +int c4iw_ev_handler(struct c4iw_dev *dev, u32 qid) +{ + struct c4iw_cq *chp; + unsigned long flag; + + chp = get_chp(dev, qid); + if (chp) { + t4_clear_cq_armed(&chp->cq); + spin_lock_irqsave(&chp->comp_handler_lock, flag); + (*chp->ibcq.comp_handler)(&chp->ibcq, chp->ibcq.cq_context); + spin_unlock_irqrestore(&chp->comp_handler_lock, flag); + } else + PDBG("%s unknown cqid 0x%x\n", __func__, qid); + return 0; +} diff --git a/drivers/infiniband/hw/cxgb4/id_table.c b/drivers/infiniband/hw/cxgb4/id_table.c new file mode 100644 index 0000000..0161ae6 --- /dev/null +++ b/drivers/infiniband/hw/cxgb4/id_table.c @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2011 Chelsio Communications. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include +#include +#include "iw_cxgb4.h" + +#define RANDOM_SKIP 16 + +/* + * Trivial bitmap-based allocator. If the random flag is set, the + * allocator is designed to: + * - pseudo-randomize the id returned such that it is not trivially predictable. + * - avoid reuse of recently used id (at the expense of predictability) + */ +u32 c4iw_id_alloc(struct c4iw_id_table *alloc) +{ + unsigned long flags; + u32 obj; + + spin_lock_irqsave(&alloc->lock, flags); + + obj = find_next_zero_bit(alloc->table, alloc->max, alloc->last); + if (obj >= alloc->max) + obj = find_first_zero_bit(alloc->table, alloc->max); + + if (obj < alloc->max) { + if (alloc->flags & C4IW_ID_TABLE_F_RANDOM) + alloc->last += prandom_u32() % RANDOM_SKIP; + else + alloc->last = obj + 1; + if (alloc->last >= alloc->max) + alloc->last = 0; + set_bit(obj, alloc->table); + obj += alloc->start; + } else + obj = -1; + + spin_unlock_irqrestore(&alloc->lock, flags); + return obj; +} + +void c4iw_id_free(struct c4iw_id_table *alloc, u32 obj) +{ + unsigned long flags; + + obj -= alloc->start; + BUG_ON((int)obj < 0); + + spin_lock_irqsave(&alloc->lock, flags); + clear_bit(obj, alloc->table); + spin_unlock_irqrestore(&alloc->lock, flags); +} + +int c4iw_id_table_alloc(struct c4iw_id_table *alloc, u32 start, u32 num, + u32 reserved, u32 flags) +{ + int i; + + alloc->start = start; + alloc->flags = flags; + if (flags & C4IW_ID_TABLE_F_RANDOM) + alloc->last = prandom_u32() % RANDOM_SKIP; + else + alloc->last = 0; + alloc->max = num; + spin_lock_init(&alloc->lock); + alloc->table = kmalloc(BITS_TO_LONGS(num) * sizeof(long), + GFP_KERNEL); + if (!alloc->table) + return -ENOMEM; + + bitmap_zero(alloc->table, num); + if (!(alloc->flags & C4IW_ID_TABLE_F_EMPTY)) + for (i = 0; i < reserved; ++i) + set_bit(i, alloc->table); + + return 0; +} + +void c4iw_id_table_free(struct c4iw_id_table *alloc) +{ + kfree(alloc->table); +} diff --git a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h new file mode 100644 index 0000000..b5678ac --- /dev/null +++ b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h @@ -0,0 +1,1036 @@ +/* + * Copyright (c) 2009-2010 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __IW_CXGB4_H__ +#define __IW_CXGB4_H__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include +#include +#include +#include + +#include "cxgb4.h" +#include "cxgb4_uld.h" +#include "l2t.h" +#include "user.h" + +#define DRV_NAME "iw_cxgb4" +#define MOD DRV_NAME ":" + +extern int c4iw_debug; +#define PDBG(fmt, args...) \ +do { \ + if (c4iw_debug) \ + printk(MOD fmt, ## args); \ +} while (0) + +#include "t4.h" + +#define PBL_OFF(rdev_p, a) ((a) - (rdev_p)->lldi.vr->pbl.start) +#define RQT_OFF(rdev_p, a) ((a) - (rdev_p)->lldi.vr->rq.start) + +static inline void *cplhdr(struct sk_buff *skb) +{ + return skb->data; +} + +#define C4IW_ID_TABLE_F_RANDOM 1 /* Pseudo-randomize the id's returned */ +#define C4IW_ID_TABLE_F_EMPTY 2 /* Table is initially empty */ + +struct c4iw_id_table { + u32 flags; + u32 start; /* logical minimal id */ + u32 last; /* hint for find */ + u32 max; + spinlock_t lock; + unsigned long *table; +}; + +struct c4iw_resource { + struct c4iw_id_table tpt_table; + struct c4iw_id_table qid_table; + struct c4iw_id_table pdid_table; +}; + +struct c4iw_qid_list { + struct list_head entry; + u32 qid; +}; + +struct c4iw_dev_ucontext { + struct list_head qpids; + struct list_head cqids; + struct mutex lock; +}; + +enum c4iw_rdev_flags { + T4_FATAL_ERROR = (1<<0), + T4_STATUS_PAGE_DISABLED = (1<<1), +}; + +struct c4iw_stat { + u64 total; + u64 cur; + u64 max; + u64 fail; +}; + +struct c4iw_stats { + struct mutex lock; + struct c4iw_stat qid; + struct c4iw_stat pd; + struct c4iw_stat stag; + struct c4iw_stat pbl; + struct c4iw_stat rqt; + struct c4iw_stat ocqp; + u64 db_full; + u64 db_empty; + u64 db_drop; + u64 db_state_transitions; + u64 db_fc_interruptions; + u64 tcam_full; + u64 act_ofld_conn_fails; + u64 pas_ofld_conn_fails; +}; + +struct c4iw_hw_queue { + int t4_eq_status_entries; + int t4_max_eq_size; + int t4_max_iq_size; + int t4_max_rq_size; + int t4_max_sq_size; + int t4_max_qp_depth; + int t4_max_cq_depth; + int t4_stat_len; +}; + +struct wr_log_entry { + struct timespec post_host_ts; + struct timespec poll_host_ts; + u64 post_sge_ts; + u64 cqe_sge_ts; + u64 poll_sge_ts; + u16 qid; + u16 wr_id; + u8 opcode; + u8 valid; +}; + +struct c4iw_rdev { + struct c4iw_resource resource; + unsigned long qpshift; + u32 qpmask; + unsigned long cqshift; + u32 cqmask; + struct c4iw_dev_ucontext uctx; + struct gen_pool *pbl_pool; + struct gen_pool *rqt_pool; + struct gen_pool *ocqp_pool; + u32 flags; + struct cxgb4_lld_info lldi; + unsigned long bar2_pa; + void __iomem *bar2_kva; + unsigned long oc_mw_pa; + void __iomem *oc_mw_kva; + struct c4iw_stats stats; + struct c4iw_hw_queue hw_queue; + struct t4_dev_status_page *status_page; + atomic_t wr_log_idx; + struct wr_log_entry *wr_log; + int wr_log_size; +}; + +static inline int c4iw_fatal_error(struct c4iw_rdev *rdev) +{ + return rdev->flags & T4_FATAL_ERROR; +} + +static inline int c4iw_num_stags(struct c4iw_rdev *rdev) +{ + return (int)(rdev->lldi.vr->stag.size >> 5); +} + +#define C4IW_WR_TO (30*HZ) + +struct c4iw_wr_wait { + struct completion completion; + int ret; +}; + +static inline void c4iw_init_wr_wait(struct c4iw_wr_wait *wr_waitp) +{ + wr_waitp->ret = 0; + init_completion(&wr_waitp->completion); +} + +static inline void c4iw_wake_up(struct c4iw_wr_wait *wr_waitp, int ret) +{ + wr_waitp->ret = ret; + complete(&wr_waitp->completion); +} + +static inline int c4iw_wait_for_reply(struct c4iw_rdev *rdev, + struct c4iw_wr_wait *wr_waitp, + u32 hwtid, u32 qpid, + const char *func) +{ + unsigned to = C4IW_WR_TO; + int ret; + + do { + ret = wait_for_completion_timeout(&wr_waitp->completion, to); + if (!ret) { + printk(KERN_ERR MOD "%s - Device %s not responding - " + "tid %u qpid %u\n", func, + pci_name(rdev->lldi.pdev), hwtid, qpid); + if (c4iw_fatal_error(rdev)) { + wr_waitp->ret = -EIO; + break; + } + to = to << 2; + } + } while (!ret); + if (wr_waitp->ret) + PDBG("%s: FW reply %d tid %u qpid %u\n", + pci_name(rdev->lldi.pdev), wr_waitp->ret, hwtid, qpid); + return wr_waitp->ret; +} + +enum db_state { + NORMAL = 0, + FLOW_CONTROL = 1, + RECOVERY = 2, + STOPPED = 3 +}; + +struct c4iw_dev { + struct ib_device ibdev; + struct c4iw_rdev rdev; + u32 device_cap_flags; + struct idr cqidr; + struct idr qpidr; + struct idr mmidr; + spinlock_t lock; + struct mutex db_mutex; + struct dentry *debugfs_root; + enum db_state db_state; + struct idr hwtid_idr; + struct idr atid_idr; + struct idr stid_idr; + struct list_head db_fc_list; + u32 avail_ird; +}; + +static inline struct c4iw_dev *to_c4iw_dev(struct ib_device *ibdev) +{ + return container_of(ibdev, struct c4iw_dev, ibdev); +} + +static inline struct c4iw_dev *rdev_to_c4iw_dev(struct c4iw_rdev *rdev) +{ + return container_of(rdev, struct c4iw_dev, rdev); +} + +static inline struct c4iw_cq *get_chp(struct c4iw_dev *rhp, u32 cqid) +{ + return idr_find(&rhp->cqidr, cqid); +} + +static inline struct c4iw_qp *get_qhp(struct c4iw_dev *rhp, u32 qpid) +{ + return idr_find(&rhp->qpidr, qpid); +} + +static inline struct c4iw_mr *get_mhp(struct c4iw_dev *rhp, u32 mmid) +{ + return idr_find(&rhp->mmidr, mmid); +} + +static inline int _insert_handle(struct c4iw_dev *rhp, struct idr *idr, + void *handle, u32 id, int lock) +{ + int ret; + + if (lock) { + idr_preload(GFP_KERNEL); + spin_lock_irq(&rhp->lock); + } + + ret = idr_alloc(idr, handle, id, id + 1, GFP_ATOMIC); + + if (lock) { + spin_unlock_irq(&rhp->lock); + idr_preload_end(); + } + + BUG_ON(ret == -ENOSPC); + return ret < 0 ? ret : 0; +} + +static inline int insert_handle(struct c4iw_dev *rhp, struct idr *idr, + void *handle, u32 id) +{ + return _insert_handle(rhp, idr, handle, id, 1); +} + +static inline int insert_handle_nolock(struct c4iw_dev *rhp, struct idr *idr, + void *handle, u32 id) +{ + return _insert_handle(rhp, idr, handle, id, 0); +} + +static inline void _remove_handle(struct c4iw_dev *rhp, struct idr *idr, + u32 id, int lock) +{ + if (lock) + spin_lock_irq(&rhp->lock); + idr_remove(idr, id); + if (lock) + spin_unlock_irq(&rhp->lock); +} + +static inline void remove_handle(struct c4iw_dev *rhp, struct idr *idr, u32 id) +{ + _remove_handle(rhp, idr, id, 1); +} + +static inline void remove_handle_nolock(struct c4iw_dev *rhp, + struct idr *idr, u32 id) +{ + _remove_handle(rhp, idr, id, 0); +} + +extern uint c4iw_max_read_depth; + +static inline int cur_max_read_depth(struct c4iw_dev *dev) +{ + return min(dev->rdev.lldi.max_ordird_qp, c4iw_max_read_depth); +} + +struct c4iw_pd { + struct ib_pd ibpd; + u32 pdid; + struct c4iw_dev *rhp; +}; + +static inline struct c4iw_pd *to_c4iw_pd(struct ib_pd *ibpd) +{ + return container_of(ibpd, struct c4iw_pd, ibpd); +} + +struct tpt_attributes { + u64 len; + u64 va_fbo; + enum fw_ri_mem_perms perms; + u32 stag; + u32 pdid; + u32 qpid; + u32 pbl_addr; + u32 pbl_size; + u32 state:1; + u32 type:2; + u32 rsvd:1; + u32 remote_invaliate_disable:1; + u32 zbva:1; + u32 mw_bind_enable:1; + u32 page_size:5; +}; + +struct c4iw_mr { + struct ib_mr ibmr; + struct ib_umem *umem; + struct c4iw_dev *rhp; + u64 kva; + struct tpt_attributes attr; +}; + +static inline struct c4iw_mr *to_c4iw_mr(struct ib_mr *ibmr) +{ + return container_of(ibmr, struct c4iw_mr, ibmr); +} + +struct c4iw_mw { + struct ib_mw ibmw; + struct c4iw_dev *rhp; + u64 kva; + struct tpt_attributes attr; +}; + +static inline struct c4iw_mw *to_c4iw_mw(struct ib_mw *ibmw) +{ + return container_of(ibmw, struct c4iw_mw, ibmw); +} + +struct c4iw_fr_page_list { + struct ib_fast_reg_page_list ibpl; + DEFINE_DMA_UNMAP_ADDR(mapping); + dma_addr_t dma_addr; + struct c4iw_dev *dev; + int pll_len; +}; + +static inline struct c4iw_fr_page_list *to_c4iw_fr_page_list( + struct ib_fast_reg_page_list *ibpl) +{ + return container_of(ibpl, struct c4iw_fr_page_list, ibpl); +} + +struct c4iw_cq { + struct ib_cq ibcq; + struct c4iw_dev *rhp; + struct t4_cq cq; + spinlock_t lock; + spinlock_t comp_handler_lock; + atomic_t refcnt; + wait_queue_head_t wait; +}; + +static inline struct c4iw_cq *to_c4iw_cq(struct ib_cq *ibcq) +{ + return container_of(ibcq, struct c4iw_cq, ibcq); +} + +struct c4iw_mpa_attributes { + u8 initiator; + u8 recv_marker_enabled; + u8 xmit_marker_enabled; + u8 crc_enabled; + u8 enhanced_rdma_conn; + u8 version; + u8 p2p_type; +}; + +struct c4iw_qp_attributes { + u32 scq; + u32 rcq; + u32 sq_num_entries; + u32 rq_num_entries; + u32 sq_max_sges; + u32 sq_max_sges_rdma_write; + u32 rq_max_sges; + u32 state; + u8 enable_rdma_read; + u8 enable_rdma_write; + u8 enable_bind; + u8 enable_mmid0_fastreg; + u32 max_ord; + u32 max_ird; + u32 pd; + u32 next_state; + char terminate_buffer[52]; + u32 terminate_msg_len; + u8 is_terminate_local; + struct c4iw_mpa_attributes mpa_attr; + struct c4iw_ep *llp_stream_handle; + u8 layer_etype; + u8 ecode; + u16 sq_db_inc; + u16 rq_db_inc; + u8 send_term; +}; + +struct c4iw_qp { + struct ib_qp ibqp; + struct list_head db_fc_entry; + struct c4iw_dev *rhp; + struct c4iw_ep *ep; + struct c4iw_qp_attributes attr; + struct t4_wq wq; + spinlock_t lock; + struct mutex mutex; + atomic_t refcnt; + wait_queue_head_t wait; + struct timer_list timer; + int sq_sig_all; +}; + +static inline struct c4iw_qp *to_c4iw_qp(struct ib_qp *ibqp) +{ + return container_of(ibqp, struct c4iw_qp, ibqp); +} + +struct c4iw_ucontext { + struct ib_ucontext ibucontext; + struct c4iw_dev_ucontext uctx; + u32 key; + spinlock_t mmap_lock; + struct list_head mmaps; +}; + +static inline struct c4iw_ucontext *to_c4iw_ucontext(struct ib_ucontext *c) +{ + return container_of(c, struct c4iw_ucontext, ibucontext); +} + +struct c4iw_mm_entry { + struct list_head entry; + u64 addr; + u32 key; + unsigned len; +}; + +static inline struct c4iw_mm_entry *remove_mmap(struct c4iw_ucontext *ucontext, + u32 key, unsigned len) +{ + struct list_head *pos, *nxt; + struct c4iw_mm_entry *mm; + + spin_lock(&ucontext->mmap_lock); + list_for_each_safe(pos, nxt, &ucontext->mmaps) { + + mm = list_entry(pos, struct c4iw_mm_entry, entry); + if (mm->key == key && mm->len == len) { + list_del_init(&mm->entry); + spin_unlock(&ucontext->mmap_lock); + PDBG("%s key 0x%x addr 0x%llx len %d\n", __func__, + key, (unsigned long long) mm->addr, mm->len); + return mm; + } + } + spin_unlock(&ucontext->mmap_lock); + return NULL; +} + +static inline void insert_mmap(struct c4iw_ucontext *ucontext, + struct c4iw_mm_entry *mm) +{ + spin_lock(&ucontext->mmap_lock); + PDBG("%s key 0x%x addr 0x%llx len %d\n", __func__, + mm->key, (unsigned long long) mm->addr, mm->len); + list_add_tail(&mm->entry, &ucontext->mmaps); + spin_unlock(&ucontext->mmap_lock); +} + +enum c4iw_qp_attr_mask { + C4IW_QP_ATTR_NEXT_STATE = 1 << 0, + C4IW_QP_ATTR_SQ_DB = 1<<1, + C4IW_QP_ATTR_RQ_DB = 1<<2, + C4IW_QP_ATTR_ENABLE_RDMA_READ = 1 << 7, + C4IW_QP_ATTR_ENABLE_RDMA_WRITE = 1 << 8, + C4IW_QP_ATTR_ENABLE_RDMA_BIND = 1 << 9, + C4IW_QP_ATTR_MAX_ORD = 1 << 11, + C4IW_QP_ATTR_MAX_IRD = 1 << 12, + C4IW_QP_ATTR_LLP_STREAM_HANDLE = 1 << 22, + C4IW_QP_ATTR_STREAM_MSG_BUFFER = 1 << 23, + C4IW_QP_ATTR_MPA_ATTR = 1 << 24, + C4IW_QP_ATTR_QP_CONTEXT_ACTIVATE = 1 << 25, + C4IW_QP_ATTR_VALID_MODIFY = (C4IW_QP_ATTR_ENABLE_RDMA_READ | + C4IW_QP_ATTR_ENABLE_RDMA_WRITE | + C4IW_QP_ATTR_MAX_ORD | + C4IW_QP_ATTR_MAX_IRD | + C4IW_QP_ATTR_LLP_STREAM_HANDLE | + C4IW_QP_ATTR_STREAM_MSG_BUFFER | + C4IW_QP_ATTR_MPA_ATTR | + C4IW_QP_ATTR_QP_CONTEXT_ACTIVATE) +}; + +int c4iw_modify_qp(struct c4iw_dev *rhp, + struct c4iw_qp *qhp, + enum c4iw_qp_attr_mask mask, + struct c4iw_qp_attributes *attrs, + int internal); + +enum c4iw_qp_state { + C4IW_QP_STATE_IDLE, + C4IW_QP_STATE_RTS, + C4IW_QP_STATE_ERROR, + C4IW_QP_STATE_TERMINATE, + C4IW_QP_STATE_CLOSING, + C4IW_QP_STATE_TOT +}; + +static inline int c4iw_convert_state(enum ib_qp_state ib_state) +{ + switch (ib_state) { + case IB_QPS_RESET: + case IB_QPS_INIT: + return C4IW_QP_STATE_IDLE; + case IB_QPS_RTS: + return C4IW_QP_STATE_RTS; + case IB_QPS_SQD: + return C4IW_QP_STATE_CLOSING; + case IB_QPS_SQE: + return C4IW_QP_STATE_TERMINATE; + case IB_QPS_ERR: + return C4IW_QP_STATE_ERROR; + default: + return -1; + } +} + +static inline int to_ib_qp_state(int c4iw_qp_state) +{ + switch (c4iw_qp_state) { + case C4IW_QP_STATE_IDLE: + return IB_QPS_INIT; + case C4IW_QP_STATE_RTS: + return IB_QPS_RTS; + case C4IW_QP_STATE_CLOSING: + return IB_QPS_SQD; + case C4IW_QP_STATE_TERMINATE: + return IB_QPS_SQE; + case C4IW_QP_STATE_ERROR: + return IB_QPS_ERR; + } + return IB_QPS_ERR; +} + +static inline u32 c4iw_ib_to_tpt_access(int a) +{ + return (a & IB_ACCESS_REMOTE_WRITE ? FW_RI_MEM_ACCESS_REM_WRITE : 0) | + (a & IB_ACCESS_REMOTE_READ ? FW_RI_MEM_ACCESS_REM_READ : 0) | + (a & IB_ACCESS_LOCAL_WRITE ? FW_RI_MEM_ACCESS_LOCAL_WRITE : 0) | + FW_RI_MEM_ACCESS_LOCAL_READ; +} + +static inline u32 c4iw_ib_to_tpt_bind_access(int acc) +{ + return (acc & IB_ACCESS_REMOTE_WRITE ? FW_RI_MEM_ACCESS_REM_WRITE : 0) | + (acc & IB_ACCESS_REMOTE_READ ? FW_RI_MEM_ACCESS_REM_READ : 0); +} + +enum c4iw_mmid_state { + C4IW_STAG_STATE_VALID, + C4IW_STAG_STATE_INVALID +}; + +#define C4IW_NODE_DESC "cxgb4 Chelsio Communications" + +#define MPA_KEY_REQ "MPA ID Req Frame" +#define MPA_KEY_REP "MPA ID Rep Frame" + +#define MPA_MAX_PRIVATE_DATA 256 +#define MPA_ENHANCED_RDMA_CONN 0x10 +#define MPA_REJECT 0x20 +#define MPA_CRC 0x40 +#define MPA_MARKERS 0x80 +#define MPA_FLAGS_MASK 0xE0 + +#define MPA_V2_PEER2PEER_MODEL 0x8000 +#define MPA_V2_ZERO_LEN_FPDU_RTR 0x4000 +#define MPA_V2_RDMA_WRITE_RTR 0x8000 +#define MPA_V2_RDMA_READ_RTR 0x4000 +#define MPA_V2_IRD_ORD_MASK 0x3FFF + +#define c4iw_put_ep(ep) { \ + PDBG("put_ep (via %s:%u) ep %p refcnt %d\n", __func__, __LINE__, \ + ep, atomic_read(&((ep)->kref.refcount))); \ + WARN_ON(atomic_read(&((ep)->kref.refcount)) < 1); \ + kref_put(&((ep)->kref), _c4iw_free_ep); \ +} + +#define c4iw_get_ep(ep) { \ + PDBG("get_ep (via %s:%u) ep %p, refcnt %d\n", __func__, __LINE__, \ + ep, atomic_read(&((ep)->kref.refcount))); \ + kref_get(&((ep)->kref)); \ +} +void _c4iw_free_ep(struct kref *kref); + +struct mpa_message { + u8 key[16]; + u8 flags; + u8 revision; + __be16 private_data_size; + u8 private_data[0]; +}; + +struct mpa_v2_conn_params { + __be16 ird; + __be16 ord; +}; + +struct terminate_message { + u8 layer_etype; + u8 ecode; + __be16 hdrct_rsvd; + u8 len_hdrs[0]; +}; + +#define TERM_MAX_LENGTH (sizeof(struct terminate_message) + 2 + 18 + 28) + +enum c4iw_layers_types { + LAYER_RDMAP = 0x00, + LAYER_DDP = 0x10, + LAYER_MPA = 0x20, + RDMAP_LOCAL_CATA = 0x00, + RDMAP_REMOTE_PROT = 0x01, + RDMAP_REMOTE_OP = 0x02, + DDP_LOCAL_CATA = 0x00, + DDP_TAGGED_ERR = 0x01, + DDP_UNTAGGED_ERR = 0x02, + DDP_LLP = 0x03 +}; + +enum c4iw_rdma_ecodes { + RDMAP_INV_STAG = 0x00, + RDMAP_BASE_BOUNDS = 0x01, + RDMAP_ACC_VIOL = 0x02, + RDMAP_STAG_NOT_ASSOC = 0x03, + RDMAP_TO_WRAP = 0x04, + RDMAP_INV_VERS = 0x05, + RDMAP_INV_OPCODE = 0x06, + RDMAP_STREAM_CATA = 0x07, + RDMAP_GLOBAL_CATA = 0x08, + RDMAP_CANT_INV_STAG = 0x09, + RDMAP_UNSPECIFIED = 0xff +}; + +enum c4iw_ddp_ecodes { + DDPT_INV_STAG = 0x00, + DDPT_BASE_BOUNDS = 0x01, + DDPT_STAG_NOT_ASSOC = 0x02, + DDPT_TO_WRAP = 0x03, + DDPT_INV_VERS = 0x04, + DDPU_INV_QN = 0x01, + DDPU_INV_MSN_NOBUF = 0x02, + DDPU_INV_MSN_RANGE = 0x03, + DDPU_INV_MO = 0x04, + DDPU_MSG_TOOBIG = 0x05, + DDPU_INV_VERS = 0x06 +}; + +enum c4iw_mpa_ecodes { + MPA_CRC_ERR = 0x02, + MPA_MARKER_ERR = 0x03, + MPA_LOCAL_CATA = 0x05, + MPA_INSUFF_IRD = 0x06, + MPA_NOMATCH_RTR = 0x07, +}; + +enum c4iw_ep_state { + IDLE = 0, + LISTEN, + CONNECTING, + MPA_REQ_WAIT, + MPA_REQ_SENT, + MPA_REQ_RCVD, + MPA_REP_SENT, + FPDU_MODE, + ABORTING, + CLOSING, + MORIBUND, + DEAD, +}; + +enum c4iw_ep_flags { + PEER_ABORT_IN_PROGRESS = 0, + ABORT_REQ_IN_PROGRESS = 1, + RELEASE_RESOURCES = 2, + CLOSE_SENT = 3, + TIMEOUT = 4, + QP_REFERENCED = 5, + RELEASE_MAPINFO = 6, +}; + +enum c4iw_ep_history { + ACT_OPEN_REQ = 0, + ACT_OFLD_CONN = 1, + ACT_OPEN_RPL = 2, + ACT_ESTAB = 3, + PASS_ACCEPT_REQ = 4, + PASS_ESTAB = 5, + ABORT_UPCALL = 6, + ESTAB_UPCALL = 7, + CLOSE_UPCALL = 8, + ULP_ACCEPT = 9, + ULP_REJECT = 10, + TIMEDOUT = 11, + PEER_ABORT = 12, + PEER_CLOSE = 13, + CONNREQ_UPCALL = 14, + ABORT_CONN = 15, + DISCONN_UPCALL = 16, + EP_DISC_CLOSE = 17, + EP_DISC_ABORT = 18, + CONN_RPL_UPCALL = 19, + ACT_RETRY_NOMEM = 20, + ACT_RETRY_INUSE = 21 +}; + +struct c4iw_ep_common { + struct iw_cm_id *cm_id; + struct c4iw_qp *qp; + struct c4iw_dev *dev; + enum c4iw_ep_state state; + struct kref kref; + struct mutex mutex; + struct sockaddr_storage local_addr; + struct sockaddr_storage remote_addr; + struct sockaddr_storage mapped_local_addr; + struct sockaddr_storage mapped_remote_addr; + struct c4iw_wr_wait wr_wait; + unsigned long flags; + unsigned long history; +}; + +struct c4iw_listen_ep { + struct c4iw_ep_common com; + unsigned int stid; + int backlog; +}; + +struct c4iw_ep { + struct c4iw_ep_common com; + struct c4iw_ep *parent_ep; + struct timer_list timer; + struct list_head entry; + unsigned int atid; + u32 hwtid; + u32 snd_seq; + u32 rcv_seq; + struct l2t_entry *l2t; + struct dst_entry *dst; + struct sk_buff *mpa_skb; + struct c4iw_mpa_attributes mpa_attr; + u8 mpa_pkt[sizeof(struct mpa_message) + MPA_MAX_PRIVATE_DATA]; + unsigned int mpa_pkt_len; + u32 ird; + u32 ord; + u32 smac_idx; + u32 tx_chan; + u32 mtu; + u16 mss; + u16 emss; + u16 plen; + u16 rss_qid; + u16 txq_idx; + u16 ctrlq_idx; + u8 tos; + u8 retry_with_mpa_v1; + u8 tried_with_mpa_v1; + unsigned int retry_count; + int snd_win; + int rcv_win; +}; + +static inline void print_addr(struct c4iw_ep_common *epc, const char *func, + const char *msg) +{ + +#define SINA(a) (&(((struct sockaddr_in *)(a))->sin_addr.s_addr)) +#define SINP(a) ntohs(((struct sockaddr_in *)(a))->sin_port) +#define SIN6A(a) (&(((struct sockaddr_in6 *)(a))->sin6_addr)) +#define SIN6P(a) ntohs(((struct sockaddr_in6 *)(a))->sin6_port) + + if (c4iw_debug) { + switch (epc->local_addr.ss_family) { + case AF_INET: + PDBG("%s %s %pI4:%u/%u <-> %pI4:%u/%u\n", + func, msg, SINA(&epc->local_addr), + SINP(&epc->local_addr), + SINP(&epc->mapped_local_addr), + SINA(&epc->remote_addr), + SINP(&epc->remote_addr), + SINP(&epc->mapped_remote_addr)); + break; + case AF_INET6: + PDBG("%s %s %pI6:%u/%u <-> %pI6:%u/%u\n", + func, msg, SIN6A(&epc->local_addr), + SIN6P(&epc->local_addr), + SIN6P(&epc->mapped_local_addr), + SIN6A(&epc->remote_addr), + SIN6P(&epc->remote_addr), + SIN6P(&epc->mapped_remote_addr)); + break; + default: + break; + } + } +#undef SINA +#undef SINP +#undef SIN6A +#undef SIN6P +} + +static inline struct c4iw_ep *to_ep(struct iw_cm_id *cm_id) +{ + return cm_id->provider_data; +} + +static inline struct c4iw_listen_ep *to_listen_ep(struct iw_cm_id *cm_id) +{ + return cm_id->provider_data; +} + +static inline int compute_wscale(int win) +{ + int wscale = 0; + + while (wscale < 14 && (65535<vr->ocq.size > 0; +#else + return 0; +#endif +} + +u32 c4iw_id_alloc(struct c4iw_id_table *alloc); +void c4iw_id_free(struct c4iw_id_table *alloc, u32 obj); +int c4iw_id_table_alloc(struct c4iw_id_table *alloc, u32 start, u32 num, + u32 reserved, u32 flags); +void c4iw_id_table_free(struct c4iw_id_table *alloc); + +typedef int (*c4iw_handler_func)(struct c4iw_dev *dev, struct sk_buff *skb); + +int c4iw_ep_redirect(void *ctx, struct dst_entry *old, struct dst_entry *new, + struct l2t_entry *l2t); +void c4iw_put_qpid(struct c4iw_rdev *rdev, u32 qpid, + struct c4iw_dev_ucontext *uctx); +u32 c4iw_get_resource(struct c4iw_id_table *id_table); +void c4iw_put_resource(struct c4iw_id_table *id_table, u32 entry); +int c4iw_init_resource(struct c4iw_rdev *rdev, u32 nr_tpt, u32 nr_pdid); +int c4iw_init_ctrl_qp(struct c4iw_rdev *rdev); +int c4iw_pblpool_create(struct c4iw_rdev *rdev); +int c4iw_rqtpool_create(struct c4iw_rdev *rdev); +int c4iw_ocqp_pool_create(struct c4iw_rdev *rdev); +void c4iw_pblpool_destroy(struct c4iw_rdev *rdev); +void c4iw_rqtpool_destroy(struct c4iw_rdev *rdev); +void c4iw_ocqp_pool_destroy(struct c4iw_rdev *rdev); +void c4iw_destroy_resource(struct c4iw_resource *rscp); +int c4iw_destroy_ctrl_qp(struct c4iw_rdev *rdev); +int c4iw_register_device(struct c4iw_dev *dev); +void c4iw_unregister_device(struct c4iw_dev *dev); +int __init c4iw_cm_init(void); +void c4iw_cm_term(void); +void c4iw_release_dev_ucontext(struct c4iw_rdev *rdev, + struct c4iw_dev_ucontext *uctx); +void c4iw_init_dev_ucontext(struct c4iw_rdev *rdev, + struct c4iw_dev_ucontext *uctx); +int c4iw_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc); +int c4iw_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, + struct ib_send_wr **bad_wr); +int c4iw_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr); +int c4iw_bind_mw(struct ib_qp *qp, struct ib_mw *mw, + struct ib_mw_bind *mw_bind); +int c4iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param); +int c4iw_create_listen(struct iw_cm_id *cm_id, int backlog); +int c4iw_destroy_listen(struct iw_cm_id *cm_id); +int c4iw_accept_cr(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param); +int c4iw_reject_cr(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len); +void c4iw_qp_add_ref(struct ib_qp *qp); +void c4iw_qp_rem_ref(struct ib_qp *qp); +void c4iw_free_fastreg_pbl(struct ib_fast_reg_page_list *page_list); +struct ib_fast_reg_page_list *c4iw_alloc_fastreg_pbl( + struct ib_device *device, + int page_list_len); +struct ib_mr *c4iw_alloc_fast_reg_mr(struct ib_pd *pd, int pbl_depth); +int c4iw_dealloc_mw(struct ib_mw *mw); +struct ib_mw *c4iw_alloc_mw(struct ib_pd *pd, enum ib_mw_type type); +struct ib_mr *c4iw_reg_user_mr(struct ib_pd *pd, u64 start, + u64 length, u64 virt, int acc, + struct ib_udata *udata); +struct ib_mr *c4iw_get_dma_mr(struct ib_pd *pd, int acc); +struct ib_mr *c4iw_register_phys_mem(struct ib_pd *pd, + struct ib_phys_buf *buffer_list, + int num_phys_buf, + int acc, + u64 *iova_start); +int c4iw_reregister_phys_mem(struct ib_mr *mr, + int mr_rereg_mask, + struct ib_pd *pd, + struct ib_phys_buf *buffer_list, + int num_phys_buf, + int acc, u64 *iova_start); +int c4iw_dereg_mr(struct ib_mr *ib_mr); +int c4iw_destroy_cq(struct ib_cq *ib_cq); +struct ib_cq *c4iw_create_cq(struct ib_device *ibdev, int entries, + int vector, + struct ib_ucontext *ib_context, + struct ib_udata *udata); +int c4iw_resize_cq(struct ib_cq *cq, int cqe, struct ib_udata *udata); +int c4iw_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags); +int c4iw_destroy_qp(struct ib_qp *ib_qp); +struct ib_qp *c4iw_create_qp(struct ib_pd *pd, + struct ib_qp_init_attr *attrs, + struct ib_udata *udata); +int c4iw_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata); +int c4iw_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_qp_init_attr *init_attr); +struct ib_qp *c4iw_get_qp(struct ib_device *dev, int qpn); +u32 c4iw_rqtpool_alloc(struct c4iw_rdev *rdev, int size); +void c4iw_rqtpool_free(struct c4iw_rdev *rdev, u32 addr, int size); +u32 c4iw_pblpool_alloc(struct c4iw_rdev *rdev, int size); +void c4iw_pblpool_free(struct c4iw_rdev *rdev, u32 addr, int size); +u32 c4iw_ocqp_pool_alloc(struct c4iw_rdev *rdev, int size); +void c4iw_ocqp_pool_free(struct c4iw_rdev *rdev, u32 addr, int size); +int c4iw_ofld_send(struct c4iw_rdev *rdev, struct sk_buff *skb); +void c4iw_flush_hw_cq(struct c4iw_cq *chp); +void c4iw_count_rcqes(struct t4_cq *cq, struct t4_wq *wq, int *count); +int c4iw_ep_disconnect(struct c4iw_ep *ep, int abrupt, gfp_t gfp); +int c4iw_flush_rq(struct t4_wq *wq, struct t4_cq *cq, int count); +int c4iw_flush_sq(struct c4iw_qp *qhp); +int c4iw_ev_handler(struct c4iw_dev *rnicp, u32 qid); +u16 c4iw_rqes_posted(struct c4iw_qp *qhp); +int c4iw_post_terminate(struct c4iw_qp *qhp, struct t4_cqe *err_cqe); +u32 c4iw_get_cqid(struct c4iw_rdev *rdev, struct c4iw_dev_ucontext *uctx); +void c4iw_put_cqid(struct c4iw_rdev *rdev, u32 qid, + struct c4iw_dev_ucontext *uctx); +u32 c4iw_get_qpid(struct c4iw_rdev *rdev, struct c4iw_dev_ucontext *uctx); +void c4iw_put_qpid(struct c4iw_rdev *rdev, u32 qid, + struct c4iw_dev_ucontext *uctx); +void c4iw_ev_dispatch(struct c4iw_dev *dev, struct t4_cqe *err_cqe); + +extern struct cxgb4_client t4c_client; +extern c4iw_handler_func c4iw_handlers[NUM_CPL_CMDS]; +extern void c4iw_log_wr_stats(struct t4_wq *wq, struct t4_cqe *cqe); +extern int c4iw_wr_log; +extern int db_fc_threshold; +extern int db_coalescing_threshold; +extern int use_dsgl; + + +#endif diff --git a/drivers/infiniband/hw/cxgb4/mem.c b/drivers/infiniband/hw/cxgb4/mem.c new file mode 100644 index 0000000..ec7a298 --- /dev/null +++ b/drivers/infiniband/hw/cxgb4/mem.c @@ -0,0 +1,955 @@ +/* + * Copyright (c) 2009-2010 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include + +#include "iw_cxgb4.h" + +int use_dsgl = 0; +module_param(use_dsgl, int, 0644); +MODULE_PARM_DESC(use_dsgl, "Use DSGL for PBL/FastReg (default=0)"); + +#define T4_ULPTX_MIN_IO 32 +#define C4IW_MAX_INLINE_SIZE 96 +#define T4_ULPTX_MAX_DMA 1024 +#define C4IW_INLINE_THRESHOLD 128 + +static int inline_threshold = C4IW_INLINE_THRESHOLD; +module_param(inline_threshold, int, 0644); +MODULE_PARM_DESC(inline_threshold, "inline vs dsgl threshold (default=128)"); + +static int _c4iw_write_mem_dma_aligned(struct c4iw_rdev *rdev, u32 addr, + u32 len, dma_addr_t data, int wait) +{ + struct sk_buff *skb; + struct ulp_mem_io *req; + struct ulptx_sgl *sgl; + u8 wr_len; + int ret = 0; + struct c4iw_wr_wait wr_wait; + + addr &= 0x7FFFFFF; + + if (wait) + c4iw_init_wr_wait(&wr_wait); + wr_len = roundup(sizeof(*req) + sizeof(*sgl), 16); + + skb = alloc_skb(wr_len, GFP_KERNEL | __GFP_NOFAIL); + if (!skb) + return -ENOMEM; + set_wr_txq(skb, CPL_PRIORITY_CONTROL, 0); + + req = (struct ulp_mem_io *)__skb_put(skb, wr_len); + memset(req, 0, wr_len); + INIT_ULPTX_WR(req, wr_len, 0, 0); + req->wr.wr_hi = cpu_to_be32(FW_WR_OP(FW_ULPTX_WR) | + (wait ? FW_WR_COMPL(1) : 0)); + req->wr.wr_lo = wait ? (__force __be64)(unsigned long) &wr_wait : 0L; + req->wr.wr_mid = cpu_to_be32(FW_WR_LEN16(DIV_ROUND_UP(wr_len, 16))); + req->cmd = cpu_to_be32(ULPTX_CMD(ULP_TX_MEM_WRITE)); + req->cmd |= cpu_to_be32(V_T5_ULP_MEMIO_ORDER(1)); + req->dlen = cpu_to_be32(ULP_MEMIO_DATA_LEN(len>>5)); + req->len16 = cpu_to_be32(DIV_ROUND_UP(wr_len-sizeof(req->wr), 16)); + req->lock_addr = cpu_to_be32(ULP_MEMIO_ADDR(addr)); + + sgl = (struct ulptx_sgl *)(req + 1); + sgl->cmd_nsge = cpu_to_be32(ULPTX_CMD(ULP_TX_SC_DSGL) | + ULPTX_NSGE(1)); + sgl->len0 = cpu_to_be32(len); + sgl->addr0 = cpu_to_be64(data); + + ret = c4iw_ofld_send(rdev, skb); + if (ret) + return ret; + if (wait) + ret = c4iw_wait_for_reply(rdev, &wr_wait, 0, 0, __func__); + return ret; +} + +static int _c4iw_write_mem_inline(struct c4iw_rdev *rdev, u32 addr, u32 len, + void *data) +{ + struct sk_buff *skb; + struct ulp_mem_io *req; + struct ulptx_idata *sc; + u8 wr_len, *to_dp, *from_dp; + int copy_len, num_wqe, i, ret = 0; + struct c4iw_wr_wait wr_wait; + __be32 cmd = cpu_to_be32(ULPTX_CMD(ULP_TX_MEM_WRITE)); + + if (is_t4(rdev->lldi.adapter_type)) + cmd |= cpu_to_be32(ULP_MEMIO_ORDER(1)); + else + cmd |= cpu_to_be32(V_T5_ULP_MEMIO_IMM(1)); + + addr &= 0x7FFFFFF; + PDBG("%s addr 0x%x len %u\n", __func__, addr, len); + num_wqe = DIV_ROUND_UP(len, C4IW_MAX_INLINE_SIZE); + c4iw_init_wr_wait(&wr_wait); + for (i = 0; i < num_wqe; i++) { + + copy_len = len > C4IW_MAX_INLINE_SIZE ? C4IW_MAX_INLINE_SIZE : + len; + wr_len = roundup(sizeof *req + sizeof *sc + + roundup(copy_len, T4_ULPTX_MIN_IO), 16); + + skb = alloc_skb(wr_len, GFP_KERNEL); + if (!skb) + return -ENOMEM; + set_wr_txq(skb, CPL_PRIORITY_CONTROL, 0); + + req = (struct ulp_mem_io *)__skb_put(skb, wr_len); + memset(req, 0, wr_len); + INIT_ULPTX_WR(req, wr_len, 0, 0); + + if (i == (num_wqe-1)) { + req->wr.wr_hi = cpu_to_be32(FW_WR_OP(FW_ULPTX_WR) | + FW_WR_COMPL(1)); + req->wr.wr_lo = (__force __be64)(unsigned long) &wr_wait; + } else + req->wr.wr_hi = cpu_to_be32(FW_WR_OP(FW_ULPTX_WR)); + req->wr.wr_mid = cpu_to_be32( + FW_WR_LEN16(DIV_ROUND_UP(wr_len, 16))); + + req->cmd = cmd; + req->dlen = cpu_to_be32(ULP_MEMIO_DATA_LEN( + DIV_ROUND_UP(copy_len, T4_ULPTX_MIN_IO))); + req->len16 = cpu_to_be32(DIV_ROUND_UP(wr_len-sizeof(req->wr), + 16)); + req->lock_addr = cpu_to_be32(ULP_MEMIO_ADDR(addr + i * 3)); + + sc = (struct ulptx_idata *)(req + 1); + sc->cmd_more = cpu_to_be32(ULPTX_CMD(ULP_TX_SC_IMM)); + sc->len = cpu_to_be32(roundup(copy_len, T4_ULPTX_MIN_IO)); + + to_dp = (u8 *)(sc + 1); + from_dp = (u8 *)data + i * C4IW_MAX_INLINE_SIZE; + if (data) + memcpy(to_dp, from_dp, copy_len); + else + memset(to_dp, 0, copy_len); + if (copy_len % T4_ULPTX_MIN_IO) + memset(to_dp + copy_len, 0, T4_ULPTX_MIN_IO - + (copy_len % T4_ULPTX_MIN_IO)); + ret = c4iw_ofld_send(rdev, skb); + if (ret) + return ret; + len -= C4IW_MAX_INLINE_SIZE; + } + + ret = c4iw_wait_for_reply(rdev, &wr_wait, 0, 0, __func__); + return ret; +} + +static int _c4iw_write_mem_dma(struct c4iw_rdev *rdev, u32 addr, u32 len, void *data) +{ + u32 remain = len; + u32 dmalen; + int ret = 0; + dma_addr_t daddr; + dma_addr_t save; + + daddr = dma_map_single(&rdev->lldi.pdev->dev, data, len, DMA_TO_DEVICE); + if (dma_mapping_error(&rdev->lldi.pdev->dev, daddr)) + return -1; + save = daddr; + + while (remain > inline_threshold) { + if (remain < T4_ULPTX_MAX_DMA) { + if (remain & ~T4_ULPTX_MIN_IO) + dmalen = remain & ~(T4_ULPTX_MIN_IO-1); + else + dmalen = remain; + } else + dmalen = T4_ULPTX_MAX_DMA; + remain -= dmalen; + ret = _c4iw_write_mem_dma_aligned(rdev, addr, dmalen, daddr, + !remain); + if (ret) + goto out; + addr += dmalen >> 5; + data += dmalen; + daddr += dmalen; + } + if (remain) + ret = _c4iw_write_mem_inline(rdev, addr, remain, data); +out: + dma_unmap_single(&rdev->lldi.pdev->dev, save, len, DMA_TO_DEVICE); + return ret; +} + +/* + * write len bytes of data into addr (32B aligned address) + * If data is NULL, clear len byte of memory to zero. + */ +static int write_adapter_mem(struct c4iw_rdev *rdev, u32 addr, u32 len, + void *data) +{ + if (is_t5(rdev->lldi.adapter_type) && use_dsgl) { + if (len > inline_threshold) { + if (_c4iw_write_mem_dma(rdev, addr, len, data)) { + printk_ratelimited(KERN_WARNING + "%s: dma map" + " failure (non fatal)\n", + pci_name(rdev->lldi.pdev)); + return _c4iw_write_mem_inline(rdev, addr, len, + data); + } else + return 0; + } else + return _c4iw_write_mem_inline(rdev, addr, len, data); + } else + return _c4iw_write_mem_inline(rdev, addr, len, data); +} + +/* + * Build and write a TPT entry. + * IN: stag key, pdid, perm, bind_enabled, zbva, to, len, page_size, + * pbl_size and pbl_addr + * OUT: stag index + */ +static int write_tpt_entry(struct c4iw_rdev *rdev, u32 reset_tpt_entry, + u32 *stag, u8 stag_state, u32 pdid, + enum fw_ri_stag_type type, enum fw_ri_mem_perms perm, + int bind_enabled, u32 zbva, u64 to, + u64 len, u8 page_size, u32 pbl_size, u32 pbl_addr) +{ + int err; + struct fw_ri_tpte tpt; + u32 stag_idx; + static atomic_t key; + + if (c4iw_fatal_error(rdev)) + return -EIO; + + stag_state = stag_state > 0; + stag_idx = (*stag) >> 8; + + if ((!reset_tpt_entry) && (*stag == T4_STAG_UNSET)) { + stag_idx = c4iw_get_resource(&rdev->resource.tpt_table); + if (!stag_idx) { + mutex_lock(&rdev->stats.lock); + rdev->stats.stag.fail++; + mutex_unlock(&rdev->stats.lock); + return -ENOMEM; + } + mutex_lock(&rdev->stats.lock); + rdev->stats.stag.cur += 32; + if (rdev->stats.stag.cur > rdev->stats.stag.max) + rdev->stats.stag.max = rdev->stats.stag.cur; + mutex_unlock(&rdev->stats.lock); + *stag = (stag_idx << 8) | (atomic_inc_return(&key) & 0xff); + } + PDBG("%s stag_state 0x%0x type 0x%0x pdid 0x%0x, stag_idx 0x%x\n", + __func__, stag_state, type, pdid, stag_idx); + + /* write TPT entry */ + if (reset_tpt_entry) + memset(&tpt, 0, sizeof(tpt)); + else { + tpt.valid_to_pdid = cpu_to_be32(F_FW_RI_TPTE_VALID | + V_FW_RI_TPTE_STAGKEY((*stag & M_FW_RI_TPTE_STAGKEY)) | + V_FW_RI_TPTE_STAGSTATE(stag_state) | + V_FW_RI_TPTE_STAGTYPE(type) | V_FW_RI_TPTE_PDID(pdid)); + tpt.locread_to_qpid = cpu_to_be32(V_FW_RI_TPTE_PERM(perm) | + (bind_enabled ? F_FW_RI_TPTE_MWBINDEN : 0) | + V_FW_RI_TPTE_ADDRTYPE((zbva ? FW_RI_ZERO_BASED_TO : + FW_RI_VA_BASED_TO))| + V_FW_RI_TPTE_PS(page_size)); + tpt.nosnoop_pbladdr = !pbl_size ? 0 : cpu_to_be32( + V_FW_RI_TPTE_PBLADDR(PBL_OFF(rdev, pbl_addr)>>3)); + tpt.len_lo = cpu_to_be32((u32)(len & 0xffffffffUL)); + tpt.va_hi = cpu_to_be32((u32)(to >> 32)); + tpt.va_lo_fbo = cpu_to_be32((u32)(to & 0xffffffffUL)); + tpt.dca_mwbcnt_pstag = cpu_to_be32(0); + tpt.len_hi = cpu_to_be32((u32)(len >> 32)); + } + err = write_adapter_mem(rdev, stag_idx + + (rdev->lldi.vr->stag.start >> 5), + sizeof(tpt), &tpt); + + if (reset_tpt_entry) { + c4iw_put_resource(&rdev->resource.tpt_table, stag_idx); + mutex_lock(&rdev->stats.lock); + rdev->stats.stag.cur -= 32; + mutex_unlock(&rdev->stats.lock); + } + return err; +} + +static int write_pbl(struct c4iw_rdev *rdev, __be64 *pbl, + u32 pbl_addr, u32 pbl_size) +{ + int err; + + PDBG("%s *pdb_addr 0x%x, pbl_base 0x%x, pbl_size %d\n", + __func__, pbl_addr, rdev->lldi.vr->pbl.start, + pbl_size); + + err = write_adapter_mem(rdev, pbl_addr >> 5, pbl_size << 3, pbl); + return err; +} + +static int dereg_mem(struct c4iw_rdev *rdev, u32 stag, u32 pbl_size, + u32 pbl_addr) +{ + return write_tpt_entry(rdev, 1, &stag, 0, 0, 0, 0, 0, 0, 0UL, 0, 0, + pbl_size, pbl_addr); +} + +static int allocate_window(struct c4iw_rdev *rdev, u32 * stag, u32 pdid) +{ + *stag = T4_STAG_UNSET; + return write_tpt_entry(rdev, 0, stag, 0, pdid, FW_RI_STAG_MW, 0, 0, 0, + 0UL, 0, 0, 0, 0); +} + +static int deallocate_window(struct c4iw_rdev *rdev, u32 stag) +{ + return write_tpt_entry(rdev, 1, &stag, 0, 0, 0, 0, 0, 0, 0UL, 0, 0, 0, + 0); +} + +static int allocate_stag(struct c4iw_rdev *rdev, u32 *stag, u32 pdid, + u32 pbl_size, u32 pbl_addr) +{ + *stag = T4_STAG_UNSET; + return write_tpt_entry(rdev, 0, stag, 0, pdid, FW_RI_STAG_NSMR, 0, 0, 0, + 0UL, 0, 0, pbl_size, pbl_addr); +} + +static int finish_mem_reg(struct c4iw_mr *mhp, u32 stag) +{ + u32 mmid; + + mhp->attr.state = 1; + mhp->attr.stag = stag; + mmid = stag >> 8; + mhp->ibmr.rkey = mhp->ibmr.lkey = stag; + PDBG("%s mmid 0x%x mhp %p\n", __func__, mmid, mhp); + return insert_handle(mhp->rhp, &mhp->rhp->mmidr, mhp, mmid); +} + +static int register_mem(struct c4iw_dev *rhp, struct c4iw_pd *php, + struct c4iw_mr *mhp, int shift) +{ + u32 stag = T4_STAG_UNSET; + int ret; + + ret = write_tpt_entry(&rhp->rdev, 0, &stag, 1, mhp->attr.pdid, + FW_RI_STAG_NSMR, mhp->attr.perms, + mhp->attr.mw_bind_enable, mhp->attr.zbva, + mhp->attr.va_fbo, mhp->attr.len, shift - 12, + mhp->attr.pbl_size, mhp->attr.pbl_addr); + if (ret) + return ret; + + ret = finish_mem_reg(mhp, stag); + if (ret) + dereg_mem(&rhp->rdev, mhp->attr.stag, mhp->attr.pbl_size, + mhp->attr.pbl_addr); + return ret; +} + +static int reregister_mem(struct c4iw_dev *rhp, struct c4iw_pd *php, + struct c4iw_mr *mhp, int shift, int npages) +{ + u32 stag; + int ret; + + if (npages > mhp->attr.pbl_size) + return -ENOMEM; + + stag = mhp->attr.stag; + ret = write_tpt_entry(&rhp->rdev, 0, &stag, 1, mhp->attr.pdid, + FW_RI_STAG_NSMR, mhp->attr.perms, + mhp->attr.mw_bind_enable, mhp->attr.zbva, + mhp->attr.va_fbo, mhp->attr.len, shift - 12, + mhp->attr.pbl_size, mhp->attr.pbl_addr); + if (ret) + return ret; + + ret = finish_mem_reg(mhp, stag); + if (ret) + dereg_mem(&rhp->rdev, mhp->attr.stag, mhp->attr.pbl_size, + mhp->attr.pbl_addr); + + return ret; +} + +static int alloc_pbl(struct c4iw_mr *mhp, int npages) +{ + mhp->attr.pbl_addr = c4iw_pblpool_alloc(&mhp->rhp->rdev, + npages << 3); + + if (!mhp->attr.pbl_addr) + return -ENOMEM; + + mhp->attr.pbl_size = npages; + + return 0; +} + +static int build_phys_page_list(struct ib_phys_buf *buffer_list, + int num_phys_buf, u64 *iova_start, + u64 *total_size, int *npages, + int *shift, __be64 **page_list) +{ + u64 mask; + int i, j, n; + + mask = 0; + *total_size = 0; + for (i = 0; i < num_phys_buf; ++i) { + if (i != 0 && buffer_list[i].addr & ~PAGE_MASK) + return -EINVAL; + if (i != 0 && i != num_phys_buf - 1 && + (buffer_list[i].size & ~PAGE_MASK)) + return -EINVAL; + *total_size += buffer_list[i].size; + if (i > 0) + mask |= buffer_list[i].addr; + else + mask |= buffer_list[i].addr & PAGE_MASK; + if (i != num_phys_buf - 1) + mask |= buffer_list[i].addr + buffer_list[i].size; + else + mask |= (buffer_list[i].addr + buffer_list[i].size + + PAGE_SIZE - 1) & PAGE_MASK; + } + + if (*total_size > 0xFFFFFFFFULL) + return -ENOMEM; + + /* Find largest page shift we can use to cover buffers */ + for (*shift = PAGE_SHIFT; *shift < 27; ++(*shift)) + if ((1ULL << *shift) & mask) + break; + + buffer_list[0].size += buffer_list[0].addr & ((1ULL << *shift) - 1); + buffer_list[0].addr &= ~0ull << *shift; + + *npages = 0; + for (i = 0; i < num_phys_buf; ++i) + *npages += (buffer_list[i].size + + (1ULL << *shift) - 1) >> *shift; + + if (!*npages) + return -EINVAL; + + *page_list = kmalloc(sizeof(u64) * *npages, GFP_KERNEL); + if (!*page_list) + return -ENOMEM; + + n = 0; + for (i = 0; i < num_phys_buf; ++i) + for (j = 0; + j < (buffer_list[i].size + (1ULL << *shift) - 1) >> *shift; + ++j) + (*page_list)[n++] = cpu_to_be64(buffer_list[i].addr + + ((u64) j << *shift)); + + PDBG("%s va 0x%llx mask 0x%llx shift %d len %lld pbl_size %d\n", + __func__, (unsigned long long)*iova_start, + (unsigned long long)mask, *shift, (unsigned long long)*total_size, + *npages); + + return 0; + +} + +int c4iw_reregister_phys_mem(struct ib_mr *mr, int mr_rereg_mask, + struct ib_pd *pd, struct ib_phys_buf *buffer_list, + int num_phys_buf, int acc, u64 *iova_start) +{ + + struct c4iw_mr mh, *mhp; + struct c4iw_pd *php; + struct c4iw_dev *rhp; + __be64 *page_list = NULL; + int shift = 0; + u64 total_size; + int npages; + int ret; + + PDBG("%s ib_mr %p ib_pd %p\n", __func__, mr, pd); + + /* There can be no memory windows */ + if (atomic_read(&mr->usecnt)) + return -EINVAL; + + mhp = to_c4iw_mr(mr); + rhp = mhp->rhp; + php = to_c4iw_pd(mr->pd); + + /* make sure we are on the same adapter */ + if (rhp != php->rhp) + return -EINVAL; + + memcpy(&mh, mhp, sizeof *mhp); + + if (mr_rereg_mask & IB_MR_REREG_PD) + php = to_c4iw_pd(pd); + if (mr_rereg_mask & IB_MR_REREG_ACCESS) { + mh.attr.perms = c4iw_ib_to_tpt_access(acc); + mh.attr.mw_bind_enable = (acc & IB_ACCESS_MW_BIND) == + IB_ACCESS_MW_BIND; + } + if (mr_rereg_mask & IB_MR_REREG_TRANS) { + ret = build_phys_page_list(buffer_list, num_phys_buf, + iova_start, + &total_size, &npages, + &shift, &page_list); + if (ret) + return ret; + } + + ret = reregister_mem(rhp, php, &mh, shift, npages); + kfree(page_list); + if (ret) + return ret; + if (mr_rereg_mask & IB_MR_REREG_PD) + mhp->attr.pdid = php->pdid; + if (mr_rereg_mask & IB_MR_REREG_ACCESS) + mhp->attr.perms = c4iw_ib_to_tpt_access(acc); + if (mr_rereg_mask & IB_MR_REREG_TRANS) { + mhp->attr.zbva = 0; + mhp->attr.va_fbo = *iova_start; + mhp->attr.page_size = shift - 12; + mhp->attr.len = (u32) total_size; + mhp->attr.pbl_size = npages; + } + + return 0; +} + +struct ib_mr *c4iw_register_phys_mem(struct ib_pd *pd, + struct ib_phys_buf *buffer_list, + int num_phys_buf, int acc, u64 *iova_start) +{ + __be64 *page_list; + int shift; + u64 total_size; + int npages; + struct c4iw_dev *rhp; + struct c4iw_pd *php; + struct c4iw_mr *mhp; + int ret; + + PDBG("%s ib_pd %p\n", __func__, pd); + php = to_c4iw_pd(pd); + rhp = php->rhp; + + mhp = kzalloc(sizeof(*mhp), GFP_KERNEL); + if (!mhp) + return ERR_PTR(-ENOMEM); + + mhp->rhp = rhp; + + /* First check that we have enough alignment */ + if ((*iova_start & ~PAGE_MASK) != (buffer_list[0].addr & ~PAGE_MASK)) { + ret = -EINVAL; + goto err; + } + + if (num_phys_buf > 1 && + ((buffer_list[0].addr + buffer_list[0].size) & ~PAGE_MASK)) { + ret = -EINVAL; + goto err; + } + + ret = build_phys_page_list(buffer_list, num_phys_buf, iova_start, + &total_size, &npages, &shift, + &page_list); + if (ret) + goto err; + + ret = alloc_pbl(mhp, npages); + if (ret) { + kfree(page_list); + goto err; + } + + ret = write_pbl(&mhp->rhp->rdev, page_list, mhp->attr.pbl_addr, + npages); + kfree(page_list); + if (ret) + goto err_pbl; + + mhp->attr.pdid = php->pdid; + mhp->attr.zbva = 0; + + mhp->attr.perms = c4iw_ib_to_tpt_access(acc); + mhp->attr.va_fbo = *iova_start; + mhp->attr.page_size = shift - 12; + + mhp->attr.len = (u32) total_size; + mhp->attr.pbl_size = npages; + ret = register_mem(rhp, php, mhp, shift); + if (ret) + goto err_pbl; + + return &mhp->ibmr; + +err_pbl: + c4iw_pblpool_free(&mhp->rhp->rdev, mhp->attr.pbl_addr, + mhp->attr.pbl_size << 3); + +err: + kfree(mhp); + return ERR_PTR(ret); + +} + +struct ib_mr *c4iw_get_dma_mr(struct ib_pd *pd, int acc) +{ + struct c4iw_dev *rhp; + struct c4iw_pd *php; + struct c4iw_mr *mhp; + int ret; + u32 stag = T4_STAG_UNSET; + + PDBG("%s ib_pd %p\n", __func__, pd); + php = to_c4iw_pd(pd); + rhp = php->rhp; + + mhp = kzalloc(sizeof(*mhp), GFP_KERNEL); + if (!mhp) + return ERR_PTR(-ENOMEM); + + mhp->rhp = rhp; + mhp->attr.pdid = php->pdid; + mhp->attr.perms = c4iw_ib_to_tpt_access(acc); + mhp->attr.mw_bind_enable = (acc&IB_ACCESS_MW_BIND) == IB_ACCESS_MW_BIND; + mhp->attr.zbva = 0; + mhp->attr.va_fbo = 0; + mhp->attr.page_size = 0; + mhp->attr.len = ~0UL; + mhp->attr.pbl_size = 0; + + ret = write_tpt_entry(&rhp->rdev, 0, &stag, 1, php->pdid, + FW_RI_STAG_NSMR, mhp->attr.perms, + mhp->attr.mw_bind_enable, 0, 0, ~0UL, 0, 0, 0); + if (ret) + goto err1; + + ret = finish_mem_reg(mhp, stag); + if (ret) + goto err2; + return &mhp->ibmr; +err2: + dereg_mem(&rhp->rdev, mhp->attr.stag, mhp->attr.pbl_size, + mhp->attr.pbl_addr); +err1: + kfree(mhp); + return ERR_PTR(ret); +} + +struct ib_mr *c4iw_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, + u64 virt, int acc, struct ib_udata *udata) +{ + __be64 *pages; + int shift, n, len; + int i, k, entry; + int err = 0; + struct scatterlist *sg; + struct c4iw_dev *rhp; + struct c4iw_pd *php; + struct c4iw_mr *mhp; + + PDBG("%s ib_pd %p\n", __func__, pd); + + if (length == ~0ULL) + return ERR_PTR(-EINVAL); + + if ((length + start) < start) + return ERR_PTR(-EINVAL); + + php = to_c4iw_pd(pd); + rhp = php->rhp; + mhp = kzalloc(sizeof(*mhp), GFP_KERNEL); + if (!mhp) + return ERR_PTR(-ENOMEM); + + mhp->rhp = rhp; + + mhp->umem = ib_umem_get(pd->uobject->context, start, length, acc, 0); + if (IS_ERR(mhp->umem)) { + err = PTR_ERR(mhp->umem); + kfree(mhp); + return ERR_PTR(err); + } + + shift = ffs(mhp->umem->page_size) - 1; + + n = mhp->umem->nmap; + err = alloc_pbl(mhp, n); + if (err) + goto err; + + pages = (__be64 *) __get_free_page(GFP_KERNEL); + if (!pages) { + err = -ENOMEM; + goto err_pbl; + } + + i = n = 0; + + for_each_sg(mhp->umem->sg_head.sgl, sg, mhp->umem->nmap, entry) { + len = sg_dma_len(sg) >> shift; + for (k = 0; k < len; ++k) { + pages[i++] = cpu_to_be64(sg_dma_address(sg) + + mhp->umem->page_size * k); + if (i == PAGE_SIZE / sizeof *pages) { + err = write_pbl(&mhp->rhp->rdev, + pages, + mhp->attr.pbl_addr + (n << 3), i); + if (err) + goto pbl_done; + n += i; + i = 0; + } + } + } + + if (i) + err = write_pbl(&mhp->rhp->rdev, pages, + mhp->attr.pbl_addr + (n << 3), i); + +pbl_done: + free_page((unsigned long) pages); + if (err) + goto err_pbl; + + mhp->attr.pdid = php->pdid; + mhp->attr.zbva = 0; + mhp->attr.perms = c4iw_ib_to_tpt_access(acc); + mhp->attr.va_fbo = virt; + mhp->attr.page_size = shift - 12; + mhp->attr.len = length; + + err = register_mem(rhp, php, mhp, shift); + if (err) + goto err_pbl; + + return &mhp->ibmr; + +err_pbl: + c4iw_pblpool_free(&mhp->rhp->rdev, mhp->attr.pbl_addr, + mhp->attr.pbl_size << 3); + +err: + ib_umem_release(mhp->umem); + kfree(mhp); + return ERR_PTR(err); +} + +struct ib_mw *c4iw_alloc_mw(struct ib_pd *pd, enum ib_mw_type type) +{ + struct c4iw_dev *rhp; + struct c4iw_pd *php; + struct c4iw_mw *mhp; + u32 mmid; + u32 stag = 0; + int ret; + + if (type != IB_MW_TYPE_1) + return ERR_PTR(-EINVAL); + + php = to_c4iw_pd(pd); + rhp = php->rhp; + mhp = kzalloc(sizeof(*mhp), GFP_KERNEL); + if (!mhp) + return ERR_PTR(-ENOMEM); + ret = allocate_window(&rhp->rdev, &stag, php->pdid); + if (ret) { + kfree(mhp); + return ERR_PTR(ret); + } + mhp->rhp = rhp; + mhp->attr.pdid = php->pdid; + mhp->attr.type = FW_RI_STAG_MW; + mhp->attr.stag = stag; + mmid = (stag) >> 8; + mhp->ibmw.rkey = stag; + if (insert_handle(rhp, &rhp->mmidr, mhp, mmid)) { + deallocate_window(&rhp->rdev, mhp->attr.stag); + kfree(mhp); + return ERR_PTR(-ENOMEM); + } + PDBG("%s mmid 0x%x mhp %p stag 0x%x\n", __func__, mmid, mhp, stag); + return &(mhp->ibmw); +} + +int c4iw_dealloc_mw(struct ib_mw *mw) +{ + struct c4iw_dev *rhp; + struct c4iw_mw *mhp; + u32 mmid; + + mhp = to_c4iw_mw(mw); + rhp = mhp->rhp; + mmid = (mw->rkey) >> 8; + remove_handle(rhp, &rhp->mmidr, mmid); + deallocate_window(&rhp->rdev, mhp->attr.stag); + kfree(mhp); + PDBG("%s ib_mw %p mmid 0x%x ptr %p\n", __func__, mw, mmid, mhp); + return 0; +} + +struct ib_mr *c4iw_alloc_fast_reg_mr(struct ib_pd *pd, int pbl_depth) +{ + struct c4iw_dev *rhp; + struct c4iw_pd *php; + struct c4iw_mr *mhp; + u32 mmid; + u32 stag = 0; + int ret = 0; + + php = to_c4iw_pd(pd); + rhp = php->rhp; + mhp = kzalloc(sizeof(*mhp), GFP_KERNEL); + if (!mhp) { + ret = -ENOMEM; + goto err; + } + + mhp->rhp = rhp; + ret = alloc_pbl(mhp, pbl_depth); + if (ret) + goto err1; + mhp->attr.pbl_size = pbl_depth; + ret = allocate_stag(&rhp->rdev, &stag, php->pdid, + mhp->attr.pbl_size, mhp->attr.pbl_addr); + if (ret) + goto err2; + mhp->attr.pdid = php->pdid; + mhp->attr.type = FW_RI_STAG_NSMR; + mhp->attr.stag = stag; + mhp->attr.state = 1; + mmid = (stag) >> 8; + mhp->ibmr.rkey = mhp->ibmr.lkey = stag; + if (insert_handle(rhp, &rhp->mmidr, mhp, mmid)) { + ret = -ENOMEM; + goto err3; + } + + PDBG("%s mmid 0x%x mhp %p stag 0x%x\n", __func__, mmid, mhp, stag); + return &(mhp->ibmr); +err3: + dereg_mem(&rhp->rdev, stag, mhp->attr.pbl_size, + mhp->attr.pbl_addr); +err2: + c4iw_pblpool_free(&mhp->rhp->rdev, mhp->attr.pbl_addr, + mhp->attr.pbl_size << 3); +err1: + kfree(mhp); +err: + return ERR_PTR(ret); +} + +struct ib_fast_reg_page_list *c4iw_alloc_fastreg_pbl(struct ib_device *device, + int page_list_len) +{ + struct c4iw_fr_page_list *c4pl; + struct c4iw_dev *dev = to_c4iw_dev(device); + dma_addr_t dma_addr; + int pll_len = roundup(page_list_len * sizeof(u64), 32); + + c4pl = kmalloc(sizeof(*c4pl), GFP_KERNEL); + if (!c4pl) + return ERR_PTR(-ENOMEM); + + c4pl->ibpl.page_list = dma_alloc_coherent(&dev->rdev.lldi.pdev->dev, + pll_len, &dma_addr, + GFP_KERNEL); + if (!c4pl->ibpl.page_list) { + kfree(c4pl); + return ERR_PTR(-ENOMEM); + } + dma_unmap_addr_set(c4pl, mapping, dma_addr); + c4pl->dma_addr = dma_addr; + c4pl->dev = dev; + c4pl->pll_len = pll_len; + + PDBG("%s c4pl %p pll_len %u page_list %p dma_addr %pad\n", + __func__, c4pl, c4pl->pll_len, c4pl->ibpl.page_list, + &c4pl->dma_addr); + + return &c4pl->ibpl; +} + +void c4iw_free_fastreg_pbl(struct ib_fast_reg_page_list *ibpl) +{ + struct c4iw_fr_page_list *c4pl = to_c4iw_fr_page_list(ibpl); + + PDBG("%s c4pl %p pll_len %u page_list %p dma_addr %pad\n", + __func__, c4pl, c4pl->pll_len, c4pl->ibpl.page_list, + &c4pl->dma_addr); + + dma_free_coherent(&c4pl->dev->rdev.lldi.pdev->dev, + c4pl->pll_len, + c4pl->ibpl.page_list, dma_unmap_addr(c4pl, mapping)); + kfree(c4pl); +} + +int c4iw_dereg_mr(struct ib_mr *ib_mr) +{ + struct c4iw_dev *rhp; + struct c4iw_mr *mhp; + u32 mmid; + + PDBG("%s ib_mr %p\n", __func__, ib_mr); + /* There can be no memory windows */ + if (atomic_read(&ib_mr->usecnt)) + return -EINVAL; + + mhp = to_c4iw_mr(ib_mr); + rhp = mhp->rhp; + mmid = mhp->attr.stag >> 8; + remove_handle(rhp, &rhp->mmidr, mmid); + dereg_mem(&rhp->rdev, mhp->attr.stag, mhp->attr.pbl_size, + mhp->attr.pbl_addr); + if (mhp->attr.pbl_size) + c4iw_pblpool_free(&mhp->rhp->rdev, mhp->attr.pbl_addr, + mhp->attr.pbl_size << 3); + if (mhp->kva) + kfree((void *) (unsigned long) mhp->kva); + if (mhp->umem) + ib_umem_release(mhp->umem); + PDBG("%s mmid 0x%x ptr %p\n", __func__, mmid, mhp); + kfree(mhp); + return 0; +} diff --git a/drivers/infiniband/hw/cxgb4/provider.c b/drivers/infiniband/hw/cxgb4/provider.c new file mode 100644 index 0000000..72e3b69 --- /dev/null +++ b/drivers/infiniband/hw/cxgb4/provider.c @@ -0,0 +1,588 @@ +/* + * Copyright (c) 2009-2010 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include + +#include "iw_cxgb4.h" + +static int fastreg_support = 1; +module_param(fastreg_support, int, 0644); +MODULE_PARM_DESC(fastreg_support, "Advertise fastreg support (default=1)"); + +static struct ib_ah *c4iw_ah_create(struct ib_pd *pd, + struct ib_ah_attr *ah_attr) +{ + return ERR_PTR(-ENOSYS); +} + +static int c4iw_ah_destroy(struct ib_ah *ah) +{ + return -ENOSYS; +} + +static int c4iw_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) +{ + return -ENOSYS; +} + +static int c4iw_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) +{ + return -ENOSYS; +} + +static int c4iw_process_mad(struct ib_device *ibdev, int mad_flags, + u8 port_num, struct ib_wc *in_wc, + struct ib_grh *in_grh, struct ib_mad *in_mad, + struct ib_mad *out_mad) +{ + return -ENOSYS; +} + +static int c4iw_dealloc_ucontext(struct ib_ucontext *context) +{ + struct c4iw_dev *rhp = to_c4iw_dev(context->device); + struct c4iw_ucontext *ucontext = to_c4iw_ucontext(context); + struct c4iw_mm_entry *mm, *tmp; + + PDBG("%s context %p\n", __func__, context); + list_for_each_entry_safe(mm, tmp, &ucontext->mmaps, entry) + kfree(mm); + c4iw_release_dev_ucontext(&rhp->rdev, &ucontext->uctx); + kfree(ucontext); + return 0; +} + +static struct ib_ucontext *c4iw_alloc_ucontext(struct ib_device *ibdev, + struct ib_udata *udata) +{ + struct c4iw_ucontext *context; + struct c4iw_dev *rhp = to_c4iw_dev(ibdev); + static int warned; + struct c4iw_alloc_ucontext_resp uresp; + int ret = 0; + struct c4iw_mm_entry *mm = NULL; + + PDBG("%s ibdev %p\n", __func__, ibdev); + context = kzalloc(sizeof(*context), GFP_KERNEL); + if (!context) { + ret = -ENOMEM; + goto err; + } + + c4iw_init_dev_ucontext(&rhp->rdev, &context->uctx); + INIT_LIST_HEAD(&context->mmaps); + spin_lock_init(&context->mmap_lock); + + if (udata->outlen < sizeof(uresp) - sizeof(uresp.reserved)) { + if (!warned++) + pr_err(MOD "Warning - downlevel libcxgb4 (non-fatal), device status page disabled."); + rhp->rdev.flags |= T4_STATUS_PAGE_DISABLED; + } else { + mm = kmalloc(sizeof(*mm), GFP_KERNEL); + if (!mm) { + ret = -ENOMEM; + goto err_free; + } + + uresp.status_page_size = PAGE_SIZE; + + spin_lock(&context->mmap_lock); + uresp.status_page_key = context->key; + context->key += PAGE_SIZE; + spin_unlock(&context->mmap_lock); + + ret = ib_copy_to_udata(udata, &uresp, + sizeof(uresp) - sizeof(uresp.reserved)); + if (ret) + goto err_mm; + + mm->key = uresp.status_page_key; + mm->addr = virt_to_phys(rhp->rdev.status_page); + mm->len = PAGE_SIZE; + insert_mmap(context, mm); + } + return &context->ibucontext; +err_mm: + kfree(mm); +err_free: + kfree(context); +err: + return ERR_PTR(ret); +} + +static int c4iw_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) +{ + int len = vma->vm_end - vma->vm_start; + u32 key = vma->vm_pgoff << PAGE_SHIFT; + struct c4iw_rdev *rdev; + int ret = 0; + struct c4iw_mm_entry *mm; + struct c4iw_ucontext *ucontext; + u64 addr; + + PDBG("%s pgoff 0x%lx key 0x%x len %d\n", __func__, vma->vm_pgoff, + key, len); + + if (vma->vm_start & (PAGE_SIZE-1)) + return -EINVAL; + + rdev = &(to_c4iw_dev(context->device)->rdev); + ucontext = to_c4iw_ucontext(context); + + mm = remove_mmap(ucontext, key, len); + if (!mm) + return -EINVAL; + addr = mm->addr; + kfree(mm); + + if ((addr >= pci_resource_start(rdev->lldi.pdev, 0)) && + (addr < (pci_resource_start(rdev->lldi.pdev, 0) + + pci_resource_len(rdev->lldi.pdev, 0)))) { + + /* + * MA_SYNC register... + */ + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + ret = io_remap_pfn_range(vma, vma->vm_start, + addr >> PAGE_SHIFT, + len, vma->vm_page_prot); + } else if ((addr >= pci_resource_start(rdev->lldi.pdev, 2)) && + (addr < (pci_resource_start(rdev->lldi.pdev, 2) + + pci_resource_len(rdev->lldi.pdev, 2)))) { + + /* + * Map user DB or OCQP memory... + */ + if (addr >= rdev->oc_mw_pa) + vma->vm_page_prot = t4_pgprot_wc(vma->vm_page_prot); + else { + if (is_t5(rdev->lldi.adapter_type)) + vma->vm_page_prot = + t4_pgprot_wc(vma->vm_page_prot); + else + vma->vm_page_prot = + pgprot_noncached(vma->vm_page_prot); + } + ret = io_remap_pfn_range(vma, vma->vm_start, + addr >> PAGE_SHIFT, + len, vma->vm_page_prot); + } else { + + /* + * Map WQ or CQ contig dma memory... + */ + ret = remap_pfn_range(vma, vma->vm_start, + addr >> PAGE_SHIFT, + len, vma->vm_page_prot); + } + + return ret; +} + +static int c4iw_deallocate_pd(struct ib_pd *pd) +{ + struct c4iw_dev *rhp; + struct c4iw_pd *php; + + php = to_c4iw_pd(pd); + rhp = php->rhp; + PDBG("%s ibpd %p pdid 0x%x\n", __func__, pd, php->pdid); + c4iw_put_resource(&rhp->rdev.resource.pdid_table, php->pdid); + mutex_lock(&rhp->rdev.stats.lock); + rhp->rdev.stats.pd.cur--; + mutex_unlock(&rhp->rdev.stats.lock); + kfree(php); + return 0; +} + +static struct ib_pd *c4iw_allocate_pd(struct ib_device *ibdev, + struct ib_ucontext *context, + struct ib_udata *udata) +{ + struct c4iw_pd *php; + u32 pdid; + struct c4iw_dev *rhp; + + PDBG("%s ibdev %p\n", __func__, ibdev); + rhp = (struct c4iw_dev *) ibdev; + pdid = c4iw_get_resource(&rhp->rdev.resource.pdid_table); + if (!pdid) + return ERR_PTR(-EINVAL); + php = kzalloc(sizeof(*php), GFP_KERNEL); + if (!php) { + c4iw_put_resource(&rhp->rdev.resource.pdid_table, pdid); + return ERR_PTR(-ENOMEM); + } + php->pdid = pdid; + php->rhp = rhp; + if (context) { + if (ib_copy_to_udata(udata, &php->pdid, sizeof(u32))) { + c4iw_deallocate_pd(&php->ibpd); + return ERR_PTR(-EFAULT); + } + } + mutex_lock(&rhp->rdev.stats.lock); + rhp->rdev.stats.pd.cur++; + if (rhp->rdev.stats.pd.cur > rhp->rdev.stats.pd.max) + rhp->rdev.stats.pd.max = rhp->rdev.stats.pd.cur; + mutex_unlock(&rhp->rdev.stats.lock); + PDBG("%s pdid 0x%0x ptr 0x%p\n", __func__, pdid, php); + return &php->ibpd; +} + +static int c4iw_query_pkey(struct ib_device *ibdev, u8 port, u16 index, + u16 *pkey) +{ + PDBG("%s ibdev %p\n", __func__, ibdev); + *pkey = 0; + return 0; +} + +static int c4iw_query_gid(struct ib_device *ibdev, u8 port, int index, + union ib_gid *gid) +{ + struct c4iw_dev *dev; + + PDBG("%s ibdev %p, port %d, index %d, gid %p\n", + __func__, ibdev, port, index, gid); + dev = to_c4iw_dev(ibdev); + BUG_ON(port == 0); + memset(&(gid->raw[0]), 0, sizeof(gid->raw)); + memcpy(&(gid->raw[0]), dev->rdev.lldi.ports[port-1]->dev_addr, 6); + return 0; +} + +static int c4iw_query_device(struct ib_device *ibdev, + struct ib_device_attr *props) +{ + + struct c4iw_dev *dev; + PDBG("%s ibdev %p\n", __func__, ibdev); + + dev = to_c4iw_dev(ibdev); + memset(props, 0, sizeof *props); + memcpy(&props->sys_image_guid, dev->rdev.lldi.ports[0]->dev_addr, 6); + props->hw_ver = CHELSIO_CHIP_RELEASE(dev->rdev.lldi.adapter_type); + props->fw_ver = dev->rdev.lldi.fw_vers; + props->device_cap_flags = dev->device_cap_flags; + props->page_size_cap = T4_PAGESIZE_MASK; + props->vendor_id = (u32)dev->rdev.lldi.pdev->vendor; + props->vendor_part_id = (u32)dev->rdev.lldi.pdev->device; + props->max_mr_size = T4_MAX_MR_SIZE; + props->max_qp = dev->rdev.lldi.vr->qp.size / 2; + props->max_qp_wr = dev->rdev.hw_queue.t4_max_qp_depth; + props->max_sge = T4_MAX_RECV_SGE; + props->max_sge_rd = 1; + props->max_res_rd_atom = dev->rdev.lldi.max_ird_adapter; + props->max_qp_rd_atom = min(dev->rdev.lldi.max_ordird_qp, + c4iw_max_read_depth); + props->max_qp_init_rd_atom = props->max_qp_rd_atom; + props->max_cq = dev->rdev.lldi.vr->qp.size; + props->max_cqe = dev->rdev.hw_queue.t4_max_cq_depth; + props->max_mr = c4iw_num_stags(&dev->rdev); + props->max_pd = T4_MAX_NUM_PD; + props->local_ca_ack_delay = 0; + props->max_fast_reg_page_list_len = t4_max_fr_depth(use_dsgl); + + return 0; +} + +static int c4iw_query_port(struct ib_device *ibdev, u8 port, + struct ib_port_attr *props) +{ + struct c4iw_dev *dev; + struct net_device *netdev; + struct in_device *inetdev; + + PDBG("%s ibdev %p\n", __func__, ibdev); + + dev = to_c4iw_dev(ibdev); + netdev = dev->rdev.lldi.ports[port-1]; + + memset(props, 0, sizeof(struct ib_port_attr)); + props->max_mtu = IB_MTU_4096; + if (netdev->mtu >= 4096) + props->active_mtu = IB_MTU_4096; + else if (netdev->mtu >= 2048) + props->active_mtu = IB_MTU_2048; + else if (netdev->mtu >= 1024) + props->active_mtu = IB_MTU_1024; + else if (netdev->mtu >= 512) + props->active_mtu = IB_MTU_512; + else + props->active_mtu = IB_MTU_256; + + if (!netif_carrier_ok(netdev)) + props->state = IB_PORT_DOWN; + else { + inetdev = in_dev_get(netdev); + if (inetdev) { + if (inetdev->ifa_list) + props->state = IB_PORT_ACTIVE; + else + props->state = IB_PORT_INIT; + in_dev_put(inetdev); + } else + props->state = IB_PORT_INIT; + } + + props->port_cap_flags = + IB_PORT_CM_SUP | + IB_PORT_SNMP_TUNNEL_SUP | + IB_PORT_REINIT_SUP | + IB_PORT_DEVICE_MGMT_SUP | + IB_PORT_VENDOR_CLASS_SUP | IB_PORT_BOOT_MGMT_SUP; + props->gid_tbl_len = 1; + props->pkey_tbl_len = 1; + props->active_width = 2; + props->active_speed = IB_SPEED_DDR; + props->max_msg_sz = -1; + + return 0; +} + +static ssize_t show_rev(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct c4iw_dev *c4iw_dev = container_of(dev, struct c4iw_dev, + ibdev.dev); + PDBG("%s dev 0x%p\n", __func__, dev); + return sprintf(buf, "%d\n", + CHELSIO_CHIP_RELEASE(c4iw_dev->rdev.lldi.adapter_type)); +} + +static ssize_t show_fw_ver(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct c4iw_dev *c4iw_dev = container_of(dev, struct c4iw_dev, + ibdev.dev); + PDBG("%s dev 0x%p\n", __func__, dev); + + return sprintf(buf, "%u.%u.%u.%u\n", + FW_HDR_FW_VER_MAJOR_GET(c4iw_dev->rdev.lldi.fw_vers), + FW_HDR_FW_VER_MINOR_GET(c4iw_dev->rdev.lldi.fw_vers), + FW_HDR_FW_VER_MICRO_GET(c4iw_dev->rdev.lldi.fw_vers), + FW_HDR_FW_VER_BUILD_GET(c4iw_dev->rdev.lldi.fw_vers)); +} + +static ssize_t show_hca(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct c4iw_dev *c4iw_dev = container_of(dev, struct c4iw_dev, + ibdev.dev); + struct ethtool_drvinfo info; + struct net_device *lldev = c4iw_dev->rdev.lldi.ports[0]; + + PDBG("%s dev 0x%p\n", __func__, dev); + lldev->ethtool_ops->get_drvinfo(lldev, &info); + return sprintf(buf, "%s\n", info.driver); +} + +static ssize_t show_board(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct c4iw_dev *c4iw_dev = container_of(dev, struct c4iw_dev, + ibdev.dev); + PDBG("%s dev 0x%p\n", __func__, dev); + return sprintf(buf, "%x.%x\n", c4iw_dev->rdev.lldi.pdev->vendor, + c4iw_dev->rdev.lldi.pdev->device); +} + +static int c4iw_get_mib(struct ib_device *ibdev, + union rdma_protocol_stats *stats) +{ + struct tp_tcp_stats v4, v6; + struct c4iw_dev *c4iw_dev = to_c4iw_dev(ibdev); + + cxgb4_get_tcp_stats(c4iw_dev->rdev.lldi.pdev, &v4, &v6); + memset(stats, 0, sizeof *stats); + stats->iw.tcpInSegs = v4.tcpInSegs + v6.tcpInSegs; + stats->iw.tcpOutSegs = v4.tcpOutSegs + v6.tcpOutSegs; + stats->iw.tcpRetransSegs = v4.tcpRetransSegs + v6.tcpRetransSegs; + stats->iw.tcpOutRsts = v4.tcpOutRsts + v6.tcpOutSegs; + + return 0; +} + +static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL); +static DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL); +static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL); +static DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL); + +static struct device_attribute *c4iw_class_attributes[] = { + &dev_attr_hw_rev, + &dev_attr_fw_ver, + &dev_attr_hca_type, + &dev_attr_board_id, +}; + +int c4iw_register_device(struct c4iw_dev *dev) +{ + int ret; + int i; + + PDBG("%s c4iw_dev %p\n", __func__, dev); + BUG_ON(!dev->rdev.lldi.ports[0]); + strlcpy(dev->ibdev.name, "cxgb4_%d", IB_DEVICE_NAME_MAX); + memset(&dev->ibdev.node_guid, 0, sizeof(dev->ibdev.node_guid)); + memcpy(&dev->ibdev.node_guid, dev->rdev.lldi.ports[0]->dev_addr, 6); + dev->ibdev.owner = THIS_MODULE; + dev->device_cap_flags = IB_DEVICE_LOCAL_DMA_LKEY | IB_DEVICE_MEM_WINDOW; + if (fastreg_support) + dev->device_cap_flags |= IB_DEVICE_MEM_MGT_EXTENSIONS; + dev->ibdev.local_dma_lkey = 0; + dev->ibdev.uverbs_cmd_mask = + (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | + (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) | + (1ull << IB_USER_VERBS_CMD_QUERY_PORT) | + (1ull << IB_USER_VERBS_CMD_ALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_REG_MR) | + (1ull << IB_USER_VERBS_CMD_DEREG_MR) | + (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | + (1ull << IB_USER_VERBS_CMD_CREATE_CQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) | + (1ull << IB_USER_VERBS_CMD_REQ_NOTIFY_CQ) | + (1ull << IB_USER_VERBS_CMD_CREATE_QP) | + (1ull << IB_USER_VERBS_CMD_MODIFY_QP) | + (1ull << IB_USER_VERBS_CMD_QUERY_QP) | + (1ull << IB_USER_VERBS_CMD_POLL_CQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_QP) | + (1ull << IB_USER_VERBS_CMD_POST_SEND) | + (1ull << IB_USER_VERBS_CMD_POST_RECV); + dev->ibdev.node_type = RDMA_NODE_RNIC; + memcpy(dev->ibdev.node_desc, C4IW_NODE_DESC, sizeof(C4IW_NODE_DESC)); + dev->ibdev.phys_port_cnt = dev->rdev.lldi.nports; + dev->ibdev.num_comp_vectors = dev->rdev.lldi.nciq; + dev->ibdev.dma_device = &(dev->rdev.lldi.pdev->dev); + dev->ibdev.query_device = c4iw_query_device; + dev->ibdev.query_port = c4iw_query_port; + dev->ibdev.query_pkey = c4iw_query_pkey; + dev->ibdev.query_gid = c4iw_query_gid; + dev->ibdev.alloc_ucontext = c4iw_alloc_ucontext; + dev->ibdev.dealloc_ucontext = c4iw_dealloc_ucontext; + dev->ibdev.mmap = c4iw_mmap; + dev->ibdev.alloc_pd = c4iw_allocate_pd; + dev->ibdev.dealloc_pd = c4iw_deallocate_pd; + dev->ibdev.create_ah = c4iw_ah_create; + dev->ibdev.destroy_ah = c4iw_ah_destroy; + dev->ibdev.create_qp = c4iw_create_qp; + dev->ibdev.modify_qp = c4iw_ib_modify_qp; + dev->ibdev.query_qp = c4iw_ib_query_qp; + dev->ibdev.destroy_qp = c4iw_destroy_qp; + dev->ibdev.create_cq = c4iw_create_cq; + dev->ibdev.destroy_cq = c4iw_destroy_cq; + dev->ibdev.resize_cq = c4iw_resize_cq; + dev->ibdev.poll_cq = c4iw_poll_cq; + dev->ibdev.get_dma_mr = c4iw_get_dma_mr; + dev->ibdev.reg_phys_mr = c4iw_register_phys_mem; + dev->ibdev.rereg_phys_mr = c4iw_reregister_phys_mem; + dev->ibdev.reg_user_mr = c4iw_reg_user_mr; + dev->ibdev.dereg_mr = c4iw_dereg_mr; + dev->ibdev.alloc_mw = c4iw_alloc_mw; + dev->ibdev.bind_mw = c4iw_bind_mw; + dev->ibdev.dealloc_mw = c4iw_dealloc_mw; + dev->ibdev.alloc_fast_reg_mr = c4iw_alloc_fast_reg_mr; + dev->ibdev.alloc_fast_reg_page_list = c4iw_alloc_fastreg_pbl; + dev->ibdev.free_fast_reg_page_list = c4iw_free_fastreg_pbl; + dev->ibdev.attach_mcast = c4iw_multicast_attach; + dev->ibdev.detach_mcast = c4iw_multicast_detach; + dev->ibdev.process_mad = c4iw_process_mad; + dev->ibdev.req_notify_cq = c4iw_arm_cq; + dev->ibdev.post_send = c4iw_post_send; + dev->ibdev.post_recv = c4iw_post_receive; + dev->ibdev.get_protocol_stats = c4iw_get_mib; + dev->ibdev.uverbs_abi_ver = C4IW_UVERBS_ABI_VERSION; + + dev->ibdev.iwcm = kmalloc(sizeof(struct iw_cm_verbs), GFP_KERNEL); + if (!dev->ibdev.iwcm) + return -ENOMEM; + + dev->ibdev.iwcm->connect = c4iw_connect; + dev->ibdev.iwcm->accept = c4iw_accept_cr; + dev->ibdev.iwcm->reject = c4iw_reject_cr; + dev->ibdev.iwcm->create_listen = c4iw_create_listen; + dev->ibdev.iwcm->destroy_listen = c4iw_destroy_listen; + dev->ibdev.iwcm->add_ref = c4iw_qp_add_ref; + dev->ibdev.iwcm->rem_ref = c4iw_qp_rem_ref; + dev->ibdev.iwcm->get_qp = c4iw_get_qp; + + ret = ib_register_device(&dev->ibdev, NULL); + if (ret) + goto bail1; + + for (i = 0; i < ARRAY_SIZE(c4iw_class_attributes); ++i) { + ret = device_create_file(&dev->ibdev.dev, + c4iw_class_attributes[i]); + if (ret) + goto bail2; + } + return 0; +bail2: + ib_unregister_device(&dev->ibdev); +bail1: + kfree(dev->ibdev.iwcm); + return ret; +} + +void c4iw_unregister_device(struct c4iw_dev *dev) +{ + int i; + + PDBG("%s c4iw_dev %p\n", __func__, dev); + for (i = 0; i < ARRAY_SIZE(c4iw_class_attributes); ++i) + device_remove_file(&dev->ibdev.dev, + c4iw_class_attributes[i]); + ib_unregister_device(&dev->ibdev); + kfree(dev->ibdev.iwcm); + return; +} diff --git a/drivers/infiniband/hw/cxgb4/qp.c b/drivers/infiniband/hw/cxgb4/qp.c new file mode 100644 index 0000000..41cd688 --- /dev/null +++ b/drivers/infiniband/hw/cxgb4/qp.c @@ -0,0 +1,1886 @@ +/* + * Copyright (c) 2009-2010 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include + +#include "iw_cxgb4.h" + +static int db_delay_usecs = 1; +module_param(db_delay_usecs, int, 0644); +MODULE_PARM_DESC(db_delay_usecs, "Usecs to delay awaiting db fifo to drain"); + +static int ocqp_support = 1; +module_param(ocqp_support, int, 0644); +MODULE_PARM_DESC(ocqp_support, "Support on-chip SQs (default=1)"); + +int db_fc_threshold = 1000; +module_param(db_fc_threshold, int, 0644); +MODULE_PARM_DESC(db_fc_threshold, + "QP count/threshold that triggers" + " automatic db flow control mode (default = 1000)"); + +int db_coalescing_threshold; +module_param(db_coalescing_threshold, int, 0644); +MODULE_PARM_DESC(db_coalescing_threshold, + "QP count/threshold that triggers" + " disabling db coalescing (default = 0)"); + +static int max_fr_immd = T4_MAX_FR_IMMD; +module_param(max_fr_immd, int, 0644); +MODULE_PARM_DESC(max_fr_immd, "fastreg threshold for using DSGL instead of immedate"); + +static int alloc_ird(struct c4iw_dev *dev, u32 ird) +{ + int ret = 0; + + spin_lock_irq(&dev->lock); + if (ird <= dev->avail_ird) + dev->avail_ird -= ird; + else + ret = -ENOMEM; + spin_unlock_irq(&dev->lock); + + if (ret) + dev_warn(&dev->rdev.lldi.pdev->dev, + "device IRD resources exhausted\n"); + + return ret; +} + +static void free_ird(struct c4iw_dev *dev, int ird) +{ + spin_lock_irq(&dev->lock); + dev->avail_ird += ird; + spin_unlock_irq(&dev->lock); +} + +static void set_state(struct c4iw_qp *qhp, enum c4iw_qp_state state) +{ + unsigned long flag; + spin_lock_irqsave(&qhp->lock, flag); + qhp->attr.state = state; + spin_unlock_irqrestore(&qhp->lock, flag); +} + +static void dealloc_oc_sq(struct c4iw_rdev *rdev, struct t4_sq *sq) +{ + c4iw_ocqp_pool_free(rdev, sq->dma_addr, sq->memsize); +} + +static void dealloc_host_sq(struct c4iw_rdev *rdev, struct t4_sq *sq) +{ + dma_free_coherent(&(rdev->lldi.pdev->dev), sq->memsize, sq->queue, + pci_unmap_addr(sq, mapping)); +} + +static void dealloc_sq(struct c4iw_rdev *rdev, struct t4_sq *sq) +{ + if (t4_sq_onchip(sq)) + dealloc_oc_sq(rdev, sq); + else + dealloc_host_sq(rdev, sq); +} + +static int alloc_oc_sq(struct c4iw_rdev *rdev, struct t4_sq *sq) +{ + if (!ocqp_support || !ocqp_supported(&rdev->lldi)) + return -ENOSYS; + sq->dma_addr = c4iw_ocqp_pool_alloc(rdev, sq->memsize); + if (!sq->dma_addr) + return -ENOMEM; + sq->phys_addr = rdev->oc_mw_pa + sq->dma_addr - + rdev->lldi.vr->ocq.start; + sq->queue = (__force union t4_wr *)(rdev->oc_mw_kva + sq->dma_addr - + rdev->lldi.vr->ocq.start); + sq->flags |= T4_SQ_ONCHIP; + return 0; +} + +static int alloc_host_sq(struct c4iw_rdev *rdev, struct t4_sq *sq) +{ + sq->queue = dma_alloc_coherent(&(rdev->lldi.pdev->dev), sq->memsize, + &(sq->dma_addr), GFP_KERNEL); + if (!sq->queue) + return -ENOMEM; + sq->phys_addr = virt_to_phys(sq->queue); + pci_unmap_addr_set(sq, mapping, sq->dma_addr); + return 0; +} + +static int alloc_sq(struct c4iw_rdev *rdev, struct t4_sq *sq, int user) +{ + int ret = -ENOSYS; + if (user) + ret = alloc_oc_sq(rdev, sq); + if (ret) + ret = alloc_host_sq(rdev, sq); + return ret; +} + +static int destroy_qp(struct c4iw_rdev *rdev, struct t4_wq *wq, + struct c4iw_dev_ucontext *uctx) +{ + /* + * uP clears EQ contexts when the connection exits rdma mode, + * so no need to post a RESET WR for these EQs. + */ + dma_free_coherent(&(rdev->lldi.pdev->dev), + wq->rq.memsize, wq->rq.queue, + dma_unmap_addr(&wq->rq, mapping)); + dealloc_sq(rdev, &wq->sq); + c4iw_rqtpool_free(rdev, wq->rq.rqt_hwaddr, wq->rq.rqt_size); + kfree(wq->rq.sw_rq); + kfree(wq->sq.sw_sq); + c4iw_put_qpid(rdev, wq->rq.qid, uctx); + c4iw_put_qpid(rdev, wq->sq.qid, uctx); + return 0; +} + +static int create_qp(struct c4iw_rdev *rdev, struct t4_wq *wq, + struct t4_cq *rcq, struct t4_cq *scq, + struct c4iw_dev_ucontext *uctx) +{ + int user = (uctx != &rdev->uctx); + struct fw_ri_res_wr *res_wr; + struct fw_ri_res *res; + int wr_len; + struct c4iw_wr_wait wr_wait; + struct sk_buff *skb; + int ret = 0; + int eqsize; + + wq->sq.qid = c4iw_get_qpid(rdev, uctx); + if (!wq->sq.qid) + return -ENOMEM; + + wq->rq.qid = c4iw_get_qpid(rdev, uctx); + if (!wq->rq.qid) { + ret = -ENOMEM; + goto free_sq_qid; + } + + if (!user) { + wq->sq.sw_sq = kzalloc(wq->sq.size * sizeof *wq->sq.sw_sq, + GFP_KERNEL); + if (!wq->sq.sw_sq) { + ret = -ENOMEM; + goto free_rq_qid; + } + + wq->rq.sw_rq = kzalloc(wq->rq.size * sizeof *wq->rq.sw_rq, + GFP_KERNEL); + if (!wq->rq.sw_rq) { + ret = -ENOMEM; + goto free_sw_sq; + } + } + + /* + * RQT must be a power of 2 and at least 16 deep. + */ + wq->rq.rqt_size = roundup_pow_of_two(max_t(u16, wq->rq.size, 16)); + wq->rq.rqt_hwaddr = c4iw_rqtpool_alloc(rdev, wq->rq.rqt_size); + if (!wq->rq.rqt_hwaddr) { + ret = -ENOMEM; + goto free_sw_rq; + } + + ret = alloc_sq(rdev, &wq->sq, user); + if (ret) + goto free_hwaddr; + memset(wq->sq.queue, 0, wq->sq.memsize); + dma_unmap_addr_set(&wq->sq, mapping, wq->sq.dma_addr); + + wq->rq.queue = dma_alloc_coherent(&(rdev->lldi.pdev->dev), + wq->rq.memsize, &(wq->rq.dma_addr), + GFP_KERNEL); + if (!wq->rq.queue) { + ret = -ENOMEM; + goto free_sq; + } + PDBG("%s sq base va 0x%p pa 0x%llx rq base va 0x%p pa 0x%llx\n", + __func__, wq->sq.queue, + (unsigned long long)virt_to_phys(wq->sq.queue), + wq->rq.queue, + (unsigned long long)virt_to_phys(wq->rq.queue)); + memset(wq->rq.queue, 0, wq->rq.memsize); + dma_unmap_addr_set(&wq->rq, mapping, wq->rq.dma_addr); + + wq->db = rdev->lldi.db_reg; + wq->gts = rdev->lldi.gts_reg; + if (user || is_t5(rdev->lldi.adapter_type)) { + u32 off; + + off = (wq->sq.qid << rdev->qpshift) & PAGE_MASK; + if (user) { + wq->sq.udb = (u64 __iomem *)(rdev->bar2_pa + off); + } else { + off += 128 * (wq->sq.qid & rdev->qpmask) + 8; + wq->sq.udb = (u64 __iomem *)(rdev->bar2_kva + off); + } + off = (wq->rq.qid << rdev->qpshift) & PAGE_MASK; + if (user) { + wq->rq.udb = (u64 __iomem *)(rdev->bar2_pa + off); + } else { + off += 128 * (wq->rq.qid & rdev->qpmask) + 8; + wq->rq.udb = (u64 __iomem *)(rdev->bar2_kva + off); + } + } + wq->rdev = rdev; + wq->rq.msn = 1; + + /* build fw_ri_res_wr */ + wr_len = sizeof *res_wr + 2 * sizeof *res; + + skb = alloc_skb(wr_len, GFP_KERNEL); + if (!skb) { + ret = -ENOMEM; + goto free_dma; + } + set_wr_txq(skb, CPL_PRIORITY_CONTROL, 0); + + res_wr = (struct fw_ri_res_wr *)__skb_put(skb, wr_len); + memset(res_wr, 0, wr_len); + res_wr->op_nres = cpu_to_be32( + FW_WR_OP(FW_RI_RES_WR) | + V_FW_RI_RES_WR_NRES(2) | + FW_WR_COMPL(1)); + res_wr->len16_pkd = cpu_to_be32(DIV_ROUND_UP(wr_len, 16)); + res_wr->cookie = (unsigned long) &wr_wait; + res = res_wr->res; + res->u.sqrq.restype = FW_RI_RES_TYPE_SQ; + res->u.sqrq.op = FW_RI_RES_OP_WRITE; + + /* + * eqsize is the number of 64B entries plus the status page size. + */ + eqsize = wq->sq.size * T4_SQ_NUM_SLOTS + + rdev->hw_queue.t4_eq_status_entries; + + res->u.sqrq.fetchszm_to_iqid = cpu_to_be32( + V_FW_RI_RES_WR_HOSTFCMODE(0) | /* no host cidx updates */ + V_FW_RI_RES_WR_CPRIO(0) | /* don't keep in chip cache */ + V_FW_RI_RES_WR_PCIECHN(0) | /* set by uP at ri_init time */ + (t4_sq_onchip(&wq->sq) ? F_FW_RI_RES_WR_ONCHIP : 0) | + V_FW_RI_RES_WR_IQID(scq->cqid)); + res->u.sqrq.dcaen_to_eqsize = cpu_to_be32( + V_FW_RI_RES_WR_DCAEN(0) | + V_FW_RI_RES_WR_DCACPU(0) | + V_FW_RI_RES_WR_FBMIN(2) | + V_FW_RI_RES_WR_FBMAX(2) | + V_FW_RI_RES_WR_CIDXFTHRESHO(0) | + V_FW_RI_RES_WR_CIDXFTHRESH(0) | + V_FW_RI_RES_WR_EQSIZE(eqsize)); + res->u.sqrq.eqid = cpu_to_be32(wq->sq.qid); + res->u.sqrq.eqaddr = cpu_to_be64(wq->sq.dma_addr); + res++; + res->u.sqrq.restype = FW_RI_RES_TYPE_RQ; + res->u.sqrq.op = FW_RI_RES_OP_WRITE; + + /* + * eqsize is the number of 64B entries plus the status page size. + */ + eqsize = wq->rq.size * T4_RQ_NUM_SLOTS + + rdev->hw_queue.t4_eq_status_entries; + res->u.sqrq.fetchszm_to_iqid = cpu_to_be32( + V_FW_RI_RES_WR_HOSTFCMODE(0) | /* no host cidx updates */ + V_FW_RI_RES_WR_CPRIO(0) | /* don't keep in chip cache */ + V_FW_RI_RES_WR_PCIECHN(0) | /* set by uP at ri_init time */ + V_FW_RI_RES_WR_IQID(rcq->cqid)); + res->u.sqrq.dcaen_to_eqsize = cpu_to_be32( + V_FW_RI_RES_WR_DCAEN(0) | + V_FW_RI_RES_WR_DCACPU(0) | + V_FW_RI_RES_WR_FBMIN(2) | + V_FW_RI_RES_WR_FBMAX(2) | + V_FW_RI_RES_WR_CIDXFTHRESHO(0) | + V_FW_RI_RES_WR_CIDXFTHRESH(0) | + V_FW_RI_RES_WR_EQSIZE(eqsize)); + res->u.sqrq.eqid = cpu_to_be32(wq->rq.qid); + res->u.sqrq.eqaddr = cpu_to_be64(wq->rq.dma_addr); + + c4iw_init_wr_wait(&wr_wait); + + ret = c4iw_ofld_send(rdev, skb); + if (ret) + goto free_dma; + ret = c4iw_wait_for_reply(rdev, &wr_wait, 0, wq->sq.qid, __func__); + if (ret) + goto free_dma; + + PDBG("%s sqid 0x%x rqid 0x%x kdb 0x%p squdb 0x%lx rqudb 0x%lx\n", + __func__, wq->sq.qid, wq->rq.qid, wq->db, + (__force unsigned long) wq->sq.udb, + (__force unsigned long) wq->rq.udb); + + return 0; +free_dma: + dma_free_coherent(&(rdev->lldi.pdev->dev), + wq->rq.memsize, wq->rq.queue, + dma_unmap_addr(&wq->rq, mapping)); +free_sq: + dealloc_sq(rdev, &wq->sq); +free_hwaddr: + c4iw_rqtpool_free(rdev, wq->rq.rqt_hwaddr, wq->rq.rqt_size); +free_sw_rq: + kfree(wq->rq.sw_rq); +free_sw_sq: + kfree(wq->sq.sw_sq); +free_rq_qid: + c4iw_put_qpid(rdev, wq->rq.qid, uctx); +free_sq_qid: + c4iw_put_qpid(rdev, wq->sq.qid, uctx); + return ret; +} + +static int build_immd(struct t4_sq *sq, struct fw_ri_immd *immdp, + struct ib_send_wr *wr, int max, u32 *plenp) +{ + u8 *dstp, *srcp; + u32 plen = 0; + int i; + int rem, len; + + dstp = (u8 *)immdp->data; + for (i = 0; i < wr->num_sge; i++) { + if ((plen + wr->sg_list[i].length) > max) + return -EMSGSIZE; + srcp = (u8 *)(unsigned long)wr->sg_list[i].addr; + plen += wr->sg_list[i].length; + rem = wr->sg_list[i].length; + while (rem) { + if (dstp == (u8 *)&sq->queue[sq->size]) + dstp = (u8 *)sq->queue; + if (rem <= (u8 *)&sq->queue[sq->size] - dstp) + len = rem; + else + len = (u8 *)&sq->queue[sq->size] - dstp; + memcpy(dstp, srcp, len); + dstp += len; + srcp += len; + rem -= len; + } + } + len = roundup(plen + sizeof *immdp, 16) - (plen + sizeof *immdp); + if (len) + memset(dstp, 0, len); + immdp->op = FW_RI_DATA_IMMD; + immdp->r1 = 0; + immdp->r2 = 0; + immdp->immdlen = cpu_to_be32(plen); + *plenp = plen; + return 0; +} + +static int build_isgl(__be64 *queue_start, __be64 *queue_end, + struct fw_ri_isgl *isglp, struct ib_sge *sg_list, + int num_sge, u32 *plenp) + +{ + int i; + u32 plen = 0; + __be64 *flitp = (__be64 *)isglp->sge; + + for (i = 0; i < num_sge; i++) { + if ((plen + sg_list[i].length) < plen) + return -EMSGSIZE; + plen += sg_list[i].length; + *flitp = cpu_to_be64(((u64)sg_list[i].lkey << 32) | + sg_list[i].length); + if (++flitp == queue_end) + flitp = queue_start; + *flitp = cpu_to_be64(sg_list[i].addr); + if (++flitp == queue_end) + flitp = queue_start; + } + *flitp = (__force __be64)0; + isglp->op = FW_RI_DATA_ISGL; + isglp->r1 = 0; + isglp->nsge = cpu_to_be16(num_sge); + isglp->r2 = 0; + if (plenp) + *plenp = plen; + return 0; +} + +static int build_rdma_send(struct t4_sq *sq, union t4_wr *wqe, + struct ib_send_wr *wr, u8 *len16) +{ + u32 plen; + int size; + int ret; + + if (wr->num_sge > T4_MAX_SEND_SGE) + return -EINVAL; + switch (wr->opcode) { + case IB_WR_SEND: + if (wr->send_flags & IB_SEND_SOLICITED) + wqe->send.sendop_pkd = cpu_to_be32( + V_FW_RI_SEND_WR_SENDOP(FW_RI_SEND_WITH_SE)); + else + wqe->send.sendop_pkd = cpu_to_be32( + V_FW_RI_SEND_WR_SENDOP(FW_RI_SEND)); + wqe->send.stag_inv = 0; + break; + case IB_WR_SEND_WITH_INV: + if (wr->send_flags & IB_SEND_SOLICITED) + wqe->send.sendop_pkd = cpu_to_be32( + V_FW_RI_SEND_WR_SENDOP(FW_RI_SEND_WITH_SE_INV)); + else + wqe->send.sendop_pkd = cpu_to_be32( + V_FW_RI_SEND_WR_SENDOP(FW_RI_SEND_WITH_INV)); + wqe->send.stag_inv = cpu_to_be32(wr->ex.invalidate_rkey); + break; + + default: + return -EINVAL; + } + wqe->send.r3 = 0; + wqe->send.r4 = 0; + + plen = 0; + if (wr->num_sge) { + if (wr->send_flags & IB_SEND_INLINE) { + ret = build_immd(sq, wqe->send.u.immd_src, wr, + T4_MAX_SEND_INLINE, &plen); + if (ret) + return ret; + size = sizeof wqe->send + sizeof(struct fw_ri_immd) + + plen; + } else { + ret = build_isgl((__be64 *)sq->queue, + (__be64 *)&sq->queue[sq->size], + wqe->send.u.isgl_src, + wr->sg_list, wr->num_sge, &plen); + if (ret) + return ret; + size = sizeof wqe->send + sizeof(struct fw_ri_isgl) + + wr->num_sge * sizeof(struct fw_ri_sge); + } + } else { + wqe->send.u.immd_src[0].op = FW_RI_DATA_IMMD; + wqe->send.u.immd_src[0].r1 = 0; + wqe->send.u.immd_src[0].r2 = 0; + wqe->send.u.immd_src[0].immdlen = 0; + size = sizeof wqe->send + sizeof(struct fw_ri_immd); + plen = 0; + } + *len16 = DIV_ROUND_UP(size, 16); + wqe->send.plen = cpu_to_be32(plen); + return 0; +} + +static int build_rdma_write(struct t4_sq *sq, union t4_wr *wqe, + struct ib_send_wr *wr, u8 *len16) +{ + u32 plen; + int size; + int ret; + + if (wr->num_sge > T4_MAX_SEND_SGE) + return -EINVAL; + wqe->write.r2 = 0; + wqe->write.stag_sink = cpu_to_be32(wr->wr.rdma.rkey); + wqe->write.to_sink = cpu_to_be64(wr->wr.rdma.remote_addr); + if (wr->num_sge) { + if (wr->send_flags & IB_SEND_INLINE) { + ret = build_immd(sq, wqe->write.u.immd_src, wr, + T4_MAX_WRITE_INLINE, &plen); + if (ret) + return ret; + size = sizeof wqe->write + sizeof(struct fw_ri_immd) + + plen; + } else { + ret = build_isgl((__be64 *)sq->queue, + (__be64 *)&sq->queue[sq->size], + wqe->write.u.isgl_src, + wr->sg_list, wr->num_sge, &plen); + if (ret) + return ret; + size = sizeof wqe->write + sizeof(struct fw_ri_isgl) + + wr->num_sge * sizeof(struct fw_ri_sge); + } + } else { + wqe->write.u.immd_src[0].op = FW_RI_DATA_IMMD; + wqe->write.u.immd_src[0].r1 = 0; + wqe->write.u.immd_src[0].r2 = 0; + wqe->write.u.immd_src[0].immdlen = 0; + size = sizeof wqe->write + sizeof(struct fw_ri_immd); + plen = 0; + } + *len16 = DIV_ROUND_UP(size, 16); + wqe->write.plen = cpu_to_be32(plen); + return 0; +} + +static int build_rdma_read(union t4_wr *wqe, struct ib_send_wr *wr, u8 *len16) +{ + if (wr->num_sge > 1) + return -EINVAL; + if (wr->num_sge) { + wqe->read.stag_src = cpu_to_be32(wr->wr.rdma.rkey); + wqe->read.to_src_hi = cpu_to_be32((u32)(wr->wr.rdma.remote_addr + >> 32)); + wqe->read.to_src_lo = cpu_to_be32((u32)wr->wr.rdma.remote_addr); + wqe->read.stag_sink = cpu_to_be32(wr->sg_list[0].lkey); + wqe->read.plen = cpu_to_be32(wr->sg_list[0].length); + wqe->read.to_sink_hi = cpu_to_be32((u32)(wr->sg_list[0].addr + >> 32)); + wqe->read.to_sink_lo = cpu_to_be32((u32)(wr->sg_list[0].addr)); + } else { + wqe->read.stag_src = cpu_to_be32(2); + wqe->read.to_src_hi = 0; + wqe->read.to_src_lo = 0; + wqe->read.stag_sink = cpu_to_be32(2); + wqe->read.plen = 0; + wqe->read.to_sink_hi = 0; + wqe->read.to_sink_lo = 0; + } + wqe->read.r2 = 0; + wqe->read.r5 = 0; + *len16 = DIV_ROUND_UP(sizeof wqe->read, 16); + return 0; +} + +static int build_rdma_recv(struct c4iw_qp *qhp, union t4_recv_wr *wqe, + struct ib_recv_wr *wr, u8 *len16) +{ + int ret; + + ret = build_isgl((__be64 *)qhp->wq.rq.queue, + (__be64 *)&qhp->wq.rq.queue[qhp->wq.rq.size], + &wqe->recv.isgl, wr->sg_list, wr->num_sge, NULL); + if (ret) + return ret; + *len16 = DIV_ROUND_UP(sizeof wqe->recv + + wr->num_sge * sizeof(struct fw_ri_sge), 16); + return 0; +} + +static int build_fastreg(struct t4_sq *sq, union t4_wr *wqe, + struct ib_send_wr *wr, u8 *len16, u8 t5dev) +{ + + struct fw_ri_immd *imdp; + __be64 *p; + int i; + int pbllen = roundup(wr->wr.fast_reg.page_list_len * sizeof(u64), 32); + int rem; + + if (wr->wr.fast_reg.page_list_len > + t4_max_fr_depth(use_dsgl)) + return -EINVAL; + + wqe->fr.qpbinde_to_dcacpu = 0; + wqe->fr.pgsz_shift = wr->wr.fast_reg.page_shift - 12; + wqe->fr.addr_type = FW_RI_VA_BASED_TO; + wqe->fr.mem_perms = c4iw_ib_to_tpt_access(wr->wr.fast_reg.access_flags); + wqe->fr.len_hi = 0; + wqe->fr.len_lo = cpu_to_be32(wr->wr.fast_reg.length); + wqe->fr.stag = cpu_to_be32(wr->wr.fast_reg.rkey); + wqe->fr.va_hi = cpu_to_be32(wr->wr.fast_reg.iova_start >> 32); + wqe->fr.va_lo_fbo = cpu_to_be32(wr->wr.fast_reg.iova_start & + 0xffffffff); + + if (t5dev && use_dsgl && (pbllen > max_fr_immd)) { + struct c4iw_fr_page_list *c4pl = + to_c4iw_fr_page_list(wr->wr.fast_reg.page_list); + struct fw_ri_dsgl *sglp; + + for (i = 0; i < wr->wr.fast_reg.page_list_len; i++) { + wr->wr.fast_reg.page_list->page_list[i] = (__force u64) + cpu_to_be64((u64) + wr->wr.fast_reg.page_list->page_list[i]); + } + + sglp = (struct fw_ri_dsgl *)(&wqe->fr + 1); + sglp->op = FW_RI_DATA_DSGL; + sglp->r1 = 0; + sglp->nsge = cpu_to_be16(1); + sglp->addr0 = cpu_to_be64(c4pl->dma_addr); + sglp->len0 = cpu_to_be32(pbllen); + + *len16 = DIV_ROUND_UP(sizeof(wqe->fr) + sizeof(*sglp), 16); + } else { + imdp = (struct fw_ri_immd *)(&wqe->fr + 1); + imdp->op = FW_RI_DATA_IMMD; + imdp->r1 = 0; + imdp->r2 = 0; + imdp->immdlen = cpu_to_be32(pbllen); + p = (__be64 *)(imdp + 1); + rem = pbllen; + for (i = 0; i < wr->wr.fast_reg.page_list_len; i++) { + *p = cpu_to_be64( + (u64)wr->wr.fast_reg.page_list->page_list[i]); + rem -= sizeof(*p); + if (++p == (__be64 *)&sq->queue[sq->size]) + p = (__be64 *)sq->queue; + } + BUG_ON(rem < 0); + while (rem) { + *p = 0; + rem -= sizeof(*p); + if (++p == (__be64 *)&sq->queue[sq->size]) + p = (__be64 *)sq->queue; + } + *len16 = DIV_ROUND_UP(sizeof(wqe->fr) + sizeof(*imdp) + + pbllen, 16); + } + return 0; +} + +static int build_inv_stag(union t4_wr *wqe, struct ib_send_wr *wr, + u8 *len16) +{ + wqe->inv.stag_inv = cpu_to_be32(wr->ex.invalidate_rkey); + wqe->inv.r2 = 0; + *len16 = DIV_ROUND_UP(sizeof wqe->inv, 16); + return 0; +} + +void c4iw_qp_add_ref(struct ib_qp *qp) +{ + PDBG("%s ib_qp %p\n", __func__, qp); + atomic_inc(&(to_c4iw_qp(qp)->refcnt)); +} + +void c4iw_qp_rem_ref(struct ib_qp *qp) +{ + PDBG("%s ib_qp %p\n", __func__, qp); + if (atomic_dec_and_test(&(to_c4iw_qp(qp)->refcnt))) + wake_up(&(to_c4iw_qp(qp)->wait)); +} + +static void add_to_fc_list(struct list_head *head, struct list_head *entry) +{ + if (list_empty(entry)) + list_add_tail(entry, head); +} + +static int ring_kernel_sq_db(struct c4iw_qp *qhp, u16 inc) +{ + unsigned long flags; + + spin_lock_irqsave(&qhp->rhp->lock, flags); + spin_lock(&qhp->lock); + if (qhp->rhp->db_state == NORMAL) + t4_ring_sq_db(&qhp->wq, inc, + is_t5(qhp->rhp->rdev.lldi.adapter_type), NULL); + else { + add_to_fc_list(&qhp->rhp->db_fc_list, &qhp->db_fc_entry); + qhp->wq.sq.wq_pidx_inc += inc; + } + spin_unlock(&qhp->lock); + spin_unlock_irqrestore(&qhp->rhp->lock, flags); + return 0; +} + +static int ring_kernel_rq_db(struct c4iw_qp *qhp, u16 inc) +{ + unsigned long flags; + + spin_lock_irqsave(&qhp->rhp->lock, flags); + spin_lock(&qhp->lock); + if (qhp->rhp->db_state == NORMAL) + t4_ring_rq_db(&qhp->wq, inc, + is_t5(qhp->rhp->rdev.lldi.adapter_type), NULL); + else { + add_to_fc_list(&qhp->rhp->db_fc_list, &qhp->db_fc_entry); + qhp->wq.rq.wq_pidx_inc += inc; + } + spin_unlock(&qhp->lock); + spin_unlock_irqrestore(&qhp->rhp->lock, flags); + return 0; +} + +int c4iw_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, + struct ib_send_wr **bad_wr) +{ + int err = 0; + u8 len16 = 0; + enum fw_wr_opcodes fw_opcode = 0; + enum fw_ri_wr_flags fw_flags; + struct c4iw_qp *qhp; + union t4_wr *wqe = NULL; + u32 num_wrs; + struct t4_swsqe *swsqe; + unsigned long flag; + u16 idx = 0; + + qhp = to_c4iw_qp(ibqp); + spin_lock_irqsave(&qhp->lock, flag); + if (t4_wq_in_error(&qhp->wq)) { + spin_unlock_irqrestore(&qhp->lock, flag); + return -EINVAL; + } + num_wrs = t4_sq_avail(&qhp->wq); + if (num_wrs == 0) { + spin_unlock_irqrestore(&qhp->lock, flag); + return -ENOMEM; + } + while (wr) { + if (num_wrs == 0) { + err = -ENOMEM; + *bad_wr = wr; + break; + } + wqe = (union t4_wr *)((u8 *)qhp->wq.sq.queue + + qhp->wq.sq.wq_pidx * T4_EQ_ENTRY_SIZE); + + fw_flags = 0; + if (wr->send_flags & IB_SEND_SOLICITED) + fw_flags |= FW_RI_SOLICITED_EVENT_FLAG; + if (wr->send_flags & IB_SEND_SIGNALED || qhp->sq_sig_all) + fw_flags |= FW_RI_COMPLETION_FLAG; + swsqe = &qhp->wq.sq.sw_sq[qhp->wq.sq.pidx]; + switch (wr->opcode) { + case IB_WR_SEND_WITH_INV: + case IB_WR_SEND: + if (wr->send_flags & IB_SEND_FENCE) + fw_flags |= FW_RI_READ_FENCE_FLAG; + fw_opcode = FW_RI_SEND_WR; + if (wr->opcode == IB_WR_SEND) + swsqe->opcode = FW_RI_SEND; + else + swsqe->opcode = FW_RI_SEND_WITH_INV; + err = build_rdma_send(&qhp->wq.sq, wqe, wr, &len16); + break; + case IB_WR_RDMA_WRITE: + fw_opcode = FW_RI_RDMA_WRITE_WR; + swsqe->opcode = FW_RI_RDMA_WRITE; + err = build_rdma_write(&qhp->wq.sq, wqe, wr, &len16); + break; + case IB_WR_RDMA_READ: + case IB_WR_RDMA_READ_WITH_INV: + fw_opcode = FW_RI_RDMA_READ_WR; + swsqe->opcode = FW_RI_READ_REQ; + if (wr->opcode == IB_WR_RDMA_READ_WITH_INV) + fw_flags = FW_RI_RDMA_READ_INVALIDATE; + else + fw_flags = 0; + err = build_rdma_read(wqe, wr, &len16); + if (err) + break; + swsqe->read_len = wr->sg_list[0].length; + if (!qhp->wq.sq.oldest_read) + qhp->wq.sq.oldest_read = swsqe; + break; + case IB_WR_FAST_REG_MR: + fw_opcode = FW_RI_FR_NSMR_WR; + swsqe->opcode = FW_RI_FAST_REGISTER; + err = build_fastreg(&qhp->wq.sq, wqe, wr, &len16, + is_t5( + qhp->rhp->rdev.lldi.adapter_type) ? + 1 : 0); + break; + case IB_WR_LOCAL_INV: + if (wr->send_flags & IB_SEND_FENCE) + fw_flags |= FW_RI_LOCAL_FENCE_FLAG; + fw_opcode = FW_RI_INV_LSTAG_WR; + swsqe->opcode = FW_RI_LOCAL_INV; + err = build_inv_stag(wqe, wr, &len16); + break; + default: + PDBG("%s post of type=%d TBD!\n", __func__, + wr->opcode); + err = -EINVAL; + } + if (err) { + *bad_wr = wr; + break; + } + swsqe->idx = qhp->wq.sq.pidx; + swsqe->complete = 0; + swsqe->signaled = (wr->send_flags & IB_SEND_SIGNALED) || + qhp->sq_sig_all; + swsqe->flushed = 0; + swsqe->wr_id = wr->wr_id; + if (c4iw_wr_log) { + swsqe->sge_ts = cxgb4_read_sge_timestamp( + qhp->rhp->rdev.lldi.ports[0]); + getnstimeofday(&swsqe->host_ts); + } + + init_wr_hdr(wqe, qhp->wq.sq.pidx, fw_opcode, fw_flags, len16); + + PDBG("%s cookie 0x%llx pidx 0x%x opcode 0x%x read_len %u\n", + __func__, (unsigned long long)wr->wr_id, qhp->wq.sq.pidx, + swsqe->opcode, swsqe->read_len); + wr = wr->next; + num_wrs--; + t4_sq_produce(&qhp->wq, len16); + idx += DIV_ROUND_UP(len16*16, T4_EQ_ENTRY_SIZE); + } + if (!qhp->rhp->rdev.status_page->db_off) { + t4_ring_sq_db(&qhp->wq, idx, + is_t5(qhp->rhp->rdev.lldi.adapter_type), wqe); + spin_unlock_irqrestore(&qhp->lock, flag); + } else { + spin_unlock_irqrestore(&qhp->lock, flag); + ring_kernel_sq_db(qhp, idx); + } + return err; +} + +int c4iw_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr) +{ + int err = 0; + struct c4iw_qp *qhp; + union t4_recv_wr *wqe = NULL; + u32 num_wrs; + u8 len16 = 0; + unsigned long flag; + u16 idx = 0; + + qhp = to_c4iw_qp(ibqp); + spin_lock_irqsave(&qhp->lock, flag); + if (t4_wq_in_error(&qhp->wq)) { + spin_unlock_irqrestore(&qhp->lock, flag); + return -EINVAL; + } + num_wrs = t4_rq_avail(&qhp->wq); + if (num_wrs == 0) { + spin_unlock_irqrestore(&qhp->lock, flag); + return -ENOMEM; + } + while (wr) { + if (wr->num_sge > T4_MAX_RECV_SGE) { + err = -EINVAL; + *bad_wr = wr; + break; + } + wqe = (union t4_recv_wr *)((u8 *)qhp->wq.rq.queue + + qhp->wq.rq.wq_pidx * + T4_EQ_ENTRY_SIZE); + if (num_wrs) + err = build_rdma_recv(qhp, wqe, wr, &len16); + else + err = -ENOMEM; + if (err) { + *bad_wr = wr; + break; + } + + qhp->wq.rq.sw_rq[qhp->wq.rq.pidx].wr_id = wr->wr_id; + if (c4iw_wr_log) { + qhp->wq.rq.sw_rq[qhp->wq.rq.pidx].sge_ts = + cxgb4_read_sge_timestamp( + qhp->rhp->rdev.lldi.ports[0]); + getnstimeofday( + &qhp->wq.rq.sw_rq[qhp->wq.rq.pidx].host_ts); + } + + wqe->recv.opcode = FW_RI_RECV_WR; + wqe->recv.r1 = 0; + wqe->recv.wrid = qhp->wq.rq.pidx; + wqe->recv.r2[0] = 0; + wqe->recv.r2[1] = 0; + wqe->recv.r2[2] = 0; + wqe->recv.len16 = len16; + PDBG("%s cookie 0x%llx pidx %u\n", __func__, + (unsigned long long) wr->wr_id, qhp->wq.rq.pidx); + t4_rq_produce(&qhp->wq, len16); + idx += DIV_ROUND_UP(len16*16, T4_EQ_ENTRY_SIZE); + wr = wr->next; + num_wrs--; + } + if (!qhp->rhp->rdev.status_page->db_off) { + t4_ring_rq_db(&qhp->wq, idx, + is_t5(qhp->rhp->rdev.lldi.adapter_type), wqe); + spin_unlock_irqrestore(&qhp->lock, flag); + } else { + spin_unlock_irqrestore(&qhp->lock, flag); + ring_kernel_rq_db(qhp, idx); + } + return err; +} + +int c4iw_bind_mw(struct ib_qp *qp, struct ib_mw *mw, struct ib_mw_bind *mw_bind) +{ + return -ENOSYS; +} + +static inline void build_term_codes(struct t4_cqe *err_cqe, u8 *layer_type, + u8 *ecode) +{ + int status; + int tagged; + int opcode; + int rqtype; + int send_inv; + + if (!err_cqe) { + *layer_type = LAYER_RDMAP|DDP_LOCAL_CATA; + *ecode = 0; + return; + } + + status = CQE_STATUS(err_cqe); + opcode = CQE_OPCODE(err_cqe); + rqtype = RQ_TYPE(err_cqe); + send_inv = (opcode == FW_RI_SEND_WITH_INV) || + (opcode == FW_RI_SEND_WITH_SE_INV); + tagged = (opcode == FW_RI_RDMA_WRITE) || + (rqtype && (opcode == FW_RI_READ_RESP)); + + switch (status) { + case T4_ERR_STAG: + if (send_inv) { + *layer_type = LAYER_RDMAP|RDMAP_REMOTE_OP; + *ecode = RDMAP_CANT_INV_STAG; + } else { + *layer_type = LAYER_RDMAP|RDMAP_REMOTE_PROT; + *ecode = RDMAP_INV_STAG; + } + break; + case T4_ERR_PDID: + *layer_type = LAYER_RDMAP|RDMAP_REMOTE_PROT; + if ((opcode == FW_RI_SEND_WITH_INV) || + (opcode == FW_RI_SEND_WITH_SE_INV)) + *ecode = RDMAP_CANT_INV_STAG; + else + *ecode = RDMAP_STAG_NOT_ASSOC; + break; + case T4_ERR_QPID: + *layer_type = LAYER_RDMAP|RDMAP_REMOTE_PROT; + *ecode = RDMAP_STAG_NOT_ASSOC; + break; + case T4_ERR_ACCESS: + *layer_type = LAYER_RDMAP|RDMAP_REMOTE_PROT; + *ecode = RDMAP_ACC_VIOL; + break; + case T4_ERR_WRAP: + *layer_type = LAYER_RDMAP|RDMAP_REMOTE_PROT; + *ecode = RDMAP_TO_WRAP; + break; + case T4_ERR_BOUND: + if (tagged) { + *layer_type = LAYER_DDP|DDP_TAGGED_ERR; + *ecode = DDPT_BASE_BOUNDS; + } else { + *layer_type = LAYER_RDMAP|RDMAP_REMOTE_PROT; + *ecode = RDMAP_BASE_BOUNDS; + } + break; + case T4_ERR_INVALIDATE_SHARED_MR: + case T4_ERR_INVALIDATE_MR_WITH_MW_BOUND: + *layer_type = LAYER_RDMAP|RDMAP_REMOTE_OP; + *ecode = RDMAP_CANT_INV_STAG; + break; + case T4_ERR_ECC: + case T4_ERR_ECC_PSTAG: + case T4_ERR_INTERNAL_ERR: + *layer_type = LAYER_RDMAP|RDMAP_LOCAL_CATA; + *ecode = 0; + break; + case T4_ERR_OUT_OF_RQE: + *layer_type = LAYER_DDP|DDP_UNTAGGED_ERR; + *ecode = DDPU_INV_MSN_NOBUF; + break; + case T4_ERR_PBL_ADDR_BOUND: + *layer_type = LAYER_DDP|DDP_TAGGED_ERR; + *ecode = DDPT_BASE_BOUNDS; + break; + case T4_ERR_CRC: + *layer_type = LAYER_MPA|DDP_LLP; + *ecode = MPA_CRC_ERR; + break; + case T4_ERR_MARKER: + *layer_type = LAYER_MPA|DDP_LLP; + *ecode = MPA_MARKER_ERR; + break; + case T4_ERR_PDU_LEN_ERR: + *layer_type = LAYER_DDP|DDP_UNTAGGED_ERR; + *ecode = DDPU_MSG_TOOBIG; + break; + case T4_ERR_DDP_VERSION: + if (tagged) { + *layer_type = LAYER_DDP|DDP_TAGGED_ERR; + *ecode = DDPT_INV_VERS; + } else { + *layer_type = LAYER_DDP|DDP_UNTAGGED_ERR; + *ecode = DDPU_INV_VERS; + } + break; + case T4_ERR_RDMA_VERSION: + *layer_type = LAYER_RDMAP|RDMAP_REMOTE_OP; + *ecode = RDMAP_INV_VERS; + break; + case T4_ERR_OPCODE: + *layer_type = LAYER_RDMAP|RDMAP_REMOTE_OP; + *ecode = RDMAP_INV_OPCODE; + break; + case T4_ERR_DDP_QUEUE_NUM: + *layer_type = LAYER_DDP|DDP_UNTAGGED_ERR; + *ecode = DDPU_INV_QN; + break; + case T4_ERR_MSN: + case T4_ERR_MSN_GAP: + case T4_ERR_MSN_RANGE: + case T4_ERR_IRD_OVERFLOW: + *layer_type = LAYER_DDP|DDP_UNTAGGED_ERR; + *ecode = DDPU_INV_MSN_RANGE; + break; + case T4_ERR_TBIT: + *layer_type = LAYER_DDP|DDP_LOCAL_CATA; + *ecode = 0; + break; + case T4_ERR_MO: + *layer_type = LAYER_DDP|DDP_UNTAGGED_ERR; + *ecode = DDPU_INV_MO; + break; + default: + *layer_type = LAYER_RDMAP|DDP_LOCAL_CATA; + *ecode = 0; + break; + } +} + +static void post_terminate(struct c4iw_qp *qhp, struct t4_cqe *err_cqe, + gfp_t gfp) +{ + struct fw_ri_wr *wqe; + struct sk_buff *skb; + struct terminate_message *term; + + PDBG("%s qhp %p qid 0x%x tid %u\n", __func__, qhp, qhp->wq.sq.qid, + qhp->ep->hwtid); + + skb = alloc_skb(sizeof *wqe, gfp); + if (!skb) + return; + set_wr_txq(skb, CPL_PRIORITY_DATA, qhp->ep->txq_idx); + + wqe = (struct fw_ri_wr *)__skb_put(skb, sizeof(*wqe)); + memset(wqe, 0, sizeof *wqe); + wqe->op_compl = cpu_to_be32(FW_WR_OP(FW_RI_INIT_WR)); + wqe->flowid_len16 = cpu_to_be32( + FW_WR_FLOWID(qhp->ep->hwtid) | + FW_WR_LEN16(DIV_ROUND_UP(sizeof *wqe, 16))); + + wqe->u.terminate.type = FW_RI_TYPE_TERMINATE; + wqe->u.terminate.immdlen = cpu_to_be32(sizeof *term); + term = (struct terminate_message *)wqe->u.terminate.termmsg; + if (qhp->attr.layer_etype == (LAYER_MPA|DDP_LLP)) { + term->layer_etype = qhp->attr.layer_etype; + term->ecode = qhp->attr.ecode; + } else + build_term_codes(err_cqe, &term->layer_etype, &term->ecode); + c4iw_ofld_send(&qhp->rhp->rdev, skb); +} + +/* + * Assumes qhp lock is held. + */ +static void __flush_qp(struct c4iw_qp *qhp, struct c4iw_cq *rchp, + struct c4iw_cq *schp) +{ + int count; + int rq_flushed, sq_flushed; + unsigned long flag; + + PDBG("%s qhp %p rchp %p schp %p\n", __func__, qhp, rchp, schp); + + /* locking hierarchy: cq lock first, then qp lock. */ + spin_lock_irqsave(&rchp->lock, flag); + spin_lock(&qhp->lock); + + if (qhp->wq.flushed) { + spin_unlock(&qhp->lock); + spin_unlock_irqrestore(&rchp->lock, flag); + return; + } + qhp->wq.flushed = 1; + + c4iw_flush_hw_cq(rchp); + c4iw_count_rcqes(&rchp->cq, &qhp->wq, &count); + rq_flushed = c4iw_flush_rq(&qhp->wq, &rchp->cq, count); + spin_unlock(&qhp->lock); + spin_unlock_irqrestore(&rchp->lock, flag); + + /* locking hierarchy: cq lock first, then qp lock. */ + spin_lock_irqsave(&schp->lock, flag); + spin_lock(&qhp->lock); + if (schp != rchp) + c4iw_flush_hw_cq(schp); + sq_flushed = c4iw_flush_sq(qhp); + spin_unlock(&qhp->lock); + spin_unlock_irqrestore(&schp->lock, flag); + + if (schp == rchp) { + if (t4_clear_cq_armed(&rchp->cq) && + (rq_flushed || sq_flushed)) { + spin_lock_irqsave(&rchp->comp_handler_lock, flag); + (*rchp->ibcq.comp_handler)(&rchp->ibcq, + rchp->ibcq.cq_context); + spin_unlock_irqrestore(&rchp->comp_handler_lock, flag); + } + } else { + if (t4_clear_cq_armed(&rchp->cq) && rq_flushed) { + spin_lock_irqsave(&rchp->comp_handler_lock, flag); + (*rchp->ibcq.comp_handler)(&rchp->ibcq, + rchp->ibcq.cq_context); + spin_unlock_irqrestore(&rchp->comp_handler_lock, flag); + } + if (t4_clear_cq_armed(&schp->cq) && sq_flushed) { + spin_lock_irqsave(&schp->comp_handler_lock, flag); + (*schp->ibcq.comp_handler)(&schp->ibcq, + schp->ibcq.cq_context); + spin_unlock_irqrestore(&schp->comp_handler_lock, flag); + } + } +} + +static void flush_qp(struct c4iw_qp *qhp) +{ + struct c4iw_cq *rchp, *schp; + unsigned long flag; + + rchp = to_c4iw_cq(qhp->ibqp.recv_cq); + schp = to_c4iw_cq(qhp->ibqp.send_cq); + + t4_set_wq_in_error(&qhp->wq); + if (qhp->ibqp.uobject) { + t4_set_cq_in_error(&rchp->cq); + spin_lock_irqsave(&rchp->comp_handler_lock, flag); + (*rchp->ibcq.comp_handler)(&rchp->ibcq, rchp->ibcq.cq_context); + spin_unlock_irqrestore(&rchp->comp_handler_lock, flag); + if (schp != rchp) { + t4_set_cq_in_error(&schp->cq); + spin_lock_irqsave(&schp->comp_handler_lock, flag); + (*schp->ibcq.comp_handler)(&schp->ibcq, + schp->ibcq.cq_context); + spin_unlock_irqrestore(&schp->comp_handler_lock, flag); + } + return; + } + __flush_qp(qhp, rchp, schp); +} + +static int rdma_fini(struct c4iw_dev *rhp, struct c4iw_qp *qhp, + struct c4iw_ep *ep) +{ + struct fw_ri_wr *wqe; + int ret; + struct sk_buff *skb; + + PDBG("%s qhp %p qid 0x%x tid %u\n", __func__, qhp, qhp->wq.sq.qid, + ep->hwtid); + + skb = alloc_skb(sizeof *wqe, GFP_KERNEL); + if (!skb) + return -ENOMEM; + set_wr_txq(skb, CPL_PRIORITY_DATA, ep->txq_idx); + + wqe = (struct fw_ri_wr *)__skb_put(skb, sizeof(*wqe)); + memset(wqe, 0, sizeof *wqe); + wqe->op_compl = cpu_to_be32( + FW_WR_OP(FW_RI_INIT_WR) | + FW_WR_COMPL(1)); + wqe->flowid_len16 = cpu_to_be32( + FW_WR_FLOWID(ep->hwtid) | + FW_WR_LEN16(DIV_ROUND_UP(sizeof *wqe, 16))); + wqe->cookie = (unsigned long) &ep->com.wr_wait; + + wqe->u.fini.type = FW_RI_TYPE_FINI; + ret = c4iw_ofld_send(&rhp->rdev, skb); + if (ret) + goto out; + + ret = c4iw_wait_for_reply(&rhp->rdev, &ep->com.wr_wait, qhp->ep->hwtid, + qhp->wq.sq.qid, __func__); +out: + PDBG("%s ret %d\n", __func__, ret); + return ret; +} + +static void build_rtr_msg(u8 p2p_type, struct fw_ri_init *init) +{ + PDBG("%s p2p_type = %d\n", __func__, p2p_type); + memset(&init->u, 0, sizeof init->u); + switch (p2p_type) { + case FW_RI_INIT_P2PTYPE_RDMA_WRITE: + init->u.write.opcode = FW_RI_RDMA_WRITE_WR; + init->u.write.stag_sink = cpu_to_be32(1); + init->u.write.to_sink = cpu_to_be64(1); + init->u.write.u.immd_src[0].op = FW_RI_DATA_IMMD; + init->u.write.len16 = DIV_ROUND_UP(sizeof init->u.write + + sizeof(struct fw_ri_immd), + 16); + break; + case FW_RI_INIT_P2PTYPE_READ_REQ: + init->u.write.opcode = FW_RI_RDMA_READ_WR; + init->u.read.stag_src = cpu_to_be32(1); + init->u.read.to_src_lo = cpu_to_be32(1); + init->u.read.stag_sink = cpu_to_be32(1); + init->u.read.to_sink_lo = cpu_to_be32(1); + init->u.read.len16 = DIV_ROUND_UP(sizeof init->u.read, 16); + break; + } +} + +static int rdma_init(struct c4iw_dev *rhp, struct c4iw_qp *qhp) +{ + struct fw_ri_wr *wqe; + int ret; + struct sk_buff *skb; + + PDBG("%s qhp %p qid 0x%x tid %u ird %u ord %u\n", __func__, qhp, + qhp->wq.sq.qid, qhp->ep->hwtid, qhp->ep->ird, qhp->ep->ord); + + skb = alloc_skb(sizeof *wqe, GFP_KERNEL); + if (!skb) { + ret = -ENOMEM; + goto out; + } + ret = alloc_ird(rhp, qhp->attr.max_ird); + if (ret) { + qhp->attr.max_ird = 0; + kfree_skb(skb); + goto out; + } + set_wr_txq(skb, CPL_PRIORITY_DATA, qhp->ep->txq_idx); + + wqe = (struct fw_ri_wr *)__skb_put(skb, sizeof(*wqe)); + memset(wqe, 0, sizeof *wqe); + wqe->op_compl = cpu_to_be32( + FW_WR_OP(FW_RI_INIT_WR) | + FW_WR_COMPL(1)); + wqe->flowid_len16 = cpu_to_be32( + FW_WR_FLOWID(qhp->ep->hwtid) | + FW_WR_LEN16(DIV_ROUND_UP(sizeof *wqe, 16))); + + wqe->cookie = (unsigned long) &qhp->ep->com.wr_wait; + + wqe->u.init.type = FW_RI_TYPE_INIT; + wqe->u.init.mpareqbit_p2ptype = + V_FW_RI_WR_MPAREQBIT(qhp->attr.mpa_attr.initiator) | + V_FW_RI_WR_P2PTYPE(qhp->attr.mpa_attr.p2p_type); + wqe->u.init.mpa_attrs = FW_RI_MPA_IETF_ENABLE; + if (qhp->attr.mpa_attr.recv_marker_enabled) + wqe->u.init.mpa_attrs |= FW_RI_MPA_RX_MARKER_ENABLE; + if (qhp->attr.mpa_attr.xmit_marker_enabled) + wqe->u.init.mpa_attrs |= FW_RI_MPA_TX_MARKER_ENABLE; + if (qhp->attr.mpa_attr.crc_enabled) + wqe->u.init.mpa_attrs |= FW_RI_MPA_CRC_ENABLE; + + wqe->u.init.qp_caps = FW_RI_QP_RDMA_READ_ENABLE | + FW_RI_QP_RDMA_WRITE_ENABLE | + FW_RI_QP_BIND_ENABLE; + if (!qhp->ibqp.uobject) + wqe->u.init.qp_caps |= FW_RI_QP_FAST_REGISTER_ENABLE | + FW_RI_QP_STAG0_ENABLE; + wqe->u.init.nrqe = cpu_to_be16(t4_rqes_posted(&qhp->wq)); + wqe->u.init.pdid = cpu_to_be32(qhp->attr.pd); + wqe->u.init.qpid = cpu_to_be32(qhp->wq.sq.qid); + wqe->u.init.sq_eqid = cpu_to_be32(qhp->wq.sq.qid); + wqe->u.init.rq_eqid = cpu_to_be32(qhp->wq.rq.qid); + wqe->u.init.scqid = cpu_to_be32(qhp->attr.scq); + wqe->u.init.rcqid = cpu_to_be32(qhp->attr.rcq); + wqe->u.init.ord_max = cpu_to_be32(qhp->attr.max_ord); + wqe->u.init.ird_max = cpu_to_be32(qhp->attr.max_ird); + wqe->u.init.iss = cpu_to_be32(qhp->ep->snd_seq); + wqe->u.init.irs = cpu_to_be32(qhp->ep->rcv_seq); + wqe->u.init.hwrqsize = cpu_to_be32(qhp->wq.rq.rqt_size); + wqe->u.init.hwrqaddr = cpu_to_be32(qhp->wq.rq.rqt_hwaddr - + rhp->rdev.lldi.vr->rq.start); + if (qhp->attr.mpa_attr.initiator) + build_rtr_msg(qhp->attr.mpa_attr.p2p_type, &wqe->u.init); + + ret = c4iw_ofld_send(&rhp->rdev, skb); + if (ret) + goto err1; + + ret = c4iw_wait_for_reply(&rhp->rdev, &qhp->ep->com.wr_wait, + qhp->ep->hwtid, qhp->wq.sq.qid, __func__); + if (!ret) + goto out; +err1: + free_ird(rhp, qhp->attr.max_ird); +out: + PDBG("%s ret %d\n", __func__, ret); + return ret; +} + +int c4iw_modify_qp(struct c4iw_dev *rhp, struct c4iw_qp *qhp, + enum c4iw_qp_attr_mask mask, + struct c4iw_qp_attributes *attrs, + int internal) +{ + int ret = 0; + struct c4iw_qp_attributes newattr = qhp->attr; + int disconnect = 0; + int terminate = 0; + int abort = 0; + int free = 0; + struct c4iw_ep *ep = NULL; + + PDBG("%s qhp %p sqid 0x%x rqid 0x%x ep %p state %d -> %d\n", __func__, + qhp, qhp->wq.sq.qid, qhp->wq.rq.qid, qhp->ep, qhp->attr.state, + (mask & C4IW_QP_ATTR_NEXT_STATE) ? attrs->next_state : -1); + + mutex_lock(&qhp->mutex); + + /* Process attr changes if in IDLE */ + if (mask & C4IW_QP_ATTR_VALID_MODIFY) { + if (qhp->attr.state != C4IW_QP_STATE_IDLE) { + ret = -EIO; + goto out; + } + if (mask & C4IW_QP_ATTR_ENABLE_RDMA_READ) + newattr.enable_rdma_read = attrs->enable_rdma_read; + if (mask & C4IW_QP_ATTR_ENABLE_RDMA_WRITE) + newattr.enable_rdma_write = attrs->enable_rdma_write; + if (mask & C4IW_QP_ATTR_ENABLE_RDMA_BIND) + newattr.enable_bind = attrs->enable_bind; + if (mask & C4IW_QP_ATTR_MAX_ORD) { + if (attrs->max_ord > c4iw_max_read_depth) { + ret = -EINVAL; + goto out; + } + newattr.max_ord = attrs->max_ord; + } + if (mask & C4IW_QP_ATTR_MAX_IRD) { + if (attrs->max_ird > cur_max_read_depth(rhp)) { + ret = -EINVAL; + goto out; + } + newattr.max_ird = attrs->max_ird; + } + qhp->attr = newattr; + } + + if (mask & C4IW_QP_ATTR_SQ_DB) { + ret = ring_kernel_sq_db(qhp, attrs->sq_db_inc); + goto out; + } + if (mask & C4IW_QP_ATTR_RQ_DB) { + ret = ring_kernel_rq_db(qhp, attrs->rq_db_inc); + goto out; + } + + if (!(mask & C4IW_QP_ATTR_NEXT_STATE)) + goto out; + if (qhp->attr.state == attrs->next_state) + goto out; + + switch (qhp->attr.state) { + case C4IW_QP_STATE_IDLE: + switch (attrs->next_state) { + case C4IW_QP_STATE_RTS: + if (!(mask & C4IW_QP_ATTR_LLP_STREAM_HANDLE)) { + ret = -EINVAL; + goto out; + } + if (!(mask & C4IW_QP_ATTR_MPA_ATTR)) { + ret = -EINVAL; + goto out; + } + qhp->attr.mpa_attr = attrs->mpa_attr; + qhp->attr.llp_stream_handle = attrs->llp_stream_handle; + qhp->ep = qhp->attr.llp_stream_handle; + set_state(qhp, C4IW_QP_STATE_RTS); + + /* + * Ref the endpoint here and deref when we + * disassociate the endpoint from the QP. This + * happens in CLOSING->IDLE transition or *->ERROR + * transition. + */ + c4iw_get_ep(&qhp->ep->com); + ret = rdma_init(rhp, qhp); + if (ret) + goto err; + break; + case C4IW_QP_STATE_ERROR: + set_state(qhp, C4IW_QP_STATE_ERROR); + flush_qp(qhp); + break; + default: + ret = -EINVAL; + goto out; + } + break; + case C4IW_QP_STATE_RTS: + switch (attrs->next_state) { + case C4IW_QP_STATE_CLOSING: + BUG_ON(atomic_read(&qhp->ep->com.kref.refcount) < 2); + t4_set_wq_in_error(&qhp->wq); + set_state(qhp, C4IW_QP_STATE_CLOSING); + ep = qhp->ep; + if (!internal) { + abort = 0; + disconnect = 1; + c4iw_get_ep(&qhp->ep->com); + } + ret = rdma_fini(rhp, qhp, ep); + if (ret) + goto err; + break; + case C4IW_QP_STATE_TERMINATE: + t4_set_wq_in_error(&qhp->wq); + set_state(qhp, C4IW_QP_STATE_TERMINATE); + qhp->attr.layer_etype = attrs->layer_etype; + qhp->attr.ecode = attrs->ecode; + ep = qhp->ep; + if (!internal) { + c4iw_get_ep(&qhp->ep->com); + terminate = 1; + disconnect = 1; + } else { + terminate = qhp->attr.send_term; + ret = rdma_fini(rhp, qhp, ep); + if (ret) + goto err; + } + break; + case C4IW_QP_STATE_ERROR: + t4_set_wq_in_error(&qhp->wq); + set_state(qhp, C4IW_QP_STATE_ERROR); + if (!internal) { + abort = 1; + disconnect = 1; + ep = qhp->ep; + c4iw_get_ep(&qhp->ep->com); + } + goto err; + break; + default: + ret = -EINVAL; + goto out; + } + break; + case C4IW_QP_STATE_CLOSING: + if (!internal) { + ret = -EINVAL; + goto out; + } + switch (attrs->next_state) { + case C4IW_QP_STATE_IDLE: + flush_qp(qhp); + set_state(qhp, C4IW_QP_STATE_IDLE); + qhp->attr.llp_stream_handle = NULL; + c4iw_put_ep(&qhp->ep->com); + qhp->ep = NULL; + wake_up(&qhp->wait); + break; + case C4IW_QP_STATE_ERROR: + goto err; + default: + ret = -EINVAL; + goto err; + } + break; + case C4IW_QP_STATE_ERROR: + if (attrs->next_state != C4IW_QP_STATE_IDLE) { + ret = -EINVAL; + goto out; + } + if (!t4_sq_empty(&qhp->wq) || !t4_rq_empty(&qhp->wq)) { + ret = -EINVAL; + goto out; + } + set_state(qhp, C4IW_QP_STATE_IDLE); + break; + case C4IW_QP_STATE_TERMINATE: + if (!internal) { + ret = -EINVAL; + goto out; + } + goto err; + break; + default: + printk(KERN_ERR "%s in a bad state %d\n", + __func__, qhp->attr.state); + ret = -EINVAL; + goto err; + break; + } + goto out; +err: + PDBG("%s disassociating ep %p qpid 0x%x\n", __func__, qhp->ep, + qhp->wq.sq.qid); + + /* disassociate the LLP connection */ + qhp->attr.llp_stream_handle = NULL; + if (!ep) + ep = qhp->ep; + qhp->ep = NULL; + set_state(qhp, C4IW_QP_STATE_ERROR); + free = 1; + abort = 1; + wake_up(&qhp->wait); + BUG_ON(!ep); + flush_qp(qhp); +out: + mutex_unlock(&qhp->mutex); + + if (terminate) + post_terminate(qhp, NULL, internal ? GFP_ATOMIC : GFP_KERNEL); + + /* + * If disconnect is 1, then we need to initiate a disconnect + * on the EP. This can be a normal close (RTS->CLOSING) or + * an abnormal close (RTS/CLOSING->ERROR). + */ + if (disconnect) { + c4iw_ep_disconnect(ep, abort, internal ? GFP_ATOMIC : + GFP_KERNEL); + c4iw_put_ep(&ep->com); + } + + /* + * If free is 1, then we've disassociated the EP from the QP + * and we need to dereference the EP. + */ + if (free) + c4iw_put_ep(&ep->com); + PDBG("%s exit state %d\n", __func__, qhp->attr.state); + return ret; +} + +int c4iw_destroy_qp(struct ib_qp *ib_qp) +{ + struct c4iw_dev *rhp; + struct c4iw_qp *qhp; + struct c4iw_qp_attributes attrs; + struct c4iw_ucontext *ucontext; + + qhp = to_c4iw_qp(ib_qp); + rhp = qhp->rhp; + + attrs.next_state = C4IW_QP_STATE_ERROR; + if (qhp->attr.state == C4IW_QP_STATE_TERMINATE) + c4iw_modify_qp(rhp, qhp, C4IW_QP_ATTR_NEXT_STATE, &attrs, 1); + else + c4iw_modify_qp(rhp, qhp, C4IW_QP_ATTR_NEXT_STATE, &attrs, 0); + wait_event(qhp->wait, !qhp->ep); + + remove_handle(rhp, &rhp->qpidr, qhp->wq.sq.qid); + atomic_dec(&qhp->refcnt); + wait_event(qhp->wait, !atomic_read(&qhp->refcnt)); + + spin_lock_irq(&rhp->lock); + if (!list_empty(&qhp->db_fc_entry)) + list_del_init(&qhp->db_fc_entry); + spin_unlock_irq(&rhp->lock); + free_ird(rhp, qhp->attr.max_ird); + + ucontext = ib_qp->uobject ? + to_c4iw_ucontext(ib_qp->uobject->context) : NULL; + destroy_qp(&rhp->rdev, &qhp->wq, + ucontext ? &ucontext->uctx : &rhp->rdev.uctx); + + PDBG("%s ib_qp %p qpid 0x%0x\n", __func__, ib_qp, qhp->wq.sq.qid); + kfree(qhp); + return 0; +} + +struct ib_qp *c4iw_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *attrs, + struct ib_udata *udata) +{ + struct c4iw_dev *rhp; + struct c4iw_qp *qhp; + struct c4iw_pd *php; + struct c4iw_cq *schp; + struct c4iw_cq *rchp; + struct c4iw_create_qp_resp uresp; + unsigned int sqsize, rqsize; + struct c4iw_ucontext *ucontext; + int ret; + struct c4iw_mm_entry *mm1, *mm2, *mm3, *mm4, *mm5 = NULL; + + PDBG("%s ib_pd %p\n", __func__, pd); + + if (attrs->qp_type != IB_QPT_RC) + return ERR_PTR(-EINVAL); + + php = to_c4iw_pd(pd); + rhp = php->rhp; + schp = get_chp(rhp, ((struct c4iw_cq *)attrs->send_cq)->cq.cqid); + rchp = get_chp(rhp, ((struct c4iw_cq *)attrs->recv_cq)->cq.cqid); + if (!schp || !rchp) + return ERR_PTR(-EINVAL); + + if (attrs->cap.max_inline_data > T4_MAX_SEND_INLINE) + return ERR_PTR(-EINVAL); + + if (attrs->cap.max_recv_wr > rhp->rdev.hw_queue.t4_max_rq_size) + return ERR_PTR(-E2BIG); + rqsize = attrs->cap.max_recv_wr + 1; + if (rqsize < 8) + rqsize = 8; + + if (attrs->cap.max_send_wr > rhp->rdev.hw_queue.t4_max_sq_size) + return ERR_PTR(-E2BIG); + sqsize = attrs->cap.max_send_wr + 1; + if (sqsize < 8) + sqsize = 8; + + ucontext = pd->uobject ? to_c4iw_ucontext(pd->uobject->context) : NULL; + + qhp = kzalloc(sizeof(*qhp), GFP_KERNEL); + if (!qhp) + return ERR_PTR(-ENOMEM); + qhp->wq.sq.size = sqsize; + qhp->wq.sq.memsize = + (sqsize + rhp->rdev.hw_queue.t4_eq_status_entries) * + sizeof(*qhp->wq.sq.queue) + 16 * sizeof(__be64); + qhp->wq.sq.flush_cidx = -1; + qhp->wq.rq.size = rqsize; + qhp->wq.rq.memsize = + (rqsize + rhp->rdev.hw_queue.t4_eq_status_entries) * + sizeof(*qhp->wq.rq.queue); + + if (ucontext) { + qhp->wq.sq.memsize = roundup(qhp->wq.sq.memsize, PAGE_SIZE); + qhp->wq.rq.memsize = roundup(qhp->wq.rq.memsize, PAGE_SIZE); + } + + ret = create_qp(&rhp->rdev, &qhp->wq, &schp->cq, &rchp->cq, + ucontext ? &ucontext->uctx : &rhp->rdev.uctx); + if (ret) + goto err1; + + attrs->cap.max_recv_wr = rqsize - 1; + attrs->cap.max_send_wr = sqsize - 1; + attrs->cap.max_inline_data = T4_MAX_SEND_INLINE; + + qhp->rhp = rhp; + qhp->attr.pd = php->pdid; + qhp->attr.scq = ((struct c4iw_cq *) attrs->send_cq)->cq.cqid; + qhp->attr.rcq = ((struct c4iw_cq *) attrs->recv_cq)->cq.cqid; + qhp->attr.sq_num_entries = attrs->cap.max_send_wr; + qhp->attr.rq_num_entries = attrs->cap.max_recv_wr; + qhp->attr.sq_max_sges = attrs->cap.max_send_sge; + qhp->attr.sq_max_sges_rdma_write = attrs->cap.max_send_sge; + qhp->attr.rq_max_sges = attrs->cap.max_recv_sge; + qhp->attr.state = C4IW_QP_STATE_IDLE; + qhp->attr.next_state = C4IW_QP_STATE_IDLE; + qhp->attr.enable_rdma_read = 1; + qhp->attr.enable_rdma_write = 1; + qhp->attr.enable_bind = 1; + qhp->attr.max_ord = 0; + qhp->attr.max_ird = 0; + qhp->sq_sig_all = attrs->sq_sig_type == IB_SIGNAL_ALL_WR; + spin_lock_init(&qhp->lock); + mutex_init(&qhp->mutex); + init_waitqueue_head(&qhp->wait); + atomic_set(&qhp->refcnt, 1); + + ret = insert_handle(rhp, &rhp->qpidr, qhp, qhp->wq.sq.qid); + if (ret) + goto err2; + + if (udata) { + mm1 = kmalloc(sizeof *mm1, GFP_KERNEL); + if (!mm1) { + ret = -ENOMEM; + goto err3; + } + mm2 = kmalloc(sizeof *mm2, GFP_KERNEL); + if (!mm2) { + ret = -ENOMEM; + goto err4; + } + mm3 = kmalloc(sizeof *mm3, GFP_KERNEL); + if (!mm3) { + ret = -ENOMEM; + goto err5; + } + mm4 = kmalloc(sizeof *mm4, GFP_KERNEL); + if (!mm4) { + ret = -ENOMEM; + goto err6; + } + if (t4_sq_onchip(&qhp->wq.sq)) { + mm5 = kmalloc(sizeof *mm5, GFP_KERNEL); + if (!mm5) { + ret = -ENOMEM; + goto err7; + } + uresp.flags = C4IW_QPF_ONCHIP; + } else + uresp.flags = 0; + uresp.qid_mask = rhp->rdev.qpmask; + uresp.sqid = qhp->wq.sq.qid; + uresp.sq_size = qhp->wq.sq.size; + uresp.sq_memsize = qhp->wq.sq.memsize; + uresp.rqid = qhp->wq.rq.qid; + uresp.rq_size = qhp->wq.rq.size; + uresp.rq_memsize = qhp->wq.rq.memsize; + spin_lock(&ucontext->mmap_lock); + if (mm5) { + uresp.ma_sync_key = ucontext->key; + ucontext->key += PAGE_SIZE; + } else { + uresp.ma_sync_key = 0; + } + uresp.sq_key = ucontext->key; + ucontext->key += PAGE_SIZE; + uresp.rq_key = ucontext->key; + ucontext->key += PAGE_SIZE; + uresp.sq_db_gts_key = ucontext->key; + ucontext->key += PAGE_SIZE; + uresp.rq_db_gts_key = ucontext->key; + ucontext->key += PAGE_SIZE; + spin_unlock(&ucontext->mmap_lock); + ret = ib_copy_to_udata(udata, &uresp, sizeof uresp); + if (ret) + goto err8; + mm1->key = uresp.sq_key; + mm1->addr = qhp->wq.sq.phys_addr; + mm1->len = PAGE_ALIGN(qhp->wq.sq.memsize); + insert_mmap(ucontext, mm1); + mm2->key = uresp.rq_key; + mm2->addr = virt_to_phys(qhp->wq.rq.queue); + mm2->len = PAGE_ALIGN(qhp->wq.rq.memsize); + insert_mmap(ucontext, mm2); + mm3->key = uresp.sq_db_gts_key; + mm3->addr = (__force unsigned long) qhp->wq.sq.udb; + mm3->len = PAGE_SIZE; + insert_mmap(ucontext, mm3); + mm4->key = uresp.rq_db_gts_key; + mm4->addr = (__force unsigned long) qhp->wq.rq.udb; + mm4->len = PAGE_SIZE; + insert_mmap(ucontext, mm4); + if (mm5) { + mm5->key = uresp.ma_sync_key; + mm5->addr = (pci_resource_start(rhp->rdev.lldi.pdev, 0) + + A_PCIE_MA_SYNC) & PAGE_MASK; + mm5->len = PAGE_SIZE; + insert_mmap(ucontext, mm5); + } + } + qhp->ibqp.qp_num = qhp->wq.sq.qid; + init_timer(&(qhp->timer)); + INIT_LIST_HEAD(&qhp->db_fc_entry); + PDBG("%s sq id %u size %u memsize %zu num_entries %u " + "rq id %u size %u memsize %zu num_entries %u\n", __func__, + qhp->wq.sq.qid, qhp->wq.sq.size, qhp->wq.sq.memsize, + attrs->cap.max_send_wr, qhp->wq.rq.qid, qhp->wq.rq.size, + qhp->wq.rq.memsize, attrs->cap.max_recv_wr); + return &qhp->ibqp; +err8: + kfree(mm5); +err7: + kfree(mm4); +err6: + kfree(mm3); +err5: + kfree(mm2); +err4: + kfree(mm1); +err3: + remove_handle(rhp, &rhp->qpidr, qhp->wq.sq.qid); +err2: + destroy_qp(&rhp->rdev, &qhp->wq, + ucontext ? &ucontext->uctx : &rhp->rdev.uctx); +err1: + kfree(qhp); + return ERR_PTR(ret); +} + +int c4iw_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata) +{ + struct c4iw_dev *rhp; + struct c4iw_qp *qhp; + enum c4iw_qp_attr_mask mask = 0; + struct c4iw_qp_attributes attrs; + + PDBG("%s ib_qp %p\n", __func__, ibqp); + + /* iwarp does not support the RTR state */ + if ((attr_mask & IB_QP_STATE) && (attr->qp_state == IB_QPS_RTR)) + attr_mask &= ~IB_QP_STATE; + + /* Make sure we still have something left to do */ + if (!attr_mask) + return 0; + + memset(&attrs, 0, sizeof attrs); + qhp = to_c4iw_qp(ibqp); + rhp = qhp->rhp; + + attrs.next_state = c4iw_convert_state(attr->qp_state); + attrs.enable_rdma_read = (attr->qp_access_flags & + IB_ACCESS_REMOTE_READ) ? 1 : 0; + attrs.enable_rdma_write = (attr->qp_access_flags & + IB_ACCESS_REMOTE_WRITE) ? 1 : 0; + attrs.enable_bind = (attr->qp_access_flags & IB_ACCESS_MW_BIND) ? 1 : 0; + + + mask |= (attr_mask & IB_QP_STATE) ? C4IW_QP_ATTR_NEXT_STATE : 0; + mask |= (attr_mask & IB_QP_ACCESS_FLAGS) ? + (C4IW_QP_ATTR_ENABLE_RDMA_READ | + C4IW_QP_ATTR_ENABLE_RDMA_WRITE | + C4IW_QP_ATTR_ENABLE_RDMA_BIND) : 0; + + /* + * Use SQ_PSN and RQ_PSN to pass in IDX_INC values for + * ringing the queue db when we're in DB_FULL mode. + * Only allow this on T4 devices. + */ + attrs.sq_db_inc = attr->sq_psn; + attrs.rq_db_inc = attr->rq_psn; + mask |= (attr_mask & IB_QP_SQ_PSN) ? C4IW_QP_ATTR_SQ_DB : 0; + mask |= (attr_mask & IB_QP_RQ_PSN) ? C4IW_QP_ATTR_RQ_DB : 0; + if (is_t5(to_c4iw_qp(ibqp)->rhp->rdev.lldi.adapter_type) && + (mask & (C4IW_QP_ATTR_SQ_DB|C4IW_QP_ATTR_RQ_DB))) + return -EINVAL; + + return c4iw_modify_qp(rhp, qhp, mask, &attrs, 0); +} + +struct ib_qp *c4iw_get_qp(struct ib_device *dev, int qpn) +{ + PDBG("%s ib_dev %p qpn 0x%x\n", __func__, dev, qpn); + return (struct ib_qp *)get_qhp(to_c4iw_dev(dev), qpn); +} + +int c4iw_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_qp_init_attr *init_attr) +{ + struct c4iw_qp *qhp = to_c4iw_qp(ibqp); + + memset(attr, 0, sizeof *attr); + memset(init_attr, 0, sizeof *init_attr); + attr->qp_state = to_ib_qp_state(qhp->attr.state); + init_attr->cap.max_send_wr = qhp->attr.sq_num_entries; + init_attr->cap.max_recv_wr = qhp->attr.rq_num_entries; + init_attr->cap.max_send_sge = qhp->attr.sq_max_sges; + init_attr->cap.max_recv_sge = qhp->attr.sq_max_sges; + init_attr->cap.max_inline_data = T4_MAX_SEND_INLINE; + init_attr->sq_sig_type = qhp->sq_sig_all ? IB_SIGNAL_ALL_WR : 0; + return 0; +} diff --git a/drivers/infiniband/hw/cxgb4/resource.c b/drivers/infiniband/hw/cxgb4/resource.c new file mode 100644 index 0000000..67df71a --- /dev/null +++ b/drivers/infiniband/hw/cxgb4/resource.c @@ -0,0 +1,453 @@ +/* + * Copyright (c) 2009-2010 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +/* Crude resource management */ +#include +#include +#include +#include "iw_cxgb4.h" + +static int c4iw_init_qid_table(struct c4iw_rdev *rdev) +{ + u32 i; + + if (c4iw_id_table_alloc(&rdev->resource.qid_table, + rdev->lldi.vr->qp.start, + rdev->lldi.vr->qp.size, + rdev->lldi.vr->qp.size, 0)) + return -ENOMEM; + + for (i = rdev->lldi.vr->qp.start; + i < rdev->lldi.vr->qp.start + rdev->lldi.vr->qp.size; i++) + if (!(i & rdev->qpmask)) + c4iw_id_free(&rdev->resource.qid_table, i); + return 0; +} + +/* nr_* must be power of 2 */ +int c4iw_init_resource(struct c4iw_rdev *rdev, u32 nr_tpt, u32 nr_pdid) +{ + int err = 0; + err = c4iw_id_table_alloc(&rdev->resource.tpt_table, 0, nr_tpt, 1, + C4IW_ID_TABLE_F_RANDOM); + if (err) + goto tpt_err; + err = c4iw_init_qid_table(rdev); + if (err) + goto qid_err; + err = c4iw_id_table_alloc(&rdev->resource.pdid_table, 0, + nr_pdid, 1, 0); + if (err) + goto pdid_err; + return 0; + pdid_err: + c4iw_id_table_free(&rdev->resource.qid_table); + qid_err: + c4iw_id_table_free(&rdev->resource.tpt_table); + tpt_err: + return -ENOMEM; +} + +/* + * returns 0 if no resource available + */ +u32 c4iw_get_resource(struct c4iw_id_table *id_table) +{ + u32 entry; + entry = c4iw_id_alloc(id_table); + if (entry == (u32)(-1)) + return 0; + return entry; +} + +void c4iw_put_resource(struct c4iw_id_table *id_table, u32 entry) +{ + PDBG("%s entry 0x%x\n", __func__, entry); + c4iw_id_free(id_table, entry); +} + +u32 c4iw_get_cqid(struct c4iw_rdev *rdev, struct c4iw_dev_ucontext *uctx) +{ + struct c4iw_qid_list *entry; + u32 qid; + int i; + + mutex_lock(&uctx->lock); + if (!list_empty(&uctx->cqids)) { + entry = list_entry(uctx->cqids.next, struct c4iw_qid_list, + entry); + list_del(&entry->entry); + qid = entry->qid; + kfree(entry); + } else { + qid = c4iw_get_resource(&rdev->resource.qid_table); + if (!qid) + goto out; + mutex_lock(&rdev->stats.lock); + rdev->stats.qid.cur += rdev->qpmask + 1; + mutex_unlock(&rdev->stats.lock); + for (i = qid+1; i & rdev->qpmask; i++) { + entry = kmalloc(sizeof *entry, GFP_KERNEL); + if (!entry) + goto out; + entry->qid = i; + list_add_tail(&entry->entry, &uctx->cqids); + } + + /* + * now put the same ids on the qp list since they all + * map to the same db/gts page. + */ + entry = kmalloc(sizeof *entry, GFP_KERNEL); + if (!entry) + goto out; + entry->qid = qid; + list_add_tail(&entry->entry, &uctx->qpids); + for (i = qid+1; i & rdev->qpmask; i++) { + entry = kmalloc(sizeof *entry, GFP_KERNEL); + if (!entry) + goto out; + entry->qid = i; + list_add_tail(&entry->entry, &uctx->qpids); + } + } +out: + mutex_unlock(&uctx->lock); + PDBG("%s qid 0x%x\n", __func__, qid); + mutex_lock(&rdev->stats.lock); + if (rdev->stats.qid.cur > rdev->stats.qid.max) + rdev->stats.qid.max = rdev->stats.qid.cur; + mutex_unlock(&rdev->stats.lock); + return qid; +} + +void c4iw_put_cqid(struct c4iw_rdev *rdev, u32 qid, + struct c4iw_dev_ucontext *uctx) +{ + struct c4iw_qid_list *entry; + + entry = kmalloc(sizeof *entry, GFP_KERNEL); + if (!entry) + return; + PDBG("%s qid 0x%x\n", __func__, qid); + entry->qid = qid; + mutex_lock(&uctx->lock); + list_add_tail(&entry->entry, &uctx->cqids); + mutex_unlock(&uctx->lock); +} + +u32 c4iw_get_qpid(struct c4iw_rdev *rdev, struct c4iw_dev_ucontext *uctx) +{ + struct c4iw_qid_list *entry; + u32 qid; + int i; + + mutex_lock(&uctx->lock); + if (!list_empty(&uctx->qpids)) { + entry = list_entry(uctx->qpids.next, struct c4iw_qid_list, + entry); + list_del(&entry->entry); + qid = entry->qid; + kfree(entry); + } else { + qid = c4iw_get_resource(&rdev->resource.qid_table); + if (!qid) { + mutex_lock(&rdev->stats.lock); + rdev->stats.qid.fail++; + mutex_unlock(&rdev->stats.lock); + goto out; + } + mutex_lock(&rdev->stats.lock); + rdev->stats.qid.cur += rdev->qpmask + 1; + mutex_unlock(&rdev->stats.lock); + for (i = qid+1; i & rdev->qpmask; i++) { + entry = kmalloc(sizeof *entry, GFP_KERNEL); + if (!entry) + goto out; + entry->qid = i; + list_add_tail(&entry->entry, &uctx->qpids); + } + + /* + * now put the same ids on the cq list since they all + * map to the same db/gts page. + */ + entry = kmalloc(sizeof *entry, GFP_KERNEL); + if (!entry) + goto out; + entry->qid = qid; + list_add_tail(&entry->entry, &uctx->cqids); + for (i = qid; i & rdev->qpmask; i++) { + entry = kmalloc(sizeof *entry, GFP_KERNEL); + if (!entry) + goto out; + entry->qid = i; + list_add_tail(&entry->entry, &uctx->cqids); + } + } +out: + mutex_unlock(&uctx->lock); + PDBG("%s qid 0x%x\n", __func__, qid); + mutex_lock(&rdev->stats.lock); + if (rdev->stats.qid.cur > rdev->stats.qid.max) + rdev->stats.qid.max = rdev->stats.qid.cur; + mutex_unlock(&rdev->stats.lock); + return qid; +} + +void c4iw_put_qpid(struct c4iw_rdev *rdev, u32 qid, + struct c4iw_dev_ucontext *uctx) +{ + struct c4iw_qid_list *entry; + + entry = kmalloc(sizeof *entry, GFP_KERNEL); + if (!entry) + return; + PDBG("%s qid 0x%x\n", __func__, qid); + entry->qid = qid; + mutex_lock(&uctx->lock); + list_add_tail(&entry->entry, &uctx->qpids); + mutex_unlock(&uctx->lock); +} + +void c4iw_destroy_resource(struct c4iw_resource *rscp) +{ + c4iw_id_table_free(&rscp->tpt_table); + c4iw_id_table_free(&rscp->qid_table); + c4iw_id_table_free(&rscp->pdid_table); +} + +/* + * PBL Memory Manager. Uses Linux generic allocator. + */ + +#define MIN_PBL_SHIFT 8 /* 256B == min PBL size (32 entries) */ + +u32 c4iw_pblpool_alloc(struct c4iw_rdev *rdev, int size) +{ + unsigned long addr = gen_pool_alloc(rdev->pbl_pool, size); + PDBG("%s addr 0x%x size %d\n", __func__, (u32)addr, size); + mutex_lock(&rdev->stats.lock); + if (addr) { + rdev->stats.pbl.cur += roundup(size, 1 << MIN_PBL_SHIFT); + if (rdev->stats.pbl.cur > rdev->stats.pbl.max) + rdev->stats.pbl.max = rdev->stats.pbl.cur; + } else + rdev->stats.pbl.fail++; + mutex_unlock(&rdev->stats.lock); + return (u32)addr; +} + +void c4iw_pblpool_free(struct c4iw_rdev *rdev, u32 addr, int size) +{ + PDBG("%s addr 0x%x size %d\n", __func__, addr, size); + mutex_lock(&rdev->stats.lock); + rdev->stats.pbl.cur -= roundup(size, 1 << MIN_PBL_SHIFT); + mutex_unlock(&rdev->stats.lock); + gen_pool_free(rdev->pbl_pool, (unsigned long)addr, size); +} + +int c4iw_pblpool_create(struct c4iw_rdev *rdev) +{ + unsigned pbl_start, pbl_chunk, pbl_top; + + rdev->pbl_pool = gen_pool_create(MIN_PBL_SHIFT, -1); + if (!rdev->pbl_pool) + return -ENOMEM; + + pbl_start = rdev->lldi.vr->pbl.start; + pbl_chunk = rdev->lldi.vr->pbl.size; + pbl_top = pbl_start + pbl_chunk; + + while (pbl_start < pbl_top) { + pbl_chunk = min(pbl_top - pbl_start + 1, pbl_chunk); + if (gen_pool_add(rdev->pbl_pool, pbl_start, pbl_chunk, -1)) { + PDBG("%s failed to add PBL chunk (%x/%x)\n", + __func__, pbl_start, pbl_chunk); + if (pbl_chunk <= 1024 << MIN_PBL_SHIFT) { + printk(KERN_WARNING MOD + "Failed to add all PBL chunks (%x/%x)\n", + pbl_start, + pbl_top - pbl_start); + return 0; + } + pbl_chunk >>= 1; + } else { + PDBG("%s added PBL chunk (%x/%x)\n", + __func__, pbl_start, pbl_chunk); + pbl_start += pbl_chunk; + } + } + + return 0; +} + +void c4iw_pblpool_destroy(struct c4iw_rdev *rdev) +{ + gen_pool_destroy(rdev->pbl_pool); +} + +/* + * RQT Memory Manager. Uses Linux generic allocator. + */ + +#define MIN_RQT_SHIFT 10 /* 1KB == min RQT size (16 entries) */ + +u32 c4iw_rqtpool_alloc(struct c4iw_rdev *rdev, int size) +{ + unsigned long addr = gen_pool_alloc(rdev->rqt_pool, size << 6); + PDBG("%s addr 0x%x size %d\n", __func__, (u32)addr, size << 6); + if (!addr) + pr_warn_ratelimited(MOD "%s: Out of RQT memory\n", + pci_name(rdev->lldi.pdev)); + mutex_lock(&rdev->stats.lock); + if (addr) { + rdev->stats.rqt.cur += roundup(size << 6, 1 << MIN_RQT_SHIFT); + if (rdev->stats.rqt.cur > rdev->stats.rqt.max) + rdev->stats.rqt.max = rdev->stats.rqt.cur; + } else + rdev->stats.rqt.fail++; + mutex_unlock(&rdev->stats.lock); + return (u32)addr; +} + +void c4iw_rqtpool_free(struct c4iw_rdev *rdev, u32 addr, int size) +{ + PDBG("%s addr 0x%x size %d\n", __func__, addr, size << 6); + mutex_lock(&rdev->stats.lock); + rdev->stats.rqt.cur -= roundup(size << 6, 1 << MIN_RQT_SHIFT); + mutex_unlock(&rdev->stats.lock); + gen_pool_free(rdev->rqt_pool, (unsigned long)addr, size << 6); +} + +int c4iw_rqtpool_create(struct c4iw_rdev *rdev) +{ + unsigned rqt_start, rqt_chunk, rqt_top; + + rdev->rqt_pool = gen_pool_create(MIN_RQT_SHIFT, -1); + if (!rdev->rqt_pool) + return -ENOMEM; + + rqt_start = rdev->lldi.vr->rq.start; + rqt_chunk = rdev->lldi.vr->rq.size; + rqt_top = rqt_start + rqt_chunk; + + while (rqt_start < rqt_top) { + rqt_chunk = min(rqt_top - rqt_start + 1, rqt_chunk); + if (gen_pool_add(rdev->rqt_pool, rqt_start, rqt_chunk, -1)) { + PDBG("%s failed to add RQT chunk (%x/%x)\n", + __func__, rqt_start, rqt_chunk); + if (rqt_chunk <= 1024 << MIN_RQT_SHIFT) { + printk(KERN_WARNING MOD + "Failed to add all RQT chunks (%x/%x)\n", + rqt_start, rqt_top - rqt_start); + return 0; + } + rqt_chunk >>= 1; + } else { + PDBG("%s added RQT chunk (%x/%x)\n", + __func__, rqt_start, rqt_chunk); + rqt_start += rqt_chunk; + } + } + return 0; +} + +void c4iw_rqtpool_destroy(struct c4iw_rdev *rdev) +{ + gen_pool_destroy(rdev->rqt_pool); +} + +/* + * On-Chip QP Memory. + */ +#define MIN_OCQP_SHIFT 12 /* 4KB == min ocqp size */ + +u32 c4iw_ocqp_pool_alloc(struct c4iw_rdev *rdev, int size) +{ + unsigned long addr = gen_pool_alloc(rdev->ocqp_pool, size); + PDBG("%s addr 0x%x size %d\n", __func__, (u32)addr, size); + if (addr) { + mutex_lock(&rdev->stats.lock); + rdev->stats.ocqp.cur += roundup(size, 1 << MIN_OCQP_SHIFT); + if (rdev->stats.ocqp.cur > rdev->stats.ocqp.max) + rdev->stats.ocqp.max = rdev->stats.ocqp.cur; + mutex_unlock(&rdev->stats.lock); + } + return (u32)addr; +} + +void c4iw_ocqp_pool_free(struct c4iw_rdev *rdev, u32 addr, int size) +{ + PDBG("%s addr 0x%x size %d\n", __func__, addr, size); + mutex_lock(&rdev->stats.lock); + rdev->stats.ocqp.cur -= roundup(size, 1 << MIN_OCQP_SHIFT); + mutex_unlock(&rdev->stats.lock); + gen_pool_free(rdev->ocqp_pool, (unsigned long)addr, size); +} + +int c4iw_ocqp_pool_create(struct c4iw_rdev *rdev) +{ + unsigned start, chunk, top; + + rdev->ocqp_pool = gen_pool_create(MIN_OCQP_SHIFT, -1); + if (!rdev->ocqp_pool) + return -ENOMEM; + + start = rdev->lldi.vr->ocq.start; + chunk = rdev->lldi.vr->ocq.size; + top = start + chunk; + + while (start < top) { + chunk = min(top - start + 1, chunk); + if (gen_pool_add(rdev->ocqp_pool, start, chunk, -1)) { + PDBG("%s failed to add OCQP chunk (%x/%x)\n", + __func__, start, chunk); + if (chunk <= 1024 << MIN_OCQP_SHIFT) { + printk(KERN_WARNING MOD + "Failed to add all OCQP chunks (%x/%x)\n", + start, top - start); + return 0; + } + chunk >>= 1; + } else { + PDBG("%s added OCQP chunk (%x/%x)\n", + __func__, start, chunk); + start += chunk; + } + } + return 0; +} + +void c4iw_ocqp_pool_destroy(struct c4iw_rdev *rdev) +{ + gen_pool_destroy(rdev->ocqp_pool); +} diff --git a/drivers/infiniband/hw/cxgb4/t4.h b/drivers/infiniband/hw/cxgb4/t4.h new file mode 100644 index 0000000..c04e513 --- /dev/null +++ b/drivers/infiniband/hw/cxgb4/t4.h @@ -0,0 +1,684 @@ +/* + * Copyright (c) 2009-2010 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __T4_H__ +#define __T4_H__ + +#include "t4_hw.h" +#include "t4_regs.h" +#include "t4_msg.h" +#include "t4fw_ri_api.h" + +#define T4_MAX_NUM_PD 65536 +#define T4_MAX_MR_SIZE (~0ULL) +#define T4_PAGESIZE_MASK 0xffff000 /* 4KB-128MB */ +#define T4_STAG_UNSET 0xffffffff +#define T4_FW_MAJ 0 +#define A_PCIE_MA_SYNC 0x30b4 + +struct t4_status_page { + __be32 rsvd1; /* flit 0 - hw owns */ + __be16 rsvd2; + __be16 qid; + __be16 cidx; + __be16 pidx; + u8 qp_err; /* flit 1 - sw owns */ + u8 db_off; + u8 pad; + u16 host_wq_pidx; + u16 host_cidx; + u16 host_pidx; +}; + +#define T4_EQ_ENTRY_SIZE 64 + +#define T4_SQ_NUM_SLOTS 5 +#define T4_SQ_NUM_BYTES (T4_EQ_ENTRY_SIZE * T4_SQ_NUM_SLOTS) +#define T4_MAX_SEND_SGE ((T4_SQ_NUM_BYTES - sizeof(struct fw_ri_send_wr) - \ + sizeof(struct fw_ri_isgl)) / sizeof(struct fw_ri_sge)) +#define T4_MAX_SEND_INLINE ((T4_SQ_NUM_BYTES - sizeof(struct fw_ri_send_wr) - \ + sizeof(struct fw_ri_immd))) +#define T4_MAX_WRITE_INLINE ((T4_SQ_NUM_BYTES - \ + sizeof(struct fw_ri_rdma_write_wr) - \ + sizeof(struct fw_ri_immd))) +#define T4_MAX_WRITE_SGE ((T4_SQ_NUM_BYTES - \ + sizeof(struct fw_ri_rdma_write_wr) - \ + sizeof(struct fw_ri_isgl)) / sizeof(struct fw_ri_sge)) +#define T4_MAX_FR_IMMD ((T4_SQ_NUM_BYTES - sizeof(struct fw_ri_fr_nsmr_wr) - \ + sizeof(struct fw_ri_immd)) & ~31UL) +#define T4_MAX_FR_IMMD_DEPTH (T4_MAX_FR_IMMD / sizeof(u64)) +#define T4_MAX_FR_DSGL 1024 +#define T4_MAX_FR_DSGL_DEPTH (T4_MAX_FR_DSGL / sizeof(u64)) + +static inline int t4_max_fr_depth(int use_dsgl) +{ + return use_dsgl ? T4_MAX_FR_DSGL_DEPTH : T4_MAX_FR_IMMD_DEPTH; +} + +#define T4_RQ_NUM_SLOTS 2 +#define T4_RQ_NUM_BYTES (T4_EQ_ENTRY_SIZE * T4_RQ_NUM_SLOTS) +#define T4_MAX_RECV_SGE 4 + +union t4_wr { + struct fw_ri_res_wr res; + struct fw_ri_wr ri; + struct fw_ri_rdma_write_wr write; + struct fw_ri_send_wr send; + struct fw_ri_rdma_read_wr read; + struct fw_ri_bind_mw_wr bind; + struct fw_ri_fr_nsmr_wr fr; + struct fw_ri_inv_lstag_wr inv; + struct t4_status_page status; + __be64 flits[T4_EQ_ENTRY_SIZE / sizeof(__be64) * T4_SQ_NUM_SLOTS]; +}; + +union t4_recv_wr { + struct fw_ri_recv_wr recv; + struct t4_status_page status; + __be64 flits[T4_EQ_ENTRY_SIZE / sizeof(__be64) * T4_RQ_NUM_SLOTS]; +}; + +static inline void init_wr_hdr(union t4_wr *wqe, u16 wrid, + enum fw_wr_opcodes opcode, u8 flags, u8 len16) +{ + wqe->send.opcode = (u8)opcode; + wqe->send.flags = flags; + wqe->send.wrid = wrid; + wqe->send.r1[0] = 0; + wqe->send.r1[1] = 0; + wqe->send.r1[2] = 0; + wqe->send.len16 = len16; +} + +/* CQE/AE status codes */ +#define T4_ERR_SUCCESS 0x0 +#define T4_ERR_STAG 0x1 /* STAG invalid: either the */ + /* STAG is offlimt, being 0, */ + /* or STAG_key mismatch */ +#define T4_ERR_PDID 0x2 /* PDID mismatch */ +#define T4_ERR_QPID 0x3 /* QPID mismatch */ +#define T4_ERR_ACCESS 0x4 /* Invalid access right */ +#define T4_ERR_WRAP 0x5 /* Wrap error */ +#define T4_ERR_BOUND 0x6 /* base and bounds voilation */ +#define T4_ERR_INVALIDATE_SHARED_MR 0x7 /* attempt to invalidate a */ + /* shared memory region */ +#define T4_ERR_INVALIDATE_MR_WITH_MW_BOUND 0x8 /* attempt to invalidate a */ + /* shared memory region */ +#define T4_ERR_ECC 0x9 /* ECC error detected */ +#define T4_ERR_ECC_PSTAG 0xA /* ECC error detected when */ + /* reading PSTAG for a MW */ + /* Invalidate */ +#define T4_ERR_PBL_ADDR_BOUND 0xB /* pbl addr out of bounds: */ + /* software error */ +#define T4_ERR_SWFLUSH 0xC /* SW FLUSHED */ +#define T4_ERR_CRC 0x10 /* CRC error */ +#define T4_ERR_MARKER 0x11 /* Marker error */ +#define T4_ERR_PDU_LEN_ERR 0x12 /* invalid PDU length */ +#define T4_ERR_OUT_OF_RQE 0x13 /* out of RQE */ +#define T4_ERR_DDP_VERSION 0x14 /* wrong DDP version */ +#define T4_ERR_RDMA_VERSION 0x15 /* wrong RDMA version */ +#define T4_ERR_OPCODE 0x16 /* invalid rdma opcode */ +#define T4_ERR_DDP_QUEUE_NUM 0x17 /* invalid ddp queue number */ +#define T4_ERR_MSN 0x18 /* MSN error */ +#define T4_ERR_TBIT 0x19 /* tag bit not set correctly */ +#define T4_ERR_MO 0x1A /* MO not 0 for TERMINATE */ + /* or READ_REQ */ +#define T4_ERR_MSN_GAP 0x1B +#define T4_ERR_MSN_RANGE 0x1C +#define T4_ERR_IRD_OVERFLOW 0x1D +#define T4_ERR_RQE_ADDR_BOUND 0x1E /* RQE addr out of bounds: */ + /* software error */ +#define T4_ERR_INTERNAL_ERR 0x1F /* internal error (opcode */ + /* mismatch) */ +/* + * CQE defs + */ +struct t4_cqe { + __be32 header; + __be32 len; + union { + struct { + __be32 stag; + __be32 msn; + } rcqe; + struct { + u32 nada1; + u16 nada2; + u16 cidx; + } scqe; + struct { + __be32 wrid_hi; + __be32 wrid_low; + } gen; + } u; + __be64 reserved; + __be64 bits_type_ts; +}; + +/* macros for flit 0 of the cqe */ + +#define S_CQE_QPID 12 +#define M_CQE_QPID 0xFFFFF +#define G_CQE_QPID(x) ((((x) >> S_CQE_QPID)) & M_CQE_QPID) +#define V_CQE_QPID(x) ((x)<> S_CQE_SWCQE)) & M_CQE_SWCQE) +#define V_CQE_SWCQE(x) ((x)<> S_CQE_STATUS)) & M_CQE_STATUS) +#define V_CQE_STATUS(x) ((x)<> S_CQE_TYPE)) & M_CQE_TYPE) +#define V_CQE_TYPE(x) ((x)<> S_CQE_OPCODE)) & M_CQE_OPCODE) +#define V_CQE_OPCODE(x) ((x)<header))) +#define CQE_QPID(x) (G_CQE_QPID(be32_to_cpu((x)->header))) +#define CQE_TYPE(x) (G_CQE_TYPE(be32_to_cpu((x)->header))) +#define SQ_TYPE(x) (CQE_TYPE((x))) +#define RQ_TYPE(x) (!CQE_TYPE((x))) +#define CQE_STATUS(x) (G_CQE_STATUS(be32_to_cpu((x)->header))) +#define CQE_OPCODE(x) (G_CQE_OPCODE(be32_to_cpu((x)->header))) + +#define CQE_SEND_OPCODE(x)( \ + (G_CQE_OPCODE(be32_to_cpu((x)->header)) == FW_RI_SEND) || \ + (G_CQE_OPCODE(be32_to_cpu((x)->header)) == FW_RI_SEND_WITH_SE) || \ + (G_CQE_OPCODE(be32_to_cpu((x)->header)) == FW_RI_SEND_WITH_INV) || \ + (G_CQE_OPCODE(be32_to_cpu((x)->header)) == FW_RI_SEND_WITH_SE_INV)) + +#define CQE_LEN(x) (be32_to_cpu((x)->len)) + +/* used for RQ completion processing */ +#define CQE_WRID_STAG(x) (be32_to_cpu((x)->u.rcqe.stag)) +#define CQE_WRID_MSN(x) (be32_to_cpu((x)->u.rcqe.msn)) + +/* used for SQ completion processing */ +#define CQE_WRID_SQ_IDX(x) ((x)->u.scqe.cidx) + +/* generic accessor macros */ +#define CQE_WRID_HI(x) (be32_to_cpu((x)->u.gen.wrid_hi)) +#define CQE_WRID_LOW(x) (be32_to_cpu((x)->u.gen.wrid_low)) + +/* macros for flit 3 of the cqe */ +#define S_CQE_GENBIT 63 +#define M_CQE_GENBIT 0x1 +#define G_CQE_GENBIT(x) (((x) >> S_CQE_GENBIT) & M_CQE_GENBIT) +#define V_CQE_GENBIT(x) ((x)<> S_CQE_OVFBIT)) & M_CQE_OVFBIT) + +#define S_CQE_IQTYPE 60 +#define M_CQE_IQTYPE 0x3 +#define G_CQE_IQTYPE(x) ((((x) >> S_CQE_IQTYPE)) & M_CQE_IQTYPE) + +#define M_CQE_TS 0x0fffffffffffffffULL +#define G_CQE_TS(x) ((x) & M_CQE_TS) + +#define CQE_OVFBIT(x) ((unsigned)G_CQE_OVFBIT(be64_to_cpu((x)->bits_type_ts))) +#define CQE_GENBIT(x) ((unsigned)G_CQE_GENBIT(be64_to_cpu((x)->bits_type_ts))) +#define CQE_TS(x) (G_CQE_TS(be64_to_cpu((x)->bits_type_ts))) + +struct t4_swsqe { + u64 wr_id; + struct t4_cqe cqe; + int read_len; + int opcode; + int complete; + int signaled; + u16 idx; + int flushed; + struct timespec host_ts; + u64 sge_ts; +}; + +static inline pgprot_t t4_pgprot_wc(pgprot_t prot) +{ +#if defined(__i386__) || defined(__x86_64__) || defined(CONFIG_PPC64) + return pgprot_writecombine(prot); +#else + return pgprot_noncached(prot); +#endif +} + +enum { + T4_SQ_ONCHIP = (1<<0), +}; + +struct t4_sq { + union t4_wr *queue; + dma_addr_t dma_addr; + DEFINE_DMA_UNMAP_ADDR(mapping); + unsigned long phys_addr; + struct t4_swsqe *sw_sq; + struct t4_swsqe *oldest_read; + u64 __iomem *udb; + size_t memsize; + u32 qid; + u16 in_use; + u16 size; + u16 cidx; + u16 pidx; + u16 wq_pidx; + u16 wq_pidx_inc; + u16 flags; + short flush_cidx; +}; + +struct t4_swrqe { + u64 wr_id; + struct timespec host_ts; + u64 sge_ts; +}; + +struct t4_rq { + union t4_recv_wr *queue; + dma_addr_t dma_addr; + DEFINE_DMA_UNMAP_ADDR(mapping); + struct t4_swrqe *sw_rq; + u64 __iomem *udb; + size_t memsize; + u32 qid; + u32 msn; + u32 rqt_hwaddr; + u16 rqt_size; + u16 in_use; + u16 size; + u16 cidx; + u16 pidx; + u16 wq_pidx; + u16 wq_pidx_inc; +}; + +struct t4_wq { + struct t4_sq sq; + struct t4_rq rq; + void __iomem *db; + void __iomem *gts; + struct c4iw_rdev *rdev; + int flushed; +}; + +static inline int t4_rqes_posted(struct t4_wq *wq) +{ + return wq->rq.in_use; +} + +static inline int t4_rq_empty(struct t4_wq *wq) +{ + return wq->rq.in_use == 0; +} + +static inline int t4_rq_full(struct t4_wq *wq) +{ + return wq->rq.in_use == (wq->rq.size - 1); +} + +static inline u32 t4_rq_avail(struct t4_wq *wq) +{ + return wq->rq.size - 1 - wq->rq.in_use; +} + +static inline void t4_rq_produce(struct t4_wq *wq, u8 len16) +{ + wq->rq.in_use++; + if (++wq->rq.pidx == wq->rq.size) + wq->rq.pidx = 0; + wq->rq.wq_pidx += DIV_ROUND_UP(len16*16, T4_EQ_ENTRY_SIZE); + if (wq->rq.wq_pidx >= wq->rq.size * T4_RQ_NUM_SLOTS) + wq->rq.wq_pidx %= wq->rq.size * T4_RQ_NUM_SLOTS; +} + +static inline void t4_rq_consume(struct t4_wq *wq) +{ + wq->rq.in_use--; + wq->rq.msn++; + if (++wq->rq.cidx == wq->rq.size) + wq->rq.cidx = 0; +} + +static inline u16 t4_rq_host_wq_pidx(struct t4_wq *wq) +{ + return wq->rq.queue[wq->rq.size].status.host_wq_pidx; +} + +static inline u16 t4_rq_wq_size(struct t4_wq *wq) +{ + return wq->rq.size * T4_RQ_NUM_SLOTS; +} + +static inline int t4_sq_onchip(struct t4_sq *sq) +{ + return sq->flags & T4_SQ_ONCHIP; +} + +static inline int t4_sq_empty(struct t4_wq *wq) +{ + return wq->sq.in_use == 0; +} + +static inline int t4_sq_full(struct t4_wq *wq) +{ + return wq->sq.in_use == (wq->sq.size - 1); +} + +static inline u32 t4_sq_avail(struct t4_wq *wq) +{ + return wq->sq.size - 1 - wq->sq.in_use; +} + +static inline void t4_sq_produce(struct t4_wq *wq, u8 len16) +{ + wq->sq.in_use++; + if (++wq->sq.pidx == wq->sq.size) + wq->sq.pidx = 0; + wq->sq.wq_pidx += DIV_ROUND_UP(len16*16, T4_EQ_ENTRY_SIZE); + if (wq->sq.wq_pidx >= wq->sq.size * T4_SQ_NUM_SLOTS) + wq->sq.wq_pidx %= wq->sq.size * T4_SQ_NUM_SLOTS; +} + +static inline void t4_sq_consume(struct t4_wq *wq) +{ + BUG_ON(wq->sq.in_use < 1); + if (wq->sq.cidx == wq->sq.flush_cidx) + wq->sq.flush_cidx = -1; + wq->sq.in_use--; + if (++wq->sq.cidx == wq->sq.size) + wq->sq.cidx = 0; +} + +static inline u16 t4_sq_host_wq_pidx(struct t4_wq *wq) +{ + return wq->sq.queue[wq->sq.size].status.host_wq_pidx; +} + +static inline u16 t4_sq_wq_size(struct t4_wq *wq) +{ + return wq->sq.size * T4_SQ_NUM_SLOTS; +} + +/* This function copies 64 byte coalesced work request to memory + * mapped BAR2 space. For coalesced WRs, the SGE fetches data + * from the FIFO instead of from Host. + */ +static inline void pio_copy(u64 __iomem *dst, u64 *src) +{ + int count = 8; + + while (count) { + writeq(*src, dst); + src++; + dst++; + count--; + } +} + +static inline void t4_ring_sq_db(struct t4_wq *wq, u16 inc, u8 t5, + union t4_wr *wqe) +{ + + /* Flush host queue memory writes. */ + wmb(); + if (t5) { + if (inc == 1 && wqe) { + PDBG("%s: WC wq->sq.pidx = %d\n", + __func__, wq->sq.pidx); + pio_copy(wq->sq.udb + 7, (void *)wqe); + } else { + PDBG("%s: DB wq->sq.pidx = %d\n", + __func__, wq->sq.pidx); + writel(PIDX_T5(inc), wq->sq.udb); + } + + /* Flush user doorbell area writes. */ + wmb(); + return; + } + writel(QID(wq->sq.qid) | PIDX(inc), wq->db); +} + +static inline void t4_ring_rq_db(struct t4_wq *wq, u16 inc, u8 t5, + union t4_recv_wr *wqe) +{ + + /* Flush host queue memory writes. */ + wmb(); + if (t5) { + if (inc == 1 && wqe) { + PDBG("%s: WC wq->rq.pidx = %d\n", + __func__, wq->rq.pidx); + pio_copy(wq->rq.udb + 7, (void *)wqe); + } else { + PDBG("%s: DB wq->rq.pidx = %d\n", + __func__, wq->rq.pidx); + writel(PIDX_T5(inc), wq->rq.udb); + } + + /* Flush user doorbell area writes. */ + wmb(); + return; + } + writel(QID(wq->rq.qid) | PIDX(inc), wq->db); +} + +static inline int t4_wq_in_error(struct t4_wq *wq) +{ + return wq->rq.queue[wq->rq.size].status.qp_err; +} + +static inline void t4_set_wq_in_error(struct t4_wq *wq) +{ + wq->rq.queue[wq->rq.size].status.qp_err = 1; +} + +static inline void t4_disable_wq_db(struct t4_wq *wq) +{ + wq->rq.queue[wq->rq.size].status.db_off = 1; +} + +static inline void t4_enable_wq_db(struct t4_wq *wq) +{ + wq->rq.queue[wq->rq.size].status.db_off = 0; +} + +static inline int t4_wq_db_enabled(struct t4_wq *wq) +{ + return !wq->rq.queue[wq->rq.size].status.db_off; +} + +enum t4_cq_flags { + CQ_ARMED = 1, +}; + +struct t4_cq { + struct t4_cqe *queue; + dma_addr_t dma_addr; + DEFINE_DMA_UNMAP_ADDR(mapping); + struct t4_cqe *sw_queue; + void __iomem *gts; + struct c4iw_rdev *rdev; + u64 ugts; + size_t memsize; + __be64 bits_type_ts; + u32 cqid; + int vector; + u16 size; /* including status page */ + u16 cidx; + u16 sw_pidx; + u16 sw_cidx; + u16 sw_in_use; + u16 cidx_inc; + u8 gen; + u8 error; + unsigned long flags; +}; + +static inline int t4_clear_cq_armed(struct t4_cq *cq) +{ + return test_and_clear_bit(CQ_ARMED, &cq->flags); +} + +static inline int t4_arm_cq(struct t4_cq *cq, int se) +{ + u32 val; + + set_bit(CQ_ARMED, &cq->flags); + while (cq->cidx_inc > CIDXINC_MASK) { + val = SEINTARM(0) | CIDXINC(CIDXINC_MASK) | TIMERREG(7) | + INGRESSQID(cq->cqid); + writel(val, cq->gts); + cq->cidx_inc -= CIDXINC_MASK; + } + val = SEINTARM(se) | CIDXINC(cq->cidx_inc) | TIMERREG(6) | + INGRESSQID(cq->cqid); + writel(val, cq->gts); + cq->cidx_inc = 0; + return 0; +} + +static inline void t4_swcq_produce(struct t4_cq *cq) +{ + cq->sw_in_use++; + if (cq->sw_in_use == cq->size) { + PDBG("%s cxgb4 sw cq overflow cqid %u\n", __func__, cq->cqid); + cq->error = 1; + BUG_ON(1); + } + if (++cq->sw_pidx == cq->size) + cq->sw_pidx = 0; +} + +static inline void t4_swcq_consume(struct t4_cq *cq) +{ + BUG_ON(cq->sw_in_use < 1); + cq->sw_in_use--; + if (++cq->sw_cidx == cq->size) + cq->sw_cidx = 0; +} + +static inline void t4_hwcq_consume(struct t4_cq *cq) +{ + cq->bits_type_ts = cq->queue[cq->cidx].bits_type_ts; + if (++cq->cidx_inc == (cq->size >> 4) || cq->cidx_inc == CIDXINC_MASK) { + u32 val; + + val = SEINTARM(0) | CIDXINC(cq->cidx_inc) | TIMERREG(7) | + INGRESSQID(cq->cqid); + writel(val, cq->gts); + cq->cidx_inc = 0; + } + if (++cq->cidx == cq->size) { + cq->cidx = 0; + cq->gen ^= 1; + } +} + +static inline int t4_valid_cqe(struct t4_cq *cq, struct t4_cqe *cqe) +{ + return (CQE_GENBIT(cqe) == cq->gen); +} + +static inline int t4_next_hw_cqe(struct t4_cq *cq, struct t4_cqe **cqe) +{ + int ret; + u16 prev_cidx; + + if (cq->cidx == 0) + prev_cidx = cq->size - 1; + else + prev_cidx = cq->cidx - 1; + + if (cq->queue[prev_cidx].bits_type_ts != cq->bits_type_ts) { + ret = -EOVERFLOW; + cq->error = 1; + printk(KERN_ERR MOD "cq overflow cqid %u\n", cq->cqid); + BUG_ON(1); + } else if (t4_valid_cqe(cq, &cq->queue[cq->cidx])) { + + /* Ensure CQE is flushed to memory */ + rmb(); + *cqe = &cq->queue[cq->cidx]; + ret = 0; + } else + ret = -ENODATA; + return ret; +} + +static inline struct t4_cqe *t4_next_sw_cqe(struct t4_cq *cq) +{ + if (cq->sw_in_use == cq->size) { + PDBG("%s cxgb4 sw cq overflow cqid %u\n", __func__, cq->cqid); + cq->error = 1; + BUG_ON(1); + return NULL; + } + if (cq->sw_in_use) + return &cq->sw_queue[cq->sw_cidx]; + return NULL; +} + +static inline int t4_next_cqe(struct t4_cq *cq, struct t4_cqe **cqe) +{ + int ret = 0; + + if (cq->error) + ret = -ENODATA; + else if (cq->sw_in_use) + *cqe = &cq->sw_queue[cq->sw_cidx]; + else + ret = t4_next_hw_cqe(cq, cqe); + return ret; +} + +static inline int t4_cq_in_error(struct t4_cq *cq) +{ + return ((struct t4_status_page *)&cq->queue[cq->size])->qp_err; +} + +static inline void t4_set_cq_in_error(struct t4_cq *cq) +{ + ((struct t4_status_page *)&cq->queue[cq->size])->qp_err = 1; +} +#endif + +struct t4_dev_status_page { + u8 db_off; +}; diff --git a/drivers/infiniband/hw/cxgb4/t4fw_ri_api.h b/drivers/infiniband/hw/cxgb4/t4fw_ri_api.h new file mode 100644 index 0000000..5709e77 --- /dev/null +++ b/drivers/infiniband/hw/cxgb4/t4fw_ri_api.h @@ -0,0 +1,853 @@ +/* + * Copyright (c) 2009-2010 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef _T4FW_RI_API_H_ +#define _T4FW_RI_API_H_ + +#include "t4fw_api.h" + +enum fw_ri_wr_opcode { + FW_RI_RDMA_WRITE = 0x0, /* IETF RDMAP v1.0 ... */ + FW_RI_READ_REQ = 0x1, + FW_RI_READ_RESP = 0x2, + FW_RI_SEND = 0x3, + FW_RI_SEND_WITH_INV = 0x4, + FW_RI_SEND_WITH_SE = 0x5, + FW_RI_SEND_WITH_SE_INV = 0x6, + FW_RI_TERMINATE = 0x7, + FW_RI_RDMA_INIT = 0x8, /* CHELSIO RI specific ... */ + FW_RI_BIND_MW = 0x9, + FW_RI_FAST_REGISTER = 0xa, + FW_RI_LOCAL_INV = 0xb, + FW_RI_QP_MODIFY = 0xc, + FW_RI_BYPASS = 0xd, + FW_RI_RECEIVE = 0xe, + + FW_RI_SGE_EC_CR_RETURN = 0xf +}; + +enum fw_ri_wr_flags { + FW_RI_COMPLETION_FLAG = 0x01, + FW_RI_NOTIFICATION_FLAG = 0x02, + FW_RI_SOLICITED_EVENT_FLAG = 0x04, + FW_RI_READ_FENCE_FLAG = 0x08, + FW_RI_LOCAL_FENCE_FLAG = 0x10, + FW_RI_RDMA_READ_INVALIDATE = 0x20 +}; + +enum fw_ri_mpa_attrs { + FW_RI_MPA_RX_MARKER_ENABLE = 0x01, + FW_RI_MPA_TX_MARKER_ENABLE = 0x02, + FW_RI_MPA_CRC_ENABLE = 0x04, + FW_RI_MPA_IETF_ENABLE = 0x08 +}; + +enum fw_ri_qp_caps { + FW_RI_QP_RDMA_READ_ENABLE = 0x01, + FW_RI_QP_RDMA_WRITE_ENABLE = 0x02, + FW_RI_QP_BIND_ENABLE = 0x04, + FW_RI_QP_FAST_REGISTER_ENABLE = 0x08, + FW_RI_QP_STAG0_ENABLE = 0x10 +}; + +enum fw_ri_addr_type { + FW_RI_ZERO_BASED_TO = 0x00, + FW_RI_VA_BASED_TO = 0x01 +}; + +enum fw_ri_mem_perms { + FW_RI_MEM_ACCESS_REM_WRITE = 0x01, + FW_RI_MEM_ACCESS_REM_READ = 0x02, + FW_RI_MEM_ACCESS_REM = 0x03, + FW_RI_MEM_ACCESS_LOCAL_WRITE = 0x04, + FW_RI_MEM_ACCESS_LOCAL_READ = 0x08, + FW_RI_MEM_ACCESS_LOCAL = 0x0C +}; + +enum fw_ri_stag_type { + FW_RI_STAG_NSMR = 0x00, + FW_RI_STAG_SMR = 0x01, + FW_RI_STAG_MW = 0x02, + FW_RI_STAG_MW_RELAXED = 0x03 +}; + +enum fw_ri_data_op { + FW_RI_DATA_IMMD = 0x81, + FW_RI_DATA_DSGL = 0x82, + FW_RI_DATA_ISGL = 0x83 +}; + +enum fw_ri_sgl_depth { + FW_RI_SGL_DEPTH_MAX_SQ = 16, + FW_RI_SGL_DEPTH_MAX_RQ = 4 +}; + +struct fw_ri_dsge_pair { + __be32 len[2]; + __be64 addr[2]; +}; + +struct fw_ri_dsgl { + __u8 op; + __u8 r1; + __be16 nsge; + __be32 len0; + __be64 addr0; +#ifndef C99_NOT_SUPPORTED + struct fw_ri_dsge_pair sge[0]; +#endif +}; + +struct fw_ri_sge { + __be32 stag; + __be32 len; + __be64 to; +}; + +struct fw_ri_isgl { + __u8 op; + __u8 r1; + __be16 nsge; + __be32 r2; +#ifndef C99_NOT_SUPPORTED + struct fw_ri_sge sge[0]; +#endif +}; + +struct fw_ri_immd { + __u8 op; + __u8 r1; + __be16 r2; + __be32 immdlen; +#ifndef C99_NOT_SUPPORTED + __u8 data[0]; +#endif +}; + +struct fw_ri_tpte { + __be32 valid_to_pdid; + __be32 locread_to_qpid; + __be32 nosnoop_pbladdr; + __be32 len_lo; + __be32 va_hi; + __be32 va_lo_fbo; + __be32 dca_mwbcnt_pstag; + __be32 len_hi; +}; + +#define S_FW_RI_TPTE_VALID 31 +#define M_FW_RI_TPTE_VALID 0x1 +#define V_FW_RI_TPTE_VALID(x) ((x) << S_FW_RI_TPTE_VALID) +#define G_FW_RI_TPTE_VALID(x) \ + (((x) >> S_FW_RI_TPTE_VALID) & M_FW_RI_TPTE_VALID) +#define F_FW_RI_TPTE_VALID V_FW_RI_TPTE_VALID(1U) + +#define S_FW_RI_TPTE_STAGKEY 23 +#define M_FW_RI_TPTE_STAGKEY 0xff +#define V_FW_RI_TPTE_STAGKEY(x) ((x) << S_FW_RI_TPTE_STAGKEY) +#define G_FW_RI_TPTE_STAGKEY(x) \ + (((x) >> S_FW_RI_TPTE_STAGKEY) & M_FW_RI_TPTE_STAGKEY) + +#define S_FW_RI_TPTE_STAGSTATE 22 +#define M_FW_RI_TPTE_STAGSTATE 0x1 +#define V_FW_RI_TPTE_STAGSTATE(x) ((x) << S_FW_RI_TPTE_STAGSTATE) +#define G_FW_RI_TPTE_STAGSTATE(x) \ + (((x) >> S_FW_RI_TPTE_STAGSTATE) & M_FW_RI_TPTE_STAGSTATE) +#define F_FW_RI_TPTE_STAGSTATE V_FW_RI_TPTE_STAGSTATE(1U) + +#define S_FW_RI_TPTE_STAGTYPE 20 +#define M_FW_RI_TPTE_STAGTYPE 0x3 +#define V_FW_RI_TPTE_STAGTYPE(x) ((x) << S_FW_RI_TPTE_STAGTYPE) +#define G_FW_RI_TPTE_STAGTYPE(x) \ + (((x) >> S_FW_RI_TPTE_STAGTYPE) & M_FW_RI_TPTE_STAGTYPE) + +#define S_FW_RI_TPTE_PDID 0 +#define M_FW_RI_TPTE_PDID 0xfffff +#define V_FW_RI_TPTE_PDID(x) ((x) << S_FW_RI_TPTE_PDID) +#define G_FW_RI_TPTE_PDID(x) \ + (((x) >> S_FW_RI_TPTE_PDID) & M_FW_RI_TPTE_PDID) + +#define S_FW_RI_TPTE_PERM 28 +#define M_FW_RI_TPTE_PERM 0xf +#define V_FW_RI_TPTE_PERM(x) ((x) << S_FW_RI_TPTE_PERM) +#define G_FW_RI_TPTE_PERM(x) \ + (((x) >> S_FW_RI_TPTE_PERM) & M_FW_RI_TPTE_PERM) + +#define S_FW_RI_TPTE_REMINVDIS 27 +#define M_FW_RI_TPTE_REMINVDIS 0x1 +#define V_FW_RI_TPTE_REMINVDIS(x) ((x) << S_FW_RI_TPTE_REMINVDIS) +#define G_FW_RI_TPTE_REMINVDIS(x) \ + (((x) >> S_FW_RI_TPTE_REMINVDIS) & M_FW_RI_TPTE_REMINVDIS) +#define F_FW_RI_TPTE_REMINVDIS V_FW_RI_TPTE_REMINVDIS(1U) + +#define S_FW_RI_TPTE_ADDRTYPE 26 +#define M_FW_RI_TPTE_ADDRTYPE 1 +#define V_FW_RI_TPTE_ADDRTYPE(x) ((x) << S_FW_RI_TPTE_ADDRTYPE) +#define G_FW_RI_TPTE_ADDRTYPE(x) \ + (((x) >> S_FW_RI_TPTE_ADDRTYPE) & M_FW_RI_TPTE_ADDRTYPE) +#define F_FW_RI_TPTE_ADDRTYPE V_FW_RI_TPTE_ADDRTYPE(1U) + +#define S_FW_RI_TPTE_MWBINDEN 25 +#define M_FW_RI_TPTE_MWBINDEN 0x1 +#define V_FW_RI_TPTE_MWBINDEN(x) ((x) << S_FW_RI_TPTE_MWBINDEN) +#define G_FW_RI_TPTE_MWBINDEN(x) \ + (((x) >> S_FW_RI_TPTE_MWBINDEN) & M_FW_RI_TPTE_MWBINDEN) +#define F_FW_RI_TPTE_MWBINDEN V_FW_RI_TPTE_MWBINDEN(1U) + +#define S_FW_RI_TPTE_PS 20 +#define M_FW_RI_TPTE_PS 0x1f +#define V_FW_RI_TPTE_PS(x) ((x) << S_FW_RI_TPTE_PS) +#define G_FW_RI_TPTE_PS(x) \ + (((x) >> S_FW_RI_TPTE_PS) & M_FW_RI_TPTE_PS) + +#define S_FW_RI_TPTE_QPID 0 +#define M_FW_RI_TPTE_QPID 0xfffff +#define V_FW_RI_TPTE_QPID(x) ((x) << S_FW_RI_TPTE_QPID) +#define G_FW_RI_TPTE_QPID(x) \ + (((x) >> S_FW_RI_TPTE_QPID) & M_FW_RI_TPTE_QPID) + +#define S_FW_RI_TPTE_NOSNOOP 30 +#define M_FW_RI_TPTE_NOSNOOP 0x1 +#define V_FW_RI_TPTE_NOSNOOP(x) ((x) << S_FW_RI_TPTE_NOSNOOP) +#define G_FW_RI_TPTE_NOSNOOP(x) \ + (((x) >> S_FW_RI_TPTE_NOSNOOP) & M_FW_RI_TPTE_NOSNOOP) +#define F_FW_RI_TPTE_NOSNOOP V_FW_RI_TPTE_NOSNOOP(1U) + +#define S_FW_RI_TPTE_PBLADDR 0 +#define M_FW_RI_TPTE_PBLADDR 0x1fffffff +#define V_FW_RI_TPTE_PBLADDR(x) ((x) << S_FW_RI_TPTE_PBLADDR) +#define G_FW_RI_TPTE_PBLADDR(x) \ + (((x) >> S_FW_RI_TPTE_PBLADDR) & M_FW_RI_TPTE_PBLADDR) + +#define S_FW_RI_TPTE_DCA 24 +#define M_FW_RI_TPTE_DCA 0x1f +#define V_FW_RI_TPTE_DCA(x) ((x) << S_FW_RI_TPTE_DCA) +#define G_FW_RI_TPTE_DCA(x) \ + (((x) >> S_FW_RI_TPTE_DCA) & M_FW_RI_TPTE_DCA) + +#define S_FW_RI_TPTE_MWBCNT_PSTAG 0 +#define M_FW_RI_TPTE_MWBCNT_PSTAG 0xffffff +#define V_FW_RI_TPTE_MWBCNT_PSTAT(x) \ + ((x) << S_FW_RI_TPTE_MWBCNT_PSTAG) +#define G_FW_RI_TPTE_MWBCNT_PSTAG(x) \ + (((x) >> S_FW_RI_TPTE_MWBCNT_PSTAG) & M_FW_RI_TPTE_MWBCNT_PSTAG) + +enum fw_ri_res_type { + FW_RI_RES_TYPE_SQ, + FW_RI_RES_TYPE_RQ, + FW_RI_RES_TYPE_CQ, +}; + +enum fw_ri_res_op { + FW_RI_RES_OP_WRITE, + FW_RI_RES_OP_RESET, +}; + +struct fw_ri_res { + union fw_ri_restype { + struct fw_ri_res_sqrq { + __u8 restype; + __u8 op; + __be16 r3; + __be32 eqid; + __be32 r4[2]; + __be32 fetchszm_to_iqid; + __be32 dcaen_to_eqsize; + __be64 eqaddr; + } sqrq; + struct fw_ri_res_cq { + __u8 restype; + __u8 op; + __be16 r3; + __be32 iqid; + __be32 r4[2]; + __be32 iqandst_to_iqandstindex; + __be16 iqdroprss_to_iqesize; + __be16 iqsize; + __be64 iqaddr; + __be32 iqns_iqro; + __be32 r6_lo; + __be64 r7; + } cq; + } u; +}; + +struct fw_ri_res_wr { + __be32 op_nres; + __be32 len16_pkd; + __u64 cookie; +#ifndef C99_NOT_SUPPORTED + struct fw_ri_res res[0]; +#endif +}; + +#define S_FW_RI_RES_WR_NRES 0 +#define M_FW_RI_RES_WR_NRES 0xff +#define V_FW_RI_RES_WR_NRES(x) ((x) << S_FW_RI_RES_WR_NRES) +#define G_FW_RI_RES_WR_NRES(x) \ + (((x) >> S_FW_RI_RES_WR_NRES) & M_FW_RI_RES_WR_NRES) + +#define S_FW_RI_RES_WR_FETCHSZM 26 +#define M_FW_RI_RES_WR_FETCHSZM 0x1 +#define V_FW_RI_RES_WR_FETCHSZM(x) ((x) << S_FW_RI_RES_WR_FETCHSZM) +#define G_FW_RI_RES_WR_FETCHSZM(x) \ + (((x) >> S_FW_RI_RES_WR_FETCHSZM) & M_FW_RI_RES_WR_FETCHSZM) +#define F_FW_RI_RES_WR_FETCHSZM V_FW_RI_RES_WR_FETCHSZM(1U) + +#define S_FW_RI_RES_WR_STATUSPGNS 25 +#define M_FW_RI_RES_WR_STATUSPGNS 0x1 +#define V_FW_RI_RES_WR_STATUSPGNS(x) ((x) << S_FW_RI_RES_WR_STATUSPGNS) +#define G_FW_RI_RES_WR_STATUSPGNS(x) \ + (((x) >> S_FW_RI_RES_WR_STATUSPGNS) & M_FW_RI_RES_WR_STATUSPGNS) +#define F_FW_RI_RES_WR_STATUSPGNS V_FW_RI_RES_WR_STATUSPGNS(1U) + +#define S_FW_RI_RES_WR_STATUSPGRO 24 +#define M_FW_RI_RES_WR_STATUSPGRO 0x1 +#define V_FW_RI_RES_WR_STATUSPGRO(x) ((x) << S_FW_RI_RES_WR_STATUSPGRO) +#define G_FW_RI_RES_WR_STATUSPGRO(x) \ + (((x) >> S_FW_RI_RES_WR_STATUSPGRO) & M_FW_RI_RES_WR_STATUSPGRO) +#define F_FW_RI_RES_WR_STATUSPGRO V_FW_RI_RES_WR_STATUSPGRO(1U) + +#define S_FW_RI_RES_WR_FETCHNS 23 +#define M_FW_RI_RES_WR_FETCHNS 0x1 +#define V_FW_RI_RES_WR_FETCHNS(x) ((x) << S_FW_RI_RES_WR_FETCHNS) +#define G_FW_RI_RES_WR_FETCHNS(x) \ + (((x) >> S_FW_RI_RES_WR_FETCHNS) & M_FW_RI_RES_WR_FETCHNS) +#define F_FW_RI_RES_WR_FETCHNS V_FW_RI_RES_WR_FETCHNS(1U) + +#define S_FW_RI_RES_WR_FETCHRO 22 +#define M_FW_RI_RES_WR_FETCHRO 0x1 +#define V_FW_RI_RES_WR_FETCHRO(x) ((x) << S_FW_RI_RES_WR_FETCHRO) +#define G_FW_RI_RES_WR_FETCHRO(x) \ + (((x) >> S_FW_RI_RES_WR_FETCHRO) & M_FW_RI_RES_WR_FETCHRO) +#define F_FW_RI_RES_WR_FETCHRO V_FW_RI_RES_WR_FETCHRO(1U) + +#define S_FW_RI_RES_WR_HOSTFCMODE 20 +#define M_FW_RI_RES_WR_HOSTFCMODE 0x3 +#define V_FW_RI_RES_WR_HOSTFCMODE(x) ((x) << S_FW_RI_RES_WR_HOSTFCMODE) +#define G_FW_RI_RES_WR_HOSTFCMODE(x) \ + (((x) >> S_FW_RI_RES_WR_HOSTFCMODE) & M_FW_RI_RES_WR_HOSTFCMODE) + +#define S_FW_RI_RES_WR_CPRIO 19 +#define M_FW_RI_RES_WR_CPRIO 0x1 +#define V_FW_RI_RES_WR_CPRIO(x) ((x) << S_FW_RI_RES_WR_CPRIO) +#define G_FW_RI_RES_WR_CPRIO(x) \ + (((x) >> S_FW_RI_RES_WR_CPRIO) & M_FW_RI_RES_WR_CPRIO) +#define F_FW_RI_RES_WR_CPRIO V_FW_RI_RES_WR_CPRIO(1U) + +#define S_FW_RI_RES_WR_ONCHIP 18 +#define M_FW_RI_RES_WR_ONCHIP 0x1 +#define V_FW_RI_RES_WR_ONCHIP(x) ((x) << S_FW_RI_RES_WR_ONCHIP) +#define G_FW_RI_RES_WR_ONCHIP(x) \ + (((x) >> S_FW_RI_RES_WR_ONCHIP) & M_FW_RI_RES_WR_ONCHIP) +#define F_FW_RI_RES_WR_ONCHIP V_FW_RI_RES_WR_ONCHIP(1U) + +#define S_FW_RI_RES_WR_PCIECHN 16 +#define M_FW_RI_RES_WR_PCIECHN 0x3 +#define V_FW_RI_RES_WR_PCIECHN(x) ((x) << S_FW_RI_RES_WR_PCIECHN) +#define G_FW_RI_RES_WR_PCIECHN(x) \ + (((x) >> S_FW_RI_RES_WR_PCIECHN) & M_FW_RI_RES_WR_PCIECHN) + +#define S_FW_RI_RES_WR_IQID 0 +#define M_FW_RI_RES_WR_IQID 0xffff +#define V_FW_RI_RES_WR_IQID(x) ((x) << S_FW_RI_RES_WR_IQID) +#define G_FW_RI_RES_WR_IQID(x) \ + (((x) >> S_FW_RI_RES_WR_IQID) & M_FW_RI_RES_WR_IQID) + +#define S_FW_RI_RES_WR_DCAEN 31 +#define M_FW_RI_RES_WR_DCAEN 0x1 +#define V_FW_RI_RES_WR_DCAEN(x) ((x) << S_FW_RI_RES_WR_DCAEN) +#define G_FW_RI_RES_WR_DCAEN(x) \ + (((x) >> S_FW_RI_RES_WR_DCAEN) & M_FW_RI_RES_WR_DCAEN) +#define F_FW_RI_RES_WR_DCAEN V_FW_RI_RES_WR_DCAEN(1U) + +#define S_FW_RI_RES_WR_DCACPU 26 +#define M_FW_RI_RES_WR_DCACPU 0x1f +#define V_FW_RI_RES_WR_DCACPU(x) ((x) << S_FW_RI_RES_WR_DCACPU) +#define G_FW_RI_RES_WR_DCACPU(x) \ + (((x) >> S_FW_RI_RES_WR_DCACPU) & M_FW_RI_RES_WR_DCACPU) + +#define S_FW_RI_RES_WR_FBMIN 23 +#define M_FW_RI_RES_WR_FBMIN 0x7 +#define V_FW_RI_RES_WR_FBMIN(x) ((x) << S_FW_RI_RES_WR_FBMIN) +#define G_FW_RI_RES_WR_FBMIN(x) \ + (((x) >> S_FW_RI_RES_WR_FBMIN) & M_FW_RI_RES_WR_FBMIN) + +#define S_FW_RI_RES_WR_FBMAX 20 +#define M_FW_RI_RES_WR_FBMAX 0x7 +#define V_FW_RI_RES_WR_FBMAX(x) ((x) << S_FW_RI_RES_WR_FBMAX) +#define G_FW_RI_RES_WR_FBMAX(x) \ + (((x) >> S_FW_RI_RES_WR_FBMAX) & M_FW_RI_RES_WR_FBMAX) + +#define S_FW_RI_RES_WR_CIDXFTHRESHO 19 +#define M_FW_RI_RES_WR_CIDXFTHRESHO 0x1 +#define V_FW_RI_RES_WR_CIDXFTHRESHO(x) ((x) << S_FW_RI_RES_WR_CIDXFTHRESHO) +#define G_FW_RI_RES_WR_CIDXFTHRESHO(x) \ + (((x) >> S_FW_RI_RES_WR_CIDXFTHRESHO) & M_FW_RI_RES_WR_CIDXFTHRESHO) +#define F_FW_RI_RES_WR_CIDXFTHRESHO V_FW_RI_RES_WR_CIDXFTHRESHO(1U) + +#define S_FW_RI_RES_WR_CIDXFTHRESH 16 +#define M_FW_RI_RES_WR_CIDXFTHRESH 0x7 +#define V_FW_RI_RES_WR_CIDXFTHRESH(x) ((x) << S_FW_RI_RES_WR_CIDXFTHRESH) +#define G_FW_RI_RES_WR_CIDXFTHRESH(x) \ + (((x) >> S_FW_RI_RES_WR_CIDXFTHRESH) & M_FW_RI_RES_WR_CIDXFTHRESH) + +#define S_FW_RI_RES_WR_EQSIZE 0 +#define M_FW_RI_RES_WR_EQSIZE 0xffff +#define V_FW_RI_RES_WR_EQSIZE(x) ((x) << S_FW_RI_RES_WR_EQSIZE) +#define G_FW_RI_RES_WR_EQSIZE(x) \ + (((x) >> S_FW_RI_RES_WR_EQSIZE) & M_FW_RI_RES_WR_EQSIZE) + +#define S_FW_RI_RES_WR_IQANDST 15 +#define M_FW_RI_RES_WR_IQANDST 0x1 +#define V_FW_RI_RES_WR_IQANDST(x) ((x) << S_FW_RI_RES_WR_IQANDST) +#define G_FW_RI_RES_WR_IQANDST(x) \ + (((x) >> S_FW_RI_RES_WR_IQANDST) & M_FW_RI_RES_WR_IQANDST) +#define F_FW_RI_RES_WR_IQANDST V_FW_RI_RES_WR_IQANDST(1U) + +#define S_FW_RI_RES_WR_IQANUS 14 +#define M_FW_RI_RES_WR_IQANUS 0x1 +#define V_FW_RI_RES_WR_IQANUS(x) ((x) << S_FW_RI_RES_WR_IQANUS) +#define G_FW_RI_RES_WR_IQANUS(x) \ + (((x) >> S_FW_RI_RES_WR_IQANUS) & M_FW_RI_RES_WR_IQANUS) +#define F_FW_RI_RES_WR_IQANUS V_FW_RI_RES_WR_IQANUS(1U) + +#define S_FW_RI_RES_WR_IQANUD 12 +#define M_FW_RI_RES_WR_IQANUD 0x3 +#define V_FW_RI_RES_WR_IQANUD(x) ((x) << S_FW_RI_RES_WR_IQANUD) +#define G_FW_RI_RES_WR_IQANUD(x) \ + (((x) >> S_FW_RI_RES_WR_IQANUD) & M_FW_RI_RES_WR_IQANUD) + +#define S_FW_RI_RES_WR_IQANDSTINDEX 0 +#define M_FW_RI_RES_WR_IQANDSTINDEX 0xfff +#define V_FW_RI_RES_WR_IQANDSTINDEX(x) ((x) << S_FW_RI_RES_WR_IQANDSTINDEX) +#define G_FW_RI_RES_WR_IQANDSTINDEX(x) \ + (((x) >> S_FW_RI_RES_WR_IQANDSTINDEX) & M_FW_RI_RES_WR_IQANDSTINDEX) + +#define S_FW_RI_RES_WR_IQDROPRSS 15 +#define M_FW_RI_RES_WR_IQDROPRSS 0x1 +#define V_FW_RI_RES_WR_IQDROPRSS(x) ((x) << S_FW_RI_RES_WR_IQDROPRSS) +#define G_FW_RI_RES_WR_IQDROPRSS(x) \ + (((x) >> S_FW_RI_RES_WR_IQDROPRSS) & M_FW_RI_RES_WR_IQDROPRSS) +#define F_FW_RI_RES_WR_IQDROPRSS V_FW_RI_RES_WR_IQDROPRSS(1U) + +#define S_FW_RI_RES_WR_IQGTSMODE 14 +#define M_FW_RI_RES_WR_IQGTSMODE 0x1 +#define V_FW_RI_RES_WR_IQGTSMODE(x) ((x) << S_FW_RI_RES_WR_IQGTSMODE) +#define G_FW_RI_RES_WR_IQGTSMODE(x) \ + (((x) >> S_FW_RI_RES_WR_IQGTSMODE) & M_FW_RI_RES_WR_IQGTSMODE) +#define F_FW_RI_RES_WR_IQGTSMODE V_FW_RI_RES_WR_IQGTSMODE(1U) + +#define S_FW_RI_RES_WR_IQPCIECH 12 +#define M_FW_RI_RES_WR_IQPCIECH 0x3 +#define V_FW_RI_RES_WR_IQPCIECH(x) ((x) << S_FW_RI_RES_WR_IQPCIECH) +#define G_FW_RI_RES_WR_IQPCIECH(x) \ + (((x) >> S_FW_RI_RES_WR_IQPCIECH) & M_FW_RI_RES_WR_IQPCIECH) + +#define S_FW_RI_RES_WR_IQDCAEN 11 +#define M_FW_RI_RES_WR_IQDCAEN 0x1 +#define V_FW_RI_RES_WR_IQDCAEN(x) ((x) << S_FW_RI_RES_WR_IQDCAEN) +#define G_FW_RI_RES_WR_IQDCAEN(x) \ + (((x) >> S_FW_RI_RES_WR_IQDCAEN) & M_FW_RI_RES_WR_IQDCAEN) +#define F_FW_RI_RES_WR_IQDCAEN V_FW_RI_RES_WR_IQDCAEN(1U) + +#define S_FW_RI_RES_WR_IQDCACPU 6 +#define M_FW_RI_RES_WR_IQDCACPU 0x1f +#define V_FW_RI_RES_WR_IQDCACPU(x) ((x) << S_FW_RI_RES_WR_IQDCACPU) +#define G_FW_RI_RES_WR_IQDCACPU(x) \ + (((x) >> S_FW_RI_RES_WR_IQDCACPU) & M_FW_RI_RES_WR_IQDCACPU) + +#define S_FW_RI_RES_WR_IQINTCNTTHRESH 4 +#define M_FW_RI_RES_WR_IQINTCNTTHRESH 0x3 +#define V_FW_RI_RES_WR_IQINTCNTTHRESH(x) \ + ((x) << S_FW_RI_RES_WR_IQINTCNTTHRESH) +#define G_FW_RI_RES_WR_IQINTCNTTHRESH(x) \ + (((x) >> S_FW_RI_RES_WR_IQINTCNTTHRESH) & M_FW_RI_RES_WR_IQINTCNTTHRESH) + +#define S_FW_RI_RES_WR_IQO 3 +#define M_FW_RI_RES_WR_IQO 0x1 +#define V_FW_RI_RES_WR_IQO(x) ((x) << S_FW_RI_RES_WR_IQO) +#define G_FW_RI_RES_WR_IQO(x) \ + (((x) >> S_FW_RI_RES_WR_IQO) & M_FW_RI_RES_WR_IQO) +#define F_FW_RI_RES_WR_IQO V_FW_RI_RES_WR_IQO(1U) + +#define S_FW_RI_RES_WR_IQCPRIO 2 +#define M_FW_RI_RES_WR_IQCPRIO 0x1 +#define V_FW_RI_RES_WR_IQCPRIO(x) ((x) << S_FW_RI_RES_WR_IQCPRIO) +#define G_FW_RI_RES_WR_IQCPRIO(x) \ + (((x) >> S_FW_RI_RES_WR_IQCPRIO) & M_FW_RI_RES_WR_IQCPRIO) +#define F_FW_RI_RES_WR_IQCPRIO V_FW_RI_RES_WR_IQCPRIO(1U) + +#define S_FW_RI_RES_WR_IQESIZE 0 +#define M_FW_RI_RES_WR_IQESIZE 0x3 +#define V_FW_RI_RES_WR_IQESIZE(x) ((x) << S_FW_RI_RES_WR_IQESIZE) +#define G_FW_RI_RES_WR_IQESIZE(x) \ + (((x) >> S_FW_RI_RES_WR_IQESIZE) & M_FW_RI_RES_WR_IQESIZE) + +#define S_FW_RI_RES_WR_IQNS 31 +#define M_FW_RI_RES_WR_IQNS 0x1 +#define V_FW_RI_RES_WR_IQNS(x) ((x) << S_FW_RI_RES_WR_IQNS) +#define G_FW_RI_RES_WR_IQNS(x) \ + (((x) >> S_FW_RI_RES_WR_IQNS) & M_FW_RI_RES_WR_IQNS) +#define F_FW_RI_RES_WR_IQNS V_FW_RI_RES_WR_IQNS(1U) + +#define S_FW_RI_RES_WR_IQRO 30 +#define M_FW_RI_RES_WR_IQRO 0x1 +#define V_FW_RI_RES_WR_IQRO(x) ((x) << S_FW_RI_RES_WR_IQRO) +#define G_FW_RI_RES_WR_IQRO(x) \ + (((x) >> S_FW_RI_RES_WR_IQRO) & M_FW_RI_RES_WR_IQRO) +#define F_FW_RI_RES_WR_IQRO V_FW_RI_RES_WR_IQRO(1U) + +struct fw_ri_rdma_write_wr { + __u8 opcode; + __u8 flags; + __u16 wrid; + __u8 r1[3]; + __u8 len16; + __be64 r2; + __be32 plen; + __be32 stag_sink; + __be64 to_sink; +#ifndef C99_NOT_SUPPORTED + union { + struct fw_ri_immd immd_src[0]; + struct fw_ri_isgl isgl_src[0]; + } u; +#endif +}; + +struct fw_ri_send_wr { + __u8 opcode; + __u8 flags; + __u16 wrid; + __u8 r1[3]; + __u8 len16; + __be32 sendop_pkd; + __be32 stag_inv; + __be32 plen; + __be32 r3; + __be64 r4; +#ifndef C99_NOT_SUPPORTED + union { + struct fw_ri_immd immd_src[0]; + struct fw_ri_isgl isgl_src[0]; + } u; +#endif +}; + +#define S_FW_RI_SEND_WR_SENDOP 0 +#define M_FW_RI_SEND_WR_SENDOP 0xf +#define V_FW_RI_SEND_WR_SENDOP(x) ((x) << S_FW_RI_SEND_WR_SENDOP) +#define G_FW_RI_SEND_WR_SENDOP(x) \ + (((x) >> S_FW_RI_SEND_WR_SENDOP) & M_FW_RI_SEND_WR_SENDOP) + +struct fw_ri_rdma_read_wr { + __u8 opcode; + __u8 flags; + __u16 wrid; + __u8 r1[3]; + __u8 len16; + __be64 r2; + __be32 stag_sink; + __be32 to_sink_hi; + __be32 to_sink_lo; + __be32 plen; + __be32 stag_src; + __be32 to_src_hi; + __be32 to_src_lo; + __be32 r5; +}; + +struct fw_ri_recv_wr { + __u8 opcode; + __u8 r1; + __u16 wrid; + __u8 r2[3]; + __u8 len16; + struct fw_ri_isgl isgl; +}; + +struct fw_ri_bind_mw_wr { + __u8 opcode; + __u8 flags; + __u16 wrid; + __u8 r1[3]; + __u8 len16; + __u8 qpbinde_to_dcacpu; + __u8 pgsz_shift; + __u8 addr_type; + __u8 mem_perms; + __be32 stag_mr; + __be32 stag_mw; + __be32 r3; + __be64 len_mw; + __be64 va_fbo; + __be64 r4; +}; + +#define S_FW_RI_BIND_MW_WR_QPBINDE 6 +#define M_FW_RI_BIND_MW_WR_QPBINDE 0x1 +#define V_FW_RI_BIND_MW_WR_QPBINDE(x) ((x) << S_FW_RI_BIND_MW_WR_QPBINDE) +#define G_FW_RI_BIND_MW_WR_QPBINDE(x) \ + (((x) >> S_FW_RI_BIND_MW_WR_QPBINDE) & M_FW_RI_BIND_MW_WR_QPBINDE) +#define F_FW_RI_BIND_MW_WR_QPBINDE V_FW_RI_BIND_MW_WR_QPBINDE(1U) + +#define S_FW_RI_BIND_MW_WR_NS 5 +#define M_FW_RI_BIND_MW_WR_NS 0x1 +#define V_FW_RI_BIND_MW_WR_NS(x) ((x) << S_FW_RI_BIND_MW_WR_NS) +#define G_FW_RI_BIND_MW_WR_NS(x) \ + (((x) >> S_FW_RI_BIND_MW_WR_NS) & M_FW_RI_BIND_MW_WR_NS) +#define F_FW_RI_BIND_MW_WR_NS V_FW_RI_BIND_MW_WR_NS(1U) + +#define S_FW_RI_BIND_MW_WR_DCACPU 0 +#define M_FW_RI_BIND_MW_WR_DCACPU 0x1f +#define V_FW_RI_BIND_MW_WR_DCACPU(x) ((x) << S_FW_RI_BIND_MW_WR_DCACPU) +#define G_FW_RI_BIND_MW_WR_DCACPU(x) \ + (((x) >> S_FW_RI_BIND_MW_WR_DCACPU) & M_FW_RI_BIND_MW_WR_DCACPU) + +struct fw_ri_fr_nsmr_wr { + __u8 opcode; + __u8 flags; + __u16 wrid; + __u8 r1[3]; + __u8 len16; + __u8 qpbinde_to_dcacpu; + __u8 pgsz_shift; + __u8 addr_type; + __u8 mem_perms; + __be32 stag; + __be32 len_hi; + __be32 len_lo; + __be32 va_hi; + __be32 va_lo_fbo; +}; + +#define S_FW_RI_FR_NSMR_WR_QPBINDE 6 +#define M_FW_RI_FR_NSMR_WR_QPBINDE 0x1 +#define V_FW_RI_FR_NSMR_WR_QPBINDE(x) ((x) << S_FW_RI_FR_NSMR_WR_QPBINDE) +#define G_FW_RI_FR_NSMR_WR_QPBINDE(x) \ + (((x) >> S_FW_RI_FR_NSMR_WR_QPBINDE) & M_FW_RI_FR_NSMR_WR_QPBINDE) +#define F_FW_RI_FR_NSMR_WR_QPBINDE V_FW_RI_FR_NSMR_WR_QPBINDE(1U) + +#define S_FW_RI_FR_NSMR_WR_NS 5 +#define M_FW_RI_FR_NSMR_WR_NS 0x1 +#define V_FW_RI_FR_NSMR_WR_NS(x) ((x) << S_FW_RI_FR_NSMR_WR_NS) +#define G_FW_RI_FR_NSMR_WR_NS(x) \ + (((x) >> S_FW_RI_FR_NSMR_WR_NS) & M_FW_RI_FR_NSMR_WR_NS) +#define F_FW_RI_FR_NSMR_WR_NS V_FW_RI_FR_NSMR_WR_NS(1U) + +#define S_FW_RI_FR_NSMR_WR_DCACPU 0 +#define M_FW_RI_FR_NSMR_WR_DCACPU 0x1f +#define V_FW_RI_FR_NSMR_WR_DCACPU(x) ((x) << S_FW_RI_FR_NSMR_WR_DCACPU) +#define G_FW_RI_FR_NSMR_WR_DCACPU(x) \ + (((x) >> S_FW_RI_FR_NSMR_WR_DCACPU) & M_FW_RI_FR_NSMR_WR_DCACPU) + +struct fw_ri_inv_lstag_wr { + __u8 opcode; + __u8 flags; + __u16 wrid; + __u8 r1[3]; + __u8 len16; + __be32 r2; + __be32 stag_inv; +}; + +enum fw_ri_type { + FW_RI_TYPE_INIT, + FW_RI_TYPE_FINI, + FW_RI_TYPE_TERMINATE +}; + +enum fw_ri_init_p2ptype { + FW_RI_INIT_P2PTYPE_RDMA_WRITE = FW_RI_RDMA_WRITE, + FW_RI_INIT_P2PTYPE_READ_REQ = FW_RI_READ_REQ, + FW_RI_INIT_P2PTYPE_SEND = FW_RI_SEND, + FW_RI_INIT_P2PTYPE_SEND_WITH_INV = FW_RI_SEND_WITH_INV, + FW_RI_INIT_P2PTYPE_SEND_WITH_SE = FW_RI_SEND_WITH_SE, + FW_RI_INIT_P2PTYPE_SEND_WITH_SE_INV = FW_RI_SEND_WITH_SE_INV, + FW_RI_INIT_P2PTYPE_DISABLED = 0xf, +}; + +struct fw_ri_wr { + __be32 op_compl; + __be32 flowid_len16; + __u64 cookie; + union fw_ri { + struct fw_ri_init { + __u8 type; + __u8 mpareqbit_p2ptype; + __u8 r4[2]; + __u8 mpa_attrs; + __u8 qp_caps; + __be16 nrqe; + __be32 pdid; + __be32 qpid; + __be32 sq_eqid; + __be32 rq_eqid; + __be32 scqid; + __be32 rcqid; + __be32 ord_max; + __be32 ird_max; + __be32 iss; + __be32 irs; + __be32 hwrqsize; + __be32 hwrqaddr; + __be64 r5; + union fw_ri_init_p2p { + struct fw_ri_rdma_write_wr write; + struct fw_ri_rdma_read_wr read; + struct fw_ri_send_wr send; + } u; + } init; + struct fw_ri_fini { + __u8 type; + __u8 r3[7]; + __be64 r4; + } fini; + struct fw_ri_terminate { + __u8 type; + __u8 r3[3]; + __be32 immdlen; + __u8 termmsg[40]; + } terminate; + } u; +}; + +#define S_FW_RI_WR_MPAREQBIT 7 +#define M_FW_RI_WR_MPAREQBIT 0x1 +#define V_FW_RI_WR_MPAREQBIT(x) ((x) << S_FW_RI_WR_MPAREQBIT) +#define G_FW_RI_WR_MPAREQBIT(x) \ + (((x) >> S_FW_RI_WR_MPAREQBIT) & M_FW_RI_WR_MPAREQBIT) +#define F_FW_RI_WR_MPAREQBIT V_FW_RI_WR_MPAREQBIT(1U) + +#define S_FW_RI_WR_P2PTYPE 0 +#define M_FW_RI_WR_P2PTYPE 0xf +#define V_FW_RI_WR_P2PTYPE(x) ((x) << S_FW_RI_WR_P2PTYPE) +#define G_FW_RI_WR_P2PTYPE(x) \ + (((x) >> S_FW_RI_WR_P2PTYPE) & M_FW_RI_WR_P2PTYPE) + +struct tcp_options { + __be16 mss; + __u8 wsf; +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u8:4; + __u8 unknown:1; + __u8:1; + __u8 sack:1; + __u8 tstamp:1; +#else + __u8 tstamp:1; + __u8 sack:1; + __u8:1; + __u8 unknown:1; + __u8:4; +#endif +}; + +struct cpl_pass_accept_req { + union opcode_tid ot; + __be16 rsvd; + __be16 len; + __be32 hdr_len; + __be16 vlan; + __be16 l2info; + __be32 tos_stid; + struct tcp_options tcpopt; +}; + +/* cpl_pass_accept_req.hdr_len fields */ +#define S_SYN_RX_CHAN 0 +#define M_SYN_RX_CHAN 0xF +#define V_SYN_RX_CHAN(x) ((x) << S_SYN_RX_CHAN) +#define G_SYN_RX_CHAN(x) (((x) >> S_SYN_RX_CHAN) & M_SYN_RX_CHAN) + +#define S_TCP_HDR_LEN 10 +#define M_TCP_HDR_LEN 0x3F +#define V_TCP_HDR_LEN(x) ((x) << S_TCP_HDR_LEN) +#define G_TCP_HDR_LEN(x) (((x) >> S_TCP_HDR_LEN) & M_TCP_HDR_LEN) + +#define S_IP_HDR_LEN 16 +#define M_IP_HDR_LEN 0x3FF +#define V_IP_HDR_LEN(x) ((x) << S_IP_HDR_LEN) +#define G_IP_HDR_LEN(x) (((x) >> S_IP_HDR_LEN) & M_IP_HDR_LEN) + +#define S_ETH_HDR_LEN 26 +#define M_ETH_HDR_LEN 0x1F +#define V_ETH_HDR_LEN(x) ((x) << S_ETH_HDR_LEN) +#define G_ETH_HDR_LEN(x) (((x) >> S_ETH_HDR_LEN) & M_ETH_HDR_LEN) + +/* cpl_pass_accept_req.l2info fields */ +#define S_SYN_MAC_IDX 0 +#define M_SYN_MAC_IDX 0x1FF +#define V_SYN_MAC_IDX(x) ((x) << S_SYN_MAC_IDX) +#define G_SYN_MAC_IDX(x) (((x) >> S_SYN_MAC_IDX) & M_SYN_MAC_IDX) + +#define S_SYN_XACT_MATCH 9 +#define V_SYN_XACT_MATCH(x) ((x) << S_SYN_XACT_MATCH) +#define F_SYN_XACT_MATCH V_SYN_XACT_MATCH(1U) + +#define S_SYN_INTF 12 +#define M_SYN_INTF 0xF +#define V_SYN_INTF(x) ((x) << S_SYN_INTF) +#define G_SYN_INTF(x) (((x) >> S_SYN_INTF) & M_SYN_INTF) + +struct ulptx_idata { + __be32 cmd_more; + __be32 len; +}; + +#define S_ULPTX_NSGE 0 +#define M_ULPTX_NSGE 0xFFFF +#define V_ULPTX_NSGE(x) ((x) << S_ULPTX_NSGE) + +#define S_RX_DACK_MODE 29 +#define M_RX_DACK_MODE 0x3 +#define V_RX_DACK_MODE(x) ((x) << S_RX_DACK_MODE) +#define G_RX_DACK_MODE(x) (((x) >> S_RX_DACK_MODE) & M_RX_DACK_MODE) + +#define S_RX_DACK_CHANGE 31 +#define V_RX_DACK_CHANGE(x) ((x) << S_RX_DACK_CHANGE) +#define F_RX_DACK_CHANGE V_RX_DACK_CHANGE(1U) + +enum { /* TCP congestion control algorithms */ + CONG_ALG_RENO, + CONG_ALG_TAHOE, + CONG_ALG_NEWRENO, + CONG_ALG_HIGHSPEED +}; + +#define S_CONG_CNTRL 14 +#define M_CONG_CNTRL 0x3 +#define V_CONG_CNTRL(x) ((x) << S_CONG_CNTRL) +#define G_CONG_CNTRL(x) (((x) >> S_CONG_CNTRL) & M_CONG_CNTRL) + +#define CONG_CNTRL_VALID (1 << 18) + +#endif /* _T4FW_RI_API_H_ */ diff --git a/drivers/infiniband/hw/cxgb4/user.h b/drivers/infiniband/hw/cxgb4/user.h new file mode 100644 index 0000000..cbd0ce1 --- /dev/null +++ b/drivers/infiniband/hw/cxgb4/user.h @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2009-2010 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __C4IW_USER_H__ +#define __C4IW_USER_H__ + +#define C4IW_UVERBS_ABI_VERSION 2 + +/* + * Make sure that all structs defined in this file remain laid out so + * that they pack the same way on 32-bit and 64-bit architectures (to + * avoid incompatibility between 32-bit userspace and 64-bit kernels). + * In particular do not use pointer types -- pass pointers in __u64 + * instead. + */ +struct c4iw_create_cq_resp { + __u64 key; + __u64 gts_key; + __u64 memsize; + __u32 cqid; + __u32 size; + __u32 qid_mask; + __u32 reserved; /* explicit padding (optional for i386) */ +}; + + +enum { + C4IW_QPF_ONCHIP = (1<<0) +}; + +struct c4iw_create_qp_resp { + __u64 ma_sync_key; + __u64 sq_key; + __u64 rq_key; + __u64 sq_db_gts_key; + __u64 rq_db_gts_key; + __u64 sq_memsize; + __u64 rq_memsize; + __u32 sqid; + __u32 rqid; + __u32 sq_size; + __u32 rq_size; + __u32 qid_mask; + __u32 flags; +}; + +struct c4iw_alloc_ucontext_resp { + __u64 status_page_key; + __u32 status_page_size; + __u32 reserved; /* explicit padding (optional for i386) */ +}; +#endif diff --git a/drivers/infiniband/hw/ehca/Kconfig b/drivers/infiniband/hw/ehca/Kconfig new file mode 100644 index 0000000..59f807d --- /dev/null +++ b/drivers/infiniband/hw/ehca/Kconfig @@ -0,0 +1,9 @@ +config INFINIBAND_EHCA + tristate "eHCA support" + depends on IBMEBUS + ---help--- + This driver supports the IBM pSeries eHCA InfiniBand adapter. + + To compile the driver as a module, choose M here. The module + will be called ib_ehca. + diff --git a/drivers/infiniband/hw/ehca/Makefile b/drivers/infiniband/hw/ehca/Makefile new file mode 100644 index 0000000..74d284e --- /dev/null +++ b/drivers/infiniband/hw/ehca/Makefile @@ -0,0 +1,16 @@ +# Authors: Heiko J Schick +# Christoph Raisch +# Joachim Fenkes +# +# Copyright (c) 2005 IBM Corporation +# +# All rights reserved. +# +# This source code is distributed under a dual license of GPL v2.0 and OpenIB BSD. + +obj-$(CONFIG_INFINIBAND_EHCA) += ib_ehca.o + +ib_ehca-objs = ehca_main.o ehca_hca.o ehca_mcast.o ehca_pd.o ehca_av.o ehca_eq.o \ + ehca_cq.o ehca_qp.o ehca_sqp.o ehca_mrmw.o ehca_reqs.o ehca_irq.o \ + ehca_uverbs.o ipz_pt_fn.o hcp_if.o hcp_phyp.o + diff --git a/drivers/infiniband/hw/ehca/ehca_av.c b/drivers/infiniband/hw/ehca/ehca_av.c new file mode 100644 index 0000000..4659263 --- /dev/null +++ b/drivers/infiniband/hw/ehca/ehca_av.c @@ -0,0 +1,277 @@ +/* + * IBM eServer eHCA Infiniband device driver for Linux on POWER + * + * address vector functions + * + * Authors: Hoang-Nam Nguyen + * Khadija Souissi + * Reinhard Ernst + * Christoph Raisch + * + * Copyright (c) 2005 IBM Corporation + * + * All rights reserved. + * + * This source code is distributed under a dual license of GPL v2.0 and OpenIB + * BSD. + * + * OpenIB BSD License + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include + +#include "ehca_tools.h" +#include "ehca_iverbs.h" +#include "hcp_if.h" + +static struct kmem_cache *av_cache; + +int ehca_calc_ipd(struct ehca_shca *shca, int port, + enum ib_rate path_rate, u32 *ipd) +{ + int path = ib_rate_to_mult(path_rate); + int link, ret; + struct ib_port_attr pa; + + if (path_rate == IB_RATE_PORT_CURRENT) { + *ipd = 0; + return 0; + } + + if (unlikely(path < 0)) { + ehca_err(&shca->ib_device, "Invalid static rate! path_rate=%x", + path_rate); + return -EINVAL; + } + + ret = ehca_query_port(&shca->ib_device, port, &pa); + if (unlikely(ret < 0)) { + ehca_err(&shca->ib_device, "Failed to query port ret=%i", ret); + return ret; + } + + link = ib_width_enum_to_int(pa.active_width) * pa.active_speed; + + if (path >= link) + /* no need to throttle if path faster than link */ + *ipd = 0; + else + /* IPD = round((link / path) - 1) */ + *ipd = ((link + (path >> 1)) / path) - 1; + + return 0; +} + +struct ib_ah *ehca_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr) +{ + int ret; + struct ehca_av *av; + struct ehca_shca *shca = container_of(pd->device, struct ehca_shca, + ib_device); + + av = kmem_cache_alloc(av_cache, GFP_KERNEL); + if (!av) { + ehca_err(pd->device, "Out of memory pd=%p ah_attr=%p", + pd, ah_attr); + return ERR_PTR(-ENOMEM); + } + + av->av.sl = ah_attr->sl; + av->av.dlid = ah_attr->dlid; + av->av.slid_path_bits = ah_attr->src_path_bits; + + if (ehca_static_rate < 0) { + u32 ipd; + if (ehca_calc_ipd(shca, ah_attr->port_num, + ah_attr->static_rate, &ipd)) { + ret = -EINVAL; + goto create_ah_exit1; + } + av->av.ipd = ipd; + } else + av->av.ipd = ehca_static_rate; + + av->av.lnh = ah_attr->ah_flags; + av->av.grh.word_0 = EHCA_BMASK_SET(GRH_IPVERSION_MASK, 6); + av->av.grh.word_0 |= EHCA_BMASK_SET(GRH_TCLASS_MASK, + ah_attr->grh.traffic_class); + av->av.grh.word_0 |= EHCA_BMASK_SET(GRH_FLOWLABEL_MASK, + ah_attr->grh.flow_label); + av->av.grh.word_0 |= EHCA_BMASK_SET(GRH_HOPLIMIT_MASK, + ah_attr->grh.hop_limit); + av->av.grh.word_0 |= EHCA_BMASK_SET(GRH_NEXTHEADER_MASK, 0x1B); + /* set sgid in grh.word_1 */ + if (ah_attr->ah_flags & IB_AH_GRH) { + int rc; + struct ib_port_attr port_attr; + union ib_gid gid; + memset(&port_attr, 0, sizeof(port_attr)); + rc = ehca_query_port(pd->device, ah_attr->port_num, + &port_attr); + if (rc) { /* invalid port number */ + ret = -EINVAL; + ehca_err(pd->device, "Invalid port number " + "ehca_query_port() returned %x " + "pd=%p ah_attr=%p", rc, pd, ah_attr); + goto create_ah_exit1; + } + memset(&gid, 0, sizeof(gid)); + rc = ehca_query_gid(pd->device, + ah_attr->port_num, + ah_attr->grh.sgid_index, &gid); + if (rc) { + ret = -EINVAL; + ehca_err(pd->device, "Failed to retrieve sgid " + "ehca_query_gid() returned %x " + "pd=%p ah_attr=%p", rc, pd, ah_attr); + goto create_ah_exit1; + } + memcpy(&av->av.grh.word_1, &gid, sizeof(gid)); + } + av->av.pmtu = shca->max_mtu; + + /* dgid comes in grh.word_3 */ + memcpy(&av->av.grh.word_3, &ah_attr->grh.dgid, + sizeof(ah_attr->grh.dgid)); + + return &av->ib_ah; + +create_ah_exit1: + kmem_cache_free(av_cache, av); + + return ERR_PTR(ret); +} + +int ehca_modify_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr) +{ + struct ehca_av *av; + struct ehca_ud_av new_ehca_av; + struct ehca_shca *shca = container_of(ah->pd->device, struct ehca_shca, + ib_device); + + memset(&new_ehca_av, 0, sizeof(new_ehca_av)); + new_ehca_av.sl = ah_attr->sl; + new_ehca_av.dlid = ah_attr->dlid; + new_ehca_av.slid_path_bits = ah_attr->src_path_bits; + new_ehca_av.ipd = ah_attr->static_rate; + new_ehca_av.lnh = EHCA_BMASK_SET(GRH_FLAG_MASK, + (ah_attr->ah_flags & IB_AH_GRH) > 0); + new_ehca_av.grh.word_0 = EHCA_BMASK_SET(GRH_TCLASS_MASK, + ah_attr->grh.traffic_class); + new_ehca_av.grh.word_0 |= EHCA_BMASK_SET(GRH_FLOWLABEL_MASK, + ah_attr->grh.flow_label); + new_ehca_av.grh.word_0 |= EHCA_BMASK_SET(GRH_HOPLIMIT_MASK, + ah_attr->grh.hop_limit); + new_ehca_av.grh.word_0 |= EHCA_BMASK_SET(GRH_NEXTHEADER_MASK, 0x1b); + + /* set sgid in grh.word_1 */ + if (ah_attr->ah_flags & IB_AH_GRH) { + int rc; + struct ib_port_attr port_attr; + union ib_gid gid; + memset(&port_attr, 0, sizeof(port_attr)); + rc = ehca_query_port(ah->device, ah_attr->port_num, + &port_attr); + if (rc) { /* invalid port number */ + ehca_err(ah->device, "Invalid port number " + "ehca_query_port() returned %x " + "ah=%p ah_attr=%p port_num=%x", + rc, ah, ah_attr, ah_attr->port_num); + return -EINVAL; + } + memset(&gid, 0, sizeof(gid)); + rc = ehca_query_gid(ah->device, + ah_attr->port_num, + ah_attr->grh.sgid_index, &gid); + if (rc) { + ehca_err(ah->device, "Failed to retrieve sgid " + "ehca_query_gid() returned %x " + "ah=%p ah_attr=%p port_num=%x " + "sgid_index=%x", + rc, ah, ah_attr, ah_attr->port_num, + ah_attr->grh.sgid_index); + return -EINVAL; + } + memcpy(&new_ehca_av.grh.word_1, &gid, sizeof(gid)); + } + + new_ehca_av.pmtu = shca->max_mtu; + + memcpy(&new_ehca_av.grh.word_3, &ah_attr->grh.dgid, + sizeof(ah_attr->grh.dgid)); + + av = container_of(ah, struct ehca_av, ib_ah); + av->av = new_ehca_av; + + return 0; +} + +int ehca_query_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr) +{ + struct ehca_av *av = container_of(ah, struct ehca_av, ib_ah); + + memcpy(&ah_attr->grh.dgid, &av->av.grh.word_3, + sizeof(ah_attr->grh.dgid)); + ah_attr->sl = av->av.sl; + + ah_attr->dlid = av->av.dlid; + + ah_attr->src_path_bits = av->av.slid_path_bits; + ah_attr->static_rate = av->av.ipd; + ah_attr->ah_flags = EHCA_BMASK_GET(GRH_FLAG_MASK, av->av.lnh); + ah_attr->grh.traffic_class = EHCA_BMASK_GET(GRH_TCLASS_MASK, + av->av.grh.word_0); + ah_attr->grh.hop_limit = EHCA_BMASK_GET(GRH_HOPLIMIT_MASK, + av->av.grh.word_0); + ah_attr->grh.flow_label = EHCA_BMASK_GET(GRH_FLOWLABEL_MASK, + av->av.grh.word_0); + + return 0; +} + +int ehca_destroy_ah(struct ib_ah *ah) +{ + kmem_cache_free(av_cache, container_of(ah, struct ehca_av, ib_ah)); + + return 0; +} + +int ehca_init_av_cache(void) +{ + av_cache = kmem_cache_create("ehca_cache_av", + sizeof(struct ehca_av), 0, + SLAB_HWCACHE_ALIGN, + NULL); + if (!av_cache) + return -ENOMEM; + return 0; +} + +void ehca_cleanup_av_cache(void) +{ + if (av_cache) + kmem_cache_destroy(av_cache); +} diff --git a/drivers/infiniband/hw/ehca/ehca_classes.h b/drivers/infiniband/hw/ehca/ehca_classes.h new file mode 100644 index 0000000..bd45e0f --- /dev/null +++ b/drivers/infiniband/hw/ehca/ehca_classes.h @@ -0,0 +1,482 @@ +/* + * IBM eServer eHCA Infiniband device driver for Linux on POWER + * + * Struct definition for eHCA internal structures + * + * Authors: Heiko J Schick + * Christoph Raisch + * Joachim Fenkes + * + * Copyright (c) 2005 IBM Corporation + * + * All rights reserved. + * + * This source code is distributed under a dual license of GPL v2.0 and OpenIB + * BSD. + * + * OpenIB BSD License + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __EHCA_CLASSES_H__ +#define __EHCA_CLASSES_H__ + +struct ehca_module; +struct ehca_qp; +struct ehca_cq; +struct ehca_eq; +struct ehca_mr; +struct ehca_mw; +struct ehca_pd; +struct ehca_av; + +#include +#include + +#include +#include + +#ifdef CONFIG_PPC64 +#include "ehca_classes_pSeries.h" +#endif +#include "ipz_pt_fn.h" +#include "ehca_qes.h" +#include "ehca_irq.h" + +#define EHCA_EQE_CACHE_SIZE 20 +#define EHCA_MAX_NUM_QUEUES 0xffff + +struct ehca_eqe_cache_entry { + struct ehca_eqe *eqe; + struct ehca_cq *cq; +}; + +struct ehca_eq { + u32 length; + struct ipz_queue ipz_queue; + struct ipz_eq_handle ipz_eq_handle; + struct work_struct work; + struct h_galpas galpas; + int is_initialized; + struct ehca_pfeq pf; + spinlock_t spinlock; + struct tasklet_struct interrupt_task; + u32 ist; + spinlock_t irq_spinlock; + struct ehca_eqe_cache_entry eqe_cache[EHCA_EQE_CACHE_SIZE]; +}; + +struct ehca_sma_attr { + u16 lid, lmc, sm_sl, sm_lid; + u16 pkey_tbl_len, pkeys[16]; +}; + +struct ehca_sport { + struct ib_cq *ibcq_aqp1; + struct ib_qp *ibqp_sqp[2]; + /* lock to serialze modify_qp() calls for sqp in normal + * and irq path (when event PORT_ACTIVE is received first time) + */ + spinlock_t mod_sqp_lock; + enum ib_port_state port_state; + struct ehca_sma_attr saved_attr; + u32 pma_qp_nr; +}; + +#define HCA_CAP_MR_PGSIZE_4K 0x80000000 +#define HCA_CAP_MR_PGSIZE_64K 0x40000000 +#define HCA_CAP_MR_PGSIZE_1M 0x20000000 +#define HCA_CAP_MR_PGSIZE_16M 0x10000000 + +struct ehca_shca { + struct ib_device ib_device; + struct platform_device *ofdev; + u8 num_ports; + int hw_level; + struct list_head shca_list; + struct ipz_adapter_handle ipz_hca_handle; + struct ehca_sport sport[2]; + struct ehca_eq eq; + struct ehca_eq neq; + struct ehca_mr *maxmr; + struct ehca_pd *pd; + struct h_galpas galpas; + struct mutex modify_mutex; + u64 hca_cap; + /* MR pgsize: bit 0-3 means 4K, 64K, 1M, 16M respectively */ + u32 hca_cap_mr_pgsize; + int max_mtu; + int max_num_qps; + int max_num_cqs; + atomic_t num_cqs; + atomic_t num_qps; +}; + +struct ehca_pd { + struct ib_pd ib_pd; + struct ipz_pd fw_pd; + /* small queue mgmt */ + struct mutex lock; + struct list_head free[2]; + struct list_head full[2]; +}; + +enum ehca_ext_qp_type { + EQPT_NORMAL = 0, + EQPT_LLQP = 1, + EQPT_SRQBASE = 2, + EQPT_SRQ = 3, +}; + +/* struct to cache modify_qp()'s parms for GSI/SMI qp */ +struct ehca_mod_qp_parm { + int mask; + struct ib_qp_attr attr; +}; + +#define EHCA_MOD_QP_PARM_MAX 4 + +#define QMAP_IDX_MASK 0xFFFFULL + +/* struct for tracking if cqes have been reported to the application */ +struct ehca_qmap_entry { + u16 app_wr_id; + u8 reported; + u8 cqe_req; +}; + +struct ehca_queue_map { + struct ehca_qmap_entry *map; + unsigned int entries; + unsigned int tail; + unsigned int left_to_poll; + unsigned int next_wqe_idx; /* Idx to first wqe to be flushed */ +}; + +/* function to calculate the next index for the qmap */ +static inline unsigned int next_index(unsigned int cur_index, unsigned int limit) +{ + unsigned int temp = cur_index + 1; + return (temp == limit) ? 0 : temp; +} + +struct ehca_qp { + union { + struct ib_qp ib_qp; + struct ib_srq ib_srq; + }; + u32 qp_type; + enum ehca_ext_qp_type ext_type; + enum ib_qp_state state; + struct ipz_queue ipz_squeue; + struct ehca_queue_map sq_map; + struct ipz_queue ipz_rqueue; + struct ehca_queue_map rq_map; + struct h_galpas galpas; + u32 qkey; + u32 real_qp_num; + u32 token; + spinlock_t spinlock_s; + spinlock_t spinlock_r; + u32 sq_max_inline_data_size; + struct ipz_qp_handle ipz_qp_handle; + struct ehca_pfqp pf; + struct ib_qp_init_attr init_attr; + struct ehca_cq *send_cq; + struct ehca_cq *recv_cq; + unsigned int sqerr_purgeflag; + struct hlist_node list_entries; + /* array to cache modify_qp()'s parms for GSI/SMI qp */ + struct ehca_mod_qp_parm *mod_qp_parm; + int mod_qp_parm_idx; + /* mmap counter for resources mapped into user space */ + u32 mm_count_squeue; + u32 mm_count_rqueue; + u32 mm_count_galpa; + /* unsolicited ack circumvention */ + int unsol_ack_circ; + int mtu_shift; + u32 message_count; + u32 packet_count; + atomic_t nr_events; /* events seen */ + wait_queue_head_t wait_completion; + int mig_armed; + struct list_head sq_err_node; + struct list_head rq_err_node; +}; + +#define IS_SRQ(qp) (qp->ext_type == EQPT_SRQ) +#define HAS_SQ(qp) (qp->ext_type != EQPT_SRQ) +#define HAS_RQ(qp) (qp->ext_type != EQPT_SRQBASE) + +/* must be power of 2 */ +#define QP_HASHTAB_LEN 8 + +struct ehca_cq { + struct ib_cq ib_cq; + struct ipz_queue ipz_queue; + struct h_galpas galpas; + spinlock_t spinlock; + u32 cq_number; + u32 token; + u32 nr_of_entries; + struct ipz_cq_handle ipz_cq_handle; + struct ehca_pfcq pf; + spinlock_t cb_lock; + struct hlist_head qp_hashtab[QP_HASHTAB_LEN]; + struct list_head entry; + u32 nr_callbacks; /* #events assigned to cpu by scaling code */ + atomic_t nr_events; /* #events seen */ + wait_queue_head_t wait_completion; + spinlock_t task_lock; + /* mmap counter for resources mapped into user space */ + u32 mm_count_queue; + u32 mm_count_galpa; + struct list_head sqp_err_list; + struct list_head rqp_err_list; +}; + +enum ehca_mr_flag { + EHCA_MR_FLAG_FMR = 0x80000000, /* FMR, created with ehca_alloc_fmr */ + EHCA_MR_FLAG_MAXMR = 0x40000000, /* max-MR */ +}; + +struct ehca_mr { + union { + struct ib_mr ib_mr; /* must always be first in ehca_mr */ + struct ib_fmr ib_fmr; /* must always be first in ehca_mr */ + } ib; + struct ib_umem *umem; + spinlock_t mrlock; + + enum ehca_mr_flag flags; + u32 num_kpages; /* number of kernel pages */ + u32 num_hwpages; /* number of hw pages to form MR */ + u64 hwpage_size; /* hw page size used for this MR */ + int acl; /* ACL (stored here for usage in reregister) */ + u64 *start; /* virtual start address (stored here for */ + /* usage in reregister) */ + u64 size; /* size (stored here for usage in reregister) */ + u32 fmr_page_size; /* page size for FMR */ + u32 fmr_max_pages; /* max pages for FMR */ + u32 fmr_max_maps; /* max outstanding maps for FMR */ + u32 fmr_map_cnt; /* map counter for FMR */ + /* fw specific data */ + struct ipz_mrmw_handle ipz_mr_handle; /* MR handle for h-calls */ + struct h_galpas galpas; +}; + +struct ehca_mw { + struct ib_mw ib_mw; /* gen2 mw, must always be first in ehca_mw */ + spinlock_t mwlock; + + u8 never_bound; /* indication MW was never bound */ + struct ipz_mrmw_handle ipz_mw_handle; /* MW handle for h-calls */ + struct h_galpas galpas; +}; + +enum ehca_mr_pgi_type { + EHCA_MR_PGI_PHYS = 1, /* type of ehca_reg_phys_mr, + * ehca_rereg_phys_mr, + * ehca_reg_internal_maxmr */ + EHCA_MR_PGI_USER = 2, /* type of ehca_reg_user_mr */ + EHCA_MR_PGI_FMR = 3 /* type of ehca_map_phys_fmr */ +}; + +struct ehca_mr_pginfo { + enum ehca_mr_pgi_type type; + u64 num_kpages; + u64 kpage_cnt; + u64 hwpage_size; /* hw page size used for this MR */ + u64 num_hwpages; /* number of hw pages */ + u64 hwpage_cnt; /* counter for hw pages */ + u64 next_hwpage; /* next hw page in buffer/chunk/listelem */ + + union { + struct { /* type EHCA_MR_PGI_PHYS section */ + int num_phys_buf; + struct ib_phys_buf *phys_buf_array; + u64 next_buf; + } phy; + struct { /* type EHCA_MR_PGI_USER section */ + struct ib_umem *region; + struct scatterlist *next_sg; + u64 next_nmap; + } usr; + struct { /* type EHCA_MR_PGI_FMR section */ + u64 fmr_pgsize; + u64 *page_list; + u64 next_listelem; + } fmr; + } u; +}; + +/* output parameters for MR/FMR hipz calls */ +struct ehca_mr_hipzout_parms { + struct ipz_mrmw_handle handle; + u32 lkey; + u32 rkey; + u64 len; + u64 vaddr; + u32 acl; +}; + +/* output parameters for MW hipz calls */ +struct ehca_mw_hipzout_parms { + struct ipz_mrmw_handle handle; + u32 rkey; +}; + +struct ehca_av { + struct ib_ah ib_ah; + struct ehca_ud_av av; +}; + +struct ehca_ucontext { + struct ib_ucontext ib_ucontext; +}; + +int ehca_init_pd_cache(void); +void ehca_cleanup_pd_cache(void); +int ehca_init_cq_cache(void); +void ehca_cleanup_cq_cache(void); +int ehca_init_qp_cache(void); +void ehca_cleanup_qp_cache(void); +int ehca_init_av_cache(void); +void ehca_cleanup_av_cache(void); +int ehca_init_mrmw_cache(void); +void ehca_cleanup_mrmw_cache(void); +int ehca_init_small_qp_cache(void); +void ehca_cleanup_small_qp_cache(void); + +extern rwlock_t ehca_qp_idr_lock; +extern rwlock_t ehca_cq_idr_lock; +extern struct idr ehca_qp_idr; +extern struct idr ehca_cq_idr; +extern spinlock_t shca_list_lock; + +extern int ehca_static_rate; +extern int ehca_port_act_time; +extern bool ehca_use_hp_mr; +extern bool ehca_scaling_code; +extern int ehca_lock_hcalls; +extern int ehca_nr_ports; +extern int ehca_max_cq; +extern int ehca_max_qp; + +struct ipzu_queue_resp { + u32 qe_size; /* queue entry size */ + u32 act_nr_of_sg; + u32 queue_length; /* queue length allocated in bytes */ + u32 pagesize; + u32 toggle_state; + u32 offset; /* save offset within a page for small_qp */ +}; + +struct ehca_create_cq_resp { + u32 cq_number; + u32 token; + struct ipzu_queue_resp ipz_queue; + u32 fw_handle_ofs; + u32 dummy; +}; + +struct ehca_create_qp_resp { + u32 qp_num; + u32 token; + u32 qp_type; + u32 ext_type; + u32 qkey; + /* qp_num assigned by ehca: sqp0/1 may have got different numbers */ + u32 real_qp_num; + u32 fw_handle_ofs; + u32 dummy; + struct ipzu_queue_resp ipz_squeue; + struct ipzu_queue_resp ipz_rqueue; +}; + +struct ehca_alloc_cq_parms { + u32 nr_cqe; + u32 act_nr_of_entries; + u32 act_pages; + struct ipz_eq_handle eq_handle; +}; + +enum ehca_service_type { + ST_RC = 0, + ST_UC = 1, + ST_RD = 2, + ST_UD = 3, +}; + +enum ehca_ll_comp_flags { + LLQP_SEND_COMP = 0x20, + LLQP_RECV_COMP = 0x40, + LLQP_COMP_MASK = 0x60, +}; + +struct ehca_alloc_queue_parms { + /* input parameters */ + int max_wr; + int max_sge; + int page_size; + int is_small; + + /* output parameters */ + u16 act_nr_wqes; + u8 act_nr_sges; + u32 queue_size; /* bytes for small queues, pages otherwise */ +}; + +struct ehca_alloc_qp_parms { + struct ehca_alloc_queue_parms squeue; + struct ehca_alloc_queue_parms rqueue; + + /* input parameters */ + enum ehca_service_type servicetype; + int qp_storage; + int sigtype; + enum ehca_ext_qp_type ext_type; + enum ehca_ll_comp_flags ll_comp_flags; + int ud_av_l_key_ctl; + + u32 token; + struct ipz_eq_handle eq_handle; + struct ipz_pd pd; + struct ipz_cq_handle send_cq_handle, recv_cq_handle; + + u32 srq_qpn, srq_token, srq_limit; + + /* output parameters */ + u32 real_qp_num; + struct ipz_qp_handle qp_handle; + struct h_galpas galpas; +}; + +int ehca_cq_assign_qp(struct ehca_cq *cq, struct ehca_qp *qp); +int ehca_cq_unassign_qp(struct ehca_cq *cq, unsigned int qp_num); +struct ehca_qp *ehca_cq_get_qp(struct ehca_cq *cq, int qp_num); + +#endif diff --git a/drivers/infiniband/hw/ehca/ehca_classes_pSeries.h b/drivers/infiniband/hw/ehca/ehca_classes_pSeries.h new file mode 100644 index 0000000..689c357 --- /dev/null +++ b/drivers/infiniband/hw/ehca/ehca_classes_pSeries.h @@ -0,0 +1,208 @@ +/* + * IBM eServer eHCA Infiniband device driver for Linux on POWER + * + * pSeries interface definitions + * + * Authors: Waleri Fomin + * Christoph Raisch + * + * Copyright (c) 2005 IBM Corporation + * + * All rights reserved. + * + * This source code is distributed under a dual license of GPL v2.0 and OpenIB + * BSD. + * + * OpenIB BSD License + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __EHCA_CLASSES_PSERIES_H__ +#define __EHCA_CLASSES_PSERIES_H__ + +#include "hcp_phyp.h" +#include "ipz_pt_fn.h" + + +struct ehca_pfqp { + struct ipz_qpt sqpt; + struct ipz_qpt rqpt; +}; + +struct ehca_pfcq { + struct ipz_qpt qpt; + u32 cqnr; +}; + +struct ehca_pfeq { + struct ipz_qpt qpt; + struct h_galpa galpa; + u32 eqnr; +}; + +struct ipz_adapter_handle { + u64 handle; +}; + +struct ipz_cq_handle { + u64 handle; +}; + +struct ipz_eq_handle { + u64 handle; +}; + +struct ipz_qp_handle { + u64 handle; +}; +struct ipz_mrmw_handle { + u64 handle; +}; + +struct ipz_pd { + u32 value; +}; + +struct hcp_modify_qp_control_block { + u32 qkey; /* 00 */ + u32 rdd; /* reliable datagram domain */ + u32 send_psn; /* 02 */ + u32 receive_psn; /* 03 */ + u32 prim_phys_port; /* 04 */ + u32 alt_phys_port; /* 05 */ + u32 prim_p_key_idx; /* 06 */ + u32 alt_p_key_idx; /* 07 */ + u32 rdma_atomic_ctrl; /* 08 */ + u32 qp_state; /* 09 */ + u32 reserved_10; /* 10 */ + u32 rdma_nr_atomic_resp_res; /* 11 */ + u32 path_migration_state; /* 12 */ + u32 rdma_atomic_outst_dest_qp; /* 13 */ + u32 dest_qp_nr; /* 14 */ + u32 min_rnr_nak_timer_field; /* 15 */ + u32 service_level; /* 16 */ + u32 send_grh_flag; /* 17 */ + u32 retry_count; /* 18 */ + u32 timeout; /* 19 */ + u32 path_mtu; /* 20 */ + u32 max_static_rate; /* 21 */ + u32 dlid; /* 22 */ + u32 rnr_retry_count; /* 23 */ + u32 source_path_bits; /* 24 */ + u32 traffic_class; /* 25 */ + u32 hop_limit; /* 26 */ + u32 source_gid_idx; /* 27 */ + u32 flow_label; /* 28 */ + u32 reserved_29; /* 29 */ + union { /* 30 */ + u64 dw[2]; + u8 byte[16]; + } dest_gid; + u32 service_level_al; /* 34 */ + u32 send_grh_flag_al; /* 35 */ + u32 retry_count_al; /* 36 */ + u32 timeout_al; /* 37 */ + u32 max_static_rate_al; /* 38 */ + u32 dlid_al; /* 39 */ + u32 rnr_retry_count_al; /* 40 */ + u32 source_path_bits_al; /* 41 */ + u32 traffic_class_al; /* 42 */ + u32 hop_limit_al; /* 43 */ + u32 source_gid_idx_al; /* 44 */ + u32 flow_label_al; /* 45 */ + u32 reserved_46; /* 46 */ + u32 reserved_47; /* 47 */ + union { /* 48 */ + u64 dw[2]; + u8 byte[16]; + } dest_gid_al; + u32 max_nr_outst_send_wr; /* 52 */ + u32 max_nr_outst_recv_wr; /* 53 */ + u32 disable_ete_credit_check; /* 54 */ + u32 qp_number; /* 55 */ + u64 send_queue_handle; /* 56 */ + u64 recv_queue_handle; /* 58 */ + u32 actual_nr_sges_in_sq_wqe; /* 60 */ + u32 actual_nr_sges_in_rq_wqe; /* 61 */ + u32 qp_enable; /* 62 */ + u32 curr_srq_limit; /* 63 */ + u64 qp_aff_asyn_ev_log_reg; /* 64 */ + u64 shared_rq_hndl; /* 66 */ + u64 trigg_doorbell_qp_hndl; /* 68 */ + u32 reserved_70_127[58]; /* 70 */ +}; + +#define MQPCB_MASK_QKEY EHCA_BMASK_IBM( 0, 0) +#define MQPCB_MASK_SEND_PSN EHCA_BMASK_IBM( 2, 2) +#define MQPCB_MASK_RECEIVE_PSN EHCA_BMASK_IBM( 3, 3) +#define MQPCB_MASK_PRIM_PHYS_PORT EHCA_BMASK_IBM( 4, 4) +#define MQPCB_PRIM_PHYS_PORT EHCA_BMASK_IBM(24, 31) +#define MQPCB_MASK_ALT_PHYS_PORT EHCA_BMASK_IBM( 5, 5) +#define MQPCB_MASK_PRIM_P_KEY_IDX EHCA_BMASK_IBM( 6, 6) +#define MQPCB_PRIM_P_KEY_IDX EHCA_BMASK_IBM(24, 31) +#define MQPCB_MASK_ALT_P_KEY_IDX EHCA_BMASK_IBM( 7, 7) +#define MQPCB_MASK_RDMA_ATOMIC_CTRL EHCA_BMASK_IBM( 8, 8) +#define MQPCB_MASK_QP_STATE EHCA_BMASK_IBM( 9, 9) +#define MQPCB_MASK_RDMA_NR_ATOMIC_RESP_RES EHCA_BMASK_IBM(11, 11) +#define MQPCB_MASK_PATH_MIGRATION_STATE EHCA_BMASK_IBM(12, 12) +#define MQPCB_MASK_RDMA_ATOMIC_OUTST_DEST_QP EHCA_BMASK_IBM(13, 13) +#define MQPCB_MASK_DEST_QP_NR EHCA_BMASK_IBM(14, 14) +#define MQPCB_MASK_MIN_RNR_NAK_TIMER_FIELD EHCA_BMASK_IBM(15, 15) +#define MQPCB_MASK_SERVICE_LEVEL EHCA_BMASK_IBM(16, 16) +#define MQPCB_MASK_SEND_GRH_FLAG EHCA_BMASK_IBM(17, 17) +#define MQPCB_MASK_RETRY_COUNT EHCA_BMASK_IBM(18, 18) +#define MQPCB_MASK_TIMEOUT EHCA_BMASK_IBM(19, 19) +#define MQPCB_MASK_PATH_MTU EHCA_BMASK_IBM(20, 20) +#define MQPCB_MASK_MAX_STATIC_RATE EHCA_BMASK_IBM(21, 21) +#define MQPCB_MASK_DLID EHCA_BMASK_IBM(22, 22) +#define MQPCB_MASK_RNR_RETRY_COUNT EHCA_BMASK_IBM(23, 23) +#define MQPCB_MASK_SOURCE_PATH_BITS EHCA_BMASK_IBM(24, 24) +#define MQPCB_MASK_TRAFFIC_CLASS EHCA_BMASK_IBM(25, 25) +#define MQPCB_MASK_HOP_LIMIT EHCA_BMASK_IBM(26, 26) +#define MQPCB_MASK_SOURCE_GID_IDX EHCA_BMASK_IBM(27, 27) +#define MQPCB_MASK_FLOW_LABEL EHCA_BMASK_IBM(28, 28) +#define MQPCB_MASK_DEST_GID EHCA_BMASK_IBM(30, 30) +#define MQPCB_MASK_SERVICE_LEVEL_AL EHCA_BMASK_IBM(31, 31) +#define MQPCB_MASK_SEND_GRH_FLAG_AL EHCA_BMASK_IBM(32, 32) +#define MQPCB_MASK_RETRY_COUNT_AL EHCA_BMASK_IBM(33, 33) +#define MQPCB_MASK_TIMEOUT_AL EHCA_BMASK_IBM(34, 34) +#define MQPCB_MASK_MAX_STATIC_RATE_AL EHCA_BMASK_IBM(35, 35) +#define MQPCB_MASK_DLID_AL EHCA_BMASK_IBM(36, 36) +#define MQPCB_MASK_RNR_RETRY_COUNT_AL EHCA_BMASK_IBM(37, 37) +#define MQPCB_MASK_SOURCE_PATH_BITS_AL EHCA_BMASK_IBM(38, 38) +#define MQPCB_MASK_TRAFFIC_CLASS_AL EHCA_BMASK_IBM(39, 39) +#define MQPCB_MASK_HOP_LIMIT_AL EHCA_BMASK_IBM(40, 40) +#define MQPCB_MASK_SOURCE_GID_IDX_AL EHCA_BMASK_IBM(41, 41) +#define MQPCB_MASK_FLOW_LABEL_AL EHCA_BMASK_IBM(42, 42) +#define MQPCB_MASK_DEST_GID_AL EHCA_BMASK_IBM(44, 44) +#define MQPCB_MASK_MAX_NR_OUTST_SEND_WR EHCA_BMASK_IBM(45, 45) +#define MQPCB_MASK_MAX_NR_OUTST_RECV_WR EHCA_BMASK_IBM(46, 46) +#define MQPCB_MASK_DISABLE_ETE_CREDIT_CHECK EHCA_BMASK_IBM(47, 47) +#define MQPCB_MASK_QP_ENABLE EHCA_BMASK_IBM(48, 48) +#define MQPCB_MASK_CURR_SRQ_LIMIT EHCA_BMASK_IBM(49, 49) +#define MQPCB_MASK_QP_AFF_ASYN_EV_LOG_REG EHCA_BMASK_IBM(50, 50) +#define MQPCB_MASK_SHARED_RQ_HNDL EHCA_BMASK_IBM(51, 51) + +#endif /* __EHCA_CLASSES_PSERIES_H__ */ diff --git a/drivers/infiniband/hw/ehca/ehca_cq.c b/drivers/infiniband/hw/ehca/ehca_cq.c new file mode 100644 index 0000000..8cc8375 --- /dev/null +++ b/drivers/infiniband/hw/ehca/ehca_cq.c @@ -0,0 +1,392 @@ +/* + * IBM eServer eHCA Infiniband device driver for Linux on POWER + * + * Completion queue handling + * + * Authors: Waleri Fomin + * Khadija Souissi + * Reinhard Ernst + * Heiko J Schick + * Hoang-Nam Nguyen + * + * + * Copyright (c) 2005 IBM Corporation + * + * All rights reserved. + * + * This source code is distributed under a dual license of GPL v2.0 and OpenIB + * BSD. + * + * OpenIB BSD License + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include + +#include "ehca_iverbs.h" +#include "ehca_classes.h" +#include "ehca_irq.h" +#include "hcp_if.h" + +static struct kmem_cache *cq_cache; + +int ehca_cq_assign_qp(struct ehca_cq *cq, struct ehca_qp *qp) +{ + unsigned int qp_num = qp->real_qp_num; + unsigned int key = qp_num & (QP_HASHTAB_LEN-1); + unsigned long flags; + + spin_lock_irqsave(&cq->spinlock, flags); + hlist_add_head(&qp->list_entries, &cq->qp_hashtab[key]); + spin_unlock_irqrestore(&cq->spinlock, flags); + + ehca_dbg(cq->ib_cq.device, "cq_num=%x real_qp_num=%x", + cq->cq_number, qp_num); + + return 0; +} + +int ehca_cq_unassign_qp(struct ehca_cq *cq, unsigned int real_qp_num) +{ + int ret = -EINVAL; + unsigned int key = real_qp_num & (QP_HASHTAB_LEN-1); + struct hlist_node *iter; + struct ehca_qp *qp; + unsigned long flags; + + spin_lock_irqsave(&cq->spinlock, flags); + hlist_for_each(iter, &cq->qp_hashtab[key]) { + qp = hlist_entry(iter, struct ehca_qp, list_entries); + if (qp->real_qp_num == real_qp_num) { + hlist_del(iter); + ehca_dbg(cq->ib_cq.device, + "removed qp from cq .cq_num=%x real_qp_num=%x", + cq->cq_number, real_qp_num); + ret = 0; + break; + } + } + spin_unlock_irqrestore(&cq->spinlock, flags); + if (ret) + ehca_err(cq->ib_cq.device, + "qp not found cq_num=%x real_qp_num=%x", + cq->cq_number, real_qp_num); + + return ret; +} + +struct ehca_qp *ehca_cq_get_qp(struct ehca_cq *cq, int real_qp_num) +{ + struct ehca_qp *ret = NULL; + unsigned int key = real_qp_num & (QP_HASHTAB_LEN-1); + struct hlist_node *iter; + struct ehca_qp *qp; + hlist_for_each(iter, &cq->qp_hashtab[key]) { + qp = hlist_entry(iter, struct ehca_qp, list_entries); + if (qp->real_qp_num == real_qp_num) { + ret = qp; + break; + } + } + return ret; +} + +struct ib_cq *ehca_create_cq(struct ib_device *device, int cqe, int comp_vector, + struct ib_ucontext *context, + struct ib_udata *udata) +{ + static const u32 additional_cqe = 20; + struct ib_cq *cq; + struct ehca_cq *my_cq; + struct ehca_shca *shca = + container_of(device, struct ehca_shca, ib_device); + struct ipz_adapter_handle adapter_handle; + struct ehca_alloc_cq_parms param; /* h_call's out parameters */ + struct h_galpa gal; + void *vpage; + u32 counter; + u64 rpage, cqx_fec, h_ret; + int ipz_rc, i; + unsigned long flags; + + if (cqe >= 0xFFFFFFFF - 64 - additional_cqe) + return ERR_PTR(-EINVAL); + + if (!atomic_add_unless(&shca->num_cqs, 1, shca->max_num_cqs)) { + ehca_err(device, "Unable to create CQ, max number of %i " + "CQs reached.", shca->max_num_cqs); + ehca_err(device, "To increase the maximum number of CQs " + "use the number_of_cqs module parameter.\n"); + return ERR_PTR(-ENOSPC); + } + + my_cq = kmem_cache_zalloc(cq_cache, GFP_KERNEL); + if (!my_cq) { + ehca_err(device, "Out of memory for ehca_cq struct device=%p", + device); + atomic_dec(&shca->num_cqs); + return ERR_PTR(-ENOMEM); + } + + memset(¶m, 0, sizeof(struct ehca_alloc_cq_parms)); + + spin_lock_init(&my_cq->spinlock); + spin_lock_init(&my_cq->cb_lock); + spin_lock_init(&my_cq->task_lock); + atomic_set(&my_cq->nr_events, 0); + init_waitqueue_head(&my_cq->wait_completion); + + cq = &my_cq->ib_cq; + + adapter_handle = shca->ipz_hca_handle; + param.eq_handle = shca->eq.ipz_eq_handle; + + idr_preload(GFP_KERNEL); + write_lock_irqsave(&ehca_cq_idr_lock, flags); + my_cq->token = idr_alloc(&ehca_cq_idr, my_cq, 0, 0x2000000, GFP_NOWAIT); + write_unlock_irqrestore(&ehca_cq_idr_lock, flags); + idr_preload_end(); + + if (my_cq->token < 0) { + cq = ERR_PTR(-ENOMEM); + ehca_err(device, "Can't allocate new idr entry. device=%p", + device); + goto create_cq_exit1; + } + + /* + * CQs maximum depth is 4GB-64, but we need additional 20 as buffer + * for receiving errors CQEs. + */ + param.nr_cqe = cqe + additional_cqe; + h_ret = hipz_h_alloc_resource_cq(adapter_handle, my_cq, ¶m); + + if (h_ret != H_SUCCESS) { + ehca_err(device, "hipz_h_alloc_resource_cq() failed " + "h_ret=%lli device=%p", h_ret, device); + cq = ERR_PTR(ehca2ib_return_code(h_ret)); + goto create_cq_exit2; + } + + ipz_rc = ipz_queue_ctor(NULL, &my_cq->ipz_queue, param.act_pages, + EHCA_PAGESIZE, sizeof(struct ehca_cqe), 0, 0); + if (!ipz_rc) { + ehca_err(device, "ipz_queue_ctor() failed ipz_rc=%i device=%p", + ipz_rc, device); + cq = ERR_PTR(-EINVAL); + goto create_cq_exit3; + } + + for (counter = 0; counter < param.act_pages; counter++) { + vpage = ipz_qpageit_get_inc(&my_cq->ipz_queue); + if (!vpage) { + ehca_err(device, "ipz_qpageit_get_inc() " + "returns NULL device=%p", device); + cq = ERR_PTR(-EAGAIN); + goto create_cq_exit4; + } + rpage = __pa(vpage); + + h_ret = hipz_h_register_rpage_cq(adapter_handle, + my_cq->ipz_cq_handle, + &my_cq->pf, + 0, + 0, + rpage, + 1, + my_cq->galpas. + kernel); + + if (h_ret < H_SUCCESS) { + ehca_err(device, "hipz_h_register_rpage_cq() failed " + "ehca_cq=%p cq_num=%x h_ret=%lli counter=%i " + "act_pages=%i", my_cq, my_cq->cq_number, + h_ret, counter, param.act_pages); + cq = ERR_PTR(-EINVAL); + goto create_cq_exit4; + } + + if (counter == (param.act_pages - 1)) { + vpage = ipz_qpageit_get_inc(&my_cq->ipz_queue); + if ((h_ret != H_SUCCESS) || vpage) { + ehca_err(device, "Registration of pages not " + "complete ehca_cq=%p cq_num=%x " + "h_ret=%lli", my_cq, my_cq->cq_number, + h_ret); + cq = ERR_PTR(-EAGAIN); + goto create_cq_exit4; + } + } else { + if (h_ret != H_PAGE_REGISTERED) { + ehca_err(device, "Registration of page failed " + "ehca_cq=%p cq_num=%x h_ret=%lli " + "counter=%i act_pages=%i", + my_cq, my_cq->cq_number, + h_ret, counter, param.act_pages); + cq = ERR_PTR(-ENOMEM); + goto create_cq_exit4; + } + } + } + + ipz_qeit_reset(&my_cq->ipz_queue); + + gal = my_cq->galpas.kernel; + cqx_fec = hipz_galpa_load(gal, CQTEMM_OFFSET(cqx_fec)); + ehca_dbg(device, "ehca_cq=%p cq_num=%x CQX_FEC=%llx", + my_cq, my_cq->cq_number, cqx_fec); + + my_cq->ib_cq.cqe = my_cq->nr_of_entries = + param.act_nr_of_entries - additional_cqe; + my_cq->cq_number = (my_cq->ipz_cq_handle.handle) & 0xffff; + + for (i = 0; i < QP_HASHTAB_LEN; i++) + INIT_HLIST_HEAD(&my_cq->qp_hashtab[i]); + + INIT_LIST_HEAD(&my_cq->sqp_err_list); + INIT_LIST_HEAD(&my_cq->rqp_err_list); + + if (context) { + struct ipz_queue *ipz_queue = &my_cq->ipz_queue; + struct ehca_create_cq_resp resp; + memset(&resp, 0, sizeof(resp)); + resp.cq_number = my_cq->cq_number; + resp.token = my_cq->token; + resp.ipz_queue.qe_size = ipz_queue->qe_size; + resp.ipz_queue.act_nr_of_sg = ipz_queue->act_nr_of_sg; + resp.ipz_queue.queue_length = ipz_queue->queue_length; + resp.ipz_queue.pagesize = ipz_queue->pagesize; + resp.ipz_queue.toggle_state = ipz_queue->toggle_state; + resp.fw_handle_ofs = (u32) + (my_cq->galpas.user.fw_handle & (PAGE_SIZE - 1)); + if (ib_copy_to_udata(udata, &resp, sizeof(resp))) { + ehca_err(device, "Copy to udata failed."); + cq = ERR_PTR(-EFAULT); + goto create_cq_exit4; + } + } + + return cq; + +create_cq_exit4: + ipz_queue_dtor(NULL, &my_cq->ipz_queue); + +create_cq_exit3: + h_ret = hipz_h_destroy_cq(adapter_handle, my_cq, 1); + if (h_ret != H_SUCCESS) + ehca_err(device, "hipz_h_destroy_cq() failed ehca_cq=%p " + "cq_num=%x h_ret=%lli", my_cq, my_cq->cq_number, h_ret); + +create_cq_exit2: + write_lock_irqsave(&ehca_cq_idr_lock, flags); + idr_remove(&ehca_cq_idr, my_cq->token); + write_unlock_irqrestore(&ehca_cq_idr_lock, flags); + +create_cq_exit1: + kmem_cache_free(cq_cache, my_cq); + + atomic_dec(&shca->num_cqs); + return cq; +} + +int ehca_destroy_cq(struct ib_cq *cq) +{ + u64 h_ret; + struct ehca_cq *my_cq = container_of(cq, struct ehca_cq, ib_cq); + int cq_num = my_cq->cq_number; + struct ib_device *device = cq->device; + struct ehca_shca *shca = container_of(device, struct ehca_shca, + ib_device); + struct ipz_adapter_handle adapter_handle = shca->ipz_hca_handle; + unsigned long flags; + + if (cq->uobject) { + if (my_cq->mm_count_galpa || my_cq->mm_count_queue) { + ehca_err(device, "Resources still referenced in " + "user space cq_num=%x", my_cq->cq_number); + return -EINVAL; + } + } + + /* + * remove the CQ from the idr first to make sure + * no more interrupt tasklets will touch this CQ + */ + write_lock_irqsave(&ehca_cq_idr_lock, flags); + idr_remove(&ehca_cq_idr, my_cq->token); + write_unlock_irqrestore(&ehca_cq_idr_lock, flags); + + /* now wait until all pending events have completed */ + wait_event(my_cq->wait_completion, !atomic_read(&my_cq->nr_events)); + + /* nobody's using our CQ any longer -- we can destroy it */ + h_ret = hipz_h_destroy_cq(adapter_handle, my_cq, 0); + if (h_ret == H_R_STATE) { + /* cq in err: read err data and destroy it forcibly */ + ehca_dbg(device, "ehca_cq=%p cq_num=%x resource=%llx in err " + "state. Try to delete it forcibly.", + my_cq, cq_num, my_cq->ipz_cq_handle.handle); + ehca_error_data(shca, my_cq, my_cq->ipz_cq_handle.handle); + h_ret = hipz_h_destroy_cq(adapter_handle, my_cq, 1); + if (h_ret == H_SUCCESS) + ehca_dbg(device, "cq_num=%x deleted successfully.", + cq_num); + } + if (h_ret != H_SUCCESS) { + ehca_err(device, "hipz_h_destroy_cq() failed h_ret=%lli " + "ehca_cq=%p cq_num=%x", h_ret, my_cq, cq_num); + return ehca2ib_return_code(h_ret); + } + ipz_queue_dtor(NULL, &my_cq->ipz_queue); + kmem_cache_free(cq_cache, my_cq); + + atomic_dec(&shca->num_cqs); + return 0; +} + +int ehca_resize_cq(struct ib_cq *cq, int cqe, struct ib_udata *udata) +{ + /* TODO: proper resize needs to be done */ + ehca_err(cq->device, "not implemented yet"); + + return -EFAULT; +} + +int ehca_init_cq_cache(void) +{ + cq_cache = kmem_cache_create("ehca_cache_cq", + sizeof(struct ehca_cq), 0, + SLAB_HWCACHE_ALIGN, + NULL); + if (!cq_cache) + return -ENOMEM; + return 0; +} + +void ehca_cleanup_cq_cache(void) +{ + if (cq_cache) + kmem_cache_destroy(cq_cache); +} diff --git a/drivers/infiniband/hw/ehca/ehca_eq.c b/drivers/infiniband/hw/ehca/ehca_eq.c new file mode 100644 index 0000000..90da674 --- /dev/null +++ b/drivers/infiniband/hw/ehca/ehca_eq.c @@ -0,0 +1,189 @@ +/* + * IBM eServer eHCA Infiniband device driver for Linux on POWER + * + * Event queue handling + * + * Authors: Waleri Fomin + * Khadija Souissi + * Reinhard Ernst + * Heiko J Schick + * Hoang-Nam Nguyen + * + * + * Copyright (c) 2005 IBM Corporation + * + * All rights reserved. + * + * This source code is distributed under a dual license of GPL v2.0 and OpenIB + * BSD. + * + * OpenIB BSD License + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "ehca_classes.h" +#include "ehca_irq.h" +#include "ehca_iverbs.h" +#include "ehca_qes.h" +#include "hcp_if.h" +#include "ipz_pt_fn.h" + +int ehca_create_eq(struct ehca_shca *shca, + struct ehca_eq *eq, + const enum ehca_eq_type type, const u32 length) +{ + int ret; + u64 h_ret; + u32 nr_pages; + u32 i; + void *vpage; + struct ib_device *ib_dev = &shca->ib_device; + + spin_lock_init(&eq->spinlock); + spin_lock_init(&eq->irq_spinlock); + eq->is_initialized = 0; + + if (type != EHCA_EQ && type != EHCA_NEQ) { + ehca_err(ib_dev, "Invalid EQ type %x. eq=%p", type, eq); + return -EINVAL; + } + if (!length) { + ehca_err(ib_dev, "EQ length must not be zero. eq=%p", eq); + return -EINVAL; + } + + h_ret = hipz_h_alloc_resource_eq(shca->ipz_hca_handle, + &eq->pf, + type, + length, + &eq->ipz_eq_handle, + &eq->length, + &nr_pages, &eq->ist); + + if (h_ret != H_SUCCESS) { + ehca_err(ib_dev, "Can't allocate EQ/NEQ. eq=%p", eq); + return -EINVAL; + } + + ret = ipz_queue_ctor(NULL, &eq->ipz_queue, nr_pages, + EHCA_PAGESIZE, sizeof(struct ehca_eqe), 0, 0); + if (!ret) { + ehca_err(ib_dev, "Can't allocate EQ pages eq=%p", eq); + goto create_eq_exit1; + } + + for (i = 0; i < nr_pages; i++) { + u64 rpage; + + vpage = ipz_qpageit_get_inc(&eq->ipz_queue); + if (!vpage) + goto create_eq_exit2; + + rpage = __pa(vpage); + h_ret = hipz_h_register_rpage_eq(shca->ipz_hca_handle, + eq->ipz_eq_handle, + &eq->pf, + 0, 0, rpage, 1); + + if (i == (nr_pages - 1)) { + /* last page */ + vpage = ipz_qpageit_get_inc(&eq->ipz_queue); + if (h_ret != H_SUCCESS || vpage) + goto create_eq_exit2; + } else { + if (h_ret != H_PAGE_REGISTERED) + goto create_eq_exit2; + } + } + + ipz_qeit_reset(&eq->ipz_queue); + + /* register interrupt handlers and initialize work queues */ + if (type == EHCA_EQ) { + tasklet_init(&eq->interrupt_task, ehca_tasklet_eq, (long)shca); + + ret = ibmebus_request_irq(eq->ist, ehca_interrupt_eq, + 0, "ehca_eq", + (void *)shca); + if (ret < 0) + ehca_err(ib_dev, "Can't map interrupt handler."); + } else if (type == EHCA_NEQ) { + tasklet_init(&eq->interrupt_task, ehca_tasklet_neq, (long)shca); + + ret = ibmebus_request_irq(eq->ist, ehca_interrupt_neq, + 0, "ehca_neq", + (void *)shca); + if (ret < 0) + ehca_err(ib_dev, "Can't map interrupt handler."); + } + + eq->is_initialized = 1; + + return 0; + +create_eq_exit2: + ipz_queue_dtor(NULL, &eq->ipz_queue); + +create_eq_exit1: + hipz_h_destroy_eq(shca->ipz_hca_handle, eq); + + return -EINVAL; +} + +void *ehca_poll_eq(struct ehca_shca *shca, struct ehca_eq *eq) +{ + unsigned long flags; + void *eqe; + + spin_lock_irqsave(&eq->spinlock, flags); + eqe = ipz_eqit_eq_get_inc_valid(&eq->ipz_queue); + spin_unlock_irqrestore(&eq->spinlock, flags); + + return eqe; +} + +int ehca_destroy_eq(struct ehca_shca *shca, struct ehca_eq *eq) +{ + unsigned long flags; + u64 h_ret; + + ibmebus_free_irq(eq->ist, (void *)shca); + + spin_lock_irqsave(&shca_list_lock, flags); + eq->is_initialized = 0; + spin_unlock_irqrestore(&shca_list_lock, flags); + + tasklet_kill(&eq->interrupt_task); + + h_ret = hipz_h_destroy_eq(shca->ipz_hca_handle, eq); + + if (h_ret != H_SUCCESS) { + ehca_err(&shca->ib_device, "Can't free EQ resources."); + return -EINVAL; + } + ipz_queue_dtor(NULL, &eq->ipz_queue); + + return 0; +} diff --git a/drivers/infiniband/hw/ehca/ehca_hca.c b/drivers/infiniband/hw/ehca/ehca_hca.c new file mode 100644 index 0000000..9ed4d25 --- /dev/null +++ b/drivers/infiniband/hw/ehca/ehca_hca.c @@ -0,0 +1,410 @@ +/* + * IBM eServer eHCA Infiniband device driver for Linux on POWER + * + * HCA query functions + * + * Authors: Heiko J Schick + * Christoph Raisch + * + * Copyright (c) 2005 IBM Corporation + * + * All rights reserved. + * + * This source code is distributed under a dual license of GPL v2.0 and OpenIB + * BSD. + * + * OpenIB BSD License + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include + +#include "ehca_tools.h" +#include "ehca_iverbs.h" +#include "hcp_if.h" + +static unsigned int limit_uint(unsigned int value) +{ + return min_t(unsigned int, value, INT_MAX); +} + +int ehca_query_device(struct ib_device *ibdev, struct ib_device_attr *props) +{ + int i, ret = 0; + struct ehca_shca *shca = container_of(ibdev, struct ehca_shca, + ib_device); + struct hipz_query_hca *rblock; + + static const u32 cap_mapping[] = { + IB_DEVICE_RESIZE_MAX_WR, HCA_CAP_WQE_RESIZE, + IB_DEVICE_BAD_PKEY_CNTR, HCA_CAP_BAD_P_KEY_CTR, + IB_DEVICE_BAD_QKEY_CNTR, HCA_CAP_Q_KEY_VIOL_CTR, + IB_DEVICE_RAW_MULTI, HCA_CAP_RAW_PACKET_MCAST, + IB_DEVICE_AUTO_PATH_MIG, HCA_CAP_AUTO_PATH_MIG, + IB_DEVICE_CHANGE_PHY_PORT, HCA_CAP_SQD_RTS_PORT_CHANGE, + IB_DEVICE_UD_AV_PORT_ENFORCE, HCA_CAP_AH_PORT_NR_CHECK, + IB_DEVICE_CURR_QP_STATE_MOD, HCA_CAP_CUR_QP_STATE_MOD, + IB_DEVICE_SHUTDOWN_PORT, HCA_CAP_SHUTDOWN_PORT, + IB_DEVICE_INIT_TYPE, HCA_CAP_INIT_TYPE, + IB_DEVICE_PORT_ACTIVE_EVENT, HCA_CAP_PORT_ACTIVE_EVENT, + }; + + rblock = ehca_alloc_fw_ctrlblock(GFP_KERNEL); + if (!rblock) { + ehca_err(&shca->ib_device, "Can't allocate rblock memory."); + return -ENOMEM; + } + + if (hipz_h_query_hca(shca->ipz_hca_handle, rblock) != H_SUCCESS) { + ehca_err(&shca->ib_device, "Can't query device properties"); + ret = -EINVAL; + goto query_device1; + } + + memset(props, 0, sizeof(struct ib_device_attr)); + props->page_size_cap = shca->hca_cap_mr_pgsize; + props->fw_ver = rblock->hw_ver; + props->max_mr_size = rblock->max_mr_size; + props->vendor_id = rblock->vendor_id >> 8; + props->vendor_part_id = rblock->vendor_part_id >> 16; + props->hw_ver = rblock->hw_ver; + props->max_qp = limit_uint(rblock->max_qp); + props->max_qp_wr = limit_uint(rblock->max_wqes_wq); + props->max_sge = limit_uint(rblock->max_sge); + props->max_sge_rd = limit_uint(rblock->max_sge_rd); + props->max_cq = limit_uint(rblock->max_cq); + props->max_cqe = limit_uint(rblock->max_cqe); + props->max_mr = limit_uint(rblock->max_mr); + props->max_mw = limit_uint(rblock->max_mw); + props->max_pd = limit_uint(rblock->max_pd); + props->max_ah = limit_uint(rblock->max_ah); + props->max_ee = limit_uint(rblock->max_rd_ee_context); + props->max_rdd = limit_uint(rblock->max_rd_domain); + props->max_fmr = limit_uint(rblock->max_mr); + props->max_qp_rd_atom = limit_uint(rblock->max_rr_qp); + props->max_ee_rd_atom = limit_uint(rblock->max_rr_ee_context); + props->max_res_rd_atom = limit_uint(rblock->max_rr_hca); + props->max_qp_init_rd_atom = limit_uint(rblock->max_act_wqs_qp); + props->max_ee_init_rd_atom = limit_uint(rblock->max_act_wqs_ee_context); + + if (EHCA_BMASK_GET(HCA_CAP_SRQ, shca->hca_cap)) { + props->max_srq = limit_uint(props->max_qp); + props->max_srq_wr = limit_uint(props->max_qp_wr); + props->max_srq_sge = 3; + } + + props->max_pkeys = 16; + /* Some FW versions say 0 here; insert sensible value in that case */ + props->local_ca_ack_delay = rblock->local_ca_ack_delay ? + min_t(u8, rblock->local_ca_ack_delay, 255) : 12; + props->max_raw_ipv6_qp = limit_uint(rblock->max_raw_ipv6_qp); + props->max_raw_ethy_qp = limit_uint(rblock->max_raw_ethy_qp); + props->max_mcast_grp = limit_uint(rblock->max_mcast_grp); + props->max_mcast_qp_attach = limit_uint(rblock->max_mcast_qp_attach); + props->max_total_mcast_qp_attach + = limit_uint(rblock->max_total_mcast_qp_attach); + + /* translate device capabilities */ + props->device_cap_flags = IB_DEVICE_SYS_IMAGE_GUID | + IB_DEVICE_RC_RNR_NAK_GEN | IB_DEVICE_N_NOTIFY_CQ; + for (i = 0; i < ARRAY_SIZE(cap_mapping); i += 2) + if (rblock->hca_cap_indicators & cap_mapping[i + 1]) + props->device_cap_flags |= cap_mapping[i]; + +query_device1: + ehca_free_fw_ctrlblock(rblock); + + return ret; +} + +static enum ib_mtu map_mtu(struct ehca_shca *shca, u32 fw_mtu) +{ + switch (fw_mtu) { + case 0x1: + return IB_MTU_256; + case 0x2: + return IB_MTU_512; + case 0x3: + return IB_MTU_1024; + case 0x4: + return IB_MTU_2048; + case 0x5: + return IB_MTU_4096; + default: + ehca_err(&shca->ib_device, "Unknown MTU size: %x.", + fw_mtu); + return 0; + } +} + +static u8 map_number_of_vls(struct ehca_shca *shca, u32 vl_cap) +{ + switch (vl_cap) { + case 0x1: + return 1; + case 0x2: + return 2; + case 0x3: + return 4; + case 0x4: + return 8; + case 0x5: + return 15; + default: + ehca_err(&shca->ib_device, "invalid Vl Capability: %x.", + vl_cap); + return 0; + } +} + +int ehca_query_port(struct ib_device *ibdev, + u8 port, struct ib_port_attr *props) +{ + int ret = 0; + u64 h_ret; + struct ehca_shca *shca = container_of(ibdev, struct ehca_shca, + ib_device); + struct hipz_query_port *rblock; + + rblock = ehca_alloc_fw_ctrlblock(GFP_KERNEL); + if (!rblock) { + ehca_err(&shca->ib_device, "Can't allocate rblock memory."); + return -ENOMEM; + } + + h_ret = hipz_h_query_port(shca->ipz_hca_handle, port, rblock); + if (h_ret != H_SUCCESS) { + ehca_err(&shca->ib_device, "Can't query port properties"); + ret = -EINVAL; + goto query_port1; + } + + memset(props, 0, sizeof(struct ib_port_attr)); + + props->active_mtu = props->max_mtu = map_mtu(shca, rblock->max_mtu); + props->port_cap_flags = rblock->capability_mask; + props->gid_tbl_len = rblock->gid_tbl_len; + if (rblock->max_msg_sz) + props->max_msg_sz = rblock->max_msg_sz; + else + props->max_msg_sz = 0x1 << 31; + props->bad_pkey_cntr = rblock->bad_pkey_cntr; + props->qkey_viol_cntr = rblock->qkey_viol_cntr; + props->pkey_tbl_len = rblock->pkey_tbl_len; + props->lid = rblock->lid; + props->sm_lid = rblock->sm_lid; + props->lmc = rblock->lmc; + props->sm_sl = rblock->sm_sl; + props->subnet_timeout = rblock->subnet_timeout; + props->init_type_reply = rblock->init_type_reply; + props->max_vl_num = map_number_of_vls(shca, rblock->vl_cap); + + if (rblock->state && rblock->phys_width) { + props->phys_state = rblock->phys_pstate; + props->state = rblock->phys_state; + props->active_width = rblock->phys_width; + props->active_speed = rblock->phys_speed; + } else { + /* old firmware releases don't report physical + * port info, so use default values + */ + props->phys_state = 5; + props->state = rblock->state; + props->active_width = IB_WIDTH_12X; + props->active_speed = IB_SPEED_SDR; + } + +query_port1: + ehca_free_fw_ctrlblock(rblock); + + return ret; +} + +int ehca_query_sma_attr(struct ehca_shca *shca, + u8 port, struct ehca_sma_attr *attr) +{ + int ret = 0; + u64 h_ret; + struct hipz_query_port *rblock; + + rblock = ehca_alloc_fw_ctrlblock(GFP_ATOMIC); + if (!rblock) { + ehca_err(&shca->ib_device, "Can't allocate rblock memory."); + return -ENOMEM; + } + + h_ret = hipz_h_query_port(shca->ipz_hca_handle, port, rblock); + if (h_ret != H_SUCCESS) { + ehca_err(&shca->ib_device, "Can't query port properties"); + ret = -EINVAL; + goto query_sma_attr1; + } + + memset(attr, 0, sizeof(struct ehca_sma_attr)); + + attr->lid = rblock->lid; + attr->lmc = rblock->lmc; + attr->sm_sl = rblock->sm_sl; + attr->sm_lid = rblock->sm_lid; + + attr->pkey_tbl_len = rblock->pkey_tbl_len; + memcpy(attr->pkeys, rblock->pkey_entries, sizeof(attr->pkeys)); + +query_sma_attr1: + ehca_free_fw_ctrlblock(rblock); + + return ret; +} + +int ehca_query_pkey(struct ib_device *ibdev, u8 port, u16 index, u16 *pkey) +{ + int ret = 0; + u64 h_ret; + struct ehca_shca *shca; + struct hipz_query_port *rblock; + + shca = container_of(ibdev, struct ehca_shca, ib_device); + if (index > 16) { + ehca_err(&shca->ib_device, "Invalid index: %x.", index); + return -EINVAL; + } + + rblock = ehca_alloc_fw_ctrlblock(GFP_KERNEL); + if (!rblock) { + ehca_err(&shca->ib_device, "Can't allocate rblock memory."); + return -ENOMEM; + } + + h_ret = hipz_h_query_port(shca->ipz_hca_handle, port, rblock); + if (h_ret != H_SUCCESS) { + ehca_err(&shca->ib_device, "Can't query port properties"); + ret = -EINVAL; + goto query_pkey1; + } + + memcpy(pkey, &rblock->pkey_entries + index, sizeof(u16)); + +query_pkey1: + ehca_free_fw_ctrlblock(rblock); + + return ret; +} + +int ehca_query_gid(struct ib_device *ibdev, u8 port, + int index, union ib_gid *gid) +{ + int ret = 0; + u64 h_ret; + struct ehca_shca *shca = container_of(ibdev, struct ehca_shca, + ib_device); + struct hipz_query_port *rblock; + + if (index < 0 || index > 255) { + ehca_err(&shca->ib_device, "Invalid index: %x.", index); + return -EINVAL; + } + + rblock = ehca_alloc_fw_ctrlblock(GFP_KERNEL); + if (!rblock) { + ehca_err(&shca->ib_device, "Can't allocate rblock memory."); + return -ENOMEM; + } + + h_ret = hipz_h_query_port(shca->ipz_hca_handle, port, rblock); + if (h_ret != H_SUCCESS) { + ehca_err(&shca->ib_device, "Can't query port properties"); + ret = -EINVAL; + goto query_gid1; + } + + memcpy(&gid->raw[0], &rblock->gid_prefix, sizeof(u64)); + memcpy(&gid->raw[8], &rblock->guid_entries[index], sizeof(u64)); + +query_gid1: + ehca_free_fw_ctrlblock(rblock); + + return ret; +} + +static const u32 allowed_port_caps = ( + IB_PORT_SM | IB_PORT_LED_INFO_SUP | IB_PORT_CM_SUP | + IB_PORT_SNMP_TUNNEL_SUP | IB_PORT_DEVICE_MGMT_SUP | + IB_PORT_VENDOR_CLASS_SUP); + +int ehca_modify_port(struct ib_device *ibdev, + u8 port, int port_modify_mask, + struct ib_port_modify *props) +{ + int ret = 0; + struct ehca_shca *shca; + struct hipz_query_port *rblock; + u32 cap; + u64 hret; + + shca = container_of(ibdev, struct ehca_shca, ib_device); + if ((props->set_port_cap_mask | props->clr_port_cap_mask) + & ~allowed_port_caps) { + ehca_err(&shca->ib_device, "Non-changeable bits set in masks " + "set=%x clr=%x allowed=%x", props->set_port_cap_mask, + props->clr_port_cap_mask, allowed_port_caps); + return -EINVAL; + } + + if (mutex_lock_interruptible(&shca->modify_mutex)) + return -ERESTARTSYS; + + rblock = ehca_alloc_fw_ctrlblock(GFP_KERNEL); + if (!rblock) { + ehca_err(&shca->ib_device, "Can't allocate rblock memory."); + ret = -ENOMEM; + goto modify_port1; + } + + hret = hipz_h_query_port(shca->ipz_hca_handle, port, rblock); + if (hret != H_SUCCESS) { + ehca_err(&shca->ib_device, "Can't query port properties"); + ret = -EINVAL; + goto modify_port2; + } + + cap = (rblock->capability_mask | props->set_port_cap_mask) + & ~props->clr_port_cap_mask; + + hret = hipz_h_modify_port(shca->ipz_hca_handle, port, + cap, props->init_type, port_modify_mask); + if (hret != H_SUCCESS) { + ehca_err(&shca->ib_device, "Modify port failed h_ret=%lli", + hret); + ret = -EINVAL; + } + +modify_port2: + ehca_free_fw_ctrlblock(rblock); + +modify_port1: + mutex_unlock(&shca->modify_mutex); + + return ret; +} diff --git a/drivers/infiniband/hw/ehca/ehca_irq.c b/drivers/infiniband/hw/ehca/ehca_irq.c new file mode 100644 index 0000000..8615d7c --- /dev/null +++ b/drivers/infiniband/hw/ehca/ehca_irq.c @@ -0,0 +1,870 @@ +/* + * IBM eServer eHCA Infiniband device driver for Linux on POWER + * + * Functions for EQs, NEQs and interrupts + * + * Authors: Heiko J Schick + * Khadija Souissi + * Hoang-Nam Nguyen + * Joachim Fenkes + * + * Copyright (c) 2005 IBM Corporation + * + * All rights reserved. + * + * This source code is distributed under a dual license of GPL v2.0 and OpenIB + * BSD. + * + * OpenIB BSD License + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include + +#include "ehca_classes.h" +#include "ehca_irq.h" +#include "ehca_iverbs.h" +#include "ehca_tools.h" +#include "hcp_if.h" +#include "hipz_fns.h" +#include "ipz_pt_fn.h" + +#define EQE_COMPLETION_EVENT EHCA_BMASK_IBM( 1, 1) +#define EQE_CQ_QP_NUMBER EHCA_BMASK_IBM( 8, 31) +#define EQE_EE_IDENTIFIER EHCA_BMASK_IBM( 2, 7) +#define EQE_CQ_NUMBER EHCA_BMASK_IBM( 8, 31) +#define EQE_QP_NUMBER EHCA_BMASK_IBM( 8, 31) +#define EQE_QP_TOKEN EHCA_BMASK_IBM(32, 63) +#define EQE_CQ_TOKEN EHCA_BMASK_IBM(32, 63) + +#define NEQE_COMPLETION_EVENT EHCA_BMASK_IBM( 1, 1) +#define NEQE_EVENT_CODE EHCA_BMASK_IBM( 2, 7) +#define NEQE_PORT_NUMBER EHCA_BMASK_IBM( 8, 15) +#define NEQE_PORT_AVAILABILITY EHCA_BMASK_IBM(16, 16) +#define NEQE_DISRUPTIVE EHCA_BMASK_IBM(16, 16) +#define NEQE_SPECIFIC_EVENT EHCA_BMASK_IBM(16, 23) + +#define ERROR_DATA_LENGTH EHCA_BMASK_IBM(52, 63) +#define ERROR_DATA_TYPE EHCA_BMASK_IBM( 0, 7) + +static void queue_comp_task(struct ehca_cq *__cq); + +static struct ehca_comp_pool *pool; + +static inline void comp_event_callback(struct ehca_cq *cq) +{ + if (!cq->ib_cq.comp_handler) + return; + + spin_lock(&cq->cb_lock); + cq->ib_cq.comp_handler(&cq->ib_cq, cq->ib_cq.cq_context); + spin_unlock(&cq->cb_lock); + + return; +} + +static void print_error_data(struct ehca_shca *shca, void *data, + u64 *rblock, int length) +{ + u64 type = EHCA_BMASK_GET(ERROR_DATA_TYPE, rblock[2]); + u64 resource = rblock[1]; + + switch (type) { + case 0x1: /* Queue Pair */ + { + struct ehca_qp *qp = (struct ehca_qp *)data; + + /* only print error data if AER is set */ + if (rblock[6] == 0) + return; + + ehca_err(&shca->ib_device, + "QP 0x%x (resource=%llx) has errors.", + qp->ib_qp.qp_num, resource); + break; + } + case 0x4: /* Completion Queue */ + { + struct ehca_cq *cq = (struct ehca_cq *)data; + + ehca_err(&shca->ib_device, + "CQ 0x%x (resource=%llx) has errors.", + cq->cq_number, resource); + break; + } + default: + ehca_err(&shca->ib_device, + "Unknown error type: %llx on %s.", + type, shca->ib_device.name); + break; + } + + ehca_err(&shca->ib_device, "Error data is available: %llx.", resource); + ehca_err(&shca->ib_device, "EHCA ----- error data begin " + "---------------------------------------------------"); + ehca_dmp(rblock, length, "resource=%llx", resource); + ehca_err(&shca->ib_device, "EHCA ----- error data end " + "----------------------------------------------------"); + + return; +} + +int ehca_error_data(struct ehca_shca *shca, void *data, + u64 resource) +{ + + unsigned long ret; + u64 *rblock; + unsigned long block_count; + + rblock = ehca_alloc_fw_ctrlblock(GFP_ATOMIC); + if (!rblock) { + ehca_err(&shca->ib_device, "Cannot allocate rblock memory."); + ret = -ENOMEM; + goto error_data1; + } + + /* rblock must be 4K aligned and should be 4K large */ + ret = hipz_h_error_data(shca->ipz_hca_handle, + resource, + rblock, + &block_count); + + if (ret == H_R_STATE) + ehca_err(&shca->ib_device, + "No error data is available: %llx.", resource); + else if (ret == H_SUCCESS) { + int length; + + length = EHCA_BMASK_GET(ERROR_DATA_LENGTH, rblock[0]); + + if (length > EHCA_PAGESIZE) + length = EHCA_PAGESIZE; + + print_error_data(shca, data, rblock, length); + } else + ehca_err(&shca->ib_device, + "Error data could not be fetched: %llx", resource); + + ehca_free_fw_ctrlblock(rblock); + +error_data1: + return ret; + +} + +static void dispatch_qp_event(struct ehca_shca *shca, struct ehca_qp *qp, + enum ib_event_type event_type) +{ + struct ib_event event; + + /* PATH_MIG without the QP ever having been armed is false alarm */ + if (event_type == IB_EVENT_PATH_MIG && !qp->mig_armed) + return; + + event.device = &shca->ib_device; + event.event = event_type; + + if (qp->ext_type == EQPT_SRQ) { + if (!qp->ib_srq.event_handler) + return; + + event.element.srq = &qp->ib_srq; + qp->ib_srq.event_handler(&event, qp->ib_srq.srq_context); + } else { + if (!qp->ib_qp.event_handler) + return; + + event.element.qp = &qp->ib_qp; + qp->ib_qp.event_handler(&event, qp->ib_qp.qp_context); + } +} + +static void qp_event_callback(struct ehca_shca *shca, u64 eqe, + enum ib_event_type event_type, int fatal) +{ + struct ehca_qp *qp; + u32 token = EHCA_BMASK_GET(EQE_QP_TOKEN, eqe); + + read_lock(&ehca_qp_idr_lock); + qp = idr_find(&ehca_qp_idr, token); + if (qp) + atomic_inc(&qp->nr_events); + read_unlock(&ehca_qp_idr_lock); + + if (!qp) + return; + + if (fatal) + ehca_error_data(shca, qp, qp->ipz_qp_handle.handle); + + dispatch_qp_event(shca, qp, fatal && qp->ext_type == EQPT_SRQ ? + IB_EVENT_SRQ_ERR : event_type); + + /* + * eHCA only processes one WQE at a time for SRQ base QPs, + * so the last WQE has been processed as soon as the QP enters + * error state. + */ + if (fatal && qp->ext_type == EQPT_SRQBASE) + dispatch_qp_event(shca, qp, IB_EVENT_QP_LAST_WQE_REACHED); + + if (atomic_dec_and_test(&qp->nr_events)) + wake_up(&qp->wait_completion); + return; +} + +static void cq_event_callback(struct ehca_shca *shca, + u64 eqe) +{ + struct ehca_cq *cq; + u32 token = EHCA_BMASK_GET(EQE_CQ_TOKEN, eqe); + + read_lock(&ehca_cq_idr_lock); + cq = idr_find(&ehca_cq_idr, token); + if (cq) + atomic_inc(&cq->nr_events); + read_unlock(&ehca_cq_idr_lock); + + if (!cq) + return; + + ehca_error_data(shca, cq, cq->ipz_cq_handle.handle); + + if (atomic_dec_and_test(&cq->nr_events)) + wake_up(&cq->wait_completion); + + return; +} + +static void parse_identifier(struct ehca_shca *shca, u64 eqe) +{ + u8 identifier = EHCA_BMASK_GET(EQE_EE_IDENTIFIER, eqe); + + switch (identifier) { + case 0x02: /* path migrated */ + qp_event_callback(shca, eqe, IB_EVENT_PATH_MIG, 0); + break; + case 0x03: /* communication established */ + qp_event_callback(shca, eqe, IB_EVENT_COMM_EST, 0); + break; + case 0x04: /* send queue drained */ + qp_event_callback(shca, eqe, IB_EVENT_SQ_DRAINED, 0); + break; + case 0x05: /* QP error */ + case 0x06: /* QP error */ + qp_event_callback(shca, eqe, IB_EVENT_QP_FATAL, 1); + break; + case 0x07: /* CQ error */ + case 0x08: /* CQ error */ + cq_event_callback(shca, eqe); + break; + case 0x09: /* MRMWPTE error */ + ehca_err(&shca->ib_device, "MRMWPTE error."); + break; + case 0x0A: /* port event */ + ehca_err(&shca->ib_device, "Port event."); + break; + case 0x0B: /* MR access error */ + ehca_err(&shca->ib_device, "MR access error."); + break; + case 0x0C: /* EQ error */ + ehca_err(&shca->ib_device, "EQ error."); + break; + case 0x0D: /* P/Q_Key mismatch */ + ehca_err(&shca->ib_device, "P/Q_Key mismatch."); + break; + case 0x10: /* sampling complete */ + ehca_err(&shca->ib_device, "Sampling complete."); + break; + case 0x11: /* unaffiliated access error */ + ehca_err(&shca->ib_device, "Unaffiliated access error."); + break; + case 0x12: /* path migrating */ + ehca_err(&shca->ib_device, "Path migrating."); + break; + case 0x13: /* interface trace stopped */ + ehca_err(&shca->ib_device, "Interface trace stopped."); + break; + case 0x14: /* first error capture info available */ + ehca_info(&shca->ib_device, "First error capture available"); + break; + case 0x15: /* SRQ limit reached */ + qp_event_callback(shca, eqe, IB_EVENT_SRQ_LIMIT_REACHED, 0); + break; + default: + ehca_err(&shca->ib_device, "Unknown identifier: %x on %s.", + identifier, shca->ib_device.name); + break; + } + + return; +} + +static void dispatch_port_event(struct ehca_shca *shca, int port_num, + enum ib_event_type type, const char *msg) +{ + struct ib_event event; + + ehca_info(&shca->ib_device, "port %d %s.", port_num, msg); + event.device = &shca->ib_device; + event.event = type; + event.element.port_num = port_num; + ib_dispatch_event(&event); +} + +static void notify_port_conf_change(struct ehca_shca *shca, int port_num) +{ + struct ehca_sma_attr new_attr; + struct ehca_sma_attr *old_attr = &shca->sport[port_num - 1].saved_attr; + + ehca_query_sma_attr(shca, port_num, &new_attr); + + if (new_attr.sm_sl != old_attr->sm_sl || + new_attr.sm_lid != old_attr->sm_lid) + dispatch_port_event(shca, port_num, IB_EVENT_SM_CHANGE, + "SM changed"); + + if (new_attr.lid != old_attr->lid || + new_attr.lmc != old_attr->lmc) + dispatch_port_event(shca, port_num, IB_EVENT_LID_CHANGE, + "LID changed"); + + if (new_attr.pkey_tbl_len != old_attr->pkey_tbl_len || + memcmp(new_attr.pkeys, old_attr->pkeys, + sizeof(u16) * new_attr.pkey_tbl_len)) + dispatch_port_event(shca, port_num, IB_EVENT_PKEY_CHANGE, + "P_Key changed"); + + *old_attr = new_attr; +} + +/* replay modify_qp for sqps -- return 0 if all is well, 1 if AQP1 destroyed */ +static int replay_modify_qp(struct ehca_sport *sport) +{ + int aqp1_destroyed; + unsigned long flags; + + spin_lock_irqsave(&sport->mod_sqp_lock, flags); + + aqp1_destroyed = !sport->ibqp_sqp[IB_QPT_GSI]; + + if (sport->ibqp_sqp[IB_QPT_SMI]) + ehca_recover_sqp(sport->ibqp_sqp[IB_QPT_SMI]); + if (!aqp1_destroyed) + ehca_recover_sqp(sport->ibqp_sqp[IB_QPT_GSI]); + + spin_unlock_irqrestore(&sport->mod_sqp_lock, flags); + + return aqp1_destroyed; +} + +static void parse_ec(struct ehca_shca *shca, u64 eqe) +{ + u8 ec = EHCA_BMASK_GET(NEQE_EVENT_CODE, eqe); + u8 port = EHCA_BMASK_GET(NEQE_PORT_NUMBER, eqe); + u8 spec_event; + struct ehca_sport *sport = &shca->sport[port - 1]; + + switch (ec) { + case 0x30: /* port availability change */ + if (EHCA_BMASK_GET(NEQE_PORT_AVAILABILITY, eqe)) { + /* only replay modify_qp calls in autodetect mode; + * if AQP1 was destroyed, the port is already down + * again and we can drop the event. + */ + if (ehca_nr_ports < 0) + if (replay_modify_qp(sport)) + break; + + sport->port_state = IB_PORT_ACTIVE; + dispatch_port_event(shca, port, IB_EVENT_PORT_ACTIVE, + "is active"); + ehca_query_sma_attr(shca, port, &sport->saved_attr); + } else { + sport->port_state = IB_PORT_DOWN; + dispatch_port_event(shca, port, IB_EVENT_PORT_ERR, + "is inactive"); + } + break; + case 0x31: + /* port configuration change + * disruptive change is caused by + * LID, PKEY or SM change + */ + if (EHCA_BMASK_GET(NEQE_DISRUPTIVE, eqe)) { + ehca_warn(&shca->ib_device, "disruptive port " + "%d configuration change", port); + + sport->port_state = IB_PORT_DOWN; + dispatch_port_event(shca, port, IB_EVENT_PORT_ERR, + "is inactive"); + + sport->port_state = IB_PORT_ACTIVE; + dispatch_port_event(shca, port, IB_EVENT_PORT_ACTIVE, + "is active"); + ehca_query_sma_attr(shca, port, + &sport->saved_attr); + } else + notify_port_conf_change(shca, port); + break; + case 0x32: /* adapter malfunction */ + ehca_err(&shca->ib_device, "Adapter malfunction."); + break; + case 0x33: /* trace stopped */ + ehca_err(&shca->ib_device, "Traced stopped."); + break; + case 0x34: /* util async event */ + spec_event = EHCA_BMASK_GET(NEQE_SPECIFIC_EVENT, eqe); + if (spec_event == 0x80) /* client reregister required */ + dispatch_port_event(shca, port, + IB_EVENT_CLIENT_REREGISTER, + "client reregister req."); + else + ehca_warn(&shca->ib_device, "Unknown util async " + "event %x on port %x", spec_event, port); + break; + default: + ehca_err(&shca->ib_device, "Unknown event code: %x on %s.", + ec, shca->ib_device.name); + break; + } + + return; +} + +static inline void reset_eq_pending(struct ehca_cq *cq) +{ + u64 CQx_EP; + struct h_galpa gal = cq->galpas.kernel; + + hipz_galpa_store_cq(gal, cqx_ep, 0x0); + CQx_EP = hipz_galpa_load(gal, CQTEMM_OFFSET(cqx_ep)); + + return; +} + +irqreturn_t ehca_interrupt_neq(int irq, void *dev_id) +{ + struct ehca_shca *shca = (struct ehca_shca*)dev_id; + + tasklet_hi_schedule(&shca->neq.interrupt_task); + + return IRQ_HANDLED; +} + +void ehca_tasklet_neq(unsigned long data) +{ + struct ehca_shca *shca = (struct ehca_shca*)data; + struct ehca_eqe *eqe; + u64 ret; + + eqe = ehca_poll_eq(shca, &shca->neq); + + while (eqe) { + if (!EHCA_BMASK_GET(NEQE_COMPLETION_EVENT, eqe->entry)) + parse_ec(shca, eqe->entry); + + eqe = ehca_poll_eq(shca, &shca->neq); + } + + ret = hipz_h_reset_event(shca->ipz_hca_handle, + shca->neq.ipz_eq_handle, 0xFFFFFFFFFFFFFFFFL); + + if (ret != H_SUCCESS) + ehca_err(&shca->ib_device, "Can't clear notification events."); + + return; +} + +irqreturn_t ehca_interrupt_eq(int irq, void *dev_id) +{ + struct ehca_shca *shca = (struct ehca_shca*)dev_id; + + tasklet_hi_schedule(&shca->eq.interrupt_task); + + return IRQ_HANDLED; +} + + +static inline void process_eqe(struct ehca_shca *shca, struct ehca_eqe *eqe) +{ + u64 eqe_value; + u32 token; + struct ehca_cq *cq; + + eqe_value = eqe->entry; + ehca_dbg(&shca->ib_device, "eqe_value=%llx", eqe_value); + if (EHCA_BMASK_GET(EQE_COMPLETION_EVENT, eqe_value)) { + ehca_dbg(&shca->ib_device, "Got completion event"); + token = EHCA_BMASK_GET(EQE_CQ_TOKEN, eqe_value); + read_lock(&ehca_cq_idr_lock); + cq = idr_find(&ehca_cq_idr, token); + if (cq) + atomic_inc(&cq->nr_events); + read_unlock(&ehca_cq_idr_lock); + if (cq == NULL) { + ehca_err(&shca->ib_device, + "Invalid eqe for non-existing cq token=%x", + token); + return; + } + reset_eq_pending(cq); + if (ehca_scaling_code) + queue_comp_task(cq); + else { + comp_event_callback(cq); + if (atomic_dec_and_test(&cq->nr_events)) + wake_up(&cq->wait_completion); + } + } else { + ehca_dbg(&shca->ib_device, "Got non completion event"); + parse_identifier(shca, eqe_value); + } +} + +void ehca_process_eq(struct ehca_shca *shca, int is_irq) +{ + struct ehca_eq *eq = &shca->eq; + struct ehca_eqe_cache_entry *eqe_cache = eq->eqe_cache; + u64 eqe_value, ret; + int eqe_cnt, i; + int eq_empty = 0; + + spin_lock(&eq->irq_spinlock); + if (is_irq) { + const int max_query_cnt = 100; + int query_cnt = 0; + int int_state = 1; + do { + int_state = hipz_h_query_int_state( + shca->ipz_hca_handle, eq->ist); + query_cnt++; + iosync(); + } while (int_state && query_cnt < max_query_cnt); + if (unlikely((query_cnt == max_query_cnt))) + ehca_dbg(&shca->ib_device, "int_state=%x query_cnt=%x", + int_state, query_cnt); + } + + /* read out all eqes */ + eqe_cnt = 0; + do { + u32 token; + eqe_cache[eqe_cnt].eqe = ehca_poll_eq(shca, eq); + if (!eqe_cache[eqe_cnt].eqe) + break; + eqe_value = eqe_cache[eqe_cnt].eqe->entry; + if (EHCA_BMASK_GET(EQE_COMPLETION_EVENT, eqe_value)) { + token = EHCA_BMASK_GET(EQE_CQ_TOKEN, eqe_value); + read_lock(&ehca_cq_idr_lock); + eqe_cache[eqe_cnt].cq = idr_find(&ehca_cq_idr, token); + if (eqe_cache[eqe_cnt].cq) + atomic_inc(&eqe_cache[eqe_cnt].cq->nr_events); + read_unlock(&ehca_cq_idr_lock); + if (!eqe_cache[eqe_cnt].cq) { + ehca_err(&shca->ib_device, + "Invalid eqe for non-existing cq " + "token=%x", token); + continue; + } + } else + eqe_cache[eqe_cnt].cq = NULL; + eqe_cnt++; + } while (eqe_cnt < EHCA_EQE_CACHE_SIZE); + if (!eqe_cnt) { + if (is_irq) + ehca_dbg(&shca->ib_device, + "No eqe found for irq event"); + goto unlock_irq_spinlock; + } else if (!is_irq) { + ret = hipz_h_eoi(eq->ist); + if (ret != H_SUCCESS) + ehca_err(&shca->ib_device, + "bad return code EOI -rc = %lld\n", ret); + ehca_dbg(&shca->ib_device, "deadman found %x eqe", eqe_cnt); + } + if (unlikely(eqe_cnt == EHCA_EQE_CACHE_SIZE)) + ehca_dbg(&shca->ib_device, "too many eqes for one irq event"); + /* enable irq for new packets */ + for (i = 0; i < eqe_cnt; i++) { + if (eq->eqe_cache[i].cq) + reset_eq_pending(eq->eqe_cache[i].cq); + } + /* check eq */ + spin_lock(&eq->spinlock); + eq_empty = (!ipz_eqit_eq_peek_valid(&shca->eq.ipz_queue)); + spin_unlock(&eq->spinlock); + /* call completion handler for cached eqes */ + for (i = 0; i < eqe_cnt; i++) + if (eq->eqe_cache[i].cq) { + if (ehca_scaling_code) + queue_comp_task(eq->eqe_cache[i].cq); + else { + struct ehca_cq *cq = eq->eqe_cache[i].cq; + comp_event_callback(cq); + if (atomic_dec_and_test(&cq->nr_events)) + wake_up(&cq->wait_completion); + } + } else { + ehca_dbg(&shca->ib_device, "Got non completion event"); + parse_identifier(shca, eq->eqe_cache[i].eqe->entry); + } + /* poll eq if not empty */ + if (eq_empty) + goto unlock_irq_spinlock; + do { + struct ehca_eqe *eqe; + eqe = ehca_poll_eq(shca, &shca->eq); + if (!eqe) + break; + process_eqe(shca, eqe); + } while (1); + +unlock_irq_spinlock: + spin_unlock(&eq->irq_spinlock); +} + +void ehca_tasklet_eq(unsigned long data) +{ + ehca_process_eq((struct ehca_shca*)data, 1); +} + +static int find_next_online_cpu(struct ehca_comp_pool *pool) +{ + int cpu; + unsigned long flags; + + WARN_ON_ONCE(!in_interrupt()); + if (ehca_debug_level >= 3) + ehca_dmp(cpu_online_mask, cpumask_size(), ""); + + spin_lock_irqsave(&pool->last_cpu_lock, flags); + do { + cpu = cpumask_next(pool->last_cpu, cpu_online_mask); + if (cpu >= nr_cpu_ids) + cpu = cpumask_first(cpu_online_mask); + pool->last_cpu = cpu; + } while (!per_cpu_ptr(pool->cpu_comp_tasks, cpu)->active); + spin_unlock_irqrestore(&pool->last_cpu_lock, flags); + + return cpu; +} + +static void __queue_comp_task(struct ehca_cq *__cq, + struct ehca_cpu_comp_task *cct, + struct task_struct *thread) +{ + unsigned long flags; + + spin_lock_irqsave(&cct->task_lock, flags); + spin_lock(&__cq->task_lock); + + if (__cq->nr_callbacks == 0) { + __cq->nr_callbacks++; + list_add_tail(&__cq->entry, &cct->cq_list); + cct->cq_jobs++; + wake_up_process(thread); + } else + __cq->nr_callbacks++; + + spin_unlock(&__cq->task_lock); + spin_unlock_irqrestore(&cct->task_lock, flags); +} + +static void queue_comp_task(struct ehca_cq *__cq) +{ + int cpu_id; + struct ehca_cpu_comp_task *cct; + struct task_struct *thread; + int cq_jobs; + unsigned long flags; + + cpu_id = find_next_online_cpu(pool); + BUG_ON(!cpu_online(cpu_id)); + + cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu_id); + thread = *per_cpu_ptr(pool->cpu_comp_threads, cpu_id); + BUG_ON(!cct || !thread); + + spin_lock_irqsave(&cct->task_lock, flags); + cq_jobs = cct->cq_jobs; + spin_unlock_irqrestore(&cct->task_lock, flags); + if (cq_jobs > 0) { + cpu_id = find_next_online_cpu(pool); + cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu_id); + thread = *per_cpu_ptr(pool->cpu_comp_threads, cpu_id); + BUG_ON(!cct || !thread); + } + __queue_comp_task(__cq, cct, thread); +} + +static void run_comp_task(struct ehca_cpu_comp_task *cct) +{ + struct ehca_cq *cq; + + while (!list_empty(&cct->cq_list)) { + cq = list_entry(cct->cq_list.next, struct ehca_cq, entry); + spin_unlock_irq(&cct->task_lock); + + comp_event_callback(cq); + if (atomic_dec_and_test(&cq->nr_events)) + wake_up(&cq->wait_completion); + + spin_lock_irq(&cct->task_lock); + spin_lock(&cq->task_lock); + cq->nr_callbacks--; + if (!cq->nr_callbacks) { + list_del_init(cct->cq_list.next); + cct->cq_jobs--; + } + spin_unlock(&cq->task_lock); + } +} + +static void comp_task_park(unsigned int cpu) +{ + struct ehca_cpu_comp_task *cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu); + struct ehca_cpu_comp_task *target; + struct task_struct *thread; + struct ehca_cq *cq, *tmp; + LIST_HEAD(list); + + spin_lock_irq(&cct->task_lock); + cct->cq_jobs = 0; + cct->active = 0; + list_splice_init(&cct->cq_list, &list); + spin_unlock_irq(&cct->task_lock); + + cpu = find_next_online_cpu(pool); + target = per_cpu_ptr(pool->cpu_comp_tasks, cpu); + thread = *per_cpu_ptr(pool->cpu_comp_threads, cpu); + spin_lock_irq(&target->task_lock); + list_for_each_entry_safe(cq, tmp, &list, entry) { + list_del(&cq->entry); + __queue_comp_task(cq, target, thread); + } + spin_unlock_irq(&target->task_lock); +} + +static void comp_task_stop(unsigned int cpu, bool online) +{ + struct ehca_cpu_comp_task *cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu); + + spin_lock_irq(&cct->task_lock); + cct->cq_jobs = 0; + cct->active = 0; + WARN_ON(!list_empty(&cct->cq_list)); + spin_unlock_irq(&cct->task_lock); +} + +static int comp_task_should_run(unsigned int cpu) +{ + struct ehca_cpu_comp_task *cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu); + + return cct->cq_jobs; +} + +static void comp_task(unsigned int cpu) +{ + struct ehca_cpu_comp_task *cct = this_cpu_ptr(pool->cpu_comp_tasks); + int cql_empty; + + spin_lock_irq(&cct->task_lock); + cql_empty = list_empty(&cct->cq_list); + if (!cql_empty) { + __set_current_state(TASK_RUNNING); + run_comp_task(cct); + } + spin_unlock_irq(&cct->task_lock); +} + +static struct smp_hotplug_thread comp_pool_threads = { + .thread_should_run = comp_task_should_run, + .thread_fn = comp_task, + .thread_comm = "ehca_comp/%u", + .cleanup = comp_task_stop, + .park = comp_task_park, +}; + +int ehca_create_comp_pool(void) +{ + int cpu, ret = -ENOMEM; + + if (!ehca_scaling_code) + return 0; + + pool = kzalloc(sizeof(struct ehca_comp_pool), GFP_KERNEL); + if (pool == NULL) + return -ENOMEM; + + spin_lock_init(&pool->last_cpu_lock); + pool->last_cpu = cpumask_any(cpu_online_mask); + + pool->cpu_comp_tasks = alloc_percpu(struct ehca_cpu_comp_task); + if (!pool->cpu_comp_tasks) + goto out_pool; + + pool->cpu_comp_threads = alloc_percpu(struct task_struct *); + if (!pool->cpu_comp_threads) + goto out_tasks; + + for_each_present_cpu(cpu) { + struct ehca_cpu_comp_task *cct; + + cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu); + spin_lock_init(&cct->task_lock); + INIT_LIST_HEAD(&cct->cq_list); + } + + comp_pool_threads.store = pool->cpu_comp_threads; + ret = smpboot_register_percpu_thread(&comp_pool_threads); + if (ret) + goto out_threads; + + pr_info("eHCA scaling code enabled\n"); + return ret; + +out_threads: + free_percpu(pool->cpu_comp_threads); +out_tasks: + free_percpu(pool->cpu_comp_tasks); +out_pool: + kfree(pool); + return ret; +} + +void ehca_destroy_comp_pool(void) +{ + if (!ehca_scaling_code) + return; + + smpboot_unregister_percpu_thread(&comp_pool_threads); + + free_percpu(pool->cpu_comp_threads); + free_percpu(pool->cpu_comp_tasks); + kfree(pool); +} diff --git a/drivers/infiniband/hw/ehca/ehca_irq.h b/drivers/infiniband/hw/ehca/ehca_irq.h new file mode 100644 index 0000000..5370199 --- /dev/null +++ b/drivers/infiniband/hw/ehca/ehca_irq.h @@ -0,0 +1,77 @@ +/* + * IBM eServer eHCA Infiniband device driver for Linux on POWER + * + * Function definitions and structs for EQs, NEQs and interrupts + * + * Authors: Heiko J Schick + * Khadija Souissi + * + * Copyright (c) 2005 IBM Corporation + * + * All rights reserved. + * + * This source code is distributed under a dual license of GPL v2.0 and OpenIB + * BSD. + * + * OpenIB BSD License + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __EHCA_IRQ_H +#define __EHCA_IRQ_H + + +struct ehca_shca; + +#include +#include + +int ehca_error_data(struct ehca_shca *shca, void *data, u64 resource); + +irqreturn_t ehca_interrupt_neq(int irq, void *dev_id); +void ehca_tasklet_neq(unsigned long data); + +irqreturn_t ehca_interrupt_eq(int irq, void *dev_id); +void ehca_tasklet_eq(unsigned long data); +void ehca_process_eq(struct ehca_shca *shca, int is_irq); + +struct ehca_cpu_comp_task { + struct list_head cq_list; + spinlock_t task_lock; + int cq_jobs; + int active; +}; + +struct ehca_comp_pool { + struct ehca_cpu_comp_task __percpu *cpu_comp_tasks; + struct task_struct * __percpu *cpu_comp_threads; + int last_cpu; + spinlock_t last_cpu_lock; +}; + +int ehca_create_comp_pool(void); +void ehca_destroy_comp_pool(void); + +#endif diff --git a/drivers/infiniband/hw/ehca/ehca_iverbs.h b/drivers/infiniband/hw/ehca/ehca_iverbs.h new file mode 100644 index 0000000..22f79af --- /dev/null +++ b/drivers/infiniband/hw/ehca/ehca_iverbs.h @@ -0,0 +1,212 @@ +/* + * IBM eServer eHCA Infiniband device driver for Linux on POWER + * + * Function definitions for internal functions + * + * Authors: Heiko J Schick + * Dietmar Decker + * + * Copyright (c) 2005 IBM Corporation + * + * All rights reserved. + * + * This source code is distributed under a dual license of GPL v2.0 and OpenIB + * BSD. + * + * OpenIB BSD License + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __EHCA_IVERBS_H__ +#define __EHCA_IVERBS_H__ + +#include "ehca_classes.h" + +int ehca_query_device(struct ib_device *ibdev, struct ib_device_attr *props); + +int ehca_query_port(struct ib_device *ibdev, u8 port, + struct ib_port_attr *props); + +int ehca_query_sma_attr(struct ehca_shca *shca, u8 port, + struct ehca_sma_attr *attr); + +int ehca_query_pkey(struct ib_device *ibdev, u8 port, u16 index, u16 * pkey); + +int ehca_query_gid(struct ib_device *ibdev, u8 port, int index, + union ib_gid *gid); + +int ehca_modify_port(struct ib_device *ibdev, u8 port, int port_modify_mask, + struct ib_port_modify *props); + +struct ib_pd *ehca_alloc_pd(struct ib_device *device, + struct ib_ucontext *context, + struct ib_udata *udata); + +int ehca_dealloc_pd(struct ib_pd *pd); + +struct ib_ah *ehca_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr); + +int ehca_modify_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr); + +int ehca_query_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr); + +int ehca_destroy_ah(struct ib_ah *ah); + +struct ib_mr *ehca_get_dma_mr(struct ib_pd *pd, int mr_access_flags); + +struct ib_mr *ehca_reg_phys_mr(struct ib_pd *pd, + struct ib_phys_buf *phys_buf_array, + int num_phys_buf, + int mr_access_flags, u64 *iova_start); + +struct ib_mr *ehca_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, + u64 virt, int mr_access_flags, + struct ib_udata *udata); + +int ehca_rereg_phys_mr(struct ib_mr *mr, + int mr_rereg_mask, + struct ib_pd *pd, + struct ib_phys_buf *phys_buf_array, + int num_phys_buf, int mr_access_flags, u64 *iova_start); + +int ehca_query_mr(struct ib_mr *mr, struct ib_mr_attr *mr_attr); + +int ehca_dereg_mr(struct ib_mr *mr); + +struct ib_mw *ehca_alloc_mw(struct ib_pd *pd, enum ib_mw_type type); + +int ehca_bind_mw(struct ib_qp *qp, struct ib_mw *mw, + struct ib_mw_bind *mw_bind); + +int ehca_dealloc_mw(struct ib_mw *mw); + +struct ib_fmr *ehca_alloc_fmr(struct ib_pd *pd, + int mr_access_flags, + struct ib_fmr_attr *fmr_attr); + +int ehca_map_phys_fmr(struct ib_fmr *fmr, + u64 *page_list, int list_len, u64 iova); + +int ehca_unmap_fmr(struct list_head *fmr_list); + +int ehca_dealloc_fmr(struct ib_fmr *fmr); + +enum ehca_eq_type { + EHCA_EQ = 0, /* Event Queue */ + EHCA_NEQ /* Notification Event Queue */ +}; + +int ehca_create_eq(struct ehca_shca *shca, struct ehca_eq *eq, + enum ehca_eq_type type, const u32 length); + +int ehca_destroy_eq(struct ehca_shca *shca, struct ehca_eq *eq); + +void *ehca_poll_eq(struct ehca_shca *shca, struct ehca_eq *eq); + + +struct ib_cq *ehca_create_cq(struct ib_device *device, int cqe, int comp_vector, + struct ib_ucontext *context, + struct ib_udata *udata); + +int ehca_destroy_cq(struct ib_cq *cq); + +int ehca_resize_cq(struct ib_cq *cq, int cqe, struct ib_udata *udata); + +int ehca_poll_cq(struct ib_cq *cq, int num_entries, struct ib_wc *wc); + +int ehca_peek_cq(struct ib_cq *cq, int wc_cnt); + +int ehca_req_notify_cq(struct ib_cq *cq, enum ib_cq_notify_flags notify_flags); + +struct ib_qp *ehca_create_qp(struct ib_pd *pd, + struct ib_qp_init_attr *init_attr, + struct ib_udata *udata); + +int ehca_destroy_qp(struct ib_qp *qp); + +int ehca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask, + struct ib_udata *udata); + +int ehca_query_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr, + int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr); + +int ehca_post_send(struct ib_qp *qp, struct ib_send_wr *send_wr, + struct ib_send_wr **bad_send_wr); + +int ehca_post_recv(struct ib_qp *qp, struct ib_recv_wr *recv_wr, + struct ib_recv_wr **bad_recv_wr); + +int ehca_post_srq_recv(struct ib_srq *srq, + struct ib_recv_wr *recv_wr, + struct ib_recv_wr **bad_recv_wr); + +struct ib_srq *ehca_create_srq(struct ib_pd *pd, + struct ib_srq_init_attr *init_attr, + struct ib_udata *udata); + +int ehca_modify_srq(struct ib_srq *srq, struct ib_srq_attr *attr, + enum ib_srq_attr_mask attr_mask, struct ib_udata *udata); + +int ehca_query_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr); + +int ehca_destroy_srq(struct ib_srq *srq); + +u64 ehca_define_sqp(struct ehca_shca *shca, struct ehca_qp *ibqp, + struct ib_qp_init_attr *qp_init_attr); + +int ehca_attach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid); + +int ehca_detach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid); + +struct ib_ucontext *ehca_alloc_ucontext(struct ib_device *device, + struct ib_udata *udata); + +int ehca_dealloc_ucontext(struct ib_ucontext *context); + +int ehca_mmap(struct ib_ucontext *context, struct vm_area_struct *vma); + +int ehca_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, + struct ib_wc *in_wc, struct ib_grh *in_grh, + struct ib_mad *in_mad, + struct ib_mad *out_mad); + +void ehca_poll_eqs(unsigned long data); + +int ehca_calc_ipd(struct ehca_shca *shca, int port, + enum ib_rate path_rate, u32 *ipd); + +void ehca_add_to_err_list(struct ehca_qp *qp, int on_sq); + +#ifdef CONFIG_PPC_64K_PAGES +void *ehca_alloc_fw_ctrlblock(gfp_t flags); +void ehca_free_fw_ctrlblock(void *ptr); +#else +#define ehca_alloc_fw_ctrlblock(flags) ((void *)get_zeroed_page(flags)) +#define ehca_free_fw_ctrlblock(ptr) free_page((unsigned long)(ptr)) +#endif + +void ehca_recover_sqp(struct ib_qp *sqp); + +#endif diff --git a/drivers/infiniband/hw/ehca/ehca_main.c b/drivers/infiniband/hw/ehca/ehca_main.c new file mode 100644 index 0000000..cd8d290 --- /dev/null +++ b/drivers/infiniband/hw/ehca/ehca_main.c @@ -0,0 +1,1100 @@ +/* + * IBM eServer eHCA Infiniband device driver for Linux on POWER + * + * module start stop, hca detection + * + * Authors: Heiko J Schick + * Hoang-Nam Nguyen + * Joachim Fenkes + * + * Copyright (c) 2005 IBM Corporation + * + * All rights reserved. + * + * This source code is distributed under a dual license of GPL v2.0 and OpenIB + * BSD. + * + * OpenIB BSD License + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifdef CONFIG_PPC_64K_PAGES +#include +#endif + +#include +#include +#include "ehca_classes.h" +#include "ehca_iverbs.h" +#include "ehca_mrmw.h" +#include "ehca_tools.h" +#include "hcp_if.h" + +#define HCAD_VERSION "0029" + +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_AUTHOR("Christoph Raisch "); +MODULE_DESCRIPTION("IBM eServer HCA InfiniBand Device Driver"); +MODULE_VERSION(HCAD_VERSION); + +static bool ehca_open_aqp1 = 0; +static int ehca_hw_level = 0; +static bool ehca_poll_all_eqs = 1; + +int ehca_debug_level = 0; +int ehca_nr_ports = -1; +bool ehca_use_hp_mr = 0; +int ehca_port_act_time = 30; +int ehca_static_rate = -1; +bool ehca_scaling_code = 0; +int ehca_lock_hcalls = -1; +int ehca_max_cq = -1; +int ehca_max_qp = -1; + +module_param_named(open_aqp1, ehca_open_aqp1, bool, S_IRUGO); +module_param_named(debug_level, ehca_debug_level, int, S_IRUGO); +module_param_named(hw_level, ehca_hw_level, int, S_IRUGO); +module_param_named(nr_ports, ehca_nr_ports, int, S_IRUGO); +module_param_named(use_hp_mr, ehca_use_hp_mr, bool, S_IRUGO); +module_param_named(port_act_time, ehca_port_act_time, int, S_IRUGO); +module_param_named(poll_all_eqs, ehca_poll_all_eqs, bool, S_IRUGO); +module_param_named(static_rate, ehca_static_rate, int, S_IRUGO); +module_param_named(scaling_code, ehca_scaling_code, bool, S_IRUGO); +module_param_named(lock_hcalls, ehca_lock_hcalls, bint, S_IRUGO); +module_param_named(number_of_cqs, ehca_max_cq, int, S_IRUGO); +module_param_named(number_of_qps, ehca_max_qp, int, S_IRUGO); + +MODULE_PARM_DESC(open_aqp1, + "Open AQP1 on startup (default: no)"); +MODULE_PARM_DESC(debug_level, + "Amount of debug output (0: none (default), 1: traces, " + "2: some dumps, 3: lots)"); +MODULE_PARM_DESC(hw_level, + "Hardware level (0: autosensing (default), " + "0x10..0x14: eHCA, 0x20..0x23: eHCA2)"); +MODULE_PARM_DESC(nr_ports, + "number of connected ports (-1: autodetect (default), " + "1: port one only, 2: two ports)"); +MODULE_PARM_DESC(use_hp_mr, + "Use high performance MRs (default: no)"); +MODULE_PARM_DESC(port_act_time, + "Time to wait for port activation (default: 30 sec)"); +MODULE_PARM_DESC(poll_all_eqs, + "Poll all event queues periodically (default: yes)"); +MODULE_PARM_DESC(static_rate, + "Set permanent static rate (default: no static rate)"); +MODULE_PARM_DESC(scaling_code, + "Enable scaling code (default: no)"); +MODULE_PARM_DESC(lock_hcalls, + "Serialize all hCalls made by the driver " + "(default: autodetect)"); +MODULE_PARM_DESC(number_of_cqs, + "Max number of CQs which can be allocated " + "(default: autodetect)"); +MODULE_PARM_DESC(number_of_qps, + "Max number of QPs which can be allocated " + "(default: autodetect)"); + +DEFINE_RWLOCK(ehca_qp_idr_lock); +DEFINE_RWLOCK(ehca_cq_idr_lock); +DEFINE_IDR(ehca_qp_idr); +DEFINE_IDR(ehca_cq_idr); + +static LIST_HEAD(shca_list); /* list of all registered ehcas */ +DEFINE_SPINLOCK(shca_list_lock); + +static struct timer_list poll_eqs_timer; + +#ifdef CONFIG_PPC_64K_PAGES +static struct kmem_cache *ctblk_cache; + +void *ehca_alloc_fw_ctrlblock(gfp_t flags) +{ + void *ret = kmem_cache_zalloc(ctblk_cache, flags); + if (!ret) + ehca_gen_err("Out of memory for ctblk"); + return ret; +} + +void ehca_free_fw_ctrlblock(void *ptr) +{ + if (ptr) + kmem_cache_free(ctblk_cache, ptr); + +} +#endif + +int ehca2ib_return_code(u64 ehca_rc) +{ + switch (ehca_rc) { + case H_SUCCESS: + return 0; + case H_RESOURCE: /* Resource in use */ + case H_BUSY: + return -EBUSY; + case H_NOT_ENOUGH_RESOURCES: /* insufficient resources */ + case H_CONSTRAINED: /* resource constraint */ + case H_NO_MEM: + return -ENOMEM; + default: + return -EINVAL; + } +} + +static int ehca_create_slab_caches(void) +{ + int ret; + + ret = ehca_init_pd_cache(); + if (ret) { + ehca_gen_err("Cannot create PD SLAB cache."); + return ret; + } + + ret = ehca_init_cq_cache(); + if (ret) { + ehca_gen_err("Cannot create CQ SLAB cache."); + goto create_slab_caches2; + } + + ret = ehca_init_qp_cache(); + if (ret) { + ehca_gen_err("Cannot create QP SLAB cache."); + goto create_slab_caches3; + } + + ret = ehca_init_av_cache(); + if (ret) { + ehca_gen_err("Cannot create AV SLAB cache."); + goto create_slab_caches4; + } + + ret = ehca_init_mrmw_cache(); + if (ret) { + ehca_gen_err("Cannot create MR&MW SLAB cache."); + goto create_slab_caches5; + } + + ret = ehca_init_small_qp_cache(); + if (ret) { + ehca_gen_err("Cannot create small queue SLAB cache."); + goto create_slab_caches6; + } + +#ifdef CONFIG_PPC_64K_PAGES + ctblk_cache = kmem_cache_create("ehca_cache_ctblk", + EHCA_PAGESIZE, H_CB_ALIGNMENT, + SLAB_HWCACHE_ALIGN, + NULL); + if (!ctblk_cache) { + ehca_gen_err("Cannot create ctblk SLAB cache."); + ehca_cleanup_small_qp_cache(); + ret = -ENOMEM; + goto create_slab_caches6; + } +#endif + return 0; + +create_slab_caches6: + ehca_cleanup_mrmw_cache(); + +create_slab_caches5: + ehca_cleanup_av_cache(); + +create_slab_caches4: + ehca_cleanup_qp_cache(); + +create_slab_caches3: + ehca_cleanup_cq_cache(); + +create_slab_caches2: + ehca_cleanup_pd_cache(); + + return ret; +} + +static void ehca_destroy_slab_caches(void) +{ + ehca_cleanup_small_qp_cache(); + ehca_cleanup_mrmw_cache(); + ehca_cleanup_av_cache(); + ehca_cleanup_qp_cache(); + ehca_cleanup_cq_cache(); + ehca_cleanup_pd_cache(); +#ifdef CONFIG_PPC_64K_PAGES + if (ctblk_cache) + kmem_cache_destroy(ctblk_cache); +#endif +} + +#define EHCA_HCAAVER EHCA_BMASK_IBM(32, 39) +#define EHCA_REVID EHCA_BMASK_IBM(40, 63) + +static struct cap_descr { + u64 mask; + char *descr; +} hca_cap_descr[] = { + { HCA_CAP_AH_PORT_NR_CHECK, "HCA_CAP_AH_PORT_NR_CHECK" }, + { HCA_CAP_ATOMIC, "HCA_CAP_ATOMIC" }, + { HCA_CAP_AUTO_PATH_MIG, "HCA_CAP_AUTO_PATH_MIG" }, + { HCA_CAP_BAD_P_KEY_CTR, "HCA_CAP_BAD_P_KEY_CTR" }, + { HCA_CAP_SQD_RTS_PORT_CHANGE, "HCA_CAP_SQD_RTS_PORT_CHANGE" }, + { HCA_CAP_CUR_QP_STATE_MOD, "HCA_CAP_CUR_QP_STATE_MOD" }, + { HCA_CAP_INIT_TYPE, "HCA_CAP_INIT_TYPE" }, + { HCA_CAP_PORT_ACTIVE_EVENT, "HCA_CAP_PORT_ACTIVE_EVENT" }, + { HCA_CAP_Q_KEY_VIOL_CTR, "HCA_CAP_Q_KEY_VIOL_CTR" }, + { HCA_CAP_WQE_RESIZE, "HCA_CAP_WQE_RESIZE" }, + { HCA_CAP_RAW_PACKET_MCAST, "HCA_CAP_RAW_PACKET_MCAST" }, + { HCA_CAP_SHUTDOWN_PORT, "HCA_CAP_SHUTDOWN_PORT" }, + { HCA_CAP_RC_LL_QP, "HCA_CAP_RC_LL_QP" }, + { HCA_CAP_SRQ, "HCA_CAP_SRQ" }, + { HCA_CAP_UD_LL_QP, "HCA_CAP_UD_LL_QP" }, + { HCA_CAP_RESIZE_MR, "HCA_CAP_RESIZE_MR" }, + { HCA_CAP_MINI_QP, "HCA_CAP_MINI_QP" }, + { HCA_CAP_H_ALLOC_RES_SYNC, "HCA_CAP_H_ALLOC_RES_SYNC" }, +}; + +static int ehca_sense_attributes(struct ehca_shca *shca) +{ + int i, ret = 0; + u64 h_ret; + struct hipz_query_hca *rblock; + struct hipz_query_port *port; + const char *loc_code; + + static const u32 pgsize_map[] = { + HCA_CAP_MR_PGSIZE_4K, 0x1000, + HCA_CAP_MR_PGSIZE_64K, 0x10000, + HCA_CAP_MR_PGSIZE_1M, 0x100000, + HCA_CAP_MR_PGSIZE_16M, 0x1000000, + }; + + ehca_gen_dbg("Probing adapter %s...", + shca->ofdev->dev.of_node->full_name); + loc_code = of_get_property(shca->ofdev->dev.of_node, "ibm,loc-code", + NULL); + if (loc_code) + ehca_gen_dbg(" ... location lode=%s", loc_code); + + rblock = ehca_alloc_fw_ctrlblock(GFP_KERNEL); + if (!rblock) { + ehca_gen_err("Cannot allocate rblock memory."); + return -ENOMEM; + } + + h_ret = hipz_h_query_hca(shca->ipz_hca_handle, rblock); + if (h_ret != H_SUCCESS) { + ehca_gen_err("Cannot query device properties. h_ret=%lli", + h_ret); + ret = -EPERM; + goto sense_attributes1; + } + + if (ehca_nr_ports == 1) + shca->num_ports = 1; + else + shca->num_ports = (u8)rblock->num_ports; + + ehca_gen_dbg(" ... found %x ports", rblock->num_ports); + + if (ehca_hw_level == 0) { + u32 hcaaver; + u32 revid; + + hcaaver = EHCA_BMASK_GET(EHCA_HCAAVER, rblock->hw_ver); + revid = EHCA_BMASK_GET(EHCA_REVID, rblock->hw_ver); + + ehca_gen_dbg(" ... hardware version=%x:%x", hcaaver, revid); + + if (hcaaver == 1) { + if (revid <= 3) + shca->hw_level = 0x10 | (revid + 1); + else + shca->hw_level = 0x14; + } else if (hcaaver == 2) { + if (revid == 0) + shca->hw_level = 0x21; + else if (revid == 0x10) + shca->hw_level = 0x22; + else if (revid == 0x20 || revid == 0x21) + shca->hw_level = 0x23; + } + + if (!shca->hw_level) { + ehca_gen_warn("unknown hardware version" + " - assuming default level"); + shca->hw_level = 0x22; + } + } else + shca->hw_level = ehca_hw_level; + ehca_gen_dbg(" ... hardware level=%x", shca->hw_level); + + shca->hca_cap = rblock->hca_cap_indicators; + ehca_gen_dbg(" ... HCA capabilities:"); + for (i = 0; i < ARRAY_SIZE(hca_cap_descr); i++) + if (EHCA_BMASK_GET(hca_cap_descr[i].mask, shca->hca_cap)) + ehca_gen_dbg(" %s", hca_cap_descr[i].descr); + + /* Autodetect hCall locking -- the "H_ALLOC_RESOURCE synced" flag is + * a firmware property, so it's valid across all adapters + */ + if (ehca_lock_hcalls == -1) + ehca_lock_hcalls = !EHCA_BMASK_GET(HCA_CAP_H_ALLOC_RES_SYNC, + shca->hca_cap); + + /* translate supported MR page sizes; always support 4K */ + shca->hca_cap_mr_pgsize = EHCA_PAGESIZE; + for (i = 0; i < ARRAY_SIZE(pgsize_map); i += 2) + if (rblock->memory_page_size_supported & pgsize_map[i]) + shca->hca_cap_mr_pgsize |= pgsize_map[i + 1]; + + /* Set maximum number of CQs and QPs to calculate EQ size */ + if (shca->max_num_qps == -1) + shca->max_num_qps = min_t(int, rblock->max_qp, + EHCA_MAX_NUM_QUEUES); + else if (shca->max_num_qps < 1 || shca->max_num_qps > rblock->max_qp) { + ehca_gen_warn("The requested number of QPs is out of range " + "(1 - %i) specified by HW. Value is set to %i", + rblock->max_qp, rblock->max_qp); + shca->max_num_qps = rblock->max_qp; + } + + if (shca->max_num_cqs == -1) + shca->max_num_cqs = min_t(int, rblock->max_cq, + EHCA_MAX_NUM_QUEUES); + else if (shca->max_num_cqs < 1 || shca->max_num_cqs > rblock->max_cq) { + ehca_gen_warn("The requested number of CQs is out of range " + "(1 - %i) specified by HW. Value is set to %i", + rblock->max_cq, rblock->max_cq); + } + + /* query max MTU from first port -- it's the same for all ports */ + port = (struct hipz_query_port *)rblock; + h_ret = hipz_h_query_port(shca->ipz_hca_handle, 1, port); + if (h_ret != H_SUCCESS) { + ehca_gen_err("Cannot query port properties. h_ret=%lli", + h_ret); + ret = -EPERM; + goto sense_attributes1; + } + + shca->max_mtu = port->max_mtu; + +sense_attributes1: + ehca_free_fw_ctrlblock(rblock); + return ret; +} + +static int init_node_guid(struct ehca_shca *shca) +{ + int ret = 0; + struct hipz_query_hca *rblock; + + rblock = ehca_alloc_fw_ctrlblock(GFP_KERNEL); + if (!rblock) { + ehca_err(&shca->ib_device, "Can't allocate rblock memory."); + return -ENOMEM; + } + + if (hipz_h_query_hca(shca->ipz_hca_handle, rblock) != H_SUCCESS) { + ehca_err(&shca->ib_device, "Can't query device properties"); + ret = -EINVAL; + goto init_node_guid1; + } + + memcpy(&shca->ib_device.node_guid, &rblock->node_guid, sizeof(u64)); + +init_node_guid1: + ehca_free_fw_ctrlblock(rblock); + return ret; +} + +static int ehca_init_device(struct ehca_shca *shca) +{ + int ret; + + ret = init_node_guid(shca); + if (ret) + return ret; + + strlcpy(shca->ib_device.name, "ehca%d", IB_DEVICE_NAME_MAX); + shca->ib_device.owner = THIS_MODULE; + + shca->ib_device.uverbs_abi_ver = 8; + shca->ib_device.uverbs_cmd_mask = + (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | + (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) | + (1ull << IB_USER_VERBS_CMD_QUERY_PORT) | + (1ull << IB_USER_VERBS_CMD_ALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_REG_MR) | + (1ull << IB_USER_VERBS_CMD_DEREG_MR) | + (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | + (1ull << IB_USER_VERBS_CMD_CREATE_CQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) | + (1ull << IB_USER_VERBS_CMD_CREATE_QP) | + (1ull << IB_USER_VERBS_CMD_MODIFY_QP) | + (1ull << IB_USER_VERBS_CMD_QUERY_QP) | + (1ull << IB_USER_VERBS_CMD_DESTROY_QP) | + (1ull << IB_USER_VERBS_CMD_ATTACH_MCAST) | + (1ull << IB_USER_VERBS_CMD_DETACH_MCAST); + + shca->ib_device.node_type = RDMA_NODE_IB_CA; + shca->ib_device.phys_port_cnt = shca->num_ports; + shca->ib_device.num_comp_vectors = 1; + shca->ib_device.dma_device = &shca->ofdev->dev; + shca->ib_device.query_device = ehca_query_device; + shca->ib_device.query_port = ehca_query_port; + shca->ib_device.query_gid = ehca_query_gid; + shca->ib_device.query_pkey = ehca_query_pkey; + /* shca->in_device.modify_device = ehca_modify_device */ + shca->ib_device.modify_port = ehca_modify_port; + shca->ib_device.alloc_ucontext = ehca_alloc_ucontext; + shca->ib_device.dealloc_ucontext = ehca_dealloc_ucontext; + shca->ib_device.alloc_pd = ehca_alloc_pd; + shca->ib_device.dealloc_pd = ehca_dealloc_pd; + shca->ib_device.create_ah = ehca_create_ah; + /* shca->ib_device.modify_ah = ehca_modify_ah; */ + shca->ib_device.query_ah = ehca_query_ah; + shca->ib_device.destroy_ah = ehca_destroy_ah; + shca->ib_device.create_qp = ehca_create_qp; + shca->ib_device.modify_qp = ehca_modify_qp; + shca->ib_device.query_qp = ehca_query_qp; + shca->ib_device.destroy_qp = ehca_destroy_qp; + shca->ib_device.post_send = ehca_post_send; + shca->ib_device.post_recv = ehca_post_recv; + shca->ib_device.create_cq = ehca_create_cq; + shca->ib_device.destroy_cq = ehca_destroy_cq; + shca->ib_device.resize_cq = ehca_resize_cq; + shca->ib_device.poll_cq = ehca_poll_cq; + /* shca->ib_device.peek_cq = ehca_peek_cq; */ + shca->ib_device.req_notify_cq = ehca_req_notify_cq; + /* shca->ib_device.req_ncomp_notif = ehca_req_ncomp_notif; */ + shca->ib_device.get_dma_mr = ehca_get_dma_mr; + shca->ib_device.reg_phys_mr = ehca_reg_phys_mr; + shca->ib_device.reg_user_mr = ehca_reg_user_mr; + shca->ib_device.query_mr = ehca_query_mr; + shca->ib_device.dereg_mr = ehca_dereg_mr; + shca->ib_device.rereg_phys_mr = ehca_rereg_phys_mr; + shca->ib_device.alloc_mw = ehca_alloc_mw; + shca->ib_device.bind_mw = ehca_bind_mw; + shca->ib_device.dealloc_mw = ehca_dealloc_mw; + shca->ib_device.alloc_fmr = ehca_alloc_fmr; + shca->ib_device.map_phys_fmr = ehca_map_phys_fmr; + shca->ib_device.unmap_fmr = ehca_unmap_fmr; + shca->ib_device.dealloc_fmr = ehca_dealloc_fmr; + shca->ib_device.attach_mcast = ehca_attach_mcast; + shca->ib_device.detach_mcast = ehca_detach_mcast; + shca->ib_device.process_mad = ehca_process_mad; + shca->ib_device.mmap = ehca_mmap; + shca->ib_device.dma_ops = &ehca_dma_mapping_ops; + + if (EHCA_BMASK_GET(HCA_CAP_SRQ, shca->hca_cap)) { + shca->ib_device.uverbs_cmd_mask |= + (1ull << IB_USER_VERBS_CMD_CREATE_SRQ) | + (1ull << IB_USER_VERBS_CMD_MODIFY_SRQ) | + (1ull << IB_USER_VERBS_CMD_QUERY_SRQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ); + + shca->ib_device.create_srq = ehca_create_srq; + shca->ib_device.modify_srq = ehca_modify_srq; + shca->ib_device.query_srq = ehca_query_srq; + shca->ib_device.destroy_srq = ehca_destroy_srq; + shca->ib_device.post_srq_recv = ehca_post_srq_recv; + } + + return ret; +} + +static int ehca_create_aqp1(struct ehca_shca *shca, u32 port) +{ + struct ehca_sport *sport = &shca->sport[port - 1]; + struct ib_cq *ibcq; + struct ib_qp *ibqp; + struct ib_qp_init_attr qp_init_attr; + int ret; + + if (sport->ibcq_aqp1) { + ehca_err(&shca->ib_device, "AQP1 CQ is already created."); + return -EPERM; + } + + ibcq = ib_create_cq(&shca->ib_device, NULL, NULL, (void *)(-1), 10, 0); + if (IS_ERR(ibcq)) { + ehca_err(&shca->ib_device, "Cannot create AQP1 CQ."); + return PTR_ERR(ibcq); + } + sport->ibcq_aqp1 = ibcq; + + if (sport->ibqp_sqp[IB_QPT_GSI]) { + ehca_err(&shca->ib_device, "AQP1 QP is already created."); + ret = -EPERM; + goto create_aqp1; + } + + memset(&qp_init_attr, 0, sizeof(struct ib_qp_init_attr)); + qp_init_attr.send_cq = ibcq; + qp_init_attr.recv_cq = ibcq; + qp_init_attr.sq_sig_type = IB_SIGNAL_ALL_WR; + qp_init_attr.cap.max_send_wr = 100; + qp_init_attr.cap.max_recv_wr = 100; + qp_init_attr.cap.max_send_sge = 2; + qp_init_attr.cap.max_recv_sge = 1; + qp_init_attr.qp_type = IB_QPT_GSI; + qp_init_attr.port_num = port; + qp_init_attr.qp_context = NULL; + qp_init_attr.event_handler = NULL; + qp_init_attr.srq = NULL; + + ibqp = ib_create_qp(&shca->pd->ib_pd, &qp_init_attr); + if (IS_ERR(ibqp)) { + ehca_err(&shca->ib_device, "Cannot create AQP1 QP."); + ret = PTR_ERR(ibqp); + goto create_aqp1; + } + sport->ibqp_sqp[IB_QPT_GSI] = ibqp; + + return 0; + +create_aqp1: + ib_destroy_cq(sport->ibcq_aqp1); + return ret; +} + +static int ehca_destroy_aqp1(struct ehca_sport *sport) +{ + int ret; + + ret = ib_destroy_qp(sport->ibqp_sqp[IB_QPT_GSI]); + if (ret) { + ehca_gen_err("Cannot destroy AQP1 QP. ret=%i", ret); + return ret; + } + + ret = ib_destroy_cq(sport->ibcq_aqp1); + if (ret) + ehca_gen_err("Cannot destroy AQP1 CQ. ret=%i", ret); + + return ret; +} + +static ssize_t ehca_show_debug_level(struct device_driver *ddp, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%d\n", ehca_debug_level); +} + +static ssize_t ehca_store_debug_level(struct device_driver *ddp, + const char *buf, size_t count) +{ + int value = (*buf) - '0'; + if (value >= 0 && value <= 9) + ehca_debug_level = value; + return 1; +} + +static DRIVER_ATTR(debug_level, S_IRUSR | S_IWUSR, + ehca_show_debug_level, ehca_store_debug_level); + +static struct attribute *ehca_drv_attrs[] = { + &driver_attr_debug_level.attr, + NULL +}; + +static struct attribute_group ehca_drv_attr_grp = { + .attrs = ehca_drv_attrs +}; + +static const struct attribute_group *ehca_drv_attr_groups[] = { + &ehca_drv_attr_grp, + NULL, +}; + +#define EHCA_RESOURCE_ATTR(name) \ +static ssize_t ehca_show_##name(struct device *dev, \ + struct device_attribute *attr, \ + char *buf) \ +{ \ + struct ehca_shca *shca; \ + struct hipz_query_hca *rblock; \ + int data; \ + \ + shca = dev_get_drvdata(dev); \ + \ + rblock = ehca_alloc_fw_ctrlblock(GFP_KERNEL); \ + if (!rblock) { \ + dev_err(dev, "Can't allocate rblock memory.\n"); \ + return 0; \ + } \ + \ + if (hipz_h_query_hca(shca->ipz_hca_handle, rblock) != H_SUCCESS) { \ + dev_err(dev, "Can't query device properties\n"); \ + ehca_free_fw_ctrlblock(rblock); \ + return 0; \ + } \ + \ + data = rblock->name; \ + ehca_free_fw_ctrlblock(rblock); \ + \ + if ((strcmp(#name, "num_ports") == 0) && (ehca_nr_ports == 1)) \ + return snprintf(buf, 256, "1\n"); \ + else \ + return snprintf(buf, 256, "%d\n", data); \ + \ +} \ +static DEVICE_ATTR(name, S_IRUGO, ehca_show_##name, NULL); + +EHCA_RESOURCE_ATTR(num_ports); +EHCA_RESOURCE_ATTR(hw_ver); +EHCA_RESOURCE_ATTR(max_eq); +EHCA_RESOURCE_ATTR(cur_eq); +EHCA_RESOURCE_ATTR(max_cq); +EHCA_RESOURCE_ATTR(cur_cq); +EHCA_RESOURCE_ATTR(max_qp); +EHCA_RESOURCE_ATTR(cur_qp); +EHCA_RESOURCE_ATTR(max_mr); +EHCA_RESOURCE_ATTR(cur_mr); +EHCA_RESOURCE_ATTR(max_mw); +EHCA_RESOURCE_ATTR(cur_mw); +EHCA_RESOURCE_ATTR(max_pd); +EHCA_RESOURCE_ATTR(max_ah); + +static ssize_t ehca_show_adapter_handle(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ehca_shca *shca = dev_get_drvdata(dev); + + return sprintf(buf, "%llx\n", shca->ipz_hca_handle.handle); + +} +static DEVICE_ATTR(adapter_handle, S_IRUGO, ehca_show_adapter_handle, NULL); + +static struct attribute *ehca_dev_attrs[] = { + &dev_attr_adapter_handle.attr, + &dev_attr_num_ports.attr, + &dev_attr_hw_ver.attr, + &dev_attr_max_eq.attr, + &dev_attr_cur_eq.attr, + &dev_attr_max_cq.attr, + &dev_attr_cur_cq.attr, + &dev_attr_max_qp.attr, + &dev_attr_cur_qp.attr, + &dev_attr_max_mr.attr, + &dev_attr_cur_mr.attr, + &dev_attr_max_mw.attr, + &dev_attr_cur_mw.attr, + &dev_attr_max_pd.attr, + &dev_attr_max_ah.attr, + NULL +}; + +static struct attribute_group ehca_dev_attr_grp = { + .attrs = ehca_dev_attrs +}; + +static int ehca_probe(struct platform_device *dev) +{ + struct ehca_shca *shca; + const u64 *handle; + struct ib_pd *ibpd; + int ret, i, eq_size; + unsigned long flags; + + handle = of_get_property(dev->dev.of_node, "ibm,hca-handle", NULL); + if (!handle) { + ehca_gen_err("Cannot get eHCA handle for adapter: %s.", + dev->dev.of_node->full_name); + return -ENODEV; + } + + if (!(*handle)) { + ehca_gen_err("Wrong eHCA handle for adapter: %s.", + dev->dev.of_node->full_name); + return -ENODEV; + } + + shca = (struct ehca_shca *)ib_alloc_device(sizeof(*shca)); + if (!shca) { + ehca_gen_err("Cannot allocate shca memory."); + return -ENOMEM; + } + + mutex_init(&shca->modify_mutex); + atomic_set(&shca->num_cqs, 0); + atomic_set(&shca->num_qps, 0); + shca->max_num_qps = ehca_max_qp; + shca->max_num_cqs = ehca_max_cq; + + for (i = 0; i < ARRAY_SIZE(shca->sport); i++) + spin_lock_init(&shca->sport[i].mod_sqp_lock); + + shca->ofdev = dev; + shca->ipz_hca_handle.handle = *handle; + dev_set_drvdata(&dev->dev, shca); + + ret = ehca_sense_attributes(shca); + if (ret < 0) { + ehca_gen_err("Cannot sense eHCA attributes."); + goto probe1; + } + + ret = ehca_init_device(shca); + if (ret) { + ehca_gen_err("Cannot init ehca device struct"); + goto probe1; + } + + eq_size = 2 * shca->max_num_cqs + 4 * shca->max_num_qps; + /* create event queues */ + ret = ehca_create_eq(shca, &shca->eq, EHCA_EQ, eq_size); + if (ret) { + ehca_err(&shca->ib_device, "Cannot create EQ."); + goto probe1; + } + + ret = ehca_create_eq(shca, &shca->neq, EHCA_NEQ, 513); + if (ret) { + ehca_err(&shca->ib_device, "Cannot create NEQ."); + goto probe3; + } + + /* create internal protection domain */ + ibpd = ehca_alloc_pd(&shca->ib_device, (void *)(-1), NULL); + if (IS_ERR(ibpd)) { + ehca_err(&shca->ib_device, "Cannot create internal PD."); + ret = PTR_ERR(ibpd); + goto probe4; + } + + shca->pd = container_of(ibpd, struct ehca_pd, ib_pd); + shca->pd->ib_pd.device = &shca->ib_device; + + /* create internal max MR */ + ret = ehca_reg_internal_maxmr(shca, shca->pd, &shca->maxmr); + + if (ret) { + ehca_err(&shca->ib_device, "Cannot create internal MR ret=%i", + ret); + goto probe5; + } + + ret = ib_register_device(&shca->ib_device, NULL); + if (ret) { + ehca_err(&shca->ib_device, + "ib_register_device() failed ret=%i", ret); + goto probe6; + } + + /* create AQP1 for port 1 */ + if (ehca_open_aqp1 == 1) { + shca->sport[0].port_state = IB_PORT_DOWN; + ret = ehca_create_aqp1(shca, 1); + if (ret) { + ehca_err(&shca->ib_device, + "Cannot create AQP1 for port 1."); + goto probe7; + } + } + + /* create AQP1 for port 2 */ + if ((ehca_open_aqp1 == 1) && (shca->num_ports == 2)) { + shca->sport[1].port_state = IB_PORT_DOWN; + ret = ehca_create_aqp1(shca, 2); + if (ret) { + ehca_err(&shca->ib_device, + "Cannot create AQP1 for port 2."); + goto probe8; + } + } + + ret = sysfs_create_group(&dev->dev.kobj, &ehca_dev_attr_grp); + if (ret) /* only complain; we can live without attributes */ + ehca_err(&shca->ib_device, + "Cannot create device attributes ret=%d", ret); + + spin_lock_irqsave(&shca_list_lock, flags); + list_add(&shca->shca_list, &shca_list); + spin_unlock_irqrestore(&shca_list_lock, flags); + + return 0; + +probe8: + ret = ehca_destroy_aqp1(&shca->sport[0]); + if (ret) + ehca_err(&shca->ib_device, + "Cannot destroy AQP1 for port 1. ret=%i", ret); + +probe7: + ib_unregister_device(&shca->ib_device); + +probe6: + ret = ehca_dereg_internal_maxmr(shca); + if (ret) + ehca_err(&shca->ib_device, + "Cannot destroy internal MR. ret=%x", ret); + +probe5: + ret = ehca_dealloc_pd(&shca->pd->ib_pd); + if (ret) + ehca_err(&shca->ib_device, + "Cannot destroy internal PD. ret=%x", ret); + +probe4: + ret = ehca_destroy_eq(shca, &shca->neq); + if (ret) + ehca_err(&shca->ib_device, + "Cannot destroy NEQ. ret=%x", ret); + +probe3: + ret = ehca_destroy_eq(shca, &shca->eq); + if (ret) + ehca_err(&shca->ib_device, + "Cannot destroy EQ. ret=%x", ret); + +probe1: + ib_dealloc_device(&shca->ib_device); + + return -EINVAL; +} + +static int ehca_remove(struct platform_device *dev) +{ + struct ehca_shca *shca = dev_get_drvdata(&dev->dev); + unsigned long flags; + int ret; + + sysfs_remove_group(&dev->dev.kobj, &ehca_dev_attr_grp); + + if (ehca_open_aqp1 == 1) { + int i; + for (i = 0; i < shca->num_ports; i++) { + ret = ehca_destroy_aqp1(&shca->sport[i]); + if (ret) + ehca_err(&shca->ib_device, + "Cannot destroy AQP1 for port %x " + "ret=%i", ret, i); + } + } + + ib_unregister_device(&shca->ib_device); + + ret = ehca_dereg_internal_maxmr(shca); + if (ret) + ehca_err(&shca->ib_device, + "Cannot destroy internal MR. ret=%i", ret); + + ret = ehca_dealloc_pd(&shca->pd->ib_pd); + if (ret) + ehca_err(&shca->ib_device, + "Cannot destroy internal PD. ret=%i", ret); + + ret = ehca_destroy_eq(shca, &shca->eq); + if (ret) + ehca_err(&shca->ib_device, "Cannot destroy EQ. ret=%i", ret); + + ret = ehca_destroy_eq(shca, &shca->neq); + if (ret) + ehca_err(&shca->ib_device, "Canot destroy NEQ. ret=%i", ret); + + ib_dealloc_device(&shca->ib_device); + + spin_lock_irqsave(&shca_list_lock, flags); + list_del(&shca->shca_list); + spin_unlock_irqrestore(&shca_list_lock, flags); + + return ret; +} + +static struct of_device_id ehca_device_table[] = +{ + { + .name = "lhca", + .compatible = "IBM,lhca", + }, + {}, +}; +MODULE_DEVICE_TABLE(of, ehca_device_table); + +static struct platform_driver ehca_driver = { + .probe = ehca_probe, + .remove = ehca_remove, + .driver = { + .name = "ehca", + .owner = THIS_MODULE, + .groups = ehca_drv_attr_groups, + .of_match_table = ehca_device_table, + }, +}; + +void ehca_poll_eqs(unsigned long data) +{ + struct ehca_shca *shca; + + spin_lock(&shca_list_lock); + list_for_each_entry(shca, &shca_list, shca_list) { + if (shca->eq.is_initialized) { + /* call deadman proc only if eq ptr does not change */ + struct ehca_eq *eq = &shca->eq; + int max = 3; + volatile u64 q_ofs, q_ofs2; + unsigned long flags; + spin_lock_irqsave(&eq->spinlock, flags); + q_ofs = eq->ipz_queue.current_q_offset; + spin_unlock_irqrestore(&eq->spinlock, flags); + do { + spin_lock_irqsave(&eq->spinlock, flags); + q_ofs2 = eq->ipz_queue.current_q_offset; + spin_unlock_irqrestore(&eq->spinlock, flags); + max--; + } while (q_ofs == q_ofs2 && max > 0); + if (q_ofs == q_ofs2) + ehca_process_eq(shca, 0); + } + } + mod_timer(&poll_eqs_timer, round_jiffies(jiffies + HZ)); + spin_unlock(&shca_list_lock); +} + +static int ehca_mem_notifier(struct notifier_block *nb, + unsigned long action, void *data) +{ + static unsigned long ehca_dmem_warn_time; + unsigned long flags; + + switch (action) { + case MEM_CANCEL_OFFLINE: + case MEM_CANCEL_ONLINE: + case MEM_ONLINE: + case MEM_OFFLINE: + return NOTIFY_OK; + case MEM_GOING_ONLINE: + case MEM_GOING_OFFLINE: + /* only ok if no hca is attached to the lpar */ + spin_lock_irqsave(&shca_list_lock, flags); + if (list_empty(&shca_list)) { + spin_unlock_irqrestore(&shca_list_lock, flags); + return NOTIFY_OK; + } else { + spin_unlock_irqrestore(&shca_list_lock, flags); + if (printk_timed_ratelimit(&ehca_dmem_warn_time, + 30 * 1000)) + ehca_gen_err("DMEM operations are not allowed" + "in conjunction with eHCA"); + return NOTIFY_BAD; + } + } + return NOTIFY_OK; +} + +static struct notifier_block ehca_mem_nb = { + .notifier_call = ehca_mem_notifier, +}; + +static int __init ehca_module_init(void) +{ + int ret; + + printk(KERN_INFO "eHCA Infiniband Device Driver " + "(Version " HCAD_VERSION ")\n"); + + ret = ehca_create_comp_pool(); + if (ret) { + ehca_gen_err("Cannot create comp pool."); + return ret; + } + + ret = ehca_create_slab_caches(); + if (ret) { + ehca_gen_err("Cannot create SLAB caches"); + ret = -ENOMEM; + goto module_init1; + } + + ret = ehca_create_busmap(); + if (ret) { + ehca_gen_err("Cannot create busmap."); + goto module_init2; + } + + ret = ibmebus_register_driver(&ehca_driver); + if (ret) { + ehca_gen_err("Cannot register eHCA device driver"); + ret = -EINVAL; + goto module_init3; + } + + ret = register_memory_notifier(&ehca_mem_nb); + if (ret) { + ehca_gen_err("Failed registering memory add/remove notifier"); + goto module_init4; + } + + if (ehca_poll_all_eqs != 1) { + ehca_gen_err("WARNING!!!"); + ehca_gen_err("It is possible to lose interrupts."); + } else { + init_timer(&poll_eqs_timer); + poll_eqs_timer.function = ehca_poll_eqs; + poll_eqs_timer.expires = jiffies + HZ; + add_timer(&poll_eqs_timer); + } + + return 0; + +module_init4: + ibmebus_unregister_driver(&ehca_driver); + +module_init3: + ehca_destroy_busmap(); + +module_init2: + ehca_destroy_slab_caches(); + +module_init1: + ehca_destroy_comp_pool(); + return ret; +}; + +static void __exit ehca_module_exit(void) +{ + if (ehca_poll_all_eqs == 1) + del_timer_sync(&poll_eqs_timer); + + ibmebus_unregister_driver(&ehca_driver); + + unregister_memory_notifier(&ehca_mem_nb); + + ehca_destroy_busmap(); + + ehca_destroy_slab_caches(); + + ehca_destroy_comp_pool(); + + idr_destroy(&ehca_cq_idr); + idr_destroy(&ehca_qp_idr); +}; + +module_init(ehca_module_init); +module_exit(ehca_module_exit); diff --git a/drivers/infiniband/hw/ehca/ehca_mcast.c b/drivers/infiniband/hw/ehca/ehca_mcast.c new file mode 100644 index 0000000..120aedf --- /dev/null +++ b/drivers/infiniband/hw/ehca/ehca_mcast.c @@ -0,0 +1,131 @@ +/* + * IBM eServer eHCA Infiniband device driver for Linux on POWER + * + * mcast functions + * + * Authors: Khadija Souissi + * Waleri Fomin + * Reinhard Ernst + * Hoang-Nam Nguyen + * Heiko J Schick + * + * Copyright (c) 2005 IBM Corporation + * + * All rights reserved. + * + * This source code is distributed under a dual license of GPL v2.0 and OpenIB + * BSD. + * + * OpenIB BSD License + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include "ehca_classes.h" +#include "ehca_tools.h" +#include "ehca_qes.h" +#include "ehca_iverbs.h" +#include "hcp_if.h" + +#define MAX_MC_LID 0xFFFE +#define MIN_MC_LID 0xC000 /* Multicast limits */ +#define EHCA_VALID_MULTICAST_GID(gid) ((gid)[0] == 0xFF) +#define EHCA_VALID_MULTICAST_LID(lid) \ + (((lid) >= MIN_MC_LID) && ((lid) <= MAX_MC_LID)) + +int ehca_attach_mcast(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) +{ + struct ehca_qp *my_qp = container_of(ibqp, struct ehca_qp, ib_qp); + struct ehca_shca *shca = container_of(ibqp->device, struct ehca_shca, + ib_device); + union ib_gid my_gid; + u64 subnet_prefix, interface_id, h_ret; + + if (ibqp->qp_type != IB_QPT_UD) { + ehca_err(ibqp->device, "invalid qp_type=%x", ibqp->qp_type); + return -EINVAL; + } + + if (!(EHCA_VALID_MULTICAST_GID(gid->raw))) { + ehca_err(ibqp->device, "invalid mulitcast gid"); + return -EINVAL; + } else if ((lid < MIN_MC_LID) || (lid > MAX_MC_LID)) { + ehca_err(ibqp->device, "invalid mulitcast lid=%x", lid); + return -EINVAL; + } + + memcpy(&my_gid.raw, gid->raw, sizeof(union ib_gid)); + + subnet_prefix = be64_to_cpu(my_gid.global.subnet_prefix); + interface_id = be64_to_cpu(my_gid.global.interface_id); + h_ret = hipz_h_attach_mcqp(shca->ipz_hca_handle, + my_qp->ipz_qp_handle, + my_qp->galpas.kernel, + lid, subnet_prefix, interface_id); + if (h_ret != H_SUCCESS) + ehca_err(ibqp->device, + "ehca_qp=%p qp_num=%x hipz_h_attach_mcqp() failed " + "h_ret=%lli", my_qp, ibqp->qp_num, h_ret); + + return ehca2ib_return_code(h_ret); +} + +int ehca_detach_mcast(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) +{ + struct ehca_qp *my_qp = container_of(ibqp, struct ehca_qp, ib_qp); + struct ehca_shca *shca = container_of(ibqp->pd->device, + struct ehca_shca, ib_device); + union ib_gid my_gid; + u64 subnet_prefix, interface_id, h_ret; + + if (ibqp->qp_type != IB_QPT_UD) { + ehca_err(ibqp->device, "invalid qp_type %x", ibqp->qp_type); + return -EINVAL; + } + + if (!(EHCA_VALID_MULTICAST_GID(gid->raw))) { + ehca_err(ibqp->device, "invalid mulitcast gid"); + return -EINVAL; + } else if ((lid < MIN_MC_LID) || (lid > MAX_MC_LID)) { + ehca_err(ibqp->device, "invalid mulitcast lid=%x", lid); + return -EINVAL; + } + + memcpy(&my_gid.raw, gid->raw, sizeof(union ib_gid)); + + subnet_prefix = be64_to_cpu(my_gid.global.subnet_prefix); + interface_id = be64_to_cpu(my_gid.global.interface_id); + h_ret = hipz_h_detach_mcqp(shca->ipz_hca_handle, + my_qp->ipz_qp_handle, + my_qp->galpas.kernel, + lid, subnet_prefix, interface_id); + if (h_ret != H_SUCCESS) + ehca_err(ibqp->device, + "ehca_qp=%p qp_num=%x hipz_h_detach_mcqp() failed " + "h_ret=%lli", my_qp, ibqp->qp_num, h_ret); + + return ehca2ib_return_code(h_ret); +} diff --git a/drivers/infiniband/hw/ehca/ehca_mrmw.c b/drivers/infiniband/hw/ehca/ehca_mrmw.c new file mode 100644 index 0000000..3488e8c --- /dev/null +++ b/drivers/infiniband/hw/ehca/ehca_mrmw.c @@ -0,0 +1,2593 @@ +/* + * IBM eServer eHCA Infiniband device driver for Linux on POWER + * + * MR/MW functions + * + * Authors: Dietmar Decker + * Christoph Raisch + * Hoang-Nam Nguyen + * + * Copyright (c) 2005 IBM Corporation + * + * All rights reserved. + * + * This source code is distributed under a dual license of GPL v2.0 and OpenIB + * BSD. + * + * OpenIB BSD License + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include + +#include "ehca_iverbs.h" +#include "ehca_mrmw.h" +#include "hcp_if.h" +#include "hipz_hw.h" + +#define NUM_CHUNKS(length, chunk_size) \ + (((length) + (chunk_size - 1)) / (chunk_size)) + +/* max number of rpages (per hcall register_rpages) */ +#define MAX_RPAGES 512 + +/* DMEM toleration management */ +#define EHCA_SECTSHIFT SECTION_SIZE_BITS +#define EHCA_SECTSIZE (1UL << EHCA_SECTSHIFT) +#define EHCA_HUGEPAGESHIFT 34 +#define EHCA_HUGEPAGE_SIZE (1UL << EHCA_HUGEPAGESHIFT) +#define EHCA_HUGEPAGE_PFN_MASK ((EHCA_HUGEPAGE_SIZE - 1) >> PAGE_SHIFT) +#define EHCA_INVAL_ADDR 0xFFFFFFFFFFFFFFFFULL +#define EHCA_DIR_INDEX_SHIFT 13 /* 8k Entries in 64k block */ +#define EHCA_TOP_INDEX_SHIFT (EHCA_DIR_INDEX_SHIFT * 2) +#define EHCA_MAP_ENTRIES (1 << EHCA_DIR_INDEX_SHIFT) +#define EHCA_TOP_MAP_SIZE (0x10000) /* currently fixed map size */ +#define EHCA_DIR_MAP_SIZE (0x10000) +#define EHCA_ENT_MAP_SIZE (0x10000) +#define EHCA_INDEX_MASK (EHCA_MAP_ENTRIES - 1) + +static unsigned long ehca_mr_len; + +/* + * Memory map data structures + */ +struct ehca_dir_bmap { + u64 ent[EHCA_MAP_ENTRIES]; +}; +struct ehca_top_bmap { + struct ehca_dir_bmap *dir[EHCA_MAP_ENTRIES]; +}; +struct ehca_bmap { + struct ehca_top_bmap *top[EHCA_MAP_ENTRIES]; +}; + +static struct ehca_bmap *ehca_bmap; + +static struct kmem_cache *mr_cache; +static struct kmem_cache *mw_cache; + +enum ehca_mr_pgsize { + EHCA_MR_PGSIZE4K = 0x1000L, + EHCA_MR_PGSIZE64K = 0x10000L, + EHCA_MR_PGSIZE1M = 0x100000L, + EHCA_MR_PGSIZE16M = 0x1000000L +}; + +#define EHCA_MR_PGSHIFT4K 12 +#define EHCA_MR_PGSHIFT64K 16 +#define EHCA_MR_PGSHIFT1M 20 +#define EHCA_MR_PGSHIFT16M 24 + +static u64 ehca_map_vaddr(void *caddr); + +static u32 ehca_encode_hwpage_size(u32 pgsize) +{ + int log = ilog2(pgsize); + WARN_ON(log < 12 || log > 24 || log & 3); + return (log - 12) / 4; +} + +static u64 ehca_get_max_hwpage_size(struct ehca_shca *shca) +{ + return rounddown_pow_of_two(shca->hca_cap_mr_pgsize); +} + +static struct ehca_mr *ehca_mr_new(void) +{ + struct ehca_mr *me; + + me = kmem_cache_zalloc(mr_cache, GFP_KERNEL); + if (me) + spin_lock_init(&me->mrlock); + else + ehca_gen_err("alloc failed"); + + return me; +} + +static void ehca_mr_delete(struct ehca_mr *me) +{ + kmem_cache_free(mr_cache, me); +} + +static struct ehca_mw *ehca_mw_new(void) +{ + struct ehca_mw *me; + + me = kmem_cache_zalloc(mw_cache, GFP_KERNEL); + if (me) + spin_lock_init(&me->mwlock); + else + ehca_gen_err("alloc failed"); + + return me; +} + +static void ehca_mw_delete(struct ehca_mw *me) +{ + kmem_cache_free(mw_cache, me); +} + +/*----------------------------------------------------------------------*/ + +struct ib_mr *ehca_get_dma_mr(struct ib_pd *pd, int mr_access_flags) +{ + struct ib_mr *ib_mr; + int ret; + struct ehca_mr *e_maxmr; + struct ehca_pd *e_pd = container_of(pd, struct ehca_pd, ib_pd); + struct ehca_shca *shca = + container_of(pd->device, struct ehca_shca, ib_device); + + if (shca->maxmr) { + e_maxmr = ehca_mr_new(); + if (!e_maxmr) { + ehca_err(&shca->ib_device, "out of memory"); + ib_mr = ERR_PTR(-ENOMEM); + goto get_dma_mr_exit0; + } + + ret = ehca_reg_maxmr(shca, e_maxmr, + (void *)ehca_map_vaddr((void *)(KERNELBASE + PHYSICAL_START)), + mr_access_flags, e_pd, + &e_maxmr->ib.ib_mr.lkey, + &e_maxmr->ib.ib_mr.rkey); + if (ret) { + ehca_mr_delete(e_maxmr); + ib_mr = ERR_PTR(ret); + goto get_dma_mr_exit0; + } + ib_mr = &e_maxmr->ib.ib_mr; + } else { + ehca_err(&shca->ib_device, "no internal max-MR exist!"); + ib_mr = ERR_PTR(-EINVAL); + goto get_dma_mr_exit0; + } + +get_dma_mr_exit0: + if (IS_ERR(ib_mr)) + ehca_err(&shca->ib_device, "h_ret=%li pd=%p mr_access_flags=%x", + PTR_ERR(ib_mr), pd, mr_access_flags); + return ib_mr; +} /* end ehca_get_dma_mr() */ + +/*----------------------------------------------------------------------*/ + +struct ib_mr *ehca_reg_phys_mr(struct ib_pd *pd, + struct ib_phys_buf *phys_buf_array, + int num_phys_buf, + int mr_access_flags, + u64 *iova_start) +{ + struct ib_mr *ib_mr; + int ret; + struct ehca_mr *e_mr; + struct ehca_shca *shca = + container_of(pd->device, struct ehca_shca, ib_device); + struct ehca_pd *e_pd = container_of(pd, struct ehca_pd, ib_pd); + + u64 size; + + if ((num_phys_buf <= 0) || !phys_buf_array) { + ehca_err(pd->device, "bad input values: num_phys_buf=%x " + "phys_buf_array=%p", num_phys_buf, phys_buf_array); + ib_mr = ERR_PTR(-EINVAL); + goto reg_phys_mr_exit0; + } + if (((mr_access_flags & IB_ACCESS_REMOTE_WRITE) && + !(mr_access_flags & IB_ACCESS_LOCAL_WRITE)) || + ((mr_access_flags & IB_ACCESS_REMOTE_ATOMIC) && + !(mr_access_flags & IB_ACCESS_LOCAL_WRITE))) { + /* + * Remote Write Access requires Local Write Access + * Remote Atomic Access requires Local Write Access + */ + ehca_err(pd->device, "bad input values: mr_access_flags=%x", + mr_access_flags); + ib_mr = ERR_PTR(-EINVAL); + goto reg_phys_mr_exit0; + } + + /* check physical buffer list and calculate size */ + ret = ehca_mr_chk_buf_and_calc_size(phys_buf_array, num_phys_buf, + iova_start, &size); + if (ret) { + ib_mr = ERR_PTR(ret); + goto reg_phys_mr_exit0; + } + if ((size == 0) || + (((u64)iova_start + size) < (u64)iova_start)) { + ehca_err(pd->device, "bad input values: size=%llx iova_start=%p", + size, iova_start); + ib_mr = ERR_PTR(-EINVAL); + goto reg_phys_mr_exit0; + } + + e_mr = ehca_mr_new(); + if (!e_mr) { + ehca_err(pd->device, "out of memory"); + ib_mr = ERR_PTR(-ENOMEM); + goto reg_phys_mr_exit0; + } + + /* register MR on HCA */ + if (ehca_mr_is_maxmr(size, iova_start)) { + e_mr->flags |= EHCA_MR_FLAG_MAXMR; + ret = ehca_reg_maxmr(shca, e_mr, iova_start, mr_access_flags, + e_pd, &e_mr->ib.ib_mr.lkey, + &e_mr->ib.ib_mr.rkey); + if (ret) { + ib_mr = ERR_PTR(ret); + goto reg_phys_mr_exit1; + } + } else { + struct ehca_mr_pginfo pginfo; + u32 num_kpages; + u32 num_hwpages; + u64 hw_pgsize; + + num_kpages = NUM_CHUNKS(((u64)iova_start % PAGE_SIZE) + size, + PAGE_SIZE); + /* for kernel space we try most possible pgsize */ + hw_pgsize = ehca_get_max_hwpage_size(shca); + num_hwpages = NUM_CHUNKS(((u64)iova_start % hw_pgsize) + size, + hw_pgsize); + memset(&pginfo, 0, sizeof(pginfo)); + pginfo.type = EHCA_MR_PGI_PHYS; + pginfo.num_kpages = num_kpages; + pginfo.hwpage_size = hw_pgsize; + pginfo.num_hwpages = num_hwpages; + pginfo.u.phy.num_phys_buf = num_phys_buf; + pginfo.u.phy.phys_buf_array = phys_buf_array; + pginfo.next_hwpage = + ((u64)iova_start & ~PAGE_MASK) / hw_pgsize; + + ret = ehca_reg_mr(shca, e_mr, iova_start, size, mr_access_flags, + e_pd, &pginfo, &e_mr->ib.ib_mr.lkey, + &e_mr->ib.ib_mr.rkey, EHCA_REG_MR); + if (ret) { + ib_mr = ERR_PTR(ret); + goto reg_phys_mr_exit1; + } + } + + /* successful registration of all pages */ + return &e_mr->ib.ib_mr; + +reg_phys_mr_exit1: + ehca_mr_delete(e_mr); +reg_phys_mr_exit0: + if (IS_ERR(ib_mr)) + ehca_err(pd->device, "h_ret=%li pd=%p phys_buf_array=%p " + "num_phys_buf=%x mr_access_flags=%x iova_start=%p", + PTR_ERR(ib_mr), pd, phys_buf_array, + num_phys_buf, mr_access_flags, iova_start); + return ib_mr; +} /* end ehca_reg_phys_mr() */ + +/*----------------------------------------------------------------------*/ + +struct ib_mr *ehca_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, + u64 virt, int mr_access_flags, + struct ib_udata *udata) +{ + struct ib_mr *ib_mr; + struct ehca_mr *e_mr; + struct ehca_shca *shca = + container_of(pd->device, struct ehca_shca, ib_device); + struct ehca_pd *e_pd = container_of(pd, struct ehca_pd, ib_pd); + struct ehca_mr_pginfo pginfo; + int ret, page_shift; + u32 num_kpages; + u32 num_hwpages; + u64 hwpage_size; + + if (!pd) { + ehca_gen_err("bad pd=%p", pd); + return ERR_PTR(-EFAULT); + } + + if (((mr_access_flags & IB_ACCESS_REMOTE_WRITE) && + !(mr_access_flags & IB_ACCESS_LOCAL_WRITE)) || + ((mr_access_flags & IB_ACCESS_REMOTE_ATOMIC) && + !(mr_access_flags & IB_ACCESS_LOCAL_WRITE))) { + /* + * Remote Write Access requires Local Write Access + * Remote Atomic Access requires Local Write Access + */ + ehca_err(pd->device, "bad input values: mr_access_flags=%x", + mr_access_flags); + ib_mr = ERR_PTR(-EINVAL); + goto reg_user_mr_exit0; + } + + if (length == 0 || virt + length < virt) { + ehca_err(pd->device, "bad input values: length=%llx " + "virt_base=%llx", length, virt); + ib_mr = ERR_PTR(-EINVAL); + goto reg_user_mr_exit0; + } + + e_mr = ehca_mr_new(); + if (!e_mr) { + ehca_err(pd->device, "out of memory"); + ib_mr = ERR_PTR(-ENOMEM); + goto reg_user_mr_exit0; + } + + e_mr->umem = ib_umem_get(pd->uobject->context, start, length, + mr_access_flags, 0); + if (IS_ERR(e_mr->umem)) { + ib_mr = (void *)e_mr->umem; + goto reg_user_mr_exit1; + } + + if (e_mr->umem->page_size != PAGE_SIZE) { + ehca_err(pd->device, "page size not supported, " + "e_mr->umem->page_size=%x", e_mr->umem->page_size); + ib_mr = ERR_PTR(-EINVAL); + goto reg_user_mr_exit2; + } + + /* determine number of MR pages */ + num_kpages = NUM_CHUNKS((virt % PAGE_SIZE) + length, PAGE_SIZE); + /* select proper hw_pgsize */ + page_shift = PAGE_SHIFT; + if (e_mr->umem->hugetlb) { + /* determine page_shift, clamp between 4K and 16M */ + page_shift = (fls64(length - 1) + 3) & ~3; + page_shift = min(max(page_shift, EHCA_MR_PGSHIFT4K), + EHCA_MR_PGSHIFT16M); + } + hwpage_size = 1UL << page_shift; + + /* now that we have the desired page size, shift until it's + * supported, too. 4K is always supported, so this terminates. + */ + while (!(hwpage_size & shca->hca_cap_mr_pgsize)) + hwpage_size >>= 4; + +reg_user_mr_fallback: + num_hwpages = NUM_CHUNKS((virt % hwpage_size) + length, hwpage_size); + /* register MR on HCA */ + memset(&pginfo, 0, sizeof(pginfo)); + pginfo.type = EHCA_MR_PGI_USER; + pginfo.hwpage_size = hwpage_size; + pginfo.num_kpages = num_kpages; + pginfo.num_hwpages = num_hwpages; + pginfo.u.usr.region = e_mr->umem; + pginfo.next_hwpage = e_mr->umem->offset / hwpage_size; + pginfo.u.usr.next_sg = pginfo.u.usr.region->sg_head.sgl; + ret = ehca_reg_mr(shca, e_mr, (u64 *)virt, length, mr_access_flags, + e_pd, &pginfo, &e_mr->ib.ib_mr.lkey, + &e_mr->ib.ib_mr.rkey, EHCA_REG_MR); + if (ret == -EINVAL && pginfo.hwpage_size > PAGE_SIZE) { + ehca_warn(pd->device, "failed to register mr " + "with hwpage_size=%llx", hwpage_size); + ehca_info(pd->device, "try to register mr with " + "kpage_size=%lx", PAGE_SIZE); + /* + * this means kpages are not contiguous for a hw page + * try kernel page size as fallback solution + */ + hwpage_size = PAGE_SIZE; + goto reg_user_mr_fallback; + } + if (ret) { + ib_mr = ERR_PTR(ret); + goto reg_user_mr_exit2; + } + + /* successful registration of all pages */ + return &e_mr->ib.ib_mr; + +reg_user_mr_exit2: + ib_umem_release(e_mr->umem); +reg_user_mr_exit1: + ehca_mr_delete(e_mr); +reg_user_mr_exit0: + if (IS_ERR(ib_mr)) + ehca_err(pd->device, "rc=%li pd=%p mr_access_flags=%x udata=%p", + PTR_ERR(ib_mr), pd, mr_access_flags, udata); + return ib_mr; +} /* end ehca_reg_user_mr() */ + +/*----------------------------------------------------------------------*/ + +int ehca_rereg_phys_mr(struct ib_mr *mr, + int mr_rereg_mask, + struct ib_pd *pd, + struct ib_phys_buf *phys_buf_array, + int num_phys_buf, + int mr_access_flags, + u64 *iova_start) +{ + int ret; + + struct ehca_shca *shca = + container_of(mr->device, struct ehca_shca, ib_device); + struct ehca_mr *e_mr = container_of(mr, struct ehca_mr, ib.ib_mr); + u64 new_size; + u64 *new_start; + u32 new_acl; + struct ehca_pd *new_pd; + u32 tmp_lkey, tmp_rkey; + unsigned long sl_flags; + u32 num_kpages = 0; + u32 num_hwpages = 0; + struct ehca_mr_pginfo pginfo; + + if (!(mr_rereg_mask & IB_MR_REREG_TRANS)) { + /* TODO not supported, because PHYP rereg hCall needs pages */ + ehca_err(mr->device, "rereg without IB_MR_REREG_TRANS not " + "supported yet, mr_rereg_mask=%x", mr_rereg_mask); + ret = -EINVAL; + goto rereg_phys_mr_exit0; + } + + if (mr_rereg_mask & IB_MR_REREG_PD) { + if (!pd) { + ehca_err(mr->device, "rereg with bad pd, pd=%p " + "mr_rereg_mask=%x", pd, mr_rereg_mask); + ret = -EINVAL; + goto rereg_phys_mr_exit0; + } + } + + if ((mr_rereg_mask & + ~(IB_MR_REREG_TRANS | IB_MR_REREG_PD | IB_MR_REREG_ACCESS)) || + (mr_rereg_mask == 0)) { + ret = -EINVAL; + goto rereg_phys_mr_exit0; + } + + /* check other parameters */ + if (e_mr == shca->maxmr) { + /* should be impossible, however reject to be sure */ + ehca_err(mr->device, "rereg internal max-MR impossible, mr=%p " + "shca->maxmr=%p mr->lkey=%x", + mr, shca->maxmr, mr->lkey); + ret = -EINVAL; + goto rereg_phys_mr_exit0; + } + if (mr_rereg_mask & IB_MR_REREG_TRANS) { /* transl., i.e. addr/size */ + if (e_mr->flags & EHCA_MR_FLAG_FMR) { + ehca_err(mr->device, "not supported for FMR, mr=%p " + "flags=%x", mr, e_mr->flags); + ret = -EINVAL; + goto rereg_phys_mr_exit0; + } + if (!phys_buf_array || num_phys_buf <= 0) { + ehca_err(mr->device, "bad input values mr_rereg_mask=%x" + " phys_buf_array=%p num_phys_buf=%x", + mr_rereg_mask, phys_buf_array, num_phys_buf); + ret = -EINVAL; + goto rereg_phys_mr_exit0; + } + } + if ((mr_rereg_mask & IB_MR_REREG_ACCESS) && /* change ACL */ + (((mr_access_flags & IB_ACCESS_REMOTE_WRITE) && + !(mr_access_flags & IB_ACCESS_LOCAL_WRITE)) || + ((mr_access_flags & IB_ACCESS_REMOTE_ATOMIC) && + !(mr_access_flags & IB_ACCESS_LOCAL_WRITE)))) { + /* + * Remote Write Access requires Local Write Access + * Remote Atomic Access requires Local Write Access + */ + ehca_err(mr->device, "bad input values: mr_rereg_mask=%x " + "mr_access_flags=%x", mr_rereg_mask, mr_access_flags); + ret = -EINVAL; + goto rereg_phys_mr_exit0; + } + + /* set requested values dependent on rereg request */ + spin_lock_irqsave(&e_mr->mrlock, sl_flags); + new_start = e_mr->start; + new_size = e_mr->size; + new_acl = e_mr->acl; + new_pd = container_of(mr->pd, struct ehca_pd, ib_pd); + + if (mr_rereg_mask & IB_MR_REREG_TRANS) { + u64 hw_pgsize = ehca_get_max_hwpage_size(shca); + + new_start = iova_start; /* change address */ + /* check physical buffer list and calculate size */ + ret = ehca_mr_chk_buf_and_calc_size(phys_buf_array, + num_phys_buf, iova_start, + &new_size); + if (ret) + goto rereg_phys_mr_exit1; + if ((new_size == 0) || + (((u64)iova_start + new_size) < (u64)iova_start)) { + ehca_err(mr->device, "bad input values: new_size=%llx " + "iova_start=%p", new_size, iova_start); + ret = -EINVAL; + goto rereg_phys_mr_exit1; + } + num_kpages = NUM_CHUNKS(((u64)new_start % PAGE_SIZE) + + new_size, PAGE_SIZE); + num_hwpages = NUM_CHUNKS(((u64)new_start % hw_pgsize) + + new_size, hw_pgsize); + memset(&pginfo, 0, sizeof(pginfo)); + pginfo.type = EHCA_MR_PGI_PHYS; + pginfo.num_kpages = num_kpages; + pginfo.hwpage_size = hw_pgsize; + pginfo.num_hwpages = num_hwpages; + pginfo.u.phy.num_phys_buf = num_phys_buf; + pginfo.u.phy.phys_buf_array = phys_buf_array; + pginfo.next_hwpage = + ((u64)iova_start & ~PAGE_MASK) / hw_pgsize; + } + if (mr_rereg_mask & IB_MR_REREG_ACCESS) + new_acl = mr_access_flags; + if (mr_rereg_mask & IB_MR_REREG_PD) + new_pd = container_of(pd, struct ehca_pd, ib_pd); + + ret = ehca_rereg_mr(shca, e_mr, new_start, new_size, new_acl, + new_pd, &pginfo, &tmp_lkey, &tmp_rkey); + if (ret) + goto rereg_phys_mr_exit1; + + /* successful reregistration */ + if (mr_rereg_mask & IB_MR_REREG_PD) + mr->pd = pd; + mr->lkey = tmp_lkey; + mr->rkey = tmp_rkey; + +rereg_phys_mr_exit1: + spin_unlock_irqrestore(&e_mr->mrlock, sl_flags); +rereg_phys_mr_exit0: + if (ret) + ehca_err(mr->device, "ret=%i mr=%p mr_rereg_mask=%x pd=%p " + "phys_buf_array=%p num_phys_buf=%x mr_access_flags=%x " + "iova_start=%p", + ret, mr, mr_rereg_mask, pd, phys_buf_array, + num_phys_buf, mr_access_flags, iova_start); + return ret; +} /* end ehca_rereg_phys_mr() */ + +/*----------------------------------------------------------------------*/ + +int ehca_query_mr(struct ib_mr *mr, struct ib_mr_attr *mr_attr) +{ + int ret = 0; + u64 h_ret; + struct ehca_shca *shca = + container_of(mr->device, struct ehca_shca, ib_device); + struct ehca_mr *e_mr = container_of(mr, struct ehca_mr, ib.ib_mr); + unsigned long sl_flags; + struct ehca_mr_hipzout_parms hipzout; + + if ((e_mr->flags & EHCA_MR_FLAG_FMR)) { + ehca_err(mr->device, "not supported for FMR, mr=%p e_mr=%p " + "e_mr->flags=%x", mr, e_mr, e_mr->flags); + ret = -EINVAL; + goto query_mr_exit0; + } + + memset(mr_attr, 0, sizeof(struct ib_mr_attr)); + spin_lock_irqsave(&e_mr->mrlock, sl_flags); + + h_ret = hipz_h_query_mr(shca->ipz_hca_handle, e_mr, &hipzout); + if (h_ret != H_SUCCESS) { + ehca_err(mr->device, "hipz_mr_query failed, h_ret=%lli mr=%p " + "hca_hndl=%llx mr_hndl=%llx lkey=%x", + h_ret, mr, shca->ipz_hca_handle.handle, + e_mr->ipz_mr_handle.handle, mr->lkey); + ret = ehca2ib_return_code(h_ret); + goto query_mr_exit1; + } + mr_attr->pd = mr->pd; + mr_attr->device_virt_addr = hipzout.vaddr; + mr_attr->size = hipzout.len; + mr_attr->lkey = hipzout.lkey; + mr_attr->rkey = hipzout.rkey; + ehca_mrmw_reverse_map_acl(&hipzout.acl, &mr_attr->mr_access_flags); + +query_mr_exit1: + spin_unlock_irqrestore(&e_mr->mrlock, sl_flags); +query_mr_exit0: + if (ret) + ehca_err(mr->device, "ret=%i mr=%p mr_attr=%p", + ret, mr, mr_attr); + return ret; +} /* end ehca_query_mr() */ + +/*----------------------------------------------------------------------*/ + +int ehca_dereg_mr(struct ib_mr *mr) +{ + int ret = 0; + u64 h_ret; + struct ehca_shca *shca = + container_of(mr->device, struct ehca_shca, ib_device); + struct ehca_mr *e_mr = container_of(mr, struct ehca_mr, ib.ib_mr); + + if ((e_mr->flags & EHCA_MR_FLAG_FMR)) { + ehca_err(mr->device, "not supported for FMR, mr=%p e_mr=%p " + "e_mr->flags=%x", mr, e_mr, e_mr->flags); + ret = -EINVAL; + goto dereg_mr_exit0; + } else if (e_mr == shca->maxmr) { + /* should be impossible, however reject to be sure */ + ehca_err(mr->device, "dereg internal max-MR impossible, mr=%p " + "shca->maxmr=%p mr->lkey=%x", + mr, shca->maxmr, mr->lkey); + ret = -EINVAL; + goto dereg_mr_exit0; + } + + /* TODO: BUSY: MR still has bound window(s) */ + h_ret = hipz_h_free_resource_mr(shca->ipz_hca_handle, e_mr); + if (h_ret != H_SUCCESS) { + ehca_err(mr->device, "hipz_free_mr failed, h_ret=%lli shca=%p " + "e_mr=%p hca_hndl=%llx mr_hndl=%llx mr->lkey=%x", + h_ret, shca, e_mr, shca->ipz_hca_handle.handle, + e_mr->ipz_mr_handle.handle, mr->lkey); + ret = ehca2ib_return_code(h_ret); + goto dereg_mr_exit0; + } + + if (e_mr->umem) + ib_umem_release(e_mr->umem); + + /* successful deregistration */ + ehca_mr_delete(e_mr); + +dereg_mr_exit0: + if (ret) + ehca_err(mr->device, "ret=%i mr=%p", ret, mr); + return ret; +} /* end ehca_dereg_mr() */ + +/*----------------------------------------------------------------------*/ + +struct ib_mw *ehca_alloc_mw(struct ib_pd *pd, enum ib_mw_type type) +{ + struct ib_mw *ib_mw; + u64 h_ret; + struct ehca_mw *e_mw; + struct ehca_pd *e_pd = container_of(pd, struct ehca_pd, ib_pd); + struct ehca_shca *shca = + container_of(pd->device, struct ehca_shca, ib_device); + struct ehca_mw_hipzout_parms hipzout; + + if (type != IB_MW_TYPE_1) + return ERR_PTR(-EINVAL); + + e_mw = ehca_mw_new(); + if (!e_mw) { + ib_mw = ERR_PTR(-ENOMEM); + goto alloc_mw_exit0; + } + + h_ret = hipz_h_alloc_resource_mw(shca->ipz_hca_handle, e_mw, + e_pd->fw_pd, &hipzout); + if (h_ret != H_SUCCESS) { + ehca_err(pd->device, "hipz_mw_allocate failed, h_ret=%lli " + "shca=%p hca_hndl=%llx mw=%p", + h_ret, shca, shca->ipz_hca_handle.handle, e_mw); + ib_mw = ERR_PTR(ehca2ib_return_code(h_ret)); + goto alloc_mw_exit1; + } + /* successful MW allocation */ + e_mw->ipz_mw_handle = hipzout.handle; + e_mw->ib_mw.rkey = hipzout.rkey; + return &e_mw->ib_mw; + +alloc_mw_exit1: + ehca_mw_delete(e_mw); +alloc_mw_exit0: + if (IS_ERR(ib_mw)) + ehca_err(pd->device, "h_ret=%li pd=%p", PTR_ERR(ib_mw), pd); + return ib_mw; +} /* end ehca_alloc_mw() */ + +/*----------------------------------------------------------------------*/ + +int ehca_bind_mw(struct ib_qp *qp, + struct ib_mw *mw, + struct ib_mw_bind *mw_bind) +{ + /* TODO: not supported up to now */ + ehca_gen_err("bind MW currently not supported by HCAD"); + + return -EPERM; +} /* end ehca_bind_mw() */ + +/*----------------------------------------------------------------------*/ + +int ehca_dealloc_mw(struct ib_mw *mw) +{ + u64 h_ret; + struct ehca_shca *shca = + container_of(mw->device, struct ehca_shca, ib_device); + struct ehca_mw *e_mw = container_of(mw, struct ehca_mw, ib_mw); + + h_ret = hipz_h_free_resource_mw(shca->ipz_hca_handle, e_mw); + if (h_ret != H_SUCCESS) { + ehca_err(mw->device, "hipz_free_mw failed, h_ret=%lli shca=%p " + "mw=%p rkey=%x hca_hndl=%llx mw_hndl=%llx", + h_ret, shca, mw, mw->rkey, shca->ipz_hca_handle.handle, + e_mw->ipz_mw_handle.handle); + return ehca2ib_return_code(h_ret); + } + /* successful deallocation */ + ehca_mw_delete(e_mw); + return 0; +} /* end ehca_dealloc_mw() */ + +/*----------------------------------------------------------------------*/ + +struct ib_fmr *ehca_alloc_fmr(struct ib_pd *pd, + int mr_access_flags, + struct ib_fmr_attr *fmr_attr) +{ + struct ib_fmr *ib_fmr; + struct ehca_shca *shca = + container_of(pd->device, struct ehca_shca, ib_device); + struct ehca_pd *e_pd = container_of(pd, struct ehca_pd, ib_pd); + struct ehca_mr *e_fmr; + int ret; + u32 tmp_lkey, tmp_rkey; + struct ehca_mr_pginfo pginfo; + u64 hw_pgsize; + + /* check other parameters */ + if (((mr_access_flags & IB_ACCESS_REMOTE_WRITE) && + !(mr_access_flags & IB_ACCESS_LOCAL_WRITE)) || + ((mr_access_flags & IB_ACCESS_REMOTE_ATOMIC) && + !(mr_access_flags & IB_ACCESS_LOCAL_WRITE))) { + /* + * Remote Write Access requires Local Write Access + * Remote Atomic Access requires Local Write Access + */ + ehca_err(pd->device, "bad input values: mr_access_flags=%x", + mr_access_flags); + ib_fmr = ERR_PTR(-EINVAL); + goto alloc_fmr_exit0; + } + if (mr_access_flags & IB_ACCESS_MW_BIND) { + ehca_err(pd->device, "bad input values: mr_access_flags=%x", + mr_access_flags); + ib_fmr = ERR_PTR(-EINVAL); + goto alloc_fmr_exit0; + } + if ((fmr_attr->max_pages == 0) || (fmr_attr->max_maps == 0)) { + ehca_err(pd->device, "bad input values: fmr_attr->max_pages=%x " + "fmr_attr->max_maps=%x fmr_attr->page_shift=%x", + fmr_attr->max_pages, fmr_attr->max_maps, + fmr_attr->page_shift); + ib_fmr = ERR_PTR(-EINVAL); + goto alloc_fmr_exit0; + } + + hw_pgsize = 1 << fmr_attr->page_shift; + if (!(hw_pgsize & shca->hca_cap_mr_pgsize)) { + ehca_err(pd->device, "unsupported fmr_attr->page_shift=%x", + fmr_attr->page_shift); + ib_fmr = ERR_PTR(-EINVAL); + goto alloc_fmr_exit0; + } + + e_fmr = ehca_mr_new(); + if (!e_fmr) { + ib_fmr = ERR_PTR(-ENOMEM); + goto alloc_fmr_exit0; + } + e_fmr->flags |= EHCA_MR_FLAG_FMR; + + /* register MR on HCA */ + memset(&pginfo, 0, sizeof(pginfo)); + pginfo.hwpage_size = hw_pgsize; + /* + * pginfo.num_hwpages==0, ie register_rpages() will not be called + * but deferred to map_phys_fmr() + */ + ret = ehca_reg_mr(shca, e_fmr, NULL, + fmr_attr->max_pages * (1 << fmr_attr->page_shift), + mr_access_flags, e_pd, &pginfo, + &tmp_lkey, &tmp_rkey, EHCA_REG_MR); + if (ret) { + ib_fmr = ERR_PTR(ret); + goto alloc_fmr_exit1; + } + + /* successful */ + e_fmr->hwpage_size = hw_pgsize; + e_fmr->fmr_page_size = 1 << fmr_attr->page_shift; + e_fmr->fmr_max_pages = fmr_attr->max_pages; + e_fmr->fmr_max_maps = fmr_attr->max_maps; + e_fmr->fmr_map_cnt = 0; + return &e_fmr->ib.ib_fmr; + +alloc_fmr_exit1: + ehca_mr_delete(e_fmr); +alloc_fmr_exit0: + return ib_fmr; +} /* end ehca_alloc_fmr() */ + +/*----------------------------------------------------------------------*/ + +int ehca_map_phys_fmr(struct ib_fmr *fmr, + u64 *page_list, + int list_len, + u64 iova) +{ + int ret; + struct ehca_shca *shca = + container_of(fmr->device, struct ehca_shca, ib_device); + struct ehca_mr *e_fmr = container_of(fmr, struct ehca_mr, ib.ib_fmr); + struct ehca_pd *e_pd = container_of(fmr->pd, struct ehca_pd, ib_pd); + struct ehca_mr_pginfo pginfo; + u32 tmp_lkey, tmp_rkey; + + if (!(e_fmr->flags & EHCA_MR_FLAG_FMR)) { + ehca_err(fmr->device, "not a FMR, e_fmr=%p e_fmr->flags=%x", + e_fmr, e_fmr->flags); + ret = -EINVAL; + goto map_phys_fmr_exit0; + } + ret = ehca_fmr_check_page_list(e_fmr, page_list, list_len); + if (ret) + goto map_phys_fmr_exit0; + if (iova % e_fmr->fmr_page_size) { + /* only whole-numbered pages */ + ehca_err(fmr->device, "bad iova, iova=%llx fmr_page_size=%x", + iova, e_fmr->fmr_page_size); + ret = -EINVAL; + goto map_phys_fmr_exit0; + } + if (e_fmr->fmr_map_cnt >= e_fmr->fmr_max_maps) { + /* HCAD does not limit the maps, however trace this anyway */ + ehca_info(fmr->device, "map limit exceeded, fmr=%p " + "e_fmr->fmr_map_cnt=%x e_fmr->fmr_max_maps=%x", + fmr, e_fmr->fmr_map_cnt, e_fmr->fmr_max_maps); + } + + memset(&pginfo, 0, sizeof(pginfo)); + pginfo.type = EHCA_MR_PGI_FMR; + pginfo.num_kpages = list_len; + pginfo.hwpage_size = e_fmr->hwpage_size; + pginfo.num_hwpages = + list_len * e_fmr->fmr_page_size / pginfo.hwpage_size; + pginfo.u.fmr.page_list = page_list; + pginfo.next_hwpage = + (iova & (e_fmr->fmr_page_size-1)) / pginfo.hwpage_size; + pginfo.u.fmr.fmr_pgsize = e_fmr->fmr_page_size; + + ret = ehca_rereg_mr(shca, e_fmr, (u64 *)iova, + list_len * e_fmr->fmr_page_size, + e_fmr->acl, e_pd, &pginfo, &tmp_lkey, &tmp_rkey); + if (ret) + goto map_phys_fmr_exit0; + + /* successful reregistration */ + e_fmr->fmr_map_cnt++; + e_fmr->ib.ib_fmr.lkey = tmp_lkey; + e_fmr->ib.ib_fmr.rkey = tmp_rkey; + return 0; + +map_phys_fmr_exit0: + if (ret) + ehca_err(fmr->device, "ret=%i fmr=%p page_list=%p list_len=%x " + "iova=%llx", ret, fmr, page_list, list_len, iova); + return ret; +} /* end ehca_map_phys_fmr() */ + +/*----------------------------------------------------------------------*/ + +int ehca_unmap_fmr(struct list_head *fmr_list) +{ + int ret = 0; + struct ib_fmr *ib_fmr; + struct ehca_shca *shca = NULL; + struct ehca_shca *prev_shca; + struct ehca_mr *e_fmr; + u32 num_fmr = 0; + u32 unmap_fmr_cnt = 0; + + /* check all FMR belong to same SHCA, and check internal flag */ + list_for_each_entry(ib_fmr, fmr_list, list) { + prev_shca = shca; + shca = container_of(ib_fmr->device, struct ehca_shca, + ib_device); + e_fmr = container_of(ib_fmr, struct ehca_mr, ib.ib_fmr); + if ((shca != prev_shca) && prev_shca) { + ehca_err(&shca->ib_device, "SHCA mismatch, shca=%p " + "prev_shca=%p e_fmr=%p", + shca, prev_shca, e_fmr); + ret = -EINVAL; + goto unmap_fmr_exit0; + } + if (!(e_fmr->flags & EHCA_MR_FLAG_FMR)) { + ehca_err(&shca->ib_device, "not a FMR, e_fmr=%p " + "e_fmr->flags=%x", e_fmr, e_fmr->flags); + ret = -EINVAL; + goto unmap_fmr_exit0; + } + num_fmr++; + } + + /* loop over all FMRs to unmap */ + list_for_each_entry(ib_fmr, fmr_list, list) { + unmap_fmr_cnt++; + e_fmr = container_of(ib_fmr, struct ehca_mr, ib.ib_fmr); + shca = container_of(ib_fmr->device, struct ehca_shca, + ib_device); + ret = ehca_unmap_one_fmr(shca, e_fmr); + if (ret) { + /* unmap failed, stop unmapping of rest of FMRs */ + ehca_err(&shca->ib_device, "unmap of one FMR failed, " + "stop rest, e_fmr=%p num_fmr=%x " + "unmap_fmr_cnt=%x lkey=%x", e_fmr, num_fmr, + unmap_fmr_cnt, e_fmr->ib.ib_fmr.lkey); + goto unmap_fmr_exit0; + } + } + +unmap_fmr_exit0: + if (ret) + ehca_gen_err("ret=%i fmr_list=%p num_fmr=%x unmap_fmr_cnt=%x", + ret, fmr_list, num_fmr, unmap_fmr_cnt); + return ret; +} /* end ehca_unmap_fmr() */ + +/*----------------------------------------------------------------------*/ + +int ehca_dealloc_fmr(struct ib_fmr *fmr) +{ + int ret; + u64 h_ret; + struct ehca_shca *shca = + container_of(fmr->device, struct ehca_shca, ib_device); + struct ehca_mr *e_fmr = container_of(fmr, struct ehca_mr, ib.ib_fmr); + + if (!(e_fmr->flags & EHCA_MR_FLAG_FMR)) { + ehca_err(fmr->device, "not a FMR, e_fmr=%p e_fmr->flags=%x", + e_fmr, e_fmr->flags); + ret = -EINVAL; + goto free_fmr_exit0; + } + + h_ret = hipz_h_free_resource_mr(shca->ipz_hca_handle, e_fmr); + if (h_ret != H_SUCCESS) { + ehca_err(fmr->device, "hipz_free_mr failed, h_ret=%lli e_fmr=%p " + "hca_hndl=%llx fmr_hndl=%llx fmr->lkey=%x", + h_ret, e_fmr, shca->ipz_hca_handle.handle, + e_fmr->ipz_mr_handle.handle, fmr->lkey); + ret = ehca2ib_return_code(h_ret); + goto free_fmr_exit0; + } + /* successful deregistration */ + ehca_mr_delete(e_fmr); + return 0; + +free_fmr_exit0: + if (ret) + ehca_err(&shca->ib_device, "ret=%i fmr=%p", ret, fmr); + return ret; +} /* end ehca_dealloc_fmr() */ + +/*----------------------------------------------------------------------*/ + +static int ehca_reg_bmap_mr_rpages(struct ehca_shca *shca, + struct ehca_mr *e_mr, + struct ehca_mr_pginfo *pginfo); + +int ehca_reg_mr(struct ehca_shca *shca, + struct ehca_mr *e_mr, + u64 *iova_start, + u64 size, + int acl, + struct ehca_pd *e_pd, + struct ehca_mr_pginfo *pginfo, + u32 *lkey, /*OUT*/ + u32 *rkey, /*OUT*/ + enum ehca_reg_type reg_type) +{ + int ret; + u64 h_ret; + u32 hipz_acl; + struct ehca_mr_hipzout_parms hipzout; + + ehca_mrmw_map_acl(acl, &hipz_acl); + ehca_mrmw_set_pgsize_hipz_acl(pginfo->hwpage_size, &hipz_acl); + if (ehca_use_hp_mr == 1) + hipz_acl |= 0x00000001; + + h_ret = hipz_h_alloc_resource_mr(shca->ipz_hca_handle, e_mr, + (u64)iova_start, size, hipz_acl, + e_pd->fw_pd, &hipzout); + if (h_ret != H_SUCCESS) { + ehca_err(&shca->ib_device, "hipz_alloc_mr failed, h_ret=%lli " + "hca_hndl=%llx", h_ret, shca->ipz_hca_handle.handle); + ret = ehca2ib_return_code(h_ret); + goto ehca_reg_mr_exit0; + } + + e_mr->ipz_mr_handle = hipzout.handle; + + if (reg_type == EHCA_REG_BUSMAP_MR) + ret = ehca_reg_bmap_mr_rpages(shca, e_mr, pginfo); + else if (reg_type == EHCA_REG_MR) + ret = ehca_reg_mr_rpages(shca, e_mr, pginfo); + else + ret = -EINVAL; + + if (ret) + goto ehca_reg_mr_exit1; + + /* successful registration */ + e_mr->num_kpages = pginfo->num_kpages; + e_mr->num_hwpages = pginfo->num_hwpages; + e_mr->hwpage_size = pginfo->hwpage_size; + e_mr->start = iova_start; + e_mr->size = size; + e_mr->acl = acl; + *lkey = hipzout.lkey; + *rkey = hipzout.rkey; + return 0; + +ehca_reg_mr_exit1: + h_ret = hipz_h_free_resource_mr(shca->ipz_hca_handle, e_mr); + if (h_ret != H_SUCCESS) { + ehca_err(&shca->ib_device, "h_ret=%lli shca=%p e_mr=%p " + "iova_start=%p size=%llx acl=%x e_pd=%p lkey=%x " + "pginfo=%p num_kpages=%llx num_hwpages=%llx ret=%i", + h_ret, shca, e_mr, iova_start, size, acl, e_pd, + hipzout.lkey, pginfo, pginfo->num_kpages, + pginfo->num_hwpages, ret); + ehca_err(&shca->ib_device, "internal error in ehca_reg_mr, " + "not recoverable"); + } +ehca_reg_mr_exit0: + if (ret) + ehca_err(&shca->ib_device, "ret=%i shca=%p e_mr=%p " + "iova_start=%p size=%llx acl=%x e_pd=%p pginfo=%p " + "num_kpages=%llx num_hwpages=%llx", + ret, shca, e_mr, iova_start, size, acl, e_pd, pginfo, + pginfo->num_kpages, pginfo->num_hwpages); + return ret; +} /* end ehca_reg_mr() */ + +/*----------------------------------------------------------------------*/ + +int ehca_reg_mr_rpages(struct ehca_shca *shca, + struct ehca_mr *e_mr, + struct ehca_mr_pginfo *pginfo) +{ + int ret = 0; + u64 h_ret; + u32 rnum; + u64 rpage; + u32 i; + u64 *kpage; + + if (!pginfo->num_hwpages) /* in case of fmr */ + return 0; + + kpage = ehca_alloc_fw_ctrlblock(GFP_KERNEL); + if (!kpage) { + ehca_err(&shca->ib_device, "kpage alloc failed"); + ret = -ENOMEM; + goto ehca_reg_mr_rpages_exit0; + } + + /* max MAX_RPAGES ehca mr pages per register call */ + for (i = 0; i < NUM_CHUNKS(pginfo->num_hwpages, MAX_RPAGES); i++) { + + if (i == NUM_CHUNKS(pginfo->num_hwpages, MAX_RPAGES) - 1) { + rnum = pginfo->num_hwpages % MAX_RPAGES; /* last shot */ + if (rnum == 0) + rnum = MAX_RPAGES; /* last shot is full */ + } else + rnum = MAX_RPAGES; + + ret = ehca_set_pagebuf(pginfo, rnum, kpage); + if (ret) { + ehca_err(&shca->ib_device, "ehca_set_pagebuf " + "bad rc, ret=%i rnum=%x kpage=%p", + ret, rnum, kpage); + goto ehca_reg_mr_rpages_exit1; + } + + if (rnum > 1) { + rpage = __pa(kpage); + if (!rpage) { + ehca_err(&shca->ib_device, "kpage=%p i=%x", + kpage, i); + ret = -EFAULT; + goto ehca_reg_mr_rpages_exit1; + } + } else + rpage = *kpage; + + h_ret = hipz_h_register_rpage_mr( + shca->ipz_hca_handle, e_mr, + ehca_encode_hwpage_size(pginfo->hwpage_size), + 0, rpage, rnum); + + if (i == NUM_CHUNKS(pginfo->num_hwpages, MAX_RPAGES) - 1) { + /* + * check for 'registration complete'==H_SUCCESS + * and for 'page registered'==H_PAGE_REGISTERED + */ + if (h_ret != H_SUCCESS) { + ehca_err(&shca->ib_device, "last " + "hipz_reg_rpage_mr failed, h_ret=%lli " + "e_mr=%p i=%x hca_hndl=%llx mr_hndl=%llx" + " lkey=%x", h_ret, e_mr, i, + shca->ipz_hca_handle.handle, + e_mr->ipz_mr_handle.handle, + e_mr->ib.ib_mr.lkey); + ret = ehca2ib_return_code(h_ret); + break; + } else + ret = 0; + } else if (h_ret != H_PAGE_REGISTERED) { + ehca_err(&shca->ib_device, "hipz_reg_rpage_mr failed, " + "h_ret=%lli e_mr=%p i=%x lkey=%x hca_hndl=%llx " + "mr_hndl=%llx", h_ret, e_mr, i, + e_mr->ib.ib_mr.lkey, + shca->ipz_hca_handle.handle, + e_mr->ipz_mr_handle.handle); + ret = ehca2ib_return_code(h_ret); + break; + } else + ret = 0; + } /* end for(i) */ + + +ehca_reg_mr_rpages_exit1: + ehca_free_fw_ctrlblock(kpage); +ehca_reg_mr_rpages_exit0: + if (ret) + ehca_err(&shca->ib_device, "ret=%i shca=%p e_mr=%p pginfo=%p " + "num_kpages=%llx num_hwpages=%llx", ret, shca, e_mr, + pginfo, pginfo->num_kpages, pginfo->num_hwpages); + return ret; +} /* end ehca_reg_mr_rpages() */ + +/*----------------------------------------------------------------------*/ + +inline int ehca_rereg_mr_rereg1(struct ehca_shca *shca, + struct ehca_mr *e_mr, + u64 *iova_start, + u64 size, + u32 acl, + struct ehca_pd *e_pd, + struct ehca_mr_pginfo *pginfo, + u32 *lkey, /*OUT*/ + u32 *rkey) /*OUT*/ +{ + int ret; + u64 h_ret; + u32 hipz_acl; + u64 *kpage; + u64 rpage; + struct ehca_mr_pginfo pginfo_save; + struct ehca_mr_hipzout_parms hipzout; + + ehca_mrmw_map_acl(acl, &hipz_acl); + ehca_mrmw_set_pgsize_hipz_acl(pginfo->hwpage_size, &hipz_acl); + + kpage = ehca_alloc_fw_ctrlblock(GFP_KERNEL); + if (!kpage) { + ehca_err(&shca->ib_device, "kpage alloc failed"); + ret = -ENOMEM; + goto ehca_rereg_mr_rereg1_exit0; + } + + pginfo_save = *pginfo; + ret = ehca_set_pagebuf(pginfo, pginfo->num_hwpages, kpage); + if (ret) { + ehca_err(&shca->ib_device, "set pagebuf failed, e_mr=%p " + "pginfo=%p type=%x num_kpages=%llx num_hwpages=%llx " + "kpage=%p", e_mr, pginfo, pginfo->type, + pginfo->num_kpages, pginfo->num_hwpages, kpage); + goto ehca_rereg_mr_rereg1_exit1; + } + rpage = __pa(kpage); + if (!rpage) { + ehca_err(&shca->ib_device, "kpage=%p", kpage); + ret = -EFAULT; + goto ehca_rereg_mr_rereg1_exit1; + } + h_ret = hipz_h_reregister_pmr(shca->ipz_hca_handle, e_mr, + (u64)iova_start, size, hipz_acl, + e_pd->fw_pd, rpage, &hipzout); + if (h_ret != H_SUCCESS) { + /* + * reregistration unsuccessful, try it again with the 3 hCalls, + * e.g. this is required in case H_MR_CONDITION + * (MW bound or MR is shared) + */ + ehca_warn(&shca->ib_device, "hipz_h_reregister_pmr failed " + "(Rereg1), h_ret=%lli e_mr=%p", h_ret, e_mr); + *pginfo = pginfo_save; + ret = -EAGAIN; + } else if ((u64 *)hipzout.vaddr != iova_start) { + ehca_err(&shca->ib_device, "PHYP changed iova_start in " + "rereg_pmr, iova_start=%p iova_start_out=%llx e_mr=%p " + "mr_handle=%llx lkey=%x lkey_out=%x", iova_start, + hipzout.vaddr, e_mr, e_mr->ipz_mr_handle.handle, + e_mr->ib.ib_mr.lkey, hipzout.lkey); + ret = -EFAULT; + } else { + /* + * successful reregistration + * note: start and start_out are identical for eServer HCAs + */ + e_mr->num_kpages = pginfo->num_kpages; + e_mr->num_hwpages = pginfo->num_hwpages; + e_mr->hwpage_size = pginfo->hwpage_size; + e_mr->start = iova_start; + e_mr->size = size; + e_mr->acl = acl; + *lkey = hipzout.lkey; + *rkey = hipzout.rkey; + } + +ehca_rereg_mr_rereg1_exit1: + ehca_free_fw_ctrlblock(kpage); +ehca_rereg_mr_rereg1_exit0: + if ( ret && (ret != -EAGAIN) ) + ehca_err(&shca->ib_device, "ret=%i lkey=%x rkey=%x " + "pginfo=%p num_kpages=%llx num_hwpages=%llx", + ret, *lkey, *rkey, pginfo, pginfo->num_kpages, + pginfo->num_hwpages); + return ret; +} /* end ehca_rereg_mr_rereg1() */ + +/*----------------------------------------------------------------------*/ + +int ehca_rereg_mr(struct ehca_shca *shca, + struct ehca_mr *e_mr, + u64 *iova_start, + u64 size, + int acl, + struct ehca_pd *e_pd, + struct ehca_mr_pginfo *pginfo, + u32 *lkey, + u32 *rkey) +{ + int ret = 0; + u64 h_ret; + int rereg_1_hcall = 1; /* 1: use hipz_h_reregister_pmr directly */ + int rereg_3_hcall = 0; /* 1: use 3 hipz calls for reregistration */ + + /* first determine reregistration hCall(s) */ + if ((pginfo->num_hwpages > MAX_RPAGES) || + (e_mr->num_hwpages > MAX_RPAGES) || + (pginfo->num_hwpages > e_mr->num_hwpages)) { + ehca_dbg(&shca->ib_device, "Rereg3 case, " + "pginfo->num_hwpages=%llx e_mr->num_hwpages=%x", + pginfo->num_hwpages, e_mr->num_hwpages); + rereg_1_hcall = 0; + rereg_3_hcall = 1; + } + + if (e_mr->flags & EHCA_MR_FLAG_MAXMR) { /* check for max-MR */ + rereg_1_hcall = 0; + rereg_3_hcall = 1; + e_mr->flags &= ~EHCA_MR_FLAG_MAXMR; + ehca_err(&shca->ib_device, "Rereg MR for max-MR! e_mr=%p", + e_mr); + } + + if (rereg_1_hcall) { + ret = ehca_rereg_mr_rereg1(shca, e_mr, iova_start, size, + acl, e_pd, pginfo, lkey, rkey); + if (ret) { + if (ret == -EAGAIN) + rereg_3_hcall = 1; + else + goto ehca_rereg_mr_exit0; + } + } + + if (rereg_3_hcall) { + struct ehca_mr save_mr; + + /* first deregister old MR */ + h_ret = hipz_h_free_resource_mr(shca->ipz_hca_handle, e_mr); + if (h_ret != H_SUCCESS) { + ehca_err(&shca->ib_device, "hipz_free_mr failed, " + "h_ret=%lli e_mr=%p hca_hndl=%llx mr_hndl=%llx " + "mr->lkey=%x", + h_ret, e_mr, shca->ipz_hca_handle.handle, + e_mr->ipz_mr_handle.handle, + e_mr->ib.ib_mr.lkey); + ret = ehca2ib_return_code(h_ret); + goto ehca_rereg_mr_exit0; + } + /* clean ehca_mr_t, without changing struct ib_mr and lock */ + save_mr = *e_mr; + ehca_mr_deletenew(e_mr); + + /* set some MR values */ + e_mr->flags = save_mr.flags; + e_mr->hwpage_size = save_mr.hwpage_size; + e_mr->fmr_page_size = save_mr.fmr_page_size; + e_mr->fmr_max_pages = save_mr.fmr_max_pages; + e_mr->fmr_max_maps = save_mr.fmr_max_maps; + e_mr->fmr_map_cnt = save_mr.fmr_map_cnt; + + ret = ehca_reg_mr(shca, e_mr, iova_start, size, acl, + e_pd, pginfo, lkey, rkey, EHCA_REG_MR); + if (ret) { + u32 offset = (u64)(&e_mr->flags) - (u64)e_mr; + memcpy(&e_mr->flags, &(save_mr.flags), + sizeof(struct ehca_mr) - offset); + goto ehca_rereg_mr_exit0; + } + } + +ehca_rereg_mr_exit0: + if (ret) + ehca_err(&shca->ib_device, "ret=%i shca=%p e_mr=%p " + "iova_start=%p size=%llx acl=%x e_pd=%p pginfo=%p " + "num_kpages=%llx lkey=%x rkey=%x rereg_1_hcall=%x " + "rereg_3_hcall=%x", ret, shca, e_mr, iova_start, size, + acl, e_pd, pginfo, pginfo->num_kpages, *lkey, *rkey, + rereg_1_hcall, rereg_3_hcall); + return ret; +} /* end ehca_rereg_mr() */ + +/*----------------------------------------------------------------------*/ + +int ehca_unmap_one_fmr(struct ehca_shca *shca, + struct ehca_mr *e_fmr) +{ + int ret = 0; + u64 h_ret; + struct ehca_pd *e_pd = + container_of(e_fmr->ib.ib_fmr.pd, struct ehca_pd, ib_pd); + struct ehca_mr save_fmr; + u32 tmp_lkey, tmp_rkey; + struct ehca_mr_pginfo pginfo; + struct ehca_mr_hipzout_parms hipzout; + struct ehca_mr save_mr; + + if (e_fmr->fmr_max_pages <= MAX_RPAGES) { + /* + * note: after using rereg hcall with len=0, + * rereg hcall must be used again for registering pages + */ + h_ret = hipz_h_reregister_pmr(shca->ipz_hca_handle, e_fmr, 0, + 0, 0, e_pd->fw_pd, 0, &hipzout); + if (h_ret == H_SUCCESS) { + /* successful reregistration */ + e_fmr->start = NULL; + e_fmr->size = 0; + tmp_lkey = hipzout.lkey; + tmp_rkey = hipzout.rkey; + return 0; + } + /* + * should not happen, because length checked above, + * FMRs are not shared and no MW bound to FMRs + */ + ehca_err(&shca->ib_device, "hipz_reregister_pmr failed " + "(Rereg1), h_ret=%lli e_fmr=%p hca_hndl=%llx " + "mr_hndl=%llx lkey=%x lkey_out=%x", + h_ret, e_fmr, shca->ipz_hca_handle.handle, + e_fmr->ipz_mr_handle.handle, + e_fmr->ib.ib_fmr.lkey, hipzout.lkey); + /* try free and rereg */ + } + + /* first free old FMR */ + h_ret = hipz_h_free_resource_mr(shca->ipz_hca_handle, e_fmr); + if (h_ret != H_SUCCESS) { + ehca_err(&shca->ib_device, "hipz_free_mr failed, " + "h_ret=%lli e_fmr=%p hca_hndl=%llx mr_hndl=%llx " + "lkey=%x", + h_ret, e_fmr, shca->ipz_hca_handle.handle, + e_fmr->ipz_mr_handle.handle, + e_fmr->ib.ib_fmr.lkey); + ret = ehca2ib_return_code(h_ret); + goto ehca_unmap_one_fmr_exit0; + } + /* clean ehca_mr_t, without changing lock */ + save_fmr = *e_fmr; + ehca_mr_deletenew(e_fmr); + + /* set some MR values */ + e_fmr->flags = save_fmr.flags; + e_fmr->hwpage_size = save_fmr.hwpage_size; + e_fmr->fmr_page_size = save_fmr.fmr_page_size; + e_fmr->fmr_max_pages = save_fmr.fmr_max_pages; + e_fmr->fmr_max_maps = save_fmr.fmr_max_maps; + e_fmr->fmr_map_cnt = save_fmr.fmr_map_cnt; + e_fmr->acl = save_fmr.acl; + + memset(&pginfo, 0, sizeof(pginfo)); + pginfo.type = EHCA_MR_PGI_FMR; + ret = ehca_reg_mr(shca, e_fmr, NULL, + (e_fmr->fmr_max_pages * e_fmr->fmr_page_size), + e_fmr->acl, e_pd, &pginfo, &tmp_lkey, + &tmp_rkey, EHCA_REG_MR); + if (ret) { + u32 offset = (u64)(&e_fmr->flags) - (u64)e_fmr; + memcpy(&e_fmr->flags, &(save_mr.flags), + sizeof(struct ehca_mr) - offset); + } + +ehca_unmap_one_fmr_exit0: + if (ret) + ehca_err(&shca->ib_device, "ret=%i tmp_lkey=%x tmp_rkey=%x " + "fmr_max_pages=%x", + ret, tmp_lkey, tmp_rkey, e_fmr->fmr_max_pages); + return ret; +} /* end ehca_unmap_one_fmr() */ + +/*----------------------------------------------------------------------*/ + +int ehca_reg_smr(struct ehca_shca *shca, + struct ehca_mr *e_origmr, + struct ehca_mr *e_newmr, + u64 *iova_start, + int acl, + struct ehca_pd *e_pd, + u32 *lkey, /*OUT*/ + u32 *rkey) /*OUT*/ +{ + int ret = 0; + u64 h_ret; + u32 hipz_acl; + struct ehca_mr_hipzout_parms hipzout; + + ehca_mrmw_map_acl(acl, &hipz_acl); + ehca_mrmw_set_pgsize_hipz_acl(e_origmr->hwpage_size, &hipz_acl); + + h_ret = hipz_h_register_smr(shca->ipz_hca_handle, e_newmr, e_origmr, + (u64)iova_start, hipz_acl, e_pd->fw_pd, + &hipzout); + if (h_ret != H_SUCCESS) { + ehca_err(&shca->ib_device, "hipz_reg_smr failed, h_ret=%lli " + "shca=%p e_origmr=%p e_newmr=%p iova_start=%p acl=%x " + "e_pd=%p hca_hndl=%llx mr_hndl=%llx lkey=%x", + h_ret, shca, e_origmr, e_newmr, iova_start, acl, e_pd, + shca->ipz_hca_handle.handle, + e_origmr->ipz_mr_handle.handle, + e_origmr->ib.ib_mr.lkey); + ret = ehca2ib_return_code(h_ret); + goto ehca_reg_smr_exit0; + } + /* successful registration */ + e_newmr->num_kpages = e_origmr->num_kpages; + e_newmr->num_hwpages = e_origmr->num_hwpages; + e_newmr->hwpage_size = e_origmr->hwpage_size; + e_newmr->start = iova_start; + e_newmr->size = e_origmr->size; + e_newmr->acl = acl; + e_newmr->ipz_mr_handle = hipzout.handle; + *lkey = hipzout.lkey; + *rkey = hipzout.rkey; + return 0; + +ehca_reg_smr_exit0: + if (ret) + ehca_err(&shca->ib_device, "ret=%i shca=%p e_origmr=%p " + "e_newmr=%p iova_start=%p acl=%x e_pd=%p", + ret, shca, e_origmr, e_newmr, iova_start, acl, e_pd); + return ret; +} /* end ehca_reg_smr() */ + +/*----------------------------------------------------------------------*/ +static inline void *ehca_calc_sectbase(int top, int dir, int idx) +{ + unsigned long ret = idx; + ret |= dir << EHCA_DIR_INDEX_SHIFT; + ret |= top << EHCA_TOP_INDEX_SHIFT; + return __va(ret << SECTION_SIZE_BITS); +} + +#define ehca_bmap_valid(entry) \ + ((u64)entry != (u64)EHCA_INVAL_ADDR) + +static u64 ehca_reg_mr_section(int top, int dir, int idx, u64 *kpage, + struct ehca_shca *shca, struct ehca_mr *mr, + struct ehca_mr_pginfo *pginfo) +{ + u64 h_ret = 0; + unsigned long page = 0; + u64 rpage = __pa(kpage); + int page_count; + + void *sectbase = ehca_calc_sectbase(top, dir, idx); + if ((unsigned long)sectbase & (pginfo->hwpage_size - 1)) { + ehca_err(&shca->ib_device, "reg_mr_section will probably fail:" + "hwpage_size does not fit to " + "section start address"); + } + page_count = EHCA_SECTSIZE / pginfo->hwpage_size; + + while (page < page_count) { + u64 rnum; + for (rnum = 0; (rnum < MAX_RPAGES) && (page < page_count); + rnum++) { + void *pg = sectbase + ((page++) * pginfo->hwpage_size); + kpage[rnum] = __pa(pg); + } + + h_ret = hipz_h_register_rpage_mr(shca->ipz_hca_handle, mr, + ehca_encode_hwpage_size(pginfo->hwpage_size), + 0, rpage, rnum); + + if ((h_ret != H_SUCCESS) && (h_ret != H_PAGE_REGISTERED)) { + ehca_err(&shca->ib_device, "register_rpage_mr failed"); + return h_ret; + } + } + return h_ret; +} + +static u64 ehca_reg_mr_sections(int top, int dir, u64 *kpage, + struct ehca_shca *shca, struct ehca_mr *mr, + struct ehca_mr_pginfo *pginfo) +{ + u64 hret = H_SUCCESS; + int idx; + + for (idx = 0; idx < EHCA_MAP_ENTRIES; idx++) { + if (!ehca_bmap_valid(ehca_bmap->top[top]->dir[dir]->ent[idx])) + continue; + + hret = ehca_reg_mr_section(top, dir, idx, kpage, shca, mr, + pginfo); + if ((hret != H_SUCCESS) && (hret != H_PAGE_REGISTERED)) + return hret; + } + return hret; +} + +static u64 ehca_reg_mr_dir_sections(int top, u64 *kpage, struct ehca_shca *shca, + struct ehca_mr *mr, + struct ehca_mr_pginfo *pginfo) +{ + u64 hret = H_SUCCESS; + int dir; + + for (dir = 0; dir < EHCA_MAP_ENTRIES; dir++) { + if (!ehca_bmap_valid(ehca_bmap->top[top]->dir[dir])) + continue; + + hret = ehca_reg_mr_sections(top, dir, kpage, shca, mr, pginfo); + if ((hret != H_SUCCESS) && (hret != H_PAGE_REGISTERED)) + return hret; + } + return hret; +} + +/* register internal max-MR to internal SHCA */ +int ehca_reg_internal_maxmr( + struct ehca_shca *shca, + struct ehca_pd *e_pd, + struct ehca_mr **e_maxmr) /*OUT*/ +{ + int ret; + struct ehca_mr *e_mr; + u64 *iova_start; + u64 size_maxmr; + struct ehca_mr_pginfo pginfo; + struct ib_phys_buf ib_pbuf; + u32 num_kpages; + u32 num_hwpages; + u64 hw_pgsize; + + if (!ehca_bmap) { + ret = -EFAULT; + goto ehca_reg_internal_maxmr_exit0; + } + + e_mr = ehca_mr_new(); + if (!e_mr) { + ehca_err(&shca->ib_device, "out of memory"); + ret = -ENOMEM; + goto ehca_reg_internal_maxmr_exit0; + } + e_mr->flags |= EHCA_MR_FLAG_MAXMR; + + /* register internal max-MR on HCA */ + size_maxmr = ehca_mr_len; + iova_start = (u64 *)ehca_map_vaddr((void *)(KERNELBASE + PHYSICAL_START)); + ib_pbuf.addr = 0; + ib_pbuf.size = size_maxmr; + num_kpages = NUM_CHUNKS(((u64)iova_start % PAGE_SIZE) + size_maxmr, + PAGE_SIZE); + hw_pgsize = ehca_get_max_hwpage_size(shca); + num_hwpages = NUM_CHUNKS(((u64)iova_start % hw_pgsize) + size_maxmr, + hw_pgsize); + + memset(&pginfo, 0, sizeof(pginfo)); + pginfo.type = EHCA_MR_PGI_PHYS; + pginfo.num_kpages = num_kpages; + pginfo.num_hwpages = num_hwpages; + pginfo.hwpage_size = hw_pgsize; + pginfo.u.phy.num_phys_buf = 1; + pginfo.u.phy.phys_buf_array = &ib_pbuf; + + ret = ehca_reg_mr(shca, e_mr, iova_start, size_maxmr, 0, e_pd, + &pginfo, &e_mr->ib.ib_mr.lkey, + &e_mr->ib.ib_mr.rkey, EHCA_REG_BUSMAP_MR); + if (ret) { + ehca_err(&shca->ib_device, "reg of internal max MR failed, " + "e_mr=%p iova_start=%p size_maxmr=%llx num_kpages=%x " + "num_hwpages=%x", e_mr, iova_start, size_maxmr, + num_kpages, num_hwpages); + goto ehca_reg_internal_maxmr_exit1; + } + + /* successful registration of all pages */ + e_mr->ib.ib_mr.device = e_pd->ib_pd.device; + e_mr->ib.ib_mr.pd = &e_pd->ib_pd; + e_mr->ib.ib_mr.uobject = NULL; + atomic_inc(&(e_pd->ib_pd.usecnt)); + atomic_set(&(e_mr->ib.ib_mr.usecnt), 0); + *e_maxmr = e_mr; + return 0; + +ehca_reg_internal_maxmr_exit1: + ehca_mr_delete(e_mr); +ehca_reg_internal_maxmr_exit0: + if (ret) + ehca_err(&shca->ib_device, "ret=%i shca=%p e_pd=%p e_maxmr=%p", + ret, shca, e_pd, e_maxmr); + return ret; +} /* end ehca_reg_internal_maxmr() */ + +/*----------------------------------------------------------------------*/ + +int ehca_reg_maxmr(struct ehca_shca *shca, + struct ehca_mr *e_newmr, + u64 *iova_start, + int acl, + struct ehca_pd *e_pd, + u32 *lkey, + u32 *rkey) +{ + u64 h_ret; + struct ehca_mr *e_origmr = shca->maxmr; + u32 hipz_acl; + struct ehca_mr_hipzout_parms hipzout; + + ehca_mrmw_map_acl(acl, &hipz_acl); + ehca_mrmw_set_pgsize_hipz_acl(e_origmr->hwpage_size, &hipz_acl); + + h_ret = hipz_h_register_smr(shca->ipz_hca_handle, e_newmr, e_origmr, + (u64)iova_start, hipz_acl, e_pd->fw_pd, + &hipzout); + if (h_ret != H_SUCCESS) { + ehca_err(&shca->ib_device, "hipz_reg_smr failed, h_ret=%lli " + "e_origmr=%p hca_hndl=%llx mr_hndl=%llx lkey=%x", + h_ret, e_origmr, shca->ipz_hca_handle.handle, + e_origmr->ipz_mr_handle.handle, + e_origmr->ib.ib_mr.lkey); + return ehca2ib_return_code(h_ret); + } + /* successful registration */ + e_newmr->num_kpages = e_origmr->num_kpages; + e_newmr->num_hwpages = e_origmr->num_hwpages; + e_newmr->hwpage_size = e_origmr->hwpage_size; + e_newmr->start = iova_start; + e_newmr->size = e_origmr->size; + e_newmr->acl = acl; + e_newmr->ipz_mr_handle = hipzout.handle; + *lkey = hipzout.lkey; + *rkey = hipzout.rkey; + return 0; +} /* end ehca_reg_maxmr() */ + +/*----------------------------------------------------------------------*/ + +int ehca_dereg_internal_maxmr(struct ehca_shca *shca) +{ + int ret; + struct ehca_mr *e_maxmr; + struct ib_pd *ib_pd; + + if (!shca->maxmr) { + ehca_err(&shca->ib_device, "bad call, shca=%p", shca); + ret = -EINVAL; + goto ehca_dereg_internal_maxmr_exit0; + } + + e_maxmr = shca->maxmr; + ib_pd = e_maxmr->ib.ib_mr.pd; + shca->maxmr = NULL; /* remove internal max-MR indication from SHCA */ + + ret = ehca_dereg_mr(&e_maxmr->ib.ib_mr); + if (ret) { + ehca_err(&shca->ib_device, "dereg internal max-MR failed, " + "ret=%i e_maxmr=%p shca=%p lkey=%x", + ret, e_maxmr, shca, e_maxmr->ib.ib_mr.lkey); + shca->maxmr = e_maxmr; + goto ehca_dereg_internal_maxmr_exit0; + } + + atomic_dec(&ib_pd->usecnt); + +ehca_dereg_internal_maxmr_exit0: + if (ret) + ehca_err(&shca->ib_device, "ret=%i shca=%p shca->maxmr=%p", + ret, shca, shca->maxmr); + return ret; +} /* end ehca_dereg_internal_maxmr() */ + +/*----------------------------------------------------------------------*/ + +/* + * check physical buffer array of MR verbs for validness and + * calculates MR size + */ +int ehca_mr_chk_buf_and_calc_size(struct ib_phys_buf *phys_buf_array, + int num_phys_buf, + u64 *iova_start, + u64 *size) +{ + struct ib_phys_buf *pbuf = phys_buf_array; + u64 size_count = 0; + u32 i; + + if (num_phys_buf == 0) { + ehca_gen_err("bad phys buf array len, num_phys_buf=0"); + return -EINVAL; + } + /* check first buffer */ + if (((u64)iova_start & ~PAGE_MASK) != (pbuf->addr & ~PAGE_MASK)) { + ehca_gen_err("iova_start/addr mismatch, iova_start=%p " + "pbuf->addr=%llx pbuf->size=%llx", + iova_start, pbuf->addr, pbuf->size); + return -EINVAL; + } + if (((pbuf->addr + pbuf->size) % PAGE_SIZE) && + (num_phys_buf > 1)) { + ehca_gen_err("addr/size mismatch in 1st buf, pbuf->addr=%llx " + "pbuf->size=%llx", pbuf->addr, pbuf->size); + return -EINVAL; + } + + for (i = 0; i < num_phys_buf; i++) { + if ((i > 0) && (pbuf->addr % PAGE_SIZE)) { + ehca_gen_err("bad address, i=%x pbuf->addr=%llx " + "pbuf->size=%llx", + i, pbuf->addr, pbuf->size); + return -EINVAL; + } + if (((i > 0) && /* not 1st */ + (i < (num_phys_buf - 1)) && /* not last */ + (pbuf->size % PAGE_SIZE)) || (pbuf->size == 0)) { + ehca_gen_err("bad size, i=%x pbuf->size=%llx", + i, pbuf->size); + return -EINVAL; + } + size_count += pbuf->size; + pbuf++; + } + + *size = size_count; + return 0; +} /* end ehca_mr_chk_buf_and_calc_size() */ + +/*----------------------------------------------------------------------*/ + +/* check page list of map FMR verb for validness */ +int ehca_fmr_check_page_list(struct ehca_mr *e_fmr, + u64 *page_list, + int list_len) +{ + u32 i; + u64 *page; + + if ((list_len == 0) || (list_len > e_fmr->fmr_max_pages)) { + ehca_gen_err("bad list_len, list_len=%x " + "e_fmr->fmr_max_pages=%x fmr=%p", + list_len, e_fmr->fmr_max_pages, e_fmr); + return -EINVAL; + } + + /* each page must be aligned */ + page = page_list; + for (i = 0; i < list_len; i++) { + if (*page % e_fmr->fmr_page_size) { + ehca_gen_err("bad page, i=%x *page=%llx page=%p fmr=%p " + "fmr_page_size=%x", i, *page, page, e_fmr, + e_fmr->fmr_page_size); + return -EINVAL; + } + page++; + } + + return 0; +} /* end ehca_fmr_check_page_list() */ + +/*----------------------------------------------------------------------*/ + +/* PAGE_SIZE >= pginfo->hwpage_size */ +static int ehca_set_pagebuf_user1(struct ehca_mr_pginfo *pginfo, + u32 number, + u64 *kpage) +{ + int ret = 0; + u64 pgaddr; + u32 j = 0; + int hwpages_per_kpage = PAGE_SIZE / pginfo->hwpage_size; + struct scatterlist **sg = &pginfo->u.usr.next_sg; + + while (*sg != NULL) { + pgaddr = page_to_pfn(sg_page(*sg)) + << PAGE_SHIFT; + *kpage = pgaddr + (pginfo->next_hwpage * + pginfo->hwpage_size); + if (!(*kpage)) { + ehca_gen_err("pgaddr=%llx " + "sg_dma_address=%llx " + "entry=%llx next_hwpage=%llx", + pgaddr, (u64)sg_dma_address(*sg), + pginfo->u.usr.next_nmap, + pginfo->next_hwpage); + return -EFAULT; + } + (pginfo->hwpage_cnt)++; + (pginfo->next_hwpage)++; + kpage++; + if (pginfo->next_hwpage % hwpages_per_kpage == 0) { + (pginfo->kpage_cnt)++; + (pginfo->u.usr.next_nmap)++; + pginfo->next_hwpage = 0; + *sg = sg_next(*sg); + } + j++; + if (j >= number) + break; + } + + return ret; +} + +/* + * check given pages for contiguous layout + * last page addr is returned in prev_pgaddr for further check + */ +static int ehca_check_kpages_per_ate(struct scatterlist **sg, + int num_pages, + u64 *prev_pgaddr) +{ + for (; *sg && num_pages > 0; *sg = sg_next(*sg), num_pages--) { + u64 pgaddr = page_to_pfn(sg_page(*sg)) << PAGE_SHIFT; + if (ehca_debug_level >= 3) + ehca_gen_dbg("chunk_page=%llx value=%016llx", pgaddr, + *(u64 *)__va(pgaddr)); + if (pgaddr - PAGE_SIZE != *prev_pgaddr) { + ehca_gen_err("uncontiguous page found pgaddr=%llx " + "prev_pgaddr=%llx entries_left_in_hwpage=%x", + pgaddr, *prev_pgaddr, num_pages); + return -EINVAL; + } + *prev_pgaddr = pgaddr; + } + return 0; +} + +/* PAGE_SIZE < pginfo->hwpage_size */ +static int ehca_set_pagebuf_user2(struct ehca_mr_pginfo *pginfo, + u32 number, + u64 *kpage) +{ + int ret = 0; + u64 pgaddr, prev_pgaddr; + u32 j = 0; + int kpages_per_hwpage = pginfo->hwpage_size / PAGE_SIZE; + int nr_kpages = kpages_per_hwpage; + struct scatterlist **sg = &pginfo->u.usr.next_sg; + + while (*sg != NULL) { + + if (nr_kpages == kpages_per_hwpage) { + pgaddr = (page_to_pfn(sg_page(*sg)) + << PAGE_SHIFT); + *kpage = pgaddr; + if (!(*kpage)) { + ehca_gen_err("pgaddr=%llx entry=%llx", + pgaddr, pginfo->u.usr.next_nmap); + ret = -EFAULT; + return ret; + } + /* + * The first page in a hwpage must be aligned; + * the first MR page is exempt from this rule. + */ + if (pgaddr & (pginfo->hwpage_size - 1)) { + if (pginfo->hwpage_cnt) { + ehca_gen_err( + "invalid alignment " + "pgaddr=%llx entry=%llx " + "mr_pgsize=%llx", + pgaddr, pginfo->u.usr.next_nmap, + pginfo->hwpage_size); + ret = -EFAULT; + return ret; + } + /* first MR page */ + pginfo->kpage_cnt = + (pgaddr & + (pginfo->hwpage_size - 1)) >> + PAGE_SHIFT; + nr_kpages -= pginfo->kpage_cnt; + *kpage = pgaddr & + ~(pginfo->hwpage_size - 1); + } + if (ehca_debug_level >= 3) { + u64 val = *(u64 *)__va(pgaddr); + ehca_gen_dbg("kpage=%llx page=%llx " + "value=%016llx", + *kpage, pgaddr, val); + } + prev_pgaddr = pgaddr; + *sg = sg_next(*sg); + pginfo->kpage_cnt++; + pginfo->u.usr.next_nmap++; + nr_kpages--; + if (!nr_kpages) + goto next_kpage; + continue; + } + + ret = ehca_check_kpages_per_ate(sg, nr_kpages, + &prev_pgaddr); + if (ret) + return ret; + pginfo->kpage_cnt += nr_kpages; + pginfo->u.usr.next_nmap += nr_kpages; + +next_kpage: + nr_kpages = kpages_per_hwpage; + (pginfo->hwpage_cnt)++; + kpage++; + j++; + if (j >= number) + break; + } + + return ret; +} + +static int ehca_set_pagebuf_phys(struct ehca_mr_pginfo *pginfo, + u32 number, u64 *kpage) +{ + int ret = 0; + struct ib_phys_buf *pbuf; + u64 num_hw, offs_hw; + u32 i = 0; + + /* loop over desired phys_buf_array entries */ + while (i < number) { + pbuf = pginfo->u.phy.phys_buf_array + pginfo->u.phy.next_buf; + num_hw = NUM_CHUNKS((pbuf->addr % pginfo->hwpage_size) + + pbuf->size, pginfo->hwpage_size); + offs_hw = (pbuf->addr & ~(pginfo->hwpage_size - 1)) / + pginfo->hwpage_size; + while (pginfo->next_hwpage < offs_hw + num_hw) { + /* sanity check */ + if ((pginfo->kpage_cnt >= pginfo->num_kpages) || + (pginfo->hwpage_cnt >= pginfo->num_hwpages)) { + ehca_gen_err("kpage_cnt >= num_kpages, " + "kpage_cnt=%llx num_kpages=%llx " + "hwpage_cnt=%llx " + "num_hwpages=%llx i=%x", + pginfo->kpage_cnt, + pginfo->num_kpages, + pginfo->hwpage_cnt, + pginfo->num_hwpages, i); + return -EFAULT; + } + *kpage = (pbuf->addr & ~(pginfo->hwpage_size - 1)) + + (pginfo->next_hwpage * pginfo->hwpage_size); + if ( !(*kpage) && pbuf->addr ) { + ehca_gen_err("pbuf->addr=%llx pbuf->size=%llx " + "next_hwpage=%llx", pbuf->addr, + pbuf->size, pginfo->next_hwpage); + return -EFAULT; + } + (pginfo->hwpage_cnt)++; + (pginfo->next_hwpage)++; + if (PAGE_SIZE >= pginfo->hwpage_size) { + if (pginfo->next_hwpage % + (PAGE_SIZE / pginfo->hwpage_size) == 0) + (pginfo->kpage_cnt)++; + } else + pginfo->kpage_cnt += pginfo->hwpage_size / + PAGE_SIZE; + kpage++; + i++; + if (i >= number) break; + } + if (pginfo->next_hwpage >= offs_hw + num_hw) { + (pginfo->u.phy.next_buf)++; + pginfo->next_hwpage = 0; + } + } + return ret; +} + +static int ehca_set_pagebuf_fmr(struct ehca_mr_pginfo *pginfo, + u32 number, u64 *kpage) +{ + int ret = 0; + u64 *fmrlist; + u32 i; + + /* loop over desired page_list entries */ + fmrlist = pginfo->u.fmr.page_list + pginfo->u.fmr.next_listelem; + for (i = 0; i < number; i++) { + *kpage = (*fmrlist & ~(pginfo->hwpage_size - 1)) + + pginfo->next_hwpage * pginfo->hwpage_size; + if ( !(*kpage) ) { + ehca_gen_err("*fmrlist=%llx fmrlist=%p " + "next_listelem=%llx next_hwpage=%llx", + *fmrlist, fmrlist, + pginfo->u.fmr.next_listelem, + pginfo->next_hwpage); + return -EFAULT; + } + (pginfo->hwpage_cnt)++; + if (pginfo->u.fmr.fmr_pgsize >= pginfo->hwpage_size) { + if (pginfo->next_hwpage % + (pginfo->u.fmr.fmr_pgsize / + pginfo->hwpage_size) == 0) { + (pginfo->kpage_cnt)++; + (pginfo->u.fmr.next_listelem)++; + fmrlist++; + pginfo->next_hwpage = 0; + } else + (pginfo->next_hwpage)++; + } else { + unsigned int cnt_per_hwpage = pginfo->hwpage_size / + pginfo->u.fmr.fmr_pgsize; + unsigned int j; + u64 prev = *kpage; + /* check if adrs are contiguous */ + for (j = 1; j < cnt_per_hwpage; j++) { + u64 p = fmrlist[j] & ~(pginfo->hwpage_size - 1); + if (prev + pginfo->u.fmr.fmr_pgsize != p) { + ehca_gen_err("uncontiguous fmr pages " + "found prev=%llx p=%llx " + "idx=%x", prev, p, i + j); + return -EINVAL; + } + prev = p; + } + pginfo->kpage_cnt += cnt_per_hwpage; + pginfo->u.fmr.next_listelem += cnt_per_hwpage; + fmrlist += cnt_per_hwpage; + } + kpage++; + } + return ret; +} + +/* setup page buffer from page info */ +int ehca_set_pagebuf(struct ehca_mr_pginfo *pginfo, + u32 number, + u64 *kpage) +{ + int ret; + + switch (pginfo->type) { + case EHCA_MR_PGI_PHYS: + ret = ehca_set_pagebuf_phys(pginfo, number, kpage); + break; + case EHCA_MR_PGI_USER: + ret = PAGE_SIZE >= pginfo->hwpage_size ? + ehca_set_pagebuf_user1(pginfo, number, kpage) : + ehca_set_pagebuf_user2(pginfo, number, kpage); + break; + case EHCA_MR_PGI_FMR: + ret = ehca_set_pagebuf_fmr(pginfo, number, kpage); + break; + default: + ehca_gen_err("bad pginfo->type=%x", pginfo->type); + ret = -EFAULT; + break; + } + return ret; +} /* end ehca_set_pagebuf() */ + +/*----------------------------------------------------------------------*/ + +/* + * check MR if it is a max-MR, i.e. uses whole memory + * in case it's a max-MR 1 is returned, else 0 + */ +int ehca_mr_is_maxmr(u64 size, + u64 *iova_start) +{ + /* a MR is treated as max-MR only if it fits following: */ + if ((size == ehca_mr_len) && + (iova_start == (void *)ehca_map_vaddr((void *)(KERNELBASE + PHYSICAL_START)))) { + ehca_gen_dbg("this is a max-MR"); + return 1; + } else + return 0; +} /* end ehca_mr_is_maxmr() */ + +/*----------------------------------------------------------------------*/ + +/* map access control for MR/MW. This routine is used for MR and MW. */ +void ehca_mrmw_map_acl(int ib_acl, + u32 *hipz_acl) +{ + *hipz_acl = 0; + if (ib_acl & IB_ACCESS_REMOTE_READ) + *hipz_acl |= HIPZ_ACCESSCTRL_R_READ; + if (ib_acl & IB_ACCESS_REMOTE_WRITE) + *hipz_acl |= HIPZ_ACCESSCTRL_R_WRITE; + if (ib_acl & IB_ACCESS_REMOTE_ATOMIC) + *hipz_acl |= HIPZ_ACCESSCTRL_R_ATOMIC; + if (ib_acl & IB_ACCESS_LOCAL_WRITE) + *hipz_acl |= HIPZ_ACCESSCTRL_L_WRITE; + if (ib_acl & IB_ACCESS_MW_BIND) + *hipz_acl |= HIPZ_ACCESSCTRL_MW_BIND; +} /* end ehca_mrmw_map_acl() */ + +/*----------------------------------------------------------------------*/ + +/* sets page size in hipz access control for MR/MW. */ +void ehca_mrmw_set_pgsize_hipz_acl(u32 pgsize, u32 *hipz_acl) /*INOUT*/ +{ + *hipz_acl |= (ehca_encode_hwpage_size(pgsize) << 24); +} /* end ehca_mrmw_set_pgsize_hipz_acl() */ + +/*----------------------------------------------------------------------*/ + +/* + * reverse map access control for MR/MW. + * This routine is used for MR and MW. + */ +void ehca_mrmw_reverse_map_acl(const u32 *hipz_acl, + int *ib_acl) /*OUT*/ +{ + *ib_acl = 0; + if (*hipz_acl & HIPZ_ACCESSCTRL_R_READ) + *ib_acl |= IB_ACCESS_REMOTE_READ; + if (*hipz_acl & HIPZ_ACCESSCTRL_R_WRITE) + *ib_acl |= IB_ACCESS_REMOTE_WRITE; + if (*hipz_acl & HIPZ_ACCESSCTRL_R_ATOMIC) + *ib_acl |= IB_ACCESS_REMOTE_ATOMIC; + if (*hipz_acl & HIPZ_ACCESSCTRL_L_WRITE) + *ib_acl |= IB_ACCESS_LOCAL_WRITE; + if (*hipz_acl & HIPZ_ACCESSCTRL_MW_BIND) + *ib_acl |= IB_ACCESS_MW_BIND; +} /* end ehca_mrmw_reverse_map_acl() */ + + +/*----------------------------------------------------------------------*/ + +/* + * MR destructor and constructor + * used in Reregister MR verb, sets all fields in ehca_mr_t to 0, + * except struct ib_mr and spinlock + */ +void ehca_mr_deletenew(struct ehca_mr *mr) +{ + mr->flags = 0; + mr->num_kpages = 0; + mr->num_hwpages = 0; + mr->acl = 0; + mr->start = NULL; + mr->fmr_page_size = 0; + mr->fmr_max_pages = 0; + mr->fmr_max_maps = 0; + mr->fmr_map_cnt = 0; + memset(&mr->ipz_mr_handle, 0, sizeof(mr->ipz_mr_handle)); + memset(&mr->galpas, 0, sizeof(mr->galpas)); +} /* end ehca_mr_deletenew() */ + +int ehca_init_mrmw_cache(void) +{ + mr_cache = kmem_cache_create("ehca_cache_mr", + sizeof(struct ehca_mr), 0, + SLAB_HWCACHE_ALIGN, + NULL); + if (!mr_cache) + return -ENOMEM; + mw_cache = kmem_cache_create("ehca_cache_mw", + sizeof(struct ehca_mw), 0, + SLAB_HWCACHE_ALIGN, + NULL); + if (!mw_cache) { + kmem_cache_destroy(mr_cache); + mr_cache = NULL; + return -ENOMEM; + } + return 0; +} + +void ehca_cleanup_mrmw_cache(void) +{ + if (mr_cache) + kmem_cache_destroy(mr_cache); + if (mw_cache) + kmem_cache_destroy(mw_cache); +} + +static inline int ehca_init_top_bmap(struct ehca_top_bmap *ehca_top_bmap, + int dir) +{ + if (!ehca_bmap_valid(ehca_top_bmap->dir[dir])) { + ehca_top_bmap->dir[dir] = + kmalloc(sizeof(struct ehca_dir_bmap), GFP_KERNEL); + if (!ehca_top_bmap->dir[dir]) + return -ENOMEM; + /* Set map block to 0xFF according to EHCA_INVAL_ADDR */ + memset(ehca_top_bmap->dir[dir], 0xFF, EHCA_ENT_MAP_SIZE); + } + return 0; +} + +static inline int ehca_init_bmap(struct ehca_bmap *ehca_bmap, int top, int dir) +{ + if (!ehca_bmap_valid(ehca_bmap->top[top])) { + ehca_bmap->top[top] = + kmalloc(sizeof(struct ehca_top_bmap), GFP_KERNEL); + if (!ehca_bmap->top[top]) + return -ENOMEM; + /* Set map block to 0xFF according to EHCA_INVAL_ADDR */ + memset(ehca_bmap->top[top], 0xFF, EHCA_DIR_MAP_SIZE); + } + return ehca_init_top_bmap(ehca_bmap->top[top], dir); +} + +static inline int ehca_calc_index(unsigned long i, unsigned long s) +{ + return (i >> s) & EHCA_INDEX_MASK; +} + +void ehca_destroy_busmap(void) +{ + int top, dir; + + if (!ehca_bmap) + return; + + for (top = 0; top < EHCA_MAP_ENTRIES; top++) { + if (!ehca_bmap_valid(ehca_bmap->top[top])) + continue; + for (dir = 0; dir < EHCA_MAP_ENTRIES; dir++) { + if (!ehca_bmap_valid(ehca_bmap->top[top]->dir[dir])) + continue; + + kfree(ehca_bmap->top[top]->dir[dir]); + } + + kfree(ehca_bmap->top[top]); + } + + kfree(ehca_bmap); + ehca_bmap = NULL; +} + +static int ehca_update_busmap(unsigned long pfn, unsigned long nr_pages) +{ + unsigned long i, start_section, end_section; + int top, dir, idx; + + if (!nr_pages) + return 0; + + if (!ehca_bmap) { + ehca_bmap = kmalloc(sizeof(struct ehca_bmap), GFP_KERNEL); + if (!ehca_bmap) + return -ENOMEM; + /* Set map block to 0xFF according to EHCA_INVAL_ADDR */ + memset(ehca_bmap, 0xFF, EHCA_TOP_MAP_SIZE); + } + + start_section = (pfn * PAGE_SIZE) / EHCA_SECTSIZE; + end_section = ((pfn + nr_pages) * PAGE_SIZE) / EHCA_SECTSIZE; + for (i = start_section; i < end_section; i++) { + int ret; + top = ehca_calc_index(i, EHCA_TOP_INDEX_SHIFT); + dir = ehca_calc_index(i, EHCA_DIR_INDEX_SHIFT); + idx = i & EHCA_INDEX_MASK; + + ret = ehca_init_bmap(ehca_bmap, top, dir); + if (ret) { + ehca_destroy_busmap(); + return ret; + } + ehca_bmap->top[top]->dir[dir]->ent[idx] = ehca_mr_len; + ehca_mr_len += EHCA_SECTSIZE; + } + return 0; +} + +static int ehca_is_hugepage(unsigned long pfn) +{ + int page_order; + + if (pfn & EHCA_HUGEPAGE_PFN_MASK) + return 0; + + page_order = compound_order(pfn_to_page(pfn)); + if (page_order + PAGE_SHIFT != EHCA_HUGEPAGESHIFT) + return 0; + + return 1; +} + +static int ehca_create_busmap_callback(unsigned long initial_pfn, + unsigned long total_nr_pages, void *arg) +{ + int ret; + unsigned long pfn, start_pfn, end_pfn, nr_pages; + + if ((total_nr_pages * PAGE_SIZE) < EHCA_HUGEPAGE_SIZE) + return ehca_update_busmap(initial_pfn, total_nr_pages); + + /* Given chunk is >= 16GB -> check for hugepages */ + start_pfn = initial_pfn; + end_pfn = initial_pfn + total_nr_pages; + pfn = start_pfn; + + while (pfn < end_pfn) { + if (ehca_is_hugepage(pfn)) { + /* Add mem found in front of the hugepage */ + nr_pages = pfn - start_pfn; + ret = ehca_update_busmap(start_pfn, nr_pages); + if (ret) + return ret; + /* Skip the hugepage */ + pfn += (EHCA_HUGEPAGE_SIZE / PAGE_SIZE); + start_pfn = pfn; + } else + pfn += (EHCA_SECTSIZE / PAGE_SIZE); + } + + /* Add mem found behind the hugepage(s) */ + nr_pages = pfn - start_pfn; + return ehca_update_busmap(start_pfn, nr_pages); +} + +int ehca_create_busmap(void) +{ + int ret; + + ehca_mr_len = 0; + ret = walk_system_ram_range(0, 1ULL << MAX_PHYSMEM_BITS, NULL, + ehca_create_busmap_callback); + return ret; +} + +static int ehca_reg_bmap_mr_rpages(struct ehca_shca *shca, + struct ehca_mr *e_mr, + struct ehca_mr_pginfo *pginfo) +{ + int top; + u64 hret, *kpage; + + kpage = ehca_alloc_fw_ctrlblock(GFP_KERNEL); + if (!kpage) { + ehca_err(&shca->ib_device, "kpage alloc failed"); + return -ENOMEM; + } + for (top = 0; top < EHCA_MAP_ENTRIES; top++) { + if (!ehca_bmap_valid(ehca_bmap->top[top])) + continue; + hret = ehca_reg_mr_dir_sections(top, kpage, shca, e_mr, pginfo); + if ((hret != H_PAGE_REGISTERED) && (hret != H_SUCCESS)) + break; + } + + ehca_free_fw_ctrlblock(kpage); + + if (hret == H_SUCCESS) + return 0; /* Everything is fine */ + else { + ehca_err(&shca->ib_device, "ehca_reg_bmap_mr_rpages failed, " + "h_ret=%lli e_mr=%p top=%x lkey=%x " + "hca_hndl=%llx mr_hndl=%llx", hret, e_mr, top, + e_mr->ib.ib_mr.lkey, + shca->ipz_hca_handle.handle, + e_mr->ipz_mr_handle.handle); + return ehca2ib_return_code(hret); + } +} + +static u64 ehca_map_vaddr(void *caddr) +{ + int top, dir, idx; + unsigned long abs_addr, offset; + u64 entry; + + if (!ehca_bmap) + return EHCA_INVAL_ADDR; + + abs_addr = __pa(caddr); + top = ehca_calc_index(abs_addr, EHCA_TOP_INDEX_SHIFT + EHCA_SECTSHIFT); + if (!ehca_bmap_valid(ehca_bmap->top[top])) + return EHCA_INVAL_ADDR; + + dir = ehca_calc_index(abs_addr, EHCA_DIR_INDEX_SHIFT + EHCA_SECTSHIFT); + if (!ehca_bmap_valid(ehca_bmap->top[top]->dir[dir])) + return EHCA_INVAL_ADDR; + + idx = ehca_calc_index(abs_addr, EHCA_SECTSHIFT); + + entry = ehca_bmap->top[top]->dir[dir]->ent[idx]; + if (ehca_bmap_valid(entry)) { + offset = (unsigned long)caddr & (EHCA_SECTSIZE - 1); + return entry | offset; + } else + return EHCA_INVAL_ADDR; +} + +static int ehca_dma_mapping_error(struct ib_device *dev, u64 dma_addr) +{ + return dma_addr == EHCA_INVAL_ADDR; +} + +static u64 ehca_dma_map_single(struct ib_device *dev, void *cpu_addr, + size_t size, enum dma_data_direction direction) +{ + if (cpu_addr) + return ehca_map_vaddr(cpu_addr); + else + return EHCA_INVAL_ADDR; +} + +static void ehca_dma_unmap_single(struct ib_device *dev, u64 addr, size_t size, + enum dma_data_direction direction) +{ + /* This is only a stub; nothing to be done here */ +} + +static u64 ehca_dma_map_page(struct ib_device *dev, struct page *page, + unsigned long offset, size_t size, + enum dma_data_direction direction) +{ + u64 addr; + + if (offset + size > PAGE_SIZE) + return EHCA_INVAL_ADDR; + + addr = ehca_map_vaddr(page_address(page)); + if (!ehca_dma_mapping_error(dev, addr)) + addr += offset; + + return addr; +} + +static void ehca_dma_unmap_page(struct ib_device *dev, u64 addr, size_t size, + enum dma_data_direction direction) +{ + /* This is only a stub; nothing to be done here */ +} + +static int ehca_dma_map_sg(struct ib_device *dev, struct scatterlist *sgl, + int nents, enum dma_data_direction direction) +{ + struct scatterlist *sg; + int i; + + for_each_sg(sgl, sg, nents, i) { + u64 addr; + addr = ehca_map_vaddr(sg_virt(sg)); + if (ehca_dma_mapping_error(dev, addr)) + return 0; + + sg->dma_address = addr; + sg->dma_length = sg->length; + } + return nents; +} + +static void ehca_dma_unmap_sg(struct ib_device *dev, struct scatterlist *sg, + int nents, enum dma_data_direction direction) +{ + /* This is only a stub; nothing to be done here */ +} + +static void ehca_dma_sync_single_for_cpu(struct ib_device *dev, u64 addr, + size_t size, + enum dma_data_direction dir) +{ + dma_sync_single_for_cpu(dev->dma_device, addr, size, dir); +} + +static void ehca_dma_sync_single_for_device(struct ib_device *dev, u64 addr, + size_t size, + enum dma_data_direction dir) +{ + dma_sync_single_for_device(dev->dma_device, addr, size, dir); +} + +static void *ehca_dma_alloc_coherent(struct ib_device *dev, size_t size, + u64 *dma_handle, gfp_t flag) +{ + struct page *p; + void *addr = NULL; + u64 dma_addr; + + p = alloc_pages(flag, get_order(size)); + if (p) { + addr = page_address(p); + dma_addr = ehca_map_vaddr(addr); + if (ehca_dma_mapping_error(dev, dma_addr)) { + free_pages((unsigned long)addr, get_order(size)); + return NULL; + } + if (dma_handle) + *dma_handle = dma_addr; + return addr; + } + return NULL; +} + +static void ehca_dma_free_coherent(struct ib_device *dev, size_t size, + void *cpu_addr, u64 dma_handle) +{ + if (cpu_addr && size) + free_pages((unsigned long)cpu_addr, get_order(size)); +} + + +struct ib_dma_mapping_ops ehca_dma_mapping_ops = { + .mapping_error = ehca_dma_mapping_error, + .map_single = ehca_dma_map_single, + .unmap_single = ehca_dma_unmap_single, + .map_page = ehca_dma_map_page, + .unmap_page = ehca_dma_unmap_page, + .map_sg = ehca_dma_map_sg, + .unmap_sg = ehca_dma_unmap_sg, + .sync_single_for_cpu = ehca_dma_sync_single_for_cpu, + .sync_single_for_device = ehca_dma_sync_single_for_device, + .alloc_coherent = ehca_dma_alloc_coherent, + .free_coherent = ehca_dma_free_coherent, +}; diff --git a/drivers/infiniband/hw/ehca/ehca_mrmw.h b/drivers/infiniband/hw/ehca/ehca_mrmw.h new file mode 100644 index 0000000..50d8b51 --- /dev/null +++ b/drivers/infiniband/hw/ehca/ehca_mrmw.h @@ -0,0 +1,132 @@ +/* + * IBM eServer eHCA Infiniband device driver for Linux on POWER + * + * MR/MW declarations and inline functions + * + * Authors: Dietmar Decker + * Christoph Raisch + * + * Copyright (c) 2005 IBM Corporation + * + * All rights reserved. + * + * This source code is distributed under a dual license of GPL v2.0 and OpenIB + * BSD. + * + * OpenIB BSD License + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _EHCA_MRMW_H_ +#define _EHCA_MRMW_H_ + +enum ehca_reg_type { + EHCA_REG_MR, + EHCA_REG_BUSMAP_MR +}; + +int ehca_reg_mr(struct ehca_shca *shca, + struct ehca_mr *e_mr, + u64 *iova_start, + u64 size, + int acl, + struct ehca_pd *e_pd, + struct ehca_mr_pginfo *pginfo, + u32 *lkey, + u32 *rkey, + enum ehca_reg_type reg_type); + +int ehca_reg_mr_rpages(struct ehca_shca *shca, + struct ehca_mr *e_mr, + struct ehca_mr_pginfo *pginfo); + +int ehca_rereg_mr(struct ehca_shca *shca, + struct ehca_mr *e_mr, + u64 *iova_start, + u64 size, + int mr_access_flags, + struct ehca_pd *e_pd, + struct ehca_mr_pginfo *pginfo, + u32 *lkey, + u32 *rkey); + +int ehca_unmap_one_fmr(struct ehca_shca *shca, + struct ehca_mr *e_fmr); + +int ehca_reg_smr(struct ehca_shca *shca, + struct ehca_mr *e_origmr, + struct ehca_mr *e_newmr, + u64 *iova_start, + int acl, + struct ehca_pd *e_pd, + u32 *lkey, + u32 *rkey); + +int ehca_reg_internal_maxmr(struct ehca_shca *shca, + struct ehca_pd *e_pd, + struct ehca_mr **maxmr); + +int ehca_reg_maxmr(struct ehca_shca *shca, + struct ehca_mr *e_newmr, + u64 *iova_start, + int acl, + struct ehca_pd *e_pd, + u32 *lkey, + u32 *rkey); + +int ehca_dereg_internal_maxmr(struct ehca_shca *shca); + +int ehca_mr_chk_buf_and_calc_size(struct ib_phys_buf *phys_buf_array, + int num_phys_buf, + u64 *iova_start, + u64 *size); + +int ehca_fmr_check_page_list(struct ehca_mr *e_fmr, + u64 *page_list, + int list_len); + +int ehca_set_pagebuf(struct ehca_mr_pginfo *pginfo, + u32 number, + u64 *kpage); + +int ehca_mr_is_maxmr(u64 size, + u64 *iova_start); + +void ehca_mrmw_map_acl(int ib_acl, + u32 *hipz_acl); + +void ehca_mrmw_set_pgsize_hipz_acl(u32 pgsize, u32 *hipz_acl); + +void ehca_mrmw_reverse_map_acl(const u32 *hipz_acl, + int *ib_acl); + +void ehca_mr_deletenew(struct ehca_mr *mr); + +int ehca_create_busmap(void); + +void ehca_destroy_busmap(void); + +extern struct ib_dma_mapping_ops ehca_dma_mapping_ops; +#endif /*_EHCA_MRMW_H_*/ diff --git a/drivers/infiniband/hw/ehca/ehca_pd.c b/drivers/infiniband/hw/ehca/ehca_pd.c new file mode 100644 index 0000000..351577a --- /dev/null +++ b/drivers/infiniband/hw/ehca/ehca_pd.c @@ -0,0 +1,124 @@ +/* + * IBM eServer eHCA Infiniband device driver for Linux on POWER + * + * PD functions + * + * Authors: Christoph Raisch + * + * Copyright (c) 2005 IBM Corporation + * + * All rights reserved. + * + * This source code is distributed under a dual license of GPL v2.0 and OpenIB + * BSD. + * + * OpenIB BSD License + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include + +#include "ehca_tools.h" +#include "ehca_iverbs.h" + +static struct kmem_cache *pd_cache; + +struct ib_pd *ehca_alloc_pd(struct ib_device *device, + struct ib_ucontext *context, struct ib_udata *udata) +{ + struct ehca_pd *pd; + int i; + + pd = kmem_cache_zalloc(pd_cache, GFP_KERNEL); + if (!pd) { + ehca_err(device, "device=%p context=%p out of memory", + device, context); + return ERR_PTR(-ENOMEM); + } + + for (i = 0; i < 2; i++) { + INIT_LIST_HEAD(&pd->free[i]); + INIT_LIST_HEAD(&pd->full[i]); + } + mutex_init(&pd->lock); + + /* + * Kernel PD: when device = -1, 0 + * User PD: when context != -1 + */ + if (!context) { + /* + * Kernel PDs after init reuses always + * the one created in ehca_shca_reopen() + */ + struct ehca_shca *shca = container_of(device, struct ehca_shca, + ib_device); + pd->fw_pd.value = shca->pd->fw_pd.value; + } else + pd->fw_pd.value = (u64)pd; + + return &pd->ib_pd; +} + +int ehca_dealloc_pd(struct ib_pd *pd) +{ + struct ehca_pd *my_pd = container_of(pd, struct ehca_pd, ib_pd); + int i, leftovers = 0; + struct ipz_small_queue_page *page, *tmp; + + for (i = 0; i < 2; i++) { + list_splice(&my_pd->full[i], &my_pd->free[i]); + list_for_each_entry_safe(page, tmp, &my_pd->free[i], list) { + leftovers = 1; + free_page(page->page); + kmem_cache_free(small_qp_cache, page); + } + } + + if (leftovers) + ehca_warn(pd->device, + "Some small queue pages were not freed"); + + kmem_cache_free(pd_cache, my_pd); + + return 0; +} + +int ehca_init_pd_cache(void) +{ + pd_cache = kmem_cache_create("ehca_cache_pd", + sizeof(struct ehca_pd), 0, + SLAB_HWCACHE_ALIGN, + NULL); + if (!pd_cache) + return -ENOMEM; + return 0; +} + +void ehca_cleanup_pd_cache(void) +{ + if (pd_cache) + kmem_cache_destroy(pd_cache); +} diff --git a/drivers/infiniband/hw/ehca/ehca_qes.h b/drivers/infiniband/hw/ehca/ehca_qes.h new file mode 100644 index 0000000..90c4efa --- /dev/null +++ b/drivers/infiniband/hw/ehca/ehca_qes.h @@ -0,0 +1,260 @@ +/* + * IBM eServer eHCA Infiniband device driver for Linux on POWER + * + * Hardware request structures + * + * Authors: Waleri Fomin + * Reinhard Ernst + * Christoph Raisch + * + * Copyright (c) 2005 IBM Corporation + * + * All rights reserved. + * + * This source code is distributed under a dual license of GPL v2.0 and OpenIB + * BSD. + * + * OpenIB BSD License + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + + +#ifndef _EHCA_QES_H_ +#define _EHCA_QES_H_ + +#include "ehca_tools.h" + +/* virtual scatter gather entry to specify remote addresses with length */ +struct ehca_vsgentry { + u64 vaddr; + u32 lkey; + u32 length; +}; + +#define GRH_FLAG_MASK EHCA_BMASK_IBM( 7, 7) +#define GRH_IPVERSION_MASK EHCA_BMASK_IBM( 0, 3) +#define GRH_TCLASS_MASK EHCA_BMASK_IBM( 4, 12) +#define GRH_FLOWLABEL_MASK EHCA_BMASK_IBM(13, 31) +#define GRH_PAYLEN_MASK EHCA_BMASK_IBM(32, 47) +#define GRH_NEXTHEADER_MASK EHCA_BMASK_IBM(48, 55) +#define GRH_HOPLIMIT_MASK EHCA_BMASK_IBM(56, 63) + +/* + * Unreliable Datagram Address Vector Format + * see IBTA Vol1 chapter 8.3 Global Routing Header + */ +struct ehca_ud_av { + u8 sl; + u8 lnh; + u16 dlid; + u8 reserved1; + u8 reserved2; + u8 reserved3; + u8 slid_path_bits; + u8 reserved4; + u8 ipd; + u8 reserved5; + u8 pmtu; + u32 reserved6; + u64 reserved7; + union { + struct { + u64 word_0; /* always set to 6 */ + /*should be 0x1B for IB transport */ + u64 word_1; + u64 word_2; + u64 word_3; + u64 word_4; + } grh; + struct { + u32 wd_0; + u32 wd_1; + /* DWord_1 --> SGID */ + + u32 sgid_wd3; + u32 sgid_wd2; + + u32 sgid_wd1; + u32 sgid_wd0; + /* DWord_3 --> DGID */ + + u32 dgid_wd3; + u32 dgid_wd2; + + u32 dgid_wd1; + u32 dgid_wd0; + } grh_l; + }; +}; + +/* maximum number of sg entries allowed in a WQE */ +#define MAX_WQE_SG_ENTRIES 252 + +#define WQE_OPTYPE_SEND 0x80 +#define WQE_OPTYPE_RDMAREAD 0x40 +#define WQE_OPTYPE_RDMAWRITE 0x20 +#define WQE_OPTYPE_CMPSWAP 0x10 +#define WQE_OPTYPE_FETCHADD 0x08 +#define WQE_OPTYPE_BIND 0x04 + +#define WQE_WRFLAG_REQ_SIGNAL_COM 0x80 +#define WQE_WRFLAG_FENCE 0x40 +#define WQE_WRFLAG_IMM_DATA_PRESENT 0x20 +#define WQE_WRFLAG_SOLIC_EVENT 0x10 + +#define WQEF_CACHE_HINT 0x80 +#define WQEF_CACHE_HINT_RD_WR 0x40 +#define WQEF_TIMED_WQE 0x20 +#define WQEF_PURGE 0x08 +#define WQEF_HIGH_NIBBLE 0xF0 + +#define MW_BIND_ACCESSCTRL_R_WRITE 0x40 +#define MW_BIND_ACCESSCTRL_R_READ 0x20 +#define MW_BIND_ACCESSCTRL_R_ATOMIC 0x10 + +struct ehca_wqe { + u64 work_request_id; + u8 optype; + u8 wr_flag; + u16 pkeyi; + u8 wqef; + u8 nr_of_data_seg; + u16 wqe_provided_slid; + u32 destination_qp_number; + u32 resync_psn_sqp; + u32 local_ee_context_qkey; + u32 immediate_data; + union { + struct { + u64 remote_virtual_address; + u32 rkey; + u32 reserved; + u64 atomic_1st_op_dma_len; + u64 atomic_2nd_op; + struct ehca_vsgentry sg_list[MAX_WQE_SG_ENTRIES]; + + } nud; + struct { + u64 ehca_ud_av_ptr; + u64 reserved1; + u64 reserved2; + u64 reserved3; + struct ehca_vsgentry sg_list[MAX_WQE_SG_ENTRIES]; + } ud_avp; + struct { + struct ehca_ud_av ud_av; + struct ehca_vsgentry sg_list[MAX_WQE_SG_ENTRIES - + 2]; + } ud_av; + struct { + u64 reserved0; + u64 reserved1; + u64 reserved2; + u64 reserved3; + struct ehca_vsgentry sg_list[MAX_WQE_SG_ENTRIES]; + } all_rcv; + + struct { + u64 reserved; + u32 rkey; + u32 old_rkey; + u64 reserved1; + u64 reserved2; + u64 virtual_address; + u32 reserved3; + u32 length; + u32 reserved4; + u16 reserved5; + u8 reserved6; + u8 lr_ctl; + u32 lkey; + u32 reserved7; + u64 reserved8; + u64 reserved9; + u64 reserved10; + u64 reserved11; + } bind; + struct { + u64 reserved12; + u64 reserved13; + u32 size; + u32 start; + } inline_data; + } u; + +}; + +#define WC_SEND_RECEIVE EHCA_BMASK_IBM(0, 0) +#define WC_IMM_DATA EHCA_BMASK_IBM(1, 1) +#define WC_GRH_PRESENT EHCA_BMASK_IBM(2, 2) +#define WC_SE_BIT EHCA_BMASK_IBM(3, 3) +#define WC_STATUS_ERROR_BIT 0x80000000 +#define WC_STATUS_REMOTE_ERROR_FLAGS 0x0000F800 +#define WC_STATUS_PURGE_BIT 0x10 +#define WC_SEND_RECEIVE_BIT 0x80 + +struct ehca_cqe { + u64 work_request_id; + u8 optype; + u8 w_completion_flags; + u16 reserved1; + u32 nr_bytes_transferred; + u32 immediate_data; + u32 local_qp_number; + u8 freed_resource_count; + u8 service_level; + u16 wqe_count; + u32 qp_token; + u32 qkey_ee_token; + u32 remote_qp_number; + u16 dlid; + u16 rlid; + u16 reserved2; + u16 pkey_index; + u32 cqe_timestamp; + u32 wqe_timestamp; + u8 wqe_timestamp_valid; + u8 reserved3; + u8 reserved4; + u8 cqe_flags; + u32 status; +}; + +struct ehca_eqe { + u64 entry; +}; + +struct ehca_mrte { + u64 starting_va; + u64 length; /* length of memory region in bytes*/ + u32 pd; + u8 key_instance; + u8 pagesize; + u8 mr_control; + u8 local_remote_access_ctrl; + u8 reserved[0x20 - 0x18]; + u64 at_pointer[4]; +}; +#endif /*_EHCA_QES_H_*/ diff --git a/drivers/infiniband/hw/ehca/ehca_qp.c b/drivers/infiniband/hw/ehca/ehca_qp.c new file mode 100644 index 0000000..2e89356 --- /dev/null +++ b/drivers/infiniband/hw/ehca/ehca_qp.c @@ -0,0 +1,2257 @@ +/* + * IBM eServer eHCA Infiniband device driver for Linux on POWER + * + * QP functions + * + * Authors: Joachim Fenkes + * Stefan Roscher + * Waleri Fomin + * Hoang-Nam Nguyen + * Reinhard Ernst + * Heiko J Schick + * + * Copyright (c) 2005 IBM Corporation + * + * All rights reserved. + * + * This source code is distributed under a dual license of GPL v2.0 and OpenIB + * BSD. + * + * OpenIB BSD License + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include + +#include "ehca_classes.h" +#include "ehca_tools.h" +#include "ehca_qes.h" +#include "ehca_iverbs.h" +#include "hcp_if.h" +#include "hipz_fns.h" + +static struct kmem_cache *qp_cache; + +/* + * attributes not supported by query qp + */ +#define QP_ATTR_QUERY_NOT_SUPPORTED (IB_QP_ACCESS_FLAGS | \ + IB_QP_EN_SQD_ASYNC_NOTIFY) + +/* + * ehca (internal) qp state values + */ +enum ehca_qp_state { + EHCA_QPS_RESET = 1, + EHCA_QPS_INIT = 2, + EHCA_QPS_RTR = 3, + EHCA_QPS_RTS = 5, + EHCA_QPS_SQD = 6, + EHCA_QPS_SQE = 8, + EHCA_QPS_ERR = 128 +}; + +/* + * qp state transitions as defined by IB Arch Rel 1.1 page 431 + */ +enum ib_qp_statetrans { + IB_QPST_ANY2RESET, + IB_QPST_ANY2ERR, + IB_QPST_RESET2INIT, + IB_QPST_INIT2RTR, + IB_QPST_INIT2INIT, + IB_QPST_RTR2RTS, + IB_QPST_RTS2SQD, + IB_QPST_RTS2RTS, + IB_QPST_SQD2RTS, + IB_QPST_SQE2RTS, + IB_QPST_SQD2SQD, + IB_QPST_MAX /* nr of transitions, this must be last!!! */ +}; + +/* + * ib2ehca_qp_state maps IB to ehca qp_state + * returns ehca qp state corresponding to given ib qp state + */ +static inline enum ehca_qp_state ib2ehca_qp_state(enum ib_qp_state ib_qp_state) +{ + switch (ib_qp_state) { + case IB_QPS_RESET: + return EHCA_QPS_RESET; + case IB_QPS_INIT: + return EHCA_QPS_INIT; + case IB_QPS_RTR: + return EHCA_QPS_RTR; + case IB_QPS_RTS: + return EHCA_QPS_RTS; + case IB_QPS_SQD: + return EHCA_QPS_SQD; + case IB_QPS_SQE: + return EHCA_QPS_SQE; + case IB_QPS_ERR: + return EHCA_QPS_ERR; + default: + ehca_gen_err("invalid ib_qp_state=%x", ib_qp_state); + return -EINVAL; + } +} + +/* + * ehca2ib_qp_state maps ehca to IB qp_state + * returns ib qp state corresponding to given ehca qp state + */ +static inline enum ib_qp_state ehca2ib_qp_state(enum ehca_qp_state + ehca_qp_state) +{ + switch (ehca_qp_state) { + case EHCA_QPS_RESET: + return IB_QPS_RESET; + case EHCA_QPS_INIT: + return IB_QPS_INIT; + case EHCA_QPS_RTR: + return IB_QPS_RTR; + case EHCA_QPS_RTS: + return IB_QPS_RTS; + case EHCA_QPS_SQD: + return IB_QPS_SQD; + case EHCA_QPS_SQE: + return IB_QPS_SQE; + case EHCA_QPS_ERR: + return IB_QPS_ERR; + default: + ehca_gen_err("invalid ehca_qp_state=%x", ehca_qp_state); + return -EINVAL; + } +} + +/* + * ehca_qp_type used as index for req_attr and opt_attr of + * struct ehca_modqp_statetrans + */ +enum ehca_qp_type { + QPT_RC = 0, + QPT_UC = 1, + QPT_UD = 2, + QPT_SQP = 3, + QPT_MAX +}; + +/* + * ib2ehcaqptype maps Ib to ehca qp_type + * returns ehca qp type corresponding to ib qp type + */ +static inline enum ehca_qp_type ib2ehcaqptype(enum ib_qp_type ibqptype) +{ + switch (ibqptype) { + case IB_QPT_SMI: + case IB_QPT_GSI: + return QPT_SQP; + case IB_QPT_RC: + return QPT_RC; + case IB_QPT_UC: + return QPT_UC; + case IB_QPT_UD: + return QPT_UD; + default: + ehca_gen_err("Invalid ibqptype=%x", ibqptype); + return -EINVAL; + } +} + +static inline enum ib_qp_statetrans get_modqp_statetrans(int ib_fromstate, + int ib_tostate) +{ + int index = -EINVAL; + switch (ib_tostate) { + case IB_QPS_RESET: + index = IB_QPST_ANY2RESET; + break; + case IB_QPS_INIT: + switch (ib_fromstate) { + case IB_QPS_RESET: + index = IB_QPST_RESET2INIT; + break; + case IB_QPS_INIT: + index = IB_QPST_INIT2INIT; + break; + } + break; + case IB_QPS_RTR: + if (ib_fromstate == IB_QPS_INIT) + index = IB_QPST_INIT2RTR; + break; + case IB_QPS_RTS: + switch (ib_fromstate) { + case IB_QPS_RTR: + index = IB_QPST_RTR2RTS; + break; + case IB_QPS_RTS: + index = IB_QPST_RTS2RTS; + break; + case IB_QPS_SQD: + index = IB_QPST_SQD2RTS; + break; + case IB_QPS_SQE: + index = IB_QPST_SQE2RTS; + break; + } + break; + case IB_QPS_SQD: + if (ib_fromstate == IB_QPS_RTS) + index = IB_QPST_RTS2SQD; + break; + case IB_QPS_SQE: + break; + case IB_QPS_ERR: + index = IB_QPST_ANY2ERR; + break; + default: + break; + } + return index; +} + +/* + * ibqptype2servicetype returns hcp service type corresponding to given + * ib qp type used by create_qp() + */ +static inline int ibqptype2servicetype(enum ib_qp_type ibqptype) +{ + switch (ibqptype) { + case IB_QPT_SMI: + case IB_QPT_GSI: + return ST_UD; + case IB_QPT_RC: + return ST_RC; + case IB_QPT_UC: + return ST_UC; + case IB_QPT_UD: + return ST_UD; + case IB_QPT_RAW_IPV6: + return -EINVAL; + case IB_QPT_RAW_ETHERTYPE: + return -EINVAL; + default: + ehca_gen_err("Invalid ibqptype=%x", ibqptype); + return -EINVAL; + } +} + +/* + * init userspace queue info from ipz_queue data + */ +static inline void queue2resp(struct ipzu_queue_resp *resp, + struct ipz_queue *queue) +{ + resp->qe_size = queue->qe_size; + resp->act_nr_of_sg = queue->act_nr_of_sg; + resp->queue_length = queue->queue_length; + resp->pagesize = queue->pagesize; + resp->toggle_state = queue->toggle_state; + resp->offset = queue->offset; +} + +/* + * init_qp_queue initializes/constructs r/squeue and registers queue pages. + */ +static inline int init_qp_queue(struct ehca_shca *shca, + struct ehca_pd *pd, + struct ehca_qp *my_qp, + struct ipz_queue *queue, + int q_type, + u64 expected_hret, + struct ehca_alloc_queue_parms *parms, + int wqe_size) +{ + int ret, cnt, ipz_rc, nr_q_pages; + void *vpage; + u64 rpage, h_ret; + struct ib_device *ib_dev = &shca->ib_device; + struct ipz_adapter_handle ipz_hca_handle = shca->ipz_hca_handle; + + if (!parms->queue_size) + return 0; + + if (parms->is_small) { + nr_q_pages = 1; + ipz_rc = ipz_queue_ctor(pd, queue, nr_q_pages, + 128 << parms->page_size, + wqe_size, parms->act_nr_sges, 1); + } else { + nr_q_pages = parms->queue_size; + ipz_rc = ipz_queue_ctor(pd, queue, nr_q_pages, + EHCA_PAGESIZE, wqe_size, + parms->act_nr_sges, 0); + } + + if (!ipz_rc) { + ehca_err(ib_dev, "Cannot allocate page for queue. ipz_rc=%i", + ipz_rc); + return -EBUSY; + } + + /* register queue pages */ + for (cnt = 0; cnt < nr_q_pages; cnt++) { + vpage = ipz_qpageit_get_inc(queue); + if (!vpage) { + ehca_err(ib_dev, "ipz_qpageit_get_inc() " + "failed p_vpage= %p", vpage); + ret = -EINVAL; + goto init_qp_queue1; + } + rpage = __pa(vpage); + + h_ret = hipz_h_register_rpage_qp(ipz_hca_handle, + my_qp->ipz_qp_handle, + NULL, 0, q_type, + rpage, parms->is_small ? 0 : 1, + my_qp->galpas.kernel); + if (cnt == (nr_q_pages - 1)) { /* last page! */ + if (h_ret != expected_hret) { + ehca_err(ib_dev, "hipz_qp_register_rpage() " + "h_ret=%lli", h_ret); + ret = ehca2ib_return_code(h_ret); + goto init_qp_queue1; + } + vpage = ipz_qpageit_get_inc(&my_qp->ipz_rqueue); + if (vpage) { + ehca_err(ib_dev, "ipz_qpageit_get_inc() " + "should not succeed vpage=%p", vpage); + ret = -EINVAL; + goto init_qp_queue1; + } + } else { + if (h_ret != H_PAGE_REGISTERED) { + ehca_err(ib_dev, "hipz_qp_register_rpage() " + "h_ret=%lli", h_ret); + ret = ehca2ib_return_code(h_ret); + goto init_qp_queue1; + } + } + } + + ipz_qeit_reset(queue); + + return 0; + +init_qp_queue1: + ipz_queue_dtor(pd, queue); + return ret; +} + +static inline int ehca_calc_wqe_size(int act_nr_sge, int is_llqp) +{ + if (is_llqp) + return 128 << act_nr_sge; + else + return offsetof(struct ehca_wqe, + u.nud.sg_list[act_nr_sge]); +} + +static void ehca_determine_small_queue(struct ehca_alloc_queue_parms *queue, + int req_nr_sge, int is_llqp) +{ + u32 wqe_size, q_size; + int act_nr_sge = req_nr_sge; + + if (!is_llqp) + /* round up #SGEs so WQE size is a power of 2 */ + for (act_nr_sge = 4; act_nr_sge <= 252; + act_nr_sge = 4 + 2 * act_nr_sge) + if (act_nr_sge >= req_nr_sge) + break; + + wqe_size = ehca_calc_wqe_size(act_nr_sge, is_llqp); + q_size = wqe_size * (queue->max_wr + 1); + + if (q_size <= 512) + queue->page_size = 2; + else if (q_size <= 1024) + queue->page_size = 3; + else + queue->page_size = 0; + + queue->is_small = (queue->page_size != 0); +} + +/* needs to be called with cq->spinlock held */ +void ehca_add_to_err_list(struct ehca_qp *qp, int on_sq) +{ + struct list_head *list, *node; + + /* TODO: support low latency QPs */ + if (qp->ext_type == EQPT_LLQP) + return; + + if (on_sq) { + list = &qp->send_cq->sqp_err_list; + node = &qp->sq_err_node; + } else { + list = &qp->recv_cq->rqp_err_list; + node = &qp->rq_err_node; + } + + if (list_empty(node)) + list_add_tail(node, list); + + return; +} + +static void del_from_err_list(struct ehca_cq *cq, struct list_head *node) +{ + unsigned long flags; + + spin_lock_irqsave(&cq->spinlock, flags); + + if (!list_empty(node)) + list_del_init(node); + + spin_unlock_irqrestore(&cq->spinlock, flags); +} + +static void reset_queue_map(struct ehca_queue_map *qmap) +{ + int i; + + qmap->tail = qmap->entries - 1; + qmap->left_to_poll = 0; + qmap->next_wqe_idx = 0; + for (i = 0; i < qmap->entries; i++) { + qmap->map[i].reported = 1; + qmap->map[i].cqe_req = 0; + } +} + +/* + * Create an ib_qp struct that is either a QP or an SRQ, depending on + * the value of the is_srq parameter. If init_attr and srq_init_attr share + * fields, the field out of init_attr is used. + */ +static struct ehca_qp *internal_create_qp( + struct ib_pd *pd, + struct ib_qp_init_attr *init_attr, + struct ib_srq_init_attr *srq_init_attr, + struct ib_udata *udata, int is_srq) +{ + struct ehca_qp *my_qp, *my_srq = NULL; + struct ehca_pd *my_pd = container_of(pd, struct ehca_pd, ib_pd); + struct ehca_shca *shca = container_of(pd->device, struct ehca_shca, + ib_device); + struct ib_ucontext *context = NULL; + u64 h_ret; + int is_llqp = 0, has_srq = 0, is_user = 0; + int qp_type, max_send_sge, max_recv_sge, ret; + + /* h_call's out parameters */ + struct ehca_alloc_qp_parms parms; + u32 swqe_size = 0, rwqe_size = 0, ib_qp_num; + unsigned long flags; + + if (!atomic_add_unless(&shca->num_qps, 1, shca->max_num_qps)) { + ehca_err(pd->device, "Unable to create QP, max number of %i " + "QPs reached.", shca->max_num_qps); + ehca_err(pd->device, "To increase the maximum number of QPs " + "use the number_of_qps module parameter.\n"); + return ERR_PTR(-ENOSPC); + } + + if (init_attr->create_flags) { + atomic_dec(&shca->num_qps); + return ERR_PTR(-EINVAL); + } + + memset(&parms, 0, sizeof(parms)); + qp_type = init_attr->qp_type; + + if (init_attr->sq_sig_type != IB_SIGNAL_REQ_WR && + init_attr->sq_sig_type != IB_SIGNAL_ALL_WR) { + ehca_err(pd->device, "init_attr->sg_sig_type=%x not allowed", + init_attr->sq_sig_type); + atomic_dec(&shca->num_qps); + return ERR_PTR(-EINVAL); + } + + /* save LLQP info */ + if (qp_type & 0x80) { + is_llqp = 1; + parms.ext_type = EQPT_LLQP; + parms.ll_comp_flags = qp_type & LLQP_COMP_MASK; + } + qp_type &= 0x1F; + init_attr->qp_type &= 0x1F; + + /* handle SRQ base QPs */ + if (init_attr->srq) { + my_srq = container_of(init_attr->srq, struct ehca_qp, ib_srq); + + if (qp_type == IB_QPT_UC) { + ehca_err(pd->device, "UC with SRQ not supported"); + atomic_dec(&shca->num_qps); + return ERR_PTR(-EINVAL); + } + + has_srq = 1; + parms.ext_type = EQPT_SRQBASE; + parms.srq_qpn = my_srq->real_qp_num; + } + + if (is_llqp && has_srq) { + ehca_err(pd->device, "LLQPs can't have an SRQ"); + atomic_dec(&shca->num_qps); + return ERR_PTR(-EINVAL); + } + + /* handle SRQs */ + if (is_srq) { + parms.ext_type = EQPT_SRQ; + parms.srq_limit = srq_init_attr->attr.srq_limit; + if (init_attr->cap.max_recv_sge > 3) { + ehca_err(pd->device, "no more than three SGEs " + "supported for SRQ pd=%p max_sge=%x", + pd, init_attr->cap.max_recv_sge); + atomic_dec(&shca->num_qps); + return ERR_PTR(-EINVAL); + } + } + + /* check QP type */ + if (qp_type != IB_QPT_UD && + qp_type != IB_QPT_UC && + qp_type != IB_QPT_RC && + qp_type != IB_QPT_SMI && + qp_type != IB_QPT_GSI) { + ehca_err(pd->device, "wrong QP Type=%x", qp_type); + atomic_dec(&shca->num_qps); + return ERR_PTR(-EINVAL); + } + + if (is_llqp) { + switch (qp_type) { + case IB_QPT_RC: + if ((init_attr->cap.max_send_wr > 255) || + (init_attr->cap.max_recv_wr > 255)) { + ehca_err(pd->device, + "Invalid Number of max_sq_wr=%x " + "or max_rq_wr=%x for RC LLQP", + init_attr->cap.max_send_wr, + init_attr->cap.max_recv_wr); + atomic_dec(&shca->num_qps); + return ERR_PTR(-EINVAL); + } + break; + case IB_QPT_UD: + if (!EHCA_BMASK_GET(HCA_CAP_UD_LL_QP, shca->hca_cap)) { + ehca_err(pd->device, "UD LLQP not supported " + "by this adapter"); + atomic_dec(&shca->num_qps); + return ERR_PTR(-ENOSYS); + } + if (!(init_attr->cap.max_send_sge <= 5 + && init_attr->cap.max_send_sge >= 1 + && init_attr->cap.max_recv_sge <= 5 + && init_attr->cap.max_recv_sge >= 1)) { + ehca_err(pd->device, + "Invalid Number of max_send_sge=%x " + "or max_recv_sge=%x for UD LLQP", + init_attr->cap.max_send_sge, + init_attr->cap.max_recv_sge); + atomic_dec(&shca->num_qps); + return ERR_PTR(-EINVAL); + } else if (init_attr->cap.max_send_wr > 255) { + ehca_err(pd->device, + "Invalid Number of " + "max_send_wr=%x for UD QP_TYPE=%x", + init_attr->cap.max_send_wr, qp_type); + atomic_dec(&shca->num_qps); + return ERR_PTR(-EINVAL); + } + break; + default: + ehca_err(pd->device, "unsupported LL QP Type=%x", + qp_type); + atomic_dec(&shca->num_qps); + return ERR_PTR(-EINVAL); + } + } else { + int max_sge = (qp_type == IB_QPT_UD || qp_type == IB_QPT_SMI + || qp_type == IB_QPT_GSI) ? 250 : 252; + + if (init_attr->cap.max_send_sge > max_sge + || init_attr->cap.max_recv_sge > max_sge) { + ehca_err(pd->device, "Invalid number of SGEs requested " + "send_sge=%x recv_sge=%x max_sge=%x", + init_attr->cap.max_send_sge, + init_attr->cap.max_recv_sge, max_sge); + atomic_dec(&shca->num_qps); + return ERR_PTR(-EINVAL); + } + } + + my_qp = kmem_cache_zalloc(qp_cache, GFP_KERNEL); + if (!my_qp) { + ehca_err(pd->device, "pd=%p not enough memory to alloc qp", pd); + atomic_dec(&shca->num_qps); + return ERR_PTR(-ENOMEM); + } + + if (pd->uobject && udata) { + is_user = 1; + context = pd->uobject->context; + } + + atomic_set(&my_qp->nr_events, 0); + init_waitqueue_head(&my_qp->wait_completion); + spin_lock_init(&my_qp->spinlock_s); + spin_lock_init(&my_qp->spinlock_r); + my_qp->qp_type = qp_type; + my_qp->ext_type = parms.ext_type; + my_qp->state = IB_QPS_RESET; + + if (init_attr->recv_cq) + my_qp->recv_cq = + container_of(init_attr->recv_cq, struct ehca_cq, ib_cq); + if (init_attr->send_cq) + my_qp->send_cq = + container_of(init_attr->send_cq, struct ehca_cq, ib_cq); + + idr_preload(GFP_KERNEL); + write_lock_irqsave(&ehca_qp_idr_lock, flags); + + ret = idr_alloc(&ehca_qp_idr, my_qp, 0, 0x2000000, GFP_NOWAIT); + if (ret >= 0) + my_qp->token = ret; + + write_unlock_irqrestore(&ehca_qp_idr_lock, flags); + idr_preload_end(); + if (ret < 0) { + if (ret == -ENOSPC) { + ret = -EINVAL; + ehca_err(pd->device, "Invalid number of qp"); + } else { + ret = -ENOMEM; + ehca_err(pd->device, "Can't allocate new idr entry."); + } + goto create_qp_exit0; + } + + if (has_srq) + parms.srq_token = my_qp->token; + + parms.servicetype = ibqptype2servicetype(qp_type); + if (parms.servicetype < 0) { + ret = -EINVAL; + ehca_err(pd->device, "Invalid qp_type=%x", qp_type); + goto create_qp_exit1; + } + + /* Always signal by WQE so we can hide circ. WQEs */ + parms.sigtype = HCALL_SIGT_BY_WQE; + + /* UD_AV CIRCUMVENTION */ + max_send_sge = init_attr->cap.max_send_sge; + max_recv_sge = init_attr->cap.max_recv_sge; + if (parms.servicetype == ST_UD && !is_llqp) { + max_send_sge += 2; + max_recv_sge += 2; + } + + parms.token = my_qp->token; + parms.eq_handle = shca->eq.ipz_eq_handle; + parms.pd = my_pd->fw_pd; + if (my_qp->send_cq) + parms.send_cq_handle = my_qp->send_cq->ipz_cq_handle; + if (my_qp->recv_cq) + parms.recv_cq_handle = my_qp->recv_cq->ipz_cq_handle; + + parms.squeue.max_wr = init_attr->cap.max_send_wr; + parms.rqueue.max_wr = init_attr->cap.max_recv_wr; + parms.squeue.max_sge = max_send_sge; + parms.rqueue.max_sge = max_recv_sge; + + /* RC QPs need one more SWQE for unsolicited ack circumvention */ + if (qp_type == IB_QPT_RC) + parms.squeue.max_wr++; + + if (EHCA_BMASK_GET(HCA_CAP_MINI_QP, shca->hca_cap)) { + if (HAS_SQ(my_qp)) + ehca_determine_small_queue( + &parms.squeue, max_send_sge, is_llqp); + if (HAS_RQ(my_qp)) + ehca_determine_small_queue( + &parms.rqueue, max_recv_sge, is_llqp); + parms.qp_storage = + (parms.squeue.is_small || parms.rqueue.is_small); + } + + h_ret = hipz_h_alloc_resource_qp(shca->ipz_hca_handle, &parms, is_user); + if (h_ret != H_SUCCESS) { + ehca_err(pd->device, "h_alloc_resource_qp() failed h_ret=%lli", + h_ret); + ret = ehca2ib_return_code(h_ret); + goto create_qp_exit1; + } + + ib_qp_num = my_qp->real_qp_num = parms.real_qp_num; + my_qp->ipz_qp_handle = parms.qp_handle; + my_qp->galpas = parms.galpas; + + swqe_size = ehca_calc_wqe_size(parms.squeue.act_nr_sges, is_llqp); + rwqe_size = ehca_calc_wqe_size(parms.rqueue.act_nr_sges, is_llqp); + + switch (qp_type) { + case IB_QPT_RC: + if (is_llqp) { + parms.squeue.act_nr_sges = 1; + parms.rqueue.act_nr_sges = 1; + } + /* hide the extra WQE */ + parms.squeue.act_nr_wqes--; + break; + case IB_QPT_UD: + case IB_QPT_GSI: + case IB_QPT_SMI: + /* UD circumvention */ + if (is_llqp) { + parms.squeue.act_nr_sges = 1; + parms.rqueue.act_nr_sges = 1; + } else { + parms.squeue.act_nr_sges -= 2; + parms.rqueue.act_nr_sges -= 2; + } + + if (IB_QPT_GSI == qp_type || IB_QPT_SMI == qp_type) { + parms.squeue.act_nr_wqes = init_attr->cap.max_send_wr; + parms.rqueue.act_nr_wqes = init_attr->cap.max_recv_wr; + parms.squeue.act_nr_sges = init_attr->cap.max_send_sge; + parms.rqueue.act_nr_sges = init_attr->cap.max_recv_sge; + ib_qp_num = (qp_type == IB_QPT_SMI) ? 0 : 1; + } + + break; + + default: + break; + } + + /* initialize r/squeue and register queue pages */ + if (HAS_SQ(my_qp)) { + ret = init_qp_queue( + shca, my_pd, my_qp, &my_qp->ipz_squeue, 0, + HAS_RQ(my_qp) ? H_PAGE_REGISTERED : H_SUCCESS, + &parms.squeue, swqe_size); + if (ret) { + ehca_err(pd->device, "Couldn't initialize squeue " + "and pages ret=%i", ret); + goto create_qp_exit2; + } + + if (!is_user) { + my_qp->sq_map.entries = my_qp->ipz_squeue.queue_length / + my_qp->ipz_squeue.qe_size; + my_qp->sq_map.map = vmalloc(my_qp->sq_map.entries * + sizeof(struct ehca_qmap_entry)); + if (!my_qp->sq_map.map) { + ehca_err(pd->device, "Couldn't allocate squeue " + "map ret=%i", ret); + goto create_qp_exit3; + } + INIT_LIST_HEAD(&my_qp->sq_err_node); + /* to avoid the generation of bogus flush CQEs */ + reset_queue_map(&my_qp->sq_map); + } + } + + if (HAS_RQ(my_qp)) { + ret = init_qp_queue( + shca, my_pd, my_qp, &my_qp->ipz_rqueue, 1, + H_SUCCESS, &parms.rqueue, rwqe_size); + if (ret) { + ehca_err(pd->device, "Couldn't initialize rqueue " + "and pages ret=%i", ret); + goto create_qp_exit4; + } + if (!is_user) { + my_qp->rq_map.entries = my_qp->ipz_rqueue.queue_length / + my_qp->ipz_rqueue.qe_size; + my_qp->rq_map.map = vmalloc(my_qp->rq_map.entries * + sizeof(struct ehca_qmap_entry)); + if (!my_qp->rq_map.map) { + ehca_err(pd->device, "Couldn't allocate squeue " + "map ret=%i", ret); + goto create_qp_exit5; + } + INIT_LIST_HEAD(&my_qp->rq_err_node); + /* to avoid the generation of bogus flush CQEs */ + reset_queue_map(&my_qp->rq_map); + } + } else if (init_attr->srq && !is_user) { + /* this is a base QP, use the queue map of the SRQ */ + my_qp->rq_map = my_srq->rq_map; + INIT_LIST_HEAD(&my_qp->rq_err_node); + + my_qp->ipz_rqueue = my_srq->ipz_rqueue; + } + + if (is_srq) { + my_qp->ib_srq.pd = &my_pd->ib_pd; + my_qp->ib_srq.device = my_pd->ib_pd.device; + + my_qp->ib_srq.srq_context = init_attr->qp_context; + my_qp->ib_srq.event_handler = init_attr->event_handler; + } else { + my_qp->ib_qp.qp_num = ib_qp_num; + my_qp->ib_qp.pd = &my_pd->ib_pd; + my_qp->ib_qp.device = my_pd->ib_pd.device; + + my_qp->ib_qp.recv_cq = init_attr->recv_cq; + my_qp->ib_qp.send_cq = init_attr->send_cq; + + my_qp->ib_qp.qp_type = qp_type; + my_qp->ib_qp.srq = init_attr->srq; + + my_qp->ib_qp.qp_context = init_attr->qp_context; + my_qp->ib_qp.event_handler = init_attr->event_handler; + } + + init_attr->cap.max_inline_data = 0; /* not supported yet */ + init_attr->cap.max_recv_sge = parms.rqueue.act_nr_sges; + init_attr->cap.max_recv_wr = parms.rqueue.act_nr_wqes; + init_attr->cap.max_send_sge = parms.squeue.act_nr_sges; + init_attr->cap.max_send_wr = parms.squeue.act_nr_wqes; + my_qp->init_attr = *init_attr; + + if (qp_type == IB_QPT_SMI || qp_type == IB_QPT_GSI) { + shca->sport[init_attr->port_num - 1].ibqp_sqp[qp_type] = + &my_qp->ib_qp; + if (ehca_nr_ports < 0) { + /* alloc array to cache subsequent modify qp parms + * for autodetect mode + */ + my_qp->mod_qp_parm = + kzalloc(EHCA_MOD_QP_PARM_MAX * + sizeof(*my_qp->mod_qp_parm), + GFP_KERNEL); + if (!my_qp->mod_qp_parm) { + ehca_err(pd->device, + "Could not alloc mod_qp_parm"); + goto create_qp_exit5; + } + } + } + + /* NOTE: define_apq0() not supported yet */ + if (qp_type == IB_QPT_GSI) { + h_ret = ehca_define_sqp(shca, my_qp, init_attr); + if (h_ret != H_SUCCESS) { + kfree(my_qp->mod_qp_parm); + my_qp->mod_qp_parm = NULL; + /* the QP pointer is no longer valid */ + shca->sport[init_attr->port_num - 1].ibqp_sqp[qp_type] = + NULL; + ret = ehca2ib_return_code(h_ret); + goto create_qp_exit6; + } + } + + if (my_qp->send_cq) { + ret = ehca_cq_assign_qp(my_qp->send_cq, my_qp); + if (ret) { + ehca_err(pd->device, + "Couldn't assign qp to send_cq ret=%i", ret); + goto create_qp_exit7; + } + } + + /* copy queues, galpa data to user space */ + if (context && udata) { + struct ehca_create_qp_resp resp; + memset(&resp, 0, sizeof(resp)); + + resp.qp_num = my_qp->real_qp_num; + resp.token = my_qp->token; + resp.qp_type = my_qp->qp_type; + resp.ext_type = my_qp->ext_type; + resp.qkey = my_qp->qkey; + resp.real_qp_num = my_qp->real_qp_num; + + if (HAS_SQ(my_qp)) + queue2resp(&resp.ipz_squeue, &my_qp->ipz_squeue); + if (HAS_RQ(my_qp)) + queue2resp(&resp.ipz_rqueue, &my_qp->ipz_rqueue); + resp.fw_handle_ofs = (u32) + (my_qp->galpas.user.fw_handle & (PAGE_SIZE - 1)); + + if (ib_copy_to_udata(udata, &resp, sizeof resp)) { + ehca_err(pd->device, "Copy to udata failed"); + ret = -EINVAL; + goto create_qp_exit8; + } + } + + return my_qp; + +create_qp_exit8: + ehca_cq_unassign_qp(my_qp->send_cq, my_qp->real_qp_num); + +create_qp_exit7: + kfree(my_qp->mod_qp_parm); + +create_qp_exit6: + if (HAS_RQ(my_qp) && !is_user) + vfree(my_qp->rq_map.map); + +create_qp_exit5: + if (HAS_RQ(my_qp)) + ipz_queue_dtor(my_pd, &my_qp->ipz_rqueue); + +create_qp_exit4: + if (HAS_SQ(my_qp) && !is_user) + vfree(my_qp->sq_map.map); + +create_qp_exit3: + if (HAS_SQ(my_qp)) + ipz_queue_dtor(my_pd, &my_qp->ipz_squeue); + +create_qp_exit2: + hipz_h_destroy_qp(shca->ipz_hca_handle, my_qp); + +create_qp_exit1: + write_lock_irqsave(&ehca_qp_idr_lock, flags); + idr_remove(&ehca_qp_idr, my_qp->token); + write_unlock_irqrestore(&ehca_qp_idr_lock, flags); + +create_qp_exit0: + kmem_cache_free(qp_cache, my_qp); + atomic_dec(&shca->num_qps); + return ERR_PTR(ret); +} + +struct ib_qp *ehca_create_qp(struct ib_pd *pd, + struct ib_qp_init_attr *qp_init_attr, + struct ib_udata *udata) +{ + struct ehca_qp *ret; + + ret = internal_create_qp(pd, qp_init_attr, NULL, udata, 0); + return IS_ERR(ret) ? (struct ib_qp *)ret : &ret->ib_qp; +} + +static int internal_destroy_qp(struct ib_device *dev, struct ehca_qp *my_qp, + struct ib_uobject *uobject); + +struct ib_srq *ehca_create_srq(struct ib_pd *pd, + struct ib_srq_init_attr *srq_init_attr, + struct ib_udata *udata) +{ + struct ib_qp_init_attr qp_init_attr; + struct ehca_qp *my_qp; + struct ib_srq *ret; + struct ehca_shca *shca = container_of(pd->device, struct ehca_shca, + ib_device); + struct hcp_modify_qp_control_block *mqpcb; + u64 hret, update_mask; + + if (srq_init_attr->srq_type != IB_SRQT_BASIC) + return ERR_PTR(-ENOSYS); + + /* For common attributes, internal_create_qp() takes its info + * out of qp_init_attr, so copy all common attrs there. + */ + memset(&qp_init_attr, 0, sizeof(qp_init_attr)); + qp_init_attr.event_handler = srq_init_attr->event_handler; + qp_init_attr.qp_context = srq_init_attr->srq_context; + qp_init_attr.sq_sig_type = IB_SIGNAL_ALL_WR; + qp_init_attr.qp_type = IB_QPT_RC; + qp_init_attr.cap.max_recv_wr = srq_init_attr->attr.max_wr; + qp_init_attr.cap.max_recv_sge = srq_init_attr->attr.max_sge; + + my_qp = internal_create_qp(pd, &qp_init_attr, srq_init_attr, udata, 1); + if (IS_ERR(my_qp)) + return (struct ib_srq *)my_qp; + + /* copy back return values */ + srq_init_attr->attr.max_wr = qp_init_attr.cap.max_recv_wr; + srq_init_attr->attr.max_sge = 3; + + /* drive SRQ into RTR state */ + mqpcb = ehca_alloc_fw_ctrlblock(GFP_KERNEL); + if (!mqpcb) { + ehca_err(pd->device, "Could not get zeroed page for mqpcb " + "ehca_qp=%p qp_num=%x ", my_qp, my_qp->real_qp_num); + ret = ERR_PTR(-ENOMEM); + goto create_srq1; + } + + mqpcb->qp_state = EHCA_QPS_INIT; + mqpcb->prim_phys_port = 1; + update_mask = EHCA_BMASK_SET(MQPCB_MASK_QP_STATE, 1); + hret = hipz_h_modify_qp(shca->ipz_hca_handle, + my_qp->ipz_qp_handle, + &my_qp->pf, + update_mask, + mqpcb, my_qp->galpas.kernel); + if (hret != H_SUCCESS) { + ehca_err(pd->device, "Could not modify SRQ to INIT " + "ehca_qp=%p qp_num=%x h_ret=%lli", + my_qp, my_qp->real_qp_num, hret); + goto create_srq2; + } + + mqpcb->qp_enable = 1; + update_mask = EHCA_BMASK_SET(MQPCB_MASK_QP_ENABLE, 1); + hret = hipz_h_modify_qp(shca->ipz_hca_handle, + my_qp->ipz_qp_handle, + &my_qp->pf, + update_mask, + mqpcb, my_qp->galpas.kernel); + if (hret != H_SUCCESS) { + ehca_err(pd->device, "Could not enable SRQ " + "ehca_qp=%p qp_num=%x h_ret=%lli", + my_qp, my_qp->real_qp_num, hret); + goto create_srq2; + } + + mqpcb->qp_state = EHCA_QPS_RTR; + update_mask = EHCA_BMASK_SET(MQPCB_MASK_QP_STATE, 1); + hret = hipz_h_modify_qp(shca->ipz_hca_handle, + my_qp->ipz_qp_handle, + &my_qp->pf, + update_mask, + mqpcb, my_qp->galpas.kernel); + if (hret != H_SUCCESS) { + ehca_err(pd->device, "Could not modify SRQ to RTR " + "ehca_qp=%p qp_num=%x h_ret=%lli", + my_qp, my_qp->real_qp_num, hret); + goto create_srq2; + } + + ehca_free_fw_ctrlblock(mqpcb); + + return &my_qp->ib_srq; + +create_srq2: + ret = ERR_PTR(ehca2ib_return_code(hret)); + ehca_free_fw_ctrlblock(mqpcb); + +create_srq1: + internal_destroy_qp(pd->device, my_qp, my_qp->ib_srq.uobject); + + return ret; +} + +/* + * prepare_sqe_rts called by internal_modify_qp() at trans sqe -> rts + * set purge bit of bad wqe and subsequent wqes to avoid reentering sqe + * returns total number of bad wqes in bad_wqe_cnt + */ +static int prepare_sqe_rts(struct ehca_qp *my_qp, struct ehca_shca *shca, + int *bad_wqe_cnt) +{ + u64 h_ret; + struct ipz_queue *squeue; + void *bad_send_wqe_p, *bad_send_wqe_v; + u64 q_ofs; + struct ehca_wqe *wqe; + int qp_num = my_qp->ib_qp.qp_num; + + /* get send wqe pointer */ + h_ret = hipz_h_disable_and_get_wqe(shca->ipz_hca_handle, + my_qp->ipz_qp_handle, &my_qp->pf, + &bad_send_wqe_p, NULL, 2); + if (h_ret != H_SUCCESS) { + ehca_err(&shca->ib_device, "hipz_h_disable_and_get_wqe() failed" + " ehca_qp=%p qp_num=%x h_ret=%lli", + my_qp, qp_num, h_ret); + return ehca2ib_return_code(h_ret); + } + bad_send_wqe_p = (void *)((u64)bad_send_wqe_p & (~(1L << 63))); + ehca_dbg(&shca->ib_device, "qp_num=%x bad_send_wqe_p=%p", + qp_num, bad_send_wqe_p); + /* convert wqe pointer to vadr */ + bad_send_wqe_v = __va((u64)bad_send_wqe_p); + if (ehca_debug_level >= 2) + ehca_dmp(bad_send_wqe_v, 32, "qp_num=%x bad_wqe", qp_num); + squeue = &my_qp->ipz_squeue; + if (ipz_queue_abs_to_offset(squeue, (u64)bad_send_wqe_p, &q_ofs)) { + ehca_err(&shca->ib_device, "failed to get wqe offset qp_num=%x" + " bad_send_wqe_p=%p", qp_num, bad_send_wqe_p); + return -EFAULT; + } + + /* loop sets wqe's purge bit */ + wqe = (struct ehca_wqe *)ipz_qeit_calc(squeue, q_ofs); + *bad_wqe_cnt = 0; + while (wqe->optype != 0xff && wqe->wqef != 0xff) { + if (ehca_debug_level >= 2) + ehca_dmp(wqe, 32, "qp_num=%x wqe", qp_num); + wqe->nr_of_data_seg = 0; /* suppress data access */ + wqe->wqef = WQEF_PURGE; /* WQE to be purged */ + q_ofs = ipz_queue_advance_offset(squeue, q_ofs); + wqe = (struct ehca_wqe *)ipz_qeit_calc(squeue, q_ofs); + *bad_wqe_cnt = (*bad_wqe_cnt)+1; + } + /* + * bad wqe will be reprocessed and ignored when pol_cq() is called, + * i.e. nr of wqes with flush error status is one less + */ + ehca_dbg(&shca->ib_device, "qp_num=%x flusherr_wqe_cnt=%x", + qp_num, (*bad_wqe_cnt)-1); + wqe->wqef = 0; + + return 0; +} + +static int calc_left_cqes(u64 wqe_p, struct ipz_queue *ipz_queue, + struct ehca_queue_map *qmap) +{ + void *wqe_v; + u64 q_ofs; + u32 wqe_idx; + unsigned int tail_idx; + + /* convert real to abs address */ + wqe_p = wqe_p & (~(1UL << 63)); + + wqe_v = __va(wqe_p); + + if (ipz_queue_abs_to_offset(ipz_queue, wqe_p, &q_ofs)) { + ehca_gen_err("Invalid offset for calculating left cqes " + "wqe_p=%#llx wqe_v=%p\n", wqe_p, wqe_v); + return -EFAULT; + } + + tail_idx = next_index(qmap->tail, qmap->entries); + wqe_idx = q_ofs / ipz_queue->qe_size; + + /* check all processed wqes, whether a cqe is requested or not */ + while (tail_idx != wqe_idx) { + if (qmap->map[tail_idx].cqe_req) + qmap->left_to_poll++; + tail_idx = next_index(tail_idx, qmap->entries); + } + /* save index in queue, where we have to start flushing */ + qmap->next_wqe_idx = wqe_idx; + return 0; +} + +static int check_for_left_cqes(struct ehca_qp *my_qp, struct ehca_shca *shca) +{ + u64 h_ret; + void *send_wqe_p, *recv_wqe_p; + int ret; + unsigned long flags; + int qp_num = my_qp->ib_qp.qp_num; + + /* this hcall is not supported on base QPs */ + if (my_qp->ext_type != EQPT_SRQBASE) { + /* get send and receive wqe pointer */ + h_ret = hipz_h_disable_and_get_wqe(shca->ipz_hca_handle, + my_qp->ipz_qp_handle, &my_qp->pf, + &send_wqe_p, &recv_wqe_p, 4); + if (h_ret != H_SUCCESS) { + ehca_err(&shca->ib_device, "disable_and_get_wqe() " + "failed ehca_qp=%p qp_num=%x h_ret=%lli", + my_qp, qp_num, h_ret); + return ehca2ib_return_code(h_ret); + } + + /* + * acquire lock to ensure that nobody is polling the cq which + * could mean that the qmap->tail pointer is in an + * inconsistent state. + */ + spin_lock_irqsave(&my_qp->send_cq->spinlock, flags); + ret = calc_left_cqes((u64)send_wqe_p, &my_qp->ipz_squeue, + &my_qp->sq_map); + spin_unlock_irqrestore(&my_qp->send_cq->spinlock, flags); + if (ret) + return ret; + + + spin_lock_irqsave(&my_qp->recv_cq->spinlock, flags); + ret = calc_left_cqes((u64)recv_wqe_p, &my_qp->ipz_rqueue, + &my_qp->rq_map); + spin_unlock_irqrestore(&my_qp->recv_cq->spinlock, flags); + if (ret) + return ret; + } else { + spin_lock_irqsave(&my_qp->send_cq->spinlock, flags); + my_qp->sq_map.left_to_poll = 0; + my_qp->sq_map.next_wqe_idx = next_index(my_qp->sq_map.tail, + my_qp->sq_map.entries); + spin_unlock_irqrestore(&my_qp->send_cq->spinlock, flags); + + spin_lock_irqsave(&my_qp->recv_cq->spinlock, flags); + my_qp->rq_map.left_to_poll = 0; + my_qp->rq_map.next_wqe_idx = next_index(my_qp->rq_map.tail, + my_qp->rq_map.entries); + spin_unlock_irqrestore(&my_qp->recv_cq->spinlock, flags); + } + + /* this assures flush cqes being generated only for pending wqes */ + if ((my_qp->sq_map.left_to_poll == 0) && + (my_qp->rq_map.left_to_poll == 0)) { + spin_lock_irqsave(&my_qp->send_cq->spinlock, flags); + ehca_add_to_err_list(my_qp, 1); + spin_unlock_irqrestore(&my_qp->send_cq->spinlock, flags); + + if (HAS_RQ(my_qp)) { + spin_lock_irqsave(&my_qp->recv_cq->spinlock, flags); + ehca_add_to_err_list(my_qp, 0); + spin_unlock_irqrestore(&my_qp->recv_cq->spinlock, + flags); + } + } + + return 0; +} + +/* + * internal_modify_qp with circumvention to handle aqp0 properly + * smi_reset2init indicates if this is an internal reset-to-init-call for + * smi. This flag must always be zero if called from ehca_modify_qp()! + * This internal func was intorduced to avoid recursion of ehca_modify_qp()! + */ +static int internal_modify_qp(struct ib_qp *ibqp, + struct ib_qp_attr *attr, + int attr_mask, int smi_reset2init) +{ + enum ib_qp_state qp_cur_state, qp_new_state; + int cnt, qp_attr_idx, ret = 0; + enum ib_qp_statetrans statetrans; + struct hcp_modify_qp_control_block *mqpcb; + struct ehca_qp *my_qp = container_of(ibqp, struct ehca_qp, ib_qp); + struct ehca_shca *shca = + container_of(ibqp->pd->device, struct ehca_shca, ib_device); + u64 update_mask; + u64 h_ret; + int bad_wqe_cnt = 0; + int is_user = 0; + int squeue_locked = 0; + unsigned long flags = 0; + + /* do query_qp to obtain current attr values */ + mqpcb = ehca_alloc_fw_ctrlblock(GFP_ATOMIC); + if (!mqpcb) { + ehca_err(ibqp->device, "Could not get zeroed page for mqpcb " + "ehca_qp=%p qp_num=%x ", my_qp, ibqp->qp_num); + return -ENOMEM; + } + + h_ret = hipz_h_query_qp(shca->ipz_hca_handle, + my_qp->ipz_qp_handle, + &my_qp->pf, + mqpcb, my_qp->galpas.kernel); + if (h_ret != H_SUCCESS) { + ehca_err(ibqp->device, "hipz_h_query_qp() failed " + "ehca_qp=%p qp_num=%x h_ret=%lli", + my_qp, ibqp->qp_num, h_ret); + ret = ehca2ib_return_code(h_ret); + goto modify_qp_exit1; + } + if (ibqp->uobject) + is_user = 1; + + qp_cur_state = ehca2ib_qp_state(mqpcb->qp_state); + + if (qp_cur_state == -EINVAL) { /* invalid qp state */ + ret = -EINVAL; + ehca_err(ibqp->device, "Invalid current ehca_qp_state=%x " + "ehca_qp=%p qp_num=%x", + mqpcb->qp_state, my_qp, ibqp->qp_num); + goto modify_qp_exit1; + } + /* + * circumvention to set aqp0 initial state to init + * as expected by IB spec + */ + if (smi_reset2init == 0 && + ibqp->qp_type == IB_QPT_SMI && + qp_cur_state == IB_QPS_RESET && + (attr_mask & IB_QP_STATE) && + attr->qp_state == IB_QPS_INIT) { /* RESET -> INIT */ + struct ib_qp_attr smiqp_attr = { + .qp_state = IB_QPS_INIT, + .port_num = my_qp->init_attr.port_num, + .pkey_index = 0, + .qkey = 0 + }; + int smiqp_attr_mask = IB_QP_STATE | IB_QP_PORT | + IB_QP_PKEY_INDEX | IB_QP_QKEY; + int smirc = internal_modify_qp( + ibqp, &smiqp_attr, smiqp_attr_mask, 1); + if (smirc) { + ehca_err(ibqp->device, "SMI RESET -> INIT failed. " + "ehca_modify_qp() rc=%i", smirc); + ret = H_PARAMETER; + goto modify_qp_exit1; + } + qp_cur_state = IB_QPS_INIT; + ehca_dbg(ibqp->device, "SMI RESET -> INIT succeeded"); + } + /* is transmitted current state equal to "real" current state */ + if ((attr_mask & IB_QP_CUR_STATE) && + qp_cur_state != attr->cur_qp_state) { + ret = -EINVAL; + ehca_err(ibqp->device, + "Invalid IB_QP_CUR_STATE attr->curr_qp_state=%x <>" + " actual cur_qp_state=%x. ehca_qp=%p qp_num=%x", + attr->cur_qp_state, qp_cur_state, my_qp, ibqp->qp_num); + goto modify_qp_exit1; + } + + ehca_dbg(ibqp->device, "ehca_qp=%p qp_num=%x current qp_state=%x " + "new qp_state=%x attribute_mask=%x", + my_qp, ibqp->qp_num, qp_cur_state, attr->qp_state, attr_mask); + + qp_new_state = attr_mask & IB_QP_STATE ? attr->qp_state : qp_cur_state; + if (!smi_reset2init && + !ib_modify_qp_is_ok(qp_cur_state, qp_new_state, ibqp->qp_type, + attr_mask, IB_LINK_LAYER_UNSPECIFIED)) { + ret = -EINVAL; + ehca_err(ibqp->device, + "Invalid qp transition new_state=%x cur_state=%x " + "ehca_qp=%p qp_num=%x attr_mask=%x", qp_new_state, + qp_cur_state, my_qp, ibqp->qp_num, attr_mask); + goto modify_qp_exit1; + } + + mqpcb->qp_state = ib2ehca_qp_state(qp_new_state); + if (mqpcb->qp_state) + update_mask = EHCA_BMASK_SET(MQPCB_MASK_QP_STATE, 1); + else { + ret = -EINVAL; + ehca_err(ibqp->device, "Invalid new qp state=%x " + "ehca_qp=%p qp_num=%x", + qp_new_state, my_qp, ibqp->qp_num); + goto modify_qp_exit1; + } + + /* retrieve state transition struct to get req and opt attrs */ + statetrans = get_modqp_statetrans(qp_cur_state, qp_new_state); + if (statetrans < 0) { + ret = -EINVAL; + ehca_err(ibqp->device, " qp_cur_state=%x " + "new_qp_state=%x State_xsition=%x ehca_qp=%p " + "qp_num=%x", qp_cur_state, qp_new_state, + statetrans, my_qp, ibqp->qp_num); + goto modify_qp_exit1; + } + + qp_attr_idx = ib2ehcaqptype(ibqp->qp_type); + + if (qp_attr_idx < 0) { + ret = qp_attr_idx; + ehca_err(ibqp->device, + "Invalid QP type=%x ehca_qp=%p qp_num=%x", + ibqp->qp_type, my_qp, ibqp->qp_num); + goto modify_qp_exit1; + } + + ehca_dbg(ibqp->device, + "ehca_qp=%p qp_num=%x qp_state_xsit=%x", + my_qp, ibqp->qp_num, statetrans); + + /* eHCA2 rev2 and higher require the SEND_GRH_FLAG to be set + * in non-LL UD QPs. + */ + if ((my_qp->qp_type == IB_QPT_UD) && + (my_qp->ext_type != EQPT_LLQP) && + (statetrans == IB_QPST_INIT2RTR) && + (shca->hw_level >= 0x22)) { + update_mask |= EHCA_BMASK_SET(MQPCB_MASK_SEND_GRH_FLAG, 1); + mqpcb->send_grh_flag = 1; + } + + /* sqe -> rts: set purge bit of bad wqe before actual trans */ + if ((my_qp->qp_type == IB_QPT_UD || + my_qp->qp_type == IB_QPT_GSI || + my_qp->qp_type == IB_QPT_SMI) && + statetrans == IB_QPST_SQE2RTS) { + /* mark next free wqe if kernel */ + if (!ibqp->uobject) { + struct ehca_wqe *wqe; + /* lock send queue */ + spin_lock_irqsave(&my_qp->spinlock_s, flags); + squeue_locked = 1; + /* mark next free wqe */ + wqe = (struct ehca_wqe *) + ipz_qeit_get(&my_qp->ipz_squeue); + wqe->optype = wqe->wqef = 0xff; + ehca_dbg(ibqp->device, "qp_num=%x next_free_wqe=%p", + ibqp->qp_num, wqe); + } + ret = prepare_sqe_rts(my_qp, shca, &bad_wqe_cnt); + if (ret) { + ehca_err(ibqp->device, "prepare_sqe_rts() failed " + "ehca_qp=%p qp_num=%x ret=%i", + my_qp, ibqp->qp_num, ret); + goto modify_qp_exit2; + } + } + + /* + * enable RDMA_Atomic_Control if reset->init und reliable con + * this is necessary since gen2 does not provide that flag, + * but pHyp requires it + */ + if (statetrans == IB_QPST_RESET2INIT && + (ibqp->qp_type == IB_QPT_RC || ibqp->qp_type == IB_QPT_UC)) { + mqpcb->rdma_atomic_ctrl = 3; + update_mask |= EHCA_BMASK_SET(MQPCB_MASK_RDMA_ATOMIC_CTRL, 1); + } + /* circ. pHyp requires #RDMA/Atomic Resp Res for UC INIT -> RTR */ + if (statetrans == IB_QPST_INIT2RTR && + (ibqp->qp_type == IB_QPT_UC) && + !(attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)) { + mqpcb->rdma_nr_atomic_resp_res = 1; /* default to 1 */ + update_mask |= + EHCA_BMASK_SET(MQPCB_MASK_RDMA_NR_ATOMIC_RESP_RES, 1); + } + + if (attr_mask & IB_QP_PKEY_INDEX) { + if (attr->pkey_index >= 16) { + ret = -EINVAL; + ehca_err(ibqp->device, "Invalid pkey_index=%x. " + "ehca_qp=%p qp_num=%x max_pkey_index=f", + attr->pkey_index, my_qp, ibqp->qp_num); + goto modify_qp_exit2; + } + mqpcb->prim_p_key_idx = attr->pkey_index; + update_mask |= EHCA_BMASK_SET(MQPCB_MASK_PRIM_P_KEY_IDX, 1); + } + if (attr_mask & IB_QP_PORT) { + struct ehca_sport *sport; + struct ehca_qp *aqp1; + if (attr->port_num < 1 || attr->port_num > shca->num_ports) { + ret = -EINVAL; + ehca_err(ibqp->device, "Invalid port=%x. " + "ehca_qp=%p qp_num=%x num_ports=%x", + attr->port_num, my_qp, ibqp->qp_num, + shca->num_ports); + goto modify_qp_exit2; + } + sport = &shca->sport[attr->port_num - 1]; + if (!sport->ibqp_sqp[IB_QPT_GSI]) { + /* should not occur */ + ret = -EFAULT; + ehca_err(ibqp->device, "AQP1 was not created for " + "port=%x", attr->port_num); + goto modify_qp_exit2; + } + aqp1 = container_of(sport->ibqp_sqp[IB_QPT_GSI], + struct ehca_qp, ib_qp); + if (ibqp->qp_type != IB_QPT_GSI && + ibqp->qp_type != IB_QPT_SMI && + aqp1->mod_qp_parm) { + /* + * firmware will reject this modify_qp() because + * port is not activated/initialized fully + */ + ret = -EFAULT; + ehca_warn(ibqp->device, "Couldn't modify qp port=%x: " + "either port is being activated (try again) " + "or cabling issue", attr->port_num); + goto modify_qp_exit2; + } + mqpcb->prim_phys_port = attr->port_num; + update_mask |= EHCA_BMASK_SET(MQPCB_MASK_PRIM_PHYS_PORT, 1); + } + if (attr_mask & IB_QP_QKEY) { + mqpcb->qkey = attr->qkey; + update_mask |= EHCA_BMASK_SET(MQPCB_MASK_QKEY, 1); + } + if (attr_mask & IB_QP_AV) { + mqpcb->dlid = attr->ah_attr.dlid; + update_mask |= EHCA_BMASK_SET(MQPCB_MASK_DLID, 1); + mqpcb->source_path_bits = attr->ah_attr.src_path_bits; + update_mask |= EHCA_BMASK_SET(MQPCB_MASK_SOURCE_PATH_BITS, 1); + mqpcb->service_level = attr->ah_attr.sl; + update_mask |= EHCA_BMASK_SET(MQPCB_MASK_SERVICE_LEVEL, 1); + + if (ehca_calc_ipd(shca, mqpcb->prim_phys_port, + attr->ah_attr.static_rate, + &mqpcb->max_static_rate)) { + ret = -EINVAL; + goto modify_qp_exit2; + } + update_mask |= EHCA_BMASK_SET(MQPCB_MASK_MAX_STATIC_RATE, 1); + + /* + * Always supply the GRH flag, even if it's zero, to give the + * hypervisor a clear "yes" or "no" instead of a "perhaps" + */ + update_mask |= EHCA_BMASK_SET(MQPCB_MASK_SEND_GRH_FLAG, 1); + + /* + * only if GRH is TRUE we might consider SOURCE_GID_IDX + * and DEST_GID otherwise phype will return H_ATTR_PARM!!! + */ + if (attr->ah_attr.ah_flags == IB_AH_GRH) { + mqpcb->send_grh_flag = 1; + + mqpcb->source_gid_idx = attr->ah_attr.grh.sgid_index; + update_mask |= + EHCA_BMASK_SET(MQPCB_MASK_SOURCE_GID_IDX, 1); + + for (cnt = 0; cnt < 16; cnt++) + mqpcb->dest_gid.byte[cnt] = + attr->ah_attr.grh.dgid.raw[cnt]; + + update_mask |= EHCA_BMASK_SET(MQPCB_MASK_DEST_GID, 1); + mqpcb->flow_label = attr->ah_attr.grh.flow_label; + update_mask |= EHCA_BMASK_SET(MQPCB_MASK_FLOW_LABEL, 1); + mqpcb->hop_limit = attr->ah_attr.grh.hop_limit; + update_mask |= EHCA_BMASK_SET(MQPCB_MASK_HOP_LIMIT, 1); + mqpcb->traffic_class = attr->ah_attr.grh.traffic_class; + update_mask |= + EHCA_BMASK_SET(MQPCB_MASK_TRAFFIC_CLASS, 1); + } + } + + if (attr_mask & IB_QP_PATH_MTU) { + /* store ld(MTU) */ + my_qp->mtu_shift = attr->path_mtu + 7; + mqpcb->path_mtu = attr->path_mtu; + update_mask |= EHCA_BMASK_SET(MQPCB_MASK_PATH_MTU, 1); + } + if (attr_mask & IB_QP_TIMEOUT) { + mqpcb->timeout = attr->timeout; + update_mask |= EHCA_BMASK_SET(MQPCB_MASK_TIMEOUT, 1); + } + if (attr_mask & IB_QP_RETRY_CNT) { + mqpcb->retry_count = attr->retry_cnt; + update_mask |= EHCA_BMASK_SET(MQPCB_MASK_RETRY_COUNT, 1); + } + if (attr_mask & IB_QP_RNR_RETRY) { + mqpcb->rnr_retry_count = attr->rnr_retry; + update_mask |= EHCA_BMASK_SET(MQPCB_MASK_RNR_RETRY_COUNT, 1); + } + if (attr_mask & IB_QP_RQ_PSN) { + mqpcb->receive_psn = attr->rq_psn; + update_mask |= EHCA_BMASK_SET(MQPCB_MASK_RECEIVE_PSN, 1); + } + if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) { + mqpcb->rdma_nr_atomic_resp_res = attr->max_dest_rd_atomic < 3 ? + attr->max_dest_rd_atomic : 2; + update_mask |= + EHCA_BMASK_SET(MQPCB_MASK_RDMA_NR_ATOMIC_RESP_RES, 1); + } + if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC) { + mqpcb->rdma_atomic_outst_dest_qp = attr->max_rd_atomic < 3 ? + attr->max_rd_atomic : 2; + update_mask |= + EHCA_BMASK_SET + (MQPCB_MASK_RDMA_ATOMIC_OUTST_DEST_QP, 1); + } + if (attr_mask & IB_QP_ALT_PATH) { + if (attr->alt_port_num < 1 + || attr->alt_port_num > shca->num_ports) { + ret = -EINVAL; + ehca_err(ibqp->device, "Invalid alt_port=%x. " + "ehca_qp=%p qp_num=%x num_ports=%x", + attr->alt_port_num, my_qp, ibqp->qp_num, + shca->num_ports); + goto modify_qp_exit2; + } + mqpcb->alt_phys_port = attr->alt_port_num; + + if (attr->alt_pkey_index >= 16) { + ret = -EINVAL; + ehca_err(ibqp->device, "Invalid alt_pkey_index=%x. " + "ehca_qp=%p qp_num=%x max_pkey_index=f", + attr->pkey_index, my_qp, ibqp->qp_num); + goto modify_qp_exit2; + } + mqpcb->alt_p_key_idx = attr->alt_pkey_index; + + mqpcb->timeout_al = attr->alt_timeout; + mqpcb->dlid_al = attr->alt_ah_attr.dlid; + mqpcb->source_path_bits_al = attr->alt_ah_attr.src_path_bits; + mqpcb->service_level_al = attr->alt_ah_attr.sl; + + if (ehca_calc_ipd(shca, mqpcb->alt_phys_port, + attr->alt_ah_attr.static_rate, + &mqpcb->max_static_rate_al)) { + ret = -EINVAL; + goto modify_qp_exit2; + } + + /* OpenIB doesn't support alternate retry counts - copy them */ + mqpcb->retry_count_al = mqpcb->retry_count; + mqpcb->rnr_retry_count_al = mqpcb->rnr_retry_count; + + update_mask |= EHCA_BMASK_SET(MQPCB_MASK_ALT_PHYS_PORT, 1) + | EHCA_BMASK_SET(MQPCB_MASK_ALT_P_KEY_IDX, 1) + | EHCA_BMASK_SET(MQPCB_MASK_TIMEOUT_AL, 1) + | EHCA_BMASK_SET(MQPCB_MASK_DLID_AL, 1) + | EHCA_BMASK_SET(MQPCB_MASK_SOURCE_PATH_BITS_AL, 1) + | EHCA_BMASK_SET(MQPCB_MASK_SERVICE_LEVEL_AL, 1) + | EHCA_BMASK_SET(MQPCB_MASK_MAX_STATIC_RATE_AL, 1) + | EHCA_BMASK_SET(MQPCB_MASK_RETRY_COUNT_AL, 1) + | EHCA_BMASK_SET(MQPCB_MASK_RNR_RETRY_COUNT_AL, 1); + + /* + * Always supply the GRH flag, even if it's zero, to give the + * hypervisor a clear "yes" or "no" instead of a "perhaps" + */ + update_mask |= EHCA_BMASK_SET(MQPCB_MASK_SEND_GRH_FLAG_AL, 1); + + /* + * only if GRH is TRUE we might consider SOURCE_GID_IDX + * and DEST_GID otherwise phype will return H_ATTR_PARM!!! + */ + if (attr->alt_ah_attr.ah_flags == IB_AH_GRH) { + mqpcb->send_grh_flag_al = 1; + + for (cnt = 0; cnt < 16; cnt++) + mqpcb->dest_gid_al.byte[cnt] = + attr->alt_ah_attr.grh.dgid.raw[cnt]; + mqpcb->source_gid_idx_al = + attr->alt_ah_attr.grh.sgid_index; + mqpcb->flow_label_al = attr->alt_ah_attr.grh.flow_label; + mqpcb->hop_limit_al = attr->alt_ah_attr.grh.hop_limit; + mqpcb->traffic_class_al = + attr->alt_ah_attr.grh.traffic_class; + + update_mask |= + EHCA_BMASK_SET(MQPCB_MASK_SOURCE_GID_IDX_AL, 1) + | EHCA_BMASK_SET(MQPCB_MASK_DEST_GID_AL, 1) + | EHCA_BMASK_SET(MQPCB_MASK_FLOW_LABEL_AL, 1) + | EHCA_BMASK_SET(MQPCB_MASK_HOP_LIMIT_AL, 1) | + EHCA_BMASK_SET(MQPCB_MASK_TRAFFIC_CLASS_AL, 1); + } + } + + if (attr_mask & IB_QP_MIN_RNR_TIMER) { + mqpcb->min_rnr_nak_timer_field = attr->min_rnr_timer; + update_mask |= + EHCA_BMASK_SET(MQPCB_MASK_MIN_RNR_NAK_TIMER_FIELD, 1); + } + + if (attr_mask & IB_QP_SQ_PSN) { + mqpcb->send_psn = attr->sq_psn; + update_mask |= EHCA_BMASK_SET(MQPCB_MASK_SEND_PSN, 1); + } + + if (attr_mask & IB_QP_DEST_QPN) { + mqpcb->dest_qp_nr = attr->dest_qp_num; + update_mask |= EHCA_BMASK_SET(MQPCB_MASK_DEST_QP_NR, 1); + } + + if (attr_mask & IB_QP_PATH_MIG_STATE) { + if (attr->path_mig_state != IB_MIG_REARM + && attr->path_mig_state != IB_MIG_MIGRATED) { + ret = -EINVAL; + ehca_err(ibqp->device, "Invalid mig_state=%x", + attr->path_mig_state); + goto modify_qp_exit2; + } + mqpcb->path_migration_state = attr->path_mig_state + 1; + if (attr->path_mig_state == IB_MIG_REARM) + my_qp->mig_armed = 1; + update_mask |= + EHCA_BMASK_SET(MQPCB_MASK_PATH_MIGRATION_STATE, 1); + } + + if (attr_mask & IB_QP_CAP) { + mqpcb->max_nr_outst_send_wr = attr->cap.max_send_wr+1; + update_mask |= + EHCA_BMASK_SET(MQPCB_MASK_MAX_NR_OUTST_SEND_WR, 1); + mqpcb->max_nr_outst_recv_wr = attr->cap.max_recv_wr+1; + update_mask |= + EHCA_BMASK_SET(MQPCB_MASK_MAX_NR_OUTST_RECV_WR, 1); + /* no support for max_send/recv_sge yet */ + } + + if (ehca_debug_level >= 2) + ehca_dmp(mqpcb, 4*70, "qp_num=%x", ibqp->qp_num); + + h_ret = hipz_h_modify_qp(shca->ipz_hca_handle, + my_qp->ipz_qp_handle, + &my_qp->pf, + update_mask, + mqpcb, my_qp->galpas.kernel); + + if (h_ret != H_SUCCESS) { + ret = ehca2ib_return_code(h_ret); + ehca_err(ibqp->device, "hipz_h_modify_qp() failed h_ret=%lli " + "ehca_qp=%p qp_num=%x", h_ret, my_qp, ibqp->qp_num); + goto modify_qp_exit2; + } + + if ((my_qp->qp_type == IB_QPT_UD || + my_qp->qp_type == IB_QPT_GSI || + my_qp->qp_type == IB_QPT_SMI) && + statetrans == IB_QPST_SQE2RTS) { + /* doorbell to reprocessing wqes */ + iosync(); /* serialize GAL register access */ + hipz_update_sqa(my_qp, bad_wqe_cnt-1); + ehca_gen_dbg("doorbell for %x wqes", bad_wqe_cnt); + } + + if (statetrans == IB_QPST_RESET2INIT || + statetrans == IB_QPST_INIT2INIT) { + mqpcb->qp_enable = 1; + mqpcb->qp_state = EHCA_QPS_INIT; + update_mask = 0; + update_mask = EHCA_BMASK_SET(MQPCB_MASK_QP_ENABLE, 1); + + h_ret = hipz_h_modify_qp(shca->ipz_hca_handle, + my_qp->ipz_qp_handle, + &my_qp->pf, + update_mask, + mqpcb, + my_qp->galpas.kernel); + + if (h_ret != H_SUCCESS) { + ret = ehca2ib_return_code(h_ret); + ehca_err(ibqp->device, "ENABLE in context of " + "RESET_2_INIT failed! Maybe you didn't get " + "a LID h_ret=%lli ehca_qp=%p qp_num=%x", + h_ret, my_qp, ibqp->qp_num); + goto modify_qp_exit2; + } + } + if ((qp_new_state == IB_QPS_ERR) && (qp_cur_state != IB_QPS_ERR) + && !is_user) { + ret = check_for_left_cqes(my_qp, shca); + if (ret) + goto modify_qp_exit2; + } + + if (statetrans == IB_QPST_ANY2RESET) { + ipz_qeit_reset(&my_qp->ipz_rqueue); + ipz_qeit_reset(&my_qp->ipz_squeue); + + if (qp_cur_state == IB_QPS_ERR && !is_user) { + del_from_err_list(my_qp->send_cq, &my_qp->sq_err_node); + + if (HAS_RQ(my_qp)) + del_from_err_list(my_qp->recv_cq, + &my_qp->rq_err_node); + } + if (!is_user) + reset_queue_map(&my_qp->sq_map); + + if (HAS_RQ(my_qp) && !is_user) + reset_queue_map(&my_qp->rq_map); + } + + if (attr_mask & IB_QP_QKEY) + my_qp->qkey = attr->qkey; + +modify_qp_exit2: + if (squeue_locked) { /* this means: sqe -> rts */ + spin_unlock_irqrestore(&my_qp->spinlock_s, flags); + my_qp->sqerr_purgeflag = 1; + } + +modify_qp_exit1: + ehca_free_fw_ctrlblock(mqpcb); + + return ret; +} + +int ehca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask, + struct ib_udata *udata) +{ + int ret = 0; + + struct ehca_shca *shca = container_of(ibqp->device, struct ehca_shca, + ib_device); + struct ehca_qp *my_qp = container_of(ibqp, struct ehca_qp, ib_qp); + + /* The if-block below caches qp_attr to be modified for GSI and SMI + * qps during the initialization by ib_mad. When the respective port + * is activated, ie we got an event PORT_ACTIVE, we'll replay the + * cached modify calls sequence, see ehca_recover_sqs() below. + * Why that is required: + * 1) If one port is connected, older code requires that port one + * to be connected and module option nr_ports=1 to be given by + * user, which is very inconvenient for end user. + * 2) Firmware accepts modify_qp() only if respective port has become + * active. Older code had a wait loop of 30sec create_qp()/ + * define_aqp1(), which is not appropriate in practice. This + * code now removes that wait loop, see define_aqp1(), and always + * reports all ports to ib_mad resp. users. Only activated ports + * will then usable for the users. + */ + if (ibqp->qp_type == IB_QPT_GSI || ibqp->qp_type == IB_QPT_SMI) { + int port = my_qp->init_attr.port_num; + struct ehca_sport *sport = &shca->sport[port - 1]; + unsigned long flags; + spin_lock_irqsave(&sport->mod_sqp_lock, flags); + /* cache qp_attr only during init */ + if (my_qp->mod_qp_parm) { + struct ehca_mod_qp_parm *p; + if (my_qp->mod_qp_parm_idx >= EHCA_MOD_QP_PARM_MAX) { + ehca_err(&shca->ib_device, + "mod_qp_parm overflow state=%x port=%x" + " type=%x", attr->qp_state, + my_qp->init_attr.port_num, + ibqp->qp_type); + spin_unlock_irqrestore(&sport->mod_sqp_lock, + flags); + return -EINVAL; + } + p = &my_qp->mod_qp_parm[my_qp->mod_qp_parm_idx]; + p->mask = attr_mask; + p->attr = *attr; + my_qp->mod_qp_parm_idx++; + ehca_dbg(&shca->ib_device, + "Saved qp_attr for state=%x port=%x type=%x", + attr->qp_state, my_qp->init_attr.port_num, + ibqp->qp_type); + spin_unlock_irqrestore(&sport->mod_sqp_lock, flags); + goto out; + } + spin_unlock_irqrestore(&sport->mod_sqp_lock, flags); + } + + ret = internal_modify_qp(ibqp, attr, attr_mask, 0); + +out: + if ((ret == 0) && (attr_mask & IB_QP_STATE)) + my_qp->state = attr->qp_state; + + return ret; +} + +void ehca_recover_sqp(struct ib_qp *sqp) +{ + struct ehca_qp *my_sqp = container_of(sqp, struct ehca_qp, ib_qp); + int port = my_sqp->init_attr.port_num; + struct ib_qp_attr attr; + struct ehca_mod_qp_parm *qp_parm; + int i, qp_parm_idx, ret; + unsigned long flags, wr_cnt; + + if (!my_sqp->mod_qp_parm) + return; + ehca_dbg(sqp->device, "SQP port=%x qp_num=%x", port, sqp->qp_num); + + qp_parm = my_sqp->mod_qp_parm; + qp_parm_idx = my_sqp->mod_qp_parm_idx; + for (i = 0; i < qp_parm_idx; i++) { + attr = qp_parm[i].attr; + ret = internal_modify_qp(sqp, &attr, qp_parm[i].mask, 0); + if (ret) { + ehca_err(sqp->device, "Could not modify SQP port=%x " + "qp_num=%x ret=%x", port, sqp->qp_num, ret); + goto free_qp_parm; + } + ehca_dbg(sqp->device, "SQP port=%x qp_num=%x in state=%x", + port, sqp->qp_num, attr.qp_state); + } + + /* re-trigger posted recv wrs */ + wr_cnt = my_sqp->ipz_rqueue.current_q_offset / + my_sqp->ipz_rqueue.qe_size; + if (wr_cnt) { + spin_lock_irqsave(&my_sqp->spinlock_r, flags); + hipz_update_rqa(my_sqp, wr_cnt); + spin_unlock_irqrestore(&my_sqp->spinlock_r, flags); + ehca_dbg(sqp->device, "doorbell port=%x qp_num=%x wr_cnt=%lx", + port, sqp->qp_num, wr_cnt); + } + +free_qp_parm: + kfree(qp_parm); + /* this prevents subsequent calls to modify_qp() to cache qp_attr */ + my_sqp->mod_qp_parm = NULL; +} + +int ehca_query_qp(struct ib_qp *qp, + struct ib_qp_attr *qp_attr, + int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr) +{ + struct ehca_qp *my_qp = container_of(qp, struct ehca_qp, ib_qp); + struct ehca_shca *shca = container_of(qp->device, struct ehca_shca, + ib_device); + struct ipz_adapter_handle adapter_handle = shca->ipz_hca_handle; + struct hcp_modify_qp_control_block *qpcb; + int cnt, ret = 0; + u64 h_ret; + + if (qp_attr_mask & QP_ATTR_QUERY_NOT_SUPPORTED) { + ehca_err(qp->device, "Invalid attribute mask " + "ehca_qp=%p qp_num=%x qp_attr_mask=%x ", + my_qp, qp->qp_num, qp_attr_mask); + return -EINVAL; + } + + qpcb = ehca_alloc_fw_ctrlblock(GFP_KERNEL); + if (!qpcb) { + ehca_err(qp->device, "Out of memory for qpcb " + "ehca_qp=%p qp_num=%x", my_qp, qp->qp_num); + return -ENOMEM; + } + + h_ret = hipz_h_query_qp(adapter_handle, + my_qp->ipz_qp_handle, + &my_qp->pf, + qpcb, my_qp->galpas.kernel); + + if (h_ret != H_SUCCESS) { + ret = ehca2ib_return_code(h_ret); + ehca_err(qp->device, "hipz_h_query_qp() failed " + "ehca_qp=%p qp_num=%x h_ret=%lli", + my_qp, qp->qp_num, h_ret); + goto query_qp_exit1; + } + + qp_attr->cur_qp_state = ehca2ib_qp_state(qpcb->qp_state); + qp_attr->qp_state = qp_attr->cur_qp_state; + + if (qp_attr->cur_qp_state == -EINVAL) { + ret = -EINVAL; + ehca_err(qp->device, "Got invalid ehca_qp_state=%x " + "ehca_qp=%p qp_num=%x", + qpcb->qp_state, my_qp, qp->qp_num); + goto query_qp_exit1; + } + + if (qp_attr->qp_state == IB_QPS_SQD) + qp_attr->sq_draining = 1; + + qp_attr->qkey = qpcb->qkey; + qp_attr->path_mtu = qpcb->path_mtu; + qp_attr->path_mig_state = qpcb->path_migration_state - 1; + qp_attr->rq_psn = qpcb->receive_psn; + qp_attr->sq_psn = qpcb->send_psn; + qp_attr->min_rnr_timer = qpcb->min_rnr_nak_timer_field; + qp_attr->cap.max_send_wr = qpcb->max_nr_outst_send_wr-1; + qp_attr->cap.max_recv_wr = qpcb->max_nr_outst_recv_wr-1; + /* UD_AV CIRCUMVENTION */ + if (my_qp->qp_type == IB_QPT_UD) { + qp_attr->cap.max_send_sge = + qpcb->actual_nr_sges_in_sq_wqe - 2; + qp_attr->cap.max_recv_sge = + qpcb->actual_nr_sges_in_rq_wqe - 2; + } else { + qp_attr->cap.max_send_sge = + qpcb->actual_nr_sges_in_sq_wqe; + qp_attr->cap.max_recv_sge = + qpcb->actual_nr_sges_in_rq_wqe; + } + + qp_attr->cap.max_inline_data = my_qp->sq_max_inline_data_size; + qp_attr->dest_qp_num = qpcb->dest_qp_nr; + + qp_attr->pkey_index = qpcb->prim_p_key_idx; + qp_attr->port_num = qpcb->prim_phys_port; + qp_attr->timeout = qpcb->timeout; + qp_attr->retry_cnt = qpcb->retry_count; + qp_attr->rnr_retry = qpcb->rnr_retry_count; + + qp_attr->alt_pkey_index = qpcb->alt_p_key_idx; + qp_attr->alt_port_num = qpcb->alt_phys_port; + qp_attr->alt_timeout = qpcb->timeout_al; + + qp_attr->max_dest_rd_atomic = qpcb->rdma_nr_atomic_resp_res; + qp_attr->max_rd_atomic = qpcb->rdma_atomic_outst_dest_qp; + + /* primary av */ + qp_attr->ah_attr.sl = qpcb->service_level; + + if (qpcb->send_grh_flag) { + qp_attr->ah_attr.ah_flags = IB_AH_GRH; + } + + qp_attr->ah_attr.static_rate = qpcb->max_static_rate; + qp_attr->ah_attr.dlid = qpcb->dlid; + qp_attr->ah_attr.src_path_bits = qpcb->source_path_bits; + qp_attr->ah_attr.port_num = qp_attr->port_num; + + /* primary GRH */ + qp_attr->ah_attr.grh.traffic_class = qpcb->traffic_class; + qp_attr->ah_attr.grh.hop_limit = qpcb->hop_limit; + qp_attr->ah_attr.grh.sgid_index = qpcb->source_gid_idx; + qp_attr->ah_attr.grh.flow_label = qpcb->flow_label; + + for (cnt = 0; cnt < 16; cnt++) + qp_attr->ah_attr.grh.dgid.raw[cnt] = + qpcb->dest_gid.byte[cnt]; + + /* alternate AV */ + qp_attr->alt_ah_attr.sl = qpcb->service_level_al; + if (qpcb->send_grh_flag_al) { + qp_attr->alt_ah_attr.ah_flags = IB_AH_GRH; + } + + qp_attr->alt_ah_attr.static_rate = qpcb->max_static_rate_al; + qp_attr->alt_ah_attr.dlid = qpcb->dlid_al; + qp_attr->alt_ah_attr.src_path_bits = qpcb->source_path_bits_al; + + /* alternate GRH */ + qp_attr->alt_ah_attr.grh.traffic_class = qpcb->traffic_class_al; + qp_attr->alt_ah_attr.grh.hop_limit = qpcb->hop_limit_al; + qp_attr->alt_ah_attr.grh.sgid_index = qpcb->source_gid_idx_al; + qp_attr->alt_ah_attr.grh.flow_label = qpcb->flow_label_al; + + for (cnt = 0; cnt < 16; cnt++) + qp_attr->alt_ah_attr.grh.dgid.raw[cnt] = + qpcb->dest_gid_al.byte[cnt]; + + /* return init attributes given in ehca_create_qp */ + if (qp_init_attr) + *qp_init_attr = my_qp->init_attr; + + if (ehca_debug_level >= 2) + ehca_dmp(qpcb, 4*70, "qp_num=%x", qp->qp_num); + +query_qp_exit1: + ehca_free_fw_ctrlblock(qpcb); + + return ret; +} + +int ehca_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, + enum ib_srq_attr_mask attr_mask, struct ib_udata *udata) +{ + struct ehca_qp *my_qp = + container_of(ibsrq, struct ehca_qp, ib_srq); + struct ehca_shca *shca = + container_of(ibsrq->pd->device, struct ehca_shca, ib_device); + struct hcp_modify_qp_control_block *mqpcb; + u64 update_mask; + u64 h_ret; + int ret = 0; + + mqpcb = ehca_alloc_fw_ctrlblock(GFP_KERNEL); + if (!mqpcb) { + ehca_err(ibsrq->device, "Could not get zeroed page for mqpcb " + "ehca_qp=%p qp_num=%x ", my_qp, my_qp->real_qp_num); + return -ENOMEM; + } + + update_mask = 0; + if (attr_mask & IB_SRQ_LIMIT) { + attr_mask &= ~IB_SRQ_LIMIT; + update_mask |= + EHCA_BMASK_SET(MQPCB_MASK_CURR_SRQ_LIMIT, 1) + | EHCA_BMASK_SET(MQPCB_MASK_QP_AFF_ASYN_EV_LOG_REG, 1); + mqpcb->curr_srq_limit = attr->srq_limit; + mqpcb->qp_aff_asyn_ev_log_reg = + EHCA_BMASK_SET(QPX_AAELOG_RESET_SRQ_LIMIT, 1); + } + + /* by now, all bits in attr_mask should have been cleared */ + if (attr_mask) { + ehca_err(ibsrq->device, "invalid attribute mask bits set " + "attr_mask=%x", attr_mask); + ret = -EINVAL; + goto modify_srq_exit0; + } + + if (ehca_debug_level >= 2) + ehca_dmp(mqpcb, 4*70, "qp_num=%x", my_qp->real_qp_num); + + h_ret = hipz_h_modify_qp(shca->ipz_hca_handle, my_qp->ipz_qp_handle, + NULL, update_mask, mqpcb, + my_qp->galpas.kernel); + + if (h_ret != H_SUCCESS) { + ret = ehca2ib_return_code(h_ret); + ehca_err(ibsrq->device, "hipz_h_modify_qp() failed h_ret=%lli " + "ehca_qp=%p qp_num=%x", + h_ret, my_qp, my_qp->real_qp_num); + } + +modify_srq_exit0: + ehca_free_fw_ctrlblock(mqpcb); + + return ret; +} + +int ehca_query_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr) +{ + struct ehca_qp *my_qp = container_of(srq, struct ehca_qp, ib_srq); + struct ehca_shca *shca = container_of(srq->device, struct ehca_shca, + ib_device); + struct ipz_adapter_handle adapter_handle = shca->ipz_hca_handle; + struct hcp_modify_qp_control_block *qpcb; + int ret = 0; + u64 h_ret; + + qpcb = ehca_alloc_fw_ctrlblock(GFP_KERNEL); + if (!qpcb) { + ehca_err(srq->device, "Out of memory for qpcb " + "ehca_qp=%p qp_num=%x", my_qp, my_qp->real_qp_num); + return -ENOMEM; + } + + h_ret = hipz_h_query_qp(adapter_handle, my_qp->ipz_qp_handle, + NULL, qpcb, my_qp->galpas.kernel); + + if (h_ret != H_SUCCESS) { + ret = ehca2ib_return_code(h_ret); + ehca_err(srq->device, "hipz_h_query_qp() failed " + "ehca_qp=%p qp_num=%x h_ret=%lli", + my_qp, my_qp->real_qp_num, h_ret); + goto query_srq_exit1; + } + + srq_attr->max_wr = qpcb->max_nr_outst_recv_wr - 1; + srq_attr->max_sge = 3; + srq_attr->srq_limit = qpcb->curr_srq_limit; + + if (ehca_debug_level >= 2) + ehca_dmp(qpcb, 4*70, "qp_num=%x", my_qp->real_qp_num); + +query_srq_exit1: + ehca_free_fw_ctrlblock(qpcb); + + return ret; +} + +static int internal_destroy_qp(struct ib_device *dev, struct ehca_qp *my_qp, + struct ib_uobject *uobject) +{ + struct ehca_shca *shca = container_of(dev, struct ehca_shca, ib_device); + struct ehca_pd *my_pd = container_of(my_qp->ib_qp.pd, struct ehca_pd, + ib_pd); + struct ehca_sport *sport = &shca->sport[my_qp->init_attr.port_num - 1]; + u32 qp_num = my_qp->real_qp_num; + int ret; + u64 h_ret; + u8 port_num; + int is_user = 0; + enum ib_qp_type qp_type; + unsigned long flags; + + if (uobject) { + is_user = 1; + if (my_qp->mm_count_galpa || + my_qp->mm_count_rqueue || my_qp->mm_count_squeue) { + ehca_err(dev, "Resources still referenced in " + "user space qp_num=%x", qp_num); + return -EINVAL; + } + } + + if (my_qp->send_cq) { + ret = ehca_cq_unassign_qp(my_qp->send_cq, qp_num); + if (ret) { + ehca_err(dev, "Couldn't unassign qp from " + "send_cq ret=%i qp_num=%x cq_num=%x", ret, + qp_num, my_qp->send_cq->cq_number); + return ret; + } + } + + write_lock_irqsave(&ehca_qp_idr_lock, flags); + idr_remove(&ehca_qp_idr, my_qp->token); + write_unlock_irqrestore(&ehca_qp_idr_lock, flags); + + /* + * SRQs will never get into an error list and do not have a recv_cq, + * so we need to skip them here. + */ + if (HAS_RQ(my_qp) && !IS_SRQ(my_qp) && !is_user) + del_from_err_list(my_qp->recv_cq, &my_qp->rq_err_node); + + if (HAS_SQ(my_qp) && !is_user) + del_from_err_list(my_qp->send_cq, &my_qp->sq_err_node); + + /* now wait until all pending events have completed */ + wait_event(my_qp->wait_completion, !atomic_read(&my_qp->nr_events)); + + h_ret = hipz_h_destroy_qp(shca->ipz_hca_handle, my_qp); + if (h_ret != H_SUCCESS) { + ehca_err(dev, "hipz_h_destroy_qp() failed h_ret=%lli " + "ehca_qp=%p qp_num=%x", h_ret, my_qp, qp_num); + return ehca2ib_return_code(h_ret); + } + + port_num = my_qp->init_attr.port_num; + qp_type = my_qp->init_attr.qp_type; + + if (qp_type == IB_QPT_SMI || qp_type == IB_QPT_GSI) { + spin_lock_irqsave(&sport->mod_sqp_lock, flags); + kfree(my_qp->mod_qp_parm); + my_qp->mod_qp_parm = NULL; + shca->sport[port_num - 1].ibqp_sqp[qp_type] = NULL; + spin_unlock_irqrestore(&sport->mod_sqp_lock, flags); + } + + /* no support for IB_QPT_SMI yet */ + if (qp_type == IB_QPT_GSI) { + struct ib_event event; + ehca_info(dev, "device %s: port %x is inactive.", + shca->ib_device.name, port_num); + event.device = &shca->ib_device; + event.event = IB_EVENT_PORT_ERR; + event.element.port_num = port_num; + shca->sport[port_num - 1].port_state = IB_PORT_DOWN; + ib_dispatch_event(&event); + } + + if (HAS_RQ(my_qp)) { + ipz_queue_dtor(my_pd, &my_qp->ipz_rqueue); + if (!is_user) + vfree(my_qp->rq_map.map); + } + if (HAS_SQ(my_qp)) { + ipz_queue_dtor(my_pd, &my_qp->ipz_squeue); + if (!is_user) + vfree(my_qp->sq_map.map); + } + kmem_cache_free(qp_cache, my_qp); + atomic_dec(&shca->num_qps); + return 0; +} + +int ehca_destroy_qp(struct ib_qp *qp) +{ + return internal_destroy_qp(qp->device, + container_of(qp, struct ehca_qp, ib_qp), + qp->uobject); +} + +int ehca_destroy_srq(struct ib_srq *srq) +{ + return internal_destroy_qp(srq->device, + container_of(srq, struct ehca_qp, ib_srq), + srq->uobject); +} + +int ehca_init_qp_cache(void) +{ + qp_cache = kmem_cache_create("ehca_cache_qp", + sizeof(struct ehca_qp), 0, + SLAB_HWCACHE_ALIGN, + NULL); + if (!qp_cache) + return -ENOMEM; + return 0; +} + +void ehca_cleanup_qp_cache(void) +{ + if (qp_cache) + kmem_cache_destroy(qp_cache); +} diff --git a/drivers/infiniband/hw/ehca/ehca_reqs.c b/drivers/infiniband/hw/ehca/ehca_reqs.c new file mode 100644 index 0000000..47f9498 --- /dev/null +++ b/drivers/infiniband/hw/ehca/ehca_reqs.c @@ -0,0 +1,953 @@ +/* + * IBM eServer eHCA Infiniband device driver for Linux on POWER + * + * post_send/recv, poll_cq, req_notify + * + * Authors: Hoang-Nam Nguyen + * Waleri Fomin + * Joachim Fenkes + * Reinhard Ernst + * + * Copyright (c) 2005 IBM Corporation + * + * All rights reserved. + * + * This source code is distributed under a dual license of GPL v2.0 and OpenIB + * BSD. + * + * OpenIB BSD License + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + + +#include "ehca_classes.h" +#include "ehca_tools.h" +#include "ehca_qes.h" +#include "ehca_iverbs.h" +#include "hcp_if.h" +#include "hipz_fns.h" + +/* in RC traffic, insert an empty RDMA READ every this many packets */ +#define ACK_CIRC_THRESHOLD 2000000 + +static u64 replace_wr_id(u64 wr_id, u16 idx) +{ + u64 ret; + + ret = wr_id & ~QMAP_IDX_MASK; + ret |= idx & QMAP_IDX_MASK; + + return ret; +} + +static u16 get_app_wr_id(u64 wr_id) +{ + return wr_id & QMAP_IDX_MASK; +} + +static inline int ehca_write_rwqe(struct ipz_queue *ipz_rqueue, + struct ehca_wqe *wqe_p, + struct ib_recv_wr *recv_wr, + u32 rq_map_idx) +{ + u8 cnt_ds; + if (unlikely((recv_wr->num_sge < 0) || + (recv_wr->num_sge > ipz_rqueue->act_nr_of_sg))) { + ehca_gen_err("Invalid number of WQE SGE. " + "num_sqe=%x max_nr_of_sg=%x", + recv_wr->num_sge, ipz_rqueue->act_nr_of_sg); + return -EINVAL; /* invalid SG list length */ + } + + /* clear wqe header until sglist */ + memset(wqe_p, 0, offsetof(struct ehca_wqe, u.ud_av.sg_list)); + + wqe_p->work_request_id = replace_wr_id(recv_wr->wr_id, rq_map_idx); + wqe_p->nr_of_data_seg = recv_wr->num_sge; + + for (cnt_ds = 0; cnt_ds < recv_wr->num_sge; cnt_ds++) { + wqe_p->u.all_rcv.sg_list[cnt_ds].vaddr = + recv_wr->sg_list[cnt_ds].addr; + wqe_p->u.all_rcv.sg_list[cnt_ds].lkey = + recv_wr->sg_list[cnt_ds].lkey; + wqe_p->u.all_rcv.sg_list[cnt_ds].length = + recv_wr->sg_list[cnt_ds].length; + } + + if (ehca_debug_level >= 3) { + ehca_gen_dbg("RECEIVE WQE written into ipz_rqueue=%p", + ipz_rqueue); + ehca_dmp(wqe_p, 16*(6 + wqe_p->nr_of_data_seg), "recv wqe"); + } + + return 0; +} + +#if defined(DEBUG_GSI_SEND_WR) + +/* need ib_mad struct */ +#include + +static void trace_send_wr_ud(const struct ib_send_wr *send_wr) +{ + int idx; + int j; + while (send_wr) { + struct ib_mad_hdr *mad_hdr = send_wr->wr.ud.mad_hdr; + struct ib_sge *sge = send_wr->sg_list; + ehca_gen_dbg("send_wr#%x wr_id=%lx num_sge=%x " + "send_flags=%x opcode=%x", idx, send_wr->wr_id, + send_wr->num_sge, send_wr->send_flags, + send_wr->opcode); + if (mad_hdr) { + ehca_gen_dbg("send_wr#%x mad_hdr base_version=%x " + "mgmt_class=%x class_version=%x method=%x " + "status=%x class_specific=%x tid=%lx " + "attr_id=%x resv=%x attr_mod=%x", + idx, mad_hdr->base_version, + mad_hdr->mgmt_class, + mad_hdr->class_version, mad_hdr->method, + mad_hdr->status, mad_hdr->class_specific, + mad_hdr->tid, mad_hdr->attr_id, + mad_hdr->resv, + mad_hdr->attr_mod); + } + for (j = 0; j < send_wr->num_sge; j++) { + u8 *data = __va(sge->addr); + ehca_gen_dbg("send_wr#%x sge#%x addr=%p length=%x " + "lkey=%x", + idx, j, data, sge->length, sge->lkey); + /* assume length is n*16 */ + ehca_dmp(data, sge->length, "send_wr#%x sge#%x", + idx, j); + sge++; + } /* eof for j */ + idx++; + send_wr = send_wr->next; + } /* eof while send_wr */ +} + +#endif /* DEBUG_GSI_SEND_WR */ + +static inline int ehca_write_swqe(struct ehca_qp *qp, + struct ehca_wqe *wqe_p, + const struct ib_send_wr *send_wr, + u32 sq_map_idx, + int hidden) +{ + u32 idx; + u64 dma_length; + struct ehca_av *my_av; + u32 remote_qkey = send_wr->wr.ud.remote_qkey; + struct ehca_qmap_entry *qmap_entry = &qp->sq_map.map[sq_map_idx]; + + if (unlikely((send_wr->num_sge < 0) || + (send_wr->num_sge > qp->ipz_squeue.act_nr_of_sg))) { + ehca_gen_err("Invalid number of WQE SGE. " + "num_sqe=%x max_nr_of_sg=%x", + send_wr->num_sge, qp->ipz_squeue.act_nr_of_sg); + return -EINVAL; /* invalid SG list length */ + } + + /* clear wqe header until sglist */ + memset(wqe_p, 0, offsetof(struct ehca_wqe, u.ud_av.sg_list)); + + wqe_p->work_request_id = replace_wr_id(send_wr->wr_id, sq_map_idx); + + qmap_entry->app_wr_id = get_app_wr_id(send_wr->wr_id); + qmap_entry->reported = 0; + qmap_entry->cqe_req = 0; + + switch (send_wr->opcode) { + case IB_WR_SEND: + case IB_WR_SEND_WITH_IMM: + wqe_p->optype = WQE_OPTYPE_SEND; + break; + case IB_WR_RDMA_WRITE: + case IB_WR_RDMA_WRITE_WITH_IMM: + wqe_p->optype = WQE_OPTYPE_RDMAWRITE; + break; + case IB_WR_RDMA_READ: + wqe_p->optype = WQE_OPTYPE_RDMAREAD; + break; + default: + ehca_gen_err("Invalid opcode=%x", send_wr->opcode); + return -EINVAL; /* invalid opcode */ + } + + wqe_p->wqef = (send_wr->opcode) & WQEF_HIGH_NIBBLE; + + wqe_p->wr_flag = 0; + + if ((send_wr->send_flags & IB_SEND_SIGNALED || + qp->init_attr.sq_sig_type == IB_SIGNAL_ALL_WR) + && !hidden) { + wqe_p->wr_flag |= WQE_WRFLAG_REQ_SIGNAL_COM; + qmap_entry->cqe_req = 1; + } + + if (send_wr->opcode == IB_WR_SEND_WITH_IMM || + send_wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM) { + /* this might not work as long as HW does not support it */ + wqe_p->immediate_data = be32_to_cpu(send_wr->ex.imm_data); + wqe_p->wr_flag |= WQE_WRFLAG_IMM_DATA_PRESENT; + } + + wqe_p->nr_of_data_seg = send_wr->num_sge; + + switch (qp->qp_type) { + case IB_QPT_SMI: + case IB_QPT_GSI: + /* no break is intential here */ + case IB_QPT_UD: + /* IB 1.2 spec C10-15 compliance */ + if (send_wr->wr.ud.remote_qkey & 0x80000000) + remote_qkey = qp->qkey; + + wqe_p->destination_qp_number = send_wr->wr.ud.remote_qpn << 8; + wqe_p->local_ee_context_qkey = remote_qkey; + if (unlikely(!send_wr->wr.ud.ah)) { + ehca_gen_err("wr.ud.ah is NULL. qp=%p", qp); + return -EINVAL; + } + if (unlikely(send_wr->wr.ud.remote_qpn == 0)) { + ehca_gen_err("dest QP# is 0. qp=%x", qp->real_qp_num); + return -EINVAL; + } + my_av = container_of(send_wr->wr.ud.ah, struct ehca_av, ib_ah); + wqe_p->u.ud_av.ud_av = my_av->av; + + /* + * omitted check of IB_SEND_INLINE + * since HW does not support it + */ + for (idx = 0; idx < send_wr->num_sge; idx++) { + wqe_p->u.ud_av.sg_list[idx].vaddr = + send_wr->sg_list[idx].addr; + wqe_p->u.ud_av.sg_list[idx].lkey = + send_wr->sg_list[idx].lkey; + wqe_p->u.ud_av.sg_list[idx].length = + send_wr->sg_list[idx].length; + } /* eof for idx */ + if (qp->qp_type == IB_QPT_SMI || + qp->qp_type == IB_QPT_GSI) + wqe_p->u.ud_av.ud_av.pmtu = 1; + if (qp->qp_type == IB_QPT_GSI) { + wqe_p->pkeyi = send_wr->wr.ud.pkey_index; +#ifdef DEBUG_GSI_SEND_WR + trace_send_wr_ud(send_wr); +#endif /* DEBUG_GSI_SEND_WR */ + } + break; + + case IB_QPT_UC: + if (send_wr->send_flags & IB_SEND_FENCE) + wqe_p->wr_flag |= WQE_WRFLAG_FENCE; + /* no break is intentional here */ + case IB_QPT_RC: + /* TODO: atomic not implemented */ + wqe_p->u.nud.remote_virtual_address = + send_wr->wr.rdma.remote_addr; + wqe_p->u.nud.rkey = send_wr->wr.rdma.rkey; + + /* + * omitted checking of IB_SEND_INLINE + * since HW does not support it + */ + dma_length = 0; + for (idx = 0; idx < send_wr->num_sge; idx++) { + wqe_p->u.nud.sg_list[idx].vaddr = + send_wr->sg_list[idx].addr; + wqe_p->u.nud.sg_list[idx].lkey = + send_wr->sg_list[idx].lkey; + wqe_p->u.nud.sg_list[idx].length = + send_wr->sg_list[idx].length; + dma_length += send_wr->sg_list[idx].length; + } /* eof idx */ + wqe_p->u.nud.atomic_1st_op_dma_len = dma_length; + + /* unsolicited ack circumvention */ + if (send_wr->opcode == IB_WR_RDMA_READ) { + /* on RDMA read, switch on and reset counters */ + qp->message_count = qp->packet_count = 0; + qp->unsol_ack_circ = 1; + } else + /* else estimate #packets */ + qp->packet_count += (dma_length >> qp->mtu_shift) + 1; + + break; + + default: + ehca_gen_err("Invalid qptype=%x", qp->qp_type); + return -EINVAL; + } + + if (ehca_debug_level >= 3) { + ehca_gen_dbg("SEND WQE written into queue qp=%p ", qp); + ehca_dmp( wqe_p, 16*(6 + wqe_p->nr_of_data_seg), "send wqe"); + } + return 0; +} + +/* map_ib_wc_status converts raw cqe_status to ib_wc_status */ +static inline void map_ib_wc_status(u32 cqe_status, + enum ib_wc_status *wc_status) +{ + if (unlikely(cqe_status & WC_STATUS_ERROR_BIT)) { + switch (cqe_status & 0x3F) { + case 0x01: + case 0x21: + *wc_status = IB_WC_LOC_LEN_ERR; + break; + case 0x02: + case 0x22: + *wc_status = IB_WC_LOC_QP_OP_ERR; + break; + case 0x03: + case 0x23: + *wc_status = IB_WC_LOC_EEC_OP_ERR; + break; + case 0x04: + case 0x24: + *wc_status = IB_WC_LOC_PROT_ERR; + break; + case 0x05: + case 0x25: + *wc_status = IB_WC_WR_FLUSH_ERR; + break; + case 0x06: + *wc_status = IB_WC_MW_BIND_ERR; + break; + case 0x07: /* remote error - look into bits 20:24 */ + switch ((cqe_status + & WC_STATUS_REMOTE_ERROR_FLAGS) >> 11) { + case 0x0: + /* + * PSN Sequence Error! + * couldn't find a matching status! + */ + *wc_status = IB_WC_GENERAL_ERR; + break; + case 0x1: + *wc_status = IB_WC_REM_INV_REQ_ERR; + break; + case 0x2: + *wc_status = IB_WC_REM_ACCESS_ERR; + break; + case 0x3: + *wc_status = IB_WC_REM_OP_ERR; + break; + case 0x4: + *wc_status = IB_WC_REM_INV_RD_REQ_ERR; + break; + } + break; + case 0x08: + *wc_status = IB_WC_RETRY_EXC_ERR; + break; + case 0x09: + *wc_status = IB_WC_RNR_RETRY_EXC_ERR; + break; + case 0x0A: + case 0x2D: + *wc_status = IB_WC_REM_ABORT_ERR; + break; + case 0x0B: + case 0x2E: + *wc_status = IB_WC_INV_EECN_ERR; + break; + case 0x0C: + case 0x2F: + *wc_status = IB_WC_INV_EEC_STATE_ERR; + break; + case 0x0D: + *wc_status = IB_WC_BAD_RESP_ERR; + break; + case 0x10: + /* WQE purged */ + *wc_status = IB_WC_WR_FLUSH_ERR; + break; + default: + *wc_status = IB_WC_FATAL_ERR; + + } + } else + *wc_status = IB_WC_SUCCESS; +} + +static inline int post_one_send(struct ehca_qp *my_qp, + struct ib_send_wr *cur_send_wr, + int hidden) +{ + struct ehca_wqe *wqe_p; + int ret; + u32 sq_map_idx; + u64 start_offset = my_qp->ipz_squeue.current_q_offset; + + /* get pointer next to free WQE */ + wqe_p = ipz_qeit_get_inc(&my_qp->ipz_squeue); + if (unlikely(!wqe_p)) { + /* too many posted work requests: queue overflow */ + ehca_err(my_qp->ib_qp.device, "Too many posted WQEs " + "qp_num=%x", my_qp->ib_qp.qp_num); + return -ENOMEM; + } + + /* + * Get the index of the WQE in the send queue. The same index is used + * for writing into the sq_map. + */ + sq_map_idx = start_offset / my_qp->ipz_squeue.qe_size; + + /* write a SEND WQE into the QUEUE */ + ret = ehca_write_swqe(my_qp, wqe_p, cur_send_wr, sq_map_idx, hidden); + /* + * if something failed, + * reset the free entry pointer to the start value + */ + if (unlikely(ret)) { + my_qp->ipz_squeue.current_q_offset = start_offset; + ehca_err(my_qp->ib_qp.device, "Could not write WQE " + "qp_num=%x", my_qp->ib_qp.qp_num); + return -EINVAL; + } + + return 0; +} + +int ehca_post_send(struct ib_qp *qp, + struct ib_send_wr *send_wr, + struct ib_send_wr **bad_send_wr) +{ + struct ehca_qp *my_qp = container_of(qp, struct ehca_qp, ib_qp); + int wqe_cnt = 0; + int ret = 0; + unsigned long flags; + + /* Reject WR if QP is in RESET, INIT or RTR state */ + if (unlikely(my_qp->state < IB_QPS_RTS)) { + ehca_err(qp->device, "Invalid QP state qp_state=%d qpn=%x", + my_qp->state, qp->qp_num); + ret = -EINVAL; + goto out; + } + + /* LOCK the QUEUE */ + spin_lock_irqsave(&my_qp->spinlock_s, flags); + + /* Send an empty extra RDMA read if: + * 1) there has been an RDMA read on this connection before + * 2) no RDMA read occurred for ACK_CIRC_THRESHOLD link packets + * 3) we can be sure that any previous extra RDMA read has been + * processed so we don't overflow the SQ + */ + if (unlikely(my_qp->unsol_ack_circ && + my_qp->packet_count > ACK_CIRC_THRESHOLD && + my_qp->message_count > my_qp->init_attr.cap.max_send_wr)) { + /* insert an empty RDMA READ to fix up the remote QP state */ + struct ib_send_wr circ_wr; + memset(&circ_wr, 0, sizeof(circ_wr)); + circ_wr.opcode = IB_WR_RDMA_READ; + post_one_send(my_qp, &circ_wr, 1); /* ignore retcode */ + wqe_cnt++; + ehca_dbg(qp->device, "posted circ wr qp_num=%x", qp->qp_num); + my_qp->message_count = my_qp->packet_count = 0; + } + + /* loop processes list of send reqs */ + while (send_wr) { + ret = post_one_send(my_qp, send_wr, 0); + if (unlikely(ret)) { + goto post_send_exit0; + } + wqe_cnt++; + send_wr = send_wr->next; + } + +post_send_exit0: + iosync(); /* serialize GAL register access */ + hipz_update_sqa(my_qp, wqe_cnt); + if (unlikely(ret || ehca_debug_level >= 2)) + ehca_dbg(qp->device, "ehca_qp=%p qp_num=%x wqe_cnt=%d ret=%i", + my_qp, qp->qp_num, wqe_cnt, ret); + my_qp->message_count += wqe_cnt; + spin_unlock_irqrestore(&my_qp->spinlock_s, flags); + +out: + if (ret) + *bad_send_wr = send_wr; + return ret; +} + +static int internal_post_recv(struct ehca_qp *my_qp, + struct ib_device *dev, + struct ib_recv_wr *recv_wr, + struct ib_recv_wr **bad_recv_wr) +{ + struct ehca_wqe *wqe_p; + int wqe_cnt = 0; + int ret = 0; + u32 rq_map_idx; + unsigned long flags; + struct ehca_qmap_entry *qmap_entry; + + if (unlikely(!HAS_RQ(my_qp))) { + ehca_err(dev, "QP has no RQ ehca_qp=%p qp_num=%x ext_type=%d", + my_qp, my_qp->real_qp_num, my_qp->ext_type); + ret = -ENODEV; + goto out; + } + + /* LOCK the QUEUE */ + spin_lock_irqsave(&my_qp->spinlock_r, flags); + + /* loop processes list of recv reqs */ + while (recv_wr) { + u64 start_offset = my_qp->ipz_rqueue.current_q_offset; + /* get pointer next to free WQE */ + wqe_p = ipz_qeit_get_inc(&my_qp->ipz_rqueue); + if (unlikely(!wqe_p)) { + /* too many posted work requests: queue overflow */ + ret = -ENOMEM; + ehca_err(dev, "Too many posted WQEs " + "qp_num=%x", my_qp->real_qp_num); + goto post_recv_exit0; + } + /* + * Get the index of the WQE in the recv queue. The same index + * is used for writing into the rq_map. + */ + rq_map_idx = start_offset / my_qp->ipz_rqueue.qe_size; + + /* write a RECV WQE into the QUEUE */ + ret = ehca_write_rwqe(&my_qp->ipz_rqueue, wqe_p, recv_wr, + rq_map_idx); + /* + * if something failed, + * reset the free entry pointer to the start value + */ + if (unlikely(ret)) { + my_qp->ipz_rqueue.current_q_offset = start_offset; + ret = -EINVAL; + ehca_err(dev, "Could not write WQE " + "qp_num=%x", my_qp->real_qp_num); + goto post_recv_exit0; + } + + qmap_entry = &my_qp->rq_map.map[rq_map_idx]; + qmap_entry->app_wr_id = get_app_wr_id(recv_wr->wr_id); + qmap_entry->reported = 0; + qmap_entry->cqe_req = 1; + + wqe_cnt++; + recv_wr = recv_wr->next; + } /* eof for recv_wr */ + +post_recv_exit0: + iosync(); /* serialize GAL register access */ + hipz_update_rqa(my_qp, wqe_cnt); + if (unlikely(ret || ehca_debug_level >= 2)) + ehca_dbg(dev, "ehca_qp=%p qp_num=%x wqe_cnt=%d ret=%i", + my_qp, my_qp->real_qp_num, wqe_cnt, ret); + spin_unlock_irqrestore(&my_qp->spinlock_r, flags); + +out: + if (ret) + *bad_recv_wr = recv_wr; + + return ret; +} + +int ehca_post_recv(struct ib_qp *qp, + struct ib_recv_wr *recv_wr, + struct ib_recv_wr **bad_recv_wr) +{ + struct ehca_qp *my_qp = container_of(qp, struct ehca_qp, ib_qp); + + /* Reject WR if QP is in RESET state */ + if (unlikely(my_qp->state == IB_QPS_RESET)) { + ehca_err(qp->device, "Invalid QP state qp_state=%d qpn=%x", + my_qp->state, qp->qp_num); + *bad_recv_wr = recv_wr; + return -EINVAL; + } + + return internal_post_recv(my_qp, qp->device, recv_wr, bad_recv_wr); +} + +int ehca_post_srq_recv(struct ib_srq *srq, + struct ib_recv_wr *recv_wr, + struct ib_recv_wr **bad_recv_wr) +{ + return internal_post_recv(container_of(srq, struct ehca_qp, ib_srq), + srq->device, recv_wr, bad_recv_wr); +} + +/* + * ib_wc_opcode table converts ehca wc opcode to ib + * Since we use zero to indicate invalid opcode, the actual ib opcode must + * be decremented!!! + */ +static const u8 ib_wc_opcode[255] = { + [0x01] = IB_WC_RECV+1, + [0x02] = IB_WC_RECV_RDMA_WITH_IMM+1, + [0x04] = IB_WC_BIND_MW+1, + [0x08] = IB_WC_FETCH_ADD+1, + [0x10] = IB_WC_COMP_SWAP+1, + [0x20] = IB_WC_RDMA_WRITE+1, + [0x40] = IB_WC_RDMA_READ+1, + [0x80] = IB_WC_SEND+1 +}; + +/* internal function to poll one entry of cq */ +static inline int ehca_poll_cq_one(struct ib_cq *cq, struct ib_wc *wc) +{ + int ret = 0, qmap_tail_idx; + struct ehca_cq *my_cq = container_of(cq, struct ehca_cq, ib_cq); + struct ehca_cqe *cqe; + struct ehca_qp *my_qp; + struct ehca_qmap_entry *qmap_entry; + struct ehca_queue_map *qmap; + int cqe_count = 0, is_error; + +repoll: + cqe = (struct ehca_cqe *) + ipz_qeit_get_inc_valid(&my_cq->ipz_queue); + if (!cqe) { + ret = -EAGAIN; + if (ehca_debug_level >= 3) + ehca_dbg(cq->device, "Completion queue is empty " + "my_cq=%p cq_num=%x", my_cq, my_cq->cq_number); + goto poll_cq_one_exit0; + } + + /* prevents loads being reordered across this point */ + rmb(); + + cqe_count++; + if (unlikely(cqe->status & WC_STATUS_PURGE_BIT)) { + struct ehca_qp *qp; + int purgeflag; + unsigned long flags; + + qp = ehca_cq_get_qp(my_cq, cqe->local_qp_number); + if (!qp) { + ehca_err(cq->device, "cq_num=%x qp_num=%x " + "could not find qp -> ignore cqe", + my_cq->cq_number, cqe->local_qp_number); + ehca_dmp(cqe, 64, "cq_num=%x qp_num=%x", + my_cq->cq_number, cqe->local_qp_number); + /* ignore this purged cqe */ + goto repoll; + } + spin_lock_irqsave(&qp->spinlock_s, flags); + purgeflag = qp->sqerr_purgeflag; + spin_unlock_irqrestore(&qp->spinlock_s, flags); + + if (purgeflag) { + ehca_dbg(cq->device, + "Got CQE with purged bit qp_num=%x src_qp=%x", + cqe->local_qp_number, cqe->remote_qp_number); + if (ehca_debug_level >= 2) + ehca_dmp(cqe, 64, "qp_num=%x src_qp=%x", + cqe->local_qp_number, + cqe->remote_qp_number); + /* + * ignore this to avoid double cqes of bad wqe + * that caused sqe and turn off purge flag + */ + qp->sqerr_purgeflag = 0; + goto repoll; + } + } + + is_error = cqe->status & WC_STATUS_ERROR_BIT; + + /* trace error CQEs if debug_level >= 1, trace all CQEs if >= 3 */ + if (unlikely(ehca_debug_level >= 3 || (ehca_debug_level && is_error))) { + ehca_dbg(cq->device, + "Received %sCOMPLETION ehca_cq=%p cq_num=%x -----", + is_error ? "ERROR " : "", my_cq, my_cq->cq_number); + ehca_dmp(cqe, 64, "ehca_cq=%p cq_num=%x", + my_cq, my_cq->cq_number); + ehca_dbg(cq->device, + "ehca_cq=%p cq_num=%x -------------------------", + my_cq, my_cq->cq_number); + } + + read_lock(&ehca_qp_idr_lock); + my_qp = idr_find(&ehca_qp_idr, cqe->qp_token); + read_unlock(&ehca_qp_idr_lock); + if (!my_qp) + goto repoll; + wc->qp = &my_qp->ib_qp; + + qmap_tail_idx = get_app_wr_id(cqe->work_request_id); + if (!(cqe->w_completion_flags & WC_SEND_RECEIVE_BIT)) + /* We got a send completion. */ + qmap = &my_qp->sq_map; + else + /* We got a receive completion. */ + qmap = &my_qp->rq_map; + + /* advance the tail pointer */ + qmap->tail = qmap_tail_idx; + + if (is_error) { + /* + * set left_to_poll to 0 because in error state, we will not + * get any additional CQEs + */ + my_qp->sq_map.next_wqe_idx = next_index(my_qp->sq_map.tail, + my_qp->sq_map.entries); + my_qp->sq_map.left_to_poll = 0; + ehca_add_to_err_list(my_qp, 1); + + my_qp->rq_map.next_wqe_idx = next_index(my_qp->rq_map.tail, + my_qp->rq_map.entries); + my_qp->rq_map.left_to_poll = 0; + if (HAS_RQ(my_qp)) + ehca_add_to_err_list(my_qp, 0); + } + + qmap_entry = &qmap->map[qmap_tail_idx]; + if (qmap_entry->reported) { + ehca_warn(cq->device, "Double cqe on qp_num=%#x", + my_qp->real_qp_num); + /* found a double cqe, discard it and read next one */ + goto repoll; + } + + wc->wr_id = replace_wr_id(cqe->work_request_id, qmap_entry->app_wr_id); + qmap_entry->reported = 1; + + /* if left_to_poll is decremented to 0, add the QP to the error list */ + if (qmap->left_to_poll > 0) { + qmap->left_to_poll--; + if ((my_qp->sq_map.left_to_poll == 0) && + (my_qp->rq_map.left_to_poll == 0)) { + ehca_add_to_err_list(my_qp, 1); + if (HAS_RQ(my_qp)) + ehca_add_to_err_list(my_qp, 0); + } + } + + /* eval ib_wc_opcode */ + wc->opcode = ib_wc_opcode[cqe->optype]-1; + if (unlikely(wc->opcode == -1)) { + ehca_err(cq->device, "Invalid cqe->OPType=%x cqe->status=%x " + "ehca_cq=%p cq_num=%x", + cqe->optype, cqe->status, my_cq, my_cq->cq_number); + /* dump cqe for other infos */ + ehca_dmp(cqe, 64, "ehca_cq=%p cq_num=%x", + my_cq, my_cq->cq_number); + /* update also queue adder to throw away this entry!!! */ + goto repoll; + } + + /* eval ib_wc_status */ + if (unlikely(is_error)) { + /* complete with errors */ + map_ib_wc_status(cqe->status, &wc->status); + wc->vendor_err = wc->status; + } else + wc->status = IB_WC_SUCCESS; + + wc->byte_len = cqe->nr_bytes_transferred; + wc->pkey_index = cqe->pkey_index; + wc->slid = cqe->rlid; + wc->dlid_path_bits = cqe->dlid; + wc->src_qp = cqe->remote_qp_number; + /* + * HW has "Immed data present" and "GRH present" in bits 6 and 5. + * SW defines those in bits 1 and 0, so we can just shift and mask. + */ + wc->wc_flags = (cqe->w_completion_flags >> 5) & 3; + wc->ex.imm_data = cpu_to_be32(cqe->immediate_data); + wc->sl = cqe->service_level; + +poll_cq_one_exit0: + if (cqe_count > 0) + hipz_update_feca(my_cq, cqe_count); + + return ret; +} + +static int generate_flush_cqes(struct ehca_qp *my_qp, struct ib_cq *cq, + struct ib_wc *wc, int num_entries, + struct ipz_queue *ipz_queue, int on_sq) +{ + int nr = 0; + struct ehca_wqe *wqe; + u64 offset; + struct ehca_queue_map *qmap; + struct ehca_qmap_entry *qmap_entry; + + if (on_sq) + qmap = &my_qp->sq_map; + else + qmap = &my_qp->rq_map; + + qmap_entry = &qmap->map[qmap->next_wqe_idx]; + + while ((nr < num_entries) && (qmap_entry->reported == 0)) { + /* generate flush CQE */ + + memset(wc, 0, sizeof(*wc)); + + offset = qmap->next_wqe_idx * ipz_queue->qe_size; + wqe = (struct ehca_wqe *)ipz_qeit_calc(ipz_queue, offset); + if (!wqe) { + ehca_err(cq->device, "Invalid wqe offset=%#llx on " + "qp_num=%#x", offset, my_qp->real_qp_num); + return nr; + } + + wc->wr_id = replace_wr_id(wqe->work_request_id, + qmap_entry->app_wr_id); + + if (on_sq) { + switch (wqe->optype) { + case WQE_OPTYPE_SEND: + wc->opcode = IB_WC_SEND; + break; + case WQE_OPTYPE_RDMAWRITE: + wc->opcode = IB_WC_RDMA_WRITE; + break; + case WQE_OPTYPE_RDMAREAD: + wc->opcode = IB_WC_RDMA_READ; + break; + default: + ehca_err(cq->device, "Invalid optype=%x", + wqe->optype); + return nr; + } + } else + wc->opcode = IB_WC_RECV; + + if (wqe->wr_flag & WQE_WRFLAG_IMM_DATA_PRESENT) { + wc->ex.imm_data = wqe->immediate_data; + wc->wc_flags |= IB_WC_WITH_IMM; + } + + wc->status = IB_WC_WR_FLUSH_ERR; + + wc->qp = &my_qp->ib_qp; + + /* mark as reported and advance next_wqe pointer */ + qmap_entry->reported = 1; + qmap->next_wqe_idx = next_index(qmap->next_wqe_idx, + qmap->entries); + qmap_entry = &qmap->map[qmap->next_wqe_idx]; + + wc++; nr++; + } + + return nr; + +} + +int ehca_poll_cq(struct ib_cq *cq, int num_entries, struct ib_wc *wc) +{ + struct ehca_cq *my_cq = container_of(cq, struct ehca_cq, ib_cq); + int nr; + struct ehca_qp *err_qp; + struct ib_wc *current_wc = wc; + int ret = 0; + unsigned long flags; + int entries_left = num_entries; + + if (num_entries < 1) { + ehca_err(cq->device, "Invalid num_entries=%d ehca_cq=%p " + "cq_num=%x", num_entries, my_cq, my_cq->cq_number); + ret = -EINVAL; + goto poll_cq_exit0; + } + + spin_lock_irqsave(&my_cq->spinlock, flags); + + /* generate flush cqes for send queues */ + list_for_each_entry(err_qp, &my_cq->sqp_err_list, sq_err_node) { + nr = generate_flush_cqes(err_qp, cq, current_wc, entries_left, + &err_qp->ipz_squeue, 1); + entries_left -= nr; + current_wc += nr; + + if (entries_left == 0) + break; + } + + /* generate flush cqes for receive queues */ + list_for_each_entry(err_qp, &my_cq->rqp_err_list, rq_err_node) { + nr = generate_flush_cqes(err_qp, cq, current_wc, entries_left, + &err_qp->ipz_rqueue, 0); + entries_left -= nr; + current_wc += nr; + + if (entries_left == 0) + break; + } + + for (nr = 0; nr < entries_left; nr++) { + ret = ehca_poll_cq_one(cq, current_wc); + if (ret) + break; + current_wc++; + } /* eof for nr */ + entries_left -= nr; + + spin_unlock_irqrestore(&my_cq->spinlock, flags); + if (ret == -EAGAIN || !ret) + ret = num_entries - entries_left; + +poll_cq_exit0: + return ret; +} + +int ehca_req_notify_cq(struct ib_cq *cq, enum ib_cq_notify_flags notify_flags) +{ + struct ehca_cq *my_cq = container_of(cq, struct ehca_cq, ib_cq); + int ret = 0; + + switch (notify_flags & IB_CQ_SOLICITED_MASK) { + case IB_CQ_SOLICITED: + hipz_set_cqx_n0(my_cq, 1); + break; + case IB_CQ_NEXT_COMP: + hipz_set_cqx_n1(my_cq, 1); + break; + default: + return -EINVAL; + } + + if (notify_flags & IB_CQ_REPORT_MISSED_EVENTS) { + unsigned long spl_flags; + spin_lock_irqsave(&my_cq->spinlock, spl_flags); + ret = ipz_qeit_is_valid(&my_cq->ipz_queue); + spin_unlock_irqrestore(&my_cq->spinlock, spl_flags); + } + + return ret; +} diff --git a/drivers/infiniband/hw/ehca/ehca_sqp.c b/drivers/infiniband/hw/ehca/ehca_sqp.c new file mode 100644 index 0000000..dba8f9f --- /dev/null +++ b/drivers/infiniband/hw/ehca/ehca_sqp.c @@ -0,0 +1,237 @@ +/* + * IBM eServer eHCA Infiniband device driver for Linux on POWER + * + * SQP functions + * + * Authors: Khadija Souissi + * Heiko J Schick + * + * Copyright (c) 2005 IBM Corporation + * + * All rights reserved. + * + * This source code is distributed under a dual license of GPL v2.0 and OpenIB + * BSD. + * + * OpenIB BSD License + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include + +#include "ehca_classes.h" +#include "ehca_tools.h" +#include "ehca_iverbs.h" +#include "hcp_if.h" + +#define IB_MAD_STATUS_REDIRECT cpu_to_be16(0x0002) +#define IB_MAD_STATUS_UNSUP_VERSION cpu_to_be16(0x0004) +#define IB_MAD_STATUS_UNSUP_METHOD cpu_to_be16(0x0008) + +#define IB_PMA_CLASS_PORT_INFO cpu_to_be16(0x0001) + +/** + * ehca_define_sqp - Defines special queue pair 1 (GSI QP). When special queue + * pair is created successfully, the corresponding port gets active. + * + * Define Special Queue pair 0 (SMI QP) is still not supported. + * + * @qp_init_attr: Queue pair init attributes with port and queue pair type + */ + +u64 ehca_define_sqp(struct ehca_shca *shca, + struct ehca_qp *ehca_qp, + struct ib_qp_init_attr *qp_init_attr) +{ + u32 pma_qp_nr, bma_qp_nr; + u64 ret; + u8 port = qp_init_attr->port_num; + int counter; + + shca->sport[port - 1].port_state = IB_PORT_DOWN; + + switch (qp_init_attr->qp_type) { + case IB_QPT_SMI: + /* function not supported yet */ + break; + case IB_QPT_GSI: + ret = hipz_h_define_aqp1(shca->ipz_hca_handle, + ehca_qp->ipz_qp_handle, + ehca_qp->galpas.kernel, + (u32) qp_init_attr->port_num, + &pma_qp_nr, &bma_qp_nr); + + if (ret != H_SUCCESS) { + ehca_err(&shca->ib_device, + "Can't define AQP1 for port %x. h_ret=%lli", + port, ret); + return ret; + } + shca->sport[port - 1].pma_qp_nr = pma_qp_nr; + ehca_dbg(&shca->ib_device, "port=%x pma_qp_nr=%x", + port, pma_qp_nr); + break; + default: + ehca_err(&shca->ib_device, "invalid qp_type=%x", + qp_init_attr->qp_type); + return H_PARAMETER; + } + + if (ehca_nr_ports < 0) /* autodetect mode */ + return H_SUCCESS; + + for (counter = 0; + shca->sport[port - 1].port_state != IB_PORT_ACTIVE && + counter < ehca_port_act_time; + counter++) { + ehca_dbg(&shca->ib_device, "... wait until port %x is active", + port); + msleep_interruptible(1000); + } + + if (counter == ehca_port_act_time) { + ehca_err(&shca->ib_device, "Port %x is not active.", port); + return H_HARDWARE; + } + + return H_SUCCESS; +} + +struct ib_perf { + struct ib_mad_hdr mad_hdr; + u8 reserved[40]; + u8 data[192]; +} __attribute__ ((packed)); + +/* TC/SL/FL packed into 32 bits, as in ClassPortInfo */ +struct tcslfl { + u32 tc:8; + u32 sl:4; + u32 fl:20; +} __attribute__ ((packed)); + +/* IP Version/TC/FL packed into 32 bits, as in GRH */ +struct vertcfl { + u32 ver:4; + u32 tc:8; + u32 fl:20; +} __attribute__ ((packed)); + +static int ehca_process_perf(struct ib_device *ibdev, u8 port_num, + struct ib_wc *in_wc, struct ib_grh *in_grh, + struct ib_mad *in_mad, struct ib_mad *out_mad) +{ + struct ib_perf *in_perf = (struct ib_perf *)in_mad; + struct ib_perf *out_perf = (struct ib_perf *)out_mad; + struct ib_class_port_info *poi = + (struct ib_class_port_info *)out_perf->data; + struct tcslfl *tcslfl = + (struct tcslfl *)&poi->redirect_tcslfl; + struct ehca_shca *shca = + container_of(ibdev, struct ehca_shca, ib_device); + struct ehca_sport *sport = &shca->sport[port_num - 1]; + + ehca_dbg(ibdev, "method=%x", in_perf->mad_hdr.method); + + *out_mad = *in_mad; + + if (in_perf->mad_hdr.class_version != 1) { + ehca_warn(ibdev, "Unsupported class_version=%x", + in_perf->mad_hdr.class_version); + out_perf->mad_hdr.status = IB_MAD_STATUS_UNSUP_VERSION; + goto perf_reply; + } + + switch (in_perf->mad_hdr.method) { + case IB_MGMT_METHOD_GET: + case IB_MGMT_METHOD_SET: + /* set class port info for redirection */ + out_perf->mad_hdr.attr_id = IB_PMA_CLASS_PORT_INFO; + out_perf->mad_hdr.status = IB_MAD_STATUS_REDIRECT; + memset(poi, 0, sizeof(*poi)); + poi->base_version = 1; + poi->class_version = 1; + poi->resp_time_value = 18; + + /* copy local routing information from WC where applicable */ + tcslfl->sl = in_wc->sl; + poi->redirect_lid = + sport->saved_attr.lid | in_wc->dlid_path_bits; + poi->redirect_qp = sport->pma_qp_nr; + poi->redirect_qkey = IB_QP1_QKEY; + + ehca_query_pkey(ibdev, port_num, in_wc->pkey_index, + &poi->redirect_pkey); + + /* if request was globally routed, copy route info */ + if (in_grh) { + struct vertcfl *vertcfl = + (struct vertcfl *)&in_grh->version_tclass_flow; + memcpy(poi->redirect_gid, in_grh->dgid.raw, + sizeof(poi->redirect_gid)); + tcslfl->tc = vertcfl->tc; + tcslfl->fl = vertcfl->fl; + } else + /* else only fill in default GID */ + ehca_query_gid(ibdev, port_num, 0, + (union ib_gid *)&poi->redirect_gid); + + ehca_dbg(ibdev, "ehca_pma_lid=%x ehca_pma_qp=%x", + sport->saved_attr.lid, sport->pma_qp_nr); + break; + + case IB_MGMT_METHOD_GET_RESP: + return IB_MAD_RESULT_FAILURE; + + default: + out_perf->mad_hdr.status = IB_MAD_STATUS_UNSUP_METHOD; + break; + } + +perf_reply: + out_perf->mad_hdr.method = IB_MGMT_METHOD_GET_RESP; + + return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY; +} + +int ehca_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, + struct ib_wc *in_wc, struct ib_grh *in_grh, + struct ib_mad *in_mad, struct ib_mad *out_mad) +{ + int ret; + + if (!port_num || port_num > ibdev->phys_port_cnt || !in_wc) + return IB_MAD_RESULT_FAILURE; + + /* accept only pma request */ + if (in_mad->mad_hdr.mgmt_class != IB_MGMT_CLASS_PERF_MGMT) + return IB_MAD_RESULT_SUCCESS; + + ehca_dbg(ibdev, "port_num=%x src_qp=%x", port_num, in_wc->src_qp); + ret = ehca_process_perf(ibdev, port_num, in_wc, in_grh, + in_mad, out_mad); + + return ret; +} diff --git a/drivers/infiniband/hw/ehca/ehca_tools.h b/drivers/infiniband/hw/ehca/ehca_tools.h new file mode 100644 index 0000000..d280b12 --- /dev/null +++ b/drivers/infiniband/hw/ehca/ehca_tools.h @@ -0,0 +1,155 @@ +/* + * IBM eServer eHCA Infiniband device driver for Linux on POWER + * + * auxiliary functions + * + * Authors: Christoph Raisch + * Hoang-Nam Nguyen + * Khadija Souissi + * Waleri Fomin + * Heiko J Schick + * + * Copyright (c) 2005 IBM Corporation + * + * This source code is distributed under a dual license of GPL v2.0 and OpenIB + * BSD. + * + * OpenIB BSD License + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + + +#ifndef EHCA_TOOLS_H +#define EHCA_TOOLS_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +extern int ehca_debug_level; + +#define ehca_dbg(ib_dev, format, arg...) \ + do { \ + if (unlikely(ehca_debug_level)) \ + dev_printk(KERN_DEBUG, (ib_dev)->dma_device, \ + "PU%04x EHCA_DBG:%s " format "\n", \ + raw_smp_processor_id(), __func__, \ + ## arg); \ + } while (0) + +#define ehca_info(ib_dev, format, arg...) \ + dev_info((ib_dev)->dma_device, "PU%04x EHCA_INFO:%s " format "\n", \ + raw_smp_processor_id(), __func__, ## arg) + +#define ehca_warn(ib_dev, format, arg...) \ + dev_warn((ib_dev)->dma_device, "PU%04x EHCA_WARN:%s " format "\n", \ + raw_smp_processor_id(), __func__, ## arg) + +#define ehca_err(ib_dev, format, arg...) \ + dev_err((ib_dev)->dma_device, "PU%04x EHCA_ERR:%s " format "\n", \ + raw_smp_processor_id(), __func__, ## arg) + +/* use this one only if no ib_dev available */ +#define ehca_gen_dbg(format, arg...) \ + do { \ + if (unlikely(ehca_debug_level)) \ + printk(KERN_DEBUG "PU%04x EHCA_DBG:%s " format "\n", \ + raw_smp_processor_id(), __func__, ## arg); \ + } while (0) + +#define ehca_gen_warn(format, arg...) \ + printk(KERN_INFO "PU%04x EHCA_WARN:%s " format "\n", \ + raw_smp_processor_id(), __func__, ## arg) + +#define ehca_gen_err(format, arg...) \ + printk(KERN_ERR "PU%04x EHCA_ERR:%s " format "\n", \ + raw_smp_processor_id(), __func__, ## arg) + +/** + * ehca_dmp - printk a memory block, whose length is n*8 bytes. + * Each line has the following layout: + * adr=X ofs=Y <8 bytes hex> <8 bytes hex> + */ +#define ehca_dmp(adr, len, format, args...) \ + do { \ + unsigned int x; \ + unsigned int l = (unsigned int)(len); \ + unsigned char *deb = (unsigned char *)(adr); \ + for (x = 0; x < l; x += 16) { \ + printk(KERN_INFO "EHCA_DMP:%s " format \ + " adr=%p ofs=%04x %016llx %016llx\n", \ + __func__, ##args, deb, x, \ + *((u64 *)&deb[0]), *((u64 *)&deb[8])); \ + deb += 16; \ + } \ + } while (0) + +/* define a bitmask, little endian version */ +#define EHCA_BMASK(pos, length) (((pos) << 16) + (length)) + +/* define a bitmask, the ibm way... */ +#define EHCA_BMASK_IBM(from, to) (((63 - to) << 16) + ((to) - (from) + 1)) + +/* internal function, don't use */ +#define EHCA_BMASK_SHIFTPOS(mask) (((mask) >> 16) & 0xffff) + +/* internal function, don't use */ +#define EHCA_BMASK_MASK(mask) (~0ULL >> ((64 - (mask)) & 0xffff)) + +/** + * EHCA_BMASK_SET - return value shifted and masked by mask + * variable|=EHCA_BMASK_SET(MY_MASK,0x4711) ORs the bits in variable + * variable&=~EHCA_BMASK_SET(MY_MASK,-1) clears the bits from the mask + * in variable + */ +#define EHCA_BMASK_SET(mask, value) \ + ((EHCA_BMASK_MASK(mask) & ((u64)(value))) << EHCA_BMASK_SHIFTPOS(mask)) + +/** + * EHCA_BMASK_GET - extract a parameter from value by mask + */ +#define EHCA_BMASK_GET(mask, value) \ + (EHCA_BMASK_MASK(mask) & (((u64)(value)) >> EHCA_BMASK_SHIFTPOS(mask))) + +/* Converts ehca to ib return code */ +int ehca2ib_return_code(u64 ehca_rc); + +#endif /* EHCA_TOOLS_H */ diff --git a/drivers/infiniband/hw/ehca/ehca_uverbs.c b/drivers/infiniband/hw/ehca/ehca_uverbs.c new file mode 100644 index 0000000..1a1d5d9 --- /dev/null +++ b/drivers/infiniband/hw/ehca/ehca_uverbs.c @@ -0,0 +1,309 @@ +/* + * IBM eServer eHCA Infiniband device driver for Linux on POWER + * + * userspace support verbs + * + * Authors: Christoph Raisch + * Hoang-Nam Nguyen + * Heiko J Schick + * + * Copyright (c) 2005 IBM Corporation + * + * All rights reserved. + * + * This source code is distributed under a dual license of GPL v2.0 and OpenIB + * BSD. + * + * OpenIB BSD License + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include + +#include "ehca_classes.h" +#include "ehca_iverbs.h" +#include "ehca_mrmw.h" +#include "ehca_tools.h" +#include "hcp_if.h" + +struct ib_ucontext *ehca_alloc_ucontext(struct ib_device *device, + struct ib_udata *udata) +{ + struct ehca_ucontext *my_context; + + my_context = kzalloc(sizeof *my_context, GFP_KERNEL); + if (!my_context) { + ehca_err(device, "Out of memory device=%p", device); + return ERR_PTR(-ENOMEM); + } + + return &my_context->ib_ucontext; +} + +int ehca_dealloc_ucontext(struct ib_ucontext *context) +{ + kfree(container_of(context, struct ehca_ucontext, ib_ucontext)); + return 0; +} + +static void ehca_mm_open(struct vm_area_struct *vma) +{ + u32 *count = (u32 *)vma->vm_private_data; + if (!count) { + ehca_gen_err("Invalid vma struct vm_start=%lx vm_end=%lx", + vma->vm_start, vma->vm_end); + return; + } + (*count)++; + if (!(*count)) + ehca_gen_err("Use count overflow vm_start=%lx vm_end=%lx", + vma->vm_start, vma->vm_end); + ehca_gen_dbg("vm_start=%lx vm_end=%lx count=%x", + vma->vm_start, vma->vm_end, *count); +} + +static void ehca_mm_close(struct vm_area_struct *vma) +{ + u32 *count = (u32 *)vma->vm_private_data; + if (!count) { + ehca_gen_err("Invalid vma struct vm_start=%lx vm_end=%lx", + vma->vm_start, vma->vm_end); + return; + } + (*count)--; + ehca_gen_dbg("vm_start=%lx vm_end=%lx count=%x", + vma->vm_start, vma->vm_end, *count); +} + +static const struct vm_operations_struct vm_ops = { + .open = ehca_mm_open, + .close = ehca_mm_close, +}; + +static int ehca_mmap_fw(struct vm_area_struct *vma, struct h_galpas *galpas, + u32 *mm_count) +{ + int ret; + u64 vsize, physical; + + vsize = vma->vm_end - vma->vm_start; + if (vsize < EHCA_PAGESIZE) { + ehca_gen_err("invalid vsize=%lx", vma->vm_end - vma->vm_start); + return -EINVAL; + } + + physical = galpas->user.fw_handle; + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + ehca_gen_dbg("vsize=%llx physical=%llx", vsize, physical); + /* VM_IO | VM_DONTEXPAND | VM_DONTDUMP are set by remap_pfn_range() */ + ret = remap_4k_pfn(vma, vma->vm_start, physical >> EHCA_PAGESHIFT, + vma->vm_page_prot); + if (unlikely(ret)) { + ehca_gen_err("remap_pfn_range() failed ret=%i", ret); + return -ENOMEM; + } + + vma->vm_private_data = mm_count; + (*mm_count)++; + vma->vm_ops = &vm_ops; + + return 0; +} + +static int ehca_mmap_queue(struct vm_area_struct *vma, struct ipz_queue *queue, + u32 *mm_count) +{ + int ret; + u64 start, ofs; + struct page *page; + + vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + start = vma->vm_start; + for (ofs = 0; ofs < queue->queue_length; ofs += PAGE_SIZE) { + u64 virt_addr = (u64)ipz_qeit_calc(queue, ofs); + page = virt_to_page(virt_addr); + ret = vm_insert_page(vma, start, page); + if (unlikely(ret)) { + ehca_gen_err("vm_insert_page() failed rc=%i", ret); + return ret; + } + start += PAGE_SIZE; + } + vma->vm_private_data = mm_count; + (*mm_count)++; + vma->vm_ops = &vm_ops; + + return 0; +} + +static int ehca_mmap_cq(struct vm_area_struct *vma, struct ehca_cq *cq, + u32 rsrc_type) +{ + int ret; + + switch (rsrc_type) { + case 0: /* galpa fw handle */ + ehca_dbg(cq->ib_cq.device, "cq_num=%x fw", cq->cq_number); + ret = ehca_mmap_fw(vma, &cq->galpas, &cq->mm_count_galpa); + if (unlikely(ret)) { + ehca_err(cq->ib_cq.device, + "ehca_mmap_fw() failed rc=%i cq_num=%x", + ret, cq->cq_number); + return ret; + } + break; + + case 1: /* cq queue_addr */ + ehca_dbg(cq->ib_cq.device, "cq_num=%x queue", cq->cq_number); + ret = ehca_mmap_queue(vma, &cq->ipz_queue, &cq->mm_count_queue); + if (unlikely(ret)) { + ehca_err(cq->ib_cq.device, + "ehca_mmap_queue() failed rc=%i cq_num=%x", + ret, cq->cq_number); + return ret; + } + break; + + default: + ehca_err(cq->ib_cq.device, "bad resource type=%x cq_num=%x", + rsrc_type, cq->cq_number); + return -EINVAL; + } + + return 0; +} + +static int ehca_mmap_qp(struct vm_area_struct *vma, struct ehca_qp *qp, + u32 rsrc_type) +{ + int ret; + + switch (rsrc_type) { + case 0: /* galpa fw handle */ + ehca_dbg(qp->ib_qp.device, "qp_num=%x fw", qp->ib_qp.qp_num); + ret = ehca_mmap_fw(vma, &qp->galpas, &qp->mm_count_galpa); + if (unlikely(ret)) { + ehca_err(qp->ib_qp.device, + "remap_pfn_range() failed ret=%i qp_num=%x", + ret, qp->ib_qp.qp_num); + return -ENOMEM; + } + break; + + case 1: /* qp rqueue_addr */ + ehca_dbg(qp->ib_qp.device, "qp_num=%x rq", qp->ib_qp.qp_num); + ret = ehca_mmap_queue(vma, &qp->ipz_rqueue, + &qp->mm_count_rqueue); + if (unlikely(ret)) { + ehca_err(qp->ib_qp.device, + "ehca_mmap_queue(rq) failed rc=%i qp_num=%x", + ret, qp->ib_qp.qp_num); + return ret; + } + break; + + case 2: /* qp squeue_addr */ + ehca_dbg(qp->ib_qp.device, "qp_num=%x sq", qp->ib_qp.qp_num); + ret = ehca_mmap_queue(vma, &qp->ipz_squeue, + &qp->mm_count_squeue); + if (unlikely(ret)) { + ehca_err(qp->ib_qp.device, + "ehca_mmap_queue(sq) failed rc=%i qp_num=%x", + ret, qp->ib_qp.qp_num); + return ret; + } + break; + + default: + ehca_err(qp->ib_qp.device, "bad resource type=%x qp=num=%x", + rsrc_type, qp->ib_qp.qp_num); + return -EINVAL; + } + + return 0; +} + +int ehca_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) +{ + u64 fileoffset = vma->vm_pgoff; + u32 idr_handle = fileoffset & 0x1FFFFFF; + u32 q_type = (fileoffset >> 27) & 0x1; /* CQ, QP,... */ + u32 rsrc_type = (fileoffset >> 25) & 0x3; /* sq,rq,cmnd_window */ + u32 ret; + struct ehca_cq *cq; + struct ehca_qp *qp; + struct ib_uobject *uobject; + + switch (q_type) { + case 0: /* CQ */ + read_lock(&ehca_cq_idr_lock); + cq = idr_find(&ehca_cq_idr, idr_handle); + read_unlock(&ehca_cq_idr_lock); + + /* make sure this mmap really belongs to the authorized user */ + if (!cq) + return -EINVAL; + + if (!cq->ib_cq.uobject || cq->ib_cq.uobject->context != context) + return -EINVAL; + + ret = ehca_mmap_cq(vma, cq, rsrc_type); + if (unlikely(ret)) { + ehca_err(cq->ib_cq.device, + "ehca_mmap_cq() failed rc=%i cq_num=%x", + ret, cq->cq_number); + return ret; + } + break; + + case 1: /* QP */ + read_lock(&ehca_qp_idr_lock); + qp = idr_find(&ehca_qp_idr, idr_handle); + read_unlock(&ehca_qp_idr_lock); + + /* make sure this mmap really belongs to the authorized user */ + if (!qp) + return -EINVAL; + + uobject = IS_SRQ(qp) ? qp->ib_srq.uobject : qp->ib_qp.uobject; + if (!uobject || uobject->context != context) + return -EINVAL; + + ret = ehca_mmap_qp(vma, qp, rsrc_type); + if (unlikely(ret)) { + ehca_err(qp->ib_qp.device, + "ehca_mmap_qp() failed rc=%i qp_num=%x", + ret, qp->ib_qp.qp_num); + return ret; + } + break; + + default: + ehca_gen_err("bad queue type %x", q_type); + return -EINVAL; + } + + return 0; +} diff --git a/drivers/infiniband/hw/ehca/hcp_if.c b/drivers/infiniband/hw/ehca/hcp_if.c new file mode 100644 index 0000000..89517ff --- /dev/null +++ b/drivers/infiniband/hw/ehca/hcp_if.c @@ -0,0 +1,949 @@ +/* + * IBM eServer eHCA Infiniband device driver for Linux on POWER + * + * Firmware Infiniband Interface code for POWER + * + * Authors: Christoph Raisch + * Hoang-Nam Nguyen + * Joachim Fenkes + * Gerd Bayer + * Waleri Fomin + * + * Copyright (c) 2005 IBM Corporation + * + * All rights reserved. + * + * This source code is distributed under a dual license of GPL v2.0 and OpenIB + * BSD. + * + * OpenIB BSD License + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include "ehca_tools.h" +#include "hcp_if.h" +#include "hcp_phyp.h" +#include "hipz_fns.h" +#include "ipz_pt_fn.h" + +#define H_ALL_RES_QP_ENHANCED_OPS EHCA_BMASK_IBM(9, 11) +#define H_ALL_RES_QP_PTE_PIN EHCA_BMASK_IBM(12, 12) +#define H_ALL_RES_QP_SERVICE_TYPE EHCA_BMASK_IBM(13, 15) +#define H_ALL_RES_QP_STORAGE EHCA_BMASK_IBM(16, 17) +#define H_ALL_RES_QP_LL_RQ_CQE_POSTING EHCA_BMASK_IBM(18, 18) +#define H_ALL_RES_QP_LL_SQ_CQE_POSTING EHCA_BMASK_IBM(19, 21) +#define H_ALL_RES_QP_SIGNALING_TYPE EHCA_BMASK_IBM(22, 23) +#define H_ALL_RES_QP_UD_AV_LKEY_CTRL EHCA_BMASK_IBM(31, 31) +#define H_ALL_RES_QP_SMALL_SQ_PAGE_SIZE EHCA_BMASK_IBM(32, 35) +#define H_ALL_RES_QP_SMALL_RQ_PAGE_SIZE EHCA_BMASK_IBM(36, 39) +#define H_ALL_RES_QP_RESOURCE_TYPE EHCA_BMASK_IBM(56, 63) + +#define H_ALL_RES_QP_MAX_OUTST_SEND_WR EHCA_BMASK_IBM(0, 15) +#define H_ALL_RES_QP_MAX_OUTST_RECV_WR EHCA_BMASK_IBM(16, 31) +#define H_ALL_RES_QP_MAX_SEND_SGE EHCA_BMASK_IBM(32, 39) +#define H_ALL_RES_QP_MAX_RECV_SGE EHCA_BMASK_IBM(40, 47) + +#define H_ALL_RES_QP_UD_AV_LKEY EHCA_BMASK_IBM(32, 63) +#define H_ALL_RES_QP_SRQ_QP_TOKEN EHCA_BMASK_IBM(0, 31) +#define H_ALL_RES_QP_SRQ_QP_HANDLE EHCA_BMASK_IBM(0, 64) +#define H_ALL_RES_QP_SRQ_LIMIT EHCA_BMASK_IBM(48, 63) +#define H_ALL_RES_QP_SRQ_QPN EHCA_BMASK_IBM(40, 63) + +#define H_ALL_RES_QP_ACT_OUTST_SEND_WR EHCA_BMASK_IBM(16, 31) +#define H_ALL_RES_QP_ACT_OUTST_RECV_WR EHCA_BMASK_IBM(48, 63) +#define H_ALL_RES_QP_ACT_SEND_SGE EHCA_BMASK_IBM(8, 15) +#define H_ALL_RES_QP_ACT_RECV_SGE EHCA_BMASK_IBM(24, 31) + +#define H_ALL_RES_QP_SQUEUE_SIZE_PAGES EHCA_BMASK_IBM(0, 31) +#define H_ALL_RES_QP_RQUEUE_SIZE_PAGES EHCA_BMASK_IBM(32, 63) + +#define H_MP_INIT_TYPE EHCA_BMASK_IBM(44, 47) +#define H_MP_SHUTDOWN EHCA_BMASK_IBM(48, 48) +#define H_MP_RESET_QKEY_CTR EHCA_BMASK_IBM(49, 49) + +#define HCALL4_REGS_FORMAT "r4=%lx r5=%lx r6=%lx r7=%lx" +#define HCALL7_REGS_FORMAT HCALL4_REGS_FORMAT " r8=%lx r9=%lx r10=%lx" +#define HCALL9_REGS_FORMAT HCALL7_REGS_FORMAT " r11=%lx r12=%lx" + +static DEFINE_SPINLOCK(hcall_lock); + +static long ehca_plpar_hcall_norets(unsigned long opcode, + unsigned long arg1, + unsigned long arg2, + unsigned long arg3, + unsigned long arg4, + unsigned long arg5, + unsigned long arg6, + unsigned long arg7) +{ + long ret; + int i, sleep_msecs; + unsigned long flags = 0; + + if (unlikely(ehca_debug_level >= 2)) + ehca_gen_dbg("opcode=%lx " HCALL7_REGS_FORMAT, + opcode, arg1, arg2, arg3, arg4, arg5, arg6, arg7); + + for (i = 0; i < 5; i++) { + /* serialize hCalls to work around firmware issue */ + if (ehca_lock_hcalls) + spin_lock_irqsave(&hcall_lock, flags); + + ret = plpar_hcall_norets(opcode, arg1, arg2, arg3, arg4, + arg5, arg6, arg7); + + if (ehca_lock_hcalls) + spin_unlock_irqrestore(&hcall_lock, flags); + + if (H_IS_LONG_BUSY(ret)) { + sleep_msecs = get_longbusy_msecs(ret); + msleep_interruptible(sleep_msecs); + continue; + } + + if (ret < H_SUCCESS) + ehca_gen_err("opcode=%lx ret=%li " HCALL7_REGS_FORMAT, + opcode, ret, arg1, arg2, arg3, + arg4, arg5, arg6, arg7); + else + if (unlikely(ehca_debug_level >= 2)) + ehca_gen_dbg("opcode=%lx ret=%li", opcode, ret); + + return ret; + } + + return H_BUSY; +} + +static long ehca_plpar_hcall9(unsigned long opcode, + unsigned long *outs, /* array of 9 outputs */ + unsigned long arg1, + unsigned long arg2, + unsigned long arg3, + unsigned long arg4, + unsigned long arg5, + unsigned long arg6, + unsigned long arg7, + unsigned long arg8, + unsigned long arg9) +{ + long ret; + int i, sleep_msecs; + unsigned long flags = 0; + + if (unlikely(ehca_debug_level >= 2)) + ehca_gen_dbg("INPUT -- opcode=%lx " HCALL9_REGS_FORMAT, opcode, + arg1, arg2, arg3, arg4, arg5, + arg6, arg7, arg8, arg9); + + for (i = 0; i < 5; i++) { + /* serialize hCalls to work around firmware issue */ + if (ehca_lock_hcalls) + spin_lock_irqsave(&hcall_lock, flags); + + ret = plpar_hcall9(opcode, outs, + arg1, arg2, arg3, arg4, arg5, + arg6, arg7, arg8, arg9); + + if (ehca_lock_hcalls) + spin_unlock_irqrestore(&hcall_lock, flags); + + if (H_IS_LONG_BUSY(ret)) { + sleep_msecs = get_longbusy_msecs(ret); + msleep_interruptible(sleep_msecs); + continue; + } + + if (ret < H_SUCCESS) { + ehca_gen_err("INPUT -- opcode=%lx " HCALL9_REGS_FORMAT, + opcode, arg1, arg2, arg3, arg4, arg5, + arg6, arg7, arg8, arg9); + ehca_gen_err("OUTPUT -- ret=%li " HCALL9_REGS_FORMAT, + ret, outs[0], outs[1], outs[2], outs[3], + outs[4], outs[5], outs[6], outs[7], + outs[8]); + } else if (unlikely(ehca_debug_level >= 2)) + ehca_gen_dbg("OUTPUT -- ret=%li " HCALL9_REGS_FORMAT, + ret, outs[0], outs[1], outs[2], outs[3], + outs[4], outs[5], outs[6], outs[7], + outs[8]); + return ret; + } + + return H_BUSY; +} + +u64 hipz_h_alloc_resource_eq(const struct ipz_adapter_handle adapter_handle, + struct ehca_pfeq *pfeq, + const u32 neq_control, + const u32 number_of_entries, + struct ipz_eq_handle *eq_handle, + u32 *act_nr_of_entries, + u32 *act_pages, + u32 *eq_ist) +{ + u64 ret; + unsigned long outs[PLPAR_HCALL9_BUFSIZE]; + u64 allocate_controls; + + /* resource type */ + allocate_controls = 3ULL; + + /* ISN is associated */ + if (neq_control != 1) + allocate_controls = (1ULL << (63 - 7)) | allocate_controls; + else /* notification event queue */ + allocate_controls = (1ULL << 63) | allocate_controls; + + ret = ehca_plpar_hcall9(H_ALLOC_RESOURCE, outs, + adapter_handle.handle, /* r4 */ + allocate_controls, /* r5 */ + number_of_entries, /* r6 */ + 0, 0, 0, 0, 0, 0); + eq_handle->handle = outs[0]; + *act_nr_of_entries = (u32)outs[3]; + *act_pages = (u32)outs[4]; + *eq_ist = (u32)outs[5]; + + if (ret == H_NOT_ENOUGH_RESOURCES) + ehca_gen_err("Not enough resource - ret=%lli ", ret); + + return ret; +} + +u64 hipz_h_reset_event(const struct ipz_adapter_handle adapter_handle, + struct ipz_eq_handle eq_handle, + const u64 event_mask) +{ + return ehca_plpar_hcall_norets(H_RESET_EVENTS, + adapter_handle.handle, /* r4 */ + eq_handle.handle, /* r5 */ + event_mask, /* r6 */ + 0, 0, 0, 0); +} + +u64 hipz_h_alloc_resource_cq(const struct ipz_adapter_handle adapter_handle, + struct ehca_cq *cq, + struct ehca_alloc_cq_parms *param) +{ + int rc; + u64 ret; + unsigned long outs[PLPAR_HCALL9_BUFSIZE]; + + ret = ehca_plpar_hcall9(H_ALLOC_RESOURCE, outs, + adapter_handle.handle, /* r4 */ + 2, /* r5 */ + param->eq_handle.handle, /* r6 */ + cq->token, /* r7 */ + param->nr_cqe, /* r8 */ + 0, 0, 0, 0); + cq->ipz_cq_handle.handle = outs[0]; + param->act_nr_of_entries = (u32)outs[3]; + param->act_pages = (u32)outs[4]; + + if (ret == H_SUCCESS) { + rc = hcp_galpas_ctor(&cq->galpas, 0, outs[5], outs[6]); + if (rc) { + ehca_gen_err("Could not establish HW access. rc=%d paddr=%#lx", + rc, outs[5]); + + ehca_plpar_hcall_norets(H_FREE_RESOURCE, + adapter_handle.handle, /* r4 */ + cq->ipz_cq_handle.handle, /* r5 */ + 0, 0, 0, 0, 0); + ret = H_NO_MEM; + } + } + + if (ret == H_NOT_ENOUGH_RESOURCES) + ehca_gen_err("Not enough resources. ret=%lli", ret); + + return ret; +} + +u64 hipz_h_alloc_resource_qp(const struct ipz_adapter_handle adapter_handle, + struct ehca_alloc_qp_parms *parms, int is_user) +{ + int rc; + u64 ret; + u64 allocate_controls, max_r10_reg, r11, r12; + unsigned long outs[PLPAR_HCALL9_BUFSIZE]; + + allocate_controls = + EHCA_BMASK_SET(H_ALL_RES_QP_ENHANCED_OPS, parms->ext_type) + | EHCA_BMASK_SET(H_ALL_RES_QP_PTE_PIN, 0) + | EHCA_BMASK_SET(H_ALL_RES_QP_SERVICE_TYPE, parms->servicetype) + | EHCA_BMASK_SET(H_ALL_RES_QP_SIGNALING_TYPE, parms->sigtype) + | EHCA_BMASK_SET(H_ALL_RES_QP_STORAGE, parms->qp_storage) + | EHCA_BMASK_SET(H_ALL_RES_QP_SMALL_SQ_PAGE_SIZE, + parms->squeue.page_size) + | EHCA_BMASK_SET(H_ALL_RES_QP_SMALL_RQ_PAGE_SIZE, + parms->rqueue.page_size) + | EHCA_BMASK_SET(H_ALL_RES_QP_LL_RQ_CQE_POSTING, + !!(parms->ll_comp_flags & LLQP_RECV_COMP)) + | EHCA_BMASK_SET(H_ALL_RES_QP_LL_SQ_CQE_POSTING, + !!(parms->ll_comp_flags & LLQP_SEND_COMP)) + | EHCA_BMASK_SET(H_ALL_RES_QP_UD_AV_LKEY_CTRL, + parms->ud_av_l_key_ctl) + | EHCA_BMASK_SET(H_ALL_RES_QP_RESOURCE_TYPE, 1); + + max_r10_reg = + EHCA_BMASK_SET(H_ALL_RES_QP_MAX_OUTST_SEND_WR, + parms->squeue.max_wr + 1) + | EHCA_BMASK_SET(H_ALL_RES_QP_MAX_OUTST_RECV_WR, + parms->rqueue.max_wr + 1) + | EHCA_BMASK_SET(H_ALL_RES_QP_MAX_SEND_SGE, + parms->squeue.max_sge) + | EHCA_BMASK_SET(H_ALL_RES_QP_MAX_RECV_SGE, + parms->rqueue.max_sge); + + r11 = EHCA_BMASK_SET(H_ALL_RES_QP_SRQ_QP_TOKEN, parms->srq_token); + + if (parms->ext_type == EQPT_SRQ) + r12 = EHCA_BMASK_SET(H_ALL_RES_QP_SRQ_LIMIT, parms->srq_limit); + else + r12 = EHCA_BMASK_SET(H_ALL_RES_QP_SRQ_QPN, parms->srq_qpn); + + ret = ehca_plpar_hcall9(H_ALLOC_RESOURCE, outs, + adapter_handle.handle, /* r4 */ + allocate_controls, /* r5 */ + parms->send_cq_handle.handle, + parms->recv_cq_handle.handle, + parms->eq_handle.handle, + ((u64)parms->token << 32) | parms->pd.value, + max_r10_reg, r11, r12); + + parms->qp_handle.handle = outs[0]; + parms->real_qp_num = (u32)outs[1]; + parms->squeue.act_nr_wqes = + (u16)EHCA_BMASK_GET(H_ALL_RES_QP_ACT_OUTST_SEND_WR, outs[2]); + parms->rqueue.act_nr_wqes = + (u16)EHCA_BMASK_GET(H_ALL_RES_QP_ACT_OUTST_RECV_WR, outs[2]); + parms->squeue.act_nr_sges = + (u8)EHCA_BMASK_GET(H_ALL_RES_QP_ACT_SEND_SGE, outs[3]); + parms->rqueue.act_nr_sges = + (u8)EHCA_BMASK_GET(H_ALL_RES_QP_ACT_RECV_SGE, outs[3]); + parms->squeue.queue_size = + (u32)EHCA_BMASK_GET(H_ALL_RES_QP_SQUEUE_SIZE_PAGES, outs[4]); + parms->rqueue.queue_size = + (u32)EHCA_BMASK_GET(H_ALL_RES_QP_RQUEUE_SIZE_PAGES, outs[4]); + + if (ret == H_SUCCESS) { + rc = hcp_galpas_ctor(&parms->galpas, is_user, outs[6], outs[6]); + if (rc) { + ehca_gen_err("Could not establish HW access. rc=%d paddr=%#lx", + rc, outs[6]); + + ehca_plpar_hcall_norets(H_FREE_RESOURCE, + adapter_handle.handle, /* r4 */ + parms->qp_handle.handle, /* r5 */ + 0, 0, 0, 0, 0); + ret = H_NO_MEM; + } + } + + if (ret == H_NOT_ENOUGH_RESOURCES) + ehca_gen_err("Not enough resources. ret=%lli", ret); + + return ret; +} + +u64 hipz_h_query_port(const struct ipz_adapter_handle adapter_handle, + const u8 port_id, + struct hipz_query_port *query_port_response_block) +{ + u64 ret; + u64 r_cb = __pa(query_port_response_block); + + if (r_cb & (EHCA_PAGESIZE-1)) { + ehca_gen_err("response block not page aligned"); + return H_PARAMETER; + } + + ret = ehca_plpar_hcall_norets(H_QUERY_PORT, + adapter_handle.handle, /* r4 */ + port_id, /* r5 */ + r_cb, /* r6 */ + 0, 0, 0, 0); + + if (ehca_debug_level >= 2) + ehca_dmp(query_port_response_block, 64, "response_block"); + + return ret; +} + +u64 hipz_h_modify_port(const struct ipz_adapter_handle adapter_handle, + const u8 port_id, const u32 port_cap, + const u8 init_type, const int modify_mask) +{ + u64 port_attributes = port_cap; + + if (modify_mask & IB_PORT_SHUTDOWN) + port_attributes |= EHCA_BMASK_SET(H_MP_SHUTDOWN, 1); + if (modify_mask & IB_PORT_INIT_TYPE) + port_attributes |= EHCA_BMASK_SET(H_MP_INIT_TYPE, init_type); + if (modify_mask & IB_PORT_RESET_QKEY_CNTR) + port_attributes |= EHCA_BMASK_SET(H_MP_RESET_QKEY_CTR, 1); + + return ehca_plpar_hcall_norets(H_MODIFY_PORT, + adapter_handle.handle, /* r4 */ + port_id, /* r5 */ + port_attributes, /* r6 */ + 0, 0, 0, 0); +} + +u64 hipz_h_query_hca(const struct ipz_adapter_handle adapter_handle, + struct hipz_query_hca *query_hca_rblock) +{ + u64 r_cb = __pa(query_hca_rblock); + + if (r_cb & (EHCA_PAGESIZE-1)) { + ehca_gen_err("response_block=%p not page aligned", + query_hca_rblock); + return H_PARAMETER; + } + + return ehca_plpar_hcall_norets(H_QUERY_HCA, + adapter_handle.handle, /* r4 */ + r_cb, /* r5 */ + 0, 0, 0, 0, 0); +} + +u64 hipz_h_register_rpage(const struct ipz_adapter_handle adapter_handle, + const u8 pagesize, + const u8 queue_type, + const u64 resource_handle, + const u64 logical_address_of_page, + u64 count) +{ + return ehca_plpar_hcall_norets(H_REGISTER_RPAGES, + adapter_handle.handle, /* r4 */ + (u64)queue_type | ((u64)pagesize) << 8, + /* r5 */ + resource_handle, /* r6 */ + logical_address_of_page, /* r7 */ + count, /* r8 */ + 0, 0); +} + +u64 hipz_h_register_rpage_eq(const struct ipz_adapter_handle adapter_handle, + const struct ipz_eq_handle eq_handle, + struct ehca_pfeq *pfeq, + const u8 pagesize, + const u8 queue_type, + const u64 logical_address_of_page, + const u64 count) +{ + if (count != 1) { + ehca_gen_err("Ppage counter=%llx", count); + return H_PARAMETER; + } + return hipz_h_register_rpage(adapter_handle, + pagesize, + queue_type, + eq_handle.handle, + logical_address_of_page, count); +} + +u64 hipz_h_query_int_state(const struct ipz_adapter_handle adapter_handle, + u32 ist) +{ + u64 ret; + ret = ehca_plpar_hcall_norets(H_QUERY_INT_STATE, + adapter_handle.handle, /* r4 */ + ist, /* r5 */ + 0, 0, 0, 0, 0); + + if (ret != H_SUCCESS && ret != H_BUSY) + ehca_gen_err("Could not query interrupt state."); + + return ret; +} + +u64 hipz_h_register_rpage_cq(const struct ipz_adapter_handle adapter_handle, + const struct ipz_cq_handle cq_handle, + struct ehca_pfcq *pfcq, + const u8 pagesize, + const u8 queue_type, + const u64 logical_address_of_page, + const u64 count, + const struct h_galpa gal) +{ + if (count != 1) { + ehca_gen_err("Page counter=%llx", count); + return H_PARAMETER; + } + + return hipz_h_register_rpage(adapter_handle, pagesize, queue_type, + cq_handle.handle, logical_address_of_page, + count); +} + +u64 hipz_h_register_rpage_qp(const struct ipz_adapter_handle adapter_handle, + const struct ipz_qp_handle qp_handle, + struct ehca_pfqp *pfqp, + const u8 pagesize, + const u8 queue_type, + const u64 logical_address_of_page, + const u64 count, + const struct h_galpa galpa) +{ + if (count > 1) { + ehca_gen_err("Page counter=%llx", count); + return H_PARAMETER; + } + + return hipz_h_register_rpage(adapter_handle, pagesize, queue_type, + qp_handle.handle, logical_address_of_page, + count); +} + +u64 hipz_h_disable_and_get_wqe(const struct ipz_adapter_handle adapter_handle, + const struct ipz_qp_handle qp_handle, + struct ehca_pfqp *pfqp, + void **log_addr_next_sq_wqe2processed, + void **log_addr_next_rq_wqe2processed, + int dis_and_get_function_code) +{ + u64 ret; + unsigned long outs[PLPAR_HCALL9_BUFSIZE]; + + ret = ehca_plpar_hcall9(H_DISABLE_AND_GETC, outs, + adapter_handle.handle, /* r4 */ + dis_and_get_function_code, /* r5 */ + qp_handle.handle, /* r6 */ + 0, 0, 0, 0, 0, 0); + if (log_addr_next_sq_wqe2processed) + *log_addr_next_sq_wqe2processed = (void *)outs[0]; + if (log_addr_next_rq_wqe2processed) + *log_addr_next_rq_wqe2processed = (void *)outs[1]; + + return ret; +} + +u64 hipz_h_modify_qp(const struct ipz_adapter_handle adapter_handle, + const struct ipz_qp_handle qp_handle, + struct ehca_pfqp *pfqp, + const u64 update_mask, + struct hcp_modify_qp_control_block *mqpcb, + struct h_galpa gal) +{ + u64 ret; + unsigned long outs[PLPAR_HCALL9_BUFSIZE]; + ret = ehca_plpar_hcall9(H_MODIFY_QP, outs, + adapter_handle.handle, /* r4 */ + qp_handle.handle, /* r5 */ + update_mask, /* r6 */ + __pa(mqpcb), /* r7 */ + 0, 0, 0, 0, 0); + + if (ret == H_NOT_ENOUGH_RESOURCES) + ehca_gen_err("Insufficient resources ret=%lli", ret); + + return ret; +} + +u64 hipz_h_query_qp(const struct ipz_adapter_handle adapter_handle, + const struct ipz_qp_handle qp_handle, + struct ehca_pfqp *pfqp, + struct hcp_modify_qp_control_block *qqpcb, + struct h_galpa gal) +{ + return ehca_plpar_hcall_norets(H_QUERY_QP, + adapter_handle.handle, /* r4 */ + qp_handle.handle, /* r5 */ + __pa(qqpcb), /* r6 */ + 0, 0, 0, 0); +} + +u64 hipz_h_destroy_qp(const struct ipz_adapter_handle adapter_handle, + struct ehca_qp *qp) +{ + u64 ret; + unsigned long outs[PLPAR_HCALL9_BUFSIZE]; + + ret = hcp_galpas_dtor(&qp->galpas); + if (ret) { + ehca_gen_err("Could not destruct qp->galpas"); + return H_RESOURCE; + } + ret = ehca_plpar_hcall9(H_DISABLE_AND_GETC, outs, + adapter_handle.handle, /* r4 */ + /* function code */ + 1, /* r5 */ + qp->ipz_qp_handle.handle, /* r6 */ + 0, 0, 0, 0, 0, 0); + if (ret == H_HARDWARE) + ehca_gen_err("HCA not operational. ret=%lli", ret); + + ret = ehca_plpar_hcall_norets(H_FREE_RESOURCE, + adapter_handle.handle, /* r4 */ + qp->ipz_qp_handle.handle, /* r5 */ + 0, 0, 0, 0, 0); + + if (ret == H_RESOURCE) + ehca_gen_err("Resource still in use. ret=%lli", ret); + + return ret; +} + +u64 hipz_h_define_aqp0(const struct ipz_adapter_handle adapter_handle, + const struct ipz_qp_handle qp_handle, + struct h_galpa gal, + u32 port) +{ + return ehca_plpar_hcall_norets(H_DEFINE_AQP0, + adapter_handle.handle, /* r4 */ + qp_handle.handle, /* r5 */ + port, /* r6 */ + 0, 0, 0, 0); +} + +u64 hipz_h_define_aqp1(const struct ipz_adapter_handle adapter_handle, + const struct ipz_qp_handle qp_handle, + struct h_galpa gal, + u32 port, u32 * pma_qp_nr, + u32 * bma_qp_nr) +{ + u64 ret; + unsigned long outs[PLPAR_HCALL9_BUFSIZE]; + + ret = ehca_plpar_hcall9(H_DEFINE_AQP1, outs, + adapter_handle.handle, /* r4 */ + qp_handle.handle, /* r5 */ + port, /* r6 */ + 0, 0, 0, 0, 0, 0); + *pma_qp_nr = (u32)outs[0]; + *bma_qp_nr = (u32)outs[1]; + + if (ret == H_ALIAS_EXIST) + ehca_gen_err("AQP1 already exists. ret=%lli", ret); + + return ret; +} + +u64 hipz_h_attach_mcqp(const struct ipz_adapter_handle adapter_handle, + const struct ipz_qp_handle qp_handle, + struct h_galpa gal, + u16 mcg_dlid, + u64 subnet_prefix, u64 interface_id) +{ + u64 ret; + + ret = ehca_plpar_hcall_norets(H_ATTACH_MCQP, + adapter_handle.handle, /* r4 */ + qp_handle.handle, /* r5 */ + mcg_dlid, /* r6 */ + interface_id, /* r7 */ + subnet_prefix, /* r8 */ + 0, 0); + + if (ret == H_NOT_ENOUGH_RESOURCES) + ehca_gen_err("Not enough resources. ret=%lli", ret); + + return ret; +} + +u64 hipz_h_detach_mcqp(const struct ipz_adapter_handle adapter_handle, + const struct ipz_qp_handle qp_handle, + struct h_galpa gal, + u16 mcg_dlid, + u64 subnet_prefix, u64 interface_id) +{ + return ehca_plpar_hcall_norets(H_DETACH_MCQP, + adapter_handle.handle, /* r4 */ + qp_handle.handle, /* r5 */ + mcg_dlid, /* r6 */ + interface_id, /* r7 */ + subnet_prefix, /* r8 */ + 0, 0); +} + +u64 hipz_h_destroy_cq(const struct ipz_adapter_handle adapter_handle, + struct ehca_cq *cq, + u8 force_flag) +{ + u64 ret; + + ret = hcp_galpas_dtor(&cq->galpas); + if (ret) { + ehca_gen_err("Could not destruct cp->galpas"); + return H_RESOURCE; + } + + ret = ehca_plpar_hcall_norets(H_FREE_RESOURCE, + adapter_handle.handle, /* r4 */ + cq->ipz_cq_handle.handle, /* r5 */ + force_flag != 0 ? 1L : 0L, /* r6 */ + 0, 0, 0, 0); + + if (ret == H_RESOURCE) + ehca_gen_err("H_FREE_RESOURCE failed ret=%lli ", ret); + + return ret; +} + +u64 hipz_h_destroy_eq(const struct ipz_adapter_handle adapter_handle, + struct ehca_eq *eq) +{ + u64 ret; + + ret = hcp_galpas_dtor(&eq->galpas); + if (ret) { + ehca_gen_err("Could not destruct eq->galpas"); + return H_RESOURCE; + } + + ret = ehca_plpar_hcall_norets(H_FREE_RESOURCE, + adapter_handle.handle, /* r4 */ + eq->ipz_eq_handle.handle, /* r5 */ + 0, 0, 0, 0, 0); + + if (ret == H_RESOURCE) + ehca_gen_err("Resource in use. ret=%lli ", ret); + + return ret; +} + +u64 hipz_h_alloc_resource_mr(const struct ipz_adapter_handle adapter_handle, + const struct ehca_mr *mr, + const u64 vaddr, + const u64 length, + const u32 access_ctrl, + const struct ipz_pd pd, + struct ehca_mr_hipzout_parms *outparms) +{ + u64 ret; + unsigned long outs[PLPAR_HCALL9_BUFSIZE]; + + ret = ehca_plpar_hcall9(H_ALLOC_RESOURCE, outs, + adapter_handle.handle, /* r4 */ + 5, /* r5 */ + vaddr, /* r6 */ + length, /* r7 */ + (((u64)access_ctrl) << 32ULL), /* r8 */ + pd.value, /* r9 */ + 0, 0, 0); + outparms->handle.handle = outs[0]; + outparms->lkey = (u32)outs[2]; + outparms->rkey = (u32)outs[3]; + + return ret; +} + +u64 hipz_h_register_rpage_mr(const struct ipz_adapter_handle adapter_handle, + const struct ehca_mr *mr, + const u8 pagesize, + const u8 queue_type, + const u64 logical_address_of_page, + const u64 count) +{ + u64 ret; + + if (unlikely(ehca_debug_level >= 3)) { + if (count > 1) { + u64 *kpage; + int i; + kpage = __va(logical_address_of_page); + for (i = 0; i < count; i++) + ehca_gen_dbg("kpage[%d]=%p", + i, (void *)kpage[i]); + } else + ehca_gen_dbg("kpage=%p", + (void *)logical_address_of_page); + } + + if ((count > 1) && (logical_address_of_page & (EHCA_PAGESIZE-1))) { + ehca_gen_err("logical_address_of_page not on a 4k boundary " + "adapter_handle=%llx mr=%p mr_handle=%llx " + "pagesize=%x queue_type=%x " + "logical_address_of_page=%llx count=%llx", + adapter_handle.handle, mr, + mr->ipz_mr_handle.handle, pagesize, queue_type, + logical_address_of_page, count); + ret = H_PARAMETER; + } else + ret = hipz_h_register_rpage(adapter_handle, pagesize, + queue_type, + mr->ipz_mr_handle.handle, + logical_address_of_page, count); + return ret; +} + +u64 hipz_h_query_mr(const struct ipz_adapter_handle adapter_handle, + const struct ehca_mr *mr, + struct ehca_mr_hipzout_parms *outparms) +{ + u64 ret; + unsigned long outs[PLPAR_HCALL9_BUFSIZE]; + + ret = ehca_plpar_hcall9(H_QUERY_MR, outs, + adapter_handle.handle, /* r4 */ + mr->ipz_mr_handle.handle, /* r5 */ + 0, 0, 0, 0, 0, 0, 0); + outparms->len = outs[0]; + outparms->vaddr = outs[1]; + outparms->acl = outs[4] >> 32; + outparms->lkey = (u32)(outs[5] >> 32); + outparms->rkey = (u32)(outs[5] & (0xffffffff)); + + return ret; +} + +u64 hipz_h_free_resource_mr(const struct ipz_adapter_handle adapter_handle, + const struct ehca_mr *mr) +{ + return ehca_plpar_hcall_norets(H_FREE_RESOURCE, + adapter_handle.handle, /* r4 */ + mr->ipz_mr_handle.handle, /* r5 */ + 0, 0, 0, 0, 0); +} + +u64 hipz_h_reregister_pmr(const struct ipz_adapter_handle adapter_handle, + const struct ehca_mr *mr, + const u64 vaddr_in, + const u64 length, + const u32 access_ctrl, + const struct ipz_pd pd, + const u64 mr_addr_cb, + struct ehca_mr_hipzout_parms *outparms) +{ + u64 ret; + unsigned long outs[PLPAR_HCALL9_BUFSIZE]; + + ret = ehca_plpar_hcall9(H_REREGISTER_PMR, outs, + adapter_handle.handle, /* r4 */ + mr->ipz_mr_handle.handle, /* r5 */ + vaddr_in, /* r6 */ + length, /* r7 */ + /* r8 */ + ((((u64)access_ctrl) << 32ULL) | pd.value), + mr_addr_cb, /* r9 */ + 0, 0, 0); + outparms->vaddr = outs[1]; + outparms->lkey = (u32)outs[2]; + outparms->rkey = (u32)outs[3]; + + return ret; +} + +u64 hipz_h_register_smr(const struct ipz_adapter_handle adapter_handle, + const struct ehca_mr *mr, + const struct ehca_mr *orig_mr, + const u64 vaddr_in, + const u32 access_ctrl, + const struct ipz_pd pd, + struct ehca_mr_hipzout_parms *outparms) +{ + u64 ret; + unsigned long outs[PLPAR_HCALL9_BUFSIZE]; + + ret = ehca_plpar_hcall9(H_REGISTER_SMR, outs, + adapter_handle.handle, /* r4 */ + orig_mr->ipz_mr_handle.handle, /* r5 */ + vaddr_in, /* r6 */ + (((u64)access_ctrl) << 32ULL), /* r7 */ + pd.value, /* r8 */ + 0, 0, 0, 0); + outparms->handle.handle = outs[0]; + outparms->lkey = (u32)outs[2]; + outparms->rkey = (u32)outs[3]; + + return ret; +} + +u64 hipz_h_alloc_resource_mw(const struct ipz_adapter_handle adapter_handle, + const struct ehca_mw *mw, + const struct ipz_pd pd, + struct ehca_mw_hipzout_parms *outparms) +{ + u64 ret; + unsigned long outs[PLPAR_HCALL9_BUFSIZE]; + + ret = ehca_plpar_hcall9(H_ALLOC_RESOURCE, outs, + adapter_handle.handle, /* r4 */ + 6, /* r5 */ + pd.value, /* r6 */ + 0, 0, 0, 0, 0, 0); + outparms->handle.handle = outs[0]; + outparms->rkey = (u32)outs[3]; + + return ret; +} + +u64 hipz_h_query_mw(const struct ipz_adapter_handle adapter_handle, + const struct ehca_mw *mw, + struct ehca_mw_hipzout_parms *outparms) +{ + u64 ret; + unsigned long outs[PLPAR_HCALL9_BUFSIZE]; + + ret = ehca_plpar_hcall9(H_QUERY_MW, outs, + adapter_handle.handle, /* r4 */ + mw->ipz_mw_handle.handle, /* r5 */ + 0, 0, 0, 0, 0, 0, 0); + outparms->rkey = (u32)outs[3]; + + return ret; +} + +u64 hipz_h_free_resource_mw(const struct ipz_adapter_handle adapter_handle, + const struct ehca_mw *mw) +{ + return ehca_plpar_hcall_norets(H_FREE_RESOURCE, + adapter_handle.handle, /* r4 */ + mw->ipz_mw_handle.handle, /* r5 */ + 0, 0, 0, 0, 0); +} + +u64 hipz_h_error_data(const struct ipz_adapter_handle adapter_handle, + const u64 ressource_handle, + void *rblock, + unsigned long *byte_count) +{ + u64 r_cb = __pa(rblock); + + if (r_cb & (EHCA_PAGESIZE-1)) { + ehca_gen_err("rblock not page aligned."); + return H_PARAMETER; + } + + return ehca_plpar_hcall_norets(H_ERROR_DATA, + adapter_handle.handle, + ressource_handle, + r_cb, + 0, 0, 0, 0); +} + +u64 hipz_h_eoi(int irq) +{ + unsigned long xirr; + + iosync(); + xirr = (0xffULL << 24) | irq; + + return plpar_hcall_norets(H_EOI, xirr); +} diff --git a/drivers/infiniband/hw/ehca/hcp_if.h b/drivers/infiniband/hw/ehca/hcp_if.h new file mode 100644 index 0000000..a46e514 --- /dev/null +++ b/drivers/infiniband/hw/ehca/hcp_if.h @@ -0,0 +1,265 @@ +/* + * IBM eServer eHCA Infiniband device driver for Linux on POWER + * + * Firmware Infiniband Interface code for POWER + * + * Authors: Christoph Raisch + * Hoang-Nam Nguyen + * Gerd Bayer + * Waleri Fomin + * + * Copyright (c) 2005 IBM Corporation + * + * All rights reserved. + * + * This source code is distributed under a dual license of GPL v2.0 and OpenIB + * BSD. + * + * OpenIB BSD License + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __HCP_IF_H__ +#define __HCP_IF_H__ + +#include "ehca_classes.h" +#include "ehca_tools.h" +#include "hipz_hw.h" + +/* + * hipz_h_alloc_resource_eq allocates EQ resources in HW and FW, initialize + * resources, create the empty EQPT (ring). + */ +u64 hipz_h_alloc_resource_eq(const struct ipz_adapter_handle adapter_handle, + struct ehca_pfeq *pfeq, + const u32 neq_control, + const u32 number_of_entries, + struct ipz_eq_handle *eq_handle, + u32 * act_nr_of_entries, + u32 * act_pages, + u32 * eq_ist); + +u64 hipz_h_reset_event(const struct ipz_adapter_handle adapter_handle, + struct ipz_eq_handle eq_handle, + const u64 event_mask); +/* + * hipz_h_allocate_resource_cq allocates CQ resources in HW and FW, initialize + * resources, create the empty CQPT (ring). + */ +u64 hipz_h_alloc_resource_cq(const struct ipz_adapter_handle adapter_handle, + struct ehca_cq *cq, + struct ehca_alloc_cq_parms *param); + + +/* + * hipz_h_alloc_resource_qp allocates QP resources in HW and FW, + * initialize resources, create empty QPPTs (2 rings). + */ +u64 hipz_h_alloc_resource_qp(const struct ipz_adapter_handle adapter_handle, + struct ehca_alloc_qp_parms *parms, int is_user); + +u64 hipz_h_query_port(const struct ipz_adapter_handle adapter_handle, + const u8 port_id, + struct hipz_query_port *query_port_response_block); + +u64 hipz_h_modify_port(const struct ipz_adapter_handle adapter_handle, + const u8 port_id, const u32 port_cap, + const u8 init_type, const int modify_mask); + +u64 hipz_h_query_hca(const struct ipz_adapter_handle adapter_handle, + struct hipz_query_hca *query_hca_rblock); + +/* + * hipz_h_register_rpage internal function in hcp_if.h for all + * hcp_H_REGISTER_RPAGE calls. + */ +u64 hipz_h_register_rpage(const struct ipz_adapter_handle adapter_handle, + const u8 pagesize, + const u8 queue_type, + const u64 resource_handle, + const u64 logical_address_of_page, + u64 count); + +u64 hipz_h_register_rpage_eq(const struct ipz_adapter_handle adapter_handle, + const struct ipz_eq_handle eq_handle, + struct ehca_pfeq *pfeq, + const u8 pagesize, + const u8 queue_type, + const u64 logical_address_of_page, + const u64 count); + +u64 hipz_h_query_int_state(const struct ipz_adapter_handle + hcp_adapter_handle, + u32 ist); + +u64 hipz_h_register_rpage_cq(const struct ipz_adapter_handle adapter_handle, + const struct ipz_cq_handle cq_handle, + struct ehca_pfcq *pfcq, + const u8 pagesize, + const u8 queue_type, + const u64 logical_address_of_page, + const u64 count, + const struct h_galpa gal); + +u64 hipz_h_register_rpage_qp(const struct ipz_adapter_handle adapter_handle, + const struct ipz_qp_handle qp_handle, + struct ehca_pfqp *pfqp, + const u8 pagesize, + const u8 queue_type, + const u64 logical_address_of_page, + const u64 count, + const struct h_galpa galpa); + +u64 hipz_h_disable_and_get_wqe(const struct ipz_adapter_handle adapter_handle, + const struct ipz_qp_handle qp_handle, + struct ehca_pfqp *pfqp, + void **log_addr_next_sq_wqe_tb_processed, + void **log_addr_next_rq_wqe_tb_processed, + int dis_and_get_function_code); +enum hcall_sigt { + HCALL_SIGT_NO_CQE = 0, + HCALL_SIGT_BY_WQE = 1, + HCALL_SIGT_EVERY = 2 +}; + +u64 hipz_h_modify_qp(const struct ipz_adapter_handle adapter_handle, + const struct ipz_qp_handle qp_handle, + struct ehca_pfqp *pfqp, + const u64 update_mask, + struct hcp_modify_qp_control_block *mqpcb, + struct h_galpa gal); + +u64 hipz_h_query_qp(const struct ipz_adapter_handle adapter_handle, + const struct ipz_qp_handle qp_handle, + struct ehca_pfqp *pfqp, + struct hcp_modify_qp_control_block *qqpcb, + struct h_galpa gal); + +u64 hipz_h_destroy_qp(const struct ipz_adapter_handle adapter_handle, + struct ehca_qp *qp); + +u64 hipz_h_define_aqp0(const struct ipz_adapter_handle adapter_handle, + const struct ipz_qp_handle qp_handle, + struct h_galpa gal, + u32 port); + +u64 hipz_h_define_aqp1(const struct ipz_adapter_handle adapter_handle, + const struct ipz_qp_handle qp_handle, + struct h_galpa gal, + u32 port, u32 * pma_qp_nr, + u32 * bma_qp_nr); + +u64 hipz_h_attach_mcqp(const struct ipz_adapter_handle adapter_handle, + const struct ipz_qp_handle qp_handle, + struct h_galpa gal, + u16 mcg_dlid, + u64 subnet_prefix, u64 interface_id); + +u64 hipz_h_detach_mcqp(const struct ipz_adapter_handle adapter_handle, + const struct ipz_qp_handle qp_handle, + struct h_galpa gal, + u16 mcg_dlid, + u64 subnet_prefix, u64 interface_id); + +u64 hipz_h_destroy_cq(const struct ipz_adapter_handle adapter_handle, + struct ehca_cq *cq, + u8 force_flag); + +u64 hipz_h_destroy_eq(const struct ipz_adapter_handle adapter_handle, + struct ehca_eq *eq); + +/* + * hipz_h_alloc_resource_mr allocates MR resources in HW and FW, initialize + * resources. + */ +u64 hipz_h_alloc_resource_mr(const struct ipz_adapter_handle adapter_handle, + const struct ehca_mr *mr, + const u64 vaddr, + const u64 length, + const u32 access_ctrl, + const struct ipz_pd pd, + struct ehca_mr_hipzout_parms *outparms); + +/* hipz_h_register_rpage_mr registers MR resource pages in HW and FW */ +u64 hipz_h_register_rpage_mr(const struct ipz_adapter_handle adapter_handle, + const struct ehca_mr *mr, + const u8 pagesize, + const u8 queue_type, + const u64 logical_address_of_page, + const u64 count); + +/* hipz_h_query_mr queries MR in HW and FW */ +u64 hipz_h_query_mr(const struct ipz_adapter_handle adapter_handle, + const struct ehca_mr *mr, + struct ehca_mr_hipzout_parms *outparms); + +/* hipz_h_free_resource_mr frees MR resources in HW and FW */ +u64 hipz_h_free_resource_mr(const struct ipz_adapter_handle adapter_handle, + const struct ehca_mr *mr); + +/* hipz_h_reregister_pmr reregisters MR in HW and FW */ +u64 hipz_h_reregister_pmr(const struct ipz_adapter_handle adapter_handle, + const struct ehca_mr *mr, + const u64 vaddr_in, + const u64 length, + const u32 access_ctrl, + const struct ipz_pd pd, + const u64 mr_addr_cb, + struct ehca_mr_hipzout_parms *outparms); + +/* hipz_h_register_smr register shared MR in HW and FW */ +u64 hipz_h_register_smr(const struct ipz_adapter_handle adapter_handle, + const struct ehca_mr *mr, + const struct ehca_mr *orig_mr, + const u64 vaddr_in, + const u32 access_ctrl, + const struct ipz_pd pd, + struct ehca_mr_hipzout_parms *outparms); + +/* + * hipz_h_alloc_resource_mw allocates MW resources in HW and FW, initialize + * resources. + */ +u64 hipz_h_alloc_resource_mw(const struct ipz_adapter_handle adapter_handle, + const struct ehca_mw *mw, + const struct ipz_pd pd, + struct ehca_mw_hipzout_parms *outparms); + +/* hipz_h_query_mw queries MW in HW and FW */ +u64 hipz_h_query_mw(const struct ipz_adapter_handle adapter_handle, + const struct ehca_mw *mw, + struct ehca_mw_hipzout_parms *outparms); + +/* hipz_h_free_resource_mw frees MW resources in HW and FW */ +u64 hipz_h_free_resource_mw(const struct ipz_adapter_handle adapter_handle, + const struct ehca_mw *mw); + +u64 hipz_h_error_data(const struct ipz_adapter_handle adapter_handle, + const u64 ressource_handle, + void *rblock, + unsigned long *byte_count); +u64 hipz_h_eoi(int irq); + +#endif /* __HCP_IF_H__ */ diff --git a/drivers/infiniband/hw/ehca/hcp_phyp.c b/drivers/infiniband/hw/ehca/hcp_phyp.c new file mode 100644 index 0000000..077376f --- /dev/null +++ b/drivers/infiniband/hw/ehca/hcp_phyp.c @@ -0,0 +1,82 @@ +/* + * IBM eServer eHCA Infiniband device driver for Linux on POWER + * + * load store abstraction for ehca register access with tracing + * + * Authors: Christoph Raisch + * Hoang-Nam Nguyen + * + * Copyright (c) 2005 IBM Corporation + * + * All rights reserved. + * + * This source code is distributed under a dual license of GPL v2.0 and OpenIB + * BSD. + * + * OpenIB BSD License + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "ehca_classes.h" +#include "hipz_hw.h" + +u64 hcall_map_page(u64 physaddr) +{ + return (u64)ioremap(physaddr, EHCA_PAGESIZE); +} + +int hcall_unmap_page(u64 mapaddr) +{ + iounmap((volatile void __iomem *) mapaddr); + return 0; +} + +int hcp_galpas_ctor(struct h_galpas *galpas, int is_user, + u64 paddr_kernel, u64 paddr_user) +{ + if (!is_user) { + galpas->kernel.fw_handle = hcall_map_page(paddr_kernel); + if (!galpas->kernel.fw_handle) + return -ENOMEM; + } else + galpas->kernel.fw_handle = 0; + + galpas->user.fw_handle = paddr_user; + + return 0; +} + +int hcp_galpas_dtor(struct h_galpas *galpas) +{ + if (galpas->kernel.fw_handle) { + int ret = hcall_unmap_page(galpas->kernel.fw_handle); + if (ret) + return ret; + } + + galpas->user.fw_handle = galpas->kernel.fw_handle = 0; + + return 0; +} diff --git a/drivers/infiniband/hw/ehca/hcp_phyp.h b/drivers/infiniband/hw/ehca/hcp_phyp.h new file mode 100644 index 0000000..d1b0299 --- /dev/null +++ b/drivers/infiniband/hw/ehca/hcp_phyp.h @@ -0,0 +1,90 @@ +/* + * IBM eServer eHCA Infiniband device driver for Linux on POWER + * + * Firmware calls + * + * Authors: Christoph Raisch + * Hoang-Nam Nguyen + * Waleri Fomin + * Gerd Bayer + * + * Copyright (c) 2005 IBM Corporation + * + * All rights reserved. + * + * This source code is distributed under a dual license of GPL v2.0 and OpenIB + * BSD. + * + * OpenIB BSD License + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __HCP_PHYP_H__ +#define __HCP_PHYP_H__ + + +/* + * eHCA page (mapped into memory) + * resource to access eHCA register pages in CPU address space +*/ +struct h_galpa { + u64 fw_handle; + /* for pSeries this is a 64bit memory address where + I/O memory is mapped into CPU address space (kv) */ +}; + +/* + * resource to access eHCA address space registers, all types + */ +struct h_galpas { + u32 pid; /*PID of userspace galpa checking */ + struct h_galpa user; /* user space accessible resource, + set to 0 if unused */ + struct h_galpa kernel; /* kernel space accessible resource, + set to 0 if unused */ +}; + +static inline u64 hipz_galpa_load(struct h_galpa galpa, u32 offset) +{ + u64 addr = galpa.fw_handle + offset; + return *(volatile u64 __force *)addr; +} + +static inline void hipz_galpa_store(struct h_galpa galpa, u32 offset, u64 value) +{ + u64 addr = galpa.fw_handle + offset; + *(volatile u64 __force *)addr = value; +} + +int hcp_galpas_ctor(struct h_galpas *galpas, int is_user, + u64 paddr_kernel, u64 paddr_user); + +int hcp_galpas_dtor(struct h_galpas *galpas); + +u64 hcall_map_page(u64 physaddr); + +int hcall_unmap_page(u64 mapaddr); + +#endif diff --git a/drivers/infiniband/hw/ehca/hipz_fns.h b/drivers/infiniband/hw/ehca/hipz_fns.h new file mode 100644 index 0000000..9dac93d --- /dev/null +++ b/drivers/infiniband/hw/ehca/hipz_fns.h @@ -0,0 +1,68 @@ +/* + * IBM eServer eHCA Infiniband device driver for Linux on POWER + * + * HW abstraction register functions + * + * Authors: Christoph Raisch + * Reinhard Ernst + * + * Copyright (c) 2005 IBM Corporation + * + * All rights reserved. + * + * This source code is distributed under a dual license of GPL v2.0 and OpenIB + * BSD. + * + * OpenIB BSD License + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __HIPZ_FNS_H__ +#define __HIPZ_FNS_H__ + +#include "ehca_classes.h" +#include "hipz_hw.h" + +#include "hipz_fns_core.h" + +#define hipz_galpa_store_eq(gal, offset, value) \ + hipz_galpa_store(gal, EQTEMM_OFFSET(offset), value) + +#define hipz_galpa_load_eq(gal, offset) \ + hipz_galpa_load(gal, EQTEMM_OFFSET(offset)) + +#define hipz_galpa_store_qped(gal, offset, value) \ + hipz_galpa_store(gal, QPEDMM_OFFSET(offset), value) + +#define hipz_galpa_load_qped(gal, offset) \ + hipz_galpa_load(gal, QPEDMM_OFFSET(offset)) + +#define hipz_galpa_store_mrmw(gal, offset, value) \ + hipz_galpa_store(gal, MRMWMM_OFFSET(offset), value) + +#define hipz_galpa_load_mrmw(gal, offset) \ + hipz_galpa_load(gal, MRMWMM_OFFSET(offset)) + +#endif diff --git a/drivers/infiniband/hw/ehca/hipz_fns_core.h b/drivers/infiniband/hw/ehca/hipz_fns_core.h new file mode 100644 index 0000000..868735f --- /dev/null +++ b/drivers/infiniband/hw/ehca/hipz_fns_core.h @@ -0,0 +1,100 @@ +/* + * IBM eServer eHCA Infiniband device driver for Linux on POWER + * + * HW abstraction register functions + * + * Authors: Christoph Raisch + * Heiko J Schick + * Hoang-Nam Nguyen + * Reinhard Ernst + * + * Copyright (c) 2005 IBM Corporation + * + * All rights reserved. + * + * This source code is distributed under a dual license of GPL v2.0 and OpenIB + * BSD. + * + * OpenIB BSD License + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __HIPZ_FNS_CORE_H__ +#define __HIPZ_FNS_CORE_H__ + +#include "hcp_phyp.h" +#include "hipz_hw.h" + +#define hipz_galpa_store_cq(gal, offset, value) \ + hipz_galpa_store(gal, CQTEMM_OFFSET(offset), value) + +#define hipz_galpa_load_cq(gal, offset) \ + hipz_galpa_load(gal, CQTEMM_OFFSET(offset)) + +#define hipz_galpa_store_qp(gal, offset, value) \ + hipz_galpa_store(gal, QPTEMM_OFFSET(offset), value) +#define hipz_galpa_load_qp(gal, offset) \ + hipz_galpa_load(gal, QPTEMM_OFFSET(offset)) + +static inline void hipz_update_sqa(struct ehca_qp *qp, u16 nr_wqes) +{ + /* ringing doorbell :-) */ + hipz_galpa_store_qp(qp->galpas.kernel, qpx_sqa, + EHCA_BMASK_SET(QPX_SQADDER, nr_wqes)); +} + +static inline void hipz_update_rqa(struct ehca_qp *qp, u16 nr_wqes) +{ + /* ringing doorbell :-) */ + hipz_galpa_store_qp(qp->galpas.kernel, qpx_rqa, + EHCA_BMASK_SET(QPX_RQADDER, nr_wqes)); +} + +static inline void hipz_update_feca(struct ehca_cq *cq, u32 nr_cqes) +{ + hipz_galpa_store_cq(cq->galpas.kernel, cqx_feca, + EHCA_BMASK_SET(CQX_FECADDER, nr_cqes)); +} + +static inline void hipz_set_cqx_n0(struct ehca_cq *cq, u32 value) +{ + u64 cqx_n0_reg; + + hipz_galpa_store_cq(cq->galpas.kernel, cqx_n0, + EHCA_BMASK_SET(CQX_N0_GENERATE_SOLICITED_COMP_EVENT, + value)); + cqx_n0_reg = hipz_galpa_load_cq(cq->galpas.kernel, cqx_n0); +} + +static inline void hipz_set_cqx_n1(struct ehca_cq *cq, u32 value) +{ + u64 cqx_n1_reg; + + hipz_galpa_store_cq(cq->galpas.kernel, cqx_n1, + EHCA_BMASK_SET(CQX_N1_GENERATE_COMP_EVENT, value)); + cqx_n1_reg = hipz_galpa_load_cq(cq->galpas.kernel, cqx_n1); +} + +#endif /* __HIPZ_FNC_CORE_H__ */ diff --git a/drivers/infiniband/hw/ehca/hipz_hw.h b/drivers/infiniband/hw/ehca/hipz_hw.h new file mode 100644 index 0000000..bf996c7 --- /dev/null +++ b/drivers/infiniband/hw/ehca/hipz_hw.h @@ -0,0 +1,414 @@ +/* + * IBM eServer eHCA Infiniband device driver for Linux on POWER + * + * eHCA register definitions + * + * Authors: Waleri Fomin + * Christoph Raisch + * Reinhard Ernst + * + * Copyright (c) 2005 IBM Corporation + * + * All rights reserved. + * + * This source code is distributed under a dual license of GPL v2.0 and OpenIB + * BSD. + * + * OpenIB BSD License + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __HIPZ_HW_H__ +#define __HIPZ_HW_H__ + +#include "ehca_tools.h" + +#define EHCA_MAX_MTU 4 + +/* QP Table Entry Memory Map */ +struct hipz_qptemm { + u64 qpx_hcr; + u64 qpx_c; + u64 qpx_herr; + u64 qpx_aer; +/* 0x20*/ + u64 qpx_sqa; + u64 qpx_sqc; + u64 qpx_rqa; + u64 qpx_rqc; +/* 0x40*/ + u64 qpx_st; + u64 qpx_pmstate; + u64 qpx_pmfa; + u64 qpx_pkey; +/* 0x60*/ + u64 qpx_pkeya; + u64 qpx_pkeyb; + u64 qpx_pkeyc; + u64 qpx_pkeyd; +/* 0x80*/ + u64 qpx_qkey; + u64 qpx_dqp; + u64 qpx_dlidp; + u64 qpx_portp; +/* 0xa0*/ + u64 qpx_slidp; + u64 qpx_slidpp; + u64 qpx_dlida; + u64 qpx_porta; +/* 0xc0*/ + u64 qpx_slida; + u64 qpx_slidpa; + u64 qpx_slvl; + u64 qpx_ipd; +/* 0xe0*/ + u64 qpx_mtu; + u64 qpx_lato; + u64 qpx_rlimit; + u64 qpx_rnrlimit; +/* 0x100*/ + u64 qpx_t; + u64 qpx_sqhp; + u64 qpx_sqptp; + u64 qpx_nspsn; +/* 0x120*/ + u64 qpx_nspsnhwm; + u64 reserved1; + u64 qpx_sdsi; + u64 qpx_sdsbc; +/* 0x140*/ + u64 qpx_sqwsize; + u64 qpx_sqwts; + u64 qpx_lsn; + u64 qpx_nssn; +/* 0x160 */ + u64 qpx_mor; + u64 qpx_cor; + u64 qpx_sqsize; + u64 qpx_erc; +/* 0x180*/ + u64 qpx_rnrrc; + u64 qpx_ernrwt; + u64 qpx_rnrresp; + u64 qpx_lmsna; +/* 0x1a0 */ + u64 qpx_sqhpc; + u64 qpx_sqcptp; + u64 qpx_sigt; + u64 qpx_wqecnt; +/* 0x1c0*/ + u64 qpx_rqhp; + u64 qpx_rqptp; + u64 qpx_rqsize; + u64 qpx_nrr; +/* 0x1e0*/ + u64 qpx_rdmac; + u64 qpx_nrpsn; + u64 qpx_lapsn; + u64 qpx_lcr; +/* 0x200*/ + u64 qpx_rwc; + u64 qpx_rwva; + u64 qpx_rdsi; + u64 qpx_rdsbc; +/* 0x220*/ + u64 qpx_rqwsize; + u64 qpx_crmsn; + u64 qpx_rdd; + u64 qpx_larpsn; +/* 0x240*/ + u64 qpx_pd; + u64 qpx_scqn; + u64 qpx_rcqn; + u64 qpx_aeqn; +/* 0x260*/ + u64 qpx_aaelog; + u64 qpx_ram; + u64 qpx_rdmaqe0; + u64 qpx_rdmaqe1; +/* 0x280*/ + u64 qpx_rdmaqe2; + u64 qpx_rdmaqe3; + u64 qpx_nrpsnhwm; +/* 0x298*/ + u64 reserved[(0x400 - 0x298) / 8]; +/* 0x400 extended data */ + u64 reserved_ext[(0x500 - 0x400) / 8]; +/* 0x500 */ + u64 reserved2[(0x1000 - 0x500) / 8]; +/* 0x1000 */ +}; + +#define QPX_SQADDER EHCA_BMASK_IBM(48, 63) +#define QPX_RQADDER EHCA_BMASK_IBM(48, 63) +#define QPX_AAELOG_RESET_SRQ_LIMIT EHCA_BMASK_IBM(3, 3) + +#define QPTEMM_OFFSET(x) offsetof(struct hipz_qptemm, x) + +/* MRMWPT Entry Memory Map */ +struct hipz_mrmwmm { + /* 0x00 */ + u64 mrx_hcr; + + u64 mrx_c; + u64 mrx_herr; + u64 mrx_aer; + /* 0x20 */ + u64 mrx_pp; + u64 reserved1; + u64 reserved2; + u64 reserved3; + /* 0x40 */ + u64 reserved4[(0x200 - 0x40) / 8]; + /* 0x200 */ + u64 mrx_ctl[64]; + +}; + +#define MRMWMM_OFFSET(x) offsetof(struct hipz_mrmwmm, x) + +struct hipz_qpedmm { + /* 0x00 */ + u64 reserved0[(0x400) / 8]; + /* 0x400 */ + u64 qpedx_phh; + u64 qpedx_ppsgp; + /* 0x410 */ + u64 qpedx_ppsgu; + u64 qpedx_ppdgp; + /* 0x420 */ + u64 qpedx_ppdgu; + u64 qpedx_aph; + /* 0x430 */ + u64 qpedx_apsgp; + u64 qpedx_apsgu; + /* 0x440 */ + u64 qpedx_apdgp; + u64 qpedx_apdgu; + /* 0x450 */ + u64 qpedx_apav; + u64 qpedx_apsav; + /* 0x460 */ + u64 qpedx_hcr; + u64 reserved1[4]; + /* 0x488 */ + u64 qpedx_rrl0; + /* 0x490 */ + u64 qpedx_rrrkey0; + u64 qpedx_rrva0; + /* 0x4a0 */ + u64 reserved2; + u64 qpedx_rrl1; + /* 0x4b0 */ + u64 qpedx_rrrkey1; + u64 qpedx_rrva1; + /* 0x4c0 */ + u64 reserved3; + u64 qpedx_rrl2; + /* 0x4d0 */ + u64 qpedx_rrrkey2; + u64 qpedx_rrva2; + /* 0x4e0 */ + u64 reserved4; + u64 qpedx_rrl3; + /* 0x4f0 */ + u64 qpedx_rrrkey3; + u64 qpedx_rrva3; +}; + +#define QPEDMM_OFFSET(x) offsetof(struct hipz_qpedmm, x) + +/* CQ Table Entry Memory Map */ +struct hipz_cqtemm { + u64 cqx_hcr; + u64 cqx_c; + u64 cqx_herr; + u64 cqx_aer; +/* 0x20 */ + u64 cqx_ptp; + u64 cqx_tp; + u64 cqx_fec; + u64 cqx_feca; +/* 0x40 */ + u64 cqx_ep; + u64 cqx_eq; +/* 0x50 */ + u64 reserved1; + u64 cqx_n0; +/* 0x60 */ + u64 cqx_n1; + u64 reserved2[(0x1000 - 0x60) / 8]; +/* 0x1000 */ +}; + +#define CQX_FEC_CQE_CNT EHCA_BMASK_IBM(32, 63) +#define CQX_FECADDER EHCA_BMASK_IBM(32, 63) +#define CQX_N0_GENERATE_SOLICITED_COMP_EVENT EHCA_BMASK_IBM(0, 0) +#define CQX_N1_GENERATE_COMP_EVENT EHCA_BMASK_IBM(0, 0) + +#define CQTEMM_OFFSET(x) offsetof(struct hipz_cqtemm, x) + +/* EQ Table Entry Memory Map */ +struct hipz_eqtemm { + u64 eqx_hcr; + u64 eqx_c; + + u64 eqx_herr; + u64 eqx_aer; +/* 0x20 */ + u64 eqx_ptp; + u64 eqx_tp; + u64 eqx_ssba; + u64 eqx_psba; + +/* 0x40 */ + u64 eqx_cec; + u64 eqx_meql; + u64 eqx_xisbi; + u64 eqx_xisc; +/* 0x60 */ + u64 eqx_it; + +}; + +#define EQTEMM_OFFSET(x) offsetof(struct hipz_eqtemm, x) + +/* access control defines for MR/MW */ +#define HIPZ_ACCESSCTRL_L_WRITE 0x00800000 +#define HIPZ_ACCESSCTRL_R_WRITE 0x00400000 +#define HIPZ_ACCESSCTRL_R_READ 0x00200000 +#define HIPZ_ACCESSCTRL_R_ATOMIC 0x00100000 +#define HIPZ_ACCESSCTRL_MW_BIND 0x00080000 + +/* query hca response block */ +struct hipz_query_hca { + u32 cur_reliable_dg; + u32 cur_qp; + u32 cur_cq; + u32 cur_eq; + u32 cur_mr; + u32 cur_mw; + u32 cur_ee_context; + u32 cur_mcast_grp; + u32 cur_qp_attached_mcast_grp; + u32 reserved1; + u32 cur_ipv6_qp; + u32 cur_eth_qp; + u32 cur_hp_mr; + u32 reserved2[3]; + u32 max_rd_domain; + u32 max_qp; + u32 max_cq; + u32 max_eq; + u32 max_mr; + u32 max_hp_mr; + u32 max_mw; + u32 max_mrwpte; + u32 max_special_mrwpte; + u32 max_rd_ee_context; + u32 max_mcast_grp; + u32 max_total_mcast_qp_attach; + u32 max_mcast_qp_attach; + u32 max_raw_ipv6_qp; + u32 max_raw_ethy_qp; + u32 internal_clock_frequency; + u32 max_pd; + u32 max_ah; + u32 max_cqe; + u32 max_wqes_wq; + u32 max_partitions; + u32 max_rr_ee_context; + u32 max_rr_qp; + u32 max_rr_hca; + u32 max_act_wqs_ee_context; + u32 max_act_wqs_qp; + u32 max_sge; + u32 max_sge_rd; + u32 memory_page_size_supported; + u64 max_mr_size; + u32 local_ca_ack_delay; + u32 num_ports; + u32 vendor_id; + u32 vendor_part_id; + u32 hw_ver; + u64 node_guid; + u64 hca_cap_indicators; + u32 data_counter_register_size; + u32 max_shared_rq; + u32 max_isns_eq; + u32 max_neq; +} __attribute__ ((packed)); + +#define HCA_CAP_AH_PORT_NR_CHECK EHCA_BMASK_IBM( 0, 0) +#define HCA_CAP_ATOMIC EHCA_BMASK_IBM( 1, 1) +#define HCA_CAP_AUTO_PATH_MIG EHCA_BMASK_IBM( 2, 2) +#define HCA_CAP_BAD_P_KEY_CTR EHCA_BMASK_IBM( 3, 3) +#define HCA_CAP_SQD_RTS_PORT_CHANGE EHCA_BMASK_IBM( 4, 4) +#define HCA_CAP_CUR_QP_STATE_MOD EHCA_BMASK_IBM( 5, 5) +#define HCA_CAP_INIT_TYPE EHCA_BMASK_IBM( 6, 6) +#define HCA_CAP_PORT_ACTIVE_EVENT EHCA_BMASK_IBM( 7, 7) +#define HCA_CAP_Q_KEY_VIOL_CTR EHCA_BMASK_IBM( 8, 8) +#define HCA_CAP_WQE_RESIZE EHCA_BMASK_IBM( 9, 9) +#define HCA_CAP_RAW_PACKET_MCAST EHCA_BMASK_IBM(10, 10) +#define HCA_CAP_SHUTDOWN_PORT EHCA_BMASK_IBM(11, 11) +#define HCA_CAP_RC_LL_QP EHCA_BMASK_IBM(12, 12) +#define HCA_CAP_SRQ EHCA_BMASK_IBM(13, 13) +#define HCA_CAP_UD_LL_QP EHCA_BMASK_IBM(16, 16) +#define HCA_CAP_RESIZE_MR EHCA_BMASK_IBM(17, 17) +#define HCA_CAP_MINI_QP EHCA_BMASK_IBM(18, 18) +#define HCA_CAP_H_ALLOC_RES_SYNC EHCA_BMASK_IBM(19, 19) + +/* query port response block */ +struct hipz_query_port { + u32 state; + u32 bad_pkey_cntr; + u32 lmc; + u32 lid; + u32 subnet_timeout; + u32 qkey_viol_cntr; + u32 sm_sl; + u32 sm_lid; + u32 capability_mask; + u32 init_type_reply; + u32 pkey_tbl_len; + u32 gid_tbl_len; + u64 gid_prefix; + u32 port_nr; + u16 pkey_entries[16]; + u8 reserved1[32]; + u32 trent_size; + u32 trbuf_size; + u64 max_msg_sz; + u32 max_mtu; + u32 vl_cap; + u32 phys_pstate; + u32 phys_state; + u32 phys_speed; + u32 phys_width; + u8 reserved2[1884]; + u64 guid_entries[255]; +} __attribute__ ((packed)); + +#endif diff --git a/drivers/infiniband/hw/ehca/ipz_pt_fn.c b/drivers/infiniband/hw/ehca/ipz_pt_fn.c new file mode 100644 index 0000000..8d59451 --- /dev/null +++ b/drivers/infiniband/hw/ehca/ipz_pt_fn.c @@ -0,0 +1,295 @@ +/* + * IBM eServer eHCA Infiniband device driver for Linux on POWER + * + * internal queue handling + * + * Authors: Waleri Fomin + * Reinhard Ernst + * Christoph Raisch + * + * Copyright (c) 2005 IBM Corporation + * + * This source code is distributed under a dual license of GPL v2.0 and OpenIB + * BSD. + * + * OpenIB BSD License + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include + +#include "ehca_tools.h" +#include "ipz_pt_fn.h" +#include "ehca_classes.h" + +#define PAGES_PER_KPAGE (PAGE_SIZE >> EHCA_PAGESHIFT) + +struct kmem_cache *small_qp_cache; + +void *ipz_qpageit_get_inc(struct ipz_queue *queue) +{ + void *ret = ipz_qeit_get(queue); + queue->current_q_offset += queue->pagesize; + if (queue->current_q_offset > queue->queue_length) { + queue->current_q_offset -= queue->pagesize; + ret = NULL; + } + if (((u64)ret) % queue->pagesize) { + ehca_gen_err("ERROR!! not at PAGE-Boundary"); + return NULL; + } + return ret; +} + +void *ipz_qeit_eq_get_inc(struct ipz_queue *queue) +{ + void *ret = ipz_qeit_get(queue); + u64 last_entry_in_q = queue->queue_length - queue->qe_size; + + queue->current_q_offset += queue->qe_size; + if (queue->current_q_offset > last_entry_in_q) { + queue->current_q_offset = 0; + queue->toggle_state = (~queue->toggle_state) & 1; + } + + return ret; +} + +int ipz_queue_abs_to_offset(struct ipz_queue *queue, u64 addr, u64 *q_offset) +{ + int i; + for (i = 0; i < queue->queue_length / queue->pagesize; i++) { + u64 page = __pa(queue->queue_pages[i]); + if (addr >= page && addr < page + queue->pagesize) { + *q_offset = addr - page + i * queue->pagesize; + return 0; + } + } + return -EINVAL; +} + +#if PAGE_SHIFT < EHCA_PAGESHIFT +#error Kernel pages must be at least as large than eHCA pages (4K) ! +#endif + +/* + * allocate pages for queue: + * outer loop allocates whole kernel pages (page aligned) and + * inner loop divides a kernel page into smaller hca queue pages + */ +static int alloc_queue_pages(struct ipz_queue *queue, const u32 nr_of_pages) +{ + int k, f = 0; + u8 *kpage; + + while (f < nr_of_pages) { + kpage = (u8 *)get_zeroed_page(GFP_KERNEL); + if (!kpage) + goto out; + + for (k = 0; k < PAGES_PER_KPAGE && f < nr_of_pages; k++) { + queue->queue_pages[f] = (struct ipz_page *)kpage; + kpage += EHCA_PAGESIZE; + f++; + } + } + return 1; + +out: + for (f = 0; f < nr_of_pages && queue->queue_pages[f]; + f += PAGES_PER_KPAGE) + free_page((unsigned long)(queue->queue_pages)[f]); + return 0; +} + +static int alloc_small_queue_page(struct ipz_queue *queue, struct ehca_pd *pd) +{ + int order = ilog2(queue->pagesize) - 9; + struct ipz_small_queue_page *page; + unsigned long bit; + + mutex_lock(&pd->lock); + + if (!list_empty(&pd->free[order])) + page = list_entry(pd->free[order].next, + struct ipz_small_queue_page, list); + else { + page = kmem_cache_zalloc(small_qp_cache, GFP_KERNEL); + if (!page) + goto out; + + page->page = get_zeroed_page(GFP_KERNEL); + if (!page->page) { + kmem_cache_free(small_qp_cache, page); + goto out; + } + + list_add(&page->list, &pd->free[order]); + } + + bit = find_first_zero_bit(page->bitmap, IPZ_SPAGE_PER_KPAGE >> order); + __set_bit(bit, page->bitmap); + page->fill++; + + if (page->fill == IPZ_SPAGE_PER_KPAGE >> order) + list_move(&page->list, &pd->full[order]); + + mutex_unlock(&pd->lock); + + queue->queue_pages[0] = (void *)(page->page | (bit << (order + 9))); + queue->small_page = page; + queue->offset = bit << (order + 9); + return 1; + +out: + ehca_err(pd->ib_pd.device, "failed to allocate small queue page"); + mutex_unlock(&pd->lock); + return 0; +} + +static void free_small_queue_page(struct ipz_queue *queue, struct ehca_pd *pd) +{ + int order = ilog2(queue->pagesize) - 9; + struct ipz_small_queue_page *page = queue->small_page; + unsigned long bit; + int free_page = 0; + + bit = ((unsigned long)queue->queue_pages[0] & ~PAGE_MASK) + >> (order + 9); + + mutex_lock(&pd->lock); + + __clear_bit(bit, page->bitmap); + page->fill--; + + if (page->fill == 0) { + list_del(&page->list); + free_page = 1; + } + + if (page->fill == (IPZ_SPAGE_PER_KPAGE >> order) - 1) + /* the page was full until we freed the chunk */ + list_move_tail(&page->list, &pd->free[order]); + + mutex_unlock(&pd->lock); + + if (free_page) { + free_page(page->page); + kmem_cache_free(small_qp_cache, page); + } +} + +int ipz_queue_ctor(struct ehca_pd *pd, struct ipz_queue *queue, + const u32 nr_of_pages, const u32 pagesize, + const u32 qe_size, const u32 nr_of_sg, + int is_small) +{ + if (pagesize > PAGE_SIZE) { + ehca_gen_err("FATAL ERROR: pagesize=%x " + "is greater than kernel page size", pagesize); + return 0; + } + + /* init queue fields */ + queue->queue_length = nr_of_pages * pagesize; + queue->pagesize = pagesize; + queue->qe_size = qe_size; + queue->act_nr_of_sg = nr_of_sg; + queue->current_q_offset = 0; + queue->toggle_state = 1; + queue->small_page = NULL; + + /* allocate queue page pointers */ + queue->queue_pages = kzalloc(nr_of_pages * sizeof(void *), + GFP_KERNEL | __GFP_NOWARN); + if (!queue->queue_pages) { + queue->queue_pages = vzalloc(nr_of_pages * sizeof(void *)); + if (!queue->queue_pages) { + ehca_gen_err("Couldn't allocate queue page list"); + return 0; + } + } + + /* allocate actual queue pages */ + if (is_small) { + if (!alloc_small_queue_page(queue, pd)) + goto ipz_queue_ctor_exit0; + } else + if (!alloc_queue_pages(queue, nr_of_pages)) + goto ipz_queue_ctor_exit0; + + return 1; + +ipz_queue_ctor_exit0: + ehca_gen_err("Couldn't alloc pages queue=%p " + "nr_of_pages=%x", queue, nr_of_pages); + if (is_vmalloc_addr(queue->queue_pages)) + vfree(queue->queue_pages); + else + kfree(queue->queue_pages); + + return 0; +} + +int ipz_queue_dtor(struct ehca_pd *pd, struct ipz_queue *queue) +{ + int i, nr_pages; + + if (!queue || !queue->queue_pages) { + ehca_gen_dbg("queue or queue_pages is NULL"); + return 0; + } + + if (queue->small_page) + free_small_queue_page(queue, pd); + else { + nr_pages = queue->queue_length / queue->pagesize; + for (i = 0; i < nr_pages; i += PAGES_PER_KPAGE) + free_page((unsigned long)queue->queue_pages[i]); + } + + if (is_vmalloc_addr(queue->queue_pages)) + vfree(queue->queue_pages); + else + kfree(queue->queue_pages); + + return 1; +} + +int ehca_init_small_qp_cache(void) +{ + small_qp_cache = kmem_cache_create("ehca_cache_small_qp", + sizeof(struct ipz_small_queue_page), + 0, SLAB_HWCACHE_ALIGN, NULL); + if (!small_qp_cache) + return -ENOMEM; + + return 0; +} + +void ehca_cleanup_small_qp_cache(void) +{ + kmem_cache_destroy(small_qp_cache); +} diff --git a/drivers/infiniband/hw/ehca/ipz_pt_fn.h b/drivers/infiniband/hw/ehca/ipz_pt_fn.h new file mode 100644 index 0000000..a801274 --- /dev/null +++ b/drivers/infiniband/hw/ehca/ipz_pt_fn.h @@ -0,0 +1,289 @@ +/* + * IBM eServer eHCA Infiniband device driver for Linux on POWER + * + * internal queue handling + * + * Authors: Waleri Fomin + * Reinhard Ernst + * Christoph Raisch + * + * Copyright (c) 2005 IBM Corporation + * + * All rights reserved. + * + * This source code is distributed under a dual license of GPL v2.0 and OpenIB + * BSD. + * + * OpenIB BSD License + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __IPZ_PT_FN_H__ +#define __IPZ_PT_FN_H__ + +#define EHCA_PAGESHIFT 12 +#define EHCA_PAGESIZE 4096UL +#define EHCA_PAGEMASK (~(EHCA_PAGESIZE-1)) +#define EHCA_PT_ENTRIES 512UL + +#include "ehca_tools.h" +#include "ehca_qes.h" + +struct ehca_pd; +struct ipz_small_queue_page; + +extern struct kmem_cache *small_qp_cache; + +/* struct generic ehca page */ +struct ipz_page { + u8 entries[EHCA_PAGESIZE]; +}; + +#define IPZ_SPAGE_PER_KPAGE (PAGE_SIZE / 512) + +struct ipz_small_queue_page { + unsigned long page; + unsigned long bitmap[IPZ_SPAGE_PER_KPAGE / BITS_PER_LONG]; + int fill; + void *mapped_addr; + u32 mmap_count; + struct list_head list; +}; + +/* struct generic queue in linux kernel virtual memory (kv) */ +struct ipz_queue { + u64 current_q_offset; /* current queue entry */ + + struct ipz_page **queue_pages; /* array of pages belonging to queue */ + u32 qe_size; /* queue entry size */ + u32 act_nr_of_sg; + u32 queue_length; /* queue length allocated in bytes */ + u32 pagesize; + u32 toggle_state; /* toggle flag - per page */ + u32 offset; /* save offset within page for small_qp */ + struct ipz_small_queue_page *small_page; +}; + +/* + * return current Queue Entry for a certain q_offset + * returns address (kv) of Queue Entry + */ +static inline void *ipz_qeit_calc(struct ipz_queue *queue, u64 q_offset) +{ + struct ipz_page *current_page; + if (q_offset >= queue->queue_length) + return NULL; + current_page = (queue->queue_pages)[q_offset >> EHCA_PAGESHIFT]; + return ¤t_page->entries[q_offset & (EHCA_PAGESIZE - 1)]; +} + +/* + * return current Queue Entry + * returns address (kv) of Queue Entry + */ +static inline void *ipz_qeit_get(struct ipz_queue *queue) +{ + return ipz_qeit_calc(queue, queue->current_q_offset); +} + +/* + * return current Queue Page , increment Queue Page iterator from + * page to page in struct ipz_queue, last increment will return 0! and + * NOT wrap + * returns address (kv) of Queue Page + * warning don't use in parallel with ipz_QE_get_inc() + */ +void *ipz_qpageit_get_inc(struct ipz_queue *queue); + +/* + * return current Queue Entry, increment Queue Entry iterator by one + * step in struct ipz_queue, will wrap in ringbuffer + * returns address (kv) of Queue Entry BEFORE increment + * warning don't use in parallel with ipz_qpageit_get_inc() + */ +static inline void *ipz_qeit_get_inc(struct ipz_queue *queue) +{ + void *ret = ipz_qeit_get(queue); + queue->current_q_offset += queue->qe_size; + if (queue->current_q_offset >= queue->queue_length) { + queue->current_q_offset = 0; + /* toggle the valid flag */ + queue->toggle_state = (~queue->toggle_state) & 1; + } + + return ret; +} + +/* + * return a bool indicating whether current Queue Entry is valid + */ +static inline int ipz_qeit_is_valid(struct ipz_queue *queue) +{ + struct ehca_cqe *cqe = ipz_qeit_get(queue); + return ((cqe->cqe_flags >> 7) == (queue->toggle_state & 1)); +} + +/* + * return current Queue Entry, increment Queue Entry iterator by one + * step in struct ipz_queue, will wrap in ringbuffer + * returns address (kv) of Queue Entry BEFORE increment + * returns 0 and does not increment, if wrong valid state + * warning don't use in parallel with ipz_qpageit_get_inc() + */ +static inline void *ipz_qeit_get_inc_valid(struct ipz_queue *queue) +{ + return ipz_qeit_is_valid(queue) ? ipz_qeit_get_inc(queue) : NULL; +} + +/* + * returns and resets Queue Entry iterator + * returns address (kv) of first Queue Entry + */ +static inline void *ipz_qeit_reset(struct ipz_queue *queue) +{ + queue->current_q_offset = 0; + return ipz_qeit_get(queue); +} + +/* + * return the q_offset corresponding to an absolute address + */ +int ipz_queue_abs_to_offset(struct ipz_queue *queue, u64 addr, u64 *q_offset); + +/* + * return the next queue offset. don't modify the queue. + */ +static inline u64 ipz_queue_advance_offset(struct ipz_queue *queue, u64 offset) +{ + offset += queue->qe_size; + if (offset >= queue->queue_length) offset = 0; + return offset; +} + +/* struct generic page table */ +struct ipz_pt { + u64 entries[EHCA_PT_ENTRIES]; +}; + +/* struct page table for a queue, only to be used in pf */ +struct ipz_qpt { + /* queue page tables (kv), use u64 because we know the element length */ + u64 *qpts; + u32 n_qpts; + u32 n_ptes; /* number of page table entries */ + u64 *current_pte_addr; +}; + +/* + * constructor for a ipz_queue_t, placement new for ipz_queue_t, + * new for all dependent datastructors + * all QP Tables are the same + * flow: + * allocate+pin queue + * see ipz_qpt_ctor() + * returns true if ok, false if out of memory + */ +int ipz_queue_ctor(struct ehca_pd *pd, struct ipz_queue *queue, + const u32 nr_of_pages, const u32 pagesize, + const u32 qe_size, const u32 nr_of_sg, + int is_small); + +/* + * destructor for a ipz_queue_t + * -# free queue + * see ipz_queue_ctor() + * returns true if ok, false if queue was NULL-ptr of free failed + */ +int ipz_queue_dtor(struct ehca_pd *pd, struct ipz_queue *queue); + +/* + * constructor for a ipz_qpt_t, + * placement new for struct ipz_queue, new for all dependent datastructors + * all QP Tables are the same, + * flow: + * -# allocate+pin queue + * -# initialise ptcb + * -# allocate+pin PTs + * -# link PTs to a ring, according to HCA Arch, set bit62 id needed + * -# the ring must have room for exactly nr_of_PTEs + * see ipz_qpt_ctor() + */ +void ipz_qpt_ctor(struct ipz_qpt *qpt, + const u32 nr_of_qes, + const u32 pagesize, + const u32 qe_size, + const u8 lowbyte, const u8 toggle, + u32 * act_nr_of_QEs, u32 * act_nr_of_pages); + +/* + * return current Queue Entry, increment Queue Entry iterator by one + * step in struct ipz_queue, will wrap in ringbuffer + * returns address (kv) of Queue Entry BEFORE increment + * warning don't use in parallel with ipz_qpageit_get_inc() + * warning unpredictable results may occur if steps>act_nr_of_queue_entries + * fix EQ page problems + */ +void *ipz_qeit_eq_get_inc(struct ipz_queue *queue); + +/* + * return current Event Queue Entry, increment Queue Entry iterator + * by one step in struct ipz_queue if valid, will wrap in ringbuffer + * returns address (kv) of Queue Entry BEFORE increment + * returns 0 and does not increment, if wrong valid state + * warning don't use in parallel with ipz_queue_QPageit_get_inc() + * warning unpredictable results may occur if steps>act_nr_of_queue_entries + */ +static inline void *ipz_eqit_eq_get_inc_valid(struct ipz_queue *queue) +{ + void *ret = ipz_qeit_get(queue); + u32 qe = *(u8 *)ret; + if ((qe >> 7) != (queue->toggle_state & 1)) + return NULL; + ipz_qeit_eq_get_inc(queue); /* this is a good one */ + return ret; +} + +static inline void *ipz_eqit_eq_peek_valid(struct ipz_queue *queue) +{ + void *ret = ipz_qeit_get(queue); + u32 qe = *(u8 *)ret; + if ((qe >> 7) != (queue->toggle_state & 1)) + return NULL; + return ret; +} + +/* returns address (GX) of first queue entry */ +static inline u64 ipz_qpt_get_firstpage(struct ipz_qpt *qpt) +{ + return be64_to_cpu(qpt->qpts[0]); +} + +/* returns address (kv) of first page of queue page table */ +static inline void *ipz_qpt_get_qpt(struct ipz_qpt *qpt) +{ + return qpt->qpts; +} + +#endif /* __IPZ_PT_FN_H__ */ diff --git a/drivers/infiniband/hw/ipath/Kconfig b/drivers/infiniband/hw/ipath/Kconfig new file mode 100644 index 0000000..1d9bb11 --- /dev/null +++ b/drivers/infiniband/hw/ipath/Kconfig @@ -0,0 +1,11 @@ +config INFINIBAND_IPATH + tristate "QLogic HTX HCA support" + depends on 64BIT && NET && HT_IRQ + ---help--- + This is a driver for the obsolete QLogic Hyper-Transport + IB host channel adapter (model QHT7140), + including InfiniBand verbs support. This driver allows these + devices to be used with both kernel upper level protocols such + as IP-over-InfiniBand as well as with userspace applications + (in conjunction with InfiniBand userspace access). + For QLogic PCIe QLE based cards, use the QIB driver instead. diff --git a/drivers/infiniband/hw/ipath/Makefile b/drivers/infiniband/hw/ipath/Makefile new file mode 100644 index 0000000..4496f28 --- /dev/null +++ b/drivers/infiniband/hw/ipath/Makefile @@ -0,0 +1,37 @@ +ccflags-y := -DIPATH_IDSTR='"QLogic kernel.org driver"' \ + -DIPATH_KERN_TYPE=0 + +obj-$(CONFIG_INFINIBAND_IPATH) += ib_ipath.o + +ib_ipath-y := \ + ipath_cq.o \ + ipath_diag.o \ + ipath_dma.o \ + ipath_driver.o \ + ipath_eeprom.o \ + ipath_file_ops.o \ + ipath_fs.o \ + ipath_init_chip.o \ + ipath_intr.o \ + ipath_keys.o \ + ipath_mad.o \ + ipath_mmap.o \ + ipath_mr.o \ + ipath_qp.o \ + ipath_rc.o \ + ipath_ruc.o \ + ipath_sdma.o \ + ipath_srq.o \ + ipath_stats.o \ + ipath_sysfs.o \ + ipath_uc.o \ + ipath_ud.o \ + ipath_user_pages.o \ + ipath_user_sdma.o \ + ipath_verbs_mcast.o \ + ipath_verbs.o + +ib_ipath-$(CONFIG_HT_IRQ) += ipath_iba6110.o + +ib_ipath-$(CONFIG_X86_64) += ipath_wc_x86_64.o +ib_ipath-$(CONFIG_PPC64) += ipath_wc_ppc64.o diff --git a/drivers/infiniband/hw/ipath/ipath_common.h b/drivers/infiniband/hw/ipath/ipath_common.h new file mode 100644 index 0000000..28cfe97 --- /dev/null +++ b/drivers/infiniband/hw/ipath/ipath_common.h @@ -0,0 +1,851 @@ +/* + * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved. + * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _IPATH_COMMON_H +#define _IPATH_COMMON_H + +/* + * This file contains defines, structures, etc. that are used + * to communicate between kernel and user code. + */ + + +/* This is the IEEE-assigned OUI for QLogic Inc. InfiniPath */ +#define IPATH_SRC_OUI_1 0x00 +#define IPATH_SRC_OUI_2 0x11 +#define IPATH_SRC_OUI_3 0x75 + +/* version of protocol header (known to chip also). In the long run, + * we should be able to generate and accept a range of version numbers; + * for now we only accept one, and it's compiled in. + */ +#define IPS_PROTO_VERSION 2 + +/* + * These are compile time constants that you may want to enable or disable + * if you are trying to debug problems with code or performance. + * IPATH_VERBOSE_TRACING define as 1 if you want additional tracing in + * fastpath code + * IPATH_TRACE_REGWRITES define as 1 if you want register writes to be + * traced in faspath code + * _IPATH_TRACING define as 0 if you want to remove all tracing in a + * compilation unit + * _IPATH_DEBUGGING define as 0 if you want to remove debug prints + */ + +/* + * The value in the BTH QP field that InfiniPath uses to differentiate + * an infinipath protocol IB packet vs standard IB transport + */ +#define IPATH_KD_QP 0x656b79 + +/* + * valid states passed to ipath_set_linkstate() user call + */ +#define IPATH_IB_LINKDOWN 0 +#define IPATH_IB_LINKARM 1 +#define IPATH_IB_LINKACTIVE 2 +#define IPATH_IB_LINKDOWN_ONLY 3 +#define IPATH_IB_LINKDOWN_SLEEP 4 +#define IPATH_IB_LINKDOWN_DISABLE 5 +#define IPATH_IB_LINK_LOOPBACK 6 /* enable local loopback */ +#define IPATH_IB_LINK_EXTERNAL 7 /* normal, disable local loopback */ +#define IPATH_IB_LINK_NO_HRTBT 8 /* disable Heartbeat, e.g. for loopback */ +#define IPATH_IB_LINK_HRTBT 9 /* enable heartbeat, normal, non-loopback */ + +/* + * These 3 values (SDR and DDR may be ORed for auto-speed + * negotiation) are used for the 3rd argument to path_f_set_ib_cfg + * with cmd IPATH_IB_CFG_SPD_ENB, by direct calls or via sysfs. They + * are also the the possible values for ipath_link_speed_enabled and active + * The values were chosen to match values used within the IB spec. + */ +#define IPATH_IB_SDR 1 +#define IPATH_IB_DDR 2 + +/* + * stats maintained by the driver. For now, at least, this is global + * to all minor devices. + */ +struct infinipath_stats { + /* number of interrupts taken */ + __u64 sps_ints; + /* number of interrupts for errors */ + __u64 sps_errints; + /* number of errors from chip (not incl. packet errors or CRC) */ + __u64 sps_errs; + /* number of packet errors from chip other than CRC */ + __u64 sps_pkterrs; + /* number of packets with CRC errors (ICRC and VCRC) */ + __u64 sps_crcerrs; + /* number of hardware errors reported (parity, etc.) */ + __u64 sps_hwerrs; + /* number of times IB link changed state unexpectedly */ + __u64 sps_iblink; + __u64 sps_unused; /* was fastrcvint, no longer implemented */ + /* number of kernel (port0) packets received */ + __u64 sps_port0pkts; + /* number of "ethernet" packets sent by driver */ + __u64 sps_ether_spkts; + /* number of "ethernet" packets received by driver */ + __u64 sps_ether_rpkts; + /* number of SMA packets sent by driver. Obsolete. */ + __u64 sps_sma_spkts; + /* number of SMA packets received by driver. Obsolete. */ + __u64 sps_sma_rpkts; + /* number of times all ports rcvhdrq was full and packet dropped */ + __u64 sps_hdrqfull; + /* number of times all ports egrtid was full and packet dropped */ + __u64 sps_etidfull; + /* + * number of times we tried to send from driver, but no pio buffers + * avail + */ + __u64 sps_nopiobufs; + /* number of ports currently open */ + __u64 sps_ports; + /* list of pkeys (other than default) accepted (0 means not set) */ + __u16 sps_pkeys[4]; + __u16 sps_unused16[4]; /* available; maintaining compatible layout */ + /* number of user ports per chip (not IB ports) */ + __u32 sps_nports; + /* not our interrupt, or already handled */ + __u32 sps_nullintr; + /* max number of packets handled per receive call */ + __u32 sps_maxpkts_call; + /* avg number of packets handled per receive call */ + __u32 sps_avgpkts_call; + /* total number of pages locked */ + __u64 sps_pagelocks; + /* total number of pages unlocked */ + __u64 sps_pageunlocks; + /* + * Number of packets dropped in kernel other than errors (ether + * packets if ipath not configured, etc.) + */ + __u64 sps_krdrops; + __u64 sps_txeparity; /* PIO buffer parity error, recovered */ + /* pad for future growth */ + __u64 __sps_pad[45]; +}; + +/* + * These are the status bits readable (in ascii form, 64bit value) + * from the "status" sysfs file. + */ +#define IPATH_STATUS_INITTED 0x1 /* basic initialization done */ +#define IPATH_STATUS_DISABLED 0x2 /* hardware disabled */ +/* Device has been disabled via admin request */ +#define IPATH_STATUS_ADMIN_DISABLED 0x4 +/* Chip has been found and initted */ +#define IPATH_STATUS_CHIP_PRESENT 0x20 +/* IB link is at ACTIVE, usable for data traffic */ +#define IPATH_STATUS_IB_READY 0x40 +/* link is configured, LID, MTU, etc. have been set */ +#define IPATH_STATUS_IB_CONF 0x80 +/* no link established, probably no cable */ +#define IPATH_STATUS_IB_NOCABLE 0x100 +/* A Fatal hardware error has occurred. */ +#define IPATH_STATUS_HWERROR 0x200 + +/* + * The list of usermode accessible registers. Also see Reg_* later in file. + */ +typedef enum _ipath_ureg { + /* (RO) DMA RcvHdr to be used next. */ + ur_rcvhdrtail = 0, + /* (RW) RcvHdr entry to be processed next by host. */ + ur_rcvhdrhead = 1, + /* (RO) Index of next Eager index to use. */ + ur_rcvegrindextail = 2, + /* (RW) Eager TID to be processed next */ + ur_rcvegrindexhead = 3, + /* For internal use only; max register number. */ + _IPATH_UregMax +} ipath_ureg; + +/* bit values for spi_runtime_flags */ +#define IPATH_RUNTIME_HT 0x1 +#define IPATH_RUNTIME_PCIE 0x2 +#define IPATH_RUNTIME_FORCE_WC_ORDER 0x4 +#define IPATH_RUNTIME_RCVHDR_COPY 0x8 +#define IPATH_RUNTIME_MASTER 0x10 +#define IPATH_RUNTIME_NODMA_RTAIL 0x80 +#define IPATH_RUNTIME_SDMA 0x200 +#define IPATH_RUNTIME_FORCE_PIOAVAIL 0x400 +#define IPATH_RUNTIME_PIO_REGSWAPPED 0x800 + +/* + * This structure is returned by ipath_userinit() immediately after + * open to get implementation-specific info, and info specific to this + * instance. + * + * This struct must have explict pad fields where type sizes + * may result in different alignments between 32 and 64 bit + * programs, since the 64 bit * bit kernel requires the user code + * to have matching offsets + */ +struct ipath_base_info { + /* version of hardware, for feature checking. */ + __u32 spi_hw_version; + /* version of software, for feature checking. */ + __u32 spi_sw_version; + /* InfiniPath port assigned, goes into sent packets */ + __u16 spi_port; + __u16 spi_subport; + /* + * IB MTU, packets IB data must be less than this. + * The MTU is in bytes, and will be a multiple of 4 bytes. + */ + __u32 spi_mtu; + /* + * Size of a PIO buffer. Any given packet's total size must be less + * than this (in words). Included is the starting control word, so + * if 513 is returned, then total pkt size is 512 words or less. + */ + __u32 spi_piosize; + /* size of the TID cache in infinipath, in entries */ + __u32 spi_tidcnt; + /* size of the TID Eager list in infinipath, in entries */ + __u32 spi_tidegrcnt; + /* size of a single receive header queue entry in words. */ + __u32 spi_rcvhdrent_size; + /* + * Count of receive header queue entries allocated. + * This may be less than the spu_rcvhdrcnt passed in!. + */ + __u32 spi_rcvhdr_cnt; + + /* per-chip and other runtime features bitmap (IPATH_RUNTIME_*) */ + __u32 spi_runtime_flags; + + /* address where receive buffer queue is mapped into */ + __u64 spi_rcvhdr_base; + + /* user program. */ + + /* base address of eager TID receive buffers. */ + __u64 spi_rcv_egrbufs; + + /* Allocated by initialization code, not by protocol. */ + + /* + * Size of each TID buffer in host memory, starting at + * spi_rcv_egrbufs. The buffers are virtually contiguous. + */ + __u32 spi_rcv_egrbufsize; + /* + * The special QP (queue pair) value that identifies an infinipath + * protocol packet from standard IB packets. More, probably much + * more, to be added. + */ + __u32 spi_qpair; + + /* + * User register base for init code, not to be used directly by + * protocol or applications. + */ + __u64 __spi_uregbase; + /* + * Maximum buffer size in bytes that can be used in a single TID + * entry (assuming the buffer is aligned to this boundary). This is + * the minimum of what the hardware and software support Guaranteed + * to be a power of 2. + */ + __u32 spi_tid_maxsize; + /* + * alignment of each pio send buffer (byte count + * to add to spi_piobufbase to get to second buffer) + */ + __u32 spi_pioalign; + /* + * The index of the first pio buffer available to this process; + * needed to do lookup in spi_pioavailaddr; not added to + * spi_piobufbase. + */ + __u32 spi_pioindex; + /* number of buffers mapped for this process */ + __u32 spi_piocnt; + + /* + * Base address of writeonly pio buffers for this process. + * Each buffer has spi_piosize words, and is aligned on spi_pioalign + * boundaries. spi_piocnt buffers are mapped from this address + */ + __u64 spi_piobufbase; + + /* + * Base address of readonly memory copy of the pioavail registers. + * There are 2 bits for each buffer. + */ + __u64 spi_pioavailaddr; + + /* + * Address where driver updates a copy of the interface and driver + * status (IPATH_STATUS_*) as a 64 bit value. It's followed by a + * string indicating hardware error, if there was one. + */ + __u64 spi_status; + + /* number of chip ports available to user processes */ + __u32 spi_nports; + /* unit number of chip we are using */ + __u32 spi_unit; + /* num bufs in each contiguous set */ + __u32 spi_rcv_egrperchunk; + /* size in bytes of each contiguous set */ + __u32 spi_rcv_egrchunksize; + /* total size of mmap to cover full rcvegrbuffers */ + __u32 spi_rcv_egrbuftotlen; + __u32 spi_filler_for_align; + /* address of readonly memory copy of the rcvhdrq tail register. */ + __u64 spi_rcvhdr_tailaddr; + + /* shared memory pages for subports if port is shared */ + __u64 spi_subport_uregbase; + __u64 spi_subport_rcvegrbuf; + __u64 spi_subport_rcvhdr_base; + + /* shared memory page for hardware port if it is shared */ + __u64 spi_port_uregbase; + __u64 spi_port_rcvegrbuf; + __u64 spi_port_rcvhdr_base; + __u64 spi_port_rcvhdr_tailaddr; + +} __attribute__ ((aligned(8))); + + +/* + * This version number is given to the driver by the user code during + * initialization in the spu_userversion field of ipath_user_info, so + * the driver can check for compatibility with user code. + * + * The major version changes when data structures + * change in an incompatible way. The driver must be the same or higher + * for initialization to succeed. In some cases, a higher version + * driver will not interoperate with older software, and initialization + * will return an error. + */ +#define IPATH_USER_SWMAJOR 1 + +/* + * Minor version differences are always compatible + * a within a major version, however if user software is larger + * than driver software, some new features and/or structure fields + * may not be implemented; the user code must deal with this if it + * cares, or it must abort after initialization reports the difference. + */ +#define IPATH_USER_SWMINOR 6 + +#define IPATH_USER_SWVERSION ((IPATH_USER_SWMAJOR<<16) | IPATH_USER_SWMINOR) + +#define IPATH_KERN_TYPE 0 + +/* + * Similarly, this is the kernel version going back to the user. It's + * slightly different, in that we want to tell if the driver was built as + * part of a QLogic release, or from the driver from openfabrics.org, + * kernel.org, or a standard distribution, for support reasons. + * The high bit is 0 for non-QLogic and 1 for QLogic-built/supplied. + * + * It's returned by the driver to the user code during initialization in the + * spi_sw_version field of ipath_base_info, so the user code can in turn + * check for compatibility with the kernel. +*/ +#define IPATH_KERN_SWVERSION ((IPATH_KERN_TYPE<<31) | IPATH_USER_SWVERSION) + +/* + * This structure is passed to ipath_userinit() to tell the driver where + * user code buffers are, sizes, etc. The offsets and sizes of the + * fields must remain unchanged, for binary compatibility. It can + * be extended, if userversion is changed so user code can tell, if needed + */ +struct ipath_user_info { + /* + * version of user software, to detect compatibility issues. + * Should be set to IPATH_USER_SWVERSION. + */ + __u32 spu_userversion; + + /* desired number of receive header queue entries */ + __u32 spu_rcvhdrcnt; + + /* size of struct base_info to write to */ + __u32 spu_base_info_size; + + /* + * number of words in KD protocol header + * This tells InfiniPath how many words to copy to rcvhdrq. If 0, + * kernel uses a default. Once set, attempts to set any other value + * are an error (EAGAIN) until driver is reloaded. + */ + __u32 spu_rcvhdrsize; + + /* + * If two or more processes wish to share a port, each process + * must set the spu_subport_cnt and spu_subport_id to the same + * values. The only restriction on the spu_subport_id is that + * it be unique for a given node. + */ + __u16 spu_subport_cnt; + __u16 spu_subport_id; + + __u32 spu_unused; /* kept for compatible layout */ + + /* + * address of struct base_info to write to + */ + __u64 spu_base_info; + +} __attribute__ ((aligned(8))); + +/* User commands. */ + +#define IPATH_CMD_MIN 16 + +#define __IPATH_CMD_USER_INIT 16 /* old set up userspace (for old user code) */ +#define IPATH_CMD_PORT_INFO 17 /* find out what resources we got */ +#define IPATH_CMD_RECV_CTRL 18 /* control receipt of packets */ +#define IPATH_CMD_TID_UPDATE 19 /* update expected TID entries */ +#define IPATH_CMD_TID_FREE 20 /* free expected TID entries */ +#define IPATH_CMD_SET_PART_KEY 21 /* add partition key */ +#define __IPATH_CMD_SLAVE_INFO 22 /* return info on slave processes (for old user code) */ +#define IPATH_CMD_ASSIGN_PORT 23 /* allocate HCA and port */ +#define IPATH_CMD_USER_INIT 24 /* set up userspace */ +#define IPATH_CMD_UNUSED_1 25 +#define IPATH_CMD_UNUSED_2 26 +#define IPATH_CMD_PIOAVAILUPD 27 /* force an update of PIOAvail reg */ +#define IPATH_CMD_POLL_TYPE 28 /* set the kind of polling we want */ +#define IPATH_CMD_ARMLAUNCH_CTRL 29 /* armlaunch detection control */ +/* 30 is unused */ +#define IPATH_CMD_SDMA_INFLIGHT 31 /* sdma inflight counter request */ +#define IPATH_CMD_SDMA_COMPLETE 32 /* sdma completion counter request */ + +/* + * Poll types + */ +#define IPATH_POLL_TYPE_URGENT 0x01 +#define IPATH_POLL_TYPE_OVERFLOW 0x02 + +struct ipath_port_info { + __u32 num_active; /* number of active units */ + __u32 unit; /* unit (chip) assigned to caller */ + __u16 port; /* port on unit assigned to caller */ + __u16 subport; /* subport on unit assigned to caller */ + __u16 num_ports; /* number of ports available on unit */ + __u16 num_subports; /* number of subports opened on port */ +}; + +struct ipath_tid_info { + __u32 tidcnt; + /* make structure same size in 32 and 64 bit */ + __u32 tid__unused; + /* virtual address of first page in transfer */ + __u64 tidvaddr; + /* pointer (same size 32/64 bit) to __u16 tid array */ + __u64 tidlist; + + /* + * pointer (same size 32/64 bit) to bitmap of TIDs used + * for this call; checked for being large enough at open + */ + __u64 tidmap; +}; + +struct ipath_cmd { + __u32 type; /* command type */ + union { + struct ipath_tid_info tid_info; + struct ipath_user_info user_info; + + /* + * address in userspace where we should put the sdma + * inflight counter + */ + __u64 sdma_inflight; + /* + * address in userspace where we should put the sdma + * completion counter + */ + __u64 sdma_complete; + /* address in userspace of struct ipath_port_info to + write result to */ + __u64 port_info; + /* enable/disable receipt of packets */ + __u32 recv_ctrl; + /* enable/disable armlaunch errors (non-zero to enable) */ + __u32 armlaunch_ctrl; + /* partition key to set */ + __u16 part_key; + /* user address of __u32 bitmask of active slaves */ + __u64 slave_mask_addr; + /* type of polling we want */ + __u16 poll_type; + } cmd; +}; + +struct ipath_iovec { + /* Pointer to data, but same size 32 and 64 bit */ + __u64 iov_base; + + /* + * Length of data; don't need 64 bits, but want + * ipath_sendpkt to remain same size as before 32 bit changes, so... + */ + __u64 iov_len; +}; + +/* + * Describes a single packet for send. Each packet can have one or more + * buffers, but the total length (exclusive of IB headers) must be less + * than the MTU, and if using the PIO method, entire packet length, + * including IB headers, must be less than the ipath_piosize value (words). + * Use of this necessitates including sys/uio.h + */ +struct __ipath_sendpkt { + __u32 sps_flags; /* flags for packet (TBD) */ + __u32 sps_cnt; /* number of entries to use in sps_iov */ + /* array of iov's describing packet. TEMPORARY */ + struct ipath_iovec sps_iov[4]; +}; + +/* + * diagnostics can send a packet by "writing" one of the following + * two structs to diag data special file + * The first is the legacy version for backward compatibility + */ +struct ipath_diag_pkt { + __u32 unit; + __u64 data; + __u32 len; +}; + +/* The second diag_pkt struct is the expanded version that allows + * more control over the packet, specifically, by allowing a custom + * pbc (+ static rate) qword, so that special modes and deliberate + * changes to CRCs can be used. The elements were also re-ordered + * for better alignment and to avoid padding issues. + */ +struct ipath_diag_xpkt { + __u64 data; + __u64 pbc_wd; + __u32 unit; + __u32 len; +}; + +/* + * Data layout in I2C flash (for GUID, etc.) + * All fields are little-endian binary unless otherwise stated + */ +#define IPATH_FLASH_VERSION 2 +struct ipath_flash { + /* flash layout version (IPATH_FLASH_VERSION) */ + __u8 if_fversion; + /* checksum protecting if_length bytes */ + __u8 if_csum; + /* + * valid length (in use, protected by if_csum), including + * if_fversion and if_csum themselves) + */ + __u8 if_length; + /* the GUID, in network order */ + __u8 if_guid[8]; + /* number of GUIDs to use, starting from if_guid */ + __u8 if_numguid; + /* the (last 10 characters of) board serial number, in ASCII */ + char if_serial[12]; + /* board mfg date (YYYYMMDD ASCII) */ + char if_mfgdate[8]; + /* last board rework/test date (YYYYMMDD ASCII) */ + char if_testdate[8]; + /* logging of error counts, TBD */ + __u8 if_errcntp[4]; + /* powered on hours, updated at driver unload */ + __u8 if_powerhour[2]; + /* ASCII free-form comment field */ + char if_comment[32]; + /* Backwards compatible prefix for longer QLogic Serial Numbers */ + char if_sprefix[4]; + /* 82 bytes used, min flash size is 128 bytes */ + __u8 if_future[46]; +}; + +/* + * These are the counters implemented in the chip, and are listed in order. + * The InterCaps naming is taken straight from the chip spec. + */ +struct infinipath_counters { + __u64 LBIntCnt; + __u64 LBFlowStallCnt; + __u64 TxSDmaDescCnt; /* was Reserved1 */ + __u64 TxUnsupVLErrCnt; + __u64 TxDataPktCnt; + __u64 TxFlowPktCnt; + __u64 TxDwordCnt; + __u64 TxLenErrCnt; + __u64 TxMaxMinLenErrCnt; + __u64 TxUnderrunCnt; + __u64 TxFlowStallCnt; + __u64 TxDroppedPktCnt; + __u64 RxDroppedPktCnt; + __u64 RxDataPktCnt; + __u64 RxFlowPktCnt; + __u64 RxDwordCnt; + __u64 RxLenErrCnt; + __u64 RxMaxMinLenErrCnt; + __u64 RxICRCErrCnt; + __u64 RxVCRCErrCnt; + __u64 RxFlowCtrlErrCnt; + __u64 RxBadFormatCnt; + __u64 RxLinkProblemCnt; + __u64 RxEBPCnt; + __u64 RxLPCRCErrCnt; + __u64 RxBufOvflCnt; + __u64 RxTIDFullErrCnt; + __u64 RxTIDValidErrCnt; + __u64 RxPKeyMismatchCnt; + __u64 RxP0HdrEgrOvflCnt; + __u64 RxP1HdrEgrOvflCnt; + __u64 RxP2HdrEgrOvflCnt; + __u64 RxP3HdrEgrOvflCnt; + __u64 RxP4HdrEgrOvflCnt; + __u64 RxP5HdrEgrOvflCnt; + __u64 RxP6HdrEgrOvflCnt; + __u64 RxP7HdrEgrOvflCnt; + __u64 RxP8HdrEgrOvflCnt; + __u64 RxP9HdrEgrOvflCnt; /* was Reserved6 */ + __u64 RxP10HdrEgrOvflCnt; /* was Reserved7 */ + __u64 RxP11HdrEgrOvflCnt; /* new for IBA7220 */ + __u64 RxP12HdrEgrOvflCnt; /* new for IBA7220 */ + __u64 RxP13HdrEgrOvflCnt; /* new for IBA7220 */ + __u64 RxP14HdrEgrOvflCnt; /* new for IBA7220 */ + __u64 RxP15HdrEgrOvflCnt; /* new for IBA7220 */ + __u64 RxP16HdrEgrOvflCnt; /* new for IBA7220 */ + __u64 IBStatusChangeCnt; + __u64 IBLinkErrRecoveryCnt; + __u64 IBLinkDownedCnt; + __u64 IBSymbolErrCnt; + /* The following are new for IBA7220 */ + __u64 RxVL15DroppedPktCnt; + __u64 RxOtherLocalPhyErrCnt; + __u64 PcieRetryBufDiagQwordCnt; + __u64 ExcessBufferOvflCnt; + __u64 LocalLinkIntegrityErrCnt; + __u64 RxVlErrCnt; + __u64 RxDlidFltrCnt; +}; + +/* + * The next set of defines are for packet headers, and chip register + * and memory bits that are visible to and/or used by user-mode software + * The other bits that are used only by the driver or diags are in + * ipath_registers.h + */ + +/* RcvHdrFlags bits */ +#define INFINIPATH_RHF_LENGTH_MASK 0x7FF +#define INFINIPATH_RHF_LENGTH_SHIFT 0 +#define INFINIPATH_RHF_RCVTYPE_MASK 0x7 +#define INFINIPATH_RHF_RCVTYPE_SHIFT 11 +#define INFINIPATH_RHF_EGRINDEX_MASK 0xFFF +#define INFINIPATH_RHF_EGRINDEX_SHIFT 16 +#define INFINIPATH_RHF_SEQ_MASK 0xF +#define INFINIPATH_RHF_SEQ_SHIFT 0 +#define INFINIPATH_RHF_HDRQ_OFFSET_MASK 0x7FF +#define INFINIPATH_RHF_HDRQ_OFFSET_SHIFT 4 +#define INFINIPATH_RHF_H_ICRCERR 0x80000000 +#define INFINIPATH_RHF_H_VCRCERR 0x40000000 +#define INFINIPATH_RHF_H_PARITYERR 0x20000000 +#define INFINIPATH_RHF_H_LENERR 0x10000000 +#define INFINIPATH_RHF_H_MTUERR 0x08000000 +#define INFINIPATH_RHF_H_IHDRERR 0x04000000 +#define INFINIPATH_RHF_H_TIDERR 0x02000000 +#define INFINIPATH_RHF_H_MKERR 0x01000000 +#define INFINIPATH_RHF_H_IBERR 0x00800000 +#define INFINIPATH_RHF_H_ERR_MASK 0xFF800000 +#define INFINIPATH_RHF_L_USE_EGR 0x80000000 +#define INFINIPATH_RHF_L_SWA 0x00008000 +#define INFINIPATH_RHF_L_SWB 0x00004000 + +/* infinipath header fields */ +#define INFINIPATH_I_VERS_MASK 0xF +#define INFINIPATH_I_VERS_SHIFT 28 +#define INFINIPATH_I_PORT_MASK 0xF +#define INFINIPATH_I_PORT_SHIFT 24 +#define INFINIPATH_I_TID_MASK 0x7FF +#define INFINIPATH_I_TID_SHIFT 13 +#define INFINIPATH_I_OFFSET_MASK 0x1FFF +#define INFINIPATH_I_OFFSET_SHIFT 0 + +/* K_PktFlags bits */ +#define INFINIPATH_KPF_INTR 0x1 +#define INFINIPATH_KPF_SUBPORT_MASK 0x3 +#define INFINIPATH_KPF_SUBPORT_SHIFT 1 + +#define INFINIPATH_MAX_SUBPORT 4 + +/* SendPIO per-buffer control */ +#define INFINIPATH_SP_TEST 0x40 +#define INFINIPATH_SP_TESTEBP 0x20 +#define INFINIPATH_SP_TRIGGER_SHIFT 15 + +/* SendPIOAvail bits */ +#define INFINIPATH_SENDPIOAVAIL_BUSY_SHIFT 1 +#define INFINIPATH_SENDPIOAVAIL_CHECK_SHIFT 0 + +/* infinipath header format */ +struct ipath_header { + /* + * Version - 4 bits, Port - 4 bits, TID - 10 bits and Offset - + * 14 bits before ECO change ~28 Dec 03. After that, Vers 4, + * Port 4, TID 11, offset 13. + */ + __le32 ver_port_tid_offset; + __le16 chksum; + __le16 pkt_flags; +}; + +/* infinipath user message header format. + * This structure contains the first 4 fields common to all protocols + * that employ infinipath. + */ +struct ipath_message_header { + __be16 lrh[4]; + __be32 bth[3]; + /* fields below this point are in host byte order */ + struct ipath_header iph; + __u8 sub_opcode; +}; + +/* infinipath ethernet header format */ +struct ether_header { + __be16 lrh[4]; + __be32 bth[3]; + struct ipath_header iph; + __u8 sub_opcode; + __u8 cmd; + __be16 lid; + __u16 mac[3]; + __u8 frag_num; + __u8 seq_num; + __le32 len; + /* MUST be of word size due to PIO write requirements */ + __le32 csum; + __le16 csum_offset; + __le16 flags; + __u16 first_2_bytes; + __u8 unused[2]; /* currently unused */ +}; + + +/* IB - LRH header consts */ +#define IPATH_LRH_GRH 0x0003 /* 1. word of IB LRH - next header: GRH */ +#define IPATH_LRH_BTH 0x0002 /* 1. word of IB LRH - next header: BTH */ + +/* misc. */ +#define SIZE_OF_CRC 1 + +#define IPATH_DEFAULT_P_KEY 0xFFFF +#define IPATH_PERMISSIVE_LID 0xFFFF +#define IPATH_AETH_CREDIT_SHIFT 24 +#define IPATH_AETH_CREDIT_MASK 0x1F +#define IPATH_AETH_CREDIT_INVAL 0x1F +#define IPATH_PSN_MASK 0xFFFFFF +#define IPATH_MSN_MASK 0xFFFFFF +#define IPATH_QPN_MASK 0xFFFFFF +#define IPATH_MULTICAST_LID_BASE 0xC000 +#define IPATH_EAGER_TID_ID INFINIPATH_I_TID_MASK +#define IPATH_MULTICAST_QPN 0xFFFFFF + +/* Receive Header Queue: receive type (from infinipath) */ +#define RCVHQ_RCV_TYPE_EXPECTED 0 +#define RCVHQ_RCV_TYPE_EAGER 1 +#define RCVHQ_RCV_TYPE_NON_KD 2 +#define RCVHQ_RCV_TYPE_ERROR 3 + + +/* sub OpCodes - ith4x */ +#define IPATH_ITH4X_OPCODE_ENCAP 0x81 +#define IPATH_ITH4X_OPCODE_LID_ARP 0x82 + +#define IPATH_HEADER_QUEUE_WORDS 9 + +/* functions for extracting fields from rcvhdrq entries for the driver. + */ +static inline __u32 ipath_hdrget_err_flags(const __le32 * rbuf) +{ + return __le32_to_cpu(rbuf[1]) & INFINIPATH_RHF_H_ERR_MASK; +} + +static inline __u32 ipath_hdrget_rcv_type(const __le32 * rbuf) +{ + return (__le32_to_cpu(rbuf[0]) >> INFINIPATH_RHF_RCVTYPE_SHIFT) + & INFINIPATH_RHF_RCVTYPE_MASK; +} + +static inline __u32 ipath_hdrget_length_in_bytes(const __le32 * rbuf) +{ + return ((__le32_to_cpu(rbuf[0]) >> INFINIPATH_RHF_LENGTH_SHIFT) + & INFINIPATH_RHF_LENGTH_MASK) << 2; +} + +static inline __u32 ipath_hdrget_index(const __le32 * rbuf) +{ + return (__le32_to_cpu(rbuf[0]) >> INFINIPATH_RHF_EGRINDEX_SHIFT) + & INFINIPATH_RHF_EGRINDEX_MASK; +} + +static inline __u32 ipath_hdrget_seq(const __le32 *rbuf) +{ + return (__le32_to_cpu(rbuf[1]) >> INFINIPATH_RHF_SEQ_SHIFT) + & INFINIPATH_RHF_SEQ_MASK; +} + +static inline __u32 ipath_hdrget_offset(const __le32 *rbuf) +{ + return (__le32_to_cpu(rbuf[1]) >> INFINIPATH_RHF_HDRQ_OFFSET_SHIFT) + & INFINIPATH_RHF_HDRQ_OFFSET_MASK; +} + +static inline __u32 ipath_hdrget_use_egr_buf(const __le32 *rbuf) +{ + return __le32_to_cpu(rbuf[0]) & INFINIPATH_RHF_L_USE_EGR; +} + +static inline __u32 ipath_hdrget_ipath_ver(__le32 hdrword) +{ + return (__le32_to_cpu(hdrword) >> INFINIPATH_I_VERS_SHIFT) + & INFINIPATH_I_VERS_MASK; +} + +#endif /* _IPATH_COMMON_H */ diff --git a/drivers/infiniband/hw/ipath/ipath_cq.c b/drivers/infiniband/hw/ipath/ipath_cq.c new file mode 100644 index 0000000..0416c6c --- /dev/null +++ b/drivers/infiniband/hw/ipath/ipath_cq.c @@ -0,0 +1,478 @@ +/* + * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved. + * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include + +#include "ipath_verbs.h" + +/** + * ipath_cq_enter - add a new entry to the completion queue + * @cq: completion queue + * @entry: work completion entry to add + * @sig: true if @entry is a solicitated entry + * + * This may be called with qp->s_lock held. + */ +void ipath_cq_enter(struct ipath_cq *cq, struct ib_wc *entry, int solicited) +{ + struct ipath_cq_wc *wc; + unsigned long flags; + u32 head; + u32 next; + + spin_lock_irqsave(&cq->lock, flags); + + /* + * Note that the head pointer might be writable by user processes. + * Take care to verify it is a sane value. + */ + wc = cq->queue; + head = wc->head; + if (head >= (unsigned) cq->ibcq.cqe) { + head = cq->ibcq.cqe; + next = 0; + } else + next = head + 1; + if (unlikely(next == wc->tail)) { + spin_unlock_irqrestore(&cq->lock, flags); + if (cq->ibcq.event_handler) { + struct ib_event ev; + + ev.device = cq->ibcq.device; + ev.element.cq = &cq->ibcq; + ev.event = IB_EVENT_CQ_ERR; + cq->ibcq.event_handler(&ev, cq->ibcq.cq_context); + } + return; + } + if (cq->ip) { + wc->uqueue[head].wr_id = entry->wr_id; + wc->uqueue[head].status = entry->status; + wc->uqueue[head].opcode = entry->opcode; + wc->uqueue[head].vendor_err = entry->vendor_err; + wc->uqueue[head].byte_len = entry->byte_len; + wc->uqueue[head].ex.imm_data = (__u32 __force) entry->ex.imm_data; + wc->uqueue[head].qp_num = entry->qp->qp_num; + wc->uqueue[head].src_qp = entry->src_qp; + wc->uqueue[head].wc_flags = entry->wc_flags; + wc->uqueue[head].pkey_index = entry->pkey_index; + wc->uqueue[head].slid = entry->slid; + wc->uqueue[head].sl = entry->sl; + wc->uqueue[head].dlid_path_bits = entry->dlid_path_bits; + wc->uqueue[head].port_num = entry->port_num; + /* Make sure entry is written before the head index. */ + smp_wmb(); + } else + wc->kqueue[head] = *entry; + wc->head = next; + + if (cq->notify == IB_CQ_NEXT_COMP || + (cq->notify == IB_CQ_SOLICITED && solicited)) { + cq->notify = IB_CQ_NONE; + cq->triggered++; + /* + * This will cause send_complete() to be called in + * another thread. + */ + tasklet_hi_schedule(&cq->comptask); + } + + spin_unlock_irqrestore(&cq->lock, flags); + + if (entry->status != IB_WC_SUCCESS) + to_idev(cq->ibcq.device)->n_wqe_errs++; +} + +/** + * ipath_poll_cq - poll for work completion entries + * @ibcq: the completion queue to poll + * @num_entries: the maximum number of entries to return + * @entry: pointer to array where work completions are placed + * + * Returns the number of completion entries polled. + * + * This may be called from interrupt context. Also called by ib_poll_cq() + * in the generic verbs code. + */ +int ipath_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry) +{ + struct ipath_cq *cq = to_icq(ibcq); + struct ipath_cq_wc *wc; + unsigned long flags; + int npolled; + u32 tail; + + /* The kernel can only poll a kernel completion queue */ + if (cq->ip) { + npolled = -EINVAL; + goto bail; + } + + spin_lock_irqsave(&cq->lock, flags); + + wc = cq->queue; + tail = wc->tail; + if (tail > (u32) cq->ibcq.cqe) + tail = (u32) cq->ibcq.cqe; + for (npolled = 0; npolled < num_entries; ++npolled, ++entry) { + if (tail == wc->head) + break; + /* The kernel doesn't need a RMB since it has the lock. */ + *entry = wc->kqueue[tail]; + if (tail >= cq->ibcq.cqe) + tail = 0; + else + tail++; + } + wc->tail = tail; + + spin_unlock_irqrestore(&cq->lock, flags); + +bail: + return npolled; +} + +static void send_complete(unsigned long data) +{ + struct ipath_cq *cq = (struct ipath_cq *)data; + + /* + * The completion handler will most likely rearm the notification + * and poll for all pending entries. If a new completion entry + * is added while we are in this routine, tasklet_hi_schedule() + * won't call us again until we return so we check triggered to + * see if we need to call the handler again. + */ + for (;;) { + u8 triggered = cq->triggered; + + cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context); + + if (cq->triggered == triggered) + return; + } +} + +/** + * ipath_create_cq - create a completion queue + * @ibdev: the device this completion queue is attached to + * @entries: the minimum size of the completion queue + * @context: unused by the InfiniPath driver + * @udata: unused by the InfiniPath driver + * + * Returns a pointer to the completion queue or negative errno values + * for failure. + * + * Called by ib_create_cq() in the generic verbs code. + */ +struct ib_cq *ipath_create_cq(struct ib_device *ibdev, int entries, int comp_vector, + struct ib_ucontext *context, + struct ib_udata *udata) +{ + struct ipath_ibdev *dev = to_idev(ibdev); + struct ipath_cq *cq; + struct ipath_cq_wc *wc; + struct ib_cq *ret; + u32 sz; + + if (entries < 1 || entries > ib_ipath_max_cqes) { + ret = ERR_PTR(-EINVAL); + goto done; + } + + /* Allocate the completion queue structure. */ + cq = kmalloc(sizeof(*cq), GFP_KERNEL); + if (!cq) { + ret = ERR_PTR(-ENOMEM); + goto done; + } + + /* + * Allocate the completion queue entries and head/tail pointers. + * This is allocated separately so that it can be resized and + * also mapped into user space. + * We need to use vmalloc() in order to support mmap and large + * numbers of entries. + */ + sz = sizeof(*wc); + if (udata && udata->outlen >= sizeof(__u64)) + sz += sizeof(struct ib_uverbs_wc) * (entries + 1); + else + sz += sizeof(struct ib_wc) * (entries + 1); + wc = vmalloc_user(sz); + if (!wc) { + ret = ERR_PTR(-ENOMEM); + goto bail_cq; + } + + /* + * Return the address of the WC as the offset to mmap. + * See ipath_mmap() for details. + */ + if (udata && udata->outlen >= sizeof(__u64)) { + int err; + + cq->ip = ipath_create_mmap_info(dev, sz, context, wc); + if (!cq->ip) { + ret = ERR_PTR(-ENOMEM); + goto bail_wc; + } + + err = ib_copy_to_udata(udata, &cq->ip->offset, + sizeof(cq->ip->offset)); + if (err) { + ret = ERR_PTR(err); + goto bail_ip; + } + } else + cq->ip = NULL; + + spin_lock(&dev->n_cqs_lock); + if (dev->n_cqs_allocated == ib_ipath_max_cqs) { + spin_unlock(&dev->n_cqs_lock); + ret = ERR_PTR(-ENOMEM); + goto bail_ip; + } + + dev->n_cqs_allocated++; + spin_unlock(&dev->n_cqs_lock); + + if (cq->ip) { + spin_lock_irq(&dev->pending_lock); + list_add(&cq->ip->pending_mmaps, &dev->pending_mmaps); + spin_unlock_irq(&dev->pending_lock); + } + + /* + * ib_create_cq() will initialize cq->ibcq except for cq->ibcq.cqe. + * The number of entries should be >= the number requested or return + * an error. + */ + cq->ibcq.cqe = entries; + cq->notify = IB_CQ_NONE; + cq->triggered = 0; + spin_lock_init(&cq->lock); + tasklet_init(&cq->comptask, send_complete, (unsigned long)cq); + wc->head = 0; + wc->tail = 0; + cq->queue = wc; + + ret = &cq->ibcq; + + goto done; + +bail_ip: + kfree(cq->ip); +bail_wc: + vfree(wc); +bail_cq: + kfree(cq); +done: + return ret; +} + +/** + * ipath_destroy_cq - destroy a completion queue + * @ibcq: the completion queue to destroy. + * + * Returns 0 for success. + * + * Called by ib_destroy_cq() in the generic verbs code. + */ +int ipath_destroy_cq(struct ib_cq *ibcq) +{ + struct ipath_ibdev *dev = to_idev(ibcq->device); + struct ipath_cq *cq = to_icq(ibcq); + + tasklet_kill(&cq->comptask); + spin_lock(&dev->n_cqs_lock); + dev->n_cqs_allocated--; + spin_unlock(&dev->n_cqs_lock); + if (cq->ip) + kref_put(&cq->ip->ref, ipath_release_mmap_info); + else + vfree(cq->queue); + kfree(cq); + + return 0; +} + +/** + * ipath_req_notify_cq - change the notification type for a completion queue + * @ibcq: the completion queue + * @notify_flags: the type of notification to request + * + * Returns 0 for success. + * + * This may be called from interrupt context. Also called by + * ib_req_notify_cq() in the generic verbs code. + */ +int ipath_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags notify_flags) +{ + struct ipath_cq *cq = to_icq(ibcq); + unsigned long flags; + int ret = 0; + + spin_lock_irqsave(&cq->lock, flags); + /* + * Don't change IB_CQ_NEXT_COMP to IB_CQ_SOLICITED but allow + * any other transitions (see C11-31 and C11-32 in ch. 11.4.2.2). + */ + if (cq->notify != IB_CQ_NEXT_COMP) + cq->notify = notify_flags & IB_CQ_SOLICITED_MASK; + + if ((notify_flags & IB_CQ_REPORT_MISSED_EVENTS) && + cq->queue->head != cq->queue->tail) + ret = 1; + + spin_unlock_irqrestore(&cq->lock, flags); + + return ret; +} + +/** + * ipath_resize_cq - change the size of the CQ + * @ibcq: the completion queue + * + * Returns 0 for success. + */ +int ipath_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata) +{ + struct ipath_cq *cq = to_icq(ibcq); + struct ipath_cq_wc *old_wc; + struct ipath_cq_wc *wc; + u32 head, tail, n; + int ret; + u32 sz; + + if (cqe < 1 || cqe > ib_ipath_max_cqes) { + ret = -EINVAL; + goto bail; + } + + /* + * Need to use vmalloc() if we want to support large #s of entries. + */ + sz = sizeof(*wc); + if (udata && udata->outlen >= sizeof(__u64)) + sz += sizeof(struct ib_uverbs_wc) * (cqe + 1); + else + sz += sizeof(struct ib_wc) * (cqe + 1); + wc = vmalloc_user(sz); + if (!wc) { + ret = -ENOMEM; + goto bail; + } + + /* Check that we can write the offset to mmap. */ + if (udata && udata->outlen >= sizeof(__u64)) { + __u64 offset = 0; + + ret = ib_copy_to_udata(udata, &offset, sizeof(offset)); + if (ret) + goto bail_free; + } + + spin_lock_irq(&cq->lock); + /* + * Make sure head and tail are sane since they + * might be user writable. + */ + old_wc = cq->queue; + head = old_wc->head; + if (head > (u32) cq->ibcq.cqe) + head = (u32) cq->ibcq.cqe; + tail = old_wc->tail; + if (tail > (u32) cq->ibcq.cqe) + tail = (u32) cq->ibcq.cqe; + if (head < tail) + n = cq->ibcq.cqe + 1 + head - tail; + else + n = head - tail; + if (unlikely((u32)cqe < n)) { + ret = -EINVAL; + goto bail_unlock; + } + for (n = 0; tail != head; n++) { + if (cq->ip) + wc->uqueue[n] = old_wc->uqueue[tail]; + else + wc->kqueue[n] = old_wc->kqueue[tail]; + if (tail == (u32) cq->ibcq.cqe) + tail = 0; + else + tail++; + } + cq->ibcq.cqe = cqe; + wc->head = n; + wc->tail = 0; + cq->queue = wc; + spin_unlock_irq(&cq->lock); + + vfree(old_wc); + + if (cq->ip) { + struct ipath_ibdev *dev = to_idev(ibcq->device); + struct ipath_mmap_info *ip = cq->ip; + + ipath_update_mmap_info(dev, ip, sz, wc); + + /* + * Return the offset to mmap. + * See ipath_mmap() for details. + */ + if (udata && udata->outlen >= sizeof(__u64)) { + ret = ib_copy_to_udata(udata, &ip->offset, + sizeof(ip->offset)); + if (ret) + goto bail; + } + + spin_lock_irq(&dev->pending_lock); + if (list_empty(&ip->pending_mmaps)) + list_add(&ip->pending_mmaps, &dev->pending_mmaps); + spin_unlock_irq(&dev->pending_lock); + } + + ret = 0; + goto bail; + +bail_unlock: + spin_unlock_irq(&cq->lock); +bail_free: + vfree(wc); +bail: + return ret; +} diff --git a/drivers/infiniband/hw/ipath/ipath_debug.h b/drivers/infiniband/hw/ipath/ipath_debug.h new file mode 100644 index 0000000..65926cd --- /dev/null +++ b/drivers/infiniband/hw/ipath/ipath_debug.h @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved. + * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _IPATH_DEBUG_H +#define _IPATH_DEBUG_H + +#ifndef _IPATH_DEBUGGING /* debugging enabled or not */ +#define _IPATH_DEBUGGING 1 +#endif + +#if _IPATH_DEBUGGING + +/* + * Mask values for debugging. The scheme allows us to compile out any + * of the debug tracing stuff, and if compiled in, to enable or disable + * dynamically. This can be set at modprobe time also: + * modprobe infinipath.ko infinipath_debug=7 + */ + +#define __IPATH_INFO 0x1 /* generic low verbosity stuff */ +#define __IPATH_DBG 0x2 /* generic debug */ +#define __IPATH_TRSAMPLE 0x8 /* generate trace buffer sample entries */ +/* leave some low verbosity spots open */ +#define __IPATH_VERBDBG 0x40 /* very verbose debug */ +#define __IPATH_PKTDBG 0x80 /* print packet data */ +/* print process startup (init)/exit messages */ +#define __IPATH_PROCDBG 0x100 +/* print mmap/fault stuff, not using VDBG any more */ +#define __IPATH_MMDBG 0x200 +#define __IPATH_ERRPKTDBG 0x400 +#define __IPATH_USER_SEND 0x1000 /* use user mode send */ +#define __IPATH_KERNEL_SEND 0x2000 /* use kernel mode send */ +#define __IPATH_EPKTDBG 0x4000 /* print ethernet packet data */ +#define __IPATH_IPATHDBG 0x10000 /* Ethernet (IPATH) gen debug */ +#define __IPATH_IPATHWARN 0x20000 /* Ethernet (IPATH) warnings */ +#define __IPATH_IPATHERR 0x40000 /* Ethernet (IPATH) errors */ +#define __IPATH_IPATHPD 0x80000 /* Ethernet (IPATH) packet dump */ +#define __IPATH_IPATHTABLE 0x100000 /* Ethernet (IPATH) table dump */ +#define __IPATH_LINKVERBDBG 0x200000 /* very verbose linkchange debug */ + +#else /* _IPATH_DEBUGGING */ + +/* + * define all of these even with debugging off, for the few places that do + * if(infinipath_debug & _IPATH_xyzzy), but in a way that will make the + * compiler eliminate the code + */ + +#define __IPATH_INFO 0x0 /* generic low verbosity stuff */ +#define __IPATH_DBG 0x0 /* generic debug */ +#define __IPATH_TRSAMPLE 0x0 /* generate trace buffer sample entries */ +#define __IPATH_VERBDBG 0x0 /* very verbose debug */ +#define __IPATH_PKTDBG 0x0 /* print packet data */ +#define __IPATH_PROCDBG 0x0 /* process startup (init)/exit messages */ +/* print mmap/fault stuff, not using VDBG any more */ +#define __IPATH_MMDBG 0x0 +#define __IPATH_EPKTDBG 0x0 /* print ethernet packet data */ +#define __IPATH_IPATHDBG 0x0 /* Ethernet (IPATH) table dump on */ +#define __IPATH_IPATHWARN 0x0 /* Ethernet (IPATH) warnings on */ +#define __IPATH_IPATHERR 0x0 /* Ethernet (IPATH) errors on */ +#define __IPATH_IPATHPD 0x0 /* Ethernet (IPATH) packet dump on */ +#define __IPATH_IPATHTABLE 0x0 /* Ethernet (IPATH) packet dump on */ +#define __IPATH_LINKVERBDBG 0x0 /* very verbose linkchange debug */ + +#endif /* _IPATH_DEBUGGING */ + +#define __IPATH_VERBOSEDBG __IPATH_VERBDBG + +#endif /* _IPATH_DEBUG_H */ diff --git a/drivers/infiniband/hw/ipath/ipath_diag.c b/drivers/infiniband/hw/ipath/ipath_diag.c new file mode 100644 index 0000000..45802e9 --- /dev/null +++ b/drivers/infiniband/hw/ipath/ipath_diag.c @@ -0,0 +1,551 @@ +/* + * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved. + * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/* + * This file contains support for diagnostic functions. It is accessed by + * opening the ipath_diag device, normally minor number 129. Diagnostic use + * of the InfiniPath chip may render the chip or board unusable until the + * driver is unloaded, or in some cases, until the system is rebooted. + * + * Accesses to the chip through this interface are not similar to going + * through the /sys/bus/pci resource mmap interface. + */ + +#include +#include +#include +#include +#include +#include + +#include "ipath_kernel.h" +#include "ipath_common.h" + +int ipath_diag_inuse; +static int diag_set_link; + +static int ipath_diag_open(struct inode *in, struct file *fp); +static int ipath_diag_release(struct inode *in, struct file *fp); +static ssize_t ipath_diag_read(struct file *fp, char __user *data, + size_t count, loff_t *off); +static ssize_t ipath_diag_write(struct file *fp, const char __user *data, + size_t count, loff_t *off); + +static const struct file_operations diag_file_ops = { + .owner = THIS_MODULE, + .write = ipath_diag_write, + .read = ipath_diag_read, + .open = ipath_diag_open, + .release = ipath_diag_release, + .llseek = default_llseek, +}; + +static ssize_t ipath_diagpkt_write(struct file *fp, + const char __user *data, + size_t count, loff_t *off); + +static const struct file_operations diagpkt_file_ops = { + .owner = THIS_MODULE, + .write = ipath_diagpkt_write, + .llseek = noop_llseek, +}; + +static atomic_t diagpkt_count = ATOMIC_INIT(0); +static struct cdev *diagpkt_cdev; +static struct device *diagpkt_dev; + +int ipath_diag_add(struct ipath_devdata *dd) +{ + char name[16]; + int ret = 0; + + if (atomic_inc_return(&diagpkt_count) == 1) { + ret = ipath_cdev_init(IPATH_DIAGPKT_MINOR, + "ipath_diagpkt", &diagpkt_file_ops, + &diagpkt_cdev, &diagpkt_dev); + + if (ret) { + ipath_dev_err(dd, "Couldn't create ipath_diagpkt " + "device: %d", ret); + goto done; + } + } + + snprintf(name, sizeof(name), "ipath_diag%d", dd->ipath_unit); + + ret = ipath_cdev_init(IPATH_DIAG_MINOR_BASE + dd->ipath_unit, name, + &diag_file_ops, &dd->diag_cdev, + &dd->diag_dev); + if (ret) + ipath_dev_err(dd, "Couldn't create %s device: %d", + name, ret); + +done: + return ret; +} + +void ipath_diag_remove(struct ipath_devdata *dd) +{ + if (atomic_dec_and_test(&diagpkt_count)) + ipath_cdev_cleanup(&diagpkt_cdev, &diagpkt_dev); + + ipath_cdev_cleanup(&dd->diag_cdev, &dd->diag_dev); +} + +/** + * ipath_read_umem64 - read a 64-bit quantity from the chip into user space + * @dd: the infinipath device + * @uaddr: the location to store the data in user memory + * @caddr: the source chip address (full pointer, not offset) + * @count: number of bytes to copy (multiple of 32 bits) + * + * This function also localizes all chip memory accesses. + * The copy should be written such that we read full cacheline packets + * from the chip. This is usually used for a single qword + * + * NOTE: This assumes the chip address is 64-bit aligned. + */ +static int ipath_read_umem64(struct ipath_devdata *dd, void __user *uaddr, + const void __iomem *caddr, size_t count) +{ + const u64 __iomem *reg_addr = caddr; + const u64 __iomem *reg_end = reg_addr + (count / sizeof(u64)); + int ret; + + /* not very efficient, but it works for now */ + if (reg_addr < dd->ipath_kregbase || reg_end > dd->ipath_kregend) { + ret = -EINVAL; + goto bail; + } + while (reg_addr < reg_end) { + u64 data = readq(reg_addr); + if (copy_to_user(uaddr, &data, sizeof(u64))) { + ret = -EFAULT; + goto bail; + } + reg_addr++; + uaddr += sizeof(u64); + } + ret = 0; +bail: + return ret; +} + +/** + * ipath_write_umem64 - write a 64-bit quantity to the chip from user space + * @dd: the infinipath device + * @caddr: the destination chip address (full pointer, not offset) + * @uaddr: the source of the data in user memory + * @count: the number of bytes to copy (multiple of 32 bits) + * + * This is usually used for a single qword + * NOTE: This assumes the chip address is 64-bit aligned. + */ + +static int ipath_write_umem64(struct ipath_devdata *dd, void __iomem *caddr, + const void __user *uaddr, size_t count) +{ + u64 __iomem *reg_addr = caddr; + const u64 __iomem *reg_end = reg_addr + (count / sizeof(u64)); + int ret; + + /* not very efficient, but it works for now */ + if (reg_addr < dd->ipath_kregbase || reg_end > dd->ipath_kregend) { + ret = -EINVAL; + goto bail; + } + while (reg_addr < reg_end) { + u64 data; + if (copy_from_user(&data, uaddr, sizeof(data))) { + ret = -EFAULT; + goto bail; + } + writeq(data, reg_addr); + + reg_addr++; + uaddr += sizeof(u64); + } + ret = 0; +bail: + return ret; +} + +/** + * ipath_read_umem32 - read a 32-bit quantity from the chip into user space + * @dd: the infinipath device + * @uaddr: the location to store the data in user memory + * @caddr: the source chip address (full pointer, not offset) + * @count: number of bytes to copy + * + * read 32 bit values, not 64 bit; for memories that only + * support 32 bit reads; usually a single dword. + */ +static int ipath_read_umem32(struct ipath_devdata *dd, void __user *uaddr, + const void __iomem *caddr, size_t count) +{ + const u32 __iomem *reg_addr = caddr; + const u32 __iomem *reg_end = reg_addr + (count / sizeof(u32)); + int ret; + + if (reg_addr < (u32 __iomem *) dd->ipath_kregbase || + reg_end > (u32 __iomem *) dd->ipath_kregend) { + ret = -EINVAL; + goto bail; + } + /* not very efficient, but it works for now */ + while (reg_addr < reg_end) { + u32 data = readl(reg_addr); + if (copy_to_user(uaddr, &data, sizeof(data))) { + ret = -EFAULT; + goto bail; + } + + reg_addr++; + uaddr += sizeof(u32); + + } + ret = 0; +bail: + return ret; +} + +/** + * ipath_write_umem32 - write a 32-bit quantity to the chip from user space + * @dd: the infinipath device + * @caddr: the destination chip address (full pointer, not offset) + * @uaddr: the source of the data in user memory + * @count: number of bytes to copy + * + * write 32 bit values, not 64 bit; for memories that only + * support 32 bit write; usually a single dword. + */ + +static int ipath_write_umem32(struct ipath_devdata *dd, void __iomem *caddr, + const void __user *uaddr, size_t count) +{ + u32 __iomem *reg_addr = caddr; + const u32 __iomem *reg_end = reg_addr + (count / sizeof(u32)); + int ret; + + if (reg_addr < (u32 __iomem *) dd->ipath_kregbase || + reg_end > (u32 __iomem *) dd->ipath_kregend) { + ret = -EINVAL; + goto bail; + } + while (reg_addr < reg_end) { + u32 data; + if (copy_from_user(&data, uaddr, sizeof(data))) { + ret = -EFAULT; + goto bail; + } + writel(data, reg_addr); + + reg_addr++; + uaddr += sizeof(u32); + } + ret = 0; +bail: + return ret; +} + +static int ipath_diag_open(struct inode *in, struct file *fp) +{ + int unit = iminor(in) - IPATH_DIAG_MINOR_BASE; + struct ipath_devdata *dd; + int ret; + + mutex_lock(&ipath_mutex); + + if (ipath_diag_inuse) { + ret = -EBUSY; + goto bail; + } + + dd = ipath_lookup(unit); + + if (dd == NULL || !(dd->ipath_flags & IPATH_PRESENT) || + !dd->ipath_kregbase) { + ret = -ENODEV; + goto bail; + } + + fp->private_data = dd; + ipath_diag_inuse = -2; + diag_set_link = 0; + ret = 0; + + /* Only expose a way to reset the device if we + make it into diag mode. */ + ipath_expose_reset(&dd->pcidev->dev); + +bail: + mutex_unlock(&ipath_mutex); + + return ret; +} + +/** + * ipath_diagpkt_write - write an IB packet + * @fp: the diag data device file pointer + * @data: ipath_diag_pkt structure saying where to get the packet + * @count: size of data to write + * @off: unused by this code + */ +static ssize_t ipath_diagpkt_write(struct file *fp, + const char __user *data, + size_t count, loff_t *off) +{ + u32 __iomem *piobuf; + u32 plen, pbufn, maxlen_reserve; + struct ipath_diag_pkt odp; + struct ipath_diag_xpkt dp; + u32 *tmpbuf = NULL; + struct ipath_devdata *dd; + ssize_t ret = 0; + u64 val; + u32 l_state, lt_state; /* LinkState, LinkTrainingState */ + + + if (count == sizeof(dp)) { + if (copy_from_user(&dp, data, sizeof(dp))) { + ret = -EFAULT; + goto bail; + } + } else if (count == sizeof(odp)) { + if (copy_from_user(&odp, data, sizeof(odp))) { + ret = -EFAULT; + goto bail; + } + dp.len = odp.len; + dp.unit = odp.unit; + dp.data = odp.data; + dp.pbc_wd = 0; + } else { + ret = -EINVAL; + goto bail; + } + + /* send count must be an exact number of dwords */ + if (dp.len & 3) { + ret = -EINVAL; + goto bail; + } + + plen = dp.len >> 2; + + dd = ipath_lookup(dp.unit); + if (!dd || !(dd->ipath_flags & IPATH_PRESENT) || + !dd->ipath_kregbase) { + ipath_cdbg(VERBOSE, "illegal unit %u for diag data send\n", + dp.unit); + ret = -ENODEV; + goto bail; + } + + if (ipath_diag_inuse && !diag_set_link && + !(dd->ipath_flags & IPATH_LINKACTIVE)) { + diag_set_link = 1; + ipath_cdbg(VERBOSE, "Trying to set to set link active for " + "diag pkt\n"); + ipath_set_linkstate(dd, IPATH_IB_LINKARM); + ipath_set_linkstate(dd, IPATH_IB_LINKACTIVE); + } + + if (!(dd->ipath_flags & IPATH_INITTED)) { + /* no hardware, freeze, etc. */ + ipath_cdbg(VERBOSE, "unit %u not usable\n", dd->ipath_unit); + ret = -ENODEV; + goto bail; + } + /* + * Want to skip check for l_state if using custom PBC, + * because we might be trying to force an SM packet out. + * first-cut, skip _all_ state checking in that case. + */ + val = ipath_ib_state(dd, dd->ipath_lastibcstat); + lt_state = ipath_ib_linktrstate(dd, dd->ipath_lastibcstat); + l_state = ipath_ib_linkstate(dd, dd->ipath_lastibcstat); + if (!dp.pbc_wd && (lt_state != INFINIPATH_IBCS_LT_STATE_LINKUP || + (val != dd->ib_init && val != dd->ib_arm && + val != dd->ib_active))) { + ipath_cdbg(VERBOSE, "unit %u not ready (state %llx)\n", + dd->ipath_unit, (unsigned long long) val); + ret = -EINVAL; + goto bail; + } + + /* + * need total length before first word written, plus 2 Dwords. One Dword + * is for padding so we get the full user data when not aligned on + * a word boundary. The other Dword is to make sure we have room for the + * ICRC which gets tacked on later. + */ + maxlen_reserve = 2 * sizeof(u32); + if (dp.len > dd->ipath_ibmaxlen - maxlen_reserve) { + ipath_dbg("Pkt len 0x%x > ibmaxlen %x\n", + dp.len, dd->ipath_ibmaxlen); + ret = -EINVAL; + goto bail; + } + + plen = sizeof(u32) + dp.len; + + tmpbuf = vmalloc(plen); + if (!tmpbuf) { + dev_info(&dd->pcidev->dev, "Unable to allocate tmp buffer, " + "failing\n"); + ret = -ENOMEM; + goto bail; + } + + if (copy_from_user(tmpbuf, + (const void __user *) (unsigned long) dp.data, + dp.len)) { + ret = -EFAULT; + goto bail; + } + + plen >>= 2; /* in dwords */ + + piobuf = ipath_getpiobuf(dd, plen, &pbufn); + if (!piobuf) { + ipath_cdbg(VERBOSE, "No PIO buffers avail unit for %u\n", + dd->ipath_unit); + ret = -EBUSY; + goto bail; + } + /* disarm it just to be extra sure */ + ipath_disarm_piobufs(dd, pbufn, 1); + + if (ipath_debug & __IPATH_PKTDBG) + ipath_cdbg(VERBOSE, "unit %u 0x%x+1w pio%d\n", + dd->ipath_unit, plen - 1, pbufn); + + if (dp.pbc_wd == 0) + dp.pbc_wd = plen; + writeq(dp.pbc_wd, piobuf); + /* + * Copy all by the trigger word, then flush, so it's written + * to chip before trigger word, then write trigger word, then + * flush again, so packet is sent. + */ + if (dd->ipath_flags & IPATH_PIO_FLUSH_WC) { + ipath_flush_wc(); + __iowrite32_copy(piobuf + 2, tmpbuf, plen - 1); + ipath_flush_wc(); + __raw_writel(tmpbuf[plen - 1], piobuf + plen + 1); + } else + __iowrite32_copy(piobuf + 2, tmpbuf, plen); + + ipath_flush_wc(); + + ret = sizeof(dp); + +bail: + vfree(tmpbuf); + return ret; +} + +static int ipath_diag_release(struct inode *in, struct file *fp) +{ + mutex_lock(&ipath_mutex); + ipath_diag_inuse = 0; + fp->private_data = NULL; + mutex_unlock(&ipath_mutex); + return 0; +} + +static ssize_t ipath_diag_read(struct file *fp, char __user *data, + size_t count, loff_t *off) +{ + struct ipath_devdata *dd = fp->private_data; + void __iomem *kreg_base; + ssize_t ret; + + kreg_base = dd->ipath_kregbase; + + if (count == 0) + ret = 0; + else if ((count % 4) || (*off % 4)) + /* address or length is not 32-bit aligned, hence invalid */ + ret = -EINVAL; + else if (ipath_diag_inuse < 1 && (*off || count != 8)) + ret = -EINVAL; /* prevent cat /dev/ipath_diag* */ + else if ((count % 8) || (*off % 8)) + /* address or length not 64-bit aligned; do 32-bit reads */ + ret = ipath_read_umem32(dd, data, kreg_base + *off, count); + else + ret = ipath_read_umem64(dd, data, kreg_base + *off, count); + + if (ret >= 0) { + *off += count; + ret = count; + if (ipath_diag_inuse == -2) + ipath_diag_inuse++; + } + + return ret; +} + +static ssize_t ipath_diag_write(struct file *fp, const char __user *data, + size_t count, loff_t *off) +{ + struct ipath_devdata *dd = fp->private_data; + void __iomem *kreg_base; + ssize_t ret; + + kreg_base = dd->ipath_kregbase; + + if (count == 0) + ret = 0; + else if ((count % 4) || (*off % 4)) + /* address or length is not 32-bit aligned, hence invalid */ + ret = -EINVAL; + else if ((ipath_diag_inuse == -1 && (*off || count != 8)) || + ipath_diag_inuse == -2) /* read qw off 0, write qw off 0 */ + ret = -EINVAL; /* before any other write allowed */ + else if ((count % 8) || (*off % 8)) + /* address or length not 64-bit aligned; do 32-bit writes */ + ret = ipath_write_umem32(dd, kreg_base + *off, data, count); + else + ret = ipath_write_umem64(dd, kreg_base + *off, data, count); + + if (ret >= 0) { + *off += count; + ret = count; + if (ipath_diag_inuse == -1) + ipath_diag_inuse = 1; /* all read/write OK now */ + } + + return ret; +} diff --git a/drivers/infiniband/hw/ipath/ipath_dma.c b/drivers/infiniband/hw/ipath/ipath_dma.c new file mode 100644 index 0000000..123a8c0 --- /dev/null +++ b/drivers/infiniband/hw/ipath/ipath_dma.c @@ -0,0 +1,179 @@ +/* + * Copyright (c) 2006 QLogic, Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include + +#include "ipath_verbs.h" + +#define BAD_DMA_ADDRESS ((u64) 0) + +/* + * The following functions implement driver specific replacements + * for the ib_dma_*() functions. + * + * These functions return kernel virtual addresses instead of + * device bus addresses since the driver uses the CPU to copy + * data instead of using hardware DMA. + */ + +static int ipath_mapping_error(struct ib_device *dev, u64 dma_addr) +{ + return dma_addr == BAD_DMA_ADDRESS; +} + +static u64 ipath_dma_map_single(struct ib_device *dev, + void *cpu_addr, size_t size, + enum dma_data_direction direction) +{ + BUG_ON(!valid_dma_direction(direction)); + return (u64) cpu_addr; +} + +static void ipath_dma_unmap_single(struct ib_device *dev, + u64 addr, size_t size, + enum dma_data_direction direction) +{ + BUG_ON(!valid_dma_direction(direction)); +} + +static u64 ipath_dma_map_page(struct ib_device *dev, + struct page *page, + unsigned long offset, + size_t size, + enum dma_data_direction direction) +{ + u64 addr; + + BUG_ON(!valid_dma_direction(direction)); + + if (offset + size > PAGE_SIZE) { + addr = BAD_DMA_ADDRESS; + goto done; + } + + addr = (u64) page_address(page); + if (addr) + addr += offset; + /* TODO: handle highmem pages */ + +done: + return addr; +} + +static void ipath_dma_unmap_page(struct ib_device *dev, + u64 addr, size_t size, + enum dma_data_direction direction) +{ + BUG_ON(!valid_dma_direction(direction)); +} + +static int ipath_map_sg(struct ib_device *dev, struct scatterlist *sgl, + int nents, enum dma_data_direction direction) +{ + struct scatterlist *sg; + u64 addr; + int i; + int ret = nents; + + BUG_ON(!valid_dma_direction(direction)); + + for_each_sg(sgl, sg, nents, i) { + addr = (u64) page_address(sg_page(sg)); + /* TODO: handle highmem pages */ + if (!addr) { + ret = 0; + break; + } + sg->dma_address = addr + sg->offset; +#ifdef CONFIG_NEED_SG_DMA_LENGTH + sg->dma_length = sg->length; +#endif + } + return ret; +} + +static void ipath_unmap_sg(struct ib_device *dev, + struct scatterlist *sg, int nents, + enum dma_data_direction direction) +{ + BUG_ON(!valid_dma_direction(direction)); +} + +static void ipath_sync_single_for_cpu(struct ib_device *dev, + u64 addr, + size_t size, + enum dma_data_direction dir) +{ +} + +static void ipath_sync_single_for_device(struct ib_device *dev, + u64 addr, + size_t size, + enum dma_data_direction dir) +{ +} + +static void *ipath_dma_alloc_coherent(struct ib_device *dev, size_t size, + u64 *dma_handle, gfp_t flag) +{ + struct page *p; + void *addr = NULL; + + p = alloc_pages(flag, get_order(size)); + if (p) + addr = page_address(p); + if (dma_handle) + *dma_handle = (u64) addr; + return addr; +} + +static void ipath_dma_free_coherent(struct ib_device *dev, size_t size, + void *cpu_addr, u64 dma_handle) +{ + free_pages((unsigned long) cpu_addr, get_order(size)); +} + +struct ib_dma_mapping_ops ipath_dma_mapping_ops = { + .mapping_error = ipath_mapping_error, + .map_single = ipath_dma_map_single, + .unmap_single = ipath_dma_unmap_single, + .map_page = ipath_dma_map_page, + .unmap_page = ipath_dma_unmap_page, + .map_sg = ipath_map_sg, + .unmap_sg = ipath_unmap_sg, + .sync_single_for_cpu = ipath_sync_single_for_cpu, + .sync_single_for_device = ipath_sync_single_for_device, + .alloc_coherent = ipath_dma_alloc_coherent, + .free_coherent = ipath_dma_free_coherent +}; diff --git a/drivers/infiniband/hw/ipath/ipath_driver.c b/drivers/infiniband/hw/ipath/ipath_driver.c new file mode 100644 index 0000000..bd0caed --- /dev/null +++ b/drivers/infiniband/hw/ipath/ipath_driver.c @@ -0,0 +1,2779 @@ +/* + * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved. + * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ipath_kernel.h" +#include "ipath_verbs.h" + +static void ipath_update_pio_bufs(struct ipath_devdata *); + +const char *ipath_get_unit_name(int unit) +{ + static char iname[16]; + snprintf(iname, sizeof iname, "infinipath%u", unit); + return iname; +} + +#define DRIVER_LOAD_MSG "QLogic " IPATH_DRV_NAME " loaded: " +#define PFX IPATH_DRV_NAME ": " + +/* + * The size has to be longer than this string, so we can append + * board/chip information to it in the init code. + */ +const char ib_ipath_version[] = IPATH_IDSTR "\n"; + +static struct idr unit_table; +DEFINE_SPINLOCK(ipath_devs_lock); +LIST_HEAD(ipath_dev_list); + +wait_queue_head_t ipath_state_wait; + +unsigned ipath_debug = __IPATH_INFO; + +module_param_named(debug, ipath_debug, uint, S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(debug, "mask for debug prints"); +EXPORT_SYMBOL_GPL(ipath_debug); + +unsigned ipath_mtu4096 = 1; /* max 4KB IB mtu by default, if supported */ +module_param_named(mtu4096, ipath_mtu4096, uint, S_IRUGO); +MODULE_PARM_DESC(mtu4096, "enable MTU of 4096 bytes, if supported"); + +static unsigned ipath_hol_timeout_ms = 13000; +module_param_named(hol_timeout_ms, ipath_hol_timeout_ms, uint, S_IRUGO); +MODULE_PARM_DESC(hol_timeout_ms, + "duration of user app suspension after link failure"); + +unsigned ipath_linkrecovery = 1; +module_param_named(linkrecovery, ipath_linkrecovery, uint, S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(linkrecovery, "enable workaround for link recovery issue"); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("QLogic "); +MODULE_DESCRIPTION("QLogic InfiniPath driver"); + +/* + * Table to translate the LINKTRAININGSTATE portion of + * IBCStatus to a human-readable form. + */ +const char *ipath_ibcstatus_str[] = { + "Disabled", + "LinkUp", + "PollActive", + "PollQuiet", + "SleepDelay", + "SleepQuiet", + "LState6", /* unused */ + "LState7", /* unused */ + "CfgDebounce", + "CfgRcvfCfg", + "CfgWaitRmt", + "CfgIdle", + "RecovRetrain", + "CfgTxRevLane", /* unused before IBA7220 */ + "RecovWaitRmt", + "RecovIdle", + /* below were added for IBA7220 */ + "CfgEnhanced", + "CfgTest", + "CfgWaitRmtTest", + "CfgWaitCfgEnhanced", + "SendTS_T", + "SendTstIdles", + "RcvTS_T", + "SendTst_TS1s", + "LTState18", "LTState19", "LTState1A", "LTState1B", + "LTState1C", "LTState1D", "LTState1E", "LTState1F" +}; + +static void ipath_remove_one(struct pci_dev *); +static int ipath_init_one(struct pci_dev *, const struct pci_device_id *); + +/* Only needed for registration, nothing else needs this info */ +#define PCI_VENDOR_ID_PATHSCALE 0x1fc1 +#define PCI_DEVICE_ID_INFINIPATH_HT 0xd + +/* Number of seconds before our card status check... */ +#define STATUS_TIMEOUT 60 + +static const struct pci_device_id ipath_pci_tbl[] = { + { PCI_DEVICE(PCI_VENDOR_ID_PATHSCALE, PCI_DEVICE_ID_INFINIPATH_HT) }, + { 0, } +}; + +MODULE_DEVICE_TABLE(pci, ipath_pci_tbl); + +static struct pci_driver ipath_driver = { + .name = IPATH_DRV_NAME, + .probe = ipath_init_one, + .remove = ipath_remove_one, + .id_table = ipath_pci_tbl, + .driver = { + .groups = ipath_driver_attr_groups, + }, +}; + +static inline void read_bars(struct ipath_devdata *dd, struct pci_dev *dev, + u32 *bar0, u32 *bar1) +{ + int ret; + + ret = pci_read_config_dword(dev, PCI_BASE_ADDRESS_0, bar0); + if (ret) + ipath_dev_err(dd, "failed to read bar0 before enable: " + "error %d\n", -ret); + + ret = pci_read_config_dword(dev, PCI_BASE_ADDRESS_1, bar1); + if (ret) + ipath_dev_err(dd, "failed to read bar1 before enable: " + "error %d\n", -ret); + + ipath_dbg("Read bar0 %x bar1 %x\n", *bar0, *bar1); +} + +static void ipath_free_devdata(struct pci_dev *pdev, + struct ipath_devdata *dd) +{ + unsigned long flags; + + pci_set_drvdata(pdev, NULL); + + if (dd->ipath_unit != -1) { + spin_lock_irqsave(&ipath_devs_lock, flags); + idr_remove(&unit_table, dd->ipath_unit); + list_del(&dd->ipath_list); + spin_unlock_irqrestore(&ipath_devs_lock, flags); + } + vfree(dd); +} + +static struct ipath_devdata *ipath_alloc_devdata(struct pci_dev *pdev) +{ + unsigned long flags; + struct ipath_devdata *dd; + int ret; + + dd = vzalloc(sizeof(*dd)); + if (!dd) { + dd = ERR_PTR(-ENOMEM); + goto bail; + } + dd->ipath_unit = -1; + + idr_preload(GFP_KERNEL); + spin_lock_irqsave(&ipath_devs_lock, flags); + + ret = idr_alloc(&unit_table, dd, 0, 0, GFP_NOWAIT); + if (ret < 0) { + printk(KERN_ERR IPATH_DRV_NAME + ": Could not allocate unit ID: error %d\n", -ret); + ipath_free_devdata(pdev, dd); + dd = ERR_PTR(ret); + goto bail_unlock; + } + dd->ipath_unit = ret; + + dd->pcidev = pdev; + pci_set_drvdata(pdev, dd); + + list_add(&dd->ipath_list, &ipath_dev_list); + +bail_unlock: + spin_unlock_irqrestore(&ipath_devs_lock, flags); + idr_preload_end(); +bail: + return dd; +} + +static inline struct ipath_devdata *__ipath_lookup(int unit) +{ + return idr_find(&unit_table, unit); +} + +struct ipath_devdata *ipath_lookup(int unit) +{ + struct ipath_devdata *dd; + unsigned long flags; + + spin_lock_irqsave(&ipath_devs_lock, flags); + dd = __ipath_lookup(unit); + spin_unlock_irqrestore(&ipath_devs_lock, flags); + + return dd; +} + +int ipath_count_units(int *npresentp, int *nupp, int *maxportsp) +{ + int nunits, npresent, nup; + struct ipath_devdata *dd; + unsigned long flags; + int maxports; + + nunits = npresent = nup = maxports = 0; + + spin_lock_irqsave(&ipath_devs_lock, flags); + + list_for_each_entry(dd, &ipath_dev_list, ipath_list) { + nunits++; + if ((dd->ipath_flags & IPATH_PRESENT) && dd->ipath_kregbase) + npresent++; + if (dd->ipath_lid && + !(dd->ipath_flags & (IPATH_DISABLED | IPATH_LINKDOWN + | IPATH_LINKUNK))) + nup++; + if (dd->ipath_cfgports > maxports) + maxports = dd->ipath_cfgports; + } + + spin_unlock_irqrestore(&ipath_devs_lock, flags); + + if (npresentp) + *npresentp = npresent; + if (nupp) + *nupp = nup; + if (maxportsp) + *maxportsp = maxports; + + return nunits; +} + +/* + * These next two routines are placeholders in case we don't have per-arch + * code for controlling write combining. If explicit control of write + * combining is not available, performance will probably be awful. + */ + +int __attribute__((weak)) ipath_enable_wc(struct ipath_devdata *dd) +{ + return -EOPNOTSUPP; +} + +void __attribute__((weak)) ipath_disable_wc(struct ipath_devdata *dd) +{ +} + +/* + * Perform a PIO buffer bandwidth write test, to verify proper system + * configuration. Even when all the setup calls work, occasionally + * BIOS or other issues can prevent write combining from working, or + * can cause other bandwidth problems to the chip. + * + * This test simply writes the same buffer over and over again, and + * measures close to the peak bandwidth to the chip (not testing + * data bandwidth to the wire). On chips that use an address-based + * trigger to send packets to the wire, this is easy. On chips that + * use a count to trigger, we want to make sure that the packet doesn't + * go out on the wire, or trigger flow control checks. + */ +static void ipath_verify_pioperf(struct ipath_devdata *dd) +{ + u32 pbnum, cnt, lcnt; + u32 __iomem *piobuf; + u32 *addr; + u64 msecs, emsecs; + + piobuf = ipath_getpiobuf(dd, 0, &pbnum); + if (!piobuf) { + dev_info(&dd->pcidev->dev, + "No PIObufs for checking perf, skipping\n"); + return; + } + + /* + * Enough to give us a reasonable test, less than piobuf size, and + * likely multiple of store buffer length. + */ + cnt = 1024; + + addr = vmalloc(cnt); + if (!addr) { + dev_info(&dd->pcidev->dev, + "Couldn't get memory for checking PIO perf," + " skipping\n"); + goto done; + } + + preempt_disable(); /* we want reasonably accurate elapsed time */ + msecs = 1 + jiffies_to_msecs(jiffies); + for (lcnt = 0; lcnt < 10000U; lcnt++) { + /* wait until we cross msec boundary */ + if (jiffies_to_msecs(jiffies) >= msecs) + break; + udelay(1); + } + + ipath_disable_armlaunch(dd); + + /* + * length 0, no dwords actually sent, and mark as VL15 + * on chips where that may matter (due to IB flowcontrol) + */ + if ((dd->ipath_flags & IPATH_HAS_PBC_CNT)) + writeq(1UL << 63, piobuf); + else + writeq(0, piobuf); + ipath_flush_wc(); + + /* + * this is only roughly accurate, since even with preempt we + * still take interrupts that could take a while. Running for + * >= 5 msec seems to get us "close enough" to accurate values + */ + msecs = jiffies_to_msecs(jiffies); + for (emsecs = lcnt = 0; emsecs <= 5UL; lcnt++) { + __iowrite32_copy(piobuf + 64, addr, cnt >> 2); + emsecs = jiffies_to_msecs(jiffies) - msecs; + } + + /* 1 GiB/sec, slightly over IB SDR line rate */ + if (lcnt < (emsecs * 1024U)) + ipath_dev_err(dd, + "Performance problem: bandwidth to PIO buffers is " + "only %u MiB/sec\n", + lcnt / (u32) emsecs); + else + ipath_dbg("PIO buffer bandwidth %u MiB/sec is OK\n", + lcnt / (u32) emsecs); + + preempt_enable(); + + vfree(addr); + +done: + /* disarm piobuf, so it's available again */ + ipath_disarm_piobufs(dd, pbnum, 1); + ipath_enable_armlaunch(dd); +} + +static void cleanup_device(struct ipath_devdata *dd); + +static int ipath_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) +{ + int ret, len, j; + struct ipath_devdata *dd; + unsigned long long addr; + u32 bar0 = 0, bar1 = 0; + + dd = ipath_alloc_devdata(pdev); + if (IS_ERR(dd)) { + ret = PTR_ERR(dd); + printk(KERN_ERR IPATH_DRV_NAME + ": Could not allocate devdata: error %d\n", -ret); + goto bail; + } + + ipath_cdbg(VERBOSE, "initializing unit #%u\n", dd->ipath_unit); + + ret = pci_enable_device(pdev); + if (ret) { + /* This can happen iff: + * + * We did a chip reset, and then failed to reprogram the + * BAR, or the chip reset due to an internal error. We then + * unloaded the driver and reloaded it. + * + * Both reset cases set the BAR back to initial state. For + * the latter case, the AER sticky error bit at offset 0x718 + * should be set, but the Linux kernel doesn't yet know + * about that, it appears. If the original BAR was retained + * in the kernel data structures, this may be OK. + */ + ipath_dev_err(dd, "enable unit %d failed: error %d\n", + dd->ipath_unit, -ret); + goto bail_devdata; + } + addr = pci_resource_start(pdev, 0); + len = pci_resource_len(pdev, 0); + ipath_cdbg(VERBOSE, "regbase (0) %llx len %d irq %d, vend %x/%x " + "driver_data %lx\n", addr, len, pdev->irq, ent->vendor, + ent->device, ent->driver_data); + + read_bars(dd, pdev, &bar0, &bar1); + + if (!bar1 && !(bar0 & ~0xf)) { + if (addr) { + dev_info(&pdev->dev, "BAR is 0 (probable RESET), " + "rewriting as %llx\n", addr); + ret = pci_write_config_dword( + pdev, PCI_BASE_ADDRESS_0, addr); + if (ret) { + ipath_dev_err(dd, "rewrite of BAR0 " + "failed: err %d\n", -ret); + goto bail_disable; + } + ret = pci_write_config_dword( + pdev, PCI_BASE_ADDRESS_1, addr >> 32); + if (ret) { + ipath_dev_err(dd, "rewrite of BAR1 " + "failed: err %d\n", -ret); + goto bail_disable; + } + } else { + ipath_dev_err(dd, "BAR is 0 (probable RESET), " + "not usable until reboot\n"); + ret = -ENODEV; + goto bail_disable; + } + } + + ret = pci_request_regions(pdev, IPATH_DRV_NAME); + if (ret) { + dev_info(&pdev->dev, "pci_request_regions unit %u fails: " + "err %d\n", dd->ipath_unit, -ret); + goto bail_disable; + } + + ret = pci_set_dma_mask(pdev, DMA_BIT_MASK(64)); + if (ret) { + /* + * if the 64 bit setup fails, try 32 bit. Some systems + * do not setup 64 bit maps on systems with 2GB or less + * memory installed. + */ + ret = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); + if (ret) { + dev_info(&pdev->dev, + "Unable to set DMA mask for unit %u: %d\n", + dd->ipath_unit, ret); + goto bail_regions; + } + else { + ipath_dbg("No 64bit DMA mask, used 32 bit mask\n"); + ret = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32)); + if (ret) + dev_info(&pdev->dev, + "Unable to set DMA consistent mask " + "for unit %u: %d\n", + dd->ipath_unit, ret); + + } + } + else { + ret = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64)); + if (ret) + dev_info(&pdev->dev, + "Unable to set DMA consistent mask " + "for unit %u: %d\n", + dd->ipath_unit, ret); + } + + pci_set_master(pdev); + + /* + * Save BARs to rewrite after device reset. Save all 64 bits of + * BAR, just in case. + */ + dd->ipath_pcibar0 = addr; + dd->ipath_pcibar1 = addr >> 32; + dd->ipath_deviceid = ent->device; /* save for later use */ + dd->ipath_vendorid = ent->vendor; + + /* setup the chip-specific functions, as early as possible. */ + switch (ent->device) { + case PCI_DEVICE_ID_INFINIPATH_HT: + ipath_init_iba6110_funcs(dd); + break; + + default: + ipath_dev_err(dd, "Found unknown QLogic deviceid 0x%x, " + "failing\n", ent->device); + return -ENODEV; + } + + for (j = 0; j < 6; j++) { + if (!pdev->resource[j].start) + continue; + ipath_cdbg(VERBOSE, "BAR %d %pR, len %llx\n", + j, &pdev->resource[j], + (unsigned long long)pci_resource_len(pdev, j)); + } + + if (!addr) { + ipath_dev_err(dd, "No valid address in BAR 0!\n"); + ret = -ENODEV; + goto bail_regions; + } + + dd->ipath_pcirev = pdev->revision; + +#if defined(__powerpc__) + /* There isn't a generic way to specify writethrough mappings */ + dd->ipath_kregbase = __ioremap(addr, len, + (_PAGE_NO_CACHE|_PAGE_WRITETHRU)); +#else + dd->ipath_kregbase = ioremap_nocache(addr, len); +#endif + + if (!dd->ipath_kregbase) { + ipath_dbg("Unable to map io addr %llx to kvirt, failing\n", + addr); + ret = -ENOMEM; + goto bail_iounmap; + } + dd->ipath_kregend = (u64 __iomem *) + ((void __iomem *)dd->ipath_kregbase + len); + dd->ipath_physaddr = addr; /* used for io_remap, etc. */ + /* for user mmap */ + ipath_cdbg(VERBOSE, "mapped io addr %llx to kregbase %p\n", + addr, dd->ipath_kregbase); + + if (dd->ipath_f_bus(dd, pdev)) + ipath_dev_err(dd, "Failed to setup config space; " + "continuing anyway\n"); + + /* + * set up our interrupt handler; IRQF_SHARED probably not needed, + * since MSI interrupts shouldn't be shared but won't hurt for now. + * check 0 irq after we return from chip-specific bus setup, since + * that can affect this due to setup + */ + if (!dd->ipath_irq) + ipath_dev_err(dd, "irq is 0, BIOS error? Interrupts won't " + "work\n"); + else { + ret = request_irq(dd->ipath_irq, ipath_intr, IRQF_SHARED, + IPATH_DRV_NAME, dd); + if (ret) { + ipath_dev_err(dd, "Couldn't setup irq handler, " + "irq=%d: %d\n", dd->ipath_irq, ret); + goto bail_iounmap; + } + } + + ret = ipath_init_chip(dd, 0); /* do the chip-specific init */ + if (ret) + goto bail_irqsetup; + + ret = ipath_enable_wc(dd); + + if (ret) { + ipath_dev_err(dd, "Write combining not enabled " + "(err %d): performance may be poor\n", + -ret); + ret = 0; + } + + ipath_verify_pioperf(dd); + + ipath_device_create_group(&pdev->dev, dd); + ipathfs_add_device(dd); + ipath_user_add(dd); + ipath_diag_add(dd); + ipath_register_ib_device(dd); + + goto bail; + +bail_irqsetup: + cleanup_device(dd); + + if (dd->ipath_irq) + dd->ipath_f_free_irq(dd); + + if (dd->ipath_f_cleanup) + dd->ipath_f_cleanup(dd); + +bail_iounmap: + iounmap((volatile void __iomem *) dd->ipath_kregbase); + +bail_regions: + pci_release_regions(pdev); + +bail_disable: + pci_disable_device(pdev); + +bail_devdata: + ipath_free_devdata(pdev, dd); + +bail: + return ret; +} + +static void cleanup_device(struct ipath_devdata *dd) +{ + int port; + struct ipath_portdata **tmp; + unsigned long flags; + + if (*dd->ipath_statusp & IPATH_STATUS_CHIP_PRESENT) { + /* can't do anything more with chip; needs re-init */ + *dd->ipath_statusp &= ~IPATH_STATUS_CHIP_PRESENT; + if (dd->ipath_kregbase) { + /* + * if we haven't already cleaned up before these are + * to ensure any register reads/writes "fail" until + * re-init + */ + dd->ipath_kregbase = NULL; + dd->ipath_uregbase = 0; + dd->ipath_sregbase = 0; + dd->ipath_cregbase = 0; + dd->ipath_kregsize = 0; + } + ipath_disable_wc(dd); + } + + if (dd->ipath_spectriggerhit) + dev_info(&dd->pcidev->dev, "%lu special trigger hits\n", + dd->ipath_spectriggerhit); + + if (dd->ipath_pioavailregs_dma) { + dma_free_coherent(&dd->pcidev->dev, PAGE_SIZE, + (void *) dd->ipath_pioavailregs_dma, + dd->ipath_pioavailregs_phys); + dd->ipath_pioavailregs_dma = NULL; + } + if (dd->ipath_dummy_hdrq) { + dma_free_coherent(&dd->pcidev->dev, + dd->ipath_pd[0]->port_rcvhdrq_size, + dd->ipath_dummy_hdrq, dd->ipath_dummy_hdrq_phys); + dd->ipath_dummy_hdrq = NULL; + } + + if (dd->ipath_pageshadow) { + struct page **tmpp = dd->ipath_pageshadow; + dma_addr_t *tmpd = dd->ipath_physshadow; + int i, cnt = 0; + + ipath_cdbg(VERBOSE, "Unlocking any expTID pages still " + "locked\n"); + for (port = 0; port < dd->ipath_cfgports; port++) { + int port_tidbase = port * dd->ipath_rcvtidcnt; + int maxtid = port_tidbase + dd->ipath_rcvtidcnt; + for (i = port_tidbase; i < maxtid; i++) { + if (!tmpp[i]) + continue; + pci_unmap_page(dd->pcidev, tmpd[i], + PAGE_SIZE, PCI_DMA_FROMDEVICE); + ipath_release_user_pages(&tmpp[i], 1); + tmpp[i] = NULL; + cnt++; + } + } + if (cnt) { + ipath_stats.sps_pageunlocks += cnt; + ipath_cdbg(VERBOSE, "There were still %u expTID " + "entries locked\n", cnt); + } + if (ipath_stats.sps_pagelocks || + ipath_stats.sps_pageunlocks) + ipath_cdbg(VERBOSE, "%llu pages locked, %llu " + "unlocked via ipath_m{un}lock\n", + (unsigned long long) + ipath_stats.sps_pagelocks, + (unsigned long long) + ipath_stats.sps_pageunlocks); + + ipath_cdbg(VERBOSE, "Free shadow page tid array at %p\n", + dd->ipath_pageshadow); + tmpp = dd->ipath_pageshadow; + dd->ipath_pageshadow = NULL; + vfree(tmpp); + + dd->ipath_egrtidbase = NULL; + } + + /* + * free any resources still in use (usually just kernel ports) + * at unload; we do for portcnt, because that's what we allocate. + * We acquire lock to be really paranoid that ipath_pd isn't being + * accessed from some interrupt-related code (that should not happen, + * but best to be sure). + */ + spin_lock_irqsave(&dd->ipath_uctxt_lock, flags); + tmp = dd->ipath_pd; + dd->ipath_pd = NULL; + spin_unlock_irqrestore(&dd->ipath_uctxt_lock, flags); + for (port = 0; port < dd->ipath_portcnt; port++) { + struct ipath_portdata *pd = tmp[port]; + tmp[port] = NULL; /* debugging paranoia */ + ipath_free_pddata(dd, pd); + } + kfree(tmp); +} + +static void ipath_remove_one(struct pci_dev *pdev) +{ + struct ipath_devdata *dd = pci_get_drvdata(pdev); + + ipath_cdbg(VERBOSE, "removing, pdev=%p, dd=%p\n", pdev, dd); + + /* + * disable the IB link early, to be sure no new packets arrive, which + * complicates the shutdown process + */ + ipath_shutdown_device(dd); + + flush_workqueue(ib_wq); + + if (dd->verbs_dev) + ipath_unregister_ib_device(dd->verbs_dev); + + ipath_diag_remove(dd); + ipath_user_remove(dd); + ipathfs_remove_device(dd); + ipath_device_remove_group(&pdev->dev, dd); + + ipath_cdbg(VERBOSE, "Releasing pci memory regions, dd %p, " + "unit %u\n", dd, (u32) dd->ipath_unit); + + cleanup_device(dd); + + /* + * turn off rcv, send, and interrupts for all ports, all drivers + * should also hard reset the chip here? + * free up port 0 (kernel) rcvhdr, egr bufs, and eventually tid bufs + * for all versions of the driver, if they were allocated + */ + if (dd->ipath_irq) { + ipath_cdbg(VERBOSE, "unit %u free irq %d\n", + dd->ipath_unit, dd->ipath_irq); + dd->ipath_f_free_irq(dd); + } else + ipath_dbg("irq is 0, not doing free_irq " + "for unit %u\n", dd->ipath_unit); + /* + * we check for NULL here, because it's outside + * the kregbase check, and we need to call it + * after the free_irq. Thus it's possible that + * the function pointers were never initialized. + */ + if (dd->ipath_f_cleanup) + /* clean up chip-specific stuff */ + dd->ipath_f_cleanup(dd); + + ipath_cdbg(VERBOSE, "Unmapping kregbase %p\n", dd->ipath_kregbase); + iounmap((volatile void __iomem *) dd->ipath_kregbase); + pci_release_regions(pdev); + ipath_cdbg(VERBOSE, "calling pci_disable_device\n"); + pci_disable_device(pdev); + + ipath_free_devdata(pdev, dd); +} + +/* general driver use */ +DEFINE_MUTEX(ipath_mutex); + +static DEFINE_SPINLOCK(ipath_pioavail_lock); + +/** + * ipath_disarm_piobufs - cancel a range of PIO buffers + * @dd: the infinipath device + * @first: the first PIO buffer to cancel + * @cnt: the number of PIO buffers to cancel + * + * cancel a range of PIO buffers, used when they might be armed, but + * not triggered. Used at init to ensure buffer state, and also user + * process close, in case it died while writing to a PIO buffer + * Also after errors. + */ +void ipath_disarm_piobufs(struct ipath_devdata *dd, unsigned first, + unsigned cnt) +{ + unsigned i, last = first + cnt; + unsigned long flags; + + ipath_cdbg(PKT, "disarm %u PIObufs first=%u\n", cnt, first); + for (i = first; i < last; i++) { + spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags); + /* + * The disarm-related bits are write-only, so it + * is ok to OR them in with our copy of sendctrl + * while we hold the lock. + */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, + dd->ipath_sendctrl | INFINIPATH_S_DISARM | + (i << INFINIPATH_S_DISARMPIOBUF_SHIFT)); + /* can't disarm bufs back-to-back per iba7220 spec */ + ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); + spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags); + } + /* on some older chips, update may not happen after cancel */ + ipath_force_pio_avail_update(dd); +} + +/** + * ipath_wait_linkstate - wait for an IB link state change to occur + * @dd: the infinipath device + * @state: the state to wait for + * @msecs: the number of milliseconds to wait + * + * wait up to msecs milliseconds for IB link state change to occur for + * now, take the easy polling route. Currently used only by + * ipath_set_linkstate. Returns 0 if state reached, otherwise + * -ETIMEDOUT state can have multiple states set, for any of several + * transitions. + */ +int ipath_wait_linkstate(struct ipath_devdata *dd, u32 state, int msecs) +{ + dd->ipath_state_wanted = state; + wait_event_interruptible_timeout(ipath_state_wait, + (dd->ipath_flags & state), + msecs_to_jiffies(msecs)); + dd->ipath_state_wanted = 0; + + if (!(dd->ipath_flags & state)) { + u64 val; + ipath_cdbg(VERBOSE, "Didn't reach linkstate %s within %u" + " ms\n", + /* test INIT ahead of DOWN, both can be set */ + (state & IPATH_LINKINIT) ? "INIT" : + ((state & IPATH_LINKDOWN) ? "DOWN" : + ((state & IPATH_LINKARMED) ? "ARM" : "ACTIVE")), + msecs); + val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_ibcstatus); + ipath_cdbg(VERBOSE, "ibcc=%llx ibcstatus=%llx (%s)\n", + (unsigned long long) ipath_read_kreg64( + dd, dd->ipath_kregs->kr_ibcctrl), + (unsigned long long) val, + ipath_ibcstatus_str[val & dd->ibcs_lts_mask]); + } + return (dd->ipath_flags & state) ? 0 : -ETIMEDOUT; +} + +static void decode_sdma_errs(struct ipath_devdata *dd, ipath_err_t err, + char *buf, size_t blen) +{ + static const struct { + ipath_err_t err; + const char *msg; + } errs[] = { + { INFINIPATH_E_SDMAGENMISMATCH, "SDmaGenMismatch" }, + { INFINIPATH_E_SDMAOUTOFBOUND, "SDmaOutOfBound" }, + { INFINIPATH_E_SDMATAILOUTOFBOUND, "SDmaTailOutOfBound" }, + { INFINIPATH_E_SDMABASE, "SDmaBase" }, + { INFINIPATH_E_SDMA1STDESC, "SDma1stDesc" }, + { INFINIPATH_E_SDMARPYTAG, "SDmaRpyTag" }, + { INFINIPATH_E_SDMADWEN, "SDmaDwEn" }, + { INFINIPATH_E_SDMAMISSINGDW, "SDmaMissingDw" }, + { INFINIPATH_E_SDMAUNEXPDATA, "SDmaUnexpData" }, + { INFINIPATH_E_SDMADESCADDRMISALIGN, "SDmaDescAddrMisalign" }, + { INFINIPATH_E_SENDBUFMISUSE, "SendBufMisuse" }, + { INFINIPATH_E_SDMADISABLED, "SDmaDisabled" }, + }; + int i; + int expected; + size_t bidx = 0; + + for (i = 0; i < ARRAY_SIZE(errs); i++) { + expected = (errs[i].err != INFINIPATH_E_SDMADISABLED) ? 0 : + test_bit(IPATH_SDMA_ABORTING, &dd->ipath_sdma_status); + if ((err & errs[i].err) && !expected) + bidx += snprintf(buf + bidx, blen - bidx, + "%s ", errs[i].msg); + } +} + +/* + * Decode the error status into strings, deciding whether to always + * print * it or not depending on "normal packet errors" vs everything + * else. Return 1 if "real" errors, otherwise 0 if only packet + * errors, so caller can decide what to print with the string. + */ +int ipath_decode_err(struct ipath_devdata *dd, char *buf, size_t blen, + ipath_err_t err) +{ + int iserr = 1; + *buf = '\0'; + if (err & INFINIPATH_E_PKTERRS) { + if (!(err & ~INFINIPATH_E_PKTERRS)) + iserr = 0; // if only packet errors. + if (ipath_debug & __IPATH_ERRPKTDBG) { + if (err & INFINIPATH_E_REBP) + strlcat(buf, "EBP ", blen); + if (err & INFINIPATH_E_RVCRC) + strlcat(buf, "VCRC ", blen); + if (err & INFINIPATH_E_RICRC) { + strlcat(buf, "CRC ", blen); + // clear for check below, so only once + err &= INFINIPATH_E_RICRC; + } + if (err & INFINIPATH_E_RSHORTPKTLEN) + strlcat(buf, "rshortpktlen ", blen); + if (err & INFINIPATH_E_SDROPPEDDATAPKT) + strlcat(buf, "sdroppeddatapkt ", blen); + if (err & INFINIPATH_E_SPKTLEN) + strlcat(buf, "spktlen ", blen); + } + if ((err & INFINIPATH_E_RICRC) && + !(err&(INFINIPATH_E_RVCRC|INFINIPATH_E_REBP))) + strlcat(buf, "CRC ", blen); + if (!iserr) + goto done; + } + if (err & INFINIPATH_E_RHDRLEN) + strlcat(buf, "rhdrlen ", blen); + if (err & INFINIPATH_E_RBADTID) + strlcat(buf, "rbadtid ", blen); + if (err & INFINIPATH_E_RBADVERSION) + strlcat(buf, "rbadversion ", blen); + if (err & INFINIPATH_E_RHDR) + strlcat(buf, "rhdr ", blen); + if (err & INFINIPATH_E_SENDSPECIALTRIGGER) + strlcat(buf, "sendspecialtrigger ", blen); + if (err & INFINIPATH_E_RLONGPKTLEN) + strlcat(buf, "rlongpktlen ", blen); + if (err & INFINIPATH_E_RMAXPKTLEN) + strlcat(buf, "rmaxpktlen ", blen); + if (err & INFINIPATH_E_RMINPKTLEN) + strlcat(buf, "rminpktlen ", blen); + if (err & INFINIPATH_E_SMINPKTLEN) + strlcat(buf, "sminpktlen ", blen); + if (err & INFINIPATH_E_RFORMATERR) + strlcat(buf, "rformaterr ", blen); + if (err & INFINIPATH_E_RUNSUPVL) + strlcat(buf, "runsupvl ", blen); + if (err & INFINIPATH_E_RUNEXPCHAR) + strlcat(buf, "runexpchar ", blen); + if (err & INFINIPATH_E_RIBFLOW) + strlcat(buf, "ribflow ", blen); + if (err & INFINIPATH_E_SUNDERRUN) + strlcat(buf, "sunderrun ", blen); + if (err & INFINIPATH_E_SPIOARMLAUNCH) + strlcat(buf, "spioarmlaunch ", blen); + if (err & INFINIPATH_E_SUNEXPERRPKTNUM) + strlcat(buf, "sunexperrpktnum ", blen); + if (err & INFINIPATH_E_SDROPPEDSMPPKT) + strlcat(buf, "sdroppedsmppkt ", blen); + if (err & INFINIPATH_E_SMAXPKTLEN) + strlcat(buf, "smaxpktlen ", blen); + if (err & INFINIPATH_E_SUNSUPVL) + strlcat(buf, "sunsupVL ", blen); + if (err & INFINIPATH_E_INVALIDADDR) + strlcat(buf, "invalidaddr ", blen); + if (err & INFINIPATH_E_RRCVEGRFULL) + strlcat(buf, "rcvegrfull ", blen); + if (err & INFINIPATH_E_RRCVHDRFULL) + strlcat(buf, "rcvhdrfull ", blen); + if (err & INFINIPATH_E_IBSTATUSCHANGED) + strlcat(buf, "ibcstatuschg ", blen); + if (err & INFINIPATH_E_RIBLOSTLINK) + strlcat(buf, "riblostlink ", blen); + if (err & INFINIPATH_E_HARDWARE) + strlcat(buf, "hardware ", blen); + if (err & INFINIPATH_E_RESET) + strlcat(buf, "reset ", blen); + if (err & INFINIPATH_E_SDMAERRS) + decode_sdma_errs(dd, err, buf, blen); + if (err & INFINIPATH_E_INVALIDEEPCMD) + strlcat(buf, "invalideepromcmd ", blen); +done: + return iserr; +} + +/** + * get_rhf_errstring - decode RHF errors + * @err: the err number + * @msg: the output buffer + * @len: the length of the output buffer + * + * only used one place now, may want more later + */ +static void get_rhf_errstring(u32 err, char *msg, size_t len) +{ + /* if no errors, and so don't need to check what's first */ + *msg = '\0'; + + if (err & INFINIPATH_RHF_H_ICRCERR) + strlcat(msg, "icrcerr ", len); + if (err & INFINIPATH_RHF_H_VCRCERR) + strlcat(msg, "vcrcerr ", len); + if (err & INFINIPATH_RHF_H_PARITYERR) + strlcat(msg, "parityerr ", len); + if (err & INFINIPATH_RHF_H_LENERR) + strlcat(msg, "lenerr ", len); + if (err & INFINIPATH_RHF_H_MTUERR) + strlcat(msg, "mtuerr ", len); + if (err & INFINIPATH_RHF_H_IHDRERR) + /* infinipath hdr checksum error */ + strlcat(msg, "ipathhdrerr ", len); + if (err & INFINIPATH_RHF_H_TIDERR) + strlcat(msg, "tiderr ", len); + if (err & INFINIPATH_RHF_H_MKERR) + /* bad port, offset, etc. */ + strlcat(msg, "invalid ipathhdr ", len); + if (err & INFINIPATH_RHF_H_IBERR) + strlcat(msg, "iberr ", len); + if (err & INFINIPATH_RHF_L_SWA) + strlcat(msg, "swA ", len); + if (err & INFINIPATH_RHF_L_SWB) + strlcat(msg, "swB ", len); +} + +/** + * ipath_get_egrbuf - get an eager buffer + * @dd: the infinipath device + * @bufnum: the eager buffer to get + * + * must only be called if ipath_pd[port] is known to be allocated + */ +static inline void *ipath_get_egrbuf(struct ipath_devdata *dd, u32 bufnum) +{ + return dd->ipath_port0_skbinfo ? + (void *) dd->ipath_port0_skbinfo[bufnum].skb->data : NULL; +} + +/** + * ipath_alloc_skb - allocate an skb and buffer with possible constraints + * @dd: the infinipath device + * @gfp_mask: the sk_buff SFP mask + */ +struct sk_buff *ipath_alloc_skb(struct ipath_devdata *dd, + gfp_t gfp_mask) +{ + struct sk_buff *skb; + u32 len; + + /* + * Only fully supported way to handle this is to allocate lots + * extra, align as needed, and then do skb_reserve(). That wastes + * a lot of memory... I'll have to hack this into infinipath_copy + * also. + */ + + /* + * We need 2 extra bytes for ipath_ether data sent in the + * key header. In order to keep everything dword aligned, + * we'll reserve 4 bytes. + */ + len = dd->ipath_ibmaxlen + 4; + + if (dd->ipath_flags & IPATH_4BYTE_TID) { + /* We need a 2KB multiple alignment, and there is no way + * to do it except to allocate extra and then skb_reserve + * enough to bring it up to the right alignment. + */ + len += 2047; + } + + skb = __dev_alloc_skb(len, gfp_mask); + if (!skb) { + ipath_dev_err(dd, "Failed to allocate skbuff, length %u\n", + len); + goto bail; + } + + skb_reserve(skb, 4); + + if (dd->ipath_flags & IPATH_4BYTE_TID) { + u32 una = (unsigned long)skb->data & 2047; + if (una) + skb_reserve(skb, 2048 - una); + } + +bail: + return skb; +} + +static void ipath_rcv_hdrerr(struct ipath_devdata *dd, + u32 eflags, + u32 l, + u32 etail, + __le32 *rhf_addr, + struct ipath_message_header *hdr) +{ + char emsg[128]; + + get_rhf_errstring(eflags, emsg, sizeof emsg); + ipath_cdbg(PKT, "RHFerrs %x hdrqtail=%x typ=%u " + "tlen=%x opcode=%x egridx=%x: %s\n", + eflags, l, + ipath_hdrget_rcv_type(rhf_addr), + ipath_hdrget_length_in_bytes(rhf_addr), + be32_to_cpu(hdr->bth[0]) >> 24, + etail, emsg); + + /* Count local link integrity errors. */ + if (eflags & (INFINIPATH_RHF_H_ICRCERR | INFINIPATH_RHF_H_VCRCERR)) { + u8 n = (dd->ipath_ibcctrl >> + INFINIPATH_IBCC_PHYERRTHRESHOLD_SHIFT) & + INFINIPATH_IBCC_PHYERRTHRESHOLD_MASK; + + if (++dd->ipath_lli_counter > n) { + dd->ipath_lli_counter = 0; + dd->ipath_lli_errors++; + } + } +} + +/* + * ipath_kreceive - receive a packet + * @pd: the infinipath port + * + * called from interrupt handler for errors or receive interrupt + */ +void ipath_kreceive(struct ipath_portdata *pd) +{ + struct ipath_devdata *dd = pd->port_dd; + __le32 *rhf_addr; + void *ebuf; + const u32 rsize = dd->ipath_rcvhdrentsize; /* words */ + const u32 maxcnt = dd->ipath_rcvhdrcnt * rsize; /* words */ + u32 etail = -1, l, hdrqtail; + struct ipath_message_header *hdr; + u32 eflags, i, etype, tlen, pkttot = 0, updegr = 0, reloop = 0; + static u64 totcalls; /* stats, may eventually remove */ + int last; + + l = pd->port_head; + rhf_addr = (__le32 *) pd->port_rcvhdrq + l + dd->ipath_rhf_offset; + if (dd->ipath_flags & IPATH_NODMA_RTAIL) { + u32 seq = ipath_hdrget_seq(rhf_addr); + + if (seq != pd->port_seq_cnt) + goto bail; + hdrqtail = 0; + } else { + hdrqtail = ipath_get_rcvhdrtail(pd); + if (l == hdrqtail) + goto bail; + smp_rmb(); + } + +reloop: + for (last = 0, i = 1; !last; i += !last) { + hdr = dd->ipath_f_get_msgheader(dd, rhf_addr); + eflags = ipath_hdrget_err_flags(rhf_addr); + etype = ipath_hdrget_rcv_type(rhf_addr); + /* total length */ + tlen = ipath_hdrget_length_in_bytes(rhf_addr); + ebuf = NULL; + if ((dd->ipath_flags & IPATH_NODMA_RTAIL) ? + ipath_hdrget_use_egr_buf(rhf_addr) : + (etype != RCVHQ_RCV_TYPE_EXPECTED)) { + /* + * It turns out that the chip uses an eager buffer + * for all non-expected packets, whether it "needs" + * one or not. So always get the index, but don't + * set ebuf (so we try to copy data) unless the + * length requires it. + */ + etail = ipath_hdrget_index(rhf_addr); + updegr = 1; + if (tlen > sizeof(*hdr) || + etype == RCVHQ_RCV_TYPE_NON_KD) + ebuf = ipath_get_egrbuf(dd, etail); + } + + /* + * both tiderr and ipathhdrerr are set for all plain IB + * packets; only ipathhdrerr should be set. + */ + + if (etype != RCVHQ_RCV_TYPE_NON_KD && + etype != RCVHQ_RCV_TYPE_ERROR && + ipath_hdrget_ipath_ver(hdr->iph.ver_port_tid_offset) != + IPS_PROTO_VERSION) + ipath_cdbg(PKT, "Bad InfiniPath protocol version " + "%x\n", etype); + + if (unlikely(eflags)) + ipath_rcv_hdrerr(dd, eflags, l, etail, rhf_addr, hdr); + else if (etype == RCVHQ_RCV_TYPE_NON_KD) { + ipath_ib_rcv(dd->verbs_dev, (u32 *)hdr, ebuf, tlen); + if (dd->ipath_lli_counter) + dd->ipath_lli_counter--; + } else if (etype == RCVHQ_RCV_TYPE_EAGER) { + u8 opcode = be32_to_cpu(hdr->bth[0]) >> 24; + u32 qp = be32_to_cpu(hdr->bth[1]) & 0xffffff; + ipath_cdbg(PKT, "typ %x, opcode %x (eager, " + "qp=%x), len %x; ignored\n", + etype, opcode, qp, tlen); + } + else if (etype == RCVHQ_RCV_TYPE_EXPECTED) + ipath_dbg("Bug: Expected TID, opcode %x; ignored\n", + be32_to_cpu(hdr->bth[0]) >> 24); + else { + /* + * error packet, type of error unknown. + * Probably type 3, but we don't know, so don't + * even try to print the opcode, etc. + * Usually caused by a "bad packet", that has no + * BTH, when the LRH says it should. + */ + ipath_cdbg(ERRPKT, "Error Pkt, but no eflags! egrbuf" + " %x, len %x hdrq+%x rhf: %Lx\n", + etail, tlen, l, (unsigned long long) + le64_to_cpu(*(__le64 *) rhf_addr)); + if (ipath_debug & __IPATH_ERRPKTDBG) { + u32 j, *d, dw = rsize-2; + if (rsize > (tlen>>2)) + dw = tlen>>2; + d = (u32 *)hdr; + printk(KERN_DEBUG "EPkt rcvhdr(%x dw):\n", + dw); + for (j = 0; j < dw; j++) + printk(KERN_DEBUG "%8x%s", d[j], + (j%8) == 7 ? "\n" : " "); + printk(KERN_DEBUG ".\n"); + } + } + l += rsize; + if (l >= maxcnt) + l = 0; + rhf_addr = (__le32 *) pd->port_rcvhdrq + + l + dd->ipath_rhf_offset; + if (dd->ipath_flags & IPATH_NODMA_RTAIL) { + u32 seq = ipath_hdrget_seq(rhf_addr); + + if (++pd->port_seq_cnt > 13) + pd->port_seq_cnt = 1; + if (seq != pd->port_seq_cnt) + last = 1; + } else if (l == hdrqtail) + last = 1; + /* + * update head regs on last packet, and every 16 packets. + * Reduce bus traffic, while still trying to prevent + * rcvhdrq overflows, for when the queue is nearly full + */ + if (last || !(i & 0xf)) { + u64 lval = l; + + /* request IBA6120 and 7220 interrupt only on last */ + if (last) + lval |= dd->ipath_rhdrhead_intr_off; + ipath_write_ureg(dd, ur_rcvhdrhead, lval, + pd->port_port); + if (updegr) { + ipath_write_ureg(dd, ur_rcvegrindexhead, + etail, pd->port_port); + updegr = 0; + } + } + } + + if (!dd->ipath_rhdrhead_intr_off && !reloop && + !(dd->ipath_flags & IPATH_NODMA_RTAIL)) { + /* IBA6110 workaround; we can have a race clearing chip + * interrupt with another interrupt about to be delivered, + * and can clear it before it is delivered on the GPIO + * workaround. By doing the extra check here for the + * in-memory tail register updating while we were doing + * earlier packets, we "almost" guarantee we have covered + * that case. + */ + u32 hqtail = ipath_get_rcvhdrtail(pd); + if (hqtail != hdrqtail) { + hdrqtail = hqtail; + reloop = 1; /* loop 1 extra time at most */ + goto reloop; + } + } + + pkttot += i; + + pd->port_head = l; + + if (pkttot > ipath_stats.sps_maxpkts_call) + ipath_stats.sps_maxpkts_call = pkttot; + ipath_stats.sps_port0pkts += pkttot; + ipath_stats.sps_avgpkts_call = + ipath_stats.sps_port0pkts / ++totcalls; + +bail:; +} + +/** + * ipath_update_pio_bufs - update shadow copy of the PIO availability map + * @dd: the infinipath device + * + * called whenever our local copy indicates we have run out of send buffers + * NOTE: This can be called from interrupt context by some code + * and from non-interrupt context by ipath_getpiobuf(). + */ + +static void ipath_update_pio_bufs(struct ipath_devdata *dd) +{ + unsigned long flags; + int i; + const unsigned piobregs = (unsigned)dd->ipath_pioavregs; + + /* If the generation (check) bits have changed, then we update the + * busy bit for the corresponding PIO buffer. This algorithm will + * modify positions to the value they already have in some cases + * (i.e., no change), but it's faster than changing only the bits + * that have changed. + * + * We would like to do this atomicly, to avoid spinlocks in the + * critical send path, but that's not really possible, given the + * type of changes, and that this routine could be called on + * multiple cpu's simultaneously, so we lock in this routine only, + * to avoid conflicting updates; all we change is the shadow, and + * it's a single 64 bit memory location, so by definition the update + * is atomic in terms of what other cpu's can see in testing the + * bits. The spin_lock overhead isn't too bad, since it only + * happens when all buffers are in use, so only cpu overhead, not + * latency or bandwidth is affected. + */ + if (!dd->ipath_pioavailregs_dma) { + ipath_dbg("Update shadow pioavail, but regs_dma NULL!\n"); + return; + } + if (ipath_debug & __IPATH_VERBDBG) { + /* only if packet debug and verbose */ + volatile __le64 *dma = dd->ipath_pioavailregs_dma; + unsigned long *shadow = dd->ipath_pioavailshadow; + + ipath_cdbg(PKT, "Refill avail, dma0=%llx shad0=%lx, " + "d1=%llx s1=%lx, d2=%llx s2=%lx, d3=%llx " + "s3=%lx\n", + (unsigned long long) le64_to_cpu(dma[0]), + shadow[0], + (unsigned long long) le64_to_cpu(dma[1]), + shadow[1], + (unsigned long long) le64_to_cpu(dma[2]), + shadow[2], + (unsigned long long) le64_to_cpu(dma[3]), + shadow[3]); + if (piobregs > 4) + ipath_cdbg( + PKT, "2nd group, dma4=%llx shad4=%lx, " + "d5=%llx s5=%lx, d6=%llx s6=%lx, " + "d7=%llx s7=%lx\n", + (unsigned long long) le64_to_cpu(dma[4]), + shadow[4], + (unsigned long long) le64_to_cpu(dma[5]), + shadow[5], + (unsigned long long) le64_to_cpu(dma[6]), + shadow[6], + (unsigned long long) le64_to_cpu(dma[7]), + shadow[7]); + } + spin_lock_irqsave(&ipath_pioavail_lock, flags); + for (i = 0; i < piobregs; i++) { + u64 pchbusy, pchg, piov, pnew; + /* + * Chip Errata: bug 6641; even and odd qwords>3 are swapped + */ + if (i > 3 && (dd->ipath_flags & IPATH_SWAP_PIOBUFS)) + piov = le64_to_cpu(dd->ipath_pioavailregs_dma[i ^ 1]); + else + piov = le64_to_cpu(dd->ipath_pioavailregs_dma[i]); + pchg = dd->ipath_pioavailkernel[i] & + ~(dd->ipath_pioavailshadow[i] ^ piov); + pchbusy = pchg << INFINIPATH_SENDPIOAVAIL_BUSY_SHIFT; + if (pchg && (pchbusy & dd->ipath_pioavailshadow[i])) { + pnew = dd->ipath_pioavailshadow[i] & ~pchbusy; + pnew |= piov & pchbusy; + dd->ipath_pioavailshadow[i] = pnew; + } + } + spin_unlock_irqrestore(&ipath_pioavail_lock, flags); +} + +/* + * used to force update of pioavailshadow if we can't get a pio buffer. + * Needed primarily due to exitting freeze mode after recovering + * from errors. Done lazily, because it's safer (known to not + * be writing pio buffers). + */ +static void ipath_reset_availshadow(struct ipath_devdata *dd) +{ + int i, im; + unsigned long flags; + + spin_lock_irqsave(&ipath_pioavail_lock, flags); + for (i = 0; i < dd->ipath_pioavregs; i++) { + u64 val, oldval; + /* deal with 6110 chip bug on high register #s */ + im = (i > 3 && (dd->ipath_flags & IPATH_SWAP_PIOBUFS)) ? + i ^ 1 : i; + val = le64_to_cpu(dd->ipath_pioavailregs_dma[im]); + /* + * busy out the buffers not in the kernel avail list, + * without changing the generation bits. + */ + oldval = dd->ipath_pioavailshadow[i]; + dd->ipath_pioavailshadow[i] = val | + ((~dd->ipath_pioavailkernel[i] << + INFINIPATH_SENDPIOAVAIL_BUSY_SHIFT) & + 0xaaaaaaaaaaaaaaaaULL); /* All BUSY bits in qword */ + if (oldval != dd->ipath_pioavailshadow[i]) + ipath_dbg("shadow[%d] was %Lx, now %lx\n", + i, (unsigned long long) oldval, + dd->ipath_pioavailshadow[i]); + } + spin_unlock_irqrestore(&ipath_pioavail_lock, flags); +} + +/** + * ipath_setrcvhdrsize - set the receive header size + * @dd: the infinipath device + * @rhdrsize: the receive header size + * + * called from user init code, and also layered driver init + */ +int ipath_setrcvhdrsize(struct ipath_devdata *dd, unsigned rhdrsize) +{ + int ret = 0; + + if (dd->ipath_flags & IPATH_RCVHDRSZ_SET) { + if (dd->ipath_rcvhdrsize != rhdrsize) { + dev_info(&dd->pcidev->dev, + "Error: can't set protocol header " + "size %u, already %u\n", + rhdrsize, dd->ipath_rcvhdrsize); + ret = -EAGAIN; + } else + ipath_cdbg(VERBOSE, "Reuse same protocol header " + "size %u\n", dd->ipath_rcvhdrsize); + } else if (rhdrsize > (dd->ipath_rcvhdrentsize - + (sizeof(u64) / sizeof(u32)))) { + ipath_dbg("Error: can't set protocol header size %u " + "(> max %u)\n", rhdrsize, + dd->ipath_rcvhdrentsize - + (u32) (sizeof(u64) / sizeof(u32))); + ret = -EOVERFLOW; + } else { + dd->ipath_flags |= IPATH_RCVHDRSZ_SET; + dd->ipath_rcvhdrsize = rhdrsize; + ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvhdrsize, + dd->ipath_rcvhdrsize); + ipath_cdbg(VERBOSE, "Set protocol header size to %u\n", + dd->ipath_rcvhdrsize); + } + return ret; +} + +/* + * debugging code and stats updates if no pio buffers available. + */ +static noinline void no_pio_bufs(struct ipath_devdata *dd) +{ + unsigned long *shadow = dd->ipath_pioavailshadow; + __le64 *dma = (__le64 *)dd->ipath_pioavailregs_dma; + + dd->ipath_upd_pio_shadow = 1; + + /* + * not atomic, but if we lose a stat count in a while, that's OK + */ + ipath_stats.sps_nopiobufs++; + if (!(++dd->ipath_consec_nopiobuf % 100000)) { + ipath_force_pio_avail_update(dd); /* at start */ + ipath_dbg("%u tries no piobufavail ts%lx; dmacopy: " + "%llx %llx %llx %llx\n" + "ipath shadow: %lx %lx %lx %lx\n", + dd->ipath_consec_nopiobuf, + (unsigned long)get_cycles(), + (unsigned long long) le64_to_cpu(dma[0]), + (unsigned long long) le64_to_cpu(dma[1]), + (unsigned long long) le64_to_cpu(dma[2]), + (unsigned long long) le64_to_cpu(dma[3]), + shadow[0], shadow[1], shadow[2], shadow[3]); + /* + * 4 buffers per byte, 4 registers above, cover rest + * below + */ + if ((dd->ipath_piobcnt2k + dd->ipath_piobcnt4k) > + (sizeof(shadow[0]) * 4 * 4)) + ipath_dbg("2nd group: dmacopy: " + "%llx %llx %llx %llx\n" + "ipath shadow: %lx %lx %lx %lx\n", + (unsigned long long)le64_to_cpu(dma[4]), + (unsigned long long)le64_to_cpu(dma[5]), + (unsigned long long)le64_to_cpu(dma[6]), + (unsigned long long)le64_to_cpu(dma[7]), + shadow[4], shadow[5], shadow[6], shadow[7]); + + /* at end, so update likely happened */ + ipath_reset_availshadow(dd); + } +} + +/* + * common code for normal driver pio buffer allocation, and reserved + * allocation. + * + * do appropriate marking as busy, etc. + * returns buffer number if one found (>=0), negative number is error. + */ +static u32 __iomem *ipath_getpiobuf_range(struct ipath_devdata *dd, + u32 *pbufnum, u32 first, u32 last, u32 firsti) +{ + int i, j, updated = 0; + unsigned piobcnt; + unsigned long flags; + unsigned long *shadow = dd->ipath_pioavailshadow; + u32 __iomem *buf; + + piobcnt = last - first; + if (dd->ipath_upd_pio_shadow) { + /* + * Minor optimization. If we had no buffers on last call, + * start out by doing the update; continue and do scan even + * if no buffers were updated, to be paranoid + */ + ipath_update_pio_bufs(dd); + updated++; + i = first; + } else + i = firsti; +rescan: + /* + * while test_and_set_bit() is atomic, we do that and then the + * change_bit(), and the pair is not. See if this is the cause + * of the remaining armlaunch errors. + */ + spin_lock_irqsave(&ipath_pioavail_lock, flags); + for (j = 0; j < piobcnt; j++, i++) { + if (i >= last) + i = first; + if (__test_and_set_bit((2 * i) + 1, shadow)) + continue; + /* flip generation bit */ + __change_bit(2 * i, shadow); + break; + } + spin_unlock_irqrestore(&ipath_pioavail_lock, flags); + + if (j == piobcnt) { + if (!updated) { + /* + * first time through; shadow exhausted, but may be + * buffers available, try an update and then rescan. + */ + ipath_update_pio_bufs(dd); + updated++; + i = first; + goto rescan; + } else if (updated == 1 && piobcnt <= + ((dd->ipath_sendctrl + >> INFINIPATH_S_UPDTHRESH_SHIFT) & + INFINIPATH_S_UPDTHRESH_MASK)) { + /* + * for chips supporting and using the update + * threshold we need to force an update of the + * in-memory copy if the count is less than the + * thershold, then check one more time. + */ + ipath_force_pio_avail_update(dd); + ipath_update_pio_bufs(dd); + updated++; + i = first; + goto rescan; + } + + no_pio_bufs(dd); + buf = NULL; + } else { + if (i < dd->ipath_piobcnt2k) + buf = (u32 __iomem *) (dd->ipath_pio2kbase + + i * dd->ipath_palign); + else + buf = (u32 __iomem *) + (dd->ipath_pio4kbase + + (i - dd->ipath_piobcnt2k) * dd->ipath_4kalign); + if (pbufnum) + *pbufnum = i; + } + + return buf; +} + +/** + * ipath_getpiobuf - find an available pio buffer + * @dd: the infinipath device + * @plen: the size of the PIO buffer needed in 32-bit words + * @pbufnum: the buffer number is placed here + */ +u32 __iomem *ipath_getpiobuf(struct ipath_devdata *dd, u32 plen, u32 *pbufnum) +{ + u32 __iomem *buf; + u32 pnum, nbufs; + u32 first, lasti; + + if (plen + 1 >= IPATH_SMALLBUF_DWORDS) { + first = dd->ipath_piobcnt2k; + lasti = dd->ipath_lastpioindexl; + } else { + first = 0; + lasti = dd->ipath_lastpioindex; + } + nbufs = dd->ipath_piobcnt2k + dd->ipath_piobcnt4k; + buf = ipath_getpiobuf_range(dd, &pnum, first, nbufs, lasti); + + if (buf) { + /* + * Set next starting place. It's just an optimization, + * it doesn't matter who wins on this, so no locking + */ + if (plen + 1 >= IPATH_SMALLBUF_DWORDS) + dd->ipath_lastpioindexl = pnum + 1; + else + dd->ipath_lastpioindex = pnum + 1; + if (dd->ipath_upd_pio_shadow) + dd->ipath_upd_pio_shadow = 0; + if (dd->ipath_consec_nopiobuf) + dd->ipath_consec_nopiobuf = 0; + ipath_cdbg(VERBOSE, "Return piobuf%u %uk @ %p\n", + pnum, (pnum < dd->ipath_piobcnt2k) ? 2 : 4, buf); + if (pbufnum) + *pbufnum = pnum; + + } + return buf; +} + +/** + * ipath_chg_pioavailkernel - change which send buffers are available for kernel + * @dd: the infinipath device + * @start: the starting send buffer number + * @len: the number of send buffers + * @avail: true if the buffers are available for kernel use, false otherwise + */ +void ipath_chg_pioavailkernel(struct ipath_devdata *dd, unsigned start, + unsigned len, int avail) +{ + unsigned long flags; + unsigned end, cnt = 0; + + /* There are two bits per send buffer (busy and generation) */ + start *= 2; + end = start + len * 2; + + spin_lock_irqsave(&ipath_pioavail_lock, flags); + /* Set or clear the busy bit in the shadow. */ + while (start < end) { + if (avail) { + unsigned long dma; + int i, im; + /* + * the BUSY bit will never be set, because we disarm + * the user buffers before we hand them back to the + * kernel. We do have to make sure the generation + * bit is set correctly in shadow, since it could + * have changed many times while allocated to user. + * We can't use the bitmap functions on the full + * dma array because it is always little-endian, so + * we have to flip to host-order first. + * BITS_PER_LONG is slightly wrong, since it's + * always 64 bits per register in chip... + * We only work on 64 bit kernels, so that's OK. + */ + /* deal with 6110 chip bug on high register #s */ + i = start / BITS_PER_LONG; + im = (i > 3 && (dd->ipath_flags & IPATH_SWAP_PIOBUFS)) ? + i ^ 1 : i; + __clear_bit(INFINIPATH_SENDPIOAVAIL_BUSY_SHIFT + + start, dd->ipath_pioavailshadow); + dma = (unsigned long) le64_to_cpu( + dd->ipath_pioavailregs_dma[im]); + if (test_bit((INFINIPATH_SENDPIOAVAIL_CHECK_SHIFT + + start) % BITS_PER_LONG, &dma)) + __set_bit(INFINIPATH_SENDPIOAVAIL_CHECK_SHIFT + + start, dd->ipath_pioavailshadow); + else + __clear_bit(INFINIPATH_SENDPIOAVAIL_CHECK_SHIFT + + start, dd->ipath_pioavailshadow); + __set_bit(start, dd->ipath_pioavailkernel); + } else { + __set_bit(start + INFINIPATH_SENDPIOAVAIL_BUSY_SHIFT, + dd->ipath_pioavailshadow); + __clear_bit(start, dd->ipath_pioavailkernel); + } + start += 2; + } + + if (dd->ipath_pioupd_thresh) { + end = 2 * (dd->ipath_piobcnt2k + dd->ipath_piobcnt4k); + cnt = bitmap_weight(dd->ipath_pioavailkernel, end); + } + spin_unlock_irqrestore(&ipath_pioavail_lock, flags); + + /* + * When moving buffers from kernel to user, if number assigned to + * the user is less than the pio update threshold, and threshold + * is supported (cnt was computed > 0), drop the update threshold + * so we update at least once per allocated number of buffers. + * In any case, if the kernel buffers are less than the threshold, + * drop the threshold. We don't bother increasing it, having once + * decreased it, since it would typically just cycle back and forth. + * If we don't decrease below buffers in use, we can wait a long + * time for an update, until some other context uses PIO buffers. + */ + if (!avail && len < cnt) + cnt = len; + if (cnt < dd->ipath_pioupd_thresh) { + dd->ipath_pioupd_thresh = cnt; + ipath_dbg("Decreased pio update threshold to %u\n", + dd->ipath_pioupd_thresh); + spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags); + dd->ipath_sendctrl &= ~(INFINIPATH_S_UPDTHRESH_MASK + << INFINIPATH_S_UPDTHRESH_SHIFT); + dd->ipath_sendctrl |= dd->ipath_pioupd_thresh + << INFINIPATH_S_UPDTHRESH_SHIFT; + ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, + dd->ipath_sendctrl); + spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags); + } +} + +/** + * ipath_create_rcvhdrq - create a receive header queue + * @dd: the infinipath device + * @pd: the port data + * + * this must be contiguous memory (from an i/o perspective), and must be + * DMA'able (which means for some systems, it will go through an IOMMU, + * or be forced into a low address range). + */ +int ipath_create_rcvhdrq(struct ipath_devdata *dd, + struct ipath_portdata *pd) +{ + int ret = 0; + + if (!pd->port_rcvhdrq) { + dma_addr_t phys_hdrqtail; + gfp_t gfp_flags = GFP_USER | __GFP_COMP; + int amt = ALIGN(dd->ipath_rcvhdrcnt * dd->ipath_rcvhdrentsize * + sizeof(u32), PAGE_SIZE); + + pd->port_rcvhdrq = dma_alloc_coherent( + &dd->pcidev->dev, amt, &pd->port_rcvhdrq_phys, + gfp_flags); + + if (!pd->port_rcvhdrq) { + ipath_dev_err(dd, "attempt to allocate %d bytes " + "for port %u rcvhdrq failed\n", + amt, pd->port_port); + ret = -ENOMEM; + goto bail; + } + + if (!(dd->ipath_flags & IPATH_NODMA_RTAIL)) { + pd->port_rcvhdrtail_kvaddr = dma_alloc_coherent( + &dd->pcidev->dev, PAGE_SIZE, &phys_hdrqtail, + GFP_KERNEL); + if (!pd->port_rcvhdrtail_kvaddr) { + ipath_dev_err(dd, "attempt to allocate 1 page " + "for port %u rcvhdrqtailaddr " + "failed\n", pd->port_port); + ret = -ENOMEM; + dma_free_coherent(&dd->pcidev->dev, amt, + pd->port_rcvhdrq, + pd->port_rcvhdrq_phys); + pd->port_rcvhdrq = NULL; + goto bail; + } + pd->port_rcvhdrqtailaddr_phys = phys_hdrqtail; + ipath_cdbg(VERBOSE, "port %d hdrtailaddr, %llx " + "physical\n", pd->port_port, + (unsigned long long) phys_hdrqtail); + } + + pd->port_rcvhdrq_size = amt; + + ipath_cdbg(VERBOSE, "%d pages at %p (phys %lx) size=%lu " + "for port %u rcvhdr Q\n", + amt >> PAGE_SHIFT, pd->port_rcvhdrq, + (unsigned long) pd->port_rcvhdrq_phys, + (unsigned long) pd->port_rcvhdrq_size, + pd->port_port); + } + else + ipath_cdbg(VERBOSE, "reuse port %d rcvhdrq @%p %llx phys; " + "hdrtailaddr@%p %llx physical\n", + pd->port_port, pd->port_rcvhdrq, + (unsigned long long) pd->port_rcvhdrq_phys, + pd->port_rcvhdrtail_kvaddr, (unsigned long long) + pd->port_rcvhdrqtailaddr_phys); + + /* clear for security and sanity on each use */ + memset(pd->port_rcvhdrq, 0, pd->port_rcvhdrq_size); + if (pd->port_rcvhdrtail_kvaddr) + memset(pd->port_rcvhdrtail_kvaddr, 0, PAGE_SIZE); + + /* + * tell chip each time we init it, even if we are re-using previous + * memory (we zero the register at process close) + */ + ipath_write_kreg_port(dd, dd->ipath_kregs->kr_rcvhdrtailaddr, + pd->port_port, pd->port_rcvhdrqtailaddr_phys); + ipath_write_kreg_port(dd, dd->ipath_kregs->kr_rcvhdraddr, + pd->port_port, pd->port_rcvhdrq_phys); + +bail: + return ret; +} + + +/* + * Flush all sends that might be in the ready to send state, as well as any + * that are in the process of being sent. Used whenever we need to be + * sure the send side is idle. Cleans up all buffer state by canceling + * all pio buffers, and issuing an abort, which cleans up anything in the + * launch fifo. The cancel is superfluous on some chip versions, but + * it's safer to always do it. + * PIOAvail bits are updated by the chip as if normal send had happened. + */ +void ipath_cancel_sends(struct ipath_devdata *dd, int restore_sendctrl) +{ + unsigned long flags; + + if (dd->ipath_flags & IPATH_IB_AUTONEG_INPROG) { + ipath_cdbg(VERBOSE, "Ignore while in autonegotiation\n"); + goto bail; + } + /* + * If we have SDMA, and it's not disabled, we have to kick off the + * abort state machine, provided we aren't already aborting. + * If we are in the process of aborting SDMA (!DISABLED, but ABORTING), + * we skip the rest of this routine. It is already "in progress" + */ + if (dd->ipath_flags & IPATH_HAS_SEND_DMA) { + int skip_cancel; + unsigned long *statp = &dd->ipath_sdma_status; + + spin_lock_irqsave(&dd->ipath_sdma_lock, flags); + skip_cancel = + test_and_set_bit(IPATH_SDMA_ABORTING, statp) + && !test_bit(IPATH_SDMA_DISABLED, statp); + spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags); + if (skip_cancel) + goto bail; + } + + ipath_dbg("Cancelling all in-progress send buffers\n"); + + /* skip armlaunch errs for a while */ + dd->ipath_lastcancel = jiffies + HZ / 2; + + /* + * The abort bit is auto-clearing. We also don't want pioavail + * update happening during this, and we don't want any other + * sends going out, so turn those off for the duration. We read + * the scratch register to be sure that cancels and the abort + * have taken effect in the chip. Otherwise two parts are same + * as ipath_force_pio_avail_update() + */ + spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags); + dd->ipath_sendctrl &= ~(INFINIPATH_S_PIOBUFAVAILUPD + | INFINIPATH_S_PIOENABLE); + ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, + dd->ipath_sendctrl | INFINIPATH_S_ABORT); + ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); + spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags); + + /* disarm all send buffers */ + ipath_disarm_piobufs(dd, 0, + dd->ipath_piobcnt2k + dd->ipath_piobcnt4k); + + if (dd->ipath_flags & IPATH_HAS_SEND_DMA) + set_bit(IPATH_SDMA_DISARMED, &dd->ipath_sdma_status); + + if (restore_sendctrl) { + /* else done by caller later if needed */ + spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags); + dd->ipath_sendctrl |= INFINIPATH_S_PIOBUFAVAILUPD | + INFINIPATH_S_PIOENABLE; + ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, + dd->ipath_sendctrl); + /* and again, be sure all have hit the chip */ + ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); + spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags); + } + + if ((dd->ipath_flags & IPATH_HAS_SEND_DMA) && + !test_bit(IPATH_SDMA_DISABLED, &dd->ipath_sdma_status) && + test_bit(IPATH_SDMA_RUNNING, &dd->ipath_sdma_status)) { + spin_lock_irqsave(&dd->ipath_sdma_lock, flags); + /* only wait so long for intr */ + dd->ipath_sdma_abort_intr_timeout = jiffies + HZ; + dd->ipath_sdma_reset_wait = 200; + if (!test_bit(IPATH_SDMA_SHUTDOWN, &dd->ipath_sdma_status)) + tasklet_hi_schedule(&dd->ipath_sdma_abort_task); + spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags); + } +bail:; +} + +/* + * Force an update of in-memory copy of the pioavail registers, when + * needed for any of a variety of reasons. We read the scratch register + * to make it highly likely that the update will have happened by the + * time we return. If already off (as in cancel_sends above), this + * routine is a nop, on the assumption that the caller will "do the + * right thing". + */ +void ipath_force_pio_avail_update(struct ipath_devdata *dd) +{ + unsigned long flags; + + spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags); + if (dd->ipath_sendctrl & INFINIPATH_S_PIOBUFAVAILUPD) { + ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, + dd->ipath_sendctrl & ~INFINIPATH_S_PIOBUFAVAILUPD); + ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); + ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, + dd->ipath_sendctrl); + ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); + } + spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags); +} + +static void ipath_set_ib_lstate(struct ipath_devdata *dd, int linkcmd, + int linitcmd) +{ + u64 mod_wd; + static const char *what[4] = { + [0] = "NOP", + [INFINIPATH_IBCC_LINKCMD_DOWN] = "DOWN", + [INFINIPATH_IBCC_LINKCMD_ARMED] = "ARMED", + [INFINIPATH_IBCC_LINKCMD_ACTIVE] = "ACTIVE" + }; + + if (linitcmd == INFINIPATH_IBCC_LINKINITCMD_DISABLE) { + /* + * If we are told to disable, note that so link-recovery + * code does not attempt to bring us back up. + */ + preempt_disable(); + dd->ipath_flags |= IPATH_IB_LINK_DISABLED; + preempt_enable(); + } else if (linitcmd) { + /* + * Any other linkinitcmd will lead to LINKDOWN and then + * to INIT (if all is well), so clear flag to let + * link-recovery code attempt to bring us back up. + */ + preempt_disable(); + dd->ipath_flags &= ~IPATH_IB_LINK_DISABLED; + preempt_enable(); + } + + mod_wd = (linkcmd << dd->ibcc_lc_shift) | + (linitcmd << INFINIPATH_IBCC_LINKINITCMD_SHIFT); + ipath_cdbg(VERBOSE, + "Moving unit %u to %s (initcmd=0x%x), current ltstate is %s\n", + dd->ipath_unit, what[linkcmd], linitcmd, + ipath_ibcstatus_str[ipath_ib_linktrstate(dd, + ipath_read_kreg64(dd, dd->ipath_kregs->kr_ibcstatus))]); + + ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl, + dd->ipath_ibcctrl | mod_wd); + /* read from chip so write is flushed */ + (void) ipath_read_kreg64(dd, dd->ipath_kregs->kr_ibcstatus); +} + +int ipath_set_linkstate(struct ipath_devdata *dd, u8 newstate) +{ + u32 lstate; + int ret; + + switch (newstate) { + case IPATH_IB_LINKDOWN_ONLY: + ipath_set_ib_lstate(dd, INFINIPATH_IBCC_LINKCMD_DOWN, 0); + /* don't wait */ + ret = 0; + goto bail; + + case IPATH_IB_LINKDOWN: + ipath_set_ib_lstate(dd, INFINIPATH_IBCC_LINKCMD_DOWN, + INFINIPATH_IBCC_LINKINITCMD_POLL); + /* don't wait */ + ret = 0; + goto bail; + + case IPATH_IB_LINKDOWN_SLEEP: + ipath_set_ib_lstate(dd, INFINIPATH_IBCC_LINKCMD_DOWN, + INFINIPATH_IBCC_LINKINITCMD_SLEEP); + /* don't wait */ + ret = 0; + goto bail; + + case IPATH_IB_LINKDOWN_DISABLE: + ipath_set_ib_lstate(dd, INFINIPATH_IBCC_LINKCMD_DOWN, + INFINIPATH_IBCC_LINKINITCMD_DISABLE); + /* don't wait */ + ret = 0; + goto bail; + + case IPATH_IB_LINKARM: + if (dd->ipath_flags & IPATH_LINKARMED) { + ret = 0; + goto bail; + } + if (!(dd->ipath_flags & + (IPATH_LINKINIT | IPATH_LINKACTIVE))) { + ret = -EINVAL; + goto bail; + } + ipath_set_ib_lstate(dd, INFINIPATH_IBCC_LINKCMD_ARMED, 0); + + /* + * Since the port can transition to ACTIVE by receiving + * a non VL 15 packet, wait for either state. + */ + lstate = IPATH_LINKARMED | IPATH_LINKACTIVE; + break; + + case IPATH_IB_LINKACTIVE: + if (dd->ipath_flags & IPATH_LINKACTIVE) { + ret = 0; + goto bail; + } + if (!(dd->ipath_flags & IPATH_LINKARMED)) { + ret = -EINVAL; + goto bail; + } + ipath_set_ib_lstate(dd, INFINIPATH_IBCC_LINKCMD_ACTIVE, 0); + lstate = IPATH_LINKACTIVE; + break; + + case IPATH_IB_LINK_LOOPBACK: + dev_info(&dd->pcidev->dev, "Enabling IB local loopback\n"); + dd->ipath_ibcctrl |= INFINIPATH_IBCC_LOOPBACK; + ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl, + dd->ipath_ibcctrl); + + /* turn heartbeat off, as it causes loopback to fail */ + dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_HRTBT, + IPATH_IB_HRTBT_OFF); + /* don't wait */ + ret = 0; + goto bail; + + case IPATH_IB_LINK_EXTERNAL: + dev_info(&dd->pcidev->dev, + "Disabling IB local loopback (normal)\n"); + dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_HRTBT, + IPATH_IB_HRTBT_ON); + dd->ipath_ibcctrl &= ~INFINIPATH_IBCC_LOOPBACK; + ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl, + dd->ipath_ibcctrl); + /* don't wait */ + ret = 0; + goto bail; + + /* + * Heartbeat can be explicitly enabled by the user via + * "hrtbt_enable" "file", and if disabled, trying to enable here + * will have no effect. Implicit changes (heartbeat off when + * loopback on, and vice versa) are included to ease testing. + */ + case IPATH_IB_LINK_HRTBT: + ret = dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_HRTBT, + IPATH_IB_HRTBT_ON); + goto bail; + + case IPATH_IB_LINK_NO_HRTBT: + ret = dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_HRTBT, + IPATH_IB_HRTBT_OFF); + goto bail; + + default: + ipath_dbg("Invalid linkstate 0x%x requested\n", newstate); + ret = -EINVAL; + goto bail; + } + ret = ipath_wait_linkstate(dd, lstate, 2000); + +bail: + return ret; +} + +/** + * ipath_set_mtu - set the MTU + * @dd: the infinipath device + * @arg: the new MTU + * + * we can handle "any" incoming size, the issue here is whether we + * need to restrict our outgoing size. For now, we don't do any + * sanity checking on this, and we don't deal with what happens to + * programs that are already running when the size changes. + * NOTE: changing the MTU will usually cause the IBC to go back to + * link INIT state... + */ +int ipath_set_mtu(struct ipath_devdata *dd, u16 arg) +{ + u32 piosize; + int changed = 0; + int ret; + + /* + * mtu is IB data payload max. It's the largest power of 2 less + * than piosize (or even larger, since it only really controls the + * largest we can receive; we can send the max of the mtu and + * piosize). We check that it's one of the valid IB sizes. + */ + if (arg != 256 && arg != 512 && arg != 1024 && arg != 2048 && + (arg != 4096 || !ipath_mtu4096)) { + ipath_dbg("Trying to set invalid mtu %u, failing\n", arg); + ret = -EINVAL; + goto bail; + } + if (dd->ipath_ibmtu == arg) { + ret = 0; /* same as current */ + goto bail; + } + + piosize = dd->ipath_ibmaxlen; + dd->ipath_ibmtu = arg; + + if (arg >= (piosize - IPATH_PIO_MAXIBHDR)) { + /* Only if it's not the initial value (or reset to it) */ + if (piosize != dd->ipath_init_ibmaxlen) { + if (arg > piosize && arg <= dd->ipath_init_ibmaxlen) + piosize = dd->ipath_init_ibmaxlen; + dd->ipath_ibmaxlen = piosize; + changed = 1; + } + } else if ((arg + IPATH_PIO_MAXIBHDR) != dd->ipath_ibmaxlen) { + piosize = arg + IPATH_PIO_MAXIBHDR; + ipath_cdbg(VERBOSE, "ibmaxlen was 0x%x, setting to 0x%x " + "(mtu 0x%x)\n", dd->ipath_ibmaxlen, piosize, + arg); + dd->ipath_ibmaxlen = piosize; + changed = 1; + } + + if (changed) { + u64 ibc = dd->ipath_ibcctrl, ibdw; + /* + * update our housekeeping variables, and set IBC max + * size, same as init code; max IBC is max we allow in + * buffer, less the qword pbc, plus 1 for ICRC, in dwords + */ + dd->ipath_ibmaxlen = piosize - 2 * sizeof(u32); + ibdw = (dd->ipath_ibmaxlen >> 2) + 1; + ibc &= ~(INFINIPATH_IBCC_MAXPKTLEN_MASK << + dd->ibcc_mpl_shift); + ibc |= ibdw << dd->ibcc_mpl_shift; + dd->ipath_ibcctrl = ibc; + ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl, + dd->ipath_ibcctrl); + dd->ipath_f_tidtemplate(dd); + } + + ret = 0; + +bail: + return ret; +} + +int ipath_set_lid(struct ipath_devdata *dd, u32 lid, u8 lmc) +{ + dd->ipath_lid = lid; + dd->ipath_lmc = lmc; + + dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_LIDLMC, lid | + (~((1U << lmc) - 1)) << 16); + + dev_info(&dd->pcidev->dev, "We got a lid: 0x%x\n", lid); + + return 0; +} + + +/** + * ipath_write_kreg_port - write a device's per-port 64-bit kernel register + * @dd: the infinipath device + * @regno: the register number to write + * @port: the port containing the register + * @value: the value to write + * + * Registers that vary with the chip implementation constants (port) + * use this routine. + */ +void ipath_write_kreg_port(const struct ipath_devdata *dd, ipath_kreg regno, + unsigned port, u64 value) +{ + u16 where; + + if (port < dd->ipath_portcnt && + (regno == dd->ipath_kregs->kr_rcvhdraddr || + regno == dd->ipath_kregs->kr_rcvhdrtailaddr)) + where = regno + port; + else + where = -1; + + ipath_write_kreg(dd, where, value); +} + +/* + * Following deal with the "obviously simple" task of overriding the state + * of the LEDS, which normally indicate link physical and logical status. + * The complications arise in dealing with different hardware mappings + * and the board-dependent routine being called from interrupts. + * and then there's the requirement to _flash_ them. + */ +#define LED_OVER_FREQ_SHIFT 8 +#define LED_OVER_FREQ_MASK (0xFF<ipath_flags & IPATH_INITTED)) + return; + + pidx = dd->ipath_led_override_phase++ & 1; + dd->ipath_led_override = dd->ipath_led_override_vals[pidx]; + timeoff = dd->ipath_led_override_timeoff; + + /* + * below potentially restores the LED values per current status, + * should also possibly setup the traffic-blink register, + * but leave that to per-chip functions. + */ + val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_ibcstatus); + ltstate = ipath_ib_linktrstate(dd, val); + lstate = ipath_ib_linkstate(dd, val); + + dd->ipath_f_setextled(dd, lstate, ltstate); + mod_timer(&dd->ipath_led_override_timer, jiffies + timeoff); +} + +void ipath_set_led_override(struct ipath_devdata *dd, unsigned int val) +{ + int timeoff, freq; + + if (!(dd->ipath_flags & IPATH_INITTED)) + return; + + /* First check if we are blinking. If not, use 1HZ polling */ + timeoff = HZ; + freq = (val & LED_OVER_FREQ_MASK) >> LED_OVER_FREQ_SHIFT; + + if (freq) { + /* For blink, set each phase from one nybble of val */ + dd->ipath_led_override_vals[0] = val & 0xF; + dd->ipath_led_override_vals[1] = (val >> 4) & 0xF; + timeoff = (HZ << 4)/freq; + } else { + /* Non-blink set both phases the same. */ + dd->ipath_led_override_vals[0] = val & 0xF; + dd->ipath_led_override_vals[1] = val & 0xF; + } + dd->ipath_led_override_timeoff = timeoff; + + /* + * If the timer has not already been started, do so. Use a "quick" + * timeout so the function will be called soon, to look at our request. + */ + if (atomic_inc_return(&dd->ipath_led_override_timer_active) == 1) { + /* Need to start timer */ + init_timer(&dd->ipath_led_override_timer); + dd->ipath_led_override_timer.function = + ipath_run_led_override; + dd->ipath_led_override_timer.data = (unsigned long) dd; + dd->ipath_led_override_timer.expires = jiffies + 1; + add_timer(&dd->ipath_led_override_timer); + } else + atomic_dec(&dd->ipath_led_override_timer_active); +} + +/** + * ipath_shutdown_device - shut down a device + * @dd: the infinipath device + * + * This is called to make the device quiet when we are about to + * unload the driver, and also when the device is administratively + * disabled. It does not free any data structures. + * Everything it does has to be setup again by ipath_init_chip(dd,1) + */ +void ipath_shutdown_device(struct ipath_devdata *dd) +{ + unsigned long flags; + + ipath_dbg("Shutting down the device\n"); + + ipath_hol_up(dd); /* make sure user processes aren't suspended */ + + dd->ipath_flags |= IPATH_LINKUNK; + dd->ipath_flags &= ~(IPATH_INITTED | IPATH_LINKDOWN | + IPATH_LINKINIT | IPATH_LINKARMED | + IPATH_LINKACTIVE); + *dd->ipath_statusp &= ~(IPATH_STATUS_IB_CONF | + IPATH_STATUS_IB_READY); + + /* mask interrupts, but not errors */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_intmask, 0ULL); + + dd->ipath_rcvctrl = 0; + ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl, + dd->ipath_rcvctrl); + + if (dd->ipath_flags & IPATH_HAS_SEND_DMA) + teardown_sdma(dd); + + /* + * gracefully stop all sends allowing any in progress to trickle out + * first. + */ + spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags); + dd->ipath_sendctrl = 0; + ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, dd->ipath_sendctrl); + /* flush it */ + ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); + spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags); + + /* + * enough for anything that's going to trickle out to have actually + * done so. + */ + udelay(5); + + dd->ipath_f_setextled(dd, 0, 0); /* make sure LEDs are off */ + + ipath_set_ib_lstate(dd, 0, INFINIPATH_IBCC_LINKINITCMD_DISABLE); + ipath_cancel_sends(dd, 0); + + /* + * we are shutting down, so tell components that care. We don't do + * this on just a link state change, much like ethernet, a cable + * unplug, etc. doesn't change driver state + */ + signal_ib_event(dd, IB_EVENT_PORT_ERR); + + /* disable IBC */ + dd->ipath_control &= ~INFINIPATH_C_LINKENABLE; + ipath_write_kreg(dd, dd->ipath_kregs->kr_control, + dd->ipath_control | INFINIPATH_C_FREEZEMODE); + + /* + * clear SerdesEnable and turn the leds off; do this here because + * we are unloading, so don't count on interrupts to move along + * Turn the LEDs off explicitly for the same reason. + */ + dd->ipath_f_quiet_serdes(dd); + + /* stop all the timers that might still be running */ + del_timer_sync(&dd->ipath_hol_timer); + if (dd->ipath_stats_timer_active) { + del_timer_sync(&dd->ipath_stats_timer); + dd->ipath_stats_timer_active = 0; + } + if (dd->ipath_intrchk_timer.data) { + del_timer_sync(&dd->ipath_intrchk_timer); + dd->ipath_intrchk_timer.data = 0; + } + if (atomic_read(&dd->ipath_led_override_timer_active)) { + del_timer_sync(&dd->ipath_led_override_timer); + atomic_set(&dd->ipath_led_override_timer_active, 0); + } + + /* + * clear all interrupts and errors, so that the next time the driver + * is loaded or device is enabled, we know that whatever is set + * happened while we were unloaded + */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrclear, + ~0ULL & ~INFINIPATH_HWE_MEMBISTFAILED); + ipath_write_kreg(dd, dd->ipath_kregs->kr_errorclear, -1LL); + ipath_write_kreg(dd, dd->ipath_kregs->kr_intclear, -1LL); + + ipath_cdbg(VERBOSE, "Flush time and errors to EEPROM\n"); + ipath_update_eeprom_log(dd); +} + +/** + * ipath_free_pddata - free a port's allocated data + * @dd: the infinipath device + * @pd: the portdata structure + * + * free up any allocated data for a port + * This should not touch anything that would affect a simultaneous + * re-allocation of port data, because it is called after ipath_mutex + * is released (and can be called from reinit as well). + * It should never change any chip state, or global driver state. + * (The only exception to global state is freeing the port0 port0_skbs.) + */ +void ipath_free_pddata(struct ipath_devdata *dd, struct ipath_portdata *pd) +{ + if (!pd) + return; + + if (pd->port_rcvhdrq) { + ipath_cdbg(VERBOSE, "free closed port %d rcvhdrq @ %p " + "(size=%lu)\n", pd->port_port, pd->port_rcvhdrq, + (unsigned long) pd->port_rcvhdrq_size); + dma_free_coherent(&dd->pcidev->dev, pd->port_rcvhdrq_size, + pd->port_rcvhdrq, pd->port_rcvhdrq_phys); + pd->port_rcvhdrq = NULL; + if (pd->port_rcvhdrtail_kvaddr) { + dma_free_coherent(&dd->pcidev->dev, PAGE_SIZE, + pd->port_rcvhdrtail_kvaddr, + pd->port_rcvhdrqtailaddr_phys); + pd->port_rcvhdrtail_kvaddr = NULL; + } + } + if (pd->port_port && pd->port_rcvegrbuf) { + unsigned e; + + for (e = 0; e < pd->port_rcvegrbuf_chunks; e++) { + void *base = pd->port_rcvegrbuf[e]; + size_t size = pd->port_rcvegrbuf_size; + + ipath_cdbg(VERBOSE, "egrbuf free(%p, %lu), " + "chunk %u/%u\n", base, + (unsigned long) size, + e, pd->port_rcvegrbuf_chunks); + dma_free_coherent(&dd->pcidev->dev, size, + base, pd->port_rcvegrbuf_phys[e]); + } + kfree(pd->port_rcvegrbuf); + pd->port_rcvegrbuf = NULL; + kfree(pd->port_rcvegrbuf_phys); + pd->port_rcvegrbuf_phys = NULL; + pd->port_rcvegrbuf_chunks = 0; + } else if (pd->port_port == 0 && dd->ipath_port0_skbinfo) { + unsigned e; + struct ipath_skbinfo *skbinfo = dd->ipath_port0_skbinfo; + + dd->ipath_port0_skbinfo = NULL; + ipath_cdbg(VERBOSE, "free closed port %d " + "ipath_port0_skbinfo @ %p\n", pd->port_port, + skbinfo); + for (e = 0; e < dd->ipath_p0_rcvegrcnt; e++) + if (skbinfo[e].skb) { + pci_unmap_single(dd->pcidev, skbinfo[e].phys, + dd->ipath_ibmaxlen, + PCI_DMA_FROMDEVICE); + dev_kfree_skb(skbinfo[e].skb); + } + vfree(skbinfo); + } + kfree(pd->port_tid_pg_list); + vfree(pd->subport_uregbase); + vfree(pd->subport_rcvegrbuf); + vfree(pd->subport_rcvhdr_base); + kfree(pd); +} + +static int __init infinipath_init(void) +{ + int ret; + + if (ipath_debug & __IPATH_DBG) + printk(KERN_INFO DRIVER_LOAD_MSG "%s", ib_ipath_version); + + /* + * These must be called before the driver is registered with + * the PCI subsystem. + */ + idr_init(&unit_table); + + ret = pci_register_driver(&ipath_driver); + if (ret < 0) { + printk(KERN_ERR IPATH_DRV_NAME + ": Unable to register driver: error %d\n", -ret); + goto bail_unit; + } + + ret = ipath_init_ipathfs(); + if (ret < 0) { + printk(KERN_ERR IPATH_DRV_NAME ": Unable to create " + "ipathfs: error %d\n", -ret); + goto bail_pci; + } + + goto bail; + +bail_pci: + pci_unregister_driver(&ipath_driver); + +bail_unit: + idr_destroy(&unit_table); + +bail: + return ret; +} + +static void __exit infinipath_cleanup(void) +{ + ipath_exit_ipathfs(); + + ipath_cdbg(VERBOSE, "Unregistering pci driver\n"); + pci_unregister_driver(&ipath_driver); + + idr_destroy(&unit_table); +} + +/** + * ipath_reset_device - reset the chip if possible + * @unit: the device to reset + * + * Whether or not reset is successful, we attempt to re-initialize the chip + * (that is, much like a driver unload/reload). We clear the INITTED flag + * so that the various entry points will fail until we reinitialize. For + * now, we only allow this if no user ports are open that use chip resources + */ +int ipath_reset_device(int unit) +{ + int ret, i; + struct ipath_devdata *dd = ipath_lookup(unit); + unsigned long flags; + + if (!dd) { + ret = -ENODEV; + goto bail; + } + + if (atomic_read(&dd->ipath_led_override_timer_active)) { + /* Need to stop LED timer, _then_ shut off LEDs */ + del_timer_sync(&dd->ipath_led_override_timer); + atomic_set(&dd->ipath_led_override_timer_active, 0); + } + + /* Shut off LEDs after we are sure timer is not running */ + dd->ipath_led_override = LED_OVER_BOTH_OFF; + dd->ipath_f_setextled(dd, 0, 0); + + dev_info(&dd->pcidev->dev, "Reset on unit %u requested\n", unit); + + if (!dd->ipath_kregbase || !(dd->ipath_flags & IPATH_PRESENT)) { + dev_info(&dd->pcidev->dev, "Invalid unit number %u or " + "not initialized or not present\n", unit); + ret = -ENXIO; + goto bail; + } + + spin_lock_irqsave(&dd->ipath_uctxt_lock, flags); + if (dd->ipath_pd) + for (i = 1; i < dd->ipath_cfgports; i++) { + if (!dd->ipath_pd[i] || !dd->ipath_pd[i]->port_cnt) + continue; + spin_unlock_irqrestore(&dd->ipath_uctxt_lock, flags); + ipath_dbg("unit %u port %d is in use " + "(PID %u cmd %s), can't reset\n", + unit, i, + pid_nr(dd->ipath_pd[i]->port_pid), + dd->ipath_pd[i]->port_comm); + ret = -EBUSY; + goto bail; + } + spin_unlock_irqrestore(&dd->ipath_uctxt_lock, flags); + + if (dd->ipath_flags & IPATH_HAS_SEND_DMA) + teardown_sdma(dd); + + dd->ipath_flags &= ~IPATH_INITTED; + ipath_write_kreg(dd, dd->ipath_kregs->kr_intmask, 0ULL); + ret = dd->ipath_f_reset(dd); + if (ret == 1) { + ipath_dbg("Reinitializing unit %u after reset attempt\n", + unit); + ret = ipath_init_chip(dd, 1); + } else + ret = -EAGAIN; + if (ret) + ipath_dev_err(dd, "Reinitialize unit %u after " + "reset failed with %d\n", unit, ret); + else + dev_info(&dd->pcidev->dev, "Reinitialized unit %u after " + "resetting\n", unit); + +bail: + return ret; +} + +/* + * send a signal to all the processes that have the driver open + * through the normal interfaces (i.e., everything other than diags + * interface). Returns number of signalled processes. + */ +static int ipath_signal_procs(struct ipath_devdata *dd, int sig) +{ + int i, sub, any = 0; + struct pid *pid; + unsigned long flags; + + if (!dd->ipath_pd) + return 0; + + spin_lock_irqsave(&dd->ipath_uctxt_lock, flags); + for (i = 1; i < dd->ipath_cfgports; i++) { + if (!dd->ipath_pd[i] || !dd->ipath_pd[i]->port_cnt) + continue; + pid = dd->ipath_pd[i]->port_pid; + if (!pid) + continue; + + dev_info(&dd->pcidev->dev, "context %d in use " + "(PID %u), sending signal %d\n", + i, pid_nr(pid), sig); + kill_pid(pid, sig, 1); + any++; + for (sub = 0; sub < INFINIPATH_MAX_SUBPORT; sub++) { + pid = dd->ipath_pd[i]->port_subpid[sub]; + if (!pid) + continue; + dev_info(&dd->pcidev->dev, "sub-context " + "%d:%d in use (PID %u), sending " + "signal %d\n", i, sub, pid_nr(pid), sig); + kill_pid(pid, sig, 1); + any++; + } + } + spin_unlock_irqrestore(&dd->ipath_uctxt_lock, flags); + return any; +} + +static void ipath_hol_signal_down(struct ipath_devdata *dd) +{ + if (ipath_signal_procs(dd, SIGSTOP)) + ipath_dbg("Stopped some processes\n"); + ipath_cancel_sends(dd, 1); +} + + +static void ipath_hol_signal_up(struct ipath_devdata *dd) +{ + if (ipath_signal_procs(dd, SIGCONT)) + ipath_dbg("Continued some processes\n"); +} + +/* + * link is down, stop any users processes, and flush pending sends + * to prevent HoL blocking, then start the HoL timer that + * periodically continues, then stop procs, so they can detect + * link down if they want, and do something about it. + * Timer may already be running, so use mod_timer, not add_timer. + */ +void ipath_hol_down(struct ipath_devdata *dd) +{ + dd->ipath_hol_state = IPATH_HOL_DOWN; + ipath_hol_signal_down(dd); + dd->ipath_hol_next = IPATH_HOL_DOWNCONT; + dd->ipath_hol_timer.expires = jiffies + + msecs_to_jiffies(ipath_hol_timeout_ms); + mod_timer(&dd->ipath_hol_timer, dd->ipath_hol_timer.expires); +} + +/* + * link is up, continue any user processes, and ensure timer + * is a nop, if running. Let timer keep running, if set; it + * will nop when it sees the link is up + */ +void ipath_hol_up(struct ipath_devdata *dd) +{ + ipath_hol_signal_up(dd); + dd->ipath_hol_state = IPATH_HOL_UP; +} + +/* + * toggle the running/not running state of user proceses + * to prevent HoL blocking on chip resources, but still allow + * user processes to do link down special case handling. + * Should only be called via the timer + */ +void ipath_hol_event(unsigned long opaque) +{ + struct ipath_devdata *dd = (struct ipath_devdata *)opaque; + + if (dd->ipath_hol_next == IPATH_HOL_DOWNSTOP + && dd->ipath_hol_state != IPATH_HOL_UP) { + dd->ipath_hol_next = IPATH_HOL_DOWNCONT; + ipath_dbg("Stopping processes\n"); + ipath_hol_signal_down(dd); + } else { /* may do "extra" if also in ipath_hol_up() */ + dd->ipath_hol_next = IPATH_HOL_DOWNSTOP; + ipath_dbg("Continuing processes\n"); + ipath_hol_signal_up(dd); + } + if (dd->ipath_hol_state == IPATH_HOL_UP) + ipath_dbg("link's up, don't resched timer\n"); + else { + dd->ipath_hol_timer.expires = jiffies + + msecs_to_jiffies(ipath_hol_timeout_ms); + mod_timer(&dd->ipath_hol_timer, + dd->ipath_hol_timer.expires); + } +} + +int ipath_set_rx_pol_inv(struct ipath_devdata *dd, u8 new_pol_inv) +{ + u64 val; + + if (new_pol_inv > INFINIPATH_XGXS_RX_POL_MASK) + return -1; + if (dd->ipath_rx_pol_inv != new_pol_inv) { + dd->ipath_rx_pol_inv = new_pol_inv; + val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_xgxsconfig); + val &= ~(INFINIPATH_XGXS_RX_POL_MASK << + INFINIPATH_XGXS_RX_POL_SHIFT); + val |= ((u64)dd->ipath_rx_pol_inv) << + INFINIPATH_XGXS_RX_POL_SHIFT; + ipath_write_kreg(dd, dd->ipath_kregs->kr_xgxsconfig, val); + } + return 0; +} + +/* + * Disable and enable the armlaunch error. Used for PIO bandwidth testing on + * the 7220, which is count-based, rather than trigger-based. Safe for the + * driver check, since it's at init. Not completely safe when used for + * user-mode checking, since some error checking can be lost, but not + * particularly risky, and only has problematic side-effects in the face of + * very buggy user code. There is no reference counting, but that's also + * fine, given the intended use. + */ +void ipath_enable_armlaunch(struct ipath_devdata *dd) +{ + dd->ipath_lasterror &= ~INFINIPATH_E_SPIOARMLAUNCH; + ipath_write_kreg(dd, dd->ipath_kregs->kr_errorclear, + INFINIPATH_E_SPIOARMLAUNCH); + dd->ipath_errormask |= INFINIPATH_E_SPIOARMLAUNCH; + ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask, + dd->ipath_errormask); +} + +void ipath_disable_armlaunch(struct ipath_devdata *dd) +{ + /* so don't re-enable if already set */ + dd->ipath_maskederrs &= ~INFINIPATH_E_SPIOARMLAUNCH; + dd->ipath_errormask &= ~INFINIPATH_E_SPIOARMLAUNCH; + ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask, + dd->ipath_errormask); +} + +module_init(infinipath_init); +module_exit(infinipath_cleanup); diff --git a/drivers/infiniband/hw/ipath/ipath_eeprom.c b/drivers/infiniband/hw/ipath/ipath_eeprom.c new file mode 100644 index 0000000..fc71819 --- /dev/null +++ b/drivers/infiniband/hw/ipath/ipath_eeprom.c @@ -0,0 +1,1183 @@ +/* + * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved. + * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include + +#include "ipath_kernel.h" + +/* + * InfiniPath I2C driver for a serial eeprom. This is not a generic + * I2C interface. For a start, the device we're using (Atmel AT24C11) + * doesn't work like a regular I2C device. It looks like one + * electrically, but not logically. Normal I2C devices have a single + * 7-bit or 10-bit I2C address that they respond to. Valid 7-bit + * addresses range from 0x03 to 0x77. Addresses 0x00 to 0x02 and 0x78 + * to 0x7F are special reserved addresses (e.g. 0x00 is the "general + * call" address.) The Atmel device, on the other hand, responds to ALL + * 7-bit addresses. It's designed to be the only device on a given I2C + * bus. A 7-bit address corresponds to the memory address within the + * Atmel device itself. + * + * Also, the timing requirements mean more than simple software + * bitbanging, with readbacks from chip to ensure timing (simple udelay + * is not enough). + * + * This all means that accessing the device is specialized enough + * that using the standard kernel I2C bitbanging interface would be + * impossible. For example, the core I2C eeprom driver expects to find + * a device at one or more of a limited set of addresses only. It doesn't + * allow writing to an eeprom. It also doesn't provide any means of + * accessing eeprom contents from within the kernel, only via sysfs. + */ + +/* Added functionality for IBA7220-based cards */ +#define IPATH_EEPROM_DEV_V1 0xA0 +#define IPATH_EEPROM_DEV_V2 0xA2 +#define IPATH_TEMP_DEV 0x98 +#define IPATH_BAD_DEV (IPATH_EEPROM_DEV_V2+2) +#define IPATH_NO_DEV (0xFF) + +/* + * The number of I2C chains is proliferating. Table below brings + * some order to the madness. The basic principle is that the + * table is scanned from the top, and a "probe" is made to the + * device probe_dev. If that succeeds, the chain is considered + * to be of that type, and dd->i2c_chain_type is set to the index+1 + * of the entry. + * The +1 is so static initialization can mean "unknown, do probe." + */ +static struct i2c_chain_desc { + u8 probe_dev; /* If seen at probe, chain is this type */ + u8 eeprom_dev; /* Dev addr (if any) for EEPROM */ + u8 temp_dev; /* Dev Addr (if any) for Temp-sense */ +} i2c_chains[] = { + { IPATH_BAD_DEV, IPATH_NO_DEV, IPATH_NO_DEV }, /* pre-iba7220 bds */ + { IPATH_EEPROM_DEV_V1, IPATH_EEPROM_DEV_V1, IPATH_TEMP_DEV}, /* V1 */ + { IPATH_EEPROM_DEV_V2, IPATH_EEPROM_DEV_V2, IPATH_TEMP_DEV}, /* V2 */ + { IPATH_NO_DEV } +}; + +enum i2c_type { + i2c_line_scl = 0, + i2c_line_sda +}; + +enum i2c_state { + i2c_line_low = 0, + i2c_line_high +}; + +#define READ_CMD 1 +#define WRITE_CMD 0 + +/** + * i2c_gpio_set - set a GPIO line + * @dd: the infinipath device + * @line: the line to set + * @new_line_state: the state to set + * + * Returns 0 if the line was set to the new state successfully, non-zero + * on error. + */ +static int i2c_gpio_set(struct ipath_devdata *dd, + enum i2c_type line, + enum i2c_state new_line_state) +{ + u64 out_mask, dir_mask, *gpioval; + unsigned long flags = 0; + + gpioval = &dd->ipath_gpio_out; + + if (line == i2c_line_scl) { + dir_mask = dd->ipath_gpio_scl; + out_mask = (1UL << dd->ipath_gpio_scl_num); + } else { + dir_mask = dd->ipath_gpio_sda; + out_mask = (1UL << dd->ipath_gpio_sda_num); + } + + spin_lock_irqsave(&dd->ipath_gpio_lock, flags); + if (new_line_state == i2c_line_high) { + /* tri-state the output rather than force high */ + dd->ipath_extctrl &= ~dir_mask; + } else { + /* config line to be an output */ + dd->ipath_extctrl |= dir_mask; + } + ipath_write_kreg(dd, dd->ipath_kregs->kr_extctrl, dd->ipath_extctrl); + + /* set output as well (no real verify) */ + if (new_line_state == i2c_line_high) + *gpioval |= out_mask; + else + *gpioval &= ~out_mask; + + ipath_write_kreg(dd, dd->ipath_kregs->kr_gpio_out, *gpioval); + spin_unlock_irqrestore(&dd->ipath_gpio_lock, flags); + + return 0; +} + +/** + * i2c_gpio_get - get a GPIO line state + * @dd: the infinipath device + * @line: the line to get + * @curr_statep: where to put the line state + * + * Returns 0 if the line was set to the new state successfully, non-zero + * on error. curr_state is not set on error. + */ +static int i2c_gpio_get(struct ipath_devdata *dd, + enum i2c_type line, + enum i2c_state *curr_statep) +{ + u64 read_val, mask; + int ret; + unsigned long flags = 0; + + /* check args */ + if (curr_statep == NULL) { + ret = 1; + goto bail; + } + + /* config line to be an input */ + if (line == i2c_line_scl) + mask = dd->ipath_gpio_scl; + else + mask = dd->ipath_gpio_sda; + + spin_lock_irqsave(&dd->ipath_gpio_lock, flags); + dd->ipath_extctrl &= ~mask; + ipath_write_kreg(dd, dd->ipath_kregs->kr_extctrl, dd->ipath_extctrl); + /* + * Below is very unlikely to reflect true input state if Output + * Enable actually changed. + */ + read_val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_extstatus); + spin_unlock_irqrestore(&dd->ipath_gpio_lock, flags); + + if (read_val & mask) + *curr_statep = i2c_line_high; + else + *curr_statep = i2c_line_low; + + ret = 0; + +bail: + return ret; +} + +/** + * i2c_wait_for_writes - wait for a write + * @dd: the infinipath device + * + * We use this instead of udelay directly, so we can make sure + * that previous register writes have been flushed all the way + * to the chip. Since we are delaying anyway, the cost doesn't + * hurt, and makes the bit twiddling more regular + */ +static void i2c_wait_for_writes(struct ipath_devdata *dd) +{ + (void)ipath_read_kreg32(dd, dd->ipath_kregs->kr_scratch); + rmb(); +} + +static void scl_out(struct ipath_devdata *dd, u8 bit) +{ + udelay(1); + i2c_gpio_set(dd, i2c_line_scl, bit ? i2c_line_high : i2c_line_low); + + i2c_wait_for_writes(dd); +} + +static void sda_out(struct ipath_devdata *dd, u8 bit) +{ + i2c_gpio_set(dd, i2c_line_sda, bit ? i2c_line_high : i2c_line_low); + + i2c_wait_for_writes(dd); +} + +static u8 sda_in(struct ipath_devdata *dd, int wait) +{ + enum i2c_state bit; + + if (i2c_gpio_get(dd, i2c_line_sda, &bit)) + ipath_dbg("get bit failed!\n"); + + if (wait) + i2c_wait_for_writes(dd); + + return bit == i2c_line_high ? 1U : 0; +} + +/** + * i2c_ackrcv - see if ack following write is true + * @dd: the infinipath device + */ +static int i2c_ackrcv(struct ipath_devdata *dd) +{ + u8 ack_received; + + /* AT ENTRY SCL = LOW */ + /* change direction, ignore data */ + ack_received = sda_in(dd, 1); + scl_out(dd, i2c_line_high); + ack_received = sda_in(dd, 1) == 0; + scl_out(dd, i2c_line_low); + return ack_received; +} + +/** + * rd_byte - read a byte, leaving ACK, STOP, etc up to caller + * @dd: the infinipath device + * + * Returns byte shifted out of device + */ +static int rd_byte(struct ipath_devdata *dd) +{ + int bit_cntr, data; + + data = 0; + + for (bit_cntr = 7; bit_cntr >= 0; --bit_cntr) { + data <<= 1; + scl_out(dd, i2c_line_high); + data |= sda_in(dd, 0); + scl_out(dd, i2c_line_low); + } + return data; +} + +/** + * wr_byte - write a byte, one bit at a time + * @dd: the infinipath device + * @data: the byte to write + * + * Returns 0 if we got the following ack, otherwise 1 + */ +static int wr_byte(struct ipath_devdata *dd, u8 data) +{ + int bit_cntr; + u8 bit; + + for (bit_cntr = 7; bit_cntr >= 0; bit_cntr--) { + bit = (data >> bit_cntr) & 1; + sda_out(dd, bit); + scl_out(dd, i2c_line_high); + scl_out(dd, i2c_line_low); + } + return (!i2c_ackrcv(dd)) ? 1 : 0; +} + +static void send_ack(struct ipath_devdata *dd) +{ + sda_out(dd, i2c_line_low); + scl_out(dd, i2c_line_high); + scl_out(dd, i2c_line_low); + sda_out(dd, i2c_line_high); +} + +/** + * i2c_startcmd - transmit the start condition, followed by address/cmd + * @dd: the infinipath device + * @offset_dir: direction byte + * + * (both clock/data high, clock high, data low while clock is high) + */ +static int i2c_startcmd(struct ipath_devdata *dd, u8 offset_dir) +{ + int res; + + /* issue start sequence */ + sda_out(dd, i2c_line_high); + scl_out(dd, i2c_line_high); + sda_out(dd, i2c_line_low); + scl_out(dd, i2c_line_low); + + /* issue length and direction byte */ + res = wr_byte(dd, offset_dir); + + if (res) + ipath_cdbg(VERBOSE, "No ack to complete start\n"); + + return res; +} + +/** + * stop_cmd - transmit the stop condition + * @dd: the infinipath device + * + * (both clock/data low, clock high, data high while clock is high) + */ +static void stop_cmd(struct ipath_devdata *dd) +{ + scl_out(dd, i2c_line_low); + sda_out(dd, i2c_line_low); + scl_out(dd, i2c_line_high); + sda_out(dd, i2c_line_high); + udelay(2); +} + +/** + * eeprom_reset - reset I2C communication + * @dd: the infinipath device + */ + +static int eeprom_reset(struct ipath_devdata *dd) +{ + int clock_cycles_left = 9; + u64 *gpioval = &dd->ipath_gpio_out; + int ret; + unsigned long flags; + + spin_lock_irqsave(&dd->ipath_gpio_lock, flags); + /* Make sure shadows are consistent */ + dd->ipath_extctrl = ipath_read_kreg64(dd, dd->ipath_kregs->kr_extctrl); + *gpioval = ipath_read_kreg64(dd, dd->ipath_kregs->kr_gpio_out); + spin_unlock_irqrestore(&dd->ipath_gpio_lock, flags); + + ipath_cdbg(VERBOSE, "Resetting i2c eeprom; initial gpioout reg " + "is %llx\n", (unsigned long long) *gpioval); + + /* + * This is to get the i2c into a known state, by first going low, + * then tristate sda (and then tristate scl as first thing + * in loop) + */ + scl_out(dd, i2c_line_low); + sda_out(dd, i2c_line_high); + + /* Clock up to 9 cycles looking for SDA hi, then issue START and STOP */ + while (clock_cycles_left--) { + scl_out(dd, i2c_line_high); + + /* SDA seen high, issue START by dropping it while SCL high */ + if (sda_in(dd, 0)) { + sda_out(dd, i2c_line_low); + scl_out(dd, i2c_line_low); + /* ATMEL spec says must be followed by STOP. */ + scl_out(dd, i2c_line_high); + sda_out(dd, i2c_line_high); + ret = 0; + goto bail; + } + + scl_out(dd, i2c_line_low); + } + + ret = 1; + +bail: + return ret; +} + +/* + * Probe for I2C device at specified address. Returns 0 for "success" + * to match rest of this file. + * Leave bus in "reasonable" state for further commands. + */ +static int i2c_probe(struct ipath_devdata *dd, int devaddr) +{ + int ret = 0; + + ret = eeprom_reset(dd); + if (ret) { + ipath_dev_err(dd, "Failed reset probing device 0x%02X\n", + devaddr); + return ret; + } + /* + * Reset no longer leaves bus in start condition, so normal + * i2c_startcmd() will do. + */ + ret = i2c_startcmd(dd, devaddr | READ_CMD); + if (ret) + ipath_cdbg(VERBOSE, "Failed startcmd for device 0x%02X\n", + devaddr); + else { + /* + * Device did respond. Complete a single-byte read, because some + * devices apparently cannot handle STOP immediately after they + * ACK the start-cmd. + */ + int data; + data = rd_byte(dd); + stop_cmd(dd); + ipath_cdbg(VERBOSE, "Response from device 0x%02X\n", devaddr); + } + return ret; +} + +/* + * Returns the "i2c type". This is a pointer to a struct that describes + * the I2C chain on this board. To minimize impact on struct ipath_devdata, + * the (small integer) index into the table is actually memoized, rather + * then the pointer. + * Memoization is because the type is determined on the first call per chip. + * An alternative would be to move type determination to early + * init code. + */ +static struct i2c_chain_desc *ipath_i2c_type(struct ipath_devdata *dd) +{ + int idx; + + /* Get memoized index, from previous successful probes */ + idx = dd->ipath_i2c_chain_type - 1; + if (idx >= 0 && idx < (ARRAY_SIZE(i2c_chains) - 1)) + goto done; + + idx = 0; + while (i2c_chains[idx].probe_dev != IPATH_NO_DEV) { + /* if probe succeeds, this is type */ + if (!i2c_probe(dd, i2c_chains[idx].probe_dev)) + break; + ++idx; + } + + /* + * Old EEPROM (first entry) may require a reset after probe, + * rather than being able to "start" after "stop" + */ + if (idx == 0) + eeprom_reset(dd); + + if (i2c_chains[idx].probe_dev == IPATH_NO_DEV) + idx = -1; + else + dd->ipath_i2c_chain_type = idx + 1; +done: + return (idx >= 0) ? i2c_chains + idx : NULL; +} + +static int ipath_eeprom_internal_read(struct ipath_devdata *dd, + u8 eeprom_offset, void *buffer, int len) +{ + int ret; + struct i2c_chain_desc *icd; + u8 *bp = buffer; + + ret = 1; + icd = ipath_i2c_type(dd); + if (!icd) + goto bail; + + if (icd->eeprom_dev == IPATH_NO_DEV) { + /* legacy not-really-I2C */ + ipath_cdbg(VERBOSE, "Start command only address\n"); + eeprom_offset = (eeprom_offset << 1) | READ_CMD; + ret = i2c_startcmd(dd, eeprom_offset); + } else { + /* Actual I2C */ + ipath_cdbg(VERBOSE, "Start command uses devaddr\n"); + if (i2c_startcmd(dd, icd->eeprom_dev | WRITE_CMD)) { + ipath_dbg("Failed EEPROM startcmd\n"); + stop_cmd(dd); + ret = 1; + goto bail; + } + ret = wr_byte(dd, eeprom_offset); + stop_cmd(dd); + if (ret) { + ipath_dev_err(dd, "Failed to write EEPROM address\n"); + ret = 1; + goto bail; + } + ret = i2c_startcmd(dd, icd->eeprom_dev | READ_CMD); + } + if (ret) { + ipath_dbg("Failed startcmd for dev %02X\n", icd->eeprom_dev); + stop_cmd(dd); + ret = 1; + goto bail; + } + + /* + * eeprom keeps clocking data out as long as we ack, automatically + * incrementing the address. + */ + while (len-- > 0) { + /* get and store data */ + *bp++ = rd_byte(dd); + /* send ack if not the last byte */ + if (len) + send_ack(dd); + } + + stop_cmd(dd); + + ret = 0; + +bail: + return ret; +} + +static int ipath_eeprom_internal_write(struct ipath_devdata *dd, u8 eeprom_offset, + const void *buffer, int len) +{ + int sub_len; + const u8 *bp = buffer; + int max_wait_time, i; + int ret; + struct i2c_chain_desc *icd; + + ret = 1; + icd = ipath_i2c_type(dd); + if (!icd) + goto bail; + + while (len > 0) { + if (icd->eeprom_dev == IPATH_NO_DEV) { + if (i2c_startcmd(dd, + (eeprom_offset << 1) | WRITE_CMD)) { + ipath_dbg("Failed to start cmd offset %u\n", + eeprom_offset); + goto failed_write; + } + } else { + /* Real I2C */ + if (i2c_startcmd(dd, icd->eeprom_dev | WRITE_CMD)) { + ipath_dbg("Failed EEPROM startcmd\n"); + goto failed_write; + } + ret = wr_byte(dd, eeprom_offset); + if (ret) { + ipath_dev_err(dd, "Failed to write EEPROM " + "address\n"); + goto failed_write; + } + } + + sub_len = min(len, 4); + eeprom_offset += sub_len; + len -= sub_len; + + for (i = 0; i < sub_len; i++) { + if (wr_byte(dd, *bp++)) { + ipath_dbg("no ack after byte %u/%u (%u " + "total remain)\n", i, sub_len, + len + sub_len - i); + goto failed_write; + } + } + + stop_cmd(dd); + + /* + * wait for write complete by waiting for a successful + * read (the chip replies with a zero after the write + * cmd completes, and before it writes to the eeprom. + * The startcmd for the read will fail the ack until + * the writes have completed. We do this inline to avoid + * the debug prints that are in the real read routine + * if the startcmd fails. + * We also use the proper device address, so it doesn't matter + * whether we have real eeprom_dev. legacy likes any address. + */ + max_wait_time = 100; + while (i2c_startcmd(dd, icd->eeprom_dev | READ_CMD)) { + stop_cmd(dd); + if (!--max_wait_time) { + ipath_dbg("Did not get successful read to " + "complete write\n"); + goto failed_write; + } + } + /* now read (and ignore) the resulting byte */ + rd_byte(dd); + stop_cmd(dd); + } + + ret = 0; + goto bail; + +failed_write: + stop_cmd(dd); + ret = 1; + +bail: + return ret; +} + +/** + * ipath_eeprom_read - receives bytes from the eeprom via I2C + * @dd: the infinipath device + * @eeprom_offset: address to read from + * @buffer: where to store result + * @len: number of bytes to receive + */ +int ipath_eeprom_read(struct ipath_devdata *dd, u8 eeprom_offset, + void *buff, int len) +{ + int ret; + + ret = mutex_lock_interruptible(&dd->ipath_eep_lock); + if (!ret) { + ret = ipath_eeprom_internal_read(dd, eeprom_offset, buff, len); + mutex_unlock(&dd->ipath_eep_lock); + } + + return ret; +} + +/** + * ipath_eeprom_write - writes data to the eeprom via I2C + * @dd: the infinipath device + * @eeprom_offset: where to place data + * @buffer: data to write + * @len: number of bytes to write + */ +int ipath_eeprom_write(struct ipath_devdata *dd, u8 eeprom_offset, + const void *buff, int len) +{ + int ret; + + ret = mutex_lock_interruptible(&dd->ipath_eep_lock); + if (!ret) { + ret = ipath_eeprom_internal_write(dd, eeprom_offset, buff, len); + mutex_unlock(&dd->ipath_eep_lock); + } + + return ret; +} + +static u8 flash_csum(struct ipath_flash *ifp, int adjust) +{ + u8 *ip = (u8 *) ifp; + u8 csum = 0, len; + + /* + * Limit length checksummed to max length of actual data. + * Checksum of erased eeprom will still be bad, but we avoid + * reading past the end of the buffer we were passed. + */ + len = ifp->if_length; + if (len > sizeof(struct ipath_flash)) + len = sizeof(struct ipath_flash); + while (len--) + csum += *ip++; + csum -= ifp->if_csum; + csum = ~csum; + if (adjust) + ifp->if_csum = csum; + + return csum; +} + +/** + * ipath_get_guid - get the GUID from the i2c device + * @dd: the infinipath device + * + * We have the capability to use the ipath_nguid field, and get + * the guid from the first chip's flash, to use for all of them. + */ +void ipath_get_eeprom_info(struct ipath_devdata *dd) +{ + void *buf; + struct ipath_flash *ifp; + __be64 guid; + int len, eep_stat; + u8 csum, *bguid; + int t = dd->ipath_unit; + struct ipath_devdata *dd0 = ipath_lookup(0); + + if (t && dd0->ipath_nguid > 1 && t <= dd0->ipath_nguid) { + u8 oguid; + dd->ipath_guid = dd0->ipath_guid; + bguid = (u8 *) & dd->ipath_guid; + + oguid = bguid[7]; + bguid[7] += t; + if (oguid > bguid[7]) { + if (bguid[6] == 0xff) { + if (bguid[5] == 0xff) { + ipath_dev_err( + dd, + "Can't set %s GUID from " + "base, wraps to OUI!\n", + ipath_get_unit_name(t)); + dd->ipath_guid = 0; + goto bail; + } + bguid[5]++; + } + bguid[6]++; + } + dd->ipath_nguid = 1; + + ipath_dbg("nguid %u, so adding %u to device 0 guid, " + "for %llx\n", + dd0->ipath_nguid, t, + (unsigned long long) be64_to_cpu(dd->ipath_guid)); + goto bail; + } + + /* + * read full flash, not just currently used part, since it may have + * been written with a newer definition + * */ + len = sizeof(struct ipath_flash); + buf = vmalloc(len); + if (!buf) { + ipath_dev_err(dd, "Couldn't allocate memory to read %u " + "bytes from eeprom for GUID\n", len); + goto bail; + } + + mutex_lock(&dd->ipath_eep_lock); + eep_stat = ipath_eeprom_internal_read(dd, 0, buf, len); + mutex_unlock(&dd->ipath_eep_lock); + + if (eep_stat) { + ipath_dev_err(dd, "Failed reading GUID from eeprom\n"); + goto done; + } + ifp = (struct ipath_flash *)buf; + + csum = flash_csum(ifp, 0); + if (csum != ifp->if_csum) { + dev_info(&dd->pcidev->dev, "Bad I2C flash checksum: " + "0x%x, not 0x%x\n", csum, ifp->if_csum); + goto done; + } + if (*(__be64 *) ifp->if_guid == cpu_to_be64(0) || + *(__be64 *) ifp->if_guid == ~cpu_to_be64(0)) { + ipath_dev_err(dd, "Invalid GUID %llx from flash; " + "ignoring\n", + *(unsigned long long *) ifp->if_guid); + /* don't allow GUID if all 0 or all 1's */ + goto done; + } + + /* complain, but allow it */ + if (*(u64 *) ifp->if_guid == 0x100007511000000ULL) + dev_info(&dd->pcidev->dev, "Warning, GUID %llx is " + "default, probably not correct!\n", + *(unsigned long long *) ifp->if_guid); + + bguid = ifp->if_guid; + if (!bguid[0] && !bguid[1] && !bguid[2]) { + /* original incorrect GUID format in flash; fix in + * core copy, by shifting up 2 octets; don't need to + * change top octet, since both it and shifted are + * 0.. */ + bguid[1] = bguid[3]; + bguid[2] = bguid[4]; + bguid[3] = bguid[4] = 0; + guid = *(__be64 *) ifp->if_guid; + ipath_cdbg(VERBOSE, "Old GUID format in flash, top 3 zero, " + "shifting 2 octets\n"); + } else + guid = *(__be64 *) ifp->if_guid; + dd->ipath_guid = guid; + dd->ipath_nguid = ifp->if_numguid; + /* + * Things are slightly complicated by the desire to transparently + * support both the Pathscale 10-digit serial number and the QLogic + * 13-character version. + */ + if ((ifp->if_fversion > 1) && ifp->if_sprefix[0] + && ((u8 *)ifp->if_sprefix)[0] != 0xFF) { + /* This board has a Serial-prefix, which is stored + * elsewhere for backward-compatibility. + */ + char *snp = dd->ipath_serial; + memcpy(snp, ifp->if_sprefix, sizeof ifp->if_sprefix); + snp[sizeof ifp->if_sprefix] = '\0'; + len = strlen(snp); + snp += len; + len = (sizeof dd->ipath_serial) - len; + if (len > sizeof ifp->if_serial) { + len = sizeof ifp->if_serial; + } + memcpy(snp, ifp->if_serial, len); + } else + memcpy(dd->ipath_serial, ifp->if_serial, + sizeof ifp->if_serial); + if (!strstr(ifp->if_comment, "Tested successfully")) + ipath_dev_err(dd, "Board SN %s did not pass functional " + "test: %s\n", dd->ipath_serial, + ifp->if_comment); + + ipath_cdbg(VERBOSE, "Initted GUID to %llx from eeprom\n", + (unsigned long long) be64_to_cpu(dd->ipath_guid)); + + memcpy(&dd->ipath_eep_st_errs, &ifp->if_errcntp, IPATH_EEP_LOG_CNT); + /* + * Power-on (actually "active") hours are kept as little-endian value + * in EEPROM, but as seconds in a (possibly as small as 24-bit) + * atomic_t while running. + */ + atomic_set(&dd->ipath_active_time, 0); + dd->ipath_eep_hrs = ifp->if_powerhour[0] | (ifp->if_powerhour[1] << 8); + +done: + vfree(buf); + +bail:; +} + +/** + * ipath_update_eeprom_log - copy active-time and error counters to eeprom + * @dd: the infinipath device + * + * Although the time is kept as seconds in the ipath_devdata struct, it is + * rounded to hours for re-write, as we have only 16 bits in EEPROM. + * First-cut code reads whole (expected) struct ipath_flash, modifies, + * re-writes. Future direction: read/write only what we need, assuming + * that the EEPROM had to have been "good enough" for driver init, and + * if not, we aren't making it worse. + * + */ + +int ipath_update_eeprom_log(struct ipath_devdata *dd) +{ + void *buf; + struct ipath_flash *ifp; + int len, hi_water; + uint32_t new_time, new_hrs; + u8 csum; + int ret, idx; + unsigned long flags; + + /* first, check if we actually need to do anything. */ + ret = 0; + for (idx = 0; idx < IPATH_EEP_LOG_CNT; ++idx) { + if (dd->ipath_eep_st_new_errs[idx]) { + ret = 1; + break; + } + } + new_time = atomic_read(&dd->ipath_active_time); + + if (ret == 0 && new_time < 3600) + return 0; + + /* + * The quick-check above determined that there is something worthy + * of logging, so get current contents and do a more detailed idea. + * read full flash, not just currently used part, since it may have + * been written with a newer definition + */ + len = sizeof(struct ipath_flash); + buf = vmalloc(len); + ret = 1; + if (!buf) { + ipath_dev_err(dd, "Couldn't allocate memory to read %u " + "bytes from eeprom for logging\n", len); + goto bail; + } + + /* Grab semaphore and read current EEPROM. If we get an + * error, let go, but if not, keep it until we finish write. + */ + ret = mutex_lock_interruptible(&dd->ipath_eep_lock); + if (ret) { + ipath_dev_err(dd, "Unable to acquire EEPROM for logging\n"); + goto free_bail; + } + ret = ipath_eeprom_internal_read(dd, 0, buf, len); + if (ret) { + mutex_unlock(&dd->ipath_eep_lock); + ipath_dev_err(dd, "Unable read EEPROM for logging\n"); + goto free_bail; + } + ifp = (struct ipath_flash *)buf; + + csum = flash_csum(ifp, 0); + if (csum != ifp->if_csum) { + mutex_unlock(&dd->ipath_eep_lock); + ipath_dev_err(dd, "EEPROM cks err (0x%02X, S/B 0x%02X)\n", + csum, ifp->if_csum); + ret = 1; + goto free_bail; + } + hi_water = 0; + spin_lock_irqsave(&dd->ipath_eep_st_lock, flags); + for (idx = 0; idx < IPATH_EEP_LOG_CNT; ++idx) { + int new_val = dd->ipath_eep_st_new_errs[idx]; + if (new_val) { + /* + * If we have seen any errors, add to EEPROM values + * We need to saturate at 0xFF (255) and we also + * would need to adjust the checksum if we were + * trying to minimize EEPROM traffic + * Note that we add to actual current count in EEPROM, + * in case it was altered while we were running. + */ + new_val += ifp->if_errcntp[idx]; + if (new_val > 0xFF) + new_val = 0xFF; + if (ifp->if_errcntp[idx] != new_val) { + ifp->if_errcntp[idx] = new_val; + hi_water = offsetof(struct ipath_flash, + if_errcntp) + idx; + } + /* + * update our shadow (used to minimize EEPROM + * traffic), to match what we are about to write. + */ + dd->ipath_eep_st_errs[idx] = new_val; + dd->ipath_eep_st_new_errs[idx] = 0; + } + } + /* + * now update active-time. We would like to round to the nearest hour + * but unless atomic_t are sure to be proper signed ints we cannot, + * because we need to account for what we "transfer" to EEPROM and + * if we log an hour at 31 minutes, then we would need to set + * active_time to -29 to accurately count the _next_ hour. + */ + if (new_time >= 3600) { + new_hrs = new_time / 3600; + atomic_sub((new_hrs * 3600), &dd->ipath_active_time); + new_hrs += dd->ipath_eep_hrs; + if (new_hrs > 0xFFFF) + new_hrs = 0xFFFF; + dd->ipath_eep_hrs = new_hrs; + if ((new_hrs & 0xFF) != ifp->if_powerhour[0]) { + ifp->if_powerhour[0] = new_hrs & 0xFF; + hi_water = offsetof(struct ipath_flash, if_powerhour); + } + if ((new_hrs >> 8) != ifp->if_powerhour[1]) { + ifp->if_powerhour[1] = new_hrs >> 8; + hi_water = offsetof(struct ipath_flash, if_powerhour) + + 1; + } + } + /* + * There is a tiny possibility that we could somehow fail to write + * the EEPROM after updating our shadows, but problems from holding + * the spinlock too long are a much bigger issue. + */ + spin_unlock_irqrestore(&dd->ipath_eep_st_lock, flags); + if (hi_water) { + /* we made some change to the data, uopdate cksum and write */ + csum = flash_csum(ifp, 1); + ret = ipath_eeprom_internal_write(dd, 0, buf, hi_water + 1); + } + mutex_unlock(&dd->ipath_eep_lock); + if (ret) + ipath_dev_err(dd, "Failed updating EEPROM\n"); + +free_bail: + vfree(buf); +bail: + return ret; + +} + +/** + * ipath_inc_eeprom_err - increment one of the four error counters + * that are logged to EEPROM. + * @dd: the infinipath device + * @eidx: 0..3, the counter to increment + * @incr: how much to add + * + * Each counter is 8-bits, and saturates at 255 (0xFF). They + * are copied to the EEPROM (aka flash) whenever ipath_update_eeprom_log() + * is called, but it can only be called in a context that allows sleep. + * This function can be called even at interrupt level. + */ + +void ipath_inc_eeprom_err(struct ipath_devdata *dd, u32 eidx, u32 incr) +{ + uint new_val; + unsigned long flags; + + spin_lock_irqsave(&dd->ipath_eep_st_lock, flags); + new_val = dd->ipath_eep_st_new_errs[eidx] + incr; + if (new_val > 255) + new_val = 255; + dd->ipath_eep_st_new_errs[eidx] = new_val; + spin_unlock_irqrestore(&dd->ipath_eep_st_lock, flags); + return; +} + +static int ipath_tempsense_internal_read(struct ipath_devdata *dd, u8 regnum) +{ + int ret; + struct i2c_chain_desc *icd; + + ret = -ENOENT; + + icd = ipath_i2c_type(dd); + if (!icd) + goto bail; + + if (icd->temp_dev == IPATH_NO_DEV) { + /* tempsense only exists on new, real-I2C boards */ + ret = -ENXIO; + goto bail; + } + + if (i2c_startcmd(dd, icd->temp_dev | WRITE_CMD)) { + ipath_dbg("Failed tempsense startcmd\n"); + stop_cmd(dd); + ret = -ENXIO; + goto bail; + } + ret = wr_byte(dd, regnum); + stop_cmd(dd); + if (ret) { + ipath_dev_err(dd, "Failed tempsense WR command %02X\n", + regnum); + ret = -ENXIO; + goto bail; + } + if (i2c_startcmd(dd, icd->temp_dev | READ_CMD)) { + ipath_dbg("Failed tempsense RD startcmd\n"); + stop_cmd(dd); + ret = -ENXIO; + goto bail; + } + /* + * We can only clock out one byte per command, sensibly + */ + ret = rd_byte(dd); + stop_cmd(dd); + +bail: + return ret; +} + +#define VALID_TS_RD_REG_MASK 0xBF + +/** + * ipath_tempsense_read - read register of temp sensor via I2C + * @dd: the infinipath device + * @regnum: register to read from + * + * returns reg contents (0..255) or < 0 for error + */ +int ipath_tempsense_read(struct ipath_devdata *dd, u8 regnum) +{ + int ret; + + if (regnum > 7) + return -EINVAL; + + /* return a bogus value for (the one) register we do not have */ + if (!((1 << regnum) & VALID_TS_RD_REG_MASK)) + return 0; + + ret = mutex_lock_interruptible(&dd->ipath_eep_lock); + if (!ret) { + ret = ipath_tempsense_internal_read(dd, regnum); + mutex_unlock(&dd->ipath_eep_lock); + } + + /* + * There are three possibilities here: + * ret is actual value (0..255) + * ret is -ENXIO or -EINVAL from code in this file + * ret is -EINTR from mutex_lock_interruptible. + */ + return ret; +} + +static int ipath_tempsense_internal_write(struct ipath_devdata *dd, + u8 regnum, u8 data) +{ + int ret = -ENOENT; + struct i2c_chain_desc *icd; + + icd = ipath_i2c_type(dd); + if (!icd) + goto bail; + + if (icd->temp_dev == IPATH_NO_DEV) { + /* tempsense only exists on new, real-I2C boards */ + ret = -ENXIO; + goto bail; + } + if (i2c_startcmd(dd, icd->temp_dev | WRITE_CMD)) { + ipath_dbg("Failed tempsense startcmd\n"); + stop_cmd(dd); + ret = -ENXIO; + goto bail; + } + ret = wr_byte(dd, regnum); + if (ret) { + stop_cmd(dd); + ipath_dev_err(dd, "Failed to write tempsense command %02X\n", + regnum); + ret = -ENXIO; + goto bail; + } + ret = wr_byte(dd, data); + stop_cmd(dd); + ret = i2c_startcmd(dd, icd->temp_dev | READ_CMD); + if (ret) { + ipath_dev_err(dd, "Failed tempsense data wrt to %02X\n", + regnum); + ret = -ENXIO; + } + +bail: + return ret; +} + +#define VALID_TS_WR_REG_MASK ((1 << 9) | (1 << 0xB) | (1 << 0xD)) + +/** + * ipath_tempsense_write - write register of temp sensor via I2C + * @dd: the infinipath device + * @regnum: register to write + * @data: data to write + * + * returns 0 for success or < 0 for error + */ +int ipath_tempsense_write(struct ipath_devdata *dd, u8 regnum, u8 data) +{ + int ret; + + if (regnum > 15 || !((1 << regnum) & VALID_TS_WR_REG_MASK)) + return -EINVAL; + + ret = mutex_lock_interruptible(&dd->ipath_eep_lock); + if (!ret) { + ret = ipath_tempsense_internal_write(dd, regnum, data); + mutex_unlock(&dd->ipath_eep_lock); + } + + /* + * There are three possibilities here: + * ret is 0 for success + * ret is -ENXIO or -EINVAL from code in this file + * ret is -EINTR from mutex_lock_interruptible. + */ + return ret; +} diff --git a/drivers/infiniband/hw/ipath/ipath_file_ops.c b/drivers/infiniband/hw/ipath/ipath_file_ops.c new file mode 100644 index 0000000..6d7f453 --- /dev/null +++ b/drivers/infiniband/hw/ipath/ipath_file_ops.c @@ -0,0 +1,2617 @@ +/* + * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved. + * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ipath_kernel.h" +#include "ipath_common.h" +#include "ipath_user_sdma.h" + +static int ipath_open(struct inode *, struct file *); +static int ipath_close(struct inode *, struct file *); +static ssize_t ipath_write(struct file *, const char __user *, size_t, + loff_t *); +static ssize_t ipath_writev(struct kiocb *, const struct iovec *, + unsigned long , loff_t); +static unsigned int ipath_poll(struct file *, struct poll_table_struct *); +static int ipath_mmap(struct file *, struct vm_area_struct *); + +static const struct file_operations ipath_file_ops = { + .owner = THIS_MODULE, + .write = ipath_write, + .aio_write = ipath_writev, + .open = ipath_open, + .release = ipath_close, + .poll = ipath_poll, + .mmap = ipath_mmap, + .llseek = noop_llseek, +}; + +/* + * Convert kernel virtual addresses to physical addresses so they don't + * potentially conflict with the chip addresses used as mmap offsets. + * It doesn't really matter what mmap offset we use as long as we can + * interpret it correctly. + */ +static u64 cvt_kvaddr(void *p) +{ + struct page *page; + u64 paddr = 0; + + page = vmalloc_to_page(p); + if (page) + paddr = page_to_pfn(page) << PAGE_SHIFT; + + return paddr; +} + +static int ipath_get_base_info(struct file *fp, + void __user *ubase, size_t ubase_size) +{ + struct ipath_portdata *pd = port_fp(fp); + int ret = 0; + struct ipath_base_info *kinfo = NULL; + struct ipath_devdata *dd = pd->port_dd; + unsigned subport_cnt; + int shared, master; + size_t sz; + + subport_cnt = pd->port_subport_cnt; + if (!subport_cnt) { + shared = 0; + master = 0; + subport_cnt = 1; + } else { + shared = 1; + master = !subport_fp(fp); + } + + sz = sizeof(*kinfo); + /* If port sharing is not requested, allow the old size structure */ + if (!shared) + sz -= 7 * sizeof(u64); + if (ubase_size < sz) { + ipath_cdbg(PROC, + "Base size %zu, need %zu (version mismatch?)\n", + ubase_size, sz); + ret = -EINVAL; + goto bail; + } + + kinfo = kzalloc(sizeof(*kinfo), GFP_KERNEL); + if (kinfo == NULL) { + ret = -ENOMEM; + goto bail; + } + + ret = dd->ipath_f_get_base_info(pd, kinfo); + if (ret < 0) + goto bail; + + kinfo->spi_rcvhdr_cnt = dd->ipath_rcvhdrcnt; + kinfo->spi_rcvhdrent_size = dd->ipath_rcvhdrentsize; + kinfo->spi_tidegrcnt = dd->ipath_rcvegrcnt; + kinfo->spi_rcv_egrbufsize = dd->ipath_rcvegrbufsize; + /* + * have to mmap whole thing + */ + kinfo->spi_rcv_egrbuftotlen = + pd->port_rcvegrbuf_chunks * pd->port_rcvegrbuf_size; + kinfo->spi_rcv_egrperchunk = pd->port_rcvegrbufs_perchunk; + kinfo->spi_rcv_egrchunksize = kinfo->spi_rcv_egrbuftotlen / + pd->port_rcvegrbuf_chunks; + kinfo->spi_tidcnt = dd->ipath_rcvtidcnt / subport_cnt; + if (master) + kinfo->spi_tidcnt += dd->ipath_rcvtidcnt % subport_cnt; + /* + * for this use, may be ipath_cfgports summed over all chips that + * are are configured and present + */ + kinfo->spi_nports = dd->ipath_cfgports; + /* unit (chip/board) our port is on */ + kinfo->spi_unit = dd->ipath_unit; + /* for now, only a single page */ + kinfo->spi_tid_maxsize = PAGE_SIZE; + + /* + * Doing this per port, and based on the skip value, etc. This has + * to be the actual buffer size, since the protocol code treats it + * as an array. + * + * These have to be set to user addresses in the user code via mmap. + * These values are used on return to user code for the mmap target + * addresses only. For 32 bit, same 44 bit address problem, so use + * the physical address, not virtual. Before 2.6.11, using the + * page_address() macro worked, but in 2.6.11, even that returns the + * full 64 bit address (upper bits all 1's). So far, using the + * physical addresses (or chip offsets, for chip mapping) works, but + * no doubt some future kernel release will change that, and we'll be + * on to yet another method of dealing with this. + */ + kinfo->spi_rcvhdr_base = (u64) pd->port_rcvhdrq_phys; + kinfo->spi_rcvhdr_tailaddr = (u64) pd->port_rcvhdrqtailaddr_phys; + kinfo->spi_rcv_egrbufs = (u64) pd->port_rcvegr_phys; + kinfo->spi_pioavailaddr = (u64) dd->ipath_pioavailregs_phys; + kinfo->spi_status = (u64) kinfo->spi_pioavailaddr + + (void *) dd->ipath_statusp - + (void *) dd->ipath_pioavailregs_dma; + if (!shared) { + kinfo->spi_piocnt = pd->port_piocnt; + kinfo->spi_piobufbase = (u64) pd->port_piobufs; + kinfo->__spi_uregbase = (u64) dd->ipath_uregbase + + dd->ipath_ureg_align * pd->port_port; + } else if (master) { + kinfo->spi_piocnt = (pd->port_piocnt / subport_cnt) + + (pd->port_piocnt % subport_cnt); + /* Master's PIO buffers are after all the slave's */ + kinfo->spi_piobufbase = (u64) pd->port_piobufs + + dd->ipath_palign * + (pd->port_piocnt - kinfo->spi_piocnt); + } else { + unsigned slave = subport_fp(fp) - 1; + + kinfo->spi_piocnt = pd->port_piocnt / subport_cnt; + kinfo->spi_piobufbase = (u64) pd->port_piobufs + + dd->ipath_palign * kinfo->spi_piocnt * slave; + } + + if (shared) { + kinfo->spi_port_uregbase = (u64) dd->ipath_uregbase + + dd->ipath_ureg_align * pd->port_port; + kinfo->spi_port_rcvegrbuf = kinfo->spi_rcv_egrbufs; + kinfo->spi_port_rcvhdr_base = kinfo->spi_rcvhdr_base; + kinfo->spi_port_rcvhdr_tailaddr = kinfo->spi_rcvhdr_tailaddr; + + kinfo->__spi_uregbase = cvt_kvaddr(pd->subport_uregbase + + PAGE_SIZE * subport_fp(fp)); + + kinfo->spi_rcvhdr_base = cvt_kvaddr(pd->subport_rcvhdr_base + + pd->port_rcvhdrq_size * subport_fp(fp)); + kinfo->spi_rcvhdr_tailaddr = 0; + kinfo->spi_rcv_egrbufs = cvt_kvaddr(pd->subport_rcvegrbuf + + pd->port_rcvegrbuf_chunks * pd->port_rcvegrbuf_size * + subport_fp(fp)); + + kinfo->spi_subport_uregbase = + cvt_kvaddr(pd->subport_uregbase); + kinfo->spi_subport_rcvegrbuf = + cvt_kvaddr(pd->subport_rcvegrbuf); + kinfo->spi_subport_rcvhdr_base = + cvt_kvaddr(pd->subport_rcvhdr_base); + ipath_cdbg(PROC, "port %u flags %x %llx %llx %llx\n", + kinfo->spi_port, kinfo->spi_runtime_flags, + (unsigned long long) kinfo->spi_subport_uregbase, + (unsigned long long) kinfo->spi_subport_rcvegrbuf, + (unsigned long long) kinfo->spi_subport_rcvhdr_base); + } + + /* + * All user buffers are 2KB buffers. If we ever support + * giving 4KB buffers to user processes, this will need some + * work. + */ + kinfo->spi_pioindex = (kinfo->spi_piobufbase - + (dd->ipath_piobufbase & 0xffffffff)) / dd->ipath_palign; + kinfo->spi_pioalign = dd->ipath_palign; + + kinfo->spi_qpair = IPATH_KD_QP; + /* + * user mode PIO buffers are always 2KB, even when 4KB can + * be received, and sent via the kernel; this is ibmaxlen + * for 2K MTU. + */ + kinfo->spi_piosize = dd->ipath_piosize2k - 2 * sizeof(u32); + kinfo->spi_mtu = dd->ipath_ibmaxlen; /* maxlen, not ibmtu */ + kinfo->spi_port = pd->port_port; + kinfo->spi_subport = subport_fp(fp); + kinfo->spi_sw_version = IPATH_KERN_SWVERSION; + kinfo->spi_hw_version = dd->ipath_revision; + + if (master) { + kinfo->spi_runtime_flags |= IPATH_RUNTIME_MASTER; + } + + sz = (ubase_size < sizeof(*kinfo)) ? ubase_size : sizeof(*kinfo); + if (copy_to_user(ubase, kinfo, sz)) + ret = -EFAULT; + +bail: + kfree(kinfo); + return ret; +} + +/** + * ipath_tid_update - update a port TID + * @pd: the port + * @fp: the ipath device file + * @ti: the TID information + * + * The new implementation as of Oct 2004 is that the driver assigns + * the tid and returns it to the caller. To make it easier to + * catch bugs, and to reduce search time, we keep a cursor for + * each port, walking the shadow tid array to find one that's not + * in use. + * + * For now, if we can't allocate the full list, we fail, although + * in the long run, we'll allocate as many as we can, and the + * caller will deal with that by trying the remaining pages later. + * That means that when we fail, we have to mark the tids as not in + * use again, in our shadow copy. + * + * It's up to the caller to free the tids when they are done. + * We'll unlock the pages as they free them. + * + * Also, right now we are locking one page at a time, but since + * the intended use of this routine is for a single group of + * virtually contiguous pages, that should change to improve + * performance. + */ +static int ipath_tid_update(struct ipath_portdata *pd, struct file *fp, + const struct ipath_tid_info *ti) +{ + int ret = 0, ntids; + u32 tid, porttid, cnt, i, tidcnt, tidoff; + u16 *tidlist; + struct ipath_devdata *dd = pd->port_dd; + u64 physaddr; + unsigned long vaddr; + u64 __iomem *tidbase; + unsigned long tidmap[8]; + struct page **pagep = NULL; + unsigned subport = subport_fp(fp); + + if (!dd->ipath_pageshadow) { + ret = -ENOMEM; + goto done; + } + + cnt = ti->tidcnt; + if (!cnt) { + ipath_dbg("After copyin, tidcnt 0, tidlist %llx\n", + (unsigned long long) ti->tidlist); + /* + * Should we treat as success? likely a bug + */ + ret = -EFAULT; + goto done; + } + porttid = pd->port_port * dd->ipath_rcvtidcnt; + if (!pd->port_subport_cnt) { + tidcnt = dd->ipath_rcvtidcnt; + tid = pd->port_tidcursor; + tidoff = 0; + } else if (!subport) { + tidcnt = (dd->ipath_rcvtidcnt / pd->port_subport_cnt) + + (dd->ipath_rcvtidcnt % pd->port_subport_cnt); + tidoff = dd->ipath_rcvtidcnt - tidcnt; + porttid += tidoff; + tid = tidcursor_fp(fp); + } else { + tidcnt = dd->ipath_rcvtidcnt / pd->port_subport_cnt; + tidoff = tidcnt * (subport - 1); + porttid += tidoff; + tid = tidcursor_fp(fp); + } + if (cnt > tidcnt) { + /* make sure it all fits in port_tid_pg_list */ + dev_info(&dd->pcidev->dev, "Process tried to allocate %u " + "TIDs, only trying max (%u)\n", cnt, tidcnt); + cnt = tidcnt; + } + pagep = &((struct page **) pd->port_tid_pg_list)[tidoff]; + tidlist = &((u16 *) &pagep[dd->ipath_rcvtidcnt])[tidoff]; + + memset(tidmap, 0, sizeof(tidmap)); + /* before decrement; chip actual # */ + ntids = tidcnt; + tidbase = (u64 __iomem *) (((char __iomem *) dd->ipath_kregbase) + + dd->ipath_rcvtidbase + + porttid * sizeof(*tidbase)); + + ipath_cdbg(VERBOSE, "Port%u %u tids, cursor %u, tidbase %p\n", + pd->port_port, cnt, tid, tidbase); + + /* virtual address of first page in transfer */ + vaddr = ti->tidvaddr; + if (!access_ok(VERIFY_WRITE, (void __user *) vaddr, + cnt * PAGE_SIZE)) { + ipath_dbg("Fail vaddr %p, %u pages, !access_ok\n", + (void *)vaddr, cnt); + ret = -EFAULT; + goto done; + } + ret = ipath_get_user_pages(vaddr, cnt, pagep); + if (ret) { + if (ret == -EBUSY) { + ipath_dbg("Failed to lock addr %p, %u pages " + "(already locked)\n", + (void *) vaddr, cnt); + /* + * for now, continue, and see what happens but with + * the new implementation, this should never happen, + * unless perhaps the user has mpin'ed the pages + * themselves (something we need to test) + */ + ret = 0; + } else { + dev_info(&dd->pcidev->dev, + "Failed to lock addr %p, %u pages: " + "errno %d\n", (void *) vaddr, cnt, -ret); + goto done; + } + } + for (i = 0; i < cnt; i++, vaddr += PAGE_SIZE) { + for (; ntids--; tid++) { + if (tid == tidcnt) + tid = 0; + if (!dd->ipath_pageshadow[porttid + tid]) + break; + } + if (ntids < 0) { + /* + * oops, wrapped all the way through their TIDs, + * and didn't have enough free; see comments at + * start of routine + */ + ipath_dbg("Not enough free TIDs for %u pages " + "(index %d), failing\n", cnt, i); + i--; /* last tidlist[i] not filled in */ + ret = -ENOMEM; + break; + } + tidlist[i] = tid + tidoff; + ipath_cdbg(VERBOSE, "Updating idx %u to TID %u, " + "vaddr %lx\n", i, tid + tidoff, vaddr); + /* we "know" system pages and TID pages are same size */ + dd->ipath_pageshadow[porttid + tid] = pagep[i]; + dd->ipath_physshadow[porttid + tid] = ipath_map_page( + dd->pcidev, pagep[i], 0, PAGE_SIZE, + PCI_DMA_FROMDEVICE); + /* + * don't need atomic or it's overhead + */ + __set_bit(tid, tidmap); + physaddr = dd->ipath_physshadow[porttid + tid]; + ipath_stats.sps_pagelocks++; + ipath_cdbg(VERBOSE, + "TID %u, vaddr %lx, physaddr %llx pgp %p\n", + tid, vaddr, (unsigned long long) physaddr, + pagep[i]); + dd->ipath_f_put_tid(dd, &tidbase[tid], RCVHQ_RCV_TYPE_EXPECTED, + physaddr); + /* + * don't check this tid in ipath_portshadow, since we + * just filled it in; start with the next one. + */ + tid++; + } + + if (ret) { + u32 limit; + cleanup: + /* jump here if copy out of updated info failed... */ + ipath_dbg("After failure (ret=%d), undo %d of %d entries\n", + -ret, i, cnt); + /* same code that's in ipath_free_tid() */ + limit = sizeof(tidmap) * BITS_PER_BYTE; + if (limit > tidcnt) + /* just in case size changes in future */ + limit = tidcnt; + tid = find_first_bit((const unsigned long *)tidmap, limit); + for (; tid < limit; tid++) { + if (!test_bit(tid, tidmap)) + continue; + if (dd->ipath_pageshadow[porttid + tid]) { + ipath_cdbg(VERBOSE, "Freeing TID %u\n", + tid); + dd->ipath_f_put_tid(dd, &tidbase[tid], + RCVHQ_RCV_TYPE_EXPECTED, + dd->ipath_tidinvalid); + pci_unmap_page(dd->pcidev, + dd->ipath_physshadow[porttid + tid], + PAGE_SIZE, PCI_DMA_FROMDEVICE); + dd->ipath_pageshadow[porttid + tid] = NULL; + ipath_stats.sps_pageunlocks++; + } + } + ipath_release_user_pages(pagep, cnt); + } else { + /* + * Copy the updated array, with ipath_tid's filled in, back + * to user. Since we did the copy in already, this "should + * never fail" If it does, we have to clean up... + */ + if (copy_to_user((void __user *) + (unsigned long) ti->tidlist, + tidlist, cnt * sizeof(*tidlist))) { + ret = -EFAULT; + goto cleanup; + } + if (copy_to_user((void __user *) (unsigned long) ti->tidmap, + tidmap, sizeof tidmap)) { + ret = -EFAULT; + goto cleanup; + } + if (tid == tidcnt) + tid = 0; + if (!pd->port_subport_cnt) + pd->port_tidcursor = tid; + else + tidcursor_fp(fp) = tid; + } + +done: + if (ret) + ipath_dbg("Failed to map %u TID pages, failing with %d\n", + ti->tidcnt, -ret); + return ret; +} + +/** + * ipath_tid_free - free a port TID + * @pd: the port + * @subport: the subport + * @ti: the TID info + * + * right now we are unlocking one page at a time, but since + * the intended use of this routine is for a single group of + * virtually contiguous pages, that should change to improve + * performance. We check that the TID is in range for this port + * but otherwise don't check validity; if user has an error and + * frees the wrong tid, it's only their own data that can thereby + * be corrupted. We do check that the TID was in use, for sanity + * We always use our idea of the saved address, not the address that + * they pass in to us. + */ + +static int ipath_tid_free(struct ipath_portdata *pd, unsigned subport, + const struct ipath_tid_info *ti) +{ + int ret = 0; + u32 tid, porttid, cnt, limit, tidcnt; + struct ipath_devdata *dd = pd->port_dd; + u64 __iomem *tidbase; + unsigned long tidmap[8]; + + if (!dd->ipath_pageshadow) { + ret = -ENOMEM; + goto done; + } + + if (copy_from_user(tidmap, (void __user *)(unsigned long)ti->tidmap, + sizeof tidmap)) { + ret = -EFAULT; + goto done; + } + + porttid = pd->port_port * dd->ipath_rcvtidcnt; + if (!pd->port_subport_cnt) + tidcnt = dd->ipath_rcvtidcnt; + else if (!subport) { + tidcnt = (dd->ipath_rcvtidcnt / pd->port_subport_cnt) + + (dd->ipath_rcvtidcnt % pd->port_subport_cnt); + porttid += dd->ipath_rcvtidcnt - tidcnt; + } else { + tidcnt = dd->ipath_rcvtidcnt / pd->port_subport_cnt; + porttid += tidcnt * (subport - 1); + } + tidbase = (u64 __iomem *) ((char __iomem *)(dd->ipath_kregbase) + + dd->ipath_rcvtidbase + + porttid * sizeof(*tidbase)); + + limit = sizeof(tidmap) * BITS_PER_BYTE; + if (limit > tidcnt) + /* just in case size changes in future */ + limit = tidcnt; + tid = find_first_bit(tidmap, limit); + ipath_cdbg(VERBOSE, "Port%u free %u tids; first bit (max=%d) " + "set is %d, porttid %u\n", pd->port_port, ti->tidcnt, + limit, tid, porttid); + for (cnt = 0; tid < limit; tid++) { + /* + * small optimization; if we detect a run of 3 or so without + * any set, use find_first_bit again. That's mainly to + * accelerate the case where we wrapped, so we have some at + * the beginning, and some at the end, and a big gap + * in the middle. + */ + if (!test_bit(tid, tidmap)) + continue; + cnt++; + if (dd->ipath_pageshadow[porttid + tid]) { + struct page *p; + p = dd->ipath_pageshadow[porttid + tid]; + dd->ipath_pageshadow[porttid + tid] = NULL; + ipath_cdbg(VERBOSE, "PID %u freeing TID %u\n", + pid_nr(pd->port_pid), tid); + dd->ipath_f_put_tid(dd, &tidbase[tid], + RCVHQ_RCV_TYPE_EXPECTED, + dd->ipath_tidinvalid); + pci_unmap_page(dd->pcidev, + dd->ipath_physshadow[porttid + tid], + PAGE_SIZE, PCI_DMA_FROMDEVICE); + ipath_release_user_pages(&p, 1); + ipath_stats.sps_pageunlocks++; + } else + ipath_dbg("Unused tid %u, ignoring\n", tid); + } + if (cnt != ti->tidcnt) + ipath_dbg("passed in tidcnt %d, only %d bits set in map\n", + ti->tidcnt, cnt); +done: + if (ret) + ipath_dbg("Failed to unmap %u TID pages, failing with %d\n", + ti->tidcnt, -ret); + return ret; +} + +/** + * ipath_set_part_key - set a partition key + * @pd: the port + * @key: the key + * + * We can have up to 4 active at a time (other than the default, which is + * always allowed). This is somewhat tricky, since multiple ports may set + * the same key, so we reference count them, and clean up at exit. All 4 + * partition keys are packed into a single infinipath register. It's an + * error for a process to set the same pkey multiple times. We provide no + * mechanism to de-allocate a pkey at this time, we may eventually need to + * do that. I've used the atomic operations, and no locking, and only make + * a single pass through what's available. This should be more than + * adequate for some time. I'll think about spinlocks or the like if and as + * it's necessary. + */ +static int ipath_set_part_key(struct ipath_portdata *pd, u16 key) +{ + struct ipath_devdata *dd = pd->port_dd; + int i, any = 0, pidx = -1; + u16 lkey = key & 0x7FFF; + int ret; + + if (lkey == (IPATH_DEFAULT_P_KEY & 0x7FFF)) { + /* nothing to do; this key always valid */ + ret = 0; + goto bail; + } + + ipath_cdbg(VERBOSE, "p%u try to set pkey %hx, current keys " + "%hx:%x %hx:%x %hx:%x %hx:%x\n", + pd->port_port, key, dd->ipath_pkeys[0], + atomic_read(&dd->ipath_pkeyrefs[0]), dd->ipath_pkeys[1], + atomic_read(&dd->ipath_pkeyrefs[1]), dd->ipath_pkeys[2], + atomic_read(&dd->ipath_pkeyrefs[2]), dd->ipath_pkeys[3], + atomic_read(&dd->ipath_pkeyrefs[3])); + + if (!lkey) { + ipath_cdbg(PROC, "p%u tries to set key 0, not allowed\n", + pd->port_port); + ret = -EINVAL; + goto bail; + } + + /* + * Set the full membership bit, because it has to be + * set in the register or the packet, and it seems + * cleaner to set in the register than to force all + * callers to set it. (see bug 4331) + */ + key |= 0x8000; + + for (i = 0; i < ARRAY_SIZE(pd->port_pkeys); i++) { + if (!pd->port_pkeys[i] && pidx == -1) + pidx = i; + if (pd->port_pkeys[i] == key) { + ipath_cdbg(VERBOSE, "p%u tries to set same pkey " + "(%x) more than once\n", + pd->port_port, key); + ret = -EEXIST; + goto bail; + } + } + if (pidx == -1) { + ipath_dbg("All pkeys for port %u already in use, " + "can't set %x\n", pd->port_port, key); + ret = -EBUSY; + goto bail; + } + for (any = i = 0; i < ARRAY_SIZE(dd->ipath_pkeys); i++) { + if (!dd->ipath_pkeys[i]) { + any++; + continue; + } + if (dd->ipath_pkeys[i] == key) { + atomic_t *pkrefs = &dd->ipath_pkeyrefs[i]; + + if (atomic_inc_return(pkrefs) > 1) { + pd->port_pkeys[pidx] = key; + ipath_cdbg(VERBOSE, "p%u set key %x " + "matches #%d, count now %d\n", + pd->port_port, key, i, + atomic_read(pkrefs)); + ret = 0; + goto bail; + } else { + /* + * lost race, decrement count, catch below + */ + atomic_dec(pkrefs); + ipath_cdbg(VERBOSE, "Lost race, count was " + "0, after dec, it's %d\n", + atomic_read(pkrefs)); + any++; + } + } + if ((dd->ipath_pkeys[i] & 0x7FFF) == lkey) { + /* + * It makes no sense to have both the limited and + * full membership PKEY set at the same time since + * the unlimited one will disable the limited one. + */ + ret = -EEXIST; + goto bail; + } + } + if (!any) { + ipath_dbg("port %u, all pkeys already in use, " + "can't set %x\n", pd->port_port, key); + ret = -EBUSY; + goto bail; + } + for (any = i = 0; i < ARRAY_SIZE(dd->ipath_pkeys); i++) { + if (!dd->ipath_pkeys[i] && + atomic_inc_return(&dd->ipath_pkeyrefs[i]) == 1) { + u64 pkey; + + /* for ipathstats, etc. */ + ipath_stats.sps_pkeys[i] = lkey; + pd->port_pkeys[pidx] = dd->ipath_pkeys[i] = key; + pkey = + (u64) dd->ipath_pkeys[0] | + ((u64) dd->ipath_pkeys[1] << 16) | + ((u64) dd->ipath_pkeys[2] << 32) | + ((u64) dd->ipath_pkeys[3] << 48); + ipath_cdbg(PROC, "p%u set key %x in #%d, " + "portidx %d, new pkey reg %llx\n", + pd->port_port, key, i, pidx, + (unsigned long long) pkey); + ipath_write_kreg( + dd, dd->ipath_kregs->kr_partitionkey, pkey); + + ret = 0; + goto bail; + } + } + ipath_dbg("port %u, all pkeys already in use 2nd pass, " + "can't set %x\n", pd->port_port, key); + ret = -EBUSY; + +bail: + return ret; +} + +/** + * ipath_manage_rcvq - manage a port's receive queue + * @pd: the port + * @subport: the subport + * @start_stop: action to carry out + * + * start_stop == 0 disables receive on the port, for use in queue + * overflow conditions. start_stop==1 re-enables, to be used to + * re-init the software copy of the head register + */ +static int ipath_manage_rcvq(struct ipath_portdata *pd, unsigned subport, + int start_stop) +{ + struct ipath_devdata *dd = pd->port_dd; + + ipath_cdbg(PROC, "%sabling rcv for unit %u port %u:%u\n", + start_stop ? "en" : "dis", dd->ipath_unit, + pd->port_port, subport); + if (subport) + goto bail; + /* atomically clear receive enable port. */ + if (start_stop) { + /* + * On enable, force in-memory copy of the tail register to + * 0, so that protocol code doesn't have to worry about + * whether or not the chip has yet updated the in-memory + * copy or not on return from the system call. The chip + * always resets it's tail register back to 0 on a + * transition from disabled to enabled. This could cause a + * problem if software was broken, and did the enable w/o + * the disable, but eventually the in-memory copy will be + * updated and correct itself, even in the face of software + * bugs. + */ + if (pd->port_rcvhdrtail_kvaddr) + ipath_clear_rcvhdrtail(pd); + set_bit(dd->ipath_r_portenable_shift + pd->port_port, + &dd->ipath_rcvctrl); + } else + clear_bit(dd->ipath_r_portenable_shift + pd->port_port, + &dd->ipath_rcvctrl); + ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl, + dd->ipath_rcvctrl); + /* now be sure chip saw it before we return */ + ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); + if (start_stop) { + /* + * And try to be sure that tail reg update has happened too. + * This should in theory interlock with the RXE changes to + * the tail register. Don't assign it to the tail register + * in memory copy, since we could overwrite an update by the + * chip if we did. + */ + ipath_read_ureg32(dd, ur_rcvhdrtail, pd->port_port); + } + /* always; new head should be equal to new tail; see above */ +bail: + return 0; +} + +static void ipath_clean_part_key(struct ipath_portdata *pd, + struct ipath_devdata *dd) +{ + int i, j, pchanged = 0; + u64 oldpkey; + + /* for debugging only */ + oldpkey = (u64) dd->ipath_pkeys[0] | + ((u64) dd->ipath_pkeys[1] << 16) | + ((u64) dd->ipath_pkeys[2] << 32) | + ((u64) dd->ipath_pkeys[3] << 48); + + for (i = 0; i < ARRAY_SIZE(pd->port_pkeys); i++) { + if (!pd->port_pkeys[i]) + continue; + ipath_cdbg(VERBOSE, "look for key[%d] %hx in pkeys\n", i, + pd->port_pkeys[i]); + for (j = 0; j < ARRAY_SIZE(dd->ipath_pkeys); j++) { + /* check for match independent of the global bit */ + if ((dd->ipath_pkeys[j] & 0x7fff) != + (pd->port_pkeys[i] & 0x7fff)) + continue; + if (atomic_dec_and_test(&dd->ipath_pkeyrefs[j])) { + ipath_cdbg(VERBOSE, "p%u clear key " + "%x matches #%d\n", + pd->port_port, + pd->port_pkeys[i], j); + ipath_stats.sps_pkeys[j] = + dd->ipath_pkeys[j] = 0; + pchanged++; + } + else ipath_cdbg( + VERBOSE, "p%u key %x matches #%d, " + "but ref still %d\n", pd->port_port, + pd->port_pkeys[i], j, + atomic_read(&dd->ipath_pkeyrefs[j])); + break; + } + pd->port_pkeys[i] = 0; + } + if (pchanged) { + u64 pkey = (u64) dd->ipath_pkeys[0] | + ((u64) dd->ipath_pkeys[1] << 16) | + ((u64) dd->ipath_pkeys[2] << 32) | + ((u64) dd->ipath_pkeys[3] << 48); + ipath_cdbg(VERBOSE, "p%u old pkey reg %llx, " + "new pkey reg %llx\n", pd->port_port, + (unsigned long long) oldpkey, + (unsigned long long) pkey); + ipath_write_kreg(dd, dd->ipath_kregs->kr_partitionkey, + pkey); + } +} + +/* + * Initialize the port data with the receive buffer sizes + * so this can be done while the master port is locked. + * Otherwise, there is a race with a slave opening the port + * and seeing these fields uninitialized. + */ +static void init_user_egr_sizes(struct ipath_portdata *pd) +{ + struct ipath_devdata *dd = pd->port_dd; + unsigned egrperchunk, egrcnt, size; + + /* + * to avoid wasting a lot of memory, we allocate 32KB chunks of + * physically contiguous memory, advance through it until used up + * and then allocate more. Of course, we need memory to store those + * extra pointers, now. Started out with 256KB, but under heavy + * memory pressure (creating large files and then copying them over + * NFS while doing lots of MPI jobs), we hit some allocation + * failures, even though we can sleep... (2.6.10) Still get + * failures at 64K. 32K is the lowest we can go without wasting + * additional memory. + */ + size = 0x8000; + egrperchunk = size / dd->ipath_rcvegrbufsize; + egrcnt = dd->ipath_rcvegrcnt; + pd->port_rcvegrbuf_chunks = (egrcnt + egrperchunk - 1) / egrperchunk; + pd->port_rcvegrbufs_perchunk = egrperchunk; + pd->port_rcvegrbuf_size = size; +} + +/** + * ipath_create_user_egr - allocate eager TID buffers + * @pd: the port to allocate TID buffers for + * + * This routine is now quite different for user and kernel, because + * the kernel uses skb's, for the accelerated network performance + * This is the user port version + * + * Allocate the eager TID buffers and program them into infinipath + * They are no longer completely contiguous, we do multiple allocation + * calls. + */ +static int ipath_create_user_egr(struct ipath_portdata *pd) +{ + struct ipath_devdata *dd = pd->port_dd; + unsigned e, egrcnt, egrperchunk, chunk, egrsize, egroff; + size_t size; + int ret; + gfp_t gfp_flags; + + /* + * GFP_USER, but without GFP_FS, so buffer cache can be + * coalesced (we hope); otherwise, even at order 4, + * heavy filesystem activity makes these fail, and we can + * use compound pages. + */ + gfp_flags = __GFP_WAIT | __GFP_IO | __GFP_COMP; + + egrcnt = dd->ipath_rcvegrcnt; + /* TID number offset for this port */ + egroff = (pd->port_port - 1) * egrcnt + dd->ipath_p0_rcvegrcnt; + egrsize = dd->ipath_rcvegrbufsize; + ipath_cdbg(VERBOSE, "Allocating %d egr buffers, at egrtid " + "offset %x, egrsize %u\n", egrcnt, egroff, egrsize); + + chunk = pd->port_rcvegrbuf_chunks; + egrperchunk = pd->port_rcvegrbufs_perchunk; + size = pd->port_rcvegrbuf_size; + pd->port_rcvegrbuf = kmalloc(chunk * sizeof(pd->port_rcvegrbuf[0]), + GFP_KERNEL); + if (!pd->port_rcvegrbuf) { + ret = -ENOMEM; + goto bail; + } + pd->port_rcvegrbuf_phys = + kmalloc(chunk * sizeof(pd->port_rcvegrbuf_phys[0]), + GFP_KERNEL); + if (!pd->port_rcvegrbuf_phys) { + ret = -ENOMEM; + goto bail_rcvegrbuf; + } + for (e = 0; e < pd->port_rcvegrbuf_chunks; e++) { + + pd->port_rcvegrbuf[e] = dma_alloc_coherent( + &dd->pcidev->dev, size, &pd->port_rcvegrbuf_phys[e], + gfp_flags); + + if (!pd->port_rcvegrbuf[e]) { + ret = -ENOMEM; + goto bail_rcvegrbuf_phys; + } + } + + pd->port_rcvegr_phys = pd->port_rcvegrbuf_phys[0]; + + for (e = chunk = 0; chunk < pd->port_rcvegrbuf_chunks; chunk++) { + dma_addr_t pa = pd->port_rcvegrbuf_phys[chunk]; + unsigned i; + + for (i = 0; e < egrcnt && i < egrperchunk; e++, i++) { + dd->ipath_f_put_tid(dd, e + egroff + + (u64 __iomem *) + ((char __iomem *) + dd->ipath_kregbase + + dd->ipath_rcvegrbase), + RCVHQ_RCV_TYPE_EAGER, pa); + pa += egrsize; + } + cond_resched(); /* don't hog the cpu */ + } + + ret = 0; + goto bail; + +bail_rcvegrbuf_phys: + for (e = 0; e < pd->port_rcvegrbuf_chunks && + pd->port_rcvegrbuf[e]; e++) { + dma_free_coherent(&dd->pcidev->dev, size, + pd->port_rcvegrbuf[e], + pd->port_rcvegrbuf_phys[e]); + + } + kfree(pd->port_rcvegrbuf_phys); + pd->port_rcvegrbuf_phys = NULL; +bail_rcvegrbuf: + kfree(pd->port_rcvegrbuf); + pd->port_rcvegrbuf = NULL; +bail: + return ret; +} + + +/* common code for the mappings on dma_alloc_coherent mem */ +static int ipath_mmap_mem(struct vm_area_struct *vma, + struct ipath_portdata *pd, unsigned len, int write_ok, + void *kvaddr, char *what) +{ + struct ipath_devdata *dd = pd->port_dd; + unsigned long pfn; + int ret; + + if ((vma->vm_end - vma->vm_start) > len) { + dev_info(&dd->pcidev->dev, + "FAIL on %s: len %lx > %x\n", what, + vma->vm_end - vma->vm_start, len); + ret = -EFAULT; + goto bail; + } + + if (!write_ok) { + if (vma->vm_flags & VM_WRITE) { + dev_info(&dd->pcidev->dev, + "%s must be mapped readonly\n", what); + ret = -EPERM; + goto bail; + } + + /* don't allow them to later change with mprotect */ + vma->vm_flags &= ~VM_MAYWRITE; + } + + pfn = virt_to_phys(kvaddr) >> PAGE_SHIFT; + ret = remap_pfn_range(vma, vma->vm_start, pfn, + len, vma->vm_page_prot); + if (ret) + dev_info(&dd->pcidev->dev, "%s port%u mmap of %lx, %x " + "bytes r%c failed: %d\n", what, pd->port_port, + pfn, len, write_ok?'w':'o', ret); + else + ipath_cdbg(VERBOSE, "%s port%u mmaped %lx, %x bytes " + "r%c\n", what, pd->port_port, pfn, len, + write_ok?'w':'o'); +bail: + return ret; +} + +static int mmap_ureg(struct vm_area_struct *vma, struct ipath_devdata *dd, + u64 ureg) +{ + unsigned long phys; + int ret; + + /* + * This is real hardware, so use io_remap. This is the mechanism + * for the user process to update the head registers for their port + * in the chip. + */ + if ((vma->vm_end - vma->vm_start) > PAGE_SIZE) { + dev_info(&dd->pcidev->dev, "FAIL mmap userreg: reqlen " + "%lx > PAGE\n", vma->vm_end - vma->vm_start); + ret = -EFAULT; + } else { + phys = dd->ipath_physaddr + ureg; + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + + vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND; + ret = io_remap_pfn_range(vma, vma->vm_start, + phys >> PAGE_SHIFT, + vma->vm_end - vma->vm_start, + vma->vm_page_prot); + } + return ret; +} + +static int mmap_piobufs(struct vm_area_struct *vma, + struct ipath_devdata *dd, + struct ipath_portdata *pd, + unsigned piobufs, unsigned piocnt) +{ + unsigned long phys; + int ret; + + /* + * When we map the PIO buffers in the chip, we want to map them as + * writeonly, no read possible. This prevents access to previous + * process data, and catches users who might try to read the i/o + * space due to a bug. + */ + if ((vma->vm_end - vma->vm_start) > (piocnt * dd->ipath_palign)) { + dev_info(&dd->pcidev->dev, "FAIL mmap piobufs: " + "reqlen %lx > PAGE\n", + vma->vm_end - vma->vm_start); + ret = -EINVAL; + goto bail; + } + + phys = dd->ipath_physaddr + piobufs; + +#if defined(__powerpc__) + /* There isn't a generic way to specify writethrough mappings */ + pgprot_val(vma->vm_page_prot) |= _PAGE_NO_CACHE; + pgprot_val(vma->vm_page_prot) |= _PAGE_WRITETHRU; + pgprot_val(vma->vm_page_prot) &= ~_PAGE_GUARDED; +#endif + + /* + * don't allow them to later change to readable with mprotect (for when + * not initially mapped readable, as is normally the case) + */ + vma->vm_flags &= ~VM_MAYREAD; + vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND; + + ret = io_remap_pfn_range(vma, vma->vm_start, phys >> PAGE_SHIFT, + vma->vm_end - vma->vm_start, + vma->vm_page_prot); +bail: + return ret; +} + +static int mmap_rcvegrbufs(struct vm_area_struct *vma, + struct ipath_portdata *pd) +{ + struct ipath_devdata *dd = pd->port_dd; + unsigned long start, size; + size_t total_size, i; + unsigned long pfn; + int ret; + + size = pd->port_rcvegrbuf_size; + total_size = pd->port_rcvegrbuf_chunks * size; + if ((vma->vm_end - vma->vm_start) > total_size) { + dev_info(&dd->pcidev->dev, "FAIL on egr bufs: " + "reqlen %lx > actual %lx\n", + vma->vm_end - vma->vm_start, + (unsigned long) total_size); + ret = -EINVAL; + goto bail; + } + + if (vma->vm_flags & VM_WRITE) { + dev_info(&dd->pcidev->dev, "Can't map eager buffers as " + "writable (flags=%lx)\n", vma->vm_flags); + ret = -EPERM; + goto bail; + } + /* don't allow them to later change to writeable with mprotect */ + vma->vm_flags &= ~VM_MAYWRITE; + + start = vma->vm_start; + + for (i = 0; i < pd->port_rcvegrbuf_chunks; i++, start += size) { + pfn = virt_to_phys(pd->port_rcvegrbuf[i]) >> PAGE_SHIFT; + ret = remap_pfn_range(vma, start, pfn, size, + vma->vm_page_prot); + if (ret < 0) + goto bail; + } + ret = 0; + +bail: + return ret; +} + +/* + * ipath_file_vma_fault - handle a VMA page fault. + */ +static int ipath_file_vma_fault(struct vm_area_struct *vma, + struct vm_fault *vmf) +{ + struct page *page; + + page = vmalloc_to_page((void *)(vmf->pgoff << PAGE_SHIFT)); + if (!page) + return VM_FAULT_SIGBUS; + get_page(page); + vmf->page = page; + + return 0; +} + +static const struct vm_operations_struct ipath_file_vm_ops = { + .fault = ipath_file_vma_fault, +}; + +static int mmap_kvaddr(struct vm_area_struct *vma, u64 pgaddr, + struct ipath_portdata *pd, unsigned subport) +{ + unsigned long len; + struct ipath_devdata *dd; + void *addr; + size_t size; + int ret = 0; + + /* If the port is not shared, all addresses should be physical */ + if (!pd->port_subport_cnt) + goto bail; + + dd = pd->port_dd; + size = pd->port_rcvegrbuf_chunks * pd->port_rcvegrbuf_size; + + /* + * Each process has all the subport uregbase, rcvhdrq, and + * rcvegrbufs mmapped - as an array for all the processes, + * and also separately for this process. + */ + if (pgaddr == cvt_kvaddr(pd->subport_uregbase)) { + addr = pd->subport_uregbase; + size = PAGE_SIZE * pd->port_subport_cnt; + } else if (pgaddr == cvt_kvaddr(pd->subport_rcvhdr_base)) { + addr = pd->subport_rcvhdr_base; + size = pd->port_rcvhdrq_size * pd->port_subport_cnt; + } else if (pgaddr == cvt_kvaddr(pd->subport_rcvegrbuf)) { + addr = pd->subport_rcvegrbuf; + size *= pd->port_subport_cnt; + } else if (pgaddr == cvt_kvaddr(pd->subport_uregbase + + PAGE_SIZE * subport)) { + addr = pd->subport_uregbase + PAGE_SIZE * subport; + size = PAGE_SIZE; + } else if (pgaddr == cvt_kvaddr(pd->subport_rcvhdr_base + + pd->port_rcvhdrq_size * subport)) { + addr = pd->subport_rcvhdr_base + + pd->port_rcvhdrq_size * subport; + size = pd->port_rcvhdrq_size; + } else if (pgaddr == cvt_kvaddr(pd->subport_rcvegrbuf + + size * subport)) { + addr = pd->subport_rcvegrbuf + size * subport; + /* rcvegrbufs are read-only on the slave */ + if (vma->vm_flags & VM_WRITE) { + dev_info(&dd->pcidev->dev, + "Can't map eager buffers as " + "writable (flags=%lx)\n", vma->vm_flags); + ret = -EPERM; + goto bail; + } + /* + * Don't allow permission to later change to writeable + * with mprotect. + */ + vma->vm_flags &= ~VM_MAYWRITE; + } else { + goto bail; + } + len = vma->vm_end - vma->vm_start; + if (len > size) { + ipath_cdbg(MM, "FAIL: reqlen %lx > %zx\n", len, size); + ret = -EINVAL; + goto bail; + } + + vma->vm_pgoff = (unsigned long) addr >> PAGE_SHIFT; + vma->vm_ops = &ipath_file_vm_ops; + vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + ret = 1; + +bail: + return ret; +} + +/** + * ipath_mmap - mmap various structures into user space + * @fp: the file pointer + * @vma: the VM area + * + * We use this to have a shared buffer between the kernel and the user code + * for the rcvhdr queue, egr buffers, and the per-port user regs and pio + * buffers in the chip. We have the open and close entries so we can bump + * the ref count and keep the driver from being unloaded while still mapped. + */ +static int ipath_mmap(struct file *fp, struct vm_area_struct *vma) +{ + struct ipath_portdata *pd; + struct ipath_devdata *dd; + u64 pgaddr, ureg; + unsigned piobufs, piocnt; + int ret; + + pd = port_fp(fp); + if (!pd) { + ret = -EINVAL; + goto bail; + } + dd = pd->port_dd; + + /* + * This is the ipath_do_user_init() code, mapping the shared buffers + * into the user process. The address referred to by vm_pgoff is the + * file offset passed via mmap(). For shared ports, this is the + * kernel vmalloc() address of the pages to share with the master. + * For non-shared or master ports, this is a physical address. + * We only do one mmap for each space mapped. + */ + pgaddr = vma->vm_pgoff << PAGE_SHIFT; + + /* + * Check for 0 in case one of the allocations failed, but user + * called mmap anyway. + */ + if (!pgaddr) { + ret = -EINVAL; + goto bail; + } + + ipath_cdbg(MM, "pgaddr %llx vm_start=%lx len %lx port %u:%u:%u\n", + (unsigned long long) pgaddr, vma->vm_start, + vma->vm_end - vma->vm_start, dd->ipath_unit, + pd->port_port, subport_fp(fp)); + + /* + * Physical addresses must fit in 40 bits for our hardware. + * Check for kernel virtual addresses first, anything else must + * match a HW or memory address. + */ + ret = mmap_kvaddr(vma, pgaddr, pd, subport_fp(fp)); + if (ret) { + if (ret > 0) + ret = 0; + goto bail; + } + + ureg = dd->ipath_uregbase + dd->ipath_ureg_align * pd->port_port; + if (!pd->port_subport_cnt) { + /* port is not shared */ + piocnt = pd->port_piocnt; + piobufs = pd->port_piobufs; + } else if (!subport_fp(fp)) { + /* caller is the master */ + piocnt = (pd->port_piocnt / pd->port_subport_cnt) + + (pd->port_piocnt % pd->port_subport_cnt); + piobufs = pd->port_piobufs + + dd->ipath_palign * (pd->port_piocnt - piocnt); + } else { + unsigned slave = subport_fp(fp) - 1; + + /* caller is a slave */ + piocnt = pd->port_piocnt / pd->port_subport_cnt; + piobufs = pd->port_piobufs + dd->ipath_palign * piocnt * slave; + } + + if (pgaddr == ureg) + ret = mmap_ureg(vma, dd, ureg); + else if (pgaddr == piobufs) + ret = mmap_piobufs(vma, dd, pd, piobufs, piocnt); + else if (pgaddr == dd->ipath_pioavailregs_phys) + /* in-memory copy of pioavail registers */ + ret = ipath_mmap_mem(vma, pd, PAGE_SIZE, 0, + (void *) dd->ipath_pioavailregs_dma, + "pioavail registers"); + else if (pgaddr == pd->port_rcvegr_phys) + ret = mmap_rcvegrbufs(vma, pd); + else if (pgaddr == (u64) pd->port_rcvhdrq_phys) + /* + * The rcvhdrq itself; readonly except on HT (so have + * to allow writable mapping), multiple pages, contiguous + * from an i/o perspective. + */ + ret = ipath_mmap_mem(vma, pd, pd->port_rcvhdrq_size, 1, + pd->port_rcvhdrq, + "rcvhdrq"); + else if (pgaddr == (u64) pd->port_rcvhdrqtailaddr_phys) + /* in-memory copy of rcvhdrq tail register */ + ret = ipath_mmap_mem(vma, pd, PAGE_SIZE, 0, + pd->port_rcvhdrtail_kvaddr, + "rcvhdrq tail"); + else + ret = -EINVAL; + + vma->vm_private_data = NULL; + + if (ret < 0) + dev_info(&dd->pcidev->dev, + "Failure %d on off %llx len %lx\n", + -ret, (unsigned long long)pgaddr, + vma->vm_end - vma->vm_start); +bail: + return ret; +} + +static unsigned ipath_poll_hdrqfull(struct ipath_portdata *pd) +{ + unsigned pollflag = 0; + + if ((pd->poll_type & IPATH_POLL_TYPE_OVERFLOW) && + pd->port_hdrqfull != pd->port_hdrqfull_poll) { + pollflag |= POLLIN | POLLRDNORM; + pd->port_hdrqfull_poll = pd->port_hdrqfull; + } + + return pollflag; +} + +static unsigned int ipath_poll_urgent(struct ipath_portdata *pd, + struct file *fp, + struct poll_table_struct *pt) +{ + unsigned pollflag = 0; + struct ipath_devdata *dd; + + dd = pd->port_dd; + + /* variable access in ipath_poll_hdrqfull() needs this */ + rmb(); + pollflag = ipath_poll_hdrqfull(pd); + + if (pd->port_urgent != pd->port_urgent_poll) { + pollflag |= POLLIN | POLLRDNORM; + pd->port_urgent_poll = pd->port_urgent; + } + + if (!pollflag) { + /* this saves a spin_lock/unlock in interrupt handler... */ + set_bit(IPATH_PORT_WAITING_URG, &pd->port_flag); + /* flush waiting flag so don't miss an event... */ + wmb(); + poll_wait(fp, &pd->port_wait, pt); + } + + return pollflag; +} + +static unsigned int ipath_poll_next(struct ipath_portdata *pd, + struct file *fp, + struct poll_table_struct *pt) +{ + u32 head; + u32 tail; + unsigned pollflag = 0; + struct ipath_devdata *dd; + + dd = pd->port_dd; + + /* variable access in ipath_poll_hdrqfull() needs this */ + rmb(); + pollflag = ipath_poll_hdrqfull(pd); + + head = ipath_read_ureg32(dd, ur_rcvhdrhead, pd->port_port); + if (pd->port_rcvhdrtail_kvaddr) + tail = ipath_get_rcvhdrtail(pd); + else + tail = ipath_read_ureg32(dd, ur_rcvhdrtail, pd->port_port); + + if (head != tail) + pollflag |= POLLIN | POLLRDNORM; + else { + /* this saves a spin_lock/unlock in interrupt handler */ + set_bit(IPATH_PORT_WAITING_RCV, &pd->port_flag); + /* flush waiting flag so we don't miss an event */ + wmb(); + + set_bit(pd->port_port + dd->ipath_r_intravail_shift, + &dd->ipath_rcvctrl); + + ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl, + dd->ipath_rcvctrl); + + if (dd->ipath_rhdrhead_intr_off) /* arm rcv interrupt */ + ipath_write_ureg(dd, ur_rcvhdrhead, + dd->ipath_rhdrhead_intr_off | head, + pd->port_port); + + poll_wait(fp, &pd->port_wait, pt); + } + + return pollflag; +} + +static unsigned int ipath_poll(struct file *fp, + struct poll_table_struct *pt) +{ + struct ipath_portdata *pd; + unsigned pollflag; + + pd = port_fp(fp); + if (!pd) + pollflag = 0; + else if (pd->poll_type & IPATH_POLL_TYPE_URGENT) + pollflag = ipath_poll_urgent(pd, fp, pt); + else + pollflag = ipath_poll_next(pd, fp, pt); + + return pollflag; +} + +static int ipath_supports_subports(int user_swmajor, int user_swminor) +{ + /* no subport implementation prior to software version 1.3 */ + return (user_swmajor > 1) || (user_swminor >= 3); +} + +static int ipath_compatible_subports(int user_swmajor, int user_swminor) +{ + /* this code is written long-hand for clarity */ + if (IPATH_USER_SWMAJOR != user_swmajor) { + /* no promise of compatibility if major mismatch */ + return 0; + } + if (IPATH_USER_SWMAJOR == 1) { + switch (IPATH_USER_SWMINOR) { + case 0: + case 1: + case 2: + /* no subport implementation so cannot be compatible */ + return 0; + case 3: + /* 3 is only compatible with itself */ + return user_swminor == 3; + default: + /* >= 4 are compatible (or are expected to be) */ + return user_swminor >= 4; + } + } + /* make no promises yet for future major versions */ + return 0; +} + +static int init_subports(struct ipath_devdata *dd, + struct ipath_portdata *pd, + const struct ipath_user_info *uinfo) +{ + int ret = 0; + unsigned num_subports; + size_t size; + + /* + * If the user is requesting zero subports, + * skip the subport allocation. + */ + if (uinfo->spu_subport_cnt <= 0) + goto bail; + + /* Self-consistency check for ipath_compatible_subports() */ + if (ipath_supports_subports(IPATH_USER_SWMAJOR, IPATH_USER_SWMINOR) && + !ipath_compatible_subports(IPATH_USER_SWMAJOR, + IPATH_USER_SWMINOR)) { + dev_info(&dd->pcidev->dev, + "Inconsistent ipath_compatible_subports()\n"); + goto bail; + } + + /* Check for subport compatibility */ + if (!ipath_compatible_subports(uinfo->spu_userversion >> 16, + uinfo->spu_userversion & 0xffff)) { + dev_info(&dd->pcidev->dev, + "Mismatched user version (%d.%d) and driver " + "version (%d.%d) while port sharing. Ensure " + "that driver and library are from the same " + "release.\n", + (int) (uinfo->spu_userversion >> 16), + (int) (uinfo->spu_userversion & 0xffff), + IPATH_USER_SWMAJOR, + IPATH_USER_SWMINOR); + goto bail; + } + if (uinfo->spu_subport_cnt > INFINIPATH_MAX_SUBPORT) { + ret = -EINVAL; + goto bail; + } + + num_subports = uinfo->spu_subport_cnt; + pd->subport_uregbase = vzalloc(PAGE_SIZE * num_subports); + if (!pd->subport_uregbase) { + ret = -ENOMEM; + goto bail; + } + /* Note: pd->port_rcvhdrq_size isn't initialized yet. */ + size = ALIGN(dd->ipath_rcvhdrcnt * dd->ipath_rcvhdrentsize * + sizeof(u32), PAGE_SIZE) * num_subports; + pd->subport_rcvhdr_base = vzalloc(size); + if (!pd->subport_rcvhdr_base) { + ret = -ENOMEM; + goto bail_ureg; + } + + pd->subport_rcvegrbuf = vzalloc(pd->port_rcvegrbuf_chunks * + pd->port_rcvegrbuf_size * + num_subports); + if (!pd->subport_rcvegrbuf) { + ret = -ENOMEM; + goto bail_rhdr; + } + + pd->port_subport_cnt = uinfo->spu_subport_cnt; + pd->port_subport_id = uinfo->spu_subport_id; + pd->active_slaves = 1; + set_bit(IPATH_PORT_MASTER_UNINIT, &pd->port_flag); + goto bail; + +bail_rhdr: + vfree(pd->subport_rcvhdr_base); +bail_ureg: + vfree(pd->subport_uregbase); + pd->subport_uregbase = NULL; +bail: + return ret; +} + +static int try_alloc_port(struct ipath_devdata *dd, int port, + struct file *fp, + const struct ipath_user_info *uinfo) +{ + struct ipath_portdata *pd; + int ret; + + if (!(pd = dd->ipath_pd[port])) { + void *ptmp; + + pd = kzalloc(sizeof(struct ipath_portdata), GFP_KERNEL); + + /* + * Allocate memory for use in ipath_tid_update() just once + * at open, not per call. Reduces cost of expected send + * setup. + */ + ptmp = kmalloc(dd->ipath_rcvtidcnt * sizeof(u16) + + dd->ipath_rcvtidcnt * sizeof(struct page **), + GFP_KERNEL); + if (!pd || !ptmp) { + ipath_dev_err(dd, "Unable to allocate portdata " + "memory, failing open\n"); + ret = -ENOMEM; + kfree(pd); + kfree(ptmp); + goto bail; + } + dd->ipath_pd[port] = pd; + dd->ipath_pd[port]->port_port = port; + dd->ipath_pd[port]->port_dd = dd; + dd->ipath_pd[port]->port_tid_pg_list = ptmp; + init_waitqueue_head(&dd->ipath_pd[port]->port_wait); + } + if (!pd->port_cnt) { + pd->userversion = uinfo->spu_userversion; + init_user_egr_sizes(pd); + if ((ret = init_subports(dd, pd, uinfo)) != 0) + goto bail; + ipath_cdbg(PROC, "%s[%u] opened unit:port %u:%u\n", + current->comm, current->pid, dd->ipath_unit, + port); + pd->port_cnt = 1; + port_fp(fp) = pd; + pd->port_pid = get_pid(task_pid(current)); + strlcpy(pd->port_comm, current->comm, sizeof(pd->port_comm)); + ipath_stats.sps_ports++; + ret = 0; + } else + ret = -EBUSY; + +bail: + return ret; +} + +static inline int usable(struct ipath_devdata *dd) +{ + return dd && + (dd->ipath_flags & IPATH_PRESENT) && + dd->ipath_kregbase && + dd->ipath_lid && + !(dd->ipath_flags & (IPATH_LINKDOWN | IPATH_DISABLED + | IPATH_LINKUNK)); +} + +static int find_free_port(int unit, struct file *fp, + const struct ipath_user_info *uinfo) +{ + struct ipath_devdata *dd = ipath_lookup(unit); + int ret, i; + + if (!dd) { + ret = -ENODEV; + goto bail; + } + + if (!usable(dd)) { + ret = -ENETDOWN; + goto bail; + } + + for (i = 1; i < dd->ipath_cfgports; i++) { + ret = try_alloc_port(dd, i, fp, uinfo); + if (ret != -EBUSY) + goto bail; + } + ret = -EBUSY; + +bail: + return ret; +} + +static int find_best_unit(struct file *fp, + const struct ipath_user_info *uinfo) +{ + int ret = 0, i, prefunit = -1, devmax; + int maxofallports, npresent, nup; + int ndev; + + devmax = ipath_count_units(&npresent, &nup, &maxofallports); + + /* + * This code is present to allow a knowledgeable person to + * specify the layout of processes to processors before opening + * this driver, and then we'll assign the process to the "closest" + * InfiniPath chip to that processor (we assume reasonable connectivity, + * for now). This code assumes that if affinity has been set + * before this point, that at most one cpu is set; for now this + * is reasonable. I check for both cpumask_empty() and cpumask_full(), + * in case some kernel variant sets none of the bits when no + * affinity is set. 2.6.11 and 12 kernels have all present + * cpus set. Some day we'll have to fix it up further to handle + * a cpu subset. This algorithm fails for two HT chips connected + * in tunnel fashion. Eventually this needs real topology + * information. There may be some issues with dual core numbering + * as well. This needs more work prior to release. + */ + if (!cpumask_empty(tsk_cpus_allowed(current)) && + !cpumask_full(tsk_cpus_allowed(current))) { + int ncpus = num_online_cpus(), curcpu = -1, nset = 0; + get_online_cpus(); + for_each_online_cpu(i) + if (cpumask_test_cpu(i, tsk_cpus_allowed(current))) { + ipath_cdbg(PROC, "%s[%u] affinity set for " + "cpu %d/%d\n", current->comm, + current->pid, i, ncpus); + curcpu = i; + nset++; + } + put_online_cpus(); + if (curcpu != -1 && nset != ncpus) { + if (npresent) { + prefunit = curcpu / (ncpus / npresent); + ipath_cdbg(PROC,"%s[%u] %d chips, %d cpus, " + "%d cpus/chip, select unit %d\n", + current->comm, current->pid, + npresent, ncpus, ncpus / npresent, + prefunit); + } + } + } + + /* + * user ports start at 1, kernel port is 0 + * For now, we do round-robin access across all chips + */ + + if (prefunit != -1) + devmax = prefunit + 1; +recheck: + for (i = 1; i < maxofallports; i++) { + for (ndev = prefunit != -1 ? prefunit : 0; ndev < devmax; + ndev++) { + struct ipath_devdata *dd = ipath_lookup(ndev); + + if (!usable(dd)) + continue; /* can't use this unit */ + if (i >= dd->ipath_cfgports) + /* + * Maxed out on users of this unit. Try + * next. + */ + continue; + ret = try_alloc_port(dd, i, fp, uinfo); + if (!ret) + goto done; + } + } + + if (npresent) { + if (nup == 0) { + ret = -ENETDOWN; + ipath_dbg("No ports available (none initialized " + "and ready)\n"); + } else { + if (prefunit > 0) { + /* if started above 0, retry from 0 */ + ipath_cdbg(PROC, + "%s[%u] no ports on prefunit " + "%d, clear and re-check\n", + current->comm, current->pid, + prefunit); + devmax = ipath_count_units(NULL, NULL, + NULL); + prefunit = -1; + goto recheck; + } + ret = -EBUSY; + ipath_dbg("No ports available\n"); + } + } else { + ret = -ENXIO; + ipath_dbg("No boards found\n"); + } + +done: + return ret; +} + +static int find_shared_port(struct file *fp, + const struct ipath_user_info *uinfo) +{ + int devmax, ndev, i; + int ret = 0; + + devmax = ipath_count_units(NULL, NULL, NULL); + + for (ndev = 0; ndev < devmax; ndev++) { + struct ipath_devdata *dd = ipath_lookup(ndev); + + if (!usable(dd)) + continue; + for (i = 1; i < dd->ipath_cfgports; i++) { + struct ipath_portdata *pd = dd->ipath_pd[i]; + + /* Skip ports which are not yet open */ + if (!pd || !pd->port_cnt) + continue; + /* Skip port if it doesn't match the requested one */ + if (pd->port_subport_id != uinfo->spu_subport_id) + continue; + /* Verify the sharing process matches the master */ + if (pd->port_subport_cnt != uinfo->spu_subport_cnt || + pd->userversion != uinfo->spu_userversion || + pd->port_cnt >= pd->port_subport_cnt) { + ret = -EINVAL; + goto done; + } + port_fp(fp) = pd; + subport_fp(fp) = pd->port_cnt++; + pd->port_subpid[subport_fp(fp)] = + get_pid(task_pid(current)); + tidcursor_fp(fp) = 0; + pd->active_slaves |= 1 << subport_fp(fp); + ipath_cdbg(PROC, + "%s[%u] %u sharing %s[%u] unit:port %u:%u\n", + current->comm, current->pid, + subport_fp(fp), + pd->port_comm, pid_nr(pd->port_pid), + dd->ipath_unit, pd->port_port); + ret = 1; + goto done; + } + } + +done: + return ret; +} + +static int ipath_open(struct inode *in, struct file *fp) +{ + /* The real work is performed later in ipath_assign_port() */ + fp->private_data = kzalloc(sizeof(struct ipath_filedata), GFP_KERNEL); + return fp->private_data ? 0 : -ENOMEM; +} + +/* Get port early, so can set affinity prior to memory allocation */ +static int ipath_assign_port(struct file *fp, + const struct ipath_user_info *uinfo) +{ + int ret; + int i_minor; + unsigned swmajor, swminor; + + /* Check to be sure we haven't already initialized this file */ + if (port_fp(fp)) { + ret = -EINVAL; + goto done; + } + + /* for now, if major version is different, bail */ + swmajor = uinfo->spu_userversion >> 16; + if (swmajor != IPATH_USER_SWMAJOR) { + ipath_dbg("User major version %d not same as driver " + "major %d\n", uinfo->spu_userversion >> 16, + IPATH_USER_SWMAJOR); + ret = -ENODEV; + goto done; + } + + swminor = uinfo->spu_userversion & 0xffff; + if (swminor != IPATH_USER_SWMINOR) + ipath_dbg("User minor version %d not same as driver " + "minor %d\n", swminor, IPATH_USER_SWMINOR); + + mutex_lock(&ipath_mutex); + + if (ipath_compatible_subports(swmajor, swminor) && + uinfo->spu_subport_cnt && + (ret = find_shared_port(fp, uinfo))) { + if (ret > 0) + ret = 0; + goto done_chk_sdma; + } + + i_minor = iminor(file_inode(fp)) - IPATH_USER_MINOR_BASE; + ipath_cdbg(VERBOSE, "open on dev %lx (minor %d)\n", + (long)file_inode(fp)->i_rdev, i_minor); + + if (i_minor) + ret = find_free_port(i_minor - 1, fp, uinfo); + else + ret = find_best_unit(fp, uinfo); + +done_chk_sdma: + if (!ret) { + struct ipath_filedata *fd = fp->private_data; + const struct ipath_portdata *pd = fd->pd; + const struct ipath_devdata *dd = pd->port_dd; + + fd->pq = ipath_user_sdma_queue_create(&dd->pcidev->dev, + dd->ipath_unit, + pd->port_port, + fd->subport); + + if (!fd->pq) + ret = -ENOMEM; + } + + mutex_unlock(&ipath_mutex); + +done: + return ret; +} + + +static int ipath_do_user_init(struct file *fp, + const struct ipath_user_info *uinfo) +{ + int ret; + struct ipath_portdata *pd = port_fp(fp); + struct ipath_devdata *dd; + u32 head32; + + /* Subports don't need to initialize anything since master did it. */ + if (subport_fp(fp)) { + ret = wait_event_interruptible(pd->port_wait, + !test_bit(IPATH_PORT_MASTER_UNINIT, &pd->port_flag)); + goto done; + } + + dd = pd->port_dd; + + if (uinfo->spu_rcvhdrsize) { + ret = ipath_setrcvhdrsize(dd, uinfo->spu_rcvhdrsize); + if (ret) + goto done; + } + + /* for now we do nothing with rcvhdrcnt: uinfo->spu_rcvhdrcnt */ + + /* some ports may get extra buffers, calculate that here */ + if (pd->port_port <= dd->ipath_ports_extrabuf) + pd->port_piocnt = dd->ipath_pbufsport + 1; + else + pd->port_piocnt = dd->ipath_pbufsport; + + /* for right now, kernel piobufs are at end, so port 1 is at 0 */ + if (pd->port_port <= dd->ipath_ports_extrabuf) + pd->port_pio_base = (dd->ipath_pbufsport + 1) + * (pd->port_port - 1); + else + pd->port_pio_base = dd->ipath_ports_extrabuf + + dd->ipath_pbufsport * (pd->port_port - 1); + pd->port_piobufs = dd->ipath_piobufbase + + pd->port_pio_base * dd->ipath_palign; + ipath_cdbg(VERBOSE, "piobuf base for port %u is 0x%x, piocnt %u," + " first pio %u\n", pd->port_port, pd->port_piobufs, + pd->port_piocnt, pd->port_pio_base); + ipath_chg_pioavailkernel(dd, pd->port_pio_base, pd->port_piocnt, 0); + + /* + * Now allocate the rcvhdr Q and eager TIDs; skip the TID + * array for time being. If pd->port_port > chip-supported, + * we need to do extra stuff here to handle by handling overflow + * through port 0, someday + */ + ret = ipath_create_rcvhdrq(dd, pd); + if (!ret) + ret = ipath_create_user_egr(pd); + if (ret) + goto done; + + /* + * set the eager head register for this port to the current values + * of the tail pointers, since we don't know if they were + * updated on last use of the port. + */ + head32 = ipath_read_ureg32(dd, ur_rcvegrindextail, pd->port_port); + ipath_write_ureg(dd, ur_rcvegrindexhead, head32, pd->port_port); + pd->port_lastrcvhdrqtail = -1; + ipath_cdbg(VERBOSE, "Wrote port%d egrhead %x from tail regs\n", + pd->port_port, head32); + pd->port_tidcursor = 0; /* start at beginning after open */ + + /* initialize poll variables... */ + pd->port_urgent = 0; + pd->port_urgent_poll = 0; + pd->port_hdrqfull_poll = pd->port_hdrqfull; + + /* + * Now enable the port for receive. + * For chips that are set to DMA the tail register to memory + * when they change (and when the update bit transitions from + * 0 to 1. So for those chips, we turn it off and then back on. + * This will (very briefly) affect any other open ports, but the + * duration is very short, and therefore isn't an issue. We + * explicitly set the in-memory tail copy to 0 beforehand, so we + * don't have to wait to be sure the DMA update has happened + * (chip resets head/tail to 0 on transition to enable). + */ + set_bit(dd->ipath_r_portenable_shift + pd->port_port, + &dd->ipath_rcvctrl); + if (!(dd->ipath_flags & IPATH_NODMA_RTAIL)) { + if (pd->port_rcvhdrtail_kvaddr) + ipath_clear_rcvhdrtail(pd); + ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl, + dd->ipath_rcvctrl & + ~(1ULL << dd->ipath_r_tailupd_shift)); + } + ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl, + dd->ipath_rcvctrl); + /* Notify any waiting slaves */ + if (pd->port_subport_cnt) { + clear_bit(IPATH_PORT_MASTER_UNINIT, &pd->port_flag); + wake_up(&pd->port_wait); + } +done: + return ret; +} + +/** + * unlock_exptid - unlock any expected TID entries port still had in use + * @pd: port + * + * We don't actually update the chip here, because we do a bulk update + * below, using ipath_f_clear_tids. + */ +static void unlock_expected_tids(struct ipath_portdata *pd) +{ + struct ipath_devdata *dd = pd->port_dd; + int port_tidbase = pd->port_port * dd->ipath_rcvtidcnt; + int i, cnt = 0, maxtid = port_tidbase + dd->ipath_rcvtidcnt; + + ipath_cdbg(VERBOSE, "Port %u unlocking any locked expTID pages\n", + pd->port_port); + for (i = port_tidbase; i < maxtid; i++) { + struct page *ps = dd->ipath_pageshadow[i]; + + if (!ps) + continue; + + dd->ipath_pageshadow[i] = NULL; + pci_unmap_page(dd->pcidev, dd->ipath_physshadow[i], + PAGE_SIZE, PCI_DMA_FROMDEVICE); + ipath_release_user_pages_on_close(&ps, 1); + cnt++; + ipath_stats.sps_pageunlocks++; + } + if (cnt) + ipath_cdbg(VERBOSE, "Port %u locked %u expTID entries\n", + pd->port_port, cnt); + + if (ipath_stats.sps_pagelocks || ipath_stats.sps_pageunlocks) + ipath_cdbg(VERBOSE, "%llu pages locked, %llu unlocked\n", + (unsigned long long) ipath_stats.sps_pagelocks, + (unsigned long long) + ipath_stats.sps_pageunlocks); +} + +static int ipath_close(struct inode *in, struct file *fp) +{ + int ret = 0; + struct ipath_filedata *fd; + struct ipath_portdata *pd; + struct ipath_devdata *dd; + unsigned long flags; + unsigned port; + struct pid *pid; + + ipath_cdbg(VERBOSE, "close on dev %lx, private data %p\n", + (long)in->i_rdev, fp->private_data); + + mutex_lock(&ipath_mutex); + + fd = fp->private_data; + fp->private_data = NULL; + pd = fd->pd; + if (!pd) { + mutex_unlock(&ipath_mutex); + goto bail; + } + + dd = pd->port_dd; + + /* drain user sdma queue */ + ipath_user_sdma_queue_drain(dd, fd->pq); + ipath_user_sdma_queue_destroy(fd->pq); + + if (--pd->port_cnt) { + /* + * XXX If the master closes the port before the slave(s), + * revoke the mmap for the eager receive queue so + * the slave(s) don't wait for receive data forever. + */ + pd->active_slaves &= ~(1 << fd->subport); + put_pid(pd->port_subpid[fd->subport]); + pd->port_subpid[fd->subport] = NULL; + mutex_unlock(&ipath_mutex); + goto bail; + } + /* early; no interrupt users after this */ + spin_lock_irqsave(&dd->ipath_uctxt_lock, flags); + port = pd->port_port; + dd->ipath_pd[port] = NULL; + pid = pd->port_pid; + pd->port_pid = NULL; + spin_unlock_irqrestore(&dd->ipath_uctxt_lock, flags); + + if (pd->port_rcvwait_to || pd->port_piowait_to + || pd->port_rcvnowait || pd->port_pionowait) { + ipath_cdbg(VERBOSE, "port%u, %u rcv, %u pio wait timeo; " + "%u rcv %u, pio already\n", + pd->port_port, pd->port_rcvwait_to, + pd->port_piowait_to, pd->port_rcvnowait, + pd->port_pionowait); + pd->port_rcvwait_to = pd->port_piowait_to = + pd->port_rcvnowait = pd->port_pionowait = 0; + } + if (pd->port_flag) { + ipath_cdbg(PROC, "port %u port_flag set: 0x%lx\n", + pd->port_port, pd->port_flag); + pd->port_flag = 0; + } + + if (dd->ipath_kregbase) { + /* atomically clear receive enable port and intr avail. */ + clear_bit(dd->ipath_r_portenable_shift + port, + &dd->ipath_rcvctrl); + clear_bit(pd->port_port + dd->ipath_r_intravail_shift, + &dd->ipath_rcvctrl); + ipath_write_kreg( dd, dd->ipath_kregs->kr_rcvctrl, + dd->ipath_rcvctrl); + /* and read back from chip to be sure that nothing + * else is in flight when we do the rest */ + (void)ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); + + /* clean up the pkeys for this port user */ + ipath_clean_part_key(pd, dd); + /* + * be paranoid, and never write 0's to these, just use an + * unused part of the port 0 tail page. Of course, + * rcvhdraddr points to a large chunk of memory, so this + * could still trash things, but at least it won't trash + * page 0, and by disabling the port, it should stop "soon", + * even if a packet or two is in already in flight after we + * disabled the port. + */ + ipath_write_kreg_port(dd, + dd->ipath_kregs->kr_rcvhdrtailaddr, port, + dd->ipath_dummy_hdrq_phys); + ipath_write_kreg_port(dd, dd->ipath_kregs->kr_rcvhdraddr, + pd->port_port, dd->ipath_dummy_hdrq_phys); + + ipath_disarm_piobufs(dd, pd->port_pio_base, pd->port_piocnt); + ipath_chg_pioavailkernel(dd, pd->port_pio_base, + pd->port_piocnt, 1); + + dd->ipath_f_clear_tids(dd, pd->port_port); + + if (dd->ipath_pageshadow) + unlock_expected_tids(pd); + ipath_stats.sps_ports--; + ipath_cdbg(PROC, "%s[%u] closed port %u:%u\n", + pd->port_comm, pid_nr(pid), + dd->ipath_unit, port); + } + + put_pid(pid); + mutex_unlock(&ipath_mutex); + ipath_free_pddata(dd, pd); /* after releasing the mutex */ + +bail: + kfree(fd); + return ret; +} + +static int ipath_port_info(struct ipath_portdata *pd, u16 subport, + struct ipath_port_info __user *uinfo) +{ + struct ipath_port_info info; + int nup; + int ret; + size_t sz; + + (void) ipath_count_units(NULL, &nup, NULL); + info.num_active = nup; + info.unit = pd->port_dd->ipath_unit; + info.port = pd->port_port; + info.subport = subport; + /* Don't return new fields if old library opened the port. */ + if (ipath_supports_subports(pd->userversion >> 16, + pd->userversion & 0xffff)) { + /* Number of user ports available for this device. */ + info.num_ports = pd->port_dd->ipath_cfgports - 1; + info.num_subports = pd->port_subport_cnt; + sz = sizeof(info); + } else + sz = sizeof(info) - 2 * sizeof(u16); + + if (copy_to_user(uinfo, &info, sz)) { + ret = -EFAULT; + goto bail; + } + ret = 0; + +bail: + return ret; +} + +static int ipath_get_slave_info(struct ipath_portdata *pd, + void __user *slave_mask_addr) +{ + int ret = 0; + + if (copy_to_user(slave_mask_addr, &pd->active_slaves, sizeof(u32))) + ret = -EFAULT; + return ret; +} + +static int ipath_sdma_get_inflight(struct ipath_user_sdma_queue *pq, + u32 __user *inflightp) +{ + const u32 val = ipath_user_sdma_inflight_counter(pq); + + if (put_user(val, inflightp)) + return -EFAULT; + + return 0; +} + +static int ipath_sdma_get_complete(struct ipath_devdata *dd, + struct ipath_user_sdma_queue *pq, + u32 __user *completep) +{ + u32 val; + int err; + + err = ipath_user_sdma_make_progress(dd, pq); + if (err < 0) + return err; + + val = ipath_user_sdma_complete_counter(pq); + if (put_user(val, completep)) + return -EFAULT; + + return 0; +} + +static ssize_t ipath_write(struct file *fp, const char __user *data, + size_t count, loff_t *off) +{ + const struct ipath_cmd __user *ucmd; + struct ipath_portdata *pd; + const void __user *src; + size_t consumed, copy; + struct ipath_cmd cmd; + ssize_t ret = 0; + void *dest; + + if (count < sizeof(cmd.type)) { + ret = -EINVAL; + goto bail; + } + + ucmd = (const struct ipath_cmd __user *) data; + + if (copy_from_user(&cmd.type, &ucmd->type, sizeof(cmd.type))) { + ret = -EFAULT; + goto bail; + } + + consumed = sizeof(cmd.type); + + switch (cmd.type) { + case IPATH_CMD_ASSIGN_PORT: + case __IPATH_CMD_USER_INIT: + case IPATH_CMD_USER_INIT: + copy = sizeof(cmd.cmd.user_info); + dest = &cmd.cmd.user_info; + src = &ucmd->cmd.user_info; + break; + case IPATH_CMD_RECV_CTRL: + copy = sizeof(cmd.cmd.recv_ctrl); + dest = &cmd.cmd.recv_ctrl; + src = &ucmd->cmd.recv_ctrl; + break; + case IPATH_CMD_PORT_INFO: + copy = sizeof(cmd.cmd.port_info); + dest = &cmd.cmd.port_info; + src = &ucmd->cmd.port_info; + break; + case IPATH_CMD_TID_UPDATE: + case IPATH_CMD_TID_FREE: + copy = sizeof(cmd.cmd.tid_info); + dest = &cmd.cmd.tid_info; + src = &ucmd->cmd.tid_info; + break; + case IPATH_CMD_SET_PART_KEY: + copy = sizeof(cmd.cmd.part_key); + dest = &cmd.cmd.part_key; + src = &ucmd->cmd.part_key; + break; + case __IPATH_CMD_SLAVE_INFO: + copy = sizeof(cmd.cmd.slave_mask_addr); + dest = &cmd.cmd.slave_mask_addr; + src = &ucmd->cmd.slave_mask_addr; + break; + case IPATH_CMD_PIOAVAILUPD: // force an update of PIOAvail reg + copy = 0; + src = NULL; + dest = NULL; + break; + case IPATH_CMD_POLL_TYPE: + copy = sizeof(cmd.cmd.poll_type); + dest = &cmd.cmd.poll_type; + src = &ucmd->cmd.poll_type; + break; + case IPATH_CMD_ARMLAUNCH_CTRL: + copy = sizeof(cmd.cmd.armlaunch_ctrl); + dest = &cmd.cmd.armlaunch_ctrl; + src = &ucmd->cmd.armlaunch_ctrl; + break; + case IPATH_CMD_SDMA_INFLIGHT: + copy = sizeof(cmd.cmd.sdma_inflight); + dest = &cmd.cmd.sdma_inflight; + src = &ucmd->cmd.sdma_inflight; + break; + case IPATH_CMD_SDMA_COMPLETE: + copy = sizeof(cmd.cmd.sdma_complete); + dest = &cmd.cmd.sdma_complete; + src = &ucmd->cmd.sdma_complete; + break; + default: + ret = -EINVAL; + goto bail; + } + + if (copy) { + if ((count - consumed) < copy) { + ret = -EINVAL; + goto bail; + } + + if (copy_from_user(dest, src, copy)) { + ret = -EFAULT; + goto bail; + } + + consumed += copy; + } + + pd = port_fp(fp); + if (!pd && cmd.type != __IPATH_CMD_USER_INIT && + cmd.type != IPATH_CMD_ASSIGN_PORT) { + ret = -EINVAL; + goto bail; + } + + switch (cmd.type) { + case IPATH_CMD_ASSIGN_PORT: + ret = ipath_assign_port(fp, &cmd.cmd.user_info); + if (ret) + goto bail; + break; + case __IPATH_CMD_USER_INIT: + /* backwards compatibility, get port first */ + ret = ipath_assign_port(fp, &cmd.cmd.user_info); + if (ret) + goto bail; + /* and fall through to current version. */ + case IPATH_CMD_USER_INIT: + ret = ipath_do_user_init(fp, &cmd.cmd.user_info); + if (ret) + goto bail; + ret = ipath_get_base_info( + fp, (void __user *) (unsigned long) + cmd.cmd.user_info.spu_base_info, + cmd.cmd.user_info.spu_base_info_size); + break; + case IPATH_CMD_RECV_CTRL: + ret = ipath_manage_rcvq(pd, subport_fp(fp), cmd.cmd.recv_ctrl); + break; + case IPATH_CMD_PORT_INFO: + ret = ipath_port_info(pd, subport_fp(fp), + (struct ipath_port_info __user *) + (unsigned long) cmd.cmd.port_info); + break; + case IPATH_CMD_TID_UPDATE: + ret = ipath_tid_update(pd, fp, &cmd.cmd.tid_info); + break; + case IPATH_CMD_TID_FREE: + ret = ipath_tid_free(pd, subport_fp(fp), &cmd.cmd.tid_info); + break; + case IPATH_CMD_SET_PART_KEY: + ret = ipath_set_part_key(pd, cmd.cmd.part_key); + break; + case __IPATH_CMD_SLAVE_INFO: + ret = ipath_get_slave_info(pd, + (void __user *) (unsigned long) + cmd.cmd.slave_mask_addr); + break; + case IPATH_CMD_PIOAVAILUPD: + ipath_force_pio_avail_update(pd->port_dd); + break; + case IPATH_CMD_POLL_TYPE: + pd->poll_type = cmd.cmd.poll_type; + break; + case IPATH_CMD_ARMLAUNCH_CTRL: + if (cmd.cmd.armlaunch_ctrl) + ipath_enable_armlaunch(pd->port_dd); + else + ipath_disable_armlaunch(pd->port_dd); + break; + case IPATH_CMD_SDMA_INFLIGHT: + ret = ipath_sdma_get_inflight(user_sdma_queue_fp(fp), + (u32 __user *) (unsigned long) + cmd.cmd.sdma_inflight); + break; + case IPATH_CMD_SDMA_COMPLETE: + ret = ipath_sdma_get_complete(pd->port_dd, + user_sdma_queue_fp(fp), + (u32 __user *) (unsigned long) + cmd.cmd.sdma_complete); + break; + } + + if (ret >= 0) + ret = consumed; + +bail: + return ret; +} + +static ssize_t ipath_writev(struct kiocb *iocb, const struct iovec *iov, + unsigned long dim, loff_t off) +{ + struct file *filp = iocb->ki_filp; + struct ipath_filedata *fp = filp->private_data; + struct ipath_portdata *pd = port_fp(filp); + struct ipath_user_sdma_queue *pq = fp->pq; + + if (!dim) + return -EINVAL; + + return ipath_user_sdma_writev(pd->port_dd, pq, iov, dim); +} + +static struct class *ipath_class; + +static int init_cdev(int minor, char *name, const struct file_operations *fops, + struct cdev **cdevp, struct device **devp) +{ + const dev_t dev = MKDEV(IPATH_MAJOR, minor); + struct cdev *cdev = NULL; + struct device *device = NULL; + int ret; + + cdev = cdev_alloc(); + if (!cdev) { + printk(KERN_ERR IPATH_DRV_NAME + ": Could not allocate cdev for minor %d, %s\n", + minor, name); + ret = -ENOMEM; + goto done; + } + + cdev->owner = THIS_MODULE; + cdev->ops = fops; + kobject_set_name(&cdev->kobj, name); + + ret = cdev_add(cdev, dev, 1); + if (ret < 0) { + printk(KERN_ERR IPATH_DRV_NAME + ": Could not add cdev for minor %d, %s (err %d)\n", + minor, name, -ret); + goto err_cdev; + } + + device = device_create(ipath_class, NULL, dev, NULL, name); + + if (IS_ERR(device)) { + ret = PTR_ERR(device); + printk(KERN_ERR IPATH_DRV_NAME ": Could not create " + "device for minor %d, %s (err %d)\n", + minor, name, -ret); + goto err_cdev; + } + + goto done; + +err_cdev: + cdev_del(cdev); + cdev = NULL; + +done: + if (ret >= 0) { + *cdevp = cdev; + *devp = device; + } else { + *cdevp = NULL; + *devp = NULL; + } + + return ret; +} + +int ipath_cdev_init(int minor, char *name, const struct file_operations *fops, + struct cdev **cdevp, struct device **devp) +{ + return init_cdev(minor, name, fops, cdevp, devp); +} + +static void cleanup_cdev(struct cdev **cdevp, + struct device **devp) +{ + struct device *dev = *devp; + + if (dev) { + device_unregister(dev); + *devp = NULL; + } + + if (*cdevp) { + cdev_del(*cdevp); + *cdevp = NULL; + } +} + +void ipath_cdev_cleanup(struct cdev **cdevp, + struct device **devp) +{ + cleanup_cdev(cdevp, devp); +} + +static struct cdev *wildcard_cdev; +static struct device *wildcard_dev; + +static const dev_t dev = MKDEV(IPATH_MAJOR, 0); + +static int user_init(void) +{ + int ret; + + ret = register_chrdev_region(dev, IPATH_NMINORS, IPATH_DRV_NAME); + if (ret < 0) { + printk(KERN_ERR IPATH_DRV_NAME ": Could not register " + "chrdev region (err %d)\n", -ret); + goto done; + } + + ipath_class = class_create(THIS_MODULE, IPATH_DRV_NAME); + + if (IS_ERR(ipath_class)) { + ret = PTR_ERR(ipath_class); + printk(KERN_ERR IPATH_DRV_NAME ": Could not create " + "device class (err %d)\n", -ret); + goto bail; + } + + goto done; +bail: + unregister_chrdev_region(dev, IPATH_NMINORS); +done: + return ret; +} + +static void user_cleanup(void) +{ + if (ipath_class) { + class_destroy(ipath_class); + ipath_class = NULL; + } + + unregister_chrdev_region(dev, IPATH_NMINORS); +} + +static atomic_t user_count = ATOMIC_INIT(0); +static atomic_t user_setup = ATOMIC_INIT(0); + +int ipath_user_add(struct ipath_devdata *dd) +{ + char name[10]; + int ret; + + if (atomic_inc_return(&user_count) == 1) { + ret = user_init(); + if (ret < 0) { + ipath_dev_err(dd, "Unable to set up user support: " + "error %d\n", -ret); + goto bail; + } + ret = init_cdev(0, "ipath", &ipath_file_ops, &wildcard_cdev, + &wildcard_dev); + if (ret < 0) { + ipath_dev_err(dd, "Could not create wildcard " + "minor: error %d\n", -ret); + goto bail_user; + } + + atomic_set(&user_setup, 1); + } + + snprintf(name, sizeof(name), "ipath%d", dd->ipath_unit); + + ret = init_cdev(dd->ipath_unit + 1, name, &ipath_file_ops, + &dd->user_cdev, &dd->user_dev); + if (ret < 0) + ipath_dev_err(dd, "Could not create user minor %d, %s\n", + dd->ipath_unit + 1, name); + + goto bail; + +bail_user: + user_cleanup(); +bail: + return ret; +} + +void ipath_user_remove(struct ipath_devdata *dd) +{ + cleanup_cdev(&dd->user_cdev, &dd->user_dev); + + if (atomic_dec_return(&user_count) == 0) { + if (atomic_read(&user_setup) == 0) + goto bail; + + cleanup_cdev(&wildcard_cdev, &wildcard_dev); + user_cleanup(); + + atomic_set(&user_setup, 0); + } +bail: + return; +} diff --git a/drivers/infiniband/hw/ipath/ipath_fs.c b/drivers/infiniband/hw/ipath/ipath_fs.c new file mode 100644 index 0000000..4977082 --- /dev/null +++ b/drivers/infiniband/hw/ipath/ipath_fs.c @@ -0,0 +1,422 @@ +/* + * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved. + * Copyright (c) 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "ipath_kernel.h" + +#define IPATHFS_MAGIC 0x726a77 + +static struct super_block *ipath_super; + +static int ipathfs_mknod(struct inode *dir, struct dentry *dentry, + umode_t mode, const struct file_operations *fops, + void *data) +{ + int error; + struct inode *inode = new_inode(dir->i_sb); + + if (!inode) { + error = -EPERM; + goto bail; + } + + inode->i_ino = get_next_ino(); + inode->i_mode = mode; + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + inode->i_private = data; + if (S_ISDIR(mode)) { + inode->i_op = &simple_dir_inode_operations; + inc_nlink(inode); + inc_nlink(dir); + } + + inode->i_fop = fops; + + d_instantiate(dentry, inode); + error = 0; + +bail: + return error; +} + +static int create_file(const char *name, umode_t mode, + struct dentry *parent, struct dentry **dentry, + const struct file_operations *fops, void *data) +{ + int error; + + mutex_lock(&parent->d_inode->i_mutex); + *dentry = lookup_one_len(name, parent, strlen(name)); + if (!IS_ERR(*dentry)) + error = ipathfs_mknod(parent->d_inode, *dentry, + mode, fops, data); + else + error = PTR_ERR(*dentry); + mutex_unlock(&parent->d_inode->i_mutex); + + return error; +} + +static ssize_t atomic_stats_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + return simple_read_from_buffer(buf, count, ppos, &ipath_stats, + sizeof ipath_stats); +} + +static const struct file_operations atomic_stats_ops = { + .read = atomic_stats_read, + .llseek = default_llseek, +}; + +static ssize_t atomic_counters_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct infinipath_counters counters; + struct ipath_devdata *dd; + + dd = file_inode(file)->i_private; + dd->ipath_f_read_counters(dd, &counters); + + return simple_read_from_buffer(buf, count, ppos, &counters, + sizeof counters); +} + +static const struct file_operations atomic_counters_ops = { + .read = atomic_counters_read, + .llseek = default_llseek, +}; + +static ssize_t flash_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct ipath_devdata *dd; + ssize_t ret; + loff_t pos; + char *tmp; + + pos = *ppos; + + if ( pos < 0) { + ret = -EINVAL; + goto bail; + } + + if (pos >= sizeof(struct ipath_flash)) { + ret = 0; + goto bail; + } + + if (count > sizeof(struct ipath_flash) - pos) + count = sizeof(struct ipath_flash) - pos; + + tmp = kmalloc(count, GFP_KERNEL); + if (!tmp) { + ret = -ENOMEM; + goto bail; + } + + dd = file_inode(file)->i_private; + if (ipath_eeprom_read(dd, pos, tmp, count)) { + ipath_dev_err(dd, "failed to read from flash\n"); + ret = -ENXIO; + goto bail_tmp; + } + + if (copy_to_user(buf, tmp, count)) { + ret = -EFAULT; + goto bail_tmp; + } + + *ppos = pos + count; + ret = count; + +bail_tmp: + kfree(tmp); + +bail: + return ret; +} + +static ssize_t flash_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct ipath_devdata *dd; + ssize_t ret; + loff_t pos; + char *tmp; + + pos = *ppos; + + if (pos != 0) { + ret = -EINVAL; + goto bail; + } + + if (count != sizeof(struct ipath_flash)) { + ret = -EINVAL; + goto bail; + } + + tmp = kmalloc(count, GFP_KERNEL); + if (!tmp) { + ret = -ENOMEM; + goto bail; + } + + if (copy_from_user(tmp, buf, count)) { + ret = -EFAULT; + goto bail_tmp; + } + + dd = file_inode(file)->i_private; + if (ipath_eeprom_write(dd, pos, tmp, count)) { + ret = -ENXIO; + ipath_dev_err(dd, "failed to write to flash\n"); + goto bail_tmp; + } + + *ppos = pos + count; + ret = count; + +bail_tmp: + kfree(tmp); + +bail: + return ret; +} + +static const struct file_operations flash_ops = { + .read = flash_read, + .write = flash_write, + .llseek = default_llseek, +}; + +static int create_device_files(struct super_block *sb, + struct ipath_devdata *dd) +{ + struct dentry *dir, *tmp; + char unit[10]; + int ret; + + snprintf(unit, sizeof unit, "%02d", dd->ipath_unit); + ret = create_file(unit, S_IFDIR|S_IRUGO|S_IXUGO, sb->s_root, &dir, + &simple_dir_operations, dd); + if (ret) { + printk(KERN_ERR "create_file(%s) failed: %d\n", unit, ret); + goto bail; + } + + ret = create_file("atomic_counters", S_IFREG|S_IRUGO, dir, &tmp, + &atomic_counters_ops, dd); + if (ret) { + printk(KERN_ERR "create_file(%s/atomic_counters) " + "failed: %d\n", unit, ret); + goto bail; + } + + ret = create_file("flash", S_IFREG|S_IWUSR|S_IRUGO, dir, &tmp, + &flash_ops, dd); + if (ret) { + printk(KERN_ERR "create_file(%s/flash) " + "failed: %d\n", unit, ret); + goto bail; + } + +bail: + return ret; +} + +static int remove_file(struct dentry *parent, char *name) +{ + struct dentry *tmp; + int ret; + + tmp = lookup_one_len(name, parent, strlen(name)); + + if (IS_ERR(tmp)) { + ret = PTR_ERR(tmp); + goto bail; + } + + spin_lock(&tmp->d_lock); + if (!(d_unhashed(tmp) && tmp->d_inode)) { + dget_dlock(tmp); + __d_drop(tmp); + spin_unlock(&tmp->d_lock); + simple_unlink(parent->d_inode, tmp); + } else + spin_unlock(&tmp->d_lock); + + ret = 0; +bail: + /* + * We don't expect clients to care about the return value, but + * it's there if they need it. + */ + return ret; +} + +static int remove_device_files(struct super_block *sb, + struct ipath_devdata *dd) +{ + struct dentry *dir, *root; + char unit[10]; + int ret; + + root = dget(sb->s_root); + mutex_lock(&root->d_inode->i_mutex); + snprintf(unit, sizeof unit, "%02d", dd->ipath_unit); + dir = lookup_one_len(unit, root, strlen(unit)); + + if (IS_ERR(dir)) { + ret = PTR_ERR(dir); + printk(KERN_ERR "Lookup of %s failed\n", unit); + goto bail; + } + + remove_file(dir, "flash"); + remove_file(dir, "atomic_counters"); + d_delete(dir); + ret = simple_rmdir(root->d_inode, dir); + +bail: + mutex_unlock(&root->d_inode->i_mutex); + dput(root); + return ret; +} + +static int ipathfs_fill_super(struct super_block *sb, void *data, + int silent) +{ + struct ipath_devdata *dd, *tmp; + unsigned long flags; + int ret; + + static struct tree_descr files[] = { + [2] = {"atomic_stats", &atomic_stats_ops, S_IRUGO}, + {""}, + }; + + ret = simple_fill_super(sb, IPATHFS_MAGIC, files); + if (ret) { + printk(KERN_ERR "simple_fill_super failed: %d\n", ret); + goto bail; + } + + spin_lock_irqsave(&ipath_devs_lock, flags); + + list_for_each_entry_safe(dd, tmp, &ipath_dev_list, ipath_list) { + spin_unlock_irqrestore(&ipath_devs_lock, flags); + ret = create_device_files(sb, dd); + if (ret) + goto bail; + spin_lock_irqsave(&ipath_devs_lock, flags); + } + + spin_unlock_irqrestore(&ipath_devs_lock, flags); + +bail: + return ret; +} + +static struct dentry *ipathfs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + struct dentry *ret; + ret = mount_single(fs_type, flags, data, ipathfs_fill_super); + if (!IS_ERR(ret)) + ipath_super = ret->d_sb; + return ret; +} + +static void ipathfs_kill_super(struct super_block *s) +{ + kill_litter_super(s); + ipath_super = NULL; +} + +int ipathfs_add_device(struct ipath_devdata *dd) +{ + int ret; + + if (ipath_super == NULL) { + ret = 0; + goto bail; + } + + ret = create_device_files(ipath_super, dd); + +bail: + return ret; +} + +int ipathfs_remove_device(struct ipath_devdata *dd) +{ + int ret; + + if (ipath_super == NULL) { + ret = 0; + goto bail; + } + + ret = remove_device_files(ipath_super, dd); + +bail: + return ret; +} + +static struct file_system_type ipathfs_fs_type = { + .owner = THIS_MODULE, + .name = "ipathfs", + .mount = ipathfs_mount, + .kill_sb = ipathfs_kill_super, +}; +MODULE_ALIAS_FS("ipathfs"); + +int __init ipath_init_ipathfs(void) +{ + return register_filesystem(&ipathfs_fs_type); +} + +void __exit ipath_exit_ipathfs(void) +{ + unregister_filesystem(&ipathfs_fs_type); +} diff --git a/drivers/infiniband/hw/ipath/ipath_iba6110.c b/drivers/infiniband/hw/ipath/ipath_iba6110.c new file mode 100644 index 0000000..7cc3054 --- /dev/null +++ b/drivers/infiniband/hw/ipath/ipath_iba6110.c @@ -0,0 +1,1940 @@ +/* + * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved. + * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/* + * This file contains all of the code that is specific to the InfiniPath + * HT chip. + */ + +#include +#include +#include +#include +#include + +#include "ipath_kernel.h" +#include "ipath_registers.h" + +static void ipath_setup_ht_setextled(struct ipath_devdata *, u64, u64); + + +/* + * This lists the InfiniPath registers, in the actual chip layout. + * This structure should never be directly accessed. + * + * The names are in InterCap form because they're taken straight from + * the chip specification. Since they're only used in this file, they + * don't pollute the rest of the source. +*/ + +struct _infinipath_do_not_use_kernel_regs { + unsigned long long Revision; + unsigned long long Control; + unsigned long long PageAlign; + unsigned long long PortCnt; + unsigned long long DebugPortSelect; + unsigned long long DebugPort; + unsigned long long SendRegBase; + unsigned long long UserRegBase; + unsigned long long CounterRegBase; + unsigned long long Scratch; + unsigned long long ReservedMisc1; + unsigned long long InterruptConfig; + unsigned long long IntBlocked; + unsigned long long IntMask; + unsigned long long IntStatus; + unsigned long long IntClear; + unsigned long long ErrorMask; + unsigned long long ErrorStatus; + unsigned long long ErrorClear; + unsigned long long HwErrMask; + unsigned long long HwErrStatus; + unsigned long long HwErrClear; + unsigned long long HwDiagCtrl; + unsigned long long MDIO; + unsigned long long IBCStatus; + unsigned long long IBCCtrl; + unsigned long long ExtStatus; + unsigned long long ExtCtrl; + unsigned long long GPIOOut; + unsigned long long GPIOMask; + unsigned long long GPIOStatus; + unsigned long long GPIOClear; + unsigned long long RcvCtrl; + unsigned long long RcvBTHQP; + unsigned long long RcvHdrSize; + unsigned long long RcvHdrCnt; + unsigned long long RcvHdrEntSize; + unsigned long long RcvTIDBase; + unsigned long long RcvTIDCnt; + unsigned long long RcvEgrBase; + unsigned long long RcvEgrCnt; + unsigned long long RcvBufBase; + unsigned long long RcvBufSize; + unsigned long long RxIntMemBase; + unsigned long long RxIntMemSize; + unsigned long long RcvPartitionKey; + unsigned long long ReservedRcv[10]; + unsigned long long SendCtrl; + unsigned long long SendPIOBufBase; + unsigned long long SendPIOSize; + unsigned long long SendPIOBufCnt; + unsigned long long SendPIOAvailAddr; + unsigned long long TxIntMemBase; + unsigned long long TxIntMemSize; + unsigned long long ReservedSend[9]; + unsigned long long SendBufferError; + unsigned long long SendBufferErrorCONT1; + unsigned long long SendBufferErrorCONT2; + unsigned long long SendBufferErrorCONT3; + unsigned long long ReservedSBE[4]; + unsigned long long RcvHdrAddr0; + unsigned long long RcvHdrAddr1; + unsigned long long RcvHdrAddr2; + unsigned long long RcvHdrAddr3; + unsigned long long RcvHdrAddr4; + unsigned long long RcvHdrAddr5; + unsigned long long RcvHdrAddr6; + unsigned long long RcvHdrAddr7; + unsigned long long RcvHdrAddr8; + unsigned long long ReservedRHA[7]; + unsigned long long RcvHdrTailAddr0; + unsigned long long RcvHdrTailAddr1; + unsigned long long RcvHdrTailAddr2; + unsigned long long RcvHdrTailAddr3; + unsigned long long RcvHdrTailAddr4; + unsigned long long RcvHdrTailAddr5; + unsigned long long RcvHdrTailAddr6; + unsigned long long RcvHdrTailAddr7; + unsigned long long RcvHdrTailAddr8; + unsigned long long ReservedRHTA[7]; + unsigned long long Sync; /* Software only */ + unsigned long long Dump; /* Software only */ + unsigned long long SimVer; /* Software only */ + unsigned long long ReservedSW[5]; + unsigned long long SerdesConfig0; + unsigned long long SerdesConfig1; + unsigned long long SerdesStatus; + unsigned long long XGXSConfig; + unsigned long long ReservedSW2[4]; +}; + +struct _infinipath_do_not_use_counters { + __u64 LBIntCnt; + __u64 LBFlowStallCnt; + __u64 Reserved1; + __u64 TxUnsupVLErrCnt; + __u64 TxDataPktCnt; + __u64 TxFlowPktCnt; + __u64 TxDwordCnt; + __u64 TxLenErrCnt; + __u64 TxMaxMinLenErrCnt; + __u64 TxUnderrunCnt; + __u64 TxFlowStallCnt; + __u64 TxDroppedPktCnt; + __u64 RxDroppedPktCnt; + __u64 RxDataPktCnt; + __u64 RxFlowPktCnt; + __u64 RxDwordCnt; + __u64 RxLenErrCnt; + __u64 RxMaxMinLenErrCnt; + __u64 RxICRCErrCnt; + __u64 RxVCRCErrCnt; + __u64 RxFlowCtrlErrCnt; + __u64 RxBadFormatCnt; + __u64 RxLinkProblemCnt; + __u64 RxEBPCnt; + __u64 RxLPCRCErrCnt; + __u64 RxBufOvflCnt; + __u64 RxTIDFullErrCnt; + __u64 RxTIDValidErrCnt; + __u64 RxPKeyMismatchCnt; + __u64 RxP0HdrEgrOvflCnt; + __u64 RxP1HdrEgrOvflCnt; + __u64 RxP2HdrEgrOvflCnt; + __u64 RxP3HdrEgrOvflCnt; + __u64 RxP4HdrEgrOvflCnt; + __u64 RxP5HdrEgrOvflCnt; + __u64 RxP6HdrEgrOvflCnt; + __u64 RxP7HdrEgrOvflCnt; + __u64 RxP8HdrEgrOvflCnt; + __u64 Reserved6; + __u64 Reserved7; + __u64 IBStatusChangeCnt; + __u64 IBLinkErrRecoveryCnt; + __u64 IBLinkDownedCnt; + __u64 IBSymbolErrCnt; +}; + +#define IPATH_KREG_OFFSET(field) (offsetof( \ + struct _infinipath_do_not_use_kernel_regs, field) / sizeof(u64)) +#define IPATH_CREG_OFFSET(field) (offsetof( \ + struct _infinipath_do_not_use_counters, field) / sizeof(u64)) + +static const struct ipath_kregs ipath_ht_kregs = { + .kr_control = IPATH_KREG_OFFSET(Control), + .kr_counterregbase = IPATH_KREG_OFFSET(CounterRegBase), + .kr_debugport = IPATH_KREG_OFFSET(DebugPort), + .kr_debugportselect = IPATH_KREG_OFFSET(DebugPortSelect), + .kr_errorclear = IPATH_KREG_OFFSET(ErrorClear), + .kr_errormask = IPATH_KREG_OFFSET(ErrorMask), + .kr_errorstatus = IPATH_KREG_OFFSET(ErrorStatus), + .kr_extctrl = IPATH_KREG_OFFSET(ExtCtrl), + .kr_extstatus = IPATH_KREG_OFFSET(ExtStatus), + .kr_gpio_clear = IPATH_KREG_OFFSET(GPIOClear), + .kr_gpio_mask = IPATH_KREG_OFFSET(GPIOMask), + .kr_gpio_out = IPATH_KREG_OFFSET(GPIOOut), + .kr_gpio_status = IPATH_KREG_OFFSET(GPIOStatus), + .kr_hwdiagctrl = IPATH_KREG_OFFSET(HwDiagCtrl), + .kr_hwerrclear = IPATH_KREG_OFFSET(HwErrClear), + .kr_hwerrmask = IPATH_KREG_OFFSET(HwErrMask), + .kr_hwerrstatus = IPATH_KREG_OFFSET(HwErrStatus), + .kr_ibcctrl = IPATH_KREG_OFFSET(IBCCtrl), + .kr_ibcstatus = IPATH_KREG_OFFSET(IBCStatus), + .kr_intblocked = IPATH_KREG_OFFSET(IntBlocked), + .kr_intclear = IPATH_KREG_OFFSET(IntClear), + .kr_interruptconfig = IPATH_KREG_OFFSET(InterruptConfig), + .kr_intmask = IPATH_KREG_OFFSET(IntMask), + .kr_intstatus = IPATH_KREG_OFFSET(IntStatus), + .kr_mdio = IPATH_KREG_OFFSET(MDIO), + .kr_pagealign = IPATH_KREG_OFFSET(PageAlign), + .kr_partitionkey = IPATH_KREG_OFFSET(RcvPartitionKey), + .kr_portcnt = IPATH_KREG_OFFSET(PortCnt), + .kr_rcvbthqp = IPATH_KREG_OFFSET(RcvBTHQP), + .kr_rcvbufbase = IPATH_KREG_OFFSET(RcvBufBase), + .kr_rcvbufsize = IPATH_KREG_OFFSET(RcvBufSize), + .kr_rcvctrl = IPATH_KREG_OFFSET(RcvCtrl), + .kr_rcvegrbase = IPATH_KREG_OFFSET(RcvEgrBase), + .kr_rcvegrcnt = IPATH_KREG_OFFSET(RcvEgrCnt), + .kr_rcvhdrcnt = IPATH_KREG_OFFSET(RcvHdrCnt), + .kr_rcvhdrentsize = IPATH_KREG_OFFSET(RcvHdrEntSize), + .kr_rcvhdrsize = IPATH_KREG_OFFSET(RcvHdrSize), + .kr_rcvintmembase = IPATH_KREG_OFFSET(RxIntMemBase), + .kr_rcvintmemsize = IPATH_KREG_OFFSET(RxIntMemSize), + .kr_rcvtidbase = IPATH_KREG_OFFSET(RcvTIDBase), + .kr_rcvtidcnt = IPATH_KREG_OFFSET(RcvTIDCnt), + .kr_revision = IPATH_KREG_OFFSET(Revision), + .kr_scratch = IPATH_KREG_OFFSET(Scratch), + .kr_sendbuffererror = IPATH_KREG_OFFSET(SendBufferError), + .kr_sendctrl = IPATH_KREG_OFFSET(SendCtrl), + .kr_sendpioavailaddr = IPATH_KREG_OFFSET(SendPIOAvailAddr), + .kr_sendpiobufbase = IPATH_KREG_OFFSET(SendPIOBufBase), + .kr_sendpiobufcnt = IPATH_KREG_OFFSET(SendPIOBufCnt), + .kr_sendpiosize = IPATH_KREG_OFFSET(SendPIOSize), + .kr_sendregbase = IPATH_KREG_OFFSET(SendRegBase), + .kr_txintmembase = IPATH_KREG_OFFSET(TxIntMemBase), + .kr_txintmemsize = IPATH_KREG_OFFSET(TxIntMemSize), + .kr_userregbase = IPATH_KREG_OFFSET(UserRegBase), + .kr_serdesconfig0 = IPATH_KREG_OFFSET(SerdesConfig0), + .kr_serdesconfig1 = IPATH_KREG_OFFSET(SerdesConfig1), + .kr_serdesstatus = IPATH_KREG_OFFSET(SerdesStatus), + .kr_xgxsconfig = IPATH_KREG_OFFSET(XGXSConfig), + /* + * These should not be used directly via ipath_write_kreg64(), + * use them with ipath_write_kreg64_port(), + */ + .kr_rcvhdraddr = IPATH_KREG_OFFSET(RcvHdrAddr0), + .kr_rcvhdrtailaddr = IPATH_KREG_OFFSET(RcvHdrTailAddr0) +}; + +static const struct ipath_cregs ipath_ht_cregs = { + .cr_badformatcnt = IPATH_CREG_OFFSET(RxBadFormatCnt), + .cr_erricrccnt = IPATH_CREG_OFFSET(RxICRCErrCnt), + .cr_errlinkcnt = IPATH_CREG_OFFSET(RxLinkProblemCnt), + .cr_errlpcrccnt = IPATH_CREG_OFFSET(RxLPCRCErrCnt), + .cr_errpkey = IPATH_CREG_OFFSET(RxPKeyMismatchCnt), + .cr_errrcvflowctrlcnt = IPATH_CREG_OFFSET(RxFlowCtrlErrCnt), + .cr_err_rlencnt = IPATH_CREG_OFFSET(RxLenErrCnt), + .cr_errslencnt = IPATH_CREG_OFFSET(TxLenErrCnt), + .cr_errtidfull = IPATH_CREG_OFFSET(RxTIDFullErrCnt), + .cr_errtidvalid = IPATH_CREG_OFFSET(RxTIDValidErrCnt), + .cr_errvcrccnt = IPATH_CREG_OFFSET(RxVCRCErrCnt), + .cr_ibstatuschange = IPATH_CREG_OFFSET(IBStatusChangeCnt), + /* calc from Reg_CounterRegBase + offset */ + .cr_intcnt = IPATH_CREG_OFFSET(LBIntCnt), + .cr_invalidrlencnt = IPATH_CREG_OFFSET(RxMaxMinLenErrCnt), + .cr_invalidslencnt = IPATH_CREG_OFFSET(TxMaxMinLenErrCnt), + .cr_lbflowstallcnt = IPATH_CREG_OFFSET(LBFlowStallCnt), + .cr_pktrcvcnt = IPATH_CREG_OFFSET(RxDataPktCnt), + .cr_pktrcvflowctrlcnt = IPATH_CREG_OFFSET(RxFlowPktCnt), + .cr_pktsendcnt = IPATH_CREG_OFFSET(TxDataPktCnt), + .cr_pktsendflowcnt = IPATH_CREG_OFFSET(TxFlowPktCnt), + .cr_portovflcnt = IPATH_CREG_OFFSET(RxP0HdrEgrOvflCnt), + .cr_rcvebpcnt = IPATH_CREG_OFFSET(RxEBPCnt), + .cr_rcvovflcnt = IPATH_CREG_OFFSET(RxBufOvflCnt), + .cr_senddropped = IPATH_CREG_OFFSET(TxDroppedPktCnt), + .cr_sendstallcnt = IPATH_CREG_OFFSET(TxFlowStallCnt), + .cr_sendunderruncnt = IPATH_CREG_OFFSET(TxUnderrunCnt), + .cr_wordrcvcnt = IPATH_CREG_OFFSET(RxDwordCnt), + .cr_wordsendcnt = IPATH_CREG_OFFSET(TxDwordCnt), + .cr_unsupvlcnt = IPATH_CREG_OFFSET(TxUnsupVLErrCnt), + .cr_rxdroppktcnt = IPATH_CREG_OFFSET(RxDroppedPktCnt), + .cr_iblinkerrrecovcnt = IPATH_CREG_OFFSET(IBLinkErrRecoveryCnt), + .cr_iblinkdowncnt = IPATH_CREG_OFFSET(IBLinkDownedCnt), + .cr_ibsymbolerrcnt = IPATH_CREG_OFFSET(IBSymbolErrCnt) +}; + +/* kr_intstatus, kr_intclear, kr_intmask bits */ +#define INFINIPATH_I_RCVURG_MASK ((1U<<9)-1) +#define INFINIPATH_I_RCVURG_SHIFT 0 +#define INFINIPATH_I_RCVAVAIL_MASK ((1U<<9)-1) +#define INFINIPATH_I_RCVAVAIL_SHIFT 12 + +/* kr_hwerrclear, kr_hwerrmask, kr_hwerrstatus, bits */ +#define INFINIPATH_HWE_HTCMEMPARITYERR_SHIFT 0 +#define INFINIPATH_HWE_HTCMEMPARITYERR_MASK 0x3FFFFFULL +#define INFINIPATH_HWE_HTCLNKABYTE0CRCERR 0x0000000000800000ULL +#define INFINIPATH_HWE_HTCLNKABYTE1CRCERR 0x0000000001000000ULL +#define INFINIPATH_HWE_HTCLNKBBYTE0CRCERR 0x0000000002000000ULL +#define INFINIPATH_HWE_HTCLNKBBYTE1CRCERR 0x0000000004000000ULL +#define INFINIPATH_HWE_HTCMISCERR4 0x0000000008000000ULL +#define INFINIPATH_HWE_HTCMISCERR5 0x0000000010000000ULL +#define INFINIPATH_HWE_HTCMISCERR6 0x0000000020000000ULL +#define INFINIPATH_HWE_HTCMISCERR7 0x0000000040000000ULL +#define INFINIPATH_HWE_HTCBUSTREQPARITYERR 0x0000000080000000ULL +#define INFINIPATH_HWE_HTCBUSTRESPPARITYERR 0x0000000100000000ULL +#define INFINIPATH_HWE_HTCBUSIREQPARITYERR 0x0000000200000000ULL +#define INFINIPATH_HWE_COREPLL_FBSLIP 0x0080000000000000ULL +#define INFINIPATH_HWE_COREPLL_RFSLIP 0x0100000000000000ULL +#define INFINIPATH_HWE_HTBPLL_FBSLIP 0x0200000000000000ULL +#define INFINIPATH_HWE_HTBPLL_RFSLIP 0x0400000000000000ULL +#define INFINIPATH_HWE_HTAPLL_FBSLIP 0x0800000000000000ULL +#define INFINIPATH_HWE_HTAPLL_RFSLIP 0x1000000000000000ULL +#define INFINIPATH_HWE_SERDESPLLFAILED 0x2000000000000000ULL + +#define IBA6110_IBCS_LINKTRAININGSTATE_MASK 0xf +#define IBA6110_IBCS_LINKSTATE_SHIFT 4 + +/* kr_extstatus bits */ +#define INFINIPATH_EXTS_FREQSEL 0x2 +#define INFINIPATH_EXTS_SERDESSEL 0x4 +#define INFINIPATH_EXTS_MEMBIST_ENDTEST 0x0000000000004000 +#define INFINIPATH_EXTS_MEMBIST_CORRECT 0x0000000000008000 + + +/* TID entries (memory), HT-only */ +#define INFINIPATH_RT_ADDR_MASK 0xFFFFFFFFFFULL /* 40 bits valid */ +#define INFINIPATH_RT_VALID 0x8000000000000000ULL +#define INFINIPATH_RT_ADDR_SHIFT 0 +#define INFINIPATH_RT_BUFSIZE_MASK 0x3FFFULL +#define INFINIPATH_RT_BUFSIZE_SHIFT 48 + +#define INFINIPATH_R_INTRAVAIL_SHIFT 16 +#define INFINIPATH_R_TAILUPD_SHIFT 31 + +/* kr_xgxsconfig bits */ +#define INFINIPATH_XGXS_RESET 0x7ULL + +/* + * masks and bits that are different in different chips, or present only + * in one + */ +static const ipath_err_t infinipath_hwe_htcmemparityerr_mask = + INFINIPATH_HWE_HTCMEMPARITYERR_MASK; +static const ipath_err_t infinipath_hwe_htcmemparityerr_shift = + INFINIPATH_HWE_HTCMEMPARITYERR_SHIFT; + +static const ipath_err_t infinipath_hwe_htclnkabyte0crcerr = + INFINIPATH_HWE_HTCLNKABYTE0CRCERR; +static const ipath_err_t infinipath_hwe_htclnkabyte1crcerr = + INFINIPATH_HWE_HTCLNKABYTE1CRCERR; +static const ipath_err_t infinipath_hwe_htclnkbbyte0crcerr = + INFINIPATH_HWE_HTCLNKBBYTE0CRCERR; +static const ipath_err_t infinipath_hwe_htclnkbbyte1crcerr = + INFINIPATH_HWE_HTCLNKBBYTE1CRCERR; + +#define _IPATH_GPIO_SDA_NUM 1 +#define _IPATH_GPIO_SCL_NUM 0 + +#define IPATH_GPIO_SDA \ + (1ULL << (_IPATH_GPIO_SDA_NUM+INFINIPATH_EXTC_GPIOOE_SHIFT)) +#define IPATH_GPIO_SCL \ + (1ULL << (_IPATH_GPIO_SCL_NUM+INFINIPATH_EXTC_GPIOOE_SHIFT)) + +/* keep the code below somewhat more readable; not used elsewhere */ +#define _IPATH_HTLINK0_CRCBITS (infinipath_hwe_htclnkabyte0crcerr | \ + infinipath_hwe_htclnkabyte1crcerr) +#define _IPATH_HTLINK1_CRCBITS (infinipath_hwe_htclnkbbyte0crcerr | \ + infinipath_hwe_htclnkbbyte1crcerr) +#define _IPATH_HTLANE0_CRCBITS (infinipath_hwe_htclnkabyte0crcerr | \ + infinipath_hwe_htclnkbbyte0crcerr) +#define _IPATH_HTLANE1_CRCBITS (infinipath_hwe_htclnkabyte1crcerr | \ + infinipath_hwe_htclnkbbyte1crcerr) + +static void hwerr_crcbits(struct ipath_devdata *dd, ipath_err_t hwerrs, + char *msg, size_t msgl) +{ + char bitsmsg[64]; + ipath_err_t crcbits = hwerrs & + (_IPATH_HTLINK0_CRCBITS | _IPATH_HTLINK1_CRCBITS); + /* don't check if 8bit HT */ + if (dd->ipath_flags & IPATH_8BIT_IN_HT0) + crcbits &= ~infinipath_hwe_htclnkabyte1crcerr; + /* don't check if 8bit HT */ + if (dd->ipath_flags & IPATH_8BIT_IN_HT1) + crcbits &= ~infinipath_hwe_htclnkbbyte1crcerr; + /* + * we'll want to ignore link errors on link that is + * not in use, if any. For now, complain about both + */ + if (crcbits) { + u16 ctrl0, ctrl1; + snprintf(bitsmsg, sizeof bitsmsg, + "[HT%s lane %s CRC (%llx); powercycle to completely clear]", + !(crcbits & _IPATH_HTLINK1_CRCBITS) ? + "0 (A)" : (!(crcbits & _IPATH_HTLINK0_CRCBITS) + ? "1 (B)" : "0+1 (A+B)"), + !(crcbits & _IPATH_HTLANE1_CRCBITS) ? "0" + : (!(crcbits & _IPATH_HTLANE0_CRCBITS) ? "1" : + "0+1"), (unsigned long long) crcbits); + strlcat(msg, bitsmsg, msgl); + + /* + * print extra info for debugging. slave/primary + * config word 4, 8 (link control 0, 1) + */ + + if (pci_read_config_word(dd->pcidev, + dd->ipath_ht_slave_off + 0x4, + &ctrl0)) + dev_info(&dd->pcidev->dev, "Couldn't read " + "linkctrl0 of slave/primary " + "config block\n"); + else if (!(ctrl0 & 1 << 6)) + /* not if EOC bit set */ + ipath_dbg("HT linkctrl0 0x%x%s%s\n", ctrl0, + ((ctrl0 >> 8) & 7) ? " CRC" : "", + ((ctrl0 >> 4) & 1) ? "linkfail" : + ""); + if (pci_read_config_word(dd->pcidev, + dd->ipath_ht_slave_off + 0x8, + &ctrl1)) + dev_info(&dd->pcidev->dev, "Couldn't read " + "linkctrl1 of slave/primary " + "config block\n"); + else if (!(ctrl1 & 1 << 6)) + /* not if EOC bit set */ + ipath_dbg("HT linkctrl1 0x%x%s%s\n", ctrl1, + ((ctrl1 >> 8) & 7) ? " CRC" : "", + ((ctrl1 >> 4) & 1) ? "linkfail" : + ""); + + /* disable until driver reloaded */ + dd->ipath_hwerrmask &= ~crcbits; + ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrmask, + dd->ipath_hwerrmask); + ipath_dbg("HT crc errs: %s\n", msg); + } else + ipath_dbg("ignoring HT crc errors 0x%llx, " + "not in use\n", (unsigned long long) + (hwerrs & (_IPATH_HTLINK0_CRCBITS | + _IPATH_HTLINK1_CRCBITS))); +} + +/* 6110 specific hardware errors... */ +static const struct ipath_hwerror_msgs ipath_6110_hwerror_msgs[] = { + INFINIPATH_HWE_MSG(HTCBUSIREQPARITYERR, "HTC Ireq Parity"), + INFINIPATH_HWE_MSG(HTCBUSTREQPARITYERR, "HTC Treq Parity"), + INFINIPATH_HWE_MSG(HTCBUSTRESPPARITYERR, "HTC Tresp Parity"), + INFINIPATH_HWE_MSG(HTCMISCERR5, "HT core Misc5"), + INFINIPATH_HWE_MSG(HTCMISCERR6, "HT core Misc6"), + INFINIPATH_HWE_MSG(HTCMISCERR7, "HT core Misc7"), + INFINIPATH_HWE_MSG(RXDSYNCMEMPARITYERR, "Rx Dsync"), + INFINIPATH_HWE_MSG(SERDESPLLFAILED, "SerDes PLL"), +}; + +#define TXE_PIO_PARITY ((INFINIPATH_HWE_TXEMEMPARITYERR_PIOBUF | \ + INFINIPATH_HWE_TXEMEMPARITYERR_PIOPBC) \ + << INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT) +#define RXE_EAGER_PARITY (INFINIPATH_HWE_RXEMEMPARITYERR_EAGERTID \ + << INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT) + +static void ipath_ht_txe_recover(struct ipath_devdata *dd) +{ + ++ipath_stats.sps_txeparity; + dev_info(&dd->pcidev->dev, + "Recovering from TXE PIO parity error\n"); +} + + +/** + * ipath_ht_handle_hwerrors - display hardware errors. + * @dd: the infinipath device + * @msg: the output buffer + * @msgl: the size of the output buffer + * + * Use same msg buffer as regular errors to avoid excessive stack + * use. Most hardware errors are catastrophic, but for right now, + * we'll print them and continue. We reuse the same message buffer as + * ipath_handle_errors() to avoid excessive stack usage. + */ +static void ipath_ht_handle_hwerrors(struct ipath_devdata *dd, char *msg, + size_t msgl) +{ + ipath_err_t hwerrs; + u32 bits, ctrl; + int isfatal = 0; + char bitsmsg[64]; + int log_idx; + + hwerrs = ipath_read_kreg64(dd, dd->ipath_kregs->kr_hwerrstatus); + + if (!hwerrs) { + ipath_cdbg(VERBOSE, "Called but no hardware errors set\n"); + /* + * better than printing cofusing messages + * This seems to be related to clearing the crc error, or + * the pll error during init. + */ + goto bail; + } else if (hwerrs == -1LL) { + ipath_dev_err(dd, "Read of hardware error status failed " + "(all bits set); ignoring\n"); + goto bail; + } + ipath_stats.sps_hwerrs++; + + /* Always clear the error status register, except MEMBISTFAIL, + * regardless of whether we continue or stop using the chip. + * We want that set so we know it failed, even across driver reload. + * We'll still ignore it in the hwerrmask. We do this partly for + * diagnostics, but also for support */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrclear, + hwerrs&~INFINIPATH_HWE_MEMBISTFAILED); + + hwerrs &= dd->ipath_hwerrmask; + + /* We log some errors to EEPROM, check if we have any of those. */ + for (log_idx = 0; log_idx < IPATH_EEP_LOG_CNT; ++log_idx) + if (hwerrs & dd->ipath_eep_st_masks[log_idx].hwerrs_to_log) + ipath_inc_eeprom_err(dd, log_idx, 1); + + /* + * make sure we get this much out, unless told to be quiet, + * it's a parity error we may recover from, + * or it's occurred within the last 5 seconds + */ + if ((hwerrs & ~(dd->ipath_lasthwerror | TXE_PIO_PARITY | + RXE_EAGER_PARITY)) || + (ipath_debug & __IPATH_VERBDBG)) + dev_info(&dd->pcidev->dev, "Hardware error: hwerr=0x%llx " + "(cleared)\n", (unsigned long long) hwerrs); + dd->ipath_lasthwerror |= hwerrs; + + if (hwerrs & ~dd->ipath_hwe_bitsextant) + ipath_dev_err(dd, "hwerror interrupt with unknown errors " + "%llx set\n", (unsigned long long) + (hwerrs & ~dd->ipath_hwe_bitsextant)); + + ctrl = ipath_read_kreg32(dd, dd->ipath_kregs->kr_control); + if ((ctrl & INFINIPATH_C_FREEZEMODE) && !ipath_diag_inuse) { + /* + * parity errors in send memory are recoverable, + * just cancel the send (if indicated in * sendbuffererror), + * count the occurrence, unfreeze (if no other handled + * hardware error bits are set), and continue. They can + * occur if a processor speculative read is done to the PIO + * buffer while we are sending a packet, for example. + */ + if (hwerrs & TXE_PIO_PARITY) { + ipath_ht_txe_recover(dd); + hwerrs &= ~TXE_PIO_PARITY; + } + + if (!hwerrs) { + ipath_dbg("Clearing freezemode on ignored or " + "recovered hardware error\n"); + ipath_clear_freeze(dd); + } + } + + *msg = '\0'; + + /* + * may someday want to decode into which bits are which + * functional area for parity errors, etc. + */ + if (hwerrs & (infinipath_hwe_htcmemparityerr_mask + << INFINIPATH_HWE_HTCMEMPARITYERR_SHIFT)) { + bits = (u32) ((hwerrs >> + INFINIPATH_HWE_HTCMEMPARITYERR_SHIFT) & + INFINIPATH_HWE_HTCMEMPARITYERR_MASK); + snprintf(bitsmsg, sizeof bitsmsg, "[HTC Parity Errs %x] ", + bits); + strlcat(msg, bitsmsg, msgl); + } + + ipath_format_hwerrors(hwerrs, + ipath_6110_hwerror_msgs, + ARRAY_SIZE(ipath_6110_hwerror_msgs), + msg, msgl); + + if (hwerrs & (_IPATH_HTLINK0_CRCBITS | _IPATH_HTLINK1_CRCBITS)) + hwerr_crcbits(dd, hwerrs, msg, msgl); + + if (hwerrs & INFINIPATH_HWE_MEMBISTFAILED) { + strlcat(msg, "[Memory BIST test failed, InfiniPath hardware unusable]", + msgl); + /* ignore from now on, so disable until driver reloaded */ + dd->ipath_hwerrmask &= ~INFINIPATH_HWE_MEMBISTFAILED; + ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrmask, + dd->ipath_hwerrmask); + } +#define _IPATH_PLL_FAIL (INFINIPATH_HWE_COREPLL_FBSLIP | \ + INFINIPATH_HWE_COREPLL_RFSLIP | \ + INFINIPATH_HWE_HTBPLL_FBSLIP | \ + INFINIPATH_HWE_HTBPLL_RFSLIP | \ + INFINIPATH_HWE_HTAPLL_FBSLIP | \ + INFINIPATH_HWE_HTAPLL_RFSLIP) + + if (hwerrs & _IPATH_PLL_FAIL) { + snprintf(bitsmsg, sizeof bitsmsg, + "[PLL failed (%llx), InfiniPath hardware unusable]", + (unsigned long long) (hwerrs & _IPATH_PLL_FAIL)); + strlcat(msg, bitsmsg, msgl); + /* ignore from now on, so disable until driver reloaded */ + dd->ipath_hwerrmask &= ~(hwerrs & _IPATH_PLL_FAIL); + ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrmask, + dd->ipath_hwerrmask); + } + + if (hwerrs & INFINIPATH_HWE_SERDESPLLFAILED) { + /* + * If it occurs, it is left masked since the eternal + * interface is unused + */ + dd->ipath_hwerrmask &= ~INFINIPATH_HWE_SERDESPLLFAILED; + ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrmask, + dd->ipath_hwerrmask); + } + + if (hwerrs) { + /* + * if any set that we aren't ignoring; only + * make the complaint once, in case it's stuck + * or recurring, and we get here multiple + * times. + * force link down, so switch knows, and + * LEDs are turned off + */ + if (dd->ipath_flags & IPATH_INITTED) { + ipath_set_linkstate(dd, IPATH_IB_LINKDOWN); + ipath_setup_ht_setextled(dd, + INFINIPATH_IBCS_L_STATE_DOWN, + INFINIPATH_IBCS_LT_STATE_DISABLED); + ipath_dev_err(dd, "Fatal Hardware Error (freeze " + "mode), no longer usable, SN %.16s\n", + dd->ipath_serial); + isfatal = 1; + } + *dd->ipath_statusp &= ~IPATH_STATUS_IB_READY; + /* mark as having had error */ + *dd->ipath_statusp |= IPATH_STATUS_HWERROR; + /* + * mark as not usable, at a minimum until driver + * is reloaded, probably until reboot, since no + * other reset is possible. + */ + dd->ipath_flags &= ~IPATH_INITTED; + } + else + *msg = 0; /* recovered from all of them */ + if (*msg) + ipath_dev_err(dd, "%s hardware error\n", msg); + if (isfatal && !ipath_diag_inuse && dd->ipath_freezemsg) + /* + * for status file; if no trailing brace is copied, + * we'll know it was truncated. + */ + snprintf(dd->ipath_freezemsg, + dd->ipath_freezelen, "{%s}", msg); + +bail:; +} + +/** + * ipath_ht_boardname - fill in the board name + * @dd: the infinipath device + * @name: the output buffer + * @namelen: the size of the output buffer + * + * fill in the board name, based on the board revision register + */ +static int ipath_ht_boardname(struct ipath_devdata *dd, char *name, + size_t namelen) +{ + char *n = NULL; + u8 boardrev = dd->ipath_boardrev; + int ret = 0; + + switch (boardrev) { + case 5: + /* + * original production board; two production levels, with + * different serial number ranges. See ipath_ht_early_init() for + * case where we enable IPATH_GPIO_INTR for later serial # range. + * Original 112* serial number is no longer supported. + */ + n = "InfiniPath_QHT7040"; + break; + case 7: + /* small form factor production board */ + n = "InfiniPath_QHT7140"; + break; + default: /* don't know, just print the number */ + ipath_dev_err(dd, "Don't yet know about board " + "with ID %u\n", boardrev); + snprintf(name, namelen, "Unknown_InfiniPath_QHT7xxx_%u", + boardrev); + break; + } + if (n) + snprintf(name, namelen, "%s", n); + + if (ret) { + ipath_dev_err(dd, "Unsupported InfiniPath board %s!\n", name); + goto bail; + } + if (dd->ipath_majrev != 3 || (dd->ipath_minrev < 2 || + dd->ipath_minrev > 4)) { + /* + * This version of the driver only supports Rev 3.2 - 3.4 + */ + ipath_dev_err(dd, + "Unsupported InfiniPath hardware revision %u.%u!\n", + dd->ipath_majrev, dd->ipath_minrev); + ret = 1; + goto bail; + } + /* + * pkt/word counters are 32 bit, and therefore wrap fast enough + * that we snapshot them from a timer, and maintain 64 bit shadow + * copies + */ + dd->ipath_flags |= IPATH_32BITCOUNTERS; + dd->ipath_flags |= IPATH_GPIO_INTR; + if (dd->ipath_lbus_speed != 800) + ipath_dev_err(dd, + "Incorrectly configured for HT @ %uMHz\n", + dd->ipath_lbus_speed); + + /* + * set here, not in ipath_init_*_funcs because we have to do + * it after we can read chip registers. + */ + dd->ipath_ureg_align = + ipath_read_kreg32(dd, dd->ipath_kregs->kr_pagealign); + +bail: + return ret; +} + +static void ipath_check_htlink(struct ipath_devdata *dd) +{ + u8 linkerr, link_off, i; + + for (i = 0; i < 2; i++) { + link_off = dd->ipath_ht_slave_off + i * 4 + 0xd; + if (pci_read_config_byte(dd->pcidev, link_off, &linkerr)) + dev_info(&dd->pcidev->dev, "Couldn't read " + "linkerror%d of HT slave/primary block\n", + i); + else if (linkerr & 0xf0) { + ipath_cdbg(VERBOSE, "HT linkerr%d bits 0x%x set, " + "clearing\n", linkerr >> 4, i); + /* + * writing the linkerr bits that are set should + * clear them + */ + if (pci_write_config_byte(dd->pcidev, link_off, + linkerr)) + ipath_dbg("Failed write to clear HT " + "linkerror%d\n", i); + if (pci_read_config_byte(dd->pcidev, link_off, + &linkerr)) + dev_info(&dd->pcidev->dev, + "Couldn't reread linkerror%d of " + "HT slave/primary block\n", i); + else if (linkerr & 0xf0) + dev_info(&dd->pcidev->dev, + "HT linkerror%d bits 0x%x " + "couldn't be cleared\n", + i, linkerr >> 4); + } + } +} + +static int ipath_setup_ht_reset(struct ipath_devdata *dd) +{ + ipath_dbg("No reset possible for this InfiniPath hardware\n"); + return 0; +} + +#define HT_INTR_DISC_CONFIG 0x80 /* HT interrupt and discovery cap */ +#define HT_INTR_REG_INDEX 2 /* intconfig requires indirect accesses */ + +/* + * Bits 13-15 of command==0 is slave/primary block. Clear any HT CRC + * errors. We only bother to do this at load time, because it's OK if + * it happened before we were loaded (first time after boot/reset), + * but any time after that, it's fatal anyway. Also need to not check + * for upper byte errors if we are in 8 bit mode, so figure out + * our width. For now, at least, also complain if it's 8 bit. + */ +static void slave_or_pri_blk(struct ipath_devdata *dd, struct pci_dev *pdev, + int pos, u8 cap_type) +{ + u8 linkwidth = 0, linkerr, link_a_b_off, link_off; + u16 linkctrl = 0; + int i; + + dd->ipath_ht_slave_off = pos; + /* command word, master_host bit */ + /* master host || slave */ + if ((cap_type >> 2) & 1) + link_a_b_off = 4; + else + link_a_b_off = 0; + ipath_cdbg(VERBOSE, "HT%u (Link %c) connected to processor\n", + link_a_b_off ? 1 : 0, + link_a_b_off ? 'B' : 'A'); + + link_a_b_off += pos; + + /* + * check both link control registers; clear both HT CRC sets if + * necessary. + */ + for (i = 0; i < 2; i++) { + link_off = pos + i * 4 + 0x4; + if (pci_read_config_word(pdev, link_off, &linkctrl)) + ipath_dev_err(dd, "Couldn't read HT link control%d " + "register\n", i); + else if (linkctrl & (0xf << 8)) { + ipath_cdbg(VERBOSE, "Clear linkctrl%d CRC Error " + "bits %x\n", i, linkctrl & (0xf << 8)); + /* + * now write them back to clear the error. + */ + pci_write_config_word(pdev, link_off, + linkctrl & (0xf << 8)); + } + } + + /* + * As with HT CRC bits, same for protocol errors that might occur + * during boot. + */ + for (i = 0; i < 2; i++) { + link_off = pos + i * 4 + 0xd; + if (pci_read_config_byte(pdev, link_off, &linkerr)) + dev_info(&pdev->dev, "Couldn't read linkerror%d " + "of HT slave/primary block\n", i); + else if (linkerr & 0xf0) { + ipath_cdbg(VERBOSE, "HT linkerr%d bits 0x%x set, " + "clearing\n", linkerr >> 4, i); + /* + * writing the linkerr bits that are set will clear + * them + */ + if (pci_write_config_byte + (pdev, link_off, linkerr)) + ipath_dbg("Failed write to clear HT " + "linkerror%d\n", i); + if (pci_read_config_byte(pdev, link_off, &linkerr)) + dev_info(&pdev->dev, "Couldn't reread " + "linkerror%d of HT slave/primary " + "block\n", i); + else if (linkerr & 0xf0) + dev_info(&pdev->dev, "HT linkerror%d bits " + "0x%x couldn't be cleared\n", + i, linkerr >> 4); + } + } + + /* + * this is just for our link to the host, not devices connected + * through tunnel. + */ + + if (pci_read_config_byte(pdev, link_a_b_off + 7, &linkwidth)) + ipath_dev_err(dd, "Couldn't read HT link width " + "config register\n"); + else { + u32 width; + switch (linkwidth & 7) { + case 5: + width = 4; + break; + case 4: + width = 2; + break; + case 3: + width = 32; + break; + case 1: + width = 16; + break; + case 0: + default: /* if wrong, assume 8 bit */ + width = 8; + break; + } + + dd->ipath_lbus_width = width; + + if (linkwidth != 0x11) { + ipath_dev_err(dd, "Not configured for 16 bit HT " + "(%x)\n", linkwidth); + if (!(linkwidth & 0xf)) { + ipath_dbg("Will ignore HT lane1 errors\n"); + dd->ipath_flags |= IPATH_8BIT_IN_HT0; + } + } + } + + /* + * this is just for our link to the host, not devices connected + * through tunnel. + */ + if (pci_read_config_byte(pdev, link_a_b_off + 0xd, &linkwidth)) + ipath_dev_err(dd, "Couldn't read HT link frequency " + "config register\n"); + else { + u32 speed; + switch (linkwidth & 0xf) { + case 6: + speed = 1000; + break; + case 5: + speed = 800; + break; + case 4: + speed = 600; + break; + case 3: + speed = 500; + break; + case 2: + speed = 400; + break; + case 1: + speed = 300; + break; + default: + /* + * assume reserved and vendor-specific are 200... + */ + case 0: + speed = 200; + break; + } + dd->ipath_lbus_speed = speed; + } + + snprintf(dd->ipath_lbus_info, sizeof(dd->ipath_lbus_info), + "HyperTransport,%uMHz,x%u\n", + dd->ipath_lbus_speed, + dd->ipath_lbus_width); +} + +static int ipath_ht_intconfig(struct ipath_devdata *dd) +{ + int ret; + + if (dd->ipath_intconfig) { + ipath_write_kreg(dd, dd->ipath_kregs->kr_interruptconfig, + dd->ipath_intconfig); /* interrupt address */ + ret = 0; + } else { + ipath_dev_err(dd, "No interrupts enabled, couldn't setup " + "interrupt address\n"); + ret = -EINVAL; + } + + return ret; +} + +static void ipath_ht_irq_update(struct pci_dev *dev, int irq, + struct ht_irq_msg *msg) +{ + struct ipath_devdata *dd = pci_get_drvdata(dev); + u64 prev_intconfig = dd->ipath_intconfig; + + dd->ipath_intconfig = msg->address_lo; + dd->ipath_intconfig |= ((u64) msg->address_hi) << 32; + + /* + * If the previous value of dd->ipath_intconfig is zero, we're + * getting configured for the first time, and must not program the + * intconfig register here (it will be programmed later, when the + * hardware is ready). Otherwise, we should. + */ + if (prev_intconfig) + ipath_ht_intconfig(dd); +} + +/** + * ipath_setup_ht_config - setup the interruptconfig register + * @dd: the infinipath device + * @pdev: the PCI device + * + * setup the interruptconfig register from the HT config info. + * Also clear CRC errors in HT linkcontrol, if necessary. + * This is done only for the real hardware. It is done before + * chip address space is initted, so can't touch infinipath registers + */ +static int ipath_setup_ht_config(struct ipath_devdata *dd, + struct pci_dev *pdev) +{ + int pos, ret; + + ret = __ht_create_irq(pdev, 0, ipath_ht_irq_update); + if (ret < 0) { + ipath_dev_err(dd, "Couldn't create interrupt handler: " + "err %d\n", ret); + goto bail; + } + dd->ipath_irq = ret; + ret = 0; + + /* + * Handle clearing CRC errors in linkctrl register if necessary. We + * do this early, before we ever enable errors or hardware errors, + * mostly to avoid causing the chip to enter freeze mode. + */ + pos = pci_find_capability(pdev, PCI_CAP_ID_HT); + if (!pos) { + ipath_dev_err(dd, "Couldn't find HyperTransport " + "capability; no interrupts\n"); + ret = -ENODEV; + goto bail; + } + do { + u8 cap_type; + + /* + * The HT capability type byte is 3 bytes after the + * capability byte. + */ + if (pci_read_config_byte(pdev, pos + 3, &cap_type)) { + dev_info(&pdev->dev, "Couldn't read config " + "command @ %d\n", pos); + continue; + } + if (!(cap_type & 0xE0)) + slave_or_pri_blk(dd, pdev, pos, cap_type); + } while ((pos = pci_find_next_capability(pdev, pos, + PCI_CAP_ID_HT))); + + dd->ipath_flags |= IPATH_SWAP_PIOBUFS; + +bail: + return ret; +} + +/** + * ipath_setup_ht_cleanup - clean up any per-chip chip-specific stuff + * @dd: the infinipath device + * + * Called during driver unload. + * This is currently a nop for the HT chip, not for all chips + */ +static void ipath_setup_ht_cleanup(struct ipath_devdata *dd) +{ +} + +/** + * ipath_setup_ht_setextled - set the state of the two external LEDs + * @dd: the infinipath device + * @lst: the L state + * @ltst: the LT state + * + * Set the state of the two external LEDs, to indicate physical and + * logical state of IB link. For this chip (at least with recommended + * board pinouts), LED1 is Green (physical state), and LED2 is Yellow + * (logical state) + * + * Note: We try to match the Mellanox HCA LED behavior as best + * we can. Green indicates physical link state is OK (something is + * plugged in, and we can train). + * Amber indicates the link is logically up (ACTIVE). + * Mellanox further blinks the amber LED to indicate data packet + * activity, but we have no hardware support for that, so it would + * require waking up every 10-20 msecs and checking the counters + * on the chip, and then turning the LED off if appropriate. That's + * visible overhead, so not something we will do. + * + */ +static void ipath_setup_ht_setextled(struct ipath_devdata *dd, + u64 lst, u64 ltst) +{ + u64 extctl; + unsigned long flags = 0; + + /* the diags use the LED to indicate diag info, so we leave + * the external LED alone when the diags are running */ + if (ipath_diag_inuse) + return; + + /* Allow override of LED display for, e.g. Locating system in rack */ + if (dd->ipath_led_override) { + ltst = (dd->ipath_led_override & IPATH_LED_PHYS) + ? INFINIPATH_IBCS_LT_STATE_LINKUP + : INFINIPATH_IBCS_LT_STATE_DISABLED; + lst = (dd->ipath_led_override & IPATH_LED_LOG) + ? INFINIPATH_IBCS_L_STATE_ACTIVE + : INFINIPATH_IBCS_L_STATE_DOWN; + } + + spin_lock_irqsave(&dd->ipath_gpio_lock, flags); + /* + * start by setting both LED control bits to off, then turn + * on the appropriate bit(s). + */ + if (dd->ipath_boardrev == 8) { /* LS/X-1 uses different pins */ + /* + * major difference is that INFINIPATH_EXTC_LEDGBLERR_OFF + * is inverted, because it is normally used to indicate + * a hardware fault at reset, if there were errors + */ + extctl = (dd->ipath_extctrl & ~INFINIPATH_EXTC_LEDGBLOK_ON) + | INFINIPATH_EXTC_LEDGBLERR_OFF; + if (ltst == INFINIPATH_IBCS_LT_STATE_LINKUP) + extctl &= ~INFINIPATH_EXTC_LEDGBLERR_OFF; + if (lst == INFINIPATH_IBCS_L_STATE_ACTIVE) + extctl |= INFINIPATH_EXTC_LEDGBLOK_ON; + } + else { + extctl = dd->ipath_extctrl & + ~(INFINIPATH_EXTC_LED1PRIPORT_ON | + INFINIPATH_EXTC_LED2PRIPORT_ON); + if (ltst == INFINIPATH_IBCS_LT_STATE_LINKUP) + extctl |= INFINIPATH_EXTC_LED1PRIPORT_ON; + if (lst == INFINIPATH_IBCS_L_STATE_ACTIVE) + extctl |= INFINIPATH_EXTC_LED2PRIPORT_ON; + } + dd->ipath_extctrl = extctl; + ipath_write_kreg(dd, dd->ipath_kregs->kr_extctrl, extctl); + spin_unlock_irqrestore(&dd->ipath_gpio_lock, flags); +} + +static void ipath_init_ht_variables(struct ipath_devdata *dd) +{ + /* + * setup the register offsets, since they are different for each + * chip + */ + dd->ipath_kregs = &ipath_ht_kregs; + dd->ipath_cregs = &ipath_ht_cregs; + + dd->ipath_gpio_sda_num = _IPATH_GPIO_SDA_NUM; + dd->ipath_gpio_scl_num = _IPATH_GPIO_SCL_NUM; + dd->ipath_gpio_sda = IPATH_GPIO_SDA; + dd->ipath_gpio_scl = IPATH_GPIO_SCL; + + /* + * Fill in data for field-values that change in newer chips. + * We dynamically specify only the mask for LINKTRAININGSTATE + * and only the shift for LINKSTATE, as they are the only ones + * that change. Also precalculate the 3 link states of interest + * and the combined mask. + */ + dd->ibcs_ls_shift = IBA6110_IBCS_LINKSTATE_SHIFT; + dd->ibcs_lts_mask = IBA6110_IBCS_LINKTRAININGSTATE_MASK; + dd->ibcs_mask = (INFINIPATH_IBCS_LINKSTATE_MASK << + dd->ibcs_ls_shift) | dd->ibcs_lts_mask; + dd->ib_init = (INFINIPATH_IBCS_LT_STATE_LINKUP << + INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT) | + (INFINIPATH_IBCS_L_STATE_INIT << dd->ibcs_ls_shift); + dd->ib_arm = (INFINIPATH_IBCS_LT_STATE_LINKUP << + INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT) | + (INFINIPATH_IBCS_L_STATE_ARM << dd->ibcs_ls_shift); + dd->ib_active = (INFINIPATH_IBCS_LT_STATE_LINKUP << + INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT) | + (INFINIPATH_IBCS_L_STATE_ACTIVE << dd->ibcs_ls_shift); + + /* + * Fill in data for ibcc field-values that change in newer chips. + * We dynamically specify only the mask for LINKINITCMD + * and only the shift for LINKCMD and MAXPKTLEN, as they are + * the only ones that change. + */ + dd->ibcc_lic_mask = INFINIPATH_IBCC_LINKINITCMD_MASK; + dd->ibcc_lc_shift = INFINIPATH_IBCC_LINKCMD_SHIFT; + dd->ibcc_mpl_shift = INFINIPATH_IBCC_MAXPKTLEN_SHIFT; + + /* Fill in shifts for RcvCtrl. */ + dd->ipath_r_portenable_shift = INFINIPATH_R_PORTENABLE_SHIFT; + dd->ipath_r_intravail_shift = INFINIPATH_R_INTRAVAIL_SHIFT; + dd->ipath_r_tailupd_shift = INFINIPATH_R_TAILUPD_SHIFT; + dd->ipath_r_portcfg_shift = 0; /* Not on IBA6110 */ + + dd->ipath_i_bitsextant = + (INFINIPATH_I_RCVURG_MASK << INFINIPATH_I_RCVURG_SHIFT) | + (INFINIPATH_I_RCVAVAIL_MASK << + INFINIPATH_I_RCVAVAIL_SHIFT) | + INFINIPATH_I_ERROR | INFINIPATH_I_SPIOSENT | + INFINIPATH_I_SPIOBUFAVAIL | INFINIPATH_I_GPIO; + + dd->ipath_e_bitsextant = + INFINIPATH_E_RFORMATERR | INFINIPATH_E_RVCRC | + INFINIPATH_E_RICRC | INFINIPATH_E_RMINPKTLEN | + INFINIPATH_E_RMAXPKTLEN | INFINIPATH_E_RLONGPKTLEN | + INFINIPATH_E_RSHORTPKTLEN | INFINIPATH_E_RUNEXPCHAR | + INFINIPATH_E_RUNSUPVL | INFINIPATH_E_REBP | + INFINIPATH_E_RIBFLOW | INFINIPATH_E_RBADVERSION | + INFINIPATH_E_RRCVEGRFULL | INFINIPATH_E_RRCVHDRFULL | + INFINIPATH_E_RBADTID | INFINIPATH_E_RHDRLEN | + INFINIPATH_E_RHDR | INFINIPATH_E_RIBLOSTLINK | + INFINIPATH_E_SMINPKTLEN | INFINIPATH_E_SMAXPKTLEN | + INFINIPATH_E_SUNDERRUN | INFINIPATH_E_SPKTLEN | + INFINIPATH_E_SDROPPEDSMPPKT | INFINIPATH_E_SDROPPEDDATAPKT | + INFINIPATH_E_SPIOARMLAUNCH | INFINIPATH_E_SUNEXPERRPKTNUM | + INFINIPATH_E_SUNSUPVL | INFINIPATH_E_IBSTATUSCHANGED | + INFINIPATH_E_INVALIDADDR | INFINIPATH_E_RESET | + INFINIPATH_E_HARDWARE; + + dd->ipath_hwe_bitsextant = + (INFINIPATH_HWE_HTCMEMPARITYERR_MASK << + INFINIPATH_HWE_HTCMEMPARITYERR_SHIFT) | + (INFINIPATH_HWE_TXEMEMPARITYERR_MASK << + INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT) | + (INFINIPATH_HWE_RXEMEMPARITYERR_MASK << + INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT) | + INFINIPATH_HWE_HTCLNKABYTE0CRCERR | + INFINIPATH_HWE_HTCLNKABYTE1CRCERR | + INFINIPATH_HWE_HTCLNKBBYTE0CRCERR | + INFINIPATH_HWE_HTCLNKBBYTE1CRCERR | + INFINIPATH_HWE_HTCMISCERR4 | + INFINIPATH_HWE_HTCMISCERR5 | INFINIPATH_HWE_HTCMISCERR6 | + INFINIPATH_HWE_HTCMISCERR7 | + INFINIPATH_HWE_HTCBUSTREQPARITYERR | + INFINIPATH_HWE_HTCBUSTRESPPARITYERR | + INFINIPATH_HWE_HTCBUSIREQPARITYERR | + INFINIPATH_HWE_RXDSYNCMEMPARITYERR | + INFINIPATH_HWE_MEMBISTFAILED | + INFINIPATH_HWE_COREPLL_FBSLIP | + INFINIPATH_HWE_COREPLL_RFSLIP | + INFINIPATH_HWE_HTBPLL_FBSLIP | + INFINIPATH_HWE_HTBPLL_RFSLIP | + INFINIPATH_HWE_HTAPLL_FBSLIP | + INFINIPATH_HWE_HTAPLL_RFSLIP | + INFINIPATH_HWE_SERDESPLLFAILED | + INFINIPATH_HWE_IBCBUSTOSPCPARITYERR | + INFINIPATH_HWE_IBCBUSFRSPCPARITYERR; + + dd->ipath_i_rcvavail_mask = INFINIPATH_I_RCVAVAIL_MASK; + dd->ipath_i_rcvurg_mask = INFINIPATH_I_RCVURG_MASK; + dd->ipath_i_rcvavail_shift = INFINIPATH_I_RCVAVAIL_SHIFT; + dd->ipath_i_rcvurg_shift = INFINIPATH_I_RCVURG_SHIFT; + + /* + * EEPROM error log 0 is TXE Parity errors. 1 is RXE Parity. + * 2 is Some Misc, 3 is reserved for future. + */ + dd->ipath_eep_st_masks[0].hwerrs_to_log = + INFINIPATH_HWE_TXEMEMPARITYERR_MASK << + INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT; + + dd->ipath_eep_st_masks[1].hwerrs_to_log = + INFINIPATH_HWE_RXEMEMPARITYERR_MASK << + INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT; + + dd->ipath_eep_st_masks[2].errs_to_log = INFINIPATH_E_RESET; + + dd->delay_mult = 2; /* SDR, 4X, can't change */ + + dd->ipath_link_width_supported = IB_WIDTH_1X | IB_WIDTH_4X; + dd->ipath_link_speed_supported = IPATH_IB_SDR; + dd->ipath_link_width_enabled = IB_WIDTH_4X; + dd->ipath_link_speed_enabled = dd->ipath_link_speed_supported; + /* these can't change for this chip, so set once */ + dd->ipath_link_width_active = dd->ipath_link_width_enabled; + dd->ipath_link_speed_active = dd->ipath_link_speed_enabled; +} + +/** + * ipath_ht_init_hwerrors - enable hardware errors + * @dd: the infinipath device + * + * now that we have finished initializing everything that might reasonably + * cause a hardware error, and cleared those errors bits as they occur, + * we can enable hardware errors in the mask (potentially enabling + * freeze mode), and enable hardware errors as errors (along with + * everything else) in errormask + */ +static void ipath_ht_init_hwerrors(struct ipath_devdata *dd) +{ + ipath_err_t val; + u64 extsval; + + extsval = ipath_read_kreg64(dd, dd->ipath_kregs->kr_extstatus); + + if (!(extsval & INFINIPATH_EXTS_MEMBIST_ENDTEST)) + ipath_dev_err(dd, "MemBIST did not complete!\n"); + if (extsval & INFINIPATH_EXTS_MEMBIST_CORRECT) + ipath_dbg("MemBIST corrected\n"); + + ipath_check_htlink(dd); + + /* barring bugs, all hwerrors become interrupts, which can */ + val = -1LL; + /* don't look at crc lane1 if 8 bit */ + if (dd->ipath_flags & IPATH_8BIT_IN_HT0) + val &= ~infinipath_hwe_htclnkabyte1crcerr; + /* don't look at crc lane1 if 8 bit */ + if (dd->ipath_flags & IPATH_8BIT_IN_HT1) + val &= ~infinipath_hwe_htclnkbbyte1crcerr; + + /* + * disable RXDSYNCMEMPARITY because external serdes is unused, + * and therefore the logic will never be used or initialized, + * and uninitialized state will normally result in this error + * being asserted. Similarly for the external serdess pll + * lock signal. + */ + val &= ~(INFINIPATH_HWE_SERDESPLLFAILED | + INFINIPATH_HWE_RXDSYNCMEMPARITYERR); + + /* + * Disable MISCERR4 because of an inversion in the HT core + * logic checking for errors that cause this bit to be set. + * The errata can also cause the protocol error bit to be set + * in the HT config space linkerror register(s). + */ + val &= ~INFINIPATH_HWE_HTCMISCERR4; + + /* + * PLL ignored because unused MDIO interface has a logic problem + */ + if (dd->ipath_boardrev == 4 || dd->ipath_boardrev == 9) + val &= ~INFINIPATH_HWE_SERDESPLLFAILED; + dd->ipath_hwerrmask = val; +} + + + + +/** + * ipath_ht_bringup_serdes - bring up the serdes + * @dd: the infinipath device + */ +static int ipath_ht_bringup_serdes(struct ipath_devdata *dd) +{ + u64 val, config1; + int ret = 0, change = 0; + + ipath_dbg("Trying to bringup serdes\n"); + + if (ipath_read_kreg64(dd, dd->ipath_kregs->kr_hwerrstatus) & + INFINIPATH_HWE_SERDESPLLFAILED) + { + ipath_dbg("At start, serdes PLL failed bit set in " + "hwerrstatus, clearing and continuing\n"); + ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrclear, + INFINIPATH_HWE_SERDESPLLFAILED); + } + + val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_serdesconfig0); + config1 = ipath_read_kreg64(dd, dd->ipath_kregs->kr_serdesconfig1); + + ipath_cdbg(VERBOSE, "Initial serdes status is config0=%llx " + "config1=%llx, sstatus=%llx xgxs %llx\n", + (unsigned long long) val, (unsigned long long) config1, + (unsigned long long) + ipath_read_kreg64(dd, dd->ipath_kregs->kr_serdesstatus), + (unsigned long long) + ipath_read_kreg64(dd, dd->ipath_kregs->kr_xgxsconfig)); + + /* force reset on */ + val |= INFINIPATH_SERDC0_RESET_PLL + /* | INFINIPATH_SERDC0_RESET_MASK */ + ; + ipath_write_kreg(dd, dd->ipath_kregs->kr_serdesconfig0, val); + udelay(15); /* need pll reset set at least for a bit */ + + if (val & INFINIPATH_SERDC0_RESET_PLL) { + u64 val2 = val &= ~INFINIPATH_SERDC0_RESET_PLL; + /* set lane resets, and tx idle, during pll reset */ + val2 |= INFINIPATH_SERDC0_RESET_MASK | + INFINIPATH_SERDC0_TXIDLE; + ipath_cdbg(VERBOSE, "Clearing serdes PLL reset (writing " + "%llx)\n", (unsigned long long) val2); + ipath_write_kreg(dd, dd->ipath_kregs->kr_serdesconfig0, + val2); + /* + * be sure chip saw it + */ + val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); + /* + * need pll reset clear at least 11 usec before lane + * resets cleared; give it a few more + */ + udelay(15); + val = val2; /* for check below */ + } + + if (val & (INFINIPATH_SERDC0_RESET_PLL | + INFINIPATH_SERDC0_RESET_MASK | + INFINIPATH_SERDC0_TXIDLE)) { + val &= ~(INFINIPATH_SERDC0_RESET_PLL | + INFINIPATH_SERDC0_RESET_MASK | + INFINIPATH_SERDC0_TXIDLE); + /* clear them */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_serdesconfig0, + val); + } + + val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_xgxsconfig); + if (val & INFINIPATH_XGXS_RESET) { + /* normally true after boot */ + val &= ~INFINIPATH_XGXS_RESET; + change = 1; + } + if (((val >> INFINIPATH_XGXS_RX_POL_SHIFT) & + INFINIPATH_XGXS_RX_POL_MASK) != dd->ipath_rx_pol_inv ) { + /* need to compensate for Tx inversion in partner */ + val &= ~(INFINIPATH_XGXS_RX_POL_MASK << + INFINIPATH_XGXS_RX_POL_SHIFT); + val |= dd->ipath_rx_pol_inv << + INFINIPATH_XGXS_RX_POL_SHIFT; + change = 1; + } + if (change) + ipath_write_kreg(dd, dd->ipath_kregs->kr_xgxsconfig, val); + + val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_serdesconfig0); + + /* clear current and de-emphasis bits */ + config1 &= ~0x0ffffffff00ULL; + /* set current to 20ma */ + config1 |= 0x00000000000ULL; + /* set de-emphasis to -5.68dB */ + config1 |= 0x0cccc000000ULL; + ipath_write_kreg(dd, dd->ipath_kregs->kr_serdesconfig1, config1); + + ipath_cdbg(VERBOSE, "After setup: serdes status is config0=%llx " + "config1=%llx, sstatus=%llx xgxs %llx\n", + (unsigned long long) val, (unsigned long long) config1, + (unsigned long long) + ipath_read_kreg64(dd, dd->ipath_kregs->kr_serdesstatus), + (unsigned long long) + ipath_read_kreg64(dd, dd->ipath_kregs->kr_xgxsconfig)); + + return ret; /* for now, say we always succeeded */ +} + +/** + * ipath_ht_quiet_serdes - set serdes to txidle + * @dd: the infinipath device + * driver is being unloaded + */ +static void ipath_ht_quiet_serdes(struct ipath_devdata *dd) +{ + u64 val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_serdesconfig0); + + val |= INFINIPATH_SERDC0_TXIDLE; + ipath_dbg("Setting TxIdleEn on serdes (config0 = %llx)\n", + (unsigned long long) val); + ipath_write_kreg(dd, dd->ipath_kregs->kr_serdesconfig0, val); +} + +/** + * ipath_pe_put_tid - write a TID in chip + * @dd: the infinipath device + * @tidptr: pointer to the expected TID (in chip) to update + * @tidtype: RCVHQ_RCV_TYPE_EAGER (1) for eager, RCVHQ_RCV_TYPE_EXPECTED (0) for expected + * @pa: physical address of in memory buffer; ipath_tidinvalid if freeing + * + * This exists as a separate routine to allow for special locking etc. + * It's used for both the full cleanup on exit, as well as the normal + * setup and teardown. + */ +static void ipath_ht_put_tid(struct ipath_devdata *dd, + u64 __iomem *tidptr, u32 type, + unsigned long pa) +{ + if (!dd->ipath_kregbase) + return; + + if (pa != dd->ipath_tidinvalid) { + if (unlikely((pa & ~INFINIPATH_RT_ADDR_MASK))) { + dev_info(&dd->pcidev->dev, + "physaddr %lx has more than " + "40 bits, using only 40!!!\n", pa); + pa &= INFINIPATH_RT_ADDR_MASK; + } + if (type == RCVHQ_RCV_TYPE_EAGER) + pa |= dd->ipath_tidtemplate; + else { + /* in words (fixed, full page). */ + u64 lenvalid = PAGE_SIZE >> 2; + lenvalid <<= INFINIPATH_RT_BUFSIZE_SHIFT; + pa |= lenvalid | INFINIPATH_RT_VALID; + } + } + + writeq(pa, tidptr); +} + + +/** + * ipath_ht_clear_tid - clear all TID entries for a port, expected and eager + * @dd: the infinipath device + * @port: the port + * + * Used from ipath_close(), and at chip initialization. + */ +static void ipath_ht_clear_tids(struct ipath_devdata *dd, unsigned port) +{ + u64 __iomem *tidbase; + int i; + + if (!dd->ipath_kregbase) + return; + + ipath_cdbg(VERBOSE, "Invalidate TIDs for port %u\n", port); + + /* + * need to invalidate all of the expected TID entries for this + * port, so we don't have valid entries that might somehow get + * used (early in next use of this port, or through some bug) + */ + tidbase = (u64 __iomem *) ((char __iomem *)(dd->ipath_kregbase) + + dd->ipath_rcvtidbase + + port * dd->ipath_rcvtidcnt * + sizeof(*tidbase)); + for (i = 0; i < dd->ipath_rcvtidcnt; i++) + ipath_ht_put_tid(dd, &tidbase[i], RCVHQ_RCV_TYPE_EXPECTED, + dd->ipath_tidinvalid); + + tidbase = (u64 __iomem *) ((char __iomem *)(dd->ipath_kregbase) + + dd->ipath_rcvegrbase + + port * dd->ipath_rcvegrcnt * + sizeof(*tidbase)); + + for (i = 0; i < dd->ipath_rcvegrcnt; i++) + ipath_ht_put_tid(dd, &tidbase[i], RCVHQ_RCV_TYPE_EAGER, + dd->ipath_tidinvalid); +} + +/** + * ipath_ht_tidtemplate - setup constants for TID updates + * @dd: the infinipath device + * + * We setup stuff that we use a lot, to avoid calculating each time + */ +static void ipath_ht_tidtemplate(struct ipath_devdata *dd) +{ + dd->ipath_tidtemplate = dd->ipath_ibmaxlen >> 2; + dd->ipath_tidtemplate <<= INFINIPATH_RT_BUFSIZE_SHIFT; + dd->ipath_tidtemplate |= INFINIPATH_RT_VALID; + + /* + * work around chip errata bug 7358, by marking invalid tids + * as having max length + */ + dd->ipath_tidinvalid = (-1LL & INFINIPATH_RT_BUFSIZE_MASK) << + INFINIPATH_RT_BUFSIZE_SHIFT; +} + +static int ipath_ht_early_init(struct ipath_devdata *dd) +{ + u32 __iomem *piobuf; + u32 pioincr, val32; + int i; + + /* + * one cache line; long IB headers will spill over into received + * buffer + */ + dd->ipath_rcvhdrentsize = 16; + dd->ipath_rcvhdrsize = IPATH_DFLT_RCVHDRSIZE; + + /* + * For HT, we allocate a somewhat overly large eager buffer, + * such that we can guarantee that we can receive the largest + * packet that we can send out. To truly support a 4KB MTU, + * we need to bump this to a large value. To date, other than + * testing, we have never encountered an HCA that can really + * send 4KB MTU packets, so we do not handle that (we'll get + * errors interrupts if we ever see one). + */ + dd->ipath_rcvegrbufsize = dd->ipath_piosize2k; + + /* + * the min() check here is currently a nop, but it may not + * always be, depending on just how we do ipath_rcvegrbufsize + */ + dd->ipath_ibmaxlen = min(dd->ipath_piosize2k, + dd->ipath_rcvegrbufsize); + dd->ipath_init_ibmaxlen = dd->ipath_ibmaxlen; + ipath_ht_tidtemplate(dd); + + /* + * zero all the TID entries at startup. We do this for sanity, + * in case of a previous driver crash of some kind, and also + * because the chip powers up with these memories in an unknown + * state. Use portcnt, not cfgports, since this is for the + * full chip, not for current (possibly different) configuration + * value. + * Chip Errata bug 6447 + */ + for (val32 = 0; val32 < dd->ipath_portcnt; val32++) + ipath_ht_clear_tids(dd, val32); + + /* + * write the pbc of each buffer, to be sure it's initialized, then + * cancel all the buffers, and also abort any packets that might + * have been in flight for some reason (the latter is for driver + * unload/reload, but isn't a bad idea at first init). PIO send + * isn't enabled at this point, so there is no danger of sending + * these out on the wire. + * Chip Errata bug 6610 + */ + piobuf = (u32 __iomem *) (((char __iomem *)(dd->ipath_kregbase)) + + dd->ipath_piobufbase); + pioincr = dd->ipath_palign / sizeof(*piobuf); + for (i = 0; i < dd->ipath_piobcnt2k; i++) { + /* + * reasonable word count, just to init pbc + */ + writel(16, piobuf); + piobuf += pioincr; + } + + ipath_get_eeprom_info(dd); + if (dd->ipath_boardrev == 5) { + /* + * Later production QHT7040 has same changes as QHT7140, so + * can use GPIO interrupts. They have serial #'s starting + * with 128, rather than 112. + */ + if (dd->ipath_serial[0] == '1' && + dd->ipath_serial[1] == '2' && + dd->ipath_serial[2] == '8') + dd->ipath_flags |= IPATH_GPIO_INTR; + else { + ipath_dev_err(dd, "Unsupported InfiniPath board " + "(serial number %.16s)!\n", + dd->ipath_serial); + return 1; + } + } + + if (dd->ipath_minrev >= 4) { + /* Rev4+ reports extra errors via internal GPIO pins */ + dd->ipath_flags |= IPATH_GPIO_ERRINTRS; + dd->ipath_gpio_mask |= IPATH_GPIO_ERRINTR_MASK; + ipath_write_kreg(dd, dd->ipath_kregs->kr_gpio_mask, + dd->ipath_gpio_mask); + } + + return 0; +} + + +/** + * ipath_init_ht_get_base_info - set chip-specific flags for user code + * @dd: the infinipath device + * @kbase: ipath_base_info pointer + * + * We set the PCIE flag because the lower bandwidth on PCIe vs + * HyperTransport can affect some user packet algorithms. + */ +static int ipath_ht_get_base_info(struct ipath_portdata *pd, void *kbase) +{ + struct ipath_base_info *kinfo = kbase; + + kinfo->spi_runtime_flags |= IPATH_RUNTIME_HT | + IPATH_RUNTIME_PIO_REGSWAPPED; + + if (pd->port_dd->ipath_minrev < 4) + kinfo->spi_runtime_flags |= IPATH_RUNTIME_RCVHDR_COPY; + + return 0; +} + +static void ipath_ht_free_irq(struct ipath_devdata *dd) +{ + free_irq(dd->ipath_irq, dd); + ht_destroy_irq(dd->ipath_irq); + dd->ipath_irq = 0; + dd->ipath_intconfig = 0; +} + +static struct ipath_message_header * +ipath_ht_get_msgheader(struct ipath_devdata *dd, __le32 *rhf_addr) +{ + return (struct ipath_message_header *) + &rhf_addr[sizeof(u64) / sizeof(u32)]; +} + +static void ipath_ht_config_ports(struct ipath_devdata *dd, ushort cfgports) +{ + dd->ipath_portcnt = + ipath_read_kreg32(dd, dd->ipath_kregs->kr_portcnt); + dd->ipath_p0_rcvegrcnt = + ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvegrcnt); +} + +static void ipath_ht_read_counters(struct ipath_devdata *dd, + struct infinipath_counters *cntrs) +{ + cntrs->LBIntCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(LBIntCnt)); + cntrs->LBFlowStallCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(LBFlowStallCnt)); + cntrs->TxSDmaDescCnt = 0; + cntrs->TxUnsupVLErrCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxUnsupVLErrCnt)); + cntrs->TxDataPktCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxDataPktCnt)); + cntrs->TxFlowPktCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxFlowPktCnt)); + cntrs->TxDwordCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxDwordCnt)); + cntrs->TxLenErrCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxLenErrCnt)); + cntrs->TxMaxMinLenErrCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxMaxMinLenErrCnt)); + cntrs->TxUnderrunCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxUnderrunCnt)); + cntrs->TxFlowStallCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxFlowStallCnt)); + cntrs->TxDroppedPktCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxDroppedPktCnt)); + cntrs->RxDroppedPktCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxDroppedPktCnt)); + cntrs->RxDataPktCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxDataPktCnt)); + cntrs->RxFlowPktCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxFlowPktCnt)); + cntrs->RxDwordCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxDwordCnt)); + cntrs->RxLenErrCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxLenErrCnt)); + cntrs->RxMaxMinLenErrCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxMaxMinLenErrCnt)); + cntrs->RxICRCErrCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxICRCErrCnt)); + cntrs->RxVCRCErrCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxVCRCErrCnt)); + cntrs->RxFlowCtrlErrCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxFlowCtrlErrCnt)); + cntrs->RxBadFormatCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxBadFormatCnt)); + cntrs->RxLinkProblemCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxLinkProblemCnt)); + cntrs->RxEBPCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxEBPCnt)); + cntrs->RxLPCRCErrCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxLPCRCErrCnt)); + cntrs->RxBufOvflCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxBufOvflCnt)); + cntrs->RxTIDFullErrCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxTIDFullErrCnt)); + cntrs->RxTIDValidErrCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxTIDValidErrCnt)); + cntrs->RxPKeyMismatchCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxPKeyMismatchCnt)); + cntrs->RxP0HdrEgrOvflCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP0HdrEgrOvflCnt)); + cntrs->RxP1HdrEgrOvflCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP1HdrEgrOvflCnt)); + cntrs->RxP2HdrEgrOvflCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP2HdrEgrOvflCnt)); + cntrs->RxP3HdrEgrOvflCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP3HdrEgrOvflCnt)); + cntrs->RxP4HdrEgrOvflCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP4HdrEgrOvflCnt)); + cntrs->RxP5HdrEgrOvflCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP5HdrEgrOvflCnt)); + cntrs->RxP6HdrEgrOvflCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP6HdrEgrOvflCnt)); + cntrs->RxP7HdrEgrOvflCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP7HdrEgrOvflCnt)); + cntrs->RxP8HdrEgrOvflCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP8HdrEgrOvflCnt)); + cntrs->RxP9HdrEgrOvflCnt = 0; + cntrs->RxP10HdrEgrOvflCnt = 0; + cntrs->RxP11HdrEgrOvflCnt = 0; + cntrs->RxP12HdrEgrOvflCnt = 0; + cntrs->RxP13HdrEgrOvflCnt = 0; + cntrs->RxP14HdrEgrOvflCnt = 0; + cntrs->RxP15HdrEgrOvflCnt = 0; + cntrs->RxP16HdrEgrOvflCnt = 0; + cntrs->IBStatusChangeCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(IBStatusChangeCnt)); + cntrs->IBLinkErrRecoveryCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(IBLinkErrRecoveryCnt)); + cntrs->IBLinkDownedCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(IBLinkDownedCnt)); + cntrs->IBSymbolErrCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(IBSymbolErrCnt)); + cntrs->RxVL15DroppedPktCnt = 0; + cntrs->RxOtherLocalPhyErrCnt = 0; + cntrs->PcieRetryBufDiagQwordCnt = 0; + cntrs->ExcessBufferOvflCnt = dd->ipath_overrun_thresh_errs; + cntrs->LocalLinkIntegrityErrCnt = + (dd->ipath_flags & IPATH_GPIO_ERRINTRS) ? + dd->ipath_lli_errs : dd->ipath_lli_errors; + cntrs->RxVlErrCnt = 0; + cntrs->RxDlidFltrCnt = 0; +} + + +/* no interrupt fallback for these chips */ +static int ipath_ht_nointr_fallback(struct ipath_devdata *dd) +{ + return 0; +} + + +/* + * reset the XGXS (between serdes and IBC). Slightly less intrusive + * than resetting the IBC or external link state, and useful in some + * cases to cause some retraining. To do this right, we reset IBC + * as well. + */ +static void ipath_ht_xgxs_reset(struct ipath_devdata *dd) +{ + u64 val, prev_val; + + prev_val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_xgxsconfig); + val = prev_val | INFINIPATH_XGXS_RESET; + prev_val &= ~INFINIPATH_XGXS_RESET; /* be sure */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_control, + dd->ipath_control & ~INFINIPATH_C_LINKENABLE); + ipath_write_kreg(dd, dd->ipath_kregs->kr_xgxsconfig, val); + ipath_read_kreg32(dd, dd->ipath_kregs->kr_scratch); + ipath_write_kreg(dd, dd->ipath_kregs->kr_xgxsconfig, prev_val); + ipath_write_kreg(dd, dd->ipath_kregs->kr_control, + dd->ipath_control); +} + + +static int ipath_ht_get_ib_cfg(struct ipath_devdata *dd, int which) +{ + int ret; + + switch (which) { + case IPATH_IB_CFG_LWID: + ret = dd->ipath_link_width_active; + break; + case IPATH_IB_CFG_SPD: + ret = dd->ipath_link_speed_active; + break; + case IPATH_IB_CFG_LWID_ENB: + ret = dd->ipath_link_width_enabled; + break; + case IPATH_IB_CFG_SPD_ENB: + ret = dd->ipath_link_speed_enabled; + break; + default: + ret = -ENOTSUPP; + break; + } + return ret; +} + + +/* we assume range checking is already done, if needed */ +static int ipath_ht_set_ib_cfg(struct ipath_devdata *dd, int which, u32 val) +{ + int ret = 0; + + if (which == IPATH_IB_CFG_LWID_ENB) + dd->ipath_link_width_enabled = val; + else if (which == IPATH_IB_CFG_SPD_ENB) + dd->ipath_link_speed_enabled = val; + else + ret = -ENOTSUPP; + return ret; +} + + +static void ipath_ht_config_jint(struct ipath_devdata *dd, u16 a, u16 b) +{ +} + + +static int ipath_ht_ib_updown(struct ipath_devdata *dd, int ibup, u64 ibcs) +{ + ipath_setup_ht_setextled(dd, ipath_ib_linkstate(dd, ibcs), + ipath_ib_linktrstate(dd, ibcs)); + return 0; +} + + +/** + * ipath_init_iba6110_funcs - set up the chip-specific function pointers + * @dd: the infinipath device + * + * This is global, and is called directly at init to set up the + * chip-specific function pointers for later use. + */ +void ipath_init_iba6110_funcs(struct ipath_devdata *dd) +{ + dd->ipath_f_intrsetup = ipath_ht_intconfig; + dd->ipath_f_bus = ipath_setup_ht_config; + dd->ipath_f_reset = ipath_setup_ht_reset; + dd->ipath_f_get_boardname = ipath_ht_boardname; + dd->ipath_f_init_hwerrors = ipath_ht_init_hwerrors; + dd->ipath_f_early_init = ipath_ht_early_init; + dd->ipath_f_handle_hwerrors = ipath_ht_handle_hwerrors; + dd->ipath_f_quiet_serdes = ipath_ht_quiet_serdes; + dd->ipath_f_bringup_serdes = ipath_ht_bringup_serdes; + dd->ipath_f_clear_tids = ipath_ht_clear_tids; + dd->ipath_f_put_tid = ipath_ht_put_tid; + dd->ipath_f_cleanup = ipath_setup_ht_cleanup; + dd->ipath_f_setextled = ipath_setup_ht_setextled; + dd->ipath_f_get_base_info = ipath_ht_get_base_info; + dd->ipath_f_free_irq = ipath_ht_free_irq; + dd->ipath_f_tidtemplate = ipath_ht_tidtemplate; + dd->ipath_f_intr_fallback = ipath_ht_nointr_fallback; + dd->ipath_f_get_msgheader = ipath_ht_get_msgheader; + dd->ipath_f_config_ports = ipath_ht_config_ports; + dd->ipath_f_read_counters = ipath_ht_read_counters; + dd->ipath_f_xgxs_reset = ipath_ht_xgxs_reset; + dd->ipath_f_get_ib_cfg = ipath_ht_get_ib_cfg; + dd->ipath_f_set_ib_cfg = ipath_ht_set_ib_cfg; + dd->ipath_f_config_jint = ipath_ht_config_jint; + dd->ipath_f_ib_updown = ipath_ht_ib_updown; + + /* + * initialize chip-specific variables + */ + ipath_init_ht_variables(dd); +} diff --git a/drivers/infiniband/hw/ipath/ipath_init_chip.c b/drivers/infiniband/hw/ipath/ipath_init_chip.c new file mode 100644 index 0000000..be2a60e --- /dev/null +++ b/drivers/infiniband/hw/ipath/ipath_init_chip.c @@ -0,0 +1,1066 @@ +/* + * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved. + * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include + +#include "ipath_kernel.h" +#include "ipath_common.h" + +/* + * min buffers we want to have per port, after driver + */ +#define IPATH_MIN_USER_PORT_BUFCNT 7 + +/* + * Number of ports we are configured to use (to allow for more pio + * buffers per port, etc.) Zero means use chip value. + */ +static ushort ipath_cfgports; + +module_param_named(cfgports, ipath_cfgports, ushort, S_IRUGO); +MODULE_PARM_DESC(cfgports, "Set max number of ports to use"); + +/* + * Number of buffers reserved for driver (verbs and layered drivers.) + * Initialized based on number of PIO buffers if not set via module interface. + * The problem with this is that it's global, but we'll use different + * numbers for different chip types. + */ +static ushort ipath_kpiobufs; + +static int ipath_set_kpiobufs(const char *val, struct kernel_param *kp); + +module_param_call(kpiobufs, ipath_set_kpiobufs, param_get_ushort, + &ipath_kpiobufs, S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(kpiobufs, "Set number of PIO buffers for driver"); + +/** + * create_port0_egr - allocate the eager TID buffers + * @dd: the infinipath device + * + * This code is now quite different for user and kernel, because + * the kernel uses skb's, for the accelerated network performance. + * This is the kernel (port0) version. + * + * Allocate the eager TID buffers and program them into infinipath. + * We use the network layer alloc_skb() allocator to allocate the + * memory, and either use the buffers as is for things like verbs + * packets, or pass the buffers up to the ipath layered driver and + * thence the network layer, replacing them as we do so (see + * ipath_rcv_layer()). + */ +static int create_port0_egr(struct ipath_devdata *dd) +{ + unsigned e, egrcnt; + struct ipath_skbinfo *skbinfo; + int ret; + + egrcnt = dd->ipath_p0_rcvegrcnt; + + skbinfo = vmalloc(sizeof(*dd->ipath_port0_skbinfo) * egrcnt); + if (skbinfo == NULL) { + ipath_dev_err(dd, "allocation error for eager TID " + "skb array\n"); + ret = -ENOMEM; + goto bail; + } + for (e = 0; e < egrcnt; e++) { + /* + * This is a bit tricky in that we allocate extra + * space for 2 bytes of the 14 byte ethernet header. + * These two bytes are passed in the ipath header so + * the rest of the data is word aligned. We allocate + * 4 bytes so that the data buffer stays word aligned. + * See ipath_kreceive() for more details. + */ + skbinfo[e].skb = ipath_alloc_skb(dd, GFP_KERNEL); + if (!skbinfo[e].skb) { + ipath_dev_err(dd, "SKB allocation error for " + "eager TID %u\n", e); + while (e != 0) + dev_kfree_skb(skbinfo[--e].skb); + vfree(skbinfo); + ret = -ENOMEM; + goto bail; + } + } + /* + * After loop above, so we can test non-NULL to see if ready + * to use at receive, etc. + */ + dd->ipath_port0_skbinfo = skbinfo; + + for (e = 0; e < egrcnt; e++) { + dd->ipath_port0_skbinfo[e].phys = + ipath_map_single(dd->pcidev, + dd->ipath_port0_skbinfo[e].skb->data, + dd->ipath_ibmaxlen, PCI_DMA_FROMDEVICE); + dd->ipath_f_put_tid(dd, e + (u64 __iomem *) + ((char __iomem *) dd->ipath_kregbase + + dd->ipath_rcvegrbase), + RCVHQ_RCV_TYPE_EAGER, + dd->ipath_port0_skbinfo[e].phys); + } + + ret = 0; + +bail: + return ret; +} + +static int bringup_link(struct ipath_devdata *dd) +{ + u64 val, ibc; + int ret = 0; + + /* hold IBC in reset */ + dd->ipath_control &= ~INFINIPATH_C_LINKENABLE; + ipath_write_kreg(dd, dd->ipath_kregs->kr_control, + dd->ipath_control); + + /* + * set initial max size pkt IBC will send, including ICRC; it's the + * PIO buffer size in dwords, less 1; also see ipath_set_mtu() + */ + val = (dd->ipath_ibmaxlen >> 2) + 1; + ibc = val << dd->ibcc_mpl_shift; + + /* flowcontrolwatermark is in units of KBytes */ + ibc |= 0x5ULL << INFINIPATH_IBCC_FLOWCTRLWATERMARK_SHIFT; + /* + * How often flowctrl sent. More or less in usecs; balance against + * watermark value, so that in theory senders always get a flow + * control update in time to not let the IB link go idle. + */ + ibc |= 0x3ULL << INFINIPATH_IBCC_FLOWCTRLPERIOD_SHIFT; + /* max error tolerance */ + ibc |= 0xfULL << INFINIPATH_IBCC_PHYERRTHRESHOLD_SHIFT; + /* use "real" buffer space for */ + ibc |= 4ULL << INFINIPATH_IBCC_CREDITSCALE_SHIFT; + /* IB credit flow control. */ + ibc |= 0xfULL << INFINIPATH_IBCC_OVERRUNTHRESHOLD_SHIFT; + /* initially come up waiting for TS1, without sending anything. */ + dd->ipath_ibcctrl = ibc; + /* + * Want to start out with both LINKCMD and LINKINITCMD in NOP + * (0 and 0). Don't put linkinitcmd in ipath_ibcctrl, want that + * to stay a NOP. Flag that we are disabled, for the (unlikely) + * case that some recovery path is trying to bring the link up + * before we are ready. + */ + ibc |= INFINIPATH_IBCC_LINKINITCMD_DISABLE << + INFINIPATH_IBCC_LINKINITCMD_SHIFT; + dd->ipath_flags |= IPATH_IB_LINK_DISABLED; + ipath_cdbg(VERBOSE, "Writing 0x%llx to ibcctrl\n", + (unsigned long long) ibc); + ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl, ibc); + + // be sure chip saw it + val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); + + ret = dd->ipath_f_bringup_serdes(dd); + + if (ret) + dev_info(&dd->pcidev->dev, "Could not initialize SerDes, " + "not usable\n"); + else { + /* enable IBC */ + dd->ipath_control |= INFINIPATH_C_LINKENABLE; + ipath_write_kreg(dd, dd->ipath_kregs->kr_control, + dd->ipath_control); + } + + return ret; +} + +static struct ipath_portdata *create_portdata0(struct ipath_devdata *dd) +{ + struct ipath_portdata *pd = NULL; + + pd = kzalloc(sizeof(*pd), GFP_KERNEL); + if (pd) { + pd->port_dd = dd; + pd->port_cnt = 1; + /* The port 0 pkey table is used by the layer interface. */ + pd->port_pkeys[0] = IPATH_DEFAULT_P_KEY; + pd->port_seq_cnt = 1; + } + return pd; +} + +static int init_chip_first(struct ipath_devdata *dd) +{ + struct ipath_portdata *pd; + int ret = 0; + u64 val; + + spin_lock_init(&dd->ipath_kernel_tid_lock); + spin_lock_init(&dd->ipath_user_tid_lock); + spin_lock_init(&dd->ipath_sendctrl_lock); + spin_lock_init(&dd->ipath_uctxt_lock); + spin_lock_init(&dd->ipath_sdma_lock); + spin_lock_init(&dd->ipath_gpio_lock); + spin_lock_init(&dd->ipath_eep_st_lock); + spin_lock_init(&dd->ipath_sdepb_lock); + mutex_init(&dd->ipath_eep_lock); + + /* + * skip cfgports stuff because we are not allocating memory, + * and we don't want problems if the portcnt changed due to + * cfgports. We do still check and report a difference, if + * not same (should be impossible). + */ + dd->ipath_f_config_ports(dd, ipath_cfgports); + if (!ipath_cfgports) + dd->ipath_cfgports = dd->ipath_portcnt; + else if (ipath_cfgports <= dd->ipath_portcnt) { + dd->ipath_cfgports = ipath_cfgports; + ipath_dbg("Configured to use %u ports out of %u in chip\n", + dd->ipath_cfgports, ipath_read_kreg32(dd, + dd->ipath_kregs->kr_portcnt)); + } else { + dd->ipath_cfgports = dd->ipath_portcnt; + ipath_dbg("Tried to configured to use %u ports; chip " + "only supports %u\n", ipath_cfgports, + ipath_read_kreg32(dd, + dd->ipath_kregs->kr_portcnt)); + } + /* + * Allocate full portcnt array, rather than just cfgports, because + * cleanup iterates across all possible ports. + */ + dd->ipath_pd = kzalloc(sizeof(*dd->ipath_pd) * dd->ipath_portcnt, + GFP_KERNEL); + + if (!dd->ipath_pd) { + ipath_dev_err(dd, "Unable to allocate portdata array, " + "failing\n"); + ret = -ENOMEM; + goto done; + } + + pd = create_portdata0(dd); + if (!pd) { + ipath_dev_err(dd, "Unable to allocate portdata for port " + "0, failing\n"); + ret = -ENOMEM; + goto done; + } + dd->ipath_pd[0] = pd; + + dd->ipath_rcvtidcnt = + ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvtidcnt); + dd->ipath_rcvtidbase = + ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvtidbase); + dd->ipath_rcvegrcnt = + ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvegrcnt); + dd->ipath_rcvegrbase = + ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvegrbase); + dd->ipath_palign = + ipath_read_kreg32(dd, dd->ipath_kregs->kr_pagealign); + dd->ipath_piobufbase = + ipath_read_kreg64(dd, dd->ipath_kregs->kr_sendpiobufbase); + val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_sendpiosize); + dd->ipath_piosize2k = val & ~0U; + dd->ipath_piosize4k = val >> 32; + if (dd->ipath_piosize4k == 0 && ipath_mtu4096) + ipath_mtu4096 = 0; /* 4KB not supported by this chip */ + dd->ipath_ibmtu = ipath_mtu4096 ? 4096 : 2048; + val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_sendpiobufcnt); + dd->ipath_piobcnt2k = val & ~0U; + dd->ipath_piobcnt4k = val >> 32; + dd->ipath_pio2kbase = + (u32 __iomem *) (((char __iomem *) dd->ipath_kregbase) + + (dd->ipath_piobufbase & 0xffffffff)); + if (dd->ipath_piobcnt4k) { + dd->ipath_pio4kbase = (u32 __iomem *) + (((char __iomem *) dd->ipath_kregbase) + + (dd->ipath_piobufbase >> 32)); + /* + * 4K buffers take 2 pages; we use roundup just to be + * paranoid; we calculate it once here, rather than on + * ever buf allocate + */ + dd->ipath_4kalign = ALIGN(dd->ipath_piosize4k, + dd->ipath_palign); + ipath_dbg("%u 2k(%x) piobufs @ %p, %u 4k(%x) @ %p " + "(%x aligned)\n", + dd->ipath_piobcnt2k, dd->ipath_piosize2k, + dd->ipath_pio2kbase, dd->ipath_piobcnt4k, + dd->ipath_piosize4k, dd->ipath_pio4kbase, + dd->ipath_4kalign); + } + else ipath_dbg("%u 2k piobufs @ %p\n", + dd->ipath_piobcnt2k, dd->ipath_pio2kbase); + +done: + return ret; +} + +/** + * init_chip_reset - re-initialize after a reset, or enable + * @dd: the infinipath device + * + * sanity check at least some of the values after reset, and + * ensure no receive or transmit (explicitly, in case reset + * failed + */ +static int init_chip_reset(struct ipath_devdata *dd) +{ + u32 rtmp; + int i; + unsigned long flags; + + /* + * ensure chip does no sends or receives, tail updates, or + * pioavail updates while we re-initialize + */ + dd->ipath_rcvctrl &= ~(1ULL << dd->ipath_r_tailupd_shift); + for (i = 0; i < dd->ipath_portcnt; i++) { + clear_bit(dd->ipath_r_portenable_shift + i, + &dd->ipath_rcvctrl); + clear_bit(dd->ipath_r_intravail_shift + i, + &dd->ipath_rcvctrl); + } + ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl, + dd->ipath_rcvctrl); + + spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags); + dd->ipath_sendctrl = 0U; /* no sdma, etc */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, dd->ipath_sendctrl); + ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); + spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags); + + ipath_write_kreg(dd, dd->ipath_kregs->kr_control, 0ULL); + + rtmp = ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvtidcnt); + if (rtmp != dd->ipath_rcvtidcnt) + dev_info(&dd->pcidev->dev, "tidcnt was %u before " + "reset, now %u, using original\n", + dd->ipath_rcvtidcnt, rtmp); + rtmp = ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvtidbase); + if (rtmp != dd->ipath_rcvtidbase) + dev_info(&dd->pcidev->dev, "tidbase was %u before " + "reset, now %u, using original\n", + dd->ipath_rcvtidbase, rtmp); + rtmp = ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvegrcnt); + if (rtmp != dd->ipath_rcvegrcnt) + dev_info(&dd->pcidev->dev, "egrcnt was %u before " + "reset, now %u, using original\n", + dd->ipath_rcvegrcnt, rtmp); + rtmp = ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvegrbase); + if (rtmp != dd->ipath_rcvegrbase) + dev_info(&dd->pcidev->dev, "egrbase was %u before " + "reset, now %u, using original\n", + dd->ipath_rcvegrbase, rtmp); + + return 0; +} + +static int init_pioavailregs(struct ipath_devdata *dd) +{ + int ret; + + dd->ipath_pioavailregs_dma = dma_alloc_coherent( + &dd->pcidev->dev, PAGE_SIZE, &dd->ipath_pioavailregs_phys, + GFP_KERNEL); + if (!dd->ipath_pioavailregs_dma) { + ipath_dev_err(dd, "failed to allocate PIOavail reg area " + "in memory\n"); + ret = -ENOMEM; + goto done; + } + + /* + * we really want L2 cache aligned, but for current CPUs of + * interest, they are the same. + */ + dd->ipath_statusp = (u64 *) + ((char *)dd->ipath_pioavailregs_dma + + ((2 * L1_CACHE_BYTES + + dd->ipath_pioavregs * sizeof(u64)) & ~L1_CACHE_BYTES)); + /* copy the current value now that it's really allocated */ + *dd->ipath_statusp = dd->_ipath_status; + /* + * setup buffer to hold freeze msg, accessible to apps, + * following statusp + */ + dd->ipath_freezemsg = (char *)&dd->ipath_statusp[1]; + /* and its length */ + dd->ipath_freezelen = L1_CACHE_BYTES - sizeof(dd->ipath_statusp[0]); + + ret = 0; + +done: + return ret; +} + +/** + * init_shadow_tids - allocate the shadow TID array + * @dd: the infinipath device + * + * allocate the shadow TID array, so we can ipath_munlock previous + * entries. It may make more sense to move the pageshadow to the + * port data structure, so we only allocate memory for ports actually + * in use, since we at 8k per port, now. + */ +static void init_shadow_tids(struct ipath_devdata *dd) +{ + struct page **pages; + dma_addr_t *addrs; + + pages = vzalloc(dd->ipath_cfgports * dd->ipath_rcvtidcnt * + sizeof(struct page *)); + if (!pages) { + ipath_dev_err(dd, "failed to allocate shadow page * " + "array, no expected sends!\n"); + dd->ipath_pageshadow = NULL; + return; + } + + addrs = vmalloc(dd->ipath_cfgports * dd->ipath_rcvtidcnt * + sizeof(dma_addr_t)); + if (!addrs) { + ipath_dev_err(dd, "failed to allocate shadow dma handle " + "array, no expected sends!\n"); + vfree(pages); + dd->ipath_pageshadow = NULL; + return; + } + + dd->ipath_pageshadow = pages; + dd->ipath_physshadow = addrs; +} + +static void enable_chip(struct ipath_devdata *dd, int reinit) +{ + u32 val; + u64 rcvmask; + unsigned long flags; + int i; + + if (!reinit) + init_waitqueue_head(&ipath_state_wait); + + ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl, + dd->ipath_rcvctrl); + + spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags); + /* Enable PIO send, and update of PIOavail regs to memory. */ + dd->ipath_sendctrl = INFINIPATH_S_PIOENABLE | + INFINIPATH_S_PIOBUFAVAILUPD; + + /* + * Set the PIO avail update threshold to host memory + * on chips that support it. + */ + if (dd->ipath_pioupd_thresh) + dd->ipath_sendctrl |= dd->ipath_pioupd_thresh + << INFINIPATH_S_UPDTHRESH_SHIFT; + ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, dd->ipath_sendctrl); + ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); + spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags); + + /* + * Enable kernel ports' receive and receive interrupt. + * Other ports done as user opens and inits them. + */ + rcvmask = 1ULL; + dd->ipath_rcvctrl |= (rcvmask << dd->ipath_r_portenable_shift) | + (rcvmask << dd->ipath_r_intravail_shift); + if (!(dd->ipath_flags & IPATH_NODMA_RTAIL)) + dd->ipath_rcvctrl |= (1ULL << dd->ipath_r_tailupd_shift); + + ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl, + dd->ipath_rcvctrl); + + /* + * now ready for use. this should be cleared whenever we + * detect a reset, or initiate one. + */ + dd->ipath_flags |= IPATH_INITTED; + + /* + * Init our shadow copies of head from tail values, + * and write head values to match. + */ + val = ipath_read_ureg32(dd, ur_rcvegrindextail, 0); + ipath_write_ureg(dd, ur_rcvegrindexhead, val, 0); + + /* Initialize so we interrupt on next packet received */ + ipath_write_ureg(dd, ur_rcvhdrhead, + dd->ipath_rhdrhead_intr_off | + dd->ipath_pd[0]->port_head, 0); + + /* + * by now pioavail updates to memory should have occurred, so + * copy them into our working/shadow registers; this is in + * case something went wrong with abort, but mostly to get the + * initial values of the generation bit correct. + */ + for (i = 0; i < dd->ipath_pioavregs; i++) { + __le64 pioavail; + + /* + * Chip Errata bug 6641; even and odd qwords>3 are swapped. + */ + if (i > 3 && (dd->ipath_flags & IPATH_SWAP_PIOBUFS)) + pioavail = dd->ipath_pioavailregs_dma[i ^ 1]; + else + pioavail = dd->ipath_pioavailregs_dma[i]; + /* + * don't need to worry about ipath_pioavailkernel here + * because we will call ipath_chg_pioavailkernel() later + * in initialization, to busy out buffers as needed + */ + dd->ipath_pioavailshadow[i] = le64_to_cpu(pioavail); + } + /* can get counters, stats, etc. */ + dd->ipath_flags |= IPATH_PRESENT; +} + +static int init_housekeeping(struct ipath_devdata *dd, int reinit) +{ + char boardn[40]; + int ret = 0; + + /* + * have to clear shadow copies of registers at init that are + * not otherwise set here, or all kinds of bizarre things + * happen with driver on chip reset + */ + dd->ipath_rcvhdrsize = 0; + + /* + * Don't clear ipath_flags as 8bit mode was set before + * entering this func. However, we do set the linkstate to + * unknown, so we can watch for a transition. + * PRESENT is set because we want register reads to work, + * and the kernel infrastructure saw it in config space; + * We clear it if we have failures. + */ + dd->ipath_flags |= IPATH_LINKUNK | IPATH_PRESENT; + dd->ipath_flags &= ~(IPATH_LINKACTIVE | IPATH_LINKARMED | + IPATH_LINKDOWN | IPATH_LINKINIT); + + ipath_cdbg(VERBOSE, "Try to read spc chip revision\n"); + dd->ipath_revision = + ipath_read_kreg64(dd, dd->ipath_kregs->kr_revision); + + /* + * set up fundamental info we need to use the chip; we assume + * if the revision reg and these regs are OK, we don't need to + * special case the rest + */ + dd->ipath_sregbase = + ipath_read_kreg32(dd, dd->ipath_kregs->kr_sendregbase); + dd->ipath_cregbase = + ipath_read_kreg32(dd, dd->ipath_kregs->kr_counterregbase); + dd->ipath_uregbase = + ipath_read_kreg32(dd, dd->ipath_kregs->kr_userregbase); + ipath_cdbg(VERBOSE, "ipath_kregbase %p, sendbase %x usrbase %x, " + "cntrbase %x\n", dd->ipath_kregbase, dd->ipath_sregbase, + dd->ipath_uregbase, dd->ipath_cregbase); + if ((dd->ipath_revision & 0xffffffff) == 0xffffffff + || (dd->ipath_sregbase & 0xffffffff) == 0xffffffff + || (dd->ipath_cregbase & 0xffffffff) == 0xffffffff + || (dd->ipath_uregbase & 0xffffffff) == 0xffffffff) { + ipath_dev_err(dd, "Register read failures from chip, " + "giving up initialization\n"); + dd->ipath_flags &= ~IPATH_PRESENT; + ret = -ENODEV; + goto done; + } + + + /* clear diagctrl register, in case diags were running and crashed */ + ipath_write_kreg (dd, dd->ipath_kregs->kr_hwdiagctrl, 0); + + /* clear the initial reset flag, in case first driver load */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_errorclear, + INFINIPATH_E_RESET); + + ipath_cdbg(VERBOSE, "Revision %llx (PCI %x)\n", + (unsigned long long) dd->ipath_revision, + dd->ipath_pcirev); + + if (((dd->ipath_revision >> INFINIPATH_R_SOFTWARE_SHIFT) & + INFINIPATH_R_SOFTWARE_MASK) != IPATH_CHIP_SWVERSION) { + ipath_dev_err(dd, "Driver only handles version %d, " + "chip swversion is %d (%llx), failng\n", + IPATH_CHIP_SWVERSION, + (int)(dd->ipath_revision >> + INFINIPATH_R_SOFTWARE_SHIFT) & + INFINIPATH_R_SOFTWARE_MASK, + (unsigned long long) dd->ipath_revision); + ret = -ENOSYS; + goto done; + } + dd->ipath_majrev = (u8) ((dd->ipath_revision >> + INFINIPATH_R_CHIPREVMAJOR_SHIFT) & + INFINIPATH_R_CHIPREVMAJOR_MASK); + dd->ipath_minrev = (u8) ((dd->ipath_revision >> + INFINIPATH_R_CHIPREVMINOR_SHIFT) & + INFINIPATH_R_CHIPREVMINOR_MASK); + dd->ipath_boardrev = (u8) ((dd->ipath_revision >> + INFINIPATH_R_BOARDID_SHIFT) & + INFINIPATH_R_BOARDID_MASK); + + ret = dd->ipath_f_get_boardname(dd, boardn, sizeof boardn); + + snprintf(dd->ipath_boardversion, sizeof(dd->ipath_boardversion), + "ChipABI %u.%u, %s, InfiniPath%u %u.%u, PCI %u, " + "SW Compat %u\n", + IPATH_CHIP_VERS_MAJ, IPATH_CHIP_VERS_MIN, boardn, + (unsigned)(dd->ipath_revision >> INFINIPATH_R_ARCH_SHIFT) & + INFINIPATH_R_ARCH_MASK, + dd->ipath_majrev, dd->ipath_minrev, dd->ipath_pcirev, + (unsigned)(dd->ipath_revision >> + INFINIPATH_R_SOFTWARE_SHIFT) & + INFINIPATH_R_SOFTWARE_MASK); + + ipath_dbg("%s", dd->ipath_boardversion); + + if (ret) + goto done; + + if (reinit) + ret = init_chip_reset(dd); + else + ret = init_chip_first(dd); + +done: + return ret; +} + +static void verify_interrupt(unsigned long opaque) +{ + struct ipath_devdata *dd = (struct ipath_devdata *) opaque; + + if (!dd) + return; /* being torn down */ + + /* + * If we don't have any interrupts, let the user know and + * don't bother checking again. + */ + if (dd->ipath_int_counter == 0) { + if (!dd->ipath_f_intr_fallback(dd)) + dev_err(&dd->pcidev->dev, "No interrupts detected, " + "not usable.\n"); + else /* re-arm the timer to see if fallback works */ + mod_timer(&dd->ipath_intrchk_timer, jiffies + HZ/2); + } else + ipath_cdbg(VERBOSE, "%u interrupts at timer check\n", + dd->ipath_int_counter); +} + +/** + * ipath_init_chip - do the actual initialization sequence on the chip + * @dd: the infinipath device + * @reinit: reinitializing, so don't allocate new memory + * + * Do the actual initialization sequence on the chip. This is done + * both from the init routine called from the PCI infrastructure, and + * when we reset the chip, or detect that it was reset internally, + * or it's administratively re-enabled. + * + * Memory allocation here and in called routines is only done in + * the first case (reinit == 0). We have to be careful, because even + * without memory allocation, we need to re-write all the chip registers + * TIDs, etc. after the reset or enable has completed. + */ +int ipath_init_chip(struct ipath_devdata *dd, int reinit) +{ + int ret = 0; + u32 kpiobufs, defkbufs; + u32 piobufs, uports; + u64 val; + struct ipath_portdata *pd; + gfp_t gfp_flags = GFP_USER | __GFP_COMP; + + ret = init_housekeeping(dd, reinit); + if (ret) + goto done; + + /* + * We could bump this to allow for full rcvegrcnt + rcvtidcnt, + * but then it no longer nicely fits power of two, and since + * we now use routines that backend onto __get_free_pages, the + * rest would be wasted. + */ + dd->ipath_rcvhdrcnt = max(dd->ipath_p0_rcvegrcnt, dd->ipath_rcvegrcnt); + ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvhdrcnt, + dd->ipath_rcvhdrcnt); + + /* + * Set up the shadow copies of the piobufavail registers, + * which we compare against the chip registers for now, and + * the in memory DMA'ed copies of the registers. This has to + * be done early, before we calculate lastport, etc. + */ + piobufs = dd->ipath_piobcnt2k + dd->ipath_piobcnt4k; + /* + * calc number of pioavail registers, and save it; we have 2 + * bits per buffer. + */ + dd->ipath_pioavregs = ALIGN(piobufs, sizeof(u64) * BITS_PER_BYTE / 2) + / (sizeof(u64) * BITS_PER_BYTE / 2); + uports = dd->ipath_cfgports ? dd->ipath_cfgports - 1 : 0; + if (piobufs > 144) + defkbufs = 32 + dd->ipath_pioreserved; + else + defkbufs = 16 + dd->ipath_pioreserved; + + if (ipath_kpiobufs && (ipath_kpiobufs + + (uports * IPATH_MIN_USER_PORT_BUFCNT)) > piobufs) { + int i = (int) piobufs - + (int) (uports * IPATH_MIN_USER_PORT_BUFCNT); + if (i < 1) + i = 1; + dev_info(&dd->pcidev->dev, "Allocating %d PIO bufs of " + "%d for kernel leaves too few for %d user ports " + "(%d each); using %u\n", ipath_kpiobufs, + piobufs, uports, IPATH_MIN_USER_PORT_BUFCNT, i); + /* + * shouldn't change ipath_kpiobufs, because could be + * different for different devices... + */ + kpiobufs = i; + } else if (ipath_kpiobufs) + kpiobufs = ipath_kpiobufs; + else + kpiobufs = defkbufs; + dd->ipath_lastport_piobuf = piobufs - kpiobufs; + dd->ipath_pbufsport = + uports ? dd->ipath_lastport_piobuf / uports : 0; + /* if not an even divisor, some user ports get extra buffers */ + dd->ipath_ports_extrabuf = dd->ipath_lastport_piobuf - + (dd->ipath_pbufsport * uports); + if (dd->ipath_ports_extrabuf) + ipath_dbg("%u pbufs/port leaves some unused, add 1 buffer to " + "ports <= %u\n", dd->ipath_pbufsport, + dd->ipath_ports_extrabuf); + dd->ipath_lastpioindex = 0; + dd->ipath_lastpioindexl = dd->ipath_piobcnt2k; + /* ipath_pioavailshadow initialized earlier */ + ipath_cdbg(VERBOSE, "%d PIO bufs for kernel out of %d total %u " + "each for %u user ports\n", kpiobufs, + piobufs, dd->ipath_pbufsport, uports); + ret = dd->ipath_f_early_init(dd); + if (ret) { + ipath_dev_err(dd, "Early initialization failure\n"); + goto done; + } + + /* + * Early_init sets rcvhdrentsize and rcvhdrsize, so this must be + * done after early_init. + */ + dd->ipath_hdrqlast = + dd->ipath_rcvhdrentsize * (dd->ipath_rcvhdrcnt - 1); + ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvhdrentsize, + dd->ipath_rcvhdrentsize); + ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvhdrsize, + dd->ipath_rcvhdrsize); + + if (!reinit) { + ret = init_pioavailregs(dd); + init_shadow_tids(dd); + if (ret) + goto done; + } + + ipath_write_kreg(dd, dd->ipath_kregs->kr_sendpioavailaddr, + dd->ipath_pioavailregs_phys); + + /* + * this is to detect s/w errors, which the h/w works around by + * ignoring the low 6 bits of address, if it wasn't aligned. + */ + val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_sendpioavailaddr); + if (val != dd->ipath_pioavailregs_phys) { + ipath_dev_err(dd, "Catastrophic software error, " + "SendPIOAvailAddr written as %lx, " + "read back as %llx\n", + (unsigned long) dd->ipath_pioavailregs_phys, + (unsigned long long) val); + ret = -EINVAL; + goto done; + } + + ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvbthqp, IPATH_KD_QP); + + /* + * make sure we are not in freeze, and PIO send enabled, so + * writes to pbc happen + */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrmask, 0ULL); + ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrclear, + ~0ULL&~INFINIPATH_HWE_MEMBISTFAILED); + ipath_write_kreg(dd, dd->ipath_kregs->kr_control, 0ULL); + + /* + * before error clears, since we expect serdes pll errors during + * this, the first time after reset + */ + if (bringup_link(dd)) { + dev_info(&dd->pcidev->dev, "Failed to bringup IB link\n"); + ret = -ENETDOWN; + goto done; + } + + /* + * clear any "expected" hwerrs from reset and/or initialization + * clear any that aren't enabled (at least this once), and then + * set the enable mask + */ + dd->ipath_f_init_hwerrors(dd); + ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrclear, + ~0ULL&~INFINIPATH_HWE_MEMBISTFAILED); + ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrmask, + dd->ipath_hwerrmask); + + /* clear all */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_errorclear, -1LL); + /* enable errors that are masked, at least this first time. */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask, + ~dd->ipath_maskederrs); + dd->ipath_maskederrs = 0; /* don't re-enable ignored in timer */ + dd->ipath_errormask = + ipath_read_kreg64(dd, dd->ipath_kregs->kr_errormask); + /* clear any interrupts up to this point (ints still not enabled) */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_intclear, -1LL); + + dd->ipath_f_tidtemplate(dd); + + /* + * Set up the port 0 (kernel) rcvhdr q and egr TIDs. If doing + * re-init, the simplest way to handle this is to free + * existing, and re-allocate. + * Need to re-create rest of port 0 portdata as well. + */ + pd = dd->ipath_pd[0]; + if (reinit) { + struct ipath_portdata *npd; + + /* + * Alloc and init new ipath_portdata for port0, + * Then free old pd. Could lead to fragmentation, but also + * makes later support for hot-swap easier. + */ + npd = create_portdata0(dd); + if (npd) { + ipath_free_pddata(dd, pd); + dd->ipath_pd[0] = npd; + pd = npd; + } else { + ipath_dev_err(dd, "Unable to allocate portdata" + " for port 0, failing\n"); + ret = -ENOMEM; + goto done; + } + } + ret = ipath_create_rcvhdrq(dd, pd); + if (!ret) + ret = create_port0_egr(dd); + if (ret) { + ipath_dev_err(dd, "failed to allocate kernel port's " + "rcvhdrq and/or egr bufs\n"); + goto done; + } + else + enable_chip(dd, reinit); + + /* after enable_chip, so pioavailshadow setup */ + ipath_chg_pioavailkernel(dd, 0, piobufs, 1); + + /* + * Cancel any possible active sends from early driver load. + * Follows early_init because some chips have to initialize + * PIO buffers in early_init to avoid false parity errors. + * After enable and ipath_chg_pioavailkernel so we can safely + * enable pioavail updates and PIOENABLE; packets are now + * ready to go out. + */ + ipath_cancel_sends(dd, 1); + + if (!reinit) { + /* + * Used when we close a port, for DMA already in flight + * at close. + */ + dd->ipath_dummy_hdrq = dma_alloc_coherent( + &dd->pcidev->dev, dd->ipath_pd[0]->port_rcvhdrq_size, + &dd->ipath_dummy_hdrq_phys, + gfp_flags); + if (!dd->ipath_dummy_hdrq) { + dev_info(&dd->pcidev->dev, + "Couldn't allocate 0x%lx bytes for dummy hdrq\n", + dd->ipath_pd[0]->port_rcvhdrq_size); + /* fallback to just 0'ing */ + dd->ipath_dummy_hdrq_phys = 0UL; + } + } + + /* + * cause retrigger of pending interrupts ignored during init, + * even if we had errors + */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_intclear, 0ULL); + + if (!dd->ipath_stats_timer_active) { + /* + * first init, or after an admin disable/enable + * set up stats retrieval timer, even if we had errors + * in last portion of setup + */ + init_timer(&dd->ipath_stats_timer); + dd->ipath_stats_timer.function = ipath_get_faststats; + dd->ipath_stats_timer.data = (unsigned long) dd; + /* every 5 seconds; */ + dd->ipath_stats_timer.expires = jiffies + 5 * HZ; + /* takes ~16 seconds to overflow at full IB 4x bandwdith */ + add_timer(&dd->ipath_stats_timer); + dd->ipath_stats_timer_active = 1; + } + + /* Set up SendDMA if chip supports it */ + if (dd->ipath_flags & IPATH_HAS_SEND_DMA) + ret = setup_sdma(dd); + + /* Set up HoL state */ + init_timer(&dd->ipath_hol_timer); + dd->ipath_hol_timer.function = ipath_hol_event; + dd->ipath_hol_timer.data = (unsigned long)dd; + dd->ipath_hol_state = IPATH_HOL_UP; + +done: + if (!ret) { + *dd->ipath_statusp |= IPATH_STATUS_CHIP_PRESENT; + if (!dd->ipath_f_intrsetup(dd)) { + /* now we can enable all interrupts from the chip */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_intmask, + -1LL); + /* force re-interrupt of any pending interrupts. */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_intclear, + 0ULL); + /* chip is usable; mark it as initialized */ + *dd->ipath_statusp |= IPATH_STATUS_INITTED; + + /* + * setup to verify we get an interrupt, and fallback + * to an alternate if necessary and possible + */ + if (!reinit) { + init_timer(&dd->ipath_intrchk_timer); + dd->ipath_intrchk_timer.function = + verify_interrupt; + dd->ipath_intrchk_timer.data = + (unsigned long) dd; + } + dd->ipath_intrchk_timer.expires = jiffies + HZ/2; + add_timer(&dd->ipath_intrchk_timer); + } else + ipath_dev_err(dd, "No interrupts enabled, couldn't " + "setup interrupt address\n"); + + if (dd->ipath_cfgports > ipath_stats.sps_nports) + /* + * sps_nports is a global, so, we set it to + * the highest number of ports of any of the + * chips we find; we never decrement it, at + * least for now. Since this might have changed + * over disable/enable or prior to reset, always + * do the check and potentially adjust. + */ + ipath_stats.sps_nports = dd->ipath_cfgports; + } else + ipath_dbg("Failed (%d) to initialize chip\n", ret); + + /* if ret is non-zero, we probably should do some cleanup + here... */ + return ret; +} + +static int ipath_set_kpiobufs(const char *str, struct kernel_param *kp) +{ + struct ipath_devdata *dd; + unsigned long flags; + unsigned short val; + int ret; + + ret = ipath_parse_ushort(str, &val); + + spin_lock_irqsave(&ipath_devs_lock, flags); + + if (ret < 0) + goto bail; + + if (val == 0) { + ret = -EINVAL; + goto bail; + } + + list_for_each_entry(dd, &ipath_dev_list, ipath_list) { + if (dd->ipath_kregbase) + continue; + if (val > (dd->ipath_piobcnt2k + dd->ipath_piobcnt4k - + (dd->ipath_cfgports * + IPATH_MIN_USER_PORT_BUFCNT))) + { + ipath_dev_err( + dd, + "Allocating %d PIO bufs for kernel leaves " + "too few for %d user ports (%d each)\n", + val, dd->ipath_cfgports - 1, + IPATH_MIN_USER_PORT_BUFCNT); + ret = -EINVAL; + goto bail; + } + dd->ipath_lastport_piobuf = + dd->ipath_piobcnt2k + dd->ipath_piobcnt4k - val; + } + + ipath_kpiobufs = val; + ret = 0; +bail: + spin_unlock_irqrestore(&ipath_devs_lock, flags); + + return ret; +} diff --git a/drivers/infiniband/hw/ipath/ipath_intr.c b/drivers/infiniband/hw/ipath/ipath_intr.c new file mode 100644 index 0000000..01ba792 --- /dev/null +++ b/drivers/infiniband/hw/ipath/ipath_intr.c @@ -0,0 +1,1273 @@ +/* + * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved. + * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include + +#include "ipath_kernel.h" +#include "ipath_verbs.h" +#include "ipath_common.h" + + +/* + * Called when we might have an error that is specific to a particular + * PIO buffer, and may need to cancel that buffer, so it can be re-used. + */ +void ipath_disarm_senderrbufs(struct ipath_devdata *dd) +{ + u32 piobcnt; + unsigned long sbuf[4]; + /* + * it's possible that sendbuffererror could have bits set; might + * have already done this as a result of hardware error handling + */ + piobcnt = dd->ipath_piobcnt2k + dd->ipath_piobcnt4k; + /* read these before writing errorclear */ + sbuf[0] = ipath_read_kreg64( + dd, dd->ipath_kregs->kr_sendbuffererror); + sbuf[1] = ipath_read_kreg64( + dd, dd->ipath_kregs->kr_sendbuffererror + 1); + if (piobcnt > 128) + sbuf[2] = ipath_read_kreg64( + dd, dd->ipath_kregs->kr_sendbuffererror + 2); + if (piobcnt > 192) + sbuf[3] = ipath_read_kreg64( + dd, dd->ipath_kregs->kr_sendbuffererror + 3); + else + sbuf[3] = 0; + + if (sbuf[0] || sbuf[1] || (piobcnt > 128 && (sbuf[2] || sbuf[3]))) { + int i; + if (ipath_debug & (__IPATH_PKTDBG|__IPATH_DBG) && + time_after(dd->ipath_lastcancel, jiffies)) { + __IPATH_DBG_WHICH(__IPATH_PKTDBG|__IPATH_DBG, + "SendbufErrs %lx %lx", sbuf[0], + sbuf[1]); + if (ipath_debug & __IPATH_PKTDBG && piobcnt > 128) + printk(" %lx %lx ", sbuf[2], sbuf[3]); + printk("\n"); + } + + for (i = 0; i < piobcnt; i++) + if (test_bit(i, sbuf)) + ipath_disarm_piobufs(dd, i, 1); + /* ignore armlaunch errs for a bit */ + dd->ipath_lastcancel = jiffies+3; + } +} + + +/* These are all rcv-related errors which we want to count for stats */ +#define E_SUM_PKTERRS \ + (INFINIPATH_E_RHDRLEN | INFINIPATH_E_RBADTID | \ + INFINIPATH_E_RBADVERSION | INFINIPATH_E_RHDR | \ + INFINIPATH_E_RLONGPKTLEN | INFINIPATH_E_RSHORTPKTLEN | \ + INFINIPATH_E_RMAXPKTLEN | INFINIPATH_E_RMINPKTLEN | \ + INFINIPATH_E_RFORMATERR | INFINIPATH_E_RUNSUPVL | \ + INFINIPATH_E_RUNEXPCHAR | INFINIPATH_E_REBP) + +/* These are all send-related errors which we want to count for stats */ +#define E_SUM_ERRS \ + (INFINIPATH_E_SPIOARMLAUNCH | INFINIPATH_E_SUNEXPERRPKTNUM | \ + INFINIPATH_E_SDROPPEDDATAPKT | INFINIPATH_E_SDROPPEDSMPPKT | \ + INFINIPATH_E_SMAXPKTLEN | INFINIPATH_E_SUNSUPVL | \ + INFINIPATH_E_SMINPKTLEN | INFINIPATH_E_SPKTLEN | \ + INFINIPATH_E_INVALIDADDR) + +/* + * this is similar to E_SUM_ERRS, but can't ignore armlaunch, don't ignore + * errors not related to freeze and cancelling buffers. Can't ignore + * armlaunch because could get more while still cleaning up, and need + * to cancel those as they happen. + */ +#define E_SPKT_ERRS_IGNORE \ + (INFINIPATH_E_SDROPPEDDATAPKT | INFINIPATH_E_SDROPPEDSMPPKT | \ + INFINIPATH_E_SMAXPKTLEN | INFINIPATH_E_SMINPKTLEN | \ + INFINIPATH_E_SPKTLEN) + +/* + * these are errors that can occur when the link changes state while + * a packet is being sent or received. This doesn't cover things + * like EBP or VCRC that can be the result of a sending having the + * link change state, so we receive a "known bad" packet. + */ +#define E_SUM_LINK_PKTERRS \ + (INFINIPATH_E_SDROPPEDDATAPKT | INFINIPATH_E_SDROPPEDSMPPKT | \ + INFINIPATH_E_SMINPKTLEN | INFINIPATH_E_SPKTLEN | \ + INFINIPATH_E_RSHORTPKTLEN | INFINIPATH_E_RMINPKTLEN | \ + INFINIPATH_E_RUNEXPCHAR) + +static u64 handle_e_sum_errs(struct ipath_devdata *dd, ipath_err_t errs) +{ + u64 ignore_this_time = 0; + + ipath_disarm_senderrbufs(dd); + if ((errs & E_SUM_LINK_PKTERRS) && + !(dd->ipath_flags & IPATH_LINKACTIVE)) { + /* + * This can happen when SMA is trying to bring the link + * up, but the IB link changes state at the "wrong" time. + * The IB logic then complains that the packet isn't + * valid. We don't want to confuse people, so we just + * don't print them, except at debug + */ + ipath_dbg("Ignoring packet errors %llx, because link not " + "ACTIVE\n", (unsigned long long) errs); + ignore_this_time = errs & E_SUM_LINK_PKTERRS; + } + + return ignore_this_time; +} + +/* generic hw error messages... */ +#define INFINIPATH_HWE_TXEMEMPARITYERR_MSG(a) \ + { \ + .mask = ( INFINIPATH_HWE_TXEMEMPARITYERR_##a << \ + INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT ), \ + .msg = "TXE " #a " Memory Parity" \ + } +#define INFINIPATH_HWE_RXEMEMPARITYERR_MSG(a) \ + { \ + .mask = ( INFINIPATH_HWE_RXEMEMPARITYERR_##a << \ + INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT ), \ + .msg = "RXE " #a " Memory Parity" \ + } + +static const struct ipath_hwerror_msgs ipath_generic_hwerror_msgs[] = { + INFINIPATH_HWE_MSG(IBCBUSFRSPCPARITYERR, "IPATH2IB Parity"), + INFINIPATH_HWE_MSG(IBCBUSTOSPCPARITYERR, "IB2IPATH Parity"), + + INFINIPATH_HWE_TXEMEMPARITYERR_MSG(PIOBUF), + INFINIPATH_HWE_TXEMEMPARITYERR_MSG(PIOPBC), + INFINIPATH_HWE_TXEMEMPARITYERR_MSG(PIOLAUNCHFIFO), + + INFINIPATH_HWE_RXEMEMPARITYERR_MSG(RCVBUF), + INFINIPATH_HWE_RXEMEMPARITYERR_MSG(LOOKUPQ), + INFINIPATH_HWE_RXEMEMPARITYERR_MSG(EAGERTID), + INFINIPATH_HWE_RXEMEMPARITYERR_MSG(EXPTID), + INFINIPATH_HWE_RXEMEMPARITYERR_MSG(FLAGBUF), + INFINIPATH_HWE_RXEMEMPARITYERR_MSG(DATAINFO), + INFINIPATH_HWE_RXEMEMPARITYERR_MSG(HDRINFO), +}; + +/** + * ipath_format_hwmsg - format a single hwerror message + * @msg message buffer + * @msgl length of message buffer + * @hwmsg message to add to message buffer + */ +static void ipath_format_hwmsg(char *msg, size_t msgl, const char *hwmsg) +{ + strlcat(msg, "[", msgl); + strlcat(msg, hwmsg, msgl); + strlcat(msg, "]", msgl); +} + +/** + * ipath_format_hwerrors - format hardware error messages for display + * @hwerrs hardware errors bit vector + * @hwerrmsgs hardware error descriptions + * @nhwerrmsgs number of hwerrmsgs + * @msg message buffer + * @msgl message buffer length + */ +void ipath_format_hwerrors(u64 hwerrs, + const struct ipath_hwerror_msgs *hwerrmsgs, + size_t nhwerrmsgs, + char *msg, size_t msgl) +{ + int i; + const int glen = + ARRAY_SIZE(ipath_generic_hwerror_msgs); + + for (i=0; iib_init) + ret = "Init"; + else if (state == dd->ib_arm) + ret = "Arm"; + else if (state == dd->ib_active) + ret = "Active"; + else + ret = "Down"; + return ret; +} + +void signal_ib_event(struct ipath_devdata *dd, enum ib_event_type ev) +{ + struct ib_event event; + + event.device = &dd->verbs_dev->ibdev; + event.element.port_num = 1; + event.event = ev; + ib_dispatch_event(&event); +} + +static void handle_e_ibstatuschanged(struct ipath_devdata *dd, + ipath_err_t errs) +{ + u32 ltstate, lstate, ibstate, lastlstate; + u32 init = dd->ib_init; + u32 arm = dd->ib_arm; + u32 active = dd->ib_active; + const u64 ibcs = ipath_read_kreg64(dd, dd->ipath_kregs->kr_ibcstatus); + + lstate = ipath_ib_linkstate(dd, ibcs); /* linkstate */ + ibstate = ipath_ib_state(dd, ibcs); + /* linkstate at last interrupt */ + lastlstate = ipath_ib_linkstate(dd, dd->ipath_lastibcstat); + ltstate = ipath_ib_linktrstate(dd, ibcs); /* linktrainingtate */ + + /* + * Since going into a recovery state causes the link state to go + * down and since recovery is transitory, it is better if we "miss" + * ever seeing the link training state go into recovery (i.e., + * ignore this transition for link state special handling purposes) + * without even updating ipath_lastibcstat. + */ + if ((ltstate == INFINIPATH_IBCS_LT_STATE_RECOVERRETRAIN) || + (ltstate == INFINIPATH_IBCS_LT_STATE_RECOVERWAITRMT) || + (ltstate == INFINIPATH_IBCS_LT_STATE_RECOVERIDLE)) + goto done; + + /* + * if linkstate transitions into INIT from any of the various down + * states, or if it transitions from any of the up (INIT or better) + * states into any of the down states (except link recovery), then + * call the chip-specific code to take appropriate actions. + */ + if (lstate >= INFINIPATH_IBCS_L_STATE_INIT && + lastlstate == INFINIPATH_IBCS_L_STATE_DOWN) { + /* transitioned to UP */ + if (dd->ipath_f_ib_updown(dd, 1, ibcs)) { + /* link came up, so we must no longer be disabled */ + dd->ipath_flags &= ~IPATH_IB_LINK_DISABLED; + ipath_cdbg(LINKVERB, "LinkUp handled, skipped\n"); + goto skip_ibchange; /* chip-code handled */ + } + } else if ((lastlstate >= INFINIPATH_IBCS_L_STATE_INIT || + (dd->ipath_flags & IPATH_IB_FORCE_NOTIFY)) && + ltstate <= INFINIPATH_IBCS_LT_STATE_CFGWAITRMT && + ltstate != INFINIPATH_IBCS_LT_STATE_LINKUP) { + int handled; + handled = dd->ipath_f_ib_updown(dd, 0, ibcs); + dd->ipath_flags &= ~IPATH_IB_FORCE_NOTIFY; + if (handled) { + ipath_cdbg(LINKVERB, "LinkDown handled, skipped\n"); + goto skip_ibchange; /* chip-code handled */ + } + } + + /* + * Significant enough to always print and get into logs, if it was + * unexpected. If it was a requested state change, we'll have + * already cleared the flags, so we won't print this warning + */ + if ((ibstate != arm && ibstate != active) && + (dd->ipath_flags & (IPATH_LINKARMED | IPATH_LINKACTIVE))) { + dev_info(&dd->pcidev->dev, "Link state changed from %s " + "to %s\n", (dd->ipath_flags & IPATH_LINKARMED) ? + "ARM" : "ACTIVE", ib_linkstate(dd, ibcs)); + } + + if (ltstate == INFINIPATH_IBCS_LT_STATE_POLLACTIVE || + ltstate == INFINIPATH_IBCS_LT_STATE_POLLQUIET) { + u32 lastlts; + lastlts = ipath_ib_linktrstate(dd, dd->ipath_lastibcstat); + /* + * Ignore cycling back and forth from Polling.Active to + * Polling.Quiet while waiting for the other end of the link + * to come up, except to try and decide if we are connected + * to a live IB device or not. We will cycle back and + * forth between them if no cable is plugged in, the other + * device is powered off or disabled, etc. + */ + if (lastlts == INFINIPATH_IBCS_LT_STATE_POLLACTIVE || + lastlts == INFINIPATH_IBCS_LT_STATE_POLLQUIET) { + if (!(dd->ipath_flags & IPATH_IB_AUTONEG_INPROG) && + (++dd->ipath_ibpollcnt == 40)) { + dd->ipath_flags |= IPATH_NOCABLE; + *dd->ipath_statusp |= + IPATH_STATUS_IB_NOCABLE; + ipath_cdbg(LINKVERB, "Set NOCABLE\n"); + } + ipath_cdbg(LINKVERB, "POLL change to %s (%x)\n", + ipath_ibcstatus_str[ltstate], ibstate); + goto skip_ibchange; + } + } + + dd->ipath_ibpollcnt = 0; /* not poll*, now */ + ipath_stats.sps_iblink++; + + if (ibstate != init && dd->ipath_lastlinkrecov && ipath_linkrecovery) { + u64 linkrecov; + linkrecov = ipath_snap_cntr(dd, + dd->ipath_cregs->cr_iblinkerrrecovcnt); + if (linkrecov != dd->ipath_lastlinkrecov) { + ipath_dbg("IB linkrecov up %Lx (%s %s) recov %Lu\n", + (unsigned long long) ibcs, + ib_linkstate(dd, ibcs), + ipath_ibcstatus_str[ltstate], + (unsigned long long) linkrecov); + /* and no more until active again */ + dd->ipath_lastlinkrecov = 0; + ipath_set_linkstate(dd, IPATH_IB_LINKDOWN); + goto skip_ibchange; + } + } + + if (ibstate == init || ibstate == arm || ibstate == active) { + *dd->ipath_statusp &= ~IPATH_STATUS_IB_NOCABLE; + if (ibstate == init || ibstate == arm) { + *dd->ipath_statusp &= ~IPATH_STATUS_IB_READY; + if (dd->ipath_flags & IPATH_LINKACTIVE) + signal_ib_event(dd, IB_EVENT_PORT_ERR); + } + if (ibstate == arm) { + dd->ipath_flags |= IPATH_LINKARMED; + dd->ipath_flags &= ~(IPATH_LINKUNK | + IPATH_LINKINIT | IPATH_LINKDOWN | + IPATH_LINKACTIVE | IPATH_NOCABLE); + ipath_hol_down(dd); + } else if (ibstate == init) { + /* + * set INIT and DOWN. Down is checked by + * most of the other code, but INIT is + * useful to know in a few places. + */ + dd->ipath_flags |= IPATH_LINKINIT | + IPATH_LINKDOWN; + dd->ipath_flags &= ~(IPATH_LINKUNK | + IPATH_LINKARMED | IPATH_LINKACTIVE | + IPATH_NOCABLE); + ipath_hol_down(dd); + } else { /* active */ + dd->ipath_lastlinkrecov = ipath_snap_cntr(dd, + dd->ipath_cregs->cr_iblinkerrrecovcnt); + *dd->ipath_statusp |= + IPATH_STATUS_IB_READY | IPATH_STATUS_IB_CONF; + dd->ipath_flags |= IPATH_LINKACTIVE; + dd->ipath_flags &= ~(IPATH_LINKUNK | IPATH_LINKINIT + | IPATH_LINKDOWN | IPATH_LINKARMED | + IPATH_NOCABLE); + if (dd->ipath_flags & IPATH_HAS_SEND_DMA) + ipath_restart_sdma(dd); + signal_ib_event(dd, IB_EVENT_PORT_ACTIVE); + /* LED active not handled in chip _f_updown */ + dd->ipath_f_setextled(dd, lstate, ltstate); + ipath_hol_up(dd); + } + + /* + * print after we've already done the work, so as not to + * delay the state changes and notifications, for debugging + */ + if (lstate == lastlstate) + ipath_cdbg(LINKVERB, "Unchanged from last: %s " + "(%x)\n", ib_linkstate(dd, ibcs), ibstate); + else + ipath_cdbg(VERBOSE, "Unit %u: link up to %s %s (%x)\n", + dd->ipath_unit, ib_linkstate(dd, ibcs), + ipath_ibcstatus_str[ltstate], ibstate); + } else { /* down */ + if (dd->ipath_flags & IPATH_LINKACTIVE) + signal_ib_event(dd, IB_EVENT_PORT_ERR); + dd->ipath_flags |= IPATH_LINKDOWN; + dd->ipath_flags &= ~(IPATH_LINKUNK | IPATH_LINKINIT + | IPATH_LINKACTIVE | + IPATH_LINKARMED); + *dd->ipath_statusp &= ~IPATH_STATUS_IB_READY; + dd->ipath_lli_counter = 0; + + if (lastlstate != INFINIPATH_IBCS_L_STATE_DOWN) + ipath_cdbg(VERBOSE, "Unit %u link state down " + "(state 0x%x), from %s\n", + dd->ipath_unit, lstate, + ib_linkstate(dd, dd->ipath_lastibcstat)); + else + ipath_cdbg(LINKVERB, "Unit %u link state changed " + "to %s (0x%x) from down (%x)\n", + dd->ipath_unit, + ipath_ibcstatus_str[ltstate], + ibstate, lastlstate); + } + +skip_ibchange: + dd->ipath_lastibcstat = ibcs; +done: + return; +} + +static void handle_supp_msgs(struct ipath_devdata *dd, + unsigned supp_msgs, char *msg, u32 msgsz) +{ + /* + * Print the message unless it's ibc status change only, which + * happens so often we never want to count it. + */ + if (dd->ipath_lasterror & ~INFINIPATH_E_IBSTATUSCHANGED) { + int iserr; + ipath_err_t mask; + iserr = ipath_decode_err(dd, msg, msgsz, + dd->ipath_lasterror & + ~INFINIPATH_E_IBSTATUSCHANGED); + + mask = INFINIPATH_E_RRCVEGRFULL | INFINIPATH_E_RRCVHDRFULL | + INFINIPATH_E_PKTERRS | INFINIPATH_E_SDMADISABLED; + + /* if we're in debug, then don't mask SDMADISABLED msgs */ + if (ipath_debug & __IPATH_DBG) + mask &= ~INFINIPATH_E_SDMADISABLED; + + if (dd->ipath_lasterror & ~mask) + ipath_dev_err(dd, "Suppressed %u messages for " + "fast-repeating errors (%s) (%llx)\n", + supp_msgs, msg, + (unsigned long long) + dd->ipath_lasterror); + else { + /* + * rcvegrfull and rcvhdrqfull are "normal", for some + * types of processes (mostly benchmarks) that send + * huge numbers of messages, while not processing + * them. So only complain about these at debug + * level. + */ + if (iserr) + ipath_dbg("Suppressed %u messages for %s\n", + supp_msgs, msg); + else + ipath_cdbg(ERRPKT, + "Suppressed %u messages for %s\n", + supp_msgs, msg); + } + } +} + +static unsigned handle_frequent_errors(struct ipath_devdata *dd, + ipath_err_t errs, char *msg, + u32 msgsz, int *noprint) +{ + unsigned long nc; + static unsigned long nextmsg_time; + static unsigned nmsgs, supp_msgs; + + /* + * Throttle back "fast" messages to no more than 10 per 5 seconds. + * This isn't perfect, but it's a reasonable heuristic. If we get + * more than 10, give a 6x longer delay. + */ + nc = jiffies; + if (nmsgs > 10) { + if (time_before(nc, nextmsg_time)) { + *noprint = 1; + if (!supp_msgs++) + nextmsg_time = nc + HZ * 3; + } + else if (supp_msgs) { + handle_supp_msgs(dd, supp_msgs, msg, msgsz); + supp_msgs = 0; + nmsgs = 0; + } + } + else if (!nmsgs++ || time_after(nc, nextmsg_time)) + nextmsg_time = nc + HZ / 2; + + return supp_msgs; +} + +static void handle_sdma_errors(struct ipath_devdata *dd, ipath_err_t errs) +{ + unsigned long flags; + int expected; + + if (ipath_debug & __IPATH_DBG) { + char msg[128]; + ipath_decode_err(dd, msg, sizeof msg, errs & + INFINIPATH_E_SDMAERRS); + ipath_dbg("errors %lx (%s)\n", (unsigned long)errs, msg); + } + if (ipath_debug & __IPATH_VERBDBG) { + unsigned long tl, hd, status, lengen; + tl = ipath_read_kreg64(dd, dd->ipath_kregs->kr_senddmatail); + hd = ipath_read_kreg64(dd, dd->ipath_kregs->kr_senddmahead); + status = ipath_read_kreg64(dd + , dd->ipath_kregs->kr_senddmastatus); + lengen = ipath_read_kreg64(dd, + dd->ipath_kregs->kr_senddmalengen); + ipath_cdbg(VERBOSE, "sdma tl 0x%lx hd 0x%lx status 0x%lx " + "lengen 0x%lx\n", tl, hd, status, lengen); + } + + spin_lock_irqsave(&dd->ipath_sdma_lock, flags); + __set_bit(IPATH_SDMA_DISABLED, &dd->ipath_sdma_status); + expected = test_bit(IPATH_SDMA_ABORTING, &dd->ipath_sdma_status); + spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags); + if (!expected) + ipath_cancel_sends(dd, 1); +} + +static void handle_sdma_intr(struct ipath_devdata *dd, u64 istat) +{ + unsigned long flags; + int expected; + + if ((istat & INFINIPATH_I_SDMAINT) && + !test_bit(IPATH_SDMA_SHUTDOWN, &dd->ipath_sdma_status)) + ipath_sdma_intr(dd); + + if (istat & INFINIPATH_I_SDMADISABLED) { + expected = test_bit(IPATH_SDMA_ABORTING, + &dd->ipath_sdma_status); + ipath_dbg("%s SDmaDisabled intr\n", + expected ? "expected" : "unexpected"); + spin_lock_irqsave(&dd->ipath_sdma_lock, flags); + __set_bit(IPATH_SDMA_DISABLED, &dd->ipath_sdma_status); + spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags); + if (!expected) + ipath_cancel_sends(dd, 1); + if (!test_bit(IPATH_SDMA_SHUTDOWN, &dd->ipath_sdma_status)) + tasklet_hi_schedule(&dd->ipath_sdma_abort_task); + } +} + +static int handle_hdrq_full(struct ipath_devdata *dd) +{ + int chkerrpkts = 0; + u32 hd, tl; + u32 i; + + ipath_stats.sps_hdrqfull++; + for (i = 0; i < dd->ipath_cfgports; i++) { + struct ipath_portdata *pd = dd->ipath_pd[i]; + + if (i == 0) { + /* + * For kernel receive queues, we just want to know + * if there are packets in the queue that we can + * process. + */ + if (pd->port_head != ipath_get_hdrqtail(pd)) + chkerrpkts |= 1 << i; + continue; + } + + /* Skip if user context is not open */ + if (!pd || !pd->port_cnt) + continue; + + /* Don't report the same point multiple times. */ + if (dd->ipath_flags & IPATH_NODMA_RTAIL) + tl = ipath_read_ureg32(dd, ur_rcvhdrtail, i); + else + tl = ipath_get_rcvhdrtail(pd); + if (tl == pd->port_lastrcvhdrqtail) + continue; + + hd = ipath_read_ureg32(dd, ur_rcvhdrhead, i); + if (hd == (tl + 1) || (!hd && tl == dd->ipath_hdrqlast)) { + pd->port_lastrcvhdrqtail = tl; + pd->port_hdrqfull++; + /* flush hdrqfull so that poll() sees it */ + wmb(); + wake_up_interruptible(&pd->port_wait); + } + } + + return chkerrpkts; +} + +static int handle_errors(struct ipath_devdata *dd, ipath_err_t errs) +{ + char msg[128]; + u64 ignore_this_time = 0; + u64 iserr = 0; + int chkerrpkts = 0, noprint = 0; + unsigned supp_msgs; + int log_idx; + + /* + * don't report errors that are masked, either at init + * (not set in ipath_errormask), or temporarily (set in + * ipath_maskederrs) + */ + errs &= dd->ipath_errormask & ~dd->ipath_maskederrs; + + supp_msgs = handle_frequent_errors(dd, errs, msg, (u32)sizeof msg, + &noprint); + + /* do these first, they are most important */ + if (errs & INFINIPATH_E_HARDWARE) { + /* reuse same msg buf */ + dd->ipath_f_handle_hwerrors(dd, msg, sizeof msg); + } else { + u64 mask; + for (log_idx = 0; log_idx < IPATH_EEP_LOG_CNT; ++log_idx) { + mask = dd->ipath_eep_st_masks[log_idx].errs_to_log; + if (errs & mask) + ipath_inc_eeprom_err(dd, log_idx, 1); + } + } + + if (errs & INFINIPATH_E_SDMAERRS) + handle_sdma_errors(dd, errs); + + if (!noprint && (errs & ~dd->ipath_e_bitsextant)) + ipath_dev_err(dd, "error interrupt with unknown errors " + "%llx set\n", (unsigned long long) + (errs & ~dd->ipath_e_bitsextant)); + + if (errs & E_SUM_ERRS) + ignore_this_time = handle_e_sum_errs(dd, errs); + else if ((errs & E_SUM_LINK_PKTERRS) && + !(dd->ipath_flags & IPATH_LINKACTIVE)) { + /* + * This can happen when SMA is trying to bring the link + * up, but the IB link changes state at the "wrong" time. + * The IB logic then complains that the packet isn't + * valid. We don't want to confuse people, so we just + * don't print them, except at debug + */ + ipath_dbg("Ignoring packet errors %llx, because link not " + "ACTIVE\n", (unsigned long long) errs); + ignore_this_time = errs & E_SUM_LINK_PKTERRS; + } + + if (supp_msgs == 250000) { + int s_iserr; + /* + * It's not entirely reasonable assuming that the errors set + * in the last clear period are all responsible for the + * problem, but the alternative is to assume it's the only + * ones on this particular interrupt, which also isn't great + */ + dd->ipath_maskederrs |= dd->ipath_lasterror | errs; + + dd->ipath_errormask &= ~dd->ipath_maskederrs; + ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask, + dd->ipath_errormask); + s_iserr = ipath_decode_err(dd, msg, sizeof msg, + dd->ipath_maskederrs); + + if (dd->ipath_maskederrs & + ~(INFINIPATH_E_RRCVEGRFULL | + INFINIPATH_E_RRCVHDRFULL | INFINIPATH_E_PKTERRS)) + ipath_dev_err(dd, "Temporarily disabling " + "error(s) %llx reporting; too frequent (%s)\n", + (unsigned long long) dd->ipath_maskederrs, + msg); + else { + /* + * rcvegrfull and rcvhdrqfull are "normal", + * for some types of processes (mostly benchmarks) + * that send huge numbers of messages, while not + * processing them. So only complain about + * these at debug level. + */ + if (s_iserr) + ipath_dbg("Temporarily disabling reporting " + "too frequent queue full errors (%s)\n", + msg); + else + ipath_cdbg(ERRPKT, + "Temporarily disabling reporting too" + " frequent packet errors (%s)\n", + msg); + } + + /* + * Re-enable the masked errors after around 3 minutes. in + * ipath_get_faststats(). If we have a series of fast + * repeating but different errors, the interval will keep + * stretching out, but that's OK, as that's pretty + * catastrophic. + */ + dd->ipath_unmasktime = jiffies + HZ * 180; + } + + ipath_write_kreg(dd, dd->ipath_kregs->kr_errorclear, errs); + if (ignore_this_time) + errs &= ~ignore_this_time; + if (errs & ~dd->ipath_lasterror) { + errs &= ~dd->ipath_lasterror; + /* never suppress duplicate hwerrors or ibstatuschange */ + dd->ipath_lasterror |= errs & + ~(INFINIPATH_E_HARDWARE | + INFINIPATH_E_IBSTATUSCHANGED); + } + + if (errs & INFINIPATH_E_SENDSPECIALTRIGGER) { + dd->ipath_spectriggerhit++; + ipath_dbg("%lu special trigger hits\n", + dd->ipath_spectriggerhit); + } + + /* likely due to cancel; so suppress message unless verbose */ + if ((errs & (INFINIPATH_E_SPKTLEN | INFINIPATH_E_SPIOARMLAUNCH)) && + time_after(dd->ipath_lastcancel, jiffies)) { + /* armlaunch takes precedence; it often causes both. */ + ipath_cdbg(VERBOSE, + "Suppressed %s error (%llx) after sendbuf cancel\n", + (errs & INFINIPATH_E_SPIOARMLAUNCH) ? + "armlaunch" : "sendpktlen", (unsigned long long)errs); + errs &= ~(INFINIPATH_E_SPIOARMLAUNCH | INFINIPATH_E_SPKTLEN); + } + + if (!errs) + return 0; + + if (!noprint) { + ipath_err_t mask; + /* + * The ones we mask off are handled specially below + * or above. Also mask SDMADISABLED by default as it + * is too chatty. + */ + mask = INFINIPATH_E_IBSTATUSCHANGED | + INFINIPATH_E_RRCVEGRFULL | INFINIPATH_E_RRCVHDRFULL | + INFINIPATH_E_HARDWARE | INFINIPATH_E_SDMADISABLED; + + /* if we're in debug, then don't mask SDMADISABLED msgs */ + if (ipath_debug & __IPATH_DBG) + mask &= ~INFINIPATH_E_SDMADISABLED; + + ipath_decode_err(dd, msg, sizeof msg, errs & ~mask); + } else + /* so we don't need if (!noprint) at strlcat's below */ + *msg = 0; + + if (errs & E_SUM_PKTERRS) { + ipath_stats.sps_pkterrs++; + chkerrpkts = 1; + } + if (errs & E_SUM_ERRS) + ipath_stats.sps_errs++; + + if (errs & (INFINIPATH_E_RICRC | INFINIPATH_E_RVCRC)) { + ipath_stats.sps_crcerrs++; + chkerrpkts = 1; + } + iserr = errs & ~(E_SUM_PKTERRS | INFINIPATH_E_PKTERRS); + + + /* + * We don't want to print these two as they happen, or we can make + * the situation even worse, because it takes so long to print + * messages to serial consoles. Kernel ports get printed from + * fast_stats, no more than every 5 seconds, user ports get printed + * on close + */ + if (errs & INFINIPATH_E_RRCVHDRFULL) + chkerrpkts |= handle_hdrq_full(dd); + if (errs & INFINIPATH_E_RRCVEGRFULL) { + struct ipath_portdata *pd = dd->ipath_pd[0]; + + /* + * since this is of less importance and not likely to + * happen without also getting hdrfull, only count + * occurrences; don't check each port (or even the kernel + * vs user) + */ + ipath_stats.sps_etidfull++; + if (pd->port_head != ipath_get_hdrqtail(pd)) + chkerrpkts |= 1; + } + + /* + * do this before IBSTATUSCHANGED, in case both bits set in a single + * interrupt; we want the STATUSCHANGE to "win", so we do our + * internal copy of state machine correctly + */ + if (errs & INFINIPATH_E_RIBLOSTLINK) { + /* + * force through block below + */ + errs |= INFINIPATH_E_IBSTATUSCHANGED; + ipath_stats.sps_iblink++; + dd->ipath_flags |= IPATH_LINKDOWN; + dd->ipath_flags &= ~(IPATH_LINKUNK | IPATH_LINKINIT + | IPATH_LINKARMED | IPATH_LINKACTIVE); + *dd->ipath_statusp &= ~IPATH_STATUS_IB_READY; + + ipath_dbg("Lost link, link now down (%s)\n", + ipath_ibcstatus_str[ipath_read_kreg64(dd, + dd->ipath_kregs->kr_ibcstatus) & 0xf]); + } + if (errs & INFINIPATH_E_IBSTATUSCHANGED) + handle_e_ibstatuschanged(dd, errs); + + if (errs & INFINIPATH_E_RESET) { + if (!noprint) + ipath_dev_err(dd, "Got reset, requires re-init " + "(unload and reload driver)\n"); + dd->ipath_flags &= ~IPATH_INITTED; /* needs re-init */ + /* mark as having had error */ + *dd->ipath_statusp |= IPATH_STATUS_HWERROR; + *dd->ipath_statusp &= ~IPATH_STATUS_IB_CONF; + } + + if (!noprint && *msg) { + if (iserr) + ipath_dev_err(dd, "%s error\n", msg); + } + if (dd->ipath_state_wanted & dd->ipath_flags) { + ipath_cdbg(VERBOSE, "driver wanted state %x, iflags now %x, " + "waking\n", dd->ipath_state_wanted, + dd->ipath_flags); + wake_up_interruptible(&ipath_state_wait); + } + + return chkerrpkts; +} + +/* + * try to cleanup as much as possible for anything that might have gone + * wrong while in freeze mode, such as pio buffers being written by user + * processes (causing armlaunch), send errors due to going into freeze mode, + * etc., and try to avoid causing extra interrupts while doing so. + * Forcibly update the in-memory pioavail register copies after cleanup + * because the chip won't do it while in freeze mode (the register values + * themselves are kept correct). + * Make sure that we don't lose any important interrupts by using the chip + * feature that says that writing 0 to a bit in *clear that is set in + * *status will cause an interrupt to be generated again (if allowed by + * the *mask value). + */ +void ipath_clear_freeze(struct ipath_devdata *dd) +{ + /* disable error interrupts, to avoid confusion */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask, 0ULL); + + /* also disable interrupts; errormask is sometimes overwriten */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_intmask, 0ULL); + + ipath_cancel_sends(dd, 1); + + /* clear the freeze, and be sure chip saw it */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_control, + dd->ipath_control); + ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); + + /* force in-memory update now we are out of freeze */ + ipath_force_pio_avail_update(dd); + + /* + * force new interrupt if any hwerr, error or interrupt bits are + * still set, and clear "safe" send packet errors related to freeze + * and cancelling sends. Re-enable error interrupts before possible + * force of re-interrupt on pending interrupts. + */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrclear, 0ULL); + ipath_write_kreg(dd, dd->ipath_kregs->kr_errorclear, + E_SPKT_ERRS_IGNORE); + ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask, + dd->ipath_errormask); + ipath_write_kreg(dd, dd->ipath_kregs->kr_intmask, -1LL); + ipath_write_kreg(dd, dd->ipath_kregs->kr_intclear, 0ULL); +} + + +/* this is separate to allow for better optimization of ipath_intr() */ + +static noinline void ipath_bad_intr(struct ipath_devdata *dd, u32 *unexpectp) +{ + /* + * sometimes happen during driver init and unload, don't want + * to process any interrupts at that point + */ + + /* this is just a bandaid, not a fix, if something goes badly + * wrong */ + if (++*unexpectp > 100) { + if (++*unexpectp > 105) { + /* + * ok, we must be taking somebody else's interrupts, + * due to a messed up mptable and/or PIRQ table, so + * unregister the interrupt. We've seen this during + * linuxbios development work, and it may happen in + * the future again. + */ + if (dd->pcidev && dd->ipath_irq) { + ipath_dev_err(dd, "Now %u unexpected " + "interrupts, unregistering " + "interrupt handler\n", + *unexpectp); + ipath_dbg("free_irq of irq %d\n", + dd->ipath_irq); + dd->ipath_f_free_irq(dd); + } + } + if (ipath_read_ireg(dd, dd->ipath_kregs->kr_intmask)) { + ipath_dev_err(dd, "%u unexpected interrupts, " + "disabling interrupts completely\n", + *unexpectp); + /* + * disable all interrupts, something is very wrong + */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_intmask, + 0ULL); + } + } else if (*unexpectp > 1) + ipath_dbg("Interrupt when not ready, should not happen, " + "ignoring\n"); +} + +static noinline void ipath_bad_regread(struct ipath_devdata *dd) +{ + static int allbits; + + /* separate routine, for better optimization of ipath_intr() */ + + /* + * We print the message and disable interrupts, in hope of + * having a better chance of debugging the problem. + */ + ipath_dev_err(dd, + "Read of interrupt status failed (all bits set)\n"); + if (allbits++) { + /* disable all interrupts, something is very wrong */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_intmask, 0ULL); + if (allbits == 2) { + ipath_dev_err(dd, "Still bad interrupt status, " + "unregistering interrupt\n"); + dd->ipath_f_free_irq(dd); + } else if (allbits > 2) { + if ((allbits % 10000) == 0) + printk("."); + } else + ipath_dev_err(dd, "Disabling interrupts, " + "multiple errors\n"); + } +} + +static void handle_layer_pioavail(struct ipath_devdata *dd) +{ + unsigned long flags; + int ret; + + ret = ipath_ib_piobufavail(dd->verbs_dev); + if (ret > 0) + goto set; + + return; +set: + spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags); + dd->ipath_sendctrl |= INFINIPATH_S_PIOINTBUFAVAIL; + ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, + dd->ipath_sendctrl); + ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); + spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags); +} + +/* + * Handle receive interrupts for user ports; this means a user + * process was waiting for a packet to arrive, and didn't want + * to poll + */ +static void handle_urcv(struct ipath_devdata *dd, u64 istat) +{ + u64 portr; + int i; + int rcvdint = 0; + + /* + * test_and_clear_bit(IPATH_PORT_WAITING_RCV) and + * test_and_clear_bit(IPATH_PORT_WAITING_URG) below + * would both like timely updates of the bits so that + * we don't pass them by unnecessarily. the rmb() + * here ensures that we see them promptly -- the + * corresponding wmb()'s are in ipath_poll_urgent() + * and ipath_poll_next()... + */ + rmb(); + portr = ((istat >> dd->ipath_i_rcvavail_shift) & + dd->ipath_i_rcvavail_mask) | + ((istat >> dd->ipath_i_rcvurg_shift) & + dd->ipath_i_rcvurg_mask); + for (i = 1; i < dd->ipath_cfgports; i++) { + struct ipath_portdata *pd = dd->ipath_pd[i]; + + if (portr & (1 << i) && pd && pd->port_cnt) { + if (test_and_clear_bit(IPATH_PORT_WAITING_RCV, + &pd->port_flag)) { + clear_bit(i + dd->ipath_r_intravail_shift, + &dd->ipath_rcvctrl); + wake_up_interruptible(&pd->port_wait); + rcvdint = 1; + } else if (test_and_clear_bit(IPATH_PORT_WAITING_URG, + &pd->port_flag)) { + pd->port_urgent++; + wake_up_interruptible(&pd->port_wait); + } + } + } + if (rcvdint) { + /* only want to take one interrupt, so turn off the rcv + * interrupt for all the ports that we set the rcv_waiting + * (but never for kernel port) + */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl, + dd->ipath_rcvctrl); + } +} + +irqreturn_t ipath_intr(int irq, void *data) +{ + struct ipath_devdata *dd = data; + u64 istat, chk0rcv = 0; + ipath_err_t estat = 0; + irqreturn_t ret; + static unsigned unexpected = 0; + u64 kportrbits; + + ipath_stats.sps_ints++; + + if (dd->ipath_int_counter != (u32) -1) + dd->ipath_int_counter++; + + if (!(dd->ipath_flags & IPATH_PRESENT)) { + /* + * This return value is not great, but we do not want the + * interrupt core code to remove our interrupt handler + * because we don't appear to be handling an interrupt + * during a chip reset. + */ + return IRQ_HANDLED; + } + + /* + * this needs to be flags&initted, not statusp, so we keep + * taking interrupts even after link goes down, etc. + * Also, we *must* clear the interrupt at some point, or we won't + * take it again, which can be real bad for errors, etc... + */ + + if (!(dd->ipath_flags & IPATH_INITTED)) { + ipath_bad_intr(dd, &unexpected); + ret = IRQ_NONE; + goto bail; + } + + istat = ipath_read_ireg(dd, dd->ipath_kregs->kr_intstatus); + + if (unlikely(!istat)) { + ipath_stats.sps_nullintr++; + ret = IRQ_NONE; /* not our interrupt, or already handled */ + goto bail; + } + if (unlikely(istat == -1)) { + ipath_bad_regread(dd); + /* don't know if it was our interrupt or not */ + ret = IRQ_NONE; + goto bail; + } + + if (unexpected) + unexpected = 0; + + if (unlikely(istat & ~dd->ipath_i_bitsextant)) + ipath_dev_err(dd, + "interrupt with unknown interrupts %Lx set\n", + (unsigned long long) + istat & ~dd->ipath_i_bitsextant); + else if (istat & ~INFINIPATH_I_ERROR) /* errors do own printing */ + ipath_cdbg(VERBOSE, "intr stat=0x%Lx\n", + (unsigned long long) istat); + + if (istat & INFINIPATH_I_ERROR) { + ipath_stats.sps_errints++; + estat = ipath_read_kreg64(dd, + dd->ipath_kregs->kr_errorstatus); + if (!estat) + dev_info(&dd->pcidev->dev, "error interrupt (%Lx), " + "but no error bits set!\n", + (unsigned long long) istat); + else if (estat == -1LL) + /* + * should we try clearing all, or hope next read + * works? + */ + ipath_dev_err(dd, "Read of error status failed " + "(all bits set); ignoring\n"); + else + chk0rcv |= handle_errors(dd, estat); + } + + if (istat & INFINIPATH_I_GPIO) { + /* + * GPIO interrupts fall in two broad classes: + * GPIO_2 indicates (on some HT4xx boards) that a packet + * has arrived for Port 0. Checking for this + * is controlled by flag IPATH_GPIO_INTR. + * GPIO_3..5 on IBA6120 Rev2 and IBA6110 Rev4 chips indicate + * errors that we need to count. Checking for this + * is controlled by flag IPATH_GPIO_ERRINTRS. + */ + u32 gpiostatus; + u32 to_clear = 0; + + gpiostatus = ipath_read_kreg32( + dd, dd->ipath_kregs->kr_gpio_status); + /* First the error-counter case. */ + if ((gpiostatus & IPATH_GPIO_ERRINTR_MASK) && + (dd->ipath_flags & IPATH_GPIO_ERRINTRS)) { + /* want to clear the bits we see asserted. */ + to_clear |= (gpiostatus & IPATH_GPIO_ERRINTR_MASK); + + /* + * Count appropriately, clear bits out of our copy, + * as they have been "handled". + */ + if (gpiostatus & (1 << IPATH_GPIO_RXUVL_BIT)) { + ipath_dbg("FlowCtl on UnsupVL\n"); + dd->ipath_rxfc_unsupvl_errs++; + } + if (gpiostatus & (1 << IPATH_GPIO_OVRUN_BIT)) { + ipath_dbg("Overrun Threshold exceeded\n"); + dd->ipath_overrun_thresh_errs++; + } + if (gpiostatus & (1 << IPATH_GPIO_LLI_BIT)) { + ipath_dbg("Local Link Integrity error\n"); + dd->ipath_lli_errs++; + } + gpiostatus &= ~IPATH_GPIO_ERRINTR_MASK; + } + /* Now the Port0 Receive case */ + if ((gpiostatus & (1 << IPATH_GPIO_PORT0_BIT)) && + (dd->ipath_flags & IPATH_GPIO_INTR)) { + /* + * GPIO status bit 2 is set, and we expected it. + * clear it and indicate in p0bits. + * This probably only happens if a Port0 pkt + * arrives at _just_ the wrong time, and we + * handle that by seting chk0rcv; + */ + to_clear |= (1 << IPATH_GPIO_PORT0_BIT); + gpiostatus &= ~(1 << IPATH_GPIO_PORT0_BIT); + chk0rcv = 1; + } + if (gpiostatus) { + /* + * Some unexpected bits remain. If they could have + * caused the interrupt, complain and clear. + * To avoid repetition of this condition, also clear + * the mask. It is almost certainly due to error. + */ + const u32 mask = (u32) dd->ipath_gpio_mask; + + if (mask & gpiostatus) { + ipath_dbg("Unexpected GPIO IRQ bits %x\n", + gpiostatus & mask); + to_clear |= (gpiostatus & mask); + dd->ipath_gpio_mask &= ~(gpiostatus & mask); + ipath_write_kreg(dd, + dd->ipath_kregs->kr_gpio_mask, + dd->ipath_gpio_mask); + } + } + if (to_clear) { + ipath_write_kreg(dd, dd->ipath_kregs->kr_gpio_clear, + (u64) to_clear); + } + } + + /* + * Clear the interrupt bits we found set, unless they are receive + * related, in which case we already cleared them above, and don't + * want to clear them again, because we might lose an interrupt. + * Clear it early, so we "know" know the chip will have seen this by + * the time we process the queue, and will re-interrupt if necessary. + * The processor itself won't take the interrupt again until we return. + */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_intclear, istat); + + /* + * Handle kernel receive queues before checking for pio buffers + * available since receives can overflow; piobuf waiters can afford + * a few extra cycles, since they were waiting anyway, and user's + * waiting for receive are at the bottom. + */ + kportrbits = (1ULL << dd->ipath_i_rcvavail_shift) | + (1ULL << dd->ipath_i_rcvurg_shift); + if (chk0rcv || (istat & kportrbits)) { + istat &= ~kportrbits; + ipath_kreceive(dd->ipath_pd[0]); + } + + if (istat & ((dd->ipath_i_rcvavail_mask << dd->ipath_i_rcvavail_shift) | + (dd->ipath_i_rcvurg_mask << dd->ipath_i_rcvurg_shift))) + handle_urcv(dd, istat); + + if (istat & (INFINIPATH_I_SDMAINT | INFINIPATH_I_SDMADISABLED)) + handle_sdma_intr(dd, istat); + + if (istat & INFINIPATH_I_SPIOBUFAVAIL) { + unsigned long flags; + + spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags); + dd->ipath_sendctrl &= ~INFINIPATH_S_PIOINTBUFAVAIL; + ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, + dd->ipath_sendctrl); + ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); + spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags); + + /* always process; sdma verbs uses PIO for acks and VL15 */ + handle_layer_pioavail(dd); + } + + ret = IRQ_HANDLED; + +bail: + return ret; +} diff --git a/drivers/infiniband/hw/ipath/ipath_kernel.h b/drivers/infiniband/hw/ipath/ipath_kernel.h new file mode 100644 index 0000000..6559af6 --- /dev/null +++ b/drivers/infiniband/hw/ipath/ipath_kernel.h @@ -0,0 +1,1378 @@ +#ifndef _IPATH_KERNEL_H +#define _IPATH_KERNEL_H +/* + * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved. + * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/* + * This header file is the base header file for infinipath kernel code + * ipath_user.h serves a similar purpose for user code. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ipath_common.h" +#include "ipath_debug.h" +#include "ipath_registers.h" + +/* only s/w major version of InfiniPath we can handle */ +#define IPATH_CHIP_VERS_MAJ 2U + +/* don't care about this except printing */ +#define IPATH_CHIP_VERS_MIN 0U + +/* temporary, maybe always */ +extern struct infinipath_stats ipath_stats; + +#define IPATH_CHIP_SWVERSION IPATH_CHIP_VERS_MAJ +/* + * First-cut critierion for "device is active" is + * two thousand dwords combined Tx, Rx traffic per + * 5-second interval. SMA packets are 64 dwords, + * and occur "a few per second", presumably each way. + */ +#define IPATH_TRAFFIC_ACTIVE_THRESHOLD (2000) +/* + * Struct used to indicate which errors are logged in each of the + * error-counters that are logged to EEPROM. A counter is incremented + * _once_ (saturating at 255) for each event with any bits set in + * the error or hwerror register masks below. + */ +#define IPATH_EEP_LOG_CNT (4) +struct ipath_eep_log_mask { + u64 errs_to_log; + u64 hwerrs_to_log; +}; + +struct ipath_portdata { + void **port_rcvegrbuf; + dma_addr_t *port_rcvegrbuf_phys; + /* rcvhdrq base, needs mmap before useful */ + void *port_rcvhdrq; + /* kernel virtual address where hdrqtail is updated */ + void *port_rcvhdrtail_kvaddr; + /* + * temp buffer for expected send setup, allocated at open, instead + * of each setup call + */ + void *port_tid_pg_list; + /* when waiting for rcv or pioavail */ + wait_queue_head_t port_wait; + /* + * rcvegr bufs base, physical, must fit + * in 44 bits so 32 bit programs mmap64 44 bit works) + */ + dma_addr_t port_rcvegr_phys; + /* mmap of hdrq, must fit in 44 bits */ + dma_addr_t port_rcvhdrq_phys; + dma_addr_t port_rcvhdrqtailaddr_phys; + /* + * number of opens (including slave subports) on this instance + * (ignoring forks, dup, etc. for now) + */ + int port_cnt; + /* + * how much space to leave at start of eager TID entries for + * protocol use, on each TID + */ + /* instead of calculating it */ + unsigned port_port; + /* non-zero if port is being shared. */ + u16 port_subport_cnt; + /* non-zero if port is being shared. */ + u16 port_subport_id; + /* number of pio bufs for this port (all procs, if shared) */ + u32 port_piocnt; + /* first pio buffer for this port */ + u32 port_pio_base; + /* chip offset of PIO buffers for this port */ + u32 port_piobufs; + /* how many alloc_pages() chunks in port_rcvegrbuf_pages */ + u32 port_rcvegrbuf_chunks; + /* how many egrbufs per chunk */ + u32 port_rcvegrbufs_perchunk; + /* order for port_rcvegrbuf_pages */ + size_t port_rcvegrbuf_size; + /* rcvhdrq size (for freeing) */ + size_t port_rcvhdrq_size; + /* next expected TID to check when looking for free */ + u32 port_tidcursor; + /* next expected TID to check */ + unsigned long port_flag; + /* what happened */ + unsigned long int_flag; + /* WAIT_RCV that timed out, no interrupt */ + u32 port_rcvwait_to; + /* WAIT_PIO that timed out, no interrupt */ + u32 port_piowait_to; + /* WAIT_RCV already happened, no wait */ + u32 port_rcvnowait; + /* WAIT_PIO already happened, no wait */ + u32 port_pionowait; + /* total number of rcvhdrqfull errors */ + u32 port_hdrqfull; + /* + * Used to suppress multiple instances of same + * port staying stuck at same point. + */ + u32 port_lastrcvhdrqtail; + /* saved total number of rcvhdrqfull errors for poll edge trigger */ + u32 port_hdrqfull_poll; + /* total number of polled urgent packets */ + u32 port_urgent; + /* saved total number of polled urgent packets for poll edge trigger */ + u32 port_urgent_poll; + /* pid of process using this port */ + struct pid *port_pid; + struct pid *port_subpid[INFINIPATH_MAX_SUBPORT]; + /* same size as task_struct .comm[] */ + char port_comm[16]; + /* pkeys set by this use of this port */ + u16 port_pkeys[4]; + /* so file ops can get at unit */ + struct ipath_devdata *port_dd; + /* A page of memory for rcvhdrhead, rcvegrhead, rcvegrtail * N */ + void *subport_uregbase; + /* An array of pages for the eager receive buffers * N */ + void *subport_rcvegrbuf; + /* An array of pages for the eager header queue entries * N */ + void *subport_rcvhdr_base; + /* The version of the library which opened this port */ + u32 userversion; + /* Bitmask of active slaves */ + u32 active_slaves; + /* Type of packets or conditions we want to poll for */ + u16 poll_type; + /* port rcvhdrq head offset */ + u32 port_head; + /* receive packet sequence counter */ + u32 port_seq_cnt; +}; + +struct sk_buff; +struct ipath_sge_state; +struct ipath_verbs_txreq; + +/* + * control information for layered drivers + */ +struct _ipath_layer { + void *l_arg; +}; + +struct ipath_skbinfo { + struct sk_buff *skb; + dma_addr_t phys; +}; + +struct ipath_sdma_txreq { + int flags; + int sg_count; + union { + struct scatterlist *sg; + void *map_addr; + }; + void (*callback)(void *, int); + void *callback_cookie; + int callback_status; + u16 start_idx; /* sdma private */ + u16 next_descq_idx; /* sdma private */ + struct list_head list; /* sdma private */ +}; + +struct ipath_sdma_desc { + __le64 qw[2]; +}; + +#define IPATH_SDMA_TXREQ_F_USELARGEBUF 0x1 +#define IPATH_SDMA_TXREQ_F_HEADTOHOST 0x2 +#define IPATH_SDMA_TXREQ_F_INTREQ 0x4 +#define IPATH_SDMA_TXREQ_F_FREEBUF 0x8 +#define IPATH_SDMA_TXREQ_F_FREEDESC 0x10 +#define IPATH_SDMA_TXREQ_F_VL15 0x20 + +#define IPATH_SDMA_TXREQ_S_OK 0 +#define IPATH_SDMA_TXREQ_S_SENDERROR 1 +#define IPATH_SDMA_TXREQ_S_ABORTED 2 +#define IPATH_SDMA_TXREQ_S_SHUTDOWN 3 + +#define IPATH_SDMA_STATUS_SCORE_BOARD_DRAIN_IN_PROG (1ull << 63) +#define IPATH_SDMA_STATUS_ABORT_IN_PROG (1ull << 62) +#define IPATH_SDMA_STATUS_INTERNAL_SDMA_ENABLE (1ull << 61) +#define IPATH_SDMA_STATUS_SCB_EMPTY (1ull << 30) + +/* max dwords in small buffer packet */ +#define IPATH_SMALLBUF_DWORDS (dd->ipath_piosize2k >> 2) + +/* + * Possible IB config parameters for ipath_f_get/set_ib_cfg() + */ +#define IPATH_IB_CFG_LIDLMC 0 /* Get/set LID (LS16b) and Mask (MS16b) */ +#define IPATH_IB_CFG_HRTBT 1 /* Get/set Heartbeat off/enable/auto */ +#define IPATH_IB_HRTBT_ON 3 /* Heartbeat enabled, sent every 100msec */ +#define IPATH_IB_HRTBT_OFF 0 /* Heartbeat off */ +#define IPATH_IB_CFG_LWID_ENB 2 /* Get/set allowed Link-width */ +#define IPATH_IB_CFG_LWID 3 /* Get currently active Link-width */ +#define IPATH_IB_CFG_SPD_ENB 4 /* Get/set allowed Link speeds */ +#define IPATH_IB_CFG_SPD 5 /* Get current Link spd */ +#define IPATH_IB_CFG_RXPOL_ENB 6 /* Get/set Auto-RX-polarity enable */ +#define IPATH_IB_CFG_LREV_ENB 7 /* Get/set Auto-Lane-reversal enable */ +#define IPATH_IB_CFG_LINKLATENCY 8 /* Get Auto-Lane-reversal enable */ + + +struct ipath_devdata { + struct list_head ipath_list; + + struct ipath_kregs const *ipath_kregs; + struct ipath_cregs const *ipath_cregs; + + /* mem-mapped pointer to base of chip regs */ + u64 __iomem *ipath_kregbase; + /* end of mem-mapped chip space; range checking */ + u64 __iomem *ipath_kregend; + /* physical address of chip for io_remap, etc. */ + unsigned long ipath_physaddr; + /* base of memory alloced for ipath_kregbase, for free */ + u64 *ipath_kregalloc; + /* ipath_cfgports pointers */ + struct ipath_portdata **ipath_pd; + /* sk_buffs used by port 0 eager receive queue */ + struct ipath_skbinfo *ipath_port0_skbinfo; + /* kvirt address of 1st 2k pio buffer */ + void __iomem *ipath_pio2kbase; + /* kvirt address of 1st 4k pio buffer */ + void __iomem *ipath_pio4kbase; + /* + * points to area where PIOavail registers will be DMA'ed. + * Has to be on a page of it's own, because the page will be + * mapped into user program space. This copy is *ONLY* ever + * written by DMA, not by the driver! Need a copy per device + * when we get to multiple devices + */ + volatile __le64 *ipath_pioavailregs_dma; + /* physical address where updates occur */ + dma_addr_t ipath_pioavailregs_phys; + struct _ipath_layer ipath_layer; + /* setup intr */ + int (*ipath_f_intrsetup)(struct ipath_devdata *); + /* fallback to alternate interrupt type if possible */ + int (*ipath_f_intr_fallback)(struct ipath_devdata *); + /* setup on-chip bus config */ + int (*ipath_f_bus)(struct ipath_devdata *, struct pci_dev *); + /* hard reset chip */ + int (*ipath_f_reset)(struct ipath_devdata *); + int (*ipath_f_get_boardname)(struct ipath_devdata *, char *, + size_t); + void (*ipath_f_init_hwerrors)(struct ipath_devdata *); + void (*ipath_f_handle_hwerrors)(struct ipath_devdata *, char *, + size_t); + void (*ipath_f_quiet_serdes)(struct ipath_devdata *); + int (*ipath_f_bringup_serdes)(struct ipath_devdata *); + int (*ipath_f_early_init)(struct ipath_devdata *); + void (*ipath_f_clear_tids)(struct ipath_devdata *, unsigned); + void (*ipath_f_put_tid)(struct ipath_devdata *, u64 __iomem*, + u32, unsigned long); + void (*ipath_f_tidtemplate)(struct ipath_devdata *); + void (*ipath_f_cleanup)(struct ipath_devdata *); + void (*ipath_f_setextled)(struct ipath_devdata *, u64, u64); + /* fill out chip-specific fields */ + int (*ipath_f_get_base_info)(struct ipath_portdata *, void *); + /* free irq */ + void (*ipath_f_free_irq)(struct ipath_devdata *); + struct ipath_message_header *(*ipath_f_get_msgheader) + (struct ipath_devdata *, __le32 *); + void (*ipath_f_config_ports)(struct ipath_devdata *, ushort); + int (*ipath_f_get_ib_cfg)(struct ipath_devdata *, int); + int (*ipath_f_set_ib_cfg)(struct ipath_devdata *, int, u32); + void (*ipath_f_config_jint)(struct ipath_devdata *, u16 , u16); + void (*ipath_f_read_counters)(struct ipath_devdata *, + struct infinipath_counters *); + void (*ipath_f_xgxs_reset)(struct ipath_devdata *); + /* per chip actions needed for IB Link up/down changes */ + int (*ipath_f_ib_updown)(struct ipath_devdata *, int, u64); + + unsigned ipath_lastegr_idx; + struct ipath_ibdev *verbs_dev; + struct timer_list verbs_timer; + /* total dwords sent (summed from counter) */ + u64 ipath_sword; + /* total dwords rcvd (summed from counter) */ + u64 ipath_rword; + /* total packets sent (summed from counter) */ + u64 ipath_spkts; + /* total packets rcvd (summed from counter) */ + u64 ipath_rpkts; + /* ipath_statusp initially points to this. */ + u64 _ipath_status; + /* GUID for this interface, in network order */ + __be64 ipath_guid; + /* + * aggregrate of error bits reported since last cleared, for + * limiting of error reporting + */ + ipath_err_t ipath_lasterror; + /* + * aggregrate of error bits reported since last cleared, for + * limiting of hwerror reporting + */ + ipath_err_t ipath_lasthwerror; + /* errors masked because they occur too fast */ + ipath_err_t ipath_maskederrs; + u64 ipath_lastlinkrecov; /* link recoveries at last ACTIVE */ + /* these 5 fields are used to establish deltas for IB Symbol + * errors and linkrecovery errors. They can be reported on + * some chips during link negotiation prior to INIT, and with + * DDR when faking DDR negotiations with non-IBTA switches. + * The chip counters are adjusted at driver unload if there is + * a non-zero delta. + */ + u64 ibdeltainprog; + u64 ibsymdelta; + u64 ibsymsnap; + u64 iblnkerrdelta; + u64 iblnkerrsnap; + + /* time in jiffies at which to re-enable maskederrs */ + unsigned long ipath_unmasktime; + /* count of egrfull errors, combined for all ports */ + u64 ipath_last_tidfull; + /* for ipath_qcheck() */ + u64 ipath_lastport0rcv_cnt; + /* template for writing TIDs */ + u64 ipath_tidtemplate; + /* value to write to free TIDs */ + u64 ipath_tidinvalid; + /* IBA6120 rcv interrupt setup */ + u64 ipath_rhdrhead_intr_off; + + /* size of memory at ipath_kregbase */ + u32 ipath_kregsize; + /* number of registers used for pioavail */ + u32 ipath_pioavregs; + /* IPATH_POLL, etc. */ + u32 ipath_flags; + /* ipath_flags driver is waiting for */ + u32 ipath_state_wanted; + /* last buffer for user use, first buf for kernel use is this + * index. */ + u32 ipath_lastport_piobuf; + /* is a stats timer active */ + u32 ipath_stats_timer_active; + /* number of interrupts for this device -- saturates... */ + u32 ipath_int_counter; + /* dwords sent read from counter */ + u32 ipath_lastsword; + /* dwords received read from counter */ + u32 ipath_lastrword; + /* sent packets read from counter */ + u32 ipath_lastspkts; + /* received packets read from counter */ + u32 ipath_lastrpkts; + /* pio bufs allocated per port */ + u32 ipath_pbufsport; + /* if remainder on bufs/port, ports < extrabuf get 1 extra */ + u32 ipath_ports_extrabuf; + u32 ipath_pioupd_thresh; /* update threshold, some chips */ + /* + * number of ports configured as max; zero is set to number chip + * supports, less gives more pio bufs/port, etc. + */ + u32 ipath_cfgports; + /* count of port 0 hdrqfull errors */ + u32 ipath_p0_hdrqfull; + /* port 0 number of receive eager buffers */ + u32 ipath_p0_rcvegrcnt; + + /* + * index of last piobuffer we used. Speeds up searching, by + * starting at this point. Doesn't matter if multiple cpu's use and + * update, last updater is only write that matters. Whenever it + * wraps, we update shadow copies. Need a copy per device when we + * get to multiple devices + */ + u32 ipath_lastpioindex; + u32 ipath_lastpioindexl; + /* max length of freezemsg */ + u32 ipath_freezelen; + /* + * consecutive times we wanted a PIO buffer but were unable to + * get one + */ + u32 ipath_consec_nopiobuf; + /* + * hint that we should update ipath_pioavailshadow before + * looking for a PIO buffer + */ + u32 ipath_upd_pio_shadow; + /* so we can rewrite it after a chip reset */ + u32 ipath_pcibar0; + /* so we can rewrite it after a chip reset */ + u32 ipath_pcibar1; + u32 ipath_x1_fix_tries; + u32 ipath_autoneg_tries; + u32 serdes_first_init_done; + + struct ipath_relock { + atomic_t ipath_relock_timer_active; + struct timer_list ipath_relock_timer; + unsigned int ipath_relock_interval; /* in jiffies */ + } ipath_relock_singleton; + + /* interrupt number */ + int ipath_irq; + /* HT/PCI Vendor ID (here for NodeInfo) */ + u16 ipath_vendorid; + /* HT/PCI Device ID (here for NodeInfo) */ + u16 ipath_deviceid; + /* offset in HT config space of slave/primary interface block */ + u8 ipath_ht_slave_off; + /* for write combining settings */ + unsigned long ipath_wc_cookie; + unsigned long ipath_wc_base; + unsigned long ipath_wc_len; + /* ref count for each pkey */ + atomic_t ipath_pkeyrefs[4]; + /* shadow copy of struct page *'s for exp tid pages */ + struct page **ipath_pageshadow; + /* shadow copy of dma handles for exp tid pages */ + dma_addr_t *ipath_physshadow; + u64 __iomem *ipath_egrtidbase; + /* lock to workaround chip bug 9437 and others */ + spinlock_t ipath_kernel_tid_lock; + spinlock_t ipath_user_tid_lock; + spinlock_t ipath_sendctrl_lock; + /* around ipath_pd and (user ports) port_cnt use (intr vs free) */ + spinlock_t ipath_uctxt_lock; + + /* + * IPATH_STATUS_*, + * this address is mapped readonly into user processes so they can + * get status cheaply, whenever they want. + */ + u64 *ipath_statusp; + /* freeze msg if hw error put chip in freeze */ + char *ipath_freezemsg; + /* pci access data structure */ + struct pci_dev *pcidev; + struct cdev *user_cdev; + struct cdev *diag_cdev; + struct device *user_dev; + struct device *diag_dev; + /* timer used to prevent stats overflow, error throttling, etc. */ + struct timer_list ipath_stats_timer; + /* timer to verify interrupts work, and fallback if possible */ + struct timer_list ipath_intrchk_timer; + void *ipath_dummy_hdrq; /* used after port close */ + dma_addr_t ipath_dummy_hdrq_phys; + + /* SendDMA related entries */ + spinlock_t ipath_sdma_lock; + unsigned long ipath_sdma_status; + unsigned long ipath_sdma_abort_jiffies; + unsigned long ipath_sdma_abort_intr_timeout; + unsigned long ipath_sdma_buf_jiffies; + struct ipath_sdma_desc *ipath_sdma_descq; + u64 ipath_sdma_descq_added; + u64 ipath_sdma_descq_removed; + int ipath_sdma_desc_nreserved; + u16 ipath_sdma_descq_cnt; + u16 ipath_sdma_descq_tail; + u16 ipath_sdma_descq_head; + u16 ipath_sdma_next_intr; + u16 ipath_sdma_reset_wait; + u8 ipath_sdma_generation; + struct tasklet_struct ipath_sdma_abort_task; + struct tasklet_struct ipath_sdma_notify_task; + struct list_head ipath_sdma_activelist; + struct list_head ipath_sdma_notifylist; + atomic_t ipath_sdma_vl15_count; + struct timer_list ipath_sdma_vl15_timer; + + dma_addr_t ipath_sdma_descq_phys; + volatile __le64 *ipath_sdma_head_dma; + dma_addr_t ipath_sdma_head_phys; + + unsigned long ipath_ureg_align; /* user register alignment */ + + struct delayed_work ipath_autoneg_work; + wait_queue_head_t ipath_autoneg_wait; + + /* HoL blocking / user app forward-progress state */ + unsigned ipath_hol_state; + unsigned ipath_hol_next; + struct timer_list ipath_hol_timer; + + /* + * Shadow copies of registers; size indicates read access size. + * Most of them are readonly, but some are write-only register, + * where we manipulate the bits in the shadow copy, and then write + * the shadow copy to infinipath. + * + * We deliberately make most of these 32 bits, since they have + * restricted range. For any that we read, we won't to generate 32 + * bit accesses, since Opteron will generate 2 separate 32 bit HT + * transactions for a 64 bit read, and we want to avoid unnecessary + * HT transactions. + */ + + /* This is the 64 bit group */ + + /* + * shadow of pioavail, check to be sure it's large enough at + * init time. + */ + unsigned long ipath_pioavailshadow[8]; + /* bitmap of send buffers available for the kernel to use with PIO. */ + unsigned long ipath_pioavailkernel[8]; + /* shadow of kr_gpio_out, for rmw ops */ + u64 ipath_gpio_out; + /* shadow the gpio mask register */ + u64 ipath_gpio_mask; + /* shadow the gpio output enable, etc... */ + u64 ipath_extctrl; + /* kr_revision shadow */ + u64 ipath_revision; + /* + * shadow of ibcctrl, for interrupt handling of link changes, + * etc. + */ + u64 ipath_ibcctrl; + /* + * last ibcstatus, to suppress "duplicate" status change messages, + * mostly from 2 to 3 + */ + u64 ipath_lastibcstat; + /* hwerrmask shadow */ + ipath_err_t ipath_hwerrmask; + ipath_err_t ipath_errormask; /* errormask shadow */ + /* interrupt config reg shadow */ + u64 ipath_intconfig; + /* kr_sendpiobufbase value */ + u64 ipath_piobufbase; + /* kr_ibcddrctrl shadow */ + u64 ipath_ibcddrctrl; + + /* these are the "32 bit" regs */ + + /* + * number of GUIDs in the flash for this interface; may need some + * rethinking for setting on other ifaces + */ + u32 ipath_nguid; + /* + * the following two are 32-bit bitmasks, but {test,clear,set}_bit + * all expect bit fields to be "unsigned long" + */ + /* shadow kr_rcvctrl */ + unsigned long ipath_rcvctrl; + /* shadow kr_sendctrl */ + unsigned long ipath_sendctrl; + /* to not count armlaunch after cancel */ + unsigned long ipath_lastcancel; + /* count cases where special trigger was needed (double write) */ + unsigned long ipath_spectriggerhit; + + /* value we put in kr_rcvhdrcnt */ + u32 ipath_rcvhdrcnt; + /* value we put in kr_rcvhdrsize */ + u32 ipath_rcvhdrsize; + /* value we put in kr_rcvhdrentsize */ + u32 ipath_rcvhdrentsize; + /* offset of last entry in rcvhdrq */ + u32 ipath_hdrqlast; + /* kr_portcnt value */ + u32 ipath_portcnt; + /* kr_pagealign value */ + u32 ipath_palign; + /* number of "2KB" PIO buffers */ + u32 ipath_piobcnt2k; + /* size in bytes of "2KB" PIO buffers */ + u32 ipath_piosize2k; + /* number of "4KB" PIO buffers */ + u32 ipath_piobcnt4k; + /* size in bytes of "4KB" PIO buffers */ + u32 ipath_piosize4k; + u32 ipath_pioreserved; /* reserved special-inkernel; */ + /* kr_rcvegrbase value */ + u32 ipath_rcvegrbase; + /* kr_rcvegrcnt value */ + u32 ipath_rcvegrcnt; + /* kr_rcvtidbase value */ + u32 ipath_rcvtidbase; + /* kr_rcvtidcnt value */ + u32 ipath_rcvtidcnt; + /* kr_sendregbase */ + u32 ipath_sregbase; + /* kr_userregbase */ + u32 ipath_uregbase; + /* kr_counterregbase */ + u32 ipath_cregbase; + /* shadow the control register contents */ + u32 ipath_control; + /* PCI revision register (HTC rev on FPGA) */ + u32 ipath_pcirev; + + /* chip address space used by 4k pio buffers */ + u32 ipath_4kalign; + /* The MTU programmed for this unit */ + u32 ipath_ibmtu; + /* + * The max size IB packet, included IB headers that we can send. + * Starts same as ipath_piosize, but is affected when ibmtu is + * changed, or by size of eager buffers + */ + u32 ipath_ibmaxlen; + /* + * ibmaxlen at init time, limited by chip and by receive buffer + * size. Not changed after init. + */ + u32 ipath_init_ibmaxlen; + /* size of each rcvegrbuffer */ + u32 ipath_rcvegrbufsize; + /* localbus width (1, 2,4,8,16,32) from config space */ + u32 ipath_lbus_width; + /* localbus speed (HT: 200,400,800,1000; PCIe 2500) */ + u32 ipath_lbus_speed; + /* + * number of sequential ibcstatus change for polling active/quiet + * (i.e., link not coming up). + */ + u32 ipath_ibpollcnt; + /* low and high portions of MSI capability/vector */ + u32 ipath_msi_lo; + /* saved after PCIe init for restore after reset */ + u32 ipath_msi_hi; + /* MSI data (vector) saved for restore */ + u16 ipath_msi_data; + /* MLID programmed for this instance */ + u16 ipath_mlid; + /* LID programmed for this instance */ + u16 ipath_lid; + /* list of pkeys programmed; 0 if not set */ + u16 ipath_pkeys[4]; + /* + * ASCII serial number, from flash, large enough for original + * all digit strings, and longer QLogic serial number format + */ + u8 ipath_serial[16]; + /* human readable board version */ + u8 ipath_boardversion[96]; + u8 ipath_lbus_info[32]; /* human readable localbus info */ + /* chip major rev, from ipath_revision */ + u8 ipath_majrev; + /* chip minor rev, from ipath_revision */ + u8 ipath_minrev; + /* board rev, from ipath_revision */ + u8 ipath_boardrev; + /* saved for restore after reset */ + u8 ipath_pci_cacheline; + /* LID mask control */ + u8 ipath_lmc; + /* link width supported */ + u8 ipath_link_width_supported; + /* link speed supported */ + u8 ipath_link_speed_supported; + u8 ipath_link_width_enabled; + u8 ipath_link_speed_enabled; + u8 ipath_link_width_active; + u8 ipath_link_speed_active; + /* Rx Polarity inversion (compensate for ~tx on partner) */ + u8 ipath_rx_pol_inv; + + u8 ipath_r_portenable_shift; + u8 ipath_r_intravail_shift; + u8 ipath_r_tailupd_shift; + u8 ipath_r_portcfg_shift; + + /* unit # of this chip, if present */ + int ipath_unit; + + /* local link integrity counter */ + u32 ipath_lli_counter; + /* local link integrity errors */ + u32 ipath_lli_errors; + /* + * Above counts only cases where _successive_ LocalLinkIntegrity + * errors were seen in the receive headers of kern-packets. + * Below are the three (monotonically increasing) counters + * maintained via GPIO interrupts on iba6120-rev2. + */ + u32 ipath_rxfc_unsupvl_errs; + u32 ipath_overrun_thresh_errs; + u32 ipath_lli_errs; + + /* + * Not all devices managed by a driver instance are the same + * type, so these fields must be per-device. + */ + u64 ipath_i_bitsextant; + ipath_err_t ipath_e_bitsextant; + ipath_err_t ipath_hwe_bitsextant; + + /* + * Below should be computable from number of ports, + * since they are never modified. + */ + u64 ipath_i_rcvavail_mask; + u64 ipath_i_rcvurg_mask; + u16 ipath_i_rcvurg_shift; + u16 ipath_i_rcvavail_shift; + + /* + * Register bits for selecting i2c direction and values, used for + * I2C serial flash. + */ + u8 ipath_gpio_sda_num; + u8 ipath_gpio_scl_num; + u8 ipath_i2c_chain_type; + u64 ipath_gpio_sda; + u64 ipath_gpio_scl; + + /* lock for doing RMW of shadows/regs for ExtCtrl and GPIO */ + spinlock_t ipath_gpio_lock; + + /* + * IB link and linktraining states and masks that vary per chip in + * some way. Set at init, to avoid each IB status change interrupt + */ + u8 ibcs_ls_shift; + u8 ibcs_lts_mask; + u32 ibcs_mask; + u32 ib_init; + u32 ib_arm; + u32 ib_active; + + u16 ipath_rhf_offset; /* offset of RHF within receive header entry */ + + /* + * shift/mask for linkcmd, linkinitcmd, maxpktlen in ibccontol + * reg. Changes for IBA7220 + */ + u8 ibcc_lic_mask; /* LinkInitCmd */ + u8 ibcc_lc_shift; /* LinkCmd */ + u8 ibcc_mpl_shift; /* Maxpktlen */ + + u8 delay_mult; + + /* used to override LED behavior */ + u8 ipath_led_override; /* Substituted for normal value, if non-zero */ + u16 ipath_led_override_timeoff; /* delta to next timer event */ + u8 ipath_led_override_vals[2]; /* Alternates per blink-frame */ + u8 ipath_led_override_phase; /* Just counts, LSB picks from vals[] */ + atomic_t ipath_led_override_timer_active; + /* Used to flash LEDs in override mode */ + struct timer_list ipath_led_override_timer; + + /* Support (including locks) for EEPROM logging of errors and time */ + /* control access to actual counters, timer */ + spinlock_t ipath_eep_st_lock; + /* control high-level access to EEPROM */ + struct mutex ipath_eep_lock; + /* Below inc'd by ipath_snap_cntrs(), locked by ipath_eep_st_lock */ + uint64_t ipath_traffic_wds; + /* active time is kept in seconds, but logged in hours */ + atomic_t ipath_active_time; + /* Below are nominal shadow of EEPROM, new since last EEPROM update */ + uint8_t ipath_eep_st_errs[IPATH_EEP_LOG_CNT]; + uint8_t ipath_eep_st_new_errs[IPATH_EEP_LOG_CNT]; + uint16_t ipath_eep_hrs; + /* + * masks for which bits of errs, hwerrs that cause + * each of the counters to increment. + */ + struct ipath_eep_log_mask ipath_eep_st_masks[IPATH_EEP_LOG_CNT]; + + /* interrupt mitigation reload register info */ + u16 ipath_jint_idle_ticks; /* idle clock ticks */ + u16 ipath_jint_max_packets; /* max packets across all ports */ + + /* + * lock for access to SerDes, and flags to sequence preset + * versus steady-state. 7220-only at the moment. + */ + spinlock_t ipath_sdepb_lock; + u8 ipath_presets_needed; /* Set if presets to be restored next DOWN */ +}; + +/* ipath_hol_state values (stopping/starting user proc, send flushing) */ +#define IPATH_HOL_UP 0 +#define IPATH_HOL_DOWN 1 +/* ipath_hol_next toggle values, used when hol_state IPATH_HOL_DOWN */ +#define IPATH_HOL_DOWNSTOP 0 +#define IPATH_HOL_DOWNCONT 1 + +/* bit positions for sdma_status */ +#define IPATH_SDMA_ABORTING 0 +#define IPATH_SDMA_DISARMED 1 +#define IPATH_SDMA_DISABLED 2 +#define IPATH_SDMA_LAYERBUF 3 +#define IPATH_SDMA_RUNNING 30 +#define IPATH_SDMA_SHUTDOWN 31 + +/* bit combinations that correspond to abort states */ +#define IPATH_SDMA_ABORT_NONE 0 +#define IPATH_SDMA_ABORT_ABORTING (1UL << IPATH_SDMA_ABORTING) +#define IPATH_SDMA_ABORT_DISARMED ((1UL << IPATH_SDMA_ABORTING) | \ + (1UL << IPATH_SDMA_DISARMED)) +#define IPATH_SDMA_ABORT_DISABLED ((1UL << IPATH_SDMA_ABORTING) | \ + (1UL << IPATH_SDMA_DISABLED)) +#define IPATH_SDMA_ABORT_ABORTED ((1UL << IPATH_SDMA_ABORTING) | \ + (1UL << IPATH_SDMA_DISARMED) | (1UL << IPATH_SDMA_DISABLED)) +#define IPATH_SDMA_ABORT_MASK ((1UL<private_data)->pd +#define subport_fp(fp) \ + ((struct ipath_filedata *)(fp)->private_data)->subport +#define tidcursor_fp(fp) \ + ((struct ipath_filedata *)(fp)->private_data)->tidcursor +#define user_sdma_queue_fp(fp) \ + ((struct ipath_filedata *)(fp)->private_data)->pq + +/* + * values for ipath_flags + */ + /* chip can report link latency (IB 1.2) */ +#define IPATH_HAS_LINK_LATENCY 0x1 + /* The chip is up and initted */ +#define IPATH_INITTED 0x2 + /* set if any user code has set kr_rcvhdrsize */ +#define IPATH_RCVHDRSZ_SET 0x4 + /* The chip is present and valid for accesses */ +#define IPATH_PRESENT 0x8 + /* HT link0 is only 8 bits wide, ignore upper byte crc + * errors, etc. */ +#define IPATH_8BIT_IN_HT0 0x10 + /* HT link1 is only 8 bits wide, ignore upper byte crc + * errors, etc. */ +#define IPATH_8BIT_IN_HT1 0x20 + /* The link is down */ +#define IPATH_LINKDOWN 0x40 + /* The link level is up (0x11) */ +#define IPATH_LINKINIT 0x80 + /* The link is in the armed (0x21) state */ +#define IPATH_LINKARMED 0x100 + /* The link is in the active (0x31) state */ +#define IPATH_LINKACTIVE 0x200 + /* link current state is unknown */ +#define IPATH_LINKUNK 0x400 + /* Write combining flush needed for PIO */ +#define IPATH_PIO_FLUSH_WC 0x1000 + /* DMA Receive tail pointer */ +#define IPATH_NODMA_RTAIL 0x2000 + /* no IB cable, or no device on IB cable */ +#define IPATH_NOCABLE 0x4000 + /* Supports port zero per packet receive interrupts via + * GPIO */ +#define IPATH_GPIO_INTR 0x8000 + /* uses the coded 4byte TID, not 8 byte */ +#define IPATH_4BYTE_TID 0x10000 + /* packet/word counters are 32 bit, else those 4 counters + * are 64bit */ +#define IPATH_32BITCOUNTERS 0x20000 + /* Interrupt register is 64 bits */ +#define IPATH_INTREG_64 0x40000 + /* can miss port0 rx interrupts */ +#define IPATH_DISABLED 0x80000 /* administratively disabled */ + /* Use GPIO interrupts for new counters */ +#define IPATH_GPIO_ERRINTRS 0x100000 +#define IPATH_SWAP_PIOBUFS 0x200000 + /* Supports Send DMA */ +#define IPATH_HAS_SEND_DMA 0x400000 + /* Supports Send Count (not just word count) in PBC */ +#define IPATH_HAS_PBC_CNT 0x800000 + /* Suppress heartbeat, even if turning off loopback */ +#define IPATH_NO_HRTBT 0x1000000 +#define IPATH_HAS_THRESH_UPDATE 0x4000000 +#define IPATH_HAS_MULT_IB_SPEED 0x8000000 +#define IPATH_IB_AUTONEG_INPROG 0x10000000 +#define IPATH_IB_AUTONEG_FAILED 0x20000000 + /* Linkdown-disable intentionally, Do not attempt to bring up */ +#define IPATH_IB_LINK_DISABLED 0x40000000 +#define IPATH_IB_FORCE_NOTIFY 0x80000000 /* force notify on next ib change */ + +/* Bits in GPIO for the added interrupts */ +#define IPATH_GPIO_PORT0_BIT 2 +#define IPATH_GPIO_RXUVL_BIT 3 +#define IPATH_GPIO_OVRUN_BIT 4 +#define IPATH_GPIO_LLI_BIT 5 +#define IPATH_GPIO_ERRINTR_MASK 0x38 + +/* portdata flag bit offsets */ + /* waiting for a packet to arrive */ +#define IPATH_PORT_WAITING_RCV 2 + /* master has not finished initializing */ +#define IPATH_PORT_MASTER_UNINIT 4 + /* waiting for an urgent packet to arrive */ +#define IPATH_PORT_WAITING_URG 5 + +/* free up any allocated data at closes */ +void ipath_free_data(struct ipath_portdata *dd); +u32 __iomem *ipath_getpiobuf(struct ipath_devdata *, u32, u32 *); +void ipath_chg_pioavailkernel(struct ipath_devdata *dd, unsigned start, + unsigned len, int avail); +void ipath_init_iba6110_funcs(struct ipath_devdata *); +void ipath_get_eeprom_info(struct ipath_devdata *); +int ipath_update_eeprom_log(struct ipath_devdata *dd); +void ipath_inc_eeprom_err(struct ipath_devdata *dd, u32 eidx, u32 incr); +u64 ipath_snap_cntr(struct ipath_devdata *, ipath_creg); +void ipath_disarm_senderrbufs(struct ipath_devdata *); +void ipath_force_pio_avail_update(struct ipath_devdata *); +void signal_ib_event(struct ipath_devdata *dd, enum ib_event_type ev); + +/* + * Set LED override, only the two LSBs have "public" meaning, but + * any non-zero value substitutes them for the Link and LinkTrain + * LED states. + */ +#define IPATH_LED_PHYS 1 /* Physical (linktraining) GREEN LED */ +#define IPATH_LED_LOG 2 /* Logical (link) YELLOW LED */ +void ipath_set_led_override(struct ipath_devdata *dd, unsigned int val); + +/* send dma routines */ +int setup_sdma(struct ipath_devdata *); +void teardown_sdma(struct ipath_devdata *); +void ipath_restart_sdma(struct ipath_devdata *); +void ipath_sdma_intr(struct ipath_devdata *); +int ipath_sdma_verbs_send(struct ipath_devdata *, struct ipath_sge_state *, + u32, struct ipath_verbs_txreq *); +/* ipath_sdma_lock should be locked before calling this. */ +int ipath_sdma_make_progress(struct ipath_devdata *dd); + +/* must be called under ipath_sdma_lock */ +static inline u16 ipath_sdma_descq_freecnt(const struct ipath_devdata *dd) +{ + return dd->ipath_sdma_descq_cnt - + (dd->ipath_sdma_descq_added - dd->ipath_sdma_descq_removed) - + 1 - dd->ipath_sdma_desc_nreserved; +} + +static inline void ipath_sdma_desc_reserve(struct ipath_devdata *dd, u16 cnt) +{ + dd->ipath_sdma_desc_nreserved += cnt; +} + +static inline void ipath_sdma_desc_unreserve(struct ipath_devdata *dd, u16 cnt) +{ + dd->ipath_sdma_desc_nreserved -= cnt; +} + +/* + * number of words used for protocol header if not set by ipath_userinit(); + */ +#define IPATH_DFLT_RCVHDRSIZE 9 + +int ipath_get_user_pages(unsigned long, size_t, struct page **); +void ipath_release_user_pages(struct page **, size_t); +void ipath_release_user_pages_on_close(struct page **, size_t); +int ipath_eeprom_read(struct ipath_devdata *, u8, void *, int); +int ipath_eeprom_write(struct ipath_devdata *, u8, const void *, int); +int ipath_tempsense_read(struct ipath_devdata *, u8 regnum); +int ipath_tempsense_write(struct ipath_devdata *, u8 regnum, u8 data); + +/* these are used for the registers that vary with port */ +void ipath_write_kreg_port(const struct ipath_devdata *, ipath_kreg, + unsigned, u64); + +/* + * We could have a single register get/put routine, that takes a group type, + * but this is somewhat clearer and cleaner. It also gives us some error + * checking. 64 bit register reads should always work, but are inefficient + * on opteron (the northbridge always generates 2 separate HT 32 bit reads), + * so we use kreg32 wherever possible. User register and counter register + * reads are always 32 bit reads, so only one form of those routines. + */ + +/* + * At the moment, none of the s-registers are writable, so no + * ipath_write_sreg(). + */ + +/** + * ipath_read_ureg32 - read 32-bit virtualized per-port register + * @dd: device + * @regno: register number + * @port: port number + * + * Return the contents of a register that is virtualized to be per port. + * Returns -1 on errors (not distinguishable from valid contents at + * runtime; we may add a separate error variable at some point). + */ +static inline u32 ipath_read_ureg32(const struct ipath_devdata *dd, + ipath_ureg regno, int port) +{ + if (!dd->ipath_kregbase || !(dd->ipath_flags & IPATH_PRESENT)) + return 0; + + return readl(regno + (u64 __iomem *) + (dd->ipath_uregbase + + (char __iomem *)dd->ipath_kregbase + + dd->ipath_ureg_align * port)); +} + +/** + * ipath_write_ureg - write 32-bit virtualized per-port register + * @dd: device + * @regno: register number + * @value: value + * @port: port + * + * Write the contents of a register that is virtualized to be per port. + */ +static inline void ipath_write_ureg(const struct ipath_devdata *dd, + ipath_ureg regno, u64 value, int port) +{ + u64 __iomem *ubase = (u64 __iomem *) + (dd->ipath_uregbase + (char __iomem *) dd->ipath_kregbase + + dd->ipath_ureg_align * port); + if (dd->ipath_kregbase) + writeq(value, &ubase[regno]); +} + +static inline u32 ipath_read_kreg32(const struct ipath_devdata *dd, + ipath_kreg regno) +{ + if (!dd->ipath_kregbase || !(dd->ipath_flags & IPATH_PRESENT)) + return -1; + return readl((u32 __iomem *) & dd->ipath_kregbase[regno]); +} + +static inline u64 ipath_read_kreg64(const struct ipath_devdata *dd, + ipath_kreg regno) +{ + if (!dd->ipath_kregbase || !(dd->ipath_flags & IPATH_PRESENT)) + return -1; + + return readq(&dd->ipath_kregbase[regno]); +} + +static inline void ipath_write_kreg(const struct ipath_devdata *dd, + ipath_kreg regno, u64 value) +{ + if (dd->ipath_kregbase) + writeq(value, &dd->ipath_kregbase[regno]); +} + +static inline u64 ipath_read_creg(const struct ipath_devdata *dd, + ipath_sreg regno) +{ + if (!dd->ipath_kregbase || !(dd->ipath_flags & IPATH_PRESENT)) + return 0; + + return readq(regno + (u64 __iomem *) + (dd->ipath_cregbase + + (char __iomem *)dd->ipath_kregbase)); +} + +static inline u32 ipath_read_creg32(const struct ipath_devdata *dd, + ipath_sreg regno) +{ + if (!dd->ipath_kregbase || !(dd->ipath_flags & IPATH_PRESENT)) + return 0; + return readl(regno + (u64 __iomem *) + (dd->ipath_cregbase + + (char __iomem *)dd->ipath_kregbase)); +} + +static inline void ipath_write_creg(const struct ipath_devdata *dd, + ipath_creg regno, u64 value) +{ + if (dd->ipath_kregbase) + writeq(value, regno + (u64 __iomem *) + (dd->ipath_cregbase + + (char __iomem *)dd->ipath_kregbase)); +} + +static inline void ipath_clear_rcvhdrtail(const struct ipath_portdata *pd) +{ + *((u64 *) pd->port_rcvhdrtail_kvaddr) = 0ULL; +} + +static inline u32 ipath_get_rcvhdrtail(const struct ipath_portdata *pd) +{ + return (u32) le64_to_cpu(*((volatile __le64 *) + pd->port_rcvhdrtail_kvaddr)); +} + +static inline u32 ipath_get_hdrqtail(const struct ipath_portdata *pd) +{ + const struct ipath_devdata *dd = pd->port_dd; + u32 hdrqtail; + + if (dd->ipath_flags & IPATH_NODMA_RTAIL) { + __le32 *rhf_addr; + u32 seq; + + rhf_addr = (__le32 *) pd->port_rcvhdrq + + pd->port_head + dd->ipath_rhf_offset; + seq = ipath_hdrget_seq(rhf_addr); + hdrqtail = pd->port_head; + if (seq == pd->port_seq_cnt) + hdrqtail++; + } else + hdrqtail = ipath_get_rcvhdrtail(pd); + + return hdrqtail; +} + +static inline u64 ipath_read_ireg(const struct ipath_devdata *dd, ipath_kreg r) +{ + return (dd->ipath_flags & IPATH_INTREG_64) ? + ipath_read_kreg64(dd, r) : ipath_read_kreg32(dd, r); +} + +/* + * from contents of IBCStatus (or a saved copy), return linkstate + * Report ACTIVE_DEFER as ACTIVE, because we treat them the same + * everywhere, anyway (and should be, for almost all purposes). + */ +static inline u32 ipath_ib_linkstate(struct ipath_devdata *dd, u64 ibcs) +{ + u32 state = (u32)(ibcs >> dd->ibcs_ls_shift) & + INFINIPATH_IBCS_LINKSTATE_MASK; + if (state == INFINIPATH_IBCS_L_STATE_ACT_DEFER) + state = INFINIPATH_IBCS_L_STATE_ACTIVE; + return state; +} + +/* from contents of IBCStatus (or a saved copy), return linktrainingstate */ +static inline u32 ipath_ib_linktrstate(struct ipath_devdata *dd, u64 ibcs) +{ + return (u32)(ibcs >> INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT) & + dd->ibcs_lts_mask; +} + +/* + * from contents of IBCStatus (or a saved copy), return logical link state + * combination of link state and linktraining state (down, active, init, + * arm, etc. + */ +static inline u32 ipath_ib_state(struct ipath_devdata *dd, u64 ibcs) +{ + u32 ibs; + ibs = (u32)(ibcs >> INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT) & + dd->ibcs_lts_mask; + ibs |= (u32)(ibcs & + (INFINIPATH_IBCS_LINKSTATE_MASK << dd->ibcs_ls_shift)); + return ibs; +} + +/* + * sysfs interface. + */ + +struct device_driver; + +extern const char ib_ipath_version[]; + +extern const struct attribute_group *ipath_driver_attr_groups[]; + +int ipath_device_create_group(struct device *, struct ipath_devdata *); +void ipath_device_remove_group(struct device *, struct ipath_devdata *); +int ipath_expose_reset(struct device *); + +int ipath_init_ipathfs(void); +void ipath_exit_ipathfs(void); +int ipathfs_add_device(struct ipath_devdata *); +int ipathfs_remove_device(struct ipath_devdata *); + +/* + * dma_addr wrappers - all 0's invalid for hw + */ +dma_addr_t ipath_map_page(struct pci_dev *, struct page *, unsigned long, + size_t, int); +dma_addr_t ipath_map_single(struct pci_dev *, void *, size_t, int); +const char *ipath_get_unit_name(int unit); + +/* + * Flush write combining store buffers (if present) and perform a write + * barrier. + */ +#if defined(CONFIG_X86_64) +#define ipath_flush_wc() asm volatile("sfence" ::: "memory") +#else +#define ipath_flush_wc() wmb() +#endif + +extern unsigned ipath_debug; /* debugging bit mask */ +extern unsigned ipath_linkrecovery; +extern unsigned ipath_mtu4096; +extern struct mutex ipath_mutex; + +#define IPATH_DRV_NAME "ib_ipath" +#define IPATH_MAJOR 233 +#define IPATH_USER_MINOR_BASE 0 +#define IPATH_DIAGPKT_MINOR 127 +#define IPATH_DIAG_MINOR_BASE 129 +#define IPATH_NMINORS 255 + +#define ipath_dev_err(dd,fmt,...) \ + do { \ + const struct ipath_devdata *__dd = (dd); \ + if (__dd->pcidev) \ + dev_err(&__dd->pcidev->dev, "%s: " fmt, \ + ipath_get_unit_name(__dd->ipath_unit), \ + ##__VA_ARGS__); \ + else \ + printk(KERN_ERR IPATH_DRV_NAME ": %s: " fmt, \ + ipath_get_unit_name(__dd->ipath_unit), \ + ##__VA_ARGS__); \ + } while (0) + +#if _IPATH_DEBUGGING + +# define __IPATH_DBG_WHICH(which,fmt,...) \ + do { \ + if (unlikely(ipath_debug & (which))) \ + printk(KERN_DEBUG IPATH_DRV_NAME ": %s: " fmt, \ + __func__,##__VA_ARGS__); \ + } while(0) + +# define ipath_dbg(fmt,...) \ + __IPATH_DBG_WHICH(__IPATH_DBG,fmt,##__VA_ARGS__) +# define ipath_cdbg(which,fmt,...) \ + __IPATH_DBG_WHICH(__IPATH_##which##DBG,fmt,##__VA_ARGS__) + +#else /* ! _IPATH_DEBUGGING */ + +# define ipath_dbg(fmt,...) +# define ipath_cdbg(which,fmt,...) + +#endif /* _IPATH_DEBUGGING */ + +/* + * this is used for formatting hw error messages... + */ +struct ipath_hwerror_msgs { + u64 mask; + const char *msg; +}; + +#define INFINIPATH_HWE_MSG(a, b) { .mask = INFINIPATH_HWE_##a, .msg = b } + +/* in ipath_intr.c... */ +void ipath_format_hwerrors(u64 hwerrs, + const struct ipath_hwerror_msgs *hwerrmsgs, + size_t nhwerrmsgs, + char *msg, size_t lmsg); + +#endif /* _IPATH_KERNEL_H */ diff --git a/drivers/infiniband/hw/ipath/ipath_keys.c b/drivers/infiniband/hw/ipath/ipath_keys.c new file mode 100644 index 0000000..c0e933f --- /dev/null +++ b/drivers/infiniband/hw/ipath/ipath_keys.c @@ -0,0 +1,270 @@ +/* + * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved. + * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include + +#include "ipath_verbs.h" +#include "ipath_kernel.h" + +/** + * ipath_alloc_lkey - allocate an lkey + * @rkt: lkey table in which to allocate the lkey + * @mr: memory region that this lkey protects + * + * Returns 1 if successful, otherwise returns 0. + */ + +int ipath_alloc_lkey(struct ipath_lkey_table *rkt, struct ipath_mregion *mr) +{ + unsigned long flags; + u32 r; + u32 n; + int ret; + + spin_lock_irqsave(&rkt->lock, flags); + + /* Find the next available LKEY */ + r = n = rkt->next; + for (;;) { + if (rkt->table[r] == NULL) + break; + r = (r + 1) & (rkt->max - 1); + if (r == n) { + spin_unlock_irqrestore(&rkt->lock, flags); + ipath_dbg("LKEY table full\n"); + ret = 0; + goto bail; + } + } + rkt->next = (r + 1) & (rkt->max - 1); + /* + * Make sure lkey is never zero which is reserved to indicate an + * unrestricted LKEY. + */ + rkt->gen++; + mr->lkey = (r << (32 - ib_ipath_lkey_table_size)) | + ((((1 << (24 - ib_ipath_lkey_table_size)) - 1) & rkt->gen) + << 8); + if (mr->lkey == 0) { + mr->lkey |= 1 << 8; + rkt->gen++; + } + rkt->table[r] = mr; + spin_unlock_irqrestore(&rkt->lock, flags); + + ret = 1; + +bail: + return ret; +} + +/** + * ipath_free_lkey - free an lkey + * @rkt: table from which to free the lkey + * @lkey: lkey id to free + */ +void ipath_free_lkey(struct ipath_lkey_table *rkt, u32 lkey) +{ + unsigned long flags; + u32 r; + + if (lkey == 0) + return; + r = lkey >> (32 - ib_ipath_lkey_table_size); + spin_lock_irqsave(&rkt->lock, flags); + rkt->table[r] = NULL; + spin_unlock_irqrestore(&rkt->lock, flags); +} + +/** + * ipath_lkey_ok - check IB SGE for validity and initialize + * @rkt: table containing lkey to check SGE against + * @isge: outgoing internal SGE + * @sge: SGE to check + * @acc: access flags + * + * Return 1 if valid and successful, otherwise returns 0. + * + * Check the IB SGE for validity and initialize our internal version + * of it. + */ +int ipath_lkey_ok(struct ipath_qp *qp, struct ipath_sge *isge, + struct ib_sge *sge, int acc) +{ + struct ipath_lkey_table *rkt = &to_idev(qp->ibqp.device)->lk_table; + struct ipath_mregion *mr; + unsigned n, m; + size_t off; + int ret; + + /* + * We use LKEY == zero for kernel virtual addresses + * (see ipath_get_dma_mr and ipath_dma.c). + */ + if (sge->lkey == 0) { + /* always a kernel port, no locking needed */ + struct ipath_pd *pd = to_ipd(qp->ibqp.pd); + + if (pd->user) { + ret = 0; + goto bail; + } + isge->mr = NULL; + isge->vaddr = (void *) sge->addr; + isge->length = sge->length; + isge->sge_length = sge->length; + ret = 1; + goto bail; + } + mr = rkt->table[(sge->lkey >> (32 - ib_ipath_lkey_table_size))]; + if (unlikely(mr == NULL || mr->lkey != sge->lkey || + qp->ibqp.pd != mr->pd)) { + ret = 0; + goto bail; + } + + off = sge->addr - mr->user_base; + if (unlikely(sge->addr < mr->user_base || + off + sge->length > mr->length || + (mr->access_flags & acc) != acc)) { + ret = 0; + goto bail; + } + + off += mr->offset; + m = 0; + n = 0; + while (off >= mr->map[m]->segs[n].length) { + off -= mr->map[m]->segs[n].length; + n++; + if (n >= IPATH_SEGSZ) { + m++; + n = 0; + } + } + isge->mr = mr; + isge->vaddr = mr->map[m]->segs[n].vaddr + off; + isge->length = mr->map[m]->segs[n].length - off; + isge->sge_length = sge->length; + isge->m = m; + isge->n = n; + + ret = 1; + +bail: + return ret; +} + +/** + * ipath_rkey_ok - check the IB virtual address, length, and RKEY + * @dev: infiniband device + * @ss: SGE state + * @len: length of data + * @vaddr: virtual address to place data + * @rkey: rkey to check + * @acc: access flags + * + * Return 1 if successful, otherwise 0. + */ +int ipath_rkey_ok(struct ipath_qp *qp, struct ipath_sge_state *ss, + u32 len, u64 vaddr, u32 rkey, int acc) +{ + struct ipath_ibdev *dev = to_idev(qp->ibqp.device); + struct ipath_lkey_table *rkt = &dev->lk_table; + struct ipath_sge *sge = &ss->sge; + struct ipath_mregion *mr; + unsigned n, m; + size_t off; + int ret; + + /* + * We use RKEY == zero for kernel virtual addresses + * (see ipath_get_dma_mr and ipath_dma.c). + */ + if (rkey == 0) { + /* always a kernel port, no locking needed */ + struct ipath_pd *pd = to_ipd(qp->ibqp.pd); + + if (pd->user) { + ret = 0; + goto bail; + } + sge->mr = NULL; + sge->vaddr = (void *) vaddr; + sge->length = len; + sge->sge_length = len; + ss->sg_list = NULL; + ss->num_sge = 1; + ret = 1; + goto bail; + } + + mr = rkt->table[(rkey >> (32 - ib_ipath_lkey_table_size))]; + if (unlikely(mr == NULL || mr->lkey != rkey || + qp->ibqp.pd != mr->pd)) { + ret = 0; + goto bail; + } + + off = vaddr - mr->iova; + if (unlikely(vaddr < mr->iova || off + len > mr->length || + (mr->access_flags & acc) == 0)) { + ret = 0; + goto bail; + } + + off += mr->offset; + m = 0; + n = 0; + while (off >= mr->map[m]->segs[n].length) { + off -= mr->map[m]->segs[n].length; + n++; + if (n >= IPATH_SEGSZ) { + m++; + n = 0; + } + } + sge->mr = mr; + sge->vaddr = mr->map[m]->segs[n].vaddr + off; + sge->length = mr->map[m]->segs[n].length - off; + sge->sge_length = len; + sge->m = m; + sge->n = n; + ss->sg_list = NULL; + ss->num_sge = 1; + + ret = 1; + +bail: + return ret; +} diff --git a/drivers/infiniband/hw/ipath/ipath_mad.c b/drivers/infiniband/hw/ipath/ipath_mad.c new file mode 100644 index 0000000..e890e5b --- /dev/null +++ b/drivers/infiniband/hw/ipath/ipath_mad.c @@ -0,0 +1,1513 @@ +/* + * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved. + * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include + +#include "ipath_kernel.h" +#include "ipath_verbs.h" +#include "ipath_common.h" + +#define IB_SMP_UNSUP_VERSION cpu_to_be16(0x0004) +#define IB_SMP_UNSUP_METHOD cpu_to_be16(0x0008) +#define IB_SMP_UNSUP_METH_ATTR cpu_to_be16(0x000C) +#define IB_SMP_INVALID_FIELD cpu_to_be16(0x001C) + +static int reply(struct ib_smp *smp) +{ + /* + * The verbs framework will handle the directed/LID route + * packet changes. + */ + smp->method = IB_MGMT_METHOD_GET_RESP; + if (smp->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) + smp->status |= IB_SMP_DIRECTION; + return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY; +} + +static int recv_subn_get_nodedescription(struct ib_smp *smp, + struct ib_device *ibdev) +{ + if (smp->attr_mod) + smp->status |= IB_SMP_INVALID_FIELD; + + memcpy(smp->data, ibdev->node_desc, sizeof(smp->data)); + + return reply(smp); +} + +struct nodeinfo { + u8 base_version; + u8 class_version; + u8 node_type; + u8 num_ports; + __be64 sys_guid; + __be64 node_guid; + __be64 port_guid; + __be16 partition_cap; + __be16 device_id; + __be32 revision; + u8 local_port_num; + u8 vendor_id[3]; +} __attribute__ ((packed)); + +static int recv_subn_get_nodeinfo(struct ib_smp *smp, + struct ib_device *ibdev, u8 port) +{ + struct nodeinfo *nip = (struct nodeinfo *)&smp->data; + struct ipath_devdata *dd = to_idev(ibdev)->dd; + u32 vendor, majrev, minrev; + + /* GUID 0 is illegal */ + if (smp->attr_mod || (dd->ipath_guid == 0)) + smp->status |= IB_SMP_INVALID_FIELD; + + nip->base_version = 1; + nip->class_version = 1; + nip->node_type = 1; /* channel adapter */ + /* + * XXX The num_ports value will need a layer function to get + * the value if we ever have more than one IB port on a chip. + * We will also need to get the GUID for the port. + */ + nip->num_ports = ibdev->phys_port_cnt; + /* This is already in network order */ + nip->sys_guid = to_idev(ibdev)->sys_image_guid; + nip->node_guid = dd->ipath_guid; + nip->port_guid = dd->ipath_guid; + nip->partition_cap = cpu_to_be16(ipath_get_npkeys(dd)); + nip->device_id = cpu_to_be16(dd->ipath_deviceid); + majrev = dd->ipath_majrev; + minrev = dd->ipath_minrev; + nip->revision = cpu_to_be32((majrev << 16) | minrev); + nip->local_port_num = port; + vendor = dd->ipath_vendorid; + nip->vendor_id[0] = IPATH_SRC_OUI_1; + nip->vendor_id[1] = IPATH_SRC_OUI_2; + nip->vendor_id[2] = IPATH_SRC_OUI_3; + + return reply(smp); +} + +static int recv_subn_get_guidinfo(struct ib_smp *smp, + struct ib_device *ibdev) +{ + u32 startgx = 8 * be32_to_cpu(smp->attr_mod); + __be64 *p = (__be64 *) smp->data; + + /* 32 blocks of 8 64-bit GUIDs per block */ + + memset(smp->data, 0, sizeof(smp->data)); + + /* + * We only support one GUID for now. If this changes, the + * portinfo.guid_cap field needs to be updated too. + */ + if (startgx == 0) { + __be64 g = to_idev(ibdev)->dd->ipath_guid; + if (g == 0) + /* GUID 0 is illegal */ + smp->status |= IB_SMP_INVALID_FIELD; + else + /* The first is a copy of the read-only HW GUID. */ + *p = g; + } else + smp->status |= IB_SMP_INVALID_FIELD; + + return reply(smp); +} + +static void set_link_width_enabled(struct ipath_devdata *dd, u32 w) +{ + (void) dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_LWID_ENB, w); +} + +static void set_link_speed_enabled(struct ipath_devdata *dd, u32 s) +{ + (void) dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_SPD_ENB, s); +} + +static int get_overrunthreshold(struct ipath_devdata *dd) +{ + return (dd->ipath_ibcctrl >> + INFINIPATH_IBCC_OVERRUNTHRESHOLD_SHIFT) & + INFINIPATH_IBCC_OVERRUNTHRESHOLD_MASK; +} + +/** + * set_overrunthreshold - set the overrun threshold + * @dd: the infinipath device + * @n: the new threshold + * + * Note that this will only take effect when the link state changes. + */ +static int set_overrunthreshold(struct ipath_devdata *dd, unsigned n) +{ + unsigned v; + + v = (dd->ipath_ibcctrl >> INFINIPATH_IBCC_OVERRUNTHRESHOLD_SHIFT) & + INFINIPATH_IBCC_OVERRUNTHRESHOLD_MASK; + if (v != n) { + dd->ipath_ibcctrl &= + ~(INFINIPATH_IBCC_OVERRUNTHRESHOLD_MASK << + INFINIPATH_IBCC_OVERRUNTHRESHOLD_SHIFT); + dd->ipath_ibcctrl |= + (u64) n << INFINIPATH_IBCC_OVERRUNTHRESHOLD_SHIFT; + ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl, + dd->ipath_ibcctrl); + } + return 0; +} + +static int get_phyerrthreshold(struct ipath_devdata *dd) +{ + return (dd->ipath_ibcctrl >> + INFINIPATH_IBCC_PHYERRTHRESHOLD_SHIFT) & + INFINIPATH_IBCC_PHYERRTHRESHOLD_MASK; +} + +/** + * set_phyerrthreshold - set the physical error threshold + * @dd: the infinipath device + * @n: the new threshold + * + * Note that this will only take effect when the link state changes. + */ +static int set_phyerrthreshold(struct ipath_devdata *dd, unsigned n) +{ + unsigned v; + + v = (dd->ipath_ibcctrl >> INFINIPATH_IBCC_PHYERRTHRESHOLD_SHIFT) & + INFINIPATH_IBCC_PHYERRTHRESHOLD_MASK; + if (v != n) { + dd->ipath_ibcctrl &= + ~(INFINIPATH_IBCC_PHYERRTHRESHOLD_MASK << + INFINIPATH_IBCC_PHYERRTHRESHOLD_SHIFT); + dd->ipath_ibcctrl |= + (u64) n << INFINIPATH_IBCC_PHYERRTHRESHOLD_SHIFT; + ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl, + dd->ipath_ibcctrl); + } + return 0; +} + +/** + * get_linkdowndefaultstate - get the default linkdown state + * @dd: the infinipath device + * + * Returns zero if the default is POLL, 1 if the default is SLEEP. + */ +static int get_linkdowndefaultstate(struct ipath_devdata *dd) +{ + return !!(dd->ipath_ibcctrl & INFINIPATH_IBCC_LINKDOWNDEFAULTSTATE); +} + +static int recv_subn_get_portinfo(struct ib_smp *smp, + struct ib_device *ibdev, u8 port) +{ + struct ipath_ibdev *dev; + struct ipath_devdata *dd; + struct ib_port_info *pip = (struct ib_port_info *)smp->data; + u16 lid; + u8 ibcstat; + u8 mtu; + int ret; + + if (be32_to_cpu(smp->attr_mod) > ibdev->phys_port_cnt) { + smp->status |= IB_SMP_INVALID_FIELD; + ret = reply(smp); + goto bail; + } + + dev = to_idev(ibdev); + dd = dev->dd; + + /* Clear all fields. Only set the non-zero fields. */ + memset(smp->data, 0, sizeof(smp->data)); + + /* Only return the mkey if the protection field allows it. */ + if (smp->method == IB_MGMT_METHOD_SET || dev->mkey == smp->mkey || + dev->mkeyprot == 0) + pip->mkey = dev->mkey; + pip->gid_prefix = dev->gid_prefix; + lid = dd->ipath_lid; + pip->lid = lid ? cpu_to_be16(lid) : IB_LID_PERMISSIVE; + pip->sm_lid = cpu_to_be16(dev->sm_lid); + pip->cap_mask = cpu_to_be32(dev->port_cap_flags); + /* pip->diag_code; */ + pip->mkey_lease_period = cpu_to_be16(dev->mkey_lease_period); + pip->local_port_num = port; + pip->link_width_enabled = dd->ipath_link_width_enabled; + pip->link_width_supported = dd->ipath_link_width_supported; + pip->link_width_active = dd->ipath_link_width_active; + pip->linkspeed_portstate = dd->ipath_link_speed_supported << 4; + ibcstat = dd->ipath_lastibcstat; + /* map LinkState to IB portinfo values. */ + pip->linkspeed_portstate |= ipath_ib_linkstate(dd, ibcstat) + 1; + + pip->portphysstate_linkdown = + (ipath_cvt_physportstate[ibcstat & dd->ibcs_lts_mask] << 4) | + (get_linkdowndefaultstate(dd) ? 1 : 2); + pip->mkeyprot_resv_lmc = (dev->mkeyprot << 6) | dd->ipath_lmc; + pip->linkspeedactive_enabled = (dd->ipath_link_speed_active << 4) | + dd->ipath_link_speed_enabled; + switch (dd->ipath_ibmtu) { + case 4096: + mtu = IB_MTU_4096; + break; + case 2048: + mtu = IB_MTU_2048; + break; + case 1024: + mtu = IB_MTU_1024; + break; + case 512: + mtu = IB_MTU_512; + break; + case 256: + mtu = IB_MTU_256; + break; + default: /* oops, something is wrong */ + mtu = IB_MTU_2048; + break; + } + pip->neighbormtu_mastersmsl = (mtu << 4) | dev->sm_sl; + pip->vlcap_inittype = 0x10; /* VLCap = VL0, InitType = 0 */ + pip->vl_high_limit = dev->vl_high_limit; + /* pip->vl_arb_high_cap; // only one VL */ + /* pip->vl_arb_low_cap; // only one VL */ + /* InitTypeReply = 0 */ + /* our mtu cap depends on whether 4K MTU enabled or not */ + pip->inittypereply_mtucap = ipath_mtu4096 ? IB_MTU_4096 : IB_MTU_2048; + /* HCAs ignore VLStallCount and HOQLife */ + /* pip->vlstallcnt_hoqlife; */ + pip->operationalvl_pei_peo_fpi_fpo = 0x10; /* OVLs = 1 */ + pip->mkey_violations = cpu_to_be16(dev->mkey_violations); + /* P_KeyViolations are counted by hardware. */ + pip->pkey_violations = + cpu_to_be16((ipath_get_cr_errpkey(dd) - + dev->z_pkey_violations) & 0xFFFF); + pip->qkey_violations = cpu_to_be16(dev->qkey_violations); + /* Only the hardware GUID is supported for now */ + pip->guid_cap = 1; + pip->clientrereg_resv_subnetto = dev->subnet_timeout; + /* 32.768 usec. response time (guessing) */ + pip->resv_resptimevalue = 3; + pip->localphyerrors_overrunerrors = + (get_phyerrthreshold(dd) << 4) | + get_overrunthreshold(dd); + /* pip->max_credit_hint; */ + if (dev->port_cap_flags & IB_PORT_LINK_LATENCY_SUP) { + u32 v; + + v = dd->ipath_f_get_ib_cfg(dd, IPATH_IB_CFG_LINKLATENCY); + pip->link_roundtrip_latency[0] = v >> 16; + pip->link_roundtrip_latency[1] = v >> 8; + pip->link_roundtrip_latency[2] = v; + } + + ret = reply(smp); + +bail: + return ret; +} + +/** + * get_pkeys - return the PKEY table for port 0 + * @dd: the infinipath device + * @pkeys: the pkey table is placed here + */ +static int get_pkeys(struct ipath_devdata *dd, u16 * pkeys) +{ + /* always a kernel port, no locking needed */ + struct ipath_portdata *pd = dd->ipath_pd[0]; + + memcpy(pkeys, pd->port_pkeys, sizeof(pd->port_pkeys)); + + return 0; +} + +static int recv_subn_get_pkeytable(struct ib_smp *smp, + struct ib_device *ibdev) +{ + u32 startpx = 32 * (be32_to_cpu(smp->attr_mod) & 0xffff); + u16 *p = (u16 *) smp->data; + __be16 *q = (__be16 *) smp->data; + + /* 64 blocks of 32 16-bit P_Key entries */ + + memset(smp->data, 0, sizeof(smp->data)); + if (startpx == 0) { + struct ipath_ibdev *dev = to_idev(ibdev); + unsigned i, n = ipath_get_npkeys(dev->dd); + + get_pkeys(dev->dd, p); + + for (i = 0; i < n; i++) + q[i] = cpu_to_be16(p[i]); + } else + smp->status |= IB_SMP_INVALID_FIELD; + + return reply(smp); +} + +static int recv_subn_set_guidinfo(struct ib_smp *smp, + struct ib_device *ibdev) +{ + /* The only GUID we support is the first read-only entry. */ + return recv_subn_get_guidinfo(smp, ibdev); +} + +/** + * set_linkdowndefaultstate - set the default linkdown state + * @dd: the infinipath device + * @sleep: the new state + * + * Note that this will only take effect when the link state changes. + */ +static int set_linkdowndefaultstate(struct ipath_devdata *dd, int sleep) +{ + if (sleep) + dd->ipath_ibcctrl |= INFINIPATH_IBCC_LINKDOWNDEFAULTSTATE; + else + dd->ipath_ibcctrl &= ~INFINIPATH_IBCC_LINKDOWNDEFAULTSTATE; + ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl, + dd->ipath_ibcctrl); + return 0; +} + +/** + * recv_subn_set_portinfo - set port information + * @smp: the incoming SM packet + * @ibdev: the infiniband device + * @port: the port on the device + * + * Set Portinfo (see ch. 14.2.5.6). + */ +static int recv_subn_set_portinfo(struct ib_smp *smp, + struct ib_device *ibdev, u8 port) +{ + struct ib_port_info *pip = (struct ib_port_info *)smp->data; + struct ib_event event; + struct ipath_ibdev *dev; + struct ipath_devdata *dd; + char clientrereg = 0; + u16 lid, smlid; + u8 lwe; + u8 lse; + u8 state; + u16 lstate; + u32 mtu; + int ret, ore; + + if (be32_to_cpu(smp->attr_mod) > ibdev->phys_port_cnt) + goto err; + + dev = to_idev(ibdev); + dd = dev->dd; + event.device = ibdev; + event.element.port_num = port; + + dev->mkey = pip->mkey; + dev->gid_prefix = pip->gid_prefix; + dev->mkey_lease_period = be16_to_cpu(pip->mkey_lease_period); + + lid = be16_to_cpu(pip->lid); + if (dd->ipath_lid != lid || + dd->ipath_lmc != (pip->mkeyprot_resv_lmc & 7)) { + /* Must be a valid unicast LID address. */ + if (lid == 0 || lid >= IPATH_MULTICAST_LID_BASE) + goto err; + ipath_set_lid(dd, lid, pip->mkeyprot_resv_lmc & 7); + event.event = IB_EVENT_LID_CHANGE; + ib_dispatch_event(&event); + } + + smlid = be16_to_cpu(pip->sm_lid); + if (smlid != dev->sm_lid) { + /* Must be a valid unicast LID address. */ + if (smlid == 0 || smlid >= IPATH_MULTICAST_LID_BASE) + goto err; + dev->sm_lid = smlid; + event.event = IB_EVENT_SM_CHANGE; + ib_dispatch_event(&event); + } + + /* Allow 1x or 4x to be set (see 14.2.6.6). */ + lwe = pip->link_width_enabled; + if (lwe) { + if (lwe == 0xFF) + lwe = dd->ipath_link_width_supported; + else if (lwe >= 16 || (lwe & ~dd->ipath_link_width_supported)) + goto err; + set_link_width_enabled(dd, lwe); + } + + /* Allow 2.5 or 5.0 Gbs. */ + lse = pip->linkspeedactive_enabled & 0xF; + if (lse) { + if (lse == 15) + lse = dd->ipath_link_speed_supported; + else if (lse >= 8 || (lse & ~dd->ipath_link_speed_supported)) + goto err; + set_link_speed_enabled(dd, lse); + } + + /* Set link down default state. */ + switch (pip->portphysstate_linkdown & 0xF) { + case 0: /* NOP */ + break; + case 1: /* SLEEP */ + if (set_linkdowndefaultstate(dd, 1)) + goto err; + break; + case 2: /* POLL */ + if (set_linkdowndefaultstate(dd, 0)) + goto err; + break; + default: + goto err; + } + + dev->mkeyprot = pip->mkeyprot_resv_lmc >> 6; + dev->vl_high_limit = pip->vl_high_limit; + + switch ((pip->neighbormtu_mastersmsl >> 4) & 0xF) { + case IB_MTU_256: + mtu = 256; + break; + case IB_MTU_512: + mtu = 512; + break; + case IB_MTU_1024: + mtu = 1024; + break; + case IB_MTU_2048: + mtu = 2048; + break; + case IB_MTU_4096: + if (!ipath_mtu4096) + goto err; + mtu = 4096; + break; + default: + /* XXX We have already partially updated our state! */ + goto err; + } + ipath_set_mtu(dd, mtu); + + dev->sm_sl = pip->neighbormtu_mastersmsl & 0xF; + + /* We only support VL0 */ + if (((pip->operationalvl_pei_peo_fpi_fpo >> 4) & 0xF) > 1) + goto err; + + if (pip->mkey_violations == 0) + dev->mkey_violations = 0; + + /* + * Hardware counter can't be reset so snapshot and subtract + * later. + */ + if (pip->pkey_violations == 0) + dev->z_pkey_violations = ipath_get_cr_errpkey(dd); + + if (pip->qkey_violations == 0) + dev->qkey_violations = 0; + + ore = pip->localphyerrors_overrunerrors; + if (set_phyerrthreshold(dd, (ore >> 4) & 0xF)) + goto err; + + if (set_overrunthreshold(dd, (ore & 0xF))) + goto err; + + dev->subnet_timeout = pip->clientrereg_resv_subnetto & 0x1F; + + if (pip->clientrereg_resv_subnetto & 0x80) { + clientrereg = 1; + event.event = IB_EVENT_CLIENT_REREGISTER; + ib_dispatch_event(&event); + } + + /* + * Do the port state change now that the other link parameters + * have been set. + * Changing the port physical state only makes sense if the link + * is down or is being set to down. + */ + state = pip->linkspeed_portstate & 0xF; + lstate = (pip->portphysstate_linkdown >> 4) & 0xF; + if (lstate && !(state == IB_PORT_DOWN || state == IB_PORT_NOP)) + goto err; + + /* + * Only state changes of DOWN, ARM, and ACTIVE are valid + * and must be in the correct state to take effect (see 7.2.6). + */ + switch (state) { + case IB_PORT_NOP: + if (lstate == 0) + break; + /* FALLTHROUGH */ + case IB_PORT_DOWN: + if (lstate == 0) + lstate = IPATH_IB_LINKDOWN_ONLY; + else if (lstate == 1) + lstate = IPATH_IB_LINKDOWN_SLEEP; + else if (lstate == 2) + lstate = IPATH_IB_LINKDOWN; + else if (lstate == 3) + lstate = IPATH_IB_LINKDOWN_DISABLE; + else + goto err; + ipath_set_linkstate(dd, lstate); + if (lstate == IPATH_IB_LINKDOWN_DISABLE) { + ret = IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED; + goto done; + } + ipath_wait_linkstate(dd, IPATH_LINKINIT | IPATH_LINKARMED | + IPATH_LINKACTIVE, 1000); + break; + case IB_PORT_ARMED: + ipath_set_linkstate(dd, IPATH_IB_LINKARM); + break; + case IB_PORT_ACTIVE: + ipath_set_linkstate(dd, IPATH_IB_LINKACTIVE); + break; + default: + /* XXX We have already partially updated our state! */ + goto err; + } + + ret = recv_subn_get_portinfo(smp, ibdev, port); + + if (clientrereg) + pip->clientrereg_resv_subnetto |= 0x80; + + goto done; + +err: + smp->status |= IB_SMP_INVALID_FIELD; + ret = recv_subn_get_portinfo(smp, ibdev, port); + +done: + return ret; +} + +/** + * rm_pkey - decrecment the reference count for the given PKEY + * @dd: the infinipath device + * @key: the PKEY index + * + * Return true if this was the last reference and the hardware table entry + * needs to be changed. + */ +static int rm_pkey(struct ipath_devdata *dd, u16 key) +{ + int i; + int ret; + + for (i = 0; i < ARRAY_SIZE(dd->ipath_pkeys); i++) { + if (dd->ipath_pkeys[i] != key) + continue; + if (atomic_dec_and_test(&dd->ipath_pkeyrefs[i])) { + dd->ipath_pkeys[i] = 0; + ret = 1; + goto bail; + } + break; + } + + ret = 0; + +bail: + return ret; +} + +/** + * add_pkey - add the given PKEY to the hardware table + * @dd: the infinipath device + * @key: the PKEY + * + * Return an error code if unable to add the entry, zero if no change, + * or 1 if the hardware PKEY register needs to be updated. + */ +static int add_pkey(struct ipath_devdata *dd, u16 key) +{ + int i; + u16 lkey = key & 0x7FFF; + int any = 0; + int ret; + + if (lkey == 0x7FFF) { + ret = 0; + goto bail; + } + + /* Look for an empty slot or a matching PKEY. */ + for (i = 0; i < ARRAY_SIZE(dd->ipath_pkeys); i++) { + if (!dd->ipath_pkeys[i]) { + any++; + continue; + } + /* If it matches exactly, try to increment the ref count */ + if (dd->ipath_pkeys[i] == key) { + if (atomic_inc_return(&dd->ipath_pkeyrefs[i]) > 1) { + ret = 0; + goto bail; + } + /* Lost the race. Look for an empty slot below. */ + atomic_dec(&dd->ipath_pkeyrefs[i]); + any++; + } + /* + * It makes no sense to have both the limited and unlimited + * PKEY set at the same time since the unlimited one will + * disable the limited one. + */ + if ((dd->ipath_pkeys[i] & 0x7FFF) == lkey) { + ret = -EEXIST; + goto bail; + } + } + if (!any) { + ret = -EBUSY; + goto bail; + } + for (i = 0; i < ARRAY_SIZE(dd->ipath_pkeys); i++) { + if (!dd->ipath_pkeys[i] && + atomic_inc_return(&dd->ipath_pkeyrefs[i]) == 1) { + /* for ipathstats, etc. */ + ipath_stats.sps_pkeys[i] = lkey; + dd->ipath_pkeys[i] = key; + ret = 1; + goto bail; + } + } + ret = -EBUSY; + +bail: + return ret; +} + +/** + * set_pkeys - set the PKEY table for port 0 + * @dd: the infinipath device + * @pkeys: the PKEY table + */ +static int set_pkeys(struct ipath_devdata *dd, u16 *pkeys, u8 port) +{ + struct ipath_portdata *pd; + int i; + int changed = 0; + + /* always a kernel port, no locking needed */ + pd = dd->ipath_pd[0]; + + for (i = 0; i < ARRAY_SIZE(pd->port_pkeys); i++) { + u16 key = pkeys[i]; + u16 okey = pd->port_pkeys[i]; + + if (key == okey) + continue; + /* + * The value of this PKEY table entry is changing. + * Remove the old entry in the hardware's array of PKEYs. + */ + if (okey & 0x7FFF) + changed |= rm_pkey(dd, okey); + if (key & 0x7FFF) { + int ret = add_pkey(dd, key); + + if (ret < 0) + key = 0; + else + changed |= ret; + } + pd->port_pkeys[i] = key; + } + if (changed) { + u64 pkey; + struct ib_event event; + + pkey = (u64) dd->ipath_pkeys[0] | + ((u64) dd->ipath_pkeys[1] << 16) | + ((u64) dd->ipath_pkeys[2] << 32) | + ((u64) dd->ipath_pkeys[3] << 48); + ipath_cdbg(VERBOSE, "p0 new pkey reg %llx\n", + (unsigned long long) pkey); + ipath_write_kreg(dd, dd->ipath_kregs->kr_partitionkey, + pkey); + + event.event = IB_EVENT_PKEY_CHANGE; + event.device = &dd->verbs_dev->ibdev; + event.element.port_num = port; + ib_dispatch_event(&event); + } + return 0; +} + +static int recv_subn_set_pkeytable(struct ib_smp *smp, + struct ib_device *ibdev, u8 port) +{ + u32 startpx = 32 * (be32_to_cpu(smp->attr_mod) & 0xffff); + __be16 *p = (__be16 *) smp->data; + u16 *q = (u16 *) smp->data; + struct ipath_ibdev *dev = to_idev(ibdev); + unsigned i, n = ipath_get_npkeys(dev->dd); + + for (i = 0; i < n; i++) + q[i] = be16_to_cpu(p[i]); + + if (startpx != 0 || set_pkeys(dev->dd, q, port) != 0) + smp->status |= IB_SMP_INVALID_FIELD; + + return recv_subn_get_pkeytable(smp, ibdev); +} + +static int recv_pma_get_classportinfo(struct ib_pma_mad *pmp) +{ + struct ib_class_port_info *p = + (struct ib_class_port_info *)pmp->data; + + memset(pmp->data, 0, sizeof(pmp->data)); + + if (pmp->mad_hdr.attr_mod != 0) + pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; + + /* Indicate AllPortSelect is valid (only one port anyway) */ + p->capability_mask = cpu_to_be16(1 << 8); + p->base_version = 1; + p->class_version = 1; + /* + * Expected response time is 4.096 usec. * 2^18 == 1.073741824 + * sec. + */ + p->resp_time_value = 18; + + return reply((struct ib_smp *) pmp); +} + +/* + * The PortSamplesControl.CounterMasks field is an array of 3 bit fields + * which specify the N'th counter's capabilities. See ch. 16.1.3.2. + * We support 5 counters which only count the mandatory quantities. + */ +#define COUNTER_MASK(q, n) (q << ((9 - n) * 3)) +#define COUNTER_MASK0_9 cpu_to_be32(COUNTER_MASK(1, 0) | \ + COUNTER_MASK(1, 1) | \ + COUNTER_MASK(1, 2) | \ + COUNTER_MASK(1, 3) | \ + COUNTER_MASK(1, 4)) + +static int recv_pma_get_portsamplescontrol(struct ib_pma_mad *pmp, + struct ib_device *ibdev, u8 port) +{ + struct ib_pma_portsamplescontrol *p = + (struct ib_pma_portsamplescontrol *)pmp->data; + struct ipath_ibdev *dev = to_idev(ibdev); + struct ipath_cregs const *crp = dev->dd->ipath_cregs; + unsigned long flags; + u8 port_select = p->port_select; + + memset(pmp->data, 0, sizeof(pmp->data)); + + p->port_select = port_select; + if (pmp->mad_hdr.attr_mod != 0 || + (port_select != port && port_select != 0xFF)) + pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; + /* + * Ticks are 10x the link transfer period which for 2.5Gbs is 4 + * nsec. 0 == 4 nsec., 1 == 8 nsec., ..., 255 == 1020 nsec. Sample + * intervals are counted in ticks. Since we use Linux timers, that + * count in jiffies, we can't sample for less than 1000 ticks if HZ + * == 1000 (4000 ticks if HZ is 250). link_speed_active returns 2 for + * DDR, 1 for SDR, set the tick to 1 for DDR, 0 for SDR on chips that + * have hardware support for delaying packets. + */ + if (crp->cr_psstat) + p->tick = dev->dd->ipath_link_speed_active - 1; + else + p->tick = 250; /* 1 usec. */ + p->counter_width = 4; /* 32 bit counters */ + p->counter_mask0_9 = COUNTER_MASK0_9; + spin_lock_irqsave(&dev->pending_lock, flags); + if (crp->cr_psstat) + p->sample_status = ipath_read_creg32(dev->dd, crp->cr_psstat); + else + p->sample_status = dev->pma_sample_status; + p->sample_start = cpu_to_be32(dev->pma_sample_start); + p->sample_interval = cpu_to_be32(dev->pma_sample_interval); + p->tag = cpu_to_be16(dev->pma_tag); + p->counter_select[0] = dev->pma_counter_select[0]; + p->counter_select[1] = dev->pma_counter_select[1]; + p->counter_select[2] = dev->pma_counter_select[2]; + p->counter_select[3] = dev->pma_counter_select[3]; + p->counter_select[4] = dev->pma_counter_select[4]; + spin_unlock_irqrestore(&dev->pending_lock, flags); + + return reply((struct ib_smp *) pmp); +} + +static int recv_pma_set_portsamplescontrol(struct ib_pma_mad *pmp, + struct ib_device *ibdev, u8 port) +{ + struct ib_pma_portsamplescontrol *p = + (struct ib_pma_portsamplescontrol *)pmp->data; + struct ipath_ibdev *dev = to_idev(ibdev); + struct ipath_cregs const *crp = dev->dd->ipath_cregs; + unsigned long flags; + u8 status; + int ret; + + if (pmp->mad_hdr.attr_mod != 0 || + (p->port_select != port && p->port_select != 0xFF)) { + pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; + ret = reply((struct ib_smp *) pmp); + goto bail; + } + + spin_lock_irqsave(&dev->pending_lock, flags); + if (crp->cr_psstat) + status = ipath_read_creg32(dev->dd, crp->cr_psstat); + else + status = dev->pma_sample_status; + if (status == IB_PMA_SAMPLE_STATUS_DONE) { + dev->pma_sample_start = be32_to_cpu(p->sample_start); + dev->pma_sample_interval = be32_to_cpu(p->sample_interval); + dev->pma_tag = be16_to_cpu(p->tag); + dev->pma_counter_select[0] = p->counter_select[0]; + dev->pma_counter_select[1] = p->counter_select[1]; + dev->pma_counter_select[2] = p->counter_select[2]; + dev->pma_counter_select[3] = p->counter_select[3]; + dev->pma_counter_select[4] = p->counter_select[4]; + if (crp->cr_psstat) { + ipath_write_creg(dev->dd, crp->cr_psinterval, + dev->pma_sample_interval); + ipath_write_creg(dev->dd, crp->cr_psstart, + dev->pma_sample_start); + } else + dev->pma_sample_status = IB_PMA_SAMPLE_STATUS_STARTED; + } + spin_unlock_irqrestore(&dev->pending_lock, flags); + + ret = recv_pma_get_portsamplescontrol(pmp, ibdev, port); + +bail: + return ret; +} + +static u64 get_counter(struct ipath_ibdev *dev, + struct ipath_cregs const *crp, + __be16 sel) +{ + u64 ret; + + switch (sel) { + case IB_PMA_PORT_XMIT_DATA: + ret = (crp->cr_psxmitdatacount) ? + ipath_read_creg32(dev->dd, crp->cr_psxmitdatacount) : + dev->ipath_sword; + break; + case IB_PMA_PORT_RCV_DATA: + ret = (crp->cr_psrcvdatacount) ? + ipath_read_creg32(dev->dd, crp->cr_psrcvdatacount) : + dev->ipath_rword; + break; + case IB_PMA_PORT_XMIT_PKTS: + ret = (crp->cr_psxmitpktscount) ? + ipath_read_creg32(dev->dd, crp->cr_psxmitpktscount) : + dev->ipath_spkts; + break; + case IB_PMA_PORT_RCV_PKTS: + ret = (crp->cr_psrcvpktscount) ? + ipath_read_creg32(dev->dd, crp->cr_psrcvpktscount) : + dev->ipath_rpkts; + break; + case IB_PMA_PORT_XMIT_WAIT: + ret = (crp->cr_psxmitwaitcount) ? + ipath_read_creg32(dev->dd, crp->cr_psxmitwaitcount) : + dev->ipath_xmit_wait; + break; + default: + ret = 0; + } + + return ret; +} + +static int recv_pma_get_portsamplesresult(struct ib_pma_mad *pmp, + struct ib_device *ibdev) +{ + struct ib_pma_portsamplesresult *p = + (struct ib_pma_portsamplesresult *)pmp->data; + struct ipath_ibdev *dev = to_idev(ibdev); + struct ipath_cregs const *crp = dev->dd->ipath_cregs; + u8 status; + int i; + + memset(pmp->data, 0, sizeof(pmp->data)); + p->tag = cpu_to_be16(dev->pma_tag); + if (crp->cr_psstat) + status = ipath_read_creg32(dev->dd, crp->cr_psstat); + else + status = dev->pma_sample_status; + p->sample_status = cpu_to_be16(status); + for (i = 0; i < ARRAY_SIZE(dev->pma_counter_select); i++) + p->counter[i] = (status != IB_PMA_SAMPLE_STATUS_DONE) ? 0 : + cpu_to_be32( + get_counter(dev, crp, dev->pma_counter_select[i])); + + return reply((struct ib_smp *) pmp); +} + +static int recv_pma_get_portsamplesresult_ext(struct ib_pma_mad *pmp, + struct ib_device *ibdev) +{ + struct ib_pma_portsamplesresult_ext *p = + (struct ib_pma_portsamplesresult_ext *)pmp->data; + struct ipath_ibdev *dev = to_idev(ibdev); + struct ipath_cregs const *crp = dev->dd->ipath_cregs; + u8 status; + int i; + + memset(pmp->data, 0, sizeof(pmp->data)); + p->tag = cpu_to_be16(dev->pma_tag); + if (crp->cr_psstat) + status = ipath_read_creg32(dev->dd, crp->cr_psstat); + else + status = dev->pma_sample_status; + p->sample_status = cpu_to_be16(status); + /* 64 bits */ + p->extended_width = cpu_to_be32(0x80000000); + for (i = 0; i < ARRAY_SIZE(dev->pma_counter_select); i++) + p->counter[i] = (status != IB_PMA_SAMPLE_STATUS_DONE) ? 0 : + cpu_to_be64( + get_counter(dev, crp, dev->pma_counter_select[i])); + + return reply((struct ib_smp *) pmp); +} + +static int recv_pma_get_portcounters(struct ib_pma_mad *pmp, + struct ib_device *ibdev, u8 port) +{ + struct ib_pma_portcounters *p = (struct ib_pma_portcounters *) + pmp->data; + struct ipath_ibdev *dev = to_idev(ibdev); + struct ipath_verbs_counters cntrs; + u8 port_select = p->port_select; + + ipath_get_counters(dev->dd, &cntrs); + + /* Adjust counters for any resets done. */ + cntrs.symbol_error_counter -= dev->z_symbol_error_counter; + cntrs.link_error_recovery_counter -= + dev->z_link_error_recovery_counter; + cntrs.link_downed_counter -= dev->z_link_downed_counter; + cntrs.port_rcv_errors += dev->rcv_errors; + cntrs.port_rcv_errors -= dev->z_port_rcv_errors; + cntrs.port_rcv_remphys_errors -= dev->z_port_rcv_remphys_errors; + cntrs.port_xmit_discards -= dev->z_port_xmit_discards; + cntrs.port_xmit_data -= dev->z_port_xmit_data; + cntrs.port_rcv_data -= dev->z_port_rcv_data; + cntrs.port_xmit_packets -= dev->z_port_xmit_packets; + cntrs.port_rcv_packets -= dev->z_port_rcv_packets; + cntrs.local_link_integrity_errors -= + dev->z_local_link_integrity_errors; + cntrs.excessive_buffer_overrun_errors -= + dev->z_excessive_buffer_overrun_errors; + cntrs.vl15_dropped -= dev->z_vl15_dropped; + cntrs.vl15_dropped += dev->n_vl15_dropped; + + memset(pmp->data, 0, sizeof(pmp->data)); + + p->port_select = port_select; + if (pmp->mad_hdr.attr_mod != 0 || + (port_select != port && port_select != 0xFF)) + pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; + + if (cntrs.symbol_error_counter > 0xFFFFUL) + p->symbol_error_counter = cpu_to_be16(0xFFFF); + else + p->symbol_error_counter = + cpu_to_be16((u16)cntrs.symbol_error_counter); + if (cntrs.link_error_recovery_counter > 0xFFUL) + p->link_error_recovery_counter = 0xFF; + else + p->link_error_recovery_counter = + (u8)cntrs.link_error_recovery_counter; + if (cntrs.link_downed_counter > 0xFFUL) + p->link_downed_counter = 0xFF; + else + p->link_downed_counter = (u8)cntrs.link_downed_counter; + if (cntrs.port_rcv_errors > 0xFFFFUL) + p->port_rcv_errors = cpu_to_be16(0xFFFF); + else + p->port_rcv_errors = + cpu_to_be16((u16) cntrs.port_rcv_errors); + if (cntrs.port_rcv_remphys_errors > 0xFFFFUL) + p->port_rcv_remphys_errors = cpu_to_be16(0xFFFF); + else + p->port_rcv_remphys_errors = + cpu_to_be16((u16)cntrs.port_rcv_remphys_errors); + if (cntrs.port_xmit_discards > 0xFFFFUL) + p->port_xmit_discards = cpu_to_be16(0xFFFF); + else + p->port_xmit_discards = + cpu_to_be16((u16)cntrs.port_xmit_discards); + if (cntrs.local_link_integrity_errors > 0xFUL) + cntrs.local_link_integrity_errors = 0xFUL; + if (cntrs.excessive_buffer_overrun_errors > 0xFUL) + cntrs.excessive_buffer_overrun_errors = 0xFUL; + p->link_overrun_errors = (cntrs.local_link_integrity_errors << 4) | + cntrs.excessive_buffer_overrun_errors; + if (cntrs.vl15_dropped > 0xFFFFUL) + p->vl15_dropped = cpu_to_be16(0xFFFF); + else + p->vl15_dropped = cpu_to_be16((u16)cntrs.vl15_dropped); + if (cntrs.port_xmit_data > 0xFFFFFFFFUL) + p->port_xmit_data = cpu_to_be32(0xFFFFFFFF); + else + p->port_xmit_data = cpu_to_be32((u32)cntrs.port_xmit_data); + if (cntrs.port_rcv_data > 0xFFFFFFFFUL) + p->port_rcv_data = cpu_to_be32(0xFFFFFFFF); + else + p->port_rcv_data = cpu_to_be32((u32)cntrs.port_rcv_data); + if (cntrs.port_xmit_packets > 0xFFFFFFFFUL) + p->port_xmit_packets = cpu_to_be32(0xFFFFFFFF); + else + p->port_xmit_packets = + cpu_to_be32((u32)cntrs.port_xmit_packets); + if (cntrs.port_rcv_packets > 0xFFFFFFFFUL) + p->port_rcv_packets = cpu_to_be32(0xFFFFFFFF); + else + p->port_rcv_packets = + cpu_to_be32((u32) cntrs.port_rcv_packets); + + return reply((struct ib_smp *) pmp); +} + +static int recv_pma_get_portcounters_ext(struct ib_pma_mad *pmp, + struct ib_device *ibdev, u8 port) +{ + struct ib_pma_portcounters_ext *p = + (struct ib_pma_portcounters_ext *)pmp->data; + struct ipath_ibdev *dev = to_idev(ibdev); + u64 swords, rwords, spkts, rpkts, xwait; + u8 port_select = p->port_select; + + ipath_snapshot_counters(dev->dd, &swords, &rwords, &spkts, + &rpkts, &xwait); + + /* Adjust counters for any resets done. */ + swords -= dev->z_port_xmit_data; + rwords -= dev->z_port_rcv_data; + spkts -= dev->z_port_xmit_packets; + rpkts -= dev->z_port_rcv_packets; + + memset(pmp->data, 0, sizeof(pmp->data)); + + p->port_select = port_select; + if (pmp->mad_hdr.attr_mod != 0 || + (port_select != port && port_select != 0xFF)) + pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; + + p->port_xmit_data = cpu_to_be64(swords); + p->port_rcv_data = cpu_to_be64(rwords); + p->port_xmit_packets = cpu_to_be64(spkts); + p->port_rcv_packets = cpu_to_be64(rpkts); + p->port_unicast_xmit_packets = cpu_to_be64(dev->n_unicast_xmit); + p->port_unicast_rcv_packets = cpu_to_be64(dev->n_unicast_rcv); + p->port_multicast_xmit_packets = cpu_to_be64(dev->n_multicast_xmit); + p->port_multicast_rcv_packets = cpu_to_be64(dev->n_multicast_rcv); + + return reply((struct ib_smp *) pmp); +} + +static int recv_pma_set_portcounters(struct ib_pma_mad *pmp, + struct ib_device *ibdev, u8 port) +{ + struct ib_pma_portcounters *p = (struct ib_pma_portcounters *) + pmp->data; + struct ipath_ibdev *dev = to_idev(ibdev); + struct ipath_verbs_counters cntrs; + + /* + * Since the HW doesn't support clearing counters, we save the + * current count and subtract it from future responses. + */ + ipath_get_counters(dev->dd, &cntrs); + + if (p->counter_select & IB_PMA_SEL_SYMBOL_ERROR) + dev->z_symbol_error_counter = cntrs.symbol_error_counter; + + if (p->counter_select & IB_PMA_SEL_LINK_ERROR_RECOVERY) + dev->z_link_error_recovery_counter = + cntrs.link_error_recovery_counter; + + if (p->counter_select & IB_PMA_SEL_LINK_DOWNED) + dev->z_link_downed_counter = cntrs.link_downed_counter; + + if (p->counter_select & IB_PMA_SEL_PORT_RCV_ERRORS) + dev->z_port_rcv_errors = + cntrs.port_rcv_errors + dev->rcv_errors; + + if (p->counter_select & IB_PMA_SEL_PORT_RCV_REMPHYS_ERRORS) + dev->z_port_rcv_remphys_errors = + cntrs.port_rcv_remphys_errors; + + if (p->counter_select & IB_PMA_SEL_PORT_XMIT_DISCARDS) + dev->z_port_xmit_discards = cntrs.port_xmit_discards; + + if (p->counter_select & IB_PMA_SEL_LOCAL_LINK_INTEGRITY_ERRORS) + dev->z_local_link_integrity_errors = + cntrs.local_link_integrity_errors; + + if (p->counter_select & IB_PMA_SEL_EXCESSIVE_BUFFER_OVERRUNS) + dev->z_excessive_buffer_overrun_errors = + cntrs.excessive_buffer_overrun_errors; + + if (p->counter_select & IB_PMA_SEL_PORT_VL15_DROPPED) { + dev->n_vl15_dropped = 0; + dev->z_vl15_dropped = cntrs.vl15_dropped; + } + + if (p->counter_select & IB_PMA_SEL_PORT_XMIT_DATA) + dev->z_port_xmit_data = cntrs.port_xmit_data; + + if (p->counter_select & IB_PMA_SEL_PORT_RCV_DATA) + dev->z_port_rcv_data = cntrs.port_rcv_data; + + if (p->counter_select & IB_PMA_SEL_PORT_XMIT_PACKETS) + dev->z_port_xmit_packets = cntrs.port_xmit_packets; + + if (p->counter_select & IB_PMA_SEL_PORT_RCV_PACKETS) + dev->z_port_rcv_packets = cntrs.port_rcv_packets; + + return recv_pma_get_portcounters(pmp, ibdev, port); +} + +static int recv_pma_set_portcounters_ext(struct ib_pma_mad *pmp, + struct ib_device *ibdev, u8 port) +{ + struct ib_pma_portcounters *p = (struct ib_pma_portcounters *) + pmp->data; + struct ipath_ibdev *dev = to_idev(ibdev); + u64 swords, rwords, spkts, rpkts, xwait; + + ipath_snapshot_counters(dev->dd, &swords, &rwords, &spkts, + &rpkts, &xwait); + + if (p->counter_select & IB_PMA_SELX_PORT_XMIT_DATA) + dev->z_port_xmit_data = swords; + + if (p->counter_select & IB_PMA_SELX_PORT_RCV_DATA) + dev->z_port_rcv_data = rwords; + + if (p->counter_select & IB_PMA_SELX_PORT_XMIT_PACKETS) + dev->z_port_xmit_packets = spkts; + + if (p->counter_select & IB_PMA_SELX_PORT_RCV_PACKETS) + dev->z_port_rcv_packets = rpkts; + + if (p->counter_select & IB_PMA_SELX_PORT_UNI_XMIT_PACKETS) + dev->n_unicast_xmit = 0; + + if (p->counter_select & IB_PMA_SELX_PORT_UNI_RCV_PACKETS) + dev->n_unicast_rcv = 0; + + if (p->counter_select & IB_PMA_SELX_PORT_MULTI_XMIT_PACKETS) + dev->n_multicast_xmit = 0; + + if (p->counter_select & IB_PMA_SELX_PORT_MULTI_RCV_PACKETS) + dev->n_multicast_rcv = 0; + + return recv_pma_get_portcounters_ext(pmp, ibdev, port); +} + +static int process_subn(struct ib_device *ibdev, int mad_flags, + u8 port_num, struct ib_mad *in_mad, + struct ib_mad *out_mad) +{ + struct ib_smp *smp = (struct ib_smp *)out_mad; + struct ipath_ibdev *dev = to_idev(ibdev); + int ret; + + *out_mad = *in_mad; + if (smp->class_version != 1) { + smp->status |= IB_SMP_UNSUP_VERSION; + ret = reply(smp); + goto bail; + } + + /* Is the mkey in the process of expiring? */ + if (dev->mkey_lease_timeout && + time_after_eq(jiffies, dev->mkey_lease_timeout)) { + /* Clear timeout and mkey protection field. */ + dev->mkey_lease_timeout = 0; + dev->mkeyprot = 0; + } + + /* + * M_Key checking depends on + * Portinfo:M_Key_protect_bits + */ + if ((mad_flags & IB_MAD_IGNORE_MKEY) == 0 && dev->mkey != 0 && + dev->mkey != smp->mkey && + (smp->method == IB_MGMT_METHOD_SET || + (smp->method == IB_MGMT_METHOD_GET && + dev->mkeyprot >= 2))) { + if (dev->mkey_violations != 0xFFFF) + ++dev->mkey_violations; + if (dev->mkey_lease_timeout || + dev->mkey_lease_period == 0) { + ret = IB_MAD_RESULT_SUCCESS | + IB_MAD_RESULT_CONSUMED; + goto bail; + } + dev->mkey_lease_timeout = jiffies + + dev->mkey_lease_period * HZ; + /* Future: Generate a trap notice. */ + ret = IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED; + goto bail; + } else if (dev->mkey_lease_timeout) + dev->mkey_lease_timeout = 0; + + switch (smp->method) { + case IB_MGMT_METHOD_GET: + switch (smp->attr_id) { + case IB_SMP_ATTR_NODE_DESC: + ret = recv_subn_get_nodedescription(smp, ibdev); + goto bail; + case IB_SMP_ATTR_NODE_INFO: + ret = recv_subn_get_nodeinfo(smp, ibdev, port_num); + goto bail; + case IB_SMP_ATTR_GUID_INFO: + ret = recv_subn_get_guidinfo(smp, ibdev); + goto bail; + case IB_SMP_ATTR_PORT_INFO: + ret = recv_subn_get_portinfo(smp, ibdev, port_num); + goto bail; + case IB_SMP_ATTR_PKEY_TABLE: + ret = recv_subn_get_pkeytable(smp, ibdev); + goto bail; + case IB_SMP_ATTR_SM_INFO: + if (dev->port_cap_flags & IB_PORT_SM_DISABLED) { + ret = IB_MAD_RESULT_SUCCESS | + IB_MAD_RESULT_CONSUMED; + goto bail; + } + if (dev->port_cap_flags & IB_PORT_SM) { + ret = IB_MAD_RESULT_SUCCESS; + goto bail; + } + /* FALLTHROUGH */ + default: + smp->status |= IB_SMP_UNSUP_METH_ATTR; + ret = reply(smp); + goto bail; + } + + case IB_MGMT_METHOD_SET: + switch (smp->attr_id) { + case IB_SMP_ATTR_GUID_INFO: + ret = recv_subn_set_guidinfo(smp, ibdev); + goto bail; + case IB_SMP_ATTR_PORT_INFO: + ret = recv_subn_set_portinfo(smp, ibdev, port_num); + goto bail; + case IB_SMP_ATTR_PKEY_TABLE: + ret = recv_subn_set_pkeytable(smp, ibdev, port_num); + goto bail; + case IB_SMP_ATTR_SM_INFO: + if (dev->port_cap_flags & IB_PORT_SM_DISABLED) { + ret = IB_MAD_RESULT_SUCCESS | + IB_MAD_RESULT_CONSUMED; + goto bail; + } + if (dev->port_cap_flags & IB_PORT_SM) { + ret = IB_MAD_RESULT_SUCCESS; + goto bail; + } + /* FALLTHROUGH */ + default: + smp->status |= IB_SMP_UNSUP_METH_ATTR; + ret = reply(smp); + goto bail; + } + + case IB_MGMT_METHOD_TRAP: + case IB_MGMT_METHOD_REPORT: + case IB_MGMT_METHOD_REPORT_RESP: + case IB_MGMT_METHOD_TRAP_REPRESS: + case IB_MGMT_METHOD_GET_RESP: + /* + * The ib_mad module will call us to process responses + * before checking for other consumers. + * Just tell the caller to process it normally. + */ + ret = IB_MAD_RESULT_SUCCESS; + goto bail; + default: + smp->status |= IB_SMP_UNSUP_METHOD; + ret = reply(smp); + } + +bail: + return ret; +} + +static int process_perf(struct ib_device *ibdev, u8 port_num, + struct ib_mad *in_mad, + struct ib_mad *out_mad) +{ + struct ib_pma_mad *pmp = (struct ib_pma_mad *)out_mad; + int ret; + + *out_mad = *in_mad; + if (pmp->mad_hdr.class_version != 1) { + pmp->mad_hdr.status |= IB_SMP_UNSUP_VERSION; + ret = reply((struct ib_smp *) pmp); + goto bail; + } + + switch (pmp->mad_hdr.method) { + case IB_MGMT_METHOD_GET: + switch (pmp->mad_hdr.attr_id) { + case IB_PMA_CLASS_PORT_INFO: + ret = recv_pma_get_classportinfo(pmp); + goto bail; + case IB_PMA_PORT_SAMPLES_CONTROL: + ret = recv_pma_get_portsamplescontrol(pmp, ibdev, + port_num); + goto bail; + case IB_PMA_PORT_SAMPLES_RESULT: + ret = recv_pma_get_portsamplesresult(pmp, ibdev); + goto bail; + case IB_PMA_PORT_SAMPLES_RESULT_EXT: + ret = recv_pma_get_portsamplesresult_ext(pmp, + ibdev); + goto bail; + case IB_PMA_PORT_COUNTERS: + ret = recv_pma_get_portcounters(pmp, ibdev, + port_num); + goto bail; + case IB_PMA_PORT_COUNTERS_EXT: + ret = recv_pma_get_portcounters_ext(pmp, ibdev, + port_num); + goto bail; + default: + pmp->mad_hdr.status |= IB_SMP_UNSUP_METH_ATTR; + ret = reply((struct ib_smp *) pmp); + goto bail; + } + + case IB_MGMT_METHOD_SET: + switch (pmp->mad_hdr.attr_id) { + case IB_PMA_PORT_SAMPLES_CONTROL: + ret = recv_pma_set_portsamplescontrol(pmp, ibdev, + port_num); + goto bail; + case IB_PMA_PORT_COUNTERS: + ret = recv_pma_set_portcounters(pmp, ibdev, + port_num); + goto bail; + case IB_PMA_PORT_COUNTERS_EXT: + ret = recv_pma_set_portcounters_ext(pmp, ibdev, + port_num); + goto bail; + default: + pmp->mad_hdr.status |= IB_SMP_UNSUP_METH_ATTR; + ret = reply((struct ib_smp *) pmp); + goto bail; + } + + case IB_MGMT_METHOD_GET_RESP: + /* + * The ib_mad module will call us to process responses + * before checking for other consumers. + * Just tell the caller to process it normally. + */ + ret = IB_MAD_RESULT_SUCCESS; + goto bail; + default: + pmp->mad_hdr.status |= IB_SMP_UNSUP_METHOD; + ret = reply((struct ib_smp *) pmp); + } + +bail: + return ret; +} + +/** + * ipath_process_mad - process an incoming MAD packet + * @ibdev: the infiniband device this packet came in on + * @mad_flags: MAD flags + * @port_num: the port number this packet came in on + * @in_wc: the work completion entry for this packet + * @in_grh: the global route header for this packet + * @in_mad: the incoming MAD + * @out_mad: any outgoing MAD reply + * + * Returns IB_MAD_RESULT_SUCCESS if this is a MAD that we are not + * interested in processing. + * + * Note that the verbs framework has already done the MAD sanity checks, + * and hop count/pointer updating for IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE + * MADs. + * + * This is called by the ib_mad module. + */ +int ipath_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, + struct ib_wc *in_wc, struct ib_grh *in_grh, + struct ib_mad *in_mad, struct ib_mad *out_mad) +{ + int ret; + + switch (in_mad->mad_hdr.mgmt_class) { + case IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE: + case IB_MGMT_CLASS_SUBN_LID_ROUTED: + ret = process_subn(ibdev, mad_flags, port_num, + in_mad, out_mad); + goto bail; + case IB_MGMT_CLASS_PERF_MGMT: + ret = process_perf(ibdev, port_num, in_mad, out_mad); + goto bail; + default: + ret = IB_MAD_RESULT_SUCCESS; + } + +bail: + return ret; +} diff --git a/drivers/infiniband/hw/ipath/ipath_mmap.c b/drivers/infiniband/hw/ipath/ipath_mmap.c new file mode 100644 index 0000000..e732742 --- /dev/null +++ b/drivers/infiniband/hw/ipath/ipath_mmap.c @@ -0,0 +1,174 @@ +/* + * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include + +#include "ipath_verbs.h" + +/** + * ipath_release_mmap_info - free mmap info structure + * @ref: a pointer to the kref within struct ipath_mmap_info + */ +void ipath_release_mmap_info(struct kref *ref) +{ + struct ipath_mmap_info *ip = + container_of(ref, struct ipath_mmap_info, ref); + struct ipath_ibdev *dev = to_idev(ip->context->device); + + spin_lock_irq(&dev->pending_lock); + list_del(&ip->pending_mmaps); + spin_unlock_irq(&dev->pending_lock); + + vfree(ip->obj); + kfree(ip); +} + +/* + * open and close keep track of how many times the CQ is mapped, + * to avoid releasing it. + */ +static void ipath_vma_open(struct vm_area_struct *vma) +{ + struct ipath_mmap_info *ip = vma->vm_private_data; + + kref_get(&ip->ref); +} + +static void ipath_vma_close(struct vm_area_struct *vma) +{ + struct ipath_mmap_info *ip = vma->vm_private_data; + + kref_put(&ip->ref, ipath_release_mmap_info); +} + +static const struct vm_operations_struct ipath_vm_ops = { + .open = ipath_vma_open, + .close = ipath_vma_close, +}; + +/** + * ipath_mmap - create a new mmap region + * @context: the IB user context of the process making the mmap() call + * @vma: the VMA to be initialized + * Return zero if the mmap is OK. Otherwise, return an errno. + */ +int ipath_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) +{ + struct ipath_ibdev *dev = to_idev(context->device); + unsigned long offset = vma->vm_pgoff << PAGE_SHIFT; + unsigned long size = vma->vm_end - vma->vm_start; + struct ipath_mmap_info *ip, *pp; + int ret = -EINVAL; + + /* + * Search the device's list of objects waiting for a mmap call. + * Normally, this list is very short since a call to create a + * CQ, QP, or SRQ is soon followed by a call to mmap(). + */ + spin_lock_irq(&dev->pending_lock); + list_for_each_entry_safe(ip, pp, &dev->pending_mmaps, + pending_mmaps) { + /* Only the creator is allowed to mmap the object */ + if (context != ip->context || (__u64) offset != ip->offset) + continue; + /* Don't allow a mmap larger than the object. */ + if (size > ip->size) + break; + + list_del_init(&ip->pending_mmaps); + spin_unlock_irq(&dev->pending_lock); + + ret = remap_vmalloc_range(vma, ip->obj, 0); + if (ret) + goto done; + vma->vm_ops = &ipath_vm_ops; + vma->vm_private_data = ip; + ipath_vma_open(vma); + goto done; + } + spin_unlock_irq(&dev->pending_lock); +done: + return ret; +} + +/* + * Allocate information for ipath_mmap + */ +struct ipath_mmap_info *ipath_create_mmap_info(struct ipath_ibdev *dev, + u32 size, + struct ib_ucontext *context, + void *obj) { + struct ipath_mmap_info *ip; + + ip = kmalloc(sizeof *ip, GFP_KERNEL); + if (!ip) + goto bail; + + size = PAGE_ALIGN(size); + + spin_lock_irq(&dev->mmap_offset_lock); + if (dev->mmap_offset == 0) + dev->mmap_offset = PAGE_SIZE; + ip->offset = dev->mmap_offset; + dev->mmap_offset += size; + spin_unlock_irq(&dev->mmap_offset_lock); + + INIT_LIST_HEAD(&ip->pending_mmaps); + ip->size = size; + ip->context = context; + ip->obj = obj; + kref_init(&ip->ref); + +bail: + return ip; +} + +void ipath_update_mmap_info(struct ipath_ibdev *dev, + struct ipath_mmap_info *ip, + u32 size, void *obj) { + size = PAGE_ALIGN(size); + + spin_lock_irq(&dev->mmap_offset_lock); + if (dev->mmap_offset == 0) + dev->mmap_offset = PAGE_SIZE; + ip->offset = dev->mmap_offset; + dev->mmap_offset += size; + spin_unlock_irq(&dev->mmap_offset_lock); + + ip->size = size; + ip->obj = obj; +} diff --git a/drivers/infiniband/hw/ipath/ipath_mr.c b/drivers/infiniband/hw/ipath/ipath_mr.c new file mode 100644 index 0000000..5e61e9b --- /dev/null +++ b/drivers/infiniband/hw/ipath/ipath_mr.c @@ -0,0 +1,425 @@ +/* + * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved. + * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include + +#include +#include +#include + +#include "ipath_verbs.h" + +/* Fast memory region */ +struct ipath_fmr { + struct ib_fmr ibfmr; + u8 page_shift; + struct ipath_mregion mr; /* must be last */ +}; + +static inline struct ipath_fmr *to_ifmr(struct ib_fmr *ibfmr) +{ + return container_of(ibfmr, struct ipath_fmr, ibfmr); +} + +/** + * ipath_get_dma_mr - get a DMA memory region + * @pd: protection domain for this memory region + * @acc: access flags + * + * Returns the memory region on success, otherwise returns an errno. + * Note that all DMA addresses should be created via the + * struct ib_dma_mapping_ops functions (see ipath_dma.c). + */ +struct ib_mr *ipath_get_dma_mr(struct ib_pd *pd, int acc) +{ + struct ipath_mr *mr; + struct ib_mr *ret; + + mr = kzalloc(sizeof *mr, GFP_KERNEL); + if (!mr) { + ret = ERR_PTR(-ENOMEM); + goto bail; + } + + mr->mr.access_flags = acc; + ret = &mr->ibmr; + +bail: + return ret; +} + +static struct ipath_mr *alloc_mr(int count, + struct ipath_lkey_table *lk_table) +{ + struct ipath_mr *mr; + int m, i = 0; + + /* Allocate struct plus pointers to first level page tables. */ + m = (count + IPATH_SEGSZ - 1) / IPATH_SEGSZ; + mr = kmalloc(sizeof *mr + m * sizeof mr->mr.map[0], GFP_KERNEL); + if (!mr) + goto done; + + /* Allocate first level page tables. */ + for (; i < m; i++) { + mr->mr.map[i] = kmalloc(sizeof *mr->mr.map[0], GFP_KERNEL); + if (!mr->mr.map[i]) + goto bail; + } + mr->mr.mapsz = m; + + /* + * ib_reg_phys_mr() will initialize mr->ibmr except for + * lkey and rkey. + */ + if (!ipath_alloc_lkey(lk_table, &mr->mr)) + goto bail; + mr->ibmr.rkey = mr->ibmr.lkey = mr->mr.lkey; + + goto done; + +bail: + while (i) { + i--; + kfree(mr->mr.map[i]); + } + kfree(mr); + mr = NULL; + +done: + return mr; +} + +/** + * ipath_reg_phys_mr - register a physical memory region + * @pd: protection domain for this memory region + * @buffer_list: pointer to the list of physical buffers to register + * @num_phys_buf: the number of physical buffers to register + * @iova_start: the starting address passed over IB which maps to this MR + * + * Returns the memory region on success, otherwise returns an errno. + */ +struct ib_mr *ipath_reg_phys_mr(struct ib_pd *pd, + struct ib_phys_buf *buffer_list, + int num_phys_buf, int acc, u64 *iova_start) +{ + struct ipath_mr *mr; + int n, m, i; + struct ib_mr *ret; + + mr = alloc_mr(num_phys_buf, &to_idev(pd->device)->lk_table); + if (mr == NULL) { + ret = ERR_PTR(-ENOMEM); + goto bail; + } + + mr->mr.pd = pd; + mr->mr.user_base = *iova_start; + mr->mr.iova = *iova_start; + mr->mr.length = 0; + mr->mr.offset = 0; + mr->mr.access_flags = acc; + mr->mr.max_segs = num_phys_buf; + mr->umem = NULL; + + m = 0; + n = 0; + for (i = 0; i < num_phys_buf; i++) { + mr->mr.map[m]->segs[n].vaddr = (void *) buffer_list[i].addr; + mr->mr.map[m]->segs[n].length = buffer_list[i].size; + mr->mr.length += buffer_list[i].size; + n++; + if (n == IPATH_SEGSZ) { + m++; + n = 0; + } + } + + ret = &mr->ibmr; + +bail: + return ret; +} + +/** + * ipath_reg_user_mr - register a userspace memory region + * @pd: protection domain for this memory region + * @start: starting userspace address + * @length: length of region to register + * @virt_addr: virtual address to use (from HCA's point of view) + * @mr_access_flags: access flags for this memory region + * @udata: unused by the InfiniPath driver + * + * Returns the memory region on success, otherwise returns an errno. + */ +struct ib_mr *ipath_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, + u64 virt_addr, int mr_access_flags, + struct ib_udata *udata) +{ + struct ipath_mr *mr; + struct ib_umem *umem; + int n, m, entry; + struct scatterlist *sg; + struct ib_mr *ret; + + if (length == 0) { + ret = ERR_PTR(-EINVAL); + goto bail; + } + + umem = ib_umem_get(pd->uobject->context, start, length, + mr_access_flags, 0); + if (IS_ERR(umem)) + return (void *) umem; + + n = umem->nmap; + mr = alloc_mr(n, &to_idev(pd->device)->lk_table); + if (!mr) { + ret = ERR_PTR(-ENOMEM); + ib_umem_release(umem); + goto bail; + } + + mr->mr.pd = pd; + mr->mr.user_base = start; + mr->mr.iova = virt_addr; + mr->mr.length = length; + mr->mr.offset = umem->offset; + mr->mr.access_flags = mr_access_flags; + mr->mr.max_segs = n; + mr->umem = umem; + + m = 0; + n = 0; + for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) { + void *vaddr; + + vaddr = page_address(sg_page(sg)); + if (!vaddr) { + ret = ERR_PTR(-EINVAL); + goto bail; + } + mr->mr.map[m]->segs[n].vaddr = vaddr; + mr->mr.map[m]->segs[n].length = umem->page_size; + n++; + if (n == IPATH_SEGSZ) { + m++; + n = 0; + } + } + ret = &mr->ibmr; + +bail: + return ret; +} + +/** + * ipath_dereg_mr - unregister and free a memory region + * @ibmr: the memory region to free + * + * Returns 0 on success. + * + * Note that this is called to free MRs created by ipath_get_dma_mr() + * or ipath_reg_user_mr(). + */ +int ipath_dereg_mr(struct ib_mr *ibmr) +{ + struct ipath_mr *mr = to_imr(ibmr); + int i; + + ipath_free_lkey(&to_idev(ibmr->device)->lk_table, ibmr->lkey); + i = mr->mr.mapsz; + while (i) { + i--; + kfree(mr->mr.map[i]); + } + + if (mr->umem) + ib_umem_release(mr->umem); + + kfree(mr); + return 0; +} + +/** + * ipath_alloc_fmr - allocate a fast memory region + * @pd: the protection domain for this memory region + * @mr_access_flags: access flags for this memory region + * @fmr_attr: fast memory region attributes + * + * Returns the memory region on success, otherwise returns an errno. + */ +struct ib_fmr *ipath_alloc_fmr(struct ib_pd *pd, int mr_access_flags, + struct ib_fmr_attr *fmr_attr) +{ + struct ipath_fmr *fmr; + int m, i = 0; + struct ib_fmr *ret; + + /* Allocate struct plus pointers to first level page tables. */ + m = (fmr_attr->max_pages + IPATH_SEGSZ - 1) / IPATH_SEGSZ; + fmr = kmalloc(sizeof *fmr + m * sizeof fmr->mr.map[0], GFP_KERNEL); + if (!fmr) + goto bail; + + /* Allocate first level page tables. */ + for (; i < m; i++) { + fmr->mr.map[i] = kmalloc(sizeof *fmr->mr.map[0], + GFP_KERNEL); + if (!fmr->mr.map[i]) + goto bail; + } + fmr->mr.mapsz = m; + + /* + * ib_alloc_fmr() will initialize fmr->ibfmr except for lkey & + * rkey. + */ + if (!ipath_alloc_lkey(&to_idev(pd->device)->lk_table, &fmr->mr)) + goto bail; + fmr->ibfmr.rkey = fmr->ibfmr.lkey = fmr->mr.lkey; + /* + * Resources are allocated but no valid mapping (RKEY can't be + * used). + */ + fmr->mr.pd = pd; + fmr->mr.user_base = 0; + fmr->mr.iova = 0; + fmr->mr.length = 0; + fmr->mr.offset = 0; + fmr->mr.access_flags = mr_access_flags; + fmr->mr.max_segs = fmr_attr->max_pages; + fmr->page_shift = fmr_attr->page_shift; + + ret = &fmr->ibfmr; + goto done; + +bail: + while (i) + kfree(fmr->mr.map[--i]); + kfree(fmr); + ret = ERR_PTR(-ENOMEM); + +done: + return ret; +} + +/** + * ipath_map_phys_fmr - set up a fast memory region + * @ibmfr: the fast memory region to set up + * @page_list: the list of pages to associate with the fast memory region + * @list_len: the number of pages to associate with the fast memory region + * @iova: the virtual address of the start of the fast memory region + * + * This may be called from interrupt context. + */ + +int ipath_map_phys_fmr(struct ib_fmr *ibfmr, u64 * page_list, + int list_len, u64 iova) +{ + struct ipath_fmr *fmr = to_ifmr(ibfmr); + struct ipath_lkey_table *rkt; + unsigned long flags; + int m, n, i; + u32 ps; + int ret; + + if (list_len > fmr->mr.max_segs) { + ret = -EINVAL; + goto bail; + } + rkt = &to_idev(ibfmr->device)->lk_table; + spin_lock_irqsave(&rkt->lock, flags); + fmr->mr.user_base = iova; + fmr->mr.iova = iova; + ps = 1 << fmr->page_shift; + fmr->mr.length = list_len * ps; + m = 0; + n = 0; + ps = 1 << fmr->page_shift; + for (i = 0; i < list_len; i++) { + fmr->mr.map[m]->segs[n].vaddr = (void *) page_list[i]; + fmr->mr.map[m]->segs[n].length = ps; + if (++n == IPATH_SEGSZ) { + m++; + n = 0; + } + } + spin_unlock_irqrestore(&rkt->lock, flags); + ret = 0; + +bail: + return ret; +} + +/** + * ipath_unmap_fmr - unmap fast memory regions + * @fmr_list: the list of fast memory regions to unmap + * + * Returns 0 on success. + */ +int ipath_unmap_fmr(struct list_head *fmr_list) +{ + struct ipath_fmr *fmr; + struct ipath_lkey_table *rkt; + unsigned long flags; + + list_for_each_entry(fmr, fmr_list, ibfmr.list) { + rkt = &to_idev(fmr->ibfmr.device)->lk_table; + spin_lock_irqsave(&rkt->lock, flags); + fmr->mr.user_base = 0; + fmr->mr.iova = 0; + fmr->mr.length = 0; + spin_unlock_irqrestore(&rkt->lock, flags); + } + return 0; +} + +/** + * ipath_dealloc_fmr - deallocate a fast memory region + * @ibfmr: the fast memory region to deallocate + * + * Returns 0 on success. + */ +int ipath_dealloc_fmr(struct ib_fmr *ibfmr) +{ + struct ipath_fmr *fmr = to_ifmr(ibfmr); + int i; + + ipath_free_lkey(&to_idev(ibfmr->device)->lk_table, ibfmr->lkey); + i = fmr->mr.mapsz; + while (i) + kfree(fmr->mr.map[--i]); + kfree(fmr); + return 0; +} diff --git a/drivers/infiniband/hw/ipath/ipath_qp.c b/drivers/infiniband/hw/ipath/ipath_qp.c new file mode 100644 index 0000000..face876 --- /dev/null +++ b/drivers/infiniband/hw/ipath/ipath_qp.c @@ -0,0 +1,1080 @@ +/* + * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved. + * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include + +#include "ipath_verbs.h" +#include "ipath_kernel.h" + +#define BITS_PER_PAGE (PAGE_SIZE*BITS_PER_BYTE) +#define BITS_PER_PAGE_MASK (BITS_PER_PAGE-1) +#define mk_qpn(qpt, map, off) (((map) - (qpt)->map) * BITS_PER_PAGE + \ + (off)) +#define find_next_offset(map, off) find_next_zero_bit((map)->page, \ + BITS_PER_PAGE, off) + +/* + * Convert the AETH credit code into the number of credits. + */ +static u32 credit_table[31] = { + 0, /* 0 */ + 1, /* 1 */ + 2, /* 2 */ + 3, /* 3 */ + 4, /* 4 */ + 6, /* 5 */ + 8, /* 6 */ + 12, /* 7 */ + 16, /* 8 */ + 24, /* 9 */ + 32, /* A */ + 48, /* B */ + 64, /* C */ + 96, /* D */ + 128, /* E */ + 192, /* F */ + 256, /* 10 */ + 384, /* 11 */ + 512, /* 12 */ + 768, /* 13 */ + 1024, /* 14 */ + 1536, /* 15 */ + 2048, /* 16 */ + 3072, /* 17 */ + 4096, /* 18 */ + 6144, /* 19 */ + 8192, /* 1A */ + 12288, /* 1B */ + 16384, /* 1C */ + 24576, /* 1D */ + 32768 /* 1E */ +}; + + +static void get_map_page(struct ipath_qp_table *qpt, struct qpn_map *map) +{ + unsigned long page = get_zeroed_page(GFP_KERNEL); + unsigned long flags; + + /* + * Free the page if someone raced with us installing it. + */ + + spin_lock_irqsave(&qpt->lock, flags); + if (map->page) + free_page(page); + else + map->page = (void *)page; + spin_unlock_irqrestore(&qpt->lock, flags); +} + + +static int alloc_qpn(struct ipath_qp_table *qpt, enum ib_qp_type type) +{ + u32 i, offset, max_scan, qpn; + struct qpn_map *map; + u32 ret = -1; + + if (type == IB_QPT_SMI) + ret = 0; + else if (type == IB_QPT_GSI) + ret = 1; + + if (ret != -1) { + map = &qpt->map[0]; + if (unlikely(!map->page)) { + get_map_page(qpt, map); + if (unlikely(!map->page)) { + ret = -ENOMEM; + goto bail; + } + } + if (!test_and_set_bit(ret, map->page)) + atomic_dec(&map->n_free); + else + ret = -EBUSY; + goto bail; + } + + qpn = qpt->last + 1; + if (qpn >= QPN_MAX) + qpn = 2; + offset = qpn & BITS_PER_PAGE_MASK; + map = &qpt->map[qpn / BITS_PER_PAGE]; + max_scan = qpt->nmaps - !offset; + for (i = 0;;) { + if (unlikely(!map->page)) { + get_map_page(qpt, map); + if (unlikely(!map->page)) + break; + } + if (likely(atomic_read(&map->n_free))) { + do { + if (!test_and_set_bit(offset, map->page)) { + atomic_dec(&map->n_free); + qpt->last = qpn; + ret = qpn; + goto bail; + } + offset = find_next_offset(map, offset); + qpn = mk_qpn(qpt, map, offset); + /* + * This test differs from alloc_pidmap(). + * If find_next_offset() does find a zero + * bit, we don't need to check for QPN + * wrapping around past our starting QPN. + * We just need to be sure we don't loop + * forever. + */ + } while (offset < BITS_PER_PAGE && qpn < QPN_MAX); + } + /* + * In order to keep the number of pages allocated to a + * minimum, we scan the all existing pages before increasing + * the size of the bitmap table. + */ + if (++i > max_scan) { + if (qpt->nmaps == QPNMAP_ENTRIES) + break; + map = &qpt->map[qpt->nmaps++]; + offset = 0; + } else if (map < &qpt->map[qpt->nmaps]) { + ++map; + offset = 0; + } else { + map = &qpt->map[0]; + offset = 2; + } + qpn = mk_qpn(qpt, map, offset); + } + + ret = -ENOMEM; + +bail: + return ret; +} + +static void free_qpn(struct ipath_qp_table *qpt, u32 qpn) +{ + struct qpn_map *map; + + map = qpt->map + qpn / BITS_PER_PAGE; + if (map->page) + clear_bit(qpn & BITS_PER_PAGE_MASK, map->page); + atomic_inc(&map->n_free); +} + +/** + * ipath_alloc_qpn - allocate a QP number + * @qpt: the QP table + * @qp: the QP + * @type: the QP type (IB_QPT_SMI and IB_QPT_GSI are special) + * + * Allocate the next available QPN and put the QP into the hash table. + * The hash table holds a reference to the QP. + */ +static int ipath_alloc_qpn(struct ipath_qp_table *qpt, struct ipath_qp *qp, + enum ib_qp_type type) +{ + unsigned long flags; + int ret; + + ret = alloc_qpn(qpt, type); + if (ret < 0) + goto bail; + qp->ibqp.qp_num = ret; + + /* Add the QP to the hash table. */ + spin_lock_irqsave(&qpt->lock, flags); + + ret %= qpt->max; + qp->next = qpt->table[ret]; + qpt->table[ret] = qp; + atomic_inc(&qp->refcount); + + spin_unlock_irqrestore(&qpt->lock, flags); + ret = 0; + +bail: + return ret; +} + +/** + * ipath_free_qp - remove a QP from the QP table + * @qpt: the QP table + * @qp: the QP to remove + * + * Remove the QP from the table so it can't be found asynchronously by + * the receive interrupt routine. + */ +static void ipath_free_qp(struct ipath_qp_table *qpt, struct ipath_qp *qp) +{ + struct ipath_qp *q, **qpp; + unsigned long flags; + + spin_lock_irqsave(&qpt->lock, flags); + + /* Remove QP from the hash table. */ + qpp = &qpt->table[qp->ibqp.qp_num % qpt->max]; + for (; (q = *qpp) != NULL; qpp = &q->next) { + if (q == qp) { + *qpp = qp->next; + qp->next = NULL; + atomic_dec(&qp->refcount); + break; + } + } + + spin_unlock_irqrestore(&qpt->lock, flags); +} + +/** + * ipath_free_all_qps - check for QPs still in use + * @qpt: the QP table to empty + * + * There should not be any QPs still in use. + * Free memory for table. + */ +unsigned ipath_free_all_qps(struct ipath_qp_table *qpt) +{ + unsigned long flags; + struct ipath_qp *qp; + u32 n, qp_inuse = 0; + + spin_lock_irqsave(&qpt->lock, flags); + for (n = 0; n < qpt->max; n++) { + qp = qpt->table[n]; + qpt->table[n] = NULL; + + for (; qp; qp = qp->next) + qp_inuse++; + } + spin_unlock_irqrestore(&qpt->lock, flags); + + for (n = 0; n < ARRAY_SIZE(qpt->map); n++) + if (qpt->map[n].page) + free_page((unsigned long) qpt->map[n].page); + return qp_inuse; +} + +/** + * ipath_lookup_qpn - return the QP with the given QPN + * @qpt: the QP table + * @qpn: the QP number to look up + * + * The caller is responsible for decrementing the QP reference count + * when done. + */ +struct ipath_qp *ipath_lookup_qpn(struct ipath_qp_table *qpt, u32 qpn) +{ + unsigned long flags; + struct ipath_qp *qp; + + spin_lock_irqsave(&qpt->lock, flags); + + for (qp = qpt->table[qpn % qpt->max]; qp; qp = qp->next) { + if (qp->ibqp.qp_num == qpn) { + atomic_inc(&qp->refcount); + break; + } + } + + spin_unlock_irqrestore(&qpt->lock, flags); + return qp; +} + +/** + * ipath_reset_qp - initialize the QP state to the reset state + * @qp: the QP to reset + * @type: the QP type + */ +static void ipath_reset_qp(struct ipath_qp *qp, enum ib_qp_type type) +{ + qp->remote_qpn = 0; + qp->qkey = 0; + qp->qp_access_flags = 0; + atomic_set(&qp->s_dma_busy, 0); + qp->s_flags &= IPATH_S_SIGNAL_REQ_WR; + qp->s_hdrwords = 0; + qp->s_wqe = NULL; + qp->s_pkt_delay = 0; + qp->s_draining = 0; + qp->s_psn = 0; + qp->r_psn = 0; + qp->r_msn = 0; + if (type == IB_QPT_RC) { + qp->s_state = IB_OPCODE_RC_SEND_LAST; + qp->r_state = IB_OPCODE_RC_SEND_LAST; + } else { + qp->s_state = IB_OPCODE_UC_SEND_LAST; + qp->r_state = IB_OPCODE_UC_SEND_LAST; + } + qp->s_ack_state = IB_OPCODE_RC_ACKNOWLEDGE; + qp->r_nak_state = 0; + qp->r_aflags = 0; + qp->r_flags = 0; + qp->s_rnr_timeout = 0; + qp->s_head = 0; + qp->s_tail = 0; + qp->s_cur = 0; + qp->s_last = 0; + qp->s_ssn = 1; + qp->s_lsn = 0; + memset(qp->s_ack_queue, 0, sizeof(qp->s_ack_queue)); + qp->r_head_ack_queue = 0; + qp->s_tail_ack_queue = 0; + qp->s_num_rd_atomic = 0; + if (qp->r_rq.wq) { + qp->r_rq.wq->head = 0; + qp->r_rq.wq->tail = 0; + } +} + +/** + * ipath_error_qp - put a QP into the error state + * @qp: the QP to put into the error state + * @err: the receive completion error to signal if a RWQE is active + * + * Flushes both send and receive work queues. + * Returns true if last WQE event should be generated. + * The QP s_lock should be held and interrupts disabled. + * If we are already in error state, just return. + */ + +int ipath_error_qp(struct ipath_qp *qp, enum ib_wc_status err) +{ + struct ipath_ibdev *dev = to_idev(qp->ibqp.device); + struct ib_wc wc; + int ret = 0; + + if (qp->state == IB_QPS_ERR) + goto bail; + + qp->state = IB_QPS_ERR; + + spin_lock(&dev->pending_lock); + if (!list_empty(&qp->timerwait)) + list_del_init(&qp->timerwait); + if (!list_empty(&qp->piowait)) + list_del_init(&qp->piowait); + spin_unlock(&dev->pending_lock); + + /* Schedule the sending tasklet to drain the send work queue. */ + if (qp->s_last != qp->s_head) + ipath_schedule_send(qp); + + memset(&wc, 0, sizeof(wc)); + wc.qp = &qp->ibqp; + wc.opcode = IB_WC_RECV; + + if (test_and_clear_bit(IPATH_R_WRID_VALID, &qp->r_aflags)) { + wc.wr_id = qp->r_wr_id; + wc.status = err; + ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, 1); + } + wc.status = IB_WC_WR_FLUSH_ERR; + + if (qp->r_rq.wq) { + struct ipath_rwq *wq; + u32 head; + u32 tail; + + spin_lock(&qp->r_rq.lock); + + /* sanity check pointers before trusting them */ + wq = qp->r_rq.wq; + head = wq->head; + if (head >= qp->r_rq.size) + head = 0; + tail = wq->tail; + if (tail >= qp->r_rq.size) + tail = 0; + while (tail != head) { + wc.wr_id = get_rwqe_ptr(&qp->r_rq, tail)->wr_id; + if (++tail >= qp->r_rq.size) + tail = 0; + ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, 1); + } + wq->tail = tail; + + spin_unlock(&qp->r_rq.lock); + } else if (qp->ibqp.event_handler) + ret = 1; + +bail: + return ret; +} + +/** + * ipath_modify_qp - modify the attributes of a queue pair + * @ibqp: the queue pair who's attributes we're modifying + * @attr: the new attributes + * @attr_mask: the mask of attributes to modify + * @udata: user data for ipathverbs.so + * + * Returns 0 on success, otherwise returns an errno. + */ +int ipath_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata) +{ + struct ipath_ibdev *dev = to_idev(ibqp->device); + struct ipath_qp *qp = to_iqp(ibqp); + enum ib_qp_state cur_state, new_state; + int lastwqe = 0; + int ret; + + spin_lock_irq(&qp->s_lock); + + cur_state = attr_mask & IB_QP_CUR_STATE ? + attr->cur_qp_state : qp->state; + new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state; + + if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, + attr_mask, IB_LINK_LAYER_UNSPECIFIED)) + goto inval; + + if (attr_mask & IB_QP_AV) { + if (attr->ah_attr.dlid == 0 || + attr->ah_attr.dlid >= IPATH_MULTICAST_LID_BASE) + goto inval; + + if ((attr->ah_attr.ah_flags & IB_AH_GRH) && + (attr->ah_attr.grh.sgid_index > 1)) + goto inval; + } + + if (attr_mask & IB_QP_PKEY_INDEX) + if (attr->pkey_index >= ipath_get_npkeys(dev->dd)) + goto inval; + + if (attr_mask & IB_QP_MIN_RNR_TIMER) + if (attr->min_rnr_timer > 31) + goto inval; + + if (attr_mask & IB_QP_PORT) + if (attr->port_num == 0 || + attr->port_num > ibqp->device->phys_port_cnt) + goto inval; + + /* + * don't allow invalid Path MTU values or greater than 2048 + * unless we are configured for a 4KB MTU + */ + if ((attr_mask & IB_QP_PATH_MTU) && + (ib_mtu_enum_to_int(attr->path_mtu) == -1 || + (attr->path_mtu > IB_MTU_2048 && !ipath_mtu4096))) + goto inval; + + if (attr_mask & IB_QP_PATH_MIG_STATE) + if (attr->path_mig_state != IB_MIG_MIGRATED && + attr->path_mig_state != IB_MIG_REARM) + goto inval; + + if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) + if (attr->max_dest_rd_atomic > IPATH_MAX_RDMA_ATOMIC) + goto inval; + + switch (new_state) { + case IB_QPS_RESET: + if (qp->state != IB_QPS_RESET) { + qp->state = IB_QPS_RESET; + spin_lock(&dev->pending_lock); + if (!list_empty(&qp->timerwait)) + list_del_init(&qp->timerwait); + if (!list_empty(&qp->piowait)) + list_del_init(&qp->piowait); + spin_unlock(&dev->pending_lock); + qp->s_flags &= ~IPATH_S_ANY_WAIT; + spin_unlock_irq(&qp->s_lock); + /* Stop the sending tasklet */ + tasklet_kill(&qp->s_task); + wait_event(qp->wait_dma, !atomic_read(&qp->s_dma_busy)); + spin_lock_irq(&qp->s_lock); + } + ipath_reset_qp(qp, ibqp->qp_type); + break; + + case IB_QPS_SQD: + qp->s_draining = qp->s_last != qp->s_cur; + qp->state = new_state; + break; + + case IB_QPS_SQE: + if (qp->ibqp.qp_type == IB_QPT_RC) + goto inval; + qp->state = new_state; + break; + + case IB_QPS_ERR: + lastwqe = ipath_error_qp(qp, IB_WC_WR_FLUSH_ERR); + break; + + default: + qp->state = new_state; + break; + } + + if (attr_mask & IB_QP_PKEY_INDEX) + qp->s_pkey_index = attr->pkey_index; + + if (attr_mask & IB_QP_DEST_QPN) + qp->remote_qpn = attr->dest_qp_num; + + if (attr_mask & IB_QP_SQ_PSN) { + qp->s_psn = qp->s_next_psn = attr->sq_psn; + qp->s_last_psn = qp->s_next_psn - 1; + } + + if (attr_mask & IB_QP_RQ_PSN) + qp->r_psn = attr->rq_psn; + + if (attr_mask & IB_QP_ACCESS_FLAGS) + qp->qp_access_flags = attr->qp_access_flags; + + if (attr_mask & IB_QP_AV) { + qp->remote_ah_attr = attr->ah_attr; + qp->s_dmult = ipath_ib_rate_to_mult(attr->ah_attr.static_rate); + } + + if (attr_mask & IB_QP_PATH_MTU) + qp->path_mtu = attr->path_mtu; + + if (attr_mask & IB_QP_RETRY_CNT) + qp->s_retry = qp->s_retry_cnt = attr->retry_cnt; + + if (attr_mask & IB_QP_RNR_RETRY) { + qp->s_rnr_retry = attr->rnr_retry; + if (qp->s_rnr_retry > 7) + qp->s_rnr_retry = 7; + qp->s_rnr_retry_cnt = qp->s_rnr_retry; + } + + if (attr_mask & IB_QP_MIN_RNR_TIMER) + qp->r_min_rnr_timer = attr->min_rnr_timer; + + if (attr_mask & IB_QP_TIMEOUT) + qp->timeout = attr->timeout; + + if (attr_mask & IB_QP_QKEY) + qp->qkey = attr->qkey; + + if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) + qp->r_max_rd_atomic = attr->max_dest_rd_atomic; + + if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC) + qp->s_max_rd_atomic = attr->max_rd_atomic; + + spin_unlock_irq(&qp->s_lock); + + if (lastwqe) { + struct ib_event ev; + + ev.device = qp->ibqp.device; + ev.element.qp = &qp->ibqp; + ev.event = IB_EVENT_QP_LAST_WQE_REACHED; + qp->ibqp.event_handler(&ev, qp->ibqp.qp_context); + } + ret = 0; + goto bail; + +inval: + spin_unlock_irq(&qp->s_lock); + ret = -EINVAL; + +bail: + return ret; +} + +int ipath_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_qp_init_attr *init_attr) +{ + struct ipath_qp *qp = to_iqp(ibqp); + + attr->qp_state = qp->state; + attr->cur_qp_state = attr->qp_state; + attr->path_mtu = qp->path_mtu; + attr->path_mig_state = 0; + attr->qkey = qp->qkey; + attr->rq_psn = qp->r_psn; + attr->sq_psn = qp->s_next_psn; + attr->dest_qp_num = qp->remote_qpn; + attr->qp_access_flags = qp->qp_access_flags; + attr->cap.max_send_wr = qp->s_size - 1; + attr->cap.max_recv_wr = qp->ibqp.srq ? 0 : qp->r_rq.size - 1; + attr->cap.max_send_sge = qp->s_max_sge; + attr->cap.max_recv_sge = qp->r_rq.max_sge; + attr->cap.max_inline_data = 0; + attr->ah_attr = qp->remote_ah_attr; + memset(&attr->alt_ah_attr, 0, sizeof(attr->alt_ah_attr)); + attr->pkey_index = qp->s_pkey_index; + attr->alt_pkey_index = 0; + attr->en_sqd_async_notify = 0; + attr->sq_draining = qp->s_draining; + attr->max_rd_atomic = qp->s_max_rd_atomic; + attr->max_dest_rd_atomic = qp->r_max_rd_atomic; + attr->min_rnr_timer = qp->r_min_rnr_timer; + attr->port_num = 1; + attr->timeout = qp->timeout; + attr->retry_cnt = qp->s_retry_cnt; + attr->rnr_retry = qp->s_rnr_retry_cnt; + attr->alt_port_num = 0; + attr->alt_timeout = 0; + + init_attr->event_handler = qp->ibqp.event_handler; + init_attr->qp_context = qp->ibqp.qp_context; + init_attr->send_cq = qp->ibqp.send_cq; + init_attr->recv_cq = qp->ibqp.recv_cq; + init_attr->srq = qp->ibqp.srq; + init_attr->cap = attr->cap; + if (qp->s_flags & IPATH_S_SIGNAL_REQ_WR) + init_attr->sq_sig_type = IB_SIGNAL_REQ_WR; + else + init_attr->sq_sig_type = IB_SIGNAL_ALL_WR; + init_attr->qp_type = qp->ibqp.qp_type; + init_attr->port_num = 1; + return 0; +} + +/** + * ipath_compute_aeth - compute the AETH (syndrome + MSN) + * @qp: the queue pair to compute the AETH for + * + * Returns the AETH. + */ +__be32 ipath_compute_aeth(struct ipath_qp *qp) +{ + u32 aeth = qp->r_msn & IPATH_MSN_MASK; + + if (qp->ibqp.srq) { + /* + * Shared receive queues don't generate credits. + * Set the credit field to the invalid value. + */ + aeth |= IPATH_AETH_CREDIT_INVAL << IPATH_AETH_CREDIT_SHIFT; + } else { + u32 min, max, x; + u32 credits; + struct ipath_rwq *wq = qp->r_rq.wq; + u32 head; + u32 tail; + + /* sanity check pointers before trusting them */ + head = wq->head; + if (head >= qp->r_rq.size) + head = 0; + tail = wq->tail; + if (tail >= qp->r_rq.size) + tail = 0; + /* + * Compute the number of credits available (RWQEs). + * XXX Not holding the r_rq.lock here so there is a small + * chance that the pair of reads are not atomic. + */ + credits = head - tail; + if ((int)credits < 0) + credits += qp->r_rq.size; + /* + * Binary search the credit table to find the code to + * use. + */ + min = 0; + max = 31; + for (;;) { + x = (min + max) / 2; + if (credit_table[x] == credits) + break; + if (credit_table[x] > credits) + max = x; + else if (min == x) + break; + else + min = x; + } + aeth |= x << IPATH_AETH_CREDIT_SHIFT; + } + return cpu_to_be32(aeth); +} + +/** + * ipath_create_qp - create a queue pair for a device + * @ibpd: the protection domain who's device we create the queue pair for + * @init_attr: the attributes of the queue pair + * @udata: unused by InfiniPath + * + * Returns the queue pair on success, otherwise returns an errno. + * + * Called by the ib_create_qp() core verbs function. + */ +struct ib_qp *ipath_create_qp(struct ib_pd *ibpd, + struct ib_qp_init_attr *init_attr, + struct ib_udata *udata) +{ + struct ipath_qp *qp; + int err; + struct ipath_swqe *swq = NULL; + struct ipath_ibdev *dev; + size_t sz; + size_t sg_list_sz; + struct ib_qp *ret; + + if (init_attr->create_flags) { + ret = ERR_PTR(-EINVAL); + goto bail; + } + + if (init_attr->cap.max_send_sge > ib_ipath_max_sges || + init_attr->cap.max_send_wr > ib_ipath_max_qp_wrs) { + ret = ERR_PTR(-EINVAL); + goto bail; + } + + /* Check receive queue parameters if no SRQ is specified. */ + if (!init_attr->srq) { + if (init_attr->cap.max_recv_sge > ib_ipath_max_sges || + init_attr->cap.max_recv_wr > ib_ipath_max_qp_wrs) { + ret = ERR_PTR(-EINVAL); + goto bail; + } + if (init_attr->cap.max_send_sge + + init_attr->cap.max_send_wr + + init_attr->cap.max_recv_sge + + init_attr->cap.max_recv_wr == 0) { + ret = ERR_PTR(-EINVAL); + goto bail; + } + } + + switch (init_attr->qp_type) { + case IB_QPT_UC: + case IB_QPT_RC: + case IB_QPT_UD: + case IB_QPT_SMI: + case IB_QPT_GSI: + sz = sizeof(struct ipath_sge) * + init_attr->cap.max_send_sge + + sizeof(struct ipath_swqe); + swq = vmalloc((init_attr->cap.max_send_wr + 1) * sz); + if (swq == NULL) { + ret = ERR_PTR(-ENOMEM); + goto bail; + } + sz = sizeof(*qp); + sg_list_sz = 0; + if (init_attr->srq) { + struct ipath_srq *srq = to_isrq(init_attr->srq); + + if (srq->rq.max_sge > 1) + sg_list_sz = sizeof(*qp->r_sg_list) * + (srq->rq.max_sge - 1); + } else if (init_attr->cap.max_recv_sge > 1) + sg_list_sz = sizeof(*qp->r_sg_list) * + (init_attr->cap.max_recv_sge - 1); + qp = kmalloc(sz + sg_list_sz, GFP_KERNEL); + if (!qp) { + ret = ERR_PTR(-ENOMEM); + goto bail_swq; + } + if (sg_list_sz && (init_attr->qp_type == IB_QPT_UD || + init_attr->qp_type == IB_QPT_SMI || + init_attr->qp_type == IB_QPT_GSI)) { + qp->r_ud_sg_list = kmalloc(sg_list_sz, GFP_KERNEL); + if (!qp->r_ud_sg_list) { + ret = ERR_PTR(-ENOMEM); + goto bail_qp; + } + } else + qp->r_ud_sg_list = NULL; + if (init_attr->srq) { + sz = 0; + qp->r_rq.size = 0; + qp->r_rq.max_sge = 0; + qp->r_rq.wq = NULL; + init_attr->cap.max_recv_wr = 0; + init_attr->cap.max_recv_sge = 0; + } else { + qp->r_rq.size = init_attr->cap.max_recv_wr + 1; + qp->r_rq.max_sge = init_attr->cap.max_recv_sge; + sz = (sizeof(struct ib_sge) * qp->r_rq.max_sge) + + sizeof(struct ipath_rwqe); + qp->r_rq.wq = vmalloc_user(sizeof(struct ipath_rwq) + + qp->r_rq.size * sz); + if (!qp->r_rq.wq) { + ret = ERR_PTR(-ENOMEM); + goto bail_sg_list; + } + } + + /* + * ib_create_qp() will initialize qp->ibqp + * except for qp->ibqp.qp_num. + */ + spin_lock_init(&qp->s_lock); + spin_lock_init(&qp->r_rq.lock); + atomic_set(&qp->refcount, 0); + init_waitqueue_head(&qp->wait); + init_waitqueue_head(&qp->wait_dma); + tasklet_init(&qp->s_task, ipath_do_send, (unsigned long)qp); + INIT_LIST_HEAD(&qp->piowait); + INIT_LIST_HEAD(&qp->timerwait); + qp->state = IB_QPS_RESET; + qp->s_wq = swq; + qp->s_size = init_attr->cap.max_send_wr + 1; + qp->s_max_sge = init_attr->cap.max_send_sge; + if (init_attr->sq_sig_type == IB_SIGNAL_REQ_WR) + qp->s_flags = IPATH_S_SIGNAL_REQ_WR; + else + qp->s_flags = 0; + dev = to_idev(ibpd->device); + err = ipath_alloc_qpn(&dev->qp_table, qp, + init_attr->qp_type); + if (err) { + ret = ERR_PTR(err); + vfree(qp->r_rq.wq); + goto bail_sg_list; + } + qp->ip = NULL; + qp->s_tx = NULL; + ipath_reset_qp(qp, init_attr->qp_type); + break; + + default: + /* Don't support raw QPs */ + ret = ERR_PTR(-ENOSYS); + goto bail; + } + + init_attr->cap.max_inline_data = 0; + + /* + * Return the address of the RWQ as the offset to mmap. + * See ipath_mmap() for details. + */ + if (udata && udata->outlen >= sizeof(__u64)) { + if (!qp->r_rq.wq) { + __u64 offset = 0; + + err = ib_copy_to_udata(udata, &offset, + sizeof(offset)); + if (err) { + ret = ERR_PTR(err); + goto bail_ip; + } + } else { + u32 s = sizeof(struct ipath_rwq) + + qp->r_rq.size * sz; + + qp->ip = + ipath_create_mmap_info(dev, s, + ibpd->uobject->context, + qp->r_rq.wq); + if (!qp->ip) { + ret = ERR_PTR(-ENOMEM); + goto bail_ip; + } + + err = ib_copy_to_udata(udata, &(qp->ip->offset), + sizeof(qp->ip->offset)); + if (err) { + ret = ERR_PTR(err); + goto bail_ip; + } + } + } + + spin_lock(&dev->n_qps_lock); + if (dev->n_qps_allocated == ib_ipath_max_qps) { + spin_unlock(&dev->n_qps_lock); + ret = ERR_PTR(-ENOMEM); + goto bail_ip; + } + + dev->n_qps_allocated++; + spin_unlock(&dev->n_qps_lock); + + if (qp->ip) { + spin_lock_irq(&dev->pending_lock); + list_add(&qp->ip->pending_mmaps, &dev->pending_mmaps); + spin_unlock_irq(&dev->pending_lock); + } + + ret = &qp->ibqp; + goto bail; + +bail_ip: + if (qp->ip) + kref_put(&qp->ip->ref, ipath_release_mmap_info); + else + vfree(qp->r_rq.wq); + ipath_free_qp(&dev->qp_table, qp); + free_qpn(&dev->qp_table, qp->ibqp.qp_num); +bail_sg_list: + kfree(qp->r_ud_sg_list); +bail_qp: + kfree(qp); +bail_swq: + vfree(swq); +bail: + return ret; +} + +/** + * ipath_destroy_qp - destroy a queue pair + * @ibqp: the queue pair to destroy + * + * Returns 0 on success. + * + * Note that this can be called while the QP is actively sending or + * receiving! + */ +int ipath_destroy_qp(struct ib_qp *ibqp) +{ + struct ipath_qp *qp = to_iqp(ibqp); + struct ipath_ibdev *dev = to_idev(ibqp->device); + + /* Make sure HW and driver activity is stopped. */ + spin_lock_irq(&qp->s_lock); + if (qp->state != IB_QPS_RESET) { + qp->state = IB_QPS_RESET; + spin_lock(&dev->pending_lock); + if (!list_empty(&qp->timerwait)) + list_del_init(&qp->timerwait); + if (!list_empty(&qp->piowait)) + list_del_init(&qp->piowait); + spin_unlock(&dev->pending_lock); + qp->s_flags &= ~IPATH_S_ANY_WAIT; + spin_unlock_irq(&qp->s_lock); + /* Stop the sending tasklet */ + tasklet_kill(&qp->s_task); + wait_event(qp->wait_dma, !atomic_read(&qp->s_dma_busy)); + } else + spin_unlock_irq(&qp->s_lock); + + ipath_free_qp(&dev->qp_table, qp); + + if (qp->s_tx) { + atomic_dec(&qp->refcount); + if (qp->s_tx->txreq.flags & IPATH_SDMA_TXREQ_F_FREEBUF) + kfree(qp->s_tx->txreq.map_addr); + spin_lock_irq(&dev->pending_lock); + list_add(&qp->s_tx->txreq.list, &dev->txreq_free); + spin_unlock_irq(&dev->pending_lock); + qp->s_tx = NULL; + } + + wait_event(qp->wait, !atomic_read(&qp->refcount)); + + /* all user's cleaned up, mark it available */ + free_qpn(&dev->qp_table, qp->ibqp.qp_num); + spin_lock(&dev->n_qps_lock); + dev->n_qps_allocated--; + spin_unlock(&dev->n_qps_lock); + + if (qp->ip) + kref_put(&qp->ip->ref, ipath_release_mmap_info); + else + vfree(qp->r_rq.wq); + kfree(qp->r_ud_sg_list); + vfree(qp->s_wq); + kfree(qp); + return 0; +} + +/** + * ipath_init_qp_table - initialize the QP table for a device + * @idev: the device who's QP table we're initializing + * @size: the size of the QP table + * + * Returns 0 on success, otherwise returns an errno. + */ +int ipath_init_qp_table(struct ipath_ibdev *idev, int size) +{ + int i; + int ret; + + idev->qp_table.last = 1; /* QPN 0 and 1 are special. */ + idev->qp_table.max = size; + idev->qp_table.nmaps = 1; + idev->qp_table.table = kzalloc(size * sizeof(*idev->qp_table.table), + GFP_KERNEL); + if (idev->qp_table.table == NULL) { + ret = -ENOMEM; + goto bail; + } + + for (i = 0; i < ARRAY_SIZE(idev->qp_table.map); i++) { + atomic_set(&idev->qp_table.map[i].n_free, BITS_PER_PAGE); + idev->qp_table.map[i].page = NULL; + } + + ret = 0; + +bail: + return ret; +} + +/** + * ipath_get_credit - flush the send work queue of a QP + * @qp: the qp who's send work queue to flush + * @aeth: the Acknowledge Extended Transport Header + * + * The QP s_lock should be held. + */ +void ipath_get_credit(struct ipath_qp *qp, u32 aeth) +{ + u32 credit = (aeth >> IPATH_AETH_CREDIT_SHIFT) & IPATH_AETH_CREDIT_MASK; + + /* + * If the credit is invalid, we can send + * as many packets as we like. Otherwise, we have to + * honor the credit field. + */ + if (credit == IPATH_AETH_CREDIT_INVAL) + qp->s_lsn = (u32) -1; + else if (qp->s_lsn != (u32) -1) { + /* Compute new LSN (i.e., MSN + credit) */ + credit = (aeth + credit_table[credit]) & IPATH_MSN_MASK; + if (ipath_cmp24(credit, qp->s_lsn) > 0) + qp->s_lsn = credit; + } + + /* Restart sending if it was blocked due to lack of credits. */ + if ((qp->s_flags & IPATH_S_WAIT_SSN_CREDIT) && + qp->s_cur != qp->s_head && + (qp->s_lsn == (u32) -1 || + ipath_cmp24(get_swqe_ptr(qp, qp->s_cur)->ssn, + qp->s_lsn + 1) <= 0)) + ipath_schedule_send(qp); +} diff --git a/drivers/infiniband/hw/ipath/ipath_rc.c b/drivers/infiniband/hw/ipath/ipath_rc.c new file mode 100644 index 0000000..79b3dbc --- /dev/null +++ b/drivers/infiniband/hw/ipath/ipath_rc.c @@ -0,0 +1,1969 @@ +/* + * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved. + * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include + +#include "ipath_verbs.h" +#include "ipath_kernel.h" + +/* cut down ridiculously long IB macro names */ +#define OP(x) IB_OPCODE_RC_##x + +static u32 restart_sge(struct ipath_sge_state *ss, struct ipath_swqe *wqe, + u32 psn, u32 pmtu) +{ + u32 len; + + len = ((psn - wqe->psn) & IPATH_PSN_MASK) * pmtu; + ss->sge = wqe->sg_list[0]; + ss->sg_list = wqe->sg_list + 1; + ss->num_sge = wqe->wr.num_sge; + ipath_skip_sge(ss, len); + return wqe->length - len; +} + +/** + * ipath_init_restart- initialize the qp->s_sge after a restart + * @qp: the QP who's SGE we're restarting + * @wqe: the work queue to initialize the QP's SGE from + * + * The QP s_lock should be held and interrupts disabled. + */ +static void ipath_init_restart(struct ipath_qp *qp, struct ipath_swqe *wqe) +{ + struct ipath_ibdev *dev; + + qp->s_len = restart_sge(&qp->s_sge, wqe, qp->s_psn, + ib_mtu_enum_to_int(qp->path_mtu)); + dev = to_idev(qp->ibqp.device); + spin_lock(&dev->pending_lock); + if (list_empty(&qp->timerwait)) + list_add_tail(&qp->timerwait, + &dev->pending[dev->pending_index]); + spin_unlock(&dev->pending_lock); +} + +/** + * ipath_make_rc_ack - construct a response packet (ACK, NAK, or RDMA read) + * @qp: a pointer to the QP + * @ohdr: a pointer to the IB header being constructed + * @pmtu: the path MTU + * + * Return 1 if constructed; otherwise, return 0. + * Note that we are in the responder's side of the QP context. + * Note the QP s_lock must be held. + */ +static int ipath_make_rc_ack(struct ipath_ibdev *dev, struct ipath_qp *qp, + struct ipath_other_headers *ohdr, u32 pmtu) +{ + struct ipath_ack_entry *e; + u32 hwords; + u32 len; + u32 bth0; + u32 bth2; + + /* Don't send an ACK if we aren't supposed to. */ + if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK)) + goto bail; + + /* header size in 32-bit words LRH+BTH = (8+12)/4. */ + hwords = 5; + + switch (qp->s_ack_state) { + case OP(RDMA_READ_RESPONSE_LAST): + case OP(RDMA_READ_RESPONSE_ONLY): + case OP(ATOMIC_ACKNOWLEDGE): + /* + * We can increment the tail pointer now that the last + * response has been sent instead of only being + * constructed. + */ + if (++qp->s_tail_ack_queue > IPATH_MAX_RDMA_ATOMIC) + qp->s_tail_ack_queue = 0; + /* FALLTHROUGH */ + case OP(SEND_ONLY): + case OP(ACKNOWLEDGE): + /* Check for no next entry in the queue. */ + if (qp->r_head_ack_queue == qp->s_tail_ack_queue) { + if (qp->s_flags & IPATH_S_ACK_PENDING) + goto normal; + qp->s_ack_state = OP(ACKNOWLEDGE); + goto bail; + } + + e = &qp->s_ack_queue[qp->s_tail_ack_queue]; + if (e->opcode == OP(RDMA_READ_REQUEST)) { + /* Copy SGE state in case we need to resend */ + qp->s_ack_rdma_sge = e->rdma_sge; + qp->s_cur_sge = &qp->s_ack_rdma_sge; + len = e->rdma_sge.sge.sge_length; + if (len > pmtu) { + len = pmtu; + qp->s_ack_state = OP(RDMA_READ_RESPONSE_FIRST); + } else { + qp->s_ack_state = OP(RDMA_READ_RESPONSE_ONLY); + e->sent = 1; + } + ohdr->u.aeth = ipath_compute_aeth(qp); + hwords++; + qp->s_ack_rdma_psn = e->psn; + bth2 = qp->s_ack_rdma_psn++ & IPATH_PSN_MASK; + } else { + /* COMPARE_SWAP or FETCH_ADD */ + qp->s_cur_sge = NULL; + len = 0; + qp->s_ack_state = OP(ATOMIC_ACKNOWLEDGE); + ohdr->u.at.aeth = ipath_compute_aeth(qp); + ohdr->u.at.atomic_ack_eth[0] = + cpu_to_be32(e->atomic_data >> 32); + ohdr->u.at.atomic_ack_eth[1] = + cpu_to_be32(e->atomic_data); + hwords += sizeof(ohdr->u.at) / sizeof(u32); + bth2 = e->psn; + e->sent = 1; + } + bth0 = qp->s_ack_state << 24; + break; + + case OP(RDMA_READ_RESPONSE_FIRST): + qp->s_ack_state = OP(RDMA_READ_RESPONSE_MIDDLE); + /* FALLTHROUGH */ + case OP(RDMA_READ_RESPONSE_MIDDLE): + len = qp->s_ack_rdma_sge.sge.sge_length; + if (len > pmtu) + len = pmtu; + else { + ohdr->u.aeth = ipath_compute_aeth(qp); + hwords++; + qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST); + qp->s_ack_queue[qp->s_tail_ack_queue].sent = 1; + } + bth0 = qp->s_ack_state << 24; + bth2 = qp->s_ack_rdma_psn++ & IPATH_PSN_MASK; + break; + + default: + normal: + /* + * Send a regular ACK. + * Set the s_ack_state so we wait until after sending + * the ACK before setting s_ack_state to ACKNOWLEDGE + * (see above). + */ + qp->s_ack_state = OP(SEND_ONLY); + qp->s_flags &= ~IPATH_S_ACK_PENDING; + qp->s_cur_sge = NULL; + if (qp->s_nak_state) + ohdr->u.aeth = + cpu_to_be32((qp->r_msn & IPATH_MSN_MASK) | + (qp->s_nak_state << + IPATH_AETH_CREDIT_SHIFT)); + else + ohdr->u.aeth = ipath_compute_aeth(qp); + hwords++; + len = 0; + bth0 = OP(ACKNOWLEDGE) << 24; + bth2 = qp->s_ack_psn & IPATH_PSN_MASK; + } + qp->s_hdrwords = hwords; + qp->s_cur_size = len; + ipath_make_ruc_header(dev, qp, ohdr, bth0, bth2); + return 1; + +bail: + return 0; +} + +/** + * ipath_make_rc_req - construct a request packet (SEND, RDMA r/w, ATOMIC) + * @qp: a pointer to the QP + * + * Return 1 if constructed; otherwise, return 0. + */ +int ipath_make_rc_req(struct ipath_qp *qp) +{ + struct ipath_ibdev *dev = to_idev(qp->ibqp.device); + struct ipath_other_headers *ohdr; + struct ipath_sge_state *ss; + struct ipath_swqe *wqe; + u32 hwords; + u32 len; + u32 bth0; + u32 bth2; + u32 pmtu = ib_mtu_enum_to_int(qp->path_mtu); + char newreq; + unsigned long flags; + int ret = 0; + + ohdr = &qp->s_hdr.u.oth; + if (qp->remote_ah_attr.ah_flags & IB_AH_GRH) + ohdr = &qp->s_hdr.u.l.oth; + + /* + * The lock is needed to synchronize between the sending tasklet, + * the receive interrupt handler, and timeout resends. + */ + spin_lock_irqsave(&qp->s_lock, flags); + + /* Sending responses has higher priority over sending requests. */ + if ((qp->r_head_ack_queue != qp->s_tail_ack_queue || + (qp->s_flags & IPATH_S_ACK_PENDING) || + qp->s_ack_state != OP(ACKNOWLEDGE)) && + ipath_make_rc_ack(dev, qp, ohdr, pmtu)) + goto done; + + if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK)) { + if (!(ib_ipath_state_ops[qp->state] & IPATH_FLUSH_SEND)) + goto bail; + /* We are in the error state, flush the work request. */ + if (qp->s_last == qp->s_head) + goto bail; + /* If DMAs are in progress, we can't flush immediately. */ + if (atomic_read(&qp->s_dma_busy)) { + qp->s_flags |= IPATH_S_WAIT_DMA; + goto bail; + } + wqe = get_swqe_ptr(qp, qp->s_last); + ipath_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR); + goto done; + } + + /* Leave BUSY set until RNR timeout. */ + if (qp->s_rnr_timeout) { + qp->s_flags |= IPATH_S_WAITING; + goto bail; + } + + /* header size in 32-bit words LRH+BTH = (8+12)/4. */ + hwords = 5; + bth0 = 1 << 22; /* Set M bit */ + + /* Send a request. */ + wqe = get_swqe_ptr(qp, qp->s_cur); + switch (qp->s_state) { + default: + if (!(ib_ipath_state_ops[qp->state] & + IPATH_PROCESS_NEXT_SEND_OK)) + goto bail; + /* + * Resend an old request or start a new one. + * + * We keep track of the current SWQE so that + * we don't reset the "furthest progress" state + * if we need to back up. + */ + newreq = 0; + if (qp->s_cur == qp->s_tail) { + /* Check if send work queue is empty. */ + if (qp->s_tail == qp->s_head) + goto bail; + /* + * If a fence is requested, wait for previous + * RDMA read and atomic operations to finish. + */ + if ((wqe->wr.send_flags & IB_SEND_FENCE) && + qp->s_num_rd_atomic) { + qp->s_flags |= IPATH_S_FENCE_PENDING; + goto bail; + } + wqe->psn = qp->s_next_psn; + newreq = 1; + } + /* + * Note that we have to be careful not to modify the + * original work request since we may need to resend + * it. + */ + len = wqe->length; + ss = &qp->s_sge; + bth2 = 0; + switch (wqe->wr.opcode) { + case IB_WR_SEND: + case IB_WR_SEND_WITH_IMM: + /* If no credit, return. */ + if (qp->s_lsn != (u32) -1 && + ipath_cmp24(wqe->ssn, qp->s_lsn + 1) > 0) { + qp->s_flags |= IPATH_S_WAIT_SSN_CREDIT; + goto bail; + } + wqe->lpsn = wqe->psn; + if (len > pmtu) { + wqe->lpsn += (len - 1) / pmtu; + qp->s_state = OP(SEND_FIRST); + len = pmtu; + break; + } + if (wqe->wr.opcode == IB_WR_SEND) + qp->s_state = OP(SEND_ONLY); + else { + qp->s_state = OP(SEND_ONLY_WITH_IMMEDIATE); + /* Immediate data comes after the BTH */ + ohdr->u.imm_data = wqe->wr.ex.imm_data; + hwords += 1; + } + if (wqe->wr.send_flags & IB_SEND_SOLICITED) + bth0 |= 1 << 23; + bth2 = 1 << 31; /* Request ACK. */ + if (++qp->s_cur == qp->s_size) + qp->s_cur = 0; + break; + + case IB_WR_RDMA_WRITE: + if (newreq && qp->s_lsn != (u32) -1) + qp->s_lsn++; + /* FALLTHROUGH */ + case IB_WR_RDMA_WRITE_WITH_IMM: + /* If no credit, return. */ + if (qp->s_lsn != (u32) -1 && + ipath_cmp24(wqe->ssn, qp->s_lsn + 1) > 0) { + qp->s_flags |= IPATH_S_WAIT_SSN_CREDIT; + goto bail; + } + ohdr->u.rc.reth.vaddr = + cpu_to_be64(wqe->wr.wr.rdma.remote_addr); + ohdr->u.rc.reth.rkey = + cpu_to_be32(wqe->wr.wr.rdma.rkey); + ohdr->u.rc.reth.length = cpu_to_be32(len); + hwords += sizeof(struct ib_reth) / sizeof(u32); + wqe->lpsn = wqe->psn; + if (len > pmtu) { + wqe->lpsn += (len - 1) / pmtu; + qp->s_state = OP(RDMA_WRITE_FIRST); + len = pmtu; + break; + } + if (wqe->wr.opcode == IB_WR_RDMA_WRITE) + qp->s_state = OP(RDMA_WRITE_ONLY); + else { + qp->s_state = + OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE); + /* Immediate data comes after RETH */ + ohdr->u.rc.imm_data = wqe->wr.ex.imm_data; + hwords += 1; + if (wqe->wr.send_flags & IB_SEND_SOLICITED) + bth0 |= 1 << 23; + } + bth2 = 1 << 31; /* Request ACK. */ + if (++qp->s_cur == qp->s_size) + qp->s_cur = 0; + break; + + case IB_WR_RDMA_READ: + /* + * Don't allow more operations to be started + * than the QP limits allow. + */ + if (newreq) { + if (qp->s_num_rd_atomic >= + qp->s_max_rd_atomic) { + qp->s_flags |= IPATH_S_RDMAR_PENDING; + goto bail; + } + qp->s_num_rd_atomic++; + if (qp->s_lsn != (u32) -1) + qp->s_lsn++; + /* + * Adjust s_next_psn to count the + * expected number of responses. + */ + if (len > pmtu) + qp->s_next_psn += (len - 1) / pmtu; + wqe->lpsn = qp->s_next_psn++; + } + ohdr->u.rc.reth.vaddr = + cpu_to_be64(wqe->wr.wr.rdma.remote_addr); + ohdr->u.rc.reth.rkey = + cpu_to_be32(wqe->wr.wr.rdma.rkey); + ohdr->u.rc.reth.length = cpu_to_be32(len); + qp->s_state = OP(RDMA_READ_REQUEST); + hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32); + ss = NULL; + len = 0; + if (++qp->s_cur == qp->s_size) + qp->s_cur = 0; + break; + + case IB_WR_ATOMIC_CMP_AND_SWP: + case IB_WR_ATOMIC_FETCH_AND_ADD: + /* + * Don't allow more operations to be started + * than the QP limits allow. + */ + if (newreq) { + if (qp->s_num_rd_atomic >= + qp->s_max_rd_atomic) { + qp->s_flags |= IPATH_S_RDMAR_PENDING; + goto bail; + } + qp->s_num_rd_atomic++; + if (qp->s_lsn != (u32) -1) + qp->s_lsn++; + wqe->lpsn = wqe->psn; + } + if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP) { + qp->s_state = OP(COMPARE_SWAP); + ohdr->u.atomic_eth.swap_data = cpu_to_be64( + wqe->wr.wr.atomic.swap); + ohdr->u.atomic_eth.compare_data = cpu_to_be64( + wqe->wr.wr.atomic.compare_add); + } else { + qp->s_state = OP(FETCH_ADD); + ohdr->u.atomic_eth.swap_data = cpu_to_be64( + wqe->wr.wr.atomic.compare_add); + ohdr->u.atomic_eth.compare_data = 0; + } + ohdr->u.atomic_eth.vaddr[0] = cpu_to_be32( + wqe->wr.wr.atomic.remote_addr >> 32); + ohdr->u.atomic_eth.vaddr[1] = cpu_to_be32( + wqe->wr.wr.atomic.remote_addr); + ohdr->u.atomic_eth.rkey = cpu_to_be32( + wqe->wr.wr.atomic.rkey); + hwords += sizeof(struct ib_atomic_eth) / sizeof(u32); + ss = NULL; + len = 0; + if (++qp->s_cur == qp->s_size) + qp->s_cur = 0; + break; + + default: + goto bail; + } + qp->s_sge.sge = wqe->sg_list[0]; + qp->s_sge.sg_list = wqe->sg_list + 1; + qp->s_sge.num_sge = wqe->wr.num_sge; + qp->s_len = wqe->length; + if (newreq) { + qp->s_tail++; + if (qp->s_tail >= qp->s_size) + qp->s_tail = 0; + } + bth2 |= qp->s_psn & IPATH_PSN_MASK; + if (wqe->wr.opcode == IB_WR_RDMA_READ) + qp->s_psn = wqe->lpsn + 1; + else { + qp->s_psn++; + if (ipath_cmp24(qp->s_psn, qp->s_next_psn) > 0) + qp->s_next_psn = qp->s_psn; + } + /* + * Put the QP on the pending list so lost ACKs will cause + * a retry. More than one request can be pending so the + * QP may already be on the dev->pending list. + */ + spin_lock(&dev->pending_lock); + if (list_empty(&qp->timerwait)) + list_add_tail(&qp->timerwait, + &dev->pending[dev->pending_index]); + spin_unlock(&dev->pending_lock); + break; + + case OP(RDMA_READ_RESPONSE_FIRST): + /* + * This case can only happen if a send is restarted. + * See ipath_restart_rc(). + */ + ipath_init_restart(qp, wqe); + /* FALLTHROUGH */ + case OP(SEND_FIRST): + qp->s_state = OP(SEND_MIDDLE); + /* FALLTHROUGH */ + case OP(SEND_MIDDLE): + bth2 = qp->s_psn++ & IPATH_PSN_MASK; + if (ipath_cmp24(qp->s_psn, qp->s_next_psn) > 0) + qp->s_next_psn = qp->s_psn; + ss = &qp->s_sge; + len = qp->s_len; + if (len > pmtu) { + len = pmtu; + break; + } + if (wqe->wr.opcode == IB_WR_SEND) + qp->s_state = OP(SEND_LAST); + else { + qp->s_state = OP(SEND_LAST_WITH_IMMEDIATE); + /* Immediate data comes after the BTH */ + ohdr->u.imm_data = wqe->wr.ex.imm_data; + hwords += 1; + } + if (wqe->wr.send_flags & IB_SEND_SOLICITED) + bth0 |= 1 << 23; + bth2 |= 1 << 31; /* Request ACK. */ + qp->s_cur++; + if (qp->s_cur >= qp->s_size) + qp->s_cur = 0; + break; + + case OP(RDMA_READ_RESPONSE_LAST): + /* + * This case can only happen if a RDMA write is restarted. + * See ipath_restart_rc(). + */ + ipath_init_restart(qp, wqe); + /* FALLTHROUGH */ + case OP(RDMA_WRITE_FIRST): + qp->s_state = OP(RDMA_WRITE_MIDDLE); + /* FALLTHROUGH */ + case OP(RDMA_WRITE_MIDDLE): + bth2 = qp->s_psn++ & IPATH_PSN_MASK; + if (ipath_cmp24(qp->s_psn, qp->s_next_psn) > 0) + qp->s_next_psn = qp->s_psn; + ss = &qp->s_sge; + len = qp->s_len; + if (len > pmtu) { + len = pmtu; + break; + } + if (wqe->wr.opcode == IB_WR_RDMA_WRITE) + qp->s_state = OP(RDMA_WRITE_LAST); + else { + qp->s_state = OP(RDMA_WRITE_LAST_WITH_IMMEDIATE); + /* Immediate data comes after the BTH */ + ohdr->u.imm_data = wqe->wr.ex.imm_data; + hwords += 1; + if (wqe->wr.send_flags & IB_SEND_SOLICITED) + bth0 |= 1 << 23; + } + bth2 |= 1 << 31; /* Request ACK. */ + qp->s_cur++; + if (qp->s_cur >= qp->s_size) + qp->s_cur = 0; + break; + + case OP(RDMA_READ_RESPONSE_MIDDLE): + /* + * This case can only happen if a RDMA read is restarted. + * See ipath_restart_rc(). + */ + ipath_init_restart(qp, wqe); + len = ((qp->s_psn - wqe->psn) & IPATH_PSN_MASK) * pmtu; + ohdr->u.rc.reth.vaddr = + cpu_to_be64(wqe->wr.wr.rdma.remote_addr + len); + ohdr->u.rc.reth.rkey = + cpu_to_be32(wqe->wr.wr.rdma.rkey); + ohdr->u.rc.reth.length = cpu_to_be32(qp->s_len); + qp->s_state = OP(RDMA_READ_REQUEST); + hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32); + bth2 = qp->s_psn & IPATH_PSN_MASK; + qp->s_psn = wqe->lpsn + 1; + ss = NULL; + len = 0; + qp->s_cur++; + if (qp->s_cur == qp->s_size) + qp->s_cur = 0; + break; + } + if (ipath_cmp24(qp->s_psn, qp->s_last_psn + IPATH_PSN_CREDIT - 1) >= 0) + bth2 |= 1 << 31; /* Request ACK. */ + qp->s_len -= len; + qp->s_hdrwords = hwords; + qp->s_cur_sge = ss; + qp->s_cur_size = len; + ipath_make_ruc_header(dev, qp, ohdr, bth0 | (qp->s_state << 24), bth2); +done: + ret = 1; + goto unlock; + +bail: + qp->s_flags &= ~IPATH_S_BUSY; +unlock: + spin_unlock_irqrestore(&qp->s_lock, flags); + return ret; +} + +/** + * send_rc_ack - Construct an ACK packet and send it + * @qp: a pointer to the QP + * + * This is called from ipath_rc_rcv() and only uses the receive + * side QP state. + * Note that RDMA reads and atomics are handled in the + * send side QP state and tasklet. + */ +static void send_rc_ack(struct ipath_qp *qp) +{ + struct ipath_ibdev *dev = to_idev(qp->ibqp.device); + struct ipath_devdata *dd; + u16 lrh0; + u32 bth0; + u32 hwords; + u32 __iomem *piobuf; + struct ipath_ib_header hdr; + struct ipath_other_headers *ohdr; + unsigned long flags; + + spin_lock_irqsave(&qp->s_lock, flags); + + /* Don't send ACK or NAK if a RDMA read or atomic is pending. */ + if (qp->r_head_ack_queue != qp->s_tail_ack_queue || + (qp->s_flags & IPATH_S_ACK_PENDING) || + qp->s_ack_state != OP(ACKNOWLEDGE)) + goto queue_ack; + + spin_unlock_irqrestore(&qp->s_lock, flags); + + /* Don't try to send ACKs if the link isn't ACTIVE */ + dd = dev->dd; + if (!(dd->ipath_flags & IPATH_LINKACTIVE)) + goto done; + + piobuf = ipath_getpiobuf(dd, 0, NULL); + if (!piobuf) { + /* + * We are out of PIO buffers at the moment. + * Pass responsibility for sending the ACK to the + * send tasklet so that when a PIO buffer becomes + * available, the ACK is sent ahead of other outgoing + * packets. + */ + spin_lock_irqsave(&qp->s_lock, flags); + goto queue_ack; + } + + /* Construct the header. */ + ohdr = &hdr.u.oth; + lrh0 = IPATH_LRH_BTH; + /* header size in 32-bit words LRH+BTH+AETH = (8+12+4)/4. */ + hwords = 6; + if (unlikely(qp->remote_ah_attr.ah_flags & IB_AH_GRH)) { + hwords += ipath_make_grh(dev, &hdr.u.l.grh, + &qp->remote_ah_attr.grh, + hwords, 0); + ohdr = &hdr.u.l.oth; + lrh0 = IPATH_LRH_GRH; + } + /* read pkey_index w/o lock (its atomic) */ + bth0 = ipath_get_pkey(dd, qp->s_pkey_index) | + (OP(ACKNOWLEDGE) << 24) | (1 << 22); + if (qp->r_nak_state) + ohdr->u.aeth = cpu_to_be32((qp->r_msn & IPATH_MSN_MASK) | + (qp->r_nak_state << + IPATH_AETH_CREDIT_SHIFT)); + else + ohdr->u.aeth = ipath_compute_aeth(qp); + lrh0 |= qp->remote_ah_attr.sl << 4; + hdr.lrh[0] = cpu_to_be16(lrh0); + hdr.lrh[1] = cpu_to_be16(qp->remote_ah_attr.dlid); + hdr.lrh[2] = cpu_to_be16(hwords + SIZE_OF_CRC); + hdr.lrh[3] = cpu_to_be16(dd->ipath_lid | + qp->remote_ah_attr.src_path_bits); + ohdr->bth[0] = cpu_to_be32(bth0); + ohdr->bth[1] = cpu_to_be32(qp->remote_qpn); + ohdr->bth[2] = cpu_to_be32(qp->r_ack_psn & IPATH_PSN_MASK); + + writeq(hwords + 1, piobuf); + + if (dd->ipath_flags & IPATH_PIO_FLUSH_WC) { + u32 *hdrp = (u32 *) &hdr; + + ipath_flush_wc(); + __iowrite32_copy(piobuf + 2, hdrp, hwords - 1); + ipath_flush_wc(); + __raw_writel(hdrp[hwords - 1], piobuf + hwords + 1); + } else + __iowrite32_copy(piobuf + 2, (u32 *) &hdr, hwords); + + ipath_flush_wc(); + + dev->n_unicast_xmit++; + goto done; + +queue_ack: + if (ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK) { + dev->n_rc_qacks++; + qp->s_flags |= IPATH_S_ACK_PENDING; + qp->s_nak_state = qp->r_nak_state; + qp->s_ack_psn = qp->r_ack_psn; + + /* Schedule the send tasklet. */ + ipath_schedule_send(qp); + } + spin_unlock_irqrestore(&qp->s_lock, flags); +done: + return; +} + +/** + * reset_psn - reset the QP state to send starting from PSN + * @qp: the QP + * @psn: the packet sequence number to restart at + * + * This is called from ipath_rc_rcv() to process an incoming RC ACK + * for the given QP. + * Called at interrupt level with the QP s_lock held. + */ +static void reset_psn(struct ipath_qp *qp, u32 psn) +{ + u32 n = qp->s_last; + struct ipath_swqe *wqe = get_swqe_ptr(qp, n); + u32 opcode; + + qp->s_cur = n; + + /* + * If we are starting the request from the beginning, + * let the normal send code handle initialization. + */ + if (ipath_cmp24(psn, wqe->psn) <= 0) { + qp->s_state = OP(SEND_LAST); + goto done; + } + + /* Find the work request opcode corresponding to the given PSN. */ + opcode = wqe->wr.opcode; + for (;;) { + int diff; + + if (++n == qp->s_size) + n = 0; + if (n == qp->s_tail) + break; + wqe = get_swqe_ptr(qp, n); + diff = ipath_cmp24(psn, wqe->psn); + if (diff < 0) + break; + qp->s_cur = n; + /* + * If we are starting the request from the beginning, + * let the normal send code handle initialization. + */ + if (diff == 0) { + qp->s_state = OP(SEND_LAST); + goto done; + } + opcode = wqe->wr.opcode; + } + + /* + * Set the state to restart in the middle of a request. + * Don't change the s_sge, s_cur_sge, or s_cur_size. + * See ipath_make_rc_req(). + */ + switch (opcode) { + case IB_WR_SEND: + case IB_WR_SEND_WITH_IMM: + qp->s_state = OP(RDMA_READ_RESPONSE_FIRST); + break; + + case IB_WR_RDMA_WRITE: + case IB_WR_RDMA_WRITE_WITH_IMM: + qp->s_state = OP(RDMA_READ_RESPONSE_LAST); + break; + + case IB_WR_RDMA_READ: + qp->s_state = OP(RDMA_READ_RESPONSE_MIDDLE); + break; + + default: + /* + * This case shouldn't happen since its only + * one PSN per req. + */ + qp->s_state = OP(SEND_LAST); + } +done: + qp->s_psn = psn; +} + +/** + * ipath_restart_rc - back up requester to resend the last un-ACKed request + * @qp: the QP to restart + * @psn: packet sequence number for the request + * @wc: the work completion request + * + * The QP s_lock should be held and interrupts disabled. + */ +void ipath_restart_rc(struct ipath_qp *qp, u32 psn) +{ + struct ipath_swqe *wqe = get_swqe_ptr(qp, qp->s_last); + struct ipath_ibdev *dev; + + if (qp->s_retry == 0) { + ipath_send_complete(qp, wqe, IB_WC_RETRY_EXC_ERR); + ipath_error_qp(qp, IB_WC_WR_FLUSH_ERR); + goto bail; + } + qp->s_retry--; + + /* + * Remove the QP from the timeout queue. + * Note: it may already have been removed by ipath_ib_timer(). + */ + dev = to_idev(qp->ibqp.device); + spin_lock(&dev->pending_lock); + if (!list_empty(&qp->timerwait)) + list_del_init(&qp->timerwait); + if (!list_empty(&qp->piowait)) + list_del_init(&qp->piowait); + spin_unlock(&dev->pending_lock); + + if (wqe->wr.opcode == IB_WR_RDMA_READ) + dev->n_rc_resends++; + else + dev->n_rc_resends += (qp->s_psn - psn) & IPATH_PSN_MASK; + + reset_psn(qp, psn); + ipath_schedule_send(qp); + +bail: + return; +} + +static inline void update_last_psn(struct ipath_qp *qp, u32 psn) +{ + qp->s_last_psn = psn; +} + +/** + * do_rc_ack - process an incoming RC ACK + * @qp: the QP the ACK came in on + * @psn: the packet sequence number of the ACK + * @opcode: the opcode of the request that resulted in the ACK + * + * This is called from ipath_rc_rcv_resp() to process an incoming RC ACK + * for the given QP. + * Called at interrupt level with the QP s_lock held and interrupts disabled. + * Returns 1 if OK, 0 if current operation should be aborted (NAK). + */ +static int do_rc_ack(struct ipath_qp *qp, u32 aeth, u32 psn, int opcode, + u64 val) +{ + struct ipath_ibdev *dev = to_idev(qp->ibqp.device); + struct ib_wc wc; + enum ib_wc_status status; + struct ipath_swqe *wqe; + int ret = 0; + u32 ack_psn; + int diff; + + /* + * Remove the QP from the timeout queue (or RNR timeout queue). + * If ipath_ib_timer() has already removed it, + * it's OK since we hold the QP s_lock and ipath_restart_rc() + * just won't find anything to restart if we ACK everything. + */ + spin_lock(&dev->pending_lock); + if (!list_empty(&qp->timerwait)) + list_del_init(&qp->timerwait); + spin_unlock(&dev->pending_lock); + + /* + * Note that NAKs implicitly ACK outstanding SEND and RDMA write + * requests and implicitly NAK RDMA read and atomic requests issued + * before the NAK'ed request. The MSN won't include the NAK'ed + * request but will include an ACK'ed request(s). + */ + ack_psn = psn; + if (aeth >> 29) + ack_psn--; + wqe = get_swqe_ptr(qp, qp->s_last); + + /* + * The MSN might be for a later WQE than the PSN indicates so + * only complete WQEs that the PSN finishes. + */ + while ((diff = ipath_cmp24(ack_psn, wqe->lpsn)) >= 0) { + /* + * RDMA_READ_RESPONSE_ONLY is a special case since + * we want to generate completion events for everything + * before the RDMA read, copy the data, then generate + * the completion for the read. + */ + if (wqe->wr.opcode == IB_WR_RDMA_READ && + opcode == OP(RDMA_READ_RESPONSE_ONLY) && + diff == 0) { + ret = 1; + goto bail; + } + /* + * If this request is a RDMA read or atomic, and the ACK is + * for a later operation, this ACK NAKs the RDMA read or + * atomic. In other words, only a RDMA_READ_LAST or ONLY + * can ACK a RDMA read and likewise for atomic ops. Note + * that the NAK case can only happen if relaxed ordering is + * used and requests are sent after an RDMA read or atomic + * is sent but before the response is received. + */ + if ((wqe->wr.opcode == IB_WR_RDMA_READ && + (opcode != OP(RDMA_READ_RESPONSE_LAST) || diff != 0)) || + ((wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP || + wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) && + (opcode != OP(ATOMIC_ACKNOWLEDGE) || diff != 0))) { + /* + * The last valid PSN seen is the previous + * request's. + */ + update_last_psn(qp, wqe->psn - 1); + /* Retry this request. */ + ipath_restart_rc(qp, wqe->psn); + /* + * No need to process the ACK/NAK since we are + * restarting an earlier request. + */ + goto bail; + } + if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP || + wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) + *(u64 *) wqe->sg_list[0].vaddr = val; + if (qp->s_num_rd_atomic && + (wqe->wr.opcode == IB_WR_RDMA_READ || + wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP || + wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)) { + qp->s_num_rd_atomic--; + /* Restart sending task if fence is complete */ + if (((qp->s_flags & IPATH_S_FENCE_PENDING) && + !qp->s_num_rd_atomic) || + qp->s_flags & IPATH_S_RDMAR_PENDING) + ipath_schedule_send(qp); + } + /* Post a send completion queue entry if requested. */ + if (!(qp->s_flags & IPATH_S_SIGNAL_REQ_WR) || + (wqe->wr.send_flags & IB_SEND_SIGNALED)) { + memset(&wc, 0, sizeof wc); + wc.wr_id = wqe->wr.wr_id; + wc.status = IB_WC_SUCCESS; + wc.opcode = ib_ipath_wc_opcode[wqe->wr.opcode]; + wc.byte_len = wqe->length; + wc.qp = &qp->ibqp; + wc.src_qp = qp->remote_qpn; + wc.slid = qp->remote_ah_attr.dlid; + wc.sl = qp->remote_ah_attr.sl; + ipath_cq_enter(to_icq(qp->ibqp.send_cq), &wc, 0); + } + qp->s_retry = qp->s_retry_cnt; + /* + * If we are completing a request which is in the process of + * being resent, we can stop resending it since we know the + * responder has already seen it. + */ + if (qp->s_last == qp->s_cur) { + if (++qp->s_cur >= qp->s_size) + qp->s_cur = 0; + qp->s_last = qp->s_cur; + if (qp->s_last == qp->s_tail) + break; + wqe = get_swqe_ptr(qp, qp->s_cur); + qp->s_state = OP(SEND_LAST); + qp->s_psn = wqe->psn; + } else { + if (++qp->s_last >= qp->s_size) + qp->s_last = 0; + if (qp->state == IB_QPS_SQD && qp->s_last == qp->s_cur) + qp->s_draining = 0; + if (qp->s_last == qp->s_tail) + break; + wqe = get_swqe_ptr(qp, qp->s_last); + } + } + + switch (aeth >> 29) { + case 0: /* ACK */ + dev->n_rc_acks++; + /* If this is a partial ACK, reset the retransmit timer. */ + if (qp->s_last != qp->s_tail) { + spin_lock(&dev->pending_lock); + if (list_empty(&qp->timerwait)) + list_add_tail(&qp->timerwait, + &dev->pending[dev->pending_index]); + spin_unlock(&dev->pending_lock); + /* + * If we get a partial ACK for a resent operation, + * we can stop resending the earlier packets and + * continue with the next packet the receiver wants. + */ + if (ipath_cmp24(qp->s_psn, psn) <= 0) { + reset_psn(qp, psn + 1); + ipath_schedule_send(qp); + } + } else if (ipath_cmp24(qp->s_psn, psn) <= 0) { + qp->s_state = OP(SEND_LAST); + qp->s_psn = psn + 1; + } + ipath_get_credit(qp, aeth); + qp->s_rnr_retry = qp->s_rnr_retry_cnt; + qp->s_retry = qp->s_retry_cnt; + update_last_psn(qp, psn); + ret = 1; + goto bail; + + case 1: /* RNR NAK */ + dev->n_rnr_naks++; + if (qp->s_last == qp->s_tail) + goto bail; + if (qp->s_rnr_retry == 0) { + status = IB_WC_RNR_RETRY_EXC_ERR; + goto class_b; + } + if (qp->s_rnr_retry_cnt < 7) + qp->s_rnr_retry--; + + /* The last valid PSN is the previous PSN. */ + update_last_psn(qp, psn - 1); + + if (wqe->wr.opcode == IB_WR_RDMA_READ) + dev->n_rc_resends++; + else + dev->n_rc_resends += + (qp->s_psn - psn) & IPATH_PSN_MASK; + + reset_psn(qp, psn); + + qp->s_rnr_timeout = + ib_ipath_rnr_table[(aeth >> IPATH_AETH_CREDIT_SHIFT) & + IPATH_AETH_CREDIT_MASK]; + ipath_insert_rnr_queue(qp); + ipath_schedule_send(qp); + goto bail; + + case 3: /* NAK */ + if (qp->s_last == qp->s_tail) + goto bail; + /* The last valid PSN is the previous PSN. */ + update_last_psn(qp, psn - 1); + switch ((aeth >> IPATH_AETH_CREDIT_SHIFT) & + IPATH_AETH_CREDIT_MASK) { + case 0: /* PSN sequence error */ + dev->n_seq_naks++; + /* + * Back up to the responder's expected PSN. + * Note that we might get a NAK in the middle of an + * RDMA READ response which terminates the RDMA + * READ. + */ + ipath_restart_rc(qp, psn); + break; + + case 1: /* Invalid Request */ + status = IB_WC_REM_INV_REQ_ERR; + dev->n_other_naks++; + goto class_b; + + case 2: /* Remote Access Error */ + status = IB_WC_REM_ACCESS_ERR; + dev->n_other_naks++; + goto class_b; + + case 3: /* Remote Operation Error */ + status = IB_WC_REM_OP_ERR; + dev->n_other_naks++; + class_b: + ipath_send_complete(qp, wqe, status); + ipath_error_qp(qp, IB_WC_WR_FLUSH_ERR); + break; + + default: + /* Ignore other reserved NAK error codes */ + goto reserved; + } + qp->s_rnr_retry = qp->s_rnr_retry_cnt; + goto bail; + + default: /* 2: reserved */ + reserved: + /* Ignore reserved NAK codes. */ + goto bail; + } + +bail: + return ret; +} + +/** + * ipath_rc_rcv_resp - process an incoming RC response packet + * @dev: the device this packet came in on + * @ohdr: the other headers for this packet + * @data: the packet data + * @tlen: the packet length + * @qp: the QP for this packet + * @opcode: the opcode for this packet + * @psn: the packet sequence number for this packet + * @hdrsize: the header length + * @pmtu: the path MTU + * @header_in_data: true if part of the header data is in the data buffer + * + * This is called from ipath_rc_rcv() to process an incoming RC response + * packet for the given QP. + * Called at interrupt level. + */ +static inline void ipath_rc_rcv_resp(struct ipath_ibdev *dev, + struct ipath_other_headers *ohdr, + void *data, u32 tlen, + struct ipath_qp *qp, + u32 opcode, + u32 psn, u32 hdrsize, u32 pmtu, + int header_in_data) +{ + struct ipath_swqe *wqe; + enum ib_wc_status status; + unsigned long flags; + int diff; + u32 pad; + u32 aeth; + u64 val; + + spin_lock_irqsave(&qp->s_lock, flags); + + /* Double check we can process this now that we hold the s_lock. */ + if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK)) + goto ack_done; + + /* Ignore invalid responses. */ + if (ipath_cmp24(psn, qp->s_next_psn) >= 0) + goto ack_done; + + /* Ignore duplicate responses. */ + diff = ipath_cmp24(psn, qp->s_last_psn); + if (unlikely(diff <= 0)) { + /* Update credits for "ghost" ACKs */ + if (diff == 0 && opcode == OP(ACKNOWLEDGE)) { + if (!header_in_data) + aeth = be32_to_cpu(ohdr->u.aeth); + else { + aeth = be32_to_cpu(((__be32 *) data)[0]); + data += sizeof(__be32); + } + if ((aeth >> 29) == 0) + ipath_get_credit(qp, aeth); + } + goto ack_done; + } + + if (unlikely(qp->s_last == qp->s_tail)) + goto ack_done; + wqe = get_swqe_ptr(qp, qp->s_last); + status = IB_WC_SUCCESS; + + switch (opcode) { + case OP(ACKNOWLEDGE): + case OP(ATOMIC_ACKNOWLEDGE): + case OP(RDMA_READ_RESPONSE_FIRST): + if (!header_in_data) + aeth = be32_to_cpu(ohdr->u.aeth); + else { + aeth = be32_to_cpu(((__be32 *) data)[0]); + data += sizeof(__be32); + } + if (opcode == OP(ATOMIC_ACKNOWLEDGE)) { + if (!header_in_data) { + __be32 *p = ohdr->u.at.atomic_ack_eth; + + val = ((u64) be32_to_cpu(p[0]) << 32) | + be32_to_cpu(p[1]); + } else + val = be64_to_cpu(((__be64 *) data)[0]); + } else + val = 0; + if (!do_rc_ack(qp, aeth, psn, opcode, val) || + opcode != OP(RDMA_READ_RESPONSE_FIRST)) + goto ack_done; + hdrsize += 4; + wqe = get_swqe_ptr(qp, qp->s_last); + if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ)) + goto ack_op_err; + qp->r_flags &= ~IPATH_R_RDMAR_SEQ; + /* + * If this is a response to a resent RDMA read, we + * have to be careful to copy the data to the right + * location. + */ + qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge, + wqe, psn, pmtu); + goto read_middle; + + case OP(RDMA_READ_RESPONSE_MIDDLE): + /* no AETH, no ACK */ + if (unlikely(ipath_cmp24(psn, qp->s_last_psn + 1))) { + dev->n_rdma_seq++; + if (qp->r_flags & IPATH_R_RDMAR_SEQ) + goto ack_done; + qp->r_flags |= IPATH_R_RDMAR_SEQ; + ipath_restart_rc(qp, qp->s_last_psn + 1); + goto ack_done; + } + if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ)) + goto ack_op_err; + read_middle: + if (unlikely(tlen != (hdrsize + pmtu + 4))) + goto ack_len_err; + if (unlikely(pmtu >= qp->s_rdma_read_len)) + goto ack_len_err; + + /* We got a response so update the timeout. */ + spin_lock(&dev->pending_lock); + if (qp->s_rnr_timeout == 0 && !list_empty(&qp->timerwait)) + list_move_tail(&qp->timerwait, + &dev->pending[dev->pending_index]); + spin_unlock(&dev->pending_lock); + + if (opcode == OP(RDMA_READ_RESPONSE_MIDDLE)) + qp->s_retry = qp->s_retry_cnt; + + /* + * Update the RDMA receive state but do the copy w/o + * holding the locks and blocking interrupts. + */ + qp->s_rdma_read_len -= pmtu; + update_last_psn(qp, psn); + spin_unlock_irqrestore(&qp->s_lock, flags); + ipath_copy_sge(&qp->s_rdma_read_sge, data, pmtu); + goto bail; + + case OP(RDMA_READ_RESPONSE_ONLY): + if (!header_in_data) + aeth = be32_to_cpu(ohdr->u.aeth); + else + aeth = be32_to_cpu(((__be32 *) data)[0]); + if (!do_rc_ack(qp, aeth, psn, opcode, 0)) + goto ack_done; + /* Get the number of bytes the message was padded by. */ + pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3; + /* + * Check that the data size is >= 0 && <= pmtu. + * Remember to account for the AETH header (4) and + * ICRC (4). + */ + if (unlikely(tlen < (hdrsize + pad + 8))) + goto ack_len_err; + /* + * If this is a response to a resent RDMA read, we + * have to be careful to copy the data to the right + * location. + */ + wqe = get_swqe_ptr(qp, qp->s_last); + qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge, + wqe, psn, pmtu); + goto read_last; + + case OP(RDMA_READ_RESPONSE_LAST): + /* ACKs READ req. */ + if (unlikely(ipath_cmp24(psn, qp->s_last_psn + 1))) { + dev->n_rdma_seq++; + if (qp->r_flags & IPATH_R_RDMAR_SEQ) + goto ack_done; + qp->r_flags |= IPATH_R_RDMAR_SEQ; + ipath_restart_rc(qp, qp->s_last_psn + 1); + goto ack_done; + } + if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ)) + goto ack_op_err; + /* Get the number of bytes the message was padded by. */ + pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3; + /* + * Check that the data size is >= 1 && <= pmtu. + * Remember to account for the AETH header (4) and + * ICRC (4). + */ + if (unlikely(tlen <= (hdrsize + pad + 8))) + goto ack_len_err; + read_last: + tlen -= hdrsize + pad + 8; + if (unlikely(tlen != qp->s_rdma_read_len)) + goto ack_len_err; + if (!header_in_data) + aeth = be32_to_cpu(ohdr->u.aeth); + else { + aeth = be32_to_cpu(((__be32 *) data)[0]); + data += sizeof(__be32); + } + ipath_copy_sge(&qp->s_rdma_read_sge, data, tlen); + (void) do_rc_ack(qp, aeth, psn, + OP(RDMA_READ_RESPONSE_LAST), 0); + goto ack_done; + } + +ack_op_err: + status = IB_WC_LOC_QP_OP_ERR; + goto ack_err; + +ack_len_err: + status = IB_WC_LOC_LEN_ERR; +ack_err: + ipath_send_complete(qp, wqe, status); + ipath_error_qp(qp, IB_WC_WR_FLUSH_ERR); +ack_done: + spin_unlock_irqrestore(&qp->s_lock, flags); +bail: + return; +} + +/** + * ipath_rc_rcv_error - process an incoming duplicate or error RC packet + * @dev: the device this packet came in on + * @ohdr: the other headers for this packet + * @data: the packet data + * @qp: the QP for this packet + * @opcode: the opcode for this packet + * @psn: the packet sequence number for this packet + * @diff: the difference between the PSN and the expected PSN + * @header_in_data: true if part of the header data is in the data buffer + * + * This is called from ipath_rc_rcv() to process an unexpected + * incoming RC packet for the given QP. + * Called at interrupt level. + * Return 1 if no more processing is needed; otherwise return 0 to + * schedule a response to be sent. + */ +static inline int ipath_rc_rcv_error(struct ipath_ibdev *dev, + struct ipath_other_headers *ohdr, + void *data, + struct ipath_qp *qp, + u32 opcode, + u32 psn, + int diff, + int header_in_data) +{ + struct ipath_ack_entry *e; + u8 i, prev; + int old_req; + unsigned long flags; + + if (diff > 0) { + /* + * Packet sequence error. + * A NAK will ACK earlier sends and RDMA writes. + * Don't queue the NAK if we already sent one. + */ + if (!qp->r_nak_state) { + qp->r_nak_state = IB_NAK_PSN_ERROR; + /* Use the expected PSN. */ + qp->r_ack_psn = qp->r_psn; + goto send_ack; + } + goto done; + } + + /* + * Handle a duplicate request. Don't re-execute SEND, RDMA + * write or atomic op. Don't NAK errors, just silently drop + * the duplicate request. Note that r_sge, r_len, and + * r_rcv_len may be in use so don't modify them. + * + * We are supposed to ACK the earliest duplicate PSN but we + * can coalesce an outstanding duplicate ACK. We have to + * send the earliest so that RDMA reads can be restarted at + * the requester's expected PSN. + * + * First, find where this duplicate PSN falls within the + * ACKs previously sent. + */ + psn &= IPATH_PSN_MASK; + e = NULL; + old_req = 1; + + spin_lock_irqsave(&qp->s_lock, flags); + /* Double check we can process this now that we hold the s_lock. */ + if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK)) + goto unlock_done; + + for (i = qp->r_head_ack_queue; ; i = prev) { + if (i == qp->s_tail_ack_queue) + old_req = 0; + if (i) + prev = i - 1; + else + prev = IPATH_MAX_RDMA_ATOMIC; + if (prev == qp->r_head_ack_queue) { + e = NULL; + break; + } + e = &qp->s_ack_queue[prev]; + if (!e->opcode) { + e = NULL; + break; + } + if (ipath_cmp24(psn, e->psn) >= 0) { + if (prev == qp->s_tail_ack_queue) + old_req = 0; + break; + } + } + switch (opcode) { + case OP(RDMA_READ_REQUEST): { + struct ib_reth *reth; + u32 offset; + u32 len; + + /* + * If we didn't find the RDMA read request in the ack queue, + * or the send tasklet is already backed up to send an + * earlier entry, we can ignore this request. + */ + if (!e || e->opcode != OP(RDMA_READ_REQUEST) || old_req) + goto unlock_done; + /* RETH comes after BTH */ + if (!header_in_data) + reth = &ohdr->u.rc.reth; + else { + reth = (struct ib_reth *)data; + data += sizeof(*reth); + } + /* + * Address range must be a subset of the original + * request and start on pmtu boundaries. + * We reuse the old ack_queue slot since the requester + * should not back up and request an earlier PSN for the + * same request. + */ + offset = ((psn - e->psn) & IPATH_PSN_MASK) * + ib_mtu_enum_to_int(qp->path_mtu); + len = be32_to_cpu(reth->length); + if (unlikely(offset + len > e->rdma_sge.sge.sge_length)) + goto unlock_done; + if (len != 0) { + u32 rkey = be32_to_cpu(reth->rkey); + u64 vaddr = be64_to_cpu(reth->vaddr); + int ok; + + ok = ipath_rkey_ok(qp, &e->rdma_sge, + len, vaddr, rkey, + IB_ACCESS_REMOTE_READ); + if (unlikely(!ok)) + goto unlock_done; + } else { + e->rdma_sge.sg_list = NULL; + e->rdma_sge.num_sge = 0; + e->rdma_sge.sge.mr = NULL; + e->rdma_sge.sge.vaddr = NULL; + e->rdma_sge.sge.length = 0; + e->rdma_sge.sge.sge_length = 0; + } + e->psn = psn; + qp->s_ack_state = OP(ACKNOWLEDGE); + qp->s_tail_ack_queue = prev; + break; + } + + case OP(COMPARE_SWAP): + case OP(FETCH_ADD): { + /* + * If we didn't find the atomic request in the ack queue + * or the send tasklet is already backed up to send an + * earlier entry, we can ignore this request. + */ + if (!e || e->opcode != (u8) opcode || old_req) + goto unlock_done; + qp->s_ack_state = OP(ACKNOWLEDGE); + qp->s_tail_ack_queue = prev; + break; + } + + default: + if (old_req) + goto unlock_done; + /* + * Resend the most recent ACK if this request is + * after all the previous RDMA reads and atomics. + */ + if (i == qp->r_head_ack_queue) { + spin_unlock_irqrestore(&qp->s_lock, flags); + qp->r_nak_state = 0; + qp->r_ack_psn = qp->r_psn - 1; + goto send_ack; + } + /* + * Try to send a simple ACK to work around a Mellanox bug + * which doesn't accept a RDMA read response or atomic + * response as an ACK for earlier SENDs or RDMA writes. + */ + if (qp->r_head_ack_queue == qp->s_tail_ack_queue && + !(qp->s_flags & IPATH_S_ACK_PENDING) && + qp->s_ack_state == OP(ACKNOWLEDGE)) { + spin_unlock_irqrestore(&qp->s_lock, flags); + qp->r_nak_state = 0; + qp->r_ack_psn = qp->s_ack_queue[i].psn - 1; + goto send_ack; + } + /* + * Resend the RDMA read or atomic op which + * ACKs this duplicate request. + */ + qp->s_ack_state = OP(ACKNOWLEDGE); + qp->s_tail_ack_queue = i; + break; + } + qp->r_nak_state = 0; + ipath_schedule_send(qp); + +unlock_done: + spin_unlock_irqrestore(&qp->s_lock, flags); +done: + return 1; + +send_ack: + return 0; +} + +void ipath_rc_error(struct ipath_qp *qp, enum ib_wc_status err) +{ + unsigned long flags; + int lastwqe; + + spin_lock_irqsave(&qp->s_lock, flags); + lastwqe = ipath_error_qp(qp, err); + spin_unlock_irqrestore(&qp->s_lock, flags); + + if (lastwqe) { + struct ib_event ev; + + ev.device = qp->ibqp.device; + ev.element.qp = &qp->ibqp; + ev.event = IB_EVENT_QP_LAST_WQE_REACHED; + qp->ibqp.event_handler(&ev, qp->ibqp.qp_context); + } +} + +static inline void ipath_update_ack_queue(struct ipath_qp *qp, unsigned n) +{ + unsigned next; + + next = n + 1; + if (next > IPATH_MAX_RDMA_ATOMIC) + next = 0; + if (n == qp->s_tail_ack_queue) { + qp->s_tail_ack_queue = next; + qp->s_ack_state = OP(ACKNOWLEDGE); + } +} + +/** + * ipath_rc_rcv - process an incoming RC packet + * @dev: the device this packet came in on + * @hdr: the header of this packet + * @has_grh: true if the header has a GRH + * @data: the packet data + * @tlen: the packet length + * @qp: the QP for this packet + * + * This is called from ipath_qp_rcv() to process an incoming RC packet + * for the given QP. + * Called at interrupt level. + */ +void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, + int has_grh, void *data, u32 tlen, struct ipath_qp *qp) +{ + struct ipath_other_headers *ohdr; + u32 opcode; + u32 hdrsize; + u32 psn; + u32 pad; + struct ib_wc wc; + u32 pmtu = ib_mtu_enum_to_int(qp->path_mtu); + int diff; + struct ib_reth *reth; + int header_in_data; + unsigned long flags; + + /* Validate the SLID. See Ch. 9.6.1.5 */ + if (unlikely(be16_to_cpu(hdr->lrh[3]) != qp->remote_ah_attr.dlid)) + goto done; + + /* Check for GRH */ + if (!has_grh) { + ohdr = &hdr->u.oth; + hdrsize = 8 + 12; /* LRH + BTH */ + psn = be32_to_cpu(ohdr->bth[2]); + header_in_data = 0; + } else { + ohdr = &hdr->u.l.oth; + hdrsize = 8 + 40 + 12; /* LRH + GRH + BTH */ + /* + * The header with GRH is 60 bytes and the core driver sets + * the eager header buffer size to 56 bytes so the last 4 + * bytes of the BTH header (PSN) is in the data buffer. + */ + header_in_data = dev->dd->ipath_rcvhdrentsize == 16; + if (header_in_data) { + psn = be32_to_cpu(((__be32 *) data)[0]); + data += sizeof(__be32); + } else + psn = be32_to_cpu(ohdr->bth[2]); + } + + /* + * Process responses (ACKs) before anything else. Note that the + * packet sequence number will be for something in the send work + * queue rather than the expected receive packet sequence number. + * In other words, this QP is the requester. + */ + opcode = be32_to_cpu(ohdr->bth[0]) >> 24; + if (opcode >= OP(RDMA_READ_RESPONSE_FIRST) && + opcode <= OP(ATOMIC_ACKNOWLEDGE)) { + ipath_rc_rcv_resp(dev, ohdr, data, tlen, qp, opcode, psn, + hdrsize, pmtu, header_in_data); + goto done; + } + + /* Compute 24 bits worth of difference. */ + diff = ipath_cmp24(psn, qp->r_psn); + if (unlikely(diff)) { + if (ipath_rc_rcv_error(dev, ohdr, data, qp, opcode, + psn, diff, header_in_data)) + goto done; + goto send_ack; + } + + /* Check for opcode sequence errors. */ + switch (qp->r_state) { + case OP(SEND_FIRST): + case OP(SEND_MIDDLE): + if (opcode == OP(SEND_MIDDLE) || + opcode == OP(SEND_LAST) || + opcode == OP(SEND_LAST_WITH_IMMEDIATE)) + break; + goto nack_inv; + + case OP(RDMA_WRITE_FIRST): + case OP(RDMA_WRITE_MIDDLE): + if (opcode == OP(RDMA_WRITE_MIDDLE) || + opcode == OP(RDMA_WRITE_LAST) || + opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE)) + break; + goto nack_inv; + + default: + if (opcode == OP(SEND_MIDDLE) || + opcode == OP(SEND_LAST) || + opcode == OP(SEND_LAST_WITH_IMMEDIATE) || + opcode == OP(RDMA_WRITE_MIDDLE) || + opcode == OP(RDMA_WRITE_LAST) || + opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE)) + goto nack_inv; + /* + * Note that it is up to the requester to not send a new + * RDMA read or atomic operation before receiving an ACK + * for the previous operation. + */ + break; + } + + memset(&wc, 0, sizeof wc); + + /* OK, process the packet. */ + switch (opcode) { + case OP(SEND_FIRST): + if (!ipath_get_rwqe(qp, 0)) + goto rnr_nak; + qp->r_rcv_len = 0; + /* FALLTHROUGH */ + case OP(SEND_MIDDLE): + case OP(RDMA_WRITE_MIDDLE): + send_middle: + /* Check for invalid length PMTU or posted rwqe len. */ + if (unlikely(tlen != (hdrsize + pmtu + 4))) + goto nack_inv; + qp->r_rcv_len += pmtu; + if (unlikely(qp->r_rcv_len > qp->r_len)) + goto nack_inv; + ipath_copy_sge(&qp->r_sge, data, pmtu); + break; + + case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE): + /* consume RWQE */ + if (!ipath_get_rwqe(qp, 1)) + goto rnr_nak; + goto send_last_imm; + + case OP(SEND_ONLY): + case OP(SEND_ONLY_WITH_IMMEDIATE): + if (!ipath_get_rwqe(qp, 0)) + goto rnr_nak; + qp->r_rcv_len = 0; + if (opcode == OP(SEND_ONLY)) + goto send_last; + /* FALLTHROUGH */ + case OP(SEND_LAST_WITH_IMMEDIATE): + send_last_imm: + if (header_in_data) { + wc.ex.imm_data = *(__be32 *) data; + data += sizeof(__be32); + } else { + /* Immediate data comes after BTH */ + wc.ex.imm_data = ohdr->u.imm_data; + } + hdrsize += 4; + wc.wc_flags = IB_WC_WITH_IMM; + /* FALLTHROUGH */ + case OP(SEND_LAST): + case OP(RDMA_WRITE_LAST): + send_last: + /* Get the number of bytes the message was padded by. */ + pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3; + /* Check for invalid length. */ + /* XXX LAST len should be >= 1 */ + if (unlikely(tlen < (hdrsize + pad + 4))) + goto nack_inv; + /* Don't count the CRC. */ + tlen -= (hdrsize + pad + 4); + wc.byte_len = tlen + qp->r_rcv_len; + if (unlikely(wc.byte_len > qp->r_len)) + goto nack_inv; + ipath_copy_sge(&qp->r_sge, data, tlen); + qp->r_msn++; + if (!test_and_clear_bit(IPATH_R_WRID_VALID, &qp->r_aflags)) + break; + wc.wr_id = qp->r_wr_id; + wc.status = IB_WC_SUCCESS; + if (opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE) || + opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE)) + wc.opcode = IB_WC_RECV_RDMA_WITH_IMM; + else + wc.opcode = IB_WC_RECV; + wc.qp = &qp->ibqp; + wc.src_qp = qp->remote_qpn; + wc.slid = qp->remote_ah_attr.dlid; + wc.sl = qp->remote_ah_attr.sl; + /* Signal completion event if the solicited bit is set. */ + ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, + (ohdr->bth[0] & + cpu_to_be32(1 << 23)) != 0); + break; + + case OP(RDMA_WRITE_FIRST): + case OP(RDMA_WRITE_ONLY): + case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE): + if (unlikely(!(qp->qp_access_flags & + IB_ACCESS_REMOTE_WRITE))) + goto nack_inv; + /* consume RWQE */ + /* RETH comes after BTH */ + if (!header_in_data) + reth = &ohdr->u.rc.reth; + else { + reth = (struct ib_reth *)data; + data += sizeof(*reth); + } + hdrsize += sizeof(*reth); + qp->r_len = be32_to_cpu(reth->length); + qp->r_rcv_len = 0; + if (qp->r_len != 0) { + u32 rkey = be32_to_cpu(reth->rkey); + u64 vaddr = be64_to_cpu(reth->vaddr); + int ok; + + /* Check rkey & NAK */ + ok = ipath_rkey_ok(qp, &qp->r_sge, + qp->r_len, vaddr, rkey, + IB_ACCESS_REMOTE_WRITE); + if (unlikely(!ok)) + goto nack_acc; + } else { + qp->r_sge.sg_list = NULL; + qp->r_sge.sge.mr = NULL; + qp->r_sge.sge.vaddr = NULL; + qp->r_sge.sge.length = 0; + qp->r_sge.sge.sge_length = 0; + } + if (opcode == OP(RDMA_WRITE_FIRST)) + goto send_middle; + else if (opcode == OP(RDMA_WRITE_ONLY)) + goto send_last; + if (!ipath_get_rwqe(qp, 1)) + goto rnr_nak; + goto send_last_imm; + + case OP(RDMA_READ_REQUEST): { + struct ipath_ack_entry *e; + u32 len; + u8 next; + + if (unlikely(!(qp->qp_access_flags & + IB_ACCESS_REMOTE_READ))) + goto nack_inv; + next = qp->r_head_ack_queue + 1; + if (next > IPATH_MAX_RDMA_ATOMIC) + next = 0; + spin_lock_irqsave(&qp->s_lock, flags); + /* Double check we can process this while holding the s_lock. */ + if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK)) + goto unlock; + if (unlikely(next == qp->s_tail_ack_queue)) { + if (!qp->s_ack_queue[next].sent) + goto nack_inv_unlck; + ipath_update_ack_queue(qp, next); + } + e = &qp->s_ack_queue[qp->r_head_ack_queue]; + /* RETH comes after BTH */ + if (!header_in_data) + reth = &ohdr->u.rc.reth; + else { + reth = (struct ib_reth *)data; + data += sizeof(*reth); + } + len = be32_to_cpu(reth->length); + if (len) { + u32 rkey = be32_to_cpu(reth->rkey); + u64 vaddr = be64_to_cpu(reth->vaddr); + int ok; + + /* Check rkey & NAK */ + ok = ipath_rkey_ok(qp, &e->rdma_sge, len, vaddr, + rkey, IB_ACCESS_REMOTE_READ); + if (unlikely(!ok)) + goto nack_acc_unlck; + /* + * Update the next expected PSN. We add 1 later + * below, so only add the remainder here. + */ + if (len > pmtu) + qp->r_psn += (len - 1) / pmtu; + } else { + e->rdma_sge.sg_list = NULL; + e->rdma_sge.num_sge = 0; + e->rdma_sge.sge.mr = NULL; + e->rdma_sge.sge.vaddr = NULL; + e->rdma_sge.sge.length = 0; + e->rdma_sge.sge.sge_length = 0; + } + e->opcode = opcode; + e->sent = 0; + e->psn = psn; + /* + * We need to increment the MSN here instead of when we + * finish sending the result since a duplicate request would + * increment it more than once. + */ + qp->r_msn++; + qp->r_psn++; + qp->r_state = opcode; + qp->r_nak_state = 0; + qp->r_head_ack_queue = next; + + /* Schedule the send tasklet. */ + ipath_schedule_send(qp); + + goto unlock; + } + + case OP(COMPARE_SWAP): + case OP(FETCH_ADD): { + struct ib_atomic_eth *ateth; + struct ipath_ack_entry *e; + u64 vaddr; + atomic64_t *maddr; + u64 sdata; + u32 rkey; + u8 next; + + if (unlikely(!(qp->qp_access_flags & + IB_ACCESS_REMOTE_ATOMIC))) + goto nack_inv; + next = qp->r_head_ack_queue + 1; + if (next > IPATH_MAX_RDMA_ATOMIC) + next = 0; + spin_lock_irqsave(&qp->s_lock, flags); + /* Double check we can process this while holding the s_lock. */ + if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK)) + goto unlock; + if (unlikely(next == qp->s_tail_ack_queue)) { + if (!qp->s_ack_queue[next].sent) + goto nack_inv_unlck; + ipath_update_ack_queue(qp, next); + } + if (!header_in_data) + ateth = &ohdr->u.atomic_eth; + else + ateth = (struct ib_atomic_eth *)data; + vaddr = ((u64) be32_to_cpu(ateth->vaddr[0]) << 32) | + be32_to_cpu(ateth->vaddr[1]); + if (unlikely(vaddr & (sizeof(u64) - 1))) + goto nack_inv_unlck; + rkey = be32_to_cpu(ateth->rkey); + /* Check rkey & NAK */ + if (unlikely(!ipath_rkey_ok(qp, &qp->r_sge, + sizeof(u64), vaddr, rkey, + IB_ACCESS_REMOTE_ATOMIC))) + goto nack_acc_unlck; + /* Perform atomic OP and save result. */ + maddr = (atomic64_t *) qp->r_sge.sge.vaddr; + sdata = be64_to_cpu(ateth->swap_data); + e = &qp->s_ack_queue[qp->r_head_ack_queue]; + e->atomic_data = (opcode == OP(FETCH_ADD)) ? + (u64) atomic64_add_return(sdata, maddr) - sdata : + (u64) cmpxchg((u64 *) qp->r_sge.sge.vaddr, + be64_to_cpu(ateth->compare_data), + sdata); + e->opcode = opcode; + e->sent = 0; + e->psn = psn & IPATH_PSN_MASK; + qp->r_msn++; + qp->r_psn++; + qp->r_state = opcode; + qp->r_nak_state = 0; + qp->r_head_ack_queue = next; + + /* Schedule the send tasklet. */ + ipath_schedule_send(qp); + + goto unlock; + } + + default: + /* NAK unknown opcodes. */ + goto nack_inv; + } + qp->r_psn++; + qp->r_state = opcode; + qp->r_ack_psn = psn; + qp->r_nak_state = 0; + /* Send an ACK if requested or required. */ + if (psn & (1 << 31)) + goto send_ack; + goto done; + +rnr_nak: + qp->r_nak_state = IB_RNR_NAK | qp->r_min_rnr_timer; + qp->r_ack_psn = qp->r_psn; + goto send_ack; + +nack_inv_unlck: + spin_unlock_irqrestore(&qp->s_lock, flags); +nack_inv: + ipath_rc_error(qp, IB_WC_LOC_QP_OP_ERR); + qp->r_nak_state = IB_NAK_INVALID_REQUEST; + qp->r_ack_psn = qp->r_psn; + goto send_ack; + +nack_acc_unlck: + spin_unlock_irqrestore(&qp->s_lock, flags); +nack_acc: + ipath_rc_error(qp, IB_WC_LOC_PROT_ERR); + qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR; + qp->r_ack_psn = qp->r_psn; +send_ack: + send_rc_ack(qp); + goto done; + +unlock: + spin_unlock_irqrestore(&qp->s_lock, flags); +done: + return; +} diff --git a/drivers/infiniband/hw/ipath/ipath_registers.h b/drivers/infiniband/hw/ipath/ipath_registers.h new file mode 100644 index 0000000..8f44d0c --- /dev/null +++ b/drivers/infiniband/hw/ipath/ipath_registers.h @@ -0,0 +1,512 @@ +/* + * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved. + * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _IPATH_REGISTERS_H +#define _IPATH_REGISTERS_H + +/* + * This file should only be included by kernel source, and by the diags. It + * defines the registers, and their contents, for InfiniPath chips. + */ + +/* + * These are the InfiniPath register and buffer bit definitions, + * that are visible to software, and needed only by the kernel + * and diag code. A few, that are visible to protocol and user + * code are in ipath_common.h. Some bits are specific + * to a given chip implementation, and have been moved to the + * chip-specific source file + */ + +/* kr_revision bits */ +#define INFINIPATH_R_CHIPREVMINOR_MASK 0xFF +#define INFINIPATH_R_CHIPREVMINOR_SHIFT 0 +#define INFINIPATH_R_CHIPREVMAJOR_MASK 0xFF +#define INFINIPATH_R_CHIPREVMAJOR_SHIFT 8 +#define INFINIPATH_R_ARCH_MASK 0xFF +#define INFINIPATH_R_ARCH_SHIFT 16 +#define INFINIPATH_R_SOFTWARE_MASK 0xFF +#define INFINIPATH_R_SOFTWARE_SHIFT 24 +#define INFINIPATH_R_BOARDID_MASK 0xFF +#define INFINIPATH_R_BOARDID_SHIFT 32 + +/* kr_control bits */ +#define INFINIPATH_C_FREEZEMODE 0x00000002 +#define INFINIPATH_C_LINKENABLE 0x00000004 + +/* kr_sendctrl bits */ +#define INFINIPATH_S_DISARMPIOBUF_SHIFT 16 +#define INFINIPATH_S_UPDTHRESH_SHIFT 24 +#define INFINIPATH_S_UPDTHRESH_MASK 0x1f + +#define IPATH_S_ABORT 0 +#define IPATH_S_PIOINTBUFAVAIL 1 +#define IPATH_S_PIOBUFAVAILUPD 2 +#define IPATH_S_PIOENABLE 3 +#define IPATH_S_SDMAINTENABLE 9 +#define IPATH_S_SDMASINGLEDESCRIPTOR 10 +#define IPATH_S_SDMAENABLE 11 +#define IPATH_S_SDMAHALT 12 +#define IPATH_S_DISARM 31 + +#define INFINIPATH_S_ABORT (1U << IPATH_S_ABORT) +#define INFINIPATH_S_PIOINTBUFAVAIL (1U << IPATH_S_PIOINTBUFAVAIL) +#define INFINIPATH_S_PIOBUFAVAILUPD (1U << IPATH_S_PIOBUFAVAILUPD) +#define INFINIPATH_S_PIOENABLE (1U << IPATH_S_PIOENABLE) +#define INFINIPATH_S_SDMAINTENABLE (1U << IPATH_S_SDMAINTENABLE) +#define INFINIPATH_S_SDMASINGLEDESCRIPTOR \ + (1U << IPATH_S_SDMASINGLEDESCRIPTOR) +#define INFINIPATH_S_SDMAENABLE (1U << IPATH_S_SDMAENABLE) +#define INFINIPATH_S_SDMAHALT (1U << IPATH_S_SDMAHALT) +#define INFINIPATH_S_DISARM (1U << IPATH_S_DISARM) + +/* kr_rcvctrl bits that are the same on multiple chips */ +#define INFINIPATH_R_PORTENABLE_SHIFT 0 +#define INFINIPATH_R_QPMAP_ENABLE (1ULL << 38) + +/* kr_intstatus, kr_intclear, kr_intmask bits */ +#define INFINIPATH_I_SDMAINT 0x8000000000000000ULL +#define INFINIPATH_I_SDMADISABLED 0x4000000000000000ULL +#define INFINIPATH_I_ERROR 0x0000000080000000ULL +#define INFINIPATH_I_SPIOSENT 0x0000000040000000ULL +#define INFINIPATH_I_SPIOBUFAVAIL 0x0000000020000000ULL +#define INFINIPATH_I_GPIO 0x0000000010000000ULL +#define INFINIPATH_I_JINT 0x0000000004000000ULL + +/* kr_errorstatus, kr_errorclear, kr_errormask bits */ +#define INFINIPATH_E_RFORMATERR 0x0000000000000001ULL +#define INFINIPATH_E_RVCRC 0x0000000000000002ULL +#define INFINIPATH_E_RICRC 0x0000000000000004ULL +#define INFINIPATH_E_RMINPKTLEN 0x0000000000000008ULL +#define INFINIPATH_E_RMAXPKTLEN 0x0000000000000010ULL +#define INFINIPATH_E_RLONGPKTLEN 0x0000000000000020ULL +#define INFINIPATH_E_RSHORTPKTLEN 0x0000000000000040ULL +#define INFINIPATH_E_RUNEXPCHAR 0x0000000000000080ULL +#define INFINIPATH_E_RUNSUPVL 0x0000000000000100ULL +#define INFINIPATH_E_REBP 0x0000000000000200ULL +#define INFINIPATH_E_RIBFLOW 0x0000000000000400ULL +#define INFINIPATH_E_RBADVERSION 0x0000000000000800ULL +#define INFINIPATH_E_RRCVEGRFULL 0x0000000000001000ULL +#define INFINIPATH_E_RRCVHDRFULL 0x0000000000002000ULL +#define INFINIPATH_E_RBADTID 0x0000000000004000ULL +#define INFINIPATH_E_RHDRLEN 0x0000000000008000ULL +#define INFINIPATH_E_RHDR 0x0000000000010000ULL +#define INFINIPATH_E_RIBLOSTLINK 0x0000000000020000ULL +#define INFINIPATH_E_SENDSPECIALTRIGGER 0x0000000008000000ULL +#define INFINIPATH_E_SDMADISABLED 0x0000000010000000ULL +#define INFINIPATH_E_SMINPKTLEN 0x0000000020000000ULL +#define INFINIPATH_E_SMAXPKTLEN 0x0000000040000000ULL +#define INFINIPATH_E_SUNDERRUN 0x0000000080000000ULL +#define INFINIPATH_E_SPKTLEN 0x0000000100000000ULL +#define INFINIPATH_E_SDROPPEDSMPPKT 0x0000000200000000ULL +#define INFINIPATH_E_SDROPPEDDATAPKT 0x0000000400000000ULL +#define INFINIPATH_E_SPIOARMLAUNCH 0x0000000800000000ULL +#define INFINIPATH_E_SUNEXPERRPKTNUM 0x0000001000000000ULL +#define INFINIPATH_E_SUNSUPVL 0x0000002000000000ULL +#define INFINIPATH_E_SENDBUFMISUSE 0x0000004000000000ULL +#define INFINIPATH_E_SDMAGENMISMATCH 0x0000008000000000ULL +#define INFINIPATH_E_SDMAOUTOFBOUND 0x0000010000000000ULL +#define INFINIPATH_E_SDMATAILOUTOFBOUND 0x0000020000000000ULL +#define INFINIPATH_E_SDMABASE 0x0000040000000000ULL +#define INFINIPATH_E_SDMA1STDESC 0x0000080000000000ULL +#define INFINIPATH_E_SDMARPYTAG 0x0000100000000000ULL +#define INFINIPATH_E_SDMADWEN 0x0000200000000000ULL +#define INFINIPATH_E_SDMAMISSINGDW 0x0000400000000000ULL +#define INFINIPATH_E_SDMAUNEXPDATA 0x0000800000000000ULL +#define INFINIPATH_E_IBSTATUSCHANGED 0x0001000000000000ULL +#define INFINIPATH_E_INVALIDADDR 0x0002000000000000ULL +#define INFINIPATH_E_RESET 0x0004000000000000ULL +#define INFINIPATH_E_HARDWARE 0x0008000000000000ULL +#define INFINIPATH_E_SDMADESCADDRMISALIGN 0x0010000000000000ULL +#define INFINIPATH_E_INVALIDEEPCMD 0x0020000000000000ULL + +/* + * this is used to print "common" packet errors only when the + * __IPATH_ERRPKTDBG bit is set in ipath_debug. + */ +#define INFINIPATH_E_PKTERRS ( INFINIPATH_E_SPKTLEN \ + | INFINIPATH_E_SDROPPEDDATAPKT | INFINIPATH_E_RVCRC \ + | INFINIPATH_E_RICRC | INFINIPATH_E_RSHORTPKTLEN \ + | INFINIPATH_E_REBP ) + +/* Convenience for decoding Send DMA errors */ +#define INFINIPATH_E_SDMAERRS ( \ + INFINIPATH_E_SDMAGENMISMATCH | INFINIPATH_E_SDMAOUTOFBOUND | \ + INFINIPATH_E_SDMATAILOUTOFBOUND | INFINIPATH_E_SDMABASE | \ + INFINIPATH_E_SDMA1STDESC | INFINIPATH_E_SDMARPYTAG | \ + INFINIPATH_E_SDMADWEN | INFINIPATH_E_SDMAMISSINGDW | \ + INFINIPATH_E_SDMAUNEXPDATA | \ + INFINIPATH_E_SDMADESCADDRMISALIGN | \ + INFINIPATH_E_SDMADISABLED | \ + INFINIPATH_E_SENDBUFMISUSE) + +/* kr_hwerrclear, kr_hwerrmask, kr_hwerrstatus, bits */ +/* TXEMEMPARITYERR bit 0: PIObuf, 1: PIOpbc, 2: launchfifo + * RXEMEMPARITYERR bit 0: rcvbuf, 1: lookupq, 2: expTID, 3: eagerTID + * bit 4: flag buffer, 5: datainfo, 6: header info */ +#define INFINIPATH_HWE_TXEMEMPARITYERR_MASK 0xFULL +#define INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT 40 +#define INFINIPATH_HWE_RXEMEMPARITYERR_MASK 0x7FULL +#define INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT 44 +#define INFINIPATH_HWE_IBCBUSTOSPCPARITYERR 0x4000000000000000ULL +#define INFINIPATH_HWE_IBCBUSFRSPCPARITYERR 0x8000000000000000ULL +/* txe mem parity errors (shift by INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT) */ +#define INFINIPATH_HWE_TXEMEMPARITYERR_PIOBUF 0x1ULL +#define INFINIPATH_HWE_TXEMEMPARITYERR_PIOPBC 0x2ULL +#define INFINIPATH_HWE_TXEMEMPARITYERR_PIOLAUNCHFIFO 0x4ULL +/* rxe mem parity errors (shift by INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT) */ +#define INFINIPATH_HWE_RXEMEMPARITYERR_RCVBUF 0x01ULL +#define INFINIPATH_HWE_RXEMEMPARITYERR_LOOKUPQ 0x02ULL +#define INFINIPATH_HWE_RXEMEMPARITYERR_EXPTID 0x04ULL +#define INFINIPATH_HWE_RXEMEMPARITYERR_EAGERTID 0x08ULL +#define INFINIPATH_HWE_RXEMEMPARITYERR_FLAGBUF 0x10ULL +#define INFINIPATH_HWE_RXEMEMPARITYERR_DATAINFO 0x20ULL +#define INFINIPATH_HWE_RXEMEMPARITYERR_HDRINFO 0x40ULL +/* waldo specific -- find the rest in ipath_6110.c */ +#define INFINIPATH_HWE_RXDSYNCMEMPARITYERR 0x0000000400000000ULL +/* 6120/7220 specific -- find the rest in ipath_6120.c and ipath_7220.c */ +#define INFINIPATH_HWE_MEMBISTFAILED 0x0040000000000000ULL + +/* kr_hwdiagctrl bits */ +#define INFINIPATH_DC_FORCETXEMEMPARITYERR_MASK 0xFULL +#define INFINIPATH_DC_FORCETXEMEMPARITYERR_SHIFT 40 +#define INFINIPATH_DC_FORCERXEMEMPARITYERR_MASK 0x7FULL +#define INFINIPATH_DC_FORCERXEMEMPARITYERR_SHIFT 44 +#define INFINIPATH_DC_FORCERXDSYNCMEMPARITYERR 0x0000000400000000ULL +#define INFINIPATH_DC_COUNTERDISABLE 0x1000000000000000ULL +#define INFINIPATH_DC_COUNTERWREN 0x2000000000000000ULL +#define INFINIPATH_DC_FORCEIBCBUSTOSPCPARITYERR 0x4000000000000000ULL +#define INFINIPATH_DC_FORCEIBCBUSFRSPCPARITYERR 0x8000000000000000ULL + +/* kr_ibcctrl bits */ +#define INFINIPATH_IBCC_FLOWCTRLPERIOD_MASK 0xFFULL +#define INFINIPATH_IBCC_FLOWCTRLPERIOD_SHIFT 0 +#define INFINIPATH_IBCC_FLOWCTRLWATERMARK_MASK 0xFFULL +#define INFINIPATH_IBCC_FLOWCTRLWATERMARK_SHIFT 8 +#define INFINIPATH_IBCC_LINKINITCMD_MASK 0x3ULL +#define INFINIPATH_IBCC_LINKINITCMD_DISABLE 1 +/* cycle through TS1/TS2 till OK */ +#define INFINIPATH_IBCC_LINKINITCMD_POLL 2 +/* wait for TS1, then go on */ +#define INFINIPATH_IBCC_LINKINITCMD_SLEEP 3 +#define INFINIPATH_IBCC_LINKINITCMD_SHIFT 16 +#define INFINIPATH_IBCC_LINKCMD_MASK 0x3ULL +#define INFINIPATH_IBCC_LINKCMD_DOWN 1 /* move to 0x11 */ +#define INFINIPATH_IBCC_LINKCMD_ARMED 2 /* move to 0x21 */ +#define INFINIPATH_IBCC_LINKCMD_ACTIVE 3 /* move to 0x31 */ +#define INFINIPATH_IBCC_LINKCMD_SHIFT 18 +#define INFINIPATH_IBCC_MAXPKTLEN_MASK 0x7FFULL +#define INFINIPATH_IBCC_MAXPKTLEN_SHIFT 20 +#define INFINIPATH_IBCC_PHYERRTHRESHOLD_MASK 0xFULL +#define INFINIPATH_IBCC_PHYERRTHRESHOLD_SHIFT 32 +#define INFINIPATH_IBCC_OVERRUNTHRESHOLD_MASK 0xFULL +#define INFINIPATH_IBCC_OVERRUNTHRESHOLD_SHIFT 36 +#define INFINIPATH_IBCC_CREDITSCALE_MASK 0x7ULL +#define INFINIPATH_IBCC_CREDITSCALE_SHIFT 40 +#define INFINIPATH_IBCC_LOOPBACK 0x8000000000000000ULL +#define INFINIPATH_IBCC_LINKDOWNDEFAULTSTATE 0x4000000000000000ULL + +/* kr_ibcstatus bits */ +#define INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT 0 +#define INFINIPATH_IBCS_LINKSTATE_MASK 0x7 + +#define INFINIPATH_IBCS_TXREADY 0x40000000 +#define INFINIPATH_IBCS_TXCREDITOK 0x80000000 +/* link training states (shift by + INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT) */ +#define INFINIPATH_IBCS_LT_STATE_DISABLED 0x00 +#define INFINIPATH_IBCS_LT_STATE_LINKUP 0x01 +#define INFINIPATH_IBCS_LT_STATE_POLLACTIVE 0x02 +#define INFINIPATH_IBCS_LT_STATE_POLLQUIET 0x03 +#define INFINIPATH_IBCS_LT_STATE_SLEEPDELAY 0x04 +#define INFINIPATH_IBCS_LT_STATE_SLEEPQUIET 0x05 +#define INFINIPATH_IBCS_LT_STATE_CFGDEBOUNCE 0x08 +#define INFINIPATH_IBCS_LT_STATE_CFGRCVFCFG 0x09 +#define INFINIPATH_IBCS_LT_STATE_CFGWAITRMT 0x0a +#define INFINIPATH_IBCS_LT_STATE_CFGIDLE 0x0b +#define INFINIPATH_IBCS_LT_STATE_RECOVERRETRAIN 0x0c +#define INFINIPATH_IBCS_LT_STATE_RECOVERWAITRMT 0x0e +#define INFINIPATH_IBCS_LT_STATE_RECOVERIDLE 0x0f +/* link state machine states (shift by ibcs_ls_shift) */ +#define INFINIPATH_IBCS_L_STATE_DOWN 0x0 +#define INFINIPATH_IBCS_L_STATE_INIT 0x1 +#define INFINIPATH_IBCS_L_STATE_ARM 0x2 +#define INFINIPATH_IBCS_L_STATE_ACTIVE 0x3 +#define INFINIPATH_IBCS_L_STATE_ACT_DEFER 0x4 + + +/* kr_extstatus bits */ +#define INFINIPATH_EXTS_SERDESPLLLOCK 0x1 +#define INFINIPATH_EXTS_GPIOIN_MASK 0xFFFFULL +#define INFINIPATH_EXTS_GPIOIN_SHIFT 48 + +/* kr_extctrl bits */ +#define INFINIPATH_EXTC_GPIOINVERT_MASK 0xFFFFULL +#define INFINIPATH_EXTC_GPIOINVERT_SHIFT 32 +#define INFINIPATH_EXTC_GPIOOE_MASK 0xFFFFULL +#define INFINIPATH_EXTC_GPIOOE_SHIFT 48 +#define INFINIPATH_EXTC_SERDESENABLE 0x80000000ULL +#define INFINIPATH_EXTC_SERDESCONNECT 0x40000000ULL +#define INFINIPATH_EXTC_SERDESENTRUNKING 0x20000000ULL +#define INFINIPATH_EXTC_SERDESDISRXFIFO 0x10000000ULL +#define INFINIPATH_EXTC_SERDESENPLPBK1 0x08000000ULL +#define INFINIPATH_EXTC_SERDESENPLPBK2 0x04000000ULL +#define INFINIPATH_EXTC_SERDESENENCDEC 0x02000000ULL +#define INFINIPATH_EXTC_LED1SECPORT_ON 0x00000020ULL +#define INFINIPATH_EXTC_LED2SECPORT_ON 0x00000010ULL +#define INFINIPATH_EXTC_LED1PRIPORT_ON 0x00000008ULL +#define INFINIPATH_EXTC_LED2PRIPORT_ON 0x00000004ULL +#define INFINIPATH_EXTC_LEDGBLOK_ON 0x00000002ULL +#define INFINIPATH_EXTC_LEDGBLERR_OFF 0x00000001ULL + +/* kr_partitionkey bits */ +#define INFINIPATH_PKEY_SIZE 16 +#define INFINIPATH_PKEY_MASK 0xFFFF +#define INFINIPATH_PKEY_DEFAULT_PKEY 0xFFFF + +/* kr_serdesconfig0 bits */ +#define INFINIPATH_SERDC0_RESET_MASK 0xfULL /* overal reset bits */ +#define INFINIPATH_SERDC0_RESET_PLL 0x10000000ULL /* pll reset */ +/* tx idle enables (per lane) */ +#define INFINIPATH_SERDC0_TXIDLE 0xF000ULL +/* rx detect enables (per lane) */ +#define INFINIPATH_SERDC0_RXDETECT_EN 0xF0000ULL +/* L1 Power down; use with RXDETECT, Otherwise not used on IB side */ +#define INFINIPATH_SERDC0_L1PWR_DN 0xF0ULL + +/* common kr_xgxsconfig bits (or safe in all, even if not implemented) */ +#define INFINIPATH_XGXS_RX_POL_SHIFT 19 +#define INFINIPATH_XGXS_RX_POL_MASK 0xfULL + + +/* + * IPATH_PIO_MAXIBHDR is the max IB header size allowed for in our + * PIO send buffers. This is well beyond anything currently + * defined in the InfiniBand spec. + */ +#define IPATH_PIO_MAXIBHDR 128 + +typedef u64 ipath_err_t; + +/* The following change with the type of device, so + * need to be part of the ipath_devdata struct, or + * we could have problems plugging in devices of + * different types (e.g. one HT, one PCIE) + * in one system, to be managed by one driver. + * On the other hand, this file is may also be included + * by other code, so leave the declarations here + * temporarily. Minor footprint issue if common-model + * linker used, none if C89+ linker used. + */ + +/* mask of defined bits for various registers */ +extern u64 infinipath_i_bitsextant; +extern ipath_err_t infinipath_e_bitsextant, infinipath_hwe_bitsextant; + +/* masks that are different in various chips, or only exist in some chips */ +extern u32 infinipath_i_rcvavail_mask, infinipath_i_rcvurg_mask; + +/* + * These are the infinipath general register numbers (not offsets). + * The kernel registers are used directly, those beyond the kernel + * registers are calculated from one of the base registers. The use of + * an integer type doesn't allow type-checking as thorough as, say, + * an enum but allows for better hiding of chip differences. + */ +typedef const u16 ipath_kreg, /* infinipath general registers */ + ipath_creg, /* infinipath counter registers */ + ipath_sreg; /* kernel-only, infinipath send registers */ + +/* + * These are the chip registers common to all infinipath chips, and + * used both by the kernel and the diagnostics or other user code. + * They are all implemented such that 64 bit accesses work. + * Some implement no more than 32 bits. Because 64 bit reads + * require 2 HT cmds on opteron, we access those with 32 bit + * reads for efficiency (they are written as 64 bits, since + * the extra 32 bits are nearly free on writes, and it slightly reduces + * complexity). The rest are all accessed as 64 bits. + */ +struct ipath_kregs { + /* These are the 32 bit group */ + ipath_kreg kr_control; + ipath_kreg kr_counterregbase; + ipath_kreg kr_intmask; + ipath_kreg kr_intstatus; + ipath_kreg kr_pagealign; + ipath_kreg kr_portcnt; + ipath_kreg kr_rcvtidbase; + ipath_kreg kr_rcvtidcnt; + ipath_kreg kr_rcvegrbase; + ipath_kreg kr_rcvegrcnt; + ipath_kreg kr_scratch; + ipath_kreg kr_sendctrl; + ipath_kreg kr_sendpiobufbase; + ipath_kreg kr_sendpiobufcnt; + ipath_kreg kr_sendpiosize; + ipath_kreg kr_sendregbase; + ipath_kreg kr_userregbase; + /* These are the 64 bit group */ + ipath_kreg kr_debugport; + ipath_kreg kr_debugportselect; + ipath_kreg kr_errorclear; + ipath_kreg kr_errormask; + ipath_kreg kr_errorstatus; + ipath_kreg kr_extctrl; + ipath_kreg kr_extstatus; + ipath_kreg kr_gpio_clear; + ipath_kreg kr_gpio_mask; + ipath_kreg kr_gpio_out; + ipath_kreg kr_gpio_status; + ipath_kreg kr_hwdiagctrl; + ipath_kreg kr_hwerrclear; + ipath_kreg kr_hwerrmask; + ipath_kreg kr_hwerrstatus; + ipath_kreg kr_ibcctrl; + ipath_kreg kr_ibcstatus; + ipath_kreg kr_intblocked; + ipath_kreg kr_intclear; + ipath_kreg kr_interruptconfig; + ipath_kreg kr_mdio; + ipath_kreg kr_partitionkey; + ipath_kreg kr_rcvbthqp; + ipath_kreg kr_rcvbufbase; + ipath_kreg kr_rcvbufsize; + ipath_kreg kr_rcvctrl; + ipath_kreg kr_rcvhdrcnt; + ipath_kreg kr_rcvhdrentsize; + ipath_kreg kr_rcvhdrsize; + ipath_kreg kr_rcvintmembase; + ipath_kreg kr_rcvintmemsize; + ipath_kreg kr_revision; + ipath_kreg kr_sendbuffererror; + ipath_kreg kr_sendpioavailaddr; + ipath_kreg kr_serdesconfig0; + ipath_kreg kr_serdesconfig1; + ipath_kreg kr_serdesstatus; + ipath_kreg kr_txintmembase; + ipath_kreg kr_txintmemsize; + ipath_kreg kr_xgxsconfig; + ipath_kreg kr_ibpllcfg; + /* use these two (and the following N ports) only with + * ipath_k*_kreg64_port(); not *kreg64() */ + ipath_kreg kr_rcvhdraddr; + ipath_kreg kr_rcvhdrtailaddr; + + /* remaining registers are not present on all types of infinipath + chips */ + ipath_kreg kr_rcvpktledcnt; + ipath_kreg kr_pcierbuftestreg0; + ipath_kreg kr_pcierbuftestreg1; + ipath_kreg kr_pcieq0serdesconfig0; + ipath_kreg kr_pcieq0serdesconfig1; + ipath_kreg kr_pcieq0serdesstatus; + ipath_kreg kr_pcieq1serdesconfig0; + ipath_kreg kr_pcieq1serdesconfig1; + ipath_kreg kr_pcieq1serdesstatus; + ipath_kreg kr_hrtbt_guid; + ipath_kreg kr_ibcddrctrl; + ipath_kreg kr_ibcddrstatus; + ipath_kreg kr_jintreload; + + /* send dma related regs */ + ipath_kreg kr_senddmabase; + ipath_kreg kr_senddmalengen; + ipath_kreg kr_senddmatail; + ipath_kreg kr_senddmahead; + ipath_kreg kr_senddmaheadaddr; + ipath_kreg kr_senddmabufmask0; + ipath_kreg kr_senddmabufmask1; + ipath_kreg kr_senddmabufmask2; + ipath_kreg kr_senddmastatus; + + /* SerDes related regs (IBA7220-only) */ + ipath_kreg kr_ibserdesctrl; + ipath_kreg kr_ib_epbacc; + ipath_kreg kr_ib_epbtrans; + ipath_kreg kr_pcie_epbacc; + ipath_kreg kr_pcie_epbtrans; + ipath_kreg kr_ib_ddsrxeq; +}; + +struct ipath_cregs { + ipath_creg cr_badformatcnt; + ipath_creg cr_erricrccnt; + ipath_creg cr_errlinkcnt; + ipath_creg cr_errlpcrccnt; + ipath_creg cr_errpkey; + ipath_creg cr_errrcvflowctrlcnt; + ipath_creg cr_err_rlencnt; + ipath_creg cr_errslencnt; + ipath_creg cr_errtidfull; + ipath_creg cr_errtidvalid; + ipath_creg cr_errvcrccnt; + ipath_creg cr_ibstatuschange; + ipath_creg cr_intcnt; + ipath_creg cr_invalidrlencnt; + ipath_creg cr_invalidslencnt; + ipath_creg cr_lbflowstallcnt; + ipath_creg cr_iblinkdowncnt; + ipath_creg cr_iblinkerrrecovcnt; + ipath_creg cr_ibsymbolerrcnt; + ipath_creg cr_pktrcvcnt; + ipath_creg cr_pktrcvflowctrlcnt; + ipath_creg cr_pktsendcnt; + ipath_creg cr_pktsendflowcnt; + ipath_creg cr_portovflcnt; + ipath_creg cr_rcvebpcnt; + ipath_creg cr_rcvovflcnt; + ipath_creg cr_rxdroppktcnt; + ipath_creg cr_senddropped; + ipath_creg cr_sendstallcnt; + ipath_creg cr_sendunderruncnt; + ipath_creg cr_unsupvlcnt; + ipath_creg cr_wordrcvcnt; + ipath_creg cr_wordsendcnt; + ipath_creg cr_vl15droppedpktcnt; + ipath_creg cr_rxotherlocalphyerrcnt; + ipath_creg cr_excessbufferovflcnt; + ipath_creg cr_locallinkintegrityerrcnt; + ipath_creg cr_rxvlerrcnt; + ipath_creg cr_rxdlidfltrcnt; + ipath_creg cr_psstat; + ipath_creg cr_psstart; + ipath_creg cr_psinterval; + ipath_creg cr_psrcvdatacount; + ipath_creg cr_psrcvpktscount; + ipath_creg cr_psxmitdatacount; + ipath_creg cr_psxmitpktscount; + ipath_creg cr_psxmitwaitcount; +}; + +#endif /* _IPATH_REGISTERS_H */ diff --git a/drivers/infiniband/hw/ipath/ipath_ruc.c b/drivers/infiniband/hw/ipath/ipath_ruc.c new file mode 100644 index 0000000..1f95bba --- /dev/null +++ b/drivers/infiniband/hw/ipath/ipath_ruc.c @@ -0,0 +1,734 @@ +/* + * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved. + * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include + +#include "ipath_verbs.h" +#include "ipath_kernel.h" + +/* + * Convert the AETH RNR timeout code into the number of milliseconds. + */ +const u32 ib_ipath_rnr_table[32] = { + 656, /* 0 */ + 1, /* 1 */ + 1, /* 2 */ + 1, /* 3 */ + 1, /* 4 */ + 1, /* 5 */ + 1, /* 6 */ + 1, /* 7 */ + 1, /* 8 */ + 1, /* 9 */ + 1, /* A */ + 1, /* B */ + 1, /* C */ + 1, /* D */ + 2, /* E */ + 2, /* F */ + 3, /* 10 */ + 4, /* 11 */ + 6, /* 12 */ + 8, /* 13 */ + 11, /* 14 */ + 16, /* 15 */ + 21, /* 16 */ + 31, /* 17 */ + 41, /* 18 */ + 62, /* 19 */ + 82, /* 1A */ + 123, /* 1B */ + 164, /* 1C */ + 246, /* 1D */ + 328, /* 1E */ + 492 /* 1F */ +}; + +/** + * ipath_insert_rnr_queue - put QP on the RNR timeout list for the device + * @qp: the QP + * + * Called with the QP s_lock held and interrupts disabled. + * XXX Use a simple list for now. We might need a priority + * queue if we have lots of QPs waiting for RNR timeouts + * but that should be rare. + */ +void ipath_insert_rnr_queue(struct ipath_qp *qp) +{ + struct ipath_ibdev *dev = to_idev(qp->ibqp.device); + + /* We already did a spin_lock_irqsave(), so just use spin_lock */ + spin_lock(&dev->pending_lock); + if (list_empty(&dev->rnrwait)) + list_add(&qp->timerwait, &dev->rnrwait); + else { + struct list_head *l = &dev->rnrwait; + struct ipath_qp *nqp = list_entry(l->next, struct ipath_qp, + timerwait); + + while (qp->s_rnr_timeout >= nqp->s_rnr_timeout) { + qp->s_rnr_timeout -= nqp->s_rnr_timeout; + l = l->next; + if (l->next == &dev->rnrwait) { + nqp = NULL; + break; + } + nqp = list_entry(l->next, struct ipath_qp, + timerwait); + } + if (nqp) + nqp->s_rnr_timeout -= qp->s_rnr_timeout; + list_add(&qp->timerwait, l); + } + spin_unlock(&dev->pending_lock); +} + +/** + * ipath_init_sge - Validate a RWQE and fill in the SGE state + * @qp: the QP + * + * Return 1 if OK. + */ +int ipath_init_sge(struct ipath_qp *qp, struct ipath_rwqe *wqe, + u32 *lengthp, struct ipath_sge_state *ss) +{ + int i, j, ret; + struct ib_wc wc; + + *lengthp = 0; + for (i = j = 0; i < wqe->num_sge; i++) { + if (wqe->sg_list[i].length == 0) + continue; + /* Check LKEY */ + if (!ipath_lkey_ok(qp, j ? &ss->sg_list[j - 1] : &ss->sge, + &wqe->sg_list[i], IB_ACCESS_LOCAL_WRITE)) + goto bad_lkey; + *lengthp += wqe->sg_list[i].length; + j++; + } + ss->num_sge = j; + ret = 1; + goto bail; + +bad_lkey: + memset(&wc, 0, sizeof(wc)); + wc.wr_id = wqe->wr_id; + wc.status = IB_WC_LOC_PROT_ERR; + wc.opcode = IB_WC_RECV; + wc.qp = &qp->ibqp; + /* Signal solicited completion event. */ + ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, 1); + ret = 0; +bail: + return ret; +} + +/** + * ipath_get_rwqe - copy the next RWQE into the QP's RWQE + * @qp: the QP + * @wr_id_only: update qp->r_wr_id only, not qp->r_sge + * + * Return 0 if no RWQE is available, otherwise return 1. + * + * Can be called from interrupt level. + */ +int ipath_get_rwqe(struct ipath_qp *qp, int wr_id_only) +{ + unsigned long flags; + struct ipath_rq *rq; + struct ipath_rwq *wq; + struct ipath_srq *srq; + struct ipath_rwqe *wqe; + void (*handler)(struct ib_event *, void *); + u32 tail; + int ret; + + if (qp->ibqp.srq) { + srq = to_isrq(qp->ibqp.srq); + handler = srq->ibsrq.event_handler; + rq = &srq->rq; + } else { + srq = NULL; + handler = NULL; + rq = &qp->r_rq; + } + + spin_lock_irqsave(&rq->lock, flags); + if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK)) { + ret = 0; + goto unlock; + } + + wq = rq->wq; + tail = wq->tail; + /* Validate tail before using it since it is user writable. */ + if (tail >= rq->size) + tail = 0; + do { + if (unlikely(tail == wq->head)) { + ret = 0; + goto unlock; + } + /* Make sure entry is read after head index is read. */ + smp_rmb(); + wqe = get_rwqe_ptr(rq, tail); + if (++tail >= rq->size) + tail = 0; + if (wr_id_only) + break; + qp->r_sge.sg_list = qp->r_sg_list; + } while (!ipath_init_sge(qp, wqe, &qp->r_len, &qp->r_sge)); + qp->r_wr_id = wqe->wr_id; + wq->tail = tail; + + ret = 1; + set_bit(IPATH_R_WRID_VALID, &qp->r_aflags); + if (handler) { + u32 n; + + /* + * validate head pointer value and compute + * the number of remaining WQEs. + */ + n = wq->head; + if (n >= rq->size) + n = 0; + if (n < tail) + n += rq->size - tail; + else + n -= tail; + if (n < srq->limit) { + struct ib_event ev; + + srq->limit = 0; + spin_unlock_irqrestore(&rq->lock, flags); + ev.device = qp->ibqp.device; + ev.element.srq = qp->ibqp.srq; + ev.event = IB_EVENT_SRQ_LIMIT_REACHED; + handler(&ev, srq->ibsrq.srq_context); + goto bail; + } + } +unlock: + spin_unlock_irqrestore(&rq->lock, flags); +bail: + return ret; +} + +/** + * ipath_ruc_loopback - handle UC and RC lookback requests + * @sqp: the sending QP + * + * This is called from ipath_do_send() to + * forward a WQE addressed to the same HCA. + * Note that although we are single threaded due to the tasklet, we still + * have to protect against post_send(). We don't have to worry about + * receive interrupts since this is a connected protocol and all packets + * will pass through here. + */ +static void ipath_ruc_loopback(struct ipath_qp *sqp) +{ + struct ipath_ibdev *dev = to_idev(sqp->ibqp.device); + struct ipath_qp *qp; + struct ipath_swqe *wqe; + struct ipath_sge *sge; + unsigned long flags; + struct ib_wc wc; + u64 sdata; + atomic64_t *maddr; + enum ib_wc_status send_status; + + /* + * Note that we check the responder QP state after + * checking the requester's state. + */ + qp = ipath_lookup_qpn(&dev->qp_table, sqp->remote_qpn); + + spin_lock_irqsave(&sqp->s_lock, flags); + + /* Return if we are already busy processing a work request. */ + if ((sqp->s_flags & (IPATH_S_BUSY | IPATH_S_ANY_WAIT)) || + !(ib_ipath_state_ops[sqp->state] & IPATH_PROCESS_OR_FLUSH_SEND)) + goto unlock; + + sqp->s_flags |= IPATH_S_BUSY; + +again: + if (sqp->s_last == sqp->s_head) + goto clr_busy; + wqe = get_swqe_ptr(sqp, sqp->s_last); + + /* Return if it is not OK to start a new work reqeust. */ + if (!(ib_ipath_state_ops[sqp->state] & IPATH_PROCESS_NEXT_SEND_OK)) { + if (!(ib_ipath_state_ops[sqp->state] & IPATH_FLUSH_SEND)) + goto clr_busy; + /* We are in the error state, flush the work request. */ + send_status = IB_WC_WR_FLUSH_ERR; + goto flush_send; + } + + /* + * We can rely on the entry not changing without the s_lock + * being held until we update s_last. + * We increment s_cur to indicate s_last is in progress. + */ + if (sqp->s_last == sqp->s_cur) { + if (++sqp->s_cur >= sqp->s_size) + sqp->s_cur = 0; + } + spin_unlock_irqrestore(&sqp->s_lock, flags); + + if (!qp || !(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK)) { + dev->n_pkt_drops++; + /* + * For RC, the requester would timeout and retry so + * shortcut the timeouts and just signal too many retries. + */ + if (sqp->ibqp.qp_type == IB_QPT_RC) + send_status = IB_WC_RETRY_EXC_ERR; + else + send_status = IB_WC_SUCCESS; + goto serr; + } + + memset(&wc, 0, sizeof wc); + send_status = IB_WC_SUCCESS; + + sqp->s_sge.sge = wqe->sg_list[0]; + sqp->s_sge.sg_list = wqe->sg_list + 1; + sqp->s_sge.num_sge = wqe->wr.num_sge; + sqp->s_len = wqe->length; + switch (wqe->wr.opcode) { + case IB_WR_SEND_WITH_IMM: + wc.wc_flags = IB_WC_WITH_IMM; + wc.ex.imm_data = wqe->wr.ex.imm_data; + /* FALLTHROUGH */ + case IB_WR_SEND: + if (!ipath_get_rwqe(qp, 0)) + goto rnr_nak; + break; + + case IB_WR_RDMA_WRITE_WITH_IMM: + if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE))) + goto inv_err; + wc.wc_flags = IB_WC_WITH_IMM; + wc.ex.imm_data = wqe->wr.ex.imm_data; + if (!ipath_get_rwqe(qp, 1)) + goto rnr_nak; + /* FALLTHROUGH */ + case IB_WR_RDMA_WRITE: + if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE))) + goto inv_err; + if (wqe->length == 0) + break; + if (unlikely(!ipath_rkey_ok(qp, &qp->r_sge, wqe->length, + wqe->wr.wr.rdma.remote_addr, + wqe->wr.wr.rdma.rkey, + IB_ACCESS_REMOTE_WRITE))) + goto acc_err; + break; + + case IB_WR_RDMA_READ: + if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ))) + goto inv_err; + if (unlikely(!ipath_rkey_ok(qp, &sqp->s_sge, wqe->length, + wqe->wr.wr.rdma.remote_addr, + wqe->wr.wr.rdma.rkey, + IB_ACCESS_REMOTE_READ))) + goto acc_err; + qp->r_sge.sge = wqe->sg_list[0]; + qp->r_sge.sg_list = wqe->sg_list + 1; + qp->r_sge.num_sge = wqe->wr.num_sge; + break; + + case IB_WR_ATOMIC_CMP_AND_SWP: + case IB_WR_ATOMIC_FETCH_AND_ADD: + if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC))) + goto inv_err; + if (unlikely(!ipath_rkey_ok(qp, &qp->r_sge, sizeof(u64), + wqe->wr.wr.atomic.remote_addr, + wqe->wr.wr.atomic.rkey, + IB_ACCESS_REMOTE_ATOMIC))) + goto acc_err; + /* Perform atomic OP and save result. */ + maddr = (atomic64_t *) qp->r_sge.sge.vaddr; + sdata = wqe->wr.wr.atomic.compare_add; + *(u64 *) sqp->s_sge.sge.vaddr = + (wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) ? + (u64) atomic64_add_return(sdata, maddr) - sdata : + (u64) cmpxchg((u64 *) qp->r_sge.sge.vaddr, + sdata, wqe->wr.wr.atomic.swap); + goto send_comp; + + default: + send_status = IB_WC_LOC_QP_OP_ERR; + goto serr; + } + + sge = &sqp->s_sge.sge; + while (sqp->s_len) { + u32 len = sqp->s_len; + + if (len > sge->length) + len = sge->length; + if (len > sge->sge_length) + len = sge->sge_length; + BUG_ON(len == 0); + ipath_copy_sge(&qp->r_sge, sge->vaddr, len); + sge->vaddr += len; + sge->length -= len; + sge->sge_length -= len; + if (sge->sge_length == 0) { + if (--sqp->s_sge.num_sge) + *sge = *sqp->s_sge.sg_list++; + } else if (sge->length == 0 && sge->mr != NULL) { + if (++sge->n >= IPATH_SEGSZ) { + if (++sge->m >= sge->mr->mapsz) + break; + sge->n = 0; + } + sge->vaddr = + sge->mr->map[sge->m]->segs[sge->n].vaddr; + sge->length = + sge->mr->map[sge->m]->segs[sge->n].length; + } + sqp->s_len -= len; + } + + if (!test_and_clear_bit(IPATH_R_WRID_VALID, &qp->r_aflags)) + goto send_comp; + + if (wqe->wr.opcode == IB_WR_RDMA_WRITE_WITH_IMM) + wc.opcode = IB_WC_RECV_RDMA_WITH_IMM; + else + wc.opcode = IB_WC_RECV; + wc.wr_id = qp->r_wr_id; + wc.status = IB_WC_SUCCESS; + wc.byte_len = wqe->length; + wc.qp = &qp->ibqp; + wc.src_qp = qp->remote_qpn; + wc.slid = qp->remote_ah_attr.dlid; + wc.sl = qp->remote_ah_attr.sl; + wc.port_num = 1; + /* Signal completion event if the solicited bit is set. */ + ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, + wqe->wr.send_flags & IB_SEND_SOLICITED); + +send_comp: + spin_lock_irqsave(&sqp->s_lock, flags); +flush_send: + sqp->s_rnr_retry = sqp->s_rnr_retry_cnt; + ipath_send_complete(sqp, wqe, send_status); + goto again; + +rnr_nak: + /* Handle RNR NAK */ + if (qp->ibqp.qp_type == IB_QPT_UC) + goto send_comp; + /* + * Note: we don't need the s_lock held since the BUSY flag + * makes this single threaded. + */ + if (sqp->s_rnr_retry == 0) { + send_status = IB_WC_RNR_RETRY_EXC_ERR; + goto serr; + } + if (sqp->s_rnr_retry_cnt < 7) + sqp->s_rnr_retry--; + spin_lock_irqsave(&sqp->s_lock, flags); + if (!(ib_ipath_state_ops[sqp->state] & IPATH_PROCESS_RECV_OK)) + goto clr_busy; + sqp->s_flags |= IPATH_S_WAITING; + dev->n_rnr_naks++; + sqp->s_rnr_timeout = ib_ipath_rnr_table[qp->r_min_rnr_timer]; + ipath_insert_rnr_queue(sqp); + goto clr_busy; + +inv_err: + send_status = IB_WC_REM_INV_REQ_ERR; + wc.status = IB_WC_LOC_QP_OP_ERR; + goto err; + +acc_err: + send_status = IB_WC_REM_ACCESS_ERR; + wc.status = IB_WC_LOC_PROT_ERR; +err: + /* responder goes to error state */ + ipath_rc_error(qp, wc.status); + +serr: + spin_lock_irqsave(&sqp->s_lock, flags); + ipath_send_complete(sqp, wqe, send_status); + if (sqp->ibqp.qp_type == IB_QPT_RC) { + int lastwqe = ipath_error_qp(sqp, IB_WC_WR_FLUSH_ERR); + + sqp->s_flags &= ~IPATH_S_BUSY; + spin_unlock_irqrestore(&sqp->s_lock, flags); + if (lastwqe) { + struct ib_event ev; + + ev.device = sqp->ibqp.device; + ev.element.qp = &sqp->ibqp; + ev.event = IB_EVENT_QP_LAST_WQE_REACHED; + sqp->ibqp.event_handler(&ev, sqp->ibqp.qp_context); + } + goto done; + } +clr_busy: + sqp->s_flags &= ~IPATH_S_BUSY; +unlock: + spin_unlock_irqrestore(&sqp->s_lock, flags); +done: + if (qp && atomic_dec_and_test(&qp->refcount)) + wake_up(&qp->wait); +} + +static void want_buffer(struct ipath_devdata *dd, struct ipath_qp *qp) +{ + if (!(dd->ipath_flags & IPATH_HAS_SEND_DMA) || + qp->ibqp.qp_type == IB_QPT_SMI) { + unsigned long flags; + + spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags); + dd->ipath_sendctrl |= INFINIPATH_S_PIOINTBUFAVAIL; + ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, + dd->ipath_sendctrl); + ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); + spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags); + } +} + +/** + * ipath_no_bufs_available - tell the layer driver we need buffers + * @qp: the QP that caused the problem + * @dev: the device we ran out of buffers on + * + * Called when we run out of PIO buffers. + * If we are now in the error state, return zero to flush the + * send work request. + */ +static int ipath_no_bufs_available(struct ipath_qp *qp, + struct ipath_ibdev *dev) +{ + unsigned long flags; + int ret = 1; + + /* + * Note that as soon as want_buffer() is called and + * possibly before it returns, ipath_ib_piobufavail() + * could be called. Therefore, put QP on the piowait list before + * enabling the PIO avail interrupt. + */ + spin_lock_irqsave(&qp->s_lock, flags); + if (ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK) { + dev->n_piowait++; + qp->s_flags |= IPATH_S_WAITING; + qp->s_flags &= ~IPATH_S_BUSY; + spin_lock(&dev->pending_lock); + if (list_empty(&qp->piowait)) + list_add_tail(&qp->piowait, &dev->piowait); + spin_unlock(&dev->pending_lock); + } else + ret = 0; + spin_unlock_irqrestore(&qp->s_lock, flags); + if (ret) + want_buffer(dev->dd, qp); + return ret; +} + +/** + * ipath_make_grh - construct a GRH header + * @dev: a pointer to the ipath device + * @hdr: a pointer to the GRH header being constructed + * @grh: the global route address to send to + * @hwords: the number of 32 bit words of header being sent + * @nwords: the number of 32 bit words of data being sent + * + * Return the size of the header in 32 bit words. + */ +u32 ipath_make_grh(struct ipath_ibdev *dev, struct ib_grh *hdr, + struct ib_global_route *grh, u32 hwords, u32 nwords) +{ + hdr->version_tclass_flow = + cpu_to_be32((6 << 28) | + (grh->traffic_class << 20) | + grh->flow_label); + hdr->paylen = cpu_to_be16((hwords - 2 + nwords + SIZE_OF_CRC) << 2); + /* next_hdr is defined by C8-7 in ch. 8.4.1 */ + hdr->next_hdr = 0x1B; + hdr->hop_limit = grh->hop_limit; + /* The SGID is 32-bit aligned. */ + hdr->sgid.global.subnet_prefix = dev->gid_prefix; + hdr->sgid.global.interface_id = dev->dd->ipath_guid; + hdr->dgid = grh->dgid; + + /* GRH header size in 32-bit words. */ + return sizeof(struct ib_grh) / sizeof(u32); +} + +void ipath_make_ruc_header(struct ipath_ibdev *dev, struct ipath_qp *qp, + struct ipath_other_headers *ohdr, + u32 bth0, u32 bth2) +{ + u16 lrh0; + u32 nwords; + u32 extra_bytes; + + /* Construct the header. */ + extra_bytes = -qp->s_cur_size & 3; + nwords = (qp->s_cur_size + extra_bytes) >> 2; + lrh0 = IPATH_LRH_BTH; + if (unlikely(qp->remote_ah_attr.ah_flags & IB_AH_GRH)) { + qp->s_hdrwords += ipath_make_grh(dev, &qp->s_hdr.u.l.grh, + &qp->remote_ah_attr.grh, + qp->s_hdrwords, nwords); + lrh0 = IPATH_LRH_GRH; + } + lrh0 |= qp->remote_ah_attr.sl << 4; + qp->s_hdr.lrh[0] = cpu_to_be16(lrh0); + qp->s_hdr.lrh[1] = cpu_to_be16(qp->remote_ah_attr.dlid); + qp->s_hdr.lrh[2] = cpu_to_be16(qp->s_hdrwords + nwords + SIZE_OF_CRC); + qp->s_hdr.lrh[3] = cpu_to_be16(dev->dd->ipath_lid | + qp->remote_ah_attr.src_path_bits); + bth0 |= ipath_get_pkey(dev->dd, qp->s_pkey_index); + bth0 |= extra_bytes << 20; + ohdr->bth[0] = cpu_to_be32(bth0 | (1 << 22)); + ohdr->bth[1] = cpu_to_be32(qp->remote_qpn); + ohdr->bth[2] = cpu_to_be32(bth2); +} + +/** + * ipath_do_send - perform a send on a QP + * @data: contains a pointer to the QP + * + * Process entries in the send work queue until credit or queue is + * exhausted. Only allow one CPU to send a packet per QP (tasklet). + * Otherwise, two threads could send packets out of order. + */ +void ipath_do_send(unsigned long data) +{ + struct ipath_qp *qp = (struct ipath_qp *)data; + struct ipath_ibdev *dev = to_idev(qp->ibqp.device); + int (*make_req)(struct ipath_qp *qp); + unsigned long flags; + + if ((qp->ibqp.qp_type == IB_QPT_RC || + qp->ibqp.qp_type == IB_QPT_UC) && + qp->remote_ah_attr.dlid == dev->dd->ipath_lid) { + ipath_ruc_loopback(qp); + goto bail; + } + + if (qp->ibqp.qp_type == IB_QPT_RC) + make_req = ipath_make_rc_req; + else if (qp->ibqp.qp_type == IB_QPT_UC) + make_req = ipath_make_uc_req; + else + make_req = ipath_make_ud_req; + + spin_lock_irqsave(&qp->s_lock, flags); + + /* Return if we are already busy processing a work request. */ + if ((qp->s_flags & (IPATH_S_BUSY | IPATH_S_ANY_WAIT)) || + !(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_OR_FLUSH_SEND)) { + spin_unlock_irqrestore(&qp->s_lock, flags); + goto bail; + } + + qp->s_flags |= IPATH_S_BUSY; + + spin_unlock_irqrestore(&qp->s_lock, flags); + +again: + /* Check for a constructed packet to be sent. */ + if (qp->s_hdrwords != 0) { + /* + * If no PIO bufs are available, return. An interrupt will + * call ipath_ib_piobufavail() when one is available. + */ + if (ipath_verbs_send(qp, &qp->s_hdr, qp->s_hdrwords, + qp->s_cur_sge, qp->s_cur_size)) { + if (ipath_no_bufs_available(qp, dev)) + goto bail; + } + dev->n_unicast_xmit++; + /* Record that we sent the packet and s_hdr is empty. */ + qp->s_hdrwords = 0; + } + + if (make_req(qp)) + goto again; + +bail:; +} + +/* + * This should be called with s_lock held. + */ +void ipath_send_complete(struct ipath_qp *qp, struct ipath_swqe *wqe, + enum ib_wc_status status) +{ + u32 old_last, last; + + if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_OR_FLUSH_SEND)) + return; + + /* See ch. 11.2.4.1 and 10.7.3.1 */ + if (!(qp->s_flags & IPATH_S_SIGNAL_REQ_WR) || + (wqe->wr.send_flags & IB_SEND_SIGNALED) || + status != IB_WC_SUCCESS) { + struct ib_wc wc; + + memset(&wc, 0, sizeof wc); + wc.wr_id = wqe->wr.wr_id; + wc.status = status; + wc.opcode = ib_ipath_wc_opcode[wqe->wr.opcode]; + wc.qp = &qp->ibqp; + if (status == IB_WC_SUCCESS) + wc.byte_len = wqe->length; + ipath_cq_enter(to_icq(qp->ibqp.send_cq), &wc, + status != IB_WC_SUCCESS); + } + + old_last = last = qp->s_last; + if (++last >= qp->s_size) + last = 0; + qp->s_last = last; + if (qp->s_cur == old_last) + qp->s_cur = last; + if (qp->s_tail == old_last) + qp->s_tail = last; + if (qp->state == IB_QPS_SQD && last == qp->s_cur) + qp->s_draining = 0; +} diff --git a/drivers/infiniband/hw/ipath/ipath_sdma.c b/drivers/infiniband/hw/ipath/ipath_sdma.c new file mode 100644 index 0000000..17a5177 --- /dev/null +++ b/drivers/infiniband/hw/ipath/ipath_sdma.c @@ -0,0 +1,818 @@ +/* + * Copyright (c) 2007, 2008 QLogic Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include + +#include "ipath_kernel.h" +#include "ipath_verbs.h" +#include "ipath_common.h" + +#define SDMA_DESCQ_SZ PAGE_SIZE /* 256 entries per 4KB page */ + +static void vl15_watchdog_enq(struct ipath_devdata *dd) +{ + /* ipath_sdma_lock must already be held */ + if (atomic_inc_return(&dd->ipath_sdma_vl15_count) == 1) { + unsigned long interval = (HZ + 19) / 20; + dd->ipath_sdma_vl15_timer.expires = jiffies + interval; + add_timer(&dd->ipath_sdma_vl15_timer); + } +} + +static void vl15_watchdog_deq(struct ipath_devdata *dd) +{ + /* ipath_sdma_lock must already be held */ + if (atomic_dec_return(&dd->ipath_sdma_vl15_count) != 0) { + unsigned long interval = (HZ + 19) / 20; + mod_timer(&dd->ipath_sdma_vl15_timer, jiffies + interval); + } else { + del_timer(&dd->ipath_sdma_vl15_timer); + } +} + +static void vl15_watchdog_timeout(unsigned long opaque) +{ + struct ipath_devdata *dd = (struct ipath_devdata *)opaque; + + if (atomic_read(&dd->ipath_sdma_vl15_count) != 0) { + ipath_dbg("vl15 watchdog timeout - clearing\n"); + ipath_cancel_sends(dd, 1); + ipath_hol_down(dd); + } else { + ipath_dbg("vl15 watchdog timeout - " + "condition already cleared\n"); + } +} + +static void unmap_desc(struct ipath_devdata *dd, unsigned head) +{ + __le64 *descqp = &dd->ipath_sdma_descq[head].qw[0]; + u64 desc[2]; + dma_addr_t addr; + size_t len; + + desc[0] = le64_to_cpu(descqp[0]); + desc[1] = le64_to_cpu(descqp[1]); + + addr = (desc[1] << 32) | (desc[0] >> 32); + len = (desc[0] >> 14) & (0x7ffULL << 2); + dma_unmap_single(&dd->pcidev->dev, addr, len, DMA_TO_DEVICE); +} + +/* + * ipath_sdma_lock should be locked before calling this. + */ +int ipath_sdma_make_progress(struct ipath_devdata *dd) +{ + struct list_head *lp = NULL; + struct ipath_sdma_txreq *txp = NULL; + u16 dmahead; + u16 start_idx = 0; + int progress = 0; + + if (!list_empty(&dd->ipath_sdma_activelist)) { + lp = dd->ipath_sdma_activelist.next; + txp = list_entry(lp, struct ipath_sdma_txreq, list); + start_idx = txp->start_idx; + } + + /* + * Read the SDMA head register in order to know that the + * interrupt clear has been written to the chip. + * Otherwise, we may not get an interrupt for the last + * descriptor in the queue. + */ + dmahead = (u16)ipath_read_kreg32(dd, dd->ipath_kregs->kr_senddmahead); + /* sanity check return value for error handling (chip reset, etc.) */ + if (dmahead >= dd->ipath_sdma_descq_cnt) + goto done; + + while (dd->ipath_sdma_descq_head != dmahead) { + if (txp && txp->flags & IPATH_SDMA_TXREQ_F_FREEDESC && + dd->ipath_sdma_descq_head == start_idx) { + unmap_desc(dd, dd->ipath_sdma_descq_head); + start_idx++; + if (start_idx == dd->ipath_sdma_descq_cnt) + start_idx = 0; + } + + /* increment free count and head */ + dd->ipath_sdma_descq_removed++; + if (++dd->ipath_sdma_descq_head == dd->ipath_sdma_descq_cnt) + dd->ipath_sdma_descq_head = 0; + + if (txp && txp->next_descq_idx == dd->ipath_sdma_descq_head) { + /* move to notify list */ + if (txp->flags & IPATH_SDMA_TXREQ_F_VL15) + vl15_watchdog_deq(dd); + list_move_tail(lp, &dd->ipath_sdma_notifylist); + if (!list_empty(&dd->ipath_sdma_activelist)) { + lp = dd->ipath_sdma_activelist.next; + txp = list_entry(lp, struct ipath_sdma_txreq, + list); + start_idx = txp->start_idx; + } else { + lp = NULL; + txp = NULL; + } + } + progress = 1; + } + + if (progress) + tasklet_hi_schedule(&dd->ipath_sdma_notify_task); + +done: + return progress; +} + +static void ipath_sdma_notify(struct ipath_devdata *dd, struct list_head *list) +{ + struct ipath_sdma_txreq *txp, *txp_next; + + list_for_each_entry_safe(txp, txp_next, list, list) { + list_del_init(&txp->list); + + if (txp->callback) + (*txp->callback)(txp->callback_cookie, + txp->callback_status); + } +} + +static void sdma_notify_taskbody(struct ipath_devdata *dd) +{ + unsigned long flags; + struct list_head list; + + INIT_LIST_HEAD(&list); + + spin_lock_irqsave(&dd->ipath_sdma_lock, flags); + + list_splice_init(&dd->ipath_sdma_notifylist, &list); + + spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags); + + ipath_sdma_notify(dd, &list); + + /* + * The IB verbs layer needs to see the callback before getting + * the call to ipath_ib_piobufavail() because the callback + * handles releasing resources the next send will need. + * Otherwise, we could do these calls in + * ipath_sdma_make_progress(). + */ + ipath_ib_piobufavail(dd->verbs_dev); +} + +static void sdma_notify_task(unsigned long opaque) +{ + struct ipath_devdata *dd = (struct ipath_devdata *)opaque; + + if (!test_bit(IPATH_SDMA_SHUTDOWN, &dd->ipath_sdma_status)) + sdma_notify_taskbody(dd); +} + +static void dump_sdma_state(struct ipath_devdata *dd) +{ + unsigned long reg; + + reg = ipath_read_kreg64(dd, dd->ipath_kregs->kr_senddmastatus); + ipath_cdbg(VERBOSE, "kr_senddmastatus: 0x%016lx\n", reg); + + reg = ipath_read_kreg64(dd, dd->ipath_kregs->kr_sendctrl); + ipath_cdbg(VERBOSE, "kr_sendctrl: 0x%016lx\n", reg); + + reg = ipath_read_kreg64(dd, dd->ipath_kregs->kr_senddmabufmask0); + ipath_cdbg(VERBOSE, "kr_senddmabufmask0: 0x%016lx\n", reg); + + reg = ipath_read_kreg64(dd, dd->ipath_kregs->kr_senddmabufmask1); + ipath_cdbg(VERBOSE, "kr_senddmabufmask1: 0x%016lx\n", reg); + + reg = ipath_read_kreg64(dd, dd->ipath_kregs->kr_senddmabufmask2); + ipath_cdbg(VERBOSE, "kr_senddmabufmask2: 0x%016lx\n", reg); + + reg = ipath_read_kreg64(dd, dd->ipath_kregs->kr_senddmatail); + ipath_cdbg(VERBOSE, "kr_senddmatail: 0x%016lx\n", reg); + + reg = ipath_read_kreg64(dd, dd->ipath_kregs->kr_senddmahead); + ipath_cdbg(VERBOSE, "kr_senddmahead: 0x%016lx\n", reg); +} + +static void sdma_abort_task(unsigned long opaque) +{ + struct ipath_devdata *dd = (struct ipath_devdata *) opaque; + u64 status; + unsigned long flags; + + if (test_bit(IPATH_SDMA_SHUTDOWN, &dd->ipath_sdma_status)) + return; + + spin_lock_irqsave(&dd->ipath_sdma_lock, flags); + + status = dd->ipath_sdma_status & IPATH_SDMA_ABORT_MASK; + + /* nothing to do */ + if (status == IPATH_SDMA_ABORT_NONE) + goto unlock; + + /* ipath_sdma_abort() is done, waiting for interrupt */ + if (status == IPATH_SDMA_ABORT_DISARMED) { + if (time_before(jiffies, dd->ipath_sdma_abort_intr_timeout)) + goto resched_noprint; + /* give up, intr got lost somewhere */ + ipath_dbg("give up waiting for SDMADISABLED intr\n"); + __set_bit(IPATH_SDMA_DISABLED, &dd->ipath_sdma_status); + status = IPATH_SDMA_ABORT_ABORTED; + } + + /* everything is stopped, time to clean up and restart */ + if (status == IPATH_SDMA_ABORT_ABORTED) { + struct ipath_sdma_txreq *txp, *txpnext; + u64 hwstatus; + int notify = 0; + + hwstatus = ipath_read_kreg64(dd, + dd->ipath_kregs->kr_senddmastatus); + + if ((hwstatus & (IPATH_SDMA_STATUS_SCORE_BOARD_DRAIN_IN_PROG | + IPATH_SDMA_STATUS_ABORT_IN_PROG | + IPATH_SDMA_STATUS_INTERNAL_SDMA_ENABLE)) || + !(hwstatus & IPATH_SDMA_STATUS_SCB_EMPTY)) { + if (dd->ipath_sdma_reset_wait > 0) { + /* not done shutting down sdma */ + --dd->ipath_sdma_reset_wait; + goto resched; + } + ipath_cdbg(VERBOSE, "gave up waiting for quiescent " + "status after SDMA reset, continuing\n"); + dump_sdma_state(dd); + } + + /* dequeue all "sent" requests */ + list_for_each_entry_safe(txp, txpnext, + &dd->ipath_sdma_activelist, list) { + txp->callback_status = IPATH_SDMA_TXREQ_S_ABORTED; + if (txp->flags & IPATH_SDMA_TXREQ_F_VL15) + vl15_watchdog_deq(dd); + list_move_tail(&txp->list, &dd->ipath_sdma_notifylist); + notify = 1; + } + if (notify) + tasklet_hi_schedule(&dd->ipath_sdma_notify_task); + + /* reset our notion of head and tail */ + dd->ipath_sdma_descq_tail = 0; + dd->ipath_sdma_descq_head = 0; + dd->ipath_sdma_head_dma[0] = 0; + dd->ipath_sdma_generation = 0; + dd->ipath_sdma_descq_removed = dd->ipath_sdma_descq_added; + + /* Reset SendDmaLenGen */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmalengen, + (u64) dd->ipath_sdma_descq_cnt | (1ULL << 18)); + + /* done with sdma state for a bit */ + spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags); + + /* + * Don't restart sdma here (with the exception + * below). Wait until link is up to ACTIVE. VL15 MADs + * used to bring the link up use PIO, and multiple link + * transitions otherwise cause the sdma engine to be + * stopped and started multiple times. + * The disable is done here, including the shadow, + * so the state is kept consistent. + * See ipath_restart_sdma() for the actual starting + * of sdma. + */ + spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags); + dd->ipath_sendctrl &= ~INFINIPATH_S_SDMAENABLE; + ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, + dd->ipath_sendctrl); + ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); + spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags); + + /* make sure I see next message */ + dd->ipath_sdma_abort_jiffies = 0; + + /* + * Not everything that takes SDMA offline is a link + * status change. If the link was up, restart SDMA. + */ + if (dd->ipath_flags & IPATH_LINKACTIVE) + ipath_restart_sdma(dd); + + goto done; + } + +resched: + /* + * for now, keep spinning + * JAG - this is bad to just have default be a loop without + * state change + */ + if (time_after(jiffies, dd->ipath_sdma_abort_jiffies)) { + ipath_dbg("looping with status 0x%08lx\n", + dd->ipath_sdma_status); + dd->ipath_sdma_abort_jiffies = jiffies + 5 * HZ; + } +resched_noprint: + spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags); + if (!test_bit(IPATH_SDMA_SHUTDOWN, &dd->ipath_sdma_status)) + tasklet_hi_schedule(&dd->ipath_sdma_abort_task); + return; + +unlock: + spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags); +done: + return; +} + +/* + * This is called from interrupt context. + */ +void ipath_sdma_intr(struct ipath_devdata *dd) +{ + unsigned long flags; + + spin_lock_irqsave(&dd->ipath_sdma_lock, flags); + + (void) ipath_sdma_make_progress(dd); + + spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags); +} + +static int alloc_sdma(struct ipath_devdata *dd) +{ + int ret = 0; + + /* Allocate memory for SendDMA descriptor FIFO */ + dd->ipath_sdma_descq = dma_alloc_coherent(&dd->pcidev->dev, + SDMA_DESCQ_SZ, &dd->ipath_sdma_descq_phys, GFP_KERNEL); + + if (!dd->ipath_sdma_descq) { + ipath_dev_err(dd, "failed to allocate SendDMA descriptor " + "FIFO memory\n"); + ret = -ENOMEM; + goto done; + } + + dd->ipath_sdma_descq_cnt = + SDMA_DESCQ_SZ / sizeof(struct ipath_sdma_desc); + + /* Allocate memory for DMA of head register to memory */ + dd->ipath_sdma_head_dma = dma_alloc_coherent(&dd->pcidev->dev, + PAGE_SIZE, &dd->ipath_sdma_head_phys, GFP_KERNEL); + if (!dd->ipath_sdma_head_dma) { + ipath_dev_err(dd, "failed to allocate SendDMA head memory\n"); + ret = -ENOMEM; + goto cleanup_descq; + } + dd->ipath_sdma_head_dma[0] = 0; + + init_timer(&dd->ipath_sdma_vl15_timer); + dd->ipath_sdma_vl15_timer.function = vl15_watchdog_timeout; + dd->ipath_sdma_vl15_timer.data = (unsigned long)dd; + atomic_set(&dd->ipath_sdma_vl15_count, 0); + + goto done; + +cleanup_descq: + dma_free_coherent(&dd->pcidev->dev, SDMA_DESCQ_SZ, + (void *)dd->ipath_sdma_descq, dd->ipath_sdma_descq_phys); + dd->ipath_sdma_descq = NULL; + dd->ipath_sdma_descq_phys = 0; +done: + return ret; +} + +int setup_sdma(struct ipath_devdata *dd) +{ + int ret = 0; + unsigned i, n; + u64 tmp64; + u64 senddmabufmask[3] = { 0 }; + unsigned long flags; + + ret = alloc_sdma(dd); + if (ret) + goto done; + + if (!dd->ipath_sdma_descq) { + ipath_dev_err(dd, "SendDMA memory not allocated\n"); + goto done; + } + + /* + * Set initial status as if we had been up, then gone down. + * This lets initial start on transition to ACTIVE be the + * same as restart after link flap. + */ + dd->ipath_sdma_status = IPATH_SDMA_ABORT_ABORTED; + dd->ipath_sdma_abort_jiffies = 0; + dd->ipath_sdma_generation = 0; + dd->ipath_sdma_descq_tail = 0; + dd->ipath_sdma_descq_head = 0; + dd->ipath_sdma_descq_removed = 0; + dd->ipath_sdma_descq_added = 0; + + /* Set SendDmaBase */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmabase, + dd->ipath_sdma_descq_phys); + /* Set SendDmaLenGen */ + tmp64 = dd->ipath_sdma_descq_cnt; + tmp64 |= 1<<18; /* enable generation checking */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmalengen, tmp64); + /* Set SendDmaTail */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmatail, + dd->ipath_sdma_descq_tail); + /* Set SendDmaHeadAddr */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmaheadaddr, + dd->ipath_sdma_head_phys); + + /* + * Reserve all the former "kernel" piobufs, using high number range + * so we get as many 4K buffers as possible + */ + n = dd->ipath_piobcnt2k + dd->ipath_piobcnt4k; + i = dd->ipath_lastport_piobuf + dd->ipath_pioreserved; + ipath_chg_pioavailkernel(dd, i, n - i , 0); + for (; i < n; ++i) { + unsigned word = i / 64; + unsigned bit = i & 63; + BUG_ON(word >= 3); + senddmabufmask[word] |= 1ULL << bit; + } + ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmabufmask0, + senddmabufmask[0]); + ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmabufmask1, + senddmabufmask[1]); + ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmabufmask2, + senddmabufmask[2]); + + INIT_LIST_HEAD(&dd->ipath_sdma_activelist); + INIT_LIST_HEAD(&dd->ipath_sdma_notifylist); + + tasklet_init(&dd->ipath_sdma_notify_task, sdma_notify_task, + (unsigned long) dd); + tasklet_init(&dd->ipath_sdma_abort_task, sdma_abort_task, + (unsigned long) dd); + + /* + * No use to turn on SDMA here, as link is probably not ACTIVE + * Just mark it RUNNING and enable the interrupt, and let the + * ipath_restart_sdma() on link transition to ACTIVE actually + * enable it. + */ + spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags); + dd->ipath_sendctrl |= INFINIPATH_S_SDMAINTENABLE; + ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, dd->ipath_sendctrl); + ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); + __set_bit(IPATH_SDMA_RUNNING, &dd->ipath_sdma_status); + spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags); + +done: + return ret; +} + +void teardown_sdma(struct ipath_devdata *dd) +{ + struct ipath_sdma_txreq *txp, *txpnext; + unsigned long flags; + dma_addr_t sdma_head_phys = 0; + dma_addr_t sdma_descq_phys = 0; + void *sdma_descq = NULL; + void *sdma_head_dma = NULL; + + spin_lock_irqsave(&dd->ipath_sdma_lock, flags); + __clear_bit(IPATH_SDMA_RUNNING, &dd->ipath_sdma_status); + __set_bit(IPATH_SDMA_ABORTING, &dd->ipath_sdma_status); + __set_bit(IPATH_SDMA_SHUTDOWN, &dd->ipath_sdma_status); + spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags); + + tasklet_kill(&dd->ipath_sdma_abort_task); + tasklet_kill(&dd->ipath_sdma_notify_task); + + /* turn off sdma */ + spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags); + dd->ipath_sendctrl &= ~INFINIPATH_S_SDMAENABLE; + ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, + dd->ipath_sendctrl); + ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); + spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags); + + spin_lock_irqsave(&dd->ipath_sdma_lock, flags); + /* dequeue all "sent" requests */ + list_for_each_entry_safe(txp, txpnext, &dd->ipath_sdma_activelist, + list) { + txp->callback_status = IPATH_SDMA_TXREQ_S_SHUTDOWN; + if (txp->flags & IPATH_SDMA_TXREQ_F_VL15) + vl15_watchdog_deq(dd); + list_move_tail(&txp->list, &dd->ipath_sdma_notifylist); + } + spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags); + + sdma_notify_taskbody(dd); + + del_timer_sync(&dd->ipath_sdma_vl15_timer); + + spin_lock_irqsave(&dd->ipath_sdma_lock, flags); + + dd->ipath_sdma_abort_jiffies = 0; + + ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmabase, 0); + ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmalengen, 0); + ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmatail, 0); + ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmaheadaddr, 0); + ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmabufmask0, 0); + ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmabufmask1, 0); + ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmabufmask2, 0); + + if (dd->ipath_sdma_head_dma) { + sdma_head_dma = (void *) dd->ipath_sdma_head_dma; + sdma_head_phys = dd->ipath_sdma_head_phys; + dd->ipath_sdma_head_dma = NULL; + dd->ipath_sdma_head_phys = 0; + } + + if (dd->ipath_sdma_descq) { + sdma_descq = dd->ipath_sdma_descq; + sdma_descq_phys = dd->ipath_sdma_descq_phys; + dd->ipath_sdma_descq = NULL; + dd->ipath_sdma_descq_phys = 0; + } + + spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags); + + if (sdma_head_dma) + dma_free_coherent(&dd->pcidev->dev, PAGE_SIZE, + sdma_head_dma, sdma_head_phys); + + if (sdma_descq) + dma_free_coherent(&dd->pcidev->dev, SDMA_DESCQ_SZ, + sdma_descq, sdma_descq_phys); +} + +/* + * [Re]start SDMA, if we use it, and it's not already OK. + * This is called on transition to link ACTIVE, either the first or + * subsequent times. + */ +void ipath_restart_sdma(struct ipath_devdata *dd) +{ + unsigned long flags; + int needed = 1; + + if (!(dd->ipath_flags & IPATH_HAS_SEND_DMA)) + goto bail; + + /* + * First, make sure we should, which is to say, + * check that we are "RUNNING" (not in teardown) + * and not "SHUTDOWN" + */ + spin_lock_irqsave(&dd->ipath_sdma_lock, flags); + if (!test_bit(IPATH_SDMA_RUNNING, &dd->ipath_sdma_status) + || test_bit(IPATH_SDMA_SHUTDOWN, &dd->ipath_sdma_status)) + needed = 0; + else { + __clear_bit(IPATH_SDMA_DISABLED, &dd->ipath_sdma_status); + __clear_bit(IPATH_SDMA_DISARMED, &dd->ipath_sdma_status); + __clear_bit(IPATH_SDMA_ABORTING, &dd->ipath_sdma_status); + } + spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags); + if (!needed) { + ipath_dbg("invalid attempt to restart SDMA, status 0x%08lx\n", + dd->ipath_sdma_status); + goto bail; + } + spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags); + /* + * First clear, just to be safe. Enable is only done + * in chip on 0->1 transition + */ + dd->ipath_sendctrl &= ~INFINIPATH_S_SDMAENABLE; + ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, dd->ipath_sendctrl); + ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); + dd->ipath_sendctrl |= INFINIPATH_S_SDMAENABLE; + ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, dd->ipath_sendctrl); + ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); + spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags); + + /* notify upper layers */ + ipath_ib_piobufavail(dd->verbs_dev); + +bail: + return; +} + +static inline void make_sdma_desc(struct ipath_devdata *dd, + u64 *sdmadesc, u64 addr, u64 dwlen, u64 dwoffset) +{ + WARN_ON(addr & 3); + /* SDmaPhyAddr[47:32] */ + sdmadesc[1] = addr >> 32; + /* SDmaPhyAddr[31:0] */ + sdmadesc[0] = (addr & 0xfffffffcULL) << 32; + /* SDmaGeneration[1:0] */ + sdmadesc[0] |= (dd->ipath_sdma_generation & 3ULL) << 30; + /* SDmaDwordCount[10:0] */ + sdmadesc[0] |= (dwlen & 0x7ffULL) << 16; + /* SDmaBufOffset[12:2] */ + sdmadesc[0] |= dwoffset & 0x7ffULL; +} + +/* + * This function queues one IB packet onto the send DMA queue per call. + * The caller is responsible for checking: + * 1) The number of send DMA descriptor entries is less than the size of + * the descriptor queue. + * 2) The IB SGE addresses and lengths are 32-bit aligned + * (except possibly the last SGE's length) + * 3) The SGE addresses are suitable for passing to dma_map_single(). + */ +int ipath_sdma_verbs_send(struct ipath_devdata *dd, + struct ipath_sge_state *ss, u32 dwords, + struct ipath_verbs_txreq *tx) +{ + + unsigned long flags; + struct ipath_sge *sge; + int ret = 0; + u16 tail; + __le64 *descqp; + u64 sdmadesc[2]; + u32 dwoffset; + dma_addr_t addr; + + if ((tx->map_len + (dwords<<2)) > dd->ipath_ibmaxlen) { + ipath_dbg("packet size %X > ibmax %X, fail\n", + tx->map_len + (dwords<<2), dd->ipath_ibmaxlen); + ret = -EMSGSIZE; + goto fail; + } + + spin_lock_irqsave(&dd->ipath_sdma_lock, flags); + +retry: + if (unlikely(test_bit(IPATH_SDMA_ABORTING, &dd->ipath_sdma_status))) { + ret = -EBUSY; + goto unlock; + } + + if (tx->txreq.sg_count > ipath_sdma_descq_freecnt(dd)) { + if (ipath_sdma_make_progress(dd)) + goto retry; + ret = -ENOBUFS; + goto unlock; + } + + addr = dma_map_single(&dd->pcidev->dev, tx->txreq.map_addr, + tx->map_len, DMA_TO_DEVICE); + if (dma_mapping_error(&dd->pcidev->dev, addr)) + goto ioerr; + + dwoffset = tx->map_len >> 2; + make_sdma_desc(dd, sdmadesc, (u64) addr, dwoffset, 0); + + /* SDmaFirstDesc */ + sdmadesc[0] |= 1ULL << 12; + if (tx->txreq.flags & IPATH_SDMA_TXREQ_F_USELARGEBUF) + sdmadesc[0] |= 1ULL << 14; /* SDmaUseLargeBuf */ + + /* write to the descq */ + tail = dd->ipath_sdma_descq_tail; + descqp = &dd->ipath_sdma_descq[tail].qw[0]; + *descqp++ = cpu_to_le64(sdmadesc[0]); + *descqp++ = cpu_to_le64(sdmadesc[1]); + + if (tx->txreq.flags & IPATH_SDMA_TXREQ_F_FREEDESC) + tx->txreq.start_idx = tail; + + /* increment the tail */ + if (++tail == dd->ipath_sdma_descq_cnt) { + tail = 0; + descqp = &dd->ipath_sdma_descq[0].qw[0]; + ++dd->ipath_sdma_generation; + } + + sge = &ss->sge; + while (dwords) { + u32 dw; + u32 len; + + len = dwords << 2; + if (len > sge->length) + len = sge->length; + if (len > sge->sge_length) + len = sge->sge_length; + BUG_ON(len == 0); + dw = (len + 3) >> 2; + addr = dma_map_single(&dd->pcidev->dev, sge->vaddr, dw << 2, + DMA_TO_DEVICE); + if (dma_mapping_error(&dd->pcidev->dev, addr)) + goto unmap; + make_sdma_desc(dd, sdmadesc, (u64) addr, dw, dwoffset); + /* SDmaUseLargeBuf has to be set in every descriptor */ + if (tx->txreq.flags & IPATH_SDMA_TXREQ_F_USELARGEBUF) + sdmadesc[0] |= 1ULL << 14; + /* write to the descq */ + *descqp++ = cpu_to_le64(sdmadesc[0]); + *descqp++ = cpu_to_le64(sdmadesc[1]); + + /* increment the tail */ + if (++tail == dd->ipath_sdma_descq_cnt) { + tail = 0; + descqp = &dd->ipath_sdma_descq[0].qw[0]; + ++dd->ipath_sdma_generation; + } + sge->vaddr += len; + sge->length -= len; + sge->sge_length -= len; + if (sge->sge_length == 0) { + if (--ss->num_sge) + *sge = *ss->sg_list++; + } else if (sge->length == 0 && sge->mr != NULL) { + if (++sge->n >= IPATH_SEGSZ) { + if (++sge->m >= sge->mr->mapsz) + break; + sge->n = 0; + } + sge->vaddr = + sge->mr->map[sge->m]->segs[sge->n].vaddr; + sge->length = + sge->mr->map[sge->m]->segs[sge->n].length; + } + + dwoffset += dw; + dwords -= dw; + } + + if (!tail) + descqp = &dd->ipath_sdma_descq[dd->ipath_sdma_descq_cnt].qw[0]; + descqp -= 2; + /* SDmaLastDesc */ + descqp[0] |= cpu_to_le64(1ULL << 11); + if (tx->txreq.flags & IPATH_SDMA_TXREQ_F_INTREQ) { + /* SDmaIntReq */ + descqp[0] |= cpu_to_le64(1ULL << 15); + } + + /* Commit writes to memory and advance the tail on the chip */ + wmb(); + ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmatail, tail); + + tx->txreq.next_descq_idx = tail; + tx->txreq.callback_status = IPATH_SDMA_TXREQ_S_OK; + dd->ipath_sdma_descq_tail = tail; + dd->ipath_sdma_descq_added += tx->txreq.sg_count; + list_add_tail(&tx->txreq.list, &dd->ipath_sdma_activelist); + if (tx->txreq.flags & IPATH_SDMA_TXREQ_F_VL15) + vl15_watchdog_enq(dd); + goto unlock; + +unmap: + while (tail != dd->ipath_sdma_descq_tail) { + if (!tail) + tail = dd->ipath_sdma_descq_cnt - 1; + else + tail--; + unmap_desc(dd, tail); + } +ioerr: + ret = -EIO; +unlock: + spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags); +fail: + return ret; +} diff --git a/drivers/infiniband/hw/ipath/ipath_srq.c b/drivers/infiniband/hw/ipath/ipath_srq.c new file mode 100644 index 0000000..2627198 --- /dev/null +++ b/drivers/infiniband/hw/ipath/ipath_srq.c @@ -0,0 +1,380 @@ +/* + * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved. + * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include + +#include "ipath_verbs.h" + +/** + * ipath_post_srq_receive - post a receive on a shared receive queue + * @ibsrq: the SRQ to post the receive on + * @wr: the list of work requests to post + * @bad_wr: the first WR to cause a problem is put here + * + * This may be called from interrupt context. + */ +int ipath_post_srq_receive(struct ib_srq *ibsrq, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr) +{ + struct ipath_srq *srq = to_isrq(ibsrq); + struct ipath_rwq *wq; + unsigned long flags; + int ret; + + for (; wr; wr = wr->next) { + struct ipath_rwqe *wqe; + u32 next; + int i; + + if ((unsigned) wr->num_sge > srq->rq.max_sge) { + *bad_wr = wr; + ret = -EINVAL; + goto bail; + } + + spin_lock_irqsave(&srq->rq.lock, flags); + wq = srq->rq.wq; + next = wq->head + 1; + if (next >= srq->rq.size) + next = 0; + if (next == wq->tail) { + spin_unlock_irqrestore(&srq->rq.lock, flags); + *bad_wr = wr; + ret = -ENOMEM; + goto bail; + } + + wqe = get_rwqe_ptr(&srq->rq, wq->head); + wqe->wr_id = wr->wr_id; + wqe->num_sge = wr->num_sge; + for (i = 0; i < wr->num_sge; i++) + wqe->sg_list[i] = wr->sg_list[i]; + /* Make sure queue entry is written before the head index. */ + smp_wmb(); + wq->head = next; + spin_unlock_irqrestore(&srq->rq.lock, flags); + } + ret = 0; + +bail: + return ret; +} + +/** + * ipath_create_srq - create a shared receive queue + * @ibpd: the protection domain of the SRQ to create + * @srq_init_attr: the attributes of the SRQ + * @udata: data from libipathverbs when creating a user SRQ + */ +struct ib_srq *ipath_create_srq(struct ib_pd *ibpd, + struct ib_srq_init_attr *srq_init_attr, + struct ib_udata *udata) +{ + struct ipath_ibdev *dev = to_idev(ibpd->device); + struct ipath_srq *srq; + u32 sz; + struct ib_srq *ret; + + if (srq_init_attr->srq_type != IB_SRQT_BASIC) { + ret = ERR_PTR(-ENOSYS); + goto done; + } + + if (srq_init_attr->attr.max_wr == 0) { + ret = ERR_PTR(-EINVAL); + goto done; + } + + if ((srq_init_attr->attr.max_sge > ib_ipath_max_srq_sges) || + (srq_init_attr->attr.max_wr > ib_ipath_max_srq_wrs)) { + ret = ERR_PTR(-EINVAL); + goto done; + } + + srq = kmalloc(sizeof(*srq), GFP_KERNEL); + if (!srq) { + ret = ERR_PTR(-ENOMEM); + goto done; + } + + /* + * Need to use vmalloc() if we want to support large #s of entries. + */ + srq->rq.size = srq_init_attr->attr.max_wr + 1; + srq->rq.max_sge = srq_init_attr->attr.max_sge; + sz = sizeof(struct ib_sge) * srq->rq.max_sge + + sizeof(struct ipath_rwqe); + srq->rq.wq = vmalloc_user(sizeof(struct ipath_rwq) + srq->rq.size * sz); + if (!srq->rq.wq) { + ret = ERR_PTR(-ENOMEM); + goto bail_srq; + } + + /* + * Return the address of the RWQ as the offset to mmap. + * See ipath_mmap() for details. + */ + if (udata && udata->outlen >= sizeof(__u64)) { + int err; + u32 s = sizeof(struct ipath_rwq) + srq->rq.size * sz; + + srq->ip = + ipath_create_mmap_info(dev, s, + ibpd->uobject->context, + srq->rq.wq); + if (!srq->ip) { + ret = ERR_PTR(-ENOMEM); + goto bail_wq; + } + + err = ib_copy_to_udata(udata, &srq->ip->offset, + sizeof(srq->ip->offset)); + if (err) { + ret = ERR_PTR(err); + goto bail_ip; + } + } else + srq->ip = NULL; + + /* + * ib_create_srq() will initialize srq->ibsrq. + */ + spin_lock_init(&srq->rq.lock); + srq->rq.wq->head = 0; + srq->rq.wq->tail = 0; + srq->limit = srq_init_attr->attr.srq_limit; + + spin_lock(&dev->n_srqs_lock); + if (dev->n_srqs_allocated == ib_ipath_max_srqs) { + spin_unlock(&dev->n_srqs_lock); + ret = ERR_PTR(-ENOMEM); + goto bail_ip; + } + + dev->n_srqs_allocated++; + spin_unlock(&dev->n_srqs_lock); + + if (srq->ip) { + spin_lock_irq(&dev->pending_lock); + list_add(&srq->ip->pending_mmaps, &dev->pending_mmaps); + spin_unlock_irq(&dev->pending_lock); + } + + ret = &srq->ibsrq; + goto done; + +bail_ip: + kfree(srq->ip); +bail_wq: + vfree(srq->rq.wq); +bail_srq: + kfree(srq); +done: + return ret; +} + +/** + * ipath_modify_srq - modify a shared receive queue + * @ibsrq: the SRQ to modify + * @attr: the new attributes of the SRQ + * @attr_mask: indicates which attributes to modify + * @udata: user data for ipathverbs.so + */ +int ipath_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, + enum ib_srq_attr_mask attr_mask, + struct ib_udata *udata) +{ + struct ipath_srq *srq = to_isrq(ibsrq); + struct ipath_rwq *wq; + int ret = 0; + + if (attr_mask & IB_SRQ_MAX_WR) { + struct ipath_rwq *owq; + struct ipath_rwqe *p; + u32 sz, size, n, head, tail; + + /* Check that the requested sizes are below the limits. */ + if ((attr->max_wr > ib_ipath_max_srq_wrs) || + ((attr_mask & IB_SRQ_LIMIT) ? + attr->srq_limit : srq->limit) > attr->max_wr) { + ret = -EINVAL; + goto bail; + } + + sz = sizeof(struct ipath_rwqe) + + srq->rq.max_sge * sizeof(struct ib_sge); + size = attr->max_wr + 1; + wq = vmalloc_user(sizeof(struct ipath_rwq) + size * sz); + if (!wq) { + ret = -ENOMEM; + goto bail; + } + + /* Check that we can write the offset to mmap. */ + if (udata && udata->inlen >= sizeof(__u64)) { + __u64 offset_addr; + __u64 offset = 0; + + ret = ib_copy_from_udata(&offset_addr, udata, + sizeof(offset_addr)); + if (ret) + goto bail_free; + udata->outbuf = + (void __user *) (unsigned long) offset_addr; + ret = ib_copy_to_udata(udata, &offset, + sizeof(offset)); + if (ret) + goto bail_free; + } + + spin_lock_irq(&srq->rq.lock); + /* + * validate head pointer value and compute + * the number of remaining WQEs. + */ + owq = srq->rq.wq; + head = owq->head; + if (head >= srq->rq.size) + head = 0; + tail = owq->tail; + if (tail >= srq->rq.size) + tail = 0; + n = head; + if (n < tail) + n += srq->rq.size - tail; + else + n -= tail; + if (size <= n) { + ret = -EINVAL; + goto bail_unlock; + } + n = 0; + p = wq->wq; + while (tail != head) { + struct ipath_rwqe *wqe; + int i; + + wqe = get_rwqe_ptr(&srq->rq, tail); + p->wr_id = wqe->wr_id; + p->num_sge = wqe->num_sge; + for (i = 0; i < wqe->num_sge; i++) + p->sg_list[i] = wqe->sg_list[i]; + n++; + p = (struct ipath_rwqe *)((char *) p + sz); + if (++tail >= srq->rq.size) + tail = 0; + } + srq->rq.wq = wq; + srq->rq.size = size; + wq->head = n; + wq->tail = 0; + if (attr_mask & IB_SRQ_LIMIT) + srq->limit = attr->srq_limit; + spin_unlock_irq(&srq->rq.lock); + + vfree(owq); + + if (srq->ip) { + struct ipath_mmap_info *ip = srq->ip; + struct ipath_ibdev *dev = to_idev(srq->ibsrq.device); + u32 s = sizeof(struct ipath_rwq) + size * sz; + + ipath_update_mmap_info(dev, ip, s, wq); + + /* + * Return the offset to mmap. + * See ipath_mmap() for details. + */ + if (udata && udata->inlen >= sizeof(__u64)) { + ret = ib_copy_to_udata(udata, &ip->offset, + sizeof(ip->offset)); + if (ret) + goto bail; + } + + spin_lock_irq(&dev->pending_lock); + if (list_empty(&ip->pending_mmaps)) + list_add(&ip->pending_mmaps, + &dev->pending_mmaps); + spin_unlock_irq(&dev->pending_lock); + } + } else if (attr_mask & IB_SRQ_LIMIT) { + spin_lock_irq(&srq->rq.lock); + if (attr->srq_limit >= srq->rq.size) + ret = -EINVAL; + else + srq->limit = attr->srq_limit; + spin_unlock_irq(&srq->rq.lock); + } + goto bail; + +bail_unlock: + spin_unlock_irq(&srq->rq.lock); +bail_free: + vfree(wq); +bail: + return ret; +} + +int ipath_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr) +{ + struct ipath_srq *srq = to_isrq(ibsrq); + + attr->max_wr = srq->rq.size - 1; + attr->max_sge = srq->rq.max_sge; + attr->srq_limit = srq->limit; + return 0; +} + +/** + * ipath_destroy_srq - destroy a shared receive queue + * @ibsrq: the SRQ to destroy + */ +int ipath_destroy_srq(struct ib_srq *ibsrq) +{ + struct ipath_srq *srq = to_isrq(ibsrq); + struct ipath_ibdev *dev = to_idev(ibsrq->device); + + spin_lock(&dev->n_srqs_lock); + dev->n_srqs_allocated--; + spin_unlock(&dev->n_srqs_lock); + if (srq->ip) + kref_put(&srq->ip->ref, ipath_release_mmap_info); + else + vfree(srq->rq.wq); + kfree(srq); + + return 0; +} diff --git a/drivers/infiniband/hw/ipath/ipath_stats.c b/drivers/infiniband/hw/ipath/ipath_stats.c new file mode 100644 index 0000000..f63e143 --- /dev/null +++ b/drivers/infiniband/hw/ipath/ipath_stats.c @@ -0,0 +1,347 @@ +/* + * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved. + * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "ipath_kernel.h" + +struct infinipath_stats ipath_stats; + +/** + * ipath_snap_cntr - snapshot a chip counter + * @dd: the infinipath device + * @creg: the counter to snapshot + * + * called from add_timer and user counter read calls, to deal with + * counters that wrap in "human time". The words sent and received, and + * the packets sent and received are all that we worry about. For now, + * at least, we don't worry about error counters, because if they wrap + * that quickly, we probably don't care. We may eventually just make this + * handle all the counters. word counters can wrap in about 20 seconds + * of full bandwidth traffic, packet counters in a few hours. + */ + +u64 ipath_snap_cntr(struct ipath_devdata *dd, ipath_creg creg) +{ + u32 val, reg64 = 0; + u64 val64; + unsigned long t0, t1; + u64 ret; + + t0 = jiffies; + /* If fast increment counters are only 32 bits, snapshot them, + * and maintain them as 64bit values in the driver */ + if (!(dd->ipath_flags & IPATH_32BITCOUNTERS) && + (creg == dd->ipath_cregs->cr_wordsendcnt || + creg == dd->ipath_cregs->cr_wordrcvcnt || + creg == dd->ipath_cregs->cr_pktsendcnt || + creg == dd->ipath_cregs->cr_pktrcvcnt)) { + val64 = ipath_read_creg(dd, creg); + val = val64 == ~0ULL ? ~0U : 0; + reg64 = 1; + } else /* val64 just to keep gcc quiet... */ + val64 = val = ipath_read_creg32(dd, creg); + /* + * See if a second has passed. This is just a way to detect things + * that are quite broken. Normally this should take just a few + * cycles (the check is for long enough that we don't care if we get + * pre-empted.) An Opteron HT O read timeout is 4 seconds with + * normal NB values + */ + t1 = jiffies; + if (time_before(t0 + HZ, t1) && val == -1) { + ipath_dev_err(dd, "Error! Read counter 0x%x timed out\n", + creg); + ret = 0ULL; + goto bail; + } + if (reg64) { + ret = val64; + goto bail; + } + + if (creg == dd->ipath_cregs->cr_wordsendcnt) { + if (val != dd->ipath_lastsword) { + dd->ipath_sword += val - dd->ipath_lastsword; + dd->ipath_lastsword = val; + } + val64 = dd->ipath_sword; + } else if (creg == dd->ipath_cregs->cr_wordrcvcnt) { + if (val != dd->ipath_lastrword) { + dd->ipath_rword += val - dd->ipath_lastrword; + dd->ipath_lastrword = val; + } + val64 = dd->ipath_rword; + } else if (creg == dd->ipath_cregs->cr_pktsendcnt) { + if (val != dd->ipath_lastspkts) { + dd->ipath_spkts += val - dd->ipath_lastspkts; + dd->ipath_lastspkts = val; + } + val64 = dd->ipath_spkts; + } else if (creg == dd->ipath_cregs->cr_pktrcvcnt) { + if (val != dd->ipath_lastrpkts) { + dd->ipath_rpkts += val - dd->ipath_lastrpkts; + dd->ipath_lastrpkts = val; + } + val64 = dd->ipath_rpkts; + } else if (creg == dd->ipath_cregs->cr_ibsymbolerrcnt) { + if (dd->ibdeltainprog) + val64 -= val64 - dd->ibsymsnap; + val64 -= dd->ibsymdelta; + } else if (creg == dd->ipath_cregs->cr_iblinkerrrecovcnt) { + if (dd->ibdeltainprog) + val64 -= val64 - dd->iblnkerrsnap; + val64 -= dd->iblnkerrdelta; + } else + val64 = (u64) val; + + ret = val64; + +bail: + return ret; +} + +/** + * ipath_qcheck - print delta of egrfull/hdrqfull errors for kernel ports + * @dd: the infinipath device + * + * print the delta of egrfull/hdrqfull errors for kernel ports no more than + * every 5 seconds. User processes are printed at close, but kernel doesn't + * close, so... Separate routine so may call from other places someday, and + * so function name when printed by _IPATH_INFO is meaningfull + */ +static void ipath_qcheck(struct ipath_devdata *dd) +{ + static u64 last_tot_hdrqfull; + struct ipath_portdata *pd = dd->ipath_pd[0]; + size_t blen = 0; + char buf[128]; + u32 hdrqtail; + + *buf = 0; + if (pd->port_hdrqfull != dd->ipath_p0_hdrqfull) { + blen = snprintf(buf, sizeof buf, "port 0 hdrqfull %u", + pd->port_hdrqfull - + dd->ipath_p0_hdrqfull); + dd->ipath_p0_hdrqfull = pd->port_hdrqfull; + } + if (ipath_stats.sps_etidfull != dd->ipath_last_tidfull) { + blen += snprintf(buf + blen, sizeof buf - blen, + "%srcvegrfull %llu", + blen ? ", " : "", + (unsigned long long) + (ipath_stats.sps_etidfull - + dd->ipath_last_tidfull)); + dd->ipath_last_tidfull = ipath_stats.sps_etidfull; + } + + /* + * this is actually the number of hdrq full interrupts, not actual + * events, but at the moment that's mostly what I'm interested in. + * Actual count, etc. is in the counters, if needed. For production + * users this won't ordinarily be printed. + */ + + if ((ipath_debug & (__IPATH_PKTDBG | __IPATH_DBG)) && + ipath_stats.sps_hdrqfull != last_tot_hdrqfull) { + blen += snprintf(buf + blen, sizeof buf - blen, + "%shdrqfull %llu (all ports)", + blen ? ", " : "", + (unsigned long long) + (ipath_stats.sps_hdrqfull - + last_tot_hdrqfull)); + last_tot_hdrqfull = ipath_stats.sps_hdrqfull; + } + if (blen) + ipath_dbg("%s\n", buf); + + hdrqtail = ipath_get_hdrqtail(pd); + if (pd->port_head != hdrqtail) { + if (dd->ipath_lastport0rcv_cnt == + ipath_stats.sps_port0pkts) { + ipath_cdbg(PKT, "missing rcv interrupts? " + "port0 hd=%x tl=%x; port0pkts %llx; write" + " hd (w/intr)\n", + pd->port_head, hdrqtail, + (unsigned long long) + ipath_stats.sps_port0pkts); + ipath_write_ureg(dd, ur_rcvhdrhead, hdrqtail | + dd->ipath_rhdrhead_intr_off, pd->port_port); + } + dd->ipath_lastport0rcv_cnt = ipath_stats.sps_port0pkts; + } +} + +static void ipath_chk_errormask(struct ipath_devdata *dd) +{ + static u32 fixed; + u32 ctrl; + unsigned long errormask; + unsigned long hwerrs; + + if (!dd->ipath_errormask || !(dd->ipath_flags & IPATH_INITTED)) + return; + + errormask = ipath_read_kreg64(dd, dd->ipath_kregs->kr_errormask); + + if (errormask == dd->ipath_errormask) + return; + fixed++; + + hwerrs = ipath_read_kreg64(dd, dd->ipath_kregs->kr_hwerrstatus); + ctrl = ipath_read_kreg32(dd, dd->ipath_kregs->kr_control); + + ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask, + dd->ipath_errormask); + + if ((hwerrs & dd->ipath_hwerrmask) || + (ctrl & INFINIPATH_C_FREEZEMODE)) { + /* force re-interrupt of pending events, just in case */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrclear, 0ULL); + ipath_write_kreg(dd, dd->ipath_kregs->kr_errorclear, 0ULL); + ipath_write_kreg(dd, dd->ipath_kregs->kr_intclear, 0ULL); + dev_info(&dd->pcidev->dev, + "errormask fixed(%u) %lx -> %lx, ctrl %x hwerr %lx\n", + fixed, errormask, (unsigned long)dd->ipath_errormask, + ctrl, hwerrs); + } else + ipath_dbg("errormask fixed(%u) %lx -> %lx, no freeze\n", + fixed, errormask, + (unsigned long)dd->ipath_errormask); +} + + +/** + * ipath_get_faststats - get word counters from chip before they overflow + * @opaque - contains a pointer to the infinipath device ipath_devdata + * + * called from add_timer + */ +void ipath_get_faststats(unsigned long opaque) +{ + struct ipath_devdata *dd = (struct ipath_devdata *) opaque; + int i; + static unsigned cnt; + unsigned long flags; + u64 traffic_wds; + + /* + * don't access the chip while running diags, or memory diags can + * fail + */ + if (!dd->ipath_kregbase || !(dd->ipath_flags & IPATH_INITTED) || + ipath_diag_inuse) + /* but re-arm the timer, for diags case; won't hurt other */ + goto done; + + /* + * We now try to maintain a "active timer", based on traffic + * exceeding a threshold, so we need to check the word-counts + * even if they are 64-bit. + */ + traffic_wds = ipath_snap_cntr(dd, dd->ipath_cregs->cr_wordsendcnt) + + ipath_snap_cntr(dd, dd->ipath_cregs->cr_wordrcvcnt); + spin_lock_irqsave(&dd->ipath_eep_st_lock, flags); + traffic_wds -= dd->ipath_traffic_wds; + dd->ipath_traffic_wds += traffic_wds; + if (traffic_wds >= IPATH_TRAFFIC_ACTIVE_THRESHOLD) + atomic_add(5, &dd->ipath_active_time); /* S/B #define */ + spin_unlock_irqrestore(&dd->ipath_eep_st_lock, flags); + + if (dd->ipath_flags & IPATH_32BITCOUNTERS) { + ipath_snap_cntr(dd, dd->ipath_cregs->cr_pktsendcnt); + ipath_snap_cntr(dd, dd->ipath_cregs->cr_pktrcvcnt); + } + + ipath_qcheck(dd); + + /* + * deal with repeat error suppression. Doesn't really matter if + * last error was almost a full interval ago, or just a few usecs + * ago; still won't get more than 2 per interval. We may want + * longer intervals for this eventually, could do with mod, counter + * or separate timer. Also see code in ipath_handle_errors() and + * ipath_handle_hwerrors(). + */ + + if (dd->ipath_lasterror) + dd->ipath_lasterror = 0; + if (dd->ipath_lasthwerror) + dd->ipath_lasthwerror = 0; + if (dd->ipath_maskederrs + && time_after(jiffies, dd->ipath_unmasktime)) { + char ebuf[256]; + int iserr; + iserr = ipath_decode_err(dd, ebuf, sizeof ebuf, + dd->ipath_maskederrs); + if (dd->ipath_maskederrs & + ~(INFINIPATH_E_RRCVEGRFULL | INFINIPATH_E_RRCVHDRFULL | + INFINIPATH_E_PKTERRS)) + ipath_dev_err(dd, "Re-enabling masked errors " + "(%s)\n", ebuf); + else { + /* + * rcvegrfull and rcvhdrqfull are "normal", for some + * types of processes (mostly benchmarks) that send + * huge numbers of messages, while not processing + * them. So only complain about these at debug + * level. + */ + if (iserr) + ipath_dbg( + "Re-enabling queue full errors (%s)\n", + ebuf); + else + ipath_cdbg(ERRPKT, "Re-enabling packet" + " problem interrupt (%s)\n", ebuf); + } + + /* re-enable masked errors */ + dd->ipath_errormask |= dd->ipath_maskederrs; + ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask, + dd->ipath_errormask); + dd->ipath_maskederrs = 0; + } + + /* limit qfull messages to ~one per minute per port */ + if ((++cnt & 0x10)) { + for (i = (int) dd->ipath_cfgports; --i >= 0; ) { + struct ipath_portdata *pd = dd->ipath_pd[i]; + + if (pd && pd->port_lastrcvhdrqtail != -1) + pd->port_lastrcvhdrqtail = -1; + } + } + + ipath_chk_errormask(dd); +done: + mod_timer(&dd->ipath_stats_timer, jiffies + HZ * 5); +} diff --git a/drivers/infiniband/hw/ipath/ipath_sysfs.c b/drivers/infiniband/hw/ipath/ipath_sysfs.c new file mode 100644 index 0000000..75558f3 --- /dev/null +++ b/drivers/infiniband/hw/ipath/ipath_sysfs.c @@ -0,0 +1,1238 @@ +/* + * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved. + * Copyright (c) 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include + +#include "ipath_kernel.h" +#include "ipath_verbs.h" +#include "ipath_common.h" + +/** + * ipath_parse_ushort - parse an unsigned short value in an arbitrary base + * @str: the string containing the number + * @valp: where to put the result + * + * returns the number of bytes consumed, or negative value on error + */ +int ipath_parse_ushort(const char *str, unsigned short *valp) +{ + unsigned long val; + char *end; + int ret; + + if (!isdigit(str[0])) { + ret = -EINVAL; + goto bail; + } + + val = simple_strtoul(str, &end, 0); + + if (val > 0xffff) { + ret = -EINVAL; + goto bail; + } + + *valp = val; + + ret = end + 1 - str; + if (ret == 0) + ret = -EINVAL; + +bail: + return ret; +} + +static ssize_t show_version(struct device_driver *dev, char *buf) +{ + /* The string printed here is already newline-terminated. */ + return scnprintf(buf, PAGE_SIZE, "%s", ib_ipath_version); +} + +static ssize_t show_num_units(struct device_driver *dev, char *buf) +{ + return scnprintf(buf, PAGE_SIZE, "%d\n", + ipath_count_units(NULL, NULL, NULL)); +} + +static ssize_t show_status(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + ssize_t ret; + + if (!dd->ipath_statusp) { + ret = -EINVAL; + goto bail; + } + + ret = scnprintf(buf, PAGE_SIZE, "0x%llx\n", + (unsigned long long) *(dd->ipath_statusp)); + +bail: + return ret; +} + +static const char *ipath_status_str[] = { + "Initted", + "Disabled", + "Admin_Disabled", + "", /* This used to be the old "OIB_SMA" status. */ + "", /* This used to be the old "SMA" status. */ + "Present", + "IB_link_up", + "IB_configured", + "NoIBcable", + "Fatal_Hardware_Error", + NULL, +}; + +static ssize_t show_status_str(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + int i, any; + u64 s; + ssize_t ret; + + if (!dd->ipath_statusp) { + ret = -EINVAL; + goto bail; + } + + s = *(dd->ipath_statusp); + *buf = '\0'; + for (any = i = 0; s && ipath_status_str[i]; i++) { + if (s & 1) { + if (any && strlcat(buf, " ", PAGE_SIZE) >= + PAGE_SIZE) + /* overflow */ + break; + if (strlcat(buf, ipath_status_str[i], + PAGE_SIZE) >= PAGE_SIZE) + break; + any = 1; + } + s >>= 1; + } + if (any) + strlcat(buf, "\n", PAGE_SIZE); + + ret = strlen(buf); + +bail: + return ret; +} + +static ssize_t show_boardversion(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + /* The string printed here is already newline-terminated. */ + return scnprintf(buf, PAGE_SIZE, "%s", dd->ipath_boardversion); +} + +static ssize_t show_localbus_info(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + /* The string printed here is already newline-terminated. */ + return scnprintf(buf, PAGE_SIZE, "%s", dd->ipath_lbus_info); +} + +static ssize_t show_lmc(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + + return scnprintf(buf, PAGE_SIZE, "%u\n", dd->ipath_lmc); +} + +static ssize_t store_lmc(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + u16 lmc = 0; + int ret; + + ret = ipath_parse_ushort(buf, &lmc); + if (ret < 0) + goto invalid; + + if (lmc > 7) { + ret = -EINVAL; + goto invalid; + } + + ipath_set_lid(dd, dd->ipath_lid, lmc); + + goto bail; +invalid: + ipath_dev_err(dd, "attempt to set invalid LMC %u\n", lmc); +bail: + return ret; +} + +static ssize_t show_lid(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + + return scnprintf(buf, PAGE_SIZE, "0x%x\n", dd->ipath_lid); +} + +static ssize_t store_lid(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + u16 lid = 0; + int ret; + + ret = ipath_parse_ushort(buf, &lid); + if (ret < 0) + goto invalid; + + if (lid == 0 || lid >= IPATH_MULTICAST_LID_BASE) { + ret = -EINVAL; + goto invalid; + } + + ipath_set_lid(dd, lid, dd->ipath_lmc); + + goto bail; +invalid: + ipath_dev_err(dd, "attempt to set invalid LID 0x%x\n", lid); +bail: + return ret; +} + +static ssize_t show_mlid(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + + return scnprintf(buf, PAGE_SIZE, "0x%x\n", dd->ipath_mlid); +} + +static ssize_t store_mlid(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + u16 mlid; + int ret; + + ret = ipath_parse_ushort(buf, &mlid); + if (ret < 0 || mlid < IPATH_MULTICAST_LID_BASE) + goto invalid; + + dd->ipath_mlid = mlid; + + goto bail; +invalid: + ipath_dev_err(dd, "attempt to set invalid MLID\n"); +bail: + return ret; +} + +static ssize_t show_guid(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + u8 *guid; + + guid = (u8 *) & (dd->ipath_guid); + + return scnprintf(buf, PAGE_SIZE, + "%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x\n", + guid[0], guid[1], guid[2], guid[3], + guid[4], guid[5], guid[6], guid[7]); +} + +static ssize_t store_guid(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + ssize_t ret; + unsigned short guid[8]; + __be64 new_guid; + u8 *ng; + int i; + + if (sscanf(buf, "%hx:%hx:%hx:%hx:%hx:%hx:%hx:%hx", + &guid[0], &guid[1], &guid[2], &guid[3], + &guid[4], &guid[5], &guid[6], &guid[7]) != 8) + goto invalid; + + ng = (u8 *) &new_guid; + + for (i = 0; i < 8; i++) { + if (guid[i] > 0xff) + goto invalid; + ng[i] = guid[i]; + } + + if (new_guid == 0) + goto invalid; + + dd->ipath_guid = new_guid; + dd->ipath_nguid = 1; + if (dd->verbs_dev) + dd->verbs_dev->ibdev.node_guid = new_guid; + + ret = strlen(buf); + goto bail; + +invalid: + ipath_dev_err(dd, "attempt to set invalid GUID\n"); + ret = -EINVAL; + +bail: + return ret; +} + +static ssize_t show_nguid(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + + return scnprintf(buf, PAGE_SIZE, "%u\n", dd->ipath_nguid); +} + +static ssize_t show_nports(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + + /* Return the number of user ports available. */ + return scnprintf(buf, PAGE_SIZE, "%u\n", dd->ipath_cfgports - 1); +} + +static ssize_t show_serial(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + + buf[sizeof dd->ipath_serial] = '\0'; + memcpy(buf, dd->ipath_serial, sizeof dd->ipath_serial); + strcat(buf, "\n"); + return strlen(buf); +} + +static ssize_t show_unit(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + + return scnprintf(buf, PAGE_SIZE, "%u\n", dd->ipath_unit); +} + +static ssize_t show_jint_max_packets(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + + return scnprintf(buf, PAGE_SIZE, "%hu\n", dd->ipath_jint_max_packets); +} + +static ssize_t store_jint_max_packets(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + u16 v = 0; + int ret; + + ret = ipath_parse_ushort(buf, &v); + if (ret < 0) + ipath_dev_err(dd, "invalid jint_max_packets.\n"); + else + dd->ipath_f_config_jint(dd, dd->ipath_jint_idle_ticks, v); + + return ret; +} + +static ssize_t show_jint_idle_ticks(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + + return scnprintf(buf, PAGE_SIZE, "%hu\n", dd->ipath_jint_idle_ticks); +} + +static ssize_t store_jint_idle_ticks(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + u16 v = 0; + int ret; + + ret = ipath_parse_ushort(buf, &v); + if (ret < 0) + ipath_dev_err(dd, "invalid jint_idle_ticks.\n"); + else + dd->ipath_f_config_jint(dd, v, dd->ipath_jint_max_packets); + + return ret; +} + +#define DEVICE_COUNTER(name, attr) \ + static ssize_t show_counter_##name(struct device *dev, \ + struct device_attribute *attr, \ + char *buf) \ + { \ + struct ipath_devdata *dd = dev_get_drvdata(dev); \ + return scnprintf(\ + buf, PAGE_SIZE, "%llu\n", (unsigned long long) \ + ipath_snap_cntr( \ + dd, offsetof(struct infinipath_counters, \ + attr) / sizeof(u64))); \ + } \ + static DEVICE_ATTR(name, S_IRUGO, show_counter_##name, NULL); + +DEVICE_COUNTER(ib_link_downeds, IBLinkDownedCnt); +DEVICE_COUNTER(ib_link_err_recoveries, IBLinkErrRecoveryCnt); +DEVICE_COUNTER(ib_status_changes, IBStatusChangeCnt); +DEVICE_COUNTER(ib_symbol_errs, IBSymbolErrCnt); +DEVICE_COUNTER(lb_flow_stalls, LBFlowStallCnt); +DEVICE_COUNTER(lb_ints, LBIntCnt); +DEVICE_COUNTER(rx_bad_formats, RxBadFormatCnt); +DEVICE_COUNTER(rx_buf_ovfls, RxBufOvflCnt); +DEVICE_COUNTER(rx_data_pkts, RxDataPktCnt); +DEVICE_COUNTER(rx_dropped_pkts, RxDroppedPktCnt); +DEVICE_COUNTER(rx_dwords, RxDwordCnt); +DEVICE_COUNTER(rx_ebps, RxEBPCnt); +DEVICE_COUNTER(rx_flow_ctrl_errs, RxFlowCtrlErrCnt); +DEVICE_COUNTER(rx_flow_pkts, RxFlowPktCnt); +DEVICE_COUNTER(rx_icrc_errs, RxICRCErrCnt); +DEVICE_COUNTER(rx_len_errs, RxLenErrCnt); +DEVICE_COUNTER(rx_link_problems, RxLinkProblemCnt); +DEVICE_COUNTER(rx_lpcrc_errs, RxLPCRCErrCnt); +DEVICE_COUNTER(rx_max_min_len_errs, RxMaxMinLenErrCnt); +DEVICE_COUNTER(rx_p0_hdr_egr_ovfls, RxP0HdrEgrOvflCnt); +DEVICE_COUNTER(rx_p1_hdr_egr_ovfls, RxP1HdrEgrOvflCnt); +DEVICE_COUNTER(rx_p2_hdr_egr_ovfls, RxP2HdrEgrOvflCnt); +DEVICE_COUNTER(rx_p3_hdr_egr_ovfls, RxP3HdrEgrOvflCnt); +DEVICE_COUNTER(rx_p4_hdr_egr_ovfls, RxP4HdrEgrOvflCnt); +DEVICE_COUNTER(rx_p5_hdr_egr_ovfls, RxP5HdrEgrOvflCnt); +DEVICE_COUNTER(rx_p6_hdr_egr_ovfls, RxP6HdrEgrOvflCnt); +DEVICE_COUNTER(rx_p7_hdr_egr_ovfls, RxP7HdrEgrOvflCnt); +DEVICE_COUNTER(rx_p8_hdr_egr_ovfls, RxP8HdrEgrOvflCnt); +DEVICE_COUNTER(rx_pkey_mismatches, RxPKeyMismatchCnt); +DEVICE_COUNTER(rx_tid_full_errs, RxTIDFullErrCnt); +DEVICE_COUNTER(rx_tid_valid_errs, RxTIDValidErrCnt); +DEVICE_COUNTER(rx_vcrc_errs, RxVCRCErrCnt); +DEVICE_COUNTER(tx_data_pkts, TxDataPktCnt); +DEVICE_COUNTER(tx_dropped_pkts, TxDroppedPktCnt); +DEVICE_COUNTER(tx_dwords, TxDwordCnt); +DEVICE_COUNTER(tx_flow_pkts, TxFlowPktCnt); +DEVICE_COUNTER(tx_flow_stalls, TxFlowStallCnt); +DEVICE_COUNTER(tx_len_errs, TxLenErrCnt); +DEVICE_COUNTER(tx_max_min_len_errs, TxMaxMinLenErrCnt); +DEVICE_COUNTER(tx_underruns, TxUnderrunCnt); +DEVICE_COUNTER(tx_unsup_vl_errs, TxUnsupVLErrCnt); + +static struct attribute *dev_counter_attributes[] = { + &dev_attr_ib_link_downeds.attr, + &dev_attr_ib_link_err_recoveries.attr, + &dev_attr_ib_status_changes.attr, + &dev_attr_ib_symbol_errs.attr, + &dev_attr_lb_flow_stalls.attr, + &dev_attr_lb_ints.attr, + &dev_attr_rx_bad_formats.attr, + &dev_attr_rx_buf_ovfls.attr, + &dev_attr_rx_data_pkts.attr, + &dev_attr_rx_dropped_pkts.attr, + &dev_attr_rx_dwords.attr, + &dev_attr_rx_ebps.attr, + &dev_attr_rx_flow_ctrl_errs.attr, + &dev_attr_rx_flow_pkts.attr, + &dev_attr_rx_icrc_errs.attr, + &dev_attr_rx_len_errs.attr, + &dev_attr_rx_link_problems.attr, + &dev_attr_rx_lpcrc_errs.attr, + &dev_attr_rx_max_min_len_errs.attr, + &dev_attr_rx_p0_hdr_egr_ovfls.attr, + &dev_attr_rx_p1_hdr_egr_ovfls.attr, + &dev_attr_rx_p2_hdr_egr_ovfls.attr, + &dev_attr_rx_p3_hdr_egr_ovfls.attr, + &dev_attr_rx_p4_hdr_egr_ovfls.attr, + &dev_attr_rx_p5_hdr_egr_ovfls.attr, + &dev_attr_rx_p6_hdr_egr_ovfls.attr, + &dev_attr_rx_p7_hdr_egr_ovfls.attr, + &dev_attr_rx_p8_hdr_egr_ovfls.attr, + &dev_attr_rx_pkey_mismatches.attr, + &dev_attr_rx_tid_full_errs.attr, + &dev_attr_rx_tid_valid_errs.attr, + &dev_attr_rx_vcrc_errs.attr, + &dev_attr_tx_data_pkts.attr, + &dev_attr_tx_dropped_pkts.attr, + &dev_attr_tx_dwords.attr, + &dev_attr_tx_flow_pkts.attr, + &dev_attr_tx_flow_stalls.attr, + &dev_attr_tx_len_errs.attr, + &dev_attr_tx_max_min_len_errs.attr, + &dev_attr_tx_underruns.attr, + &dev_attr_tx_unsup_vl_errs.attr, + NULL +}; + +static struct attribute_group dev_counter_attr_group = { + .name = "counters", + .attrs = dev_counter_attributes +}; + +static ssize_t store_reset(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + int ret; + + if (count < 5 || memcmp(buf, "reset", 5)) { + ret = -EINVAL; + goto bail; + } + + if (dd->ipath_flags & IPATH_DISABLED) { + /* + * post-reset init would re-enable interrupts, etc. + * so don't allow reset on disabled devices. Not + * perfect error, but about the best choice. + */ + dev_info(dev,"Unit %d is disabled, can't reset\n", + dd->ipath_unit); + ret = -EINVAL; + goto bail; + } + ret = ipath_reset_device(dd->ipath_unit); +bail: + return ret<0 ? ret : count; +} + +static ssize_t store_link_state(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + int ret, r; + u16 state; + + ret = ipath_parse_ushort(buf, &state); + if (ret < 0) + goto invalid; + + r = ipath_set_linkstate(dd, state); + if (r < 0) { + ret = r; + goto bail; + } + + goto bail; +invalid: + ipath_dev_err(dd, "attempt to set invalid link state\n"); +bail: + return ret; +} + +static ssize_t show_mtu(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + return scnprintf(buf, PAGE_SIZE, "%u\n", dd->ipath_ibmtu); +} + +static ssize_t store_mtu(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + ssize_t ret; + u16 mtu = 0; + int r; + + ret = ipath_parse_ushort(buf, &mtu); + if (ret < 0) + goto invalid; + + r = ipath_set_mtu(dd, mtu); + if (r < 0) + ret = r; + + goto bail; +invalid: + ipath_dev_err(dd, "attempt to set invalid MTU\n"); +bail: + return ret; +} + +static ssize_t show_enabled(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + return scnprintf(buf, PAGE_SIZE, "%u\n", + (dd->ipath_flags & IPATH_DISABLED) ? 0 : 1); +} + +static ssize_t store_enabled(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + ssize_t ret; + u16 enable = 0; + + ret = ipath_parse_ushort(buf, &enable); + if (ret < 0) { + ipath_dev_err(dd, "attempt to use non-numeric on enable\n"); + goto bail; + } + + if (enable) { + if (!(dd->ipath_flags & IPATH_DISABLED)) + goto bail; + + dev_info(dev, "Enabling unit %d\n", dd->ipath_unit); + /* same as post-reset */ + ret = ipath_init_chip(dd, 1); + if (ret) + ipath_dev_err(dd, "Failed to enable unit %d\n", + dd->ipath_unit); + else { + dd->ipath_flags &= ~IPATH_DISABLED; + *dd->ipath_statusp &= ~IPATH_STATUS_ADMIN_DISABLED; + } + } + else if (!(dd->ipath_flags & IPATH_DISABLED)) { + dev_info(dev, "Disabling unit %d\n", dd->ipath_unit); + ipath_shutdown_device(dd); + dd->ipath_flags |= IPATH_DISABLED; + *dd->ipath_statusp |= IPATH_STATUS_ADMIN_DISABLED; + } + +bail: + return ret; +} + +static ssize_t store_rx_pol_inv(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + int ret, r; + u16 val; + + ret = ipath_parse_ushort(buf, &val); + if (ret < 0) + goto invalid; + + r = ipath_set_rx_pol_inv(dd, val); + if (r < 0) { + ret = r; + goto bail; + } + + goto bail; +invalid: + ipath_dev_err(dd, "attempt to set invalid Rx Polarity invert\n"); +bail: + return ret; +} + +static ssize_t store_led_override(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + int ret; + u16 val; + + ret = ipath_parse_ushort(buf, &val); + if (ret > 0) + ipath_set_led_override(dd, val); + else + ipath_dev_err(dd, "attempt to set invalid LED override\n"); + return ret; +} + +static ssize_t show_logged_errs(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + int idx, count; + + /* force consistency with actual EEPROM */ + if (ipath_update_eeprom_log(dd) != 0) + return -ENXIO; + + count = 0; + for (idx = 0; idx < IPATH_EEP_LOG_CNT; ++idx) { + count += scnprintf(buf + count, PAGE_SIZE - count, "%d%c", + dd->ipath_eep_st_errs[idx], + idx == (IPATH_EEP_LOG_CNT - 1) ? '\n' : ' '); + } + + return count; +} + +/* + * New sysfs entries to control various IB config. These all turn into + * accesses via ipath_f_get/set_ib_cfg. + * + * Get/Set heartbeat enable. Or of 1=enabled, 2=auto + */ +static ssize_t show_hrtbt_enb(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + int ret; + + ret = dd->ipath_f_get_ib_cfg(dd, IPATH_IB_CFG_HRTBT); + if (ret >= 0) + ret = scnprintf(buf, PAGE_SIZE, "%d\n", ret); + return ret; +} + +static ssize_t store_hrtbt_enb(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + int ret, r; + u16 val; + + ret = ipath_parse_ushort(buf, &val); + if (ret >= 0 && val > 3) + ret = -EINVAL; + if (ret < 0) { + ipath_dev_err(dd, "attempt to set invalid Heartbeat enable\n"); + goto bail; + } + + /* + * Set the "intentional" heartbeat enable per either of + * "Enable" and "Auto", as these are normally set together. + * This bit is consulted when leaving loopback mode, + * because entering loopback mode overrides it and automatically + * disables heartbeat. + */ + r = dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_HRTBT, val); + if (r < 0) + ret = r; + else if (val == IPATH_IB_HRTBT_OFF) + dd->ipath_flags |= IPATH_NO_HRTBT; + else + dd->ipath_flags &= ~IPATH_NO_HRTBT; + +bail: + return ret; +} + +/* + * Get/Set Link-widths enabled. Or of 1=1x, 2=4x (this is human/IB centric, + * _not_ the particular encoding of any given chip) + */ +static ssize_t show_lwid_enb(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + int ret; + + ret = dd->ipath_f_get_ib_cfg(dd, IPATH_IB_CFG_LWID_ENB); + if (ret >= 0) + ret = scnprintf(buf, PAGE_SIZE, "%d\n", ret); + return ret; +} + +static ssize_t store_lwid_enb(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + int ret, r; + u16 val; + + ret = ipath_parse_ushort(buf, &val); + if (ret >= 0 && (val == 0 || val > 3)) + ret = -EINVAL; + if (ret < 0) { + ipath_dev_err(dd, + "attempt to set invalid Link Width (enable)\n"); + goto bail; + } + + r = dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_LWID_ENB, val); + if (r < 0) + ret = r; + +bail: + return ret; +} + +/* Get current link width */ +static ssize_t show_lwid(struct device *dev, + struct device_attribute *attr, + char *buf) + +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + int ret; + + ret = dd->ipath_f_get_ib_cfg(dd, IPATH_IB_CFG_LWID); + if (ret >= 0) + ret = scnprintf(buf, PAGE_SIZE, "%d\n", ret); + return ret; +} + +/* + * Get/Set Link-speeds enabled. Or of 1=SDR 2=DDR. + */ +static ssize_t show_spd_enb(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + int ret; + + ret = dd->ipath_f_get_ib_cfg(dd, IPATH_IB_CFG_SPD_ENB); + if (ret >= 0) + ret = scnprintf(buf, PAGE_SIZE, "%d\n", ret); + return ret; +} + +static ssize_t store_spd_enb(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + int ret, r; + u16 val; + + ret = ipath_parse_ushort(buf, &val); + if (ret >= 0 && (val == 0 || val > (IPATH_IB_SDR | IPATH_IB_DDR))) + ret = -EINVAL; + if (ret < 0) { + ipath_dev_err(dd, + "attempt to set invalid Link Speed (enable)\n"); + goto bail; + } + + r = dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_SPD_ENB, val); + if (r < 0) + ret = r; + +bail: + return ret; +} + +/* Get current link speed */ +static ssize_t show_spd(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + int ret; + + ret = dd->ipath_f_get_ib_cfg(dd, IPATH_IB_CFG_SPD); + if (ret >= 0) + ret = scnprintf(buf, PAGE_SIZE, "%d\n", ret); + return ret; +} + +/* + * Get/Set RX polarity-invert enable. 0=no, 1=yes. + */ +static ssize_t show_rx_polinv_enb(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + int ret; + + ret = dd->ipath_f_get_ib_cfg(dd, IPATH_IB_CFG_RXPOL_ENB); + if (ret >= 0) + ret = scnprintf(buf, PAGE_SIZE, "%d\n", ret); + return ret; +} + +static ssize_t store_rx_polinv_enb(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + int ret, r; + u16 val; + + ret = ipath_parse_ushort(buf, &val); + if (ret >= 0 && val > 1) { + ipath_dev_err(dd, + "attempt to set invalid Rx Polarity (enable)\n"); + ret = -EINVAL; + goto bail; + } + + r = dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_RXPOL_ENB, val); + if (r < 0) + ret = r; + +bail: + return ret; +} + +/* + * Get/Set RX lane-reversal enable. 0=no, 1=yes. + */ +static ssize_t show_lanerev_enb(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + int ret; + + ret = dd->ipath_f_get_ib_cfg(dd, IPATH_IB_CFG_LREV_ENB); + if (ret >= 0) + ret = scnprintf(buf, PAGE_SIZE, "%d\n", ret); + return ret; +} + +static ssize_t store_lanerev_enb(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + int ret, r; + u16 val; + + ret = ipath_parse_ushort(buf, &val); + if (ret >= 0 && val > 1) { + ret = -EINVAL; + ipath_dev_err(dd, + "attempt to set invalid Lane reversal (enable)\n"); + goto bail; + } + + r = dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_LREV_ENB, val); + if (r < 0) + ret = r; + +bail: + return ret; +} + +static DRIVER_ATTR(num_units, S_IRUGO, show_num_units, NULL); +static DRIVER_ATTR(version, S_IRUGO, show_version, NULL); + +static struct attribute *driver_attributes[] = { + &driver_attr_num_units.attr, + &driver_attr_version.attr, + NULL +}; + +static struct attribute_group driver_attr_group = { + .attrs = driver_attributes +}; + +static ssize_t store_tempsense(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + int ret, stat; + u16 val; + + ret = ipath_parse_ushort(buf, &val); + if (ret <= 0) { + ipath_dev_err(dd, "attempt to set invalid tempsense config\n"); + goto bail; + } + /* If anything but the highest limit, enable T_CRIT_A "interrupt" */ + stat = ipath_tempsense_write(dd, 9, (val == 0x7f7f) ? 0x80 : 0); + if (stat) { + ipath_dev_err(dd, "Unable to set tempsense config\n"); + ret = -1; + goto bail; + } + stat = ipath_tempsense_write(dd, 0xB, (u8) (val & 0xFF)); + if (stat) { + ipath_dev_err(dd, "Unable to set local Tcrit\n"); + ret = -1; + goto bail; + } + stat = ipath_tempsense_write(dd, 0xD, (u8) (val >> 8)); + if (stat) { + ipath_dev_err(dd, "Unable to set remote Tcrit\n"); + ret = -1; + goto bail; + } + +bail: + return ret; +} + +/* + * dump tempsense regs. in decimal, to ease shell-scripts. + */ +static ssize_t show_tempsense(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + int ret; + int idx; + u8 regvals[8]; + + ret = -ENXIO; + for (idx = 0; idx < 8; ++idx) { + if (idx == 6) + continue; + ret = ipath_tempsense_read(dd, idx); + if (ret < 0) + break; + regvals[idx] = ret; + } + if (idx == 8) + ret = scnprintf(buf, PAGE_SIZE, "%d %d %02X %02X %d %d\n", + *(signed char *)(regvals), + *(signed char *)(regvals + 1), + regvals[2], regvals[3], + *(signed char *)(regvals + 5), + *(signed char *)(regvals + 7)); + return ret; +} + +const struct attribute_group *ipath_driver_attr_groups[] = { + &driver_attr_group, + NULL, +}; + +static DEVICE_ATTR(guid, S_IWUSR | S_IRUGO, show_guid, store_guid); +static DEVICE_ATTR(lmc, S_IWUSR | S_IRUGO, show_lmc, store_lmc); +static DEVICE_ATTR(lid, S_IWUSR | S_IRUGO, show_lid, store_lid); +static DEVICE_ATTR(link_state, S_IWUSR, NULL, store_link_state); +static DEVICE_ATTR(mlid, S_IWUSR | S_IRUGO, show_mlid, store_mlid); +static DEVICE_ATTR(mtu, S_IWUSR | S_IRUGO, show_mtu, store_mtu); +static DEVICE_ATTR(enabled, S_IWUSR | S_IRUGO, show_enabled, store_enabled); +static DEVICE_ATTR(nguid, S_IRUGO, show_nguid, NULL); +static DEVICE_ATTR(nports, S_IRUGO, show_nports, NULL); +static DEVICE_ATTR(reset, S_IWUSR, NULL, store_reset); +static DEVICE_ATTR(serial, S_IRUGO, show_serial, NULL); +static DEVICE_ATTR(status, S_IRUGO, show_status, NULL); +static DEVICE_ATTR(status_str, S_IRUGO, show_status_str, NULL); +static DEVICE_ATTR(boardversion, S_IRUGO, show_boardversion, NULL); +static DEVICE_ATTR(unit, S_IRUGO, show_unit, NULL); +static DEVICE_ATTR(rx_pol_inv, S_IWUSR, NULL, store_rx_pol_inv); +static DEVICE_ATTR(led_override, S_IWUSR, NULL, store_led_override); +static DEVICE_ATTR(logged_errors, S_IRUGO, show_logged_errs, NULL); +static DEVICE_ATTR(localbus_info, S_IRUGO, show_localbus_info, NULL); +static DEVICE_ATTR(jint_max_packets, S_IWUSR | S_IRUGO, + show_jint_max_packets, store_jint_max_packets); +static DEVICE_ATTR(jint_idle_ticks, S_IWUSR | S_IRUGO, + show_jint_idle_ticks, store_jint_idle_ticks); +static DEVICE_ATTR(tempsense, S_IWUSR | S_IRUGO, + show_tempsense, store_tempsense); + +static struct attribute *dev_attributes[] = { + &dev_attr_guid.attr, + &dev_attr_lmc.attr, + &dev_attr_lid.attr, + &dev_attr_link_state.attr, + &dev_attr_mlid.attr, + &dev_attr_mtu.attr, + &dev_attr_nguid.attr, + &dev_attr_nports.attr, + &dev_attr_serial.attr, + &dev_attr_status.attr, + &dev_attr_status_str.attr, + &dev_attr_boardversion.attr, + &dev_attr_unit.attr, + &dev_attr_enabled.attr, + &dev_attr_rx_pol_inv.attr, + &dev_attr_led_override.attr, + &dev_attr_logged_errors.attr, + &dev_attr_tempsense.attr, + &dev_attr_localbus_info.attr, + NULL +}; + +static struct attribute_group dev_attr_group = { + .attrs = dev_attributes +}; + +static DEVICE_ATTR(hrtbt_enable, S_IWUSR | S_IRUGO, show_hrtbt_enb, + store_hrtbt_enb); +static DEVICE_ATTR(link_width_enable, S_IWUSR | S_IRUGO, show_lwid_enb, + store_lwid_enb); +static DEVICE_ATTR(link_width, S_IRUGO, show_lwid, NULL); +static DEVICE_ATTR(link_speed_enable, S_IWUSR | S_IRUGO, show_spd_enb, + store_spd_enb); +static DEVICE_ATTR(link_speed, S_IRUGO, show_spd, NULL); +static DEVICE_ATTR(rx_pol_inv_enable, S_IWUSR | S_IRUGO, show_rx_polinv_enb, + store_rx_polinv_enb); +static DEVICE_ATTR(rx_lane_rev_enable, S_IWUSR | S_IRUGO, show_lanerev_enb, + store_lanerev_enb); + +static struct attribute *dev_ibcfg_attributes[] = { + &dev_attr_hrtbt_enable.attr, + &dev_attr_link_width_enable.attr, + &dev_attr_link_width.attr, + &dev_attr_link_speed_enable.attr, + &dev_attr_link_speed.attr, + &dev_attr_rx_pol_inv_enable.attr, + &dev_attr_rx_lane_rev_enable.attr, + NULL +}; + +static struct attribute_group dev_ibcfg_attr_group = { + .attrs = dev_ibcfg_attributes +}; + +/** + * ipath_expose_reset - create a device reset file + * @dev: the device structure + * + * Only expose a file that lets us reset the device after someone + * enters diag mode. A device reset is quite likely to crash the + * machine entirely, so we don't want to normally make it + * available. + * + * Called with ipath_mutex held. + */ +int ipath_expose_reset(struct device *dev) +{ + static int exposed; + int ret; + + if (!exposed) { + ret = device_create_file(dev, &dev_attr_reset); + exposed = 1; + } + else + ret = 0; + + return ret; +} + +int ipath_device_create_group(struct device *dev, struct ipath_devdata *dd) +{ + int ret; + + ret = sysfs_create_group(&dev->kobj, &dev_attr_group); + if (ret) + goto bail; + + ret = sysfs_create_group(&dev->kobj, &dev_counter_attr_group); + if (ret) + goto bail_attrs; + + if (dd->ipath_flags & IPATH_HAS_MULT_IB_SPEED) { + ret = device_create_file(dev, &dev_attr_jint_idle_ticks); + if (ret) + goto bail_counter; + ret = device_create_file(dev, &dev_attr_jint_max_packets); + if (ret) + goto bail_idle; + + ret = sysfs_create_group(&dev->kobj, &dev_ibcfg_attr_group); + if (ret) + goto bail_max; + } + + return 0; + +bail_max: + device_remove_file(dev, &dev_attr_jint_max_packets); +bail_idle: + device_remove_file(dev, &dev_attr_jint_idle_ticks); +bail_counter: + sysfs_remove_group(&dev->kobj, &dev_counter_attr_group); +bail_attrs: + sysfs_remove_group(&dev->kobj, &dev_attr_group); +bail: + return ret; +} + +void ipath_device_remove_group(struct device *dev, struct ipath_devdata *dd) +{ + sysfs_remove_group(&dev->kobj, &dev_counter_attr_group); + + if (dd->ipath_flags & IPATH_HAS_MULT_IB_SPEED) { + sysfs_remove_group(&dev->kobj, &dev_ibcfg_attr_group); + device_remove_file(dev, &dev_attr_jint_idle_ticks); + device_remove_file(dev, &dev_attr_jint_max_packets); + } + + sysfs_remove_group(&dev->kobj, &dev_attr_group); + + device_remove_file(dev, &dev_attr_reset); +} diff --git a/drivers/infiniband/hw/ipath/ipath_uc.c b/drivers/infiniband/hw/ipath/ipath_uc.c new file mode 100644 index 0000000..22e6099 --- /dev/null +++ b/drivers/infiniband/hw/ipath/ipath_uc.c @@ -0,0 +1,547 @@ +/* + * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved. + * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "ipath_verbs.h" +#include "ipath_kernel.h" + +/* cut down ridiculously long IB macro names */ +#define OP(x) IB_OPCODE_UC_##x + +/** + * ipath_make_uc_req - construct a request packet (SEND, RDMA write) + * @qp: a pointer to the QP + * + * Return 1 if constructed; otherwise, return 0. + */ +int ipath_make_uc_req(struct ipath_qp *qp) +{ + struct ipath_other_headers *ohdr; + struct ipath_swqe *wqe; + unsigned long flags; + u32 hwords; + u32 bth0; + u32 len; + u32 pmtu = ib_mtu_enum_to_int(qp->path_mtu); + int ret = 0; + + spin_lock_irqsave(&qp->s_lock, flags); + + if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK)) { + if (!(ib_ipath_state_ops[qp->state] & IPATH_FLUSH_SEND)) + goto bail; + /* We are in the error state, flush the work request. */ + if (qp->s_last == qp->s_head) + goto bail; + /* If DMAs are in progress, we can't flush immediately. */ + if (atomic_read(&qp->s_dma_busy)) { + qp->s_flags |= IPATH_S_WAIT_DMA; + goto bail; + } + wqe = get_swqe_ptr(qp, qp->s_last); + ipath_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR); + goto done; + } + + ohdr = &qp->s_hdr.u.oth; + if (qp->remote_ah_attr.ah_flags & IB_AH_GRH) + ohdr = &qp->s_hdr.u.l.oth; + + /* header size in 32-bit words LRH+BTH = (8+12)/4. */ + hwords = 5; + bth0 = 1 << 22; /* Set M bit */ + + /* Get the next send request. */ + wqe = get_swqe_ptr(qp, qp->s_cur); + qp->s_wqe = NULL; + switch (qp->s_state) { + default: + if (!(ib_ipath_state_ops[qp->state] & + IPATH_PROCESS_NEXT_SEND_OK)) + goto bail; + /* Check if send work queue is empty. */ + if (qp->s_cur == qp->s_head) + goto bail; + /* + * Start a new request. + */ + qp->s_psn = wqe->psn = qp->s_next_psn; + qp->s_sge.sge = wqe->sg_list[0]; + qp->s_sge.sg_list = wqe->sg_list + 1; + qp->s_sge.num_sge = wqe->wr.num_sge; + qp->s_len = len = wqe->length; + switch (wqe->wr.opcode) { + case IB_WR_SEND: + case IB_WR_SEND_WITH_IMM: + if (len > pmtu) { + qp->s_state = OP(SEND_FIRST); + len = pmtu; + break; + } + if (wqe->wr.opcode == IB_WR_SEND) + qp->s_state = OP(SEND_ONLY); + else { + qp->s_state = + OP(SEND_ONLY_WITH_IMMEDIATE); + /* Immediate data comes after the BTH */ + ohdr->u.imm_data = wqe->wr.ex.imm_data; + hwords += 1; + } + if (wqe->wr.send_flags & IB_SEND_SOLICITED) + bth0 |= 1 << 23; + qp->s_wqe = wqe; + if (++qp->s_cur >= qp->s_size) + qp->s_cur = 0; + break; + + case IB_WR_RDMA_WRITE: + case IB_WR_RDMA_WRITE_WITH_IMM: + ohdr->u.rc.reth.vaddr = + cpu_to_be64(wqe->wr.wr.rdma.remote_addr); + ohdr->u.rc.reth.rkey = + cpu_to_be32(wqe->wr.wr.rdma.rkey); + ohdr->u.rc.reth.length = cpu_to_be32(len); + hwords += sizeof(struct ib_reth) / 4; + if (len > pmtu) { + qp->s_state = OP(RDMA_WRITE_FIRST); + len = pmtu; + break; + } + if (wqe->wr.opcode == IB_WR_RDMA_WRITE) + qp->s_state = OP(RDMA_WRITE_ONLY); + else { + qp->s_state = + OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE); + /* Immediate data comes after the RETH */ + ohdr->u.rc.imm_data = wqe->wr.ex.imm_data; + hwords += 1; + if (wqe->wr.send_flags & IB_SEND_SOLICITED) + bth0 |= 1 << 23; + } + qp->s_wqe = wqe; + if (++qp->s_cur >= qp->s_size) + qp->s_cur = 0; + break; + + default: + goto bail; + } + break; + + case OP(SEND_FIRST): + qp->s_state = OP(SEND_MIDDLE); + /* FALLTHROUGH */ + case OP(SEND_MIDDLE): + len = qp->s_len; + if (len > pmtu) { + len = pmtu; + break; + } + if (wqe->wr.opcode == IB_WR_SEND) + qp->s_state = OP(SEND_LAST); + else { + qp->s_state = OP(SEND_LAST_WITH_IMMEDIATE); + /* Immediate data comes after the BTH */ + ohdr->u.imm_data = wqe->wr.ex.imm_data; + hwords += 1; + } + if (wqe->wr.send_flags & IB_SEND_SOLICITED) + bth0 |= 1 << 23; + qp->s_wqe = wqe; + if (++qp->s_cur >= qp->s_size) + qp->s_cur = 0; + break; + + case OP(RDMA_WRITE_FIRST): + qp->s_state = OP(RDMA_WRITE_MIDDLE); + /* FALLTHROUGH */ + case OP(RDMA_WRITE_MIDDLE): + len = qp->s_len; + if (len > pmtu) { + len = pmtu; + break; + } + if (wqe->wr.opcode == IB_WR_RDMA_WRITE) + qp->s_state = OP(RDMA_WRITE_LAST); + else { + qp->s_state = + OP(RDMA_WRITE_LAST_WITH_IMMEDIATE); + /* Immediate data comes after the BTH */ + ohdr->u.imm_data = wqe->wr.ex.imm_data; + hwords += 1; + if (wqe->wr.send_flags & IB_SEND_SOLICITED) + bth0 |= 1 << 23; + } + qp->s_wqe = wqe; + if (++qp->s_cur >= qp->s_size) + qp->s_cur = 0; + break; + } + qp->s_len -= len; + qp->s_hdrwords = hwords; + qp->s_cur_sge = &qp->s_sge; + qp->s_cur_size = len; + ipath_make_ruc_header(to_idev(qp->ibqp.device), + qp, ohdr, bth0 | (qp->s_state << 24), + qp->s_next_psn++ & IPATH_PSN_MASK); +done: + ret = 1; + goto unlock; + +bail: + qp->s_flags &= ~IPATH_S_BUSY; +unlock: + spin_unlock_irqrestore(&qp->s_lock, flags); + return ret; +} + +/** + * ipath_uc_rcv - handle an incoming UC packet + * @dev: the device the packet came in on + * @hdr: the header of the packet + * @has_grh: true if the packet has a GRH + * @data: the packet data + * @tlen: the length of the packet + * @qp: the QP for this packet. + * + * This is called from ipath_qp_rcv() to process an incoming UC packet + * for the given QP. + * Called at interrupt level. + */ +void ipath_uc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, + int has_grh, void *data, u32 tlen, struct ipath_qp *qp) +{ + struct ipath_other_headers *ohdr; + int opcode; + u32 hdrsize; + u32 psn; + u32 pad; + struct ib_wc wc; + u32 pmtu = ib_mtu_enum_to_int(qp->path_mtu); + struct ib_reth *reth; + int header_in_data; + + /* Validate the SLID. See Ch. 9.6.1.5 */ + if (unlikely(be16_to_cpu(hdr->lrh[3]) != qp->remote_ah_attr.dlid)) + goto done; + + /* Check for GRH */ + if (!has_grh) { + ohdr = &hdr->u.oth; + hdrsize = 8 + 12; /* LRH + BTH */ + psn = be32_to_cpu(ohdr->bth[2]); + header_in_data = 0; + } else { + ohdr = &hdr->u.l.oth; + hdrsize = 8 + 40 + 12; /* LRH + GRH + BTH */ + /* + * The header with GRH is 60 bytes and the + * core driver sets the eager header buffer + * size to 56 bytes so the last 4 bytes of + * the BTH header (PSN) is in the data buffer. + */ + header_in_data = dev->dd->ipath_rcvhdrentsize == 16; + if (header_in_data) { + psn = be32_to_cpu(((__be32 *) data)[0]); + data += sizeof(__be32); + } else + psn = be32_to_cpu(ohdr->bth[2]); + } + /* + * The opcode is in the low byte when its in network order + * (top byte when in host order). + */ + opcode = be32_to_cpu(ohdr->bth[0]) >> 24; + + memset(&wc, 0, sizeof wc); + + /* Compare the PSN verses the expected PSN. */ + if (unlikely(ipath_cmp24(psn, qp->r_psn) != 0)) { + /* + * Handle a sequence error. + * Silently drop any current message. + */ + qp->r_psn = psn; + inv: + qp->r_state = OP(SEND_LAST); + switch (opcode) { + case OP(SEND_FIRST): + case OP(SEND_ONLY): + case OP(SEND_ONLY_WITH_IMMEDIATE): + goto send_first; + + case OP(RDMA_WRITE_FIRST): + case OP(RDMA_WRITE_ONLY): + case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE): + goto rdma_first; + + default: + dev->n_pkt_drops++; + goto done; + } + } + + /* Check for opcode sequence errors. */ + switch (qp->r_state) { + case OP(SEND_FIRST): + case OP(SEND_MIDDLE): + if (opcode == OP(SEND_MIDDLE) || + opcode == OP(SEND_LAST) || + opcode == OP(SEND_LAST_WITH_IMMEDIATE)) + break; + goto inv; + + case OP(RDMA_WRITE_FIRST): + case OP(RDMA_WRITE_MIDDLE): + if (opcode == OP(RDMA_WRITE_MIDDLE) || + opcode == OP(RDMA_WRITE_LAST) || + opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE)) + break; + goto inv; + + default: + if (opcode == OP(SEND_FIRST) || + opcode == OP(SEND_ONLY) || + opcode == OP(SEND_ONLY_WITH_IMMEDIATE) || + opcode == OP(RDMA_WRITE_FIRST) || + opcode == OP(RDMA_WRITE_ONLY) || + opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE)) + break; + goto inv; + } + + /* OK, process the packet. */ + switch (opcode) { + case OP(SEND_FIRST): + case OP(SEND_ONLY): + case OP(SEND_ONLY_WITH_IMMEDIATE): + send_first: + if (qp->r_flags & IPATH_R_REUSE_SGE) { + qp->r_flags &= ~IPATH_R_REUSE_SGE; + qp->r_sge = qp->s_rdma_read_sge; + } else if (!ipath_get_rwqe(qp, 0)) { + dev->n_pkt_drops++; + goto done; + } + /* Save the WQE so we can reuse it in case of an error. */ + qp->s_rdma_read_sge = qp->r_sge; + qp->r_rcv_len = 0; + if (opcode == OP(SEND_ONLY)) + goto send_last; + else if (opcode == OP(SEND_ONLY_WITH_IMMEDIATE)) + goto send_last_imm; + /* FALLTHROUGH */ + case OP(SEND_MIDDLE): + /* Check for invalid length PMTU or posted rwqe len. */ + if (unlikely(tlen != (hdrsize + pmtu + 4))) { + qp->r_flags |= IPATH_R_REUSE_SGE; + dev->n_pkt_drops++; + goto done; + } + qp->r_rcv_len += pmtu; + if (unlikely(qp->r_rcv_len > qp->r_len)) { + qp->r_flags |= IPATH_R_REUSE_SGE; + dev->n_pkt_drops++; + goto done; + } + ipath_copy_sge(&qp->r_sge, data, pmtu); + break; + + case OP(SEND_LAST_WITH_IMMEDIATE): + send_last_imm: + if (header_in_data) { + wc.ex.imm_data = *(__be32 *) data; + data += sizeof(__be32); + } else { + /* Immediate data comes after BTH */ + wc.ex.imm_data = ohdr->u.imm_data; + } + hdrsize += 4; + wc.wc_flags = IB_WC_WITH_IMM; + /* FALLTHROUGH */ + case OP(SEND_LAST): + send_last: + /* Get the number of bytes the message was padded by. */ + pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3; + /* Check for invalid length. */ + /* XXX LAST len should be >= 1 */ + if (unlikely(tlen < (hdrsize + pad + 4))) { + qp->r_flags |= IPATH_R_REUSE_SGE; + dev->n_pkt_drops++; + goto done; + } + /* Don't count the CRC. */ + tlen -= (hdrsize + pad + 4); + wc.byte_len = tlen + qp->r_rcv_len; + if (unlikely(wc.byte_len > qp->r_len)) { + qp->r_flags |= IPATH_R_REUSE_SGE; + dev->n_pkt_drops++; + goto done; + } + wc.opcode = IB_WC_RECV; + last_imm: + ipath_copy_sge(&qp->r_sge, data, tlen); + wc.wr_id = qp->r_wr_id; + wc.status = IB_WC_SUCCESS; + wc.qp = &qp->ibqp; + wc.src_qp = qp->remote_qpn; + wc.slid = qp->remote_ah_attr.dlid; + wc.sl = qp->remote_ah_attr.sl; + /* Signal completion event if the solicited bit is set. */ + ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, + (ohdr->bth[0] & + cpu_to_be32(1 << 23)) != 0); + break; + + case OP(RDMA_WRITE_FIRST): + case OP(RDMA_WRITE_ONLY): + case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE): /* consume RWQE */ + rdma_first: + /* RETH comes after BTH */ + if (!header_in_data) + reth = &ohdr->u.rc.reth; + else { + reth = (struct ib_reth *)data; + data += sizeof(*reth); + } + hdrsize += sizeof(*reth); + qp->r_len = be32_to_cpu(reth->length); + qp->r_rcv_len = 0; + if (qp->r_len != 0) { + u32 rkey = be32_to_cpu(reth->rkey); + u64 vaddr = be64_to_cpu(reth->vaddr); + int ok; + + /* Check rkey */ + ok = ipath_rkey_ok(qp, &qp->r_sge, qp->r_len, + vaddr, rkey, + IB_ACCESS_REMOTE_WRITE); + if (unlikely(!ok)) { + dev->n_pkt_drops++; + goto done; + } + } else { + qp->r_sge.sg_list = NULL; + qp->r_sge.sge.mr = NULL; + qp->r_sge.sge.vaddr = NULL; + qp->r_sge.sge.length = 0; + qp->r_sge.sge.sge_length = 0; + } + if (unlikely(!(qp->qp_access_flags & + IB_ACCESS_REMOTE_WRITE))) { + dev->n_pkt_drops++; + goto done; + } + if (opcode == OP(RDMA_WRITE_ONLY)) + goto rdma_last; + else if (opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE)) + goto rdma_last_imm; + /* FALLTHROUGH */ + case OP(RDMA_WRITE_MIDDLE): + /* Check for invalid length PMTU or posted rwqe len. */ + if (unlikely(tlen != (hdrsize + pmtu + 4))) { + dev->n_pkt_drops++; + goto done; + } + qp->r_rcv_len += pmtu; + if (unlikely(qp->r_rcv_len > qp->r_len)) { + dev->n_pkt_drops++; + goto done; + } + ipath_copy_sge(&qp->r_sge, data, pmtu); + break; + + case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE): + rdma_last_imm: + if (header_in_data) { + wc.ex.imm_data = *(__be32 *) data; + data += sizeof(__be32); + } else { + /* Immediate data comes after BTH */ + wc.ex.imm_data = ohdr->u.imm_data; + } + hdrsize += 4; + wc.wc_flags = IB_WC_WITH_IMM; + + /* Get the number of bytes the message was padded by. */ + pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3; + /* Check for invalid length. */ + /* XXX LAST len should be >= 1 */ + if (unlikely(tlen < (hdrsize + pad + 4))) { + dev->n_pkt_drops++; + goto done; + } + /* Don't count the CRC. */ + tlen -= (hdrsize + pad + 4); + if (unlikely(tlen + qp->r_rcv_len != qp->r_len)) { + dev->n_pkt_drops++; + goto done; + } + if (qp->r_flags & IPATH_R_REUSE_SGE) + qp->r_flags &= ~IPATH_R_REUSE_SGE; + else if (!ipath_get_rwqe(qp, 1)) { + dev->n_pkt_drops++; + goto done; + } + wc.byte_len = qp->r_len; + wc.opcode = IB_WC_RECV_RDMA_WITH_IMM; + goto last_imm; + + case OP(RDMA_WRITE_LAST): + rdma_last: + /* Get the number of bytes the message was padded by. */ + pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3; + /* Check for invalid length. */ + /* XXX LAST len should be >= 1 */ + if (unlikely(tlen < (hdrsize + pad + 4))) { + dev->n_pkt_drops++; + goto done; + } + /* Don't count the CRC. */ + tlen -= (hdrsize + pad + 4); + if (unlikely(tlen + qp->r_rcv_len != qp->r_len)) { + dev->n_pkt_drops++; + goto done; + } + ipath_copy_sge(&qp->r_sge, data, tlen); + break; + + default: + /* Drop packet for unknown opcodes. */ + dev->n_pkt_drops++; + goto done; + } + qp->r_psn++; + qp->r_state = opcode; +done: + return; +} diff --git a/drivers/infiniband/hw/ipath/ipath_ud.c b/drivers/infiniband/hw/ipath/ipath_ud.c new file mode 100644 index 0000000..e8a2a91 --- /dev/null +++ b/drivers/infiniband/hw/ipath/ipath_ud.c @@ -0,0 +1,580 @@ +/* + * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved. + * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include + +#include "ipath_verbs.h" +#include "ipath_kernel.h" + +/** + * ipath_ud_loopback - handle send on loopback QPs + * @sqp: the sending QP + * @swqe: the send work request + * + * This is called from ipath_make_ud_req() to forward a WQE addressed + * to the same HCA. + * Note that the receive interrupt handler may be calling ipath_ud_rcv() + * while this is being called. + */ +static void ipath_ud_loopback(struct ipath_qp *sqp, struct ipath_swqe *swqe) +{ + struct ipath_ibdev *dev = to_idev(sqp->ibqp.device); + struct ipath_qp *qp; + struct ib_ah_attr *ah_attr; + unsigned long flags; + struct ipath_rq *rq; + struct ipath_srq *srq; + struct ipath_sge_state rsge; + struct ipath_sge *sge; + struct ipath_rwq *wq; + struct ipath_rwqe *wqe; + void (*handler)(struct ib_event *, void *); + struct ib_wc wc; + u32 tail; + u32 rlen; + u32 length; + + qp = ipath_lookup_qpn(&dev->qp_table, swqe->wr.wr.ud.remote_qpn); + if (!qp || !(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK)) { + dev->n_pkt_drops++; + goto done; + } + + /* + * Check that the qkey matches (except for QP0, see 9.6.1.4.1). + * Qkeys with the high order bit set mean use the + * qkey from the QP context instead of the WR (see 10.2.5). + */ + if (unlikely(qp->ibqp.qp_num && + ((int) swqe->wr.wr.ud.remote_qkey < 0 ? + sqp->qkey : swqe->wr.wr.ud.remote_qkey) != qp->qkey)) { + /* XXX OK to lose a count once in a while. */ + dev->qkey_violations++; + dev->n_pkt_drops++; + goto drop; + } + + /* + * A GRH is expected to precede the data even if not + * present on the wire. + */ + length = swqe->length; + memset(&wc, 0, sizeof wc); + wc.byte_len = length + sizeof(struct ib_grh); + + if (swqe->wr.opcode == IB_WR_SEND_WITH_IMM) { + wc.wc_flags = IB_WC_WITH_IMM; + wc.ex.imm_data = swqe->wr.ex.imm_data; + } + + /* + * This would be a lot simpler if we could call ipath_get_rwqe() + * but that uses state that the receive interrupt handler uses + * so we would need to lock out receive interrupts while doing + * local loopback. + */ + if (qp->ibqp.srq) { + srq = to_isrq(qp->ibqp.srq); + handler = srq->ibsrq.event_handler; + rq = &srq->rq; + } else { + srq = NULL; + handler = NULL; + rq = &qp->r_rq; + } + + /* + * Get the next work request entry to find where to put the data. + * Note that it is safe to drop the lock after changing rq->tail + * since ipath_post_receive() won't fill the empty slot. + */ + spin_lock_irqsave(&rq->lock, flags); + wq = rq->wq; + tail = wq->tail; + /* Validate tail before using it since it is user writable. */ + if (tail >= rq->size) + tail = 0; + if (unlikely(tail == wq->head)) { + spin_unlock_irqrestore(&rq->lock, flags); + dev->n_pkt_drops++; + goto drop; + } + wqe = get_rwqe_ptr(rq, tail); + rsge.sg_list = qp->r_ud_sg_list; + if (!ipath_init_sge(qp, wqe, &rlen, &rsge)) { + spin_unlock_irqrestore(&rq->lock, flags); + dev->n_pkt_drops++; + goto drop; + } + /* Silently drop packets which are too big. */ + if (wc.byte_len > rlen) { + spin_unlock_irqrestore(&rq->lock, flags); + dev->n_pkt_drops++; + goto drop; + } + if (++tail >= rq->size) + tail = 0; + wq->tail = tail; + wc.wr_id = wqe->wr_id; + if (handler) { + u32 n; + + /* + * validate head pointer value and compute + * the number of remaining WQEs. + */ + n = wq->head; + if (n >= rq->size) + n = 0; + if (n < tail) + n += rq->size - tail; + else + n -= tail; + if (n < srq->limit) { + struct ib_event ev; + + srq->limit = 0; + spin_unlock_irqrestore(&rq->lock, flags); + ev.device = qp->ibqp.device; + ev.element.srq = qp->ibqp.srq; + ev.event = IB_EVENT_SRQ_LIMIT_REACHED; + handler(&ev, srq->ibsrq.srq_context); + } else + spin_unlock_irqrestore(&rq->lock, flags); + } else + spin_unlock_irqrestore(&rq->lock, flags); + + ah_attr = &to_iah(swqe->wr.wr.ud.ah)->attr; + if (ah_attr->ah_flags & IB_AH_GRH) { + ipath_copy_sge(&rsge, &ah_attr->grh, sizeof(struct ib_grh)); + wc.wc_flags |= IB_WC_GRH; + } else + ipath_skip_sge(&rsge, sizeof(struct ib_grh)); + sge = swqe->sg_list; + while (length) { + u32 len = sge->length; + + if (len > length) + len = length; + if (len > sge->sge_length) + len = sge->sge_length; + BUG_ON(len == 0); + ipath_copy_sge(&rsge, sge->vaddr, len); + sge->vaddr += len; + sge->length -= len; + sge->sge_length -= len; + if (sge->sge_length == 0) { + if (--swqe->wr.num_sge) + sge++; + } else if (sge->length == 0 && sge->mr != NULL) { + if (++sge->n >= IPATH_SEGSZ) { + if (++sge->m >= sge->mr->mapsz) + break; + sge->n = 0; + } + sge->vaddr = + sge->mr->map[sge->m]->segs[sge->n].vaddr; + sge->length = + sge->mr->map[sge->m]->segs[sge->n].length; + } + length -= len; + } + wc.status = IB_WC_SUCCESS; + wc.opcode = IB_WC_RECV; + wc.qp = &qp->ibqp; + wc.src_qp = sqp->ibqp.qp_num; + /* XXX do we know which pkey matched? Only needed for GSI. */ + wc.pkey_index = 0; + wc.slid = dev->dd->ipath_lid | + (ah_attr->src_path_bits & + ((1 << dev->dd->ipath_lmc) - 1)); + wc.sl = ah_attr->sl; + wc.dlid_path_bits = + ah_attr->dlid & ((1 << dev->dd->ipath_lmc) - 1); + wc.port_num = 1; + /* Signal completion event if the solicited bit is set. */ + ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, + swqe->wr.send_flags & IB_SEND_SOLICITED); +drop: + if (atomic_dec_and_test(&qp->refcount)) + wake_up(&qp->wait); +done:; +} + +/** + * ipath_make_ud_req - construct a UD request packet + * @qp: the QP + * + * Return 1 if constructed; otherwise, return 0. + */ +int ipath_make_ud_req(struct ipath_qp *qp) +{ + struct ipath_ibdev *dev = to_idev(qp->ibqp.device); + struct ipath_other_headers *ohdr; + struct ib_ah_attr *ah_attr; + struct ipath_swqe *wqe; + unsigned long flags; + u32 nwords; + u32 extra_bytes; + u32 bth0; + u16 lrh0; + u16 lid; + int ret = 0; + int next_cur; + + spin_lock_irqsave(&qp->s_lock, flags); + + if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_NEXT_SEND_OK)) { + if (!(ib_ipath_state_ops[qp->state] & IPATH_FLUSH_SEND)) + goto bail; + /* We are in the error state, flush the work request. */ + if (qp->s_last == qp->s_head) + goto bail; + /* If DMAs are in progress, we can't flush immediately. */ + if (atomic_read(&qp->s_dma_busy)) { + qp->s_flags |= IPATH_S_WAIT_DMA; + goto bail; + } + wqe = get_swqe_ptr(qp, qp->s_last); + ipath_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR); + goto done; + } + + if (qp->s_cur == qp->s_head) + goto bail; + + wqe = get_swqe_ptr(qp, qp->s_cur); + next_cur = qp->s_cur + 1; + if (next_cur >= qp->s_size) + next_cur = 0; + + /* Construct the header. */ + ah_attr = &to_iah(wqe->wr.wr.ud.ah)->attr; + if (ah_attr->dlid >= IPATH_MULTICAST_LID_BASE) { + if (ah_attr->dlid != IPATH_PERMISSIVE_LID) + dev->n_multicast_xmit++; + else + dev->n_unicast_xmit++; + } else { + dev->n_unicast_xmit++; + lid = ah_attr->dlid & ~((1 << dev->dd->ipath_lmc) - 1); + if (unlikely(lid == dev->dd->ipath_lid)) { + /* + * If DMAs are in progress, we can't generate + * a completion for the loopback packet since + * it would be out of order. + * XXX Instead of waiting, we could queue a + * zero length descriptor so we get a callback. + */ + if (atomic_read(&qp->s_dma_busy)) { + qp->s_flags |= IPATH_S_WAIT_DMA; + goto bail; + } + qp->s_cur = next_cur; + spin_unlock_irqrestore(&qp->s_lock, flags); + ipath_ud_loopback(qp, wqe); + spin_lock_irqsave(&qp->s_lock, flags); + ipath_send_complete(qp, wqe, IB_WC_SUCCESS); + goto done; + } + } + + qp->s_cur = next_cur; + extra_bytes = -wqe->length & 3; + nwords = (wqe->length + extra_bytes) >> 2; + + /* header size in 32-bit words LRH+BTH+DETH = (8+12+8)/4. */ + qp->s_hdrwords = 7; + qp->s_cur_size = wqe->length; + qp->s_cur_sge = &qp->s_sge; + qp->s_dmult = ah_attr->static_rate; + qp->s_wqe = wqe; + qp->s_sge.sge = wqe->sg_list[0]; + qp->s_sge.sg_list = wqe->sg_list + 1; + qp->s_sge.num_sge = wqe->wr.num_sge; + + if (ah_attr->ah_flags & IB_AH_GRH) { + /* Header size in 32-bit words. */ + qp->s_hdrwords += ipath_make_grh(dev, &qp->s_hdr.u.l.grh, + &ah_attr->grh, + qp->s_hdrwords, nwords); + lrh0 = IPATH_LRH_GRH; + ohdr = &qp->s_hdr.u.l.oth; + /* + * Don't worry about sending to locally attached multicast + * QPs. It is unspecified by the spec. what happens. + */ + } else { + /* Header size in 32-bit words. */ + lrh0 = IPATH_LRH_BTH; + ohdr = &qp->s_hdr.u.oth; + } + if (wqe->wr.opcode == IB_WR_SEND_WITH_IMM) { + qp->s_hdrwords++; + ohdr->u.ud.imm_data = wqe->wr.ex.imm_data; + bth0 = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE << 24; + } else + bth0 = IB_OPCODE_UD_SEND_ONLY << 24; + lrh0 |= ah_attr->sl << 4; + if (qp->ibqp.qp_type == IB_QPT_SMI) + lrh0 |= 0xF000; /* Set VL (see ch. 13.5.3.1) */ + qp->s_hdr.lrh[0] = cpu_to_be16(lrh0); + qp->s_hdr.lrh[1] = cpu_to_be16(ah_attr->dlid); /* DEST LID */ + qp->s_hdr.lrh[2] = cpu_to_be16(qp->s_hdrwords + nwords + + SIZE_OF_CRC); + lid = dev->dd->ipath_lid; + if (lid) { + lid |= ah_attr->src_path_bits & + ((1 << dev->dd->ipath_lmc) - 1); + qp->s_hdr.lrh[3] = cpu_to_be16(lid); + } else + qp->s_hdr.lrh[3] = IB_LID_PERMISSIVE; + if (wqe->wr.send_flags & IB_SEND_SOLICITED) + bth0 |= 1 << 23; + bth0 |= extra_bytes << 20; + bth0 |= qp->ibqp.qp_type == IB_QPT_SMI ? IPATH_DEFAULT_P_KEY : + ipath_get_pkey(dev->dd, qp->s_pkey_index); + ohdr->bth[0] = cpu_to_be32(bth0); + /* + * Use the multicast QP if the destination LID is a multicast LID. + */ + ohdr->bth[1] = ah_attr->dlid >= IPATH_MULTICAST_LID_BASE && + ah_attr->dlid != IPATH_PERMISSIVE_LID ? + cpu_to_be32(IPATH_MULTICAST_QPN) : + cpu_to_be32(wqe->wr.wr.ud.remote_qpn); + ohdr->bth[2] = cpu_to_be32(qp->s_next_psn++ & IPATH_PSN_MASK); + /* + * Qkeys with the high order bit set mean use the + * qkey from the QP context instead of the WR (see 10.2.5). + */ + ohdr->u.ud.deth[0] = cpu_to_be32((int)wqe->wr.wr.ud.remote_qkey < 0 ? + qp->qkey : wqe->wr.wr.ud.remote_qkey); + ohdr->u.ud.deth[1] = cpu_to_be32(qp->ibqp.qp_num); + +done: + ret = 1; + goto unlock; + +bail: + qp->s_flags &= ~IPATH_S_BUSY; +unlock: + spin_unlock_irqrestore(&qp->s_lock, flags); + return ret; +} + +/** + * ipath_ud_rcv - receive an incoming UD packet + * @dev: the device the packet came in on + * @hdr: the packet header + * @has_grh: true if the packet has a GRH + * @data: the packet data + * @tlen: the packet length + * @qp: the QP the packet came on + * + * This is called from ipath_qp_rcv() to process an incoming UD packet + * for the given QP. + * Called at interrupt level. + */ +void ipath_ud_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, + int has_grh, void *data, u32 tlen, struct ipath_qp *qp) +{ + struct ipath_other_headers *ohdr; + int opcode; + u32 hdrsize; + u32 pad; + struct ib_wc wc; + u32 qkey; + u32 src_qp; + u16 dlid; + int header_in_data; + + /* Check for GRH */ + if (!has_grh) { + ohdr = &hdr->u.oth; + hdrsize = 8 + 12 + 8; /* LRH + BTH + DETH */ + qkey = be32_to_cpu(ohdr->u.ud.deth[0]); + src_qp = be32_to_cpu(ohdr->u.ud.deth[1]); + header_in_data = 0; + } else { + ohdr = &hdr->u.l.oth; + hdrsize = 8 + 40 + 12 + 8; /* LRH + GRH + BTH + DETH */ + /* + * The header with GRH is 68 bytes and the core driver sets + * the eager header buffer size to 56 bytes so the last 12 + * bytes of the IB header is in the data buffer. + */ + header_in_data = dev->dd->ipath_rcvhdrentsize == 16; + if (header_in_data) { + qkey = be32_to_cpu(((__be32 *) data)[1]); + src_qp = be32_to_cpu(((__be32 *) data)[2]); + data += 12; + } else { + qkey = be32_to_cpu(ohdr->u.ud.deth[0]); + src_qp = be32_to_cpu(ohdr->u.ud.deth[1]); + } + } + src_qp &= IPATH_QPN_MASK; + + /* + * Check that the permissive LID is only used on QP0 + * and the QKEY matches (see 9.6.1.4.1 and 9.6.1.5.1). + */ + if (qp->ibqp.qp_num) { + if (unlikely(hdr->lrh[1] == IB_LID_PERMISSIVE || + hdr->lrh[3] == IB_LID_PERMISSIVE)) { + dev->n_pkt_drops++; + goto bail; + } + if (unlikely(qkey != qp->qkey)) { + /* XXX OK to lose a count once in a while. */ + dev->qkey_violations++; + dev->n_pkt_drops++; + goto bail; + } + } else if (hdr->lrh[1] == IB_LID_PERMISSIVE || + hdr->lrh[3] == IB_LID_PERMISSIVE) { + struct ib_smp *smp = (struct ib_smp *) data; + + if (smp->mgmt_class != IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) { + dev->n_pkt_drops++; + goto bail; + } + } + + /* + * The opcode is in the low byte when its in network order + * (top byte when in host order). + */ + opcode = be32_to_cpu(ohdr->bth[0]) >> 24; + if (qp->ibqp.qp_num > 1 && + opcode == IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE) { + if (header_in_data) { + wc.ex.imm_data = *(__be32 *) data; + data += sizeof(__be32); + } else + wc.ex.imm_data = ohdr->u.ud.imm_data; + wc.wc_flags = IB_WC_WITH_IMM; + hdrsize += sizeof(u32); + } else if (opcode == IB_OPCODE_UD_SEND_ONLY) { + wc.ex.imm_data = 0; + wc.wc_flags = 0; + } else { + dev->n_pkt_drops++; + goto bail; + } + + /* Get the number of bytes the message was padded by. */ + pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3; + if (unlikely(tlen < (hdrsize + pad + 4))) { + /* Drop incomplete packets. */ + dev->n_pkt_drops++; + goto bail; + } + tlen -= hdrsize + pad + 4; + + /* Drop invalid MAD packets (see 13.5.3.1). */ + if (unlikely((qp->ibqp.qp_num == 0 && + (tlen != 256 || + (be16_to_cpu(hdr->lrh[0]) >> 12) != 15)) || + (qp->ibqp.qp_num == 1 && + (tlen != 256 || + (be16_to_cpu(hdr->lrh[0]) >> 12) == 15)))) { + dev->n_pkt_drops++; + goto bail; + } + + /* + * A GRH is expected to precede the data even if not + * present on the wire. + */ + wc.byte_len = tlen + sizeof(struct ib_grh); + + /* + * Get the next work request entry to find where to put the data. + */ + if (qp->r_flags & IPATH_R_REUSE_SGE) + qp->r_flags &= ~IPATH_R_REUSE_SGE; + else if (!ipath_get_rwqe(qp, 0)) { + /* + * Count VL15 packets dropped due to no receive buffer. + * Otherwise, count them as buffer overruns since usually, + * the HW will be able to receive packets even if there are + * no QPs with posted receive buffers. + */ + if (qp->ibqp.qp_num == 0) + dev->n_vl15_dropped++; + else + dev->rcv_errors++; + goto bail; + } + /* Silently drop packets which are too big. */ + if (wc.byte_len > qp->r_len) { + qp->r_flags |= IPATH_R_REUSE_SGE; + dev->n_pkt_drops++; + goto bail; + } + if (has_grh) { + ipath_copy_sge(&qp->r_sge, &hdr->u.l.grh, + sizeof(struct ib_grh)); + wc.wc_flags |= IB_WC_GRH; + } else + ipath_skip_sge(&qp->r_sge, sizeof(struct ib_grh)); + ipath_copy_sge(&qp->r_sge, data, + wc.byte_len - sizeof(struct ib_grh)); + if (!test_and_clear_bit(IPATH_R_WRID_VALID, &qp->r_aflags)) + goto bail; + wc.wr_id = qp->r_wr_id; + wc.status = IB_WC_SUCCESS; + wc.opcode = IB_WC_RECV; + wc.vendor_err = 0; + wc.qp = &qp->ibqp; + wc.src_qp = src_qp; + /* XXX do we know which pkey matched? Only needed for GSI. */ + wc.pkey_index = 0; + wc.slid = be16_to_cpu(hdr->lrh[3]); + wc.sl = (be16_to_cpu(hdr->lrh[0]) >> 4) & 0xF; + dlid = be16_to_cpu(hdr->lrh[1]); + /* + * Save the LMC lower bits if the destination LID is a unicast LID. + */ + wc.dlid_path_bits = dlid >= IPATH_MULTICAST_LID_BASE ? 0 : + dlid & ((1 << dev->dd->ipath_lmc) - 1); + wc.port_num = 1; + /* Signal completion event if the solicited bit is set. */ + ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, + (ohdr->bth[0] & + cpu_to_be32(1 << 23)) != 0); + +bail:; +} diff --git a/drivers/infiniband/hw/ipath/ipath_user_pages.c b/drivers/infiniband/hw/ipath/ipath_user_pages.c new file mode 100644 index 0000000..1da1252 --- /dev/null +++ b/drivers/infiniband/hw/ipath/ipath_user_pages.c @@ -0,0 +1,229 @@ +/* + * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved. + * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include + +#include "ipath_kernel.h" + +static void __ipath_release_user_pages(struct page **p, size_t num_pages, + int dirty) +{ + size_t i; + + for (i = 0; i < num_pages; i++) { + ipath_cdbg(MM, "%lu/%lu put_page %p\n", (unsigned long) i, + (unsigned long) num_pages, p[i]); + if (dirty) + set_page_dirty_lock(p[i]); + put_page(p[i]); + } +} + +/* call with current->mm->mmap_sem held */ +static int __ipath_get_user_pages(unsigned long start_page, size_t num_pages, + struct page **p) +{ + unsigned long lock_limit; + size_t got; + int ret; + + lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + + if (num_pages > lock_limit) { + ret = -ENOMEM; + goto bail; + } + + ipath_cdbg(VERBOSE, "pin %lx pages from vaddr %lx\n", + (unsigned long) num_pages, start_page); + + for (got = 0; got < num_pages; got += ret) { + ret = get_user_pages(current, current->mm, + start_page + got * PAGE_SIZE, + num_pages - got, 1, 1, + p + got, NULL); + if (ret < 0) + goto bail_release; + } + + current->mm->pinned_vm += num_pages; + + ret = 0; + goto bail; + +bail_release: + __ipath_release_user_pages(p, got, 0); +bail: + return ret; +} + +/** + * ipath_map_page - a safety wrapper around pci_map_page() + * + * A dma_addr of all 0's is interpreted by the chip as "disabled". + * Unfortunately, it can also be a valid dma_addr returned on some + * architectures. + * + * The powerpc iommu assigns dma_addrs in ascending order, so we don't + * have to bother with retries or mapping a dummy page to insure we + * don't just get the same mapping again. + * + * I'm sure we won't be so lucky with other iommu's, so FIXME. + */ +dma_addr_t ipath_map_page(struct pci_dev *hwdev, struct page *page, + unsigned long offset, size_t size, int direction) +{ + dma_addr_t phys; + + phys = pci_map_page(hwdev, page, offset, size, direction); + + if (phys == 0) { + pci_unmap_page(hwdev, phys, size, direction); + phys = pci_map_page(hwdev, page, offset, size, direction); + /* + * FIXME: If we get 0 again, we should keep this page, + * map another, then free the 0 page. + */ + } + + return phys; +} + +/** + * ipath_map_single - a safety wrapper around pci_map_single() + * + * Same idea as ipath_map_page(). + */ +dma_addr_t ipath_map_single(struct pci_dev *hwdev, void *ptr, size_t size, + int direction) +{ + dma_addr_t phys; + + phys = pci_map_single(hwdev, ptr, size, direction); + + if (phys == 0) { + pci_unmap_single(hwdev, phys, size, direction); + phys = pci_map_single(hwdev, ptr, size, direction); + /* + * FIXME: If we get 0 again, we should keep this page, + * map another, then free the 0 page. + */ + } + + return phys; +} + +/** + * ipath_get_user_pages - lock user pages into memory + * @start_page: the start page + * @num_pages: the number of pages + * @p: the output page structures + * + * This function takes a given start page (page aligned user virtual + * address) and pins it and the following specified number of pages. For + * now, num_pages is always 1, but that will probably change at some point + * (because caller is doing expected sends on a single virtually contiguous + * buffer, so we can do all pages at once). + */ +int ipath_get_user_pages(unsigned long start_page, size_t num_pages, + struct page **p) +{ + int ret; + + down_write(¤t->mm->mmap_sem); + + ret = __ipath_get_user_pages(start_page, num_pages, p); + + up_write(¤t->mm->mmap_sem); + + return ret; +} + +void ipath_release_user_pages(struct page **p, size_t num_pages) +{ + down_write(¤t->mm->mmap_sem); + + __ipath_release_user_pages(p, num_pages, 1); + + current->mm->pinned_vm -= num_pages; + + up_write(¤t->mm->mmap_sem); +} + +struct ipath_user_pages_work { + struct work_struct work; + struct mm_struct *mm; + unsigned long num_pages; +}; + +static void user_pages_account(struct work_struct *_work) +{ + struct ipath_user_pages_work *work = + container_of(_work, struct ipath_user_pages_work, work); + + down_write(&work->mm->mmap_sem); + work->mm->pinned_vm -= work->num_pages; + up_write(&work->mm->mmap_sem); + mmput(work->mm); + kfree(work); +} + +void ipath_release_user_pages_on_close(struct page **p, size_t num_pages) +{ + struct ipath_user_pages_work *work; + struct mm_struct *mm; + + __ipath_release_user_pages(p, num_pages, 1); + + mm = get_task_mm(current); + if (!mm) + return; + + work = kmalloc(sizeof(*work), GFP_KERNEL); + if (!work) + goto bail_mm; + + INIT_WORK(&work->work, user_pages_account); + work->mm = mm; + work->num_pages = num_pages; + + queue_work(ib_wq, &work->work); + return; + +bail_mm: + mmput(mm); + return; +} diff --git a/drivers/infiniband/hw/ipath/ipath_user_sdma.c b/drivers/infiniband/hw/ipath/ipath_user_sdma.c new file mode 100644 index 0000000..cc04b7b --- /dev/null +++ b/drivers/infiniband/hw/ipath/ipath_user_sdma.c @@ -0,0 +1,875 @@ +/* + * Copyright (c) 2007, 2008 QLogic Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ipath_kernel.h" +#include "ipath_user_sdma.h" + +/* minimum size of header */ +#define IPATH_USER_SDMA_MIN_HEADER_LENGTH 64 +/* expected size of headers (for dma_pool) */ +#define IPATH_USER_SDMA_EXP_HEADER_LENGTH 64 +/* length mask in PBC (lower 11 bits) */ +#define IPATH_PBC_LENGTH_MASK ((1 << 11) - 1) + +struct ipath_user_sdma_pkt { + u8 naddr; /* dimension of addr (1..3) ... */ + u32 counter; /* sdma pkts queued counter for this entry */ + u64 added; /* global descq number of entries */ + + struct { + u32 offset; /* offset for kvaddr, addr */ + u32 length; /* length in page */ + u8 put_page; /* should we put_page? */ + u8 dma_mapped; /* is page dma_mapped? */ + struct page *page; /* may be NULL (coherent mem) */ + void *kvaddr; /* FIXME: only for pio hack */ + dma_addr_t addr; + } addr[4]; /* max pages, any more and we coalesce */ + struct list_head list; /* list element */ +}; + +struct ipath_user_sdma_queue { + /* + * pkts sent to dma engine are queued on this + * list head. the type of the elements of this + * list are struct ipath_user_sdma_pkt... + */ + struct list_head sent; + + /* headers with expected length are allocated from here... */ + char header_cache_name[64]; + struct dma_pool *header_cache; + + /* packets are allocated from the slab cache... */ + char pkt_slab_name[64]; + struct kmem_cache *pkt_slab; + + /* as packets go on the queued queue, they are counted... */ + u32 counter; + u32 sent_counter; + + /* dma page table */ + struct rb_root dma_pages_root; + + /* protect everything above... */ + struct mutex lock; +}; + +struct ipath_user_sdma_queue * +ipath_user_sdma_queue_create(struct device *dev, int unit, int port, int sport) +{ + struct ipath_user_sdma_queue *pq = + kmalloc(sizeof(struct ipath_user_sdma_queue), GFP_KERNEL); + + if (!pq) + goto done; + + pq->counter = 0; + pq->sent_counter = 0; + INIT_LIST_HEAD(&pq->sent); + + mutex_init(&pq->lock); + + snprintf(pq->pkt_slab_name, sizeof(pq->pkt_slab_name), + "ipath-user-sdma-pkts-%u-%02u.%02u", unit, port, sport); + pq->pkt_slab = kmem_cache_create(pq->pkt_slab_name, + sizeof(struct ipath_user_sdma_pkt), + 0, 0, NULL); + + if (!pq->pkt_slab) + goto err_kfree; + + snprintf(pq->header_cache_name, sizeof(pq->header_cache_name), + "ipath-user-sdma-headers-%u-%02u.%02u", unit, port, sport); + pq->header_cache = dma_pool_create(pq->header_cache_name, + dev, + IPATH_USER_SDMA_EXP_HEADER_LENGTH, + 4, 0); + if (!pq->header_cache) + goto err_slab; + + pq->dma_pages_root = RB_ROOT; + + goto done; + +err_slab: + kmem_cache_destroy(pq->pkt_slab); +err_kfree: + kfree(pq); + pq = NULL; + +done: + return pq; +} + +static void ipath_user_sdma_init_frag(struct ipath_user_sdma_pkt *pkt, + int i, size_t offset, size_t len, + int put_page, int dma_mapped, + struct page *page, + void *kvaddr, dma_addr_t dma_addr) +{ + pkt->addr[i].offset = offset; + pkt->addr[i].length = len; + pkt->addr[i].put_page = put_page; + pkt->addr[i].dma_mapped = dma_mapped; + pkt->addr[i].page = page; + pkt->addr[i].kvaddr = kvaddr; + pkt->addr[i].addr = dma_addr; +} + +static void ipath_user_sdma_init_header(struct ipath_user_sdma_pkt *pkt, + u32 counter, size_t offset, + size_t len, int dma_mapped, + struct page *page, + void *kvaddr, dma_addr_t dma_addr) +{ + pkt->naddr = 1; + pkt->counter = counter; + ipath_user_sdma_init_frag(pkt, 0, offset, len, 0, dma_mapped, page, + kvaddr, dma_addr); +} + +/* we've too many pages in the iovec, coalesce to a single page */ +static int ipath_user_sdma_coalesce(const struct ipath_devdata *dd, + struct ipath_user_sdma_pkt *pkt, + const struct iovec *iov, + unsigned long niov) { + int ret = 0; + struct page *page = alloc_page(GFP_KERNEL); + void *mpage_save; + char *mpage; + int i; + int len = 0; + dma_addr_t dma_addr; + + if (!page) { + ret = -ENOMEM; + goto done; + } + + mpage = kmap(page); + mpage_save = mpage; + for (i = 0; i < niov; i++) { + int cfur; + + cfur = copy_from_user(mpage, + iov[i].iov_base, iov[i].iov_len); + if (cfur) { + ret = -EFAULT; + goto free_unmap; + } + + mpage += iov[i].iov_len; + len += iov[i].iov_len; + } + + dma_addr = dma_map_page(&dd->pcidev->dev, page, 0, len, + DMA_TO_DEVICE); + if (dma_mapping_error(&dd->pcidev->dev, dma_addr)) { + ret = -ENOMEM; + goto free_unmap; + } + + ipath_user_sdma_init_frag(pkt, 1, 0, len, 0, 1, page, mpage_save, + dma_addr); + pkt->naddr = 2; + + goto done; + +free_unmap: + kunmap(page); + __free_page(page); +done: + return ret; +} + +/* how many pages in this iovec element? */ +static int ipath_user_sdma_num_pages(const struct iovec *iov) +{ + const unsigned long addr = (unsigned long) iov->iov_base; + const unsigned long len = iov->iov_len; + const unsigned long spage = addr & PAGE_MASK; + const unsigned long epage = (addr + len - 1) & PAGE_MASK; + + return 1 + ((epage - spage) >> PAGE_SHIFT); +} + +/* truncate length to page boundary */ +static int ipath_user_sdma_page_length(unsigned long addr, unsigned long len) +{ + const unsigned long offset = addr & ~PAGE_MASK; + + return ((offset + len) > PAGE_SIZE) ? (PAGE_SIZE - offset) : len; +} + +static void ipath_user_sdma_free_pkt_frag(struct device *dev, + struct ipath_user_sdma_queue *pq, + struct ipath_user_sdma_pkt *pkt, + int frag) +{ + const int i = frag; + + if (pkt->addr[i].page) { + if (pkt->addr[i].dma_mapped) + dma_unmap_page(dev, + pkt->addr[i].addr, + pkt->addr[i].length, + DMA_TO_DEVICE); + + if (pkt->addr[i].kvaddr) + kunmap(pkt->addr[i].page); + + if (pkt->addr[i].put_page) + put_page(pkt->addr[i].page); + else + __free_page(pkt->addr[i].page); + } else if (pkt->addr[i].kvaddr) + /* free coherent mem from cache... */ + dma_pool_free(pq->header_cache, + pkt->addr[i].kvaddr, pkt->addr[i].addr); +} + +/* return number of pages pinned... */ +static int ipath_user_sdma_pin_pages(const struct ipath_devdata *dd, + struct ipath_user_sdma_pkt *pkt, + unsigned long addr, int tlen, int npages) +{ + struct page *pages[2]; + int j; + int ret; + + ret = get_user_pages_fast(addr, npages, 0, pages); + if (ret != npages) { + int i; + + for (i = 0; i < ret; i++) + put_page(pages[i]); + + ret = -ENOMEM; + goto done; + } + + for (j = 0; j < npages; j++) { + /* map the pages... */ + const int flen = + ipath_user_sdma_page_length(addr, tlen); + dma_addr_t dma_addr = + dma_map_page(&dd->pcidev->dev, + pages[j], 0, flen, DMA_TO_DEVICE); + unsigned long fofs = addr & ~PAGE_MASK; + + if (dma_mapping_error(&dd->pcidev->dev, dma_addr)) { + ret = -ENOMEM; + goto done; + } + + ipath_user_sdma_init_frag(pkt, pkt->naddr, fofs, flen, 1, 1, + pages[j], kmap(pages[j]), + dma_addr); + + pkt->naddr++; + addr += flen; + tlen -= flen; + } + +done: + return ret; +} + +static int ipath_user_sdma_pin_pkt(const struct ipath_devdata *dd, + struct ipath_user_sdma_queue *pq, + struct ipath_user_sdma_pkt *pkt, + const struct iovec *iov, + unsigned long niov) +{ + int ret = 0; + unsigned long idx; + + for (idx = 0; idx < niov; idx++) { + const int npages = ipath_user_sdma_num_pages(iov + idx); + const unsigned long addr = (unsigned long) iov[idx].iov_base; + + ret = ipath_user_sdma_pin_pages(dd, pkt, + addr, iov[idx].iov_len, + npages); + if (ret < 0) + goto free_pkt; + } + + goto done; + +free_pkt: + for (idx = 0; idx < pkt->naddr; idx++) + ipath_user_sdma_free_pkt_frag(&dd->pcidev->dev, pq, pkt, idx); + +done: + return ret; +} + +static int ipath_user_sdma_init_payload(const struct ipath_devdata *dd, + struct ipath_user_sdma_queue *pq, + struct ipath_user_sdma_pkt *pkt, + const struct iovec *iov, + unsigned long niov, int npages) +{ + int ret = 0; + + if (npages >= ARRAY_SIZE(pkt->addr)) + ret = ipath_user_sdma_coalesce(dd, pkt, iov, niov); + else + ret = ipath_user_sdma_pin_pkt(dd, pq, pkt, iov, niov); + + return ret; +} + +/* free a packet list -- return counter value of last packet */ +static void ipath_user_sdma_free_pkt_list(struct device *dev, + struct ipath_user_sdma_queue *pq, + struct list_head *list) +{ + struct ipath_user_sdma_pkt *pkt, *pkt_next; + + list_for_each_entry_safe(pkt, pkt_next, list, list) { + int i; + + for (i = 0; i < pkt->naddr; i++) + ipath_user_sdma_free_pkt_frag(dev, pq, pkt, i); + + kmem_cache_free(pq->pkt_slab, pkt); + } +} + +/* + * copy headers, coalesce etc -- pq->lock must be held + * + * we queue all the packets to list, returning the + * number of bytes total. list must be empty initially, + * as, if there is an error we clean it... + */ +static int ipath_user_sdma_queue_pkts(const struct ipath_devdata *dd, + struct ipath_user_sdma_queue *pq, + struct list_head *list, + const struct iovec *iov, + unsigned long niov, + int maxpkts) +{ + unsigned long idx = 0; + int ret = 0; + int npkts = 0; + struct page *page = NULL; + __le32 *pbc; + dma_addr_t dma_addr; + struct ipath_user_sdma_pkt *pkt = NULL; + size_t len; + size_t nw; + u32 counter = pq->counter; + int dma_mapped = 0; + + while (idx < niov && npkts < maxpkts) { + const unsigned long addr = (unsigned long) iov[idx].iov_base; + const unsigned long idx_save = idx; + unsigned pktnw; + unsigned pktnwc; + int nfrags = 0; + int npages = 0; + int cfur; + + dma_mapped = 0; + len = iov[idx].iov_len; + nw = len >> 2; + page = NULL; + + pkt = kmem_cache_alloc(pq->pkt_slab, GFP_KERNEL); + if (!pkt) { + ret = -ENOMEM; + goto free_list; + } + + if (len < IPATH_USER_SDMA_MIN_HEADER_LENGTH || + len > PAGE_SIZE || len & 3 || addr & 3) { + ret = -EINVAL; + goto free_pkt; + } + + if (len == IPATH_USER_SDMA_EXP_HEADER_LENGTH) + pbc = dma_pool_alloc(pq->header_cache, GFP_KERNEL, + &dma_addr); + else + pbc = NULL; + + if (!pbc) { + page = alloc_page(GFP_KERNEL); + if (!page) { + ret = -ENOMEM; + goto free_pkt; + } + pbc = kmap(page); + } + + cfur = copy_from_user(pbc, iov[idx].iov_base, len); + if (cfur) { + ret = -EFAULT; + goto free_pbc; + } + + /* + * this assignment is a bit strange. it's because the + * the pbc counts the number of 32 bit words in the full + * packet _except_ the first word of the pbc itself... + */ + pktnwc = nw - 1; + + /* + * pktnw computation yields the number of 32 bit words + * that the caller has indicated in the PBC. note that + * this is one less than the total number of words that + * goes to the send DMA engine as the first 32 bit word + * of the PBC itself is not counted. Armed with this count, + * we can verify that the packet is consistent with the + * iovec lengths. + */ + pktnw = le32_to_cpu(*pbc) & IPATH_PBC_LENGTH_MASK; + if (pktnw < pktnwc || pktnw > pktnwc + (PAGE_SIZE >> 2)) { + ret = -EINVAL; + goto free_pbc; + } + + + idx++; + while (pktnwc < pktnw && idx < niov) { + const size_t slen = iov[idx].iov_len; + const unsigned long faddr = + (unsigned long) iov[idx].iov_base; + + if (slen & 3 || faddr & 3 || !slen || + slen > PAGE_SIZE) { + ret = -EINVAL; + goto free_pbc; + } + + npages++; + if ((faddr & PAGE_MASK) != + ((faddr + slen - 1) & PAGE_MASK)) + npages++; + + pktnwc += slen >> 2; + idx++; + nfrags++; + } + + if (pktnwc != pktnw) { + ret = -EINVAL; + goto free_pbc; + } + + if (page) { + dma_addr = dma_map_page(&dd->pcidev->dev, + page, 0, len, DMA_TO_DEVICE); + if (dma_mapping_error(&dd->pcidev->dev, dma_addr)) { + ret = -ENOMEM; + goto free_pbc; + } + + dma_mapped = 1; + } + + ipath_user_sdma_init_header(pkt, counter, 0, len, dma_mapped, + page, pbc, dma_addr); + + if (nfrags) { + ret = ipath_user_sdma_init_payload(dd, pq, pkt, + iov + idx_save + 1, + nfrags, npages); + if (ret < 0) + goto free_pbc_dma; + } + + counter++; + npkts++; + + list_add_tail(&pkt->list, list); + } + + ret = idx; + goto done; + +free_pbc_dma: + if (dma_mapped) + dma_unmap_page(&dd->pcidev->dev, dma_addr, len, DMA_TO_DEVICE); +free_pbc: + if (page) { + kunmap(page); + __free_page(page); + } else + dma_pool_free(pq->header_cache, pbc, dma_addr); +free_pkt: + kmem_cache_free(pq->pkt_slab, pkt); +free_list: + ipath_user_sdma_free_pkt_list(&dd->pcidev->dev, pq, list); +done: + return ret; +} + +static void ipath_user_sdma_set_complete_counter(struct ipath_user_sdma_queue *pq, + u32 c) +{ + pq->sent_counter = c; +} + +/* try to clean out queue -- needs pq->lock */ +static int ipath_user_sdma_queue_clean(const struct ipath_devdata *dd, + struct ipath_user_sdma_queue *pq) +{ + struct list_head free_list; + struct ipath_user_sdma_pkt *pkt; + struct ipath_user_sdma_pkt *pkt_prev; + int ret = 0; + + INIT_LIST_HEAD(&free_list); + + list_for_each_entry_safe(pkt, pkt_prev, &pq->sent, list) { + s64 descd = dd->ipath_sdma_descq_removed - pkt->added; + + if (descd < 0) + break; + + list_move_tail(&pkt->list, &free_list); + + /* one more packet cleaned */ + ret++; + } + + if (!list_empty(&free_list)) { + u32 counter; + + pkt = list_entry(free_list.prev, + struct ipath_user_sdma_pkt, list); + counter = pkt->counter; + + ipath_user_sdma_free_pkt_list(&dd->pcidev->dev, pq, &free_list); + ipath_user_sdma_set_complete_counter(pq, counter); + } + + return ret; +} + +void ipath_user_sdma_queue_destroy(struct ipath_user_sdma_queue *pq) +{ + if (!pq) + return; + + kmem_cache_destroy(pq->pkt_slab); + dma_pool_destroy(pq->header_cache); + kfree(pq); +} + +/* clean descriptor queue, returns > 0 if some elements cleaned */ +static int ipath_user_sdma_hwqueue_clean(struct ipath_devdata *dd) +{ + int ret; + unsigned long flags; + + spin_lock_irqsave(&dd->ipath_sdma_lock, flags); + ret = ipath_sdma_make_progress(dd); + spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags); + + return ret; +} + +/* we're in close, drain packets so that we can cleanup successfully... */ +void ipath_user_sdma_queue_drain(struct ipath_devdata *dd, + struct ipath_user_sdma_queue *pq) +{ + int i; + + if (!pq) + return; + + for (i = 0; i < 100; i++) { + mutex_lock(&pq->lock); + if (list_empty(&pq->sent)) { + mutex_unlock(&pq->lock); + break; + } + ipath_user_sdma_hwqueue_clean(dd); + ipath_user_sdma_queue_clean(dd, pq); + mutex_unlock(&pq->lock); + msleep(10); + } + + if (!list_empty(&pq->sent)) { + struct list_head free_list; + + printk(KERN_INFO "drain: lists not empty: forcing!\n"); + INIT_LIST_HEAD(&free_list); + mutex_lock(&pq->lock); + list_splice_init(&pq->sent, &free_list); + ipath_user_sdma_free_pkt_list(&dd->pcidev->dev, pq, &free_list); + mutex_unlock(&pq->lock); + } +} + +static inline __le64 ipath_sdma_make_desc0(struct ipath_devdata *dd, + u64 addr, u64 dwlen, u64 dwoffset) +{ + return cpu_to_le64(/* SDmaPhyAddr[31:0] */ + ((addr & 0xfffffffcULL) << 32) | + /* SDmaGeneration[1:0] */ + ((dd->ipath_sdma_generation & 3ULL) << 30) | + /* SDmaDwordCount[10:0] */ + ((dwlen & 0x7ffULL) << 16) | + /* SDmaBufOffset[12:2] */ + (dwoffset & 0x7ffULL)); +} + +static inline __le64 ipath_sdma_make_first_desc0(__le64 descq) +{ + return descq | cpu_to_le64(1ULL << 12); +} + +static inline __le64 ipath_sdma_make_last_desc0(__le64 descq) +{ + /* last */ /* dma head */ + return descq | cpu_to_le64(1ULL << 11 | 1ULL << 13); +} + +static inline __le64 ipath_sdma_make_desc1(u64 addr) +{ + /* SDmaPhyAddr[47:32] */ + return cpu_to_le64(addr >> 32); +} + +static void ipath_user_sdma_send_frag(struct ipath_devdata *dd, + struct ipath_user_sdma_pkt *pkt, int idx, + unsigned ofs, u16 tail) +{ + const u64 addr = (u64) pkt->addr[idx].addr + + (u64) pkt->addr[idx].offset; + const u64 dwlen = (u64) pkt->addr[idx].length / 4; + __le64 *descqp; + __le64 descq0; + + descqp = &dd->ipath_sdma_descq[tail].qw[0]; + + descq0 = ipath_sdma_make_desc0(dd, addr, dwlen, ofs); + if (idx == 0) + descq0 = ipath_sdma_make_first_desc0(descq0); + if (idx == pkt->naddr - 1) + descq0 = ipath_sdma_make_last_desc0(descq0); + + descqp[0] = descq0; + descqp[1] = ipath_sdma_make_desc1(addr); +} + +/* pq->lock must be held, get packets on the wire... */ +static int ipath_user_sdma_push_pkts(struct ipath_devdata *dd, + struct ipath_user_sdma_queue *pq, + struct list_head *pktlist) +{ + int ret = 0; + unsigned long flags; + u16 tail; + + if (list_empty(pktlist)) + return 0; + + if (unlikely(!(dd->ipath_flags & IPATH_LINKACTIVE))) + return -ECOMM; + + spin_lock_irqsave(&dd->ipath_sdma_lock, flags); + + if (unlikely(dd->ipath_sdma_status & IPATH_SDMA_ABORT_MASK)) { + ret = -ECOMM; + goto unlock; + } + + tail = dd->ipath_sdma_descq_tail; + while (!list_empty(pktlist)) { + struct ipath_user_sdma_pkt *pkt = + list_entry(pktlist->next, struct ipath_user_sdma_pkt, + list); + int i; + unsigned ofs = 0; + u16 dtail = tail; + + if (pkt->naddr > ipath_sdma_descq_freecnt(dd)) + goto unlock_check_tail; + + for (i = 0; i < pkt->naddr; i++) { + ipath_user_sdma_send_frag(dd, pkt, i, ofs, tail); + ofs += pkt->addr[i].length >> 2; + + if (++tail == dd->ipath_sdma_descq_cnt) { + tail = 0; + ++dd->ipath_sdma_generation; + } + } + + if ((ofs<<2) > dd->ipath_ibmaxlen) { + ipath_dbg("packet size %X > ibmax %X, fail\n", + ofs<<2, dd->ipath_ibmaxlen); + ret = -EMSGSIZE; + goto unlock; + } + + /* + * if the packet is >= 2KB mtu equivalent, we have to use + * the large buffers, and have to mark each descriptor as + * part of a large buffer packet. + */ + if (ofs >= IPATH_SMALLBUF_DWORDS) { + for (i = 0; i < pkt->naddr; i++) { + dd->ipath_sdma_descq[dtail].qw[0] |= + cpu_to_le64(1ULL << 14); + if (++dtail == dd->ipath_sdma_descq_cnt) + dtail = 0; + } + } + + dd->ipath_sdma_descq_added += pkt->naddr; + pkt->added = dd->ipath_sdma_descq_added; + list_move_tail(&pkt->list, &pq->sent); + ret++; + } + +unlock_check_tail: + /* advance the tail on the chip if necessary */ + if (dd->ipath_sdma_descq_tail != tail) { + wmb(); + ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmatail, tail); + dd->ipath_sdma_descq_tail = tail; + } + +unlock: + spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags); + + return ret; +} + +int ipath_user_sdma_writev(struct ipath_devdata *dd, + struct ipath_user_sdma_queue *pq, + const struct iovec *iov, + unsigned long dim) +{ + int ret = 0; + struct list_head list; + int npkts = 0; + + INIT_LIST_HEAD(&list); + + mutex_lock(&pq->lock); + + if (dd->ipath_sdma_descq_added != dd->ipath_sdma_descq_removed) { + ipath_user_sdma_hwqueue_clean(dd); + ipath_user_sdma_queue_clean(dd, pq); + } + + while (dim) { + const int mxp = 8; + + ret = ipath_user_sdma_queue_pkts(dd, pq, &list, iov, dim, mxp); + if (ret <= 0) + goto done_unlock; + else { + dim -= ret; + iov += ret; + } + + /* force packets onto the sdma hw queue... */ + if (!list_empty(&list)) { + /* + * lazily clean hw queue. the 4 is a guess of about + * how many sdma descriptors a packet will take (it + * doesn't have to be perfect). + */ + if (ipath_sdma_descq_freecnt(dd) < ret * 4) { + ipath_user_sdma_hwqueue_clean(dd); + ipath_user_sdma_queue_clean(dd, pq); + } + + ret = ipath_user_sdma_push_pkts(dd, pq, &list); + if (ret < 0) + goto done_unlock; + else { + npkts += ret; + pq->counter += ret; + + if (!list_empty(&list)) + goto done_unlock; + } + } + } + +done_unlock: + if (!list_empty(&list)) + ipath_user_sdma_free_pkt_list(&dd->pcidev->dev, pq, &list); + mutex_unlock(&pq->lock); + + return (ret < 0) ? ret : npkts; +} + +int ipath_user_sdma_make_progress(struct ipath_devdata *dd, + struct ipath_user_sdma_queue *pq) +{ + int ret = 0; + + mutex_lock(&pq->lock); + ipath_user_sdma_hwqueue_clean(dd); + ret = ipath_user_sdma_queue_clean(dd, pq); + mutex_unlock(&pq->lock); + + return ret; +} + +u32 ipath_user_sdma_complete_counter(const struct ipath_user_sdma_queue *pq) +{ + return pq->sent_counter; +} + +u32 ipath_user_sdma_inflight_counter(struct ipath_user_sdma_queue *pq) +{ + return pq->counter; +} + diff --git a/drivers/infiniband/hw/ipath/ipath_user_sdma.h b/drivers/infiniband/hw/ipath/ipath_user_sdma.h new file mode 100644 index 0000000..fc76316 --- /dev/null +++ b/drivers/infiniband/hw/ipath/ipath_user_sdma.h @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2007, 2008 QLogic Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include + +struct ipath_user_sdma_queue; + +struct ipath_user_sdma_queue * +ipath_user_sdma_queue_create(struct device *dev, int unit, int port, int sport); +void ipath_user_sdma_queue_destroy(struct ipath_user_sdma_queue *pq); + +int ipath_user_sdma_writev(struct ipath_devdata *dd, + struct ipath_user_sdma_queue *pq, + const struct iovec *iov, + unsigned long dim); + +int ipath_user_sdma_make_progress(struct ipath_devdata *dd, + struct ipath_user_sdma_queue *pq); + +void ipath_user_sdma_queue_drain(struct ipath_devdata *dd, + struct ipath_user_sdma_queue *pq); + +u32 ipath_user_sdma_complete_counter(const struct ipath_user_sdma_queue *pq); +u32 ipath_user_sdma_inflight_counter(struct ipath_user_sdma_queue *pq); diff --git a/drivers/infiniband/hw/ipath/ipath_verbs.c b/drivers/infiniband/hw/ipath/ipath_verbs.c new file mode 100644 index 0000000..44ea939 --- /dev/null +++ b/drivers/infiniband/hw/ipath/ipath_verbs.c @@ -0,0 +1,2342 @@ +/* + * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved. + * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "ipath_kernel.h" +#include "ipath_verbs.h" +#include "ipath_common.h" + +static unsigned int ib_ipath_qp_table_size = 251; +module_param_named(qp_table_size, ib_ipath_qp_table_size, uint, S_IRUGO); +MODULE_PARM_DESC(qp_table_size, "QP table size"); + +unsigned int ib_ipath_lkey_table_size = 12; +module_param_named(lkey_table_size, ib_ipath_lkey_table_size, uint, + S_IRUGO); +MODULE_PARM_DESC(lkey_table_size, + "LKEY table size in bits (2^n, 1 <= n <= 23)"); + +static unsigned int ib_ipath_max_pds = 0xFFFF; +module_param_named(max_pds, ib_ipath_max_pds, uint, S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(max_pds, + "Maximum number of protection domains to support"); + +static unsigned int ib_ipath_max_ahs = 0xFFFF; +module_param_named(max_ahs, ib_ipath_max_ahs, uint, S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(max_ahs, "Maximum number of address handles to support"); + +unsigned int ib_ipath_max_cqes = 0x2FFFF; +module_param_named(max_cqes, ib_ipath_max_cqes, uint, S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(max_cqes, + "Maximum number of completion queue entries to support"); + +unsigned int ib_ipath_max_cqs = 0x1FFFF; +module_param_named(max_cqs, ib_ipath_max_cqs, uint, S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(max_cqs, "Maximum number of completion queues to support"); + +unsigned int ib_ipath_max_qp_wrs = 0x3FFF; +module_param_named(max_qp_wrs, ib_ipath_max_qp_wrs, uint, + S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(max_qp_wrs, "Maximum number of QP WRs to support"); + +unsigned int ib_ipath_max_qps = 16384; +module_param_named(max_qps, ib_ipath_max_qps, uint, S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(max_qps, "Maximum number of QPs to support"); + +unsigned int ib_ipath_max_sges = 0x60; +module_param_named(max_sges, ib_ipath_max_sges, uint, S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(max_sges, "Maximum number of SGEs to support"); + +unsigned int ib_ipath_max_mcast_grps = 16384; +module_param_named(max_mcast_grps, ib_ipath_max_mcast_grps, uint, + S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(max_mcast_grps, + "Maximum number of multicast groups to support"); + +unsigned int ib_ipath_max_mcast_qp_attached = 16; +module_param_named(max_mcast_qp_attached, ib_ipath_max_mcast_qp_attached, + uint, S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(max_mcast_qp_attached, + "Maximum number of attached QPs to support"); + +unsigned int ib_ipath_max_srqs = 1024; +module_param_named(max_srqs, ib_ipath_max_srqs, uint, S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(max_srqs, "Maximum number of SRQs to support"); + +unsigned int ib_ipath_max_srq_sges = 128; +module_param_named(max_srq_sges, ib_ipath_max_srq_sges, + uint, S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(max_srq_sges, "Maximum number of SRQ SGEs to support"); + +unsigned int ib_ipath_max_srq_wrs = 0x1FFFF; +module_param_named(max_srq_wrs, ib_ipath_max_srq_wrs, + uint, S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(max_srq_wrs, "Maximum number of SRQ WRs support"); + +static unsigned int ib_ipath_disable_sma; +module_param_named(disable_sma, ib_ipath_disable_sma, uint, S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(disable_sma, "Disable the SMA"); + +/* + * Note that it is OK to post send work requests in the SQE and ERR + * states; ipath_do_send() will process them and generate error + * completions as per IB 1.2 C10-96. + */ +const int ib_ipath_state_ops[IB_QPS_ERR + 1] = { + [IB_QPS_RESET] = 0, + [IB_QPS_INIT] = IPATH_POST_RECV_OK, + [IB_QPS_RTR] = IPATH_POST_RECV_OK | IPATH_PROCESS_RECV_OK, + [IB_QPS_RTS] = IPATH_POST_RECV_OK | IPATH_PROCESS_RECV_OK | + IPATH_POST_SEND_OK | IPATH_PROCESS_SEND_OK | + IPATH_PROCESS_NEXT_SEND_OK, + [IB_QPS_SQD] = IPATH_POST_RECV_OK | IPATH_PROCESS_RECV_OK | + IPATH_POST_SEND_OK | IPATH_PROCESS_SEND_OK, + [IB_QPS_SQE] = IPATH_POST_RECV_OK | IPATH_PROCESS_RECV_OK | + IPATH_POST_SEND_OK | IPATH_FLUSH_SEND, + [IB_QPS_ERR] = IPATH_POST_RECV_OK | IPATH_FLUSH_RECV | + IPATH_POST_SEND_OK | IPATH_FLUSH_SEND, +}; + +struct ipath_ucontext { + struct ib_ucontext ibucontext; +}; + +static inline struct ipath_ucontext *to_iucontext(struct ib_ucontext + *ibucontext) +{ + return container_of(ibucontext, struct ipath_ucontext, ibucontext); +} + +/* + * Translate ib_wr_opcode into ib_wc_opcode. + */ +const enum ib_wc_opcode ib_ipath_wc_opcode[] = { + [IB_WR_RDMA_WRITE] = IB_WC_RDMA_WRITE, + [IB_WR_RDMA_WRITE_WITH_IMM] = IB_WC_RDMA_WRITE, + [IB_WR_SEND] = IB_WC_SEND, + [IB_WR_SEND_WITH_IMM] = IB_WC_SEND, + [IB_WR_RDMA_READ] = IB_WC_RDMA_READ, + [IB_WR_ATOMIC_CMP_AND_SWP] = IB_WC_COMP_SWAP, + [IB_WR_ATOMIC_FETCH_AND_ADD] = IB_WC_FETCH_ADD +}; + +/* + * System image GUID. + */ +static __be64 sys_image_guid; + +/** + * ipath_copy_sge - copy data to SGE memory + * @ss: the SGE state + * @data: the data to copy + * @length: the length of the data + */ +void ipath_copy_sge(struct ipath_sge_state *ss, void *data, u32 length) +{ + struct ipath_sge *sge = &ss->sge; + + while (length) { + u32 len = sge->length; + + if (len > length) + len = length; + if (len > sge->sge_length) + len = sge->sge_length; + BUG_ON(len == 0); + memcpy(sge->vaddr, data, len); + sge->vaddr += len; + sge->length -= len; + sge->sge_length -= len; + if (sge->sge_length == 0) { + if (--ss->num_sge) + *sge = *ss->sg_list++; + } else if (sge->length == 0 && sge->mr != NULL) { + if (++sge->n >= IPATH_SEGSZ) { + if (++sge->m >= sge->mr->mapsz) + break; + sge->n = 0; + } + sge->vaddr = + sge->mr->map[sge->m]->segs[sge->n].vaddr; + sge->length = + sge->mr->map[sge->m]->segs[sge->n].length; + } + data += len; + length -= len; + } +} + +/** + * ipath_skip_sge - skip over SGE memory - XXX almost dup of prev func + * @ss: the SGE state + * @length: the number of bytes to skip + */ +void ipath_skip_sge(struct ipath_sge_state *ss, u32 length) +{ + struct ipath_sge *sge = &ss->sge; + + while (length) { + u32 len = sge->length; + + if (len > length) + len = length; + if (len > sge->sge_length) + len = sge->sge_length; + BUG_ON(len == 0); + sge->vaddr += len; + sge->length -= len; + sge->sge_length -= len; + if (sge->sge_length == 0) { + if (--ss->num_sge) + *sge = *ss->sg_list++; + } else if (sge->length == 0 && sge->mr != NULL) { + if (++sge->n >= IPATH_SEGSZ) { + if (++sge->m >= sge->mr->mapsz) + break; + sge->n = 0; + } + sge->vaddr = + sge->mr->map[sge->m]->segs[sge->n].vaddr; + sge->length = + sge->mr->map[sge->m]->segs[sge->n].length; + } + length -= len; + } +} + +/* + * Count the number of DMA descriptors needed to send length bytes of data. + * Don't modify the ipath_sge_state to get the count. + * Return zero if any of the segments is not aligned. + */ +static u32 ipath_count_sge(struct ipath_sge_state *ss, u32 length) +{ + struct ipath_sge *sg_list = ss->sg_list; + struct ipath_sge sge = ss->sge; + u8 num_sge = ss->num_sge; + u32 ndesc = 1; /* count the header */ + + while (length) { + u32 len = sge.length; + + if (len > length) + len = length; + if (len > sge.sge_length) + len = sge.sge_length; + BUG_ON(len == 0); + if (((long) sge.vaddr & (sizeof(u32) - 1)) || + (len != length && (len & (sizeof(u32) - 1)))) { + ndesc = 0; + break; + } + ndesc++; + sge.vaddr += len; + sge.length -= len; + sge.sge_length -= len; + if (sge.sge_length == 0) { + if (--num_sge) + sge = *sg_list++; + } else if (sge.length == 0 && sge.mr != NULL) { + if (++sge.n >= IPATH_SEGSZ) { + if (++sge.m >= sge.mr->mapsz) + break; + sge.n = 0; + } + sge.vaddr = + sge.mr->map[sge.m]->segs[sge.n].vaddr; + sge.length = + sge.mr->map[sge.m]->segs[sge.n].length; + } + length -= len; + } + return ndesc; +} + +/* + * Copy from the SGEs to the data buffer. + */ +static void ipath_copy_from_sge(void *data, struct ipath_sge_state *ss, + u32 length) +{ + struct ipath_sge *sge = &ss->sge; + + while (length) { + u32 len = sge->length; + + if (len > length) + len = length; + if (len > sge->sge_length) + len = sge->sge_length; + BUG_ON(len == 0); + memcpy(data, sge->vaddr, len); + sge->vaddr += len; + sge->length -= len; + sge->sge_length -= len; + if (sge->sge_length == 0) { + if (--ss->num_sge) + *sge = *ss->sg_list++; + } else if (sge->length == 0 && sge->mr != NULL) { + if (++sge->n >= IPATH_SEGSZ) { + if (++sge->m >= sge->mr->mapsz) + break; + sge->n = 0; + } + sge->vaddr = + sge->mr->map[sge->m]->segs[sge->n].vaddr; + sge->length = + sge->mr->map[sge->m]->segs[sge->n].length; + } + data += len; + length -= len; + } +} + +/** + * ipath_post_one_send - post one RC, UC, or UD send work request + * @qp: the QP to post on + * @wr: the work request to send + */ +static int ipath_post_one_send(struct ipath_qp *qp, struct ib_send_wr *wr) +{ + struct ipath_swqe *wqe; + u32 next; + int i; + int j; + int acc; + int ret; + unsigned long flags; + struct ipath_devdata *dd = to_idev(qp->ibqp.device)->dd; + + spin_lock_irqsave(&qp->s_lock, flags); + + if (qp->ibqp.qp_type != IB_QPT_SMI && + !(dd->ipath_flags & IPATH_LINKACTIVE)) { + ret = -ENETDOWN; + goto bail; + } + + /* Check that state is OK to post send. */ + if (unlikely(!(ib_ipath_state_ops[qp->state] & IPATH_POST_SEND_OK))) + goto bail_inval; + + /* IB spec says that num_sge == 0 is OK. */ + if (wr->num_sge > qp->s_max_sge) + goto bail_inval; + + /* + * Don't allow RDMA reads or atomic operations on UC or + * undefined operations. + * Make sure buffer is large enough to hold the result for atomics. + */ + if (qp->ibqp.qp_type == IB_QPT_UC) { + if ((unsigned) wr->opcode >= IB_WR_RDMA_READ) + goto bail_inval; + } else if (qp->ibqp.qp_type == IB_QPT_UD) { + /* Check UD opcode */ + if (wr->opcode != IB_WR_SEND && + wr->opcode != IB_WR_SEND_WITH_IMM) + goto bail_inval; + /* Check UD destination address PD */ + if (qp->ibqp.pd != wr->wr.ud.ah->pd) + goto bail_inval; + } else if ((unsigned) wr->opcode > IB_WR_ATOMIC_FETCH_AND_ADD) + goto bail_inval; + else if (wr->opcode >= IB_WR_ATOMIC_CMP_AND_SWP && + (wr->num_sge == 0 || + wr->sg_list[0].length < sizeof(u64) || + wr->sg_list[0].addr & (sizeof(u64) - 1))) + goto bail_inval; + else if (wr->opcode >= IB_WR_RDMA_READ && !qp->s_max_rd_atomic) + goto bail_inval; + + next = qp->s_head + 1; + if (next >= qp->s_size) + next = 0; + if (next == qp->s_last) { + ret = -ENOMEM; + goto bail; + } + + wqe = get_swqe_ptr(qp, qp->s_head); + wqe->wr = *wr; + wqe->length = 0; + if (wr->num_sge) { + acc = wr->opcode >= IB_WR_RDMA_READ ? + IB_ACCESS_LOCAL_WRITE : 0; + for (i = 0, j = 0; i < wr->num_sge; i++) { + u32 length = wr->sg_list[i].length; + int ok; + + if (length == 0) + continue; + ok = ipath_lkey_ok(qp, &wqe->sg_list[j], + &wr->sg_list[i], acc); + if (!ok) + goto bail_inval; + wqe->length += length; + j++; + } + wqe->wr.num_sge = j; + } + if (qp->ibqp.qp_type == IB_QPT_UC || + qp->ibqp.qp_type == IB_QPT_RC) { + if (wqe->length > 0x80000000U) + goto bail_inval; + } else if (wqe->length > to_idev(qp->ibqp.device)->dd->ipath_ibmtu) + goto bail_inval; + wqe->ssn = qp->s_ssn++; + qp->s_head = next; + + ret = 0; + goto bail; + +bail_inval: + ret = -EINVAL; +bail: + spin_unlock_irqrestore(&qp->s_lock, flags); + return ret; +} + +/** + * ipath_post_send - post a send on a QP + * @ibqp: the QP to post the send on + * @wr: the list of work requests to post + * @bad_wr: the first bad WR is put here + * + * This may be called from interrupt context. + */ +static int ipath_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, + struct ib_send_wr **bad_wr) +{ + struct ipath_qp *qp = to_iqp(ibqp); + int err = 0; + + for (; wr; wr = wr->next) { + err = ipath_post_one_send(qp, wr); + if (err) { + *bad_wr = wr; + goto bail; + } + } + + /* Try to do the send work in the caller's context. */ + ipath_do_send((unsigned long) qp); + +bail: + return err; +} + +/** + * ipath_post_receive - post a receive on a QP + * @ibqp: the QP to post the receive on + * @wr: the WR to post + * @bad_wr: the first bad WR is put here + * + * This may be called from interrupt context. + */ +static int ipath_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr) +{ + struct ipath_qp *qp = to_iqp(ibqp); + struct ipath_rwq *wq = qp->r_rq.wq; + unsigned long flags; + int ret; + + /* Check that state is OK to post receive. */ + if (!(ib_ipath_state_ops[qp->state] & IPATH_POST_RECV_OK) || !wq) { + *bad_wr = wr; + ret = -EINVAL; + goto bail; + } + + for (; wr; wr = wr->next) { + struct ipath_rwqe *wqe; + u32 next; + int i; + + if ((unsigned) wr->num_sge > qp->r_rq.max_sge) { + *bad_wr = wr; + ret = -EINVAL; + goto bail; + } + + spin_lock_irqsave(&qp->r_rq.lock, flags); + next = wq->head + 1; + if (next >= qp->r_rq.size) + next = 0; + if (next == wq->tail) { + spin_unlock_irqrestore(&qp->r_rq.lock, flags); + *bad_wr = wr; + ret = -ENOMEM; + goto bail; + } + + wqe = get_rwqe_ptr(&qp->r_rq, wq->head); + wqe->wr_id = wr->wr_id; + wqe->num_sge = wr->num_sge; + for (i = 0; i < wr->num_sge; i++) + wqe->sg_list[i] = wr->sg_list[i]; + /* Make sure queue entry is written before the head index. */ + smp_wmb(); + wq->head = next; + spin_unlock_irqrestore(&qp->r_rq.lock, flags); + } + ret = 0; + +bail: + return ret; +} + +/** + * ipath_qp_rcv - processing an incoming packet on a QP + * @dev: the device the packet came on + * @hdr: the packet header + * @has_grh: true if the packet has a GRH + * @data: the packet data + * @tlen: the packet length + * @qp: the QP the packet came on + * + * This is called from ipath_ib_rcv() to process an incoming packet + * for the given QP. + * Called at interrupt level. + */ +static void ipath_qp_rcv(struct ipath_ibdev *dev, + struct ipath_ib_header *hdr, int has_grh, + void *data, u32 tlen, struct ipath_qp *qp) +{ + /* Check for valid receive state. */ + if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK)) { + dev->n_pkt_drops++; + return; + } + + switch (qp->ibqp.qp_type) { + case IB_QPT_SMI: + case IB_QPT_GSI: + if (ib_ipath_disable_sma) + break; + /* FALLTHROUGH */ + case IB_QPT_UD: + ipath_ud_rcv(dev, hdr, has_grh, data, tlen, qp); + break; + + case IB_QPT_RC: + ipath_rc_rcv(dev, hdr, has_grh, data, tlen, qp); + break; + + case IB_QPT_UC: + ipath_uc_rcv(dev, hdr, has_grh, data, tlen, qp); + break; + + default: + break; + } +} + +/** + * ipath_ib_rcv - process an incoming packet + * @arg: the device pointer + * @rhdr: the header of the packet + * @data: the packet data + * @tlen: the packet length + * + * This is called from ipath_kreceive() to process an incoming packet at + * interrupt level. Tlen is the length of the header + data + CRC in bytes. + */ +void ipath_ib_rcv(struct ipath_ibdev *dev, void *rhdr, void *data, + u32 tlen) +{ + struct ipath_ib_header *hdr = rhdr; + struct ipath_other_headers *ohdr; + struct ipath_qp *qp; + u32 qp_num; + int lnh; + u8 opcode; + u16 lid; + + if (unlikely(dev == NULL)) + goto bail; + + if (unlikely(tlen < 24)) { /* LRH+BTH+CRC */ + dev->rcv_errors++; + goto bail; + } + + /* Check for a valid destination LID (see ch. 7.11.1). */ + lid = be16_to_cpu(hdr->lrh[1]); + if (lid < IPATH_MULTICAST_LID_BASE) { + lid &= ~((1 << dev->dd->ipath_lmc) - 1); + if (unlikely(lid != dev->dd->ipath_lid)) { + dev->rcv_errors++; + goto bail; + } + } + + /* Check for GRH */ + lnh = be16_to_cpu(hdr->lrh[0]) & 3; + if (lnh == IPATH_LRH_BTH) + ohdr = &hdr->u.oth; + else if (lnh == IPATH_LRH_GRH) + ohdr = &hdr->u.l.oth; + else { + dev->rcv_errors++; + goto bail; + } + + opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0x7f; + dev->opstats[opcode].n_bytes += tlen; + dev->opstats[opcode].n_packets++; + + /* Get the destination QP number. */ + qp_num = be32_to_cpu(ohdr->bth[1]) & IPATH_QPN_MASK; + if (qp_num == IPATH_MULTICAST_QPN) { + struct ipath_mcast *mcast; + struct ipath_mcast_qp *p; + + if (lnh != IPATH_LRH_GRH) { + dev->n_pkt_drops++; + goto bail; + } + mcast = ipath_mcast_find(&hdr->u.l.grh.dgid); + if (mcast == NULL) { + dev->n_pkt_drops++; + goto bail; + } + dev->n_multicast_rcv++; + list_for_each_entry_rcu(p, &mcast->qp_list, list) + ipath_qp_rcv(dev, hdr, 1, data, tlen, p->qp); + /* + * Notify ipath_multicast_detach() if it is waiting for us + * to finish. + */ + if (atomic_dec_return(&mcast->refcount) <= 1) + wake_up(&mcast->wait); + } else { + qp = ipath_lookup_qpn(&dev->qp_table, qp_num); + if (qp) { + dev->n_unicast_rcv++; + ipath_qp_rcv(dev, hdr, lnh == IPATH_LRH_GRH, data, + tlen, qp); + /* + * Notify ipath_destroy_qp() if it is waiting + * for us to finish. + */ + if (atomic_dec_and_test(&qp->refcount)) + wake_up(&qp->wait); + } else + dev->n_pkt_drops++; + } + +bail:; +} + +/** + * ipath_ib_timer - verbs timer + * @arg: the device pointer + * + * This is called from ipath_do_rcv_timer() at interrupt level to check for + * QPs which need retransmits and to collect performance numbers. + */ +static void ipath_ib_timer(struct ipath_ibdev *dev) +{ + struct ipath_qp *resend = NULL; + struct ipath_qp *rnr = NULL; + struct list_head *last; + struct ipath_qp *qp; + unsigned long flags; + + if (dev == NULL) + return; + + spin_lock_irqsave(&dev->pending_lock, flags); + /* Start filling the next pending queue. */ + if (++dev->pending_index >= ARRAY_SIZE(dev->pending)) + dev->pending_index = 0; + /* Save any requests still in the new queue, they have timed out. */ + last = &dev->pending[dev->pending_index]; + while (!list_empty(last)) { + qp = list_entry(last->next, struct ipath_qp, timerwait); + list_del_init(&qp->timerwait); + qp->timer_next = resend; + resend = qp; + atomic_inc(&qp->refcount); + } + last = &dev->rnrwait; + if (!list_empty(last)) { + qp = list_entry(last->next, struct ipath_qp, timerwait); + if (--qp->s_rnr_timeout == 0) { + do { + list_del_init(&qp->timerwait); + qp->timer_next = rnr; + rnr = qp; + atomic_inc(&qp->refcount); + if (list_empty(last)) + break; + qp = list_entry(last->next, struct ipath_qp, + timerwait); + } while (qp->s_rnr_timeout == 0); + } + } + /* + * We should only be in the started state if pma_sample_start != 0 + */ + if (dev->pma_sample_status == IB_PMA_SAMPLE_STATUS_STARTED && + --dev->pma_sample_start == 0) { + dev->pma_sample_status = IB_PMA_SAMPLE_STATUS_RUNNING; + ipath_snapshot_counters(dev->dd, &dev->ipath_sword, + &dev->ipath_rword, + &dev->ipath_spkts, + &dev->ipath_rpkts, + &dev->ipath_xmit_wait); + } + if (dev->pma_sample_status == IB_PMA_SAMPLE_STATUS_RUNNING) { + if (dev->pma_sample_interval == 0) { + u64 ta, tb, tc, td, te; + + dev->pma_sample_status = IB_PMA_SAMPLE_STATUS_DONE; + ipath_snapshot_counters(dev->dd, &ta, &tb, + &tc, &td, &te); + + dev->ipath_sword = ta - dev->ipath_sword; + dev->ipath_rword = tb - dev->ipath_rword; + dev->ipath_spkts = tc - dev->ipath_spkts; + dev->ipath_rpkts = td - dev->ipath_rpkts; + dev->ipath_xmit_wait = te - dev->ipath_xmit_wait; + } + else + dev->pma_sample_interval--; + } + spin_unlock_irqrestore(&dev->pending_lock, flags); + + /* XXX What if timer fires again while this is running? */ + while (resend != NULL) { + qp = resend; + resend = qp->timer_next; + + spin_lock_irqsave(&qp->s_lock, flags); + if (qp->s_last != qp->s_tail && + ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK) { + dev->n_timeouts++; + ipath_restart_rc(qp, qp->s_last_psn + 1); + } + spin_unlock_irqrestore(&qp->s_lock, flags); + + /* Notify ipath_destroy_qp() if it is waiting. */ + if (atomic_dec_and_test(&qp->refcount)) + wake_up(&qp->wait); + } + while (rnr != NULL) { + qp = rnr; + rnr = qp->timer_next; + + spin_lock_irqsave(&qp->s_lock, flags); + if (ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK) + ipath_schedule_send(qp); + spin_unlock_irqrestore(&qp->s_lock, flags); + + /* Notify ipath_destroy_qp() if it is waiting. */ + if (atomic_dec_and_test(&qp->refcount)) + wake_up(&qp->wait); + } +} + +static void update_sge(struct ipath_sge_state *ss, u32 length) +{ + struct ipath_sge *sge = &ss->sge; + + sge->vaddr += length; + sge->length -= length; + sge->sge_length -= length; + if (sge->sge_length == 0) { + if (--ss->num_sge) + *sge = *ss->sg_list++; + } else if (sge->length == 0 && sge->mr != NULL) { + if (++sge->n >= IPATH_SEGSZ) { + if (++sge->m >= sge->mr->mapsz) + return; + sge->n = 0; + } + sge->vaddr = sge->mr->map[sge->m]->segs[sge->n].vaddr; + sge->length = sge->mr->map[sge->m]->segs[sge->n].length; + } +} + +#ifdef __LITTLE_ENDIAN +static inline u32 get_upper_bits(u32 data, u32 shift) +{ + return data >> shift; +} + +static inline u32 set_upper_bits(u32 data, u32 shift) +{ + return data << shift; +} + +static inline u32 clear_upper_bytes(u32 data, u32 n, u32 off) +{ + data <<= ((sizeof(u32) - n) * BITS_PER_BYTE); + data >>= ((sizeof(u32) - n - off) * BITS_PER_BYTE); + return data; +} +#else +static inline u32 get_upper_bits(u32 data, u32 shift) +{ + return data << shift; +} + +static inline u32 set_upper_bits(u32 data, u32 shift) +{ + return data >> shift; +} + +static inline u32 clear_upper_bytes(u32 data, u32 n, u32 off) +{ + data >>= ((sizeof(u32) - n) * BITS_PER_BYTE); + data <<= ((sizeof(u32) - n - off) * BITS_PER_BYTE); + return data; +} +#endif + +static void copy_io(u32 __iomem *piobuf, struct ipath_sge_state *ss, + u32 length, unsigned flush_wc) +{ + u32 extra = 0; + u32 data = 0; + u32 last; + + while (1) { + u32 len = ss->sge.length; + u32 off; + + if (len > length) + len = length; + if (len > ss->sge.sge_length) + len = ss->sge.sge_length; + BUG_ON(len == 0); + /* If the source address is not aligned, try to align it. */ + off = (unsigned long)ss->sge.vaddr & (sizeof(u32) - 1); + if (off) { + u32 *addr = (u32 *)((unsigned long)ss->sge.vaddr & + ~(sizeof(u32) - 1)); + u32 v = get_upper_bits(*addr, off * BITS_PER_BYTE); + u32 y; + + y = sizeof(u32) - off; + if (len > y) + len = y; + if (len + extra >= sizeof(u32)) { + data |= set_upper_bits(v, extra * + BITS_PER_BYTE); + len = sizeof(u32) - extra; + if (len == length) { + last = data; + break; + } + __raw_writel(data, piobuf); + piobuf++; + extra = 0; + data = 0; + } else { + /* Clear unused upper bytes */ + data |= clear_upper_bytes(v, len, extra); + if (len == length) { + last = data; + break; + } + extra += len; + } + } else if (extra) { + /* Source address is aligned. */ + u32 *addr = (u32 *) ss->sge.vaddr; + int shift = extra * BITS_PER_BYTE; + int ushift = 32 - shift; + u32 l = len; + + while (l >= sizeof(u32)) { + u32 v = *addr; + + data |= set_upper_bits(v, shift); + __raw_writel(data, piobuf); + data = get_upper_bits(v, ushift); + piobuf++; + addr++; + l -= sizeof(u32); + } + /* + * We still have 'extra' number of bytes leftover. + */ + if (l) { + u32 v = *addr; + + if (l + extra >= sizeof(u32)) { + data |= set_upper_bits(v, shift); + len -= l + extra - sizeof(u32); + if (len == length) { + last = data; + break; + } + __raw_writel(data, piobuf); + piobuf++; + extra = 0; + data = 0; + } else { + /* Clear unused upper bytes */ + data |= clear_upper_bytes(v, l, + extra); + if (len == length) { + last = data; + break; + } + extra += l; + } + } else if (len == length) { + last = data; + break; + } + } else if (len == length) { + u32 w; + + /* + * Need to round up for the last dword in the + * packet. + */ + w = (len + 3) >> 2; + __iowrite32_copy(piobuf, ss->sge.vaddr, w - 1); + piobuf += w - 1; + last = ((u32 *) ss->sge.vaddr)[w - 1]; + break; + } else { + u32 w = len >> 2; + + __iowrite32_copy(piobuf, ss->sge.vaddr, w); + piobuf += w; + + extra = len & (sizeof(u32) - 1); + if (extra) { + u32 v = ((u32 *) ss->sge.vaddr)[w]; + + /* Clear unused upper bytes */ + data = clear_upper_bytes(v, extra, 0); + } + } + update_sge(ss, len); + length -= len; + } + /* Update address before sending packet. */ + update_sge(ss, length); + if (flush_wc) { + /* must flush early everything before trigger word */ + ipath_flush_wc(); + __raw_writel(last, piobuf); + /* be sure trigger word is written */ + ipath_flush_wc(); + } else + __raw_writel(last, piobuf); +} + +/* + * Convert IB rate to delay multiplier. + */ +unsigned ipath_ib_rate_to_mult(enum ib_rate rate) +{ + switch (rate) { + case IB_RATE_2_5_GBPS: return 8; + case IB_RATE_5_GBPS: return 4; + case IB_RATE_10_GBPS: return 2; + case IB_RATE_20_GBPS: return 1; + default: return 0; + } +} + +/* + * Convert delay multiplier to IB rate + */ +static enum ib_rate ipath_mult_to_ib_rate(unsigned mult) +{ + switch (mult) { + case 8: return IB_RATE_2_5_GBPS; + case 4: return IB_RATE_5_GBPS; + case 2: return IB_RATE_10_GBPS; + case 1: return IB_RATE_20_GBPS; + default: return IB_RATE_PORT_CURRENT; + } +} + +static inline struct ipath_verbs_txreq *get_txreq(struct ipath_ibdev *dev) +{ + struct ipath_verbs_txreq *tx = NULL; + unsigned long flags; + + spin_lock_irqsave(&dev->pending_lock, flags); + if (!list_empty(&dev->txreq_free)) { + struct list_head *l = dev->txreq_free.next; + + list_del(l); + tx = list_entry(l, struct ipath_verbs_txreq, txreq.list); + } + spin_unlock_irqrestore(&dev->pending_lock, flags); + return tx; +} + +static inline void put_txreq(struct ipath_ibdev *dev, + struct ipath_verbs_txreq *tx) +{ + unsigned long flags; + + spin_lock_irqsave(&dev->pending_lock, flags); + list_add(&tx->txreq.list, &dev->txreq_free); + spin_unlock_irqrestore(&dev->pending_lock, flags); +} + +static void sdma_complete(void *cookie, int status) +{ + struct ipath_verbs_txreq *tx = cookie; + struct ipath_qp *qp = tx->qp; + struct ipath_ibdev *dev = to_idev(qp->ibqp.device); + unsigned long flags; + enum ib_wc_status ibs = status == IPATH_SDMA_TXREQ_S_OK ? + IB_WC_SUCCESS : IB_WC_WR_FLUSH_ERR; + + if (atomic_dec_and_test(&qp->s_dma_busy)) { + spin_lock_irqsave(&qp->s_lock, flags); + if (tx->wqe) + ipath_send_complete(qp, tx->wqe, ibs); + if ((ib_ipath_state_ops[qp->state] & IPATH_FLUSH_SEND && + qp->s_last != qp->s_head) || + (qp->s_flags & IPATH_S_WAIT_DMA)) + ipath_schedule_send(qp); + spin_unlock_irqrestore(&qp->s_lock, flags); + wake_up(&qp->wait_dma); + } else if (tx->wqe) { + spin_lock_irqsave(&qp->s_lock, flags); + ipath_send_complete(qp, tx->wqe, ibs); + spin_unlock_irqrestore(&qp->s_lock, flags); + } + + if (tx->txreq.flags & IPATH_SDMA_TXREQ_F_FREEBUF) + kfree(tx->txreq.map_addr); + put_txreq(dev, tx); + + if (atomic_dec_and_test(&qp->refcount)) + wake_up(&qp->wait); +} + +static void decrement_dma_busy(struct ipath_qp *qp) +{ + unsigned long flags; + + if (atomic_dec_and_test(&qp->s_dma_busy)) { + spin_lock_irqsave(&qp->s_lock, flags); + if ((ib_ipath_state_ops[qp->state] & IPATH_FLUSH_SEND && + qp->s_last != qp->s_head) || + (qp->s_flags & IPATH_S_WAIT_DMA)) + ipath_schedule_send(qp); + spin_unlock_irqrestore(&qp->s_lock, flags); + wake_up(&qp->wait_dma); + } +} + +/* + * Compute the number of clock cycles of delay before sending the next packet. + * The multipliers reflect the number of clocks for the fastest rate so + * one tick at 4xDDR is 8 ticks at 1xSDR. + * If the destination port will take longer to receive a packet than + * the outgoing link can send it, we need to delay sending the next packet + * by the difference in time it takes the receiver to receive and the sender + * to send this packet. + * Note that this delay is always correct for UC and RC but not always + * optimal for UD. For UD, the destination HCA can be different for each + * packet, in which case, we could send packets to a different destination + * while "waiting" for the delay. The overhead for doing this without + * HW support is more than just paying the cost of delaying some packets + * unnecessarily. + */ +static inline unsigned ipath_pkt_delay(u32 plen, u8 snd_mult, u8 rcv_mult) +{ + return (rcv_mult > snd_mult) ? + (plen * (rcv_mult - snd_mult) + 1) >> 1 : 0; +} + +static int ipath_verbs_send_dma(struct ipath_qp *qp, + struct ipath_ib_header *hdr, u32 hdrwords, + struct ipath_sge_state *ss, u32 len, + u32 plen, u32 dwords) +{ + struct ipath_ibdev *dev = to_idev(qp->ibqp.device); + struct ipath_devdata *dd = dev->dd; + struct ipath_verbs_txreq *tx; + u32 *piobuf; + u32 control; + u32 ndesc; + int ret; + + tx = qp->s_tx; + if (tx) { + qp->s_tx = NULL; + /* resend previously constructed packet */ + atomic_inc(&qp->s_dma_busy); + ret = ipath_sdma_verbs_send(dd, tx->ss, tx->len, tx); + if (ret) { + qp->s_tx = tx; + decrement_dma_busy(qp); + } + goto bail; + } + + tx = get_txreq(dev); + if (!tx) { + ret = -EBUSY; + goto bail; + } + + /* + * Get the saved delay count we computed for the previous packet + * and save the delay count for this packet to be used next time + * we get here. + */ + control = qp->s_pkt_delay; + qp->s_pkt_delay = ipath_pkt_delay(plen, dd->delay_mult, qp->s_dmult); + + tx->qp = qp; + atomic_inc(&qp->refcount); + tx->wqe = qp->s_wqe; + tx->txreq.callback = sdma_complete; + tx->txreq.callback_cookie = tx; + tx->txreq.flags = IPATH_SDMA_TXREQ_F_HEADTOHOST | + IPATH_SDMA_TXREQ_F_INTREQ | IPATH_SDMA_TXREQ_F_FREEDESC; + if (plen + 1 >= IPATH_SMALLBUF_DWORDS) + tx->txreq.flags |= IPATH_SDMA_TXREQ_F_USELARGEBUF; + + /* VL15 packets bypass credit check */ + if ((be16_to_cpu(hdr->lrh[0]) >> 12) == 15) { + control |= 1ULL << 31; + tx->txreq.flags |= IPATH_SDMA_TXREQ_F_VL15; + } + + if (len) { + /* + * Don't try to DMA if it takes more descriptors than + * the queue holds. + */ + ndesc = ipath_count_sge(ss, len); + if (ndesc >= dd->ipath_sdma_descq_cnt) + ndesc = 0; + } else + ndesc = 1; + if (ndesc) { + tx->hdr.pbc[0] = cpu_to_le32(plen); + tx->hdr.pbc[1] = cpu_to_le32(control); + memcpy(&tx->hdr.hdr, hdr, hdrwords << 2); + tx->txreq.sg_count = ndesc; + tx->map_len = (hdrwords + 2) << 2; + tx->txreq.map_addr = &tx->hdr; + atomic_inc(&qp->s_dma_busy); + ret = ipath_sdma_verbs_send(dd, ss, dwords, tx); + if (ret) { + /* save ss and length in dwords */ + tx->ss = ss; + tx->len = dwords; + qp->s_tx = tx; + decrement_dma_busy(qp); + } + goto bail; + } + + /* Allocate a buffer and copy the header and payload to it. */ + tx->map_len = (plen + 1) << 2; + piobuf = kmalloc(tx->map_len, GFP_ATOMIC); + if (unlikely(piobuf == NULL)) { + ret = -EBUSY; + goto err_tx; + } + tx->txreq.map_addr = piobuf; + tx->txreq.flags |= IPATH_SDMA_TXREQ_F_FREEBUF; + tx->txreq.sg_count = 1; + + *piobuf++ = (__force u32) cpu_to_le32(plen); + *piobuf++ = (__force u32) cpu_to_le32(control); + memcpy(piobuf, hdr, hdrwords << 2); + ipath_copy_from_sge(piobuf + hdrwords, ss, len); + + atomic_inc(&qp->s_dma_busy); + ret = ipath_sdma_verbs_send(dd, NULL, 0, tx); + /* + * If we couldn't queue the DMA request, save the info + * and try again later rather than destroying the + * buffer and undoing the side effects of the copy. + */ + if (ret) { + tx->ss = NULL; + tx->len = 0; + qp->s_tx = tx; + decrement_dma_busy(qp); + } + dev->n_unaligned++; + goto bail; + +err_tx: + if (atomic_dec_and_test(&qp->refcount)) + wake_up(&qp->wait); + put_txreq(dev, tx); +bail: + return ret; +} + +static int ipath_verbs_send_pio(struct ipath_qp *qp, + struct ipath_ib_header *ibhdr, u32 hdrwords, + struct ipath_sge_state *ss, u32 len, + u32 plen, u32 dwords) +{ + struct ipath_devdata *dd = to_idev(qp->ibqp.device)->dd; + u32 *hdr = (u32 *) ibhdr; + u32 __iomem *piobuf; + unsigned flush_wc; + u32 control; + int ret; + unsigned long flags; + + piobuf = ipath_getpiobuf(dd, plen, NULL); + if (unlikely(piobuf == NULL)) { + ret = -EBUSY; + goto bail; + } + + /* + * Get the saved delay count we computed for the previous packet + * and save the delay count for this packet to be used next time + * we get here. + */ + control = qp->s_pkt_delay; + qp->s_pkt_delay = ipath_pkt_delay(plen, dd->delay_mult, qp->s_dmult); + + /* VL15 packets bypass credit check */ + if ((be16_to_cpu(ibhdr->lrh[0]) >> 12) == 15) + control |= 1ULL << 31; + + /* + * Write the length to the control qword plus any needed flags. + * We have to flush after the PBC for correctness on some cpus + * or WC buffer can be written out of order. + */ + writeq(((u64) control << 32) | plen, piobuf); + piobuf += 2; + + flush_wc = dd->ipath_flags & IPATH_PIO_FLUSH_WC; + if (len == 0) { + /* + * If there is just the header portion, must flush before + * writing last word of header for correctness, and after + * the last header word (trigger word). + */ + if (flush_wc) { + ipath_flush_wc(); + __iowrite32_copy(piobuf, hdr, hdrwords - 1); + ipath_flush_wc(); + __raw_writel(hdr[hdrwords - 1], piobuf + hdrwords - 1); + ipath_flush_wc(); + } else + __iowrite32_copy(piobuf, hdr, hdrwords); + goto done; + } + + if (flush_wc) + ipath_flush_wc(); + __iowrite32_copy(piobuf, hdr, hdrwords); + piobuf += hdrwords; + + /* The common case is aligned and contained in one segment. */ + if (likely(ss->num_sge == 1 && len <= ss->sge.length && + !((unsigned long)ss->sge.vaddr & (sizeof(u32) - 1)))) { + u32 *addr = (u32 *) ss->sge.vaddr; + + /* Update address before sending packet. */ + update_sge(ss, len); + if (flush_wc) { + __iowrite32_copy(piobuf, addr, dwords - 1); + /* must flush early everything before trigger word */ + ipath_flush_wc(); + __raw_writel(addr[dwords - 1], piobuf + dwords - 1); + /* be sure trigger word is written */ + ipath_flush_wc(); + } else + __iowrite32_copy(piobuf, addr, dwords); + goto done; + } + copy_io(piobuf, ss, len, flush_wc); +done: + if (qp->s_wqe) { + spin_lock_irqsave(&qp->s_lock, flags); + ipath_send_complete(qp, qp->s_wqe, IB_WC_SUCCESS); + spin_unlock_irqrestore(&qp->s_lock, flags); + } + ret = 0; +bail: + return ret; +} + +/** + * ipath_verbs_send - send a packet + * @qp: the QP to send on + * @hdr: the packet header + * @hdrwords: the number of 32-bit words in the header + * @ss: the SGE to send + * @len: the length of the packet in bytes + */ +int ipath_verbs_send(struct ipath_qp *qp, struct ipath_ib_header *hdr, + u32 hdrwords, struct ipath_sge_state *ss, u32 len) +{ + struct ipath_devdata *dd = to_idev(qp->ibqp.device)->dd; + u32 plen; + int ret; + u32 dwords = (len + 3) >> 2; + + /* + * Calculate the send buffer trigger address. + * The +1 counts for the pbc control dword following the pbc length. + */ + plen = hdrwords + dwords + 1; + + /* + * VL15 packets (IB_QPT_SMI) will always use PIO, so we + * can defer SDMA restart until link goes ACTIVE without + * worrying about just how we got there. + */ + if (qp->ibqp.qp_type == IB_QPT_SMI || + !(dd->ipath_flags & IPATH_HAS_SEND_DMA)) + ret = ipath_verbs_send_pio(qp, hdr, hdrwords, ss, len, + plen, dwords); + else + ret = ipath_verbs_send_dma(qp, hdr, hdrwords, ss, len, + plen, dwords); + + return ret; +} + +int ipath_snapshot_counters(struct ipath_devdata *dd, u64 *swords, + u64 *rwords, u64 *spkts, u64 *rpkts, + u64 *xmit_wait) +{ + int ret; + + if (!(dd->ipath_flags & IPATH_INITTED)) { + /* no hardware, freeze, etc. */ + ret = -EINVAL; + goto bail; + } + *swords = ipath_snap_cntr(dd, dd->ipath_cregs->cr_wordsendcnt); + *rwords = ipath_snap_cntr(dd, dd->ipath_cregs->cr_wordrcvcnt); + *spkts = ipath_snap_cntr(dd, dd->ipath_cregs->cr_pktsendcnt); + *rpkts = ipath_snap_cntr(dd, dd->ipath_cregs->cr_pktrcvcnt); + *xmit_wait = ipath_snap_cntr(dd, dd->ipath_cregs->cr_sendstallcnt); + + ret = 0; + +bail: + return ret; +} + +/** + * ipath_get_counters - get various chip counters + * @dd: the infinipath device + * @cntrs: counters are placed here + * + * Return the counters needed by recv_pma_get_portcounters(). + */ +int ipath_get_counters(struct ipath_devdata *dd, + struct ipath_verbs_counters *cntrs) +{ + struct ipath_cregs const *crp = dd->ipath_cregs; + int ret; + + if (!(dd->ipath_flags & IPATH_INITTED)) { + /* no hardware, freeze, etc. */ + ret = -EINVAL; + goto bail; + } + cntrs->symbol_error_counter = + ipath_snap_cntr(dd, crp->cr_ibsymbolerrcnt); + cntrs->link_error_recovery_counter = + ipath_snap_cntr(dd, crp->cr_iblinkerrrecovcnt); + /* + * The link downed counter counts when the other side downs the + * connection. We add in the number of times we downed the link + * due to local link integrity errors to compensate. + */ + cntrs->link_downed_counter = + ipath_snap_cntr(dd, crp->cr_iblinkdowncnt); + cntrs->port_rcv_errors = + ipath_snap_cntr(dd, crp->cr_rxdroppktcnt) + + ipath_snap_cntr(dd, crp->cr_rcvovflcnt) + + ipath_snap_cntr(dd, crp->cr_portovflcnt) + + ipath_snap_cntr(dd, crp->cr_err_rlencnt) + + ipath_snap_cntr(dd, crp->cr_invalidrlencnt) + + ipath_snap_cntr(dd, crp->cr_errlinkcnt) + + ipath_snap_cntr(dd, crp->cr_erricrccnt) + + ipath_snap_cntr(dd, crp->cr_errvcrccnt) + + ipath_snap_cntr(dd, crp->cr_errlpcrccnt) + + ipath_snap_cntr(dd, crp->cr_badformatcnt) + + dd->ipath_rxfc_unsupvl_errs; + if (crp->cr_rxotherlocalphyerrcnt) + cntrs->port_rcv_errors += + ipath_snap_cntr(dd, crp->cr_rxotherlocalphyerrcnt); + if (crp->cr_rxvlerrcnt) + cntrs->port_rcv_errors += + ipath_snap_cntr(dd, crp->cr_rxvlerrcnt); + cntrs->port_rcv_remphys_errors = + ipath_snap_cntr(dd, crp->cr_rcvebpcnt); + cntrs->port_xmit_discards = ipath_snap_cntr(dd, crp->cr_unsupvlcnt); + cntrs->port_xmit_data = ipath_snap_cntr(dd, crp->cr_wordsendcnt); + cntrs->port_rcv_data = ipath_snap_cntr(dd, crp->cr_wordrcvcnt); + cntrs->port_xmit_packets = ipath_snap_cntr(dd, crp->cr_pktsendcnt); + cntrs->port_rcv_packets = ipath_snap_cntr(dd, crp->cr_pktrcvcnt); + cntrs->local_link_integrity_errors = + crp->cr_locallinkintegrityerrcnt ? + ipath_snap_cntr(dd, crp->cr_locallinkintegrityerrcnt) : + ((dd->ipath_flags & IPATH_GPIO_ERRINTRS) ? + dd->ipath_lli_errs : dd->ipath_lli_errors); + cntrs->excessive_buffer_overrun_errors = + crp->cr_excessbufferovflcnt ? + ipath_snap_cntr(dd, crp->cr_excessbufferovflcnt) : + dd->ipath_overrun_thresh_errs; + cntrs->vl15_dropped = crp->cr_vl15droppedpktcnt ? + ipath_snap_cntr(dd, crp->cr_vl15droppedpktcnt) : 0; + + ret = 0; + +bail: + return ret; +} + +/** + * ipath_ib_piobufavail - callback when a PIO buffer is available + * @arg: the device pointer + * + * This is called from ipath_intr() at interrupt level when a PIO buffer is + * available after ipath_verbs_send() returned an error that no buffers were + * available. Return 1 if we consumed all the PIO buffers and we still have + * QPs waiting for buffers (for now, just restart the send tasklet and + * return zero). + */ +int ipath_ib_piobufavail(struct ipath_ibdev *dev) +{ + struct list_head *list; + struct ipath_qp *qplist; + struct ipath_qp *qp; + unsigned long flags; + + if (dev == NULL) + goto bail; + + list = &dev->piowait; + qplist = NULL; + + spin_lock_irqsave(&dev->pending_lock, flags); + while (!list_empty(list)) { + qp = list_entry(list->next, struct ipath_qp, piowait); + list_del_init(&qp->piowait); + qp->pio_next = qplist; + qplist = qp; + atomic_inc(&qp->refcount); + } + spin_unlock_irqrestore(&dev->pending_lock, flags); + + while (qplist != NULL) { + qp = qplist; + qplist = qp->pio_next; + + spin_lock_irqsave(&qp->s_lock, flags); + if (ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK) + ipath_schedule_send(qp); + spin_unlock_irqrestore(&qp->s_lock, flags); + + /* Notify ipath_destroy_qp() if it is waiting. */ + if (atomic_dec_and_test(&qp->refcount)) + wake_up(&qp->wait); + } + +bail: + return 0; +} + +static int ipath_query_device(struct ib_device *ibdev, + struct ib_device_attr *props) +{ + struct ipath_ibdev *dev = to_idev(ibdev); + + memset(props, 0, sizeof(*props)); + + props->device_cap_flags = IB_DEVICE_BAD_PKEY_CNTR | + IB_DEVICE_BAD_QKEY_CNTR | IB_DEVICE_SHUTDOWN_PORT | + IB_DEVICE_SYS_IMAGE_GUID | IB_DEVICE_RC_RNR_NAK_GEN | + IB_DEVICE_PORT_ACTIVE_EVENT | IB_DEVICE_SRQ_RESIZE; + props->page_size_cap = PAGE_SIZE; + props->vendor_id = + IPATH_SRC_OUI_1 << 16 | IPATH_SRC_OUI_2 << 8 | IPATH_SRC_OUI_3; + props->vendor_part_id = dev->dd->ipath_deviceid; + props->hw_ver = dev->dd->ipath_pcirev; + + props->sys_image_guid = dev->sys_image_guid; + + props->max_mr_size = ~0ull; + props->max_qp = ib_ipath_max_qps; + props->max_qp_wr = ib_ipath_max_qp_wrs; + props->max_sge = ib_ipath_max_sges; + props->max_cq = ib_ipath_max_cqs; + props->max_ah = ib_ipath_max_ahs; + props->max_cqe = ib_ipath_max_cqes; + props->max_mr = dev->lk_table.max; + props->max_fmr = dev->lk_table.max; + props->max_map_per_fmr = 32767; + props->max_pd = ib_ipath_max_pds; + props->max_qp_rd_atom = IPATH_MAX_RDMA_ATOMIC; + props->max_qp_init_rd_atom = 255; + /* props->max_res_rd_atom */ + props->max_srq = ib_ipath_max_srqs; + props->max_srq_wr = ib_ipath_max_srq_wrs; + props->max_srq_sge = ib_ipath_max_srq_sges; + /* props->local_ca_ack_delay */ + props->atomic_cap = IB_ATOMIC_GLOB; + props->max_pkeys = ipath_get_npkeys(dev->dd); + props->max_mcast_grp = ib_ipath_max_mcast_grps; + props->max_mcast_qp_attach = ib_ipath_max_mcast_qp_attached; + props->max_total_mcast_qp_attach = props->max_mcast_qp_attach * + props->max_mcast_grp; + + return 0; +} + +const u8 ipath_cvt_physportstate[32] = { + [INFINIPATH_IBCS_LT_STATE_DISABLED] = IB_PHYSPORTSTATE_DISABLED, + [INFINIPATH_IBCS_LT_STATE_LINKUP] = IB_PHYSPORTSTATE_LINKUP, + [INFINIPATH_IBCS_LT_STATE_POLLACTIVE] = IB_PHYSPORTSTATE_POLL, + [INFINIPATH_IBCS_LT_STATE_POLLQUIET] = IB_PHYSPORTSTATE_POLL, + [INFINIPATH_IBCS_LT_STATE_SLEEPDELAY] = IB_PHYSPORTSTATE_SLEEP, + [INFINIPATH_IBCS_LT_STATE_SLEEPQUIET] = IB_PHYSPORTSTATE_SLEEP, + [INFINIPATH_IBCS_LT_STATE_CFGDEBOUNCE] = + IB_PHYSPORTSTATE_CFG_TRAIN, + [INFINIPATH_IBCS_LT_STATE_CFGRCVFCFG] = + IB_PHYSPORTSTATE_CFG_TRAIN, + [INFINIPATH_IBCS_LT_STATE_CFGWAITRMT] = + IB_PHYSPORTSTATE_CFG_TRAIN, + [INFINIPATH_IBCS_LT_STATE_CFGIDLE] = IB_PHYSPORTSTATE_CFG_TRAIN, + [INFINIPATH_IBCS_LT_STATE_RECOVERRETRAIN] = + IB_PHYSPORTSTATE_LINK_ERR_RECOVER, + [INFINIPATH_IBCS_LT_STATE_RECOVERWAITRMT] = + IB_PHYSPORTSTATE_LINK_ERR_RECOVER, + [INFINIPATH_IBCS_LT_STATE_RECOVERIDLE] = + IB_PHYSPORTSTATE_LINK_ERR_RECOVER, + [0x10] = IB_PHYSPORTSTATE_CFG_TRAIN, + [0x11] = IB_PHYSPORTSTATE_CFG_TRAIN, + [0x12] = IB_PHYSPORTSTATE_CFG_TRAIN, + [0x13] = IB_PHYSPORTSTATE_CFG_TRAIN, + [0x14] = IB_PHYSPORTSTATE_CFG_TRAIN, + [0x15] = IB_PHYSPORTSTATE_CFG_TRAIN, + [0x16] = IB_PHYSPORTSTATE_CFG_TRAIN, + [0x17] = IB_PHYSPORTSTATE_CFG_TRAIN +}; + +u32 ipath_get_cr_errpkey(struct ipath_devdata *dd) +{ + return ipath_read_creg32(dd, dd->ipath_cregs->cr_errpkey); +} + +static int ipath_query_port(struct ib_device *ibdev, + u8 port, struct ib_port_attr *props) +{ + struct ipath_ibdev *dev = to_idev(ibdev); + struct ipath_devdata *dd = dev->dd; + enum ib_mtu mtu; + u16 lid = dd->ipath_lid; + u64 ibcstat; + + memset(props, 0, sizeof(*props)); + props->lid = lid ? lid : be16_to_cpu(IB_LID_PERMISSIVE); + props->lmc = dd->ipath_lmc; + props->sm_lid = dev->sm_lid; + props->sm_sl = dev->sm_sl; + ibcstat = dd->ipath_lastibcstat; + /* map LinkState to IB portinfo values. */ + props->state = ipath_ib_linkstate(dd, ibcstat) + 1; + + /* See phys_state_show() */ + props->phys_state = /* MEA: assumes shift == 0 */ + ipath_cvt_physportstate[dd->ipath_lastibcstat & + dd->ibcs_lts_mask]; + props->port_cap_flags = dev->port_cap_flags; + props->gid_tbl_len = 1; + props->max_msg_sz = 0x80000000; + props->pkey_tbl_len = ipath_get_npkeys(dd); + props->bad_pkey_cntr = ipath_get_cr_errpkey(dd) - + dev->z_pkey_violations; + props->qkey_viol_cntr = dev->qkey_violations; + props->active_width = dd->ipath_link_width_active; + /* See rate_show() */ + props->active_speed = dd->ipath_link_speed_active; + props->max_vl_num = 1; /* VLCap = VL0 */ + props->init_type_reply = 0; + + props->max_mtu = ipath_mtu4096 ? IB_MTU_4096 : IB_MTU_2048; + switch (dd->ipath_ibmtu) { + case 4096: + mtu = IB_MTU_4096; + break; + case 2048: + mtu = IB_MTU_2048; + break; + case 1024: + mtu = IB_MTU_1024; + break; + case 512: + mtu = IB_MTU_512; + break; + case 256: + mtu = IB_MTU_256; + break; + default: + mtu = IB_MTU_2048; + } + props->active_mtu = mtu; + props->subnet_timeout = dev->subnet_timeout; + + return 0; +} + +static int ipath_modify_device(struct ib_device *device, + int device_modify_mask, + struct ib_device_modify *device_modify) +{ + int ret; + + if (device_modify_mask & ~(IB_DEVICE_MODIFY_SYS_IMAGE_GUID | + IB_DEVICE_MODIFY_NODE_DESC)) { + ret = -EOPNOTSUPP; + goto bail; + } + + if (device_modify_mask & IB_DEVICE_MODIFY_NODE_DESC) + memcpy(device->node_desc, device_modify->node_desc, 64); + + if (device_modify_mask & IB_DEVICE_MODIFY_SYS_IMAGE_GUID) + to_idev(device)->sys_image_guid = + cpu_to_be64(device_modify->sys_image_guid); + + ret = 0; + +bail: + return ret; +} + +static int ipath_modify_port(struct ib_device *ibdev, + u8 port, int port_modify_mask, + struct ib_port_modify *props) +{ + struct ipath_ibdev *dev = to_idev(ibdev); + + dev->port_cap_flags |= props->set_port_cap_mask; + dev->port_cap_flags &= ~props->clr_port_cap_mask; + if (port_modify_mask & IB_PORT_SHUTDOWN) + ipath_set_linkstate(dev->dd, IPATH_IB_LINKDOWN); + if (port_modify_mask & IB_PORT_RESET_QKEY_CNTR) + dev->qkey_violations = 0; + return 0; +} + +static int ipath_query_gid(struct ib_device *ibdev, u8 port, + int index, union ib_gid *gid) +{ + struct ipath_ibdev *dev = to_idev(ibdev); + int ret; + + if (index >= 1) { + ret = -EINVAL; + goto bail; + } + gid->global.subnet_prefix = dev->gid_prefix; + gid->global.interface_id = dev->dd->ipath_guid; + + ret = 0; + +bail: + return ret; +} + +static struct ib_pd *ipath_alloc_pd(struct ib_device *ibdev, + struct ib_ucontext *context, + struct ib_udata *udata) +{ + struct ipath_ibdev *dev = to_idev(ibdev); + struct ipath_pd *pd; + struct ib_pd *ret; + + /* + * This is actually totally arbitrary. Some correctness tests + * assume there's a maximum number of PDs that can be allocated. + * We don't actually have this limit, but we fail the test if + * we allow allocations of more than we report for this value. + */ + + pd = kmalloc(sizeof *pd, GFP_KERNEL); + if (!pd) { + ret = ERR_PTR(-ENOMEM); + goto bail; + } + + spin_lock(&dev->n_pds_lock); + if (dev->n_pds_allocated == ib_ipath_max_pds) { + spin_unlock(&dev->n_pds_lock); + kfree(pd); + ret = ERR_PTR(-ENOMEM); + goto bail; + } + + dev->n_pds_allocated++; + spin_unlock(&dev->n_pds_lock); + + /* ib_alloc_pd() will initialize pd->ibpd. */ + pd->user = udata != NULL; + + ret = &pd->ibpd; + +bail: + return ret; +} + +static int ipath_dealloc_pd(struct ib_pd *ibpd) +{ + struct ipath_pd *pd = to_ipd(ibpd); + struct ipath_ibdev *dev = to_idev(ibpd->device); + + spin_lock(&dev->n_pds_lock); + dev->n_pds_allocated--; + spin_unlock(&dev->n_pds_lock); + + kfree(pd); + + return 0; +} + +/** + * ipath_create_ah - create an address handle + * @pd: the protection domain + * @ah_attr: the attributes of the AH + * + * This may be called from interrupt context. + */ +static struct ib_ah *ipath_create_ah(struct ib_pd *pd, + struct ib_ah_attr *ah_attr) +{ + struct ipath_ah *ah; + struct ib_ah *ret; + struct ipath_ibdev *dev = to_idev(pd->device); + unsigned long flags; + + /* A multicast address requires a GRH (see ch. 8.4.1). */ + if (ah_attr->dlid >= IPATH_MULTICAST_LID_BASE && + ah_attr->dlid != IPATH_PERMISSIVE_LID && + !(ah_attr->ah_flags & IB_AH_GRH)) { + ret = ERR_PTR(-EINVAL); + goto bail; + } + + if (ah_attr->dlid == 0) { + ret = ERR_PTR(-EINVAL); + goto bail; + } + + if (ah_attr->port_num < 1 || + ah_attr->port_num > pd->device->phys_port_cnt) { + ret = ERR_PTR(-EINVAL); + goto bail; + } + + ah = kmalloc(sizeof *ah, GFP_ATOMIC); + if (!ah) { + ret = ERR_PTR(-ENOMEM); + goto bail; + } + + spin_lock_irqsave(&dev->n_ahs_lock, flags); + if (dev->n_ahs_allocated == ib_ipath_max_ahs) { + spin_unlock_irqrestore(&dev->n_ahs_lock, flags); + kfree(ah); + ret = ERR_PTR(-ENOMEM); + goto bail; + } + + dev->n_ahs_allocated++; + spin_unlock_irqrestore(&dev->n_ahs_lock, flags); + + /* ib_create_ah() will initialize ah->ibah. */ + ah->attr = *ah_attr; + ah->attr.static_rate = ipath_ib_rate_to_mult(ah_attr->static_rate); + + ret = &ah->ibah; + +bail: + return ret; +} + +/** + * ipath_destroy_ah - destroy an address handle + * @ibah: the AH to destroy + * + * This may be called from interrupt context. + */ +static int ipath_destroy_ah(struct ib_ah *ibah) +{ + struct ipath_ibdev *dev = to_idev(ibah->device); + struct ipath_ah *ah = to_iah(ibah); + unsigned long flags; + + spin_lock_irqsave(&dev->n_ahs_lock, flags); + dev->n_ahs_allocated--; + spin_unlock_irqrestore(&dev->n_ahs_lock, flags); + + kfree(ah); + + return 0; +} + +static int ipath_query_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr) +{ + struct ipath_ah *ah = to_iah(ibah); + + *ah_attr = ah->attr; + ah_attr->static_rate = ipath_mult_to_ib_rate(ah->attr.static_rate); + + return 0; +} + +/** + * ipath_get_npkeys - return the size of the PKEY table for port 0 + * @dd: the infinipath device + */ +unsigned ipath_get_npkeys(struct ipath_devdata *dd) +{ + return ARRAY_SIZE(dd->ipath_pd[0]->port_pkeys); +} + +/** + * ipath_get_pkey - return the indexed PKEY from the port PKEY table + * @dd: the infinipath device + * @index: the PKEY index + */ +unsigned ipath_get_pkey(struct ipath_devdata *dd, unsigned index) +{ + unsigned ret; + + /* always a kernel port, no locking needed */ + if (index >= ARRAY_SIZE(dd->ipath_pd[0]->port_pkeys)) + ret = 0; + else + ret = dd->ipath_pd[0]->port_pkeys[index]; + + return ret; +} + +static int ipath_query_pkey(struct ib_device *ibdev, u8 port, u16 index, + u16 *pkey) +{ + struct ipath_ibdev *dev = to_idev(ibdev); + int ret; + + if (index >= ipath_get_npkeys(dev->dd)) { + ret = -EINVAL; + goto bail; + } + + *pkey = ipath_get_pkey(dev->dd, index); + ret = 0; + +bail: + return ret; +} + +/** + * ipath_alloc_ucontext - allocate a ucontest + * @ibdev: the infiniband device + * @udata: not used by the InfiniPath driver + */ + +static struct ib_ucontext *ipath_alloc_ucontext(struct ib_device *ibdev, + struct ib_udata *udata) +{ + struct ipath_ucontext *context; + struct ib_ucontext *ret; + + context = kmalloc(sizeof *context, GFP_KERNEL); + if (!context) { + ret = ERR_PTR(-ENOMEM); + goto bail; + } + + ret = &context->ibucontext; + +bail: + return ret; +} + +static int ipath_dealloc_ucontext(struct ib_ucontext *context) +{ + kfree(to_iucontext(context)); + return 0; +} + +static int ipath_verbs_register_sysfs(struct ib_device *dev); + +static void __verbs_timer(unsigned long arg) +{ + struct ipath_devdata *dd = (struct ipath_devdata *) arg; + + /* Handle verbs layer timeouts. */ + ipath_ib_timer(dd->verbs_dev); + + mod_timer(&dd->verbs_timer, jiffies + 1); +} + +static int enable_timer(struct ipath_devdata *dd) +{ + /* + * Early chips had a design flaw where the chip and kernel idea + * of the tail register don't always agree, and therefore we won't + * get an interrupt on the next packet received. + * If the board supports per packet receive interrupts, use it. + * Otherwise, the timer function periodically checks for packets + * to cover this case. + * Either way, the timer is needed for verbs layer related + * processing. + */ + if (dd->ipath_flags & IPATH_GPIO_INTR) { + ipath_write_kreg(dd, dd->ipath_kregs->kr_debugportselect, + 0x2074076542310ULL); + /* Enable GPIO bit 2 interrupt */ + dd->ipath_gpio_mask |= (u64) (1 << IPATH_GPIO_PORT0_BIT); + ipath_write_kreg(dd, dd->ipath_kregs->kr_gpio_mask, + dd->ipath_gpio_mask); + } + + init_timer(&dd->verbs_timer); + dd->verbs_timer.function = __verbs_timer; + dd->verbs_timer.data = (unsigned long)dd; + dd->verbs_timer.expires = jiffies + 1; + add_timer(&dd->verbs_timer); + + return 0; +} + +static int disable_timer(struct ipath_devdata *dd) +{ + /* Disable GPIO bit 2 interrupt */ + if (dd->ipath_flags & IPATH_GPIO_INTR) { + /* Disable GPIO bit 2 interrupt */ + dd->ipath_gpio_mask &= ~((u64) (1 << IPATH_GPIO_PORT0_BIT)); + ipath_write_kreg(dd, dd->ipath_kregs->kr_gpio_mask, + dd->ipath_gpio_mask); + /* + * We might want to undo changes to debugportselect, + * but how? + */ + } + + del_timer_sync(&dd->verbs_timer); + + return 0; +} + +/** + * ipath_register_ib_device - register our device with the infiniband core + * @dd: the device data structure + * Return the allocated ipath_ibdev pointer or NULL on error. + */ +int ipath_register_ib_device(struct ipath_devdata *dd) +{ + struct ipath_verbs_counters cntrs; + struct ipath_ibdev *idev; + struct ib_device *dev; + struct ipath_verbs_txreq *tx; + unsigned i; + int ret; + + idev = (struct ipath_ibdev *)ib_alloc_device(sizeof *idev); + if (idev == NULL) { + ret = -ENOMEM; + goto bail; + } + + dev = &idev->ibdev; + + if (dd->ipath_sdma_descq_cnt) { + tx = kmalloc(dd->ipath_sdma_descq_cnt * sizeof *tx, + GFP_KERNEL); + if (tx == NULL) { + ret = -ENOMEM; + goto err_tx; + } + } else + tx = NULL; + idev->txreq_bufs = tx; + + /* Only need to initialize non-zero fields. */ + spin_lock_init(&idev->n_pds_lock); + spin_lock_init(&idev->n_ahs_lock); + spin_lock_init(&idev->n_cqs_lock); + spin_lock_init(&idev->n_qps_lock); + spin_lock_init(&idev->n_srqs_lock); + spin_lock_init(&idev->n_mcast_grps_lock); + + spin_lock_init(&idev->qp_table.lock); + spin_lock_init(&idev->lk_table.lock); + idev->sm_lid = __constant_be16_to_cpu(IB_LID_PERMISSIVE); + /* Set the prefix to the default value (see ch. 4.1.1) */ + idev->gid_prefix = __constant_cpu_to_be64(0xfe80000000000000ULL); + + ret = ipath_init_qp_table(idev, ib_ipath_qp_table_size); + if (ret) + goto err_qp; + + /* + * The top ib_ipath_lkey_table_size bits are used to index the + * table. The lower 8 bits can be owned by the user (copied from + * the LKEY). The remaining bits act as a generation number or tag. + */ + idev->lk_table.max = 1 << ib_ipath_lkey_table_size; + idev->lk_table.table = kzalloc(idev->lk_table.max * + sizeof(*idev->lk_table.table), + GFP_KERNEL); + if (idev->lk_table.table == NULL) { + ret = -ENOMEM; + goto err_lk; + } + INIT_LIST_HEAD(&idev->pending_mmaps); + spin_lock_init(&idev->pending_lock); + idev->mmap_offset = PAGE_SIZE; + spin_lock_init(&idev->mmap_offset_lock); + INIT_LIST_HEAD(&idev->pending[0]); + INIT_LIST_HEAD(&idev->pending[1]); + INIT_LIST_HEAD(&idev->pending[2]); + INIT_LIST_HEAD(&idev->piowait); + INIT_LIST_HEAD(&idev->rnrwait); + INIT_LIST_HEAD(&idev->txreq_free); + idev->pending_index = 0; + idev->port_cap_flags = + IB_PORT_SYS_IMAGE_GUID_SUP | IB_PORT_CLIENT_REG_SUP; + if (dd->ipath_flags & IPATH_HAS_LINK_LATENCY) + idev->port_cap_flags |= IB_PORT_LINK_LATENCY_SUP; + idev->pma_counter_select[0] = IB_PMA_PORT_XMIT_DATA; + idev->pma_counter_select[1] = IB_PMA_PORT_RCV_DATA; + idev->pma_counter_select[2] = IB_PMA_PORT_XMIT_PKTS; + idev->pma_counter_select[3] = IB_PMA_PORT_RCV_PKTS; + idev->pma_counter_select[4] = IB_PMA_PORT_XMIT_WAIT; + + /* Snapshot current HW counters to "clear" them. */ + ipath_get_counters(dd, &cntrs); + idev->z_symbol_error_counter = cntrs.symbol_error_counter; + idev->z_link_error_recovery_counter = + cntrs.link_error_recovery_counter; + idev->z_link_downed_counter = cntrs.link_downed_counter; + idev->z_port_rcv_errors = cntrs.port_rcv_errors; + idev->z_port_rcv_remphys_errors = + cntrs.port_rcv_remphys_errors; + idev->z_port_xmit_discards = cntrs.port_xmit_discards; + idev->z_port_xmit_data = cntrs.port_xmit_data; + idev->z_port_rcv_data = cntrs.port_rcv_data; + idev->z_port_xmit_packets = cntrs.port_xmit_packets; + idev->z_port_rcv_packets = cntrs.port_rcv_packets; + idev->z_local_link_integrity_errors = + cntrs.local_link_integrity_errors; + idev->z_excessive_buffer_overrun_errors = + cntrs.excessive_buffer_overrun_errors; + idev->z_vl15_dropped = cntrs.vl15_dropped; + + for (i = 0; i < dd->ipath_sdma_descq_cnt; i++, tx++) + list_add(&tx->txreq.list, &idev->txreq_free); + + /* + * The system image GUID is supposed to be the same for all + * IB HCAs in a single system but since there can be other + * device types in the system, we can't be sure this is unique. + */ + if (!sys_image_guid) + sys_image_guid = dd->ipath_guid; + idev->sys_image_guid = sys_image_guid; + idev->ib_unit = dd->ipath_unit; + idev->dd = dd; + + strlcpy(dev->name, "ipath%d", IB_DEVICE_NAME_MAX); + dev->owner = THIS_MODULE; + dev->node_guid = dd->ipath_guid; + dev->uverbs_abi_ver = IPATH_UVERBS_ABI_VERSION; + dev->uverbs_cmd_mask = + (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | + (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) | + (1ull << IB_USER_VERBS_CMD_QUERY_PORT) | + (1ull << IB_USER_VERBS_CMD_ALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_CREATE_AH) | + (1ull << IB_USER_VERBS_CMD_DESTROY_AH) | + (1ull << IB_USER_VERBS_CMD_QUERY_AH) | + (1ull << IB_USER_VERBS_CMD_REG_MR) | + (1ull << IB_USER_VERBS_CMD_DEREG_MR) | + (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | + (1ull << IB_USER_VERBS_CMD_CREATE_CQ) | + (1ull << IB_USER_VERBS_CMD_RESIZE_CQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) | + (1ull << IB_USER_VERBS_CMD_POLL_CQ) | + (1ull << IB_USER_VERBS_CMD_REQ_NOTIFY_CQ) | + (1ull << IB_USER_VERBS_CMD_CREATE_QP) | + (1ull << IB_USER_VERBS_CMD_QUERY_QP) | + (1ull << IB_USER_VERBS_CMD_MODIFY_QP) | + (1ull << IB_USER_VERBS_CMD_DESTROY_QP) | + (1ull << IB_USER_VERBS_CMD_POST_SEND) | + (1ull << IB_USER_VERBS_CMD_POST_RECV) | + (1ull << IB_USER_VERBS_CMD_ATTACH_MCAST) | + (1ull << IB_USER_VERBS_CMD_DETACH_MCAST) | + (1ull << IB_USER_VERBS_CMD_CREATE_SRQ) | + (1ull << IB_USER_VERBS_CMD_MODIFY_SRQ) | + (1ull << IB_USER_VERBS_CMD_QUERY_SRQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ) | + (1ull << IB_USER_VERBS_CMD_POST_SRQ_RECV); + dev->node_type = RDMA_NODE_IB_CA; + dev->phys_port_cnt = 1; + dev->num_comp_vectors = 1; + dev->dma_device = &dd->pcidev->dev; + dev->query_device = ipath_query_device; + dev->modify_device = ipath_modify_device; + dev->query_port = ipath_query_port; + dev->modify_port = ipath_modify_port; + dev->query_pkey = ipath_query_pkey; + dev->query_gid = ipath_query_gid; + dev->alloc_ucontext = ipath_alloc_ucontext; + dev->dealloc_ucontext = ipath_dealloc_ucontext; + dev->alloc_pd = ipath_alloc_pd; + dev->dealloc_pd = ipath_dealloc_pd; + dev->create_ah = ipath_create_ah; + dev->destroy_ah = ipath_destroy_ah; + dev->query_ah = ipath_query_ah; + dev->create_srq = ipath_create_srq; + dev->modify_srq = ipath_modify_srq; + dev->query_srq = ipath_query_srq; + dev->destroy_srq = ipath_destroy_srq; + dev->create_qp = ipath_create_qp; + dev->modify_qp = ipath_modify_qp; + dev->query_qp = ipath_query_qp; + dev->destroy_qp = ipath_destroy_qp; + dev->post_send = ipath_post_send; + dev->post_recv = ipath_post_receive; + dev->post_srq_recv = ipath_post_srq_receive; + dev->create_cq = ipath_create_cq; + dev->destroy_cq = ipath_destroy_cq; + dev->resize_cq = ipath_resize_cq; + dev->poll_cq = ipath_poll_cq; + dev->req_notify_cq = ipath_req_notify_cq; + dev->get_dma_mr = ipath_get_dma_mr; + dev->reg_phys_mr = ipath_reg_phys_mr; + dev->reg_user_mr = ipath_reg_user_mr; + dev->dereg_mr = ipath_dereg_mr; + dev->alloc_fmr = ipath_alloc_fmr; + dev->map_phys_fmr = ipath_map_phys_fmr; + dev->unmap_fmr = ipath_unmap_fmr; + dev->dealloc_fmr = ipath_dealloc_fmr; + dev->attach_mcast = ipath_multicast_attach; + dev->detach_mcast = ipath_multicast_detach; + dev->process_mad = ipath_process_mad; + dev->mmap = ipath_mmap; + dev->dma_ops = &ipath_dma_mapping_ops; + + snprintf(dev->node_desc, sizeof(dev->node_desc), + IPATH_IDSTR " %s", init_utsname()->nodename); + + ret = ib_register_device(dev, NULL); + if (ret) + goto err_reg; + + ret = ipath_verbs_register_sysfs(dev); + if (ret) + goto err_class; + + enable_timer(dd); + + goto bail; + +err_class: + ib_unregister_device(dev); +err_reg: + kfree(idev->lk_table.table); +err_lk: + kfree(idev->qp_table.table); +err_qp: + kfree(idev->txreq_bufs); +err_tx: + ib_dealloc_device(dev); + ipath_dev_err(dd, "cannot register verbs: %d!\n", -ret); + idev = NULL; + +bail: + dd->verbs_dev = idev; + return ret; +} + +void ipath_unregister_ib_device(struct ipath_ibdev *dev) +{ + struct ib_device *ibdev = &dev->ibdev; + u32 qps_inuse; + + ib_unregister_device(ibdev); + + disable_timer(dev->dd); + + if (!list_empty(&dev->pending[0]) || + !list_empty(&dev->pending[1]) || + !list_empty(&dev->pending[2])) + ipath_dev_err(dev->dd, "pending list not empty!\n"); + if (!list_empty(&dev->piowait)) + ipath_dev_err(dev->dd, "piowait list not empty!\n"); + if (!list_empty(&dev->rnrwait)) + ipath_dev_err(dev->dd, "rnrwait list not empty!\n"); + if (!ipath_mcast_tree_empty()) + ipath_dev_err(dev->dd, "multicast table memory leak!\n"); + /* + * Note that ipath_unregister_ib_device() can be called before all + * the QPs are destroyed! + */ + qps_inuse = ipath_free_all_qps(&dev->qp_table); + if (qps_inuse) + ipath_dev_err(dev->dd, "QP memory leak! %u still in use\n", + qps_inuse); + kfree(dev->qp_table.table); + kfree(dev->lk_table.table); + kfree(dev->txreq_bufs); + ib_dealloc_device(ibdev); +} + +static ssize_t show_rev(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct ipath_ibdev *dev = + container_of(device, struct ipath_ibdev, ibdev.dev); + + return sprintf(buf, "%x\n", dev->dd->ipath_pcirev); +} + +static ssize_t show_hca(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct ipath_ibdev *dev = + container_of(device, struct ipath_ibdev, ibdev.dev); + int ret; + + ret = dev->dd->ipath_f_get_boardname(dev->dd, buf, 128); + if (ret < 0) + goto bail; + strcat(buf, "\n"); + ret = strlen(buf); + +bail: + return ret; +} + +static ssize_t show_stats(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct ipath_ibdev *dev = + container_of(device, struct ipath_ibdev, ibdev.dev); + int i; + int len; + + len = sprintf(buf, + "RC resends %d\n" + "RC no QACK %d\n" + "RC ACKs %d\n" + "RC SEQ NAKs %d\n" + "RC RDMA seq %d\n" + "RC RNR NAKs %d\n" + "RC OTH NAKs %d\n" + "RC timeouts %d\n" + "RC RDMA dup %d\n" + "piobuf wait %d\n" + "unaligned %d\n" + "PKT drops %d\n" + "WQE errs %d\n", + dev->n_rc_resends, dev->n_rc_qacks, dev->n_rc_acks, + dev->n_seq_naks, dev->n_rdma_seq, dev->n_rnr_naks, + dev->n_other_naks, dev->n_timeouts, + dev->n_rdma_dup_busy, dev->n_piowait, dev->n_unaligned, + dev->n_pkt_drops, dev->n_wqe_errs); + for (i = 0; i < ARRAY_SIZE(dev->opstats); i++) { + const struct ipath_opcode_stats *si = &dev->opstats[i]; + + if (!si->n_packets && !si->n_bytes) + continue; + len += sprintf(buf + len, "%02x %llu/%llu\n", i, + (unsigned long long) si->n_packets, + (unsigned long long) si->n_bytes); + } + return len; +} + +static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL); +static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL); +static DEVICE_ATTR(board_id, S_IRUGO, show_hca, NULL); +static DEVICE_ATTR(stats, S_IRUGO, show_stats, NULL); + +static struct device_attribute *ipath_class_attributes[] = { + &dev_attr_hw_rev, + &dev_attr_hca_type, + &dev_attr_board_id, + &dev_attr_stats +}; + +static int ipath_verbs_register_sysfs(struct ib_device *dev) +{ + int i; + int ret; + + for (i = 0; i < ARRAY_SIZE(ipath_class_attributes); ++i) { + ret = device_create_file(&dev->dev, + ipath_class_attributes[i]); + if (ret) + goto bail; + } + return 0; +bail: + for (i = 0; i < ARRAY_SIZE(ipath_class_attributes); ++i) + device_remove_file(&dev->dev, ipath_class_attributes[i]); + return ret; +} diff --git a/drivers/infiniband/hw/ipath/ipath_verbs.h b/drivers/infiniband/hw/ipath/ipath_verbs.h new file mode 100644 index 0000000..ae6cff4 --- /dev/null +++ b/drivers/infiniband/hw/ipath/ipath_verbs.h @@ -0,0 +1,936 @@ +/* + * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved. + * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef IPATH_VERBS_H +#define IPATH_VERBS_H + +#include +#include +#include +#include +#include +#include +#include + +#include "ipath_kernel.h" + +#define IPATH_MAX_RDMA_ATOMIC 4 + +#define QPN_MAX (1 << 24) +#define QPNMAP_ENTRIES (QPN_MAX / PAGE_SIZE / BITS_PER_BYTE) + +/* + * Increment this value if any changes that break userspace ABI + * compatibility are made. + */ +#define IPATH_UVERBS_ABI_VERSION 2 + +/* + * Define an ib_cq_notify value that is not valid so we know when CQ + * notifications are armed. + */ +#define IB_CQ_NONE (IB_CQ_NEXT_COMP + 1) + +/* AETH NAK opcode values */ +#define IB_RNR_NAK 0x20 +#define IB_NAK_PSN_ERROR 0x60 +#define IB_NAK_INVALID_REQUEST 0x61 +#define IB_NAK_REMOTE_ACCESS_ERROR 0x62 +#define IB_NAK_REMOTE_OPERATIONAL_ERROR 0x63 +#define IB_NAK_INVALID_RD_REQUEST 0x64 + +/* Flags for checking QP state (see ib_ipath_state_ops[]) */ +#define IPATH_POST_SEND_OK 0x01 +#define IPATH_POST_RECV_OK 0x02 +#define IPATH_PROCESS_RECV_OK 0x04 +#define IPATH_PROCESS_SEND_OK 0x08 +#define IPATH_PROCESS_NEXT_SEND_OK 0x10 +#define IPATH_FLUSH_SEND 0x20 +#define IPATH_FLUSH_RECV 0x40 +#define IPATH_PROCESS_OR_FLUSH_SEND \ + (IPATH_PROCESS_SEND_OK | IPATH_FLUSH_SEND) + +/* IB Performance Manager status values */ +#define IB_PMA_SAMPLE_STATUS_DONE 0x00 +#define IB_PMA_SAMPLE_STATUS_STARTED 0x01 +#define IB_PMA_SAMPLE_STATUS_RUNNING 0x02 + +/* Mandatory IB performance counter select values. */ +#define IB_PMA_PORT_XMIT_DATA cpu_to_be16(0x0001) +#define IB_PMA_PORT_RCV_DATA cpu_to_be16(0x0002) +#define IB_PMA_PORT_XMIT_PKTS cpu_to_be16(0x0003) +#define IB_PMA_PORT_RCV_PKTS cpu_to_be16(0x0004) +#define IB_PMA_PORT_XMIT_WAIT cpu_to_be16(0x0005) + +struct ib_reth { + __be64 vaddr; + __be32 rkey; + __be32 length; +} __attribute__ ((packed)); + +struct ib_atomic_eth { + __be32 vaddr[2]; /* unaligned so access as 2 32-bit words */ + __be32 rkey; + __be64 swap_data; + __be64 compare_data; +} __attribute__ ((packed)); + +struct ipath_other_headers { + __be32 bth[3]; + union { + struct { + __be32 deth[2]; + __be32 imm_data; + } ud; + struct { + struct ib_reth reth; + __be32 imm_data; + } rc; + struct { + __be32 aeth; + __be32 atomic_ack_eth[2]; + } at; + __be32 imm_data; + __be32 aeth; + struct ib_atomic_eth atomic_eth; + } u; +} __attribute__ ((packed)); + +/* + * Note that UD packets with a GRH header are 8+40+12+8 = 68 bytes + * long (72 w/ imm_data). Only the first 56 bytes of the IB header + * will be in the eager header buffer. The remaining 12 or 16 bytes + * are in the data buffer. + */ +struct ipath_ib_header { + __be16 lrh[4]; + union { + struct { + struct ib_grh grh; + struct ipath_other_headers oth; + } l; + struct ipath_other_headers oth; + } u; +} __attribute__ ((packed)); + +struct ipath_pio_header { + __le32 pbc[2]; + struct ipath_ib_header hdr; +} __attribute__ ((packed)); + +/* + * There is one struct ipath_mcast for each multicast GID. + * All attached QPs are then stored as a list of + * struct ipath_mcast_qp. + */ +struct ipath_mcast_qp { + struct list_head list; + struct ipath_qp *qp; +}; + +struct ipath_mcast { + struct rb_node rb_node; + union ib_gid mgid; + struct list_head qp_list; + wait_queue_head_t wait; + atomic_t refcount; + int n_attached; +}; + +/* Protection domain */ +struct ipath_pd { + struct ib_pd ibpd; + int user; /* non-zero if created from user space */ +}; + +/* Address Handle */ +struct ipath_ah { + struct ib_ah ibah; + struct ib_ah_attr attr; +}; + +/* + * This structure is used by ipath_mmap() to validate an offset + * when an mmap() request is made. The vm_area_struct then uses + * this as its vm_private_data. + */ +struct ipath_mmap_info { + struct list_head pending_mmaps; + struct ib_ucontext *context; + void *obj; + __u64 offset; + struct kref ref; + unsigned size; +}; + +/* + * This structure is used to contain the head pointer, tail pointer, + * and completion queue entries as a single memory allocation so + * it can be mmap'ed into user space. + */ +struct ipath_cq_wc { + u32 head; /* index of next entry to fill */ + u32 tail; /* index of next ib_poll_cq() entry */ + union { + /* these are actually size ibcq.cqe + 1 */ + struct ib_uverbs_wc uqueue[0]; + struct ib_wc kqueue[0]; + }; +}; + +/* + * The completion queue structure. + */ +struct ipath_cq { + struct ib_cq ibcq; + struct tasklet_struct comptask; + spinlock_t lock; + u8 notify; + u8 triggered; + struct ipath_cq_wc *queue; + struct ipath_mmap_info *ip; +}; + +/* + * A segment is a linear region of low physical memory. + * XXX Maybe we should use phys addr here and kmap()/kunmap(). + * Used by the verbs layer. + */ +struct ipath_seg { + void *vaddr; + size_t length; +}; + +/* The number of ipath_segs that fit in a page. */ +#define IPATH_SEGSZ (PAGE_SIZE / sizeof (struct ipath_seg)) + +struct ipath_segarray { + struct ipath_seg segs[IPATH_SEGSZ]; +}; + +struct ipath_mregion { + struct ib_pd *pd; /* shares refcnt of ibmr.pd */ + u64 user_base; /* User's address for this region */ + u64 iova; /* IB start address of this region */ + size_t length; + u32 lkey; + u32 offset; /* offset (bytes) to start of region */ + int access_flags; + u32 max_segs; /* number of ipath_segs in all the arrays */ + u32 mapsz; /* size of the map array */ + struct ipath_segarray *map[0]; /* the segments */ +}; + +/* + * These keep track of the copy progress within a memory region. + * Used by the verbs layer. + */ +struct ipath_sge { + struct ipath_mregion *mr; + void *vaddr; /* kernel virtual address of segment */ + u32 sge_length; /* length of the SGE */ + u32 length; /* remaining length of the segment */ + u16 m; /* current index: mr->map[m] */ + u16 n; /* current index: mr->map[m]->segs[n] */ +}; + +/* Memory region */ +struct ipath_mr { + struct ib_mr ibmr; + struct ib_umem *umem; + struct ipath_mregion mr; /* must be last */ +}; + +/* + * Send work request queue entry. + * The size of the sg_list is determined when the QP is created and stored + * in qp->s_max_sge. + */ +struct ipath_swqe { + struct ib_send_wr wr; /* don't use wr.sg_list */ + u32 psn; /* first packet sequence number */ + u32 lpsn; /* last packet sequence number */ + u32 ssn; /* send sequence number */ + u32 length; /* total length of data in sg_list */ + struct ipath_sge sg_list[0]; +}; + +/* + * Receive work request queue entry. + * The size of the sg_list is determined when the QP (or SRQ) is created + * and stored in qp->r_rq.max_sge (or srq->rq.max_sge). + */ +struct ipath_rwqe { + u64 wr_id; + u8 num_sge; + struct ib_sge sg_list[0]; +}; + +/* + * This structure is used to contain the head pointer, tail pointer, + * and receive work queue entries as a single memory allocation so + * it can be mmap'ed into user space. + * Note that the wq array elements are variable size so you can't + * just index into the array to get the N'th element; + * use get_rwqe_ptr() instead. + */ +struct ipath_rwq { + u32 head; /* new work requests posted to the head */ + u32 tail; /* receives pull requests from here. */ + struct ipath_rwqe wq[0]; +}; + +struct ipath_rq { + struct ipath_rwq *wq; + spinlock_t lock; + u32 size; /* size of RWQE array */ + u8 max_sge; +}; + +struct ipath_srq { + struct ib_srq ibsrq; + struct ipath_rq rq; + struct ipath_mmap_info *ip; + /* send signal when number of RWQEs < limit */ + u32 limit; +}; + +struct ipath_sge_state { + struct ipath_sge *sg_list; /* next SGE to be used if any */ + struct ipath_sge sge; /* progress state for the current SGE */ + u8 num_sge; + u8 static_rate; +}; + +/* + * This structure holds the information that the send tasklet needs + * to send a RDMA read response or atomic operation. + */ +struct ipath_ack_entry { + u8 opcode; + u8 sent; + u32 psn; + union { + struct ipath_sge_state rdma_sge; + u64 atomic_data; + }; +}; + +/* + * Variables prefixed with s_ are for the requester (sender). + * Variables prefixed with r_ are for the responder (receiver). + * Variables prefixed with ack_ are for responder replies. + * + * Common variables are protected by both r_rq.lock and s_lock in that order + * which only happens in modify_qp() or changing the QP 'state'. + */ +struct ipath_qp { + struct ib_qp ibqp; + struct ipath_qp *next; /* link list for QPN hash table */ + struct ipath_qp *timer_next; /* link list for ipath_ib_timer() */ + struct ipath_qp *pio_next; /* link for ipath_ib_piobufavail() */ + struct list_head piowait; /* link for wait PIO buf */ + struct list_head timerwait; /* link for waiting for timeouts */ + struct ib_ah_attr remote_ah_attr; + struct ipath_ib_header s_hdr; /* next packet header to send */ + atomic_t refcount; + wait_queue_head_t wait; + wait_queue_head_t wait_dma; + struct tasklet_struct s_task; + struct ipath_mmap_info *ip; + struct ipath_sge_state *s_cur_sge; + struct ipath_verbs_txreq *s_tx; + struct ipath_sge_state s_sge; /* current send request data */ + struct ipath_ack_entry s_ack_queue[IPATH_MAX_RDMA_ATOMIC + 1]; + struct ipath_sge_state s_ack_rdma_sge; + struct ipath_sge_state s_rdma_read_sge; + struct ipath_sge_state r_sge; /* current receive data */ + spinlock_t s_lock; + atomic_t s_dma_busy; + u16 s_pkt_delay; + u16 s_hdrwords; /* size of s_hdr in 32 bit words */ + u32 s_cur_size; /* size of send packet in bytes */ + u32 s_len; /* total length of s_sge */ + u32 s_rdma_read_len; /* total length of s_rdma_read_sge */ + u32 s_next_psn; /* PSN for next request */ + u32 s_last_psn; /* last response PSN processed */ + u32 s_psn; /* current packet sequence number */ + u32 s_ack_rdma_psn; /* PSN for sending RDMA read responses */ + u32 s_ack_psn; /* PSN for acking sends and RDMA writes */ + u32 s_rnr_timeout; /* number of milliseconds for RNR timeout */ + u32 r_ack_psn; /* PSN for next ACK or atomic ACK */ + u64 r_wr_id; /* ID for current receive WQE */ + unsigned long r_aflags; + u32 r_len; /* total length of r_sge */ + u32 r_rcv_len; /* receive data len processed */ + u32 r_psn; /* expected rcv packet sequence number */ + u32 r_msn; /* message sequence number */ + u8 state; /* QP state */ + u8 s_state; /* opcode of last packet sent */ + u8 s_ack_state; /* opcode of packet to ACK */ + u8 s_nak_state; /* non-zero if NAK is pending */ + u8 r_state; /* opcode of last packet received */ + u8 r_nak_state; /* non-zero if NAK is pending */ + u8 r_min_rnr_timer; /* retry timeout value for RNR NAKs */ + u8 r_flags; + u8 r_max_rd_atomic; /* max number of RDMA read/atomic to receive */ + u8 r_head_ack_queue; /* index into s_ack_queue[] */ + u8 qp_access_flags; + u8 s_max_sge; /* size of s_wq->sg_list */ + u8 s_retry_cnt; /* number of times to retry */ + u8 s_rnr_retry_cnt; + u8 s_retry; /* requester retry counter */ + u8 s_rnr_retry; /* requester RNR retry counter */ + u8 s_pkey_index; /* PKEY index to use */ + u8 s_max_rd_atomic; /* max number of RDMA read/atomic to send */ + u8 s_num_rd_atomic; /* number of RDMA read/atomic pending */ + u8 s_tail_ack_queue; /* index into s_ack_queue[] */ + u8 s_flags; + u8 s_dmult; + u8 s_draining; + u8 timeout; /* Timeout for this QP */ + enum ib_mtu path_mtu; + u32 remote_qpn; + u32 qkey; /* QKEY for this QP (for UD or RD) */ + u32 s_size; /* send work queue size */ + u32 s_head; /* new entries added here */ + u32 s_tail; /* next entry to process */ + u32 s_cur; /* current work queue entry */ + u32 s_last; /* last un-ACK'ed entry */ + u32 s_ssn; /* SSN of tail entry */ + u32 s_lsn; /* limit sequence number (credit) */ + struct ipath_swqe *s_wq; /* send work queue */ + struct ipath_swqe *s_wqe; + struct ipath_sge *r_ud_sg_list; + struct ipath_rq r_rq; /* receive work queue */ + struct ipath_sge r_sg_list[0]; /* verified SGEs */ +}; + +/* + * Atomic bit definitions for r_aflags. + */ +#define IPATH_R_WRID_VALID 0 + +/* + * Bit definitions for r_flags. + */ +#define IPATH_R_REUSE_SGE 0x01 +#define IPATH_R_RDMAR_SEQ 0x02 + +/* + * Bit definitions for s_flags. + * + * IPATH_S_FENCE_PENDING - waiting for all prior RDMA read or atomic SWQEs + * before processing the next SWQE + * IPATH_S_RDMAR_PENDING - waiting for any RDMA read or atomic SWQEs + * before processing the next SWQE + * IPATH_S_WAITING - waiting for RNR timeout or send buffer available. + * IPATH_S_WAIT_SSN_CREDIT - waiting for RC credits to process next SWQE + * IPATH_S_WAIT_DMA - waiting for send DMA queue to drain before generating + * next send completion entry not via send DMA. + */ +#define IPATH_S_SIGNAL_REQ_WR 0x01 +#define IPATH_S_FENCE_PENDING 0x02 +#define IPATH_S_RDMAR_PENDING 0x04 +#define IPATH_S_ACK_PENDING 0x08 +#define IPATH_S_BUSY 0x10 +#define IPATH_S_WAITING 0x20 +#define IPATH_S_WAIT_SSN_CREDIT 0x40 +#define IPATH_S_WAIT_DMA 0x80 + +#define IPATH_S_ANY_WAIT (IPATH_S_FENCE_PENDING | IPATH_S_RDMAR_PENDING | \ + IPATH_S_WAITING | IPATH_S_WAIT_SSN_CREDIT | IPATH_S_WAIT_DMA) + +#define IPATH_PSN_CREDIT 512 + +/* + * Since struct ipath_swqe is not a fixed size, we can't simply index into + * struct ipath_qp.s_wq. This function does the array index computation. + */ +static inline struct ipath_swqe *get_swqe_ptr(struct ipath_qp *qp, + unsigned n) +{ + return (struct ipath_swqe *)((char *)qp->s_wq + + (sizeof(struct ipath_swqe) + + qp->s_max_sge * + sizeof(struct ipath_sge)) * n); +} + +/* + * Since struct ipath_rwqe is not a fixed size, we can't simply index into + * struct ipath_rwq.wq. This function does the array index computation. + */ +static inline struct ipath_rwqe *get_rwqe_ptr(struct ipath_rq *rq, + unsigned n) +{ + return (struct ipath_rwqe *) + ((char *) rq->wq->wq + + (sizeof(struct ipath_rwqe) + + rq->max_sge * sizeof(struct ib_sge)) * n); +} + +/* + * QPN-map pages start out as NULL, they get allocated upon + * first use and are never deallocated. This way, + * large bitmaps are not allocated unless large numbers of QPs are used. + */ +struct qpn_map { + atomic_t n_free; + void *page; +}; + +struct ipath_qp_table { + spinlock_t lock; + u32 last; /* last QP number allocated */ + u32 max; /* size of the hash table */ + u32 nmaps; /* size of the map table */ + struct ipath_qp **table; + /* bit map of free numbers */ + struct qpn_map map[QPNMAP_ENTRIES]; +}; + +struct ipath_lkey_table { + spinlock_t lock; + u32 next; /* next unused index (speeds search) */ + u32 gen; /* generation count */ + u32 max; /* size of the table */ + struct ipath_mregion **table; +}; + +struct ipath_opcode_stats { + u64 n_packets; /* number of packets */ + u64 n_bytes; /* total number of bytes */ +}; + +struct ipath_ibdev { + struct ib_device ibdev; + struct ipath_devdata *dd; + struct list_head pending_mmaps; + spinlock_t mmap_offset_lock; + u32 mmap_offset; + int ib_unit; /* This is the device number */ + u16 sm_lid; /* in host order */ + u8 sm_sl; + u8 mkeyprot; + /* non-zero when timer is set */ + unsigned long mkey_lease_timeout; + + /* The following fields are really per port. */ + struct ipath_qp_table qp_table; + struct ipath_lkey_table lk_table; + struct list_head pending[3]; /* FIFO of QPs waiting for ACKs */ + struct list_head piowait; /* list for wait PIO buf */ + struct list_head txreq_free; + void *txreq_bufs; + /* list of QPs waiting for RNR timer */ + struct list_head rnrwait; + spinlock_t pending_lock; + __be64 sys_image_guid; /* in network order */ + __be64 gid_prefix; /* in network order */ + __be64 mkey; + + u32 n_pds_allocated; /* number of PDs allocated for device */ + spinlock_t n_pds_lock; + u32 n_ahs_allocated; /* number of AHs allocated for device */ + spinlock_t n_ahs_lock; + u32 n_cqs_allocated; /* number of CQs allocated for device */ + spinlock_t n_cqs_lock; + u32 n_qps_allocated; /* number of QPs allocated for device */ + spinlock_t n_qps_lock; + u32 n_srqs_allocated; /* number of SRQs allocated for device */ + spinlock_t n_srqs_lock; + u32 n_mcast_grps_allocated; /* number of mcast groups allocated */ + spinlock_t n_mcast_grps_lock; + + u64 ipath_sword; /* total dwords sent (sample result) */ + u64 ipath_rword; /* total dwords received (sample result) */ + u64 ipath_spkts; /* total packets sent (sample result) */ + u64 ipath_rpkts; /* total packets received (sample result) */ + /* # of ticks no data sent (sample result) */ + u64 ipath_xmit_wait; + u64 rcv_errors; /* # of packets with SW detected rcv errs */ + u64 n_unicast_xmit; /* total unicast packets sent */ + u64 n_unicast_rcv; /* total unicast packets received */ + u64 n_multicast_xmit; /* total multicast packets sent */ + u64 n_multicast_rcv; /* total multicast packets received */ + u64 z_symbol_error_counter; /* starting count for PMA */ + u64 z_link_error_recovery_counter; /* starting count for PMA */ + u64 z_link_downed_counter; /* starting count for PMA */ + u64 z_port_rcv_errors; /* starting count for PMA */ + u64 z_port_rcv_remphys_errors; /* starting count for PMA */ + u64 z_port_xmit_discards; /* starting count for PMA */ + u64 z_port_xmit_data; /* starting count for PMA */ + u64 z_port_rcv_data; /* starting count for PMA */ + u64 z_port_xmit_packets; /* starting count for PMA */ + u64 z_port_rcv_packets; /* starting count for PMA */ + u32 z_pkey_violations; /* starting count for PMA */ + u32 z_local_link_integrity_errors; /* starting count for PMA */ + u32 z_excessive_buffer_overrun_errors; /* starting count for PMA */ + u32 z_vl15_dropped; /* starting count for PMA */ + u32 n_rc_resends; + u32 n_rc_acks; + u32 n_rc_qacks; + u32 n_seq_naks; + u32 n_rdma_seq; + u32 n_rnr_naks; + u32 n_other_naks; + u32 n_timeouts; + u32 n_pkt_drops; + u32 n_vl15_dropped; + u32 n_wqe_errs; + u32 n_rdma_dup_busy; + u32 n_piowait; + u32 n_unaligned; + u32 port_cap_flags; + u32 pma_sample_start; + u32 pma_sample_interval; + __be16 pma_counter_select[5]; + u16 pma_tag; + u16 qkey_violations; + u16 mkey_violations; + u16 mkey_lease_period; + u16 pending_index; /* which pending queue is active */ + u8 pma_sample_status; + u8 subnet_timeout; + u8 vl_high_limit; + struct ipath_opcode_stats opstats[128]; +}; + +struct ipath_verbs_counters { + u64 symbol_error_counter; + u64 link_error_recovery_counter; + u64 link_downed_counter; + u64 port_rcv_errors; + u64 port_rcv_remphys_errors; + u64 port_xmit_discards; + u64 port_xmit_data; + u64 port_rcv_data; + u64 port_xmit_packets; + u64 port_rcv_packets; + u32 local_link_integrity_errors; + u32 excessive_buffer_overrun_errors; + u32 vl15_dropped; +}; + +struct ipath_verbs_txreq { + struct ipath_qp *qp; + struct ipath_swqe *wqe; + u32 map_len; + u32 len; + struct ipath_sge_state *ss; + struct ipath_pio_header hdr; + struct ipath_sdma_txreq txreq; +}; + +static inline struct ipath_mr *to_imr(struct ib_mr *ibmr) +{ + return container_of(ibmr, struct ipath_mr, ibmr); +} + +static inline struct ipath_pd *to_ipd(struct ib_pd *ibpd) +{ + return container_of(ibpd, struct ipath_pd, ibpd); +} + +static inline struct ipath_ah *to_iah(struct ib_ah *ibah) +{ + return container_of(ibah, struct ipath_ah, ibah); +} + +static inline struct ipath_cq *to_icq(struct ib_cq *ibcq) +{ + return container_of(ibcq, struct ipath_cq, ibcq); +} + +static inline struct ipath_srq *to_isrq(struct ib_srq *ibsrq) +{ + return container_of(ibsrq, struct ipath_srq, ibsrq); +} + +static inline struct ipath_qp *to_iqp(struct ib_qp *ibqp) +{ + return container_of(ibqp, struct ipath_qp, ibqp); +} + +static inline struct ipath_ibdev *to_idev(struct ib_device *ibdev) +{ + return container_of(ibdev, struct ipath_ibdev, ibdev); +} + +/* + * This must be called with s_lock held. + */ +static inline void ipath_schedule_send(struct ipath_qp *qp) +{ + if (qp->s_flags & IPATH_S_ANY_WAIT) + qp->s_flags &= ~IPATH_S_ANY_WAIT; + if (!(qp->s_flags & IPATH_S_BUSY)) + tasklet_hi_schedule(&qp->s_task); +} + +int ipath_process_mad(struct ib_device *ibdev, + int mad_flags, + u8 port_num, + struct ib_wc *in_wc, + struct ib_grh *in_grh, + struct ib_mad *in_mad, struct ib_mad *out_mad); + +/* + * Compare the lower 24 bits of the two values. + * Returns an integer <, ==, or > than zero. + */ +static inline int ipath_cmp24(u32 a, u32 b) +{ + return (((int) a) - ((int) b)) << 8; +} + +struct ipath_mcast *ipath_mcast_find(union ib_gid *mgid); + +int ipath_snapshot_counters(struct ipath_devdata *dd, u64 *swords, + u64 *rwords, u64 *spkts, u64 *rpkts, + u64 *xmit_wait); + +int ipath_get_counters(struct ipath_devdata *dd, + struct ipath_verbs_counters *cntrs); + +int ipath_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid); + +int ipath_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid); + +int ipath_mcast_tree_empty(void); + +__be32 ipath_compute_aeth(struct ipath_qp *qp); + +struct ipath_qp *ipath_lookup_qpn(struct ipath_qp_table *qpt, u32 qpn); + +struct ib_qp *ipath_create_qp(struct ib_pd *ibpd, + struct ib_qp_init_attr *init_attr, + struct ib_udata *udata); + +int ipath_destroy_qp(struct ib_qp *ibqp); + +int ipath_error_qp(struct ipath_qp *qp, enum ib_wc_status err); + +int ipath_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata); + +int ipath_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_qp_init_attr *init_attr); + +unsigned ipath_free_all_qps(struct ipath_qp_table *qpt); + +int ipath_init_qp_table(struct ipath_ibdev *idev, int size); + +void ipath_get_credit(struct ipath_qp *qp, u32 aeth); + +unsigned ipath_ib_rate_to_mult(enum ib_rate rate); + +int ipath_verbs_send(struct ipath_qp *qp, struct ipath_ib_header *hdr, + u32 hdrwords, struct ipath_sge_state *ss, u32 len); + +void ipath_copy_sge(struct ipath_sge_state *ss, void *data, u32 length); + +void ipath_skip_sge(struct ipath_sge_state *ss, u32 length); + +void ipath_uc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, + int has_grh, void *data, u32 tlen, struct ipath_qp *qp); + +void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, + int has_grh, void *data, u32 tlen, struct ipath_qp *qp); + +void ipath_restart_rc(struct ipath_qp *qp, u32 psn); + +void ipath_rc_error(struct ipath_qp *qp, enum ib_wc_status err); + +int ipath_post_ud_send(struct ipath_qp *qp, struct ib_send_wr *wr); + +void ipath_ud_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, + int has_grh, void *data, u32 tlen, struct ipath_qp *qp); + +int ipath_alloc_lkey(struct ipath_lkey_table *rkt, + struct ipath_mregion *mr); + +void ipath_free_lkey(struct ipath_lkey_table *rkt, u32 lkey); + +int ipath_lkey_ok(struct ipath_qp *qp, struct ipath_sge *isge, + struct ib_sge *sge, int acc); + +int ipath_rkey_ok(struct ipath_qp *qp, struct ipath_sge_state *ss, + u32 len, u64 vaddr, u32 rkey, int acc); + +int ipath_post_srq_receive(struct ib_srq *ibsrq, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr); + +struct ib_srq *ipath_create_srq(struct ib_pd *ibpd, + struct ib_srq_init_attr *srq_init_attr, + struct ib_udata *udata); + +int ipath_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, + enum ib_srq_attr_mask attr_mask, + struct ib_udata *udata); + +int ipath_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr); + +int ipath_destroy_srq(struct ib_srq *ibsrq); + +void ipath_cq_enter(struct ipath_cq *cq, struct ib_wc *entry, int sig); + +int ipath_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry); + +struct ib_cq *ipath_create_cq(struct ib_device *ibdev, int entries, int comp_vector, + struct ib_ucontext *context, + struct ib_udata *udata); + +int ipath_destroy_cq(struct ib_cq *ibcq); + +int ipath_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags notify_flags); + +int ipath_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata); + +struct ib_mr *ipath_get_dma_mr(struct ib_pd *pd, int acc); + +struct ib_mr *ipath_reg_phys_mr(struct ib_pd *pd, + struct ib_phys_buf *buffer_list, + int num_phys_buf, int acc, u64 *iova_start); + +struct ib_mr *ipath_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, + u64 virt_addr, int mr_access_flags, + struct ib_udata *udata); + +int ipath_dereg_mr(struct ib_mr *ibmr); + +struct ib_fmr *ipath_alloc_fmr(struct ib_pd *pd, int mr_access_flags, + struct ib_fmr_attr *fmr_attr); + +int ipath_map_phys_fmr(struct ib_fmr *ibfmr, u64 * page_list, + int list_len, u64 iova); + +int ipath_unmap_fmr(struct list_head *fmr_list); + +int ipath_dealloc_fmr(struct ib_fmr *ibfmr); + +void ipath_release_mmap_info(struct kref *ref); + +struct ipath_mmap_info *ipath_create_mmap_info(struct ipath_ibdev *dev, + u32 size, + struct ib_ucontext *context, + void *obj); + +void ipath_update_mmap_info(struct ipath_ibdev *dev, + struct ipath_mmap_info *ip, + u32 size, void *obj); + +int ipath_mmap(struct ib_ucontext *context, struct vm_area_struct *vma); + +void ipath_insert_rnr_queue(struct ipath_qp *qp); + +int ipath_init_sge(struct ipath_qp *qp, struct ipath_rwqe *wqe, + u32 *lengthp, struct ipath_sge_state *ss); + +int ipath_get_rwqe(struct ipath_qp *qp, int wr_id_only); + +u32 ipath_make_grh(struct ipath_ibdev *dev, struct ib_grh *hdr, + struct ib_global_route *grh, u32 hwords, u32 nwords); + +void ipath_make_ruc_header(struct ipath_ibdev *dev, struct ipath_qp *qp, + struct ipath_other_headers *ohdr, + u32 bth0, u32 bth2); + +void ipath_do_send(unsigned long data); + +void ipath_send_complete(struct ipath_qp *qp, struct ipath_swqe *wqe, + enum ib_wc_status status); + +int ipath_make_rc_req(struct ipath_qp *qp); + +int ipath_make_uc_req(struct ipath_qp *qp); + +int ipath_make_ud_req(struct ipath_qp *qp); + +int ipath_register_ib_device(struct ipath_devdata *); + +void ipath_unregister_ib_device(struct ipath_ibdev *); + +void ipath_ib_rcv(struct ipath_ibdev *, void *, void *, u32); + +int ipath_ib_piobufavail(struct ipath_ibdev *); + +unsigned ipath_get_npkeys(struct ipath_devdata *); + +u32 ipath_get_cr_errpkey(struct ipath_devdata *); + +unsigned ipath_get_pkey(struct ipath_devdata *, unsigned); + +extern const enum ib_wc_opcode ib_ipath_wc_opcode[]; + +/* + * Below converts HCA-specific LinkTrainingState to IB PhysPortState + * values. + */ +extern const u8 ipath_cvt_physportstate[]; +#define IB_PHYSPORTSTATE_SLEEP 1 +#define IB_PHYSPORTSTATE_POLL 2 +#define IB_PHYSPORTSTATE_DISABLED 3 +#define IB_PHYSPORTSTATE_CFG_TRAIN 4 +#define IB_PHYSPORTSTATE_LINKUP 5 +#define IB_PHYSPORTSTATE_LINK_ERR_RECOVER 6 + +extern const int ib_ipath_state_ops[]; + +extern unsigned int ib_ipath_lkey_table_size; + +extern unsigned int ib_ipath_max_cqes; + +extern unsigned int ib_ipath_max_cqs; + +extern unsigned int ib_ipath_max_qp_wrs; + +extern unsigned int ib_ipath_max_qps; + +extern unsigned int ib_ipath_max_sges; + +extern unsigned int ib_ipath_max_mcast_grps; + +extern unsigned int ib_ipath_max_mcast_qp_attached; + +extern unsigned int ib_ipath_max_srqs; + +extern unsigned int ib_ipath_max_srq_sges; + +extern unsigned int ib_ipath_max_srq_wrs; + +extern const u32 ib_ipath_rnr_table[]; + +extern struct ib_dma_mapping_ops ipath_dma_mapping_ops; + +#endif /* IPATH_VERBS_H */ diff --git a/drivers/infiniband/hw/ipath/ipath_verbs_mcast.c b/drivers/infiniband/hw/ipath/ipath_verbs_mcast.c new file mode 100644 index 0000000..6216ea9 --- /dev/null +++ b/drivers/infiniband/hw/ipath/ipath_verbs_mcast.c @@ -0,0 +1,364 @@ +/* + * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved. + * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include + +#include "ipath_verbs.h" + +/* + * Global table of GID to attached QPs. + * The table is global to all ipath devices since a send from one QP/device + * needs to be locally routed to any locally attached QPs on the same + * or different device. + */ +static struct rb_root mcast_tree; +static DEFINE_SPINLOCK(mcast_lock); + +/** + * ipath_mcast_qp_alloc - alloc a struct to link a QP to mcast GID struct + * @qp: the QP to link + */ +static struct ipath_mcast_qp *ipath_mcast_qp_alloc(struct ipath_qp *qp) +{ + struct ipath_mcast_qp *mqp; + + mqp = kmalloc(sizeof *mqp, GFP_KERNEL); + if (!mqp) + goto bail; + + mqp->qp = qp; + atomic_inc(&qp->refcount); + +bail: + return mqp; +} + +static void ipath_mcast_qp_free(struct ipath_mcast_qp *mqp) +{ + struct ipath_qp *qp = mqp->qp; + + /* Notify ipath_destroy_qp() if it is waiting. */ + if (atomic_dec_and_test(&qp->refcount)) + wake_up(&qp->wait); + + kfree(mqp); +} + +/** + * ipath_mcast_alloc - allocate the multicast GID structure + * @mgid: the multicast GID + * + * A list of QPs will be attached to this structure. + */ +static struct ipath_mcast *ipath_mcast_alloc(union ib_gid *mgid) +{ + struct ipath_mcast *mcast; + + mcast = kmalloc(sizeof *mcast, GFP_KERNEL); + if (!mcast) + goto bail; + + mcast->mgid = *mgid; + INIT_LIST_HEAD(&mcast->qp_list); + init_waitqueue_head(&mcast->wait); + atomic_set(&mcast->refcount, 0); + mcast->n_attached = 0; + +bail: + return mcast; +} + +static void ipath_mcast_free(struct ipath_mcast *mcast) +{ + struct ipath_mcast_qp *p, *tmp; + + list_for_each_entry_safe(p, tmp, &mcast->qp_list, list) + ipath_mcast_qp_free(p); + + kfree(mcast); +} + +/** + * ipath_mcast_find - search the global table for the given multicast GID + * @mgid: the multicast GID to search for + * + * Returns NULL if not found. + * + * The caller is responsible for decrementing the reference count if found. + */ +struct ipath_mcast *ipath_mcast_find(union ib_gid *mgid) +{ + struct rb_node *n; + unsigned long flags; + struct ipath_mcast *mcast; + + spin_lock_irqsave(&mcast_lock, flags); + n = mcast_tree.rb_node; + while (n) { + int ret; + + mcast = rb_entry(n, struct ipath_mcast, rb_node); + + ret = memcmp(mgid->raw, mcast->mgid.raw, + sizeof(union ib_gid)); + if (ret < 0) + n = n->rb_left; + else if (ret > 0) + n = n->rb_right; + else { + atomic_inc(&mcast->refcount); + spin_unlock_irqrestore(&mcast_lock, flags); + goto bail; + } + } + spin_unlock_irqrestore(&mcast_lock, flags); + + mcast = NULL; + +bail: + return mcast; +} + +/** + * ipath_mcast_add - insert mcast GID into table and attach QP struct + * @mcast: the mcast GID table + * @mqp: the QP to attach + * + * Return zero if both were added. Return EEXIST if the GID was already in + * the table but the QP was added. Return ESRCH if the QP was already + * attached and neither structure was added. + */ +static int ipath_mcast_add(struct ipath_ibdev *dev, + struct ipath_mcast *mcast, + struct ipath_mcast_qp *mqp) +{ + struct rb_node **n = &mcast_tree.rb_node; + struct rb_node *pn = NULL; + int ret; + + spin_lock_irq(&mcast_lock); + + while (*n) { + struct ipath_mcast *tmcast; + struct ipath_mcast_qp *p; + + pn = *n; + tmcast = rb_entry(pn, struct ipath_mcast, rb_node); + + ret = memcmp(mcast->mgid.raw, tmcast->mgid.raw, + sizeof(union ib_gid)); + if (ret < 0) { + n = &pn->rb_left; + continue; + } + if (ret > 0) { + n = &pn->rb_right; + continue; + } + + /* Search the QP list to see if this is already there. */ + list_for_each_entry_rcu(p, &tmcast->qp_list, list) { + if (p->qp == mqp->qp) { + ret = ESRCH; + goto bail; + } + } + if (tmcast->n_attached == ib_ipath_max_mcast_qp_attached) { + ret = ENOMEM; + goto bail; + } + + tmcast->n_attached++; + + list_add_tail_rcu(&mqp->list, &tmcast->qp_list); + ret = EEXIST; + goto bail; + } + + spin_lock(&dev->n_mcast_grps_lock); + if (dev->n_mcast_grps_allocated == ib_ipath_max_mcast_grps) { + spin_unlock(&dev->n_mcast_grps_lock); + ret = ENOMEM; + goto bail; + } + + dev->n_mcast_grps_allocated++; + spin_unlock(&dev->n_mcast_grps_lock); + + mcast->n_attached++; + + list_add_tail_rcu(&mqp->list, &mcast->qp_list); + + atomic_inc(&mcast->refcount); + rb_link_node(&mcast->rb_node, pn, n); + rb_insert_color(&mcast->rb_node, &mcast_tree); + + ret = 0; + +bail: + spin_unlock_irq(&mcast_lock); + + return ret; +} + +int ipath_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) +{ + struct ipath_qp *qp = to_iqp(ibqp); + struct ipath_ibdev *dev = to_idev(ibqp->device); + struct ipath_mcast *mcast; + struct ipath_mcast_qp *mqp; + int ret; + + /* + * Allocate data structures since its better to do this outside of + * spin locks and it will most likely be needed. + */ + mcast = ipath_mcast_alloc(gid); + if (mcast == NULL) { + ret = -ENOMEM; + goto bail; + } + mqp = ipath_mcast_qp_alloc(qp); + if (mqp == NULL) { + ipath_mcast_free(mcast); + ret = -ENOMEM; + goto bail; + } + switch (ipath_mcast_add(dev, mcast, mqp)) { + case ESRCH: + /* Neither was used: can't attach the same QP twice. */ + ipath_mcast_qp_free(mqp); + ipath_mcast_free(mcast); + ret = -EINVAL; + goto bail; + case EEXIST: /* The mcast wasn't used */ + ipath_mcast_free(mcast); + break; + case ENOMEM: + /* Exceeded the maximum number of mcast groups. */ + ipath_mcast_qp_free(mqp); + ipath_mcast_free(mcast); + ret = -ENOMEM; + goto bail; + default: + break; + } + + ret = 0; + +bail: + return ret; +} + +int ipath_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) +{ + struct ipath_qp *qp = to_iqp(ibqp); + struct ipath_ibdev *dev = to_idev(ibqp->device); + struct ipath_mcast *mcast = NULL; + struct ipath_mcast_qp *p, *tmp; + struct rb_node *n; + int last = 0; + int ret; + + spin_lock_irq(&mcast_lock); + + /* Find the GID in the mcast table. */ + n = mcast_tree.rb_node; + while (1) { + if (n == NULL) { + spin_unlock_irq(&mcast_lock); + ret = -EINVAL; + goto bail; + } + + mcast = rb_entry(n, struct ipath_mcast, rb_node); + ret = memcmp(gid->raw, mcast->mgid.raw, + sizeof(union ib_gid)); + if (ret < 0) + n = n->rb_left; + else if (ret > 0) + n = n->rb_right; + else + break; + } + + /* Search the QP list. */ + list_for_each_entry_safe(p, tmp, &mcast->qp_list, list) { + if (p->qp != qp) + continue; + /* + * We found it, so remove it, but don't poison the forward + * link until we are sure there are no list walkers. + */ + list_del_rcu(&p->list); + mcast->n_attached--; + + /* If this was the last attached QP, remove the GID too. */ + if (list_empty(&mcast->qp_list)) { + rb_erase(&mcast->rb_node, &mcast_tree); + last = 1; + } + break; + } + + spin_unlock_irq(&mcast_lock); + + if (p) { + /* + * Wait for any list walkers to finish before freeing the + * list element. + */ + wait_event(mcast->wait, atomic_read(&mcast->refcount) <= 1); + ipath_mcast_qp_free(p); + } + if (last) { + atomic_dec(&mcast->refcount); + wait_event(mcast->wait, !atomic_read(&mcast->refcount)); + ipath_mcast_free(mcast); + spin_lock_irq(&dev->n_mcast_grps_lock); + dev->n_mcast_grps_allocated--; + spin_unlock_irq(&dev->n_mcast_grps_lock); + } + + ret = 0; + +bail: + return ret; +} + +int ipath_mcast_tree_empty(void) +{ + return mcast_tree.rb_node == NULL; +} diff --git a/drivers/infiniband/hw/ipath/ipath_wc_ppc64.c b/drivers/infiniband/hw/ipath/ipath_wc_ppc64.c new file mode 100644 index 0000000..1d7bd82 --- /dev/null +++ b/drivers/infiniband/hw/ipath/ipath_wc_ppc64.c @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/* + * This file is conditionally built on PowerPC only. Otherwise weak symbol + * versions of the functions exported from here are used. + */ + +#include "ipath_kernel.h" + +/** + * ipath_enable_wc - enable write combining for MMIO writes to the device + * @dd: infinipath device + * + * Nothing to do on PowerPC, so just return without error. + */ +int ipath_enable_wc(struct ipath_devdata *dd) +{ + return 0; +} + +/** + * ipath_unordered_wc - indicate whether write combining is unordered + * + * Because our performance depends on our ability to do write + * combining mmio writes in the most efficient way, we need to + * know if we are on a processor that may reorder stores when + * write combining. + */ +int ipath_unordered_wc(void) +{ + return 1; +} diff --git a/drivers/infiniband/hw/ipath/ipath_wc_x86_64.c b/drivers/infiniband/hw/ipath/ipath_wc_x86_64.c new file mode 100644 index 0000000..3428acb --- /dev/null +++ b/drivers/infiniband/hw/ipath/ipath_wc_x86_64.c @@ -0,0 +1,184 @@ +/* + * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved. + * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/* + * This file is conditionally built on x86_64 only. Otherwise weak symbol + * versions of the functions exported from here are used. + */ + +#include +#include +#include + +#include "ipath_kernel.h" + +/** + * ipath_enable_wc - enable write combining for MMIO writes to the device + * @dd: infinipath device + * + * This routine is x86_64-specific; it twiddles the CPU's MTRRs to enable + * write combining. + */ +int ipath_enable_wc(struct ipath_devdata *dd) +{ + int ret = 0; + u64 pioaddr, piolen; + unsigned bits; + const unsigned long addr = pci_resource_start(dd->pcidev, 0); + const size_t len = pci_resource_len(dd->pcidev, 0); + + /* + * Set the PIO buffers to be WCCOMB, so we get HT bursts to the + * chip. Linux (possibly the hardware) requires it to be on a power + * of 2 address matching the length (which has to be a power of 2). + * For rev1, that means the base address, for rev2, it will be just + * the PIO buffers themselves. + * For chips with two sets of buffers, the calculations are + * somewhat more complicated; we need to sum, and the piobufbase + * register has both offsets, 2K in low 32 bits, 4K in high 32 bits. + * The buffers are still packed, so a single range covers both. + */ + if (dd->ipath_piobcnt2k && dd->ipath_piobcnt4k) { /* 2 sizes */ + unsigned long pio2kbase, pio4kbase; + pio2kbase = dd->ipath_piobufbase & 0xffffffffUL; + pio4kbase = (dd->ipath_piobufbase >> 32) & 0xffffffffUL; + if (pio2kbase < pio4kbase) { /* all, for now */ + pioaddr = addr + pio2kbase; + piolen = pio4kbase - pio2kbase + + dd->ipath_piobcnt4k * dd->ipath_4kalign; + } else { + pioaddr = addr + pio4kbase; + piolen = pio2kbase - pio4kbase + + dd->ipath_piobcnt2k * dd->ipath_palign; + } + } else { /* single buffer size (2K, currently) */ + pioaddr = addr + dd->ipath_piobufbase; + piolen = dd->ipath_piobcnt2k * dd->ipath_palign + + dd->ipath_piobcnt4k * dd->ipath_4kalign; + } + + for (bits = 0; !(piolen & (1ULL << bits)); bits++) + /* do nothing */ ; + + if (piolen != (1ULL << bits)) { + piolen >>= bits; + while (piolen >>= 1) + bits++; + piolen = 1ULL << (bits + 1); + } + if (pioaddr & (piolen - 1)) { + u64 atmp; + ipath_dbg("pioaddr %llx not on right boundary for size " + "%llx, fixing\n", + (unsigned long long) pioaddr, + (unsigned long long) piolen); + atmp = pioaddr & ~(piolen - 1); + if (atmp < addr || (atmp + piolen) > (addr + len)) { + ipath_dev_err(dd, "No way to align address/size " + "(%llx/%llx), no WC mtrr\n", + (unsigned long long) atmp, + (unsigned long long) piolen << 1); + ret = -ENODEV; + } else { + ipath_dbg("changing WC base from %llx to %llx, " + "len from %llx to %llx\n", + (unsigned long long) pioaddr, + (unsigned long long) atmp, + (unsigned long long) piolen, + (unsigned long long) piolen << 1); + pioaddr = atmp; + piolen <<= 1; + } + } + + if (!ret) { + int cookie; + ipath_cdbg(VERBOSE, "Setting mtrr for chip to WC " + "(addr %llx, len=0x%llx)\n", + (unsigned long long) pioaddr, + (unsigned long long) piolen); + cookie = mtrr_add(pioaddr, piolen, MTRR_TYPE_WRCOMB, 0); + if (cookie < 0) { + { + dev_info(&dd->pcidev->dev, + "mtrr_add() WC for PIO bufs " + "failed (%d)\n", + cookie); + ret = -EINVAL; + } + } else { + ipath_cdbg(VERBOSE, "Set mtrr for chip to WC, " + "cookie is %d\n", cookie); + dd->ipath_wc_cookie = cookie; + dd->ipath_wc_base = (unsigned long) pioaddr; + dd->ipath_wc_len = (unsigned long) piolen; + } + } + + return ret; +} + +/** + * ipath_disable_wc - disable write combining for MMIO writes to the device + * @dd: infinipath device + */ +void ipath_disable_wc(struct ipath_devdata *dd) +{ + if (dd->ipath_wc_cookie) { + int r; + ipath_cdbg(VERBOSE, "undoing WCCOMB on pio buffers\n"); + r = mtrr_del(dd->ipath_wc_cookie, dd->ipath_wc_base, + dd->ipath_wc_len); + if (r < 0) + dev_info(&dd->pcidev->dev, + "mtrr_del(%lx, %lx, %lx) failed: %d\n", + dd->ipath_wc_cookie, dd->ipath_wc_base, + dd->ipath_wc_len, r); + dd->ipath_wc_cookie = 0; /* even on failure */ + } +} + +/** + * ipath_unordered_wc - indicate whether write combining is ordered + * + * Because our performance depends on our ability to do write combining mmio + * writes in the most efficient way, we need to know if we are on an Intel + * or AMD x86_64 processor. AMD x86_64 processors flush WC buffers out in + * the order completed, and so no special flushing is required to get + * correct ordering. Intel processors, however, will flush write buffers + * out in "random" orders, and so explicit ordering is needed at times. + */ +int ipath_unordered_wc(void) +{ + return boot_cpu_data.x86_vendor != X86_VENDOR_AMD; +} diff --git a/drivers/infiniband/hw/mlx4/Kconfig b/drivers/infiniband/hw/mlx4/Kconfig new file mode 100644 index 0000000..fc01dea --- /dev/null +++ b/drivers/infiniband/hw/mlx4/Kconfig @@ -0,0 +1,10 @@ +config MLX4_INFINIBAND + tristate "Mellanox ConnectX HCA support" + depends on NETDEVICES && ETHERNET && PCI && INET + select NET_VENDOR_MELLANOX + select MLX4_CORE + ---help--- + This driver provides low-level InfiniBand support for + Mellanox ConnectX PCI Express host channel adapters (HCAs). + This is required to use InfiniBand protocols such as + IP-over-IB or SRP with these devices. diff --git a/drivers/infiniband/hw/mlx4/Makefile b/drivers/infiniband/hw/mlx4/Makefile new file mode 100644 index 0000000..f4213b3 --- /dev/null +++ b/drivers/infiniband/hw/mlx4/Makefile @@ -0,0 +1,3 @@ +obj-$(CONFIG_MLX4_INFINIBAND) += mlx4_ib.o + +mlx4_ib-y := ah.o cq.o doorbell.o mad.o main.o mr.o qp.o srq.o mcg.o cm.o alias_GUID.o sysfs.o diff --git a/drivers/infiniband/hw/mlx4/ah.c b/drivers/infiniband/hw/mlx4/ah.c new file mode 100644 index 0000000..2d8c339 --- /dev/null +++ b/drivers/infiniband/hw/mlx4/ah.c @@ -0,0 +1,177 @@ +/* + * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include + +#include +#include +#include + +#include "mlx4_ib.h" + +static struct ib_ah *create_ib_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr, + struct mlx4_ib_ah *ah) +{ + struct mlx4_dev *dev = to_mdev(pd->device)->dev; + + ah->av.ib.port_pd = cpu_to_be32(to_mpd(pd)->pdn | (ah_attr->port_num << 24)); + ah->av.ib.g_slid = ah_attr->src_path_bits; + if (ah_attr->ah_flags & IB_AH_GRH) { + ah->av.ib.g_slid |= 0x80; + ah->av.ib.gid_index = ah_attr->grh.sgid_index; + ah->av.ib.hop_limit = ah_attr->grh.hop_limit; + ah->av.ib.sl_tclass_flowlabel |= + cpu_to_be32((ah_attr->grh.traffic_class << 20) | + ah_attr->grh.flow_label); + memcpy(ah->av.ib.dgid, ah_attr->grh.dgid.raw, 16); + } + + ah->av.ib.dlid = cpu_to_be16(ah_attr->dlid); + if (ah_attr->static_rate) { + ah->av.ib.stat_rate = ah_attr->static_rate + MLX4_STAT_RATE_OFFSET; + while (ah->av.ib.stat_rate > IB_RATE_2_5_GBPS + MLX4_STAT_RATE_OFFSET && + !(1 << ah->av.ib.stat_rate & dev->caps.stat_rate_support)) + --ah->av.ib.stat_rate; + } + ah->av.ib.sl_tclass_flowlabel = cpu_to_be32(ah_attr->sl << 28); + + return &ah->ibah; +} + +static struct ib_ah *create_iboe_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr, + struct mlx4_ib_ah *ah) +{ + struct mlx4_ib_dev *ibdev = to_mdev(pd->device); + struct mlx4_dev *dev = ibdev->dev; + int is_mcast = 0; + struct in6_addr in6; + u16 vlan_tag; + + memcpy(&in6, ah_attr->grh.dgid.raw, sizeof(in6)); + if (rdma_is_multicast_addr(&in6)) { + is_mcast = 1; + rdma_get_mcast_mac(&in6, ah->av.eth.mac); + } else { + memcpy(ah->av.eth.mac, ah_attr->dmac, ETH_ALEN); + } + vlan_tag = ah_attr->vlan_id; + if (vlan_tag < 0x1000) + vlan_tag |= (ah_attr->sl & 7) << 13; + ah->av.eth.port_pd = cpu_to_be32(to_mpd(pd)->pdn | (ah_attr->port_num << 24)); + ah->av.eth.gid_index = ah_attr->grh.sgid_index; + ah->av.eth.vlan = cpu_to_be16(vlan_tag); + if (ah_attr->static_rate) { + ah->av.eth.stat_rate = ah_attr->static_rate + MLX4_STAT_RATE_OFFSET; + while (ah->av.eth.stat_rate > IB_RATE_2_5_GBPS + MLX4_STAT_RATE_OFFSET && + !(1 << ah->av.eth.stat_rate & dev->caps.stat_rate_support)) + --ah->av.eth.stat_rate; + } + + /* + * HW requires multicast LID so we just choose one. + */ + if (is_mcast) + ah->av.ib.dlid = cpu_to_be16(0xc000); + + memcpy(ah->av.eth.dgid, ah_attr->grh.dgid.raw, 16); + ah->av.eth.sl_tclass_flowlabel = cpu_to_be32(ah_attr->sl << 29); + + return &ah->ibah; +} + +struct ib_ah *mlx4_ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr) +{ + struct mlx4_ib_ah *ah; + struct ib_ah *ret; + + ah = kzalloc(sizeof *ah, GFP_ATOMIC); + if (!ah) + return ERR_PTR(-ENOMEM); + + if (rdma_port_get_link_layer(pd->device, ah_attr->port_num) == IB_LINK_LAYER_ETHERNET) { + if (!(ah_attr->ah_flags & IB_AH_GRH)) { + ret = ERR_PTR(-EINVAL); + } else { + /* + * TBD: need to handle the case when we get + * called in an atomic context and there we + * might sleep. We don't expect this + * currently since we're working with link + * local addresses which we can translate + * without going to sleep. + */ + ret = create_iboe_ah(pd, ah_attr, ah); + } + + if (IS_ERR(ret)) + kfree(ah); + + return ret; + } else + return create_ib_ah(pd, ah_attr, ah); /* never fails */ +} + +int mlx4_ib_query_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr) +{ + struct mlx4_ib_ah *ah = to_mah(ibah); + enum rdma_link_layer ll; + + memset(ah_attr, 0, sizeof *ah_attr); + ah_attr->sl = be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 28; + ah_attr->port_num = be32_to_cpu(ah->av.ib.port_pd) >> 24; + ll = rdma_port_get_link_layer(ibah->device, ah_attr->port_num); + ah_attr->dlid = ll == IB_LINK_LAYER_INFINIBAND ? be16_to_cpu(ah->av.ib.dlid) : 0; + if (ah->av.ib.stat_rate) + ah_attr->static_rate = ah->av.ib.stat_rate - MLX4_STAT_RATE_OFFSET; + ah_attr->src_path_bits = ah->av.ib.g_slid & 0x7F; + + if (mlx4_ib_ah_grh_present(ah)) { + ah_attr->ah_flags = IB_AH_GRH; + + ah_attr->grh.traffic_class = + be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 20; + ah_attr->grh.flow_label = + be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) & 0xfffff; + ah_attr->grh.hop_limit = ah->av.ib.hop_limit; + ah_attr->grh.sgid_index = ah->av.ib.gid_index; + memcpy(ah_attr->grh.dgid.raw, ah->av.ib.dgid, 16); + } + + return 0; +} + +int mlx4_ib_destroy_ah(struct ib_ah *ah) +{ + kfree(to_mah(ah)); + return 0; +} diff --git a/drivers/infiniband/hw/mlx4/alias_GUID.c b/drivers/infiniband/hw/mlx4/alias_GUID.c new file mode 100644 index 0000000..0eb141c --- /dev/null +++ b/drivers/infiniband/hw/mlx4/alias_GUID.c @@ -0,0 +1,688 @@ +/* + * Copyright (c) 2012 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + /***********************************************************/ +/*This file support the handling of the Alias GUID feature. */ +/***********************************************************/ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "mlx4_ib.h" + +/* +The driver keeps the current state of all guids, as they are in the HW. +Whenever we receive an smp mad GUIDInfo record, the data will be cached. +*/ + +struct mlx4_alias_guid_work_context { + u8 port; + struct mlx4_ib_dev *dev ; + struct ib_sa_query *sa_query; + struct completion done; + int query_id; + struct list_head list; + int block_num; +}; + +struct mlx4_next_alias_guid_work { + u8 port; + u8 block_num; + struct mlx4_sriov_alias_guid_info_rec_det rec_det; +}; + + +void mlx4_ib_update_cache_on_guid_change(struct mlx4_ib_dev *dev, int block_num, + u8 port_num, u8 *p_data) +{ + int i; + u64 guid_indexes; + int slave_id; + int port_index = port_num - 1; + + if (!mlx4_is_master(dev->dev)) + return; + + guid_indexes = be64_to_cpu((__force __be64) dev->sriov.alias_guid. + ports_guid[port_num - 1]. + all_rec_per_port[block_num].guid_indexes); + pr_debug("port: %d, guid_indexes: 0x%llx\n", port_num, guid_indexes); + + for (i = 0; i < NUM_ALIAS_GUID_IN_REC; i++) { + /* The location of the specific index starts from bit number 4 + * until bit num 11 */ + if (test_bit(i + 4, (unsigned long *)&guid_indexes)) { + slave_id = (block_num * NUM_ALIAS_GUID_IN_REC) + i ; + if (slave_id >= dev->dev->num_slaves) { + pr_debug("The last slave: %d\n", slave_id); + return; + } + + /* cache the guid: */ + memcpy(&dev->sriov.demux[port_index].guid_cache[slave_id], + &p_data[i * GUID_REC_SIZE], + GUID_REC_SIZE); + } else + pr_debug("Guid number: %d in block: %d" + " was not updated\n", i, block_num); + } +} + +static __be64 get_cached_alias_guid(struct mlx4_ib_dev *dev, int port, int index) +{ + if (index >= NUM_ALIAS_GUID_PER_PORT) { + pr_err("%s: ERROR: asked for index:%d\n", __func__, index); + return (__force __be64) -1; + } + return *(__be64 *)&dev->sriov.demux[port - 1].guid_cache[index]; +} + + +ib_sa_comp_mask mlx4_ib_get_aguid_comp_mask_from_ix(int index) +{ + return IB_SA_COMP_MASK(4 + index); +} + +/* + * Whenever new GUID is set/unset (guid table change) create event and + * notify the relevant slave (master also should be notified). + * If the GUID value is not as we have in the cache the slave will not be + * updated; in this case it waits for the smp_snoop or the port management + * event to call the function and to update the slave. + * block_number - the index of the block (16 blocks available) + * port_number - 1 or 2 + */ +void mlx4_ib_notify_slaves_on_guid_change(struct mlx4_ib_dev *dev, + int block_num, u8 port_num, + u8 *p_data) +{ + int i; + u64 guid_indexes; + int slave_id; + enum slave_port_state new_state; + enum slave_port_state prev_state; + __be64 tmp_cur_ag, form_cache_ag; + enum slave_port_gen_event gen_event; + + if (!mlx4_is_master(dev->dev)) + return; + + guid_indexes = be64_to_cpu((__force __be64) dev->sriov.alias_guid. + ports_guid[port_num - 1]. + all_rec_per_port[block_num].guid_indexes); + pr_debug("port: %d, guid_indexes: 0x%llx\n", port_num, guid_indexes); + + /*calculate the slaves and notify them*/ + for (i = 0; i < NUM_ALIAS_GUID_IN_REC; i++) { + /* the location of the specific index runs from bits 4..11 */ + if (!(test_bit(i + 4, (unsigned long *)&guid_indexes))) + continue; + + slave_id = (block_num * NUM_ALIAS_GUID_IN_REC) + i ; + if (slave_id >= dev->dev->num_vfs + 1) + return; + tmp_cur_ag = *(__be64 *)&p_data[i * GUID_REC_SIZE]; + form_cache_ag = get_cached_alias_guid(dev, port_num, + (NUM_ALIAS_GUID_IN_REC * block_num) + i); + /* + * Check if guid is not the same as in the cache, + * If it is different, wait for the snoop_smp or the port mgmt + * change event to update the slave on its port state change + */ + if (tmp_cur_ag != form_cache_ag) + continue; + mlx4_gen_guid_change_eqe(dev->dev, slave_id, port_num); + + /*2 cases: Valid GUID, and Invalid Guid*/ + + if (tmp_cur_ag != MLX4_NOT_SET_GUID) { /*valid GUID*/ + prev_state = mlx4_get_slave_port_state(dev->dev, slave_id, port_num); + new_state = set_and_calc_slave_port_state(dev->dev, slave_id, port_num, + MLX4_PORT_STATE_IB_PORT_STATE_EVENT_GID_VALID, + &gen_event); + pr_debug("slave: %d, port: %d prev_port_state: %d," + " new_port_state: %d, gen_event: %d\n", + slave_id, port_num, prev_state, new_state, gen_event); + if (gen_event == SLAVE_PORT_GEN_EVENT_UP) { + pr_debug("sending PORT_UP event to slave: %d, port: %d\n", + slave_id, port_num); + mlx4_gen_port_state_change_eqe(dev->dev, slave_id, + port_num, MLX4_PORT_CHANGE_SUBTYPE_ACTIVE); + } + } else { /* request to invalidate GUID */ + set_and_calc_slave_port_state(dev->dev, slave_id, port_num, + MLX4_PORT_STATE_IB_EVENT_GID_INVALID, + &gen_event); + pr_debug("sending PORT DOWN event to slave: %d, port: %d\n", + slave_id, port_num); + mlx4_gen_port_state_change_eqe(dev->dev, slave_id, port_num, + MLX4_PORT_CHANGE_SUBTYPE_DOWN); + } + } +} + +static void aliasguid_query_handler(int status, + struct ib_sa_guidinfo_rec *guid_rec, + void *context) +{ + struct mlx4_ib_dev *dev; + struct mlx4_alias_guid_work_context *cb_ctx = context; + u8 port_index ; + int i; + struct mlx4_sriov_alias_guid_info_rec_det *rec; + unsigned long flags, flags1; + + if (!context) + return; + + dev = cb_ctx->dev; + port_index = cb_ctx->port - 1; + rec = &dev->sriov.alias_guid.ports_guid[port_index]. + all_rec_per_port[cb_ctx->block_num]; + + if (status) { + rec->status = MLX4_GUID_INFO_STATUS_IDLE; + pr_debug("(port: %d) failed: status = %d\n", + cb_ctx->port, status); + goto out; + } + + if (guid_rec->block_num != cb_ctx->block_num) { + pr_err("block num mismatch: %d != %d\n", + cb_ctx->block_num, guid_rec->block_num); + goto out; + } + + pr_debug("lid/port: %d/%d, block_num: %d\n", + be16_to_cpu(guid_rec->lid), cb_ctx->port, + guid_rec->block_num); + + rec = &dev->sriov.alias_guid.ports_guid[port_index]. + all_rec_per_port[guid_rec->block_num]; + + rec->status = MLX4_GUID_INFO_STATUS_SET; + rec->method = MLX4_GUID_INFO_RECORD_SET; + + for (i = 0 ; i < NUM_ALIAS_GUID_IN_REC; i++) { + __be64 tmp_cur_ag; + tmp_cur_ag = *(__be64 *)&guid_rec->guid_info_list[i * GUID_REC_SIZE]; + /* check if the SM didn't assign one of the records. + * if it didn't, if it was not sysadmin request: + * ask the SM to give a new GUID, (instead of the driver request). + */ + if (tmp_cur_ag == MLX4_NOT_SET_GUID) { + mlx4_ib_warn(&dev->ib_dev, "%s:Record num %d in " + "block_num: %d was declined by SM, " + "ownership by %d (0 = driver, 1=sysAdmin," + " 2=None)\n", __func__, i, + guid_rec->block_num, rec->ownership); + if (rec->ownership == MLX4_GUID_DRIVER_ASSIGN) { + /* if it is driver assign, asks for new GUID from SM*/ + *(__be64 *)&rec->all_recs[i * GUID_REC_SIZE] = + MLX4_NOT_SET_GUID; + + /* Mark the record as not assigned, and let it + * be sent again in the next work sched.*/ + rec->status = MLX4_GUID_INFO_STATUS_IDLE; + rec->guid_indexes |= mlx4_ib_get_aguid_comp_mask_from_ix(i); + } + } else { + /* properly assigned record. */ + /* We save the GUID we just got from the SM in the + * admin_guid in order to be persistent, and in the + * request from the sm the process will ask for the same GUID */ + if (rec->ownership == MLX4_GUID_SYSADMIN_ASSIGN && + tmp_cur_ag != *(__be64 *)&rec->all_recs[i * GUID_REC_SIZE]) { + /* the sysadmin assignment failed.*/ + mlx4_ib_warn(&dev->ib_dev, "%s: Failed to set" + " admin guid after SysAdmin " + "configuration. " + "Record num %d in block_num:%d " + "was declined by SM, " + "new val(0x%llx) was kept\n", + __func__, i, + guid_rec->block_num, + be64_to_cpu(*(__be64 *) & + rec->all_recs[i * GUID_REC_SIZE])); + } else { + memcpy(&rec->all_recs[i * GUID_REC_SIZE], + &guid_rec->guid_info_list[i * GUID_REC_SIZE], + GUID_REC_SIZE); + } + } + } + /* + The func is call here to close the cases when the + sm doesn't send smp, so in the sa response the driver + notifies the slave. + */ + mlx4_ib_notify_slaves_on_guid_change(dev, guid_rec->block_num, + cb_ctx->port, + guid_rec->guid_info_list); +out: + spin_lock_irqsave(&dev->sriov.going_down_lock, flags); + spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags1); + if (!dev->sriov.is_going_down) + queue_delayed_work(dev->sriov.alias_guid.ports_guid[port_index].wq, + &dev->sriov.alias_guid.ports_guid[port_index]. + alias_guid_work, 0); + if (cb_ctx->sa_query) { + list_del(&cb_ctx->list); + kfree(cb_ctx); + } else + complete(&cb_ctx->done); + spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags1); + spin_unlock_irqrestore(&dev->sriov.going_down_lock, flags); +} + +static void invalidate_guid_record(struct mlx4_ib_dev *dev, u8 port, int index) +{ + int i; + u64 cur_admin_val; + ib_sa_comp_mask comp_mask = 0; + + dev->sriov.alias_guid.ports_guid[port - 1].all_rec_per_port[index].status + = MLX4_GUID_INFO_STATUS_IDLE; + dev->sriov.alias_guid.ports_guid[port - 1].all_rec_per_port[index].method + = MLX4_GUID_INFO_RECORD_SET; + + /* calculate the comp_mask for that record.*/ + for (i = 0; i < NUM_ALIAS_GUID_IN_REC; i++) { + cur_admin_val = + *(u64 *)&dev->sriov.alias_guid.ports_guid[port - 1]. + all_rec_per_port[index].all_recs[GUID_REC_SIZE * i]; + /* + check the admin value: if it's for delete (~00LL) or + it is the first guid of the first record (hw guid) or + the records is not in ownership of the sysadmin and the sm doesn't + need to assign GUIDs, then don't put it up for assignment. + */ + if (MLX4_GUID_FOR_DELETE_VAL == cur_admin_val || + (!index && !i) || + MLX4_GUID_NONE_ASSIGN == dev->sriov.alias_guid. + ports_guid[port - 1].all_rec_per_port[index].ownership) + continue; + comp_mask |= mlx4_ib_get_aguid_comp_mask_from_ix(i); + } + dev->sriov.alias_guid.ports_guid[port - 1]. + all_rec_per_port[index].guid_indexes = comp_mask; +} + +static int set_guid_rec(struct ib_device *ibdev, + u8 port, int index, + struct mlx4_sriov_alias_guid_info_rec_det *rec_det) +{ + int err; + struct mlx4_ib_dev *dev = to_mdev(ibdev); + struct ib_sa_guidinfo_rec guid_info_rec; + ib_sa_comp_mask comp_mask; + struct ib_port_attr attr; + struct mlx4_alias_guid_work_context *callback_context; + unsigned long resched_delay, flags, flags1; + struct list_head *head = + &dev->sriov.alias_guid.ports_guid[port - 1].cb_list; + + err = __mlx4_ib_query_port(ibdev, port, &attr, 1); + if (err) { + pr_debug("mlx4_ib_query_port failed (err: %d), port: %d\n", + err, port); + return err; + } + /*check the port was configured by the sm, otherwise no need to send */ + if (attr.state != IB_PORT_ACTIVE) { + pr_debug("port %d not active...rescheduling\n", port); + resched_delay = 5 * HZ; + err = -EAGAIN; + goto new_schedule; + } + + callback_context = kmalloc(sizeof *callback_context, GFP_KERNEL); + if (!callback_context) { + err = -ENOMEM; + resched_delay = HZ * 5; + goto new_schedule; + } + callback_context->port = port; + callback_context->dev = dev; + callback_context->block_num = index; + + memset(&guid_info_rec, 0, sizeof (struct ib_sa_guidinfo_rec)); + + guid_info_rec.lid = cpu_to_be16(attr.lid); + guid_info_rec.block_num = index; + + memcpy(guid_info_rec.guid_info_list, rec_det->all_recs, + GUID_REC_SIZE * NUM_ALIAS_GUID_IN_REC); + comp_mask = IB_SA_GUIDINFO_REC_LID | IB_SA_GUIDINFO_REC_BLOCK_NUM | + rec_det->guid_indexes; + + init_completion(&callback_context->done); + spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags1); + list_add_tail(&callback_context->list, head); + spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags1); + + callback_context->query_id = + ib_sa_guid_info_rec_query(dev->sriov.alias_guid.sa_client, + ibdev, port, &guid_info_rec, + comp_mask, rec_det->method, 1000, + GFP_KERNEL, aliasguid_query_handler, + callback_context, + &callback_context->sa_query); + if (callback_context->query_id < 0) { + pr_debug("ib_sa_guid_info_rec_query failed, query_id: " + "%d. will reschedule to the next 1 sec.\n", + callback_context->query_id); + spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags1); + list_del(&callback_context->list); + kfree(callback_context); + spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags1); + resched_delay = 1 * HZ; + err = -EAGAIN; + goto new_schedule; + } + err = 0; + goto out; + +new_schedule: + spin_lock_irqsave(&dev->sriov.going_down_lock, flags); + spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags1); + invalidate_guid_record(dev, port, index); + if (!dev->sriov.is_going_down) { + queue_delayed_work(dev->sriov.alias_guid.ports_guid[port - 1].wq, + &dev->sriov.alias_guid.ports_guid[port - 1].alias_guid_work, + resched_delay); + } + spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags1); + spin_unlock_irqrestore(&dev->sriov.going_down_lock, flags); + +out: + return err; +} + +void mlx4_ib_invalidate_all_guid_record(struct mlx4_ib_dev *dev, int port) +{ + int i; + unsigned long flags, flags1; + + pr_debug("port %d\n", port); + + spin_lock_irqsave(&dev->sriov.going_down_lock, flags); + spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags1); + for (i = 0; i < NUM_ALIAS_GUID_REC_IN_PORT; i++) + invalidate_guid_record(dev, port, i); + + if (mlx4_is_master(dev->dev) && !dev->sriov.is_going_down) { + /* + make sure no work waits in the queue, if the work is already + queued(not on the timer) the cancel will fail. That is not a problem + because we just want the work started. + */ + cancel_delayed_work(&dev->sriov.alias_guid. + ports_guid[port - 1].alias_guid_work); + queue_delayed_work(dev->sriov.alias_guid.ports_guid[port - 1].wq, + &dev->sriov.alias_guid.ports_guid[port - 1].alias_guid_work, + 0); + } + spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags1); + spin_unlock_irqrestore(&dev->sriov.going_down_lock, flags); +} + +/* The function returns the next record that was + * not configured (or failed to be configured) */ +static int get_next_record_to_update(struct mlx4_ib_dev *dev, u8 port, + struct mlx4_next_alias_guid_work *rec) +{ + int j; + unsigned long flags; + + for (j = 0; j < NUM_ALIAS_GUID_REC_IN_PORT; j++) { + spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags); + if (dev->sriov.alias_guid.ports_guid[port].all_rec_per_port[j].status == + MLX4_GUID_INFO_STATUS_IDLE) { + memcpy(&rec->rec_det, + &dev->sriov.alias_guid.ports_guid[port].all_rec_per_port[j], + sizeof (struct mlx4_sriov_alias_guid_info_rec_det)); + rec->port = port; + rec->block_num = j; + dev->sriov.alias_guid.ports_guid[port].all_rec_per_port[j].status = + MLX4_GUID_INFO_STATUS_PENDING; + spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags); + return 0; + } + spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags); + } + return -ENOENT; +} + +static void set_administratively_guid_record(struct mlx4_ib_dev *dev, int port, + int rec_index, + struct mlx4_sriov_alias_guid_info_rec_det *rec_det) +{ + dev->sriov.alias_guid.ports_guid[port].all_rec_per_port[rec_index].guid_indexes = + rec_det->guid_indexes; + memcpy(dev->sriov.alias_guid.ports_guid[port].all_rec_per_port[rec_index].all_recs, + rec_det->all_recs, NUM_ALIAS_GUID_IN_REC * GUID_REC_SIZE); + dev->sriov.alias_guid.ports_guid[port].all_rec_per_port[rec_index].status = + rec_det->status; +} + +static void set_all_slaves_guids(struct mlx4_ib_dev *dev, int port) +{ + int j; + struct mlx4_sriov_alias_guid_info_rec_det rec_det ; + + for (j = 0 ; j < NUM_ALIAS_GUID_REC_IN_PORT ; j++) { + memset(rec_det.all_recs, 0, NUM_ALIAS_GUID_IN_REC * GUID_REC_SIZE); + rec_det.guid_indexes = (!j ? 0 : IB_SA_GUIDINFO_REC_GID0) | + IB_SA_GUIDINFO_REC_GID1 | IB_SA_GUIDINFO_REC_GID2 | + IB_SA_GUIDINFO_REC_GID3 | IB_SA_GUIDINFO_REC_GID4 | + IB_SA_GUIDINFO_REC_GID5 | IB_SA_GUIDINFO_REC_GID6 | + IB_SA_GUIDINFO_REC_GID7; + rec_det.status = MLX4_GUID_INFO_STATUS_IDLE; + set_administratively_guid_record(dev, port, j, &rec_det); + } +} + +static void alias_guid_work(struct work_struct *work) +{ + struct delayed_work *delay = to_delayed_work(work); + int ret = 0; + struct mlx4_next_alias_guid_work *rec; + struct mlx4_sriov_alias_guid_port_rec_det *sriov_alias_port = + container_of(delay, struct mlx4_sriov_alias_guid_port_rec_det, + alias_guid_work); + struct mlx4_sriov_alias_guid *sriov_alias_guid = sriov_alias_port->parent; + struct mlx4_ib_sriov *ib_sriov = container_of(sriov_alias_guid, + struct mlx4_ib_sriov, + alias_guid); + struct mlx4_ib_dev *dev = container_of(ib_sriov, struct mlx4_ib_dev, sriov); + + rec = kzalloc(sizeof *rec, GFP_KERNEL); + if (!rec) { + pr_err("alias_guid_work: No Memory\n"); + return; + } + + pr_debug("starting [port: %d]...\n", sriov_alias_port->port + 1); + ret = get_next_record_to_update(dev, sriov_alias_port->port, rec); + if (ret) { + pr_debug("No more records to update.\n"); + goto out; + } + + set_guid_rec(&dev->ib_dev, rec->port + 1, rec->block_num, + &rec->rec_det); + +out: + kfree(rec); +} + + +void mlx4_ib_init_alias_guid_work(struct mlx4_ib_dev *dev, int port) +{ + unsigned long flags, flags1; + + if (!mlx4_is_master(dev->dev)) + return; + spin_lock_irqsave(&dev->sriov.going_down_lock, flags); + spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags1); + if (!dev->sriov.is_going_down) { + queue_delayed_work(dev->sriov.alias_guid.ports_guid[port].wq, + &dev->sriov.alias_guid.ports_guid[port].alias_guid_work, 0); + } + spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags1); + spin_unlock_irqrestore(&dev->sriov.going_down_lock, flags); +} + +void mlx4_ib_destroy_alias_guid_service(struct mlx4_ib_dev *dev) +{ + int i; + struct mlx4_ib_sriov *sriov = &dev->sriov; + struct mlx4_alias_guid_work_context *cb_ctx; + struct mlx4_sriov_alias_guid_port_rec_det *det; + struct ib_sa_query *sa_query; + unsigned long flags; + + for (i = 0 ; i < dev->num_ports; i++) { + cancel_delayed_work(&dev->sriov.alias_guid.ports_guid[i].alias_guid_work); + det = &sriov->alias_guid.ports_guid[i]; + spin_lock_irqsave(&sriov->alias_guid.ag_work_lock, flags); + while (!list_empty(&det->cb_list)) { + cb_ctx = list_entry(det->cb_list.next, + struct mlx4_alias_guid_work_context, + list); + sa_query = cb_ctx->sa_query; + cb_ctx->sa_query = NULL; + list_del(&cb_ctx->list); + spin_unlock_irqrestore(&sriov->alias_guid.ag_work_lock, flags); + ib_sa_cancel_query(cb_ctx->query_id, sa_query); + wait_for_completion(&cb_ctx->done); + kfree(cb_ctx); + spin_lock_irqsave(&sriov->alias_guid.ag_work_lock, flags); + } + spin_unlock_irqrestore(&sriov->alias_guid.ag_work_lock, flags); + } + for (i = 0 ; i < dev->num_ports; i++) { + flush_workqueue(dev->sriov.alias_guid.ports_guid[i].wq); + destroy_workqueue(dev->sriov.alias_guid.ports_guid[i].wq); + } + ib_sa_unregister_client(dev->sriov.alias_guid.sa_client); + kfree(dev->sriov.alias_guid.sa_client); +} + +int mlx4_ib_init_alias_guid_service(struct mlx4_ib_dev *dev) +{ + char alias_wq_name[15]; + int ret = 0; + int i, j, k; + union ib_gid gid; + + if (!mlx4_is_master(dev->dev)) + return 0; + dev->sriov.alias_guid.sa_client = + kzalloc(sizeof *dev->sriov.alias_guid.sa_client, GFP_KERNEL); + if (!dev->sriov.alias_guid.sa_client) + return -ENOMEM; + + ib_sa_register_client(dev->sriov.alias_guid.sa_client); + + spin_lock_init(&dev->sriov.alias_guid.ag_work_lock); + + for (i = 1; i <= dev->num_ports; ++i) { + if (dev->ib_dev.query_gid(&dev->ib_dev , i, 0, &gid)) { + ret = -EFAULT; + goto err_unregister; + } + } + + for (i = 0 ; i < dev->num_ports; i++) { + memset(&dev->sriov.alias_guid.ports_guid[i], 0, + sizeof (struct mlx4_sriov_alias_guid_port_rec_det)); + /*Check if the SM doesn't need to assign the GUIDs*/ + for (j = 0; j < NUM_ALIAS_GUID_REC_IN_PORT; j++) { + if (mlx4_ib_sm_guid_assign) { + dev->sriov.alias_guid.ports_guid[i]. + all_rec_per_port[j]. + ownership = MLX4_GUID_DRIVER_ASSIGN; + continue; + } + dev->sriov.alias_guid.ports_guid[i].all_rec_per_port[j]. + ownership = MLX4_GUID_NONE_ASSIGN; + /*mark each val as it was deleted, + till the sysAdmin will give it valid val*/ + for (k = 0; k < NUM_ALIAS_GUID_IN_REC; k++) { + *(__be64 *)&dev->sriov.alias_guid.ports_guid[i]. + all_rec_per_port[j].all_recs[GUID_REC_SIZE * k] = + cpu_to_be64(MLX4_GUID_FOR_DELETE_VAL); + } + } + INIT_LIST_HEAD(&dev->sriov.alias_guid.ports_guid[i].cb_list); + /*prepare the records, set them to be allocated by sm*/ + for (j = 0 ; j < NUM_ALIAS_GUID_REC_IN_PORT; j++) + invalidate_guid_record(dev, i + 1, j); + + dev->sriov.alias_guid.ports_guid[i].parent = &dev->sriov.alias_guid; + dev->sriov.alias_guid.ports_guid[i].port = i; + if (mlx4_ib_sm_guid_assign) + set_all_slaves_guids(dev, i); + + snprintf(alias_wq_name, sizeof alias_wq_name, "alias_guid%d", i); + dev->sriov.alias_guid.ports_guid[i].wq = + create_singlethread_workqueue(alias_wq_name); + if (!dev->sriov.alias_guid.ports_guid[i].wq) { + ret = -ENOMEM; + goto err_thread; + } + INIT_DELAYED_WORK(&dev->sriov.alias_guid.ports_guid[i].alias_guid_work, + alias_guid_work); + } + return 0; + +err_thread: + for (--i; i >= 0; i--) { + destroy_workqueue(dev->sriov.alias_guid.ports_guid[i].wq); + dev->sriov.alias_guid.ports_guid[i].wq = NULL; + } + +err_unregister: + ib_sa_unregister_client(dev->sriov.alias_guid.sa_client); + kfree(dev->sriov.alias_guid.sa_client); + dev->sriov.alias_guid.sa_client = NULL; + pr_err("init_alias_guid_service: Failed. (ret:%d)\n", ret); + return ret; +} diff --git a/drivers/infiniband/hw/mlx4/cm.c b/drivers/infiniband/hw/mlx4/cm.c new file mode 100644 index 0000000..56a593e --- /dev/null +++ b/drivers/infiniband/hw/mlx4/cm.c @@ -0,0 +1,478 @@ +/* + * Copyright (c) 2012 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include + +#include +#include +#include +#include + +#include "mlx4_ib.h" + +#define CM_CLEANUP_CACHE_TIMEOUT (5 * HZ) + +struct id_map_entry { + struct rb_node node; + + u32 sl_cm_id; + u32 pv_cm_id; + int slave_id; + int scheduled_delete; + struct mlx4_ib_dev *dev; + + struct list_head list; + struct delayed_work timeout; +}; + +struct cm_generic_msg { + struct ib_mad_hdr hdr; + + __be32 local_comm_id; + __be32 remote_comm_id; +}; + +struct cm_sidr_generic_msg { + struct ib_mad_hdr hdr; + __be32 request_id; +}; + +struct cm_req_msg { + unsigned char unused[0x60]; + union ib_gid primary_path_sgid; +}; + + +static void set_local_comm_id(struct ib_mad *mad, u32 cm_id) +{ + if (mad->mad_hdr.attr_id == CM_SIDR_REQ_ATTR_ID) { + struct cm_sidr_generic_msg *msg = + (struct cm_sidr_generic_msg *)mad; + msg->request_id = cpu_to_be32(cm_id); + } else if (mad->mad_hdr.attr_id == CM_SIDR_REP_ATTR_ID) { + pr_err("trying to set local_comm_id in SIDR_REP\n"); + return; + } else { + struct cm_generic_msg *msg = (struct cm_generic_msg *)mad; + msg->local_comm_id = cpu_to_be32(cm_id); + } +} + +static u32 get_local_comm_id(struct ib_mad *mad) +{ + if (mad->mad_hdr.attr_id == CM_SIDR_REQ_ATTR_ID) { + struct cm_sidr_generic_msg *msg = + (struct cm_sidr_generic_msg *)mad; + return be32_to_cpu(msg->request_id); + } else if (mad->mad_hdr.attr_id == CM_SIDR_REP_ATTR_ID) { + pr_err("trying to set local_comm_id in SIDR_REP\n"); + return -1; + } else { + struct cm_generic_msg *msg = (struct cm_generic_msg *)mad; + return be32_to_cpu(msg->local_comm_id); + } +} + +static void set_remote_comm_id(struct ib_mad *mad, u32 cm_id) +{ + if (mad->mad_hdr.attr_id == CM_SIDR_REP_ATTR_ID) { + struct cm_sidr_generic_msg *msg = + (struct cm_sidr_generic_msg *)mad; + msg->request_id = cpu_to_be32(cm_id); + } else if (mad->mad_hdr.attr_id == CM_SIDR_REQ_ATTR_ID) { + pr_err("trying to set remote_comm_id in SIDR_REQ\n"); + return; + } else { + struct cm_generic_msg *msg = (struct cm_generic_msg *)mad; + msg->remote_comm_id = cpu_to_be32(cm_id); + } +} + +static u32 get_remote_comm_id(struct ib_mad *mad) +{ + if (mad->mad_hdr.attr_id == CM_SIDR_REP_ATTR_ID) { + struct cm_sidr_generic_msg *msg = + (struct cm_sidr_generic_msg *)mad; + return be32_to_cpu(msg->request_id); + } else if (mad->mad_hdr.attr_id == CM_SIDR_REQ_ATTR_ID) { + pr_err("trying to set remote_comm_id in SIDR_REQ\n"); + return -1; + } else { + struct cm_generic_msg *msg = (struct cm_generic_msg *)mad; + return be32_to_cpu(msg->remote_comm_id); + } +} + +static union ib_gid gid_from_req_msg(struct ib_device *ibdev, struct ib_mad *mad) +{ + struct cm_req_msg *msg = (struct cm_req_msg *)mad; + + return msg->primary_path_sgid; +} + +/* Lock should be taken before called */ +static struct id_map_entry * +id_map_find_by_sl_id(struct ib_device *ibdev, u32 slave_id, u32 sl_cm_id) +{ + struct rb_root *sl_id_map = &to_mdev(ibdev)->sriov.sl_id_map; + struct rb_node *node = sl_id_map->rb_node; + + while (node) { + struct id_map_entry *id_map_entry = + rb_entry(node, struct id_map_entry, node); + + if (id_map_entry->sl_cm_id > sl_cm_id) + node = node->rb_left; + else if (id_map_entry->sl_cm_id < sl_cm_id) + node = node->rb_right; + else if (id_map_entry->slave_id > slave_id) + node = node->rb_left; + else if (id_map_entry->slave_id < slave_id) + node = node->rb_right; + else + return id_map_entry; + } + return NULL; +} + +static void id_map_ent_timeout(struct work_struct *work) +{ + struct delayed_work *delay = to_delayed_work(work); + struct id_map_entry *ent = container_of(delay, struct id_map_entry, timeout); + struct id_map_entry *db_ent, *found_ent; + struct mlx4_ib_dev *dev = ent->dev; + struct mlx4_ib_sriov *sriov = &dev->sriov; + struct rb_root *sl_id_map = &sriov->sl_id_map; + int pv_id = (int) ent->pv_cm_id; + + spin_lock(&sriov->id_map_lock); + db_ent = (struct id_map_entry *)idr_find(&sriov->pv_id_table, pv_id); + if (!db_ent) + goto out; + found_ent = id_map_find_by_sl_id(&dev->ib_dev, ent->slave_id, ent->sl_cm_id); + if (found_ent && found_ent == ent) + rb_erase(&found_ent->node, sl_id_map); + idr_remove(&sriov->pv_id_table, pv_id); + +out: + list_del(&ent->list); + spin_unlock(&sriov->id_map_lock); + kfree(ent); +} + +static void id_map_find_del(struct ib_device *ibdev, int pv_cm_id) +{ + struct mlx4_ib_sriov *sriov = &to_mdev(ibdev)->sriov; + struct rb_root *sl_id_map = &sriov->sl_id_map; + struct id_map_entry *ent, *found_ent; + + spin_lock(&sriov->id_map_lock); + ent = (struct id_map_entry *)idr_find(&sriov->pv_id_table, pv_cm_id); + if (!ent) + goto out; + found_ent = id_map_find_by_sl_id(ibdev, ent->slave_id, ent->sl_cm_id); + if (found_ent && found_ent == ent) + rb_erase(&found_ent->node, sl_id_map); + idr_remove(&sriov->pv_id_table, pv_cm_id); +out: + spin_unlock(&sriov->id_map_lock); +} + +static void sl_id_map_add(struct ib_device *ibdev, struct id_map_entry *new) +{ + struct rb_root *sl_id_map = &to_mdev(ibdev)->sriov.sl_id_map; + struct rb_node **link = &sl_id_map->rb_node, *parent = NULL; + struct id_map_entry *ent; + int slave_id = new->slave_id; + int sl_cm_id = new->sl_cm_id; + + ent = id_map_find_by_sl_id(ibdev, slave_id, sl_cm_id); + if (ent) { + pr_debug("overriding existing sl_id_map entry (cm_id = %x)\n", + sl_cm_id); + + rb_replace_node(&ent->node, &new->node, sl_id_map); + return; + } + + /* Go to the bottom of the tree */ + while (*link) { + parent = *link; + ent = rb_entry(parent, struct id_map_entry, node); + + if (ent->sl_cm_id > sl_cm_id || (ent->sl_cm_id == sl_cm_id && ent->slave_id > slave_id)) + link = &(*link)->rb_left; + else + link = &(*link)->rb_right; + } + + rb_link_node(&new->node, parent, link); + rb_insert_color(&new->node, sl_id_map); +} + +static struct id_map_entry * +id_map_alloc(struct ib_device *ibdev, int slave_id, u32 sl_cm_id) +{ + int ret; + struct id_map_entry *ent; + struct mlx4_ib_sriov *sriov = &to_mdev(ibdev)->sriov; + + ent = kmalloc(sizeof (struct id_map_entry), GFP_KERNEL); + if (!ent) { + mlx4_ib_warn(ibdev, "Couldn't allocate id cache entry - out of memory\n"); + return ERR_PTR(-ENOMEM); + } + + ent->sl_cm_id = sl_cm_id; + ent->slave_id = slave_id; + ent->scheduled_delete = 0; + ent->dev = to_mdev(ibdev); + INIT_DELAYED_WORK(&ent->timeout, id_map_ent_timeout); + + idr_preload(GFP_KERNEL); + spin_lock(&to_mdev(ibdev)->sriov.id_map_lock); + + ret = idr_alloc_cyclic(&sriov->pv_id_table, ent, 0, 0, GFP_NOWAIT); + if (ret >= 0) { + ent->pv_cm_id = (u32)ret; + sl_id_map_add(ibdev, ent); + list_add_tail(&ent->list, &sriov->cm_list); + } + + spin_unlock(&sriov->id_map_lock); + idr_preload_end(); + + if (ret >= 0) + return ent; + + /*error flow*/ + kfree(ent); + mlx4_ib_warn(ibdev, "No more space in the idr (err:0x%x)\n", ret); + return ERR_PTR(-ENOMEM); +} + +static struct id_map_entry * +id_map_get(struct ib_device *ibdev, int *pv_cm_id, int sl_cm_id, int slave_id) +{ + struct id_map_entry *ent; + struct mlx4_ib_sriov *sriov = &to_mdev(ibdev)->sriov; + + spin_lock(&sriov->id_map_lock); + if (*pv_cm_id == -1) { + ent = id_map_find_by_sl_id(ibdev, sl_cm_id, slave_id); + if (ent) + *pv_cm_id = (int) ent->pv_cm_id; + } else + ent = (struct id_map_entry *)idr_find(&sriov->pv_id_table, *pv_cm_id); + spin_unlock(&sriov->id_map_lock); + + return ent; +} + +static void schedule_delayed(struct ib_device *ibdev, struct id_map_entry *id) +{ + struct mlx4_ib_sriov *sriov = &to_mdev(ibdev)->sriov; + unsigned long flags; + + spin_lock(&sriov->id_map_lock); + spin_lock_irqsave(&sriov->going_down_lock, flags); + /*make sure that there is no schedule inside the scheduled work.*/ + if (!sriov->is_going_down) { + id->scheduled_delete = 1; + schedule_delayed_work(&id->timeout, CM_CLEANUP_CACHE_TIMEOUT); + } + spin_unlock_irqrestore(&sriov->going_down_lock, flags); + spin_unlock(&sriov->id_map_lock); +} + +int mlx4_ib_multiplex_cm_handler(struct ib_device *ibdev, int port, int slave_id, + struct ib_mad *mad) +{ + struct id_map_entry *id; + u32 sl_cm_id; + int pv_cm_id = -1; + + if (mad->mad_hdr.attr_id == CM_REQ_ATTR_ID || + mad->mad_hdr.attr_id == CM_REP_ATTR_ID || + mad->mad_hdr.attr_id == CM_SIDR_REQ_ATTR_ID) { + sl_cm_id = get_local_comm_id(mad); + id = id_map_alloc(ibdev, slave_id, sl_cm_id); + if (IS_ERR(id)) { + mlx4_ib_warn(ibdev, "%s: id{slave: %d, sl_cm_id: 0x%x} Failed to id_map_alloc\n", + __func__, slave_id, sl_cm_id); + return PTR_ERR(id); + } + } else if (mad->mad_hdr.attr_id == CM_REJ_ATTR_ID || + mad->mad_hdr.attr_id == CM_SIDR_REP_ATTR_ID) { + return 0; + } else { + sl_cm_id = get_local_comm_id(mad); + id = id_map_get(ibdev, &pv_cm_id, slave_id, sl_cm_id); + } + + if (!id) { + pr_debug("id{slave: %d, sl_cm_id: 0x%x} is NULL!\n", + slave_id, sl_cm_id); + return -EINVAL; + } + + set_local_comm_id(mad, id->pv_cm_id); + + if (mad->mad_hdr.attr_id == CM_DREQ_ATTR_ID) + schedule_delayed(ibdev, id); + else if (mad->mad_hdr.attr_id == CM_DREP_ATTR_ID) + id_map_find_del(ibdev, pv_cm_id); + + return 0; +} + +int mlx4_ib_demux_cm_handler(struct ib_device *ibdev, int port, int *slave, + struct ib_mad *mad) +{ + u32 pv_cm_id; + struct id_map_entry *id; + + if (mad->mad_hdr.attr_id == CM_REQ_ATTR_ID || + mad->mad_hdr.attr_id == CM_SIDR_REQ_ATTR_ID) { + union ib_gid gid; + + if (!slave) + return 0; + + gid = gid_from_req_msg(ibdev, mad); + *slave = mlx4_ib_find_real_gid(ibdev, port, gid.global.interface_id); + if (*slave < 0) { + mlx4_ib_warn(ibdev, "failed matching slave_id by gid (0x%llx)\n", + gid.global.interface_id); + return -ENOENT; + } + return 0; + } + + pv_cm_id = get_remote_comm_id(mad); + id = id_map_get(ibdev, (int *)&pv_cm_id, -1, -1); + + if (!id) { + pr_debug("Couldn't find an entry for pv_cm_id 0x%x\n", pv_cm_id); + return -ENOENT; + } + + if (slave) + *slave = id->slave_id; + set_remote_comm_id(mad, id->sl_cm_id); + + if (mad->mad_hdr.attr_id == CM_DREQ_ATTR_ID) + schedule_delayed(ibdev, id); + else if (mad->mad_hdr.attr_id == CM_REJ_ATTR_ID || + mad->mad_hdr.attr_id == CM_DREP_ATTR_ID) { + id_map_find_del(ibdev, (int) pv_cm_id); + } + + return 0; +} + +void mlx4_ib_cm_paravirt_init(struct mlx4_ib_dev *dev) +{ + spin_lock_init(&dev->sriov.id_map_lock); + INIT_LIST_HEAD(&dev->sriov.cm_list); + dev->sriov.sl_id_map = RB_ROOT; + idr_init(&dev->sriov.pv_id_table); +} + +/* slave = -1 ==> all slaves */ +/* TBD -- call paravirt clean for single slave. Need for slave RESET event */ +void mlx4_ib_cm_paravirt_clean(struct mlx4_ib_dev *dev, int slave) +{ + struct mlx4_ib_sriov *sriov = &dev->sriov; + struct rb_root *sl_id_map = &sriov->sl_id_map; + struct list_head lh; + struct rb_node *nd; + int need_flush = 1; + struct id_map_entry *map, *tmp_map; + /* cancel all delayed work queue entries */ + INIT_LIST_HEAD(&lh); + spin_lock(&sriov->id_map_lock); + list_for_each_entry_safe(map, tmp_map, &dev->sriov.cm_list, list) { + if (slave < 0 || slave == map->slave_id) { + if (map->scheduled_delete) + need_flush &= !!cancel_delayed_work(&map->timeout); + } + } + + spin_unlock(&sriov->id_map_lock); + + if (!need_flush) + flush_scheduled_work(); /* make sure all timers were flushed */ + + /* now, remove all leftover entries from databases*/ + spin_lock(&sriov->id_map_lock); + if (slave < 0) { + while (rb_first(sl_id_map)) { + struct id_map_entry *ent = + rb_entry(rb_first(sl_id_map), + struct id_map_entry, node); + + rb_erase(&ent->node, sl_id_map); + idr_remove(&sriov->pv_id_table, (int) ent->pv_cm_id); + } + list_splice_init(&dev->sriov.cm_list, &lh); + } else { + /* first, move nodes belonging to slave to db remove list */ + nd = rb_first(sl_id_map); + while (nd) { + struct id_map_entry *ent = + rb_entry(nd, struct id_map_entry, node); + nd = rb_next(nd); + if (ent->slave_id == slave) + list_move_tail(&ent->list, &lh); + } + /* remove those nodes from databases */ + list_for_each_entry_safe(map, tmp_map, &lh, list) { + rb_erase(&map->node, sl_id_map); + idr_remove(&sriov->pv_id_table, (int) map->pv_cm_id); + } + + /* add remaining nodes from cm_list */ + list_for_each_entry_safe(map, tmp_map, &dev->sriov.cm_list, list) { + if (slave == map->slave_id) + list_move_tail(&map->list, &lh); + } + } + + spin_unlock(&sriov->id_map_lock); + + /* free any map entries left behind due to cancel_delayed_work above */ + list_for_each_entry_safe(map, tmp_map, &lh, list) { + list_del(&map->list); + kfree(map); + } +} diff --git a/drivers/infiniband/hw/mlx4/cq.c b/drivers/infiniband/hw/mlx4/cq.c new file mode 100644 index 0000000..1066eec --- /dev/null +++ b/drivers/infiniband/hw/mlx4/cq.c @@ -0,0 +1,924 @@ +/* + * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include + +#include "mlx4_ib.h" +#include "user.h" + +static void mlx4_ib_cq_comp(struct mlx4_cq *cq) +{ + struct ib_cq *ibcq = &to_mibcq(cq)->ibcq; + ibcq->comp_handler(ibcq, ibcq->cq_context); +} + +static void mlx4_ib_cq_event(struct mlx4_cq *cq, enum mlx4_event type) +{ + struct ib_event event; + struct ib_cq *ibcq; + + if (type != MLX4_EVENT_TYPE_CQ_ERROR) { + pr_warn("Unexpected event type %d " + "on CQ %06x\n", type, cq->cqn); + return; + } + + ibcq = &to_mibcq(cq)->ibcq; + if (ibcq->event_handler) { + event.device = ibcq->device; + event.event = IB_EVENT_CQ_ERR; + event.element.cq = ibcq; + ibcq->event_handler(&event, ibcq->cq_context); + } +} + +static void *get_cqe_from_buf(struct mlx4_ib_cq_buf *buf, int n) +{ + return mlx4_buf_offset(&buf->buf, n * buf->entry_size); +} + +static void *get_cqe(struct mlx4_ib_cq *cq, int n) +{ + return get_cqe_from_buf(&cq->buf, n); +} + +static void *get_sw_cqe(struct mlx4_ib_cq *cq, int n) +{ + struct mlx4_cqe *cqe = get_cqe(cq, n & cq->ibcq.cqe); + struct mlx4_cqe *tcqe = ((cq->buf.entry_size == 64) ? (cqe + 1) : cqe); + + return (!!(tcqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK) ^ + !!(n & (cq->ibcq.cqe + 1))) ? NULL : cqe; +} + +static struct mlx4_cqe *next_cqe_sw(struct mlx4_ib_cq *cq) +{ + return get_sw_cqe(cq, cq->mcq.cons_index); +} + +int mlx4_ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period) +{ + struct mlx4_ib_cq *mcq = to_mcq(cq); + struct mlx4_ib_dev *dev = to_mdev(cq->device); + + return mlx4_cq_modify(dev->dev, &mcq->mcq, cq_count, cq_period); +} + +static int mlx4_ib_alloc_cq_buf(struct mlx4_ib_dev *dev, struct mlx4_ib_cq_buf *buf, int nent) +{ + int err; + + err = mlx4_buf_alloc(dev->dev, nent * dev->dev->caps.cqe_size, + PAGE_SIZE * 2, &buf->buf, GFP_KERNEL); + + if (err) + goto out; + + buf->entry_size = dev->dev->caps.cqe_size; + err = mlx4_mtt_init(dev->dev, buf->buf.npages, buf->buf.page_shift, + &buf->mtt); + if (err) + goto err_buf; + + err = mlx4_buf_write_mtt(dev->dev, &buf->mtt, &buf->buf, GFP_KERNEL); + if (err) + goto err_mtt; + + return 0; + +err_mtt: + mlx4_mtt_cleanup(dev->dev, &buf->mtt); + +err_buf: + mlx4_buf_free(dev->dev, nent * buf->entry_size, &buf->buf); + +out: + return err; +} + +static void mlx4_ib_free_cq_buf(struct mlx4_ib_dev *dev, struct mlx4_ib_cq_buf *buf, int cqe) +{ + mlx4_buf_free(dev->dev, (cqe + 1) * buf->entry_size, &buf->buf); +} + +static int mlx4_ib_get_cq_umem(struct mlx4_ib_dev *dev, struct ib_ucontext *context, + struct mlx4_ib_cq_buf *buf, struct ib_umem **umem, + u64 buf_addr, int cqe) +{ + int err; + int cqe_size = dev->dev->caps.cqe_size; + + *umem = ib_umem_get(context, buf_addr, cqe * cqe_size, + IB_ACCESS_LOCAL_WRITE, 1); + if (IS_ERR(*umem)) + return PTR_ERR(*umem); + + err = mlx4_mtt_init(dev->dev, ib_umem_page_count(*umem), + ilog2((*umem)->page_size), &buf->mtt); + if (err) + goto err_buf; + + err = mlx4_ib_umem_write_mtt(dev, &buf->mtt, *umem); + if (err) + goto err_mtt; + + return 0; + +err_mtt: + mlx4_mtt_cleanup(dev->dev, &buf->mtt); + +err_buf: + ib_umem_release(*umem); + + return err; +} + +struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev, int entries, int vector, + struct ib_ucontext *context, + struct ib_udata *udata) +{ + struct mlx4_ib_dev *dev = to_mdev(ibdev); + struct mlx4_ib_cq *cq; + struct mlx4_uar *uar; + int err; + + if (entries < 1 || entries > dev->dev->caps.max_cqes) + return ERR_PTR(-EINVAL); + + cq = kmalloc(sizeof *cq, GFP_KERNEL); + if (!cq) + return ERR_PTR(-ENOMEM); + + entries = roundup_pow_of_two(entries + 1); + cq->ibcq.cqe = entries - 1; + mutex_init(&cq->resize_mutex); + spin_lock_init(&cq->lock); + cq->resize_buf = NULL; + cq->resize_umem = NULL; + + if (context) { + struct mlx4_ib_create_cq ucmd; + + if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd)) { + err = -EFAULT; + goto err_cq; + } + + err = mlx4_ib_get_cq_umem(dev, context, &cq->buf, &cq->umem, + ucmd.buf_addr, entries); + if (err) + goto err_cq; + + err = mlx4_ib_db_map_user(to_mucontext(context), ucmd.db_addr, + &cq->db); + if (err) + goto err_mtt; + + uar = &to_mucontext(context)->uar; + } else { + err = mlx4_db_alloc(dev->dev, &cq->db, 1, GFP_KERNEL); + if (err) + goto err_cq; + + cq->mcq.set_ci_db = cq->db.db; + cq->mcq.arm_db = cq->db.db + 1; + *cq->mcq.set_ci_db = 0; + *cq->mcq.arm_db = 0; + + err = mlx4_ib_alloc_cq_buf(dev, &cq->buf, entries); + if (err) + goto err_db; + + uar = &dev->priv_uar; + } + + if (dev->eq_table) + vector = dev->eq_table[vector % ibdev->num_comp_vectors]; + + err = mlx4_cq_alloc(dev->dev, entries, &cq->buf.mtt, uar, + cq->db.dma, &cq->mcq, vector, 0, 0); + if (err) + goto err_dbmap; + + cq->mcq.comp = mlx4_ib_cq_comp; + cq->mcq.event = mlx4_ib_cq_event; + + if (context) + if (ib_copy_to_udata(udata, &cq->mcq.cqn, sizeof (__u32))) { + err = -EFAULT; + goto err_dbmap; + } + + return &cq->ibcq; + +err_dbmap: + if (context) + mlx4_ib_db_unmap_user(to_mucontext(context), &cq->db); + +err_mtt: + mlx4_mtt_cleanup(dev->dev, &cq->buf.mtt); + + if (context) + ib_umem_release(cq->umem); + else + mlx4_ib_free_cq_buf(dev, &cq->buf, cq->ibcq.cqe); + +err_db: + if (!context) + mlx4_db_free(dev->dev, &cq->db); + +err_cq: + kfree(cq); + + return ERR_PTR(err); +} + +static int mlx4_alloc_resize_buf(struct mlx4_ib_dev *dev, struct mlx4_ib_cq *cq, + int entries) +{ + int err; + + if (cq->resize_buf) + return -EBUSY; + + cq->resize_buf = kmalloc(sizeof *cq->resize_buf, GFP_ATOMIC); + if (!cq->resize_buf) + return -ENOMEM; + + err = mlx4_ib_alloc_cq_buf(dev, &cq->resize_buf->buf, entries); + if (err) { + kfree(cq->resize_buf); + cq->resize_buf = NULL; + return err; + } + + cq->resize_buf->cqe = entries - 1; + + return 0; +} + +static int mlx4_alloc_resize_umem(struct mlx4_ib_dev *dev, struct mlx4_ib_cq *cq, + int entries, struct ib_udata *udata) +{ + struct mlx4_ib_resize_cq ucmd; + int err; + + if (cq->resize_umem) + return -EBUSY; + + if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd)) + return -EFAULT; + + cq->resize_buf = kmalloc(sizeof *cq->resize_buf, GFP_ATOMIC); + if (!cq->resize_buf) + return -ENOMEM; + + err = mlx4_ib_get_cq_umem(dev, cq->umem->context, &cq->resize_buf->buf, + &cq->resize_umem, ucmd.buf_addr, entries); + if (err) { + kfree(cq->resize_buf); + cq->resize_buf = NULL; + return err; + } + + cq->resize_buf->cqe = entries - 1; + + return 0; +} + +static int mlx4_ib_get_outstanding_cqes(struct mlx4_ib_cq *cq) +{ + u32 i; + + i = cq->mcq.cons_index; + while (get_sw_cqe(cq, i)) + ++i; + + return i - cq->mcq.cons_index; +} + +static void mlx4_ib_cq_resize_copy_cqes(struct mlx4_ib_cq *cq) +{ + struct mlx4_cqe *cqe, *new_cqe; + int i; + int cqe_size = cq->buf.entry_size; + int cqe_inc = cqe_size == 64 ? 1 : 0; + + i = cq->mcq.cons_index; + cqe = get_cqe(cq, i & cq->ibcq.cqe); + cqe += cqe_inc; + + while ((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) != MLX4_CQE_OPCODE_RESIZE) { + new_cqe = get_cqe_from_buf(&cq->resize_buf->buf, + (i + 1) & cq->resize_buf->cqe); + memcpy(new_cqe, get_cqe(cq, i & cq->ibcq.cqe), cqe_size); + new_cqe += cqe_inc; + + new_cqe->owner_sr_opcode = (cqe->owner_sr_opcode & ~MLX4_CQE_OWNER_MASK) | + (((i + 1) & (cq->resize_buf->cqe + 1)) ? MLX4_CQE_OWNER_MASK : 0); + cqe = get_cqe(cq, ++i & cq->ibcq.cqe); + cqe += cqe_inc; + } + ++cq->mcq.cons_index; +} + +int mlx4_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata) +{ + struct mlx4_ib_dev *dev = to_mdev(ibcq->device); + struct mlx4_ib_cq *cq = to_mcq(ibcq); + struct mlx4_mtt mtt; + int outst_cqe; + int err; + + mutex_lock(&cq->resize_mutex); + + if (entries < 1) { + err = -EINVAL; + goto out; + } + + entries = roundup_pow_of_two(entries + 1); + if (entries == ibcq->cqe + 1) { + err = 0; + goto out; + } + + if (entries > dev->dev->caps.max_cqes) { + err = -EINVAL; + goto out; + } + + if (ibcq->uobject) { + err = mlx4_alloc_resize_umem(dev, cq, entries, udata); + if (err) + goto out; + } else { + /* Can't be smaller than the number of outstanding CQEs */ + outst_cqe = mlx4_ib_get_outstanding_cqes(cq); + if (entries < outst_cqe + 1) { + err = 0; + goto out; + } + + err = mlx4_alloc_resize_buf(dev, cq, entries); + if (err) + goto out; + } + + mtt = cq->buf.mtt; + + err = mlx4_cq_resize(dev->dev, &cq->mcq, entries, &cq->resize_buf->buf.mtt); + if (err) + goto err_buf; + + mlx4_mtt_cleanup(dev->dev, &mtt); + if (ibcq->uobject) { + cq->buf = cq->resize_buf->buf; + cq->ibcq.cqe = cq->resize_buf->cqe; + ib_umem_release(cq->umem); + cq->umem = cq->resize_umem; + + kfree(cq->resize_buf); + cq->resize_buf = NULL; + cq->resize_umem = NULL; + } else { + struct mlx4_ib_cq_buf tmp_buf; + int tmp_cqe = 0; + + spin_lock_irq(&cq->lock); + if (cq->resize_buf) { + mlx4_ib_cq_resize_copy_cqes(cq); + tmp_buf = cq->buf; + tmp_cqe = cq->ibcq.cqe; + cq->buf = cq->resize_buf->buf; + cq->ibcq.cqe = cq->resize_buf->cqe; + + kfree(cq->resize_buf); + cq->resize_buf = NULL; + } + spin_unlock_irq(&cq->lock); + + if (tmp_cqe) + mlx4_ib_free_cq_buf(dev, &tmp_buf, tmp_cqe); + } + + goto out; + +err_buf: + mlx4_mtt_cleanup(dev->dev, &cq->resize_buf->buf.mtt); + if (!ibcq->uobject) + mlx4_ib_free_cq_buf(dev, &cq->resize_buf->buf, + cq->resize_buf->cqe); + + kfree(cq->resize_buf); + cq->resize_buf = NULL; + + if (cq->resize_umem) { + ib_umem_release(cq->resize_umem); + cq->resize_umem = NULL; + } + +out: + mutex_unlock(&cq->resize_mutex); + + return err; +} + +int mlx4_ib_destroy_cq(struct ib_cq *cq) +{ + struct mlx4_ib_dev *dev = to_mdev(cq->device); + struct mlx4_ib_cq *mcq = to_mcq(cq); + + mlx4_cq_free(dev->dev, &mcq->mcq); + mlx4_mtt_cleanup(dev->dev, &mcq->buf.mtt); + + if (cq->uobject) { + mlx4_ib_db_unmap_user(to_mucontext(cq->uobject->context), &mcq->db); + ib_umem_release(mcq->umem); + } else { + mlx4_ib_free_cq_buf(dev, &mcq->buf, cq->cqe); + mlx4_db_free(dev->dev, &mcq->db); + } + + kfree(mcq); + + return 0; +} + +static void dump_cqe(void *cqe) +{ + __be32 *buf = cqe; + + pr_debug("CQE contents %08x %08x %08x %08x %08x %08x %08x %08x\n", + be32_to_cpu(buf[0]), be32_to_cpu(buf[1]), be32_to_cpu(buf[2]), + be32_to_cpu(buf[3]), be32_to_cpu(buf[4]), be32_to_cpu(buf[5]), + be32_to_cpu(buf[6]), be32_to_cpu(buf[7])); +} + +static void mlx4_ib_handle_error_cqe(struct mlx4_err_cqe *cqe, + struct ib_wc *wc) +{ + if (cqe->syndrome == MLX4_CQE_SYNDROME_LOCAL_QP_OP_ERR) { + pr_debug("local QP operation err " + "(QPN %06x, WQE index %x, vendor syndrome %02x, " + "opcode = %02x)\n", + be32_to_cpu(cqe->my_qpn), be16_to_cpu(cqe->wqe_index), + cqe->vendor_err_syndrome, + cqe->owner_sr_opcode & ~MLX4_CQE_OWNER_MASK); + dump_cqe(cqe); + } + + switch (cqe->syndrome) { + case MLX4_CQE_SYNDROME_LOCAL_LENGTH_ERR: + wc->status = IB_WC_LOC_LEN_ERR; + break; + case MLX4_CQE_SYNDROME_LOCAL_QP_OP_ERR: + wc->status = IB_WC_LOC_QP_OP_ERR; + break; + case MLX4_CQE_SYNDROME_LOCAL_PROT_ERR: + wc->status = IB_WC_LOC_PROT_ERR; + break; + case MLX4_CQE_SYNDROME_WR_FLUSH_ERR: + wc->status = IB_WC_WR_FLUSH_ERR; + break; + case MLX4_CQE_SYNDROME_MW_BIND_ERR: + wc->status = IB_WC_MW_BIND_ERR; + break; + case MLX4_CQE_SYNDROME_BAD_RESP_ERR: + wc->status = IB_WC_BAD_RESP_ERR; + break; + case MLX4_CQE_SYNDROME_LOCAL_ACCESS_ERR: + wc->status = IB_WC_LOC_ACCESS_ERR; + break; + case MLX4_CQE_SYNDROME_REMOTE_INVAL_REQ_ERR: + wc->status = IB_WC_REM_INV_REQ_ERR; + break; + case MLX4_CQE_SYNDROME_REMOTE_ACCESS_ERR: + wc->status = IB_WC_REM_ACCESS_ERR; + break; + case MLX4_CQE_SYNDROME_REMOTE_OP_ERR: + wc->status = IB_WC_REM_OP_ERR; + break; + case MLX4_CQE_SYNDROME_TRANSPORT_RETRY_EXC_ERR: + wc->status = IB_WC_RETRY_EXC_ERR; + break; + case MLX4_CQE_SYNDROME_RNR_RETRY_EXC_ERR: + wc->status = IB_WC_RNR_RETRY_EXC_ERR; + break; + case MLX4_CQE_SYNDROME_REMOTE_ABORTED_ERR: + wc->status = IB_WC_REM_ABORT_ERR; + break; + default: + wc->status = IB_WC_GENERAL_ERR; + break; + } + + wc->vendor_err = cqe->vendor_err_syndrome; +} + +static int mlx4_ib_ipoib_csum_ok(__be16 status, __be16 checksum) +{ + return ((status & cpu_to_be16(MLX4_CQE_STATUS_IPV4 | + MLX4_CQE_STATUS_IPV4F | + MLX4_CQE_STATUS_IPV4OPT | + MLX4_CQE_STATUS_IPV6 | + MLX4_CQE_STATUS_IPOK)) == + cpu_to_be16(MLX4_CQE_STATUS_IPV4 | + MLX4_CQE_STATUS_IPOK)) && + (status & cpu_to_be16(MLX4_CQE_STATUS_UDP | + MLX4_CQE_STATUS_TCP)) && + checksum == cpu_to_be16(0xffff); +} + +static int use_tunnel_data(struct mlx4_ib_qp *qp, struct mlx4_ib_cq *cq, struct ib_wc *wc, + unsigned tail, struct mlx4_cqe *cqe, int is_eth) +{ + struct mlx4_ib_proxy_sqp_hdr *hdr; + + ib_dma_sync_single_for_cpu(qp->ibqp.device, + qp->sqp_proxy_rcv[tail].map, + sizeof (struct mlx4_ib_proxy_sqp_hdr), + DMA_FROM_DEVICE); + hdr = (struct mlx4_ib_proxy_sqp_hdr *) (qp->sqp_proxy_rcv[tail].addr); + wc->pkey_index = be16_to_cpu(hdr->tun.pkey_index); + wc->src_qp = be32_to_cpu(hdr->tun.flags_src_qp) & 0xFFFFFF; + wc->wc_flags |= (hdr->tun.g_ml_path & 0x80) ? (IB_WC_GRH) : 0; + wc->dlid_path_bits = 0; + + if (is_eth) { + wc->vlan_id = be16_to_cpu(hdr->tun.sl_vid); + memcpy(&(wc->smac[0]), (char *)&hdr->tun.mac_31_0, 4); + memcpy(&(wc->smac[4]), (char *)&hdr->tun.slid_mac_47_32, 2); + wc->wc_flags |= (IB_WC_WITH_VLAN | IB_WC_WITH_SMAC); + } else { + wc->slid = be16_to_cpu(hdr->tun.slid_mac_47_32); + wc->sl = (u8) (be16_to_cpu(hdr->tun.sl_vid) >> 12); + } + + return 0; +} + +static int mlx4_ib_poll_one(struct mlx4_ib_cq *cq, + struct mlx4_ib_qp **cur_qp, + struct ib_wc *wc) +{ + struct mlx4_cqe *cqe; + struct mlx4_qp *mqp; + struct mlx4_ib_wq *wq; + struct mlx4_ib_srq *srq; + struct mlx4_srq *msrq = NULL; + int is_send; + int is_error; + int is_eth; + u32 g_mlpath_rqpn; + u16 wqe_ctr; + unsigned tail = 0; + +repoll: + cqe = next_cqe_sw(cq); + if (!cqe) + return -EAGAIN; + + if (cq->buf.entry_size == 64) + cqe++; + + ++cq->mcq.cons_index; + + /* + * Make sure we read CQ entry contents after we've checked the + * ownership bit. + */ + rmb(); + + is_send = cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK; + is_error = (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) == + MLX4_CQE_OPCODE_ERROR; + + if (unlikely((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) == MLX4_OPCODE_NOP && + is_send)) { + pr_warn("Completion for NOP opcode detected!\n"); + return -EINVAL; + } + + /* Resize CQ in progress */ + if (unlikely((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) == MLX4_CQE_OPCODE_RESIZE)) { + if (cq->resize_buf) { + struct mlx4_ib_dev *dev = to_mdev(cq->ibcq.device); + + mlx4_ib_free_cq_buf(dev, &cq->buf, cq->ibcq.cqe); + cq->buf = cq->resize_buf->buf; + cq->ibcq.cqe = cq->resize_buf->cqe; + + kfree(cq->resize_buf); + cq->resize_buf = NULL; + } + + goto repoll; + } + + if (!*cur_qp || + (be32_to_cpu(cqe->vlan_my_qpn) & MLX4_CQE_QPN_MASK) != (*cur_qp)->mqp.qpn) { + /* + * We do not have to take the QP table lock here, + * because CQs will be locked while QPs are removed + * from the table. + */ + mqp = __mlx4_qp_lookup(to_mdev(cq->ibcq.device)->dev, + be32_to_cpu(cqe->vlan_my_qpn)); + if (unlikely(!mqp)) { + pr_warn("CQ %06x with entry for unknown QPN %06x\n", + cq->mcq.cqn, be32_to_cpu(cqe->vlan_my_qpn) & MLX4_CQE_QPN_MASK); + return -EINVAL; + } + + *cur_qp = to_mibqp(mqp); + } + + wc->qp = &(*cur_qp)->ibqp; + + if (wc->qp->qp_type == IB_QPT_XRC_TGT) { + u32 srq_num; + g_mlpath_rqpn = be32_to_cpu(cqe->g_mlpath_rqpn); + srq_num = g_mlpath_rqpn & 0xffffff; + /* SRQ is also in the radix tree */ + msrq = mlx4_srq_lookup(to_mdev(cq->ibcq.device)->dev, + srq_num); + if (unlikely(!msrq)) { + pr_warn("CQ %06x with entry for unknown SRQN %06x\n", + cq->mcq.cqn, srq_num); + return -EINVAL; + } + } + + if (is_send) { + wq = &(*cur_qp)->sq; + if (!(*cur_qp)->sq_signal_bits) { + wqe_ctr = be16_to_cpu(cqe->wqe_index); + wq->tail += (u16) (wqe_ctr - (u16) wq->tail); + } + wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)]; + ++wq->tail; + } else if ((*cur_qp)->ibqp.srq) { + srq = to_msrq((*cur_qp)->ibqp.srq); + wqe_ctr = be16_to_cpu(cqe->wqe_index); + wc->wr_id = srq->wrid[wqe_ctr]; + mlx4_ib_free_srq_wqe(srq, wqe_ctr); + } else if (msrq) { + srq = to_mibsrq(msrq); + wqe_ctr = be16_to_cpu(cqe->wqe_index); + wc->wr_id = srq->wrid[wqe_ctr]; + mlx4_ib_free_srq_wqe(srq, wqe_ctr); + } else { + wq = &(*cur_qp)->rq; + tail = wq->tail & (wq->wqe_cnt - 1); + wc->wr_id = wq->wrid[tail]; + ++wq->tail; + } + + if (unlikely(is_error)) { + mlx4_ib_handle_error_cqe((struct mlx4_err_cqe *) cqe, wc); + return 0; + } + + wc->status = IB_WC_SUCCESS; + + if (is_send) { + wc->wc_flags = 0; + switch (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) { + case MLX4_OPCODE_RDMA_WRITE_IMM: + wc->wc_flags |= IB_WC_WITH_IMM; + case MLX4_OPCODE_RDMA_WRITE: + wc->opcode = IB_WC_RDMA_WRITE; + break; + case MLX4_OPCODE_SEND_IMM: + wc->wc_flags |= IB_WC_WITH_IMM; + case MLX4_OPCODE_SEND: + case MLX4_OPCODE_SEND_INVAL: + wc->opcode = IB_WC_SEND; + break; + case MLX4_OPCODE_RDMA_READ: + wc->opcode = IB_WC_RDMA_READ; + wc->byte_len = be32_to_cpu(cqe->byte_cnt); + break; + case MLX4_OPCODE_ATOMIC_CS: + wc->opcode = IB_WC_COMP_SWAP; + wc->byte_len = 8; + break; + case MLX4_OPCODE_ATOMIC_FA: + wc->opcode = IB_WC_FETCH_ADD; + wc->byte_len = 8; + break; + case MLX4_OPCODE_MASKED_ATOMIC_CS: + wc->opcode = IB_WC_MASKED_COMP_SWAP; + wc->byte_len = 8; + break; + case MLX4_OPCODE_MASKED_ATOMIC_FA: + wc->opcode = IB_WC_MASKED_FETCH_ADD; + wc->byte_len = 8; + break; + case MLX4_OPCODE_BIND_MW: + wc->opcode = IB_WC_BIND_MW; + break; + case MLX4_OPCODE_LSO: + wc->opcode = IB_WC_LSO; + break; + case MLX4_OPCODE_FMR: + wc->opcode = IB_WC_FAST_REG_MR; + break; + case MLX4_OPCODE_LOCAL_INVAL: + wc->opcode = IB_WC_LOCAL_INV; + break; + } + } else { + wc->byte_len = be32_to_cpu(cqe->byte_cnt); + + switch (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) { + case MLX4_RECV_OPCODE_RDMA_WRITE_IMM: + wc->opcode = IB_WC_RECV_RDMA_WITH_IMM; + wc->wc_flags = IB_WC_WITH_IMM; + wc->ex.imm_data = cqe->immed_rss_invalid; + break; + case MLX4_RECV_OPCODE_SEND_INVAL: + wc->opcode = IB_WC_RECV; + wc->wc_flags = IB_WC_WITH_INVALIDATE; + wc->ex.invalidate_rkey = be32_to_cpu(cqe->immed_rss_invalid); + break; + case MLX4_RECV_OPCODE_SEND: + wc->opcode = IB_WC_RECV; + wc->wc_flags = 0; + break; + case MLX4_RECV_OPCODE_SEND_IMM: + wc->opcode = IB_WC_RECV; + wc->wc_flags = IB_WC_WITH_IMM; + wc->ex.imm_data = cqe->immed_rss_invalid; + break; + } + + is_eth = (rdma_port_get_link_layer(wc->qp->device, + (*cur_qp)->port) == + IB_LINK_LAYER_ETHERNET); + if (mlx4_is_mfunc(to_mdev(cq->ibcq.device)->dev)) { + if ((*cur_qp)->mlx4_ib_qp_type & + (MLX4_IB_QPT_PROXY_SMI_OWNER | + MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_GSI)) + return use_tunnel_data(*cur_qp, cq, wc, tail, + cqe, is_eth); + } + + wc->slid = be16_to_cpu(cqe->rlid); + g_mlpath_rqpn = be32_to_cpu(cqe->g_mlpath_rqpn); + wc->src_qp = g_mlpath_rqpn & 0xffffff; + wc->dlid_path_bits = (g_mlpath_rqpn >> 24) & 0x7f; + wc->wc_flags |= g_mlpath_rqpn & 0x80000000 ? IB_WC_GRH : 0; + wc->pkey_index = be32_to_cpu(cqe->immed_rss_invalid) & 0x7f; + wc->wc_flags |= mlx4_ib_ipoib_csum_ok(cqe->status, + cqe->checksum) ? IB_WC_IP_CSUM_OK : 0; + if (is_eth) { + wc->sl = be16_to_cpu(cqe->sl_vid) >> 13; + if (be32_to_cpu(cqe->vlan_my_qpn) & + MLX4_CQE_VLAN_PRESENT_MASK) { + wc->vlan_id = be16_to_cpu(cqe->sl_vid) & + MLX4_CQE_VID_MASK; + } else { + wc->vlan_id = 0xffff; + } + memcpy(wc->smac, cqe->smac, ETH_ALEN); + wc->wc_flags |= (IB_WC_WITH_VLAN | IB_WC_WITH_SMAC); + } else { + wc->sl = be16_to_cpu(cqe->sl_vid) >> 12; + wc->vlan_id = 0xffff; + } + } + + return 0; +} + +int mlx4_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc) +{ + struct mlx4_ib_cq *cq = to_mcq(ibcq); + struct mlx4_ib_qp *cur_qp = NULL; + unsigned long flags; + int npolled; + int err = 0; + + spin_lock_irqsave(&cq->lock, flags); + + for (npolled = 0; npolled < num_entries; ++npolled) { + err = mlx4_ib_poll_one(cq, &cur_qp, wc + npolled); + if (err) + break; + } + + mlx4_cq_set_ci(&cq->mcq); + + spin_unlock_irqrestore(&cq->lock, flags); + + if (err == 0 || err == -EAGAIN) + return npolled; + else + return err; +} + +int mlx4_ib_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags) +{ + mlx4_cq_arm(&to_mcq(ibcq)->mcq, + (flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED ? + MLX4_CQ_DB_REQ_NOT_SOL : MLX4_CQ_DB_REQ_NOT, + to_mdev(ibcq->device)->uar_map, + MLX4_GET_DOORBELL_LOCK(&to_mdev(ibcq->device)->uar_lock)); + + return 0; +} + +void __mlx4_ib_cq_clean(struct mlx4_ib_cq *cq, u32 qpn, struct mlx4_ib_srq *srq) +{ + u32 prod_index; + int nfreed = 0; + struct mlx4_cqe *cqe, *dest; + u8 owner_bit; + int cqe_inc = cq->buf.entry_size == 64 ? 1 : 0; + + /* + * First we need to find the current producer index, so we + * know where to start cleaning from. It doesn't matter if HW + * adds new entries after this loop -- the QP we're worried + * about is already in RESET, so the new entries won't come + * from our QP and therefore don't need to be checked. + */ + for (prod_index = cq->mcq.cons_index; get_sw_cqe(cq, prod_index); ++prod_index) + if (prod_index == cq->mcq.cons_index + cq->ibcq.cqe) + break; + + /* + * Now sweep backwards through the CQ, removing CQ entries + * that match our QP by copying older entries on top of them. + */ + while ((int) --prod_index - (int) cq->mcq.cons_index >= 0) { + cqe = get_cqe(cq, prod_index & cq->ibcq.cqe); + cqe += cqe_inc; + + if ((be32_to_cpu(cqe->vlan_my_qpn) & MLX4_CQE_QPN_MASK) == qpn) { + if (srq && !(cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK)) + mlx4_ib_free_srq_wqe(srq, be16_to_cpu(cqe->wqe_index)); + ++nfreed; + } else if (nfreed) { + dest = get_cqe(cq, (prod_index + nfreed) & cq->ibcq.cqe); + dest += cqe_inc; + + owner_bit = dest->owner_sr_opcode & MLX4_CQE_OWNER_MASK; + memcpy(dest, cqe, sizeof *cqe); + dest->owner_sr_opcode = owner_bit | + (dest->owner_sr_opcode & ~MLX4_CQE_OWNER_MASK); + } + } + + if (nfreed) { + cq->mcq.cons_index += nfreed; + /* + * Make sure update of buffer contents is done before + * updating consumer index. + */ + wmb(); + mlx4_cq_set_ci(&cq->mcq); + } +} + +void mlx4_ib_cq_clean(struct mlx4_ib_cq *cq, u32 qpn, struct mlx4_ib_srq *srq) +{ + spin_lock_irq(&cq->lock); + __mlx4_ib_cq_clean(cq, qpn, srq); + spin_unlock_irq(&cq->lock); +} diff --git a/drivers/infiniband/hw/mlx4/doorbell.c b/drivers/infiniband/hw/mlx4/doorbell.c new file mode 100644 index 0000000..c517409 --- /dev/null +++ b/drivers/infiniband/hw/mlx4/doorbell.c @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include + +#include "mlx4_ib.h" + +struct mlx4_ib_user_db_page { + struct list_head list; + struct ib_umem *umem; + unsigned long user_virt; + int refcnt; +}; + +int mlx4_ib_db_map_user(struct mlx4_ib_ucontext *context, unsigned long virt, + struct mlx4_db *db) +{ + struct mlx4_ib_user_db_page *page; + int err = 0; + + mutex_lock(&context->db_page_mutex); + + list_for_each_entry(page, &context->db_page_list, list) + if (page->user_virt == (virt & PAGE_MASK)) + goto found; + + page = kmalloc(sizeof *page, GFP_KERNEL); + if (!page) { + err = -ENOMEM; + goto out; + } + + page->user_virt = (virt & PAGE_MASK); + page->refcnt = 0; + page->umem = ib_umem_get(&context->ibucontext, virt & PAGE_MASK, + PAGE_SIZE, 0, 0); + if (IS_ERR(page->umem)) { + err = PTR_ERR(page->umem); + kfree(page); + goto out; + } + + list_add(&page->list, &context->db_page_list); + +found: + db->dma = sg_dma_address(page->umem->sg_head.sgl) + (virt & ~PAGE_MASK); + db->u.user_page = page; + ++page->refcnt; + +out: + mutex_unlock(&context->db_page_mutex); + + return err; +} + +void mlx4_ib_db_unmap_user(struct mlx4_ib_ucontext *context, struct mlx4_db *db) +{ + mutex_lock(&context->db_page_mutex); + + if (!--db->u.user_page->refcnt) { + list_del(&db->u.user_page->list); + ib_umem_release(db->u.user_page->umem); + kfree(db->u.user_page); + } + + mutex_unlock(&context->db_page_mutex); +} diff --git a/drivers/infiniband/hw/mlx4/mad.c b/drivers/infiniband/hw/mlx4/mad.c new file mode 100644 index 0000000..82a7dd8 --- /dev/null +++ b/drivers/infiniband/hw/mlx4/mad.c @@ -0,0 +1,2163 @@ +/* + * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "mlx4_ib.h" + +enum { + MLX4_IB_VENDOR_CLASS1 = 0x9, + MLX4_IB_VENDOR_CLASS2 = 0xa +}; + +#define MLX4_TUN_SEND_WRID_SHIFT 34 +#define MLX4_TUN_QPN_SHIFT 32 +#define MLX4_TUN_WRID_RECV (((u64) 1) << MLX4_TUN_SEND_WRID_SHIFT) +#define MLX4_TUN_SET_WRID_QPN(a) (((u64) ((a) & 0x3)) << MLX4_TUN_QPN_SHIFT) + +#define MLX4_TUN_IS_RECV(a) (((a) >> MLX4_TUN_SEND_WRID_SHIFT) & 0x1) +#define MLX4_TUN_WRID_QPN(a) (((a) >> MLX4_TUN_QPN_SHIFT) & 0x3) + + /* Port mgmt change event handling */ + +#define GET_BLK_PTR_FROM_EQE(eqe) be32_to_cpu(eqe->event.port_mgmt_change.params.tbl_change_info.block_ptr) +#define GET_MASK_FROM_EQE(eqe) be32_to_cpu(eqe->event.port_mgmt_change.params.tbl_change_info.tbl_entries_mask) +#define NUM_IDX_IN_PKEY_TBL_BLK 32 +#define GUID_TBL_ENTRY_SIZE 8 /* size in bytes */ +#define GUID_TBL_BLK_NUM_ENTRIES 8 +#define GUID_TBL_BLK_SIZE (GUID_TBL_ENTRY_SIZE * GUID_TBL_BLK_NUM_ENTRIES) + +struct mlx4_mad_rcv_buf { + struct ib_grh grh; + u8 payload[256]; +} __packed; + +struct mlx4_mad_snd_buf { + u8 payload[256]; +} __packed; + +struct mlx4_tunnel_mad { + struct ib_grh grh; + struct mlx4_ib_tunnel_header hdr; + struct ib_mad mad; +} __packed; + +struct mlx4_rcv_tunnel_mad { + struct mlx4_rcv_tunnel_hdr hdr; + struct ib_grh grh; + struct ib_mad mad; +} __packed; + +static void handle_client_rereg_event(struct mlx4_ib_dev *dev, u8 port_num); +static void handle_lid_change_event(struct mlx4_ib_dev *dev, u8 port_num); +static void __propagate_pkey_ev(struct mlx4_ib_dev *dev, int port_num, + int block, u32 change_bitmap); + +__be64 mlx4_ib_gen_node_guid(void) +{ +#define NODE_GUID_HI ((u64) (((u64)IB_OPENIB_OUI) << 40)) + return cpu_to_be64(NODE_GUID_HI | prandom_u32()); +} + +__be64 mlx4_ib_get_new_demux_tid(struct mlx4_ib_demux_ctx *ctx) +{ + return cpu_to_be64(atomic_inc_return(&ctx->tid)) | + cpu_to_be64(0xff00000000000000LL); +} + +int mlx4_MAD_IFC(struct mlx4_ib_dev *dev, int mad_ifc_flags, + int port, struct ib_wc *in_wc, struct ib_grh *in_grh, + void *in_mad, void *response_mad) +{ + struct mlx4_cmd_mailbox *inmailbox, *outmailbox; + void *inbox; + int err; + u32 in_modifier = port; + u8 op_modifier = 0; + + inmailbox = mlx4_alloc_cmd_mailbox(dev->dev); + if (IS_ERR(inmailbox)) + return PTR_ERR(inmailbox); + inbox = inmailbox->buf; + + outmailbox = mlx4_alloc_cmd_mailbox(dev->dev); + if (IS_ERR(outmailbox)) { + mlx4_free_cmd_mailbox(dev->dev, inmailbox); + return PTR_ERR(outmailbox); + } + + memcpy(inbox, in_mad, 256); + + /* + * Key check traps can't be generated unless we have in_wc to + * tell us where to send the trap. + */ + if ((mad_ifc_flags & MLX4_MAD_IFC_IGNORE_MKEY) || !in_wc) + op_modifier |= 0x1; + if ((mad_ifc_flags & MLX4_MAD_IFC_IGNORE_BKEY) || !in_wc) + op_modifier |= 0x2; + if (mlx4_is_mfunc(dev->dev) && + (mad_ifc_flags & MLX4_MAD_IFC_NET_VIEW || in_wc)) + op_modifier |= 0x8; + + if (in_wc) { + struct { + __be32 my_qpn; + u32 reserved1; + __be32 rqpn; + u8 sl; + u8 g_path; + u16 reserved2[2]; + __be16 pkey; + u32 reserved3[11]; + u8 grh[40]; + } *ext_info; + + memset(inbox + 256, 0, 256); + ext_info = inbox + 256; + + ext_info->my_qpn = cpu_to_be32(in_wc->qp->qp_num); + ext_info->rqpn = cpu_to_be32(in_wc->src_qp); + ext_info->sl = in_wc->sl << 4; + ext_info->g_path = in_wc->dlid_path_bits | + (in_wc->wc_flags & IB_WC_GRH ? 0x80 : 0); + ext_info->pkey = cpu_to_be16(in_wc->pkey_index); + + if (in_grh) + memcpy(ext_info->grh, in_grh, 40); + + op_modifier |= 0x4; + + in_modifier |= in_wc->slid << 16; + } + + err = mlx4_cmd_box(dev->dev, inmailbox->dma, outmailbox->dma, in_modifier, + mlx4_is_master(dev->dev) ? (op_modifier & ~0x8) : op_modifier, + MLX4_CMD_MAD_IFC, MLX4_CMD_TIME_CLASS_C, + (op_modifier & 0x8) ? MLX4_CMD_NATIVE : MLX4_CMD_WRAPPED); + + if (!err) + memcpy(response_mad, outmailbox->buf, 256); + + mlx4_free_cmd_mailbox(dev->dev, inmailbox); + mlx4_free_cmd_mailbox(dev->dev, outmailbox); + + return err; +} + +static void update_sm_ah(struct mlx4_ib_dev *dev, u8 port_num, u16 lid, u8 sl) +{ + struct ib_ah *new_ah; + struct ib_ah_attr ah_attr; + unsigned long flags; + + if (!dev->send_agent[port_num - 1][0]) + return; + + memset(&ah_attr, 0, sizeof ah_attr); + ah_attr.dlid = lid; + ah_attr.sl = sl; + ah_attr.port_num = port_num; + + new_ah = ib_create_ah(dev->send_agent[port_num - 1][0]->qp->pd, + &ah_attr); + if (IS_ERR(new_ah)) + return; + + spin_lock_irqsave(&dev->sm_lock, flags); + if (dev->sm_ah[port_num - 1]) + ib_destroy_ah(dev->sm_ah[port_num - 1]); + dev->sm_ah[port_num - 1] = new_ah; + spin_unlock_irqrestore(&dev->sm_lock, flags); +} + +/* + * Snoop SM MADs for port info, GUID info, and P_Key table sets, so we can + * synthesize LID change, Client-Rereg, GID change, and P_Key change events. + */ +static void smp_snoop(struct ib_device *ibdev, u8 port_num, struct ib_mad *mad, + u16 prev_lid) +{ + struct ib_port_info *pinfo; + u16 lid; + __be16 *base; + u32 bn, pkey_change_bitmap; + int i; + + + struct mlx4_ib_dev *dev = to_mdev(ibdev); + if ((mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED || + mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) && + mad->mad_hdr.method == IB_MGMT_METHOD_SET) + switch (mad->mad_hdr.attr_id) { + case IB_SMP_ATTR_PORT_INFO: + pinfo = (struct ib_port_info *) ((struct ib_smp *) mad)->data; + lid = be16_to_cpu(pinfo->lid); + + update_sm_ah(dev, port_num, + be16_to_cpu(pinfo->sm_lid), + pinfo->neighbormtu_mastersmsl & 0xf); + + if (pinfo->clientrereg_resv_subnetto & 0x80) + handle_client_rereg_event(dev, port_num); + + if (prev_lid != lid) + handle_lid_change_event(dev, port_num); + break; + + case IB_SMP_ATTR_PKEY_TABLE: + if (!mlx4_is_mfunc(dev->dev)) { + mlx4_ib_dispatch_event(dev, port_num, + IB_EVENT_PKEY_CHANGE); + break; + } + + /* at this point, we are running in the master. + * Slaves do not receive SMPs. + */ + bn = be32_to_cpu(((struct ib_smp *)mad)->attr_mod) & 0xFFFF; + base = (__be16 *) &(((struct ib_smp *)mad)->data[0]); + pkey_change_bitmap = 0; + for (i = 0; i < 32; i++) { + pr_debug("PKEY[%d] = x%x\n", + i + bn*32, be16_to_cpu(base[i])); + if (be16_to_cpu(base[i]) != + dev->pkeys.phys_pkey_cache[port_num - 1][i + bn*32]) { + pkey_change_bitmap |= (1 << i); + dev->pkeys.phys_pkey_cache[port_num - 1][i + bn*32] = + be16_to_cpu(base[i]); + } + } + pr_debug("PKEY Change event: port=%d, " + "block=0x%x, change_bitmap=0x%x\n", + port_num, bn, pkey_change_bitmap); + + if (pkey_change_bitmap) { + mlx4_ib_dispatch_event(dev, port_num, + IB_EVENT_PKEY_CHANGE); + if (!dev->sriov.is_going_down) + __propagate_pkey_ev(dev, port_num, bn, + pkey_change_bitmap); + } + break; + + case IB_SMP_ATTR_GUID_INFO: + /* paravirtualized master's guid is guid 0 -- does not change */ + if (!mlx4_is_master(dev->dev)) + mlx4_ib_dispatch_event(dev, port_num, + IB_EVENT_GID_CHANGE); + /*if master, notify relevant slaves*/ + if (mlx4_is_master(dev->dev) && + !dev->sriov.is_going_down) { + bn = be32_to_cpu(((struct ib_smp *)mad)->attr_mod); + mlx4_ib_update_cache_on_guid_change(dev, bn, port_num, + (u8 *)(&((struct ib_smp *)mad)->data)); + mlx4_ib_notify_slaves_on_guid_change(dev, bn, port_num, + (u8 *)(&((struct ib_smp *)mad)->data)); + } + break; + + default: + break; + } +} + +static void __propagate_pkey_ev(struct mlx4_ib_dev *dev, int port_num, + int block, u32 change_bitmap) +{ + int i, ix, slave, err; + int have_event = 0; + + for (slave = 0; slave < dev->dev->caps.sqp_demux; slave++) { + if (slave == mlx4_master_func_num(dev->dev)) + continue; + if (!mlx4_is_slave_active(dev->dev, slave)) + continue; + + have_event = 0; + for (i = 0; i < 32; i++) { + if (!(change_bitmap & (1 << i))) + continue; + for (ix = 0; + ix < dev->dev->caps.pkey_table_len[port_num]; ix++) { + if (dev->pkeys.virt2phys_pkey[slave][port_num - 1] + [ix] == i + 32 * block) { + err = mlx4_gen_pkey_eqe(dev->dev, slave, port_num); + pr_debug("propagate_pkey_ev: slave %d," + " port %d, ix %d (%d)\n", + slave, port_num, ix, err); + have_event = 1; + break; + } + } + if (have_event) + break; + } + } +} + +static void node_desc_override(struct ib_device *dev, + struct ib_mad *mad) +{ + unsigned long flags; + + if ((mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED || + mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) && + mad->mad_hdr.method == IB_MGMT_METHOD_GET_RESP && + mad->mad_hdr.attr_id == IB_SMP_ATTR_NODE_DESC) { + spin_lock_irqsave(&to_mdev(dev)->sm_lock, flags); + memcpy(((struct ib_smp *) mad)->data, dev->node_desc, 64); + spin_unlock_irqrestore(&to_mdev(dev)->sm_lock, flags); + } +} + +static void forward_trap(struct mlx4_ib_dev *dev, u8 port_num, struct ib_mad *mad) +{ + int qpn = mad->mad_hdr.mgmt_class != IB_MGMT_CLASS_SUBN_LID_ROUTED; + struct ib_mad_send_buf *send_buf; + struct ib_mad_agent *agent = dev->send_agent[port_num - 1][qpn]; + int ret; + unsigned long flags; + + if (agent) { + send_buf = ib_create_send_mad(agent, qpn, 0, 0, IB_MGMT_MAD_HDR, + IB_MGMT_MAD_DATA, GFP_ATOMIC); + if (IS_ERR(send_buf)) + return; + /* + * We rely here on the fact that MLX QPs don't use the + * address handle after the send is posted (this is + * wrong following the IB spec strictly, but we know + * it's OK for our devices). + */ + spin_lock_irqsave(&dev->sm_lock, flags); + memcpy(send_buf->mad, mad, sizeof *mad); + if ((send_buf->ah = dev->sm_ah[port_num - 1])) + ret = ib_post_send_mad(send_buf, NULL); + else + ret = -EINVAL; + spin_unlock_irqrestore(&dev->sm_lock, flags); + + if (ret) + ib_free_send_mad(send_buf); + } +} + +static int mlx4_ib_demux_sa_handler(struct ib_device *ibdev, int port, int slave, + struct ib_sa_mad *sa_mad) +{ + int ret = 0; + + /* dispatch to different sa handlers */ + switch (be16_to_cpu(sa_mad->mad_hdr.attr_id)) { + case IB_SA_ATTR_MC_MEMBER_REC: + ret = mlx4_ib_mcg_demux_handler(ibdev, port, slave, sa_mad); + break; + default: + break; + } + return ret; +} + +int mlx4_ib_find_real_gid(struct ib_device *ibdev, u8 port, __be64 guid) +{ + struct mlx4_ib_dev *dev = to_mdev(ibdev); + int i; + + for (i = 0; i < dev->dev->caps.sqp_demux; i++) { + if (dev->sriov.demux[port - 1].guid_cache[i] == guid) + return i; + } + return -1; +} + + +static int find_slave_port_pkey_ix(struct mlx4_ib_dev *dev, int slave, + u8 port, u16 pkey, u16 *ix) +{ + int i, ret; + u8 unassigned_pkey_ix, pkey_ix, partial_ix = 0xFF; + u16 slot_pkey; + + if (slave == mlx4_master_func_num(dev->dev)) + return ib_find_cached_pkey(&dev->ib_dev, port, pkey, ix); + + unassigned_pkey_ix = dev->dev->phys_caps.pkey_phys_table_len[port] - 1; + + for (i = 0; i < dev->dev->caps.pkey_table_len[port]; i++) { + if (dev->pkeys.virt2phys_pkey[slave][port - 1][i] == unassigned_pkey_ix) + continue; + + pkey_ix = dev->pkeys.virt2phys_pkey[slave][port - 1][i]; + + ret = ib_get_cached_pkey(&dev->ib_dev, port, pkey_ix, &slot_pkey); + if (ret) + continue; + if ((slot_pkey & 0x7FFF) == (pkey & 0x7FFF)) { + if (slot_pkey & 0x8000) { + *ix = (u16) pkey_ix; + return 0; + } else { + /* take first partial pkey index found */ + if (partial_ix == 0xFF) + partial_ix = pkey_ix; + } + } + } + + if (partial_ix < 0xFF) { + *ix = (u16) partial_ix; + return 0; + } + + return -EINVAL; +} + +int mlx4_ib_send_to_slave(struct mlx4_ib_dev *dev, int slave, u8 port, + enum ib_qp_type dest_qpt, struct ib_wc *wc, + struct ib_grh *grh, struct ib_mad *mad) +{ + struct ib_sge list; + struct ib_send_wr wr, *bad_wr; + struct mlx4_ib_demux_pv_ctx *tun_ctx; + struct mlx4_ib_demux_pv_qp *tun_qp; + struct mlx4_rcv_tunnel_mad *tun_mad; + struct ib_ah_attr attr; + struct ib_ah *ah; + struct ib_qp *src_qp = NULL; + unsigned tun_tx_ix = 0; + int dqpn; + int ret = 0; + u16 tun_pkey_ix; + u16 cached_pkey; + u8 is_eth = dev->dev->caps.port_type[port] == MLX4_PORT_TYPE_ETH; + + if (dest_qpt > IB_QPT_GSI) + return -EINVAL; + + tun_ctx = dev->sriov.demux[port-1].tun[slave]; + + /* check if proxy qp created */ + if (!tun_ctx || tun_ctx->state != DEMUX_PV_STATE_ACTIVE) + return -EAGAIN; + + if (!dest_qpt) + tun_qp = &tun_ctx->qp[0]; + else + tun_qp = &tun_ctx->qp[1]; + + /* compute P_Key index to put in tunnel header for slave */ + if (dest_qpt) { + u16 pkey_ix; + ret = ib_get_cached_pkey(&dev->ib_dev, port, wc->pkey_index, &cached_pkey); + if (ret) + return -EINVAL; + + ret = find_slave_port_pkey_ix(dev, slave, port, cached_pkey, &pkey_ix); + if (ret) + return -EINVAL; + tun_pkey_ix = pkey_ix; + } else + tun_pkey_ix = dev->pkeys.virt2phys_pkey[slave][port - 1][0]; + + dqpn = dev->dev->phys_caps.base_proxy_sqpn + 8 * slave + port + (dest_qpt * 2) - 1; + + /* get tunnel tx data buf for slave */ + src_qp = tun_qp->qp; + + /* create ah. Just need an empty one with the port num for the post send. + * The driver will set the force loopback bit in post_send */ + memset(&attr, 0, sizeof attr); + attr.port_num = port; + if (is_eth) { + memcpy(&attr.grh.dgid.raw[0], &grh->dgid.raw[0], 16); + attr.ah_flags = IB_AH_GRH; + } + ah = ib_create_ah(tun_ctx->pd, &attr); + if (IS_ERR(ah)) + return -ENOMEM; + + /* allocate tunnel tx buf after pass failure returns */ + spin_lock(&tun_qp->tx_lock); + if (tun_qp->tx_ix_head - tun_qp->tx_ix_tail >= + (MLX4_NUM_TUNNEL_BUFS - 1)) + ret = -EAGAIN; + else + tun_tx_ix = (++tun_qp->tx_ix_head) & (MLX4_NUM_TUNNEL_BUFS - 1); + spin_unlock(&tun_qp->tx_lock); + if (ret) + goto out; + + tun_mad = (struct mlx4_rcv_tunnel_mad *) (tun_qp->tx_ring[tun_tx_ix].buf.addr); + if (tun_qp->tx_ring[tun_tx_ix].ah) + ib_destroy_ah(tun_qp->tx_ring[tun_tx_ix].ah); + tun_qp->tx_ring[tun_tx_ix].ah = ah; + ib_dma_sync_single_for_cpu(&dev->ib_dev, + tun_qp->tx_ring[tun_tx_ix].buf.map, + sizeof (struct mlx4_rcv_tunnel_mad), + DMA_TO_DEVICE); + + /* copy over to tunnel buffer */ + if (grh) + memcpy(&tun_mad->grh, grh, sizeof *grh); + memcpy(&tun_mad->mad, mad, sizeof *mad); + + /* adjust tunnel data */ + tun_mad->hdr.pkey_index = cpu_to_be16(tun_pkey_ix); + tun_mad->hdr.flags_src_qp = cpu_to_be32(wc->src_qp & 0xFFFFFF); + tun_mad->hdr.g_ml_path = (grh && (wc->wc_flags & IB_WC_GRH)) ? 0x80 : 0; + + if (is_eth) { + u16 vlan = 0; + if (mlx4_get_slave_default_vlan(dev->dev, port, slave, &vlan, + NULL)) { + /* VST mode */ + if (vlan != wc->vlan_id) + /* Packet vlan is not the VST-assigned vlan. + * Drop the packet. + */ + goto out; + else + /* Remove the vlan tag before forwarding + * the packet to the VF. + */ + vlan = 0xffff; + } else { + vlan = wc->vlan_id; + } + + tun_mad->hdr.sl_vid = cpu_to_be16(vlan); + memcpy((char *)&tun_mad->hdr.mac_31_0, &(wc->smac[0]), 4); + memcpy((char *)&tun_mad->hdr.slid_mac_47_32, &(wc->smac[4]), 2); + } else { + tun_mad->hdr.sl_vid = cpu_to_be16(((u16)(wc->sl)) << 12); + tun_mad->hdr.slid_mac_47_32 = cpu_to_be16(wc->slid); + } + + ib_dma_sync_single_for_device(&dev->ib_dev, + tun_qp->tx_ring[tun_tx_ix].buf.map, + sizeof (struct mlx4_rcv_tunnel_mad), + DMA_TO_DEVICE); + + list.addr = tun_qp->tx_ring[tun_tx_ix].buf.map; + list.length = sizeof (struct mlx4_rcv_tunnel_mad); + list.lkey = tun_ctx->mr->lkey; + + wr.wr.ud.ah = ah; + wr.wr.ud.port_num = port; + wr.wr.ud.remote_qkey = IB_QP_SET_QKEY; + wr.wr.ud.remote_qpn = dqpn; + wr.next = NULL; + wr.wr_id = ((u64) tun_tx_ix) | MLX4_TUN_SET_WRID_QPN(dest_qpt); + wr.sg_list = &list; + wr.num_sge = 1; + wr.opcode = IB_WR_SEND; + wr.send_flags = IB_SEND_SIGNALED; + + ret = ib_post_send(src_qp, &wr, &bad_wr); +out: + if (ret) + ib_destroy_ah(ah); + return ret; +} + +static int mlx4_ib_demux_mad(struct ib_device *ibdev, u8 port, + struct ib_wc *wc, struct ib_grh *grh, + struct ib_mad *mad) +{ + struct mlx4_ib_dev *dev = to_mdev(ibdev); + int err; + int slave; + u8 *slave_id; + int is_eth = 0; + + if (rdma_port_get_link_layer(ibdev, port) == IB_LINK_LAYER_INFINIBAND) + is_eth = 0; + else + is_eth = 1; + + if (is_eth) { + if (!(wc->wc_flags & IB_WC_GRH)) { + mlx4_ib_warn(ibdev, "RoCE grh not present.\n"); + return -EINVAL; + } + if (mad->mad_hdr.mgmt_class != IB_MGMT_CLASS_CM) { + mlx4_ib_warn(ibdev, "RoCE mgmt class is not CM\n"); + return -EINVAL; + } + if (mlx4_get_slave_from_roce_gid(dev->dev, port, grh->dgid.raw, &slave)) { + mlx4_ib_warn(ibdev, "failed matching grh\n"); + return -ENOENT; + } + if (slave >= dev->dev->caps.sqp_demux) { + mlx4_ib_warn(ibdev, "slave id: %d is bigger than allowed:%d\n", + slave, dev->dev->caps.sqp_demux); + return -ENOENT; + } + + if (mlx4_ib_demux_cm_handler(ibdev, port, NULL, mad)) + return 0; + + err = mlx4_ib_send_to_slave(dev, slave, port, wc->qp->qp_type, wc, grh, mad); + if (err) + pr_debug("failed sending to slave %d via tunnel qp (%d)\n", + slave, err); + return 0; + } + + /* Initially assume that this mad is for us */ + slave = mlx4_master_func_num(dev->dev); + + /* See if the slave id is encoded in a response mad */ + if (mad->mad_hdr.method & 0x80) { + slave_id = (u8 *) &mad->mad_hdr.tid; + slave = *slave_id; + if (slave != 255) /*255 indicates the dom0*/ + *slave_id = 0; /* remap tid */ + } + + /* If a grh is present, we demux according to it */ + if (wc->wc_flags & IB_WC_GRH) { + slave = mlx4_ib_find_real_gid(ibdev, port, grh->dgid.global.interface_id); + if (slave < 0) { + mlx4_ib_warn(ibdev, "failed matching grh\n"); + return -ENOENT; + } + } + /* Class-specific handling */ + switch (mad->mad_hdr.mgmt_class) { + case IB_MGMT_CLASS_SUBN_LID_ROUTED: + case IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE: + /* 255 indicates the dom0 */ + if (slave != 255 && slave != mlx4_master_func_num(dev->dev)) { + if (!mlx4_vf_smi_enabled(dev->dev, slave, port)) + return -EPERM; + /* for a VF. drop unsolicited MADs */ + if (!(mad->mad_hdr.method & IB_MGMT_METHOD_RESP)) { + mlx4_ib_warn(ibdev, "demux QP0. rejecting unsolicited mad for slave %d class 0x%x, method 0x%x\n", + slave, mad->mad_hdr.mgmt_class, + mad->mad_hdr.method); + return -EINVAL; + } + } + break; + case IB_MGMT_CLASS_SUBN_ADM: + if (mlx4_ib_demux_sa_handler(ibdev, port, slave, + (struct ib_sa_mad *) mad)) + return 0; + break; + case IB_MGMT_CLASS_CM: + if (mlx4_ib_demux_cm_handler(ibdev, port, &slave, mad)) + return 0; + break; + case IB_MGMT_CLASS_DEVICE_MGMT: + if (mad->mad_hdr.method != IB_MGMT_METHOD_GET_RESP) + return 0; + break; + default: + /* Drop unsupported classes for slaves in tunnel mode */ + if (slave != mlx4_master_func_num(dev->dev)) { + pr_debug("dropping unsupported ingress mad from class:%d " + "for slave:%d\n", mad->mad_hdr.mgmt_class, slave); + return 0; + } + } + /*make sure that no slave==255 was not handled yet.*/ + if (slave >= dev->dev->caps.sqp_demux) { + mlx4_ib_warn(ibdev, "slave id: %d is bigger than allowed:%d\n", + slave, dev->dev->caps.sqp_demux); + return -ENOENT; + } + + err = mlx4_ib_send_to_slave(dev, slave, port, wc->qp->qp_type, wc, grh, mad); + if (err) + pr_debug("failed sending to slave %d via tunnel qp (%d)\n", + slave, err); + return 0; +} + +static int ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, + struct ib_wc *in_wc, struct ib_grh *in_grh, + struct ib_mad *in_mad, struct ib_mad *out_mad) +{ + u16 slid, prev_lid = 0; + int err; + struct ib_port_attr pattr; + + if (in_wc && in_wc->qp->qp_num) { + pr_debug("received MAD: slid:%d sqpn:%d " + "dlid_bits:%d dqpn:%d wc_flags:0x%x, cls %x, mtd %x, atr %x\n", + in_wc->slid, in_wc->src_qp, + in_wc->dlid_path_bits, + in_wc->qp->qp_num, + in_wc->wc_flags, + in_mad->mad_hdr.mgmt_class, in_mad->mad_hdr.method, + be16_to_cpu(in_mad->mad_hdr.attr_id)); + if (in_wc->wc_flags & IB_WC_GRH) { + pr_debug("sgid_hi:0x%016llx sgid_lo:0x%016llx\n", + be64_to_cpu(in_grh->sgid.global.subnet_prefix), + be64_to_cpu(in_grh->sgid.global.interface_id)); + pr_debug("dgid_hi:0x%016llx dgid_lo:0x%016llx\n", + be64_to_cpu(in_grh->dgid.global.subnet_prefix), + be64_to_cpu(in_grh->dgid.global.interface_id)); + } + } + + slid = in_wc ? in_wc->slid : be16_to_cpu(IB_LID_PERMISSIVE); + + if (in_mad->mad_hdr.method == IB_MGMT_METHOD_TRAP && slid == 0) { + forward_trap(to_mdev(ibdev), port_num, in_mad); + return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED; + } + + if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED || + in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) { + if (in_mad->mad_hdr.method != IB_MGMT_METHOD_GET && + in_mad->mad_hdr.method != IB_MGMT_METHOD_SET && + in_mad->mad_hdr.method != IB_MGMT_METHOD_TRAP_REPRESS) + return IB_MAD_RESULT_SUCCESS; + + /* + * Don't process SMInfo queries -- the SMA can't handle them. + */ + if (in_mad->mad_hdr.attr_id == IB_SMP_ATTR_SM_INFO) + return IB_MAD_RESULT_SUCCESS; + } else if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_PERF_MGMT || + in_mad->mad_hdr.mgmt_class == MLX4_IB_VENDOR_CLASS1 || + in_mad->mad_hdr.mgmt_class == MLX4_IB_VENDOR_CLASS2 || + in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_CONG_MGMT) { + if (in_mad->mad_hdr.method != IB_MGMT_METHOD_GET && + in_mad->mad_hdr.method != IB_MGMT_METHOD_SET) + return IB_MAD_RESULT_SUCCESS; + } else + return IB_MAD_RESULT_SUCCESS; + + if ((in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED || + in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) && + in_mad->mad_hdr.method == IB_MGMT_METHOD_SET && + in_mad->mad_hdr.attr_id == IB_SMP_ATTR_PORT_INFO && + !ib_query_port(ibdev, port_num, &pattr)) + prev_lid = pattr.lid; + + err = mlx4_MAD_IFC(to_mdev(ibdev), + (mad_flags & IB_MAD_IGNORE_MKEY ? MLX4_MAD_IFC_IGNORE_MKEY : 0) | + (mad_flags & IB_MAD_IGNORE_BKEY ? MLX4_MAD_IFC_IGNORE_BKEY : 0) | + MLX4_MAD_IFC_NET_VIEW, + port_num, in_wc, in_grh, in_mad, out_mad); + if (err) + return IB_MAD_RESULT_FAILURE; + + if (!out_mad->mad_hdr.status) { + if (!(to_mdev(ibdev)->dev->caps.flags & MLX4_DEV_CAP_FLAG_PORT_MNG_CHG_EV)) + smp_snoop(ibdev, port_num, in_mad, prev_lid); + /* slaves get node desc from FW */ + if (!mlx4_is_slave(to_mdev(ibdev)->dev)) + node_desc_override(ibdev, out_mad); + } + + /* set return bit in status of directed route responses */ + if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) + out_mad->mad_hdr.status |= cpu_to_be16(1 << 15); + + if (in_mad->mad_hdr.method == IB_MGMT_METHOD_TRAP_REPRESS) + /* no response for trap repress */ + return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED; + + return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY; +} + +static void edit_counter(struct mlx4_counter *cnt, + struct ib_pma_portcounters *pma_cnt) +{ + pma_cnt->port_xmit_data = cpu_to_be32((be64_to_cpu(cnt->tx_bytes)>>2)); + pma_cnt->port_rcv_data = cpu_to_be32((be64_to_cpu(cnt->rx_bytes)>>2)); + pma_cnt->port_xmit_packets = cpu_to_be32(be64_to_cpu(cnt->tx_frames)); + pma_cnt->port_rcv_packets = cpu_to_be32(be64_to_cpu(cnt->rx_frames)); +} + +static int iboe_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, + struct ib_wc *in_wc, struct ib_grh *in_grh, + struct ib_mad *in_mad, struct ib_mad *out_mad) +{ + struct mlx4_cmd_mailbox *mailbox; + struct mlx4_ib_dev *dev = to_mdev(ibdev); + int err; + u32 inmod = dev->counters[port_num - 1] & 0xffff; + u8 mode; + + if (in_mad->mad_hdr.mgmt_class != IB_MGMT_CLASS_PERF_MGMT) + return -EINVAL; + + mailbox = mlx4_alloc_cmd_mailbox(dev->dev); + if (IS_ERR(mailbox)) + return IB_MAD_RESULT_FAILURE; + + err = mlx4_cmd_box(dev->dev, 0, mailbox->dma, inmod, 0, + MLX4_CMD_QUERY_IF_STAT, MLX4_CMD_TIME_CLASS_C, + MLX4_CMD_WRAPPED); + if (err) + err = IB_MAD_RESULT_FAILURE; + else { + memset(out_mad->data, 0, sizeof out_mad->data); + mode = ((struct mlx4_counter *)mailbox->buf)->counter_mode; + switch (mode & 0xf) { + case 0: + edit_counter(mailbox->buf, + (void *)(out_mad->data + 40)); + err = IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY; + break; + default: + err = IB_MAD_RESULT_FAILURE; + } + } + + mlx4_free_cmd_mailbox(dev->dev, mailbox); + + return err; +} + +int mlx4_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, + struct ib_wc *in_wc, struct ib_grh *in_grh, + struct ib_mad *in_mad, struct ib_mad *out_mad) +{ + switch (rdma_port_get_link_layer(ibdev, port_num)) { + case IB_LINK_LAYER_INFINIBAND: + return ib_process_mad(ibdev, mad_flags, port_num, in_wc, + in_grh, in_mad, out_mad); + case IB_LINK_LAYER_ETHERNET: + return iboe_process_mad(ibdev, mad_flags, port_num, in_wc, + in_grh, in_mad, out_mad); + default: + return -EINVAL; + } +} + +static void send_handler(struct ib_mad_agent *agent, + struct ib_mad_send_wc *mad_send_wc) +{ + if (mad_send_wc->send_buf->context[0]) + ib_destroy_ah(mad_send_wc->send_buf->context[0]); + ib_free_send_mad(mad_send_wc->send_buf); +} + +int mlx4_ib_mad_init(struct mlx4_ib_dev *dev) +{ + struct ib_mad_agent *agent; + int p, q; + int ret; + enum rdma_link_layer ll; + + for (p = 0; p < dev->num_ports; ++p) { + ll = rdma_port_get_link_layer(&dev->ib_dev, p + 1); + for (q = 0; q <= 1; ++q) { + if (ll == IB_LINK_LAYER_INFINIBAND) { + agent = ib_register_mad_agent(&dev->ib_dev, p + 1, + q ? IB_QPT_GSI : IB_QPT_SMI, + NULL, 0, send_handler, + NULL, NULL, 0); + if (IS_ERR(agent)) { + ret = PTR_ERR(agent); + goto err; + } + dev->send_agent[p][q] = agent; + } else + dev->send_agent[p][q] = NULL; + } + } + + return 0; + +err: + for (p = 0; p < dev->num_ports; ++p) + for (q = 0; q <= 1; ++q) + if (dev->send_agent[p][q]) + ib_unregister_mad_agent(dev->send_agent[p][q]); + + return ret; +} + +void mlx4_ib_mad_cleanup(struct mlx4_ib_dev *dev) +{ + struct ib_mad_agent *agent; + int p, q; + + for (p = 0; p < dev->num_ports; ++p) { + for (q = 0; q <= 1; ++q) { + agent = dev->send_agent[p][q]; + if (agent) { + dev->send_agent[p][q] = NULL; + ib_unregister_mad_agent(agent); + } + } + + if (dev->sm_ah[p]) + ib_destroy_ah(dev->sm_ah[p]); + } +} + +static void handle_lid_change_event(struct mlx4_ib_dev *dev, u8 port_num) +{ + mlx4_ib_dispatch_event(dev, port_num, IB_EVENT_LID_CHANGE); + + if (mlx4_is_master(dev->dev) && !dev->sriov.is_going_down) + mlx4_gen_slaves_port_mgt_ev(dev->dev, port_num, + MLX4_EQ_PORT_INFO_LID_CHANGE_MASK); +} + +static void handle_client_rereg_event(struct mlx4_ib_dev *dev, u8 port_num) +{ + /* re-configure the alias-guid and mcg's */ + if (mlx4_is_master(dev->dev)) { + mlx4_ib_invalidate_all_guid_record(dev, port_num); + + if (!dev->sriov.is_going_down) { + mlx4_ib_mcg_port_cleanup(&dev->sriov.demux[port_num - 1], 0); + mlx4_gen_slaves_port_mgt_ev(dev->dev, port_num, + MLX4_EQ_PORT_INFO_CLIENT_REREG_MASK); + } + } + mlx4_ib_dispatch_event(dev, port_num, IB_EVENT_CLIENT_REREGISTER); +} + +static void propagate_pkey_ev(struct mlx4_ib_dev *dev, int port_num, + struct mlx4_eqe *eqe) +{ + __propagate_pkey_ev(dev, port_num, GET_BLK_PTR_FROM_EQE(eqe), + GET_MASK_FROM_EQE(eqe)); +} + +static void handle_slaves_guid_change(struct mlx4_ib_dev *dev, u8 port_num, + u32 guid_tbl_blk_num, u32 change_bitmap) +{ + struct ib_smp *in_mad = NULL; + struct ib_smp *out_mad = NULL; + u16 i; + + if (!mlx4_is_mfunc(dev->dev) || !mlx4_is_master(dev->dev)) + return; + + in_mad = kmalloc(sizeof *in_mad, GFP_KERNEL); + out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); + if (!in_mad || !out_mad) { + mlx4_ib_warn(&dev->ib_dev, "failed to allocate memory for guid info mads\n"); + goto out; + } + + guid_tbl_blk_num *= 4; + + for (i = 0; i < 4; i++) { + if (change_bitmap && (!((change_bitmap >> (8 * i)) & 0xff))) + continue; + memset(in_mad, 0, sizeof *in_mad); + memset(out_mad, 0, sizeof *out_mad); + + in_mad->base_version = 1; + in_mad->mgmt_class = IB_MGMT_CLASS_SUBN_LID_ROUTED; + in_mad->class_version = 1; + in_mad->method = IB_MGMT_METHOD_GET; + in_mad->attr_id = IB_SMP_ATTR_GUID_INFO; + in_mad->attr_mod = cpu_to_be32(guid_tbl_blk_num + i); + + if (mlx4_MAD_IFC(dev, + MLX4_MAD_IFC_IGNORE_KEYS | MLX4_MAD_IFC_NET_VIEW, + port_num, NULL, NULL, in_mad, out_mad)) { + mlx4_ib_warn(&dev->ib_dev, "Failed in get GUID INFO MAD_IFC\n"); + goto out; + } + + mlx4_ib_update_cache_on_guid_change(dev, guid_tbl_blk_num + i, + port_num, + (u8 *)(&((struct ib_smp *)out_mad)->data)); + mlx4_ib_notify_slaves_on_guid_change(dev, guid_tbl_blk_num + i, + port_num, + (u8 *)(&((struct ib_smp *)out_mad)->data)); + } + +out: + kfree(in_mad); + kfree(out_mad); + return; +} + +void handle_port_mgmt_change_event(struct work_struct *work) +{ + struct ib_event_work *ew = container_of(work, struct ib_event_work, work); + struct mlx4_ib_dev *dev = ew->ib_dev; + struct mlx4_eqe *eqe = &(ew->ib_eqe); + u8 port = eqe->event.port_mgmt_change.port; + u32 changed_attr; + u32 tbl_block; + u32 change_bitmap; + + switch (eqe->subtype) { + case MLX4_DEV_PMC_SUBTYPE_PORT_INFO: + changed_attr = be32_to_cpu(eqe->event.port_mgmt_change.params.port_info.changed_attr); + + /* Update the SM ah - This should be done before handling + the other changed attributes so that MADs can be sent to the SM */ + if (changed_attr & MSTR_SM_CHANGE_MASK) { + u16 lid = be16_to_cpu(eqe->event.port_mgmt_change.params.port_info.mstr_sm_lid); + u8 sl = eqe->event.port_mgmt_change.params.port_info.mstr_sm_sl & 0xf; + update_sm_ah(dev, port, lid, sl); + } + + /* Check if it is a lid change event */ + if (changed_attr & MLX4_EQ_PORT_INFO_LID_CHANGE_MASK) + handle_lid_change_event(dev, port); + + /* Generate GUID changed event */ + if (changed_attr & MLX4_EQ_PORT_INFO_GID_PFX_CHANGE_MASK) { + mlx4_ib_dispatch_event(dev, port, IB_EVENT_GID_CHANGE); + /*if master, notify all slaves*/ + if (mlx4_is_master(dev->dev)) + mlx4_gen_slaves_port_mgt_ev(dev->dev, port, + MLX4_EQ_PORT_INFO_GID_PFX_CHANGE_MASK); + } + + if (changed_attr & MLX4_EQ_PORT_INFO_CLIENT_REREG_MASK) + handle_client_rereg_event(dev, port); + break; + + case MLX4_DEV_PMC_SUBTYPE_PKEY_TABLE: + mlx4_ib_dispatch_event(dev, port, IB_EVENT_PKEY_CHANGE); + if (mlx4_is_master(dev->dev) && !dev->sriov.is_going_down) + propagate_pkey_ev(dev, port, eqe); + break; + case MLX4_DEV_PMC_SUBTYPE_GUID_INFO: + /* paravirtualized master's guid is guid 0 -- does not change */ + if (!mlx4_is_master(dev->dev)) + mlx4_ib_dispatch_event(dev, port, IB_EVENT_GID_CHANGE); + /*if master, notify relevant slaves*/ + else if (!dev->sriov.is_going_down) { + tbl_block = GET_BLK_PTR_FROM_EQE(eqe); + change_bitmap = GET_MASK_FROM_EQE(eqe); + handle_slaves_guid_change(dev, port, tbl_block, change_bitmap); + } + break; + default: + pr_warn("Unsupported subtype 0x%x for " + "Port Management Change event\n", eqe->subtype); + } + + kfree(ew); +} + +void mlx4_ib_dispatch_event(struct mlx4_ib_dev *dev, u8 port_num, + enum ib_event_type type) +{ + struct ib_event event; + + event.device = &dev->ib_dev; + event.element.port_num = port_num; + event.event = type; + + ib_dispatch_event(&event); +} + +static void mlx4_ib_tunnel_comp_handler(struct ib_cq *cq, void *arg) +{ + unsigned long flags; + struct mlx4_ib_demux_pv_ctx *ctx = cq->cq_context; + struct mlx4_ib_dev *dev = to_mdev(ctx->ib_dev); + spin_lock_irqsave(&dev->sriov.going_down_lock, flags); + if (!dev->sriov.is_going_down && ctx->state == DEMUX_PV_STATE_ACTIVE) + queue_work(ctx->wq, &ctx->work); + spin_unlock_irqrestore(&dev->sriov.going_down_lock, flags); +} + +static int mlx4_ib_post_pv_qp_buf(struct mlx4_ib_demux_pv_ctx *ctx, + struct mlx4_ib_demux_pv_qp *tun_qp, + int index) +{ + struct ib_sge sg_list; + struct ib_recv_wr recv_wr, *bad_recv_wr; + int size; + + size = (tun_qp->qp->qp_type == IB_QPT_UD) ? + sizeof (struct mlx4_tunnel_mad) : sizeof (struct mlx4_mad_rcv_buf); + + sg_list.addr = tun_qp->ring[index].map; + sg_list.length = size; + sg_list.lkey = ctx->mr->lkey; + + recv_wr.next = NULL; + recv_wr.sg_list = &sg_list; + recv_wr.num_sge = 1; + recv_wr.wr_id = (u64) index | MLX4_TUN_WRID_RECV | + MLX4_TUN_SET_WRID_QPN(tun_qp->proxy_qpt); + ib_dma_sync_single_for_device(ctx->ib_dev, tun_qp->ring[index].map, + size, DMA_FROM_DEVICE); + return ib_post_recv(tun_qp->qp, &recv_wr, &bad_recv_wr); +} + +static int mlx4_ib_multiplex_sa_handler(struct ib_device *ibdev, int port, + int slave, struct ib_sa_mad *sa_mad) +{ + int ret = 0; + + /* dispatch to different sa handlers */ + switch (be16_to_cpu(sa_mad->mad_hdr.attr_id)) { + case IB_SA_ATTR_MC_MEMBER_REC: + ret = mlx4_ib_mcg_multiplex_handler(ibdev, port, slave, sa_mad); + break; + default: + break; + } + return ret; +} + +static int is_proxy_qp0(struct mlx4_ib_dev *dev, int qpn, int slave) +{ + int proxy_start = dev->dev->phys_caps.base_proxy_sqpn + 8 * slave; + + return (qpn >= proxy_start && qpn <= proxy_start + 1); +} + + +int mlx4_ib_send_to_wire(struct mlx4_ib_dev *dev, int slave, u8 port, + enum ib_qp_type dest_qpt, u16 pkey_index, + u32 remote_qpn, u32 qkey, struct ib_ah_attr *attr, + u8 *s_mac, struct ib_mad *mad) +{ + struct ib_sge list; + struct ib_send_wr wr, *bad_wr; + struct mlx4_ib_demux_pv_ctx *sqp_ctx; + struct mlx4_ib_demux_pv_qp *sqp; + struct mlx4_mad_snd_buf *sqp_mad; + struct ib_ah *ah; + struct ib_qp *send_qp = NULL; + unsigned wire_tx_ix = 0; + int ret = 0; + u16 wire_pkey_ix; + int src_qpnum; + u8 sgid_index; + + + sqp_ctx = dev->sriov.sqps[port-1]; + + /* check if proxy qp created */ + if (!sqp_ctx || sqp_ctx->state != DEMUX_PV_STATE_ACTIVE) + return -EAGAIN; + + if (dest_qpt == IB_QPT_SMI) { + src_qpnum = 0; + sqp = &sqp_ctx->qp[0]; + wire_pkey_ix = dev->pkeys.virt2phys_pkey[slave][port - 1][0]; + } else { + src_qpnum = 1; + sqp = &sqp_ctx->qp[1]; + wire_pkey_ix = dev->pkeys.virt2phys_pkey[slave][port - 1][pkey_index]; + } + + send_qp = sqp->qp; + + /* create ah */ + sgid_index = attr->grh.sgid_index; + attr->grh.sgid_index = 0; + ah = ib_create_ah(sqp_ctx->pd, attr); + if (IS_ERR(ah)) + return -ENOMEM; + attr->grh.sgid_index = sgid_index; + to_mah(ah)->av.ib.gid_index = sgid_index; + /* get rid of force-loopback bit */ + to_mah(ah)->av.ib.port_pd &= cpu_to_be32(0x7FFFFFFF); + spin_lock(&sqp->tx_lock); + if (sqp->tx_ix_head - sqp->tx_ix_tail >= + (MLX4_NUM_TUNNEL_BUFS - 1)) + ret = -EAGAIN; + else + wire_tx_ix = (++sqp->tx_ix_head) & (MLX4_NUM_TUNNEL_BUFS - 1); + spin_unlock(&sqp->tx_lock); + if (ret) + goto out; + + sqp_mad = (struct mlx4_mad_snd_buf *) (sqp->tx_ring[wire_tx_ix].buf.addr); + if (sqp->tx_ring[wire_tx_ix].ah) + ib_destroy_ah(sqp->tx_ring[wire_tx_ix].ah); + sqp->tx_ring[wire_tx_ix].ah = ah; + ib_dma_sync_single_for_cpu(&dev->ib_dev, + sqp->tx_ring[wire_tx_ix].buf.map, + sizeof (struct mlx4_mad_snd_buf), + DMA_TO_DEVICE); + + memcpy(&sqp_mad->payload, mad, sizeof *mad); + + ib_dma_sync_single_for_device(&dev->ib_dev, + sqp->tx_ring[wire_tx_ix].buf.map, + sizeof (struct mlx4_mad_snd_buf), + DMA_TO_DEVICE); + + list.addr = sqp->tx_ring[wire_tx_ix].buf.map; + list.length = sizeof (struct mlx4_mad_snd_buf); + list.lkey = sqp_ctx->mr->lkey; + + wr.wr.ud.ah = ah; + wr.wr.ud.port_num = port; + wr.wr.ud.pkey_index = wire_pkey_ix; + wr.wr.ud.remote_qkey = qkey; + wr.wr.ud.remote_qpn = remote_qpn; + wr.next = NULL; + wr.wr_id = ((u64) wire_tx_ix) | MLX4_TUN_SET_WRID_QPN(src_qpnum); + wr.sg_list = &list; + wr.num_sge = 1; + wr.opcode = IB_WR_SEND; + wr.send_flags = IB_SEND_SIGNALED; + if (s_mac) + memcpy(to_mah(ah)->av.eth.s_mac, s_mac, 6); + + + ret = ib_post_send(send_qp, &wr, &bad_wr); +out: + if (ret) + ib_destroy_ah(ah); + return ret; +} + +static int get_slave_base_gid_ix(struct mlx4_ib_dev *dev, int slave, int port) +{ + if (rdma_port_get_link_layer(&dev->ib_dev, port) == IB_LINK_LAYER_INFINIBAND) + return slave; + return mlx4_get_base_gid_ix(dev->dev, slave, port); +} + +static void fill_in_real_sgid_index(struct mlx4_ib_dev *dev, int slave, int port, + struct ib_ah_attr *ah_attr) +{ + if (rdma_port_get_link_layer(&dev->ib_dev, port) == IB_LINK_LAYER_INFINIBAND) + ah_attr->grh.sgid_index = slave; + else + ah_attr->grh.sgid_index += get_slave_base_gid_ix(dev, slave, port); +} + +static void mlx4_ib_multiplex_mad(struct mlx4_ib_demux_pv_ctx *ctx, struct ib_wc *wc) +{ + struct mlx4_ib_dev *dev = to_mdev(ctx->ib_dev); + struct mlx4_ib_demux_pv_qp *tun_qp = &ctx->qp[MLX4_TUN_WRID_QPN(wc->wr_id)]; + int wr_ix = wc->wr_id & (MLX4_NUM_TUNNEL_BUFS - 1); + struct mlx4_tunnel_mad *tunnel = tun_qp->ring[wr_ix].addr; + struct mlx4_ib_ah ah; + struct ib_ah_attr ah_attr; + u8 *slave_id; + int slave; + int port; + + /* Get slave that sent this packet */ + if (wc->src_qp < dev->dev->phys_caps.base_proxy_sqpn || + wc->src_qp >= dev->dev->phys_caps.base_proxy_sqpn + 8 * MLX4_MFUNC_MAX || + (wc->src_qp & 0x1) != ctx->port - 1 || + wc->src_qp & 0x4) { + mlx4_ib_warn(ctx->ib_dev, "can't multiplex bad sqp:%d\n", wc->src_qp); + return; + } + slave = ((wc->src_qp & ~0x7) - dev->dev->phys_caps.base_proxy_sqpn) / 8; + if (slave != ctx->slave) { + mlx4_ib_warn(ctx->ib_dev, "can't multiplex bad sqp:%d: " + "belongs to another slave\n", wc->src_qp); + return; + } + + /* Map transaction ID */ + ib_dma_sync_single_for_cpu(ctx->ib_dev, tun_qp->ring[wr_ix].map, + sizeof (struct mlx4_tunnel_mad), + DMA_FROM_DEVICE); + switch (tunnel->mad.mad_hdr.method) { + case IB_MGMT_METHOD_SET: + case IB_MGMT_METHOD_GET: + case IB_MGMT_METHOD_REPORT: + case IB_SA_METHOD_GET_TABLE: + case IB_SA_METHOD_DELETE: + case IB_SA_METHOD_GET_MULTI: + case IB_SA_METHOD_GET_TRACE_TBL: + slave_id = (u8 *) &tunnel->mad.mad_hdr.tid; + if (*slave_id) { + mlx4_ib_warn(ctx->ib_dev, "egress mad has non-null tid msb:%d " + "class:%d slave:%d\n", *slave_id, + tunnel->mad.mad_hdr.mgmt_class, slave); + return; + } else + *slave_id = slave; + default: + /* nothing */; + } + + /* Class-specific handling */ + switch (tunnel->mad.mad_hdr.mgmt_class) { + case IB_MGMT_CLASS_SUBN_LID_ROUTED: + case IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE: + if (slave != mlx4_master_func_num(dev->dev) && + !mlx4_vf_smi_enabled(dev->dev, slave, ctx->port)) + return; + break; + case IB_MGMT_CLASS_SUBN_ADM: + if (mlx4_ib_multiplex_sa_handler(ctx->ib_dev, ctx->port, slave, + (struct ib_sa_mad *) &tunnel->mad)) + return; + break; + case IB_MGMT_CLASS_CM: + if (mlx4_ib_multiplex_cm_handler(ctx->ib_dev, ctx->port, slave, + (struct ib_mad *) &tunnel->mad)) + return; + break; + case IB_MGMT_CLASS_DEVICE_MGMT: + if (tunnel->mad.mad_hdr.method != IB_MGMT_METHOD_GET && + tunnel->mad.mad_hdr.method != IB_MGMT_METHOD_SET) + return; + break; + default: + /* Drop unsupported classes for slaves in tunnel mode */ + if (slave != mlx4_master_func_num(dev->dev)) { + mlx4_ib_warn(ctx->ib_dev, "dropping unsupported egress mad from class:%d " + "for slave:%d\n", tunnel->mad.mad_hdr.mgmt_class, slave); + return; + } + } + + /* We are using standard ib_core services to send the mad, so generate a + * stadard address handle by decoding the tunnelled mlx4_ah fields */ + memcpy(&ah.av, &tunnel->hdr.av, sizeof (struct mlx4_av)); + ah.ibah.device = ctx->ib_dev; + mlx4_ib_query_ah(&ah.ibah, &ah_attr); + if (ah_attr.ah_flags & IB_AH_GRH) + fill_in_real_sgid_index(dev, slave, ctx->port, &ah_attr); + + port = mlx4_slave_convert_port(dev->dev, slave, ah_attr.port_num); + if (port < 0) + return; + ah_attr.port_num = port; + memcpy(ah_attr.dmac, tunnel->hdr.mac, 6); + ah_attr.vlan_id = be16_to_cpu(tunnel->hdr.vlan); + /* if slave have default vlan use it */ + mlx4_get_slave_default_vlan(dev->dev, ctx->port, slave, + &ah_attr.vlan_id, &ah_attr.sl); + + mlx4_ib_send_to_wire(dev, slave, ctx->port, + is_proxy_qp0(dev, wc->src_qp, slave) ? + IB_QPT_SMI : IB_QPT_GSI, + be16_to_cpu(tunnel->hdr.pkey_index), + be32_to_cpu(tunnel->hdr.remote_qpn), + be32_to_cpu(tunnel->hdr.qkey), + &ah_attr, wc->smac, &tunnel->mad); +} + +static int mlx4_ib_alloc_pv_bufs(struct mlx4_ib_demux_pv_ctx *ctx, + enum ib_qp_type qp_type, int is_tun) +{ + int i; + struct mlx4_ib_demux_pv_qp *tun_qp; + int rx_buf_size, tx_buf_size; + + if (qp_type > IB_QPT_GSI) + return -EINVAL; + + tun_qp = &ctx->qp[qp_type]; + + tun_qp->ring = kzalloc(sizeof (struct mlx4_ib_buf) * MLX4_NUM_TUNNEL_BUFS, + GFP_KERNEL); + if (!tun_qp->ring) + return -ENOMEM; + + tun_qp->tx_ring = kcalloc(MLX4_NUM_TUNNEL_BUFS, + sizeof (struct mlx4_ib_tun_tx_buf), + GFP_KERNEL); + if (!tun_qp->tx_ring) { + kfree(tun_qp->ring); + tun_qp->ring = NULL; + return -ENOMEM; + } + + if (is_tun) { + rx_buf_size = sizeof (struct mlx4_tunnel_mad); + tx_buf_size = sizeof (struct mlx4_rcv_tunnel_mad); + } else { + rx_buf_size = sizeof (struct mlx4_mad_rcv_buf); + tx_buf_size = sizeof (struct mlx4_mad_snd_buf); + } + + for (i = 0; i < MLX4_NUM_TUNNEL_BUFS; i++) { + tun_qp->ring[i].addr = kmalloc(rx_buf_size, GFP_KERNEL); + if (!tun_qp->ring[i].addr) + goto err; + tun_qp->ring[i].map = ib_dma_map_single(ctx->ib_dev, + tun_qp->ring[i].addr, + rx_buf_size, + DMA_FROM_DEVICE); + } + + for (i = 0; i < MLX4_NUM_TUNNEL_BUFS; i++) { + tun_qp->tx_ring[i].buf.addr = + kmalloc(tx_buf_size, GFP_KERNEL); + if (!tun_qp->tx_ring[i].buf.addr) + goto tx_err; + tun_qp->tx_ring[i].buf.map = + ib_dma_map_single(ctx->ib_dev, + tun_qp->tx_ring[i].buf.addr, + tx_buf_size, + DMA_TO_DEVICE); + tun_qp->tx_ring[i].ah = NULL; + } + spin_lock_init(&tun_qp->tx_lock); + tun_qp->tx_ix_head = 0; + tun_qp->tx_ix_tail = 0; + tun_qp->proxy_qpt = qp_type; + + return 0; + +tx_err: + while (i > 0) { + --i; + ib_dma_unmap_single(ctx->ib_dev, tun_qp->tx_ring[i].buf.map, + tx_buf_size, DMA_TO_DEVICE); + kfree(tun_qp->tx_ring[i].buf.addr); + } + kfree(tun_qp->tx_ring); + tun_qp->tx_ring = NULL; + i = MLX4_NUM_TUNNEL_BUFS; +err: + while (i > 0) { + --i; + ib_dma_unmap_single(ctx->ib_dev, tun_qp->ring[i].map, + rx_buf_size, DMA_FROM_DEVICE); + kfree(tun_qp->ring[i].addr); + } + kfree(tun_qp->ring); + tun_qp->ring = NULL; + return -ENOMEM; +} + +static void mlx4_ib_free_pv_qp_bufs(struct mlx4_ib_demux_pv_ctx *ctx, + enum ib_qp_type qp_type, int is_tun) +{ + int i; + struct mlx4_ib_demux_pv_qp *tun_qp; + int rx_buf_size, tx_buf_size; + + if (qp_type > IB_QPT_GSI) + return; + + tun_qp = &ctx->qp[qp_type]; + if (is_tun) { + rx_buf_size = sizeof (struct mlx4_tunnel_mad); + tx_buf_size = sizeof (struct mlx4_rcv_tunnel_mad); + } else { + rx_buf_size = sizeof (struct mlx4_mad_rcv_buf); + tx_buf_size = sizeof (struct mlx4_mad_snd_buf); + } + + + for (i = 0; i < MLX4_NUM_TUNNEL_BUFS; i++) { + ib_dma_unmap_single(ctx->ib_dev, tun_qp->ring[i].map, + rx_buf_size, DMA_FROM_DEVICE); + kfree(tun_qp->ring[i].addr); + } + + for (i = 0; i < MLX4_NUM_TUNNEL_BUFS; i++) { + ib_dma_unmap_single(ctx->ib_dev, tun_qp->tx_ring[i].buf.map, + tx_buf_size, DMA_TO_DEVICE); + kfree(tun_qp->tx_ring[i].buf.addr); + if (tun_qp->tx_ring[i].ah) + ib_destroy_ah(tun_qp->tx_ring[i].ah); + } + kfree(tun_qp->tx_ring); + kfree(tun_qp->ring); +} + +static void mlx4_ib_tunnel_comp_worker(struct work_struct *work) +{ + struct mlx4_ib_demux_pv_ctx *ctx; + struct mlx4_ib_demux_pv_qp *tun_qp; + struct ib_wc wc; + int ret; + ctx = container_of(work, struct mlx4_ib_demux_pv_ctx, work); + ib_req_notify_cq(ctx->cq, IB_CQ_NEXT_COMP); + + while (ib_poll_cq(ctx->cq, 1, &wc) == 1) { + tun_qp = &ctx->qp[MLX4_TUN_WRID_QPN(wc.wr_id)]; + if (wc.status == IB_WC_SUCCESS) { + switch (wc.opcode) { + case IB_WC_RECV: + mlx4_ib_multiplex_mad(ctx, &wc); + ret = mlx4_ib_post_pv_qp_buf(ctx, tun_qp, + wc.wr_id & + (MLX4_NUM_TUNNEL_BUFS - 1)); + if (ret) + pr_err("Failed reposting tunnel " + "buf:%lld\n", wc.wr_id); + break; + case IB_WC_SEND: + pr_debug("received tunnel send completion:" + "wrid=0x%llx, status=0x%x\n", + wc.wr_id, wc.status); + ib_destroy_ah(tun_qp->tx_ring[wc.wr_id & + (MLX4_NUM_TUNNEL_BUFS - 1)].ah); + tun_qp->tx_ring[wc.wr_id & (MLX4_NUM_TUNNEL_BUFS - 1)].ah + = NULL; + spin_lock(&tun_qp->tx_lock); + tun_qp->tx_ix_tail++; + spin_unlock(&tun_qp->tx_lock); + + break; + default: + break; + } + } else { + pr_debug("mlx4_ib: completion error in tunnel: %d." + " status = %d, wrid = 0x%llx\n", + ctx->slave, wc.status, wc.wr_id); + if (!MLX4_TUN_IS_RECV(wc.wr_id)) { + ib_destroy_ah(tun_qp->tx_ring[wc.wr_id & + (MLX4_NUM_TUNNEL_BUFS - 1)].ah); + tun_qp->tx_ring[wc.wr_id & (MLX4_NUM_TUNNEL_BUFS - 1)].ah + = NULL; + spin_lock(&tun_qp->tx_lock); + tun_qp->tx_ix_tail++; + spin_unlock(&tun_qp->tx_lock); + } + } + } +} + +static void pv_qp_event_handler(struct ib_event *event, void *qp_context) +{ + struct mlx4_ib_demux_pv_ctx *sqp = qp_context; + + /* It's worse than that! He's dead, Jim! */ + pr_err("Fatal error (%d) on a MAD QP on port %d\n", + event->event, sqp->port); +} + +static int create_pv_sqp(struct mlx4_ib_demux_pv_ctx *ctx, + enum ib_qp_type qp_type, int create_tun) +{ + int i, ret; + struct mlx4_ib_demux_pv_qp *tun_qp; + struct mlx4_ib_qp_tunnel_init_attr qp_init_attr; + struct ib_qp_attr attr; + int qp_attr_mask_INIT; + + if (qp_type > IB_QPT_GSI) + return -EINVAL; + + tun_qp = &ctx->qp[qp_type]; + + memset(&qp_init_attr, 0, sizeof qp_init_attr); + qp_init_attr.init_attr.send_cq = ctx->cq; + qp_init_attr.init_attr.recv_cq = ctx->cq; + qp_init_attr.init_attr.sq_sig_type = IB_SIGNAL_ALL_WR; + qp_init_attr.init_attr.cap.max_send_wr = MLX4_NUM_TUNNEL_BUFS; + qp_init_attr.init_attr.cap.max_recv_wr = MLX4_NUM_TUNNEL_BUFS; + qp_init_attr.init_attr.cap.max_send_sge = 1; + qp_init_attr.init_attr.cap.max_recv_sge = 1; + if (create_tun) { + qp_init_attr.init_attr.qp_type = IB_QPT_UD; + qp_init_attr.init_attr.create_flags = MLX4_IB_SRIOV_TUNNEL_QP; + qp_init_attr.port = ctx->port; + qp_init_attr.slave = ctx->slave; + qp_init_attr.proxy_qp_type = qp_type; + qp_attr_mask_INIT = IB_QP_STATE | IB_QP_PKEY_INDEX | + IB_QP_QKEY | IB_QP_PORT; + } else { + qp_init_attr.init_attr.qp_type = qp_type; + qp_init_attr.init_attr.create_flags = MLX4_IB_SRIOV_SQP; + qp_attr_mask_INIT = IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_QKEY; + } + qp_init_attr.init_attr.port_num = ctx->port; + qp_init_attr.init_attr.qp_context = ctx; + qp_init_attr.init_attr.event_handler = pv_qp_event_handler; + tun_qp->qp = ib_create_qp(ctx->pd, &qp_init_attr.init_attr); + if (IS_ERR(tun_qp->qp)) { + ret = PTR_ERR(tun_qp->qp); + tun_qp->qp = NULL; + pr_err("Couldn't create %s QP (%d)\n", + create_tun ? "tunnel" : "special", ret); + return ret; + } + + memset(&attr, 0, sizeof attr); + attr.qp_state = IB_QPS_INIT; + ret = 0; + if (create_tun) + ret = find_slave_port_pkey_ix(to_mdev(ctx->ib_dev), ctx->slave, + ctx->port, IB_DEFAULT_PKEY_FULL, + &attr.pkey_index); + if (ret || !create_tun) + attr.pkey_index = + to_mdev(ctx->ib_dev)->pkeys.virt2phys_pkey[ctx->slave][ctx->port - 1][0]; + attr.qkey = IB_QP1_QKEY; + attr.port_num = ctx->port; + ret = ib_modify_qp(tun_qp->qp, &attr, qp_attr_mask_INIT); + if (ret) { + pr_err("Couldn't change %s qp state to INIT (%d)\n", + create_tun ? "tunnel" : "special", ret); + goto err_qp; + } + attr.qp_state = IB_QPS_RTR; + ret = ib_modify_qp(tun_qp->qp, &attr, IB_QP_STATE); + if (ret) { + pr_err("Couldn't change %s qp state to RTR (%d)\n", + create_tun ? "tunnel" : "special", ret); + goto err_qp; + } + attr.qp_state = IB_QPS_RTS; + attr.sq_psn = 0; + ret = ib_modify_qp(tun_qp->qp, &attr, IB_QP_STATE | IB_QP_SQ_PSN); + if (ret) { + pr_err("Couldn't change %s qp state to RTS (%d)\n", + create_tun ? "tunnel" : "special", ret); + goto err_qp; + } + + for (i = 0; i < MLX4_NUM_TUNNEL_BUFS; i++) { + ret = mlx4_ib_post_pv_qp_buf(ctx, tun_qp, i); + if (ret) { + pr_err(" mlx4_ib_post_pv_buf error" + " (err = %d, i = %d)\n", ret, i); + goto err_qp; + } + } + return 0; + +err_qp: + ib_destroy_qp(tun_qp->qp); + tun_qp->qp = NULL; + return ret; +} + +/* + * IB MAD completion callback for real SQPs + */ +static void mlx4_ib_sqp_comp_worker(struct work_struct *work) +{ + struct mlx4_ib_demux_pv_ctx *ctx; + struct mlx4_ib_demux_pv_qp *sqp; + struct ib_wc wc; + struct ib_grh *grh; + struct ib_mad *mad; + + ctx = container_of(work, struct mlx4_ib_demux_pv_ctx, work); + ib_req_notify_cq(ctx->cq, IB_CQ_NEXT_COMP); + + while (mlx4_ib_poll_cq(ctx->cq, 1, &wc) == 1) { + sqp = &ctx->qp[MLX4_TUN_WRID_QPN(wc.wr_id)]; + if (wc.status == IB_WC_SUCCESS) { + switch (wc.opcode) { + case IB_WC_SEND: + ib_destroy_ah(sqp->tx_ring[wc.wr_id & + (MLX4_NUM_TUNNEL_BUFS - 1)].ah); + sqp->tx_ring[wc.wr_id & (MLX4_NUM_TUNNEL_BUFS - 1)].ah + = NULL; + spin_lock(&sqp->tx_lock); + sqp->tx_ix_tail++; + spin_unlock(&sqp->tx_lock); + break; + case IB_WC_RECV: + mad = (struct ib_mad *) &(((struct mlx4_mad_rcv_buf *) + (sqp->ring[wc.wr_id & + (MLX4_NUM_TUNNEL_BUFS - 1)].addr))->payload); + grh = &(((struct mlx4_mad_rcv_buf *) + (sqp->ring[wc.wr_id & + (MLX4_NUM_TUNNEL_BUFS - 1)].addr))->grh); + mlx4_ib_demux_mad(ctx->ib_dev, ctx->port, &wc, grh, mad); + if (mlx4_ib_post_pv_qp_buf(ctx, sqp, wc.wr_id & + (MLX4_NUM_TUNNEL_BUFS - 1))) + pr_err("Failed reposting SQP " + "buf:%lld\n", wc.wr_id); + break; + default: + BUG_ON(1); + break; + } + } else { + pr_debug("mlx4_ib: completion error in tunnel: %d." + " status = %d, wrid = 0x%llx\n", + ctx->slave, wc.status, wc.wr_id); + if (!MLX4_TUN_IS_RECV(wc.wr_id)) { + ib_destroy_ah(sqp->tx_ring[wc.wr_id & + (MLX4_NUM_TUNNEL_BUFS - 1)].ah); + sqp->tx_ring[wc.wr_id & (MLX4_NUM_TUNNEL_BUFS - 1)].ah + = NULL; + spin_lock(&sqp->tx_lock); + sqp->tx_ix_tail++; + spin_unlock(&sqp->tx_lock); + } + } + } +} + +static int alloc_pv_object(struct mlx4_ib_dev *dev, int slave, int port, + struct mlx4_ib_demux_pv_ctx **ret_ctx) +{ + struct mlx4_ib_demux_pv_ctx *ctx; + + *ret_ctx = NULL; + ctx = kzalloc(sizeof (struct mlx4_ib_demux_pv_ctx), GFP_KERNEL); + if (!ctx) { + pr_err("failed allocating pv resource context " + "for port %d, slave %d\n", port, slave); + return -ENOMEM; + } + + ctx->ib_dev = &dev->ib_dev; + ctx->port = port; + ctx->slave = slave; + *ret_ctx = ctx; + return 0; +} + +static void free_pv_object(struct mlx4_ib_dev *dev, int slave, int port) +{ + if (dev->sriov.demux[port - 1].tun[slave]) { + kfree(dev->sriov.demux[port - 1].tun[slave]); + dev->sriov.demux[port - 1].tun[slave] = NULL; + } +} + +static int create_pv_resources(struct ib_device *ibdev, int slave, int port, + int create_tun, struct mlx4_ib_demux_pv_ctx *ctx) +{ + int ret, cq_size; + + if (ctx->state != DEMUX_PV_STATE_DOWN) + return -EEXIST; + + ctx->state = DEMUX_PV_STATE_STARTING; + /* have QP0 only if link layer is IB */ + if (rdma_port_get_link_layer(ibdev, ctx->port) == + IB_LINK_LAYER_INFINIBAND) + ctx->has_smi = 1; + + if (ctx->has_smi) { + ret = mlx4_ib_alloc_pv_bufs(ctx, IB_QPT_SMI, create_tun); + if (ret) { + pr_err("Failed allocating qp0 tunnel bufs (%d)\n", ret); + goto err_out; + } + } + + ret = mlx4_ib_alloc_pv_bufs(ctx, IB_QPT_GSI, create_tun); + if (ret) { + pr_err("Failed allocating qp1 tunnel bufs (%d)\n", ret); + goto err_out_qp0; + } + + cq_size = 2 * MLX4_NUM_TUNNEL_BUFS; + if (ctx->has_smi) + cq_size *= 2; + + ctx->cq = ib_create_cq(ctx->ib_dev, mlx4_ib_tunnel_comp_handler, + NULL, ctx, cq_size, 0); + if (IS_ERR(ctx->cq)) { + ret = PTR_ERR(ctx->cq); + pr_err("Couldn't create tunnel CQ (%d)\n", ret); + goto err_buf; + } + + ctx->pd = ib_alloc_pd(ctx->ib_dev); + if (IS_ERR(ctx->pd)) { + ret = PTR_ERR(ctx->pd); + pr_err("Couldn't create tunnel PD (%d)\n", ret); + goto err_cq; + } + + ctx->mr = ib_get_dma_mr(ctx->pd, IB_ACCESS_LOCAL_WRITE); + if (IS_ERR(ctx->mr)) { + ret = PTR_ERR(ctx->mr); + pr_err("Couldn't get tunnel DMA MR (%d)\n", ret); + goto err_pd; + } + + if (ctx->has_smi) { + ret = create_pv_sqp(ctx, IB_QPT_SMI, create_tun); + if (ret) { + pr_err("Couldn't create %s QP0 (%d)\n", + create_tun ? "tunnel for" : "", ret); + goto err_mr; + } + } + + ret = create_pv_sqp(ctx, IB_QPT_GSI, create_tun); + if (ret) { + pr_err("Couldn't create %s QP1 (%d)\n", + create_tun ? "tunnel for" : "", ret); + goto err_qp0; + } + + if (create_tun) + INIT_WORK(&ctx->work, mlx4_ib_tunnel_comp_worker); + else + INIT_WORK(&ctx->work, mlx4_ib_sqp_comp_worker); + + ctx->wq = to_mdev(ibdev)->sriov.demux[port - 1].wq; + + ret = ib_req_notify_cq(ctx->cq, IB_CQ_NEXT_COMP); + if (ret) { + pr_err("Couldn't arm tunnel cq (%d)\n", ret); + goto err_wq; + } + ctx->state = DEMUX_PV_STATE_ACTIVE; + return 0; + +err_wq: + ctx->wq = NULL; + ib_destroy_qp(ctx->qp[1].qp); + ctx->qp[1].qp = NULL; + + +err_qp0: + if (ctx->has_smi) + ib_destroy_qp(ctx->qp[0].qp); + ctx->qp[0].qp = NULL; + +err_mr: + ib_dereg_mr(ctx->mr); + ctx->mr = NULL; + +err_pd: + ib_dealloc_pd(ctx->pd); + ctx->pd = NULL; + +err_cq: + ib_destroy_cq(ctx->cq); + ctx->cq = NULL; + +err_buf: + mlx4_ib_free_pv_qp_bufs(ctx, IB_QPT_GSI, create_tun); + +err_out_qp0: + if (ctx->has_smi) + mlx4_ib_free_pv_qp_bufs(ctx, IB_QPT_SMI, create_tun); +err_out: + ctx->state = DEMUX_PV_STATE_DOWN; + return ret; +} + +static void destroy_pv_resources(struct mlx4_ib_dev *dev, int slave, int port, + struct mlx4_ib_demux_pv_ctx *ctx, int flush) +{ + if (!ctx) + return; + if (ctx->state > DEMUX_PV_STATE_DOWN) { + ctx->state = DEMUX_PV_STATE_DOWNING; + if (flush) + flush_workqueue(ctx->wq); + if (ctx->has_smi) { + ib_destroy_qp(ctx->qp[0].qp); + ctx->qp[0].qp = NULL; + mlx4_ib_free_pv_qp_bufs(ctx, IB_QPT_SMI, 1); + } + ib_destroy_qp(ctx->qp[1].qp); + ctx->qp[1].qp = NULL; + mlx4_ib_free_pv_qp_bufs(ctx, IB_QPT_GSI, 1); + ib_dereg_mr(ctx->mr); + ctx->mr = NULL; + ib_dealloc_pd(ctx->pd); + ctx->pd = NULL; + ib_destroy_cq(ctx->cq); + ctx->cq = NULL; + ctx->state = DEMUX_PV_STATE_DOWN; + } +} + +static int mlx4_ib_tunnels_update(struct mlx4_ib_dev *dev, int slave, + int port, int do_init) +{ + int ret = 0; + + if (!do_init) { + clean_vf_mcast(&dev->sriov.demux[port - 1], slave); + /* for master, destroy real sqp resources */ + if (slave == mlx4_master_func_num(dev->dev)) + destroy_pv_resources(dev, slave, port, + dev->sriov.sqps[port - 1], 1); + /* destroy the tunnel qp resources */ + destroy_pv_resources(dev, slave, port, + dev->sriov.demux[port - 1].tun[slave], 1); + return 0; + } + + /* create the tunnel qp resources */ + ret = create_pv_resources(&dev->ib_dev, slave, port, 1, + dev->sriov.demux[port - 1].tun[slave]); + + /* for master, create the real sqp resources */ + if (!ret && slave == mlx4_master_func_num(dev->dev)) + ret = create_pv_resources(&dev->ib_dev, slave, port, 0, + dev->sriov.sqps[port - 1]); + return ret; +} + +void mlx4_ib_tunnels_update_work(struct work_struct *work) +{ + struct mlx4_ib_demux_work *dmxw; + + dmxw = container_of(work, struct mlx4_ib_demux_work, work); + mlx4_ib_tunnels_update(dmxw->dev, dmxw->slave, (int) dmxw->port, + dmxw->do_init); + kfree(dmxw); + return; +} + +static int mlx4_ib_alloc_demux_ctx(struct mlx4_ib_dev *dev, + struct mlx4_ib_demux_ctx *ctx, + int port) +{ + char name[12]; + int ret = 0; + int i; + + ctx->tun = kcalloc(dev->dev->caps.sqp_demux, + sizeof (struct mlx4_ib_demux_pv_ctx *), GFP_KERNEL); + if (!ctx->tun) + return -ENOMEM; + + ctx->dev = dev; + ctx->port = port; + ctx->ib_dev = &dev->ib_dev; + + for (i = 0; + i < min(dev->dev->caps.sqp_demux, (u16)(dev->dev->num_vfs + 1)); + i++) { + struct mlx4_active_ports actv_ports = + mlx4_get_active_ports(dev->dev, i); + + if (!test_bit(port - 1, actv_ports.ports)) + continue; + + ret = alloc_pv_object(dev, i, port, &ctx->tun[i]); + if (ret) { + ret = -ENOMEM; + goto err_mcg; + } + } + + ret = mlx4_ib_mcg_port_init(ctx); + if (ret) { + pr_err("Failed initializing mcg para-virt (%d)\n", ret); + goto err_mcg; + } + + snprintf(name, sizeof name, "mlx4_ibt%d", port); + ctx->wq = create_singlethread_workqueue(name); + if (!ctx->wq) { + pr_err("Failed to create tunnelling WQ for port %d\n", port); + ret = -ENOMEM; + goto err_wq; + } + + snprintf(name, sizeof name, "mlx4_ibud%d", port); + ctx->ud_wq = create_singlethread_workqueue(name); + if (!ctx->ud_wq) { + pr_err("Failed to create up/down WQ for port %d\n", port); + ret = -ENOMEM; + goto err_udwq; + } + + return 0; + +err_udwq: + destroy_workqueue(ctx->wq); + ctx->wq = NULL; + +err_wq: + mlx4_ib_mcg_port_cleanup(ctx, 1); +err_mcg: + for (i = 0; i < dev->dev->caps.sqp_demux; i++) + free_pv_object(dev, i, port); + kfree(ctx->tun); + ctx->tun = NULL; + return ret; +} + +static void mlx4_ib_free_sqp_ctx(struct mlx4_ib_demux_pv_ctx *sqp_ctx) +{ + if (sqp_ctx->state > DEMUX_PV_STATE_DOWN) { + sqp_ctx->state = DEMUX_PV_STATE_DOWNING; + flush_workqueue(sqp_ctx->wq); + if (sqp_ctx->has_smi) { + ib_destroy_qp(sqp_ctx->qp[0].qp); + sqp_ctx->qp[0].qp = NULL; + mlx4_ib_free_pv_qp_bufs(sqp_ctx, IB_QPT_SMI, 0); + } + ib_destroy_qp(sqp_ctx->qp[1].qp); + sqp_ctx->qp[1].qp = NULL; + mlx4_ib_free_pv_qp_bufs(sqp_ctx, IB_QPT_GSI, 0); + ib_dereg_mr(sqp_ctx->mr); + sqp_ctx->mr = NULL; + ib_dealloc_pd(sqp_ctx->pd); + sqp_ctx->pd = NULL; + ib_destroy_cq(sqp_ctx->cq); + sqp_ctx->cq = NULL; + sqp_ctx->state = DEMUX_PV_STATE_DOWN; + } +} + +static void mlx4_ib_free_demux_ctx(struct mlx4_ib_demux_ctx *ctx) +{ + int i; + if (ctx) { + struct mlx4_ib_dev *dev = to_mdev(ctx->ib_dev); + mlx4_ib_mcg_port_cleanup(ctx, 1); + for (i = 0; i < dev->dev->caps.sqp_demux; i++) { + if (!ctx->tun[i]) + continue; + if (ctx->tun[i]->state > DEMUX_PV_STATE_DOWN) + ctx->tun[i]->state = DEMUX_PV_STATE_DOWNING; + } + flush_workqueue(ctx->wq); + for (i = 0; i < dev->dev->caps.sqp_demux; i++) { + destroy_pv_resources(dev, i, ctx->port, ctx->tun[i], 0); + free_pv_object(dev, i, ctx->port); + } + kfree(ctx->tun); + destroy_workqueue(ctx->ud_wq); + destroy_workqueue(ctx->wq); + } +} + +static void mlx4_ib_master_tunnels(struct mlx4_ib_dev *dev, int do_init) +{ + int i; + + if (!mlx4_is_master(dev->dev)) + return; + /* initialize or tear down tunnel QPs for the master */ + for (i = 0; i < dev->dev->caps.num_ports; i++) + mlx4_ib_tunnels_update(dev, mlx4_master_func_num(dev->dev), i + 1, do_init); + return; +} + +int mlx4_ib_init_sriov(struct mlx4_ib_dev *dev) +{ + int i = 0; + int err; + + if (!mlx4_is_mfunc(dev->dev)) + return 0; + + dev->sriov.is_going_down = 0; + spin_lock_init(&dev->sriov.going_down_lock); + mlx4_ib_cm_paravirt_init(dev); + + mlx4_ib_warn(&dev->ib_dev, "multi-function enabled\n"); + + if (mlx4_is_slave(dev->dev)) { + mlx4_ib_warn(&dev->ib_dev, "operating in qp1 tunnel mode\n"); + return 0; + } + + for (i = 0; i < dev->dev->caps.sqp_demux; i++) { + if (i == mlx4_master_func_num(dev->dev)) + mlx4_put_slave_node_guid(dev->dev, i, dev->ib_dev.node_guid); + else + mlx4_put_slave_node_guid(dev->dev, i, mlx4_ib_gen_node_guid()); + } + + err = mlx4_ib_init_alias_guid_service(dev); + if (err) { + mlx4_ib_warn(&dev->ib_dev, "Failed init alias guid process.\n"); + goto paravirt_err; + } + err = mlx4_ib_device_register_sysfs(dev); + if (err) { + mlx4_ib_warn(&dev->ib_dev, "Failed to register sysfs\n"); + goto sysfs_err; + } + + mlx4_ib_warn(&dev->ib_dev, "initializing demux service for %d qp1 clients\n", + dev->dev->caps.sqp_demux); + for (i = 0; i < dev->num_ports; i++) { + union ib_gid gid; + err = __mlx4_ib_query_gid(&dev->ib_dev, i + 1, 0, &gid, 1); + if (err) + goto demux_err; + dev->sriov.demux[i].guid_cache[0] = gid.global.interface_id; + err = alloc_pv_object(dev, mlx4_master_func_num(dev->dev), i + 1, + &dev->sriov.sqps[i]); + if (err) + goto demux_err; + err = mlx4_ib_alloc_demux_ctx(dev, &dev->sriov.demux[i], i + 1); + if (err) + goto free_pv; + } + mlx4_ib_master_tunnels(dev, 1); + return 0; + +free_pv: + free_pv_object(dev, mlx4_master_func_num(dev->dev), i + 1); +demux_err: + while (--i >= 0) { + free_pv_object(dev, mlx4_master_func_num(dev->dev), i + 1); + mlx4_ib_free_demux_ctx(&dev->sriov.demux[i]); + } + mlx4_ib_device_unregister_sysfs(dev); + +sysfs_err: + mlx4_ib_destroy_alias_guid_service(dev); + +paravirt_err: + mlx4_ib_cm_paravirt_clean(dev, -1); + + return err; +} + +void mlx4_ib_close_sriov(struct mlx4_ib_dev *dev) +{ + int i; + unsigned long flags; + + if (!mlx4_is_mfunc(dev->dev)) + return; + + spin_lock_irqsave(&dev->sriov.going_down_lock, flags); + dev->sriov.is_going_down = 1; + spin_unlock_irqrestore(&dev->sriov.going_down_lock, flags); + if (mlx4_is_master(dev->dev)) { + for (i = 0; i < dev->num_ports; i++) { + flush_workqueue(dev->sriov.demux[i].ud_wq); + mlx4_ib_free_sqp_ctx(dev->sriov.sqps[i]); + kfree(dev->sriov.sqps[i]); + dev->sriov.sqps[i] = NULL; + mlx4_ib_free_demux_ctx(&dev->sriov.demux[i]); + } + + mlx4_ib_cm_paravirt_clean(dev, -1); + mlx4_ib_destroy_alias_guid_service(dev); + mlx4_ib_device_unregister_sysfs(dev); + } +} diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c new file mode 100644 index 0000000..8b72cf3 --- /dev/null +++ b/drivers/infiniband/hw/mlx4/main.c @@ -0,0 +1,2653 @@ +/* + * Copyright (c) 2006, 2007 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include + +#include "mlx4_ib.h" +#include "user.h" + +#define DRV_NAME MLX4_IB_DRV_NAME +#define DRV_VERSION "2.2-1" +#define DRV_RELDATE "Feb 2014" + +#define MLX4_IB_FLOW_MAX_PRIO 0xFFF +#define MLX4_IB_FLOW_QPN_MASK 0xFFFFFF +#define MLX4_IB_CARD_REV_A0 0xA0 + +MODULE_AUTHOR("Roland Dreier"); +MODULE_DESCRIPTION("Mellanox ConnectX HCA InfiniBand driver"); +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_VERSION(DRV_VERSION); + +int mlx4_ib_sm_guid_assign = 1; +module_param_named(sm_guid_assign, mlx4_ib_sm_guid_assign, int, 0444); +MODULE_PARM_DESC(sm_guid_assign, "Enable SM alias_GUID assignment if sm_guid_assign > 0 (Default: 1)"); + +static const char mlx4_ib_version[] = + DRV_NAME ": Mellanox ConnectX InfiniBand driver v" + DRV_VERSION " (" DRV_RELDATE ")\n"; + +struct update_gid_work { + struct work_struct work; + union ib_gid gids[128]; + struct mlx4_ib_dev *dev; + int port; +}; + +static void do_slave_init(struct mlx4_ib_dev *ibdev, int slave, int do_init); + +static struct workqueue_struct *wq; + +static void init_query_mad(struct ib_smp *mad) +{ + mad->base_version = 1; + mad->mgmt_class = IB_MGMT_CLASS_SUBN_LID_ROUTED; + mad->class_version = 1; + mad->method = IB_MGMT_METHOD_GET; +} + +static union ib_gid zgid; + +static int check_flow_steering_support(struct mlx4_dev *dev) +{ + int eth_num_ports = 0; + int ib_num_ports = 0; + + int dmfs = dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED; + + if (dmfs) { + int i; + mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_ETH) + eth_num_ports++; + mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB) + ib_num_ports++; + dmfs &= (!ib_num_ports || + (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_DMFS_IPOIB)) && + (!eth_num_ports || + (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_FS_EN)); + if (ib_num_ports && mlx4_is_mfunc(dev)) { + pr_warn("Device managed flow steering is unavailable for IB port in multifunction env.\n"); + dmfs = 0; + } + } + return dmfs; +} + +static int num_ib_ports(struct mlx4_dev *dev) +{ + int ib_ports = 0; + int i; + + mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB) + ib_ports++; + + return ib_ports; +} + +static int mlx4_ib_query_device(struct ib_device *ibdev, + struct ib_device_attr *props) +{ + struct mlx4_ib_dev *dev = to_mdev(ibdev); + struct ib_smp *in_mad = NULL; + struct ib_smp *out_mad = NULL; + int err = -ENOMEM; + int have_ib_ports; + + in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); + out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); + if (!in_mad || !out_mad) + goto out; + + init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_NODE_INFO; + + err = mlx4_MAD_IFC(to_mdev(ibdev), MLX4_MAD_IFC_IGNORE_KEYS, + 1, NULL, NULL, in_mad, out_mad); + if (err) + goto out; + + memset(props, 0, sizeof *props); + + have_ib_ports = num_ib_ports(dev->dev); + + props->fw_ver = dev->dev->caps.fw_ver; + props->device_cap_flags = IB_DEVICE_CHANGE_PHY_PORT | + IB_DEVICE_PORT_ACTIVE_EVENT | + IB_DEVICE_SYS_IMAGE_GUID | + IB_DEVICE_RC_RNR_NAK_GEN | + IB_DEVICE_BLOCK_MULTICAST_LOOPBACK; + if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_BAD_PKEY_CNTR) + props->device_cap_flags |= IB_DEVICE_BAD_PKEY_CNTR; + if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_BAD_QKEY_CNTR) + props->device_cap_flags |= IB_DEVICE_BAD_QKEY_CNTR; + if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_APM && have_ib_ports) + props->device_cap_flags |= IB_DEVICE_AUTO_PATH_MIG; + if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_UD_AV_PORT) + props->device_cap_flags |= IB_DEVICE_UD_AV_PORT_ENFORCE; + if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_IPOIB_CSUM) + props->device_cap_flags |= IB_DEVICE_UD_IP_CSUM; + if (dev->dev->caps.max_gso_sz && + (dev->dev->rev_id != MLX4_IB_CARD_REV_A0) && + (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_BLH)) + props->device_cap_flags |= IB_DEVICE_UD_TSO; + if (dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_RESERVED_LKEY) + props->device_cap_flags |= IB_DEVICE_LOCAL_DMA_LKEY; + if ((dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_LOCAL_INV) && + (dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_REMOTE_INV) && + (dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_FAST_REG_WR)) + props->device_cap_flags |= IB_DEVICE_MEM_MGT_EXTENSIONS; + if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC) + props->device_cap_flags |= IB_DEVICE_XRC; + if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_MEM_WINDOW) + props->device_cap_flags |= IB_DEVICE_MEM_WINDOW; + if (dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_TYPE_2_WIN) { + if (dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_WIN_TYPE_2B) + props->device_cap_flags |= IB_DEVICE_MEM_WINDOW_TYPE_2B; + else + props->device_cap_flags |= IB_DEVICE_MEM_WINDOW_TYPE_2A; + if (dev->steering_support == MLX4_STEERING_MODE_DEVICE_MANAGED) + props->device_cap_flags |= IB_DEVICE_MANAGED_FLOW_STEERING; + } + + props->vendor_id = be32_to_cpup((__be32 *) (out_mad->data + 36)) & + 0xffffff; + props->vendor_part_id = dev->dev->pdev->device; + props->hw_ver = be32_to_cpup((__be32 *) (out_mad->data + 32)); + memcpy(&props->sys_image_guid, out_mad->data + 4, 8); + + props->max_mr_size = ~0ull; + props->page_size_cap = dev->dev->caps.page_size_cap; + props->max_qp = dev->dev->quotas.qp; + props->max_qp_wr = dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE; + props->max_sge = min(dev->dev->caps.max_sq_sg, + dev->dev->caps.max_rq_sg); + props->max_cq = dev->dev->quotas.cq; + props->max_cqe = dev->dev->caps.max_cqes; + props->max_mr = dev->dev->quotas.mpt; + props->max_pd = dev->dev->caps.num_pds - dev->dev->caps.reserved_pds; + props->max_qp_rd_atom = dev->dev->caps.max_qp_dest_rdma; + props->max_qp_init_rd_atom = dev->dev->caps.max_qp_init_rdma; + props->max_res_rd_atom = props->max_qp_rd_atom * props->max_qp; + props->max_srq = dev->dev->quotas.srq; + props->max_srq_wr = dev->dev->caps.max_srq_wqes - 1; + props->max_srq_sge = dev->dev->caps.max_srq_sge; + props->max_fast_reg_page_list_len = MLX4_MAX_FAST_REG_PAGES; + props->local_ca_ack_delay = dev->dev->caps.local_ca_ack_delay; + props->atomic_cap = dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_ATOMIC ? + IB_ATOMIC_HCA : IB_ATOMIC_NONE; + props->masked_atomic_cap = props->atomic_cap; + props->max_pkeys = dev->dev->caps.pkey_table_len[1]; + props->max_mcast_grp = dev->dev->caps.num_mgms + dev->dev->caps.num_amgms; + props->max_mcast_qp_attach = dev->dev->caps.num_qp_per_mgm; + props->max_total_mcast_qp_attach = props->max_mcast_qp_attach * + props->max_mcast_grp; + props->max_map_per_fmr = dev->dev->caps.max_fmr_maps; + +out: + kfree(in_mad); + kfree(out_mad); + + return err; +} + +static enum rdma_link_layer +mlx4_ib_port_link_layer(struct ib_device *device, u8 port_num) +{ + struct mlx4_dev *dev = to_mdev(device)->dev; + + return dev->caps.port_mask[port_num] == MLX4_PORT_TYPE_IB ? + IB_LINK_LAYER_INFINIBAND : IB_LINK_LAYER_ETHERNET; +} + +static int ib_link_query_port(struct ib_device *ibdev, u8 port, + struct ib_port_attr *props, int netw_view) +{ + struct ib_smp *in_mad = NULL; + struct ib_smp *out_mad = NULL; + int ext_active_speed; + int mad_ifc_flags = MLX4_MAD_IFC_IGNORE_KEYS; + int err = -ENOMEM; + + in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); + out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); + if (!in_mad || !out_mad) + goto out; + + init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_PORT_INFO; + in_mad->attr_mod = cpu_to_be32(port); + + if (mlx4_is_mfunc(to_mdev(ibdev)->dev) && netw_view) + mad_ifc_flags |= MLX4_MAD_IFC_NET_VIEW; + + err = mlx4_MAD_IFC(to_mdev(ibdev), mad_ifc_flags, port, NULL, NULL, + in_mad, out_mad); + if (err) + goto out; + + + props->lid = be16_to_cpup((__be16 *) (out_mad->data + 16)); + props->lmc = out_mad->data[34] & 0x7; + props->sm_lid = be16_to_cpup((__be16 *) (out_mad->data + 18)); + props->sm_sl = out_mad->data[36] & 0xf; + props->state = out_mad->data[32] & 0xf; + props->phys_state = out_mad->data[33] >> 4; + props->port_cap_flags = be32_to_cpup((__be32 *) (out_mad->data + 20)); + if (netw_view) + props->gid_tbl_len = out_mad->data[50]; + else + props->gid_tbl_len = to_mdev(ibdev)->dev->caps.gid_table_len[port]; + props->max_msg_sz = to_mdev(ibdev)->dev->caps.max_msg_sz; + props->pkey_tbl_len = to_mdev(ibdev)->dev->caps.pkey_table_len[port]; + props->bad_pkey_cntr = be16_to_cpup((__be16 *) (out_mad->data + 46)); + props->qkey_viol_cntr = be16_to_cpup((__be16 *) (out_mad->data + 48)); + props->active_width = out_mad->data[31] & 0xf; + props->active_speed = out_mad->data[35] >> 4; + props->max_mtu = out_mad->data[41] & 0xf; + props->active_mtu = out_mad->data[36] >> 4; + props->subnet_timeout = out_mad->data[51] & 0x1f; + props->max_vl_num = out_mad->data[37] >> 4; + props->init_type_reply = out_mad->data[41] >> 4; + + /* Check if extended speeds (EDR/FDR/...) are supported */ + if (props->port_cap_flags & IB_PORT_EXTENDED_SPEEDS_SUP) { + ext_active_speed = out_mad->data[62] >> 4; + + switch (ext_active_speed) { + case 1: + props->active_speed = IB_SPEED_FDR; + break; + case 2: + props->active_speed = IB_SPEED_EDR; + break; + } + } + + /* If reported active speed is QDR, check if is FDR-10 */ + if (props->active_speed == IB_SPEED_QDR) { + init_query_mad(in_mad); + in_mad->attr_id = MLX4_ATTR_EXTENDED_PORT_INFO; + in_mad->attr_mod = cpu_to_be32(port); + + err = mlx4_MAD_IFC(to_mdev(ibdev), mad_ifc_flags, port, + NULL, NULL, in_mad, out_mad); + if (err) + goto out; + + /* Checking LinkSpeedActive for FDR-10 */ + if (out_mad->data[15] & 0x1) + props->active_speed = IB_SPEED_FDR10; + } + + /* Avoid wrong speed value returned by FW if the IB link is down. */ + if (props->state == IB_PORT_DOWN) + props->active_speed = IB_SPEED_SDR; + +out: + kfree(in_mad); + kfree(out_mad); + return err; +} + +static u8 state_to_phys_state(enum ib_port_state state) +{ + return state == IB_PORT_ACTIVE ? 5 : 3; +} + +static int eth_link_query_port(struct ib_device *ibdev, u8 port, + struct ib_port_attr *props, int netw_view) +{ + + struct mlx4_ib_dev *mdev = to_mdev(ibdev); + struct mlx4_ib_iboe *iboe = &mdev->iboe; + struct net_device *ndev; + enum ib_mtu tmp; + struct mlx4_cmd_mailbox *mailbox; + int err = 0; + + mailbox = mlx4_alloc_cmd_mailbox(mdev->dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + + err = mlx4_cmd_box(mdev->dev, 0, mailbox->dma, port, 0, + MLX4_CMD_QUERY_PORT, MLX4_CMD_TIME_CLASS_B, + MLX4_CMD_WRAPPED); + if (err) + goto out; + + props->active_width = (((u8 *)mailbox->buf)[5] == 0x40) ? + IB_WIDTH_4X : IB_WIDTH_1X; + props->active_speed = IB_SPEED_QDR; + props->port_cap_flags = IB_PORT_CM_SUP | IB_PORT_IP_BASED_GIDS; + props->gid_tbl_len = mdev->dev->caps.gid_table_len[port]; + props->max_msg_sz = mdev->dev->caps.max_msg_sz; + props->pkey_tbl_len = 1; + props->max_mtu = IB_MTU_4096; + props->max_vl_num = 2; + props->state = IB_PORT_DOWN; + props->phys_state = state_to_phys_state(props->state); + props->active_mtu = IB_MTU_256; + spin_lock_bh(&iboe->lock); + ndev = iboe->netdevs[port - 1]; + if (!ndev) + goto out_unlock; + + tmp = iboe_get_mtu(ndev->mtu); + props->active_mtu = tmp ? min(props->max_mtu, tmp) : IB_MTU_256; + + props->state = (netif_running(ndev) && netif_carrier_ok(ndev)) ? + IB_PORT_ACTIVE : IB_PORT_DOWN; + props->phys_state = state_to_phys_state(props->state); +out_unlock: + spin_unlock_bh(&iboe->lock); +out: + mlx4_free_cmd_mailbox(mdev->dev, mailbox); + return err; +} + +int __mlx4_ib_query_port(struct ib_device *ibdev, u8 port, + struct ib_port_attr *props, int netw_view) +{ + int err; + + memset(props, 0, sizeof *props); + + err = mlx4_ib_port_link_layer(ibdev, port) == IB_LINK_LAYER_INFINIBAND ? + ib_link_query_port(ibdev, port, props, netw_view) : + eth_link_query_port(ibdev, port, props, netw_view); + + return err; +} + +static int mlx4_ib_query_port(struct ib_device *ibdev, u8 port, + struct ib_port_attr *props) +{ + /* returns host view */ + return __mlx4_ib_query_port(ibdev, port, props, 0); +} + +int __mlx4_ib_query_gid(struct ib_device *ibdev, u8 port, int index, + union ib_gid *gid, int netw_view) +{ + struct ib_smp *in_mad = NULL; + struct ib_smp *out_mad = NULL; + int err = -ENOMEM; + struct mlx4_ib_dev *dev = to_mdev(ibdev); + int clear = 0; + int mad_ifc_flags = MLX4_MAD_IFC_IGNORE_KEYS; + + in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); + out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); + if (!in_mad || !out_mad) + goto out; + + init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_PORT_INFO; + in_mad->attr_mod = cpu_to_be32(port); + + if (mlx4_is_mfunc(dev->dev) && netw_view) + mad_ifc_flags |= MLX4_MAD_IFC_NET_VIEW; + + err = mlx4_MAD_IFC(dev, mad_ifc_flags, port, NULL, NULL, in_mad, out_mad); + if (err) + goto out; + + memcpy(gid->raw, out_mad->data + 8, 8); + + if (mlx4_is_mfunc(dev->dev) && !netw_view) { + if (index) { + /* For any index > 0, return the null guid */ + err = 0; + clear = 1; + goto out; + } + } + + init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_GUID_INFO; + in_mad->attr_mod = cpu_to_be32(index / 8); + + err = mlx4_MAD_IFC(dev, mad_ifc_flags, port, + NULL, NULL, in_mad, out_mad); + if (err) + goto out; + + memcpy(gid->raw + 8, out_mad->data + (index % 8) * 8, 8); + +out: + if (clear) + memset(gid->raw + 8, 0, 8); + kfree(in_mad); + kfree(out_mad); + return err; +} + +static int iboe_query_gid(struct ib_device *ibdev, u8 port, int index, + union ib_gid *gid) +{ + struct mlx4_ib_dev *dev = to_mdev(ibdev); + + *gid = dev->iboe.gid_table[port - 1][index]; + + return 0; +} + +static int mlx4_ib_query_gid(struct ib_device *ibdev, u8 port, int index, + union ib_gid *gid) +{ + if (rdma_port_get_link_layer(ibdev, port) == IB_LINK_LAYER_INFINIBAND) + return __mlx4_ib_query_gid(ibdev, port, index, gid, 0); + else + return iboe_query_gid(ibdev, port, index, gid); +} + +int __mlx4_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index, + u16 *pkey, int netw_view) +{ + struct ib_smp *in_mad = NULL; + struct ib_smp *out_mad = NULL; + int mad_ifc_flags = MLX4_MAD_IFC_IGNORE_KEYS; + int err = -ENOMEM; + + in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); + out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); + if (!in_mad || !out_mad) + goto out; + + init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_PKEY_TABLE; + in_mad->attr_mod = cpu_to_be32(index / 32); + + if (mlx4_is_mfunc(to_mdev(ibdev)->dev) && netw_view) + mad_ifc_flags |= MLX4_MAD_IFC_NET_VIEW; + + err = mlx4_MAD_IFC(to_mdev(ibdev), mad_ifc_flags, port, NULL, NULL, + in_mad, out_mad); + if (err) + goto out; + + *pkey = be16_to_cpu(((__be16 *) out_mad->data)[index % 32]); + +out: + kfree(in_mad); + kfree(out_mad); + return err; +} + +static int mlx4_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index, u16 *pkey) +{ + return __mlx4_ib_query_pkey(ibdev, port, index, pkey, 0); +} + +static int mlx4_ib_modify_device(struct ib_device *ibdev, int mask, + struct ib_device_modify *props) +{ + struct mlx4_cmd_mailbox *mailbox; + unsigned long flags; + + if (mask & ~IB_DEVICE_MODIFY_NODE_DESC) + return -EOPNOTSUPP; + + if (!(mask & IB_DEVICE_MODIFY_NODE_DESC)) + return 0; + + if (mlx4_is_slave(to_mdev(ibdev)->dev)) + return -EOPNOTSUPP; + + spin_lock_irqsave(&to_mdev(ibdev)->sm_lock, flags); + memcpy(ibdev->node_desc, props->node_desc, 64); + spin_unlock_irqrestore(&to_mdev(ibdev)->sm_lock, flags); + + /* + * If possible, pass node desc to FW, so it can generate + * a 144 trap. If cmd fails, just ignore. + */ + mailbox = mlx4_alloc_cmd_mailbox(to_mdev(ibdev)->dev); + if (IS_ERR(mailbox)) + return 0; + + memcpy(mailbox->buf, props->node_desc, 64); + mlx4_cmd(to_mdev(ibdev)->dev, mailbox->dma, 1, 0, + MLX4_CMD_SET_NODE, MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); + + mlx4_free_cmd_mailbox(to_mdev(ibdev)->dev, mailbox); + + return 0; +} + +static int mlx4_ib_SET_PORT(struct mlx4_ib_dev *dev, u8 port, int reset_qkey_viols, + u32 cap_mask) +{ + struct mlx4_cmd_mailbox *mailbox; + int err; + + mailbox = mlx4_alloc_cmd_mailbox(dev->dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + + if (dev->dev->flags & MLX4_FLAG_OLD_PORT_CMDS) { + *(u8 *) mailbox->buf = !!reset_qkey_viols << 6; + ((__be32 *) mailbox->buf)[2] = cpu_to_be32(cap_mask); + } else { + ((u8 *) mailbox->buf)[3] = !!reset_qkey_viols; + ((__be32 *) mailbox->buf)[1] = cpu_to_be32(cap_mask); + } + + err = mlx4_cmd(dev->dev, mailbox->dma, port, 0, MLX4_CMD_SET_PORT, + MLX4_CMD_TIME_CLASS_B, MLX4_CMD_WRAPPED); + + mlx4_free_cmd_mailbox(dev->dev, mailbox); + return err; +} + +static int mlx4_ib_modify_port(struct ib_device *ibdev, u8 port, int mask, + struct ib_port_modify *props) +{ + struct mlx4_ib_dev *mdev = to_mdev(ibdev); + u8 is_eth = mdev->dev->caps.port_type[port] == MLX4_PORT_TYPE_ETH; + struct ib_port_attr attr; + u32 cap_mask; + int err; + + /* return OK if this is RoCE. CM calls ib_modify_port() regardless + * of whether port link layer is ETH or IB. For ETH ports, qkey + * violations and port capabilities are not meaningful. + */ + if (is_eth) + return 0; + + mutex_lock(&mdev->cap_mask_mutex); + + err = mlx4_ib_query_port(ibdev, port, &attr); + if (err) + goto out; + + cap_mask = (attr.port_cap_flags | props->set_port_cap_mask) & + ~props->clr_port_cap_mask; + + err = mlx4_ib_SET_PORT(mdev, port, + !!(mask & IB_PORT_RESET_QKEY_CNTR), + cap_mask); + +out: + mutex_unlock(&to_mdev(ibdev)->cap_mask_mutex); + return err; +} + +static struct ib_ucontext *mlx4_ib_alloc_ucontext(struct ib_device *ibdev, + struct ib_udata *udata) +{ + struct mlx4_ib_dev *dev = to_mdev(ibdev); + struct mlx4_ib_ucontext *context; + struct mlx4_ib_alloc_ucontext_resp_v3 resp_v3; + struct mlx4_ib_alloc_ucontext_resp resp; + int err; + + if (!dev->ib_active) + return ERR_PTR(-EAGAIN); + + if (ibdev->uverbs_abi_ver == MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION) { + resp_v3.qp_tab_size = dev->dev->caps.num_qps; + resp_v3.bf_reg_size = dev->dev->caps.bf_reg_size; + resp_v3.bf_regs_per_page = dev->dev->caps.bf_regs_per_page; + } else { + resp.dev_caps = dev->dev->caps.userspace_caps; + resp.qp_tab_size = dev->dev->caps.num_qps; + resp.bf_reg_size = dev->dev->caps.bf_reg_size; + resp.bf_regs_per_page = dev->dev->caps.bf_regs_per_page; + resp.cqe_size = dev->dev->caps.cqe_size; + } + + context = kmalloc(sizeof *context, GFP_KERNEL); + if (!context) + return ERR_PTR(-ENOMEM); + + err = mlx4_uar_alloc(to_mdev(ibdev)->dev, &context->uar); + if (err) { + kfree(context); + return ERR_PTR(err); + } + + INIT_LIST_HEAD(&context->db_page_list); + mutex_init(&context->db_page_mutex); + + if (ibdev->uverbs_abi_ver == MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION) + err = ib_copy_to_udata(udata, &resp_v3, sizeof(resp_v3)); + else + err = ib_copy_to_udata(udata, &resp, sizeof(resp)); + + if (err) { + mlx4_uar_free(to_mdev(ibdev)->dev, &context->uar); + kfree(context); + return ERR_PTR(-EFAULT); + } + + return &context->ibucontext; +} + +static int mlx4_ib_dealloc_ucontext(struct ib_ucontext *ibcontext) +{ + struct mlx4_ib_ucontext *context = to_mucontext(ibcontext); + + mlx4_uar_free(to_mdev(ibcontext->device)->dev, &context->uar); + kfree(context); + + return 0; +} + +static int mlx4_ib_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) +{ + struct mlx4_ib_dev *dev = to_mdev(context->device); + + if (vma->vm_end - vma->vm_start != PAGE_SIZE) + return -EINVAL; + + if (vma->vm_pgoff == 0) { + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + + if (io_remap_pfn_range(vma, vma->vm_start, + to_mucontext(context)->uar.pfn, + PAGE_SIZE, vma->vm_page_prot)) + return -EAGAIN; + } else if (vma->vm_pgoff == 1 && dev->dev->caps.bf_reg_size != 0) { + vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); + + if (io_remap_pfn_range(vma, vma->vm_start, + to_mucontext(context)->uar.pfn + + dev->dev->caps.num_uars, + PAGE_SIZE, vma->vm_page_prot)) + return -EAGAIN; + } else + return -EINVAL; + + return 0; +} + +static struct ib_pd *mlx4_ib_alloc_pd(struct ib_device *ibdev, + struct ib_ucontext *context, + struct ib_udata *udata) +{ + struct mlx4_ib_pd *pd; + int err; + + pd = kmalloc(sizeof *pd, GFP_KERNEL); + if (!pd) + return ERR_PTR(-ENOMEM); + + err = mlx4_pd_alloc(to_mdev(ibdev)->dev, &pd->pdn); + if (err) { + kfree(pd); + return ERR_PTR(err); + } + + if (context) + if (ib_copy_to_udata(udata, &pd->pdn, sizeof (__u32))) { + mlx4_pd_free(to_mdev(ibdev)->dev, pd->pdn); + kfree(pd); + return ERR_PTR(-EFAULT); + } + + return &pd->ibpd; +} + +static int mlx4_ib_dealloc_pd(struct ib_pd *pd) +{ + mlx4_pd_free(to_mdev(pd->device)->dev, to_mpd(pd)->pdn); + kfree(pd); + + return 0; +} + +static struct ib_xrcd *mlx4_ib_alloc_xrcd(struct ib_device *ibdev, + struct ib_ucontext *context, + struct ib_udata *udata) +{ + struct mlx4_ib_xrcd *xrcd; + int err; + + if (!(to_mdev(ibdev)->dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC)) + return ERR_PTR(-ENOSYS); + + xrcd = kmalloc(sizeof *xrcd, GFP_KERNEL); + if (!xrcd) + return ERR_PTR(-ENOMEM); + + err = mlx4_xrcd_alloc(to_mdev(ibdev)->dev, &xrcd->xrcdn); + if (err) + goto err1; + + xrcd->pd = ib_alloc_pd(ibdev); + if (IS_ERR(xrcd->pd)) { + err = PTR_ERR(xrcd->pd); + goto err2; + } + + xrcd->cq = ib_create_cq(ibdev, NULL, NULL, xrcd, 1, 0); + if (IS_ERR(xrcd->cq)) { + err = PTR_ERR(xrcd->cq); + goto err3; + } + + return &xrcd->ibxrcd; + +err3: + ib_dealloc_pd(xrcd->pd); +err2: + mlx4_xrcd_free(to_mdev(ibdev)->dev, xrcd->xrcdn); +err1: + kfree(xrcd); + return ERR_PTR(err); +} + +static int mlx4_ib_dealloc_xrcd(struct ib_xrcd *xrcd) +{ + ib_destroy_cq(to_mxrcd(xrcd)->cq); + ib_dealloc_pd(to_mxrcd(xrcd)->pd); + mlx4_xrcd_free(to_mdev(xrcd->device)->dev, to_mxrcd(xrcd)->xrcdn); + kfree(xrcd); + + return 0; +} + +static int add_gid_entry(struct ib_qp *ibqp, union ib_gid *gid) +{ + struct mlx4_ib_qp *mqp = to_mqp(ibqp); + struct mlx4_ib_dev *mdev = to_mdev(ibqp->device); + struct mlx4_ib_gid_entry *ge; + + ge = kzalloc(sizeof *ge, GFP_KERNEL); + if (!ge) + return -ENOMEM; + + ge->gid = *gid; + if (mlx4_ib_add_mc(mdev, mqp, gid)) { + ge->port = mqp->port; + ge->added = 1; + } + + mutex_lock(&mqp->mutex); + list_add_tail(&ge->list, &mqp->gid_list); + mutex_unlock(&mqp->mutex); + + return 0; +} + +int mlx4_ib_add_mc(struct mlx4_ib_dev *mdev, struct mlx4_ib_qp *mqp, + union ib_gid *gid) +{ + struct net_device *ndev; + int ret = 0; + + if (!mqp->port) + return 0; + + spin_lock_bh(&mdev->iboe.lock); + ndev = mdev->iboe.netdevs[mqp->port - 1]; + if (ndev) + dev_hold(ndev); + spin_unlock_bh(&mdev->iboe.lock); + + if (ndev) { + ret = 1; + dev_put(ndev); + } + + return ret; +} + +struct mlx4_ib_steering { + struct list_head list; + u64 reg_id; + union ib_gid gid; +}; + +static int parse_flow_attr(struct mlx4_dev *dev, + u32 qp_num, + union ib_flow_spec *ib_spec, + struct _rule_hw *mlx4_spec) +{ + enum mlx4_net_trans_rule_id type; + + switch (ib_spec->type) { + case IB_FLOW_SPEC_ETH: + type = MLX4_NET_TRANS_RULE_ID_ETH; + memcpy(mlx4_spec->eth.dst_mac, ib_spec->eth.val.dst_mac, + ETH_ALEN); + memcpy(mlx4_spec->eth.dst_mac_msk, ib_spec->eth.mask.dst_mac, + ETH_ALEN); + mlx4_spec->eth.vlan_tag = ib_spec->eth.val.vlan_tag; + mlx4_spec->eth.vlan_tag_msk = ib_spec->eth.mask.vlan_tag; + break; + case IB_FLOW_SPEC_IB: + type = MLX4_NET_TRANS_RULE_ID_IB; + mlx4_spec->ib.l3_qpn = + cpu_to_be32(qp_num); + mlx4_spec->ib.qpn_mask = + cpu_to_be32(MLX4_IB_FLOW_QPN_MASK); + break; + + + case IB_FLOW_SPEC_IPV4: + type = MLX4_NET_TRANS_RULE_ID_IPV4; + mlx4_spec->ipv4.src_ip = ib_spec->ipv4.val.src_ip; + mlx4_spec->ipv4.src_ip_msk = ib_spec->ipv4.mask.src_ip; + mlx4_spec->ipv4.dst_ip = ib_spec->ipv4.val.dst_ip; + mlx4_spec->ipv4.dst_ip_msk = ib_spec->ipv4.mask.dst_ip; + break; + + case IB_FLOW_SPEC_TCP: + case IB_FLOW_SPEC_UDP: + type = ib_spec->type == IB_FLOW_SPEC_TCP ? + MLX4_NET_TRANS_RULE_ID_TCP : + MLX4_NET_TRANS_RULE_ID_UDP; + mlx4_spec->tcp_udp.dst_port = ib_spec->tcp_udp.val.dst_port; + mlx4_spec->tcp_udp.dst_port_msk = ib_spec->tcp_udp.mask.dst_port; + mlx4_spec->tcp_udp.src_port = ib_spec->tcp_udp.val.src_port; + mlx4_spec->tcp_udp.src_port_msk = ib_spec->tcp_udp.mask.src_port; + break; + + default: + return -EINVAL; + } + if (mlx4_map_sw_to_hw_steering_id(dev, type) < 0 || + mlx4_hw_rule_sz(dev, type) < 0) + return -EINVAL; + mlx4_spec->id = cpu_to_be16(mlx4_map_sw_to_hw_steering_id(dev, type)); + mlx4_spec->size = mlx4_hw_rule_sz(dev, type) >> 2; + return mlx4_hw_rule_sz(dev, type); +} + +struct default_rules { + __u32 mandatory_fields[IB_FLOW_SPEC_SUPPORT_LAYERS]; + __u32 mandatory_not_fields[IB_FLOW_SPEC_SUPPORT_LAYERS]; + __u32 rules_create_list[IB_FLOW_SPEC_SUPPORT_LAYERS]; + __u8 link_layer; +}; +static const struct default_rules default_table[] = { + { + .mandatory_fields = {IB_FLOW_SPEC_IPV4}, + .mandatory_not_fields = {IB_FLOW_SPEC_ETH}, + .rules_create_list = {IB_FLOW_SPEC_IB}, + .link_layer = IB_LINK_LAYER_INFINIBAND + } +}; + +static int __mlx4_ib_default_rules_match(struct ib_qp *qp, + struct ib_flow_attr *flow_attr) +{ + int i, j, k; + void *ib_flow; + const struct default_rules *pdefault_rules = default_table; + u8 link_layer = rdma_port_get_link_layer(qp->device, flow_attr->port); + + for (i = 0; i < ARRAY_SIZE(default_table); i++, pdefault_rules++) { + __u32 field_types[IB_FLOW_SPEC_SUPPORT_LAYERS]; + memset(&field_types, 0, sizeof(field_types)); + + if (link_layer != pdefault_rules->link_layer) + continue; + + ib_flow = flow_attr + 1; + /* we assume the specs are sorted */ + for (j = 0, k = 0; k < IB_FLOW_SPEC_SUPPORT_LAYERS && + j < flow_attr->num_of_specs; k++) { + union ib_flow_spec *current_flow = + (union ib_flow_spec *)ib_flow; + + /* same layer but different type */ + if (((current_flow->type & IB_FLOW_SPEC_LAYER_MASK) == + (pdefault_rules->mandatory_fields[k] & + IB_FLOW_SPEC_LAYER_MASK)) && + (current_flow->type != + pdefault_rules->mandatory_fields[k])) + goto out; + + /* same layer, try match next one */ + if (current_flow->type == + pdefault_rules->mandatory_fields[k]) { + j++; + ib_flow += + ((union ib_flow_spec *)ib_flow)->size; + } + } + + ib_flow = flow_attr + 1; + for (j = 0; j < flow_attr->num_of_specs; + j++, ib_flow += ((union ib_flow_spec *)ib_flow)->size) + for (k = 0; k < IB_FLOW_SPEC_SUPPORT_LAYERS; k++) + /* same layer and same type */ + if (((union ib_flow_spec *)ib_flow)->type == + pdefault_rules->mandatory_not_fields[k]) + goto out; + + return i; + } +out: + return -1; +} + +static int __mlx4_ib_create_default_rules( + struct mlx4_ib_dev *mdev, + struct ib_qp *qp, + const struct default_rules *pdefault_rules, + struct _rule_hw *mlx4_spec) { + int size = 0; + int i; + + for (i = 0; i < ARRAY_SIZE(pdefault_rules->rules_create_list); i++) { + int ret; + union ib_flow_spec ib_spec; + switch (pdefault_rules->rules_create_list[i]) { + case 0: + /* no rule */ + continue; + case IB_FLOW_SPEC_IB: + ib_spec.type = IB_FLOW_SPEC_IB; + ib_spec.size = sizeof(struct ib_flow_spec_ib); + + break; + default: + /* invalid rule */ + return -EINVAL; + } + /* We must put empty rule, qpn is being ignored */ + ret = parse_flow_attr(mdev->dev, 0, &ib_spec, + mlx4_spec); + if (ret < 0) { + pr_info("invalid parsing\n"); + return -EINVAL; + } + + mlx4_spec = (void *)mlx4_spec + ret; + size += ret; + } + return size; +} + +static int __mlx4_ib_create_flow(struct ib_qp *qp, struct ib_flow_attr *flow_attr, + int domain, + enum mlx4_net_trans_promisc_mode flow_type, + u64 *reg_id) +{ + int ret, i; + int size = 0; + void *ib_flow; + struct mlx4_ib_dev *mdev = to_mdev(qp->device); + struct mlx4_cmd_mailbox *mailbox; + struct mlx4_net_trans_rule_hw_ctrl *ctrl; + int default_flow; + + static const u16 __mlx4_domain[] = { + [IB_FLOW_DOMAIN_USER] = MLX4_DOMAIN_UVERBS, + [IB_FLOW_DOMAIN_ETHTOOL] = MLX4_DOMAIN_ETHTOOL, + [IB_FLOW_DOMAIN_RFS] = MLX4_DOMAIN_RFS, + [IB_FLOW_DOMAIN_NIC] = MLX4_DOMAIN_NIC, + }; + + if (flow_attr->priority > MLX4_IB_FLOW_MAX_PRIO) { + pr_err("Invalid priority value %d\n", flow_attr->priority); + return -EINVAL; + } + + if (domain >= IB_FLOW_DOMAIN_NUM) { + pr_err("Invalid domain value %d\n", domain); + return -EINVAL; + } + + if (mlx4_map_sw_to_hw_steering_mode(mdev->dev, flow_type) < 0) + return -EINVAL; + + mailbox = mlx4_alloc_cmd_mailbox(mdev->dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + ctrl = mailbox->buf; + + ctrl->prio = cpu_to_be16(__mlx4_domain[domain] | + flow_attr->priority); + ctrl->type = mlx4_map_sw_to_hw_steering_mode(mdev->dev, flow_type); + ctrl->port = flow_attr->port; + ctrl->qpn = cpu_to_be32(qp->qp_num); + + ib_flow = flow_attr + 1; + size += sizeof(struct mlx4_net_trans_rule_hw_ctrl); + /* Add default flows */ + default_flow = __mlx4_ib_default_rules_match(qp, flow_attr); + if (default_flow >= 0) { + ret = __mlx4_ib_create_default_rules( + mdev, qp, default_table + default_flow, + mailbox->buf + size); + if (ret < 0) { + mlx4_free_cmd_mailbox(mdev->dev, mailbox); + return -EINVAL; + } + size += ret; + } + for (i = 0; i < flow_attr->num_of_specs; i++) { + ret = parse_flow_attr(mdev->dev, qp->qp_num, ib_flow, + mailbox->buf + size); + if (ret < 0) { + mlx4_free_cmd_mailbox(mdev->dev, mailbox); + return -EINVAL; + } + ib_flow += ((union ib_flow_spec *) ib_flow)->size; + size += ret; + } + + ret = mlx4_cmd_imm(mdev->dev, mailbox->dma, reg_id, size >> 2, 0, + MLX4_QP_FLOW_STEERING_ATTACH, MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_NATIVE); + if (ret == -ENOMEM) + pr_err("mcg table is full. Fail to register network rule.\n"); + else if (ret == -ENXIO) + pr_err("Device managed flow steering is disabled. Fail to register network rule.\n"); + else if (ret) + pr_err("Invalid argumant. Fail to register network rule.\n"); + + mlx4_free_cmd_mailbox(mdev->dev, mailbox); + return ret; +} + +static int __mlx4_ib_destroy_flow(struct mlx4_dev *dev, u64 reg_id) +{ + int err; + err = mlx4_cmd(dev, reg_id, 0, 0, + MLX4_QP_FLOW_STEERING_DETACH, MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_NATIVE); + if (err) + pr_err("Fail to detach network rule. registration id = 0x%llx\n", + reg_id); + return err; +} + +static int mlx4_ib_tunnel_steer_add(struct ib_qp *qp, struct ib_flow_attr *flow_attr, + u64 *reg_id) +{ + void *ib_flow; + union ib_flow_spec *ib_spec; + struct mlx4_dev *dev = to_mdev(qp->device)->dev; + int err = 0; + + if (dev->caps.tunnel_offload_mode != MLX4_TUNNEL_OFFLOAD_MODE_VXLAN) + return 0; /* do nothing */ + + ib_flow = flow_attr + 1; + ib_spec = (union ib_flow_spec *)ib_flow; + + if (ib_spec->type != IB_FLOW_SPEC_ETH || flow_attr->num_of_specs != 1) + return 0; /* do nothing */ + + err = mlx4_tunnel_steer_add(to_mdev(qp->device)->dev, ib_spec->eth.val.dst_mac, + flow_attr->port, qp->qp_num, + MLX4_DOMAIN_UVERBS | (flow_attr->priority & 0xff), + reg_id); + return err; +} + +static struct ib_flow *mlx4_ib_create_flow(struct ib_qp *qp, + struct ib_flow_attr *flow_attr, + int domain) +{ + int err = 0, i = 0; + struct mlx4_ib_flow *mflow; + enum mlx4_net_trans_promisc_mode type[2]; + + memset(type, 0, sizeof(type)); + + mflow = kzalloc(sizeof(*mflow), GFP_KERNEL); + if (!mflow) { + err = -ENOMEM; + goto err_free; + } + + switch (flow_attr->type) { + case IB_FLOW_ATTR_NORMAL: + type[0] = MLX4_FS_REGULAR; + break; + + case IB_FLOW_ATTR_ALL_DEFAULT: + type[0] = MLX4_FS_ALL_DEFAULT; + break; + + case IB_FLOW_ATTR_MC_DEFAULT: + type[0] = MLX4_FS_MC_DEFAULT; + break; + + case IB_FLOW_ATTR_SNIFFER: + type[0] = MLX4_FS_UC_SNIFFER; + type[1] = MLX4_FS_MC_SNIFFER; + break; + + default: + err = -EINVAL; + goto err_free; + } + + while (i < ARRAY_SIZE(type) && type[i]) { + err = __mlx4_ib_create_flow(qp, flow_attr, domain, type[i], + &mflow->reg_id[i]); + if (err) + goto err_create_flow; + i++; + } + + if (i < ARRAY_SIZE(type) && flow_attr->type == IB_FLOW_ATTR_NORMAL) { + err = mlx4_ib_tunnel_steer_add(qp, flow_attr, &mflow->reg_id[i]); + if (err) + goto err_create_flow; + i++; + } + + return &mflow->ibflow; + +err_create_flow: + while (i) { + (void)__mlx4_ib_destroy_flow(to_mdev(qp->device)->dev, mflow->reg_id[i]); + i--; + } +err_free: + kfree(mflow); + return ERR_PTR(err); +} + +static int mlx4_ib_destroy_flow(struct ib_flow *flow_id) +{ + int err, ret = 0; + int i = 0; + struct mlx4_ib_dev *mdev = to_mdev(flow_id->qp->device); + struct mlx4_ib_flow *mflow = to_mflow(flow_id); + + while (i < ARRAY_SIZE(mflow->reg_id) && mflow->reg_id[i]) { + err = __mlx4_ib_destroy_flow(mdev->dev, mflow->reg_id[i]); + if (err) + ret = err; + i++; + } + + kfree(mflow); + return ret; +} + +static int mlx4_ib_mcg_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) +{ + int err; + struct mlx4_ib_dev *mdev = to_mdev(ibqp->device); + struct mlx4_ib_qp *mqp = to_mqp(ibqp); + u64 reg_id; + struct mlx4_ib_steering *ib_steering = NULL; + enum mlx4_protocol prot = (gid->raw[1] == 0x0e) ? + MLX4_PROT_IB_IPV4 : MLX4_PROT_IB_IPV6; + + if (mdev->dev->caps.steering_mode == + MLX4_STEERING_MODE_DEVICE_MANAGED) { + ib_steering = kmalloc(sizeof(*ib_steering), GFP_KERNEL); + if (!ib_steering) + return -ENOMEM; + } + + err = mlx4_multicast_attach(mdev->dev, &mqp->mqp, gid->raw, mqp->port, + !!(mqp->flags & + MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK), + prot, ®_id); + if (err) + goto err_malloc; + + err = add_gid_entry(ibqp, gid); + if (err) + goto err_add; + + if (ib_steering) { + memcpy(ib_steering->gid.raw, gid->raw, 16); + ib_steering->reg_id = reg_id; + mutex_lock(&mqp->mutex); + list_add(&ib_steering->list, &mqp->steering_rules); + mutex_unlock(&mqp->mutex); + } + return 0; + +err_add: + mlx4_multicast_detach(mdev->dev, &mqp->mqp, gid->raw, + prot, reg_id); +err_malloc: + kfree(ib_steering); + + return err; +} + +static struct mlx4_ib_gid_entry *find_gid_entry(struct mlx4_ib_qp *qp, u8 *raw) +{ + struct mlx4_ib_gid_entry *ge; + struct mlx4_ib_gid_entry *tmp; + struct mlx4_ib_gid_entry *ret = NULL; + + list_for_each_entry_safe(ge, tmp, &qp->gid_list, list) { + if (!memcmp(raw, ge->gid.raw, 16)) { + ret = ge; + break; + } + } + + return ret; +} + +static int mlx4_ib_mcg_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) +{ + int err; + struct mlx4_ib_dev *mdev = to_mdev(ibqp->device); + struct mlx4_ib_qp *mqp = to_mqp(ibqp); + struct net_device *ndev; + struct mlx4_ib_gid_entry *ge; + u64 reg_id = 0; + enum mlx4_protocol prot = (gid->raw[1] == 0x0e) ? + MLX4_PROT_IB_IPV4 : MLX4_PROT_IB_IPV6; + + if (mdev->dev->caps.steering_mode == + MLX4_STEERING_MODE_DEVICE_MANAGED) { + struct mlx4_ib_steering *ib_steering; + + mutex_lock(&mqp->mutex); + list_for_each_entry(ib_steering, &mqp->steering_rules, list) { + if (!memcmp(ib_steering->gid.raw, gid->raw, 16)) { + list_del(&ib_steering->list); + break; + } + } + mutex_unlock(&mqp->mutex); + if (&ib_steering->list == &mqp->steering_rules) { + pr_err("Couldn't find reg_id for mgid. Steering rule is left attached\n"); + return -EINVAL; + } + reg_id = ib_steering->reg_id; + kfree(ib_steering); + } + + err = mlx4_multicast_detach(mdev->dev, &mqp->mqp, gid->raw, + prot, reg_id); + if (err) + return err; + + mutex_lock(&mqp->mutex); + ge = find_gid_entry(mqp, gid->raw); + if (ge) { + spin_lock_bh(&mdev->iboe.lock); + ndev = ge->added ? mdev->iboe.netdevs[ge->port - 1] : NULL; + if (ndev) + dev_hold(ndev); + spin_unlock_bh(&mdev->iboe.lock); + if (ndev) + dev_put(ndev); + list_del(&ge->list); + kfree(ge); + } else + pr_warn("could not find mgid entry\n"); + + mutex_unlock(&mqp->mutex); + + return 0; +} + +static int init_node_data(struct mlx4_ib_dev *dev) +{ + struct ib_smp *in_mad = NULL; + struct ib_smp *out_mad = NULL; + int mad_ifc_flags = MLX4_MAD_IFC_IGNORE_KEYS; + int err = -ENOMEM; + + in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); + out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); + if (!in_mad || !out_mad) + goto out; + + init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_NODE_DESC; + if (mlx4_is_master(dev->dev)) + mad_ifc_flags |= MLX4_MAD_IFC_NET_VIEW; + + err = mlx4_MAD_IFC(dev, mad_ifc_flags, 1, NULL, NULL, in_mad, out_mad); + if (err) + goto out; + + memcpy(dev->ib_dev.node_desc, out_mad->data, 64); + + in_mad->attr_id = IB_SMP_ATTR_NODE_INFO; + + err = mlx4_MAD_IFC(dev, mad_ifc_flags, 1, NULL, NULL, in_mad, out_mad); + if (err) + goto out; + + dev->dev->rev_id = be32_to_cpup((__be32 *) (out_mad->data + 32)); + memcpy(&dev->ib_dev.node_guid, out_mad->data + 12, 8); + +out: + kfree(in_mad); + kfree(out_mad); + return err; +} + +static ssize_t show_hca(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct mlx4_ib_dev *dev = + container_of(device, struct mlx4_ib_dev, ib_dev.dev); + return sprintf(buf, "MT%d\n", dev->dev->pdev->device); +} + +static ssize_t show_fw_ver(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct mlx4_ib_dev *dev = + container_of(device, struct mlx4_ib_dev, ib_dev.dev); + return sprintf(buf, "%d.%d.%d\n", (int) (dev->dev->caps.fw_ver >> 32), + (int) (dev->dev->caps.fw_ver >> 16) & 0xffff, + (int) dev->dev->caps.fw_ver & 0xffff); +} + +static ssize_t show_rev(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct mlx4_ib_dev *dev = + container_of(device, struct mlx4_ib_dev, ib_dev.dev); + return sprintf(buf, "%x\n", dev->dev->rev_id); +} + +static ssize_t show_board(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct mlx4_ib_dev *dev = + container_of(device, struct mlx4_ib_dev, ib_dev.dev); + return sprintf(buf, "%.*s\n", MLX4_BOARD_ID_LEN, + dev->dev->board_id); +} + +static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL); +static DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL); +static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL); +static DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL); + +static struct device_attribute *mlx4_class_attributes[] = { + &dev_attr_hw_rev, + &dev_attr_fw_ver, + &dev_attr_hca_type, + &dev_attr_board_id +}; + +static void mlx4_addrconf_ifid_eui48(u8 *eui, u16 vlan_id, + struct net_device *dev) +{ + memcpy(eui, dev->dev_addr, 3); + memcpy(eui + 5, dev->dev_addr + 3, 3); + if (vlan_id < 0x1000) { + eui[3] = vlan_id >> 8; + eui[4] = vlan_id & 0xff; + } else { + eui[3] = 0xff; + eui[4] = 0xfe; + } + eui[0] ^= 2; +} + +static void update_gids_task(struct work_struct *work) +{ + struct update_gid_work *gw = container_of(work, struct update_gid_work, work); + struct mlx4_cmd_mailbox *mailbox; + union ib_gid *gids; + int err; + struct mlx4_dev *dev = gw->dev->dev; + + if (!gw->dev->ib_active) + return; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) { + pr_warn("update gid table failed %ld\n", PTR_ERR(mailbox)); + return; + } + + gids = mailbox->buf; + memcpy(gids, gw->gids, sizeof gw->gids); + + err = mlx4_cmd(dev, mailbox->dma, MLX4_SET_PORT_GID_TABLE << 8 | gw->port, + 1, MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B, + MLX4_CMD_WRAPPED); + if (err) + pr_warn("set port command failed\n"); + else + mlx4_ib_dispatch_event(gw->dev, gw->port, IB_EVENT_GID_CHANGE); + + mlx4_free_cmd_mailbox(dev, mailbox); + kfree(gw); +} + +static void reset_gids_task(struct work_struct *work) +{ + struct update_gid_work *gw = + container_of(work, struct update_gid_work, work); + struct mlx4_cmd_mailbox *mailbox; + union ib_gid *gids; + int err; + struct mlx4_dev *dev = gw->dev->dev; + + if (!gw->dev->ib_active) + return; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) { + pr_warn("reset gid table failed\n"); + goto free; + } + + gids = mailbox->buf; + memcpy(gids, gw->gids, sizeof(gw->gids)); + + if (mlx4_ib_port_link_layer(&gw->dev->ib_dev, gw->port) == + IB_LINK_LAYER_ETHERNET) { + err = mlx4_cmd(dev, mailbox->dma, + MLX4_SET_PORT_GID_TABLE << 8 | gw->port, + 1, MLX4_CMD_SET_PORT, + MLX4_CMD_TIME_CLASS_B, + MLX4_CMD_WRAPPED); + if (err) + pr_warn(KERN_WARNING + "set port %d command failed\n", gw->port); + } + + mlx4_free_cmd_mailbox(dev, mailbox); +free: + kfree(gw); +} + +static int update_gid_table(struct mlx4_ib_dev *dev, int port, + union ib_gid *gid, int clear, + int default_gid) +{ + struct update_gid_work *work; + int i; + int need_update = 0; + int free = -1; + int found = -1; + int max_gids; + + if (default_gid) { + free = 0; + } else { + max_gids = dev->dev->caps.gid_table_len[port]; + for (i = 1; i < max_gids; ++i) { + if (!memcmp(&dev->iboe.gid_table[port - 1][i], gid, + sizeof(*gid))) + found = i; + + if (clear) { + if (found >= 0) { + need_update = 1; + dev->iboe.gid_table[port - 1][found] = + zgid; + break; + } + } else { + if (found >= 0) + break; + + if (free < 0 && + !memcmp(&dev->iboe.gid_table[port - 1][i], + &zgid, sizeof(*gid))) + free = i; + } + } + } + + if (found == -1 && !clear && free >= 0) { + dev->iboe.gid_table[port - 1][free] = *gid; + need_update = 1; + } + + if (!need_update) + return 0; + + work = kzalloc(sizeof(*work), GFP_ATOMIC); + if (!work) + return -ENOMEM; + + memcpy(work->gids, dev->iboe.gid_table[port - 1], sizeof(work->gids)); + INIT_WORK(&work->work, update_gids_task); + work->port = port; + work->dev = dev; + queue_work(wq, &work->work); + + return 0; +} + +static void mlx4_make_default_gid(struct net_device *dev, union ib_gid *gid) +{ + gid->global.subnet_prefix = cpu_to_be64(0xfe80000000000000LL); + mlx4_addrconf_ifid_eui48(&gid->raw[8], 0xffff, dev); +} + + +static int reset_gid_table(struct mlx4_ib_dev *dev, u8 port) +{ + struct update_gid_work *work; + + work = kzalloc(sizeof(*work), GFP_ATOMIC); + if (!work) + return -ENOMEM; + + memset(dev->iboe.gid_table[port - 1], 0, sizeof(work->gids)); + memset(work->gids, 0, sizeof(work->gids)); + INIT_WORK(&work->work, reset_gids_task); + work->dev = dev; + work->port = port; + queue_work(wq, &work->work); + return 0; +} + +static int mlx4_ib_addr_event(int event, struct net_device *event_netdev, + struct mlx4_ib_dev *ibdev, union ib_gid *gid) +{ + struct mlx4_ib_iboe *iboe; + int port = 0; + struct net_device *real_dev = rdma_vlan_dev_real_dev(event_netdev) ? + rdma_vlan_dev_real_dev(event_netdev) : + event_netdev; + union ib_gid default_gid; + + mlx4_make_default_gid(real_dev, &default_gid); + + if (!memcmp(gid, &default_gid, sizeof(*gid))) + return 0; + + if (event != NETDEV_DOWN && event != NETDEV_UP) + return 0; + + if ((real_dev != event_netdev) && + (event == NETDEV_DOWN) && + rdma_link_local_addr((struct in6_addr *)gid)) + return 0; + + iboe = &ibdev->iboe; + spin_lock_bh(&iboe->lock); + + for (port = 1; port <= ibdev->dev->caps.num_ports; ++port) + if ((netif_is_bond_master(real_dev) && + (real_dev == iboe->masters[port - 1])) || + (!netif_is_bond_master(real_dev) && + (real_dev == iboe->netdevs[port - 1]))) + update_gid_table(ibdev, port, gid, + event == NETDEV_DOWN, 0); + + spin_unlock_bh(&iboe->lock); + return 0; + +} + +static u8 mlx4_ib_get_dev_port(struct net_device *dev, + struct mlx4_ib_dev *ibdev) +{ + u8 port = 0; + struct mlx4_ib_iboe *iboe; + struct net_device *real_dev = rdma_vlan_dev_real_dev(dev) ? + rdma_vlan_dev_real_dev(dev) : dev; + + iboe = &ibdev->iboe; + + for (port = 1; port <= ibdev->dev->caps.num_ports; ++port) + if ((netif_is_bond_master(real_dev) && + (real_dev == iboe->masters[port - 1])) || + (!netif_is_bond_master(real_dev) && + (real_dev == iboe->netdevs[port - 1]))) + break; + + if ((port == 0) || (port > ibdev->dev->caps.num_ports)) + return 0; + else + return port; +} + +static int mlx4_ib_inet_event(struct notifier_block *this, unsigned long event, + void *ptr) +{ + struct mlx4_ib_dev *ibdev; + struct in_ifaddr *ifa = ptr; + union ib_gid gid; + struct net_device *event_netdev = ifa->ifa_dev->dev; + + ipv6_addr_set_v4mapped(ifa->ifa_address, (struct in6_addr *)&gid); + + ibdev = container_of(this, struct mlx4_ib_dev, iboe.nb_inet); + + mlx4_ib_addr_event(event, event_netdev, ibdev, &gid); + return NOTIFY_DONE; +} + +#if IS_ENABLED(CONFIG_IPV6) +static int mlx4_ib_inet6_event(struct notifier_block *this, unsigned long event, + void *ptr) +{ + struct mlx4_ib_dev *ibdev; + struct inet6_ifaddr *ifa = ptr; + union ib_gid *gid = (union ib_gid *)&ifa->addr; + struct net_device *event_netdev = ifa->idev->dev; + + ibdev = container_of(this, struct mlx4_ib_dev, iboe.nb_inet6); + + mlx4_ib_addr_event(event, event_netdev, ibdev, gid); + return NOTIFY_DONE; +} +#endif + +#define MLX4_IB_INVALID_MAC ((u64)-1) +static void mlx4_ib_update_qps(struct mlx4_ib_dev *ibdev, + struct net_device *dev, + int port) +{ + u64 new_smac = 0; + u64 release_mac = MLX4_IB_INVALID_MAC; + struct mlx4_ib_qp *qp; + + read_lock(&dev_base_lock); + new_smac = mlx4_mac_to_u64(dev->dev_addr); + read_unlock(&dev_base_lock); + + atomic64_set(&ibdev->iboe.mac[port - 1], new_smac); + + /* no need for update QP1 and mac registration in non-SRIOV */ + if (!mlx4_is_mfunc(ibdev->dev)) + return; + + mutex_lock(&ibdev->qp1_proxy_lock[port - 1]); + qp = ibdev->qp1_proxy[port - 1]; + if (qp) { + int new_smac_index; + u64 old_smac; + struct mlx4_update_qp_params update_params; + + mutex_lock(&qp->mutex); + old_smac = qp->pri.smac; + if (new_smac == old_smac) + goto unlock; + + new_smac_index = mlx4_register_mac(ibdev->dev, port, new_smac); + + if (new_smac_index < 0) + goto unlock; + + update_params.smac_index = new_smac_index; + if (mlx4_update_qp(ibdev->dev, qp->mqp.qpn, MLX4_UPDATE_QP_SMAC, + &update_params)) { + release_mac = new_smac; + goto unlock; + } + /* if old port was zero, no mac was yet registered for this QP */ + if (qp->pri.smac_port) + release_mac = old_smac; + qp->pri.smac = new_smac; + qp->pri.smac_port = port; + qp->pri.smac_index = new_smac_index; + } + +unlock: + if (release_mac != MLX4_IB_INVALID_MAC) + mlx4_unregister_mac(ibdev->dev, port, release_mac); + if (qp) + mutex_unlock(&qp->mutex); + mutex_unlock(&ibdev->qp1_proxy_lock[port - 1]); +} + +static void mlx4_ib_get_dev_addr(struct net_device *dev, + struct mlx4_ib_dev *ibdev, u8 port) +{ + struct in_device *in_dev; +#if IS_ENABLED(CONFIG_IPV6) + struct inet6_dev *in6_dev; + union ib_gid *pgid; + struct inet6_ifaddr *ifp; + union ib_gid default_gid; +#endif + union ib_gid gid; + + + if ((port == 0) || (port > ibdev->dev->caps.num_ports)) + return; + + /* IPv4 gids */ + in_dev = in_dev_get(dev); + if (in_dev) { + for_ifa(in_dev) { + /*ifa->ifa_address;*/ + ipv6_addr_set_v4mapped(ifa->ifa_address, + (struct in6_addr *)&gid); + update_gid_table(ibdev, port, &gid, 0, 0); + } + endfor_ifa(in_dev); + in_dev_put(in_dev); + } +#if IS_ENABLED(CONFIG_IPV6) + mlx4_make_default_gid(dev, &default_gid); + /* IPv6 gids */ + in6_dev = in6_dev_get(dev); + if (in6_dev) { + read_lock_bh(&in6_dev->lock); + list_for_each_entry(ifp, &in6_dev->addr_list, if_list) { + pgid = (union ib_gid *)&ifp->addr; + if (!memcmp(pgid, &default_gid, sizeof(*pgid))) + continue; + update_gid_table(ibdev, port, pgid, 0, 0); + } + read_unlock_bh(&in6_dev->lock); + in6_dev_put(in6_dev); + } +#endif +} + +static void mlx4_ib_set_default_gid(struct mlx4_ib_dev *ibdev, + struct net_device *dev, u8 port) +{ + union ib_gid gid; + mlx4_make_default_gid(dev, &gid); + update_gid_table(ibdev, port, &gid, 0, 1); +} + +static int mlx4_ib_init_gid_table(struct mlx4_ib_dev *ibdev) +{ + struct net_device *dev; + struct mlx4_ib_iboe *iboe = &ibdev->iboe; + int i; + int err = 0; + + for (i = 1; i <= ibdev->num_ports; ++i) { + if (rdma_port_get_link_layer(&ibdev->ib_dev, i) == + IB_LINK_LAYER_ETHERNET) { + err = reset_gid_table(ibdev, i); + if (err) + goto out; + } + } + + read_lock(&dev_base_lock); + spin_lock_bh(&iboe->lock); + + for_each_netdev(&init_net, dev) { + u8 port = mlx4_ib_get_dev_port(dev, ibdev); + /* port will be non-zero only for ETH ports */ + if (port) { + mlx4_ib_set_default_gid(ibdev, dev, port); + mlx4_ib_get_dev_addr(dev, ibdev, port); + } + } + + spin_unlock_bh(&iboe->lock); + read_unlock(&dev_base_lock); +out: + return err; +} + +static void mlx4_ib_scan_netdevs(struct mlx4_ib_dev *ibdev, + struct net_device *dev, + unsigned long event) + +{ + struct mlx4_ib_iboe *iboe; + int update_qps_port = -1; + int port; + + iboe = &ibdev->iboe; + + spin_lock_bh(&iboe->lock); + mlx4_foreach_ib_transport_port(port, ibdev->dev) { + enum ib_port_state port_state = IB_PORT_NOP; + struct net_device *old_master = iboe->masters[port - 1]; + struct net_device *curr_netdev; + struct net_device *curr_master; + + iboe->netdevs[port - 1] = + mlx4_get_protocol_dev(ibdev->dev, MLX4_PROT_ETH, port); + if (iboe->netdevs[port - 1]) + mlx4_ib_set_default_gid(ibdev, + iboe->netdevs[port - 1], port); + curr_netdev = iboe->netdevs[port - 1]; + + if (iboe->netdevs[port - 1] && + netif_is_bond_slave(iboe->netdevs[port - 1])) { + iboe->masters[port - 1] = netdev_master_upper_dev_get( + iboe->netdevs[port - 1]); + } else { + iboe->masters[port - 1] = NULL; + } + curr_master = iboe->masters[port - 1]; + + if (dev == iboe->netdevs[port - 1] && + (event == NETDEV_CHANGEADDR || event == NETDEV_REGISTER || + event == NETDEV_UP || event == NETDEV_CHANGE)) + update_qps_port = port; + + if (curr_netdev) { + port_state = (netif_running(curr_netdev) && netif_carrier_ok(curr_netdev)) ? + IB_PORT_ACTIVE : IB_PORT_DOWN; + mlx4_ib_set_default_gid(ibdev, curr_netdev, port); + if (curr_master) { + /* if using bonding/team and a slave port is down, we + * don't want the bond IP based gids in the table since + * flows that select port by gid may get the down port. + */ + if (port_state == IB_PORT_DOWN) { + reset_gid_table(ibdev, port); + mlx4_ib_set_default_gid(ibdev, + curr_netdev, + port); + } else { + /* gids from the upper dev (bond/team) + * should appear in port's gid table + */ + mlx4_ib_get_dev_addr(curr_master, + ibdev, port); + } + } + /* if bonding is used it is possible that we add it to + * masters only after IP address is assigned to the + * net bonding interface. + */ + if (curr_master && (old_master != curr_master)) { + reset_gid_table(ibdev, port); + mlx4_ib_set_default_gid(ibdev, + curr_netdev, port); + mlx4_ib_get_dev_addr(curr_master, ibdev, port); + } + + if (!curr_master && (old_master != curr_master)) { + reset_gid_table(ibdev, port); + mlx4_ib_set_default_gid(ibdev, + curr_netdev, port); + mlx4_ib_get_dev_addr(curr_netdev, ibdev, port); + } + } else { + reset_gid_table(ibdev, port); + } + } + + spin_unlock_bh(&iboe->lock); + + if (update_qps_port > 0) + mlx4_ib_update_qps(ibdev, dev, update_qps_port); +} + +static int mlx4_ib_netdev_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct mlx4_ib_dev *ibdev; + + if (!net_eq(dev_net(dev), &init_net)) + return NOTIFY_DONE; + + ibdev = container_of(this, struct mlx4_ib_dev, iboe.nb); + mlx4_ib_scan_netdevs(ibdev, dev, event); + + return NOTIFY_DONE; +} + +static void init_pkeys(struct mlx4_ib_dev *ibdev) +{ + int port; + int slave; + int i; + + if (mlx4_is_master(ibdev->dev)) { + for (slave = 0; slave <= ibdev->dev->num_vfs; ++slave) { + for (port = 1; port <= ibdev->dev->caps.num_ports; ++port) { + for (i = 0; + i < ibdev->dev->phys_caps.pkey_phys_table_len[port]; + ++i) { + ibdev->pkeys.virt2phys_pkey[slave][port - 1][i] = + /* master has the identity virt2phys pkey mapping */ + (slave == mlx4_master_func_num(ibdev->dev) || !i) ? i : + ibdev->dev->phys_caps.pkey_phys_table_len[port] - 1; + mlx4_sync_pkey_table(ibdev->dev, slave, port, i, + ibdev->pkeys.virt2phys_pkey[slave][port - 1][i]); + } + } + } + /* initialize pkey cache */ + for (port = 1; port <= ibdev->dev->caps.num_ports; ++port) { + for (i = 0; + i < ibdev->dev->phys_caps.pkey_phys_table_len[port]; + ++i) + ibdev->pkeys.phys_pkey_cache[port-1][i] = + (i) ? 0 : 0xFFFF; + } + } +} + +static void mlx4_ib_alloc_eqs(struct mlx4_dev *dev, struct mlx4_ib_dev *ibdev) +{ + char name[80]; + int eq_per_port = 0; + int added_eqs = 0; + int total_eqs = 0; + int i, j, eq; + + /* Legacy mode or comp_pool is not large enough */ + if (dev->caps.comp_pool == 0 || + dev->caps.num_ports > dev->caps.comp_pool) + return; + + eq_per_port = rounddown_pow_of_two(dev->caps.comp_pool/ + dev->caps.num_ports); + + /* Init eq table */ + added_eqs = 0; + mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB) + added_eqs += eq_per_port; + + total_eqs = dev->caps.num_comp_vectors + added_eqs; + + ibdev->eq_table = kzalloc(total_eqs * sizeof(int), GFP_KERNEL); + if (!ibdev->eq_table) + return; + + ibdev->eq_added = added_eqs; + + eq = 0; + mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB) { + for (j = 0; j < eq_per_port; j++) { + snprintf(name, sizeof(name), "mlx4-ib-%d-%d@%s", + i, j, dev->pdev->bus->name); + /* Set IRQ for specific name (per ring) */ + if (mlx4_assign_eq(dev, name, NULL, + &ibdev->eq_table[eq])) { + /* Use legacy (same as mlx4_en driver) */ + pr_warn("Can't allocate EQ %d; reverting to legacy\n", eq); + ibdev->eq_table[eq] = + (eq % dev->caps.num_comp_vectors); + } + eq++; + } + } + + /* Fill the reset of the vector with legacy EQ */ + for (i = 0, eq = added_eqs; i < dev->caps.num_comp_vectors; i++) + ibdev->eq_table[eq++] = i; + + /* Advertise the new number of EQs to clients */ + ibdev->ib_dev.num_comp_vectors = total_eqs; +} + +static void mlx4_ib_free_eqs(struct mlx4_dev *dev, struct mlx4_ib_dev *ibdev) +{ + int i; + + /* no additional eqs were added */ + if (!ibdev->eq_table) + return; + + /* Reset the advertised EQ number */ + ibdev->ib_dev.num_comp_vectors = dev->caps.num_comp_vectors; + + /* Free only the added eqs */ + for (i = 0; i < ibdev->eq_added; i++) { + /* Don't free legacy eqs if used */ + if (ibdev->eq_table[i] <= dev->caps.num_comp_vectors) + continue; + mlx4_release_eq(dev, ibdev->eq_table[i]); + } + + kfree(ibdev->eq_table); +} + +static void *mlx4_ib_add(struct mlx4_dev *dev) +{ + struct mlx4_ib_dev *ibdev; + int num_ports = 0; + int i, j; + int err; + struct mlx4_ib_iboe *iboe; + int ib_num_ports = 0; + + pr_info_once("%s", mlx4_ib_version); + + num_ports = 0; + mlx4_foreach_ib_transport_port(i, dev) + num_ports++; + + /* No point in registering a device with no ports... */ + if (num_ports == 0) + return NULL; + + ibdev = (struct mlx4_ib_dev *) ib_alloc_device(sizeof *ibdev); + if (!ibdev) { + dev_err(&dev->pdev->dev, "Device struct alloc failed\n"); + return NULL; + } + + iboe = &ibdev->iboe; + + if (mlx4_pd_alloc(dev, &ibdev->priv_pdn)) + goto err_dealloc; + + if (mlx4_uar_alloc(dev, &ibdev->priv_uar)) + goto err_pd; + + ibdev->uar_map = ioremap((phys_addr_t) ibdev->priv_uar.pfn << PAGE_SHIFT, + PAGE_SIZE); + if (!ibdev->uar_map) + goto err_uar; + MLX4_INIT_DOORBELL_LOCK(&ibdev->uar_lock); + + ibdev->dev = dev; + + strlcpy(ibdev->ib_dev.name, "mlx4_%d", IB_DEVICE_NAME_MAX); + ibdev->ib_dev.owner = THIS_MODULE; + ibdev->ib_dev.node_type = RDMA_NODE_IB_CA; + ibdev->ib_dev.local_dma_lkey = dev->caps.reserved_lkey; + ibdev->num_ports = num_ports; + ibdev->ib_dev.phys_port_cnt = ibdev->num_ports; + ibdev->ib_dev.num_comp_vectors = dev->caps.num_comp_vectors; + ibdev->ib_dev.dma_device = &dev->pdev->dev; + + if (dev->caps.userspace_caps) + ibdev->ib_dev.uverbs_abi_ver = MLX4_IB_UVERBS_ABI_VERSION; + else + ibdev->ib_dev.uverbs_abi_ver = MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION; + + ibdev->ib_dev.uverbs_cmd_mask = + (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | + (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) | + (1ull << IB_USER_VERBS_CMD_QUERY_PORT) | + (1ull << IB_USER_VERBS_CMD_ALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_REG_MR) | + (1ull << IB_USER_VERBS_CMD_REREG_MR) | + (1ull << IB_USER_VERBS_CMD_DEREG_MR) | + (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | + (1ull << IB_USER_VERBS_CMD_CREATE_CQ) | + (1ull << IB_USER_VERBS_CMD_RESIZE_CQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) | + (1ull << IB_USER_VERBS_CMD_CREATE_QP) | + (1ull << IB_USER_VERBS_CMD_MODIFY_QP) | + (1ull << IB_USER_VERBS_CMD_QUERY_QP) | + (1ull << IB_USER_VERBS_CMD_DESTROY_QP) | + (1ull << IB_USER_VERBS_CMD_ATTACH_MCAST) | + (1ull << IB_USER_VERBS_CMD_DETACH_MCAST) | + (1ull << IB_USER_VERBS_CMD_CREATE_SRQ) | + (1ull << IB_USER_VERBS_CMD_MODIFY_SRQ) | + (1ull << IB_USER_VERBS_CMD_QUERY_SRQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ) | + (1ull << IB_USER_VERBS_CMD_CREATE_XSRQ) | + (1ull << IB_USER_VERBS_CMD_OPEN_QP); + + ibdev->ib_dev.query_device = mlx4_ib_query_device; + ibdev->ib_dev.query_port = mlx4_ib_query_port; + ibdev->ib_dev.get_link_layer = mlx4_ib_port_link_layer; + ibdev->ib_dev.query_gid = mlx4_ib_query_gid; + ibdev->ib_dev.query_pkey = mlx4_ib_query_pkey; + ibdev->ib_dev.modify_device = mlx4_ib_modify_device; + ibdev->ib_dev.modify_port = mlx4_ib_modify_port; + ibdev->ib_dev.alloc_ucontext = mlx4_ib_alloc_ucontext; + ibdev->ib_dev.dealloc_ucontext = mlx4_ib_dealloc_ucontext; + ibdev->ib_dev.mmap = mlx4_ib_mmap; + ibdev->ib_dev.alloc_pd = mlx4_ib_alloc_pd; + ibdev->ib_dev.dealloc_pd = mlx4_ib_dealloc_pd; + ibdev->ib_dev.create_ah = mlx4_ib_create_ah; + ibdev->ib_dev.query_ah = mlx4_ib_query_ah; + ibdev->ib_dev.destroy_ah = mlx4_ib_destroy_ah; + ibdev->ib_dev.create_srq = mlx4_ib_create_srq; + ibdev->ib_dev.modify_srq = mlx4_ib_modify_srq; + ibdev->ib_dev.query_srq = mlx4_ib_query_srq; + ibdev->ib_dev.destroy_srq = mlx4_ib_destroy_srq; + ibdev->ib_dev.post_srq_recv = mlx4_ib_post_srq_recv; + ibdev->ib_dev.create_qp = mlx4_ib_create_qp; + ibdev->ib_dev.modify_qp = mlx4_ib_modify_qp; + ibdev->ib_dev.query_qp = mlx4_ib_query_qp; + ibdev->ib_dev.destroy_qp = mlx4_ib_destroy_qp; + ibdev->ib_dev.post_send = mlx4_ib_post_send; + ibdev->ib_dev.post_recv = mlx4_ib_post_recv; + ibdev->ib_dev.create_cq = mlx4_ib_create_cq; + ibdev->ib_dev.modify_cq = mlx4_ib_modify_cq; + ibdev->ib_dev.resize_cq = mlx4_ib_resize_cq; + ibdev->ib_dev.destroy_cq = mlx4_ib_destroy_cq; + ibdev->ib_dev.poll_cq = mlx4_ib_poll_cq; + ibdev->ib_dev.req_notify_cq = mlx4_ib_arm_cq; + ibdev->ib_dev.get_dma_mr = mlx4_ib_get_dma_mr; + ibdev->ib_dev.reg_user_mr = mlx4_ib_reg_user_mr; + ibdev->ib_dev.rereg_user_mr = mlx4_ib_rereg_user_mr; + ibdev->ib_dev.dereg_mr = mlx4_ib_dereg_mr; + ibdev->ib_dev.alloc_fast_reg_mr = mlx4_ib_alloc_fast_reg_mr; + ibdev->ib_dev.alloc_fast_reg_page_list = mlx4_ib_alloc_fast_reg_page_list; + ibdev->ib_dev.free_fast_reg_page_list = mlx4_ib_free_fast_reg_page_list; + ibdev->ib_dev.attach_mcast = mlx4_ib_mcg_attach; + ibdev->ib_dev.detach_mcast = mlx4_ib_mcg_detach; + ibdev->ib_dev.process_mad = mlx4_ib_process_mad; + + if (!mlx4_is_slave(ibdev->dev)) { + ibdev->ib_dev.alloc_fmr = mlx4_ib_fmr_alloc; + ibdev->ib_dev.map_phys_fmr = mlx4_ib_map_phys_fmr; + ibdev->ib_dev.unmap_fmr = mlx4_ib_unmap_fmr; + ibdev->ib_dev.dealloc_fmr = mlx4_ib_fmr_dealloc; + } + + if (dev->caps.flags & MLX4_DEV_CAP_FLAG_MEM_WINDOW || + dev->caps.bmme_flags & MLX4_BMME_FLAG_TYPE_2_WIN) { + ibdev->ib_dev.alloc_mw = mlx4_ib_alloc_mw; + ibdev->ib_dev.bind_mw = mlx4_ib_bind_mw; + ibdev->ib_dev.dealloc_mw = mlx4_ib_dealloc_mw; + + ibdev->ib_dev.uverbs_cmd_mask |= + (1ull << IB_USER_VERBS_CMD_ALLOC_MW) | + (1ull << IB_USER_VERBS_CMD_DEALLOC_MW); + } + + if (dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC) { + ibdev->ib_dev.alloc_xrcd = mlx4_ib_alloc_xrcd; + ibdev->ib_dev.dealloc_xrcd = mlx4_ib_dealloc_xrcd; + ibdev->ib_dev.uverbs_cmd_mask |= + (1ull << IB_USER_VERBS_CMD_OPEN_XRCD) | + (1ull << IB_USER_VERBS_CMD_CLOSE_XRCD); + } + + if (check_flow_steering_support(dev)) { + ibdev->steering_support = MLX4_STEERING_MODE_DEVICE_MANAGED; + ibdev->ib_dev.create_flow = mlx4_ib_create_flow; + ibdev->ib_dev.destroy_flow = mlx4_ib_destroy_flow; + + ibdev->ib_dev.uverbs_ex_cmd_mask |= + (1ull << IB_USER_VERBS_EX_CMD_CREATE_FLOW) | + (1ull << IB_USER_VERBS_EX_CMD_DESTROY_FLOW); + } + + mlx4_ib_alloc_eqs(dev, ibdev); + + spin_lock_init(&iboe->lock); + + if (init_node_data(ibdev)) + goto err_map; + + for (i = 0; i < ibdev->num_ports; ++i) { + mutex_init(&ibdev->qp1_proxy_lock[i]); + if (mlx4_ib_port_link_layer(&ibdev->ib_dev, i + 1) == + IB_LINK_LAYER_ETHERNET) { + err = mlx4_counter_alloc(ibdev->dev, &ibdev->counters[i]); + if (err) + ibdev->counters[i] = -1; + } else { + ibdev->counters[i] = -1; + } + } + + mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB) + ib_num_ports++; + + spin_lock_init(&ibdev->sm_lock); + mutex_init(&ibdev->cap_mask_mutex); + + if (ibdev->steering_support == MLX4_STEERING_MODE_DEVICE_MANAGED && + ib_num_ports) { + ibdev->steer_qpn_count = MLX4_IB_UC_MAX_NUM_QPS; + err = mlx4_qp_reserve_range(dev, ibdev->steer_qpn_count, + MLX4_IB_UC_STEER_QPN_ALIGN, + &ibdev->steer_qpn_base); + if (err) + goto err_counter; + + ibdev->ib_uc_qpns_bitmap = + kmalloc(BITS_TO_LONGS(ibdev->steer_qpn_count) * + sizeof(long), + GFP_KERNEL); + if (!ibdev->ib_uc_qpns_bitmap) { + dev_err(&dev->pdev->dev, "bit map alloc failed\n"); + goto err_steer_qp_release; + } + + bitmap_zero(ibdev->ib_uc_qpns_bitmap, ibdev->steer_qpn_count); + + err = mlx4_FLOW_STEERING_IB_UC_QP_RANGE( + dev, ibdev->steer_qpn_base, + ibdev->steer_qpn_base + + ibdev->steer_qpn_count - 1); + if (err) + goto err_steer_free_bitmap; + } + + for (j = 1; j <= ibdev->dev->caps.num_ports; j++) + atomic64_set(&iboe->mac[j - 1], ibdev->dev->caps.def_mac[j]); + + if (ib_register_device(&ibdev->ib_dev, NULL)) + goto err_steer_free_bitmap; + + if (mlx4_ib_mad_init(ibdev)) + goto err_reg; + + if (mlx4_ib_init_sriov(ibdev)) + goto err_mad; + + if (dev->caps.flags & MLX4_DEV_CAP_FLAG_IBOE) { + if (!iboe->nb.notifier_call) { + iboe->nb.notifier_call = mlx4_ib_netdev_event; + err = register_netdevice_notifier(&iboe->nb); + if (err) { + iboe->nb.notifier_call = NULL; + goto err_notif; + } + } + if (!iboe->nb_inet.notifier_call) { + iboe->nb_inet.notifier_call = mlx4_ib_inet_event; + err = register_inetaddr_notifier(&iboe->nb_inet); + if (err) { + iboe->nb_inet.notifier_call = NULL; + goto err_notif; + } + } +#if IS_ENABLED(CONFIG_IPV6) + if (!iboe->nb_inet6.notifier_call) { + iboe->nb_inet6.notifier_call = mlx4_ib_inet6_event; + err = register_inet6addr_notifier(&iboe->nb_inet6); + if (err) { + iboe->nb_inet6.notifier_call = NULL; + goto err_notif; + } + } +#endif + if (mlx4_ib_init_gid_table(ibdev)) + goto err_notif; + } + + for (j = 0; j < ARRAY_SIZE(mlx4_class_attributes); ++j) { + if (device_create_file(&ibdev->ib_dev.dev, + mlx4_class_attributes[j])) + goto err_notif; + } + + ibdev->ib_active = true; + + if (mlx4_is_mfunc(ibdev->dev)) + init_pkeys(ibdev); + + /* create paravirt contexts for any VFs which are active */ + if (mlx4_is_master(ibdev->dev)) { + for (j = 0; j < MLX4_MFUNC_MAX; j++) { + if (j == mlx4_master_func_num(ibdev->dev)) + continue; + if (mlx4_is_slave_active(ibdev->dev, j)) + do_slave_init(ibdev, j, 1); + } + } + return ibdev; + +err_notif: + if (ibdev->iboe.nb.notifier_call) { + if (unregister_netdevice_notifier(&ibdev->iboe.nb)) + pr_warn("failure unregistering notifier\n"); + ibdev->iboe.nb.notifier_call = NULL; + } + if (ibdev->iboe.nb_inet.notifier_call) { + if (unregister_inetaddr_notifier(&ibdev->iboe.nb_inet)) + pr_warn("failure unregistering notifier\n"); + ibdev->iboe.nb_inet.notifier_call = NULL; + } +#if IS_ENABLED(CONFIG_IPV6) + if (ibdev->iboe.nb_inet6.notifier_call) { + if (unregister_inet6addr_notifier(&ibdev->iboe.nb_inet6)) + pr_warn("failure unregistering notifier\n"); + ibdev->iboe.nb_inet6.notifier_call = NULL; + } +#endif + flush_workqueue(wq); + + mlx4_ib_close_sriov(ibdev); + +err_mad: + mlx4_ib_mad_cleanup(ibdev); + +err_reg: + ib_unregister_device(&ibdev->ib_dev); + +err_steer_free_bitmap: + kfree(ibdev->ib_uc_qpns_bitmap); + +err_steer_qp_release: + if (ibdev->steering_support == MLX4_STEERING_MODE_DEVICE_MANAGED) + mlx4_qp_release_range(dev, ibdev->steer_qpn_base, + ibdev->steer_qpn_count); +err_counter: + for (; i; --i) + if (ibdev->counters[i - 1] != -1) + mlx4_counter_free(ibdev->dev, ibdev->counters[i - 1]); + +err_map: + iounmap(ibdev->uar_map); + +err_uar: + mlx4_uar_free(dev, &ibdev->priv_uar); + +err_pd: + mlx4_pd_free(dev, ibdev->priv_pdn); + +err_dealloc: + ib_dealloc_device(&ibdev->ib_dev); + + return NULL; +} + +int mlx4_ib_steer_qp_alloc(struct mlx4_ib_dev *dev, int count, int *qpn) +{ + int offset; + + WARN_ON(!dev->ib_uc_qpns_bitmap); + + offset = bitmap_find_free_region(dev->ib_uc_qpns_bitmap, + dev->steer_qpn_count, + get_count_order(count)); + if (offset < 0) + return offset; + + *qpn = dev->steer_qpn_base + offset; + return 0; +} + +void mlx4_ib_steer_qp_free(struct mlx4_ib_dev *dev, u32 qpn, int count) +{ + if (!qpn || + dev->steering_support != MLX4_STEERING_MODE_DEVICE_MANAGED) + return; + + BUG_ON(qpn < dev->steer_qpn_base); + + bitmap_release_region(dev->ib_uc_qpns_bitmap, + qpn - dev->steer_qpn_base, + get_count_order(count)); +} + +int mlx4_ib_steer_qp_reg(struct mlx4_ib_dev *mdev, struct mlx4_ib_qp *mqp, + int is_attach) +{ + int err; + size_t flow_size; + struct ib_flow_attr *flow = NULL; + struct ib_flow_spec_ib *ib_spec; + + if (is_attach) { + flow_size = sizeof(struct ib_flow_attr) + + sizeof(struct ib_flow_spec_ib); + flow = kzalloc(flow_size, GFP_KERNEL); + if (!flow) + return -ENOMEM; + flow->port = mqp->port; + flow->num_of_specs = 1; + flow->size = flow_size; + ib_spec = (struct ib_flow_spec_ib *)(flow + 1); + ib_spec->type = IB_FLOW_SPEC_IB; + ib_spec->size = sizeof(struct ib_flow_spec_ib); + /* Add an empty rule for IB L2 */ + memset(&ib_spec->mask, 0, sizeof(ib_spec->mask)); + + err = __mlx4_ib_create_flow(&mqp->ibqp, flow, + IB_FLOW_DOMAIN_NIC, + MLX4_FS_REGULAR, + &mqp->reg_id); + } else { + err = __mlx4_ib_destroy_flow(mdev->dev, mqp->reg_id); + } + kfree(flow); + return err; +} + +static void mlx4_ib_remove(struct mlx4_dev *dev, void *ibdev_ptr) +{ + struct mlx4_ib_dev *ibdev = ibdev_ptr; + int p; + + ibdev->ib_active = false; + flush_workqueue(wq); + + mlx4_ib_close_sriov(ibdev); + mlx4_ib_mad_cleanup(ibdev); + ib_unregister_device(&ibdev->ib_dev); + if (ibdev->iboe.nb.notifier_call) { + if (unregister_netdevice_notifier(&ibdev->iboe.nb)) + pr_warn("failure unregistering notifier\n"); + ibdev->iboe.nb.notifier_call = NULL; + } + + if (ibdev->steering_support == MLX4_STEERING_MODE_DEVICE_MANAGED) { + mlx4_qp_release_range(dev, ibdev->steer_qpn_base, + ibdev->steer_qpn_count); + kfree(ibdev->ib_uc_qpns_bitmap); + } + + if (ibdev->iboe.nb_inet.notifier_call) { + if (unregister_inetaddr_notifier(&ibdev->iboe.nb_inet)) + pr_warn("failure unregistering notifier\n"); + ibdev->iboe.nb_inet.notifier_call = NULL; + } +#if IS_ENABLED(CONFIG_IPV6) + if (ibdev->iboe.nb_inet6.notifier_call) { + if (unregister_inet6addr_notifier(&ibdev->iboe.nb_inet6)) + pr_warn("failure unregistering notifier\n"); + ibdev->iboe.nb_inet6.notifier_call = NULL; + } +#endif + + iounmap(ibdev->uar_map); + for (p = 0; p < ibdev->num_ports; ++p) + if (ibdev->counters[p] != -1) + mlx4_counter_free(ibdev->dev, ibdev->counters[p]); + mlx4_foreach_port(p, dev, MLX4_PORT_TYPE_IB) + mlx4_CLOSE_PORT(dev, p); + + mlx4_ib_free_eqs(dev, ibdev); + + mlx4_uar_free(dev, &ibdev->priv_uar); + mlx4_pd_free(dev, ibdev->priv_pdn); + ib_dealloc_device(&ibdev->ib_dev); +} + +static void do_slave_init(struct mlx4_ib_dev *ibdev, int slave, int do_init) +{ + struct mlx4_ib_demux_work **dm = NULL; + struct mlx4_dev *dev = ibdev->dev; + int i; + unsigned long flags; + struct mlx4_active_ports actv_ports; + unsigned int ports; + unsigned int first_port; + + if (!mlx4_is_master(dev)) + return; + + actv_ports = mlx4_get_active_ports(dev, slave); + ports = bitmap_weight(actv_ports.ports, dev->caps.num_ports); + first_port = find_first_bit(actv_ports.ports, dev->caps.num_ports); + + dm = kcalloc(ports, sizeof(*dm), GFP_ATOMIC); + if (!dm) { + pr_err("failed to allocate memory for tunneling qp update\n"); + goto out; + } + + for (i = 0; i < ports; i++) { + dm[i] = kmalloc(sizeof (struct mlx4_ib_demux_work), GFP_ATOMIC); + if (!dm[i]) { + pr_err("failed to allocate memory for tunneling qp update work struct\n"); + for (i = 0; i < dev->caps.num_ports; i++) { + if (dm[i]) + kfree(dm[i]); + } + goto out; + } + } + /* initialize or tear down tunnel QPs for the slave */ + for (i = 0; i < ports; i++) { + INIT_WORK(&dm[i]->work, mlx4_ib_tunnels_update_work); + dm[i]->port = first_port + i + 1; + dm[i]->slave = slave; + dm[i]->do_init = do_init; + dm[i]->dev = ibdev; + spin_lock_irqsave(&ibdev->sriov.going_down_lock, flags); + if (!ibdev->sriov.is_going_down) + queue_work(ibdev->sriov.demux[i].ud_wq, &dm[i]->work); + spin_unlock_irqrestore(&ibdev->sriov.going_down_lock, flags); + } +out: + kfree(dm); + return; +} + +static void mlx4_ib_event(struct mlx4_dev *dev, void *ibdev_ptr, + enum mlx4_dev_event event, unsigned long param) +{ + struct ib_event ibev; + struct mlx4_ib_dev *ibdev = to_mdev((struct ib_device *) ibdev_ptr); + struct mlx4_eqe *eqe = NULL; + struct ib_event_work *ew; + int p = 0; + + if (event == MLX4_DEV_EVENT_PORT_MGMT_CHANGE) + eqe = (struct mlx4_eqe *)param; + else + p = (int) param; + + switch (event) { + case MLX4_DEV_EVENT_PORT_UP: + if (p > ibdev->num_ports) + return; + if (mlx4_is_master(dev) && + rdma_port_get_link_layer(&ibdev->ib_dev, p) == + IB_LINK_LAYER_INFINIBAND) { + mlx4_ib_invalidate_all_guid_record(ibdev, p); + } + ibev.event = IB_EVENT_PORT_ACTIVE; + break; + + case MLX4_DEV_EVENT_PORT_DOWN: + if (p > ibdev->num_ports) + return; + ibev.event = IB_EVENT_PORT_ERR; + break; + + case MLX4_DEV_EVENT_CATASTROPHIC_ERROR: + ibdev->ib_active = false; + ibev.event = IB_EVENT_DEVICE_FATAL; + break; + + case MLX4_DEV_EVENT_PORT_MGMT_CHANGE: + ew = kmalloc(sizeof *ew, GFP_ATOMIC); + if (!ew) { + pr_err("failed to allocate memory for events work\n"); + break; + } + + INIT_WORK(&ew->work, handle_port_mgmt_change_event); + memcpy(&ew->ib_eqe, eqe, sizeof *eqe); + ew->ib_dev = ibdev; + /* need to queue only for port owner, which uses GEN_EQE */ + if (mlx4_is_master(dev)) + queue_work(wq, &ew->work); + else + handle_port_mgmt_change_event(&ew->work); + return; + + case MLX4_DEV_EVENT_SLAVE_INIT: + /* here, p is the slave id */ + do_slave_init(ibdev, p, 1); + return; + + case MLX4_DEV_EVENT_SLAVE_SHUTDOWN: + /* here, p is the slave id */ + do_slave_init(ibdev, p, 0); + return; + + default: + return; + } + + ibev.device = ibdev_ptr; + ibev.element.port_num = (u8) p; + + ib_dispatch_event(&ibev); +} + +static struct mlx4_interface mlx4_ib_interface = { + .add = mlx4_ib_add, + .remove = mlx4_ib_remove, + .event = mlx4_ib_event, + .protocol = MLX4_PROT_IB_IPV6 +}; + +static int __init mlx4_ib_init(void) +{ + int err; + + wq = create_singlethread_workqueue("mlx4_ib"); + if (!wq) + return -ENOMEM; + + err = mlx4_ib_mcg_init(); + if (err) + goto clean_wq; + + err = mlx4_register_interface(&mlx4_ib_interface); + if (err) + goto clean_mcg; + + return 0; + +clean_mcg: + mlx4_ib_mcg_destroy(); + +clean_wq: + destroy_workqueue(wq); + return err; +} + +static void __exit mlx4_ib_cleanup(void) +{ + mlx4_unregister_interface(&mlx4_ib_interface); + mlx4_ib_mcg_destroy(); + destroy_workqueue(wq); +} + +module_init(mlx4_ib_init); +module_exit(mlx4_ib_cleanup); diff --git a/drivers/infiniband/hw/mlx4/mcg.c b/drivers/infiniband/hw/mlx4/mcg.c new file mode 100644 index 0000000..ed327e6 --- /dev/null +++ b/drivers/infiniband/hw/mlx4/mcg.c @@ -0,0 +1,1257 @@ +/* + * Copyright (c) 2012 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include + +#include +#include +#include + +#include "mlx4_ib.h" + +#define MAX_VFS 80 +#define MAX_PEND_REQS_PER_FUNC 4 +#define MAD_TIMEOUT_MS 2000 + +#define mcg_warn(fmt, arg...) pr_warn("MCG WARNING: " fmt, ##arg) +#define mcg_error(fmt, arg...) pr_err(fmt, ##arg) +#define mcg_warn_group(group, format, arg...) \ + pr_warn("%s-%d: %16s (port %d): WARNING: " format, __func__, __LINE__,\ + (group)->name, group->demux->port, ## arg) + +#define mcg_error_group(group, format, arg...) \ + pr_err(" %16s: " format, (group)->name, ## arg) + + +static union ib_gid mgid0; + +static struct workqueue_struct *clean_wq; + +enum mcast_state { + MCAST_NOT_MEMBER = 0, + MCAST_MEMBER, +}; + +enum mcast_group_state { + MCAST_IDLE, + MCAST_JOIN_SENT, + MCAST_LEAVE_SENT, + MCAST_RESP_READY +}; + +struct mcast_member { + enum mcast_state state; + uint8_t join_state; + int num_pend_reqs; + struct list_head pending; +}; + +struct ib_sa_mcmember_data { + union ib_gid mgid; + union ib_gid port_gid; + __be32 qkey; + __be16 mlid; + u8 mtusel_mtu; + u8 tclass; + __be16 pkey; + u8 ratesel_rate; + u8 lifetmsel_lifetm; + __be32 sl_flowlabel_hoplimit; + u8 scope_join_state; + u8 proxy_join; + u8 reserved[2]; +}; + +struct mcast_group { + struct ib_sa_mcmember_data rec; + struct rb_node node; + struct list_head mgid0_list; + struct mlx4_ib_demux_ctx *demux; + struct mcast_member func[MAX_VFS]; + struct mutex lock; + struct work_struct work; + struct list_head pending_list; + int members[3]; + enum mcast_group_state state; + enum mcast_group_state prev_state; + struct ib_sa_mad response_sa_mad; + __be64 last_req_tid; + + char name[33]; /* MGID string */ + struct device_attribute dentry; + + /* refcount is the reference count for the following: + 1. Each queued request + 2. Each invocation of the worker thread + 3. Membership of the port at the SA + */ + atomic_t refcount; + + /* delayed work to clean pending SM request */ + struct delayed_work timeout_work; + struct list_head cleanup_list; +}; + +struct mcast_req { + int func; + struct ib_sa_mad sa_mad; + struct list_head group_list; + struct list_head func_list; + struct mcast_group *group; + int clean; +}; + + +#define safe_atomic_dec(ref) \ + do {\ + if (atomic_dec_and_test(ref)) \ + mcg_warn_group(group, "did not expect to reach zero\n"); \ + } while (0) + +static const char *get_state_string(enum mcast_group_state state) +{ + switch (state) { + case MCAST_IDLE: + return "MCAST_IDLE"; + case MCAST_JOIN_SENT: + return "MCAST_JOIN_SENT"; + case MCAST_LEAVE_SENT: + return "MCAST_LEAVE_SENT"; + case MCAST_RESP_READY: + return "MCAST_RESP_READY"; + } + return "Invalid State"; +} + +static struct mcast_group *mcast_find(struct mlx4_ib_demux_ctx *ctx, + union ib_gid *mgid) +{ + struct rb_node *node = ctx->mcg_table.rb_node; + struct mcast_group *group; + int ret; + + while (node) { + group = rb_entry(node, struct mcast_group, node); + ret = memcmp(mgid->raw, group->rec.mgid.raw, sizeof *mgid); + if (!ret) + return group; + + if (ret < 0) + node = node->rb_left; + else + node = node->rb_right; + } + return NULL; +} + +static struct mcast_group *mcast_insert(struct mlx4_ib_demux_ctx *ctx, + struct mcast_group *group) +{ + struct rb_node **link = &ctx->mcg_table.rb_node; + struct rb_node *parent = NULL; + struct mcast_group *cur_group; + int ret; + + while (*link) { + parent = *link; + cur_group = rb_entry(parent, struct mcast_group, node); + + ret = memcmp(group->rec.mgid.raw, cur_group->rec.mgid.raw, + sizeof group->rec.mgid); + if (ret < 0) + link = &(*link)->rb_left; + else if (ret > 0) + link = &(*link)->rb_right; + else + return cur_group; + } + rb_link_node(&group->node, parent, link); + rb_insert_color(&group->node, &ctx->mcg_table); + return NULL; +} + +static int send_mad_to_wire(struct mlx4_ib_demux_ctx *ctx, struct ib_mad *mad) +{ + struct mlx4_ib_dev *dev = ctx->dev; + struct ib_ah_attr ah_attr; + + spin_lock(&dev->sm_lock); + if (!dev->sm_ah[ctx->port - 1]) { + /* port is not yet Active, sm_ah not ready */ + spin_unlock(&dev->sm_lock); + return -EAGAIN; + } + mlx4_ib_query_ah(dev->sm_ah[ctx->port - 1], &ah_attr); + spin_unlock(&dev->sm_lock); + return mlx4_ib_send_to_wire(dev, mlx4_master_func_num(dev->dev), + ctx->port, IB_QPT_GSI, 0, 1, IB_QP1_QKEY, + &ah_attr, NULL, mad); +} + +static int send_mad_to_slave(int slave, struct mlx4_ib_demux_ctx *ctx, + struct ib_mad *mad) +{ + struct mlx4_ib_dev *dev = ctx->dev; + struct ib_mad_agent *agent = dev->send_agent[ctx->port - 1][1]; + struct ib_wc wc; + struct ib_ah_attr ah_attr; + + /* Our agent might not yet be registered when mads start to arrive */ + if (!agent) + return -EAGAIN; + + ib_query_ah(dev->sm_ah[ctx->port - 1], &ah_attr); + + if (ib_find_cached_pkey(&dev->ib_dev, ctx->port, IB_DEFAULT_PKEY_FULL, &wc.pkey_index)) + return -EINVAL; + wc.sl = 0; + wc.dlid_path_bits = 0; + wc.port_num = ctx->port; + wc.slid = ah_attr.dlid; /* opensm lid */ + wc.src_qp = 1; + return mlx4_ib_send_to_slave(dev, slave, ctx->port, IB_QPT_GSI, &wc, NULL, mad); +} + +static int send_join_to_wire(struct mcast_group *group, struct ib_sa_mad *sa_mad) +{ + struct ib_sa_mad mad; + struct ib_sa_mcmember_data *sa_mad_data = (struct ib_sa_mcmember_data *)&mad.data; + int ret; + + /* we rely on a mad request as arrived from a VF */ + memcpy(&mad, sa_mad, sizeof mad); + + /* fix port GID to be the real one (slave 0) */ + sa_mad_data->port_gid.global.interface_id = group->demux->guid_cache[0]; + + /* assign our own TID */ + mad.mad_hdr.tid = mlx4_ib_get_new_demux_tid(group->demux); + group->last_req_tid = mad.mad_hdr.tid; /* keep it for later validation */ + + ret = send_mad_to_wire(group->demux, (struct ib_mad *)&mad); + /* set timeout handler */ + if (!ret) { + /* calls mlx4_ib_mcg_timeout_handler */ + queue_delayed_work(group->demux->mcg_wq, &group->timeout_work, + msecs_to_jiffies(MAD_TIMEOUT_MS)); + } + + return ret; +} + +static int send_leave_to_wire(struct mcast_group *group, u8 join_state) +{ + struct ib_sa_mad mad; + struct ib_sa_mcmember_data *sa_data = (struct ib_sa_mcmember_data *)&mad.data; + int ret; + + memset(&mad, 0, sizeof mad); + mad.mad_hdr.base_version = 1; + mad.mad_hdr.mgmt_class = IB_MGMT_CLASS_SUBN_ADM; + mad.mad_hdr.class_version = 2; + mad.mad_hdr.method = IB_SA_METHOD_DELETE; + mad.mad_hdr.status = cpu_to_be16(0); + mad.mad_hdr.class_specific = cpu_to_be16(0); + mad.mad_hdr.tid = mlx4_ib_get_new_demux_tid(group->demux); + group->last_req_tid = mad.mad_hdr.tid; /* keep it for later validation */ + mad.mad_hdr.attr_id = cpu_to_be16(IB_SA_ATTR_MC_MEMBER_REC); + mad.mad_hdr.attr_mod = cpu_to_be32(0); + mad.sa_hdr.sm_key = 0x0; + mad.sa_hdr.attr_offset = cpu_to_be16(7); + mad.sa_hdr.comp_mask = IB_SA_MCMEMBER_REC_MGID | + IB_SA_MCMEMBER_REC_PORT_GID | IB_SA_MCMEMBER_REC_JOIN_STATE; + + *sa_data = group->rec; + sa_data->scope_join_state = join_state; + + ret = send_mad_to_wire(group->demux, (struct ib_mad *)&mad); + if (ret) + group->state = MCAST_IDLE; + + /* set timeout handler */ + if (!ret) { + /* calls mlx4_ib_mcg_timeout_handler */ + queue_delayed_work(group->demux->mcg_wq, &group->timeout_work, + msecs_to_jiffies(MAD_TIMEOUT_MS)); + } + + return ret; +} + +static int send_reply_to_slave(int slave, struct mcast_group *group, + struct ib_sa_mad *req_sa_mad, u16 status) +{ + struct ib_sa_mad mad; + struct ib_sa_mcmember_data *sa_data = (struct ib_sa_mcmember_data *)&mad.data; + struct ib_sa_mcmember_data *req_sa_data = (struct ib_sa_mcmember_data *)&req_sa_mad->data; + int ret; + + memset(&mad, 0, sizeof mad); + mad.mad_hdr.base_version = 1; + mad.mad_hdr.mgmt_class = IB_MGMT_CLASS_SUBN_ADM; + mad.mad_hdr.class_version = 2; + mad.mad_hdr.method = IB_MGMT_METHOD_GET_RESP; + mad.mad_hdr.status = cpu_to_be16(status); + mad.mad_hdr.class_specific = cpu_to_be16(0); + mad.mad_hdr.tid = req_sa_mad->mad_hdr.tid; + *(u8 *)&mad.mad_hdr.tid = 0; /* resetting tid to 0 */ + mad.mad_hdr.attr_id = cpu_to_be16(IB_SA_ATTR_MC_MEMBER_REC); + mad.mad_hdr.attr_mod = cpu_to_be32(0); + mad.sa_hdr.sm_key = req_sa_mad->sa_hdr.sm_key; + mad.sa_hdr.attr_offset = cpu_to_be16(7); + mad.sa_hdr.comp_mask = 0; /* ignored on responses, see IBTA spec */ + + *sa_data = group->rec; + + /* reconstruct VF's requested join_state and port_gid */ + sa_data->scope_join_state &= 0xf0; + sa_data->scope_join_state |= (group->func[slave].join_state & 0x0f); + memcpy(&sa_data->port_gid, &req_sa_data->port_gid, sizeof req_sa_data->port_gid); + + ret = send_mad_to_slave(slave, group->demux, (struct ib_mad *)&mad); + return ret; +} + +static int check_selector(ib_sa_comp_mask comp_mask, + ib_sa_comp_mask selector_mask, + ib_sa_comp_mask value_mask, + u8 src_value, u8 dst_value) +{ + int err; + u8 selector = dst_value >> 6; + dst_value &= 0x3f; + src_value &= 0x3f; + + if (!(comp_mask & selector_mask) || !(comp_mask & value_mask)) + return 0; + + switch (selector) { + case IB_SA_GT: + err = (src_value <= dst_value); + break; + case IB_SA_LT: + err = (src_value >= dst_value); + break; + case IB_SA_EQ: + err = (src_value != dst_value); + break; + default: + err = 0; + break; + } + + return err; +} + +static u16 cmp_rec(struct ib_sa_mcmember_data *src, + struct ib_sa_mcmember_data *dst, ib_sa_comp_mask comp_mask) +{ + /* src is group record, dst is request record */ + /* MGID must already match */ + /* Port_GID we always replace to our Port_GID, so it is a match */ + +#define MAD_STATUS_REQ_INVALID 0x0200 + if (comp_mask & IB_SA_MCMEMBER_REC_QKEY && src->qkey != dst->qkey) + return MAD_STATUS_REQ_INVALID; + if (comp_mask & IB_SA_MCMEMBER_REC_MLID && src->mlid != dst->mlid) + return MAD_STATUS_REQ_INVALID; + if (check_selector(comp_mask, IB_SA_MCMEMBER_REC_MTU_SELECTOR, + IB_SA_MCMEMBER_REC_MTU, + src->mtusel_mtu, dst->mtusel_mtu)) + return MAD_STATUS_REQ_INVALID; + if (comp_mask & IB_SA_MCMEMBER_REC_TRAFFIC_CLASS && + src->tclass != dst->tclass) + return MAD_STATUS_REQ_INVALID; + if (comp_mask & IB_SA_MCMEMBER_REC_PKEY && src->pkey != dst->pkey) + return MAD_STATUS_REQ_INVALID; + if (check_selector(comp_mask, IB_SA_MCMEMBER_REC_RATE_SELECTOR, + IB_SA_MCMEMBER_REC_RATE, + src->ratesel_rate, dst->ratesel_rate)) + return MAD_STATUS_REQ_INVALID; + if (check_selector(comp_mask, + IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME_SELECTOR, + IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME, + src->lifetmsel_lifetm, dst->lifetmsel_lifetm)) + return MAD_STATUS_REQ_INVALID; + if (comp_mask & IB_SA_MCMEMBER_REC_SL && + (be32_to_cpu(src->sl_flowlabel_hoplimit) & 0xf0000000) != + (be32_to_cpu(dst->sl_flowlabel_hoplimit) & 0xf0000000)) + return MAD_STATUS_REQ_INVALID; + if (comp_mask & IB_SA_MCMEMBER_REC_FLOW_LABEL && + (be32_to_cpu(src->sl_flowlabel_hoplimit) & 0x0fffff00) != + (be32_to_cpu(dst->sl_flowlabel_hoplimit) & 0x0fffff00)) + return MAD_STATUS_REQ_INVALID; + if (comp_mask & IB_SA_MCMEMBER_REC_HOP_LIMIT && + (be32_to_cpu(src->sl_flowlabel_hoplimit) & 0x000000ff) != + (be32_to_cpu(dst->sl_flowlabel_hoplimit) & 0x000000ff)) + return MAD_STATUS_REQ_INVALID; + if (comp_mask & IB_SA_MCMEMBER_REC_SCOPE && + (src->scope_join_state & 0xf0) != + (dst->scope_join_state & 0xf0)) + return MAD_STATUS_REQ_INVALID; + + /* join_state checked separately, proxy_join ignored */ + + return 0; +} + +/* release group, return 1 if this was last release and group is destroyed + * timout work is canceled sync */ +static int release_group(struct mcast_group *group, int from_timeout_handler) +{ + struct mlx4_ib_demux_ctx *ctx = group->demux; + int nzgroup; + + mutex_lock(&ctx->mcg_table_lock); + mutex_lock(&group->lock); + if (atomic_dec_and_test(&group->refcount)) { + if (!from_timeout_handler) { + if (group->state != MCAST_IDLE && + !cancel_delayed_work(&group->timeout_work)) { + atomic_inc(&group->refcount); + mutex_unlock(&group->lock); + mutex_unlock(&ctx->mcg_table_lock); + return 0; + } + } + + nzgroup = memcmp(&group->rec.mgid, &mgid0, sizeof mgid0); + if (nzgroup) + del_sysfs_port_mcg_attr(ctx->dev, ctx->port, &group->dentry.attr); + if (!list_empty(&group->pending_list)) + mcg_warn_group(group, "releasing a group with non empty pending list\n"); + if (nzgroup) + rb_erase(&group->node, &ctx->mcg_table); + list_del_init(&group->mgid0_list); + mutex_unlock(&group->lock); + mutex_unlock(&ctx->mcg_table_lock); + kfree(group); + return 1; + } else { + mutex_unlock(&group->lock); + mutex_unlock(&ctx->mcg_table_lock); + } + return 0; +} + +static void adjust_membership(struct mcast_group *group, u8 join_state, int inc) +{ + int i; + + for (i = 0; i < 3; i++, join_state >>= 1) + if (join_state & 0x1) + group->members[i] += inc; +} + +static u8 get_leave_state(struct mcast_group *group) +{ + u8 leave_state = 0; + int i; + + for (i = 0; i < 3; i++) + if (!group->members[i]) + leave_state |= (1 << i); + + return leave_state & (group->rec.scope_join_state & 7); +} + +static int join_group(struct mcast_group *group, int slave, u8 join_mask) +{ + int ret = 0; + u8 join_state; + + /* remove bits that slave is already member of, and adjust */ + join_state = join_mask & (~group->func[slave].join_state); + adjust_membership(group, join_state, 1); + group->func[slave].join_state |= join_state; + if (group->func[slave].state != MCAST_MEMBER && join_state) { + group->func[slave].state = MCAST_MEMBER; + ret = 1; + } + return ret; +} + +static int leave_group(struct mcast_group *group, int slave, u8 leave_state) +{ + int ret = 0; + + adjust_membership(group, leave_state, -1); + group->func[slave].join_state &= ~leave_state; + if (!group->func[slave].join_state) { + group->func[slave].state = MCAST_NOT_MEMBER; + ret = 1; + } + return ret; +} + +static int check_leave(struct mcast_group *group, int slave, u8 leave_mask) +{ + if (group->func[slave].state != MCAST_MEMBER) + return MAD_STATUS_REQ_INVALID; + + /* make sure we're not deleting unset bits */ + if (~group->func[slave].join_state & leave_mask) + return MAD_STATUS_REQ_INVALID; + + if (!leave_mask) + return MAD_STATUS_REQ_INVALID; + + return 0; +} + +static void mlx4_ib_mcg_timeout_handler(struct work_struct *work) +{ + struct delayed_work *delay = to_delayed_work(work); + struct mcast_group *group; + struct mcast_req *req = NULL; + + group = container_of(delay, typeof(*group), timeout_work); + + mutex_lock(&group->lock); + if (group->state == MCAST_JOIN_SENT) { + if (!list_empty(&group->pending_list)) { + req = list_first_entry(&group->pending_list, struct mcast_req, group_list); + list_del(&req->group_list); + list_del(&req->func_list); + --group->func[req->func].num_pend_reqs; + mutex_unlock(&group->lock); + kfree(req); + if (memcmp(&group->rec.mgid, &mgid0, sizeof mgid0)) { + if (release_group(group, 1)) + return; + } else { + kfree(group); + return; + } + mutex_lock(&group->lock); + } else + mcg_warn_group(group, "DRIVER BUG\n"); + } else if (group->state == MCAST_LEAVE_SENT) { + if (group->rec.scope_join_state & 7) + group->rec.scope_join_state &= 0xf8; + group->state = MCAST_IDLE; + mutex_unlock(&group->lock); + if (release_group(group, 1)) + return; + mutex_lock(&group->lock); + } else + mcg_warn_group(group, "invalid state %s\n", get_state_string(group->state)); + group->state = MCAST_IDLE; + atomic_inc(&group->refcount); + if (!queue_work(group->demux->mcg_wq, &group->work)) + safe_atomic_dec(&group->refcount); + + mutex_unlock(&group->lock); +} + +static int handle_leave_req(struct mcast_group *group, u8 leave_mask, + struct mcast_req *req) +{ + u16 status; + + if (req->clean) + leave_mask = group->func[req->func].join_state; + + status = check_leave(group, req->func, leave_mask); + if (!status) + leave_group(group, req->func, leave_mask); + + if (!req->clean) + send_reply_to_slave(req->func, group, &req->sa_mad, status); + --group->func[req->func].num_pend_reqs; + list_del(&req->group_list); + list_del(&req->func_list); + kfree(req); + return 1; +} + +static int handle_join_req(struct mcast_group *group, u8 join_mask, + struct mcast_req *req) +{ + u8 group_join_state = group->rec.scope_join_state & 7; + int ref = 0; + u16 status; + struct ib_sa_mcmember_data *sa_data = (struct ib_sa_mcmember_data *)req->sa_mad.data; + + if (join_mask == (group_join_state & join_mask)) { + /* port's membership need not change */ + status = cmp_rec(&group->rec, sa_data, req->sa_mad.sa_hdr.comp_mask); + if (!status) + join_group(group, req->func, join_mask); + + --group->func[req->func].num_pend_reqs; + send_reply_to_slave(req->func, group, &req->sa_mad, status); + list_del(&req->group_list); + list_del(&req->func_list); + kfree(req); + ++ref; + } else { + /* port's membership needs to be updated */ + group->prev_state = group->state; + if (send_join_to_wire(group, &req->sa_mad)) { + --group->func[req->func].num_pend_reqs; + list_del(&req->group_list); + list_del(&req->func_list); + kfree(req); + ref = 1; + group->state = group->prev_state; + } else + group->state = MCAST_JOIN_SENT; + } + + return ref; +} + +static void mlx4_ib_mcg_work_handler(struct work_struct *work) +{ + struct mcast_group *group; + struct mcast_req *req = NULL; + struct ib_sa_mcmember_data *sa_data; + u8 req_join_state; + int rc = 1; /* release_count - this is for the scheduled work */ + u16 status; + u8 method; + + group = container_of(work, typeof(*group), work); + + mutex_lock(&group->lock); + + /* First, let's see if a response from SM is waiting regarding this group. + * If so, we need to update the group's REC. If this is a bad response, we + * may need to send a bad response to a VF waiting for it. If VF is waiting + * and this is a good response, the VF will be answered later in this func. */ + if (group->state == MCAST_RESP_READY) { + /* cancels mlx4_ib_mcg_timeout_handler */ + cancel_delayed_work(&group->timeout_work); + status = be16_to_cpu(group->response_sa_mad.mad_hdr.status); + method = group->response_sa_mad.mad_hdr.method; + if (group->last_req_tid != group->response_sa_mad.mad_hdr.tid) { + mcg_warn_group(group, "Got MAD response to existing MGID but wrong TID, dropping. Resp TID=%llx, group TID=%llx\n", + be64_to_cpu(group->response_sa_mad.mad_hdr.tid), + be64_to_cpu(group->last_req_tid)); + group->state = group->prev_state; + goto process_requests; + } + if (status) { + if (!list_empty(&group->pending_list)) + req = list_first_entry(&group->pending_list, + struct mcast_req, group_list); + if ((method == IB_MGMT_METHOD_GET_RESP)) { + if (req) { + send_reply_to_slave(req->func, group, &req->sa_mad, status); + --group->func[req->func].num_pend_reqs; + list_del(&req->group_list); + list_del(&req->func_list); + kfree(req); + ++rc; + } else + mcg_warn_group(group, "no request for failed join\n"); + } else if (method == IB_SA_METHOD_DELETE_RESP && group->demux->flushing) + ++rc; + } else { + u8 resp_join_state; + u8 cur_join_state; + + resp_join_state = ((struct ib_sa_mcmember_data *) + group->response_sa_mad.data)->scope_join_state & 7; + cur_join_state = group->rec.scope_join_state & 7; + + if (method == IB_MGMT_METHOD_GET_RESP) { + /* successfull join */ + if (!cur_join_state && resp_join_state) + --rc; + } else if (!resp_join_state) + ++rc; + memcpy(&group->rec, group->response_sa_mad.data, sizeof group->rec); + } + group->state = MCAST_IDLE; + } + +process_requests: + /* We should now go over pending join/leave requests, as long as we are idle. */ + while (!list_empty(&group->pending_list) && group->state == MCAST_IDLE) { + req = list_first_entry(&group->pending_list, struct mcast_req, + group_list); + sa_data = (struct ib_sa_mcmember_data *)req->sa_mad.data; + req_join_state = sa_data->scope_join_state & 0x7; + + /* For a leave request, we will immediately answer the VF, and + * update our internal counters. The actual leave will be sent + * to SM later, if at all needed. We dequeue the request now. */ + if (req->sa_mad.mad_hdr.method == IB_SA_METHOD_DELETE) + rc += handle_leave_req(group, req_join_state, req); + else + rc += handle_join_req(group, req_join_state, req); + } + + /* Handle leaves */ + if (group->state == MCAST_IDLE) { + req_join_state = get_leave_state(group); + if (req_join_state) { + group->rec.scope_join_state &= ~req_join_state; + group->prev_state = group->state; + if (send_leave_to_wire(group, req_join_state)) { + group->state = group->prev_state; + ++rc; + } else + group->state = MCAST_LEAVE_SENT; + } + } + + if (!list_empty(&group->pending_list) && group->state == MCAST_IDLE) + goto process_requests; + mutex_unlock(&group->lock); + + while (rc--) + release_group(group, 0); +} + +static struct mcast_group *search_relocate_mgid0_group(struct mlx4_ib_demux_ctx *ctx, + __be64 tid, + union ib_gid *new_mgid) +{ + struct mcast_group *group = NULL, *cur_group; + struct mcast_req *req; + struct list_head *pos; + struct list_head *n; + + mutex_lock(&ctx->mcg_table_lock); + list_for_each_safe(pos, n, &ctx->mcg_mgid0_list) { + group = list_entry(pos, struct mcast_group, mgid0_list); + mutex_lock(&group->lock); + if (group->last_req_tid == tid) { + if (memcmp(new_mgid, &mgid0, sizeof mgid0)) { + group->rec.mgid = *new_mgid; + sprintf(group->name, "%016llx%016llx", + be64_to_cpu(group->rec.mgid.global.subnet_prefix), + be64_to_cpu(group->rec.mgid.global.interface_id)); + list_del_init(&group->mgid0_list); + cur_group = mcast_insert(ctx, group); + if (cur_group) { + /* A race between our code and SM. Silently cleaning the new one */ + req = list_first_entry(&group->pending_list, + struct mcast_req, group_list); + --group->func[req->func].num_pend_reqs; + list_del(&req->group_list); + list_del(&req->func_list); + kfree(req); + mutex_unlock(&group->lock); + mutex_unlock(&ctx->mcg_table_lock); + release_group(group, 0); + return NULL; + } + + atomic_inc(&group->refcount); + add_sysfs_port_mcg_attr(ctx->dev, ctx->port, &group->dentry.attr); + mutex_unlock(&group->lock); + mutex_unlock(&ctx->mcg_table_lock); + return group; + } else { + struct mcast_req *tmp1, *tmp2; + + list_del(&group->mgid0_list); + if (!list_empty(&group->pending_list) && group->state != MCAST_IDLE) + cancel_delayed_work_sync(&group->timeout_work); + + list_for_each_entry_safe(tmp1, tmp2, &group->pending_list, group_list) { + list_del(&tmp1->group_list); + kfree(tmp1); + } + mutex_unlock(&group->lock); + mutex_unlock(&ctx->mcg_table_lock); + kfree(group); + return NULL; + } + } + mutex_unlock(&group->lock); + } + mutex_unlock(&ctx->mcg_table_lock); + + return NULL; +} + +static ssize_t sysfs_show_group(struct device *dev, + struct device_attribute *attr, char *buf); + +static struct mcast_group *acquire_group(struct mlx4_ib_demux_ctx *ctx, + union ib_gid *mgid, int create, + gfp_t gfp_mask) +{ + struct mcast_group *group, *cur_group; + int is_mgid0; + int i; + + is_mgid0 = !memcmp(&mgid0, mgid, sizeof mgid0); + if (!is_mgid0) { + group = mcast_find(ctx, mgid); + if (group) + goto found; + } + + if (!create) + return ERR_PTR(-ENOENT); + + group = kzalloc(sizeof *group, gfp_mask); + if (!group) + return ERR_PTR(-ENOMEM); + + group->demux = ctx; + group->rec.mgid = *mgid; + INIT_LIST_HEAD(&group->pending_list); + INIT_LIST_HEAD(&group->mgid0_list); + for (i = 0; i < MAX_VFS; ++i) + INIT_LIST_HEAD(&group->func[i].pending); + INIT_WORK(&group->work, mlx4_ib_mcg_work_handler); + INIT_DELAYED_WORK(&group->timeout_work, mlx4_ib_mcg_timeout_handler); + mutex_init(&group->lock); + sprintf(group->name, "%016llx%016llx", + be64_to_cpu(group->rec.mgid.global.subnet_prefix), + be64_to_cpu(group->rec.mgid.global.interface_id)); + sysfs_attr_init(&group->dentry.attr); + group->dentry.show = sysfs_show_group; + group->dentry.store = NULL; + group->dentry.attr.name = group->name; + group->dentry.attr.mode = 0400; + group->state = MCAST_IDLE; + + if (is_mgid0) { + list_add(&group->mgid0_list, &ctx->mcg_mgid0_list); + goto found; + } + + cur_group = mcast_insert(ctx, group); + if (cur_group) { + mcg_warn("group just showed up %s - confused\n", cur_group->name); + kfree(group); + return ERR_PTR(-EINVAL); + } + + add_sysfs_port_mcg_attr(ctx->dev, ctx->port, &group->dentry.attr); + +found: + atomic_inc(&group->refcount); + return group; +} + +static void queue_req(struct mcast_req *req) +{ + struct mcast_group *group = req->group; + + atomic_inc(&group->refcount); /* for the request */ + atomic_inc(&group->refcount); /* for scheduling the work */ + list_add_tail(&req->group_list, &group->pending_list); + list_add_tail(&req->func_list, &group->func[req->func].pending); + /* calls mlx4_ib_mcg_work_handler */ + if (!queue_work(group->demux->mcg_wq, &group->work)) + safe_atomic_dec(&group->refcount); +} + +int mlx4_ib_mcg_demux_handler(struct ib_device *ibdev, int port, int slave, + struct ib_sa_mad *mad) +{ + struct mlx4_ib_dev *dev = to_mdev(ibdev); + struct ib_sa_mcmember_data *rec = (struct ib_sa_mcmember_data *)mad->data; + struct mlx4_ib_demux_ctx *ctx = &dev->sriov.demux[port - 1]; + struct mcast_group *group; + + switch (mad->mad_hdr.method) { + case IB_MGMT_METHOD_GET_RESP: + case IB_SA_METHOD_DELETE_RESP: + mutex_lock(&ctx->mcg_table_lock); + group = acquire_group(ctx, &rec->mgid, 0, GFP_KERNEL); + mutex_unlock(&ctx->mcg_table_lock); + if (IS_ERR(group)) { + if (mad->mad_hdr.method == IB_MGMT_METHOD_GET_RESP) { + __be64 tid = mad->mad_hdr.tid; + *(u8 *)(&tid) = (u8)slave; /* in group we kept the modified TID */ + group = search_relocate_mgid0_group(ctx, tid, &rec->mgid); + } else + group = NULL; + } + + if (!group) + return 1; + + mutex_lock(&group->lock); + group->response_sa_mad = *mad; + group->prev_state = group->state; + group->state = MCAST_RESP_READY; + /* calls mlx4_ib_mcg_work_handler */ + atomic_inc(&group->refcount); + if (!queue_work(ctx->mcg_wq, &group->work)) + safe_atomic_dec(&group->refcount); + mutex_unlock(&group->lock); + release_group(group, 0); + return 1; /* consumed */ + case IB_MGMT_METHOD_SET: + case IB_SA_METHOD_GET_TABLE: + case IB_SA_METHOD_GET_TABLE_RESP: + case IB_SA_METHOD_DELETE: + return 0; /* not consumed, pass-through to guest over tunnel */ + default: + mcg_warn("In demux, port %d: unexpected MCMember method: 0x%x, dropping\n", + port, mad->mad_hdr.method); + return 1; /* consumed */ + } +} + +int mlx4_ib_mcg_multiplex_handler(struct ib_device *ibdev, int port, + int slave, struct ib_sa_mad *sa_mad) +{ + struct mlx4_ib_dev *dev = to_mdev(ibdev); + struct ib_sa_mcmember_data *rec = (struct ib_sa_mcmember_data *)sa_mad->data; + struct mlx4_ib_demux_ctx *ctx = &dev->sriov.demux[port - 1]; + struct mcast_group *group; + struct mcast_req *req; + int may_create = 0; + + if (ctx->flushing) + return -EAGAIN; + + switch (sa_mad->mad_hdr.method) { + case IB_MGMT_METHOD_SET: + may_create = 1; + case IB_SA_METHOD_DELETE: + req = kzalloc(sizeof *req, GFP_KERNEL); + if (!req) + return -ENOMEM; + + req->func = slave; + req->sa_mad = *sa_mad; + + mutex_lock(&ctx->mcg_table_lock); + group = acquire_group(ctx, &rec->mgid, may_create, GFP_KERNEL); + mutex_unlock(&ctx->mcg_table_lock); + if (IS_ERR(group)) { + kfree(req); + return PTR_ERR(group); + } + mutex_lock(&group->lock); + if (group->func[slave].num_pend_reqs > MAX_PEND_REQS_PER_FUNC) { + mutex_unlock(&group->lock); + mcg_warn_group(group, "Port %d, Func %d has too many pending requests (%d), dropping\n", + port, slave, MAX_PEND_REQS_PER_FUNC); + release_group(group, 0); + kfree(req); + return -ENOMEM; + } + ++group->func[slave].num_pend_reqs; + req->group = group; + queue_req(req); + mutex_unlock(&group->lock); + release_group(group, 0); + return 1; /* consumed */ + case IB_SA_METHOD_GET_TABLE: + case IB_MGMT_METHOD_GET_RESP: + case IB_SA_METHOD_GET_TABLE_RESP: + case IB_SA_METHOD_DELETE_RESP: + return 0; /* not consumed, pass-through */ + default: + mcg_warn("In multiplex, port %d, func %d: unexpected MCMember method: 0x%x, dropping\n", + port, slave, sa_mad->mad_hdr.method); + return 1; /* consumed */ + } +} + +static ssize_t sysfs_show_group(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct mcast_group *group = + container_of(attr, struct mcast_group, dentry); + struct mcast_req *req = NULL; + char pending_str[40]; + char state_str[40]; + ssize_t len = 0; + int f; + + if (group->state == MCAST_IDLE) + sprintf(state_str, "%s", get_state_string(group->state)); + else + sprintf(state_str, "%s(TID=0x%llx)", + get_state_string(group->state), + be64_to_cpu(group->last_req_tid)); + if (list_empty(&group->pending_list)) { + sprintf(pending_str, "No"); + } else { + req = list_first_entry(&group->pending_list, struct mcast_req, group_list); + sprintf(pending_str, "Yes(TID=0x%llx)", + be64_to_cpu(req->sa_mad.mad_hdr.tid)); + } + len += sprintf(buf + len, "%1d [%02d,%02d,%02d] %4d %4s %5s ", + group->rec.scope_join_state & 0xf, + group->members[2], group->members[1], group->members[0], + atomic_read(&group->refcount), + pending_str, + state_str); + for (f = 0; f < MAX_VFS; ++f) + if (group->func[f].state == MCAST_MEMBER) + len += sprintf(buf + len, "%d[%1x] ", + f, group->func[f].join_state); + + len += sprintf(buf + len, "\t\t(%4hx %4x %2x %2x %2x %2x %2x " + "%4x %4x %2x %2x)\n", + be16_to_cpu(group->rec.pkey), + be32_to_cpu(group->rec.qkey), + (group->rec.mtusel_mtu & 0xc0) >> 6, + group->rec.mtusel_mtu & 0x3f, + group->rec.tclass, + (group->rec.ratesel_rate & 0xc0) >> 6, + group->rec.ratesel_rate & 0x3f, + (be32_to_cpu(group->rec.sl_flowlabel_hoplimit) & 0xf0000000) >> 28, + (be32_to_cpu(group->rec.sl_flowlabel_hoplimit) & 0x0fffff00) >> 8, + be32_to_cpu(group->rec.sl_flowlabel_hoplimit) & 0x000000ff, + group->rec.proxy_join); + + return len; +} + +int mlx4_ib_mcg_port_init(struct mlx4_ib_demux_ctx *ctx) +{ + char name[20]; + + atomic_set(&ctx->tid, 0); + sprintf(name, "mlx4_ib_mcg%d", ctx->port); + ctx->mcg_wq = create_singlethread_workqueue(name); + if (!ctx->mcg_wq) + return -ENOMEM; + + mutex_init(&ctx->mcg_table_lock); + ctx->mcg_table = RB_ROOT; + INIT_LIST_HEAD(&ctx->mcg_mgid0_list); + ctx->flushing = 0; + + return 0; +} + +static void force_clean_group(struct mcast_group *group) +{ + struct mcast_req *req, *tmp + ; + list_for_each_entry_safe(req, tmp, &group->pending_list, group_list) { + list_del(&req->group_list); + kfree(req); + } + del_sysfs_port_mcg_attr(group->demux->dev, group->demux->port, &group->dentry.attr); + rb_erase(&group->node, &group->demux->mcg_table); + kfree(group); +} + +static void _mlx4_ib_mcg_port_cleanup(struct mlx4_ib_demux_ctx *ctx, int destroy_wq) +{ + int i; + struct rb_node *p; + struct mcast_group *group; + unsigned long end; + int count; + + for (i = 0; i < MAX_VFS; ++i) + clean_vf_mcast(ctx, i); + + end = jiffies + msecs_to_jiffies(MAD_TIMEOUT_MS + 3000); + do { + count = 0; + mutex_lock(&ctx->mcg_table_lock); + for (p = rb_first(&ctx->mcg_table); p; p = rb_next(p)) + ++count; + mutex_unlock(&ctx->mcg_table_lock); + if (!count) + break; + + msleep(1); + } while (time_after(end, jiffies)); + + flush_workqueue(ctx->mcg_wq); + if (destroy_wq) + destroy_workqueue(ctx->mcg_wq); + + mutex_lock(&ctx->mcg_table_lock); + while ((p = rb_first(&ctx->mcg_table)) != NULL) { + group = rb_entry(p, struct mcast_group, node); + if (atomic_read(&group->refcount)) + mcg_warn_group(group, "group refcount %d!!! (pointer %p)\n", atomic_read(&group->refcount), group); + + force_clean_group(group); + } + mutex_unlock(&ctx->mcg_table_lock); +} + +struct clean_work { + struct work_struct work; + struct mlx4_ib_demux_ctx *ctx; + int destroy_wq; +}; + +static void mcg_clean_task(struct work_struct *work) +{ + struct clean_work *cw = container_of(work, struct clean_work, work); + + _mlx4_ib_mcg_port_cleanup(cw->ctx, cw->destroy_wq); + cw->ctx->flushing = 0; + kfree(cw); +} + +void mlx4_ib_mcg_port_cleanup(struct mlx4_ib_demux_ctx *ctx, int destroy_wq) +{ + struct clean_work *work; + + if (ctx->flushing) + return; + + ctx->flushing = 1; + + if (destroy_wq) { + _mlx4_ib_mcg_port_cleanup(ctx, destroy_wq); + ctx->flushing = 0; + return; + } + + work = kmalloc(sizeof *work, GFP_KERNEL); + if (!work) { + ctx->flushing = 0; + mcg_warn("failed allocating work for cleanup\n"); + return; + } + + work->ctx = ctx; + work->destroy_wq = destroy_wq; + INIT_WORK(&work->work, mcg_clean_task); + queue_work(clean_wq, &work->work); +} + +static void build_leave_mad(struct mcast_req *req) +{ + struct ib_sa_mad *mad = &req->sa_mad; + + mad->mad_hdr.method = IB_SA_METHOD_DELETE; +} + + +static void clear_pending_reqs(struct mcast_group *group, int vf) +{ + struct mcast_req *req, *tmp, *group_first = NULL; + int clear; + int pend = 0; + + if (!list_empty(&group->pending_list)) + group_first = list_first_entry(&group->pending_list, struct mcast_req, group_list); + + list_for_each_entry_safe(req, tmp, &group->func[vf].pending, func_list) { + clear = 1; + if (group_first == req && + (group->state == MCAST_JOIN_SENT || + group->state == MCAST_LEAVE_SENT)) { + clear = cancel_delayed_work(&group->timeout_work); + pend = !clear; + group->state = MCAST_IDLE; + } + if (clear) { + --group->func[vf].num_pend_reqs; + list_del(&req->group_list); + list_del(&req->func_list); + kfree(req); + atomic_dec(&group->refcount); + } + } + + if (!pend && (!list_empty(&group->func[vf].pending) || group->func[vf].num_pend_reqs)) { + mcg_warn_group(group, "DRIVER BUG: list_empty %d, num_pend_reqs %d\n", + list_empty(&group->func[vf].pending), group->func[vf].num_pend_reqs); + } +} + +static int push_deleteing_req(struct mcast_group *group, int slave) +{ + struct mcast_req *req; + struct mcast_req *pend_req; + + if (!group->func[slave].join_state) + return 0; + + req = kzalloc(sizeof *req, GFP_KERNEL); + if (!req) { + mcg_warn_group(group, "failed allocation - may leave stall groups\n"); + return -ENOMEM; + } + + if (!list_empty(&group->func[slave].pending)) { + pend_req = list_entry(group->func[slave].pending.prev, struct mcast_req, group_list); + if (pend_req->clean) { + kfree(req); + return 0; + } + } + + req->clean = 1; + req->func = slave; + req->group = group; + ++group->func[slave].num_pend_reqs; + build_leave_mad(req); + queue_req(req); + return 0; +} + +void clean_vf_mcast(struct mlx4_ib_demux_ctx *ctx, int slave) +{ + struct mcast_group *group; + struct rb_node *p; + + mutex_lock(&ctx->mcg_table_lock); + for (p = rb_first(&ctx->mcg_table); p; p = rb_next(p)) { + group = rb_entry(p, struct mcast_group, node); + mutex_lock(&group->lock); + if (atomic_read(&group->refcount)) { + /* clear pending requests of this VF */ + clear_pending_reqs(group, slave); + push_deleteing_req(group, slave); + } + mutex_unlock(&group->lock); + } + mutex_unlock(&ctx->mcg_table_lock); +} + + +int mlx4_ib_mcg_init(void) +{ + clean_wq = create_singlethread_workqueue("mlx4_ib_mcg"); + if (!clean_wq) + return -ENOMEM; + + return 0; +} + +void mlx4_ib_mcg_destroy(void) +{ + destroy_workqueue(clean_wq); +} diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h new file mode 100644 index 0000000..6eb743f --- /dev/null +++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h @@ -0,0 +1,797 @@ +/* + * Copyright (c) 2006, 2007 Cisco Systems. All rights reserved. + * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef MLX4_IB_H +#define MLX4_IB_H + +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include + +#define MLX4_IB_DRV_NAME "mlx4_ib" + +#ifdef pr_fmt +#undef pr_fmt +#endif +#define pr_fmt(fmt) "<" MLX4_IB_DRV_NAME "> %s: " fmt, __func__ + +#define mlx4_ib_warn(ibdev, format, arg...) \ + dev_warn((ibdev)->dma_device, MLX4_IB_DRV_NAME ": " format, ## arg) + +enum { + MLX4_IB_SQ_MIN_WQE_SHIFT = 6, + MLX4_IB_MAX_HEADROOM = 2048 +}; + +#define MLX4_IB_SQ_HEADROOM(shift) ((MLX4_IB_MAX_HEADROOM >> (shift)) + 1) +#define MLX4_IB_SQ_MAX_SPARE (MLX4_IB_SQ_HEADROOM(MLX4_IB_SQ_MIN_WQE_SHIFT)) + +/*module param to indicate if SM assigns the alias_GUID*/ +extern int mlx4_ib_sm_guid_assign; + +#define MLX4_IB_UC_STEER_QPN_ALIGN 1 +#define MLX4_IB_UC_MAX_NUM_QPS 256 +struct mlx4_ib_ucontext { + struct ib_ucontext ibucontext; + struct mlx4_uar uar; + struct list_head db_page_list; + struct mutex db_page_mutex; +}; + +struct mlx4_ib_pd { + struct ib_pd ibpd; + u32 pdn; +}; + +struct mlx4_ib_xrcd { + struct ib_xrcd ibxrcd; + u32 xrcdn; + struct ib_pd *pd; + struct ib_cq *cq; +}; + +struct mlx4_ib_cq_buf { + struct mlx4_buf buf; + struct mlx4_mtt mtt; + int entry_size; +}; + +struct mlx4_ib_cq_resize { + struct mlx4_ib_cq_buf buf; + int cqe; +}; + +struct mlx4_ib_cq { + struct ib_cq ibcq; + struct mlx4_cq mcq; + struct mlx4_ib_cq_buf buf; + struct mlx4_ib_cq_resize *resize_buf; + struct mlx4_db db; + spinlock_t lock; + struct mutex resize_mutex; + struct ib_umem *umem; + struct ib_umem *resize_umem; +}; + +struct mlx4_ib_mr { + struct ib_mr ibmr; + struct mlx4_mr mmr; + struct ib_umem *umem; +}; + +struct mlx4_ib_mw { + struct ib_mw ibmw; + struct mlx4_mw mmw; +}; + +struct mlx4_ib_fast_reg_page_list { + struct ib_fast_reg_page_list ibfrpl; + __be64 *mapped_page_list; + dma_addr_t map; +}; + +struct mlx4_ib_fmr { + struct ib_fmr ibfmr; + struct mlx4_fmr mfmr; +}; + +struct mlx4_ib_flow { + struct ib_flow ibflow; + /* translating DMFS verbs sniffer rule to FW API requires two reg IDs */ + u64 reg_id[2]; +}; + +struct mlx4_ib_wq { + u64 *wrid; + spinlock_t lock; + int wqe_cnt; + int max_post; + int max_gs; + int offset; + int wqe_shift; + unsigned head; + unsigned tail; +}; + +enum mlx4_ib_qp_flags { + MLX4_IB_QP_LSO = IB_QP_CREATE_IPOIB_UD_LSO, + MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK = IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK, + MLX4_IB_QP_NETIF = IB_QP_CREATE_NETIF_QP, + MLX4_IB_QP_CREATE_USE_GFP_NOIO = IB_QP_CREATE_USE_GFP_NOIO, + MLX4_IB_SRIOV_TUNNEL_QP = 1 << 30, + MLX4_IB_SRIOV_SQP = 1 << 31, +}; + +struct mlx4_ib_gid_entry { + struct list_head list; + union ib_gid gid; + int added; + u8 port; +}; + +enum mlx4_ib_qp_type { + /* + * IB_QPT_SMI and IB_QPT_GSI have to be the first two entries + * here (and in that order) since the MAD layer uses them as + * indices into a 2-entry table. + */ + MLX4_IB_QPT_SMI = IB_QPT_SMI, + MLX4_IB_QPT_GSI = IB_QPT_GSI, + + MLX4_IB_QPT_RC = IB_QPT_RC, + MLX4_IB_QPT_UC = IB_QPT_UC, + MLX4_IB_QPT_UD = IB_QPT_UD, + MLX4_IB_QPT_RAW_IPV6 = IB_QPT_RAW_IPV6, + MLX4_IB_QPT_RAW_ETHERTYPE = IB_QPT_RAW_ETHERTYPE, + MLX4_IB_QPT_RAW_PACKET = IB_QPT_RAW_PACKET, + MLX4_IB_QPT_XRC_INI = IB_QPT_XRC_INI, + MLX4_IB_QPT_XRC_TGT = IB_QPT_XRC_TGT, + + MLX4_IB_QPT_PROXY_SMI_OWNER = 1 << 16, + MLX4_IB_QPT_PROXY_SMI = 1 << 17, + MLX4_IB_QPT_PROXY_GSI = 1 << 18, + MLX4_IB_QPT_TUN_SMI_OWNER = 1 << 19, + MLX4_IB_QPT_TUN_SMI = 1 << 20, + MLX4_IB_QPT_TUN_GSI = 1 << 21, +}; + +#define MLX4_IB_QPT_ANY_SRIOV (MLX4_IB_QPT_PROXY_SMI_OWNER | \ + MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_GSI | MLX4_IB_QPT_TUN_SMI_OWNER | \ + MLX4_IB_QPT_TUN_SMI | MLX4_IB_QPT_TUN_GSI) + +enum mlx4_ib_mad_ifc_flags { + MLX4_MAD_IFC_IGNORE_MKEY = 1, + MLX4_MAD_IFC_IGNORE_BKEY = 2, + MLX4_MAD_IFC_IGNORE_KEYS = (MLX4_MAD_IFC_IGNORE_MKEY | + MLX4_MAD_IFC_IGNORE_BKEY), + MLX4_MAD_IFC_NET_VIEW = 4, +}; + +enum { + MLX4_NUM_TUNNEL_BUFS = 256, +}; + +struct mlx4_ib_tunnel_header { + struct mlx4_av av; + __be32 remote_qpn; + __be32 qkey; + __be16 vlan; + u8 mac[6]; + __be16 pkey_index; + u8 reserved[6]; +}; + +struct mlx4_ib_buf { + void *addr; + dma_addr_t map; +}; + +struct mlx4_rcv_tunnel_hdr { + __be32 flags_src_qp; /* flags[6:5] is defined for VLANs: + * 0x0 - no vlan was in the packet + * 0x01 - C-VLAN was in the packet */ + u8 g_ml_path; /* gid bit stands for ipv6/4 header in RoCE */ + u8 reserved; + __be16 pkey_index; + __be16 sl_vid; + __be16 slid_mac_47_32; + __be32 mac_31_0; +}; + +struct mlx4_ib_proxy_sqp_hdr { + struct ib_grh grh; + struct mlx4_rcv_tunnel_hdr tun; +} __packed; + +struct mlx4_roce_smac_vlan_info { + u64 smac; + int smac_index; + int smac_port; + u64 candidate_smac; + int candidate_smac_index; + int candidate_smac_port; + u16 vid; + int vlan_index; + int vlan_port; + u16 candidate_vid; + int candidate_vlan_index; + int candidate_vlan_port; + int update_vid; +}; + +struct mlx4_ib_qp { + struct ib_qp ibqp; + struct mlx4_qp mqp; + struct mlx4_buf buf; + + struct mlx4_db db; + struct mlx4_ib_wq rq; + + u32 doorbell_qpn; + __be32 sq_signal_bits; + unsigned sq_next_wqe; + int sq_max_wqes_per_wr; + int sq_spare_wqes; + struct mlx4_ib_wq sq; + + enum mlx4_ib_qp_type mlx4_ib_qp_type; + struct ib_umem *umem; + struct mlx4_mtt mtt; + int buf_size; + struct mutex mutex; + u16 xrcdn; + u32 flags; + u8 port; + u8 alt_port; + u8 atomic_rd_en; + u8 resp_depth; + u8 sq_no_prefetch; + u8 state; + int mlx_type; + struct list_head gid_list; + struct list_head steering_rules; + struct mlx4_ib_buf *sqp_proxy_rcv; + struct mlx4_roce_smac_vlan_info pri; + struct mlx4_roce_smac_vlan_info alt; + u64 reg_id; +}; + +struct mlx4_ib_srq { + struct ib_srq ibsrq; + struct mlx4_srq msrq; + struct mlx4_buf buf; + struct mlx4_db db; + u64 *wrid; + spinlock_t lock; + int head; + int tail; + u16 wqe_ctr; + struct ib_umem *umem; + struct mlx4_mtt mtt; + struct mutex mutex; +}; + +struct mlx4_ib_ah { + struct ib_ah ibah; + union mlx4_ext_av av; +}; + +/****************************************/ +/* alias guid support */ +/****************************************/ +#define NUM_PORT_ALIAS_GUID 2 +#define NUM_ALIAS_GUID_IN_REC 8 +#define NUM_ALIAS_GUID_REC_IN_PORT 16 +#define GUID_REC_SIZE 8 +#define NUM_ALIAS_GUID_PER_PORT 128 +#define MLX4_NOT_SET_GUID (0x00LL) +#define MLX4_GUID_FOR_DELETE_VAL (~(0x00LL)) + +enum mlx4_guid_alias_rec_status { + MLX4_GUID_INFO_STATUS_IDLE, + MLX4_GUID_INFO_STATUS_SET, + MLX4_GUID_INFO_STATUS_PENDING, +}; + +enum mlx4_guid_alias_rec_ownership { + MLX4_GUID_DRIVER_ASSIGN, + MLX4_GUID_SYSADMIN_ASSIGN, + MLX4_GUID_NONE_ASSIGN, /*init state of each record*/ +}; + +enum mlx4_guid_alias_rec_method { + MLX4_GUID_INFO_RECORD_SET = IB_MGMT_METHOD_SET, + MLX4_GUID_INFO_RECORD_DELETE = IB_SA_METHOD_DELETE, +}; + +struct mlx4_sriov_alias_guid_info_rec_det { + u8 all_recs[GUID_REC_SIZE * NUM_ALIAS_GUID_IN_REC]; + ib_sa_comp_mask guid_indexes; /*indicates what from the 8 records are valid*/ + enum mlx4_guid_alias_rec_status status; /*indicates the administraively status of the record.*/ + u8 method; /*set or delete*/ + enum mlx4_guid_alias_rec_ownership ownership; /*indicates who assign that alias_guid record*/ +}; + +struct mlx4_sriov_alias_guid_port_rec_det { + struct mlx4_sriov_alias_guid_info_rec_det all_rec_per_port[NUM_ALIAS_GUID_REC_IN_PORT]; + struct workqueue_struct *wq; + struct delayed_work alias_guid_work; + u8 port; + struct mlx4_sriov_alias_guid *parent; + struct list_head cb_list; +}; + +struct mlx4_sriov_alias_guid { + struct mlx4_sriov_alias_guid_port_rec_det ports_guid[MLX4_MAX_PORTS]; + spinlock_t ag_work_lock; + struct ib_sa_client *sa_client; +}; + +struct mlx4_ib_demux_work { + struct work_struct work; + struct mlx4_ib_dev *dev; + int slave; + int do_init; + u8 port; + +}; + +struct mlx4_ib_tun_tx_buf { + struct mlx4_ib_buf buf; + struct ib_ah *ah; +}; + +struct mlx4_ib_demux_pv_qp { + struct ib_qp *qp; + enum ib_qp_type proxy_qpt; + struct mlx4_ib_buf *ring; + struct mlx4_ib_tun_tx_buf *tx_ring; + spinlock_t tx_lock; + unsigned tx_ix_head; + unsigned tx_ix_tail; +}; + +enum mlx4_ib_demux_pv_state { + DEMUX_PV_STATE_DOWN, + DEMUX_PV_STATE_STARTING, + DEMUX_PV_STATE_ACTIVE, + DEMUX_PV_STATE_DOWNING, +}; + +struct mlx4_ib_demux_pv_ctx { + int port; + int slave; + enum mlx4_ib_demux_pv_state state; + int has_smi; + struct ib_device *ib_dev; + struct ib_cq *cq; + struct ib_pd *pd; + struct ib_mr *mr; + struct work_struct work; + struct workqueue_struct *wq; + struct mlx4_ib_demux_pv_qp qp[2]; +}; + +struct mlx4_ib_demux_ctx { + struct ib_device *ib_dev; + int port; + struct workqueue_struct *wq; + struct workqueue_struct *ud_wq; + spinlock_t ud_lock; + __be64 subnet_prefix; + __be64 guid_cache[128]; + struct mlx4_ib_dev *dev; + /* the following lock protects both mcg_table and mcg_mgid0_list */ + struct mutex mcg_table_lock; + struct rb_root mcg_table; + struct list_head mcg_mgid0_list; + struct workqueue_struct *mcg_wq; + struct mlx4_ib_demux_pv_ctx **tun; + atomic_t tid; + int flushing; /* flushing the work queue */ +}; + +struct mlx4_ib_sriov { + struct mlx4_ib_demux_ctx demux[MLX4_MAX_PORTS]; + struct mlx4_ib_demux_pv_ctx *sqps[MLX4_MAX_PORTS]; + /* when using this spinlock you should use "irq" because + * it may be called from interrupt context.*/ + spinlock_t going_down_lock; + int is_going_down; + + struct mlx4_sriov_alias_guid alias_guid; + + /* CM paravirtualization fields */ + struct list_head cm_list; + spinlock_t id_map_lock; + struct rb_root sl_id_map; + struct idr pv_id_table; +}; + +struct mlx4_ib_iboe { + spinlock_t lock; + struct net_device *netdevs[MLX4_MAX_PORTS]; + struct net_device *masters[MLX4_MAX_PORTS]; + atomic64_t mac[MLX4_MAX_PORTS]; + struct notifier_block nb; + struct notifier_block nb_inet; + struct notifier_block nb_inet6; + union ib_gid gid_table[MLX4_MAX_PORTS][128]; +}; + +struct pkey_mgt { + u8 virt2phys_pkey[MLX4_MFUNC_MAX][MLX4_MAX_PORTS][MLX4_MAX_PORT_PKEYS]; + u16 phys_pkey_cache[MLX4_MAX_PORTS][MLX4_MAX_PORT_PKEYS]; + struct list_head pkey_port_list[MLX4_MFUNC_MAX]; + struct kobject *device_parent[MLX4_MFUNC_MAX]; +}; + +struct mlx4_ib_iov_sysfs_attr { + void *ctx; + struct kobject *kobj; + unsigned long data; + u32 entry_num; + char name[15]; + struct device_attribute dentry; + struct device *dev; +}; + +struct mlx4_ib_iov_sysfs_attr_ar { + struct mlx4_ib_iov_sysfs_attr dentries[3 * NUM_ALIAS_GUID_PER_PORT + 1]; +}; + +struct mlx4_ib_iov_port { + char name[100]; + u8 num; + struct mlx4_ib_dev *dev; + struct list_head list; + struct mlx4_ib_iov_sysfs_attr_ar *dentr_ar; + struct ib_port_attr attr; + struct kobject *cur_port; + struct kobject *admin_alias_parent; + struct kobject *gids_parent; + struct kobject *pkeys_parent; + struct kobject *mcgs_parent; + struct mlx4_ib_iov_sysfs_attr mcg_dentry; +}; + +struct mlx4_ib_dev { + struct ib_device ib_dev; + struct mlx4_dev *dev; + int num_ports; + void __iomem *uar_map; + + struct mlx4_uar priv_uar; + u32 priv_pdn; + MLX4_DECLARE_DOORBELL_LOCK(uar_lock); + + struct ib_mad_agent *send_agent[MLX4_MAX_PORTS][2]; + struct ib_ah *sm_ah[MLX4_MAX_PORTS]; + spinlock_t sm_lock; + struct mlx4_ib_sriov sriov; + + struct mutex cap_mask_mutex; + bool ib_active; + struct mlx4_ib_iboe iboe; + int counters[MLX4_MAX_PORTS]; + int *eq_table; + int eq_added; + struct kobject *iov_parent; + struct kobject *ports_parent; + struct kobject *dev_ports_parent[MLX4_MFUNC_MAX]; + struct mlx4_ib_iov_port iov_ports[MLX4_MAX_PORTS]; + struct pkey_mgt pkeys; + unsigned long *ib_uc_qpns_bitmap; + int steer_qpn_count; + int steer_qpn_base; + int steering_support; + struct mlx4_ib_qp *qp1_proxy[MLX4_MAX_PORTS]; + /* lock when destroying qp1_proxy and getting netdev events */ + struct mutex qp1_proxy_lock[MLX4_MAX_PORTS]; +}; + +struct ib_event_work { + struct work_struct work; + struct mlx4_ib_dev *ib_dev; + struct mlx4_eqe ib_eqe; +}; + +struct mlx4_ib_qp_tunnel_init_attr { + struct ib_qp_init_attr init_attr; + int slave; + enum ib_qp_type proxy_qp_type; + u8 port; +}; + +static inline struct mlx4_ib_dev *to_mdev(struct ib_device *ibdev) +{ + return container_of(ibdev, struct mlx4_ib_dev, ib_dev); +} + +static inline struct mlx4_ib_ucontext *to_mucontext(struct ib_ucontext *ibucontext) +{ + return container_of(ibucontext, struct mlx4_ib_ucontext, ibucontext); +} + +static inline struct mlx4_ib_pd *to_mpd(struct ib_pd *ibpd) +{ + return container_of(ibpd, struct mlx4_ib_pd, ibpd); +} + +static inline struct mlx4_ib_xrcd *to_mxrcd(struct ib_xrcd *ibxrcd) +{ + return container_of(ibxrcd, struct mlx4_ib_xrcd, ibxrcd); +} + +static inline struct mlx4_ib_cq *to_mcq(struct ib_cq *ibcq) +{ + return container_of(ibcq, struct mlx4_ib_cq, ibcq); +} + +static inline struct mlx4_ib_cq *to_mibcq(struct mlx4_cq *mcq) +{ + return container_of(mcq, struct mlx4_ib_cq, mcq); +} + +static inline struct mlx4_ib_mr *to_mmr(struct ib_mr *ibmr) +{ + return container_of(ibmr, struct mlx4_ib_mr, ibmr); +} + +static inline struct mlx4_ib_mw *to_mmw(struct ib_mw *ibmw) +{ + return container_of(ibmw, struct mlx4_ib_mw, ibmw); +} + +static inline struct mlx4_ib_fast_reg_page_list *to_mfrpl(struct ib_fast_reg_page_list *ibfrpl) +{ + return container_of(ibfrpl, struct mlx4_ib_fast_reg_page_list, ibfrpl); +} + +static inline struct mlx4_ib_fmr *to_mfmr(struct ib_fmr *ibfmr) +{ + return container_of(ibfmr, struct mlx4_ib_fmr, ibfmr); +} + +static inline struct mlx4_ib_flow *to_mflow(struct ib_flow *ibflow) +{ + return container_of(ibflow, struct mlx4_ib_flow, ibflow); +} + +static inline struct mlx4_ib_qp *to_mqp(struct ib_qp *ibqp) +{ + return container_of(ibqp, struct mlx4_ib_qp, ibqp); +} + +static inline struct mlx4_ib_qp *to_mibqp(struct mlx4_qp *mqp) +{ + return container_of(mqp, struct mlx4_ib_qp, mqp); +} + +static inline struct mlx4_ib_srq *to_msrq(struct ib_srq *ibsrq) +{ + return container_of(ibsrq, struct mlx4_ib_srq, ibsrq); +} + +static inline struct mlx4_ib_srq *to_mibsrq(struct mlx4_srq *msrq) +{ + return container_of(msrq, struct mlx4_ib_srq, msrq); +} + +static inline struct mlx4_ib_ah *to_mah(struct ib_ah *ibah) +{ + return container_of(ibah, struct mlx4_ib_ah, ibah); +} + +int mlx4_ib_init_sriov(struct mlx4_ib_dev *dev); +void mlx4_ib_close_sriov(struct mlx4_ib_dev *dev); + +int mlx4_ib_db_map_user(struct mlx4_ib_ucontext *context, unsigned long virt, + struct mlx4_db *db); +void mlx4_ib_db_unmap_user(struct mlx4_ib_ucontext *context, struct mlx4_db *db); + +struct ib_mr *mlx4_ib_get_dma_mr(struct ib_pd *pd, int acc); +int mlx4_ib_umem_write_mtt(struct mlx4_ib_dev *dev, struct mlx4_mtt *mtt, + struct ib_umem *umem); +struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, + u64 virt_addr, int access_flags, + struct ib_udata *udata); +int mlx4_ib_dereg_mr(struct ib_mr *mr); +struct ib_mw *mlx4_ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type); +int mlx4_ib_bind_mw(struct ib_qp *qp, struct ib_mw *mw, + struct ib_mw_bind *mw_bind); +int mlx4_ib_dealloc_mw(struct ib_mw *mw); +struct ib_mr *mlx4_ib_alloc_fast_reg_mr(struct ib_pd *pd, + int max_page_list_len); +struct ib_fast_reg_page_list *mlx4_ib_alloc_fast_reg_page_list(struct ib_device *ibdev, + int page_list_len); +void mlx4_ib_free_fast_reg_page_list(struct ib_fast_reg_page_list *page_list); + +int mlx4_ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period); +int mlx4_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata); +struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev, int entries, int vector, + struct ib_ucontext *context, + struct ib_udata *udata); +int mlx4_ib_destroy_cq(struct ib_cq *cq); +int mlx4_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc); +int mlx4_ib_arm_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags); +void __mlx4_ib_cq_clean(struct mlx4_ib_cq *cq, u32 qpn, struct mlx4_ib_srq *srq); +void mlx4_ib_cq_clean(struct mlx4_ib_cq *cq, u32 qpn, struct mlx4_ib_srq *srq); + +struct ib_ah *mlx4_ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr); +int mlx4_ib_query_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr); +int mlx4_ib_destroy_ah(struct ib_ah *ah); + +struct ib_srq *mlx4_ib_create_srq(struct ib_pd *pd, + struct ib_srq_init_attr *init_attr, + struct ib_udata *udata); +int mlx4_ib_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, + enum ib_srq_attr_mask attr_mask, struct ib_udata *udata); +int mlx4_ib_query_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr); +int mlx4_ib_destroy_srq(struct ib_srq *srq); +void mlx4_ib_free_srq_wqe(struct mlx4_ib_srq *srq, int wqe_index); +int mlx4_ib_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr); + +struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd, + struct ib_qp_init_attr *init_attr, + struct ib_udata *udata); +int mlx4_ib_destroy_qp(struct ib_qp *qp); +int mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata); +int mlx4_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr_mask, + struct ib_qp_init_attr *qp_init_attr); +int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, + struct ib_send_wr **bad_wr); +int mlx4_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr); + +int mlx4_MAD_IFC(struct mlx4_ib_dev *dev, int mad_ifc_flags, + int port, struct ib_wc *in_wc, struct ib_grh *in_grh, + void *in_mad, void *response_mad); +int mlx4_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, + struct ib_wc *in_wc, struct ib_grh *in_grh, + struct ib_mad *in_mad, struct ib_mad *out_mad); +int mlx4_ib_mad_init(struct mlx4_ib_dev *dev); +void mlx4_ib_mad_cleanup(struct mlx4_ib_dev *dev); + +struct ib_fmr *mlx4_ib_fmr_alloc(struct ib_pd *pd, int mr_access_flags, + struct ib_fmr_attr *fmr_attr); +int mlx4_ib_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list, int npages, + u64 iova); +int mlx4_ib_unmap_fmr(struct list_head *fmr_list); +int mlx4_ib_fmr_dealloc(struct ib_fmr *fmr); +int __mlx4_ib_query_port(struct ib_device *ibdev, u8 port, + struct ib_port_attr *props, int netw_view); +int __mlx4_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index, + u16 *pkey, int netw_view); + +int __mlx4_ib_query_gid(struct ib_device *ibdev, u8 port, int index, + union ib_gid *gid, int netw_view); + +static inline bool mlx4_ib_ah_grh_present(struct mlx4_ib_ah *ah) +{ + u8 port = be32_to_cpu(ah->av.ib.port_pd) >> 24 & 3; + + if (rdma_port_get_link_layer(ah->ibah.device, port) == IB_LINK_LAYER_ETHERNET) + return true; + + return !!(ah->av.ib.g_slid & 0x80); +} + +int mlx4_ib_mcg_port_init(struct mlx4_ib_demux_ctx *ctx); +void mlx4_ib_mcg_port_cleanup(struct mlx4_ib_demux_ctx *ctx, int destroy_wq); +void clean_vf_mcast(struct mlx4_ib_demux_ctx *ctx, int slave); +int mlx4_ib_mcg_init(void); +void mlx4_ib_mcg_destroy(void); + +int mlx4_ib_find_real_gid(struct ib_device *ibdev, u8 port, __be64 guid); + +int mlx4_ib_mcg_multiplex_handler(struct ib_device *ibdev, int port, int slave, + struct ib_sa_mad *sa_mad); +int mlx4_ib_mcg_demux_handler(struct ib_device *ibdev, int port, int slave, + struct ib_sa_mad *mad); + +int mlx4_ib_add_mc(struct mlx4_ib_dev *mdev, struct mlx4_ib_qp *mqp, + union ib_gid *gid); + +void mlx4_ib_dispatch_event(struct mlx4_ib_dev *dev, u8 port_num, + enum ib_event_type type); + +void mlx4_ib_tunnels_update_work(struct work_struct *work); + +int mlx4_ib_send_to_slave(struct mlx4_ib_dev *dev, int slave, u8 port, + enum ib_qp_type qpt, struct ib_wc *wc, + struct ib_grh *grh, struct ib_mad *mad); + +int mlx4_ib_send_to_wire(struct mlx4_ib_dev *dev, int slave, u8 port, + enum ib_qp_type dest_qpt, u16 pkey_index, u32 remote_qpn, + u32 qkey, struct ib_ah_attr *attr, u8 *s_mac, + struct ib_mad *mad); + +__be64 mlx4_ib_get_new_demux_tid(struct mlx4_ib_demux_ctx *ctx); + +int mlx4_ib_demux_cm_handler(struct ib_device *ibdev, int port, int *slave, + struct ib_mad *mad); + +int mlx4_ib_multiplex_cm_handler(struct ib_device *ibdev, int port, int slave_id, + struct ib_mad *mad); + +void mlx4_ib_cm_paravirt_init(struct mlx4_ib_dev *dev); +void mlx4_ib_cm_paravirt_clean(struct mlx4_ib_dev *dev, int slave_id); + +/* alias guid support */ +void mlx4_ib_init_alias_guid_work(struct mlx4_ib_dev *dev, int port); +int mlx4_ib_init_alias_guid_service(struct mlx4_ib_dev *dev); +void mlx4_ib_destroy_alias_guid_service(struct mlx4_ib_dev *dev); +void mlx4_ib_invalidate_all_guid_record(struct mlx4_ib_dev *dev, int port); + +void mlx4_ib_notify_slaves_on_guid_change(struct mlx4_ib_dev *dev, + int block_num, + u8 port_num, u8 *p_data); + +void mlx4_ib_update_cache_on_guid_change(struct mlx4_ib_dev *dev, + int block_num, u8 port_num, + u8 *p_data); + +int add_sysfs_port_mcg_attr(struct mlx4_ib_dev *device, int port_num, + struct attribute *attr); +void del_sysfs_port_mcg_attr(struct mlx4_ib_dev *device, int port_num, + struct attribute *attr); +ib_sa_comp_mask mlx4_ib_get_aguid_comp_mask_from_ix(int index); + +int mlx4_ib_device_register_sysfs(struct mlx4_ib_dev *device) ; + +void mlx4_ib_device_unregister_sysfs(struct mlx4_ib_dev *device); + +__be64 mlx4_ib_gen_node_guid(void); + +int mlx4_ib_steer_qp_alloc(struct mlx4_ib_dev *dev, int count, int *qpn); +void mlx4_ib_steer_qp_free(struct mlx4_ib_dev *dev, u32 qpn, int count); +int mlx4_ib_steer_qp_reg(struct mlx4_ib_dev *mdev, struct mlx4_ib_qp *mqp, + int is_attach); +int mlx4_ib_rereg_user_mr(struct ib_mr *mr, int flags, + u64 start, u64 length, u64 virt_addr, + int mr_access_flags, struct ib_pd *pd, + struct ib_udata *udata); + +#endif /* MLX4_IB_H */ diff --git a/drivers/infiniband/hw/mlx4/mr.c b/drivers/infiniband/hw/mlx4/mr.c new file mode 100644 index 0000000..8f9325c --- /dev/null +++ b/drivers/infiniband/hw/mlx4/mr.c @@ -0,0 +1,524 @@ +/* + * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include + +#include "mlx4_ib.h" + +static u32 convert_access(int acc) +{ + return (acc & IB_ACCESS_REMOTE_ATOMIC ? MLX4_PERM_ATOMIC : 0) | + (acc & IB_ACCESS_REMOTE_WRITE ? MLX4_PERM_REMOTE_WRITE : 0) | + (acc & IB_ACCESS_REMOTE_READ ? MLX4_PERM_REMOTE_READ : 0) | + (acc & IB_ACCESS_LOCAL_WRITE ? MLX4_PERM_LOCAL_WRITE : 0) | + (acc & IB_ACCESS_MW_BIND ? MLX4_PERM_BIND_MW : 0) | + MLX4_PERM_LOCAL_READ; +} + +static enum mlx4_mw_type to_mlx4_type(enum ib_mw_type type) +{ + switch (type) { + case IB_MW_TYPE_1: return MLX4_MW_TYPE_1; + case IB_MW_TYPE_2: return MLX4_MW_TYPE_2; + default: return -1; + } +} + +struct ib_mr *mlx4_ib_get_dma_mr(struct ib_pd *pd, int acc) +{ + struct mlx4_ib_mr *mr; + int err; + + mr = kmalloc(sizeof *mr, GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + err = mlx4_mr_alloc(to_mdev(pd->device)->dev, to_mpd(pd)->pdn, 0, + ~0ull, convert_access(acc), 0, 0, &mr->mmr); + if (err) + goto err_free; + + err = mlx4_mr_enable(to_mdev(pd->device)->dev, &mr->mmr); + if (err) + goto err_mr; + + mr->ibmr.rkey = mr->ibmr.lkey = mr->mmr.key; + mr->umem = NULL; + + return &mr->ibmr; + +err_mr: + (void) mlx4_mr_free(to_mdev(pd->device)->dev, &mr->mmr); + +err_free: + kfree(mr); + + return ERR_PTR(err); +} + +int mlx4_ib_umem_write_mtt(struct mlx4_ib_dev *dev, struct mlx4_mtt *mtt, + struct ib_umem *umem) +{ + u64 *pages; + int i, k, entry; + int n; + int len; + int err = 0; + struct scatterlist *sg; + + pages = (u64 *) __get_free_page(GFP_KERNEL); + if (!pages) + return -ENOMEM; + + i = n = 0; + + for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) { + len = sg_dma_len(sg) >> mtt->page_shift; + for (k = 0; k < len; ++k) { + pages[i++] = sg_dma_address(sg) + + umem->page_size * k; + /* + * Be friendly to mlx4_write_mtt() and + * pass it chunks of appropriate size. + */ + if (i == PAGE_SIZE / sizeof (u64)) { + err = mlx4_write_mtt(dev->dev, mtt, n, + i, pages); + if (err) + goto out; + n += i; + i = 0; + } + } + } + + if (i) + err = mlx4_write_mtt(dev->dev, mtt, n, i, pages); + +out: + free_page((unsigned long) pages); + return err; +} + +struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, + u64 virt_addr, int access_flags, + struct ib_udata *udata) +{ + struct mlx4_ib_dev *dev = to_mdev(pd->device); + struct mlx4_ib_mr *mr; + int shift; + int err; + int n; + + mr = kmalloc(sizeof *mr, GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + /* Force registering the memory as writable. */ + /* Used for memory re-registeration. HCA protects the access */ + mr->umem = ib_umem_get(pd->uobject->context, start, length, + access_flags | IB_ACCESS_LOCAL_WRITE, 0); + if (IS_ERR(mr->umem)) { + err = PTR_ERR(mr->umem); + goto err_free; + } + + n = ib_umem_page_count(mr->umem); + shift = ilog2(mr->umem->page_size); + + err = mlx4_mr_alloc(dev->dev, to_mpd(pd)->pdn, virt_addr, length, + convert_access(access_flags), n, shift, &mr->mmr); + if (err) + goto err_umem; + + err = mlx4_ib_umem_write_mtt(dev, &mr->mmr.mtt, mr->umem); + if (err) + goto err_mr; + + err = mlx4_mr_enable(dev->dev, &mr->mmr); + if (err) + goto err_mr; + + mr->ibmr.rkey = mr->ibmr.lkey = mr->mmr.key; + + return &mr->ibmr; + +err_mr: + (void) mlx4_mr_free(to_mdev(pd->device)->dev, &mr->mmr); + +err_umem: + ib_umem_release(mr->umem); + +err_free: + kfree(mr); + + return ERR_PTR(err); +} + +int mlx4_ib_rereg_user_mr(struct ib_mr *mr, int flags, + u64 start, u64 length, u64 virt_addr, + int mr_access_flags, struct ib_pd *pd, + struct ib_udata *udata) +{ + struct mlx4_ib_dev *dev = to_mdev(mr->device); + struct mlx4_ib_mr *mmr = to_mmr(mr); + struct mlx4_mpt_entry *mpt_entry; + struct mlx4_mpt_entry **pmpt_entry = &mpt_entry; + int err; + + /* Since we synchronize this call and mlx4_ib_dereg_mr via uverbs, + * we assume that the calls can't run concurrently. Otherwise, a + * race exists. + */ + err = mlx4_mr_hw_get_mpt(dev->dev, &mmr->mmr, &pmpt_entry); + + if (err) + return err; + + if (flags & IB_MR_REREG_PD) { + err = mlx4_mr_hw_change_pd(dev->dev, *pmpt_entry, + to_mpd(pd)->pdn); + + if (err) + goto release_mpt_entry; + } + + if (flags & IB_MR_REREG_ACCESS) { + err = mlx4_mr_hw_change_access(dev->dev, *pmpt_entry, + convert_access(mr_access_flags)); + + if (err) + goto release_mpt_entry; + } + + if (flags & IB_MR_REREG_TRANS) { + int shift; + int err; + int n; + + mlx4_mr_rereg_mem_cleanup(dev->dev, &mmr->mmr); + ib_umem_release(mmr->umem); + mmr->umem = ib_umem_get(mr->uobject->context, start, length, + mr_access_flags | + IB_ACCESS_LOCAL_WRITE, + 0); + if (IS_ERR(mmr->umem)) { + err = PTR_ERR(mmr->umem); + /* Prevent mlx4_ib_dereg_mr from free'ing invalid pointer */ + mmr->umem = NULL; + goto release_mpt_entry; + } + n = ib_umem_page_count(mmr->umem); + shift = ilog2(mmr->umem->page_size); + + err = mlx4_mr_rereg_mem_write(dev->dev, &mmr->mmr, + virt_addr, length, n, shift, + *pmpt_entry); + if (err) { + ib_umem_release(mmr->umem); + goto release_mpt_entry; + } + mmr->mmr.iova = virt_addr; + mmr->mmr.size = length; + + err = mlx4_ib_umem_write_mtt(dev, &mmr->mmr.mtt, mmr->umem); + if (err) { + mlx4_mr_rereg_mem_cleanup(dev->dev, &mmr->mmr); + ib_umem_release(mmr->umem); + goto release_mpt_entry; + } + } + + /* If we couldn't transfer the MR to the HCA, just remember to + * return a failure. But dereg_mr will free the resources. + */ + err = mlx4_mr_hw_write_mpt(dev->dev, &mmr->mmr, pmpt_entry); + if (!err && flags & IB_MR_REREG_ACCESS) + mmr->mmr.access = mr_access_flags; + +release_mpt_entry: + mlx4_mr_hw_put_mpt(dev->dev, pmpt_entry); + + return err; +} + +int mlx4_ib_dereg_mr(struct ib_mr *ibmr) +{ + struct mlx4_ib_mr *mr = to_mmr(ibmr); + int ret; + + ret = mlx4_mr_free(to_mdev(ibmr->device)->dev, &mr->mmr); + if (ret) + return ret; + if (mr->umem) + ib_umem_release(mr->umem); + kfree(mr); + + return 0; +} + +struct ib_mw *mlx4_ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type) +{ + struct mlx4_ib_dev *dev = to_mdev(pd->device); + struct mlx4_ib_mw *mw; + int err; + + mw = kmalloc(sizeof(*mw), GFP_KERNEL); + if (!mw) + return ERR_PTR(-ENOMEM); + + err = mlx4_mw_alloc(dev->dev, to_mpd(pd)->pdn, + to_mlx4_type(type), &mw->mmw); + if (err) + goto err_free; + + err = mlx4_mw_enable(dev->dev, &mw->mmw); + if (err) + goto err_mw; + + mw->ibmw.rkey = mw->mmw.key; + + return &mw->ibmw; + +err_mw: + mlx4_mw_free(dev->dev, &mw->mmw); + +err_free: + kfree(mw); + + return ERR_PTR(err); +} + +int mlx4_ib_bind_mw(struct ib_qp *qp, struct ib_mw *mw, + struct ib_mw_bind *mw_bind) +{ + struct ib_send_wr wr; + struct ib_send_wr *bad_wr; + int ret; + + memset(&wr, 0, sizeof(wr)); + wr.opcode = IB_WR_BIND_MW; + wr.wr_id = mw_bind->wr_id; + wr.send_flags = mw_bind->send_flags; + wr.wr.bind_mw.mw = mw; + wr.wr.bind_mw.bind_info = mw_bind->bind_info; + wr.wr.bind_mw.rkey = ib_inc_rkey(mw->rkey); + + ret = mlx4_ib_post_send(qp, &wr, &bad_wr); + if (!ret) + mw->rkey = wr.wr.bind_mw.rkey; + + return ret; +} + +int mlx4_ib_dealloc_mw(struct ib_mw *ibmw) +{ + struct mlx4_ib_mw *mw = to_mmw(ibmw); + + mlx4_mw_free(to_mdev(ibmw->device)->dev, &mw->mmw); + kfree(mw); + + return 0; +} + +struct ib_mr *mlx4_ib_alloc_fast_reg_mr(struct ib_pd *pd, + int max_page_list_len) +{ + struct mlx4_ib_dev *dev = to_mdev(pd->device); + struct mlx4_ib_mr *mr; + int err; + + mr = kmalloc(sizeof *mr, GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + err = mlx4_mr_alloc(dev->dev, to_mpd(pd)->pdn, 0, 0, 0, + max_page_list_len, 0, &mr->mmr); + if (err) + goto err_free; + + err = mlx4_mr_enable(dev->dev, &mr->mmr); + if (err) + goto err_mr; + + mr->ibmr.rkey = mr->ibmr.lkey = mr->mmr.key; + mr->umem = NULL; + + return &mr->ibmr; + +err_mr: + (void) mlx4_mr_free(dev->dev, &mr->mmr); + +err_free: + kfree(mr); + return ERR_PTR(err); +} + +struct ib_fast_reg_page_list *mlx4_ib_alloc_fast_reg_page_list(struct ib_device *ibdev, + int page_list_len) +{ + struct mlx4_ib_dev *dev = to_mdev(ibdev); + struct mlx4_ib_fast_reg_page_list *mfrpl; + int size = page_list_len * sizeof (u64); + + if (page_list_len > MLX4_MAX_FAST_REG_PAGES) + return ERR_PTR(-EINVAL); + + mfrpl = kmalloc(sizeof *mfrpl, GFP_KERNEL); + if (!mfrpl) + return ERR_PTR(-ENOMEM); + + mfrpl->ibfrpl.page_list = kmalloc(size, GFP_KERNEL); + if (!mfrpl->ibfrpl.page_list) + goto err_free; + + mfrpl->mapped_page_list = dma_alloc_coherent(&dev->dev->pdev->dev, + size, &mfrpl->map, + GFP_KERNEL); + if (!mfrpl->mapped_page_list) + goto err_free; + + WARN_ON(mfrpl->map & 0x3f); + + return &mfrpl->ibfrpl; + +err_free: + kfree(mfrpl->ibfrpl.page_list); + kfree(mfrpl); + return ERR_PTR(-ENOMEM); +} + +void mlx4_ib_free_fast_reg_page_list(struct ib_fast_reg_page_list *page_list) +{ + struct mlx4_ib_dev *dev = to_mdev(page_list->device); + struct mlx4_ib_fast_reg_page_list *mfrpl = to_mfrpl(page_list); + int size = page_list->max_page_list_len * sizeof (u64); + + dma_free_coherent(&dev->dev->pdev->dev, size, mfrpl->mapped_page_list, + mfrpl->map); + kfree(mfrpl->ibfrpl.page_list); + kfree(mfrpl); +} + +struct ib_fmr *mlx4_ib_fmr_alloc(struct ib_pd *pd, int acc, + struct ib_fmr_attr *fmr_attr) +{ + struct mlx4_ib_dev *dev = to_mdev(pd->device); + struct mlx4_ib_fmr *fmr; + int err = -ENOMEM; + + fmr = kmalloc(sizeof *fmr, GFP_KERNEL); + if (!fmr) + return ERR_PTR(-ENOMEM); + + err = mlx4_fmr_alloc(dev->dev, to_mpd(pd)->pdn, convert_access(acc), + fmr_attr->max_pages, fmr_attr->max_maps, + fmr_attr->page_shift, &fmr->mfmr); + if (err) + goto err_free; + + err = mlx4_fmr_enable(to_mdev(pd->device)->dev, &fmr->mfmr); + if (err) + goto err_mr; + + fmr->ibfmr.rkey = fmr->ibfmr.lkey = fmr->mfmr.mr.key; + + return &fmr->ibfmr; + +err_mr: + (void) mlx4_mr_free(to_mdev(pd->device)->dev, &fmr->mfmr.mr); + +err_free: + kfree(fmr); + + return ERR_PTR(err); +} + +int mlx4_ib_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list, + int npages, u64 iova) +{ + struct mlx4_ib_fmr *ifmr = to_mfmr(ibfmr); + struct mlx4_ib_dev *dev = to_mdev(ifmr->ibfmr.device); + + return mlx4_map_phys_fmr(dev->dev, &ifmr->mfmr, page_list, npages, iova, + &ifmr->ibfmr.lkey, &ifmr->ibfmr.rkey); +} + +int mlx4_ib_unmap_fmr(struct list_head *fmr_list) +{ + struct ib_fmr *ibfmr; + int err; + struct mlx4_dev *mdev = NULL; + + list_for_each_entry(ibfmr, fmr_list, list) { + if (mdev && to_mdev(ibfmr->device)->dev != mdev) + return -EINVAL; + mdev = to_mdev(ibfmr->device)->dev; + } + + if (!mdev) + return 0; + + list_for_each_entry(ibfmr, fmr_list, list) { + struct mlx4_ib_fmr *ifmr = to_mfmr(ibfmr); + + mlx4_fmr_unmap(mdev, &ifmr->mfmr, &ifmr->ibfmr.lkey, &ifmr->ibfmr.rkey); + } + + /* + * Make sure all MPT status updates are visible before issuing + * SYNC_TPT firmware command. + */ + wmb(); + + err = mlx4_SYNC_TPT(mdev); + if (err) + pr_warn("SYNC_TPT error %d when " + "unmapping FMRs\n", err); + + return 0; +} + +int mlx4_ib_fmr_dealloc(struct ib_fmr *ibfmr) +{ + struct mlx4_ib_fmr *ifmr = to_mfmr(ibfmr); + struct mlx4_ib_dev *dev = to_mdev(ibfmr->device); + int err; + + err = mlx4_fmr_free(dev->dev, &ifmr->mfmr); + + if (!err) + kfree(ifmr); + + return err; +} diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c new file mode 100644 index 0000000..9c5150c --- /dev/null +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -0,0 +1,3151 @@ +/* + * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include + +#include +#include +#include +#include + +#include + +#include "mlx4_ib.h" +#include "user.h" + +enum { + MLX4_IB_ACK_REQ_FREQ = 8, +}; + +enum { + MLX4_IB_DEFAULT_SCHED_QUEUE = 0x83, + MLX4_IB_DEFAULT_QP0_SCHED_QUEUE = 0x3f, + MLX4_IB_LINK_TYPE_IB = 0, + MLX4_IB_LINK_TYPE_ETH = 1 +}; + +enum { + /* + * Largest possible UD header: send with GRH and immediate + * data plus 18 bytes for an Ethernet header with VLAN/802.1Q + * tag. (LRH would only use 8 bytes, so Ethernet is the + * biggest case) + */ + MLX4_IB_UD_HEADER_SIZE = 82, + MLX4_IB_LSO_HEADER_SPARE = 128, +}; + +enum { + MLX4_IB_IBOE_ETHERTYPE = 0x8915 +}; + +struct mlx4_ib_sqp { + struct mlx4_ib_qp qp; + int pkey_index; + u32 qkey; + u32 send_psn; + struct ib_ud_header ud_header; + u8 header_buf[MLX4_IB_UD_HEADER_SIZE]; +}; + +enum { + MLX4_IB_MIN_SQ_STRIDE = 6, + MLX4_IB_CACHE_LINE_SIZE = 64, +}; + +enum { + MLX4_RAW_QP_MTU = 7, + MLX4_RAW_QP_MSGMAX = 31, +}; + +#ifndef ETH_ALEN +#define ETH_ALEN 6 +#endif +static inline u64 mlx4_mac_to_u64(u8 *addr) +{ + u64 mac = 0; + int i; + + for (i = 0; i < ETH_ALEN; i++) { + mac <<= 8; + mac |= addr[i]; + } + return mac; +} + +static const __be32 mlx4_ib_opcode[] = { + [IB_WR_SEND] = cpu_to_be32(MLX4_OPCODE_SEND), + [IB_WR_LSO] = cpu_to_be32(MLX4_OPCODE_LSO), + [IB_WR_SEND_WITH_IMM] = cpu_to_be32(MLX4_OPCODE_SEND_IMM), + [IB_WR_RDMA_WRITE] = cpu_to_be32(MLX4_OPCODE_RDMA_WRITE), + [IB_WR_RDMA_WRITE_WITH_IMM] = cpu_to_be32(MLX4_OPCODE_RDMA_WRITE_IMM), + [IB_WR_RDMA_READ] = cpu_to_be32(MLX4_OPCODE_RDMA_READ), + [IB_WR_ATOMIC_CMP_AND_SWP] = cpu_to_be32(MLX4_OPCODE_ATOMIC_CS), + [IB_WR_ATOMIC_FETCH_AND_ADD] = cpu_to_be32(MLX4_OPCODE_ATOMIC_FA), + [IB_WR_SEND_WITH_INV] = cpu_to_be32(MLX4_OPCODE_SEND_INVAL), + [IB_WR_LOCAL_INV] = cpu_to_be32(MLX4_OPCODE_LOCAL_INVAL), + [IB_WR_FAST_REG_MR] = cpu_to_be32(MLX4_OPCODE_FMR), + [IB_WR_MASKED_ATOMIC_CMP_AND_SWP] = cpu_to_be32(MLX4_OPCODE_MASKED_ATOMIC_CS), + [IB_WR_MASKED_ATOMIC_FETCH_AND_ADD] = cpu_to_be32(MLX4_OPCODE_MASKED_ATOMIC_FA), + [IB_WR_BIND_MW] = cpu_to_be32(MLX4_OPCODE_BIND_MW), +}; + +static struct mlx4_ib_sqp *to_msqp(struct mlx4_ib_qp *mqp) +{ + return container_of(mqp, struct mlx4_ib_sqp, qp); +} + +static int is_tunnel_qp(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp) +{ + if (!mlx4_is_master(dev->dev)) + return 0; + + return qp->mqp.qpn >= dev->dev->phys_caps.base_tunnel_sqpn && + qp->mqp.qpn < dev->dev->phys_caps.base_tunnel_sqpn + + 8 * MLX4_MFUNC_MAX; +} + +static int is_sqp(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp) +{ + int proxy_sqp = 0; + int real_sqp = 0; + int i; + /* PPF or Native -- real SQP */ + real_sqp = ((mlx4_is_master(dev->dev) || !mlx4_is_mfunc(dev->dev)) && + qp->mqp.qpn >= dev->dev->phys_caps.base_sqpn && + qp->mqp.qpn <= dev->dev->phys_caps.base_sqpn + 3); + if (real_sqp) + return 1; + /* VF or PF -- proxy SQP */ + if (mlx4_is_mfunc(dev->dev)) { + for (i = 0; i < dev->dev->caps.num_ports; i++) { + if (qp->mqp.qpn == dev->dev->caps.qp0_proxy[i] || + qp->mqp.qpn == dev->dev->caps.qp1_proxy[i]) { + proxy_sqp = 1; + break; + } + } + } + return proxy_sqp; +} + +/* used for INIT/CLOSE port logic */ +static int is_qp0(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp) +{ + int proxy_qp0 = 0; + int real_qp0 = 0; + int i; + /* PPF or Native -- real QP0 */ + real_qp0 = ((mlx4_is_master(dev->dev) || !mlx4_is_mfunc(dev->dev)) && + qp->mqp.qpn >= dev->dev->phys_caps.base_sqpn && + qp->mqp.qpn <= dev->dev->phys_caps.base_sqpn + 1); + if (real_qp0) + return 1; + /* VF or PF -- proxy QP0 */ + if (mlx4_is_mfunc(dev->dev)) { + for (i = 0; i < dev->dev->caps.num_ports; i++) { + if (qp->mqp.qpn == dev->dev->caps.qp0_proxy[i]) { + proxy_qp0 = 1; + break; + } + } + } + return proxy_qp0; +} + +static void *get_wqe(struct mlx4_ib_qp *qp, int offset) +{ + return mlx4_buf_offset(&qp->buf, offset); +} + +static void *get_recv_wqe(struct mlx4_ib_qp *qp, int n) +{ + return get_wqe(qp, qp->rq.offset + (n << qp->rq.wqe_shift)); +} + +static void *get_send_wqe(struct mlx4_ib_qp *qp, int n) +{ + return get_wqe(qp, qp->sq.offset + (n << qp->sq.wqe_shift)); +} + +/* + * Stamp a SQ WQE so that it is invalid if prefetched by marking the + * first four bytes of every 64 byte chunk with + * 0x7FFFFFF | (invalid_ownership_value << 31). + * + * When the max work request size is less than or equal to the WQE + * basic block size, as an optimization, we can stamp all WQEs with + * 0xffffffff, and skip the very first chunk of each WQE. + */ +static void stamp_send_wqe(struct mlx4_ib_qp *qp, int n, int size) +{ + __be32 *wqe; + int i; + int s; + int ind; + void *buf; + __be32 stamp; + struct mlx4_wqe_ctrl_seg *ctrl; + + if (qp->sq_max_wqes_per_wr > 1) { + s = roundup(size, 1U << qp->sq.wqe_shift); + for (i = 0; i < s; i += 64) { + ind = (i >> qp->sq.wqe_shift) + n; + stamp = ind & qp->sq.wqe_cnt ? cpu_to_be32(0x7fffffff) : + cpu_to_be32(0xffffffff); + buf = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1)); + wqe = buf + (i & ((1 << qp->sq.wqe_shift) - 1)); + *wqe = stamp; + } + } else { + ctrl = buf = get_send_wqe(qp, n & (qp->sq.wqe_cnt - 1)); + s = (ctrl->fence_size & 0x3f) << 4; + for (i = 64; i < s; i += 64) { + wqe = buf + i; + *wqe = cpu_to_be32(0xffffffff); + } + } +} + +static void post_nop_wqe(struct mlx4_ib_qp *qp, int n, int size) +{ + struct mlx4_wqe_ctrl_seg *ctrl; + struct mlx4_wqe_inline_seg *inl; + void *wqe; + int s; + + ctrl = wqe = get_send_wqe(qp, n & (qp->sq.wqe_cnt - 1)); + s = sizeof(struct mlx4_wqe_ctrl_seg); + + if (qp->ibqp.qp_type == IB_QPT_UD) { + struct mlx4_wqe_datagram_seg *dgram = wqe + sizeof *ctrl; + struct mlx4_av *av = (struct mlx4_av *)dgram->av; + memset(dgram, 0, sizeof *dgram); + av->port_pd = cpu_to_be32((qp->port << 24) | to_mpd(qp->ibqp.pd)->pdn); + s += sizeof(struct mlx4_wqe_datagram_seg); + } + + /* Pad the remainder of the WQE with an inline data segment. */ + if (size > s) { + inl = wqe + s; + inl->byte_count = cpu_to_be32(1 << 31 | (size - s - sizeof *inl)); + } + ctrl->srcrb_flags = 0; + ctrl->fence_size = size / 16; + /* + * Make sure descriptor is fully written before setting ownership bit + * (because HW can start executing as soon as we do). + */ + wmb(); + + ctrl->owner_opcode = cpu_to_be32(MLX4_OPCODE_NOP | MLX4_WQE_CTRL_NEC) | + (n & qp->sq.wqe_cnt ? cpu_to_be32(1 << 31) : 0); + + stamp_send_wqe(qp, n + qp->sq_spare_wqes, size); +} + +/* Post NOP WQE to prevent wrap-around in the middle of WR */ +static inline unsigned pad_wraparound(struct mlx4_ib_qp *qp, int ind) +{ + unsigned s = qp->sq.wqe_cnt - (ind & (qp->sq.wqe_cnt - 1)); + if (unlikely(s < qp->sq_max_wqes_per_wr)) { + post_nop_wqe(qp, ind, s << qp->sq.wqe_shift); + ind += s; + } + return ind; +} + +static void mlx4_ib_qp_event(struct mlx4_qp *qp, enum mlx4_event type) +{ + struct ib_event event; + struct ib_qp *ibqp = &to_mibqp(qp)->ibqp; + + if (type == MLX4_EVENT_TYPE_PATH_MIG) + to_mibqp(qp)->port = to_mibqp(qp)->alt_port; + + if (ibqp->event_handler) { + event.device = ibqp->device; + event.element.qp = ibqp; + switch (type) { + case MLX4_EVENT_TYPE_PATH_MIG: + event.event = IB_EVENT_PATH_MIG; + break; + case MLX4_EVENT_TYPE_COMM_EST: + event.event = IB_EVENT_COMM_EST; + break; + case MLX4_EVENT_TYPE_SQ_DRAINED: + event.event = IB_EVENT_SQ_DRAINED; + break; + case MLX4_EVENT_TYPE_SRQ_QP_LAST_WQE: + event.event = IB_EVENT_QP_LAST_WQE_REACHED; + break; + case MLX4_EVENT_TYPE_WQ_CATAS_ERROR: + event.event = IB_EVENT_QP_FATAL; + break; + case MLX4_EVENT_TYPE_PATH_MIG_FAILED: + event.event = IB_EVENT_PATH_MIG_ERR; + break; + case MLX4_EVENT_TYPE_WQ_INVAL_REQ_ERROR: + event.event = IB_EVENT_QP_REQ_ERR; + break; + case MLX4_EVENT_TYPE_WQ_ACCESS_ERROR: + event.event = IB_EVENT_QP_ACCESS_ERR; + break; + default: + pr_warn("Unexpected event type %d " + "on QP %06x\n", type, qp->qpn); + return; + } + + ibqp->event_handler(&event, ibqp->qp_context); + } +} + +static int send_wqe_overhead(enum mlx4_ib_qp_type type, u32 flags) +{ + /* + * UD WQEs must have a datagram segment. + * RC and UC WQEs might have a remote address segment. + * MLX WQEs need two extra inline data segments (for the UD + * header and space for the ICRC). + */ + switch (type) { + case MLX4_IB_QPT_UD: + return sizeof (struct mlx4_wqe_ctrl_seg) + + sizeof (struct mlx4_wqe_datagram_seg) + + ((flags & MLX4_IB_QP_LSO) ? MLX4_IB_LSO_HEADER_SPARE : 0); + case MLX4_IB_QPT_PROXY_SMI_OWNER: + case MLX4_IB_QPT_PROXY_SMI: + case MLX4_IB_QPT_PROXY_GSI: + return sizeof (struct mlx4_wqe_ctrl_seg) + + sizeof (struct mlx4_wqe_datagram_seg) + 64; + case MLX4_IB_QPT_TUN_SMI_OWNER: + case MLX4_IB_QPT_TUN_GSI: + return sizeof (struct mlx4_wqe_ctrl_seg) + + sizeof (struct mlx4_wqe_datagram_seg); + + case MLX4_IB_QPT_UC: + return sizeof (struct mlx4_wqe_ctrl_seg) + + sizeof (struct mlx4_wqe_raddr_seg); + case MLX4_IB_QPT_RC: + return sizeof (struct mlx4_wqe_ctrl_seg) + + sizeof (struct mlx4_wqe_atomic_seg) + + sizeof (struct mlx4_wqe_raddr_seg); + case MLX4_IB_QPT_SMI: + case MLX4_IB_QPT_GSI: + return sizeof (struct mlx4_wqe_ctrl_seg) + + ALIGN(MLX4_IB_UD_HEADER_SIZE + + DIV_ROUND_UP(MLX4_IB_UD_HEADER_SIZE, + MLX4_INLINE_ALIGN) * + sizeof (struct mlx4_wqe_inline_seg), + sizeof (struct mlx4_wqe_data_seg)) + + ALIGN(4 + + sizeof (struct mlx4_wqe_inline_seg), + sizeof (struct mlx4_wqe_data_seg)); + default: + return sizeof (struct mlx4_wqe_ctrl_seg); + } +} + +static int set_rq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap, + int is_user, int has_rq, struct mlx4_ib_qp *qp) +{ + /* Sanity check RQ size before proceeding */ + if (cap->max_recv_wr > dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE || + cap->max_recv_sge > min(dev->dev->caps.max_sq_sg, dev->dev->caps.max_rq_sg)) + return -EINVAL; + + if (!has_rq) { + if (cap->max_recv_wr) + return -EINVAL; + + qp->rq.wqe_cnt = qp->rq.max_gs = 0; + } else { + /* HW requires >= 1 RQ entry with >= 1 gather entry */ + if (is_user && (!cap->max_recv_wr || !cap->max_recv_sge)) + return -EINVAL; + + qp->rq.wqe_cnt = roundup_pow_of_two(max(1U, cap->max_recv_wr)); + qp->rq.max_gs = roundup_pow_of_two(max(1U, cap->max_recv_sge)); + qp->rq.wqe_shift = ilog2(qp->rq.max_gs * sizeof (struct mlx4_wqe_data_seg)); + } + + /* leave userspace return values as they were, so as not to break ABI */ + if (is_user) { + cap->max_recv_wr = qp->rq.max_post = qp->rq.wqe_cnt; + cap->max_recv_sge = qp->rq.max_gs; + } else { + cap->max_recv_wr = qp->rq.max_post = + min(dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE, qp->rq.wqe_cnt); + cap->max_recv_sge = min(qp->rq.max_gs, + min(dev->dev->caps.max_sq_sg, + dev->dev->caps.max_rq_sg)); + } + + return 0; +} + +static int set_kernel_sq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap, + enum mlx4_ib_qp_type type, struct mlx4_ib_qp *qp) +{ + int s; + + /* Sanity check SQ size before proceeding */ + if (cap->max_send_wr > (dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE) || + cap->max_send_sge > min(dev->dev->caps.max_sq_sg, dev->dev->caps.max_rq_sg) || + cap->max_inline_data + send_wqe_overhead(type, qp->flags) + + sizeof (struct mlx4_wqe_inline_seg) > dev->dev->caps.max_sq_desc_sz) + return -EINVAL; + + /* + * For MLX transport we need 2 extra S/G entries: + * one for the header and one for the checksum at the end + */ + if ((type == MLX4_IB_QPT_SMI || type == MLX4_IB_QPT_GSI || + type & (MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_TUN_SMI_OWNER)) && + cap->max_send_sge + 2 > dev->dev->caps.max_sq_sg) + return -EINVAL; + + s = max(cap->max_send_sge * sizeof (struct mlx4_wqe_data_seg), + cap->max_inline_data + sizeof (struct mlx4_wqe_inline_seg)) + + send_wqe_overhead(type, qp->flags); + + if (s > dev->dev->caps.max_sq_desc_sz) + return -EINVAL; + + /* + * Hermon supports shrinking WQEs, such that a single work + * request can include multiple units of 1 << wqe_shift. This + * way, work requests can differ in size, and do not have to + * be a power of 2 in size, saving memory and speeding up send + * WR posting. Unfortunately, if we do this then the + * wqe_index field in CQEs can't be used to look up the WR ID + * anymore, so we do this only if selective signaling is off. + * + * Further, on 32-bit platforms, we can't use vmap() to make + * the QP buffer virtually contiguous. Thus we have to use + * constant-sized WRs to make sure a WR is always fully within + * a single page-sized chunk. + * + * Finally, we use NOP work requests to pad the end of the + * work queue, to avoid wrap-around in the middle of WR. We + * set NEC bit to avoid getting completions with error for + * these NOP WRs, but since NEC is only supported starting + * with firmware 2.2.232, we use constant-sized WRs for older + * firmware. + * + * And, since MLX QPs only support SEND, we use constant-sized + * WRs in this case. + * + * We look for the smallest value of wqe_shift such that the + * resulting number of wqes does not exceed device + * capabilities. + * + * We set WQE size to at least 64 bytes, this way stamping + * invalidates each WQE. + */ + if (dev->dev->caps.fw_ver >= MLX4_FW_VER_WQE_CTRL_NEC && + qp->sq_signal_bits && BITS_PER_LONG == 64 && + type != MLX4_IB_QPT_SMI && type != MLX4_IB_QPT_GSI && + !(type & (MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_PROXY_SMI | + MLX4_IB_QPT_PROXY_GSI | MLX4_IB_QPT_TUN_SMI_OWNER))) + qp->sq.wqe_shift = ilog2(64); + else + qp->sq.wqe_shift = ilog2(roundup_pow_of_two(s)); + + for (;;) { + qp->sq_max_wqes_per_wr = DIV_ROUND_UP(s, 1U << qp->sq.wqe_shift); + + /* + * We need to leave 2 KB + 1 WR of headroom in the SQ to + * allow HW to prefetch. + */ + qp->sq_spare_wqes = (2048 >> qp->sq.wqe_shift) + qp->sq_max_wqes_per_wr; + qp->sq.wqe_cnt = roundup_pow_of_two(cap->max_send_wr * + qp->sq_max_wqes_per_wr + + qp->sq_spare_wqes); + + if (qp->sq.wqe_cnt <= dev->dev->caps.max_wqes) + break; + + if (qp->sq_max_wqes_per_wr <= 1) + return -EINVAL; + + ++qp->sq.wqe_shift; + } + + qp->sq.max_gs = (min(dev->dev->caps.max_sq_desc_sz, + (qp->sq_max_wqes_per_wr << qp->sq.wqe_shift)) - + send_wqe_overhead(type, qp->flags)) / + sizeof (struct mlx4_wqe_data_seg); + + qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) + + (qp->sq.wqe_cnt << qp->sq.wqe_shift); + if (qp->rq.wqe_shift > qp->sq.wqe_shift) { + qp->rq.offset = 0; + qp->sq.offset = qp->rq.wqe_cnt << qp->rq.wqe_shift; + } else { + qp->rq.offset = qp->sq.wqe_cnt << qp->sq.wqe_shift; + qp->sq.offset = 0; + } + + cap->max_send_wr = qp->sq.max_post = + (qp->sq.wqe_cnt - qp->sq_spare_wqes) / qp->sq_max_wqes_per_wr; + cap->max_send_sge = min(qp->sq.max_gs, + min(dev->dev->caps.max_sq_sg, + dev->dev->caps.max_rq_sg)); + /* We don't support inline sends for kernel QPs (yet) */ + cap->max_inline_data = 0; + + return 0; +} + +static int set_user_sq_size(struct mlx4_ib_dev *dev, + struct mlx4_ib_qp *qp, + struct mlx4_ib_create_qp *ucmd) +{ + /* Sanity check SQ size before proceeding */ + if ((1 << ucmd->log_sq_bb_count) > dev->dev->caps.max_wqes || + ucmd->log_sq_stride > + ilog2(roundup_pow_of_two(dev->dev->caps.max_sq_desc_sz)) || + ucmd->log_sq_stride < MLX4_IB_MIN_SQ_STRIDE) + return -EINVAL; + + qp->sq.wqe_cnt = 1 << ucmd->log_sq_bb_count; + qp->sq.wqe_shift = ucmd->log_sq_stride; + + qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) + + (qp->sq.wqe_cnt << qp->sq.wqe_shift); + + return 0; +} + +static int alloc_proxy_bufs(struct ib_device *dev, struct mlx4_ib_qp *qp) +{ + int i; + + qp->sqp_proxy_rcv = + kmalloc(sizeof (struct mlx4_ib_buf) * qp->rq.wqe_cnt, + GFP_KERNEL); + if (!qp->sqp_proxy_rcv) + return -ENOMEM; + for (i = 0; i < qp->rq.wqe_cnt; i++) { + qp->sqp_proxy_rcv[i].addr = + kmalloc(sizeof (struct mlx4_ib_proxy_sqp_hdr), + GFP_KERNEL); + if (!qp->sqp_proxy_rcv[i].addr) + goto err; + qp->sqp_proxy_rcv[i].map = + ib_dma_map_single(dev, qp->sqp_proxy_rcv[i].addr, + sizeof (struct mlx4_ib_proxy_sqp_hdr), + DMA_FROM_DEVICE); + } + return 0; + +err: + while (i > 0) { + --i; + ib_dma_unmap_single(dev, qp->sqp_proxy_rcv[i].map, + sizeof (struct mlx4_ib_proxy_sqp_hdr), + DMA_FROM_DEVICE); + kfree(qp->sqp_proxy_rcv[i].addr); + } + kfree(qp->sqp_proxy_rcv); + qp->sqp_proxy_rcv = NULL; + return -ENOMEM; +} + +static void free_proxy_bufs(struct ib_device *dev, struct mlx4_ib_qp *qp) +{ + int i; + + for (i = 0; i < qp->rq.wqe_cnt; i++) { + ib_dma_unmap_single(dev, qp->sqp_proxy_rcv[i].map, + sizeof (struct mlx4_ib_proxy_sqp_hdr), + DMA_FROM_DEVICE); + kfree(qp->sqp_proxy_rcv[i].addr); + } + kfree(qp->sqp_proxy_rcv); +} + +static int qp_has_rq(struct ib_qp_init_attr *attr) +{ + if (attr->qp_type == IB_QPT_XRC_INI || attr->qp_type == IB_QPT_XRC_TGT) + return 0; + + return !attr->srq; +} + +static int qp0_enabled_vf(struct mlx4_dev *dev, int qpn) +{ + int i; + for (i = 0; i < dev->caps.num_ports; i++) { + if (qpn == dev->caps.qp0_proxy[i]) + return !!dev->caps.qp0_qkey[i]; + } + return 0; +} + +static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, + struct ib_qp_init_attr *init_attr, + struct ib_udata *udata, int sqpn, struct mlx4_ib_qp **caller_qp, + gfp_t gfp) +{ + int qpn; + int err; + struct mlx4_ib_sqp *sqp; + struct mlx4_ib_qp *qp; + enum mlx4_ib_qp_type qp_type = (enum mlx4_ib_qp_type) init_attr->qp_type; + + /* When tunneling special qps, we use a plain UD qp */ + if (sqpn) { + if (mlx4_is_mfunc(dev->dev) && + (!mlx4_is_master(dev->dev) || + !(init_attr->create_flags & MLX4_IB_SRIOV_SQP))) { + if (init_attr->qp_type == IB_QPT_GSI) + qp_type = MLX4_IB_QPT_PROXY_GSI; + else { + if (mlx4_is_master(dev->dev) || + qp0_enabled_vf(dev->dev, sqpn)) + qp_type = MLX4_IB_QPT_PROXY_SMI_OWNER; + else + qp_type = MLX4_IB_QPT_PROXY_SMI; + } + } + qpn = sqpn; + /* add extra sg entry for tunneling */ + init_attr->cap.max_recv_sge++; + } else if (init_attr->create_flags & MLX4_IB_SRIOV_TUNNEL_QP) { + struct mlx4_ib_qp_tunnel_init_attr *tnl_init = + container_of(init_attr, + struct mlx4_ib_qp_tunnel_init_attr, init_attr); + if ((tnl_init->proxy_qp_type != IB_QPT_SMI && + tnl_init->proxy_qp_type != IB_QPT_GSI) || + !mlx4_is_master(dev->dev)) + return -EINVAL; + if (tnl_init->proxy_qp_type == IB_QPT_GSI) + qp_type = MLX4_IB_QPT_TUN_GSI; + else if (tnl_init->slave == mlx4_master_func_num(dev->dev) || + mlx4_vf_smi_enabled(dev->dev, tnl_init->slave, + tnl_init->port)) + qp_type = MLX4_IB_QPT_TUN_SMI_OWNER; + else + qp_type = MLX4_IB_QPT_TUN_SMI; + /* we are definitely in the PPF here, since we are creating + * tunnel QPs. base_tunnel_sqpn is therefore valid. */ + qpn = dev->dev->phys_caps.base_tunnel_sqpn + 8 * tnl_init->slave + + tnl_init->proxy_qp_type * 2 + tnl_init->port - 1; + sqpn = qpn; + } + + if (!*caller_qp) { + if (qp_type == MLX4_IB_QPT_SMI || qp_type == MLX4_IB_QPT_GSI || + (qp_type & (MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_SMI_OWNER | + MLX4_IB_QPT_PROXY_GSI | MLX4_IB_QPT_TUN_SMI_OWNER))) { + sqp = kzalloc(sizeof (struct mlx4_ib_sqp), gfp); + if (!sqp) + return -ENOMEM; + qp = &sqp->qp; + qp->pri.vid = 0xFFFF; + qp->alt.vid = 0xFFFF; + } else { + qp = kzalloc(sizeof (struct mlx4_ib_qp), gfp); + if (!qp) + return -ENOMEM; + qp->pri.vid = 0xFFFF; + qp->alt.vid = 0xFFFF; + } + } else + qp = *caller_qp; + + qp->mlx4_ib_qp_type = qp_type; + + mutex_init(&qp->mutex); + spin_lock_init(&qp->sq.lock); + spin_lock_init(&qp->rq.lock); + INIT_LIST_HEAD(&qp->gid_list); + INIT_LIST_HEAD(&qp->steering_rules); + + qp->state = IB_QPS_RESET; + if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR) + qp->sq_signal_bits = cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE); + + err = set_rq_size(dev, &init_attr->cap, !!pd->uobject, qp_has_rq(init_attr), qp); + if (err) + goto err; + + if (pd->uobject) { + struct mlx4_ib_create_qp ucmd; + + if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd)) { + err = -EFAULT; + goto err; + } + + qp->sq_no_prefetch = ucmd.sq_no_prefetch; + + err = set_user_sq_size(dev, qp, &ucmd); + if (err) + goto err; + + qp->umem = ib_umem_get(pd->uobject->context, ucmd.buf_addr, + qp->buf_size, 0, 0); + if (IS_ERR(qp->umem)) { + err = PTR_ERR(qp->umem); + goto err; + } + + err = mlx4_mtt_init(dev->dev, ib_umem_page_count(qp->umem), + ilog2(qp->umem->page_size), &qp->mtt); + if (err) + goto err_buf; + + err = mlx4_ib_umem_write_mtt(dev, &qp->mtt, qp->umem); + if (err) + goto err_mtt; + + if (qp_has_rq(init_attr)) { + err = mlx4_ib_db_map_user(to_mucontext(pd->uobject->context), + ucmd.db_addr, &qp->db); + if (err) + goto err_mtt; + } + } else { + qp->sq_no_prefetch = 0; + + if (init_attr->create_flags & IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK) + qp->flags |= MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK; + + if (init_attr->create_flags & IB_QP_CREATE_IPOIB_UD_LSO) + qp->flags |= MLX4_IB_QP_LSO; + + if (init_attr->create_flags & IB_QP_CREATE_NETIF_QP) { + if (dev->steering_support == + MLX4_STEERING_MODE_DEVICE_MANAGED) + qp->flags |= MLX4_IB_QP_NETIF; + else + goto err; + } + + err = set_kernel_sq_size(dev, &init_attr->cap, qp_type, qp); + if (err) + goto err; + + if (qp_has_rq(init_attr)) { + err = mlx4_db_alloc(dev->dev, &qp->db, 0, gfp); + if (err) + goto err; + + *qp->db.db = 0; + } + + if (mlx4_buf_alloc(dev->dev, qp->buf_size, PAGE_SIZE * 2, &qp->buf, gfp)) { + err = -ENOMEM; + goto err_db; + } + + err = mlx4_mtt_init(dev->dev, qp->buf.npages, qp->buf.page_shift, + &qp->mtt); + if (err) + goto err_buf; + + err = mlx4_buf_write_mtt(dev->dev, &qp->mtt, &qp->buf, gfp); + if (err) + goto err_mtt; + + qp->sq.wrid = kmalloc(qp->sq.wqe_cnt * sizeof (u64), gfp); + qp->rq.wrid = kmalloc(qp->rq.wqe_cnt * sizeof (u64), gfp); + if (!qp->sq.wrid || !qp->rq.wrid) { + err = -ENOMEM; + goto err_wrid; + } + } + + if (sqpn) { + if (qp->mlx4_ib_qp_type & (MLX4_IB_QPT_PROXY_SMI_OWNER | + MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_GSI)) { + if (alloc_proxy_bufs(pd->device, qp)) { + err = -ENOMEM; + goto err_wrid; + } + } + } else { + /* Raw packet QPNs must be aligned to 8 bits. If not, the WQE + * BlueFlame setup flow wrongly causes VLAN insertion. */ + if (init_attr->qp_type == IB_QPT_RAW_PACKET) + err = mlx4_qp_reserve_range(dev->dev, 1, 1 << 8, &qpn); + else + if (qp->flags & MLX4_IB_QP_NETIF) + err = mlx4_ib_steer_qp_alloc(dev, 1, &qpn); + else + err = mlx4_qp_reserve_range(dev->dev, 1, 1, + &qpn); + if (err) + goto err_proxy; + } + + err = mlx4_qp_alloc(dev->dev, qpn, &qp->mqp, gfp); + if (err) + goto err_qpn; + + if (init_attr->qp_type == IB_QPT_XRC_TGT) + qp->mqp.qpn |= (1 << 23); + + /* + * Hardware wants QPN written in big-endian order (after + * shifting) for send doorbell. Precompute this value to save + * a little bit when posting sends. + */ + qp->doorbell_qpn = swab32(qp->mqp.qpn << 8); + + qp->mqp.event = mlx4_ib_qp_event; + if (!*caller_qp) + *caller_qp = qp; + return 0; + +err_qpn: + if (!sqpn) { + if (qp->flags & MLX4_IB_QP_NETIF) + mlx4_ib_steer_qp_free(dev, qpn, 1); + else + mlx4_qp_release_range(dev->dev, qpn, 1); + } +err_proxy: + if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_GSI) + free_proxy_bufs(pd->device, qp); +err_wrid: + if (pd->uobject) { + if (qp_has_rq(init_attr)) + mlx4_ib_db_unmap_user(to_mucontext(pd->uobject->context), &qp->db); + } else { + kfree(qp->sq.wrid); + kfree(qp->rq.wrid); + } + +err_mtt: + mlx4_mtt_cleanup(dev->dev, &qp->mtt); + +err_buf: + if (pd->uobject) + ib_umem_release(qp->umem); + else + mlx4_buf_free(dev->dev, qp->buf_size, &qp->buf); + +err_db: + if (!pd->uobject && qp_has_rq(init_attr)) + mlx4_db_free(dev->dev, &qp->db); + +err: + if (!*caller_qp) + kfree(qp); + return err; +} + +static enum mlx4_qp_state to_mlx4_state(enum ib_qp_state state) +{ + switch (state) { + case IB_QPS_RESET: return MLX4_QP_STATE_RST; + case IB_QPS_INIT: return MLX4_QP_STATE_INIT; + case IB_QPS_RTR: return MLX4_QP_STATE_RTR; + case IB_QPS_RTS: return MLX4_QP_STATE_RTS; + case IB_QPS_SQD: return MLX4_QP_STATE_SQD; + case IB_QPS_SQE: return MLX4_QP_STATE_SQER; + case IB_QPS_ERR: return MLX4_QP_STATE_ERR; + default: return -1; + } +} + +static void mlx4_ib_lock_cqs(struct mlx4_ib_cq *send_cq, struct mlx4_ib_cq *recv_cq) + __acquires(&send_cq->lock) __acquires(&recv_cq->lock) +{ + if (send_cq == recv_cq) { + spin_lock_irq(&send_cq->lock); + __acquire(&recv_cq->lock); + } else if (send_cq->mcq.cqn < recv_cq->mcq.cqn) { + spin_lock_irq(&send_cq->lock); + spin_lock_nested(&recv_cq->lock, SINGLE_DEPTH_NESTING); + } else { + spin_lock_irq(&recv_cq->lock); + spin_lock_nested(&send_cq->lock, SINGLE_DEPTH_NESTING); + } +} + +static void mlx4_ib_unlock_cqs(struct mlx4_ib_cq *send_cq, struct mlx4_ib_cq *recv_cq) + __releases(&send_cq->lock) __releases(&recv_cq->lock) +{ + if (send_cq == recv_cq) { + __release(&recv_cq->lock); + spin_unlock_irq(&send_cq->lock); + } else if (send_cq->mcq.cqn < recv_cq->mcq.cqn) { + spin_unlock(&recv_cq->lock); + spin_unlock_irq(&send_cq->lock); + } else { + spin_unlock(&send_cq->lock); + spin_unlock_irq(&recv_cq->lock); + } +} + +static void del_gid_entries(struct mlx4_ib_qp *qp) +{ + struct mlx4_ib_gid_entry *ge, *tmp; + + list_for_each_entry_safe(ge, tmp, &qp->gid_list, list) { + list_del(&ge->list); + kfree(ge); + } +} + +static struct mlx4_ib_pd *get_pd(struct mlx4_ib_qp *qp) +{ + if (qp->ibqp.qp_type == IB_QPT_XRC_TGT) + return to_mpd(to_mxrcd(qp->ibqp.xrcd)->pd); + else + return to_mpd(qp->ibqp.pd); +} + +static void get_cqs(struct mlx4_ib_qp *qp, + struct mlx4_ib_cq **send_cq, struct mlx4_ib_cq **recv_cq) +{ + switch (qp->ibqp.qp_type) { + case IB_QPT_XRC_TGT: + *send_cq = to_mcq(to_mxrcd(qp->ibqp.xrcd)->cq); + *recv_cq = *send_cq; + break; + case IB_QPT_XRC_INI: + *send_cq = to_mcq(qp->ibqp.send_cq); + *recv_cq = *send_cq; + break; + default: + *send_cq = to_mcq(qp->ibqp.send_cq); + *recv_cq = to_mcq(qp->ibqp.recv_cq); + break; + } +} + +static void destroy_qp_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp, + int is_user) +{ + struct mlx4_ib_cq *send_cq, *recv_cq; + + if (qp->state != IB_QPS_RESET) { + if (mlx4_qp_modify(dev->dev, NULL, to_mlx4_state(qp->state), + MLX4_QP_STATE_RST, NULL, 0, 0, &qp->mqp)) + pr_warn("modify QP %06x to RESET failed.\n", + qp->mqp.qpn); + if (qp->pri.smac || (!qp->pri.smac && qp->pri.smac_port)) { + mlx4_unregister_mac(dev->dev, qp->pri.smac_port, qp->pri.smac); + qp->pri.smac = 0; + qp->pri.smac_port = 0; + } + if (qp->alt.smac) { + mlx4_unregister_mac(dev->dev, qp->alt.smac_port, qp->alt.smac); + qp->alt.smac = 0; + } + if (qp->pri.vid < 0x1000) { + mlx4_unregister_vlan(dev->dev, qp->pri.vlan_port, qp->pri.vid); + qp->pri.vid = 0xFFFF; + qp->pri.candidate_vid = 0xFFFF; + qp->pri.update_vid = 0; + } + if (qp->alt.vid < 0x1000) { + mlx4_unregister_vlan(dev->dev, qp->alt.vlan_port, qp->alt.vid); + qp->alt.vid = 0xFFFF; + qp->alt.candidate_vid = 0xFFFF; + qp->alt.update_vid = 0; + } + } + + get_cqs(qp, &send_cq, &recv_cq); + + mlx4_ib_lock_cqs(send_cq, recv_cq); + + if (!is_user) { + __mlx4_ib_cq_clean(recv_cq, qp->mqp.qpn, + qp->ibqp.srq ? to_msrq(qp->ibqp.srq): NULL); + if (send_cq != recv_cq) + __mlx4_ib_cq_clean(send_cq, qp->mqp.qpn, NULL); + } + + mlx4_qp_remove(dev->dev, &qp->mqp); + + mlx4_ib_unlock_cqs(send_cq, recv_cq); + + mlx4_qp_free(dev->dev, &qp->mqp); + + if (!is_sqp(dev, qp) && !is_tunnel_qp(dev, qp)) { + if (qp->flags & MLX4_IB_QP_NETIF) + mlx4_ib_steer_qp_free(dev, qp->mqp.qpn, 1); + else + mlx4_qp_release_range(dev->dev, qp->mqp.qpn, 1); + } + + mlx4_mtt_cleanup(dev->dev, &qp->mtt); + + if (is_user) { + if (qp->rq.wqe_cnt) + mlx4_ib_db_unmap_user(to_mucontext(qp->ibqp.uobject->context), + &qp->db); + ib_umem_release(qp->umem); + } else { + kfree(qp->sq.wrid); + kfree(qp->rq.wrid); + if (qp->mlx4_ib_qp_type & (MLX4_IB_QPT_PROXY_SMI_OWNER | + MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_GSI)) + free_proxy_bufs(&dev->ib_dev, qp); + mlx4_buf_free(dev->dev, qp->buf_size, &qp->buf); + if (qp->rq.wqe_cnt) + mlx4_db_free(dev->dev, &qp->db); + } + + del_gid_entries(qp); +} + +static u32 get_sqp_num(struct mlx4_ib_dev *dev, struct ib_qp_init_attr *attr) +{ + /* Native or PPF */ + if (!mlx4_is_mfunc(dev->dev) || + (mlx4_is_master(dev->dev) && + attr->create_flags & MLX4_IB_SRIOV_SQP)) { + return dev->dev->phys_caps.base_sqpn + + (attr->qp_type == IB_QPT_SMI ? 0 : 2) + + attr->port_num - 1; + } + /* PF or VF -- creating proxies */ + if (attr->qp_type == IB_QPT_SMI) + return dev->dev->caps.qp0_proxy[attr->port_num - 1]; + else + return dev->dev->caps.qp1_proxy[attr->port_num - 1]; +} + +struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd, + struct ib_qp_init_attr *init_attr, + struct ib_udata *udata) +{ + struct mlx4_ib_qp *qp = NULL; + int err; + u16 xrcdn = 0; + gfp_t gfp; + + gfp = (init_attr->create_flags & MLX4_IB_QP_CREATE_USE_GFP_NOIO) ? + GFP_NOIO : GFP_KERNEL; + /* + * We only support LSO, vendor flag1, and multicast loopback blocking, + * and only for kernel UD QPs. + */ + if (init_attr->create_flags & ~(MLX4_IB_QP_LSO | + MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK | + MLX4_IB_SRIOV_TUNNEL_QP | + MLX4_IB_SRIOV_SQP | + MLX4_IB_QP_NETIF | + MLX4_IB_QP_CREATE_USE_GFP_NOIO)) + return ERR_PTR(-EINVAL); + + if (init_attr->create_flags & IB_QP_CREATE_NETIF_QP) { + if (init_attr->qp_type != IB_QPT_UD) + return ERR_PTR(-EINVAL); + } + + if (init_attr->create_flags && + (udata || + ((init_attr->create_flags & ~(MLX4_IB_SRIOV_SQP | MLX4_IB_QP_CREATE_USE_GFP_NOIO)) && + init_attr->qp_type != IB_QPT_UD) || + ((init_attr->create_flags & MLX4_IB_SRIOV_SQP) && + init_attr->qp_type > IB_QPT_GSI))) + return ERR_PTR(-EINVAL); + + switch (init_attr->qp_type) { + case IB_QPT_XRC_TGT: + pd = to_mxrcd(init_attr->xrcd)->pd; + xrcdn = to_mxrcd(init_attr->xrcd)->xrcdn; + init_attr->send_cq = to_mxrcd(init_attr->xrcd)->cq; + /* fall through */ + case IB_QPT_XRC_INI: + if (!(to_mdev(pd->device)->dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC)) + return ERR_PTR(-ENOSYS); + init_attr->recv_cq = init_attr->send_cq; + /* fall through */ + case IB_QPT_RC: + case IB_QPT_UC: + case IB_QPT_RAW_PACKET: + qp = kzalloc(sizeof *qp, gfp); + if (!qp) + return ERR_PTR(-ENOMEM); + qp->pri.vid = 0xFFFF; + qp->alt.vid = 0xFFFF; + /* fall through */ + case IB_QPT_UD: + { + err = create_qp_common(to_mdev(pd->device), pd, init_attr, + udata, 0, &qp, gfp); + if (err) + return ERR_PTR(err); + + qp->ibqp.qp_num = qp->mqp.qpn; + qp->xrcdn = xrcdn; + + break; + } + case IB_QPT_SMI: + case IB_QPT_GSI: + { + /* Userspace is not allowed to create special QPs: */ + if (udata) + return ERR_PTR(-EINVAL); + + err = create_qp_common(to_mdev(pd->device), pd, init_attr, udata, + get_sqp_num(to_mdev(pd->device), init_attr), + &qp, gfp); + if (err) + return ERR_PTR(err); + + qp->port = init_attr->port_num; + qp->ibqp.qp_num = init_attr->qp_type == IB_QPT_SMI ? 0 : 1; + + break; + } + default: + /* Don't support raw QPs */ + return ERR_PTR(-EINVAL); + } + + return &qp->ibqp; +} + +int mlx4_ib_destroy_qp(struct ib_qp *qp) +{ + struct mlx4_ib_dev *dev = to_mdev(qp->device); + struct mlx4_ib_qp *mqp = to_mqp(qp); + struct mlx4_ib_pd *pd; + + if (is_qp0(dev, mqp)) + mlx4_CLOSE_PORT(dev->dev, mqp->port); + + if (dev->qp1_proxy[mqp->port - 1] == mqp) { + mutex_lock(&dev->qp1_proxy_lock[mqp->port - 1]); + dev->qp1_proxy[mqp->port - 1] = NULL; + mutex_unlock(&dev->qp1_proxy_lock[mqp->port - 1]); + } + + pd = get_pd(mqp); + destroy_qp_common(dev, mqp, !!pd->ibpd.uobject); + + if (is_sqp(dev, mqp)) + kfree(to_msqp(mqp)); + else + kfree(mqp); + + return 0; +} + +static int to_mlx4_st(struct mlx4_ib_dev *dev, enum mlx4_ib_qp_type type) +{ + switch (type) { + case MLX4_IB_QPT_RC: return MLX4_QP_ST_RC; + case MLX4_IB_QPT_UC: return MLX4_QP_ST_UC; + case MLX4_IB_QPT_UD: return MLX4_QP_ST_UD; + case MLX4_IB_QPT_XRC_INI: + case MLX4_IB_QPT_XRC_TGT: return MLX4_QP_ST_XRC; + case MLX4_IB_QPT_SMI: + case MLX4_IB_QPT_GSI: + case MLX4_IB_QPT_RAW_PACKET: return MLX4_QP_ST_MLX; + + case MLX4_IB_QPT_PROXY_SMI_OWNER: + case MLX4_IB_QPT_TUN_SMI_OWNER: return (mlx4_is_mfunc(dev->dev) ? + MLX4_QP_ST_MLX : -1); + case MLX4_IB_QPT_PROXY_SMI: + case MLX4_IB_QPT_TUN_SMI: + case MLX4_IB_QPT_PROXY_GSI: + case MLX4_IB_QPT_TUN_GSI: return (mlx4_is_mfunc(dev->dev) ? + MLX4_QP_ST_UD : -1); + default: return -1; + } +} + +static __be32 to_mlx4_access_flags(struct mlx4_ib_qp *qp, const struct ib_qp_attr *attr, + int attr_mask) +{ + u8 dest_rd_atomic; + u32 access_flags; + u32 hw_access_flags = 0; + + if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) + dest_rd_atomic = attr->max_dest_rd_atomic; + else + dest_rd_atomic = qp->resp_depth; + + if (attr_mask & IB_QP_ACCESS_FLAGS) + access_flags = attr->qp_access_flags; + else + access_flags = qp->atomic_rd_en; + + if (!dest_rd_atomic) + access_flags &= IB_ACCESS_REMOTE_WRITE; + + if (access_flags & IB_ACCESS_REMOTE_READ) + hw_access_flags |= MLX4_QP_BIT_RRE; + if (access_flags & IB_ACCESS_REMOTE_ATOMIC) + hw_access_flags |= MLX4_QP_BIT_RAE; + if (access_flags & IB_ACCESS_REMOTE_WRITE) + hw_access_flags |= MLX4_QP_BIT_RWE; + + return cpu_to_be32(hw_access_flags); +} + +static void store_sqp_attrs(struct mlx4_ib_sqp *sqp, const struct ib_qp_attr *attr, + int attr_mask) +{ + if (attr_mask & IB_QP_PKEY_INDEX) + sqp->pkey_index = attr->pkey_index; + if (attr_mask & IB_QP_QKEY) + sqp->qkey = attr->qkey; + if (attr_mask & IB_QP_SQ_PSN) + sqp->send_psn = attr->sq_psn; +} + +static void mlx4_set_sched(struct mlx4_qp_path *path, u8 port) +{ + path->sched_queue = (path->sched_queue & 0xbf) | ((port - 1) << 6); +} + +static int _mlx4_set_path(struct mlx4_ib_dev *dev, const struct ib_ah_attr *ah, + u64 smac, u16 vlan_tag, struct mlx4_qp_path *path, + struct mlx4_roce_smac_vlan_info *smac_info, u8 port) +{ + int is_eth = rdma_port_get_link_layer(&dev->ib_dev, port) == + IB_LINK_LAYER_ETHERNET; + int vidx; + int smac_index; + int err; + + + path->grh_mylmc = ah->src_path_bits & 0x7f; + path->rlid = cpu_to_be16(ah->dlid); + if (ah->static_rate) { + path->static_rate = ah->static_rate + MLX4_STAT_RATE_OFFSET; + while (path->static_rate > IB_RATE_2_5_GBPS + MLX4_STAT_RATE_OFFSET && + !(1 << path->static_rate & dev->dev->caps.stat_rate_support)) + --path->static_rate; + } else + path->static_rate = 0; + + if (ah->ah_flags & IB_AH_GRH) { + if (ah->grh.sgid_index >= dev->dev->caps.gid_table_len[port]) { + pr_err("sgid_index (%u) too large. max is %d\n", + ah->grh.sgid_index, dev->dev->caps.gid_table_len[port] - 1); + return -1; + } + + path->grh_mylmc |= 1 << 7; + path->mgid_index = ah->grh.sgid_index; + path->hop_limit = ah->grh.hop_limit; + path->tclass_flowlabel = + cpu_to_be32((ah->grh.traffic_class << 20) | + (ah->grh.flow_label)); + memcpy(path->rgid, ah->grh.dgid.raw, 16); + } + + if (is_eth) { + if (!(ah->ah_flags & IB_AH_GRH)) + return -1; + + path->sched_queue = MLX4_IB_DEFAULT_SCHED_QUEUE | + ((port - 1) << 6) | ((ah->sl & 7) << 3); + + path->feup |= MLX4_FEUP_FORCE_ETH_UP; + if (vlan_tag < 0x1000) { + if (smac_info->vid < 0x1000) { + /* both valid vlan ids */ + if (smac_info->vid != vlan_tag) { + /* different VIDs. unreg old and reg new */ + err = mlx4_register_vlan(dev->dev, port, vlan_tag, &vidx); + if (err) + return err; + smac_info->candidate_vid = vlan_tag; + smac_info->candidate_vlan_index = vidx; + smac_info->candidate_vlan_port = port; + smac_info->update_vid = 1; + path->vlan_index = vidx; + } else { + path->vlan_index = smac_info->vlan_index; + } + } else { + /* no current vlan tag in qp */ + err = mlx4_register_vlan(dev->dev, port, vlan_tag, &vidx); + if (err) + return err; + smac_info->candidate_vid = vlan_tag; + smac_info->candidate_vlan_index = vidx; + smac_info->candidate_vlan_port = port; + smac_info->update_vid = 1; + path->vlan_index = vidx; + } + path->feup |= MLX4_FVL_FORCE_ETH_VLAN; + path->fl = 1 << 6; + } else { + /* have current vlan tag. unregister it at modify-qp success */ + if (smac_info->vid < 0x1000) { + smac_info->candidate_vid = 0xFFFF; + smac_info->update_vid = 1; + } + } + + /* get smac_index for RoCE use. + * If no smac was yet assigned, register one. + * If one was already assigned, but the new mac differs, + * unregister the old one and register the new one. + */ + if ((!smac_info->smac && !smac_info->smac_port) || + smac_info->smac != smac) { + /* register candidate now, unreg if needed, after success */ + smac_index = mlx4_register_mac(dev->dev, port, smac); + if (smac_index >= 0) { + smac_info->candidate_smac_index = smac_index; + smac_info->candidate_smac = smac; + smac_info->candidate_smac_port = port; + } else { + return -EINVAL; + } + } else { + smac_index = smac_info->smac_index; + } + + memcpy(path->dmac, ah->dmac, 6); + path->ackto = MLX4_IB_LINK_TYPE_ETH; + /* put MAC table smac index for IBoE */ + path->grh_mylmc = (u8) (smac_index) | 0x80; + } else { + path->sched_queue = MLX4_IB_DEFAULT_SCHED_QUEUE | + ((port - 1) << 6) | ((ah->sl & 0xf) << 2); + } + + return 0; +} + +static int mlx4_set_path(struct mlx4_ib_dev *dev, const struct ib_qp_attr *qp, + enum ib_qp_attr_mask qp_attr_mask, + struct mlx4_ib_qp *mqp, + struct mlx4_qp_path *path, u8 port) +{ + return _mlx4_set_path(dev, &qp->ah_attr, + mlx4_mac_to_u64((u8 *)qp->smac), + (qp_attr_mask & IB_QP_VID) ? qp->vlan_id : 0xffff, + path, &mqp->pri, port); +} + +static int mlx4_set_alt_path(struct mlx4_ib_dev *dev, + const struct ib_qp_attr *qp, + enum ib_qp_attr_mask qp_attr_mask, + struct mlx4_ib_qp *mqp, + struct mlx4_qp_path *path, u8 port) +{ + return _mlx4_set_path(dev, &qp->alt_ah_attr, + mlx4_mac_to_u64((u8 *)qp->alt_smac), + (qp_attr_mask & IB_QP_ALT_VID) ? + qp->alt_vlan_id : 0xffff, + path, &mqp->alt, port); +} + +static void update_mcg_macs(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp) +{ + struct mlx4_ib_gid_entry *ge, *tmp; + + list_for_each_entry_safe(ge, tmp, &qp->gid_list, list) { + if (!ge->added && mlx4_ib_add_mc(dev, qp, &ge->gid)) { + ge->added = 1; + ge->port = qp->port; + } + } +} + +static int handle_eth_ud_smac_index(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp, u8 *smac, + struct mlx4_qp_context *context) +{ + u64 u64_mac; + int smac_index; + + u64_mac = atomic64_read(&dev->iboe.mac[qp->port - 1]); + + context->pri_path.sched_queue = MLX4_IB_DEFAULT_SCHED_QUEUE | ((qp->port - 1) << 6); + if (!qp->pri.smac && !qp->pri.smac_port) { + smac_index = mlx4_register_mac(dev->dev, qp->port, u64_mac); + if (smac_index >= 0) { + qp->pri.candidate_smac_index = smac_index; + qp->pri.candidate_smac = u64_mac; + qp->pri.candidate_smac_port = qp->port; + context->pri_path.grh_mylmc = 0x80 | (u8) smac_index; + } else { + return -ENOENT; + } + } + return 0; +} + +static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, + const struct ib_qp_attr *attr, int attr_mask, + enum ib_qp_state cur_state, enum ib_qp_state new_state) +{ + struct mlx4_ib_dev *dev = to_mdev(ibqp->device); + struct mlx4_ib_qp *qp = to_mqp(ibqp); + struct mlx4_ib_pd *pd; + struct mlx4_ib_cq *send_cq, *recv_cq; + struct mlx4_qp_context *context; + enum mlx4_qp_optpar optpar = 0; + int sqd_event; + int steer_qp = 0; + int err = -EINVAL; + + /* APM is not supported under RoCE */ + if (attr_mask & IB_QP_ALT_PATH && + rdma_port_get_link_layer(&dev->ib_dev, qp->port) == + IB_LINK_LAYER_ETHERNET) + return -ENOTSUPP; + + context = kzalloc(sizeof *context, GFP_KERNEL); + if (!context) + return -ENOMEM; + + context->flags = cpu_to_be32((to_mlx4_state(new_state) << 28) | + (to_mlx4_st(dev, qp->mlx4_ib_qp_type) << 16)); + + if (!(attr_mask & IB_QP_PATH_MIG_STATE)) + context->flags |= cpu_to_be32(MLX4_QP_PM_MIGRATED << 11); + else { + optpar |= MLX4_QP_OPTPAR_PM_STATE; + switch (attr->path_mig_state) { + case IB_MIG_MIGRATED: + context->flags |= cpu_to_be32(MLX4_QP_PM_MIGRATED << 11); + break; + case IB_MIG_REARM: + context->flags |= cpu_to_be32(MLX4_QP_PM_REARM << 11); + break; + case IB_MIG_ARMED: + context->flags |= cpu_to_be32(MLX4_QP_PM_ARMED << 11); + break; + } + } + + if (ibqp->qp_type == IB_QPT_GSI || ibqp->qp_type == IB_QPT_SMI) + context->mtu_msgmax = (IB_MTU_4096 << 5) | 11; + else if (ibqp->qp_type == IB_QPT_RAW_PACKET) + context->mtu_msgmax = (MLX4_RAW_QP_MTU << 5) | MLX4_RAW_QP_MSGMAX; + else if (ibqp->qp_type == IB_QPT_UD) { + if (qp->flags & MLX4_IB_QP_LSO) + context->mtu_msgmax = (IB_MTU_4096 << 5) | + ilog2(dev->dev->caps.max_gso_sz); + else + context->mtu_msgmax = (IB_MTU_4096 << 5) | 12; + } else if (attr_mask & IB_QP_PATH_MTU) { + if (attr->path_mtu < IB_MTU_256 || attr->path_mtu > IB_MTU_4096) { + pr_err("path MTU (%u) is invalid\n", + attr->path_mtu); + goto out; + } + context->mtu_msgmax = (attr->path_mtu << 5) | + ilog2(dev->dev->caps.max_msg_sz); + } + + if (qp->rq.wqe_cnt) + context->rq_size_stride = ilog2(qp->rq.wqe_cnt) << 3; + context->rq_size_stride |= qp->rq.wqe_shift - 4; + + if (qp->sq.wqe_cnt) + context->sq_size_stride = ilog2(qp->sq.wqe_cnt) << 3; + context->sq_size_stride |= qp->sq.wqe_shift - 4; + + if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) { + context->sq_size_stride |= !!qp->sq_no_prefetch << 7; + context->xrcd = cpu_to_be32((u32) qp->xrcdn); + if (ibqp->qp_type == IB_QPT_RAW_PACKET) + context->param3 |= cpu_to_be32(1 << 30); + } + + if (qp->ibqp.uobject) + context->usr_page = cpu_to_be32(to_mucontext(ibqp->uobject->context)->uar.index); + else + context->usr_page = cpu_to_be32(dev->priv_uar.index); + + if (attr_mask & IB_QP_DEST_QPN) + context->remote_qpn = cpu_to_be32(attr->dest_qp_num); + + if (attr_mask & IB_QP_PORT) { + if (cur_state == IB_QPS_SQD && new_state == IB_QPS_SQD && + !(attr_mask & IB_QP_AV)) { + mlx4_set_sched(&context->pri_path, attr->port_num); + optpar |= MLX4_QP_OPTPAR_SCHED_QUEUE; + } + } + + if (cur_state == IB_QPS_INIT && new_state == IB_QPS_RTR) { + if (dev->counters[qp->port - 1] != -1) { + context->pri_path.counter_index = + dev->counters[qp->port - 1]; + optpar |= MLX4_QP_OPTPAR_COUNTER_INDEX; + } else + context->pri_path.counter_index = 0xff; + + if (qp->flags & MLX4_IB_QP_NETIF) { + mlx4_ib_steer_qp_reg(dev, qp, 1); + steer_qp = 1; + } + } + + if (attr_mask & IB_QP_PKEY_INDEX) { + if (qp->mlx4_ib_qp_type & MLX4_IB_QPT_ANY_SRIOV) + context->pri_path.disable_pkey_check = 0x40; + context->pri_path.pkey_index = attr->pkey_index; + optpar |= MLX4_QP_OPTPAR_PKEY_INDEX; + } + + if (attr_mask & IB_QP_AV) { + if (mlx4_set_path(dev, attr, attr_mask, qp, &context->pri_path, + attr_mask & IB_QP_PORT ? + attr->port_num : qp->port)) + goto out; + + optpar |= (MLX4_QP_OPTPAR_PRIMARY_ADDR_PATH | + MLX4_QP_OPTPAR_SCHED_QUEUE); + } + + if (attr_mask & IB_QP_TIMEOUT) { + context->pri_path.ackto |= attr->timeout << 3; + optpar |= MLX4_QP_OPTPAR_ACK_TIMEOUT; + } + + if (attr_mask & IB_QP_ALT_PATH) { + if (attr->alt_port_num == 0 || + attr->alt_port_num > dev->dev->caps.num_ports) + goto out; + + if (attr->alt_pkey_index >= + dev->dev->caps.pkey_table_len[attr->alt_port_num]) + goto out; + + if (mlx4_set_alt_path(dev, attr, attr_mask, qp, + &context->alt_path, + attr->alt_port_num)) + goto out; + + context->alt_path.pkey_index = attr->alt_pkey_index; + context->alt_path.ackto = attr->alt_timeout << 3; + optpar |= MLX4_QP_OPTPAR_ALT_ADDR_PATH; + } + + pd = get_pd(qp); + get_cqs(qp, &send_cq, &recv_cq); + context->pd = cpu_to_be32(pd->pdn); + context->cqn_send = cpu_to_be32(send_cq->mcq.cqn); + context->cqn_recv = cpu_to_be32(recv_cq->mcq.cqn); + context->params1 = cpu_to_be32(MLX4_IB_ACK_REQ_FREQ << 28); + + /* Set "fast registration enabled" for all kernel QPs */ + if (!qp->ibqp.uobject) + context->params1 |= cpu_to_be32(1 << 11); + + if (attr_mask & IB_QP_RNR_RETRY) { + context->params1 |= cpu_to_be32(attr->rnr_retry << 13); + optpar |= MLX4_QP_OPTPAR_RNR_RETRY; + } + + if (attr_mask & IB_QP_RETRY_CNT) { + context->params1 |= cpu_to_be32(attr->retry_cnt << 16); + optpar |= MLX4_QP_OPTPAR_RETRY_COUNT; + } + + if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC) { + if (attr->max_rd_atomic) + context->params1 |= + cpu_to_be32(fls(attr->max_rd_atomic - 1) << 21); + optpar |= MLX4_QP_OPTPAR_SRA_MAX; + } + + if (attr_mask & IB_QP_SQ_PSN) + context->next_send_psn = cpu_to_be32(attr->sq_psn); + + if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) { + if (attr->max_dest_rd_atomic) + context->params2 |= + cpu_to_be32(fls(attr->max_dest_rd_atomic - 1) << 21); + optpar |= MLX4_QP_OPTPAR_RRA_MAX; + } + + if (attr_mask & (IB_QP_ACCESS_FLAGS | IB_QP_MAX_DEST_RD_ATOMIC)) { + context->params2 |= to_mlx4_access_flags(qp, attr, attr_mask); + optpar |= MLX4_QP_OPTPAR_RWE | MLX4_QP_OPTPAR_RRE | MLX4_QP_OPTPAR_RAE; + } + + if (ibqp->srq) + context->params2 |= cpu_to_be32(MLX4_QP_BIT_RIC); + + if (attr_mask & IB_QP_MIN_RNR_TIMER) { + context->rnr_nextrecvpsn |= cpu_to_be32(attr->min_rnr_timer << 24); + optpar |= MLX4_QP_OPTPAR_RNR_TIMEOUT; + } + if (attr_mask & IB_QP_RQ_PSN) + context->rnr_nextrecvpsn |= cpu_to_be32(attr->rq_psn); + + /* proxy and tunnel qp qkeys will be changed in modify-qp wrappers */ + if (attr_mask & IB_QP_QKEY) { + if (qp->mlx4_ib_qp_type & + (MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_TUN_SMI_OWNER)) + context->qkey = cpu_to_be32(IB_QP_SET_QKEY); + else { + if (mlx4_is_mfunc(dev->dev) && + !(qp->mlx4_ib_qp_type & MLX4_IB_QPT_ANY_SRIOV) && + (attr->qkey & MLX4_RESERVED_QKEY_MASK) == + MLX4_RESERVED_QKEY_BASE) { + pr_err("Cannot use reserved QKEY" + " 0x%x (range 0xffff0000..0xffffffff" + " is reserved)\n", attr->qkey); + err = -EINVAL; + goto out; + } + context->qkey = cpu_to_be32(attr->qkey); + } + optpar |= MLX4_QP_OPTPAR_Q_KEY; + } + + if (ibqp->srq) + context->srqn = cpu_to_be32(1 << 24 | to_msrq(ibqp->srq)->msrq.srqn); + + if (qp->rq.wqe_cnt && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) + context->db_rec_addr = cpu_to_be64(qp->db.dma); + + if (cur_state == IB_QPS_INIT && + new_state == IB_QPS_RTR && + (ibqp->qp_type == IB_QPT_GSI || ibqp->qp_type == IB_QPT_SMI || + ibqp->qp_type == IB_QPT_UD || + ibqp->qp_type == IB_QPT_RAW_PACKET)) { + context->pri_path.sched_queue = (qp->port - 1) << 6; + if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_SMI || + qp->mlx4_ib_qp_type & + (MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_TUN_SMI_OWNER)) { + context->pri_path.sched_queue |= MLX4_IB_DEFAULT_QP0_SCHED_QUEUE; + if (qp->mlx4_ib_qp_type != MLX4_IB_QPT_SMI) + context->pri_path.fl = 0x80; + } else { + if (qp->mlx4_ib_qp_type & MLX4_IB_QPT_ANY_SRIOV) + context->pri_path.fl = 0x80; + context->pri_path.sched_queue |= MLX4_IB_DEFAULT_SCHED_QUEUE; + } + if (rdma_port_get_link_layer(&dev->ib_dev, qp->port) == + IB_LINK_LAYER_ETHERNET) { + if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_TUN_GSI || + qp->mlx4_ib_qp_type == MLX4_IB_QPT_GSI) + context->pri_path.feup = 1 << 7; /* don't fsm */ + /* handle smac_index */ + if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_UD || + qp->mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_GSI || + qp->mlx4_ib_qp_type == MLX4_IB_QPT_TUN_GSI) { + err = handle_eth_ud_smac_index(dev, qp, (u8 *)attr->smac, context); + if (err) + return -EINVAL; + if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_GSI) + dev->qp1_proxy[qp->port - 1] = qp; + } + } + } + + if (qp->ibqp.qp_type == IB_QPT_RAW_PACKET) { + context->pri_path.ackto = (context->pri_path.ackto & 0xf8) | + MLX4_IB_LINK_TYPE_ETH; + if (dev->dev->caps.tunnel_offload_mode == MLX4_TUNNEL_OFFLOAD_MODE_VXLAN) { + /* set QP to receive both tunneled & non-tunneled packets */ + if (!(context->flags & cpu_to_be32(1 << MLX4_RSS_QPC_FLAG_OFFSET))) + context->srqn = cpu_to_be32(7 << 28); + } + } + + if (ibqp->qp_type == IB_QPT_UD && (new_state == IB_QPS_RTR)) { + int is_eth = rdma_port_get_link_layer( + &dev->ib_dev, qp->port) == + IB_LINK_LAYER_ETHERNET; + if (is_eth) { + context->pri_path.ackto = MLX4_IB_LINK_TYPE_ETH; + optpar |= MLX4_QP_OPTPAR_PRIMARY_ADDR_PATH; + } + } + + + if (cur_state == IB_QPS_RTS && new_state == IB_QPS_SQD && + attr_mask & IB_QP_EN_SQD_ASYNC_NOTIFY && attr->en_sqd_async_notify) + sqd_event = 1; + else + sqd_event = 0; + + if (!ibqp->uobject && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) + context->rlkey |= (1 << 4); + + /* + * Before passing a kernel QP to the HW, make sure that the + * ownership bits of the send queue are set and the SQ + * headroom is stamped so that the hardware doesn't start + * processing stale work requests. + */ + if (!ibqp->uobject && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) { + struct mlx4_wqe_ctrl_seg *ctrl; + int i; + + for (i = 0; i < qp->sq.wqe_cnt; ++i) { + ctrl = get_send_wqe(qp, i); + ctrl->owner_opcode = cpu_to_be32(1 << 31); + if (qp->sq_max_wqes_per_wr == 1) + ctrl->fence_size = 1 << (qp->sq.wqe_shift - 4); + + stamp_send_wqe(qp, i, 1 << qp->sq.wqe_shift); + } + } + + err = mlx4_qp_modify(dev->dev, &qp->mtt, to_mlx4_state(cur_state), + to_mlx4_state(new_state), context, optpar, + sqd_event, &qp->mqp); + if (err) + goto out; + + qp->state = new_state; + + if (attr_mask & IB_QP_ACCESS_FLAGS) + qp->atomic_rd_en = attr->qp_access_flags; + if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) + qp->resp_depth = attr->max_dest_rd_atomic; + if (attr_mask & IB_QP_PORT) { + qp->port = attr->port_num; + update_mcg_macs(dev, qp); + } + if (attr_mask & IB_QP_ALT_PATH) + qp->alt_port = attr->alt_port_num; + + if (is_sqp(dev, qp)) + store_sqp_attrs(to_msqp(qp), attr, attr_mask); + + /* + * If we moved QP0 to RTR, bring the IB link up; if we moved + * QP0 to RESET or ERROR, bring the link back down. + */ + if (is_qp0(dev, qp)) { + if (cur_state != IB_QPS_RTR && new_state == IB_QPS_RTR) + if (mlx4_INIT_PORT(dev->dev, qp->port)) + pr_warn("INIT_PORT failed for port %d\n", + qp->port); + + if (cur_state != IB_QPS_RESET && cur_state != IB_QPS_ERR && + (new_state == IB_QPS_RESET || new_state == IB_QPS_ERR)) + mlx4_CLOSE_PORT(dev->dev, qp->port); + } + + /* + * If we moved a kernel QP to RESET, clean up all old CQ + * entries and reinitialize the QP. + */ + if (new_state == IB_QPS_RESET) { + if (!ibqp->uobject) { + mlx4_ib_cq_clean(recv_cq, qp->mqp.qpn, + ibqp->srq ? to_msrq(ibqp->srq) : NULL); + if (send_cq != recv_cq) + mlx4_ib_cq_clean(send_cq, qp->mqp.qpn, NULL); + + qp->rq.head = 0; + qp->rq.tail = 0; + qp->sq.head = 0; + qp->sq.tail = 0; + qp->sq_next_wqe = 0; + if (qp->rq.wqe_cnt) + *qp->db.db = 0; + + if (qp->flags & MLX4_IB_QP_NETIF) + mlx4_ib_steer_qp_reg(dev, qp, 0); + } + if (qp->pri.smac || (!qp->pri.smac && qp->pri.smac_port)) { + mlx4_unregister_mac(dev->dev, qp->pri.smac_port, qp->pri.smac); + qp->pri.smac = 0; + qp->pri.smac_port = 0; + } + if (qp->alt.smac) { + mlx4_unregister_mac(dev->dev, qp->alt.smac_port, qp->alt.smac); + qp->alt.smac = 0; + } + if (qp->pri.vid < 0x1000) { + mlx4_unregister_vlan(dev->dev, qp->pri.vlan_port, qp->pri.vid); + qp->pri.vid = 0xFFFF; + qp->pri.candidate_vid = 0xFFFF; + qp->pri.update_vid = 0; + } + + if (qp->alt.vid < 0x1000) { + mlx4_unregister_vlan(dev->dev, qp->alt.vlan_port, qp->alt.vid); + qp->alt.vid = 0xFFFF; + qp->alt.candidate_vid = 0xFFFF; + qp->alt.update_vid = 0; + } + } +out: + if (err && steer_qp) + mlx4_ib_steer_qp_reg(dev, qp, 0); + kfree(context); + if (qp->pri.candidate_smac || + (!qp->pri.candidate_smac && qp->pri.candidate_smac_port)) { + if (err) { + mlx4_unregister_mac(dev->dev, qp->pri.candidate_smac_port, qp->pri.candidate_smac); + } else { + if (qp->pri.smac || (!qp->pri.smac && qp->pri.smac_port)) + mlx4_unregister_mac(dev->dev, qp->pri.smac_port, qp->pri.smac); + qp->pri.smac = qp->pri.candidate_smac; + qp->pri.smac_index = qp->pri.candidate_smac_index; + qp->pri.smac_port = qp->pri.candidate_smac_port; + } + qp->pri.candidate_smac = 0; + qp->pri.candidate_smac_index = 0; + qp->pri.candidate_smac_port = 0; + } + if (qp->alt.candidate_smac) { + if (err) { + mlx4_unregister_mac(dev->dev, qp->alt.candidate_smac_port, qp->alt.candidate_smac); + } else { + if (qp->alt.smac) + mlx4_unregister_mac(dev->dev, qp->alt.smac_port, qp->alt.smac); + qp->alt.smac = qp->alt.candidate_smac; + qp->alt.smac_index = qp->alt.candidate_smac_index; + qp->alt.smac_port = qp->alt.candidate_smac_port; + } + qp->alt.candidate_smac = 0; + qp->alt.candidate_smac_index = 0; + qp->alt.candidate_smac_port = 0; + } + + if (qp->pri.update_vid) { + if (err) { + if (qp->pri.candidate_vid < 0x1000) + mlx4_unregister_vlan(dev->dev, qp->pri.candidate_vlan_port, + qp->pri.candidate_vid); + } else { + if (qp->pri.vid < 0x1000) + mlx4_unregister_vlan(dev->dev, qp->pri.vlan_port, + qp->pri.vid); + qp->pri.vid = qp->pri.candidate_vid; + qp->pri.vlan_port = qp->pri.candidate_vlan_port; + qp->pri.vlan_index = qp->pri.candidate_vlan_index; + } + qp->pri.candidate_vid = 0xFFFF; + qp->pri.update_vid = 0; + } + + if (qp->alt.update_vid) { + if (err) { + if (qp->alt.candidate_vid < 0x1000) + mlx4_unregister_vlan(dev->dev, qp->alt.candidate_vlan_port, + qp->alt.candidate_vid); + } else { + if (qp->alt.vid < 0x1000) + mlx4_unregister_vlan(dev->dev, qp->alt.vlan_port, + qp->alt.vid); + qp->alt.vid = qp->alt.candidate_vid; + qp->alt.vlan_port = qp->alt.candidate_vlan_port; + qp->alt.vlan_index = qp->alt.candidate_vlan_index; + } + qp->alt.candidate_vid = 0xFFFF; + qp->alt.update_vid = 0; + } + + return err; +} + +int mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata) +{ + struct mlx4_ib_dev *dev = to_mdev(ibqp->device); + struct mlx4_ib_qp *qp = to_mqp(ibqp); + enum ib_qp_state cur_state, new_state; + int err = -EINVAL; + int ll; + mutex_lock(&qp->mutex); + + cur_state = attr_mask & IB_QP_CUR_STATE ? attr->cur_qp_state : qp->state; + new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state; + + if (cur_state == new_state && cur_state == IB_QPS_RESET) { + ll = IB_LINK_LAYER_UNSPECIFIED; + } else { + int port = attr_mask & IB_QP_PORT ? attr->port_num : qp->port; + ll = rdma_port_get_link_layer(&dev->ib_dev, port); + } + + if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, + attr_mask, ll)) { + pr_debug("qpn 0x%x: invalid attribute mask specified " + "for transition %d to %d. qp_type %d," + " attr_mask 0x%x\n", + ibqp->qp_num, cur_state, new_state, + ibqp->qp_type, attr_mask); + goto out; + } + + if ((attr_mask & IB_QP_PORT) && + (attr->port_num == 0 || attr->port_num > dev->num_ports)) { + pr_debug("qpn 0x%x: invalid port number (%d) specified " + "for transition %d to %d. qp_type %d\n", + ibqp->qp_num, attr->port_num, cur_state, + new_state, ibqp->qp_type); + goto out; + } + + if ((attr_mask & IB_QP_PORT) && (ibqp->qp_type == IB_QPT_RAW_PACKET) && + (rdma_port_get_link_layer(&dev->ib_dev, attr->port_num) != + IB_LINK_LAYER_ETHERNET)) + goto out; + + if (attr_mask & IB_QP_PKEY_INDEX) { + int p = attr_mask & IB_QP_PORT ? attr->port_num : qp->port; + if (attr->pkey_index >= dev->dev->caps.pkey_table_len[p]) { + pr_debug("qpn 0x%x: invalid pkey index (%d) specified " + "for transition %d to %d. qp_type %d\n", + ibqp->qp_num, attr->pkey_index, cur_state, + new_state, ibqp->qp_type); + goto out; + } + } + + if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC && + attr->max_rd_atomic > dev->dev->caps.max_qp_init_rdma) { + pr_debug("qpn 0x%x: max_rd_atomic (%d) too large. " + "Transition %d to %d. qp_type %d\n", + ibqp->qp_num, attr->max_rd_atomic, cur_state, + new_state, ibqp->qp_type); + goto out; + } + + if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC && + attr->max_dest_rd_atomic > dev->dev->caps.max_qp_dest_rdma) { + pr_debug("qpn 0x%x: max_dest_rd_atomic (%d) too large. " + "Transition %d to %d. qp_type %d\n", + ibqp->qp_num, attr->max_dest_rd_atomic, cur_state, + new_state, ibqp->qp_type); + goto out; + } + + if (cur_state == new_state && cur_state == IB_QPS_RESET) { + err = 0; + goto out; + } + + err = __mlx4_ib_modify_qp(ibqp, attr, attr_mask, cur_state, new_state); + +out: + mutex_unlock(&qp->mutex); + return err; +} + +static int vf_get_qp0_qkey(struct mlx4_dev *dev, int qpn, u32 *qkey) +{ + int i; + for (i = 0; i < dev->caps.num_ports; i++) { + if (qpn == dev->caps.qp0_proxy[i] || + qpn == dev->caps.qp0_tunnel[i]) { + *qkey = dev->caps.qp0_qkey[i]; + return 0; + } + } + return -EINVAL; +} + +static int build_sriov_qp0_header(struct mlx4_ib_sqp *sqp, + struct ib_send_wr *wr, + void *wqe, unsigned *mlx_seg_len) +{ + struct mlx4_ib_dev *mdev = to_mdev(sqp->qp.ibqp.device); + struct ib_device *ib_dev = &mdev->ib_dev; + struct mlx4_wqe_mlx_seg *mlx = wqe; + struct mlx4_wqe_inline_seg *inl = wqe + sizeof *mlx; + struct mlx4_ib_ah *ah = to_mah(wr->wr.ud.ah); + u16 pkey; + u32 qkey; + int send_size; + int header_size; + int spc; + int i; + + if (wr->opcode != IB_WR_SEND) + return -EINVAL; + + send_size = 0; + + for (i = 0; i < wr->num_sge; ++i) + send_size += wr->sg_list[i].length; + + /* for proxy-qp0 sends, need to add in size of tunnel header */ + /* for tunnel-qp0 sends, tunnel header is already in s/g list */ + if (sqp->qp.mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_SMI_OWNER) + send_size += sizeof (struct mlx4_ib_tunnel_header); + + ib_ud_header_init(send_size, 1, 0, 0, 0, 0, &sqp->ud_header); + + if (sqp->qp.mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_SMI_OWNER) { + sqp->ud_header.lrh.service_level = + be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 28; + sqp->ud_header.lrh.destination_lid = + cpu_to_be16(ah->av.ib.g_slid & 0x7f); + sqp->ud_header.lrh.source_lid = + cpu_to_be16(ah->av.ib.g_slid & 0x7f); + } + + mlx->flags &= cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE); + + /* force loopback */ + mlx->flags |= cpu_to_be32(MLX4_WQE_MLX_VL15 | 0x1 | MLX4_WQE_MLX_SLR); + mlx->rlid = sqp->ud_header.lrh.destination_lid; + + sqp->ud_header.lrh.virtual_lane = 0; + sqp->ud_header.bth.solicited_event = !!(wr->send_flags & IB_SEND_SOLICITED); + ib_get_cached_pkey(ib_dev, sqp->qp.port, 0, &pkey); + sqp->ud_header.bth.pkey = cpu_to_be16(pkey); + if (sqp->qp.mlx4_ib_qp_type == MLX4_IB_QPT_TUN_SMI_OWNER) + sqp->ud_header.bth.destination_qpn = cpu_to_be32(wr->wr.ud.remote_qpn); + else + sqp->ud_header.bth.destination_qpn = + cpu_to_be32(mdev->dev->caps.qp0_tunnel[sqp->qp.port - 1]); + + sqp->ud_header.bth.psn = cpu_to_be32((sqp->send_psn++) & ((1 << 24) - 1)); + if (mlx4_is_master(mdev->dev)) { + if (mlx4_get_parav_qkey(mdev->dev, sqp->qp.mqp.qpn, &qkey)) + return -EINVAL; + } else { + if (vf_get_qp0_qkey(mdev->dev, sqp->qp.mqp.qpn, &qkey)) + return -EINVAL; + } + sqp->ud_header.deth.qkey = cpu_to_be32(qkey); + sqp->ud_header.deth.source_qpn = cpu_to_be32(sqp->qp.mqp.qpn); + + sqp->ud_header.bth.opcode = IB_OPCODE_UD_SEND_ONLY; + sqp->ud_header.immediate_present = 0; + + header_size = ib_ud_header_pack(&sqp->ud_header, sqp->header_buf); + + /* + * Inline data segments may not cross a 64 byte boundary. If + * our UD header is bigger than the space available up to the + * next 64 byte boundary in the WQE, use two inline data + * segments to hold the UD header. + */ + spc = MLX4_INLINE_ALIGN - + ((unsigned long) (inl + 1) & (MLX4_INLINE_ALIGN - 1)); + if (header_size <= spc) { + inl->byte_count = cpu_to_be32(1 << 31 | header_size); + memcpy(inl + 1, sqp->header_buf, header_size); + i = 1; + } else { + inl->byte_count = cpu_to_be32(1 << 31 | spc); + memcpy(inl + 1, sqp->header_buf, spc); + + inl = (void *) (inl + 1) + spc; + memcpy(inl + 1, sqp->header_buf + spc, header_size - spc); + /* + * Need a barrier here to make sure all the data is + * visible before the byte_count field is set. + * Otherwise the HCA prefetcher could grab the 64-byte + * chunk with this inline segment and get a valid (!= + * 0xffffffff) byte count but stale data, and end up + * generating a packet with bad headers. + * + * The first inline segment's byte_count field doesn't + * need a barrier, because it comes after a + * control/MLX segment and therefore is at an offset + * of 16 mod 64. + */ + wmb(); + inl->byte_count = cpu_to_be32(1 << 31 | (header_size - spc)); + i = 2; + } + + *mlx_seg_len = + ALIGN(i * sizeof (struct mlx4_wqe_inline_seg) + header_size, 16); + return 0; +} + +static void mlx4_u64_to_smac(u8 *dst_mac, u64 src_mac) +{ + int i; + + for (i = ETH_ALEN; i; i--) { + dst_mac[i - 1] = src_mac & 0xff; + src_mac >>= 8; + } +} + +static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr, + void *wqe, unsigned *mlx_seg_len) +{ + struct ib_device *ib_dev = sqp->qp.ibqp.device; + struct mlx4_wqe_mlx_seg *mlx = wqe; + struct mlx4_wqe_ctrl_seg *ctrl = wqe; + struct mlx4_wqe_inline_seg *inl = wqe + sizeof *mlx; + struct mlx4_ib_ah *ah = to_mah(wr->wr.ud.ah); + union ib_gid sgid; + u16 pkey; + int send_size; + int header_size; + int spc; + int i; + int err = 0; + u16 vlan = 0xffff; + bool is_eth; + bool is_vlan = false; + bool is_grh; + + send_size = 0; + for (i = 0; i < wr->num_sge; ++i) + send_size += wr->sg_list[i].length; + + is_eth = rdma_port_get_link_layer(sqp->qp.ibqp.device, sqp->qp.port) == IB_LINK_LAYER_ETHERNET; + is_grh = mlx4_ib_ah_grh_present(ah); + if (is_eth) { + if (mlx4_is_mfunc(to_mdev(ib_dev)->dev)) { + /* When multi-function is enabled, the ib_core gid + * indexes don't necessarily match the hw ones, so + * we must use our own cache */ + err = mlx4_get_roce_gid_from_slave(to_mdev(ib_dev)->dev, + be32_to_cpu(ah->av.ib.port_pd) >> 24, + ah->av.ib.gid_index, &sgid.raw[0]); + if (err) + return err; + } else { + err = ib_get_cached_gid(ib_dev, + be32_to_cpu(ah->av.ib.port_pd) >> 24, + ah->av.ib.gid_index, &sgid); + if (err) + return err; + } + + if (ah->av.eth.vlan != cpu_to_be16(0xffff)) { + vlan = be16_to_cpu(ah->av.eth.vlan) & 0x0fff; + is_vlan = 1; + } + } + ib_ud_header_init(send_size, !is_eth, is_eth, is_vlan, is_grh, 0, &sqp->ud_header); + + if (!is_eth) { + sqp->ud_header.lrh.service_level = + be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 28; + sqp->ud_header.lrh.destination_lid = ah->av.ib.dlid; + sqp->ud_header.lrh.source_lid = cpu_to_be16(ah->av.ib.g_slid & 0x7f); + } + + if (is_grh) { + sqp->ud_header.grh.traffic_class = + (be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 20) & 0xff; + sqp->ud_header.grh.flow_label = + ah->av.ib.sl_tclass_flowlabel & cpu_to_be32(0xfffff); + sqp->ud_header.grh.hop_limit = ah->av.ib.hop_limit; + if (is_eth) + memcpy(sqp->ud_header.grh.source_gid.raw, sgid.raw, 16); + else { + if (mlx4_is_mfunc(to_mdev(ib_dev)->dev)) { + /* When multi-function is enabled, the ib_core gid + * indexes don't necessarily match the hw ones, so + * we must use our own cache */ + sqp->ud_header.grh.source_gid.global.subnet_prefix = + to_mdev(ib_dev)->sriov.demux[sqp->qp.port - 1]. + subnet_prefix; + sqp->ud_header.grh.source_gid.global.interface_id = + to_mdev(ib_dev)->sriov.demux[sqp->qp.port - 1]. + guid_cache[ah->av.ib.gid_index]; + } else + ib_get_cached_gid(ib_dev, + be32_to_cpu(ah->av.ib.port_pd) >> 24, + ah->av.ib.gid_index, + &sqp->ud_header.grh.source_gid); + } + memcpy(sqp->ud_header.grh.destination_gid.raw, + ah->av.ib.dgid, 16); + } + + mlx->flags &= cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE); + + if (!is_eth) { + mlx->flags |= cpu_to_be32((!sqp->qp.ibqp.qp_num ? MLX4_WQE_MLX_VL15 : 0) | + (sqp->ud_header.lrh.destination_lid == + IB_LID_PERMISSIVE ? MLX4_WQE_MLX_SLR : 0) | + (sqp->ud_header.lrh.service_level << 8)); + if (ah->av.ib.port_pd & cpu_to_be32(0x80000000)) + mlx->flags |= cpu_to_be32(0x1); /* force loopback */ + mlx->rlid = sqp->ud_header.lrh.destination_lid; + } + + switch (wr->opcode) { + case IB_WR_SEND: + sqp->ud_header.bth.opcode = IB_OPCODE_UD_SEND_ONLY; + sqp->ud_header.immediate_present = 0; + break; + case IB_WR_SEND_WITH_IMM: + sqp->ud_header.bth.opcode = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE; + sqp->ud_header.immediate_present = 1; + sqp->ud_header.immediate_data = wr->ex.imm_data; + break; + default: + return -EINVAL; + } + + if (is_eth) { + struct in6_addr in6; + + u16 pcp = (be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 29) << 13; + + mlx->sched_prio = cpu_to_be16(pcp); + + memcpy(sqp->ud_header.eth.dmac_h, ah->av.eth.mac, 6); + /* FIXME: cache smac value? */ + memcpy(&ctrl->srcrb_flags16[0], ah->av.eth.mac, 2); + memcpy(&ctrl->imm, ah->av.eth.mac + 2, 4); + memcpy(&in6, sgid.raw, sizeof(in6)); + + if (!mlx4_is_mfunc(to_mdev(ib_dev)->dev)) { + u64 mac = atomic64_read(&to_mdev(ib_dev)->iboe.mac[sqp->qp.port - 1]); + u8 smac[ETH_ALEN]; + + mlx4_u64_to_smac(smac, mac); + memcpy(sqp->ud_header.eth.smac_h, smac, ETH_ALEN); + } else { + /* use the src mac of the tunnel */ + memcpy(sqp->ud_header.eth.smac_h, ah->av.eth.s_mac, ETH_ALEN); + } + + if (!memcmp(sqp->ud_header.eth.smac_h, sqp->ud_header.eth.dmac_h, 6)) + mlx->flags |= cpu_to_be32(MLX4_WQE_CTRL_FORCE_LOOPBACK); + if (!is_vlan) { + sqp->ud_header.eth.type = cpu_to_be16(MLX4_IB_IBOE_ETHERTYPE); + } else { + sqp->ud_header.vlan.type = cpu_to_be16(MLX4_IB_IBOE_ETHERTYPE); + sqp->ud_header.vlan.tag = cpu_to_be16(vlan | pcp); + } + } else { + sqp->ud_header.lrh.virtual_lane = !sqp->qp.ibqp.qp_num ? 15 : 0; + if (sqp->ud_header.lrh.destination_lid == IB_LID_PERMISSIVE) + sqp->ud_header.lrh.source_lid = IB_LID_PERMISSIVE; + } + sqp->ud_header.bth.solicited_event = !!(wr->send_flags & IB_SEND_SOLICITED); + if (!sqp->qp.ibqp.qp_num) + ib_get_cached_pkey(ib_dev, sqp->qp.port, sqp->pkey_index, &pkey); + else + ib_get_cached_pkey(ib_dev, sqp->qp.port, wr->wr.ud.pkey_index, &pkey); + sqp->ud_header.bth.pkey = cpu_to_be16(pkey); + sqp->ud_header.bth.destination_qpn = cpu_to_be32(wr->wr.ud.remote_qpn); + sqp->ud_header.bth.psn = cpu_to_be32((sqp->send_psn++) & ((1 << 24) - 1)); + sqp->ud_header.deth.qkey = cpu_to_be32(wr->wr.ud.remote_qkey & 0x80000000 ? + sqp->qkey : wr->wr.ud.remote_qkey); + sqp->ud_header.deth.source_qpn = cpu_to_be32(sqp->qp.ibqp.qp_num); + + header_size = ib_ud_header_pack(&sqp->ud_header, sqp->header_buf); + + if (0) { + pr_err("built UD header of size %d:\n", header_size); + for (i = 0; i < header_size / 4; ++i) { + if (i % 8 == 0) + pr_err(" [%02x] ", i * 4); + pr_cont(" %08x", + be32_to_cpu(((__be32 *) sqp->header_buf)[i])); + if ((i + 1) % 8 == 0) + pr_cont("\n"); + } + pr_err("\n"); + } + + /* + * Inline data segments may not cross a 64 byte boundary. If + * our UD header is bigger than the space available up to the + * next 64 byte boundary in the WQE, use two inline data + * segments to hold the UD header. + */ + spc = MLX4_INLINE_ALIGN - + ((unsigned long) (inl + 1) & (MLX4_INLINE_ALIGN - 1)); + if (header_size <= spc) { + inl->byte_count = cpu_to_be32(1 << 31 | header_size); + memcpy(inl + 1, sqp->header_buf, header_size); + i = 1; + } else { + inl->byte_count = cpu_to_be32(1 << 31 | spc); + memcpy(inl + 1, sqp->header_buf, spc); + + inl = (void *) (inl + 1) + spc; + memcpy(inl + 1, sqp->header_buf + spc, header_size - spc); + /* + * Need a barrier here to make sure all the data is + * visible before the byte_count field is set. + * Otherwise the HCA prefetcher could grab the 64-byte + * chunk with this inline segment and get a valid (!= + * 0xffffffff) byte count but stale data, and end up + * generating a packet with bad headers. + * + * The first inline segment's byte_count field doesn't + * need a barrier, because it comes after a + * control/MLX segment and therefore is at an offset + * of 16 mod 64. + */ + wmb(); + inl->byte_count = cpu_to_be32(1 << 31 | (header_size - spc)); + i = 2; + } + + *mlx_seg_len = + ALIGN(i * sizeof (struct mlx4_wqe_inline_seg) + header_size, 16); + return 0; +} + +static int mlx4_wq_overflow(struct mlx4_ib_wq *wq, int nreq, struct ib_cq *ib_cq) +{ + unsigned cur; + struct mlx4_ib_cq *cq; + + cur = wq->head - wq->tail; + if (likely(cur + nreq < wq->max_post)) + return 0; + + cq = to_mcq(ib_cq); + spin_lock(&cq->lock); + cur = wq->head - wq->tail; + spin_unlock(&cq->lock); + + return cur + nreq >= wq->max_post; +} + +static __be32 convert_access(int acc) +{ + return (acc & IB_ACCESS_REMOTE_ATOMIC ? + cpu_to_be32(MLX4_WQE_FMR_AND_BIND_PERM_ATOMIC) : 0) | + (acc & IB_ACCESS_REMOTE_WRITE ? + cpu_to_be32(MLX4_WQE_FMR_AND_BIND_PERM_REMOTE_WRITE) : 0) | + (acc & IB_ACCESS_REMOTE_READ ? + cpu_to_be32(MLX4_WQE_FMR_AND_BIND_PERM_REMOTE_READ) : 0) | + (acc & IB_ACCESS_LOCAL_WRITE ? cpu_to_be32(MLX4_WQE_FMR_PERM_LOCAL_WRITE) : 0) | + cpu_to_be32(MLX4_WQE_FMR_PERM_LOCAL_READ); +} + +static void set_fmr_seg(struct mlx4_wqe_fmr_seg *fseg, struct ib_send_wr *wr) +{ + struct mlx4_ib_fast_reg_page_list *mfrpl = to_mfrpl(wr->wr.fast_reg.page_list); + int i; + + for (i = 0; i < wr->wr.fast_reg.page_list_len; ++i) + mfrpl->mapped_page_list[i] = + cpu_to_be64(wr->wr.fast_reg.page_list->page_list[i] | + MLX4_MTT_FLAG_PRESENT); + + fseg->flags = convert_access(wr->wr.fast_reg.access_flags); + fseg->mem_key = cpu_to_be32(wr->wr.fast_reg.rkey); + fseg->buf_list = cpu_to_be64(mfrpl->map); + fseg->start_addr = cpu_to_be64(wr->wr.fast_reg.iova_start); + fseg->reg_len = cpu_to_be64(wr->wr.fast_reg.length); + fseg->offset = 0; /* XXX -- is this just for ZBVA? */ + fseg->page_size = cpu_to_be32(wr->wr.fast_reg.page_shift); + fseg->reserved[0] = 0; + fseg->reserved[1] = 0; +} + +static void set_bind_seg(struct mlx4_wqe_bind_seg *bseg, struct ib_send_wr *wr) +{ + bseg->flags1 = + convert_access(wr->wr.bind_mw.bind_info.mw_access_flags) & + cpu_to_be32(MLX4_WQE_FMR_AND_BIND_PERM_REMOTE_READ | + MLX4_WQE_FMR_AND_BIND_PERM_REMOTE_WRITE | + MLX4_WQE_FMR_AND_BIND_PERM_ATOMIC); + bseg->flags2 = 0; + if (wr->wr.bind_mw.mw->type == IB_MW_TYPE_2) + bseg->flags2 |= cpu_to_be32(MLX4_WQE_BIND_TYPE_2); + if (wr->wr.bind_mw.bind_info.mw_access_flags & IB_ZERO_BASED) + bseg->flags2 |= cpu_to_be32(MLX4_WQE_BIND_ZERO_BASED); + bseg->new_rkey = cpu_to_be32(wr->wr.bind_mw.rkey); + bseg->lkey = cpu_to_be32(wr->wr.bind_mw.bind_info.mr->lkey); + bseg->addr = cpu_to_be64(wr->wr.bind_mw.bind_info.addr); + bseg->length = cpu_to_be64(wr->wr.bind_mw.bind_info.length); +} + +static void set_local_inv_seg(struct mlx4_wqe_local_inval_seg *iseg, u32 rkey) +{ + memset(iseg, 0, sizeof(*iseg)); + iseg->mem_key = cpu_to_be32(rkey); +} + +static __always_inline void set_raddr_seg(struct mlx4_wqe_raddr_seg *rseg, + u64 remote_addr, u32 rkey) +{ + rseg->raddr = cpu_to_be64(remote_addr); + rseg->rkey = cpu_to_be32(rkey); + rseg->reserved = 0; +} + +static void set_atomic_seg(struct mlx4_wqe_atomic_seg *aseg, struct ib_send_wr *wr) +{ + if (wr->opcode == IB_WR_ATOMIC_CMP_AND_SWP) { + aseg->swap_add = cpu_to_be64(wr->wr.atomic.swap); + aseg->compare = cpu_to_be64(wr->wr.atomic.compare_add); + } else if (wr->opcode == IB_WR_MASKED_ATOMIC_FETCH_AND_ADD) { + aseg->swap_add = cpu_to_be64(wr->wr.atomic.compare_add); + aseg->compare = cpu_to_be64(wr->wr.atomic.compare_add_mask); + } else { + aseg->swap_add = cpu_to_be64(wr->wr.atomic.compare_add); + aseg->compare = 0; + } + +} + +static void set_masked_atomic_seg(struct mlx4_wqe_masked_atomic_seg *aseg, + struct ib_send_wr *wr) +{ + aseg->swap_add = cpu_to_be64(wr->wr.atomic.swap); + aseg->swap_add_mask = cpu_to_be64(wr->wr.atomic.swap_mask); + aseg->compare = cpu_to_be64(wr->wr.atomic.compare_add); + aseg->compare_mask = cpu_to_be64(wr->wr.atomic.compare_add_mask); +} + +static void set_datagram_seg(struct mlx4_wqe_datagram_seg *dseg, + struct ib_send_wr *wr) +{ + memcpy(dseg->av, &to_mah(wr->wr.ud.ah)->av, sizeof (struct mlx4_av)); + dseg->dqpn = cpu_to_be32(wr->wr.ud.remote_qpn); + dseg->qkey = cpu_to_be32(wr->wr.ud.remote_qkey); + dseg->vlan = to_mah(wr->wr.ud.ah)->av.eth.vlan; + memcpy(dseg->mac, to_mah(wr->wr.ud.ah)->av.eth.mac, 6); +} + +static void set_tunnel_datagram_seg(struct mlx4_ib_dev *dev, + struct mlx4_wqe_datagram_seg *dseg, + struct ib_send_wr *wr, + enum mlx4_ib_qp_type qpt) +{ + union mlx4_ext_av *av = &to_mah(wr->wr.ud.ah)->av; + struct mlx4_av sqp_av = {0}; + int port = *((u8 *) &av->ib.port_pd) & 0x3; + + /* force loopback */ + sqp_av.port_pd = av->ib.port_pd | cpu_to_be32(0x80000000); + sqp_av.g_slid = av->ib.g_slid & 0x7f; /* no GRH */ + sqp_av.sl_tclass_flowlabel = av->ib.sl_tclass_flowlabel & + cpu_to_be32(0xf0000000); + + memcpy(dseg->av, &sqp_av, sizeof (struct mlx4_av)); + if (qpt == MLX4_IB_QPT_PROXY_GSI) + dseg->dqpn = cpu_to_be32(dev->dev->caps.qp1_tunnel[port - 1]); + else + dseg->dqpn = cpu_to_be32(dev->dev->caps.qp0_tunnel[port - 1]); + /* Use QKEY from the QP context, which is set by master */ + dseg->qkey = cpu_to_be32(IB_QP_SET_QKEY); +} + +static void build_tunnel_header(struct ib_send_wr *wr, void *wqe, unsigned *mlx_seg_len) +{ + struct mlx4_wqe_inline_seg *inl = wqe; + struct mlx4_ib_tunnel_header hdr; + struct mlx4_ib_ah *ah = to_mah(wr->wr.ud.ah); + int spc; + int i; + + memcpy(&hdr.av, &ah->av, sizeof hdr.av); + hdr.remote_qpn = cpu_to_be32(wr->wr.ud.remote_qpn); + hdr.pkey_index = cpu_to_be16(wr->wr.ud.pkey_index); + hdr.qkey = cpu_to_be32(wr->wr.ud.remote_qkey); + memcpy(hdr.mac, ah->av.eth.mac, 6); + hdr.vlan = ah->av.eth.vlan; + + spc = MLX4_INLINE_ALIGN - + ((unsigned long) (inl + 1) & (MLX4_INLINE_ALIGN - 1)); + if (sizeof (hdr) <= spc) { + memcpy(inl + 1, &hdr, sizeof (hdr)); + wmb(); + inl->byte_count = cpu_to_be32(1 << 31 | sizeof (hdr)); + i = 1; + } else { + memcpy(inl + 1, &hdr, spc); + wmb(); + inl->byte_count = cpu_to_be32(1 << 31 | spc); + + inl = (void *) (inl + 1) + spc; + memcpy(inl + 1, (void *) &hdr + spc, sizeof (hdr) - spc); + wmb(); + inl->byte_count = cpu_to_be32(1 << 31 | (sizeof (hdr) - spc)); + i = 2; + } + + *mlx_seg_len = + ALIGN(i * sizeof (struct mlx4_wqe_inline_seg) + sizeof (hdr), 16); +} + +static void set_mlx_icrc_seg(void *dseg) +{ + u32 *t = dseg; + struct mlx4_wqe_inline_seg *iseg = dseg; + + t[1] = 0; + + /* + * Need a barrier here before writing the byte_count field to + * make sure that all the data is visible before the + * byte_count field is set. Otherwise, if the segment begins + * a new cacheline, the HCA prefetcher could grab the 64-byte + * chunk and get a valid (!= * 0xffffffff) byte count but + * stale data, and end up sending the wrong data. + */ + wmb(); + + iseg->byte_count = cpu_to_be32((1 << 31) | 4); +} + +static void set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ib_sge *sg) +{ + dseg->lkey = cpu_to_be32(sg->lkey); + dseg->addr = cpu_to_be64(sg->addr); + + /* + * Need a barrier here before writing the byte_count field to + * make sure that all the data is visible before the + * byte_count field is set. Otherwise, if the segment begins + * a new cacheline, the HCA prefetcher could grab the 64-byte + * chunk and get a valid (!= * 0xffffffff) byte count but + * stale data, and end up sending the wrong data. + */ + wmb(); + + dseg->byte_count = cpu_to_be32(sg->length); +} + +static void __set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ib_sge *sg) +{ + dseg->byte_count = cpu_to_be32(sg->length); + dseg->lkey = cpu_to_be32(sg->lkey); + dseg->addr = cpu_to_be64(sg->addr); +} + +static int build_lso_seg(struct mlx4_wqe_lso_seg *wqe, struct ib_send_wr *wr, + struct mlx4_ib_qp *qp, unsigned *lso_seg_len, + __be32 *lso_hdr_sz, __be32 *blh) +{ + unsigned halign = ALIGN(sizeof *wqe + wr->wr.ud.hlen, 16); + + if (unlikely(halign > MLX4_IB_CACHE_LINE_SIZE)) + *blh = cpu_to_be32(1 << 6); + + if (unlikely(!(qp->flags & MLX4_IB_QP_LSO) && + wr->num_sge > qp->sq.max_gs - (halign >> 4))) + return -EINVAL; + + memcpy(wqe->header, wr->wr.ud.header, wr->wr.ud.hlen); + + *lso_hdr_sz = cpu_to_be32((wr->wr.ud.mss - wr->wr.ud.hlen) << 16 | + wr->wr.ud.hlen); + *lso_seg_len = halign; + return 0; +} + +static __be32 send_ieth(struct ib_send_wr *wr) +{ + switch (wr->opcode) { + case IB_WR_SEND_WITH_IMM: + case IB_WR_RDMA_WRITE_WITH_IMM: + return wr->ex.imm_data; + + case IB_WR_SEND_WITH_INV: + return cpu_to_be32(wr->ex.invalidate_rkey); + + default: + return 0; + } +} + +static void add_zero_len_inline(void *wqe) +{ + struct mlx4_wqe_inline_seg *inl = wqe; + memset(wqe, 0, 16); + inl->byte_count = cpu_to_be32(1 << 31); +} + +int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, + struct ib_send_wr **bad_wr) +{ + struct mlx4_ib_qp *qp = to_mqp(ibqp); + void *wqe; + struct mlx4_wqe_ctrl_seg *ctrl; + struct mlx4_wqe_data_seg *dseg; + unsigned long flags; + int nreq; + int err = 0; + unsigned ind; + int uninitialized_var(stamp); + int uninitialized_var(size); + unsigned uninitialized_var(seglen); + __be32 dummy; + __be32 *lso_wqe; + __be32 uninitialized_var(lso_hdr_sz); + __be32 blh; + int i; + + spin_lock_irqsave(&qp->sq.lock, flags); + + ind = qp->sq_next_wqe; + + for (nreq = 0; wr; ++nreq, wr = wr->next) { + lso_wqe = &dummy; + blh = 0; + + if (mlx4_wq_overflow(&qp->sq, nreq, qp->ibqp.send_cq)) { + err = -ENOMEM; + *bad_wr = wr; + goto out; + } + + if (unlikely(wr->num_sge > qp->sq.max_gs)) { + err = -EINVAL; + *bad_wr = wr; + goto out; + } + + ctrl = wqe = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1)); + qp->sq.wrid[(qp->sq.head + nreq) & (qp->sq.wqe_cnt - 1)] = wr->wr_id; + + ctrl->srcrb_flags = + (wr->send_flags & IB_SEND_SIGNALED ? + cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE) : 0) | + (wr->send_flags & IB_SEND_SOLICITED ? + cpu_to_be32(MLX4_WQE_CTRL_SOLICITED) : 0) | + ((wr->send_flags & IB_SEND_IP_CSUM) ? + cpu_to_be32(MLX4_WQE_CTRL_IP_CSUM | + MLX4_WQE_CTRL_TCP_UDP_CSUM) : 0) | + qp->sq_signal_bits; + + ctrl->imm = send_ieth(wr); + + wqe += sizeof *ctrl; + size = sizeof *ctrl / 16; + + switch (qp->mlx4_ib_qp_type) { + case MLX4_IB_QPT_RC: + case MLX4_IB_QPT_UC: + switch (wr->opcode) { + case IB_WR_ATOMIC_CMP_AND_SWP: + case IB_WR_ATOMIC_FETCH_AND_ADD: + case IB_WR_MASKED_ATOMIC_FETCH_AND_ADD: + set_raddr_seg(wqe, wr->wr.atomic.remote_addr, + wr->wr.atomic.rkey); + wqe += sizeof (struct mlx4_wqe_raddr_seg); + + set_atomic_seg(wqe, wr); + wqe += sizeof (struct mlx4_wqe_atomic_seg); + + size += (sizeof (struct mlx4_wqe_raddr_seg) + + sizeof (struct mlx4_wqe_atomic_seg)) / 16; + + break; + + case IB_WR_MASKED_ATOMIC_CMP_AND_SWP: + set_raddr_seg(wqe, wr->wr.atomic.remote_addr, + wr->wr.atomic.rkey); + wqe += sizeof (struct mlx4_wqe_raddr_seg); + + set_masked_atomic_seg(wqe, wr); + wqe += sizeof (struct mlx4_wqe_masked_atomic_seg); + + size += (sizeof (struct mlx4_wqe_raddr_seg) + + sizeof (struct mlx4_wqe_masked_atomic_seg)) / 16; + + break; + + case IB_WR_RDMA_READ: + case IB_WR_RDMA_WRITE: + case IB_WR_RDMA_WRITE_WITH_IMM: + set_raddr_seg(wqe, wr->wr.rdma.remote_addr, + wr->wr.rdma.rkey); + wqe += sizeof (struct mlx4_wqe_raddr_seg); + size += sizeof (struct mlx4_wqe_raddr_seg) / 16; + break; + + case IB_WR_LOCAL_INV: + ctrl->srcrb_flags |= + cpu_to_be32(MLX4_WQE_CTRL_STRONG_ORDER); + set_local_inv_seg(wqe, wr->ex.invalidate_rkey); + wqe += sizeof (struct mlx4_wqe_local_inval_seg); + size += sizeof (struct mlx4_wqe_local_inval_seg) / 16; + break; + + case IB_WR_FAST_REG_MR: + ctrl->srcrb_flags |= + cpu_to_be32(MLX4_WQE_CTRL_STRONG_ORDER); + set_fmr_seg(wqe, wr); + wqe += sizeof (struct mlx4_wqe_fmr_seg); + size += sizeof (struct mlx4_wqe_fmr_seg) / 16; + break; + + case IB_WR_BIND_MW: + ctrl->srcrb_flags |= + cpu_to_be32(MLX4_WQE_CTRL_STRONG_ORDER); + set_bind_seg(wqe, wr); + wqe += sizeof(struct mlx4_wqe_bind_seg); + size += sizeof(struct mlx4_wqe_bind_seg) / 16; + break; + default: + /* No extra segments required for sends */ + break; + } + break; + + case MLX4_IB_QPT_TUN_SMI_OWNER: + err = build_sriov_qp0_header(to_msqp(qp), wr, ctrl, &seglen); + if (unlikely(err)) { + *bad_wr = wr; + goto out; + } + wqe += seglen; + size += seglen / 16; + break; + case MLX4_IB_QPT_TUN_SMI: + case MLX4_IB_QPT_TUN_GSI: + /* this is a UD qp used in MAD responses to slaves. */ + set_datagram_seg(wqe, wr); + /* set the forced-loopback bit in the data seg av */ + *(__be32 *) wqe |= cpu_to_be32(0x80000000); + wqe += sizeof (struct mlx4_wqe_datagram_seg); + size += sizeof (struct mlx4_wqe_datagram_seg) / 16; + break; + case MLX4_IB_QPT_UD: + set_datagram_seg(wqe, wr); + wqe += sizeof (struct mlx4_wqe_datagram_seg); + size += sizeof (struct mlx4_wqe_datagram_seg) / 16; + + if (wr->opcode == IB_WR_LSO) { + err = build_lso_seg(wqe, wr, qp, &seglen, &lso_hdr_sz, &blh); + if (unlikely(err)) { + *bad_wr = wr; + goto out; + } + lso_wqe = (__be32 *) wqe; + wqe += seglen; + size += seglen / 16; + } + break; + + case MLX4_IB_QPT_PROXY_SMI_OWNER: + err = build_sriov_qp0_header(to_msqp(qp), wr, ctrl, &seglen); + if (unlikely(err)) { + *bad_wr = wr; + goto out; + } + wqe += seglen; + size += seglen / 16; + /* to start tunnel header on a cache-line boundary */ + add_zero_len_inline(wqe); + wqe += 16; + size++; + build_tunnel_header(wr, wqe, &seglen); + wqe += seglen; + size += seglen / 16; + break; + case MLX4_IB_QPT_PROXY_SMI: + case MLX4_IB_QPT_PROXY_GSI: + /* If we are tunneling special qps, this is a UD qp. + * In this case we first add a UD segment targeting + * the tunnel qp, and then add a header with address + * information */ + set_tunnel_datagram_seg(to_mdev(ibqp->device), wqe, wr, + qp->mlx4_ib_qp_type); + wqe += sizeof (struct mlx4_wqe_datagram_seg); + size += sizeof (struct mlx4_wqe_datagram_seg) / 16; + build_tunnel_header(wr, wqe, &seglen); + wqe += seglen; + size += seglen / 16; + break; + + case MLX4_IB_QPT_SMI: + case MLX4_IB_QPT_GSI: + err = build_mlx_header(to_msqp(qp), wr, ctrl, &seglen); + if (unlikely(err)) { + *bad_wr = wr; + goto out; + } + wqe += seglen; + size += seglen / 16; + break; + + default: + break; + } + + /* + * Write data segments in reverse order, so as to + * overwrite cacheline stamp last within each + * cacheline. This avoids issues with WQE + * prefetching. + */ + + dseg = wqe; + dseg += wr->num_sge - 1; + size += wr->num_sge * (sizeof (struct mlx4_wqe_data_seg) / 16); + + /* Add one more inline data segment for ICRC for MLX sends */ + if (unlikely(qp->mlx4_ib_qp_type == MLX4_IB_QPT_SMI || + qp->mlx4_ib_qp_type == MLX4_IB_QPT_GSI || + qp->mlx4_ib_qp_type & + (MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_TUN_SMI_OWNER))) { + set_mlx_icrc_seg(dseg + 1); + size += sizeof (struct mlx4_wqe_data_seg) / 16; + } + + for (i = wr->num_sge - 1; i >= 0; --i, --dseg) + set_data_seg(dseg, wr->sg_list + i); + + /* + * Possibly overwrite stamping in cacheline with LSO + * segment only after making sure all data segments + * are written. + */ + wmb(); + *lso_wqe = lso_hdr_sz; + + ctrl->fence_size = (wr->send_flags & IB_SEND_FENCE ? + MLX4_WQE_CTRL_FENCE : 0) | size; + + /* + * Make sure descriptor is fully written before + * setting ownership bit (because HW can start + * executing as soon as we do). + */ + wmb(); + + if (wr->opcode < 0 || wr->opcode >= ARRAY_SIZE(mlx4_ib_opcode)) { + *bad_wr = wr; + err = -EINVAL; + goto out; + } + + ctrl->owner_opcode = mlx4_ib_opcode[wr->opcode] | + (ind & qp->sq.wqe_cnt ? cpu_to_be32(1 << 31) : 0) | blh; + + stamp = ind + qp->sq_spare_wqes; + ind += DIV_ROUND_UP(size * 16, 1U << qp->sq.wqe_shift); + + /* + * We can improve latency by not stamping the last + * send queue WQE until after ringing the doorbell, so + * only stamp here if there are still more WQEs to post. + * + * Same optimization applies to padding with NOP wqe + * in case of WQE shrinking (used to prevent wrap-around + * in the middle of WR). + */ + if (wr->next) { + stamp_send_wqe(qp, stamp, size * 16); + ind = pad_wraparound(qp, ind); + } + } + +out: + if (likely(nreq)) { + qp->sq.head += nreq; + + /* + * Make sure that descriptors are written before + * doorbell record. + */ + wmb(); + + writel(qp->doorbell_qpn, + to_mdev(ibqp->device)->uar_map + MLX4_SEND_DOORBELL); + + /* + * Make sure doorbells don't leak out of SQ spinlock + * and reach the HCA out of order. + */ + mmiowb(); + + stamp_send_wqe(qp, stamp, size * 16); + + ind = pad_wraparound(qp, ind); + qp->sq_next_wqe = ind; + } + + spin_unlock_irqrestore(&qp->sq.lock, flags); + + return err; +} + +int mlx4_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr) +{ + struct mlx4_ib_qp *qp = to_mqp(ibqp); + struct mlx4_wqe_data_seg *scat; + unsigned long flags; + int err = 0; + int nreq; + int ind; + int max_gs; + int i; + + max_gs = qp->rq.max_gs; + spin_lock_irqsave(&qp->rq.lock, flags); + + ind = qp->rq.head & (qp->rq.wqe_cnt - 1); + + for (nreq = 0; wr; ++nreq, wr = wr->next) { + if (mlx4_wq_overflow(&qp->rq, nreq, qp->ibqp.recv_cq)) { + err = -ENOMEM; + *bad_wr = wr; + goto out; + } + + if (unlikely(wr->num_sge > qp->rq.max_gs)) { + err = -EINVAL; + *bad_wr = wr; + goto out; + } + + scat = get_recv_wqe(qp, ind); + + if (qp->mlx4_ib_qp_type & (MLX4_IB_QPT_PROXY_SMI_OWNER | + MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_GSI)) { + ib_dma_sync_single_for_device(ibqp->device, + qp->sqp_proxy_rcv[ind].map, + sizeof (struct mlx4_ib_proxy_sqp_hdr), + DMA_FROM_DEVICE); + scat->byte_count = + cpu_to_be32(sizeof (struct mlx4_ib_proxy_sqp_hdr)); + /* use dma lkey from upper layer entry */ + scat->lkey = cpu_to_be32(wr->sg_list->lkey); + scat->addr = cpu_to_be64(qp->sqp_proxy_rcv[ind].map); + scat++; + max_gs--; + } + + for (i = 0; i < wr->num_sge; ++i) + __set_data_seg(scat + i, wr->sg_list + i); + + if (i < max_gs) { + scat[i].byte_count = 0; + scat[i].lkey = cpu_to_be32(MLX4_INVALID_LKEY); + scat[i].addr = 0; + } + + qp->rq.wrid[ind] = wr->wr_id; + + ind = (ind + 1) & (qp->rq.wqe_cnt - 1); + } + +out: + if (likely(nreq)) { + qp->rq.head += nreq; + + /* + * Make sure that descriptors are written before + * doorbell record. + */ + wmb(); + + *qp->db.db = cpu_to_be32(qp->rq.head & 0xffff); + } + + spin_unlock_irqrestore(&qp->rq.lock, flags); + + return err; +} + +static inline enum ib_qp_state to_ib_qp_state(enum mlx4_qp_state mlx4_state) +{ + switch (mlx4_state) { + case MLX4_QP_STATE_RST: return IB_QPS_RESET; + case MLX4_QP_STATE_INIT: return IB_QPS_INIT; + case MLX4_QP_STATE_RTR: return IB_QPS_RTR; + case MLX4_QP_STATE_RTS: return IB_QPS_RTS; + case MLX4_QP_STATE_SQ_DRAINING: + case MLX4_QP_STATE_SQD: return IB_QPS_SQD; + case MLX4_QP_STATE_SQER: return IB_QPS_SQE; + case MLX4_QP_STATE_ERR: return IB_QPS_ERR; + default: return -1; + } +} + +static inline enum ib_mig_state to_ib_mig_state(int mlx4_mig_state) +{ + switch (mlx4_mig_state) { + case MLX4_QP_PM_ARMED: return IB_MIG_ARMED; + case MLX4_QP_PM_REARM: return IB_MIG_REARM; + case MLX4_QP_PM_MIGRATED: return IB_MIG_MIGRATED; + default: return -1; + } +} + +static int to_ib_qp_access_flags(int mlx4_flags) +{ + int ib_flags = 0; + + if (mlx4_flags & MLX4_QP_BIT_RRE) + ib_flags |= IB_ACCESS_REMOTE_READ; + if (mlx4_flags & MLX4_QP_BIT_RWE) + ib_flags |= IB_ACCESS_REMOTE_WRITE; + if (mlx4_flags & MLX4_QP_BIT_RAE) + ib_flags |= IB_ACCESS_REMOTE_ATOMIC; + + return ib_flags; +} + +static void to_ib_ah_attr(struct mlx4_ib_dev *ibdev, struct ib_ah_attr *ib_ah_attr, + struct mlx4_qp_path *path) +{ + struct mlx4_dev *dev = ibdev->dev; + int is_eth; + + memset(ib_ah_attr, 0, sizeof *ib_ah_attr); + ib_ah_attr->port_num = path->sched_queue & 0x40 ? 2 : 1; + + if (ib_ah_attr->port_num == 0 || ib_ah_attr->port_num > dev->caps.num_ports) + return; + + is_eth = rdma_port_get_link_layer(&ibdev->ib_dev, ib_ah_attr->port_num) == + IB_LINK_LAYER_ETHERNET; + if (is_eth) + ib_ah_attr->sl = ((path->sched_queue >> 3) & 0x7) | + ((path->sched_queue & 4) << 1); + else + ib_ah_attr->sl = (path->sched_queue >> 2) & 0xf; + + ib_ah_attr->dlid = be16_to_cpu(path->rlid); + ib_ah_attr->src_path_bits = path->grh_mylmc & 0x7f; + ib_ah_attr->static_rate = path->static_rate ? path->static_rate - 5 : 0; + ib_ah_attr->ah_flags = (path->grh_mylmc & (1 << 7)) ? IB_AH_GRH : 0; + if (ib_ah_attr->ah_flags) { + ib_ah_attr->grh.sgid_index = path->mgid_index; + ib_ah_attr->grh.hop_limit = path->hop_limit; + ib_ah_attr->grh.traffic_class = + (be32_to_cpu(path->tclass_flowlabel) >> 20) & 0xff; + ib_ah_attr->grh.flow_label = + be32_to_cpu(path->tclass_flowlabel) & 0xfffff; + memcpy(ib_ah_attr->grh.dgid.raw, + path->rgid, sizeof ib_ah_attr->grh.dgid.raw); + } +} + +int mlx4_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr_mask, + struct ib_qp_init_attr *qp_init_attr) +{ + struct mlx4_ib_dev *dev = to_mdev(ibqp->device); + struct mlx4_ib_qp *qp = to_mqp(ibqp); + struct mlx4_qp_context context; + int mlx4_state; + int err = 0; + + mutex_lock(&qp->mutex); + + if (qp->state == IB_QPS_RESET) { + qp_attr->qp_state = IB_QPS_RESET; + goto done; + } + + err = mlx4_qp_query(dev->dev, &qp->mqp, &context); + if (err) { + err = -EINVAL; + goto out; + } + + mlx4_state = be32_to_cpu(context.flags) >> 28; + + qp->state = to_ib_qp_state(mlx4_state); + qp_attr->qp_state = qp->state; + qp_attr->path_mtu = context.mtu_msgmax >> 5; + qp_attr->path_mig_state = + to_ib_mig_state((be32_to_cpu(context.flags) >> 11) & 0x3); + qp_attr->qkey = be32_to_cpu(context.qkey); + qp_attr->rq_psn = be32_to_cpu(context.rnr_nextrecvpsn) & 0xffffff; + qp_attr->sq_psn = be32_to_cpu(context.next_send_psn) & 0xffffff; + qp_attr->dest_qp_num = be32_to_cpu(context.remote_qpn) & 0xffffff; + qp_attr->qp_access_flags = + to_ib_qp_access_flags(be32_to_cpu(context.params2)); + + if (qp->ibqp.qp_type == IB_QPT_RC || qp->ibqp.qp_type == IB_QPT_UC) { + to_ib_ah_attr(dev, &qp_attr->ah_attr, &context.pri_path); + to_ib_ah_attr(dev, &qp_attr->alt_ah_attr, &context.alt_path); + qp_attr->alt_pkey_index = context.alt_path.pkey_index & 0x7f; + qp_attr->alt_port_num = qp_attr->alt_ah_attr.port_num; + } + + qp_attr->pkey_index = context.pri_path.pkey_index & 0x7f; + if (qp_attr->qp_state == IB_QPS_INIT) + qp_attr->port_num = qp->port; + else + qp_attr->port_num = context.pri_path.sched_queue & 0x40 ? 2 : 1; + + /* qp_attr->en_sqd_async_notify is only applicable in modify qp */ + qp_attr->sq_draining = mlx4_state == MLX4_QP_STATE_SQ_DRAINING; + + qp_attr->max_rd_atomic = 1 << ((be32_to_cpu(context.params1) >> 21) & 0x7); + + qp_attr->max_dest_rd_atomic = + 1 << ((be32_to_cpu(context.params2) >> 21) & 0x7); + qp_attr->min_rnr_timer = + (be32_to_cpu(context.rnr_nextrecvpsn) >> 24) & 0x1f; + qp_attr->timeout = context.pri_path.ackto >> 3; + qp_attr->retry_cnt = (be32_to_cpu(context.params1) >> 16) & 0x7; + qp_attr->rnr_retry = (be32_to_cpu(context.params1) >> 13) & 0x7; + qp_attr->alt_timeout = context.alt_path.ackto >> 3; + +done: + qp_attr->cur_qp_state = qp_attr->qp_state; + qp_attr->cap.max_recv_wr = qp->rq.wqe_cnt; + qp_attr->cap.max_recv_sge = qp->rq.max_gs; + + if (!ibqp->uobject) { + qp_attr->cap.max_send_wr = qp->sq.wqe_cnt; + qp_attr->cap.max_send_sge = qp->sq.max_gs; + } else { + qp_attr->cap.max_send_wr = 0; + qp_attr->cap.max_send_sge = 0; + } + + /* + * We don't support inline sends for kernel QPs (yet), and we + * don't know what userspace's value should be. + */ + qp_attr->cap.max_inline_data = 0; + + qp_init_attr->cap = qp_attr->cap; + + qp_init_attr->create_flags = 0; + if (qp->flags & MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK) + qp_init_attr->create_flags |= IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK; + + if (qp->flags & MLX4_IB_QP_LSO) + qp_init_attr->create_flags |= IB_QP_CREATE_IPOIB_UD_LSO; + + if (qp->flags & MLX4_IB_QP_NETIF) + qp_init_attr->create_flags |= IB_QP_CREATE_NETIF_QP; + + qp_init_attr->sq_sig_type = + qp->sq_signal_bits == cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE) ? + IB_SIGNAL_ALL_WR : IB_SIGNAL_REQ_WR; + +out: + mutex_unlock(&qp->mutex); + return err; +} + diff --git a/drivers/infiniband/hw/mlx4/srq.c b/drivers/infiniband/hw/mlx4/srq.c new file mode 100644 index 0000000..62d9285 --- /dev/null +++ b/drivers/infiniband/hw/mlx4/srq.c @@ -0,0 +1,369 @@ +/* + * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include + +#include "mlx4_ib.h" +#include "user.h" + +static void *get_wqe(struct mlx4_ib_srq *srq, int n) +{ + return mlx4_buf_offset(&srq->buf, n << srq->msrq.wqe_shift); +} + +static void mlx4_ib_srq_event(struct mlx4_srq *srq, enum mlx4_event type) +{ + struct ib_event event; + struct ib_srq *ibsrq = &to_mibsrq(srq)->ibsrq; + + if (ibsrq->event_handler) { + event.device = ibsrq->device; + event.element.srq = ibsrq; + switch (type) { + case MLX4_EVENT_TYPE_SRQ_LIMIT: + event.event = IB_EVENT_SRQ_LIMIT_REACHED; + break; + case MLX4_EVENT_TYPE_SRQ_CATAS_ERROR: + event.event = IB_EVENT_SRQ_ERR; + break; + default: + pr_warn("Unexpected event type %d " + "on SRQ %06x\n", type, srq->srqn); + return; + } + + ibsrq->event_handler(&event, ibsrq->srq_context); + } +} + +struct ib_srq *mlx4_ib_create_srq(struct ib_pd *pd, + struct ib_srq_init_attr *init_attr, + struct ib_udata *udata) +{ + struct mlx4_ib_dev *dev = to_mdev(pd->device); + struct mlx4_ib_srq *srq; + struct mlx4_wqe_srq_next_seg *next; + struct mlx4_wqe_data_seg *scatter; + u32 cqn; + u16 xrcdn; + int desc_size; + int buf_size; + int err; + int i; + + /* Sanity check SRQ size before proceeding */ + if (init_attr->attr.max_wr >= dev->dev->caps.max_srq_wqes || + init_attr->attr.max_sge > dev->dev->caps.max_srq_sge) + return ERR_PTR(-EINVAL); + + srq = kmalloc(sizeof *srq, GFP_KERNEL); + if (!srq) + return ERR_PTR(-ENOMEM); + + mutex_init(&srq->mutex); + spin_lock_init(&srq->lock); + srq->msrq.max = roundup_pow_of_two(init_attr->attr.max_wr + 1); + srq->msrq.max_gs = init_attr->attr.max_sge; + + desc_size = max(32UL, + roundup_pow_of_two(sizeof (struct mlx4_wqe_srq_next_seg) + + srq->msrq.max_gs * + sizeof (struct mlx4_wqe_data_seg))); + srq->msrq.wqe_shift = ilog2(desc_size); + + buf_size = srq->msrq.max * desc_size; + + if (pd->uobject) { + struct mlx4_ib_create_srq ucmd; + + if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd)) { + err = -EFAULT; + goto err_srq; + } + + srq->umem = ib_umem_get(pd->uobject->context, ucmd.buf_addr, + buf_size, 0, 0); + if (IS_ERR(srq->umem)) { + err = PTR_ERR(srq->umem); + goto err_srq; + } + + err = mlx4_mtt_init(dev->dev, ib_umem_page_count(srq->umem), + ilog2(srq->umem->page_size), &srq->mtt); + if (err) + goto err_buf; + + err = mlx4_ib_umem_write_mtt(dev, &srq->mtt, srq->umem); + if (err) + goto err_mtt; + + err = mlx4_ib_db_map_user(to_mucontext(pd->uobject->context), + ucmd.db_addr, &srq->db); + if (err) + goto err_mtt; + } else { + err = mlx4_db_alloc(dev->dev, &srq->db, 0, GFP_KERNEL); + if (err) + goto err_srq; + + *srq->db.db = 0; + + if (mlx4_buf_alloc(dev->dev, buf_size, PAGE_SIZE * 2, &srq->buf, + GFP_KERNEL)) { + err = -ENOMEM; + goto err_db; + } + + srq->head = 0; + srq->tail = srq->msrq.max - 1; + srq->wqe_ctr = 0; + + for (i = 0; i < srq->msrq.max; ++i) { + next = get_wqe(srq, i); + next->next_wqe_index = + cpu_to_be16((i + 1) & (srq->msrq.max - 1)); + + for (scatter = (void *) (next + 1); + (void *) scatter < (void *) next + desc_size; + ++scatter) + scatter->lkey = cpu_to_be32(MLX4_INVALID_LKEY); + } + + err = mlx4_mtt_init(dev->dev, srq->buf.npages, srq->buf.page_shift, + &srq->mtt); + if (err) + goto err_buf; + + err = mlx4_buf_write_mtt(dev->dev, &srq->mtt, &srq->buf, GFP_KERNEL); + if (err) + goto err_mtt; + + srq->wrid = kmalloc(srq->msrq.max * sizeof (u64), GFP_KERNEL); + if (!srq->wrid) { + err = -ENOMEM; + goto err_mtt; + } + } + + cqn = (init_attr->srq_type == IB_SRQT_XRC) ? + to_mcq(init_attr->ext.xrc.cq)->mcq.cqn : 0; + xrcdn = (init_attr->srq_type == IB_SRQT_XRC) ? + to_mxrcd(init_attr->ext.xrc.xrcd)->xrcdn : + (u16) dev->dev->caps.reserved_xrcds; + err = mlx4_srq_alloc(dev->dev, to_mpd(pd)->pdn, cqn, xrcdn, &srq->mtt, + srq->db.dma, &srq->msrq); + if (err) + goto err_wrid; + + srq->msrq.event = mlx4_ib_srq_event; + srq->ibsrq.ext.xrc.srq_num = srq->msrq.srqn; + + if (pd->uobject) + if (ib_copy_to_udata(udata, &srq->msrq.srqn, sizeof (__u32))) { + err = -EFAULT; + goto err_wrid; + } + + init_attr->attr.max_wr = srq->msrq.max - 1; + + return &srq->ibsrq; + +err_wrid: + if (pd->uobject) + mlx4_ib_db_unmap_user(to_mucontext(pd->uobject->context), &srq->db); + else + kfree(srq->wrid); + +err_mtt: + mlx4_mtt_cleanup(dev->dev, &srq->mtt); + +err_buf: + if (pd->uobject) + ib_umem_release(srq->umem); + else + mlx4_buf_free(dev->dev, buf_size, &srq->buf); + +err_db: + if (!pd->uobject) + mlx4_db_free(dev->dev, &srq->db); + +err_srq: + kfree(srq); + + return ERR_PTR(err); +} + +int mlx4_ib_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, + enum ib_srq_attr_mask attr_mask, struct ib_udata *udata) +{ + struct mlx4_ib_dev *dev = to_mdev(ibsrq->device); + struct mlx4_ib_srq *srq = to_msrq(ibsrq); + int ret; + + /* We don't support resizing SRQs (yet?) */ + if (attr_mask & IB_SRQ_MAX_WR) + return -EINVAL; + + if (attr_mask & IB_SRQ_LIMIT) { + if (attr->srq_limit >= srq->msrq.max) + return -EINVAL; + + mutex_lock(&srq->mutex); + ret = mlx4_srq_arm(dev->dev, &srq->msrq, attr->srq_limit); + mutex_unlock(&srq->mutex); + + if (ret) + return ret; + } + + return 0; +} + +int mlx4_ib_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *srq_attr) +{ + struct mlx4_ib_dev *dev = to_mdev(ibsrq->device); + struct mlx4_ib_srq *srq = to_msrq(ibsrq); + int ret; + int limit_watermark; + + ret = mlx4_srq_query(dev->dev, &srq->msrq, &limit_watermark); + if (ret) + return ret; + + srq_attr->srq_limit = limit_watermark; + srq_attr->max_wr = srq->msrq.max - 1; + srq_attr->max_sge = srq->msrq.max_gs; + + return 0; +} + +int mlx4_ib_destroy_srq(struct ib_srq *srq) +{ + struct mlx4_ib_dev *dev = to_mdev(srq->device); + struct mlx4_ib_srq *msrq = to_msrq(srq); + + mlx4_srq_free(dev->dev, &msrq->msrq); + mlx4_mtt_cleanup(dev->dev, &msrq->mtt); + + if (srq->uobject) { + mlx4_ib_db_unmap_user(to_mucontext(srq->uobject->context), &msrq->db); + ib_umem_release(msrq->umem); + } else { + kfree(msrq->wrid); + mlx4_buf_free(dev->dev, msrq->msrq.max << msrq->msrq.wqe_shift, + &msrq->buf); + mlx4_db_free(dev->dev, &msrq->db); + } + + kfree(msrq); + + return 0; +} + +void mlx4_ib_free_srq_wqe(struct mlx4_ib_srq *srq, int wqe_index) +{ + struct mlx4_wqe_srq_next_seg *next; + + /* always called with interrupts disabled. */ + spin_lock(&srq->lock); + + next = get_wqe(srq, srq->tail); + next->next_wqe_index = cpu_to_be16(wqe_index); + srq->tail = wqe_index; + + spin_unlock(&srq->lock); +} + +int mlx4_ib_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr) +{ + struct mlx4_ib_srq *srq = to_msrq(ibsrq); + struct mlx4_wqe_srq_next_seg *next; + struct mlx4_wqe_data_seg *scat; + unsigned long flags; + int err = 0; + int nreq; + int i; + + spin_lock_irqsave(&srq->lock, flags); + + for (nreq = 0; wr; ++nreq, wr = wr->next) { + if (unlikely(wr->num_sge > srq->msrq.max_gs)) { + err = -EINVAL; + *bad_wr = wr; + break; + } + + if (unlikely(srq->head == srq->tail)) { + err = -ENOMEM; + *bad_wr = wr; + break; + } + + srq->wrid[srq->head] = wr->wr_id; + + next = get_wqe(srq, srq->head); + srq->head = be16_to_cpu(next->next_wqe_index); + scat = (struct mlx4_wqe_data_seg *) (next + 1); + + for (i = 0; i < wr->num_sge; ++i) { + scat[i].byte_count = cpu_to_be32(wr->sg_list[i].length); + scat[i].lkey = cpu_to_be32(wr->sg_list[i].lkey); + scat[i].addr = cpu_to_be64(wr->sg_list[i].addr); + } + + if (i < srq->msrq.max_gs) { + scat[i].byte_count = 0; + scat[i].lkey = cpu_to_be32(MLX4_INVALID_LKEY); + scat[i].addr = 0; + } + } + + if (likely(nreq)) { + srq->wqe_ctr += nreq; + + /* + * Make sure that descriptors are written before + * doorbell record. + */ + wmb(); + + *srq->db.db = cpu_to_be32(srq->wqe_ctr); + } + + spin_unlock_irqrestore(&srq->lock, flags); + + return err; +} diff --git a/drivers/infiniband/hw/mlx4/sysfs.c b/drivers/infiniband/hw/mlx4/sysfs.c new file mode 100644 index 0000000..cb4c66e --- /dev/null +++ b/drivers/infiniband/hw/mlx4/sysfs.c @@ -0,0 +1,906 @@ +/* + * Copyright (c) 2012 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/*#include "core_priv.h"*/ +#include "mlx4_ib.h" +#include +#include +#include + +#include +/*show_admin_alias_guid returns the administratively assigned value of that GUID. + * Values returned in buf parameter string: + * 0 - requests opensm to assign a value. + * ffffffffffffffff - delete this entry. + * other - value assigned by administrator. + */ +static ssize_t show_admin_alias_guid(struct device *dev, + struct device_attribute *attr, char *buf) +{ + int record_num;/*0-15*/ + int guid_index_in_rec; /*0 - 7*/ + struct mlx4_ib_iov_sysfs_attr *mlx4_ib_iov_dentry = + container_of(attr, struct mlx4_ib_iov_sysfs_attr, dentry); + struct mlx4_ib_iov_port *port = mlx4_ib_iov_dentry->ctx; + struct mlx4_ib_dev *mdev = port->dev; + + record_num = mlx4_ib_iov_dentry->entry_num / 8 ; + guid_index_in_rec = mlx4_ib_iov_dentry->entry_num % 8 ; + + return sprintf(buf, "%llx\n", + be64_to_cpu(*(__be64 *)&mdev->sriov.alias_guid. + ports_guid[port->num - 1]. + all_rec_per_port[record_num]. + all_recs[8 * guid_index_in_rec])); +} + +/* store_admin_alias_guid stores the (new) administratively assigned value of that GUID. + * Values in buf parameter string: + * 0 - requests opensm to assign a value. + * 0xffffffffffffffff - delete this entry. + * other - guid value assigned by the administrator. + */ +static ssize_t store_admin_alias_guid(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + int record_num;/*0-15*/ + int guid_index_in_rec; /*0 - 7*/ + struct mlx4_ib_iov_sysfs_attr *mlx4_ib_iov_dentry = + container_of(attr, struct mlx4_ib_iov_sysfs_attr, dentry); + struct mlx4_ib_iov_port *port = mlx4_ib_iov_dentry->ctx; + struct mlx4_ib_dev *mdev = port->dev; + u64 sysadmin_ag_val; + + record_num = mlx4_ib_iov_dentry->entry_num / 8; + guid_index_in_rec = mlx4_ib_iov_dentry->entry_num % 8; + if (0 == record_num && 0 == guid_index_in_rec) { + pr_err("GUID 0 block 0 is RO\n"); + return count; + } + sscanf(buf, "%llx", &sysadmin_ag_val); + *(__be64 *)&mdev->sriov.alias_guid.ports_guid[port->num - 1]. + all_rec_per_port[record_num]. + all_recs[GUID_REC_SIZE * guid_index_in_rec] = + cpu_to_be64(sysadmin_ag_val); + + /* Change the state to be pending for update */ + mdev->sriov.alias_guid.ports_guid[port->num - 1].all_rec_per_port[record_num].status + = MLX4_GUID_INFO_STATUS_IDLE ; + + mdev->sriov.alias_guid.ports_guid[port->num - 1].all_rec_per_port[record_num].method + = MLX4_GUID_INFO_RECORD_SET; + + switch (sysadmin_ag_val) { + case MLX4_GUID_FOR_DELETE_VAL: + mdev->sriov.alias_guid.ports_guid[port->num - 1].all_rec_per_port[record_num].method + = MLX4_GUID_INFO_RECORD_DELETE; + mdev->sriov.alias_guid.ports_guid[port->num - 1].all_rec_per_port[record_num].ownership + = MLX4_GUID_SYSADMIN_ASSIGN; + break; + /* The sysadmin requests the SM to re-assign */ + case MLX4_NOT_SET_GUID: + mdev->sriov.alias_guid.ports_guid[port->num - 1].all_rec_per_port[record_num].ownership + = MLX4_GUID_DRIVER_ASSIGN; + break; + /* The sysadmin requests a specific value.*/ + default: + mdev->sriov.alias_guid.ports_guid[port->num - 1].all_rec_per_port[record_num].ownership + = MLX4_GUID_SYSADMIN_ASSIGN; + break; + } + + /* set the record index */ + mdev->sriov.alias_guid.ports_guid[port->num - 1].all_rec_per_port[record_num].guid_indexes + = mlx4_ib_get_aguid_comp_mask_from_ix(guid_index_in_rec); + + mlx4_ib_init_alias_guid_work(mdev, port->num - 1); + + return count; +} + +static ssize_t show_port_gid(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct mlx4_ib_iov_sysfs_attr *mlx4_ib_iov_dentry = + container_of(attr, struct mlx4_ib_iov_sysfs_attr, dentry); + struct mlx4_ib_iov_port *port = mlx4_ib_iov_dentry->ctx; + struct mlx4_ib_dev *mdev = port->dev; + union ib_gid gid; + ssize_t ret; + + ret = __mlx4_ib_query_gid(&mdev->ib_dev, port->num, + mlx4_ib_iov_dentry->entry_num, &gid, 1); + if (ret) + return ret; + ret = sprintf(buf, "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x\n", + be16_to_cpu(((__be16 *) gid.raw)[0]), + be16_to_cpu(((__be16 *) gid.raw)[1]), + be16_to_cpu(((__be16 *) gid.raw)[2]), + be16_to_cpu(((__be16 *) gid.raw)[3]), + be16_to_cpu(((__be16 *) gid.raw)[4]), + be16_to_cpu(((__be16 *) gid.raw)[5]), + be16_to_cpu(((__be16 *) gid.raw)[6]), + be16_to_cpu(((__be16 *) gid.raw)[7])); + return ret; +} + +static ssize_t show_phys_port_pkey(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct mlx4_ib_iov_sysfs_attr *mlx4_ib_iov_dentry = + container_of(attr, struct mlx4_ib_iov_sysfs_attr, dentry); + struct mlx4_ib_iov_port *port = mlx4_ib_iov_dentry->ctx; + struct mlx4_ib_dev *mdev = port->dev; + u16 pkey; + ssize_t ret; + + ret = __mlx4_ib_query_pkey(&mdev->ib_dev, port->num, + mlx4_ib_iov_dentry->entry_num, &pkey, 1); + if (ret) + return ret; + + return sprintf(buf, "0x%04x\n", pkey); +} + +#define DENTRY_REMOVE(_dentry) \ +do { \ + sysfs_remove_file((_dentry)->kobj, &(_dentry)->dentry.attr); \ +} while (0); + +static int create_sysfs_entry(void *_ctx, struct mlx4_ib_iov_sysfs_attr *_dentry, + char *_name, struct kobject *_kobj, + ssize_t (*show)(struct device *dev, + struct device_attribute *attr, + char *buf), + ssize_t (*store)(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) + ) +{ + int ret = 0; + struct mlx4_ib_iov_sysfs_attr *vdentry = _dentry; + + vdentry->ctx = _ctx; + vdentry->dentry.show = show; + vdentry->dentry.store = store; + sysfs_attr_init(&vdentry->dentry.attr); + vdentry->dentry.attr.name = vdentry->name; + vdentry->dentry.attr.mode = 0; + vdentry->kobj = _kobj; + snprintf(vdentry->name, 15, "%s", _name); + + if (vdentry->dentry.store) + vdentry->dentry.attr.mode |= S_IWUSR; + + if (vdentry->dentry.show) + vdentry->dentry.attr.mode |= S_IRUGO; + + ret = sysfs_create_file(vdentry->kobj, &vdentry->dentry.attr); + if (ret) { + pr_err("failed to create %s\n", vdentry->dentry.attr.name); + vdentry->ctx = NULL; + return ret; + } + + return ret; +} + +int add_sysfs_port_mcg_attr(struct mlx4_ib_dev *device, int port_num, + struct attribute *attr) +{ + struct mlx4_ib_iov_port *port = &device->iov_ports[port_num - 1]; + int ret; + + ret = sysfs_create_file(port->mcgs_parent, attr); + if (ret) + pr_err("failed to create %s\n", attr->name); + + return ret; +} + +void del_sysfs_port_mcg_attr(struct mlx4_ib_dev *device, int port_num, + struct attribute *attr) +{ + struct mlx4_ib_iov_port *port = &device->iov_ports[port_num - 1]; + + sysfs_remove_file(port->mcgs_parent, attr); +} + +static int add_port_entries(struct mlx4_ib_dev *device, int port_num) +{ + int i; + char buff[10]; + struct mlx4_ib_iov_port *port = NULL; + int ret = 0 ; + struct ib_port_attr attr; + + /* get the physical gid and pkey table sizes.*/ + ret = __mlx4_ib_query_port(&device->ib_dev, port_num, &attr, 1); + if (ret) + goto err; + + port = &device->iov_ports[port_num - 1]; + port->dev = device; + port->num = port_num; + /* Directory structure: + * iov - + * port num - + * admin_guids + * gids (operational) + * mcg_table + */ + port->dentr_ar = kzalloc(sizeof (struct mlx4_ib_iov_sysfs_attr_ar), + GFP_KERNEL); + if (!port->dentr_ar) { + ret = -ENOMEM; + goto err; + } + sprintf(buff, "%d", port_num); + port->cur_port = kobject_create_and_add(buff, + kobject_get(device->ports_parent)); + if (!port->cur_port) { + ret = -ENOMEM; + goto kobj_create_err; + } + /* admin GUIDs */ + port->admin_alias_parent = kobject_create_and_add("admin_guids", + kobject_get(port->cur_port)); + if (!port->admin_alias_parent) { + ret = -ENOMEM; + goto err_admin_guids; + } + for (i = 0 ; i < attr.gid_tbl_len; i++) { + sprintf(buff, "%d", i); + port->dentr_ar->dentries[i].entry_num = i; + ret = create_sysfs_entry(port, &port->dentr_ar->dentries[i], + buff, port->admin_alias_parent, + show_admin_alias_guid, store_admin_alias_guid); + if (ret) + goto err_admin_alias_parent; + } + + /* gids subdirectory (operational gids) */ + port->gids_parent = kobject_create_and_add("gids", + kobject_get(port->cur_port)); + if (!port->gids_parent) { + ret = -ENOMEM; + goto err_gids; + } + + for (i = 0 ; i < attr.gid_tbl_len; i++) { + sprintf(buff, "%d", i); + port->dentr_ar->dentries[attr.gid_tbl_len + i].entry_num = i; + ret = create_sysfs_entry(port, + &port->dentr_ar->dentries[attr.gid_tbl_len + i], + buff, + port->gids_parent, show_port_gid, NULL); + if (ret) + goto err_gids_parent; + } + + /* physical port pkey table */ + port->pkeys_parent = + kobject_create_and_add("pkeys", kobject_get(port->cur_port)); + if (!port->pkeys_parent) { + ret = -ENOMEM; + goto err_pkeys; + } + + for (i = 0 ; i < attr.pkey_tbl_len; i++) { + sprintf(buff, "%d", i); + port->dentr_ar->dentries[2 * attr.gid_tbl_len + i].entry_num = i; + ret = create_sysfs_entry(port, + &port->dentr_ar->dentries[2 * attr.gid_tbl_len + i], + buff, port->pkeys_parent, + show_phys_port_pkey, NULL); + if (ret) + goto err_pkeys_parent; + } + + /* MCGs table */ + port->mcgs_parent = + kobject_create_and_add("mcgs", kobject_get(port->cur_port)); + if (!port->mcgs_parent) { + ret = -ENOMEM; + goto err_mcgs; + } + return 0; + +err_mcgs: + kobject_put(port->cur_port); + +err_pkeys_parent: + kobject_put(port->pkeys_parent); + +err_pkeys: + kobject_put(port->cur_port); + +err_gids_parent: + kobject_put(port->gids_parent); + +err_gids: + kobject_put(port->cur_port); + +err_admin_alias_parent: + kobject_put(port->admin_alias_parent); + +err_admin_guids: + kobject_put(port->cur_port); + kobject_put(port->cur_port); /* once more for create_and_add buff */ + +kobj_create_err: + kobject_put(device->ports_parent); + kfree(port->dentr_ar); + +err: + pr_err("add_port_entries FAILED: for port:%d, error: %d\n", + port_num, ret); + return ret; +} + +static void get_name(struct mlx4_ib_dev *dev, char *name, int i, int max) +{ + char base_name[9]; + + /* pci_name format is: bus:dev:func -> xxxx:yy:zz.n */ + strlcpy(name, pci_name(dev->dev->pdev), max); + strncpy(base_name, name, 8); /*till xxxx:yy:*/ + base_name[8] = '\0'; + /* with no ARI only 3 last bits are used so when the fn is higher than 8 + * need to add it to the dev num, so count in the last number will be + * modulo 8 */ + sprintf(name, "%s%.2d.%d", base_name, (i/8), (i%8)); +} + +struct mlx4_port { + struct kobject kobj; + struct mlx4_ib_dev *dev; + struct attribute_group pkey_group; + struct attribute_group gid_group; + struct device_attribute enable_smi_admin; + struct device_attribute smi_enabled; + int slave; + u8 port_num; +}; + + +static void mlx4_port_release(struct kobject *kobj) +{ + struct mlx4_port *p = container_of(kobj, struct mlx4_port, kobj); + struct attribute *a; + int i; + + for (i = 0; (a = p->pkey_group.attrs[i]); ++i) + kfree(a); + kfree(p->pkey_group.attrs); + for (i = 0; (a = p->gid_group.attrs[i]); ++i) + kfree(a); + kfree(p->gid_group.attrs); + kfree(p); +} + +struct port_attribute { + struct attribute attr; + ssize_t (*show)(struct mlx4_port *, struct port_attribute *, char *buf); + ssize_t (*store)(struct mlx4_port *, struct port_attribute *, + const char *buf, size_t count); +}; + +static ssize_t port_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct port_attribute *port_attr = + container_of(attr, struct port_attribute, attr); + struct mlx4_port *p = container_of(kobj, struct mlx4_port, kobj); + + if (!port_attr->show) + return -EIO; + return port_attr->show(p, port_attr, buf); +} + +static ssize_t port_attr_store(struct kobject *kobj, + struct attribute *attr, + const char *buf, size_t size) +{ + struct port_attribute *port_attr = + container_of(attr, struct port_attribute, attr); + struct mlx4_port *p = container_of(kobj, struct mlx4_port, kobj); + + if (!port_attr->store) + return -EIO; + return port_attr->store(p, port_attr, buf, size); +} + +static const struct sysfs_ops port_sysfs_ops = { + .show = port_attr_show, + .store = port_attr_store, +}; + +static struct kobj_type port_type = { + .release = mlx4_port_release, + .sysfs_ops = &port_sysfs_ops, +}; + +struct port_table_attribute { + struct port_attribute attr; + char name[8]; + int index; +}; + +static ssize_t show_port_pkey(struct mlx4_port *p, struct port_attribute *attr, + char *buf) +{ + struct port_table_attribute *tab_attr = + container_of(attr, struct port_table_attribute, attr); + ssize_t ret = -ENODEV; + + if (p->dev->pkeys.virt2phys_pkey[p->slave][p->port_num - 1][tab_attr->index] >= + (p->dev->dev->caps.pkey_table_len[p->port_num])) + ret = sprintf(buf, "none\n"); + else + ret = sprintf(buf, "%d\n", + p->dev->pkeys.virt2phys_pkey[p->slave] + [p->port_num - 1][tab_attr->index]); + return ret; +} + +static ssize_t store_port_pkey(struct mlx4_port *p, struct port_attribute *attr, + const char *buf, size_t count) +{ + struct port_table_attribute *tab_attr = + container_of(attr, struct port_table_attribute, attr); + int idx; + int err; + + /* do not allow remapping Dom0 virtual pkey table */ + if (p->slave == mlx4_master_func_num(p->dev->dev)) + return -EINVAL; + + if (!strncasecmp(buf, "no", 2)) + idx = p->dev->dev->phys_caps.pkey_phys_table_len[p->port_num] - 1; + else if (sscanf(buf, "%i", &idx) != 1 || + idx >= p->dev->dev->caps.pkey_table_len[p->port_num] || + idx < 0) + return -EINVAL; + + p->dev->pkeys.virt2phys_pkey[p->slave][p->port_num - 1] + [tab_attr->index] = idx; + mlx4_sync_pkey_table(p->dev->dev, p->slave, p->port_num, + tab_attr->index, idx); + err = mlx4_gen_pkey_eqe(p->dev->dev, p->slave, p->port_num); + if (err) { + pr_err("mlx4_gen_pkey_eqe failed for slave %d," + " port %d, index %d\n", p->slave, p->port_num, idx); + return err; + } + return count; +} + +static ssize_t show_port_gid_idx(struct mlx4_port *p, + struct port_attribute *attr, char *buf) +{ + return sprintf(buf, "%d\n", p->slave); +} + +static struct attribute ** +alloc_group_attrs(ssize_t (*show)(struct mlx4_port *, + struct port_attribute *, char *buf), + ssize_t (*store)(struct mlx4_port *, struct port_attribute *, + const char *buf, size_t count), + int len) +{ + struct attribute **tab_attr; + struct port_table_attribute *element; + int i; + + tab_attr = kcalloc(1 + len, sizeof (struct attribute *), GFP_KERNEL); + if (!tab_attr) + return NULL; + + for (i = 0; i < len; i++) { + element = kzalloc(sizeof (struct port_table_attribute), + GFP_KERNEL); + if (!element) + goto err; + if (snprintf(element->name, sizeof (element->name), + "%d", i) >= sizeof (element->name)) { + kfree(element); + goto err; + } + sysfs_attr_init(&element->attr.attr); + element->attr.attr.name = element->name; + if (store) { + element->attr.attr.mode = S_IWUSR | S_IRUGO; + element->attr.store = store; + } else + element->attr.attr.mode = S_IRUGO; + + element->attr.show = show; + element->index = i; + tab_attr[i] = &element->attr.attr; + } + return tab_attr; + +err: + while (--i >= 0) + kfree(tab_attr[i]); + kfree(tab_attr); + return NULL; +} + +static ssize_t sysfs_show_smi_enabled(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct mlx4_port *p = + container_of(attr, struct mlx4_port, smi_enabled); + ssize_t len = 0; + + if (mlx4_vf_smi_enabled(p->dev->dev, p->slave, p->port_num)) + len = sprintf(buf, "%d\n", 1); + else + len = sprintf(buf, "%d\n", 0); + + return len; +} + +static ssize_t sysfs_show_enable_smi_admin(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct mlx4_port *p = + container_of(attr, struct mlx4_port, enable_smi_admin); + ssize_t len = 0; + + if (mlx4_vf_get_enable_smi_admin(p->dev->dev, p->slave, p->port_num)) + len = sprintf(buf, "%d\n", 1); + else + len = sprintf(buf, "%d\n", 0); + + return len; +} + +static ssize_t sysfs_store_enable_smi_admin(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct mlx4_port *p = + container_of(attr, struct mlx4_port, enable_smi_admin); + int enable; + + if (sscanf(buf, "%i", &enable) != 1 || + enable < 0 || enable > 1) + return -EINVAL; + + if (mlx4_vf_set_enable_smi_admin(p->dev->dev, p->slave, p->port_num, enable)) + return -EINVAL; + return count; +} + +static int add_vf_smi_entries(struct mlx4_port *p) +{ + int is_eth = rdma_port_get_link_layer(&p->dev->ib_dev, p->port_num) == + IB_LINK_LAYER_ETHERNET; + int ret; + + /* do not display entries if eth transport, or if master */ + if (is_eth || p->slave == mlx4_master_func_num(p->dev->dev)) + return 0; + + sysfs_attr_init(&p->smi_enabled.attr); + p->smi_enabled.show = sysfs_show_smi_enabled; + p->smi_enabled.store = NULL; + p->smi_enabled.attr.name = "smi_enabled"; + p->smi_enabled.attr.mode = 0444; + ret = sysfs_create_file(&p->kobj, &p->smi_enabled.attr); + if (ret) { + pr_err("failed to create smi_enabled\n"); + return ret; + } + + sysfs_attr_init(&p->enable_smi_admin.attr); + p->enable_smi_admin.show = sysfs_show_enable_smi_admin; + p->enable_smi_admin.store = sysfs_store_enable_smi_admin; + p->enable_smi_admin.attr.name = "enable_smi_admin"; + p->enable_smi_admin.attr.mode = 0644; + ret = sysfs_create_file(&p->kobj, &p->enable_smi_admin.attr); + if (ret) { + pr_err("failed to create enable_smi_admin\n"); + sysfs_remove_file(&p->kobj, &p->smi_enabled.attr); + return ret; + } + return 0; +} + +static void remove_vf_smi_entries(struct mlx4_port *p) +{ + int is_eth = rdma_port_get_link_layer(&p->dev->ib_dev, p->port_num) == + IB_LINK_LAYER_ETHERNET; + + if (is_eth || p->slave == mlx4_master_func_num(p->dev->dev)) + return; + + sysfs_remove_file(&p->kobj, &p->smi_enabled.attr); + sysfs_remove_file(&p->kobj, &p->enable_smi_admin.attr); +} + +static int add_port(struct mlx4_ib_dev *dev, int port_num, int slave) +{ + struct mlx4_port *p; + int i; + int ret; + + p = kzalloc(sizeof *p, GFP_KERNEL); + if (!p) + return -ENOMEM; + + p->dev = dev; + p->port_num = port_num; + p->slave = slave; + + ret = kobject_init_and_add(&p->kobj, &port_type, + kobject_get(dev->dev_ports_parent[slave]), + "%d", port_num); + if (ret) + goto err_alloc; + + p->pkey_group.name = "pkey_idx"; + p->pkey_group.attrs = + alloc_group_attrs(show_port_pkey, store_port_pkey, + dev->dev->caps.pkey_table_len[port_num]); + if (!p->pkey_group.attrs) { + ret = -ENOMEM; + goto err_alloc; + } + + ret = sysfs_create_group(&p->kobj, &p->pkey_group); + if (ret) + goto err_free_pkey; + + p->gid_group.name = "gid_idx"; + p->gid_group.attrs = alloc_group_attrs(show_port_gid_idx, NULL, 1); + if (!p->gid_group.attrs) { + ret = -ENOMEM; + goto err_free_pkey; + } + + ret = sysfs_create_group(&p->kobj, &p->gid_group); + if (ret) + goto err_free_gid; + + ret = add_vf_smi_entries(p); + if (ret) + goto err_free_gid; + + list_add_tail(&p->kobj.entry, &dev->pkeys.pkey_port_list[slave]); + return 0; + +err_free_gid: + kfree(p->gid_group.attrs[0]); + kfree(p->gid_group.attrs); + +err_free_pkey: + for (i = 0; i < dev->dev->caps.pkey_table_len[port_num]; ++i) + kfree(p->pkey_group.attrs[i]); + kfree(p->pkey_group.attrs); + +err_alloc: + kobject_put(dev->dev_ports_parent[slave]); + kfree(p); + return ret; +} + +static int register_one_pkey_tree(struct mlx4_ib_dev *dev, int slave) +{ + char name[32]; + int err; + int port; + struct kobject *p, *t; + struct mlx4_port *mport; + struct mlx4_active_ports actv_ports; + + get_name(dev, name, slave, sizeof name); + + dev->pkeys.device_parent[slave] = + kobject_create_and_add(name, kobject_get(dev->iov_parent)); + + if (!dev->pkeys.device_parent[slave]) { + err = -ENOMEM; + goto fail_dev; + } + + INIT_LIST_HEAD(&dev->pkeys.pkey_port_list[slave]); + + dev->dev_ports_parent[slave] = + kobject_create_and_add("ports", + kobject_get(dev->pkeys.device_parent[slave])); + + if (!dev->dev_ports_parent[slave]) { + err = -ENOMEM; + goto err_ports; + } + + actv_ports = mlx4_get_active_ports(dev->dev, slave); + + for (port = 1; port <= dev->dev->caps.num_ports; ++port) { + if (!test_bit(port - 1, actv_ports.ports)) + continue; + err = add_port(dev, port, slave); + if (err) + goto err_add; + } + return 0; + +err_add: + list_for_each_entry_safe(p, t, + &dev->pkeys.pkey_port_list[slave], + entry) { + list_del(&p->entry); + mport = container_of(p, struct mlx4_port, kobj); + sysfs_remove_group(p, &mport->pkey_group); + sysfs_remove_group(p, &mport->gid_group); + remove_vf_smi_entries(mport); + kobject_put(p); + } + kobject_put(dev->dev_ports_parent[slave]); + +err_ports: + kobject_put(dev->pkeys.device_parent[slave]); + /* extra put for the device_parent create_and_add */ + kobject_put(dev->pkeys.device_parent[slave]); + +fail_dev: + kobject_put(dev->iov_parent); + return err; +} + +static int register_pkey_tree(struct mlx4_ib_dev *device) +{ + int i; + + if (!mlx4_is_master(device->dev)) + return 0; + + for (i = 0; i <= device->dev->num_vfs; ++i) + register_one_pkey_tree(device, i); + + return 0; +} + +static void unregister_pkey_tree(struct mlx4_ib_dev *device) +{ + int slave; + struct kobject *p, *t; + struct mlx4_port *port; + + if (!mlx4_is_master(device->dev)) + return; + + for (slave = device->dev->num_vfs; slave >= 0; --slave) { + list_for_each_entry_safe(p, t, + &device->pkeys.pkey_port_list[slave], + entry) { + list_del(&p->entry); + port = container_of(p, struct mlx4_port, kobj); + sysfs_remove_group(p, &port->pkey_group); + sysfs_remove_group(p, &port->gid_group); + remove_vf_smi_entries(port); + kobject_put(p); + kobject_put(device->dev_ports_parent[slave]); + } + kobject_put(device->dev_ports_parent[slave]); + kobject_put(device->pkeys.device_parent[slave]); + kobject_put(device->pkeys.device_parent[slave]); + kobject_put(device->iov_parent); + } +} + +int mlx4_ib_device_register_sysfs(struct mlx4_ib_dev *dev) +{ + int i; + int ret = 0; + + if (!mlx4_is_master(dev->dev)) + return 0; + + dev->iov_parent = + kobject_create_and_add("iov", + kobject_get(dev->ib_dev.ports_parent->parent)); + if (!dev->iov_parent) { + ret = -ENOMEM; + goto err; + } + dev->ports_parent = + kobject_create_and_add("ports", + kobject_get(dev->iov_parent)); + if (!dev->ports_parent) { + ret = -ENOMEM; + goto err_ports; + } + + for (i = 1; i <= dev->ib_dev.phys_port_cnt; ++i) { + ret = add_port_entries(dev, i); + if (ret) + goto err_add_entries; + } + + ret = register_pkey_tree(dev); + if (ret) + goto err_add_entries; + return 0; + +err_add_entries: + kobject_put(dev->ports_parent); + +err_ports: + kobject_put(dev->iov_parent); +err: + kobject_put(dev->ib_dev.ports_parent->parent); + pr_err("mlx4_ib_device_register_sysfs error (%d)\n", ret); + return ret; +} + +static void unregister_alias_guid_tree(struct mlx4_ib_dev *device) +{ + struct mlx4_ib_iov_port *p; + int i; + + if (!mlx4_is_master(device->dev)) + return; + + for (i = 0; i < device->dev->caps.num_ports; i++) { + p = &device->iov_ports[i]; + kobject_put(p->admin_alias_parent); + kobject_put(p->gids_parent); + kobject_put(p->pkeys_parent); + kobject_put(p->mcgs_parent); + kobject_put(p->cur_port); + kobject_put(p->cur_port); + kobject_put(p->cur_port); + kobject_put(p->cur_port); + kobject_put(p->cur_port); + kobject_put(p->dev->ports_parent); + kfree(p->dentr_ar); + } +} + +void mlx4_ib_device_unregister_sysfs(struct mlx4_ib_dev *device) +{ + unregister_alias_guid_tree(device); + unregister_pkey_tree(device); + kobject_put(device->ports_parent); + kobject_put(device->iov_parent); + kobject_put(device->iov_parent); + kobject_put(device->ib_dev.ports_parent->parent); +} diff --git a/drivers/infiniband/hw/mlx4/user.h b/drivers/infiniband/hw/mlx4/user.h new file mode 100644 index 0000000..07e6769 --- /dev/null +++ b/drivers/infiniband/hw/mlx4/user.h @@ -0,0 +1,107 @@ +/* + * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef MLX4_IB_USER_H +#define MLX4_IB_USER_H + +#include + +/* + * Increment this value if any changes that break userspace ABI + * compatibility are made. + */ + +#define MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION 3 +#define MLX4_IB_UVERBS_ABI_VERSION 4 + +/* + * Make sure that all structs defined in this file remain laid out so + * that they pack the same way on 32-bit and 64-bit architectures (to + * avoid incompatibility between 32-bit userspace and 64-bit kernels). + * In particular do not use pointer types -- pass pointers in __u64 + * instead. + */ + +struct mlx4_ib_alloc_ucontext_resp_v3 { + __u32 qp_tab_size; + __u16 bf_reg_size; + __u16 bf_regs_per_page; +}; + +struct mlx4_ib_alloc_ucontext_resp { + __u32 dev_caps; + __u32 qp_tab_size; + __u16 bf_reg_size; + __u16 bf_regs_per_page; + __u32 cqe_size; +}; + +struct mlx4_ib_alloc_pd_resp { + __u32 pdn; + __u32 reserved; +}; + +struct mlx4_ib_create_cq { + __u64 buf_addr; + __u64 db_addr; +}; + +struct mlx4_ib_create_cq_resp { + __u32 cqn; + __u32 reserved; +}; + +struct mlx4_ib_resize_cq { + __u64 buf_addr; +}; + +struct mlx4_ib_create_srq { + __u64 buf_addr; + __u64 db_addr; +}; + +struct mlx4_ib_create_srq_resp { + __u32 srqn; + __u32 reserved; +}; + +struct mlx4_ib_create_qp { + __u64 buf_addr; + __u64 db_addr; + __u8 log_sq_bb_count; + __u8 log_sq_stride; + __u8 sq_no_prefetch; + __u8 reserved[5]; +}; + +#endif /* MLX4_IB_USER_H */ diff --git a/drivers/infiniband/hw/mlx5/Kconfig b/drivers/infiniband/hw/mlx5/Kconfig new file mode 100644 index 0000000..10df386 --- /dev/null +++ b/drivers/infiniband/hw/mlx5/Kconfig @@ -0,0 +1,10 @@ +config MLX5_INFINIBAND + tristate "Mellanox Connect-IB HCA support" + depends on NETDEVICES && ETHERNET && PCI + select NET_VENDOR_MELLANOX + select MLX5_CORE + ---help--- + This driver provides low-level InfiniBand support for + Mellanox Connect-IB PCI Express host channel adapters (HCAs). + This is required to use InfiniBand protocols such as + IP-over-IB or SRP with these devices. diff --git a/drivers/infiniband/hw/mlx5/Makefile b/drivers/infiniband/hw/mlx5/Makefile new file mode 100644 index 0000000..4ea0135 --- /dev/null +++ b/drivers/infiniband/hw/mlx5/Makefile @@ -0,0 +1,3 @@ +obj-$(CONFIG_MLX5_INFINIBAND) += mlx5_ib.o + +mlx5_ib-y := main.o cq.o doorbell.o qp.o mem.o srq.o mr.o ah.o mad.o diff --git a/drivers/infiniband/hw/mlx5/ah.c b/drivers/infiniband/hw/mlx5/ah.c new file mode 100644 index 0000000..39ab0ca --- /dev/null +++ b/drivers/infiniband/hw/mlx5/ah.c @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2013, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "mlx5_ib.h" + +struct ib_ah *create_ib_ah(struct ib_ah_attr *ah_attr, + struct mlx5_ib_ah *ah) +{ + if (ah_attr->ah_flags & IB_AH_GRH) { + memcpy(ah->av.rgid, &ah_attr->grh.dgid, 16); + ah->av.grh_gid_fl = cpu_to_be32(ah_attr->grh.flow_label | + (1 << 30) | + ah_attr->grh.sgid_index << 20); + ah->av.hop_limit = ah_attr->grh.hop_limit; + ah->av.tclass = ah_attr->grh.traffic_class; + } + + ah->av.rlid = cpu_to_be16(ah_attr->dlid); + ah->av.fl_mlid = ah_attr->src_path_bits & 0x7f; + ah->av.stat_rate_sl = (ah_attr->static_rate << 4) | (ah_attr->sl & 0xf); + + return &ah->ibah; +} + +struct ib_ah *mlx5_ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr) +{ + struct mlx5_ib_ah *ah; + + ah = kzalloc(sizeof(*ah), GFP_ATOMIC); + if (!ah) + return ERR_PTR(-ENOMEM); + + return create_ib_ah(ah_attr, ah); /* never fails */ +} + +int mlx5_ib_query_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr) +{ + struct mlx5_ib_ah *ah = to_mah(ibah); + u32 tmp; + + memset(ah_attr, 0, sizeof(*ah_attr)); + + tmp = be32_to_cpu(ah->av.grh_gid_fl); + if (tmp & (1 << 30)) { + ah_attr->ah_flags = IB_AH_GRH; + ah_attr->grh.sgid_index = (tmp >> 20) & 0xff; + ah_attr->grh.flow_label = tmp & 0xfffff; + memcpy(&ah_attr->grh.dgid, ah->av.rgid, 16); + ah_attr->grh.hop_limit = ah->av.hop_limit; + ah_attr->grh.traffic_class = ah->av.tclass; + } + ah_attr->dlid = be16_to_cpu(ah->av.rlid); + ah_attr->static_rate = ah->av.stat_rate_sl >> 4; + ah_attr->sl = ah->av.stat_rate_sl & 0xf; + + return 0; +} + +int mlx5_ib_destroy_ah(struct ib_ah *ah) +{ + kfree(to_mah(ah)); + return 0; +} diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c new file mode 100644 index 0000000..10cfce5 --- /dev/null +++ b/drivers/infiniband/hw/mlx5/cq.c @@ -0,0 +1,1187 @@ +/* + * Copyright (c) 2013, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include "mlx5_ib.h" +#include "user.h" + +static void mlx5_ib_cq_comp(struct mlx5_core_cq *cq) +{ + struct ib_cq *ibcq = &to_mibcq(cq)->ibcq; + + ibcq->comp_handler(ibcq, ibcq->cq_context); +} + +static void mlx5_ib_cq_event(struct mlx5_core_cq *mcq, enum mlx5_event type) +{ + struct mlx5_ib_cq *cq = container_of(mcq, struct mlx5_ib_cq, mcq); + struct mlx5_ib_dev *dev = to_mdev(cq->ibcq.device); + struct ib_cq *ibcq = &cq->ibcq; + struct ib_event event; + + if (type != MLX5_EVENT_TYPE_CQ_ERROR) { + mlx5_ib_warn(dev, "Unexpected event type %d on CQ %06x\n", + type, mcq->cqn); + return; + } + + if (ibcq->event_handler) { + event.device = &dev->ib_dev; + event.event = IB_EVENT_CQ_ERR; + event.element.cq = ibcq; + ibcq->event_handler(&event, ibcq->cq_context); + } +} + +static void *get_cqe_from_buf(struct mlx5_ib_cq_buf *buf, int n, int size) +{ + return mlx5_buf_offset(&buf->buf, n * size); +} + +static void *get_cqe(struct mlx5_ib_cq *cq, int n) +{ + return get_cqe_from_buf(&cq->buf, n, cq->mcq.cqe_sz); +} + +static u8 sw_ownership_bit(int n, int nent) +{ + return (n & nent) ? 1 : 0; +} + +static void *get_sw_cqe(struct mlx5_ib_cq *cq, int n) +{ + void *cqe = get_cqe(cq, n & cq->ibcq.cqe); + struct mlx5_cqe64 *cqe64; + + cqe64 = (cq->mcq.cqe_sz == 64) ? cqe : cqe + 64; + + if (likely((cqe64->op_own) >> 4 != MLX5_CQE_INVALID) && + !((cqe64->op_own & MLX5_CQE_OWNER_MASK) ^ !!(n & (cq->ibcq.cqe + 1)))) { + return cqe; + } else { + return NULL; + } +} + +static void *next_cqe_sw(struct mlx5_ib_cq *cq) +{ + return get_sw_cqe(cq, cq->mcq.cons_index); +} + +static enum ib_wc_opcode get_umr_comp(struct mlx5_ib_wq *wq, int idx) +{ + switch (wq->wr_data[idx]) { + case MLX5_IB_WR_UMR: + return 0; + + case IB_WR_LOCAL_INV: + return IB_WC_LOCAL_INV; + + case IB_WR_FAST_REG_MR: + return IB_WC_FAST_REG_MR; + + default: + pr_warn("unknown completion status\n"); + return 0; + } +} + +static void handle_good_req(struct ib_wc *wc, struct mlx5_cqe64 *cqe, + struct mlx5_ib_wq *wq, int idx) +{ + wc->wc_flags = 0; + switch (be32_to_cpu(cqe->sop_drop_qpn) >> 24) { + case MLX5_OPCODE_RDMA_WRITE_IMM: + wc->wc_flags |= IB_WC_WITH_IMM; + case MLX5_OPCODE_RDMA_WRITE: + wc->opcode = IB_WC_RDMA_WRITE; + break; + case MLX5_OPCODE_SEND_IMM: + wc->wc_flags |= IB_WC_WITH_IMM; + case MLX5_OPCODE_SEND: + case MLX5_OPCODE_SEND_INVAL: + wc->opcode = IB_WC_SEND; + break; + case MLX5_OPCODE_RDMA_READ: + wc->opcode = IB_WC_RDMA_READ; + wc->byte_len = be32_to_cpu(cqe->byte_cnt); + break; + case MLX5_OPCODE_ATOMIC_CS: + wc->opcode = IB_WC_COMP_SWAP; + wc->byte_len = 8; + break; + case MLX5_OPCODE_ATOMIC_FA: + wc->opcode = IB_WC_FETCH_ADD; + wc->byte_len = 8; + break; + case MLX5_OPCODE_ATOMIC_MASKED_CS: + wc->opcode = IB_WC_MASKED_COMP_SWAP; + wc->byte_len = 8; + break; + case MLX5_OPCODE_ATOMIC_MASKED_FA: + wc->opcode = IB_WC_MASKED_FETCH_ADD; + wc->byte_len = 8; + break; + case MLX5_OPCODE_BIND_MW: + wc->opcode = IB_WC_BIND_MW; + break; + case MLX5_OPCODE_UMR: + wc->opcode = get_umr_comp(wq, idx); + break; + } +} + +enum { + MLX5_GRH_IN_BUFFER = 1, + MLX5_GRH_IN_CQE = 2, +}; + +static void handle_responder(struct ib_wc *wc, struct mlx5_cqe64 *cqe, + struct mlx5_ib_qp *qp) +{ + struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.device); + struct mlx5_ib_srq *srq; + struct mlx5_ib_wq *wq; + u16 wqe_ctr; + u8 g; + + if (qp->ibqp.srq || qp->ibqp.xrcd) { + struct mlx5_core_srq *msrq = NULL; + + if (qp->ibqp.xrcd) { + msrq = mlx5_core_get_srq(dev->mdev, + be32_to_cpu(cqe->srqn)); + srq = to_mibsrq(msrq); + } else { + srq = to_msrq(qp->ibqp.srq); + } + if (srq) { + wqe_ctr = be16_to_cpu(cqe->wqe_counter); + wc->wr_id = srq->wrid[wqe_ctr]; + mlx5_ib_free_srq_wqe(srq, wqe_ctr); + if (msrq && atomic_dec_and_test(&msrq->refcount)) + complete(&msrq->free); + } + } else { + wq = &qp->rq; + wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)]; + ++wq->tail; + } + wc->byte_len = be32_to_cpu(cqe->byte_cnt); + + switch (cqe->op_own >> 4) { + case MLX5_CQE_RESP_WR_IMM: + wc->opcode = IB_WC_RECV_RDMA_WITH_IMM; + wc->wc_flags = IB_WC_WITH_IMM; + wc->ex.imm_data = cqe->imm_inval_pkey; + break; + case MLX5_CQE_RESP_SEND: + wc->opcode = IB_WC_RECV; + wc->wc_flags = 0; + break; + case MLX5_CQE_RESP_SEND_IMM: + wc->opcode = IB_WC_RECV; + wc->wc_flags = IB_WC_WITH_IMM; + wc->ex.imm_data = cqe->imm_inval_pkey; + break; + case MLX5_CQE_RESP_SEND_INV: + wc->opcode = IB_WC_RECV; + wc->wc_flags = IB_WC_WITH_INVALIDATE; + wc->ex.invalidate_rkey = be32_to_cpu(cqe->imm_inval_pkey); + break; + } + wc->slid = be16_to_cpu(cqe->slid); + wc->sl = (be32_to_cpu(cqe->flags_rqpn) >> 24) & 0xf; + wc->src_qp = be32_to_cpu(cqe->flags_rqpn) & 0xffffff; + wc->dlid_path_bits = cqe->ml_path; + g = (be32_to_cpu(cqe->flags_rqpn) >> 28) & 3; + wc->wc_flags |= g ? IB_WC_GRH : 0; + wc->pkey_index = be32_to_cpu(cqe->imm_inval_pkey) & 0xffff; +} + +static void dump_cqe(struct mlx5_ib_dev *dev, struct mlx5_err_cqe *cqe) +{ + __be32 *p = (__be32 *)cqe; + int i; + + mlx5_ib_warn(dev, "dump error cqe\n"); + for (i = 0; i < sizeof(*cqe) / 16; i++, p += 4) + pr_info("%08x %08x %08x %08x\n", be32_to_cpu(p[0]), + be32_to_cpu(p[1]), be32_to_cpu(p[2]), + be32_to_cpu(p[3])); +} + +static void mlx5_handle_error_cqe(struct mlx5_ib_dev *dev, + struct mlx5_err_cqe *cqe, + struct ib_wc *wc) +{ + int dump = 1; + + switch (cqe->syndrome) { + case MLX5_CQE_SYNDROME_LOCAL_LENGTH_ERR: + wc->status = IB_WC_LOC_LEN_ERR; + break; + case MLX5_CQE_SYNDROME_LOCAL_QP_OP_ERR: + wc->status = IB_WC_LOC_QP_OP_ERR; + break; + case MLX5_CQE_SYNDROME_LOCAL_PROT_ERR: + wc->status = IB_WC_LOC_PROT_ERR; + break; + case MLX5_CQE_SYNDROME_WR_FLUSH_ERR: + dump = 0; + wc->status = IB_WC_WR_FLUSH_ERR; + break; + case MLX5_CQE_SYNDROME_MW_BIND_ERR: + wc->status = IB_WC_MW_BIND_ERR; + break; + case MLX5_CQE_SYNDROME_BAD_RESP_ERR: + wc->status = IB_WC_BAD_RESP_ERR; + break; + case MLX5_CQE_SYNDROME_LOCAL_ACCESS_ERR: + wc->status = IB_WC_LOC_ACCESS_ERR; + break; + case MLX5_CQE_SYNDROME_REMOTE_INVAL_REQ_ERR: + wc->status = IB_WC_REM_INV_REQ_ERR; + break; + case MLX5_CQE_SYNDROME_REMOTE_ACCESS_ERR: + wc->status = IB_WC_REM_ACCESS_ERR; + break; + case MLX5_CQE_SYNDROME_REMOTE_OP_ERR: + wc->status = IB_WC_REM_OP_ERR; + break; + case MLX5_CQE_SYNDROME_TRANSPORT_RETRY_EXC_ERR: + wc->status = IB_WC_RETRY_EXC_ERR; + dump = 0; + break; + case MLX5_CQE_SYNDROME_RNR_RETRY_EXC_ERR: + wc->status = IB_WC_RNR_RETRY_EXC_ERR; + dump = 0; + break; + case MLX5_CQE_SYNDROME_REMOTE_ABORTED_ERR: + wc->status = IB_WC_REM_ABORT_ERR; + break; + default: + wc->status = IB_WC_GENERAL_ERR; + break; + } + + wc->vendor_err = cqe->vendor_err_synd; + if (dump) + dump_cqe(dev, cqe); +} + +static int is_atomic_response(struct mlx5_ib_qp *qp, uint16_t idx) +{ + /* TBD: waiting decision + */ + return 0; +} + +static void *mlx5_get_atomic_laddr(struct mlx5_ib_qp *qp, uint16_t idx) +{ + struct mlx5_wqe_data_seg *dpseg; + void *addr; + + dpseg = mlx5_get_send_wqe(qp, idx) + sizeof(struct mlx5_wqe_ctrl_seg) + + sizeof(struct mlx5_wqe_raddr_seg) + + sizeof(struct mlx5_wqe_atomic_seg); + addr = (void *)(unsigned long)be64_to_cpu(dpseg->addr); + return addr; +} + +static void handle_atomic(struct mlx5_ib_qp *qp, struct mlx5_cqe64 *cqe64, + uint16_t idx) +{ + void *addr; + int byte_count; + int i; + + if (!is_atomic_response(qp, idx)) + return; + + byte_count = be32_to_cpu(cqe64->byte_cnt); + addr = mlx5_get_atomic_laddr(qp, idx); + + if (byte_count == 4) { + *(uint32_t *)addr = be32_to_cpu(*((__be32 *)addr)); + } else { + for (i = 0; i < byte_count; i += 8) { + *(uint64_t *)addr = be64_to_cpu(*((__be64 *)addr)); + addr += 8; + } + } + + return; +} + +static void handle_atomics(struct mlx5_ib_qp *qp, struct mlx5_cqe64 *cqe64, + u16 tail, u16 head) +{ + u16 idx; + + do { + idx = tail & (qp->sq.wqe_cnt - 1); + handle_atomic(qp, cqe64, idx); + if (idx == head) + break; + + tail = qp->sq.w_list[idx].next; + } while (1); + tail = qp->sq.w_list[idx].next; + qp->sq.last_poll = tail; +} + +static void free_cq_buf(struct mlx5_ib_dev *dev, struct mlx5_ib_cq_buf *buf) +{ + mlx5_buf_free(dev->mdev, &buf->buf); +} + +static void get_sig_err_item(struct mlx5_sig_err_cqe *cqe, + struct ib_sig_err *item) +{ + u16 syndrome = be16_to_cpu(cqe->syndrome); + +#define GUARD_ERR (1 << 13) +#define APPTAG_ERR (1 << 12) +#define REFTAG_ERR (1 << 11) + + if (syndrome & GUARD_ERR) { + item->err_type = IB_SIG_BAD_GUARD; + item->expected = be32_to_cpu(cqe->expected_trans_sig) >> 16; + item->actual = be32_to_cpu(cqe->actual_trans_sig) >> 16; + } else + if (syndrome & REFTAG_ERR) { + item->err_type = IB_SIG_BAD_REFTAG; + item->expected = be32_to_cpu(cqe->expected_reftag); + item->actual = be32_to_cpu(cqe->actual_reftag); + } else + if (syndrome & APPTAG_ERR) { + item->err_type = IB_SIG_BAD_APPTAG; + item->expected = be32_to_cpu(cqe->expected_trans_sig) & 0xffff; + item->actual = be32_to_cpu(cqe->actual_trans_sig) & 0xffff; + } else { + pr_err("Got signature completion error with bad syndrome %04x\n", + syndrome); + } + + item->sig_err_offset = be64_to_cpu(cqe->err_offset); + item->key = be32_to_cpu(cqe->mkey); +} + +static int mlx5_poll_one(struct mlx5_ib_cq *cq, + struct mlx5_ib_qp **cur_qp, + struct ib_wc *wc) +{ + struct mlx5_ib_dev *dev = to_mdev(cq->ibcq.device); + struct mlx5_err_cqe *err_cqe; + struct mlx5_cqe64 *cqe64; + struct mlx5_core_qp *mqp; + struct mlx5_ib_wq *wq; + struct mlx5_sig_err_cqe *sig_err_cqe; + struct mlx5_core_mr *mmr; + struct mlx5_ib_mr *mr; + uint8_t opcode; + uint32_t qpn; + u16 wqe_ctr; + void *cqe; + int idx; + +repoll: + cqe = next_cqe_sw(cq); + if (!cqe) + return -EAGAIN; + + cqe64 = (cq->mcq.cqe_sz == 64) ? cqe : cqe + 64; + + ++cq->mcq.cons_index; + + /* Make sure we read CQ entry contents after we've checked the + * ownership bit. + */ + rmb(); + + opcode = cqe64->op_own >> 4; + if (unlikely(opcode == MLX5_CQE_RESIZE_CQ)) { + if (likely(cq->resize_buf)) { + free_cq_buf(dev, &cq->buf); + cq->buf = *cq->resize_buf; + kfree(cq->resize_buf); + cq->resize_buf = NULL; + goto repoll; + } else { + mlx5_ib_warn(dev, "unexpected resize cqe\n"); + } + } + + qpn = ntohl(cqe64->sop_drop_qpn) & 0xffffff; + if (!*cur_qp || (qpn != (*cur_qp)->ibqp.qp_num)) { + /* We do not have to take the QP table lock here, + * because CQs will be locked while QPs are removed + * from the table. + */ + mqp = __mlx5_qp_lookup(dev->mdev, qpn); + if (unlikely(!mqp)) { + mlx5_ib_warn(dev, "CQE@CQ %06x for unknown QPN %6x\n", + cq->mcq.cqn, qpn); + return -EINVAL; + } + + *cur_qp = to_mibqp(mqp); + } + + wc->qp = &(*cur_qp)->ibqp; + switch (opcode) { + case MLX5_CQE_REQ: + wq = &(*cur_qp)->sq; + wqe_ctr = be16_to_cpu(cqe64->wqe_counter); + idx = wqe_ctr & (wq->wqe_cnt - 1); + handle_good_req(wc, cqe64, wq, idx); + handle_atomics(*cur_qp, cqe64, wq->last_poll, idx); + wc->wr_id = wq->wrid[idx]; + wq->tail = wq->wqe_head[idx] + 1; + wc->status = IB_WC_SUCCESS; + break; + case MLX5_CQE_RESP_WR_IMM: + case MLX5_CQE_RESP_SEND: + case MLX5_CQE_RESP_SEND_IMM: + case MLX5_CQE_RESP_SEND_INV: + handle_responder(wc, cqe64, *cur_qp); + wc->status = IB_WC_SUCCESS; + break; + case MLX5_CQE_RESIZE_CQ: + break; + case MLX5_CQE_REQ_ERR: + case MLX5_CQE_RESP_ERR: + err_cqe = (struct mlx5_err_cqe *)cqe64; + mlx5_handle_error_cqe(dev, err_cqe, wc); + mlx5_ib_dbg(dev, "%s error cqe on cqn 0x%x:\n", + opcode == MLX5_CQE_REQ_ERR ? + "Requestor" : "Responder", cq->mcq.cqn); + mlx5_ib_dbg(dev, "syndrome 0x%x, vendor syndrome 0x%x\n", + err_cqe->syndrome, err_cqe->vendor_err_synd); + if (opcode == MLX5_CQE_REQ_ERR) { + wq = &(*cur_qp)->sq; + wqe_ctr = be16_to_cpu(cqe64->wqe_counter); + idx = wqe_ctr & (wq->wqe_cnt - 1); + wc->wr_id = wq->wrid[idx]; + wq->tail = wq->wqe_head[idx] + 1; + } else { + struct mlx5_ib_srq *srq; + + if ((*cur_qp)->ibqp.srq) { + srq = to_msrq((*cur_qp)->ibqp.srq); + wqe_ctr = be16_to_cpu(cqe64->wqe_counter); + wc->wr_id = srq->wrid[wqe_ctr]; + mlx5_ib_free_srq_wqe(srq, wqe_ctr); + } else { + wq = &(*cur_qp)->rq; + wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)]; + ++wq->tail; + } + } + break; + case MLX5_CQE_SIG_ERR: + sig_err_cqe = (struct mlx5_sig_err_cqe *)cqe64; + + read_lock(&dev->mdev->priv.mr_table.lock); + mmr = __mlx5_mr_lookup(dev->mdev, + mlx5_base_mkey(be32_to_cpu(sig_err_cqe->mkey))); + if (unlikely(!mmr)) { + read_unlock(&dev->mdev->priv.mr_table.lock); + mlx5_ib_warn(dev, "CQE@CQ %06x for unknown MR %6x\n", + cq->mcq.cqn, be32_to_cpu(sig_err_cqe->mkey)); + return -EINVAL; + } + + mr = to_mibmr(mmr); + get_sig_err_item(sig_err_cqe, &mr->sig->err_item); + mr->sig->sig_err_exists = true; + mr->sig->sigerr_count++; + + mlx5_ib_warn(dev, "CQN: 0x%x Got SIGERR on key: 0x%x err_type %x err_offset %llx expected %x actual %x\n", + cq->mcq.cqn, mr->sig->err_item.key, + mr->sig->err_item.err_type, + mr->sig->err_item.sig_err_offset, + mr->sig->err_item.expected, + mr->sig->err_item.actual); + + read_unlock(&dev->mdev->priv.mr_table.lock); + goto repoll; + } + + return 0; +} + +int mlx5_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc) +{ + struct mlx5_ib_cq *cq = to_mcq(ibcq); + struct mlx5_ib_qp *cur_qp = NULL; + unsigned long flags; + int npolled; + int err = 0; + + spin_lock_irqsave(&cq->lock, flags); + + for (npolled = 0; npolled < num_entries; npolled++) { + err = mlx5_poll_one(cq, &cur_qp, wc + npolled); + if (err) + break; + } + + if (npolled) + mlx5_cq_set_ci(&cq->mcq); + + spin_unlock_irqrestore(&cq->lock, flags); + + if (err == 0 || err == -EAGAIN) + return npolled; + else + return err; +} + +int mlx5_ib_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags) +{ + mlx5_cq_arm(&to_mcq(ibcq)->mcq, + (flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED ? + MLX5_CQ_DB_REQ_NOT_SOL : MLX5_CQ_DB_REQ_NOT, + to_mdev(ibcq->device)->mdev->priv.uuari.uars[0].map, + MLX5_GET_DOORBELL_LOCK(&to_mdev(ibcq->device)->mdev->priv.cq_uar_lock)); + + return 0; +} + +static int alloc_cq_buf(struct mlx5_ib_dev *dev, struct mlx5_ib_cq_buf *buf, + int nent, int cqe_size) +{ + int err; + + err = mlx5_buf_alloc(dev->mdev, nent * cqe_size, + PAGE_SIZE * 2, &buf->buf); + if (err) + return err; + + buf->cqe_size = cqe_size; + buf->nent = nent; + + return 0; +} + +static int create_cq_user(struct mlx5_ib_dev *dev, struct ib_udata *udata, + struct ib_ucontext *context, struct mlx5_ib_cq *cq, + int entries, struct mlx5_create_cq_mbox_in **cqb, + int *cqe_size, int *index, int *inlen) +{ + struct mlx5_ib_create_cq ucmd; + size_t ucmdlen; + int page_shift; + int npages; + int ncont; + int err; + + ucmdlen = + (udata->inlen - sizeof(struct ib_uverbs_cmd_hdr) < + sizeof(ucmd)) ? (sizeof(ucmd) - + sizeof(ucmd.reserved)) : sizeof(ucmd); + + if (ib_copy_from_udata(&ucmd, udata, ucmdlen)) + return -EFAULT; + + if (ucmdlen == sizeof(ucmd) && + ucmd.reserved != 0) + return -EINVAL; + + if (ucmd.cqe_size != 64 && ucmd.cqe_size != 128) + return -EINVAL; + + *cqe_size = ucmd.cqe_size; + + cq->buf.umem = ib_umem_get(context, ucmd.buf_addr, + entries * ucmd.cqe_size, + IB_ACCESS_LOCAL_WRITE, 1); + if (IS_ERR(cq->buf.umem)) { + err = PTR_ERR(cq->buf.umem); + return err; + } + + err = mlx5_ib_db_map_user(to_mucontext(context), ucmd.db_addr, + &cq->db); + if (err) + goto err_umem; + + mlx5_ib_cont_pages(cq->buf.umem, ucmd.buf_addr, &npages, &page_shift, + &ncont, NULL); + mlx5_ib_dbg(dev, "addr 0x%llx, size %u, npages %d, page_shift %d, ncont %d\n", + ucmd.buf_addr, entries * ucmd.cqe_size, npages, page_shift, ncont); + + *inlen = sizeof(**cqb) + sizeof(*(*cqb)->pas) * ncont; + *cqb = mlx5_vzalloc(*inlen); + if (!*cqb) { + err = -ENOMEM; + goto err_db; + } + mlx5_ib_populate_pas(dev, cq->buf.umem, page_shift, (*cqb)->pas, 0); + (*cqb)->ctx.log_pg_sz = page_shift - MLX5_ADAPTER_PAGE_SHIFT; + + *index = to_mucontext(context)->uuari.uars[0].index; + + return 0; + +err_db: + mlx5_ib_db_unmap_user(to_mucontext(context), &cq->db); + +err_umem: + ib_umem_release(cq->buf.umem); + return err; +} + +static void destroy_cq_user(struct mlx5_ib_cq *cq, struct ib_ucontext *context) +{ + mlx5_ib_db_unmap_user(to_mucontext(context), &cq->db); + ib_umem_release(cq->buf.umem); +} + +static void init_cq_buf(struct mlx5_ib_cq *cq, struct mlx5_ib_cq_buf *buf) +{ + int i; + void *cqe; + struct mlx5_cqe64 *cqe64; + + for (i = 0; i < buf->nent; i++) { + cqe = get_cqe_from_buf(buf, i, buf->cqe_size); + cqe64 = buf->cqe_size == 64 ? cqe : cqe + 64; + cqe64->op_own = MLX5_CQE_INVALID << 4; + } +} + +static int create_cq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq, + int entries, int cqe_size, + struct mlx5_create_cq_mbox_in **cqb, + int *index, int *inlen) +{ + int err; + + err = mlx5_db_alloc(dev->mdev, &cq->db); + if (err) + return err; + + cq->mcq.set_ci_db = cq->db.db; + cq->mcq.arm_db = cq->db.db + 1; + *cq->mcq.set_ci_db = 0; + *cq->mcq.arm_db = 0; + cq->mcq.cqe_sz = cqe_size; + + err = alloc_cq_buf(dev, &cq->buf, entries, cqe_size); + if (err) + goto err_db; + + init_cq_buf(cq, &cq->buf); + + *inlen = sizeof(**cqb) + sizeof(*(*cqb)->pas) * cq->buf.buf.npages; + *cqb = mlx5_vzalloc(*inlen); + if (!*cqb) { + err = -ENOMEM; + goto err_buf; + } + mlx5_fill_page_array(&cq->buf.buf, (*cqb)->pas); + + (*cqb)->ctx.log_pg_sz = cq->buf.buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT; + *index = dev->mdev->priv.uuari.uars[0].index; + + return 0; + +err_buf: + free_cq_buf(dev, &cq->buf); + +err_db: + mlx5_db_free(dev->mdev, &cq->db); + return err; +} + +static void destroy_cq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq) +{ + free_cq_buf(dev, &cq->buf); + mlx5_db_free(dev->mdev, &cq->db); +} + +struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev, int entries, + int vector, struct ib_ucontext *context, + struct ib_udata *udata) +{ + struct mlx5_create_cq_mbox_in *cqb = NULL; + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct mlx5_ib_cq *cq; + int uninitialized_var(index); + int uninitialized_var(inlen); + int cqe_size; + int irqn; + int eqn; + int err; + + if (entries < 0) + return ERR_PTR(-EINVAL); + + entries = roundup_pow_of_two(entries + 1); + if (entries > dev->mdev->caps.gen.max_cqes) + return ERR_PTR(-EINVAL); + + cq = kzalloc(sizeof(*cq), GFP_KERNEL); + if (!cq) + return ERR_PTR(-ENOMEM); + + cq->ibcq.cqe = entries - 1; + mutex_init(&cq->resize_mutex); + spin_lock_init(&cq->lock); + cq->resize_buf = NULL; + cq->resize_umem = NULL; + + if (context) { + err = create_cq_user(dev, udata, context, cq, entries, + &cqb, &cqe_size, &index, &inlen); + if (err) + goto err_create; + } else { + /* for now choose 64 bytes till we have a proper interface */ + cqe_size = 64; + err = create_cq_kernel(dev, cq, entries, cqe_size, &cqb, + &index, &inlen); + if (err) + goto err_create; + } + + cq->cqe_size = cqe_size; + cqb->ctx.cqe_sz_flags = cqe_sz_to_mlx_sz(cqe_size) << 5; + cqb->ctx.log_sz_usr_page = cpu_to_be32((ilog2(entries) << 24) | index); + err = mlx5_vector2eqn(dev, vector, &eqn, &irqn); + if (err) + goto err_cqb; + + cqb->ctx.c_eqn = cpu_to_be16(eqn); + cqb->ctx.db_record_addr = cpu_to_be64(cq->db.dma); + + err = mlx5_core_create_cq(dev->mdev, &cq->mcq, cqb, inlen); + if (err) + goto err_cqb; + + mlx5_ib_dbg(dev, "cqn 0x%x\n", cq->mcq.cqn); + cq->mcq.irqn = irqn; + cq->mcq.comp = mlx5_ib_cq_comp; + cq->mcq.event = mlx5_ib_cq_event; + + if (context) + if (ib_copy_to_udata(udata, &cq->mcq.cqn, sizeof(__u32))) { + err = -EFAULT; + goto err_cmd; + } + + + mlx5_vfree(cqb); + return &cq->ibcq; + +err_cmd: + mlx5_core_destroy_cq(dev->mdev, &cq->mcq); + +err_cqb: + mlx5_vfree(cqb); + if (context) + destroy_cq_user(cq, context); + else + destroy_cq_kernel(dev, cq); + +err_create: + kfree(cq); + + return ERR_PTR(err); +} + + +int mlx5_ib_destroy_cq(struct ib_cq *cq) +{ + struct mlx5_ib_dev *dev = to_mdev(cq->device); + struct mlx5_ib_cq *mcq = to_mcq(cq); + struct ib_ucontext *context = NULL; + + if (cq->uobject) + context = cq->uobject->context; + + mlx5_core_destroy_cq(dev->mdev, &mcq->mcq); + if (context) + destroy_cq_user(mcq, context); + else + destroy_cq_kernel(dev, mcq); + + kfree(mcq); + + return 0; +} + +static int is_equal_rsn(struct mlx5_cqe64 *cqe64, u32 rsn) +{ + return rsn == (ntohl(cqe64->sop_drop_qpn) & 0xffffff); +} + +void __mlx5_ib_cq_clean(struct mlx5_ib_cq *cq, u32 rsn, struct mlx5_ib_srq *srq) +{ + struct mlx5_cqe64 *cqe64, *dest64; + void *cqe, *dest; + u32 prod_index; + int nfreed = 0; + u8 owner_bit; + + if (!cq) + return; + + /* First we need to find the current producer index, so we + * know where to start cleaning from. It doesn't matter if HW + * adds new entries after this loop -- the QP we're worried + * about is already in RESET, so the new entries won't come + * from our QP and therefore don't need to be checked. + */ + for (prod_index = cq->mcq.cons_index; get_sw_cqe(cq, prod_index); prod_index++) + if (prod_index == cq->mcq.cons_index + cq->ibcq.cqe) + break; + + /* Now sweep backwards through the CQ, removing CQ entries + * that match our QP by copying older entries on top of them. + */ + while ((int) --prod_index - (int) cq->mcq.cons_index >= 0) { + cqe = get_cqe(cq, prod_index & cq->ibcq.cqe); + cqe64 = (cq->mcq.cqe_sz == 64) ? cqe : cqe + 64; + if (is_equal_rsn(cqe64, rsn)) { + if (srq && (ntohl(cqe64->srqn) & 0xffffff)) + mlx5_ib_free_srq_wqe(srq, be16_to_cpu(cqe64->wqe_counter)); + ++nfreed; + } else if (nfreed) { + dest = get_cqe(cq, (prod_index + nfreed) & cq->ibcq.cqe); + dest64 = (cq->mcq.cqe_sz == 64) ? dest : dest + 64; + owner_bit = dest64->op_own & MLX5_CQE_OWNER_MASK; + memcpy(dest, cqe, cq->mcq.cqe_sz); + dest64->op_own = owner_bit | + (dest64->op_own & ~MLX5_CQE_OWNER_MASK); + } + } + + if (nfreed) { + cq->mcq.cons_index += nfreed; + /* Make sure update of buffer contents is done before + * updating consumer index. + */ + wmb(); + mlx5_cq_set_ci(&cq->mcq); + } +} + +void mlx5_ib_cq_clean(struct mlx5_ib_cq *cq, u32 qpn, struct mlx5_ib_srq *srq) +{ + if (!cq) + return; + + spin_lock_irq(&cq->lock); + __mlx5_ib_cq_clean(cq, qpn, srq); + spin_unlock_irq(&cq->lock); +} + +int mlx5_ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period) +{ + struct mlx5_modify_cq_mbox_in *in; + struct mlx5_ib_dev *dev = to_mdev(cq->device); + struct mlx5_ib_cq *mcq = to_mcq(cq); + int err; + u32 fsel; + + if (!(dev->mdev->caps.gen.flags & MLX5_DEV_CAP_FLAG_CQ_MODER)) + return -ENOSYS; + + in = kzalloc(sizeof(*in), GFP_KERNEL); + if (!in) + return -ENOMEM; + + in->cqn = cpu_to_be32(mcq->mcq.cqn); + fsel = (MLX5_CQ_MODIFY_PERIOD | MLX5_CQ_MODIFY_COUNT); + in->ctx.cq_period = cpu_to_be16(cq_period); + in->ctx.cq_max_count = cpu_to_be16(cq_count); + in->field_select = cpu_to_be32(fsel); + err = mlx5_core_modify_cq(dev->mdev, &mcq->mcq, in, sizeof(*in)); + kfree(in); + + if (err) + mlx5_ib_warn(dev, "modify cq 0x%x failed\n", mcq->mcq.cqn); + + return err; +} + +static int resize_user(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq, + int entries, struct ib_udata *udata, int *npas, + int *page_shift, int *cqe_size) +{ + struct mlx5_ib_resize_cq ucmd; + struct ib_umem *umem; + int err; + int npages; + struct ib_ucontext *context = cq->buf.umem->context; + + err = ib_copy_from_udata(&ucmd, udata, sizeof(ucmd)); + if (err) + return err; + + if (ucmd.reserved0 || ucmd.reserved1) + return -EINVAL; + + umem = ib_umem_get(context, ucmd.buf_addr, entries * ucmd.cqe_size, + IB_ACCESS_LOCAL_WRITE, 1); + if (IS_ERR(umem)) { + err = PTR_ERR(umem); + return err; + } + + mlx5_ib_cont_pages(umem, ucmd.buf_addr, &npages, page_shift, + npas, NULL); + + cq->resize_umem = umem; + *cqe_size = ucmd.cqe_size; + + return 0; +} + +static void un_resize_user(struct mlx5_ib_cq *cq) +{ + ib_umem_release(cq->resize_umem); +} + +static int resize_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq, + int entries, int cqe_size) +{ + int err; + + cq->resize_buf = kzalloc(sizeof(*cq->resize_buf), GFP_KERNEL); + if (!cq->resize_buf) + return -ENOMEM; + + err = alloc_cq_buf(dev, cq->resize_buf, entries, cqe_size); + if (err) + goto ex; + + init_cq_buf(cq, cq->resize_buf); + + return 0; + +ex: + kfree(cq->resize_buf); + return err; +} + +static void un_resize_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq) +{ + free_cq_buf(dev, cq->resize_buf); + cq->resize_buf = NULL; +} + +static int copy_resize_cqes(struct mlx5_ib_cq *cq) +{ + struct mlx5_ib_dev *dev = to_mdev(cq->ibcq.device); + struct mlx5_cqe64 *scqe64; + struct mlx5_cqe64 *dcqe64; + void *start_cqe; + void *scqe; + void *dcqe; + int ssize; + int dsize; + int i; + u8 sw_own; + + ssize = cq->buf.cqe_size; + dsize = cq->resize_buf->cqe_size; + if (ssize != dsize) { + mlx5_ib_warn(dev, "resize from different cqe size is not supported\n"); + return -EINVAL; + } + + i = cq->mcq.cons_index; + scqe = get_sw_cqe(cq, i); + scqe64 = ssize == 64 ? scqe : scqe + 64; + start_cqe = scqe; + if (!scqe) { + mlx5_ib_warn(dev, "expected cqe in sw ownership\n"); + return -EINVAL; + } + + while ((scqe64->op_own >> 4) != MLX5_CQE_RESIZE_CQ) { + dcqe = get_cqe_from_buf(cq->resize_buf, + (i + 1) & (cq->resize_buf->nent), + dsize); + dcqe64 = dsize == 64 ? dcqe : dcqe + 64; + sw_own = sw_ownership_bit(i + 1, cq->resize_buf->nent); + memcpy(dcqe, scqe, dsize); + dcqe64->op_own = (dcqe64->op_own & ~MLX5_CQE_OWNER_MASK) | sw_own; + + ++i; + scqe = get_sw_cqe(cq, i); + scqe64 = ssize == 64 ? scqe : scqe + 64; + if (!scqe) { + mlx5_ib_warn(dev, "expected cqe in sw ownership\n"); + return -EINVAL; + } + + if (scqe == start_cqe) { + pr_warn("resize CQ failed to get resize CQE, CQN 0x%x\n", + cq->mcq.cqn); + return -ENOMEM; + } + } + ++cq->mcq.cons_index; + return 0; +} + +int mlx5_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata) +{ + struct mlx5_ib_dev *dev = to_mdev(ibcq->device); + struct mlx5_ib_cq *cq = to_mcq(ibcq); + struct mlx5_modify_cq_mbox_in *in; + int err; + int npas; + int page_shift; + int inlen; + int uninitialized_var(cqe_size); + unsigned long flags; + + if (!(dev->mdev->caps.gen.flags & MLX5_DEV_CAP_FLAG_RESIZE_CQ)) { + pr_info("Firmware does not support resize CQ\n"); + return -ENOSYS; + } + + if (entries < 1) + return -EINVAL; + + entries = roundup_pow_of_two(entries + 1); + if (entries > dev->mdev->caps.gen.max_cqes + 1) + return -EINVAL; + + if (entries == ibcq->cqe + 1) + return 0; + + mutex_lock(&cq->resize_mutex); + if (udata) { + err = resize_user(dev, cq, entries, udata, &npas, &page_shift, + &cqe_size); + } else { + cqe_size = 64; + err = resize_kernel(dev, cq, entries, cqe_size); + if (!err) { + npas = cq->resize_buf->buf.npages; + page_shift = cq->resize_buf->buf.page_shift; + } + } + + if (err) + goto ex; + + inlen = sizeof(*in) + npas * sizeof(in->pas[0]); + in = mlx5_vzalloc(inlen); + if (!in) { + err = -ENOMEM; + goto ex_resize; + } + + if (udata) + mlx5_ib_populate_pas(dev, cq->resize_umem, page_shift, + in->pas, 0); + else + mlx5_fill_page_array(&cq->resize_buf->buf, in->pas); + + in->field_select = cpu_to_be32(MLX5_MODIFY_CQ_MASK_LOG_SIZE | + MLX5_MODIFY_CQ_MASK_PG_OFFSET | + MLX5_MODIFY_CQ_MASK_PG_SIZE); + in->ctx.log_pg_sz = page_shift - MLX5_ADAPTER_PAGE_SHIFT; + in->ctx.cqe_sz_flags = cqe_sz_to_mlx_sz(cqe_size) << 5; + in->ctx.page_offset = 0; + in->ctx.log_sz_usr_page = cpu_to_be32(ilog2(entries) << 24); + in->hdr.opmod = cpu_to_be16(MLX5_CQ_OPMOD_RESIZE); + in->cqn = cpu_to_be32(cq->mcq.cqn); + + err = mlx5_core_modify_cq(dev->mdev, &cq->mcq, in, inlen); + if (err) + goto ex_alloc; + + if (udata) { + cq->ibcq.cqe = entries - 1; + ib_umem_release(cq->buf.umem); + cq->buf.umem = cq->resize_umem; + cq->resize_umem = NULL; + } else { + struct mlx5_ib_cq_buf tbuf; + int resized = 0; + + spin_lock_irqsave(&cq->lock, flags); + if (cq->resize_buf) { + err = copy_resize_cqes(cq); + if (!err) { + tbuf = cq->buf; + cq->buf = *cq->resize_buf; + kfree(cq->resize_buf); + cq->resize_buf = NULL; + resized = 1; + } + } + cq->ibcq.cqe = entries - 1; + spin_unlock_irqrestore(&cq->lock, flags); + if (resized) + free_cq_buf(dev, &tbuf); + } + mutex_unlock(&cq->resize_mutex); + + mlx5_vfree(in); + return 0; + +ex_alloc: + mlx5_vfree(in); + +ex_resize: + if (udata) + un_resize_user(cq); + else + un_resize_kernel(dev, cq); +ex: + mutex_unlock(&cq->resize_mutex); + return err; +} + +int mlx5_ib_get_cqe_size(struct mlx5_ib_dev *dev, struct ib_cq *ibcq) +{ + struct mlx5_ib_cq *cq; + + if (!ibcq) + return 128; + + cq = to_mcq(ibcq); + return cq->cqe_size; +} diff --git a/drivers/infiniband/hw/mlx5/doorbell.c b/drivers/infiniband/hw/mlx5/doorbell.c new file mode 100644 index 0000000..ece028f --- /dev/null +++ b/drivers/infiniband/hw/mlx5/doorbell.c @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2013, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include + +#include "mlx5_ib.h" + +struct mlx5_ib_user_db_page { + struct list_head list; + struct ib_umem *umem; + unsigned long user_virt; + int refcnt; +}; + +int mlx5_ib_db_map_user(struct mlx5_ib_ucontext *context, unsigned long virt, + struct mlx5_db *db) +{ + struct mlx5_ib_user_db_page *page; + int err = 0; + + mutex_lock(&context->db_page_mutex); + + list_for_each_entry(page, &context->db_page_list, list) + if (page->user_virt == (virt & PAGE_MASK)) + goto found; + + page = kmalloc(sizeof(*page), GFP_KERNEL); + if (!page) { + err = -ENOMEM; + goto out; + } + + page->user_virt = (virt & PAGE_MASK); + page->refcnt = 0; + page->umem = ib_umem_get(&context->ibucontext, virt & PAGE_MASK, + PAGE_SIZE, 0, 0); + if (IS_ERR(page->umem)) { + err = PTR_ERR(page->umem); + kfree(page); + goto out; + } + + list_add(&page->list, &context->db_page_list); + +found: + db->dma = sg_dma_address(page->umem->sg_head.sgl) + (virt & ~PAGE_MASK); + db->u.user_page = page; + ++page->refcnt; + +out: + mutex_unlock(&context->db_page_mutex); + + return err; +} + +void mlx5_ib_db_unmap_user(struct mlx5_ib_ucontext *context, struct mlx5_db *db) +{ + mutex_lock(&context->db_page_mutex); + + if (!--db->u.user_page->refcnt) { + list_del(&db->u.user_page->list); + ib_umem_release(db->u.user_page->umem); + kfree(db->u.user_page); + } + + mutex_unlock(&context->db_page_mutex); +} diff --git a/drivers/infiniband/hw/mlx5/mad.c b/drivers/infiniband/hw/mlx5/mad.c new file mode 100644 index 0000000..657af9a --- /dev/null +++ b/drivers/infiniband/hw/mlx5/mad.c @@ -0,0 +1,139 @@ +/* + * Copyright (c) 2013, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include "mlx5_ib.h" + +enum { + MLX5_IB_VENDOR_CLASS1 = 0x9, + MLX5_IB_VENDOR_CLASS2 = 0xa +}; + +int mlx5_MAD_IFC(struct mlx5_ib_dev *dev, int ignore_mkey, int ignore_bkey, + u8 port, struct ib_wc *in_wc, struct ib_grh *in_grh, + void *in_mad, void *response_mad) +{ + u8 op_modifier = 0; + + /* Key check traps can't be generated unless we have in_wc to + * tell us where to send the trap. + */ + if (ignore_mkey || !in_wc) + op_modifier |= 0x1; + if (ignore_bkey || !in_wc) + op_modifier |= 0x2; + + return mlx5_core_mad_ifc(dev->mdev, in_mad, response_mad, op_modifier, port); +} + +int mlx5_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, + struct ib_wc *in_wc, struct ib_grh *in_grh, + struct ib_mad *in_mad, struct ib_mad *out_mad) +{ + u16 slid; + int err; + + slid = in_wc ? in_wc->slid : be16_to_cpu(IB_LID_PERMISSIVE); + + if (in_mad->mad_hdr.method == IB_MGMT_METHOD_TRAP && slid == 0) + return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED; + + if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED || + in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) { + if (in_mad->mad_hdr.method != IB_MGMT_METHOD_GET && + in_mad->mad_hdr.method != IB_MGMT_METHOD_SET && + in_mad->mad_hdr.method != IB_MGMT_METHOD_TRAP_REPRESS) + return IB_MAD_RESULT_SUCCESS; + + /* Don't process SMInfo queries -- the SMA can't handle them. + */ + if (in_mad->mad_hdr.attr_id == IB_SMP_ATTR_SM_INFO) + return IB_MAD_RESULT_SUCCESS; + } else if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_PERF_MGMT || + in_mad->mad_hdr.mgmt_class == MLX5_IB_VENDOR_CLASS1 || + in_mad->mad_hdr.mgmt_class == MLX5_IB_VENDOR_CLASS2 || + in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_CONG_MGMT) { + if (in_mad->mad_hdr.method != IB_MGMT_METHOD_GET && + in_mad->mad_hdr.method != IB_MGMT_METHOD_SET) + return IB_MAD_RESULT_SUCCESS; + } else { + return IB_MAD_RESULT_SUCCESS; + } + + err = mlx5_MAD_IFC(to_mdev(ibdev), + mad_flags & IB_MAD_IGNORE_MKEY, + mad_flags & IB_MAD_IGNORE_BKEY, + port_num, in_wc, in_grh, in_mad, out_mad); + if (err) + return IB_MAD_RESULT_FAILURE; + + /* set return bit in status of directed route responses */ + if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) + out_mad->mad_hdr.status |= cpu_to_be16(1 << 15); + + if (in_mad->mad_hdr.method == IB_MGMT_METHOD_TRAP_REPRESS) + /* no response for trap repress */ + return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED; + + return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY; +} + +int mlx5_query_ext_port_caps(struct mlx5_ib_dev *dev, u8 port) +{ + struct ib_smp *in_mad = NULL; + struct ib_smp *out_mad = NULL; + int err = -ENOMEM; + u16 packet_error; + + in_mad = kzalloc(sizeof(*in_mad), GFP_KERNEL); + out_mad = kmalloc(sizeof(*out_mad), GFP_KERNEL); + if (!in_mad || !out_mad) + goto out; + + init_query_mad(in_mad); + in_mad->attr_id = MLX5_ATTR_EXTENDED_PORT_INFO; + in_mad->attr_mod = cpu_to_be32(port); + + err = mlx5_MAD_IFC(dev, 1, 1, 1, NULL, NULL, in_mad, out_mad); + + packet_error = be16_to_cpu(out_mad->status); + + dev->mdev->caps.gen.ext_port_cap[port - 1] = (!err && !packet_error) ? + MLX_EXT_PORT_CAP_FLAG_EXTENDED_PORT_INFO : 0; + +out: + kfree(in_mad); + kfree(out_mad); + return err; +} diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c new file mode 100644 index 0000000..1ba6c42 --- /dev/null +++ b/drivers/infiniband/hw/mlx5/main.c @@ -0,0 +1,1455 @@ +/* + * Copyright (c) 2013, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "user.h" +#include "mlx5_ib.h" + +#define DRIVER_NAME "mlx5_ib" +#define DRIVER_VERSION "2.2-1" +#define DRIVER_RELDATE "Feb 2014" + +MODULE_AUTHOR("Eli Cohen "); +MODULE_DESCRIPTION("Mellanox Connect-IB HCA IB driver"); +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_VERSION(DRIVER_VERSION); + +static int deprecated_prof_sel = 2; +module_param_named(prof_sel, deprecated_prof_sel, int, 0444); +MODULE_PARM_DESC(prof_sel, "profile selector. Deprecated here. Moved to module mlx5_core"); + +static char mlx5_version[] = + DRIVER_NAME ": Mellanox Connect-IB Infiniband driver v" + DRIVER_VERSION " (" DRIVER_RELDATE ")\n"; + +int mlx5_vector2eqn(struct mlx5_ib_dev *dev, int vector, int *eqn, int *irqn) +{ + struct mlx5_eq_table *table = &dev->mdev->priv.eq_table; + struct mlx5_eq *eq, *n; + int err = -ENOENT; + + spin_lock(&table->lock); + list_for_each_entry_safe(eq, n, &dev->eqs_list, list) { + if (eq->index == vector) { + *eqn = eq->eqn; + *irqn = eq->irqn; + err = 0; + break; + } + } + spin_unlock(&table->lock); + + return err; +} + +static int alloc_comp_eqs(struct mlx5_ib_dev *dev) +{ + struct mlx5_eq_table *table = &dev->mdev->priv.eq_table; + char name[MLX5_MAX_EQ_NAME]; + struct mlx5_eq *eq, *n; + int ncomp_vec; + int nent; + int err; + int i; + + INIT_LIST_HEAD(&dev->eqs_list); + ncomp_vec = table->num_comp_vectors; + nent = MLX5_COMP_EQ_SIZE; + for (i = 0; i < ncomp_vec; i++) { + eq = kzalloc(sizeof(*eq), GFP_KERNEL); + if (!eq) { + err = -ENOMEM; + goto clean; + } + + snprintf(name, MLX5_MAX_EQ_NAME, "mlx5_comp%d", i); + err = mlx5_create_map_eq(dev->mdev, eq, + i + MLX5_EQ_VEC_COMP_BASE, nent, 0, + name, &dev->mdev->priv.uuari.uars[0]); + if (err) { + kfree(eq); + goto clean; + } + mlx5_ib_dbg(dev, "allocated completion EQN %d\n", eq->eqn); + eq->index = i; + spin_lock(&table->lock); + list_add_tail(&eq->list, &dev->eqs_list); + spin_unlock(&table->lock); + } + + dev->num_comp_vectors = ncomp_vec; + return 0; + +clean: + spin_lock(&table->lock); + list_for_each_entry_safe(eq, n, &dev->eqs_list, list) { + list_del(&eq->list); + spin_unlock(&table->lock); + if (mlx5_destroy_unmap_eq(dev->mdev, eq)) + mlx5_ib_warn(dev, "failed to destroy EQ 0x%x\n", eq->eqn); + kfree(eq); + spin_lock(&table->lock); + } + spin_unlock(&table->lock); + return err; +} + +static void free_comp_eqs(struct mlx5_ib_dev *dev) +{ + struct mlx5_eq_table *table = &dev->mdev->priv.eq_table; + struct mlx5_eq *eq, *n; + + spin_lock(&table->lock); + list_for_each_entry_safe(eq, n, &dev->eqs_list, list) { + list_del(&eq->list); + spin_unlock(&table->lock); + if (mlx5_destroy_unmap_eq(dev->mdev, eq)) + mlx5_ib_warn(dev, "failed to destroy EQ 0x%x\n", eq->eqn); + kfree(eq); + spin_lock(&table->lock); + } + spin_unlock(&table->lock); +} + +static int mlx5_ib_query_device(struct ib_device *ibdev, + struct ib_device_attr *props) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct ib_smp *in_mad = NULL; + struct ib_smp *out_mad = NULL; + struct mlx5_general_caps *gen; + int err = -ENOMEM; + int max_rq_sg; + int max_sq_sg; + u64 flags; + + gen = &dev->mdev->caps.gen; + in_mad = kzalloc(sizeof(*in_mad), GFP_KERNEL); + out_mad = kmalloc(sizeof(*out_mad), GFP_KERNEL); + if (!in_mad || !out_mad) + goto out; + + init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_NODE_INFO; + + err = mlx5_MAD_IFC(to_mdev(ibdev), 1, 1, 1, NULL, NULL, in_mad, out_mad); + if (err) + goto out; + + memset(props, 0, sizeof(*props)); + + props->fw_ver = ((u64)fw_rev_maj(dev->mdev) << 32) | + (fw_rev_min(dev->mdev) << 16) | + fw_rev_sub(dev->mdev); + props->device_cap_flags = IB_DEVICE_CHANGE_PHY_PORT | + IB_DEVICE_PORT_ACTIVE_EVENT | + IB_DEVICE_SYS_IMAGE_GUID | + IB_DEVICE_RC_RNR_NAK_GEN; + flags = gen->flags; + if (flags & MLX5_DEV_CAP_FLAG_BAD_PKEY_CNTR) + props->device_cap_flags |= IB_DEVICE_BAD_PKEY_CNTR; + if (flags & MLX5_DEV_CAP_FLAG_BAD_QKEY_CNTR) + props->device_cap_flags |= IB_DEVICE_BAD_QKEY_CNTR; + if (flags & MLX5_DEV_CAP_FLAG_APM) + props->device_cap_flags |= IB_DEVICE_AUTO_PATH_MIG; + props->device_cap_flags |= IB_DEVICE_LOCAL_DMA_LKEY; + if (flags & MLX5_DEV_CAP_FLAG_XRC) + props->device_cap_flags |= IB_DEVICE_XRC; + props->device_cap_flags |= IB_DEVICE_MEM_MGT_EXTENSIONS; + if (flags & MLX5_DEV_CAP_FLAG_SIG_HAND_OVER) { + props->device_cap_flags |= IB_DEVICE_SIGNATURE_HANDOVER; + /* At this stage no support for signature handover */ + props->sig_prot_cap = IB_PROT_T10DIF_TYPE_1 | + IB_PROT_T10DIF_TYPE_2 | + IB_PROT_T10DIF_TYPE_3; + props->sig_guard_cap = IB_GUARD_T10DIF_CRC | + IB_GUARD_T10DIF_CSUM; + } + if (flags & MLX5_DEV_CAP_FLAG_BLOCK_MCAST) + props->device_cap_flags |= IB_DEVICE_BLOCK_MULTICAST_LOOPBACK; + + props->vendor_id = be32_to_cpup((__be32 *)(out_mad->data + 36)) & + 0xffffff; + props->vendor_part_id = be16_to_cpup((__be16 *)(out_mad->data + 30)); + props->hw_ver = be32_to_cpup((__be32 *)(out_mad->data + 32)); + memcpy(&props->sys_image_guid, out_mad->data + 4, 8); + + props->max_mr_size = ~0ull; + props->page_size_cap = gen->min_page_sz; + props->max_qp = 1 << gen->log_max_qp; + props->max_qp_wr = gen->max_wqes; + max_rq_sg = gen->max_rq_desc_sz / sizeof(struct mlx5_wqe_data_seg); + max_sq_sg = (gen->max_sq_desc_sz - sizeof(struct mlx5_wqe_ctrl_seg)) / + sizeof(struct mlx5_wqe_data_seg); + props->max_sge = min(max_rq_sg, max_sq_sg); + props->max_cq = 1 << gen->log_max_cq; + props->max_cqe = gen->max_cqes - 1; + props->max_mr = 1 << gen->log_max_mkey; + props->max_pd = 1 << gen->log_max_pd; + props->max_qp_rd_atom = 1 << gen->log_max_ra_req_qp; + props->max_qp_init_rd_atom = 1 << gen->log_max_ra_res_qp; + props->max_srq = 1 << gen->log_max_srq; + props->max_srq_wr = gen->max_srq_wqes - 1; + props->local_ca_ack_delay = gen->local_ca_ack_delay; + props->max_res_rd_atom = props->max_qp_rd_atom * props->max_qp; + props->max_srq_sge = max_rq_sg - 1; + props->max_fast_reg_page_list_len = (unsigned int)-1; + props->local_ca_ack_delay = gen->local_ca_ack_delay; + props->atomic_cap = IB_ATOMIC_NONE; + props->masked_atomic_cap = IB_ATOMIC_NONE; + props->max_pkeys = be16_to_cpup((__be16 *)(out_mad->data + 28)); + props->max_mcast_grp = 1 << gen->log_max_mcg; + props->max_mcast_qp_attach = gen->max_qp_mcg; + props->max_total_mcast_qp_attach = props->max_mcast_qp_attach * + props->max_mcast_grp; + props->max_map_per_fmr = INT_MAX; /* no limit in ConnectIB */ + +out: + kfree(in_mad); + kfree(out_mad); + + return err; +} + +int mlx5_ib_query_port(struct ib_device *ibdev, u8 port, + struct ib_port_attr *props) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct ib_smp *in_mad = NULL; + struct ib_smp *out_mad = NULL; + struct mlx5_general_caps *gen; + int ext_active_speed; + int err = -ENOMEM; + + gen = &dev->mdev->caps.gen; + if (port < 1 || port > gen->num_ports) { + mlx5_ib_warn(dev, "invalid port number %d\n", port); + return -EINVAL; + } + + in_mad = kzalloc(sizeof(*in_mad), GFP_KERNEL); + out_mad = kmalloc(sizeof(*out_mad), GFP_KERNEL); + if (!in_mad || !out_mad) + goto out; + + memset(props, 0, sizeof(*props)); + + init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_PORT_INFO; + in_mad->attr_mod = cpu_to_be32(port); + + err = mlx5_MAD_IFC(dev, 1, 1, port, NULL, NULL, in_mad, out_mad); + if (err) { + mlx5_ib_warn(dev, "err %d\n", err); + goto out; + } + + + props->lid = be16_to_cpup((__be16 *)(out_mad->data + 16)); + props->lmc = out_mad->data[34] & 0x7; + props->sm_lid = be16_to_cpup((__be16 *)(out_mad->data + 18)); + props->sm_sl = out_mad->data[36] & 0xf; + props->state = out_mad->data[32] & 0xf; + props->phys_state = out_mad->data[33] >> 4; + props->port_cap_flags = be32_to_cpup((__be32 *)(out_mad->data + 20)); + props->gid_tbl_len = out_mad->data[50]; + props->max_msg_sz = 1 << gen->log_max_msg; + props->pkey_tbl_len = gen->port[port - 1].pkey_table_len; + props->bad_pkey_cntr = be16_to_cpup((__be16 *)(out_mad->data + 46)); + props->qkey_viol_cntr = be16_to_cpup((__be16 *)(out_mad->data + 48)); + props->active_width = out_mad->data[31] & 0xf; + props->active_speed = out_mad->data[35] >> 4; + props->max_mtu = out_mad->data[41] & 0xf; + props->active_mtu = out_mad->data[36] >> 4; + props->subnet_timeout = out_mad->data[51] & 0x1f; + props->max_vl_num = out_mad->data[37] >> 4; + props->init_type_reply = out_mad->data[41] >> 4; + + /* Check if extended speeds (EDR/FDR/...) are supported */ + if (props->port_cap_flags & IB_PORT_EXTENDED_SPEEDS_SUP) { + ext_active_speed = out_mad->data[62] >> 4; + + switch (ext_active_speed) { + case 1: + props->active_speed = 16; /* FDR */ + break; + case 2: + props->active_speed = 32; /* EDR */ + break; + } + } + + /* If reported active speed is QDR, check if is FDR-10 */ + if (props->active_speed == 4) { + if (gen->ext_port_cap[port - 1] & + MLX_EXT_PORT_CAP_FLAG_EXTENDED_PORT_INFO) { + init_query_mad(in_mad); + in_mad->attr_id = MLX5_ATTR_EXTENDED_PORT_INFO; + in_mad->attr_mod = cpu_to_be32(port); + + err = mlx5_MAD_IFC(dev, 1, 1, port, + NULL, NULL, in_mad, out_mad); + if (err) + goto out; + + /* Checking LinkSpeedActive for FDR-10 */ + if (out_mad->data[15] & 0x1) + props->active_speed = 8; + } + } + +out: + kfree(in_mad); + kfree(out_mad); + + return err; +} + +static int mlx5_ib_query_gid(struct ib_device *ibdev, u8 port, int index, + union ib_gid *gid) +{ + struct ib_smp *in_mad = NULL; + struct ib_smp *out_mad = NULL; + int err = -ENOMEM; + + in_mad = kzalloc(sizeof(*in_mad), GFP_KERNEL); + out_mad = kmalloc(sizeof(*out_mad), GFP_KERNEL); + if (!in_mad || !out_mad) + goto out; + + init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_PORT_INFO; + in_mad->attr_mod = cpu_to_be32(port); + + err = mlx5_MAD_IFC(to_mdev(ibdev), 1, 1, port, NULL, NULL, in_mad, out_mad); + if (err) + goto out; + + memcpy(gid->raw, out_mad->data + 8, 8); + + init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_GUID_INFO; + in_mad->attr_mod = cpu_to_be32(index / 8); + + err = mlx5_MAD_IFC(to_mdev(ibdev), 1, 1, port, NULL, NULL, in_mad, out_mad); + if (err) + goto out; + + memcpy(gid->raw + 8, out_mad->data + (index % 8) * 8, 8); + +out: + kfree(in_mad); + kfree(out_mad); + return err; +} + +static int mlx5_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index, + u16 *pkey) +{ + struct ib_smp *in_mad = NULL; + struct ib_smp *out_mad = NULL; + int err = -ENOMEM; + + in_mad = kzalloc(sizeof(*in_mad), GFP_KERNEL); + out_mad = kmalloc(sizeof(*out_mad), GFP_KERNEL); + if (!in_mad || !out_mad) + goto out; + + init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_PKEY_TABLE; + in_mad->attr_mod = cpu_to_be32(index / 32); + + err = mlx5_MAD_IFC(to_mdev(ibdev), 1, 1, port, NULL, NULL, in_mad, out_mad); + if (err) + goto out; + + *pkey = be16_to_cpu(((__be16 *)out_mad->data)[index % 32]); + +out: + kfree(in_mad); + kfree(out_mad); + return err; +} + +struct mlx5_reg_node_desc { + u8 desc[64]; +}; + +static int mlx5_ib_modify_device(struct ib_device *ibdev, int mask, + struct ib_device_modify *props) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct mlx5_reg_node_desc in; + struct mlx5_reg_node_desc out; + int err; + + if (mask & ~IB_DEVICE_MODIFY_NODE_DESC) + return -EOPNOTSUPP; + + if (!(mask & IB_DEVICE_MODIFY_NODE_DESC)) + return 0; + + /* + * If possible, pass node desc to FW, so it can generate + * a 144 trap. If cmd fails, just ignore. + */ + memcpy(&in, props->node_desc, 64); + err = mlx5_core_access_reg(dev->mdev, &in, sizeof(in), &out, + sizeof(out), MLX5_REG_NODE_DESC, 0, 1); + if (err) + return err; + + memcpy(ibdev->node_desc, props->node_desc, 64); + + return err; +} + +static int mlx5_ib_modify_port(struct ib_device *ibdev, u8 port, int mask, + struct ib_port_modify *props) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct ib_port_attr attr; + u32 tmp; + int err; + + mutex_lock(&dev->cap_mask_mutex); + + err = mlx5_ib_query_port(ibdev, port, &attr); + if (err) + goto out; + + tmp = (attr.port_cap_flags | props->set_port_cap_mask) & + ~props->clr_port_cap_mask; + + err = mlx5_set_port_caps(dev->mdev, port, tmp); + +out: + mutex_unlock(&dev->cap_mask_mutex); + return err; +} + +static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, + struct ib_udata *udata) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct mlx5_ib_alloc_ucontext_req_v2 req; + struct mlx5_ib_alloc_ucontext_resp resp; + struct mlx5_ib_ucontext *context; + struct mlx5_general_caps *gen; + struct mlx5_uuar_info *uuari; + struct mlx5_uar *uars; + int gross_uuars; + int num_uars; + int ver; + int uuarn; + int err; + int i; + size_t reqlen; + + gen = &dev->mdev->caps.gen; + if (!dev->ib_active) + return ERR_PTR(-EAGAIN); + + memset(&req, 0, sizeof(req)); + reqlen = udata->inlen - sizeof(struct ib_uverbs_cmd_hdr); + if (reqlen == sizeof(struct mlx5_ib_alloc_ucontext_req)) + ver = 0; + else if (reqlen == sizeof(struct mlx5_ib_alloc_ucontext_req_v2)) + ver = 2; + else + return ERR_PTR(-EINVAL); + + err = ib_copy_from_udata(&req, udata, reqlen); + if (err) + return ERR_PTR(err); + + if (req.flags || req.reserved) + return ERR_PTR(-EINVAL); + + if (req.total_num_uuars > MLX5_MAX_UUARS) + return ERR_PTR(-ENOMEM); + + if (req.total_num_uuars == 0) + return ERR_PTR(-EINVAL); + + req.total_num_uuars = ALIGN(req.total_num_uuars, + MLX5_NON_FP_BF_REGS_PER_PAGE); + if (req.num_low_latency_uuars > req.total_num_uuars - 1) + return ERR_PTR(-EINVAL); + + num_uars = req.total_num_uuars / MLX5_NON_FP_BF_REGS_PER_PAGE; + gross_uuars = num_uars * MLX5_BF_REGS_PER_PAGE; + resp.qp_tab_size = 1 << gen->log_max_qp; + resp.bf_reg_size = gen->bf_reg_size; + resp.cache_line_size = L1_CACHE_BYTES; + resp.max_sq_desc_sz = gen->max_sq_desc_sz; + resp.max_rq_desc_sz = gen->max_rq_desc_sz; + resp.max_send_wqebb = gen->max_wqes; + resp.max_recv_wr = gen->max_wqes; + resp.max_srq_recv_wr = gen->max_srq_wqes; + + context = kzalloc(sizeof(*context), GFP_KERNEL); + if (!context) + return ERR_PTR(-ENOMEM); + + uuari = &context->uuari; + mutex_init(&uuari->lock); + uars = kcalloc(num_uars, sizeof(*uars), GFP_KERNEL); + if (!uars) { + err = -ENOMEM; + goto out_ctx; + } + + uuari->bitmap = kcalloc(BITS_TO_LONGS(gross_uuars), + sizeof(*uuari->bitmap), + GFP_KERNEL); + if (!uuari->bitmap) { + err = -ENOMEM; + goto out_uar_ctx; + } + /* + * clear all fast path uuars + */ + for (i = 0; i < gross_uuars; i++) { + uuarn = i & 3; + if (uuarn == 2 || uuarn == 3) + set_bit(i, uuari->bitmap); + } + + uuari->count = kcalloc(gross_uuars, sizeof(*uuari->count), GFP_KERNEL); + if (!uuari->count) { + err = -ENOMEM; + goto out_bitmap; + } + + for (i = 0; i < num_uars; i++) { + err = mlx5_cmd_alloc_uar(dev->mdev, &uars[i].index); + if (err) + goto out_count; + } + + INIT_LIST_HEAD(&context->db_page_list); + mutex_init(&context->db_page_mutex); + + resp.tot_uuars = req.total_num_uuars; + resp.num_ports = gen->num_ports; + err = ib_copy_to_udata(udata, &resp, + sizeof(resp) - sizeof(resp.reserved)); + if (err) + goto out_uars; + + uuari->ver = ver; + uuari->num_low_latency_uuars = req.num_low_latency_uuars; + uuari->uars = uars; + uuari->num_uars = num_uars; + return &context->ibucontext; + +out_uars: + for (i--; i >= 0; i--) + mlx5_cmd_free_uar(dev->mdev, uars[i].index); +out_count: + kfree(uuari->count); + +out_bitmap: + kfree(uuari->bitmap); + +out_uar_ctx: + kfree(uars); + +out_ctx: + kfree(context); + return ERR_PTR(err); +} + +static int mlx5_ib_dealloc_ucontext(struct ib_ucontext *ibcontext) +{ + struct mlx5_ib_ucontext *context = to_mucontext(ibcontext); + struct mlx5_ib_dev *dev = to_mdev(ibcontext->device); + struct mlx5_uuar_info *uuari = &context->uuari; + int i; + + for (i = 0; i < uuari->num_uars; i++) { + if (mlx5_cmd_free_uar(dev->mdev, uuari->uars[i].index)) + mlx5_ib_warn(dev, "failed to free UAR 0x%x\n", uuari->uars[i].index); + } + + kfree(uuari->count); + kfree(uuari->bitmap); + kfree(uuari->uars); + kfree(context); + + return 0; +} + +static phys_addr_t uar_index2pfn(struct mlx5_ib_dev *dev, int index) +{ + return (pci_resource_start(dev->mdev->pdev, 0) >> PAGE_SHIFT) + index; +} + +static int get_command(unsigned long offset) +{ + return (offset >> MLX5_IB_MMAP_CMD_SHIFT) & MLX5_IB_MMAP_CMD_MASK; +} + +static int get_arg(unsigned long offset) +{ + return offset & ((1 << MLX5_IB_MMAP_CMD_SHIFT) - 1); +} + +static int get_index(unsigned long offset) +{ + return get_arg(offset); +} + +static int mlx5_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vma) +{ + struct mlx5_ib_ucontext *context = to_mucontext(ibcontext); + struct mlx5_ib_dev *dev = to_mdev(ibcontext->device); + struct mlx5_uuar_info *uuari = &context->uuari; + unsigned long command; + unsigned long idx; + phys_addr_t pfn; + + command = get_command(vma->vm_pgoff); + switch (command) { + case MLX5_IB_MMAP_REGULAR_PAGE: + if (vma->vm_end - vma->vm_start != PAGE_SIZE) + return -EINVAL; + + idx = get_index(vma->vm_pgoff); + if (idx >= uuari->num_uars) + return -EINVAL; + + pfn = uar_index2pfn(dev, uuari->uars[idx].index); + mlx5_ib_dbg(dev, "uar idx 0x%lx, pfn 0x%llx\n", idx, + (unsigned long long)pfn); + + vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); + if (io_remap_pfn_range(vma, vma->vm_start, pfn, + PAGE_SIZE, vma->vm_page_prot)) + return -EAGAIN; + + mlx5_ib_dbg(dev, "mapped WC at 0x%lx, PA 0x%llx\n", + vma->vm_start, + (unsigned long long)pfn << PAGE_SHIFT); + break; + + case MLX5_IB_MMAP_GET_CONTIGUOUS_PAGES: + return -ENOSYS; + + default: + return -EINVAL; + } + + return 0; +} + +static int alloc_pa_mkey(struct mlx5_ib_dev *dev, u32 *key, u32 pdn) +{ + struct mlx5_create_mkey_mbox_in *in; + struct mlx5_mkey_seg *seg; + struct mlx5_core_mr mr; + int err; + + in = kzalloc(sizeof(*in), GFP_KERNEL); + if (!in) + return -ENOMEM; + + seg = &in->seg; + seg->flags = MLX5_PERM_LOCAL_READ | MLX5_ACCESS_MODE_PA; + seg->flags_pd = cpu_to_be32(pdn | MLX5_MKEY_LEN64); + seg->qpn_mkey7_0 = cpu_to_be32(0xffffff << 8); + seg->start_addr = 0; + + err = mlx5_core_create_mkey(dev->mdev, &mr, in, sizeof(*in), + NULL, NULL, NULL); + if (err) { + mlx5_ib_warn(dev, "failed to create mkey, %d\n", err); + goto err_in; + } + + kfree(in); + *key = mr.key; + + return 0; + +err_in: + kfree(in); + + return err; +} + +static void free_pa_mkey(struct mlx5_ib_dev *dev, u32 key) +{ + struct mlx5_core_mr mr; + int err; + + memset(&mr, 0, sizeof(mr)); + mr.key = key; + err = mlx5_core_destroy_mkey(dev->mdev, &mr); + if (err) + mlx5_ib_warn(dev, "failed to destroy mkey 0x%x\n", key); +} + +static struct ib_pd *mlx5_ib_alloc_pd(struct ib_device *ibdev, + struct ib_ucontext *context, + struct ib_udata *udata) +{ + struct mlx5_ib_alloc_pd_resp resp; + struct mlx5_ib_pd *pd; + int err; + + pd = kmalloc(sizeof(*pd), GFP_KERNEL); + if (!pd) + return ERR_PTR(-ENOMEM); + + err = mlx5_core_alloc_pd(to_mdev(ibdev)->mdev, &pd->pdn); + if (err) { + kfree(pd); + return ERR_PTR(err); + } + + if (context) { + resp.pdn = pd->pdn; + if (ib_copy_to_udata(udata, &resp, sizeof(resp))) { + mlx5_core_dealloc_pd(to_mdev(ibdev)->mdev, pd->pdn); + kfree(pd); + return ERR_PTR(-EFAULT); + } + } else { + err = alloc_pa_mkey(to_mdev(ibdev), &pd->pa_lkey, pd->pdn); + if (err) { + mlx5_core_dealloc_pd(to_mdev(ibdev)->mdev, pd->pdn); + kfree(pd); + return ERR_PTR(err); + } + } + + return &pd->ibpd; +} + +static int mlx5_ib_dealloc_pd(struct ib_pd *pd) +{ + struct mlx5_ib_dev *mdev = to_mdev(pd->device); + struct mlx5_ib_pd *mpd = to_mpd(pd); + + if (!pd->uobject) + free_pa_mkey(mdev, mpd->pa_lkey); + + mlx5_core_dealloc_pd(mdev->mdev, mpd->pdn); + kfree(mpd); + + return 0; +} + +static int mlx5_ib_mcg_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) +{ + struct mlx5_ib_dev *dev = to_mdev(ibqp->device); + int err; + + err = mlx5_core_attach_mcg(dev->mdev, gid, ibqp->qp_num); + if (err) + mlx5_ib_warn(dev, "failed attaching QPN 0x%x, MGID %pI6\n", + ibqp->qp_num, gid->raw); + + return err; +} + +static int mlx5_ib_mcg_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) +{ + struct mlx5_ib_dev *dev = to_mdev(ibqp->device); + int err; + + err = mlx5_core_detach_mcg(dev->mdev, gid, ibqp->qp_num); + if (err) + mlx5_ib_warn(dev, "failed detaching QPN 0x%x, MGID %pI6\n", + ibqp->qp_num, gid->raw); + + return err; +} + +static int init_node_data(struct mlx5_ib_dev *dev) +{ + struct ib_smp *in_mad = NULL; + struct ib_smp *out_mad = NULL; + int err = -ENOMEM; + + in_mad = kzalloc(sizeof(*in_mad), GFP_KERNEL); + out_mad = kmalloc(sizeof(*out_mad), GFP_KERNEL); + if (!in_mad || !out_mad) + goto out; + + init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_NODE_DESC; + + err = mlx5_MAD_IFC(dev, 1, 1, 1, NULL, NULL, in_mad, out_mad); + if (err) + goto out; + + memcpy(dev->ib_dev.node_desc, out_mad->data, 64); + + in_mad->attr_id = IB_SMP_ATTR_NODE_INFO; + + err = mlx5_MAD_IFC(dev, 1, 1, 1, NULL, NULL, in_mad, out_mad); + if (err) + goto out; + + dev->mdev->rev_id = be32_to_cpup((__be32 *)(out_mad->data + 32)); + memcpy(&dev->ib_dev.node_guid, out_mad->data + 12, 8); + +out: + kfree(in_mad); + kfree(out_mad); + return err; +} + +static ssize_t show_fw_pages(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct mlx5_ib_dev *dev = + container_of(device, struct mlx5_ib_dev, ib_dev.dev); + + return sprintf(buf, "%d\n", dev->mdev->priv.fw_pages); +} + +static ssize_t show_reg_pages(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct mlx5_ib_dev *dev = + container_of(device, struct mlx5_ib_dev, ib_dev.dev); + + return sprintf(buf, "%d\n", dev->mdev->priv.reg_pages); +} + +static ssize_t show_hca(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct mlx5_ib_dev *dev = + container_of(device, struct mlx5_ib_dev, ib_dev.dev); + return sprintf(buf, "MT%d\n", dev->mdev->pdev->device); +} + +static ssize_t show_fw_ver(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct mlx5_ib_dev *dev = + container_of(device, struct mlx5_ib_dev, ib_dev.dev); + return sprintf(buf, "%d.%d.%d\n", fw_rev_maj(dev->mdev), + fw_rev_min(dev->mdev), fw_rev_sub(dev->mdev)); +} + +static ssize_t show_rev(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct mlx5_ib_dev *dev = + container_of(device, struct mlx5_ib_dev, ib_dev.dev); + return sprintf(buf, "%x\n", dev->mdev->rev_id); +} + +static ssize_t show_board(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct mlx5_ib_dev *dev = + container_of(device, struct mlx5_ib_dev, ib_dev.dev); + return sprintf(buf, "%.*s\n", MLX5_BOARD_ID_LEN, + dev->mdev->board_id); +} + +static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL); +static DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL); +static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL); +static DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL); +static DEVICE_ATTR(fw_pages, S_IRUGO, show_fw_pages, NULL); +static DEVICE_ATTR(reg_pages, S_IRUGO, show_reg_pages, NULL); + +static struct device_attribute *mlx5_class_attributes[] = { + &dev_attr_hw_rev, + &dev_attr_fw_ver, + &dev_attr_hca_type, + &dev_attr_board_id, + &dev_attr_fw_pages, + &dev_attr_reg_pages, +}; + +static void mlx5_ib_event(struct mlx5_core_dev *dev, void *context, + enum mlx5_dev_event event, unsigned long param) +{ + struct mlx5_ib_dev *ibdev = (struct mlx5_ib_dev *)context; + struct ib_event ibev; + + u8 port = 0; + + switch (event) { + case MLX5_DEV_EVENT_SYS_ERROR: + ibdev->ib_active = false; + ibev.event = IB_EVENT_DEVICE_FATAL; + break; + + case MLX5_DEV_EVENT_PORT_UP: + ibev.event = IB_EVENT_PORT_ACTIVE; + port = (u8)param; + break; + + case MLX5_DEV_EVENT_PORT_DOWN: + ibev.event = IB_EVENT_PORT_ERR; + port = (u8)param; + break; + + case MLX5_DEV_EVENT_PORT_INITIALIZED: + /* not used by ULPs */ + return; + + case MLX5_DEV_EVENT_LID_CHANGE: + ibev.event = IB_EVENT_LID_CHANGE; + port = (u8)param; + break; + + case MLX5_DEV_EVENT_PKEY_CHANGE: + ibev.event = IB_EVENT_PKEY_CHANGE; + port = (u8)param; + break; + + case MLX5_DEV_EVENT_GUID_CHANGE: + ibev.event = IB_EVENT_GID_CHANGE; + port = (u8)param; + break; + + case MLX5_DEV_EVENT_CLIENT_REREG: + ibev.event = IB_EVENT_CLIENT_REREGISTER; + port = (u8)param; + break; + } + + ibev.device = &ibdev->ib_dev; + ibev.element.port_num = port; + + if (port < 1 || port > ibdev->num_ports) { + mlx5_ib_warn(ibdev, "warning: event on port %d\n", port); + return; + } + + if (ibdev->ib_active) + ib_dispatch_event(&ibev); +} + +static void get_ext_port_caps(struct mlx5_ib_dev *dev) +{ + struct mlx5_general_caps *gen; + int port; + + gen = &dev->mdev->caps.gen; + for (port = 1; port <= gen->num_ports; port++) + mlx5_query_ext_port_caps(dev, port); +} + +static int get_port_caps(struct mlx5_ib_dev *dev) +{ + struct ib_device_attr *dprops = NULL; + struct ib_port_attr *pprops = NULL; + struct mlx5_general_caps *gen; + int err = 0; + int port; + + gen = &dev->mdev->caps.gen; + pprops = kmalloc(sizeof(*pprops), GFP_KERNEL); + if (!pprops) + goto out; + + dprops = kmalloc(sizeof(*dprops), GFP_KERNEL); + if (!dprops) + goto out; + + err = mlx5_ib_query_device(&dev->ib_dev, dprops); + if (err) { + mlx5_ib_warn(dev, "query_device failed %d\n", err); + goto out; + } + + for (port = 1; port <= gen->num_ports; port++) { + err = mlx5_ib_query_port(&dev->ib_dev, port, pprops); + if (err) { + mlx5_ib_warn(dev, "query_port %d failed %d\n", port, err); + break; + } + gen->port[port - 1].pkey_table_len = dprops->max_pkeys; + gen->port[port - 1].gid_table_len = pprops->gid_tbl_len; + mlx5_ib_dbg(dev, "pkey_table_len %d, gid_table_len %d\n", + dprops->max_pkeys, pprops->gid_tbl_len); + } + +out: + kfree(pprops); + kfree(dprops); + + return err; +} + +static void destroy_umrc_res(struct mlx5_ib_dev *dev) +{ + int err; + + err = mlx5_mr_cache_cleanup(dev); + if (err) + mlx5_ib_warn(dev, "mr cache cleanup failed\n"); + + mlx5_ib_destroy_qp(dev->umrc.qp); + ib_destroy_cq(dev->umrc.cq); + ib_dereg_mr(dev->umrc.mr); + ib_dealloc_pd(dev->umrc.pd); +} + +enum { + MAX_UMR_WR = 128, +}; + +static int create_umr_res(struct mlx5_ib_dev *dev) +{ + struct ib_qp_init_attr *init_attr = NULL; + struct ib_qp_attr *attr = NULL; + struct ib_pd *pd; + struct ib_cq *cq; + struct ib_qp *qp; + struct ib_mr *mr; + int ret; + + attr = kzalloc(sizeof(*attr), GFP_KERNEL); + init_attr = kzalloc(sizeof(*init_attr), GFP_KERNEL); + if (!attr || !init_attr) { + ret = -ENOMEM; + goto error_0; + } + + pd = ib_alloc_pd(&dev->ib_dev); + if (IS_ERR(pd)) { + mlx5_ib_dbg(dev, "Couldn't create PD for sync UMR QP\n"); + ret = PTR_ERR(pd); + goto error_0; + } + + mr = ib_get_dma_mr(pd, IB_ACCESS_LOCAL_WRITE); + if (IS_ERR(mr)) { + mlx5_ib_dbg(dev, "Couldn't create DMA MR for sync UMR QP\n"); + ret = PTR_ERR(mr); + goto error_1; + } + + cq = ib_create_cq(&dev->ib_dev, mlx5_umr_cq_handler, NULL, NULL, 128, + 0); + if (IS_ERR(cq)) { + mlx5_ib_dbg(dev, "Couldn't create CQ for sync UMR QP\n"); + ret = PTR_ERR(cq); + goto error_2; + } + ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); + + init_attr->send_cq = cq; + init_attr->recv_cq = cq; + init_attr->sq_sig_type = IB_SIGNAL_ALL_WR; + init_attr->cap.max_send_wr = MAX_UMR_WR; + init_attr->cap.max_send_sge = 1; + init_attr->qp_type = MLX5_IB_QPT_REG_UMR; + init_attr->port_num = 1; + qp = mlx5_ib_create_qp(pd, init_attr, NULL); + if (IS_ERR(qp)) { + mlx5_ib_dbg(dev, "Couldn't create sync UMR QP\n"); + ret = PTR_ERR(qp); + goto error_3; + } + qp->device = &dev->ib_dev; + qp->real_qp = qp; + qp->uobject = NULL; + qp->qp_type = MLX5_IB_QPT_REG_UMR; + + attr->qp_state = IB_QPS_INIT; + attr->port_num = 1; + ret = mlx5_ib_modify_qp(qp, attr, IB_QP_STATE | IB_QP_PKEY_INDEX | + IB_QP_PORT, NULL); + if (ret) { + mlx5_ib_dbg(dev, "Couldn't modify UMR QP\n"); + goto error_4; + } + + memset(attr, 0, sizeof(*attr)); + attr->qp_state = IB_QPS_RTR; + attr->path_mtu = IB_MTU_256; + + ret = mlx5_ib_modify_qp(qp, attr, IB_QP_STATE, NULL); + if (ret) { + mlx5_ib_dbg(dev, "Couldn't modify umr QP to rtr\n"); + goto error_4; + } + + memset(attr, 0, sizeof(*attr)); + attr->qp_state = IB_QPS_RTS; + ret = mlx5_ib_modify_qp(qp, attr, IB_QP_STATE, NULL); + if (ret) { + mlx5_ib_dbg(dev, "Couldn't modify umr QP to rts\n"); + goto error_4; + } + + dev->umrc.qp = qp; + dev->umrc.cq = cq; + dev->umrc.mr = mr; + dev->umrc.pd = pd; + + sema_init(&dev->umrc.sem, MAX_UMR_WR); + ret = mlx5_mr_cache_init(dev); + if (ret) { + mlx5_ib_warn(dev, "mr cache init failed %d\n", ret); + goto error_4; + } + + kfree(attr); + kfree(init_attr); + + return 0; + +error_4: + mlx5_ib_destroy_qp(qp); + +error_3: + ib_destroy_cq(cq); + +error_2: + ib_dereg_mr(mr); + +error_1: + ib_dealloc_pd(pd); + +error_0: + kfree(attr); + kfree(init_attr); + return ret; +} + +static int create_dev_resources(struct mlx5_ib_resources *devr) +{ + struct ib_srq_init_attr attr; + struct mlx5_ib_dev *dev; + int ret = 0; + + dev = container_of(devr, struct mlx5_ib_dev, devr); + + devr->p0 = mlx5_ib_alloc_pd(&dev->ib_dev, NULL, NULL); + if (IS_ERR(devr->p0)) { + ret = PTR_ERR(devr->p0); + goto error0; + } + devr->p0->device = &dev->ib_dev; + devr->p0->uobject = NULL; + atomic_set(&devr->p0->usecnt, 0); + + devr->c0 = mlx5_ib_create_cq(&dev->ib_dev, 1, 0, NULL, NULL); + if (IS_ERR(devr->c0)) { + ret = PTR_ERR(devr->c0); + goto error1; + } + devr->c0->device = &dev->ib_dev; + devr->c0->uobject = NULL; + devr->c0->comp_handler = NULL; + devr->c0->event_handler = NULL; + devr->c0->cq_context = NULL; + atomic_set(&devr->c0->usecnt, 0); + + devr->x0 = mlx5_ib_alloc_xrcd(&dev->ib_dev, NULL, NULL); + if (IS_ERR(devr->x0)) { + ret = PTR_ERR(devr->x0); + goto error2; + } + devr->x0->device = &dev->ib_dev; + devr->x0->inode = NULL; + atomic_set(&devr->x0->usecnt, 0); + mutex_init(&devr->x0->tgt_qp_mutex); + INIT_LIST_HEAD(&devr->x0->tgt_qp_list); + + devr->x1 = mlx5_ib_alloc_xrcd(&dev->ib_dev, NULL, NULL); + if (IS_ERR(devr->x1)) { + ret = PTR_ERR(devr->x1); + goto error3; + } + devr->x1->device = &dev->ib_dev; + devr->x1->inode = NULL; + atomic_set(&devr->x1->usecnt, 0); + mutex_init(&devr->x1->tgt_qp_mutex); + INIT_LIST_HEAD(&devr->x1->tgt_qp_list); + + memset(&attr, 0, sizeof(attr)); + attr.attr.max_sge = 1; + attr.attr.max_wr = 1; + attr.srq_type = IB_SRQT_XRC; + attr.ext.xrc.cq = devr->c0; + attr.ext.xrc.xrcd = devr->x0; + + devr->s0 = mlx5_ib_create_srq(devr->p0, &attr, NULL); + if (IS_ERR(devr->s0)) { + ret = PTR_ERR(devr->s0); + goto error4; + } + devr->s0->device = &dev->ib_dev; + devr->s0->pd = devr->p0; + devr->s0->uobject = NULL; + devr->s0->event_handler = NULL; + devr->s0->srq_context = NULL; + devr->s0->srq_type = IB_SRQT_XRC; + devr->s0->ext.xrc.xrcd = devr->x0; + devr->s0->ext.xrc.cq = devr->c0; + atomic_inc(&devr->s0->ext.xrc.xrcd->usecnt); + atomic_inc(&devr->s0->ext.xrc.cq->usecnt); + atomic_inc(&devr->p0->usecnt); + atomic_set(&devr->s0->usecnt, 0); + + return 0; + +error4: + mlx5_ib_dealloc_xrcd(devr->x1); +error3: + mlx5_ib_dealloc_xrcd(devr->x0); +error2: + mlx5_ib_destroy_cq(devr->c0); +error1: + mlx5_ib_dealloc_pd(devr->p0); +error0: + return ret; +} + +static void destroy_dev_resources(struct mlx5_ib_resources *devr) +{ + mlx5_ib_destroy_srq(devr->s0); + mlx5_ib_dealloc_xrcd(devr->x0); + mlx5_ib_dealloc_xrcd(devr->x1); + mlx5_ib_destroy_cq(devr->c0); + mlx5_ib_dealloc_pd(devr->p0); +} + +static void *mlx5_ib_add(struct mlx5_core_dev *mdev) +{ + struct mlx5_ib_dev *dev; + int err; + int i; + + printk_once(KERN_INFO "%s", mlx5_version); + + dev = (struct mlx5_ib_dev *)ib_alloc_device(sizeof(*dev)); + if (!dev) + return NULL; + + dev->mdev = mdev; + + err = get_port_caps(dev); + if (err) + goto err_dealloc; + + get_ext_port_caps(dev); + + err = alloc_comp_eqs(dev); + if (err) + goto err_dealloc; + + MLX5_INIT_DOORBELL_LOCK(&dev->uar_lock); + + strlcpy(dev->ib_dev.name, "mlx5_%d", IB_DEVICE_NAME_MAX); + dev->ib_dev.owner = THIS_MODULE; + dev->ib_dev.node_type = RDMA_NODE_IB_CA; + dev->ib_dev.local_dma_lkey = mdev->caps.gen.reserved_lkey; + dev->num_ports = mdev->caps.gen.num_ports; + dev->ib_dev.phys_port_cnt = dev->num_ports; + dev->ib_dev.num_comp_vectors = dev->num_comp_vectors; + dev->ib_dev.dma_device = &mdev->pdev->dev; + + dev->ib_dev.uverbs_abi_ver = MLX5_IB_UVERBS_ABI_VERSION; + dev->ib_dev.uverbs_cmd_mask = + (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | + (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) | + (1ull << IB_USER_VERBS_CMD_QUERY_PORT) | + (1ull << IB_USER_VERBS_CMD_ALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_REG_MR) | + (1ull << IB_USER_VERBS_CMD_DEREG_MR) | + (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | + (1ull << IB_USER_VERBS_CMD_CREATE_CQ) | + (1ull << IB_USER_VERBS_CMD_RESIZE_CQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) | + (1ull << IB_USER_VERBS_CMD_CREATE_QP) | + (1ull << IB_USER_VERBS_CMD_MODIFY_QP) | + (1ull << IB_USER_VERBS_CMD_QUERY_QP) | + (1ull << IB_USER_VERBS_CMD_DESTROY_QP) | + (1ull << IB_USER_VERBS_CMD_ATTACH_MCAST) | + (1ull << IB_USER_VERBS_CMD_DETACH_MCAST) | + (1ull << IB_USER_VERBS_CMD_CREATE_SRQ) | + (1ull << IB_USER_VERBS_CMD_MODIFY_SRQ) | + (1ull << IB_USER_VERBS_CMD_QUERY_SRQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ) | + (1ull << IB_USER_VERBS_CMD_CREATE_XSRQ) | + (1ull << IB_USER_VERBS_CMD_OPEN_QP); + + dev->ib_dev.query_device = mlx5_ib_query_device; + dev->ib_dev.query_port = mlx5_ib_query_port; + dev->ib_dev.query_gid = mlx5_ib_query_gid; + dev->ib_dev.query_pkey = mlx5_ib_query_pkey; + dev->ib_dev.modify_device = mlx5_ib_modify_device; + dev->ib_dev.modify_port = mlx5_ib_modify_port; + dev->ib_dev.alloc_ucontext = mlx5_ib_alloc_ucontext; + dev->ib_dev.dealloc_ucontext = mlx5_ib_dealloc_ucontext; + dev->ib_dev.mmap = mlx5_ib_mmap; + dev->ib_dev.alloc_pd = mlx5_ib_alloc_pd; + dev->ib_dev.dealloc_pd = mlx5_ib_dealloc_pd; + dev->ib_dev.create_ah = mlx5_ib_create_ah; + dev->ib_dev.query_ah = mlx5_ib_query_ah; + dev->ib_dev.destroy_ah = mlx5_ib_destroy_ah; + dev->ib_dev.create_srq = mlx5_ib_create_srq; + dev->ib_dev.modify_srq = mlx5_ib_modify_srq; + dev->ib_dev.query_srq = mlx5_ib_query_srq; + dev->ib_dev.destroy_srq = mlx5_ib_destroy_srq; + dev->ib_dev.post_srq_recv = mlx5_ib_post_srq_recv; + dev->ib_dev.create_qp = mlx5_ib_create_qp; + dev->ib_dev.modify_qp = mlx5_ib_modify_qp; + dev->ib_dev.query_qp = mlx5_ib_query_qp; + dev->ib_dev.destroy_qp = mlx5_ib_destroy_qp; + dev->ib_dev.post_send = mlx5_ib_post_send; + dev->ib_dev.post_recv = mlx5_ib_post_recv; + dev->ib_dev.create_cq = mlx5_ib_create_cq; + dev->ib_dev.modify_cq = mlx5_ib_modify_cq; + dev->ib_dev.resize_cq = mlx5_ib_resize_cq; + dev->ib_dev.destroy_cq = mlx5_ib_destroy_cq; + dev->ib_dev.poll_cq = mlx5_ib_poll_cq; + dev->ib_dev.req_notify_cq = mlx5_ib_arm_cq; + dev->ib_dev.get_dma_mr = mlx5_ib_get_dma_mr; + dev->ib_dev.reg_user_mr = mlx5_ib_reg_user_mr; + dev->ib_dev.dereg_mr = mlx5_ib_dereg_mr; + dev->ib_dev.destroy_mr = mlx5_ib_destroy_mr; + dev->ib_dev.attach_mcast = mlx5_ib_mcg_attach; + dev->ib_dev.detach_mcast = mlx5_ib_mcg_detach; + dev->ib_dev.process_mad = mlx5_ib_process_mad; + dev->ib_dev.create_mr = mlx5_ib_create_mr; + dev->ib_dev.alloc_fast_reg_mr = mlx5_ib_alloc_fast_reg_mr; + dev->ib_dev.alloc_fast_reg_page_list = mlx5_ib_alloc_fast_reg_page_list; + dev->ib_dev.free_fast_reg_page_list = mlx5_ib_free_fast_reg_page_list; + dev->ib_dev.check_mr_status = mlx5_ib_check_mr_status; + + if (mdev->caps.gen.flags & MLX5_DEV_CAP_FLAG_XRC) { + dev->ib_dev.alloc_xrcd = mlx5_ib_alloc_xrcd; + dev->ib_dev.dealloc_xrcd = mlx5_ib_dealloc_xrcd; + dev->ib_dev.uverbs_cmd_mask |= + (1ull << IB_USER_VERBS_CMD_OPEN_XRCD) | + (1ull << IB_USER_VERBS_CMD_CLOSE_XRCD); + } + + err = init_node_data(dev); + if (err) + goto err_eqs; + + mutex_init(&dev->cap_mask_mutex); + spin_lock_init(&dev->mr_lock); + + err = create_dev_resources(&dev->devr); + if (err) + goto err_eqs; + + err = ib_register_device(&dev->ib_dev, NULL); + if (err) + goto err_rsrc; + + err = create_umr_res(dev); + if (err) + goto err_dev; + + for (i = 0; i < ARRAY_SIZE(mlx5_class_attributes); i++) { + err = device_create_file(&dev->ib_dev.dev, + mlx5_class_attributes[i]); + if (err) + goto err_umrc; + } + + dev->ib_active = true; + + return dev; + +err_umrc: + destroy_umrc_res(dev); + +err_dev: + ib_unregister_device(&dev->ib_dev); + +err_rsrc: + destroy_dev_resources(&dev->devr); + +err_eqs: + free_comp_eqs(dev); + +err_dealloc: + ib_dealloc_device((struct ib_device *)dev); + + return NULL; +} + +static void mlx5_ib_remove(struct mlx5_core_dev *mdev, void *context) +{ + struct mlx5_ib_dev *dev = context; + ib_unregister_device(&dev->ib_dev); + destroy_umrc_res(dev); + destroy_dev_resources(&dev->devr); + free_comp_eqs(dev); + ib_dealloc_device(&dev->ib_dev); +} + +static struct mlx5_interface mlx5_ib_interface = { + .add = mlx5_ib_add, + .remove = mlx5_ib_remove, + .event = mlx5_ib_event, +}; + +static int __init mlx5_ib_init(void) +{ + if (deprecated_prof_sel != 2) + pr_warn("prof_sel is deprecated for mlx5_ib, set it for mlx5_core\n"); + + return mlx5_register_interface(&mlx5_ib_interface); +} + +static void __exit mlx5_ib_cleanup(void) +{ + mlx5_unregister_interface(&mlx5_ib_interface); +} + +module_init(mlx5_ib_init); +module_exit(mlx5_ib_cleanup); diff --git a/drivers/infiniband/hw/mlx5/mem.c b/drivers/infiniband/hw/mlx5/mem.c new file mode 100644 index 0000000..dae07ea --- /dev/null +++ b/drivers/infiniband/hw/mlx5/mem.c @@ -0,0 +1,164 @@ +/* + * Copyright (c) 2013, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include "mlx5_ib.h" + +/* @umem: umem object to scan + * @addr: ib virtual address requested by the user + * @count: number of PAGE_SIZE pages covered by umem + * @shift: page shift for the compound pages found in the region + * @ncont: number of compund pages + * @order: log2 of the number of compound pages + */ +void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr, int *count, int *shift, + int *ncont, int *order) +{ + unsigned long tmp; + unsigned long m; + int i, k; + u64 base = 0; + int p = 0; + int skip; + int mask; + u64 len; + u64 pfn; + struct scatterlist *sg; + int entry; + unsigned long page_shift = ilog2(umem->page_size); + + addr = addr >> page_shift; + tmp = (unsigned long)addr; + m = find_first_bit(&tmp, sizeof(tmp)); + skip = 1 << m; + mask = skip - 1; + i = 0; + for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) { + len = sg_dma_len(sg) >> page_shift; + pfn = sg_dma_address(sg) >> page_shift; + for (k = 0; k < len; k++) { + if (!(i & mask)) { + tmp = (unsigned long)pfn; + m = min(m, find_first_bit(&tmp, sizeof(tmp))); + skip = 1 << m; + mask = skip - 1; + base = pfn; + p = 0; + } else { + if (base + p != pfn) { + tmp = (unsigned long)p; + m = find_first_bit(&tmp, sizeof(tmp)); + skip = 1 << m; + mask = skip - 1; + base = pfn; + p = 0; + } + } + p++; + i++; + } + } + + if (i) { + m = min_t(unsigned long, ilog2(roundup_pow_of_two(i)), m); + + if (order) + *order = ilog2(roundup_pow_of_two(i) >> m); + + *ncont = DIV_ROUND_UP(i, (1 << m)); + } else { + m = 0; + + if (order) + *order = 0; + + *ncont = 0; + } + *shift = page_shift + m; + *count = i; +} + +void mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem, + int page_shift, __be64 *pas, int umr) +{ + unsigned long umem_page_shift = ilog2(umem->page_size); + int shift = page_shift - umem_page_shift; + int mask = (1 << shift) - 1; + int i, k; + u64 cur = 0; + u64 base; + int len; + struct scatterlist *sg; + int entry; + + i = 0; + for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) { + len = sg_dma_len(sg) >> umem_page_shift; + base = sg_dma_address(sg); + for (k = 0; k < len; k++) { + if (!(i & mask)) { + cur = base + (k << umem_page_shift); + if (umr) + cur |= 3; + + pas[i >> shift] = cpu_to_be64(cur); + mlx5_ib_dbg(dev, "pas[%d] 0x%llx\n", + i >> shift, be64_to_cpu(pas[i >> shift])); + } else + mlx5_ib_dbg(dev, "=====> 0x%llx\n", + base + (k << umem_page_shift)); + i++; + } + } +} + +int mlx5_ib_get_buf_offset(u64 addr, int page_shift, u32 *offset) +{ + u64 page_size; + u64 page_mask; + u64 off_size; + u64 off_mask; + u64 buf_off; + + page_size = (u64)1 << page_shift; + page_mask = page_size - 1; + buf_off = addr & page_mask; + off_size = page_size >> 6; + off_mask = off_size - 1; + + if (buf_off & off_mask) + return -EINVAL; + + *offset = buf_off >> ilog2(off_size); + return 0; +} diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h new file mode 100644 index 0000000..386780f --- /dev/null +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -0,0 +1,564 @@ +/* + * Copyright (c) 2013, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef MLX5_IB_H +#define MLX5_IB_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define mlx5_ib_dbg(dev, format, arg...) \ +pr_debug("%s:%s:%d:(pid %d): " format, (dev)->ib_dev.name, __func__, \ + __LINE__, current->pid, ##arg) + +#define mlx5_ib_err(dev, format, arg...) \ +pr_err("%s:%s:%d:(pid %d): " format, (dev)->ib_dev.name, __func__, \ + __LINE__, current->pid, ##arg) + +#define mlx5_ib_warn(dev, format, arg...) \ +pr_warn("%s:%s:%d:(pid %d): " format, (dev)->ib_dev.name, __func__, \ + __LINE__, current->pid, ##arg) + +enum { + MLX5_IB_MMAP_CMD_SHIFT = 8, + MLX5_IB_MMAP_CMD_MASK = 0xff, +}; + +enum mlx5_ib_mmap_cmd { + MLX5_IB_MMAP_REGULAR_PAGE = 0, + MLX5_IB_MMAP_GET_CONTIGUOUS_PAGES = 1, /* always last */ +}; + +enum { + MLX5_RES_SCAT_DATA32_CQE = 0x1, + MLX5_RES_SCAT_DATA64_CQE = 0x2, + MLX5_REQ_SCAT_DATA32_CQE = 0x11, + MLX5_REQ_SCAT_DATA64_CQE = 0x22, +}; + +enum mlx5_ib_latency_class { + MLX5_IB_LATENCY_CLASS_LOW, + MLX5_IB_LATENCY_CLASS_MEDIUM, + MLX5_IB_LATENCY_CLASS_HIGH, + MLX5_IB_LATENCY_CLASS_FAST_PATH +}; + +enum mlx5_ib_mad_ifc_flags { + MLX5_MAD_IFC_IGNORE_MKEY = 1, + MLX5_MAD_IFC_IGNORE_BKEY = 2, + MLX5_MAD_IFC_NET_VIEW = 4, +}; + +struct mlx5_ib_ucontext { + struct ib_ucontext ibucontext; + struct list_head db_page_list; + + /* protect doorbell record alloc/free + */ + struct mutex db_page_mutex; + struct mlx5_uuar_info uuari; +}; + +static inline struct mlx5_ib_ucontext *to_mucontext(struct ib_ucontext *ibucontext) +{ + return container_of(ibucontext, struct mlx5_ib_ucontext, ibucontext); +} + +struct mlx5_ib_pd { + struct ib_pd ibpd; + u32 pdn; + u32 pa_lkey; +}; + +/* Use macros here so that don't have to duplicate + * enum ib_send_flags and enum ib_qp_type for low-level driver + */ + +#define MLX5_IB_SEND_UMR_UNREG IB_SEND_RESERVED_START +#define MLX5_IB_QPT_REG_UMR IB_QPT_RESERVED1 +#define MLX5_IB_WR_UMR IB_WR_RESERVED1 + +struct wr_list { + u16 opcode; + u16 next; +}; + +struct mlx5_ib_wq { + u64 *wrid; + u32 *wr_data; + struct wr_list *w_list; + unsigned *wqe_head; + u16 unsig_count; + + /* serialize post to the work queue + */ + spinlock_t lock; + int wqe_cnt; + int max_post; + int max_gs; + int offset; + int wqe_shift; + unsigned head; + unsigned tail; + u16 cur_post; + u16 last_poll; + void *qend; +}; + +enum { + MLX5_QP_USER, + MLX5_QP_KERNEL, + MLX5_QP_EMPTY +}; + +struct mlx5_ib_qp { + struct ib_qp ibqp; + struct mlx5_core_qp mqp; + struct mlx5_buf buf; + + struct mlx5_db db; + struct mlx5_ib_wq rq; + + u32 doorbell_qpn; + u8 sq_signal_bits; + u8 fm_cache; + int sq_max_wqes_per_wr; + int sq_spare_wqes; + struct mlx5_ib_wq sq; + + struct ib_umem *umem; + int buf_size; + + /* serialize qp state modifications + */ + struct mutex mutex; + u16 xrcdn; + u32 flags; + u8 port; + u8 alt_port; + u8 atomic_rd_en; + u8 resp_depth; + u8 state; + int mlx_type; + int wq_sig; + int scat_cqe; + int max_inline_data; + struct mlx5_bf *bf; + int has_rq; + + /* only for user space QPs. For kernel + * we have it from the bf object + */ + int uuarn; + + int create_type; + u32 pa_lkey; + + /* Store signature errors */ + bool signature_en; +}; + +struct mlx5_ib_cq_buf { + struct mlx5_buf buf; + struct ib_umem *umem; + int cqe_size; + int nent; +}; + +enum mlx5_ib_qp_flags { + MLX5_IB_QP_BLOCK_MULTICAST_LOOPBACK = 1 << 0, + MLX5_IB_QP_SIGNATURE_HANDLING = 1 << 1, +}; + +struct mlx5_shared_mr_info { + int mr_id; + struct ib_umem *umem; +}; + +struct mlx5_ib_cq { + struct ib_cq ibcq; + struct mlx5_core_cq mcq; + struct mlx5_ib_cq_buf buf; + struct mlx5_db db; + + /* serialize access to the CQ + */ + spinlock_t lock; + + /* protect resize cq + */ + struct mutex resize_mutex; + struct mlx5_ib_cq_buf *resize_buf; + struct ib_umem *resize_umem; + int cqe_size; +}; + +struct mlx5_ib_srq { + struct ib_srq ibsrq; + struct mlx5_core_srq msrq; + struct mlx5_buf buf; + struct mlx5_db db; + u64 *wrid; + /* protect SRQ hanlding + */ + spinlock_t lock; + int head; + int tail; + u16 wqe_ctr; + struct ib_umem *umem; + /* serialize arming a SRQ + */ + struct mutex mutex; + int wq_sig; +}; + +struct mlx5_ib_xrcd { + struct ib_xrcd ibxrcd; + u32 xrcdn; +}; + +struct mlx5_ib_mr { + struct ib_mr ibmr; + struct mlx5_core_mr mmr; + struct ib_umem *umem; + struct mlx5_shared_mr_info *smr_info; + struct list_head list; + int order; + int umred; + __be64 *pas; + dma_addr_t dma; + int npages; + struct mlx5_ib_dev *dev; + struct mlx5_create_mkey_mbox_out out; + struct mlx5_core_sig_ctx *sig; +}; + +struct mlx5_ib_fast_reg_page_list { + struct ib_fast_reg_page_list ibfrpl; + __be64 *mapped_page_list; + dma_addr_t map; +}; + +struct mlx5_ib_umr_context { + enum ib_wc_status status; + struct completion done; +}; + +static inline void mlx5_ib_init_umr_context(struct mlx5_ib_umr_context *context) +{ + context->status = -1; + init_completion(&context->done); +} + +struct umr_common { + struct ib_pd *pd; + struct ib_cq *cq; + struct ib_qp *qp; + struct ib_mr *mr; + /* control access to UMR QP + */ + struct semaphore sem; +}; + +enum { + MLX5_FMR_INVALID, + MLX5_FMR_VALID, + MLX5_FMR_BUSY, +}; + +struct mlx5_ib_fmr { + struct ib_fmr ibfmr; + struct mlx5_core_mr mr; + int access_flags; + int state; + /* protect fmr state + */ + spinlock_t lock; + u64 wrid; + struct ib_send_wr wr[2]; + u8 page_shift; + struct ib_fast_reg_page_list page_list; +}; + +struct mlx5_cache_ent { + struct list_head head; + /* sync access to the cahce entry + */ + spinlock_t lock; + + + struct dentry *dir; + char name[4]; + u32 order; + u32 size; + u32 cur; + u32 miss; + u32 limit; + + struct dentry *fsize; + struct dentry *fcur; + struct dentry *fmiss; + struct dentry *flimit; + + struct mlx5_ib_dev *dev; + struct work_struct work; + struct delayed_work dwork; + int pending; +}; + +struct mlx5_mr_cache { + struct workqueue_struct *wq; + struct mlx5_cache_ent ent[MAX_MR_CACHE_ENTRIES]; + int stopped; + struct dentry *root; + unsigned long last_add; +}; + +struct mlx5_ib_resources { + struct ib_cq *c0; + struct ib_xrcd *x0; + struct ib_xrcd *x1; + struct ib_pd *p0; + struct ib_srq *s0; +}; + +struct mlx5_ib_dev { + struct ib_device ib_dev; + struct mlx5_core_dev *mdev; + MLX5_DECLARE_DOORBELL_LOCK(uar_lock); + struct list_head eqs_list; + int num_ports; + int num_comp_vectors; + /* serialize update of capability mask + */ + struct mutex cap_mask_mutex; + bool ib_active; + struct umr_common umrc; + /* sync used page count stats + */ + spinlock_t mr_lock; + struct mlx5_ib_resources devr; + struct mlx5_mr_cache cache; + struct timer_list delay_timer; + int fill_delay; +}; + +static inline struct mlx5_ib_cq *to_mibcq(struct mlx5_core_cq *mcq) +{ + return container_of(mcq, struct mlx5_ib_cq, mcq); +} + +static inline struct mlx5_ib_xrcd *to_mxrcd(struct ib_xrcd *ibxrcd) +{ + return container_of(ibxrcd, struct mlx5_ib_xrcd, ibxrcd); +} + +static inline struct mlx5_ib_dev *to_mdev(struct ib_device *ibdev) +{ + return container_of(ibdev, struct mlx5_ib_dev, ib_dev); +} + +static inline struct mlx5_ib_fmr *to_mfmr(struct ib_fmr *ibfmr) +{ + return container_of(ibfmr, struct mlx5_ib_fmr, ibfmr); +} + +static inline struct mlx5_ib_cq *to_mcq(struct ib_cq *ibcq) +{ + return container_of(ibcq, struct mlx5_ib_cq, ibcq); +} + +static inline struct mlx5_ib_qp *to_mibqp(struct mlx5_core_qp *mqp) +{ + return container_of(mqp, struct mlx5_ib_qp, mqp); +} + +static inline struct mlx5_ib_mr *to_mibmr(struct mlx5_core_mr *mmr) +{ + return container_of(mmr, struct mlx5_ib_mr, mmr); +} + +static inline struct mlx5_ib_pd *to_mpd(struct ib_pd *ibpd) +{ + return container_of(ibpd, struct mlx5_ib_pd, ibpd); +} + +static inline struct mlx5_ib_srq *to_msrq(struct ib_srq *ibsrq) +{ + return container_of(ibsrq, struct mlx5_ib_srq, ibsrq); +} + +static inline struct mlx5_ib_qp *to_mqp(struct ib_qp *ibqp) +{ + return container_of(ibqp, struct mlx5_ib_qp, ibqp); +} + +static inline struct mlx5_ib_srq *to_mibsrq(struct mlx5_core_srq *msrq) +{ + return container_of(msrq, struct mlx5_ib_srq, msrq); +} + +static inline struct mlx5_ib_mr *to_mmr(struct ib_mr *ibmr) +{ + return container_of(ibmr, struct mlx5_ib_mr, ibmr); +} + +static inline struct mlx5_ib_fast_reg_page_list *to_mfrpl(struct ib_fast_reg_page_list *ibfrpl) +{ + return container_of(ibfrpl, struct mlx5_ib_fast_reg_page_list, ibfrpl); +} + +struct mlx5_ib_ah { + struct ib_ah ibah; + struct mlx5_av av; +}; + +static inline struct mlx5_ib_ah *to_mah(struct ib_ah *ibah) +{ + return container_of(ibah, struct mlx5_ib_ah, ibah); +} + +int mlx5_ib_db_map_user(struct mlx5_ib_ucontext *context, unsigned long virt, + struct mlx5_db *db); +void mlx5_ib_db_unmap_user(struct mlx5_ib_ucontext *context, struct mlx5_db *db); +void __mlx5_ib_cq_clean(struct mlx5_ib_cq *cq, u32 qpn, struct mlx5_ib_srq *srq); +void mlx5_ib_cq_clean(struct mlx5_ib_cq *cq, u32 qpn, struct mlx5_ib_srq *srq); +void mlx5_ib_free_srq_wqe(struct mlx5_ib_srq *srq, int wqe_index); +int mlx5_MAD_IFC(struct mlx5_ib_dev *dev, int ignore_mkey, int ignore_bkey, + u8 port, struct ib_wc *in_wc, struct ib_grh *in_grh, + void *in_mad, void *response_mad); +struct ib_ah *create_ib_ah(struct ib_ah_attr *ah_attr, + struct mlx5_ib_ah *ah); +struct ib_ah *mlx5_ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr); +int mlx5_ib_query_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr); +int mlx5_ib_destroy_ah(struct ib_ah *ah); +struct ib_srq *mlx5_ib_create_srq(struct ib_pd *pd, + struct ib_srq_init_attr *init_attr, + struct ib_udata *udata); +int mlx5_ib_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, + enum ib_srq_attr_mask attr_mask, struct ib_udata *udata); +int mlx5_ib_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *srq_attr); +int mlx5_ib_destroy_srq(struct ib_srq *srq); +int mlx5_ib_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr); +struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, + struct ib_qp_init_attr *init_attr, + struct ib_udata *udata); +int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata); +int mlx5_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr_mask, + struct ib_qp_init_attr *qp_init_attr); +int mlx5_ib_destroy_qp(struct ib_qp *qp); +int mlx5_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, + struct ib_send_wr **bad_wr); +int mlx5_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr); +void *mlx5_get_send_wqe(struct mlx5_ib_qp *qp, int n); +struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev, int entries, + int vector, struct ib_ucontext *context, + struct ib_udata *udata); +int mlx5_ib_destroy_cq(struct ib_cq *cq); +int mlx5_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc); +int mlx5_ib_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags); +int mlx5_ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period); +int mlx5_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata); +struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc); +struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, + u64 virt_addr, int access_flags, + struct ib_udata *udata); +int mlx5_ib_dereg_mr(struct ib_mr *ibmr); +int mlx5_ib_destroy_mr(struct ib_mr *ibmr); +struct ib_mr *mlx5_ib_create_mr(struct ib_pd *pd, + struct ib_mr_init_attr *mr_init_attr); +struct ib_mr *mlx5_ib_alloc_fast_reg_mr(struct ib_pd *pd, + int max_page_list_len); +struct ib_fast_reg_page_list *mlx5_ib_alloc_fast_reg_page_list(struct ib_device *ibdev, + int page_list_len); +void mlx5_ib_free_fast_reg_page_list(struct ib_fast_reg_page_list *page_list); +struct ib_fmr *mlx5_ib_fmr_alloc(struct ib_pd *pd, int acc, + struct ib_fmr_attr *fmr_attr); +int mlx5_ib_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list, + int npages, u64 iova); +int mlx5_ib_unmap_fmr(struct list_head *fmr_list); +int mlx5_ib_fmr_dealloc(struct ib_fmr *ibfmr); +int mlx5_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, + struct ib_wc *in_wc, struct ib_grh *in_grh, + struct ib_mad *in_mad, struct ib_mad *out_mad); +struct ib_xrcd *mlx5_ib_alloc_xrcd(struct ib_device *ibdev, + struct ib_ucontext *context, + struct ib_udata *udata); +int mlx5_ib_dealloc_xrcd(struct ib_xrcd *xrcd); +int mlx5_vector2eqn(struct mlx5_ib_dev *dev, int vector, int *eqn, int *irqn); +int mlx5_ib_get_buf_offset(u64 addr, int page_shift, u32 *offset); +int mlx5_query_ext_port_caps(struct mlx5_ib_dev *dev, u8 port); +int mlx5_ib_query_port(struct ib_device *ibdev, u8 port, + struct ib_port_attr *props); +int mlx5_ib_init_fmr(struct mlx5_ib_dev *dev); +void mlx5_ib_cleanup_fmr(struct mlx5_ib_dev *dev); +void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr, int *count, int *shift, + int *ncont, int *order); +void mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem, + int page_shift, __be64 *pas, int umr); +void mlx5_ib_copy_pas(u64 *old, u64 *new, int step, int num); +int mlx5_ib_get_cqe_size(struct mlx5_ib_dev *dev, struct ib_cq *ibcq); +int mlx5_mr_cache_init(struct mlx5_ib_dev *dev); +int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev); +int mlx5_mr_ib_cont_pages(struct ib_umem *umem, u64 addr, int *count, int *shift); +void mlx5_umr_cq_handler(struct ib_cq *cq, void *cq_context); +int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask, + struct ib_mr_status *mr_status); + +static inline void init_query_mad(struct ib_smp *mad) +{ + mad->base_version = 1; + mad->mgmt_class = IB_MGMT_CLASS_SUBN_LID_ROUTED; + mad->class_version = 1; + mad->method = IB_MGMT_METHOD_GET; +} + +static inline u8 convert_access(int acc) +{ + return (acc & IB_ACCESS_REMOTE_ATOMIC ? MLX5_PERM_ATOMIC : 0) | + (acc & IB_ACCESS_REMOTE_WRITE ? MLX5_PERM_REMOTE_WRITE : 0) | + (acc & IB_ACCESS_REMOTE_READ ? MLX5_PERM_REMOTE_READ : 0) | + (acc & IB_ACCESS_LOCAL_WRITE ? MLX5_PERM_LOCAL_WRITE : 0) | + MLX5_PERM_LOCAL_READ; +} + +#endif /* MLX5_IB_H */ diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c new file mode 100644 index 0000000..8ee7cb4 --- /dev/null +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -0,0 +1,1250 @@ +/* + * Copyright (c) 2013, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#include +#include +#include +#include +#include +#include +#include "mlx5_ib.h" + +enum { + MAX_PENDING_REG_MR = 8, +}; + +enum { + MLX5_UMR_ALIGN = 2048 +}; + +static __be64 *mr_align(__be64 *ptr, int align) +{ + unsigned long mask = align - 1; + + return (__be64 *)(((unsigned long)ptr + mask) & ~mask); +} + +static int order2idx(struct mlx5_ib_dev *dev, int order) +{ + struct mlx5_mr_cache *cache = &dev->cache; + + if (order < cache->ent[0].order) + return 0; + else + return order - cache->ent[0].order; +} + +static void reg_mr_callback(int status, void *context) +{ + struct mlx5_ib_mr *mr = context; + struct mlx5_ib_dev *dev = mr->dev; + struct mlx5_mr_cache *cache = &dev->cache; + int c = order2idx(dev, mr->order); + struct mlx5_cache_ent *ent = &cache->ent[c]; + u8 key; + unsigned long flags; + struct mlx5_mr_table *table = &dev->mdev->priv.mr_table; + int err; + + spin_lock_irqsave(&ent->lock, flags); + ent->pending--; + spin_unlock_irqrestore(&ent->lock, flags); + if (status) { + mlx5_ib_warn(dev, "async reg mr failed. status %d\n", status); + kfree(mr); + dev->fill_delay = 1; + mod_timer(&dev->delay_timer, jiffies + HZ); + return; + } + + if (mr->out.hdr.status) { + mlx5_ib_warn(dev, "failed - status %d, syndorme 0x%x\n", + mr->out.hdr.status, + be32_to_cpu(mr->out.hdr.syndrome)); + kfree(mr); + dev->fill_delay = 1; + mod_timer(&dev->delay_timer, jiffies + HZ); + return; + } + + spin_lock_irqsave(&dev->mdev->priv.mkey_lock, flags); + key = dev->mdev->priv.mkey_key++; + spin_unlock_irqrestore(&dev->mdev->priv.mkey_lock, flags); + mr->mmr.key = mlx5_idx_to_mkey(be32_to_cpu(mr->out.mkey) & 0xffffff) | key; + + cache->last_add = jiffies; + + spin_lock_irqsave(&ent->lock, flags); + list_add_tail(&mr->list, &ent->head); + ent->cur++; + ent->size++; + spin_unlock_irqrestore(&ent->lock, flags); + + write_lock_irqsave(&table->lock, flags); + err = radix_tree_insert(&table->tree, mlx5_base_mkey(mr->mmr.key), + &mr->mmr); + if (err) + pr_err("Error inserting to mr tree. 0x%x\n", -err); + write_unlock_irqrestore(&table->lock, flags); +} + +static int add_keys(struct mlx5_ib_dev *dev, int c, int num) +{ + struct mlx5_mr_cache *cache = &dev->cache; + struct mlx5_cache_ent *ent = &cache->ent[c]; + struct mlx5_create_mkey_mbox_in *in; + struct mlx5_ib_mr *mr; + int npages = 1 << ent->order; + int err = 0; + int i; + + in = kzalloc(sizeof(*in), GFP_KERNEL); + if (!in) + return -ENOMEM; + + for (i = 0; i < num; i++) { + if (ent->pending >= MAX_PENDING_REG_MR) { + err = -EAGAIN; + break; + } + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) { + err = -ENOMEM; + break; + } + mr->order = ent->order; + mr->umred = 1; + mr->dev = dev; + in->seg.status = 1 << 6; + in->seg.xlt_oct_size = cpu_to_be32((npages + 1) / 2); + in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8); + in->seg.flags = MLX5_ACCESS_MODE_MTT | MLX5_PERM_UMR_EN; + in->seg.log2_page_size = 12; + + spin_lock_irq(&ent->lock); + ent->pending++; + spin_unlock_irq(&ent->lock); + err = mlx5_core_create_mkey(dev->mdev, &mr->mmr, in, + sizeof(*in), reg_mr_callback, + mr, &mr->out); + if (err) { + mlx5_ib_warn(dev, "create mkey failed %d\n", err); + kfree(mr); + break; + } + } + + kfree(in); + return err; +} + +static void remove_keys(struct mlx5_ib_dev *dev, int c, int num) +{ + struct mlx5_mr_cache *cache = &dev->cache; + struct mlx5_cache_ent *ent = &cache->ent[c]; + struct mlx5_ib_mr *mr; + int err; + int i; + + for (i = 0; i < num; i++) { + spin_lock_irq(&ent->lock); + if (list_empty(&ent->head)) { + spin_unlock_irq(&ent->lock); + return; + } + mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list); + list_del(&mr->list); + ent->cur--; + ent->size--; + spin_unlock_irq(&ent->lock); + err = mlx5_core_destroy_mkey(dev->mdev, &mr->mmr); + if (err) + mlx5_ib_warn(dev, "failed destroy mkey\n"); + else + kfree(mr); + } +} + +static ssize_t size_write(struct file *filp, const char __user *buf, + size_t count, loff_t *pos) +{ + struct mlx5_cache_ent *ent = filp->private_data; + struct mlx5_ib_dev *dev = ent->dev; + char lbuf[20]; + u32 var; + int err; + int c; + + if (copy_from_user(lbuf, buf, sizeof(lbuf))) + return -EFAULT; + + c = order2idx(dev, ent->order); + lbuf[sizeof(lbuf) - 1] = 0; + + if (sscanf(lbuf, "%u", &var) != 1) + return -EINVAL; + + if (var < ent->limit) + return -EINVAL; + + if (var > ent->size) { + do { + err = add_keys(dev, c, var - ent->size); + if (err && err != -EAGAIN) + return err; + + usleep_range(3000, 5000); + } while (err); + } else if (var < ent->size) { + remove_keys(dev, c, ent->size - var); + } + + return count; +} + +static ssize_t size_read(struct file *filp, char __user *buf, size_t count, + loff_t *pos) +{ + struct mlx5_cache_ent *ent = filp->private_data; + char lbuf[20]; + int err; + + if (*pos) + return 0; + + err = snprintf(lbuf, sizeof(lbuf), "%d\n", ent->size); + if (err < 0) + return err; + + if (copy_to_user(buf, lbuf, err)) + return -EFAULT; + + *pos += err; + + return err; +} + +static const struct file_operations size_fops = { + .owner = THIS_MODULE, + .open = simple_open, + .write = size_write, + .read = size_read, +}; + +static ssize_t limit_write(struct file *filp, const char __user *buf, + size_t count, loff_t *pos) +{ + struct mlx5_cache_ent *ent = filp->private_data; + struct mlx5_ib_dev *dev = ent->dev; + char lbuf[20]; + u32 var; + int err; + int c; + + if (copy_from_user(lbuf, buf, sizeof(lbuf))) + return -EFAULT; + + c = order2idx(dev, ent->order); + lbuf[sizeof(lbuf) - 1] = 0; + + if (sscanf(lbuf, "%u", &var) != 1) + return -EINVAL; + + if (var > ent->size) + return -EINVAL; + + ent->limit = var; + + if (ent->cur < ent->limit) { + err = add_keys(dev, c, 2 * ent->limit - ent->cur); + if (err) + return err; + } + + return count; +} + +static ssize_t limit_read(struct file *filp, char __user *buf, size_t count, + loff_t *pos) +{ + struct mlx5_cache_ent *ent = filp->private_data; + char lbuf[20]; + int err; + + if (*pos) + return 0; + + err = snprintf(lbuf, sizeof(lbuf), "%d\n", ent->limit); + if (err < 0) + return err; + + if (copy_to_user(buf, lbuf, err)) + return -EFAULT; + + *pos += err; + + return err; +} + +static const struct file_operations limit_fops = { + .owner = THIS_MODULE, + .open = simple_open, + .write = limit_write, + .read = limit_read, +}; + +static int someone_adding(struct mlx5_mr_cache *cache) +{ + int i; + + for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) { + if (cache->ent[i].cur < cache->ent[i].limit) + return 1; + } + + return 0; +} + +static void __cache_work_func(struct mlx5_cache_ent *ent) +{ + struct mlx5_ib_dev *dev = ent->dev; + struct mlx5_mr_cache *cache = &dev->cache; + int i = order2idx(dev, ent->order); + int err; + + if (cache->stopped) + return; + + ent = &dev->cache.ent[i]; + if (ent->cur < 2 * ent->limit && !dev->fill_delay) { + err = add_keys(dev, i, 1); + if (ent->cur < 2 * ent->limit) { + if (err == -EAGAIN) { + mlx5_ib_dbg(dev, "returned eagain, order %d\n", + i + 2); + queue_delayed_work(cache->wq, &ent->dwork, + msecs_to_jiffies(3)); + } else if (err) { + mlx5_ib_warn(dev, "command failed order %d, err %d\n", + i + 2, err); + queue_delayed_work(cache->wq, &ent->dwork, + msecs_to_jiffies(1000)); + } else { + queue_work(cache->wq, &ent->work); + } + } + } else if (ent->cur > 2 * ent->limit) { + if (!someone_adding(cache) && + time_after(jiffies, cache->last_add + 300 * HZ)) { + remove_keys(dev, i, 1); + if (ent->cur > ent->limit) + queue_work(cache->wq, &ent->work); + } else { + queue_delayed_work(cache->wq, &ent->dwork, 300 * HZ); + } + } +} + +static void delayed_cache_work_func(struct work_struct *work) +{ + struct mlx5_cache_ent *ent; + + ent = container_of(work, struct mlx5_cache_ent, dwork.work); + __cache_work_func(ent); +} + +static void cache_work_func(struct work_struct *work) +{ + struct mlx5_cache_ent *ent; + + ent = container_of(work, struct mlx5_cache_ent, work); + __cache_work_func(ent); +} + +static struct mlx5_ib_mr *alloc_cached_mr(struct mlx5_ib_dev *dev, int order) +{ + struct mlx5_mr_cache *cache = &dev->cache; + struct mlx5_ib_mr *mr = NULL; + struct mlx5_cache_ent *ent; + int c; + int i; + + c = order2idx(dev, order); + if (c < 0 || c >= MAX_MR_CACHE_ENTRIES) { + mlx5_ib_warn(dev, "order %d, cache index %d\n", order, c); + return NULL; + } + + for (i = c; i < MAX_MR_CACHE_ENTRIES; i++) { + ent = &cache->ent[i]; + + mlx5_ib_dbg(dev, "order %d, cache index %d\n", ent->order, i); + + spin_lock_irq(&ent->lock); + if (!list_empty(&ent->head)) { + mr = list_first_entry(&ent->head, struct mlx5_ib_mr, + list); + list_del(&mr->list); + ent->cur--; + spin_unlock_irq(&ent->lock); + if (ent->cur < ent->limit) + queue_work(cache->wq, &ent->work); + break; + } + spin_unlock_irq(&ent->lock); + + queue_work(cache->wq, &ent->work); + + if (mr) + break; + } + + if (!mr) + cache->ent[c].miss++; + + return mr; +} + +static void free_cached_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) +{ + struct mlx5_mr_cache *cache = &dev->cache; + struct mlx5_cache_ent *ent; + int shrink = 0; + int c; + + c = order2idx(dev, mr->order); + if (c < 0 || c >= MAX_MR_CACHE_ENTRIES) { + mlx5_ib_warn(dev, "order %d, cache index %d\n", mr->order, c); + return; + } + ent = &cache->ent[c]; + spin_lock_irq(&ent->lock); + list_add_tail(&mr->list, &ent->head); + ent->cur++; + if (ent->cur > 2 * ent->limit) + shrink = 1; + spin_unlock_irq(&ent->lock); + + if (shrink) + queue_work(cache->wq, &ent->work); +} + +static void clean_keys(struct mlx5_ib_dev *dev, int c) +{ + struct mlx5_mr_cache *cache = &dev->cache; + struct mlx5_cache_ent *ent = &cache->ent[c]; + struct mlx5_ib_mr *mr; + int err; + + cancel_delayed_work(&ent->dwork); + while (1) { + spin_lock_irq(&ent->lock); + if (list_empty(&ent->head)) { + spin_unlock_irq(&ent->lock); + return; + } + mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list); + list_del(&mr->list); + ent->cur--; + ent->size--; + spin_unlock_irq(&ent->lock); + err = mlx5_core_destroy_mkey(dev->mdev, &mr->mmr); + if (err) + mlx5_ib_warn(dev, "failed destroy mkey\n"); + else + kfree(mr); + } +} + +static int mlx5_mr_cache_debugfs_init(struct mlx5_ib_dev *dev) +{ + struct mlx5_mr_cache *cache = &dev->cache; + struct mlx5_cache_ent *ent; + int i; + + if (!mlx5_debugfs_root) + return 0; + + cache->root = debugfs_create_dir("mr_cache", dev->mdev->priv.dbg_root); + if (!cache->root) + return -ENOMEM; + + for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) { + ent = &cache->ent[i]; + sprintf(ent->name, "%d", ent->order); + ent->dir = debugfs_create_dir(ent->name, cache->root); + if (!ent->dir) + return -ENOMEM; + + ent->fsize = debugfs_create_file("size", 0600, ent->dir, ent, + &size_fops); + if (!ent->fsize) + return -ENOMEM; + + ent->flimit = debugfs_create_file("limit", 0600, ent->dir, ent, + &limit_fops); + if (!ent->flimit) + return -ENOMEM; + + ent->fcur = debugfs_create_u32("cur", 0400, ent->dir, + &ent->cur); + if (!ent->fcur) + return -ENOMEM; + + ent->fmiss = debugfs_create_u32("miss", 0600, ent->dir, + &ent->miss); + if (!ent->fmiss) + return -ENOMEM; + } + + return 0; +} + +static void mlx5_mr_cache_debugfs_cleanup(struct mlx5_ib_dev *dev) +{ + if (!mlx5_debugfs_root) + return; + + debugfs_remove_recursive(dev->cache.root); +} + +static void delay_time_func(unsigned long ctx) +{ + struct mlx5_ib_dev *dev = (struct mlx5_ib_dev *)ctx; + + dev->fill_delay = 0; +} + +int mlx5_mr_cache_init(struct mlx5_ib_dev *dev) +{ + struct mlx5_mr_cache *cache = &dev->cache; + struct mlx5_cache_ent *ent; + int limit; + int err; + int i; + + cache->wq = create_singlethread_workqueue("mkey_cache"); + if (!cache->wq) { + mlx5_ib_warn(dev, "failed to create work queue\n"); + return -ENOMEM; + } + + setup_timer(&dev->delay_timer, delay_time_func, (unsigned long)dev); + for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) { + INIT_LIST_HEAD(&cache->ent[i].head); + spin_lock_init(&cache->ent[i].lock); + + ent = &cache->ent[i]; + INIT_LIST_HEAD(&ent->head); + spin_lock_init(&ent->lock); + ent->order = i + 2; + ent->dev = dev; + + if (dev->mdev->profile->mask & MLX5_PROF_MASK_MR_CACHE) + limit = dev->mdev->profile->mr_cache[i].limit; + else + limit = 0; + + INIT_WORK(&ent->work, cache_work_func); + INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func); + ent->limit = limit; + queue_work(cache->wq, &ent->work); + } + + err = mlx5_mr_cache_debugfs_init(dev); + if (err) + mlx5_ib_warn(dev, "cache debugfs failure\n"); + + return 0; +} + +int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev) +{ + int i; + + dev->cache.stopped = 1; + flush_workqueue(dev->cache.wq); + + mlx5_mr_cache_debugfs_cleanup(dev); + + for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) + clean_keys(dev, i); + + destroy_workqueue(dev->cache.wq); + del_timer_sync(&dev->delay_timer); + + return 0; +} + +struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct mlx5_core_dev *mdev = dev->mdev; + struct mlx5_create_mkey_mbox_in *in; + struct mlx5_mkey_seg *seg; + struct mlx5_ib_mr *mr; + int err; + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + in = kzalloc(sizeof(*in), GFP_KERNEL); + if (!in) { + err = -ENOMEM; + goto err_free; + } + + seg = &in->seg; + seg->flags = convert_access(acc) | MLX5_ACCESS_MODE_PA; + seg->flags_pd = cpu_to_be32(to_mpd(pd)->pdn | MLX5_MKEY_LEN64); + seg->qpn_mkey7_0 = cpu_to_be32(0xffffff << 8); + seg->start_addr = 0; + + err = mlx5_core_create_mkey(mdev, &mr->mmr, in, sizeof(*in), NULL, NULL, + NULL); + if (err) + goto err_in; + + kfree(in); + mr->ibmr.lkey = mr->mmr.key; + mr->ibmr.rkey = mr->mmr.key; + mr->umem = NULL; + + return &mr->ibmr; + +err_in: + kfree(in); + +err_free: + kfree(mr); + + return ERR_PTR(err); +} + +static int get_octo_len(u64 addr, u64 len, int page_size) +{ + u64 offset; + int npages; + + offset = addr & (page_size - 1); + npages = ALIGN(len + offset, page_size) >> ilog2(page_size); + return (npages + 1) / 2; +} + +static int use_umr(int order) +{ + return order <= 17; +} + +static void prep_umr_reg_wqe(struct ib_pd *pd, struct ib_send_wr *wr, + struct ib_sge *sg, u64 dma, int n, u32 key, + int page_shift, u64 virt_addr, u64 len, + int access_flags) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct ib_mr *mr = dev->umrc.mr; + + sg->addr = dma; + sg->length = ALIGN(sizeof(u64) * n, 64); + sg->lkey = mr->lkey; + + wr->next = NULL; + wr->send_flags = 0; + wr->sg_list = sg; + if (n) + wr->num_sge = 1; + else + wr->num_sge = 0; + + wr->opcode = MLX5_IB_WR_UMR; + wr->wr.fast_reg.page_list_len = n; + wr->wr.fast_reg.page_shift = page_shift; + wr->wr.fast_reg.rkey = key; + wr->wr.fast_reg.iova_start = virt_addr; + wr->wr.fast_reg.length = len; + wr->wr.fast_reg.access_flags = access_flags; + wr->wr.fast_reg.page_list = (struct ib_fast_reg_page_list *)pd; +} + +static void prep_umr_unreg_wqe(struct mlx5_ib_dev *dev, + struct ib_send_wr *wr, u32 key) +{ + wr->send_flags = MLX5_IB_SEND_UMR_UNREG; + wr->opcode = MLX5_IB_WR_UMR; + wr->wr.fast_reg.rkey = key; +} + +void mlx5_umr_cq_handler(struct ib_cq *cq, void *cq_context) +{ + struct mlx5_ib_umr_context *context; + struct ib_wc wc; + int err; + + while (1) { + err = ib_poll_cq(cq, 1, &wc); + if (err < 0) { + pr_warn("poll cq error %d\n", err); + return; + } + if (err == 0) + break; + + context = (struct mlx5_ib_umr_context *) (unsigned long) wc.wr_id; + context->status = wc.status; + complete(&context->done); + } + ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); +} + +static struct mlx5_ib_mr *reg_umr(struct ib_pd *pd, struct ib_umem *umem, + u64 virt_addr, u64 len, int npages, + int page_shift, int order, int access_flags) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct device *ddev = dev->ib_dev.dma_device; + struct umr_common *umrc = &dev->umrc; + struct mlx5_ib_umr_context umr_context; + struct ib_send_wr wr, *bad; + struct mlx5_ib_mr *mr; + struct ib_sge sg; + int size = sizeof(u64) * npages; + int err = 0; + int i; + + for (i = 0; i < 1; i++) { + mr = alloc_cached_mr(dev, order); + if (mr) + break; + + err = add_keys(dev, order2idx(dev, order), 1); + if (err && err != -EAGAIN) { + mlx5_ib_warn(dev, "add_keys failed, err %d\n", err); + break; + } + } + + if (!mr) + return ERR_PTR(-EAGAIN); + + mr->pas = kmalloc(size + MLX5_UMR_ALIGN - 1, GFP_KERNEL); + if (!mr->pas) { + err = -ENOMEM; + goto free_mr; + } + + mlx5_ib_populate_pas(dev, umem, page_shift, + mr_align(mr->pas, MLX5_UMR_ALIGN), 1); + + mr->dma = dma_map_single(ddev, mr_align(mr->pas, MLX5_UMR_ALIGN), size, + DMA_TO_DEVICE); + if (dma_mapping_error(ddev, mr->dma)) { + err = -ENOMEM; + goto free_pas; + } + + memset(&wr, 0, sizeof(wr)); + wr.wr_id = (u64)(unsigned long)&umr_context; + prep_umr_reg_wqe(pd, &wr, &sg, mr->dma, npages, mr->mmr.key, page_shift, virt_addr, len, access_flags); + + mlx5_ib_init_umr_context(&umr_context); + down(&umrc->sem); + err = ib_post_send(umrc->qp, &wr, &bad); + if (err) { + mlx5_ib_warn(dev, "post send failed, err %d\n", err); + goto unmap_dma; + } else { + wait_for_completion(&umr_context.done); + if (umr_context.status != IB_WC_SUCCESS) { + mlx5_ib_warn(dev, "reg umr failed\n"); + err = -EFAULT; + } + } + + mr->mmr.iova = virt_addr; + mr->mmr.size = len; + mr->mmr.pd = to_mpd(pd)->pdn; + +unmap_dma: + up(&umrc->sem); + dma_unmap_single(ddev, mr->dma, size, DMA_TO_DEVICE); + +free_pas: + kfree(mr->pas); + +free_mr: + if (err) { + free_cached_mr(dev, mr); + return ERR_PTR(err); + } + + return mr; +} + +static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, u64 virt_addr, + u64 length, struct ib_umem *umem, + int npages, int page_shift, + int access_flags) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct mlx5_create_mkey_mbox_in *in; + struct mlx5_ib_mr *mr; + int inlen; + int err; + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + inlen = sizeof(*in) + sizeof(*in->pas) * ((npages + 1) / 2) * 2; + in = mlx5_vzalloc(inlen); + if (!in) { + err = -ENOMEM; + goto err_1; + } + mlx5_ib_populate_pas(dev, umem, page_shift, in->pas, 0); + + in->seg.flags = convert_access(access_flags) | + MLX5_ACCESS_MODE_MTT; + in->seg.flags_pd = cpu_to_be32(to_mpd(pd)->pdn); + in->seg.start_addr = cpu_to_be64(virt_addr); + in->seg.len = cpu_to_be64(length); + in->seg.bsfs_octo_size = 0; + in->seg.xlt_oct_size = cpu_to_be32(get_octo_len(virt_addr, length, 1 << page_shift)); + in->seg.log2_page_size = page_shift; + in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8); + in->xlat_oct_act_size = cpu_to_be32(get_octo_len(virt_addr, length, + 1 << page_shift)); + err = mlx5_core_create_mkey(dev->mdev, &mr->mmr, in, inlen, NULL, + NULL, NULL); + if (err) { + mlx5_ib_warn(dev, "create mkey failed\n"); + goto err_2; + } + mr->umem = umem; + mlx5_vfree(in); + + mlx5_ib_dbg(dev, "mkey = 0x%x\n", mr->mmr.key); + + return mr; + +err_2: + mlx5_vfree(in); + +err_1: + kfree(mr); + + return ERR_PTR(err); +} + +struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, + u64 virt_addr, int access_flags, + struct ib_udata *udata) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct mlx5_ib_mr *mr = NULL; + struct ib_umem *umem; + int page_shift; + int npages; + int ncont; + int order; + int err; + + mlx5_ib_dbg(dev, "start 0x%llx, virt_addr 0x%llx, length 0x%llx, access_flags 0x%x\n", + start, virt_addr, length, access_flags); + umem = ib_umem_get(pd->uobject->context, start, length, access_flags, + 0); + if (IS_ERR(umem)) { + mlx5_ib_dbg(dev, "umem get failed (%ld)\n", PTR_ERR(umem)); + return (void *)umem; + } + + mlx5_ib_cont_pages(umem, start, &npages, &page_shift, &ncont, &order); + if (!npages) { + mlx5_ib_warn(dev, "avoid zero region\n"); + err = -EINVAL; + goto error; + } + + mlx5_ib_dbg(dev, "npages %d, ncont %d, order %d, page_shift %d\n", + npages, ncont, order, page_shift); + + if (use_umr(order)) { + mr = reg_umr(pd, umem, virt_addr, length, ncont, page_shift, + order, access_flags); + if (PTR_ERR(mr) == -EAGAIN) { + mlx5_ib_dbg(dev, "cache empty for order %d", order); + mr = NULL; + } + } + + if (!mr) + mr = reg_create(pd, virt_addr, length, umem, ncont, page_shift, + access_flags); + + if (IS_ERR(mr)) { + err = PTR_ERR(mr); + goto error; + } + + mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmr.key); + + mr->umem = umem; + mr->npages = npages; + spin_lock(&dev->mr_lock); + dev->mdev->priv.reg_pages += npages; + spin_unlock(&dev->mr_lock); + mr->ibmr.lkey = mr->mmr.key; + mr->ibmr.rkey = mr->mmr.key; + + return &mr->ibmr; + +error: + ib_umem_release(umem); + return ERR_PTR(err); +} + +static int unreg_umr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) +{ + struct umr_common *umrc = &dev->umrc; + struct mlx5_ib_umr_context umr_context; + struct ib_send_wr wr, *bad; + int err; + + memset(&wr, 0, sizeof(wr)); + wr.wr_id = (u64)(unsigned long)&umr_context; + prep_umr_unreg_wqe(dev, &wr, mr->mmr.key); + + mlx5_ib_init_umr_context(&umr_context); + down(&umrc->sem); + err = ib_post_send(umrc->qp, &wr, &bad); + if (err) { + up(&umrc->sem); + mlx5_ib_dbg(dev, "err %d\n", err); + goto error; + } else { + wait_for_completion(&umr_context.done); + up(&umrc->sem); + } + if (umr_context.status != IB_WC_SUCCESS) { + mlx5_ib_warn(dev, "unreg umr failed\n"); + err = -EFAULT; + goto error; + } + return 0; + +error: + return err; +} + +int mlx5_ib_dereg_mr(struct ib_mr *ibmr) +{ + struct mlx5_ib_dev *dev = to_mdev(ibmr->device); + struct mlx5_ib_mr *mr = to_mmr(ibmr); + struct ib_umem *umem = mr->umem; + int npages = mr->npages; + int umred = mr->umred; + int err; + + if (!umred) { + err = mlx5_core_destroy_mkey(dev->mdev, &mr->mmr); + if (err) { + mlx5_ib_warn(dev, "failed to destroy mkey 0x%x (%d)\n", + mr->mmr.key, err); + return err; + } + } else { + err = unreg_umr(dev, mr); + if (err) { + mlx5_ib_warn(dev, "failed unregister\n"); + return err; + } + free_cached_mr(dev, mr); + } + + if (umem) { + ib_umem_release(umem); + spin_lock(&dev->mr_lock); + dev->mdev->priv.reg_pages -= npages; + spin_unlock(&dev->mr_lock); + } + + if (!umred) + kfree(mr); + + return 0; +} + +struct ib_mr *mlx5_ib_create_mr(struct ib_pd *pd, + struct ib_mr_init_attr *mr_init_attr) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct mlx5_create_mkey_mbox_in *in; + struct mlx5_ib_mr *mr; + int access_mode, err; + int ndescs = roundup(mr_init_attr->max_reg_descriptors, 4); + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + in = kzalloc(sizeof(*in), GFP_KERNEL); + if (!in) { + err = -ENOMEM; + goto err_free; + } + + in->seg.status = 1 << 6; /* free */ + in->seg.xlt_oct_size = cpu_to_be32(ndescs); + in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8); + in->seg.flags_pd = cpu_to_be32(to_mpd(pd)->pdn); + access_mode = MLX5_ACCESS_MODE_MTT; + + if (mr_init_attr->flags & IB_MR_SIGNATURE_EN) { + u32 psv_index[2]; + + in->seg.flags_pd = cpu_to_be32(be32_to_cpu(in->seg.flags_pd) | + MLX5_MKEY_BSF_EN); + in->seg.bsfs_octo_size = cpu_to_be32(MLX5_MKEY_BSF_OCTO_SIZE); + mr->sig = kzalloc(sizeof(*mr->sig), GFP_KERNEL); + if (!mr->sig) { + err = -ENOMEM; + goto err_free_in; + } + + /* create mem & wire PSVs */ + err = mlx5_core_create_psv(dev->mdev, to_mpd(pd)->pdn, + 2, psv_index); + if (err) + goto err_free_sig; + + access_mode = MLX5_ACCESS_MODE_KLM; + mr->sig->psv_memory.psv_idx = psv_index[0]; + mr->sig->psv_wire.psv_idx = psv_index[1]; + + mr->sig->sig_status_checked = true; + mr->sig->sig_err_exists = false; + /* Next UMR, Arm SIGERR */ + ++mr->sig->sigerr_count; + } + + in->seg.flags = MLX5_PERM_UMR_EN | access_mode; + err = mlx5_core_create_mkey(dev->mdev, &mr->mmr, in, sizeof(*in), + NULL, NULL, NULL); + if (err) + goto err_destroy_psv; + + mr->ibmr.lkey = mr->mmr.key; + mr->ibmr.rkey = mr->mmr.key; + mr->umem = NULL; + kfree(in); + + return &mr->ibmr; + +err_destroy_psv: + if (mr->sig) { + if (mlx5_core_destroy_psv(dev->mdev, + mr->sig->psv_memory.psv_idx)) + mlx5_ib_warn(dev, "failed to destroy mem psv %d\n", + mr->sig->psv_memory.psv_idx); + if (mlx5_core_destroy_psv(dev->mdev, + mr->sig->psv_wire.psv_idx)) + mlx5_ib_warn(dev, "failed to destroy wire psv %d\n", + mr->sig->psv_wire.psv_idx); + } +err_free_sig: + kfree(mr->sig); +err_free_in: + kfree(in); +err_free: + kfree(mr); + return ERR_PTR(err); +} + +int mlx5_ib_destroy_mr(struct ib_mr *ibmr) +{ + struct mlx5_ib_dev *dev = to_mdev(ibmr->device); + struct mlx5_ib_mr *mr = to_mmr(ibmr); + int err; + + if (mr->sig) { + if (mlx5_core_destroy_psv(dev->mdev, + mr->sig->psv_memory.psv_idx)) + mlx5_ib_warn(dev, "failed to destroy mem psv %d\n", + mr->sig->psv_memory.psv_idx); + if (mlx5_core_destroy_psv(dev->mdev, + mr->sig->psv_wire.psv_idx)) + mlx5_ib_warn(dev, "failed to destroy wire psv %d\n", + mr->sig->psv_wire.psv_idx); + kfree(mr->sig); + } + + err = mlx5_core_destroy_mkey(dev->mdev, &mr->mmr); + if (err) { + mlx5_ib_warn(dev, "failed to destroy mkey 0x%x (%d)\n", + mr->mmr.key, err); + return err; + } + + kfree(mr); + + return err; +} + +struct ib_mr *mlx5_ib_alloc_fast_reg_mr(struct ib_pd *pd, + int max_page_list_len) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct mlx5_create_mkey_mbox_in *in; + struct mlx5_ib_mr *mr; + int err; + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + in = kzalloc(sizeof(*in), GFP_KERNEL); + if (!in) { + err = -ENOMEM; + goto err_free; + } + + in->seg.status = 1 << 6; /* free */ + in->seg.xlt_oct_size = cpu_to_be32((max_page_list_len + 1) / 2); + in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8); + in->seg.flags = MLX5_PERM_UMR_EN | MLX5_ACCESS_MODE_MTT; + in->seg.flags_pd = cpu_to_be32(to_mpd(pd)->pdn); + /* + * TBD not needed - issue 197292 */ + in->seg.log2_page_size = PAGE_SHIFT; + + err = mlx5_core_create_mkey(dev->mdev, &mr->mmr, in, sizeof(*in), NULL, + NULL, NULL); + kfree(in); + if (err) + goto err_free; + + mr->ibmr.lkey = mr->mmr.key; + mr->ibmr.rkey = mr->mmr.key; + mr->umem = NULL; + + return &mr->ibmr; + +err_free: + kfree(mr); + return ERR_PTR(err); +} + +struct ib_fast_reg_page_list *mlx5_ib_alloc_fast_reg_page_list(struct ib_device *ibdev, + int page_list_len) +{ + struct mlx5_ib_fast_reg_page_list *mfrpl; + int size = page_list_len * sizeof(u64); + + mfrpl = kmalloc(sizeof(*mfrpl), GFP_KERNEL); + if (!mfrpl) + return ERR_PTR(-ENOMEM); + + mfrpl->ibfrpl.page_list = kmalloc(size, GFP_KERNEL); + if (!mfrpl->ibfrpl.page_list) + goto err_free; + + mfrpl->mapped_page_list = dma_alloc_coherent(ibdev->dma_device, + size, &mfrpl->map, + GFP_KERNEL); + if (!mfrpl->mapped_page_list) + goto err_free; + + WARN_ON(mfrpl->map & 0x3f); + + return &mfrpl->ibfrpl; + +err_free: + kfree(mfrpl->ibfrpl.page_list); + kfree(mfrpl); + return ERR_PTR(-ENOMEM); +} + +void mlx5_ib_free_fast_reg_page_list(struct ib_fast_reg_page_list *page_list) +{ + struct mlx5_ib_fast_reg_page_list *mfrpl = to_mfrpl(page_list); + struct mlx5_ib_dev *dev = to_mdev(page_list->device); + int size = page_list->max_page_list_len * sizeof(u64); + + dma_free_coherent(&dev->mdev->pdev->dev, size, mfrpl->mapped_page_list, + mfrpl->map); + kfree(mfrpl->ibfrpl.page_list); + kfree(mfrpl); +} + +int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask, + struct ib_mr_status *mr_status) +{ + struct mlx5_ib_mr *mmr = to_mmr(ibmr); + int ret = 0; + + if (check_mask & ~IB_MR_CHECK_SIG_STATUS) { + pr_err("Invalid status check mask\n"); + ret = -EINVAL; + goto done; + } + + mr_status->fail_status = 0; + if (check_mask & IB_MR_CHECK_SIG_STATUS) { + if (!mmr->sig) { + ret = -EINVAL; + pr_err("signature status check requested on a non-signature enabled MR\n"); + goto done; + } + + mmr->sig->sig_status_checked = true; + if (!mmr->sig->sig_err_exists) + goto done; + + if (ibmr->lkey == mmr->sig->err_item.key) + memcpy(&mr_status->sig_err, &mmr->sig->err_item, + sizeof(mr_status->sig_err)); + else { + mr_status->sig_err.err_type = IB_SIG_BAD_GUARD; + mr_status->sig_err.sig_err_offset = 0; + mr_status->sig_err.key = mmr->sig->err_item.key; + } + + mmr->sig->sig_err_exists = false; + mr_status->fail_status |= IB_MR_CHECK_SIG_STATUS; + } + +done: + return ret; +} diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c new file mode 100644 index 0000000..e261a53 --- /dev/null +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -0,0 +1,3039 @@ +/* + * Copyright (c) 2013, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include "mlx5_ib.h" +#include "user.h" + +/* not supported currently */ +static int wq_signature; + +enum { + MLX5_IB_ACK_REQ_FREQ = 8, +}; + +enum { + MLX5_IB_DEFAULT_SCHED_QUEUE = 0x83, + MLX5_IB_DEFAULT_QP0_SCHED_QUEUE = 0x3f, + MLX5_IB_LINK_TYPE_IB = 0, + MLX5_IB_LINK_TYPE_ETH = 1 +}; + +enum { + MLX5_IB_SQ_STRIDE = 6, + MLX5_IB_CACHE_LINE_SIZE = 64, +}; + +static const u32 mlx5_ib_opcode[] = { + [IB_WR_SEND] = MLX5_OPCODE_SEND, + [IB_WR_SEND_WITH_IMM] = MLX5_OPCODE_SEND_IMM, + [IB_WR_RDMA_WRITE] = MLX5_OPCODE_RDMA_WRITE, + [IB_WR_RDMA_WRITE_WITH_IMM] = MLX5_OPCODE_RDMA_WRITE_IMM, + [IB_WR_RDMA_READ] = MLX5_OPCODE_RDMA_READ, + [IB_WR_ATOMIC_CMP_AND_SWP] = MLX5_OPCODE_ATOMIC_CS, + [IB_WR_ATOMIC_FETCH_AND_ADD] = MLX5_OPCODE_ATOMIC_FA, + [IB_WR_SEND_WITH_INV] = MLX5_OPCODE_SEND_INVAL, + [IB_WR_LOCAL_INV] = MLX5_OPCODE_UMR, + [IB_WR_FAST_REG_MR] = MLX5_OPCODE_UMR, + [IB_WR_MASKED_ATOMIC_CMP_AND_SWP] = MLX5_OPCODE_ATOMIC_MASKED_CS, + [IB_WR_MASKED_ATOMIC_FETCH_AND_ADD] = MLX5_OPCODE_ATOMIC_MASKED_FA, + [MLX5_IB_WR_UMR] = MLX5_OPCODE_UMR, +}; + +struct umr_wr { + u64 virt_addr; + struct ib_pd *pd; + unsigned int page_shift; + unsigned int npages; + u32 length; + int access_flags; + u32 mkey; +}; + +static int is_qp0(enum ib_qp_type qp_type) +{ + return qp_type == IB_QPT_SMI; +} + +static int is_qp1(enum ib_qp_type qp_type) +{ + return qp_type == IB_QPT_GSI; +} + +static int is_sqp(enum ib_qp_type qp_type) +{ + return is_qp0(qp_type) || is_qp1(qp_type); +} + +static void *get_wqe(struct mlx5_ib_qp *qp, int offset) +{ + return mlx5_buf_offset(&qp->buf, offset); +} + +static void *get_recv_wqe(struct mlx5_ib_qp *qp, int n) +{ + return get_wqe(qp, qp->rq.offset + (n << qp->rq.wqe_shift)); +} + +void *mlx5_get_send_wqe(struct mlx5_ib_qp *qp, int n) +{ + return get_wqe(qp, qp->sq.offset + (n << MLX5_IB_SQ_STRIDE)); +} + +static void mlx5_ib_qp_event(struct mlx5_core_qp *qp, int type) +{ + struct ib_qp *ibqp = &to_mibqp(qp)->ibqp; + struct ib_event event; + + if (type == MLX5_EVENT_TYPE_PATH_MIG) + to_mibqp(qp)->port = to_mibqp(qp)->alt_port; + + if (ibqp->event_handler) { + event.device = ibqp->device; + event.element.qp = ibqp; + switch (type) { + case MLX5_EVENT_TYPE_PATH_MIG: + event.event = IB_EVENT_PATH_MIG; + break; + case MLX5_EVENT_TYPE_COMM_EST: + event.event = IB_EVENT_COMM_EST; + break; + case MLX5_EVENT_TYPE_SQ_DRAINED: + event.event = IB_EVENT_SQ_DRAINED; + break; + case MLX5_EVENT_TYPE_SRQ_LAST_WQE: + event.event = IB_EVENT_QP_LAST_WQE_REACHED; + break; + case MLX5_EVENT_TYPE_WQ_CATAS_ERROR: + event.event = IB_EVENT_QP_FATAL; + break; + case MLX5_EVENT_TYPE_PATH_MIG_FAILED: + event.event = IB_EVENT_PATH_MIG_ERR; + break; + case MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR: + event.event = IB_EVENT_QP_REQ_ERR; + break; + case MLX5_EVENT_TYPE_WQ_ACCESS_ERROR: + event.event = IB_EVENT_QP_ACCESS_ERR; + break; + default: + pr_warn("mlx5_ib: Unexpected event type %d on QP %06x\n", type, qp->qpn); + return; + } + + ibqp->event_handler(&event, ibqp->qp_context); + } +} + +static int set_rq_size(struct mlx5_ib_dev *dev, struct ib_qp_cap *cap, + int has_rq, struct mlx5_ib_qp *qp, struct mlx5_ib_create_qp *ucmd) +{ + struct mlx5_general_caps *gen; + int wqe_size; + int wq_size; + + gen = &dev->mdev->caps.gen; + /* Sanity check RQ size before proceeding */ + if (cap->max_recv_wr > gen->max_wqes) + return -EINVAL; + + if (!has_rq) { + qp->rq.max_gs = 0; + qp->rq.wqe_cnt = 0; + qp->rq.wqe_shift = 0; + } else { + if (ucmd) { + qp->rq.wqe_cnt = ucmd->rq_wqe_count; + qp->rq.wqe_shift = ucmd->rq_wqe_shift; + qp->rq.max_gs = (1 << qp->rq.wqe_shift) / sizeof(struct mlx5_wqe_data_seg) - qp->wq_sig; + qp->rq.max_post = qp->rq.wqe_cnt; + } else { + wqe_size = qp->wq_sig ? sizeof(struct mlx5_wqe_signature_seg) : 0; + wqe_size += cap->max_recv_sge * sizeof(struct mlx5_wqe_data_seg); + wqe_size = roundup_pow_of_two(wqe_size); + wq_size = roundup_pow_of_two(cap->max_recv_wr) * wqe_size; + wq_size = max_t(int, wq_size, MLX5_SEND_WQE_BB); + qp->rq.wqe_cnt = wq_size / wqe_size; + if (wqe_size > gen->max_rq_desc_sz) { + mlx5_ib_dbg(dev, "wqe_size %d, max %d\n", + wqe_size, + gen->max_rq_desc_sz); + return -EINVAL; + } + qp->rq.wqe_shift = ilog2(wqe_size); + qp->rq.max_gs = (1 << qp->rq.wqe_shift) / sizeof(struct mlx5_wqe_data_seg) - qp->wq_sig; + qp->rq.max_post = qp->rq.wqe_cnt; + } + } + + return 0; +} + +static int sq_overhead(enum ib_qp_type qp_type) +{ + int size = 0; + + switch (qp_type) { + case IB_QPT_XRC_INI: + size += sizeof(struct mlx5_wqe_xrc_seg); + /* fall through */ + case IB_QPT_RC: + size += sizeof(struct mlx5_wqe_ctrl_seg) + + sizeof(struct mlx5_wqe_atomic_seg) + + sizeof(struct mlx5_wqe_raddr_seg); + break; + + case IB_QPT_XRC_TGT: + return 0; + + case IB_QPT_UC: + size += sizeof(struct mlx5_wqe_ctrl_seg) + + sizeof(struct mlx5_wqe_raddr_seg) + + sizeof(struct mlx5_wqe_umr_ctrl_seg) + + sizeof(struct mlx5_mkey_seg); + break; + + case IB_QPT_UD: + case IB_QPT_SMI: + case IB_QPT_GSI: + size += sizeof(struct mlx5_wqe_ctrl_seg) + + sizeof(struct mlx5_wqe_datagram_seg); + break; + + case MLX5_IB_QPT_REG_UMR: + size += sizeof(struct mlx5_wqe_ctrl_seg) + + sizeof(struct mlx5_wqe_umr_ctrl_seg) + + sizeof(struct mlx5_mkey_seg); + break; + + default: + return -EINVAL; + } + + return size; +} + +static int calc_send_wqe(struct ib_qp_init_attr *attr) +{ + int inl_size = 0; + int size; + + size = sq_overhead(attr->qp_type); + if (size < 0) + return size; + + if (attr->cap.max_inline_data) { + inl_size = size + sizeof(struct mlx5_wqe_inline_seg) + + attr->cap.max_inline_data; + } + + size += attr->cap.max_send_sge * sizeof(struct mlx5_wqe_data_seg); + if (attr->create_flags & IB_QP_CREATE_SIGNATURE_EN && + ALIGN(max_t(int, inl_size, size), MLX5_SEND_WQE_BB) < MLX5_SIG_WQE_SIZE) + return MLX5_SIG_WQE_SIZE; + else + return ALIGN(max_t(int, inl_size, size), MLX5_SEND_WQE_BB); +} + +static int calc_sq_size(struct mlx5_ib_dev *dev, struct ib_qp_init_attr *attr, + struct mlx5_ib_qp *qp) +{ + struct mlx5_general_caps *gen; + int wqe_size; + int wq_size; + + gen = &dev->mdev->caps.gen; + if (!attr->cap.max_send_wr) + return 0; + + wqe_size = calc_send_wqe(attr); + mlx5_ib_dbg(dev, "wqe_size %d\n", wqe_size); + if (wqe_size < 0) + return wqe_size; + + if (wqe_size > gen->max_sq_desc_sz) { + mlx5_ib_dbg(dev, "wqe_size(%d) > max_sq_desc_sz(%d)\n", + wqe_size, gen->max_sq_desc_sz); + return -EINVAL; + } + + qp->max_inline_data = wqe_size - sq_overhead(attr->qp_type) - + sizeof(struct mlx5_wqe_inline_seg); + attr->cap.max_inline_data = qp->max_inline_data; + + if (attr->create_flags & IB_QP_CREATE_SIGNATURE_EN) + qp->signature_en = true; + + wq_size = roundup_pow_of_two(attr->cap.max_send_wr * wqe_size); + qp->sq.wqe_cnt = wq_size / MLX5_SEND_WQE_BB; + if (qp->sq.wqe_cnt > gen->max_wqes) { + mlx5_ib_dbg(dev, "wqe count(%d) exceeds limits(%d)\n", + qp->sq.wqe_cnt, gen->max_wqes); + return -ENOMEM; + } + qp->sq.wqe_shift = ilog2(MLX5_SEND_WQE_BB); + qp->sq.max_gs = attr->cap.max_send_sge; + qp->sq.max_post = wq_size / wqe_size; + attr->cap.max_send_wr = qp->sq.max_post; + + return wq_size; +} + +static int set_user_buf_size(struct mlx5_ib_dev *dev, + struct mlx5_ib_qp *qp, + struct mlx5_ib_create_qp *ucmd) +{ + struct mlx5_general_caps *gen; + int desc_sz = 1 << qp->sq.wqe_shift; + + gen = &dev->mdev->caps.gen; + if (desc_sz > gen->max_sq_desc_sz) { + mlx5_ib_warn(dev, "desc_sz %d, max_sq_desc_sz %d\n", + desc_sz, gen->max_sq_desc_sz); + return -EINVAL; + } + + if (ucmd->sq_wqe_count && ((1 << ilog2(ucmd->sq_wqe_count)) != ucmd->sq_wqe_count)) { + mlx5_ib_warn(dev, "sq_wqe_count %d, sq_wqe_count %d\n", + ucmd->sq_wqe_count, ucmd->sq_wqe_count); + return -EINVAL; + } + + qp->sq.wqe_cnt = ucmd->sq_wqe_count; + + if (qp->sq.wqe_cnt > gen->max_wqes) { + mlx5_ib_warn(dev, "wqe_cnt %d, max_wqes %d\n", + qp->sq.wqe_cnt, gen->max_wqes); + return -EINVAL; + } + + qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) + + (qp->sq.wqe_cnt << 6); + + return 0; +} + +static int qp_has_rq(struct ib_qp_init_attr *attr) +{ + if (attr->qp_type == IB_QPT_XRC_INI || + attr->qp_type == IB_QPT_XRC_TGT || attr->srq || + attr->qp_type == MLX5_IB_QPT_REG_UMR || + !attr->cap.max_recv_wr) + return 0; + + return 1; +} + +static int first_med_uuar(void) +{ + return 1; +} + +static int next_uuar(int n) +{ + n++; + + while (((n % 4) & 2)) + n++; + + return n; +} + +static int num_med_uuar(struct mlx5_uuar_info *uuari) +{ + int n; + + n = uuari->num_uars * MLX5_NON_FP_BF_REGS_PER_PAGE - + uuari->num_low_latency_uuars - 1; + + return n >= 0 ? n : 0; +} + +static int max_uuari(struct mlx5_uuar_info *uuari) +{ + return uuari->num_uars * 4; +} + +static int first_hi_uuar(struct mlx5_uuar_info *uuari) +{ + int med; + int i; + int t; + + med = num_med_uuar(uuari); + for (t = 0, i = first_med_uuar();; i = next_uuar(i)) { + t++; + if (t == med) + return next_uuar(i); + } + + return 0; +} + +static int alloc_high_class_uuar(struct mlx5_uuar_info *uuari) +{ + int i; + + for (i = first_hi_uuar(uuari); i < max_uuari(uuari); i = next_uuar(i)) { + if (!test_bit(i, uuari->bitmap)) { + set_bit(i, uuari->bitmap); + uuari->count[i]++; + return i; + } + } + + return -ENOMEM; +} + +static int alloc_med_class_uuar(struct mlx5_uuar_info *uuari) +{ + int minidx = first_med_uuar(); + int i; + + for (i = first_med_uuar(); i < first_hi_uuar(uuari); i = next_uuar(i)) { + if (uuari->count[i] < uuari->count[minidx]) + minidx = i; + } + + uuari->count[minidx]++; + return minidx; +} + +static int alloc_uuar(struct mlx5_uuar_info *uuari, + enum mlx5_ib_latency_class lat) +{ + int uuarn = -EINVAL; + + mutex_lock(&uuari->lock); + switch (lat) { + case MLX5_IB_LATENCY_CLASS_LOW: + uuarn = 0; + uuari->count[uuarn]++; + break; + + case MLX5_IB_LATENCY_CLASS_MEDIUM: + if (uuari->ver < 2) + uuarn = -ENOMEM; + else + uuarn = alloc_med_class_uuar(uuari); + break; + + case MLX5_IB_LATENCY_CLASS_HIGH: + if (uuari->ver < 2) + uuarn = -ENOMEM; + else + uuarn = alloc_high_class_uuar(uuari); + break; + + case MLX5_IB_LATENCY_CLASS_FAST_PATH: + uuarn = 2; + break; + } + mutex_unlock(&uuari->lock); + + return uuarn; +} + +static void free_med_class_uuar(struct mlx5_uuar_info *uuari, int uuarn) +{ + clear_bit(uuarn, uuari->bitmap); + --uuari->count[uuarn]; +} + +static void free_high_class_uuar(struct mlx5_uuar_info *uuari, int uuarn) +{ + clear_bit(uuarn, uuari->bitmap); + --uuari->count[uuarn]; +} + +static void free_uuar(struct mlx5_uuar_info *uuari, int uuarn) +{ + int nuuars = uuari->num_uars * MLX5_BF_REGS_PER_PAGE; + int high_uuar = nuuars - uuari->num_low_latency_uuars; + + mutex_lock(&uuari->lock); + if (uuarn == 0) { + --uuari->count[uuarn]; + goto out; + } + + if (uuarn < high_uuar) { + free_med_class_uuar(uuari, uuarn); + goto out; + } + + free_high_class_uuar(uuari, uuarn); + +out: + mutex_unlock(&uuari->lock); +} + +static enum mlx5_qp_state to_mlx5_state(enum ib_qp_state state) +{ + switch (state) { + case IB_QPS_RESET: return MLX5_QP_STATE_RST; + case IB_QPS_INIT: return MLX5_QP_STATE_INIT; + case IB_QPS_RTR: return MLX5_QP_STATE_RTR; + case IB_QPS_RTS: return MLX5_QP_STATE_RTS; + case IB_QPS_SQD: return MLX5_QP_STATE_SQD; + case IB_QPS_SQE: return MLX5_QP_STATE_SQER; + case IB_QPS_ERR: return MLX5_QP_STATE_ERR; + default: return -1; + } +} + +static int to_mlx5_st(enum ib_qp_type type) +{ + switch (type) { + case IB_QPT_RC: return MLX5_QP_ST_RC; + case IB_QPT_UC: return MLX5_QP_ST_UC; + case IB_QPT_UD: return MLX5_QP_ST_UD; + case MLX5_IB_QPT_REG_UMR: return MLX5_QP_ST_REG_UMR; + case IB_QPT_XRC_INI: + case IB_QPT_XRC_TGT: return MLX5_QP_ST_XRC; + case IB_QPT_SMI: return MLX5_QP_ST_QP0; + case IB_QPT_GSI: return MLX5_QP_ST_QP1; + case IB_QPT_RAW_IPV6: return MLX5_QP_ST_RAW_IPV6; + case IB_QPT_RAW_ETHERTYPE: return MLX5_QP_ST_RAW_ETHERTYPE; + case IB_QPT_RAW_PACKET: + case IB_QPT_MAX: + default: return -EINVAL; + } +} + +static int uuarn_to_uar_index(struct mlx5_uuar_info *uuari, int uuarn) +{ + return uuari->uars[uuarn / MLX5_BF_REGS_PER_PAGE].index; +} + +static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, + struct mlx5_ib_qp *qp, struct ib_udata *udata, + struct mlx5_create_qp_mbox_in **in, + struct mlx5_ib_create_qp_resp *resp, int *inlen) +{ + struct mlx5_ib_ucontext *context; + struct mlx5_ib_create_qp ucmd; + int page_shift = 0; + int uar_index; + int npages; + u32 offset = 0; + int uuarn; + int ncont = 0; + int err; + + err = ib_copy_from_udata(&ucmd, udata, sizeof(ucmd)); + if (err) { + mlx5_ib_dbg(dev, "copy failed\n"); + return err; + } + + context = to_mucontext(pd->uobject->context); + /* + * TBD: should come from the verbs when we have the API + */ + uuarn = alloc_uuar(&context->uuari, MLX5_IB_LATENCY_CLASS_HIGH); + if (uuarn < 0) { + mlx5_ib_dbg(dev, "failed to allocate low latency UUAR\n"); + mlx5_ib_dbg(dev, "reverting to medium latency\n"); + uuarn = alloc_uuar(&context->uuari, MLX5_IB_LATENCY_CLASS_MEDIUM); + if (uuarn < 0) { + mlx5_ib_dbg(dev, "failed to allocate medium latency UUAR\n"); + mlx5_ib_dbg(dev, "reverting to high latency\n"); + uuarn = alloc_uuar(&context->uuari, MLX5_IB_LATENCY_CLASS_LOW); + if (uuarn < 0) { + mlx5_ib_warn(dev, "uuar allocation failed\n"); + return uuarn; + } + } + } + + uar_index = uuarn_to_uar_index(&context->uuari, uuarn); + mlx5_ib_dbg(dev, "uuarn 0x%x, uar_index 0x%x\n", uuarn, uar_index); + + qp->rq.offset = 0; + qp->sq.wqe_shift = ilog2(MLX5_SEND_WQE_BB); + qp->sq.offset = qp->rq.wqe_cnt << qp->rq.wqe_shift; + + err = set_user_buf_size(dev, qp, &ucmd); + if (err) + goto err_uuar; + + if (ucmd.buf_addr && qp->buf_size) { + qp->umem = ib_umem_get(pd->uobject->context, ucmd.buf_addr, + qp->buf_size, 0, 0); + if (IS_ERR(qp->umem)) { + mlx5_ib_dbg(dev, "umem_get failed\n"); + err = PTR_ERR(qp->umem); + goto err_uuar; + } + } else { + qp->umem = NULL; + } + + if (qp->umem) { + mlx5_ib_cont_pages(qp->umem, ucmd.buf_addr, &npages, &page_shift, + &ncont, NULL); + err = mlx5_ib_get_buf_offset(ucmd.buf_addr, page_shift, &offset); + if (err) { + mlx5_ib_warn(dev, "bad offset\n"); + goto err_umem; + } + mlx5_ib_dbg(dev, "addr 0x%llx, size %d, npages %d, page_shift %d, ncont %d, offset %d\n", + ucmd.buf_addr, qp->buf_size, npages, page_shift, ncont, offset); + } + + *inlen = sizeof(**in) + sizeof(*(*in)->pas) * ncont; + *in = mlx5_vzalloc(*inlen); + if (!*in) { + err = -ENOMEM; + goto err_umem; + } + if (qp->umem) + mlx5_ib_populate_pas(dev, qp->umem, page_shift, (*in)->pas, 0); + (*in)->ctx.log_pg_sz_remote_qpn = + cpu_to_be32((page_shift - MLX5_ADAPTER_PAGE_SHIFT) << 24); + (*in)->ctx.params2 = cpu_to_be32(offset << 6); + + (*in)->ctx.qp_counter_set_usr_page = cpu_to_be32(uar_index); + resp->uuar_index = uuarn; + qp->uuarn = uuarn; + + err = mlx5_ib_db_map_user(context, ucmd.db_addr, &qp->db); + if (err) { + mlx5_ib_dbg(dev, "map failed\n"); + goto err_free; + } + + err = ib_copy_to_udata(udata, resp, sizeof(*resp)); + if (err) { + mlx5_ib_dbg(dev, "copy failed\n"); + goto err_unmap; + } + qp->create_type = MLX5_QP_USER; + + return 0; + +err_unmap: + mlx5_ib_db_unmap_user(context, &qp->db); + +err_free: + mlx5_vfree(*in); + +err_umem: + if (qp->umem) + ib_umem_release(qp->umem); + +err_uuar: + free_uuar(&context->uuari, uuarn); + return err; +} + +static void destroy_qp_user(struct ib_pd *pd, struct mlx5_ib_qp *qp) +{ + struct mlx5_ib_ucontext *context; + + context = to_mucontext(pd->uobject->context); + mlx5_ib_db_unmap_user(context, &qp->db); + if (qp->umem) + ib_umem_release(qp->umem); + free_uuar(&context->uuari, qp->uuarn); +} + +static int create_kernel_qp(struct mlx5_ib_dev *dev, + struct ib_qp_init_attr *init_attr, + struct mlx5_ib_qp *qp, + struct mlx5_create_qp_mbox_in **in, int *inlen) +{ + enum mlx5_ib_latency_class lc = MLX5_IB_LATENCY_CLASS_LOW; + struct mlx5_uuar_info *uuari; + int uar_index; + int uuarn; + int err; + + uuari = &dev->mdev->priv.uuari; + if (init_attr->create_flags & ~(IB_QP_CREATE_SIGNATURE_EN | IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK)) + return -EINVAL; + + if (init_attr->qp_type == MLX5_IB_QPT_REG_UMR) + lc = MLX5_IB_LATENCY_CLASS_FAST_PATH; + + uuarn = alloc_uuar(uuari, lc); + if (uuarn < 0) { + mlx5_ib_dbg(dev, "\n"); + return -ENOMEM; + } + + qp->bf = &uuari->bfs[uuarn]; + uar_index = qp->bf->uar->index; + + err = calc_sq_size(dev, init_attr, qp); + if (err < 0) { + mlx5_ib_dbg(dev, "err %d\n", err); + goto err_uuar; + } + + qp->rq.offset = 0; + qp->sq.offset = qp->rq.wqe_cnt << qp->rq.wqe_shift; + qp->buf_size = err + (qp->rq.wqe_cnt << qp->rq.wqe_shift); + + err = mlx5_buf_alloc(dev->mdev, qp->buf_size, PAGE_SIZE * 2, &qp->buf); + if (err) { + mlx5_ib_dbg(dev, "err %d\n", err); + goto err_uuar; + } + + qp->sq.qend = mlx5_get_send_wqe(qp, qp->sq.wqe_cnt); + *inlen = sizeof(**in) + sizeof(*(*in)->pas) * qp->buf.npages; + *in = mlx5_vzalloc(*inlen); + if (!*in) { + err = -ENOMEM; + goto err_buf; + } + (*in)->ctx.qp_counter_set_usr_page = cpu_to_be32(uar_index); + (*in)->ctx.log_pg_sz_remote_qpn = + cpu_to_be32((qp->buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT) << 24); + /* Set "fast registration enabled" for all kernel QPs */ + (*in)->ctx.params1 |= cpu_to_be32(1 << 11); + (*in)->ctx.sq_crq_size |= cpu_to_be16(1 << 4); + + mlx5_fill_page_array(&qp->buf, (*in)->pas); + + err = mlx5_db_alloc(dev->mdev, &qp->db); + if (err) { + mlx5_ib_dbg(dev, "err %d\n", err); + goto err_free; + } + + qp->db.db[0] = 0; + qp->db.db[1] = 0; + + qp->sq.wrid = kmalloc(qp->sq.wqe_cnt * sizeof(*qp->sq.wrid), GFP_KERNEL); + qp->sq.wr_data = kmalloc(qp->sq.wqe_cnt * sizeof(*qp->sq.wr_data), GFP_KERNEL); + qp->rq.wrid = kmalloc(qp->rq.wqe_cnt * sizeof(*qp->rq.wrid), GFP_KERNEL); + qp->sq.w_list = kmalloc(qp->sq.wqe_cnt * sizeof(*qp->sq.w_list), GFP_KERNEL); + qp->sq.wqe_head = kmalloc(qp->sq.wqe_cnt * sizeof(*qp->sq.wqe_head), GFP_KERNEL); + + if (!qp->sq.wrid || !qp->sq.wr_data || !qp->rq.wrid || + !qp->sq.w_list || !qp->sq.wqe_head) { + err = -ENOMEM; + goto err_wrid; + } + qp->create_type = MLX5_QP_KERNEL; + + return 0; + +err_wrid: + mlx5_db_free(dev->mdev, &qp->db); + kfree(qp->sq.wqe_head); + kfree(qp->sq.w_list); + kfree(qp->sq.wrid); + kfree(qp->sq.wr_data); + kfree(qp->rq.wrid); + +err_free: + mlx5_vfree(*in); + +err_buf: + mlx5_buf_free(dev->mdev, &qp->buf); + +err_uuar: + free_uuar(&dev->mdev->priv.uuari, uuarn); + return err; +} + +static void destroy_qp_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp) +{ + mlx5_db_free(dev->mdev, &qp->db); + kfree(qp->sq.wqe_head); + kfree(qp->sq.w_list); + kfree(qp->sq.wrid); + kfree(qp->sq.wr_data); + kfree(qp->rq.wrid); + mlx5_buf_free(dev->mdev, &qp->buf); + free_uuar(&dev->mdev->priv.uuari, qp->bf->uuarn); +} + +static __be32 get_rx_type(struct mlx5_ib_qp *qp, struct ib_qp_init_attr *attr) +{ + if (attr->srq || (attr->qp_type == IB_QPT_XRC_TGT) || + (attr->qp_type == IB_QPT_XRC_INI)) + return cpu_to_be32(MLX5_SRQ_RQ); + else if (!qp->has_rq) + return cpu_to_be32(MLX5_ZERO_LEN_RQ); + else + return cpu_to_be32(MLX5_NON_ZERO_RQ); +} + +static int is_connected(enum ib_qp_type qp_type) +{ + if (qp_type == IB_QPT_RC || qp_type == IB_QPT_UC) + return 1; + + return 0; +} + +static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, + struct ib_qp_init_attr *init_attr, + struct ib_udata *udata, struct mlx5_ib_qp *qp) +{ + struct mlx5_ib_resources *devr = &dev->devr; + struct mlx5_ib_create_qp_resp resp; + struct mlx5_create_qp_mbox_in *in; + struct mlx5_general_caps *gen; + struct mlx5_ib_create_qp ucmd; + int inlen = sizeof(*in); + int err; + + gen = &dev->mdev->caps.gen; + mutex_init(&qp->mutex); + spin_lock_init(&qp->sq.lock); + spin_lock_init(&qp->rq.lock); + + if (init_attr->create_flags & IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK) { + if (!(gen->flags & MLX5_DEV_CAP_FLAG_BLOCK_MCAST)) { + mlx5_ib_dbg(dev, "block multicast loopback isn't supported\n"); + return -EINVAL; + } else { + qp->flags |= MLX5_IB_QP_BLOCK_MULTICAST_LOOPBACK; + } + } + + if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR) + qp->sq_signal_bits = MLX5_WQE_CTRL_CQ_UPDATE; + + if (pd && pd->uobject) { + if (ib_copy_from_udata(&ucmd, udata, sizeof(ucmd))) { + mlx5_ib_dbg(dev, "copy failed\n"); + return -EFAULT; + } + + qp->wq_sig = !!(ucmd.flags & MLX5_QP_FLAG_SIGNATURE); + qp->scat_cqe = !!(ucmd.flags & MLX5_QP_FLAG_SCATTER_CQE); + } else { + qp->wq_sig = !!wq_signature; + } + + qp->has_rq = qp_has_rq(init_attr); + err = set_rq_size(dev, &init_attr->cap, qp->has_rq, + qp, (pd && pd->uobject) ? &ucmd : NULL); + if (err) { + mlx5_ib_dbg(dev, "err %d\n", err); + return err; + } + + if (pd) { + if (pd->uobject) { + mlx5_ib_dbg(dev, "requested sq_wqe_count (%d)\n", ucmd.sq_wqe_count); + if (ucmd.rq_wqe_shift != qp->rq.wqe_shift || + ucmd.rq_wqe_count != qp->rq.wqe_cnt) { + mlx5_ib_dbg(dev, "invalid rq params\n"); + return -EINVAL; + } + if (ucmd.sq_wqe_count > gen->max_wqes) { + mlx5_ib_dbg(dev, "requested sq_wqe_count (%d) > max allowed (%d)\n", + ucmd.sq_wqe_count, gen->max_wqes); + return -EINVAL; + } + err = create_user_qp(dev, pd, qp, udata, &in, &resp, &inlen); + if (err) + mlx5_ib_dbg(dev, "err %d\n", err); + } else { + err = create_kernel_qp(dev, init_attr, qp, &in, &inlen); + if (err) + mlx5_ib_dbg(dev, "err %d\n", err); + else + qp->pa_lkey = to_mpd(pd)->pa_lkey; + } + + if (err) + return err; + } else { + in = mlx5_vzalloc(sizeof(*in)); + if (!in) + return -ENOMEM; + + qp->create_type = MLX5_QP_EMPTY; + } + + if (is_sqp(init_attr->qp_type)) + qp->port = init_attr->port_num; + + in->ctx.flags = cpu_to_be32(to_mlx5_st(init_attr->qp_type) << 16 | + MLX5_QP_PM_MIGRATED << 11); + + if (init_attr->qp_type != MLX5_IB_QPT_REG_UMR) + in->ctx.flags_pd = cpu_to_be32(to_mpd(pd ? pd : devr->p0)->pdn); + else + in->ctx.flags_pd = cpu_to_be32(MLX5_QP_LAT_SENSITIVE); + + if (qp->wq_sig) + in->ctx.flags_pd |= cpu_to_be32(MLX5_QP_ENABLE_SIG); + + if (qp->flags & MLX5_IB_QP_BLOCK_MULTICAST_LOOPBACK) + in->ctx.flags_pd |= cpu_to_be32(MLX5_QP_BLOCK_MCAST); + + if (qp->scat_cqe && is_connected(init_attr->qp_type)) { + int rcqe_sz; + int scqe_sz; + + rcqe_sz = mlx5_ib_get_cqe_size(dev, init_attr->recv_cq); + scqe_sz = mlx5_ib_get_cqe_size(dev, init_attr->send_cq); + + if (rcqe_sz == 128) + in->ctx.cs_res = MLX5_RES_SCAT_DATA64_CQE; + else + in->ctx.cs_res = MLX5_RES_SCAT_DATA32_CQE; + + if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR) { + if (scqe_sz == 128) + in->ctx.cs_req = MLX5_REQ_SCAT_DATA64_CQE; + else + in->ctx.cs_req = MLX5_REQ_SCAT_DATA32_CQE; + } + } + + if (qp->rq.wqe_cnt) { + in->ctx.rq_size_stride = (qp->rq.wqe_shift - 4); + in->ctx.rq_size_stride |= ilog2(qp->rq.wqe_cnt) << 3; + } + + in->ctx.rq_type_srqn = get_rx_type(qp, init_attr); + + if (qp->sq.wqe_cnt) + in->ctx.sq_crq_size |= cpu_to_be16(ilog2(qp->sq.wqe_cnt) << 11); + else + in->ctx.sq_crq_size |= cpu_to_be16(0x8000); + + /* Set default resources */ + switch (init_attr->qp_type) { + case IB_QPT_XRC_TGT: + in->ctx.cqn_recv = cpu_to_be32(to_mcq(devr->c0)->mcq.cqn); + in->ctx.cqn_send = cpu_to_be32(to_mcq(devr->c0)->mcq.cqn); + in->ctx.rq_type_srqn |= cpu_to_be32(to_msrq(devr->s0)->msrq.srqn); + in->ctx.xrcd = cpu_to_be32(to_mxrcd(init_attr->xrcd)->xrcdn); + break; + case IB_QPT_XRC_INI: + in->ctx.cqn_recv = cpu_to_be32(to_mcq(devr->c0)->mcq.cqn); + in->ctx.xrcd = cpu_to_be32(to_mxrcd(devr->x1)->xrcdn); + in->ctx.rq_type_srqn |= cpu_to_be32(to_msrq(devr->s0)->msrq.srqn); + break; + default: + if (init_attr->srq) { + in->ctx.xrcd = cpu_to_be32(to_mxrcd(devr->x0)->xrcdn); + in->ctx.rq_type_srqn |= cpu_to_be32(to_msrq(init_attr->srq)->msrq.srqn); + } else { + in->ctx.xrcd = cpu_to_be32(to_mxrcd(devr->x1)->xrcdn); + in->ctx.rq_type_srqn |= cpu_to_be32(to_msrq(devr->s0)->msrq.srqn); + } + } + + if (init_attr->send_cq) + in->ctx.cqn_send = cpu_to_be32(to_mcq(init_attr->send_cq)->mcq.cqn); + + if (init_attr->recv_cq) + in->ctx.cqn_recv = cpu_to_be32(to_mcq(init_attr->recv_cq)->mcq.cqn); + + in->ctx.db_rec_addr = cpu_to_be64(qp->db.dma); + + err = mlx5_core_create_qp(dev->mdev, &qp->mqp, in, inlen); + if (err) { + mlx5_ib_dbg(dev, "create qp failed\n"); + goto err_create; + } + + mlx5_vfree(in); + /* Hardware wants QPN written in big-endian order (after + * shifting) for send doorbell. Precompute this value to save + * a little bit when posting sends. + */ + qp->doorbell_qpn = swab32(qp->mqp.qpn << 8); + + qp->mqp.event = mlx5_ib_qp_event; + + return 0; + +err_create: + if (qp->create_type == MLX5_QP_USER) + destroy_qp_user(pd, qp); + else if (qp->create_type == MLX5_QP_KERNEL) + destroy_qp_kernel(dev, qp); + + mlx5_vfree(in); + return err; +} + +static void mlx5_ib_lock_cqs(struct mlx5_ib_cq *send_cq, struct mlx5_ib_cq *recv_cq) + __acquires(&send_cq->lock) __acquires(&recv_cq->lock) +{ + if (send_cq) { + if (recv_cq) { + if (send_cq->mcq.cqn < recv_cq->mcq.cqn) { + spin_lock_irq(&send_cq->lock); + spin_lock_nested(&recv_cq->lock, + SINGLE_DEPTH_NESTING); + } else if (send_cq->mcq.cqn == recv_cq->mcq.cqn) { + spin_lock_irq(&send_cq->lock); + __acquire(&recv_cq->lock); + } else { + spin_lock_irq(&recv_cq->lock); + spin_lock_nested(&send_cq->lock, + SINGLE_DEPTH_NESTING); + } + } else { + spin_lock_irq(&send_cq->lock); + } + } else if (recv_cq) { + spin_lock_irq(&recv_cq->lock); + } +} + +static void mlx5_ib_unlock_cqs(struct mlx5_ib_cq *send_cq, struct mlx5_ib_cq *recv_cq) + __releases(&send_cq->lock) __releases(&recv_cq->lock) +{ + if (send_cq) { + if (recv_cq) { + if (send_cq->mcq.cqn < recv_cq->mcq.cqn) { + spin_unlock(&recv_cq->lock); + spin_unlock_irq(&send_cq->lock); + } else if (send_cq->mcq.cqn == recv_cq->mcq.cqn) { + __release(&recv_cq->lock); + spin_unlock_irq(&send_cq->lock); + } else { + spin_unlock(&send_cq->lock); + spin_unlock_irq(&recv_cq->lock); + } + } else { + spin_unlock_irq(&send_cq->lock); + } + } else if (recv_cq) { + spin_unlock_irq(&recv_cq->lock); + } +} + +static struct mlx5_ib_pd *get_pd(struct mlx5_ib_qp *qp) +{ + return to_mpd(qp->ibqp.pd); +} + +static void get_cqs(struct mlx5_ib_qp *qp, + struct mlx5_ib_cq **send_cq, struct mlx5_ib_cq **recv_cq) +{ + switch (qp->ibqp.qp_type) { + case IB_QPT_XRC_TGT: + *send_cq = NULL; + *recv_cq = NULL; + break; + case MLX5_IB_QPT_REG_UMR: + case IB_QPT_XRC_INI: + *send_cq = to_mcq(qp->ibqp.send_cq); + *recv_cq = NULL; + break; + + case IB_QPT_SMI: + case IB_QPT_GSI: + case IB_QPT_RC: + case IB_QPT_UC: + case IB_QPT_UD: + case IB_QPT_RAW_IPV6: + case IB_QPT_RAW_ETHERTYPE: + *send_cq = to_mcq(qp->ibqp.send_cq); + *recv_cq = to_mcq(qp->ibqp.recv_cq); + break; + + case IB_QPT_RAW_PACKET: + case IB_QPT_MAX: + default: + *send_cq = NULL; + *recv_cq = NULL; + break; + } +} + +static void destroy_qp_common(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp) +{ + struct mlx5_ib_cq *send_cq, *recv_cq; + struct mlx5_modify_qp_mbox_in *in; + int err; + + in = kzalloc(sizeof(*in), GFP_KERNEL); + if (!in) + return; + if (qp->state != IB_QPS_RESET) + if (mlx5_core_qp_modify(dev->mdev, to_mlx5_state(qp->state), + MLX5_QP_STATE_RST, in, sizeof(*in), &qp->mqp)) + mlx5_ib_warn(dev, "mlx5_ib: modify QP %06x to RESET failed\n", + qp->mqp.qpn); + + get_cqs(qp, &send_cq, &recv_cq); + + if (qp->create_type == MLX5_QP_KERNEL) { + mlx5_ib_lock_cqs(send_cq, recv_cq); + __mlx5_ib_cq_clean(recv_cq, qp->mqp.qpn, + qp->ibqp.srq ? to_msrq(qp->ibqp.srq) : NULL); + if (send_cq != recv_cq) + __mlx5_ib_cq_clean(send_cq, qp->mqp.qpn, NULL); + mlx5_ib_unlock_cqs(send_cq, recv_cq); + } + + err = mlx5_core_destroy_qp(dev->mdev, &qp->mqp); + if (err) + mlx5_ib_warn(dev, "failed to destroy QP 0x%x\n", qp->mqp.qpn); + kfree(in); + + + if (qp->create_type == MLX5_QP_KERNEL) + destroy_qp_kernel(dev, qp); + else if (qp->create_type == MLX5_QP_USER) + destroy_qp_user(&get_pd(qp)->ibpd, qp); +} + +static const char *ib_qp_type_str(enum ib_qp_type type) +{ + switch (type) { + case IB_QPT_SMI: + return "IB_QPT_SMI"; + case IB_QPT_GSI: + return "IB_QPT_GSI"; + case IB_QPT_RC: + return "IB_QPT_RC"; + case IB_QPT_UC: + return "IB_QPT_UC"; + case IB_QPT_UD: + return "IB_QPT_UD"; + case IB_QPT_RAW_IPV6: + return "IB_QPT_RAW_IPV6"; + case IB_QPT_RAW_ETHERTYPE: + return "IB_QPT_RAW_ETHERTYPE"; + case IB_QPT_XRC_INI: + return "IB_QPT_XRC_INI"; + case IB_QPT_XRC_TGT: + return "IB_QPT_XRC_TGT"; + case IB_QPT_RAW_PACKET: + return "IB_QPT_RAW_PACKET"; + case MLX5_IB_QPT_REG_UMR: + return "MLX5_IB_QPT_REG_UMR"; + case IB_QPT_MAX: + default: + return "Invalid QP type"; + } +} + +struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, + struct ib_qp_init_attr *init_attr, + struct ib_udata *udata) +{ + struct mlx5_general_caps *gen; + struct mlx5_ib_dev *dev; + struct mlx5_ib_qp *qp; + u16 xrcdn = 0; + int err; + + if (pd) { + dev = to_mdev(pd->device); + } else { + /* being cautious here */ + if (init_attr->qp_type != IB_QPT_XRC_TGT && + init_attr->qp_type != MLX5_IB_QPT_REG_UMR) { + pr_warn("%s: no PD for transport %s\n", __func__, + ib_qp_type_str(init_attr->qp_type)); + return ERR_PTR(-EINVAL); + } + dev = to_mdev(to_mxrcd(init_attr->xrcd)->ibxrcd.device); + } + gen = &dev->mdev->caps.gen; + + switch (init_attr->qp_type) { + case IB_QPT_XRC_TGT: + case IB_QPT_XRC_INI: + if (!(gen->flags & MLX5_DEV_CAP_FLAG_XRC)) { + mlx5_ib_dbg(dev, "XRC not supported\n"); + return ERR_PTR(-ENOSYS); + } + init_attr->recv_cq = NULL; + if (init_attr->qp_type == IB_QPT_XRC_TGT) { + xrcdn = to_mxrcd(init_attr->xrcd)->xrcdn; + init_attr->send_cq = NULL; + } + + /* fall through */ + case IB_QPT_RC: + case IB_QPT_UC: + case IB_QPT_UD: + case IB_QPT_SMI: + case IB_QPT_GSI: + case MLX5_IB_QPT_REG_UMR: + qp = kzalloc(sizeof(*qp), GFP_KERNEL); + if (!qp) + return ERR_PTR(-ENOMEM); + + err = create_qp_common(dev, pd, init_attr, udata, qp); + if (err) { + mlx5_ib_dbg(dev, "create_qp_common failed\n"); + kfree(qp); + return ERR_PTR(err); + } + + if (is_qp0(init_attr->qp_type)) + qp->ibqp.qp_num = 0; + else if (is_qp1(init_attr->qp_type)) + qp->ibqp.qp_num = 1; + else + qp->ibqp.qp_num = qp->mqp.qpn; + + mlx5_ib_dbg(dev, "ib qpnum 0x%x, mlx qpn 0x%x, rcqn 0x%x, scqn 0x%x\n", + qp->ibqp.qp_num, qp->mqp.qpn, to_mcq(init_attr->recv_cq)->mcq.cqn, + to_mcq(init_attr->send_cq)->mcq.cqn); + + qp->xrcdn = xrcdn; + + break; + + case IB_QPT_RAW_IPV6: + case IB_QPT_RAW_ETHERTYPE: + case IB_QPT_RAW_PACKET: + case IB_QPT_MAX: + default: + mlx5_ib_dbg(dev, "unsupported qp type %d\n", + init_attr->qp_type); + /* Don't support raw QPs */ + return ERR_PTR(-EINVAL); + } + + return &qp->ibqp; +} + +int mlx5_ib_destroy_qp(struct ib_qp *qp) +{ + struct mlx5_ib_dev *dev = to_mdev(qp->device); + struct mlx5_ib_qp *mqp = to_mqp(qp); + + destroy_qp_common(dev, mqp); + + kfree(mqp); + + return 0; +} + +static __be32 to_mlx5_access_flags(struct mlx5_ib_qp *qp, const struct ib_qp_attr *attr, + int attr_mask) +{ + u32 hw_access_flags = 0; + u8 dest_rd_atomic; + u32 access_flags; + + if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) + dest_rd_atomic = attr->max_dest_rd_atomic; + else + dest_rd_atomic = qp->resp_depth; + + if (attr_mask & IB_QP_ACCESS_FLAGS) + access_flags = attr->qp_access_flags; + else + access_flags = qp->atomic_rd_en; + + if (!dest_rd_atomic) + access_flags &= IB_ACCESS_REMOTE_WRITE; + + if (access_flags & IB_ACCESS_REMOTE_READ) + hw_access_flags |= MLX5_QP_BIT_RRE; + if (access_flags & IB_ACCESS_REMOTE_ATOMIC) + hw_access_flags |= (MLX5_QP_BIT_RAE | MLX5_ATOMIC_MODE_CX); + if (access_flags & IB_ACCESS_REMOTE_WRITE) + hw_access_flags |= MLX5_QP_BIT_RWE; + + return cpu_to_be32(hw_access_flags); +} + +enum { + MLX5_PATH_FLAG_FL = 1 << 0, + MLX5_PATH_FLAG_FREE_AR = 1 << 1, + MLX5_PATH_FLAG_COUNTER = 1 << 2, +}; + +static int ib_rate_to_mlx5(struct mlx5_ib_dev *dev, u8 rate) +{ + struct mlx5_general_caps *gen; + + gen = &dev->mdev->caps.gen; + if (rate == IB_RATE_PORT_CURRENT) { + return 0; + } else if (rate < IB_RATE_2_5_GBPS || rate > IB_RATE_300_GBPS) { + return -EINVAL; + } else { + while (rate != IB_RATE_2_5_GBPS && + !(1 << (rate + MLX5_STAT_RATE_OFFSET) & + gen->stat_rate_support)) + --rate; + } + + return rate + MLX5_STAT_RATE_OFFSET; +} + +static int mlx5_set_path(struct mlx5_ib_dev *dev, const struct ib_ah_attr *ah, + struct mlx5_qp_path *path, u8 port, int attr_mask, + u32 path_flags, const struct ib_qp_attr *attr) +{ + struct mlx5_general_caps *gen; + int err; + + gen = &dev->mdev->caps.gen; + path->fl = (path_flags & MLX5_PATH_FLAG_FL) ? 0x80 : 0; + path->free_ar = (path_flags & MLX5_PATH_FLAG_FREE_AR) ? 0x80 : 0; + + if (attr_mask & IB_QP_PKEY_INDEX) + path->pkey_index = attr->pkey_index; + + path->grh_mlid = ah->src_path_bits & 0x7f; + path->rlid = cpu_to_be16(ah->dlid); + + if (ah->ah_flags & IB_AH_GRH) { + if (ah->grh.sgid_index >= gen->port[port - 1].gid_table_len) { + pr_err(KERN_ERR "sgid_index (%u) too large. max is %d\n", + ah->grh.sgid_index, gen->port[port - 1].gid_table_len); + return -EINVAL; + } + path->grh_mlid |= 1 << 7; + path->mgid_index = ah->grh.sgid_index; + path->hop_limit = ah->grh.hop_limit; + path->tclass_flowlabel = + cpu_to_be32((ah->grh.traffic_class << 20) | + (ah->grh.flow_label)); + memcpy(path->rgid, ah->grh.dgid.raw, 16); + } + + err = ib_rate_to_mlx5(dev, ah->static_rate); + if (err < 0) + return err; + path->static_rate = err; + path->port = port; + + if (attr_mask & IB_QP_TIMEOUT) + path->ackto_lt = attr->timeout << 3; + + path->sl = ah->sl & 0xf; + + return 0; +} + +static enum mlx5_qp_optpar opt_mask[MLX5_QP_NUM_STATE][MLX5_QP_NUM_STATE][MLX5_QP_ST_MAX] = { + [MLX5_QP_STATE_INIT] = { + [MLX5_QP_STATE_INIT] = { + [MLX5_QP_ST_RC] = MLX5_QP_OPTPAR_RRE | + MLX5_QP_OPTPAR_RAE | + MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_PKEY_INDEX | + MLX5_QP_OPTPAR_PRI_PORT, + [MLX5_QP_ST_UC] = MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_PKEY_INDEX | + MLX5_QP_OPTPAR_PRI_PORT, + [MLX5_QP_ST_UD] = MLX5_QP_OPTPAR_PKEY_INDEX | + MLX5_QP_OPTPAR_Q_KEY | + MLX5_QP_OPTPAR_PRI_PORT, + }, + [MLX5_QP_STATE_RTR] = { + [MLX5_QP_ST_RC] = MLX5_QP_OPTPAR_ALT_ADDR_PATH | + MLX5_QP_OPTPAR_RRE | + MLX5_QP_OPTPAR_RAE | + MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_PKEY_INDEX, + [MLX5_QP_ST_UC] = MLX5_QP_OPTPAR_ALT_ADDR_PATH | + MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_PKEY_INDEX, + [MLX5_QP_ST_UD] = MLX5_QP_OPTPAR_PKEY_INDEX | + MLX5_QP_OPTPAR_Q_KEY, + [MLX5_QP_ST_MLX] = MLX5_QP_OPTPAR_PKEY_INDEX | + MLX5_QP_OPTPAR_Q_KEY, + [MLX5_QP_ST_XRC] = MLX5_QP_OPTPAR_ALT_ADDR_PATH | + MLX5_QP_OPTPAR_RRE | + MLX5_QP_OPTPAR_RAE | + MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_PKEY_INDEX, + }, + }, + [MLX5_QP_STATE_RTR] = { + [MLX5_QP_STATE_RTS] = { + [MLX5_QP_ST_RC] = MLX5_QP_OPTPAR_ALT_ADDR_PATH | + MLX5_QP_OPTPAR_RRE | + MLX5_QP_OPTPAR_RAE | + MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_PM_STATE | + MLX5_QP_OPTPAR_RNR_TIMEOUT, + [MLX5_QP_ST_UC] = MLX5_QP_OPTPAR_ALT_ADDR_PATH | + MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_PM_STATE, + [MLX5_QP_ST_UD] = MLX5_QP_OPTPAR_Q_KEY, + }, + }, + [MLX5_QP_STATE_RTS] = { + [MLX5_QP_STATE_RTS] = { + [MLX5_QP_ST_RC] = MLX5_QP_OPTPAR_RRE | + MLX5_QP_OPTPAR_RAE | + MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_RNR_TIMEOUT | + MLX5_QP_OPTPAR_PM_STATE | + MLX5_QP_OPTPAR_ALT_ADDR_PATH, + [MLX5_QP_ST_UC] = MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_PM_STATE | + MLX5_QP_OPTPAR_ALT_ADDR_PATH, + [MLX5_QP_ST_UD] = MLX5_QP_OPTPAR_Q_KEY | + MLX5_QP_OPTPAR_SRQN | + MLX5_QP_OPTPAR_CQN_RCV, + }, + }, + [MLX5_QP_STATE_SQER] = { + [MLX5_QP_STATE_RTS] = { + [MLX5_QP_ST_UD] = MLX5_QP_OPTPAR_Q_KEY, + [MLX5_QP_ST_MLX] = MLX5_QP_OPTPAR_Q_KEY, + [MLX5_QP_ST_UC] = MLX5_QP_OPTPAR_RWE, + [MLX5_QP_ST_RC] = MLX5_QP_OPTPAR_RNR_TIMEOUT | + MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_RAE | + MLX5_QP_OPTPAR_RRE, + }, + }, +}; + +static int ib_nr_to_mlx5_nr(int ib_mask) +{ + switch (ib_mask) { + case IB_QP_STATE: + return 0; + case IB_QP_CUR_STATE: + return 0; + case IB_QP_EN_SQD_ASYNC_NOTIFY: + return 0; + case IB_QP_ACCESS_FLAGS: + return MLX5_QP_OPTPAR_RWE | MLX5_QP_OPTPAR_RRE | + MLX5_QP_OPTPAR_RAE; + case IB_QP_PKEY_INDEX: + return MLX5_QP_OPTPAR_PKEY_INDEX; + case IB_QP_PORT: + return MLX5_QP_OPTPAR_PRI_PORT; + case IB_QP_QKEY: + return MLX5_QP_OPTPAR_Q_KEY; + case IB_QP_AV: + return MLX5_QP_OPTPAR_PRIMARY_ADDR_PATH | + MLX5_QP_OPTPAR_PRI_PORT; + case IB_QP_PATH_MTU: + return 0; + case IB_QP_TIMEOUT: + return MLX5_QP_OPTPAR_ACK_TIMEOUT; + case IB_QP_RETRY_CNT: + return MLX5_QP_OPTPAR_RETRY_COUNT; + case IB_QP_RNR_RETRY: + return MLX5_QP_OPTPAR_RNR_RETRY; + case IB_QP_RQ_PSN: + return 0; + case IB_QP_MAX_QP_RD_ATOMIC: + return MLX5_QP_OPTPAR_SRA_MAX; + case IB_QP_ALT_PATH: + return MLX5_QP_OPTPAR_ALT_ADDR_PATH; + case IB_QP_MIN_RNR_TIMER: + return MLX5_QP_OPTPAR_RNR_TIMEOUT; + case IB_QP_SQ_PSN: + return 0; + case IB_QP_MAX_DEST_RD_ATOMIC: + return MLX5_QP_OPTPAR_RRA_MAX | MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_RRE | MLX5_QP_OPTPAR_RAE; + case IB_QP_PATH_MIG_STATE: + return MLX5_QP_OPTPAR_PM_STATE; + case IB_QP_CAP: + return 0; + case IB_QP_DEST_QPN: + return 0; + } + return 0; +} + +static int ib_mask_to_mlx5_opt(int ib_mask) +{ + int result = 0; + int i; + + for (i = 0; i < 8 * sizeof(int); i++) { + if ((1 << i) & ib_mask) + result |= ib_nr_to_mlx5_nr(1 << i); + } + + return result; +} + +static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, + const struct ib_qp_attr *attr, int attr_mask, + enum ib_qp_state cur_state, enum ib_qp_state new_state) +{ + struct mlx5_ib_dev *dev = to_mdev(ibqp->device); + struct mlx5_ib_qp *qp = to_mqp(ibqp); + struct mlx5_ib_cq *send_cq, *recv_cq; + struct mlx5_qp_context *context; + struct mlx5_general_caps *gen; + struct mlx5_modify_qp_mbox_in *in; + struct mlx5_ib_pd *pd; + enum mlx5_qp_state mlx5_cur, mlx5_new; + enum mlx5_qp_optpar optpar; + int sqd_event; + int mlx5_st; + int err; + + gen = &dev->mdev->caps.gen; + in = kzalloc(sizeof(*in), GFP_KERNEL); + if (!in) + return -ENOMEM; + + context = &in->ctx; + err = to_mlx5_st(ibqp->qp_type); + if (err < 0) + goto out; + + context->flags = cpu_to_be32(err << 16); + + if (!(attr_mask & IB_QP_PATH_MIG_STATE)) { + context->flags |= cpu_to_be32(MLX5_QP_PM_MIGRATED << 11); + } else { + switch (attr->path_mig_state) { + case IB_MIG_MIGRATED: + context->flags |= cpu_to_be32(MLX5_QP_PM_MIGRATED << 11); + break; + case IB_MIG_REARM: + context->flags |= cpu_to_be32(MLX5_QP_PM_REARM << 11); + break; + case IB_MIG_ARMED: + context->flags |= cpu_to_be32(MLX5_QP_PM_ARMED << 11); + break; + } + } + + if (ibqp->qp_type == IB_QPT_GSI || ibqp->qp_type == IB_QPT_SMI) { + context->mtu_msgmax = (IB_MTU_256 << 5) | 8; + } else if (ibqp->qp_type == IB_QPT_UD || + ibqp->qp_type == MLX5_IB_QPT_REG_UMR) { + context->mtu_msgmax = (IB_MTU_4096 << 5) | 12; + } else if (attr_mask & IB_QP_PATH_MTU) { + if (attr->path_mtu < IB_MTU_256 || + attr->path_mtu > IB_MTU_4096) { + mlx5_ib_warn(dev, "invalid mtu %d\n", attr->path_mtu); + err = -EINVAL; + goto out; + } + context->mtu_msgmax = (attr->path_mtu << 5) | gen->log_max_msg; + } + + if (attr_mask & IB_QP_DEST_QPN) + context->log_pg_sz_remote_qpn = cpu_to_be32(attr->dest_qp_num); + + if (attr_mask & IB_QP_PKEY_INDEX) + context->pri_path.pkey_index = attr->pkey_index; + + /* todo implement counter_index functionality */ + + if (is_sqp(ibqp->qp_type)) + context->pri_path.port = qp->port; + + if (attr_mask & IB_QP_PORT) + context->pri_path.port = attr->port_num; + + if (attr_mask & IB_QP_AV) { + err = mlx5_set_path(dev, &attr->ah_attr, &context->pri_path, + attr_mask & IB_QP_PORT ? attr->port_num : qp->port, + attr_mask, 0, attr); + if (err) + goto out; + } + + if (attr_mask & IB_QP_TIMEOUT) + context->pri_path.ackto_lt |= attr->timeout << 3; + + if (attr_mask & IB_QP_ALT_PATH) { + err = mlx5_set_path(dev, &attr->alt_ah_attr, &context->alt_path, + attr->alt_port_num, attr_mask, 0, attr); + if (err) + goto out; + } + + pd = get_pd(qp); + get_cqs(qp, &send_cq, &recv_cq); + + context->flags_pd = cpu_to_be32(pd ? pd->pdn : to_mpd(dev->devr.p0)->pdn); + context->cqn_send = send_cq ? cpu_to_be32(send_cq->mcq.cqn) : 0; + context->cqn_recv = recv_cq ? cpu_to_be32(recv_cq->mcq.cqn) : 0; + context->params1 = cpu_to_be32(MLX5_IB_ACK_REQ_FREQ << 28); + + if (attr_mask & IB_QP_RNR_RETRY) + context->params1 |= cpu_to_be32(attr->rnr_retry << 13); + + if (attr_mask & IB_QP_RETRY_CNT) + context->params1 |= cpu_to_be32(attr->retry_cnt << 16); + + if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC) { + if (attr->max_rd_atomic) + context->params1 |= + cpu_to_be32(fls(attr->max_rd_atomic - 1) << 21); + } + + if (attr_mask & IB_QP_SQ_PSN) + context->next_send_psn = cpu_to_be32(attr->sq_psn); + + if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) { + if (attr->max_dest_rd_atomic) + context->params2 |= + cpu_to_be32(fls(attr->max_dest_rd_atomic - 1) << 21); + } + + if (attr_mask & (IB_QP_ACCESS_FLAGS | IB_QP_MAX_DEST_RD_ATOMIC)) + context->params2 |= to_mlx5_access_flags(qp, attr, attr_mask); + + if (attr_mask & IB_QP_MIN_RNR_TIMER) + context->rnr_nextrecvpsn |= cpu_to_be32(attr->min_rnr_timer << 24); + + if (attr_mask & IB_QP_RQ_PSN) + context->rnr_nextrecvpsn |= cpu_to_be32(attr->rq_psn); + + if (attr_mask & IB_QP_QKEY) + context->qkey = cpu_to_be32(attr->qkey); + + if (qp->rq.wqe_cnt && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) + context->db_rec_addr = cpu_to_be64(qp->db.dma); + + if (cur_state == IB_QPS_RTS && new_state == IB_QPS_SQD && + attr_mask & IB_QP_EN_SQD_ASYNC_NOTIFY && attr->en_sqd_async_notify) + sqd_event = 1; + else + sqd_event = 0; + + if (!ibqp->uobject && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) + context->sq_crq_size |= cpu_to_be16(1 << 4); + + + mlx5_cur = to_mlx5_state(cur_state); + mlx5_new = to_mlx5_state(new_state); + mlx5_st = to_mlx5_st(ibqp->qp_type); + if (mlx5_st < 0) + goto out; + + optpar = ib_mask_to_mlx5_opt(attr_mask); + optpar &= opt_mask[mlx5_cur][mlx5_new][mlx5_st]; + in->optparam = cpu_to_be32(optpar); + err = mlx5_core_qp_modify(dev->mdev, to_mlx5_state(cur_state), + to_mlx5_state(new_state), in, sqd_event, + &qp->mqp); + if (err) + goto out; + + qp->state = new_state; + + if (attr_mask & IB_QP_ACCESS_FLAGS) + qp->atomic_rd_en = attr->qp_access_flags; + if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) + qp->resp_depth = attr->max_dest_rd_atomic; + if (attr_mask & IB_QP_PORT) + qp->port = attr->port_num; + if (attr_mask & IB_QP_ALT_PATH) + qp->alt_port = attr->alt_port_num; + + /* + * If we moved a kernel QP to RESET, clean up all old CQ + * entries and reinitialize the QP. + */ + if (new_state == IB_QPS_RESET && !ibqp->uobject) { + mlx5_ib_cq_clean(recv_cq, qp->mqp.qpn, + ibqp->srq ? to_msrq(ibqp->srq) : NULL); + if (send_cq != recv_cq) + mlx5_ib_cq_clean(send_cq, qp->mqp.qpn, NULL); + + qp->rq.head = 0; + qp->rq.tail = 0; + qp->sq.head = 0; + qp->sq.tail = 0; + qp->sq.cur_post = 0; + qp->sq.last_poll = 0; + qp->db.db[MLX5_RCV_DBR] = 0; + qp->db.db[MLX5_SND_DBR] = 0; + } + +out: + kfree(in); + return err; +} + +int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata) +{ + struct mlx5_ib_dev *dev = to_mdev(ibqp->device); + struct mlx5_ib_qp *qp = to_mqp(ibqp); + enum ib_qp_state cur_state, new_state; + struct mlx5_general_caps *gen; + int err = -EINVAL; + int port; + + gen = &dev->mdev->caps.gen; + mutex_lock(&qp->mutex); + + cur_state = attr_mask & IB_QP_CUR_STATE ? attr->cur_qp_state : qp->state; + new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state; + + if (ibqp->qp_type != MLX5_IB_QPT_REG_UMR && + !ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, attr_mask, + IB_LINK_LAYER_UNSPECIFIED)) + goto out; + + if ((attr_mask & IB_QP_PORT) && + (attr->port_num == 0 || attr->port_num > gen->num_ports)) + goto out; + + if (attr_mask & IB_QP_PKEY_INDEX) { + port = attr_mask & IB_QP_PORT ? attr->port_num : qp->port; + if (attr->pkey_index >= gen->port[port - 1].pkey_table_len) + goto out; + } + + if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC && + attr->max_rd_atomic > (1 << gen->log_max_ra_res_qp)) + goto out; + + if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC && + attr->max_dest_rd_atomic > (1 << gen->log_max_ra_req_qp)) + goto out; + + if (cur_state == new_state && cur_state == IB_QPS_RESET) { + err = 0; + goto out; + } + + err = __mlx5_ib_modify_qp(ibqp, attr, attr_mask, cur_state, new_state); + +out: + mutex_unlock(&qp->mutex); + return err; +} + +static int mlx5_wq_overflow(struct mlx5_ib_wq *wq, int nreq, struct ib_cq *ib_cq) +{ + struct mlx5_ib_cq *cq; + unsigned cur; + + cur = wq->head - wq->tail; + if (likely(cur + nreq < wq->max_post)) + return 0; + + cq = to_mcq(ib_cq); + spin_lock(&cq->lock); + cur = wq->head - wq->tail; + spin_unlock(&cq->lock); + + return cur + nreq >= wq->max_post; +} + +static __always_inline void set_raddr_seg(struct mlx5_wqe_raddr_seg *rseg, + u64 remote_addr, u32 rkey) +{ + rseg->raddr = cpu_to_be64(remote_addr); + rseg->rkey = cpu_to_be32(rkey); + rseg->reserved = 0; +} + +static void set_datagram_seg(struct mlx5_wqe_datagram_seg *dseg, + struct ib_send_wr *wr) +{ + memcpy(&dseg->av, &to_mah(wr->wr.ud.ah)->av, sizeof(struct mlx5_av)); + dseg->av.dqp_dct = cpu_to_be32(wr->wr.ud.remote_qpn | MLX5_EXTENDED_UD_AV); + dseg->av.key.qkey.qkey = cpu_to_be32(wr->wr.ud.remote_qkey); +} + +static void set_data_ptr_seg(struct mlx5_wqe_data_seg *dseg, struct ib_sge *sg) +{ + dseg->byte_count = cpu_to_be32(sg->length); + dseg->lkey = cpu_to_be32(sg->lkey); + dseg->addr = cpu_to_be64(sg->addr); +} + +static __be16 get_klm_octo(int npages) +{ + return cpu_to_be16(ALIGN(npages, 8) / 2); +} + +static __be64 frwr_mkey_mask(void) +{ + u64 result; + + result = MLX5_MKEY_MASK_LEN | + MLX5_MKEY_MASK_PAGE_SIZE | + MLX5_MKEY_MASK_START_ADDR | + MLX5_MKEY_MASK_EN_RINVAL | + MLX5_MKEY_MASK_KEY | + MLX5_MKEY_MASK_LR | + MLX5_MKEY_MASK_LW | + MLX5_MKEY_MASK_RR | + MLX5_MKEY_MASK_RW | + MLX5_MKEY_MASK_A | + MLX5_MKEY_MASK_SMALL_FENCE | + MLX5_MKEY_MASK_FREE; + + return cpu_to_be64(result); +} + +static __be64 sig_mkey_mask(void) +{ + u64 result; + + result = MLX5_MKEY_MASK_LEN | + MLX5_MKEY_MASK_PAGE_SIZE | + MLX5_MKEY_MASK_START_ADDR | + MLX5_MKEY_MASK_EN_SIGERR | + MLX5_MKEY_MASK_EN_RINVAL | + MLX5_MKEY_MASK_KEY | + MLX5_MKEY_MASK_LR | + MLX5_MKEY_MASK_LW | + MLX5_MKEY_MASK_RR | + MLX5_MKEY_MASK_RW | + MLX5_MKEY_MASK_SMALL_FENCE | + MLX5_MKEY_MASK_FREE | + MLX5_MKEY_MASK_BSF_EN; + + return cpu_to_be64(result); +} + +static void set_frwr_umr_segment(struct mlx5_wqe_umr_ctrl_seg *umr, + struct ib_send_wr *wr, int li) +{ + memset(umr, 0, sizeof(*umr)); + + if (li) { + umr->mkey_mask = cpu_to_be64(MLX5_MKEY_MASK_FREE); + umr->flags = 1 << 7; + return; + } + + umr->flags = (1 << 5); /* fail if not free */ + umr->klm_octowords = get_klm_octo(wr->wr.fast_reg.page_list_len); + umr->mkey_mask = frwr_mkey_mask(); +} + +static void set_reg_umr_segment(struct mlx5_wqe_umr_ctrl_seg *umr, + struct ib_send_wr *wr) +{ + struct umr_wr *umrwr = (struct umr_wr *)&wr->wr.fast_reg; + u64 mask; + + memset(umr, 0, sizeof(*umr)); + + if (!(wr->send_flags & MLX5_IB_SEND_UMR_UNREG)) { + umr->flags = 1 << 5; /* fail if not free */ + umr->klm_octowords = get_klm_octo(umrwr->npages); + mask = MLX5_MKEY_MASK_LEN | + MLX5_MKEY_MASK_PAGE_SIZE | + MLX5_MKEY_MASK_START_ADDR | + MLX5_MKEY_MASK_PD | + MLX5_MKEY_MASK_LR | + MLX5_MKEY_MASK_LW | + MLX5_MKEY_MASK_KEY | + MLX5_MKEY_MASK_RR | + MLX5_MKEY_MASK_RW | + MLX5_MKEY_MASK_A | + MLX5_MKEY_MASK_FREE; + umr->mkey_mask = cpu_to_be64(mask); + } else { + umr->flags = 2 << 5; /* fail if free */ + mask = MLX5_MKEY_MASK_FREE; + umr->mkey_mask = cpu_to_be64(mask); + } + + if (!wr->num_sge) + umr->flags |= (1 << 7); /* inline */ +} + +static u8 get_umr_flags(int acc) +{ + return (acc & IB_ACCESS_REMOTE_ATOMIC ? MLX5_PERM_ATOMIC : 0) | + (acc & IB_ACCESS_REMOTE_WRITE ? MLX5_PERM_REMOTE_WRITE : 0) | + (acc & IB_ACCESS_REMOTE_READ ? MLX5_PERM_REMOTE_READ : 0) | + (acc & IB_ACCESS_LOCAL_WRITE ? MLX5_PERM_LOCAL_WRITE : 0) | + MLX5_PERM_LOCAL_READ | MLX5_PERM_UMR_EN; +} + +static void set_mkey_segment(struct mlx5_mkey_seg *seg, struct ib_send_wr *wr, + int li, int *writ) +{ + memset(seg, 0, sizeof(*seg)); + if (li) { + seg->status = 1 << 6; + return; + } + + seg->flags = get_umr_flags(wr->wr.fast_reg.access_flags) | + MLX5_ACCESS_MODE_MTT; + *writ = seg->flags & (MLX5_PERM_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE); + seg->qpn_mkey7_0 = cpu_to_be32((wr->wr.fast_reg.rkey & 0xff) | 0xffffff00); + seg->flags_pd = cpu_to_be32(MLX5_MKEY_REMOTE_INVAL); + seg->start_addr = cpu_to_be64(wr->wr.fast_reg.iova_start); + seg->len = cpu_to_be64(wr->wr.fast_reg.length); + seg->xlt_oct_size = cpu_to_be32((wr->wr.fast_reg.page_list_len + 1) / 2); + seg->log2_page_size = wr->wr.fast_reg.page_shift; +} + +static void set_reg_mkey_segment(struct mlx5_mkey_seg *seg, struct ib_send_wr *wr) +{ + memset(seg, 0, sizeof(*seg)); + if (wr->send_flags & MLX5_IB_SEND_UMR_UNREG) { + seg->status = 1 << 6; + return; + } + + seg->flags = convert_access(wr->wr.fast_reg.access_flags); + seg->flags_pd = cpu_to_be32(to_mpd((struct ib_pd *)wr->wr.fast_reg.page_list)->pdn); + seg->start_addr = cpu_to_be64(wr->wr.fast_reg.iova_start); + seg->len = cpu_to_be64(wr->wr.fast_reg.length); + seg->log2_page_size = wr->wr.fast_reg.page_shift; + seg->qpn_mkey7_0 = cpu_to_be32(0xffffff00 | + mlx5_mkey_variant(wr->wr.fast_reg.rkey)); +} + +static void set_frwr_pages(struct mlx5_wqe_data_seg *dseg, + struct ib_send_wr *wr, + struct mlx5_core_dev *mdev, + struct mlx5_ib_pd *pd, + int writ) +{ + struct mlx5_ib_fast_reg_page_list *mfrpl = to_mfrpl(wr->wr.fast_reg.page_list); + u64 *page_list = wr->wr.fast_reg.page_list->page_list; + u64 perm = MLX5_EN_RD | (writ ? MLX5_EN_WR : 0); + int i; + + for (i = 0; i < wr->wr.fast_reg.page_list_len; i++) + mfrpl->mapped_page_list[i] = cpu_to_be64(page_list[i] | perm); + dseg->addr = cpu_to_be64(mfrpl->map); + dseg->byte_count = cpu_to_be32(ALIGN(sizeof(u64) * wr->wr.fast_reg.page_list_len, 64)); + dseg->lkey = cpu_to_be32(pd->pa_lkey); +} + +static __be32 send_ieth(struct ib_send_wr *wr) +{ + switch (wr->opcode) { + case IB_WR_SEND_WITH_IMM: + case IB_WR_RDMA_WRITE_WITH_IMM: + return wr->ex.imm_data; + + case IB_WR_SEND_WITH_INV: + return cpu_to_be32(wr->ex.invalidate_rkey); + + default: + return 0; + } +} + +static u8 calc_sig(void *wqe, int size) +{ + u8 *p = wqe; + u8 res = 0; + int i; + + for (i = 0; i < size; i++) + res ^= p[i]; + + return ~res; +} + +static u8 wq_sig(void *wqe) +{ + return calc_sig(wqe, (*((u8 *)wqe + 8) & 0x3f) << 4); +} + +static int set_data_inl_seg(struct mlx5_ib_qp *qp, struct ib_send_wr *wr, + void *wqe, int *sz) +{ + struct mlx5_wqe_inline_seg *seg; + void *qend = qp->sq.qend; + void *addr; + int inl = 0; + int copy; + int len; + int i; + + seg = wqe; + wqe += sizeof(*seg); + for (i = 0; i < wr->num_sge; i++) { + addr = (void *)(unsigned long)(wr->sg_list[i].addr); + len = wr->sg_list[i].length; + inl += len; + + if (unlikely(inl > qp->max_inline_data)) + return -ENOMEM; + + if (unlikely(wqe + len > qend)) { + copy = qend - wqe; + memcpy(wqe, addr, copy); + addr += copy; + len -= copy; + wqe = mlx5_get_send_wqe(qp, 0); + } + memcpy(wqe, addr, len); + wqe += len; + } + + seg->byte_count = cpu_to_be32(inl | MLX5_INLINE_SEG); + + *sz = ALIGN(inl + sizeof(seg->byte_count), 16) / 16; + + return 0; +} + +static u16 prot_field_size(enum ib_signature_type type) +{ + switch (type) { + case IB_SIG_TYPE_T10_DIF: + return MLX5_DIF_SIZE; + default: + return 0; + } +} + +static u8 bs_selector(int block_size) +{ + switch (block_size) { + case 512: return 0x1; + case 520: return 0x2; + case 4096: return 0x3; + case 4160: return 0x4; + case 1073741824: return 0x5; + default: return 0; + } +} + +static void mlx5_fill_inl_bsf(struct ib_sig_domain *domain, + struct mlx5_bsf_inl *inl) +{ + /* Valid inline section and allow BSF refresh */ + inl->vld_refresh = cpu_to_be16(MLX5_BSF_INL_VALID | + MLX5_BSF_REFRESH_DIF); + inl->dif_apptag = cpu_to_be16(domain->sig.dif.app_tag); + inl->dif_reftag = cpu_to_be32(domain->sig.dif.ref_tag); + /* repeating block */ + inl->rp_inv_seed = MLX5_BSF_REPEAT_BLOCK; + inl->sig_type = domain->sig.dif.bg_type == IB_T10DIF_CRC ? + MLX5_DIF_CRC : MLX5_DIF_IPCS; + + if (domain->sig.dif.ref_remap) + inl->dif_inc_ref_guard_check |= MLX5_BSF_INC_REFTAG; + + if (domain->sig.dif.app_escape) { + if (domain->sig.dif.ref_escape) + inl->dif_inc_ref_guard_check |= MLX5_BSF_APPREF_ESCAPE; + else + inl->dif_inc_ref_guard_check |= MLX5_BSF_APPTAG_ESCAPE; + } + + inl->dif_app_bitmask_check = + cpu_to_be16(domain->sig.dif.apptag_check_mask); +} + +static int mlx5_set_bsf(struct ib_mr *sig_mr, + struct ib_sig_attrs *sig_attrs, + struct mlx5_bsf *bsf, u32 data_size) +{ + struct mlx5_core_sig_ctx *msig = to_mmr(sig_mr)->sig; + struct mlx5_bsf_basic *basic = &bsf->basic; + struct ib_sig_domain *mem = &sig_attrs->mem; + struct ib_sig_domain *wire = &sig_attrs->wire; + + memset(bsf, 0, sizeof(*bsf)); + + /* Basic + Extended + Inline */ + basic->bsf_size_sbs = 1 << 7; + /* Input domain check byte mask */ + basic->check_byte_mask = sig_attrs->check_mask; + basic->raw_data_size = cpu_to_be32(data_size); + + /* Memory domain */ + switch (sig_attrs->mem.sig_type) { + case IB_SIG_TYPE_NONE: + break; + case IB_SIG_TYPE_T10_DIF: + basic->mem.bs_selector = bs_selector(mem->sig.dif.pi_interval); + basic->m_bfs_psv = cpu_to_be32(msig->psv_memory.psv_idx); + mlx5_fill_inl_bsf(mem, &bsf->m_inl); + break; + default: + return -EINVAL; + } + + /* Wire domain */ + switch (sig_attrs->wire.sig_type) { + case IB_SIG_TYPE_NONE: + break; + case IB_SIG_TYPE_T10_DIF: + if (mem->sig.dif.pi_interval == wire->sig.dif.pi_interval && + mem->sig_type == wire->sig_type) { + /* Same block structure */ + basic->bsf_size_sbs |= 1 << 4; + if (mem->sig.dif.bg_type == wire->sig.dif.bg_type) + basic->wire.copy_byte_mask |= MLX5_CPY_GRD_MASK; + if (mem->sig.dif.app_tag == wire->sig.dif.app_tag) + basic->wire.copy_byte_mask |= MLX5_CPY_APP_MASK; + if (mem->sig.dif.ref_tag == wire->sig.dif.ref_tag) + basic->wire.copy_byte_mask |= MLX5_CPY_REF_MASK; + } else + basic->wire.bs_selector = bs_selector(wire->sig.dif.pi_interval); + + basic->w_bfs_psv = cpu_to_be32(msig->psv_wire.psv_idx); + mlx5_fill_inl_bsf(wire, &bsf->w_inl); + break; + default: + return -EINVAL; + } + + return 0; +} + +static int set_sig_data_segment(struct ib_send_wr *wr, struct mlx5_ib_qp *qp, + void **seg, int *size) +{ + struct ib_sig_attrs *sig_attrs = wr->wr.sig_handover.sig_attrs; + struct ib_mr *sig_mr = wr->wr.sig_handover.sig_mr; + struct mlx5_bsf *bsf; + u32 data_len = wr->sg_list->length; + u32 data_key = wr->sg_list->lkey; + u64 data_va = wr->sg_list->addr; + int ret; + int wqe_size; + + if (!wr->wr.sig_handover.prot || + (data_key == wr->wr.sig_handover.prot->lkey && + data_va == wr->wr.sig_handover.prot->addr && + data_len == wr->wr.sig_handover.prot->length)) { + /** + * Source domain doesn't contain signature information + * or data and protection are interleaved in memory. + * So need construct: + * ------------------ + * | data_klm | + * ------------------ + * | BSF | + * ------------------ + **/ + struct mlx5_klm *data_klm = *seg; + + data_klm->bcount = cpu_to_be32(data_len); + data_klm->key = cpu_to_be32(data_key); + data_klm->va = cpu_to_be64(data_va); + wqe_size = ALIGN(sizeof(*data_klm), 64); + } else { + /** + * Source domain contains signature information + * So need construct a strided block format: + * --------------------------- + * | stride_block_ctrl | + * --------------------------- + * | data_klm | + * --------------------------- + * | prot_klm | + * --------------------------- + * | BSF | + * --------------------------- + **/ + struct mlx5_stride_block_ctrl_seg *sblock_ctrl; + struct mlx5_stride_block_entry *data_sentry; + struct mlx5_stride_block_entry *prot_sentry; + u32 prot_key = wr->wr.sig_handover.prot->lkey; + u64 prot_va = wr->wr.sig_handover.prot->addr; + u16 block_size = sig_attrs->mem.sig.dif.pi_interval; + int prot_size; + + sblock_ctrl = *seg; + data_sentry = (void *)sblock_ctrl + sizeof(*sblock_ctrl); + prot_sentry = (void *)data_sentry + sizeof(*data_sentry); + + prot_size = prot_field_size(sig_attrs->mem.sig_type); + if (!prot_size) { + pr_err("Bad block size given: %u\n", block_size); + return -EINVAL; + } + sblock_ctrl->bcount_per_cycle = cpu_to_be32(block_size + + prot_size); + sblock_ctrl->op = cpu_to_be32(MLX5_STRIDE_BLOCK_OP); + sblock_ctrl->repeat_count = cpu_to_be32(data_len / block_size); + sblock_ctrl->num_entries = cpu_to_be16(2); + + data_sentry->bcount = cpu_to_be16(block_size); + data_sentry->key = cpu_to_be32(data_key); + data_sentry->va = cpu_to_be64(data_va); + data_sentry->stride = cpu_to_be16(block_size); + + prot_sentry->bcount = cpu_to_be16(prot_size); + prot_sentry->key = cpu_to_be32(prot_key); + prot_sentry->va = cpu_to_be64(prot_va); + prot_sentry->stride = cpu_to_be16(prot_size); + + wqe_size = ALIGN(sizeof(*sblock_ctrl) + sizeof(*data_sentry) + + sizeof(*prot_sentry), 64); + } + + *seg += wqe_size; + *size += wqe_size / 16; + if (unlikely((*seg == qp->sq.qend))) + *seg = mlx5_get_send_wqe(qp, 0); + + bsf = *seg; + ret = mlx5_set_bsf(sig_mr, sig_attrs, bsf, data_len); + if (ret) + return -EINVAL; + + *seg += sizeof(*bsf); + *size += sizeof(*bsf) / 16; + if (unlikely((*seg == qp->sq.qend))) + *seg = mlx5_get_send_wqe(qp, 0); + + return 0; +} + +static void set_sig_mkey_segment(struct mlx5_mkey_seg *seg, + struct ib_send_wr *wr, u32 nelements, + u32 length, u32 pdn) +{ + struct ib_mr *sig_mr = wr->wr.sig_handover.sig_mr; + u32 sig_key = sig_mr->rkey; + u8 sigerr = to_mmr(sig_mr)->sig->sigerr_count & 1; + + memset(seg, 0, sizeof(*seg)); + + seg->flags = get_umr_flags(wr->wr.sig_handover.access_flags) | + MLX5_ACCESS_MODE_KLM; + seg->qpn_mkey7_0 = cpu_to_be32((sig_key & 0xff) | 0xffffff00); + seg->flags_pd = cpu_to_be32(MLX5_MKEY_REMOTE_INVAL | sigerr << 26 | + MLX5_MKEY_BSF_EN | pdn); + seg->len = cpu_to_be64(length); + seg->xlt_oct_size = cpu_to_be32(be16_to_cpu(get_klm_octo(nelements))); + seg->bsfs_octo_size = cpu_to_be32(MLX5_MKEY_BSF_OCTO_SIZE); +} + +static void set_sig_umr_segment(struct mlx5_wqe_umr_ctrl_seg *umr, + struct ib_send_wr *wr, u32 nelements) +{ + memset(umr, 0, sizeof(*umr)); + + umr->flags = MLX5_FLAGS_INLINE | MLX5_FLAGS_CHECK_FREE; + umr->klm_octowords = get_klm_octo(nelements); + umr->bsf_octowords = cpu_to_be16(MLX5_MKEY_BSF_OCTO_SIZE); + umr->mkey_mask = sig_mkey_mask(); +} + + +static int set_sig_umr_wr(struct ib_send_wr *wr, struct mlx5_ib_qp *qp, + void **seg, int *size) +{ + struct mlx5_ib_mr *sig_mr = to_mmr(wr->wr.sig_handover.sig_mr); + u32 pdn = get_pd(qp)->pdn; + u32 klm_oct_size; + int region_len, ret; + + if (unlikely(wr->num_sge != 1) || + unlikely(wr->wr.sig_handover.access_flags & + IB_ACCESS_REMOTE_ATOMIC) || + unlikely(!sig_mr->sig) || unlikely(!qp->signature_en) || + unlikely(!sig_mr->sig->sig_status_checked)) + return -EINVAL; + + /* length of the protected region, data + protection */ + region_len = wr->sg_list->length; + if (wr->wr.sig_handover.prot && + (wr->wr.sig_handover.prot->lkey != wr->sg_list->lkey || + wr->wr.sig_handover.prot->addr != wr->sg_list->addr || + wr->wr.sig_handover.prot->length != wr->sg_list->length)) + region_len += wr->wr.sig_handover.prot->length; + + /** + * KLM octoword size - if protection was provided + * then we use strided block format (3 octowords), + * else we use single KLM (1 octoword) + **/ + klm_oct_size = wr->wr.sig_handover.prot ? 3 : 1; + + set_sig_umr_segment(*seg, wr, klm_oct_size); + *seg += sizeof(struct mlx5_wqe_umr_ctrl_seg); + *size += sizeof(struct mlx5_wqe_umr_ctrl_seg) / 16; + if (unlikely((*seg == qp->sq.qend))) + *seg = mlx5_get_send_wqe(qp, 0); + + set_sig_mkey_segment(*seg, wr, klm_oct_size, region_len, pdn); + *seg += sizeof(struct mlx5_mkey_seg); + *size += sizeof(struct mlx5_mkey_seg) / 16; + if (unlikely((*seg == qp->sq.qend))) + *seg = mlx5_get_send_wqe(qp, 0); + + ret = set_sig_data_segment(wr, qp, seg, size); + if (ret) + return ret; + + sig_mr->sig->sig_status_checked = false; + return 0; +} + +static int set_psv_wr(struct ib_sig_domain *domain, + u32 psv_idx, void **seg, int *size) +{ + struct mlx5_seg_set_psv *psv_seg = *seg; + + memset(psv_seg, 0, sizeof(*psv_seg)); + psv_seg->psv_num = cpu_to_be32(psv_idx); + switch (domain->sig_type) { + case IB_SIG_TYPE_NONE: + break; + case IB_SIG_TYPE_T10_DIF: + psv_seg->transient_sig = cpu_to_be32(domain->sig.dif.bg << 16 | + domain->sig.dif.app_tag); + psv_seg->ref_tag = cpu_to_be32(domain->sig.dif.ref_tag); + break; + default: + pr_err("Bad signature type given.\n"); + return 1; + } + + *seg += sizeof(*psv_seg); + *size += sizeof(*psv_seg) / 16; + + return 0; +} + +static int set_frwr_li_wr(void **seg, struct ib_send_wr *wr, int *size, + struct mlx5_core_dev *mdev, struct mlx5_ib_pd *pd, struct mlx5_ib_qp *qp) +{ + int writ = 0; + int li; + + li = wr->opcode == IB_WR_LOCAL_INV ? 1 : 0; + if (unlikely(wr->send_flags & IB_SEND_INLINE)) + return -EINVAL; + + set_frwr_umr_segment(*seg, wr, li); + *seg += sizeof(struct mlx5_wqe_umr_ctrl_seg); + *size += sizeof(struct mlx5_wqe_umr_ctrl_seg) / 16; + if (unlikely((*seg == qp->sq.qend))) + *seg = mlx5_get_send_wqe(qp, 0); + set_mkey_segment(*seg, wr, li, &writ); + *seg += sizeof(struct mlx5_mkey_seg); + *size += sizeof(struct mlx5_mkey_seg) / 16; + if (unlikely((*seg == qp->sq.qend))) + *seg = mlx5_get_send_wqe(qp, 0); + if (!li) { + if (unlikely(wr->wr.fast_reg.page_list_len > + wr->wr.fast_reg.page_list->max_page_list_len)) + return -ENOMEM; + + set_frwr_pages(*seg, wr, mdev, pd, writ); + *seg += sizeof(struct mlx5_wqe_data_seg); + *size += (sizeof(struct mlx5_wqe_data_seg) / 16); + } + return 0; +} + +static void dump_wqe(struct mlx5_ib_qp *qp, int idx, int size_16) +{ + __be32 *p = NULL; + int tidx = idx; + int i, j; + + pr_debug("dump wqe at %p\n", mlx5_get_send_wqe(qp, tidx)); + for (i = 0, j = 0; i < size_16 * 4; i += 4, j += 4) { + if ((i & 0xf) == 0) { + void *buf = mlx5_get_send_wqe(qp, tidx); + tidx = (tidx + 1) & (qp->sq.wqe_cnt - 1); + p = buf; + j = 0; + } + pr_debug("%08x %08x %08x %08x\n", be32_to_cpu(p[j]), + be32_to_cpu(p[j + 1]), be32_to_cpu(p[j + 2]), + be32_to_cpu(p[j + 3])); + } +} + +static void mlx5_bf_copy(u64 __iomem *dst, u64 *src, + unsigned bytecnt, struct mlx5_ib_qp *qp) +{ + while (bytecnt > 0) { + __iowrite64_copy(dst++, src++, 8); + __iowrite64_copy(dst++, src++, 8); + __iowrite64_copy(dst++, src++, 8); + __iowrite64_copy(dst++, src++, 8); + __iowrite64_copy(dst++, src++, 8); + __iowrite64_copy(dst++, src++, 8); + __iowrite64_copy(dst++, src++, 8); + __iowrite64_copy(dst++, src++, 8); + bytecnt -= 64; + if (unlikely(src == qp->sq.qend)) + src = mlx5_get_send_wqe(qp, 0); + } +} + +static u8 get_fence(u8 fence, struct ib_send_wr *wr) +{ + if (unlikely(wr->opcode == IB_WR_LOCAL_INV && + wr->send_flags & IB_SEND_FENCE)) + return MLX5_FENCE_MODE_STRONG_ORDERING; + + if (unlikely(fence)) { + if (wr->send_flags & IB_SEND_FENCE) + return MLX5_FENCE_MODE_SMALL_AND_FENCE; + else + return fence; + + } else { + return 0; + } +} + +static int begin_wqe(struct mlx5_ib_qp *qp, void **seg, + struct mlx5_wqe_ctrl_seg **ctrl, + struct ib_send_wr *wr, int *idx, + int *size, int nreq) +{ + int err = 0; + + if (unlikely(mlx5_wq_overflow(&qp->sq, nreq, qp->ibqp.send_cq))) { + err = -ENOMEM; + return err; + } + + *idx = qp->sq.cur_post & (qp->sq.wqe_cnt - 1); + *seg = mlx5_get_send_wqe(qp, *idx); + *ctrl = *seg; + *(uint32_t *)(*seg + 8) = 0; + (*ctrl)->imm = send_ieth(wr); + (*ctrl)->fm_ce_se = qp->sq_signal_bits | + (wr->send_flags & IB_SEND_SIGNALED ? + MLX5_WQE_CTRL_CQ_UPDATE : 0) | + (wr->send_flags & IB_SEND_SOLICITED ? + MLX5_WQE_CTRL_SOLICITED : 0); + + *seg += sizeof(**ctrl); + *size = sizeof(**ctrl) / 16; + + return err; +} + +static void finish_wqe(struct mlx5_ib_qp *qp, + struct mlx5_wqe_ctrl_seg *ctrl, + u8 size, unsigned idx, u64 wr_id, + int nreq, u8 fence, u8 next_fence, + u32 mlx5_opcode) +{ + u8 opmod = 0; + + ctrl->opmod_idx_opcode = cpu_to_be32(((u32)(qp->sq.cur_post) << 8) | + mlx5_opcode | ((u32)opmod << 24)); + ctrl->qpn_ds = cpu_to_be32(size | (qp->mqp.qpn << 8)); + ctrl->fm_ce_se |= fence; + qp->fm_cache = next_fence; + if (unlikely(qp->wq_sig)) + ctrl->signature = wq_sig(ctrl); + + qp->sq.wrid[idx] = wr_id; + qp->sq.w_list[idx].opcode = mlx5_opcode; + qp->sq.wqe_head[idx] = qp->sq.head + nreq; + qp->sq.cur_post += DIV_ROUND_UP(size * 16, MLX5_SEND_WQE_BB); + qp->sq.w_list[idx].next = qp->sq.cur_post; +} + + +int mlx5_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, + struct ib_send_wr **bad_wr) +{ + struct mlx5_wqe_ctrl_seg *ctrl = NULL; /* compiler warning */ + struct mlx5_ib_dev *dev = to_mdev(ibqp->device); + struct mlx5_core_dev *mdev = dev->mdev; + struct mlx5_ib_qp *qp = to_mqp(ibqp); + struct mlx5_ib_mr *mr; + struct mlx5_wqe_data_seg *dpseg; + struct mlx5_wqe_xrc_seg *xrc; + struct mlx5_bf *bf = qp->bf; + int uninitialized_var(size); + void *qend = qp->sq.qend; + unsigned long flags; + unsigned idx; + int err = 0; + int inl = 0; + int num_sge; + void *seg; + int nreq; + int i; + u8 next_fence = 0; + u8 fence; + + spin_lock_irqsave(&qp->sq.lock, flags); + + for (nreq = 0; wr; nreq++, wr = wr->next) { + if (unlikely(wr->opcode >= ARRAY_SIZE(mlx5_ib_opcode))) { + mlx5_ib_warn(dev, "\n"); + err = -EINVAL; + *bad_wr = wr; + goto out; + } + + fence = qp->fm_cache; + num_sge = wr->num_sge; + if (unlikely(num_sge > qp->sq.max_gs)) { + mlx5_ib_warn(dev, "\n"); + err = -ENOMEM; + *bad_wr = wr; + goto out; + } + + err = begin_wqe(qp, &seg, &ctrl, wr, &idx, &size, nreq); + if (err) { + mlx5_ib_warn(dev, "\n"); + err = -ENOMEM; + *bad_wr = wr; + goto out; + } + + switch (ibqp->qp_type) { + case IB_QPT_XRC_INI: + xrc = seg; + xrc->xrc_srqn = htonl(wr->xrc_remote_srq_num); + seg += sizeof(*xrc); + size += sizeof(*xrc) / 16; + /* fall through */ + case IB_QPT_RC: + switch (wr->opcode) { + case IB_WR_RDMA_READ: + case IB_WR_RDMA_WRITE: + case IB_WR_RDMA_WRITE_WITH_IMM: + set_raddr_seg(seg, wr->wr.rdma.remote_addr, + wr->wr.rdma.rkey); + seg += sizeof(struct mlx5_wqe_raddr_seg); + size += sizeof(struct mlx5_wqe_raddr_seg) / 16; + break; + + case IB_WR_ATOMIC_CMP_AND_SWP: + case IB_WR_ATOMIC_FETCH_AND_ADD: + case IB_WR_MASKED_ATOMIC_CMP_AND_SWP: + mlx5_ib_warn(dev, "Atomic operations are not supported yet\n"); + err = -ENOSYS; + *bad_wr = wr; + goto out; + + case IB_WR_LOCAL_INV: + next_fence = MLX5_FENCE_MODE_INITIATOR_SMALL; + qp->sq.wr_data[idx] = IB_WR_LOCAL_INV; + ctrl->imm = cpu_to_be32(wr->ex.invalidate_rkey); + err = set_frwr_li_wr(&seg, wr, &size, mdev, to_mpd(ibqp->pd), qp); + if (err) { + mlx5_ib_warn(dev, "\n"); + *bad_wr = wr; + goto out; + } + num_sge = 0; + break; + + case IB_WR_FAST_REG_MR: + next_fence = MLX5_FENCE_MODE_INITIATOR_SMALL; + qp->sq.wr_data[idx] = IB_WR_FAST_REG_MR; + ctrl->imm = cpu_to_be32(wr->wr.fast_reg.rkey); + err = set_frwr_li_wr(&seg, wr, &size, mdev, to_mpd(ibqp->pd), qp); + if (err) { + mlx5_ib_warn(dev, "\n"); + *bad_wr = wr; + goto out; + } + num_sge = 0; + break; + + case IB_WR_REG_SIG_MR: + qp->sq.wr_data[idx] = IB_WR_REG_SIG_MR; + mr = to_mmr(wr->wr.sig_handover.sig_mr); + + ctrl->imm = cpu_to_be32(mr->ibmr.rkey); + err = set_sig_umr_wr(wr, qp, &seg, &size); + if (err) { + mlx5_ib_warn(dev, "\n"); + *bad_wr = wr; + goto out; + } + + finish_wqe(qp, ctrl, size, idx, wr->wr_id, + nreq, get_fence(fence, wr), + next_fence, MLX5_OPCODE_UMR); + /* + * SET_PSV WQEs are not signaled and solicited + * on error + */ + wr->send_flags &= ~IB_SEND_SIGNALED; + wr->send_flags |= IB_SEND_SOLICITED; + err = begin_wqe(qp, &seg, &ctrl, wr, + &idx, &size, nreq); + if (err) { + mlx5_ib_warn(dev, "\n"); + err = -ENOMEM; + *bad_wr = wr; + goto out; + } + + err = set_psv_wr(&wr->wr.sig_handover.sig_attrs->mem, + mr->sig->psv_memory.psv_idx, &seg, + &size); + if (err) { + mlx5_ib_warn(dev, "\n"); + *bad_wr = wr; + goto out; + } + + finish_wqe(qp, ctrl, size, idx, wr->wr_id, + nreq, get_fence(fence, wr), + next_fence, MLX5_OPCODE_SET_PSV); + err = begin_wqe(qp, &seg, &ctrl, wr, + &idx, &size, nreq); + if (err) { + mlx5_ib_warn(dev, "\n"); + err = -ENOMEM; + *bad_wr = wr; + goto out; + } + + next_fence = MLX5_FENCE_MODE_INITIATOR_SMALL; + err = set_psv_wr(&wr->wr.sig_handover.sig_attrs->wire, + mr->sig->psv_wire.psv_idx, &seg, + &size); + if (err) { + mlx5_ib_warn(dev, "\n"); + *bad_wr = wr; + goto out; + } + + finish_wqe(qp, ctrl, size, idx, wr->wr_id, + nreq, get_fence(fence, wr), + next_fence, MLX5_OPCODE_SET_PSV); + num_sge = 0; + goto skip_psv; + + default: + break; + } + break; + + case IB_QPT_UC: + switch (wr->opcode) { + case IB_WR_RDMA_WRITE: + case IB_WR_RDMA_WRITE_WITH_IMM: + set_raddr_seg(seg, wr->wr.rdma.remote_addr, + wr->wr.rdma.rkey); + seg += sizeof(struct mlx5_wqe_raddr_seg); + size += sizeof(struct mlx5_wqe_raddr_seg) / 16; + break; + + default: + break; + } + break; + + case IB_QPT_UD: + case IB_QPT_SMI: + case IB_QPT_GSI: + set_datagram_seg(seg, wr); + seg += sizeof(struct mlx5_wqe_datagram_seg); + size += sizeof(struct mlx5_wqe_datagram_seg) / 16; + if (unlikely((seg == qend))) + seg = mlx5_get_send_wqe(qp, 0); + break; + + case MLX5_IB_QPT_REG_UMR: + if (wr->opcode != MLX5_IB_WR_UMR) { + err = -EINVAL; + mlx5_ib_warn(dev, "bad opcode\n"); + goto out; + } + qp->sq.wr_data[idx] = MLX5_IB_WR_UMR; + ctrl->imm = cpu_to_be32(wr->wr.fast_reg.rkey); + set_reg_umr_segment(seg, wr); + seg += sizeof(struct mlx5_wqe_umr_ctrl_seg); + size += sizeof(struct mlx5_wqe_umr_ctrl_seg) / 16; + if (unlikely((seg == qend))) + seg = mlx5_get_send_wqe(qp, 0); + set_reg_mkey_segment(seg, wr); + seg += sizeof(struct mlx5_mkey_seg); + size += sizeof(struct mlx5_mkey_seg) / 16; + if (unlikely((seg == qend))) + seg = mlx5_get_send_wqe(qp, 0); + break; + + default: + break; + } + + if (wr->send_flags & IB_SEND_INLINE && num_sge) { + int uninitialized_var(sz); + + err = set_data_inl_seg(qp, wr, seg, &sz); + if (unlikely(err)) { + mlx5_ib_warn(dev, "\n"); + *bad_wr = wr; + goto out; + } + inl = 1; + size += sz; + } else { + dpseg = seg; + for (i = 0; i < num_sge; i++) { + if (unlikely(dpseg == qend)) { + seg = mlx5_get_send_wqe(qp, 0); + dpseg = seg; + } + if (likely(wr->sg_list[i].length)) { + set_data_ptr_seg(dpseg, wr->sg_list + i); + size += sizeof(struct mlx5_wqe_data_seg) / 16; + dpseg++; + } + } + } + + finish_wqe(qp, ctrl, size, idx, wr->wr_id, nreq, + get_fence(fence, wr), next_fence, + mlx5_ib_opcode[wr->opcode]); +skip_psv: + if (0) + dump_wqe(qp, idx, size); + } + +out: + if (likely(nreq)) { + qp->sq.head += nreq; + + /* Make sure that descriptors are written before + * updating doorbell record and ringing the doorbell + */ + wmb(); + + qp->db.db[MLX5_SND_DBR] = cpu_to_be32(qp->sq.cur_post); + + /* Make sure doorbell record is visible to the HCA before + * we hit doorbell */ + wmb(); + + if (bf->need_lock) + spin_lock(&bf->lock); + + /* TBD enable WC */ + if (0 && nreq == 1 && bf->uuarn && inl && size > 1 && size <= bf->buf_size / 16) { + mlx5_bf_copy(bf->reg + bf->offset, (u64 *)ctrl, ALIGN(size * 16, 64), qp); + /* wc_wmb(); */ + } else { + mlx5_write64((__be32 *)ctrl, bf->regreg + bf->offset, + MLX5_GET_DOORBELL_LOCK(&bf->lock32)); + /* Make sure doorbells don't leak out of SQ spinlock + * and reach the HCA out of order. + */ + mmiowb(); + } + bf->offset ^= bf->buf_size; + if (bf->need_lock) + spin_unlock(&bf->lock); + } + + spin_unlock_irqrestore(&qp->sq.lock, flags); + + return err; +} + +static void set_sig_seg(struct mlx5_rwqe_sig *sig, int size) +{ + sig->signature = calc_sig(sig, size); +} + +int mlx5_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr) +{ + struct mlx5_ib_qp *qp = to_mqp(ibqp); + struct mlx5_wqe_data_seg *scat; + struct mlx5_rwqe_sig *sig; + unsigned long flags; + int err = 0; + int nreq; + int ind; + int i; + + spin_lock_irqsave(&qp->rq.lock, flags); + + ind = qp->rq.head & (qp->rq.wqe_cnt - 1); + + for (nreq = 0; wr; nreq++, wr = wr->next) { + if (mlx5_wq_overflow(&qp->rq, nreq, qp->ibqp.recv_cq)) { + err = -ENOMEM; + *bad_wr = wr; + goto out; + } + + if (unlikely(wr->num_sge > qp->rq.max_gs)) { + err = -EINVAL; + *bad_wr = wr; + goto out; + } + + scat = get_recv_wqe(qp, ind); + if (qp->wq_sig) + scat++; + + for (i = 0; i < wr->num_sge; i++) + set_data_ptr_seg(scat + i, wr->sg_list + i); + + if (i < qp->rq.max_gs) { + scat[i].byte_count = 0; + scat[i].lkey = cpu_to_be32(MLX5_INVALID_LKEY); + scat[i].addr = 0; + } + + if (qp->wq_sig) { + sig = (struct mlx5_rwqe_sig *)scat; + set_sig_seg(sig, (qp->rq.max_gs + 1) << 2); + } + + qp->rq.wrid[ind] = wr->wr_id; + + ind = (ind + 1) & (qp->rq.wqe_cnt - 1); + } + +out: + if (likely(nreq)) { + qp->rq.head += nreq; + + /* Make sure that descriptors are written before + * doorbell record. + */ + wmb(); + + *qp->db.db = cpu_to_be32(qp->rq.head & 0xffff); + } + + spin_unlock_irqrestore(&qp->rq.lock, flags); + + return err; +} + +static inline enum ib_qp_state to_ib_qp_state(enum mlx5_qp_state mlx5_state) +{ + switch (mlx5_state) { + case MLX5_QP_STATE_RST: return IB_QPS_RESET; + case MLX5_QP_STATE_INIT: return IB_QPS_INIT; + case MLX5_QP_STATE_RTR: return IB_QPS_RTR; + case MLX5_QP_STATE_RTS: return IB_QPS_RTS; + case MLX5_QP_STATE_SQ_DRAINING: + case MLX5_QP_STATE_SQD: return IB_QPS_SQD; + case MLX5_QP_STATE_SQER: return IB_QPS_SQE; + case MLX5_QP_STATE_ERR: return IB_QPS_ERR; + default: return -1; + } +} + +static inline enum ib_mig_state to_ib_mig_state(int mlx5_mig_state) +{ + switch (mlx5_mig_state) { + case MLX5_QP_PM_ARMED: return IB_MIG_ARMED; + case MLX5_QP_PM_REARM: return IB_MIG_REARM; + case MLX5_QP_PM_MIGRATED: return IB_MIG_MIGRATED; + default: return -1; + } +} + +static int to_ib_qp_access_flags(int mlx5_flags) +{ + int ib_flags = 0; + + if (mlx5_flags & MLX5_QP_BIT_RRE) + ib_flags |= IB_ACCESS_REMOTE_READ; + if (mlx5_flags & MLX5_QP_BIT_RWE) + ib_flags |= IB_ACCESS_REMOTE_WRITE; + if (mlx5_flags & MLX5_QP_BIT_RAE) + ib_flags |= IB_ACCESS_REMOTE_ATOMIC; + + return ib_flags; +} + +static void to_ib_ah_attr(struct mlx5_ib_dev *ibdev, struct ib_ah_attr *ib_ah_attr, + struct mlx5_qp_path *path) +{ + struct mlx5_core_dev *dev = ibdev->mdev; + + memset(ib_ah_attr, 0, sizeof(*ib_ah_attr)); + ib_ah_attr->port_num = path->port; + + if (ib_ah_attr->port_num == 0 || + ib_ah_attr->port_num > dev->caps.gen.num_ports) + return; + + ib_ah_attr->sl = path->sl & 0xf; + + ib_ah_attr->dlid = be16_to_cpu(path->rlid); + ib_ah_attr->src_path_bits = path->grh_mlid & 0x7f; + ib_ah_attr->static_rate = path->static_rate ? path->static_rate - 5 : 0; + ib_ah_attr->ah_flags = (path->grh_mlid & (1 << 7)) ? IB_AH_GRH : 0; + if (ib_ah_attr->ah_flags) { + ib_ah_attr->grh.sgid_index = path->mgid_index; + ib_ah_attr->grh.hop_limit = path->hop_limit; + ib_ah_attr->grh.traffic_class = + (be32_to_cpu(path->tclass_flowlabel) >> 20) & 0xff; + ib_ah_attr->grh.flow_label = + be32_to_cpu(path->tclass_flowlabel) & 0xfffff; + memcpy(ib_ah_attr->grh.dgid.raw, + path->rgid, sizeof(ib_ah_attr->grh.dgid.raw)); + } +} + +int mlx5_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr_mask, + struct ib_qp_init_attr *qp_init_attr) +{ + struct mlx5_ib_dev *dev = to_mdev(ibqp->device); + struct mlx5_ib_qp *qp = to_mqp(ibqp); + struct mlx5_query_qp_mbox_out *outb; + struct mlx5_qp_context *context; + int mlx5_state; + int err = 0; + + mutex_lock(&qp->mutex); + outb = kzalloc(sizeof(*outb), GFP_KERNEL); + if (!outb) { + err = -ENOMEM; + goto out; + } + context = &outb->ctx; + err = mlx5_core_qp_query(dev->mdev, &qp->mqp, outb, sizeof(*outb)); + if (err) + goto out_free; + + mlx5_state = be32_to_cpu(context->flags) >> 28; + + qp->state = to_ib_qp_state(mlx5_state); + qp_attr->qp_state = qp->state; + qp_attr->path_mtu = context->mtu_msgmax >> 5; + qp_attr->path_mig_state = + to_ib_mig_state((be32_to_cpu(context->flags) >> 11) & 0x3); + qp_attr->qkey = be32_to_cpu(context->qkey); + qp_attr->rq_psn = be32_to_cpu(context->rnr_nextrecvpsn) & 0xffffff; + qp_attr->sq_psn = be32_to_cpu(context->next_send_psn) & 0xffffff; + qp_attr->dest_qp_num = be32_to_cpu(context->log_pg_sz_remote_qpn) & 0xffffff; + qp_attr->qp_access_flags = + to_ib_qp_access_flags(be32_to_cpu(context->params2)); + + if (qp->ibqp.qp_type == IB_QPT_RC || qp->ibqp.qp_type == IB_QPT_UC) { + to_ib_ah_attr(dev, &qp_attr->ah_attr, &context->pri_path); + to_ib_ah_attr(dev, &qp_attr->alt_ah_attr, &context->alt_path); + qp_attr->alt_pkey_index = context->alt_path.pkey_index & 0x7f; + qp_attr->alt_port_num = qp_attr->alt_ah_attr.port_num; + } + + qp_attr->pkey_index = context->pri_path.pkey_index & 0x7f; + qp_attr->port_num = context->pri_path.port; + + /* qp_attr->en_sqd_async_notify is only applicable in modify qp */ + qp_attr->sq_draining = mlx5_state == MLX5_QP_STATE_SQ_DRAINING; + + qp_attr->max_rd_atomic = 1 << ((be32_to_cpu(context->params1) >> 21) & 0x7); + + qp_attr->max_dest_rd_atomic = + 1 << ((be32_to_cpu(context->params2) >> 21) & 0x7); + qp_attr->min_rnr_timer = + (be32_to_cpu(context->rnr_nextrecvpsn) >> 24) & 0x1f; + qp_attr->timeout = context->pri_path.ackto_lt >> 3; + qp_attr->retry_cnt = (be32_to_cpu(context->params1) >> 16) & 0x7; + qp_attr->rnr_retry = (be32_to_cpu(context->params1) >> 13) & 0x7; + qp_attr->alt_timeout = context->alt_path.ackto_lt >> 3; + qp_attr->cur_qp_state = qp_attr->qp_state; + qp_attr->cap.max_recv_wr = qp->rq.wqe_cnt; + qp_attr->cap.max_recv_sge = qp->rq.max_gs; + + if (!ibqp->uobject) { + qp_attr->cap.max_send_wr = qp->sq.wqe_cnt; + qp_attr->cap.max_send_sge = qp->sq.max_gs; + } else { + qp_attr->cap.max_send_wr = 0; + qp_attr->cap.max_send_sge = 0; + } + + /* We don't support inline sends for kernel QPs (yet), and we + * don't know what userspace's value should be. + */ + qp_attr->cap.max_inline_data = 0; + + qp_init_attr->cap = qp_attr->cap; + + qp_init_attr->create_flags = 0; + if (qp->flags & MLX5_IB_QP_BLOCK_MULTICAST_LOOPBACK) + qp_init_attr->create_flags |= IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK; + + qp_init_attr->sq_sig_type = qp->sq_signal_bits & MLX5_WQE_CTRL_CQ_UPDATE ? + IB_SIGNAL_ALL_WR : IB_SIGNAL_REQ_WR; + +out_free: + kfree(outb); + +out: + mutex_unlock(&qp->mutex); + return err; +} + +struct ib_xrcd *mlx5_ib_alloc_xrcd(struct ib_device *ibdev, + struct ib_ucontext *context, + struct ib_udata *udata) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct mlx5_general_caps *gen; + struct mlx5_ib_xrcd *xrcd; + int err; + + gen = &dev->mdev->caps.gen; + if (!(gen->flags & MLX5_DEV_CAP_FLAG_XRC)) + return ERR_PTR(-ENOSYS); + + xrcd = kmalloc(sizeof(*xrcd), GFP_KERNEL); + if (!xrcd) + return ERR_PTR(-ENOMEM); + + err = mlx5_core_xrcd_alloc(dev->mdev, &xrcd->xrcdn); + if (err) { + kfree(xrcd); + return ERR_PTR(-ENOMEM); + } + + return &xrcd->ibxrcd; +} + +int mlx5_ib_dealloc_xrcd(struct ib_xrcd *xrcd) +{ + struct mlx5_ib_dev *dev = to_mdev(xrcd->device); + u32 xrcdn = to_mxrcd(xrcd)->xrcdn; + int err; + + err = mlx5_core_xrcd_dealloc(dev->mdev, xrcdn); + if (err) { + mlx5_ib_warn(dev, "failed to dealloc xrcdn 0x%x\n", xrcdn); + return err; + } + + kfree(xrcd); + + return 0; +} diff --git a/drivers/infiniband/hw/mlx5/srq.c b/drivers/infiniband/hw/mlx5/srq.c new file mode 100644 index 0000000..97cc1ba --- /dev/null +++ b/drivers/infiniband/hw/mlx5/srq.c @@ -0,0 +1,487 @@ +/* + * Copyright (c) 2013, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include + +#include "mlx5_ib.h" +#include "user.h" + +/* not supported currently */ +static int srq_signature; + +static void *get_wqe(struct mlx5_ib_srq *srq, int n) +{ + return mlx5_buf_offset(&srq->buf, n << srq->msrq.wqe_shift); +} + +static void mlx5_ib_srq_event(struct mlx5_core_srq *srq, enum mlx5_event type) +{ + struct ib_event event; + struct ib_srq *ibsrq = &to_mibsrq(srq)->ibsrq; + + if (ibsrq->event_handler) { + event.device = ibsrq->device; + event.element.srq = ibsrq; + switch (type) { + case MLX5_EVENT_TYPE_SRQ_RQ_LIMIT: + event.event = IB_EVENT_SRQ_LIMIT_REACHED; + break; + case MLX5_EVENT_TYPE_SRQ_CATAS_ERROR: + event.event = IB_EVENT_SRQ_ERR; + break; + default: + pr_warn("mlx5_ib: Unexpected event type %d on SRQ %06x\n", + type, srq->srqn); + return; + } + + ibsrq->event_handler(&event, ibsrq->srq_context); + } +} + +static int create_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq, + struct mlx5_create_srq_mbox_in **in, + struct ib_udata *udata, int buf_size, int *inlen) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct mlx5_ib_create_srq ucmd; + size_t ucmdlen; + int err; + int npages; + int page_shift; + int ncont; + u32 offset; + + ucmdlen = + (udata->inlen - sizeof(struct ib_uverbs_cmd_hdr) < + sizeof(ucmd)) ? (sizeof(ucmd) - + sizeof(ucmd.reserved)) : sizeof(ucmd); + + if (ib_copy_from_udata(&ucmd, udata, ucmdlen)) { + mlx5_ib_dbg(dev, "failed copy udata\n"); + return -EFAULT; + } + + if (ucmdlen == sizeof(ucmd) && + ucmd.reserved != 0) + return -EINVAL; + + srq->wq_sig = !!(ucmd.flags & MLX5_SRQ_FLAG_SIGNATURE); + + srq->umem = ib_umem_get(pd->uobject->context, ucmd.buf_addr, buf_size, + 0, 0); + if (IS_ERR(srq->umem)) { + mlx5_ib_dbg(dev, "failed umem get, size %d\n", buf_size); + err = PTR_ERR(srq->umem); + return err; + } + + mlx5_ib_cont_pages(srq->umem, ucmd.buf_addr, &npages, + &page_shift, &ncont, NULL); + err = mlx5_ib_get_buf_offset(ucmd.buf_addr, page_shift, + &offset); + if (err) { + mlx5_ib_warn(dev, "bad offset\n"); + goto err_umem; + } + + *inlen = sizeof(**in) + sizeof(*(*in)->pas) * ncont; + *in = mlx5_vzalloc(*inlen); + if (!(*in)) { + err = -ENOMEM; + goto err_umem; + } + + mlx5_ib_populate_pas(dev, srq->umem, page_shift, (*in)->pas, 0); + + err = mlx5_ib_db_map_user(to_mucontext(pd->uobject->context), + ucmd.db_addr, &srq->db); + if (err) { + mlx5_ib_dbg(dev, "map doorbell failed\n"); + goto err_in; + } + + (*in)->ctx.log_pg_sz = page_shift - MLX5_ADAPTER_PAGE_SHIFT; + (*in)->ctx.pgoff_cqn = cpu_to_be32(offset << 26); + + return 0; + +err_in: + mlx5_vfree(*in); + +err_umem: + ib_umem_release(srq->umem); + + return err; +} + +static int create_srq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_srq *srq, + struct mlx5_create_srq_mbox_in **in, int buf_size, + int *inlen) +{ + int err; + int i; + struct mlx5_wqe_srq_next_seg *next; + int page_shift; + int npages; + + err = mlx5_db_alloc(dev->mdev, &srq->db); + if (err) { + mlx5_ib_warn(dev, "alloc dbell rec failed\n"); + return err; + } + + *srq->db.db = 0; + + if (mlx5_buf_alloc(dev->mdev, buf_size, PAGE_SIZE * 2, &srq->buf)) { + mlx5_ib_dbg(dev, "buf alloc failed\n"); + err = -ENOMEM; + goto err_db; + } + page_shift = srq->buf.page_shift; + + srq->head = 0; + srq->tail = srq->msrq.max - 1; + srq->wqe_ctr = 0; + + for (i = 0; i < srq->msrq.max; i++) { + next = get_wqe(srq, i); + next->next_wqe_index = + cpu_to_be16((i + 1) & (srq->msrq.max - 1)); + } + + npages = DIV_ROUND_UP(srq->buf.npages, 1 << (page_shift - PAGE_SHIFT)); + mlx5_ib_dbg(dev, "buf_size %d, page_shift %d, npages %d, calc npages %d\n", + buf_size, page_shift, srq->buf.npages, npages); + *inlen = sizeof(**in) + sizeof(*(*in)->pas) * npages; + *in = mlx5_vzalloc(*inlen); + if (!*in) { + err = -ENOMEM; + goto err_buf; + } + mlx5_fill_page_array(&srq->buf, (*in)->pas); + + srq->wrid = kmalloc(srq->msrq.max * sizeof(u64), GFP_KERNEL); + if (!srq->wrid) { + mlx5_ib_dbg(dev, "kmalloc failed %lu\n", + (unsigned long)(srq->msrq.max * sizeof(u64))); + err = -ENOMEM; + goto err_in; + } + srq->wq_sig = !!srq_signature; + + (*in)->ctx.log_pg_sz = page_shift - MLX5_ADAPTER_PAGE_SHIFT; + + return 0; + +err_in: + mlx5_vfree(*in); + +err_buf: + mlx5_buf_free(dev->mdev, &srq->buf); + +err_db: + mlx5_db_free(dev->mdev, &srq->db); + return err; +} + +static void destroy_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq) +{ + mlx5_ib_db_unmap_user(to_mucontext(pd->uobject->context), &srq->db); + ib_umem_release(srq->umem); +} + + +static void destroy_srq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_srq *srq) +{ + kfree(srq->wrid); + mlx5_buf_free(dev->mdev, &srq->buf); + mlx5_db_free(dev->mdev, &srq->db); +} + +struct ib_srq *mlx5_ib_create_srq(struct ib_pd *pd, + struct ib_srq_init_attr *init_attr, + struct ib_udata *udata) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct mlx5_general_caps *gen; + struct mlx5_ib_srq *srq; + int desc_size; + int buf_size; + int err; + struct mlx5_create_srq_mbox_in *uninitialized_var(in); + int uninitialized_var(inlen); + int is_xrc; + u32 flgs, xrcdn; + + gen = &dev->mdev->caps.gen; + /* Sanity check SRQ size before proceeding */ + if (init_attr->attr.max_wr >= gen->max_srq_wqes) { + mlx5_ib_dbg(dev, "max_wr %d, cap %d\n", + init_attr->attr.max_wr, + gen->max_srq_wqes); + return ERR_PTR(-EINVAL); + } + + srq = kmalloc(sizeof(*srq), GFP_KERNEL); + if (!srq) + return ERR_PTR(-ENOMEM); + + mutex_init(&srq->mutex); + spin_lock_init(&srq->lock); + srq->msrq.max = roundup_pow_of_two(init_attr->attr.max_wr + 1); + srq->msrq.max_gs = init_attr->attr.max_sge; + + desc_size = sizeof(struct mlx5_wqe_srq_next_seg) + + srq->msrq.max_gs * sizeof(struct mlx5_wqe_data_seg); + desc_size = roundup_pow_of_two(desc_size); + desc_size = max_t(int, 32, desc_size); + srq->msrq.max_avail_gather = (desc_size - sizeof(struct mlx5_wqe_srq_next_seg)) / + sizeof(struct mlx5_wqe_data_seg); + srq->msrq.wqe_shift = ilog2(desc_size); + buf_size = srq->msrq.max * desc_size; + mlx5_ib_dbg(dev, "desc_size 0x%x, req wr 0x%x, srq size 0x%x, max_gs 0x%x, max_avail_gather 0x%x\n", + desc_size, init_attr->attr.max_wr, srq->msrq.max, srq->msrq.max_gs, + srq->msrq.max_avail_gather); + + if (pd->uobject) + err = create_srq_user(pd, srq, &in, udata, buf_size, &inlen); + else + err = create_srq_kernel(dev, srq, &in, buf_size, &inlen); + + if (err) { + mlx5_ib_warn(dev, "create srq %s failed, err %d\n", + pd->uobject ? "user" : "kernel", err); + goto err_srq; + } + + is_xrc = (init_attr->srq_type == IB_SRQT_XRC); + in->ctx.state_log_sz = ilog2(srq->msrq.max); + flgs = ((srq->msrq.wqe_shift - 4) | (is_xrc << 5) | (srq->wq_sig << 7)) << 24; + xrcdn = 0; + if (is_xrc) { + xrcdn = to_mxrcd(init_attr->ext.xrc.xrcd)->xrcdn; + in->ctx.pgoff_cqn |= cpu_to_be32(to_mcq(init_attr->ext.xrc.cq)->mcq.cqn); + } else if (init_attr->srq_type == IB_SRQT_BASIC) { + xrcdn = to_mxrcd(dev->devr.x0)->xrcdn; + in->ctx.pgoff_cqn |= cpu_to_be32(to_mcq(dev->devr.c0)->mcq.cqn); + } + + in->ctx.flags_xrcd = cpu_to_be32((flgs & 0xFF000000) | (xrcdn & 0xFFFFFF)); + + in->ctx.pd = cpu_to_be32(to_mpd(pd)->pdn); + in->ctx.db_record = cpu_to_be64(srq->db.dma); + err = mlx5_core_create_srq(dev->mdev, &srq->msrq, in, inlen); + mlx5_vfree(in); + if (err) { + mlx5_ib_dbg(dev, "create SRQ failed, err %d\n", err); + goto err_usr_kern_srq; + } + + mlx5_ib_dbg(dev, "create SRQ with srqn 0x%x\n", srq->msrq.srqn); + + srq->msrq.event = mlx5_ib_srq_event; + srq->ibsrq.ext.xrc.srq_num = srq->msrq.srqn; + + if (pd->uobject) + if (ib_copy_to_udata(udata, &srq->msrq.srqn, sizeof(__u32))) { + mlx5_ib_dbg(dev, "copy to user failed\n"); + err = -EFAULT; + goto err_core; + } + + init_attr->attr.max_wr = srq->msrq.max - 1; + + return &srq->ibsrq; + +err_core: + mlx5_core_destroy_srq(dev->mdev, &srq->msrq); + +err_usr_kern_srq: + if (pd->uobject) + destroy_srq_user(pd, srq); + else + destroy_srq_kernel(dev, srq); + +err_srq: + kfree(srq); + + return ERR_PTR(err); +} + +int mlx5_ib_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, + enum ib_srq_attr_mask attr_mask, struct ib_udata *udata) +{ + struct mlx5_ib_dev *dev = to_mdev(ibsrq->device); + struct mlx5_ib_srq *srq = to_msrq(ibsrq); + int ret; + + /* We don't support resizing SRQs yet */ + if (attr_mask & IB_SRQ_MAX_WR) + return -EINVAL; + + if (attr_mask & IB_SRQ_LIMIT) { + if (attr->srq_limit >= srq->msrq.max) + return -EINVAL; + + mutex_lock(&srq->mutex); + ret = mlx5_core_arm_srq(dev->mdev, &srq->msrq, attr->srq_limit, 1); + mutex_unlock(&srq->mutex); + + if (ret) + return ret; + } + + return 0; +} + +int mlx5_ib_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *srq_attr) +{ + struct mlx5_ib_dev *dev = to_mdev(ibsrq->device); + struct mlx5_ib_srq *srq = to_msrq(ibsrq); + int ret; + struct mlx5_query_srq_mbox_out *out; + + out = kzalloc(sizeof(*out), GFP_KERNEL); + if (!out) + return -ENOMEM; + + ret = mlx5_core_query_srq(dev->mdev, &srq->msrq, out); + if (ret) + goto out_box; + + srq_attr->srq_limit = be16_to_cpu(out->ctx.lwm); + srq_attr->max_wr = srq->msrq.max - 1; + srq_attr->max_sge = srq->msrq.max_gs; + +out_box: + kfree(out); + return ret; +} + +int mlx5_ib_destroy_srq(struct ib_srq *srq) +{ + struct mlx5_ib_dev *dev = to_mdev(srq->device); + struct mlx5_ib_srq *msrq = to_msrq(srq); + + mlx5_core_destroy_srq(dev->mdev, &msrq->msrq); + + if (srq->uobject) { + mlx5_ib_db_unmap_user(to_mucontext(srq->uobject->context), &msrq->db); + ib_umem_release(msrq->umem); + } else { + destroy_srq_kernel(dev, msrq); + } + + kfree(srq); + return 0; +} + +void mlx5_ib_free_srq_wqe(struct mlx5_ib_srq *srq, int wqe_index) +{ + struct mlx5_wqe_srq_next_seg *next; + + /* always called with interrupts disabled. */ + spin_lock(&srq->lock); + + next = get_wqe(srq, srq->tail); + next->next_wqe_index = cpu_to_be16(wqe_index); + srq->tail = wqe_index; + + spin_unlock(&srq->lock); +} + +int mlx5_ib_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr) +{ + struct mlx5_ib_srq *srq = to_msrq(ibsrq); + struct mlx5_wqe_srq_next_seg *next; + struct mlx5_wqe_data_seg *scat; + unsigned long flags; + int err = 0; + int nreq; + int i; + + spin_lock_irqsave(&srq->lock, flags); + + for (nreq = 0; wr; nreq++, wr = wr->next) { + if (unlikely(wr->num_sge > srq->msrq.max_gs)) { + err = -EINVAL; + *bad_wr = wr; + break; + } + + if (unlikely(srq->head == srq->tail)) { + err = -ENOMEM; + *bad_wr = wr; + break; + } + + srq->wrid[srq->head] = wr->wr_id; + + next = get_wqe(srq, srq->head); + srq->head = be16_to_cpu(next->next_wqe_index); + scat = (struct mlx5_wqe_data_seg *)(next + 1); + + for (i = 0; i < wr->num_sge; i++) { + scat[i].byte_count = cpu_to_be32(wr->sg_list[i].length); + scat[i].lkey = cpu_to_be32(wr->sg_list[i].lkey); + scat[i].addr = cpu_to_be64(wr->sg_list[i].addr); + } + + if (i < srq->msrq.max_avail_gather) { + scat[i].byte_count = 0; + scat[i].lkey = cpu_to_be32(MLX5_INVALID_LKEY); + scat[i].addr = 0; + } + } + + if (likely(nreq)) { + srq->wqe_ctr += nreq; + + /* Make sure that descriptors are written before + * doorbell record. + */ + wmb(); + + *srq->db.db = cpu_to_be32(srq->wqe_ctr); + } + + spin_unlock_irqrestore(&srq->lock, flags); + + return err; +} diff --git a/drivers/infiniband/hw/mlx5/user.h b/drivers/infiniband/hw/mlx5/user.h new file mode 100644 index 0000000..d0ba264 --- /dev/null +++ b/drivers/infiniband/hw/mlx5/user.h @@ -0,0 +1,133 @@ +/* + * Copyright (c) 2013, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef MLX5_IB_USER_H +#define MLX5_IB_USER_H + +#include + +enum { + MLX5_QP_FLAG_SIGNATURE = 1 << 0, + MLX5_QP_FLAG_SCATTER_CQE = 1 << 1, +}; + +enum { + MLX5_SRQ_FLAG_SIGNATURE = 1 << 0, +}; + + +/* Increment this value if any changes that break userspace ABI + * compatibility are made. + */ +#define MLX5_IB_UVERBS_ABI_VERSION 1 + +/* Make sure that all structs defined in this file remain laid out so + * that they pack the same way on 32-bit and 64-bit architectures (to + * avoid incompatibility between 32-bit userspace and 64-bit kernels). + * In particular do not use pointer types -- pass pointers in __u64 + * instead. + */ + +struct mlx5_ib_alloc_ucontext_req { + __u32 total_num_uuars; + __u32 num_low_latency_uuars; +}; + +struct mlx5_ib_alloc_ucontext_req_v2 { + __u32 total_num_uuars; + __u32 num_low_latency_uuars; + __u32 flags; + __u32 reserved; +}; + +struct mlx5_ib_alloc_ucontext_resp { + __u32 qp_tab_size; + __u32 bf_reg_size; + __u32 tot_uuars; + __u32 cache_line_size; + __u16 max_sq_desc_sz; + __u16 max_rq_desc_sz; + __u32 max_send_wqebb; + __u32 max_recv_wr; + __u32 max_srq_recv_wr; + __u16 num_ports; + __u16 reserved; +}; + +struct mlx5_ib_alloc_pd_resp { + __u32 pdn; +}; + +struct mlx5_ib_create_cq { + __u64 buf_addr; + __u64 db_addr; + __u32 cqe_size; + __u32 reserved; /* explicit padding (optional on i386) */ +}; + +struct mlx5_ib_create_cq_resp { + __u32 cqn; + __u32 reserved; +}; + +struct mlx5_ib_resize_cq { + __u64 buf_addr; + __u16 cqe_size; + __u16 reserved0; + __u32 reserved1; +}; + +struct mlx5_ib_create_srq { + __u64 buf_addr; + __u64 db_addr; + __u32 flags; + __u32 reserved; /* explicit padding (optional on i386) */ +}; + +struct mlx5_ib_create_srq_resp { + __u32 srqn; + __u32 reserved; +}; + +struct mlx5_ib_create_qp { + __u64 buf_addr; + __u64 db_addr; + __u32 sq_wqe_count; + __u32 rq_wqe_count; + __u32 rq_wqe_shift; + __u32 flags; +}; + +struct mlx5_ib_create_qp_resp { + __u32 uuar_index; +}; +#endif /* MLX5_IB_USER_H */ diff --git a/drivers/infiniband/hw/mthca/Kconfig b/drivers/infiniband/hw/mthca/Kconfig new file mode 100644 index 0000000..da314c3 --- /dev/null +++ b/drivers/infiniband/hw/mthca/Kconfig @@ -0,0 +1,17 @@ +config INFINIBAND_MTHCA + tristate "Mellanox HCA support" + depends on PCI + ---help--- + This is a low-level driver for Mellanox InfiniHost host + channel adapters (HCAs), including the MT23108 PCI-X HCA + ("Tavor") and the MT25208 PCI Express HCA ("Arbel"). + +config INFINIBAND_MTHCA_DEBUG + bool "Verbose debugging output" if EXPERT + depends on INFINIBAND_MTHCA + default y + ---help--- + This option causes debugging code to be compiled into the + mthca driver. The output can be turned on via the + debug_level module parameter (which can also be set after + the driver is loaded through sysfs). diff --git a/drivers/infiniband/hw/mthca/Makefile b/drivers/infiniband/hw/mthca/Makefile new file mode 100644 index 0000000..e388d95 --- /dev/null +++ b/drivers/infiniband/hw/mthca/Makefile @@ -0,0 +1,7 @@ +obj-$(CONFIG_INFINIBAND_MTHCA) += ib_mthca.o + +ib_mthca-y := mthca_main.o mthca_cmd.o mthca_profile.o mthca_reset.o \ + mthca_allocator.o mthca_eq.o mthca_pd.o mthca_cq.o \ + mthca_mr.o mthca_qp.o mthca_av.o mthca_mcg.o mthca_mad.o \ + mthca_provider.o mthca_memfree.o mthca_uar.o mthca_srq.o \ + mthca_catas.o diff --git a/drivers/infiniband/hw/mthca/mthca_allocator.c b/drivers/infiniband/hw/mthca/mthca_allocator.c new file mode 100644 index 0000000..b4e0cf4 --- /dev/null +++ b/drivers/infiniband/hw/mthca/mthca_allocator.c @@ -0,0 +1,301 @@ +/* + * Copyright (c) 2004 Topspin Communications. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include + +#include "mthca_dev.h" + +/* Trivial bitmap-based allocator */ +u32 mthca_alloc(struct mthca_alloc *alloc) +{ + unsigned long flags; + u32 obj; + + spin_lock_irqsave(&alloc->lock, flags); + + obj = find_next_zero_bit(alloc->table, alloc->max, alloc->last); + if (obj >= alloc->max) { + alloc->top = (alloc->top + alloc->max) & alloc->mask; + obj = find_first_zero_bit(alloc->table, alloc->max); + } + + if (obj < alloc->max) { + set_bit(obj, alloc->table); + obj |= alloc->top; + } else + obj = -1; + + spin_unlock_irqrestore(&alloc->lock, flags); + + return obj; +} + +void mthca_free(struct mthca_alloc *alloc, u32 obj) +{ + unsigned long flags; + + obj &= alloc->max - 1; + + spin_lock_irqsave(&alloc->lock, flags); + + clear_bit(obj, alloc->table); + alloc->last = min(alloc->last, obj); + alloc->top = (alloc->top + alloc->max) & alloc->mask; + + spin_unlock_irqrestore(&alloc->lock, flags); +} + +int mthca_alloc_init(struct mthca_alloc *alloc, u32 num, u32 mask, + u32 reserved) +{ + int i; + + /* num must be a power of 2 */ + if (num != 1 << (ffs(num) - 1)) + return -EINVAL; + + alloc->last = 0; + alloc->top = 0; + alloc->max = num; + alloc->mask = mask; + spin_lock_init(&alloc->lock); + alloc->table = kmalloc(BITS_TO_LONGS(num) * sizeof (long), + GFP_KERNEL); + if (!alloc->table) + return -ENOMEM; + + bitmap_zero(alloc->table, num); + for (i = 0; i < reserved; ++i) + set_bit(i, alloc->table); + + return 0; +} + +void mthca_alloc_cleanup(struct mthca_alloc *alloc) +{ + kfree(alloc->table); +} + +/* + * Array of pointers with lazy allocation of leaf pages. Callers of + * _get, _set and _clear methods must use a lock or otherwise + * serialize access to the array. + */ + +#define MTHCA_ARRAY_MASK (PAGE_SIZE / sizeof (void *) - 1) + +void *mthca_array_get(struct mthca_array *array, int index) +{ + int p = (index * sizeof (void *)) >> PAGE_SHIFT; + + if (array->page_list[p].page) + return array->page_list[p].page[index & MTHCA_ARRAY_MASK]; + else + return NULL; +} + +int mthca_array_set(struct mthca_array *array, int index, void *value) +{ + int p = (index * sizeof (void *)) >> PAGE_SHIFT; + + /* Allocate with GFP_ATOMIC because we'll be called with locks held. */ + if (!array->page_list[p].page) + array->page_list[p].page = (void **) get_zeroed_page(GFP_ATOMIC); + + if (!array->page_list[p].page) + return -ENOMEM; + + array->page_list[p].page[index & MTHCA_ARRAY_MASK] = value; + ++array->page_list[p].used; + + return 0; +} + +void mthca_array_clear(struct mthca_array *array, int index) +{ + int p = (index * sizeof (void *)) >> PAGE_SHIFT; + + if (--array->page_list[p].used == 0) { + free_page((unsigned long) array->page_list[p].page); + array->page_list[p].page = NULL; + } else + array->page_list[p].page[index & MTHCA_ARRAY_MASK] = NULL; + + if (array->page_list[p].used < 0) + pr_debug("Array %p index %d page %d with ref count %d < 0\n", + array, index, p, array->page_list[p].used); +} + +int mthca_array_init(struct mthca_array *array, int nent) +{ + int npage = (nent * sizeof (void *) + PAGE_SIZE - 1) / PAGE_SIZE; + int i; + + array->page_list = kmalloc(npage * sizeof *array->page_list, GFP_KERNEL); + if (!array->page_list) + return -ENOMEM; + + for (i = 0; i < npage; ++i) { + array->page_list[i].page = NULL; + array->page_list[i].used = 0; + } + + return 0; +} + +void mthca_array_cleanup(struct mthca_array *array, int nent) +{ + int i; + + for (i = 0; i < (nent * sizeof (void *) + PAGE_SIZE - 1) / PAGE_SIZE; ++i) + free_page((unsigned long) array->page_list[i].page); + + kfree(array->page_list); +} + +/* + * Handling for queue buffers -- we allocate a bunch of memory and + * register it in a memory region at HCA virtual address 0. If the + * requested size is > max_direct, we split the allocation into + * multiple pages, so we don't require too much contiguous memory. + */ + +int mthca_buf_alloc(struct mthca_dev *dev, int size, int max_direct, + union mthca_buf *buf, int *is_direct, struct mthca_pd *pd, + int hca_write, struct mthca_mr *mr) +{ + int err = -ENOMEM; + int npages, shift; + u64 *dma_list = NULL; + dma_addr_t t; + int i; + + if (size <= max_direct) { + *is_direct = 1; + npages = 1; + shift = get_order(size) + PAGE_SHIFT; + + buf->direct.buf = dma_alloc_coherent(&dev->pdev->dev, + size, &t, GFP_KERNEL); + if (!buf->direct.buf) + return -ENOMEM; + + dma_unmap_addr_set(&buf->direct, mapping, t); + + memset(buf->direct.buf, 0, size); + + while (t & ((1 << shift) - 1)) { + --shift; + npages *= 2; + } + + dma_list = kmalloc(npages * sizeof *dma_list, GFP_KERNEL); + if (!dma_list) + goto err_free; + + for (i = 0; i < npages; ++i) + dma_list[i] = t + i * (1 << shift); + } else { + *is_direct = 0; + npages = (size + PAGE_SIZE - 1) / PAGE_SIZE; + shift = PAGE_SHIFT; + + dma_list = kmalloc(npages * sizeof *dma_list, GFP_KERNEL); + if (!dma_list) + return -ENOMEM; + + buf->page_list = kmalloc(npages * sizeof *buf->page_list, + GFP_KERNEL); + if (!buf->page_list) + goto err_out; + + for (i = 0; i < npages; ++i) + buf->page_list[i].buf = NULL; + + for (i = 0; i < npages; ++i) { + buf->page_list[i].buf = + dma_alloc_coherent(&dev->pdev->dev, PAGE_SIZE, + &t, GFP_KERNEL); + if (!buf->page_list[i].buf) + goto err_free; + + dma_list[i] = t; + dma_unmap_addr_set(&buf->page_list[i], mapping, t); + + clear_page(buf->page_list[i].buf); + } + } + + err = mthca_mr_alloc_phys(dev, pd->pd_num, + dma_list, shift, npages, + 0, size, + MTHCA_MPT_FLAG_LOCAL_READ | + (hca_write ? MTHCA_MPT_FLAG_LOCAL_WRITE : 0), + mr); + if (err) + goto err_free; + + kfree(dma_list); + + return 0; + +err_free: + mthca_buf_free(dev, size, buf, *is_direct, NULL); + +err_out: + kfree(dma_list); + + return err; +} + +void mthca_buf_free(struct mthca_dev *dev, int size, union mthca_buf *buf, + int is_direct, struct mthca_mr *mr) +{ + int i; + + if (mr) + mthca_free_mr(dev, mr); + + if (is_direct) + dma_free_coherent(&dev->pdev->dev, size, buf->direct.buf, + dma_unmap_addr(&buf->direct, mapping)); + else { + for (i = 0; i < (size + PAGE_SIZE - 1) / PAGE_SIZE; ++i) + dma_free_coherent(&dev->pdev->dev, PAGE_SIZE, + buf->page_list[i].buf, + dma_unmap_addr(&buf->page_list[i], + mapping)); + kfree(buf->page_list); + } +} diff --git a/drivers/infiniband/hw/mthca/mthca_av.c b/drivers/infiniband/hw/mthca/mthca_av.c new file mode 100644 index 0000000..32f6c63 --- /dev/null +++ b/drivers/infiniband/hw/mthca/mthca_av.c @@ -0,0 +1,374 @@ +/* + * Copyright (c) 2004 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include + +#include +#include + +#include "mthca_dev.h" + +enum { + MTHCA_RATE_TAVOR_FULL = 0, + MTHCA_RATE_TAVOR_1X = 1, + MTHCA_RATE_TAVOR_4X = 2, + MTHCA_RATE_TAVOR_1X_DDR = 3 +}; + +enum { + MTHCA_RATE_MEMFREE_FULL = 0, + MTHCA_RATE_MEMFREE_QUARTER = 1, + MTHCA_RATE_MEMFREE_EIGHTH = 2, + MTHCA_RATE_MEMFREE_HALF = 3 +}; + +struct mthca_av { + __be32 port_pd; + u8 reserved1; + u8 g_slid; + __be16 dlid; + u8 reserved2; + u8 gid_index; + u8 msg_sr; + u8 hop_limit; + __be32 sl_tclass_flowlabel; + __be32 dgid[4]; +}; + +static enum ib_rate memfree_rate_to_ib(u8 mthca_rate, u8 port_rate) +{ + switch (mthca_rate) { + case MTHCA_RATE_MEMFREE_EIGHTH: + return mult_to_ib_rate(port_rate >> 3); + case MTHCA_RATE_MEMFREE_QUARTER: + return mult_to_ib_rate(port_rate >> 2); + case MTHCA_RATE_MEMFREE_HALF: + return mult_to_ib_rate(port_rate >> 1); + case MTHCA_RATE_MEMFREE_FULL: + default: + return mult_to_ib_rate(port_rate); + } +} + +static enum ib_rate tavor_rate_to_ib(u8 mthca_rate, u8 port_rate) +{ + switch (mthca_rate) { + case MTHCA_RATE_TAVOR_1X: return IB_RATE_2_5_GBPS; + case MTHCA_RATE_TAVOR_1X_DDR: return IB_RATE_5_GBPS; + case MTHCA_RATE_TAVOR_4X: return IB_RATE_10_GBPS; + default: return mult_to_ib_rate(port_rate); + } +} + +enum ib_rate mthca_rate_to_ib(struct mthca_dev *dev, u8 mthca_rate, u8 port) +{ + if (mthca_is_memfree(dev)) { + /* Handle old Arbel FW */ + if (dev->limits.stat_rate_support == 0x3 && mthca_rate) + return IB_RATE_2_5_GBPS; + + return memfree_rate_to_ib(mthca_rate, dev->rate[port - 1]); + } else + return tavor_rate_to_ib(mthca_rate, dev->rate[port - 1]); +} + +static u8 ib_rate_to_memfree(u8 req_rate, u8 cur_rate) +{ + if (cur_rate <= req_rate) + return 0; + + /* + * Inter-packet delay (IPD) to get from rate X down to a rate + * no more than Y is (X - 1) / Y. + */ + switch ((cur_rate - 1) / req_rate) { + case 0: return MTHCA_RATE_MEMFREE_FULL; + case 1: return MTHCA_RATE_MEMFREE_HALF; + case 2: /* fall through */ + case 3: return MTHCA_RATE_MEMFREE_QUARTER; + default: return MTHCA_RATE_MEMFREE_EIGHTH; + } +} + +static u8 ib_rate_to_tavor(u8 static_rate) +{ + switch (static_rate) { + case IB_RATE_2_5_GBPS: return MTHCA_RATE_TAVOR_1X; + case IB_RATE_5_GBPS: return MTHCA_RATE_TAVOR_1X_DDR; + case IB_RATE_10_GBPS: return MTHCA_RATE_TAVOR_4X; + default: return MTHCA_RATE_TAVOR_FULL; + } +} + +u8 mthca_get_rate(struct mthca_dev *dev, int static_rate, u8 port) +{ + u8 rate; + + if (!static_rate || ib_rate_to_mult(static_rate) >= dev->rate[port - 1]) + return 0; + + if (mthca_is_memfree(dev)) + rate = ib_rate_to_memfree(ib_rate_to_mult(static_rate), + dev->rate[port - 1]); + else + rate = ib_rate_to_tavor(static_rate); + + if (!(dev->limits.stat_rate_support & (1 << rate))) + rate = 1; + + return rate; +} + +int mthca_create_ah(struct mthca_dev *dev, + struct mthca_pd *pd, + struct ib_ah_attr *ah_attr, + struct mthca_ah *ah) +{ + u32 index = -1; + struct mthca_av *av = NULL; + + ah->type = MTHCA_AH_PCI_POOL; + + if (mthca_is_memfree(dev)) { + ah->av = kmalloc(sizeof *ah->av, GFP_ATOMIC); + if (!ah->av) + return -ENOMEM; + + ah->type = MTHCA_AH_KMALLOC; + av = ah->av; + } else if (!atomic_read(&pd->sqp_count) && + !(dev->mthca_flags & MTHCA_FLAG_DDR_HIDDEN)) { + index = mthca_alloc(&dev->av_table.alloc); + + /* fall back to allocate in host memory */ + if (index == -1) + goto on_hca_fail; + + av = kmalloc(sizeof *av, GFP_ATOMIC); + if (!av) + goto on_hca_fail; + + ah->type = MTHCA_AH_ON_HCA; + ah->avdma = dev->av_table.ddr_av_base + + index * MTHCA_AV_SIZE; + } + +on_hca_fail: + if (ah->type == MTHCA_AH_PCI_POOL) { + ah->av = pci_pool_alloc(dev->av_table.pool, + GFP_ATOMIC, &ah->avdma); + if (!ah->av) + return -ENOMEM; + + av = ah->av; + } + + ah->key = pd->ntmr.ibmr.lkey; + + memset(av, 0, MTHCA_AV_SIZE); + + av->port_pd = cpu_to_be32(pd->pd_num | (ah_attr->port_num << 24)); + av->g_slid = ah_attr->src_path_bits; + av->dlid = cpu_to_be16(ah_attr->dlid); + av->msg_sr = (3 << 4) | /* 2K message */ + mthca_get_rate(dev, ah_attr->static_rate, ah_attr->port_num); + av->sl_tclass_flowlabel = cpu_to_be32(ah_attr->sl << 28); + if (ah_attr->ah_flags & IB_AH_GRH) { + av->g_slid |= 0x80; + av->gid_index = (ah_attr->port_num - 1) * dev->limits.gid_table_len + + ah_attr->grh.sgid_index; + av->hop_limit = ah_attr->grh.hop_limit; + av->sl_tclass_flowlabel |= + cpu_to_be32((ah_attr->grh.traffic_class << 20) | + ah_attr->grh.flow_label); + memcpy(av->dgid, ah_attr->grh.dgid.raw, 16); + } else { + /* Arbel workaround -- low byte of GID must be 2 */ + av->dgid[3] = cpu_to_be32(2); + } + + if (0) { + int j; + + mthca_dbg(dev, "Created UDAV at %p/%08lx:\n", + av, (unsigned long) ah->avdma); + for (j = 0; j < 8; ++j) + printk(KERN_DEBUG " [%2x] %08x\n", + j * 4, be32_to_cpu(((__be32 *) av)[j])); + } + + if (ah->type == MTHCA_AH_ON_HCA) { + memcpy_toio(dev->av_table.av_map + index * MTHCA_AV_SIZE, + av, MTHCA_AV_SIZE); + kfree(av); + } + + return 0; +} + +int mthca_destroy_ah(struct mthca_dev *dev, struct mthca_ah *ah) +{ + switch (ah->type) { + case MTHCA_AH_ON_HCA: + mthca_free(&dev->av_table.alloc, + (ah->avdma - dev->av_table.ddr_av_base) / + MTHCA_AV_SIZE); + break; + + case MTHCA_AH_PCI_POOL: + pci_pool_free(dev->av_table.pool, ah->av, ah->avdma); + break; + + case MTHCA_AH_KMALLOC: + kfree(ah->av); + break; + } + + return 0; +} + +int mthca_ah_grh_present(struct mthca_ah *ah) +{ + return !!(ah->av->g_slid & 0x80); +} + +int mthca_read_ah(struct mthca_dev *dev, struct mthca_ah *ah, + struct ib_ud_header *header) +{ + if (ah->type == MTHCA_AH_ON_HCA) + return -EINVAL; + + header->lrh.service_level = be32_to_cpu(ah->av->sl_tclass_flowlabel) >> 28; + header->lrh.destination_lid = ah->av->dlid; + header->lrh.source_lid = cpu_to_be16(ah->av->g_slid & 0x7f); + if (mthca_ah_grh_present(ah)) { + header->grh.traffic_class = + (be32_to_cpu(ah->av->sl_tclass_flowlabel) >> 20) & 0xff; + header->grh.flow_label = + ah->av->sl_tclass_flowlabel & cpu_to_be32(0xfffff); + header->grh.hop_limit = ah->av->hop_limit; + ib_get_cached_gid(&dev->ib_dev, + be32_to_cpu(ah->av->port_pd) >> 24, + ah->av->gid_index % dev->limits.gid_table_len, + &header->grh.source_gid); + memcpy(header->grh.destination_gid.raw, + ah->av->dgid, 16); + } + + return 0; +} + +int mthca_ah_query(struct ib_ah *ibah, struct ib_ah_attr *attr) +{ + struct mthca_ah *ah = to_mah(ibah); + struct mthca_dev *dev = to_mdev(ibah->device); + + /* Only implement for MAD and memfree ah for now. */ + if (ah->type == MTHCA_AH_ON_HCA) + return -ENOSYS; + + memset(attr, 0, sizeof *attr); + attr->dlid = be16_to_cpu(ah->av->dlid); + attr->sl = be32_to_cpu(ah->av->sl_tclass_flowlabel) >> 28; + attr->port_num = be32_to_cpu(ah->av->port_pd) >> 24; + attr->static_rate = mthca_rate_to_ib(dev, ah->av->msg_sr & 0x7, + attr->port_num); + attr->src_path_bits = ah->av->g_slid & 0x7F; + attr->ah_flags = mthca_ah_grh_present(ah) ? IB_AH_GRH : 0; + + if (attr->ah_flags) { + attr->grh.traffic_class = + be32_to_cpu(ah->av->sl_tclass_flowlabel) >> 20; + attr->grh.flow_label = + be32_to_cpu(ah->av->sl_tclass_flowlabel) & 0xfffff; + attr->grh.hop_limit = ah->av->hop_limit; + attr->grh.sgid_index = ah->av->gid_index & + (dev->limits.gid_table_len - 1); + memcpy(attr->grh.dgid.raw, ah->av->dgid, 16); + } + + return 0; +} + +int mthca_init_av_table(struct mthca_dev *dev) +{ + int err; + + if (mthca_is_memfree(dev)) + return 0; + + err = mthca_alloc_init(&dev->av_table.alloc, + dev->av_table.num_ddr_avs, + dev->av_table.num_ddr_avs - 1, + 0); + if (err) + return err; + + dev->av_table.pool = pci_pool_create("mthca_av", dev->pdev, + MTHCA_AV_SIZE, + MTHCA_AV_SIZE, 0); + if (!dev->av_table.pool) + goto out_free_alloc; + + if (!(dev->mthca_flags & MTHCA_FLAG_DDR_HIDDEN)) { + dev->av_table.av_map = ioremap(pci_resource_start(dev->pdev, 4) + + dev->av_table.ddr_av_base - + dev->ddr_start, + dev->av_table.num_ddr_avs * + MTHCA_AV_SIZE); + if (!dev->av_table.av_map) + goto out_free_pool; + } else + dev->av_table.av_map = NULL; + + return 0; + + out_free_pool: + pci_pool_destroy(dev->av_table.pool); + + out_free_alloc: + mthca_alloc_cleanup(&dev->av_table.alloc); + return -ENOMEM; +} + +void mthca_cleanup_av_table(struct mthca_dev *dev) +{ + if (mthca_is_memfree(dev)) + return; + + if (dev->av_table.av_map) + iounmap(dev->av_table.av_map); + pci_pool_destroy(dev->av_table.pool); + mthca_alloc_cleanup(&dev->av_table.alloc); +} diff --git a/drivers/infiniband/hw/mthca/mthca_catas.c b/drivers/infiniband/hw/mthca/mthca_catas.c new file mode 100644 index 0000000..712d2a3 --- /dev/null +++ b/drivers/infiniband/hw/mthca/mthca_catas.c @@ -0,0 +1,200 @@ +/* + * Copyright (c) 2005 Cisco Systems. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include + +#include "mthca_dev.h" + +enum { + MTHCA_CATAS_POLL_INTERVAL = 5 * HZ, + + MTHCA_CATAS_TYPE_INTERNAL = 0, + MTHCA_CATAS_TYPE_UPLINK = 3, + MTHCA_CATAS_TYPE_DDR = 4, + MTHCA_CATAS_TYPE_PARITY = 5, +}; + +static DEFINE_SPINLOCK(catas_lock); + +static LIST_HEAD(catas_list); +static struct workqueue_struct *catas_wq; +static struct work_struct catas_work; + +static int catas_reset_disable; +module_param_named(catas_reset_disable, catas_reset_disable, int, 0644); +MODULE_PARM_DESC(catas_reset_disable, "disable reset on catastrophic event if nonzero"); + +static void catas_reset(struct work_struct *work) +{ + struct mthca_dev *dev, *tmpdev; + LIST_HEAD(tlist); + int ret; + + mutex_lock(&mthca_device_mutex); + + spin_lock_irq(&catas_lock); + list_splice_init(&catas_list, &tlist); + spin_unlock_irq(&catas_lock); + + list_for_each_entry_safe(dev, tmpdev, &tlist, catas_err.list) { + struct pci_dev *pdev = dev->pdev; + ret = __mthca_restart_one(dev->pdev); + /* 'dev' now is not valid */ + if (ret) + printk(KERN_ERR "mthca %s: Reset failed (%d)\n", + pci_name(pdev), ret); + else { + struct mthca_dev *d = pci_get_drvdata(pdev); + mthca_dbg(d, "Reset succeeded\n"); + } + } + + mutex_unlock(&mthca_device_mutex); +} + +static void handle_catas(struct mthca_dev *dev) +{ + struct ib_event event; + unsigned long flags; + const char *type; + int i; + + event.device = &dev->ib_dev; + event.event = IB_EVENT_DEVICE_FATAL; + event.element.port_num = 0; + dev->active = false; + + ib_dispatch_event(&event); + + switch (swab32(readl(dev->catas_err.map)) >> 24) { + case MTHCA_CATAS_TYPE_INTERNAL: + type = "internal error"; + break; + case MTHCA_CATAS_TYPE_UPLINK: + type = "uplink bus error"; + break; + case MTHCA_CATAS_TYPE_DDR: + type = "DDR data error"; + break; + case MTHCA_CATAS_TYPE_PARITY: + type = "internal parity error"; + break; + default: + type = "unknown error"; + break; + } + + mthca_err(dev, "Catastrophic error detected: %s\n", type); + for (i = 0; i < dev->catas_err.size; ++i) + mthca_err(dev, " buf[%02x]: %08x\n", + i, swab32(readl(dev->catas_err.map + i))); + + if (catas_reset_disable) + return; + + spin_lock_irqsave(&catas_lock, flags); + list_add(&dev->catas_err.list, &catas_list); + queue_work(catas_wq, &catas_work); + spin_unlock_irqrestore(&catas_lock, flags); +} + +static void poll_catas(unsigned long dev_ptr) +{ + struct mthca_dev *dev = (struct mthca_dev *) dev_ptr; + int i; + + for (i = 0; i < dev->catas_err.size; ++i) + if (readl(dev->catas_err.map + i)) { + handle_catas(dev); + return; + } + + mod_timer(&dev->catas_err.timer, + round_jiffies(jiffies + MTHCA_CATAS_POLL_INTERVAL)); +} + +void mthca_start_catas_poll(struct mthca_dev *dev) +{ + phys_addr_t addr; + + init_timer(&dev->catas_err.timer); + dev->catas_err.map = NULL; + + addr = pci_resource_start(dev->pdev, 0) + + ((pci_resource_len(dev->pdev, 0) - 1) & + dev->catas_err.addr); + + dev->catas_err.map = ioremap(addr, dev->catas_err.size * 4); + if (!dev->catas_err.map) { + mthca_warn(dev, "couldn't map catastrophic error region " + "at 0x%llx/0x%x\n", (unsigned long long) addr, + dev->catas_err.size * 4); + return; + } + + dev->catas_err.timer.data = (unsigned long) dev; + dev->catas_err.timer.function = poll_catas; + dev->catas_err.timer.expires = jiffies + MTHCA_CATAS_POLL_INTERVAL; + INIT_LIST_HEAD(&dev->catas_err.list); + add_timer(&dev->catas_err.timer); +} + +void mthca_stop_catas_poll(struct mthca_dev *dev) +{ + del_timer_sync(&dev->catas_err.timer); + + if (dev->catas_err.map) + iounmap(dev->catas_err.map); + + spin_lock_irq(&catas_lock); + list_del(&dev->catas_err.list); + spin_unlock_irq(&catas_lock); +} + +int __init mthca_catas_init(void) +{ + INIT_WORK(&catas_work, catas_reset); + + catas_wq = create_singlethread_workqueue("mthca_catas"); + if (!catas_wq) + return -ENOMEM; + + return 0; +} + +void mthca_catas_cleanup(void) +{ + destroy_workqueue(catas_wq); +} diff --git a/drivers/infiniband/hw/mthca/mthca_cmd.c b/drivers/infiniband/hw/mthca/mthca_cmd.c new file mode 100644 index 0000000..9d3e5c1 --- /dev/null +++ b/drivers/infiniband/hw/mthca/mthca_cmd.c @@ -0,0 +1,1969 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies. All rights reserved. + * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "mthca_dev.h" +#include "mthca_config_reg.h" +#include "mthca_cmd.h" +#include "mthca_memfree.h" + +#define CMD_POLL_TOKEN 0xffff + +enum { + HCR_IN_PARAM_OFFSET = 0x00, + HCR_IN_MODIFIER_OFFSET = 0x08, + HCR_OUT_PARAM_OFFSET = 0x0c, + HCR_TOKEN_OFFSET = 0x14, + HCR_STATUS_OFFSET = 0x18, + + HCR_OPMOD_SHIFT = 12, + HCA_E_BIT = 22, + HCR_GO_BIT = 23 +}; + +enum { + /* initialization and general commands */ + CMD_SYS_EN = 0x1, + CMD_SYS_DIS = 0x2, + CMD_MAP_FA = 0xfff, + CMD_UNMAP_FA = 0xffe, + CMD_RUN_FW = 0xff6, + CMD_MOD_STAT_CFG = 0x34, + CMD_QUERY_DEV_LIM = 0x3, + CMD_QUERY_FW = 0x4, + CMD_ENABLE_LAM = 0xff8, + CMD_DISABLE_LAM = 0xff7, + CMD_QUERY_DDR = 0x5, + CMD_QUERY_ADAPTER = 0x6, + CMD_INIT_HCA = 0x7, + CMD_CLOSE_HCA = 0x8, + CMD_INIT_IB = 0x9, + CMD_CLOSE_IB = 0xa, + CMD_QUERY_HCA = 0xb, + CMD_SET_IB = 0xc, + CMD_ACCESS_DDR = 0x2e, + CMD_MAP_ICM = 0xffa, + CMD_UNMAP_ICM = 0xff9, + CMD_MAP_ICM_AUX = 0xffc, + CMD_UNMAP_ICM_AUX = 0xffb, + CMD_SET_ICM_SIZE = 0xffd, + + /* TPT commands */ + CMD_SW2HW_MPT = 0xd, + CMD_QUERY_MPT = 0xe, + CMD_HW2SW_MPT = 0xf, + CMD_READ_MTT = 0x10, + CMD_WRITE_MTT = 0x11, + CMD_SYNC_TPT = 0x2f, + + /* EQ commands */ + CMD_MAP_EQ = 0x12, + CMD_SW2HW_EQ = 0x13, + CMD_HW2SW_EQ = 0x14, + CMD_QUERY_EQ = 0x15, + + /* CQ commands */ + CMD_SW2HW_CQ = 0x16, + CMD_HW2SW_CQ = 0x17, + CMD_QUERY_CQ = 0x18, + CMD_RESIZE_CQ = 0x2c, + + /* SRQ commands */ + CMD_SW2HW_SRQ = 0x35, + CMD_HW2SW_SRQ = 0x36, + CMD_QUERY_SRQ = 0x37, + CMD_ARM_SRQ = 0x40, + + /* QP/EE commands */ + CMD_RST2INIT_QPEE = 0x19, + CMD_INIT2RTR_QPEE = 0x1a, + CMD_RTR2RTS_QPEE = 0x1b, + CMD_RTS2RTS_QPEE = 0x1c, + CMD_SQERR2RTS_QPEE = 0x1d, + CMD_2ERR_QPEE = 0x1e, + CMD_RTS2SQD_QPEE = 0x1f, + CMD_SQD2SQD_QPEE = 0x38, + CMD_SQD2RTS_QPEE = 0x20, + CMD_ERR2RST_QPEE = 0x21, + CMD_QUERY_QPEE = 0x22, + CMD_INIT2INIT_QPEE = 0x2d, + CMD_SUSPEND_QPEE = 0x32, + CMD_UNSUSPEND_QPEE = 0x33, + /* special QPs and management commands */ + CMD_CONF_SPECIAL_QP = 0x23, + CMD_MAD_IFC = 0x24, + + /* multicast commands */ + CMD_READ_MGM = 0x25, + CMD_WRITE_MGM = 0x26, + CMD_MGID_HASH = 0x27, + + /* miscellaneous commands */ + CMD_DIAG_RPRT = 0x30, + CMD_NOP = 0x31, + + /* debug commands */ + CMD_QUERY_DEBUG_MSG = 0x2a, + CMD_SET_DEBUG_MSG = 0x2b, +}; + +/* + * According to Mellanox code, FW may be starved and never complete + * commands. So we can't use strict timeouts described in PRM -- we + * just arbitrarily select 60 seconds for now. + */ +#if 0 +/* + * Round up and add 1 to make sure we get the full wait time (since we + * will be starting in the middle of a jiffy) + */ +enum { + CMD_TIME_CLASS_A = (HZ + 999) / 1000 + 1, + CMD_TIME_CLASS_B = (HZ + 99) / 100 + 1, + CMD_TIME_CLASS_C = (HZ + 9) / 10 + 1, + CMD_TIME_CLASS_D = 60 * HZ +}; +#else +enum { + CMD_TIME_CLASS_A = 60 * HZ, + CMD_TIME_CLASS_B = 60 * HZ, + CMD_TIME_CLASS_C = 60 * HZ, + CMD_TIME_CLASS_D = 60 * HZ +}; +#endif + +enum { + GO_BIT_TIMEOUT = HZ * 10 +}; + +struct mthca_cmd_context { + struct completion done; + int result; + int next; + u64 out_param; + u16 token; + u8 status; +}; + +static int fw_cmd_doorbell = 0; +module_param(fw_cmd_doorbell, int, 0644); +MODULE_PARM_DESC(fw_cmd_doorbell, "post FW commands through doorbell page if nonzero " + "(and supported by FW)"); + +static inline int go_bit(struct mthca_dev *dev) +{ + return readl(dev->hcr + HCR_STATUS_OFFSET) & + swab32(1 << HCR_GO_BIT); +} + +static void mthca_cmd_post_dbell(struct mthca_dev *dev, + u64 in_param, + u64 out_param, + u32 in_modifier, + u8 op_modifier, + u16 op, + u16 token) +{ + void __iomem *ptr = dev->cmd.dbell_map; + u16 *offs = dev->cmd.dbell_offsets; + + __raw_writel((__force u32) cpu_to_be32(in_param >> 32), ptr + offs[0]); + wmb(); + __raw_writel((__force u32) cpu_to_be32(in_param & 0xfffffffful), ptr + offs[1]); + wmb(); + __raw_writel((__force u32) cpu_to_be32(in_modifier), ptr + offs[2]); + wmb(); + __raw_writel((__force u32) cpu_to_be32(out_param >> 32), ptr + offs[3]); + wmb(); + __raw_writel((__force u32) cpu_to_be32(out_param & 0xfffffffful), ptr + offs[4]); + wmb(); + __raw_writel((__force u32) cpu_to_be32(token << 16), ptr + offs[5]); + wmb(); + __raw_writel((__force u32) cpu_to_be32((1 << HCR_GO_BIT) | + (1 << HCA_E_BIT) | + (op_modifier << HCR_OPMOD_SHIFT) | + op), ptr + offs[6]); + wmb(); + __raw_writel((__force u32) 0, ptr + offs[7]); + wmb(); +} + +static int mthca_cmd_post_hcr(struct mthca_dev *dev, + u64 in_param, + u64 out_param, + u32 in_modifier, + u8 op_modifier, + u16 op, + u16 token, + int event) +{ + if (event) { + unsigned long end = jiffies + GO_BIT_TIMEOUT; + + while (go_bit(dev) && time_before(jiffies, end)) { + set_current_state(TASK_RUNNING); + schedule(); + } + } + + if (go_bit(dev)) + return -EAGAIN; + + /* + * We use writel (instead of something like memcpy_toio) + * because writes of less than 32 bits to the HCR don't work + * (and some architectures such as ia64 implement memcpy_toio + * in terms of writeb). + */ + __raw_writel((__force u32) cpu_to_be32(in_param >> 32), dev->hcr + 0 * 4); + __raw_writel((__force u32) cpu_to_be32(in_param & 0xfffffffful), dev->hcr + 1 * 4); + __raw_writel((__force u32) cpu_to_be32(in_modifier), dev->hcr + 2 * 4); + __raw_writel((__force u32) cpu_to_be32(out_param >> 32), dev->hcr + 3 * 4); + __raw_writel((__force u32) cpu_to_be32(out_param & 0xfffffffful), dev->hcr + 4 * 4); + __raw_writel((__force u32) cpu_to_be32(token << 16), dev->hcr + 5 * 4); + + /* __raw_writel may not order writes. */ + wmb(); + + __raw_writel((__force u32) cpu_to_be32((1 << HCR_GO_BIT) | + (event ? (1 << HCA_E_BIT) : 0) | + (op_modifier << HCR_OPMOD_SHIFT) | + op), dev->hcr + 6 * 4); + + return 0; +} + +static int mthca_cmd_post(struct mthca_dev *dev, + u64 in_param, + u64 out_param, + u32 in_modifier, + u8 op_modifier, + u16 op, + u16 token, + int event) +{ + int err = 0; + + mutex_lock(&dev->cmd.hcr_mutex); + + if (event && dev->cmd.flags & MTHCA_CMD_POST_DOORBELLS && fw_cmd_doorbell) + mthca_cmd_post_dbell(dev, in_param, out_param, in_modifier, + op_modifier, op, token); + else + err = mthca_cmd_post_hcr(dev, in_param, out_param, in_modifier, + op_modifier, op, token, event); + + /* + * Make sure that our HCR writes don't get mixed in with + * writes from another CPU starting a FW command. + */ + mmiowb(); + + mutex_unlock(&dev->cmd.hcr_mutex); + return err; +} + + +static int mthca_status_to_errno(u8 status) +{ + static const int trans_table[] = { + [MTHCA_CMD_STAT_INTERNAL_ERR] = -EIO, + [MTHCA_CMD_STAT_BAD_OP] = -EPERM, + [MTHCA_CMD_STAT_BAD_PARAM] = -EINVAL, + [MTHCA_CMD_STAT_BAD_SYS_STATE] = -ENXIO, + [MTHCA_CMD_STAT_BAD_RESOURCE] = -EBADF, + [MTHCA_CMD_STAT_RESOURCE_BUSY] = -EBUSY, + [MTHCA_CMD_STAT_DDR_MEM_ERR] = -ENOMEM, + [MTHCA_CMD_STAT_EXCEED_LIM] = -ENOMEM, + [MTHCA_CMD_STAT_BAD_RES_STATE] = -EBADF, + [MTHCA_CMD_STAT_BAD_INDEX] = -EBADF, + [MTHCA_CMD_STAT_BAD_NVMEM] = -EFAULT, + [MTHCA_CMD_STAT_BAD_QPEE_STATE] = -EINVAL, + [MTHCA_CMD_STAT_BAD_SEG_PARAM] = -EFAULT, + [MTHCA_CMD_STAT_REG_BOUND] = -EBUSY, + [MTHCA_CMD_STAT_LAM_NOT_PRE] = -EAGAIN, + [MTHCA_CMD_STAT_BAD_PKT] = -EBADMSG, + [MTHCA_CMD_STAT_BAD_SIZE] = -ENOMEM, + }; + + if (status >= ARRAY_SIZE(trans_table) || + (status != MTHCA_CMD_STAT_OK + && trans_table[status] == 0)) + return -EINVAL; + + return trans_table[status]; +} + + +static int mthca_cmd_poll(struct mthca_dev *dev, + u64 in_param, + u64 *out_param, + int out_is_imm, + u32 in_modifier, + u8 op_modifier, + u16 op, + unsigned long timeout) +{ + int err = 0; + unsigned long end; + u8 status; + + down(&dev->cmd.poll_sem); + + err = mthca_cmd_post(dev, in_param, + out_param ? *out_param : 0, + in_modifier, op_modifier, + op, CMD_POLL_TOKEN, 0); + if (err) + goto out; + + end = timeout + jiffies; + while (go_bit(dev) && time_before(jiffies, end)) { + set_current_state(TASK_RUNNING); + schedule(); + } + + if (go_bit(dev)) { + err = -EBUSY; + goto out; + } + + if (out_is_imm) + *out_param = + (u64) be32_to_cpu((__force __be32) + __raw_readl(dev->hcr + HCR_OUT_PARAM_OFFSET)) << 32 | + (u64) be32_to_cpu((__force __be32) + __raw_readl(dev->hcr + HCR_OUT_PARAM_OFFSET + 4)); + + status = be32_to_cpu((__force __be32) __raw_readl(dev->hcr + HCR_STATUS_OFFSET)) >> 24; + if (status) { + mthca_dbg(dev, "Command %02x completed with status %02x\n", + op, status); + err = mthca_status_to_errno(status); + } + +out: + up(&dev->cmd.poll_sem); + return err; +} + +void mthca_cmd_event(struct mthca_dev *dev, + u16 token, + u8 status, + u64 out_param) +{ + struct mthca_cmd_context *context = + &dev->cmd.context[token & dev->cmd.token_mask]; + + /* previously timed out command completing at long last */ + if (token != context->token) + return; + + context->result = 0; + context->status = status; + context->out_param = out_param; + + complete(&context->done); +} + +static int mthca_cmd_wait(struct mthca_dev *dev, + u64 in_param, + u64 *out_param, + int out_is_imm, + u32 in_modifier, + u8 op_modifier, + u16 op, + unsigned long timeout) +{ + int err = 0; + struct mthca_cmd_context *context; + + down(&dev->cmd.event_sem); + + spin_lock(&dev->cmd.context_lock); + BUG_ON(dev->cmd.free_head < 0); + context = &dev->cmd.context[dev->cmd.free_head]; + context->token += dev->cmd.token_mask + 1; + dev->cmd.free_head = context->next; + spin_unlock(&dev->cmd.context_lock); + + init_completion(&context->done); + + err = mthca_cmd_post(dev, in_param, + out_param ? *out_param : 0, + in_modifier, op_modifier, + op, context->token, 1); + if (err) + goto out; + + if (!wait_for_completion_timeout(&context->done, timeout)) { + err = -EBUSY; + goto out; + } + + err = context->result; + if (err) + goto out; + + if (context->status) { + mthca_dbg(dev, "Command %02x completed with status %02x\n", + op, context->status); + err = mthca_status_to_errno(context->status); + } + + if (out_is_imm) + *out_param = context->out_param; + +out: + spin_lock(&dev->cmd.context_lock); + context->next = dev->cmd.free_head; + dev->cmd.free_head = context - dev->cmd.context; + spin_unlock(&dev->cmd.context_lock); + + up(&dev->cmd.event_sem); + return err; +} + +/* Invoke a command with an output mailbox */ +static int mthca_cmd_box(struct mthca_dev *dev, + u64 in_param, + u64 out_param, + u32 in_modifier, + u8 op_modifier, + u16 op, + unsigned long timeout) +{ + if (dev->cmd.flags & MTHCA_CMD_USE_EVENTS) + return mthca_cmd_wait(dev, in_param, &out_param, 0, + in_modifier, op_modifier, op, + timeout); + else + return mthca_cmd_poll(dev, in_param, &out_param, 0, + in_modifier, op_modifier, op, + timeout); +} + +/* Invoke a command with no output parameter */ +static int mthca_cmd(struct mthca_dev *dev, + u64 in_param, + u32 in_modifier, + u8 op_modifier, + u16 op, + unsigned long timeout) +{ + return mthca_cmd_box(dev, in_param, 0, in_modifier, + op_modifier, op, timeout); +} + +/* + * Invoke a command with an immediate output parameter (and copy the + * output into the caller's out_param pointer after the command + * executes). + */ +static int mthca_cmd_imm(struct mthca_dev *dev, + u64 in_param, + u64 *out_param, + u32 in_modifier, + u8 op_modifier, + u16 op, + unsigned long timeout) +{ + if (dev->cmd.flags & MTHCA_CMD_USE_EVENTS) + return mthca_cmd_wait(dev, in_param, out_param, 1, + in_modifier, op_modifier, op, + timeout); + else + return mthca_cmd_poll(dev, in_param, out_param, 1, + in_modifier, op_modifier, op, + timeout); +} + +int mthca_cmd_init(struct mthca_dev *dev) +{ + mutex_init(&dev->cmd.hcr_mutex); + sema_init(&dev->cmd.poll_sem, 1); + dev->cmd.flags = 0; + + dev->hcr = ioremap(pci_resource_start(dev->pdev, 0) + MTHCA_HCR_BASE, + MTHCA_HCR_SIZE); + if (!dev->hcr) { + mthca_err(dev, "Couldn't map command register."); + return -ENOMEM; + } + + dev->cmd.pool = pci_pool_create("mthca_cmd", dev->pdev, + MTHCA_MAILBOX_SIZE, + MTHCA_MAILBOX_SIZE, 0); + if (!dev->cmd.pool) { + iounmap(dev->hcr); + return -ENOMEM; + } + + return 0; +} + +void mthca_cmd_cleanup(struct mthca_dev *dev) +{ + pci_pool_destroy(dev->cmd.pool); + iounmap(dev->hcr); + if (dev->cmd.flags & MTHCA_CMD_POST_DOORBELLS) + iounmap(dev->cmd.dbell_map); +} + +/* + * Switch to using events to issue FW commands (should be called after + * event queue to command events has been initialized). + */ +int mthca_cmd_use_events(struct mthca_dev *dev) +{ + int i; + + dev->cmd.context = kmalloc(dev->cmd.max_cmds * + sizeof (struct mthca_cmd_context), + GFP_KERNEL); + if (!dev->cmd.context) + return -ENOMEM; + + for (i = 0; i < dev->cmd.max_cmds; ++i) { + dev->cmd.context[i].token = i; + dev->cmd.context[i].next = i + 1; + } + + dev->cmd.context[dev->cmd.max_cmds - 1].next = -1; + dev->cmd.free_head = 0; + + sema_init(&dev->cmd.event_sem, dev->cmd.max_cmds); + spin_lock_init(&dev->cmd.context_lock); + + for (dev->cmd.token_mask = 1; + dev->cmd.token_mask < dev->cmd.max_cmds; + dev->cmd.token_mask <<= 1) + ; /* nothing */ + --dev->cmd.token_mask; + + dev->cmd.flags |= MTHCA_CMD_USE_EVENTS; + + down(&dev->cmd.poll_sem); + + return 0; +} + +/* + * Switch back to polling (used when shutting down the device) + */ +void mthca_cmd_use_polling(struct mthca_dev *dev) +{ + int i; + + dev->cmd.flags &= ~MTHCA_CMD_USE_EVENTS; + + for (i = 0; i < dev->cmd.max_cmds; ++i) + down(&dev->cmd.event_sem); + + kfree(dev->cmd.context); + + up(&dev->cmd.poll_sem); +} + +struct mthca_mailbox *mthca_alloc_mailbox(struct mthca_dev *dev, + gfp_t gfp_mask) +{ + struct mthca_mailbox *mailbox; + + mailbox = kmalloc(sizeof *mailbox, gfp_mask); + if (!mailbox) + return ERR_PTR(-ENOMEM); + + mailbox->buf = pci_pool_alloc(dev->cmd.pool, gfp_mask, &mailbox->dma); + if (!mailbox->buf) { + kfree(mailbox); + return ERR_PTR(-ENOMEM); + } + + return mailbox; +} + +void mthca_free_mailbox(struct mthca_dev *dev, struct mthca_mailbox *mailbox) +{ + if (!mailbox) + return; + + pci_pool_free(dev->cmd.pool, mailbox->buf, mailbox->dma); + kfree(mailbox); +} + +int mthca_SYS_EN(struct mthca_dev *dev) +{ + u64 out; + int ret; + + ret = mthca_cmd_imm(dev, 0, &out, 0, 0, CMD_SYS_EN, CMD_TIME_CLASS_D); + + if (ret == -ENOMEM) + mthca_warn(dev, "SYS_EN DDR error: syn=%x, sock=%d, " + "sladdr=%d, SPD source=%s\n", + (int) (out >> 6) & 0xf, (int) (out >> 4) & 3, + (int) (out >> 1) & 7, (int) out & 1 ? "NVMEM" : "DIMM"); + + return ret; +} + +int mthca_SYS_DIS(struct mthca_dev *dev) +{ + return mthca_cmd(dev, 0, 0, 0, CMD_SYS_DIS, CMD_TIME_CLASS_C); +} + +static int mthca_map_cmd(struct mthca_dev *dev, u16 op, struct mthca_icm *icm, + u64 virt) +{ + struct mthca_mailbox *mailbox; + struct mthca_icm_iter iter; + __be64 *pages; + int lg; + int nent = 0; + int i; + int err = 0; + int ts = 0, tc = 0; + + mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + memset(mailbox->buf, 0, MTHCA_MAILBOX_SIZE); + pages = mailbox->buf; + + for (mthca_icm_first(icm, &iter); + !mthca_icm_last(&iter); + mthca_icm_next(&iter)) { + /* + * We have to pass pages that are aligned to their + * size, so find the least significant 1 in the + * address or size and use that as our log2 size. + */ + lg = ffs(mthca_icm_addr(&iter) | mthca_icm_size(&iter)) - 1; + if (lg < MTHCA_ICM_PAGE_SHIFT) { + mthca_warn(dev, "Got FW area not aligned to %d (%llx/%lx).\n", + MTHCA_ICM_PAGE_SIZE, + (unsigned long long) mthca_icm_addr(&iter), + mthca_icm_size(&iter)); + err = -EINVAL; + goto out; + } + for (i = 0; i < mthca_icm_size(&iter) >> lg; ++i) { + if (virt != -1) { + pages[nent * 2] = cpu_to_be64(virt); + virt += 1 << lg; + } + + pages[nent * 2 + 1] = + cpu_to_be64((mthca_icm_addr(&iter) + (i << lg)) | + (lg - MTHCA_ICM_PAGE_SHIFT)); + ts += 1 << (lg - 10); + ++tc; + + if (++nent == MTHCA_MAILBOX_SIZE / 16) { + err = mthca_cmd(dev, mailbox->dma, nent, 0, op, + CMD_TIME_CLASS_B); + if (err) + goto out; + nent = 0; + } + } + } + + if (nent) + err = mthca_cmd(dev, mailbox->dma, nent, 0, op, + CMD_TIME_CLASS_B); + + switch (op) { + case CMD_MAP_FA: + mthca_dbg(dev, "Mapped %d chunks/%d KB for FW.\n", tc, ts); + break; + case CMD_MAP_ICM_AUX: + mthca_dbg(dev, "Mapped %d chunks/%d KB for ICM aux.\n", tc, ts); + break; + case CMD_MAP_ICM: + mthca_dbg(dev, "Mapped %d chunks/%d KB at %llx for ICM.\n", + tc, ts, (unsigned long long) virt - (ts << 10)); + break; + } + +out: + mthca_free_mailbox(dev, mailbox); + return err; +} + +int mthca_MAP_FA(struct mthca_dev *dev, struct mthca_icm *icm) +{ + return mthca_map_cmd(dev, CMD_MAP_FA, icm, -1); +} + +int mthca_UNMAP_FA(struct mthca_dev *dev) +{ + return mthca_cmd(dev, 0, 0, 0, CMD_UNMAP_FA, CMD_TIME_CLASS_B); +} + +int mthca_RUN_FW(struct mthca_dev *dev) +{ + return mthca_cmd(dev, 0, 0, 0, CMD_RUN_FW, CMD_TIME_CLASS_A); +} + +static void mthca_setup_cmd_doorbells(struct mthca_dev *dev, u64 base) +{ + phys_addr_t addr; + u16 max_off = 0; + int i; + + for (i = 0; i < 8; ++i) + max_off = max(max_off, dev->cmd.dbell_offsets[i]); + + if ((base & PAGE_MASK) != ((base + max_off) & PAGE_MASK)) { + mthca_warn(dev, "Firmware doorbell region at 0x%016llx, " + "length 0x%x crosses a page boundary\n", + (unsigned long long) base, max_off); + return; + } + + addr = pci_resource_start(dev->pdev, 2) + + ((pci_resource_len(dev->pdev, 2) - 1) & base); + dev->cmd.dbell_map = ioremap(addr, max_off + sizeof(u32)); + if (!dev->cmd.dbell_map) + return; + + dev->cmd.flags |= MTHCA_CMD_POST_DOORBELLS; + mthca_dbg(dev, "Mapped doorbell page for posting FW commands\n"); +} + +int mthca_QUERY_FW(struct mthca_dev *dev) +{ + struct mthca_mailbox *mailbox; + u32 *outbox; + u64 base; + u32 tmp; + int err = 0; + u8 lg; + int i; + +#define QUERY_FW_OUT_SIZE 0x100 +#define QUERY_FW_VER_OFFSET 0x00 +#define QUERY_FW_MAX_CMD_OFFSET 0x0f +#define QUERY_FW_ERR_START_OFFSET 0x30 +#define QUERY_FW_ERR_SIZE_OFFSET 0x38 + +#define QUERY_FW_CMD_DB_EN_OFFSET 0x10 +#define QUERY_FW_CMD_DB_OFFSET 0x50 +#define QUERY_FW_CMD_DB_BASE 0x60 + +#define QUERY_FW_START_OFFSET 0x20 +#define QUERY_FW_END_OFFSET 0x28 + +#define QUERY_FW_SIZE_OFFSET 0x00 +#define QUERY_FW_CLR_INT_BASE_OFFSET 0x20 +#define QUERY_FW_EQ_ARM_BASE_OFFSET 0x40 +#define QUERY_FW_EQ_SET_CI_BASE_OFFSET 0x48 + + mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + outbox = mailbox->buf; + + err = mthca_cmd_box(dev, 0, mailbox->dma, 0, 0, CMD_QUERY_FW, + CMD_TIME_CLASS_A); + + if (err) + goto out; + + MTHCA_GET(dev->fw_ver, outbox, QUERY_FW_VER_OFFSET); + /* + * FW subminor version is at more significant bits than minor + * version, so swap here. + */ + dev->fw_ver = (dev->fw_ver & 0xffff00000000ull) | + ((dev->fw_ver & 0xffff0000ull) >> 16) | + ((dev->fw_ver & 0x0000ffffull) << 16); + + MTHCA_GET(lg, outbox, QUERY_FW_MAX_CMD_OFFSET); + dev->cmd.max_cmds = 1 << lg; + + mthca_dbg(dev, "FW version %012llx, max commands %d\n", + (unsigned long long) dev->fw_ver, dev->cmd.max_cmds); + + MTHCA_GET(dev->catas_err.addr, outbox, QUERY_FW_ERR_START_OFFSET); + MTHCA_GET(dev->catas_err.size, outbox, QUERY_FW_ERR_SIZE_OFFSET); + + mthca_dbg(dev, "Catastrophic error buffer at 0x%llx, size 0x%x\n", + (unsigned long long) dev->catas_err.addr, dev->catas_err.size); + + MTHCA_GET(tmp, outbox, QUERY_FW_CMD_DB_EN_OFFSET); + if (tmp & 0x1) { + mthca_dbg(dev, "FW supports commands through doorbells\n"); + + MTHCA_GET(base, outbox, QUERY_FW_CMD_DB_BASE); + for (i = 0; i < MTHCA_CMD_NUM_DBELL_DWORDS; ++i) + MTHCA_GET(dev->cmd.dbell_offsets[i], outbox, + QUERY_FW_CMD_DB_OFFSET + (i << 1)); + + mthca_setup_cmd_doorbells(dev, base); + } + + if (mthca_is_memfree(dev)) { + MTHCA_GET(dev->fw.arbel.fw_pages, outbox, QUERY_FW_SIZE_OFFSET); + MTHCA_GET(dev->fw.arbel.clr_int_base, outbox, QUERY_FW_CLR_INT_BASE_OFFSET); + MTHCA_GET(dev->fw.arbel.eq_arm_base, outbox, QUERY_FW_EQ_ARM_BASE_OFFSET); + MTHCA_GET(dev->fw.arbel.eq_set_ci_base, outbox, QUERY_FW_EQ_SET_CI_BASE_OFFSET); + mthca_dbg(dev, "FW size %d KB\n", dev->fw.arbel.fw_pages << 2); + + /* + * Round up number of system pages needed in case + * MTHCA_ICM_PAGE_SIZE < PAGE_SIZE. + */ + dev->fw.arbel.fw_pages = + ALIGN(dev->fw.arbel.fw_pages, PAGE_SIZE / MTHCA_ICM_PAGE_SIZE) >> + (PAGE_SHIFT - MTHCA_ICM_PAGE_SHIFT); + + mthca_dbg(dev, "Clear int @ %llx, EQ arm @ %llx, EQ set CI @ %llx\n", + (unsigned long long) dev->fw.arbel.clr_int_base, + (unsigned long long) dev->fw.arbel.eq_arm_base, + (unsigned long long) dev->fw.arbel.eq_set_ci_base); + } else { + MTHCA_GET(dev->fw.tavor.fw_start, outbox, QUERY_FW_START_OFFSET); + MTHCA_GET(dev->fw.tavor.fw_end, outbox, QUERY_FW_END_OFFSET); + + mthca_dbg(dev, "FW size %d KB (start %llx, end %llx)\n", + (int) ((dev->fw.tavor.fw_end - dev->fw.tavor.fw_start) >> 10), + (unsigned long long) dev->fw.tavor.fw_start, + (unsigned long long) dev->fw.tavor.fw_end); + } + +out: + mthca_free_mailbox(dev, mailbox); + return err; +} + +int mthca_ENABLE_LAM(struct mthca_dev *dev) +{ + struct mthca_mailbox *mailbox; + u8 info; + u32 *outbox; + int err = 0; + +#define ENABLE_LAM_OUT_SIZE 0x100 +#define ENABLE_LAM_START_OFFSET 0x00 +#define ENABLE_LAM_END_OFFSET 0x08 +#define ENABLE_LAM_INFO_OFFSET 0x13 + +#define ENABLE_LAM_INFO_HIDDEN_FLAG (1 << 4) +#define ENABLE_LAM_INFO_ECC_MASK 0x3 + + mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + outbox = mailbox->buf; + + err = mthca_cmd_box(dev, 0, mailbox->dma, 0, 0, CMD_ENABLE_LAM, + CMD_TIME_CLASS_C); + + if (err) + goto out; + + MTHCA_GET(dev->ddr_start, outbox, ENABLE_LAM_START_OFFSET); + MTHCA_GET(dev->ddr_end, outbox, ENABLE_LAM_END_OFFSET); + MTHCA_GET(info, outbox, ENABLE_LAM_INFO_OFFSET); + + if (!!(info & ENABLE_LAM_INFO_HIDDEN_FLAG) != + !!(dev->mthca_flags & MTHCA_FLAG_DDR_HIDDEN)) { + mthca_info(dev, "FW reports that HCA-attached memory " + "is %s hidden; does not match PCI config\n", + (info & ENABLE_LAM_INFO_HIDDEN_FLAG) ? + "" : "not"); + } + if (info & ENABLE_LAM_INFO_HIDDEN_FLAG) + mthca_dbg(dev, "HCA-attached memory is hidden.\n"); + + mthca_dbg(dev, "HCA memory size %d KB (start %llx, end %llx)\n", + (int) ((dev->ddr_end - dev->ddr_start) >> 10), + (unsigned long long) dev->ddr_start, + (unsigned long long) dev->ddr_end); + +out: + mthca_free_mailbox(dev, mailbox); + return err; +} + +int mthca_DISABLE_LAM(struct mthca_dev *dev) +{ + return mthca_cmd(dev, 0, 0, 0, CMD_SYS_DIS, CMD_TIME_CLASS_C); +} + +int mthca_QUERY_DDR(struct mthca_dev *dev) +{ + struct mthca_mailbox *mailbox; + u8 info; + u32 *outbox; + int err = 0; + +#define QUERY_DDR_OUT_SIZE 0x100 +#define QUERY_DDR_START_OFFSET 0x00 +#define QUERY_DDR_END_OFFSET 0x08 +#define QUERY_DDR_INFO_OFFSET 0x13 + +#define QUERY_DDR_INFO_HIDDEN_FLAG (1 << 4) +#define QUERY_DDR_INFO_ECC_MASK 0x3 + + mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + outbox = mailbox->buf; + + err = mthca_cmd_box(dev, 0, mailbox->dma, 0, 0, CMD_QUERY_DDR, + CMD_TIME_CLASS_A); + + if (err) + goto out; + + MTHCA_GET(dev->ddr_start, outbox, QUERY_DDR_START_OFFSET); + MTHCA_GET(dev->ddr_end, outbox, QUERY_DDR_END_OFFSET); + MTHCA_GET(info, outbox, QUERY_DDR_INFO_OFFSET); + + if (!!(info & QUERY_DDR_INFO_HIDDEN_FLAG) != + !!(dev->mthca_flags & MTHCA_FLAG_DDR_HIDDEN)) { + mthca_info(dev, "FW reports that HCA-attached memory " + "is %s hidden; does not match PCI config\n", + (info & QUERY_DDR_INFO_HIDDEN_FLAG) ? + "" : "not"); + } + if (info & QUERY_DDR_INFO_HIDDEN_FLAG) + mthca_dbg(dev, "HCA-attached memory is hidden.\n"); + + mthca_dbg(dev, "HCA memory size %d KB (start %llx, end %llx)\n", + (int) ((dev->ddr_end - dev->ddr_start) >> 10), + (unsigned long long) dev->ddr_start, + (unsigned long long) dev->ddr_end); + +out: + mthca_free_mailbox(dev, mailbox); + return err; +} + +int mthca_QUERY_DEV_LIM(struct mthca_dev *dev, + struct mthca_dev_lim *dev_lim) +{ + struct mthca_mailbox *mailbox; + u32 *outbox; + u8 field; + u16 size; + u16 stat_rate; + int err; + +#define QUERY_DEV_LIM_OUT_SIZE 0x100 +#define QUERY_DEV_LIM_MAX_SRQ_SZ_OFFSET 0x10 +#define QUERY_DEV_LIM_MAX_QP_SZ_OFFSET 0x11 +#define QUERY_DEV_LIM_RSVD_QP_OFFSET 0x12 +#define QUERY_DEV_LIM_MAX_QP_OFFSET 0x13 +#define QUERY_DEV_LIM_RSVD_SRQ_OFFSET 0x14 +#define QUERY_DEV_LIM_MAX_SRQ_OFFSET 0x15 +#define QUERY_DEV_LIM_RSVD_EEC_OFFSET 0x16 +#define QUERY_DEV_LIM_MAX_EEC_OFFSET 0x17 +#define QUERY_DEV_LIM_MAX_CQ_SZ_OFFSET 0x19 +#define QUERY_DEV_LIM_RSVD_CQ_OFFSET 0x1a +#define QUERY_DEV_LIM_MAX_CQ_OFFSET 0x1b +#define QUERY_DEV_LIM_MAX_MPT_OFFSET 0x1d +#define QUERY_DEV_LIM_RSVD_EQ_OFFSET 0x1e +#define QUERY_DEV_LIM_MAX_EQ_OFFSET 0x1f +#define QUERY_DEV_LIM_RSVD_MTT_OFFSET 0x20 +#define QUERY_DEV_LIM_MAX_MRW_SZ_OFFSET 0x21 +#define QUERY_DEV_LIM_RSVD_MRW_OFFSET 0x22 +#define QUERY_DEV_LIM_MAX_MTT_SEG_OFFSET 0x23 +#define QUERY_DEV_LIM_MAX_AV_OFFSET 0x27 +#define QUERY_DEV_LIM_MAX_REQ_QP_OFFSET 0x29 +#define QUERY_DEV_LIM_MAX_RES_QP_OFFSET 0x2b +#define QUERY_DEV_LIM_MAX_RDMA_OFFSET 0x2f +#define QUERY_DEV_LIM_RSZ_SRQ_OFFSET 0x33 +#define QUERY_DEV_LIM_ACK_DELAY_OFFSET 0x35 +#define QUERY_DEV_LIM_MTU_WIDTH_OFFSET 0x36 +#define QUERY_DEV_LIM_VL_PORT_OFFSET 0x37 +#define QUERY_DEV_LIM_MAX_GID_OFFSET 0x3b +#define QUERY_DEV_LIM_RATE_SUPPORT_OFFSET 0x3c +#define QUERY_DEV_LIM_MAX_PKEY_OFFSET 0x3f +#define QUERY_DEV_LIM_FLAGS_OFFSET 0x44 +#define QUERY_DEV_LIM_RSVD_UAR_OFFSET 0x48 +#define QUERY_DEV_LIM_UAR_SZ_OFFSET 0x49 +#define QUERY_DEV_LIM_PAGE_SZ_OFFSET 0x4b +#define QUERY_DEV_LIM_MAX_SG_OFFSET 0x51 +#define QUERY_DEV_LIM_MAX_DESC_SZ_OFFSET 0x52 +#define QUERY_DEV_LIM_MAX_SG_RQ_OFFSET 0x55 +#define QUERY_DEV_LIM_MAX_DESC_SZ_RQ_OFFSET 0x56 +#define QUERY_DEV_LIM_MAX_QP_MCG_OFFSET 0x61 +#define QUERY_DEV_LIM_RSVD_MCG_OFFSET 0x62 +#define QUERY_DEV_LIM_MAX_MCG_OFFSET 0x63 +#define QUERY_DEV_LIM_RSVD_PD_OFFSET 0x64 +#define QUERY_DEV_LIM_MAX_PD_OFFSET 0x65 +#define QUERY_DEV_LIM_RSVD_RDD_OFFSET 0x66 +#define QUERY_DEV_LIM_MAX_RDD_OFFSET 0x67 +#define QUERY_DEV_LIM_EEC_ENTRY_SZ_OFFSET 0x80 +#define QUERY_DEV_LIM_QPC_ENTRY_SZ_OFFSET 0x82 +#define QUERY_DEV_LIM_EEEC_ENTRY_SZ_OFFSET 0x84 +#define QUERY_DEV_LIM_EQPC_ENTRY_SZ_OFFSET 0x86 +#define QUERY_DEV_LIM_EQC_ENTRY_SZ_OFFSET 0x88 +#define QUERY_DEV_LIM_CQC_ENTRY_SZ_OFFSET 0x8a +#define QUERY_DEV_LIM_SRQ_ENTRY_SZ_OFFSET 0x8c +#define QUERY_DEV_LIM_UAR_ENTRY_SZ_OFFSET 0x8e +#define QUERY_DEV_LIM_MTT_ENTRY_SZ_OFFSET 0x90 +#define QUERY_DEV_LIM_MPT_ENTRY_SZ_OFFSET 0x92 +#define QUERY_DEV_LIM_PBL_SZ_OFFSET 0x96 +#define QUERY_DEV_LIM_BMME_FLAGS_OFFSET 0x97 +#define QUERY_DEV_LIM_RSVD_LKEY_OFFSET 0x98 +#define QUERY_DEV_LIM_LAMR_OFFSET 0x9f +#define QUERY_DEV_LIM_MAX_ICM_SZ_OFFSET 0xa0 + + mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + outbox = mailbox->buf; + + err = mthca_cmd_box(dev, 0, mailbox->dma, 0, 0, CMD_QUERY_DEV_LIM, + CMD_TIME_CLASS_A); + + if (err) + goto out; + + MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_QP_OFFSET); + dev_lim->reserved_qps = 1 << (field & 0xf); + MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_QP_OFFSET); + dev_lim->max_qps = 1 << (field & 0x1f); + MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_SRQ_OFFSET); + dev_lim->reserved_srqs = 1 << (field >> 4); + MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_SRQ_OFFSET); + dev_lim->max_srqs = 1 << (field & 0x1f); + MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_EEC_OFFSET); + dev_lim->reserved_eecs = 1 << (field & 0xf); + MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_EEC_OFFSET); + dev_lim->max_eecs = 1 << (field & 0x1f); + MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_CQ_SZ_OFFSET); + dev_lim->max_cq_sz = 1 << field; + MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_CQ_OFFSET); + dev_lim->reserved_cqs = 1 << (field & 0xf); + MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_CQ_OFFSET); + dev_lim->max_cqs = 1 << (field & 0x1f); + MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_MPT_OFFSET); + dev_lim->max_mpts = 1 << (field & 0x3f); + MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_EQ_OFFSET); + dev_lim->reserved_eqs = 1 << (field & 0xf); + MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_EQ_OFFSET); + dev_lim->max_eqs = 1 << (field & 0x7); + MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_MTT_OFFSET); + if (mthca_is_memfree(dev)) + dev_lim->reserved_mtts = ALIGN((1 << (field >> 4)) * sizeof(u64), + dev->limits.mtt_seg_size) / dev->limits.mtt_seg_size; + else + dev_lim->reserved_mtts = 1 << (field >> 4); + MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_MRW_SZ_OFFSET); + dev_lim->max_mrw_sz = 1 << field; + MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_MRW_OFFSET); + dev_lim->reserved_mrws = 1 << (field & 0xf); + MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_MTT_SEG_OFFSET); + dev_lim->max_mtt_seg = 1 << (field & 0x3f); + MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_REQ_QP_OFFSET); + dev_lim->max_requester_per_qp = 1 << (field & 0x3f); + MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_RES_QP_OFFSET); + dev_lim->max_responder_per_qp = 1 << (field & 0x3f); + MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_RDMA_OFFSET); + dev_lim->max_rdma_global = 1 << (field & 0x3f); + MTHCA_GET(field, outbox, QUERY_DEV_LIM_ACK_DELAY_OFFSET); + dev_lim->local_ca_ack_delay = field & 0x1f; + MTHCA_GET(field, outbox, QUERY_DEV_LIM_MTU_WIDTH_OFFSET); + dev_lim->max_mtu = field >> 4; + dev_lim->max_port_width = field & 0xf; + MTHCA_GET(field, outbox, QUERY_DEV_LIM_VL_PORT_OFFSET); + dev_lim->max_vl = field >> 4; + dev_lim->num_ports = field & 0xf; + MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_GID_OFFSET); + dev_lim->max_gids = 1 << (field & 0xf); + MTHCA_GET(stat_rate, outbox, QUERY_DEV_LIM_RATE_SUPPORT_OFFSET); + dev_lim->stat_rate_support = stat_rate; + MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_PKEY_OFFSET); + dev_lim->max_pkeys = 1 << (field & 0xf); + MTHCA_GET(dev_lim->flags, outbox, QUERY_DEV_LIM_FLAGS_OFFSET); + MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_UAR_OFFSET); + dev_lim->reserved_uars = field >> 4; + MTHCA_GET(field, outbox, QUERY_DEV_LIM_UAR_SZ_OFFSET); + dev_lim->uar_size = 1 << ((field & 0x3f) + 20); + MTHCA_GET(field, outbox, QUERY_DEV_LIM_PAGE_SZ_OFFSET); + dev_lim->min_page_sz = 1 << field; + MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_SG_OFFSET); + dev_lim->max_sg = field; + + MTHCA_GET(size, outbox, QUERY_DEV_LIM_MAX_DESC_SZ_OFFSET); + dev_lim->max_desc_sz = size; + + MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_QP_MCG_OFFSET); + dev_lim->max_qp_per_mcg = 1 << field; + MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_MCG_OFFSET); + dev_lim->reserved_mgms = field & 0xf; + MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_MCG_OFFSET); + dev_lim->max_mcgs = 1 << field; + MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_PD_OFFSET); + dev_lim->reserved_pds = field >> 4; + MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_PD_OFFSET); + dev_lim->max_pds = 1 << (field & 0x3f); + MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_RDD_OFFSET); + dev_lim->reserved_rdds = field >> 4; + MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_RDD_OFFSET); + dev_lim->max_rdds = 1 << (field & 0x3f); + + MTHCA_GET(size, outbox, QUERY_DEV_LIM_EEC_ENTRY_SZ_OFFSET); + dev_lim->eec_entry_sz = size; + MTHCA_GET(size, outbox, QUERY_DEV_LIM_QPC_ENTRY_SZ_OFFSET); + dev_lim->qpc_entry_sz = size; + MTHCA_GET(size, outbox, QUERY_DEV_LIM_EEEC_ENTRY_SZ_OFFSET); + dev_lim->eeec_entry_sz = size; + MTHCA_GET(size, outbox, QUERY_DEV_LIM_EQPC_ENTRY_SZ_OFFSET); + dev_lim->eqpc_entry_sz = size; + MTHCA_GET(size, outbox, QUERY_DEV_LIM_EQC_ENTRY_SZ_OFFSET); + dev_lim->eqc_entry_sz = size; + MTHCA_GET(size, outbox, QUERY_DEV_LIM_CQC_ENTRY_SZ_OFFSET); + dev_lim->cqc_entry_sz = size; + MTHCA_GET(size, outbox, QUERY_DEV_LIM_SRQ_ENTRY_SZ_OFFSET); + dev_lim->srq_entry_sz = size; + MTHCA_GET(size, outbox, QUERY_DEV_LIM_UAR_ENTRY_SZ_OFFSET); + dev_lim->uar_scratch_entry_sz = size; + + if (mthca_is_memfree(dev)) { + MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_SRQ_SZ_OFFSET); + dev_lim->max_srq_sz = 1 << field; + MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_QP_SZ_OFFSET); + dev_lim->max_qp_sz = 1 << field; + MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSZ_SRQ_OFFSET); + dev_lim->hca.arbel.resize_srq = field & 1; + MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_SG_RQ_OFFSET); + dev_lim->max_sg = min_t(int, field, dev_lim->max_sg); + MTHCA_GET(size, outbox, QUERY_DEV_LIM_MAX_DESC_SZ_RQ_OFFSET); + dev_lim->max_desc_sz = min_t(int, size, dev_lim->max_desc_sz); + MTHCA_GET(size, outbox, QUERY_DEV_LIM_MPT_ENTRY_SZ_OFFSET); + dev_lim->mpt_entry_sz = size; + MTHCA_GET(field, outbox, QUERY_DEV_LIM_PBL_SZ_OFFSET); + dev_lim->hca.arbel.max_pbl_sz = 1 << (field & 0x3f); + MTHCA_GET(dev_lim->hca.arbel.bmme_flags, outbox, + QUERY_DEV_LIM_BMME_FLAGS_OFFSET); + MTHCA_GET(dev_lim->hca.arbel.reserved_lkey, outbox, + QUERY_DEV_LIM_RSVD_LKEY_OFFSET); + MTHCA_GET(field, outbox, QUERY_DEV_LIM_LAMR_OFFSET); + dev_lim->hca.arbel.lam_required = field & 1; + MTHCA_GET(dev_lim->hca.arbel.max_icm_sz, outbox, + QUERY_DEV_LIM_MAX_ICM_SZ_OFFSET); + + if (dev_lim->hca.arbel.bmme_flags & 1) + mthca_dbg(dev, "Base MM extensions: yes " + "(flags %d, max PBL %d, rsvd L_Key %08x)\n", + dev_lim->hca.arbel.bmme_flags, + dev_lim->hca.arbel.max_pbl_sz, + dev_lim->hca.arbel.reserved_lkey); + else + mthca_dbg(dev, "Base MM extensions: no\n"); + + mthca_dbg(dev, "Max ICM size %lld MB\n", + (unsigned long long) dev_lim->hca.arbel.max_icm_sz >> 20); + } else { + MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_SRQ_SZ_OFFSET); + dev_lim->max_srq_sz = (1 << field) - 1; + MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_QP_SZ_OFFSET); + dev_lim->max_qp_sz = (1 << field) - 1; + MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_AV_OFFSET); + dev_lim->hca.tavor.max_avs = 1 << (field & 0x3f); + dev_lim->mpt_entry_sz = MTHCA_MPT_ENTRY_SIZE; + } + + mthca_dbg(dev, "Max QPs: %d, reserved QPs: %d, entry size: %d\n", + dev_lim->max_qps, dev_lim->reserved_qps, dev_lim->qpc_entry_sz); + mthca_dbg(dev, "Max SRQs: %d, reserved SRQs: %d, entry size: %d\n", + dev_lim->max_srqs, dev_lim->reserved_srqs, dev_lim->srq_entry_sz); + mthca_dbg(dev, "Max CQs: %d, reserved CQs: %d, entry size: %d\n", + dev_lim->max_cqs, dev_lim->reserved_cqs, dev_lim->cqc_entry_sz); + mthca_dbg(dev, "Max EQs: %d, reserved EQs: %d, entry size: %d\n", + dev_lim->max_eqs, dev_lim->reserved_eqs, dev_lim->eqc_entry_sz); + mthca_dbg(dev, "reserved MPTs: %d, reserved MTTs: %d\n", + dev_lim->reserved_mrws, dev_lim->reserved_mtts); + mthca_dbg(dev, "Max PDs: %d, reserved PDs: %d, reserved UARs: %d\n", + dev_lim->max_pds, dev_lim->reserved_pds, dev_lim->reserved_uars); + mthca_dbg(dev, "Max QP/MCG: %d, reserved MGMs: %d\n", + dev_lim->max_pds, dev_lim->reserved_mgms); + mthca_dbg(dev, "Max CQEs: %d, max WQEs: %d, max SRQ WQEs: %d\n", + dev_lim->max_cq_sz, dev_lim->max_qp_sz, dev_lim->max_srq_sz); + + mthca_dbg(dev, "Flags: %08x\n", dev_lim->flags); + +out: + mthca_free_mailbox(dev, mailbox); + return err; +} + +static void get_board_id(void *vsd, char *board_id) +{ + int i; + +#define VSD_OFFSET_SIG1 0x00 +#define VSD_OFFSET_SIG2 0xde +#define VSD_OFFSET_MLX_BOARD_ID 0xd0 +#define VSD_OFFSET_TS_BOARD_ID 0x20 + +#define VSD_SIGNATURE_TOPSPIN 0x5ad + + memset(board_id, 0, MTHCA_BOARD_ID_LEN); + + if (be16_to_cpup(vsd + VSD_OFFSET_SIG1) == VSD_SIGNATURE_TOPSPIN && + be16_to_cpup(vsd + VSD_OFFSET_SIG2) == VSD_SIGNATURE_TOPSPIN) { + strlcpy(board_id, vsd + VSD_OFFSET_TS_BOARD_ID, MTHCA_BOARD_ID_LEN); + } else { + /* + * The board ID is a string but the firmware byte + * swaps each 4-byte word before passing it back to + * us. Therefore we need to swab it before printing. + */ + for (i = 0; i < 4; ++i) + ((u32 *) board_id)[i] = + swab32(*(u32 *) (vsd + VSD_OFFSET_MLX_BOARD_ID + i * 4)); + } +} + +int mthca_QUERY_ADAPTER(struct mthca_dev *dev, + struct mthca_adapter *adapter) +{ + struct mthca_mailbox *mailbox; + u32 *outbox; + int err; + +#define QUERY_ADAPTER_OUT_SIZE 0x100 +#define QUERY_ADAPTER_VENDOR_ID_OFFSET 0x00 +#define QUERY_ADAPTER_DEVICE_ID_OFFSET 0x04 +#define QUERY_ADAPTER_REVISION_ID_OFFSET 0x08 +#define QUERY_ADAPTER_INTA_PIN_OFFSET 0x10 +#define QUERY_ADAPTER_VSD_OFFSET 0x20 + + mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + outbox = mailbox->buf; + + err = mthca_cmd_box(dev, 0, mailbox->dma, 0, 0, CMD_QUERY_ADAPTER, + CMD_TIME_CLASS_A); + + if (err) + goto out; + + if (!mthca_is_memfree(dev)) { + MTHCA_GET(adapter->vendor_id, outbox, + QUERY_ADAPTER_VENDOR_ID_OFFSET); + MTHCA_GET(adapter->device_id, outbox, + QUERY_ADAPTER_DEVICE_ID_OFFSET); + MTHCA_GET(adapter->revision_id, outbox, + QUERY_ADAPTER_REVISION_ID_OFFSET); + } + MTHCA_GET(adapter->inta_pin, outbox, QUERY_ADAPTER_INTA_PIN_OFFSET); + + get_board_id(outbox + QUERY_ADAPTER_VSD_OFFSET / 4, + adapter->board_id); + +out: + mthca_free_mailbox(dev, mailbox); + return err; +} + +int mthca_INIT_HCA(struct mthca_dev *dev, + struct mthca_init_hca_param *param) +{ + struct mthca_mailbox *mailbox; + __be32 *inbox; + int err; + +#define INIT_HCA_IN_SIZE 0x200 +#define INIT_HCA_FLAGS1_OFFSET 0x00c +#define INIT_HCA_FLAGS2_OFFSET 0x014 +#define INIT_HCA_QPC_OFFSET 0x020 +#define INIT_HCA_QPC_BASE_OFFSET (INIT_HCA_QPC_OFFSET + 0x10) +#define INIT_HCA_LOG_QP_OFFSET (INIT_HCA_QPC_OFFSET + 0x17) +#define INIT_HCA_EEC_BASE_OFFSET (INIT_HCA_QPC_OFFSET + 0x20) +#define INIT_HCA_LOG_EEC_OFFSET (INIT_HCA_QPC_OFFSET + 0x27) +#define INIT_HCA_SRQC_BASE_OFFSET (INIT_HCA_QPC_OFFSET + 0x28) +#define INIT_HCA_LOG_SRQ_OFFSET (INIT_HCA_QPC_OFFSET + 0x2f) +#define INIT_HCA_CQC_BASE_OFFSET (INIT_HCA_QPC_OFFSET + 0x30) +#define INIT_HCA_LOG_CQ_OFFSET (INIT_HCA_QPC_OFFSET + 0x37) +#define INIT_HCA_EQPC_BASE_OFFSET (INIT_HCA_QPC_OFFSET + 0x40) +#define INIT_HCA_EEEC_BASE_OFFSET (INIT_HCA_QPC_OFFSET + 0x50) +#define INIT_HCA_EQC_BASE_OFFSET (INIT_HCA_QPC_OFFSET + 0x60) +#define INIT_HCA_LOG_EQ_OFFSET (INIT_HCA_QPC_OFFSET + 0x67) +#define INIT_HCA_RDB_BASE_OFFSET (INIT_HCA_QPC_OFFSET + 0x70) +#define INIT_HCA_UDAV_OFFSET 0x0b0 +#define INIT_HCA_UDAV_LKEY_OFFSET (INIT_HCA_UDAV_OFFSET + 0x0) +#define INIT_HCA_UDAV_PD_OFFSET (INIT_HCA_UDAV_OFFSET + 0x4) +#define INIT_HCA_MCAST_OFFSET 0x0c0 +#define INIT_HCA_MC_BASE_OFFSET (INIT_HCA_MCAST_OFFSET + 0x00) +#define INIT_HCA_LOG_MC_ENTRY_SZ_OFFSET (INIT_HCA_MCAST_OFFSET + 0x12) +#define INIT_HCA_MC_HASH_SZ_OFFSET (INIT_HCA_MCAST_OFFSET + 0x16) +#define INIT_HCA_LOG_MC_TABLE_SZ_OFFSET (INIT_HCA_MCAST_OFFSET + 0x1b) +#define INIT_HCA_TPT_OFFSET 0x0f0 +#define INIT_HCA_MPT_BASE_OFFSET (INIT_HCA_TPT_OFFSET + 0x00) +#define INIT_HCA_MTT_SEG_SZ_OFFSET (INIT_HCA_TPT_OFFSET + 0x09) +#define INIT_HCA_LOG_MPT_SZ_OFFSET (INIT_HCA_TPT_OFFSET + 0x0b) +#define INIT_HCA_MTT_BASE_OFFSET (INIT_HCA_TPT_OFFSET + 0x10) +#define INIT_HCA_UAR_OFFSET 0x120 +#define INIT_HCA_UAR_BASE_OFFSET (INIT_HCA_UAR_OFFSET + 0x00) +#define INIT_HCA_UARC_SZ_OFFSET (INIT_HCA_UAR_OFFSET + 0x09) +#define INIT_HCA_LOG_UAR_SZ_OFFSET (INIT_HCA_UAR_OFFSET + 0x0a) +#define INIT_HCA_UAR_PAGE_SZ_OFFSET (INIT_HCA_UAR_OFFSET + 0x0b) +#define INIT_HCA_UAR_SCATCH_BASE_OFFSET (INIT_HCA_UAR_OFFSET + 0x10) +#define INIT_HCA_UAR_CTX_BASE_OFFSET (INIT_HCA_UAR_OFFSET + 0x18) + + mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + inbox = mailbox->buf; + + memset(inbox, 0, INIT_HCA_IN_SIZE); + + if (dev->mthca_flags & MTHCA_FLAG_SINAI_OPT) + MTHCA_PUT(inbox, 0x1, INIT_HCA_FLAGS1_OFFSET); + +#if defined(__LITTLE_ENDIAN) + *(inbox + INIT_HCA_FLAGS2_OFFSET / 4) &= ~cpu_to_be32(1 << 1); +#elif defined(__BIG_ENDIAN) + *(inbox + INIT_HCA_FLAGS2_OFFSET / 4) |= cpu_to_be32(1 << 1); +#else +#error Host endianness not defined +#endif + /* Check port for UD address vector: */ + *(inbox + INIT_HCA_FLAGS2_OFFSET / 4) |= cpu_to_be32(1); + + /* Enable IPoIB checksumming if we can: */ + if (dev->device_cap_flags & IB_DEVICE_UD_IP_CSUM) + *(inbox + INIT_HCA_FLAGS2_OFFSET / 4) |= cpu_to_be32(7 << 3); + + /* We leave wqe_quota, responder_exu, etc as 0 (default) */ + + /* QPC/EEC/CQC/EQC/RDB attributes */ + + MTHCA_PUT(inbox, param->qpc_base, INIT_HCA_QPC_BASE_OFFSET); + MTHCA_PUT(inbox, param->log_num_qps, INIT_HCA_LOG_QP_OFFSET); + MTHCA_PUT(inbox, param->eec_base, INIT_HCA_EEC_BASE_OFFSET); + MTHCA_PUT(inbox, param->log_num_eecs, INIT_HCA_LOG_EEC_OFFSET); + MTHCA_PUT(inbox, param->srqc_base, INIT_HCA_SRQC_BASE_OFFSET); + MTHCA_PUT(inbox, param->log_num_srqs, INIT_HCA_LOG_SRQ_OFFSET); + MTHCA_PUT(inbox, param->cqc_base, INIT_HCA_CQC_BASE_OFFSET); + MTHCA_PUT(inbox, param->log_num_cqs, INIT_HCA_LOG_CQ_OFFSET); + MTHCA_PUT(inbox, param->eqpc_base, INIT_HCA_EQPC_BASE_OFFSET); + MTHCA_PUT(inbox, param->eeec_base, INIT_HCA_EEEC_BASE_OFFSET); + MTHCA_PUT(inbox, param->eqc_base, INIT_HCA_EQC_BASE_OFFSET); + MTHCA_PUT(inbox, param->log_num_eqs, INIT_HCA_LOG_EQ_OFFSET); + MTHCA_PUT(inbox, param->rdb_base, INIT_HCA_RDB_BASE_OFFSET); + + /* UD AV attributes */ + + /* multicast attributes */ + + MTHCA_PUT(inbox, param->mc_base, INIT_HCA_MC_BASE_OFFSET); + MTHCA_PUT(inbox, param->log_mc_entry_sz, INIT_HCA_LOG_MC_ENTRY_SZ_OFFSET); + MTHCA_PUT(inbox, param->mc_hash_sz, INIT_HCA_MC_HASH_SZ_OFFSET); + MTHCA_PUT(inbox, param->log_mc_table_sz, INIT_HCA_LOG_MC_TABLE_SZ_OFFSET); + + /* TPT attributes */ + + MTHCA_PUT(inbox, param->mpt_base, INIT_HCA_MPT_BASE_OFFSET); + if (!mthca_is_memfree(dev)) + MTHCA_PUT(inbox, param->mtt_seg_sz, INIT_HCA_MTT_SEG_SZ_OFFSET); + MTHCA_PUT(inbox, param->log_mpt_sz, INIT_HCA_LOG_MPT_SZ_OFFSET); + MTHCA_PUT(inbox, param->mtt_base, INIT_HCA_MTT_BASE_OFFSET); + + /* UAR attributes */ + { + u8 uar_page_sz = PAGE_SHIFT - 12; + MTHCA_PUT(inbox, uar_page_sz, INIT_HCA_UAR_PAGE_SZ_OFFSET); + } + + MTHCA_PUT(inbox, param->uar_scratch_base, INIT_HCA_UAR_SCATCH_BASE_OFFSET); + + if (mthca_is_memfree(dev)) { + MTHCA_PUT(inbox, param->log_uarc_sz, INIT_HCA_UARC_SZ_OFFSET); + MTHCA_PUT(inbox, param->log_uar_sz, INIT_HCA_LOG_UAR_SZ_OFFSET); + MTHCA_PUT(inbox, param->uarc_base, INIT_HCA_UAR_CTX_BASE_OFFSET); + } + + err = mthca_cmd(dev, mailbox->dma, 0, 0, + CMD_INIT_HCA, CMD_TIME_CLASS_D); + + mthca_free_mailbox(dev, mailbox); + return err; +} + +int mthca_INIT_IB(struct mthca_dev *dev, + struct mthca_init_ib_param *param, + int port) +{ + struct mthca_mailbox *mailbox; + u32 *inbox; + int err; + u32 flags; + +#define INIT_IB_IN_SIZE 56 +#define INIT_IB_FLAGS_OFFSET 0x00 +#define INIT_IB_FLAG_SIG (1 << 18) +#define INIT_IB_FLAG_NG (1 << 17) +#define INIT_IB_FLAG_G0 (1 << 16) +#define INIT_IB_VL_SHIFT 4 +#define INIT_IB_PORT_WIDTH_SHIFT 8 +#define INIT_IB_MTU_SHIFT 12 +#define INIT_IB_MAX_GID_OFFSET 0x06 +#define INIT_IB_MAX_PKEY_OFFSET 0x0a +#define INIT_IB_GUID0_OFFSET 0x10 +#define INIT_IB_NODE_GUID_OFFSET 0x18 +#define INIT_IB_SI_GUID_OFFSET 0x20 + + mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + inbox = mailbox->buf; + + memset(inbox, 0, INIT_IB_IN_SIZE); + + flags = 0; + flags |= param->set_guid0 ? INIT_IB_FLAG_G0 : 0; + flags |= param->set_node_guid ? INIT_IB_FLAG_NG : 0; + flags |= param->set_si_guid ? INIT_IB_FLAG_SIG : 0; + flags |= param->vl_cap << INIT_IB_VL_SHIFT; + flags |= param->port_width << INIT_IB_PORT_WIDTH_SHIFT; + flags |= param->mtu_cap << INIT_IB_MTU_SHIFT; + MTHCA_PUT(inbox, flags, INIT_IB_FLAGS_OFFSET); + + MTHCA_PUT(inbox, param->gid_cap, INIT_IB_MAX_GID_OFFSET); + MTHCA_PUT(inbox, param->pkey_cap, INIT_IB_MAX_PKEY_OFFSET); + MTHCA_PUT(inbox, param->guid0, INIT_IB_GUID0_OFFSET); + MTHCA_PUT(inbox, param->node_guid, INIT_IB_NODE_GUID_OFFSET); + MTHCA_PUT(inbox, param->si_guid, INIT_IB_SI_GUID_OFFSET); + + err = mthca_cmd(dev, mailbox->dma, port, 0, CMD_INIT_IB, + CMD_TIME_CLASS_A); + + mthca_free_mailbox(dev, mailbox); + return err; +} + +int mthca_CLOSE_IB(struct mthca_dev *dev, int port) +{ + return mthca_cmd(dev, 0, port, 0, CMD_CLOSE_IB, CMD_TIME_CLASS_A); +} + +int mthca_CLOSE_HCA(struct mthca_dev *dev, int panic) +{ + return mthca_cmd(dev, 0, 0, panic, CMD_CLOSE_HCA, CMD_TIME_CLASS_C); +} + +int mthca_SET_IB(struct mthca_dev *dev, struct mthca_set_ib_param *param, + int port) +{ + struct mthca_mailbox *mailbox; + u32 *inbox; + int err; + u32 flags = 0; + +#define SET_IB_IN_SIZE 0x40 +#define SET_IB_FLAGS_OFFSET 0x00 +#define SET_IB_FLAG_SIG (1 << 18) +#define SET_IB_FLAG_RQK (1 << 0) +#define SET_IB_CAP_MASK_OFFSET 0x04 +#define SET_IB_SI_GUID_OFFSET 0x08 + + mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + inbox = mailbox->buf; + + memset(inbox, 0, SET_IB_IN_SIZE); + + flags |= param->set_si_guid ? SET_IB_FLAG_SIG : 0; + flags |= param->reset_qkey_viol ? SET_IB_FLAG_RQK : 0; + MTHCA_PUT(inbox, flags, SET_IB_FLAGS_OFFSET); + + MTHCA_PUT(inbox, param->cap_mask, SET_IB_CAP_MASK_OFFSET); + MTHCA_PUT(inbox, param->si_guid, SET_IB_SI_GUID_OFFSET); + + err = mthca_cmd(dev, mailbox->dma, port, 0, CMD_SET_IB, + CMD_TIME_CLASS_B); + + mthca_free_mailbox(dev, mailbox); + return err; +} + +int mthca_MAP_ICM(struct mthca_dev *dev, struct mthca_icm *icm, u64 virt) +{ + return mthca_map_cmd(dev, CMD_MAP_ICM, icm, virt); +} + +int mthca_MAP_ICM_page(struct mthca_dev *dev, u64 dma_addr, u64 virt) +{ + struct mthca_mailbox *mailbox; + __be64 *inbox; + int err; + + mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + inbox = mailbox->buf; + + inbox[0] = cpu_to_be64(virt); + inbox[1] = cpu_to_be64(dma_addr); + + err = mthca_cmd(dev, mailbox->dma, 1, 0, CMD_MAP_ICM, + CMD_TIME_CLASS_B); + + mthca_free_mailbox(dev, mailbox); + + if (!err) + mthca_dbg(dev, "Mapped page at %llx to %llx for ICM.\n", + (unsigned long long) dma_addr, (unsigned long long) virt); + + return err; +} + +int mthca_UNMAP_ICM(struct mthca_dev *dev, u64 virt, u32 page_count) +{ + mthca_dbg(dev, "Unmapping %d pages at %llx from ICM.\n", + page_count, (unsigned long long) virt); + + return mthca_cmd(dev, virt, page_count, 0, + CMD_UNMAP_ICM, CMD_TIME_CLASS_B); +} + +int mthca_MAP_ICM_AUX(struct mthca_dev *dev, struct mthca_icm *icm) +{ + return mthca_map_cmd(dev, CMD_MAP_ICM_AUX, icm, -1); +} + +int mthca_UNMAP_ICM_AUX(struct mthca_dev *dev) +{ + return mthca_cmd(dev, 0, 0, 0, CMD_UNMAP_ICM_AUX, CMD_TIME_CLASS_B); +} + +int mthca_SET_ICM_SIZE(struct mthca_dev *dev, u64 icm_size, u64 *aux_pages) +{ + int ret = mthca_cmd_imm(dev, icm_size, aux_pages, 0, + 0, CMD_SET_ICM_SIZE, CMD_TIME_CLASS_A); + + if (ret) + return ret; + + /* + * Round up number of system pages needed in case + * MTHCA_ICM_PAGE_SIZE < PAGE_SIZE. + */ + *aux_pages = ALIGN(*aux_pages, PAGE_SIZE / MTHCA_ICM_PAGE_SIZE) >> + (PAGE_SHIFT - MTHCA_ICM_PAGE_SHIFT); + + return 0; +} + +int mthca_SW2HW_MPT(struct mthca_dev *dev, struct mthca_mailbox *mailbox, + int mpt_index) +{ + return mthca_cmd(dev, mailbox->dma, mpt_index, 0, CMD_SW2HW_MPT, + CMD_TIME_CLASS_B); +} + +int mthca_HW2SW_MPT(struct mthca_dev *dev, struct mthca_mailbox *mailbox, + int mpt_index) +{ + return mthca_cmd_box(dev, 0, mailbox ? mailbox->dma : 0, mpt_index, + !mailbox, CMD_HW2SW_MPT, + CMD_TIME_CLASS_B); +} + +int mthca_WRITE_MTT(struct mthca_dev *dev, struct mthca_mailbox *mailbox, + int num_mtt) +{ + return mthca_cmd(dev, mailbox->dma, num_mtt, 0, CMD_WRITE_MTT, + CMD_TIME_CLASS_B); +} + +int mthca_SYNC_TPT(struct mthca_dev *dev) +{ + return mthca_cmd(dev, 0, 0, 0, CMD_SYNC_TPT, CMD_TIME_CLASS_B); +} + +int mthca_MAP_EQ(struct mthca_dev *dev, u64 event_mask, int unmap, + int eq_num) +{ + mthca_dbg(dev, "%s mask %016llx for eqn %d\n", + unmap ? "Clearing" : "Setting", + (unsigned long long) event_mask, eq_num); + return mthca_cmd(dev, event_mask, (unmap << 31) | eq_num, + 0, CMD_MAP_EQ, CMD_TIME_CLASS_B); +} + +int mthca_SW2HW_EQ(struct mthca_dev *dev, struct mthca_mailbox *mailbox, + int eq_num) +{ + return mthca_cmd(dev, mailbox->dma, eq_num, 0, CMD_SW2HW_EQ, + CMD_TIME_CLASS_A); +} + +int mthca_HW2SW_EQ(struct mthca_dev *dev, struct mthca_mailbox *mailbox, + int eq_num) +{ + return mthca_cmd_box(dev, 0, mailbox->dma, eq_num, 0, + CMD_HW2SW_EQ, + CMD_TIME_CLASS_A); +} + +int mthca_SW2HW_CQ(struct mthca_dev *dev, struct mthca_mailbox *mailbox, + int cq_num) +{ + return mthca_cmd(dev, mailbox->dma, cq_num, 0, CMD_SW2HW_CQ, + CMD_TIME_CLASS_A); +} + +int mthca_HW2SW_CQ(struct mthca_dev *dev, struct mthca_mailbox *mailbox, + int cq_num) +{ + return mthca_cmd_box(dev, 0, mailbox->dma, cq_num, 0, + CMD_HW2SW_CQ, + CMD_TIME_CLASS_A); +} + +int mthca_RESIZE_CQ(struct mthca_dev *dev, int cq_num, u32 lkey, u8 log_size) +{ + struct mthca_mailbox *mailbox; + __be32 *inbox; + int err; + +#define RESIZE_CQ_IN_SIZE 0x40 +#define RESIZE_CQ_LOG_SIZE_OFFSET 0x0c +#define RESIZE_CQ_LKEY_OFFSET 0x1c + + mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + inbox = mailbox->buf; + + memset(inbox, 0, RESIZE_CQ_IN_SIZE); + /* + * Leave start address fields zeroed out -- mthca assumes that + * MRs for CQs always start at virtual address 0. + */ + MTHCA_PUT(inbox, log_size, RESIZE_CQ_LOG_SIZE_OFFSET); + MTHCA_PUT(inbox, lkey, RESIZE_CQ_LKEY_OFFSET); + + err = mthca_cmd(dev, mailbox->dma, cq_num, 1, CMD_RESIZE_CQ, + CMD_TIME_CLASS_B); + + mthca_free_mailbox(dev, mailbox); + return err; +} + +int mthca_SW2HW_SRQ(struct mthca_dev *dev, struct mthca_mailbox *mailbox, + int srq_num) +{ + return mthca_cmd(dev, mailbox->dma, srq_num, 0, CMD_SW2HW_SRQ, + CMD_TIME_CLASS_A); +} + +int mthca_HW2SW_SRQ(struct mthca_dev *dev, struct mthca_mailbox *mailbox, + int srq_num) +{ + return mthca_cmd_box(dev, 0, mailbox->dma, srq_num, 0, + CMD_HW2SW_SRQ, + CMD_TIME_CLASS_A); +} + +int mthca_QUERY_SRQ(struct mthca_dev *dev, u32 num, + struct mthca_mailbox *mailbox) +{ + return mthca_cmd_box(dev, 0, mailbox->dma, num, 0, + CMD_QUERY_SRQ, CMD_TIME_CLASS_A); +} + +int mthca_ARM_SRQ(struct mthca_dev *dev, int srq_num, int limit) +{ + return mthca_cmd(dev, limit, srq_num, 0, CMD_ARM_SRQ, + CMD_TIME_CLASS_B); +} + +int mthca_MODIFY_QP(struct mthca_dev *dev, enum ib_qp_state cur, + enum ib_qp_state next, u32 num, int is_ee, + struct mthca_mailbox *mailbox, u32 optmask) +{ + static const u16 op[IB_QPS_ERR + 1][IB_QPS_ERR + 1] = { + [IB_QPS_RESET] = { + [IB_QPS_RESET] = CMD_ERR2RST_QPEE, + [IB_QPS_ERR] = CMD_2ERR_QPEE, + [IB_QPS_INIT] = CMD_RST2INIT_QPEE, + }, + [IB_QPS_INIT] = { + [IB_QPS_RESET] = CMD_ERR2RST_QPEE, + [IB_QPS_ERR] = CMD_2ERR_QPEE, + [IB_QPS_INIT] = CMD_INIT2INIT_QPEE, + [IB_QPS_RTR] = CMD_INIT2RTR_QPEE, + }, + [IB_QPS_RTR] = { + [IB_QPS_RESET] = CMD_ERR2RST_QPEE, + [IB_QPS_ERR] = CMD_2ERR_QPEE, + [IB_QPS_RTS] = CMD_RTR2RTS_QPEE, + }, + [IB_QPS_RTS] = { + [IB_QPS_RESET] = CMD_ERR2RST_QPEE, + [IB_QPS_ERR] = CMD_2ERR_QPEE, + [IB_QPS_RTS] = CMD_RTS2RTS_QPEE, + [IB_QPS_SQD] = CMD_RTS2SQD_QPEE, + }, + [IB_QPS_SQD] = { + [IB_QPS_RESET] = CMD_ERR2RST_QPEE, + [IB_QPS_ERR] = CMD_2ERR_QPEE, + [IB_QPS_RTS] = CMD_SQD2RTS_QPEE, + [IB_QPS_SQD] = CMD_SQD2SQD_QPEE, + }, + [IB_QPS_SQE] = { + [IB_QPS_RESET] = CMD_ERR2RST_QPEE, + [IB_QPS_ERR] = CMD_2ERR_QPEE, + [IB_QPS_RTS] = CMD_SQERR2RTS_QPEE, + }, + [IB_QPS_ERR] = { + [IB_QPS_RESET] = CMD_ERR2RST_QPEE, + [IB_QPS_ERR] = CMD_2ERR_QPEE, + } + }; + + u8 op_mod = 0; + int my_mailbox = 0; + int err; + + if (op[cur][next] == CMD_ERR2RST_QPEE) { + op_mod = 3; /* don't write outbox, any->reset */ + + /* For debugging */ + if (!mailbox) { + mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL); + if (!IS_ERR(mailbox)) { + my_mailbox = 1; + op_mod = 2; /* write outbox, any->reset */ + } else + mailbox = NULL; + } + + err = mthca_cmd_box(dev, 0, mailbox ? mailbox->dma : 0, + (!!is_ee << 24) | num, op_mod, + op[cur][next], CMD_TIME_CLASS_C); + + if (0 && mailbox) { + int i; + mthca_dbg(dev, "Dumping QP context:\n"); + printk(" %08x\n", be32_to_cpup(mailbox->buf)); + for (i = 0; i < 0x100 / 4; ++i) { + if (i % 8 == 0) + printk("[%02x] ", i * 4); + printk(" %08x", + be32_to_cpu(((__be32 *) mailbox->buf)[i + 2])); + if ((i + 1) % 8 == 0) + printk("\n"); + } + } + + if (my_mailbox) + mthca_free_mailbox(dev, mailbox); + } else { + if (0) { + int i; + mthca_dbg(dev, "Dumping QP context:\n"); + printk(" opt param mask: %08x\n", be32_to_cpup(mailbox->buf)); + for (i = 0; i < 0x100 / 4; ++i) { + if (i % 8 == 0) + printk(" [%02x] ", i * 4); + printk(" %08x", + be32_to_cpu(((__be32 *) mailbox->buf)[i + 2])); + if ((i + 1) % 8 == 0) + printk("\n"); + } + } + + err = mthca_cmd(dev, mailbox->dma, optmask | (!!is_ee << 24) | num, + op_mod, op[cur][next], CMD_TIME_CLASS_C); + } + + return err; +} + +int mthca_QUERY_QP(struct mthca_dev *dev, u32 num, int is_ee, + struct mthca_mailbox *mailbox) +{ + return mthca_cmd_box(dev, 0, mailbox->dma, (!!is_ee << 24) | num, 0, + CMD_QUERY_QPEE, CMD_TIME_CLASS_A); +} + +int mthca_CONF_SPECIAL_QP(struct mthca_dev *dev, int type, u32 qpn) +{ + u8 op_mod; + + switch (type) { + case IB_QPT_SMI: + op_mod = 0; + break; + case IB_QPT_GSI: + op_mod = 1; + break; + case IB_QPT_RAW_IPV6: + op_mod = 2; + break; + case IB_QPT_RAW_ETHERTYPE: + op_mod = 3; + break; + default: + return -EINVAL; + } + + return mthca_cmd(dev, 0, qpn, op_mod, CMD_CONF_SPECIAL_QP, + CMD_TIME_CLASS_B); +} + +int mthca_MAD_IFC(struct mthca_dev *dev, int ignore_mkey, int ignore_bkey, + int port, struct ib_wc *in_wc, struct ib_grh *in_grh, + void *in_mad, void *response_mad) +{ + struct mthca_mailbox *inmailbox, *outmailbox; + void *inbox; + int err; + u32 in_modifier = port; + u8 op_modifier = 0; + +#define MAD_IFC_BOX_SIZE 0x400 +#define MAD_IFC_MY_QPN_OFFSET 0x100 +#define MAD_IFC_RQPN_OFFSET 0x108 +#define MAD_IFC_SL_OFFSET 0x10c +#define MAD_IFC_G_PATH_OFFSET 0x10d +#define MAD_IFC_RLID_OFFSET 0x10e +#define MAD_IFC_PKEY_OFFSET 0x112 +#define MAD_IFC_GRH_OFFSET 0x140 + + inmailbox = mthca_alloc_mailbox(dev, GFP_KERNEL); + if (IS_ERR(inmailbox)) + return PTR_ERR(inmailbox); + inbox = inmailbox->buf; + + outmailbox = mthca_alloc_mailbox(dev, GFP_KERNEL); + if (IS_ERR(outmailbox)) { + mthca_free_mailbox(dev, inmailbox); + return PTR_ERR(outmailbox); + } + + memcpy(inbox, in_mad, 256); + + /* + * Key check traps can't be generated unless we have in_wc to + * tell us where to send the trap. + */ + if (ignore_mkey || !in_wc) + op_modifier |= 0x1; + if (ignore_bkey || !in_wc) + op_modifier |= 0x2; + + if (in_wc) { + u8 val; + + memset(inbox + 256, 0, 256); + + MTHCA_PUT(inbox, in_wc->qp->qp_num, MAD_IFC_MY_QPN_OFFSET); + MTHCA_PUT(inbox, in_wc->src_qp, MAD_IFC_RQPN_OFFSET); + + val = in_wc->sl << 4; + MTHCA_PUT(inbox, val, MAD_IFC_SL_OFFSET); + + val = in_wc->dlid_path_bits | + (in_wc->wc_flags & IB_WC_GRH ? 0x80 : 0); + MTHCA_PUT(inbox, val, MAD_IFC_G_PATH_OFFSET); + + MTHCA_PUT(inbox, in_wc->slid, MAD_IFC_RLID_OFFSET); + MTHCA_PUT(inbox, in_wc->pkey_index, MAD_IFC_PKEY_OFFSET); + + if (in_grh) + memcpy(inbox + MAD_IFC_GRH_OFFSET, in_grh, 40); + + op_modifier |= 0x4; + + in_modifier |= in_wc->slid << 16; + } + + err = mthca_cmd_box(dev, inmailbox->dma, outmailbox->dma, + in_modifier, op_modifier, + CMD_MAD_IFC, CMD_TIME_CLASS_C); + + if (!err) + memcpy(response_mad, outmailbox->buf, 256); + + mthca_free_mailbox(dev, inmailbox); + mthca_free_mailbox(dev, outmailbox); + return err; +} + +int mthca_READ_MGM(struct mthca_dev *dev, int index, + struct mthca_mailbox *mailbox) +{ + return mthca_cmd_box(dev, 0, mailbox->dma, index, 0, + CMD_READ_MGM, CMD_TIME_CLASS_A); +} + +int mthca_WRITE_MGM(struct mthca_dev *dev, int index, + struct mthca_mailbox *mailbox) +{ + return mthca_cmd(dev, mailbox->dma, index, 0, CMD_WRITE_MGM, + CMD_TIME_CLASS_A); +} + +int mthca_MGID_HASH(struct mthca_dev *dev, struct mthca_mailbox *mailbox, + u16 *hash) +{ + u64 imm; + int err; + + err = mthca_cmd_imm(dev, mailbox->dma, &imm, 0, 0, CMD_MGID_HASH, + CMD_TIME_CLASS_A); + + *hash = imm; + return err; +} + +int mthca_NOP(struct mthca_dev *dev) +{ + return mthca_cmd(dev, 0, 0x1f, 0, CMD_NOP, msecs_to_jiffies(100)); +} diff --git a/drivers/infiniband/hw/mthca/mthca_cmd.h b/drivers/infiniband/hw/mthca/mthca_cmd.h new file mode 100644 index 0000000..f952244 --- /dev/null +++ b/drivers/infiniband/hw/mthca/mthca_cmd.h @@ -0,0 +1,325 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies. All rights reserved. + * Copyright (c) 2006 Cisco Systems. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef MTHCA_CMD_H +#define MTHCA_CMD_H + +#include + +#define MTHCA_MAILBOX_SIZE 4096 + +enum { + /* command completed successfully: */ + MTHCA_CMD_STAT_OK = 0x00, + /* Internal error (such as a bus error) occurred while processing command: */ + MTHCA_CMD_STAT_INTERNAL_ERR = 0x01, + /* Operation/command not supported or opcode modifier not supported: */ + MTHCA_CMD_STAT_BAD_OP = 0x02, + /* Parameter not supported or parameter out of range: */ + MTHCA_CMD_STAT_BAD_PARAM = 0x03, + /* System not enabled or bad system state: */ + MTHCA_CMD_STAT_BAD_SYS_STATE = 0x04, + /* Attempt to access reserved or unallocaterd resource: */ + MTHCA_CMD_STAT_BAD_RESOURCE = 0x05, + /* Requested resource is currently executing a command, or is otherwise busy: */ + MTHCA_CMD_STAT_RESOURCE_BUSY = 0x06, + /* memory error: */ + MTHCA_CMD_STAT_DDR_MEM_ERR = 0x07, + /* Required capability exceeds device limits: */ + MTHCA_CMD_STAT_EXCEED_LIM = 0x08, + /* Resource is not in the appropriate state or ownership: */ + MTHCA_CMD_STAT_BAD_RES_STATE = 0x09, + /* Index out of range: */ + MTHCA_CMD_STAT_BAD_INDEX = 0x0a, + /* FW image corrupted: */ + MTHCA_CMD_STAT_BAD_NVMEM = 0x0b, + /* Attempt to modify a QP/EE which is not in the presumed state: */ + MTHCA_CMD_STAT_BAD_QPEE_STATE = 0x10, + /* Bad segment parameters (Address/Size): */ + MTHCA_CMD_STAT_BAD_SEG_PARAM = 0x20, + /* Memory Region has Memory Windows bound to: */ + MTHCA_CMD_STAT_REG_BOUND = 0x21, + /* HCA local attached memory not present: */ + MTHCA_CMD_STAT_LAM_NOT_PRE = 0x22, + /* Bad management packet (silently discarded): */ + MTHCA_CMD_STAT_BAD_PKT = 0x30, + /* More outstanding CQEs in CQ than new CQ size: */ + MTHCA_CMD_STAT_BAD_SIZE = 0x40 +}; + +enum { + MTHCA_TRANS_INVALID = 0, + MTHCA_TRANS_RST2INIT, + MTHCA_TRANS_INIT2INIT, + MTHCA_TRANS_INIT2RTR, + MTHCA_TRANS_RTR2RTS, + MTHCA_TRANS_RTS2RTS, + MTHCA_TRANS_SQERR2RTS, + MTHCA_TRANS_ANY2ERR, + MTHCA_TRANS_RTS2SQD, + MTHCA_TRANS_SQD2SQD, + MTHCA_TRANS_SQD2RTS, + MTHCA_TRANS_ANY2RST, +}; + +enum { + DEV_LIM_FLAG_RC = 1 << 0, + DEV_LIM_FLAG_UC = 1 << 1, + DEV_LIM_FLAG_UD = 1 << 2, + DEV_LIM_FLAG_RD = 1 << 3, + DEV_LIM_FLAG_RAW_IPV6 = 1 << 4, + DEV_LIM_FLAG_RAW_ETHER = 1 << 5, + DEV_LIM_FLAG_SRQ = 1 << 6, + DEV_LIM_FLAG_IPOIB_CSUM = 1 << 7, + DEV_LIM_FLAG_BAD_PKEY_CNTR = 1 << 8, + DEV_LIM_FLAG_BAD_QKEY_CNTR = 1 << 9, + DEV_LIM_FLAG_MW = 1 << 16, + DEV_LIM_FLAG_AUTO_PATH_MIG = 1 << 17, + DEV_LIM_FLAG_ATOMIC = 1 << 18, + DEV_LIM_FLAG_RAW_MULTI = 1 << 19, + DEV_LIM_FLAG_UD_AV_PORT_ENFORCE = 1 << 20, + DEV_LIM_FLAG_UD_MULTI = 1 << 21, +}; + +struct mthca_mailbox { + dma_addr_t dma; + void *buf; +}; + +struct mthca_dev_lim { + int max_srq_sz; + int max_qp_sz; + int reserved_qps; + int max_qps; + int reserved_srqs; + int max_srqs; + int reserved_eecs; + int max_eecs; + int max_cq_sz; + int reserved_cqs; + int max_cqs; + int max_mpts; + int reserved_eqs; + int max_eqs; + int reserved_mtts; + int max_mrw_sz; + int reserved_mrws; + int max_mtt_seg; + int max_requester_per_qp; + int max_responder_per_qp; + int max_rdma_global; + int local_ca_ack_delay; + int max_mtu; + int max_port_width; + int max_vl; + int num_ports; + int max_gids; + u16 stat_rate_support; + int max_pkeys; + u32 flags; + int reserved_uars; + int uar_size; + int min_page_sz; + int max_sg; + int max_desc_sz; + int max_qp_per_mcg; + int reserved_mgms; + int max_mcgs; + int reserved_pds; + int max_pds; + int reserved_rdds; + int max_rdds; + int eec_entry_sz; + int qpc_entry_sz; + int eeec_entry_sz; + int eqpc_entry_sz; + int eqc_entry_sz; + int cqc_entry_sz; + int srq_entry_sz; + int uar_scratch_entry_sz; + int mpt_entry_sz; + union { + struct { + int max_avs; + } tavor; + struct { + int resize_srq; + int max_pbl_sz; + u8 bmme_flags; + u32 reserved_lkey; + int lam_required; + u64 max_icm_sz; + } arbel; + } hca; +}; + +struct mthca_adapter { + u32 vendor_id; + u32 device_id; + u32 revision_id; + char board_id[MTHCA_BOARD_ID_LEN]; + u8 inta_pin; +}; + +struct mthca_init_hca_param { + u64 qpc_base; + u64 eec_base; + u64 srqc_base; + u64 cqc_base; + u64 eqpc_base; + u64 eeec_base; + u64 eqc_base; + u64 rdb_base; + u64 mc_base; + u64 mpt_base; + u64 mtt_base; + u64 uar_scratch_base; + u64 uarc_base; + u16 log_mc_entry_sz; + u16 mc_hash_sz; + u8 log_num_qps; + u8 log_num_eecs; + u8 log_num_srqs; + u8 log_num_cqs; + u8 log_num_eqs; + u8 log_mc_table_sz; + u8 mtt_seg_sz; + u8 log_mpt_sz; + u8 log_uar_sz; + u8 log_uarc_sz; +}; + +struct mthca_init_ib_param { + int port_width; + int vl_cap; + int mtu_cap; + u16 gid_cap; + u16 pkey_cap; + int set_guid0; + u64 guid0; + int set_node_guid; + u64 node_guid; + int set_si_guid; + u64 si_guid; +}; + +struct mthca_set_ib_param { + int set_si_guid; + int reset_qkey_viol; + u64 si_guid; + u32 cap_mask; +}; + +int mthca_cmd_init(struct mthca_dev *dev); +void mthca_cmd_cleanup(struct mthca_dev *dev); +int mthca_cmd_use_events(struct mthca_dev *dev); +void mthca_cmd_use_polling(struct mthca_dev *dev); +void mthca_cmd_event(struct mthca_dev *dev, u16 token, + u8 status, u64 out_param); + +struct mthca_mailbox *mthca_alloc_mailbox(struct mthca_dev *dev, + gfp_t gfp_mask); +void mthca_free_mailbox(struct mthca_dev *dev, struct mthca_mailbox *mailbox); + +int mthca_SYS_EN(struct mthca_dev *dev); +int mthca_SYS_DIS(struct mthca_dev *dev); +int mthca_MAP_FA(struct mthca_dev *dev, struct mthca_icm *icm); +int mthca_UNMAP_FA(struct mthca_dev *dev); +int mthca_RUN_FW(struct mthca_dev *dev); +int mthca_QUERY_FW(struct mthca_dev *dev); +int mthca_ENABLE_LAM(struct mthca_dev *dev); +int mthca_DISABLE_LAM(struct mthca_dev *dev); +int mthca_QUERY_DDR(struct mthca_dev *dev); +int mthca_QUERY_DEV_LIM(struct mthca_dev *dev, + struct mthca_dev_lim *dev_lim); +int mthca_QUERY_ADAPTER(struct mthca_dev *dev, + struct mthca_adapter *adapter); +int mthca_INIT_HCA(struct mthca_dev *dev, + struct mthca_init_hca_param *param); +int mthca_INIT_IB(struct mthca_dev *dev, + struct mthca_init_ib_param *param, + int port); +int mthca_CLOSE_IB(struct mthca_dev *dev, int port); +int mthca_CLOSE_HCA(struct mthca_dev *dev, int panic); +int mthca_SET_IB(struct mthca_dev *dev, struct mthca_set_ib_param *param, + int port); +int mthca_MAP_ICM(struct mthca_dev *dev, struct mthca_icm *icm, u64 virt); +int mthca_MAP_ICM_page(struct mthca_dev *dev, u64 dma_addr, u64 virt); +int mthca_UNMAP_ICM(struct mthca_dev *dev, u64 virt, u32 page_count); +int mthca_MAP_ICM_AUX(struct mthca_dev *dev, struct mthca_icm *icm); +int mthca_UNMAP_ICM_AUX(struct mthca_dev *dev); +int mthca_SET_ICM_SIZE(struct mthca_dev *dev, u64 icm_size, u64 *aux_pages); +int mthca_SW2HW_MPT(struct mthca_dev *dev, struct mthca_mailbox *mailbox, + int mpt_index); +int mthca_HW2SW_MPT(struct mthca_dev *dev, struct mthca_mailbox *mailbox, + int mpt_index); +int mthca_WRITE_MTT(struct mthca_dev *dev, struct mthca_mailbox *mailbox, + int num_mtt); +int mthca_SYNC_TPT(struct mthca_dev *dev); +int mthca_MAP_EQ(struct mthca_dev *dev, u64 event_mask, int unmap, + int eq_num); +int mthca_SW2HW_EQ(struct mthca_dev *dev, struct mthca_mailbox *mailbox, + int eq_num); +int mthca_HW2SW_EQ(struct mthca_dev *dev, struct mthca_mailbox *mailbox, + int eq_num); +int mthca_SW2HW_CQ(struct mthca_dev *dev, struct mthca_mailbox *mailbox, + int cq_num); +int mthca_HW2SW_CQ(struct mthca_dev *dev, struct mthca_mailbox *mailbox, + int cq_num); +int mthca_RESIZE_CQ(struct mthca_dev *dev, int cq_num, u32 lkey, u8 log_size); +int mthca_SW2HW_SRQ(struct mthca_dev *dev, struct mthca_mailbox *mailbox, + int srq_num); +int mthca_HW2SW_SRQ(struct mthca_dev *dev, struct mthca_mailbox *mailbox, + int srq_num); +int mthca_QUERY_SRQ(struct mthca_dev *dev, u32 num, + struct mthca_mailbox *mailbox); +int mthca_ARM_SRQ(struct mthca_dev *dev, int srq_num, int limit); +int mthca_MODIFY_QP(struct mthca_dev *dev, enum ib_qp_state cur, + enum ib_qp_state next, u32 num, int is_ee, + struct mthca_mailbox *mailbox, u32 optmask); +int mthca_QUERY_QP(struct mthca_dev *dev, u32 num, int is_ee, + struct mthca_mailbox *mailbox); +int mthca_CONF_SPECIAL_QP(struct mthca_dev *dev, int type, u32 qpn); +int mthca_MAD_IFC(struct mthca_dev *dev, int ignore_mkey, int ignore_bkey, + int port, struct ib_wc *in_wc, struct ib_grh *in_grh, + void *in_mad, void *response_mad); +int mthca_READ_MGM(struct mthca_dev *dev, int index, + struct mthca_mailbox *mailbox); +int mthca_WRITE_MGM(struct mthca_dev *dev, int index, + struct mthca_mailbox *mailbox); +int mthca_MGID_HASH(struct mthca_dev *dev, struct mthca_mailbox *mailbox, + u16 *hash); +int mthca_NOP(struct mthca_dev *dev); + +#endif /* MTHCA_CMD_H */ diff --git a/drivers/infiniband/hw/mthca/mthca_config_reg.h b/drivers/infiniband/hw/mthca/mthca_config_reg.h new file mode 100644 index 0000000..155bc66 --- /dev/null +++ b/drivers/infiniband/hw/mthca/mthca_config_reg.h @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2004 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef MTHCA_CONFIG_REG_H +#define MTHCA_CONFIG_REG_H + +#define MTHCA_HCR_BASE 0x80680 +#define MTHCA_HCR_SIZE 0x0001c +#define MTHCA_ECR_BASE 0x80700 +#define MTHCA_ECR_SIZE 0x00008 +#define MTHCA_ECR_CLR_BASE 0x80708 +#define MTHCA_ECR_CLR_SIZE 0x00008 +#define MTHCA_MAP_ECR_SIZE (MTHCA_ECR_SIZE + MTHCA_ECR_CLR_SIZE) +#define MTHCA_CLR_INT_BASE 0xf00d8 +#define MTHCA_CLR_INT_SIZE 0x00008 +#define MTHCA_EQ_SET_CI_SIZE (8 * 32) + +#endif /* MTHCA_CONFIG_REG_H */ diff --git a/drivers/infiniband/hw/mthca/mthca_cq.c b/drivers/infiniband/hw/mthca/mthca_cq.c new file mode 100644 index 0000000..40ba833 --- /dev/null +++ b/drivers/infiniband/hw/mthca/mthca_cq.c @@ -0,0 +1,984 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2005, 2006 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies. All rights reserved. + * Copyright (c) 2004 Voltaire, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include + +#include + +#include + +#include "mthca_dev.h" +#include "mthca_cmd.h" +#include "mthca_memfree.h" + +enum { + MTHCA_MAX_DIRECT_CQ_SIZE = 4 * PAGE_SIZE +}; + +enum { + MTHCA_CQ_ENTRY_SIZE = 0x20 +}; + +enum { + MTHCA_ATOMIC_BYTE_LEN = 8 +}; + +/* + * Must be packed because start is 64 bits but only aligned to 32 bits. + */ +struct mthca_cq_context { + __be32 flags; + __be64 start; + __be32 logsize_usrpage; + __be32 error_eqn; /* Tavor only */ + __be32 comp_eqn; + __be32 pd; + __be32 lkey; + __be32 last_notified_index; + __be32 solicit_producer_index; + __be32 consumer_index; + __be32 producer_index; + __be32 cqn; + __be32 ci_db; /* Arbel only */ + __be32 state_db; /* Arbel only */ + u32 reserved; +} __attribute__((packed)); + +#define MTHCA_CQ_STATUS_OK ( 0 << 28) +#define MTHCA_CQ_STATUS_OVERFLOW ( 9 << 28) +#define MTHCA_CQ_STATUS_WRITE_FAIL (10 << 28) +#define MTHCA_CQ_FLAG_TR ( 1 << 18) +#define MTHCA_CQ_FLAG_OI ( 1 << 17) +#define MTHCA_CQ_STATE_DISARMED ( 0 << 8) +#define MTHCA_CQ_STATE_ARMED ( 1 << 8) +#define MTHCA_CQ_STATE_ARMED_SOL ( 4 << 8) +#define MTHCA_EQ_STATE_FIRED (10 << 8) + +enum { + MTHCA_ERROR_CQE_OPCODE_MASK = 0xfe +}; + +enum { + SYNDROME_LOCAL_LENGTH_ERR = 0x01, + SYNDROME_LOCAL_QP_OP_ERR = 0x02, + SYNDROME_LOCAL_EEC_OP_ERR = 0x03, + SYNDROME_LOCAL_PROT_ERR = 0x04, + SYNDROME_WR_FLUSH_ERR = 0x05, + SYNDROME_MW_BIND_ERR = 0x06, + SYNDROME_BAD_RESP_ERR = 0x10, + SYNDROME_LOCAL_ACCESS_ERR = 0x11, + SYNDROME_REMOTE_INVAL_REQ_ERR = 0x12, + SYNDROME_REMOTE_ACCESS_ERR = 0x13, + SYNDROME_REMOTE_OP_ERR = 0x14, + SYNDROME_RETRY_EXC_ERR = 0x15, + SYNDROME_RNR_RETRY_EXC_ERR = 0x16, + SYNDROME_LOCAL_RDD_VIOL_ERR = 0x20, + SYNDROME_REMOTE_INVAL_RD_REQ_ERR = 0x21, + SYNDROME_REMOTE_ABORTED_ERR = 0x22, + SYNDROME_INVAL_EECN_ERR = 0x23, + SYNDROME_INVAL_EEC_STATE_ERR = 0x24 +}; + +struct mthca_cqe { + __be32 my_qpn; + __be32 my_ee; + __be32 rqpn; + u8 sl_ipok; + u8 g_mlpath; + __be16 rlid; + __be32 imm_etype_pkey_eec; + __be32 byte_cnt; + __be32 wqe; + u8 opcode; + u8 is_send; + u8 reserved; + u8 owner; +}; + +struct mthca_err_cqe { + __be32 my_qpn; + u32 reserved1[3]; + u8 syndrome; + u8 vendor_err; + __be16 db_cnt; + u32 reserved2; + __be32 wqe; + u8 opcode; + u8 reserved3[2]; + u8 owner; +}; + +#define MTHCA_CQ_ENTRY_OWNER_SW (0 << 7) +#define MTHCA_CQ_ENTRY_OWNER_HW (1 << 7) + +#define MTHCA_TAVOR_CQ_DB_INC_CI (1 << 24) +#define MTHCA_TAVOR_CQ_DB_REQ_NOT (2 << 24) +#define MTHCA_TAVOR_CQ_DB_REQ_NOT_SOL (3 << 24) +#define MTHCA_TAVOR_CQ_DB_SET_CI (4 << 24) +#define MTHCA_TAVOR_CQ_DB_REQ_NOT_MULT (5 << 24) + +#define MTHCA_ARBEL_CQ_DB_REQ_NOT_SOL (1 << 24) +#define MTHCA_ARBEL_CQ_DB_REQ_NOT (2 << 24) +#define MTHCA_ARBEL_CQ_DB_REQ_NOT_MULT (3 << 24) + +static inline struct mthca_cqe *get_cqe_from_buf(struct mthca_cq_buf *buf, + int entry) +{ + if (buf->is_direct) + return buf->queue.direct.buf + (entry * MTHCA_CQ_ENTRY_SIZE); + else + return buf->queue.page_list[entry * MTHCA_CQ_ENTRY_SIZE / PAGE_SIZE].buf + + (entry * MTHCA_CQ_ENTRY_SIZE) % PAGE_SIZE; +} + +static inline struct mthca_cqe *get_cqe(struct mthca_cq *cq, int entry) +{ + return get_cqe_from_buf(&cq->buf, entry); +} + +static inline struct mthca_cqe *cqe_sw(struct mthca_cqe *cqe) +{ + return MTHCA_CQ_ENTRY_OWNER_HW & cqe->owner ? NULL : cqe; +} + +static inline struct mthca_cqe *next_cqe_sw(struct mthca_cq *cq) +{ + return cqe_sw(get_cqe(cq, cq->cons_index & cq->ibcq.cqe)); +} + +static inline void set_cqe_hw(struct mthca_cqe *cqe) +{ + cqe->owner = MTHCA_CQ_ENTRY_OWNER_HW; +} + +static void dump_cqe(struct mthca_dev *dev, void *cqe_ptr) +{ + __be32 *cqe = cqe_ptr; + + (void) cqe; /* avoid warning if mthca_dbg compiled away... */ + mthca_dbg(dev, "CQE contents %08x %08x %08x %08x %08x %08x %08x %08x\n", + be32_to_cpu(cqe[0]), be32_to_cpu(cqe[1]), be32_to_cpu(cqe[2]), + be32_to_cpu(cqe[3]), be32_to_cpu(cqe[4]), be32_to_cpu(cqe[5]), + be32_to_cpu(cqe[6]), be32_to_cpu(cqe[7])); +} + +/* + * incr is ignored in native Arbel (mem-free) mode, so cq->cons_index + * should be correct before calling update_cons_index(). + */ +static inline void update_cons_index(struct mthca_dev *dev, struct mthca_cq *cq, + int incr) +{ + if (mthca_is_memfree(dev)) { + *cq->set_ci_db = cpu_to_be32(cq->cons_index); + wmb(); + } else { + mthca_write64(MTHCA_TAVOR_CQ_DB_INC_CI | cq->cqn, incr - 1, + dev->kar + MTHCA_CQ_DOORBELL, + MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock)); + /* + * Make sure doorbells don't leak out of CQ spinlock + * and reach the HCA out of order: + */ + mmiowb(); + } +} + +void mthca_cq_completion(struct mthca_dev *dev, u32 cqn) +{ + struct mthca_cq *cq; + + cq = mthca_array_get(&dev->cq_table.cq, cqn & (dev->limits.num_cqs - 1)); + + if (!cq) { + mthca_warn(dev, "Completion event for bogus CQ %08x\n", cqn); + return; + } + + ++cq->arm_sn; + + cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context); +} + +void mthca_cq_event(struct mthca_dev *dev, u32 cqn, + enum ib_event_type event_type) +{ + struct mthca_cq *cq; + struct ib_event event; + + spin_lock(&dev->cq_table.lock); + + cq = mthca_array_get(&dev->cq_table.cq, cqn & (dev->limits.num_cqs - 1)); + if (cq) + ++cq->refcount; + + spin_unlock(&dev->cq_table.lock); + + if (!cq) { + mthca_warn(dev, "Async event for bogus CQ %08x\n", cqn); + return; + } + + event.device = &dev->ib_dev; + event.event = event_type; + event.element.cq = &cq->ibcq; + if (cq->ibcq.event_handler) + cq->ibcq.event_handler(&event, cq->ibcq.cq_context); + + spin_lock(&dev->cq_table.lock); + if (!--cq->refcount) + wake_up(&cq->wait); + spin_unlock(&dev->cq_table.lock); +} + +static inline int is_recv_cqe(struct mthca_cqe *cqe) +{ + if ((cqe->opcode & MTHCA_ERROR_CQE_OPCODE_MASK) == + MTHCA_ERROR_CQE_OPCODE_MASK) + return !(cqe->opcode & 0x01); + else + return !(cqe->is_send & 0x80); +} + +void mthca_cq_clean(struct mthca_dev *dev, struct mthca_cq *cq, u32 qpn, + struct mthca_srq *srq) +{ + struct mthca_cqe *cqe; + u32 prod_index; + int i, nfreed = 0; + + spin_lock_irq(&cq->lock); + + /* + * First we need to find the current producer index, so we + * know where to start cleaning from. It doesn't matter if HW + * adds new entries after this loop -- the QP we're worried + * about is already in RESET, so the new entries won't come + * from our QP and therefore don't need to be checked. + */ + for (prod_index = cq->cons_index; + cqe_sw(get_cqe(cq, prod_index & cq->ibcq.cqe)); + ++prod_index) + if (prod_index == cq->cons_index + cq->ibcq.cqe) + break; + + if (0) + mthca_dbg(dev, "Cleaning QPN %06x from CQN %06x; ci %d, pi %d\n", + qpn, cq->cqn, cq->cons_index, prod_index); + + /* + * Now sweep backwards through the CQ, removing CQ entries + * that match our QP by copying older entries on top of them. + */ + while ((int) --prod_index - (int) cq->cons_index >= 0) { + cqe = get_cqe(cq, prod_index & cq->ibcq.cqe); + if (cqe->my_qpn == cpu_to_be32(qpn)) { + if (srq && is_recv_cqe(cqe)) + mthca_free_srq_wqe(srq, be32_to_cpu(cqe->wqe)); + ++nfreed; + } else if (nfreed) + memcpy(get_cqe(cq, (prod_index + nfreed) & cq->ibcq.cqe), + cqe, MTHCA_CQ_ENTRY_SIZE); + } + + if (nfreed) { + for (i = 0; i < nfreed; ++i) + set_cqe_hw(get_cqe(cq, (cq->cons_index + i) & cq->ibcq.cqe)); + wmb(); + cq->cons_index += nfreed; + update_cons_index(dev, cq, nfreed); + } + + spin_unlock_irq(&cq->lock); +} + +void mthca_cq_resize_copy_cqes(struct mthca_cq *cq) +{ + int i; + + /* + * In Tavor mode, the hardware keeps the consumer and producer + * indices mod the CQ size. Since we might be making the CQ + * bigger, we need to deal with the case where the producer + * index wrapped around before the CQ was resized. + */ + if (!mthca_is_memfree(to_mdev(cq->ibcq.device)) && + cq->ibcq.cqe < cq->resize_buf->cqe) { + cq->cons_index &= cq->ibcq.cqe; + if (cqe_sw(get_cqe(cq, cq->ibcq.cqe))) + cq->cons_index -= cq->ibcq.cqe + 1; + } + + for (i = cq->cons_index; cqe_sw(get_cqe(cq, i & cq->ibcq.cqe)); ++i) + memcpy(get_cqe_from_buf(&cq->resize_buf->buf, + i & cq->resize_buf->cqe), + get_cqe(cq, i & cq->ibcq.cqe), MTHCA_CQ_ENTRY_SIZE); +} + +int mthca_alloc_cq_buf(struct mthca_dev *dev, struct mthca_cq_buf *buf, int nent) +{ + int ret; + int i; + + ret = mthca_buf_alloc(dev, nent * MTHCA_CQ_ENTRY_SIZE, + MTHCA_MAX_DIRECT_CQ_SIZE, + &buf->queue, &buf->is_direct, + &dev->driver_pd, 1, &buf->mr); + if (ret) + return ret; + + for (i = 0; i < nent; ++i) + set_cqe_hw(get_cqe_from_buf(buf, i)); + + return 0; +} + +void mthca_free_cq_buf(struct mthca_dev *dev, struct mthca_cq_buf *buf, int cqe) +{ + mthca_buf_free(dev, (cqe + 1) * MTHCA_CQ_ENTRY_SIZE, &buf->queue, + buf->is_direct, &buf->mr); +} + +static void handle_error_cqe(struct mthca_dev *dev, struct mthca_cq *cq, + struct mthca_qp *qp, int wqe_index, int is_send, + struct mthca_err_cqe *cqe, + struct ib_wc *entry, int *free_cqe) +{ + int dbd; + __be32 new_wqe; + + if (cqe->syndrome == SYNDROME_LOCAL_QP_OP_ERR) { + mthca_dbg(dev, "local QP operation err " + "(QPN %06x, WQE @ %08x, CQN %06x, index %d)\n", + be32_to_cpu(cqe->my_qpn), be32_to_cpu(cqe->wqe), + cq->cqn, cq->cons_index); + dump_cqe(dev, cqe); + } + + /* + * For completions in error, only work request ID, status, vendor error + * (and freed resource count for RD) have to be set. + */ + switch (cqe->syndrome) { + case SYNDROME_LOCAL_LENGTH_ERR: + entry->status = IB_WC_LOC_LEN_ERR; + break; + case SYNDROME_LOCAL_QP_OP_ERR: + entry->status = IB_WC_LOC_QP_OP_ERR; + break; + case SYNDROME_LOCAL_EEC_OP_ERR: + entry->status = IB_WC_LOC_EEC_OP_ERR; + break; + case SYNDROME_LOCAL_PROT_ERR: + entry->status = IB_WC_LOC_PROT_ERR; + break; + case SYNDROME_WR_FLUSH_ERR: + entry->status = IB_WC_WR_FLUSH_ERR; + break; + case SYNDROME_MW_BIND_ERR: + entry->status = IB_WC_MW_BIND_ERR; + break; + case SYNDROME_BAD_RESP_ERR: + entry->status = IB_WC_BAD_RESP_ERR; + break; + case SYNDROME_LOCAL_ACCESS_ERR: + entry->status = IB_WC_LOC_ACCESS_ERR; + break; + case SYNDROME_REMOTE_INVAL_REQ_ERR: + entry->status = IB_WC_REM_INV_REQ_ERR; + break; + case SYNDROME_REMOTE_ACCESS_ERR: + entry->status = IB_WC_REM_ACCESS_ERR; + break; + case SYNDROME_REMOTE_OP_ERR: + entry->status = IB_WC_REM_OP_ERR; + break; + case SYNDROME_RETRY_EXC_ERR: + entry->status = IB_WC_RETRY_EXC_ERR; + break; + case SYNDROME_RNR_RETRY_EXC_ERR: + entry->status = IB_WC_RNR_RETRY_EXC_ERR; + break; + case SYNDROME_LOCAL_RDD_VIOL_ERR: + entry->status = IB_WC_LOC_RDD_VIOL_ERR; + break; + case SYNDROME_REMOTE_INVAL_RD_REQ_ERR: + entry->status = IB_WC_REM_INV_RD_REQ_ERR; + break; + case SYNDROME_REMOTE_ABORTED_ERR: + entry->status = IB_WC_REM_ABORT_ERR; + break; + case SYNDROME_INVAL_EECN_ERR: + entry->status = IB_WC_INV_EECN_ERR; + break; + case SYNDROME_INVAL_EEC_STATE_ERR: + entry->status = IB_WC_INV_EEC_STATE_ERR; + break; + default: + entry->status = IB_WC_GENERAL_ERR; + break; + } + + entry->vendor_err = cqe->vendor_err; + + /* + * Mem-free HCAs always generate one CQE per WQE, even in the + * error case, so we don't have to check the doorbell count, etc. + */ + if (mthca_is_memfree(dev)) + return; + + mthca_free_err_wqe(dev, qp, is_send, wqe_index, &dbd, &new_wqe); + + /* + * If we're at the end of the WQE chain, or we've used up our + * doorbell count, free the CQE. Otherwise just update it for + * the next poll operation. + */ + if (!(new_wqe & cpu_to_be32(0x3f)) || (!cqe->db_cnt && dbd)) + return; + + be16_add_cpu(&cqe->db_cnt, -dbd); + cqe->wqe = new_wqe; + cqe->syndrome = SYNDROME_WR_FLUSH_ERR; + + *free_cqe = 0; +} + +static inline int mthca_poll_one(struct mthca_dev *dev, + struct mthca_cq *cq, + struct mthca_qp **cur_qp, + int *freed, + struct ib_wc *entry) +{ + struct mthca_wq *wq; + struct mthca_cqe *cqe; + int wqe_index; + int is_error; + int is_send; + int free_cqe = 1; + int err = 0; + u16 checksum; + + cqe = next_cqe_sw(cq); + if (!cqe) + return -EAGAIN; + + /* + * Make sure we read CQ entry contents after we've checked the + * ownership bit. + */ + rmb(); + + if (0) { + mthca_dbg(dev, "%x/%d: CQE -> QPN %06x, WQE @ %08x\n", + cq->cqn, cq->cons_index, be32_to_cpu(cqe->my_qpn), + be32_to_cpu(cqe->wqe)); + dump_cqe(dev, cqe); + } + + is_error = (cqe->opcode & MTHCA_ERROR_CQE_OPCODE_MASK) == + MTHCA_ERROR_CQE_OPCODE_MASK; + is_send = is_error ? cqe->opcode & 0x01 : cqe->is_send & 0x80; + + if (!*cur_qp || be32_to_cpu(cqe->my_qpn) != (*cur_qp)->qpn) { + /* + * We do not have to take the QP table lock here, + * because CQs will be locked while QPs are removed + * from the table. + */ + *cur_qp = mthca_array_get(&dev->qp_table.qp, + be32_to_cpu(cqe->my_qpn) & + (dev->limits.num_qps - 1)); + if (!*cur_qp) { + mthca_warn(dev, "CQ entry for unknown QP %06x\n", + be32_to_cpu(cqe->my_qpn) & 0xffffff); + err = -EINVAL; + goto out; + } + } + + entry->qp = &(*cur_qp)->ibqp; + + if (is_send) { + wq = &(*cur_qp)->sq; + wqe_index = ((be32_to_cpu(cqe->wqe) - (*cur_qp)->send_wqe_offset) + >> wq->wqe_shift); + entry->wr_id = (*cur_qp)->wrid[wqe_index + + (*cur_qp)->rq.max]; + } else if ((*cur_qp)->ibqp.srq) { + struct mthca_srq *srq = to_msrq((*cur_qp)->ibqp.srq); + u32 wqe = be32_to_cpu(cqe->wqe); + wq = NULL; + wqe_index = wqe >> srq->wqe_shift; + entry->wr_id = srq->wrid[wqe_index]; + mthca_free_srq_wqe(srq, wqe); + } else { + s32 wqe; + wq = &(*cur_qp)->rq; + wqe = be32_to_cpu(cqe->wqe); + wqe_index = wqe >> wq->wqe_shift; + /* + * WQE addr == base - 1 might be reported in receive completion + * with error instead of (rq size - 1) by Sinai FW 1.0.800 and + * Arbel FW 5.1.400. This bug should be fixed in later FW revs. + */ + if (unlikely(wqe_index < 0)) + wqe_index = wq->max - 1; + entry->wr_id = (*cur_qp)->wrid[wqe_index]; + } + + if (wq) { + if (wq->last_comp < wqe_index) + wq->tail += wqe_index - wq->last_comp; + else + wq->tail += wqe_index + wq->max - wq->last_comp; + + wq->last_comp = wqe_index; + } + + if (is_error) { + handle_error_cqe(dev, cq, *cur_qp, wqe_index, is_send, + (struct mthca_err_cqe *) cqe, + entry, &free_cqe); + goto out; + } + + if (is_send) { + entry->wc_flags = 0; + switch (cqe->opcode) { + case MTHCA_OPCODE_RDMA_WRITE: + entry->opcode = IB_WC_RDMA_WRITE; + break; + case MTHCA_OPCODE_RDMA_WRITE_IMM: + entry->opcode = IB_WC_RDMA_WRITE; + entry->wc_flags |= IB_WC_WITH_IMM; + break; + case MTHCA_OPCODE_SEND: + entry->opcode = IB_WC_SEND; + break; + case MTHCA_OPCODE_SEND_IMM: + entry->opcode = IB_WC_SEND; + entry->wc_flags |= IB_WC_WITH_IMM; + break; + case MTHCA_OPCODE_RDMA_READ: + entry->opcode = IB_WC_RDMA_READ; + entry->byte_len = be32_to_cpu(cqe->byte_cnt); + break; + case MTHCA_OPCODE_ATOMIC_CS: + entry->opcode = IB_WC_COMP_SWAP; + entry->byte_len = MTHCA_ATOMIC_BYTE_LEN; + break; + case MTHCA_OPCODE_ATOMIC_FA: + entry->opcode = IB_WC_FETCH_ADD; + entry->byte_len = MTHCA_ATOMIC_BYTE_LEN; + break; + case MTHCA_OPCODE_BIND_MW: + entry->opcode = IB_WC_BIND_MW; + break; + default: + entry->opcode = MTHCA_OPCODE_INVALID; + break; + } + } else { + entry->byte_len = be32_to_cpu(cqe->byte_cnt); + switch (cqe->opcode & 0x1f) { + case IB_OPCODE_SEND_LAST_WITH_IMMEDIATE: + case IB_OPCODE_SEND_ONLY_WITH_IMMEDIATE: + entry->wc_flags = IB_WC_WITH_IMM; + entry->ex.imm_data = cqe->imm_etype_pkey_eec; + entry->opcode = IB_WC_RECV; + break; + case IB_OPCODE_RDMA_WRITE_LAST_WITH_IMMEDIATE: + case IB_OPCODE_RDMA_WRITE_ONLY_WITH_IMMEDIATE: + entry->wc_flags = IB_WC_WITH_IMM; + entry->ex.imm_data = cqe->imm_etype_pkey_eec; + entry->opcode = IB_WC_RECV_RDMA_WITH_IMM; + break; + default: + entry->wc_flags = 0; + entry->opcode = IB_WC_RECV; + break; + } + entry->slid = be16_to_cpu(cqe->rlid); + entry->sl = cqe->sl_ipok >> 4; + entry->src_qp = be32_to_cpu(cqe->rqpn) & 0xffffff; + entry->dlid_path_bits = cqe->g_mlpath & 0x7f; + entry->pkey_index = be32_to_cpu(cqe->imm_etype_pkey_eec) >> 16; + entry->wc_flags |= cqe->g_mlpath & 0x80 ? IB_WC_GRH : 0; + checksum = (be32_to_cpu(cqe->rqpn) >> 24) | + ((be32_to_cpu(cqe->my_ee) >> 16) & 0xff00); + entry->wc_flags |= (cqe->sl_ipok & 1 && checksum == 0xffff) ? + IB_WC_IP_CSUM_OK : 0; + } + + entry->status = IB_WC_SUCCESS; + + out: + if (likely(free_cqe)) { + set_cqe_hw(cqe); + ++(*freed); + ++cq->cons_index; + } + + return err; +} + +int mthca_poll_cq(struct ib_cq *ibcq, int num_entries, + struct ib_wc *entry) +{ + struct mthca_dev *dev = to_mdev(ibcq->device); + struct mthca_cq *cq = to_mcq(ibcq); + struct mthca_qp *qp = NULL; + unsigned long flags; + int err = 0; + int freed = 0; + int npolled; + + spin_lock_irqsave(&cq->lock, flags); + + npolled = 0; +repoll: + while (npolled < num_entries) { + err = mthca_poll_one(dev, cq, &qp, + &freed, entry + npolled); + if (err) + break; + ++npolled; + } + + if (freed) { + wmb(); + update_cons_index(dev, cq, freed); + } + + /* + * If a CQ resize is in progress and we discovered that the + * old buffer is empty, then peek in the new buffer, and if + * it's not empty, switch to the new buffer and continue + * polling there. + */ + if (unlikely(err == -EAGAIN && cq->resize_buf && + cq->resize_buf->state == CQ_RESIZE_READY)) { + /* + * In Tavor mode, the hardware keeps the producer + * index modulo the CQ size. Since we might be making + * the CQ bigger, we need to mask our consumer index + * using the size of the old CQ buffer before looking + * in the new CQ buffer. + */ + if (!mthca_is_memfree(dev)) + cq->cons_index &= cq->ibcq.cqe; + + if (cqe_sw(get_cqe_from_buf(&cq->resize_buf->buf, + cq->cons_index & cq->resize_buf->cqe))) { + struct mthca_cq_buf tbuf; + int tcqe; + + tbuf = cq->buf; + tcqe = cq->ibcq.cqe; + cq->buf = cq->resize_buf->buf; + cq->ibcq.cqe = cq->resize_buf->cqe; + + cq->resize_buf->buf = tbuf; + cq->resize_buf->cqe = tcqe; + cq->resize_buf->state = CQ_RESIZE_SWAPPED; + + goto repoll; + } + } + + spin_unlock_irqrestore(&cq->lock, flags); + + return err == 0 || err == -EAGAIN ? npolled : err; +} + +int mthca_tavor_arm_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags) +{ + u32 dbhi = ((flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED ? + MTHCA_TAVOR_CQ_DB_REQ_NOT_SOL : + MTHCA_TAVOR_CQ_DB_REQ_NOT) | + to_mcq(cq)->cqn; + + mthca_write64(dbhi, 0xffffffff, to_mdev(cq->device)->kar + MTHCA_CQ_DOORBELL, + MTHCA_GET_DOORBELL_LOCK(&to_mdev(cq->device)->doorbell_lock)); + + return 0; +} + +int mthca_arbel_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags) +{ + struct mthca_cq *cq = to_mcq(ibcq); + __be32 db_rec[2]; + u32 dbhi; + u32 sn = cq->arm_sn & 3; + + db_rec[0] = cpu_to_be32(cq->cons_index); + db_rec[1] = cpu_to_be32((cq->cqn << 8) | (2 << 5) | (sn << 3) | + ((flags & IB_CQ_SOLICITED_MASK) == + IB_CQ_SOLICITED ? 1 : 2)); + + mthca_write_db_rec(db_rec, cq->arm_db); + + /* + * Make sure that the doorbell record in host memory is + * written before ringing the doorbell via PCI MMIO. + */ + wmb(); + + dbhi = (sn << 28) | + ((flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED ? + MTHCA_ARBEL_CQ_DB_REQ_NOT_SOL : + MTHCA_ARBEL_CQ_DB_REQ_NOT) | cq->cqn; + + mthca_write64(dbhi, cq->cons_index, + to_mdev(ibcq->device)->kar + MTHCA_CQ_DOORBELL, + MTHCA_GET_DOORBELL_LOCK(&to_mdev(ibcq->device)->doorbell_lock)); + + return 0; +} + +int mthca_init_cq(struct mthca_dev *dev, int nent, + struct mthca_ucontext *ctx, u32 pdn, + struct mthca_cq *cq) +{ + struct mthca_mailbox *mailbox; + struct mthca_cq_context *cq_context; + int err = -ENOMEM; + + cq->ibcq.cqe = nent - 1; + cq->is_kernel = !ctx; + + cq->cqn = mthca_alloc(&dev->cq_table.alloc); + if (cq->cqn == -1) + return -ENOMEM; + + if (mthca_is_memfree(dev)) { + err = mthca_table_get(dev, dev->cq_table.table, cq->cqn); + if (err) + goto err_out; + + if (cq->is_kernel) { + cq->arm_sn = 1; + + err = -ENOMEM; + + cq->set_ci_db_index = mthca_alloc_db(dev, MTHCA_DB_TYPE_CQ_SET_CI, + cq->cqn, &cq->set_ci_db); + if (cq->set_ci_db_index < 0) + goto err_out_icm; + + cq->arm_db_index = mthca_alloc_db(dev, MTHCA_DB_TYPE_CQ_ARM, + cq->cqn, &cq->arm_db); + if (cq->arm_db_index < 0) + goto err_out_ci; + } + } + + mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL); + if (IS_ERR(mailbox)) + goto err_out_arm; + + cq_context = mailbox->buf; + + if (cq->is_kernel) { + err = mthca_alloc_cq_buf(dev, &cq->buf, nent); + if (err) + goto err_out_mailbox; + } + + spin_lock_init(&cq->lock); + cq->refcount = 1; + init_waitqueue_head(&cq->wait); + mutex_init(&cq->mutex); + + memset(cq_context, 0, sizeof *cq_context); + cq_context->flags = cpu_to_be32(MTHCA_CQ_STATUS_OK | + MTHCA_CQ_STATE_DISARMED | + MTHCA_CQ_FLAG_TR); + cq_context->logsize_usrpage = cpu_to_be32((ffs(nent) - 1) << 24); + if (ctx) + cq_context->logsize_usrpage |= cpu_to_be32(ctx->uar.index); + else + cq_context->logsize_usrpage |= cpu_to_be32(dev->driver_uar.index); + cq_context->error_eqn = cpu_to_be32(dev->eq_table.eq[MTHCA_EQ_ASYNC].eqn); + cq_context->comp_eqn = cpu_to_be32(dev->eq_table.eq[MTHCA_EQ_COMP].eqn); + cq_context->pd = cpu_to_be32(pdn); + cq_context->lkey = cpu_to_be32(cq->buf.mr.ibmr.lkey); + cq_context->cqn = cpu_to_be32(cq->cqn); + + if (mthca_is_memfree(dev)) { + cq_context->ci_db = cpu_to_be32(cq->set_ci_db_index); + cq_context->state_db = cpu_to_be32(cq->arm_db_index); + } + + err = mthca_SW2HW_CQ(dev, mailbox, cq->cqn); + if (err) { + mthca_warn(dev, "SW2HW_CQ failed (%d)\n", err); + goto err_out_free_mr; + } + + spin_lock_irq(&dev->cq_table.lock); + if (mthca_array_set(&dev->cq_table.cq, + cq->cqn & (dev->limits.num_cqs - 1), + cq)) { + spin_unlock_irq(&dev->cq_table.lock); + goto err_out_free_mr; + } + spin_unlock_irq(&dev->cq_table.lock); + + cq->cons_index = 0; + + mthca_free_mailbox(dev, mailbox); + + return 0; + +err_out_free_mr: + if (cq->is_kernel) + mthca_free_cq_buf(dev, &cq->buf, cq->ibcq.cqe); + +err_out_mailbox: + mthca_free_mailbox(dev, mailbox); + +err_out_arm: + if (cq->is_kernel && mthca_is_memfree(dev)) + mthca_free_db(dev, MTHCA_DB_TYPE_CQ_ARM, cq->arm_db_index); + +err_out_ci: + if (cq->is_kernel && mthca_is_memfree(dev)) + mthca_free_db(dev, MTHCA_DB_TYPE_CQ_SET_CI, cq->set_ci_db_index); + +err_out_icm: + mthca_table_put(dev, dev->cq_table.table, cq->cqn); + +err_out: + mthca_free(&dev->cq_table.alloc, cq->cqn); + + return err; +} + +static inline int get_cq_refcount(struct mthca_dev *dev, struct mthca_cq *cq) +{ + int c; + + spin_lock_irq(&dev->cq_table.lock); + c = cq->refcount; + spin_unlock_irq(&dev->cq_table.lock); + + return c; +} + +void mthca_free_cq(struct mthca_dev *dev, + struct mthca_cq *cq) +{ + struct mthca_mailbox *mailbox; + int err; + + mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL); + if (IS_ERR(mailbox)) { + mthca_warn(dev, "No memory for mailbox to free CQ.\n"); + return; + } + + err = mthca_HW2SW_CQ(dev, mailbox, cq->cqn); + if (err) + mthca_warn(dev, "HW2SW_CQ failed (%d)\n", err); + + if (0) { + __be32 *ctx = mailbox->buf; + int j; + + printk(KERN_ERR "context for CQN %x (cons index %x, next sw %d)\n", + cq->cqn, cq->cons_index, + cq->is_kernel ? !!next_cqe_sw(cq) : 0); + for (j = 0; j < 16; ++j) + printk(KERN_ERR "[%2x] %08x\n", j * 4, be32_to_cpu(ctx[j])); + } + + spin_lock_irq(&dev->cq_table.lock); + mthca_array_clear(&dev->cq_table.cq, + cq->cqn & (dev->limits.num_cqs - 1)); + --cq->refcount; + spin_unlock_irq(&dev->cq_table.lock); + + if (dev->mthca_flags & MTHCA_FLAG_MSI_X) + synchronize_irq(dev->eq_table.eq[MTHCA_EQ_COMP].msi_x_vector); + else + synchronize_irq(dev->pdev->irq); + + wait_event(cq->wait, !get_cq_refcount(dev, cq)); + + if (cq->is_kernel) { + mthca_free_cq_buf(dev, &cq->buf, cq->ibcq.cqe); + if (mthca_is_memfree(dev)) { + mthca_free_db(dev, MTHCA_DB_TYPE_CQ_ARM, cq->arm_db_index); + mthca_free_db(dev, MTHCA_DB_TYPE_CQ_SET_CI, cq->set_ci_db_index); + } + } + + mthca_table_put(dev, dev->cq_table.table, cq->cqn); + mthca_free(&dev->cq_table.alloc, cq->cqn); + mthca_free_mailbox(dev, mailbox); +} + +int mthca_init_cq_table(struct mthca_dev *dev) +{ + int err; + + spin_lock_init(&dev->cq_table.lock); + + err = mthca_alloc_init(&dev->cq_table.alloc, + dev->limits.num_cqs, + (1 << 24) - 1, + dev->limits.reserved_cqs); + if (err) + return err; + + err = mthca_array_init(&dev->cq_table.cq, + dev->limits.num_cqs); + if (err) + mthca_alloc_cleanup(&dev->cq_table.alloc); + + return err; +} + +void mthca_cleanup_cq_table(struct mthca_dev *dev) +{ + mthca_array_cleanup(&dev->cq_table.cq, dev->limits.num_cqs); + mthca_alloc_cleanup(&dev->cq_table.alloc); +} diff --git a/drivers/infiniband/hw/mthca/mthca_dev.h b/drivers/infiniband/hw/mthca/mthca_dev.h new file mode 100644 index 0000000..7e6a6d6 --- /dev/null +++ b/drivers/infiniband/hw/mthca/mthca_dev.h @@ -0,0 +1,596 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies. All rights reserved. + * Copyright (c) 2004 Voltaire, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef MTHCA_DEV_H +#define MTHCA_DEV_H + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "mthca_provider.h" +#include "mthca_doorbell.h" + +#define DRV_NAME "ib_mthca" +#define PFX DRV_NAME ": " +#define DRV_VERSION "1.0" +#define DRV_RELDATE "April 4, 2008" + +enum { + MTHCA_FLAG_DDR_HIDDEN = 1 << 1, + MTHCA_FLAG_SRQ = 1 << 2, + MTHCA_FLAG_MSI_X = 1 << 3, + MTHCA_FLAG_NO_LAM = 1 << 4, + MTHCA_FLAG_FMR = 1 << 5, + MTHCA_FLAG_MEMFREE = 1 << 6, + MTHCA_FLAG_PCIE = 1 << 7, + MTHCA_FLAG_SINAI_OPT = 1 << 8 +}; + +enum { + MTHCA_MAX_PORTS = 2 +}; + +enum { + MTHCA_BOARD_ID_LEN = 64 +}; + +enum { + MTHCA_EQ_CONTEXT_SIZE = 0x40, + MTHCA_CQ_CONTEXT_SIZE = 0x40, + MTHCA_QP_CONTEXT_SIZE = 0x200, + MTHCA_RDB_ENTRY_SIZE = 0x20, + MTHCA_AV_SIZE = 0x20, + MTHCA_MGM_ENTRY_SIZE = 0x100, + + /* Arbel FW gives us these, but we need them for Tavor */ + MTHCA_MPT_ENTRY_SIZE = 0x40, + MTHCA_MTT_SEG_SIZE = 0x40, + + MTHCA_QP_PER_MGM = 4 * (MTHCA_MGM_ENTRY_SIZE / 16 - 2) +}; + +enum { + MTHCA_EQ_CMD, + MTHCA_EQ_ASYNC, + MTHCA_EQ_COMP, + MTHCA_NUM_EQ +}; + +enum { + MTHCA_OPCODE_NOP = 0x00, + MTHCA_OPCODE_RDMA_WRITE = 0x08, + MTHCA_OPCODE_RDMA_WRITE_IMM = 0x09, + MTHCA_OPCODE_SEND = 0x0a, + MTHCA_OPCODE_SEND_IMM = 0x0b, + MTHCA_OPCODE_RDMA_READ = 0x10, + MTHCA_OPCODE_ATOMIC_CS = 0x11, + MTHCA_OPCODE_ATOMIC_FA = 0x12, + MTHCA_OPCODE_BIND_MW = 0x18, + MTHCA_OPCODE_INVALID = 0xff +}; + +enum { + MTHCA_CMD_USE_EVENTS = 1 << 0, + MTHCA_CMD_POST_DOORBELLS = 1 << 1 +}; + +enum { + MTHCA_CMD_NUM_DBELL_DWORDS = 8 +}; + +struct mthca_cmd { + struct pci_pool *pool; + struct mutex hcr_mutex; + struct semaphore poll_sem; + struct semaphore event_sem; + int max_cmds; + spinlock_t context_lock; + int free_head; + struct mthca_cmd_context *context; + u16 token_mask; + u32 flags; + void __iomem *dbell_map; + u16 dbell_offsets[MTHCA_CMD_NUM_DBELL_DWORDS]; +}; + +struct mthca_limits { + int num_ports; + int vl_cap; + int mtu_cap; + int gid_table_len; + int pkey_table_len; + int local_ca_ack_delay; + int num_uars; + int max_sg; + int num_qps; + int max_wqes; + int max_desc_sz; + int max_qp_init_rdma; + int reserved_qps; + int num_srqs; + int max_srq_wqes; + int max_srq_sge; + int reserved_srqs; + int num_eecs; + int reserved_eecs; + int num_cqs; + int max_cqes; + int reserved_cqs; + int num_eqs; + int reserved_eqs; + int num_mpts; + int num_mtt_segs; + int mtt_seg_size; + int fmr_reserved_mtts; + int reserved_mtts; + int reserved_mrws; + int reserved_uars; + int num_mgms; + int num_amgms; + int reserved_mcgs; + int num_pds; + int reserved_pds; + u32 page_size_cap; + u32 flags; + u16 stat_rate_support; + u8 port_width_cap; +}; + +struct mthca_alloc { + u32 last; + u32 top; + u32 max; + u32 mask; + spinlock_t lock; + unsigned long *table; +}; + +struct mthca_array { + struct { + void **page; + int used; + } *page_list; +}; + +struct mthca_uar_table { + struct mthca_alloc alloc; + u64 uarc_base; + int uarc_size; +}; + +struct mthca_pd_table { + struct mthca_alloc alloc; +}; + +struct mthca_buddy { + unsigned long **bits; + int *num_free; + int max_order; + spinlock_t lock; +}; + +struct mthca_mr_table { + struct mthca_alloc mpt_alloc; + struct mthca_buddy mtt_buddy; + struct mthca_buddy *fmr_mtt_buddy; + u64 mtt_base; + u64 mpt_base; + struct mthca_icm_table *mtt_table; + struct mthca_icm_table *mpt_table; + struct { + void __iomem *mpt_base; + void __iomem *mtt_base; + struct mthca_buddy mtt_buddy; + } tavor_fmr; +}; + +struct mthca_eq_table { + struct mthca_alloc alloc; + void __iomem *clr_int; + u32 clr_mask; + u32 arm_mask; + struct mthca_eq eq[MTHCA_NUM_EQ]; + u64 icm_virt; + struct page *icm_page; + dma_addr_t icm_dma; + int have_irq; + u8 inta_pin; +}; + +struct mthca_cq_table { + struct mthca_alloc alloc; + spinlock_t lock; + struct mthca_array cq; + struct mthca_icm_table *table; +}; + +struct mthca_srq_table { + struct mthca_alloc alloc; + spinlock_t lock; + struct mthca_array srq; + struct mthca_icm_table *table; +}; + +struct mthca_qp_table { + struct mthca_alloc alloc; + u32 rdb_base; + int rdb_shift; + int sqp_start; + spinlock_t lock; + struct mthca_array qp; + struct mthca_icm_table *qp_table; + struct mthca_icm_table *eqp_table; + struct mthca_icm_table *rdb_table; +}; + +struct mthca_av_table { + struct pci_pool *pool; + int num_ddr_avs; + u64 ddr_av_base; + void __iomem *av_map; + struct mthca_alloc alloc; +}; + +struct mthca_mcg_table { + struct mutex mutex; + struct mthca_alloc alloc; + struct mthca_icm_table *table; +}; + +struct mthca_catas_err { + u64 addr; + u32 __iomem *map; + u32 size; + struct timer_list timer; + struct list_head list; +}; + +extern struct mutex mthca_device_mutex; + +struct mthca_dev { + struct ib_device ib_dev; + struct pci_dev *pdev; + + int hca_type; + unsigned long mthca_flags; + unsigned long device_cap_flags; + + u32 rev_id; + char board_id[MTHCA_BOARD_ID_LEN]; + + /* firmware info */ + u64 fw_ver; + union { + struct { + u64 fw_start; + u64 fw_end; + } tavor; + struct { + u64 clr_int_base; + u64 eq_arm_base; + u64 eq_set_ci_base; + struct mthca_icm *fw_icm; + struct mthca_icm *aux_icm; + u16 fw_pages; + } arbel; + } fw; + + u64 ddr_start; + u64 ddr_end; + + MTHCA_DECLARE_DOORBELL_LOCK(doorbell_lock) + struct mutex cap_mask_mutex; + + void __iomem *hcr; + void __iomem *kar; + void __iomem *clr_base; + union { + struct { + void __iomem *ecr_base; + } tavor; + struct { + void __iomem *eq_arm; + void __iomem *eq_set_ci_base; + } arbel; + } eq_regs; + + struct mthca_cmd cmd; + struct mthca_limits limits; + + struct mthca_uar_table uar_table; + struct mthca_pd_table pd_table; + struct mthca_mr_table mr_table; + struct mthca_eq_table eq_table; + struct mthca_cq_table cq_table; + struct mthca_srq_table srq_table; + struct mthca_qp_table qp_table; + struct mthca_av_table av_table; + struct mthca_mcg_table mcg_table; + + struct mthca_catas_err catas_err; + + struct mthca_uar driver_uar; + struct mthca_db_table *db_tab; + struct mthca_pd driver_pd; + struct mthca_mr driver_mr; + + struct ib_mad_agent *send_agent[MTHCA_MAX_PORTS][2]; + struct ib_ah *sm_ah[MTHCA_MAX_PORTS]; + spinlock_t sm_lock; + u8 rate[MTHCA_MAX_PORTS]; + bool active; +}; + +#ifdef CONFIG_INFINIBAND_MTHCA_DEBUG +extern int mthca_debug_level; + +#define mthca_dbg(mdev, format, arg...) \ + do { \ + if (mthca_debug_level) \ + dev_printk(KERN_DEBUG, &mdev->pdev->dev, format, ## arg); \ + } while (0) + +#else /* CONFIG_INFINIBAND_MTHCA_DEBUG */ + +#define mthca_dbg(mdev, format, arg...) do { (void) mdev; } while (0) + +#endif /* CONFIG_INFINIBAND_MTHCA_DEBUG */ + +#define mthca_err(mdev, format, arg...) \ + dev_err(&mdev->pdev->dev, format, ## arg) +#define mthca_info(mdev, format, arg...) \ + dev_info(&mdev->pdev->dev, format, ## arg) +#define mthca_warn(mdev, format, arg...) \ + dev_warn(&mdev->pdev->dev, format, ## arg) + +extern void __buggy_use_of_MTHCA_GET(void); +extern void __buggy_use_of_MTHCA_PUT(void); + +#define MTHCA_GET(dest, source, offset) \ + do { \ + void *__p = (char *) (source) + (offset); \ + switch (sizeof (dest)) { \ + case 1: (dest) = *(u8 *) __p; break; \ + case 2: (dest) = be16_to_cpup(__p); break; \ + case 4: (dest) = be32_to_cpup(__p); break; \ + case 8: (dest) = be64_to_cpup(__p); break; \ + default: __buggy_use_of_MTHCA_GET(); \ + } \ + } while (0) + +#define MTHCA_PUT(dest, source, offset) \ + do { \ + void *__d = ((char *) (dest) + (offset)); \ + switch (sizeof(source)) { \ + case 1: *(u8 *) __d = (source); break; \ + case 2: *(__be16 *) __d = cpu_to_be16(source); break; \ + case 4: *(__be32 *) __d = cpu_to_be32(source); break; \ + case 8: *(__be64 *) __d = cpu_to_be64(source); break; \ + default: __buggy_use_of_MTHCA_PUT(); \ + } \ + } while (0) + +int mthca_reset(struct mthca_dev *mdev); + +u32 mthca_alloc(struct mthca_alloc *alloc); +void mthca_free(struct mthca_alloc *alloc, u32 obj); +int mthca_alloc_init(struct mthca_alloc *alloc, u32 num, u32 mask, + u32 reserved); +void mthca_alloc_cleanup(struct mthca_alloc *alloc); +void *mthca_array_get(struct mthca_array *array, int index); +int mthca_array_set(struct mthca_array *array, int index, void *value); +void mthca_array_clear(struct mthca_array *array, int index); +int mthca_array_init(struct mthca_array *array, int nent); +void mthca_array_cleanup(struct mthca_array *array, int nent); +int mthca_buf_alloc(struct mthca_dev *dev, int size, int max_direct, + union mthca_buf *buf, int *is_direct, struct mthca_pd *pd, + int hca_write, struct mthca_mr *mr); +void mthca_buf_free(struct mthca_dev *dev, int size, union mthca_buf *buf, + int is_direct, struct mthca_mr *mr); + +int mthca_init_uar_table(struct mthca_dev *dev); +int mthca_init_pd_table(struct mthca_dev *dev); +int mthca_init_mr_table(struct mthca_dev *dev); +int mthca_init_eq_table(struct mthca_dev *dev); +int mthca_init_cq_table(struct mthca_dev *dev); +int mthca_init_srq_table(struct mthca_dev *dev); +int mthca_init_qp_table(struct mthca_dev *dev); +int mthca_init_av_table(struct mthca_dev *dev); +int mthca_init_mcg_table(struct mthca_dev *dev); + +void mthca_cleanup_uar_table(struct mthca_dev *dev); +void mthca_cleanup_pd_table(struct mthca_dev *dev); +void mthca_cleanup_mr_table(struct mthca_dev *dev); +void mthca_cleanup_eq_table(struct mthca_dev *dev); +void mthca_cleanup_cq_table(struct mthca_dev *dev); +void mthca_cleanup_srq_table(struct mthca_dev *dev); +void mthca_cleanup_qp_table(struct mthca_dev *dev); +void mthca_cleanup_av_table(struct mthca_dev *dev); +void mthca_cleanup_mcg_table(struct mthca_dev *dev); + +int mthca_register_device(struct mthca_dev *dev); +void mthca_unregister_device(struct mthca_dev *dev); + +void mthca_start_catas_poll(struct mthca_dev *dev); +void mthca_stop_catas_poll(struct mthca_dev *dev); +int __mthca_restart_one(struct pci_dev *pdev); +int mthca_catas_init(void); +void mthca_catas_cleanup(void); + +int mthca_uar_alloc(struct mthca_dev *dev, struct mthca_uar *uar); +void mthca_uar_free(struct mthca_dev *dev, struct mthca_uar *uar); + +int mthca_pd_alloc(struct mthca_dev *dev, int privileged, struct mthca_pd *pd); +void mthca_pd_free(struct mthca_dev *dev, struct mthca_pd *pd); + +int mthca_write_mtt_size(struct mthca_dev *dev); + +struct mthca_mtt *mthca_alloc_mtt(struct mthca_dev *dev, int size); +void mthca_free_mtt(struct mthca_dev *dev, struct mthca_mtt *mtt); +int mthca_write_mtt(struct mthca_dev *dev, struct mthca_mtt *mtt, + int start_index, u64 *buffer_list, int list_len); +int mthca_mr_alloc(struct mthca_dev *dev, u32 pd, int buffer_size_shift, + u64 iova, u64 total_size, u32 access, struct mthca_mr *mr); +int mthca_mr_alloc_notrans(struct mthca_dev *dev, u32 pd, + u32 access, struct mthca_mr *mr); +int mthca_mr_alloc_phys(struct mthca_dev *dev, u32 pd, + u64 *buffer_list, int buffer_size_shift, + int list_len, u64 iova, u64 total_size, + u32 access, struct mthca_mr *mr); +void mthca_free_mr(struct mthca_dev *dev, struct mthca_mr *mr); + +int mthca_fmr_alloc(struct mthca_dev *dev, u32 pd, + u32 access, struct mthca_fmr *fmr); +int mthca_tavor_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list, + int list_len, u64 iova); +void mthca_tavor_fmr_unmap(struct mthca_dev *dev, struct mthca_fmr *fmr); +int mthca_arbel_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list, + int list_len, u64 iova); +void mthca_arbel_fmr_unmap(struct mthca_dev *dev, struct mthca_fmr *fmr); +int mthca_free_fmr(struct mthca_dev *dev, struct mthca_fmr *fmr); + +int mthca_map_eq_icm(struct mthca_dev *dev, u64 icm_virt); +void mthca_unmap_eq_icm(struct mthca_dev *dev); + +int mthca_poll_cq(struct ib_cq *ibcq, int num_entries, + struct ib_wc *entry); +int mthca_tavor_arm_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags); +int mthca_arbel_arm_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags); +int mthca_init_cq(struct mthca_dev *dev, int nent, + struct mthca_ucontext *ctx, u32 pdn, + struct mthca_cq *cq); +void mthca_free_cq(struct mthca_dev *dev, + struct mthca_cq *cq); +void mthca_cq_completion(struct mthca_dev *dev, u32 cqn); +void mthca_cq_event(struct mthca_dev *dev, u32 cqn, + enum ib_event_type event_type); +void mthca_cq_clean(struct mthca_dev *dev, struct mthca_cq *cq, u32 qpn, + struct mthca_srq *srq); +void mthca_cq_resize_copy_cqes(struct mthca_cq *cq); +int mthca_alloc_cq_buf(struct mthca_dev *dev, struct mthca_cq_buf *buf, int nent); +void mthca_free_cq_buf(struct mthca_dev *dev, struct mthca_cq_buf *buf, int cqe); + +int mthca_alloc_srq(struct mthca_dev *dev, struct mthca_pd *pd, + struct ib_srq_attr *attr, struct mthca_srq *srq); +void mthca_free_srq(struct mthca_dev *dev, struct mthca_srq *srq); +int mthca_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, + enum ib_srq_attr_mask attr_mask, struct ib_udata *udata); +int mthca_query_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr); +int mthca_max_srq_sge(struct mthca_dev *dev); +void mthca_srq_event(struct mthca_dev *dev, u32 srqn, + enum ib_event_type event_type); +void mthca_free_srq_wqe(struct mthca_srq *srq, u32 wqe_addr); +int mthca_tavor_post_srq_recv(struct ib_srq *srq, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr); +int mthca_arbel_post_srq_recv(struct ib_srq *srq, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr); + +void mthca_qp_event(struct mthca_dev *dev, u32 qpn, + enum ib_event_type event_type); +int mthca_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr_mask, + struct ib_qp_init_attr *qp_init_attr); +int mthca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask, + struct ib_udata *udata); +int mthca_tavor_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, + struct ib_send_wr **bad_wr); +int mthca_tavor_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr); +int mthca_arbel_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, + struct ib_send_wr **bad_wr); +int mthca_arbel_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr); +void mthca_free_err_wqe(struct mthca_dev *dev, struct mthca_qp *qp, int is_send, + int index, int *dbd, __be32 *new_wqe); +int mthca_alloc_qp(struct mthca_dev *dev, + struct mthca_pd *pd, + struct mthca_cq *send_cq, + struct mthca_cq *recv_cq, + enum ib_qp_type type, + enum ib_sig_type send_policy, + struct ib_qp_cap *cap, + struct mthca_qp *qp); +int mthca_alloc_sqp(struct mthca_dev *dev, + struct mthca_pd *pd, + struct mthca_cq *send_cq, + struct mthca_cq *recv_cq, + enum ib_sig_type send_policy, + struct ib_qp_cap *cap, + int qpn, + int port, + struct mthca_sqp *sqp); +void mthca_free_qp(struct mthca_dev *dev, struct mthca_qp *qp); +int mthca_create_ah(struct mthca_dev *dev, + struct mthca_pd *pd, + struct ib_ah_attr *ah_attr, + struct mthca_ah *ah); +int mthca_destroy_ah(struct mthca_dev *dev, struct mthca_ah *ah); +int mthca_read_ah(struct mthca_dev *dev, struct mthca_ah *ah, + struct ib_ud_header *header); +int mthca_ah_query(struct ib_ah *ibah, struct ib_ah_attr *attr); +int mthca_ah_grh_present(struct mthca_ah *ah); +u8 mthca_get_rate(struct mthca_dev *dev, int static_rate, u8 port); +enum ib_rate mthca_rate_to_ib(struct mthca_dev *dev, u8 mthca_rate, u8 port); + +int mthca_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid); +int mthca_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid); + +int mthca_process_mad(struct ib_device *ibdev, + int mad_flags, + u8 port_num, + struct ib_wc *in_wc, + struct ib_grh *in_grh, + struct ib_mad *in_mad, + struct ib_mad *out_mad); +int mthca_create_agents(struct mthca_dev *dev); +void mthca_free_agents(struct mthca_dev *dev); + +static inline struct mthca_dev *to_mdev(struct ib_device *ibdev) +{ + return container_of(ibdev, struct mthca_dev, ib_dev); +} + +static inline int mthca_is_memfree(struct mthca_dev *dev) +{ + return dev->mthca_flags & MTHCA_FLAG_MEMFREE; +} + +#endif /* MTHCA_DEV_H */ diff --git a/drivers/infiniband/hw/mthca/mthca_doorbell.h b/drivers/infiniband/hw/mthca/mthca_doorbell.h new file mode 100644 index 0000000..14f51ef --- /dev/null +++ b/drivers/infiniband/hw/mthca/mthca_doorbell.h @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2004 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include + +#define MTHCA_RD_DOORBELL 0x00 +#define MTHCA_SEND_DOORBELL 0x10 +#define MTHCA_RECEIVE_DOORBELL 0x18 +#define MTHCA_CQ_DOORBELL 0x20 +#define MTHCA_EQ_DOORBELL 0x28 + +#if BITS_PER_LONG == 64 +/* + * Assume that we can just write a 64-bit doorbell atomically. s390 + * actually doesn't have writeq() but S/390 systems don't even have + * PCI so we won't worry about it. + */ + +#define MTHCA_DECLARE_DOORBELL_LOCK(name) +#define MTHCA_INIT_DOORBELL_LOCK(ptr) do { } while (0) +#define MTHCA_GET_DOORBELL_LOCK(ptr) (NULL) + +static inline void mthca_write64_raw(__be64 val, void __iomem *dest) +{ + __raw_writeq((__force u64) val, dest); +} + +static inline void mthca_write64(u32 hi, u32 lo, void __iomem *dest, + spinlock_t *doorbell_lock) +{ + __raw_writeq((__force u64) cpu_to_be64((u64) hi << 32 | lo), dest); +} + +static inline void mthca_write_db_rec(__be32 val[2], __be32 *db) +{ + *(u64 *) db = *(u64 *) val; +} + +#else + +/* + * Just fall back to a spinlock to protect the doorbell if + * BITS_PER_LONG is 32 -- there's no portable way to do atomic 64-bit + * MMIO writes. + */ + +#define MTHCA_DECLARE_DOORBELL_LOCK(name) spinlock_t name; +#define MTHCA_INIT_DOORBELL_LOCK(ptr) spin_lock_init(ptr) +#define MTHCA_GET_DOORBELL_LOCK(ptr) (ptr) + +static inline void mthca_write64_raw(__be64 val, void __iomem *dest) +{ + __raw_writel(((__force u32 *) &val)[0], dest); + __raw_writel(((__force u32 *) &val)[1], dest + 4); +} + +static inline void mthca_write64(u32 hi, u32 lo, void __iomem *dest, + spinlock_t *doorbell_lock) +{ + unsigned long flags; + + hi = (__force u32) cpu_to_be32(hi); + lo = (__force u32) cpu_to_be32(lo); + + spin_lock_irqsave(doorbell_lock, flags); + __raw_writel(hi, dest); + __raw_writel(lo, dest + 4); + spin_unlock_irqrestore(doorbell_lock, flags); +} + +static inline void mthca_write_db_rec(__be32 val[2], __be32 *db) +{ + db[0] = val[0]; + wmb(); + db[1] = val[1]; +} + +#endif diff --git a/drivers/infiniband/hw/mthca/mthca_eq.c b/drivers/infiniband/hw/mthca/mthca_eq.c new file mode 100644 index 0000000..6902017 --- /dev/null +++ b/drivers/infiniband/hw/mthca/mthca_eq.c @@ -0,0 +1,905 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include + +#include "mthca_dev.h" +#include "mthca_cmd.h" +#include "mthca_config_reg.h" + +enum { + MTHCA_NUM_ASYNC_EQE = 0x80, + MTHCA_NUM_CMD_EQE = 0x80, + MTHCA_NUM_SPARE_EQE = 0x80, + MTHCA_EQ_ENTRY_SIZE = 0x20 +}; + +/* + * Must be packed because start is 64 bits but only aligned to 32 bits. + */ +struct mthca_eq_context { + __be32 flags; + __be64 start; + __be32 logsize_usrpage; + __be32 tavor_pd; /* reserved for Arbel */ + u8 reserved1[3]; + u8 intr; + __be32 arbel_pd; /* lost_count for Tavor */ + __be32 lkey; + u32 reserved2[2]; + __be32 consumer_index; + __be32 producer_index; + u32 reserved3[4]; +} __attribute__((packed)); + +#define MTHCA_EQ_STATUS_OK ( 0 << 28) +#define MTHCA_EQ_STATUS_OVERFLOW ( 9 << 28) +#define MTHCA_EQ_STATUS_WRITE_FAIL (10 << 28) +#define MTHCA_EQ_OWNER_SW ( 0 << 24) +#define MTHCA_EQ_OWNER_HW ( 1 << 24) +#define MTHCA_EQ_FLAG_TR ( 1 << 18) +#define MTHCA_EQ_FLAG_OI ( 1 << 17) +#define MTHCA_EQ_STATE_ARMED ( 1 << 8) +#define MTHCA_EQ_STATE_FIRED ( 2 << 8) +#define MTHCA_EQ_STATE_ALWAYS_ARMED ( 3 << 8) +#define MTHCA_EQ_STATE_ARBEL ( 8 << 8) + +enum { + MTHCA_EVENT_TYPE_COMP = 0x00, + MTHCA_EVENT_TYPE_PATH_MIG = 0x01, + MTHCA_EVENT_TYPE_COMM_EST = 0x02, + MTHCA_EVENT_TYPE_SQ_DRAINED = 0x03, + MTHCA_EVENT_TYPE_SRQ_QP_LAST_WQE = 0x13, + MTHCA_EVENT_TYPE_SRQ_LIMIT = 0x14, + MTHCA_EVENT_TYPE_CQ_ERROR = 0x04, + MTHCA_EVENT_TYPE_WQ_CATAS_ERROR = 0x05, + MTHCA_EVENT_TYPE_EEC_CATAS_ERROR = 0x06, + MTHCA_EVENT_TYPE_PATH_MIG_FAILED = 0x07, + MTHCA_EVENT_TYPE_WQ_INVAL_REQ_ERROR = 0x10, + MTHCA_EVENT_TYPE_WQ_ACCESS_ERROR = 0x11, + MTHCA_EVENT_TYPE_SRQ_CATAS_ERROR = 0x12, + MTHCA_EVENT_TYPE_LOCAL_CATAS_ERROR = 0x08, + MTHCA_EVENT_TYPE_PORT_CHANGE = 0x09, + MTHCA_EVENT_TYPE_EQ_OVERFLOW = 0x0f, + MTHCA_EVENT_TYPE_ECC_DETECT = 0x0e, + MTHCA_EVENT_TYPE_CMD = 0x0a +}; + +#define MTHCA_ASYNC_EVENT_MASK ((1ULL << MTHCA_EVENT_TYPE_PATH_MIG) | \ + (1ULL << MTHCA_EVENT_TYPE_COMM_EST) | \ + (1ULL << MTHCA_EVENT_TYPE_SQ_DRAINED) | \ + (1ULL << MTHCA_EVENT_TYPE_CQ_ERROR) | \ + (1ULL << MTHCA_EVENT_TYPE_WQ_CATAS_ERROR) | \ + (1ULL << MTHCA_EVENT_TYPE_EEC_CATAS_ERROR) | \ + (1ULL << MTHCA_EVENT_TYPE_PATH_MIG_FAILED) | \ + (1ULL << MTHCA_EVENT_TYPE_WQ_INVAL_REQ_ERROR) | \ + (1ULL << MTHCA_EVENT_TYPE_WQ_ACCESS_ERROR) | \ + (1ULL << MTHCA_EVENT_TYPE_LOCAL_CATAS_ERROR) | \ + (1ULL << MTHCA_EVENT_TYPE_PORT_CHANGE) | \ + (1ULL << MTHCA_EVENT_TYPE_ECC_DETECT)) +#define MTHCA_SRQ_EVENT_MASK ((1ULL << MTHCA_EVENT_TYPE_SRQ_CATAS_ERROR) | \ + (1ULL << MTHCA_EVENT_TYPE_SRQ_QP_LAST_WQE) | \ + (1ULL << MTHCA_EVENT_TYPE_SRQ_LIMIT)) +#define MTHCA_CMD_EVENT_MASK (1ULL << MTHCA_EVENT_TYPE_CMD) + +#define MTHCA_EQ_DB_INC_CI (1 << 24) +#define MTHCA_EQ_DB_REQ_NOT (2 << 24) +#define MTHCA_EQ_DB_DISARM_CQ (3 << 24) +#define MTHCA_EQ_DB_SET_CI (4 << 24) +#define MTHCA_EQ_DB_ALWAYS_ARM (5 << 24) + +struct mthca_eqe { + u8 reserved1; + u8 type; + u8 reserved2; + u8 subtype; + union { + u32 raw[6]; + struct { + __be32 cqn; + } __attribute__((packed)) comp; + struct { + u16 reserved1; + __be16 token; + u32 reserved2; + u8 reserved3[3]; + u8 status; + __be64 out_param; + } __attribute__((packed)) cmd; + struct { + __be32 qpn; + } __attribute__((packed)) qp; + struct { + __be32 srqn; + } __attribute__((packed)) srq; + struct { + __be32 cqn; + u32 reserved1; + u8 reserved2[3]; + u8 syndrome; + } __attribute__((packed)) cq_err; + struct { + u32 reserved1[2]; + __be32 port; + } __attribute__((packed)) port_change; + } event; + u8 reserved3[3]; + u8 owner; +} __attribute__((packed)); + +#define MTHCA_EQ_ENTRY_OWNER_SW (0 << 7) +#define MTHCA_EQ_ENTRY_OWNER_HW (1 << 7) + +static inline u64 async_mask(struct mthca_dev *dev) +{ + return dev->mthca_flags & MTHCA_FLAG_SRQ ? + MTHCA_ASYNC_EVENT_MASK | MTHCA_SRQ_EVENT_MASK : + MTHCA_ASYNC_EVENT_MASK; +} + +static inline void tavor_set_eq_ci(struct mthca_dev *dev, struct mthca_eq *eq, u32 ci) +{ + /* + * This barrier makes sure that all updates to ownership bits + * done by set_eqe_hw() hit memory before the consumer index + * is updated. set_eq_ci() allows the HCA to possibly write + * more EQ entries, and we want to avoid the exceedingly + * unlikely possibility of the HCA writing an entry and then + * having set_eqe_hw() overwrite the owner field. + */ + wmb(); + mthca_write64(MTHCA_EQ_DB_SET_CI | eq->eqn, ci & (eq->nent - 1), + dev->kar + MTHCA_EQ_DOORBELL, + MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock)); +} + +static inline void arbel_set_eq_ci(struct mthca_dev *dev, struct mthca_eq *eq, u32 ci) +{ + /* See comment in tavor_set_eq_ci() above. */ + wmb(); + __raw_writel((__force u32) cpu_to_be32(ci), + dev->eq_regs.arbel.eq_set_ci_base + eq->eqn * 8); + /* We still want ordering, just not swabbing, so add a barrier */ + mb(); +} + +static inline void set_eq_ci(struct mthca_dev *dev, struct mthca_eq *eq, u32 ci) +{ + if (mthca_is_memfree(dev)) + arbel_set_eq_ci(dev, eq, ci); + else + tavor_set_eq_ci(dev, eq, ci); +} + +static inline void tavor_eq_req_not(struct mthca_dev *dev, int eqn) +{ + mthca_write64(MTHCA_EQ_DB_REQ_NOT | eqn, 0, + dev->kar + MTHCA_EQ_DOORBELL, + MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock)); +} + +static inline void arbel_eq_req_not(struct mthca_dev *dev, u32 eqn_mask) +{ + writel(eqn_mask, dev->eq_regs.arbel.eq_arm); +} + +static inline void disarm_cq(struct mthca_dev *dev, int eqn, int cqn) +{ + if (!mthca_is_memfree(dev)) { + mthca_write64(MTHCA_EQ_DB_DISARM_CQ | eqn, cqn, + dev->kar + MTHCA_EQ_DOORBELL, + MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock)); + } +} + +static inline struct mthca_eqe *get_eqe(struct mthca_eq *eq, u32 entry) +{ + unsigned long off = (entry & (eq->nent - 1)) * MTHCA_EQ_ENTRY_SIZE; + return eq->page_list[off / PAGE_SIZE].buf + off % PAGE_SIZE; +} + +static inline struct mthca_eqe *next_eqe_sw(struct mthca_eq *eq) +{ + struct mthca_eqe *eqe; + eqe = get_eqe(eq, eq->cons_index); + return (MTHCA_EQ_ENTRY_OWNER_HW & eqe->owner) ? NULL : eqe; +} + +static inline void set_eqe_hw(struct mthca_eqe *eqe) +{ + eqe->owner = MTHCA_EQ_ENTRY_OWNER_HW; +} + +static void port_change(struct mthca_dev *dev, int port, int active) +{ + struct ib_event record; + + mthca_dbg(dev, "Port change to %s for port %d\n", + active ? "active" : "down", port); + + record.device = &dev->ib_dev; + record.event = active ? IB_EVENT_PORT_ACTIVE : IB_EVENT_PORT_ERR; + record.element.port_num = port; + + ib_dispatch_event(&record); +} + +static int mthca_eq_int(struct mthca_dev *dev, struct mthca_eq *eq) +{ + struct mthca_eqe *eqe; + int disarm_cqn; + int eqes_found = 0; + int set_ci = 0; + + while ((eqe = next_eqe_sw(eq))) { + /* + * Make sure we read EQ entry contents after we've + * checked the ownership bit. + */ + rmb(); + + switch (eqe->type) { + case MTHCA_EVENT_TYPE_COMP: + disarm_cqn = be32_to_cpu(eqe->event.comp.cqn) & 0xffffff; + disarm_cq(dev, eq->eqn, disarm_cqn); + mthca_cq_completion(dev, disarm_cqn); + break; + + case MTHCA_EVENT_TYPE_PATH_MIG: + mthca_qp_event(dev, be32_to_cpu(eqe->event.qp.qpn) & 0xffffff, + IB_EVENT_PATH_MIG); + break; + + case MTHCA_EVENT_TYPE_COMM_EST: + mthca_qp_event(dev, be32_to_cpu(eqe->event.qp.qpn) & 0xffffff, + IB_EVENT_COMM_EST); + break; + + case MTHCA_EVENT_TYPE_SQ_DRAINED: + mthca_qp_event(dev, be32_to_cpu(eqe->event.qp.qpn) & 0xffffff, + IB_EVENT_SQ_DRAINED); + break; + + case MTHCA_EVENT_TYPE_SRQ_QP_LAST_WQE: + mthca_qp_event(dev, be32_to_cpu(eqe->event.qp.qpn) & 0xffffff, + IB_EVENT_QP_LAST_WQE_REACHED); + break; + + case MTHCA_EVENT_TYPE_SRQ_LIMIT: + mthca_srq_event(dev, be32_to_cpu(eqe->event.srq.srqn) & 0xffffff, + IB_EVENT_SRQ_LIMIT_REACHED); + break; + + case MTHCA_EVENT_TYPE_WQ_CATAS_ERROR: + mthca_qp_event(dev, be32_to_cpu(eqe->event.qp.qpn) & 0xffffff, + IB_EVENT_QP_FATAL); + break; + + case MTHCA_EVENT_TYPE_PATH_MIG_FAILED: + mthca_qp_event(dev, be32_to_cpu(eqe->event.qp.qpn) & 0xffffff, + IB_EVENT_PATH_MIG_ERR); + break; + + case MTHCA_EVENT_TYPE_WQ_INVAL_REQ_ERROR: + mthca_qp_event(dev, be32_to_cpu(eqe->event.qp.qpn) & 0xffffff, + IB_EVENT_QP_REQ_ERR); + break; + + case MTHCA_EVENT_TYPE_WQ_ACCESS_ERROR: + mthca_qp_event(dev, be32_to_cpu(eqe->event.qp.qpn) & 0xffffff, + IB_EVENT_QP_ACCESS_ERR); + break; + + case MTHCA_EVENT_TYPE_CMD: + mthca_cmd_event(dev, + be16_to_cpu(eqe->event.cmd.token), + eqe->event.cmd.status, + be64_to_cpu(eqe->event.cmd.out_param)); + break; + + case MTHCA_EVENT_TYPE_PORT_CHANGE: + port_change(dev, + (be32_to_cpu(eqe->event.port_change.port) >> 28) & 3, + eqe->subtype == 0x4); + break; + + case MTHCA_EVENT_TYPE_CQ_ERROR: + mthca_warn(dev, "CQ %s on CQN %06x\n", + eqe->event.cq_err.syndrome == 1 ? + "overrun" : "access violation", + be32_to_cpu(eqe->event.cq_err.cqn) & 0xffffff); + mthca_cq_event(dev, be32_to_cpu(eqe->event.cq_err.cqn), + IB_EVENT_CQ_ERR); + break; + + case MTHCA_EVENT_TYPE_EQ_OVERFLOW: + mthca_warn(dev, "EQ overrun on EQN %d\n", eq->eqn); + break; + + case MTHCA_EVENT_TYPE_EEC_CATAS_ERROR: + case MTHCA_EVENT_TYPE_SRQ_CATAS_ERROR: + case MTHCA_EVENT_TYPE_LOCAL_CATAS_ERROR: + case MTHCA_EVENT_TYPE_ECC_DETECT: + default: + mthca_warn(dev, "Unhandled event %02x(%02x) on EQ %d\n", + eqe->type, eqe->subtype, eq->eqn); + break; + } + + set_eqe_hw(eqe); + ++eq->cons_index; + eqes_found = 1; + ++set_ci; + + /* + * The HCA will think the queue has overflowed if we + * don't tell it we've been processing events. We + * create our EQs with MTHCA_NUM_SPARE_EQE extra + * entries, so we must update our consumer index at + * least that often. + */ + if (unlikely(set_ci >= MTHCA_NUM_SPARE_EQE)) { + /* + * Conditional on hca_type is OK here because + * this is a rare case, not the fast path. + */ + set_eq_ci(dev, eq, eq->cons_index); + set_ci = 0; + } + } + + /* + * Rely on caller to set consumer index so that we don't have + * to test hca_type in our interrupt handling fast path. + */ + return eqes_found; +} + +static irqreturn_t mthca_tavor_interrupt(int irq, void *dev_ptr) +{ + struct mthca_dev *dev = dev_ptr; + u32 ecr; + int i; + + if (dev->eq_table.clr_mask) + writel(dev->eq_table.clr_mask, dev->eq_table.clr_int); + + ecr = readl(dev->eq_regs.tavor.ecr_base + 4); + if (!ecr) + return IRQ_NONE; + + writel(ecr, dev->eq_regs.tavor.ecr_base + + MTHCA_ECR_CLR_BASE - MTHCA_ECR_BASE + 4); + + for (i = 0; i < MTHCA_NUM_EQ; ++i) + if (ecr & dev->eq_table.eq[i].eqn_mask) { + if (mthca_eq_int(dev, &dev->eq_table.eq[i])) + tavor_set_eq_ci(dev, &dev->eq_table.eq[i], + dev->eq_table.eq[i].cons_index); + tavor_eq_req_not(dev, dev->eq_table.eq[i].eqn); + } + + return IRQ_HANDLED; +} + +static irqreturn_t mthca_tavor_msi_x_interrupt(int irq, void *eq_ptr) +{ + struct mthca_eq *eq = eq_ptr; + struct mthca_dev *dev = eq->dev; + + mthca_eq_int(dev, eq); + tavor_set_eq_ci(dev, eq, eq->cons_index); + tavor_eq_req_not(dev, eq->eqn); + + /* MSI-X vectors always belong to us */ + return IRQ_HANDLED; +} + +static irqreturn_t mthca_arbel_interrupt(int irq, void *dev_ptr) +{ + struct mthca_dev *dev = dev_ptr; + int work = 0; + int i; + + if (dev->eq_table.clr_mask) + writel(dev->eq_table.clr_mask, dev->eq_table.clr_int); + + for (i = 0; i < MTHCA_NUM_EQ; ++i) + if (mthca_eq_int(dev, &dev->eq_table.eq[i])) { + work = 1; + arbel_set_eq_ci(dev, &dev->eq_table.eq[i], + dev->eq_table.eq[i].cons_index); + } + + arbel_eq_req_not(dev, dev->eq_table.arm_mask); + + return IRQ_RETVAL(work); +} + +static irqreturn_t mthca_arbel_msi_x_interrupt(int irq, void *eq_ptr) +{ + struct mthca_eq *eq = eq_ptr; + struct mthca_dev *dev = eq->dev; + + mthca_eq_int(dev, eq); + arbel_set_eq_ci(dev, eq, eq->cons_index); + arbel_eq_req_not(dev, eq->eqn_mask); + + /* MSI-X vectors always belong to us */ + return IRQ_HANDLED; +} + +static int mthca_create_eq(struct mthca_dev *dev, + int nent, + u8 intr, + struct mthca_eq *eq) +{ + int npages; + u64 *dma_list = NULL; + dma_addr_t t; + struct mthca_mailbox *mailbox; + struct mthca_eq_context *eq_context; + int err = -ENOMEM; + int i; + + eq->dev = dev; + eq->nent = roundup_pow_of_two(max(nent, 2)); + npages = ALIGN(eq->nent * MTHCA_EQ_ENTRY_SIZE, PAGE_SIZE) / PAGE_SIZE; + + eq->page_list = kmalloc(npages * sizeof *eq->page_list, + GFP_KERNEL); + if (!eq->page_list) + goto err_out; + + for (i = 0; i < npages; ++i) + eq->page_list[i].buf = NULL; + + dma_list = kmalloc(npages * sizeof *dma_list, GFP_KERNEL); + if (!dma_list) + goto err_out_free; + + mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL); + if (IS_ERR(mailbox)) + goto err_out_free; + eq_context = mailbox->buf; + + for (i = 0; i < npages; ++i) { + eq->page_list[i].buf = dma_alloc_coherent(&dev->pdev->dev, + PAGE_SIZE, &t, GFP_KERNEL); + if (!eq->page_list[i].buf) + goto err_out_free_pages; + + dma_list[i] = t; + dma_unmap_addr_set(&eq->page_list[i], mapping, t); + + clear_page(eq->page_list[i].buf); + } + + for (i = 0; i < eq->nent; ++i) + set_eqe_hw(get_eqe(eq, i)); + + eq->eqn = mthca_alloc(&dev->eq_table.alloc); + if (eq->eqn == -1) + goto err_out_free_pages; + + err = mthca_mr_alloc_phys(dev, dev->driver_pd.pd_num, + dma_list, PAGE_SHIFT, npages, + 0, npages * PAGE_SIZE, + MTHCA_MPT_FLAG_LOCAL_WRITE | + MTHCA_MPT_FLAG_LOCAL_READ, + &eq->mr); + if (err) + goto err_out_free_eq; + + memset(eq_context, 0, sizeof *eq_context); + eq_context->flags = cpu_to_be32(MTHCA_EQ_STATUS_OK | + MTHCA_EQ_OWNER_HW | + MTHCA_EQ_STATE_ARMED | + MTHCA_EQ_FLAG_TR); + if (mthca_is_memfree(dev)) + eq_context->flags |= cpu_to_be32(MTHCA_EQ_STATE_ARBEL); + + eq_context->logsize_usrpage = cpu_to_be32((ffs(eq->nent) - 1) << 24); + if (mthca_is_memfree(dev)) { + eq_context->arbel_pd = cpu_to_be32(dev->driver_pd.pd_num); + } else { + eq_context->logsize_usrpage |= cpu_to_be32(dev->driver_uar.index); + eq_context->tavor_pd = cpu_to_be32(dev->driver_pd.pd_num); + } + eq_context->intr = intr; + eq_context->lkey = cpu_to_be32(eq->mr.ibmr.lkey); + + err = mthca_SW2HW_EQ(dev, mailbox, eq->eqn); + if (err) { + mthca_warn(dev, "SW2HW_EQ returned %d\n", err); + goto err_out_free_mr; + } + + kfree(dma_list); + mthca_free_mailbox(dev, mailbox); + + eq->eqn_mask = swab32(1 << eq->eqn); + eq->cons_index = 0; + + dev->eq_table.arm_mask |= eq->eqn_mask; + + mthca_dbg(dev, "Allocated EQ %d with %d entries\n", + eq->eqn, eq->nent); + + return err; + + err_out_free_mr: + mthca_free_mr(dev, &eq->mr); + + err_out_free_eq: + mthca_free(&dev->eq_table.alloc, eq->eqn); + + err_out_free_pages: + for (i = 0; i < npages; ++i) + if (eq->page_list[i].buf) + dma_free_coherent(&dev->pdev->dev, PAGE_SIZE, + eq->page_list[i].buf, + dma_unmap_addr(&eq->page_list[i], + mapping)); + + mthca_free_mailbox(dev, mailbox); + + err_out_free: + kfree(eq->page_list); + kfree(dma_list); + + err_out: + return err; +} + +static void mthca_free_eq(struct mthca_dev *dev, + struct mthca_eq *eq) +{ + struct mthca_mailbox *mailbox; + int err; + int npages = (eq->nent * MTHCA_EQ_ENTRY_SIZE + PAGE_SIZE - 1) / + PAGE_SIZE; + int i; + + mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL); + if (IS_ERR(mailbox)) + return; + + err = mthca_HW2SW_EQ(dev, mailbox, eq->eqn); + if (err) + mthca_warn(dev, "HW2SW_EQ returned %d\n", err); + + dev->eq_table.arm_mask &= ~eq->eqn_mask; + + if (0) { + mthca_dbg(dev, "Dumping EQ context %02x:\n", eq->eqn); + for (i = 0; i < sizeof (struct mthca_eq_context) / 4; ++i) { + if (i % 4 == 0) + printk("[%02x] ", i * 4); + printk(" %08x", be32_to_cpup(mailbox->buf + i * 4)); + if ((i + 1) % 4 == 0) + printk("\n"); + } + } + + mthca_free_mr(dev, &eq->mr); + for (i = 0; i < npages; ++i) + pci_free_consistent(dev->pdev, PAGE_SIZE, + eq->page_list[i].buf, + dma_unmap_addr(&eq->page_list[i], mapping)); + + kfree(eq->page_list); + mthca_free_mailbox(dev, mailbox); +} + +static void mthca_free_irqs(struct mthca_dev *dev) +{ + int i; + + if (dev->eq_table.have_irq) + free_irq(dev->pdev->irq, dev); + for (i = 0; i < MTHCA_NUM_EQ; ++i) + if (dev->eq_table.eq[i].have_irq) { + free_irq(dev->eq_table.eq[i].msi_x_vector, + dev->eq_table.eq + i); + dev->eq_table.eq[i].have_irq = 0; + } +} + +static int mthca_map_reg(struct mthca_dev *dev, + unsigned long offset, unsigned long size, + void __iomem **map) +{ + phys_addr_t base = pci_resource_start(dev->pdev, 0); + + *map = ioremap(base + offset, size); + if (!*map) + return -ENOMEM; + + return 0; +} + +static int mthca_map_eq_regs(struct mthca_dev *dev) +{ + if (mthca_is_memfree(dev)) { + /* + * We assume that the EQ arm and EQ set CI registers + * fall within the first BAR. We can't trust the + * values firmware gives us, since those addresses are + * valid on the HCA's side of the PCI bus but not + * necessarily the host side. + */ + if (mthca_map_reg(dev, (pci_resource_len(dev->pdev, 0) - 1) & + dev->fw.arbel.clr_int_base, MTHCA_CLR_INT_SIZE, + &dev->clr_base)) { + mthca_err(dev, "Couldn't map interrupt clear register, " + "aborting.\n"); + return -ENOMEM; + } + + /* + * Add 4 because we limit ourselves to EQs 0 ... 31, + * so we only need the low word of the register. + */ + if (mthca_map_reg(dev, ((pci_resource_len(dev->pdev, 0) - 1) & + dev->fw.arbel.eq_arm_base) + 4, 4, + &dev->eq_regs.arbel.eq_arm)) { + mthca_err(dev, "Couldn't map EQ arm register, aborting.\n"); + iounmap(dev->clr_base); + return -ENOMEM; + } + + if (mthca_map_reg(dev, (pci_resource_len(dev->pdev, 0) - 1) & + dev->fw.arbel.eq_set_ci_base, + MTHCA_EQ_SET_CI_SIZE, + &dev->eq_regs.arbel.eq_set_ci_base)) { + mthca_err(dev, "Couldn't map EQ CI register, aborting.\n"); + iounmap(dev->eq_regs.arbel.eq_arm); + iounmap(dev->clr_base); + return -ENOMEM; + } + } else { + if (mthca_map_reg(dev, MTHCA_CLR_INT_BASE, MTHCA_CLR_INT_SIZE, + &dev->clr_base)) { + mthca_err(dev, "Couldn't map interrupt clear register, " + "aborting.\n"); + return -ENOMEM; + } + + if (mthca_map_reg(dev, MTHCA_ECR_BASE, + MTHCA_ECR_SIZE + MTHCA_ECR_CLR_SIZE, + &dev->eq_regs.tavor.ecr_base)) { + mthca_err(dev, "Couldn't map ecr register, " + "aborting.\n"); + iounmap(dev->clr_base); + return -ENOMEM; + } + } + + return 0; + +} + +static void mthca_unmap_eq_regs(struct mthca_dev *dev) +{ + if (mthca_is_memfree(dev)) { + iounmap(dev->eq_regs.arbel.eq_set_ci_base); + iounmap(dev->eq_regs.arbel.eq_arm); + iounmap(dev->clr_base); + } else { + iounmap(dev->eq_regs.tavor.ecr_base); + iounmap(dev->clr_base); + } +} + +int mthca_map_eq_icm(struct mthca_dev *dev, u64 icm_virt) +{ + int ret; + + /* + * We assume that mapping one page is enough for the whole EQ + * context table. This is fine with all current HCAs, because + * we only use 32 EQs and each EQ uses 32 bytes of context + * memory, or 1 KB total. + */ + dev->eq_table.icm_virt = icm_virt; + dev->eq_table.icm_page = alloc_page(GFP_HIGHUSER); + if (!dev->eq_table.icm_page) + return -ENOMEM; + dev->eq_table.icm_dma = pci_map_page(dev->pdev, dev->eq_table.icm_page, 0, + PAGE_SIZE, PCI_DMA_BIDIRECTIONAL); + if (pci_dma_mapping_error(dev->pdev, dev->eq_table.icm_dma)) { + __free_page(dev->eq_table.icm_page); + return -ENOMEM; + } + + ret = mthca_MAP_ICM_page(dev, dev->eq_table.icm_dma, icm_virt); + if (ret) { + pci_unmap_page(dev->pdev, dev->eq_table.icm_dma, PAGE_SIZE, + PCI_DMA_BIDIRECTIONAL); + __free_page(dev->eq_table.icm_page); + } + + return ret; +} + +void mthca_unmap_eq_icm(struct mthca_dev *dev) +{ + mthca_UNMAP_ICM(dev, dev->eq_table.icm_virt, 1); + pci_unmap_page(dev->pdev, dev->eq_table.icm_dma, PAGE_SIZE, + PCI_DMA_BIDIRECTIONAL); + __free_page(dev->eq_table.icm_page); +} + +int mthca_init_eq_table(struct mthca_dev *dev) +{ + int err; + u8 intr; + int i; + + err = mthca_alloc_init(&dev->eq_table.alloc, + dev->limits.num_eqs, + dev->limits.num_eqs - 1, + dev->limits.reserved_eqs); + if (err) + return err; + + err = mthca_map_eq_regs(dev); + if (err) + goto err_out_free; + + if (dev->mthca_flags & MTHCA_FLAG_MSI_X) { + dev->eq_table.clr_mask = 0; + } else { + dev->eq_table.clr_mask = + swab32(1 << (dev->eq_table.inta_pin & 31)); + dev->eq_table.clr_int = dev->clr_base + + (dev->eq_table.inta_pin < 32 ? 4 : 0); + } + + dev->eq_table.arm_mask = 0; + + intr = dev->eq_table.inta_pin; + + err = mthca_create_eq(dev, dev->limits.num_cqs + MTHCA_NUM_SPARE_EQE, + (dev->mthca_flags & MTHCA_FLAG_MSI_X) ? 128 : intr, + &dev->eq_table.eq[MTHCA_EQ_COMP]); + if (err) + goto err_out_unmap; + + err = mthca_create_eq(dev, MTHCA_NUM_ASYNC_EQE + MTHCA_NUM_SPARE_EQE, + (dev->mthca_flags & MTHCA_FLAG_MSI_X) ? 129 : intr, + &dev->eq_table.eq[MTHCA_EQ_ASYNC]); + if (err) + goto err_out_comp; + + err = mthca_create_eq(dev, MTHCA_NUM_CMD_EQE + MTHCA_NUM_SPARE_EQE, + (dev->mthca_flags & MTHCA_FLAG_MSI_X) ? 130 : intr, + &dev->eq_table.eq[MTHCA_EQ_CMD]); + if (err) + goto err_out_async; + + if (dev->mthca_flags & MTHCA_FLAG_MSI_X) { + static const char *eq_name[] = { + [MTHCA_EQ_COMP] = DRV_NAME "-comp", + [MTHCA_EQ_ASYNC] = DRV_NAME "-async", + [MTHCA_EQ_CMD] = DRV_NAME "-cmd" + }; + + for (i = 0; i < MTHCA_NUM_EQ; ++i) { + snprintf(dev->eq_table.eq[i].irq_name, + IB_DEVICE_NAME_MAX, + "%s@pci:%s", eq_name[i], + pci_name(dev->pdev)); + err = request_irq(dev->eq_table.eq[i].msi_x_vector, + mthca_is_memfree(dev) ? + mthca_arbel_msi_x_interrupt : + mthca_tavor_msi_x_interrupt, + 0, dev->eq_table.eq[i].irq_name, + dev->eq_table.eq + i); + if (err) + goto err_out_cmd; + dev->eq_table.eq[i].have_irq = 1; + } + } else { + snprintf(dev->eq_table.eq[0].irq_name, IB_DEVICE_NAME_MAX, + DRV_NAME "@pci:%s", pci_name(dev->pdev)); + err = request_irq(dev->pdev->irq, + mthca_is_memfree(dev) ? + mthca_arbel_interrupt : + mthca_tavor_interrupt, + IRQF_SHARED, dev->eq_table.eq[0].irq_name, dev); + if (err) + goto err_out_cmd; + dev->eq_table.have_irq = 1; + } + + err = mthca_MAP_EQ(dev, async_mask(dev), + 0, dev->eq_table.eq[MTHCA_EQ_ASYNC].eqn); + if (err) + mthca_warn(dev, "MAP_EQ for async EQ %d failed (%d)\n", + dev->eq_table.eq[MTHCA_EQ_ASYNC].eqn, err); + + err = mthca_MAP_EQ(dev, MTHCA_CMD_EVENT_MASK, + 0, dev->eq_table.eq[MTHCA_EQ_CMD].eqn); + if (err) + mthca_warn(dev, "MAP_EQ for cmd EQ %d failed (%d)\n", + dev->eq_table.eq[MTHCA_EQ_CMD].eqn, err); + + for (i = 0; i < MTHCA_NUM_EQ; ++i) + if (mthca_is_memfree(dev)) + arbel_eq_req_not(dev, dev->eq_table.eq[i].eqn_mask); + else + tavor_eq_req_not(dev, dev->eq_table.eq[i].eqn); + + return 0; + +err_out_cmd: + mthca_free_irqs(dev); + mthca_free_eq(dev, &dev->eq_table.eq[MTHCA_EQ_CMD]); + +err_out_async: + mthca_free_eq(dev, &dev->eq_table.eq[MTHCA_EQ_ASYNC]); + +err_out_comp: + mthca_free_eq(dev, &dev->eq_table.eq[MTHCA_EQ_COMP]); + +err_out_unmap: + mthca_unmap_eq_regs(dev); + +err_out_free: + mthca_alloc_cleanup(&dev->eq_table.alloc); + return err; +} + +void mthca_cleanup_eq_table(struct mthca_dev *dev) +{ + int i; + + mthca_free_irqs(dev); + + mthca_MAP_EQ(dev, async_mask(dev), + 1, dev->eq_table.eq[MTHCA_EQ_ASYNC].eqn); + mthca_MAP_EQ(dev, MTHCA_CMD_EVENT_MASK, + 1, dev->eq_table.eq[MTHCA_EQ_CMD].eqn); + + for (i = 0; i < MTHCA_NUM_EQ; ++i) + mthca_free_eq(dev, &dev->eq_table.eq[i]); + + mthca_unmap_eq_regs(dev); + + mthca_alloc_cleanup(&dev->eq_table.alloc); +} diff --git a/drivers/infiniband/hw/mthca/mthca_mad.c b/drivers/infiniband/hw/mthca/mthca_mad.c new file mode 100644 index 0000000..8881fa3 --- /dev/null +++ b/drivers/infiniband/hw/mthca/mthca_mad.c @@ -0,0 +1,341 @@ +/* + * Copyright (c) 2004 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies. All rights reserved. + * Copyright (c) 2004 Voltaire, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include + +#include +#include +#include + +#include "mthca_dev.h" +#include "mthca_cmd.h" + +enum { + MTHCA_VENDOR_CLASS1 = 0x9, + MTHCA_VENDOR_CLASS2 = 0xa +}; + +static int mthca_update_rate(struct mthca_dev *dev, u8 port_num) +{ + struct ib_port_attr *tprops = NULL; + int ret; + + tprops = kmalloc(sizeof *tprops, GFP_KERNEL); + if (!tprops) + return -ENOMEM; + + ret = ib_query_port(&dev->ib_dev, port_num, tprops); + if (ret) { + printk(KERN_WARNING "ib_query_port failed (%d) for %s port %d\n", + ret, dev->ib_dev.name, port_num); + goto out; + } + + dev->rate[port_num - 1] = tprops->active_speed * + ib_width_enum_to_int(tprops->active_width); + +out: + kfree(tprops); + return ret; +} + +static void update_sm_ah(struct mthca_dev *dev, + u8 port_num, u16 lid, u8 sl) +{ + struct ib_ah *new_ah; + struct ib_ah_attr ah_attr; + unsigned long flags; + + if (!dev->send_agent[port_num - 1][0]) + return; + + memset(&ah_attr, 0, sizeof ah_attr); + ah_attr.dlid = lid; + ah_attr.sl = sl; + ah_attr.port_num = port_num; + + new_ah = ib_create_ah(dev->send_agent[port_num - 1][0]->qp->pd, + &ah_attr); + if (IS_ERR(new_ah)) + return; + + spin_lock_irqsave(&dev->sm_lock, flags); + if (dev->sm_ah[port_num - 1]) + ib_destroy_ah(dev->sm_ah[port_num - 1]); + dev->sm_ah[port_num - 1] = new_ah; + spin_unlock_irqrestore(&dev->sm_lock, flags); +} + +/* + * Snoop SM MADs for port info and P_Key table sets, so we can + * synthesize LID change and P_Key change events. + */ +static void smp_snoop(struct ib_device *ibdev, + u8 port_num, + struct ib_mad *mad, + u16 prev_lid) +{ + struct ib_event event; + + if ((mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED || + mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) && + mad->mad_hdr.method == IB_MGMT_METHOD_SET) { + if (mad->mad_hdr.attr_id == IB_SMP_ATTR_PORT_INFO) { + struct ib_port_info *pinfo = + (struct ib_port_info *) ((struct ib_smp *) mad)->data; + u16 lid = be16_to_cpu(pinfo->lid); + + mthca_update_rate(to_mdev(ibdev), port_num); + update_sm_ah(to_mdev(ibdev), port_num, + be16_to_cpu(pinfo->sm_lid), + pinfo->neighbormtu_mastersmsl & 0xf); + + event.device = ibdev; + event.element.port_num = port_num; + + if (pinfo->clientrereg_resv_subnetto & 0x80) { + event.event = IB_EVENT_CLIENT_REREGISTER; + ib_dispatch_event(&event); + } + + if (prev_lid != lid) { + event.event = IB_EVENT_LID_CHANGE; + ib_dispatch_event(&event); + } + } + + if (mad->mad_hdr.attr_id == IB_SMP_ATTR_PKEY_TABLE) { + event.device = ibdev; + event.event = IB_EVENT_PKEY_CHANGE; + event.element.port_num = port_num; + ib_dispatch_event(&event); + } + } +} + +static void node_desc_override(struct ib_device *dev, + struct ib_mad *mad) +{ + if ((mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED || + mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) && + mad->mad_hdr.method == IB_MGMT_METHOD_GET_RESP && + mad->mad_hdr.attr_id == IB_SMP_ATTR_NODE_DESC) { + mutex_lock(&to_mdev(dev)->cap_mask_mutex); + memcpy(((struct ib_smp *) mad)->data, dev->node_desc, 64); + mutex_unlock(&to_mdev(dev)->cap_mask_mutex); + } +} + +static void forward_trap(struct mthca_dev *dev, + u8 port_num, + struct ib_mad *mad) +{ + int qpn = mad->mad_hdr.mgmt_class != IB_MGMT_CLASS_SUBN_LID_ROUTED; + struct ib_mad_send_buf *send_buf; + struct ib_mad_agent *agent = dev->send_agent[port_num - 1][qpn]; + int ret; + unsigned long flags; + + if (agent) { + send_buf = ib_create_send_mad(agent, qpn, 0, 0, IB_MGMT_MAD_HDR, + IB_MGMT_MAD_DATA, GFP_ATOMIC); + if (IS_ERR(send_buf)) + return; + /* + * We rely here on the fact that MLX QPs don't use the + * address handle after the send is posted (this is + * wrong following the IB spec strictly, but we know + * it's OK for our devices). + */ + spin_lock_irqsave(&dev->sm_lock, flags); + memcpy(send_buf->mad, mad, sizeof *mad); + if ((send_buf->ah = dev->sm_ah[port_num - 1])) + ret = ib_post_send_mad(send_buf, NULL); + else + ret = -EINVAL; + spin_unlock_irqrestore(&dev->sm_lock, flags); + + if (ret) + ib_free_send_mad(send_buf); + } +} + +int mthca_process_mad(struct ib_device *ibdev, + int mad_flags, + u8 port_num, + struct ib_wc *in_wc, + struct ib_grh *in_grh, + struct ib_mad *in_mad, + struct ib_mad *out_mad) +{ + int err; + u16 slid = in_wc ? in_wc->slid : be16_to_cpu(IB_LID_PERMISSIVE); + u16 prev_lid = 0; + struct ib_port_attr pattr; + + /* Forward locally generated traps to the SM */ + if (in_mad->mad_hdr.method == IB_MGMT_METHOD_TRAP && + slid == 0) { + forward_trap(to_mdev(ibdev), port_num, in_mad); + return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED; + } + + /* + * Only handle SM gets, sets and trap represses for SM class + * + * Only handle PMA and Mellanox vendor-specific class gets and + * sets for other classes. + */ + if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED || + in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) { + if (in_mad->mad_hdr.method != IB_MGMT_METHOD_GET && + in_mad->mad_hdr.method != IB_MGMT_METHOD_SET && + in_mad->mad_hdr.method != IB_MGMT_METHOD_TRAP_REPRESS) + return IB_MAD_RESULT_SUCCESS; + + /* + * Don't process SMInfo queries or vendor-specific + * MADs -- the SMA can't handle them. + */ + if (in_mad->mad_hdr.attr_id == IB_SMP_ATTR_SM_INFO || + ((in_mad->mad_hdr.attr_id & IB_SMP_ATTR_VENDOR_MASK) == + IB_SMP_ATTR_VENDOR_MASK)) + return IB_MAD_RESULT_SUCCESS; + } else if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_PERF_MGMT || + in_mad->mad_hdr.mgmt_class == MTHCA_VENDOR_CLASS1 || + in_mad->mad_hdr.mgmt_class == MTHCA_VENDOR_CLASS2) { + if (in_mad->mad_hdr.method != IB_MGMT_METHOD_GET && + in_mad->mad_hdr.method != IB_MGMT_METHOD_SET) + return IB_MAD_RESULT_SUCCESS; + } else + return IB_MAD_RESULT_SUCCESS; + if ((in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED || + in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) && + in_mad->mad_hdr.method == IB_MGMT_METHOD_SET && + in_mad->mad_hdr.attr_id == IB_SMP_ATTR_PORT_INFO && + !ib_query_port(ibdev, port_num, &pattr)) + prev_lid = pattr.lid; + + err = mthca_MAD_IFC(to_mdev(ibdev), + mad_flags & IB_MAD_IGNORE_MKEY, + mad_flags & IB_MAD_IGNORE_BKEY, + port_num, in_wc, in_grh, in_mad, out_mad); + if (err == -EBADMSG) + return IB_MAD_RESULT_SUCCESS; + else if (err) { + mthca_err(to_mdev(ibdev), "MAD_IFC returned %d\n", err); + return IB_MAD_RESULT_FAILURE; + } + + if (!out_mad->mad_hdr.status) { + smp_snoop(ibdev, port_num, in_mad, prev_lid); + node_desc_override(ibdev, out_mad); + } + + /* set return bit in status of directed route responses */ + if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) + out_mad->mad_hdr.status |= cpu_to_be16(1 << 15); + + if (in_mad->mad_hdr.method == IB_MGMT_METHOD_TRAP_REPRESS) + /* no response for trap repress */ + return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED; + + return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY; +} + +static void send_handler(struct ib_mad_agent *agent, + struct ib_mad_send_wc *mad_send_wc) +{ + ib_free_send_mad(mad_send_wc->send_buf); +} + +int mthca_create_agents(struct mthca_dev *dev) +{ + struct ib_mad_agent *agent; + int p, q; + int ret; + + spin_lock_init(&dev->sm_lock); + + for (p = 0; p < dev->limits.num_ports; ++p) + for (q = 0; q <= 1; ++q) { + agent = ib_register_mad_agent(&dev->ib_dev, p + 1, + q ? IB_QPT_GSI : IB_QPT_SMI, + NULL, 0, send_handler, + NULL, NULL, 0); + if (IS_ERR(agent)) { + ret = PTR_ERR(agent); + goto err; + } + dev->send_agent[p][q] = agent; + } + + + for (p = 1; p <= dev->limits.num_ports; ++p) { + ret = mthca_update_rate(dev, p); + if (ret) { + mthca_err(dev, "Failed to obtain port %d rate." + " aborting.\n", p); + goto err; + } + } + + return 0; + +err: + for (p = 0; p < dev->limits.num_ports; ++p) + for (q = 0; q <= 1; ++q) + if (dev->send_agent[p][q]) + ib_unregister_mad_agent(dev->send_agent[p][q]); + + return ret; +} + +void mthca_free_agents(struct mthca_dev *dev) +{ + struct ib_mad_agent *agent; + int p, q; + + for (p = 0; p < dev->limits.num_ports; ++p) { + for (q = 0; q <= 1; ++q) { + agent = dev->send_agent[p][q]; + dev->send_agent[p][q] = NULL; + ib_unregister_mad_agent(agent); + } + + if (dev->sm_ah[p]) + ib_destroy_ah(dev->sm_ah[p]); + } +} diff --git a/drivers/infiniband/hw/mthca/mthca_main.c b/drivers/infiniband/hw/mthca/mthca_main.c new file mode 100644 index 0000000..ded76c1 --- /dev/null +++ b/drivers/infiniband/hw/mthca/mthca_main.c @@ -0,0 +1,1275 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include + +#include "mthca_dev.h" +#include "mthca_config_reg.h" +#include "mthca_cmd.h" +#include "mthca_profile.h" +#include "mthca_memfree.h" +#include "mthca_wqe.h" + +MODULE_AUTHOR("Roland Dreier"); +MODULE_DESCRIPTION("Mellanox InfiniBand HCA low-level driver"); +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_VERSION(DRV_VERSION); + +#ifdef CONFIG_INFINIBAND_MTHCA_DEBUG + +int mthca_debug_level = 0; +module_param_named(debug_level, mthca_debug_level, int, 0644); +MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0"); + +#endif /* CONFIG_INFINIBAND_MTHCA_DEBUG */ + +#ifdef CONFIG_PCI_MSI + +static int msi_x = 1; +module_param(msi_x, int, 0444); +MODULE_PARM_DESC(msi_x, "attempt to use MSI-X if nonzero"); + +#else /* CONFIG_PCI_MSI */ + +#define msi_x (0) + +#endif /* CONFIG_PCI_MSI */ + +static int tune_pci = 0; +module_param(tune_pci, int, 0444); +MODULE_PARM_DESC(tune_pci, "increase PCI burst from the default set by BIOS if nonzero"); + +DEFINE_MUTEX(mthca_device_mutex); + +#define MTHCA_DEFAULT_NUM_QP (1 << 16) +#define MTHCA_DEFAULT_RDB_PER_QP (1 << 2) +#define MTHCA_DEFAULT_NUM_CQ (1 << 16) +#define MTHCA_DEFAULT_NUM_MCG (1 << 13) +#define MTHCA_DEFAULT_NUM_MPT (1 << 17) +#define MTHCA_DEFAULT_NUM_MTT (1 << 20) +#define MTHCA_DEFAULT_NUM_UDAV (1 << 15) +#define MTHCA_DEFAULT_NUM_RESERVED_MTTS (1 << 18) +#define MTHCA_DEFAULT_NUM_UARC_SIZE (1 << 18) + +static struct mthca_profile hca_profile = { + .num_qp = MTHCA_DEFAULT_NUM_QP, + .rdb_per_qp = MTHCA_DEFAULT_RDB_PER_QP, + .num_cq = MTHCA_DEFAULT_NUM_CQ, + .num_mcg = MTHCA_DEFAULT_NUM_MCG, + .num_mpt = MTHCA_DEFAULT_NUM_MPT, + .num_mtt = MTHCA_DEFAULT_NUM_MTT, + .num_udav = MTHCA_DEFAULT_NUM_UDAV, /* Tavor only */ + .fmr_reserved_mtts = MTHCA_DEFAULT_NUM_RESERVED_MTTS, /* Tavor only */ + .uarc_size = MTHCA_DEFAULT_NUM_UARC_SIZE, /* Arbel only */ +}; + +module_param_named(num_qp, hca_profile.num_qp, int, 0444); +MODULE_PARM_DESC(num_qp, "maximum number of QPs per HCA"); + +module_param_named(rdb_per_qp, hca_profile.rdb_per_qp, int, 0444); +MODULE_PARM_DESC(rdb_per_qp, "number of RDB buffers per QP"); + +module_param_named(num_cq, hca_profile.num_cq, int, 0444); +MODULE_PARM_DESC(num_cq, "maximum number of CQs per HCA"); + +module_param_named(num_mcg, hca_profile.num_mcg, int, 0444); +MODULE_PARM_DESC(num_mcg, "maximum number of multicast groups per HCA"); + +module_param_named(num_mpt, hca_profile.num_mpt, int, 0444); +MODULE_PARM_DESC(num_mpt, + "maximum number of memory protection table entries per HCA"); + +module_param_named(num_mtt, hca_profile.num_mtt, int, 0444); +MODULE_PARM_DESC(num_mtt, + "maximum number of memory translation table segments per HCA"); + +module_param_named(num_udav, hca_profile.num_udav, int, 0444); +MODULE_PARM_DESC(num_udav, "maximum number of UD address vectors per HCA"); + +module_param_named(fmr_reserved_mtts, hca_profile.fmr_reserved_mtts, int, 0444); +MODULE_PARM_DESC(fmr_reserved_mtts, + "number of memory translation table segments reserved for FMR"); + +static int log_mtts_per_seg = ilog2(MTHCA_MTT_SEG_SIZE / 8); +module_param_named(log_mtts_per_seg, log_mtts_per_seg, int, 0444); +MODULE_PARM_DESC(log_mtts_per_seg, "Log2 number of MTT entries per segment (1-5)"); + +static char mthca_version[] = + DRV_NAME ": Mellanox InfiniBand HCA driver v" + DRV_VERSION " (" DRV_RELDATE ")\n"; + +static int mthca_tune_pci(struct mthca_dev *mdev) +{ + if (!tune_pci) + return 0; + + /* First try to max out Read Byte Count */ + if (pci_find_capability(mdev->pdev, PCI_CAP_ID_PCIX)) { + if (pcix_set_mmrbc(mdev->pdev, pcix_get_max_mmrbc(mdev->pdev))) { + mthca_err(mdev, "Couldn't set PCI-X max read count, " + "aborting.\n"); + return -ENODEV; + } + } else if (!(mdev->mthca_flags & MTHCA_FLAG_PCIE)) + mthca_info(mdev, "No PCI-X capability, not setting RBC.\n"); + + if (pci_is_pcie(mdev->pdev)) { + if (pcie_set_readrq(mdev->pdev, 4096)) { + mthca_err(mdev, "Couldn't write PCI Express read request, " + "aborting.\n"); + return -ENODEV; + } + } else if (mdev->mthca_flags & MTHCA_FLAG_PCIE) + mthca_info(mdev, "No PCI Express capability, " + "not setting Max Read Request Size.\n"); + + return 0; +} + +static int mthca_dev_lim(struct mthca_dev *mdev, struct mthca_dev_lim *dev_lim) +{ + int err; + + mdev->limits.mtt_seg_size = (1 << log_mtts_per_seg) * 8; + err = mthca_QUERY_DEV_LIM(mdev, dev_lim); + if (err) { + mthca_err(mdev, "QUERY_DEV_LIM command returned %d" + ", aborting.\n", err); + return err; + } + if (dev_lim->min_page_sz > PAGE_SIZE) { + mthca_err(mdev, "HCA minimum page size of %d bigger than " + "kernel PAGE_SIZE of %ld, aborting.\n", + dev_lim->min_page_sz, PAGE_SIZE); + return -ENODEV; + } + if (dev_lim->num_ports > MTHCA_MAX_PORTS) { + mthca_err(mdev, "HCA has %d ports, but we only support %d, " + "aborting.\n", + dev_lim->num_ports, MTHCA_MAX_PORTS); + return -ENODEV; + } + + if (dev_lim->uar_size > pci_resource_len(mdev->pdev, 2)) { + mthca_err(mdev, "HCA reported UAR size of 0x%x bigger than " + "PCI resource 2 size of 0x%llx, aborting.\n", + dev_lim->uar_size, + (unsigned long long)pci_resource_len(mdev->pdev, 2)); + return -ENODEV; + } + + mdev->limits.num_ports = dev_lim->num_ports; + mdev->limits.vl_cap = dev_lim->max_vl; + mdev->limits.mtu_cap = dev_lim->max_mtu; + mdev->limits.gid_table_len = dev_lim->max_gids; + mdev->limits.pkey_table_len = dev_lim->max_pkeys; + mdev->limits.local_ca_ack_delay = dev_lim->local_ca_ack_delay; + /* + * Need to allow for worst case send WQE overhead and check + * whether max_desc_sz imposes a lower limit than max_sg; UD + * send has the biggest overhead. + */ + mdev->limits.max_sg = min_t(int, dev_lim->max_sg, + (dev_lim->max_desc_sz - + sizeof (struct mthca_next_seg) - + (mthca_is_memfree(mdev) ? + sizeof (struct mthca_arbel_ud_seg) : + sizeof (struct mthca_tavor_ud_seg))) / + sizeof (struct mthca_data_seg)); + mdev->limits.max_wqes = dev_lim->max_qp_sz; + mdev->limits.max_qp_init_rdma = dev_lim->max_requester_per_qp; + mdev->limits.reserved_qps = dev_lim->reserved_qps; + mdev->limits.max_srq_wqes = dev_lim->max_srq_sz; + mdev->limits.reserved_srqs = dev_lim->reserved_srqs; + mdev->limits.reserved_eecs = dev_lim->reserved_eecs; + mdev->limits.max_desc_sz = dev_lim->max_desc_sz; + mdev->limits.max_srq_sge = mthca_max_srq_sge(mdev); + /* + * Subtract 1 from the limit because we need to allocate a + * spare CQE so the HCA HW can tell the difference between an + * empty CQ and a full CQ. + */ + mdev->limits.max_cqes = dev_lim->max_cq_sz - 1; + mdev->limits.reserved_cqs = dev_lim->reserved_cqs; + mdev->limits.reserved_eqs = dev_lim->reserved_eqs; + mdev->limits.reserved_mtts = dev_lim->reserved_mtts; + mdev->limits.reserved_mrws = dev_lim->reserved_mrws; + mdev->limits.reserved_uars = dev_lim->reserved_uars; + mdev->limits.reserved_pds = dev_lim->reserved_pds; + mdev->limits.port_width_cap = dev_lim->max_port_width; + mdev->limits.page_size_cap = ~(u32) (dev_lim->min_page_sz - 1); + mdev->limits.flags = dev_lim->flags; + /* + * For old FW that doesn't return static rate support, use a + * value of 0x3 (only static rate values of 0 or 1 are handled), + * except on Sinai, where even old FW can handle static rate + * values of 2 and 3. + */ + if (dev_lim->stat_rate_support) + mdev->limits.stat_rate_support = dev_lim->stat_rate_support; + else if (mdev->mthca_flags & MTHCA_FLAG_SINAI_OPT) + mdev->limits.stat_rate_support = 0xf; + else + mdev->limits.stat_rate_support = 0x3; + + /* IB_DEVICE_RESIZE_MAX_WR not supported by driver. + May be doable since hardware supports it for SRQ. + + IB_DEVICE_N_NOTIFY_CQ is supported by hardware but not by driver. + + IB_DEVICE_SRQ_RESIZE is supported by hardware but SRQ is not + supported by driver. */ + mdev->device_cap_flags = IB_DEVICE_CHANGE_PHY_PORT | + IB_DEVICE_PORT_ACTIVE_EVENT | + IB_DEVICE_SYS_IMAGE_GUID | + IB_DEVICE_RC_RNR_NAK_GEN; + + if (dev_lim->flags & DEV_LIM_FLAG_BAD_PKEY_CNTR) + mdev->device_cap_flags |= IB_DEVICE_BAD_PKEY_CNTR; + + if (dev_lim->flags & DEV_LIM_FLAG_BAD_QKEY_CNTR) + mdev->device_cap_flags |= IB_DEVICE_BAD_QKEY_CNTR; + + if (dev_lim->flags & DEV_LIM_FLAG_RAW_MULTI) + mdev->device_cap_flags |= IB_DEVICE_RAW_MULTI; + + if (dev_lim->flags & DEV_LIM_FLAG_AUTO_PATH_MIG) + mdev->device_cap_flags |= IB_DEVICE_AUTO_PATH_MIG; + + if (dev_lim->flags & DEV_LIM_FLAG_UD_AV_PORT_ENFORCE) + mdev->device_cap_flags |= IB_DEVICE_UD_AV_PORT_ENFORCE; + + if (dev_lim->flags & DEV_LIM_FLAG_SRQ) + mdev->mthca_flags |= MTHCA_FLAG_SRQ; + + if (mthca_is_memfree(mdev)) + if (dev_lim->flags & DEV_LIM_FLAG_IPOIB_CSUM) + mdev->device_cap_flags |= IB_DEVICE_UD_IP_CSUM; + + return 0; +} + +static int mthca_init_tavor(struct mthca_dev *mdev) +{ + s64 size; + int err; + struct mthca_dev_lim dev_lim; + struct mthca_profile profile; + struct mthca_init_hca_param init_hca; + + err = mthca_SYS_EN(mdev); + if (err) { + mthca_err(mdev, "SYS_EN command returned %d, aborting.\n", err); + return err; + } + + err = mthca_QUERY_FW(mdev); + if (err) { + mthca_err(mdev, "QUERY_FW command returned %d," + " aborting.\n", err); + goto err_disable; + } + err = mthca_QUERY_DDR(mdev); + if (err) { + mthca_err(mdev, "QUERY_DDR command returned %d, aborting.\n", err); + goto err_disable; + } + + err = mthca_dev_lim(mdev, &dev_lim); + if (err) { + mthca_err(mdev, "QUERY_DEV_LIM command returned %d, aborting.\n", err); + goto err_disable; + } + + profile = hca_profile; + profile.num_uar = dev_lim.uar_size / PAGE_SIZE; + profile.uarc_size = 0; + if (mdev->mthca_flags & MTHCA_FLAG_SRQ) + profile.num_srq = dev_lim.max_srqs; + + size = mthca_make_profile(mdev, &profile, &dev_lim, &init_hca); + if (size < 0) { + err = size; + goto err_disable; + } + + err = mthca_INIT_HCA(mdev, &init_hca); + if (err) { + mthca_err(mdev, "INIT_HCA command returned %d, aborting.\n", err); + goto err_disable; + } + + return 0; + +err_disable: + mthca_SYS_DIS(mdev); + + return err; +} + +static int mthca_load_fw(struct mthca_dev *mdev) +{ + int err; + + /* FIXME: use HCA-attached memory for FW if present */ + + mdev->fw.arbel.fw_icm = + mthca_alloc_icm(mdev, mdev->fw.arbel.fw_pages, + GFP_HIGHUSER | __GFP_NOWARN, 0); + if (!mdev->fw.arbel.fw_icm) { + mthca_err(mdev, "Couldn't allocate FW area, aborting.\n"); + return -ENOMEM; + } + + err = mthca_MAP_FA(mdev, mdev->fw.arbel.fw_icm); + if (err) { + mthca_err(mdev, "MAP_FA command returned %d, aborting.\n", err); + goto err_free; + } + err = mthca_RUN_FW(mdev); + if (err) { + mthca_err(mdev, "RUN_FW command returned %d, aborting.\n", err); + goto err_unmap_fa; + } + + return 0; + +err_unmap_fa: + mthca_UNMAP_FA(mdev); + +err_free: + mthca_free_icm(mdev, mdev->fw.arbel.fw_icm, 0); + return err; +} + +static int mthca_init_icm(struct mthca_dev *mdev, + struct mthca_dev_lim *dev_lim, + struct mthca_init_hca_param *init_hca, + u64 icm_size) +{ + u64 aux_pages; + int err; + + err = mthca_SET_ICM_SIZE(mdev, icm_size, &aux_pages); + if (err) { + mthca_err(mdev, "SET_ICM_SIZE command returned %d, aborting.\n", err); + return err; + } + + mthca_dbg(mdev, "%lld KB of HCA context requires %lld KB aux memory.\n", + (unsigned long long) icm_size >> 10, + (unsigned long long) aux_pages << 2); + + mdev->fw.arbel.aux_icm = mthca_alloc_icm(mdev, aux_pages, + GFP_HIGHUSER | __GFP_NOWARN, 0); + if (!mdev->fw.arbel.aux_icm) { + mthca_err(mdev, "Couldn't allocate aux memory, aborting.\n"); + return -ENOMEM; + } + + err = mthca_MAP_ICM_AUX(mdev, mdev->fw.arbel.aux_icm); + if (err) { + mthca_err(mdev, "MAP_ICM_AUX returned %d, aborting.\n", err); + goto err_free_aux; + } + + err = mthca_map_eq_icm(mdev, init_hca->eqc_base); + if (err) { + mthca_err(mdev, "Failed to map EQ context memory, aborting.\n"); + goto err_unmap_aux; + } + + /* CPU writes to non-reserved MTTs, while HCA might DMA to reserved mtts */ + mdev->limits.reserved_mtts = ALIGN(mdev->limits.reserved_mtts * mdev->limits.mtt_seg_size, + dma_get_cache_alignment()) / mdev->limits.mtt_seg_size; + + mdev->mr_table.mtt_table = mthca_alloc_icm_table(mdev, init_hca->mtt_base, + mdev->limits.mtt_seg_size, + mdev->limits.num_mtt_segs, + mdev->limits.reserved_mtts, + 1, 0); + if (!mdev->mr_table.mtt_table) { + mthca_err(mdev, "Failed to map MTT context memory, aborting.\n"); + err = -ENOMEM; + goto err_unmap_eq; + } + + mdev->mr_table.mpt_table = mthca_alloc_icm_table(mdev, init_hca->mpt_base, + dev_lim->mpt_entry_sz, + mdev->limits.num_mpts, + mdev->limits.reserved_mrws, + 1, 1); + if (!mdev->mr_table.mpt_table) { + mthca_err(mdev, "Failed to map MPT context memory, aborting.\n"); + err = -ENOMEM; + goto err_unmap_mtt; + } + + mdev->qp_table.qp_table = mthca_alloc_icm_table(mdev, init_hca->qpc_base, + dev_lim->qpc_entry_sz, + mdev->limits.num_qps, + mdev->limits.reserved_qps, + 0, 0); + if (!mdev->qp_table.qp_table) { + mthca_err(mdev, "Failed to map QP context memory, aborting.\n"); + err = -ENOMEM; + goto err_unmap_mpt; + } + + mdev->qp_table.eqp_table = mthca_alloc_icm_table(mdev, init_hca->eqpc_base, + dev_lim->eqpc_entry_sz, + mdev->limits.num_qps, + mdev->limits.reserved_qps, + 0, 0); + if (!mdev->qp_table.eqp_table) { + mthca_err(mdev, "Failed to map EQP context memory, aborting.\n"); + err = -ENOMEM; + goto err_unmap_qp; + } + + mdev->qp_table.rdb_table = mthca_alloc_icm_table(mdev, init_hca->rdb_base, + MTHCA_RDB_ENTRY_SIZE, + mdev->limits.num_qps << + mdev->qp_table.rdb_shift, 0, + 0, 0); + if (!mdev->qp_table.rdb_table) { + mthca_err(mdev, "Failed to map RDB context memory, aborting\n"); + err = -ENOMEM; + goto err_unmap_eqp; + } + + mdev->cq_table.table = mthca_alloc_icm_table(mdev, init_hca->cqc_base, + dev_lim->cqc_entry_sz, + mdev->limits.num_cqs, + mdev->limits.reserved_cqs, + 0, 0); + if (!mdev->cq_table.table) { + mthca_err(mdev, "Failed to map CQ context memory, aborting.\n"); + err = -ENOMEM; + goto err_unmap_rdb; + } + + if (mdev->mthca_flags & MTHCA_FLAG_SRQ) { + mdev->srq_table.table = + mthca_alloc_icm_table(mdev, init_hca->srqc_base, + dev_lim->srq_entry_sz, + mdev->limits.num_srqs, + mdev->limits.reserved_srqs, + 0, 0); + if (!mdev->srq_table.table) { + mthca_err(mdev, "Failed to map SRQ context memory, " + "aborting.\n"); + err = -ENOMEM; + goto err_unmap_cq; + } + } + + /* + * It's not strictly required, but for simplicity just map the + * whole multicast group table now. The table isn't very big + * and it's a lot easier than trying to track ref counts. + */ + mdev->mcg_table.table = mthca_alloc_icm_table(mdev, init_hca->mc_base, + MTHCA_MGM_ENTRY_SIZE, + mdev->limits.num_mgms + + mdev->limits.num_amgms, + mdev->limits.num_mgms + + mdev->limits.num_amgms, + 0, 0); + if (!mdev->mcg_table.table) { + mthca_err(mdev, "Failed to map MCG context memory, aborting.\n"); + err = -ENOMEM; + goto err_unmap_srq; + } + + return 0; + +err_unmap_srq: + if (mdev->mthca_flags & MTHCA_FLAG_SRQ) + mthca_free_icm_table(mdev, mdev->srq_table.table); + +err_unmap_cq: + mthca_free_icm_table(mdev, mdev->cq_table.table); + +err_unmap_rdb: + mthca_free_icm_table(mdev, mdev->qp_table.rdb_table); + +err_unmap_eqp: + mthca_free_icm_table(mdev, mdev->qp_table.eqp_table); + +err_unmap_qp: + mthca_free_icm_table(mdev, mdev->qp_table.qp_table); + +err_unmap_mpt: + mthca_free_icm_table(mdev, mdev->mr_table.mpt_table); + +err_unmap_mtt: + mthca_free_icm_table(mdev, mdev->mr_table.mtt_table); + +err_unmap_eq: + mthca_unmap_eq_icm(mdev); + +err_unmap_aux: + mthca_UNMAP_ICM_AUX(mdev); + +err_free_aux: + mthca_free_icm(mdev, mdev->fw.arbel.aux_icm, 0); + + return err; +} + +static void mthca_free_icms(struct mthca_dev *mdev) +{ + + mthca_free_icm_table(mdev, mdev->mcg_table.table); + if (mdev->mthca_flags & MTHCA_FLAG_SRQ) + mthca_free_icm_table(mdev, mdev->srq_table.table); + mthca_free_icm_table(mdev, mdev->cq_table.table); + mthca_free_icm_table(mdev, mdev->qp_table.rdb_table); + mthca_free_icm_table(mdev, mdev->qp_table.eqp_table); + mthca_free_icm_table(mdev, mdev->qp_table.qp_table); + mthca_free_icm_table(mdev, mdev->mr_table.mpt_table); + mthca_free_icm_table(mdev, mdev->mr_table.mtt_table); + mthca_unmap_eq_icm(mdev); + + mthca_UNMAP_ICM_AUX(mdev); + mthca_free_icm(mdev, mdev->fw.arbel.aux_icm, 0); +} + +static int mthca_init_arbel(struct mthca_dev *mdev) +{ + struct mthca_dev_lim dev_lim; + struct mthca_profile profile; + struct mthca_init_hca_param init_hca; + s64 icm_size; + int err; + + err = mthca_QUERY_FW(mdev); + if (err) { + mthca_err(mdev, "QUERY_FW command failed %d, aborting.\n", err); + return err; + } + + err = mthca_ENABLE_LAM(mdev); + if (err == -EAGAIN) { + mthca_dbg(mdev, "No HCA-attached memory (running in MemFree mode)\n"); + mdev->mthca_flags |= MTHCA_FLAG_NO_LAM; + } else if (err) { + mthca_err(mdev, "ENABLE_LAM returned %d, aborting.\n", err); + return err; + } + + err = mthca_load_fw(mdev); + if (err) { + mthca_err(mdev, "Loading FW returned %d, aborting.\n", err); + goto err_disable; + } + + err = mthca_dev_lim(mdev, &dev_lim); + if (err) { + mthca_err(mdev, "QUERY_DEV_LIM returned %d, aborting.\n", err); + goto err_stop_fw; + } + + profile = hca_profile; + profile.num_uar = dev_lim.uar_size / PAGE_SIZE; + profile.num_udav = 0; + if (mdev->mthca_flags & MTHCA_FLAG_SRQ) + profile.num_srq = dev_lim.max_srqs; + + icm_size = mthca_make_profile(mdev, &profile, &dev_lim, &init_hca); + if (icm_size < 0) { + err = icm_size; + goto err_stop_fw; + } + + err = mthca_init_icm(mdev, &dev_lim, &init_hca, icm_size); + if (err) + goto err_stop_fw; + + err = mthca_INIT_HCA(mdev, &init_hca); + if (err) { + mthca_err(mdev, "INIT_HCA command returned %d, aborting.\n", err); + goto err_free_icm; + } + + return 0; + +err_free_icm: + mthca_free_icms(mdev); + +err_stop_fw: + mthca_UNMAP_FA(mdev); + mthca_free_icm(mdev, mdev->fw.arbel.fw_icm, 0); + +err_disable: + if (!(mdev->mthca_flags & MTHCA_FLAG_NO_LAM)) + mthca_DISABLE_LAM(mdev); + + return err; +} + +static void mthca_close_hca(struct mthca_dev *mdev) +{ + mthca_CLOSE_HCA(mdev, 0); + + if (mthca_is_memfree(mdev)) { + mthca_free_icms(mdev); + + mthca_UNMAP_FA(mdev); + mthca_free_icm(mdev, mdev->fw.arbel.fw_icm, 0); + + if (!(mdev->mthca_flags & MTHCA_FLAG_NO_LAM)) + mthca_DISABLE_LAM(mdev); + } else + mthca_SYS_DIS(mdev); +} + +static int mthca_init_hca(struct mthca_dev *mdev) +{ + int err; + struct mthca_adapter adapter; + + if (mthca_is_memfree(mdev)) + err = mthca_init_arbel(mdev); + else + err = mthca_init_tavor(mdev); + + if (err) + return err; + + err = mthca_QUERY_ADAPTER(mdev, &adapter); + if (err) { + mthca_err(mdev, "QUERY_ADAPTER command returned %d, aborting.\n", err); + goto err_close; + } + + mdev->eq_table.inta_pin = adapter.inta_pin; + if (!mthca_is_memfree(mdev)) + mdev->rev_id = adapter.revision_id; + memcpy(mdev->board_id, adapter.board_id, sizeof mdev->board_id); + + return 0; + +err_close: + mthca_close_hca(mdev); + return err; +} + +static int mthca_setup_hca(struct mthca_dev *dev) +{ + int err; + + MTHCA_INIT_DOORBELL_LOCK(&dev->doorbell_lock); + + err = mthca_init_uar_table(dev); + if (err) { + mthca_err(dev, "Failed to initialize " + "user access region table, aborting.\n"); + return err; + } + + err = mthca_uar_alloc(dev, &dev->driver_uar); + if (err) { + mthca_err(dev, "Failed to allocate driver access region, " + "aborting.\n"); + goto err_uar_table_free; + } + + dev->kar = ioremap((phys_addr_t) dev->driver_uar.pfn << PAGE_SHIFT, PAGE_SIZE); + if (!dev->kar) { + mthca_err(dev, "Couldn't map kernel access region, " + "aborting.\n"); + err = -ENOMEM; + goto err_uar_free; + } + + err = mthca_init_pd_table(dev); + if (err) { + mthca_err(dev, "Failed to initialize " + "protection domain table, aborting.\n"); + goto err_kar_unmap; + } + + err = mthca_init_mr_table(dev); + if (err) { + mthca_err(dev, "Failed to initialize " + "memory region table, aborting.\n"); + goto err_pd_table_free; + } + + err = mthca_pd_alloc(dev, 1, &dev->driver_pd); + if (err) { + mthca_err(dev, "Failed to create driver PD, " + "aborting.\n"); + goto err_mr_table_free; + } + + err = mthca_init_eq_table(dev); + if (err) { + mthca_err(dev, "Failed to initialize " + "event queue table, aborting.\n"); + goto err_pd_free; + } + + err = mthca_cmd_use_events(dev); + if (err) { + mthca_err(dev, "Failed to switch to event-driven " + "firmware commands, aborting.\n"); + goto err_eq_table_free; + } + + err = mthca_NOP(dev); + if (err) { + if (dev->mthca_flags & MTHCA_FLAG_MSI_X) { + mthca_warn(dev, "NOP command failed to generate interrupt " + "(IRQ %d).\n", + dev->eq_table.eq[MTHCA_EQ_CMD].msi_x_vector); + mthca_warn(dev, "Trying again with MSI-X disabled.\n"); + } else { + mthca_err(dev, "NOP command failed to generate interrupt " + "(IRQ %d), aborting.\n", + dev->pdev->irq); + mthca_err(dev, "BIOS or ACPI interrupt routing problem?\n"); + } + + goto err_cmd_poll; + } + + mthca_dbg(dev, "NOP command IRQ test passed\n"); + + err = mthca_init_cq_table(dev); + if (err) { + mthca_err(dev, "Failed to initialize " + "completion queue table, aborting.\n"); + goto err_cmd_poll; + } + + err = mthca_init_srq_table(dev); + if (err) { + mthca_err(dev, "Failed to initialize " + "shared receive queue table, aborting.\n"); + goto err_cq_table_free; + } + + err = mthca_init_qp_table(dev); + if (err) { + mthca_err(dev, "Failed to initialize " + "queue pair table, aborting.\n"); + goto err_srq_table_free; + } + + err = mthca_init_av_table(dev); + if (err) { + mthca_err(dev, "Failed to initialize " + "address vector table, aborting.\n"); + goto err_qp_table_free; + } + + err = mthca_init_mcg_table(dev); + if (err) { + mthca_err(dev, "Failed to initialize " + "multicast group table, aborting.\n"); + goto err_av_table_free; + } + + return 0; + +err_av_table_free: + mthca_cleanup_av_table(dev); + +err_qp_table_free: + mthca_cleanup_qp_table(dev); + +err_srq_table_free: + mthca_cleanup_srq_table(dev); + +err_cq_table_free: + mthca_cleanup_cq_table(dev); + +err_cmd_poll: + mthca_cmd_use_polling(dev); + +err_eq_table_free: + mthca_cleanup_eq_table(dev); + +err_pd_free: + mthca_pd_free(dev, &dev->driver_pd); + +err_mr_table_free: + mthca_cleanup_mr_table(dev); + +err_pd_table_free: + mthca_cleanup_pd_table(dev); + +err_kar_unmap: + iounmap(dev->kar); + +err_uar_free: + mthca_uar_free(dev, &dev->driver_uar); + +err_uar_table_free: + mthca_cleanup_uar_table(dev); + return err; +} + +static int mthca_enable_msi_x(struct mthca_dev *mdev) +{ + struct msix_entry entries[3]; + int err; + + entries[0].entry = 0; + entries[1].entry = 1; + entries[2].entry = 2; + + err = pci_enable_msix_exact(mdev->pdev, entries, ARRAY_SIZE(entries)); + if (err) + return err; + + mdev->eq_table.eq[MTHCA_EQ_COMP ].msi_x_vector = entries[0].vector; + mdev->eq_table.eq[MTHCA_EQ_ASYNC].msi_x_vector = entries[1].vector; + mdev->eq_table.eq[MTHCA_EQ_CMD ].msi_x_vector = entries[2].vector; + + return 0; +} + +/* Types of supported HCA */ +enum { + TAVOR, /* MT23108 */ + ARBEL_COMPAT, /* MT25208 in Tavor compat mode */ + ARBEL_NATIVE, /* MT25208 with extended features */ + SINAI /* MT25204 */ +}; + +#define MTHCA_FW_VER(major, minor, subminor) \ + (((u64) (major) << 32) | ((u64) (minor) << 16) | (u64) (subminor)) + +static struct { + u64 latest_fw; + u32 flags; +} mthca_hca_table[] = { + [TAVOR] = { .latest_fw = MTHCA_FW_VER(3, 5, 0), + .flags = 0 }, + [ARBEL_COMPAT] = { .latest_fw = MTHCA_FW_VER(4, 8, 200), + .flags = MTHCA_FLAG_PCIE }, + [ARBEL_NATIVE] = { .latest_fw = MTHCA_FW_VER(5, 3, 0), + .flags = MTHCA_FLAG_MEMFREE | + MTHCA_FLAG_PCIE }, + [SINAI] = { .latest_fw = MTHCA_FW_VER(1, 2, 0), + .flags = MTHCA_FLAG_MEMFREE | + MTHCA_FLAG_PCIE | + MTHCA_FLAG_SINAI_OPT } +}; + +static int __mthca_init_one(struct pci_dev *pdev, int hca_type) +{ + int ddr_hidden = 0; + int err; + struct mthca_dev *mdev; + + printk(KERN_INFO PFX "Initializing %s\n", + pci_name(pdev)); + + err = pci_enable_device(pdev); + if (err) { + dev_err(&pdev->dev, "Cannot enable PCI device, " + "aborting.\n"); + return err; + } + + /* + * Check for BARs. We expect 0: 1MB, 2: 8MB, 4: DDR (may not + * be present) + */ + if (!(pci_resource_flags(pdev, 0) & IORESOURCE_MEM) || + pci_resource_len(pdev, 0) != 1 << 20) { + dev_err(&pdev->dev, "Missing DCS, aborting.\n"); + err = -ENODEV; + goto err_disable_pdev; + } + if (!(pci_resource_flags(pdev, 2) & IORESOURCE_MEM)) { + dev_err(&pdev->dev, "Missing UAR, aborting.\n"); + err = -ENODEV; + goto err_disable_pdev; + } + if (!(pci_resource_flags(pdev, 4) & IORESOURCE_MEM)) + ddr_hidden = 1; + + err = pci_request_regions(pdev, DRV_NAME); + if (err) { + dev_err(&pdev->dev, "Cannot obtain PCI resources, " + "aborting.\n"); + goto err_disable_pdev; + } + + pci_set_master(pdev); + + err = pci_set_dma_mask(pdev, DMA_BIT_MASK(64)); + if (err) { + dev_warn(&pdev->dev, "Warning: couldn't set 64-bit PCI DMA mask.\n"); + err = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); + if (err) { + dev_err(&pdev->dev, "Can't set PCI DMA mask, aborting.\n"); + goto err_free_res; + } + } + err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64)); + if (err) { + dev_warn(&pdev->dev, "Warning: couldn't set 64-bit " + "consistent PCI DMA mask.\n"); + err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32)); + if (err) { + dev_err(&pdev->dev, "Can't set consistent PCI DMA mask, " + "aborting.\n"); + goto err_free_res; + } + } + + /* We can handle large RDMA requests, so allow larger segments. */ + dma_set_max_seg_size(&pdev->dev, 1024 * 1024 * 1024); + + mdev = (struct mthca_dev *) ib_alloc_device(sizeof *mdev); + if (!mdev) { + dev_err(&pdev->dev, "Device struct alloc failed, " + "aborting.\n"); + err = -ENOMEM; + goto err_free_res; + } + + mdev->pdev = pdev; + + mdev->mthca_flags = mthca_hca_table[hca_type].flags; + if (ddr_hidden) + mdev->mthca_flags |= MTHCA_FLAG_DDR_HIDDEN; + + /* + * Now reset the HCA before we touch the PCI capabilities or + * attempt a firmware command, since a boot ROM may have left + * the HCA in an undefined state. + */ + err = mthca_reset(mdev); + if (err) { + mthca_err(mdev, "Failed to reset HCA, aborting.\n"); + goto err_free_dev; + } + + if (mthca_cmd_init(mdev)) { + mthca_err(mdev, "Failed to init command interface, aborting.\n"); + goto err_free_dev; + } + + err = mthca_tune_pci(mdev); + if (err) + goto err_cmd; + + err = mthca_init_hca(mdev); + if (err) + goto err_cmd; + + if (mdev->fw_ver < mthca_hca_table[hca_type].latest_fw) { + mthca_warn(mdev, "HCA FW version %d.%d.%03d is old (%d.%d.%03d is current).\n", + (int) (mdev->fw_ver >> 32), (int) (mdev->fw_ver >> 16) & 0xffff, + (int) (mdev->fw_ver & 0xffff), + (int) (mthca_hca_table[hca_type].latest_fw >> 32), + (int) (mthca_hca_table[hca_type].latest_fw >> 16) & 0xffff, + (int) (mthca_hca_table[hca_type].latest_fw & 0xffff)); + mthca_warn(mdev, "If you have problems, try updating your HCA FW.\n"); + } + + if (msi_x && !mthca_enable_msi_x(mdev)) + mdev->mthca_flags |= MTHCA_FLAG_MSI_X; + + err = mthca_setup_hca(mdev); + if (err == -EBUSY && (mdev->mthca_flags & MTHCA_FLAG_MSI_X)) { + if (mdev->mthca_flags & MTHCA_FLAG_MSI_X) + pci_disable_msix(pdev); + mdev->mthca_flags &= ~MTHCA_FLAG_MSI_X; + + err = mthca_setup_hca(mdev); + } + + if (err) + goto err_close; + + err = mthca_register_device(mdev); + if (err) + goto err_cleanup; + + err = mthca_create_agents(mdev); + if (err) + goto err_unregister; + + pci_set_drvdata(pdev, mdev); + mdev->hca_type = hca_type; + + mdev->active = true; + + return 0; + +err_unregister: + mthca_unregister_device(mdev); + +err_cleanup: + mthca_cleanup_mcg_table(mdev); + mthca_cleanup_av_table(mdev); + mthca_cleanup_qp_table(mdev); + mthca_cleanup_srq_table(mdev); + mthca_cleanup_cq_table(mdev); + mthca_cmd_use_polling(mdev); + mthca_cleanup_eq_table(mdev); + + mthca_pd_free(mdev, &mdev->driver_pd); + + mthca_cleanup_mr_table(mdev); + mthca_cleanup_pd_table(mdev); + mthca_cleanup_uar_table(mdev); + +err_close: + if (mdev->mthca_flags & MTHCA_FLAG_MSI_X) + pci_disable_msix(pdev); + + mthca_close_hca(mdev); + +err_cmd: + mthca_cmd_cleanup(mdev); + +err_free_dev: + ib_dealloc_device(&mdev->ib_dev); + +err_free_res: + pci_release_regions(pdev); + +err_disable_pdev: + pci_disable_device(pdev); + pci_set_drvdata(pdev, NULL); + return err; +} + +static void __mthca_remove_one(struct pci_dev *pdev) +{ + struct mthca_dev *mdev = pci_get_drvdata(pdev); + int p; + + if (mdev) { + mthca_free_agents(mdev); + mthca_unregister_device(mdev); + + for (p = 1; p <= mdev->limits.num_ports; ++p) + mthca_CLOSE_IB(mdev, p); + + mthca_cleanup_mcg_table(mdev); + mthca_cleanup_av_table(mdev); + mthca_cleanup_qp_table(mdev); + mthca_cleanup_srq_table(mdev); + mthca_cleanup_cq_table(mdev); + mthca_cmd_use_polling(mdev); + mthca_cleanup_eq_table(mdev); + + mthca_pd_free(mdev, &mdev->driver_pd); + + mthca_cleanup_mr_table(mdev); + mthca_cleanup_pd_table(mdev); + + iounmap(mdev->kar); + mthca_uar_free(mdev, &mdev->driver_uar); + mthca_cleanup_uar_table(mdev); + mthca_close_hca(mdev); + mthca_cmd_cleanup(mdev); + + if (mdev->mthca_flags & MTHCA_FLAG_MSI_X) + pci_disable_msix(pdev); + + ib_dealloc_device(&mdev->ib_dev); + pci_release_regions(pdev); + pci_disable_device(pdev); + pci_set_drvdata(pdev, NULL); + } +} + +int __mthca_restart_one(struct pci_dev *pdev) +{ + struct mthca_dev *mdev; + int hca_type; + + mdev = pci_get_drvdata(pdev); + if (!mdev) + return -ENODEV; + hca_type = mdev->hca_type; + __mthca_remove_one(pdev); + return __mthca_init_one(pdev, hca_type); +} + +static int mthca_init_one(struct pci_dev *pdev, const struct pci_device_id *id) +{ + int ret; + + mutex_lock(&mthca_device_mutex); + + printk_once(KERN_INFO "%s", mthca_version); + + if (id->driver_data >= ARRAY_SIZE(mthca_hca_table)) { + printk(KERN_ERR PFX "%s has invalid driver data %lx\n", + pci_name(pdev), id->driver_data); + mutex_unlock(&mthca_device_mutex); + return -ENODEV; + } + + ret = __mthca_init_one(pdev, id->driver_data); + + mutex_unlock(&mthca_device_mutex); + + return ret; +} + +static void mthca_remove_one(struct pci_dev *pdev) +{ + mutex_lock(&mthca_device_mutex); + __mthca_remove_one(pdev); + mutex_unlock(&mthca_device_mutex); +} + +static struct pci_device_id mthca_pci_table[] = { + { PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, PCI_DEVICE_ID_MELLANOX_TAVOR), + .driver_data = TAVOR }, + { PCI_DEVICE(PCI_VENDOR_ID_TOPSPIN, PCI_DEVICE_ID_MELLANOX_TAVOR), + .driver_data = TAVOR }, + { PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, PCI_DEVICE_ID_MELLANOX_ARBEL_COMPAT), + .driver_data = ARBEL_COMPAT }, + { PCI_DEVICE(PCI_VENDOR_ID_TOPSPIN, PCI_DEVICE_ID_MELLANOX_ARBEL_COMPAT), + .driver_data = ARBEL_COMPAT }, + { PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, PCI_DEVICE_ID_MELLANOX_ARBEL), + .driver_data = ARBEL_NATIVE }, + { PCI_DEVICE(PCI_VENDOR_ID_TOPSPIN, PCI_DEVICE_ID_MELLANOX_ARBEL), + .driver_data = ARBEL_NATIVE }, + { PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, PCI_DEVICE_ID_MELLANOX_SINAI), + .driver_data = SINAI }, + { PCI_DEVICE(PCI_VENDOR_ID_TOPSPIN, PCI_DEVICE_ID_MELLANOX_SINAI), + .driver_data = SINAI }, + { PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, PCI_DEVICE_ID_MELLANOX_SINAI_OLD), + .driver_data = SINAI }, + { PCI_DEVICE(PCI_VENDOR_ID_TOPSPIN, PCI_DEVICE_ID_MELLANOX_SINAI_OLD), + .driver_data = SINAI }, + { 0, } +}; + +MODULE_DEVICE_TABLE(pci, mthca_pci_table); + +static struct pci_driver mthca_driver = { + .name = DRV_NAME, + .id_table = mthca_pci_table, + .probe = mthca_init_one, + .remove = mthca_remove_one, +}; + +static void __init __mthca_check_profile_val(const char *name, int *pval, + int pval_default) +{ + /* value must be positive and power of 2 */ + int old_pval = *pval; + + if (old_pval <= 0) + *pval = pval_default; + else + *pval = roundup_pow_of_two(old_pval); + + if (old_pval != *pval) { + printk(KERN_WARNING PFX "Invalid value %d for %s in module parameter.\n", + old_pval, name); + printk(KERN_WARNING PFX "Corrected %s to %d.\n", name, *pval); + } +} + +#define mthca_check_profile_val(name, default) \ + __mthca_check_profile_val(#name, &hca_profile.name, default) + +static void __init mthca_validate_profile(void) +{ + mthca_check_profile_val(num_qp, MTHCA_DEFAULT_NUM_QP); + mthca_check_profile_val(rdb_per_qp, MTHCA_DEFAULT_RDB_PER_QP); + mthca_check_profile_val(num_cq, MTHCA_DEFAULT_NUM_CQ); + mthca_check_profile_val(num_mcg, MTHCA_DEFAULT_NUM_MCG); + mthca_check_profile_val(num_mpt, MTHCA_DEFAULT_NUM_MPT); + mthca_check_profile_val(num_mtt, MTHCA_DEFAULT_NUM_MTT); + mthca_check_profile_val(num_udav, MTHCA_DEFAULT_NUM_UDAV); + mthca_check_profile_val(fmr_reserved_mtts, MTHCA_DEFAULT_NUM_RESERVED_MTTS); + + if (hca_profile.fmr_reserved_mtts >= hca_profile.num_mtt) { + printk(KERN_WARNING PFX "Invalid fmr_reserved_mtts module parameter %d.\n", + hca_profile.fmr_reserved_mtts); + printk(KERN_WARNING PFX "(Must be smaller than num_mtt %d)\n", + hca_profile.num_mtt); + hca_profile.fmr_reserved_mtts = hca_profile.num_mtt / 2; + printk(KERN_WARNING PFX "Corrected fmr_reserved_mtts to %d.\n", + hca_profile.fmr_reserved_mtts); + } + + if ((log_mtts_per_seg < 1) || (log_mtts_per_seg > 5)) { + printk(KERN_WARNING PFX "bad log_mtts_per_seg (%d). Using default - %d\n", + log_mtts_per_seg, ilog2(MTHCA_MTT_SEG_SIZE / 8)); + log_mtts_per_seg = ilog2(MTHCA_MTT_SEG_SIZE / 8); + } +} + +static int __init mthca_init(void) +{ + int ret; + + mthca_validate_profile(); + + ret = mthca_catas_init(); + if (ret) + return ret; + + ret = pci_register_driver(&mthca_driver); + if (ret < 0) { + mthca_catas_cleanup(); + return ret; + } + + return 0; +} + +static void __exit mthca_cleanup(void) +{ + pci_unregister_driver(&mthca_driver); + mthca_catas_cleanup(); +} + +module_init(mthca_init); +module_exit(mthca_cleanup); diff --git a/drivers/infiniband/hw/mthca/mthca_mcg.c b/drivers/infiniband/hw/mthca/mthca_mcg.c new file mode 100644 index 0000000..6304ae8 --- /dev/null +++ b/drivers/infiniband/hw/mthca/mthca_mcg.c @@ -0,0 +1,335 @@ +/* + * Copyright (c) 2004 Topspin Communications. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include + +#include "mthca_dev.h" +#include "mthca_cmd.h" + +struct mthca_mgm { + __be32 next_gid_index; + u32 reserved[3]; + u8 gid[16]; + __be32 qp[MTHCA_QP_PER_MGM]; +}; + +static const u8 zero_gid[16]; /* automatically initialized to 0 */ + +/* + * Caller must hold MCG table semaphore. gid and mgm parameters must + * be properly aligned for command interface. + * + * Returns 0 unless a firmware command error occurs. + * + * If GID is found in MGM or MGM is empty, *index = *hash, *prev = -1 + * and *mgm holds MGM entry. + * + * if GID is found in AMGM, *index = index in AMGM, *prev = index of + * previous entry in hash chain and *mgm holds AMGM entry. + * + * If no AMGM exists for given gid, *index = -1, *prev = index of last + * entry in hash chain and *mgm holds end of hash chain. + */ +static int find_mgm(struct mthca_dev *dev, + u8 *gid, struct mthca_mailbox *mgm_mailbox, + u16 *hash, int *prev, int *index) +{ + struct mthca_mailbox *mailbox; + struct mthca_mgm *mgm = mgm_mailbox->buf; + u8 *mgid; + int err; + + mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL); + if (IS_ERR(mailbox)) + return -ENOMEM; + mgid = mailbox->buf; + + memcpy(mgid, gid, 16); + + err = mthca_MGID_HASH(dev, mailbox, hash); + if (err) { + mthca_err(dev, "MGID_HASH failed (%d)\n", err); + goto out; + } + + if (0) + mthca_dbg(dev, "Hash for %pI6 is %04x\n", gid, *hash); + + *index = *hash; + *prev = -1; + + do { + err = mthca_READ_MGM(dev, *index, mgm_mailbox); + if (err) { + mthca_err(dev, "READ_MGM failed (%d)\n", err); + goto out; + } + + if (!memcmp(mgm->gid, zero_gid, 16)) { + if (*index != *hash) { + mthca_err(dev, "Found zero MGID in AMGM.\n"); + err = -EINVAL; + } + goto out; + } + + if (!memcmp(mgm->gid, gid, 16)) + goto out; + + *prev = *index; + *index = be32_to_cpu(mgm->next_gid_index) >> 6; + } while (*index); + + *index = -1; + + out: + mthca_free_mailbox(dev, mailbox); + return err; +} + +int mthca_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) +{ + struct mthca_dev *dev = to_mdev(ibqp->device); + struct mthca_mailbox *mailbox; + struct mthca_mgm *mgm; + u16 hash; + int index, prev; + int link = 0; + int i; + int err; + + mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + mgm = mailbox->buf; + + mutex_lock(&dev->mcg_table.mutex); + + err = find_mgm(dev, gid->raw, mailbox, &hash, &prev, &index); + if (err) + goto out; + + if (index != -1) { + if (!memcmp(mgm->gid, zero_gid, 16)) + memcpy(mgm->gid, gid->raw, 16); + } else { + link = 1; + + index = mthca_alloc(&dev->mcg_table.alloc); + if (index == -1) { + mthca_err(dev, "No AMGM entries left\n"); + err = -ENOMEM; + goto out; + } + + err = mthca_READ_MGM(dev, index, mailbox); + if (err) { + mthca_err(dev, "READ_MGM failed (%d)\n", err); + goto out; + } + memset(mgm, 0, sizeof *mgm); + memcpy(mgm->gid, gid->raw, 16); + } + + for (i = 0; i < MTHCA_QP_PER_MGM; ++i) + if (mgm->qp[i] == cpu_to_be32(ibqp->qp_num | (1 << 31))) { + mthca_dbg(dev, "QP %06x already a member of MGM\n", + ibqp->qp_num); + err = 0; + goto out; + } else if (!(mgm->qp[i] & cpu_to_be32(1 << 31))) { + mgm->qp[i] = cpu_to_be32(ibqp->qp_num | (1 << 31)); + break; + } + + if (i == MTHCA_QP_PER_MGM) { + mthca_err(dev, "MGM at index %x is full.\n", index); + err = -ENOMEM; + goto out; + } + + err = mthca_WRITE_MGM(dev, index, mailbox); + if (err) { + mthca_err(dev, "WRITE_MGM failed %d\n", err); + err = -EINVAL; + goto out; + } + + if (!link) + goto out; + + err = mthca_READ_MGM(dev, prev, mailbox); + if (err) { + mthca_err(dev, "READ_MGM failed %d\n", err); + goto out; + } + + mgm->next_gid_index = cpu_to_be32(index << 6); + + err = mthca_WRITE_MGM(dev, prev, mailbox); + if (err) + mthca_err(dev, "WRITE_MGM returned %d\n", err); + + out: + if (err && link && index != -1) { + BUG_ON(index < dev->limits.num_mgms); + mthca_free(&dev->mcg_table.alloc, index); + } + mutex_unlock(&dev->mcg_table.mutex); + + mthca_free_mailbox(dev, mailbox); + return err; +} + +int mthca_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) +{ + struct mthca_dev *dev = to_mdev(ibqp->device); + struct mthca_mailbox *mailbox; + struct mthca_mgm *mgm; + u16 hash; + int prev, index; + int i, loc; + int err; + + mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + mgm = mailbox->buf; + + mutex_lock(&dev->mcg_table.mutex); + + err = find_mgm(dev, gid->raw, mailbox, &hash, &prev, &index); + if (err) + goto out; + + if (index == -1) { + mthca_err(dev, "MGID %pI6 not found\n", gid->raw); + err = -EINVAL; + goto out; + } + + for (loc = -1, i = 0; i < MTHCA_QP_PER_MGM; ++i) { + if (mgm->qp[i] == cpu_to_be32(ibqp->qp_num | (1 << 31))) + loc = i; + if (!(mgm->qp[i] & cpu_to_be32(1 << 31))) + break; + } + + if (loc == -1) { + mthca_err(dev, "QP %06x not found in MGM\n", ibqp->qp_num); + err = -EINVAL; + goto out; + } + + mgm->qp[loc] = mgm->qp[i - 1]; + mgm->qp[i - 1] = 0; + + err = mthca_WRITE_MGM(dev, index, mailbox); + if (err) { + mthca_err(dev, "WRITE_MGM returned %d\n", err); + goto out; + } + + if (i != 1) + goto out; + + if (prev == -1) { + /* Remove entry from MGM */ + int amgm_index_to_free = be32_to_cpu(mgm->next_gid_index) >> 6; + if (amgm_index_to_free) { + err = mthca_READ_MGM(dev, amgm_index_to_free, + mailbox); + if (err) { + mthca_err(dev, "READ_MGM returned %d\n", err); + goto out; + } + } else + memset(mgm->gid, 0, 16); + + err = mthca_WRITE_MGM(dev, index, mailbox); + if (err) { + mthca_err(dev, "WRITE_MGM returned %d\n", err); + goto out; + } + if (amgm_index_to_free) { + BUG_ON(amgm_index_to_free < dev->limits.num_mgms); + mthca_free(&dev->mcg_table.alloc, amgm_index_to_free); + } + } else { + /* Remove entry from AMGM */ + int curr_next_index = be32_to_cpu(mgm->next_gid_index) >> 6; + err = mthca_READ_MGM(dev, prev, mailbox); + if (err) { + mthca_err(dev, "READ_MGM returned %d\n", err); + goto out; + } + + mgm->next_gid_index = cpu_to_be32(curr_next_index << 6); + + err = mthca_WRITE_MGM(dev, prev, mailbox); + if (err) { + mthca_err(dev, "WRITE_MGM returned %d\n", err); + goto out; + } + BUG_ON(index < dev->limits.num_mgms); + mthca_free(&dev->mcg_table.alloc, index); + } + + out: + mutex_unlock(&dev->mcg_table.mutex); + + mthca_free_mailbox(dev, mailbox); + return err; +} + +int mthca_init_mcg_table(struct mthca_dev *dev) +{ + int err; + int table_size = dev->limits.num_mgms + dev->limits.num_amgms; + + err = mthca_alloc_init(&dev->mcg_table.alloc, + table_size, + table_size - 1, + dev->limits.num_mgms); + if (err) + return err; + + mutex_init(&dev->mcg_table.mutex); + + return 0; +} + +void mthca_cleanup_mcg_table(struct mthca_dev *dev) +{ + mthca_alloc_cleanup(&dev->mcg_table.alloc); +} diff --git a/drivers/infiniband/hw/mthca/mthca_memfree.c b/drivers/infiniband/hw/mthca/mthca_memfree.c new file mode 100644 index 0000000..7d2e42d --- /dev/null +++ b/drivers/infiniband/hw/mthca/mthca_memfree.c @@ -0,0 +1,760 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Cisco Systems. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include + +#include + +#include "mthca_memfree.h" +#include "mthca_dev.h" +#include "mthca_cmd.h" + +/* + * We allocate in as big chunks as we can, up to a maximum of 256 KB + * per chunk. + */ +enum { + MTHCA_ICM_ALLOC_SIZE = 1 << 18, + MTHCA_TABLE_CHUNK_SIZE = 1 << 18 +}; + +struct mthca_user_db_table { + struct mutex mutex; + struct { + u64 uvirt; + struct scatterlist mem; + int refcount; + } page[0]; +}; + +static void mthca_free_icm_pages(struct mthca_dev *dev, struct mthca_icm_chunk *chunk) +{ + int i; + + if (chunk->nsg > 0) + pci_unmap_sg(dev->pdev, chunk->mem, chunk->npages, + PCI_DMA_BIDIRECTIONAL); + + for (i = 0; i < chunk->npages; ++i) + __free_pages(sg_page(&chunk->mem[i]), + get_order(chunk->mem[i].length)); +} + +static void mthca_free_icm_coherent(struct mthca_dev *dev, struct mthca_icm_chunk *chunk) +{ + int i; + + for (i = 0; i < chunk->npages; ++i) { + dma_free_coherent(&dev->pdev->dev, chunk->mem[i].length, + lowmem_page_address(sg_page(&chunk->mem[i])), + sg_dma_address(&chunk->mem[i])); + } +} + +void mthca_free_icm(struct mthca_dev *dev, struct mthca_icm *icm, int coherent) +{ + struct mthca_icm_chunk *chunk, *tmp; + + if (!icm) + return; + + list_for_each_entry_safe(chunk, tmp, &icm->chunk_list, list) { + if (coherent) + mthca_free_icm_coherent(dev, chunk); + else + mthca_free_icm_pages(dev, chunk); + + kfree(chunk); + } + + kfree(icm); +} + +static int mthca_alloc_icm_pages(struct scatterlist *mem, int order, gfp_t gfp_mask) +{ + struct page *page; + + /* + * Use __GFP_ZERO because buggy firmware assumes ICM pages are + * cleared, and subtle failures are seen if they aren't. + */ + page = alloc_pages(gfp_mask | __GFP_ZERO, order); + if (!page) + return -ENOMEM; + + sg_set_page(mem, page, PAGE_SIZE << order, 0); + return 0; +} + +static int mthca_alloc_icm_coherent(struct device *dev, struct scatterlist *mem, + int order, gfp_t gfp_mask) +{ + void *buf = dma_alloc_coherent(dev, PAGE_SIZE << order, &sg_dma_address(mem), + gfp_mask); + if (!buf) + return -ENOMEM; + + sg_set_buf(mem, buf, PAGE_SIZE << order); + BUG_ON(mem->offset); + sg_dma_len(mem) = PAGE_SIZE << order; + return 0; +} + +struct mthca_icm *mthca_alloc_icm(struct mthca_dev *dev, int npages, + gfp_t gfp_mask, int coherent) +{ + struct mthca_icm *icm; + struct mthca_icm_chunk *chunk = NULL; + int cur_order; + int ret; + + /* We use sg_set_buf for coherent allocs, which assumes low memory */ + BUG_ON(coherent && (gfp_mask & __GFP_HIGHMEM)); + + icm = kmalloc(sizeof *icm, gfp_mask & ~(__GFP_HIGHMEM | __GFP_NOWARN)); + if (!icm) + return icm; + + icm->refcount = 0; + INIT_LIST_HEAD(&icm->chunk_list); + + cur_order = get_order(MTHCA_ICM_ALLOC_SIZE); + + while (npages > 0) { + if (!chunk) { + chunk = kmalloc(sizeof *chunk, + gfp_mask & ~(__GFP_HIGHMEM | __GFP_NOWARN)); + if (!chunk) + goto fail; + + sg_init_table(chunk->mem, MTHCA_ICM_CHUNK_LEN); + chunk->npages = 0; + chunk->nsg = 0; + list_add_tail(&chunk->list, &icm->chunk_list); + } + + while (1 << cur_order > npages) + --cur_order; + + if (coherent) + ret = mthca_alloc_icm_coherent(&dev->pdev->dev, + &chunk->mem[chunk->npages], + cur_order, gfp_mask); + else + ret = mthca_alloc_icm_pages(&chunk->mem[chunk->npages], + cur_order, gfp_mask); + + if (!ret) { + ++chunk->npages; + + if (coherent) + ++chunk->nsg; + else if (chunk->npages == MTHCA_ICM_CHUNK_LEN) { + chunk->nsg = pci_map_sg(dev->pdev, chunk->mem, + chunk->npages, + PCI_DMA_BIDIRECTIONAL); + + if (chunk->nsg <= 0) + goto fail; + } + + if (chunk->npages == MTHCA_ICM_CHUNK_LEN) + chunk = NULL; + + npages -= 1 << cur_order; + } else { + --cur_order; + if (cur_order < 0) + goto fail; + } + } + + if (!coherent && chunk) { + chunk->nsg = pci_map_sg(dev->pdev, chunk->mem, + chunk->npages, + PCI_DMA_BIDIRECTIONAL); + + if (chunk->nsg <= 0) + goto fail; + } + + return icm; + +fail: + mthca_free_icm(dev, icm, coherent); + return NULL; +} + +int mthca_table_get(struct mthca_dev *dev, struct mthca_icm_table *table, int obj) +{ + int i = (obj & (table->num_obj - 1)) * table->obj_size / MTHCA_TABLE_CHUNK_SIZE; + int ret = 0; + + mutex_lock(&table->mutex); + + if (table->icm[i]) { + ++table->icm[i]->refcount; + goto out; + } + + table->icm[i] = mthca_alloc_icm(dev, MTHCA_TABLE_CHUNK_SIZE >> PAGE_SHIFT, + (table->lowmem ? GFP_KERNEL : GFP_HIGHUSER) | + __GFP_NOWARN, table->coherent); + if (!table->icm[i]) { + ret = -ENOMEM; + goto out; + } + + if (mthca_MAP_ICM(dev, table->icm[i], + table->virt + i * MTHCA_TABLE_CHUNK_SIZE)) { + mthca_free_icm(dev, table->icm[i], table->coherent); + table->icm[i] = NULL; + ret = -ENOMEM; + goto out; + } + + ++table->icm[i]->refcount; + +out: + mutex_unlock(&table->mutex); + return ret; +} + +void mthca_table_put(struct mthca_dev *dev, struct mthca_icm_table *table, int obj) +{ + int i; + + if (!mthca_is_memfree(dev)) + return; + + i = (obj & (table->num_obj - 1)) * table->obj_size / MTHCA_TABLE_CHUNK_SIZE; + + mutex_lock(&table->mutex); + + if (--table->icm[i]->refcount == 0) { + mthca_UNMAP_ICM(dev, table->virt + i * MTHCA_TABLE_CHUNK_SIZE, + MTHCA_TABLE_CHUNK_SIZE / MTHCA_ICM_PAGE_SIZE); + mthca_free_icm(dev, table->icm[i], table->coherent); + table->icm[i] = NULL; + } + + mutex_unlock(&table->mutex); +} + +void *mthca_table_find(struct mthca_icm_table *table, int obj, dma_addr_t *dma_handle) +{ + int idx, offset, dma_offset, i; + struct mthca_icm_chunk *chunk; + struct mthca_icm *icm; + struct page *page = NULL; + + if (!table->lowmem) + return NULL; + + mutex_lock(&table->mutex); + + idx = (obj & (table->num_obj - 1)) * table->obj_size; + icm = table->icm[idx / MTHCA_TABLE_CHUNK_SIZE]; + dma_offset = offset = idx % MTHCA_TABLE_CHUNK_SIZE; + + if (!icm) + goto out; + + list_for_each_entry(chunk, &icm->chunk_list, list) { + for (i = 0; i < chunk->npages; ++i) { + if (dma_handle && dma_offset >= 0) { + if (sg_dma_len(&chunk->mem[i]) > dma_offset) + *dma_handle = sg_dma_address(&chunk->mem[i]) + + dma_offset; + dma_offset -= sg_dma_len(&chunk->mem[i]); + } + /* DMA mapping can merge pages but not split them, + * so if we found the page, dma_handle has already + * been assigned to. */ + if (chunk->mem[i].length > offset) { + page = sg_page(&chunk->mem[i]); + goto out; + } + offset -= chunk->mem[i].length; + } + } + +out: + mutex_unlock(&table->mutex); + return page ? lowmem_page_address(page) + offset : NULL; +} + +int mthca_table_get_range(struct mthca_dev *dev, struct mthca_icm_table *table, + int start, int end) +{ + int inc = MTHCA_TABLE_CHUNK_SIZE / table->obj_size; + int i, err; + + for (i = start; i <= end; i += inc) { + err = mthca_table_get(dev, table, i); + if (err) + goto fail; + } + + return 0; + +fail: + while (i > start) { + i -= inc; + mthca_table_put(dev, table, i); + } + + return err; +} + +void mthca_table_put_range(struct mthca_dev *dev, struct mthca_icm_table *table, + int start, int end) +{ + int i; + + if (!mthca_is_memfree(dev)) + return; + + for (i = start; i <= end; i += MTHCA_TABLE_CHUNK_SIZE / table->obj_size) + mthca_table_put(dev, table, i); +} + +struct mthca_icm_table *mthca_alloc_icm_table(struct mthca_dev *dev, + u64 virt, int obj_size, + int nobj, int reserved, + int use_lowmem, int use_coherent) +{ + struct mthca_icm_table *table; + int obj_per_chunk; + int num_icm; + unsigned chunk_size; + int i; + + obj_per_chunk = MTHCA_TABLE_CHUNK_SIZE / obj_size; + num_icm = DIV_ROUND_UP(nobj, obj_per_chunk); + + table = kmalloc(sizeof *table + num_icm * sizeof *table->icm, GFP_KERNEL); + if (!table) + return NULL; + + table->virt = virt; + table->num_icm = num_icm; + table->num_obj = nobj; + table->obj_size = obj_size; + table->lowmem = use_lowmem; + table->coherent = use_coherent; + mutex_init(&table->mutex); + + for (i = 0; i < num_icm; ++i) + table->icm[i] = NULL; + + for (i = 0; i * MTHCA_TABLE_CHUNK_SIZE < reserved * obj_size; ++i) { + chunk_size = MTHCA_TABLE_CHUNK_SIZE; + if ((i + 1) * MTHCA_TABLE_CHUNK_SIZE > nobj * obj_size) + chunk_size = nobj * obj_size - i * MTHCA_TABLE_CHUNK_SIZE; + + table->icm[i] = mthca_alloc_icm(dev, chunk_size >> PAGE_SHIFT, + (use_lowmem ? GFP_KERNEL : GFP_HIGHUSER) | + __GFP_NOWARN, use_coherent); + if (!table->icm[i]) + goto err; + if (mthca_MAP_ICM(dev, table->icm[i], + virt + i * MTHCA_TABLE_CHUNK_SIZE)) { + mthca_free_icm(dev, table->icm[i], table->coherent); + table->icm[i] = NULL; + goto err; + } + + /* + * Add a reference to this ICM chunk so that it never + * gets freed (since it contains reserved firmware objects). + */ + ++table->icm[i]->refcount; + } + + return table; + +err: + for (i = 0; i < num_icm; ++i) + if (table->icm[i]) { + mthca_UNMAP_ICM(dev, virt + i * MTHCA_TABLE_CHUNK_SIZE, + MTHCA_TABLE_CHUNK_SIZE / MTHCA_ICM_PAGE_SIZE); + mthca_free_icm(dev, table->icm[i], table->coherent); + } + + kfree(table); + + return NULL; +} + +void mthca_free_icm_table(struct mthca_dev *dev, struct mthca_icm_table *table) +{ + int i; + + for (i = 0; i < table->num_icm; ++i) + if (table->icm[i]) { + mthca_UNMAP_ICM(dev, + table->virt + i * MTHCA_TABLE_CHUNK_SIZE, + MTHCA_TABLE_CHUNK_SIZE / MTHCA_ICM_PAGE_SIZE); + mthca_free_icm(dev, table->icm[i], table->coherent); + } + + kfree(table); +} + +static u64 mthca_uarc_virt(struct mthca_dev *dev, struct mthca_uar *uar, int page) +{ + return dev->uar_table.uarc_base + + uar->index * dev->uar_table.uarc_size + + page * MTHCA_ICM_PAGE_SIZE; +} + +int mthca_map_user_db(struct mthca_dev *dev, struct mthca_uar *uar, + struct mthca_user_db_table *db_tab, int index, u64 uaddr) +{ + struct page *pages[1]; + int ret = 0; + int i; + + if (!mthca_is_memfree(dev)) + return 0; + + if (index < 0 || index > dev->uar_table.uarc_size / 8) + return -EINVAL; + + mutex_lock(&db_tab->mutex); + + i = index / MTHCA_DB_REC_PER_PAGE; + + if ((db_tab->page[i].refcount >= MTHCA_DB_REC_PER_PAGE) || + (db_tab->page[i].uvirt && db_tab->page[i].uvirt != uaddr) || + (uaddr & 4095)) { + ret = -EINVAL; + goto out; + } + + if (db_tab->page[i].refcount) { + ++db_tab->page[i].refcount; + goto out; + } + + ret = get_user_pages(current, current->mm, uaddr & PAGE_MASK, 1, 1, 0, + pages, NULL); + if (ret < 0) + goto out; + + sg_set_page(&db_tab->page[i].mem, pages[0], MTHCA_ICM_PAGE_SIZE, + uaddr & ~PAGE_MASK); + + ret = pci_map_sg(dev->pdev, &db_tab->page[i].mem, 1, PCI_DMA_TODEVICE); + if (ret < 0) { + put_page(pages[0]); + goto out; + } + + ret = mthca_MAP_ICM_page(dev, sg_dma_address(&db_tab->page[i].mem), + mthca_uarc_virt(dev, uar, i)); + if (ret) { + pci_unmap_sg(dev->pdev, &db_tab->page[i].mem, 1, PCI_DMA_TODEVICE); + put_page(sg_page(&db_tab->page[i].mem)); + goto out; + } + + db_tab->page[i].uvirt = uaddr; + db_tab->page[i].refcount = 1; + +out: + mutex_unlock(&db_tab->mutex); + return ret; +} + +void mthca_unmap_user_db(struct mthca_dev *dev, struct mthca_uar *uar, + struct mthca_user_db_table *db_tab, int index) +{ + if (!mthca_is_memfree(dev)) + return; + + /* + * To make our bookkeeping simpler, we don't unmap DB + * pages until we clean up the whole db table. + */ + + mutex_lock(&db_tab->mutex); + + --db_tab->page[index / MTHCA_DB_REC_PER_PAGE].refcount; + + mutex_unlock(&db_tab->mutex); +} + +struct mthca_user_db_table *mthca_init_user_db_tab(struct mthca_dev *dev) +{ + struct mthca_user_db_table *db_tab; + int npages; + int i; + + if (!mthca_is_memfree(dev)) + return NULL; + + npages = dev->uar_table.uarc_size / MTHCA_ICM_PAGE_SIZE; + db_tab = kmalloc(sizeof *db_tab + npages * sizeof *db_tab->page, GFP_KERNEL); + if (!db_tab) + return ERR_PTR(-ENOMEM); + + mutex_init(&db_tab->mutex); + for (i = 0; i < npages; ++i) { + db_tab->page[i].refcount = 0; + db_tab->page[i].uvirt = 0; + sg_init_table(&db_tab->page[i].mem, 1); + } + + return db_tab; +} + +void mthca_cleanup_user_db_tab(struct mthca_dev *dev, struct mthca_uar *uar, + struct mthca_user_db_table *db_tab) +{ + int i; + + if (!mthca_is_memfree(dev)) + return; + + for (i = 0; i < dev->uar_table.uarc_size / MTHCA_ICM_PAGE_SIZE; ++i) { + if (db_tab->page[i].uvirt) { + mthca_UNMAP_ICM(dev, mthca_uarc_virt(dev, uar, i), 1); + pci_unmap_sg(dev->pdev, &db_tab->page[i].mem, 1, PCI_DMA_TODEVICE); + put_page(sg_page(&db_tab->page[i].mem)); + } + } + + kfree(db_tab); +} + +int mthca_alloc_db(struct mthca_dev *dev, enum mthca_db_type type, + u32 qn, __be32 **db) +{ + int group; + int start, end, dir; + int i, j; + struct mthca_db_page *page; + int ret = 0; + + mutex_lock(&dev->db_tab->mutex); + + switch (type) { + case MTHCA_DB_TYPE_CQ_ARM: + case MTHCA_DB_TYPE_SQ: + group = 0; + start = 0; + end = dev->db_tab->max_group1; + dir = 1; + break; + + case MTHCA_DB_TYPE_CQ_SET_CI: + case MTHCA_DB_TYPE_RQ: + case MTHCA_DB_TYPE_SRQ: + group = 1; + start = dev->db_tab->npages - 1; + end = dev->db_tab->min_group2; + dir = -1; + break; + + default: + ret = -EINVAL; + goto out; + } + + for (i = start; i != end; i += dir) + if (dev->db_tab->page[i].db_rec && + !bitmap_full(dev->db_tab->page[i].used, + MTHCA_DB_REC_PER_PAGE)) { + page = dev->db_tab->page + i; + goto found; + } + + for (i = start; i != end; i += dir) + if (!dev->db_tab->page[i].db_rec) { + page = dev->db_tab->page + i; + goto alloc; + } + + if (dev->db_tab->max_group1 >= dev->db_tab->min_group2 - 1) { + ret = -ENOMEM; + goto out; + } + + if (group == 0) + ++dev->db_tab->max_group1; + else + --dev->db_tab->min_group2; + + page = dev->db_tab->page + end; + +alloc: + page->db_rec = dma_alloc_coherent(&dev->pdev->dev, MTHCA_ICM_PAGE_SIZE, + &page->mapping, GFP_KERNEL); + if (!page->db_rec) { + ret = -ENOMEM; + goto out; + } + memset(page->db_rec, 0, MTHCA_ICM_PAGE_SIZE); + + ret = mthca_MAP_ICM_page(dev, page->mapping, + mthca_uarc_virt(dev, &dev->driver_uar, i)); + if (ret) { + dma_free_coherent(&dev->pdev->dev, MTHCA_ICM_PAGE_SIZE, + page->db_rec, page->mapping); + goto out; + } + + bitmap_zero(page->used, MTHCA_DB_REC_PER_PAGE); + +found: + j = find_first_zero_bit(page->used, MTHCA_DB_REC_PER_PAGE); + set_bit(j, page->used); + + if (group == 1) + j = MTHCA_DB_REC_PER_PAGE - 1 - j; + + ret = i * MTHCA_DB_REC_PER_PAGE + j; + + page->db_rec[j] = cpu_to_be64((qn << 8) | (type << 5)); + + *db = (__be32 *) &page->db_rec[j]; + +out: + mutex_unlock(&dev->db_tab->mutex); + + return ret; +} + +void mthca_free_db(struct mthca_dev *dev, int type, int db_index) +{ + int i, j; + struct mthca_db_page *page; + + i = db_index / MTHCA_DB_REC_PER_PAGE; + j = db_index % MTHCA_DB_REC_PER_PAGE; + + page = dev->db_tab->page + i; + + mutex_lock(&dev->db_tab->mutex); + + page->db_rec[j] = 0; + if (i >= dev->db_tab->min_group2) + j = MTHCA_DB_REC_PER_PAGE - 1 - j; + clear_bit(j, page->used); + + if (bitmap_empty(page->used, MTHCA_DB_REC_PER_PAGE) && + i >= dev->db_tab->max_group1 - 1) { + mthca_UNMAP_ICM(dev, mthca_uarc_virt(dev, &dev->driver_uar, i), 1); + + dma_free_coherent(&dev->pdev->dev, MTHCA_ICM_PAGE_SIZE, + page->db_rec, page->mapping); + page->db_rec = NULL; + + if (i == dev->db_tab->max_group1) { + --dev->db_tab->max_group1; + /* XXX may be able to unmap more pages now */ + } + if (i == dev->db_tab->min_group2) + ++dev->db_tab->min_group2; + } + + mutex_unlock(&dev->db_tab->mutex); +} + +int mthca_init_db_tab(struct mthca_dev *dev) +{ + int i; + + if (!mthca_is_memfree(dev)) + return 0; + + dev->db_tab = kmalloc(sizeof *dev->db_tab, GFP_KERNEL); + if (!dev->db_tab) + return -ENOMEM; + + mutex_init(&dev->db_tab->mutex); + + dev->db_tab->npages = dev->uar_table.uarc_size / MTHCA_ICM_PAGE_SIZE; + dev->db_tab->max_group1 = 0; + dev->db_tab->min_group2 = dev->db_tab->npages - 1; + + dev->db_tab->page = kmalloc(dev->db_tab->npages * + sizeof *dev->db_tab->page, + GFP_KERNEL); + if (!dev->db_tab->page) { + kfree(dev->db_tab); + return -ENOMEM; + } + + for (i = 0; i < dev->db_tab->npages; ++i) + dev->db_tab->page[i].db_rec = NULL; + + return 0; +} + +void mthca_cleanup_db_tab(struct mthca_dev *dev) +{ + int i; + + if (!mthca_is_memfree(dev)) + return; + + /* + * Because we don't always free our UARC pages when they + * become empty to make mthca_free_db() simpler we need to + * make a sweep through the doorbell pages and free any + * leftover pages now. + */ + for (i = 0; i < dev->db_tab->npages; ++i) { + if (!dev->db_tab->page[i].db_rec) + continue; + + if (!bitmap_empty(dev->db_tab->page[i].used, MTHCA_DB_REC_PER_PAGE)) + mthca_warn(dev, "Kernel UARC page %d not empty\n", i); + + mthca_UNMAP_ICM(dev, mthca_uarc_virt(dev, &dev->driver_uar, i), 1); + + dma_free_coherent(&dev->pdev->dev, MTHCA_ICM_PAGE_SIZE, + dev->db_tab->page[i].db_rec, + dev->db_tab->page[i].mapping); + } + + kfree(dev->db_tab->page); + kfree(dev->db_tab); +} diff --git a/drivers/infiniband/hw/mthca/mthca_memfree.h b/drivers/infiniband/hw/mthca/mthca_memfree.h new file mode 100644 index 0000000..da9b8f9 --- /dev/null +++ b/drivers/infiniband/hw/mthca/mthca_memfree.h @@ -0,0 +1,179 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Cisco Systems. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef MTHCA_MEMFREE_H +#define MTHCA_MEMFREE_H + +#include +#include + +#define MTHCA_ICM_CHUNK_LEN \ + ((256 - sizeof (struct list_head) - 2 * sizeof (int)) / \ + (sizeof (struct scatterlist))) + +enum { + MTHCA_ICM_PAGE_SHIFT = 12, + MTHCA_ICM_PAGE_SIZE = 1 << MTHCA_ICM_PAGE_SHIFT, + MTHCA_DB_REC_PER_PAGE = MTHCA_ICM_PAGE_SIZE / 8 +}; + +struct mthca_icm_chunk { + struct list_head list; + int npages; + int nsg; + struct scatterlist mem[MTHCA_ICM_CHUNK_LEN]; +}; + +struct mthca_icm { + struct list_head chunk_list; + int refcount; +}; + +struct mthca_icm_table { + u64 virt; + int num_icm; + int num_obj; + int obj_size; + int lowmem; + int coherent; + struct mutex mutex; + struct mthca_icm *icm[0]; +}; + +struct mthca_icm_iter { + struct mthca_icm *icm; + struct mthca_icm_chunk *chunk; + int page_idx; +}; + +struct mthca_dev; + +struct mthca_icm *mthca_alloc_icm(struct mthca_dev *dev, int npages, + gfp_t gfp_mask, int coherent); +void mthca_free_icm(struct mthca_dev *dev, struct mthca_icm *icm, int coherent); + +struct mthca_icm_table *mthca_alloc_icm_table(struct mthca_dev *dev, + u64 virt, int obj_size, + int nobj, int reserved, + int use_lowmem, int use_coherent); +void mthca_free_icm_table(struct mthca_dev *dev, struct mthca_icm_table *table); +int mthca_table_get(struct mthca_dev *dev, struct mthca_icm_table *table, int obj); +void mthca_table_put(struct mthca_dev *dev, struct mthca_icm_table *table, int obj); +void *mthca_table_find(struct mthca_icm_table *table, int obj, dma_addr_t *dma_handle); +int mthca_table_get_range(struct mthca_dev *dev, struct mthca_icm_table *table, + int start, int end); +void mthca_table_put_range(struct mthca_dev *dev, struct mthca_icm_table *table, + int start, int end); + +static inline void mthca_icm_first(struct mthca_icm *icm, + struct mthca_icm_iter *iter) +{ + iter->icm = icm; + iter->chunk = list_empty(&icm->chunk_list) ? + NULL : list_entry(icm->chunk_list.next, + struct mthca_icm_chunk, list); + iter->page_idx = 0; +} + +static inline int mthca_icm_last(struct mthca_icm_iter *iter) +{ + return !iter->chunk; +} + +static inline void mthca_icm_next(struct mthca_icm_iter *iter) +{ + if (++iter->page_idx >= iter->chunk->nsg) { + if (iter->chunk->list.next == &iter->icm->chunk_list) { + iter->chunk = NULL; + return; + } + + iter->chunk = list_entry(iter->chunk->list.next, + struct mthca_icm_chunk, list); + iter->page_idx = 0; + } +} + +static inline dma_addr_t mthca_icm_addr(struct mthca_icm_iter *iter) +{ + return sg_dma_address(&iter->chunk->mem[iter->page_idx]); +} + +static inline unsigned long mthca_icm_size(struct mthca_icm_iter *iter) +{ + return sg_dma_len(&iter->chunk->mem[iter->page_idx]); +} + +struct mthca_db_page { + DECLARE_BITMAP(used, MTHCA_DB_REC_PER_PAGE); + __be64 *db_rec; + dma_addr_t mapping; +}; + +struct mthca_db_table { + int npages; + int max_group1; + int min_group2; + struct mthca_db_page *page; + struct mutex mutex; +}; + +enum mthca_db_type { + MTHCA_DB_TYPE_INVALID = 0x0, + MTHCA_DB_TYPE_CQ_SET_CI = 0x1, + MTHCA_DB_TYPE_CQ_ARM = 0x2, + MTHCA_DB_TYPE_SQ = 0x3, + MTHCA_DB_TYPE_RQ = 0x4, + MTHCA_DB_TYPE_SRQ = 0x5, + MTHCA_DB_TYPE_GROUP_SEP = 0x7 +}; + +struct mthca_user_db_table; +struct mthca_uar; + +int mthca_map_user_db(struct mthca_dev *dev, struct mthca_uar *uar, + struct mthca_user_db_table *db_tab, int index, u64 uaddr); +void mthca_unmap_user_db(struct mthca_dev *dev, struct mthca_uar *uar, + struct mthca_user_db_table *db_tab, int index); +struct mthca_user_db_table *mthca_init_user_db_tab(struct mthca_dev *dev); +void mthca_cleanup_user_db_tab(struct mthca_dev *dev, struct mthca_uar *uar, + struct mthca_user_db_table *db_tab); + +int mthca_init_db_tab(struct mthca_dev *dev); +void mthca_cleanup_db_tab(struct mthca_dev *dev); +int mthca_alloc_db(struct mthca_dev *dev, enum mthca_db_type type, + u32 qn, __be32 **db); +void mthca_free_db(struct mthca_dev *dev, int type, int db_index); + +#endif /* MTHCA_MEMFREE_H */ diff --git a/drivers/infiniband/hw/mthca/mthca_mr.c b/drivers/infiniband/hw/mthca/mthca_mr.c new file mode 100644 index 0000000..ed9a989 --- /dev/null +++ b/drivers/infiniband/hw/mthca/mthca_mr.c @@ -0,0 +1,965 @@ +/* + * Copyright (c) 2004 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include + +#include "mthca_dev.h" +#include "mthca_cmd.h" +#include "mthca_memfree.h" + +struct mthca_mtt { + struct mthca_buddy *buddy; + int order; + u32 first_seg; +}; + +/* + * Must be packed because mtt_seg is 64 bits but only aligned to 32 bits. + */ +struct mthca_mpt_entry { + __be32 flags; + __be32 page_size; + __be32 key; + __be32 pd; + __be64 start; + __be64 length; + __be32 lkey; + __be32 window_count; + __be32 window_count_limit; + __be64 mtt_seg; + __be32 mtt_sz; /* Arbel only */ + u32 reserved[2]; +} __attribute__((packed)); + +#define MTHCA_MPT_FLAG_SW_OWNS (0xfUL << 28) +#define MTHCA_MPT_FLAG_MIO (1 << 17) +#define MTHCA_MPT_FLAG_BIND_ENABLE (1 << 15) +#define MTHCA_MPT_FLAG_PHYSICAL (1 << 9) +#define MTHCA_MPT_FLAG_REGION (1 << 8) + +#define MTHCA_MTT_FLAG_PRESENT 1 + +#define MTHCA_MPT_STATUS_SW 0xF0 +#define MTHCA_MPT_STATUS_HW 0x00 + +#define SINAI_FMR_KEY_INC 0x1000000 + +/* + * Buddy allocator for MTT segments (currently not very efficient + * since it doesn't keep a free list and just searches linearly + * through the bitmaps) + */ + +static u32 mthca_buddy_alloc(struct mthca_buddy *buddy, int order) +{ + int o; + int m; + u32 seg; + + spin_lock(&buddy->lock); + + for (o = order; o <= buddy->max_order; ++o) + if (buddy->num_free[o]) { + m = 1 << (buddy->max_order - o); + seg = find_first_bit(buddy->bits[o], m); + if (seg < m) + goto found; + } + + spin_unlock(&buddy->lock); + return -1; + + found: + clear_bit(seg, buddy->bits[o]); + --buddy->num_free[o]; + + while (o > order) { + --o; + seg <<= 1; + set_bit(seg ^ 1, buddy->bits[o]); + ++buddy->num_free[o]; + } + + spin_unlock(&buddy->lock); + + seg <<= order; + + return seg; +} + +static void mthca_buddy_free(struct mthca_buddy *buddy, u32 seg, int order) +{ + seg >>= order; + + spin_lock(&buddy->lock); + + while (test_bit(seg ^ 1, buddy->bits[order])) { + clear_bit(seg ^ 1, buddy->bits[order]); + --buddy->num_free[order]; + seg >>= 1; + ++order; + } + + set_bit(seg, buddy->bits[order]); + ++buddy->num_free[order]; + + spin_unlock(&buddy->lock); +} + +static int mthca_buddy_init(struct mthca_buddy *buddy, int max_order) +{ + int i, s; + + buddy->max_order = max_order; + spin_lock_init(&buddy->lock); + + buddy->bits = kzalloc((buddy->max_order + 1) * sizeof (long *), + GFP_KERNEL); + buddy->num_free = kcalloc((buddy->max_order + 1), sizeof *buddy->num_free, + GFP_KERNEL); + if (!buddy->bits || !buddy->num_free) + goto err_out; + + for (i = 0; i <= buddy->max_order; ++i) { + s = BITS_TO_LONGS(1 << (buddy->max_order - i)); + buddy->bits[i] = kmalloc(s * sizeof (long), GFP_KERNEL); + if (!buddy->bits[i]) + goto err_out_free; + bitmap_zero(buddy->bits[i], + 1 << (buddy->max_order - i)); + } + + set_bit(0, buddy->bits[buddy->max_order]); + buddy->num_free[buddy->max_order] = 1; + + return 0; + +err_out_free: + for (i = 0; i <= buddy->max_order; ++i) + kfree(buddy->bits[i]); + +err_out: + kfree(buddy->bits); + kfree(buddy->num_free); + + return -ENOMEM; +} + +static void mthca_buddy_cleanup(struct mthca_buddy *buddy) +{ + int i; + + for (i = 0; i <= buddy->max_order; ++i) + kfree(buddy->bits[i]); + + kfree(buddy->bits); + kfree(buddy->num_free); +} + +static u32 mthca_alloc_mtt_range(struct mthca_dev *dev, int order, + struct mthca_buddy *buddy) +{ + u32 seg = mthca_buddy_alloc(buddy, order); + + if (seg == -1) + return -1; + + if (mthca_is_memfree(dev)) + if (mthca_table_get_range(dev, dev->mr_table.mtt_table, seg, + seg + (1 << order) - 1)) { + mthca_buddy_free(buddy, seg, order); + seg = -1; + } + + return seg; +} + +static struct mthca_mtt *__mthca_alloc_mtt(struct mthca_dev *dev, int size, + struct mthca_buddy *buddy) +{ + struct mthca_mtt *mtt; + int i; + + if (size <= 0) + return ERR_PTR(-EINVAL); + + mtt = kmalloc(sizeof *mtt, GFP_KERNEL); + if (!mtt) + return ERR_PTR(-ENOMEM); + + mtt->buddy = buddy; + mtt->order = 0; + for (i = dev->limits.mtt_seg_size / 8; i < size; i <<= 1) + ++mtt->order; + + mtt->first_seg = mthca_alloc_mtt_range(dev, mtt->order, buddy); + if (mtt->first_seg == -1) { + kfree(mtt); + return ERR_PTR(-ENOMEM); + } + + return mtt; +} + +struct mthca_mtt *mthca_alloc_mtt(struct mthca_dev *dev, int size) +{ + return __mthca_alloc_mtt(dev, size, &dev->mr_table.mtt_buddy); +} + +void mthca_free_mtt(struct mthca_dev *dev, struct mthca_mtt *mtt) +{ + if (!mtt) + return; + + mthca_buddy_free(mtt->buddy, mtt->first_seg, mtt->order); + + mthca_table_put_range(dev, dev->mr_table.mtt_table, + mtt->first_seg, + mtt->first_seg + (1 << mtt->order) - 1); + + kfree(mtt); +} + +static int __mthca_write_mtt(struct mthca_dev *dev, struct mthca_mtt *mtt, + int start_index, u64 *buffer_list, int list_len) +{ + struct mthca_mailbox *mailbox; + __be64 *mtt_entry; + int err = 0; + int i; + + mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + mtt_entry = mailbox->buf; + + while (list_len > 0) { + mtt_entry[0] = cpu_to_be64(dev->mr_table.mtt_base + + mtt->first_seg * dev->limits.mtt_seg_size + + start_index * 8); + mtt_entry[1] = 0; + for (i = 0; i < list_len && i < MTHCA_MAILBOX_SIZE / 8 - 2; ++i) + mtt_entry[i + 2] = cpu_to_be64(buffer_list[i] | + MTHCA_MTT_FLAG_PRESENT); + + /* + * If we have an odd number of entries to write, add + * one more dummy entry for firmware efficiency. + */ + if (i & 1) + mtt_entry[i + 2] = 0; + + err = mthca_WRITE_MTT(dev, mailbox, (i + 1) & ~1); + if (err) { + mthca_warn(dev, "WRITE_MTT failed (%d)\n", err); + goto out; + } + + list_len -= i; + start_index += i; + buffer_list += i; + } + +out: + mthca_free_mailbox(dev, mailbox); + return err; +} + +int mthca_write_mtt_size(struct mthca_dev *dev) +{ + if (dev->mr_table.fmr_mtt_buddy != &dev->mr_table.mtt_buddy || + !(dev->mthca_flags & MTHCA_FLAG_FMR)) + /* + * Be friendly to WRITE_MTT command + * and leave two empty slots for the + * index and reserved fields of the + * mailbox. + */ + return PAGE_SIZE / sizeof (u64) - 2; + + /* For Arbel, all MTTs must fit in the same page. */ + return mthca_is_memfree(dev) ? (PAGE_SIZE / sizeof (u64)) : 0x7ffffff; +} + +static void mthca_tavor_write_mtt_seg(struct mthca_dev *dev, + struct mthca_mtt *mtt, int start_index, + u64 *buffer_list, int list_len) +{ + u64 __iomem *mtts; + int i; + + mtts = dev->mr_table.tavor_fmr.mtt_base + mtt->first_seg * dev->limits.mtt_seg_size + + start_index * sizeof (u64); + for (i = 0; i < list_len; ++i) + mthca_write64_raw(cpu_to_be64(buffer_list[i] | MTHCA_MTT_FLAG_PRESENT), + mtts + i); +} + +static void mthca_arbel_write_mtt_seg(struct mthca_dev *dev, + struct mthca_mtt *mtt, int start_index, + u64 *buffer_list, int list_len) +{ + __be64 *mtts; + dma_addr_t dma_handle; + int i; + int s = start_index * sizeof (u64); + + /* For Arbel, all MTTs must fit in the same page. */ + BUG_ON(s / PAGE_SIZE != (s + list_len * sizeof(u64) - 1) / PAGE_SIZE); + /* Require full segments */ + BUG_ON(s % dev->limits.mtt_seg_size); + + mtts = mthca_table_find(dev->mr_table.mtt_table, mtt->first_seg + + s / dev->limits.mtt_seg_size, &dma_handle); + + BUG_ON(!mtts); + + dma_sync_single_for_cpu(&dev->pdev->dev, dma_handle, + list_len * sizeof (u64), DMA_TO_DEVICE); + + for (i = 0; i < list_len; ++i) + mtts[i] = cpu_to_be64(buffer_list[i] | MTHCA_MTT_FLAG_PRESENT); + + dma_sync_single_for_device(&dev->pdev->dev, dma_handle, + list_len * sizeof (u64), DMA_TO_DEVICE); +} + +int mthca_write_mtt(struct mthca_dev *dev, struct mthca_mtt *mtt, + int start_index, u64 *buffer_list, int list_len) +{ + int size = mthca_write_mtt_size(dev); + int chunk; + + if (dev->mr_table.fmr_mtt_buddy != &dev->mr_table.mtt_buddy || + !(dev->mthca_flags & MTHCA_FLAG_FMR)) + return __mthca_write_mtt(dev, mtt, start_index, buffer_list, list_len); + + while (list_len > 0) { + chunk = min(size, list_len); + if (mthca_is_memfree(dev)) + mthca_arbel_write_mtt_seg(dev, mtt, start_index, + buffer_list, chunk); + else + mthca_tavor_write_mtt_seg(dev, mtt, start_index, + buffer_list, chunk); + + list_len -= chunk; + start_index += chunk; + buffer_list += chunk; + } + + return 0; +} + +static inline u32 tavor_hw_index_to_key(u32 ind) +{ + return ind; +} + +static inline u32 tavor_key_to_hw_index(u32 key) +{ + return key; +} + +static inline u32 arbel_hw_index_to_key(u32 ind) +{ + return (ind >> 24) | (ind << 8); +} + +static inline u32 arbel_key_to_hw_index(u32 key) +{ + return (key << 24) | (key >> 8); +} + +static inline u32 hw_index_to_key(struct mthca_dev *dev, u32 ind) +{ + if (mthca_is_memfree(dev)) + return arbel_hw_index_to_key(ind); + else + return tavor_hw_index_to_key(ind); +} + +static inline u32 key_to_hw_index(struct mthca_dev *dev, u32 key) +{ + if (mthca_is_memfree(dev)) + return arbel_key_to_hw_index(key); + else + return tavor_key_to_hw_index(key); +} + +static inline u32 adjust_key(struct mthca_dev *dev, u32 key) +{ + if (dev->mthca_flags & MTHCA_FLAG_SINAI_OPT) + return ((key << 20) & 0x800000) | (key & 0x7fffff); + else + return key; +} + +int mthca_mr_alloc(struct mthca_dev *dev, u32 pd, int buffer_size_shift, + u64 iova, u64 total_size, u32 access, struct mthca_mr *mr) +{ + struct mthca_mailbox *mailbox; + struct mthca_mpt_entry *mpt_entry; + u32 key; + int i; + int err; + + WARN_ON(buffer_size_shift >= 32); + + key = mthca_alloc(&dev->mr_table.mpt_alloc); + if (key == -1) + return -ENOMEM; + key = adjust_key(dev, key); + mr->ibmr.rkey = mr->ibmr.lkey = hw_index_to_key(dev, key); + + if (mthca_is_memfree(dev)) { + err = mthca_table_get(dev, dev->mr_table.mpt_table, key); + if (err) + goto err_out_mpt_free; + } + + mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL); + if (IS_ERR(mailbox)) { + err = PTR_ERR(mailbox); + goto err_out_table; + } + mpt_entry = mailbox->buf; + + mpt_entry->flags = cpu_to_be32(MTHCA_MPT_FLAG_SW_OWNS | + MTHCA_MPT_FLAG_MIO | + MTHCA_MPT_FLAG_REGION | + access); + if (!mr->mtt) + mpt_entry->flags |= cpu_to_be32(MTHCA_MPT_FLAG_PHYSICAL); + + mpt_entry->page_size = cpu_to_be32(buffer_size_shift - 12); + mpt_entry->key = cpu_to_be32(key); + mpt_entry->pd = cpu_to_be32(pd); + mpt_entry->start = cpu_to_be64(iova); + mpt_entry->length = cpu_to_be64(total_size); + + memset(&mpt_entry->lkey, 0, + sizeof *mpt_entry - offsetof(struct mthca_mpt_entry, lkey)); + + if (mr->mtt) + mpt_entry->mtt_seg = + cpu_to_be64(dev->mr_table.mtt_base + + mr->mtt->first_seg * dev->limits.mtt_seg_size); + + if (0) { + mthca_dbg(dev, "Dumping MPT entry %08x:\n", mr->ibmr.lkey); + for (i = 0; i < sizeof (struct mthca_mpt_entry) / 4; ++i) { + if (i % 4 == 0) + printk("[%02x] ", i * 4); + printk(" %08x", be32_to_cpu(((__be32 *) mpt_entry)[i])); + if ((i + 1) % 4 == 0) + printk("\n"); + } + } + + err = mthca_SW2HW_MPT(dev, mailbox, + key & (dev->limits.num_mpts - 1)); + if (err) { + mthca_warn(dev, "SW2HW_MPT failed (%d)\n", err); + goto err_out_mailbox; + } + + mthca_free_mailbox(dev, mailbox); + return err; + +err_out_mailbox: + mthca_free_mailbox(dev, mailbox); + +err_out_table: + mthca_table_put(dev, dev->mr_table.mpt_table, key); + +err_out_mpt_free: + mthca_free(&dev->mr_table.mpt_alloc, key); + return err; +} + +int mthca_mr_alloc_notrans(struct mthca_dev *dev, u32 pd, + u32 access, struct mthca_mr *mr) +{ + mr->mtt = NULL; + return mthca_mr_alloc(dev, pd, 12, 0, ~0ULL, access, mr); +} + +int mthca_mr_alloc_phys(struct mthca_dev *dev, u32 pd, + u64 *buffer_list, int buffer_size_shift, + int list_len, u64 iova, u64 total_size, + u32 access, struct mthca_mr *mr) +{ + int err; + + mr->mtt = mthca_alloc_mtt(dev, list_len); + if (IS_ERR(mr->mtt)) + return PTR_ERR(mr->mtt); + + err = mthca_write_mtt(dev, mr->mtt, 0, buffer_list, list_len); + if (err) { + mthca_free_mtt(dev, mr->mtt); + return err; + } + + err = mthca_mr_alloc(dev, pd, buffer_size_shift, iova, + total_size, access, mr); + if (err) + mthca_free_mtt(dev, mr->mtt); + + return err; +} + +/* Free mr or fmr */ +static void mthca_free_region(struct mthca_dev *dev, u32 lkey) +{ + mthca_table_put(dev, dev->mr_table.mpt_table, + key_to_hw_index(dev, lkey)); + + mthca_free(&dev->mr_table.mpt_alloc, key_to_hw_index(dev, lkey)); +} + +void mthca_free_mr(struct mthca_dev *dev, struct mthca_mr *mr) +{ + int err; + + err = mthca_HW2SW_MPT(dev, NULL, + key_to_hw_index(dev, mr->ibmr.lkey) & + (dev->limits.num_mpts - 1)); + if (err) + mthca_warn(dev, "HW2SW_MPT failed (%d)\n", err); + + mthca_free_region(dev, mr->ibmr.lkey); + mthca_free_mtt(dev, mr->mtt); +} + +int mthca_fmr_alloc(struct mthca_dev *dev, u32 pd, + u32 access, struct mthca_fmr *mr) +{ + struct mthca_mpt_entry *mpt_entry; + struct mthca_mailbox *mailbox; + u64 mtt_seg; + u32 key, idx; + int list_len = mr->attr.max_pages; + int err = -ENOMEM; + int i; + + if (mr->attr.page_shift < 12 || mr->attr.page_shift >= 32) + return -EINVAL; + + /* For Arbel, all MTTs must fit in the same page. */ + if (mthca_is_memfree(dev) && + mr->attr.max_pages * sizeof *mr->mem.arbel.mtts > PAGE_SIZE) + return -EINVAL; + + mr->maps = 0; + + key = mthca_alloc(&dev->mr_table.mpt_alloc); + if (key == -1) + return -ENOMEM; + key = adjust_key(dev, key); + + idx = key & (dev->limits.num_mpts - 1); + mr->ibmr.rkey = mr->ibmr.lkey = hw_index_to_key(dev, key); + + if (mthca_is_memfree(dev)) { + err = mthca_table_get(dev, dev->mr_table.mpt_table, key); + if (err) + goto err_out_mpt_free; + + mr->mem.arbel.mpt = mthca_table_find(dev->mr_table.mpt_table, key, NULL); + BUG_ON(!mr->mem.arbel.mpt); + } else + mr->mem.tavor.mpt = dev->mr_table.tavor_fmr.mpt_base + + sizeof *(mr->mem.tavor.mpt) * idx; + + mr->mtt = __mthca_alloc_mtt(dev, list_len, dev->mr_table.fmr_mtt_buddy); + if (IS_ERR(mr->mtt)) { + err = PTR_ERR(mr->mtt); + goto err_out_table; + } + + mtt_seg = mr->mtt->first_seg * dev->limits.mtt_seg_size; + + if (mthca_is_memfree(dev)) { + mr->mem.arbel.mtts = mthca_table_find(dev->mr_table.mtt_table, + mr->mtt->first_seg, + &mr->mem.arbel.dma_handle); + BUG_ON(!mr->mem.arbel.mtts); + } else + mr->mem.tavor.mtts = dev->mr_table.tavor_fmr.mtt_base + mtt_seg; + + mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL); + if (IS_ERR(mailbox)) { + err = PTR_ERR(mailbox); + goto err_out_free_mtt; + } + + mpt_entry = mailbox->buf; + + mpt_entry->flags = cpu_to_be32(MTHCA_MPT_FLAG_SW_OWNS | + MTHCA_MPT_FLAG_MIO | + MTHCA_MPT_FLAG_REGION | + access); + + mpt_entry->page_size = cpu_to_be32(mr->attr.page_shift - 12); + mpt_entry->key = cpu_to_be32(key); + mpt_entry->pd = cpu_to_be32(pd); + memset(&mpt_entry->start, 0, + sizeof *mpt_entry - offsetof(struct mthca_mpt_entry, start)); + mpt_entry->mtt_seg = cpu_to_be64(dev->mr_table.mtt_base + mtt_seg); + + if (0) { + mthca_dbg(dev, "Dumping MPT entry %08x:\n", mr->ibmr.lkey); + for (i = 0; i < sizeof (struct mthca_mpt_entry) / 4; ++i) { + if (i % 4 == 0) + printk("[%02x] ", i * 4); + printk(" %08x", be32_to_cpu(((__be32 *) mpt_entry)[i])); + if ((i + 1) % 4 == 0) + printk("\n"); + } + } + + err = mthca_SW2HW_MPT(dev, mailbox, + key & (dev->limits.num_mpts - 1)); + if (err) { + mthca_warn(dev, "SW2HW_MPT failed (%d)\n", err); + goto err_out_mailbox_free; + } + + mthca_free_mailbox(dev, mailbox); + return 0; + +err_out_mailbox_free: + mthca_free_mailbox(dev, mailbox); + +err_out_free_mtt: + mthca_free_mtt(dev, mr->mtt); + +err_out_table: + mthca_table_put(dev, dev->mr_table.mpt_table, key); + +err_out_mpt_free: + mthca_free(&dev->mr_table.mpt_alloc, key); + return err; +} + +int mthca_free_fmr(struct mthca_dev *dev, struct mthca_fmr *fmr) +{ + if (fmr->maps) + return -EBUSY; + + mthca_free_region(dev, fmr->ibmr.lkey); + mthca_free_mtt(dev, fmr->mtt); + + return 0; +} + +static inline int mthca_check_fmr(struct mthca_fmr *fmr, u64 *page_list, + int list_len, u64 iova) +{ + int i, page_mask; + + if (list_len > fmr->attr.max_pages) + return -EINVAL; + + page_mask = (1 << fmr->attr.page_shift) - 1; + + /* We are getting page lists, so va must be page aligned. */ + if (iova & page_mask) + return -EINVAL; + + /* Trust the user not to pass misaligned data in page_list */ + if (0) + for (i = 0; i < list_len; ++i) { + if (page_list[i] & ~page_mask) + return -EINVAL; + } + + if (fmr->maps >= fmr->attr.max_maps) + return -EINVAL; + + return 0; +} + + +int mthca_tavor_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list, + int list_len, u64 iova) +{ + struct mthca_fmr *fmr = to_mfmr(ibfmr); + struct mthca_dev *dev = to_mdev(ibfmr->device); + struct mthca_mpt_entry mpt_entry; + u32 key; + int i, err; + + err = mthca_check_fmr(fmr, page_list, list_len, iova); + if (err) + return err; + + ++fmr->maps; + + key = tavor_key_to_hw_index(fmr->ibmr.lkey); + key += dev->limits.num_mpts; + fmr->ibmr.lkey = fmr->ibmr.rkey = tavor_hw_index_to_key(key); + + writeb(MTHCA_MPT_STATUS_SW, fmr->mem.tavor.mpt); + + for (i = 0; i < list_len; ++i) { + __be64 mtt_entry = cpu_to_be64(page_list[i] | + MTHCA_MTT_FLAG_PRESENT); + mthca_write64_raw(mtt_entry, fmr->mem.tavor.mtts + i); + } + + mpt_entry.lkey = cpu_to_be32(key); + mpt_entry.length = cpu_to_be64(list_len * (1ull << fmr->attr.page_shift)); + mpt_entry.start = cpu_to_be64(iova); + + __raw_writel((__force u32) mpt_entry.lkey, &fmr->mem.tavor.mpt->key); + memcpy_toio(&fmr->mem.tavor.mpt->start, &mpt_entry.start, + offsetof(struct mthca_mpt_entry, window_count) - + offsetof(struct mthca_mpt_entry, start)); + + writeb(MTHCA_MPT_STATUS_HW, fmr->mem.tavor.mpt); + + return 0; +} + +int mthca_arbel_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list, + int list_len, u64 iova) +{ + struct mthca_fmr *fmr = to_mfmr(ibfmr); + struct mthca_dev *dev = to_mdev(ibfmr->device); + u32 key; + int i, err; + + err = mthca_check_fmr(fmr, page_list, list_len, iova); + if (err) + return err; + + ++fmr->maps; + + key = arbel_key_to_hw_index(fmr->ibmr.lkey); + if (dev->mthca_flags & MTHCA_FLAG_SINAI_OPT) + key += SINAI_FMR_KEY_INC; + else + key += dev->limits.num_mpts; + fmr->ibmr.lkey = fmr->ibmr.rkey = arbel_hw_index_to_key(key); + + *(u8 *) fmr->mem.arbel.mpt = MTHCA_MPT_STATUS_SW; + + wmb(); + + dma_sync_single_for_cpu(&dev->pdev->dev, fmr->mem.arbel.dma_handle, + list_len * sizeof(u64), DMA_TO_DEVICE); + + for (i = 0; i < list_len; ++i) + fmr->mem.arbel.mtts[i] = cpu_to_be64(page_list[i] | + MTHCA_MTT_FLAG_PRESENT); + + dma_sync_single_for_device(&dev->pdev->dev, fmr->mem.arbel.dma_handle, + list_len * sizeof(u64), DMA_TO_DEVICE); + + fmr->mem.arbel.mpt->key = cpu_to_be32(key); + fmr->mem.arbel.mpt->lkey = cpu_to_be32(key); + fmr->mem.arbel.mpt->length = cpu_to_be64(list_len * (1ull << fmr->attr.page_shift)); + fmr->mem.arbel.mpt->start = cpu_to_be64(iova); + + wmb(); + + *(u8 *) fmr->mem.arbel.mpt = MTHCA_MPT_STATUS_HW; + + wmb(); + + return 0; +} + +void mthca_tavor_fmr_unmap(struct mthca_dev *dev, struct mthca_fmr *fmr) +{ + if (!fmr->maps) + return; + + fmr->maps = 0; + + writeb(MTHCA_MPT_STATUS_SW, fmr->mem.tavor.mpt); +} + +void mthca_arbel_fmr_unmap(struct mthca_dev *dev, struct mthca_fmr *fmr) +{ + if (!fmr->maps) + return; + + fmr->maps = 0; + + *(u8 *) fmr->mem.arbel.mpt = MTHCA_MPT_STATUS_SW; +} + +int mthca_init_mr_table(struct mthca_dev *dev) +{ + phys_addr_t addr; + int mpts, mtts, err, i; + + err = mthca_alloc_init(&dev->mr_table.mpt_alloc, + dev->limits.num_mpts, + ~0, dev->limits.reserved_mrws); + if (err) + return err; + + if (!mthca_is_memfree(dev) && + (dev->mthca_flags & MTHCA_FLAG_DDR_HIDDEN)) + dev->limits.fmr_reserved_mtts = 0; + else + dev->mthca_flags |= MTHCA_FLAG_FMR; + + if (dev->mthca_flags & MTHCA_FLAG_SINAI_OPT) + mthca_dbg(dev, "Memory key throughput optimization activated.\n"); + + err = mthca_buddy_init(&dev->mr_table.mtt_buddy, + fls(dev->limits.num_mtt_segs - 1)); + + if (err) + goto err_mtt_buddy; + + dev->mr_table.tavor_fmr.mpt_base = NULL; + dev->mr_table.tavor_fmr.mtt_base = NULL; + + if (dev->limits.fmr_reserved_mtts) { + i = fls(dev->limits.fmr_reserved_mtts - 1); + + if (i >= 31) { + mthca_warn(dev, "Unable to reserve 2^31 FMR MTTs.\n"); + err = -EINVAL; + goto err_fmr_mpt; + } + mpts = mtts = 1 << i; + } else { + mtts = dev->limits.num_mtt_segs; + mpts = dev->limits.num_mpts; + } + + if (!mthca_is_memfree(dev) && + (dev->mthca_flags & MTHCA_FLAG_FMR)) { + + addr = pci_resource_start(dev->pdev, 4) + + ((pci_resource_len(dev->pdev, 4) - 1) & + dev->mr_table.mpt_base); + + dev->mr_table.tavor_fmr.mpt_base = + ioremap(addr, mpts * sizeof(struct mthca_mpt_entry)); + + if (!dev->mr_table.tavor_fmr.mpt_base) { + mthca_warn(dev, "MPT ioremap for FMR failed.\n"); + err = -ENOMEM; + goto err_fmr_mpt; + } + + addr = pci_resource_start(dev->pdev, 4) + + ((pci_resource_len(dev->pdev, 4) - 1) & + dev->mr_table.mtt_base); + + dev->mr_table.tavor_fmr.mtt_base = + ioremap(addr, mtts * dev->limits.mtt_seg_size); + if (!dev->mr_table.tavor_fmr.mtt_base) { + mthca_warn(dev, "MTT ioremap for FMR failed.\n"); + err = -ENOMEM; + goto err_fmr_mtt; + } + } + + if (dev->limits.fmr_reserved_mtts) { + err = mthca_buddy_init(&dev->mr_table.tavor_fmr.mtt_buddy, fls(mtts - 1)); + if (err) + goto err_fmr_mtt_buddy; + + /* Prevent regular MRs from using FMR keys */ + err = mthca_buddy_alloc(&dev->mr_table.mtt_buddy, fls(mtts - 1)); + if (err) + goto err_reserve_fmr; + + dev->mr_table.fmr_mtt_buddy = + &dev->mr_table.tavor_fmr.mtt_buddy; + } else + dev->mr_table.fmr_mtt_buddy = &dev->mr_table.mtt_buddy; + + /* FMR table is always the first, take reserved MTTs out of there */ + if (dev->limits.reserved_mtts) { + i = fls(dev->limits.reserved_mtts - 1); + + if (mthca_alloc_mtt_range(dev, i, + dev->mr_table.fmr_mtt_buddy) == -1) { + mthca_warn(dev, "MTT table of order %d is too small.\n", + dev->mr_table.fmr_mtt_buddy->max_order); + err = -ENOMEM; + goto err_reserve_mtts; + } + } + + return 0; + +err_reserve_mtts: +err_reserve_fmr: + if (dev->limits.fmr_reserved_mtts) + mthca_buddy_cleanup(&dev->mr_table.tavor_fmr.mtt_buddy); + +err_fmr_mtt_buddy: + if (dev->mr_table.tavor_fmr.mtt_base) + iounmap(dev->mr_table.tavor_fmr.mtt_base); + +err_fmr_mtt: + if (dev->mr_table.tavor_fmr.mpt_base) + iounmap(dev->mr_table.tavor_fmr.mpt_base); + +err_fmr_mpt: + mthca_buddy_cleanup(&dev->mr_table.mtt_buddy); + +err_mtt_buddy: + mthca_alloc_cleanup(&dev->mr_table.mpt_alloc); + + return err; +} + +void mthca_cleanup_mr_table(struct mthca_dev *dev) +{ + /* XXX check if any MRs are still allocated? */ + if (dev->limits.fmr_reserved_mtts) + mthca_buddy_cleanup(&dev->mr_table.tavor_fmr.mtt_buddy); + + mthca_buddy_cleanup(&dev->mr_table.mtt_buddy); + + if (dev->mr_table.tavor_fmr.mtt_base) + iounmap(dev->mr_table.tavor_fmr.mtt_base); + if (dev->mr_table.tavor_fmr.mpt_base) + iounmap(dev->mr_table.tavor_fmr.mpt_base); + + mthca_alloc_cleanup(&dev->mr_table.mpt_alloc); +} diff --git a/drivers/infiniband/hw/mthca/mthca_pd.c b/drivers/infiniband/hw/mthca/mthca_pd.c new file mode 100644 index 0000000..266f14e --- /dev/null +++ b/drivers/infiniband/hw/mthca/mthca_pd.c @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2004 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Cisco Systems. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include + +#include "mthca_dev.h" + +int mthca_pd_alloc(struct mthca_dev *dev, int privileged, struct mthca_pd *pd) +{ + int err = 0; + + pd->privileged = privileged; + + atomic_set(&pd->sqp_count, 0); + pd->pd_num = mthca_alloc(&dev->pd_table.alloc); + if (pd->pd_num == -1) + return -ENOMEM; + + if (privileged) { + err = mthca_mr_alloc_notrans(dev, pd->pd_num, + MTHCA_MPT_FLAG_LOCAL_READ | + MTHCA_MPT_FLAG_LOCAL_WRITE, + &pd->ntmr); + if (err) + mthca_free(&dev->pd_table.alloc, pd->pd_num); + } + + return err; +} + +void mthca_pd_free(struct mthca_dev *dev, struct mthca_pd *pd) +{ + if (pd->privileged) + mthca_free_mr(dev, &pd->ntmr); + mthca_free(&dev->pd_table.alloc, pd->pd_num); +} + +int mthca_init_pd_table(struct mthca_dev *dev) +{ + return mthca_alloc_init(&dev->pd_table.alloc, + dev->limits.num_pds, + (1 << 24) - 1, + dev->limits.reserved_pds); +} + +void mthca_cleanup_pd_table(struct mthca_dev *dev) +{ + /* XXX check if any PDs are still allocated? */ + mthca_alloc_cleanup(&dev->pd_table.alloc); +} diff --git a/drivers/infiniband/hw/mthca/mthca_profile.c b/drivers/infiniband/hw/mthca/mthca_profile.c new file mode 100644 index 0000000..8edb28a --- /dev/null +++ b/drivers/infiniband/hw/mthca/mthca_profile.c @@ -0,0 +1,285 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include + +#include "mthca_profile.h" + +enum { + MTHCA_RES_QP, + MTHCA_RES_EEC, + MTHCA_RES_SRQ, + MTHCA_RES_CQ, + MTHCA_RES_EQP, + MTHCA_RES_EEEC, + MTHCA_RES_EQ, + MTHCA_RES_RDB, + MTHCA_RES_MCG, + MTHCA_RES_MPT, + MTHCA_RES_MTT, + MTHCA_RES_UAR, + MTHCA_RES_UDAV, + MTHCA_RES_UARC, + MTHCA_RES_NUM +}; + +enum { + MTHCA_NUM_EQS = 32, + MTHCA_NUM_PDS = 1 << 15 +}; + +s64 mthca_make_profile(struct mthca_dev *dev, + struct mthca_profile *request, + struct mthca_dev_lim *dev_lim, + struct mthca_init_hca_param *init_hca) +{ + struct mthca_resource { + u64 size; + u64 start; + int type; + int num; + int log_num; + }; + + u64 mem_base, mem_avail; + s64 total_size = 0; + struct mthca_resource *profile; + struct mthca_resource tmp; + int i, j; + + profile = kzalloc(MTHCA_RES_NUM * sizeof *profile, GFP_KERNEL); + if (!profile) + return -ENOMEM; + + profile[MTHCA_RES_QP].size = dev_lim->qpc_entry_sz; + profile[MTHCA_RES_EEC].size = dev_lim->eec_entry_sz; + profile[MTHCA_RES_SRQ].size = dev_lim->srq_entry_sz; + profile[MTHCA_RES_CQ].size = dev_lim->cqc_entry_sz; + profile[MTHCA_RES_EQP].size = dev_lim->eqpc_entry_sz; + profile[MTHCA_RES_EEEC].size = dev_lim->eeec_entry_sz; + profile[MTHCA_RES_EQ].size = dev_lim->eqc_entry_sz; + profile[MTHCA_RES_RDB].size = MTHCA_RDB_ENTRY_SIZE; + profile[MTHCA_RES_MCG].size = MTHCA_MGM_ENTRY_SIZE; + profile[MTHCA_RES_MPT].size = dev_lim->mpt_entry_sz; + profile[MTHCA_RES_MTT].size = dev->limits.mtt_seg_size; + profile[MTHCA_RES_UAR].size = dev_lim->uar_scratch_entry_sz; + profile[MTHCA_RES_UDAV].size = MTHCA_AV_SIZE; + profile[MTHCA_RES_UARC].size = request->uarc_size; + + profile[MTHCA_RES_QP].num = request->num_qp; + profile[MTHCA_RES_SRQ].num = request->num_srq; + profile[MTHCA_RES_EQP].num = request->num_qp; + profile[MTHCA_RES_RDB].num = request->num_qp * request->rdb_per_qp; + profile[MTHCA_RES_CQ].num = request->num_cq; + profile[MTHCA_RES_EQ].num = MTHCA_NUM_EQS; + profile[MTHCA_RES_MCG].num = request->num_mcg; + profile[MTHCA_RES_MPT].num = request->num_mpt; + profile[MTHCA_RES_MTT].num = request->num_mtt; + profile[MTHCA_RES_UAR].num = request->num_uar; + profile[MTHCA_RES_UARC].num = request->num_uar; + profile[MTHCA_RES_UDAV].num = request->num_udav; + + for (i = 0; i < MTHCA_RES_NUM; ++i) { + profile[i].type = i; + profile[i].log_num = max(ffs(profile[i].num) - 1, 0); + profile[i].size *= profile[i].num; + if (mthca_is_memfree(dev)) + profile[i].size = max(profile[i].size, (u64) PAGE_SIZE); + } + + if (mthca_is_memfree(dev)) { + mem_base = 0; + mem_avail = dev_lim->hca.arbel.max_icm_sz; + } else { + mem_base = dev->ddr_start; + mem_avail = dev->fw.tavor.fw_start - dev->ddr_start; + } + + /* + * Sort the resources in decreasing order of size. Since they + * all have sizes that are powers of 2, we'll be able to keep + * resources aligned to their size and pack them without gaps + * using the sorted order. + */ + for (i = MTHCA_RES_NUM; i > 0; --i) + for (j = 1; j < i; ++j) { + if (profile[j].size > profile[j - 1].size) { + tmp = profile[j]; + profile[j] = profile[j - 1]; + profile[j - 1] = tmp; + } + } + + for (i = 0; i < MTHCA_RES_NUM; ++i) { + if (profile[i].size) { + profile[i].start = mem_base + total_size; + total_size += profile[i].size; + } + if (total_size > mem_avail) { + mthca_err(dev, "Profile requires 0x%llx bytes; " + "won't fit in 0x%llx bytes of context memory.\n", + (unsigned long long) total_size, + (unsigned long long) mem_avail); + kfree(profile); + return -ENOMEM; + } + + if (profile[i].size) + mthca_dbg(dev, "profile[%2d]--%2d/%2d @ 0x%16llx " + "(size 0x%8llx)\n", + i, profile[i].type, profile[i].log_num, + (unsigned long long) profile[i].start, + (unsigned long long) profile[i].size); + } + + if (mthca_is_memfree(dev)) + mthca_dbg(dev, "HCA context memory: reserving %d KB\n", + (int) (total_size >> 10)); + else + mthca_dbg(dev, "HCA memory: allocated %d KB/%d KB (%d KB free)\n", + (int) (total_size >> 10), (int) (mem_avail >> 10), + (int) ((mem_avail - total_size) >> 10)); + + for (i = 0; i < MTHCA_RES_NUM; ++i) { + switch (profile[i].type) { + case MTHCA_RES_QP: + dev->limits.num_qps = profile[i].num; + init_hca->qpc_base = profile[i].start; + init_hca->log_num_qps = profile[i].log_num; + break; + case MTHCA_RES_EEC: + dev->limits.num_eecs = profile[i].num; + init_hca->eec_base = profile[i].start; + init_hca->log_num_eecs = profile[i].log_num; + break; + case MTHCA_RES_SRQ: + dev->limits.num_srqs = profile[i].num; + init_hca->srqc_base = profile[i].start; + init_hca->log_num_srqs = profile[i].log_num; + break; + case MTHCA_RES_CQ: + dev->limits.num_cqs = profile[i].num; + init_hca->cqc_base = profile[i].start; + init_hca->log_num_cqs = profile[i].log_num; + break; + case MTHCA_RES_EQP: + init_hca->eqpc_base = profile[i].start; + break; + case MTHCA_RES_EEEC: + init_hca->eeec_base = profile[i].start; + break; + case MTHCA_RES_EQ: + dev->limits.num_eqs = profile[i].num; + init_hca->eqc_base = profile[i].start; + init_hca->log_num_eqs = profile[i].log_num; + break; + case MTHCA_RES_RDB: + for (dev->qp_table.rdb_shift = 0; + request->num_qp << dev->qp_table.rdb_shift < profile[i].num; + ++dev->qp_table.rdb_shift) + ; /* nothing */ + dev->qp_table.rdb_base = (u32) profile[i].start; + init_hca->rdb_base = profile[i].start; + break; + case MTHCA_RES_MCG: + dev->limits.num_mgms = profile[i].num >> 1; + dev->limits.num_amgms = profile[i].num >> 1; + init_hca->mc_base = profile[i].start; + init_hca->log_mc_entry_sz = ffs(MTHCA_MGM_ENTRY_SIZE) - 1; + init_hca->log_mc_table_sz = profile[i].log_num; + init_hca->mc_hash_sz = 1 << (profile[i].log_num - 1); + break; + case MTHCA_RES_MPT: + dev->limits.num_mpts = profile[i].num; + dev->mr_table.mpt_base = profile[i].start; + init_hca->mpt_base = profile[i].start; + init_hca->log_mpt_sz = profile[i].log_num; + break; + case MTHCA_RES_MTT: + dev->limits.num_mtt_segs = profile[i].num; + dev->mr_table.mtt_base = profile[i].start; + init_hca->mtt_base = profile[i].start; + init_hca->mtt_seg_sz = ffs(dev->limits.mtt_seg_size) - 7; + break; + case MTHCA_RES_UAR: + dev->limits.num_uars = profile[i].num; + init_hca->uar_scratch_base = profile[i].start; + break; + case MTHCA_RES_UDAV: + dev->av_table.ddr_av_base = profile[i].start; + dev->av_table.num_ddr_avs = profile[i].num; + break; + case MTHCA_RES_UARC: + dev->uar_table.uarc_size = request->uarc_size; + dev->uar_table.uarc_base = profile[i].start; + init_hca->uarc_base = profile[i].start; + init_hca->log_uarc_sz = ffs(request->uarc_size) - 13; + init_hca->log_uar_sz = ffs(request->num_uar) - 1; + break; + default: + break; + } + } + + /* + * PDs don't take any HCA memory, but we assign them as part + * of the HCA profile anyway. + */ + dev->limits.num_pds = MTHCA_NUM_PDS; + + if (dev->mthca_flags & MTHCA_FLAG_SINAI_OPT && + init_hca->log_mpt_sz > 23) { + mthca_warn(dev, "MPT table too large (requested size 2^%d >= 2^24)\n", + init_hca->log_mpt_sz); + mthca_warn(dev, "Disabling memory key throughput optimization.\n"); + dev->mthca_flags &= ~MTHCA_FLAG_SINAI_OPT; + } + + /* + * For Tavor, FMRs use ioremapped PCI memory. For 32 bit + * systems it may use too much vmalloc space to map all MTT + * memory, so we reserve some MTTs for FMR access, taking them + * out of the MR pool. They don't use additional memory, but + * we assign them as part of the HCA profile anyway. + */ + if (mthca_is_memfree(dev) || BITS_PER_LONG == 64) + dev->limits.fmr_reserved_mtts = 0; + else + dev->limits.fmr_reserved_mtts = request->fmr_reserved_mtts; + + kfree(profile); + return total_size; +} diff --git a/drivers/infiniband/hw/mthca/mthca_profile.h b/drivers/infiniband/hw/mthca/mthca_profile.h new file mode 100644 index 0000000..62b009c --- /dev/null +++ b/drivers/infiniband/hw/mthca/mthca_profile.h @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef MTHCA_PROFILE_H +#define MTHCA_PROFILE_H + +#include "mthca_dev.h" +#include "mthca_cmd.h" + +struct mthca_profile { + int num_qp; + int rdb_per_qp; + int num_srq; + int num_cq; + int num_mcg; + int num_mpt; + int num_mtt; + int num_udav; + int num_uar; + int uarc_size; + int fmr_reserved_mtts; +}; + +s64 mthca_make_profile(struct mthca_dev *mdev, + struct mthca_profile *request, + struct mthca_dev_lim *dev_lim, + struct mthca_init_hca_param *init_hca); + +#endif /* MTHCA_PROFILE_H */ diff --git a/drivers/infiniband/hw/mthca/mthca_provider.c b/drivers/infiniband/hw/mthca/mthca_provider.c new file mode 100644 index 0000000..415f8e1 --- /dev/null +++ b/drivers/infiniband/hw/mthca/mthca_provider.c @@ -0,0 +1,1375 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies. All rights reserved. + * Copyright (c) 2004 Voltaire, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "mthca_dev.h" +#include "mthca_cmd.h" +#include "mthca_user.h" +#include "mthca_memfree.h" + +static void init_query_mad(struct ib_smp *mad) +{ + mad->base_version = 1; + mad->mgmt_class = IB_MGMT_CLASS_SUBN_LID_ROUTED; + mad->class_version = 1; + mad->method = IB_MGMT_METHOD_GET; +} + +static int mthca_query_device(struct ib_device *ibdev, + struct ib_device_attr *props) +{ + struct ib_smp *in_mad = NULL; + struct ib_smp *out_mad = NULL; + int err = -ENOMEM; + struct mthca_dev *mdev = to_mdev(ibdev); + + in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); + out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); + if (!in_mad || !out_mad) + goto out; + + memset(props, 0, sizeof *props); + + props->fw_ver = mdev->fw_ver; + + init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_NODE_INFO; + + err = mthca_MAD_IFC(mdev, 1, 1, + 1, NULL, NULL, in_mad, out_mad); + if (err) + goto out; + + props->device_cap_flags = mdev->device_cap_flags; + props->vendor_id = be32_to_cpup((__be32 *) (out_mad->data + 36)) & + 0xffffff; + props->vendor_part_id = be16_to_cpup((__be16 *) (out_mad->data + 30)); + props->hw_ver = be32_to_cpup((__be32 *) (out_mad->data + 32)); + memcpy(&props->sys_image_guid, out_mad->data + 4, 8); + + props->max_mr_size = ~0ull; + props->page_size_cap = mdev->limits.page_size_cap; + props->max_qp = mdev->limits.num_qps - mdev->limits.reserved_qps; + props->max_qp_wr = mdev->limits.max_wqes; + props->max_sge = mdev->limits.max_sg; + props->max_cq = mdev->limits.num_cqs - mdev->limits.reserved_cqs; + props->max_cqe = mdev->limits.max_cqes; + props->max_mr = mdev->limits.num_mpts - mdev->limits.reserved_mrws; + props->max_pd = mdev->limits.num_pds - mdev->limits.reserved_pds; + props->max_qp_rd_atom = 1 << mdev->qp_table.rdb_shift; + props->max_qp_init_rd_atom = mdev->limits.max_qp_init_rdma; + props->max_res_rd_atom = props->max_qp_rd_atom * props->max_qp; + props->max_srq = mdev->limits.num_srqs - mdev->limits.reserved_srqs; + props->max_srq_wr = mdev->limits.max_srq_wqes; + props->max_srq_sge = mdev->limits.max_srq_sge; + props->local_ca_ack_delay = mdev->limits.local_ca_ack_delay; + props->atomic_cap = mdev->limits.flags & DEV_LIM_FLAG_ATOMIC ? + IB_ATOMIC_HCA : IB_ATOMIC_NONE; + props->max_pkeys = mdev->limits.pkey_table_len; + props->max_mcast_grp = mdev->limits.num_mgms + mdev->limits.num_amgms; + props->max_mcast_qp_attach = MTHCA_QP_PER_MGM; + props->max_total_mcast_qp_attach = props->max_mcast_qp_attach * + props->max_mcast_grp; + /* + * If Sinai memory key optimization is being used, then only + * the 8-bit key portion will change. For other HCAs, the + * unused index bits will also be used for FMR remapping. + */ + if (mdev->mthca_flags & MTHCA_FLAG_SINAI_OPT) + props->max_map_per_fmr = 255; + else + props->max_map_per_fmr = + (1 << (32 - ilog2(mdev->limits.num_mpts))) - 1; + + err = 0; + out: + kfree(in_mad); + kfree(out_mad); + return err; +} + +static int mthca_query_port(struct ib_device *ibdev, + u8 port, struct ib_port_attr *props) +{ + struct ib_smp *in_mad = NULL; + struct ib_smp *out_mad = NULL; + int err = -ENOMEM; + + in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); + out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); + if (!in_mad || !out_mad) + goto out; + + memset(props, 0, sizeof *props); + + init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_PORT_INFO; + in_mad->attr_mod = cpu_to_be32(port); + + err = mthca_MAD_IFC(to_mdev(ibdev), 1, 1, + port, NULL, NULL, in_mad, out_mad); + if (err) + goto out; + + props->lid = be16_to_cpup((__be16 *) (out_mad->data + 16)); + props->lmc = out_mad->data[34] & 0x7; + props->sm_lid = be16_to_cpup((__be16 *) (out_mad->data + 18)); + props->sm_sl = out_mad->data[36] & 0xf; + props->state = out_mad->data[32] & 0xf; + props->phys_state = out_mad->data[33] >> 4; + props->port_cap_flags = be32_to_cpup((__be32 *) (out_mad->data + 20)); + props->gid_tbl_len = to_mdev(ibdev)->limits.gid_table_len; + props->max_msg_sz = 0x80000000; + props->pkey_tbl_len = to_mdev(ibdev)->limits.pkey_table_len; + props->bad_pkey_cntr = be16_to_cpup((__be16 *) (out_mad->data + 46)); + props->qkey_viol_cntr = be16_to_cpup((__be16 *) (out_mad->data + 48)); + props->active_width = out_mad->data[31] & 0xf; + props->active_speed = out_mad->data[35] >> 4; + props->max_mtu = out_mad->data[41] & 0xf; + props->active_mtu = out_mad->data[36] >> 4; + props->subnet_timeout = out_mad->data[51] & 0x1f; + props->max_vl_num = out_mad->data[37] >> 4; + props->init_type_reply = out_mad->data[41] >> 4; + + out: + kfree(in_mad); + kfree(out_mad); + return err; +} + +static int mthca_modify_device(struct ib_device *ibdev, + int mask, + struct ib_device_modify *props) +{ + if (mask & ~IB_DEVICE_MODIFY_NODE_DESC) + return -EOPNOTSUPP; + + if (mask & IB_DEVICE_MODIFY_NODE_DESC) { + if (mutex_lock_interruptible(&to_mdev(ibdev)->cap_mask_mutex)) + return -ERESTARTSYS; + memcpy(ibdev->node_desc, props->node_desc, 64); + mutex_unlock(&to_mdev(ibdev)->cap_mask_mutex); + } + + return 0; +} + +static int mthca_modify_port(struct ib_device *ibdev, + u8 port, int port_modify_mask, + struct ib_port_modify *props) +{ + struct mthca_set_ib_param set_ib; + struct ib_port_attr attr; + int err; + + if (mutex_lock_interruptible(&to_mdev(ibdev)->cap_mask_mutex)) + return -ERESTARTSYS; + + err = mthca_query_port(ibdev, port, &attr); + if (err) + goto out; + + set_ib.set_si_guid = 0; + set_ib.reset_qkey_viol = !!(port_modify_mask & IB_PORT_RESET_QKEY_CNTR); + + set_ib.cap_mask = (attr.port_cap_flags | props->set_port_cap_mask) & + ~props->clr_port_cap_mask; + + err = mthca_SET_IB(to_mdev(ibdev), &set_ib, port); + if (err) + goto out; +out: + mutex_unlock(&to_mdev(ibdev)->cap_mask_mutex); + return err; +} + +static int mthca_query_pkey(struct ib_device *ibdev, + u8 port, u16 index, u16 *pkey) +{ + struct ib_smp *in_mad = NULL; + struct ib_smp *out_mad = NULL; + int err = -ENOMEM; + + in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); + out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); + if (!in_mad || !out_mad) + goto out; + + init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_PKEY_TABLE; + in_mad->attr_mod = cpu_to_be32(index / 32); + + err = mthca_MAD_IFC(to_mdev(ibdev), 1, 1, + port, NULL, NULL, in_mad, out_mad); + if (err) + goto out; + + *pkey = be16_to_cpu(((__be16 *) out_mad->data)[index % 32]); + + out: + kfree(in_mad); + kfree(out_mad); + return err; +} + +static int mthca_query_gid(struct ib_device *ibdev, u8 port, + int index, union ib_gid *gid) +{ + struct ib_smp *in_mad = NULL; + struct ib_smp *out_mad = NULL; + int err = -ENOMEM; + + in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); + out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); + if (!in_mad || !out_mad) + goto out; + + init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_PORT_INFO; + in_mad->attr_mod = cpu_to_be32(port); + + err = mthca_MAD_IFC(to_mdev(ibdev), 1, 1, + port, NULL, NULL, in_mad, out_mad); + if (err) + goto out; + + memcpy(gid->raw, out_mad->data + 8, 8); + + init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_GUID_INFO; + in_mad->attr_mod = cpu_to_be32(index / 8); + + err = mthca_MAD_IFC(to_mdev(ibdev), 1, 1, + port, NULL, NULL, in_mad, out_mad); + if (err) + goto out; + + memcpy(gid->raw + 8, out_mad->data + (index % 8) * 8, 8); + + out: + kfree(in_mad); + kfree(out_mad); + return err; +} + +static struct ib_ucontext *mthca_alloc_ucontext(struct ib_device *ibdev, + struct ib_udata *udata) +{ + struct mthca_alloc_ucontext_resp uresp; + struct mthca_ucontext *context; + int err; + + if (!(to_mdev(ibdev)->active)) + return ERR_PTR(-EAGAIN); + + memset(&uresp, 0, sizeof uresp); + + uresp.qp_tab_size = to_mdev(ibdev)->limits.num_qps; + if (mthca_is_memfree(to_mdev(ibdev))) + uresp.uarc_size = to_mdev(ibdev)->uar_table.uarc_size; + else + uresp.uarc_size = 0; + + context = kmalloc(sizeof *context, GFP_KERNEL); + if (!context) + return ERR_PTR(-ENOMEM); + + err = mthca_uar_alloc(to_mdev(ibdev), &context->uar); + if (err) { + kfree(context); + return ERR_PTR(err); + } + + context->db_tab = mthca_init_user_db_tab(to_mdev(ibdev)); + if (IS_ERR(context->db_tab)) { + err = PTR_ERR(context->db_tab); + mthca_uar_free(to_mdev(ibdev), &context->uar); + kfree(context); + return ERR_PTR(err); + } + + if (ib_copy_to_udata(udata, &uresp, sizeof uresp)) { + mthca_cleanup_user_db_tab(to_mdev(ibdev), &context->uar, context->db_tab); + mthca_uar_free(to_mdev(ibdev), &context->uar); + kfree(context); + return ERR_PTR(-EFAULT); + } + + context->reg_mr_warned = 0; + + return &context->ibucontext; +} + +static int mthca_dealloc_ucontext(struct ib_ucontext *context) +{ + mthca_cleanup_user_db_tab(to_mdev(context->device), &to_mucontext(context)->uar, + to_mucontext(context)->db_tab); + mthca_uar_free(to_mdev(context->device), &to_mucontext(context)->uar); + kfree(to_mucontext(context)); + + return 0; +} + +static int mthca_mmap_uar(struct ib_ucontext *context, + struct vm_area_struct *vma) +{ + if (vma->vm_end - vma->vm_start != PAGE_SIZE) + return -EINVAL; + + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + + if (io_remap_pfn_range(vma, vma->vm_start, + to_mucontext(context)->uar.pfn, + PAGE_SIZE, vma->vm_page_prot)) + return -EAGAIN; + + return 0; +} + +static struct ib_pd *mthca_alloc_pd(struct ib_device *ibdev, + struct ib_ucontext *context, + struct ib_udata *udata) +{ + struct mthca_pd *pd; + int err; + + pd = kmalloc(sizeof *pd, GFP_KERNEL); + if (!pd) + return ERR_PTR(-ENOMEM); + + err = mthca_pd_alloc(to_mdev(ibdev), !context, pd); + if (err) { + kfree(pd); + return ERR_PTR(err); + } + + if (context) { + if (ib_copy_to_udata(udata, &pd->pd_num, sizeof (__u32))) { + mthca_pd_free(to_mdev(ibdev), pd); + kfree(pd); + return ERR_PTR(-EFAULT); + } + } + + return &pd->ibpd; +} + +static int mthca_dealloc_pd(struct ib_pd *pd) +{ + mthca_pd_free(to_mdev(pd->device), to_mpd(pd)); + kfree(pd); + + return 0; +} + +static struct ib_ah *mthca_ah_create(struct ib_pd *pd, + struct ib_ah_attr *ah_attr) +{ + int err; + struct mthca_ah *ah; + + ah = kmalloc(sizeof *ah, GFP_ATOMIC); + if (!ah) + return ERR_PTR(-ENOMEM); + + err = mthca_create_ah(to_mdev(pd->device), to_mpd(pd), ah_attr, ah); + if (err) { + kfree(ah); + return ERR_PTR(err); + } + + return &ah->ibah; +} + +static int mthca_ah_destroy(struct ib_ah *ah) +{ + mthca_destroy_ah(to_mdev(ah->device), to_mah(ah)); + kfree(ah); + + return 0; +} + +static struct ib_srq *mthca_create_srq(struct ib_pd *pd, + struct ib_srq_init_attr *init_attr, + struct ib_udata *udata) +{ + struct mthca_create_srq ucmd; + struct mthca_ucontext *context = NULL; + struct mthca_srq *srq; + int err; + + if (init_attr->srq_type != IB_SRQT_BASIC) + return ERR_PTR(-ENOSYS); + + srq = kmalloc(sizeof *srq, GFP_KERNEL); + if (!srq) + return ERR_PTR(-ENOMEM); + + if (pd->uobject) { + context = to_mucontext(pd->uobject->context); + + if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd)) { + err = -EFAULT; + goto err_free; + } + + err = mthca_map_user_db(to_mdev(pd->device), &context->uar, + context->db_tab, ucmd.db_index, + ucmd.db_page); + + if (err) + goto err_free; + + srq->mr.ibmr.lkey = ucmd.lkey; + srq->db_index = ucmd.db_index; + } + + err = mthca_alloc_srq(to_mdev(pd->device), to_mpd(pd), + &init_attr->attr, srq); + + if (err && pd->uobject) + mthca_unmap_user_db(to_mdev(pd->device), &context->uar, + context->db_tab, ucmd.db_index); + + if (err) + goto err_free; + + if (context && ib_copy_to_udata(udata, &srq->srqn, sizeof (__u32))) { + mthca_free_srq(to_mdev(pd->device), srq); + err = -EFAULT; + goto err_free; + } + + return &srq->ibsrq; + +err_free: + kfree(srq); + + return ERR_PTR(err); +} + +static int mthca_destroy_srq(struct ib_srq *srq) +{ + struct mthca_ucontext *context; + + if (srq->uobject) { + context = to_mucontext(srq->uobject->context); + + mthca_unmap_user_db(to_mdev(srq->device), &context->uar, + context->db_tab, to_msrq(srq)->db_index); + } + + mthca_free_srq(to_mdev(srq->device), to_msrq(srq)); + kfree(srq); + + return 0; +} + +static struct ib_qp *mthca_create_qp(struct ib_pd *pd, + struct ib_qp_init_attr *init_attr, + struct ib_udata *udata) +{ + struct mthca_create_qp ucmd; + struct mthca_qp *qp; + int err; + + if (init_attr->create_flags) + return ERR_PTR(-EINVAL); + + switch (init_attr->qp_type) { + case IB_QPT_RC: + case IB_QPT_UC: + case IB_QPT_UD: + { + struct mthca_ucontext *context; + + qp = kmalloc(sizeof *qp, GFP_KERNEL); + if (!qp) + return ERR_PTR(-ENOMEM); + + if (pd->uobject) { + context = to_mucontext(pd->uobject->context); + + if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd)) { + kfree(qp); + return ERR_PTR(-EFAULT); + } + + err = mthca_map_user_db(to_mdev(pd->device), &context->uar, + context->db_tab, + ucmd.sq_db_index, ucmd.sq_db_page); + if (err) { + kfree(qp); + return ERR_PTR(err); + } + + err = mthca_map_user_db(to_mdev(pd->device), &context->uar, + context->db_tab, + ucmd.rq_db_index, ucmd.rq_db_page); + if (err) { + mthca_unmap_user_db(to_mdev(pd->device), + &context->uar, + context->db_tab, + ucmd.sq_db_index); + kfree(qp); + return ERR_PTR(err); + } + + qp->mr.ibmr.lkey = ucmd.lkey; + qp->sq.db_index = ucmd.sq_db_index; + qp->rq.db_index = ucmd.rq_db_index; + } + + err = mthca_alloc_qp(to_mdev(pd->device), to_mpd(pd), + to_mcq(init_attr->send_cq), + to_mcq(init_attr->recv_cq), + init_attr->qp_type, init_attr->sq_sig_type, + &init_attr->cap, qp); + + if (err && pd->uobject) { + context = to_mucontext(pd->uobject->context); + + mthca_unmap_user_db(to_mdev(pd->device), + &context->uar, + context->db_tab, + ucmd.sq_db_index); + mthca_unmap_user_db(to_mdev(pd->device), + &context->uar, + context->db_tab, + ucmd.rq_db_index); + } + + qp->ibqp.qp_num = qp->qpn; + break; + } + case IB_QPT_SMI: + case IB_QPT_GSI: + { + /* Don't allow userspace to create special QPs */ + if (pd->uobject) + return ERR_PTR(-EINVAL); + + qp = kmalloc(sizeof (struct mthca_sqp), GFP_KERNEL); + if (!qp) + return ERR_PTR(-ENOMEM); + + qp->ibqp.qp_num = init_attr->qp_type == IB_QPT_SMI ? 0 : 1; + + err = mthca_alloc_sqp(to_mdev(pd->device), to_mpd(pd), + to_mcq(init_attr->send_cq), + to_mcq(init_attr->recv_cq), + init_attr->sq_sig_type, &init_attr->cap, + qp->ibqp.qp_num, init_attr->port_num, + to_msqp(qp)); + break; + } + default: + /* Don't support raw QPs */ + return ERR_PTR(-ENOSYS); + } + + if (err) { + kfree(qp); + return ERR_PTR(err); + } + + init_attr->cap.max_send_wr = qp->sq.max; + init_attr->cap.max_recv_wr = qp->rq.max; + init_attr->cap.max_send_sge = qp->sq.max_gs; + init_attr->cap.max_recv_sge = qp->rq.max_gs; + init_attr->cap.max_inline_data = qp->max_inline_data; + + return &qp->ibqp; +} + +static int mthca_destroy_qp(struct ib_qp *qp) +{ + if (qp->uobject) { + mthca_unmap_user_db(to_mdev(qp->device), + &to_mucontext(qp->uobject->context)->uar, + to_mucontext(qp->uobject->context)->db_tab, + to_mqp(qp)->sq.db_index); + mthca_unmap_user_db(to_mdev(qp->device), + &to_mucontext(qp->uobject->context)->uar, + to_mucontext(qp->uobject->context)->db_tab, + to_mqp(qp)->rq.db_index); + } + mthca_free_qp(to_mdev(qp->device), to_mqp(qp)); + kfree(qp); + return 0; +} + +static struct ib_cq *mthca_create_cq(struct ib_device *ibdev, int entries, + int comp_vector, + struct ib_ucontext *context, + struct ib_udata *udata) +{ + struct mthca_create_cq ucmd; + struct mthca_cq *cq; + int nent; + int err; + + if (entries < 1 || entries > to_mdev(ibdev)->limits.max_cqes) + return ERR_PTR(-EINVAL); + + if (context) { + if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd)) + return ERR_PTR(-EFAULT); + + err = mthca_map_user_db(to_mdev(ibdev), &to_mucontext(context)->uar, + to_mucontext(context)->db_tab, + ucmd.set_db_index, ucmd.set_db_page); + if (err) + return ERR_PTR(err); + + err = mthca_map_user_db(to_mdev(ibdev), &to_mucontext(context)->uar, + to_mucontext(context)->db_tab, + ucmd.arm_db_index, ucmd.arm_db_page); + if (err) + goto err_unmap_set; + } + + cq = kmalloc(sizeof *cq, GFP_KERNEL); + if (!cq) { + err = -ENOMEM; + goto err_unmap_arm; + } + + if (context) { + cq->buf.mr.ibmr.lkey = ucmd.lkey; + cq->set_ci_db_index = ucmd.set_db_index; + cq->arm_db_index = ucmd.arm_db_index; + } + + for (nent = 1; nent <= entries; nent <<= 1) + ; /* nothing */ + + err = mthca_init_cq(to_mdev(ibdev), nent, + context ? to_mucontext(context) : NULL, + context ? ucmd.pdn : to_mdev(ibdev)->driver_pd.pd_num, + cq); + if (err) + goto err_free; + + if (context && ib_copy_to_udata(udata, &cq->cqn, sizeof (__u32))) { + mthca_free_cq(to_mdev(ibdev), cq); + err = -EFAULT; + goto err_free; + } + + cq->resize_buf = NULL; + + return &cq->ibcq; + +err_free: + kfree(cq); + +err_unmap_arm: + if (context) + mthca_unmap_user_db(to_mdev(ibdev), &to_mucontext(context)->uar, + to_mucontext(context)->db_tab, ucmd.arm_db_index); + +err_unmap_set: + if (context) + mthca_unmap_user_db(to_mdev(ibdev), &to_mucontext(context)->uar, + to_mucontext(context)->db_tab, ucmd.set_db_index); + + return ERR_PTR(err); +} + +static int mthca_alloc_resize_buf(struct mthca_dev *dev, struct mthca_cq *cq, + int entries) +{ + int ret; + + spin_lock_irq(&cq->lock); + if (cq->resize_buf) { + ret = -EBUSY; + goto unlock; + } + + cq->resize_buf = kmalloc(sizeof *cq->resize_buf, GFP_ATOMIC); + if (!cq->resize_buf) { + ret = -ENOMEM; + goto unlock; + } + + cq->resize_buf->state = CQ_RESIZE_ALLOC; + + ret = 0; + +unlock: + spin_unlock_irq(&cq->lock); + + if (ret) + return ret; + + ret = mthca_alloc_cq_buf(dev, &cq->resize_buf->buf, entries); + if (ret) { + spin_lock_irq(&cq->lock); + kfree(cq->resize_buf); + cq->resize_buf = NULL; + spin_unlock_irq(&cq->lock); + return ret; + } + + cq->resize_buf->cqe = entries - 1; + + spin_lock_irq(&cq->lock); + cq->resize_buf->state = CQ_RESIZE_READY; + spin_unlock_irq(&cq->lock); + + return 0; +} + +static int mthca_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata) +{ + struct mthca_dev *dev = to_mdev(ibcq->device); + struct mthca_cq *cq = to_mcq(ibcq); + struct mthca_resize_cq ucmd; + u32 lkey; + int ret; + + if (entries < 1 || entries > dev->limits.max_cqes) + return -EINVAL; + + mutex_lock(&cq->mutex); + + entries = roundup_pow_of_two(entries + 1); + if (entries == ibcq->cqe + 1) { + ret = 0; + goto out; + } + + if (cq->is_kernel) { + ret = mthca_alloc_resize_buf(dev, cq, entries); + if (ret) + goto out; + lkey = cq->resize_buf->buf.mr.ibmr.lkey; + } else { + if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd)) { + ret = -EFAULT; + goto out; + } + lkey = ucmd.lkey; + } + + ret = mthca_RESIZE_CQ(dev, cq->cqn, lkey, ilog2(entries)); + + if (ret) { + if (cq->resize_buf) { + mthca_free_cq_buf(dev, &cq->resize_buf->buf, + cq->resize_buf->cqe); + kfree(cq->resize_buf); + spin_lock_irq(&cq->lock); + cq->resize_buf = NULL; + spin_unlock_irq(&cq->lock); + } + goto out; + } + + if (cq->is_kernel) { + struct mthca_cq_buf tbuf; + int tcqe; + + spin_lock_irq(&cq->lock); + if (cq->resize_buf->state == CQ_RESIZE_READY) { + mthca_cq_resize_copy_cqes(cq); + tbuf = cq->buf; + tcqe = cq->ibcq.cqe; + cq->buf = cq->resize_buf->buf; + cq->ibcq.cqe = cq->resize_buf->cqe; + } else { + tbuf = cq->resize_buf->buf; + tcqe = cq->resize_buf->cqe; + } + + kfree(cq->resize_buf); + cq->resize_buf = NULL; + spin_unlock_irq(&cq->lock); + + mthca_free_cq_buf(dev, &tbuf, tcqe); + } else + ibcq->cqe = entries - 1; + +out: + mutex_unlock(&cq->mutex); + + return ret; +} + +static int mthca_destroy_cq(struct ib_cq *cq) +{ + if (cq->uobject) { + mthca_unmap_user_db(to_mdev(cq->device), + &to_mucontext(cq->uobject->context)->uar, + to_mucontext(cq->uobject->context)->db_tab, + to_mcq(cq)->arm_db_index); + mthca_unmap_user_db(to_mdev(cq->device), + &to_mucontext(cq->uobject->context)->uar, + to_mucontext(cq->uobject->context)->db_tab, + to_mcq(cq)->set_ci_db_index); + } + mthca_free_cq(to_mdev(cq->device), to_mcq(cq)); + kfree(cq); + + return 0; +} + +static inline u32 convert_access(int acc) +{ + return (acc & IB_ACCESS_REMOTE_ATOMIC ? MTHCA_MPT_FLAG_ATOMIC : 0) | + (acc & IB_ACCESS_REMOTE_WRITE ? MTHCA_MPT_FLAG_REMOTE_WRITE : 0) | + (acc & IB_ACCESS_REMOTE_READ ? MTHCA_MPT_FLAG_REMOTE_READ : 0) | + (acc & IB_ACCESS_LOCAL_WRITE ? MTHCA_MPT_FLAG_LOCAL_WRITE : 0) | + MTHCA_MPT_FLAG_LOCAL_READ; +} + +static struct ib_mr *mthca_get_dma_mr(struct ib_pd *pd, int acc) +{ + struct mthca_mr *mr; + int err; + + mr = kmalloc(sizeof *mr, GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + err = mthca_mr_alloc_notrans(to_mdev(pd->device), + to_mpd(pd)->pd_num, + convert_access(acc), mr); + + if (err) { + kfree(mr); + return ERR_PTR(err); + } + + mr->umem = NULL; + + return &mr->ibmr; +} + +static struct ib_mr *mthca_reg_phys_mr(struct ib_pd *pd, + struct ib_phys_buf *buffer_list, + int num_phys_buf, + int acc, + u64 *iova_start) +{ + struct mthca_mr *mr; + u64 *page_list; + u64 total_size; + unsigned long mask; + int shift; + int npages; + int err; + int i, j, n; + + mask = buffer_list[0].addr ^ *iova_start; + total_size = 0; + for (i = 0; i < num_phys_buf; ++i) { + if (i != 0) + mask |= buffer_list[i].addr; + if (i != num_phys_buf - 1) + mask |= buffer_list[i].addr + buffer_list[i].size; + + total_size += buffer_list[i].size; + } + + if (mask & ~PAGE_MASK) + return ERR_PTR(-EINVAL); + + shift = __ffs(mask | 1 << 31); + + buffer_list[0].size += buffer_list[0].addr & ((1ULL << shift) - 1); + buffer_list[0].addr &= ~0ull << shift; + + mr = kmalloc(sizeof *mr, GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + npages = 0; + for (i = 0; i < num_phys_buf; ++i) + npages += (buffer_list[i].size + (1ULL << shift) - 1) >> shift; + + if (!npages) + return &mr->ibmr; + + page_list = kmalloc(npages * sizeof *page_list, GFP_KERNEL); + if (!page_list) { + kfree(mr); + return ERR_PTR(-ENOMEM); + } + + n = 0; + for (i = 0; i < num_phys_buf; ++i) + for (j = 0; + j < (buffer_list[i].size + (1ULL << shift) - 1) >> shift; + ++j) + page_list[n++] = buffer_list[i].addr + ((u64) j << shift); + + mthca_dbg(to_mdev(pd->device), "Registering memory at %llx (iova %llx) " + "in PD %x; shift %d, npages %d.\n", + (unsigned long long) buffer_list[0].addr, + (unsigned long long) *iova_start, + to_mpd(pd)->pd_num, + shift, npages); + + err = mthca_mr_alloc_phys(to_mdev(pd->device), + to_mpd(pd)->pd_num, + page_list, shift, npages, + *iova_start, total_size, + convert_access(acc), mr); + + if (err) { + kfree(page_list); + kfree(mr); + return ERR_PTR(err); + } + + kfree(page_list); + mr->umem = NULL; + + return &mr->ibmr; +} + +static struct ib_mr *mthca_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, + u64 virt, int acc, struct ib_udata *udata) +{ + struct mthca_dev *dev = to_mdev(pd->device); + struct scatterlist *sg; + struct mthca_mr *mr; + struct mthca_reg_mr ucmd; + u64 *pages; + int shift, n, len; + int i, k, entry; + int err = 0; + int write_mtt_size; + + if (udata->inlen - sizeof (struct ib_uverbs_cmd_hdr) < sizeof ucmd) { + if (!to_mucontext(pd->uobject->context)->reg_mr_warned) { + mthca_warn(dev, "Process '%s' did not pass in MR attrs.\n", + current->comm); + mthca_warn(dev, " Update libmthca to fix this.\n"); + } + ++to_mucontext(pd->uobject->context)->reg_mr_warned; + ucmd.mr_attrs = 0; + } else if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd)) + return ERR_PTR(-EFAULT); + + mr = kmalloc(sizeof *mr, GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + mr->umem = ib_umem_get(pd->uobject->context, start, length, acc, + ucmd.mr_attrs & MTHCA_MR_DMASYNC); + + if (IS_ERR(mr->umem)) { + err = PTR_ERR(mr->umem); + goto err; + } + + shift = ffs(mr->umem->page_size) - 1; + n = mr->umem->nmap; + + mr->mtt = mthca_alloc_mtt(dev, n); + if (IS_ERR(mr->mtt)) { + err = PTR_ERR(mr->mtt); + goto err_umem; + } + + pages = (u64 *) __get_free_page(GFP_KERNEL); + if (!pages) { + err = -ENOMEM; + goto err_mtt; + } + + i = n = 0; + + write_mtt_size = min(mthca_write_mtt_size(dev), (int) (PAGE_SIZE / sizeof *pages)); + + for_each_sg(mr->umem->sg_head.sgl, sg, mr->umem->nmap, entry) { + len = sg_dma_len(sg) >> shift; + for (k = 0; k < len; ++k) { + pages[i++] = sg_dma_address(sg) + + mr->umem->page_size * k; + /* + * Be friendly to write_mtt and pass it chunks + * of appropriate size. + */ + if (i == write_mtt_size) { + err = mthca_write_mtt(dev, mr->mtt, n, pages, i); + if (err) + goto mtt_done; + n += i; + i = 0; + } + } + } + + if (i) + err = mthca_write_mtt(dev, mr->mtt, n, pages, i); +mtt_done: + free_page((unsigned long) pages); + if (err) + goto err_mtt; + + err = mthca_mr_alloc(dev, to_mpd(pd)->pd_num, shift, virt, length, + convert_access(acc), mr); + + if (err) + goto err_mtt; + + return &mr->ibmr; + +err_mtt: + mthca_free_mtt(dev, mr->mtt); + +err_umem: + ib_umem_release(mr->umem); + +err: + kfree(mr); + return ERR_PTR(err); +} + +static int mthca_dereg_mr(struct ib_mr *mr) +{ + struct mthca_mr *mmr = to_mmr(mr); + + mthca_free_mr(to_mdev(mr->device), mmr); + if (mmr->umem) + ib_umem_release(mmr->umem); + kfree(mmr); + + return 0; +} + +static struct ib_fmr *mthca_alloc_fmr(struct ib_pd *pd, int mr_access_flags, + struct ib_fmr_attr *fmr_attr) +{ + struct mthca_fmr *fmr; + int err; + + fmr = kmalloc(sizeof *fmr, GFP_KERNEL); + if (!fmr) + return ERR_PTR(-ENOMEM); + + memcpy(&fmr->attr, fmr_attr, sizeof *fmr_attr); + err = mthca_fmr_alloc(to_mdev(pd->device), to_mpd(pd)->pd_num, + convert_access(mr_access_flags), fmr); + + if (err) { + kfree(fmr); + return ERR_PTR(err); + } + + return &fmr->ibmr; +} + +static int mthca_dealloc_fmr(struct ib_fmr *fmr) +{ + struct mthca_fmr *mfmr = to_mfmr(fmr); + int err; + + err = mthca_free_fmr(to_mdev(fmr->device), mfmr); + if (err) + return err; + + kfree(mfmr); + return 0; +} + +static int mthca_unmap_fmr(struct list_head *fmr_list) +{ + struct ib_fmr *fmr; + int err; + struct mthca_dev *mdev = NULL; + + list_for_each_entry(fmr, fmr_list, list) { + if (mdev && to_mdev(fmr->device) != mdev) + return -EINVAL; + mdev = to_mdev(fmr->device); + } + + if (!mdev) + return 0; + + if (mthca_is_memfree(mdev)) { + list_for_each_entry(fmr, fmr_list, list) + mthca_arbel_fmr_unmap(mdev, to_mfmr(fmr)); + + wmb(); + } else + list_for_each_entry(fmr, fmr_list, list) + mthca_tavor_fmr_unmap(mdev, to_mfmr(fmr)); + + err = mthca_SYNC_TPT(mdev); + return err; +} + +static ssize_t show_rev(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct mthca_dev *dev = + container_of(device, struct mthca_dev, ib_dev.dev); + return sprintf(buf, "%x\n", dev->rev_id); +} + +static ssize_t show_fw_ver(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct mthca_dev *dev = + container_of(device, struct mthca_dev, ib_dev.dev); + return sprintf(buf, "%d.%d.%d\n", (int) (dev->fw_ver >> 32), + (int) (dev->fw_ver >> 16) & 0xffff, + (int) dev->fw_ver & 0xffff); +} + +static ssize_t show_hca(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct mthca_dev *dev = + container_of(device, struct mthca_dev, ib_dev.dev); + switch (dev->pdev->device) { + case PCI_DEVICE_ID_MELLANOX_TAVOR: + return sprintf(buf, "MT23108\n"); + case PCI_DEVICE_ID_MELLANOX_ARBEL_COMPAT: + return sprintf(buf, "MT25208 (MT23108 compat mode)\n"); + case PCI_DEVICE_ID_MELLANOX_ARBEL: + return sprintf(buf, "MT25208\n"); + case PCI_DEVICE_ID_MELLANOX_SINAI: + case PCI_DEVICE_ID_MELLANOX_SINAI_OLD: + return sprintf(buf, "MT25204\n"); + default: + return sprintf(buf, "unknown\n"); + } +} + +static ssize_t show_board(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct mthca_dev *dev = + container_of(device, struct mthca_dev, ib_dev.dev); + return sprintf(buf, "%.*s\n", MTHCA_BOARD_ID_LEN, dev->board_id); +} + +static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL); +static DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL); +static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL); +static DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL); + +static struct device_attribute *mthca_dev_attributes[] = { + &dev_attr_hw_rev, + &dev_attr_fw_ver, + &dev_attr_hca_type, + &dev_attr_board_id +}; + +static int mthca_init_node_data(struct mthca_dev *dev) +{ + struct ib_smp *in_mad = NULL; + struct ib_smp *out_mad = NULL; + int err = -ENOMEM; + + in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); + out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); + if (!in_mad || !out_mad) + goto out; + + init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_NODE_DESC; + + err = mthca_MAD_IFC(dev, 1, 1, + 1, NULL, NULL, in_mad, out_mad); + if (err) + goto out; + + memcpy(dev->ib_dev.node_desc, out_mad->data, 64); + + in_mad->attr_id = IB_SMP_ATTR_NODE_INFO; + + err = mthca_MAD_IFC(dev, 1, 1, + 1, NULL, NULL, in_mad, out_mad); + if (err) + goto out; + + if (mthca_is_memfree(dev)) + dev->rev_id = be32_to_cpup((__be32 *) (out_mad->data + 32)); + memcpy(&dev->ib_dev.node_guid, out_mad->data + 12, 8); + +out: + kfree(in_mad); + kfree(out_mad); + return err; +} + +int mthca_register_device(struct mthca_dev *dev) +{ + int ret; + int i; + + ret = mthca_init_node_data(dev); + if (ret) + return ret; + + strlcpy(dev->ib_dev.name, "mthca%d", IB_DEVICE_NAME_MAX); + dev->ib_dev.owner = THIS_MODULE; + + dev->ib_dev.uverbs_abi_ver = MTHCA_UVERBS_ABI_VERSION; + dev->ib_dev.uverbs_cmd_mask = + (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | + (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) | + (1ull << IB_USER_VERBS_CMD_QUERY_PORT) | + (1ull << IB_USER_VERBS_CMD_ALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_REG_MR) | + (1ull << IB_USER_VERBS_CMD_DEREG_MR) | + (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | + (1ull << IB_USER_VERBS_CMD_CREATE_CQ) | + (1ull << IB_USER_VERBS_CMD_RESIZE_CQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) | + (1ull << IB_USER_VERBS_CMD_CREATE_QP) | + (1ull << IB_USER_VERBS_CMD_QUERY_QP) | + (1ull << IB_USER_VERBS_CMD_MODIFY_QP) | + (1ull << IB_USER_VERBS_CMD_DESTROY_QP) | + (1ull << IB_USER_VERBS_CMD_ATTACH_MCAST) | + (1ull << IB_USER_VERBS_CMD_DETACH_MCAST); + dev->ib_dev.node_type = RDMA_NODE_IB_CA; + dev->ib_dev.phys_port_cnt = dev->limits.num_ports; + dev->ib_dev.num_comp_vectors = 1; + dev->ib_dev.dma_device = &dev->pdev->dev; + dev->ib_dev.query_device = mthca_query_device; + dev->ib_dev.query_port = mthca_query_port; + dev->ib_dev.modify_device = mthca_modify_device; + dev->ib_dev.modify_port = mthca_modify_port; + dev->ib_dev.query_pkey = mthca_query_pkey; + dev->ib_dev.query_gid = mthca_query_gid; + dev->ib_dev.alloc_ucontext = mthca_alloc_ucontext; + dev->ib_dev.dealloc_ucontext = mthca_dealloc_ucontext; + dev->ib_dev.mmap = mthca_mmap_uar; + dev->ib_dev.alloc_pd = mthca_alloc_pd; + dev->ib_dev.dealloc_pd = mthca_dealloc_pd; + dev->ib_dev.create_ah = mthca_ah_create; + dev->ib_dev.query_ah = mthca_ah_query; + dev->ib_dev.destroy_ah = mthca_ah_destroy; + + if (dev->mthca_flags & MTHCA_FLAG_SRQ) { + dev->ib_dev.create_srq = mthca_create_srq; + dev->ib_dev.modify_srq = mthca_modify_srq; + dev->ib_dev.query_srq = mthca_query_srq; + dev->ib_dev.destroy_srq = mthca_destroy_srq; + dev->ib_dev.uverbs_cmd_mask |= + (1ull << IB_USER_VERBS_CMD_CREATE_SRQ) | + (1ull << IB_USER_VERBS_CMD_MODIFY_SRQ) | + (1ull << IB_USER_VERBS_CMD_QUERY_SRQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ); + + if (mthca_is_memfree(dev)) + dev->ib_dev.post_srq_recv = mthca_arbel_post_srq_recv; + else + dev->ib_dev.post_srq_recv = mthca_tavor_post_srq_recv; + } + + dev->ib_dev.create_qp = mthca_create_qp; + dev->ib_dev.modify_qp = mthca_modify_qp; + dev->ib_dev.query_qp = mthca_query_qp; + dev->ib_dev.destroy_qp = mthca_destroy_qp; + dev->ib_dev.create_cq = mthca_create_cq; + dev->ib_dev.resize_cq = mthca_resize_cq; + dev->ib_dev.destroy_cq = mthca_destroy_cq; + dev->ib_dev.poll_cq = mthca_poll_cq; + dev->ib_dev.get_dma_mr = mthca_get_dma_mr; + dev->ib_dev.reg_phys_mr = mthca_reg_phys_mr; + dev->ib_dev.reg_user_mr = mthca_reg_user_mr; + dev->ib_dev.dereg_mr = mthca_dereg_mr; + + if (dev->mthca_flags & MTHCA_FLAG_FMR) { + dev->ib_dev.alloc_fmr = mthca_alloc_fmr; + dev->ib_dev.unmap_fmr = mthca_unmap_fmr; + dev->ib_dev.dealloc_fmr = mthca_dealloc_fmr; + if (mthca_is_memfree(dev)) + dev->ib_dev.map_phys_fmr = mthca_arbel_map_phys_fmr; + else + dev->ib_dev.map_phys_fmr = mthca_tavor_map_phys_fmr; + } + + dev->ib_dev.attach_mcast = mthca_multicast_attach; + dev->ib_dev.detach_mcast = mthca_multicast_detach; + dev->ib_dev.process_mad = mthca_process_mad; + + if (mthca_is_memfree(dev)) { + dev->ib_dev.req_notify_cq = mthca_arbel_arm_cq; + dev->ib_dev.post_send = mthca_arbel_post_send; + dev->ib_dev.post_recv = mthca_arbel_post_receive; + } else { + dev->ib_dev.req_notify_cq = mthca_tavor_arm_cq; + dev->ib_dev.post_send = mthca_tavor_post_send; + dev->ib_dev.post_recv = mthca_tavor_post_receive; + } + + mutex_init(&dev->cap_mask_mutex); + + ret = ib_register_device(&dev->ib_dev, NULL); + if (ret) + return ret; + + for (i = 0; i < ARRAY_SIZE(mthca_dev_attributes); ++i) { + ret = device_create_file(&dev->ib_dev.dev, + mthca_dev_attributes[i]); + if (ret) { + ib_unregister_device(&dev->ib_dev); + return ret; + } + } + + mthca_start_catas_poll(dev); + + return 0; +} + +void mthca_unregister_device(struct mthca_dev *dev) +{ + mthca_stop_catas_poll(dev); + ib_unregister_device(&dev->ib_dev); +} diff --git a/drivers/infiniband/hw/mthca/mthca_provider.h b/drivers/infiniband/hw/mthca/mthca_provider.h new file mode 100644 index 0000000..596acc4 --- /dev/null +++ b/drivers/infiniband/hw/mthca/mthca_provider.h @@ -0,0 +1,344 @@ +/* + * Copyright (c) 2004 Topspin Communications. All rights reserved. + * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef MTHCA_PROVIDER_H +#define MTHCA_PROVIDER_H + +#include +#include + +#define MTHCA_MPT_FLAG_ATOMIC (1 << 14) +#define MTHCA_MPT_FLAG_REMOTE_WRITE (1 << 13) +#define MTHCA_MPT_FLAG_REMOTE_READ (1 << 12) +#define MTHCA_MPT_FLAG_LOCAL_WRITE (1 << 11) +#define MTHCA_MPT_FLAG_LOCAL_READ (1 << 10) + +struct mthca_buf_list { + void *buf; + DEFINE_DMA_UNMAP_ADDR(mapping); +}; + +union mthca_buf { + struct mthca_buf_list direct; + struct mthca_buf_list *page_list; +}; + +struct mthca_uar { + unsigned long pfn; + int index; +}; + +struct mthca_user_db_table; + +struct mthca_ucontext { + struct ib_ucontext ibucontext; + struct mthca_uar uar; + struct mthca_user_db_table *db_tab; + int reg_mr_warned; +}; + +struct mthca_mtt; + +struct mthca_mr { + struct ib_mr ibmr; + struct ib_umem *umem; + struct mthca_mtt *mtt; +}; + +struct mthca_fmr { + struct ib_fmr ibmr; + struct ib_fmr_attr attr; + struct mthca_mtt *mtt; + int maps; + union { + struct { + struct mthca_mpt_entry __iomem *mpt; + u64 __iomem *mtts; + } tavor; + struct { + struct mthca_mpt_entry *mpt; + __be64 *mtts; + dma_addr_t dma_handle; + } arbel; + } mem; +}; + +struct mthca_pd { + struct ib_pd ibpd; + u32 pd_num; + atomic_t sqp_count; + struct mthca_mr ntmr; + int privileged; +}; + +struct mthca_eq { + struct mthca_dev *dev; + int eqn; + u32 eqn_mask; + u32 cons_index; + u16 msi_x_vector; + u16 msi_x_entry; + int have_irq; + int nent; + struct mthca_buf_list *page_list; + struct mthca_mr mr; + char irq_name[IB_DEVICE_NAME_MAX]; +}; + +struct mthca_av; + +enum mthca_ah_type { + MTHCA_AH_ON_HCA, + MTHCA_AH_PCI_POOL, + MTHCA_AH_KMALLOC +}; + +struct mthca_ah { + struct ib_ah ibah; + enum mthca_ah_type type; + u32 key; + struct mthca_av *av; + dma_addr_t avdma; +}; + +/* + * Quick description of our CQ/QP locking scheme: + * + * We have one global lock that protects dev->cq/qp_table. Each + * struct mthca_cq/qp also has its own lock. An individual qp lock + * may be taken inside of an individual cq lock. Both cqs attached to + * a qp may be locked, with the cq with the lower cqn locked first. + * No other nesting should be done. + * + * Each struct mthca_cq/qp also has an ref count, protected by the + * corresponding table lock. The pointer from the cq/qp_table to the + * struct counts as one reference. This reference also is good for + * access through the consumer API, so modifying the CQ/QP etc doesn't + * need to take another reference. Access to a QP because of a + * completion being polled does not need a reference either. + * + * Finally, each struct mthca_cq/qp has a wait_queue_head_t for the + * destroy function to sleep on. + * + * This means that access from the consumer API requires nothing but + * taking the struct's lock. + * + * Access because of a completion event should go as follows: + * - lock cq/qp_table and look up struct + * - increment ref count in struct + * - drop cq/qp_table lock + * - lock struct, do your thing, and unlock struct + * - decrement ref count; if zero, wake up waiters + * + * To destroy a CQ/QP, we can do the following: + * - lock cq/qp_table + * - remove pointer and decrement ref count + * - unlock cq/qp_table lock + * - wait_event until ref count is zero + * + * It is the consumer's responsibilty to make sure that no QP + * operations (WQE posting or state modification) are pending when a + * QP is destroyed. Also, the consumer must make sure that calls to + * qp_modify are serialized. Similarly, the consumer is responsible + * for ensuring that no CQ resize operations are pending when a CQ + * is destroyed. + * + * Possible optimizations (wait for profile data to see if/where we + * have locks bouncing between CPUs): + * - split cq/qp table lock into n separate (cache-aligned) locks, + * indexed (say) by the page in the table + * - split QP struct lock into three (one for common info, one for the + * send queue and one for the receive queue) + */ + +struct mthca_cq_buf { + union mthca_buf queue; + struct mthca_mr mr; + int is_direct; +}; + +struct mthca_cq_resize { + struct mthca_cq_buf buf; + int cqe; + enum { + CQ_RESIZE_ALLOC, + CQ_RESIZE_READY, + CQ_RESIZE_SWAPPED + } state; +}; + +struct mthca_cq { + struct ib_cq ibcq; + spinlock_t lock; + int refcount; + int cqn; + u32 cons_index; + struct mthca_cq_buf buf; + struct mthca_cq_resize *resize_buf; + int is_kernel; + + /* Next fields are Arbel only */ + int set_ci_db_index; + __be32 *set_ci_db; + int arm_db_index; + __be32 *arm_db; + int arm_sn; + + wait_queue_head_t wait; + struct mutex mutex; +}; + +struct mthca_srq { + struct ib_srq ibsrq; + spinlock_t lock; + int refcount; + int srqn; + int max; + int max_gs; + int wqe_shift; + int first_free; + int last_free; + u16 counter; /* Arbel only */ + int db_index; /* Arbel only */ + __be32 *db; /* Arbel only */ + void *last; + + int is_direct; + u64 *wrid; + union mthca_buf queue; + struct mthca_mr mr; + + wait_queue_head_t wait; + struct mutex mutex; +}; + +struct mthca_wq { + spinlock_t lock; + int max; + unsigned next_ind; + unsigned last_comp; + unsigned head; + unsigned tail; + void *last; + int max_gs; + int wqe_shift; + + int db_index; /* Arbel only */ + __be32 *db; +}; + +struct mthca_qp { + struct ib_qp ibqp; + int refcount; + u32 qpn; + int is_direct; + u8 port; /* for SQP and memfree use only */ + u8 alt_port; /* for memfree use only */ + u8 transport; + u8 state; + u8 atomic_rd_en; + u8 resp_depth; + + struct mthca_mr mr; + + struct mthca_wq rq; + struct mthca_wq sq; + enum ib_sig_type sq_policy; + int send_wqe_offset; + int max_inline_data; + + u64 *wrid; + union mthca_buf queue; + + wait_queue_head_t wait; + struct mutex mutex; +}; + +struct mthca_sqp { + struct mthca_qp qp; + int pkey_index; + u32 qkey; + u32 send_psn; + struct ib_ud_header ud_header; + int header_buf_size; + void *header_buf; + dma_addr_t header_dma; +}; + +static inline struct mthca_ucontext *to_mucontext(struct ib_ucontext *ibucontext) +{ + return container_of(ibucontext, struct mthca_ucontext, ibucontext); +} + +static inline struct mthca_fmr *to_mfmr(struct ib_fmr *ibmr) +{ + return container_of(ibmr, struct mthca_fmr, ibmr); +} + +static inline struct mthca_mr *to_mmr(struct ib_mr *ibmr) +{ + return container_of(ibmr, struct mthca_mr, ibmr); +} + +static inline struct mthca_pd *to_mpd(struct ib_pd *ibpd) +{ + return container_of(ibpd, struct mthca_pd, ibpd); +} + +static inline struct mthca_ah *to_mah(struct ib_ah *ibah) +{ + return container_of(ibah, struct mthca_ah, ibah); +} + +static inline struct mthca_cq *to_mcq(struct ib_cq *ibcq) +{ + return container_of(ibcq, struct mthca_cq, ibcq); +} + +static inline struct mthca_srq *to_msrq(struct ib_srq *ibsrq) +{ + return container_of(ibsrq, struct mthca_srq, ibsrq); +} + +static inline struct mthca_qp *to_mqp(struct ib_qp *ibqp) +{ + return container_of(ibqp, struct mthca_qp, ibqp); +} + +static inline struct mthca_sqp *to_msqp(struct mthca_qp *qp) +{ + return container_of(qp, struct mthca_sqp, qp); +} + +#endif /* MTHCA_PROVIDER_H */ diff --git a/drivers/infiniband/hw/mthca/mthca_qp.c b/drivers/infiniband/hw/mthca/mthca_qp.c new file mode 100644 index 0000000..e354b2f --- /dev/null +++ b/drivers/infiniband/hw/mthca/mthca_qp.c @@ -0,0 +1,2311 @@ +/* + * Copyright (c) 2004 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Cisco Systems. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies. All rights reserved. + * Copyright (c) 2004 Voltaire, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include + +#include + +#include +#include +#include + +#include "mthca_dev.h" +#include "mthca_cmd.h" +#include "mthca_memfree.h" +#include "mthca_wqe.h" + +enum { + MTHCA_MAX_DIRECT_QP_SIZE = 4 * PAGE_SIZE, + MTHCA_ACK_REQ_FREQ = 10, + MTHCA_FLIGHT_LIMIT = 9, + MTHCA_UD_HEADER_SIZE = 72, /* largest UD header possible */ + MTHCA_INLINE_HEADER_SIZE = 4, /* data segment overhead for inline */ + MTHCA_INLINE_CHUNK_SIZE = 16 /* inline data segment chunk */ +}; + +enum { + MTHCA_QP_STATE_RST = 0, + MTHCA_QP_STATE_INIT = 1, + MTHCA_QP_STATE_RTR = 2, + MTHCA_QP_STATE_RTS = 3, + MTHCA_QP_STATE_SQE = 4, + MTHCA_QP_STATE_SQD = 5, + MTHCA_QP_STATE_ERR = 6, + MTHCA_QP_STATE_DRAINING = 7 +}; + +enum { + MTHCA_QP_ST_RC = 0x0, + MTHCA_QP_ST_UC = 0x1, + MTHCA_QP_ST_RD = 0x2, + MTHCA_QP_ST_UD = 0x3, + MTHCA_QP_ST_MLX = 0x7 +}; + +enum { + MTHCA_QP_PM_MIGRATED = 0x3, + MTHCA_QP_PM_ARMED = 0x0, + MTHCA_QP_PM_REARM = 0x1 +}; + +enum { + /* qp_context flags */ + MTHCA_QP_BIT_DE = 1 << 8, + /* params1 */ + MTHCA_QP_BIT_SRE = 1 << 15, + MTHCA_QP_BIT_SWE = 1 << 14, + MTHCA_QP_BIT_SAE = 1 << 13, + MTHCA_QP_BIT_SIC = 1 << 4, + MTHCA_QP_BIT_SSC = 1 << 3, + /* params2 */ + MTHCA_QP_BIT_RRE = 1 << 15, + MTHCA_QP_BIT_RWE = 1 << 14, + MTHCA_QP_BIT_RAE = 1 << 13, + MTHCA_QP_BIT_RIC = 1 << 4, + MTHCA_QP_BIT_RSC = 1 << 3 +}; + +enum { + MTHCA_SEND_DOORBELL_FENCE = 1 << 5 +}; + +struct mthca_qp_path { + __be32 port_pkey; + u8 rnr_retry; + u8 g_mylmc; + __be16 rlid; + u8 ackto; + u8 mgid_index; + u8 static_rate; + u8 hop_limit; + __be32 sl_tclass_flowlabel; + u8 rgid[16]; +} __attribute__((packed)); + +struct mthca_qp_context { + __be32 flags; + __be32 tavor_sched_queue; /* Reserved on Arbel */ + u8 mtu_msgmax; + u8 rq_size_stride; /* Reserved on Tavor */ + u8 sq_size_stride; /* Reserved on Tavor */ + u8 rlkey_arbel_sched_queue; /* Reserved on Tavor */ + __be32 usr_page; + __be32 local_qpn; + __be32 remote_qpn; + u32 reserved1[2]; + struct mthca_qp_path pri_path; + struct mthca_qp_path alt_path; + __be32 rdd; + __be32 pd; + __be32 wqe_base; + __be32 wqe_lkey; + __be32 params1; + __be32 reserved2; + __be32 next_send_psn; + __be32 cqn_snd; + __be32 snd_wqe_base_l; /* Next send WQE on Tavor */ + __be32 snd_db_index; /* (debugging only entries) */ + __be32 last_acked_psn; + __be32 ssn; + __be32 params2; + __be32 rnr_nextrecvpsn; + __be32 ra_buff_indx; + __be32 cqn_rcv; + __be32 rcv_wqe_base_l; /* Next recv WQE on Tavor */ + __be32 rcv_db_index; /* (debugging only entries) */ + __be32 qkey; + __be32 srqn; + __be32 rmsn; + __be16 rq_wqe_counter; /* reserved on Tavor */ + __be16 sq_wqe_counter; /* reserved on Tavor */ + u32 reserved3[18]; +} __attribute__((packed)); + +struct mthca_qp_param { + __be32 opt_param_mask; + u32 reserved1; + struct mthca_qp_context context; + u32 reserved2[62]; +} __attribute__((packed)); + +enum { + MTHCA_QP_OPTPAR_ALT_ADDR_PATH = 1 << 0, + MTHCA_QP_OPTPAR_RRE = 1 << 1, + MTHCA_QP_OPTPAR_RAE = 1 << 2, + MTHCA_QP_OPTPAR_RWE = 1 << 3, + MTHCA_QP_OPTPAR_PKEY_INDEX = 1 << 4, + MTHCA_QP_OPTPAR_Q_KEY = 1 << 5, + MTHCA_QP_OPTPAR_RNR_TIMEOUT = 1 << 6, + MTHCA_QP_OPTPAR_PRIMARY_ADDR_PATH = 1 << 7, + MTHCA_QP_OPTPAR_SRA_MAX = 1 << 8, + MTHCA_QP_OPTPAR_RRA_MAX = 1 << 9, + MTHCA_QP_OPTPAR_PM_STATE = 1 << 10, + MTHCA_QP_OPTPAR_PORT_NUM = 1 << 11, + MTHCA_QP_OPTPAR_RETRY_COUNT = 1 << 12, + MTHCA_QP_OPTPAR_ALT_RNR_RETRY = 1 << 13, + MTHCA_QP_OPTPAR_ACK_TIMEOUT = 1 << 14, + MTHCA_QP_OPTPAR_RNR_RETRY = 1 << 15, + MTHCA_QP_OPTPAR_SCHED_QUEUE = 1 << 16 +}; + +static const u8 mthca_opcode[] = { + [IB_WR_SEND] = MTHCA_OPCODE_SEND, + [IB_WR_SEND_WITH_IMM] = MTHCA_OPCODE_SEND_IMM, + [IB_WR_RDMA_WRITE] = MTHCA_OPCODE_RDMA_WRITE, + [IB_WR_RDMA_WRITE_WITH_IMM] = MTHCA_OPCODE_RDMA_WRITE_IMM, + [IB_WR_RDMA_READ] = MTHCA_OPCODE_RDMA_READ, + [IB_WR_ATOMIC_CMP_AND_SWP] = MTHCA_OPCODE_ATOMIC_CS, + [IB_WR_ATOMIC_FETCH_AND_ADD] = MTHCA_OPCODE_ATOMIC_FA, +}; + +static int is_sqp(struct mthca_dev *dev, struct mthca_qp *qp) +{ + return qp->qpn >= dev->qp_table.sqp_start && + qp->qpn <= dev->qp_table.sqp_start + 3; +} + +static int is_qp0(struct mthca_dev *dev, struct mthca_qp *qp) +{ + return qp->qpn >= dev->qp_table.sqp_start && + qp->qpn <= dev->qp_table.sqp_start + 1; +} + +static void *get_recv_wqe(struct mthca_qp *qp, int n) +{ + if (qp->is_direct) + return qp->queue.direct.buf + (n << qp->rq.wqe_shift); + else + return qp->queue.page_list[(n << qp->rq.wqe_shift) >> PAGE_SHIFT].buf + + ((n << qp->rq.wqe_shift) & (PAGE_SIZE - 1)); +} + +static void *get_send_wqe(struct mthca_qp *qp, int n) +{ + if (qp->is_direct) + return qp->queue.direct.buf + qp->send_wqe_offset + + (n << qp->sq.wqe_shift); + else + return qp->queue.page_list[(qp->send_wqe_offset + + (n << qp->sq.wqe_shift)) >> + PAGE_SHIFT].buf + + ((qp->send_wqe_offset + (n << qp->sq.wqe_shift)) & + (PAGE_SIZE - 1)); +} + +static void mthca_wq_reset(struct mthca_wq *wq) +{ + wq->next_ind = 0; + wq->last_comp = wq->max - 1; + wq->head = 0; + wq->tail = 0; +} + +void mthca_qp_event(struct mthca_dev *dev, u32 qpn, + enum ib_event_type event_type) +{ + struct mthca_qp *qp; + struct ib_event event; + + spin_lock(&dev->qp_table.lock); + qp = mthca_array_get(&dev->qp_table.qp, qpn & (dev->limits.num_qps - 1)); + if (qp) + ++qp->refcount; + spin_unlock(&dev->qp_table.lock); + + if (!qp) { + mthca_warn(dev, "Async event %d for bogus QP %08x\n", + event_type, qpn); + return; + } + + if (event_type == IB_EVENT_PATH_MIG) + qp->port = qp->alt_port; + + event.device = &dev->ib_dev; + event.event = event_type; + event.element.qp = &qp->ibqp; + if (qp->ibqp.event_handler) + qp->ibqp.event_handler(&event, qp->ibqp.qp_context); + + spin_lock(&dev->qp_table.lock); + if (!--qp->refcount) + wake_up(&qp->wait); + spin_unlock(&dev->qp_table.lock); +} + +static int to_mthca_state(enum ib_qp_state ib_state) +{ + switch (ib_state) { + case IB_QPS_RESET: return MTHCA_QP_STATE_RST; + case IB_QPS_INIT: return MTHCA_QP_STATE_INIT; + case IB_QPS_RTR: return MTHCA_QP_STATE_RTR; + case IB_QPS_RTS: return MTHCA_QP_STATE_RTS; + case IB_QPS_SQD: return MTHCA_QP_STATE_SQD; + case IB_QPS_SQE: return MTHCA_QP_STATE_SQE; + case IB_QPS_ERR: return MTHCA_QP_STATE_ERR; + default: return -1; + } +} + +enum { RC, UC, UD, RD, RDEE, MLX, NUM_TRANS }; + +static int to_mthca_st(int transport) +{ + switch (transport) { + case RC: return MTHCA_QP_ST_RC; + case UC: return MTHCA_QP_ST_UC; + case UD: return MTHCA_QP_ST_UD; + case RD: return MTHCA_QP_ST_RD; + case MLX: return MTHCA_QP_ST_MLX; + default: return -1; + } +} + +static void store_attrs(struct mthca_sqp *sqp, const struct ib_qp_attr *attr, + int attr_mask) +{ + if (attr_mask & IB_QP_PKEY_INDEX) + sqp->pkey_index = attr->pkey_index; + if (attr_mask & IB_QP_QKEY) + sqp->qkey = attr->qkey; + if (attr_mask & IB_QP_SQ_PSN) + sqp->send_psn = attr->sq_psn; +} + +static void init_port(struct mthca_dev *dev, int port) +{ + int err; + struct mthca_init_ib_param param; + + memset(¶m, 0, sizeof param); + + param.port_width = dev->limits.port_width_cap; + param.vl_cap = dev->limits.vl_cap; + param.mtu_cap = dev->limits.mtu_cap; + param.gid_cap = dev->limits.gid_table_len; + param.pkey_cap = dev->limits.pkey_table_len; + + err = mthca_INIT_IB(dev, ¶m, port); + if (err) + mthca_warn(dev, "INIT_IB failed, return code %d.\n", err); +} + +static __be32 get_hw_access_flags(struct mthca_qp *qp, const struct ib_qp_attr *attr, + int attr_mask) +{ + u8 dest_rd_atomic; + u32 access_flags; + u32 hw_access_flags = 0; + + if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) + dest_rd_atomic = attr->max_dest_rd_atomic; + else + dest_rd_atomic = qp->resp_depth; + + if (attr_mask & IB_QP_ACCESS_FLAGS) + access_flags = attr->qp_access_flags; + else + access_flags = qp->atomic_rd_en; + + if (!dest_rd_atomic) + access_flags &= IB_ACCESS_REMOTE_WRITE; + + if (access_flags & IB_ACCESS_REMOTE_READ) + hw_access_flags |= MTHCA_QP_BIT_RRE; + if (access_flags & IB_ACCESS_REMOTE_ATOMIC) + hw_access_flags |= MTHCA_QP_BIT_RAE; + if (access_flags & IB_ACCESS_REMOTE_WRITE) + hw_access_flags |= MTHCA_QP_BIT_RWE; + + return cpu_to_be32(hw_access_flags); +} + +static inline enum ib_qp_state to_ib_qp_state(int mthca_state) +{ + switch (mthca_state) { + case MTHCA_QP_STATE_RST: return IB_QPS_RESET; + case MTHCA_QP_STATE_INIT: return IB_QPS_INIT; + case MTHCA_QP_STATE_RTR: return IB_QPS_RTR; + case MTHCA_QP_STATE_RTS: return IB_QPS_RTS; + case MTHCA_QP_STATE_DRAINING: + case MTHCA_QP_STATE_SQD: return IB_QPS_SQD; + case MTHCA_QP_STATE_SQE: return IB_QPS_SQE; + case MTHCA_QP_STATE_ERR: return IB_QPS_ERR; + default: return -1; + } +} + +static inline enum ib_mig_state to_ib_mig_state(int mthca_mig_state) +{ + switch (mthca_mig_state) { + case 0: return IB_MIG_ARMED; + case 1: return IB_MIG_REARM; + case 3: return IB_MIG_MIGRATED; + default: return -1; + } +} + +static int to_ib_qp_access_flags(int mthca_flags) +{ + int ib_flags = 0; + + if (mthca_flags & MTHCA_QP_BIT_RRE) + ib_flags |= IB_ACCESS_REMOTE_READ; + if (mthca_flags & MTHCA_QP_BIT_RWE) + ib_flags |= IB_ACCESS_REMOTE_WRITE; + if (mthca_flags & MTHCA_QP_BIT_RAE) + ib_flags |= IB_ACCESS_REMOTE_ATOMIC; + + return ib_flags; +} + +static void to_ib_ah_attr(struct mthca_dev *dev, struct ib_ah_attr *ib_ah_attr, + struct mthca_qp_path *path) +{ + memset(ib_ah_attr, 0, sizeof *ib_ah_attr); + ib_ah_attr->port_num = (be32_to_cpu(path->port_pkey) >> 24) & 0x3; + + if (ib_ah_attr->port_num == 0 || ib_ah_attr->port_num > dev->limits.num_ports) + return; + + ib_ah_attr->dlid = be16_to_cpu(path->rlid); + ib_ah_attr->sl = be32_to_cpu(path->sl_tclass_flowlabel) >> 28; + ib_ah_attr->src_path_bits = path->g_mylmc & 0x7f; + ib_ah_attr->static_rate = mthca_rate_to_ib(dev, + path->static_rate & 0xf, + ib_ah_attr->port_num); + ib_ah_attr->ah_flags = (path->g_mylmc & (1 << 7)) ? IB_AH_GRH : 0; + if (ib_ah_attr->ah_flags) { + ib_ah_attr->grh.sgid_index = path->mgid_index & (dev->limits.gid_table_len - 1); + ib_ah_attr->grh.hop_limit = path->hop_limit; + ib_ah_attr->grh.traffic_class = + (be32_to_cpu(path->sl_tclass_flowlabel) >> 20) & 0xff; + ib_ah_attr->grh.flow_label = + be32_to_cpu(path->sl_tclass_flowlabel) & 0xfffff; + memcpy(ib_ah_attr->grh.dgid.raw, + path->rgid, sizeof ib_ah_attr->grh.dgid.raw); + } +} + +int mthca_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr_mask, + struct ib_qp_init_attr *qp_init_attr) +{ + struct mthca_dev *dev = to_mdev(ibqp->device); + struct mthca_qp *qp = to_mqp(ibqp); + int err = 0; + struct mthca_mailbox *mailbox = NULL; + struct mthca_qp_param *qp_param; + struct mthca_qp_context *context; + int mthca_state; + + mutex_lock(&qp->mutex); + + if (qp->state == IB_QPS_RESET) { + qp_attr->qp_state = IB_QPS_RESET; + goto done; + } + + mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL); + if (IS_ERR(mailbox)) { + err = PTR_ERR(mailbox); + goto out; + } + + err = mthca_QUERY_QP(dev, qp->qpn, 0, mailbox); + if (err) { + mthca_warn(dev, "QUERY_QP failed (%d)\n", err); + goto out_mailbox; + } + + qp_param = mailbox->buf; + context = &qp_param->context; + mthca_state = be32_to_cpu(context->flags) >> 28; + + qp->state = to_ib_qp_state(mthca_state); + qp_attr->qp_state = qp->state; + qp_attr->path_mtu = context->mtu_msgmax >> 5; + qp_attr->path_mig_state = + to_ib_mig_state((be32_to_cpu(context->flags) >> 11) & 0x3); + qp_attr->qkey = be32_to_cpu(context->qkey); + qp_attr->rq_psn = be32_to_cpu(context->rnr_nextrecvpsn) & 0xffffff; + qp_attr->sq_psn = be32_to_cpu(context->next_send_psn) & 0xffffff; + qp_attr->dest_qp_num = be32_to_cpu(context->remote_qpn) & 0xffffff; + qp_attr->qp_access_flags = + to_ib_qp_access_flags(be32_to_cpu(context->params2)); + + if (qp->transport == RC || qp->transport == UC) { + to_ib_ah_attr(dev, &qp_attr->ah_attr, &context->pri_path); + to_ib_ah_attr(dev, &qp_attr->alt_ah_attr, &context->alt_path); + qp_attr->alt_pkey_index = + be32_to_cpu(context->alt_path.port_pkey) & 0x7f; + qp_attr->alt_port_num = qp_attr->alt_ah_attr.port_num; + } + + qp_attr->pkey_index = be32_to_cpu(context->pri_path.port_pkey) & 0x7f; + qp_attr->port_num = + (be32_to_cpu(context->pri_path.port_pkey) >> 24) & 0x3; + + /* qp_attr->en_sqd_async_notify is only applicable in modify qp */ + qp_attr->sq_draining = mthca_state == MTHCA_QP_STATE_DRAINING; + + qp_attr->max_rd_atomic = 1 << ((be32_to_cpu(context->params1) >> 21) & 0x7); + + qp_attr->max_dest_rd_atomic = + 1 << ((be32_to_cpu(context->params2) >> 21) & 0x7); + qp_attr->min_rnr_timer = + (be32_to_cpu(context->rnr_nextrecvpsn) >> 24) & 0x1f; + qp_attr->timeout = context->pri_path.ackto >> 3; + qp_attr->retry_cnt = (be32_to_cpu(context->params1) >> 16) & 0x7; + qp_attr->rnr_retry = context->pri_path.rnr_retry >> 5; + qp_attr->alt_timeout = context->alt_path.ackto >> 3; + +done: + qp_attr->cur_qp_state = qp_attr->qp_state; + qp_attr->cap.max_send_wr = qp->sq.max; + qp_attr->cap.max_recv_wr = qp->rq.max; + qp_attr->cap.max_send_sge = qp->sq.max_gs; + qp_attr->cap.max_recv_sge = qp->rq.max_gs; + qp_attr->cap.max_inline_data = qp->max_inline_data; + + qp_init_attr->cap = qp_attr->cap; + qp_init_attr->sq_sig_type = qp->sq_policy; + +out_mailbox: + mthca_free_mailbox(dev, mailbox); + +out: + mutex_unlock(&qp->mutex); + return err; +} + +static int mthca_path_set(struct mthca_dev *dev, const struct ib_ah_attr *ah, + struct mthca_qp_path *path, u8 port) +{ + path->g_mylmc = ah->src_path_bits & 0x7f; + path->rlid = cpu_to_be16(ah->dlid); + path->static_rate = mthca_get_rate(dev, ah->static_rate, port); + + if (ah->ah_flags & IB_AH_GRH) { + if (ah->grh.sgid_index >= dev->limits.gid_table_len) { + mthca_dbg(dev, "sgid_index (%u) too large. max is %d\n", + ah->grh.sgid_index, dev->limits.gid_table_len-1); + return -1; + } + + path->g_mylmc |= 1 << 7; + path->mgid_index = ah->grh.sgid_index; + path->hop_limit = ah->grh.hop_limit; + path->sl_tclass_flowlabel = + cpu_to_be32((ah->sl << 28) | + (ah->grh.traffic_class << 20) | + (ah->grh.flow_label)); + memcpy(path->rgid, ah->grh.dgid.raw, 16); + } else + path->sl_tclass_flowlabel = cpu_to_be32(ah->sl << 28); + + return 0; +} + +static int __mthca_modify_qp(struct ib_qp *ibqp, + const struct ib_qp_attr *attr, int attr_mask, + enum ib_qp_state cur_state, enum ib_qp_state new_state) +{ + struct mthca_dev *dev = to_mdev(ibqp->device); + struct mthca_qp *qp = to_mqp(ibqp); + struct mthca_mailbox *mailbox; + struct mthca_qp_param *qp_param; + struct mthca_qp_context *qp_context; + u32 sqd_event = 0; + int err = -EINVAL; + + mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL); + if (IS_ERR(mailbox)) { + err = PTR_ERR(mailbox); + goto out; + } + qp_param = mailbox->buf; + qp_context = &qp_param->context; + memset(qp_param, 0, sizeof *qp_param); + + qp_context->flags = cpu_to_be32((to_mthca_state(new_state) << 28) | + (to_mthca_st(qp->transport) << 16)); + qp_context->flags |= cpu_to_be32(MTHCA_QP_BIT_DE); + if (!(attr_mask & IB_QP_PATH_MIG_STATE)) + qp_context->flags |= cpu_to_be32(MTHCA_QP_PM_MIGRATED << 11); + else { + qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_PM_STATE); + switch (attr->path_mig_state) { + case IB_MIG_MIGRATED: + qp_context->flags |= cpu_to_be32(MTHCA_QP_PM_MIGRATED << 11); + break; + case IB_MIG_REARM: + qp_context->flags |= cpu_to_be32(MTHCA_QP_PM_REARM << 11); + break; + case IB_MIG_ARMED: + qp_context->flags |= cpu_to_be32(MTHCA_QP_PM_ARMED << 11); + break; + } + } + + /* leave tavor_sched_queue as 0 */ + + if (qp->transport == MLX || qp->transport == UD) + qp_context->mtu_msgmax = (IB_MTU_2048 << 5) | 11; + else if (attr_mask & IB_QP_PATH_MTU) { + if (attr->path_mtu < IB_MTU_256 || attr->path_mtu > IB_MTU_2048) { + mthca_dbg(dev, "path MTU (%u) is invalid\n", + attr->path_mtu); + goto out_mailbox; + } + qp_context->mtu_msgmax = (attr->path_mtu << 5) | 31; + } + + if (mthca_is_memfree(dev)) { + if (qp->rq.max) + qp_context->rq_size_stride = ilog2(qp->rq.max) << 3; + qp_context->rq_size_stride |= qp->rq.wqe_shift - 4; + + if (qp->sq.max) + qp_context->sq_size_stride = ilog2(qp->sq.max) << 3; + qp_context->sq_size_stride |= qp->sq.wqe_shift - 4; + } + + /* leave arbel_sched_queue as 0 */ + + if (qp->ibqp.uobject) + qp_context->usr_page = + cpu_to_be32(to_mucontext(qp->ibqp.uobject->context)->uar.index); + else + qp_context->usr_page = cpu_to_be32(dev->driver_uar.index); + qp_context->local_qpn = cpu_to_be32(qp->qpn); + if (attr_mask & IB_QP_DEST_QPN) { + qp_context->remote_qpn = cpu_to_be32(attr->dest_qp_num); + } + + if (qp->transport == MLX) + qp_context->pri_path.port_pkey |= + cpu_to_be32(qp->port << 24); + else { + if (attr_mask & IB_QP_PORT) { + qp_context->pri_path.port_pkey |= + cpu_to_be32(attr->port_num << 24); + qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_PORT_NUM); + } + } + + if (attr_mask & IB_QP_PKEY_INDEX) { + qp_context->pri_path.port_pkey |= + cpu_to_be32(attr->pkey_index); + qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_PKEY_INDEX); + } + + if (attr_mask & IB_QP_RNR_RETRY) { + qp_context->alt_path.rnr_retry = qp_context->pri_path.rnr_retry = + attr->rnr_retry << 5; + qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_RNR_RETRY | + MTHCA_QP_OPTPAR_ALT_RNR_RETRY); + } + + if (attr_mask & IB_QP_AV) { + if (mthca_path_set(dev, &attr->ah_attr, &qp_context->pri_path, + attr_mask & IB_QP_PORT ? attr->port_num : qp->port)) + goto out_mailbox; + + qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_PRIMARY_ADDR_PATH); + } + + if (ibqp->qp_type == IB_QPT_RC && + cur_state == IB_QPS_INIT && new_state == IB_QPS_RTR) { + u8 sched_queue = ibqp->uobject ? 0x2 : 0x1; + + if (mthca_is_memfree(dev)) + qp_context->rlkey_arbel_sched_queue |= sched_queue; + else + qp_context->tavor_sched_queue |= cpu_to_be32(sched_queue); + + qp_param->opt_param_mask |= + cpu_to_be32(MTHCA_QP_OPTPAR_SCHED_QUEUE); + } + + if (attr_mask & IB_QP_TIMEOUT) { + qp_context->pri_path.ackto = attr->timeout << 3; + qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_ACK_TIMEOUT); + } + + if (attr_mask & IB_QP_ALT_PATH) { + if (attr->alt_pkey_index >= dev->limits.pkey_table_len) { + mthca_dbg(dev, "Alternate P_Key index (%u) too large. max is %d\n", + attr->alt_pkey_index, dev->limits.pkey_table_len-1); + goto out_mailbox; + } + + if (attr->alt_port_num == 0 || attr->alt_port_num > dev->limits.num_ports) { + mthca_dbg(dev, "Alternate port number (%u) is invalid\n", + attr->alt_port_num); + goto out_mailbox; + } + + if (mthca_path_set(dev, &attr->alt_ah_attr, &qp_context->alt_path, + attr->alt_ah_attr.port_num)) + goto out_mailbox; + + qp_context->alt_path.port_pkey |= cpu_to_be32(attr->alt_pkey_index | + attr->alt_port_num << 24); + qp_context->alt_path.ackto = attr->alt_timeout << 3; + qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_ALT_ADDR_PATH); + } + + /* leave rdd as 0 */ + qp_context->pd = cpu_to_be32(to_mpd(ibqp->pd)->pd_num); + /* leave wqe_base as 0 (we always create an MR based at 0 for WQs) */ + qp_context->wqe_lkey = cpu_to_be32(qp->mr.ibmr.lkey); + qp_context->params1 = cpu_to_be32((MTHCA_ACK_REQ_FREQ << 28) | + (MTHCA_FLIGHT_LIMIT << 24) | + MTHCA_QP_BIT_SWE); + if (qp->sq_policy == IB_SIGNAL_ALL_WR) + qp_context->params1 |= cpu_to_be32(MTHCA_QP_BIT_SSC); + if (attr_mask & IB_QP_RETRY_CNT) { + qp_context->params1 |= cpu_to_be32(attr->retry_cnt << 16); + qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_RETRY_COUNT); + } + + if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC) { + if (attr->max_rd_atomic) { + qp_context->params1 |= + cpu_to_be32(MTHCA_QP_BIT_SRE | + MTHCA_QP_BIT_SAE); + qp_context->params1 |= + cpu_to_be32(fls(attr->max_rd_atomic - 1) << 21); + } + qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_SRA_MAX); + } + + if (attr_mask & IB_QP_SQ_PSN) + qp_context->next_send_psn = cpu_to_be32(attr->sq_psn); + qp_context->cqn_snd = cpu_to_be32(to_mcq(ibqp->send_cq)->cqn); + + if (mthca_is_memfree(dev)) { + qp_context->snd_wqe_base_l = cpu_to_be32(qp->send_wqe_offset); + qp_context->snd_db_index = cpu_to_be32(qp->sq.db_index); + } + + if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) { + if (attr->max_dest_rd_atomic) + qp_context->params2 |= + cpu_to_be32(fls(attr->max_dest_rd_atomic - 1) << 21); + + qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_RRA_MAX); + } + + if (attr_mask & (IB_QP_ACCESS_FLAGS | IB_QP_MAX_DEST_RD_ATOMIC)) { + qp_context->params2 |= get_hw_access_flags(qp, attr, attr_mask); + qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_RWE | + MTHCA_QP_OPTPAR_RRE | + MTHCA_QP_OPTPAR_RAE); + } + + qp_context->params2 |= cpu_to_be32(MTHCA_QP_BIT_RSC); + + if (ibqp->srq) + qp_context->params2 |= cpu_to_be32(MTHCA_QP_BIT_RIC); + + if (attr_mask & IB_QP_MIN_RNR_TIMER) { + qp_context->rnr_nextrecvpsn |= cpu_to_be32(attr->min_rnr_timer << 24); + qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_RNR_TIMEOUT); + } + if (attr_mask & IB_QP_RQ_PSN) + qp_context->rnr_nextrecvpsn |= cpu_to_be32(attr->rq_psn); + + qp_context->ra_buff_indx = + cpu_to_be32(dev->qp_table.rdb_base + + ((qp->qpn & (dev->limits.num_qps - 1)) * MTHCA_RDB_ENTRY_SIZE << + dev->qp_table.rdb_shift)); + + qp_context->cqn_rcv = cpu_to_be32(to_mcq(ibqp->recv_cq)->cqn); + + if (mthca_is_memfree(dev)) + qp_context->rcv_db_index = cpu_to_be32(qp->rq.db_index); + + if (attr_mask & IB_QP_QKEY) { + qp_context->qkey = cpu_to_be32(attr->qkey); + qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_Q_KEY); + } + + if (ibqp->srq) + qp_context->srqn = cpu_to_be32(1 << 24 | + to_msrq(ibqp->srq)->srqn); + + if (cur_state == IB_QPS_RTS && new_state == IB_QPS_SQD && + attr_mask & IB_QP_EN_SQD_ASYNC_NOTIFY && + attr->en_sqd_async_notify) + sqd_event = 1 << 31; + + err = mthca_MODIFY_QP(dev, cur_state, new_state, qp->qpn, 0, + mailbox, sqd_event); + if (err) { + mthca_warn(dev, "modify QP %d->%d returned %d.\n", + cur_state, new_state, err); + goto out_mailbox; + } + + qp->state = new_state; + if (attr_mask & IB_QP_ACCESS_FLAGS) + qp->atomic_rd_en = attr->qp_access_flags; + if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) + qp->resp_depth = attr->max_dest_rd_atomic; + if (attr_mask & IB_QP_PORT) + qp->port = attr->port_num; + if (attr_mask & IB_QP_ALT_PATH) + qp->alt_port = attr->alt_port_num; + + if (is_sqp(dev, qp)) + store_attrs(to_msqp(qp), attr, attr_mask); + + /* + * If we moved QP0 to RTR, bring the IB link up; if we moved + * QP0 to RESET or ERROR, bring the link back down. + */ + if (is_qp0(dev, qp)) { + if (cur_state != IB_QPS_RTR && + new_state == IB_QPS_RTR) + init_port(dev, qp->port); + + if (cur_state != IB_QPS_RESET && + cur_state != IB_QPS_ERR && + (new_state == IB_QPS_RESET || + new_state == IB_QPS_ERR)) + mthca_CLOSE_IB(dev, qp->port); + } + + /* + * If we moved a kernel QP to RESET, clean up all old CQ + * entries and reinitialize the QP. + */ + if (new_state == IB_QPS_RESET && !qp->ibqp.uobject) { + mthca_cq_clean(dev, to_mcq(qp->ibqp.recv_cq), qp->qpn, + qp->ibqp.srq ? to_msrq(qp->ibqp.srq) : NULL); + if (qp->ibqp.send_cq != qp->ibqp.recv_cq) + mthca_cq_clean(dev, to_mcq(qp->ibqp.send_cq), qp->qpn, NULL); + + mthca_wq_reset(&qp->sq); + qp->sq.last = get_send_wqe(qp, qp->sq.max - 1); + + mthca_wq_reset(&qp->rq); + qp->rq.last = get_recv_wqe(qp, qp->rq.max - 1); + + if (mthca_is_memfree(dev)) { + *qp->sq.db = 0; + *qp->rq.db = 0; + } + } + +out_mailbox: + mthca_free_mailbox(dev, mailbox); +out: + return err; +} + +int mthca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask, + struct ib_udata *udata) +{ + struct mthca_dev *dev = to_mdev(ibqp->device); + struct mthca_qp *qp = to_mqp(ibqp); + enum ib_qp_state cur_state, new_state; + int err = -EINVAL; + + mutex_lock(&qp->mutex); + if (attr_mask & IB_QP_CUR_STATE) { + cur_state = attr->cur_qp_state; + } else { + spin_lock_irq(&qp->sq.lock); + spin_lock(&qp->rq.lock); + cur_state = qp->state; + spin_unlock(&qp->rq.lock); + spin_unlock_irq(&qp->sq.lock); + } + + new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state; + + if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, attr_mask, + IB_LINK_LAYER_UNSPECIFIED)) { + mthca_dbg(dev, "Bad QP transition (transport %d) " + "%d->%d with attr 0x%08x\n", + qp->transport, cur_state, new_state, + attr_mask); + goto out; + } + + if ((attr_mask & IB_QP_PKEY_INDEX) && + attr->pkey_index >= dev->limits.pkey_table_len) { + mthca_dbg(dev, "P_Key index (%u) too large. max is %d\n", + attr->pkey_index, dev->limits.pkey_table_len-1); + goto out; + } + + if ((attr_mask & IB_QP_PORT) && + (attr->port_num == 0 || attr->port_num > dev->limits.num_ports)) { + mthca_dbg(dev, "Port number (%u) is invalid\n", attr->port_num); + goto out; + } + + if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC && + attr->max_rd_atomic > dev->limits.max_qp_init_rdma) { + mthca_dbg(dev, "Max rdma_atomic as initiator %u too large (max is %d)\n", + attr->max_rd_atomic, dev->limits.max_qp_init_rdma); + goto out; + } + + if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC && + attr->max_dest_rd_atomic > 1 << dev->qp_table.rdb_shift) { + mthca_dbg(dev, "Max rdma_atomic as responder %u too large (max %d)\n", + attr->max_dest_rd_atomic, 1 << dev->qp_table.rdb_shift); + goto out; + } + + if (cur_state == new_state && cur_state == IB_QPS_RESET) { + err = 0; + goto out; + } + + err = __mthca_modify_qp(ibqp, attr, attr_mask, cur_state, new_state); + +out: + mutex_unlock(&qp->mutex); + return err; +} + +static int mthca_max_data_size(struct mthca_dev *dev, struct mthca_qp *qp, int desc_sz) +{ + /* + * Calculate the maximum size of WQE s/g segments, excluding + * the next segment and other non-data segments. + */ + int max_data_size = desc_sz - sizeof (struct mthca_next_seg); + + switch (qp->transport) { + case MLX: + max_data_size -= 2 * sizeof (struct mthca_data_seg); + break; + + case UD: + if (mthca_is_memfree(dev)) + max_data_size -= sizeof (struct mthca_arbel_ud_seg); + else + max_data_size -= sizeof (struct mthca_tavor_ud_seg); + break; + + default: + max_data_size -= sizeof (struct mthca_raddr_seg); + break; + } + + return max_data_size; +} + +static inline int mthca_max_inline_data(struct mthca_pd *pd, int max_data_size) +{ + /* We don't support inline data for kernel QPs (yet). */ + return pd->ibpd.uobject ? max_data_size - MTHCA_INLINE_HEADER_SIZE : 0; +} + +static void mthca_adjust_qp_caps(struct mthca_dev *dev, + struct mthca_pd *pd, + struct mthca_qp *qp) +{ + int max_data_size = mthca_max_data_size(dev, qp, + min(dev->limits.max_desc_sz, + 1 << qp->sq.wqe_shift)); + + qp->max_inline_data = mthca_max_inline_data(pd, max_data_size); + + qp->sq.max_gs = min_t(int, dev->limits.max_sg, + max_data_size / sizeof (struct mthca_data_seg)); + qp->rq.max_gs = min_t(int, dev->limits.max_sg, + (min(dev->limits.max_desc_sz, 1 << qp->rq.wqe_shift) - + sizeof (struct mthca_next_seg)) / + sizeof (struct mthca_data_seg)); +} + +/* + * Allocate and register buffer for WQEs. qp->rq.max, sq.max, + * rq.max_gs and sq.max_gs must all be assigned. + * mthca_alloc_wqe_buf will calculate rq.wqe_shift and + * sq.wqe_shift (as well as send_wqe_offset, is_direct, and + * queue) + */ +static int mthca_alloc_wqe_buf(struct mthca_dev *dev, + struct mthca_pd *pd, + struct mthca_qp *qp) +{ + int size; + int err = -ENOMEM; + + size = sizeof (struct mthca_next_seg) + + qp->rq.max_gs * sizeof (struct mthca_data_seg); + + if (size > dev->limits.max_desc_sz) + return -EINVAL; + + for (qp->rq.wqe_shift = 6; 1 << qp->rq.wqe_shift < size; + qp->rq.wqe_shift++) + ; /* nothing */ + + size = qp->sq.max_gs * sizeof (struct mthca_data_seg); + switch (qp->transport) { + case MLX: + size += 2 * sizeof (struct mthca_data_seg); + break; + + case UD: + size += mthca_is_memfree(dev) ? + sizeof (struct mthca_arbel_ud_seg) : + sizeof (struct mthca_tavor_ud_seg); + break; + + case UC: + size += sizeof (struct mthca_raddr_seg); + break; + + case RC: + size += sizeof (struct mthca_raddr_seg); + /* + * An atomic op will require an atomic segment, a + * remote address segment and one scatter entry. + */ + size = max_t(int, size, + sizeof (struct mthca_atomic_seg) + + sizeof (struct mthca_raddr_seg) + + sizeof (struct mthca_data_seg)); + break; + + default: + break; + } + + /* Make sure that we have enough space for a bind request */ + size = max_t(int, size, sizeof (struct mthca_bind_seg)); + + size += sizeof (struct mthca_next_seg); + + if (size > dev->limits.max_desc_sz) + return -EINVAL; + + for (qp->sq.wqe_shift = 6; 1 << qp->sq.wqe_shift < size; + qp->sq.wqe_shift++) + ; /* nothing */ + + qp->send_wqe_offset = ALIGN(qp->rq.max << qp->rq.wqe_shift, + 1 << qp->sq.wqe_shift); + + /* + * If this is a userspace QP, we don't actually have to + * allocate anything. All we need is to calculate the WQE + * sizes and the send_wqe_offset, so we're done now. + */ + if (pd->ibpd.uobject) + return 0; + + size = PAGE_ALIGN(qp->send_wqe_offset + + (qp->sq.max << qp->sq.wqe_shift)); + + qp->wrid = kmalloc((qp->rq.max + qp->sq.max) * sizeof (u64), + GFP_KERNEL); + if (!qp->wrid) + goto err_out; + + err = mthca_buf_alloc(dev, size, MTHCA_MAX_DIRECT_QP_SIZE, + &qp->queue, &qp->is_direct, pd, 0, &qp->mr); + if (err) + goto err_out; + + return 0; + +err_out: + kfree(qp->wrid); + return err; +} + +static void mthca_free_wqe_buf(struct mthca_dev *dev, + struct mthca_qp *qp) +{ + mthca_buf_free(dev, PAGE_ALIGN(qp->send_wqe_offset + + (qp->sq.max << qp->sq.wqe_shift)), + &qp->queue, qp->is_direct, &qp->mr); + kfree(qp->wrid); +} + +static int mthca_map_memfree(struct mthca_dev *dev, + struct mthca_qp *qp) +{ + int ret; + + if (mthca_is_memfree(dev)) { + ret = mthca_table_get(dev, dev->qp_table.qp_table, qp->qpn); + if (ret) + return ret; + + ret = mthca_table_get(dev, dev->qp_table.eqp_table, qp->qpn); + if (ret) + goto err_qpc; + + ret = mthca_table_get(dev, dev->qp_table.rdb_table, + qp->qpn << dev->qp_table.rdb_shift); + if (ret) + goto err_eqpc; + + } + + return 0; + +err_eqpc: + mthca_table_put(dev, dev->qp_table.eqp_table, qp->qpn); + +err_qpc: + mthca_table_put(dev, dev->qp_table.qp_table, qp->qpn); + + return ret; +} + +static void mthca_unmap_memfree(struct mthca_dev *dev, + struct mthca_qp *qp) +{ + mthca_table_put(dev, dev->qp_table.rdb_table, + qp->qpn << dev->qp_table.rdb_shift); + mthca_table_put(dev, dev->qp_table.eqp_table, qp->qpn); + mthca_table_put(dev, dev->qp_table.qp_table, qp->qpn); +} + +static int mthca_alloc_memfree(struct mthca_dev *dev, + struct mthca_qp *qp) +{ + if (mthca_is_memfree(dev)) { + qp->rq.db_index = mthca_alloc_db(dev, MTHCA_DB_TYPE_RQ, + qp->qpn, &qp->rq.db); + if (qp->rq.db_index < 0) + return -ENOMEM; + + qp->sq.db_index = mthca_alloc_db(dev, MTHCA_DB_TYPE_SQ, + qp->qpn, &qp->sq.db); + if (qp->sq.db_index < 0) { + mthca_free_db(dev, MTHCA_DB_TYPE_RQ, qp->rq.db_index); + return -ENOMEM; + } + } + + return 0; +} + +static void mthca_free_memfree(struct mthca_dev *dev, + struct mthca_qp *qp) +{ + if (mthca_is_memfree(dev)) { + mthca_free_db(dev, MTHCA_DB_TYPE_SQ, qp->sq.db_index); + mthca_free_db(dev, MTHCA_DB_TYPE_RQ, qp->rq.db_index); + } +} + +static int mthca_alloc_qp_common(struct mthca_dev *dev, + struct mthca_pd *pd, + struct mthca_cq *send_cq, + struct mthca_cq *recv_cq, + enum ib_sig_type send_policy, + struct mthca_qp *qp) +{ + int ret; + int i; + struct mthca_next_seg *next; + + qp->refcount = 1; + init_waitqueue_head(&qp->wait); + mutex_init(&qp->mutex); + qp->state = IB_QPS_RESET; + qp->atomic_rd_en = 0; + qp->resp_depth = 0; + qp->sq_policy = send_policy; + mthca_wq_reset(&qp->sq); + mthca_wq_reset(&qp->rq); + + spin_lock_init(&qp->sq.lock); + spin_lock_init(&qp->rq.lock); + + ret = mthca_map_memfree(dev, qp); + if (ret) + return ret; + + ret = mthca_alloc_wqe_buf(dev, pd, qp); + if (ret) { + mthca_unmap_memfree(dev, qp); + return ret; + } + + mthca_adjust_qp_caps(dev, pd, qp); + + /* + * If this is a userspace QP, we're done now. The doorbells + * will be allocated and buffers will be initialized in + * userspace. + */ + if (pd->ibpd.uobject) + return 0; + + ret = mthca_alloc_memfree(dev, qp); + if (ret) { + mthca_free_wqe_buf(dev, qp); + mthca_unmap_memfree(dev, qp); + return ret; + } + + if (mthca_is_memfree(dev)) { + struct mthca_data_seg *scatter; + int size = (sizeof (struct mthca_next_seg) + + qp->rq.max_gs * sizeof (struct mthca_data_seg)) / 16; + + for (i = 0; i < qp->rq.max; ++i) { + next = get_recv_wqe(qp, i); + next->nda_op = cpu_to_be32(((i + 1) & (qp->rq.max - 1)) << + qp->rq.wqe_shift); + next->ee_nds = cpu_to_be32(size); + + for (scatter = (void *) (next + 1); + (void *) scatter < (void *) next + (1 << qp->rq.wqe_shift); + ++scatter) + scatter->lkey = cpu_to_be32(MTHCA_INVAL_LKEY); + } + + for (i = 0; i < qp->sq.max; ++i) { + next = get_send_wqe(qp, i); + next->nda_op = cpu_to_be32((((i + 1) & (qp->sq.max - 1)) << + qp->sq.wqe_shift) + + qp->send_wqe_offset); + } + } else { + for (i = 0; i < qp->rq.max; ++i) { + next = get_recv_wqe(qp, i); + next->nda_op = htonl((((i + 1) % qp->rq.max) << + qp->rq.wqe_shift) | 1); + } + + } + + qp->sq.last = get_send_wqe(qp, qp->sq.max - 1); + qp->rq.last = get_recv_wqe(qp, qp->rq.max - 1); + + return 0; +} + +static int mthca_set_qp_size(struct mthca_dev *dev, struct ib_qp_cap *cap, + struct mthca_pd *pd, struct mthca_qp *qp) +{ + int max_data_size = mthca_max_data_size(dev, qp, dev->limits.max_desc_sz); + + /* Sanity check QP size before proceeding */ + if (cap->max_send_wr > dev->limits.max_wqes || + cap->max_recv_wr > dev->limits.max_wqes || + cap->max_send_sge > dev->limits.max_sg || + cap->max_recv_sge > dev->limits.max_sg || + cap->max_inline_data > mthca_max_inline_data(pd, max_data_size)) + return -EINVAL; + + /* + * For MLX transport we need 2 extra send gather entries: + * one for the header and one for the checksum at the end + */ + if (qp->transport == MLX && cap->max_send_sge + 2 > dev->limits.max_sg) + return -EINVAL; + + if (mthca_is_memfree(dev)) { + qp->rq.max = cap->max_recv_wr ? + roundup_pow_of_two(cap->max_recv_wr) : 0; + qp->sq.max = cap->max_send_wr ? + roundup_pow_of_two(cap->max_send_wr) : 0; + } else { + qp->rq.max = cap->max_recv_wr; + qp->sq.max = cap->max_send_wr; + } + + qp->rq.max_gs = cap->max_recv_sge; + qp->sq.max_gs = max_t(int, cap->max_send_sge, + ALIGN(cap->max_inline_data + MTHCA_INLINE_HEADER_SIZE, + MTHCA_INLINE_CHUNK_SIZE) / + sizeof (struct mthca_data_seg)); + + return 0; +} + +int mthca_alloc_qp(struct mthca_dev *dev, + struct mthca_pd *pd, + struct mthca_cq *send_cq, + struct mthca_cq *recv_cq, + enum ib_qp_type type, + enum ib_sig_type send_policy, + struct ib_qp_cap *cap, + struct mthca_qp *qp) +{ + int err; + + switch (type) { + case IB_QPT_RC: qp->transport = RC; break; + case IB_QPT_UC: qp->transport = UC; break; + case IB_QPT_UD: qp->transport = UD; break; + default: return -EINVAL; + } + + err = mthca_set_qp_size(dev, cap, pd, qp); + if (err) + return err; + + qp->qpn = mthca_alloc(&dev->qp_table.alloc); + if (qp->qpn == -1) + return -ENOMEM; + + /* initialize port to zero for error-catching. */ + qp->port = 0; + + err = mthca_alloc_qp_common(dev, pd, send_cq, recv_cq, + send_policy, qp); + if (err) { + mthca_free(&dev->qp_table.alloc, qp->qpn); + return err; + } + + spin_lock_irq(&dev->qp_table.lock); + mthca_array_set(&dev->qp_table.qp, + qp->qpn & (dev->limits.num_qps - 1), qp); + spin_unlock_irq(&dev->qp_table.lock); + + return 0; +} + +static void mthca_lock_cqs(struct mthca_cq *send_cq, struct mthca_cq *recv_cq) + __acquires(&send_cq->lock) __acquires(&recv_cq->lock) +{ + if (send_cq == recv_cq) { + spin_lock_irq(&send_cq->lock); + __acquire(&recv_cq->lock); + } else if (send_cq->cqn < recv_cq->cqn) { + spin_lock_irq(&send_cq->lock); + spin_lock_nested(&recv_cq->lock, SINGLE_DEPTH_NESTING); + } else { + spin_lock_irq(&recv_cq->lock); + spin_lock_nested(&send_cq->lock, SINGLE_DEPTH_NESTING); + } +} + +static void mthca_unlock_cqs(struct mthca_cq *send_cq, struct mthca_cq *recv_cq) + __releases(&send_cq->lock) __releases(&recv_cq->lock) +{ + if (send_cq == recv_cq) { + __release(&recv_cq->lock); + spin_unlock_irq(&send_cq->lock); + } else if (send_cq->cqn < recv_cq->cqn) { + spin_unlock(&recv_cq->lock); + spin_unlock_irq(&send_cq->lock); + } else { + spin_unlock(&send_cq->lock); + spin_unlock_irq(&recv_cq->lock); + } +} + +int mthca_alloc_sqp(struct mthca_dev *dev, + struct mthca_pd *pd, + struct mthca_cq *send_cq, + struct mthca_cq *recv_cq, + enum ib_sig_type send_policy, + struct ib_qp_cap *cap, + int qpn, + int port, + struct mthca_sqp *sqp) +{ + u32 mqpn = qpn * 2 + dev->qp_table.sqp_start + port - 1; + int err; + + sqp->qp.transport = MLX; + err = mthca_set_qp_size(dev, cap, pd, &sqp->qp); + if (err) + return err; + + sqp->header_buf_size = sqp->qp.sq.max * MTHCA_UD_HEADER_SIZE; + sqp->header_buf = dma_alloc_coherent(&dev->pdev->dev, sqp->header_buf_size, + &sqp->header_dma, GFP_KERNEL); + if (!sqp->header_buf) + return -ENOMEM; + + spin_lock_irq(&dev->qp_table.lock); + if (mthca_array_get(&dev->qp_table.qp, mqpn)) + err = -EBUSY; + else + mthca_array_set(&dev->qp_table.qp, mqpn, sqp); + spin_unlock_irq(&dev->qp_table.lock); + + if (err) + goto err_out; + + sqp->qp.port = port; + sqp->qp.qpn = mqpn; + sqp->qp.transport = MLX; + + err = mthca_alloc_qp_common(dev, pd, send_cq, recv_cq, + send_policy, &sqp->qp); + if (err) + goto err_out_free; + + atomic_inc(&pd->sqp_count); + + return 0; + + err_out_free: + /* + * Lock CQs here, so that CQ polling code can do QP lookup + * without taking a lock. + */ + mthca_lock_cqs(send_cq, recv_cq); + + spin_lock(&dev->qp_table.lock); + mthca_array_clear(&dev->qp_table.qp, mqpn); + spin_unlock(&dev->qp_table.lock); + + mthca_unlock_cqs(send_cq, recv_cq); + + err_out: + dma_free_coherent(&dev->pdev->dev, sqp->header_buf_size, + sqp->header_buf, sqp->header_dma); + + return err; +} + +static inline int get_qp_refcount(struct mthca_dev *dev, struct mthca_qp *qp) +{ + int c; + + spin_lock_irq(&dev->qp_table.lock); + c = qp->refcount; + spin_unlock_irq(&dev->qp_table.lock); + + return c; +} + +void mthca_free_qp(struct mthca_dev *dev, + struct mthca_qp *qp) +{ + struct mthca_cq *send_cq; + struct mthca_cq *recv_cq; + + send_cq = to_mcq(qp->ibqp.send_cq); + recv_cq = to_mcq(qp->ibqp.recv_cq); + + /* + * Lock CQs here, so that CQ polling code can do QP lookup + * without taking a lock. + */ + mthca_lock_cqs(send_cq, recv_cq); + + spin_lock(&dev->qp_table.lock); + mthca_array_clear(&dev->qp_table.qp, + qp->qpn & (dev->limits.num_qps - 1)); + --qp->refcount; + spin_unlock(&dev->qp_table.lock); + + mthca_unlock_cqs(send_cq, recv_cq); + + wait_event(qp->wait, !get_qp_refcount(dev, qp)); + + if (qp->state != IB_QPS_RESET) + mthca_MODIFY_QP(dev, qp->state, IB_QPS_RESET, qp->qpn, 0, + NULL, 0); + + /* + * If this is a userspace QP, the buffers, MR, CQs and so on + * will be cleaned up in userspace, so all we have to do is + * unref the mem-free tables and free the QPN in our table. + */ + if (!qp->ibqp.uobject) { + mthca_cq_clean(dev, recv_cq, qp->qpn, + qp->ibqp.srq ? to_msrq(qp->ibqp.srq) : NULL); + if (send_cq != recv_cq) + mthca_cq_clean(dev, send_cq, qp->qpn, NULL); + + mthca_free_memfree(dev, qp); + mthca_free_wqe_buf(dev, qp); + } + + mthca_unmap_memfree(dev, qp); + + if (is_sqp(dev, qp)) { + atomic_dec(&(to_mpd(qp->ibqp.pd)->sqp_count)); + dma_free_coherent(&dev->pdev->dev, + to_msqp(qp)->header_buf_size, + to_msqp(qp)->header_buf, + to_msqp(qp)->header_dma); + } else + mthca_free(&dev->qp_table.alloc, qp->qpn); +} + +/* Create UD header for an MLX send and build a data segment for it */ +static int build_mlx_header(struct mthca_dev *dev, struct mthca_sqp *sqp, + int ind, struct ib_send_wr *wr, + struct mthca_mlx_seg *mlx, + struct mthca_data_seg *data) +{ + int header_size; + int err; + u16 pkey; + + ib_ud_header_init(256, /* assume a MAD */ 1, 0, 0, + mthca_ah_grh_present(to_mah(wr->wr.ud.ah)), 0, + &sqp->ud_header); + + err = mthca_read_ah(dev, to_mah(wr->wr.ud.ah), &sqp->ud_header); + if (err) + return err; + mlx->flags &= ~cpu_to_be32(MTHCA_NEXT_SOLICIT | 1); + mlx->flags |= cpu_to_be32((!sqp->qp.ibqp.qp_num ? MTHCA_MLX_VL15 : 0) | + (sqp->ud_header.lrh.destination_lid == + IB_LID_PERMISSIVE ? MTHCA_MLX_SLR : 0) | + (sqp->ud_header.lrh.service_level << 8)); + mlx->rlid = sqp->ud_header.lrh.destination_lid; + mlx->vcrc = 0; + + switch (wr->opcode) { + case IB_WR_SEND: + sqp->ud_header.bth.opcode = IB_OPCODE_UD_SEND_ONLY; + sqp->ud_header.immediate_present = 0; + break; + case IB_WR_SEND_WITH_IMM: + sqp->ud_header.bth.opcode = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE; + sqp->ud_header.immediate_present = 1; + sqp->ud_header.immediate_data = wr->ex.imm_data; + break; + default: + return -EINVAL; + } + + sqp->ud_header.lrh.virtual_lane = !sqp->qp.ibqp.qp_num ? 15 : 0; + if (sqp->ud_header.lrh.destination_lid == IB_LID_PERMISSIVE) + sqp->ud_header.lrh.source_lid = IB_LID_PERMISSIVE; + sqp->ud_header.bth.solicited_event = !!(wr->send_flags & IB_SEND_SOLICITED); + if (!sqp->qp.ibqp.qp_num) + ib_get_cached_pkey(&dev->ib_dev, sqp->qp.port, + sqp->pkey_index, &pkey); + else + ib_get_cached_pkey(&dev->ib_dev, sqp->qp.port, + wr->wr.ud.pkey_index, &pkey); + sqp->ud_header.bth.pkey = cpu_to_be16(pkey); + sqp->ud_header.bth.destination_qpn = cpu_to_be32(wr->wr.ud.remote_qpn); + sqp->ud_header.bth.psn = cpu_to_be32((sqp->send_psn++) & ((1 << 24) - 1)); + sqp->ud_header.deth.qkey = cpu_to_be32(wr->wr.ud.remote_qkey & 0x80000000 ? + sqp->qkey : wr->wr.ud.remote_qkey); + sqp->ud_header.deth.source_qpn = cpu_to_be32(sqp->qp.ibqp.qp_num); + + header_size = ib_ud_header_pack(&sqp->ud_header, + sqp->header_buf + + ind * MTHCA_UD_HEADER_SIZE); + + data->byte_count = cpu_to_be32(header_size); + data->lkey = cpu_to_be32(to_mpd(sqp->qp.ibqp.pd)->ntmr.ibmr.lkey); + data->addr = cpu_to_be64(sqp->header_dma + + ind * MTHCA_UD_HEADER_SIZE); + + return 0; +} + +static inline int mthca_wq_overflow(struct mthca_wq *wq, int nreq, + struct ib_cq *ib_cq) +{ + unsigned cur; + struct mthca_cq *cq; + + cur = wq->head - wq->tail; + if (likely(cur + nreq < wq->max)) + return 0; + + cq = to_mcq(ib_cq); + spin_lock(&cq->lock); + cur = wq->head - wq->tail; + spin_unlock(&cq->lock); + + return cur + nreq >= wq->max; +} + +static __always_inline void set_raddr_seg(struct mthca_raddr_seg *rseg, + u64 remote_addr, u32 rkey) +{ + rseg->raddr = cpu_to_be64(remote_addr); + rseg->rkey = cpu_to_be32(rkey); + rseg->reserved = 0; +} + +static __always_inline void set_atomic_seg(struct mthca_atomic_seg *aseg, + struct ib_send_wr *wr) +{ + if (wr->opcode == IB_WR_ATOMIC_CMP_AND_SWP) { + aseg->swap_add = cpu_to_be64(wr->wr.atomic.swap); + aseg->compare = cpu_to_be64(wr->wr.atomic.compare_add); + } else { + aseg->swap_add = cpu_to_be64(wr->wr.atomic.compare_add); + aseg->compare = 0; + } + +} + +static void set_tavor_ud_seg(struct mthca_tavor_ud_seg *useg, + struct ib_send_wr *wr) +{ + useg->lkey = cpu_to_be32(to_mah(wr->wr.ud.ah)->key); + useg->av_addr = cpu_to_be64(to_mah(wr->wr.ud.ah)->avdma); + useg->dqpn = cpu_to_be32(wr->wr.ud.remote_qpn); + useg->qkey = cpu_to_be32(wr->wr.ud.remote_qkey); + +} + +static void set_arbel_ud_seg(struct mthca_arbel_ud_seg *useg, + struct ib_send_wr *wr) +{ + memcpy(useg->av, to_mah(wr->wr.ud.ah)->av, MTHCA_AV_SIZE); + useg->dqpn = cpu_to_be32(wr->wr.ud.remote_qpn); + useg->qkey = cpu_to_be32(wr->wr.ud.remote_qkey); +} + +int mthca_tavor_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, + struct ib_send_wr **bad_wr) +{ + struct mthca_dev *dev = to_mdev(ibqp->device); + struct mthca_qp *qp = to_mqp(ibqp); + void *wqe; + void *prev_wqe; + unsigned long flags; + int err = 0; + int nreq; + int i; + int size; + /* + * f0 and size0 are only used if nreq != 0, and they will + * always be initialized the first time through the main loop + * before nreq is incremented. So nreq cannot become non-zero + * without initializing f0 and size0, and they are in fact + * never used uninitialized. + */ + int uninitialized_var(size0); + u32 uninitialized_var(f0); + int ind; + u8 op0 = 0; + + spin_lock_irqsave(&qp->sq.lock, flags); + + /* XXX check that state is OK to post send */ + + ind = qp->sq.next_ind; + + for (nreq = 0; wr; ++nreq, wr = wr->next) { + if (mthca_wq_overflow(&qp->sq, nreq, qp->ibqp.send_cq)) { + mthca_err(dev, "SQ %06x full (%u head, %u tail," + " %d max, %d nreq)\n", qp->qpn, + qp->sq.head, qp->sq.tail, + qp->sq.max, nreq); + err = -ENOMEM; + *bad_wr = wr; + goto out; + } + + wqe = get_send_wqe(qp, ind); + prev_wqe = qp->sq.last; + qp->sq.last = wqe; + + ((struct mthca_next_seg *) wqe)->nda_op = 0; + ((struct mthca_next_seg *) wqe)->ee_nds = 0; + ((struct mthca_next_seg *) wqe)->flags = + ((wr->send_flags & IB_SEND_SIGNALED) ? + cpu_to_be32(MTHCA_NEXT_CQ_UPDATE) : 0) | + ((wr->send_flags & IB_SEND_SOLICITED) ? + cpu_to_be32(MTHCA_NEXT_SOLICIT) : 0) | + cpu_to_be32(1); + if (wr->opcode == IB_WR_SEND_WITH_IMM || + wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM) + ((struct mthca_next_seg *) wqe)->imm = wr->ex.imm_data; + + wqe += sizeof (struct mthca_next_seg); + size = sizeof (struct mthca_next_seg) / 16; + + switch (qp->transport) { + case RC: + switch (wr->opcode) { + case IB_WR_ATOMIC_CMP_AND_SWP: + case IB_WR_ATOMIC_FETCH_AND_ADD: + set_raddr_seg(wqe, wr->wr.atomic.remote_addr, + wr->wr.atomic.rkey); + wqe += sizeof (struct mthca_raddr_seg); + + set_atomic_seg(wqe, wr); + wqe += sizeof (struct mthca_atomic_seg); + size += (sizeof (struct mthca_raddr_seg) + + sizeof (struct mthca_atomic_seg)) / 16; + break; + + case IB_WR_RDMA_WRITE: + case IB_WR_RDMA_WRITE_WITH_IMM: + case IB_WR_RDMA_READ: + set_raddr_seg(wqe, wr->wr.rdma.remote_addr, + wr->wr.rdma.rkey); + wqe += sizeof (struct mthca_raddr_seg); + size += sizeof (struct mthca_raddr_seg) / 16; + break; + + default: + /* No extra segments required for sends */ + break; + } + + break; + + case UC: + switch (wr->opcode) { + case IB_WR_RDMA_WRITE: + case IB_WR_RDMA_WRITE_WITH_IMM: + set_raddr_seg(wqe, wr->wr.rdma.remote_addr, + wr->wr.rdma.rkey); + wqe += sizeof (struct mthca_raddr_seg); + size += sizeof (struct mthca_raddr_seg) / 16; + break; + + default: + /* No extra segments required for sends */ + break; + } + + break; + + case UD: + set_tavor_ud_seg(wqe, wr); + wqe += sizeof (struct mthca_tavor_ud_seg); + size += sizeof (struct mthca_tavor_ud_seg) / 16; + break; + + case MLX: + err = build_mlx_header(dev, to_msqp(qp), ind, wr, + wqe - sizeof (struct mthca_next_seg), + wqe); + if (err) { + *bad_wr = wr; + goto out; + } + wqe += sizeof (struct mthca_data_seg); + size += sizeof (struct mthca_data_seg) / 16; + break; + } + + if (wr->num_sge > qp->sq.max_gs) { + mthca_err(dev, "too many gathers\n"); + err = -EINVAL; + *bad_wr = wr; + goto out; + } + + for (i = 0; i < wr->num_sge; ++i) { + mthca_set_data_seg(wqe, wr->sg_list + i); + wqe += sizeof (struct mthca_data_seg); + size += sizeof (struct mthca_data_seg) / 16; + } + + /* Add one more inline data segment for ICRC */ + if (qp->transport == MLX) { + ((struct mthca_data_seg *) wqe)->byte_count = + cpu_to_be32((1 << 31) | 4); + ((u32 *) wqe)[1] = 0; + wqe += sizeof (struct mthca_data_seg); + size += sizeof (struct mthca_data_seg) / 16; + } + + qp->wrid[ind + qp->rq.max] = wr->wr_id; + + if (wr->opcode >= ARRAY_SIZE(mthca_opcode)) { + mthca_err(dev, "opcode invalid\n"); + err = -EINVAL; + *bad_wr = wr; + goto out; + } + + ((struct mthca_next_seg *) prev_wqe)->nda_op = + cpu_to_be32(((ind << qp->sq.wqe_shift) + + qp->send_wqe_offset) | + mthca_opcode[wr->opcode]); + wmb(); + ((struct mthca_next_seg *) prev_wqe)->ee_nds = + cpu_to_be32((nreq ? 0 : MTHCA_NEXT_DBD) | size | + ((wr->send_flags & IB_SEND_FENCE) ? + MTHCA_NEXT_FENCE : 0)); + + if (!nreq) { + size0 = size; + op0 = mthca_opcode[wr->opcode]; + f0 = wr->send_flags & IB_SEND_FENCE ? + MTHCA_SEND_DOORBELL_FENCE : 0; + } + + ++ind; + if (unlikely(ind >= qp->sq.max)) + ind -= qp->sq.max; + } + +out: + if (likely(nreq)) { + wmb(); + + mthca_write64(((qp->sq.next_ind << qp->sq.wqe_shift) + + qp->send_wqe_offset) | f0 | op0, + (qp->qpn << 8) | size0, + dev->kar + MTHCA_SEND_DOORBELL, + MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock)); + /* + * Make sure doorbells don't leak out of SQ spinlock + * and reach the HCA out of order: + */ + mmiowb(); + } + + qp->sq.next_ind = ind; + qp->sq.head += nreq; + + spin_unlock_irqrestore(&qp->sq.lock, flags); + return err; +} + +int mthca_tavor_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr) +{ + struct mthca_dev *dev = to_mdev(ibqp->device); + struct mthca_qp *qp = to_mqp(ibqp); + unsigned long flags; + int err = 0; + int nreq; + int i; + int size; + /* + * size0 is only used if nreq != 0, and it will always be + * initialized the first time through the main loop before + * nreq is incremented. So nreq cannot become non-zero + * without initializing size0, and it is in fact never used + * uninitialized. + */ + int uninitialized_var(size0); + int ind; + void *wqe; + void *prev_wqe; + + spin_lock_irqsave(&qp->rq.lock, flags); + + /* XXX check that state is OK to post receive */ + + ind = qp->rq.next_ind; + + for (nreq = 0; wr; wr = wr->next) { + if (mthca_wq_overflow(&qp->rq, nreq, qp->ibqp.recv_cq)) { + mthca_err(dev, "RQ %06x full (%u head, %u tail," + " %d max, %d nreq)\n", qp->qpn, + qp->rq.head, qp->rq.tail, + qp->rq.max, nreq); + err = -ENOMEM; + *bad_wr = wr; + goto out; + } + + wqe = get_recv_wqe(qp, ind); + prev_wqe = qp->rq.last; + qp->rq.last = wqe; + + ((struct mthca_next_seg *) wqe)->ee_nds = + cpu_to_be32(MTHCA_NEXT_DBD); + ((struct mthca_next_seg *) wqe)->flags = 0; + + wqe += sizeof (struct mthca_next_seg); + size = sizeof (struct mthca_next_seg) / 16; + + if (unlikely(wr->num_sge > qp->rq.max_gs)) { + err = -EINVAL; + *bad_wr = wr; + goto out; + } + + for (i = 0; i < wr->num_sge; ++i) { + mthca_set_data_seg(wqe, wr->sg_list + i); + wqe += sizeof (struct mthca_data_seg); + size += sizeof (struct mthca_data_seg) / 16; + } + + qp->wrid[ind] = wr->wr_id; + + ((struct mthca_next_seg *) prev_wqe)->ee_nds = + cpu_to_be32(MTHCA_NEXT_DBD | size); + + if (!nreq) + size0 = size; + + ++ind; + if (unlikely(ind >= qp->rq.max)) + ind -= qp->rq.max; + + ++nreq; + if (unlikely(nreq == MTHCA_TAVOR_MAX_WQES_PER_RECV_DB)) { + nreq = 0; + + wmb(); + + mthca_write64((qp->rq.next_ind << qp->rq.wqe_shift) | size0, + qp->qpn << 8, dev->kar + MTHCA_RECEIVE_DOORBELL, + MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock)); + + qp->rq.next_ind = ind; + qp->rq.head += MTHCA_TAVOR_MAX_WQES_PER_RECV_DB; + } + } + +out: + if (likely(nreq)) { + wmb(); + + mthca_write64((qp->rq.next_ind << qp->rq.wqe_shift) | size0, + qp->qpn << 8 | nreq, dev->kar + MTHCA_RECEIVE_DOORBELL, + MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock)); + } + + qp->rq.next_ind = ind; + qp->rq.head += nreq; + + /* + * Make sure doorbells don't leak out of RQ spinlock and reach + * the HCA out of order: + */ + mmiowb(); + + spin_unlock_irqrestore(&qp->rq.lock, flags); + return err; +} + +int mthca_arbel_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, + struct ib_send_wr **bad_wr) +{ + struct mthca_dev *dev = to_mdev(ibqp->device); + struct mthca_qp *qp = to_mqp(ibqp); + u32 dbhi; + void *wqe; + void *prev_wqe; + unsigned long flags; + int err = 0; + int nreq; + int i; + int size; + /* + * f0 and size0 are only used if nreq != 0, and they will + * always be initialized the first time through the main loop + * before nreq is incremented. So nreq cannot become non-zero + * without initializing f0 and size0, and they are in fact + * never used uninitialized. + */ + int uninitialized_var(size0); + u32 uninitialized_var(f0); + int ind; + u8 op0 = 0; + + spin_lock_irqsave(&qp->sq.lock, flags); + + /* XXX check that state is OK to post send */ + + ind = qp->sq.head & (qp->sq.max - 1); + + for (nreq = 0; wr; ++nreq, wr = wr->next) { + if (unlikely(nreq == MTHCA_ARBEL_MAX_WQES_PER_SEND_DB)) { + nreq = 0; + + dbhi = (MTHCA_ARBEL_MAX_WQES_PER_SEND_DB << 24) | + ((qp->sq.head & 0xffff) << 8) | f0 | op0; + + qp->sq.head += MTHCA_ARBEL_MAX_WQES_PER_SEND_DB; + + /* + * Make sure that descriptors are written before + * doorbell record. + */ + wmb(); + *qp->sq.db = cpu_to_be32(qp->sq.head & 0xffff); + + /* + * Make sure doorbell record is written before we + * write MMIO send doorbell. + */ + wmb(); + + mthca_write64(dbhi, (qp->qpn << 8) | size0, + dev->kar + MTHCA_SEND_DOORBELL, + MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock)); + } + + if (mthca_wq_overflow(&qp->sq, nreq, qp->ibqp.send_cq)) { + mthca_err(dev, "SQ %06x full (%u head, %u tail," + " %d max, %d nreq)\n", qp->qpn, + qp->sq.head, qp->sq.tail, + qp->sq.max, nreq); + err = -ENOMEM; + *bad_wr = wr; + goto out; + } + + wqe = get_send_wqe(qp, ind); + prev_wqe = qp->sq.last; + qp->sq.last = wqe; + + ((struct mthca_next_seg *) wqe)->flags = + ((wr->send_flags & IB_SEND_SIGNALED) ? + cpu_to_be32(MTHCA_NEXT_CQ_UPDATE) : 0) | + ((wr->send_flags & IB_SEND_SOLICITED) ? + cpu_to_be32(MTHCA_NEXT_SOLICIT) : 0) | + ((wr->send_flags & IB_SEND_IP_CSUM) ? + cpu_to_be32(MTHCA_NEXT_IP_CSUM | MTHCA_NEXT_TCP_UDP_CSUM) : 0) | + cpu_to_be32(1); + if (wr->opcode == IB_WR_SEND_WITH_IMM || + wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM) + ((struct mthca_next_seg *) wqe)->imm = wr->ex.imm_data; + + wqe += sizeof (struct mthca_next_seg); + size = sizeof (struct mthca_next_seg) / 16; + + switch (qp->transport) { + case RC: + switch (wr->opcode) { + case IB_WR_ATOMIC_CMP_AND_SWP: + case IB_WR_ATOMIC_FETCH_AND_ADD: + set_raddr_seg(wqe, wr->wr.atomic.remote_addr, + wr->wr.atomic.rkey); + wqe += sizeof (struct mthca_raddr_seg); + + set_atomic_seg(wqe, wr); + wqe += sizeof (struct mthca_atomic_seg); + size += (sizeof (struct mthca_raddr_seg) + + sizeof (struct mthca_atomic_seg)) / 16; + break; + + case IB_WR_RDMA_READ: + case IB_WR_RDMA_WRITE: + case IB_WR_RDMA_WRITE_WITH_IMM: + set_raddr_seg(wqe, wr->wr.rdma.remote_addr, + wr->wr.rdma.rkey); + wqe += sizeof (struct mthca_raddr_seg); + size += sizeof (struct mthca_raddr_seg) / 16; + break; + + default: + /* No extra segments required for sends */ + break; + } + + break; + + case UC: + switch (wr->opcode) { + case IB_WR_RDMA_WRITE: + case IB_WR_RDMA_WRITE_WITH_IMM: + set_raddr_seg(wqe, wr->wr.rdma.remote_addr, + wr->wr.rdma.rkey); + wqe += sizeof (struct mthca_raddr_seg); + size += sizeof (struct mthca_raddr_seg) / 16; + break; + + default: + /* No extra segments required for sends */ + break; + } + + break; + + case UD: + set_arbel_ud_seg(wqe, wr); + wqe += sizeof (struct mthca_arbel_ud_seg); + size += sizeof (struct mthca_arbel_ud_seg) / 16; + break; + + case MLX: + err = build_mlx_header(dev, to_msqp(qp), ind, wr, + wqe - sizeof (struct mthca_next_seg), + wqe); + if (err) { + *bad_wr = wr; + goto out; + } + wqe += sizeof (struct mthca_data_seg); + size += sizeof (struct mthca_data_seg) / 16; + break; + } + + if (wr->num_sge > qp->sq.max_gs) { + mthca_err(dev, "too many gathers\n"); + err = -EINVAL; + *bad_wr = wr; + goto out; + } + + for (i = 0; i < wr->num_sge; ++i) { + mthca_set_data_seg(wqe, wr->sg_list + i); + wqe += sizeof (struct mthca_data_seg); + size += sizeof (struct mthca_data_seg) / 16; + } + + /* Add one more inline data segment for ICRC */ + if (qp->transport == MLX) { + ((struct mthca_data_seg *) wqe)->byte_count = + cpu_to_be32((1 << 31) | 4); + ((u32 *) wqe)[1] = 0; + wqe += sizeof (struct mthca_data_seg); + size += sizeof (struct mthca_data_seg) / 16; + } + + qp->wrid[ind + qp->rq.max] = wr->wr_id; + + if (wr->opcode >= ARRAY_SIZE(mthca_opcode)) { + mthca_err(dev, "opcode invalid\n"); + err = -EINVAL; + *bad_wr = wr; + goto out; + } + + ((struct mthca_next_seg *) prev_wqe)->nda_op = + cpu_to_be32(((ind << qp->sq.wqe_shift) + + qp->send_wqe_offset) | + mthca_opcode[wr->opcode]); + wmb(); + ((struct mthca_next_seg *) prev_wqe)->ee_nds = + cpu_to_be32(MTHCA_NEXT_DBD | size | + ((wr->send_flags & IB_SEND_FENCE) ? + MTHCA_NEXT_FENCE : 0)); + + if (!nreq) { + size0 = size; + op0 = mthca_opcode[wr->opcode]; + f0 = wr->send_flags & IB_SEND_FENCE ? + MTHCA_SEND_DOORBELL_FENCE : 0; + } + + ++ind; + if (unlikely(ind >= qp->sq.max)) + ind -= qp->sq.max; + } + +out: + if (likely(nreq)) { + dbhi = (nreq << 24) | ((qp->sq.head & 0xffff) << 8) | f0 | op0; + + qp->sq.head += nreq; + + /* + * Make sure that descriptors are written before + * doorbell record. + */ + wmb(); + *qp->sq.db = cpu_to_be32(qp->sq.head & 0xffff); + + /* + * Make sure doorbell record is written before we + * write MMIO send doorbell. + */ + wmb(); + + mthca_write64(dbhi, (qp->qpn << 8) | size0, dev->kar + MTHCA_SEND_DOORBELL, + MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock)); + } + + /* + * Make sure doorbells don't leak out of SQ spinlock and reach + * the HCA out of order: + */ + mmiowb(); + + spin_unlock_irqrestore(&qp->sq.lock, flags); + return err; +} + +int mthca_arbel_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr) +{ + struct mthca_dev *dev = to_mdev(ibqp->device); + struct mthca_qp *qp = to_mqp(ibqp); + unsigned long flags; + int err = 0; + int nreq; + int ind; + int i; + void *wqe; + + spin_lock_irqsave(&qp->rq.lock, flags); + + /* XXX check that state is OK to post receive */ + + ind = qp->rq.head & (qp->rq.max - 1); + + for (nreq = 0; wr; ++nreq, wr = wr->next) { + if (mthca_wq_overflow(&qp->rq, nreq, qp->ibqp.recv_cq)) { + mthca_err(dev, "RQ %06x full (%u head, %u tail," + " %d max, %d nreq)\n", qp->qpn, + qp->rq.head, qp->rq.tail, + qp->rq.max, nreq); + err = -ENOMEM; + *bad_wr = wr; + goto out; + } + + wqe = get_recv_wqe(qp, ind); + + ((struct mthca_next_seg *) wqe)->flags = 0; + + wqe += sizeof (struct mthca_next_seg); + + if (unlikely(wr->num_sge > qp->rq.max_gs)) { + err = -EINVAL; + *bad_wr = wr; + goto out; + } + + for (i = 0; i < wr->num_sge; ++i) { + mthca_set_data_seg(wqe, wr->sg_list + i); + wqe += sizeof (struct mthca_data_seg); + } + + if (i < qp->rq.max_gs) + mthca_set_data_seg_inval(wqe); + + qp->wrid[ind] = wr->wr_id; + + ++ind; + if (unlikely(ind >= qp->rq.max)) + ind -= qp->rq.max; + } +out: + if (likely(nreq)) { + qp->rq.head += nreq; + + /* + * Make sure that descriptors are written before + * doorbell record. + */ + wmb(); + *qp->rq.db = cpu_to_be32(qp->rq.head & 0xffff); + } + + spin_unlock_irqrestore(&qp->rq.lock, flags); + return err; +} + +void mthca_free_err_wqe(struct mthca_dev *dev, struct mthca_qp *qp, int is_send, + int index, int *dbd, __be32 *new_wqe) +{ + struct mthca_next_seg *next; + + /* + * For SRQs, all receive WQEs generate a CQE, so we're always + * at the end of the doorbell chain. + */ + if (qp->ibqp.srq && !is_send) { + *new_wqe = 0; + return; + } + + if (is_send) + next = get_send_wqe(qp, index); + else + next = get_recv_wqe(qp, index); + + *dbd = !!(next->ee_nds & cpu_to_be32(MTHCA_NEXT_DBD)); + if (next->ee_nds & cpu_to_be32(0x3f)) + *new_wqe = (next->nda_op & cpu_to_be32(~0x3f)) | + (next->ee_nds & cpu_to_be32(0x3f)); + else + *new_wqe = 0; +} + +int mthca_init_qp_table(struct mthca_dev *dev) +{ + int err; + int i; + + spin_lock_init(&dev->qp_table.lock); + + /* + * We reserve 2 extra QPs per port for the special QPs. The + * special QP for port 1 has to be even, so round up. + */ + dev->qp_table.sqp_start = (dev->limits.reserved_qps + 1) & ~1UL; + err = mthca_alloc_init(&dev->qp_table.alloc, + dev->limits.num_qps, + (1 << 24) - 1, + dev->qp_table.sqp_start + + MTHCA_MAX_PORTS * 2); + if (err) + return err; + + err = mthca_array_init(&dev->qp_table.qp, + dev->limits.num_qps); + if (err) { + mthca_alloc_cleanup(&dev->qp_table.alloc); + return err; + } + + for (i = 0; i < 2; ++i) { + err = mthca_CONF_SPECIAL_QP(dev, i ? IB_QPT_GSI : IB_QPT_SMI, + dev->qp_table.sqp_start + i * 2); + if (err) { + mthca_warn(dev, "CONF_SPECIAL_QP returned " + "%d, aborting.\n", err); + goto err_out; + } + } + return 0; + + err_out: + for (i = 0; i < 2; ++i) + mthca_CONF_SPECIAL_QP(dev, i, 0); + + mthca_array_cleanup(&dev->qp_table.qp, dev->limits.num_qps); + mthca_alloc_cleanup(&dev->qp_table.alloc); + + return err; +} + +void mthca_cleanup_qp_table(struct mthca_dev *dev) +{ + int i; + + for (i = 0; i < 2; ++i) + mthca_CONF_SPECIAL_QP(dev, i, 0); + + mthca_array_cleanup(&dev->qp_table.qp, dev->limits.num_qps); + mthca_alloc_cleanup(&dev->qp_table.alloc); +} diff --git a/drivers/infiniband/hw/mthca/mthca_reset.c b/drivers/infiniband/hw/mthca/mthca_reset.c new file mode 100644 index 0000000..74c6a94 --- /dev/null +++ b/drivers/infiniband/hw/mthca/mthca_reset.c @@ -0,0 +1,288 @@ +/* + * Copyright (c) 2004 Topspin Communications. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include + +#include "mthca_dev.h" +#include "mthca_cmd.h" + +int mthca_reset(struct mthca_dev *mdev) +{ + int i; + int err = 0; + u32 *hca_header = NULL; + u32 *bridge_header = NULL; + struct pci_dev *bridge = NULL; + int bridge_pcix_cap = 0; + int hca_pcie_cap = 0; + int hca_pcix_cap = 0; + + u16 devctl; + u16 linkctl; + +#define MTHCA_RESET_OFFSET 0xf0010 +#define MTHCA_RESET_VALUE swab32(1) + + /* + * Reset the chip. This is somewhat ugly because we have to + * save off the PCI header before reset and then restore it + * after the chip reboots. We skip config space offsets 22 + * and 23 since those have a special meaning. + * + * To make matters worse, for Tavor (PCI-X HCA) we have to + * find the associated bridge device and save off its PCI + * header as well. + */ + + if (!(mdev->mthca_flags & MTHCA_FLAG_PCIE)) { + /* Look for the bridge -- its device ID will be 2 more + than HCA's device ID. */ + while ((bridge = pci_get_device(mdev->pdev->vendor, + mdev->pdev->device + 2, + bridge)) != NULL) { + if (bridge->hdr_type == PCI_HEADER_TYPE_BRIDGE && + bridge->subordinate == mdev->pdev->bus) { + mthca_dbg(mdev, "Found bridge: %s\n", + pci_name(bridge)); + break; + } + } + + if (!bridge) { + /* + * Didn't find a bridge for a Tavor device -- + * assume we're in no-bridge mode and hope for + * the best. + */ + mthca_warn(mdev, "No bridge found for %s\n", + pci_name(mdev->pdev)); + } + + } + + /* For Arbel do we need to save off the full 4K PCI Express header?? */ + hca_header = kmalloc(256, GFP_KERNEL); + if (!hca_header) { + err = -ENOMEM; + mthca_err(mdev, "Couldn't allocate memory to save HCA " + "PCI header, aborting.\n"); + goto out; + } + + for (i = 0; i < 64; ++i) { + if (i == 22 || i == 23) + continue; + if (pci_read_config_dword(mdev->pdev, i * 4, hca_header + i)) { + err = -ENODEV; + mthca_err(mdev, "Couldn't save HCA " + "PCI header, aborting.\n"); + goto out; + } + } + + hca_pcix_cap = pci_find_capability(mdev->pdev, PCI_CAP_ID_PCIX); + hca_pcie_cap = pci_pcie_cap(mdev->pdev); + + if (bridge) { + bridge_header = kmalloc(256, GFP_KERNEL); + if (!bridge_header) { + err = -ENOMEM; + mthca_err(mdev, "Couldn't allocate memory to save HCA " + "bridge PCI header, aborting.\n"); + goto out; + } + + for (i = 0; i < 64; ++i) { + if (i == 22 || i == 23) + continue; + if (pci_read_config_dword(bridge, i * 4, bridge_header + i)) { + err = -ENODEV; + mthca_err(mdev, "Couldn't save HCA bridge " + "PCI header, aborting.\n"); + goto out; + } + } + bridge_pcix_cap = pci_find_capability(bridge, PCI_CAP_ID_PCIX); + if (!bridge_pcix_cap) { + err = -ENODEV; + mthca_err(mdev, "Couldn't locate HCA bridge " + "PCI-X capability, aborting.\n"); + goto out; + } + } + + /* actually hit reset */ + { + void __iomem *reset = ioremap(pci_resource_start(mdev->pdev, 0) + + MTHCA_RESET_OFFSET, 4); + + if (!reset) { + err = -ENOMEM; + mthca_err(mdev, "Couldn't map HCA reset register, " + "aborting.\n"); + goto out; + } + + writel(MTHCA_RESET_VALUE, reset); + iounmap(reset); + } + + /* Docs say to wait one second before accessing device */ + msleep(1000); + + /* Now wait for PCI device to start responding again */ + { + u32 v; + int c = 0; + + for (c = 0; c < 100; ++c) { + if (pci_read_config_dword(bridge ? bridge : mdev->pdev, 0, &v)) { + err = -ENODEV; + mthca_err(mdev, "Couldn't access HCA after reset, " + "aborting.\n"); + goto out; + } + + if (v != 0xffffffff) + goto good; + + msleep(100); + } + + err = -ENODEV; + mthca_err(mdev, "PCI device did not come back after reset, " + "aborting.\n"); + goto out; + } + +good: + /* Now restore the PCI headers */ + if (bridge) { + if (pci_write_config_dword(bridge, bridge_pcix_cap + 0x8, + bridge_header[(bridge_pcix_cap + 0x8) / 4])) { + err = -ENODEV; + mthca_err(mdev, "Couldn't restore HCA bridge Upstream " + "split transaction control, aborting.\n"); + goto out; + } + if (pci_write_config_dword(bridge, bridge_pcix_cap + 0xc, + bridge_header[(bridge_pcix_cap + 0xc) / 4])) { + err = -ENODEV; + mthca_err(mdev, "Couldn't restore HCA bridge Downstream " + "split transaction control, aborting.\n"); + goto out; + } + /* + * Bridge control register is at 0x3e, so we'll + * naturally restore it last in this loop. + */ + for (i = 0; i < 16; ++i) { + if (i * 4 == PCI_COMMAND) + continue; + + if (pci_write_config_dword(bridge, i * 4, bridge_header[i])) { + err = -ENODEV; + mthca_err(mdev, "Couldn't restore HCA bridge reg %x, " + "aborting.\n", i); + goto out; + } + } + + if (pci_write_config_dword(bridge, PCI_COMMAND, + bridge_header[PCI_COMMAND / 4])) { + err = -ENODEV; + mthca_err(mdev, "Couldn't restore HCA bridge COMMAND, " + "aborting.\n"); + goto out; + } + } + + if (hca_pcix_cap) { + if (pci_write_config_dword(mdev->pdev, hca_pcix_cap, + hca_header[hca_pcix_cap / 4])) { + err = -ENODEV; + mthca_err(mdev, "Couldn't restore HCA PCI-X " + "command register, aborting.\n"); + goto out; + } + } + + if (hca_pcie_cap) { + devctl = hca_header[(hca_pcie_cap + PCI_EXP_DEVCTL) / 4]; + if (pcie_capability_write_word(mdev->pdev, PCI_EXP_DEVCTL, + devctl)) { + err = -ENODEV; + mthca_err(mdev, "Couldn't restore HCA PCI Express " + "Device Control register, aborting.\n"); + goto out; + } + linkctl = hca_header[(hca_pcie_cap + PCI_EXP_LNKCTL) / 4]; + if (pcie_capability_write_word(mdev->pdev, PCI_EXP_LNKCTL, + linkctl)) { + err = -ENODEV; + mthca_err(mdev, "Couldn't restore HCA PCI Express " + "Link control register, aborting.\n"); + goto out; + } + } + + for (i = 0; i < 16; ++i) { + if (i * 4 == PCI_COMMAND) + continue; + + if (pci_write_config_dword(mdev->pdev, i * 4, hca_header[i])) { + err = -ENODEV; + mthca_err(mdev, "Couldn't restore HCA reg %x, " + "aborting.\n", i); + goto out; + } + } + + if (pci_write_config_dword(mdev->pdev, PCI_COMMAND, + hca_header[PCI_COMMAND / 4])) { + err = -ENODEV; + mthca_err(mdev, "Couldn't restore HCA COMMAND, " + "aborting.\n"); + goto out; + } + +out: + if (bridge) + pci_dev_put(bridge); + kfree(bridge_header); + kfree(hca_header); + + return err; +} diff --git a/drivers/infiniband/hw/mthca/mthca_srq.c b/drivers/infiniband/hw/mthca/mthca_srq.c new file mode 100644 index 0000000..d22f970 --- /dev/null +++ b/drivers/infiniband/hw/mthca/mthca_srq.c @@ -0,0 +1,696 @@ +/* + * Copyright (c) 2005 Cisco Systems. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include + +#include + +#include "mthca_dev.h" +#include "mthca_cmd.h" +#include "mthca_memfree.h" +#include "mthca_wqe.h" + +enum { + MTHCA_MAX_DIRECT_SRQ_SIZE = 4 * PAGE_SIZE +}; + +struct mthca_tavor_srq_context { + __be64 wqe_base_ds; /* low 6 bits is descriptor size */ + __be32 state_pd; + __be32 lkey; + __be32 uar; + __be16 limit_watermark; + __be16 wqe_cnt; + u32 reserved[2]; +}; + +struct mthca_arbel_srq_context { + __be32 state_logsize_srqn; + __be32 lkey; + __be32 db_index; + __be32 logstride_usrpage; + __be64 wqe_base; + __be32 eq_pd; + __be16 limit_watermark; + __be16 wqe_cnt; + u16 reserved1; + __be16 wqe_counter; + u32 reserved2[3]; +}; + +static void *get_wqe(struct mthca_srq *srq, int n) +{ + if (srq->is_direct) + return srq->queue.direct.buf + (n << srq->wqe_shift); + else + return srq->queue.page_list[(n << srq->wqe_shift) >> PAGE_SHIFT].buf + + ((n << srq->wqe_shift) & (PAGE_SIZE - 1)); +} + +/* + * Return a pointer to the location within a WQE that we're using as a + * link when the WQE is in the free list. We use the imm field + * because in the Tavor case, posting a WQE may overwrite the next + * segment of the previous WQE, but a receive WQE will never touch the + * imm field. This avoids corrupting our free list if the previous + * WQE has already completed and been put on the free list when we + * post the next WQE. + */ +static inline int *wqe_to_link(void *wqe) +{ + return (int *) (wqe + offsetof(struct mthca_next_seg, imm)); +} + +static void mthca_tavor_init_srq_context(struct mthca_dev *dev, + struct mthca_pd *pd, + struct mthca_srq *srq, + struct mthca_tavor_srq_context *context) +{ + memset(context, 0, sizeof *context); + + context->wqe_base_ds = cpu_to_be64(1 << (srq->wqe_shift - 4)); + context->state_pd = cpu_to_be32(pd->pd_num); + context->lkey = cpu_to_be32(srq->mr.ibmr.lkey); + + if (pd->ibpd.uobject) + context->uar = + cpu_to_be32(to_mucontext(pd->ibpd.uobject->context)->uar.index); + else + context->uar = cpu_to_be32(dev->driver_uar.index); +} + +static void mthca_arbel_init_srq_context(struct mthca_dev *dev, + struct mthca_pd *pd, + struct mthca_srq *srq, + struct mthca_arbel_srq_context *context) +{ + int logsize, max; + + memset(context, 0, sizeof *context); + + /* + * Put max in a temporary variable to work around gcc bug + * triggered by ilog2() on sparc64. + */ + max = srq->max; + logsize = ilog2(max); + context->state_logsize_srqn = cpu_to_be32(logsize << 24 | srq->srqn); + context->lkey = cpu_to_be32(srq->mr.ibmr.lkey); + context->db_index = cpu_to_be32(srq->db_index); + context->logstride_usrpage = cpu_to_be32((srq->wqe_shift - 4) << 29); + if (pd->ibpd.uobject) + context->logstride_usrpage |= + cpu_to_be32(to_mucontext(pd->ibpd.uobject->context)->uar.index); + else + context->logstride_usrpage |= cpu_to_be32(dev->driver_uar.index); + context->eq_pd = cpu_to_be32(MTHCA_EQ_ASYNC << 24 | pd->pd_num); +} + +static void mthca_free_srq_buf(struct mthca_dev *dev, struct mthca_srq *srq) +{ + mthca_buf_free(dev, srq->max << srq->wqe_shift, &srq->queue, + srq->is_direct, &srq->mr); + kfree(srq->wrid); +} + +static int mthca_alloc_srq_buf(struct mthca_dev *dev, struct mthca_pd *pd, + struct mthca_srq *srq) +{ + struct mthca_data_seg *scatter; + void *wqe; + int err; + int i; + + if (pd->ibpd.uobject) + return 0; + + srq->wrid = kmalloc(srq->max * sizeof (u64), GFP_KERNEL); + if (!srq->wrid) + return -ENOMEM; + + err = mthca_buf_alloc(dev, srq->max << srq->wqe_shift, + MTHCA_MAX_DIRECT_SRQ_SIZE, + &srq->queue, &srq->is_direct, pd, 1, &srq->mr); + if (err) { + kfree(srq->wrid); + return err; + } + + /* + * Now initialize the SRQ buffer so that all of the WQEs are + * linked into the list of free WQEs. In addition, set the + * scatter list L_Keys to the sentry value of 0x100. + */ + for (i = 0; i < srq->max; ++i) { + struct mthca_next_seg *next; + + next = wqe = get_wqe(srq, i); + + if (i < srq->max - 1) { + *wqe_to_link(wqe) = i + 1; + next->nda_op = htonl(((i + 1) << srq->wqe_shift) | 1); + } else { + *wqe_to_link(wqe) = -1; + next->nda_op = 0; + } + + for (scatter = wqe + sizeof (struct mthca_next_seg); + (void *) scatter < wqe + (1 << srq->wqe_shift); + ++scatter) + scatter->lkey = cpu_to_be32(MTHCA_INVAL_LKEY); + } + + srq->last = get_wqe(srq, srq->max - 1); + + return 0; +} + +int mthca_alloc_srq(struct mthca_dev *dev, struct mthca_pd *pd, + struct ib_srq_attr *attr, struct mthca_srq *srq) +{ + struct mthca_mailbox *mailbox; + int ds; + int err; + + /* Sanity check SRQ size before proceeding */ + if (attr->max_wr > dev->limits.max_srq_wqes || + attr->max_sge > dev->limits.max_srq_sge) + return -EINVAL; + + srq->max = attr->max_wr; + srq->max_gs = attr->max_sge; + srq->counter = 0; + + if (mthca_is_memfree(dev)) + srq->max = roundup_pow_of_two(srq->max + 1); + else + srq->max = srq->max + 1; + + ds = max(64UL, + roundup_pow_of_two(sizeof (struct mthca_next_seg) + + srq->max_gs * sizeof (struct mthca_data_seg))); + + if (!mthca_is_memfree(dev) && (ds > dev->limits.max_desc_sz)) + return -EINVAL; + + srq->wqe_shift = ilog2(ds); + + srq->srqn = mthca_alloc(&dev->srq_table.alloc); + if (srq->srqn == -1) + return -ENOMEM; + + if (mthca_is_memfree(dev)) { + err = mthca_table_get(dev, dev->srq_table.table, srq->srqn); + if (err) + goto err_out; + + if (!pd->ibpd.uobject) { + srq->db_index = mthca_alloc_db(dev, MTHCA_DB_TYPE_SRQ, + srq->srqn, &srq->db); + if (srq->db_index < 0) { + err = -ENOMEM; + goto err_out_icm; + } + } + } + + mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL); + if (IS_ERR(mailbox)) { + err = PTR_ERR(mailbox); + goto err_out_db; + } + + err = mthca_alloc_srq_buf(dev, pd, srq); + if (err) + goto err_out_mailbox; + + spin_lock_init(&srq->lock); + srq->refcount = 1; + init_waitqueue_head(&srq->wait); + mutex_init(&srq->mutex); + + if (mthca_is_memfree(dev)) + mthca_arbel_init_srq_context(dev, pd, srq, mailbox->buf); + else + mthca_tavor_init_srq_context(dev, pd, srq, mailbox->buf); + + err = mthca_SW2HW_SRQ(dev, mailbox, srq->srqn); + + if (err) { + mthca_warn(dev, "SW2HW_SRQ failed (%d)\n", err); + goto err_out_free_buf; + } + + spin_lock_irq(&dev->srq_table.lock); + if (mthca_array_set(&dev->srq_table.srq, + srq->srqn & (dev->limits.num_srqs - 1), + srq)) { + spin_unlock_irq(&dev->srq_table.lock); + goto err_out_free_srq; + } + spin_unlock_irq(&dev->srq_table.lock); + + mthca_free_mailbox(dev, mailbox); + + srq->first_free = 0; + srq->last_free = srq->max - 1; + + attr->max_wr = srq->max - 1; + attr->max_sge = srq->max_gs; + + return 0; + +err_out_free_srq: + err = mthca_HW2SW_SRQ(dev, mailbox, srq->srqn); + if (err) + mthca_warn(dev, "HW2SW_SRQ failed (%d)\n", err); + +err_out_free_buf: + if (!pd->ibpd.uobject) + mthca_free_srq_buf(dev, srq); + +err_out_mailbox: + mthca_free_mailbox(dev, mailbox); + +err_out_db: + if (!pd->ibpd.uobject && mthca_is_memfree(dev)) + mthca_free_db(dev, MTHCA_DB_TYPE_SRQ, srq->db_index); + +err_out_icm: + mthca_table_put(dev, dev->srq_table.table, srq->srqn); + +err_out: + mthca_free(&dev->srq_table.alloc, srq->srqn); + + return err; +} + +static inline int get_srq_refcount(struct mthca_dev *dev, struct mthca_srq *srq) +{ + int c; + + spin_lock_irq(&dev->srq_table.lock); + c = srq->refcount; + spin_unlock_irq(&dev->srq_table.lock); + + return c; +} + +void mthca_free_srq(struct mthca_dev *dev, struct mthca_srq *srq) +{ + struct mthca_mailbox *mailbox; + int err; + + mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL); + if (IS_ERR(mailbox)) { + mthca_warn(dev, "No memory for mailbox to free SRQ.\n"); + return; + } + + err = mthca_HW2SW_SRQ(dev, mailbox, srq->srqn); + if (err) + mthca_warn(dev, "HW2SW_SRQ failed (%d)\n", err); + + spin_lock_irq(&dev->srq_table.lock); + mthca_array_clear(&dev->srq_table.srq, + srq->srqn & (dev->limits.num_srqs - 1)); + --srq->refcount; + spin_unlock_irq(&dev->srq_table.lock); + + wait_event(srq->wait, !get_srq_refcount(dev, srq)); + + if (!srq->ibsrq.uobject) { + mthca_free_srq_buf(dev, srq); + if (mthca_is_memfree(dev)) + mthca_free_db(dev, MTHCA_DB_TYPE_SRQ, srq->db_index); + } + + mthca_table_put(dev, dev->srq_table.table, srq->srqn); + mthca_free(&dev->srq_table.alloc, srq->srqn); + mthca_free_mailbox(dev, mailbox); +} + +int mthca_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, + enum ib_srq_attr_mask attr_mask, struct ib_udata *udata) +{ + struct mthca_dev *dev = to_mdev(ibsrq->device); + struct mthca_srq *srq = to_msrq(ibsrq); + int ret = 0; + + /* We don't support resizing SRQs (yet?) */ + if (attr_mask & IB_SRQ_MAX_WR) + return -EINVAL; + + if (attr_mask & IB_SRQ_LIMIT) { + u32 max_wr = mthca_is_memfree(dev) ? srq->max - 1 : srq->max; + if (attr->srq_limit > max_wr) + return -EINVAL; + + mutex_lock(&srq->mutex); + ret = mthca_ARM_SRQ(dev, srq->srqn, attr->srq_limit); + mutex_unlock(&srq->mutex); + } + + return ret; +} + +int mthca_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *srq_attr) +{ + struct mthca_dev *dev = to_mdev(ibsrq->device); + struct mthca_srq *srq = to_msrq(ibsrq); + struct mthca_mailbox *mailbox; + struct mthca_arbel_srq_context *arbel_ctx; + struct mthca_tavor_srq_context *tavor_ctx; + int err; + + mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + + err = mthca_QUERY_SRQ(dev, srq->srqn, mailbox); + if (err) + goto out; + + if (mthca_is_memfree(dev)) { + arbel_ctx = mailbox->buf; + srq_attr->srq_limit = be16_to_cpu(arbel_ctx->limit_watermark); + } else { + tavor_ctx = mailbox->buf; + srq_attr->srq_limit = be16_to_cpu(tavor_ctx->limit_watermark); + } + + srq_attr->max_wr = srq->max - 1; + srq_attr->max_sge = srq->max_gs; + +out: + mthca_free_mailbox(dev, mailbox); + + return err; +} + +void mthca_srq_event(struct mthca_dev *dev, u32 srqn, + enum ib_event_type event_type) +{ + struct mthca_srq *srq; + struct ib_event event; + + spin_lock(&dev->srq_table.lock); + srq = mthca_array_get(&dev->srq_table.srq, srqn & (dev->limits.num_srqs - 1)); + if (srq) + ++srq->refcount; + spin_unlock(&dev->srq_table.lock); + + if (!srq) { + mthca_warn(dev, "Async event for bogus SRQ %08x\n", srqn); + return; + } + + if (!srq->ibsrq.event_handler) + goto out; + + event.device = &dev->ib_dev; + event.event = event_type; + event.element.srq = &srq->ibsrq; + srq->ibsrq.event_handler(&event, srq->ibsrq.srq_context); + +out: + spin_lock(&dev->srq_table.lock); + if (!--srq->refcount) + wake_up(&srq->wait); + spin_unlock(&dev->srq_table.lock); +} + +/* + * This function must be called with IRQs disabled. + */ +void mthca_free_srq_wqe(struct mthca_srq *srq, u32 wqe_addr) +{ + int ind; + struct mthca_next_seg *last_free; + + ind = wqe_addr >> srq->wqe_shift; + + spin_lock(&srq->lock); + + last_free = get_wqe(srq, srq->last_free); + *wqe_to_link(last_free) = ind; + last_free->nda_op = htonl((ind << srq->wqe_shift) | 1); + *wqe_to_link(get_wqe(srq, ind)) = -1; + srq->last_free = ind; + + spin_unlock(&srq->lock); +} + +int mthca_tavor_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr) +{ + struct mthca_dev *dev = to_mdev(ibsrq->device); + struct mthca_srq *srq = to_msrq(ibsrq); + unsigned long flags; + int err = 0; + int first_ind; + int ind; + int next_ind; + int nreq; + int i; + void *wqe; + void *prev_wqe; + + spin_lock_irqsave(&srq->lock, flags); + + first_ind = srq->first_free; + + for (nreq = 0; wr; wr = wr->next) { + ind = srq->first_free; + wqe = get_wqe(srq, ind); + next_ind = *wqe_to_link(wqe); + + if (unlikely(next_ind < 0)) { + mthca_err(dev, "SRQ %06x full\n", srq->srqn); + err = -ENOMEM; + *bad_wr = wr; + break; + } + + prev_wqe = srq->last; + srq->last = wqe; + + ((struct mthca_next_seg *) wqe)->ee_nds = 0; + /* flags field will always remain 0 */ + + wqe += sizeof (struct mthca_next_seg); + + if (unlikely(wr->num_sge > srq->max_gs)) { + err = -EINVAL; + *bad_wr = wr; + srq->last = prev_wqe; + break; + } + + for (i = 0; i < wr->num_sge; ++i) { + mthca_set_data_seg(wqe, wr->sg_list + i); + wqe += sizeof (struct mthca_data_seg); + } + + if (i < srq->max_gs) + mthca_set_data_seg_inval(wqe); + + ((struct mthca_next_seg *) prev_wqe)->ee_nds = + cpu_to_be32(MTHCA_NEXT_DBD); + + srq->wrid[ind] = wr->wr_id; + srq->first_free = next_ind; + + ++nreq; + if (unlikely(nreq == MTHCA_TAVOR_MAX_WQES_PER_RECV_DB)) { + nreq = 0; + + /* + * Make sure that descriptors are written + * before doorbell is rung. + */ + wmb(); + + mthca_write64(first_ind << srq->wqe_shift, srq->srqn << 8, + dev->kar + MTHCA_RECEIVE_DOORBELL, + MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock)); + + first_ind = srq->first_free; + } + } + + if (likely(nreq)) { + /* + * Make sure that descriptors are written before + * doorbell is rung. + */ + wmb(); + + mthca_write64(first_ind << srq->wqe_shift, (srq->srqn << 8) | nreq, + dev->kar + MTHCA_RECEIVE_DOORBELL, + MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock)); + } + + /* + * Make sure doorbells don't leak out of SRQ spinlock and + * reach the HCA out of order: + */ + mmiowb(); + + spin_unlock_irqrestore(&srq->lock, flags); + return err; +} + +int mthca_arbel_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr) +{ + struct mthca_dev *dev = to_mdev(ibsrq->device); + struct mthca_srq *srq = to_msrq(ibsrq); + unsigned long flags; + int err = 0; + int ind; + int next_ind; + int nreq; + int i; + void *wqe; + + spin_lock_irqsave(&srq->lock, flags); + + for (nreq = 0; wr; ++nreq, wr = wr->next) { + ind = srq->first_free; + wqe = get_wqe(srq, ind); + next_ind = *wqe_to_link(wqe); + + if (unlikely(next_ind < 0)) { + mthca_err(dev, "SRQ %06x full\n", srq->srqn); + err = -ENOMEM; + *bad_wr = wr; + break; + } + + ((struct mthca_next_seg *) wqe)->ee_nds = 0; + /* flags field will always remain 0 */ + + wqe += sizeof (struct mthca_next_seg); + + if (unlikely(wr->num_sge > srq->max_gs)) { + err = -EINVAL; + *bad_wr = wr; + break; + } + + for (i = 0; i < wr->num_sge; ++i) { + mthca_set_data_seg(wqe, wr->sg_list + i); + wqe += sizeof (struct mthca_data_seg); + } + + if (i < srq->max_gs) + mthca_set_data_seg_inval(wqe); + + srq->wrid[ind] = wr->wr_id; + srq->first_free = next_ind; + } + + if (likely(nreq)) { + srq->counter += nreq; + + /* + * Make sure that descriptors are written before + * we write doorbell record. + */ + wmb(); + *srq->db = cpu_to_be32(srq->counter); + } + + spin_unlock_irqrestore(&srq->lock, flags); + return err; +} + +int mthca_max_srq_sge(struct mthca_dev *dev) +{ + if (mthca_is_memfree(dev)) + return dev->limits.max_sg; + + /* + * SRQ allocations are based on powers of 2 for Tavor, + * (although they only need to be multiples of 16 bytes). + * + * Therefore, we need to base the max number of sg entries on + * the largest power of 2 descriptor size that is <= to the + * actual max WQE descriptor size, rather than return the + * max_sg value given by the firmware (which is based on WQE + * sizes as multiples of 16, not powers of 2). + * + * If SRQ implementation is changed for Tavor to be based on + * multiples of 16, the calculation below can be deleted and + * the FW max_sg value returned. + */ + return min_t(int, dev->limits.max_sg, + ((1 << (fls(dev->limits.max_desc_sz) - 1)) - + sizeof (struct mthca_next_seg)) / + sizeof (struct mthca_data_seg)); +} + +int mthca_init_srq_table(struct mthca_dev *dev) +{ + int err; + + if (!(dev->mthca_flags & MTHCA_FLAG_SRQ)) + return 0; + + spin_lock_init(&dev->srq_table.lock); + + err = mthca_alloc_init(&dev->srq_table.alloc, + dev->limits.num_srqs, + dev->limits.num_srqs - 1, + dev->limits.reserved_srqs); + if (err) + return err; + + err = mthca_array_init(&dev->srq_table.srq, + dev->limits.num_srqs); + if (err) + mthca_alloc_cleanup(&dev->srq_table.alloc); + + return err; +} + +void mthca_cleanup_srq_table(struct mthca_dev *dev) +{ + if (!(dev->mthca_flags & MTHCA_FLAG_SRQ)) + return; + + mthca_array_cleanup(&dev->srq_table.srq, dev->limits.num_srqs); + mthca_alloc_cleanup(&dev->srq_table.alloc); +} diff --git a/drivers/infiniband/hw/mthca/mthca_uar.c b/drivers/infiniband/hw/mthca/mthca_uar.c new file mode 100644 index 0000000..ca5900c --- /dev/null +++ b/drivers/infiniband/hw/mthca/mthca_uar.c @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2005 Topspin Communications. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include /* PAGE_SHIFT */ + +#include "mthca_dev.h" +#include "mthca_memfree.h" + +int mthca_uar_alloc(struct mthca_dev *dev, struct mthca_uar *uar) +{ + uar->index = mthca_alloc(&dev->uar_table.alloc); + if (uar->index == -1) + return -ENOMEM; + + uar->pfn = (pci_resource_start(dev->pdev, 2) >> PAGE_SHIFT) + uar->index; + + return 0; +} + +void mthca_uar_free(struct mthca_dev *dev, struct mthca_uar *uar) +{ + mthca_free(&dev->uar_table.alloc, uar->index); +} + +int mthca_init_uar_table(struct mthca_dev *dev) +{ + int ret; + + ret = mthca_alloc_init(&dev->uar_table.alloc, + dev->limits.num_uars, + dev->limits.num_uars - 1, + dev->limits.reserved_uars + 1); + if (ret) + return ret; + + ret = mthca_init_db_tab(dev); + if (ret) + mthca_alloc_cleanup(&dev->uar_table.alloc); + + return ret; +} + +void mthca_cleanup_uar_table(struct mthca_dev *dev) +{ + mthca_cleanup_db_tab(dev); + + /* XXX check if any UARs are still allocated? */ + mthca_alloc_cleanup(&dev->uar_table.alloc); +} diff --git a/drivers/infiniband/hw/mthca/mthca_user.h b/drivers/infiniband/hw/mthca/mthca_user.h new file mode 100644 index 0000000..5fe56e8 --- /dev/null +++ b/drivers/infiniband/hw/mthca/mthca_user.h @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef MTHCA_USER_H +#define MTHCA_USER_H + +#include + +/* + * Increment this value if any changes that break userspace ABI + * compatibility are made. + */ +#define MTHCA_UVERBS_ABI_VERSION 1 + +/* + * Make sure that all structs defined in this file remain laid out so + * that they pack the same way on 32-bit and 64-bit architectures (to + * avoid incompatibility between 32-bit userspace and 64-bit kernels). + * In particular do not use pointer types -- pass pointers in __u64 + * instead. + */ + +struct mthca_alloc_ucontext_resp { + __u32 qp_tab_size; + __u32 uarc_size; +}; + +struct mthca_alloc_pd_resp { + __u32 pdn; + __u32 reserved; +}; + +struct mthca_reg_mr { +/* + * Mark the memory region with a DMA attribute that causes + * in-flight DMA to be flushed when the region is written to: + */ +#define MTHCA_MR_DMASYNC 0x1 + __u32 mr_attrs; + __u32 reserved; +}; + +struct mthca_create_cq { + __u32 lkey; + __u32 pdn; + __u64 arm_db_page; + __u64 set_db_page; + __u32 arm_db_index; + __u32 set_db_index; +}; + +struct mthca_create_cq_resp { + __u32 cqn; + __u32 reserved; +}; + +struct mthca_resize_cq { + __u32 lkey; + __u32 reserved; +}; + +struct mthca_create_srq { + __u32 lkey; + __u32 db_index; + __u64 db_page; +}; + +struct mthca_create_srq_resp { + __u32 srqn; + __u32 reserved; +}; + +struct mthca_create_qp { + __u32 lkey; + __u32 reserved; + __u64 sq_db_page; + __u64 rq_db_page; + __u32 sq_db_index; + __u32 rq_db_index; +}; + +#endif /* MTHCA_USER_H */ diff --git a/drivers/infiniband/hw/mthca/mthca_wqe.h b/drivers/infiniband/hw/mthca/mthca_wqe.h new file mode 100644 index 0000000..341a5ae --- /dev/null +++ b/drivers/infiniband/hw/mthca/mthca_wqe.h @@ -0,0 +1,131 @@ +/* + * Copyright (c) 2005 Cisco Systems. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef MTHCA_WQE_H +#define MTHCA_WQE_H + +#include + +enum { + MTHCA_NEXT_DBD = 1 << 7, + MTHCA_NEXT_FENCE = 1 << 6, + MTHCA_NEXT_CQ_UPDATE = 1 << 3, + MTHCA_NEXT_EVENT_GEN = 1 << 2, + MTHCA_NEXT_SOLICIT = 1 << 1, + MTHCA_NEXT_IP_CSUM = 1 << 4, + MTHCA_NEXT_TCP_UDP_CSUM = 1 << 5, + + MTHCA_MLX_VL15 = 1 << 17, + MTHCA_MLX_SLR = 1 << 16 +}; + +enum { + MTHCA_INVAL_LKEY = 0x100, + MTHCA_TAVOR_MAX_WQES_PER_RECV_DB = 256, + MTHCA_ARBEL_MAX_WQES_PER_SEND_DB = 255 +}; + +struct mthca_next_seg { + __be32 nda_op; /* [31:6] next WQE [4:0] next opcode */ + __be32 ee_nds; /* [31:8] next EE [7] DBD [6] F [5:0] next WQE size */ + __be32 flags; /* [3] CQ [2] Event [1] Solicit */ + __be32 imm; /* immediate data */ +}; + +struct mthca_tavor_ud_seg { + u32 reserved1; + __be32 lkey; + __be64 av_addr; + u32 reserved2[4]; + __be32 dqpn; + __be32 qkey; + u32 reserved3[2]; +}; + +struct mthca_arbel_ud_seg { + __be32 av[8]; + __be32 dqpn; + __be32 qkey; + u32 reserved[2]; +}; + +struct mthca_bind_seg { + __be32 flags; /* [31] Atomic [30] rem write [29] rem read */ + u32 reserved; + __be32 new_rkey; + __be32 lkey; + __be64 addr; + __be64 length; +}; + +struct mthca_raddr_seg { + __be64 raddr; + __be32 rkey; + u32 reserved; +}; + +struct mthca_atomic_seg { + __be64 swap_add; + __be64 compare; +}; + +struct mthca_data_seg { + __be32 byte_count; + __be32 lkey; + __be64 addr; +}; + +struct mthca_mlx_seg { + __be32 nda_op; + __be32 nds; + __be32 flags; /* [17] VL15 [16] SLR [14:12] static rate + [11:8] SL [3] C [2] E */ + __be16 rlid; + __be16 vcrc; +}; + +static __always_inline void mthca_set_data_seg(struct mthca_data_seg *dseg, + struct ib_sge *sg) +{ + dseg->byte_count = cpu_to_be32(sg->length); + dseg->lkey = cpu_to_be32(sg->lkey); + dseg->addr = cpu_to_be64(sg->addr); +} + +static __always_inline void mthca_set_data_seg_inval(struct mthca_data_seg *dseg) +{ + dseg->byte_count = 0; + dseg->lkey = cpu_to_be32(MTHCA_INVAL_LKEY); + dseg->addr = 0; +} + +#endif /* MTHCA_WQE_H */ diff --git a/drivers/infiniband/hw/nes/Kconfig b/drivers/infiniband/hw/nes/Kconfig new file mode 100644 index 0000000..846dc97 --- /dev/null +++ b/drivers/infiniband/hw/nes/Kconfig @@ -0,0 +1,16 @@ +config INFINIBAND_NES + tristate "NetEffect RNIC Driver" + depends on PCI && INET && INFINIBAND + select LIBCRC32C + select INET_LRO + ---help--- + This is the RDMA Network Interface Card (RNIC) driver for + NetEffect Ethernet Cluster Server Adapters. + +config INFINIBAND_NES_DEBUG + bool "Verbose debugging output" + depends on INFINIBAND_NES + default n + ---help--- + This option enables debug messages from the NetEffect RNIC + driver. Select this if you are diagnosing a problem. diff --git a/drivers/infiniband/hw/nes/Makefile b/drivers/infiniband/hw/nes/Makefile new file mode 100644 index 0000000..97820c2 --- /dev/null +++ b/drivers/infiniband/hw/nes/Makefile @@ -0,0 +1,3 @@ +obj-$(CONFIG_INFINIBAND_NES) += iw_nes.o + +iw_nes-objs := nes.o nes_hw.o nes_nic.o nes_utils.o nes_verbs.o nes_cm.o nes_mgt.o diff --git a/drivers/infiniband/hw/nes/nes.c b/drivers/infiniband/hw/nes/nes.c new file mode 100644 index 0000000..3b2a6dc --- /dev/null +++ b/drivers/infiniband/hw/nes/nes.c @@ -0,0 +1,1269 @@ +/* + * Copyright (c) 2006 - 2011 Intel Corporation. All rights reserved. + * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "nes.h" + +#include +#include +#include +#include + +MODULE_AUTHOR("NetEffect"); +MODULE_DESCRIPTION("NetEffect RNIC Low-level iWARP Driver"); +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_VERSION(DRV_VERSION); + +int max_mtu = 9000; +int interrupt_mod_interval = 0; + +/* Interoperability */ +int mpa_version = 1; +module_param(mpa_version, int, 0644); +MODULE_PARM_DESC(mpa_version, "MPA version to be used int MPA Req/Resp (0 or 1)"); + +/* Interoperability */ +int disable_mpa_crc = 0; +module_param(disable_mpa_crc, int, 0644); +MODULE_PARM_DESC(disable_mpa_crc, "Disable checking of MPA CRC"); + +unsigned int nes_drv_opt = NES_DRV_OPT_DISABLE_INT_MOD | NES_DRV_OPT_ENABLE_PAU; +module_param(nes_drv_opt, int, 0644); +MODULE_PARM_DESC(nes_drv_opt, "Driver option parameters"); + +unsigned int nes_debug_level = 0; +module_param_named(debug_level, nes_debug_level, uint, 0644); +MODULE_PARM_DESC(debug_level, "Enable debug output level"); + +unsigned int wqm_quanta = 0x10000; +module_param(wqm_quanta, int, 0644); +MODULE_PARM_DESC(wqm_quanta, "WQM quanta"); + +static bool limit_maxrdreqsz; +module_param(limit_maxrdreqsz, bool, 0644); +MODULE_PARM_DESC(limit_maxrdreqsz, "Limit max read request size to 256 Bytes"); + +LIST_HEAD(nes_adapter_list); +static LIST_HEAD(nes_dev_list); + +atomic_t qps_destroyed; + +static unsigned int ee_flsh_adapter; +static unsigned int sysfs_nonidx_addr; +static unsigned int sysfs_idx_addr; + +static struct pci_device_id nes_pci_table[] = { + { PCI_VDEVICE(NETEFFECT, PCI_DEVICE_ID_NETEFFECT_NE020), }, + { PCI_VDEVICE(NETEFFECT, PCI_DEVICE_ID_NETEFFECT_NE020_KR), }, + {0} +}; + +MODULE_DEVICE_TABLE(pci, nes_pci_table); + +/* registered nes netlink callbacks */ +static struct ibnl_client_cbs nes_nl_cb_table[] = { + [RDMA_NL_IWPM_REG_PID] = {.dump = iwpm_register_pid_cb}, + [RDMA_NL_IWPM_ADD_MAPPING] = {.dump = iwpm_add_mapping_cb}, + [RDMA_NL_IWPM_QUERY_MAPPING] = {.dump = iwpm_add_and_query_mapping_cb}, + [RDMA_NL_IWPM_HANDLE_ERR] = {.dump = iwpm_mapping_error_cb}, + [RDMA_NL_IWPM_MAPINFO] = {.dump = iwpm_mapping_info_cb}, + [RDMA_NL_IWPM_MAPINFO_NUM] = {.dump = iwpm_ack_mapping_info_cb} +}; + +static int nes_inetaddr_event(struct notifier_block *, unsigned long, void *); +static int nes_net_event(struct notifier_block *, unsigned long, void *); +static int nes_notifiers_registered; + + +static struct notifier_block nes_inetaddr_notifier = { + .notifier_call = nes_inetaddr_event +}; + +static struct notifier_block nes_net_notifier = { + .notifier_call = nes_net_event +}; + +/** + * nes_inetaddr_event + */ +static int nes_inetaddr_event(struct notifier_block *notifier, + unsigned long event, void *ptr) +{ + struct in_ifaddr *ifa = ptr; + struct net_device *event_netdev = ifa->ifa_dev->dev; + struct nes_device *nesdev; + struct net_device *netdev; + struct net_device *upper_dev; + struct nes_vnic *nesvnic; + unsigned int is_bonded; + + nes_debug(NES_DBG_NETDEV, "nes_inetaddr_event: ip address %pI4, netmask %pI4.\n", + &ifa->ifa_address, &ifa->ifa_mask); + list_for_each_entry(nesdev, &nes_dev_list, list) { + nes_debug(NES_DBG_NETDEV, "Nesdev list entry = 0x%p. (%s)\n", + nesdev, nesdev->netdev[0]->name); + netdev = nesdev->netdev[0]; + nesvnic = netdev_priv(netdev); + upper_dev = netdev_master_upper_dev_get(netdev); + is_bonded = netif_is_bond_slave(netdev) && + (upper_dev == event_netdev); + if ((netdev == event_netdev) || is_bonded) { + if (nesvnic->rdma_enabled == 0) { + nes_debug(NES_DBG_NETDEV, "Returning without processing event for %s since" + " RDMA is not enabled.\n", + netdev->name); + return NOTIFY_OK; + } + /* we have ifa->ifa_address/mask here if we need it */ + switch (event) { + case NETDEV_DOWN: + nes_debug(NES_DBG_NETDEV, "event:DOWN\n"); + nes_write_indexed(nesdev, + NES_IDX_DST_IP_ADDR+(0x10*PCI_FUNC(nesdev->pcidev->devfn)), 0); + + nes_manage_arp_cache(netdev, netdev->dev_addr, + ntohl(nesvnic->local_ipaddr), NES_ARP_DELETE); + nesvnic->local_ipaddr = 0; + if (is_bonded) + continue; + else + return NOTIFY_OK; + break; + case NETDEV_UP: + nes_debug(NES_DBG_NETDEV, "event:UP\n"); + + if (nesvnic->local_ipaddr != 0) { + nes_debug(NES_DBG_NETDEV, "Interface already has local_ipaddr\n"); + return NOTIFY_OK; + } + /* fall through */ + case NETDEV_CHANGEADDR: + /* Add the address to the IP table */ + if (upper_dev) + nesvnic->local_ipaddr = + ((struct in_device *)upper_dev->ip_ptr)->ifa_list->ifa_address; + else + nesvnic->local_ipaddr = ifa->ifa_address; + + nes_write_indexed(nesdev, + NES_IDX_DST_IP_ADDR+(0x10*PCI_FUNC(nesdev->pcidev->devfn)), + ntohl(nesvnic->local_ipaddr)); + nes_manage_arp_cache(netdev, netdev->dev_addr, + ntohl(nesvnic->local_ipaddr), NES_ARP_ADD); + if (is_bonded) + continue; + else + return NOTIFY_OK; + break; + default: + break; + } + } + } + + return NOTIFY_DONE; +} + + +/** + * nes_net_event + */ +static int nes_net_event(struct notifier_block *notifier, + unsigned long event, void *ptr) +{ + struct neighbour *neigh = ptr; + struct nes_device *nesdev; + struct net_device *netdev; + struct nes_vnic *nesvnic; + + switch (event) { + case NETEVENT_NEIGH_UPDATE: + list_for_each_entry(nesdev, &nes_dev_list, list) { + /* nes_debug(NES_DBG_NETDEV, "Nesdev list entry = 0x%p.\n", nesdev); */ + netdev = nesdev->netdev[0]; + nesvnic = netdev_priv(netdev); + if (netdev == neigh->dev) { + if (nesvnic->rdma_enabled == 0) { + nes_debug(NES_DBG_NETDEV, "Skipping device %s since no RDMA\n", + netdev->name); + } else { + if (neigh->nud_state & NUD_VALID) { + nes_manage_arp_cache(neigh->dev, neigh->ha, + ntohl(*(__be32 *)neigh->primary_key), NES_ARP_ADD); + } else { + nes_manage_arp_cache(neigh->dev, neigh->ha, + ntohl(*(__be32 *)neigh->primary_key), NES_ARP_DELETE); + } + } + return NOTIFY_OK; + } + } + break; + default: + nes_debug(NES_DBG_NETDEV, "NETEVENT_ %lu undefined\n", event); + break; + } + + return NOTIFY_DONE; +} + + +/** + * nes_add_ref + */ +void nes_add_ref(struct ib_qp *ibqp) +{ + struct nes_qp *nesqp; + + nesqp = to_nesqp(ibqp); + nes_debug(NES_DBG_QP, "Bumping refcount for QP%u. Pre-inc value = %u\n", + ibqp->qp_num, atomic_read(&nesqp->refcount)); + atomic_inc(&nesqp->refcount); +} + +static void nes_cqp_rem_ref_callback(struct nes_device *nesdev, struct nes_cqp_request *cqp_request) +{ + unsigned long flags; + struct nes_qp *nesqp = cqp_request->cqp_callback_pointer; + struct nes_adapter *nesadapter = nesdev->nesadapter; + + atomic_inc(&qps_destroyed); + + /* Free the control structures */ + + if (nesqp->pbl_vbase) { + pci_free_consistent(nesdev->pcidev, nesqp->qp_mem_size, + nesqp->hwqp.q2_vbase, nesqp->hwqp.q2_pbase); + spin_lock_irqsave(&nesadapter->pbl_lock, flags); + nesadapter->free_256pbl++; + spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); + pci_free_consistent(nesdev->pcidev, 256, nesqp->pbl_vbase, nesqp->pbl_pbase); + nesqp->pbl_vbase = NULL; + + } else { + pci_free_consistent(nesdev->pcidev, nesqp->qp_mem_size, + nesqp->hwqp.sq_vbase, nesqp->hwqp.sq_pbase); + } + nes_free_resource(nesadapter, nesadapter->allocated_qps, nesqp->hwqp.qp_id); + + nesadapter->qp_table[nesqp->hwqp.qp_id-NES_FIRST_QPN] = NULL; + kfree(nesqp->allocated_buffer); + +} + +/** + * nes_rem_ref + */ +void nes_rem_ref(struct ib_qp *ibqp) +{ + u64 u64temp; + struct nes_qp *nesqp; + struct nes_vnic *nesvnic = to_nesvnic(ibqp->device); + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_hw_cqp_wqe *cqp_wqe; + struct nes_cqp_request *cqp_request; + u32 opcode; + + nesqp = to_nesqp(ibqp); + + if (atomic_read(&nesqp->refcount) == 0) { + printk(KERN_INFO PFX "%s: Reference count already 0 for QP%d, last aeq = 0x%04X.\n", + __func__, ibqp->qp_num, nesqp->last_aeq); + BUG(); + } + + if (atomic_dec_and_test(&nesqp->refcount)) { + if (nesqp->pau_mode) + nes_destroy_pau_qp(nesdev, nesqp); + + /* Destroy the QP */ + cqp_request = nes_get_cqp_request(nesdev); + if (cqp_request == NULL) { + nes_debug(NES_DBG_QP, "Failed to get a cqp_request.\n"); + return; + } + cqp_request->waiting = 0; + cqp_request->callback = 1; + cqp_request->cqp_callback = nes_cqp_rem_ref_callback; + cqp_request->cqp_callback_pointer = nesqp; + cqp_wqe = &cqp_request->cqp_wqe; + + nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + opcode = NES_CQP_DESTROY_QP | NES_CQP_QP_TYPE_IWARP; + + if (nesqp->hte_added) { + opcode |= NES_CQP_QP_DEL_HTE; + nesqp->hte_added = 0; + } + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, opcode); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX, nesqp->hwqp.qp_id); + u64temp = (u64)nesqp->nesqp_context_pbase; + set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_QP_WQE_CONTEXT_LOW_IDX, u64temp); + nes_post_cqp_request(nesdev, cqp_request); + } +} + + +/** + * nes_get_qp + */ +struct ib_qp *nes_get_qp(struct ib_device *device, int qpn) +{ + struct nes_vnic *nesvnic = to_nesvnic(device); + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_adapter *nesadapter = nesdev->nesadapter; + + if ((qpn < NES_FIRST_QPN) || (qpn >= (NES_FIRST_QPN + nesadapter->max_qp))) + return NULL; + + return &nesadapter->qp_table[qpn - NES_FIRST_QPN]->ibqp; +} + + +/** + * nes_print_macaddr + */ +static void nes_print_macaddr(struct net_device *netdev) +{ + nes_debug(NES_DBG_INIT, "%s: %pM, IRQ %u\n", + netdev->name, netdev->dev_addr, netdev->irq); +} + +/** + * nes_interrupt - handle interrupts + */ +static irqreturn_t nes_interrupt(int irq, void *dev_id) +{ + struct nes_device *nesdev = (struct nes_device *)dev_id; + int handled = 0; + u32 int_mask; + u32 int_req; + u32 int_stat; + u32 intf_int_stat; + u32 timer_stat; + + if (nesdev->msi_enabled) { + /* No need to read the interrupt pending register if msi is enabled */ + handled = 1; + } else { + if (unlikely(nesdev->nesadapter->hw_rev == NE020_REV)) { + /* Master interrupt enable provides synchronization for kicking off bottom half + when interrupt sharing is going on */ + int_mask = nes_read32(nesdev->regs + NES_INT_MASK); + if (int_mask & 0x80000000) { + /* Check interrupt status to see if this might be ours */ + int_stat = nes_read32(nesdev->regs + NES_INT_STAT); + int_req = nesdev->int_req; + if (int_stat&int_req) { + /* if interesting CEQ or AEQ is pending, claim the interrupt */ + if ((int_stat&int_req) & (~(NES_INT_TIMER|NES_INT_INTF))) { + handled = 1; + } else { + if (((int_stat & int_req) & NES_INT_TIMER) == NES_INT_TIMER) { + /* Timer might be running but might be for another function */ + timer_stat = nes_read32(nesdev->regs + NES_TIMER_STAT); + if ((timer_stat & nesdev->timer_int_req) != 0) { + handled = 1; + } + } + if ((((int_stat & int_req) & NES_INT_INTF) == NES_INT_INTF) && + (handled == 0)) { + intf_int_stat = nes_read32(nesdev->regs+NES_INTF_INT_STAT); + if ((intf_int_stat & nesdev->intf_int_req) != 0) { + handled = 1; + } + } + } + if (handled) { + nes_write32(nesdev->regs+NES_INT_MASK, int_mask & (~0x80000000)); + int_mask = nes_read32(nesdev->regs+NES_INT_MASK); + /* Save off the status to save an additional read */ + nesdev->int_stat = int_stat; + nesdev->napi_isr_ran = 1; + } + } + } + } else { + handled = nes_read32(nesdev->regs+NES_INT_PENDING); + } + } + + if (handled) { + + if (nes_napi_isr(nesdev) == 0) { + tasklet_schedule(&nesdev->dpc_tasklet); + + } + return IRQ_HANDLED; + } else { + return IRQ_NONE; + } +} + + +/** + * nes_probe - Device initialization + */ +static int nes_probe(struct pci_dev *pcidev, const struct pci_device_id *ent) +{ + struct net_device *netdev = NULL; + struct nes_device *nesdev = NULL; + int ret = 0; + void __iomem *mmio_regs = NULL; + u8 hw_rev; + + assert(pcidev != NULL); + assert(ent != NULL); + + printk(KERN_INFO PFX "NetEffect RNIC driver v%s loading. (%s)\n", + DRV_VERSION, pci_name(pcidev)); + + ret = pci_enable_device(pcidev); + if (ret) { + printk(KERN_ERR PFX "Unable to enable PCI device. (%s)\n", pci_name(pcidev)); + goto bail0; + } + + nes_debug(NES_DBG_INIT, "BAR0 (@0x%08lX) size = 0x%lX bytes\n", + (long unsigned int)pci_resource_start(pcidev, BAR_0), + (long unsigned int)pci_resource_len(pcidev, BAR_0)); + nes_debug(NES_DBG_INIT, "BAR1 (@0x%08lX) size = 0x%lX bytes\n", + (long unsigned int)pci_resource_start(pcidev, BAR_1), + (long unsigned int)pci_resource_len(pcidev, BAR_1)); + + /* Make sure PCI base addr are MMIO */ + if (!(pci_resource_flags(pcidev, BAR_0) & IORESOURCE_MEM) || + !(pci_resource_flags(pcidev, BAR_1) & IORESOURCE_MEM)) { + printk(KERN_ERR PFX "PCI regions not an MMIO resource\n"); + ret = -ENODEV; + goto bail1; + } + + /* Reserve PCI I/O and memory resources */ + ret = pci_request_regions(pcidev, DRV_NAME); + if (ret) { + printk(KERN_ERR PFX "Unable to request regions. (%s)\n", pci_name(pcidev)); + goto bail1; + } + + if ((sizeof(dma_addr_t) > 4)) { + ret = pci_set_dma_mask(pcidev, DMA_BIT_MASK(64)); + if (ret < 0) { + printk(KERN_ERR PFX "64b DMA mask configuration failed\n"); + goto bail2; + } + ret = pci_set_consistent_dma_mask(pcidev, DMA_BIT_MASK(64)); + if (ret) { + printk(KERN_ERR PFX "64b DMA consistent mask configuration failed\n"); + goto bail2; + } + } else { + ret = pci_set_dma_mask(pcidev, DMA_BIT_MASK(32)); + if (ret < 0) { + printk(KERN_ERR PFX "32b DMA mask configuration failed\n"); + goto bail2; + } + ret = pci_set_consistent_dma_mask(pcidev, DMA_BIT_MASK(32)); + if (ret) { + printk(KERN_ERR PFX "32b DMA consistent mask configuration failed\n"); + goto bail2; + } + } + + pci_set_master(pcidev); + + /* Allocate hardware structure */ + nesdev = kzalloc(sizeof(struct nes_device), GFP_KERNEL); + if (!nesdev) { + printk(KERN_ERR PFX "%s: Unable to alloc hardware struct\n", pci_name(pcidev)); + ret = -ENOMEM; + goto bail2; + } + + nes_debug(NES_DBG_INIT, "Allocated nes device at %p\n", nesdev); + nesdev->pcidev = pcidev; + pci_set_drvdata(pcidev, nesdev); + + pci_read_config_byte(pcidev, 0x0008, &hw_rev); + nes_debug(NES_DBG_INIT, "hw_rev=%u\n", hw_rev); + + spin_lock_init(&nesdev->indexed_regs_lock); + + /* Remap the PCI registers in adapter BAR0 to kernel VA space */ + mmio_regs = ioremap_nocache(pci_resource_start(pcidev, BAR_0), + pci_resource_len(pcidev, BAR_0)); + if (mmio_regs == NULL) { + printk(KERN_ERR PFX "Unable to remap BAR0\n"); + ret = -EIO; + goto bail3; + } + nesdev->regs = mmio_regs; + nesdev->index_reg = 0x50 + (PCI_FUNC(pcidev->devfn)*8) + mmio_regs; + + /* Ensure interrupts are disabled */ + nes_write32(nesdev->regs+NES_INT_MASK, 0x7fffffff); + + if (nes_drv_opt & NES_DRV_OPT_ENABLE_MSI) { + if (!pci_enable_msi(nesdev->pcidev)) { + nesdev->msi_enabled = 1; + nes_debug(NES_DBG_INIT, "MSI is enabled for device %s\n", + pci_name(pcidev)); + } else { + nes_debug(NES_DBG_INIT, "MSI is disabled by linux for device %s\n", + pci_name(pcidev)); + } + } else { + nes_debug(NES_DBG_INIT, "MSI not requested due to driver options for device %s\n", + pci_name(pcidev)); + } + + nesdev->csr_start = pci_resource_start(nesdev->pcidev, BAR_0); + nesdev->doorbell_region = pci_resource_start(nesdev->pcidev, BAR_1); + + /* Init the adapter */ + nesdev->nesadapter = nes_init_adapter(nesdev, hw_rev); + if (!nesdev->nesadapter) { + printk(KERN_ERR PFX "Unable to initialize adapter.\n"); + ret = -ENOMEM; + goto bail5; + } + nesdev->nesadapter->et_rx_coalesce_usecs_irq = interrupt_mod_interval; + nesdev->nesadapter->wqm_quanta = wqm_quanta; + + /* nesdev->base_doorbell_index = + nesdev->nesadapter->pd_config_base[PCI_FUNC(nesdev->pcidev->devfn)]; */ + nesdev->base_doorbell_index = 1; + nesdev->doorbell_start = nesdev->nesadapter->doorbell_start; + if (nesdev->nesadapter->phy_type[0] == NES_PHY_TYPE_PUMA_1G) { + switch (PCI_FUNC(nesdev->pcidev->devfn) % + nesdev->nesadapter->port_count) { + case 1: + nesdev->mac_index = 2; + break; + case 2: + nesdev->mac_index = 1; + break; + case 3: + nesdev->mac_index = 3; + break; + case 0: + default: + nesdev->mac_index = 0; + } + } else { + nesdev->mac_index = PCI_FUNC(nesdev->pcidev->devfn) % + nesdev->nesadapter->port_count; + } + + if ((limit_maxrdreqsz || + ((nesdev->nesadapter->phy_type[0] == NES_PHY_TYPE_GLADIUS) && + (hw_rev == NE020_REV1))) && + (pcie_get_readrq(pcidev) > 256)) { + if (pcie_set_readrq(pcidev, 256)) + printk(KERN_ERR PFX "Unable to set max read request" + " to 256 bytes\n"); + else + nes_debug(NES_DBG_INIT, "Max read request size set" + " to 256 bytes\n"); + } + + tasklet_init(&nesdev->dpc_tasklet, nes_dpc, (unsigned long)nesdev); + + /* bring up the Control QP */ + if (nes_init_cqp(nesdev)) { + ret = -ENODEV; + goto bail6; + } + + /* Arm the CCQ */ + nes_write32(nesdev->regs+NES_CQE_ALLOC, NES_CQE_ALLOC_NOTIFY_NEXT | + PCI_FUNC(nesdev->pcidev->devfn)); + nes_read32(nesdev->regs+NES_CQE_ALLOC); + + /* Enable the interrupts */ + nesdev->int_req = (0x101 << PCI_FUNC(nesdev->pcidev->devfn)) | + (1 << (PCI_FUNC(nesdev->pcidev->devfn)+16)); + if (PCI_FUNC(nesdev->pcidev->devfn) < 4) { + nesdev->int_req |= (1 << (PCI_FUNC(nesdev->mac_index)+24)); + } + + /* TODO: This really should be the first driver to load, not function 0 */ + if (PCI_FUNC(nesdev->pcidev->devfn) == 0) { + /* pick up PCI and critical errors if the first driver to load */ + nesdev->intf_int_req = NES_INTF_INT_PCIERR | NES_INTF_INT_CRITERR; + nesdev->int_req |= NES_INT_INTF; + } else { + nesdev->intf_int_req = 0; + } + nesdev->intf_int_req |= (1 << (PCI_FUNC(nesdev->pcidev->devfn)+16)); + nes_write_indexed(nesdev, NES_IDX_DEBUG_ERROR_MASKS0, 0); + nes_write_indexed(nesdev, NES_IDX_DEBUG_ERROR_MASKS1, 0); + nes_write_indexed(nesdev, NES_IDX_DEBUG_ERROR_MASKS2, 0x00001265); + nes_write_indexed(nesdev, NES_IDX_DEBUG_ERROR_MASKS4, 0x18021804); + + nes_write_indexed(nesdev, NES_IDX_DEBUG_ERROR_MASKS3, 0x17801790); + + /* deal with both periodic and one_shot */ + nesdev->timer_int_req = 0x101 << PCI_FUNC(nesdev->pcidev->devfn); + nesdev->nesadapter->timer_int_req |= nesdev->timer_int_req; + nes_debug(NES_DBG_INIT, "setting int_req for function %u, nesdev = 0x%04X, adapter = 0x%04X\n", + PCI_FUNC(nesdev->pcidev->devfn), + nesdev->timer_int_req, nesdev->nesadapter->timer_int_req); + + nes_write32(nesdev->regs+NES_INTF_INT_MASK, ~(nesdev->intf_int_req)); + + list_add_tail(&nesdev->list, &nes_dev_list); + + /* Request an interrupt line for the driver */ + ret = request_irq(pcidev->irq, nes_interrupt, IRQF_SHARED, DRV_NAME, nesdev); + if (ret) { + printk(KERN_ERR PFX "%s: requested IRQ %u is busy\n", + pci_name(pcidev), pcidev->irq); + goto bail65; + } + + nes_write32(nesdev->regs+NES_INT_MASK, ~nesdev->int_req); + + if (nes_notifiers_registered == 0) { + register_inetaddr_notifier(&nes_inetaddr_notifier); + register_netevent_notifier(&nes_net_notifier); + } + nes_notifiers_registered++; + + if (ibnl_add_client(RDMA_NL_NES, RDMA_NL_IWPM_NUM_OPS, nes_nl_cb_table)) + printk(KERN_ERR PFX "%s[%u]: Failed to add netlink callback\n", + __func__, __LINE__); + + ret = iwpm_init(RDMA_NL_NES); + if (ret) { + printk(KERN_ERR PFX "%s: port mapper initialization failed\n", + pci_name(pcidev)); + goto bail7; + } + + INIT_DELAYED_WORK(&nesdev->work, nes_recheck_link_status); + + /* Initialize network devices */ + netdev = nes_netdev_init(nesdev, mmio_regs); + if (netdev == NULL) { + ret = -ENOMEM; + goto bail7; + } + + /* Register network device */ + ret = register_netdev(netdev); + if (ret) { + printk(KERN_ERR PFX "Unable to register netdev, ret = %d\n", ret); + nes_netdev_destroy(netdev); + goto bail7; + } + + nes_print_macaddr(netdev); + + nesdev->netdev_count++; + nesdev->nesadapter->netdev_count++; + + printk(KERN_INFO PFX "%s: NetEffect RNIC driver successfully loaded.\n", + pci_name(pcidev)); + return 0; + + bail7: + printk(KERN_ERR PFX "bail7\n"); + while (nesdev->netdev_count > 0) { + nesdev->netdev_count--; + nesdev->nesadapter->netdev_count--; + + unregister_netdev(nesdev->netdev[nesdev->netdev_count]); + nes_netdev_destroy(nesdev->netdev[nesdev->netdev_count]); + } + + nes_debug(NES_DBG_INIT, "netdev_count=%d, nesadapter->netdev_count=%d\n", + nesdev->netdev_count, nesdev->nesadapter->netdev_count); + ibnl_remove_client(RDMA_NL_NES); + + nes_notifiers_registered--; + if (nes_notifiers_registered == 0) { + unregister_netevent_notifier(&nes_net_notifier); + unregister_inetaddr_notifier(&nes_inetaddr_notifier); + } + + list_del(&nesdev->list); + nes_destroy_cqp(nesdev); + + bail65: + printk(KERN_ERR PFX "bail65\n"); + free_irq(pcidev->irq, nesdev); + if (nesdev->msi_enabled) { + pci_disable_msi(pcidev); + } + bail6: + printk(KERN_ERR PFX "bail6\n"); + tasklet_kill(&nesdev->dpc_tasklet); + /* Deallocate the Adapter Structure */ + nes_destroy_adapter(nesdev->nesadapter); + + bail5: + printk(KERN_ERR PFX "bail5\n"); + iounmap(nesdev->regs); + + bail3: + printk(KERN_ERR PFX "bail3\n"); + kfree(nesdev); + + bail2: + pci_release_regions(pcidev); + + bail1: + pci_disable_device(pcidev); + + bail0: + return ret; +} + + +/** + * nes_remove - unload from kernel + */ +static void nes_remove(struct pci_dev *pcidev) +{ + struct nes_device *nesdev = pci_get_drvdata(pcidev); + struct net_device *netdev; + int netdev_index = 0; + unsigned long flags; + + if (nesdev->netdev_count) { + netdev = nesdev->netdev[netdev_index]; + if (netdev) { + netif_stop_queue(netdev); + unregister_netdev(netdev); + nes_netdev_destroy(netdev); + + nesdev->netdev[netdev_index] = NULL; + nesdev->netdev_count--; + nesdev->nesadapter->netdev_count--; + } + } + ibnl_remove_client(RDMA_NL_NES); + iwpm_exit(RDMA_NL_NES); + + nes_notifiers_registered--; + if (nes_notifiers_registered == 0) { + unregister_netevent_notifier(&nes_net_notifier); + unregister_inetaddr_notifier(&nes_inetaddr_notifier); + } + + list_del(&nesdev->list); + nes_destroy_cqp(nesdev); + + free_irq(pcidev->irq, nesdev); + tasklet_kill(&nesdev->dpc_tasklet); + + spin_lock_irqsave(&nesdev->nesadapter->phy_lock, flags); + if (nesdev->link_recheck) { + spin_unlock_irqrestore(&nesdev->nesadapter->phy_lock, flags); + cancel_delayed_work_sync(&nesdev->work); + } else { + spin_unlock_irqrestore(&nesdev->nesadapter->phy_lock, flags); + } + + /* Deallocate the Adapter Structure */ + nes_destroy_adapter(nesdev->nesadapter); + + if (nesdev->msi_enabled) { + pci_disable_msi(pcidev); + } + + iounmap(nesdev->regs); + kfree(nesdev); + + /* nes_debug(NES_DBG_SHUTDOWN, "calling pci_release_regions.\n"); */ + pci_release_regions(pcidev); + pci_disable_device(pcidev); + pci_set_drvdata(pcidev, NULL); +} + + +static struct pci_driver nes_pci_driver = { + .name = DRV_NAME, + .id_table = nes_pci_table, + .probe = nes_probe, + .remove = nes_remove, +}; + +static ssize_t nes_show_adapter(struct device_driver *ddp, char *buf) +{ + unsigned int devfn = 0xffffffff; + unsigned char bus_number = 0xff; + unsigned int i = 0; + struct nes_device *nesdev; + + list_for_each_entry(nesdev, &nes_dev_list, list) { + if (i == ee_flsh_adapter) { + devfn = nesdev->pcidev->devfn; + bus_number = nesdev->pcidev->bus->number; + break; + } + i++; + } + + return snprintf(buf, PAGE_SIZE, "%x:%x\n", bus_number, devfn); +} + +static ssize_t nes_store_adapter(struct device_driver *ddp, + const char *buf, size_t count) +{ + char *p = (char *)buf; + + ee_flsh_adapter = simple_strtoul(p, &p, 10); + return strnlen(buf, count); +} + +static ssize_t nes_show_ee_cmd(struct device_driver *ddp, char *buf) +{ + u32 eeprom_cmd = 0xdead; + u32 i = 0; + struct nes_device *nesdev; + + list_for_each_entry(nesdev, &nes_dev_list, list) { + if (i == ee_flsh_adapter) { + eeprom_cmd = nes_read32(nesdev->regs + NES_EEPROM_COMMAND); + break; + } + i++; + } + return snprintf(buf, PAGE_SIZE, "0x%x\n", eeprom_cmd); +} + +static ssize_t nes_store_ee_cmd(struct device_driver *ddp, + const char *buf, size_t count) +{ + char *p = (char *)buf; + u32 val; + u32 i = 0; + struct nes_device *nesdev; + + if (p[1] == 'x' || p[1] == 'X' || p[0] == 'x' || p[0] == 'X') { + val = simple_strtoul(p, &p, 16); + list_for_each_entry(nesdev, &nes_dev_list, list) { + if (i == ee_flsh_adapter) { + nes_write32(nesdev->regs + NES_EEPROM_COMMAND, val); + break; + } + i++; + } + } + return strnlen(buf, count); +} + +static ssize_t nes_show_ee_data(struct device_driver *ddp, char *buf) +{ + u32 eeprom_data = 0xdead; + u32 i = 0; + struct nes_device *nesdev; + + list_for_each_entry(nesdev, &nes_dev_list, list) { + if (i == ee_flsh_adapter) { + eeprom_data = nes_read32(nesdev->regs + NES_EEPROM_DATA); + break; + } + i++; + } + + return snprintf(buf, PAGE_SIZE, "0x%x\n", eeprom_data); +} + +static ssize_t nes_store_ee_data(struct device_driver *ddp, + const char *buf, size_t count) +{ + char *p = (char *)buf; + u32 val; + u32 i = 0; + struct nes_device *nesdev; + + if (p[1] == 'x' || p[1] == 'X' || p[0] == 'x' || p[0] == 'X') { + val = simple_strtoul(p, &p, 16); + list_for_each_entry(nesdev, &nes_dev_list, list) { + if (i == ee_flsh_adapter) { + nes_write32(nesdev->regs + NES_EEPROM_DATA, val); + break; + } + i++; + } + } + return strnlen(buf, count); +} + +static ssize_t nes_show_flash_cmd(struct device_driver *ddp, char *buf) +{ + u32 flash_cmd = 0xdead; + u32 i = 0; + struct nes_device *nesdev; + + list_for_each_entry(nesdev, &nes_dev_list, list) { + if (i == ee_flsh_adapter) { + flash_cmd = nes_read32(nesdev->regs + NES_FLASH_COMMAND); + break; + } + i++; + } + + return snprintf(buf, PAGE_SIZE, "0x%x\n", flash_cmd); +} + +static ssize_t nes_store_flash_cmd(struct device_driver *ddp, + const char *buf, size_t count) +{ + char *p = (char *)buf; + u32 val; + u32 i = 0; + struct nes_device *nesdev; + + if (p[1] == 'x' || p[1] == 'X' || p[0] == 'x' || p[0] == 'X') { + val = simple_strtoul(p, &p, 16); + list_for_each_entry(nesdev, &nes_dev_list, list) { + if (i == ee_flsh_adapter) { + nes_write32(nesdev->regs + NES_FLASH_COMMAND, val); + break; + } + i++; + } + } + return strnlen(buf, count); +} + +static ssize_t nes_show_flash_data(struct device_driver *ddp, char *buf) +{ + u32 flash_data = 0xdead; + u32 i = 0; + struct nes_device *nesdev; + + list_for_each_entry(nesdev, &nes_dev_list, list) { + if (i == ee_flsh_adapter) { + flash_data = nes_read32(nesdev->regs + NES_FLASH_DATA); + break; + } + i++; + } + + return snprintf(buf, PAGE_SIZE, "0x%x\n", flash_data); +} + +static ssize_t nes_store_flash_data(struct device_driver *ddp, + const char *buf, size_t count) +{ + char *p = (char *)buf; + u32 val; + u32 i = 0; + struct nes_device *nesdev; + + if (p[1] == 'x' || p[1] == 'X' || p[0] == 'x' || p[0] == 'X') { + val = simple_strtoul(p, &p, 16); + list_for_each_entry(nesdev, &nes_dev_list, list) { + if (i == ee_flsh_adapter) { + nes_write32(nesdev->regs + NES_FLASH_DATA, val); + break; + } + i++; + } + } + return strnlen(buf, count); +} + +static ssize_t nes_show_nonidx_addr(struct device_driver *ddp, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "0x%x\n", sysfs_nonidx_addr); +} + +static ssize_t nes_store_nonidx_addr(struct device_driver *ddp, + const char *buf, size_t count) +{ + char *p = (char *)buf; + + if (p[1] == 'x' || p[1] == 'X' || p[0] == 'x' || p[0] == 'X') + sysfs_nonidx_addr = simple_strtoul(p, &p, 16); + + return strnlen(buf, count); +} + +static ssize_t nes_show_nonidx_data(struct device_driver *ddp, char *buf) +{ + u32 nonidx_data = 0xdead; + u32 i = 0; + struct nes_device *nesdev; + + list_for_each_entry(nesdev, &nes_dev_list, list) { + if (i == ee_flsh_adapter) { + nonidx_data = nes_read32(nesdev->regs + sysfs_nonidx_addr); + break; + } + i++; + } + + return snprintf(buf, PAGE_SIZE, "0x%x\n", nonidx_data); +} + +static ssize_t nes_store_nonidx_data(struct device_driver *ddp, + const char *buf, size_t count) +{ + char *p = (char *)buf; + u32 val; + u32 i = 0; + struct nes_device *nesdev; + + if (p[1] == 'x' || p[1] == 'X' || p[0] == 'x' || p[0] == 'X') { + val = simple_strtoul(p, &p, 16); + list_for_each_entry(nesdev, &nes_dev_list, list) { + if (i == ee_flsh_adapter) { + nes_write32(nesdev->regs + sysfs_nonidx_addr, val); + break; + } + i++; + } + } + return strnlen(buf, count); +} + +static ssize_t nes_show_idx_addr(struct device_driver *ddp, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "0x%x\n", sysfs_idx_addr); +} + +static ssize_t nes_store_idx_addr(struct device_driver *ddp, + const char *buf, size_t count) +{ + char *p = (char *)buf; + + if (p[1] == 'x' || p[1] == 'X' || p[0] == 'x' || p[0] == 'X') + sysfs_idx_addr = simple_strtoul(p, &p, 16); + + return strnlen(buf, count); +} + +static ssize_t nes_show_idx_data(struct device_driver *ddp, char *buf) +{ + u32 idx_data = 0xdead; + u32 i = 0; + struct nes_device *nesdev; + + list_for_each_entry(nesdev, &nes_dev_list, list) { + if (i == ee_flsh_adapter) { + idx_data = nes_read_indexed(nesdev, sysfs_idx_addr); + break; + } + i++; + } + + return snprintf(buf, PAGE_SIZE, "0x%x\n", idx_data); +} + +static ssize_t nes_store_idx_data(struct device_driver *ddp, + const char *buf, size_t count) +{ + char *p = (char *)buf; + u32 val; + u32 i = 0; + struct nes_device *nesdev; + + if (p[1] == 'x' || p[1] == 'X' || p[0] == 'x' || p[0] == 'X') { + val = simple_strtoul(p, &p, 16); + list_for_each_entry(nesdev, &nes_dev_list, list) { + if (i == ee_flsh_adapter) { + nes_write_indexed(nesdev, sysfs_idx_addr, val); + break; + } + i++; + } + } + return strnlen(buf, count); +} + + +/** + * nes_show_wqm_quanta + */ +static ssize_t nes_show_wqm_quanta(struct device_driver *ddp, char *buf) +{ + u32 wqm_quanta_value = 0xdead; + u32 i = 0; + struct nes_device *nesdev; + + list_for_each_entry(nesdev, &nes_dev_list, list) { + if (i == ee_flsh_adapter) { + wqm_quanta_value = nesdev->nesadapter->wqm_quanta; + break; + } + i++; + } + + return snprintf(buf, PAGE_SIZE, "0x%X\n", wqm_quanta_value); +} + + +/** + * nes_store_wqm_quanta + */ +static ssize_t nes_store_wqm_quanta(struct device_driver *ddp, + const char *buf, size_t count) +{ + unsigned long wqm_quanta_value; + u32 wqm_config1; + u32 i = 0; + struct nes_device *nesdev; + + if (kstrtoul(buf, 0, &wqm_quanta_value) < 0) + return -EINVAL; + + list_for_each_entry(nesdev, &nes_dev_list, list) { + if (i == ee_flsh_adapter) { + nesdev->nesadapter->wqm_quanta = wqm_quanta_value; + wqm_config1 = nes_read_indexed(nesdev, + NES_IDX_WQM_CONFIG1); + nes_write_indexed(nesdev, NES_IDX_WQM_CONFIG1, + ((wqm_quanta_value << 1) | + (wqm_config1 & 0x00000001))); + break; + } + i++; + } + return strnlen(buf, count); +} + +static DRIVER_ATTR(adapter, S_IRUSR | S_IWUSR, + nes_show_adapter, nes_store_adapter); +static DRIVER_ATTR(eeprom_cmd, S_IRUSR | S_IWUSR, + nes_show_ee_cmd, nes_store_ee_cmd); +static DRIVER_ATTR(eeprom_data, S_IRUSR | S_IWUSR, + nes_show_ee_data, nes_store_ee_data); +static DRIVER_ATTR(flash_cmd, S_IRUSR | S_IWUSR, + nes_show_flash_cmd, nes_store_flash_cmd); +static DRIVER_ATTR(flash_data, S_IRUSR | S_IWUSR, + nes_show_flash_data, nes_store_flash_data); +static DRIVER_ATTR(nonidx_addr, S_IRUSR | S_IWUSR, + nes_show_nonidx_addr, nes_store_nonidx_addr); +static DRIVER_ATTR(nonidx_data, S_IRUSR | S_IWUSR, + nes_show_nonidx_data, nes_store_nonidx_data); +static DRIVER_ATTR(idx_addr, S_IRUSR | S_IWUSR, + nes_show_idx_addr, nes_store_idx_addr); +static DRIVER_ATTR(idx_data, S_IRUSR | S_IWUSR, + nes_show_idx_data, nes_store_idx_data); +static DRIVER_ATTR(wqm_quanta, S_IRUSR | S_IWUSR, + nes_show_wqm_quanta, nes_store_wqm_quanta); + +static int nes_create_driver_sysfs(struct pci_driver *drv) +{ + int error; + error = driver_create_file(&drv->driver, &driver_attr_adapter); + error |= driver_create_file(&drv->driver, &driver_attr_eeprom_cmd); + error |= driver_create_file(&drv->driver, &driver_attr_eeprom_data); + error |= driver_create_file(&drv->driver, &driver_attr_flash_cmd); + error |= driver_create_file(&drv->driver, &driver_attr_flash_data); + error |= driver_create_file(&drv->driver, &driver_attr_nonidx_addr); + error |= driver_create_file(&drv->driver, &driver_attr_nonidx_data); + error |= driver_create_file(&drv->driver, &driver_attr_idx_addr); + error |= driver_create_file(&drv->driver, &driver_attr_idx_data); + error |= driver_create_file(&drv->driver, &driver_attr_wqm_quanta); + return error; +} + +static void nes_remove_driver_sysfs(struct pci_driver *drv) +{ + driver_remove_file(&drv->driver, &driver_attr_adapter); + driver_remove_file(&drv->driver, &driver_attr_eeprom_cmd); + driver_remove_file(&drv->driver, &driver_attr_eeprom_data); + driver_remove_file(&drv->driver, &driver_attr_flash_cmd); + driver_remove_file(&drv->driver, &driver_attr_flash_data); + driver_remove_file(&drv->driver, &driver_attr_nonidx_addr); + driver_remove_file(&drv->driver, &driver_attr_nonidx_data); + driver_remove_file(&drv->driver, &driver_attr_idx_addr); + driver_remove_file(&drv->driver, &driver_attr_idx_data); + driver_remove_file(&drv->driver, &driver_attr_wqm_quanta); +} + +/** + * nes_init_module - module initialization entry point + */ +static int __init nes_init_module(void) +{ + int retval; + int retval1; + + retval = nes_cm_start(); + if (retval) { + printk(KERN_ERR PFX "Unable to start NetEffect iWARP CM.\n"); + return retval; + } + retval = pci_register_driver(&nes_pci_driver); + if (retval >= 0) { + retval1 = nes_create_driver_sysfs(&nes_pci_driver); + if (retval1 < 0) + printk(KERN_ERR PFX "Unable to create NetEffect sys files.\n"); + } + return retval; +} + + +/** + * nes_exit_module - module unload entry point + */ +static void __exit nes_exit_module(void) +{ + nes_cm_stop(); + nes_remove_driver_sysfs(&nes_pci_driver); + + pci_unregister_driver(&nes_pci_driver); +} + + +module_init(nes_init_module); +module_exit(nes_exit_module); diff --git a/drivers/infiniband/hw/nes/nes.h b/drivers/infiniband/hw/nes/nes.h new file mode 100644 index 0000000..bd9d132 --- /dev/null +++ b/drivers/infiniband/hw/nes/nes.h @@ -0,0 +1,582 @@ +/* + * Copyright (c) 2006 - 2011 Intel Corporation. All rights reserved. + * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __NES_H +#define __NES_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#define NES_SEND_FIRST_WRITE + +#define QUEUE_DISCONNECTS + +#define DRV_NAME "iw_nes" +#define DRV_VERSION "1.5.0.1" +#define PFX DRV_NAME ": " + +/* + * NetEffect PCI vendor id and NE010 PCI device id. + */ +#ifndef PCI_VENDOR_ID_NETEFFECT /* not in pci.ids yet */ +#define PCI_VENDOR_ID_NETEFFECT 0x1678 +#define PCI_DEVICE_ID_NETEFFECT_NE020 0x0100 +#define PCI_DEVICE_ID_NETEFFECT_NE020_KR 0x0110 +#endif + +#define NE020_REV 4 +#define NE020_REV1 5 + +#define BAR_0 0 +#define BAR_1 2 + +#define RX_BUF_SIZE (1536 + 8) +#define NES_REG0_SIZE (4 * 1024) +#define NES_TX_TIMEOUT (6*HZ) +#define NES_FIRST_QPN 64 +#define NES_SW_CONTEXT_ALIGN 1024 + +#define NES_NIC_MAX_NICS 16 +#define NES_MAX_ARP_TABLE_SIZE 4096 + +#define NES_NIC_CEQ_SIZE 8 +/* NICs will be on a separate CQ */ +#define NES_CCEQ_SIZE ((nesadapter->max_cq / nesadapter->port_count) - 32) + +#define NES_MAX_PORT_COUNT 4 + +#define MAX_DPC_ITERATIONS 128 + +#define NES_DRV_OPT_ENABLE_MPA_VER_0 0x00000001 +#define NES_DRV_OPT_DISABLE_MPA_CRC 0x00000002 +#define NES_DRV_OPT_DISABLE_FIRST_WRITE 0x00000004 +#define NES_DRV_OPT_DISABLE_INTF 0x00000008 +#define NES_DRV_OPT_ENABLE_MSI 0x00000010 +#define NES_DRV_OPT_DUAL_LOGICAL_PORT 0x00000020 +#define NES_DRV_OPT_SUPRESS_OPTION_BC 0x00000040 +#define NES_DRV_OPT_NO_INLINE_DATA 0x00000080 +#define NES_DRV_OPT_DISABLE_INT_MOD 0x00000100 +#define NES_DRV_OPT_DISABLE_VIRT_WQ 0x00000200 +#define NES_DRV_OPT_ENABLE_PAU 0x00000400 + +#define NES_AEQ_EVENT_TIMEOUT 2500 +#define NES_DISCONNECT_EVENT_TIMEOUT 2000 + +/* debug levels */ +/* must match userspace */ +#define NES_DBG_HW 0x00000001 +#define NES_DBG_INIT 0x00000002 +#define NES_DBG_ISR 0x00000004 +#define NES_DBG_PHY 0x00000008 +#define NES_DBG_NETDEV 0x00000010 +#define NES_DBG_CM 0x00000020 +#define NES_DBG_CM1 0x00000040 +#define NES_DBG_NIC_RX 0x00000080 +#define NES_DBG_NIC_TX 0x00000100 +#define NES_DBG_CQP 0x00000200 +#define NES_DBG_MMAP 0x00000400 +#define NES_DBG_MR 0x00000800 +#define NES_DBG_PD 0x00001000 +#define NES_DBG_CQ 0x00002000 +#define NES_DBG_QP 0x00004000 +#define NES_DBG_MOD_QP 0x00008000 +#define NES_DBG_AEQ 0x00010000 +#define NES_DBG_IW_RX 0x00020000 +#define NES_DBG_IW_TX 0x00040000 +#define NES_DBG_SHUTDOWN 0x00080000 +#define NES_DBG_PAU 0x00100000 +#define NES_DBG_NLMSG 0x00200000 +#define NES_DBG_RSVD1 0x10000000 +#define NES_DBG_RSVD2 0x20000000 +#define NES_DBG_RSVD3 0x40000000 +#define NES_DBG_RSVD4 0x80000000 +#define NES_DBG_ALL 0xffffffff + +#ifdef CONFIG_INFINIBAND_NES_DEBUG +#define nes_debug(level, fmt, args...) \ +do { \ + if (level & nes_debug_level) \ + printk(KERN_ERR PFX "%s[%u]: " fmt, __func__, __LINE__, ##args); \ +} while (0) + +#define assert(expr) \ +do { \ + if (!(expr)) { \ + printk(KERN_ERR PFX "Assertion failed! %s, %s, %s, line %d\n", \ + #expr, __FILE__, __func__, __LINE__); \ + } \ +} while (0) + +#define NES_EVENT_TIMEOUT 1200000 +#else +#define nes_debug(level, fmt, args...) +#define assert(expr) do {} while (0) + +#define NES_EVENT_TIMEOUT 100000 +#endif + +#include "nes_hw.h" +#include "nes_verbs.h" +#include "nes_context.h" +#include "nes_user.h" +#include "nes_cm.h" +#include "nes_mgt.h" + +extern int max_mtu; +#define max_frame_len (max_mtu+ETH_HLEN) +extern int interrupt_mod_interval; +extern int nes_if_count; +extern int mpa_version; +extern int disable_mpa_crc; +extern unsigned int nes_drv_opt; +extern unsigned int nes_debug_level; +extern unsigned int wqm_quanta; +extern struct list_head nes_adapter_list; + +extern atomic_t cm_connects; +extern atomic_t cm_accepts; +extern atomic_t cm_disconnects; +extern atomic_t cm_closes; +extern atomic_t cm_connecteds; +extern atomic_t cm_connect_reqs; +extern atomic_t cm_rejects; +extern atomic_t mod_qp_timouts; +extern atomic_t qps_created; +extern atomic_t qps_destroyed; +extern atomic_t sw_qps_destroyed; +extern u32 mh_detected; +extern u32 mh_pauses_sent; +extern u32 cm_packets_sent; +extern u32 cm_packets_bounced; +extern u32 cm_packets_created; +extern u32 cm_packets_received; +extern u32 cm_packets_dropped; +extern u32 cm_packets_retrans; +extern atomic_t cm_listens_created; +extern atomic_t cm_listens_destroyed; +extern u32 cm_backlog_drops; +extern atomic_t cm_loopbacks; +extern atomic_t cm_nodes_created; +extern atomic_t cm_nodes_destroyed; +extern atomic_t cm_accel_dropped_pkts; +extern atomic_t cm_resets_recvd; +extern atomic_t pau_qps_created; +extern atomic_t pau_qps_destroyed; + +extern u32 int_mod_timer_init; +extern u32 int_mod_cq_depth_256; +extern u32 int_mod_cq_depth_128; +extern u32 int_mod_cq_depth_32; +extern u32 int_mod_cq_depth_24; +extern u32 int_mod_cq_depth_16; +extern u32 int_mod_cq_depth_4; +extern u32 int_mod_cq_depth_1; + +struct nes_device { + struct nes_adapter *nesadapter; + void __iomem *regs; + void __iomem *index_reg; + struct pci_dev *pcidev; + struct net_device *netdev[NES_NIC_MAX_NICS]; + u64 link_status_interrupts; + struct tasklet_struct dpc_tasklet; + spinlock_t indexed_regs_lock; + unsigned long csr_start; + unsigned long doorbell_region; + unsigned long doorbell_start; + unsigned long mac_tx_errors; + unsigned long mac_pause_frames_sent; + unsigned long mac_pause_frames_received; + unsigned long mac_rx_errors; + unsigned long mac_rx_crc_errors; + unsigned long mac_rx_symbol_err_frames; + unsigned long mac_rx_jabber_frames; + unsigned long mac_rx_oversized_frames; + unsigned long mac_rx_short_frames; + unsigned long port_rx_discards; + unsigned long port_tx_discards; + unsigned int mac_index; + unsigned int nes_stack_start; + + /* Control Structures */ + void *cqp_vbase; + dma_addr_t cqp_pbase; + u32 cqp_mem_size; + u8 ceq_index; + u8 nic_ceq_index; + struct nes_hw_cqp cqp; + struct nes_hw_cq ccq; + struct list_head cqp_avail_reqs; + struct list_head cqp_pending_reqs; + struct nes_cqp_request *nes_cqp_requests; + + u32 int_req; + u32 int_stat; + u32 timer_int_req; + u32 timer_only_int_count; + u32 intf_int_req; + u32 last_mac_tx_pauses; + u32 last_used_chunks_tx; + struct list_head list; + + u16 base_doorbell_index; + u16 currcq_count; + u16 deepcq_count; + u8 iw_status; + u8 msi_enabled; + u8 netdev_count; + u8 napi_isr_ran; + u8 disable_rx_flow_control; + u8 disable_tx_flow_control; + + struct delayed_work work; + u8 link_recheck; +}; + +/* Receive skb private area - must fit in skb->cb area */ +struct nes_rskb_cb { + u64 busaddr; + u32 maplen; + u32 seqnum; + u8 *data_start; + struct nes_qp *nesqp; +}; + +static inline __le32 get_crc_value(struct nes_v4_quad *nes_quad) +{ + u32 crc_value; + crc_value = crc32c(~0, (void *)nes_quad, sizeof (struct nes_v4_quad)); + + /* + * With commit ef19454b ("[LIB] crc32c: Keep intermediate crc + * state in cpu order"), behavior of crc32c changes on + * big-endian platforms. Our algorithm expects the previous + * behavior; otherwise we have RDMA connection establishment + * issue on big-endian. + */ + return cpu_to_le32(crc_value); +} + +static inline void +set_wqe_64bit_value(__le32 *wqe_words, u32 index, u64 value) +{ + wqe_words[index] = cpu_to_le32((u32) value); + wqe_words[index + 1] = cpu_to_le32(upper_32_bits(value)); +} + +static inline void +set_wqe_32bit_value(__le32 *wqe_words, u32 index, u32 value) +{ + wqe_words[index] = cpu_to_le32(value); +} + +static inline void +nes_fill_init_cqp_wqe(struct nes_hw_cqp_wqe *cqp_wqe, struct nes_device *nesdev) +{ + cqp_wqe->wqe_words[NES_CQP_WQE_COMP_CTX_LOW_IDX] = 0; + cqp_wqe->wqe_words[NES_CQP_WQE_COMP_CTX_HIGH_IDX] = 0; + cqp_wqe->wqe_words[NES_CQP_WQE_COMP_SCRATCH_LOW_IDX] = 0; + cqp_wqe->wqe_words[NES_CQP_WQE_COMP_SCRATCH_HIGH_IDX] = 0; + cqp_wqe->wqe_words[NES_CQP_STAG_WQE_PBL_BLK_COUNT_IDX] = 0; + cqp_wqe->wqe_words[NES_CQP_STAG_WQE_PBL_LEN_IDX] = 0; + cqp_wqe->wqe_words[NES_CQP_STAG_WQE_LEN_LOW_IDX] = 0; + cqp_wqe->wqe_words[NES_CQP_STAG_WQE_PA_LOW_IDX] = 0; + cqp_wqe->wqe_words[NES_CQP_STAG_WQE_PA_HIGH_IDX] = 0; +} + +static inline void +nes_fill_init_qp_wqe(struct nes_hw_qp_wqe *wqe, struct nes_qp *nesqp, u32 head) +{ + u32 value; + value = ((u32)((unsigned long) nesqp)) | head; + set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_COMP_CTX_HIGH_IDX, + (u32)(upper_32_bits((unsigned long)(nesqp)))); + set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_COMP_CTX_LOW_IDX, value); +} + +/* Read from memory-mapped device */ +static inline u32 nes_read_indexed(struct nes_device *nesdev, u32 reg_index) +{ + unsigned long flags; + void __iomem *addr = nesdev->index_reg; + u32 value; + + spin_lock_irqsave(&nesdev->indexed_regs_lock, flags); + + writel(reg_index, addr); + value = readl((void __iomem *)addr + 4); + + spin_unlock_irqrestore(&nesdev->indexed_regs_lock, flags); + return value; +} + +static inline u32 nes_read32(const void __iomem *addr) +{ + return readl(addr); +} + +static inline u16 nes_read16(const void __iomem *addr) +{ + return readw(addr); +} + +static inline u8 nes_read8(const void __iomem *addr) +{ + return readb(addr); +} + +/* Write to memory-mapped device */ +static inline void nes_write_indexed(struct nes_device *nesdev, u32 reg_index, u32 val) +{ + unsigned long flags; + void __iomem *addr = nesdev->index_reg; + + spin_lock_irqsave(&nesdev->indexed_regs_lock, flags); + + writel(reg_index, addr); + writel(val, (void __iomem *)addr + 4); + + spin_unlock_irqrestore(&nesdev->indexed_regs_lock, flags); +} + +static inline void nes_write32(void __iomem *addr, u32 val) +{ + writel(val, addr); +} + +static inline void nes_write16(void __iomem *addr, u16 val) +{ + writew(val, addr); +} + +static inline void nes_write8(void __iomem *addr, u8 val) +{ + writeb(val, addr); +} + +enum nes_resource { + NES_RESOURCE_MW = 1, + NES_RESOURCE_FAST_MR, + NES_RESOURCE_PHYS_MR, + NES_RESOURCE_USER_MR, + NES_RESOURCE_PD, + NES_RESOURCE_QP, + NES_RESOURCE_CQ, + NES_RESOURCE_ARP +}; + +static inline int nes_alloc_resource(struct nes_adapter *nesadapter, + unsigned long *resource_array, u32 max_resources, + u32 *req_resource_num, u32 *next, enum nes_resource resource_type) +{ + unsigned long flags; + u32 resource_num; + + spin_lock_irqsave(&nesadapter->resource_lock, flags); + + resource_num = find_next_zero_bit(resource_array, max_resources, *next); + if (resource_num >= max_resources) { + resource_num = find_first_zero_bit(resource_array, max_resources); + if (resource_num >= max_resources) { + printk(KERN_ERR PFX "%s: No available resources [type=%u].\n", __func__, resource_type); + spin_unlock_irqrestore(&nesadapter->resource_lock, flags); + return -EMFILE; + } + } + set_bit(resource_num, resource_array); + *next = resource_num+1; + if (*next == max_resources) { + *next = 0; + } + spin_unlock_irqrestore(&nesadapter->resource_lock, flags); + *req_resource_num = resource_num; + + return 0; +} + +static inline int nes_is_resource_allocated(struct nes_adapter *nesadapter, + unsigned long *resource_array, u32 resource_num) +{ + unsigned long flags; + int bit_is_set; + + spin_lock_irqsave(&nesadapter->resource_lock, flags); + + bit_is_set = test_bit(resource_num, resource_array); + nes_debug(NES_DBG_HW, "resource_num %u is%s allocated.\n", + resource_num, (bit_is_set ? "": " not")); + spin_unlock_irqrestore(&nesadapter->resource_lock, flags); + + return bit_is_set; +} + +static inline void nes_free_resource(struct nes_adapter *nesadapter, + unsigned long *resource_array, u32 resource_num) +{ + unsigned long flags; + + spin_lock_irqsave(&nesadapter->resource_lock, flags); + clear_bit(resource_num, resource_array); + spin_unlock_irqrestore(&nesadapter->resource_lock, flags); +} + +static inline struct nes_vnic *to_nesvnic(struct ib_device *ibdev) +{ + return container_of(ibdev, struct nes_ib_device, ibdev)->nesvnic; +} + +static inline struct nes_pd *to_nespd(struct ib_pd *ibpd) +{ + return container_of(ibpd, struct nes_pd, ibpd); +} + +static inline struct nes_ucontext *to_nesucontext(struct ib_ucontext *ibucontext) +{ + return container_of(ibucontext, struct nes_ucontext, ibucontext); +} + +static inline struct nes_mr *to_nesmr(struct ib_mr *ibmr) +{ + return container_of(ibmr, struct nes_mr, ibmr); +} + +static inline struct nes_mr *to_nesmr_from_ibfmr(struct ib_fmr *ibfmr) +{ + return container_of(ibfmr, struct nes_mr, ibfmr); +} + +static inline struct nes_mr *to_nesmw(struct ib_mw *ibmw) +{ + return container_of(ibmw, struct nes_mr, ibmw); +} + +static inline struct nes_fmr *to_nesfmr(struct nes_mr *nesmr) +{ + return container_of(nesmr, struct nes_fmr, nesmr); +} + +static inline struct nes_cq *to_nescq(struct ib_cq *ibcq) +{ + return container_of(ibcq, struct nes_cq, ibcq); +} + +static inline struct nes_qp *to_nesqp(struct ib_qp *ibqp) +{ + return container_of(ibqp, struct nes_qp, ibqp); +} + + + +/* nes.c */ +void nes_add_ref(struct ib_qp *); +void nes_rem_ref(struct ib_qp *); +struct ib_qp *nes_get_qp(struct ib_device *, int); + + +/* nes_hw.c */ +struct nes_adapter *nes_init_adapter(struct nes_device *, u8); +void nes_nic_init_timer_defaults(struct nes_device *, u8); +void nes_destroy_adapter(struct nes_adapter *); +int nes_init_cqp(struct nes_device *); +int nes_init_phy(struct nes_device *); +int nes_init_nic_qp(struct nes_device *, struct net_device *); +void nes_destroy_nic_qp(struct nes_vnic *); +int nes_napi_isr(struct nes_device *); +void nes_dpc(unsigned long); +void nes_nic_ce_handler(struct nes_device *, struct nes_hw_nic_cq *); +void nes_iwarp_ce_handler(struct nes_device *, struct nes_hw_cq *); +int nes_destroy_cqp(struct nes_device *); +int nes_nic_cm_xmit(struct sk_buff *, struct net_device *); +void nes_recheck_link_status(struct work_struct *work); +void nes_terminate_timeout(unsigned long context); + +/* nes_nic.c */ +struct net_device *nes_netdev_init(struct nes_device *, void __iomem *); +void nes_netdev_destroy(struct net_device *); +int nes_nic_cm_xmit(struct sk_buff *, struct net_device *); + +/* nes_cm.c */ +void *nes_cm_create(struct net_device *); +int nes_cm_recv(struct sk_buff *, struct net_device *); +void nes_update_arp(unsigned char *, u32, u32, u16, u16); +void nes_manage_arp_cache(struct net_device *, unsigned char *, u32, u32); +void nes_sock_release(struct nes_qp *, unsigned long *); +void flush_wqes(struct nes_device *nesdev, struct nes_qp *, u32, u32); +int nes_manage_apbvt(struct nes_vnic *, u32, u32, u32); +int nes_cm_disconn(struct nes_qp *); +void nes_cm_disconn_worker(void *); + +/* nes_verbs.c */ +int nes_hw_modify_qp(struct nes_device *, struct nes_qp *, u32, u32, u32); +int nes_modify_qp(struct ib_qp *, struct ib_qp_attr *, int, struct ib_udata *); +struct nes_ib_device *nes_init_ofa_device(struct net_device *); +void nes_port_ibevent(struct nes_vnic *nesvnic); +void nes_destroy_ofa_device(struct nes_ib_device *); +int nes_register_ofa_device(struct nes_ib_device *); + +/* nes_util.c */ +int nes_read_eeprom_values(struct nes_device *, struct nes_adapter *); +void nes_write_1G_phy_reg(struct nes_device *, u8, u8, u16); +void nes_read_1G_phy_reg(struct nes_device *, u8, u8, u16 *); +void nes_write_10G_phy_reg(struct nes_device *, u16, u8, u16, u16); +void nes_read_10G_phy_reg(struct nes_device *, u8, u8, u16); +struct nes_cqp_request *nes_get_cqp_request(struct nes_device *); +void nes_free_cqp_request(struct nes_device *nesdev, + struct nes_cqp_request *cqp_request); +void nes_put_cqp_request(struct nes_device *nesdev, + struct nes_cqp_request *cqp_request); +void nes_post_cqp_request(struct nes_device *, struct nes_cqp_request *); +int nes_arp_table(struct nes_device *, u32, u8 *, u32); +void nes_mh_fix(unsigned long); +void nes_clc(unsigned long); +void nes_dump_mem(unsigned int, void *, int); +u32 nes_crc32(u32, u32, u32, u32, u8 *, u32, u32, u32); + +#endif /* __NES_H */ diff --git a/drivers/infiniband/hw/nes/nes_cm.c b/drivers/infiniband/hw/nes/nes_cm.c new file mode 100644 index 0000000..6f09a72 --- /dev/null +++ b/drivers/infiniband/hw/nes/nes_cm.c @@ -0,0 +1,4153 @@ +/* + * Copyright (c) 2006 - 2014 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + + +#define TCPOPT_TIMESTAMP 8 + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "nes.h" + +u32 cm_packets_sent; +u32 cm_packets_bounced; +u32 cm_packets_dropped; +u32 cm_packets_retrans; +u32 cm_packets_created; +u32 cm_packets_received; +atomic_t cm_listens_created; +atomic_t cm_listens_destroyed; +u32 cm_backlog_drops; +atomic_t cm_loopbacks; +atomic_t cm_nodes_created; +atomic_t cm_nodes_destroyed; +atomic_t cm_accel_dropped_pkts; +atomic_t cm_resets_recvd; + +static inline int mini_cm_accelerated(struct nes_cm_core *, struct nes_cm_node *); +static struct nes_cm_listener *mini_cm_listen(struct nes_cm_core *, struct nes_vnic *, struct nes_cm_info *); +static int mini_cm_del_listen(struct nes_cm_core *, struct nes_cm_listener *); +static struct nes_cm_node *mini_cm_connect(struct nes_cm_core *, struct nes_vnic *, u16, void *, struct nes_cm_info *); +static int mini_cm_close(struct nes_cm_core *, struct nes_cm_node *); +static int mini_cm_accept(struct nes_cm_core *, struct nes_cm_node *); +static int mini_cm_reject(struct nes_cm_core *, struct nes_cm_node *); +static int mini_cm_recv_pkt(struct nes_cm_core *, struct nes_vnic *, struct sk_buff *); +static int mini_cm_dealloc_core(struct nes_cm_core *); +static int mini_cm_get(struct nes_cm_core *); +static int mini_cm_set(struct nes_cm_core *, u32, u32); + +static void form_cm_frame(struct sk_buff *, struct nes_cm_node *, void *, u32, void *, u32, u8); +static int add_ref_cm_node(struct nes_cm_node *); +static int rem_ref_cm_node(struct nes_cm_core *, struct nes_cm_node *); + +static int nes_cm_disconn_true(struct nes_qp *); +static int nes_cm_post_event(struct nes_cm_event *event); +static int nes_disconnect(struct nes_qp *nesqp, int abrupt); +static void nes_disconnect_worker(struct work_struct *work); + +static int send_mpa_request(struct nes_cm_node *, struct sk_buff *); +static int send_mpa_reject(struct nes_cm_node *); +static int send_syn(struct nes_cm_node *, u32, struct sk_buff *); +static int send_reset(struct nes_cm_node *, struct sk_buff *); +static int send_ack(struct nes_cm_node *cm_node, struct sk_buff *skb); +static int send_fin(struct nes_cm_node *cm_node, struct sk_buff *skb); +static void process_packet(struct nes_cm_node *, struct sk_buff *, struct nes_cm_core *); + +static void active_open_err(struct nes_cm_node *, struct sk_buff *, int); +static void passive_open_err(struct nes_cm_node *, struct sk_buff *, int); +static void cleanup_retrans_entry(struct nes_cm_node *); +static void handle_rcv_mpa(struct nes_cm_node *, struct sk_buff *); +static void free_retrans_entry(struct nes_cm_node *cm_node); +static int handle_tcp_options(struct nes_cm_node *cm_node, struct tcphdr *tcph, struct sk_buff *skb, int optionsize, int passive); + +/* CM event handler functions */ +static void cm_event_connected(struct nes_cm_event *); +static void cm_event_connect_error(struct nes_cm_event *); +static void cm_event_reset(struct nes_cm_event *); +static void cm_event_mpa_req(struct nes_cm_event *); +static void cm_event_mpa_reject(struct nes_cm_event *); +static void handle_recv_entry(struct nes_cm_node *cm_node, u32 rem_node); + +/* MPA build functions */ +static int cm_build_mpa_frame(struct nes_cm_node *, u8 **, u16 *, u8 *, u8); +static void build_mpa_v2(struct nes_cm_node *, void *, u8); +static void build_mpa_v1(struct nes_cm_node *, void *, u8); +static void build_rdma0_msg(struct nes_cm_node *, struct nes_qp **); + +static void print_core(struct nes_cm_core *core); +static void record_ird_ord(struct nes_cm_node *, u16, u16); + +/* External CM API Interface */ +/* instance of function pointers for client API */ +/* set address of this instance to cm_core->cm_ops at cm_core alloc */ +static struct nes_cm_ops nes_cm_api = { + mini_cm_accelerated, + mini_cm_listen, + mini_cm_del_listen, + mini_cm_connect, + mini_cm_close, + mini_cm_accept, + mini_cm_reject, + mini_cm_recv_pkt, + mini_cm_dealloc_core, + mini_cm_get, + mini_cm_set +}; + +static struct nes_cm_core *g_cm_core; + +atomic_t cm_connects; +atomic_t cm_accepts; +atomic_t cm_disconnects; +atomic_t cm_closes; +atomic_t cm_connecteds; +atomic_t cm_connect_reqs; +atomic_t cm_rejects; + +int nes_add_ref_cm_node(struct nes_cm_node *cm_node) +{ + return add_ref_cm_node(cm_node); +} + +int nes_rem_ref_cm_node(struct nes_cm_node *cm_node) +{ + return rem_ref_cm_node(cm_node->cm_core, cm_node); +} +/** + * create_event + */ +static struct nes_cm_event *create_event(struct nes_cm_node * cm_node, + enum nes_cm_event_type type) +{ + struct nes_cm_event *event; + + if (!cm_node->cm_id) + return NULL; + + /* allocate an empty event */ + event = kzalloc(sizeof(*event), GFP_ATOMIC); + + if (!event) + return NULL; + + event->type = type; + event->cm_node = cm_node; + event->cm_info.rem_addr = cm_node->rem_addr; + event->cm_info.loc_addr = cm_node->loc_addr; + event->cm_info.rem_port = cm_node->rem_port; + event->cm_info.loc_port = cm_node->loc_port; + event->cm_info.cm_id = cm_node->cm_id; + + nes_debug(NES_DBG_CM, "cm_node=%p Created event=%p, type=%u, " + "dst_addr=%08x[%x], src_addr=%08x[%x]\n", + cm_node, event, type, event->cm_info.loc_addr, + event->cm_info.loc_port, event->cm_info.rem_addr, + event->cm_info.rem_port); + + nes_cm_post_event(event); + return event; +} + + +/** + * send_mpa_request + */ +static int send_mpa_request(struct nes_cm_node *cm_node, struct sk_buff *skb) +{ + u8 start_addr = 0; + u8 *start_ptr = &start_addr; + u8 **start_buff = &start_ptr; + u16 buff_len = 0; + + if (!skb) { + nes_debug(NES_DBG_CM, "skb set to NULL\n"); + return -1; + } + + /* send an MPA Request frame */ + cm_build_mpa_frame(cm_node, start_buff, &buff_len, NULL, MPA_KEY_REQUEST); + form_cm_frame(skb, cm_node, NULL, 0, *start_buff, buff_len, SET_ACK); + + return schedule_nes_timer(cm_node, skb, NES_TIMER_TYPE_SEND, 1, 0); +} + + + +static int send_mpa_reject(struct nes_cm_node *cm_node) +{ + struct sk_buff *skb = NULL; + u8 start_addr = 0; + u8 *start_ptr = &start_addr; + u8 **start_buff = &start_ptr; + u16 buff_len = 0; + struct ietf_mpa_v1 *mpa_frame; + + skb = dev_alloc_skb(MAX_CM_BUFFER); + if (!skb) { + nes_debug(NES_DBG_CM, "Failed to get a Free pkt\n"); + return -ENOMEM; + } + + /* send an MPA reject frame */ + cm_build_mpa_frame(cm_node, start_buff, &buff_len, NULL, MPA_KEY_REPLY); + mpa_frame = (struct ietf_mpa_v1 *)*start_buff; + mpa_frame->flags |= IETF_MPA_FLAGS_REJECT; + form_cm_frame(skb, cm_node, NULL, 0, *start_buff, buff_len, SET_ACK | SET_FIN); + + cm_node->state = NES_CM_STATE_FIN_WAIT1; + return schedule_nes_timer(cm_node, skb, NES_TIMER_TYPE_SEND, 1, 0); +} + + +/** + * recv_mpa - process a received TCP pkt, we are expecting an + * IETF MPA frame + */ +static int parse_mpa(struct nes_cm_node *cm_node, u8 *buffer, u32 *type, + u32 len) +{ + struct ietf_mpa_v1 *mpa_frame; + struct ietf_mpa_v2 *mpa_v2_frame; + struct ietf_rtr_msg *rtr_msg; + int mpa_hdr_len; + int priv_data_len; + + *type = NES_MPA_REQUEST_ACCEPT; + + /* assume req frame is in tcp data payload */ + if (len < sizeof(struct ietf_mpa_v1)) { + nes_debug(NES_DBG_CM, "The received ietf buffer was too small (%x)\n", len); + return -EINVAL; + } + + /* points to the beginning of the frame, which could be MPA V1 or V2 */ + mpa_frame = (struct ietf_mpa_v1 *)buffer; + mpa_hdr_len = sizeof(struct ietf_mpa_v1); + priv_data_len = ntohs(mpa_frame->priv_data_len); + + /* make sure mpa private data len is less than 512 bytes */ + if (priv_data_len > IETF_MAX_PRIV_DATA_LEN) { + nes_debug(NES_DBG_CM, "The received Length of Private" + " Data field exceeds 512 octets\n"); + return -EINVAL; + } + /* + * make sure MPA receiver interoperate with the + * received MPA version and MPA key information + * + */ + if (mpa_frame->rev != IETF_MPA_V1 && mpa_frame->rev != IETF_MPA_V2) { + nes_debug(NES_DBG_CM, "The received mpa version" + " is not supported\n"); + return -EINVAL; + } + /* + * backwards compatibility only + */ + if (mpa_frame->rev > cm_node->mpa_frame_rev) { + nes_debug(NES_DBG_CM, "The received mpa version" + " can not be interoperated\n"); + return -EINVAL; + } else { + cm_node->mpa_frame_rev = mpa_frame->rev; + } + + if (cm_node->state != NES_CM_STATE_MPAREQ_SENT) { + if (memcmp(mpa_frame->key, IEFT_MPA_KEY_REQ, IETF_MPA_KEY_SIZE)) { + nes_debug(NES_DBG_CM, "Unexpected MPA Key received \n"); + return -EINVAL; + } + } else { + if (memcmp(mpa_frame->key, IEFT_MPA_KEY_REP, IETF_MPA_KEY_SIZE)) { + nes_debug(NES_DBG_CM, "Unexpected MPA Key received \n"); + return -EINVAL; + } + } + + if (priv_data_len + mpa_hdr_len != len) { + nes_debug(NES_DBG_CM, "The received ietf buffer was not right" + " complete (%x + %x != %x)\n", + priv_data_len, mpa_hdr_len, len); + return -EINVAL; + } + /* make sure it does not exceed the max size */ + if (len > MAX_CM_BUFFER) { + nes_debug(NES_DBG_CM, "The received ietf buffer was too large" + " (%x + %x != %x)\n", + priv_data_len, mpa_hdr_len, len); + return -EINVAL; + } + + cm_node->mpa_frame_size = priv_data_len; + + switch (mpa_frame->rev) { + case IETF_MPA_V2: { + u16 ird_size; + u16 ord_size; + u16 rtr_ctrl_ird; + u16 rtr_ctrl_ord; + + mpa_v2_frame = (struct ietf_mpa_v2 *)buffer; + mpa_hdr_len += IETF_RTR_MSG_SIZE; + cm_node->mpa_frame_size -= IETF_RTR_MSG_SIZE; + rtr_msg = &mpa_v2_frame->rtr_msg; + + /* parse rtr message */ + rtr_ctrl_ird = ntohs(rtr_msg->ctrl_ird); + rtr_ctrl_ord = ntohs(rtr_msg->ctrl_ord); + ird_size = rtr_ctrl_ird & IETF_NO_IRD_ORD; + ord_size = rtr_ctrl_ord & IETF_NO_IRD_ORD; + + if (!(rtr_ctrl_ird & IETF_PEER_TO_PEER)) { + /* send reset */ + return -EINVAL; + } + if (ird_size == IETF_NO_IRD_ORD || ord_size == IETF_NO_IRD_ORD) + cm_node->mpav2_ird_ord = IETF_NO_IRD_ORD; + + if (cm_node->mpav2_ird_ord != IETF_NO_IRD_ORD) { + /* responder */ + if (cm_node->state != NES_CM_STATE_MPAREQ_SENT) { + /* we are still negotiating */ + if (ord_size > NES_MAX_IRD) { + cm_node->ird_size = NES_MAX_IRD; + } else { + cm_node->ird_size = ord_size; + if (ord_size == 0 && + (rtr_ctrl_ord & IETF_RDMA0_READ)) { + cm_node->ird_size = 1; + nes_debug(NES_DBG_CM, + "%s: Remote peer doesn't support RDMA0_READ (ord=%u)\n", + __func__, ord_size); + } + } + if (ird_size > NES_MAX_ORD) + cm_node->ord_size = NES_MAX_ORD; + else + cm_node->ord_size = ird_size; + } else { /* initiator */ + if (ord_size > NES_MAX_IRD) { + nes_debug(NES_DBG_CM, + "%s: Unable to support the requested (ord =%u)\n", + __func__, ord_size); + return -EINVAL; + } + cm_node->ird_size = ord_size; + + if (ird_size > NES_MAX_ORD) { + cm_node->ord_size = NES_MAX_ORD; + } else { + if (ird_size == 0 && + (rtr_ctrl_ord & IETF_RDMA0_READ)) { + nes_debug(NES_DBG_CM, + "%s: Remote peer doesn't support RDMA0_READ (ird=%u)\n", + __func__, ird_size); + return -EINVAL; + } else { + cm_node->ord_size = ird_size; + } + } + } + } + + if (rtr_ctrl_ord & IETF_RDMA0_READ) { + cm_node->send_rdma0_op = SEND_RDMA_READ_ZERO; + + } else if (rtr_ctrl_ord & IETF_RDMA0_WRITE) { + cm_node->send_rdma0_op = SEND_RDMA_WRITE_ZERO; + } else { /* Not supported RDMA0 operation */ + return -EINVAL; + } + break; + } + case IETF_MPA_V1: + default: + break; + } + + /* copy entire MPA frame to our cm_node's frame */ + memcpy(cm_node->mpa_frame_buf, buffer + mpa_hdr_len, cm_node->mpa_frame_size); + + if (mpa_frame->flags & IETF_MPA_FLAGS_REJECT) + *type = NES_MPA_REQUEST_REJECT; + return 0; +} + + +/** + * form_cm_frame - get a free packet and build empty frame Use + * node info to build. + */ +static void form_cm_frame(struct sk_buff *skb, + struct nes_cm_node *cm_node, void *options, u32 optionsize, + void *data, u32 datasize, u8 flags) +{ + struct tcphdr *tcph; + struct iphdr *iph; + struct ethhdr *ethh; + u8 *buf; + u16 packetsize = sizeof(*iph); + + packetsize += sizeof(*tcph); + packetsize += optionsize + datasize; + + skb_trim(skb, 0); + memset(skb->data, 0x00, ETH_HLEN + sizeof(*iph) + sizeof(*tcph)); + + buf = skb_put(skb, packetsize + ETH_HLEN); + + ethh = (struct ethhdr *)buf; + buf += ETH_HLEN; + + iph = (struct iphdr *)buf; + buf += sizeof(*iph); + tcph = (struct tcphdr *)buf; + skb_reset_mac_header(skb); + skb_set_network_header(skb, ETH_HLEN); + skb_set_transport_header(skb, ETH_HLEN + sizeof(*iph)); + buf += sizeof(*tcph); + + skb->ip_summed = CHECKSUM_PARTIAL; + if (!(cm_node->netdev->features & NETIF_F_IP_CSUM)) + skb->ip_summed = CHECKSUM_NONE; + skb->protocol = htons(0x800); + skb->data_len = 0; + skb->mac_len = ETH_HLEN; + + memcpy(ethh->h_dest, cm_node->rem_mac, ETH_ALEN); + memcpy(ethh->h_source, cm_node->loc_mac, ETH_ALEN); + ethh->h_proto = htons(0x0800); + + iph->version = IPVERSION; + iph->ihl = 5; /* 5 * 4Byte words, IP headr len */ + iph->tos = 0; + iph->tot_len = htons(packetsize); + iph->id = htons(++cm_node->tcp_cntxt.loc_id); + + iph->frag_off = htons(0x4000); + iph->ttl = 0x40; + iph->protocol = 0x06; /* IPPROTO_TCP */ + + iph->saddr = htonl(cm_node->mapped_loc_addr); + iph->daddr = htonl(cm_node->mapped_rem_addr); + + tcph->source = htons(cm_node->mapped_loc_port); + tcph->dest = htons(cm_node->mapped_rem_port); + tcph->seq = htonl(cm_node->tcp_cntxt.loc_seq_num); + + if (flags & SET_ACK) { + cm_node->tcp_cntxt.loc_ack_num = cm_node->tcp_cntxt.rcv_nxt; + tcph->ack_seq = htonl(cm_node->tcp_cntxt.loc_ack_num); + tcph->ack = 1; + } else { + tcph->ack_seq = 0; + } + + if (flags & SET_SYN) { + cm_node->tcp_cntxt.loc_seq_num++; + tcph->syn = 1; + } else { + cm_node->tcp_cntxt.loc_seq_num += datasize; + } + + if (flags & SET_FIN) { + cm_node->tcp_cntxt.loc_seq_num++; + tcph->fin = 1; + } + + if (flags & SET_RST) + tcph->rst = 1; + + tcph->doff = (u16)((sizeof(*tcph) + optionsize + 3) >> 2); + tcph->window = htons(cm_node->tcp_cntxt.rcv_wnd); + tcph->urg_ptr = 0; + if (optionsize) + memcpy(buf, options, optionsize); + buf += optionsize; + if (datasize) + memcpy(buf, data, datasize); + + skb_shinfo(skb)->nr_frags = 0; + cm_packets_created++; +} + +/* + * nes_create_sockaddr - Record ip addr and tcp port in a sockaddr struct + */ +static void nes_create_sockaddr(__be32 ip_addr, __be16 port, + struct sockaddr_storage *addr) +{ + struct sockaddr_in *nes_sockaddr = (struct sockaddr_in *)addr; + nes_sockaddr->sin_family = AF_INET; + memcpy(&nes_sockaddr->sin_addr.s_addr, &ip_addr, sizeof(__be32)); + nes_sockaddr->sin_port = port; +} + +/* + * nes_create_mapinfo - Create a mapinfo object in the port mapper data base + */ +static int nes_create_mapinfo(struct nes_cm_info *cm_info) +{ + struct sockaddr_storage local_sockaddr; + struct sockaddr_storage mapped_sockaddr; + + nes_create_sockaddr(htonl(cm_info->loc_addr), htons(cm_info->loc_port), + &local_sockaddr); + nes_create_sockaddr(htonl(cm_info->mapped_loc_addr), + htons(cm_info->mapped_loc_port), &mapped_sockaddr); + + return iwpm_create_mapinfo(&local_sockaddr, + &mapped_sockaddr, RDMA_NL_NES); +} + +/* + * nes_remove_mapinfo - Remove a mapinfo object from the port mapper data base + * and send a remove mapping op message to + * the userspace port mapper + */ +static int nes_remove_mapinfo(u32 loc_addr, u16 loc_port, + u32 mapped_loc_addr, u16 mapped_loc_port) +{ + struct sockaddr_storage local_sockaddr; + struct sockaddr_storage mapped_sockaddr; + + nes_create_sockaddr(htonl(loc_addr), htons(loc_port), &local_sockaddr); + nes_create_sockaddr(htonl(mapped_loc_addr), htons(mapped_loc_port), + &mapped_sockaddr); + + iwpm_remove_mapinfo(&local_sockaddr, &mapped_sockaddr); + return iwpm_remove_mapping(&local_sockaddr, RDMA_NL_NES); +} + +/* + * nes_form_pm_msg - Form a port mapper message with mapping info + */ +static void nes_form_pm_msg(struct nes_cm_info *cm_info, + struct iwpm_sa_data *pm_msg) +{ + nes_create_sockaddr(htonl(cm_info->loc_addr), htons(cm_info->loc_port), + &pm_msg->loc_addr); + nes_create_sockaddr(htonl(cm_info->rem_addr), htons(cm_info->rem_port), + &pm_msg->rem_addr); +} + +/* + * nes_form_reg_msg - Form a port mapper message with dev info + */ +static void nes_form_reg_msg(struct nes_vnic *nesvnic, + struct iwpm_dev_data *pm_msg) +{ + memcpy(pm_msg->dev_name, nesvnic->nesibdev->ibdev.name, + IWPM_DEVNAME_SIZE); + memcpy(pm_msg->if_name, nesvnic->netdev->name, IWPM_IFNAME_SIZE); +} + +/* + * nes_record_pm_msg - Save the received mapping info + */ +static void nes_record_pm_msg(struct nes_cm_info *cm_info, + struct iwpm_sa_data *pm_msg) +{ + struct sockaddr_in *mapped_loc_addr = + (struct sockaddr_in *)&pm_msg->mapped_loc_addr; + struct sockaddr_in *mapped_rem_addr = + (struct sockaddr_in *)&pm_msg->mapped_rem_addr; + + if (mapped_loc_addr->sin_family == AF_INET) { + cm_info->mapped_loc_addr = + ntohl(mapped_loc_addr->sin_addr.s_addr); + cm_info->mapped_loc_port = ntohs(mapped_loc_addr->sin_port); + } + if (mapped_rem_addr->sin_family == AF_INET) { + cm_info->mapped_rem_addr = + ntohl(mapped_rem_addr->sin_addr.s_addr); + cm_info->mapped_rem_port = ntohs(mapped_rem_addr->sin_port); + } +} + +/** + * print_core - dump a cm core + */ +static void print_core(struct nes_cm_core *core) +{ + nes_debug(NES_DBG_CM, "---------------------------------------------\n"); + nes_debug(NES_DBG_CM, "CM Core -- (core = %p )\n", core); + if (!core) + return; + nes_debug(NES_DBG_CM, "---------------------------------------------\n"); + + nes_debug(NES_DBG_CM, "State : %u \n", core->state); + + nes_debug(NES_DBG_CM, "Listen Nodes : %u \n", atomic_read(&core->listen_node_cnt)); + nes_debug(NES_DBG_CM, "Active Nodes : %u \n", atomic_read(&core->node_cnt)); + + nes_debug(NES_DBG_CM, "core : %p \n", core); + + nes_debug(NES_DBG_CM, "-------------- end core ---------------\n"); +} + +static void record_ird_ord(struct nes_cm_node *cm_node, + u16 conn_ird, u16 conn_ord) +{ + if (conn_ird > NES_MAX_IRD) + conn_ird = NES_MAX_IRD; + + if (conn_ord > NES_MAX_ORD) + conn_ord = NES_MAX_ORD; + + cm_node->ird_size = conn_ird; + cm_node->ord_size = conn_ord; +} + +/** + * cm_build_mpa_frame - build a MPA V1 frame or MPA V2 frame + */ +static int cm_build_mpa_frame(struct nes_cm_node *cm_node, u8 **start_buff, + u16 *buff_len, u8 *pci_mem, u8 mpa_key) +{ + int ret = 0; + + *start_buff = (pci_mem) ? pci_mem : &cm_node->mpa_frame_buf[0]; + + switch (cm_node->mpa_frame_rev) { + case IETF_MPA_V1: + *start_buff = (u8 *)*start_buff + sizeof(struct ietf_rtr_msg); + *buff_len = sizeof(struct ietf_mpa_v1) + cm_node->mpa_frame_size; + build_mpa_v1(cm_node, *start_buff, mpa_key); + break; + case IETF_MPA_V2: + *buff_len = sizeof(struct ietf_mpa_v2) + cm_node->mpa_frame_size; + build_mpa_v2(cm_node, *start_buff, mpa_key); + break; + default: + ret = -EINVAL; + } + return ret; +} + +/** + * build_mpa_v2 - build a MPA V2 frame + */ +static void build_mpa_v2(struct nes_cm_node *cm_node, + void *start_addr, u8 mpa_key) +{ + struct ietf_mpa_v2 *mpa_frame = (struct ietf_mpa_v2 *)start_addr; + struct ietf_rtr_msg *rtr_msg = &mpa_frame->rtr_msg; + u16 ctrl_ird; + u16 ctrl_ord; + + /* initialize the upper 5 bytes of the frame */ + build_mpa_v1(cm_node, start_addr, mpa_key); + mpa_frame->flags |= IETF_MPA_V2_FLAG; /* set a bit to indicate MPA V2 */ + mpa_frame->priv_data_len += htons(IETF_RTR_MSG_SIZE); + + /* initialize RTR msg */ + if (cm_node->mpav2_ird_ord == IETF_NO_IRD_ORD) { + ctrl_ird = IETF_NO_IRD_ORD; + ctrl_ord = IETF_NO_IRD_ORD; + } else { + ctrl_ird = cm_node->ird_size & IETF_NO_IRD_ORD; + ctrl_ord = cm_node->ord_size & IETF_NO_IRD_ORD; + } + ctrl_ird |= IETF_PEER_TO_PEER; + ctrl_ird |= IETF_FLPDU_ZERO_LEN; + + switch (mpa_key) { + case MPA_KEY_REQUEST: + ctrl_ord |= IETF_RDMA0_WRITE; + ctrl_ord |= IETF_RDMA0_READ; + break; + case MPA_KEY_REPLY: + switch (cm_node->send_rdma0_op) { + case SEND_RDMA_WRITE_ZERO: + ctrl_ord |= IETF_RDMA0_WRITE; + break; + case SEND_RDMA_READ_ZERO: + ctrl_ord |= IETF_RDMA0_READ; + break; + } + } + rtr_msg->ctrl_ird = htons(ctrl_ird); + rtr_msg->ctrl_ord = htons(ctrl_ord); +} + +/** + * build_mpa_v1 - build a MPA V1 frame + */ +static void build_mpa_v1(struct nes_cm_node *cm_node, void *start_addr, u8 mpa_key) +{ + struct ietf_mpa_v1 *mpa_frame = (struct ietf_mpa_v1 *)start_addr; + + switch (mpa_key) { + case MPA_KEY_REQUEST: + memcpy(mpa_frame->key, IEFT_MPA_KEY_REQ, IETF_MPA_KEY_SIZE); + break; + case MPA_KEY_REPLY: + memcpy(mpa_frame->key, IEFT_MPA_KEY_REP, IETF_MPA_KEY_SIZE); + break; + } + mpa_frame->flags = IETF_MPA_FLAGS_CRC; + mpa_frame->rev = cm_node->mpa_frame_rev; + mpa_frame->priv_data_len = htons(cm_node->mpa_frame_size); +} + +static void build_rdma0_msg(struct nes_cm_node *cm_node, struct nes_qp **nesqp_addr) +{ + u64 u64temp; + struct nes_qp *nesqp = *nesqp_addr; + struct nes_hw_qp_wqe *wqe = &nesqp->hwqp.sq_vbase[0]; + + u64temp = (unsigned long)nesqp->nesuqp_addr; + u64temp |= NES_SW_CONTEXT_ALIGN >> 1; + set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_COMP_CTX_LOW_IDX, u64temp); + + wqe->wqe_words[NES_IWARP_SQ_WQE_FRAG0_LOW_IDX] = 0; + wqe->wqe_words[NES_IWARP_SQ_WQE_FRAG0_HIGH_IDX] = 0; + + switch (cm_node->send_rdma0_op) { + case SEND_RDMA_WRITE_ZERO: + nes_debug(NES_DBG_CM, "Sending first write.\n"); + wqe->wqe_words[NES_IWARP_SQ_WQE_MISC_IDX] = + cpu_to_le32(NES_IWARP_SQ_OP_RDMAW); + wqe->wqe_words[NES_IWARP_SQ_WQE_TOTAL_PAYLOAD_IDX] = 0; + wqe->wqe_words[NES_IWARP_SQ_WQE_LENGTH0_IDX] = 0; + wqe->wqe_words[NES_IWARP_SQ_WQE_STAG0_IDX] = 0; + break; + + case SEND_RDMA_READ_ZERO: + default: + if (cm_node->send_rdma0_op != SEND_RDMA_READ_ZERO) + WARN(1, "Unsupported RDMA0 len operation=%u\n", + cm_node->send_rdma0_op); + nes_debug(NES_DBG_CM, "Sending first rdma operation.\n"); + wqe->wqe_words[NES_IWARP_SQ_WQE_MISC_IDX] = + cpu_to_le32(NES_IWARP_SQ_OP_RDMAR); + wqe->wqe_words[NES_IWARP_SQ_WQE_RDMA_TO_LOW_IDX] = 1; + wqe->wqe_words[NES_IWARP_SQ_WQE_RDMA_TO_HIGH_IDX] = 0; + wqe->wqe_words[NES_IWARP_SQ_WQE_RDMA_LENGTH_IDX] = 0; + wqe->wqe_words[NES_IWARP_SQ_WQE_RDMA_STAG_IDX] = 1; + wqe->wqe_words[NES_IWARP_SQ_WQE_STAG0_IDX] = 1; + break; + } + + if (nesqp->sq_kmapped) { + nesqp->sq_kmapped = 0; + kunmap(nesqp->page); + } + + /*use the reserved spot on the WQ for the extra first WQE*/ + nesqp->nesqp_context->ird_ord_sizes &= cpu_to_le32(~(NES_QPCONTEXT_ORDIRD_LSMM_PRESENT | + NES_QPCONTEXT_ORDIRD_WRPDU | + NES_QPCONTEXT_ORDIRD_ALSMM)); + nesqp->skip_lsmm = 1; + nesqp->hwqp.sq_tail = 0; +} + +/** + * schedule_nes_timer + * note - cm_node needs to be protected before calling this. Encase in: + * rem_ref_cm_node(cm_core, cm_node);add_ref_cm_node(cm_node); + */ +int schedule_nes_timer(struct nes_cm_node *cm_node, struct sk_buff *skb, + enum nes_timer_type type, int send_retrans, + int close_when_complete) +{ + unsigned long flags; + struct nes_cm_core *cm_core = cm_node->cm_core; + struct nes_timer_entry *new_send; + int ret = 0; + + new_send = kzalloc(sizeof(*new_send), GFP_ATOMIC); + if (!new_send) + return -ENOMEM; + + /* new_send->timetosend = currenttime */ + new_send->retrycount = NES_DEFAULT_RETRYS; + new_send->retranscount = NES_DEFAULT_RETRANS; + new_send->skb = skb; + new_send->timetosend = jiffies; + new_send->type = type; + new_send->netdev = cm_node->netdev; + new_send->send_retrans = send_retrans; + new_send->close_when_complete = close_when_complete; + + if (type == NES_TIMER_TYPE_CLOSE) { + new_send->timetosend += (HZ / 10); + if (cm_node->recv_entry) { + kfree(new_send); + WARN_ON(1); + return -EINVAL; + } + cm_node->recv_entry = new_send; + } + + if (type == NES_TIMER_TYPE_SEND) { + new_send->seq_num = ntohl(tcp_hdr(skb)->seq); + atomic_inc(&new_send->skb->users); + spin_lock_irqsave(&cm_node->retrans_list_lock, flags); + cm_node->send_entry = new_send; + add_ref_cm_node(cm_node); + spin_unlock_irqrestore(&cm_node->retrans_list_lock, flags); + new_send->timetosend = jiffies + NES_RETRY_TIMEOUT; + + ret = nes_nic_cm_xmit(new_send->skb, cm_node->netdev); + if (ret != NETDEV_TX_OK) { + nes_debug(NES_DBG_CM, "Error sending packet %p " + "(jiffies = %lu)\n", new_send, jiffies); + new_send->timetosend = jiffies; + ret = NETDEV_TX_OK; + } else { + cm_packets_sent++; + if (!send_retrans) { + cleanup_retrans_entry(cm_node); + if (close_when_complete) + rem_ref_cm_node(cm_core, cm_node); + return ret; + } + } + } + + if (!timer_pending(&cm_core->tcp_timer)) + mod_timer(&cm_core->tcp_timer, new_send->timetosend); + + return ret; +} + +static void nes_retrans_expired(struct nes_cm_node *cm_node) +{ + struct iw_cm_id *cm_id = cm_node->cm_id; + enum nes_cm_node_state state = cm_node->state; + cm_node->state = NES_CM_STATE_CLOSED; + + switch (state) { + case NES_CM_STATE_SYN_RCVD: + case NES_CM_STATE_CLOSING: + rem_ref_cm_node(cm_node->cm_core, cm_node); + break; + case NES_CM_STATE_LAST_ACK: + case NES_CM_STATE_FIN_WAIT1: + if (cm_node->cm_id) + cm_id->rem_ref(cm_id); + send_reset(cm_node, NULL); + break; + default: + add_ref_cm_node(cm_node); + send_reset(cm_node, NULL); + create_event(cm_node, NES_CM_EVENT_ABORTED); + } +} + +static void handle_recv_entry(struct nes_cm_node *cm_node, u32 rem_node) +{ + struct nes_timer_entry *recv_entry = cm_node->recv_entry; + struct iw_cm_id *cm_id = cm_node->cm_id; + struct nes_qp *nesqp; + unsigned long qplockflags; + + if (!recv_entry) + return; + nesqp = (struct nes_qp *)recv_entry->skb; + if (nesqp) { + spin_lock_irqsave(&nesqp->lock, qplockflags); + if (nesqp->cm_id) { + nes_debug(NES_DBG_CM, "QP%u: cm_id = %p, " + "refcount = %d: HIT A " + "NES_TIMER_TYPE_CLOSE with something " + "to do!!!\n", nesqp->hwqp.qp_id, cm_id, + atomic_read(&nesqp->refcount)); + nesqp->hw_tcp_state = NES_AEQE_TCP_STATE_CLOSED; + nesqp->last_aeq = NES_AEQE_AEID_RESET_SENT; + nesqp->ibqp_state = IB_QPS_ERR; + spin_unlock_irqrestore(&nesqp->lock, qplockflags); + nes_cm_disconn(nesqp); + } else { + spin_unlock_irqrestore(&nesqp->lock, qplockflags); + nes_debug(NES_DBG_CM, "QP%u: cm_id = %p, " + "refcount = %d: HIT A " + "NES_TIMER_TYPE_CLOSE with nothing " + "to do!!!\n", nesqp->hwqp.qp_id, cm_id, + atomic_read(&nesqp->refcount)); + } + } else if (rem_node) { + /* TIME_WAIT state */ + rem_ref_cm_node(cm_node->cm_core, cm_node); + } + if (cm_node->cm_id) + cm_id->rem_ref(cm_id); + kfree(recv_entry); + cm_node->recv_entry = NULL; +} + +/** + * nes_cm_timer_tick + */ +static void nes_cm_timer_tick(unsigned long pass) +{ + unsigned long flags; + unsigned long nexttimeout = jiffies + NES_LONG_TIME; + struct nes_cm_node *cm_node; + struct nes_timer_entry *send_entry, *recv_entry; + struct list_head *list_core_temp; + struct list_head *list_node; + struct nes_cm_core *cm_core = g_cm_core; + u32 settimer = 0; + unsigned long timetosend; + int ret = NETDEV_TX_OK; + + struct list_head timer_list; + + INIT_LIST_HEAD(&timer_list); + spin_lock_irqsave(&cm_core->ht_lock, flags); + + list_for_each_safe(list_node, list_core_temp, + &cm_core->connected_nodes) { + cm_node = container_of(list_node, struct nes_cm_node, list); + if ((cm_node->recv_entry) || (cm_node->send_entry)) { + add_ref_cm_node(cm_node); + list_add(&cm_node->timer_entry, &timer_list); + } + } + spin_unlock_irqrestore(&cm_core->ht_lock, flags); + + list_for_each_safe(list_node, list_core_temp, &timer_list) { + cm_node = container_of(list_node, struct nes_cm_node, + timer_entry); + recv_entry = cm_node->recv_entry; + + if (recv_entry) { + if (time_after(recv_entry->timetosend, jiffies)) { + if (nexttimeout > recv_entry->timetosend || + !settimer) { + nexttimeout = recv_entry->timetosend; + settimer = 1; + } + } else { + handle_recv_entry(cm_node, 1); + } + } + + spin_lock_irqsave(&cm_node->retrans_list_lock, flags); + do { + send_entry = cm_node->send_entry; + if (!send_entry) + break; + if (time_after(send_entry->timetosend, jiffies)) { + if (cm_node->state != NES_CM_STATE_TSA) { + if ((nexttimeout > + send_entry->timetosend) || + !settimer) { + nexttimeout = + send_entry->timetosend; + settimer = 1; + } + } else { + free_retrans_entry(cm_node); + } + break; + } + + if ((cm_node->state == NES_CM_STATE_TSA) || + (cm_node->state == NES_CM_STATE_CLOSED)) { + free_retrans_entry(cm_node); + break; + } + + if (!send_entry->retranscount || + !send_entry->retrycount) { + cm_packets_dropped++; + free_retrans_entry(cm_node); + + spin_unlock_irqrestore( + &cm_node->retrans_list_lock, flags); + nes_retrans_expired(cm_node); + cm_node->state = NES_CM_STATE_CLOSED; + spin_lock_irqsave(&cm_node->retrans_list_lock, + flags); + break; + } + atomic_inc(&send_entry->skb->users); + cm_packets_retrans++; + nes_debug(NES_DBG_CM, "Retransmitting send_entry %p " + "for node %p, jiffies = %lu, time to send = " + "%lu, retranscount = %u, send_entry->seq_num = " + "0x%08X, cm_node->tcp_cntxt.rem_ack_num = " + "0x%08X\n", send_entry, cm_node, jiffies, + send_entry->timetosend, + send_entry->retranscount, + send_entry->seq_num, + cm_node->tcp_cntxt.rem_ack_num); + + spin_unlock_irqrestore(&cm_node->retrans_list_lock, + flags); + ret = nes_nic_cm_xmit(send_entry->skb, cm_node->netdev); + spin_lock_irqsave(&cm_node->retrans_list_lock, flags); + if (ret != NETDEV_TX_OK) { + nes_debug(NES_DBG_CM, "rexmit failed for " + "node=%p\n", cm_node); + cm_packets_bounced++; + send_entry->retrycount--; + nexttimeout = jiffies + NES_SHORT_TIME; + settimer = 1; + break; + } else { + cm_packets_sent++; + } + nes_debug(NES_DBG_CM, "Packet Sent: retrans count = " + "%u, retry count = %u.\n", + send_entry->retranscount, + send_entry->retrycount); + if (send_entry->send_retrans) { + send_entry->retranscount--; + timetosend = (NES_RETRY_TIMEOUT << + (NES_DEFAULT_RETRANS - send_entry->retranscount)); + + send_entry->timetosend = jiffies + + min(timetosend, NES_MAX_TIMEOUT); + if (nexttimeout > send_entry->timetosend || + !settimer) { + nexttimeout = send_entry->timetosend; + settimer = 1; + } + } else { + int close_when_complete; + close_when_complete = + send_entry->close_when_complete; + nes_debug(NES_DBG_CM, "cm_node=%p state=%d\n", + cm_node, cm_node->state); + free_retrans_entry(cm_node); + if (close_when_complete) + rem_ref_cm_node(cm_node->cm_core, + cm_node); + } + } while (0); + + spin_unlock_irqrestore(&cm_node->retrans_list_lock, flags); + rem_ref_cm_node(cm_node->cm_core, cm_node); + } + + if (settimer) { + if (!timer_pending(&cm_core->tcp_timer)) + mod_timer(&cm_core->tcp_timer, nexttimeout); + } +} + + +/** + * send_syn + */ +static int send_syn(struct nes_cm_node *cm_node, u32 sendack, + struct sk_buff *skb) +{ + int ret; + int flags = SET_SYN; + char optionsbuffer[sizeof(struct option_mss) + + sizeof(struct option_windowscale) + sizeof(struct option_base) + + TCP_OPTIONS_PADDING]; + + int optionssize = 0; + /* Sending MSS option */ + union all_known_options *options; + + if (!cm_node) + return -EINVAL; + + options = (union all_known_options *)&optionsbuffer[optionssize]; + options->as_mss.optionnum = OPTION_NUMBER_MSS; + options->as_mss.length = sizeof(struct option_mss); + options->as_mss.mss = htons(cm_node->tcp_cntxt.mss); + optionssize += sizeof(struct option_mss); + + options = (union all_known_options *)&optionsbuffer[optionssize]; + options->as_windowscale.optionnum = OPTION_NUMBER_WINDOW_SCALE; + options->as_windowscale.length = sizeof(struct option_windowscale); + options->as_windowscale.shiftcount = cm_node->tcp_cntxt.rcv_wscale; + optionssize += sizeof(struct option_windowscale); + + if (sendack && !(NES_DRV_OPT_SUPRESS_OPTION_BC & nes_drv_opt)) { + options = (union all_known_options *)&optionsbuffer[optionssize]; + options->as_base.optionnum = OPTION_NUMBER_WRITE0; + options->as_base.length = sizeof(struct option_base); + optionssize += sizeof(struct option_base); + /* we need the size to be a multiple of 4 */ + options = (union all_known_options *)&optionsbuffer[optionssize]; + options->as_end = 1; + optionssize += 1; + options = (union all_known_options *)&optionsbuffer[optionssize]; + options->as_end = 1; + optionssize += 1; + } + + options = (union all_known_options *)&optionsbuffer[optionssize]; + options->as_end = OPTION_NUMBER_END; + optionssize += 1; + + if (!skb) + skb = dev_alloc_skb(MAX_CM_BUFFER); + if (!skb) { + nes_debug(NES_DBG_CM, "Failed to get a Free pkt\n"); + return -1; + } + + if (sendack) + flags |= SET_ACK; + + form_cm_frame(skb, cm_node, optionsbuffer, optionssize, NULL, 0, flags); + ret = schedule_nes_timer(cm_node, skb, NES_TIMER_TYPE_SEND, 1, 0); + + return ret; +} + + +/** + * send_reset + */ +static int send_reset(struct nes_cm_node *cm_node, struct sk_buff *skb) +{ + int ret; + int flags = SET_RST | SET_ACK; + + if (!skb) + skb = dev_alloc_skb(MAX_CM_BUFFER); + if (!skb) { + nes_debug(NES_DBG_CM, "Failed to get a Free pkt\n"); + return -ENOMEM; + } + + form_cm_frame(skb, cm_node, NULL, 0, NULL, 0, flags); + ret = schedule_nes_timer(cm_node, skb, NES_TIMER_TYPE_SEND, 0, 1); + + return ret; +} + + +/** + * send_ack + */ +static int send_ack(struct nes_cm_node *cm_node, struct sk_buff *skb) +{ + int ret; + + if (!skb) + skb = dev_alloc_skb(MAX_CM_BUFFER); + + if (!skb) { + nes_debug(NES_DBG_CM, "Failed to get a Free pkt\n"); + return -1; + } + + form_cm_frame(skb, cm_node, NULL, 0, NULL, 0, SET_ACK); + ret = schedule_nes_timer(cm_node, skb, NES_TIMER_TYPE_SEND, 0, 0); + + return ret; +} + + +/** + * send_fin + */ +static int send_fin(struct nes_cm_node *cm_node, struct sk_buff *skb) +{ + int ret; + + /* if we didn't get a frame get one */ + if (!skb) + skb = dev_alloc_skb(MAX_CM_BUFFER); + + if (!skb) { + nes_debug(NES_DBG_CM, "Failed to get a Free pkt\n"); + return -1; + } + + form_cm_frame(skb, cm_node, NULL, 0, NULL, 0, SET_ACK | SET_FIN); + ret = schedule_nes_timer(cm_node, skb, NES_TIMER_TYPE_SEND, 1, 0); + + return ret; +} + + +/** + * find_node - find a cm node that matches the reference cm node + */ +static struct nes_cm_node *find_node(struct nes_cm_core *cm_core, + u16 rem_port, nes_addr_t rem_addr, u16 loc_port, nes_addr_t loc_addr) +{ + unsigned long flags; + struct list_head *hte; + struct nes_cm_node *cm_node; + + /* get a handle on the hte */ + hte = &cm_core->connected_nodes; + + /* walk list and find cm_node associated with this session ID */ + spin_lock_irqsave(&cm_core->ht_lock, flags); + list_for_each_entry(cm_node, hte, list) { + /* compare quad, return node handle if a match */ + nes_debug(NES_DBG_CM, "finding node %x:%x =? %x:%x ^ %x:%x =? %x:%x\n", + cm_node->loc_addr, cm_node->loc_port, + loc_addr, loc_port, + cm_node->rem_addr, cm_node->rem_port, + rem_addr, rem_port); + if ((cm_node->mapped_loc_addr == loc_addr) && + (cm_node->mapped_loc_port == loc_port) && + (cm_node->mapped_rem_addr == rem_addr) && + (cm_node->mapped_rem_port == rem_port)) { + + add_ref_cm_node(cm_node); + spin_unlock_irqrestore(&cm_core->ht_lock, flags); + return cm_node; + } + } + spin_unlock_irqrestore(&cm_core->ht_lock, flags); + + /* no owner node */ + return NULL; +} + + +/** + * find_listener - find a cm node listening on this addr-port pair + */ +static struct nes_cm_listener *find_listener(struct nes_cm_core *cm_core, + nes_addr_t dst_addr, u16 dst_port, + enum nes_cm_listener_state listener_state, int local) +{ + unsigned long flags; + struct nes_cm_listener *listen_node; + nes_addr_t listen_addr; + u16 listen_port; + + /* walk list and find cm_node associated with this session ID */ + spin_lock_irqsave(&cm_core->listen_list_lock, flags); + list_for_each_entry(listen_node, &cm_core->listen_list.list, list) { + if (local) { + listen_addr = listen_node->loc_addr; + listen_port = listen_node->loc_port; + } else { + listen_addr = listen_node->mapped_loc_addr; + listen_port = listen_node->mapped_loc_port; + } + /* compare node pair, return node handle if a match */ + if (((listen_addr == dst_addr) || + listen_addr == 0x00000000) && + (listen_port == dst_port) && + (listener_state & listen_node->listener_state)) { + atomic_inc(&listen_node->ref_count); + spin_unlock_irqrestore(&cm_core->listen_list_lock, flags); + return listen_node; + } + } + spin_unlock_irqrestore(&cm_core->listen_list_lock, flags); + + /* no listener */ + return NULL; +} + +/** + * add_hte_node - add a cm node to the hash table + */ +static int add_hte_node(struct nes_cm_core *cm_core, struct nes_cm_node *cm_node) +{ + unsigned long flags; + struct list_head *hte; + + if (!cm_node || !cm_core) + return -EINVAL; + + nes_debug(NES_DBG_CM, "Adding Node %p to Active Connection HT\n", + cm_node); + + spin_lock_irqsave(&cm_core->ht_lock, flags); + + /* get a handle on the hash table element (list head for this slot) */ + hte = &cm_core->connected_nodes; + list_add_tail(&cm_node->list, hte); + atomic_inc(&cm_core->ht_node_cnt); + + spin_unlock_irqrestore(&cm_core->ht_lock, flags); + + return 0; +} + + +/** + * mini_cm_dec_refcnt_listen + */ +static int mini_cm_dec_refcnt_listen(struct nes_cm_core *cm_core, + struct nes_cm_listener *listener, int free_hanging_nodes) +{ + int ret = -EINVAL; + int err = 0; + unsigned long flags; + struct list_head *list_pos = NULL; + struct list_head *list_temp = NULL; + struct nes_cm_node *cm_node = NULL; + struct list_head reset_list; + + nes_debug(NES_DBG_CM, "attempting listener= %p free_nodes= %d, " + "refcnt=%d\n", listener, free_hanging_nodes, + atomic_read(&listener->ref_count)); + /* free non-accelerated child nodes for this listener */ + INIT_LIST_HEAD(&reset_list); + if (free_hanging_nodes) { + spin_lock_irqsave(&cm_core->ht_lock, flags); + list_for_each_safe(list_pos, list_temp, + &g_cm_core->connected_nodes) { + cm_node = container_of(list_pos, struct nes_cm_node, + list); + if ((cm_node->listener == listener) && + (!cm_node->accelerated)) { + add_ref_cm_node(cm_node); + list_add(&cm_node->reset_entry, &reset_list); + } + } + spin_unlock_irqrestore(&cm_core->ht_lock, flags); + } + + list_for_each_safe(list_pos, list_temp, &reset_list) { + cm_node = container_of(list_pos, struct nes_cm_node, + reset_entry); + { + struct nes_cm_node *loopback = cm_node->loopbackpartner; + enum nes_cm_node_state old_state; + if (NES_CM_STATE_FIN_WAIT1 <= cm_node->state) { + rem_ref_cm_node(cm_node->cm_core, cm_node); + } else { + if (!loopback) { + cleanup_retrans_entry(cm_node); + err = send_reset(cm_node, NULL); + if (err) { + cm_node->state = + NES_CM_STATE_CLOSED; + WARN_ON(1); + } else { + old_state = cm_node->state; + cm_node->state = NES_CM_STATE_LISTENER_DESTROYED; + if (old_state != NES_CM_STATE_MPAREQ_RCVD) + rem_ref_cm_node( + cm_node->cm_core, + cm_node); + } + } else { + struct nes_cm_event event; + + event.cm_node = loopback; + event.cm_info.rem_addr = + loopback->rem_addr; + event.cm_info.loc_addr = + loopback->loc_addr; + event.cm_info.rem_port = + loopback->rem_port; + event.cm_info.loc_port = + loopback->loc_port; + event.cm_info.cm_id = loopback->cm_id; + add_ref_cm_node(loopback); + loopback->state = NES_CM_STATE_CLOSED; + cm_event_connect_error(&event); + cm_node->state = NES_CM_STATE_LISTENER_DESTROYED; + + rem_ref_cm_node(cm_node->cm_core, + cm_node); + + } + } + } + } + + spin_lock_irqsave(&cm_core->listen_list_lock, flags); + if (!atomic_dec_return(&listener->ref_count)) { + list_del(&listener->list); + + /* decrement our listen node count */ + atomic_dec(&cm_core->listen_node_cnt); + + spin_unlock_irqrestore(&cm_core->listen_list_lock, flags); + + if (listener->nesvnic) { + nes_manage_apbvt(listener->nesvnic, + listener->mapped_loc_port, + PCI_FUNC(listener->nesvnic->nesdev->pcidev->devfn), + NES_MANAGE_APBVT_DEL); + + nes_remove_mapinfo(listener->loc_addr, + listener->loc_port, + listener->mapped_loc_addr, + listener->mapped_loc_port); + nes_debug(NES_DBG_NLMSG, + "Delete APBVT mapped_loc_port = %04X\n", + listener->mapped_loc_port); + } + + nes_debug(NES_DBG_CM, "destroying listener (%p)\n", listener); + + kfree(listener); + listener = NULL; + ret = 0; + atomic_inc(&cm_listens_destroyed); + } else { + spin_unlock_irqrestore(&cm_core->listen_list_lock, flags); + } + if (listener) { + if (atomic_read(&listener->pend_accepts_cnt) > 0) + nes_debug(NES_DBG_CM, "destroying listener (%p)" + " with non-zero pending accepts=%u\n", + listener, atomic_read(&listener->pend_accepts_cnt)); + } + + return ret; +} + + +/** + * mini_cm_del_listen + */ +static int mini_cm_del_listen(struct nes_cm_core *cm_core, + struct nes_cm_listener *listener) +{ + listener->listener_state = NES_CM_LISTENER_PASSIVE_STATE; + listener->cm_id = NULL; /* going to be destroyed pretty soon */ + return mini_cm_dec_refcnt_listen(cm_core, listener, 1); +} + + +/** + * mini_cm_accelerated + */ +static inline int mini_cm_accelerated(struct nes_cm_core *cm_core, + struct nes_cm_node *cm_node) +{ + cm_node->accelerated = 1; + + if (cm_node->accept_pend) { + BUG_ON(!cm_node->listener); + atomic_dec(&cm_node->listener->pend_accepts_cnt); + cm_node->accept_pend = 0; + BUG_ON(atomic_read(&cm_node->listener->pend_accepts_cnt) < 0); + } + + if (!timer_pending(&cm_core->tcp_timer)) + mod_timer(&cm_core->tcp_timer, (jiffies + NES_SHORT_TIME)); + + return 0; +} + + +/** + * nes_addr_resolve_neigh + */ +static int nes_addr_resolve_neigh(struct nes_vnic *nesvnic, u32 dst_ip, int arpindex) +{ + struct rtable *rt; + struct neighbour *neigh; + int rc = arpindex; + struct net_device *netdev; + struct nes_adapter *nesadapter = nesvnic->nesdev->nesadapter; + + rt = ip_route_output(&init_net, htonl(dst_ip), 0, 0, 0); + if (IS_ERR(rt)) { + printk(KERN_ERR "%s: ip_route_output_key failed for 0x%08X\n", + __func__, dst_ip); + return rc; + } + + if (netif_is_bond_slave(nesvnic->netdev)) + netdev = netdev_master_upper_dev_get(nesvnic->netdev); + else + netdev = nesvnic->netdev; + + neigh = neigh_lookup(&arp_tbl, &rt->rt_gateway, netdev); + + rcu_read_lock(); + if (neigh) { + if (neigh->nud_state & NUD_VALID) { + nes_debug(NES_DBG_CM, "Neighbor MAC address for 0x%08X" + " is %pM, Gateway is 0x%08X \n", dst_ip, + neigh->ha, ntohl(rt->rt_gateway)); + + if (arpindex >= 0) { + if (ether_addr_equal(nesadapter->arp_table[arpindex].mac_addr, neigh->ha)) { + /* Mac address same as in nes_arp_table */ + goto out; + } + + nes_manage_arp_cache(nesvnic->netdev, + nesadapter->arp_table[arpindex].mac_addr, + dst_ip, NES_ARP_DELETE); + } + + nes_manage_arp_cache(nesvnic->netdev, neigh->ha, + dst_ip, NES_ARP_ADD); + rc = nes_arp_table(nesvnic->nesdev, dst_ip, NULL, + NES_ARP_RESOLVE); + } else { + neigh_event_send(neigh, NULL); + } + } +out: + rcu_read_unlock(); + + if (neigh) + neigh_release(neigh); + + ip_rt_put(rt); + return rc; +} + +/** + * make_cm_node - create a new instance of a cm node + */ +static struct nes_cm_node *make_cm_node(struct nes_cm_core *cm_core, + struct nes_vnic *nesvnic, struct nes_cm_info *cm_info, + struct nes_cm_listener *listener) +{ + struct nes_cm_node *cm_node; + struct timespec ts; + int oldarpindex = 0; + int arpindex = 0; + struct nes_device *nesdev; + struct nes_adapter *nesadapter; + + /* create an hte and cm_node for this instance */ + cm_node = kzalloc(sizeof(*cm_node), GFP_ATOMIC); + if (!cm_node) + return NULL; + + /* set our node specific transport info */ + cm_node->loc_addr = cm_info->loc_addr; + cm_node->rem_addr = cm_info->rem_addr; + cm_node->loc_port = cm_info->loc_port; + cm_node->rem_port = cm_info->rem_port; + + cm_node->mapped_loc_addr = cm_info->mapped_loc_addr; + cm_node->mapped_rem_addr = cm_info->mapped_rem_addr; + cm_node->mapped_loc_port = cm_info->mapped_loc_port; + cm_node->mapped_rem_port = cm_info->mapped_rem_port; + + cm_node->mpa_frame_rev = mpa_version; + cm_node->send_rdma0_op = SEND_RDMA_READ_ZERO; + cm_node->mpav2_ird_ord = 0; + cm_node->ird_size = 0; + cm_node->ord_size = 0; + + nes_debug(NES_DBG_CM, "Make node addresses : loc = %pI4:%x, rem = %pI4:%x\n", + &cm_node->loc_addr, cm_node->loc_port, + &cm_node->rem_addr, cm_node->rem_port); + cm_node->listener = listener; + cm_node->netdev = nesvnic->netdev; + cm_node->cm_id = cm_info->cm_id; + memcpy(cm_node->loc_mac, nesvnic->netdev->dev_addr, ETH_ALEN); + + nes_debug(NES_DBG_CM, "listener=%p, cm_id=%p\n", cm_node->listener, + cm_node->cm_id); + + spin_lock_init(&cm_node->retrans_list_lock); + + cm_node->loopbackpartner = NULL; + atomic_set(&cm_node->ref_count, 1); + /* associate our parent CM core */ + cm_node->cm_core = cm_core; + cm_node->tcp_cntxt.loc_id = NES_CM_DEF_LOCAL_ID; + cm_node->tcp_cntxt.rcv_wscale = NES_CM_DEFAULT_RCV_WND_SCALE; + cm_node->tcp_cntxt.rcv_wnd = NES_CM_DEFAULT_RCV_WND_SCALED >> + NES_CM_DEFAULT_RCV_WND_SCALE; + ts = current_kernel_time(); + cm_node->tcp_cntxt.loc_seq_num = htonl(ts.tv_nsec); + cm_node->tcp_cntxt.mss = nesvnic->max_frame_size - sizeof(struct iphdr) - + sizeof(struct tcphdr) - ETH_HLEN - VLAN_HLEN; + cm_node->tcp_cntxt.rcv_nxt = 0; + /* get a unique session ID , add thread_id to an upcounter to handle race */ + atomic_inc(&cm_core->node_cnt); + cm_node->conn_type = cm_info->conn_type; + cm_node->apbvt_set = 0; + cm_node->accept_pend = 0; + + cm_node->nesvnic = nesvnic; + /* get some device handles, for arp lookup */ + nesdev = nesvnic->nesdev; + nesadapter = nesdev->nesadapter; + + cm_node->loopbackpartner = NULL; + + /* get the mac addr for the remote node */ + oldarpindex = nes_arp_table(nesdev, cm_node->mapped_rem_addr, + NULL, NES_ARP_RESOLVE); + arpindex = nes_addr_resolve_neigh(nesvnic, + cm_node->mapped_rem_addr, oldarpindex); + if (arpindex < 0) { + kfree(cm_node); + return NULL; + } + + /* copy the mac addr to node context */ + memcpy(cm_node->rem_mac, nesadapter->arp_table[arpindex].mac_addr, ETH_ALEN); + nes_debug(NES_DBG_CM, "Remote mac addr from arp table: %pM\n", + cm_node->rem_mac); + + add_hte_node(cm_core, cm_node); + atomic_inc(&cm_nodes_created); + + return cm_node; +} + + +/** + * add_ref_cm_node - destroy an instance of a cm node + */ +static int add_ref_cm_node(struct nes_cm_node *cm_node) +{ + atomic_inc(&cm_node->ref_count); + return 0; +} + + +/** + * rem_ref_cm_node - destroy an instance of a cm node + */ +static int rem_ref_cm_node(struct nes_cm_core *cm_core, + struct nes_cm_node *cm_node) +{ + unsigned long flags; + struct nes_qp *nesqp; + + if (!cm_node) + return -EINVAL; + + spin_lock_irqsave(&cm_node->cm_core->ht_lock, flags); + if (atomic_dec_return(&cm_node->ref_count)) { + spin_unlock_irqrestore(&cm_node->cm_core->ht_lock, flags); + return 0; + } + list_del(&cm_node->list); + atomic_dec(&cm_core->ht_node_cnt); + spin_unlock_irqrestore(&cm_node->cm_core->ht_lock, flags); + + /* if the node is destroyed before connection was accelerated */ + if (!cm_node->accelerated && cm_node->accept_pend) { + BUG_ON(!cm_node->listener); + atomic_dec(&cm_node->listener->pend_accepts_cnt); + BUG_ON(atomic_read(&cm_node->listener->pend_accepts_cnt) < 0); + } + WARN_ON(cm_node->send_entry); + if (cm_node->recv_entry) + handle_recv_entry(cm_node, 0); + if (cm_node->listener) { + mini_cm_dec_refcnt_listen(cm_core, cm_node->listener, 0); + } else { + if (cm_node->apbvt_set && cm_node->nesvnic) { + nes_manage_apbvt(cm_node->nesvnic, cm_node->mapped_loc_port, + PCI_FUNC(cm_node->nesvnic->nesdev->pcidev->devfn), + NES_MANAGE_APBVT_DEL); + } + nes_debug(NES_DBG_NLMSG, "Delete APBVT mapped_loc_port = %04X\n", + cm_node->mapped_loc_port); + nes_remove_mapinfo(cm_node->loc_addr, cm_node->loc_port, + cm_node->mapped_loc_addr, cm_node->mapped_loc_port); + } + + atomic_dec(&cm_core->node_cnt); + atomic_inc(&cm_nodes_destroyed); + nesqp = cm_node->nesqp; + if (nesqp) { + nesqp->cm_node = NULL; + nes_rem_ref(&nesqp->ibqp); + cm_node->nesqp = NULL; + } + + kfree(cm_node); + return 0; +} + +/** + * process_options + */ +static int process_options(struct nes_cm_node *cm_node, u8 *optionsloc, + u32 optionsize, u32 syn_packet) +{ + u32 tmp; + u32 offset = 0; + union all_known_options *all_options; + char got_mss_option = 0; + + while (offset < optionsize) { + all_options = (union all_known_options *)(optionsloc + offset); + switch (all_options->as_base.optionnum) { + case OPTION_NUMBER_END: + offset = optionsize; + break; + case OPTION_NUMBER_NONE: + offset += 1; + continue; + case OPTION_NUMBER_MSS: + nes_debug(NES_DBG_CM, "%s: MSS Length: %d Offset: %d " + "Size: %d\n", __func__, + all_options->as_mss.length, offset, optionsize); + got_mss_option = 1; + if (all_options->as_mss.length != 4) { + return 1; + } else { + tmp = ntohs(all_options->as_mss.mss); + if (tmp > 0 && tmp < + cm_node->tcp_cntxt.mss) + cm_node->tcp_cntxt.mss = tmp; + } + break; + case OPTION_NUMBER_WINDOW_SCALE: + cm_node->tcp_cntxt.snd_wscale = + all_options->as_windowscale.shiftcount; + break; + default: + nes_debug(NES_DBG_CM, "TCP Option not understood: %x\n", + all_options->as_base.optionnum); + break; + } + offset += all_options->as_base.length; + } + if ((!got_mss_option) && (syn_packet)) + cm_node->tcp_cntxt.mss = NES_CM_DEFAULT_MSS; + return 0; +} + +static void drop_packet(struct sk_buff *skb) +{ + atomic_inc(&cm_accel_dropped_pkts); + dev_kfree_skb_any(skb); +} + +static void handle_fin_pkt(struct nes_cm_node *cm_node) +{ + nes_debug(NES_DBG_CM, "Received FIN, cm_node = %p, state = %u. " + "refcnt=%d\n", cm_node, cm_node->state, + atomic_read(&cm_node->ref_count)); + switch (cm_node->state) { + case NES_CM_STATE_SYN_RCVD: + case NES_CM_STATE_SYN_SENT: + case NES_CM_STATE_ESTABLISHED: + case NES_CM_STATE_MPAREJ_RCVD: + cm_node->tcp_cntxt.rcv_nxt++; + cleanup_retrans_entry(cm_node); + cm_node->state = NES_CM_STATE_LAST_ACK; + send_fin(cm_node, NULL); + break; + case NES_CM_STATE_MPAREQ_SENT: + create_event(cm_node, NES_CM_EVENT_ABORTED); + cm_node->tcp_cntxt.rcv_nxt++; + cleanup_retrans_entry(cm_node); + cm_node->state = NES_CM_STATE_CLOSED; + add_ref_cm_node(cm_node); + send_reset(cm_node, NULL); + break; + case NES_CM_STATE_FIN_WAIT1: + cm_node->tcp_cntxt.rcv_nxt++; + cleanup_retrans_entry(cm_node); + cm_node->state = NES_CM_STATE_CLOSING; + send_ack(cm_node, NULL); + /* Wait for ACK as this is simultaneous close.. + * After we receive ACK, do not send anything.. + * Just rm the node.. Done.. */ + break; + case NES_CM_STATE_FIN_WAIT2: + cm_node->tcp_cntxt.rcv_nxt++; + cleanup_retrans_entry(cm_node); + cm_node->state = NES_CM_STATE_TIME_WAIT; + send_ack(cm_node, NULL); + schedule_nes_timer(cm_node, NULL, NES_TIMER_TYPE_CLOSE, 1, 0); + break; + case NES_CM_STATE_TIME_WAIT: + cm_node->tcp_cntxt.rcv_nxt++; + cleanup_retrans_entry(cm_node); + cm_node->state = NES_CM_STATE_CLOSED; + rem_ref_cm_node(cm_node->cm_core, cm_node); + break; + case NES_CM_STATE_TSA: + default: + nes_debug(NES_DBG_CM, "Error Rcvd FIN for node-%p state = %d\n", + cm_node, cm_node->state); + break; + } +} + + +static void handle_rst_pkt(struct nes_cm_node *cm_node, struct sk_buff *skb, + struct tcphdr *tcph) +{ + + int reset = 0; /* whether to send reset in case of err.. */ + atomic_inc(&cm_resets_recvd); + nes_debug(NES_DBG_CM, "Received Reset, cm_node = %p, state = %u." + " refcnt=%d\n", cm_node, cm_node->state, + atomic_read(&cm_node->ref_count)); + cleanup_retrans_entry(cm_node); + switch (cm_node->state) { + case NES_CM_STATE_SYN_SENT: + case NES_CM_STATE_MPAREQ_SENT: + nes_debug(NES_DBG_CM, "%s[%u] create abort for cm_node=%p " + "listener=%p state=%d\n", __func__, __LINE__, cm_node, + cm_node->listener, cm_node->state); + switch (cm_node->mpa_frame_rev) { + case IETF_MPA_V2: + cm_node->mpa_frame_rev = IETF_MPA_V1; + /* send a syn and goto syn sent state */ + cm_node->state = NES_CM_STATE_SYN_SENT; + if (send_syn(cm_node, 0, NULL)) { + active_open_err(cm_node, skb, reset); + } + break; + case IETF_MPA_V1: + default: + active_open_err(cm_node, skb, reset); + break; + } + break; + case NES_CM_STATE_MPAREQ_RCVD: + atomic_inc(&cm_node->passive_state); + dev_kfree_skb_any(skb); + break; + case NES_CM_STATE_ESTABLISHED: + case NES_CM_STATE_SYN_RCVD: + case NES_CM_STATE_LISTENING: + nes_debug(NES_DBG_CM, "Bad state %s[%u]\n", __func__, __LINE__); + passive_open_err(cm_node, skb, reset); + break; + case NES_CM_STATE_TSA: + active_open_err(cm_node, skb, reset); + break; + case NES_CM_STATE_CLOSED: + drop_packet(skb); + break; + case NES_CM_STATE_FIN_WAIT2: + case NES_CM_STATE_FIN_WAIT1: + case NES_CM_STATE_LAST_ACK: + cm_node->cm_id->rem_ref(cm_node->cm_id); + case NES_CM_STATE_TIME_WAIT: + cm_node->state = NES_CM_STATE_CLOSED; + rem_ref_cm_node(cm_node->cm_core, cm_node); + drop_packet(skb); + break; + default: + drop_packet(skb); + break; + } +} + + +static void handle_rcv_mpa(struct nes_cm_node *cm_node, struct sk_buff *skb) +{ + int ret = 0; + int datasize = skb->len; + u8 *dataloc = skb->data; + + enum nes_cm_event_type type = NES_CM_EVENT_UNKNOWN; + u32 res_type; + + ret = parse_mpa(cm_node, dataloc, &res_type, datasize); + if (ret) { + nes_debug(NES_DBG_CM, "didn't like MPA Request\n"); + if (cm_node->state == NES_CM_STATE_MPAREQ_SENT) { + nes_debug(NES_DBG_CM, "%s[%u] create abort for " + "cm_node=%p listener=%p state=%d\n", __func__, + __LINE__, cm_node, cm_node->listener, + cm_node->state); + active_open_err(cm_node, skb, 1); + } else { + passive_open_err(cm_node, skb, 1); + } + return; + } + + switch (cm_node->state) { + case NES_CM_STATE_ESTABLISHED: + if (res_type == NES_MPA_REQUEST_REJECT) + /*BIG problem as we are receiving the MPA.. So should + * not be REJECT.. This is Passive Open.. We can + * only receive it Reject for Active Open...*/ + WARN_ON(1); + cm_node->state = NES_CM_STATE_MPAREQ_RCVD; + type = NES_CM_EVENT_MPA_REQ; + atomic_set(&cm_node->passive_state, + NES_PASSIVE_STATE_INDICATED); + break; + case NES_CM_STATE_MPAREQ_SENT: + cleanup_retrans_entry(cm_node); + if (res_type == NES_MPA_REQUEST_REJECT) { + type = NES_CM_EVENT_MPA_REJECT; + cm_node->state = NES_CM_STATE_MPAREJ_RCVD; + } else { + type = NES_CM_EVENT_CONNECTED; + cm_node->state = NES_CM_STATE_TSA; + } + + break; + default: + WARN_ON(1); + break; + } + dev_kfree_skb_any(skb); + create_event(cm_node, type); +} + +static void indicate_pkt_err(struct nes_cm_node *cm_node, struct sk_buff *skb) +{ + switch (cm_node->state) { + case NES_CM_STATE_SYN_SENT: + case NES_CM_STATE_MPAREQ_SENT: + nes_debug(NES_DBG_CM, "%s[%u] create abort for cm_node=%p " + "listener=%p state=%d\n", __func__, __LINE__, cm_node, + cm_node->listener, cm_node->state); + active_open_err(cm_node, skb, 1); + break; + case NES_CM_STATE_ESTABLISHED: + case NES_CM_STATE_SYN_RCVD: + passive_open_err(cm_node, skb, 1); + break; + case NES_CM_STATE_TSA: + default: + drop_packet(skb); + } +} + +static int check_syn(struct nes_cm_node *cm_node, struct tcphdr *tcph, + struct sk_buff *skb) +{ + int err; + + err = ((ntohl(tcph->ack_seq) == cm_node->tcp_cntxt.loc_seq_num)) ? 0 : 1; + if (err) + active_open_err(cm_node, skb, 1); + + return err; +} + +static int check_seq(struct nes_cm_node *cm_node, struct tcphdr *tcph, + struct sk_buff *skb) +{ + int err = 0; + u32 seq; + u32 ack_seq; + u32 loc_seq_num = cm_node->tcp_cntxt.loc_seq_num; + u32 rcv_nxt = cm_node->tcp_cntxt.rcv_nxt; + u32 rcv_wnd; + + seq = ntohl(tcph->seq); + ack_seq = ntohl(tcph->ack_seq); + rcv_wnd = cm_node->tcp_cntxt.rcv_wnd; + if (ack_seq != loc_seq_num) + err = 1; + else if (!between(seq, rcv_nxt, (rcv_nxt + rcv_wnd))) + err = 1; + if (err) { + nes_debug(NES_DBG_CM, "%s[%u] create abort for cm_node=%p " + "listener=%p state=%d\n", __func__, __LINE__, cm_node, + cm_node->listener, cm_node->state); + indicate_pkt_err(cm_node, skb); + nes_debug(NES_DBG_CM, "seq ERROR cm_node =%p seq=0x%08X " + "rcv_nxt=0x%08X rcv_wnd=0x%x\n", cm_node, seq, rcv_nxt, + rcv_wnd); + } + return err; +} + +/* + * handle_syn_pkt() is for Passive node. The syn packet is received when a node + * is created with a listener or it may comein as rexmitted packet which in + * that case will be just dropped. + */ +static void handle_syn_pkt(struct nes_cm_node *cm_node, struct sk_buff *skb, + struct tcphdr *tcph) +{ + int ret; + u32 inc_sequence; + int optionsize; + + optionsize = (tcph->doff << 2) - sizeof(struct tcphdr); + skb_trim(skb, 0); + inc_sequence = ntohl(tcph->seq); + + switch (cm_node->state) { + case NES_CM_STATE_SYN_SENT: + case NES_CM_STATE_MPAREQ_SENT: + /* Rcvd syn on active open connection*/ + active_open_err(cm_node, skb, 1); + break; + case NES_CM_STATE_LISTENING: + /* Passive OPEN */ + if (atomic_read(&cm_node->listener->pend_accepts_cnt) > + cm_node->listener->backlog) { + nes_debug(NES_DBG_CM, "drop syn due to backlog " + "pressure \n"); + cm_backlog_drops++; + passive_open_err(cm_node, skb, 0); + break; + } + ret = handle_tcp_options(cm_node, tcph, skb, optionsize, + 1); + if (ret) { + passive_open_err(cm_node, skb, 0); + /* drop pkt */ + break; + } + cm_node->tcp_cntxt.rcv_nxt = inc_sequence + 1; + BUG_ON(cm_node->send_entry); + cm_node->accept_pend = 1; + atomic_inc(&cm_node->listener->pend_accepts_cnt); + + cm_node->state = NES_CM_STATE_SYN_RCVD; + send_syn(cm_node, 1, skb); + break; + case NES_CM_STATE_CLOSED: + cleanup_retrans_entry(cm_node); + add_ref_cm_node(cm_node); + send_reset(cm_node, skb); + break; + case NES_CM_STATE_TSA: + case NES_CM_STATE_ESTABLISHED: + case NES_CM_STATE_FIN_WAIT1: + case NES_CM_STATE_FIN_WAIT2: + case NES_CM_STATE_MPAREQ_RCVD: + case NES_CM_STATE_LAST_ACK: + case NES_CM_STATE_CLOSING: + case NES_CM_STATE_UNKNOWN: + default: + drop_packet(skb); + break; + } +} + +static void handle_synack_pkt(struct nes_cm_node *cm_node, struct sk_buff *skb, + struct tcphdr *tcph) +{ + int ret; + u32 inc_sequence; + int optionsize; + + optionsize = (tcph->doff << 2) - sizeof(struct tcphdr); + skb_trim(skb, 0); + inc_sequence = ntohl(tcph->seq); + switch (cm_node->state) { + case NES_CM_STATE_SYN_SENT: + cleanup_retrans_entry(cm_node); + /* active open */ + if (check_syn(cm_node, tcph, skb)) + return; + cm_node->tcp_cntxt.rem_ack_num = ntohl(tcph->ack_seq); + /* setup options */ + ret = handle_tcp_options(cm_node, tcph, skb, optionsize, 0); + if (ret) { + nes_debug(NES_DBG_CM, "cm_node=%p tcp_options failed\n", + cm_node); + break; + } + cleanup_retrans_entry(cm_node); + cm_node->tcp_cntxt.rcv_nxt = inc_sequence + 1; + send_mpa_request(cm_node, skb); + cm_node->state = NES_CM_STATE_MPAREQ_SENT; + break; + case NES_CM_STATE_MPAREQ_RCVD: + /* passive open, so should not be here */ + passive_open_err(cm_node, skb, 1); + break; + case NES_CM_STATE_LISTENING: + cm_node->tcp_cntxt.loc_seq_num = ntohl(tcph->ack_seq); + cleanup_retrans_entry(cm_node); + cm_node->state = NES_CM_STATE_CLOSED; + send_reset(cm_node, skb); + break; + case NES_CM_STATE_CLOSED: + cm_node->tcp_cntxt.loc_seq_num = ntohl(tcph->ack_seq); + cleanup_retrans_entry(cm_node); + add_ref_cm_node(cm_node); + send_reset(cm_node, skb); + break; + case NES_CM_STATE_ESTABLISHED: + case NES_CM_STATE_FIN_WAIT1: + case NES_CM_STATE_FIN_WAIT2: + case NES_CM_STATE_LAST_ACK: + case NES_CM_STATE_TSA: + case NES_CM_STATE_CLOSING: + case NES_CM_STATE_UNKNOWN: + case NES_CM_STATE_MPAREQ_SENT: + default: + drop_packet(skb); + break; + } +} + +static int handle_ack_pkt(struct nes_cm_node *cm_node, struct sk_buff *skb, + struct tcphdr *tcph) +{ + int datasize = 0; + u32 inc_sequence; + int ret = 0; + int optionsize; + + optionsize = (tcph->doff << 2) - sizeof(struct tcphdr); + + if (check_seq(cm_node, tcph, skb)) + return -EINVAL; + + skb_pull(skb, tcph->doff << 2); + inc_sequence = ntohl(tcph->seq); + datasize = skb->len; + switch (cm_node->state) { + case NES_CM_STATE_SYN_RCVD: + /* Passive OPEN */ + cleanup_retrans_entry(cm_node); + ret = handle_tcp_options(cm_node, tcph, skb, optionsize, 1); + if (ret) + break; + cm_node->tcp_cntxt.rem_ack_num = ntohl(tcph->ack_seq); + cm_node->state = NES_CM_STATE_ESTABLISHED; + if (datasize) { + cm_node->tcp_cntxt.rcv_nxt = inc_sequence + datasize; + handle_rcv_mpa(cm_node, skb); + } else { /* rcvd ACK only */ + dev_kfree_skb_any(skb); + } + break; + case NES_CM_STATE_ESTABLISHED: + /* Passive OPEN */ + cleanup_retrans_entry(cm_node); + if (datasize) { + cm_node->tcp_cntxt.rcv_nxt = inc_sequence + datasize; + handle_rcv_mpa(cm_node, skb); + } else { + drop_packet(skb); + } + break; + case NES_CM_STATE_MPAREQ_SENT: + cm_node->tcp_cntxt.rem_ack_num = ntohl(tcph->ack_seq); + if (datasize) { + cm_node->tcp_cntxt.rcv_nxt = inc_sequence + datasize; + handle_rcv_mpa(cm_node, skb); + } else { /* Could be just an ack pkt.. */ + dev_kfree_skb_any(skb); + } + break; + case NES_CM_STATE_LISTENING: + cleanup_retrans_entry(cm_node); + cm_node->state = NES_CM_STATE_CLOSED; + send_reset(cm_node, skb); + break; + case NES_CM_STATE_CLOSED: + cleanup_retrans_entry(cm_node); + add_ref_cm_node(cm_node); + send_reset(cm_node, skb); + break; + case NES_CM_STATE_LAST_ACK: + case NES_CM_STATE_CLOSING: + cleanup_retrans_entry(cm_node); + cm_node->state = NES_CM_STATE_CLOSED; + cm_node->cm_id->rem_ref(cm_node->cm_id); + rem_ref_cm_node(cm_node->cm_core, cm_node); + drop_packet(skb); + break; + case NES_CM_STATE_FIN_WAIT1: + cleanup_retrans_entry(cm_node); + drop_packet(skb); + cm_node->state = NES_CM_STATE_FIN_WAIT2; + break; + case NES_CM_STATE_SYN_SENT: + case NES_CM_STATE_FIN_WAIT2: + case NES_CM_STATE_TSA: + case NES_CM_STATE_MPAREQ_RCVD: + case NES_CM_STATE_UNKNOWN: + default: + cleanup_retrans_entry(cm_node); + drop_packet(skb); + break; + } + return ret; +} + + + +static int handle_tcp_options(struct nes_cm_node *cm_node, struct tcphdr *tcph, + struct sk_buff *skb, int optionsize, int passive) +{ + u8 *optionsloc = (u8 *)&tcph[1]; + + if (optionsize) { + if (process_options(cm_node, optionsloc, optionsize, + (u32)tcph->syn)) { + nes_debug(NES_DBG_CM, "%s: Node %p, Sending RESET\n", + __func__, cm_node); + if (passive) + passive_open_err(cm_node, skb, 1); + else + active_open_err(cm_node, skb, 1); + return 1; + } + } + + cm_node->tcp_cntxt.snd_wnd = ntohs(tcph->window) << + cm_node->tcp_cntxt.snd_wscale; + + if (cm_node->tcp_cntxt.snd_wnd > cm_node->tcp_cntxt.max_snd_wnd) + cm_node->tcp_cntxt.max_snd_wnd = cm_node->tcp_cntxt.snd_wnd; + return 0; +} + +/* + * active_open_err() will send reset() if flag set.. + * It will also send ABORT event. + */ +static void active_open_err(struct nes_cm_node *cm_node, struct sk_buff *skb, + int reset) +{ + cleanup_retrans_entry(cm_node); + if (reset) { + nes_debug(NES_DBG_CM, "ERROR active err called for cm_node=%p, " + "state=%d\n", cm_node, cm_node->state); + add_ref_cm_node(cm_node); + send_reset(cm_node, skb); + } else { + dev_kfree_skb_any(skb); + } + + cm_node->state = NES_CM_STATE_CLOSED; + create_event(cm_node, NES_CM_EVENT_ABORTED); +} + +/* + * passive_open_err() will either do a reset() or will free up the skb and + * remove the cm_node. + */ +static void passive_open_err(struct nes_cm_node *cm_node, struct sk_buff *skb, + int reset) +{ + cleanup_retrans_entry(cm_node); + cm_node->state = NES_CM_STATE_CLOSED; + if (reset) { + nes_debug(NES_DBG_CM, "passive_open_err sending RST for " + "cm_node=%p state =%d\n", cm_node, cm_node->state); + send_reset(cm_node, skb); + } else { + dev_kfree_skb_any(skb); + rem_ref_cm_node(cm_node->cm_core, cm_node); + } +} + +/* + * free_retrans_entry() routines assumes that the retrans_list_lock has + * been acquired before calling. + */ +static void free_retrans_entry(struct nes_cm_node *cm_node) +{ + struct nes_timer_entry *send_entry; + + send_entry = cm_node->send_entry; + if (send_entry) { + cm_node->send_entry = NULL; + dev_kfree_skb_any(send_entry->skb); + kfree(send_entry); + rem_ref_cm_node(cm_node->cm_core, cm_node); + } +} + +static void cleanup_retrans_entry(struct nes_cm_node *cm_node) +{ + unsigned long flags; + + spin_lock_irqsave(&cm_node->retrans_list_lock, flags); + free_retrans_entry(cm_node); + spin_unlock_irqrestore(&cm_node->retrans_list_lock, flags); +} + +/** + * process_packet + * Returns skb if to be freed, else it will return NULL if already used.. + */ +static void process_packet(struct nes_cm_node *cm_node, struct sk_buff *skb, + struct nes_cm_core *cm_core) +{ + enum nes_tcpip_pkt_type pkt_type = NES_PKT_TYPE_UNKNOWN; + struct tcphdr *tcph = tcp_hdr(skb); + u32 fin_set = 0; + int ret = 0; + + skb_pull(skb, ip_hdr(skb)->ihl << 2); + + nes_debug(NES_DBG_CM, "process_packet: cm_node=%p state =%d syn=%d " + "ack=%d rst=%d fin=%d\n", cm_node, cm_node->state, tcph->syn, + tcph->ack, tcph->rst, tcph->fin); + + if (tcph->rst) { + pkt_type = NES_PKT_TYPE_RST; + } else if (tcph->syn) { + pkt_type = NES_PKT_TYPE_SYN; + if (tcph->ack) + pkt_type = NES_PKT_TYPE_SYNACK; + } else if (tcph->ack) { + pkt_type = NES_PKT_TYPE_ACK; + } + if (tcph->fin) + fin_set = 1; + + switch (pkt_type) { + case NES_PKT_TYPE_SYN: + handle_syn_pkt(cm_node, skb, tcph); + break; + case NES_PKT_TYPE_SYNACK: + handle_synack_pkt(cm_node, skb, tcph); + break; + case NES_PKT_TYPE_ACK: + ret = handle_ack_pkt(cm_node, skb, tcph); + if (fin_set && !ret) + handle_fin_pkt(cm_node); + break; + case NES_PKT_TYPE_RST: + handle_rst_pkt(cm_node, skb, tcph); + break; + default: + if ((fin_set) && (!check_seq(cm_node, tcph, skb))) + handle_fin_pkt(cm_node); + drop_packet(skb); + break; + } +} + +/** + * mini_cm_listen - create a listen node with params + */ +static struct nes_cm_listener *mini_cm_listen(struct nes_cm_core *cm_core, + struct nes_vnic *nesvnic, struct nes_cm_info *cm_info) +{ + struct nes_cm_listener *listener; + struct iwpm_dev_data pm_reg_msg; + struct iwpm_sa_data pm_msg; + unsigned long flags; + int iwpm_err = 0; + + nes_debug(NES_DBG_CM, "Search for 0x%08x : 0x%04x\n", + cm_info->loc_addr, cm_info->loc_port); + + /* cannot have multiple matching listeners */ + listener = find_listener(cm_core, cm_info->loc_addr, cm_info->loc_port, + NES_CM_LISTENER_EITHER_STATE, 1); + + if (listener && listener->listener_state == NES_CM_LISTENER_ACTIVE_STATE) { + /* find automatically incs ref count ??? */ + atomic_dec(&listener->ref_count); + nes_debug(NES_DBG_CM, "Not creating listener since it already exists\n"); + return NULL; + } + + if (!listener) { + nes_form_reg_msg(nesvnic, &pm_reg_msg); + iwpm_err = iwpm_register_pid(&pm_reg_msg, RDMA_NL_NES); + if (iwpm_err) { + nes_debug(NES_DBG_NLMSG, + "Port Mapper reg pid fail (err = %d).\n", iwpm_err); + } + if (iwpm_valid_pid() && !iwpm_err) { + nes_form_pm_msg(cm_info, &pm_msg); + iwpm_err = iwpm_add_mapping(&pm_msg, RDMA_NL_NES); + if (iwpm_err) + nes_debug(NES_DBG_NLMSG, + "Port Mapper query fail (err = %d).\n", iwpm_err); + else + nes_record_pm_msg(cm_info, &pm_msg); + } + + /* create a CM listen node (1/2 node to compare incoming traffic to) */ + listener = kzalloc(sizeof(*listener), GFP_ATOMIC); + if (!listener) { + nes_debug(NES_DBG_CM, "Not creating listener memory allocation failed\n"); + return NULL; + } + + listener->loc_addr = cm_info->loc_addr; + listener->loc_port = cm_info->loc_port; + listener->mapped_loc_addr = cm_info->mapped_loc_addr; + listener->mapped_loc_port = cm_info->mapped_loc_port; + listener->reused_node = 0; + + atomic_set(&listener->ref_count, 1); + } + /* pasive case */ + /* find already inc'ed the ref count */ + else { + listener->reused_node = 1; + } + + listener->cm_id = cm_info->cm_id; + atomic_set(&listener->pend_accepts_cnt, 0); + listener->cm_core = cm_core; + listener->nesvnic = nesvnic; + atomic_inc(&cm_core->node_cnt); + + listener->conn_type = cm_info->conn_type; + listener->backlog = cm_info->backlog; + listener->listener_state = NES_CM_LISTENER_ACTIVE_STATE; + + if (!listener->reused_node) { + spin_lock_irqsave(&cm_core->listen_list_lock, flags); + list_add(&listener->list, &cm_core->listen_list.list); + spin_unlock_irqrestore(&cm_core->listen_list_lock, flags); + atomic_inc(&cm_core->listen_node_cnt); + } + + nes_debug(NES_DBG_CM, "Api - listen(): addr=0x%08X, port=0x%04x," + " listener = %p, backlog = %d, cm_id = %p.\n", + cm_info->loc_addr, cm_info->loc_port, + listener, listener->backlog, listener->cm_id); + + return listener; +} + + +/** + * mini_cm_connect - make a connection node with params + */ +static struct nes_cm_node *mini_cm_connect(struct nes_cm_core *cm_core, + struct nes_vnic *nesvnic, u16 private_data_len, + void *private_data, struct nes_cm_info *cm_info) +{ + int ret = 0; + struct nes_cm_node *cm_node; + struct nes_cm_listener *loopbackremotelistener; + struct nes_cm_node *loopbackremotenode; + struct nes_cm_info loopback_cm_info; + u8 *start_buff; + + /* create a CM connection node */ + cm_node = make_cm_node(cm_core, nesvnic, cm_info, NULL); + if (!cm_node) + return NULL; + + /* set our node side to client (active) side */ + cm_node->tcp_cntxt.client = 1; + cm_node->tcp_cntxt.rcv_wscale = NES_CM_DEFAULT_RCV_WND_SCALE; + + if (cm_info->loc_addr == cm_info->rem_addr) { + loopbackremotelistener = find_listener(cm_core, + cm_node->mapped_loc_addr, cm_node->mapped_rem_port, + NES_CM_LISTENER_ACTIVE_STATE, 0); + if (loopbackremotelistener == NULL) { + create_event(cm_node, NES_CM_EVENT_ABORTED); + } else { + loopback_cm_info = *cm_info; + loopback_cm_info.loc_port = cm_info->rem_port; + loopback_cm_info.rem_port = cm_info->loc_port; + loopback_cm_info.mapped_loc_port = + cm_info->mapped_rem_port; + loopback_cm_info.mapped_rem_port = + cm_info->mapped_loc_port; + loopback_cm_info.cm_id = loopbackremotelistener->cm_id; + loopbackremotenode = make_cm_node(cm_core, nesvnic, + &loopback_cm_info, loopbackremotelistener); + if (!loopbackremotenode) { + rem_ref_cm_node(cm_node->cm_core, cm_node); + return NULL; + } + atomic_inc(&cm_loopbacks); + loopbackremotenode->loopbackpartner = cm_node; + loopbackremotenode->tcp_cntxt.rcv_wscale = + NES_CM_DEFAULT_RCV_WND_SCALE; + cm_node->loopbackpartner = loopbackremotenode; + memcpy(loopbackremotenode->mpa_frame_buf, private_data, + private_data_len); + loopbackremotenode->mpa_frame_size = private_data_len; + + /* we are done handling this state. */ + /* set node to a TSA state */ + cm_node->state = NES_CM_STATE_TSA; + cm_node->tcp_cntxt.rcv_nxt = + loopbackremotenode->tcp_cntxt.loc_seq_num; + loopbackremotenode->tcp_cntxt.rcv_nxt = + cm_node->tcp_cntxt.loc_seq_num; + cm_node->tcp_cntxt.max_snd_wnd = + loopbackremotenode->tcp_cntxt.rcv_wnd; + loopbackremotenode->tcp_cntxt.max_snd_wnd = + cm_node->tcp_cntxt.rcv_wnd; + cm_node->tcp_cntxt.snd_wnd = + loopbackremotenode->tcp_cntxt.rcv_wnd; + loopbackremotenode->tcp_cntxt.snd_wnd = + cm_node->tcp_cntxt.rcv_wnd; + cm_node->tcp_cntxt.snd_wscale = + loopbackremotenode->tcp_cntxt.rcv_wscale; + loopbackremotenode->tcp_cntxt.snd_wscale = + cm_node->tcp_cntxt.rcv_wscale; + loopbackremotenode->state = NES_CM_STATE_MPAREQ_RCVD; + create_event(loopbackremotenode, NES_CM_EVENT_MPA_REQ); + } + return cm_node; + } + + start_buff = &cm_node->mpa_frame_buf[0] + sizeof(struct ietf_mpa_v2); + cm_node->mpa_frame_size = private_data_len; + + memcpy(start_buff, private_data, private_data_len); + + /* send a syn and goto syn sent state */ + cm_node->state = NES_CM_STATE_SYN_SENT; + ret = send_syn(cm_node, 0, NULL); + + if (ret) { + /* error in sending the syn free up the cm_node struct */ + nes_debug(NES_DBG_CM, "Api - connect() FAILED: dest " + "addr=0x%08X, port=0x%04x, cm_node=%p, cm_id = %p.\n", + cm_node->rem_addr, cm_node->rem_port, cm_node, + cm_node->cm_id); + rem_ref_cm_node(cm_node->cm_core, cm_node); + cm_node = NULL; + } + + if (cm_node) { + nes_debug(NES_DBG_CM, "Api - connect(): dest addr=0x%08X," + "port=0x%04x, cm_node=%p, cm_id = %p.\n", + cm_node->rem_addr, cm_node->rem_port, cm_node, + cm_node->cm_id); + } + + return cm_node; +} + + +/** + * mini_cm_accept - accept a connection + * This function is never called + */ +static int mini_cm_accept(struct nes_cm_core *cm_core, struct nes_cm_node *cm_node) +{ + return 0; +} + + +/** + * mini_cm_reject - reject and teardown a connection + */ +static int mini_cm_reject(struct nes_cm_core *cm_core, struct nes_cm_node *cm_node) +{ + int ret = 0; + int err = 0; + int passive_state; + struct nes_cm_event event; + struct iw_cm_id *cm_id = cm_node->cm_id; + struct nes_cm_node *loopback = cm_node->loopbackpartner; + + nes_debug(NES_DBG_CM, "%s cm_node=%p type=%d state=%d\n", + __func__, cm_node, cm_node->tcp_cntxt.client, cm_node->state); + + if (cm_node->tcp_cntxt.client) + return ret; + cleanup_retrans_entry(cm_node); + + if (!loopback) { + passive_state = atomic_add_return(1, &cm_node->passive_state); + if (passive_state == NES_SEND_RESET_EVENT) { + cm_node->state = NES_CM_STATE_CLOSED; + rem_ref_cm_node(cm_core, cm_node); + } else { + if (cm_node->state == NES_CM_STATE_LISTENER_DESTROYED) { + rem_ref_cm_node(cm_core, cm_node); + } else { + ret = send_mpa_reject(cm_node); + if (ret) { + cm_node->state = NES_CM_STATE_CLOSED; + err = send_reset(cm_node, NULL); + if (err) + WARN_ON(1); + } else { + cm_id->add_ref(cm_id); + } + } + } + } else { + cm_node->cm_id = NULL; + if (cm_node->state == NES_CM_STATE_LISTENER_DESTROYED) { + rem_ref_cm_node(cm_core, cm_node); + rem_ref_cm_node(cm_core, loopback); + } else { + event.cm_node = loopback; + event.cm_info.rem_addr = loopback->rem_addr; + event.cm_info.loc_addr = loopback->loc_addr; + event.cm_info.rem_port = loopback->rem_port; + event.cm_info.loc_port = loopback->loc_port; + event.cm_info.cm_id = loopback->cm_id; + cm_event_mpa_reject(&event); + rem_ref_cm_node(cm_core, cm_node); + loopback->state = NES_CM_STATE_CLOSING; + + cm_id = loopback->cm_id; + rem_ref_cm_node(cm_core, loopback); + cm_id->rem_ref(cm_id); + } + } + + return ret; +} + + +/** + * mini_cm_close + */ +static int mini_cm_close(struct nes_cm_core *cm_core, struct nes_cm_node *cm_node) +{ + int ret = 0; + + if (!cm_core || !cm_node) + return -EINVAL; + + switch (cm_node->state) { + case NES_CM_STATE_SYN_RCVD: + case NES_CM_STATE_SYN_SENT: + case NES_CM_STATE_ONE_SIDE_ESTABLISHED: + case NES_CM_STATE_ESTABLISHED: + case NES_CM_STATE_ACCEPTING: + case NES_CM_STATE_MPAREQ_SENT: + case NES_CM_STATE_MPAREQ_RCVD: + cleanup_retrans_entry(cm_node); + send_reset(cm_node, NULL); + break; + case NES_CM_STATE_CLOSE_WAIT: + cm_node->state = NES_CM_STATE_LAST_ACK; + send_fin(cm_node, NULL); + break; + case NES_CM_STATE_FIN_WAIT1: + case NES_CM_STATE_FIN_WAIT2: + case NES_CM_STATE_LAST_ACK: + case NES_CM_STATE_TIME_WAIT: + case NES_CM_STATE_CLOSING: + ret = -1; + break; + case NES_CM_STATE_LISTENING: + cleanup_retrans_entry(cm_node); + send_reset(cm_node, NULL); + break; + case NES_CM_STATE_MPAREJ_RCVD: + case NES_CM_STATE_UNKNOWN: + case NES_CM_STATE_INITED: + case NES_CM_STATE_CLOSED: + case NES_CM_STATE_LISTENER_DESTROYED: + ret = rem_ref_cm_node(cm_core, cm_node); + break; + case NES_CM_STATE_TSA: + if (cm_node->send_entry) + printk(KERN_ERR "ERROR Close got called from STATE_TSA " + "send_entry=%p\n", cm_node->send_entry); + ret = rem_ref_cm_node(cm_core, cm_node); + break; + } + return ret; +} + + +/** + * recv_pkt - recv an ETHERNET packet, and process it through CM + * node state machine + */ +static int mini_cm_recv_pkt(struct nes_cm_core *cm_core, + struct nes_vnic *nesvnic, struct sk_buff *skb) +{ + struct nes_cm_node *cm_node = NULL; + struct nes_cm_listener *listener = NULL; + struct iphdr *iph; + struct tcphdr *tcph; + struct nes_cm_info nfo; + int skb_handled = 1; + __be32 tmp_daddr, tmp_saddr; + + if (!skb) + return 0; + if (skb->len < sizeof(struct iphdr) + sizeof(struct tcphdr)) + return 0; + + iph = (struct iphdr *)skb->data; + tcph = (struct tcphdr *)(skb->data + sizeof(struct iphdr)); + + nfo.loc_addr = ntohl(iph->daddr); + nfo.loc_port = ntohs(tcph->dest); + nfo.rem_addr = ntohl(iph->saddr); + nfo.rem_port = ntohs(tcph->source); + + /* If port mapper is available these should be mapped address info */ + nfo.mapped_loc_addr = ntohl(iph->daddr); + nfo.mapped_loc_port = ntohs(tcph->dest); + nfo.mapped_rem_addr = ntohl(iph->saddr); + nfo.mapped_rem_port = ntohs(tcph->source); + + tmp_daddr = cpu_to_be32(iph->daddr); + tmp_saddr = cpu_to_be32(iph->saddr); + + nes_debug(NES_DBG_CM, "Received packet: dest=%pI4:0x%04X src=%pI4:0x%04X\n", + &tmp_daddr, tcph->dest, &tmp_saddr, tcph->source); + + do { + cm_node = find_node(cm_core, + nfo.mapped_rem_port, nfo.mapped_rem_addr, + nfo.mapped_loc_port, nfo.mapped_loc_addr); + + if (!cm_node) { + /* Only type of packet accepted are for */ + /* the PASSIVE open (syn only) */ + if ((!tcph->syn) || (tcph->ack)) { + skb_handled = 0; + break; + } + listener = find_listener(cm_core, nfo.mapped_loc_addr, + nfo.mapped_loc_port, + NES_CM_LISTENER_ACTIVE_STATE, 0); + if (!listener) { + nfo.cm_id = NULL; + nfo.conn_type = 0; + nes_debug(NES_DBG_CM, "Unable to find listener for the pkt\n"); + skb_handled = 0; + break; + } + nfo.cm_id = listener->cm_id; + nfo.conn_type = listener->conn_type; + cm_node = make_cm_node(cm_core, nesvnic, &nfo, + listener); + if (!cm_node) { + nes_debug(NES_DBG_CM, "Unable to allocate " + "node\n"); + cm_packets_dropped++; + atomic_dec(&listener->ref_count); + dev_kfree_skb_any(skb); + break; + } + if (!tcph->rst && !tcph->fin) { + cm_node->state = NES_CM_STATE_LISTENING; + } else { + cm_packets_dropped++; + rem_ref_cm_node(cm_core, cm_node); + dev_kfree_skb_any(skb); + break; + } + add_ref_cm_node(cm_node); + } else if (cm_node->state == NES_CM_STATE_TSA) { + if (cm_node->nesqp->pau_mode) + nes_queue_mgt_skbs(skb, nesvnic, cm_node->nesqp); + else { + rem_ref_cm_node(cm_core, cm_node); + atomic_inc(&cm_accel_dropped_pkts); + dev_kfree_skb_any(skb); + } + break; + } + skb_reset_network_header(skb); + skb_set_transport_header(skb, sizeof(*tcph)); + skb->len = ntohs(iph->tot_len); + process_packet(cm_node, skb, cm_core); + rem_ref_cm_node(cm_core, cm_node); + } while (0); + return skb_handled; +} + + +/** + * nes_cm_alloc_core - allocate a top level instance of a cm core + */ +static struct nes_cm_core *nes_cm_alloc_core(void) +{ + struct nes_cm_core *cm_core; + + /* setup the CM core */ + /* alloc top level core control structure */ + cm_core = kzalloc(sizeof(*cm_core), GFP_KERNEL); + if (!cm_core) + return NULL; + + INIT_LIST_HEAD(&cm_core->connected_nodes); + init_timer(&cm_core->tcp_timer); + cm_core->tcp_timer.function = nes_cm_timer_tick; + + cm_core->mtu = NES_CM_DEFAULT_MTU; + cm_core->state = NES_CM_STATE_INITED; + cm_core->free_tx_pkt_max = NES_CM_DEFAULT_FREE_PKTS; + + atomic_set(&cm_core->events_posted, 0); + + cm_core->api = &nes_cm_api; + + spin_lock_init(&cm_core->ht_lock); + spin_lock_init(&cm_core->listen_list_lock); + + INIT_LIST_HEAD(&cm_core->listen_list.list); + + nes_debug(NES_DBG_CM, "Init CM Core completed -- cm_core=%p\n", cm_core); + + nes_debug(NES_DBG_CM, "Enable QUEUE EVENTS\n"); + cm_core->event_wq = create_singlethread_workqueue("nesewq"); + cm_core->post_event = nes_cm_post_event; + nes_debug(NES_DBG_CM, "Enable QUEUE DISCONNECTS\n"); + cm_core->disconn_wq = create_singlethread_workqueue("nesdwq"); + + print_core(cm_core); + return cm_core; +} + + +/** + * mini_cm_dealloc_core - deallocate a top level instance of a cm core + */ +static int mini_cm_dealloc_core(struct nes_cm_core *cm_core) +{ + nes_debug(NES_DBG_CM, "De-Alloc CM Core (%p)\n", cm_core); + + if (!cm_core) + return -EINVAL; + + barrier(); + + if (timer_pending(&cm_core->tcp_timer)) + del_timer(&cm_core->tcp_timer); + + destroy_workqueue(cm_core->event_wq); + destroy_workqueue(cm_core->disconn_wq); + nes_debug(NES_DBG_CM, "\n"); + kfree(cm_core); + + return 0; +} + + +/** + * mini_cm_get + */ +static int mini_cm_get(struct nes_cm_core *cm_core) +{ + return cm_core->state; +} + + +/** + * mini_cm_set + */ +static int mini_cm_set(struct nes_cm_core *cm_core, u32 type, u32 value) +{ + int ret = 0; + + switch (type) { + case NES_CM_SET_PKT_SIZE: + cm_core->mtu = value; + break; + case NES_CM_SET_FREE_PKT_Q_SIZE: + cm_core->free_tx_pkt_max = value; + break; + default: + /* unknown set option */ + ret = -EINVAL; + } + + return ret; +} + + +/** + * nes_cm_init_tsa_conn setup HW; MPA frames must be + * successfully exchanged when this is called + */ +static int nes_cm_init_tsa_conn(struct nes_qp *nesqp, struct nes_cm_node *cm_node) +{ + int ret = 0; + + if (!nesqp) + return -EINVAL; + + nesqp->nesqp_context->misc |= cpu_to_le32(NES_QPCONTEXT_MISC_IPV4 | + NES_QPCONTEXT_MISC_NO_NAGLE | NES_QPCONTEXT_MISC_DO_NOT_FRAG | + NES_QPCONTEXT_MISC_DROS); + + if (cm_node->tcp_cntxt.snd_wscale || cm_node->tcp_cntxt.rcv_wscale) + nesqp->nesqp_context->misc |= cpu_to_le32(NES_QPCONTEXT_MISC_WSCALE); + + nesqp->nesqp_context->misc2 |= cpu_to_le32(64 << NES_QPCONTEXT_MISC2_TTL_SHIFT); + + nesqp->nesqp_context->mss |= cpu_to_le32(((u32)cm_node->tcp_cntxt.mss) << 16); + + nesqp->nesqp_context->tcp_state_flow_label |= cpu_to_le32( + (u32)NES_QPCONTEXT_TCPSTATE_EST << NES_QPCONTEXT_TCPFLOW_TCP_STATE_SHIFT); + + nesqp->nesqp_context->pd_index_wscale |= cpu_to_le32( + (cm_node->tcp_cntxt.snd_wscale << NES_QPCONTEXT_PDWSCALE_SND_WSCALE_SHIFT) & + NES_QPCONTEXT_PDWSCALE_SND_WSCALE_MASK); + + nesqp->nesqp_context->pd_index_wscale |= cpu_to_le32( + (cm_node->tcp_cntxt.rcv_wscale << NES_QPCONTEXT_PDWSCALE_RCV_WSCALE_SHIFT) & + NES_QPCONTEXT_PDWSCALE_RCV_WSCALE_MASK); + + nesqp->nesqp_context->keepalive = cpu_to_le32(0x80); + nesqp->nesqp_context->ts_recent = 0; + nesqp->nesqp_context->ts_age = 0; + nesqp->nesqp_context->snd_nxt = cpu_to_le32(cm_node->tcp_cntxt.loc_seq_num); + nesqp->nesqp_context->snd_wnd = cpu_to_le32(cm_node->tcp_cntxt.snd_wnd); + nesqp->nesqp_context->rcv_nxt = cpu_to_le32(cm_node->tcp_cntxt.rcv_nxt); + nesqp->nesqp_context->rcv_wnd = cpu_to_le32(cm_node->tcp_cntxt.rcv_wnd << + cm_node->tcp_cntxt.rcv_wscale); + nesqp->nesqp_context->snd_max = cpu_to_le32(cm_node->tcp_cntxt.loc_seq_num); + nesqp->nesqp_context->snd_una = cpu_to_le32(cm_node->tcp_cntxt.loc_seq_num); + nesqp->nesqp_context->srtt = 0; + nesqp->nesqp_context->rttvar = cpu_to_le32(0x6); + nesqp->nesqp_context->ssthresh = cpu_to_le32(0x3FFFC000); + nesqp->nesqp_context->cwnd = cpu_to_le32(2 * cm_node->tcp_cntxt.mss); + nesqp->nesqp_context->snd_wl1 = cpu_to_le32(cm_node->tcp_cntxt.rcv_nxt); + nesqp->nesqp_context->snd_wl2 = cpu_to_le32(cm_node->tcp_cntxt.loc_seq_num); + nesqp->nesqp_context->max_snd_wnd = cpu_to_le32(cm_node->tcp_cntxt.max_snd_wnd); + + nes_debug(NES_DBG_CM, "QP%u: rcv_nxt = 0x%08X, snd_nxt = 0x%08X," + " Setting MSS to %u, PDWscale = 0x%08X, rcv_wnd = %u, context misc = 0x%08X.\n", + nesqp->hwqp.qp_id, le32_to_cpu(nesqp->nesqp_context->rcv_nxt), + le32_to_cpu(nesqp->nesqp_context->snd_nxt), + cm_node->tcp_cntxt.mss, le32_to_cpu(nesqp->nesqp_context->pd_index_wscale), + le32_to_cpu(nesqp->nesqp_context->rcv_wnd), + le32_to_cpu(nesqp->nesqp_context->misc)); + nes_debug(NES_DBG_CM, " snd_wnd = 0x%08X.\n", le32_to_cpu(nesqp->nesqp_context->snd_wnd)); + nes_debug(NES_DBG_CM, " snd_cwnd = 0x%08X.\n", le32_to_cpu(nesqp->nesqp_context->cwnd)); + nes_debug(NES_DBG_CM, " max_swnd = 0x%08X.\n", le32_to_cpu(nesqp->nesqp_context->max_snd_wnd)); + + nes_debug(NES_DBG_CM, "Change cm_node state to TSA\n"); + cm_node->state = NES_CM_STATE_TSA; + + return ret; +} + + +/** + * nes_cm_disconn + */ +int nes_cm_disconn(struct nes_qp *nesqp) +{ + struct disconn_work *work; + + work = kzalloc(sizeof *work, GFP_ATOMIC); + if (!work) + return -ENOMEM; /* Timer will clean up */ + + nes_add_ref(&nesqp->ibqp); + work->nesqp = nesqp; + INIT_WORK(&work->work, nes_disconnect_worker); + queue_work(g_cm_core->disconn_wq, &work->work); + return 0; +} + + +/** + * nes_disconnect_worker + */ +static void nes_disconnect_worker(struct work_struct *work) +{ + struct disconn_work *dwork = container_of(work, struct disconn_work, work); + struct nes_qp *nesqp = dwork->nesqp; + + kfree(dwork); + nes_debug(NES_DBG_CM, "processing AEQE id 0x%04X for QP%u.\n", + nesqp->last_aeq, nesqp->hwqp.qp_id); + nes_cm_disconn_true(nesqp); + nes_rem_ref(&nesqp->ibqp); +} + + +/** + * nes_cm_disconn_true + */ +static int nes_cm_disconn_true(struct nes_qp *nesqp) +{ + unsigned long flags; + int ret = 0; + struct iw_cm_id *cm_id; + struct iw_cm_event cm_event; + struct nes_vnic *nesvnic; + u16 last_ae; + u8 original_hw_tcp_state; + u8 original_ibqp_state; + int disconn_status = 0; + int issue_disconn = 0; + int issue_close = 0; + int issue_flush = 0; + u32 flush_q = NES_CQP_FLUSH_RQ; + struct ib_event ibevent; + + if (!nesqp) { + nes_debug(NES_DBG_CM, "disconnect_worker nesqp is NULL\n"); + return -1; + } + + spin_lock_irqsave(&nesqp->lock, flags); + cm_id = nesqp->cm_id; + /* make sure we havent already closed this connection */ + if (!cm_id) { + nes_debug(NES_DBG_CM, "QP%u disconnect_worker cmid is NULL\n", + nesqp->hwqp.qp_id); + spin_unlock_irqrestore(&nesqp->lock, flags); + return -1; + } + + nesvnic = to_nesvnic(nesqp->ibqp.device); + nes_debug(NES_DBG_CM, "Disconnecting QP%u\n", nesqp->hwqp.qp_id); + + original_hw_tcp_state = nesqp->hw_tcp_state; + original_ibqp_state = nesqp->ibqp_state; + last_ae = nesqp->last_aeq; + + if (nesqp->term_flags) { + issue_disconn = 1; + issue_close = 1; + nesqp->cm_id = NULL; + del_timer(&nesqp->terminate_timer); + if (nesqp->flush_issued == 0) { + nesqp->flush_issued = 1; + issue_flush = 1; + } + } else if ((original_hw_tcp_state == NES_AEQE_TCP_STATE_CLOSE_WAIT) || + ((original_ibqp_state == IB_QPS_RTS) && + (last_ae == NES_AEQE_AEID_LLP_CONNECTION_RESET))) { + issue_disconn = 1; + if (last_ae == NES_AEQE_AEID_LLP_CONNECTION_RESET) + disconn_status = -ECONNRESET; + } + + if (((original_hw_tcp_state == NES_AEQE_TCP_STATE_CLOSED) || + (original_hw_tcp_state == NES_AEQE_TCP_STATE_TIME_WAIT) || + (last_ae == NES_AEQE_AEID_RDMAP_ROE_BAD_LLP_CLOSE) || + (last_ae == NES_AEQE_AEID_LLP_CONNECTION_RESET))) { + issue_close = 1; + nesqp->cm_id = NULL; + if (nesqp->flush_issued == 0) { + nesqp->flush_issued = 1; + issue_flush = 1; + } + } + + spin_unlock_irqrestore(&nesqp->lock, flags); + + if ((issue_flush) && (nesqp->destroyed == 0)) { + /* Flush the queue(s) */ + if (nesqp->hw_iwarp_state >= NES_AEQE_IWARP_STATE_TERMINATE) + flush_q |= NES_CQP_FLUSH_SQ; + flush_wqes(nesvnic->nesdev, nesqp, flush_q, 1); + + if (nesqp->term_flags) { + ibevent.device = nesqp->ibqp.device; + ibevent.event = nesqp->terminate_eventtype; + ibevent.element.qp = &nesqp->ibqp; + if (nesqp->ibqp.event_handler) + nesqp->ibqp.event_handler(&ibevent, nesqp->ibqp.qp_context); + } + } + + if ((cm_id) && (cm_id->event_handler)) { + if (issue_disconn) { + atomic_inc(&cm_disconnects); + cm_event.event = IW_CM_EVENT_DISCONNECT; + cm_event.status = disconn_status; + cm_event.local_addr = cm_id->local_addr; + cm_event.remote_addr = cm_id->remote_addr; + cm_event.private_data = NULL; + cm_event.private_data_len = 0; + + nes_debug(NES_DBG_CM, "Generating a CM Disconnect Event" + " for QP%u, SQ Head = %u, SQ Tail = %u. " + "cm_id = %p, refcount = %u.\n", + nesqp->hwqp.qp_id, nesqp->hwqp.sq_head, + nesqp->hwqp.sq_tail, cm_id, + atomic_read(&nesqp->refcount)); + + ret = cm_id->event_handler(cm_id, &cm_event); + if (ret) + nes_debug(NES_DBG_CM, "OFA CM event_handler " + "returned, ret=%d\n", ret); + } + + if (issue_close) { + atomic_inc(&cm_closes); + nes_disconnect(nesqp, 1); + + cm_id->provider_data = nesqp; + /* Send up the close complete event */ + cm_event.event = IW_CM_EVENT_CLOSE; + cm_event.status = 0; + cm_event.provider_data = cm_id->provider_data; + cm_event.local_addr = cm_id->local_addr; + cm_event.remote_addr = cm_id->remote_addr; + cm_event.private_data = NULL; + cm_event.private_data_len = 0; + + ret = cm_id->event_handler(cm_id, &cm_event); + if (ret) + nes_debug(NES_DBG_CM, "OFA CM event_handler returned, ret=%d\n", ret); + + cm_id->rem_ref(cm_id); + } + } + + return 0; +} + + +/** + * nes_disconnect + */ +static int nes_disconnect(struct nes_qp *nesqp, int abrupt) +{ + int ret = 0; + struct nes_vnic *nesvnic; + struct nes_device *nesdev; + struct nes_ib_device *nesibdev; + + nesvnic = to_nesvnic(nesqp->ibqp.device); + if (!nesvnic) + return -EINVAL; + + nesdev = nesvnic->nesdev; + nesibdev = nesvnic->nesibdev; + + nes_debug(NES_DBG_CM, "netdev refcnt = %u.\n", + netdev_refcnt_read(nesvnic->netdev)); + + if (nesqp->active_conn) { + + /* indicate this connection is NOT active */ + nesqp->active_conn = 0; + } else { + /* Need to free the Last Streaming Mode Message */ + if (nesqp->ietf_frame) { + if (nesqp->lsmm_mr) + nesibdev->ibdev.dereg_mr(nesqp->lsmm_mr); + pci_free_consistent(nesdev->pcidev, + nesqp->private_data_len + nesqp->ietf_frame_size, + nesqp->ietf_frame, nesqp->ietf_frame_pbase); + } + } + + /* close the CM node down if it is still active */ + if (nesqp->cm_node) { + nes_debug(NES_DBG_CM, "Call close API\n"); + + g_cm_core->api->close(g_cm_core, nesqp->cm_node); + } + + return ret; +} + + +/** + * nes_accept + */ +int nes_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) +{ + u64 u64temp; + struct ib_qp *ibqp; + struct nes_qp *nesqp; + struct nes_vnic *nesvnic; + struct nes_device *nesdev; + struct nes_cm_node *cm_node; + struct nes_adapter *adapter; + struct ib_qp_attr attr; + struct iw_cm_event cm_event; + struct nes_hw_qp_wqe *wqe; + struct nes_v4_quad nes_quad; + u32 crc_value; + int ret; + int passive_state; + struct nes_ib_device *nesibdev; + struct ib_mr *ibmr = NULL; + struct ib_phys_buf ibphysbuf; + struct nes_pd *nespd; + u64 tagged_offset; + u8 mpa_frame_offset = 0; + struct ietf_mpa_v2 *mpa_v2_frame; + u8 start_addr = 0; + u8 *start_ptr = &start_addr; + u8 **start_buff = &start_ptr; + u16 buff_len = 0; + struct sockaddr_in *laddr = (struct sockaddr_in *)&cm_id->local_addr; + struct sockaddr_in *raddr = (struct sockaddr_in *)&cm_id->remote_addr; + + ibqp = nes_get_qp(cm_id->device, conn_param->qpn); + if (!ibqp) + return -EINVAL; + + /* get all our handles */ + nesqp = to_nesqp(ibqp); + nesvnic = to_nesvnic(nesqp->ibqp.device); + nesdev = nesvnic->nesdev; + adapter = nesdev->nesadapter; + + cm_node = (struct nes_cm_node *)cm_id->provider_data; + nes_debug(NES_DBG_CM, "nes_accept: cm_node= %p nesvnic=%p, netdev=%p," + "%s\n", cm_node, nesvnic, nesvnic->netdev, + nesvnic->netdev->name); + + if (NES_CM_STATE_LISTENER_DESTROYED == cm_node->state) { + if (cm_node->loopbackpartner) + rem_ref_cm_node(cm_node->cm_core, cm_node->loopbackpartner); + rem_ref_cm_node(cm_node->cm_core, cm_node); + return -EINVAL; + } + + passive_state = atomic_add_return(1, &cm_node->passive_state); + if (passive_state == NES_SEND_RESET_EVENT) { + rem_ref_cm_node(cm_node->cm_core, cm_node); + return -ECONNRESET; + } + /* associate the node with the QP */ + nesqp->cm_node = (void *)cm_node; + cm_node->nesqp = nesqp; + + + nes_debug(NES_DBG_CM, "QP%u, cm_node=%p, jiffies = %lu listener = %p\n", + nesqp->hwqp.qp_id, cm_node, jiffies, cm_node->listener); + atomic_inc(&cm_accepts); + + nes_debug(NES_DBG_CM, "netdev refcnt = %u.\n", + netdev_refcnt_read(nesvnic->netdev)); + + nesqp->ietf_frame_size = sizeof(struct ietf_mpa_v2); + /* allocate the ietf frame and space for private data */ + nesqp->ietf_frame = pci_alloc_consistent(nesdev->pcidev, + nesqp->ietf_frame_size + conn_param->private_data_len, + &nesqp->ietf_frame_pbase); + + if (!nesqp->ietf_frame) { + nes_debug(NES_DBG_CM, "Unable to allocate memory for private data\n"); + return -ENOMEM; + } + mpa_v2_frame = (struct ietf_mpa_v2 *)nesqp->ietf_frame; + + if (cm_node->mpa_frame_rev == IETF_MPA_V1) + mpa_frame_offset = 4; + + if (cm_node->mpa_frame_rev == IETF_MPA_V1 || + cm_node->mpav2_ird_ord == IETF_NO_IRD_ORD) { + record_ird_ord(cm_node, (u16)conn_param->ird, (u16)conn_param->ord); + } + + memcpy(mpa_v2_frame->priv_data, conn_param->private_data, + conn_param->private_data_len); + + cm_build_mpa_frame(cm_node, start_buff, &buff_len, nesqp->ietf_frame, MPA_KEY_REPLY); + nesqp->private_data_len = conn_param->private_data_len; + + /* setup our first outgoing iWarp send WQE (the IETF frame response) */ + wqe = &nesqp->hwqp.sq_vbase[0]; + + if (raddr->sin_addr.s_addr != laddr->sin_addr.s_addr) { + u64temp = (unsigned long)nesqp; + nesibdev = nesvnic->nesibdev; + nespd = nesqp->nespd; + ibphysbuf.addr = nesqp->ietf_frame_pbase + mpa_frame_offset; + ibphysbuf.size = buff_len; + tagged_offset = (u64)(unsigned long)*start_buff; + ibmr = nesibdev->ibdev.reg_phys_mr((struct ib_pd *)nespd, + &ibphysbuf, 1, + IB_ACCESS_LOCAL_WRITE, + &tagged_offset); + if (!ibmr) { + nes_debug(NES_DBG_CM, "Unable to register memory region" + "for lSMM for cm_node = %p \n", + cm_node); + pci_free_consistent(nesdev->pcidev, + nesqp->private_data_len + nesqp->ietf_frame_size, + nesqp->ietf_frame, nesqp->ietf_frame_pbase); + return -ENOMEM; + } + + ibmr->pd = &nespd->ibpd; + ibmr->device = nespd->ibpd.device; + nesqp->lsmm_mr = ibmr; + + u64temp |= NES_SW_CONTEXT_ALIGN >> 1; + set_wqe_64bit_value(wqe->wqe_words, + NES_IWARP_SQ_WQE_COMP_CTX_LOW_IDX, + u64temp); + wqe->wqe_words[NES_IWARP_SQ_WQE_MISC_IDX] = + cpu_to_le32(NES_IWARP_SQ_WQE_STREAMING | + NES_IWARP_SQ_WQE_WRPDU); + wqe->wqe_words[NES_IWARP_SQ_WQE_TOTAL_PAYLOAD_IDX] = + cpu_to_le32(buff_len); + set_wqe_64bit_value(wqe->wqe_words, + NES_IWARP_SQ_WQE_FRAG0_LOW_IDX, + (u64)(unsigned long)(*start_buff)); + wqe->wqe_words[NES_IWARP_SQ_WQE_LENGTH0_IDX] = + cpu_to_le32(buff_len); + wqe->wqe_words[NES_IWARP_SQ_WQE_STAG0_IDX] = ibmr->lkey; + if (nesqp->sq_kmapped) { + nesqp->sq_kmapped = 0; + kunmap(nesqp->page); + } + + nesqp->nesqp_context->ird_ord_sizes |= + cpu_to_le32(NES_QPCONTEXT_ORDIRD_LSMM_PRESENT | + NES_QPCONTEXT_ORDIRD_WRPDU); + } else { + nesqp->nesqp_context->ird_ord_sizes |= + cpu_to_le32(NES_QPCONTEXT_ORDIRD_WRPDU); + } + nesqp->skip_lsmm = 1; + + /* Cache the cm_id in the qp */ + nesqp->cm_id = cm_id; + cm_node->cm_id = cm_id; + + /* nesqp->cm_node = (void *)cm_id->provider_data; */ + cm_id->provider_data = nesqp; + nesqp->active_conn = 0; + + if (cm_node->state == NES_CM_STATE_TSA) + nes_debug(NES_DBG_CM, "Already state = TSA for cm_node=%p\n", + cm_node); + + nes_cm_init_tsa_conn(nesqp, cm_node); + + nesqp->nesqp_context->tcpPorts[0] = + cpu_to_le16(cm_node->mapped_loc_port); + nesqp->nesqp_context->tcpPorts[1] = + cpu_to_le16(cm_node->mapped_rem_port); + + nesqp->nesqp_context->ip0 = cpu_to_le32(cm_node->mapped_rem_addr); + + nesqp->nesqp_context->misc2 |= cpu_to_le32( + (u32)PCI_FUNC(nesdev->pcidev->devfn) << + NES_QPCONTEXT_MISC2_SRC_IP_SHIFT); + + nesqp->nesqp_context->arp_index_vlan |= + cpu_to_le32(nes_arp_table(nesdev, + le32_to_cpu(nesqp->nesqp_context->ip0), NULL, + NES_ARP_RESOLVE) << 16); + + nesqp->nesqp_context->ts_val_delta = cpu_to_le32( + jiffies - nes_read_indexed(nesdev, NES_IDX_TCP_NOW)); + + nesqp->nesqp_context->ird_index = cpu_to_le32(nesqp->hwqp.qp_id); + + nesqp->nesqp_context->ird_ord_sizes |= cpu_to_le32( + ((u32)1 << NES_QPCONTEXT_ORDIRD_IWARP_MODE_SHIFT)); + nesqp->nesqp_context->ird_ord_sizes |= + cpu_to_le32((u32)cm_node->ord_size); + + memset(&nes_quad, 0, sizeof(nes_quad)); + nes_quad.DstIpAdrIndex = + cpu_to_le32((u32)PCI_FUNC(nesdev->pcidev->devfn) << 24); + nes_quad.SrcIpadr = htonl(cm_node->mapped_rem_addr); + nes_quad.TcpPorts[0] = htons(cm_node->mapped_rem_port); + nes_quad.TcpPorts[1] = htons(cm_node->mapped_loc_port); + + /* Produce hash key */ + crc_value = get_crc_value(&nes_quad); + nesqp->hte_index = cpu_to_be32(crc_value ^ 0xffffffff); + nes_debug(NES_DBG_CM, "HTE Index = 0x%08X, CRC = 0x%08X\n", + nesqp->hte_index, nesqp->hte_index & adapter->hte_index_mask); + + nesqp->hte_index &= adapter->hte_index_mask; + nesqp->nesqp_context->hte_index = cpu_to_le32(nesqp->hte_index); + + cm_node->cm_core->api->accelerated(cm_node->cm_core, cm_node); + + nes_debug(NES_DBG_CM, "QP%u, Destination IP = 0x%08X:0x%04X, local = " + "0x%08X:0x%04X, rcv_nxt=0x%08X, snd_nxt=0x%08X, mpa + " + "private data length=%u.\n", nesqp->hwqp.qp_id, + ntohl(raddr->sin_addr.s_addr), ntohs(raddr->sin_port), + ntohl(laddr->sin_addr.s_addr), ntohs(laddr->sin_port), + le32_to_cpu(nesqp->nesqp_context->rcv_nxt), + le32_to_cpu(nesqp->nesqp_context->snd_nxt), + buff_len); + + /* notify OF layer that accept event was successful */ + cm_id->add_ref(cm_id); + nes_add_ref(&nesqp->ibqp); + + cm_event.event = IW_CM_EVENT_ESTABLISHED; + cm_event.status = 0; + cm_event.provider_data = (void *)nesqp; + cm_event.local_addr = cm_id->local_addr; + cm_event.remote_addr = cm_id->remote_addr; + cm_event.private_data = NULL; + cm_event.private_data_len = 0; + cm_event.ird = cm_node->ird_size; + cm_event.ord = cm_node->ord_size; + + ret = cm_id->event_handler(cm_id, &cm_event); + attr.qp_state = IB_QPS_RTS; + nes_modify_qp(&nesqp->ibqp, &attr, IB_QP_STATE, NULL); + if (cm_node->loopbackpartner) { + cm_node->loopbackpartner->mpa_frame_size = + nesqp->private_data_len; + /* copy entire MPA frame to our cm_node's frame */ + memcpy(cm_node->loopbackpartner->mpa_frame_buf, + conn_param->private_data, conn_param->private_data_len); + create_event(cm_node->loopbackpartner, NES_CM_EVENT_CONNECTED); + } + if (ret) + printk(KERN_ERR "%s[%u] OFA CM event_handler returned, " + "ret=%d\n", __func__, __LINE__, ret); + + return 0; +} + + +/** + * nes_reject + */ +int nes_reject(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len) +{ + struct nes_cm_node *cm_node; + struct nes_cm_node *loopback; + struct nes_cm_core *cm_core; + u8 *start_buff; + + atomic_inc(&cm_rejects); + cm_node = (struct nes_cm_node *)cm_id->provider_data; + loopback = cm_node->loopbackpartner; + cm_core = cm_node->cm_core; + cm_node->cm_id = cm_id; + + if (pdata_len + sizeof(struct ietf_mpa_v2) > MAX_CM_BUFFER) + return -EINVAL; + + if (loopback) { + memcpy(&loopback->mpa_frame.priv_data, pdata, pdata_len); + loopback->mpa_frame.priv_data_len = pdata_len; + loopback->mpa_frame_size = pdata_len; + } else { + start_buff = &cm_node->mpa_frame_buf[0] + sizeof(struct ietf_mpa_v2); + cm_node->mpa_frame_size = pdata_len; + memcpy(start_buff, pdata, pdata_len); + } + return cm_core->api->reject(cm_core, cm_node); +} + + +/** + * nes_connect + * setup and launch cm connect node + */ +int nes_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) +{ + struct ib_qp *ibqp; + struct nes_qp *nesqp; + struct nes_vnic *nesvnic; + struct nes_device *nesdev; + struct nes_cm_node *cm_node; + struct nes_cm_info cm_info; + int apbvt_set = 0; + struct sockaddr_in *laddr = (struct sockaddr_in *)&cm_id->local_addr; + struct sockaddr_in *raddr = (struct sockaddr_in *)&cm_id->remote_addr; + struct iwpm_dev_data pm_reg_msg; + struct iwpm_sa_data pm_msg; + int iwpm_err = 0; + + if (cm_id->remote_addr.ss_family != AF_INET) + return -ENOSYS; + ibqp = nes_get_qp(cm_id->device, conn_param->qpn); + if (!ibqp) + return -EINVAL; + nesqp = to_nesqp(ibqp); + if (!nesqp) + return -EINVAL; + nesvnic = to_nesvnic(nesqp->ibqp.device); + if (!nesvnic) + return -EINVAL; + nesdev = nesvnic->nesdev; + if (!nesdev) + return -EINVAL; + + if (!laddr->sin_port || !raddr->sin_port) + return -EINVAL; + + nes_debug(NES_DBG_CM, "QP%u, current IP = 0x%08X, Destination IP = " + "0x%08X:0x%04X, local = 0x%08X:0x%04X.\n", nesqp->hwqp.qp_id, + ntohl(nesvnic->local_ipaddr), ntohl(raddr->sin_addr.s_addr), + ntohs(raddr->sin_port), ntohl(laddr->sin_addr.s_addr), + ntohs(laddr->sin_port)); + + atomic_inc(&cm_connects); + nesqp->active_conn = 1; + + /* cache the cm_id in the qp */ + nesqp->cm_id = cm_id; + cm_id->provider_data = nesqp; + nesqp->private_data_len = conn_param->private_data_len; + + nes_debug(NES_DBG_CM, "requested ord = 0x%08X.\n", (u32)conn_param->ord); + nes_debug(NES_DBG_CM, "mpa private data len =%u\n", + conn_param->private_data_len); + + /* set up the connection params for the node */ + cm_info.loc_addr = ntohl(laddr->sin_addr.s_addr); + cm_info.loc_port = ntohs(laddr->sin_port); + cm_info.rem_addr = ntohl(raddr->sin_addr.s_addr); + cm_info.rem_port = ntohs(raddr->sin_port); + cm_info.cm_id = cm_id; + cm_info.conn_type = NES_CM_IWARP_CONN_TYPE; + + /* No port mapper available, go with the specified peer information */ + cm_info.mapped_loc_addr = cm_info.loc_addr; + cm_info.mapped_loc_port = cm_info.loc_port; + cm_info.mapped_rem_addr = cm_info.rem_addr; + cm_info.mapped_rem_port = cm_info.rem_port; + + nes_form_reg_msg(nesvnic, &pm_reg_msg); + iwpm_err = iwpm_register_pid(&pm_reg_msg, RDMA_NL_NES); + if (iwpm_err) { + nes_debug(NES_DBG_NLMSG, + "Port Mapper reg pid fail (err = %d).\n", iwpm_err); + } + if (iwpm_valid_pid() && !iwpm_err) { + nes_form_pm_msg(&cm_info, &pm_msg); + iwpm_err = iwpm_add_and_query_mapping(&pm_msg, RDMA_NL_NES); + if (iwpm_err) + nes_debug(NES_DBG_NLMSG, + "Port Mapper query fail (err = %d).\n", iwpm_err); + else + nes_record_pm_msg(&cm_info, &pm_msg); + } + + if (laddr->sin_addr.s_addr != raddr->sin_addr.s_addr) { + nes_manage_apbvt(nesvnic, cm_info.mapped_loc_port, + PCI_FUNC(nesdev->pcidev->devfn), NES_MANAGE_APBVT_ADD); + apbvt_set = 1; + } + + if (nes_create_mapinfo(&cm_info)) + return -ENOMEM; + + cm_id->add_ref(cm_id); + + /* create a connect CM node connection */ + cm_node = g_cm_core->api->connect(g_cm_core, nesvnic, + conn_param->private_data_len, (void *)conn_param->private_data, + &cm_info); + if (!cm_node) { + if (apbvt_set) + nes_manage_apbvt(nesvnic, cm_info.mapped_loc_port, + PCI_FUNC(nesdev->pcidev->devfn), + NES_MANAGE_APBVT_DEL); + + nes_debug(NES_DBG_NLMSG, "Delete mapped_loc_port = %04X\n", + cm_info.mapped_loc_port); + nes_remove_mapinfo(cm_info.loc_addr, cm_info.loc_port, + cm_info.mapped_loc_addr, cm_info.mapped_loc_port); + cm_id->rem_ref(cm_id); + return -ENOMEM; + } + + record_ird_ord(cm_node, (u16)conn_param->ird, (u16)conn_param->ord); + if (cm_node->send_rdma0_op == SEND_RDMA_READ_ZERO && + cm_node->ord_size == 0) + cm_node->ord_size = 1; + + cm_node->apbvt_set = apbvt_set; + nesqp->cm_node = cm_node; + cm_node->nesqp = nesqp; + nes_add_ref(&nesqp->ibqp); + + return 0; +} + + +/** + * nes_create_listen + */ +int nes_create_listen(struct iw_cm_id *cm_id, int backlog) +{ + struct nes_vnic *nesvnic; + struct nes_cm_listener *cm_node; + struct nes_cm_info cm_info; + int err; + struct sockaddr_in *laddr = (struct sockaddr_in *)&cm_id->local_addr; + + nes_debug(NES_DBG_CM, "cm_id = %p, local port = 0x%04X.\n", + cm_id, ntohs(laddr->sin_port)); + + if (cm_id->local_addr.ss_family != AF_INET) + return -ENOSYS; + nesvnic = to_nesvnic(cm_id->device); + if (!nesvnic) + return -EINVAL; + + nes_debug(NES_DBG_CM, "nesvnic=%p, netdev=%p, %s\n", + nesvnic, nesvnic->netdev, nesvnic->netdev->name); + + nes_debug(NES_DBG_CM, "nesvnic->local_ipaddr=0x%08x, sin_addr.s_addr=0x%08x\n", + nesvnic->local_ipaddr, laddr->sin_addr.s_addr); + + /* setup listen params in our api call struct */ + cm_info.loc_addr = ntohl(nesvnic->local_ipaddr); + cm_info.loc_port = ntohs(laddr->sin_port); + cm_info.backlog = backlog; + cm_info.cm_id = cm_id; + + cm_info.conn_type = NES_CM_IWARP_CONN_TYPE; + + /* No port mapper available, go with the specified info */ + cm_info.mapped_loc_addr = cm_info.loc_addr; + cm_info.mapped_loc_port = cm_info.loc_port; + + cm_node = g_cm_core->api->listen(g_cm_core, nesvnic, &cm_info); + if (!cm_node) { + printk(KERN_ERR "%s[%u] Error returned from listen API call\n", + __func__, __LINE__); + return -ENOMEM; + } + + cm_id->provider_data = cm_node; + + if (!cm_node->reused_node) { + if (nes_create_mapinfo(&cm_info)) + return -ENOMEM; + + err = nes_manage_apbvt(nesvnic, cm_node->mapped_loc_port, + PCI_FUNC(nesvnic->nesdev->pcidev->devfn), + NES_MANAGE_APBVT_ADD); + if (err) { + printk(KERN_ERR "nes_manage_apbvt call returned %d.\n", + err); + g_cm_core->api->stop_listener(g_cm_core, (void *)cm_node); + return err; + } + atomic_inc(&cm_listens_created); + } + + cm_id->add_ref(cm_id); + cm_id->provider_data = (void *)cm_node; + + + return 0; +} + + +/** + * nes_destroy_listen + */ +int nes_destroy_listen(struct iw_cm_id *cm_id) +{ + if (cm_id->provider_data) + g_cm_core->api->stop_listener(g_cm_core, cm_id->provider_data); + else + nes_debug(NES_DBG_CM, "cm_id->provider_data was NULL\n"); + + cm_id->rem_ref(cm_id); + + return 0; +} + + +/** + * nes_cm_recv + */ +int nes_cm_recv(struct sk_buff *skb, struct net_device *netdevice) +{ + int rc = 0; + + cm_packets_received++; + if ((g_cm_core) && (g_cm_core->api)) + rc = g_cm_core->api->recv_pkt(g_cm_core, netdev_priv(netdevice), skb); + else + nes_debug(NES_DBG_CM, "Unable to process packet for CM," + " cm is not setup properly.\n"); + + return rc; +} + + +/** + * nes_cm_start + * Start and init a cm core module + */ +int nes_cm_start(void) +{ + nes_debug(NES_DBG_CM, "\n"); + /* create the primary CM core, pass this handle to subsequent core inits */ + g_cm_core = nes_cm_alloc_core(); + if (g_cm_core) + return 0; + else + return -ENOMEM; +} + + +/** + * nes_cm_stop + * stop and dealloc all cm core instances + */ +int nes_cm_stop(void) +{ + g_cm_core->api->destroy_cm_core(g_cm_core); + return 0; +} + + +/** + * cm_event_connected + * handle a connected event, setup QPs and HW + */ +static void cm_event_connected(struct nes_cm_event *event) +{ + struct nes_qp *nesqp; + struct nes_vnic *nesvnic; + struct nes_device *nesdev; + struct nes_cm_node *cm_node; + struct nes_adapter *nesadapter; + struct ib_qp_attr attr; + struct iw_cm_id *cm_id; + struct iw_cm_event cm_event; + struct nes_v4_quad nes_quad; + u32 crc_value; + int ret; + struct sockaddr_in *laddr; + struct sockaddr_in *raddr; + struct sockaddr_in *cm_event_laddr; + + /* get all our handles */ + cm_node = event->cm_node; + cm_id = cm_node->cm_id; + nes_debug(NES_DBG_CM, "cm_event_connected - %p - cm_id = %p\n", cm_node, cm_id); + nesqp = (struct nes_qp *)cm_id->provider_data; + nesvnic = to_nesvnic(nesqp->ibqp.device); + nesdev = nesvnic->nesdev; + nesadapter = nesdev->nesadapter; + laddr = (struct sockaddr_in *)&cm_id->local_addr; + raddr = (struct sockaddr_in *)&cm_id->remote_addr; + cm_event_laddr = (struct sockaddr_in *)&cm_event.local_addr; + + if (nesqp->destroyed) + return; + atomic_inc(&cm_connecteds); + nes_debug(NES_DBG_CM, "QP%u attempting to connect to 0x%08X:0x%04X on" + " local port 0x%04X. jiffies = %lu.\n", + nesqp->hwqp.qp_id, ntohl(raddr->sin_addr.s_addr), + ntohs(raddr->sin_port), ntohs(laddr->sin_port), jiffies); + + nes_cm_init_tsa_conn(nesqp, cm_node); + + /* set the QP tsa context */ + nesqp->nesqp_context->tcpPorts[0] = + cpu_to_le16(cm_node->mapped_loc_port); + nesqp->nesqp_context->tcpPorts[1] = + cpu_to_le16(cm_node->mapped_rem_port); + nesqp->nesqp_context->ip0 = cpu_to_le32(cm_node->mapped_rem_addr); + + nesqp->nesqp_context->misc2 |= cpu_to_le32( + (u32)PCI_FUNC(nesdev->pcidev->devfn) << + NES_QPCONTEXT_MISC2_SRC_IP_SHIFT); + nesqp->nesqp_context->arp_index_vlan |= cpu_to_le32( + nes_arp_table(nesdev, + le32_to_cpu(nesqp->nesqp_context->ip0), + NULL, NES_ARP_RESOLVE) << 16); + nesqp->nesqp_context->ts_val_delta = cpu_to_le32( + jiffies - nes_read_indexed(nesdev, NES_IDX_TCP_NOW)); + nesqp->nesqp_context->ird_index = cpu_to_le32(nesqp->hwqp.qp_id); + nesqp->nesqp_context->ird_ord_sizes |= + cpu_to_le32((u32)1 << + NES_QPCONTEXT_ORDIRD_IWARP_MODE_SHIFT); + nesqp->nesqp_context->ird_ord_sizes |= + cpu_to_le32((u32)cm_node->ord_size); + + /* Adjust tail for not having a LSMM */ + /*nesqp->hwqp.sq_tail = 1;*/ + + build_rdma0_msg(cm_node, &nesqp); + + nes_write32(nesdev->regs + NES_WQE_ALLOC, + (1 << 24) | 0x00800000 | nesqp->hwqp.qp_id); + + memset(&nes_quad, 0, sizeof(nes_quad)); + + nes_quad.DstIpAdrIndex = + cpu_to_le32((u32)PCI_FUNC(nesdev->pcidev->devfn) << 24); + nes_quad.SrcIpadr = htonl(cm_node->mapped_rem_addr); + nes_quad.TcpPorts[0] = htons(cm_node->mapped_rem_port); + nes_quad.TcpPorts[1] = htons(cm_node->mapped_loc_port); + + /* Produce hash key */ + crc_value = get_crc_value(&nes_quad); + nesqp->hte_index = cpu_to_be32(crc_value ^ 0xffffffff); + nes_debug(NES_DBG_CM, "HTE Index = 0x%08X, After CRC = 0x%08X\n", + nesqp->hte_index, nesqp->hte_index & nesadapter->hte_index_mask); + + nesqp->hte_index &= nesadapter->hte_index_mask; + nesqp->nesqp_context->hte_index = cpu_to_le32(nesqp->hte_index); + + nesqp->ietf_frame = &cm_node->mpa_frame; + nesqp->private_data_len = (u8)cm_node->mpa_frame_size; + cm_node->cm_core->api->accelerated(cm_node->cm_core, cm_node); + + /* notify OF layer we successfully created the requested connection */ + cm_event.event = IW_CM_EVENT_CONNECT_REPLY; + cm_event.status = 0; + cm_event.provider_data = cm_id->provider_data; + cm_event_laddr->sin_family = AF_INET; + cm_event_laddr->sin_port = laddr->sin_port; + cm_event.remote_addr = cm_id->remote_addr; + + cm_event.private_data = (void *)event->cm_node->mpa_frame_buf; + cm_event.private_data_len = (u8)event->cm_node->mpa_frame_size; + cm_event.ird = cm_node->ird_size; + cm_event.ord = cm_node->ord_size; + + cm_event_laddr->sin_addr.s_addr = htonl(event->cm_info.rem_addr); + ret = cm_id->event_handler(cm_id, &cm_event); + nes_debug(NES_DBG_CM, "OFA CM event_handler returned, ret=%d\n", ret); + + if (ret) + printk(KERN_ERR "%s[%u] OFA CM event_handler returned, " + "ret=%d\n", __func__, __LINE__, ret); + attr.qp_state = IB_QPS_RTS; + nes_modify_qp(&nesqp->ibqp, &attr, IB_QP_STATE, NULL); + + nes_debug(NES_DBG_CM, "Exiting connect thread for QP%u. jiffies = " + "%lu\n", nesqp->hwqp.qp_id, jiffies); + + return; +} + + +/** + * cm_event_connect_error + */ +static void cm_event_connect_error(struct nes_cm_event *event) +{ + struct nes_qp *nesqp; + struct iw_cm_id *cm_id; + struct iw_cm_event cm_event; + /* struct nes_cm_info cm_info; */ + int ret; + + if (!event->cm_node) + return; + + cm_id = event->cm_node->cm_id; + if (!cm_id) + return; + + nes_debug(NES_DBG_CM, "cm_node=%p, cm_id=%p\n", event->cm_node, cm_id); + nesqp = cm_id->provider_data; + + if (!nesqp) + return; + + /* notify OF layer about this connection error event */ + /* cm_id->rem_ref(cm_id); */ + nesqp->cm_id = NULL; + cm_id->provider_data = NULL; + cm_event.event = IW_CM_EVENT_CONNECT_REPLY; + cm_event.status = -ECONNRESET; + cm_event.provider_data = cm_id->provider_data; + cm_event.local_addr = cm_id->local_addr; + cm_event.remote_addr = cm_id->remote_addr; + cm_event.private_data = NULL; + cm_event.private_data_len = 0; + +#ifdef CONFIG_INFINIBAND_NES_DEBUG + { + struct sockaddr_in *cm_event_laddr = (struct sockaddr_in *) + &cm_event.local_addr; + struct sockaddr_in *cm_event_raddr = (struct sockaddr_in *) + &cm_event.remote_addr; + nes_debug(NES_DBG_CM, "call CM_EVENT REJECTED, local_addr=%08x, remote_addr=%08x\n", + cm_event_laddr->sin_addr.s_addr, cm_event_raddr->sin_addr.s_addr); + } +#endif + + ret = cm_id->event_handler(cm_id, &cm_event); + nes_debug(NES_DBG_CM, "OFA CM event_handler returned, ret=%d\n", ret); + if (ret) + printk(KERN_ERR "%s[%u] OFA CM event_handler returned, " + "ret=%d\n", __func__, __LINE__, ret); + cm_id->rem_ref(cm_id); + + rem_ref_cm_node(event->cm_node->cm_core, event->cm_node); + return; +} + + +/** + * cm_event_reset + */ +static void cm_event_reset(struct nes_cm_event *event) +{ + struct nes_qp *nesqp; + struct iw_cm_id *cm_id; + struct iw_cm_event cm_event; + /* struct nes_cm_info cm_info; */ + int ret; + + if (!event->cm_node) + return; + + if (!event->cm_node->cm_id) + return; + + cm_id = event->cm_node->cm_id; + + nes_debug(NES_DBG_CM, "%p - cm_id = %p\n", event->cm_node, cm_id); + nesqp = cm_id->provider_data; + if (!nesqp) + return; + + nesqp->cm_id = NULL; + /* cm_id->provider_data = NULL; */ + cm_event.event = IW_CM_EVENT_DISCONNECT; + cm_event.status = -ECONNRESET; + cm_event.provider_data = cm_id->provider_data; + cm_event.local_addr = cm_id->local_addr; + cm_event.remote_addr = cm_id->remote_addr; + cm_event.private_data = NULL; + cm_event.private_data_len = 0; + + cm_id->add_ref(cm_id); + ret = cm_id->event_handler(cm_id, &cm_event); + atomic_inc(&cm_closes); + cm_event.event = IW_CM_EVENT_CLOSE; + cm_event.status = 0; + cm_event.provider_data = cm_id->provider_data; + cm_event.local_addr = cm_id->local_addr; + cm_event.remote_addr = cm_id->remote_addr; + cm_event.private_data = NULL; + cm_event.private_data_len = 0; + nes_debug(NES_DBG_CM, "NODE %p Generating CLOSE\n", event->cm_node); + ret = cm_id->event_handler(cm_id, &cm_event); + + nes_debug(NES_DBG_CM, "OFA CM event_handler returned, ret=%d\n", ret); + + + /* notify OF layer about this connection error event */ + cm_id->rem_ref(cm_id); + + return; +} + + +/** + * cm_event_mpa_req + */ +static void cm_event_mpa_req(struct nes_cm_event *event) +{ + struct iw_cm_id *cm_id; + struct iw_cm_event cm_event; + int ret; + struct nes_cm_node *cm_node; + struct sockaddr_in *cm_event_laddr = (struct sockaddr_in *) + &cm_event.local_addr; + struct sockaddr_in *cm_event_raddr = (struct sockaddr_in *) + &cm_event.remote_addr; + + cm_node = event->cm_node; + if (!cm_node) + return; + cm_id = cm_node->cm_id; + + atomic_inc(&cm_connect_reqs); + nes_debug(NES_DBG_CM, "cm_node = %p - cm_id = %p, jiffies = %lu\n", + cm_node, cm_id, jiffies); + + cm_event.event = IW_CM_EVENT_CONNECT_REQUEST; + cm_event.status = 0; + cm_event.provider_data = (void *)cm_node; + + cm_event_laddr->sin_family = AF_INET; + cm_event_laddr->sin_port = htons(event->cm_info.loc_port); + cm_event_laddr->sin_addr.s_addr = htonl(event->cm_info.loc_addr); + + cm_event_raddr->sin_family = AF_INET; + cm_event_raddr->sin_port = htons(event->cm_info.rem_port); + cm_event_raddr->sin_addr.s_addr = htonl(event->cm_info.rem_addr); + cm_event.private_data = cm_node->mpa_frame_buf; + cm_event.private_data_len = (u8)cm_node->mpa_frame_size; + if (cm_node->mpa_frame_rev == IETF_MPA_V1) { + cm_event.ird = NES_MAX_IRD; + cm_event.ord = NES_MAX_ORD; + } else { + cm_event.ird = cm_node->ird_size; + cm_event.ord = cm_node->ord_size; + } + + ret = cm_id->event_handler(cm_id, &cm_event); + if (ret) + printk(KERN_ERR "%s[%u] OFA CM event_handler returned, ret=%d\n", + __func__, __LINE__, ret); + return; +} + + +static void cm_event_mpa_reject(struct nes_cm_event *event) +{ + struct iw_cm_id *cm_id; + struct iw_cm_event cm_event; + struct nes_cm_node *cm_node; + int ret; + struct sockaddr_in *cm_event_laddr = (struct sockaddr_in *) + &cm_event.local_addr; + struct sockaddr_in *cm_event_raddr = (struct sockaddr_in *) + &cm_event.remote_addr; + + cm_node = event->cm_node; + if (!cm_node) + return; + cm_id = cm_node->cm_id; + + atomic_inc(&cm_connect_reqs); + nes_debug(NES_DBG_CM, "cm_node = %p - cm_id = %p, jiffies = %lu\n", + cm_node, cm_id, jiffies); + + cm_event.event = IW_CM_EVENT_CONNECT_REPLY; + cm_event.status = -ECONNREFUSED; + cm_event.provider_data = cm_id->provider_data; + + cm_event_laddr->sin_family = AF_INET; + cm_event_laddr->sin_port = htons(event->cm_info.loc_port); + cm_event_laddr->sin_addr.s_addr = htonl(event->cm_info.loc_addr); + + cm_event_raddr->sin_family = AF_INET; + cm_event_raddr->sin_port = htons(event->cm_info.rem_port); + cm_event_raddr->sin_addr.s_addr = htonl(event->cm_info.rem_addr); + + cm_event.private_data = cm_node->mpa_frame_buf; + cm_event.private_data_len = (u8)cm_node->mpa_frame_size; + + nes_debug(NES_DBG_CM, "call CM_EVENT_MPA_REJECTED, local_addr=%08x, " + "remove_addr=%08x\n", + cm_event_laddr->sin_addr.s_addr, + cm_event_raddr->sin_addr.s_addr); + + ret = cm_id->event_handler(cm_id, &cm_event); + if (ret) + printk(KERN_ERR "%s[%u] OFA CM event_handler returned, ret=%d\n", + __func__, __LINE__, ret); + + return; +} + + +static void nes_cm_event_handler(struct work_struct *); + +/** + * nes_cm_post_event + * post an event to the cm event handler + */ +static int nes_cm_post_event(struct nes_cm_event *event) +{ + atomic_inc(&event->cm_node->cm_core->events_posted); + add_ref_cm_node(event->cm_node); + event->cm_info.cm_id->add_ref(event->cm_info.cm_id); + INIT_WORK(&event->event_work, nes_cm_event_handler); + nes_debug(NES_DBG_CM, "cm_node=%p queue_work, event=%p\n", + event->cm_node, event); + + queue_work(event->cm_node->cm_core->event_wq, &event->event_work); + + nes_debug(NES_DBG_CM, "Exit\n"); + return 0; +} + + +/** + * nes_cm_event_handler + * worker function to handle cm events + * will free instance of nes_cm_event + */ +static void nes_cm_event_handler(struct work_struct *work) +{ + struct nes_cm_event *event = container_of(work, struct nes_cm_event, + event_work); + struct nes_cm_core *cm_core; + + if ((!event) || (!event->cm_node) || (!event->cm_node->cm_core)) + return; + + cm_core = event->cm_node->cm_core; + nes_debug(NES_DBG_CM, "event=%p, event->type=%u, events posted=%u\n", + event, event->type, atomic_read(&cm_core->events_posted)); + + switch (event->type) { + case NES_CM_EVENT_MPA_REQ: + cm_event_mpa_req(event); + nes_debug(NES_DBG_CM, "cm_node=%p CM Event: MPA REQUEST\n", + event->cm_node); + break; + case NES_CM_EVENT_RESET: + nes_debug(NES_DBG_CM, "cm_node = %p CM Event: RESET\n", + event->cm_node); + cm_event_reset(event); + break; + case NES_CM_EVENT_CONNECTED: + if ((!event->cm_node->cm_id) || + (event->cm_node->state != NES_CM_STATE_TSA)) + break; + cm_event_connected(event); + nes_debug(NES_DBG_CM, "CM Event: CONNECTED\n"); + break; + case NES_CM_EVENT_MPA_REJECT: + if ((!event->cm_node->cm_id) || + (event->cm_node->state == NES_CM_STATE_TSA)) + break; + cm_event_mpa_reject(event); + nes_debug(NES_DBG_CM, "CM Event: REJECT\n"); + break; + + case NES_CM_EVENT_ABORTED: + if ((!event->cm_node->cm_id) || + (event->cm_node->state == NES_CM_STATE_TSA)) + break; + cm_event_connect_error(event); + nes_debug(NES_DBG_CM, "CM Event: ABORTED\n"); + break; + case NES_CM_EVENT_DROPPED_PKT: + nes_debug(NES_DBG_CM, "CM Event: DROPPED PKT\n"); + break; + default: + nes_debug(NES_DBG_CM, "CM Event: UNKNOWN EVENT TYPE\n"); + break; + } + + atomic_dec(&cm_core->events_posted); + event->cm_info.cm_id->rem_ref(event->cm_info.cm_id); + rem_ref_cm_node(cm_core, event->cm_node); + kfree(event); + + return; +} diff --git a/drivers/infiniband/hw/nes/nes_cm.h b/drivers/infiniband/hw/nes/nes_cm.h new file mode 100644 index 0000000..f522cf6 --- /dev/null +++ b/drivers/infiniband/hw/nes/nes_cm.h @@ -0,0 +1,476 @@ +/* + * Copyright (c) 2006 - 2014 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef NES_CM_H +#define NES_CM_H + +#define QUEUE_EVENTS + +#define NES_MANAGE_APBVT_DEL 0 +#define NES_MANAGE_APBVT_ADD 1 + +#define NES_MPA_REQUEST_ACCEPT 1 +#define NES_MPA_REQUEST_REJECT 2 + +/* IETF MPA -- defines, enums, structs */ +#define IEFT_MPA_KEY_REQ "MPA ID Req Frame" +#define IEFT_MPA_KEY_REP "MPA ID Rep Frame" +#define IETF_MPA_KEY_SIZE 16 +#define IETF_MPA_VERSION 1 +#define IETF_MAX_PRIV_DATA_LEN 512 +#define IETF_MPA_FRAME_SIZE 20 +#define IETF_RTR_MSG_SIZE 4 +#define IETF_MPA_V2_FLAG 0x10 + +/* IETF RTR MSG Fields */ +#define IETF_PEER_TO_PEER 0x8000 +#define IETF_FLPDU_ZERO_LEN 0x4000 +#define IETF_RDMA0_WRITE 0x8000 +#define IETF_RDMA0_READ 0x4000 +#define IETF_NO_IRD_ORD 0x3FFF +#define NES_MAX_IRD 0x40 +#define NES_MAX_ORD 0x7F + +enum ietf_mpa_flags { + IETF_MPA_FLAGS_MARKERS = 0x80, /* receive Markers */ + IETF_MPA_FLAGS_CRC = 0x40, /* receive Markers */ + IETF_MPA_FLAGS_REJECT = 0x20, /* Reject */ +}; + +struct ietf_mpa_v1 { + u8 key[IETF_MPA_KEY_SIZE]; + u8 flags; + u8 rev; + __be16 priv_data_len; + u8 priv_data[0]; +}; + +#define ietf_mpa_req_resp_frame ietf_mpa_frame + +struct ietf_rtr_msg { + __be16 ctrl_ird; + __be16 ctrl_ord; +}; + +struct ietf_mpa_v2 { + u8 key[IETF_MPA_KEY_SIZE]; + u8 flags; + u8 rev; + __be16 priv_data_len; + struct ietf_rtr_msg rtr_msg; + u8 priv_data[0]; +}; + +struct nes_v4_quad { + u32 rsvd0; + __le32 DstIpAdrIndex; /* Only most significant 5 bits are valid */ + __be32 SrcIpadr; + __be16 TcpPorts[2]; /* src is low, dest is high */ +}; + +struct nes_cm_node; +enum nes_timer_type { + NES_TIMER_TYPE_SEND, + NES_TIMER_TYPE_RECV, + NES_TIMER_NODE_CLEANUP, + NES_TIMER_TYPE_CLOSE, +}; + +#define NES_PASSIVE_STATE_INDICATED 0 +#define NES_DO_NOT_SEND_RESET_EVENT 1 +#define NES_SEND_RESET_EVENT 2 + +#define MAX_NES_IFS 4 + +#define SET_ACK 1 +#define SET_SYN 2 +#define SET_FIN 4 +#define SET_RST 8 + +#define TCP_OPTIONS_PADDING 3 + +struct option_base { + u8 optionnum; + u8 length; +}; + +enum option_numbers { + OPTION_NUMBER_END, + OPTION_NUMBER_NONE, + OPTION_NUMBER_MSS, + OPTION_NUMBER_WINDOW_SCALE, + OPTION_NUMBER_SACK_PERM, + OPTION_NUMBER_SACK, + OPTION_NUMBER_WRITE0 = 0xbc +}; + +struct option_mss { + u8 optionnum; + u8 length; + __be16 mss; +}; + +struct option_windowscale { + u8 optionnum; + u8 length; + u8 shiftcount; +}; + +union all_known_options { + char as_end; + struct option_base as_base; + struct option_mss as_mss; + struct option_windowscale as_windowscale; +}; + +struct nes_timer_entry { + struct list_head list; + unsigned long timetosend; /* jiffies */ + struct sk_buff *skb; + u32 type; + u32 retrycount; + u32 retranscount; + u32 context; + u32 seq_num; + u32 send_retrans; + int close_when_complete; + struct net_device *netdev; +}; + +#define NES_DEFAULT_RETRYS 64 +#define NES_DEFAULT_RETRANS 8 +#ifdef CONFIG_INFINIBAND_NES_DEBUG +#define NES_RETRY_TIMEOUT (1000*HZ/1000) +#else +#define NES_RETRY_TIMEOUT (3000*HZ/1000) +#endif +#define NES_SHORT_TIME (10) +#define NES_LONG_TIME (2000*HZ/1000) +#define NES_MAX_TIMEOUT ((unsigned long) (12*HZ)) + +#define NES_CM_HASHTABLE_SIZE 1024 +#define NES_CM_TCP_TIMER_INTERVAL 3000 +#define NES_CM_DEFAULT_MTU 1540 +#define NES_CM_DEFAULT_FRAME_CNT 10 +#define NES_CM_THREAD_STACK_SIZE 256 +#define NES_CM_DEFAULT_RCV_WND 64240 // before we know that window scaling is allowed +#define NES_CM_DEFAULT_RCV_WND_SCALED 256960 // after we know that window scaling is allowed +#define NES_CM_DEFAULT_RCV_WND_SCALE 2 +#define NES_CM_DEFAULT_FREE_PKTS 0x000A +#define NES_CM_FREE_PKT_LO_WATERMARK 2 + +#define NES_CM_DEFAULT_MSS 536 + +#define NES_CM_DEF_SEQ 0x159bf75f +#define NES_CM_DEF_LOCAL_ID 0x3b47 + +#define NES_CM_DEF_SEQ2 0x18ed5740 +#define NES_CM_DEF_LOCAL_ID2 0xb807 +#define MAX_CM_BUFFER (IETF_MPA_FRAME_SIZE + IETF_RTR_MSG_SIZE + IETF_MAX_PRIV_DATA_LEN) + +typedef u32 nes_addr_t; + +#define nes_cm_tsa_context nes_qp_context + +struct nes_qp; + +/* cm node transition states */ +enum nes_cm_node_state { + NES_CM_STATE_UNKNOWN, + NES_CM_STATE_INITED, + NES_CM_STATE_LISTENING, + NES_CM_STATE_SYN_RCVD, + NES_CM_STATE_SYN_SENT, + NES_CM_STATE_ONE_SIDE_ESTABLISHED, + NES_CM_STATE_ESTABLISHED, + NES_CM_STATE_ACCEPTING, + NES_CM_STATE_MPAREQ_SENT, + NES_CM_STATE_MPAREQ_RCVD, + NES_CM_STATE_MPAREJ_RCVD, + NES_CM_STATE_TSA, + NES_CM_STATE_FIN_WAIT1, + NES_CM_STATE_FIN_WAIT2, + NES_CM_STATE_CLOSE_WAIT, + NES_CM_STATE_TIME_WAIT, + NES_CM_STATE_LAST_ACK, + NES_CM_STATE_CLOSING, + NES_CM_STATE_LISTENER_DESTROYED, + NES_CM_STATE_CLOSED +}; + +enum mpa_frame_version { + IETF_MPA_V1 = 1, + IETF_MPA_V2 = 2 +}; + +enum mpa_frame_key { + MPA_KEY_REQUEST, + MPA_KEY_REPLY +}; + +enum send_rdma0 { + SEND_RDMA_READ_ZERO = 1, + SEND_RDMA_WRITE_ZERO = 2 +}; + +enum nes_tcpip_pkt_type { + NES_PKT_TYPE_UNKNOWN, + NES_PKT_TYPE_SYN, + NES_PKT_TYPE_SYNACK, + NES_PKT_TYPE_ACK, + NES_PKT_TYPE_FIN, + NES_PKT_TYPE_RST +}; + + +/* type of nes connection */ +enum nes_cm_conn_type { + NES_CM_IWARP_CONN_TYPE, +}; + +/* CM context params */ +struct nes_cm_tcp_context { + u8 client; + + u32 loc_seq_num; + u32 loc_ack_num; + u32 rem_ack_num; + u32 rcv_nxt; + + u32 loc_id; + u32 rem_id; + + u32 snd_wnd; + u32 max_snd_wnd; + + u32 rcv_wnd; + u32 mss; + u8 snd_wscale; + u8 rcv_wscale; + + struct nes_cm_tsa_context tsa_cntxt; + struct timeval sent_ts; +}; + + +enum nes_cm_listener_state { + NES_CM_LISTENER_PASSIVE_STATE = 1, + NES_CM_LISTENER_ACTIVE_STATE = 2, + NES_CM_LISTENER_EITHER_STATE = 3 +}; + +struct nes_cm_listener { + struct list_head list; + struct nes_cm_core *cm_core; + u8 loc_mac[ETH_ALEN]; + nes_addr_t loc_addr, mapped_loc_addr; + u16 loc_port, mapped_loc_port; + struct iw_cm_id *cm_id; + enum nes_cm_conn_type conn_type; + atomic_t ref_count; + struct nes_vnic *nesvnic; + atomic_t pend_accepts_cnt; + int backlog; + enum nes_cm_listener_state listener_state; + u32 reused_node; +}; + +/* per connection node and node state information */ +struct nes_cm_node { + nes_addr_t loc_addr, rem_addr; + nes_addr_t mapped_loc_addr, mapped_rem_addr; + u16 loc_port, rem_port; + u16 mapped_loc_port, mapped_rem_port; + + u8 loc_mac[ETH_ALEN]; + u8 rem_mac[ETH_ALEN]; + + enum nes_cm_node_state state; + struct nes_cm_tcp_context tcp_cntxt; + struct nes_cm_core *cm_core; + struct sk_buff_head resend_list; + atomic_t ref_count; + struct net_device *netdev; + + struct nes_cm_node *loopbackpartner; + + struct nes_timer_entry *send_entry; + struct nes_timer_entry *recv_entry; + spinlock_t retrans_list_lock; + enum send_rdma0 send_rdma0_op; + + union { + struct ietf_mpa_v1 mpa_frame; + struct ietf_mpa_v2 mpa_v2_frame; + u8 mpa_frame_buf[MAX_CM_BUFFER]; + }; + enum mpa_frame_version mpa_frame_rev; + u16 ird_size; + u16 ord_size; + u16 mpav2_ird_ord; + + u16 mpa_frame_size; + struct iw_cm_id *cm_id; + struct list_head list; + int accelerated; + struct nes_cm_listener *listener; + enum nes_cm_conn_type conn_type; + struct nes_vnic *nesvnic; + int apbvt_set; + int accept_pend; + struct list_head timer_entry; + struct list_head reset_entry; + struct nes_qp *nesqp; + atomic_t passive_state; +}; + +/* structure for client or CM to fill when making CM api calls. */ +/* - only need to set relevant data, based on op. */ +struct nes_cm_info { + union { + struct iw_cm_id *cm_id; + struct net_device *netdev; + }; + + u16 loc_port; + u16 rem_port; + nes_addr_t loc_addr; + nes_addr_t rem_addr; + u16 mapped_loc_port; + u16 mapped_rem_port; + nes_addr_t mapped_loc_addr; + nes_addr_t mapped_rem_addr; + + enum nes_cm_conn_type conn_type; + int backlog; +}; + +/* CM event codes */ +enum nes_cm_event_type { + NES_CM_EVENT_UNKNOWN, + NES_CM_EVENT_ESTABLISHED, + NES_CM_EVENT_MPA_REQ, + NES_CM_EVENT_MPA_CONNECT, + NES_CM_EVENT_MPA_ACCEPT, + NES_CM_EVENT_MPA_REJECT, + NES_CM_EVENT_MPA_ESTABLISHED, + NES_CM_EVENT_CONNECTED, + NES_CM_EVENT_CLOSED, + NES_CM_EVENT_RESET, + NES_CM_EVENT_DROPPED_PKT, + NES_CM_EVENT_CLOSE_IMMED, + NES_CM_EVENT_CLOSE_HARD, + NES_CM_EVENT_CLOSE_CLEAN, + NES_CM_EVENT_ABORTED, + NES_CM_EVENT_SEND_FIRST +}; + +/* event to post to CM event handler */ +struct nes_cm_event { + enum nes_cm_event_type type; + + struct nes_cm_info cm_info; + struct work_struct event_work; + struct nes_cm_node *cm_node; +}; + +struct nes_cm_core { + enum nes_cm_node_state state; + + atomic_t listen_node_cnt; + struct nes_cm_node listen_list; + spinlock_t listen_list_lock; + + u32 mtu; + u32 free_tx_pkt_max; + u32 rx_pkt_posted; + atomic_t ht_node_cnt; + struct list_head connected_nodes; + /* struct list_head hashtable[NES_CM_HASHTABLE_SIZE]; */ + spinlock_t ht_lock; + + struct timer_list tcp_timer; + + struct nes_cm_ops *api; + + int (*post_event)(struct nes_cm_event *event); + atomic_t events_posted; + struct workqueue_struct *event_wq; + struct workqueue_struct *disconn_wq; + + atomic_t node_cnt; + u64 aborted_connects; + u32 options; + + struct nes_cm_node *current_listen_node; +}; + + +#define NES_CM_SET_PKT_SIZE (1 << 1) +#define NES_CM_SET_FREE_PKT_Q_SIZE (1 << 2) + +/* CM ops/API for client interface */ +struct nes_cm_ops { + int (*accelerated)(struct nes_cm_core *, struct nes_cm_node *); + struct nes_cm_listener * (*listen)(struct nes_cm_core *, struct nes_vnic *, + struct nes_cm_info *); + int (*stop_listener)(struct nes_cm_core *, struct nes_cm_listener *); + struct nes_cm_node * (*connect)(struct nes_cm_core *, + struct nes_vnic *, u16, void *, + struct nes_cm_info *); + int (*close)(struct nes_cm_core *, struct nes_cm_node *); + int (*accept)(struct nes_cm_core *, struct nes_cm_node *); + int (*reject)(struct nes_cm_core *, struct nes_cm_node *); + int (*recv_pkt)(struct nes_cm_core *, struct nes_vnic *, + struct sk_buff *); + int (*destroy_cm_core)(struct nes_cm_core *); + int (*get)(struct nes_cm_core *); + int (*set)(struct nes_cm_core *, u32, u32); +}; + +int schedule_nes_timer(struct nes_cm_node *, struct sk_buff *, + enum nes_timer_type, int, int); + +int nes_accept(struct iw_cm_id *, struct iw_cm_conn_param *); +int nes_reject(struct iw_cm_id *, const void *, u8); +int nes_connect(struct iw_cm_id *, struct iw_cm_conn_param *); +int nes_create_listen(struct iw_cm_id *, int); +int nes_destroy_listen(struct iw_cm_id *); + +int nes_cm_recv(struct sk_buff *, struct net_device *); +int nes_cm_start(void); +int nes_cm_stop(void); +int nes_add_ref_cm_node(struct nes_cm_node *cm_node); +int nes_rem_ref_cm_node(struct nes_cm_node *cm_node); + +#endif /* NES_CM_H */ diff --git a/drivers/infiniband/hw/nes/nes_context.h b/drivers/infiniband/hw/nes/nes_context.h new file mode 100644 index 0000000..a69eef1 --- /dev/null +++ b/drivers/infiniband/hw/nes/nes_context.h @@ -0,0 +1,193 @@ +/* + * Copyright (c) 2006 - 2011 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef NES_CONTEXT_H +#define NES_CONTEXT_H + +struct nes_qp_context { + __le32 misc; + __le32 cqs; + __le32 sq_addr_low; + __le32 sq_addr_high; + __le32 rq_addr_low; + __le32 rq_addr_high; + __le32 misc2; + __le16 tcpPorts[2]; + __le32 ip0; + __le32 ip1; + __le32 ip2; + __le32 ip3; + __le32 mss; + __le32 arp_index_vlan; + __le32 tcp_state_flow_label; + __le32 pd_index_wscale; + __le32 keepalive; + u32 ts_recent; + u32 ts_age; + __le32 snd_nxt; + __le32 snd_wnd; + __le32 rcv_nxt; + __le32 rcv_wnd; + __le32 snd_max; + __le32 snd_una; + u32 srtt; + __le32 rttvar; + __le32 ssthresh; + __le32 cwnd; + __le32 snd_wl1; + __le32 snd_wl2; + __le32 max_snd_wnd; + __le32 ts_val_delta; + u32 retransmit; + u32 probe_cnt; + u32 hte_index; + __le32 q2_addr_low; + __le32 q2_addr_high; + __le32 ird_index; + u32 Rsvd3; + __le32 ird_ord_sizes; + u32 mrkr_offset; + __le32 aeq_token_low; + __le32 aeq_token_high; +}; + +/* QP Context Misc Field */ + +#define NES_QPCONTEXT_MISC_IWARP_VER_MASK 0x00000003 +#define NES_QPCONTEXT_MISC_IWARP_VER_SHIFT 0 +#define NES_QPCONTEXT_MISC_EFB_SIZE_MASK 0x000000C0 +#define NES_QPCONTEXT_MISC_EFB_SIZE_SHIFT 6 +#define NES_QPCONTEXT_MISC_RQ_SIZE_MASK 0x00000300 +#define NES_QPCONTEXT_MISC_RQ_SIZE_SHIFT 8 +#define NES_QPCONTEXT_MISC_SQ_SIZE_MASK 0x00000c00 +#define NES_QPCONTEXT_MISC_SQ_SIZE_SHIFT 10 +#define NES_QPCONTEXT_MISC_PCI_FCN_MASK 0x00007000 +#define NES_QPCONTEXT_MISC_PCI_FCN_SHIFT 12 +#define NES_QPCONTEXT_MISC_DUP_ACKS_MASK 0x00070000 +#define NES_QPCONTEXT_MISC_DUP_ACKS_SHIFT 16 + +enum nes_qp_context_misc_bits { + NES_QPCONTEXT_MISC_RX_WQE_SIZE = 0x00000004, + NES_QPCONTEXT_MISC_IPV4 = 0x00000008, + NES_QPCONTEXT_MISC_DO_NOT_FRAG = 0x00000010, + NES_QPCONTEXT_MISC_INSERT_VLAN = 0x00000020, + NES_QPCONTEXT_MISC_DROS = 0x00008000, + NES_QPCONTEXT_MISC_WSCALE = 0x00080000, + NES_QPCONTEXT_MISC_KEEPALIVE = 0x00100000, + NES_QPCONTEXT_MISC_TIMESTAMP = 0x00200000, + NES_QPCONTEXT_MISC_SACK = 0x00400000, + NES_QPCONTEXT_MISC_RDMA_WRITE_EN = 0x00800000, + NES_QPCONTEXT_MISC_RDMA_READ_EN = 0x01000000, + NES_QPCONTEXT_MISC_WBIND_EN = 0x10000000, + NES_QPCONTEXT_MISC_FAST_REGISTER_EN = 0x20000000, + NES_QPCONTEXT_MISC_PRIV_EN = 0x40000000, + NES_QPCONTEXT_MISC_NO_NAGLE = 0x80000000 +}; + +enum nes_qp_acc_wq_sizes { + HCONTEXT_TSA_WQ_SIZE_4 = 0, + HCONTEXT_TSA_WQ_SIZE_32 = 1, + HCONTEXT_TSA_WQ_SIZE_128 = 2, + HCONTEXT_TSA_WQ_SIZE_512 = 3 +}; + +/* QP Context Misc2 Fields */ +#define NES_QPCONTEXT_MISC2_TTL_MASK 0x000000ff +#define NES_QPCONTEXT_MISC2_TTL_SHIFT 0 +#define NES_QPCONTEXT_MISC2_HOP_LIMIT_MASK 0x000000ff +#define NES_QPCONTEXT_MISC2_HOP_LIMIT_SHIFT 0 +#define NES_QPCONTEXT_MISC2_LIMIT_MASK 0x00000300 +#define NES_QPCONTEXT_MISC2_LIMIT_SHIFT 8 +#define NES_QPCONTEXT_MISC2_NIC_INDEX_MASK 0x0000fc00 +#define NES_QPCONTEXT_MISC2_NIC_INDEX_SHIFT 10 +#define NES_QPCONTEXT_MISC2_SRC_IP_MASK 0x001f0000 +#define NES_QPCONTEXT_MISC2_SRC_IP_SHIFT 16 +#define NES_QPCONTEXT_MISC2_TOS_MASK 0xff000000 +#define NES_QPCONTEXT_MISC2_TOS_SHIFT 24 +#define NES_QPCONTEXT_MISC2_TRAFFIC_CLASS_MASK 0xff000000 +#define NES_QPCONTEXT_MISC2_TRAFFIC_CLASS_SHIFT 24 + +/* QP Context Tcp State/Flow Label Fields */ +#define NES_QPCONTEXT_TCPFLOW_FLOW_LABEL_MASK 0x000fffff +#define NES_QPCONTEXT_TCPFLOW_FLOW_LABEL_SHIFT 0 +#define NES_QPCONTEXT_TCPFLOW_TCP_STATE_MASK 0xf0000000 +#define NES_QPCONTEXT_TCPFLOW_TCP_STATE_SHIFT 28 + +enum nes_qp_tcp_state { + NES_QPCONTEXT_TCPSTATE_CLOSED = 1, + NES_QPCONTEXT_TCPSTATE_EST = 5, + NES_QPCONTEXT_TCPSTATE_TIME_WAIT = 11, +}; + +/* QP Context PD Index/wscale Fields */ +#define NES_QPCONTEXT_PDWSCALE_RCV_WSCALE_MASK 0x0000000f +#define NES_QPCONTEXT_PDWSCALE_RCV_WSCALE_SHIFT 0 +#define NES_QPCONTEXT_PDWSCALE_SND_WSCALE_MASK 0x00000f00 +#define NES_QPCONTEXT_PDWSCALE_SND_WSCALE_SHIFT 8 +#define NES_QPCONTEXT_PDWSCALE_PDINDEX_MASK 0xffff0000 +#define NES_QPCONTEXT_PDWSCALE_PDINDEX_SHIFT 16 + +/* QP Context Keepalive Fields */ +#define NES_QPCONTEXT_KEEPALIVE_DELTA_MASK 0x0000ffff +#define NES_QPCONTEXT_KEEPALIVE_DELTA_SHIFT 0 +#define NES_QPCONTEXT_KEEPALIVE_PROBE_CNT_MASK 0x00ff0000 +#define NES_QPCONTEXT_KEEPALIVE_PROBE_CNT_SHIFT 16 +#define NES_QPCONTEXT_KEEPALIVE_INTV_MASK 0xff000000 +#define NES_QPCONTEXT_KEEPALIVE_INTV_SHIFT 24 + +/* QP Context ORD/IRD Fields */ +#define NES_QPCONTEXT_ORDIRD_ORDSIZE_MASK 0x0000007f +#define NES_QPCONTEXT_ORDIRD_ORDSIZE_SHIFT 0 +#define NES_QPCONTEXT_ORDIRD_IRDSIZE_MASK 0x00030000 +#define NES_QPCONTEXT_ORDIRD_IRDSIZE_SHIFT 16 +#define NES_QPCONTEXT_ORDIRD_IWARP_MODE_MASK 0x30000000 +#define NES_QPCONTEXT_ORDIRD_IWARP_MODE_SHIFT 28 + +enum nes_ord_ird_bits { + NES_QPCONTEXT_ORDIRD_WRPDU = 0x02000000, + NES_QPCONTEXT_ORDIRD_LSMM_PRESENT = 0x04000000, + NES_QPCONTEXT_ORDIRD_ALSMM = 0x08000000, + NES_QPCONTEXT_ORDIRD_AAH = 0x40000000, + NES_QPCONTEXT_ORDIRD_RNMC = 0x80000000 +}; + +enum nes_iwarp_qp_state { + NES_QPCONTEXT_IWARP_STATE_NONEXIST = 0, + NES_QPCONTEXT_IWARP_STATE_IDLE = 1, + NES_QPCONTEXT_IWARP_STATE_RTS = 2, + NES_QPCONTEXT_IWARP_STATE_CLOSING = 3, + NES_QPCONTEXT_IWARP_STATE_TERMINATE = 5, + NES_QPCONTEXT_IWARP_STATE_ERROR = 6 +}; + + +#endif /* NES_CONTEXT_H */ diff --git a/drivers/infiniband/hw/nes/nes_hw.c b/drivers/infiniband/hw/nes/nes_hw.c new file mode 100644 index 0000000..02120d3 --- /dev/null +++ b/drivers/infiniband/hw/nes/nes_hw.c @@ -0,0 +1,3937 @@ +/* + * Copyright (c) 2006 - 2011 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "nes.h" + +static unsigned int nes_lro_max_aggr = NES_LRO_MAX_AGGR; +module_param(nes_lro_max_aggr, uint, 0444); +MODULE_PARM_DESC(nes_lro_max_aggr, "NIC LRO max packet aggregation"); + +static int wide_ppm_offset; +module_param(wide_ppm_offset, int, 0644); +MODULE_PARM_DESC(wide_ppm_offset, "Increase CX4 interface clock ppm offset, 0=100ppm (default), 1=300ppm"); + +static u32 crit_err_count; +u32 int_mod_timer_init; +u32 int_mod_cq_depth_256; +u32 int_mod_cq_depth_128; +u32 int_mod_cq_depth_32; +u32 int_mod_cq_depth_24; +u32 int_mod_cq_depth_16; +u32 int_mod_cq_depth_4; +u32 int_mod_cq_depth_1; +static const u8 nes_max_critical_error_count = 100; +#include "nes_cm.h" + +static void nes_cqp_ce_handler(struct nes_device *nesdev, struct nes_hw_cq *cq); +static void nes_init_csr_ne020(struct nes_device *nesdev, u8 hw_rev, u8 port_count); +static int nes_init_serdes(struct nes_device *nesdev, u8 hw_rev, u8 port_count, + struct nes_adapter *nesadapter, u8 OneG_Mode); +static void nes_nic_napi_ce_handler(struct nes_device *nesdev, struct nes_hw_nic_cq *cq); +static void nes_process_aeq(struct nes_device *nesdev, struct nes_hw_aeq *aeq); +static void nes_process_ceq(struct nes_device *nesdev, struct nes_hw_ceq *ceq); +static void nes_process_iwarp_aeqe(struct nes_device *nesdev, + struct nes_hw_aeqe *aeqe); +static void process_critical_error(struct nes_device *nesdev); +static void nes_process_mac_intr(struct nes_device *nesdev, u32 mac_number); +static unsigned int nes_reset_adapter_ne020(struct nes_device *nesdev, u8 *OneG_Mode); +static void nes_terminate_start_timer(struct nes_qp *nesqp); + +#ifdef CONFIG_INFINIBAND_NES_DEBUG +static unsigned char *nes_iwarp_state_str[] = { + "Non-Existent", + "Idle", + "RTS", + "Closing", + "RSVD1", + "Terminate", + "Error", + "RSVD2", +}; + +static unsigned char *nes_tcp_state_str[] = { + "Non-Existent", + "Closed", + "Listen", + "SYN Sent", + "SYN Rcvd", + "Established", + "Close Wait", + "FIN Wait 1", + "Closing", + "Last Ack", + "FIN Wait 2", + "Time Wait", + "RSVD1", + "RSVD2", + "RSVD3", + "RSVD4", +}; +#endif + +static inline void print_ip(struct nes_cm_node *cm_node) +{ + unsigned char *rem_addr; + if (cm_node) { + rem_addr = (unsigned char *)&cm_node->rem_addr; + printk(KERN_ERR PFX "Remote IP addr: %pI4\n", rem_addr); + } +} + +/** + * nes_nic_init_timer_defaults + */ +void nes_nic_init_timer_defaults(struct nes_device *nesdev, u8 jumbomode) +{ + unsigned long flags; + struct nes_adapter *nesadapter = nesdev->nesadapter; + struct nes_hw_tune_timer *shared_timer = &nesadapter->tune_timer; + + spin_lock_irqsave(&nesadapter->periodic_timer_lock, flags); + + shared_timer->timer_in_use_min = NES_NIC_FAST_TIMER_LOW; + shared_timer->timer_in_use_max = NES_NIC_FAST_TIMER_HIGH; + if (jumbomode) { + shared_timer->threshold_low = DEFAULT_JUMBO_NES_QL_LOW; + shared_timer->threshold_target = DEFAULT_JUMBO_NES_QL_TARGET; + shared_timer->threshold_high = DEFAULT_JUMBO_NES_QL_HIGH; + } else { + shared_timer->threshold_low = DEFAULT_NES_QL_LOW; + shared_timer->threshold_target = DEFAULT_NES_QL_TARGET; + shared_timer->threshold_high = DEFAULT_NES_QL_HIGH; + } + + /* todo use netdev->mtu to set thresholds */ + spin_unlock_irqrestore(&nesadapter->periodic_timer_lock, flags); +} + + +/** + * nes_nic_init_timer + */ +static void nes_nic_init_timer(struct nes_device *nesdev) +{ + unsigned long flags; + struct nes_adapter *nesadapter = nesdev->nesadapter; + struct nes_hw_tune_timer *shared_timer = &nesadapter->tune_timer; + + spin_lock_irqsave(&nesadapter->periodic_timer_lock, flags); + + if (shared_timer->timer_in_use_old == 0) { + nesdev->deepcq_count = 0; + shared_timer->timer_direction_upward = 0; + shared_timer->timer_direction_downward = 0; + shared_timer->timer_in_use = NES_NIC_FAST_TIMER; + shared_timer->timer_in_use_old = 0; + + } + if (shared_timer->timer_in_use != shared_timer->timer_in_use_old) { + shared_timer->timer_in_use_old = shared_timer->timer_in_use; + nes_write32(nesdev->regs+NES_PERIODIC_CONTROL, + 0x80000000 | ((u32)(shared_timer->timer_in_use*8))); + } + /* todo use netdev->mtu to set thresholds */ + spin_unlock_irqrestore(&nesadapter->periodic_timer_lock, flags); +} + + +/** + * nes_nic_tune_timer + */ +static void nes_nic_tune_timer(struct nes_device *nesdev) +{ + unsigned long flags; + struct nes_adapter *nesadapter = nesdev->nesadapter; + struct nes_hw_tune_timer *shared_timer = &nesadapter->tune_timer; + u16 cq_count = nesdev->currcq_count; + + spin_lock_irqsave(&nesadapter->periodic_timer_lock, flags); + + if (shared_timer->cq_count_old <= cq_count) + shared_timer->cq_direction_downward = 0; + else + shared_timer->cq_direction_downward++; + shared_timer->cq_count_old = cq_count; + if (shared_timer->cq_direction_downward > NES_NIC_CQ_DOWNWARD_TREND) { + if (cq_count <= shared_timer->threshold_low && + shared_timer->threshold_low > 4) { + shared_timer->threshold_low = shared_timer->threshold_low/2; + shared_timer->cq_direction_downward=0; + nesdev->currcq_count = 0; + spin_unlock_irqrestore(&nesadapter->periodic_timer_lock, flags); + return; + } + } + + if (cq_count > 1) { + nesdev->deepcq_count += cq_count; + if (cq_count <= shared_timer->threshold_low) { /* increase timer gently */ + shared_timer->timer_direction_upward++; + shared_timer->timer_direction_downward = 0; + } else if (cq_count <= shared_timer->threshold_target) { /* balanced */ + shared_timer->timer_direction_upward = 0; + shared_timer->timer_direction_downward = 0; + } else if (cq_count <= shared_timer->threshold_high) { /* decrease timer gently */ + shared_timer->timer_direction_downward++; + shared_timer->timer_direction_upward = 0; + } else if (cq_count <= (shared_timer->threshold_high) * 2) { + shared_timer->timer_in_use -= 2; + shared_timer->timer_direction_upward = 0; + shared_timer->timer_direction_downward++; + } else { + shared_timer->timer_in_use -= 4; + shared_timer->timer_direction_upward = 0; + shared_timer->timer_direction_downward++; + } + + if (shared_timer->timer_direction_upward > 3 ) { /* using history */ + shared_timer->timer_in_use += 3; + shared_timer->timer_direction_upward = 0; + shared_timer->timer_direction_downward = 0; + } + if (shared_timer->timer_direction_downward > 5) { /* using history */ + shared_timer->timer_in_use -= 4 ; + shared_timer->timer_direction_downward = 0; + shared_timer->timer_direction_upward = 0; + } + } + + /* boundary checking */ + if (shared_timer->timer_in_use > shared_timer->threshold_high) + shared_timer->timer_in_use = shared_timer->threshold_high; + else if (shared_timer->timer_in_use < shared_timer->threshold_low) + shared_timer->timer_in_use = shared_timer->threshold_low; + + nesdev->currcq_count = 0; + + spin_unlock_irqrestore(&nesadapter->periodic_timer_lock, flags); +} + + +/** + * nes_init_adapter - initialize adapter + */ +struct nes_adapter *nes_init_adapter(struct nes_device *nesdev, u8 hw_rev) { + struct nes_adapter *nesadapter = NULL; + unsigned long num_pds; + u32 u32temp; + u32 port_count; + u16 max_rq_wrs; + u16 max_sq_wrs; + u32 max_mr; + u32 max_256pbl; + u32 max_4kpbl; + u32 max_qp; + u32 max_irrq; + u32 max_cq; + u32 hte_index_mask; + u32 adapter_size; + u32 arp_table_size; + u16 vendor_id; + u16 device_id; + u8 OneG_Mode; + u8 func_index; + + /* search the list of existing adapters */ + list_for_each_entry(nesadapter, &nes_adapter_list, list) { + nes_debug(NES_DBG_INIT, "Searching Adapter list for PCI devfn = 0x%X," + " adapter PCI slot/bus = %u/%u, pci devices PCI slot/bus = %u/%u, .\n", + nesdev->pcidev->devfn, + PCI_SLOT(nesadapter->devfn), + nesadapter->bus_number, + PCI_SLOT(nesdev->pcidev->devfn), + nesdev->pcidev->bus->number ); + if ((PCI_SLOT(nesadapter->devfn) == PCI_SLOT(nesdev->pcidev->devfn)) && + (nesadapter->bus_number == nesdev->pcidev->bus->number)) { + nesadapter->ref_count++; + return nesadapter; + } + } + + /* no adapter found */ + num_pds = pci_resource_len(nesdev->pcidev, BAR_1) >> PAGE_SHIFT; + if ((hw_rev != NE020_REV) && (hw_rev != NE020_REV1)) { + nes_debug(NES_DBG_INIT, "NE020 driver detected unknown hardware revision 0x%x\n", + hw_rev); + return NULL; + } + + nes_debug(NES_DBG_INIT, "Determine Soft Reset, QP_control=0x%x, CPU0=0x%x, CPU1=0x%x, CPU2=0x%x\n", + nes_read_indexed(nesdev, NES_IDX_QP_CONTROL + PCI_FUNC(nesdev->pcidev->devfn) * 8), + nes_read_indexed(nesdev, NES_IDX_INT_CPU_STATUS), + nes_read_indexed(nesdev, NES_IDX_INT_CPU_STATUS + 4), + nes_read_indexed(nesdev, NES_IDX_INT_CPU_STATUS + 8)); + + nes_debug(NES_DBG_INIT, "Reset and init NE020\n"); + + + if ((port_count = nes_reset_adapter_ne020(nesdev, &OneG_Mode)) == 0) + return NULL; + + max_qp = nes_read_indexed(nesdev, NES_IDX_QP_CTX_SIZE); + nes_debug(NES_DBG_INIT, "QP_CTX_SIZE=%u\n", max_qp); + + u32temp = nes_read_indexed(nesdev, NES_IDX_QUAD_HASH_TABLE_SIZE); + if (max_qp > ((u32)1 << (u32temp & 0x001f))) { + nes_debug(NES_DBG_INIT, "Reducing Max QPs to %u due to hash table size = 0x%08X\n", + max_qp, u32temp); + max_qp = (u32)1 << (u32temp & 0x001f); + } + + hte_index_mask = ((u32)1 << ((u32temp & 0x001f)+1))-1; + nes_debug(NES_DBG_INIT, "Max QP = %u, hte_index_mask = 0x%08X.\n", + max_qp, hte_index_mask); + + u32temp = nes_read_indexed(nesdev, NES_IDX_IRRQ_COUNT); + + max_irrq = 1 << (u32temp & 0x001f); + + if (max_qp > max_irrq) { + max_qp = max_irrq; + nes_debug(NES_DBG_INIT, "Reducing Max QPs to %u due to Available Q1s.\n", + max_qp); + } + + /* there should be no reason to allocate more pds than qps */ + if (num_pds > max_qp) + num_pds = max_qp; + + u32temp = nes_read_indexed(nesdev, NES_IDX_MRT_SIZE); + max_mr = (u32)8192 << (u32temp & 0x7); + + u32temp = nes_read_indexed(nesdev, NES_IDX_PBL_REGION_SIZE); + max_256pbl = (u32)1 << (u32temp & 0x0000001f); + max_4kpbl = (u32)1 << ((u32temp >> 16) & 0x0000001f); + max_cq = nes_read_indexed(nesdev, NES_IDX_CQ_CTX_SIZE); + + u32temp = nes_read_indexed(nesdev, NES_IDX_ARP_CACHE_SIZE); + arp_table_size = 1 << u32temp; + + adapter_size = (sizeof(struct nes_adapter) + + (sizeof(unsigned long)-1)) & (~(sizeof(unsigned long)-1)); + adapter_size += sizeof(unsigned long) * BITS_TO_LONGS(max_qp); + adapter_size += sizeof(unsigned long) * BITS_TO_LONGS(max_mr); + adapter_size += sizeof(unsigned long) * BITS_TO_LONGS(max_cq); + adapter_size += sizeof(unsigned long) * BITS_TO_LONGS(num_pds); + adapter_size += sizeof(unsigned long) * BITS_TO_LONGS(arp_table_size); + adapter_size += sizeof(struct nes_qp **) * max_qp; + + /* allocate a new adapter struct */ + nesadapter = kzalloc(adapter_size, GFP_KERNEL); + if (nesadapter == NULL) { + return NULL; + } + + nes_debug(NES_DBG_INIT, "Allocating new nesadapter @ %p, size = %u (actual size = %u).\n", + nesadapter, (u32)sizeof(struct nes_adapter), adapter_size); + + if (nes_read_eeprom_values(nesdev, nesadapter)) { + printk(KERN_ERR PFX "Unable to read EEPROM data.\n"); + kfree(nesadapter); + return NULL; + } + + nesadapter->vendor_id = (((u32) nesadapter->mac_addr_high) << 8) | + (nesadapter->mac_addr_low >> 24); + + pci_bus_read_config_word(nesdev->pcidev->bus, nesdev->pcidev->devfn, + PCI_DEVICE_ID, &device_id); + nesadapter->vendor_part_id = device_id; + + if (nes_init_serdes(nesdev, hw_rev, port_count, nesadapter, + OneG_Mode)) { + kfree(nesadapter); + return NULL; + } + nes_init_csr_ne020(nesdev, hw_rev, port_count); + + memset(nesadapter->pft_mcast_map, 255, + sizeof nesadapter->pft_mcast_map); + + /* populate the new nesadapter */ + nesadapter->devfn = nesdev->pcidev->devfn; + nesadapter->bus_number = nesdev->pcidev->bus->number; + nesadapter->ref_count = 1; + nesadapter->timer_int_req = 0xffff0000; + nesadapter->OneG_Mode = OneG_Mode; + nesadapter->doorbell_start = nesdev->doorbell_region; + + /* nesadapter->tick_delta = clk_divisor; */ + nesadapter->hw_rev = hw_rev; + nesadapter->port_count = port_count; + + nesadapter->max_qp = max_qp; + nesadapter->hte_index_mask = hte_index_mask; + nesadapter->max_irrq = max_irrq; + nesadapter->max_mr = max_mr; + nesadapter->max_256pbl = max_256pbl - 1; + nesadapter->max_4kpbl = max_4kpbl - 1; + nesadapter->max_cq = max_cq; + nesadapter->free_256pbl = max_256pbl - 1; + nesadapter->free_4kpbl = max_4kpbl - 1; + nesadapter->max_pd = num_pds; + nesadapter->arp_table_size = arp_table_size; + + nesadapter->et_pkt_rate_low = NES_TIMER_ENABLE_LIMIT; + if (nes_drv_opt & NES_DRV_OPT_DISABLE_INT_MOD) { + nesadapter->et_use_adaptive_rx_coalesce = 0; + nesadapter->timer_int_limit = NES_TIMER_INT_LIMIT; + nesadapter->et_rx_coalesce_usecs_irq = interrupt_mod_interval; + } else { + nesadapter->et_use_adaptive_rx_coalesce = 1; + nesadapter->timer_int_limit = NES_TIMER_INT_LIMIT_DYNAMIC; + nesadapter->et_rx_coalesce_usecs_irq = 0; + printk(PFX "%s: Using Adaptive Interrupt Moderation\n", __func__); + } + /* Setup and enable the periodic timer */ + if (nesadapter->et_rx_coalesce_usecs_irq) + nes_write32(nesdev->regs+NES_PERIODIC_CONTROL, 0x80000000 | + ((u32)(nesadapter->et_rx_coalesce_usecs_irq * 8))); + else + nes_write32(nesdev->regs+NES_PERIODIC_CONTROL, 0x00000000); + + nesadapter->base_pd = 1; + + nesadapter->device_cap_flags = IB_DEVICE_LOCAL_DMA_LKEY | + IB_DEVICE_MEM_WINDOW | + IB_DEVICE_MEM_MGT_EXTENSIONS; + + nesadapter->allocated_qps = (unsigned long *)&(((unsigned char *)nesadapter) + [(sizeof(struct nes_adapter)+(sizeof(unsigned long)-1))&(~(sizeof(unsigned long)-1))]); + nesadapter->allocated_cqs = &nesadapter->allocated_qps[BITS_TO_LONGS(max_qp)]; + nesadapter->allocated_mrs = &nesadapter->allocated_cqs[BITS_TO_LONGS(max_cq)]; + nesadapter->allocated_pds = &nesadapter->allocated_mrs[BITS_TO_LONGS(max_mr)]; + nesadapter->allocated_arps = &nesadapter->allocated_pds[BITS_TO_LONGS(num_pds)]; + nesadapter->qp_table = (struct nes_qp **)(&nesadapter->allocated_arps[BITS_TO_LONGS(arp_table_size)]); + + + /* mark the usual suspect QPs, MR and CQs as in use */ + for (u32temp = 0; u32temp < NES_FIRST_QPN; u32temp++) { + set_bit(u32temp, nesadapter->allocated_qps); + set_bit(u32temp, nesadapter->allocated_cqs); + } + set_bit(0, nesadapter->allocated_mrs); + + for (u32temp = 0; u32temp < 20; u32temp++) + set_bit(u32temp, nesadapter->allocated_pds); + u32temp = nes_read_indexed(nesdev, NES_IDX_QP_MAX_CFG_SIZES); + + max_rq_wrs = ((u32temp >> 8) & 3); + switch (max_rq_wrs) { + case 0: + max_rq_wrs = 4; + break; + case 1: + max_rq_wrs = 16; + break; + case 2: + max_rq_wrs = 32; + break; + case 3: + max_rq_wrs = 512; + break; + } + + max_sq_wrs = (u32temp & 3); + switch (max_sq_wrs) { + case 0: + max_sq_wrs = 4; + break; + case 1: + max_sq_wrs = 16; + break; + case 2: + max_sq_wrs = 32; + break; + case 3: + max_sq_wrs = 512; + break; + } + nesadapter->max_qp_wr = min(max_rq_wrs, max_sq_wrs); + nesadapter->max_irrq_wr = (u32temp >> 16) & 3; + + nesadapter->max_sge = 4; + nesadapter->max_cqe = 32766; + + if (nes_read_eeprom_values(nesdev, nesadapter)) { + printk(KERN_ERR PFX "Unable to read EEPROM data.\n"); + kfree(nesadapter); + return NULL; + } + + u32temp = nes_read_indexed(nesdev, NES_IDX_TCP_TIMER_CONFIG); + nes_write_indexed(nesdev, NES_IDX_TCP_TIMER_CONFIG, + (u32temp & 0xff000000) | (nesadapter->tcp_timer_core_clk_divisor & 0x00ffffff)); + + /* setup port configuration */ + if (nesadapter->port_count == 1) { + nesadapter->log_port = 0x00000000; + if (nes_drv_opt & NES_DRV_OPT_DUAL_LOGICAL_PORT) + nes_write_indexed(nesdev, NES_IDX_TX_POOL_SIZE, 0x00000002); + else + nes_write_indexed(nesdev, NES_IDX_TX_POOL_SIZE, 0x00000003); + } else { + if (nesadapter->phy_type[0] == NES_PHY_TYPE_PUMA_1G) { + nesadapter->log_port = 0x000000D8; + } else { + if (nesadapter->port_count == 2) + nesadapter->log_port = 0x00000044; + else + nesadapter->log_port = 0x000000e4; + } + nes_write_indexed(nesdev, NES_IDX_TX_POOL_SIZE, 0x00000003); + } + + nes_write_indexed(nesdev, NES_IDX_NIC_LOGPORT_TO_PHYPORT, + nesadapter->log_port); + nes_debug(NES_DBG_INIT, "Probe time, LOG2PHY=%u\n", + nes_read_indexed(nesdev, NES_IDX_NIC_LOGPORT_TO_PHYPORT)); + + spin_lock_init(&nesadapter->resource_lock); + spin_lock_init(&nesadapter->phy_lock); + spin_lock_init(&nesadapter->pbl_lock); + spin_lock_init(&nesadapter->periodic_timer_lock); + + INIT_LIST_HEAD(&nesadapter->nesvnic_list[0]); + INIT_LIST_HEAD(&nesadapter->nesvnic_list[1]); + INIT_LIST_HEAD(&nesadapter->nesvnic_list[2]); + INIT_LIST_HEAD(&nesadapter->nesvnic_list[3]); + + if ((!nesadapter->OneG_Mode) && (nesadapter->port_count == 2)) { + u32 pcs_control_status0, pcs_control_status1; + u32 reset_value; + u32 i = 0; + u32 int_cnt = 0; + u32 ext_cnt = 0; + unsigned long flags; + u32 j = 0; + + pcs_control_status0 = nes_read_indexed(nesdev, + NES_IDX_PHY_PCS_CONTROL_STATUS0); + pcs_control_status1 = nes_read_indexed(nesdev, + NES_IDX_PHY_PCS_CONTROL_STATUS0 + 0x200); + + for (i = 0; i < NES_MAX_LINK_CHECK; i++) { + pcs_control_status0 = nes_read_indexed(nesdev, + NES_IDX_PHY_PCS_CONTROL_STATUS0); + pcs_control_status1 = nes_read_indexed(nesdev, + NES_IDX_PHY_PCS_CONTROL_STATUS0 + 0x200); + if ((0x0F000100 == (pcs_control_status0 & 0x0F000100)) + || (0x0F000100 == (pcs_control_status1 & 0x0F000100))) + int_cnt++; + msleep(1); + } + if (int_cnt > 1) { + spin_lock_irqsave(&nesadapter->phy_lock, flags); + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1, 0x0000F0C8); + mh_detected++; + reset_value = nes_read32(nesdev->regs+NES_SOFTWARE_RESET); + reset_value |= 0x0000003d; + nes_write32(nesdev->regs+NES_SOFTWARE_RESET, reset_value); + + while (((nes_read32(nesdev->regs+NES_SOFTWARE_RESET) + & 0x00000040) != 0x00000040) && (j++ < 5000)); + spin_unlock_irqrestore(&nesadapter->phy_lock, flags); + + pcs_control_status0 = nes_read_indexed(nesdev, + NES_IDX_PHY_PCS_CONTROL_STATUS0); + pcs_control_status1 = nes_read_indexed(nesdev, + NES_IDX_PHY_PCS_CONTROL_STATUS0 + 0x200); + + for (i = 0; i < NES_MAX_LINK_CHECK; i++) { + pcs_control_status0 = nes_read_indexed(nesdev, + NES_IDX_PHY_PCS_CONTROL_STATUS0); + pcs_control_status1 = nes_read_indexed(nesdev, + NES_IDX_PHY_PCS_CONTROL_STATUS0 + 0x200); + if ((0x0F000100 == (pcs_control_status0 & 0x0F000100)) + || (0x0F000100 == (pcs_control_status1 & 0x0F000100))) { + if (++ext_cnt > int_cnt) { + spin_lock_irqsave(&nesadapter->phy_lock, flags); + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1, + 0x0000F088); + mh_detected++; + reset_value = nes_read32(nesdev->regs+NES_SOFTWARE_RESET); + reset_value |= 0x0000003d; + nes_write32(nesdev->regs+NES_SOFTWARE_RESET, reset_value); + + while (((nes_read32(nesdev->regs+NES_SOFTWARE_RESET) + & 0x00000040) != 0x00000040) && (j++ < 5000)); + spin_unlock_irqrestore(&nesadapter->phy_lock, flags); + break; + } + } + msleep(1); + } + } + } + + if (nesadapter->hw_rev == NE020_REV) { + init_timer(&nesadapter->mh_timer); + nesadapter->mh_timer.function = nes_mh_fix; + nesadapter->mh_timer.expires = jiffies + (HZ/5); /* 1 second */ + nesadapter->mh_timer.data = (unsigned long)nesdev; + add_timer(&nesadapter->mh_timer); + } else { + nes_write32(nesdev->regs+NES_INTF_INT_STAT, 0x0f000000); + } + + init_timer(&nesadapter->lc_timer); + nesadapter->lc_timer.function = nes_clc; + nesadapter->lc_timer.expires = jiffies + 3600 * HZ; /* 1 hour */ + nesadapter->lc_timer.data = (unsigned long)nesdev; + add_timer(&nesadapter->lc_timer); + + list_add_tail(&nesadapter->list, &nes_adapter_list); + + for (func_index = 0; func_index < 8; func_index++) { + pci_bus_read_config_word(nesdev->pcidev->bus, + PCI_DEVFN(PCI_SLOT(nesdev->pcidev->devfn), + func_index), 0, &vendor_id); + if (vendor_id == 0xffff) + break; + } + nes_debug(NES_DBG_INIT, "%s %d functions found for %s.\n", __func__, + func_index, pci_name(nesdev->pcidev)); + nesadapter->adapter_fcn_count = func_index; + + return nesadapter; +} + + +/** + * nes_reset_adapter_ne020 + */ +static unsigned int nes_reset_adapter_ne020(struct nes_device *nesdev, u8 *OneG_Mode) +{ + u32 port_count; + u32 u32temp; + u32 i; + + u32temp = nes_read32(nesdev->regs+NES_SOFTWARE_RESET); + port_count = ((u32temp & 0x00000300) >> 8) + 1; + /* TODO: assuming that both SERDES are set the same for now */ + *OneG_Mode = (u32temp & 0x00003c00) ? 0 : 1; + nes_debug(NES_DBG_INIT, "Initial Software Reset = 0x%08X, port_count=%u\n", + u32temp, port_count); + if (*OneG_Mode) + nes_debug(NES_DBG_INIT, "Running in 1G mode.\n"); + u32temp &= 0xff00ffc0; + switch (port_count) { + case 1: + u32temp |= 0x00ee0000; + break; + case 2: + u32temp |= 0x00cc0000; + break; + case 4: + u32temp |= 0x00000000; + break; + default: + return 0; + break; + } + + /* check and do full reset if needed */ + if (nes_read_indexed(nesdev, NES_IDX_QP_CONTROL+(PCI_FUNC(nesdev->pcidev->devfn)*8))) { + nes_debug(NES_DBG_INIT, "Issuing Full Soft reset = 0x%08X\n", u32temp | 0xd); + nes_write32(nesdev->regs+NES_SOFTWARE_RESET, u32temp | 0xd); + + i = 0; + while (((nes_read32(nesdev->regs+NES_SOFTWARE_RESET) & 0x00000040) == 0) && i++ < 10000) + mdelay(1); + if (i > 10000) { + nes_debug(NES_DBG_INIT, "Did not see full soft reset done.\n"); + return 0; + } + + i = 0; + while ((nes_read_indexed(nesdev, NES_IDX_INT_CPU_STATUS) != 0x80) && i++ < 10000) + mdelay(1); + if (i > 10000) { + printk(KERN_ERR PFX "Internal CPU not ready, status = %02X\n", + nes_read_indexed(nesdev, NES_IDX_INT_CPU_STATUS)); + return 0; + } + } + + /* port reset */ + switch (port_count) { + case 1: + u32temp |= 0x00ee0010; + break; + case 2: + u32temp |= 0x00cc0030; + break; + case 4: + u32temp |= 0x00000030; + break; + } + + nes_debug(NES_DBG_INIT, "Issuing Port Soft reset = 0x%08X\n", u32temp | 0xd); + nes_write32(nesdev->regs+NES_SOFTWARE_RESET, u32temp | 0xd); + + i = 0; + while (((nes_read32(nesdev->regs+NES_SOFTWARE_RESET) & 0x00000040) == 0) && i++ < 10000) + mdelay(1); + if (i > 10000) { + nes_debug(NES_DBG_INIT, "Did not see port soft reset done.\n"); + return 0; + } + + /* serdes 0 */ + i = 0; + while (((u32temp = (nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_STATUS0) + & 0x0000000f)) != 0x0000000f) && i++ < 5000) + mdelay(1); + if (i > 5000) { + nes_debug(NES_DBG_INIT, "Serdes 0 not ready, status=%x\n", u32temp); + return 0; + } + + /* serdes 1 */ + if (port_count > 1) { + i = 0; + while (((u32temp = (nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_STATUS1) + & 0x0000000f)) != 0x0000000f) && i++ < 5000) + mdelay(1); + if (i > 5000) { + nes_debug(NES_DBG_INIT, "Serdes 1 not ready, status=%x\n", u32temp); + return 0; + } + } + + return port_count; +} + + +/** + * nes_init_serdes + */ +static int nes_init_serdes(struct nes_device *nesdev, u8 hw_rev, u8 port_count, + struct nes_adapter *nesadapter, u8 OneG_Mode) +{ + int i; + u32 u32temp; + u32 sds; + + if (hw_rev != NE020_REV) { + /* init serdes 0 */ + switch (nesadapter->phy_type[0]) { + case NES_PHY_TYPE_CX4: + if (wide_ppm_offset) + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_CDR_CONTROL0, 0x000FFFAA); + else + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_CDR_CONTROL0, 0x000000FF); + break; + case NES_PHY_TYPE_KR: + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_CDR_CONTROL0, 0x000000FF); + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_EMP0, 0x00000000); + break; + case NES_PHY_TYPE_PUMA_1G: + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_CDR_CONTROL0, 0x000000FF); + sds = nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL0); + sds |= 0x00000100; + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL0, sds); + break; + default: + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_CDR_CONTROL0, 0x000000FF); + break; + } + + if (!OneG_Mode) + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_HIGHZ_LANE_MODE0, 0x11110000); + + if (port_count < 2) + return 0; + + /* init serdes 1 */ + if (!(OneG_Mode && (nesadapter->phy_type[1] != NES_PHY_TYPE_PUMA_1G))) + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_CDR_CONTROL1, 0x000000FF); + + switch (nesadapter->phy_type[1]) { + case NES_PHY_TYPE_ARGUS: + case NES_PHY_TYPE_SFP_D: + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_EMP0, 0x00000000); + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_EMP1, 0x00000000); + break; + case NES_PHY_TYPE_CX4: + if (wide_ppm_offset) + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_CDR_CONTROL1, 0x000FFFAA); + break; + case NES_PHY_TYPE_KR: + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_EMP1, 0x00000000); + break; + case NES_PHY_TYPE_PUMA_1G: + sds = nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1); + sds |= 0x000000100; + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1, sds); + } + if (!OneG_Mode) { + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_HIGHZ_LANE_MODE1, 0x11110000); + sds = nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1); + sds &= 0xFFFFFFBF; + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1, sds); + } + } else { + /* init serdes 0 */ + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL0, 0x00000008); + i = 0; + while (((u32temp = (nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_STATUS0) + & 0x0000000f)) != 0x0000000f) && i++ < 5000) + mdelay(1); + if (i > 5000) { + nes_debug(NES_DBG_PHY, "Init: serdes 0 not ready, status=%x\n", u32temp); + return 1; + } + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_EMP0, 0x000bdef7); + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_DRIVE0, 0x9ce73000); + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_RX_MODE0, 0x0ff00000); + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_RX_SIGDET0, 0x00000000); + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_BYPASS0, 0x00000000); + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_LOOPBACK_CONTROL0, 0x00000000); + if (OneG_Mode) + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_RX_EQ_CONTROL0, 0xf0182222); + else + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_RX_EQ_CONTROL0, 0xf0042222); + + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_CDR_CONTROL0, 0x000000ff); + if (port_count > 1) { + /* init serdes 1 */ + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1, 0x00000048); + i = 0; + while (((u32temp = (nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_STATUS1) + & 0x0000000f)) != 0x0000000f) && (i++ < 5000)) + mdelay(1); + if (i > 5000) { + printk("%s: Init: serdes 1 not ready, status=%x\n", __func__, u32temp); + /* return 1; */ + } + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_EMP1, 0x000bdef7); + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_DRIVE1, 0x9ce73000); + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_RX_MODE1, 0x0ff00000); + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_RX_SIGDET1, 0x00000000); + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_BYPASS1, 0x00000000); + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_LOOPBACK_CONTROL1, 0x00000000); + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_RX_EQ_CONTROL1, 0xf0002222); + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_CDR_CONTROL1, 0x000000ff); + } + } + return 0; +} + + +/** + * nes_init_csr_ne020 + * Initialize registers for ne020 hardware + */ +static void nes_init_csr_ne020(struct nes_device *nesdev, u8 hw_rev, u8 port_count) +{ + u32 u32temp; + + nes_debug(NES_DBG_INIT, "port_count=%d\n", port_count); + + nes_write_indexed(nesdev, 0x000001E4, 0x00000007); + /* nes_write_indexed(nesdev, 0x000001E8, 0x000208C4); */ + nes_write_indexed(nesdev, 0x000001E8, 0x00020874); + nes_write_indexed(nesdev, 0x000001D8, 0x00048002); + /* nes_write_indexed(nesdev, 0x000001D8, 0x0004B002); */ + nes_write_indexed(nesdev, 0x000001FC, 0x00050005); + nes_write_indexed(nesdev, 0x00000600, 0x55555555); + nes_write_indexed(nesdev, 0x00000604, 0x55555555); + + /* TODO: move these MAC register settings to NIC bringup */ + nes_write_indexed(nesdev, 0x00002000, 0x00000001); + nes_write_indexed(nesdev, 0x00002004, 0x00000001); + nes_write_indexed(nesdev, 0x00002008, 0x0000FFFF); + nes_write_indexed(nesdev, 0x0000200C, 0x00000001); + nes_write_indexed(nesdev, 0x00002010, 0x000003c1); + nes_write_indexed(nesdev, 0x0000201C, 0x75345678); + if (port_count > 1) { + nes_write_indexed(nesdev, 0x00002200, 0x00000001); + nes_write_indexed(nesdev, 0x00002204, 0x00000001); + nes_write_indexed(nesdev, 0x00002208, 0x0000FFFF); + nes_write_indexed(nesdev, 0x0000220C, 0x00000001); + nes_write_indexed(nesdev, 0x00002210, 0x000003c1); + nes_write_indexed(nesdev, 0x0000221C, 0x75345678); + nes_write_indexed(nesdev, 0x00000908, 0x20000001); + } + if (port_count > 2) { + nes_write_indexed(nesdev, 0x00002400, 0x00000001); + nes_write_indexed(nesdev, 0x00002404, 0x00000001); + nes_write_indexed(nesdev, 0x00002408, 0x0000FFFF); + nes_write_indexed(nesdev, 0x0000240C, 0x00000001); + nes_write_indexed(nesdev, 0x00002410, 0x000003c1); + nes_write_indexed(nesdev, 0x0000241C, 0x75345678); + nes_write_indexed(nesdev, 0x00000910, 0x20000001); + + nes_write_indexed(nesdev, 0x00002600, 0x00000001); + nes_write_indexed(nesdev, 0x00002604, 0x00000001); + nes_write_indexed(nesdev, 0x00002608, 0x0000FFFF); + nes_write_indexed(nesdev, 0x0000260C, 0x00000001); + nes_write_indexed(nesdev, 0x00002610, 0x000003c1); + nes_write_indexed(nesdev, 0x0000261C, 0x75345678); + nes_write_indexed(nesdev, 0x00000918, 0x20000001); + } + + nes_write_indexed(nesdev, 0x00005000, 0x00018000); + /* nes_write_indexed(nesdev, 0x00005000, 0x00010000); */ + nes_write_indexed(nesdev, NES_IDX_WQM_CONFIG1, (wqm_quanta << 1) | + 0x00000001); + nes_write_indexed(nesdev, 0x00005008, 0x1F1F1F1F); + nes_write_indexed(nesdev, 0x00005010, 0x1F1F1F1F); + nes_write_indexed(nesdev, 0x00005018, 0x1F1F1F1F); + nes_write_indexed(nesdev, 0x00005020, 0x1F1F1F1F); + nes_write_indexed(nesdev, 0x00006090, 0xFFFFFFFF); + + /* TODO: move this to code, get from EEPROM */ + nes_write_indexed(nesdev, 0x00000900, 0x20000001); + nes_write_indexed(nesdev, 0x000060C0, 0x0000028e); + nes_write_indexed(nesdev, 0x000060C8, 0x00000020); + + nes_write_indexed(nesdev, 0x000001EC, 0x7b2625a0); + /* nes_write_indexed(nesdev, 0x000001EC, 0x5f2625a0); */ + + if (hw_rev != NE020_REV) { + u32temp = nes_read_indexed(nesdev, 0x000008e8); + u32temp |= 0x80000000; + nes_write_indexed(nesdev, 0x000008e8, u32temp); + u32temp = nes_read_indexed(nesdev, 0x000021f8); + u32temp &= 0x7fffffff; + u32temp |= 0x7fff0010; + nes_write_indexed(nesdev, 0x000021f8, u32temp); + if (port_count > 1) { + u32temp = nes_read_indexed(nesdev, 0x000023f8); + u32temp &= 0x7fffffff; + u32temp |= 0x7fff0010; + nes_write_indexed(nesdev, 0x000023f8, u32temp); + } + } +} + + +/** + * nes_destroy_adapter - destroy the adapter structure + */ +void nes_destroy_adapter(struct nes_adapter *nesadapter) +{ + struct nes_adapter *tmp_adapter; + + list_for_each_entry(tmp_adapter, &nes_adapter_list, list) { + nes_debug(NES_DBG_SHUTDOWN, "Nes Adapter list entry = 0x%p.\n", + tmp_adapter); + } + + nesadapter->ref_count--; + if (!nesadapter->ref_count) { + if (nesadapter->hw_rev == NE020_REV) { + del_timer(&nesadapter->mh_timer); + } + del_timer(&nesadapter->lc_timer); + + list_del(&nesadapter->list); + kfree(nesadapter); + } +} + + +/** + * nes_init_cqp + */ +int nes_init_cqp(struct nes_device *nesdev) +{ + struct nes_adapter *nesadapter = nesdev->nesadapter; + struct nes_hw_cqp_qp_context *cqp_qp_context; + struct nes_hw_cqp_wqe *cqp_wqe; + struct nes_hw_ceq *ceq; + struct nes_hw_ceq *nic_ceq; + struct nes_hw_aeq *aeq; + void *vmem; + dma_addr_t pmem; + u32 count=0; + u32 cqp_head; + u64 u64temp; + u32 u32temp; + + /* allocate CQP memory */ + /* Need to add max_cq to the aeq size once cq overflow checking is added back */ + /* SQ is 512 byte aligned, others are 256 byte aligned */ + nesdev->cqp_mem_size = 512 + + (sizeof(struct nes_hw_cqp_wqe) * NES_CQP_SQ_SIZE) + + (sizeof(struct nes_hw_cqe) * NES_CCQ_SIZE) + + max(((u32)sizeof(struct nes_hw_ceqe) * NES_CCEQ_SIZE), (u32)256) + + max(((u32)sizeof(struct nes_hw_ceqe) * NES_NIC_CEQ_SIZE), (u32)256) + + (sizeof(struct nes_hw_aeqe) * nesadapter->max_qp) + + sizeof(struct nes_hw_cqp_qp_context); + + nesdev->cqp_vbase = pci_zalloc_consistent(nesdev->pcidev, + nesdev->cqp_mem_size, + &nesdev->cqp_pbase); + if (!nesdev->cqp_vbase) { + nes_debug(NES_DBG_INIT, "Unable to allocate memory for host descriptor rings\n"); + return -ENOMEM; + } + + /* Allocate a twice the number of CQP requests as the SQ size */ + nesdev->nes_cqp_requests = kzalloc(sizeof(struct nes_cqp_request) * + 2 * NES_CQP_SQ_SIZE, GFP_KERNEL); + if (nesdev->nes_cqp_requests == NULL) { + nes_debug(NES_DBG_INIT, "Unable to allocate memory CQP request entries.\n"); + pci_free_consistent(nesdev->pcidev, nesdev->cqp_mem_size, nesdev->cqp.sq_vbase, + nesdev->cqp.sq_pbase); + return -ENOMEM; + } + + nes_debug(NES_DBG_INIT, "Allocated CQP structures at %p (phys = %016lX), size = %u.\n", + nesdev->cqp_vbase, (unsigned long)nesdev->cqp_pbase, nesdev->cqp_mem_size); + + spin_lock_init(&nesdev->cqp.lock); + init_waitqueue_head(&nesdev->cqp.waitq); + + /* Setup Various Structures */ + vmem = (void *)(((unsigned long)nesdev->cqp_vbase + (512 - 1)) & + ~(unsigned long)(512 - 1)); + pmem = (dma_addr_t)(((unsigned long long)nesdev->cqp_pbase + (512 - 1)) & + ~(unsigned long long)(512 - 1)); + + nesdev->cqp.sq_vbase = vmem; + nesdev->cqp.sq_pbase = pmem; + nesdev->cqp.sq_size = NES_CQP_SQ_SIZE; + nesdev->cqp.sq_head = 0; + nesdev->cqp.sq_tail = 0; + nesdev->cqp.qp_id = PCI_FUNC(nesdev->pcidev->devfn); + + vmem += (sizeof(struct nes_hw_cqp_wqe) * nesdev->cqp.sq_size); + pmem += (sizeof(struct nes_hw_cqp_wqe) * nesdev->cqp.sq_size); + + nesdev->ccq.cq_vbase = vmem; + nesdev->ccq.cq_pbase = pmem; + nesdev->ccq.cq_size = NES_CCQ_SIZE; + nesdev->ccq.cq_head = 0; + nesdev->ccq.ce_handler = nes_cqp_ce_handler; + nesdev->ccq.cq_number = PCI_FUNC(nesdev->pcidev->devfn); + + vmem += (sizeof(struct nes_hw_cqe) * nesdev->ccq.cq_size); + pmem += (sizeof(struct nes_hw_cqe) * nesdev->ccq.cq_size); + + nesdev->ceq_index = PCI_FUNC(nesdev->pcidev->devfn); + ceq = &nesadapter->ceq[nesdev->ceq_index]; + ceq->ceq_vbase = vmem; + ceq->ceq_pbase = pmem; + ceq->ceq_size = NES_CCEQ_SIZE; + ceq->ceq_head = 0; + + vmem += max(((u32)sizeof(struct nes_hw_ceqe) * ceq->ceq_size), (u32)256); + pmem += max(((u32)sizeof(struct nes_hw_ceqe) * ceq->ceq_size), (u32)256); + + nesdev->nic_ceq_index = PCI_FUNC(nesdev->pcidev->devfn) + 8; + nic_ceq = &nesadapter->ceq[nesdev->nic_ceq_index]; + nic_ceq->ceq_vbase = vmem; + nic_ceq->ceq_pbase = pmem; + nic_ceq->ceq_size = NES_NIC_CEQ_SIZE; + nic_ceq->ceq_head = 0; + + vmem += max(((u32)sizeof(struct nes_hw_ceqe) * nic_ceq->ceq_size), (u32)256); + pmem += max(((u32)sizeof(struct nes_hw_ceqe) * nic_ceq->ceq_size), (u32)256); + + aeq = &nesadapter->aeq[PCI_FUNC(nesdev->pcidev->devfn)]; + aeq->aeq_vbase = vmem; + aeq->aeq_pbase = pmem; + aeq->aeq_size = nesadapter->max_qp; + aeq->aeq_head = 0; + + /* Setup QP Context */ + vmem += (sizeof(struct nes_hw_aeqe) * aeq->aeq_size); + pmem += (sizeof(struct nes_hw_aeqe) * aeq->aeq_size); + + cqp_qp_context = vmem; + cqp_qp_context->context_words[0] = + cpu_to_le32((PCI_FUNC(nesdev->pcidev->devfn) << 12) + (2 << 10)); + cqp_qp_context->context_words[1] = 0; + cqp_qp_context->context_words[2] = cpu_to_le32((u32)nesdev->cqp.sq_pbase); + cqp_qp_context->context_words[3] = cpu_to_le32(((u64)nesdev->cqp.sq_pbase) >> 32); + + + /* Write the address to Create CQP */ + if ((sizeof(dma_addr_t) > 4)) { + nes_write_indexed(nesdev, + NES_IDX_CREATE_CQP_HIGH + (PCI_FUNC(nesdev->pcidev->devfn) * 8), + ((u64)pmem) >> 32); + } else { + nes_write_indexed(nesdev, + NES_IDX_CREATE_CQP_HIGH + (PCI_FUNC(nesdev->pcidev->devfn) * 8), 0); + } + nes_write_indexed(nesdev, + NES_IDX_CREATE_CQP_LOW + (PCI_FUNC(nesdev->pcidev->devfn) * 8), + (u32)pmem); + + INIT_LIST_HEAD(&nesdev->cqp_avail_reqs); + INIT_LIST_HEAD(&nesdev->cqp_pending_reqs); + + for (count = 0; count < 2*NES_CQP_SQ_SIZE; count++) { + init_waitqueue_head(&nesdev->nes_cqp_requests[count].waitq); + list_add_tail(&nesdev->nes_cqp_requests[count].list, &nesdev->cqp_avail_reqs); + } + + /* Write Create CCQ WQE */ + cqp_head = nesdev->cqp.sq_head++; + cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head]; + nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, + (NES_CQP_CREATE_CQ | NES_CQP_CQ_CEQ_VALID | + NES_CQP_CQ_CHK_OVERFLOW | ((u32)nesdev->ccq.cq_size << 16))); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX, + (nesdev->ccq.cq_number | + ((u32)nesdev->ceq_index << 16))); + u64temp = (u64)nesdev->ccq.cq_pbase; + set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_CQ_WQE_PBL_LOW_IDX, u64temp); + cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_HIGH_IDX] = 0; + u64temp = (unsigned long)&nesdev->ccq; + cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_LOW_IDX] = + cpu_to_le32((u32)(u64temp >> 1)); + cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_HIGH_IDX] = + cpu_to_le32(((u32)((u64temp) >> 33)) & 0x7FFFFFFF); + cqp_wqe->wqe_words[NES_CQP_CQ_WQE_DOORBELL_INDEX_HIGH_IDX] = 0; + + /* Write Create CEQ WQE */ + cqp_head = nesdev->cqp.sq_head++; + cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head]; + nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, + (NES_CQP_CREATE_CEQ + ((u32)nesdev->ceq_index << 8))); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_CEQ_WQE_ELEMENT_COUNT_IDX, ceq->ceq_size); + u64temp = (u64)ceq->ceq_pbase; + set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_CQ_WQE_PBL_LOW_IDX, u64temp); + + /* Write Create AEQ WQE */ + cqp_head = nesdev->cqp.sq_head++; + cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head]; + nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, + (NES_CQP_CREATE_AEQ + ((u32)PCI_FUNC(nesdev->pcidev->devfn) << 8))); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_AEQ_WQE_ELEMENT_COUNT_IDX, aeq->aeq_size); + u64temp = (u64)aeq->aeq_pbase; + set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_CQ_WQE_PBL_LOW_IDX, u64temp); + + /* Write Create NIC CEQ WQE */ + cqp_head = nesdev->cqp.sq_head++; + cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head]; + nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, + (NES_CQP_CREATE_CEQ + ((u32)nesdev->nic_ceq_index << 8))); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_CEQ_WQE_ELEMENT_COUNT_IDX, nic_ceq->ceq_size); + u64temp = (u64)nic_ceq->ceq_pbase; + set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_CQ_WQE_PBL_LOW_IDX, u64temp); + + /* Poll until CCQP done */ + count = 0; + do { + if (count++ > 1000) { + printk(KERN_ERR PFX "Error creating CQP\n"); + pci_free_consistent(nesdev->pcidev, nesdev->cqp_mem_size, + nesdev->cqp_vbase, nesdev->cqp_pbase); + return -1; + } + udelay(10); + } while (!(nes_read_indexed(nesdev, + NES_IDX_QP_CONTROL + (PCI_FUNC(nesdev->pcidev->devfn) * 8)) & (1 << 8))); + + nes_debug(NES_DBG_INIT, "CQP Status = 0x%08X\n", nes_read_indexed(nesdev, + NES_IDX_QP_CONTROL+(PCI_FUNC(nesdev->pcidev->devfn)*8))); + + u32temp = 0x04800000; + nes_write32(nesdev->regs+NES_WQE_ALLOC, u32temp | nesdev->cqp.qp_id); + + /* wait for the CCQ, CEQ, and AEQ to get created */ + count = 0; + do { + if (count++ > 1000) { + printk(KERN_ERR PFX "Error creating CCQ, CEQ, and AEQ\n"); + pci_free_consistent(nesdev->pcidev, nesdev->cqp_mem_size, + nesdev->cqp_vbase, nesdev->cqp_pbase); + return -1; + } + udelay(10); + } while (((nes_read_indexed(nesdev, + NES_IDX_QP_CONTROL+(PCI_FUNC(nesdev->pcidev->devfn)*8)) & (15<<8)) != (15<<8))); + + /* dump the QP status value */ + nes_debug(NES_DBG_INIT, "QP Status = 0x%08X\n", nes_read_indexed(nesdev, + NES_IDX_QP_CONTROL+(PCI_FUNC(nesdev->pcidev->devfn)*8))); + + nesdev->cqp.sq_tail++; + + return 0; +} + + +/** + * nes_destroy_cqp + */ +int nes_destroy_cqp(struct nes_device *nesdev) +{ + struct nes_hw_cqp_wqe *cqp_wqe; + u32 count = 0; + u32 cqp_head; + unsigned long flags; + + do { + if (count++ > 1000) + break; + udelay(10); + } while (!(nesdev->cqp.sq_head == nesdev->cqp.sq_tail)); + + /* Reset CCQ */ + nes_write32(nesdev->regs+NES_CQE_ALLOC, NES_CQE_ALLOC_RESET | + nesdev->ccq.cq_number); + + /* Disable device interrupts */ + nes_write32(nesdev->regs+NES_INT_MASK, 0x7fffffff); + + spin_lock_irqsave(&nesdev->cqp.lock, flags); + + /* Destroy the AEQ */ + cqp_head = nesdev->cqp.sq_head++; + nesdev->cqp.sq_head &= nesdev->cqp.sq_size-1; + cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head]; + cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] = cpu_to_le32(NES_CQP_DESTROY_AEQ | + ((u32)PCI_FUNC(nesdev->pcidev->devfn) << 8)); + cqp_wqe->wqe_words[NES_CQP_WQE_COMP_CTX_HIGH_IDX] = 0; + + /* Destroy the NIC CEQ */ + cqp_head = nesdev->cqp.sq_head++; + nesdev->cqp.sq_head &= nesdev->cqp.sq_size-1; + cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head]; + cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] = cpu_to_le32(NES_CQP_DESTROY_CEQ | + ((u32)nesdev->nic_ceq_index << 8)); + + /* Destroy the CEQ */ + cqp_head = nesdev->cqp.sq_head++; + nesdev->cqp.sq_head &= nesdev->cqp.sq_size-1; + cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head]; + cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] = cpu_to_le32(NES_CQP_DESTROY_CEQ | + (nesdev->ceq_index << 8)); + + /* Destroy the CCQ */ + cqp_head = nesdev->cqp.sq_head++; + nesdev->cqp.sq_head &= nesdev->cqp.sq_size-1; + cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head]; + cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] = cpu_to_le32(NES_CQP_DESTROY_CQ); + cqp_wqe->wqe_words[NES_CQP_WQE_ID_IDX] = cpu_to_le32(nesdev->ccq.cq_number | + ((u32)nesdev->ceq_index << 16)); + + /* Destroy CQP */ + cqp_head = nesdev->cqp.sq_head++; + nesdev->cqp.sq_head &= nesdev->cqp.sq_size-1; + cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head]; + cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] = cpu_to_le32(NES_CQP_DESTROY_QP | + NES_CQP_QP_TYPE_CQP); + cqp_wqe->wqe_words[NES_CQP_WQE_ID_IDX] = cpu_to_le32(nesdev->cqp.qp_id); + + barrier(); + /* Ring doorbell (5 WQEs) */ + nes_write32(nesdev->regs+NES_WQE_ALLOC, 0x05800000 | nesdev->cqp.qp_id); + + spin_unlock_irqrestore(&nesdev->cqp.lock, flags); + + /* wait for the CCQ, CEQ, and AEQ to get destroyed */ + count = 0; + do { + if (count++ > 1000) { + printk(KERN_ERR PFX "Function%d: Error destroying CCQ, CEQ, and AEQ\n", + PCI_FUNC(nesdev->pcidev->devfn)); + break; + } + udelay(10); + } while (((nes_read_indexed(nesdev, + NES_IDX_QP_CONTROL + (PCI_FUNC(nesdev->pcidev->devfn)*8)) & (15 << 8)) != 0)); + + /* dump the QP status value */ + nes_debug(NES_DBG_SHUTDOWN, "Function%d: QP Status = 0x%08X\n", + PCI_FUNC(nesdev->pcidev->devfn), + nes_read_indexed(nesdev, + NES_IDX_QP_CONTROL+(PCI_FUNC(nesdev->pcidev->devfn)*8))); + + kfree(nesdev->nes_cqp_requests); + + /* Free the control structures */ + pci_free_consistent(nesdev->pcidev, nesdev->cqp_mem_size, nesdev->cqp.sq_vbase, + nesdev->cqp.sq_pbase); + + return 0; +} + + +/** + * nes_init_1g_phy + */ +static int nes_init_1g_phy(struct nes_device *nesdev, u8 phy_type, u8 phy_index) +{ + u32 counter = 0; + u16 phy_data; + int ret = 0; + + nes_read_1G_phy_reg(nesdev, 1, phy_index, &phy_data); + nes_write_1G_phy_reg(nesdev, 23, phy_index, 0xb000); + + /* Reset the PHY */ + nes_write_1G_phy_reg(nesdev, 0, phy_index, 0x8000); + udelay(100); + counter = 0; + do { + nes_read_1G_phy_reg(nesdev, 0, phy_index, &phy_data); + if (counter++ > 100) { + ret = -1; + break; + } + } while (phy_data & 0x8000); + + /* Setting no phy loopback */ + phy_data &= 0xbfff; + phy_data |= 0x1140; + nes_write_1G_phy_reg(nesdev, 0, phy_index, phy_data); + nes_read_1G_phy_reg(nesdev, 0, phy_index, &phy_data); + nes_read_1G_phy_reg(nesdev, 0x17, phy_index, &phy_data); + nes_read_1G_phy_reg(nesdev, 0x1e, phy_index, &phy_data); + + /* Setting the interrupt mask */ + nes_read_1G_phy_reg(nesdev, 0x19, phy_index, &phy_data); + nes_write_1G_phy_reg(nesdev, 0x19, phy_index, 0xffee); + nes_read_1G_phy_reg(nesdev, 0x19, phy_index, &phy_data); + + /* turning on flow control */ + nes_read_1G_phy_reg(nesdev, 4, phy_index, &phy_data); + nes_write_1G_phy_reg(nesdev, 4, phy_index, (phy_data & ~(0x03E0)) | 0xc00); + nes_read_1G_phy_reg(nesdev, 4, phy_index, &phy_data); + + /* Clear Half duplex */ + nes_read_1G_phy_reg(nesdev, 9, phy_index, &phy_data); + nes_write_1G_phy_reg(nesdev, 9, phy_index, phy_data & ~(0x0100)); + nes_read_1G_phy_reg(nesdev, 9, phy_index, &phy_data); + + nes_read_1G_phy_reg(nesdev, 0, phy_index, &phy_data); + nes_write_1G_phy_reg(nesdev, 0, phy_index, phy_data | 0x0300); + + return ret; +} + + +/** + * nes_init_2025_phy + */ +static int nes_init_2025_phy(struct nes_device *nesdev, u8 phy_type, u8 phy_index) +{ + u32 temp_phy_data = 0; + u32 temp_phy_data2 = 0; + u32 counter = 0; + u32 sds; + u32 mac_index = nesdev->mac_index; + int ret = 0; + unsigned int first_attempt = 1; + + /* Check firmware heartbeat */ + nes_read_10G_phy_reg(nesdev, phy_index, 0x3, 0xd7ee); + temp_phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL); + udelay(1500); + nes_read_10G_phy_reg(nesdev, phy_index, 0x3, 0xd7ee); + temp_phy_data2 = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL); + + if (temp_phy_data != temp_phy_data2) { + nes_read_10G_phy_reg(nesdev, phy_index, 0x3, 0xd7fd); + temp_phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL); + if ((temp_phy_data & 0xff) > 0x20) + return 0; + printk(PFX "Reinitialize external PHY\n"); + } + + /* no heartbeat, configure the PHY */ + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0x0000, 0x8000); + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc300, 0x0000); + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc316, 0x000A); + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc318, 0x0052); + + switch (phy_type) { + case NES_PHY_TYPE_ARGUS: + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc316, 0x000A); + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc318, 0x0052); + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc302, 0x000C); + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc319, 0x0008); + nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0x0027, 0x0001); + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc31a, 0x0098); + nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0x0026, 0x0E00); + + /* setup LEDs */ + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xd006, 0x0007); + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xd007, 0x000A); + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xd008, 0x0009); + break; + + case NES_PHY_TYPE_SFP_D: + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc316, 0x000A); + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc318, 0x0052); + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc302, 0x0004); + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc319, 0x0038); + nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0x0027, 0x0013); + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc31a, 0x0098); + nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0x0026, 0x0E00); + + /* setup LEDs */ + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xd006, 0x0007); + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xd007, 0x000A); + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xd008, 0x0009); + break; + + case NES_PHY_TYPE_KR: + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc316, 0x000A); + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc318, 0x0052); + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc302, 0x000C); + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc319, 0x0010); + nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0x0027, 0x0013); + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc31a, 0x0080); + nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0x0026, 0x0E00); + + /* setup LEDs */ + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xd006, 0x000B); + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xd007, 0x0003); + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xd008, 0x0004); + + nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0x0022, 0x406D); + nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0x0023, 0x0020); + break; + } + + nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0x0028, 0xA528); + + /* Bring PHY out of reset */ + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc300, 0x0002); + + /* Check for heartbeat */ + counter = 0; + mdelay(690); + nes_read_10G_phy_reg(nesdev, phy_index, 0x3, 0xd7ee); + temp_phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL); + do { + if (counter++ > 150) { + printk(PFX "No PHY heartbeat\n"); + break; + } + mdelay(1); + nes_read_10G_phy_reg(nesdev, phy_index, 0x3, 0xd7ee); + temp_phy_data2 = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL); + } while ((temp_phy_data2 == temp_phy_data)); + + /* wait for tracking */ + counter = 0; + do { + nes_read_10G_phy_reg(nesdev, phy_index, 0x3, 0xd7fd); + temp_phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL); + if (counter++ > 300) { + if (((temp_phy_data & 0xff) == 0x0) && first_attempt) { + first_attempt = 0; + counter = 0; + /* reset AMCC PHY and try again */ + nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0xe854, 0x00c0); + nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0xe854, 0x0040); + continue; + } else { + ret = 1; + break; + } + } + mdelay(10); + } while ((temp_phy_data & 0xff) < 0x30); + + /* setup signal integrity */ + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xd003, 0x0000); + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xF00D, 0x00FE); + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xF00E, 0x0032); + if (phy_type == NES_PHY_TYPE_KR) { + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xF00F, 0x000C); + } else { + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xF00F, 0x0002); + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc314, 0x0063); + } + + /* reset serdes */ + sds = nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL0 + mac_index * 0x200); + sds |= 0x1; + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL0 + mac_index * 0x200, sds); + sds &= 0xfffffffe; + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL0 + mac_index * 0x200, sds); + + counter = 0; + while (((nes_read32(nesdev->regs + NES_SOFTWARE_RESET) & 0x00000040) != 0x00000040) + && (counter++ < 5000)) + ; + + return ret; +} + + +/** + * nes_init_phy + */ +int nes_init_phy(struct nes_device *nesdev) +{ + struct nes_adapter *nesadapter = nesdev->nesadapter; + u32 mac_index = nesdev->mac_index; + u32 tx_config = 0; + unsigned long flags; + u8 phy_type = nesadapter->phy_type[mac_index]; + u8 phy_index = nesadapter->phy_index[mac_index]; + int ret = 0; + + tx_config = nes_read_indexed(nesdev, NES_IDX_MAC_TX_CONFIG); + if (phy_type == NES_PHY_TYPE_1G) { + /* setup 1G MDIO operation */ + tx_config &= 0xFFFFFFE3; + tx_config |= 0x04; + } else { + /* setup 10G MDIO operation */ + tx_config &= 0xFFFFFFE3; + tx_config |= 0x1D; + } + nes_write_indexed(nesdev, NES_IDX_MAC_TX_CONFIG, tx_config); + + spin_lock_irqsave(&nesdev->nesadapter->phy_lock, flags); + + switch (phy_type) { + case NES_PHY_TYPE_1G: + ret = nes_init_1g_phy(nesdev, phy_type, phy_index); + break; + case NES_PHY_TYPE_ARGUS: + case NES_PHY_TYPE_SFP_D: + case NES_PHY_TYPE_KR: + ret = nes_init_2025_phy(nesdev, phy_type, phy_index); + break; + } + + spin_unlock_irqrestore(&nesdev->nesadapter->phy_lock, flags); + + return ret; +} + + +/** + * nes_replenish_nic_rq + */ +static void nes_replenish_nic_rq(struct nes_vnic *nesvnic) +{ + unsigned long flags; + dma_addr_t bus_address; + struct sk_buff *skb; + struct nes_hw_nic_rq_wqe *nic_rqe; + struct nes_hw_nic *nesnic; + struct nes_device *nesdev; + struct nes_rskb_cb *cb; + u32 rx_wqes_posted = 0; + + nesnic = &nesvnic->nic; + nesdev = nesvnic->nesdev; + spin_lock_irqsave(&nesnic->rq_lock, flags); + if (nesnic->replenishing_rq !=0) { + if (((nesnic->rq_size-1) == atomic_read(&nesvnic->rx_skbs_needed)) && + (atomic_read(&nesvnic->rx_skb_timer_running) == 0)) { + atomic_set(&nesvnic->rx_skb_timer_running, 1); + spin_unlock_irqrestore(&nesnic->rq_lock, flags); + nesvnic->rq_wqes_timer.expires = jiffies + (HZ/2); /* 1/2 second */ + add_timer(&nesvnic->rq_wqes_timer); + } else + spin_unlock_irqrestore(&nesnic->rq_lock, flags); + return; + } + nesnic->replenishing_rq = 1; + spin_unlock_irqrestore(&nesnic->rq_lock, flags); + do { + skb = dev_alloc_skb(nesvnic->max_frame_size); + if (skb) { + skb->dev = nesvnic->netdev; + + bus_address = pci_map_single(nesdev->pcidev, + skb->data, nesvnic->max_frame_size, PCI_DMA_FROMDEVICE); + cb = (struct nes_rskb_cb *)&skb->cb[0]; + cb->busaddr = bus_address; + cb->maplen = nesvnic->max_frame_size; + + nic_rqe = &nesnic->rq_vbase[nesvnic->nic.rq_head]; + nic_rqe->wqe_words[NES_NIC_RQ_WQE_LENGTH_1_0_IDX] = + cpu_to_le32(nesvnic->max_frame_size); + nic_rqe->wqe_words[NES_NIC_RQ_WQE_LENGTH_3_2_IDX] = 0; + nic_rqe->wqe_words[NES_NIC_RQ_WQE_FRAG0_LOW_IDX] = + cpu_to_le32((u32)bus_address); + nic_rqe->wqe_words[NES_NIC_RQ_WQE_FRAG0_HIGH_IDX] = + cpu_to_le32((u32)((u64)bus_address >> 32)); + nesnic->rx_skb[nesnic->rq_head] = skb; + nesnic->rq_head++; + nesnic->rq_head &= nesnic->rq_size - 1; + atomic_dec(&nesvnic->rx_skbs_needed); + barrier(); + if (++rx_wqes_posted == 255) { + nes_write32(nesdev->regs+NES_WQE_ALLOC, (rx_wqes_posted << 24) | nesnic->qp_id); + rx_wqes_posted = 0; + } + } else { + spin_lock_irqsave(&nesnic->rq_lock, flags); + if (((nesnic->rq_size-1) == atomic_read(&nesvnic->rx_skbs_needed)) && + (atomic_read(&nesvnic->rx_skb_timer_running) == 0)) { + atomic_set(&nesvnic->rx_skb_timer_running, 1); + spin_unlock_irqrestore(&nesnic->rq_lock, flags); + nesvnic->rq_wqes_timer.expires = jiffies + (HZ/2); /* 1/2 second */ + add_timer(&nesvnic->rq_wqes_timer); + } else + spin_unlock_irqrestore(&nesnic->rq_lock, flags); + break; + } + } while (atomic_read(&nesvnic->rx_skbs_needed)); + barrier(); + if (rx_wqes_posted) + nes_write32(nesdev->regs+NES_WQE_ALLOC, (rx_wqes_posted << 24) | nesnic->qp_id); + nesnic->replenishing_rq = 0; +} + + +/** + * nes_rq_wqes_timeout + */ +static void nes_rq_wqes_timeout(unsigned long parm) +{ + struct nes_vnic *nesvnic = (struct nes_vnic *)parm; + printk("%s: Timer fired.\n", __func__); + atomic_set(&nesvnic->rx_skb_timer_running, 0); + if (atomic_read(&nesvnic->rx_skbs_needed)) + nes_replenish_nic_rq(nesvnic); +} + + +static int nes_lro_get_skb_hdr(struct sk_buff *skb, void **iphdr, + void **tcph, u64 *hdr_flags, void *priv) +{ + unsigned int ip_len; + struct iphdr *iph; + skb_reset_network_header(skb); + iph = ip_hdr(skb); + if (iph->protocol != IPPROTO_TCP) + return -1; + ip_len = ip_hdrlen(skb); + skb_set_transport_header(skb, ip_len); + *tcph = tcp_hdr(skb); + + *hdr_flags = LRO_IPV4 | LRO_TCP; + *iphdr = iph; + return 0; +} + + +/** + * nes_init_nic_qp + */ +int nes_init_nic_qp(struct nes_device *nesdev, struct net_device *netdev) +{ + struct nes_hw_cqp_wqe *cqp_wqe; + struct nes_hw_nic_sq_wqe *nic_sqe; + struct nes_hw_nic_qp_context *nic_context; + struct sk_buff *skb; + struct nes_hw_nic_rq_wqe *nic_rqe; + struct nes_vnic *nesvnic = netdev_priv(netdev); + unsigned long flags; + void *vmem; + dma_addr_t pmem; + u64 u64temp; + int ret; + u32 cqp_head; + u32 counter; + u32 wqe_count; + struct nes_rskb_cb *cb; + u8 jumbomode=0; + + /* Allocate fragment, SQ, RQ, and CQ; Reuse CEQ based on the PCI function */ + nesvnic->nic_mem_size = 256 + + (NES_NIC_WQ_SIZE * sizeof(struct nes_first_frag)) + + (NES_NIC_WQ_SIZE * sizeof(struct nes_hw_nic_sq_wqe)) + + (NES_NIC_WQ_SIZE * sizeof(struct nes_hw_nic_rq_wqe)) + + (NES_NIC_WQ_SIZE * 2 * sizeof(struct nes_hw_nic_cqe)) + + sizeof(struct nes_hw_nic_qp_context); + + nesvnic->nic_vbase = pci_zalloc_consistent(nesdev->pcidev, + nesvnic->nic_mem_size, + &nesvnic->nic_pbase); + if (!nesvnic->nic_vbase) { + nes_debug(NES_DBG_INIT, "Unable to allocate memory for NIC host descriptor rings\n"); + return -ENOMEM; + } + nes_debug(NES_DBG_INIT, "Allocated NIC QP structures at %p (phys = %016lX), size = %u.\n", + nesvnic->nic_vbase, (unsigned long)nesvnic->nic_pbase, nesvnic->nic_mem_size); + + vmem = (void *)(((unsigned long)nesvnic->nic_vbase + (256 - 1)) & + ~(unsigned long)(256 - 1)); + pmem = (dma_addr_t)(((unsigned long long)nesvnic->nic_pbase + (256 - 1)) & + ~(unsigned long long)(256 - 1)); + + /* Setup the first Fragment buffers */ + nesvnic->nic.first_frag_vbase = vmem; + + for (counter = 0; counter < NES_NIC_WQ_SIZE; counter++) { + nesvnic->nic.frag_paddr[counter] = pmem; + pmem += sizeof(struct nes_first_frag); + } + + /* setup the SQ */ + vmem += (NES_NIC_WQ_SIZE * sizeof(struct nes_first_frag)); + + nesvnic->nic.sq_vbase = (void *)vmem; + nesvnic->nic.sq_pbase = pmem; + nesvnic->nic.sq_head = 0; + nesvnic->nic.sq_tail = 0; + nesvnic->nic.sq_size = NES_NIC_WQ_SIZE; + for (counter = 0; counter < NES_NIC_WQ_SIZE; counter++) { + nic_sqe = &nesvnic->nic.sq_vbase[counter]; + nic_sqe->wqe_words[NES_NIC_SQ_WQE_MISC_IDX] = + cpu_to_le32(NES_NIC_SQ_WQE_DISABLE_CHKSUM | + NES_NIC_SQ_WQE_COMPLETION); + nic_sqe->wqe_words[NES_NIC_SQ_WQE_LENGTH_0_TAG_IDX] = + cpu_to_le32((u32)NES_FIRST_FRAG_SIZE << 16); + nic_sqe->wqe_words[NES_NIC_SQ_WQE_FRAG0_LOW_IDX] = + cpu_to_le32((u32)nesvnic->nic.frag_paddr[counter]); + nic_sqe->wqe_words[NES_NIC_SQ_WQE_FRAG0_HIGH_IDX] = + cpu_to_le32((u32)((u64)nesvnic->nic.frag_paddr[counter] >> 32)); + } + + nesvnic->get_cqp_request = nes_get_cqp_request; + nesvnic->post_cqp_request = nes_post_cqp_request; + nesvnic->mcrq_mcast_filter = NULL; + + spin_lock_init(&nesvnic->nic.rq_lock); + + /* setup the RQ */ + vmem += (NES_NIC_WQ_SIZE * sizeof(struct nes_hw_nic_sq_wqe)); + pmem += (NES_NIC_WQ_SIZE * sizeof(struct nes_hw_nic_sq_wqe)); + + + nesvnic->nic.rq_vbase = vmem; + nesvnic->nic.rq_pbase = pmem; + nesvnic->nic.rq_head = 0; + nesvnic->nic.rq_tail = 0; + nesvnic->nic.rq_size = NES_NIC_WQ_SIZE; + + /* setup the CQ */ + vmem += (NES_NIC_WQ_SIZE * sizeof(struct nes_hw_nic_rq_wqe)); + pmem += (NES_NIC_WQ_SIZE * sizeof(struct nes_hw_nic_rq_wqe)); + + if (nesdev->nesadapter->netdev_count > 2) + nesvnic->mcrq_qp_id = nesvnic->nic_index + 32; + else + nesvnic->mcrq_qp_id = nesvnic->nic.qp_id + 4; + + nesvnic->nic_cq.cq_vbase = vmem; + nesvnic->nic_cq.cq_pbase = pmem; + nesvnic->nic_cq.cq_head = 0; + nesvnic->nic_cq.cq_size = NES_NIC_WQ_SIZE * 2; + + nesvnic->nic_cq.ce_handler = nes_nic_napi_ce_handler; + + /* Send CreateCQ request to CQP */ + spin_lock_irqsave(&nesdev->cqp.lock, flags); + cqp_head = nesdev->cqp.sq_head; + + cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head]; + nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + + cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] = cpu_to_le32( + NES_CQP_CREATE_CQ | NES_CQP_CQ_CEQ_VALID | + ((u32)nesvnic->nic_cq.cq_size << 16)); + cqp_wqe->wqe_words[NES_CQP_WQE_ID_IDX] = cpu_to_le32( + nesvnic->nic_cq.cq_number | ((u32)nesdev->nic_ceq_index << 16)); + u64temp = (u64)nesvnic->nic_cq.cq_pbase; + set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_CQ_WQE_PBL_LOW_IDX, u64temp); + cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_HIGH_IDX] = 0; + u64temp = (unsigned long)&nesvnic->nic_cq; + cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_LOW_IDX] = cpu_to_le32((u32)(u64temp >> 1)); + cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_HIGH_IDX] = + cpu_to_le32(((u32)((u64temp) >> 33)) & 0x7FFFFFFF); + cqp_wqe->wqe_words[NES_CQP_CQ_WQE_DOORBELL_INDEX_HIGH_IDX] = 0; + if (++cqp_head >= nesdev->cqp.sq_size) + cqp_head = 0; + cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head]; + nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + + /* Send CreateQP request to CQP */ + nic_context = (void *)(&nesvnic->nic_cq.cq_vbase[nesvnic->nic_cq.cq_size]); + nic_context->context_words[NES_NIC_CTX_MISC_IDX] = + cpu_to_le32((u32)NES_NIC_CTX_SIZE | + ((u32)PCI_FUNC(nesdev->pcidev->devfn) << 12)); + nes_debug(NES_DBG_INIT, "RX_WINDOW_BUFFER_PAGE_TABLE_SIZE = 0x%08X, RX_WINDOW_BUFFER_SIZE = 0x%08X\n", + nes_read_indexed(nesdev, NES_IDX_RX_WINDOW_BUFFER_PAGE_TABLE_SIZE), + nes_read_indexed(nesdev, NES_IDX_RX_WINDOW_BUFFER_SIZE)); + if (nes_read_indexed(nesdev, NES_IDX_RX_WINDOW_BUFFER_SIZE) != 0) { + nic_context->context_words[NES_NIC_CTX_MISC_IDX] |= cpu_to_le32(NES_NIC_BACK_STORE); + } + + u64temp = (u64)nesvnic->nic.sq_pbase; + nic_context->context_words[NES_NIC_CTX_SQ_LOW_IDX] = cpu_to_le32((u32)u64temp); + nic_context->context_words[NES_NIC_CTX_SQ_HIGH_IDX] = cpu_to_le32((u32)(u64temp >> 32)); + u64temp = (u64)nesvnic->nic.rq_pbase; + nic_context->context_words[NES_NIC_CTX_RQ_LOW_IDX] = cpu_to_le32((u32)u64temp); + nic_context->context_words[NES_NIC_CTX_RQ_HIGH_IDX] = cpu_to_le32((u32)(u64temp >> 32)); + + cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] = cpu_to_le32(NES_CQP_CREATE_QP | + NES_CQP_QP_TYPE_NIC); + cqp_wqe->wqe_words[NES_CQP_WQE_ID_IDX] = cpu_to_le32(nesvnic->nic.qp_id); + u64temp = (u64)nesvnic->nic_cq.cq_pbase + + (nesvnic->nic_cq.cq_size * sizeof(struct nes_hw_nic_cqe)); + set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_QP_WQE_CONTEXT_LOW_IDX, u64temp); + + if (++cqp_head >= nesdev->cqp.sq_size) + cqp_head = 0; + nesdev->cqp.sq_head = cqp_head; + + barrier(); + + /* Ring doorbell (2 WQEs) */ + nes_write32(nesdev->regs+NES_WQE_ALLOC, 0x02800000 | nesdev->cqp.qp_id); + + spin_unlock_irqrestore(&nesdev->cqp.lock, flags); + nes_debug(NES_DBG_INIT, "Waiting for create NIC QP%u to complete.\n", + nesvnic->nic.qp_id); + + ret = wait_event_timeout(nesdev->cqp.waitq, (nesdev->cqp.sq_tail == cqp_head), + NES_EVENT_TIMEOUT); + nes_debug(NES_DBG_INIT, "Create NIC QP%u completed, wait_event_timeout ret = %u.\n", + nesvnic->nic.qp_id, ret); + if (!ret) { + nes_debug(NES_DBG_INIT, "NIC QP%u create timeout expired\n", nesvnic->nic.qp_id); + pci_free_consistent(nesdev->pcidev, nesvnic->nic_mem_size, nesvnic->nic_vbase, + nesvnic->nic_pbase); + return -EIO; + } + + /* Populate the RQ */ + for (counter = 0; counter < (NES_NIC_WQ_SIZE - 1); counter++) { + skb = dev_alloc_skb(nesvnic->max_frame_size); + if (!skb) { + nes_debug(NES_DBG_INIT, "%s: out of memory for receive skb\n", netdev->name); + + nes_destroy_nic_qp(nesvnic); + return -ENOMEM; + } + + skb->dev = netdev; + + pmem = pci_map_single(nesdev->pcidev, skb->data, + nesvnic->max_frame_size, PCI_DMA_FROMDEVICE); + cb = (struct nes_rskb_cb *)&skb->cb[0]; + cb->busaddr = pmem; + cb->maplen = nesvnic->max_frame_size; + + nic_rqe = &nesvnic->nic.rq_vbase[counter]; + nic_rqe->wqe_words[NES_NIC_RQ_WQE_LENGTH_1_0_IDX] = cpu_to_le32(nesvnic->max_frame_size); + nic_rqe->wqe_words[NES_NIC_RQ_WQE_LENGTH_3_2_IDX] = 0; + nic_rqe->wqe_words[NES_NIC_RQ_WQE_FRAG0_LOW_IDX] = cpu_to_le32((u32)pmem); + nic_rqe->wqe_words[NES_NIC_RQ_WQE_FRAG0_HIGH_IDX] = cpu_to_le32((u32)((u64)pmem >> 32)); + nesvnic->nic.rx_skb[counter] = skb; + } + + wqe_count = NES_NIC_WQ_SIZE - 1; + nesvnic->nic.rq_head = wqe_count; + barrier(); + do { + counter = min(wqe_count, ((u32)255)); + wqe_count -= counter; + nes_write32(nesdev->regs+NES_WQE_ALLOC, (counter << 24) | nesvnic->nic.qp_id); + } while (wqe_count); + init_timer(&nesvnic->rq_wqes_timer); + nesvnic->rq_wqes_timer.function = nes_rq_wqes_timeout; + nesvnic->rq_wqes_timer.data = (unsigned long)nesvnic; + nes_debug(NES_DBG_INIT, "NAPI support Enabled\n"); + if (nesdev->nesadapter->et_use_adaptive_rx_coalesce) + { + nes_nic_init_timer(nesdev); + if (netdev->mtu > 1500) + jumbomode = 1; + nes_nic_init_timer_defaults(nesdev, jumbomode); + } + if ((nesdev->nesadapter->allow_unaligned_fpdus) && + (nes_init_mgt_qp(nesdev, netdev, nesvnic))) { + nes_debug(NES_DBG_INIT, "%s: Out of memory for pau nic\n", netdev->name); + nes_destroy_nic_qp(nesvnic); + return -ENOMEM; + } + + nesvnic->lro_mgr.max_aggr = nes_lro_max_aggr; + nesvnic->lro_mgr.max_desc = NES_MAX_LRO_DESCRIPTORS; + nesvnic->lro_mgr.lro_arr = nesvnic->lro_desc; + nesvnic->lro_mgr.get_skb_header = nes_lro_get_skb_hdr; + nesvnic->lro_mgr.features = LRO_F_NAPI | LRO_F_EXTRACT_VLAN_ID; + nesvnic->lro_mgr.dev = netdev; + nesvnic->lro_mgr.ip_summed = CHECKSUM_UNNECESSARY; + nesvnic->lro_mgr.ip_summed_aggr = CHECKSUM_UNNECESSARY; + return 0; +} + + +/** + * nes_destroy_nic_qp + */ +void nes_destroy_nic_qp(struct nes_vnic *nesvnic) +{ + u64 u64temp; + dma_addr_t bus_address; + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_hw_cqp_wqe *cqp_wqe; + struct nes_hw_nic_sq_wqe *nic_sqe; + __le16 *wqe_fragment_length; + u16 wqe_fragment_index; + u32 cqp_head; + u32 wqm_cfg0; + unsigned long flags; + struct sk_buff *rx_skb; + struct nes_rskb_cb *cb; + int ret; + + if (nesdev->nesadapter->allow_unaligned_fpdus) + nes_destroy_mgt(nesvnic); + + /* clear wqe stall before destroying NIC QP */ + wqm_cfg0 = nes_read_indexed(nesdev, NES_IDX_WQM_CONFIG0); + nes_write_indexed(nesdev, NES_IDX_WQM_CONFIG0, wqm_cfg0 & 0xFFFF7FFF); + + /* Free remaining NIC receive buffers */ + while (nesvnic->nic.rq_head != nesvnic->nic.rq_tail) { + rx_skb = nesvnic->nic.rx_skb[nesvnic->nic.rq_tail]; + cb = (struct nes_rskb_cb *)&rx_skb->cb[0]; + pci_unmap_single(nesdev->pcidev, cb->busaddr, cb->maplen, + PCI_DMA_FROMDEVICE); + + dev_kfree_skb(nesvnic->nic.rx_skb[nesvnic->nic.rq_tail++]); + nesvnic->nic.rq_tail &= (nesvnic->nic.rq_size - 1); + } + + /* Free remaining NIC transmit buffers */ + while (nesvnic->nic.sq_head != nesvnic->nic.sq_tail) { + nic_sqe = &nesvnic->nic.sq_vbase[nesvnic->nic.sq_tail]; + wqe_fragment_index = 1; + wqe_fragment_length = (__le16 *) + &nic_sqe->wqe_words[NES_NIC_SQ_WQE_LENGTH_0_TAG_IDX]; + /* bump past the vlan tag */ + wqe_fragment_length++; + if (le16_to_cpu(wqe_fragment_length[wqe_fragment_index]) != 0) { + u64temp = (u64)le32_to_cpu( + nic_sqe->wqe_words[NES_NIC_SQ_WQE_FRAG0_LOW_IDX+ + wqe_fragment_index*2]); + u64temp += ((u64)le32_to_cpu( + nic_sqe->wqe_words[NES_NIC_SQ_WQE_FRAG0_HIGH_IDX + + wqe_fragment_index*2]))<<32; + bus_address = (dma_addr_t)u64temp; + if (test_and_clear_bit(nesvnic->nic.sq_tail, + nesvnic->nic.first_frag_overflow)) { + pci_unmap_single(nesdev->pcidev, + bus_address, + le16_to_cpu(wqe_fragment_length[ + wqe_fragment_index++]), + PCI_DMA_TODEVICE); + } + for (; wqe_fragment_index < 5; wqe_fragment_index++) { + if (wqe_fragment_length[wqe_fragment_index]) { + u64temp = le32_to_cpu( + nic_sqe->wqe_words[ + NES_NIC_SQ_WQE_FRAG0_LOW_IDX+ + wqe_fragment_index*2]); + u64temp += ((u64)le32_to_cpu( + nic_sqe->wqe_words[ + NES_NIC_SQ_WQE_FRAG0_HIGH_IDX+ + wqe_fragment_index*2]))<<32; + bus_address = (dma_addr_t)u64temp; + pci_unmap_page(nesdev->pcidev, + bus_address, + le16_to_cpu( + wqe_fragment_length[ + wqe_fragment_index]), + PCI_DMA_TODEVICE); + } else + break; + } + } + if (nesvnic->nic.tx_skb[nesvnic->nic.sq_tail]) + dev_kfree_skb( + nesvnic->nic.tx_skb[nesvnic->nic.sq_tail]); + + nesvnic->nic.sq_tail = (nesvnic->nic.sq_tail + 1) + & (nesvnic->nic.sq_size - 1); + } + + spin_lock_irqsave(&nesdev->cqp.lock, flags); + + /* Destroy NIC QP */ + cqp_head = nesdev->cqp.sq_head; + cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head]; + nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, + (NES_CQP_DESTROY_QP | NES_CQP_QP_TYPE_NIC)); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX, + nesvnic->nic.qp_id); + + if (++cqp_head >= nesdev->cqp.sq_size) + cqp_head = 0; + + cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head]; + + /* Destroy NIC CQ */ + nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, + (NES_CQP_DESTROY_CQ | ((u32)nesvnic->nic_cq.cq_size << 16))); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX, + (nesvnic->nic_cq.cq_number | ((u32)nesdev->nic_ceq_index << 16))); + + if (++cqp_head >= nesdev->cqp.sq_size) + cqp_head = 0; + + nesdev->cqp.sq_head = cqp_head; + barrier(); + + /* Ring doorbell (2 WQEs) */ + nes_write32(nesdev->regs+NES_WQE_ALLOC, 0x02800000 | nesdev->cqp.qp_id); + + spin_unlock_irqrestore(&nesdev->cqp.lock, flags); + nes_debug(NES_DBG_SHUTDOWN, "Waiting for CQP, cqp_head=%u, cqp.sq_head=%u," + " cqp.sq_tail=%u, cqp.sq_size=%u\n", + cqp_head, nesdev->cqp.sq_head, + nesdev->cqp.sq_tail, nesdev->cqp.sq_size); + + ret = wait_event_timeout(nesdev->cqp.waitq, (nesdev->cqp.sq_tail == cqp_head), + NES_EVENT_TIMEOUT); + + nes_debug(NES_DBG_SHUTDOWN, "Destroy NIC QP returned, wait_event_timeout ret = %u, cqp_head=%u," + " cqp.sq_head=%u, cqp.sq_tail=%u\n", + ret, cqp_head, nesdev->cqp.sq_head, nesdev->cqp.sq_tail); + if (!ret) { + nes_debug(NES_DBG_SHUTDOWN, "NIC QP%u destroy timeout expired\n", + nesvnic->nic.qp_id); + } + + pci_free_consistent(nesdev->pcidev, nesvnic->nic_mem_size, nesvnic->nic_vbase, + nesvnic->nic_pbase); + + /* restore old wqm_cfg0 value */ + nes_write_indexed(nesdev, NES_IDX_WQM_CONFIG0, wqm_cfg0); +} + +/** + * nes_napi_isr + */ +int nes_napi_isr(struct nes_device *nesdev) +{ + struct nes_adapter *nesadapter = nesdev->nesadapter; + u32 int_stat; + + if (nesdev->napi_isr_ran) { + /* interrupt status has already been read in ISR */ + int_stat = nesdev->int_stat; + } else { + int_stat = nes_read32(nesdev->regs + NES_INT_STAT); + nesdev->int_stat = int_stat; + nesdev->napi_isr_ran = 1; + } + + int_stat &= nesdev->int_req; + /* iff NIC, process here, else wait for DPC */ + if ((int_stat) && ((int_stat & 0x0000ff00) == int_stat)) { + nesdev->napi_isr_ran = 0; + nes_write32(nesdev->regs + NES_INT_STAT, + (int_stat & + ~(NES_INT_INTF | NES_INT_TIMER | NES_INT_MAC0 | NES_INT_MAC1 | NES_INT_MAC2 | NES_INT_MAC3))); + + /* Process the CEQs */ + nes_process_ceq(nesdev, &nesdev->nesadapter->ceq[nesdev->nic_ceq_index]); + + if (unlikely((((nesadapter->et_rx_coalesce_usecs_irq) && + (!nesadapter->et_use_adaptive_rx_coalesce)) || + ((nesadapter->et_use_adaptive_rx_coalesce) && + (nesdev->deepcq_count > nesadapter->et_pkt_rate_low))))) { + if ((nesdev->int_req & NES_INT_TIMER) == 0) { + /* Enable Periodic timer interrupts */ + nesdev->int_req |= NES_INT_TIMER; + /* ack any pending periodic timer interrupts so we don't get an immediate interrupt */ + /* TODO: need to also ack other unused periodic timer values, get from nesadapter */ + nes_write32(nesdev->regs+NES_TIMER_STAT, + nesdev->timer_int_req | ~(nesdev->nesadapter->timer_int_req)); + nes_write32(nesdev->regs+NES_INTF_INT_MASK, + ~(nesdev->intf_int_req | NES_INTF_PERIODIC_TIMER)); + } + + if (unlikely(nesadapter->et_use_adaptive_rx_coalesce)) + { + nes_nic_init_timer(nesdev); + } + /* Enable interrupts, except CEQs */ + nes_write32(nesdev->regs+NES_INT_MASK, 0x0000ffff | (~nesdev->int_req)); + } else { + /* Enable interrupts, make sure timer is off */ + nesdev->int_req &= ~NES_INT_TIMER; + nes_write32(nesdev->regs+NES_INTF_INT_MASK, ~(nesdev->intf_int_req)); + nes_write32(nesdev->regs+NES_INT_MASK, ~nesdev->int_req); + } + nesdev->deepcq_count = 0; + return 1; + } else { + return 0; + } +} + +static void process_critical_error(struct nes_device *nesdev) +{ + u32 debug_error; + u32 nes_idx_debug_error_masks0 = 0; + u16 error_module = 0; + + debug_error = nes_read_indexed(nesdev, NES_IDX_DEBUG_ERROR_CONTROL_STATUS); + printk(KERN_ERR PFX "Critical Error reported by device!!! 0x%02X\n", + (u16)debug_error); + nes_write_indexed(nesdev, NES_IDX_DEBUG_ERROR_CONTROL_STATUS, + 0x01010000 | (debug_error & 0x0000ffff)); + if (crit_err_count++ > 10) + nes_write_indexed(nesdev, NES_IDX_DEBUG_ERROR_MASKS1, 1 << 0x17); + error_module = (u16) (debug_error & 0x1F00) >> 8; + if (++nesdev->nesadapter->crit_error_count[error_module-1] >= + nes_max_critical_error_count) { + printk(KERN_ERR PFX "Masking off critical error for module " + "0x%02X\n", (u16)error_module); + nes_idx_debug_error_masks0 = nes_read_indexed(nesdev, + NES_IDX_DEBUG_ERROR_MASKS0); + nes_write_indexed(nesdev, NES_IDX_DEBUG_ERROR_MASKS0, + nes_idx_debug_error_masks0 | (1 << error_module)); + } +} +/** + * nes_dpc + */ +void nes_dpc(unsigned long param) +{ + struct nes_device *nesdev = (struct nes_device *)param; + struct nes_adapter *nesadapter = nesdev->nesadapter; + u32 counter; + u32 loop_counter = 0; + u32 int_status_bit; + u32 int_stat; + u32 timer_stat; + u32 temp_int_stat; + u32 intf_int_stat; + u32 processed_intf_int = 0; + u16 processed_timer_int = 0; + u16 completion_ints = 0; + u16 timer_ints = 0; + + /* nes_debug(NES_DBG_ISR, "\n"); */ + + do { + timer_stat = 0; + if (nesdev->napi_isr_ran) { + nesdev->napi_isr_ran = 0; + int_stat = nesdev->int_stat; + } else + int_stat = nes_read32(nesdev->regs+NES_INT_STAT); + if (processed_intf_int != 0) + int_stat &= nesdev->int_req & ~NES_INT_INTF; + else + int_stat &= nesdev->int_req; + if (processed_timer_int == 0) { + processed_timer_int = 1; + if (int_stat & NES_INT_TIMER) { + timer_stat = nes_read32(nesdev->regs + NES_TIMER_STAT); + if ((timer_stat & nesdev->timer_int_req) == 0) { + int_stat &= ~NES_INT_TIMER; + } + } + } else { + int_stat &= ~NES_INT_TIMER; + } + + if (int_stat) { + if (int_stat & ~(NES_INT_INTF | NES_INT_TIMER | NES_INT_MAC0| + NES_INT_MAC1|NES_INT_MAC2 | NES_INT_MAC3)) { + /* Ack the interrupts */ + nes_write32(nesdev->regs+NES_INT_STAT, + (int_stat & ~(NES_INT_INTF | NES_INT_TIMER | NES_INT_MAC0| + NES_INT_MAC1 | NES_INT_MAC2 | NES_INT_MAC3))); + } + + temp_int_stat = int_stat; + for (counter = 0, int_status_bit = 1; counter < 16; counter++) { + if (int_stat & int_status_bit) { + nes_process_ceq(nesdev, &nesadapter->ceq[counter]); + temp_int_stat &= ~int_status_bit; + completion_ints = 1; + } + if (!(temp_int_stat & 0x0000ffff)) + break; + int_status_bit <<= 1; + } + + /* Process the AEQ for this pci function */ + int_status_bit = 1 << (16 + PCI_FUNC(nesdev->pcidev->devfn)); + if (int_stat & int_status_bit) { + nes_process_aeq(nesdev, &nesadapter->aeq[PCI_FUNC(nesdev->pcidev->devfn)]); + } + + /* Process the MAC interrupt for this pci function */ + int_status_bit = 1 << (24 + nesdev->mac_index); + if (int_stat & int_status_bit) { + nes_process_mac_intr(nesdev, nesdev->mac_index); + } + + if (int_stat & NES_INT_TIMER) { + if (timer_stat & nesdev->timer_int_req) { + nes_write32(nesdev->regs + NES_TIMER_STAT, + (timer_stat & nesdev->timer_int_req) | + ~(nesdev->nesadapter->timer_int_req)); + timer_ints = 1; + } + } + + if (int_stat & NES_INT_INTF) { + processed_intf_int = 1; + intf_int_stat = nes_read32(nesdev->regs+NES_INTF_INT_STAT); + intf_int_stat &= nesdev->intf_int_req; + if (NES_INTF_INT_CRITERR & intf_int_stat) { + process_critical_error(nesdev); + } + if (NES_INTF_INT_PCIERR & intf_int_stat) { + printk(KERN_ERR PFX "PCI Error reported by device!!!\n"); + BUG(); + } + if (NES_INTF_INT_AEQ_OFLOW & intf_int_stat) { + printk(KERN_ERR PFX "AEQ Overflow reported by device!!!\n"); + BUG(); + } + nes_write32(nesdev->regs+NES_INTF_INT_STAT, intf_int_stat); + } + + if (int_stat & NES_INT_TSW) { + } + } + /* Don't use the interface interrupt bit stay in loop */ + int_stat &= ~NES_INT_INTF | NES_INT_TIMER | NES_INT_MAC0 | + NES_INT_MAC1 | NES_INT_MAC2 | NES_INT_MAC3; + } while ((int_stat != 0) && (loop_counter++ < MAX_DPC_ITERATIONS)); + + if (timer_ints == 1) { + if ((nesadapter->et_rx_coalesce_usecs_irq) || (nesadapter->et_use_adaptive_rx_coalesce)) { + if (completion_ints == 0) { + nesdev->timer_only_int_count++; + if (nesdev->timer_only_int_count>=nesadapter->timer_int_limit) { + nesdev->timer_only_int_count = 0; + nesdev->int_req &= ~NES_INT_TIMER; + nes_write32(nesdev->regs + NES_INTF_INT_MASK, ~(nesdev->intf_int_req)); + nes_write32(nesdev->regs + NES_INT_MASK, ~nesdev->int_req); + } else { + nes_write32(nesdev->regs+NES_INT_MASK, 0x0000ffff | (~nesdev->int_req)); + } + } else { + if (unlikely(nesadapter->et_use_adaptive_rx_coalesce)) + { + nes_nic_init_timer(nesdev); + } + nesdev->timer_only_int_count = 0; + nes_write32(nesdev->regs+NES_INT_MASK, 0x0000ffff | (~nesdev->int_req)); + } + } else { + nesdev->timer_only_int_count = 0; + nesdev->int_req &= ~NES_INT_TIMER; + nes_write32(nesdev->regs+NES_INTF_INT_MASK, ~(nesdev->intf_int_req)); + nes_write32(nesdev->regs+NES_TIMER_STAT, + nesdev->timer_int_req | ~(nesdev->nesadapter->timer_int_req)); + nes_write32(nesdev->regs+NES_INT_MASK, ~nesdev->int_req); + } + } else { + if ( (completion_ints == 1) && + (((nesadapter->et_rx_coalesce_usecs_irq) && + (!nesadapter->et_use_adaptive_rx_coalesce)) || + ((nesdev->deepcq_count > nesadapter->et_pkt_rate_low) && + (nesadapter->et_use_adaptive_rx_coalesce) )) ) { + /* nes_debug(NES_DBG_ISR, "Enabling periodic timer interrupt.\n" ); */ + nesdev->timer_only_int_count = 0; + nesdev->int_req |= NES_INT_TIMER; + nes_write32(nesdev->regs+NES_TIMER_STAT, + nesdev->timer_int_req | ~(nesdev->nesadapter->timer_int_req)); + nes_write32(nesdev->regs+NES_INTF_INT_MASK, + ~(nesdev->intf_int_req | NES_INTF_PERIODIC_TIMER)); + nes_write32(nesdev->regs+NES_INT_MASK, 0x0000ffff | (~nesdev->int_req)); + } else { + nes_write32(nesdev->regs+NES_INT_MASK, ~nesdev->int_req); + } + } + nesdev->deepcq_count = 0; +} + + +/** + * nes_process_ceq + */ +static void nes_process_ceq(struct nes_device *nesdev, struct nes_hw_ceq *ceq) +{ + u64 u64temp; + struct nes_hw_cq *cq; + u32 head; + u32 ceq_size; + + /* nes_debug(NES_DBG_CQ, "\n"); */ + head = ceq->ceq_head; + ceq_size = ceq->ceq_size; + + do { + if (le32_to_cpu(ceq->ceq_vbase[head].ceqe_words[NES_CEQE_CQ_CTX_HIGH_IDX]) & + NES_CEQE_VALID) { + u64temp = (((u64)(le32_to_cpu(ceq->ceq_vbase[head].ceqe_words[NES_CEQE_CQ_CTX_HIGH_IDX]))) << 32) | + ((u64)(le32_to_cpu(ceq->ceq_vbase[head].ceqe_words[NES_CEQE_CQ_CTX_LOW_IDX]))); + u64temp <<= 1; + cq = *((struct nes_hw_cq **)&u64temp); + /* nes_debug(NES_DBG_CQ, "pCQ = %p\n", cq); */ + barrier(); + ceq->ceq_vbase[head].ceqe_words[NES_CEQE_CQ_CTX_HIGH_IDX] = 0; + + /* call the event handler */ + cq->ce_handler(nesdev, cq); + + if (++head >= ceq_size) + head = 0; + } else { + break; + } + + } while (1); + + ceq->ceq_head = head; +} + + +/** + * nes_process_aeq + */ +static void nes_process_aeq(struct nes_device *nesdev, struct nes_hw_aeq *aeq) +{ + /* u64 u64temp; */ + u32 head; + u32 aeq_size; + u32 aeqe_misc; + u32 aeqe_cq_id; + struct nes_hw_aeqe volatile *aeqe; + + head = aeq->aeq_head; + aeq_size = aeq->aeq_size; + + do { + aeqe = &aeq->aeq_vbase[head]; + if ((le32_to_cpu(aeqe->aeqe_words[NES_AEQE_MISC_IDX]) & NES_AEQE_VALID) == 0) + break; + aeqe_misc = le32_to_cpu(aeqe->aeqe_words[NES_AEQE_MISC_IDX]); + aeqe_cq_id = le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_QP_CQ_ID_IDX]); + if (aeqe_misc & (NES_AEQE_QP|NES_AEQE_CQ)) { + if (aeqe_cq_id >= NES_FIRST_QPN) { + /* dealing with an accelerated QP related AE */ + /* + * u64temp = (((u64)(le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_CTXT_HIGH_IDX]))) << 32) | + * ((u64)(le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_CTXT_LOW_IDX]))); + */ + nes_process_iwarp_aeqe(nesdev, (struct nes_hw_aeqe *)aeqe); + } else { + /* TODO: dealing with a CQP related AE */ + nes_debug(NES_DBG_AEQ, "Processing CQP related AE, misc = 0x%04X\n", + (u16)(aeqe_misc >> 16)); + } + } + + aeqe->aeqe_words[NES_AEQE_MISC_IDX] = 0; + + if (++head >= aeq_size) + head = 0; + + nes_write32(nesdev->regs + NES_AEQ_ALLOC, 1 << 16); + } + while (1); + aeq->aeq_head = head; +} + +static void nes_reset_link(struct nes_device *nesdev, u32 mac_index) +{ + struct nes_adapter *nesadapter = nesdev->nesadapter; + u32 reset_value; + u32 i=0; + u32 u32temp; + + if (nesadapter->hw_rev == NE020_REV) { + return; + } + mh_detected++; + + reset_value = nes_read32(nesdev->regs+NES_SOFTWARE_RESET); + + if ((mac_index == 0) || ((mac_index == 1) && (nesadapter->OneG_Mode))) + reset_value |= 0x0000001d; + else + reset_value |= 0x0000002d; + + if (4 <= (nesadapter->link_interrupt_count[mac_index] / ((u16)NES_MAX_LINK_INTERRUPTS))) { + if ((!nesadapter->OneG_Mode) && (nesadapter->port_count == 2)) { + nesadapter->link_interrupt_count[0] = 0; + nesadapter->link_interrupt_count[1] = 0; + u32temp = nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1); + if (0x00000040 & u32temp) + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1, 0x0000F088); + else + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1, 0x0000F0C8); + + reset_value |= 0x0000003d; + } + nesadapter->link_interrupt_count[mac_index] = 0; + } + + nes_write32(nesdev->regs+NES_SOFTWARE_RESET, reset_value); + + while (((nes_read32(nesdev->regs+NES_SOFTWARE_RESET) + & 0x00000040) != 0x00000040) && (i++ < 5000)); + + if (0x0000003d == (reset_value & 0x0000003d)) { + u32 pcs_control_status0, pcs_control_status1; + + for (i = 0; i < 10; i++) { + pcs_control_status0 = nes_read_indexed(nesdev, NES_IDX_PHY_PCS_CONTROL_STATUS0); + pcs_control_status1 = nes_read_indexed(nesdev, NES_IDX_PHY_PCS_CONTROL_STATUS0 + 0x200); + if (((0x0F000000 == (pcs_control_status0 & 0x0F000000)) + && (pcs_control_status0 & 0x00100000)) + || ((0x0F000000 == (pcs_control_status1 & 0x0F000000)) + && (pcs_control_status1 & 0x00100000))) + continue; + else + break; + } + if (10 == i) { + u32temp = nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1); + if (0x00000040 & u32temp) + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1, 0x0000F088); + else + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1, 0x0000F0C8); + + nes_write32(nesdev->regs+NES_SOFTWARE_RESET, reset_value); + + while (((nes_read32(nesdev->regs + NES_SOFTWARE_RESET) + & 0x00000040) != 0x00000040) && (i++ < 5000)); + } + } +} + +/** + * nes_process_mac_intr + */ +static void nes_process_mac_intr(struct nes_device *nesdev, u32 mac_number) +{ + unsigned long flags; + u32 pcs_control_status; + struct nes_adapter *nesadapter = nesdev->nesadapter; + struct nes_vnic *nesvnic; + u32 mac_status; + u32 mac_index = nesdev->mac_index; + u32 u32temp; + u16 phy_data; + u16 temp_phy_data; + u32 pcs_val = 0x0f0f0000; + u32 pcs_mask = 0x0f1f0000; + u32 cdr_ctrl; + + spin_lock_irqsave(&nesadapter->phy_lock, flags); + if (nesadapter->mac_sw_state[mac_number] != NES_MAC_SW_IDLE) { + spin_unlock_irqrestore(&nesadapter->phy_lock, flags); + return; + } + nesadapter->mac_sw_state[mac_number] = NES_MAC_SW_INTERRUPT; + + /* ack the MAC interrupt */ + mac_status = nes_read_indexed(nesdev, NES_IDX_MAC_INT_STATUS + (mac_index * 0x200)); + /* Clear the interrupt */ + nes_write_indexed(nesdev, NES_IDX_MAC_INT_STATUS + (mac_index * 0x200), mac_status); + + nes_debug(NES_DBG_PHY, "MAC%u interrupt status = 0x%X.\n", mac_number, mac_status); + + if (mac_status & (NES_MAC_INT_LINK_STAT_CHG | NES_MAC_INT_XGMII_EXT)) { + nesdev->link_status_interrupts++; + if (0 == (++nesadapter->link_interrupt_count[mac_index] % ((u16)NES_MAX_LINK_INTERRUPTS))) + nes_reset_link(nesdev, mac_index); + + /* read the PHY interrupt status register */ + if ((nesadapter->OneG_Mode) && + (nesadapter->phy_type[mac_index] != NES_PHY_TYPE_PUMA_1G)) { + do { + nes_read_1G_phy_reg(nesdev, 0x1a, + nesadapter->phy_index[mac_index], &phy_data); + nes_debug(NES_DBG_PHY, "Phy%d data from register 0x1a = 0x%X.\n", + nesadapter->phy_index[mac_index], phy_data); + } while (phy_data&0x8000); + + temp_phy_data = 0; + do { + nes_read_1G_phy_reg(nesdev, 0x11, + nesadapter->phy_index[mac_index], &phy_data); + nes_debug(NES_DBG_PHY, "Phy%d data from register 0x11 = 0x%X.\n", + nesadapter->phy_index[mac_index], phy_data); + if (temp_phy_data == phy_data) + break; + temp_phy_data = phy_data; + } while (1); + + nes_read_1G_phy_reg(nesdev, 0x1e, + nesadapter->phy_index[mac_index], &phy_data); + nes_debug(NES_DBG_PHY, "Phy%d data from register 0x1e = 0x%X.\n", + nesadapter->phy_index[mac_index], phy_data); + + nes_read_1G_phy_reg(nesdev, 1, + nesadapter->phy_index[mac_index], &phy_data); + nes_debug(NES_DBG_PHY, "1G phy%u data from register 1 = 0x%X\n", + nesadapter->phy_index[mac_index], phy_data); + + if (temp_phy_data & 0x1000) { + nes_debug(NES_DBG_PHY, "The Link is up according to the PHY\n"); + phy_data = 4; + } else { + nes_debug(NES_DBG_PHY, "The Link is down according to the PHY\n"); + } + } + nes_debug(NES_DBG_PHY, "Eth SERDES Common Status: 0=0x%08X, 1=0x%08X\n", + nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_STATUS0), + nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_STATUS0+0x200)); + + if (nesadapter->phy_type[mac_index] == NES_PHY_TYPE_PUMA_1G) { + switch (mac_index) { + case 1: + case 3: + pcs_control_status = nes_read_indexed(nesdev, + NES_IDX_PHY_PCS_CONTROL_STATUS0 + 0x200); + break; + default: + pcs_control_status = nes_read_indexed(nesdev, + NES_IDX_PHY_PCS_CONTROL_STATUS0); + break; + } + } else { + pcs_control_status = nes_read_indexed(nesdev, + NES_IDX_PHY_PCS_CONTROL_STATUS0 + ((mac_index & 1) * 0x200)); + pcs_control_status = nes_read_indexed(nesdev, + NES_IDX_PHY_PCS_CONTROL_STATUS0 + ((mac_index & 1) * 0x200)); + } + + nes_debug(NES_DBG_PHY, "PCS PHY Control/Status%u: 0x%08X\n", + mac_index, pcs_control_status); + if ((nesadapter->OneG_Mode) && + (nesadapter->phy_type[mac_index] != NES_PHY_TYPE_PUMA_1G)) { + u32temp = 0x01010000; + if (nesadapter->port_count > 2) { + u32temp |= 0x02020000; + } + if ((pcs_control_status & u32temp)!= u32temp) { + phy_data = 0; + nes_debug(NES_DBG_PHY, "PCS says the link is down\n"); + } + } else { + switch (nesadapter->phy_type[mac_index]) { + case NES_PHY_TYPE_ARGUS: + case NES_PHY_TYPE_SFP_D: + case NES_PHY_TYPE_KR: + /* clear the alarms */ + nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 4, 0x0008); + nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 4, 0xc001); + nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 4, 0xc002); + nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 4, 0xc005); + nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 4, 0xc006); + nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 1, 0x9003); + nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 1, 0x9004); + nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 1, 0x9005); + /* check link status */ + nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 1, 0x9003); + temp_phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL); + + nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 3, 0x0021); + nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL); + nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 3, 0x0021); + phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL); + + phy_data = (!temp_phy_data && (phy_data == 0x8000)) ? 0x4 : 0x0; + + nes_debug(NES_DBG_PHY, "%s: Phy data = 0x%04X, link was %s.\n", + __func__, phy_data, nesadapter->mac_link_down[mac_index] ? "DOWN" : "UP"); + break; + + case NES_PHY_TYPE_PUMA_1G: + if (mac_index < 2) + pcs_val = pcs_mask = 0x01010000; + else + pcs_val = pcs_mask = 0x02020000; + /* fall through */ + default: + phy_data = (pcs_val == (pcs_control_status & pcs_mask)) ? 0x4 : 0x0; + break; + } + } + + if (phy_data & 0x0004) { + if (wide_ppm_offset && + (nesadapter->phy_type[mac_index] == NES_PHY_TYPE_CX4) && + (nesadapter->hw_rev != NE020_REV)) { + cdr_ctrl = nes_read_indexed(nesdev, + NES_IDX_ETH_SERDES_CDR_CONTROL0 + + mac_index * 0x200); + nes_write_indexed(nesdev, + NES_IDX_ETH_SERDES_CDR_CONTROL0 + + mac_index * 0x200, + cdr_ctrl | 0x000F0000); + } + nesadapter->mac_link_down[mac_index] = 0; + list_for_each_entry(nesvnic, &nesadapter->nesvnic_list[mac_index], list) { + nes_debug(NES_DBG_PHY, "The Link is UP!!. linkup was %d\n", + nesvnic->linkup); + if (nesvnic->linkup == 0) { + printk(PFX "The Link is now up for port %s, netdev %p.\n", + nesvnic->netdev->name, nesvnic->netdev); + if (netif_queue_stopped(nesvnic->netdev)) + netif_start_queue(nesvnic->netdev); + nesvnic->linkup = 1; + netif_carrier_on(nesvnic->netdev); + + spin_lock(&nesvnic->port_ibevent_lock); + if (nesvnic->of_device_registered) { + if (nesdev->iw_status == 0) { + nesdev->iw_status = 1; + nes_port_ibevent(nesvnic); + } + } + spin_unlock(&nesvnic->port_ibevent_lock); + } + } + } else { + if (wide_ppm_offset && + (nesadapter->phy_type[mac_index] == NES_PHY_TYPE_CX4) && + (nesadapter->hw_rev != NE020_REV)) { + cdr_ctrl = nes_read_indexed(nesdev, + NES_IDX_ETH_SERDES_CDR_CONTROL0 + + mac_index * 0x200); + nes_write_indexed(nesdev, + NES_IDX_ETH_SERDES_CDR_CONTROL0 + + mac_index * 0x200, + cdr_ctrl & 0xFFF0FFFF); + } + nesadapter->mac_link_down[mac_index] = 1; + list_for_each_entry(nesvnic, &nesadapter->nesvnic_list[mac_index], list) { + nes_debug(NES_DBG_PHY, "The Link is Down!!. linkup was %d\n", + nesvnic->linkup); + if (nesvnic->linkup == 1) { + printk(PFX "The Link is now down for port %s, netdev %p.\n", + nesvnic->netdev->name, nesvnic->netdev); + if (!(netif_queue_stopped(nesvnic->netdev))) + netif_stop_queue(nesvnic->netdev); + nesvnic->linkup = 0; + netif_carrier_off(nesvnic->netdev); + + spin_lock(&nesvnic->port_ibevent_lock); + if (nesvnic->of_device_registered) { + if (nesdev->iw_status == 1) { + nesdev->iw_status = 0; + nes_port_ibevent(nesvnic); + } + } + spin_unlock(&nesvnic->port_ibevent_lock); + } + } + } + if (nesadapter->phy_type[mac_index] == NES_PHY_TYPE_SFP_D) { + nesdev->link_recheck = 1; + mod_delayed_work(system_wq, &nesdev->work, + NES_LINK_RECHECK_DELAY); + } + } + + spin_unlock_irqrestore(&nesadapter->phy_lock, flags); + + nesadapter->mac_sw_state[mac_number] = NES_MAC_SW_IDLE; +} + +void nes_recheck_link_status(struct work_struct *work) +{ + unsigned long flags; + struct nes_device *nesdev = container_of(work, struct nes_device, work.work); + struct nes_adapter *nesadapter = nesdev->nesadapter; + struct nes_vnic *nesvnic; + u32 mac_index = nesdev->mac_index; + u16 phy_data; + u16 temp_phy_data; + + spin_lock_irqsave(&nesadapter->phy_lock, flags); + + /* check link status */ + nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 1, 0x9003); + temp_phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL); + + nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 3, 0x0021); + nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL); + nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 3, 0x0021); + phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL); + + phy_data = (!temp_phy_data && (phy_data == 0x8000)) ? 0x4 : 0x0; + + nes_debug(NES_DBG_PHY, "%s: Phy data = 0x%04X, link was %s.\n", + __func__, phy_data, + nesadapter->mac_link_down[mac_index] ? "DOWN" : "UP"); + + if (phy_data & 0x0004) { + nesadapter->mac_link_down[mac_index] = 0; + list_for_each_entry(nesvnic, &nesadapter->nesvnic_list[mac_index], list) { + if (nesvnic->linkup == 0) { + printk(PFX "The Link is now up for port %s, netdev %p.\n", + nesvnic->netdev->name, nesvnic->netdev); + if (netif_queue_stopped(nesvnic->netdev)) + netif_start_queue(nesvnic->netdev); + nesvnic->linkup = 1; + netif_carrier_on(nesvnic->netdev); + + spin_lock(&nesvnic->port_ibevent_lock); + if (nesvnic->of_device_registered) { + if (nesdev->iw_status == 0) { + nesdev->iw_status = 1; + nes_port_ibevent(nesvnic); + } + } + spin_unlock(&nesvnic->port_ibevent_lock); + } + } + + } else { + nesadapter->mac_link_down[mac_index] = 1; + list_for_each_entry(nesvnic, &nesadapter->nesvnic_list[mac_index], list) { + if (nesvnic->linkup == 1) { + printk(PFX "The Link is now down for port %s, netdev %p.\n", + nesvnic->netdev->name, nesvnic->netdev); + if (!(netif_queue_stopped(nesvnic->netdev))) + netif_stop_queue(nesvnic->netdev); + nesvnic->linkup = 0; + netif_carrier_off(nesvnic->netdev); + + spin_lock(&nesvnic->port_ibevent_lock); + if (nesvnic->of_device_registered) { + if (nesdev->iw_status == 1) { + nesdev->iw_status = 0; + nes_port_ibevent(nesvnic); + } + } + spin_unlock(&nesvnic->port_ibevent_lock); + } + } + } + if (nesdev->link_recheck++ < NES_LINK_RECHECK_MAX) + schedule_delayed_work(&nesdev->work, NES_LINK_RECHECK_DELAY); + else + nesdev->link_recheck = 0; + + spin_unlock_irqrestore(&nesadapter->phy_lock, flags); +} + + +static void nes_nic_napi_ce_handler(struct nes_device *nesdev, struct nes_hw_nic_cq *cq) +{ + struct nes_vnic *nesvnic = container_of(cq, struct nes_vnic, nic_cq); + + napi_schedule(&nesvnic->napi); +} + + +/* The MAX_RQES_TO_PROCESS defines how many max read requests to complete before +* getting out of nic_ce_handler +*/ +#define MAX_RQES_TO_PROCESS 384 + +/** + * nes_nic_ce_handler + */ +void nes_nic_ce_handler(struct nes_device *nesdev, struct nes_hw_nic_cq *cq) +{ + u64 u64temp; + dma_addr_t bus_address; + struct nes_hw_nic *nesnic; + struct nes_vnic *nesvnic = container_of(cq, struct nes_vnic, nic_cq); + struct nes_adapter *nesadapter = nesdev->nesadapter; + struct nes_hw_nic_rq_wqe *nic_rqe; + struct nes_hw_nic_sq_wqe *nic_sqe; + struct sk_buff *skb; + struct sk_buff *rx_skb; + struct nes_rskb_cb *cb; + __le16 *wqe_fragment_length; + u32 head; + u32 cq_size; + u32 rx_pkt_size; + u32 cqe_count=0; + u32 cqe_errv; + u32 cqe_misc; + u16 wqe_fragment_index = 1; /* first fragment (0) is used by copy buffer */ + u16 vlan_tag; + u16 pkt_type; + u16 rqes_processed = 0; + u8 sq_cqes = 0; + u8 nes_use_lro = 0; + + head = cq->cq_head; + cq_size = cq->cq_size; + cq->cqes_pending = 1; + if (nesvnic->netdev->features & NETIF_F_LRO) + nes_use_lro = 1; + do { + if (le32_to_cpu(cq->cq_vbase[head].cqe_words[NES_NIC_CQE_MISC_IDX]) & + NES_NIC_CQE_VALID) { + nesnic = &nesvnic->nic; + cqe_misc = le32_to_cpu(cq->cq_vbase[head].cqe_words[NES_NIC_CQE_MISC_IDX]); + if (cqe_misc & NES_NIC_CQE_SQ) { + sq_cqes++; + wqe_fragment_index = 1; + nic_sqe = &nesnic->sq_vbase[nesnic->sq_tail]; + skb = nesnic->tx_skb[nesnic->sq_tail]; + wqe_fragment_length = (__le16 *)&nic_sqe->wqe_words[NES_NIC_SQ_WQE_LENGTH_0_TAG_IDX]; + /* bump past the vlan tag */ + wqe_fragment_length++; + if (le16_to_cpu(wqe_fragment_length[wqe_fragment_index]) != 0) { + u64temp = (u64) le32_to_cpu(nic_sqe->wqe_words[NES_NIC_SQ_WQE_FRAG0_LOW_IDX + + wqe_fragment_index * 2]); + u64temp += ((u64)le32_to_cpu(nic_sqe->wqe_words[NES_NIC_SQ_WQE_FRAG0_HIGH_IDX + + wqe_fragment_index * 2])) << 32; + bus_address = (dma_addr_t)u64temp; + if (test_and_clear_bit(nesnic->sq_tail, nesnic->first_frag_overflow)) { + pci_unmap_single(nesdev->pcidev, + bus_address, + le16_to_cpu(wqe_fragment_length[wqe_fragment_index++]), + PCI_DMA_TODEVICE); + } + for (; wqe_fragment_index < 5; wqe_fragment_index++) { + if (wqe_fragment_length[wqe_fragment_index]) { + u64temp = le32_to_cpu(nic_sqe->wqe_words[NES_NIC_SQ_WQE_FRAG0_LOW_IDX + + wqe_fragment_index * 2]); + u64temp += ((u64)le32_to_cpu(nic_sqe->wqe_words[NES_NIC_SQ_WQE_FRAG0_HIGH_IDX + + wqe_fragment_index * 2])) <<32; + bus_address = (dma_addr_t)u64temp; + pci_unmap_page(nesdev->pcidev, + bus_address, + le16_to_cpu(wqe_fragment_length[wqe_fragment_index]), + PCI_DMA_TODEVICE); + } else + break; + } + } + if (skb) + dev_kfree_skb_any(skb); + nesnic->sq_tail++; + nesnic->sq_tail &= nesnic->sq_size-1; + if (sq_cqes > 128) { + barrier(); + /* restart the queue if it had been stopped */ + if (netif_queue_stopped(nesvnic->netdev)) + netif_wake_queue(nesvnic->netdev); + sq_cqes = 0; + } + } else { + rqes_processed ++; + + cq->rx_cqes_completed++; + cq->rx_pkts_indicated++; + rx_pkt_size = cqe_misc & 0x0000ffff; + nic_rqe = &nesnic->rq_vbase[nesnic->rq_tail]; + /* Get the skb */ + rx_skb = nesnic->rx_skb[nesnic->rq_tail]; + nic_rqe = &nesnic->rq_vbase[nesvnic->nic.rq_tail]; + bus_address = (dma_addr_t)le32_to_cpu(nic_rqe->wqe_words[NES_NIC_RQ_WQE_FRAG0_LOW_IDX]); + bus_address += ((u64)le32_to_cpu(nic_rqe->wqe_words[NES_NIC_RQ_WQE_FRAG0_HIGH_IDX])) << 32; + pci_unmap_single(nesdev->pcidev, bus_address, + nesvnic->max_frame_size, PCI_DMA_FROMDEVICE); + cb = (struct nes_rskb_cb *)&rx_skb->cb[0]; + cb->busaddr = 0; + /* rx_skb->tail = rx_skb->data + rx_pkt_size; */ + /* rx_skb->len = rx_pkt_size; */ + rx_skb->len = 0; /* TODO: see if this is necessary */ + skb_put(rx_skb, rx_pkt_size); + rx_skb->protocol = eth_type_trans(rx_skb, nesvnic->netdev); + nesnic->rq_tail++; + nesnic->rq_tail &= nesnic->rq_size - 1; + + atomic_inc(&nesvnic->rx_skbs_needed); + if (atomic_read(&nesvnic->rx_skbs_needed) > (nesvnic->nic.rq_size>>1)) { + nes_write32(nesdev->regs+NES_CQE_ALLOC, + cq->cq_number | (cqe_count << 16)); + /* nesadapter->tune_timer.cq_count += cqe_count; */ + nesdev->currcq_count += cqe_count; + cqe_count = 0; + nes_replenish_nic_rq(nesvnic); + } + pkt_type = (u16)(le32_to_cpu(cq->cq_vbase[head].cqe_words[NES_NIC_CQE_TAG_PKT_TYPE_IDX])); + cqe_errv = (cqe_misc & NES_NIC_CQE_ERRV_MASK) >> NES_NIC_CQE_ERRV_SHIFT; + rx_skb->ip_summed = CHECKSUM_NONE; + + if ((NES_PKT_TYPE_TCPV4_BITS == (pkt_type & NES_PKT_TYPE_TCPV4_MASK)) || + (NES_PKT_TYPE_UDPV4_BITS == (pkt_type & NES_PKT_TYPE_UDPV4_MASK))) { + if ((cqe_errv & + (NES_NIC_ERRV_BITS_IPV4_CSUM_ERR | NES_NIC_ERRV_BITS_TCPUDP_CSUM_ERR | + NES_NIC_ERRV_BITS_IPH_ERR | NES_NIC_ERRV_BITS_WQE_OVERRUN)) == 0) { + if (nesvnic->netdev->features & NETIF_F_RXCSUM) + rx_skb->ip_summed = CHECKSUM_UNNECESSARY; + } else + nes_debug(NES_DBG_CQ, "%s: unsuccessfully checksummed TCP or UDP packet." + " errv = 0x%X, pkt_type = 0x%X.\n", + nesvnic->netdev->name, cqe_errv, pkt_type); + + } else if ((pkt_type & NES_PKT_TYPE_IPV4_MASK) == NES_PKT_TYPE_IPV4_BITS) { + if ((cqe_errv & + (NES_NIC_ERRV_BITS_IPV4_CSUM_ERR | NES_NIC_ERRV_BITS_IPH_ERR | + NES_NIC_ERRV_BITS_WQE_OVERRUN)) == 0) { + if (nesvnic->netdev->features & NETIF_F_RXCSUM) { + rx_skb->ip_summed = CHECKSUM_UNNECESSARY; + /* nes_debug(NES_DBG_CQ, "%s: Reporting successfully checksummed IPv4 packet.\n", + nesvnic->netdev->name); */ + } + } else + nes_debug(NES_DBG_CQ, "%s: unsuccessfully checksummed TCP or UDP packet." + " errv = 0x%X, pkt_type = 0x%X.\n", + nesvnic->netdev->name, cqe_errv, pkt_type); + } + /* nes_debug(NES_DBG_CQ, "pkt_type=%x, APBVT_MASK=%x\n", + pkt_type, (pkt_type & NES_PKT_TYPE_APBVT_MASK)); */ + + if ((pkt_type & NES_PKT_TYPE_APBVT_MASK) == NES_PKT_TYPE_APBVT_BITS) { + if (nes_cm_recv(rx_skb, nesvnic->netdev)) + rx_skb = NULL; + } + if (rx_skb == NULL) + goto skip_rx_indicate0; + + + if (cqe_misc & NES_NIC_CQE_TAG_VALID) { + vlan_tag = (u16)(le32_to_cpu( + cq->cq_vbase[head].cqe_words[NES_NIC_CQE_TAG_PKT_TYPE_IDX]) + >> 16); + nes_debug(NES_DBG_CQ, "%s: Reporting stripped VLAN packet. Tag = 0x%04X\n", + nesvnic->netdev->name, vlan_tag); + + __vlan_hwaccel_put_tag(rx_skb, htons(ETH_P_8021Q), vlan_tag); + } + if (nes_use_lro) + lro_receive_skb(&nesvnic->lro_mgr, rx_skb, NULL); + else + netif_receive_skb(rx_skb); + +skip_rx_indicate0: + ; + /* nesvnic->netstats.rx_packets++; */ + /* nesvnic->netstats.rx_bytes += rx_pkt_size; */ + } + + cq->cq_vbase[head].cqe_words[NES_NIC_CQE_MISC_IDX] = 0; + /* Accounting... */ + cqe_count++; + if (++head >= cq_size) + head = 0; + if (cqe_count == 255) { + /* Replenish Nic CQ */ + nes_write32(nesdev->regs+NES_CQE_ALLOC, + cq->cq_number | (cqe_count << 16)); + /* nesdev->nesadapter->tune_timer.cq_count += cqe_count; */ + nesdev->currcq_count += cqe_count; + cqe_count = 0; + } + + if (cq->rx_cqes_completed >= nesvnic->budget) + break; + } else { + cq->cqes_pending = 0; + break; + } + + } while (1); + + if (nes_use_lro) + lro_flush_all(&nesvnic->lro_mgr); + if (sq_cqes) { + barrier(); + /* restart the queue if it had been stopped */ + if (netif_queue_stopped(nesvnic->netdev)) + netif_wake_queue(nesvnic->netdev); + } + cq->cq_head = head; + /* nes_debug(NES_DBG_CQ, "CQ%u Processed = %u cqes, new head = %u.\n", + cq->cq_number, cqe_count, cq->cq_head); */ + cq->cqe_allocs_pending = cqe_count; + if (unlikely(nesadapter->et_use_adaptive_rx_coalesce)) + { + /* nesdev->nesadapter->tune_timer.cq_count += cqe_count; */ + nesdev->currcq_count += cqe_count; + nes_nic_tune_timer(nesdev); + } + if (atomic_read(&nesvnic->rx_skbs_needed)) + nes_replenish_nic_rq(nesvnic); +} + + + +/** + * nes_cqp_ce_handler + */ +static void nes_cqp_ce_handler(struct nes_device *nesdev, struct nes_hw_cq *cq) +{ + u64 u64temp; + unsigned long flags; + struct nes_hw_cqp *cqp = NULL; + struct nes_cqp_request *cqp_request; + struct nes_hw_cqp_wqe *cqp_wqe; + u32 head; + u32 cq_size; + u32 cqe_count=0; + u32 error_code; + u32 opcode; + u32 ctx_index; + /* u32 counter; */ + + head = cq->cq_head; + cq_size = cq->cq_size; + + do { + /* process the CQE */ + /* nes_debug(NES_DBG_CQP, "head=%u cqe_words=%08X\n", head, + le32_to_cpu(cq->cq_vbase[head].cqe_words[NES_CQE_OPCODE_IDX])); */ + + opcode = le32_to_cpu(cq->cq_vbase[head].cqe_words[NES_CQE_OPCODE_IDX]); + if (opcode & NES_CQE_VALID) { + cqp = &nesdev->cqp; + + error_code = le32_to_cpu(cq->cq_vbase[head].cqe_words[NES_CQE_ERROR_CODE_IDX]); + if (error_code) { + nes_debug(NES_DBG_CQP, "Bad Completion code for opcode 0x%02X from CQP," + " Major/Minor codes = 0x%04X:%04X.\n", + le32_to_cpu(cq->cq_vbase[head].cqe_words[NES_CQE_OPCODE_IDX])&0x3f, + (u16)(error_code >> 16), + (u16)error_code); + } + + u64temp = (((u64)(le32_to_cpu(cq->cq_vbase[head]. + cqe_words[NES_CQE_COMP_COMP_CTX_HIGH_IDX]))) << 32) | + ((u64)(le32_to_cpu(cq->cq_vbase[head]. + cqe_words[NES_CQE_COMP_COMP_CTX_LOW_IDX]))); + + cqp_request = (struct nes_cqp_request *)(unsigned long)u64temp; + if (cqp_request) { + if (cqp_request->waiting) { + /* nes_debug(NES_DBG_CQP, "%s: Waking up requestor\n"); */ + cqp_request->major_code = (u16)(error_code >> 16); + cqp_request->minor_code = (u16)error_code; + barrier(); + cqp_request->request_done = 1; + wake_up(&cqp_request->waitq); + nes_put_cqp_request(nesdev, cqp_request); + } else { + if (cqp_request->callback) + cqp_request->cqp_callback(nesdev, cqp_request); + nes_free_cqp_request(nesdev, cqp_request); + } + } else { + wake_up(&nesdev->cqp.waitq); + } + + cq->cq_vbase[head].cqe_words[NES_CQE_OPCODE_IDX] = 0; + nes_write32(nesdev->regs + NES_CQE_ALLOC, cq->cq_number | (1 << 16)); + if (++cqp->sq_tail >= cqp->sq_size) + cqp->sq_tail = 0; + + /* Accounting... */ + cqe_count++; + if (++head >= cq_size) + head = 0; + } else { + break; + } + } while (1); + cq->cq_head = head; + + spin_lock_irqsave(&nesdev->cqp.lock, flags); + while ((!list_empty(&nesdev->cqp_pending_reqs)) && + ((((nesdev->cqp.sq_tail+nesdev->cqp.sq_size)-nesdev->cqp.sq_head) & + (nesdev->cqp.sq_size - 1)) != 1)) { + cqp_request = list_entry(nesdev->cqp_pending_reqs.next, + struct nes_cqp_request, list); + list_del_init(&cqp_request->list); + head = nesdev->cqp.sq_head++; + nesdev->cqp.sq_head &= nesdev->cqp.sq_size-1; + cqp_wqe = &nesdev->cqp.sq_vbase[head]; + memcpy(cqp_wqe, &cqp_request->cqp_wqe, sizeof(*cqp_wqe)); + barrier(); + + opcode = cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX]; + if ((opcode & NES_CQP_OPCODE_MASK) == NES_CQP_DOWNLOAD_SEGMENT) + ctx_index = NES_CQP_WQE_DL_COMP_CTX_LOW_IDX; + else + ctx_index = NES_CQP_WQE_COMP_CTX_LOW_IDX; + cqp_wqe->wqe_words[ctx_index] = + cpu_to_le32((u32)((unsigned long)cqp_request)); + cqp_wqe->wqe_words[ctx_index + 1] = + cpu_to_le32((u32)(upper_32_bits((unsigned long)cqp_request))); + nes_debug(NES_DBG_CQP, "CQP request %p (opcode 0x%02X) put on CQPs SQ wqe%u.\n", + cqp_request, le32_to_cpu(cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX])&0x3f, head); + /* Ring doorbell (1 WQEs) */ + barrier(); + nes_write32(nesdev->regs+NES_WQE_ALLOC, 0x01800000 | nesdev->cqp.qp_id); + } + spin_unlock_irqrestore(&nesdev->cqp.lock, flags); + + /* Arm the CCQ */ + nes_write32(nesdev->regs+NES_CQE_ALLOC, NES_CQE_ALLOC_NOTIFY_NEXT | + cq->cq_number); + nes_read32(nesdev->regs+NES_CQE_ALLOC); +} + +static u8 *locate_mpa(u8 *pkt, u32 aeq_info) +{ + if (aeq_info & NES_AEQE_Q2_DATA_ETHERNET) { + /* skip over ethernet header */ + pkt += ETH_HLEN; + + /* Skip over IP and TCP headers */ + pkt += 4 * (pkt[0] & 0x0f); + pkt += 4 * ((pkt[12] >> 4) & 0x0f); + } + return pkt; +} + +/* Determine if incoming error pkt is rdma layer */ +static u32 iwarp_opcode(struct nes_qp *nesqp, u32 aeq_info) +{ + u8 *pkt; + u16 *mpa; + u32 opcode = 0xffffffff; + + if (aeq_info & NES_AEQE_Q2_DATA_WRITTEN) { + pkt = nesqp->hwqp.q2_vbase + BAD_FRAME_OFFSET; + mpa = (u16 *)locate_mpa(pkt, aeq_info); + opcode = be16_to_cpu(mpa[1]) & 0xf; + } + + return opcode; +} + +/* Build iWARP terminate header */ +static int nes_bld_terminate_hdr(struct nes_qp *nesqp, u16 async_event_id, u32 aeq_info) +{ + u8 *pkt = nesqp->hwqp.q2_vbase + BAD_FRAME_OFFSET; + u16 ddp_seg_len; + int copy_len = 0; + u8 is_tagged = 0; + u8 flush_code = 0; + struct nes_terminate_hdr *termhdr; + + termhdr = (struct nes_terminate_hdr *)nesqp->hwqp.q2_vbase; + memset(termhdr, 0, 64); + + if (aeq_info & NES_AEQE_Q2_DATA_WRITTEN) { + + /* Use data from offending packet to fill in ddp & rdma hdrs */ + pkt = locate_mpa(pkt, aeq_info); + ddp_seg_len = be16_to_cpu(*(u16 *)pkt); + if (ddp_seg_len) { + copy_len = 2; + termhdr->hdrct = DDP_LEN_FLAG; + if (pkt[2] & 0x80) { + is_tagged = 1; + if (ddp_seg_len >= TERM_DDP_LEN_TAGGED) { + copy_len += TERM_DDP_LEN_TAGGED; + termhdr->hdrct |= DDP_HDR_FLAG; + } + } else { + if (ddp_seg_len >= TERM_DDP_LEN_UNTAGGED) { + copy_len += TERM_DDP_LEN_UNTAGGED; + termhdr->hdrct |= DDP_HDR_FLAG; + } + + if (ddp_seg_len >= (TERM_DDP_LEN_UNTAGGED + TERM_RDMA_LEN)) { + if ((pkt[3] & RDMA_OPCODE_MASK) == RDMA_READ_REQ_OPCODE) { + copy_len += TERM_RDMA_LEN; + termhdr->hdrct |= RDMA_HDR_FLAG; + } + } + } + } + } + + switch (async_event_id) { + case NES_AEQE_AEID_AMP_UNALLOCATED_STAG: + switch (iwarp_opcode(nesqp, aeq_info)) { + case IWARP_OPCODE_WRITE: + flush_code = IB_WC_LOC_PROT_ERR; + termhdr->layer_etype = (LAYER_DDP << 4) | DDP_TAGGED_BUFFER; + termhdr->error_code = DDP_TAGGED_INV_STAG; + break; + default: + flush_code = IB_WC_REM_ACCESS_ERR; + termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_PROT; + termhdr->error_code = RDMAP_INV_STAG; + } + break; + case NES_AEQE_AEID_AMP_INVALID_STAG: + flush_code = IB_WC_REM_ACCESS_ERR; + termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_PROT; + termhdr->error_code = RDMAP_INV_STAG; + break; + case NES_AEQE_AEID_AMP_BAD_QP: + flush_code = IB_WC_LOC_QP_OP_ERR; + termhdr->layer_etype = (LAYER_DDP << 4) | DDP_UNTAGGED_BUFFER; + termhdr->error_code = DDP_UNTAGGED_INV_QN; + break; + case NES_AEQE_AEID_AMP_BAD_STAG_KEY: + case NES_AEQE_AEID_AMP_BAD_STAG_INDEX: + switch (iwarp_opcode(nesqp, aeq_info)) { + case IWARP_OPCODE_SEND_INV: + case IWARP_OPCODE_SEND_SE_INV: + flush_code = IB_WC_REM_OP_ERR; + termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_OP; + termhdr->error_code = RDMAP_CANT_INV_STAG; + break; + default: + flush_code = IB_WC_REM_ACCESS_ERR; + termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_PROT; + termhdr->error_code = RDMAP_INV_STAG; + } + break; + case NES_AEQE_AEID_AMP_BOUNDS_VIOLATION: + if (aeq_info & (NES_AEQE_Q2_DATA_ETHERNET | NES_AEQE_Q2_DATA_MPA)) { + flush_code = IB_WC_LOC_PROT_ERR; + termhdr->layer_etype = (LAYER_DDP << 4) | DDP_TAGGED_BUFFER; + termhdr->error_code = DDP_TAGGED_BOUNDS; + } else { + flush_code = IB_WC_REM_ACCESS_ERR; + termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_PROT; + termhdr->error_code = RDMAP_INV_BOUNDS; + } + break; + case NES_AEQE_AEID_AMP_RIGHTS_VIOLATION: + case NES_AEQE_AEID_AMP_INVALIDATE_NO_REMOTE_ACCESS_RIGHTS: + case NES_AEQE_AEID_PRIV_OPERATION_DENIED: + flush_code = IB_WC_REM_ACCESS_ERR; + termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_PROT; + termhdr->error_code = RDMAP_ACCESS; + break; + case NES_AEQE_AEID_AMP_TO_WRAP: + flush_code = IB_WC_REM_ACCESS_ERR; + termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_PROT; + termhdr->error_code = RDMAP_TO_WRAP; + break; + case NES_AEQE_AEID_AMP_BAD_PD: + switch (iwarp_opcode(nesqp, aeq_info)) { + case IWARP_OPCODE_WRITE: + flush_code = IB_WC_LOC_PROT_ERR; + termhdr->layer_etype = (LAYER_DDP << 4) | DDP_TAGGED_BUFFER; + termhdr->error_code = DDP_TAGGED_UNASSOC_STAG; + break; + case IWARP_OPCODE_SEND_INV: + case IWARP_OPCODE_SEND_SE_INV: + flush_code = IB_WC_REM_ACCESS_ERR; + termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_PROT; + termhdr->error_code = RDMAP_CANT_INV_STAG; + break; + default: + flush_code = IB_WC_REM_ACCESS_ERR; + termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_PROT; + termhdr->error_code = RDMAP_UNASSOC_STAG; + } + break; + case NES_AEQE_AEID_LLP_RECEIVED_MARKER_AND_LENGTH_FIELDS_DONT_MATCH: + flush_code = IB_WC_LOC_LEN_ERR; + termhdr->layer_etype = (LAYER_MPA << 4) | DDP_LLP; + termhdr->error_code = MPA_MARKER; + break; + case NES_AEQE_AEID_LLP_RECEIVED_MPA_CRC_ERROR: + flush_code = IB_WC_GENERAL_ERR; + termhdr->layer_etype = (LAYER_MPA << 4) | DDP_LLP; + termhdr->error_code = MPA_CRC; + break; + case NES_AEQE_AEID_LLP_SEGMENT_TOO_LARGE: + case NES_AEQE_AEID_LLP_SEGMENT_TOO_SMALL: + flush_code = IB_WC_LOC_LEN_ERR; + termhdr->layer_etype = (LAYER_DDP << 4) | DDP_CATASTROPHIC; + termhdr->error_code = DDP_CATASTROPHIC_LOCAL; + break; + case NES_AEQE_AEID_DDP_LCE_LOCAL_CATASTROPHIC: + case NES_AEQE_AEID_DDP_NO_L_BIT: + flush_code = IB_WC_FATAL_ERR; + termhdr->layer_etype = (LAYER_DDP << 4) | DDP_CATASTROPHIC; + termhdr->error_code = DDP_CATASTROPHIC_LOCAL; + break; + case NES_AEQE_AEID_DDP_INVALID_MSN_GAP_IN_MSN: + case NES_AEQE_AEID_DDP_INVALID_MSN_RANGE_IS_NOT_VALID: + flush_code = IB_WC_GENERAL_ERR; + termhdr->layer_etype = (LAYER_DDP << 4) | DDP_UNTAGGED_BUFFER; + termhdr->error_code = DDP_UNTAGGED_INV_MSN_RANGE; + break; + case NES_AEQE_AEID_DDP_UBE_DDP_MESSAGE_TOO_LONG_FOR_AVAILABLE_BUFFER: + flush_code = IB_WC_LOC_LEN_ERR; + termhdr->layer_etype = (LAYER_DDP << 4) | DDP_UNTAGGED_BUFFER; + termhdr->error_code = DDP_UNTAGGED_INV_TOO_LONG; + break; + case NES_AEQE_AEID_DDP_UBE_INVALID_DDP_VERSION: + flush_code = IB_WC_GENERAL_ERR; + if (is_tagged) { + termhdr->layer_etype = (LAYER_DDP << 4) | DDP_TAGGED_BUFFER; + termhdr->error_code = DDP_TAGGED_INV_DDP_VER; + } else { + termhdr->layer_etype = (LAYER_DDP << 4) | DDP_UNTAGGED_BUFFER; + termhdr->error_code = DDP_UNTAGGED_INV_DDP_VER; + } + break; + case NES_AEQE_AEID_DDP_UBE_INVALID_MO: + flush_code = IB_WC_GENERAL_ERR; + termhdr->layer_etype = (LAYER_DDP << 4) | DDP_UNTAGGED_BUFFER; + termhdr->error_code = DDP_UNTAGGED_INV_MO; + break; + case NES_AEQE_AEID_DDP_UBE_INVALID_MSN_NO_BUFFER_AVAILABLE: + flush_code = IB_WC_REM_OP_ERR; + termhdr->layer_etype = (LAYER_DDP << 4) | DDP_UNTAGGED_BUFFER; + termhdr->error_code = DDP_UNTAGGED_INV_MSN_NO_BUF; + break; + case NES_AEQE_AEID_DDP_UBE_INVALID_QN: + flush_code = IB_WC_GENERAL_ERR; + termhdr->layer_etype = (LAYER_DDP << 4) | DDP_UNTAGGED_BUFFER; + termhdr->error_code = DDP_UNTAGGED_INV_QN; + break; + case NES_AEQE_AEID_RDMAP_ROE_INVALID_RDMAP_VERSION: + flush_code = IB_WC_GENERAL_ERR; + termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_OP; + termhdr->error_code = RDMAP_INV_RDMAP_VER; + break; + case NES_AEQE_AEID_RDMAP_ROE_UNEXPECTED_OPCODE: + flush_code = IB_WC_LOC_QP_OP_ERR; + termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_OP; + termhdr->error_code = RDMAP_UNEXPECTED_OP; + break; + default: + flush_code = IB_WC_FATAL_ERR; + termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_OP; + termhdr->error_code = RDMAP_UNSPECIFIED; + break; + } + + if (copy_len) + memcpy(termhdr + 1, pkt, copy_len); + + if ((flush_code) && ((NES_AEQE_INBOUND_RDMA & aeq_info) == 0)) { + if (aeq_info & NES_AEQE_SQ) + nesqp->term_sq_flush_code = flush_code; + else + nesqp->term_rq_flush_code = flush_code; + } + + return sizeof(struct nes_terminate_hdr) + copy_len; +} + +static void nes_terminate_connection(struct nes_device *nesdev, struct nes_qp *nesqp, + struct nes_hw_aeqe *aeqe, enum ib_event_type eventtype) +{ + u64 context; + unsigned long flags; + u32 aeq_info; + u16 async_event_id; + u8 tcp_state; + u8 iwarp_state; + u32 termlen = 0; + u32 mod_qp_flags = NES_CQP_QP_IWARP_STATE_TERMINATE | + NES_CQP_QP_TERM_DONT_SEND_FIN; + struct nes_adapter *nesadapter = nesdev->nesadapter; + + if (nesqp->term_flags & NES_TERM_SENT) + return; /* Sanity check */ + + aeq_info = le32_to_cpu(aeqe->aeqe_words[NES_AEQE_MISC_IDX]); + tcp_state = (aeq_info & NES_AEQE_TCP_STATE_MASK) >> NES_AEQE_TCP_STATE_SHIFT; + iwarp_state = (aeq_info & NES_AEQE_IWARP_STATE_MASK) >> NES_AEQE_IWARP_STATE_SHIFT; + async_event_id = (u16)aeq_info; + + context = (unsigned long)nesadapter->qp_table[le32_to_cpu( + aeqe->aeqe_words[NES_AEQE_COMP_QP_CQ_ID_IDX]) - NES_FIRST_QPN]; + if (!context) { + WARN_ON(!context); + return; + } + + nesqp = (struct nes_qp *)(unsigned long)context; + spin_lock_irqsave(&nesqp->lock, flags); + nesqp->hw_iwarp_state = iwarp_state; + nesqp->hw_tcp_state = tcp_state; + nesqp->last_aeq = async_event_id; + nesqp->terminate_eventtype = eventtype; + spin_unlock_irqrestore(&nesqp->lock, flags); + + if (nesadapter->send_term_ok) + termlen = nes_bld_terminate_hdr(nesqp, async_event_id, aeq_info); + else + mod_qp_flags |= NES_CQP_QP_TERM_DONT_SEND_TERM_MSG; + + if (!nesdev->iw_status) { + nesqp->term_flags = NES_TERM_DONE; + nes_hw_modify_qp(nesdev, nesqp, NES_CQP_QP_IWARP_STATE_ERROR, 0, 0); + nes_cm_disconn(nesqp); + } else { + nes_terminate_start_timer(nesqp); + nesqp->term_flags |= NES_TERM_SENT; + nes_hw_modify_qp(nesdev, nesqp, mod_qp_flags, termlen, 0); + } +} + +static void nes_terminate_send_fin(struct nes_device *nesdev, + struct nes_qp *nesqp, struct nes_hw_aeqe *aeqe) +{ + u32 aeq_info; + u16 async_event_id; + u8 tcp_state; + u8 iwarp_state; + unsigned long flags; + + aeq_info = le32_to_cpu(aeqe->aeqe_words[NES_AEQE_MISC_IDX]); + tcp_state = (aeq_info & NES_AEQE_TCP_STATE_MASK) >> NES_AEQE_TCP_STATE_SHIFT; + iwarp_state = (aeq_info & NES_AEQE_IWARP_STATE_MASK) >> NES_AEQE_IWARP_STATE_SHIFT; + async_event_id = (u16)aeq_info; + + spin_lock_irqsave(&nesqp->lock, flags); + nesqp->hw_iwarp_state = iwarp_state; + nesqp->hw_tcp_state = tcp_state; + nesqp->last_aeq = async_event_id; + spin_unlock_irqrestore(&nesqp->lock, flags); + + /* Send the fin only */ + nes_hw_modify_qp(nesdev, nesqp, NES_CQP_QP_IWARP_STATE_TERMINATE | + NES_CQP_QP_TERM_DONT_SEND_TERM_MSG, 0, 0); +} + +/* Cleanup after a terminate sent or received */ +static void nes_terminate_done(struct nes_qp *nesqp, int timeout_occurred) +{ + u32 next_iwarp_state = NES_CQP_QP_IWARP_STATE_ERROR; + unsigned long flags; + struct nes_vnic *nesvnic = to_nesvnic(nesqp->ibqp.device); + struct nes_device *nesdev = nesvnic->nesdev; + u8 first_time = 0; + + spin_lock_irqsave(&nesqp->lock, flags); + if (nesqp->hte_added) { + nesqp->hte_added = 0; + next_iwarp_state |= NES_CQP_QP_DEL_HTE; + } + + first_time = (nesqp->term_flags & NES_TERM_DONE) == 0; + nesqp->term_flags |= NES_TERM_DONE; + spin_unlock_irqrestore(&nesqp->lock, flags); + + /* Make sure we go through this only once */ + if (first_time) { + if (timeout_occurred == 0) + del_timer(&nesqp->terminate_timer); + else + next_iwarp_state |= NES_CQP_QP_RESET; + + nes_hw_modify_qp(nesdev, nesqp, next_iwarp_state, 0, 0); + nes_cm_disconn(nesqp); + } +} + +static void nes_terminate_received(struct nes_device *nesdev, + struct nes_qp *nesqp, struct nes_hw_aeqe *aeqe) +{ + u32 aeq_info; + u8 *pkt; + u32 *mpa; + u8 ddp_ctl; + u8 rdma_ctl; + u16 aeq_id = 0; + + aeq_info = le32_to_cpu(aeqe->aeqe_words[NES_AEQE_MISC_IDX]); + if (aeq_info & NES_AEQE_Q2_DATA_WRITTEN) { + /* Terminate is not a performance path so the silicon */ + /* did not validate the frame - do it now */ + pkt = nesqp->hwqp.q2_vbase + BAD_FRAME_OFFSET; + mpa = (u32 *)locate_mpa(pkt, aeq_info); + ddp_ctl = (be32_to_cpu(mpa[0]) >> 8) & 0xff; + rdma_ctl = be32_to_cpu(mpa[0]) & 0xff; + if ((ddp_ctl & 0xc0) != 0x40) + aeq_id = NES_AEQE_AEID_DDP_LCE_LOCAL_CATASTROPHIC; + else if ((ddp_ctl & 0x03) != 1) + aeq_id = NES_AEQE_AEID_DDP_UBE_INVALID_DDP_VERSION; + else if (be32_to_cpu(mpa[2]) != 2) + aeq_id = NES_AEQE_AEID_DDP_UBE_INVALID_QN; + else if (be32_to_cpu(mpa[3]) != 1) + aeq_id = NES_AEQE_AEID_DDP_INVALID_MSN_GAP_IN_MSN; + else if (be32_to_cpu(mpa[4]) != 0) + aeq_id = NES_AEQE_AEID_DDP_UBE_INVALID_MO; + else if ((rdma_ctl & 0xc0) != 0x40) + aeq_id = NES_AEQE_AEID_RDMAP_ROE_INVALID_RDMAP_VERSION; + + if (aeq_id) { + /* Bad terminate recvd - send back a terminate */ + aeq_info = (aeq_info & 0xffff0000) | aeq_id; + aeqe->aeqe_words[NES_AEQE_MISC_IDX] = cpu_to_le32(aeq_info); + nes_terminate_connection(nesdev, nesqp, aeqe, IB_EVENT_QP_FATAL); + return; + } + } + + nesqp->term_flags |= NES_TERM_RCVD; + nesqp->terminate_eventtype = IB_EVENT_QP_FATAL; + nes_terminate_start_timer(nesqp); + nes_terminate_send_fin(nesdev, nesqp, aeqe); +} + +/* Timeout routine in case terminate fails to complete */ +void nes_terminate_timeout(unsigned long context) +{ + struct nes_qp *nesqp = (struct nes_qp *)(unsigned long)context; + + nes_terminate_done(nesqp, 1); +} + +/* Set a timer in case hw cannot complete the terminate sequence */ +static void nes_terminate_start_timer(struct nes_qp *nesqp) +{ + mod_timer(&nesqp->terminate_timer, (jiffies + HZ)); +} + +/** + * nes_process_iwarp_aeqe + */ +static void nes_process_iwarp_aeqe(struct nes_device *nesdev, + struct nes_hw_aeqe *aeqe) +{ + u64 context; + unsigned long flags; + struct nes_qp *nesqp; + struct nes_hw_cq *hw_cq; + struct nes_cq *nescq; + int resource_allocated; + struct nes_adapter *nesadapter = nesdev->nesadapter; + u32 aeq_info; + u32 next_iwarp_state = 0; + u32 aeqe_cq_id; + u16 async_event_id; + u8 tcp_state; + u8 iwarp_state; + struct ib_event ibevent; + + nes_debug(NES_DBG_AEQ, "\n"); + aeq_info = le32_to_cpu(aeqe->aeqe_words[NES_AEQE_MISC_IDX]); + if ((NES_AEQE_INBOUND_RDMA & aeq_info) || (!(NES_AEQE_QP & aeq_info))) { + context = le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_CTXT_LOW_IDX]); + context += ((u64)le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_CTXT_HIGH_IDX])) << 32; + } else { + context = (unsigned long)nesadapter->qp_table[le32_to_cpu( + aeqe->aeqe_words[NES_AEQE_COMP_QP_CQ_ID_IDX]) - NES_FIRST_QPN]; + BUG_ON(!context); + } + + /* context is nesqp unless async_event_id == CQ ERROR */ + nesqp = (struct nes_qp *)(unsigned long)context; + async_event_id = (u16)aeq_info; + tcp_state = (aeq_info & NES_AEQE_TCP_STATE_MASK) >> NES_AEQE_TCP_STATE_SHIFT; + iwarp_state = (aeq_info & NES_AEQE_IWARP_STATE_MASK) >> NES_AEQE_IWARP_STATE_SHIFT; + nes_debug(NES_DBG_AEQ, "aeid = 0x%04X, qp-cq id = %d, aeqe = %p," + " Tcp state = %s, iWARP state = %s\n", + async_event_id, + le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_QP_CQ_ID_IDX]), aeqe, + nes_tcp_state_str[tcp_state], nes_iwarp_state_str[iwarp_state]); + + aeqe_cq_id = le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_QP_CQ_ID_IDX]); + if (aeq_info & NES_AEQE_QP) { + if (!nes_is_resource_allocated(nesadapter, + nesadapter->allocated_qps, + aeqe_cq_id)) + return; + } + + switch (async_event_id) { + case NES_AEQE_AEID_LLP_FIN_RECEIVED: + if (nesqp->term_flags) + return; /* Ignore it, wait for close complete */ + + if (atomic_inc_return(&nesqp->close_timer_started) == 1) { + if ((tcp_state == NES_AEQE_TCP_STATE_CLOSE_WAIT) && + (nesqp->ibqp_state == IB_QPS_RTS)) { + spin_lock_irqsave(&nesqp->lock, flags); + nesqp->hw_iwarp_state = iwarp_state; + nesqp->hw_tcp_state = tcp_state; + nesqp->last_aeq = async_event_id; + next_iwarp_state = NES_CQP_QP_IWARP_STATE_CLOSING; + nesqp->hw_iwarp_state = NES_AEQE_IWARP_STATE_CLOSING; + spin_unlock_irqrestore(&nesqp->lock, flags); + nes_hw_modify_qp(nesdev, nesqp, next_iwarp_state, 0, 0); + nes_cm_disconn(nesqp); + } + nesqp->cm_id->add_ref(nesqp->cm_id); + schedule_nes_timer(nesqp->cm_node, (struct sk_buff *)nesqp, + NES_TIMER_TYPE_CLOSE, 1, 0); + nes_debug(NES_DBG_AEQ, "QP%u Not decrementing QP refcount (%d)," + " need ae to finish up, original_last_aeq = 0x%04X." + " last_aeq = 0x%04X, scheduling timer. TCP state = %d\n", + nesqp->hwqp.qp_id, atomic_read(&nesqp->refcount), + async_event_id, nesqp->last_aeq, tcp_state); + } + break; + case NES_AEQE_AEID_LLP_CLOSE_COMPLETE: + spin_lock_irqsave(&nesqp->lock, flags); + nesqp->hw_iwarp_state = iwarp_state; + nesqp->hw_tcp_state = tcp_state; + nesqp->last_aeq = async_event_id; + spin_unlock_irqrestore(&nesqp->lock, flags); + nes_cm_disconn(nesqp); + break; + + case NES_AEQE_AEID_RESET_SENT: + tcp_state = NES_AEQE_TCP_STATE_CLOSED; + spin_lock_irqsave(&nesqp->lock, flags); + nesqp->hw_iwarp_state = iwarp_state; + nesqp->hw_tcp_state = tcp_state; + nesqp->last_aeq = async_event_id; + nesqp->hte_added = 0; + spin_unlock_irqrestore(&nesqp->lock, flags); + next_iwarp_state = NES_CQP_QP_IWARP_STATE_ERROR | NES_CQP_QP_DEL_HTE; + nes_hw_modify_qp(nesdev, nesqp, next_iwarp_state, 0, 0); + nes_cm_disconn(nesqp); + break; + + case NES_AEQE_AEID_LLP_CONNECTION_RESET: + if (atomic_read(&nesqp->close_timer_started)) + return; + spin_lock_irqsave(&nesqp->lock, flags); + nesqp->hw_iwarp_state = iwarp_state; + nesqp->hw_tcp_state = tcp_state; + nesqp->last_aeq = async_event_id; + spin_unlock_irqrestore(&nesqp->lock, flags); + nes_cm_disconn(nesqp); + break; + + case NES_AEQE_AEID_TERMINATE_SENT: + nes_terminate_send_fin(nesdev, nesqp, aeqe); + break; + + case NES_AEQE_AEID_LLP_TERMINATE_RECEIVED: + nes_terminate_received(nesdev, nesqp, aeqe); + break; + + case NES_AEQE_AEID_AMP_BAD_STAG_KEY: + case NES_AEQE_AEID_AMP_BAD_STAG_INDEX: + case NES_AEQE_AEID_AMP_UNALLOCATED_STAG: + case NES_AEQE_AEID_AMP_INVALID_STAG: + case NES_AEQE_AEID_AMP_RIGHTS_VIOLATION: + case NES_AEQE_AEID_AMP_INVALIDATE_NO_REMOTE_ACCESS_RIGHTS: + case NES_AEQE_AEID_PRIV_OPERATION_DENIED: + case NES_AEQE_AEID_DDP_UBE_DDP_MESSAGE_TOO_LONG_FOR_AVAILABLE_BUFFER: + case NES_AEQE_AEID_AMP_BOUNDS_VIOLATION: + case NES_AEQE_AEID_AMP_TO_WRAP: + printk(KERN_ERR PFX "QP[%u] async_event_id=0x%04X IB_EVENT_QP_ACCESS_ERR\n", + nesqp->hwqp.qp_id, async_event_id); + nes_terminate_connection(nesdev, nesqp, aeqe, IB_EVENT_QP_ACCESS_ERR); + break; + + case NES_AEQE_AEID_LLP_SEGMENT_TOO_LARGE: + case NES_AEQE_AEID_LLP_SEGMENT_TOO_SMALL: + case NES_AEQE_AEID_DDP_UBE_INVALID_MO: + case NES_AEQE_AEID_DDP_UBE_INVALID_QN: + if (iwarp_opcode(nesqp, aeq_info) > IWARP_OPCODE_TERM) { + aeq_info &= 0xffff0000; + aeq_info |= NES_AEQE_AEID_RDMAP_ROE_UNEXPECTED_OPCODE; + aeqe->aeqe_words[NES_AEQE_MISC_IDX] = cpu_to_le32(aeq_info); + } + + case NES_AEQE_AEID_RDMAP_ROE_BAD_LLP_CLOSE: + case NES_AEQE_AEID_LLP_TOO_MANY_RETRIES: + case NES_AEQE_AEID_DDP_UBE_INVALID_MSN_NO_BUFFER_AVAILABLE: + case NES_AEQE_AEID_LLP_RECEIVED_MPA_CRC_ERROR: + case NES_AEQE_AEID_AMP_BAD_QP: + case NES_AEQE_AEID_LLP_RECEIVED_MARKER_AND_LENGTH_FIELDS_DONT_MATCH: + case NES_AEQE_AEID_DDP_LCE_LOCAL_CATASTROPHIC: + case NES_AEQE_AEID_DDP_NO_L_BIT: + case NES_AEQE_AEID_DDP_INVALID_MSN_GAP_IN_MSN: + case NES_AEQE_AEID_DDP_INVALID_MSN_RANGE_IS_NOT_VALID: + case NES_AEQE_AEID_DDP_UBE_INVALID_DDP_VERSION: + case NES_AEQE_AEID_RDMAP_ROE_INVALID_RDMAP_VERSION: + case NES_AEQE_AEID_RDMAP_ROE_UNEXPECTED_OPCODE: + case NES_AEQE_AEID_AMP_BAD_PD: + case NES_AEQE_AEID_AMP_FASTREG_SHARED: + case NES_AEQE_AEID_AMP_FASTREG_VALID_STAG: + case NES_AEQE_AEID_AMP_FASTREG_MW_STAG: + case NES_AEQE_AEID_AMP_FASTREG_INVALID_RIGHTS: + case NES_AEQE_AEID_AMP_FASTREG_PBL_TABLE_OVERFLOW: + case NES_AEQE_AEID_AMP_FASTREG_INVALID_LENGTH: + case NES_AEQE_AEID_AMP_INVALIDATE_SHARED: + case NES_AEQE_AEID_AMP_INVALIDATE_MR_WITH_BOUND_WINDOWS: + case NES_AEQE_AEID_AMP_MWBIND_VALID_STAG: + case NES_AEQE_AEID_AMP_MWBIND_OF_MR_STAG: + case NES_AEQE_AEID_AMP_MWBIND_TO_ZERO_BASED_STAG: + case NES_AEQE_AEID_AMP_MWBIND_TO_MW_STAG: + case NES_AEQE_AEID_AMP_MWBIND_INVALID_RIGHTS: + case NES_AEQE_AEID_AMP_MWBIND_INVALID_BOUNDS: + case NES_AEQE_AEID_AMP_MWBIND_TO_INVALID_PARENT: + case NES_AEQE_AEID_AMP_MWBIND_BIND_DISABLED: + case NES_AEQE_AEID_BAD_CLOSE: + case NES_AEQE_AEID_RDMA_READ_WHILE_ORD_ZERO: + case NES_AEQE_AEID_STAG_ZERO_INVALID: + case NES_AEQE_AEID_ROE_INVALID_RDMA_READ_REQUEST: + case NES_AEQE_AEID_ROE_INVALID_RDMA_WRITE_OR_READ_RESP: + printk(KERN_ERR PFX "QP[%u] async_event_id=0x%04X IB_EVENT_QP_FATAL\n", + nesqp->hwqp.qp_id, async_event_id); + print_ip(nesqp->cm_node); + if (!atomic_read(&nesqp->close_timer_started)) + nes_terminate_connection(nesdev, nesqp, aeqe, IB_EVENT_QP_FATAL); + break; + + case NES_AEQE_AEID_CQ_OPERATION_ERROR: + context <<= 1; + nes_debug(NES_DBG_AEQ, "Processing an NES_AEQE_AEID_CQ_OPERATION_ERROR event on CQ%u, %p\n", + le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_QP_CQ_ID_IDX]), (void *)(unsigned long)context); + resource_allocated = nes_is_resource_allocated(nesadapter, nesadapter->allocated_cqs, + le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_QP_CQ_ID_IDX])); + if (resource_allocated) { + printk(KERN_ERR PFX "%s: Processing an NES_AEQE_AEID_CQ_OPERATION_ERROR event on CQ%u\n", + __func__, le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_QP_CQ_ID_IDX])); + hw_cq = (struct nes_hw_cq *)(unsigned long)context; + if (hw_cq) { + nescq = container_of(hw_cq, struct nes_cq, hw_cq); + if (nescq->ibcq.event_handler) { + ibevent.device = nescq->ibcq.device; + ibevent.event = IB_EVENT_CQ_ERR; + ibevent.element.cq = &nescq->ibcq; + nescq->ibcq.event_handler(&ibevent, nescq->ibcq.cq_context); + } + } + } + break; + + default: + nes_debug(NES_DBG_AEQ, "Processing an iWARP related AE for QP, misc = 0x%04X\n", + async_event_id); + break; + } + +} + +/** + * nes_iwarp_ce_handler + */ +void nes_iwarp_ce_handler(struct nes_device *nesdev, struct nes_hw_cq *hw_cq) +{ + struct nes_cq *nescq = container_of(hw_cq, struct nes_cq, hw_cq); + + /* nes_debug(NES_DBG_CQ, "Processing completion event for iWARP CQ%u.\n", + nescq->hw_cq.cq_number); */ + nes_write32(nesdev->regs+NES_CQ_ACK, nescq->hw_cq.cq_number); + + if (nescq->ibcq.comp_handler) + nescq->ibcq.comp_handler(&nescq->ibcq, nescq->ibcq.cq_context); + + return; +} + + +/** + * nes_manage_apbvt() + */ +int nes_manage_apbvt(struct nes_vnic *nesvnic, u32 accel_local_port, + u32 nic_index, u32 add_port) +{ + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_hw_cqp_wqe *cqp_wqe; + struct nes_cqp_request *cqp_request; + int ret = 0; + u16 major_code; + + /* Send manage APBVT request to CQP */ + cqp_request = nes_get_cqp_request(nesdev); + if (cqp_request == NULL) { + nes_debug(NES_DBG_QP, "Failed to get a cqp_request.\n"); + return -ENOMEM; + } + cqp_request->waiting = 1; + cqp_wqe = &cqp_request->cqp_wqe; + + nes_debug(NES_DBG_QP, "%s APBV for local port=%u(0x%04x), nic_index=%u\n", + (add_port == NES_MANAGE_APBVT_ADD) ? "ADD" : "DEL", + accel_local_port, accel_local_port, nic_index); + + nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, (NES_CQP_MANAGE_APBVT | + ((add_port == NES_MANAGE_APBVT_ADD) ? NES_CQP_APBVT_ADD : 0))); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX, + ((nic_index << NES_CQP_APBVT_NIC_SHIFT) | accel_local_port)); + + nes_debug(NES_DBG_QP, "Waiting for CQP completion for APBVT.\n"); + + atomic_set(&cqp_request->refcount, 2); + nes_post_cqp_request(nesdev, cqp_request); + + if (add_port == NES_MANAGE_APBVT_ADD) + ret = wait_event_timeout(cqp_request->waitq, (cqp_request->request_done != 0), + NES_EVENT_TIMEOUT); + nes_debug(NES_DBG_QP, "Completed, ret=%u, CQP Major:Minor codes = 0x%04X:0x%04X\n", + ret, cqp_request->major_code, cqp_request->minor_code); + major_code = cqp_request->major_code; + + nes_put_cqp_request(nesdev, cqp_request); + + if (!ret) + return -ETIME; + else if (major_code) + return -EIO; + else + return 0; +} + + +/** + * nes_manage_arp_cache + */ +void nes_manage_arp_cache(struct net_device *netdev, unsigned char *mac_addr, + u32 ip_addr, u32 action) +{ + struct nes_hw_cqp_wqe *cqp_wqe; + struct nes_vnic *nesvnic = netdev_priv(netdev); + struct nes_device *nesdev; + struct nes_cqp_request *cqp_request; + int arp_index; + + nesdev = nesvnic->nesdev; + arp_index = nes_arp_table(nesdev, ip_addr, mac_addr, action); + if (arp_index == -1) { + return; + } + + /* update the ARP entry */ + cqp_request = nes_get_cqp_request(nesdev); + if (cqp_request == NULL) { + nes_debug(NES_DBG_NETDEV, "Failed to get a cqp_request.\n"); + return; + } + cqp_request->waiting = 0; + cqp_wqe = &cqp_request->cqp_wqe; + nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + + cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] = cpu_to_le32( + NES_CQP_MANAGE_ARP_CACHE | NES_CQP_ARP_PERM); + cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] |= cpu_to_le32( + (u32)PCI_FUNC(nesdev->pcidev->devfn) << NES_CQP_ARP_AEQ_INDEX_SHIFT); + cqp_wqe->wqe_words[NES_CQP_WQE_ID_IDX] = cpu_to_le32(arp_index); + + if (action == NES_ARP_ADD) { + cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] |= cpu_to_le32(NES_CQP_ARP_VALID); + cqp_wqe->wqe_words[NES_CQP_ARP_WQE_MAC_ADDR_LOW_IDX] = cpu_to_le32( + (((u32)mac_addr[2]) << 24) | (((u32)mac_addr[3]) << 16) | + (((u32)mac_addr[4]) << 8) | (u32)mac_addr[5]); + cqp_wqe->wqe_words[NES_CQP_ARP_WQE_MAC_HIGH_IDX] = cpu_to_le32( + (((u32)mac_addr[0]) << 16) | (u32)mac_addr[1]); + } else { + cqp_wqe->wqe_words[NES_CQP_ARP_WQE_MAC_ADDR_LOW_IDX] = 0; + cqp_wqe->wqe_words[NES_CQP_ARP_WQE_MAC_HIGH_IDX] = 0; + } + + nes_debug(NES_DBG_NETDEV, "Not waiting for CQP, cqp.sq_head=%u, cqp.sq_tail=%u\n", + nesdev->cqp.sq_head, nesdev->cqp.sq_tail); + + atomic_set(&cqp_request->refcount, 1); + nes_post_cqp_request(nesdev, cqp_request); +} + + +/** + * flush_wqes + */ +void flush_wqes(struct nes_device *nesdev, struct nes_qp *nesqp, + u32 which_wq, u32 wait_completion) +{ + struct nes_cqp_request *cqp_request; + struct nes_hw_cqp_wqe *cqp_wqe; + u32 sq_code = (NES_IWARP_CQE_MAJOR_FLUSH << 16) | NES_IWARP_CQE_MINOR_FLUSH; + u32 rq_code = (NES_IWARP_CQE_MAJOR_FLUSH << 16) | NES_IWARP_CQE_MINOR_FLUSH; + int ret; + + cqp_request = nes_get_cqp_request(nesdev); + if (cqp_request == NULL) { + nes_debug(NES_DBG_QP, "Failed to get a cqp_request.\n"); + return; + } + if (wait_completion) { + cqp_request->waiting = 1; + atomic_set(&cqp_request->refcount, 2); + } else { + cqp_request->waiting = 0; + } + cqp_wqe = &cqp_request->cqp_wqe; + nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + + /* If wqe in error was identified, set code to be put into cqe */ + if ((nesqp->term_sq_flush_code) && (which_wq & NES_CQP_FLUSH_SQ)) { + which_wq |= NES_CQP_FLUSH_MAJ_MIN; + sq_code = (CQE_MAJOR_DRV << 16) | nesqp->term_sq_flush_code; + nesqp->term_sq_flush_code = 0; + } + + if ((nesqp->term_rq_flush_code) && (which_wq & NES_CQP_FLUSH_RQ)) { + which_wq |= NES_CQP_FLUSH_MAJ_MIN; + rq_code = (CQE_MAJOR_DRV << 16) | nesqp->term_rq_flush_code; + nesqp->term_rq_flush_code = 0; + } + + if (which_wq & NES_CQP_FLUSH_MAJ_MIN) { + cqp_wqe->wqe_words[NES_CQP_QP_WQE_FLUSH_SQ_CODE] = cpu_to_le32(sq_code); + cqp_wqe->wqe_words[NES_CQP_QP_WQE_FLUSH_RQ_CODE] = cpu_to_le32(rq_code); + } + + cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] = + cpu_to_le32(NES_CQP_FLUSH_WQES | which_wq); + cqp_wqe->wqe_words[NES_CQP_WQE_ID_IDX] = cpu_to_le32(nesqp->hwqp.qp_id); + + nes_post_cqp_request(nesdev, cqp_request); + + if (wait_completion) { + /* Wait for CQP */ + ret = wait_event_timeout(cqp_request->waitq, (cqp_request->request_done != 0), + NES_EVENT_TIMEOUT); + nes_debug(NES_DBG_QP, "Flush SQ QP WQEs completed, ret=%u," + " CQP Major:Minor codes = 0x%04X:0x%04X\n", + ret, cqp_request->major_code, cqp_request->minor_code); + nes_put_cqp_request(nesdev, cqp_request); + } +} diff --git a/drivers/infiniband/hw/nes/nes_hw.h b/drivers/infiniband/hw/nes/nes_hw.h new file mode 100644 index 0000000..d748e4b --- /dev/null +++ b/drivers/infiniband/hw/nes/nes_hw.h @@ -0,0 +1,1392 @@ +/* +* Copyright (c) 2006 - 2011 Intel Corporation. All rights reserved. +* +* This software is available to you under a choice of one of two +* licenses. You may choose to be licensed under the terms of the GNU +* General Public License (GPL) Version 2, available from the file +* COPYING in the main directory of this source tree, or the +* OpenIB.org BSD license below: +* +* Redistribution and use in source and binary forms, with or +* without modification, are permitted provided that the following +* conditions are met: +* +* - Redistributions of source code must retain the above +* copyright notice, this list of conditions and the following +* disclaimer. +* +* - Redistributions in binary form must reproduce the above +* copyright notice, this list of conditions and the following +* disclaimer in the documentation and/or other materials +* provided with the distribution. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. +*/ + +#ifndef __NES_HW_H +#define __NES_HW_H + +#include + +#define NES_PHY_TYPE_CX4 1 +#define NES_PHY_TYPE_1G 2 +#define NES_PHY_TYPE_ARGUS 4 +#define NES_PHY_TYPE_PUMA_1G 5 +#define NES_PHY_TYPE_PUMA_10G 6 +#define NES_PHY_TYPE_GLADIUS 7 +#define NES_PHY_TYPE_SFP_D 8 +#define NES_PHY_TYPE_KR 9 + +#define NES_MULTICAST_PF_MAX 8 +#define NES_A0 3 + +#define NES_ENABLE_PAU 0x07000001 +#define NES_DISABLE_PAU 0x07000000 +#define NES_PAU_COUNTER 10 +#define NES_CQP_OPCODE_MASK 0x3f + +enum pci_regs { + NES_INT_STAT = 0x0000, + NES_INT_MASK = 0x0004, + NES_INT_PENDING = 0x0008, + NES_INTF_INT_STAT = 0x000C, + NES_INTF_INT_MASK = 0x0010, + NES_TIMER_STAT = 0x0014, + NES_PERIODIC_CONTROL = 0x0018, + NES_ONE_SHOT_CONTROL = 0x001C, + NES_EEPROM_COMMAND = 0x0020, + NES_EEPROM_DATA = 0x0024, + NES_FLASH_COMMAND = 0x0028, + NES_FLASH_DATA = 0x002C, + NES_SOFTWARE_RESET = 0x0030, + NES_CQ_ACK = 0x0034, + NES_WQE_ALLOC = 0x0040, + NES_CQE_ALLOC = 0x0044, + NES_AEQ_ALLOC = 0x0048 +}; + +enum indexed_regs { + NES_IDX_CREATE_CQP_LOW = 0x0000, + NES_IDX_CREATE_CQP_HIGH = 0x0004, + NES_IDX_QP_CONTROL = 0x0040, + NES_IDX_FLM_CONTROL = 0x0080, + NES_IDX_INT_CPU_STATUS = 0x00a0, + NES_IDX_GPR_TRIGGER = 0x00bc, + NES_IDX_GPIO_CONTROL = 0x00f0, + NES_IDX_GPIO_DATA = 0x00f4, + NES_IDX_GPR2 = 0x010c, + NES_IDX_TCP_CONFIG0 = 0x01e4, + NES_IDX_TCP_TIMER_CONFIG = 0x01ec, + NES_IDX_TCP_NOW = 0x01f0, + NES_IDX_QP_MAX_CFG_SIZES = 0x0200, + NES_IDX_QP_CTX_SIZE = 0x0218, + NES_IDX_TCP_TIMER_SIZE0 = 0x0238, + NES_IDX_TCP_TIMER_SIZE1 = 0x0240, + NES_IDX_ARP_CACHE_SIZE = 0x0258, + NES_IDX_CQ_CTX_SIZE = 0x0260, + NES_IDX_MRT_SIZE = 0x0278, + NES_IDX_PBL_REGION_SIZE = 0x0280, + NES_IDX_IRRQ_COUNT = 0x02b0, + NES_IDX_RX_WINDOW_BUFFER_PAGE_TABLE_SIZE = 0x02f0, + NES_IDX_RX_WINDOW_BUFFER_SIZE = 0x0300, + NES_IDX_DST_IP_ADDR = 0x0400, + NES_IDX_PCIX_DIAG = 0x08e8, + NES_IDX_MPP_DEBUG = 0x0a00, + NES_IDX_PORT_RX_DISCARDS = 0x0a30, + NES_IDX_PORT_TX_DISCARDS = 0x0a34, + NES_IDX_MPP_LB_DEBUG = 0x0b00, + NES_IDX_DENALI_CTL_22 = 0x1058, + NES_IDX_MAC_TX_CONTROL = 0x2000, + NES_IDX_MAC_TX_CONFIG = 0x2004, + NES_IDX_MAC_TX_PAUSE_QUANTA = 0x2008, + NES_IDX_MAC_RX_CONTROL = 0x200c, + NES_IDX_MAC_RX_CONFIG = 0x2010, + NES_IDX_MAC_EXACT_MATCH_BOTTOM = 0x201c, + NES_IDX_MAC_MDIO_CONTROL = 0x2084, + NES_IDX_MAC_TX_OCTETS_LOW = 0x2100, + NES_IDX_MAC_TX_OCTETS_HIGH = 0x2104, + NES_IDX_MAC_TX_FRAMES_LOW = 0x2108, + NES_IDX_MAC_TX_FRAMES_HIGH = 0x210c, + NES_IDX_MAC_TX_PAUSE_FRAMES = 0x2118, + NES_IDX_MAC_TX_ERRORS = 0x2138, + NES_IDX_MAC_RX_OCTETS_LOW = 0x213c, + NES_IDX_MAC_RX_OCTETS_HIGH = 0x2140, + NES_IDX_MAC_RX_FRAMES_LOW = 0x2144, + NES_IDX_MAC_RX_FRAMES_HIGH = 0x2148, + NES_IDX_MAC_RX_BC_FRAMES_LOW = 0x214c, + NES_IDX_MAC_RX_MC_FRAMES_HIGH = 0x2150, + NES_IDX_MAC_RX_PAUSE_FRAMES = 0x2154, + NES_IDX_MAC_RX_SHORT_FRAMES = 0x2174, + NES_IDX_MAC_RX_OVERSIZED_FRAMES = 0x2178, + NES_IDX_MAC_RX_JABBER_FRAMES = 0x217c, + NES_IDX_MAC_RX_CRC_ERR_FRAMES = 0x2180, + NES_IDX_MAC_RX_LENGTH_ERR_FRAMES = 0x2184, + NES_IDX_MAC_RX_SYMBOL_ERR_FRAMES = 0x2188, + NES_IDX_MAC_INT_STATUS = 0x21f0, + NES_IDX_MAC_INT_MASK = 0x21f4, + NES_IDX_PHY_PCS_CONTROL_STATUS0 = 0x2800, + NES_IDX_PHY_PCS_CONTROL_STATUS1 = 0x2a00, + NES_IDX_ETH_SERDES_COMMON_CONTROL0 = 0x2808, + NES_IDX_ETH_SERDES_COMMON_CONTROL1 = 0x2a08, + NES_IDX_ETH_SERDES_COMMON_STATUS0 = 0x280c, + NES_IDX_ETH_SERDES_COMMON_STATUS1 = 0x2a0c, + NES_IDX_ETH_SERDES_TX_EMP0 = 0x2810, + NES_IDX_ETH_SERDES_TX_EMP1 = 0x2a10, + NES_IDX_ETH_SERDES_TX_DRIVE0 = 0x2814, + NES_IDX_ETH_SERDES_TX_DRIVE1 = 0x2a14, + NES_IDX_ETH_SERDES_RX_MODE0 = 0x2818, + NES_IDX_ETH_SERDES_RX_MODE1 = 0x2a18, + NES_IDX_ETH_SERDES_RX_SIGDET0 = 0x281c, + NES_IDX_ETH_SERDES_RX_SIGDET1 = 0x2a1c, + NES_IDX_ETH_SERDES_BYPASS0 = 0x2820, + NES_IDX_ETH_SERDES_BYPASS1 = 0x2a20, + NES_IDX_ETH_SERDES_LOOPBACK_CONTROL0 = 0x2824, + NES_IDX_ETH_SERDES_LOOPBACK_CONTROL1 = 0x2a24, + NES_IDX_ETH_SERDES_RX_EQ_CONTROL0 = 0x2828, + NES_IDX_ETH_SERDES_RX_EQ_CONTROL1 = 0x2a28, + NES_IDX_ETH_SERDES_RX_EQ_STATUS0 = 0x282c, + NES_IDX_ETH_SERDES_RX_EQ_STATUS1 = 0x2a2c, + NES_IDX_ETH_SERDES_CDR_RESET0 = 0x2830, + NES_IDX_ETH_SERDES_CDR_RESET1 = 0x2a30, + NES_IDX_ETH_SERDES_CDR_CONTROL0 = 0x2834, + NES_IDX_ETH_SERDES_CDR_CONTROL1 = 0x2a34, + NES_IDX_ETH_SERDES_TX_HIGHZ_LANE_MODE0 = 0x2838, + NES_IDX_ETH_SERDES_TX_HIGHZ_LANE_MODE1 = 0x2a38, + NES_IDX_ENDNODE0_NSTAT_RX_DISCARD = 0x3080, + NES_IDX_ENDNODE0_NSTAT_RX_OCTETS_LO = 0x3000, + NES_IDX_ENDNODE0_NSTAT_RX_OCTETS_HI = 0x3004, + NES_IDX_ENDNODE0_NSTAT_RX_FRAMES_LO = 0x3008, + NES_IDX_ENDNODE0_NSTAT_RX_FRAMES_HI = 0x300c, + NES_IDX_ENDNODE0_NSTAT_TX_OCTETS_LO = 0x7000, + NES_IDX_ENDNODE0_NSTAT_TX_OCTETS_HI = 0x7004, + NES_IDX_ENDNODE0_NSTAT_TX_FRAMES_LO = 0x7008, + NES_IDX_ENDNODE0_NSTAT_TX_FRAMES_HI = 0x700c, + NES_IDX_WQM_CONFIG0 = 0x5000, + NES_IDX_WQM_CONFIG1 = 0x5004, + NES_IDX_CM_CONFIG = 0x5100, + NES_IDX_NIC_LOGPORT_TO_PHYPORT = 0x6000, + NES_IDX_NIC_PHYPORT_TO_USW = 0x6008, + NES_IDX_NIC_ACTIVE = 0x6010, + NES_IDX_NIC_UNICAST_ALL = 0x6018, + NES_IDX_NIC_MULTICAST_ALL = 0x6020, + NES_IDX_NIC_MULTICAST_ENABLE = 0x6028, + NES_IDX_NIC_BROADCAST_ON = 0x6030, + NES_IDX_USED_CHUNKS_TX = 0x60b0, + NES_IDX_TX_POOL_SIZE = 0x60b8, + NES_IDX_QUAD_HASH_TABLE_SIZE = 0x6148, + NES_IDX_PERFECT_FILTER_LOW = 0x6200, + NES_IDX_PERFECT_FILTER_HIGH = 0x6204, + NES_IDX_IPV4_TCP_REXMITS = 0x7080, + NES_IDX_DEBUG_ERROR_CONTROL_STATUS = 0x913c, + NES_IDX_DEBUG_ERROR_MASKS0 = 0x9140, + NES_IDX_DEBUG_ERROR_MASKS1 = 0x9144, + NES_IDX_DEBUG_ERROR_MASKS2 = 0x9148, + NES_IDX_DEBUG_ERROR_MASKS3 = 0x914c, + NES_IDX_DEBUG_ERROR_MASKS4 = 0x9150, + NES_IDX_DEBUG_ERROR_MASKS5 = 0x9154, +}; + +#define NES_IDX_MAC_TX_CONFIG_ENABLE_PAUSE 1 +#define NES_IDX_MPP_DEBUG_PORT_DISABLE_PAUSE (1 << 17) + +enum nes_cqp_opcodes { + NES_CQP_CREATE_QP = 0x00, + NES_CQP_MODIFY_QP = 0x01, + NES_CQP_DESTROY_QP = 0x02, + NES_CQP_CREATE_CQ = 0x03, + NES_CQP_MODIFY_CQ = 0x04, + NES_CQP_DESTROY_CQ = 0x05, + NES_CQP_ALLOCATE_STAG = 0x09, + NES_CQP_REGISTER_STAG = 0x0a, + NES_CQP_QUERY_STAG = 0x0b, + NES_CQP_REGISTER_SHARED_STAG = 0x0c, + NES_CQP_DEALLOCATE_STAG = 0x0d, + NES_CQP_MANAGE_ARP_CACHE = 0x0f, + NES_CQP_DOWNLOAD_SEGMENT = 0x10, + NES_CQP_SUSPEND_QPS = 0x11, + NES_CQP_UPLOAD_CONTEXT = 0x13, + NES_CQP_CREATE_CEQ = 0x16, + NES_CQP_DESTROY_CEQ = 0x18, + NES_CQP_CREATE_AEQ = 0x19, + NES_CQP_DESTROY_AEQ = 0x1b, + NES_CQP_LMI_ACCESS = 0x20, + NES_CQP_FLUSH_WQES = 0x22, + NES_CQP_MANAGE_APBVT = 0x23, + NES_CQP_MANAGE_QUAD_HASH = 0x25 +}; + +enum nes_cqp_wqe_word_idx { + NES_CQP_WQE_OPCODE_IDX = 0, + NES_CQP_WQE_ID_IDX = 1, + NES_CQP_WQE_COMP_CTX_LOW_IDX = 2, + NES_CQP_WQE_COMP_CTX_HIGH_IDX = 3, + NES_CQP_WQE_COMP_SCRATCH_LOW_IDX = 4, + NES_CQP_WQE_COMP_SCRATCH_HIGH_IDX = 5, +}; + +enum nes_cqp_wqe_word_download_idx { /* format differs from other cqp ops */ + NES_CQP_WQE_DL_OPCODE_IDX = 0, + NES_CQP_WQE_DL_COMP_CTX_LOW_IDX = 1, + NES_CQP_WQE_DL_COMP_CTX_HIGH_IDX = 2, + NES_CQP_WQE_DL_LENGTH_0_TOTAL_IDX = 3 + /* For index values 4-15 use NES_NIC_SQ_WQE_ values */ +}; + +enum nes_cqp_cq_wqeword_idx { + NES_CQP_CQ_WQE_PBL_LOW_IDX = 6, + NES_CQP_CQ_WQE_PBL_HIGH_IDX = 7, + NES_CQP_CQ_WQE_CQ_CONTEXT_LOW_IDX = 8, + NES_CQP_CQ_WQE_CQ_CONTEXT_HIGH_IDX = 9, + NES_CQP_CQ_WQE_DOORBELL_INDEX_HIGH_IDX = 10, +}; + +enum nes_cqp_stag_wqeword_idx { + NES_CQP_STAG_WQE_PBL_BLK_COUNT_IDX = 1, + NES_CQP_STAG_WQE_LEN_HIGH_PD_IDX = 6, + NES_CQP_STAG_WQE_LEN_LOW_IDX = 7, + NES_CQP_STAG_WQE_STAG_IDX = 8, + NES_CQP_STAG_WQE_VA_LOW_IDX = 10, + NES_CQP_STAG_WQE_VA_HIGH_IDX = 11, + NES_CQP_STAG_WQE_PA_LOW_IDX = 12, + NES_CQP_STAG_WQE_PA_HIGH_IDX = 13, + NES_CQP_STAG_WQE_PBL_LEN_IDX = 14 +}; + +#define NES_CQP_OP_LOGICAL_PORT_SHIFT 26 +#define NES_CQP_OP_IWARP_STATE_SHIFT 28 +#define NES_CQP_OP_TERMLEN_SHIFT 28 + +enum nes_cqp_qp_bits { + NES_CQP_QP_ARP_VALID = (1<<8), + NES_CQP_QP_WINBUF_VALID = (1<<9), + NES_CQP_QP_CONTEXT_VALID = (1<<10), + NES_CQP_QP_ORD_VALID = (1<<11), + NES_CQP_QP_WINBUF_DATAIND_EN = (1<<12), + NES_CQP_QP_VIRT_WQS = (1<<13), + NES_CQP_QP_DEL_HTE = (1<<14), + NES_CQP_QP_CQS_VALID = (1<<15), + NES_CQP_QP_TYPE_TSA = 0, + NES_CQP_QP_TYPE_IWARP = (1<<16), + NES_CQP_QP_TYPE_CQP = (4<<16), + NES_CQP_QP_TYPE_NIC = (5<<16), + NES_CQP_QP_MSS_CHG = (1<<20), + NES_CQP_QP_STATIC_RESOURCES = (1<<21), + NES_CQP_QP_IGNORE_MW_BOUND = (1<<22), + NES_CQP_QP_VWQ_USE_LMI = (1<<23), + NES_CQP_QP_IWARP_STATE_IDLE = (1<netdev */ + u8 perfect_filter_index; + u8 nic_index; + u8 qp_nic_index[4]; + u8 next_qp_nic_index; + u8 of_device_registered; + u8 rdma_enabled; + u32 lro_max_aggr; + struct net_lro_mgr lro_mgr; + struct net_lro_desc lro_desc[NES_MAX_LRO_DESCRIPTORS]; + struct timer_list event_timer; + enum ib_event_type delayed_event; + enum ib_event_type last_dispatched_event; + spinlock_t port_ibevent_lock; + u32 mgt_mem_size; + void *mgt_vbase; + dma_addr_t mgt_pbase; + struct nes_vnic_mgt *mgtvnic[NES_MGT_QP_COUNT]; + struct task_struct *mgt_thread; + wait_queue_head_t mgt_wait_queue; + struct sk_buff_head mgt_skb_list; + +}; + +struct nes_ib_device { + struct ib_device ibdev; + struct nes_vnic *nesvnic; + + /* Virtual RNIC Limits */ + u32 max_mr; + u32 max_qp; + u32 max_cq; + u32 max_pd; + u32 num_mr; + u32 num_qp; + u32 num_cq; + u32 num_pd; +}; + +enum nes_hdrct_flags { + DDP_LEN_FLAG = 0x80, + DDP_HDR_FLAG = 0x40, + RDMA_HDR_FLAG = 0x20 +}; + +enum nes_term_layers { + LAYER_RDMA = 0, + LAYER_DDP = 1, + LAYER_MPA = 2 +}; + +enum nes_term_error_types { + RDMAP_CATASTROPHIC = 0, + RDMAP_REMOTE_PROT = 1, + RDMAP_REMOTE_OP = 2, + DDP_CATASTROPHIC = 0, + DDP_TAGGED_BUFFER = 1, + DDP_UNTAGGED_BUFFER = 2, + DDP_LLP = 3 +}; + +enum nes_term_rdma_errors { + RDMAP_INV_STAG = 0x00, + RDMAP_INV_BOUNDS = 0x01, + RDMAP_ACCESS = 0x02, + RDMAP_UNASSOC_STAG = 0x03, + RDMAP_TO_WRAP = 0x04, + RDMAP_INV_RDMAP_VER = 0x05, + RDMAP_UNEXPECTED_OP = 0x06, + RDMAP_CATASTROPHIC_LOCAL = 0x07, + RDMAP_CATASTROPHIC_GLOBAL = 0x08, + RDMAP_CANT_INV_STAG = 0x09, + RDMAP_UNSPECIFIED = 0xff +}; + +enum nes_term_ddp_errors { + DDP_CATASTROPHIC_LOCAL = 0x00, + DDP_TAGGED_INV_STAG = 0x00, + DDP_TAGGED_BOUNDS = 0x01, + DDP_TAGGED_UNASSOC_STAG = 0x02, + DDP_TAGGED_TO_WRAP = 0x03, + DDP_TAGGED_INV_DDP_VER = 0x04, + DDP_UNTAGGED_INV_QN = 0x01, + DDP_UNTAGGED_INV_MSN_NO_BUF = 0x02, + DDP_UNTAGGED_INV_MSN_RANGE = 0x03, + DDP_UNTAGGED_INV_MO = 0x04, + DDP_UNTAGGED_INV_TOO_LONG = 0x05, + DDP_UNTAGGED_INV_DDP_VER = 0x06 +}; + +enum nes_term_mpa_errors { + MPA_CLOSED = 0x01, + MPA_CRC = 0x02, + MPA_MARKER = 0x03, + MPA_REQ_RSP = 0x04, +}; + +struct nes_terminate_hdr { + u8 layer_etype; + u8 error_code; + u8 hdrct; + u8 rsvd; +}; + +/* Used to determine how to fill in terminate error codes */ +#define IWARP_OPCODE_WRITE 0 +#define IWARP_OPCODE_READREQ 1 +#define IWARP_OPCODE_READRSP 2 +#define IWARP_OPCODE_SEND 3 +#define IWARP_OPCODE_SEND_INV 4 +#define IWARP_OPCODE_SEND_SE 5 +#define IWARP_OPCODE_SEND_SE_INV 6 +#define IWARP_OPCODE_TERM 7 + +/* These values are used only during terminate processing */ +#define TERM_DDP_LEN_TAGGED 14 +#define TERM_DDP_LEN_UNTAGGED 18 +#define TERM_RDMA_LEN 28 +#define RDMA_OPCODE_MASK 0x0f +#define RDMA_READ_REQ_OPCODE 1 +#define BAD_FRAME_OFFSET 64 +#define CQE_MAJOR_DRV 0x8000 + +/* Used for link status recheck after interrupt processing */ +#define NES_LINK_RECHECK_DELAY msecs_to_jiffies(50) +#define NES_LINK_RECHECK_MAX 60 + +#endif /* __NES_HW_H */ diff --git a/drivers/infiniband/hw/nes/nes_mgt.c b/drivers/infiniband/hw/nes/nes_mgt.c new file mode 100644 index 0000000..4166452 --- /dev/null +++ b/drivers/infiniband/hw/nes/nes_mgt.c @@ -0,0 +1,1160 @@ +/* + * Copyright (c) 2006 - 2011 Intel-NE, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include +#include +#include +#include +#include +#include +#include "nes.h" +#include "nes_mgt.h" + +atomic_t pau_qps_created; +atomic_t pau_qps_destroyed; + +static void nes_replenish_mgt_rq(struct nes_vnic_mgt *mgtvnic) +{ + unsigned long flags; + dma_addr_t bus_address; + struct sk_buff *skb; + struct nes_hw_nic_rq_wqe *nic_rqe; + struct nes_hw_mgt *nesmgt; + struct nes_device *nesdev; + struct nes_rskb_cb *cb; + u32 rx_wqes_posted = 0; + + nesmgt = &mgtvnic->mgt; + nesdev = mgtvnic->nesvnic->nesdev; + spin_lock_irqsave(&nesmgt->rq_lock, flags); + if (nesmgt->replenishing_rq != 0) { + if (((nesmgt->rq_size - 1) == atomic_read(&mgtvnic->rx_skbs_needed)) && + (atomic_read(&mgtvnic->rx_skb_timer_running) == 0)) { + atomic_set(&mgtvnic->rx_skb_timer_running, 1); + spin_unlock_irqrestore(&nesmgt->rq_lock, flags); + mgtvnic->rq_wqes_timer.expires = jiffies + (HZ / 2); /* 1/2 second */ + add_timer(&mgtvnic->rq_wqes_timer); + } else { + spin_unlock_irqrestore(&nesmgt->rq_lock, flags); + } + return; + } + nesmgt->replenishing_rq = 1; + spin_unlock_irqrestore(&nesmgt->rq_lock, flags); + do { + skb = dev_alloc_skb(mgtvnic->nesvnic->max_frame_size); + if (skb) { + skb->dev = mgtvnic->nesvnic->netdev; + + bus_address = pci_map_single(nesdev->pcidev, + skb->data, mgtvnic->nesvnic->max_frame_size, PCI_DMA_FROMDEVICE); + cb = (struct nes_rskb_cb *)&skb->cb[0]; + cb->busaddr = bus_address; + cb->maplen = mgtvnic->nesvnic->max_frame_size; + + nic_rqe = &nesmgt->rq_vbase[mgtvnic->mgt.rq_head]; + nic_rqe->wqe_words[NES_NIC_RQ_WQE_LENGTH_1_0_IDX] = + cpu_to_le32(mgtvnic->nesvnic->max_frame_size); + nic_rqe->wqe_words[NES_NIC_RQ_WQE_LENGTH_3_2_IDX] = 0; + nic_rqe->wqe_words[NES_NIC_RQ_WQE_FRAG0_LOW_IDX] = + cpu_to_le32((u32)bus_address); + nic_rqe->wqe_words[NES_NIC_RQ_WQE_FRAG0_HIGH_IDX] = + cpu_to_le32((u32)((u64)bus_address >> 32)); + nesmgt->rx_skb[nesmgt->rq_head] = skb; + nesmgt->rq_head++; + nesmgt->rq_head &= nesmgt->rq_size - 1; + atomic_dec(&mgtvnic->rx_skbs_needed); + barrier(); + if (++rx_wqes_posted == 255) { + nes_write32(nesdev->regs + NES_WQE_ALLOC, (rx_wqes_posted << 24) | nesmgt->qp_id); + rx_wqes_posted = 0; + } + } else { + spin_lock_irqsave(&nesmgt->rq_lock, flags); + if (((nesmgt->rq_size - 1) == atomic_read(&mgtvnic->rx_skbs_needed)) && + (atomic_read(&mgtvnic->rx_skb_timer_running) == 0)) { + atomic_set(&mgtvnic->rx_skb_timer_running, 1); + spin_unlock_irqrestore(&nesmgt->rq_lock, flags); + mgtvnic->rq_wqes_timer.expires = jiffies + (HZ / 2); /* 1/2 second */ + add_timer(&mgtvnic->rq_wqes_timer); + } else { + spin_unlock_irqrestore(&nesmgt->rq_lock, flags); + } + break; + } + } while (atomic_read(&mgtvnic->rx_skbs_needed)); + barrier(); + if (rx_wqes_posted) + nes_write32(nesdev->regs + NES_WQE_ALLOC, (rx_wqes_posted << 24) | nesmgt->qp_id); + nesmgt->replenishing_rq = 0; +} + +/** + * nes_mgt_rq_wqes_timeout + */ +static void nes_mgt_rq_wqes_timeout(unsigned long parm) +{ + struct nes_vnic_mgt *mgtvnic = (struct nes_vnic_mgt *)parm; + + atomic_set(&mgtvnic->rx_skb_timer_running, 0); + if (atomic_read(&mgtvnic->rx_skbs_needed)) + nes_replenish_mgt_rq(mgtvnic); +} + +/** + * nes_mgt_free_skb - unmap and free skb + */ +static void nes_mgt_free_skb(struct nes_device *nesdev, struct sk_buff *skb, u32 dir) +{ + struct nes_rskb_cb *cb; + + cb = (struct nes_rskb_cb *)&skb->cb[0]; + pci_unmap_single(nesdev->pcidev, cb->busaddr, cb->maplen, dir); + cb->busaddr = 0; + dev_kfree_skb_any(skb); +} + +/** + * nes_download_callback - handle download completions + */ +static void nes_download_callback(struct nes_device *nesdev, struct nes_cqp_request *cqp_request) +{ + struct pau_fpdu_info *fpdu_info = cqp_request->cqp_callback_pointer; + struct nes_qp *nesqp = fpdu_info->nesqp; + struct sk_buff *skb; + int i; + + for (i = 0; i < fpdu_info->frag_cnt; i++) { + skb = fpdu_info->frags[i].skb; + if (fpdu_info->frags[i].cmplt) { + nes_mgt_free_skb(nesdev, skb, PCI_DMA_TODEVICE); + nes_rem_ref_cm_node(nesqp->cm_node); + } + } + + if (fpdu_info->hdr_vbase) + pci_free_consistent(nesdev->pcidev, fpdu_info->hdr_len, + fpdu_info->hdr_vbase, fpdu_info->hdr_pbase); + kfree(fpdu_info); +} + +/** + * nes_get_seq - Get the seq, ack_seq and window from the packet + */ +static u32 nes_get_seq(struct sk_buff *skb, u32 *ack, u16 *wnd, u32 *fin_rcvd, u32 *rst_rcvd) +{ + struct nes_rskb_cb *cb = (struct nes_rskb_cb *)&skb->cb[0]; + struct iphdr *iph = (struct iphdr *)(cb->data_start + ETH_HLEN); + struct tcphdr *tcph = (struct tcphdr *)(((char *)iph) + (4 * iph->ihl)); + + *ack = be32_to_cpu(tcph->ack_seq); + *wnd = be16_to_cpu(tcph->window); + *fin_rcvd = tcph->fin; + *rst_rcvd = tcph->rst; + return be32_to_cpu(tcph->seq); +} + +/** + * nes_get_next_skb - Get the next skb based on where current skb is in the queue + */ +static struct sk_buff *nes_get_next_skb(struct nes_device *nesdev, struct nes_qp *nesqp, + struct sk_buff *skb, u32 nextseq, u32 *ack, + u16 *wnd, u32 *fin_rcvd, u32 *rst_rcvd) +{ + u32 seq; + bool processacks; + struct sk_buff *old_skb; + + if (skb) { + /* Continue processing fpdu */ + if (skb->next == (struct sk_buff *)&nesqp->pau_list) + goto out; + skb = skb->next; + processacks = false; + } else { + /* Starting a new one */ + if (skb_queue_empty(&nesqp->pau_list)) + goto out; + skb = skb_peek(&nesqp->pau_list); + processacks = true; + } + + while (1) { + if (skb_queue_empty(&nesqp->pau_list)) + goto out; + + seq = nes_get_seq(skb, ack, wnd, fin_rcvd, rst_rcvd); + if (seq == nextseq) { + if (skb->len || processacks) + break; + } else if (after(seq, nextseq)) { + goto out; + } + + old_skb = skb; + skb = skb->next; + skb_unlink(old_skb, &nesqp->pau_list); + nes_mgt_free_skb(nesdev, old_skb, PCI_DMA_TODEVICE); + nes_rem_ref_cm_node(nesqp->cm_node); + if (skb == (struct sk_buff *)&nesqp->pau_list) + goto out; + } + return skb; + +out: + return NULL; +} + +/** + * get_fpdu_info - Find the next complete fpdu and return its fragments. + */ +static int get_fpdu_info(struct nes_device *nesdev, struct nes_qp *nesqp, + struct pau_fpdu_info **pau_fpdu_info) +{ + struct sk_buff *skb; + struct iphdr *iph; + struct tcphdr *tcph; + struct nes_rskb_cb *cb; + struct pau_fpdu_info *fpdu_info = NULL; + struct pau_fpdu_frag frags[MAX_FPDU_FRAGS]; + u32 fpdu_len = 0; + u32 tmp_len; + int frag_cnt = 0; + u32 tot_len; + u32 frag_tot; + u32 ack; + u32 fin_rcvd; + u32 rst_rcvd; + u16 wnd; + int i; + int rc = 0; + + *pau_fpdu_info = NULL; + + skb = nes_get_next_skb(nesdev, nesqp, NULL, nesqp->pau_rcv_nxt, &ack, &wnd, &fin_rcvd, &rst_rcvd); + if (!skb) + goto out; + + cb = (struct nes_rskb_cb *)&skb->cb[0]; + if (skb->len) { + fpdu_len = be16_to_cpu(*(__be16 *) skb->data) + MPA_FRAMING; + fpdu_len = (fpdu_len + 3) & 0xfffffffc; + tmp_len = fpdu_len; + + /* See if we have all of the fpdu */ + frag_tot = 0; + memset(&frags, 0, sizeof frags); + for (i = 0; i < MAX_FPDU_FRAGS; i++) { + frags[i].physaddr = cb->busaddr; + frags[i].physaddr += skb->data - cb->data_start; + frags[i].frag_len = min(tmp_len, skb->len); + frags[i].skb = skb; + frags[i].cmplt = (skb->len == frags[i].frag_len); + frag_tot += frags[i].frag_len; + frag_cnt++; + + tmp_len -= frags[i].frag_len; + if (tmp_len == 0) + break; + + skb = nes_get_next_skb(nesdev, nesqp, skb, + nesqp->pau_rcv_nxt + frag_tot, &ack, &wnd, &fin_rcvd, &rst_rcvd); + if (!skb) + goto out; + if (rst_rcvd) { + /* rst received in the middle of fpdu */ + for (; i >= 0; i--) { + skb_unlink(frags[i].skb, &nesqp->pau_list); + nes_mgt_free_skb(nesdev, frags[i].skb, PCI_DMA_TODEVICE); + } + cb = (struct nes_rskb_cb *)&skb->cb[0]; + frags[0].physaddr = cb->busaddr; + frags[0].physaddr += skb->data - cb->data_start; + frags[0].frag_len = skb->len; + frags[0].skb = skb; + frags[0].cmplt = true; + frag_cnt = 1; + break; + } + + cb = (struct nes_rskb_cb *)&skb->cb[0]; + } + } else { + /* no data */ + frags[0].physaddr = cb->busaddr; + frags[0].frag_len = 0; + frags[0].skb = skb; + frags[0].cmplt = true; + frag_cnt = 1; + } + + /* Found one */ + fpdu_info = kzalloc(sizeof(*fpdu_info), GFP_ATOMIC); + if (fpdu_info == NULL) { + nes_debug(NES_DBG_PAU, "Failed to alloc a fpdu_info.\n"); + rc = -ENOMEM; + goto out; + } + + fpdu_info->cqp_request = nes_get_cqp_request(nesdev); + if (fpdu_info->cqp_request == NULL) { + nes_debug(NES_DBG_PAU, "Failed to get a cqp_request.\n"); + rc = -ENOMEM; + goto out; + } + + cb = (struct nes_rskb_cb *)&frags[0].skb->cb[0]; + iph = (struct iphdr *)(cb->data_start + ETH_HLEN); + tcph = (struct tcphdr *)(((char *)iph) + (4 * iph->ihl)); + fpdu_info->hdr_len = (((unsigned char *)tcph) + 4 * (tcph->doff)) - cb->data_start; + fpdu_info->data_len = fpdu_len; + tot_len = fpdu_info->hdr_len + fpdu_len - ETH_HLEN; + + if (frags[0].cmplt) { + fpdu_info->hdr_pbase = cb->busaddr; + fpdu_info->hdr_vbase = NULL; + } else { + fpdu_info->hdr_vbase = pci_alloc_consistent(nesdev->pcidev, + fpdu_info->hdr_len, &fpdu_info->hdr_pbase); + if (!fpdu_info->hdr_vbase) { + nes_debug(NES_DBG_PAU, "Unable to allocate memory for pau first frag\n"); + rc = -ENOMEM; + goto out; + } + + /* Copy hdrs, adjusting len and seqnum */ + memcpy(fpdu_info->hdr_vbase, cb->data_start, fpdu_info->hdr_len); + iph = (struct iphdr *)(fpdu_info->hdr_vbase + ETH_HLEN); + tcph = (struct tcphdr *)(((char *)iph) + (4 * iph->ihl)); + } + + iph->tot_len = cpu_to_be16(tot_len); + iph->saddr = cpu_to_be32(0x7f000001); + + tcph->seq = cpu_to_be32(nesqp->pau_rcv_nxt); + tcph->ack_seq = cpu_to_be32(ack); + tcph->window = cpu_to_be16(wnd); + + nesqp->pau_rcv_nxt += fpdu_len + fin_rcvd; + + memcpy(fpdu_info->frags, frags, sizeof(fpdu_info->frags)); + fpdu_info->frag_cnt = frag_cnt; + fpdu_info->nesqp = nesqp; + *pau_fpdu_info = fpdu_info; + + /* Update skb's for next pass */ + for (i = 0; i < frag_cnt; i++) { + cb = (struct nes_rskb_cb *)&frags[i].skb->cb[0]; + skb_pull(frags[i].skb, frags[i].frag_len); + + if (frags[i].skb->len == 0) { + /* Pull skb off the list - it will be freed in the callback */ + if (!skb_queue_empty(&nesqp->pau_list)) + skb_unlink(frags[i].skb, &nesqp->pau_list); + } else { + /* Last skb still has data so update the seq */ + iph = (struct iphdr *)(cb->data_start + ETH_HLEN); + tcph = (struct tcphdr *)(((char *)iph) + (4 * iph->ihl)); + tcph->seq = cpu_to_be32(nesqp->pau_rcv_nxt); + } + } + +out: + if (rc) { + if (fpdu_info) { + if (fpdu_info->cqp_request) + nes_put_cqp_request(nesdev, fpdu_info->cqp_request); + kfree(fpdu_info); + } + } + return rc; +} + +/** + * forward_fpdu - send complete fpdus, one at a time + */ +static int forward_fpdus(struct nes_vnic *nesvnic, struct nes_qp *nesqp) +{ + struct nes_device *nesdev = nesvnic->nesdev; + struct pau_fpdu_info *fpdu_info; + struct nes_hw_cqp_wqe *cqp_wqe; + struct nes_cqp_request *cqp_request; + unsigned long flags; + u64 u64tmp; + u32 u32tmp; + int rc; + + while (1) { + spin_lock_irqsave(&nesqp->pau_lock, flags); + rc = get_fpdu_info(nesdev, nesqp, &fpdu_info); + if (rc || (fpdu_info == NULL)) { + spin_unlock_irqrestore(&nesqp->pau_lock, flags); + return rc; + } + + cqp_request = fpdu_info->cqp_request; + cqp_wqe = &cqp_request->cqp_wqe; + nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_DL_OPCODE_IDX, + NES_CQP_DOWNLOAD_SEGMENT | + (((u32)nesvnic->logical_port) << NES_CQP_OP_LOGICAL_PORT_SHIFT)); + + u32tmp = fpdu_info->hdr_len << 16; + u32tmp |= fpdu_info->hdr_len + (u32)fpdu_info->data_len; + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_DL_LENGTH_0_TOTAL_IDX, + u32tmp); + + u32tmp = (fpdu_info->frags[1].frag_len << 16) | fpdu_info->frags[0].frag_len; + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_LENGTH_2_1_IDX, + u32tmp); + + u32tmp = (fpdu_info->frags[3].frag_len << 16) | fpdu_info->frags[2].frag_len; + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_LENGTH_4_3_IDX, + u32tmp); + + u64tmp = (u64)fpdu_info->hdr_pbase; + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_FRAG0_LOW_IDX, + lower_32_bits(u64tmp)); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_FRAG0_HIGH_IDX, + upper_32_bits(u64tmp)); + + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_FRAG1_LOW_IDX, + lower_32_bits(fpdu_info->frags[0].physaddr)); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_FRAG1_HIGH_IDX, + upper_32_bits(fpdu_info->frags[0].physaddr)); + + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_FRAG2_LOW_IDX, + lower_32_bits(fpdu_info->frags[1].physaddr)); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_FRAG2_HIGH_IDX, + upper_32_bits(fpdu_info->frags[1].physaddr)); + + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_FRAG3_LOW_IDX, + lower_32_bits(fpdu_info->frags[2].physaddr)); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_FRAG3_HIGH_IDX, + upper_32_bits(fpdu_info->frags[2].physaddr)); + + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_FRAG4_LOW_IDX, + lower_32_bits(fpdu_info->frags[3].physaddr)); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_FRAG4_HIGH_IDX, + upper_32_bits(fpdu_info->frags[3].physaddr)); + + cqp_request->cqp_callback_pointer = fpdu_info; + cqp_request->callback = 1; + cqp_request->cqp_callback = nes_download_callback; + + atomic_set(&cqp_request->refcount, 1); + nes_post_cqp_request(nesdev, cqp_request); + spin_unlock_irqrestore(&nesqp->pau_lock, flags); + } + + return 0; +} + +static void process_fpdus(struct nes_vnic *nesvnic, struct nes_qp *nesqp) +{ + int again = 1; + unsigned long flags; + + do { + /* Ignore rc - if it failed, tcp retries will cause it to try again */ + forward_fpdus(nesvnic, nesqp); + + spin_lock_irqsave(&nesqp->pau_lock, flags); + if (nesqp->pau_pending) { + nesqp->pau_pending = 0; + } else { + nesqp->pau_busy = 0; + again = 0; + } + + spin_unlock_irqrestore(&nesqp->pau_lock, flags); + } while (again); +} + +/** + * queue_fpdus - Handle fpdu's that hw passed up to sw + */ +static void queue_fpdus(struct sk_buff *skb, struct nes_vnic *nesvnic, struct nes_qp *nesqp) +{ + struct sk_buff *tmpskb; + struct nes_rskb_cb *cb; + struct iphdr *iph; + struct tcphdr *tcph; + unsigned char *tcph_end; + u32 rcv_nxt; + u32 rcv_wnd; + u32 seqnum; + u32 len; + bool process_it = false; + unsigned long flags; + + /* Move data ptr to after tcp header */ + iph = (struct iphdr *)skb->data; + tcph = (struct tcphdr *)(((char *)iph) + (4 * iph->ihl)); + seqnum = be32_to_cpu(tcph->seq); + tcph_end = (((char *)tcph) + (4 * tcph->doff)); + + len = be16_to_cpu(iph->tot_len); + if (skb->len > len) + skb_trim(skb, len); + skb_pull(skb, tcph_end - skb->data); + + /* Initialize tracking values */ + cb = (struct nes_rskb_cb *)&skb->cb[0]; + cb->seqnum = seqnum; + + /* Make sure data is in the receive window */ + rcv_nxt = nesqp->pau_rcv_nxt; + rcv_wnd = le32_to_cpu(nesqp->nesqp_context->rcv_wnd); + if (!between(seqnum, rcv_nxt, (rcv_nxt + rcv_wnd))) { + nes_mgt_free_skb(nesvnic->nesdev, skb, PCI_DMA_TODEVICE); + nes_rem_ref_cm_node(nesqp->cm_node); + return; + } + + spin_lock_irqsave(&nesqp->pau_lock, flags); + + if (nesqp->pau_busy) + nesqp->pau_pending = 1; + else + nesqp->pau_busy = 1; + + /* Queue skb by sequence number */ + if (skb_queue_len(&nesqp->pau_list) == 0) { + skb_queue_head(&nesqp->pau_list, skb); + } else { + tmpskb = nesqp->pau_list.next; + while (tmpskb != (struct sk_buff *)&nesqp->pau_list) { + cb = (struct nes_rskb_cb *)&tmpskb->cb[0]; + if (before(seqnum, cb->seqnum)) + break; + tmpskb = tmpskb->next; + } + skb_insert(tmpskb, skb, &nesqp->pau_list); + } + if (nesqp->pau_state == PAU_READY) + process_it = true; + spin_unlock_irqrestore(&nesqp->pau_lock, flags); + + if (process_it) + process_fpdus(nesvnic, nesqp); + + return; +} + +/** + * mgt_thread - Handle mgt skbs in a safe context + */ +static int mgt_thread(void *context) +{ + struct nes_vnic *nesvnic = context; + struct sk_buff *skb; + struct nes_rskb_cb *cb; + + while (!kthread_should_stop()) { + wait_event_interruptible(nesvnic->mgt_wait_queue, + skb_queue_len(&nesvnic->mgt_skb_list) || kthread_should_stop()); + while ((skb_queue_len(&nesvnic->mgt_skb_list)) && !kthread_should_stop()) { + skb = skb_dequeue(&nesvnic->mgt_skb_list); + cb = (struct nes_rskb_cb *)&skb->cb[0]; + cb->data_start = skb->data - ETH_HLEN; + cb->busaddr = pci_map_single(nesvnic->nesdev->pcidev, cb->data_start, + nesvnic->max_frame_size, PCI_DMA_TODEVICE); + queue_fpdus(skb, nesvnic, cb->nesqp); + } + } + + /* Closing down so delete any entries on the queue */ + while (skb_queue_len(&nesvnic->mgt_skb_list)) { + skb = skb_dequeue(&nesvnic->mgt_skb_list); + cb = (struct nes_rskb_cb *)&skb->cb[0]; + nes_rem_ref_cm_node(cb->nesqp->cm_node); + dev_kfree_skb_any(skb); + } + return 0; +} + +/** + * nes_queue_skbs - Queue skb so it can be handled in a thread context + */ +void nes_queue_mgt_skbs(struct sk_buff *skb, struct nes_vnic *nesvnic, struct nes_qp *nesqp) +{ + struct nes_rskb_cb *cb; + + cb = (struct nes_rskb_cb *)&skb->cb[0]; + cb->nesqp = nesqp; + skb_queue_tail(&nesvnic->mgt_skb_list, skb); + wake_up_interruptible(&nesvnic->mgt_wait_queue); +} + +void nes_destroy_pau_qp(struct nes_device *nesdev, struct nes_qp *nesqp) +{ + struct sk_buff *skb; + unsigned long flags; + atomic_inc(&pau_qps_destroyed); + + /* Free packets that have not yet been forwarded */ + /* Lock is acquired by skb_dequeue when removing the skb */ + spin_lock_irqsave(&nesqp->pau_lock, flags); + while (skb_queue_len(&nesqp->pau_list)) { + skb = skb_dequeue(&nesqp->pau_list); + nes_mgt_free_skb(nesdev, skb, PCI_DMA_TODEVICE); + nes_rem_ref_cm_node(nesqp->cm_node); + } + spin_unlock_irqrestore(&nesqp->pau_lock, flags); +} + +static void nes_chg_qh_handler(struct nes_device *nesdev, struct nes_cqp_request *cqp_request) +{ + struct pau_qh_chg *qh_chg = cqp_request->cqp_callback_pointer; + struct nes_cqp_request *new_request; + struct nes_hw_cqp_wqe *cqp_wqe; + struct nes_adapter *nesadapter; + struct nes_qp *nesqp; + struct nes_v4_quad nes_quad; + u32 crc_value; + u64 u64temp; + + nesadapter = nesdev->nesadapter; + nesqp = qh_chg->nesqp; + + /* Should we handle the bad completion */ + if (cqp_request->major_code) + WARN(1, PFX "Invalid cqp_request major_code=0x%x\n", + cqp_request->major_code); + + switch (nesqp->pau_state) { + case PAU_DEL_QH: + /* Old hash code deleted, now set the new one */ + nesqp->pau_state = PAU_ADD_LB_QH; + new_request = nes_get_cqp_request(nesdev); + if (new_request == NULL) { + nes_debug(NES_DBG_PAU, "Failed to get a new_request.\n"); + WARN_ON(1); + return; + } + + memset(&nes_quad, 0, sizeof(nes_quad)); + nes_quad.DstIpAdrIndex = + cpu_to_le32((u32)PCI_FUNC(nesdev->pcidev->devfn) << 24); + nes_quad.SrcIpadr = cpu_to_be32(0x7f000001); + nes_quad.TcpPorts[0] = swab16(nesqp->nesqp_context->tcpPorts[1]); + nes_quad.TcpPorts[1] = swab16(nesqp->nesqp_context->tcpPorts[0]); + + /* Produce hash key */ + crc_value = get_crc_value(&nes_quad); + nesqp->hte_index = cpu_to_be32(crc_value ^ 0xffffffff); + nes_debug(NES_DBG_PAU, "new HTE Index = 0x%08X, CRC = 0x%08X\n", + nesqp->hte_index, nesqp->hte_index & nesadapter->hte_index_mask); + + nesqp->hte_index &= nesadapter->hte_index_mask; + nesqp->nesqp_context->hte_index = cpu_to_le32(nesqp->hte_index); + nesqp->nesqp_context->ip0 = cpu_to_le32(0x7f000001); + nesqp->nesqp_context->rcv_nxt = cpu_to_le32(nesqp->pau_rcv_nxt); + + cqp_wqe = &new_request->cqp_wqe; + nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + set_wqe_32bit_value(cqp_wqe->wqe_words, + NES_CQP_WQE_OPCODE_IDX, NES_CQP_MANAGE_QUAD_HASH | + NES_CQP_QP_TYPE_IWARP | NES_CQP_QP_CONTEXT_VALID | NES_CQP_QP_IWARP_STATE_RTS); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX, nesqp->hwqp.qp_id); + u64temp = (u64)nesqp->nesqp_context_pbase; + set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_QP_WQE_CONTEXT_LOW_IDX, u64temp); + + nes_debug(NES_DBG_PAU, "Waiting for CQP completion for adding the quad hash.\n"); + + new_request->cqp_callback_pointer = qh_chg; + new_request->callback = 1; + new_request->cqp_callback = nes_chg_qh_handler; + atomic_set(&new_request->refcount, 1); + nes_post_cqp_request(nesdev, new_request); + break; + + case PAU_ADD_LB_QH: + /* Start processing the queued fpdu's */ + nesqp->pau_state = PAU_READY; + process_fpdus(qh_chg->nesvnic, qh_chg->nesqp); + kfree(qh_chg); + break; + } +} + +/** + * nes_change_quad_hash + */ +static int nes_change_quad_hash(struct nes_device *nesdev, + struct nes_vnic *nesvnic, struct nes_qp *nesqp) +{ + struct nes_cqp_request *cqp_request = NULL; + struct pau_qh_chg *qh_chg = NULL; + u64 u64temp; + struct nes_hw_cqp_wqe *cqp_wqe; + int ret = 0; + + cqp_request = nes_get_cqp_request(nesdev); + if (cqp_request == NULL) { + nes_debug(NES_DBG_PAU, "Failed to get a cqp_request.\n"); + ret = -ENOMEM; + goto chg_qh_err; + } + + qh_chg = kmalloc(sizeof *qh_chg, GFP_ATOMIC); + if (qh_chg == NULL) { + nes_debug(NES_DBG_PAU, "Failed to get a cqp_request.\n"); + ret = -ENOMEM; + goto chg_qh_err; + } + qh_chg->nesdev = nesdev; + qh_chg->nesvnic = nesvnic; + qh_chg->nesqp = nesqp; + nesqp->pau_state = PAU_DEL_QH; + + cqp_wqe = &cqp_request->cqp_wqe; + nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + set_wqe_32bit_value(cqp_wqe->wqe_words, + NES_CQP_WQE_OPCODE_IDX, NES_CQP_MANAGE_QUAD_HASH | NES_CQP_QP_DEL_HTE | + NES_CQP_QP_TYPE_IWARP | NES_CQP_QP_CONTEXT_VALID | NES_CQP_QP_IWARP_STATE_RTS); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX, nesqp->hwqp.qp_id); + u64temp = (u64)nesqp->nesqp_context_pbase; + set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_QP_WQE_CONTEXT_LOW_IDX, u64temp); + + nes_debug(NES_DBG_PAU, "Waiting for CQP completion for deleting the quad hash.\n"); + + cqp_request->cqp_callback_pointer = qh_chg; + cqp_request->callback = 1; + cqp_request->cqp_callback = nes_chg_qh_handler; + atomic_set(&cqp_request->refcount, 1); + nes_post_cqp_request(nesdev, cqp_request); + + return ret; + +chg_qh_err: + kfree(qh_chg); + if (cqp_request) + nes_put_cqp_request(nesdev, cqp_request); + return ret; +} + +/** + * nes_mgt_ce_handler + * This management code deals with any packed and unaligned (pau) fpdu's + * that the hardware cannot handle. + */ +static void nes_mgt_ce_handler(struct nes_device *nesdev, struct nes_hw_nic_cq *cq) +{ + struct nes_vnic_mgt *mgtvnic = container_of(cq, struct nes_vnic_mgt, mgt_cq); + struct nes_adapter *nesadapter = nesdev->nesadapter; + u32 head; + u32 cq_size; + u32 cqe_count = 0; + u32 cqe_misc; + u32 qp_id = 0; + u32 skbs_needed; + unsigned long context; + struct nes_qp *nesqp; + struct sk_buff *rx_skb; + struct nes_rskb_cb *cb; + + head = cq->cq_head; + cq_size = cq->cq_size; + + while (1) { + cqe_misc = le32_to_cpu(cq->cq_vbase[head].cqe_words[NES_NIC_CQE_MISC_IDX]); + if (!(cqe_misc & NES_NIC_CQE_VALID)) + break; + + nesqp = NULL; + if (cqe_misc & NES_NIC_CQE_ACCQP_VALID) { + qp_id = le32_to_cpu(cq->cq_vbase[head].cqe_words[NES_NIC_CQE_ACCQP_ID_IDX]); + qp_id &= 0x001fffff; + if (qp_id < nesadapter->max_qp) { + context = (unsigned long)nesadapter->qp_table[qp_id - NES_FIRST_QPN]; + nesqp = (struct nes_qp *)context; + } + } + + if (nesqp) { + if (nesqp->pau_mode == false) { + nesqp->pau_mode = true; /* First time for this qp */ + nesqp->pau_rcv_nxt = le32_to_cpu( + cq->cq_vbase[head].cqe_words[NES_NIC_CQE_HASH_RCVNXT]); + skb_queue_head_init(&nesqp->pau_list); + spin_lock_init(&nesqp->pau_lock); + atomic_inc(&pau_qps_created); + nes_change_quad_hash(nesdev, mgtvnic->nesvnic, nesqp); + } + + rx_skb = mgtvnic->mgt.rx_skb[mgtvnic->mgt.rq_tail]; + rx_skb->len = 0; + skb_put(rx_skb, cqe_misc & 0x0000ffff); + rx_skb->protocol = eth_type_trans(rx_skb, mgtvnic->nesvnic->netdev); + cb = (struct nes_rskb_cb *)&rx_skb->cb[0]; + pci_unmap_single(nesdev->pcidev, cb->busaddr, cb->maplen, PCI_DMA_FROMDEVICE); + cb->busaddr = 0; + mgtvnic->mgt.rq_tail++; + mgtvnic->mgt.rq_tail &= mgtvnic->mgt.rq_size - 1; + + nes_add_ref_cm_node(nesqp->cm_node); + nes_queue_mgt_skbs(rx_skb, mgtvnic->nesvnic, nesqp); + } else { + printk(KERN_ERR PFX "Invalid QP %d for packed/unaligned handling\n", qp_id); + } + + cq->cq_vbase[head].cqe_words[NES_NIC_CQE_MISC_IDX] = 0; + cqe_count++; + if (++head >= cq_size) + head = 0; + + if (cqe_count == 255) { + /* Replenish mgt CQ */ + nes_write32(nesdev->regs + NES_CQE_ALLOC, cq->cq_number | (cqe_count << 16)); + nesdev->currcq_count += cqe_count; + cqe_count = 0; + } + + skbs_needed = atomic_inc_return(&mgtvnic->rx_skbs_needed); + if (skbs_needed > (mgtvnic->mgt.rq_size >> 1)) + nes_replenish_mgt_rq(mgtvnic); + } + + cq->cq_head = head; + nes_write32(nesdev->regs + NES_CQE_ALLOC, NES_CQE_ALLOC_NOTIFY_NEXT | + cq->cq_number | (cqe_count << 16)); + nes_read32(nesdev->regs + NES_CQE_ALLOC); + nesdev->currcq_count += cqe_count; +} + +/** + * nes_init_mgt_qp + */ +int nes_init_mgt_qp(struct nes_device *nesdev, struct net_device *netdev, struct nes_vnic *nesvnic) +{ + struct nes_vnic_mgt *mgtvnic; + u32 counter; + void *vmem; + dma_addr_t pmem; + struct nes_hw_cqp_wqe *cqp_wqe; + u32 cqp_head; + unsigned long flags; + struct nes_hw_nic_qp_context *mgt_context; + u64 u64temp; + struct nes_hw_nic_rq_wqe *mgt_rqe; + struct sk_buff *skb; + u32 wqe_count; + struct nes_rskb_cb *cb; + u32 mgt_mem_size; + void *mgt_vbase; + dma_addr_t mgt_pbase; + int i; + int ret; + + /* Allocate space the all mgt QPs once */ + mgtvnic = kzalloc(NES_MGT_QP_COUNT * sizeof(struct nes_vnic_mgt), GFP_KERNEL); + if (mgtvnic == NULL) { + nes_debug(NES_DBG_INIT, "Unable to allocate memory for mgt structure\n"); + return -ENOMEM; + } + + /* Allocate fragment, RQ, and CQ; Reuse CEQ based on the PCI function */ + /* We are not sending from this NIC so sq is not allocated */ + mgt_mem_size = 256 + + (NES_MGT_WQ_COUNT * sizeof(struct nes_hw_nic_rq_wqe)) + + (NES_MGT_WQ_COUNT * sizeof(struct nes_hw_nic_cqe)) + + sizeof(struct nes_hw_nic_qp_context); + mgt_mem_size = (mgt_mem_size + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1); + mgt_vbase = pci_alloc_consistent(nesdev->pcidev, NES_MGT_QP_COUNT * mgt_mem_size, &mgt_pbase); + if (!mgt_vbase) { + kfree(mgtvnic); + nes_debug(NES_DBG_INIT, "Unable to allocate memory for mgt host descriptor rings\n"); + return -ENOMEM; + } + + nesvnic->mgt_mem_size = NES_MGT_QP_COUNT * mgt_mem_size; + nesvnic->mgt_vbase = mgt_vbase; + nesvnic->mgt_pbase = mgt_pbase; + + skb_queue_head_init(&nesvnic->mgt_skb_list); + init_waitqueue_head(&nesvnic->mgt_wait_queue); + nesvnic->mgt_thread = kthread_run(mgt_thread, nesvnic, "nes_mgt_thread"); + + for (i = 0; i < NES_MGT_QP_COUNT; i++) { + mgtvnic->nesvnic = nesvnic; + mgtvnic->mgt.qp_id = nesdev->mac_index + NES_MGT_QP_OFFSET + i; + memset(mgt_vbase, 0, mgt_mem_size); + nes_debug(NES_DBG_INIT, "Allocated mgt QP structures at %p (phys = %016lX), size = %u.\n", + mgt_vbase, (unsigned long)mgt_pbase, mgt_mem_size); + + vmem = (void *)(((unsigned long)mgt_vbase + (256 - 1)) & + ~(unsigned long)(256 - 1)); + pmem = (dma_addr_t)(((unsigned long long)mgt_pbase + (256 - 1)) & + ~(unsigned long long)(256 - 1)); + + spin_lock_init(&mgtvnic->mgt.rq_lock); + + /* setup the RQ */ + mgtvnic->mgt.rq_vbase = vmem; + mgtvnic->mgt.rq_pbase = pmem; + mgtvnic->mgt.rq_head = 0; + mgtvnic->mgt.rq_tail = 0; + mgtvnic->mgt.rq_size = NES_MGT_WQ_COUNT; + + /* setup the CQ */ + vmem += (NES_MGT_WQ_COUNT * sizeof(struct nes_hw_nic_rq_wqe)); + pmem += (NES_MGT_WQ_COUNT * sizeof(struct nes_hw_nic_rq_wqe)); + + mgtvnic->mgt_cq.cq_number = mgtvnic->mgt.qp_id; + mgtvnic->mgt_cq.cq_vbase = vmem; + mgtvnic->mgt_cq.cq_pbase = pmem; + mgtvnic->mgt_cq.cq_head = 0; + mgtvnic->mgt_cq.cq_size = NES_MGT_WQ_COUNT; + + mgtvnic->mgt_cq.ce_handler = nes_mgt_ce_handler; + + /* Send CreateCQ request to CQP */ + spin_lock_irqsave(&nesdev->cqp.lock, flags); + cqp_head = nesdev->cqp.sq_head; + + cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head]; + nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + + cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] = cpu_to_le32( + NES_CQP_CREATE_CQ | NES_CQP_CQ_CEQ_VALID | + ((u32)mgtvnic->mgt_cq.cq_size << 16)); + cqp_wqe->wqe_words[NES_CQP_WQE_ID_IDX] = cpu_to_le32( + mgtvnic->mgt_cq.cq_number | ((u32)nesdev->ceq_index << 16)); + u64temp = (u64)mgtvnic->mgt_cq.cq_pbase; + set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_CQ_WQE_PBL_LOW_IDX, u64temp); + cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_HIGH_IDX] = 0; + u64temp = (unsigned long)&mgtvnic->mgt_cq; + cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_LOW_IDX] = cpu_to_le32((u32)(u64temp >> 1)); + cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_HIGH_IDX] = + cpu_to_le32(((u32)((u64temp) >> 33)) & 0x7FFFFFFF); + cqp_wqe->wqe_words[NES_CQP_CQ_WQE_DOORBELL_INDEX_HIGH_IDX] = 0; + + if (++cqp_head >= nesdev->cqp.sq_size) + cqp_head = 0; + cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head]; + nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + + /* Send CreateQP request to CQP */ + mgt_context = (void *)(&mgtvnic->mgt_cq.cq_vbase[mgtvnic->mgt_cq.cq_size]); + mgt_context->context_words[NES_NIC_CTX_MISC_IDX] = + cpu_to_le32((u32)NES_MGT_CTX_SIZE | + ((u32)PCI_FUNC(nesdev->pcidev->devfn) << 12)); + nes_debug(NES_DBG_INIT, "RX_WINDOW_BUFFER_PAGE_TABLE_SIZE = 0x%08X, RX_WINDOW_BUFFER_SIZE = 0x%08X\n", + nes_read_indexed(nesdev, NES_IDX_RX_WINDOW_BUFFER_PAGE_TABLE_SIZE), + nes_read_indexed(nesdev, NES_IDX_RX_WINDOW_BUFFER_SIZE)); + if (nes_read_indexed(nesdev, NES_IDX_RX_WINDOW_BUFFER_SIZE) != 0) + mgt_context->context_words[NES_NIC_CTX_MISC_IDX] |= cpu_to_le32(NES_NIC_BACK_STORE); + + u64temp = (u64)mgtvnic->mgt.rq_pbase; + mgt_context->context_words[NES_NIC_CTX_SQ_LOW_IDX] = cpu_to_le32((u32)u64temp); + mgt_context->context_words[NES_NIC_CTX_SQ_HIGH_IDX] = cpu_to_le32((u32)(u64temp >> 32)); + u64temp = (u64)mgtvnic->mgt.rq_pbase; + mgt_context->context_words[NES_NIC_CTX_RQ_LOW_IDX] = cpu_to_le32((u32)u64temp); + mgt_context->context_words[NES_NIC_CTX_RQ_HIGH_IDX] = cpu_to_le32((u32)(u64temp >> 32)); + + cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] = cpu_to_le32(NES_CQP_CREATE_QP | + NES_CQP_QP_TYPE_NIC); + cqp_wqe->wqe_words[NES_CQP_WQE_ID_IDX] = cpu_to_le32(mgtvnic->mgt.qp_id); + u64temp = (u64)mgtvnic->mgt_cq.cq_pbase + + (mgtvnic->mgt_cq.cq_size * sizeof(struct nes_hw_nic_cqe)); + set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_QP_WQE_CONTEXT_LOW_IDX, u64temp); + + if (++cqp_head >= nesdev->cqp.sq_size) + cqp_head = 0; + nesdev->cqp.sq_head = cqp_head; + + barrier(); + + /* Ring doorbell (2 WQEs) */ + nes_write32(nesdev->regs + NES_WQE_ALLOC, 0x02800000 | nesdev->cqp.qp_id); + + spin_unlock_irqrestore(&nesdev->cqp.lock, flags); + nes_debug(NES_DBG_INIT, "Waiting for create MGT QP%u to complete.\n", + mgtvnic->mgt.qp_id); + + ret = wait_event_timeout(nesdev->cqp.waitq, (nesdev->cqp.sq_tail == cqp_head), + NES_EVENT_TIMEOUT); + nes_debug(NES_DBG_INIT, "Create MGT QP%u completed, wait_event_timeout ret = %u.\n", + mgtvnic->mgt.qp_id, ret); + if (!ret) { + nes_debug(NES_DBG_INIT, "MGT QP%u create timeout expired\n", mgtvnic->mgt.qp_id); + if (i == 0) { + pci_free_consistent(nesdev->pcidev, nesvnic->mgt_mem_size, nesvnic->mgt_vbase, + nesvnic->mgt_pbase); + kfree(mgtvnic); + } else { + nes_destroy_mgt(nesvnic); + } + return -EIO; + } + + /* Populate the RQ */ + for (counter = 0; counter < (NES_MGT_WQ_COUNT - 1); counter++) { + skb = dev_alloc_skb(nesvnic->max_frame_size); + if (!skb) { + nes_debug(NES_DBG_INIT, "%s: out of memory for receive skb\n", netdev->name); + return -ENOMEM; + } + + skb->dev = netdev; + + pmem = pci_map_single(nesdev->pcidev, skb->data, + nesvnic->max_frame_size, PCI_DMA_FROMDEVICE); + cb = (struct nes_rskb_cb *)&skb->cb[0]; + cb->busaddr = pmem; + cb->maplen = nesvnic->max_frame_size; + + mgt_rqe = &mgtvnic->mgt.rq_vbase[counter]; + mgt_rqe->wqe_words[NES_NIC_RQ_WQE_LENGTH_1_0_IDX] = cpu_to_le32((u32)nesvnic->max_frame_size); + mgt_rqe->wqe_words[NES_NIC_RQ_WQE_LENGTH_3_2_IDX] = 0; + mgt_rqe->wqe_words[NES_NIC_RQ_WQE_FRAG0_LOW_IDX] = cpu_to_le32((u32)pmem); + mgt_rqe->wqe_words[NES_NIC_RQ_WQE_FRAG0_HIGH_IDX] = cpu_to_le32((u32)((u64)pmem >> 32)); + mgtvnic->mgt.rx_skb[counter] = skb; + } + + init_timer(&mgtvnic->rq_wqes_timer); + mgtvnic->rq_wqes_timer.function = nes_mgt_rq_wqes_timeout; + mgtvnic->rq_wqes_timer.data = (unsigned long)mgtvnic; + + wqe_count = NES_MGT_WQ_COUNT - 1; + mgtvnic->mgt.rq_head = wqe_count; + barrier(); + do { + counter = min(wqe_count, ((u32)255)); + wqe_count -= counter; + nes_write32(nesdev->regs + NES_WQE_ALLOC, (counter << 24) | mgtvnic->mgt.qp_id); + } while (wqe_count); + + nes_write32(nesdev->regs + NES_CQE_ALLOC, NES_CQE_ALLOC_NOTIFY_NEXT | + mgtvnic->mgt_cq.cq_number); + nes_read32(nesdev->regs + NES_CQE_ALLOC); + + mgt_vbase += mgt_mem_size; + mgt_pbase += mgt_mem_size; + nesvnic->mgtvnic[i] = mgtvnic++; + } + return 0; +} + + +void nes_destroy_mgt(struct nes_vnic *nesvnic) +{ + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_vnic_mgt *mgtvnic; + struct nes_vnic_mgt *first_mgtvnic; + unsigned long flags; + struct nes_hw_cqp_wqe *cqp_wqe; + u32 cqp_head; + struct sk_buff *rx_skb; + int i; + int ret; + + kthread_stop(nesvnic->mgt_thread); + + /* Free remaining NIC receive buffers */ + first_mgtvnic = nesvnic->mgtvnic[0]; + for (i = 0; i < NES_MGT_QP_COUNT; i++) { + mgtvnic = nesvnic->mgtvnic[i]; + if (mgtvnic == NULL) + continue; + + while (mgtvnic->mgt.rq_head != mgtvnic->mgt.rq_tail) { + rx_skb = mgtvnic->mgt.rx_skb[mgtvnic->mgt.rq_tail]; + nes_mgt_free_skb(nesdev, rx_skb, PCI_DMA_FROMDEVICE); + mgtvnic->mgt.rq_tail++; + mgtvnic->mgt.rq_tail &= (mgtvnic->mgt.rq_size - 1); + } + + spin_lock_irqsave(&nesdev->cqp.lock, flags); + + /* Destroy NIC QP */ + cqp_head = nesdev->cqp.sq_head; + cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head]; + nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, + (NES_CQP_DESTROY_QP | NES_CQP_QP_TYPE_NIC)); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX, + mgtvnic->mgt.qp_id); + + if (++cqp_head >= nesdev->cqp.sq_size) + cqp_head = 0; + + cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head]; + + /* Destroy NIC CQ */ + nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, + (NES_CQP_DESTROY_CQ | ((u32)mgtvnic->mgt_cq.cq_size << 16))); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX, + (mgtvnic->mgt_cq.cq_number | ((u32)nesdev->ceq_index << 16))); + + if (++cqp_head >= nesdev->cqp.sq_size) + cqp_head = 0; + + nesdev->cqp.sq_head = cqp_head; + barrier(); + + /* Ring doorbell (2 WQEs) */ + nes_write32(nesdev->regs + NES_WQE_ALLOC, 0x02800000 | nesdev->cqp.qp_id); + + spin_unlock_irqrestore(&nesdev->cqp.lock, flags); + nes_debug(NES_DBG_SHUTDOWN, "Waiting for CQP, cqp_head=%u, cqp.sq_head=%u," + " cqp.sq_tail=%u, cqp.sq_size=%u\n", + cqp_head, nesdev->cqp.sq_head, + nesdev->cqp.sq_tail, nesdev->cqp.sq_size); + + ret = wait_event_timeout(nesdev->cqp.waitq, (nesdev->cqp.sq_tail == cqp_head), + NES_EVENT_TIMEOUT); + + nes_debug(NES_DBG_SHUTDOWN, "Destroy MGT QP returned, wait_event_timeout ret = %u, cqp_head=%u," + " cqp.sq_head=%u, cqp.sq_tail=%u\n", + ret, cqp_head, nesdev->cqp.sq_head, nesdev->cqp.sq_tail); + if (!ret) + nes_debug(NES_DBG_SHUTDOWN, "MGT QP%u destroy timeout expired\n", + mgtvnic->mgt.qp_id); + + nesvnic->mgtvnic[i] = NULL; + } + + if (nesvnic->mgt_vbase) { + pci_free_consistent(nesdev->pcidev, nesvnic->mgt_mem_size, nesvnic->mgt_vbase, + nesvnic->mgt_pbase); + nesvnic->mgt_vbase = NULL; + nesvnic->mgt_pbase = 0; + } + + kfree(first_mgtvnic); +} diff --git a/drivers/infiniband/hw/nes/nes_mgt.h b/drivers/infiniband/hw/nes/nes_mgt.h new file mode 100644 index 0000000..4f7f701 --- /dev/null +++ b/drivers/infiniband/hw/nes/nes_mgt.h @@ -0,0 +1,97 @@ +/* +* Copyright (c) 2006 - 2011 Intel-NE, Inc. All rights reserved. +* +* This software is available to you under a choice of one of two +* licenses. You may choose to be licensed under the terms of the GNU +* General Public License (GPL) Version 2, available from the file +* COPYING in the main directory of this source tree, or the +* OpenIB.org BSD license below: +* +* Redistribution and use in source and binary forms, with or +* without modification, are permitted provided that the following +* conditions are met: +* +* - Redistributions of source code must retain the above +* copyright notice, this list of conditions and the following +* disclaimer. +* +* - Redistributions in binary form must reproduce the above +* copyright notice, this list of conditions and the following +* disclaimer in the documentation and/or other materials +* provided with the distribution. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. +*/ + +#ifndef __NES_MGT_H +#define __NES_MGT_H + +#define MPA_FRAMING 6 /* length is 2 bytes, crc is 4 bytes */ + +int nes_init_mgt_qp(struct nes_device *nesdev, struct net_device *netdev, struct nes_vnic *nesvnic); +void nes_queue_mgt_skbs(struct sk_buff *skb, struct nes_vnic *nesvnic, struct nes_qp *nesqp); +void nes_destroy_mgt(struct nes_vnic *nesvnic); +void nes_destroy_pau_qp(struct nes_device *nesdev, struct nes_qp *nesqp); + +struct nes_hw_mgt { + struct nes_hw_nic_rq_wqe *rq_vbase; /* virtual address of rq */ + dma_addr_t rq_pbase; /* PCI memory for host rings */ + struct sk_buff *rx_skb[NES_NIC_WQ_SIZE]; + u16 qp_id; + u16 sq_head; + u16 rq_head; + u16 rq_tail; + u16 rq_size; + u8 replenishing_rq; + u8 reserved; + spinlock_t rq_lock; +}; + +struct nes_vnic_mgt { + struct nes_vnic *nesvnic; + struct nes_hw_mgt mgt; + struct nes_hw_nic_cq mgt_cq; + atomic_t rx_skbs_needed; + struct timer_list rq_wqes_timer; + atomic_t rx_skb_timer_running; +}; + +#define MAX_FPDU_FRAGS 4 +struct pau_fpdu_frag { + struct sk_buff *skb; + u64 physaddr; + u32 frag_len; + bool cmplt; +}; + +struct pau_fpdu_info { + struct nes_qp *nesqp; + struct nes_cqp_request *cqp_request; + void *hdr_vbase; + dma_addr_t hdr_pbase; + int hdr_len; + u16 data_len; + u16 frag_cnt; + struct pau_fpdu_frag frags[MAX_FPDU_FRAGS]; +}; + +enum pau_qh_state { + PAU_DEL_QH, + PAU_ADD_LB_QH, + PAU_READY +}; + +struct pau_qh_chg { + struct nes_device *nesdev; + struct nes_vnic *nesvnic; + struct nes_qp *nesqp; +}; + +#endif /* __NES_MGT_H */ diff --git a/drivers/infiniband/hw/nes/nes_nic.c b/drivers/infiniband/hw/nes/nes_nic.c new file mode 100644 index 0000000..49eb511 --- /dev/null +++ b/drivers/infiniband/hw/nes/nes_nic.c @@ -0,0 +1,1882 @@ +/* + * Copyright (c) 2006 - 2011 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "nes.h" + +static struct nic_qp_map nic_qp_mapping_0[] = { + {16,0,0,1},{24,4,0,0},{28,8,0,0},{32,12,0,0}, + {20,2,2,1},{26,6,2,0},{30,10,2,0},{34,14,2,0}, + {18,1,1,1},{25,5,1,0},{29,9,1,0},{33,13,1,0}, + {22,3,3,1},{27,7,3,0},{31,11,3,0},{35,15,3,0} +}; + +static struct nic_qp_map nic_qp_mapping_1[] = { + {18,1,1,1},{25,5,1,0},{29,9,1,0},{33,13,1,0}, + {22,3,3,1},{27,7,3,0},{31,11,3,0},{35,15,3,0} +}; + +static struct nic_qp_map nic_qp_mapping_2[] = { + {20,2,2,1},{26,6,2,0},{30,10,2,0},{34,14,2,0} +}; + +static struct nic_qp_map nic_qp_mapping_3[] = { + {22,3,3,1},{27,7,3,0},{31,11,3,0},{35,15,3,0} +}; + +static struct nic_qp_map nic_qp_mapping_4[] = { + {28,8,0,0},{32,12,0,0} +}; + +static struct nic_qp_map nic_qp_mapping_5[] = { + {29,9,1,0},{33,13,1,0} +}; + +static struct nic_qp_map nic_qp_mapping_6[] = { + {30,10,2,0},{34,14,2,0} +}; + +static struct nic_qp_map nic_qp_mapping_7[] = { + {31,11,3,0},{35,15,3,0} +}; + +static struct nic_qp_map *nic_qp_mapping_per_function[] = { + nic_qp_mapping_0, nic_qp_mapping_1, nic_qp_mapping_2, nic_qp_mapping_3, + nic_qp_mapping_4, nic_qp_mapping_5, nic_qp_mapping_6, nic_qp_mapping_7 +}; + +static const u32 default_msg = NETIF_MSG_DRV | NETIF_MSG_PROBE | NETIF_MSG_LINK + | NETIF_MSG_IFUP | NETIF_MSG_IFDOWN; +static int debug = -1; +static int nics_per_function = 1; + +/** + * nes_netdev_poll + */ +static int nes_netdev_poll(struct napi_struct *napi, int budget) +{ + struct nes_vnic *nesvnic = container_of(napi, struct nes_vnic, napi); + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_hw_nic_cq *nescq = &nesvnic->nic_cq; + + nesvnic->budget = budget; + nescq->cqes_pending = 0; + nescq->rx_cqes_completed = 0; + nescq->cqe_allocs_pending = 0; + nescq->rx_pkts_indicated = 0; + + nes_nic_ce_handler(nesdev, nescq); + + if (nescq->cqes_pending == 0) { + napi_complete(napi); + /* clear out completed cqes and arm */ + nes_write32(nesdev->regs+NES_CQE_ALLOC, NES_CQE_ALLOC_NOTIFY_NEXT | + nescq->cq_number | (nescq->cqe_allocs_pending << 16)); + nes_read32(nesdev->regs+NES_CQE_ALLOC); + } else { + /* clear out completed cqes but don't arm */ + nes_write32(nesdev->regs+NES_CQE_ALLOC, + nescq->cq_number | (nescq->cqe_allocs_pending << 16)); + nes_debug(NES_DBG_NETDEV, "%s: exiting with work pending\n", + nesvnic->netdev->name); + } + return nescq->rx_pkts_indicated; +} + + +/** + * nes_netdev_open - Activate the network interface; ifconfig + * ethx up. + */ +static int nes_netdev_open(struct net_device *netdev) +{ + u32 macaddr_low; + u16 macaddr_high; + struct nes_vnic *nesvnic = netdev_priv(netdev); + struct nes_device *nesdev = nesvnic->nesdev; + int ret; + int i; + struct nes_vnic *first_nesvnic = NULL; + u32 nic_active_bit; + u32 nic_active; + struct list_head *list_pos, *list_temp; + unsigned long flags; + + assert(nesdev != NULL); + + if (nesvnic->netdev_open == 1) + return 0; + + if (netif_msg_ifup(nesvnic)) + printk(KERN_INFO PFX "%s: enabling interface\n", netdev->name); + + ret = nes_init_nic_qp(nesdev, netdev); + if (ret) { + return ret; + } + + netif_carrier_off(netdev); + netif_stop_queue(netdev); + + if ((!nesvnic->of_device_registered) && (nesvnic->rdma_enabled)) { + nesvnic->nesibdev = nes_init_ofa_device(netdev); + if (nesvnic->nesibdev == NULL) { + printk(KERN_ERR PFX "%s: nesvnic->nesibdev alloc failed", netdev->name); + } else { + nesvnic->nesibdev->nesvnic = nesvnic; + ret = nes_register_ofa_device(nesvnic->nesibdev); + if (ret) { + printk(KERN_ERR PFX "%s: Unable to register RDMA device, ret = %d\n", + netdev->name, ret); + } + } + } + /* Set packet filters */ + nic_active_bit = 1 << nesvnic->nic_index; + nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_ACTIVE); + nic_active |= nic_active_bit; + nes_write_indexed(nesdev, NES_IDX_NIC_ACTIVE, nic_active); + nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_MULTICAST_ENABLE); + nic_active |= nic_active_bit; + nes_write_indexed(nesdev, NES_IDX_NIC_MULTICAST_ENABLE, nic_active); + nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_BROADCAST_ON); + nic_active |= nic_active_bit; + nes_write_indexed(nesdev, NES_IDX_NIC_BROADCAST_ON, nic_active); + + macaddr_high = ((u16)netdev->dev_addr[0]) << 8; + macaddr_high += (u16)netdev->dev_addr[1]; + + macaddr_low = ((u32)netdev->dev_addr[2]) << 24; + macaddr_low += ((u32)netdev->dev_addr[3]) << 16; + macaddr_low += ((u32)netdev->dev_addr[4]) << 8; + macaddr_low += (u32)netdev->dev_addr[5]; + + /* Program the various MAC regs */ + for (i = 0; i < NES_MAX_PORT_COUNT; i++) { + if (nesvnic->qp_nic_index[i] == 0xf) { + break; + } + nes_debug(NES_DBG_NETDEV, "i=%d, perfect filter table index= %d, PERF FILTER LOW" + " (Addr:%08X) = %08X, HIGH = %08X.\n", + i, nesvnic->qp_nic_index[i], + NES_IDX_PERFECT_FILTER_LOW+ + (nesvnic->qp_nic_index[i] * 8), + macaddr_low, + (u32)macaddr_high | NES_MAC_ADDR_VALID | + ((((u32)nesvnic->nic_index) << 16))); + nes_write_indexed(nesdev, + NES_IDX_PERFECT_FILTER_LOW + (nesvnic->qp_nic_index[i] * 8), + macaddr_low); + nes_write_indexed(nesdev, + NES_IDX_PERFECT_FILTER_HIGH + (nesvnic->qp_nic_index[i] * 8), + (u32)macaddr_high | NES_MAC_ADDR_VALID | + ((((u32)nesvnic->nic_index) << 16))); + } + + + nes_write32(nesdev->regs+NES_CQE_ALLOC, NES_CQE_ALLOC_NOTIFY_NEXT | + nesvnic->nic_cq.cq_number); + nes_read32(nesdev->regs+NES_CQE_ALLOC); + list_for_each_safe(list_pos, list_temp, &nesdev->nesadapter->nesvnic_list[nesdev->mac_index]) { + first_nesvnic = container_of(list_pos, struct nes_vnic, list); + if (first_nesvnic->netdev_open == 1) + break; + } + if (first_nesvnic->netdev_open == 0) { + nes_debug(NES_DBG_INIT, "Setting up MAC interrupt mask.\n"); + nes_write_indexed(nesdev, NES_IDX_MAC_INT_MASK + (0x200 * nesdev->mac_index), + ~(NES_MAC_INT_LINK_STAT_CHG | NES_MAC_INT_XGMII_EXT | + NES_MAC_INT_TX_UNDERFLOW | NES_MAC_INT_TX_ERROR)); + first_nesvnic = nesvnic; + } + + if (first_nesvnic->linkup) { + /* Enable network packets */ + nesvnic->linkup = 1; + netif_start_queue(netdev); + netif_carrier_on(netdev); + } + + spin_lock_irqsave(&nesdev->nesadapter->phy_lock, flags); + if (nesdev->nesadapter->phy_type[nesdev->mac_index] == NES_PHY_TYPE_SFP_D) { + nesdev->link_recheck = 1; + mod_delayed_work(system_wq, &nesdev->work, + NES_LINK_RECHECK_DELAY); + } + spin_unlock_irqrestore(&nesdev->nesadapter->phy_lock, flags); + + spin_lock_irqsave(&nesvnic->port_ibevent_lock, flags); + if (nesvnic->of_device_registered) { + nesdev->nesadapter->send_term_ok = 1; + if (nesvnic->linkup == 1) { + if (nesdev->iw_status == 0) { + nesdev->iw_status = 1; + nes_port_ibevent(nesvnic); + } + } else { + nesdev->iw_status = 0; + } + } + spin_unlock_irqrestore(&nesvnic->port_ibevent_lock, flags); + + napi_enable(&nesvnic->napi); + nesvnic->netdev_open = 1; + + return 0; +} + + +/** + * nes_netdev_stop + */ +static int nes_netdev_stop(struct net_device *netdev) +{ + struct nes_vnic *nesvnic = netdev_priv(netdev); + struct nes_device *nesdev = nesvnic->nesdev; + u32 nic_active_mask; + u32 nic_active; + struct nes_vnic *first_nesvnic = NULL; + struct list_head *list_pos, *list_temp; + unsigned long flags; + + nes_debug(NES_DBG_SHUTDOWN, "nesvnic=%p, nesdev=%p, netdev=%p %s\n", + nesvnic, nesdev, netdev, netdev->name); + if (nesvnic->netdev_open == 0) + return 0; + + if (netif_msg_ifdown(nesvnic)) + printk(KERN_INFO PFX "%s: disabling interface\n", netdev->name); + netif_carrier_off(netdev); + + /* Disable network packets */ + napi_disable(&nesvnic->napi); + netif_stop_queue(netdev); + list_for_each_safe(list_pos, list_temp, &nesdev->nesadapter->nesvnic_list[nesdev->mac_index]) { + first_nesvnic = container_of(list_pos, struct nes_vnic, list); + if ((first_nesvnic->netdev_open == 1) && (first_nesvnic != nesvnic)) + break; + } + + if ((first_nesvnic->netdev_open == 1) && (first_nesvnic != nesvnic) && + (PCI_FUNC(first_nesvnic->nesdev->pcidev->devfn) != + PCI_FUNC(nesvnic->nesdev->pcidev->devfn))) { + nes_write_indexed(nesdev, NES_IDX_MAC_INT_MASK+ + (0x200*nesdev->mac_index), 0xffffffff); + nes_write_indexed(first_nesvnic->nesdev, + NES_IDX_MAC_INT_MASK+ + (0x200*first_nesvnic->nesdev->mac_index), + ~(NES_MAC_INT_LINK_STAT_CHG | NES_MAC_INT_XGMII_EXT | + NES_MAC_INT_TX_UNDERFLOW | NES_MAC_INT_TX_ERROR)); + } else { + nes_write_indexed(nesdev, NES_IDX_MAC_INT_MASK+(0x200*nesdev->mac_index), 0xffffffff); + } + + nic_active_mask = ~((u32)(1 << nesvnic->nic_index)); + nes_write_indexed(nesdev, NES_IDX_PERFECT_FILTER_HIGH+ + (nesvnic->perfect_filter_index*8), 0); + nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_ACTIVE); + nic_active &= nic_active_mask; + nes_write_indexed(nesdev, NES_IDX_NIC_ACTIVE, nic_active); + nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_MULTICAST_ALL); + nic_active &= nic_active_mask; + nes_write_indexed(nesdev, NES_IDX_NIC_MULTICAST_ALL, nic_active); + nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_MULTICAST_ENABLE); + nic_active &= nic_active_mask; + nes_write_indexed(nesdev, NES_IDX_NIC_MULTICAST_ENABLE, nic_active); + nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_UNICAST_ALL); + nic_active &= nic_active_mask; + nes_write_indexed(nesdev, NES_IDX_NIC_UNICAST_ALL, nic_active); + nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_BROADCAST_ON); + nic_active &= nic_active_mask; + nes_write_indexed(nesdev, NES_IDX_NIC_BROADCAST_ON, nic_active); + + spin_lock_irqsave(&nesvnic->port_ibevent_lock, flags); + if (nesvnic->of_device_registered) { + nesdev->nesadapter->send_term_ok = 0; + nesdev->iw_status = 0; + if (nesvnic->linkup == 1) + nes_port_ibevent(nesvnic); + } + del_timer_sync(&nesvnic->event_timer); + nesvnic->event_timer.function = NULL; + spin_unlock_irqrestore(&nesvnic->port_ibevent_lock, flags); + + nes_destroy_nic_qp(nesvnic); + + nesvnic->netdev_open = 0; + + return 0; +} + + +/** + * nes_nic_send + */ +static int nes_nic_send(struct sk_buff *skb, struct net_device *netdev) +{ + struct nes_vnic *nesvnic = netdev_priv(netdev); + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_hw_nic *nesnic = &nesvnic->nic; + struct nes_hw_nic_sq_wqe *nic_sqe; + struct tcphdr *tcph; + __le16 *wqe_fragment_length; + u32 wqe_misc; + u16 wqe_fragment_index = 1; /* first fragment (0) is used by copy buffer */ + u16 skb_fragment_index; + dma_addr_t bus_address; + + nic_sqe = &nesnic->sq_vbase[nesnic->sq_head]; + wqe_fragment_length = (__le16 *)&nic_sqe->wqe_words[NES_NIC_SQ_WQE_LENGTH_0_TAG_IDX]; + + /* setup the VLAN tag if present */ + if (vlan_tx_tag_present(skb)) { + nes_debug(NES_DBG_NIC_TX, "%s: VLAN packet to send... VLAN = %08X\n", + netdev->name, vlan_tx_tag_get(skb)); + wqe_misc = NES_NIC_SQ_WQE_TAGVALUE_ENABLE; + wqe_fragment_length[0] = (__force __le16) vlan_tx_tag_get(skb); + } else + wqe_misc = 0; + + /* bump past the vlan tag */ + wqe_fragment_length++; + /* wqe_fragment_address = (u64 *)&nic_sqe->wqe_words[NES_NIC_SQ_WQE_FRAG0_LOW_IDX]; */ + wqe_misc |= NES_NIC_SQ_WQE_COMPLETION; + + if (skb->ip_summed == CHECKSUM_PARTIAL) { + if (skb_is_gso(skb)) { + tcph = tcp_hdr(skb); + /* nes_debug(NES_DBG_NIC_TX, "%s: TSO request... is_gso = %u seg size = %u\n", + netdev->name, skb_is_gso(skb), skb_shinfo(skb)->gso_size); */ + wqe_misc |= NES_NIC_SQ_WQE_LSO_ENABLE | (u16)skb_shinfo(skb)->gso_size; + set_wqe_32bit_value(nic_sqe->wqe_words, NES_NIC_SQ_WQE_LSO_INFO_IDX, + ((u32)tcph->doff) | + (((u32)(((unsigned char *)tcph) - skb->data)) << 4)); + } + } else { /* CHECKSUM_HW */ + wqe_misc |= NES_NIC_SQ_WQE_DISABLE_CHKSUM; + } + + set_wqe_32bit_value(nic_sqe->wqe_words, NES_NIC_SQ_WQE_TOTAL_LENGTH_IDX, + skb->len); + memcpy(&nesnic->first_frag_vbase[nesnic->sq_head].buffer, + skb->data, min(((unsigned int)NES_FIRST_FRAG_SIZE), skb_headlen(skb))); + wqe_fragment_length[0] = cpu_to_le16(min(((unsigned int)NES_FIRST_FRAG_SIZE), + skb_headlen(skb))); + wqe_fragment_length[1] = 0; + if (skb_headlen(skb) > NES_FIRST_FRAG_SIZE) { + if ((skb_shinfo(skb)->nr_frags + 1) > 4) { + nes_debug(NES_DBG_NIC_TX, "%s: Packet with %u fragments not sent, skb_headlen=%u\n", + netdev->name, skb_shinfo(skb)->nr_frags + 2, skb_headlen(skb)); + kfree_skb(skb); + nesvnic->tx_sw_dropped++; + return NETDEV_TX_LOCKED; + } + set_bit(nesnic->sq_head, nesnic->first_frag_overflow); + bus_address = pci_map_single(nesdev->pcidev, skb->data + NES_FIRST_FRAG_SIZE, + skb_headlen(skb) - NES_FIRST_FRAG_SIZE, PCI_DMA_TODEVICE); + wqe_fragment_length[wqe_fragment_index++] = + cpu_to_le16(skb_headlen(skb) - NES_FIRST_FRAG_SIZE); + wqe_fragment_length[wqe_fragment_index] = 0; + set_wqe_64bit_value(nic_sqe->wqe_words, NES_NIC_SQ_WQE_FRAG1_LOW_IDX, + ((u64)(bus_address))); + nesnic->tx_skb[nesnic->sq_head] = skb; + } + + if (skb_headlen(skb) == skb->len) { + if (skb_headlen(skb) <= NES_FIRST_FRAG_SIZE) { + nic_sqe->wqe_words[NES_NIC_SQ_WQE_LENGTH_2_1_IDX] = 0; + nesnic->tx_skb[nesnic->sq_head] = skb; + } + } else { + /* Deal with Fragments */ + nesnic->tx_skb[nesnic->sq_head] = skb; + for (skb_fragment_index = 0; skb_fragment_index < skb_shinfo(skb)->nr_frags; + skb_fragment_index++) { + skb_frag_t *frag = + &skb_shinfo(skb)->frags[skb_fragment_index]; + bus_address = skb_frag_dma_map(&nesdev->pcidev->dev, + frag, 0, skb_frag_size(frag), + DMA_TO_DEVICE); + wqe_fragment_length[wqe_fragment_index] = + cpu_to_le16(skb_frag_size(&skb_shinfo(skb)->frags[skb_fragment_index])); + set_wqe_64bit_value(nic_sqe->wqe_words, NES_NIC_SQ_WQE_FRAG0_LOW_IDX+(2*wqe_fragment_index), + bus_address); + wqe_fragment_index++; + if (wqe_fragment_index < 5) + wqe_fragment_length[wqe_fragment_index] = 0; + } + } + + set_wqe_32bit_value(nic_sqe->wqe_words, NES_NIC_SQ_WQE_MISC_IDX, wqe_misc); + nesnic->sq_head++; + nesnic->sq_head &= nesnic->sq_size - 1; + + return NETDEV_TX_OK; +} + + +/** + * nes_netdev_start_xmit + */ +static int nes_netdev_start_xmit(struct sk_buff *skb, struct net_device *netdev) +{ + struct nes_vnic *nesvnic = netdev_priv(netdev); + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_hw_nic *nesnic = &nesvnic->nic; + struct nes_hw_nic_sq_wqe *nic_sqe; + struct tcphdr *tcph; + /* struct udphdr *udph; */ +#define NES_MAX_TSO_FRAGS MAX_SKB_FRAGS + /* 64K segment plus overflow on each side */ + dma_addr_t tso_bus_address[NES_MAX_TSO_FRAGS]; + dma_addr_t bus_address; + u32 tso_frag_index; + u32 tso_frag_count; + u32 tso_wqe_length; + u32 curr_tcp_seq; + u32 wqe_count=1; + u32 send_rc; + struct iphdr *iph; + __le16 *wqe_fragment_length; + u32 nr_frags; + u32 original_first_length; + /* u64 *wqe_fragment_address; */ + /* first fragment (0) is used by copy buffer */ + u16 wqe_fragment_index=1; + u16 hoffset; + u16 nhoffset; + u16 wqes_needed; + u16 wqes_available; + u32 wqe_misc; + + /* + * nes_debug(NES_DBG_NIC_TX, "%s Request to tx NIC packet length %u, headlen %u," + * " (%u frags), tso_size=%u\n", + * netdev->name, skb->len, skb_headlen(skb), + * skb_shinfo(skb)->nr_frags, skb_is_gso(skb)); + */ + + if (!netif_carrier_ok(netdev)) + return NETDEV_TX_OK; + + if (netif_queue_stopped(netdev)) + return NETDEV_TX_BUSY; + + /* Check if SQ is full */ + if ((((nesnic->sq_tail+(nesnic->sq_size*2))-nesnic->sq_head) & (nesnic->sq_size - 1)) == 1) { + if (!netif_queue_stopped(netdev)) { + netif_stop_queue(netdev); + barrier(); + if ((((((volatile u16)nesnic->sq_tail)+(nesnic->sq_size*2))-nesnic->sq_head) & (nesnic->sq_size - 1)) != 1) { + netif_start_queue(netdev); + goto sq_no_longer_full; + } + } + nesvnic->sq_full++; + return NETDEV_TX_BUSY; + } + +sq_no_longer_full: + nr_frags = skb_shinfo(skb)->nr_frags; + if (skb_headlen(skb) > NES_FIRST_FRAG_SIZE) { + nr_frags++; + } + /* Check if too many fragments */ + if (unlikely((nr_frags > 4))) { + if (skb_is_gso(skb)) { + nesvnic->segmented_tso_requests++; + nesvnic->tso_requests++; + /* Basically 4 fragments available per WQE with extended fragments */ + wqes_needed = nr_frags >> 2; + wqes_needed += (nr_frags&3)?1:0; + wqes_available = (((nesnic->sq_tail+nesnic->sq_size)-nesnic->sq_head) - 1) & + (nesnic->sq_size - 1); + + if (unlikely(wqes_needed > wqes_available)) { + if (!netif_queue_stopped(netdev)) { + netif_stop_queue(netdev); + barrier(); + wqes_available = (((((volatile u16)nesnic->sq_tail)+nesnic->sq_size)-nesnic->sq_head) - 1) & + (nesnic->sq_size - 1); + if (wqes_needed <= wqes_available) { + netif_start_queue(netdev); + goto tso_sq_no_longer_full; + } + } + nesvnic->sq_full++; + nes_debug(NES_DBG_NIC_TX, "%s: HNIC SQ full- TSO request has too many frags!\n", + netdev->name); + return NETDEV_TX_BUSY; + } +tso_sq_no_longer_full: + /* Map all the buffers */ + for (tso_frag_count=0; tso_frag_count < skb_shinfo(skb)->nr_frags; + tso_frag_count++) { + skb_frag_t *frag = + &skb_shinfo(skb)->frags[tso_frag_count]; + tso_bus_address[tso_frag_count] = + skb_frag_dma_map(&nesdev->pcidev->dev, + frag, 0, skb_frag_size(frag), + DMA_TO_DEVICE); + } + + tso_frag_index = 0; + curr_tcp_seq = ntohl(tcp_hdr(skb)->seq); + hoffset = skb_transport_header(skb) - skb->data; + nhoffset = skb_network_header(skb) - skb->data; + original_first_length = hoffset + ((((struct tcphdr *)skb_transport_header(skb))->doff)<<2); + + for (wqe_count=0; wqe_count<((u32)wqes_needed); wqe_count++) { + tso_wqe_length = 0; + nic_sqe = &nesnic->sq_vbase[nesnic->sq_head]; + wqe_fragment_length = + (__le16 *)&nic_sqe->wqe_words[NES_NIC_SQ_WQE_LENGTH_0_TAG_IDX]; + /* setup the VLAN tag if present */ + if (vlan_tx_tag_present(skb)) { + nes_debug(NES_DBG_NIC_TX, "%s: VLAN packet to send... VLAN = %08X\n", + netdev->name, vlan_tx_tag_get(skb) ); + wqe_misc = NES_NIC_SQ_WQE_TAGVALUE_ENABLE; + wqe_fragment_length[0] = (__force __le16) vlan_tx_tag_get(skb); + } else + wqe_misc = 0; + + /* bump past the vlan tag */ + wqe_fragment_length++; + + /* Assumes header totally fits in allocated buffer and is in first fragment */ + if (original_first_length > NES_FIRST_FRAG_SIZE) { + nes_debug(NES_DBG_NIC_TX, "ERROR: SKB header too big, headlen=%u, FIRST_FRAG_SIZE=%u\n", + original_first_length, NES_FIRST_FRAG_SIZE); + nes_debug(NES_DBG_NIC_TX, "%s Request to tx NIC packet length %u, headlen %u," + " (%u frags), is_gso = %u tso_size=%u\n", + netdev->name, + skb->len, skb_headlen(skb), + skb_shinfo(skb)->nr_frags, skb_is_gso(skb), skb_shinfo(skb)->gso_size); + } + memcpy(&nesnic->first_frag_vbase[nesnic->sq_head].buffer, + skb->data, min(((unsigned int)NES_FIRST_FRAG_SIZE), + original_first_length)); + iph = (struct iphdr *) + (&nesnic->first_frag_vbase[nesnic->sq_head].buffer[nhoffset]); + tcph = (struct tcphdr *) + (&nesnic->first_frag_vbase[nesnic->sq_head].buffer[hoffset]); + if ((wqe_count+1)!=(u32)wqes_needed) { + tcph->fin = 0; + tcph->psh = 0; + tcph->rst = 0; + tcph->urg = 0; + } + if (wqe_count) { + tcph->syn = 0; + } + tcph->seq = htonl(curr_tcp_seq); + wqe_fragment_length[0] = cpu_to_le16(min(((unsigned int)NES_FIRST_FRAG_SIZE), + original_first_length)); + + wqe_fragment_index = 1; + if ((wqe_count==0) && (skb_headlen(skb) > original_first_length)) { + set_bit(nesnic->sq_head, nesnic->first_frag_overflow); + bus_address = pci_map_single(nesdev->pcidev, skb->data + original_first_length, + skb_headlen(skb) - original_first_length, PCI_DMA_TODEVICE); + wqe_fragment_length[wqe_fragment_index++] = + cpu_to_le16(skb_headlen(skb) - original_first_length); + wqe_fragment_length[wqe_fragment_index] = 0; + set_wqe_64bit_value(nic_sqe->wqe_words, NES_NIC_SQ_WQE_FRAG1_LOW_IDX, + bus_address); + tso_wqe_length += skb_headlen(skb) - + original_first_length; + } + while (wqe_fragment_index < 5) { + wqe_fragment_length[wqe_fragment_index] = + cpu_to_le16(skb_frag_size(&skb_shinfo(skb)->frags[tso_frag_index])); + set_wqe_64bit_value(nic_sqe->wqe_words, NES_NIC_SQ_WQE_FRAG0_LOW_IDX+(2*wqe_fragment_index), + (u64)tso_bus_address[tso_frag_index]); + wqe_fragment_index++; + tso_wqe_length += skb_frag_size(&skb_shinfo(skb)->frags[tso_frag_index++]); + if (wqe_fragment_index < 5) + wqe_fragment_length[wqe_fragment_index] = 0; + if (tso_frag_index == tso_frag_count) + break; + } + if ((wqe_count+1) == (u32)wqes_needed) { + nesnic->tx_skb[nesnic->sq_head] = skb; + } else { + nesnic->tx_skb[nesnic->sq_head] = NULL; + } + wqe_misc |= NES_NIC_SQ_WQE_COMPLETION | (u16)skb_shinfo(skb)->gso_size; + if ((tso_wqe_length + original_first_length) > skb_shinfo(skb)->gso_size) { + wqe_misc |= NES_NIC_SQ_WQE_LSO_ENABLE; + } else { + iph->tot_len = htons(tso_wqe_length + original_first_length - nhoffset); + } + + set_wqe_32bit_value(nic_sqe->wqe_words, NES_NIC_SQ_WQE_MISC_IDX, + wqe_misc); + set_wqe_32bit_value(nic_sqe->wqe_words, NES_NIC_SQ_WQE_LSO_INFO_IDX, + ((u32)tcph->doff) | (((u32)hoffset) << 4)); + + set_wqe_32bit_value(nic_sqe->wqe_words, NES_NIC_SQ_WQE_TOTAL_LENGTH_IDX, + tso_wqe_length + original_first_length); + curr_tcp_seq += tso_wqe_length; + nesnic->sq_head++; + nesnic->sq_head &= nesnic->sq_size-1; + } + } else { + nesvnic->linearized_skbs++; + hoffset = skb_transport_header(skb) - skb->data; + nhoffset = skb_network_header(skb) - skb->data; + skb_linearize(skb); + skb_set_transport_header(skb, hoffset); + skb_set_network_header(skb, nhoffset); + send_rc = nes_nic_send(skb, netdev); + if (send_rc != NETDEV_TX_OK) + return NETDEV_TX_OK; + } + } else { + send_rc = nes_nic_send(skb, netdev); + if (send_rc != NETDEV_TX_OK) + return NETDEV_TX_OK; + } + + barrier(); + + if (wqe_count) + nes_write32(nesdev->regs+NES_WQE_ALLOC, + (wqe_count << 24) | (1 << 23) | nesvnic->nic.qp_id); + + netdev->trans_start = jiffies; + + return NETDEV_TX_OK; +} + + +/** + * nes_netdev_get_stats + */ +static struct net_device_stats *nes_netdev_get_stats(struct net_device *netdev) +{ + struct nes_vnic *nesvnic = netdev_priv(netdev); + struct nes_device *nesdev = nesvnic->nesdev; + u64 u64temp; + u32 u32temp; + + u32temp = nes_read_indexed(nesdev, + NES_IDX_ENDNODE0_NSTAT_RX_DISCARD + (nesvnic->nic_index*0x200)); + nesvnic->netstats.rx_dropped += u32temp; + nesvnic->endnode_nstat_rx_discard += u32temp; + + u64temp = (u64)nes_read_indexed(nesdev, + NES_IDX_ENDNODE0_NSTAT_RX_OCTETS_LO + (nesvnic->nic_index*0x200)); + u64temp += ((u64)nes_read_indexed(nesdev, + NES_IDX_ENDNODE0_NSTAT_RX_OCTETS_HI + (nesvnic->nic_index*0x200))) << 32; + + nesvnic->endnode_nstat_rx_octets += u64temp; + nesvnic->netstats.rx_bytes += u64temp; + + u64temp = (u64)nes_read_indexed(nesdev, + NES_IDX_ENDNODE0_NSTAT_RX_FRAMES_LO + (nesvnic->nic_index*0x200)); + u64temp += ((u64)nes_read_indexed(nesdev, + NES_IDX_ENDNODE0_NSTAT_RX_FRAMES_HI + (nesvnic->nic_index*0x200))) << 32; + + nesvnic->endnode_nstat_rx_frames += u64temp; + nesvnic->netstats.rx_packets += u64temp; + + u64temp = (u64)nes_read_indexed(nesdev, + NES_IDX_ENDNODE0_NSTAT_TX_OCTETS_LO + (nesvnic->nic_index*0x200)); + u64temp += ((u64)nes_read_indexed(nesdev, + NES_IDX_ENDNODE0_NSTAT_TX_OCTETS_HI + (nesvnic->nic_index*0x200))) << 32; + + nesvnic->endnode_nstat_tx_octets += u64temp; + nesvnic->netstats.tx_bytes += u64temp; + + u64temp = (u64)nes_read_indexed(nesdev, + NES_IDX_ENDNODE0_NSTAT_TX_FRAMES_LO + (nesvnic->nic_index*0x200)); + u64temp += ((u64)nes_read_indexed(nesdev, + NES_IDX_ENDNODE0_NSTAT_TX_FRAMES_HI + (nesvnic->nic_index*0x200))) << 32; + + nesvnic->endnode_nstat_tx_frames += u64temp; + nesvnic->netstats.tx_packets += u64temp; + + u32temp = nes_read_indexed(nesdev, + NES_IDX_MAC_RX_SHORT_FRAMES + (nesvnic->nesdev->mac_index*0x200)); + nesvnic->netstats.rx_dropped += u32temp; + nesvnic->nesdev->mac_rx_errors += u32temp; + nesvnic->nesdev->mac_rx_short_frames += u32temp; + + u32temp = nes_read_indexed(nesdev, + NES_IDX_MAC_RX_OVERSIZED_FRAMES + (nesvnic->nesdev->mac_index*0x200)); + nesvnic->netstats.rx_dropped += u32temp; + nesvnic->nesdev->mac_rx_errors += u32temp; + nesvnic->nesdev->mac_rx_oversized_frames += u32temp; + + u32temp = nes_read_indexed(nesdev, + NES_IDX_MAC_RX_JABBER_FRAMES + (nesvnic->nesdev->mac_index*0x200)); + nesvnic->netstats.rx_dropped += u32temp; + nesvnic->nesdev->mac_rx_errors += u32temp; + nesvnic->nesdev->mac_rx_jabber_frames += u32temp; + + u32temp = nes_read_indexed(nesdev, + NES_IDX_MAC_RX_SYMBOL_ERR_FRAMES + (nesvnic->nesdev->mac_index*0x200)); + nesvnic->netstats.rx_dropped += u32temp; + nesvnic->nesdev->mac_rx_errors += u32temp; + nesvnic->nesdev->mac_rx_symbol_err_frames += u32temp; + + u32temp = nes_read_indexed(nesdev, + NES_IDX_MAC_RX_LENGTH_ERR_FRAMES + (nesvnic->nesdev->mac_index*0x200)); + nesvnic->netstats.rx_length_errors += u32temp; + nesvnic->nesdev->mac_rx_errors += u32temp; + + u32temp = nes_read_indexed(nesdev, + NES_IDX_MAC_RX_CRC_ERR_FRAMES + (nesvnic->nesdev->mac_index*0x200)); + nesvnic->nesdev->mac_rx_errors += u32temp; + nesvnic->nesdev->mac_rx_crc_errors += u32temp; + nesvnic->netstats.rx_crc_errors += u32temp; + + u32temp = nes_read_indexed(nesdev, + NES_IDX_MAC_TX_ERRORS + (nesvnic->nesdev->mac_index*0x200)); + nesvnic->nesdev->mac_tx_errors += u32temp; + nesvnic->netstats.tx_errors += u32temp; + + return &nesvnic->netstats; +} + + +/** + * nes_netdev_tx_timeout + */ +static void nes_netdev_tx_timeout(struct net_device *netdev) +{ + struct nes_vnic *nesvnic = netdev_priv(netdev); + + if (netif_msg_timer(nesvnic)) + nes_debug(NES_DBG_NIC_TX, "%s: tx timeout\n", netdev->name); +} + + +/** + * nes_netdev_set_mac_address + */ +static int nes_netdev_set_mac_address(struct net_device *netdev, void *p) +{ + struct nes_vnic *nesvnic = netdev_priv(netdev); + struct nes_device *nesdev = nesvnic->nesdev; + struct sockaddr *mac_addr = p; + int i; + u32 macaddr_low; + u16 macaddr_high; + + if (!is_valid_ether_addr(mac_addr->sa_data)) + return -EADDRNOTAVAIL; + + memcpy(netdev->dev_addr, mac_addr->sa_data, netdev->addr_len); + printk(PFX "%s: Address length = %d, Address = %pM\n", + __func__, netdev->addr_len, mac_addr->sa_data); + macaddr_high = ((u16)netdev->dev_addr[0]) << 8; + macaddr_high += (u16)netdev->dev_addr[1]; + macaddr_low = ((u32)netdev->dev_addr[2]) << 24; + macaddr_low += ((u32)netdev->dev_addr[3]) << 16; + macaddr_low += ((u32)netdev->dev_addr[4]) << 8; + macaddr_low += (u32)netdev->dev_addr[5]; + + for (i = 0; i < NES_MAX_PORT_COUNT; i++) { + if (nesvnic->qp_nic_index[i] == 0xf) { + break; + } + nes_write_indexed(nesdev, + NES_IDX_PERFECT_FILTER_LOW + (nesvnic->qp_nic_index[i] * 8), + macaddr_low); + nes_write_indexed(nesdev, + NES_IDX_PERFECT_FILTER_HIGH + (nesvnic->qp_nic_index[i] * 8), + (u32)macaddr_high | NES_MAC_ADDR_VALID | + ((((u32)nesvnic->nic_index) << 16))); + } + return 0; +} + + +static void set_allmulti(struct nes_device *nesdev, u32 nic_active_bit) +{ + u32 nic_active; + + nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_MULTICAST_ALL); + nic_active |= nic_active_bit; + nes_write_indexed(nesdev, NES_IDX_NIC_MULTICAST_ALL, nic_active); + nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_UNICAST_ALL); + nic_active &= ~nic_active_bit; + nes_write_indexed(nesdev, NES_IDX_NIC_UNICAST_ALL, nic_active); +} + +#define get_addr(addrs, index) ((addrs) + (index) * ETH_ALEN) + +/** + * nes_netdev_set_multicast_list + */ +static void nes_netdev_set_multicast_list(struct net_device *netdev) +{ + struct nes_vnic *nesvnic = netdev_priv(netdev); + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_adapter *nesadapter = nesvnic->nesdev->nesadapter; + u32 nic_active_bit; + u32 nic_active; + u32 perfect_filter_register_address; + u32 macaddr_low; + u16 macaddr_high; + u8 mc_all_on = 0; + u8 mc_index; + int mc_nic_index = -1; + u8 pft_entries_preallocated = max(nesadapter->adapter_fcn_count * + nics_per_function, 4); + u8 max_pft_entries_avaiable = NES_PFT_SIZE - pft_entries_preallocated; + unsigned long flags; + int mc_count = netdev_mc_count(netdev); + + spin_lock_irqsave(&nesadapter->resource_lock, flags); + nic_active_bit = 1 << nesvnic->nic_index; + + if (netdev->flags & IFF_PROMISC) { + nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_MULTICAST_ALL); + nic_active |= nic_active_bit; + nes_write_indexed(nesdev, NES_IDX_NIC_MULTICAST_ALL, nic_active); + nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_UNICAST_ALL); + nic_active |= nic_active_bit; + nes_write_indexed(nesdev, NES_IDX_NIC_UNICAST_ALL, nic_active); + mc_all_on = 1; + } else if ((netdev->flags & IFF_ALLMULTI) || + (nesvnic->nic_index > 3)) { + set_allmulti(nesdev, nic_active_bit); + mc_all_on = 1; + } else { + nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_MULTICAST_ALL); + nic_active &= ~nic_active_bit; + nes_write_indexed(nesdev, NES_IDX_NIC_MULTICAST_ALL, nic_active); + nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_UNICAST_ALL); + nic_active &= ~nic_active_bit; + nes_write_indexed(nesdev, NES_IDX_NIC_UNICAST_ALL, nic_active); + } + + nes_debug(NES_DBG_NIC_RX, "Number of MC entries = %d, Promiscuous = %d, All Multicast = %d.\n", + mc_count, !!(netdev->flags & IFF_PROMISC), + !!(netdev->flags & IFF_ALLMULTI)); + if (!mc_all_on) { + char *addrs; + int i; + struct netdev_hw_addr *ha; + + addrs = kmalloc(ETH_ALEN * mc_count, GFP_ATOMIC); + if (!addrs) { + set_allmulti(nesdev, nic_active_bit); + goto unlock; + } + i = 0; + netdev_for_each_mc_addr(ha, netdev) + memcpy(get_addr(addrs, i++), ha->addr, ETH_ALEN); + + perfect_filter_register_address = NES_IDX_PERFECT_FILTER_LOW + + pft_entries_preallocated * 0x8; + for (i = 0, mc_index = 0; mc_index < max_pft_entries_avaiable; + mc_index++) { + while (i < mc_count && nesvnic->mcrq_mcast_filter && + ((mc_nic_index = nesvnic->mcrq_mcast_filter(nesvnic, + get_addr(addrs, i++))) == 0)); + if (mc_nic_index < 0) + mc_nic_index = nesvnic->nic_index; + while (nesadapter->pft_mcast_map[mc_index] < 16 && + nesadapter->pft_mcast_map[mc_index] != + nesvnic->nic_index && + mc_index < max_pft_entries_avaiable) { + nes_debug(NES_DBG_NIC_RX, + "mc_index=%d skipping nic_index=%d, " + "used for=%d \n", mc_index, + nesvnic->nic_index, + nesadapter->pft_mcast_map[mc_index]); + mc_index++; + } + if (mc_index >= max_pft_entries_avaiable) + break; + if (i < mc_count) { + char *addr = get_addr(addrs, i++); + + nes_debug(NES_DBG_NIC_RX, "Assigning MC Address %pM to register 0x%04X nic_idx=%d\n", + addr, + perfect_filter_register_address+(mc_index * 8), + mc_nic_index); + macaddr_high = ((u8) addr[0]) << 8; + macaddr_high += (u8) addr[1]; + macaddr_low = ((u8) addr[2]) << 24; + macaddr_low += ((u8) addr[3]) << 16; + macaddr_low += ((u8) addr[4]) << 8; + macaddr_low += (u8) addr[5]; + + nes_write_indexed(nesdev, + perfect_filter_register_address+(mc_index * 8), + macaddr_low); + nes_write_indexed(nesdev, + perfect_filter_register_address+4+(mc_index * 8), + (u32)macaddr_high | NES_MAC_ADDR_VALID | + ((((u32)(1<pft_mcast_map[mc_index] = + nesvnic->nic_index; + } else { + nes_debug(NES_DBG_NIC_RX, "Clearing MC Address at register 0x%04X\n", + perfect_filter_register_address+(mc_index * 8)); + nes_write_indexed(nesdev, + perfect_filter_register_address+4+(mc_index * 8), + 0); + nesadapter->pft_mcast_map[mc_index] = 255; + } + } + kfree(addrs); + /* PFT is not large enough */ + if (i < mc_count) + set_allmulti(nesdev, nic_active_bit); + } + +unlock: + spin_unlock_irqrestore(&nesadapter->resource_lock, flags); +} + + +/** + * nes_netdev_change_mtu + */ +static int nes_netdev_change_mtu(struct net_device *netdev, int new_mtu) +{ + struct nes_vnic *nesvnic = netdev_priv(netdev); + struct nes_device *nesdev = nesvnic->nesdev; + int ret = 0; + u8 jumbomode = 0; + u32 nic_active; + u32 nic_active_bit; + u32 uc_all_active; + u32 mc_all_active; + + if ((new_mtu < ETH_ZLEN) || (new_mtu > max_mtu)) + return -EINVAL; + + netdev->mtu = new_mtu; + nesvnic->max_frame_size = new_mtu + VLAN_ETH_HLEN; + + if (netdev->mtu > 1500) { + jumbomode=1; + } + nes_nic_init_timer_defaults(nesdev, jumbomode); + + if (netif_running(netdev)) { + nic_active_bit = 1 << nesvnic->nic_index; + mc_all_active = nes_read_indexed(nesdev, + NES_IDX_NIC_MULTICAST_ALL) & nic_active_bit; + uc_all_active = nes_read_indexed(nesdev, + NES_IDX_NIC_UNICAST_ALL) & nic_active_bit; + + nes_netdev_stop(netdev); + nes_netdev_open(netdev); + + nic_active = nes_read_indexed(nesdev, + NES_IDX_NIC_MULTICAST_ALL); + nic_active |= mc_all_active; + nes_write_indexed(nesdev, NES_IDX_NIC_MULTICAST_ALL, + nic_active); + + nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_UNICAST_ALL); + nic_active |= uc_all_active; + nes_write_indexed(nesdev, NES_IDX_NIC_UNICAST_ALL, nic_active); + } + + return ret; +} + + +static const char nes_ethtool_stringset[][ETH_GSTRING_LEN] = { + "Link Change Interrupts", + "Linearized SKBs", + "T/GSO Requests", + "Pause Frames Sent", + "Pause Frames Received", + "Internal Routing Errors", + "SQ SW Dropped SKBs", + "SQ Full", + "Segmented TSO Requests", + "Rx Symbol Errors", + "Rx Jabber Errors", + "Rx Oversized Frames", + "Rx Short Frames", + "Rx Length Errors", + "Rx CRC Errors", + "Rx Port Discard", + "Endnode Rx Discards", + "Endnode Rx Octets", + "Endnode Rx Frames", + "Endnode Tx Octets", + "Endnode Tx Frames", + "Tx Errors", + "mh detected", + "mh pauses", + "Retransmission Count", + "CM Connects", + "CM Accepts", + "Disconnects", + "Connected Events", + "Connect Requests", + "CM Rejects", + "ModifyQP Timeouts", + "CreateQPs", + "SW DestroyQPs", + "DestroyQPs", + "CM Closes", + "CM Packets Sent", + "CM Packets Bounced", + "CM Packets Created", + "CM Packets Rcvd", + "CM Packets Dropped", + "CM Packets Retrans", + "CM Listens Created", + "CM Listens Destroyed", + "CM Backlog Drops", + "CM Loopbacks", + "CM Nodes Created", + "CM Nodes Destroyed", + "CM Accel Drops", + "CM Resets Received", + "Free 4Kpbls", + "Free 256pbls", + "Timer Inits", + "LRO aggregated", + "LRO flushed", + "LRO no_desc", + "PAU CreateQPs", + "PAU DestroyQPs", +}; +#define NES_ETHTOOL_STAT_COUNT ARRAY_SIZE(nes_ethtool_stringset) + + +/** + * nes_netdev_get_sset_count + */ +static int nes_netdev_get_sset_count(struct net_device *netdev, int stringset) +{ + if (stringset == ETH_SS_STATS) + return NES_ETHTOOL_STAT_COUNT; + else + return -EINVAL; +} + + +/** + * nes_netdev_get_strings + */ +static void nes_netdev_get_strings(struct net_device *netdev, u32 stringset, + u8 *ethtool_strings) +{ + if (stringset == ETH_SS_STATS) + memcpy(ethtool_strings, + &nes_ethtool_stringset, + sizeof(nes_ethtool_stringset)); +} + + +/** + * nes_netdev_get_ethtool_stats + */ + +static void nes_netdev_get_ethtool_stats(struct net_device *netdev, + struct ethtool_stats *target_ethtool_stats, u64 *target_stat_values) +{ + u64 u64temp; + struct nes_vnic *nesvnic = netdev_priv(netdev); + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_adapter *nesadapter = nesdev->nesadapter; + u32 nic_count; + u32 u32temp; + u32 index = 0; + + target_ethtool_stats->n_stats = NES_ETHTOOL_STAT_COUNT; + target_stat_values[index] = nesvnic->nesdev->link_status_interrupts; + target_stat_values[++index] = nesvnic->linearized_skbs; + target_stat_values[++index] = nesvnic->tso_requests; + + u32temp = nes_read_indexed(nesdev, + NES_IDX_MAC_TX_PAUSE_FRAMES + (nesvnic->nesdev->mac_index*0x200)); + nesvnic->nesdev->mac_pause_frames_sent += u32temp; + target_stat_values[++index] = nesvnic->nesdev->mac_pause_frames_sent; + + u32temp = nes_read_indexed(nesdev, + NES_IDX_MAC_RX_PAUSE_FRAMES + (nesvnic->nesdev->mac_index*0x200)); + nesvnic->nesdev->mac_pause_frames_received += u32temp; + + u32temp = nes_read_indexed(nesdev, + NES_IDX_PORT_RX_DISCARDS + (nesvnic->nesdev->mac_index*0x40)); + nesvnic->nesdev->port_rx_discards += u32temp; + nesvnic->netstats.rx_dropped += u32temp; + + u32temp = nes_read_indexed(nesdev, + NES_IDX_PORT_TX_DISCARDS + (nesvnic->nesdev->mac_index*0x40)); + nesvnic->nesdev->port_tx_discards += u32temp; + nesvnic->netstats.tx_dropped += u32temp; + + u32temp = nes_read_indexed(nesdev, + NES_IDX_MAC_RX_SHORT_FRAMES + (nesvnic->nesdev->mac_index*0x200)); + nesvnic->netstats.rx_dropped += u32temp; + nesvnic->nesdev->mac_rx_errors += u32temp; + nesvnic->nesdev->mac_rx_short_frames += u32temp; + + u32temp = nes_read_indexed(nesdev, + NES_IDX_MAC_RX_OVERSIZED_FRAMES + (nesvnic->nesdev->mac_index*0x200)); + nesvnic->netstats.rx_dropped += u32temp; + nesvnic->nesdev->mac_rx_errors += u32temp; + nesvnic->nesdev->mac_rx_oversized_frames += u32temp; + + u32temp = nes_read_indexed(nesdev, + NES_IDX_MAC_RX_JABBER_FRAMES + (nesvnic->nesdev->mac_index*0x200)); + nesvnic->netstats.rx_dropped += u32temp; + nesvnic->nesdev->mac_rx_errors += u32temp; + nesvnic->nesdev->mac_rx_jabber_frames += u32temp; + + u32temp = nes_read_indexed(nesdev, + NES_IDX_MAC_RX_SYMBOL_ERR_FRAMES + (nesvnic->nesdev->mac_index*0x200)); + nesvnic->netstats.rx_dropped += u32temp; + nesvnic->nesdev->mac_rx_errors += u32temp; + nesvnic->nesdev->mac_rx_symbol_err_frames += u32temp; + + u32temp = nes_read_indexed(nesdev, + NES_IDX_MAC_RX_LENGTH_ERR_FRAMES + (nesvnic->nesdev->mac_index*0x200)); + nesvnic->netstats.rx_length_errors += u32temp; + nesvnic->nesdev->mac_rx_errors += u32temp; + + u32temp = nes_read_indexed(nesdev, + NES_IDX_MAC_RX_CRC_ERR_FRAMES + (nesvnic->nesdev->mac_index*0x200)); + nesvnic->nesdev->mac_rx_errors += u32temp; + nesvnic->nesdev->mac_rx_crc_errors += u32temp; + nesvnic->netstats.rx_crc_errors += u32temp; + + u32temp = nes_read_indexed(nesdev, + NES_IDX_MAC_TX_ERRORS + (nesvnic->nesdev->mac_index*0x200)); + nesvnic->nesdev->mac_tx_errors += u32temp; + nesvnic->netstats.tx_errors += u32temp; + + for (nic_count = 0; nic_count < NES_MAX_PORT_COUNT; nic_count++) { + if (nesvnic->qp_nic_index[nic_count] == 0xf) + break; + + u32temp = nes_read_indexed(nesdev, + NES_IDX_ENDNODE0_NSTAT_RX_DISCARD + + (nesvnic->qp_nic_index[nic_count]*0x200)); + nesvnic->netstats.rx_dropped += u32temp; + nesvnic->endnode_nstat_rx_discard += u32temp; + + u64temp = (u64)nes_read_indexed(nesdev, + NES_IDX_ENDNODE0_NSTAT_RX_OCTETS_LO + + (nesvnic->qp_nic_index[nic_count]*0x200)); + u64temp += ((u64)nes_read_indexed(nesdev, + NES_IDX_ENDNODE0_NSTAT_RX_OCTETS_HI + + (nesvnic->qp_nic_index[nic_count]*0x200))) << 32; + + nesvnic->endnode_nstat_rx_octets += u64temp; + nesvnic->netstats.rx_bytes += u64temp; + + u64temp = (u64)nes_read_indexed(nesdev, + NES_IDX_ENDNODE0_NSTAT_RX_FRAMES_LO + + (nesvnic->qp_nic_index[nic_count]*0x200)); + u64temp += ((u64)nes_read_indexed(nesdev, + NES_IDX_ENDNODE0_NSTAT_RX_FRAMES_HI + + (nesvnic->qp_nic_index[nic_count]*0x200))) << 32; + + nesvnic->endnode_nstat_rx_frames += u64temp; + nesvnic->netstats.rx_packets += u64temp; + + u64temp = (u64)nes_read_indexed(nesdev, + NES_IDX_ENDNODE0_NSTAT_TX_OCTETS_LO + + (nesvnic->qp_nic_index[nic_count]*0x200)); + u64temp += ((u64)nes_read_indexed(nesdev, + NES_IDX_ENDNODE0_NSTAT_TX_OCTETS_HI + + (nesvnic->qp_nic_index[nic_count]*0x200))) << 32; + + nesvnic->endnode_nstat_tx_octets += u64temp; + nesvnic->netstats.tx_bytes += u64temp; + + u64temp = (u64)nes_read_indexed(nesdev, + NES_IDX_ENDNODE0_NSTAT_TX_FRAMES_LO + + (nesvnic->qp_nic_index[nic_count]*0x200)); + u64temp += ((u64)nes_read_indexed(nesdev, + NES_IDX_ENDNODE0_NSTAT_TX_FRAMES_HI + + (nesvnic->qp_nic_index[nic_count]*0x200))) << 32; + + nesvnic->endnode_nstat_tx_frames += u64temp; + nesvnic->netstats.tx_packets += u64temp; + + u32temp = nes_read_indexed(nesdev, + NES_IDX_IPV4_TCP_REXMITS + (nesvnic->qp_nic_index[nic_count]*0x200)); + nesvnic->endnode_ipv4_tcp_retransmits += u32temp; + } + + target_stat_values[++index] = nesvnic->nesdev->mac_pause_frames_received; + target_stat_values[++index] = nesdev->nesadapter->nic_rx_eth_route_err; + target_stat_values[++index] = nesvnic->tx_sw_dropped; + target_stat_values[++index] = nesvnic->sq_full; + target_stat_values[++index] = nesvnic->segmented_tso_requests; + target_stat_values[++index] = nesvnic->nesdev->mac_rx_symbol_err_frames; + target_stat_values[++index] = nesvnic->nesdev->mac_rx_jabber_frames; + target_stat_values[++index] = nesvnic->nesdev->mac_rx_oversized_frames; + target_stat_values[++index] = nesvnic->nesdev->mac_rx_short_frames; + target_stat_values[++index] = nesvnic->netstats.rx_length_errors; + target_stat_values[++index] = nesvnic->nesdev->mac_rx_crc_errors; + target_stat_values[++index] = nesvnic->nesdev->port_rx_discards; + target_stat_values[++index] = nesvnic->endnode_nstat_rx_discard; + target_stat_values[++index] = nesvnic->endnode_nstat_rx_octets; + target_stat_values[++index] = nesvnic->endnode_nstat_rx_frames; + target_stat_values[++index] = nesvnic->endnode_nstat_tx_octets; + target_stat_values[++index] = nesvnic->endnode_nstat_tx_frames; + target_stat_values[++index] = nesvnic->nesdev->mac_tx_errors; + target_stat_values[++index] = mh_detected; + target_stat_values[++index] = mh_pauses_sent; + target_stat_values[++index] = nesvnic->endnode_ipv4_tcp_retransmits; + target_stat_values[++index] = atomic_read(&cm_connects); + target_stat_values[++index] = atomic_read(&cm_accepts); + target_stat_values[++index] = atomic_read(&cm_disconnects); + target_stat_values[++index] = atomic_read(&cm_connecteds); + target_stat_values[++index] = atomic_read(&cm_connect_reqs); + target_stat_values[++index] = atomic_read(&cm_rejects); + target_stat_values[++index] = atomic_read(&mod_qp_timouts); + target_stat_values[++index] = atomic_read(&qps_created); + target_stat_values[++index] = atomic_read(&sw_qps_destroyed); + target_stat_values[++index] = atomic_read(&qps_destroyed); + target_stat_values[++index] = atomic_read(&cm_closes); + target_stat_values[++index] = cm_packets_sent; + target_stat_values[++index] = cm_packets_bounced; + target_stat_values[++index] = cm_packets_created; + target_stat_values[++index] = cm_packets_received; + target_stat_values[++index] = cm_packets_dropped; + target_stat_values[++index] = cm_packets_retrans; + target_stat_values[++index] = atomic_read(&cm_listens_created); + target_stat_values[++index] = atomic_read(&cm_listens_destroyed); + target_stat_values[++index] = cm_backlog_drops; + target_stat_values[++index] = atomic_read(&cm_loopbacks); + target_stat_values[++index] = atomic_read(&cm_nodes_created); + target_stat_values[++index] = atomic_read(&cm_nodes_destroyed); + target_stat_values[++index] = atomic_read(&cm_accel_dropped_pkts); + target_stat_values[++index] = atomic_read(&cm_resets_recvd); + target_stat_values[++index] = nesadapter->free_4kpbl; + target_stat_values[++index] = nesadapter->free_256pbl; + target_stat_values[++index] = int_mod_timer_init; + target_stat_values[++index] = nesvnic->lro_mgr.stats.aggregated; + target_stat_values[++index] = nesvnic->lro_mgr.stats.flushed; + target_stat_values[++index] = nesvnic->lro_mgr.stats.no_desc; + target_stat_values[++index] = atomic_read(&pau_qps_created); + target_stat_values[++index] = atomic_read(&pau_qps_destroyed); +} + +/** + * nes_netdev_get_drvinfo + */ +static void nes_netdev_get_drvinfo(struct net_device *netdev, + struct ethtool_drvinfo *drvinfo) +{ + struct nes_vnic *nesvnic = netdev_priv(netdev); + struct nes_adapter *nesadapter = nesvnic->nesdev->nesadapter; + + strlcpy(drvinfo->driver, DRV_NAME, sizeof(drvinfo->driver)); + strlcpy(drvinfo->bus_info, pci_name(nesvnic->nesdev->pcidev), + sizeof(drvinfo->bus_info)); + snprintf(drvinfo->fw_version, sizeof(drvinfo->fw_version), + "%u.%u", nesadapter->firmware_version >> 16, + nesadapter->firmware_version & 0x000000ff); + strlcpy(drvinfo->version, DRV_VERSION, sizeof(drvinfo->version)); + drvinfo->testinfo_len = 0; + drvinfo->eedump_len = 0; + drvinfo->regdump_len = 0; +} + + +/** + * nes_netdev_set_coalesce + */ +static int nes_netdev_set_coalesce(struct net_device *netdev, + struct ethtool_coalesce *et_coalesce) +{ + struct nes_vnic *nesvnic = netdev_priv(netdev); + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_adapter *nesadapter = nesdev->nesadapter; + struct nes_hw_tune_timer *shared_timer = &nesadapter->tune_timer; + unsigned long flags; + + spin_lock_irqsave(&nesadapter->periodic_timer_lock, flags); + if (et_coalesce->rx_max_coalesced_frames_low) { + shared_timer->threshold_low = et_coalesce->rx_max_coalesced_frames_low; + } + if (et_coalesce->rx_max_coalesced_frames_irq) { + shared_timer->threshold_target = et_coalesce->rx_max_coalesced_frames_irq; + } + if (et_coalesce->rx_max_coalesced_frames_high) { + shared_timer->threshold_high = et_coalesce->rx_max_coalesced_frames_high; + } + if (et_coalesce->rx_coalesce_usecs_low) { + shared_timer->timer_in_use_min = et_coalesce->rx_coalesce_usecs_low; + } + if (et_coalesce->rx_coalesce_usecs_high) { + shared_timer->timer_in_use_max = et_coalesce->rx_coalesce_usecs_high; + } + spin_unlock_irqrestore(&nesadapter->periodic_timer_lock, flags); + + /* using this to drive total interrupt moderation */ + nesadapter->et_rx_coalesce_usecs_irq = et_coalesce->rx_coalesce_usecs_irq; + if (et_coalesce->use_adaptive_rx_coalesce) { + nesadapter->et_use_adaptive_rx_coalesce = 1; + nesadapter->timer_int_limit = NES_TIMER_INT_LIMIT_DYNAMIC; + nesadapter->et_rx_coalesce_usecs_irq = 0; + if (et_coalesce->pkt_rate_low) { + nesadapter->et_pkt_rate_low = et_coalesce->pkt_rate_low; + } + } else { + nesadapter->et_use_adaptive_rx_coalesce = 0; + nesadapter->timer_int_limit = NES_TIMER_INT_LIMIT; + if (nesadapter->et_rx_coalesce_usecs_irq) { + nes_write32(nesdev->regs+NES_PERIODIC_CONTROL, + 0x80000000 | ((u32)(nesadapter->et_rx_coalesce_usecs_irq*8))); + } + } + return 0; +} + + +/** + * nes_netdev_get_coalesce + */ +static int nes_netdev_get_coalesce(struct net_device *netdev, + struct ethtool_coalesce *et_coalesce) +{ + struct nes_vnic *nesvnic = netdev_priv(netdev); + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_adapter *nesadapter = nesdev->nesadapter; + struct ethtool_coalesce temp_et_coalesce; + struct nes_hw_tune_timer *shared_timer = &nesadapter->tune_timer; + unsigned long flags; + + memset(&temp_et_coalesce, 0, sizeof(temp_et_coalesce)); + temp_et_coalesce.rx_coalesce_usecs_irq = nesadapter->et_rx_coalesce_usecs_irq; + temp_et_coalesce.use_adaptive_rx_coalesce = nesadapter->et_use_adaptive_rx_coalesce; + temp_et_coalesce.rate_sample_interval = nesadapter->et_rate_sample_interval; + temp_et_coalesce.pkt_rate_low = nesadapter->et_pkt_rate_low; + spin_lock_irqsave(&nesadapter->periodic_timer_lock, flags); + temp_et_coalesce.rx_max_coalesced_frames_low = shared_timer->threshold_low; + temp_et_coalesce.rx_max_coalesced_frames_irq = shared_timer->threshold_target; + temp_et_coalesce.rx_max_coalesced_frames_high = shared_timer->threshold_high; + temp_et_coalesce.rx_coalesce_usecs_low = shared_timer->timer_in_use_min; + temp_et_coalesce.rx_coalesce_usecs_high = shared_timer->timer_in_use_max; + if (nesadapter->et_use_adaptive_rx_coalesce) { + temp_et_coalesce.rx_coalesce_usecs_irq = shared_timer->timer_in_use; + } + spin_unlock_irqrestore(&nesadapter->periodic_timer_lock, flags); + memcpy(et_coalesce, &temp_et_coalesce, sizeof(*et_coalesce)); + return 0; +} + + +/** + * nes_netdev_get_pauseparam + */ +static void nes_netdev_get_pauseparam(struct net_device *netdev, + struct ethtool_pauseparam *et_pauseparam) +{ + struct nes_vnic *nesvnic = netdev_priv(netdev); + + et_pauseparam->autoneg = 0; + et_pauseparam->rx_pause = (nesvnic->nesdev->disable_rx_flow_control == 0) ? 1:0; + et_pauseparam->tx_pause = (nesvnic->nesdev->disable_tx_flow_control == 0) ? 1:0; +} + + +/** + * nes_netdev_set_pauseparam + */ +static int nes_netdev_set_pauseparam(struct net_device *netdev, + struct ethtool_pauseparam *et_pauseparam) +{ + struct nes_vnic *nesvnic = netdev_priv(netdev); + struct nes_device *nesdev = nesvnic->nesdev; + u32 u32temp; + + if (et_pauseparam->autoneg) { + /* TODO: should return unsupported */ + return 0; + } + if ((et_pauseparam->tx_pause == 1) && (nesdev->disable_tx_flow_control == 1)) { + u32temp = nes_read_indexed(nesdev, + NES_IDX_MAC_TX_CONFIG + (nesdev->mac_index*0x200)); + u32temp |= NES_IDX_MAC_TX_CONFIG_ENABLE_PAUSE; + nes_write_indexed(nesdev, + NES_IDX_MAC_TX_CONFIG + (nesdev->mac_index*0x200), u32temp); + nesdev->disable_tx_flow_control = 0; + } else if ((et_pauseparam->tx_pause == 0) && (nesdev->disable_tx_flow_control == 0)) { + u32temp = nes_read_indexed(nesdev, + NES_IDX_MAC_TX_CONFIG + (nesdev->mac_index*0x200)); + u32temp &= ~NES_IDX_MAC_TX_CONFIG_ENABLE_PAUSE; + nes_write_indexed(nesdev, + NES_IDX_MAC_TX_CONFIG + (nesdev->mac_index*0x200), u32temp); + nesdev->disable_tx_flow_control = 1; + } + if ((et_pauseparam->rx_pause == 1) && (nesdev->disable_rx_flow_control == 1)) { + u32temp = nes_read_indexed(nesdev, + NES_IDX_MPP_DEBUG + (nesdev->mac_index*0x40)); + u32temp &= ~NES_IDX_MPP_DEBUG_PORT_DISABLE_PAUSE; + nes_write_indexed(nesdev, + NES_IDX_MPP_DEBUG + (nesdev->mac_index*0x40), u32temp); + nesdev->disable_rx_flow_control = 0; + } else if ((et_pauseparam->rx_pause == 0) && (nesdev->disable_rx_flow_control == 0)) { + u32temp = nes_read_indexed(nesdev, + NES_IDX_MPP_DEBUG + (nesdev->mac_index*0x40)); + u32temp |= NES_IDX_MPP_DEBUG_PORT_DISABLE_PAUSE; + nes_write_indexed(nesdev, + NES_IDX_MPP_DEBUG + (nesdev->mac_index*0x40), u32temp); + nesdev->disable_rx_flow_control = 1; + } + + return 0; +} + + +/** + * nes_netdev_get_settings + */ +static int nes_netdev_get_settings(struct net_device *netdev, struct ethtool_cmd *et_cmd) +{ + struct nes_vnic *nesvnic = netdev_priv(netdev); + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_adapter *nesadapter = nesdev->nesadapter; + u32 mac_index = nesdev->mac_index; + u8 phy_type = nesadapter->phy_type[mac_index]; + u8 phy_index = nesadapter->phy_index[mac_index]; + u16 phy_data; + + et_cmd->duplex = DUPLEX_FULL; + et_cmd->port = PORT_MII; + et_cmd->maxtxpkt = 511; + et_cmd->maxrxpkt = 511; + + if (nesadapter->OneG_Mode) { + ethtool_cmd_speed_set(et_cmd, SPEED_1000); + if (phy_type == NES_PHY_TYPE_PUMA_1G) { + et_cmd->supported = SUPPORTED_1000baseT_Full; + et_cmd->advertising = ADVERTISED_1000baseT_Full; + et_cmd->autoneg = AUTONEG_DISABLE; + et_cmd->transceiver = XCVR_INTERNAL; + et_cmd->phy_address = mac_index; + } else { + unsigned long flags; + et_cmd->supported = SUPPORTED_1000baseT_Full + | SUPPORTED_Autoneg; + et_cmd->advertising = ADVERTISED_1000baseT_Full + | ADVERTISED_Autoneg; + spin_lock_irqsave(&nesadapter->phy_lock, flags); + nes_read_1G_phy_reg(nesdev, 0, phy_index, &phy_data); + spin_unlock_irqrestore(&nesadapter->phy_lock, flags); + if (phy_data & 0x1000) + et_cmd->autoneg = AUTONEG_ENABLE; + else + et_cmd->autoneg = AUTONEG_DISABLE; + et_cmd->transceiver = XCVR_EXTERNAL; + et_cmd->phy_address = phy_index; + } + return 0; + } + if ((phy_type == NES_PHY_TYPE_ARGUS) || + (phy_type == NES_PHY_TYPE_SFP_D) || + (phy_type == NES_PHY_TYPE_KR)) { + et_cmd->transceiver = XCVR_EXTERNAL; + et_cmd->port = PORT_FIBRE; + et_cmd->supported = SUPPORTED_FIBRE; + et_cmd->advertising = ADVERTISED_FIBRE; + et_cmd->phy_address = phy_index; + } else { + et_cmd->transceiver = XCVR_INTERNAL; + et_cmd->supported = SUPPORTED_10000baseT_Full; + et_cmd->advertising = ADVERTISED_10000baseT_Full; + et_cmd->phy_address = mac_index; + } + ethtool_cmd_speed_set(et_cmd, SPEED_10000); + et_cmd->autoneg = AUTONEG_DISABLE; + return 0; +} + + +/** + * nes_netdev_set_settings + */ +static int nes_netdev_set_settings(struct net_device *netdev, struct ethtool_cmd *et_cmd) +{ + struct nes_vnic *nesvnic = netdev_priv(netdev); + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_adapter *nesadapter = nesdev->nesadapter; + + if ((nesadapter->OneG_Mode) && + (nesadapter->phy_type[nesdev->mac_index] != NES_PHY_TYPE_PUMA_1G)) { + unsigned long flags; + u16 phy_data; + u8 phy_index = nesadapter->phy_index[nesdev->mac_index]; + + spin_lock_irqsave(&nesadapter->phy_lock, flags); + nes_read_1G_phy_reg(nesdev, 0, phy_index, &phy_data); + if (et_cmd->autoneg) { + /* Turn on Full duplex, Autoneg, and restart autonegotiation */ + phy_data |= 0x1300; + } else { + /* Turn off autoneg */ + phy_data &= ~0x1000; + } + nes_write_1G_phy_reg(nesdev, 0, phy_index, phy_data); + spin_unlock_irqrestore(&nesadapter->phy_lock, flags); + } + + return 0; +} + + +static const struct ethtool_ops nes_ethtool_ops = { + .get_link = ethtool_op_get_link, + .get_settings = nes_netdev_get_settings, + .set_settings = nes_netdev_set_settings, + .get_strings = nes_netdev_get_strings, + .get_sset_count = nes_netdev_get_sset_count, + .get_ethtool_stats = nes_netdev_get_ethtool_stats, + .get_drvinfo = nes_netdev_get_drvinfo, + .get_coalesce = nes_netdev_get_coalesce, + .set_coalesce = nes_netdev_set_coalesce, + .get_pauseparam = nes_netdev_get_pauseparam, + .set_pauseparam = nes_netdev_set_pauseparam, +}; + +static void nes_vlan_mode(struct net_device *netdev, struct nes_device *nesdev, netdev_features_t features) +{ + struct nes_adapter *nesadapter = nesdev->nesadapter; + u32 u32temp; + unsigned long flags; + + spin_lock_irqsave(&nesadapter->phy_lock, flags); + + nes_debug(NES_DBG_NETDEV, "%s: %s\n", __func__, netdev->name); + + /* Enable/Disable VLAN Stripping */ + u32temp = nes_read_indexed(nesdev, NES_IDX_PCIX_DIAG); + if (features & NETIF_F_HW_VLAN_CTAG_RX) + u32temp &= 0xfdffffff; + else + u32temp |= 0x02000000; + + nes_write_indexed(nesdev, NES_IDX_PCIX_DIAG, u32temp); + spin_unlock_irqrestore(&nesadapter->phy_lock, flags); +} + +static netdev_features_t nes_fix_features(struct net_device *netdev, netdev_features_t features) +{ + /* + * Since there is no support for separate rx/tx vlan accel + * enable/disable make sure tx flag is always in same state as rx. + */ + if (features & NETIF_F_HW_VLAN_CTAG_RX) + features |= NETIF_F_HW_VLAN_CTAG_TX; + else + features &= ~NETIF_F_HW_VLAN_CTAG_TX; + + return features; +} + +static int nes_set_features(struct net_device *netdev, netdev_features_t features) +{ + struct nes_vnic *nesvnic = netdev_priv(netdev); + struct nes_device *nesdev = nesvnic->nesdev; + u32 changed = netdev->features ^ features; + + if (changed & NETIF_F_HW_VLAN_CTAG_RX) + nes_vlan_mode(netdev, nesdev, features); + + return 0; +} + +static const struct net_device_ops nes_netdev_ops = { + .ndo_open = nes_netdev_open, + .ndo_stop = nes_netdev_stop, + .ndo_start_xmit = nes_netdev_start_xmit, + .ndo_get_stats = nes_netdev_get_stats, + .ndo_tx_timeout = nes_netdev_tx_timeout, + .ndo_set_mac_address = nes_netdev_set_mac_address, + .ndo_set_rx_mode = nes_netdev_set_multicast_list, + .ndo_change_mtu = nes_netdev_change_mtu, + .ndo_validate_addr = eth_validate_addr, + .ndo_fix_features = nes_fix_features, + .ndo_set_features = nes_set_features, +}; + +/** + * nes_netdev_init - initialize network device + */ +struct net_device *nes_netdev_init(struct nes_device *nesdev, + void __iomem *mmio_addr) +{ + u64 u64temp; + struct nes_vnic *nesvnic; + struct net_device *netdev; + struct nic_qp_map *curr_qp_map; + u8 phy_type = nesdev->nesadapter->phy_type[nesdev->mac_index]; + + netdev = alloc_etherdev(sizeof(struct nes_vnic)); + if (!netdev) { + printk(KERN_ERR PFX "nesvnic etherdev alloc failed"); + return NULL; + } + nesvnic = netdev_priv(netdev); + + nes_debug(NES_DBG_INIT, "netdev = %p, %s\n", netdev, netdev->name); + + SET_NETDEV_DEV(netdev, &nesdev->pcidev->dev); + + netdev->watchdog_timeo = NES_TX_TIMEOUT; + netdev->irq = nesdev->pcidev->irq; + netdev->mtu = ETH_DATA_LEN; + netdev->hard_header_len = ETH_HLEN; + netdev->addr_len = ETH_ALEN; + netdev->type = ARPHRD_ETHER; + netdev->netdev_ops = &nes_netdev_ops; + netdev->ethtool_ops = &nes_ethtool_ops; + netif_napi_add(netdev, &nesvnic->napi, nes_netdev_poll, 128); + nes_debug(NES_DBG_INIT, "Enabling VLAN Insert/Delete.\n"); + + /* Fill in the port structure */ + nesvnic->netdev = netdev; + nesvnic->nesdev = nesdev; + nesvnic->msg_enable = netif_msg_init(debug, default_msg); + nesvnic->netdev_index = nesdev->netdev_count; + nesvnic->perfect_filter_index = nesdev->nesadapter->netdev_count; + nesvnic->max_frame_size = netdev->mtu + netdev->hard_header_len + VLAN_HLEN; + + curr_qp_map = nic_qp_mapping_per_function[PCI_FUNC(nesdev->pcidev->devfn)]; + nesvnic->nic.qp_id = curr_qp_map[nesdev->netdev_count].qpid; + nesvnic->nic_index = curr_qp_map[nesdev->netdev_count].nic_index; + nesvnic->logical_port = curr_qp_map[nesdev->netdev_count].logical_port; + + /* Setup the burned in MAC address */ + u64temp = (u64)nesdev->nesadapter->mac_addr_low; + u64temp += ((u64)nesdev->nesadapter->mac_addr_high) << 32; + u64temp += nesvnic->nic_index; + netdev->dev_addr[0] = (u8)(u64temp>>40); + netdev->dev_addr[1] = (u8)(u64temp>>32); + netdev->dev_addr[2] = (u8)(u64temp>>24); + netdev->dev_addr[3] = (u8)(u64temp>>16); + netdev->dev_addr[4] = (u8)(u64temp>>8); + netdev->dev_addr[5] = (u8)u64temp; + + netdev->hw_features = NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_RXCSUM | NETIF_F_HW_VLAN_CTAG_RX; + if ((nesvnic->logical_port < 2) || (nesdev->nesadapter->hw_rev != NE020_REV)) + netdev->hw_features |= NETIF_F_TSO; + + netdev->features = netdev->hw_features | NETIF_F_HIGHDMA | NETIF_F_HW_VLAN_CTAG_TX; + netdev->hw_features |= NETIF_F_LRO; + + nes_debug(NES_DBG_INIT, "nesvnic = %p, reported features = 0x%lX, QPid = %d," + " nic_index = %d, logical_port = %d, mac_index = %d.\n", + nesvnic, (unsigned long)netdev->features, nesvnic->nic.qp_id, + nesvnic->nic_index, nesvnic->logical_port, nesdev->mac_index); + + if (nesvnic->nesdev->nesadapter->port_count == 1 && + nesvnic->nesdev->nesadapter->adapter_fcn_count == 1) { + + nesvnic->qp_nic_index[0] = nesvnic->nic_index; + nesvnic->qp_nic_index[1] = nesvnic->nic_index + 1; + if (nes_drv_opt & NES_DRV_OPT_DUAL_LOGICAL_PORT) { + nesvnic->qp_nic_index[2] = 0xf; + nesvnic->qp_nic_index[3] = 0xf; + } else { + nesvnic->qp_nic_index[2] = nesvnic->nic_index + 2; + nesvnic->qp_nic_index[3] = nesvnic->nic_index + 3; + } + } else { + if (nesvnic->nesdev->nesadapter->port_count == 2 || + (nesvnic->nesdev->nesadapter->port_count == 1 && + nesvnic->nesdev->nesadapter->adapter_fcn_count == 2)) { + nesvnic->qp_nic_index[0] = nesvnic->nic_index; + nesvnic->qp_nic_index[1] = nesvnic->nic_index + + 2; + nesvnic->qp_nic_index[2] = 0xf; + nesvnic->qp_nic_index[3] = 0xf; + } else { + nesvnic->qp_nic_index[0] = nesvnic->nic_index; + nesvnic->qp_nic_index[1] = 0xf; + nesvnic->qp_nic_index[2] = 0xf; + nesvnic->qp_nic_index[3] = 0xf; + } + } + nesvnic->next_qp_nic_index = 0; + + if (nesdev->netdev_count == 0) { + nesvnic->rdma_enabled = 1; + } else { + nesvnic->rdma_enabled = 0; + } + nesvnic->nic_cq.cq_number = nesvnic->nic.qp_id; + init_timer(&nesvnic->event_timer); + nesvnic->event_timer.function = NULL; + spin_lock_init(&nesvnic->tx_lock); + spin_lock_init(&nesvnic->port_ibevent_lock); + nesdev->netdev[nesdev->netdev_count] = netdev; + + nes_debug(NES_DBG_INIT, "Adding nesvnic (%p) to the adapters nesvnic_list for MAC%d.\n", + nesvnic, nesdev->mac_index); + list_add_tail(&nesvnic->list, &nesdev->nesadapter->nesvnic_list[nesdev->mac_index]); + + if ((nesdev->netdev_count == 0) && + ((PCI_FUNC(nesdev->pcidev->devfn) == nesdev->mac_index) || + ((phy_type == NES_PHY_TYPE_PUMA_1G) && + (((PCI_FUNC(nesdev->pcidev->devfn) == 1) && (nesdev->mac_index == 2)) || + ((PCI_FUNC(nesdev->pcidev->devfn) == 2) && (nesdev->mac_index == 1)))))) { + u32 u32temp; + u32 link_mask = 0; + u32 link_val = 0; + u16 temp_phy_data; + u16 phy_data = 0; + unsigned long flags; + + u32temp = nes_read_indexed(nesdev, NES_IDX_PHY_PCS_CONTROL_STATUS0 + + (0x200 * (nesdev->mac_index & 1))); + if (phy_type != NES_PHY_TYPE_PUMA_1G) { + u32temp |= 0x00200000; + nes_write_indexed(nesdev, NES_IDX_PHY_PCS_CONTROL_STATUS0 + + (0x200 * (nesdev->mac_index & 1)), u32temp); + } + + /* Check and set linkup here. This is for back to back */ + /* configuration where second port won't get link interrupt */ + switch (phy_type) { + case NES_PHY_TYPE_PUMA_1G: + if (nesdev->mac_index < 2) { + link_mask = 0x01010000; + link_val = 0x01010000; + } else { + link_mask = 0x02020000; + link_val = 0x02020000; + } + break; + case NES_PHY_TYPE_SFP_D: + spin_lock_irqsave(&nesdev->nesadapter->phy_lock, flags); + nes_read_10G_phy_reg(nesdev, + nesdev->nesadapter->phy_index[nesdev->mac_index], + 1, 0x9003); + temp_phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL); + nes_read_10G_phy_reg(nesdev, + nesdev->nesadapter->phy_index[nesdev->mac_index], + 3, 0x0021); + nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL); + nes_read_10G_phy_reg(nesdev, + nesdev->nesadapter->phy_index[nesdev->mac_index], + 3, 0x0021); + phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL); + spin_unlock_irqrestore(&nesdev->nesadapter->phy_lock, flags); + phy_data = (!temp_phy_data && (phy_data == 0x8000)) ? 0x4 : 0x0; + break; + default: + link_mask = 0x0f1f0000; + link_val = 0x0f0f0000; + break; + } + + u32temp = nes_read_indexed(nesdev, + NES_IDX_PHY_PCS_CONTROL_STATUS0 + + (0x200 * (nesdev->mac_index & 1))); + + if (phy_type == NES_PHY_TYPE_SFP_D) { + if (phy_data & 0x0004) + nesvnic->linkup = 1; + } else { + if ((u32temp & link_mask) == link_val) + nesvnic->linkup = 1; + } + + /* clear the MAC interrupt status, assumes direct logical to physical mapping */ + u32temp = nes_read_indexed(nesdev, NES_IDX_MAC_INT_STATUS + (0x200 * nesdev->mac_index)); + nes_debug(NES_DBG_INIT, "Phy interrupt status = 0x%X.\n", u32temp); + nes_write_indexed(nesdev, NES_IDX_MAC_INT_STATUS + (0x200 * nesdev->mac_index), u32temp); + + nes_init_phy(nesdev); + } + + nes_vlan_mode(netdev, nesdev, netdev->features); + + return netdev; +} + + +/** + * nes_netdev_destroy - destroy network device structure + */ +void nes_netdev_destroy(struct net_device *netdev) +{ + struct nes_vnic *nesvnic = netdev_priv(netdev); + + /* make sure 'stop' method is called by Linux stack */ + /* nes_netdev_stop(netdev); */ + + list_del(&nesvnic->list); + + if (nesvnic->of_device_registered) { + nes_destroy_ofa_device(nesvnic->nesibdev); + } + + free_netdev(netdev); +} + + +/** + * nes_nic_cm_xmit -- CM calls this to send out pkts + */ +int nes_nic_cm_xmit(struct sk_buff *skb, struct net_device *netdev) +{ + int ret; + + skb->dev = netdev; + ret = dev_queue_xmit(skb); + if (ret) { + nes_debug(NES_DBG_CM, "Bad return code from dev_queue_xmit %d\n", ret); + } + + return ret; +} diff --git a/drivers/infiniband/hw/nes/nes_user.h b/drivers/infiniband/hw/nes/nes_user.h new file mode 100644 index 0000000..529c421 --- /dev/null +++ b/drivers/infiniband/hw/nes/nes_user.h @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2006 - 2011 Intel Corporation. All rights reserved. + * Copyright (c) 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Cisco Systems. All rights reserved. + * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef NES_USER_H +#define NES_USER_H + +#include + +#define NES_ABI_USERSPACE_VER 2 +#define NES_ABI_KERNEL_VER 2 + +/* + * Make sure that all structs defined in this file remain laid out so + * that they pack the same way on 32-bit and 64-bit architectures (to + * avoid incompatibility between 32-bit userspace and 64-bit kernels). + * In particular do not use pointer types -- pass pointers in __u64 + * instead. + */ + +struct nes_alloc_ucontext_req { + __u32 reserved32; + __u8 userspace_ver; + __u8 reserved8[3]; +}; + +struct nes_alloc_ucontext_resp { + __u32 max_pds; /* maximum pds allowed for this user process */ + __u32 max_qps; /* maximum qps allowed for this user process */ + __u32 wq_size; /* size of the WQs (sq+rq) allocated to the mmaped area */ + __u8 virtwq; /* flag to indicate if virtual WQ are to be used or not */ + __u8 kernel_ver; + __u8 reserved[2]; +}; + +struct nes_alloc_pd_resp { + __u32 pd_id; + __u32 mmap_db_index; +}; + +struct nes_create_cq_req { + __u64 user_cq_buffer; + __u32 mcrqf; + __u8 reserved[4]; +}; + +struct nes_create_qp_req { + __u64 user_wqe_buffers; + __u64 user_qp_buffer; +}; + +enum iwnes_memreg_type { + IWNES_MEMREG_TYPE_MEM = 0x0000, + IWNES_MEMREG_TYPE_QP = 0x0001, + IWNES_MEMREG_TYPE_CQ = 0x0002, + IWNES_MEMREG_TYPE_MW = 0x0003, + IWNES_MEMREG_TYPE_FMR = 0x0004, + IWNES_MEMREG_TYPE_FMEM = 0x0005, +}; + +struct nes_mem_reg_req { + __u32 reg_type; /* indicates if id is memory, QP or CQ */ + __u32 reserved; +}; + +struct nes_create_cq_resp { + __u32 cq_id; + __u32 cq_size; + __u32 mmap_db_index; + __u32 reserved; +}; + +struct nes_create_qp_resp { + __u32 qp_id; + __u32 actual_sq_size; + __u32 actual_rq_size; + __u32 mmap_sq_db_index; + __u32 mmap_rq_db_index; + __u32 nes_drv_opt; +}; + +#endif /* NES_USER_H */ diff --git a/drivers/infiniband/hw/nes/nes_utils.c b/drivers/infiniband/hw/nes/nes_utils.c new file mode 100644 index 0000000..2042c0f --- /dev/null +++ b/drivers/infiniband/hw/nes/nes_utils.c @@ -0,0 +1,972 @@ +/* + * Copyright (c) 2006 - 2011 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "nes.h" + +static u16 nes_read16_eeprom(void __iomem *addr, u16 offset); + +u32 mh_detected; +u32 mh_pauses_sent; + +static u32 nes_set_pau(struct nes_device *nesdev) +{ + u32 ret = 0; + u32 counter; + + nes_write_indexed(nesdev, NES_IDX_GPR2, NES_ENABLE_PAU); + nes_write_indexed(nesdev, NES_IDX_GPR_TRIGGER, 1); + + for (counter = 0; counter < NES_PAU_COUNTER; counter++) { + udelay(30); + if (!nes_read_indexed(nesdev, NES_IDX_GPR2)) { + printk(KERN_INFO PFX "PAU is supported.\n"); + break; + } + nes_write_indexed(nesdev, NES_IDX_GPR_TRIGGER, 1); + } + if (counter == NES_PAU_COUNTER) { + printk(KERN_INFO PFX "PAU is not supported.\n"); + return -EPERM; + } + return ret; +} + +/** + * nes_read_eeprom_values - + */ +int nes_read_eeprom_values(struct nes_device *nesdev, struct nes_adapter *nesadapter) +{ + u32 mac_addr_low; + u16 mac_addr_high; + u16 eeprom_data; + u16 eeprom_offset; + u16 next_section_address; + u16 sw_section_ver; + u8 major_ver = 0; + u8 minor_ver = 0; + + /* TODO: deal with EEPROM endian issues */ + if (nesadapter->firmware_eeprom_offset == 0) { + /* Read the EEPROM Parameters */ + eeprom_data = nes_read16_eeprom(nesdev->regs, 0); + nes_debug(NES_DBG_HW, "EEPROM Offset 0 = 0x%04X\n", eeprom_data); + eeprom_offset = 2 + (((eeprom_data & 0x007f) << 3) << + ((eeprom_data & 0x0080) >> 7)); + nes_debug(NES_DBG_HW, "Firmware Offset = 0x%04X\n", eeprom_offset); + nesadapter->firmware_eeprom_offset = eeprom_offset; + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset + 4); + if (eeprom_data != 0x5746) { + nes_debug(NES_DBG_HW, "Not a valid Firmware Image = 0x%04X\n", eeprom_data); + return -1; + } + + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset + 2); + nes_debug(NES_DBG_HW, "EEPROM Offset %u = 0x%04X\n", + eeprom_offset + 2, eeprom_data); + eeprom_offset += ((eeprom_data & 0x00ff) << 3) << ((eeprom_data & 0x0100) >> 8); + nes_debug(NES_DBG_HW, "Software Offset = 0x%04X\n", eeprom_offset); + nesadapter->software_eeprom_offset = eeprom_offset; + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset + 4); + if (eeprom_data != 0x5753) { + printk("Not a valid Software Image = 0x%04X\n", eeprom_data); + return -1; + } + sw_section_ver = nes_read16_eeprom(nesdev->regs, nesadapter->software_eeprom_offset + 6); + nes_debug(NES_DBG_HW, "Software section version number = 0x%04X\n", + sw_section_ver); + + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset + 2); + nes_debug(NES_DBG_HW, "EEPROM Offset %u (next section) = 0x%04X\n", + eeprom_offset + 2, eeprom_data); + next_section_address = eeprom_offset + (((eeprom_data & 0x00ff) << 3) << + ((eeprom_data & 0x0100) >> 8)); + eeprom_data = nes_read16_eeprom(nesdev->regs, next_section_address + 4); + if (eeprom_data != 0x414d) { + nes_debug(NES_DBG_HW, "EEPROM Changed offset should be 0x414d but was 0x%04X\n", + eeprom_data); + goto no_fw_rev; + } + eeprom_offset = next_section_address; + + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset + 2); + nes_debug(NES_DBG_HW, "EEPROM Offset %u (next section) = 0x%04X\n", + eeprom_offset + 2, eeprom_data); + next_section_address = eeprom_offset + (((eeprom_data & 0x00ff) << 3) << + ((eeprom_data & 0x0100) >> 8)); + eeprom_data = nes_read16_eeprom(nesdev->regs, next_section_address + 4); + if (eeprom_data != 0x4f52) { + nes_debug(NES_DBG_HW, "EEPROM Changed offset should be 0x4f52 but was 0x%04X\n", + eeprom_data); + goto no_fw_rev; + } + eeprom_offset = next_section_address; + + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset + 2); + nes_debug(NES_DBG_HW, "EEPROM Offset %u (next section) = 0x%04X\n", + eeprom_offset + 2, eeprom_data); + next_section_address = eeprom_offset + ((eeprom_data & 0x00ff) << 3); + eeprom_data = nes_read16_eeprom(nesdev->regs, next_section_address + 4); + if (eeprom_data != 0x5746) { + nes_debug(NES_DBG_HW, "EEPROM Changed offset should be 0x5746 but was 0x%04X\n", + eeprom_data); + goto no_fw_rev; + } + eeprom_offset = next_section_address; + + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset + 2); + nes_debug(NES_DBG_HW, "EEPROM Offset %u (next section) = 0x%04X\n", + eeprom_offset + 2, eeprom_data); + next_section_address = eeprom_offset + ((eeprom_data & 0x00ff) << 3); + eeprom_data = nes_read16_eeprom(nesdev->regs, next_section_address + 4); + if (eeprom_data != 0x5753) { + nes_debug(NES_DBG_HW, "EEPROM Changed offset should be 0x5753 but was 0x%04X\n", + eeprom_data); + goto no_fw_rev; + } + eeprom_offset = next_section_address; + + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset + 2); + nes_debug(NES_DBG_HW, "EEPROM Offset %u (next section) = 0x%04X\n", + eeprom_offset + 2, eeprom_data); + next_section_address = eeprom_offset + ((eeprom_data & 0x00ff) << 3); + eeprom_data = nes_read16_eeprom(nesdev->regs, next_section_address + 4); + if (eeprom_data != 0x414d) { + nes_debug(NES_DBG_HW, "EEPROM Changed offset should be 0x414d but was 0x%04X\n", + eeprom_data); + goto no_fw_rev; + } + eeprom_offset = next_section_address; + + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset + 2); + nes_debug(NES_DBG_HW, "EEPROM Offset %u (next section) = 0x%04X\n", + eeprom_offset + 2, eeprom_data); + next_section_address = eeprom_offset + ((eeprom_data & 0x00ff) << 3); + eeprom_data = nes_read16_eeprom(nesdev->regs, next_section_address + 4); + if (eeprom_data != 0x464e) { + nes_debug(NES_DBG_HW, "EEPROM Changed offset should be 0x464e but was 0x%04X\n", + eeprom_data); + goto no_fw_rev; + } + eeprom_data = nes_read16_eeprom(nesdev->regs, next_section_address + 8); + printk(PFX "Firmware version %u.%u\n", (u8)(eeprom_data>>8), (u8)eeprom_data); + major_ver = (u8)(eeprom_data >> 8); + minor_ver = (u8)(eeprom_data); + + if (nes_drv_opt & NES_DRV_OPT_DISABLE_VIRT_WQ) { + nes_debug(NES_DBG_HW, "Virtual WQs have been disabled\n"); + } else if (((major_ver == 2) && (minor_ver > 21)) || ((major_ver > 2) && (major_ver != 255))) { + nesadapter->virtwq = 1; + } + if (((major_ver == 3) && (minor_ver >= 16)) || (major_ver > 3)) + nesadapter->send_term_ok = 1; + + if (nes_drv_opt & NES_DRV_OPT_ENABLE_PAU) { + if (!nes_set_pau(nesdev)) + nesadapter->allow_unaligned_fpdus = 1; + } + + nesadapter->firmware_version = (((u32)(u8)(eeprom_data>>8)) << 16) + + (u32)((u8)eeprom_data); + + eeprom_data = nes_read16_eeprom(nesdev->regs, next_section_address + 10); + printk(PFX "EEPROM version %u.%u\n", (u8)(eeprom_data>>8), (u8)eeprom_data); + nesadapter->eeprom_version = (((u32)(u8)(eeprom_data>>8)) << 16) + + (u32)((u8)eeprom_data); + +no_fw_rev: + /* eeprom is valid */ + eeprom_offset = nesadapter->software_eeprom_offset; + eeprom_offset += 8; + nesadapter->netdev_max = (u8)nes_read16_eeprom(nesdev->regs, eeprom_offset); + eeprom_offset += 2; + mac_addr_high = nes_read16_eeprom(nesdev->regs, eeprom_offset); + eeprom_offset += 2; + mac_addr_low = (u32)nes_read16_eeprom(nesdev->regs, eeprom_offset); + eeprom_offset += 2; + mac_addr_low <<= 16; + mac_addr_low += (u32)nes_read16_eeprom(nesdev->regs, eeprom_offset); + nes_debug(NES_DBG_HW, "Base MAC Address = 0x%04X%08X\n", + mac_addr_high, mac_addr_low); + nes_debug(NES_DBG_HW, "MAC Address count = %u\n", nesadapter->netdev_max); + + nesadapter->mac_addr_low = mac_addr_low; + nesadapter->mac_addr_high = mac_addr_high; + + /* Read the Phy Type array */ + eeprom_offset += 10; + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset); + nesadapter->phy_type[0] = (u8)(eeprom_data >> 8); + nesadapter->phy_type[1] = (u8)eeprom_data; + + /* Read the port array */ + eeprom_offset += 2; + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset); + nesadapter->phy_type[2] = (u8)(eeprom_data >> 8); + nesadapter->phy_type[3] = (u8)eeprom_data; + /* port_count is set by soft reset reg */ + nes_debug(NES_DBG_HW, "port_count = %u, port 0 -> %u, port 1 -> %u," + " port 2 -> %u, port 3 -> %u\n", + nesadapter->port_count, + nesadapter->phy_type[0], nesadapter->phy_type[1], + nesadapter->phy_type[2], nesadapter->phy_type[3]); + + /* Read PD config array */ + eeprom_offset += 10; + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset); + nesadapter->pd_config_size[0] = eeprom_data; + eeprom_offset += 2; + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset); + nesadapter->pd_config_base[0] = eeprom_data; + nes_debug(NES_DBG_HW, "PD0 config, size=0x%04x, base=0x%04x\n", + nesadapter->pd_config_size[0], nesadapter->pd_config_base[0]); + + eeprom_offset += 2; + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset); + nesadapter->pd_config_size[1] = eeprom_data; + eeprom_offset += 2; + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset); + nesadapter->pd_config_base[1] = eeprom_data; + nes_debug(NES_DBG_HW, "PD1 config, size=0x%04x, base=0x%04x\n", + nesadapter->pd_config_size[1], nesadapter->pd_config_base[1]); + + eeprom_offset += 2; + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset); + nesadapter->pd_config_size[2] = eeprom_data; + eeprom_offset += 2; + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset); + nesadapter->pd_config_base[2] = eeprom_data; + nes_debug(NES_DBG_HW, "PD2 config, size=0x%04x, base=0x%04x\n", + nesadapter->pd_config_size[2], nesadapter->pd_config_base[2]); + + eeprom_offset += 2; + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset); + nesadapter->pd_config_size[3] = eeprom_data; + eeprom_offset += 2; + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset); + nesadapter->pd_config_base[3] = eeprom_data; + nes_debug(NES_DBG_HW, "PD3 config, size=0x%04x, base=0x%04x\n", + nesadapter->pd_config_size[3], nesadapter->pd_config_base[3]); + + /* Read Rx Pool Size */ + eeprom_offset += 22; /* 46 */ + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset); + eeprom_offset += 2; + nesadapter->rx_pool_size = (((u32)eeprom_data) << 16) + + nes_read16_eeprom(nesdev->regs, eeprom_offset); + nes_debug(NES_DBG_HW, "rx_pool_size = 0x%08X\n", nesadapter->rx_pool_size); + + eeprom_offset += 2; + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset); + eeprom_offset += 2; + nesadapter->tx_pool_size = (((u32)eeprom_data) << 16) + + nes_read16_eeprom(nesdev->regs, eeprom_offset); + nes_debug(NES_DBG_HW, "tx_pool_size = 0x%08X\n", nesadapter->tx_pool_size); + + eeprom_offset += 2; + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset); + eeprom_offset += 2; + nesadapter->rx_threshold = (((u32)eeprom_data) << 16) + + nes_read16_eeprom(nesdev->regs, eeprom_offset); + nes_debug(NES_DBG_HW, "rx_threshold = 0x%08X\n", nesadapter->rx_threshold); + + eeprom_offset += 2; + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset); + eeprom_offset += 2; + nesadapter->tcp_timer_core_clk_divisor = (((u32)eeprom_data) << 16) + + nes_read16_eeprom(nesdev->regs, eeprom_offset); + nes_debug(NES_DBG_HW, "tcp_timer_core_clk_divisor = 0x%08X\n", + nesadapter->tcp_timer_core_clk_divisor); + + eeprom_offset += 2; + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset); + eeprom_offset += 2; + nesadapter->iwarp_config = (((u32)eeprom_data) << 16) + + nes_read16_eeprom(nesdev->regs, eeprom_offset); + nes_debug(NES_DBG_HW, "iwarp_config = 0x%08X\n", nesadapter->iwarp_config); + + eeprom_offset += 2; + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset); + eeprom_offset += 2; + nesadapter->cm_config = (((u32)eeprom_data) << 16) + + nes_read16_eeprom(nesdev->regs, eeprom_offset); + nes_debug(NES_DBG_HW, "cm_config = 0x%08X\n", nesadapter->cm_config); + + eeprom_offset += 2; + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset); + eeprom_offset += 2; + nesadapter->sws_timer_config = (((u32)eeprom_data) << 16) + + nes_read16_eeprom(nesdev->regs, eeprom_offset); + nes_debug(NES_DBG_HW, "sws_timer_config = 0x%08X\n", nesadapter->sws_timer_config); + + eeprom_offset += 2; + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset); + eeprom_offset += 2; + nesadapter->tcp_config1 = (((u32)eeprom_data) << 16) + + nes_read16_eeprom(nesdev->regs, eeprom_offset); + nes_debug(NES_DBG_HW, "tcp_config1 = 0x%08X\n", nesadapter->tcp_config1); + + eeprom_offset += 2; + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset); + eeprom_offset += 2; + nesadapter->wqm_wat = (((u32)eeprom_data) << 16) + + nes_read16_eeprom(nesdev->regs, eeprom_offset); + nes_debug(NES_DBG_HW, "wqm_wat = 0x%08X\n", nesadapter->wqm_wat); + + eeprom_offset += 2; + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset); + eeprom_offset += 2; + nesadapter->core_clock = (((u32)eeprom_data) << 16) + + nes_read16_eeprom(nesdev->regs, eeprom_offset); + nes_debug(NES_DBG_HW, "core_clock = 0x%08X\n", nesadapter->core_clock); + + if ((sw_section_ver) && (nesadapter->hw_rev != NE020_REV)) { + eeprom_offset += 2; + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset); + nesadapter->phy_index[0] = (eeprom_data & 0xff00)>>8; + nesadapter->phy_index[1] = eeprom_data & 0x00ff; + eeprom_offset += 2; + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset); + nesadapter->phy_index[2] = (eeprom_data & 0xff00)>>8; + nesadapter->phy_index[3] = eeprom_data & 0x00ff; + } else { + nesadapter->phy_index[0] = 4; + nesadapter->phy_index[1] = 5; + nesadapter->phy_index[2] = 6; + nesadapter->phy_index[3] = 7; + } + nes_debug(NES_DBG_HW, "Phy address map = 0 > %u, 1 > %u, 2 > %u, 3 > %u\n", + nesadapter->phy_index[0],nesadapter->phy_index[1], + nesadapter->phy_index[2],nesadapter->phy_index[3]); + } + + return 0; +} + + +/** + * nes_read16_eeprom + */ +static u16 nes_read16_eeprom(void __iomem *addr, u16 offset) +{ + writel(NES_EEPROM_READ_REQUEST + (offset >> 1), + (void __iomem *)addr + NES_EEPROM_COMMAND); + + do { + } while (readl((void __iomem *)addr + NES_EEPROM_COMMAND) & + NES_EEPROM_READ_REQUEST); + + return readw((void __iomem *)addr + NES_EEPROM_DATA); +} + + +/** + * nes_write_1G_phy_reg + */ +void nes_write_1G_phy_reg(struct nes_device *nesdev, u8 phy_reg, u8 phy_addr, u16 data) +{ + u32 u32temp; + u32 counter; + + nes_write_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL, + 0x50020000 | data | ((u32)phy_reg << 18) | ((u32)phy_addr << 23)); + for (counter = 0; counter < 100 ; counter++) { + udelay(30); + u32temp = nes_read_indexed(nesdev, NES_IDX_MAC_INT_STATUS); + if (u32temp & 1) { + /* nes_debug(NES_DBG_PHY, "Phy interrupt status = 0x%X.\n", u32temp); */ + nes_write_indexed(nesdev, NES_IDX_MAC_INT_STATUS, 1); + break; + } + } + if (!(u32temp & 1)) + nes_debug(NES_DBG_PHY, "Phy is not responding. interrupt status = 0x%X.\n", + u32temp); +} + + +/** + * nes_read_1G_phy_reg + * This routine only issues the read, the data must be read + * separately. + */ +void nes_read_1G_phy_reg(struct nes_device *nesdev, u8 phy_reg, u8 phy_addr, u16 *data) +{ + u32 u32temp; + u32 counter; + + /* nes_debug(NES_DBG_PHY, "phy addr = %d, mac_index = %d\n", + phy_addr, nesdev->mac_index); */ + + nes_write_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL, + 0x60020000 | ((u32)phy_reg << 18) | ((u32)phy_addr << 23)); + for (counter = 0; counter < 100 ; counter++) { + udelay(30); + u32temp = nes_read_indexed(nesdev, NES_IDX_MAC_INT_STATUS); + if (u32temp & 1) { + /* nes_debug(NES_DBG_PHY, "Phy interrupt status = 0x%X.\n", u32temp); */ + nes_write_indexed(nesdev, NES_IDX_MAC_INT_STATUS, 1); + break; + } + } + if (!(u32temp & 1)) { + nes_debug(NES_DBG_PHY, "Phy is not responding. interrupt status = 0x%X.\n", + u32temp); + *data = 0xffff; + } else { + *data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL); + } +} + + +/** + * nes_write_10G_phy_reg + */ +void nes_write_10G_phy_reg(struct nes_device *nesdev, u16 phy_addr, u8 dev_addr, u16 phy_reg, + u16 data) +{ + u32 port_addr; + u32 u32temp; + u32 counter; + + port_addr = phy_addr; + + /* set address */ + nes_write_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL, + 0x00020000 | (u32)phy_reg | (((u32)dev_addr) << 18) | (((u32)port_addr) << 23)); + for (counter = 0; counter < 100 ; counter++) { + udelay(30); + u32temp = nes_read_indexed(nesdev, NES_IDX_MAC_INT_STATUS); + if (u32temp & 1) { + nes_write_indexed(nesdev, NES_IDX_MAC_INT_STATUS, 1); + break; + } + } + if (!(u32temp & 1)) + nes_debug(NES_DBG_PHY, "Phy is not responding. interrupt status = 0x%X.\n", + u32temp); + + /* set data */ + nes_write_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL, + 0x10020000 | (u32)data | (((u32)dev_addr) << 18) | (((u32)port_addr) << 23)); + for (counter = 0; counter < 100 ; counter++) { + udelay(30); + u32temp = nes_read_indexed(nesdev, NES_IDX_MAC_INT_STATUS); + if (u32temp & 1) { + nes_write_indexed(nesdev, NES_IDX_MAC_INT_STATUS, 1); + break; + } + } + if (!(u32temp & 1)) + nes_debug(NES_DBG_PHY, "Phy is not responding. interrupt status = 0x%X.\n", + u32temp); +} + + +/** + * nes_read_10G_phy_reg + * This routine only issues the read, the data must be read + * separately. + */ +void nes_read_10G_phy_reg(struct nes_device *nesdev, u8 phy_addr, u8 dev_addr, u16 phy_reg) +{ + u32 port_addr; + u32 u32temp; + u32 counter; + + port_addr = phy_addr; + + /* set address */ + nes_write_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL, + 0x00020000 | (u32)phy_reg | (((u32)dev_addr) << 18) | (((u32)port_addr) << 23)); + for (counter = 0; counter < 100 ; counter++) { + udelay(30); + u32temp = nes_read_indexed(nesdev, NES_IDX_MAC_INT_STATUS); + if (u32temp & 1) { + nes_write_indexed(nesdev, NES_IDX_MAC_INT_STATUS, 1); + break; + } + } + if (!(u32temp & 1)) + nes_debug(NES_DBG_PHY, "Phy is not responding. interrupt status = 0x%X.\n", + u32temp); + + /* issue read */ + nes_write_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL, + 0x30020000 | (((u32)dev_addr) << 18) | (((u32)port_addr) << 23)); + for (counter = 0; counter < 100 ; counter++) { + udelay(30); + u32temp = nes_read_indexed(nesdev, NES_IDX_MAC_INT_STATUS); + if (u32temp & 1) { + nes_write_indexed(nesdev, NES_IDX_MAC_INT_STATUS, 1); + break; + } + } + if (!(u32temp & 1)) + nes_debug(NES_DBG_PHY, "Phy is not responding. interrupt status = 0x%X.\n", + u32temp); +} + + +/** + * nes_get_cqp_request + */ +struct nes_cqp_request *nes_get_cqp_request(struct nes_device *nesdev) +{ + unsigned long flags; + struct nes_cqp_request *cqp_request = NULL; + + if (!list_empty(&nesdev->cqp_avail_reqs)) { + spin_lock_irqsave(&nesdev->cqp.lock, flags); + if (!list_empty(&nesdev->cqp_avail_reqs)) { + cqp_request = list_entry(nesdev->cqp_avail_reqs.next, + struct nes_cqp_request, list); + list_del_init(&cqp_request->list); + } + spin_unlock_irqrestore(&nesdev->cqp.lock, flags); + } + if (cqp_request == NULL) { + cqp_request = kzalloc(sizeof(struct nes_cqp_request), GFP_ATOMIC); + if (cqp_request) { + cqp_request->dynamic = 1; + INIT_LIST_HEAD(&cqp_request->list); + } + } + + if (cqp_request) { + init_waitqueue_head(&cqp_request->waitq); + cqp_request->waiting = 0; + cqp_request->request_done = 0; + cqp_request->callback = 0; + init_waitqueue_head(&cqp_request->waitq); + nes_debug(NES_DBG_CQP, "Got cqp request %p from the available list \n", + cqp_request); + } else + printk(KERN_ERR PFX "%s: Could not allocated a CQP request.\n", + __func__); + + return cqp_request; +} + +void nes_free_cqp_request(struct nes_device *nesdev, + struct nes_cqp_request *cqp_request) +{ + unsigned long flags; + + nes_debug(NES_DBG_CQP, "CQP request %p (opcode 0x%02X) freed.\n", + cqp_request, + le32_to_cpu(cqp_request->cqp_wqe.wqe_words[NES_CQP_WQE_OPCODE_IDX]) & 0x3f); + + if (cqp_request->dynamic) { + kfree(cqp_request); + } else { + spin_lock_irqsave(&nesdev->cqp.lock, flags); + list_add_tail(&cqp_request->list, &nesdev->cqp_avail_reqs); + spin_unlock_irqrestore(&nesdev->cqp.lock, flags); + } +} + +void nes_put_cqp_request(struct nes_device *nesdev, + struct nes_cqp_request *cqp_request) +{ + if (atomic_dec_and_test(&cqp_request->refcount)) + nes_free_cqp_request(nesdev, cqp_request); +} + + +/** + * nes_post_cqp_request + */ +void nes_post_cqp_request(struct nes_device *nesdev, + struct nes_cqp_request *cqp_request) +{ + struct nes_hw_cqp_wqe *cqp_wqe; + unsigned long flags; + u32 cqp_head; + u64 u64temp; + u32 opcode; + int ctx_index = NES_CQP_WQE_COMP_CTX_LOW_IDX; + + spin_lock_irqsave(&nesdev->cqp.lock, flags); + + if (((((nesdev->cqp.sq_tail+(nesdev->cqp.sq_size*2))-nesdev->cqp.sq_head) & + (nesdev->cqp.sq_size - 1)) != 1) + && (list_empty(&nesdev->cqp_pending_reqs))) { + cqp_head = nesdev->cqp.sq_head++; + nesdev->cqp.sq_head &= nesdev->cqp.sq_size-1; + cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head]; + memcpy(cqp_wqe, &cqp_request->cqp_wqe, sizeof(*cqp_wqe)); + opcode = le32_to_cpu(cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX]); + if ((opcode & NES_CQP_OPCODE_MASK) == NES_CQP_DOWNLOAD_SEGMENT) + ctx_index = NES_CQP_WQE_DL_COMP_CTX_LOW_IDX; + barrier(); + u64temp = (unsigned long)cqp_request; + set_wqe_64bit_value(cqp_wqe->wqe_words, ctx_index, u64temp); + nes_debug(NES_DBG_CQP, "CQP request (opcode 0x%02X), line 1 = 0x%08X put on CQPs SQ," + " request = %p, cqp_head = %u, cqp_tail = %u, cqp_size = %u," + " waiting = %d, refcount = %d.\n", + opcode & NES_CQP_OPCODE_MASK, + le32_to_cpu(cqp_wqe->wqe_words[NES_CQP_WQE_ID_IDX]), cqp_request, + nesdev->cqp.sq_head, nesdev->cqp.sq_tail, nesdev->cqp.sq_size, + cqp_request->waiting, atomic_read(&cqp_request->refcount)); + + barrier(); + + /* Ring doorbell (1 WQEs) */ + nes_write32(nesdev->regs+NES_WQE_ALLOC, 0x01800000 | nesdev->cqp.qp_id); + + barrier(); + } else { + nes_debug(NES_DBG_CQP, "CQP request %p (opcode 0x%02X), line 1 = 0x%08X" + " put on the pending queue.\n", + cqp_request, + le32_to_cpu(cqp_request->cqp_wqe.wqe_words[NES_CQP_WQE_OPCODE_IDX])&0x3f, + le32_to_cpu(cqp_request->cqp_wqe.wqe_words[NES_CQP_WQE_ID_IDX])); + list_add_tail(&cqp_request->list, &nesdev->cqp_pending_reqs); + } + + spin_unlock_irqrestore(&nesdev->cqp.lock, flags); + + return; +} + +/** + * nes_arp_table + */ +int nes_arp_table(struct nes_device *nesdev, u32 ip_addr, u8 *mac_addr, u32 action) +{ + struct nes_adapter *nesadapter = nesdev->nesadapter; + int arp_index; + int err = 0; + __be32 tmp_addr; + + for (arp_index = 0; (u32) arp_index < nesadapter->arp_table_size; arp_index++) { + if (nesadapter->arp_table[arp_index].ip_addr == ip_addr) + break; + } + + if (action == NES_ARP_ADD) { + if (arp_index != nesadapter->arp_table_size) { + return -1; + } + + arp_index = 0; + err = nes_alloc_resource(nesadapter, nesadapter->allocated_arps, + nesadapter->arp_table_size, (u32 *)&arp_index, &nesadapter->next_arp_index, NES_RESOURCE_ARP); + if (err) { + nes_debug(NES_DBG_NETDEV, "nes_alloc_resource returned error = %u\n", err); + return err; + } + nes_debug(NES_DBG_NETDEV, "ADD, arp_index=%d\n", arp_index); + + nesadapter->arp_table[arp_index].ip_addr = ip_addr; + memcpy(nesadapter->arp_table[arp_index].mac_addr, mac_addr, ETH_ALEN); + return arp_index; + } + + /* DELETE or RESOLVE */ + if (arp_index == nesadapter->arp_table_size) { + tmp_addr = cpu_to_be32(ip_addr); + nes_debug(NES_DBG_NETDEV, "MAC for %pI4 not in ARP table - cannot %s\n", + &tmp_addr, action == NES_ARP_RESOLVE ? "resolve" : "delete"); + return -1; + } + + if (action == NES_ARP_RESOLVE) { + nes_debug(NES_DBG_NETDEV, "RESOLVE, arp_index=%d\n", arp_index); + return arp_index; + } + + if (action == NES_ARP_DELETE) { + nes_debug(NES_DBG_NETDEV, "DELETE, arp_index=%d\n", arp_index); + nesadapter->arp_table[arp_index].ip_addr = 0; + memset(nesadapter->arp_table[arp_index].mac_addr, 0x00, ETH_ALEN); + nes_free_resource(nesadapter, nesadapter->allocated_arps, arp_index); + return arp_index; + } + + return -1; +} + + +/** + * nes_mh_fix + */ +void nes_mh_fix(unsigned long parm) +{ + unsigned long flags; + struct nes_device *nesdev = (struct nes_device *)parm; + struct nes_adapter *nesadapter = nesdev->nesadapter; + struct nes_vnic *nesvnic; + u32 used_chunks_tx; + u32 temp_used_chunks_tx; + u32 temp_last_used_chunks_tx; + u32 used_chunks_mask; + u32 mac_tx_frames_low; + u32 mac_tx_frames_high; + u32 mac_tx_pauses; + u32 serdes_status; + u32 reset_value; + u32 tx_control; + u32 tx_config; + u32 tx_pause_quanta; + u32 rx_control; + u32 rx_config; + u32 mac_exact_match; + u32 mpp_debug; + u32 i=0; + u32 chunks_tx_progress = 0; + + spin_lock_irqsave(&nesadapter->phy_lock, flags); + if ((nesadapter->mac_sw_state[0] != NES_MAC_SW_IDLE) || (nesadapter->mac_link_down[0])) { + spin_unlock_irqrestore(&nesadapter->phy_lock, flags); + goto no_mh_work; + } + nesadapter->mac_sw_state[0] = NES_MAC_SW_MH; + spin_unlock_irqrestore(&nesadapter->phy_lock, flags); + do { + mac_tx_frames_low = nes_read_indexed(nesdev, NES_IDX_MAC_TX_FRAMES_LOW); + mac_tx_frames_high = nes_read_indexed(nesdev, NES_IDX_MAC_TX_FRAMES_HIGH); + mac_tx_pauses = nes_read_indexed(nesdev, NES_IDX_MAC_TX_PAUSE_FRAMES); + used_chunks_tx = nes_read_indexed(nesdev, NES_IDX_USED_CHUNKS_TX); + nesdev->mac_pause_frames_sent += mac_tx_pauses; + used_chunks_mask = 0; + temp_used_chunks_tx = used_chunks_tx; + temp_last_used_chunks_tx = nesdev->last_used_chunks_tx; + + if (nesdev->netdev[0]) { + nesvnic = netdev_priv(nesdev->netdev[0]); + } else { + break; + } + + for (i=0; i<4; i++) { + used_chunks_mask <<= 8; + if (nesvnic->qp_nic_index[i] != 0xff) { + used_chunks_mask |= 0xff; + if ((temp_used_chunks_tx&0xff)<(temp_last_used_chunks_tx&0xff)) { + chunks_tx_progress = 1; + } + } + temp_used_chunks_tx >>= 8; + temp_last_used_chunks_tx >>= 8; + } + if ((mac_tx_frames_low) || (mac_tx_frames_high) || + (!(used_chunks_tx&used_chunks_mask)) || + (!(nesdev->last_used_chunks_tx&used_chunks_mask)) || + (chunks_tx_progress) ) { + nesdev->last_used_chunks_tx = used_chunks_tx; + break; + } + nesdev->last_used_chunks_tx = used_chunks_tx; + barrier(); + + nes_write_indexed(nesdev, NES_IDX_MAC_TX_CONTROL, 0x00000005); + mh_pauses_sent++; + mac_tx_pauses = nes_read_indexed(nesdev, NES_IDX_MAC_TX_PAUSE_FRAMES); + if (mac_tx_pauses) { + nesdev->mac_pause_frames_sent += mac_tx_pauses; + break; + } + + tx_control = nes_read_indexed(nesdev, NES_IDX_MAC_TX_CONTROL); + tx_config = nes_read_indexed(nesdev, NES_IDX_MAC_TX_CONFIG); + tx_pause_quanta = nes_read_indexed(nesdev, NES_IDX_MAC_TX_PAUSE_QUANTA); + rx_control = nes_read_indexed(nesdev, NES_IDX_MAC_RX_CONTROL); + rx_config = nes_read_indexed(nesdev, NES_IDX_MAC_RX_CONFIG); + mac_exact_match = nes_read_indexed(nesdev, NES_IDX_MAC_EXACT_MATCH_BOTTOM); + mpp_debug = nes_read_indexed(nesdev, NES_IDX_MPP_DEBUG); + + /* one last ditch effort to avoid a false positive */ + mac_tx_pauses = nes_read_indexed(nesdev, NES_IDX_MAC_TX_PAUSE_FRAMES); + if (mac_tx_pauses) { + nesdev->last_mac_tx_pauses = nesdev->mac_pause_frames_sent; + nes_debug(NES_DBG_HW, "failsafe caught slow outbound pause\n"); + break; + } + mh_detected++; + + nes_write_indexed(nesdev, NES_IDX_MAC_TX_CONTROL, 0x00000000); + nes_write_indexed(nesdev, NES_IDX_MAC_TX_CONFIG, 0x00000000); + reset_value = nes_read32(nesdev->regs+NES_SOFTWARE_RESET); + + nes_write32(nesdev->regs+NES_SOFTWARE_RESET, reset_value | 0x0000001d); + + while (((nes_read32(nesdev->regs+NES_SOFTWARE_RESET) + & 0x00000040) != 0x00000040) && (i++ < 5000)) { + /* mdelay(1); */ + } + + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL0, 0x00000008); + serdes_status = nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_STATUS0); + + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_EMP0, 0x000bdef7); + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_DRIVE0, 0x9ce73000); + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_RX_MODE0, 0x0ff00000); + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_RX_SIGDET0, 0x00000000); + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_BYPASS0, 0x00000000); + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_LOOPBACK_CONTROL0, 0x00000000); + if (nesadapter->OneG_Mode) { + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_RX_EQ_CONTROL0, 0xf0182222); + } else { + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_RX_EQ_CONTROL0, 0xf0042222); + } + serdes_status = nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_RX_EQ_STATUS0); + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_CDR_CONTROL0, 0x000000ff); + + nes_write_indexed(nesdev, NES_IDX_MAC_TX_CONTROL, tx_control); + nes_write_indexed(nesdev, NES_IDX_MAC_TX_CONFIG, tx_config); + nes_write_indexed(nesdev, NES_IDX_MAC_TX_PAUSE_QUANTA, tx_pause_quanta); + nes_write_indexed(nesdev, NES_IDX_MAC_RX_CONTROL, rx_control); + nes_write_indexed(nesdev, NES_IDX_MAC_RX_CONFIG, rx_config); + nes_write_indexed(nesdev, NES_IDX_MAC_EXACT_MATCH_BOTTOM, mac_exact_match); + nes_write_indexed(nesdev, NES_IDX_MPP_DEBUG, mpp_debug); + + } while (0); + + nesadapter->mac_sw_state[0] = NES_MAC_SW_IDLE; +no_mh_work: + nesdev->nesadapter->mh_timer.expires = jiffies + (HZ/5); + add_timer(&nesdev->nesadapter->mh_timer); +} + +/** + * nes_clc + */ +void nes_clc(unsigned long parm) +{ + unsigned long flags; + struct nes_device *nesdev = (struct nes_device *)parm; + struct nes_adapter *nesadapter = nesdev->nesadapter; + + spin_lock_irqsave(&nesadapter->phy_lock, flags); + nesadapter->link_interrupt_count[0] = 0; + nesadapter->link_interrupt_count[1] = 0; + nesadapter->link_interrupt_count[2] = 0; + nesadapter->link_interrupt_count[3] = 0; + spin_unlock_irqrestore(&nesadapter->phy_lock, flags); + + nesadapter->lc_timer.expires = jiffies + 3600 * HZ; /* 1 hour */ + add_timer(&nesadapter->lc_timer); +} + + +/** + * nes_dump_mem + */ +void nes_dump_mem(unsigned int dump_debug_level, void *addr, int length) +{ + char xlate[] = {'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', + 'a', 'b', 'c', 'd', 'e', 'f'}; + char *ptr; + char hex_buf[80]; + char ascii_buf[20]; + int num_char; + int num_ascii; + int num_hex; + + if (!(nes_debug_level & dump_debug_level)) { + return; + } + + ptr = addr; + if (length > 0x100) { + nes_debug(dump_debug_level, "Length truncated from %x to %x\n", length, 0x100); + length = 0x100; + } + nes_debug(dump_debug_level, "Address=0x%p, length=0x%x (%d)\n", ptr, length, length); + + memset(ascii_buf, 0, 20); + memset(hex_buf, 0, 80); + + num_ascii = 0; + num_hex = 0; + for (num_char = 0; num_char < length; num_char++) { + if (num_ascii == 8) { + ascii_buf[num_ascii++] = ' '; + hex_buf[num_hex++] = '-'; + hex_buf[num_hex++] = ' '; + } + + if (*ptr < 0x20 || *ptr > 0x7e) + ascii_buf[num_ascii++] = '.'; + else + ascii_buf[num_ascii++] = *ptr; + hex_buf[num_hex++] = xlate[((*ptr & 0xf0) >> 4)]; + hex_buf[num_hex++] = xlate[*ptr & 0x0f]; + hex_buf[num_hex++] = ' '; + ptr++; + + if (num_ascii >= 17) { + /* output line and reset */ + nes_debug(dump_debug_level, " %s | %s\n", hex_buf, ascii_buf); + memset(ascii_buf, 0, 20); + memset(hex_buf, 0, 80); + num_ascii = 0; + num_hex = 0; + } + } + + /* output the rest */ + if (num_ascii) { + while (num_ascii < 17) { + if (num_ascii == 8) { + hex_buf[num_hex++] = ' '; + hex_buf[num_hex++] = ' '; + } + hex_buf[num_hex++] = ' '; + hex_buf[num_hex++] = ' '; + hex_buf[num_hex++] = ' '; + num_ascii++; + } + + nes_debug(dump_debug_level, " %s | %s\n", hex_buf, ascii_buf); + } +} diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c new file mode 100644 index 0000000..fef067c --- /dev/null +++ b/drivers/infiniband/hw/nes/nes_verbs.c @@ -0,0 +1,4054 @@ +/* + * Copyright (c) 2006 - 2011 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "nes.h" + +#include + +atomic_t mod_qp_timouts; +atomic_t qps_created; +atomic_t sw_qps_destroyed; + +static void nes_unregister_ofa_device(struct nes_ib_device *nesibdev); + +/** + * nes_alloc_mw + */ +static struct ib_mw *nes_alloc_mw(struct ib_pd *ibpd, enum ib_mw_type type) +{ + struct nes_pd *nespd = to_nespd(ibpd); + struct nes_vnic *nesvnic = to_nesvnic(ibpd->device); + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_adapter *nesadapter = nesdev->nesadapter; + struct nes_cqp_request *cqp_request; + struct nes_mr *nesmr; + struct ib_mw *ibmw; + struct nes_hw_cqp_wqe *cqp_wqe; + int ret; + u32 stag; + u32 stag_index = 0; + u32 next_stag_index = 0; + u32 driver_key = 0; + u8 stag_key = 0; + + if (type != IB_MW_TYPE_1) + return ERR_PTR(-EINVAL); + + get_random_bytes(&next_stag_index, sizeof(next_stag_index)); + stag_key = (u8)next_stag_index; + + driver_key = 0; + + next_stag_index >>= 8; + next_stag_index %= nesadapter->max_mr; + + ret = nes_alloc_resource(nesadapter, nesadapter->allocated_mrs, + nesadapter->max_mr, &stag_index, &next_stag_index, NES_RESOURCE_MW); + if (ret) { + return ERR_PTR(ret); + } + + nesmr = kzalloc(sizeof(*nesmr), GFP_KERNEL); + if (!nesmr) { + nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index); + return ERR_PTR(-ENOMEM); + } + + stag = stag_index << 8; + stag |= driver_key; + stag += (u32)stag_key; + + nes_debug(NES_DBG_MR, "Registering STag 0x%08X, index = 0x%08X\n", + stag, stag_index); + + /* Register the region with the adapter */ + cqp_request = nes_get_cqp_request(nesdev); + if (cqp_request == NULL) { + kfree(nesmr); + nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index); + return ERR_PTR(-ENOMEM); + } + + cqp_request->waiting = 1; + cqp_wqe = &cqp_request->cqp_wqe; + + cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] = + cpu_to_le32( NES_CQP_ALLOCATE_STAG | NES_CQP_STAG_RIGHTS_REMOTE_READ | + NES_CQP_STAG_RIGHTS_REMOTE_WRITE | NES_CQP_STAG_VA_TO | + NES_CQP_STAG_REM_ACC_EN); + + nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_LEN_HIGH_PD_IDX, (nespd->pd_id & 0x00007fff)); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_STAG_IDX, stag); + + atomic_set(&cqp_request->refcount, 2); + nes_post_cqp_request(nesdev, cqp_request); + + /* Wait for CQP */ + ret = wait_event_timeout(cqp_request->waitq, (cqp_request->request_done != 0), + NES_EVENT_TIMEOUT); + nes_debug(NES_DBG_MR, "Register STag 0x%08X completed, wait_event_timeout ret = %u," + " CQP Major:Minor codes = 0x%04X:0x%04X.\n", + stag, ret, cqp_request->major_code, cqp_request->minor_code); + if ((!ret) || (cqp_request->major_code)) { + nes_put_cqp_request(nesdev, cqp_request); + kfree(nesmr); + nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index); + if (!ret) { + return ERR_PTR(-ETIME); + } else { + return ERR_PTR(-ENOMEM); + } + } + nes_put_cqp_request(nesdev, cqp_request); + + nesmr->ibmw.rkey = stag; + nesmr->mode = IWNES_MEMREG_TYPE_MW; + ibmw = &nesmr->ibmw; + nesmr->pbl_4k = 0; + nesmr->pbls_used = 0; + + return ibmw; +} + + +/** + * nes_dealloc_mw + */ +static int nes_dealloc_mw(struct ib_mw *ibmw) +{ + struct nes_mr *nesmr = to_nesmw(ibmw); + struct nes_vnic *nesvnic = to_nesvnic(ibmw->device); + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_adapter *nesadapter = nesdev->nesadapter; + struct nes_hw_cqp_wqe *cqp_wqe; + struct nes_cqp_request *cqp_request; + int err = 0; + int ret; + + /* Deallocate the window with the adapter */ + cqp_request = nes_get_cqp_request(nesdev); + if (cqp_request == NULL) { + nes_debug(NES_DBG_MR, "Failed to get a cqp_request.\n"); + return -ENOMEM; + } + cqp_request->waiting = 1; + cqp_wqe = &cqp_request->cqp_wqe; + nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, NES_CQP_DEALLOCATE_STAG); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_STAG_IDX, ibmw->rkey); + + atomic_set(&cqp_request->refcount, 2); + nes_post_cqp_request(nesdev, cqp_request); + + /* Wait for CQP */ + nes_debug(NES_DBG_MR, "Waiting for deallocate STag 0x%08X to complete.\n", + ibmw->rkey); + ret = wait_event_timeout(cqp_request->waitq, (0 != cqp_request->request_done), + NES_EVENT_TIMEOUT); + nes_debug(NES_DBG_MR, "Deallocate STag completed, wait_event_timeout ret = %u," + " CQP Major:Minor codes = 0x%04X:0x%04X.\n", + ret, cqp_request->major_code, cqp_request->minor_code); + if (!ret) + err = -ETIME; + else if (cqp_request->major_code) + err = -EIO; + + nes_put_cqp_request(nesdev, cqp_request); + + nes_free_resource(nesadapter, nesadapter->allocated_mrs, + (ibmw->rkey & 0x0fffff00) >> 8); + kfree(nesmr); + + return err; +} + + +/** + * nes_bind_mw + */ +static int nes_bind_mw(struct ib_qp *ibqp, struct ib_mw *ibmw, + struct ib_mw_bind *ibmw_bind) +{ + u64 u64temp; + struct nes_vnic *nesvnic = to_nesvnic(ibqp->device); + struct nes_device *nesdev = nesvnic->nesdev; + /* struct nes_mr *nesmr = to_nesmw(ibmw); */ + struct nes_qp *nesqp = to_nesqp(ibqp); + struct nes_hw_qp_wqe *wqe; + unsigned long flags = 0; + u32 head; + u32 wqe_misc = 0; + u32 qsize; + + if (nesqp->ibqp_state > IB_QPS_RTS) + return -EINVAL; + + spin_lock_irqsave(&nesqp->lock, flags); + + head = nesqp->hwqp.sq_head; + qsize = nesqp->hwqp.sq_tail; + + /* Check for SQ overflow */ + if (((head + (2 * qsize) - nesqp->hwqp.sq_tail) % qsize) == (qsize - 1)) { + spin_unlock_irqrestore(&nesqp->lock, flags); + return -ENOMEM; + } + + wqe = &nesqp->hwqp.sq_vbase[head]; + /* nes_debug(NES_DBG_MR, "processing sq wqe at %p, head = %u.\n", wqe, head); */ + nes_fill_init_qp_wqe(wqe, nesqp, head); + u64temp = ibmw_bind->wr_id; + set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_COMP_SCRATCH_LOW_IDX, u64temp); + wqe_misc = NES_IWARP_SQ_OP_BIND; + + wqe_misc |= NES_IWARP_SQ_WQE_LOCAL_FENCE; + + if (ibmw_bind->send_flags & IB_SEND_SIGNALED) + wqe_misc |= NES_IWARP_SQ_WQE_SIGNALED_COMPL; + + if (ibmw_bind->bind_info.mw_access_flags & IB_ACCESS_REMOTE_WRITE) + wqe_misc |= NES_CQP_STAG_RIGHTS_REMOTE_WRITE; + if (ibmw_bind->bind_info.mw_access_flags & IB_ACCESS_REMOTE_READ) + wqe_misc |= NES_CQP_STAG_RIGHTS_REMOTE_READ; + + set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_MISC_IDX, wqe_misc); + set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_BIND_WQE_MR_IDX, + ibmw_bind->bind_info.mr->lkey); + set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_BIND_WQE_MW_IDX, ibmw->rkey); + set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_BIND_WQE_LENGTH_LOW_IDX, + ibmw_bind->bind_info.length); + wqe->wqe_words[NES_IWARP_SQ_BIND_WQE_LENGTH_HIGH_IDX] = 0; + u64temp = (u64)ibmw_bind->bind_info.addr; + set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_BIND_WQE_VA_FBO_LOW_IDX, u64temp); + + head++; + if (head >= qsize) + head = 0; + + nesqp->hwqp.sq_head = head; + barrier(); + + nes_write32(nesdev->regs+NES_WQE_ALLOC, + (1 << 24) | 0x00800000 | nesqp->hwqp.qp_id); + + spin_unlock_irqrestore(&nesqp->lock, flags); + + return 0; +} + + +/* + * nes_alloc_fast_mr + */ +static int alloc_fast_reg_mr(struct nes_device *nesdev, struct nes_pd *nespd, + u32 stag, u32 page_count) +{ + struct nes_hw_cqp_wqe *cqp_wqe; + struct nes_cqp_request *cqp_request; + unsigned long flags; + int ret; + struct nes_adapter *nesadapter = nesdev->nesadapter; + u32 opcode = 0; + u16 major_code; + u64 region_length = page_count * PAGE_SIZE; + + + cqp_request = nes_get_cqp_request(nesdev); + if (cqp_request == NULL) { + nes_debug(NES_DBG_MR, "Failed to get a cqp_request.\n"); + return -ENOMEM; + } + nes_debug(NES_DBG_MR, "alloc_fast_reg_mr: page_count = %d, " + "region_length = %llu\n", + page_count, region_length); + cqp_request->waiting = 1; + cqp_wqe = &cqp_request->cqp_wqe; + + spin_lock_irqsave(&nesadapter->pbl_lock, flags); + if (nesadapter->free_4kpbl > 0) { + nesadapter->free_4kpbl--; + spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); + } else { + /* No 4kpbl's available: */ + spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); + nes_debug(NES_DBG_MR, "Out of Pbls\n"); + nes_free_cqp_request(nesdev, cqp_request); + return -ENOMEM; + } + + opcode = NES_CQP_ALLOCATE_STAG | NES_CQP_STAG_MR | + NES_CQP_STAG_PBL_BLK_SIZE | NES_CQP_STAG_VA_TO | + NES_CQP_STAG_REM_ACC_EN; + /* + * The current OFED API does not support the zero based TO option. + * If added then need to changed the NES_CQP_STAG_VA* option. Also, + * the API does not support that ability to have the MR set for local + * access only when created and not allow the SQ op to override. Given + * this the remote enable must be set here. + */ + + nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, opcode); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_PBL_BLK_COUNT_IDX, 1); + + cqp_wqe->wqe_words[NES_CQP_STAG_WQE_LEN_HIGH_PD_IDX] = + cpu_to_le32((u32)(region_length >> 8) & 0xff000000); + cqp_wqe->wqe_words[NES_CQP_STAG_WQE_LEN_HIGH_PD_IDX] |= + cpu_to_le32(nespd->pd_id & 0x00007fff); + + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_STAG_IDX, stag); + set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_VA_LOW_IDX, 0); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_LEN_LOW_IDX, 0); + set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_PA_LOW_IDX, 0); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_PBL_LEN_IDX, (page_count * 8)); + cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] |= cpu_to_le32(NES_CQP_STAG_PBL_BLK_SIZE); + barrier(); + + atomic_set(&cqp_request->refcount, 2); + nes_post_cqp_request(nesdev, cqp_request); + + /* Wait for CQP */ + ret = wait_event_timeout(cqp_request->waitq, + (0 != cqp_request->request_done), + NES_EVENT_TIMEOUT); + + nes_debug(NES_DBG_MR, "Allocate STag 0x%08X completed, " + "wait_event_timeout ret = %u, CQP Major:Minor codes = " + "0x%04X:0x%04X.\n", stag, ret, cqp_request->major_code, + cqp_request->minor_code); + major_code = cqp_request->major_code; + nes_put_cqp_request(nesdev, cqp_request); + + if (!ret || major_code) { + spin_lock_irqsave(&nesadapter->pbl_lock, flags); + nesadapter->free_4kpbl++; + spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); + } + + if (!ret) + return -ETIME; + else if (major_code) + return -EIO; + return 0; +} + +/* + * nes_alloc_fast_reg_mr + */ +static struct ib_mr *nes_alloc_fast_reg_mr(struct ib_pd *ibpd, int max_page_list_len) +{ + struct nes_pd *nespd = to_nespd(ibpd); + struct nes_vnic *nesvnic = to_nesvnic(ibpd->device); + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_adapter *nesadapter = nesdev->nesadapter; + + u32 next_stag_index; + u8 stag_key = 0; + u32 driver_key = 0; + int err = 0; + u32 stag_index = 0; + struct nes_mr *nesmr; + u32 stag; + int ret; + struct ib_mr *ibmr; +/* + * Note: Set to always use a fixed length single page entry PBL. This is to allow + * for the fast_reg_mr operation to always know the size of the PBL. + */ + if (max_page_list_len > (NES_4K_PBL_CHUNK_SIZE / sizeof(u64))) + return ERR_PTR(-E2BIG); + + get_random_bytes(&next_stag_index, sizeof(next_stag_index)); + stag_key = (u8)next_stag_index; + next_stag_index >>= 8; + next_stag_index %= nesadapter->max_mr; + + err = nes_alloc_resource(nesadapter, nesadapter->allocated_mrs, + nesadapter->max_mr, &stag_index, + &next_stag_index, NES_RESOURCE_FAST_MR); + if (err) + return ERR_PTR(err); + + nesmr = kzalloc(sizeof(*nesmr), GFP_KERNEL); + if (!nesmr) { + nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index); + return ERR_PTR(-ENOMEM); + } + + stag = stag_index << 8; + stag |= driver_key; + stag += (u32)stag_key; + + nes_debug(NES_DBG_MR, "Allocating STag 0x%08X index = 0x%08X\n", + stag, stag_index); + + ret = alloc_fast_reg_mr(nesdev, nespd, stag, max_page_list_len); + + if (ret == 0) { + nesmr->ibmr.rkey = stag; + nesmr->ibmr.lkey = stag; + nesmr->mode = IWNES_MEMREG_TYPE_FMEM; + ibmr = &nesmr->ibmr; + } else { + kfree(nesmr); + nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index); + ibmr = ERR_PTR(-ENOMEM); + } + return ibmr; +} + +/* + * nes_alloc_fast_reg_page_list + */ +static struct ib_fast_reg_page_list *nes_alloc_fast_reg_page_list( + struct ib_device *ibdev, + int page_list_len) +{ + struct nes_vnic *nesvnic = to_nesvnic(ibdev); + struct nes_device *nesdev = nesvnic->nesdev; + struct ib_fast_reg_page_list *pifrpl; + struct nes_ib_fast_reg_page_list *pnesfrpl; + + if (page_list_len > (NES_4K_PBL_CHUNK_SIZE / sizeof(u64))) + return ERR_PTR(-E2BIG); + /* + * Allocate the ib_fast_reg_page_list structure, the + * nes_fast_bpl structure, and the PLB table. + */ + pnesfrpl = kmalloc(sizeof(struct nes_ib_fast_reg_page_list) + + page_list_len * sizeof(u64), GFP_KERNEL); + + if (!pnesfrpl) + return ERR_PTR(-ENOMEM); + + pifrpl = &pnesfrpl->ibfrpl; + pifrpl->page_list = &pnesfrpl->pbl; + pifrpl->max_page_list_len = page_list_len; + /* + * Allocate the WQE PBL + */ + pnesfrpl->nes_wqe_pbl.kva = pci_alloc_consistent(nesdev->pcidev, + page_list_len * sizeof(u64), + &pnesfrpl->nes_wqe_pbl.paddr); + + if (!pnesfrpl->nes_wqe_pbl.kva) { + kfree(pnesfrpl); + return ERR_PTR(-ENOMEM); + } + nes_debug(NES_DBG_MR, "nes_alloc_fast_reg_pbl: nes_frpl = %p, " + "ibfrpl = %p, ibfrpl.page_list = %p, pbl.kva = %p, " + "pbl.paddr = %llx\n", pnesfrpl, &pnesfrpl->ibfrpl, + pnesfrpl->ibfrpl.page_list, pnesfrpl->nes_wqe_pbl.kva, + (unsigned long long) pnesfrpl->nes_wqe_pbl.paddr); + + return pifrpl; +} + +/* + * nes_free_fast_reg_page_list + */ +static void nes_free_fast_reg_page_list(struct ib_fast_reg_page_list *pifrpl) +{ + struct nes_vnic *nesvnic = to_nesvnic(pifrpl->device); + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_ib_fast_reg_page_list *pnesfrpl; + + pnesfrpl = container_of(pifrpl, struct nes_ib_fast_reg_page_list, ibfrpl); + /* + * Free the WQE PBL. + */ + pci_free_consistent(nesdev->pcidev, + pifrpl->max_page_list_len * sizeof(u64), + pnesfrpl->nes_wqe_pbl.kva, + pnesfrpl->nes_wqe_pbl.paddr); + /* + * Free the PBL structure + */ + kfree(pnesfrpl); +} + +/** + * nes_query_device + */ +static int nes_query_device(struct ib_device *ibdev, struct ib_device_attr *props) +{ + struct nes_vnic *nesvnic = to_nesvnic(ibdev); + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_ib_device *nesibdev = nesvnic->nesibdev; + + memset(props, 0, sizeof(*props)); + memcpy(&props->sys_image_guid, nesvnic->netdev->dev_addr, 6); + + props->fw_ver = nesdev->nesadapter->firmware_version; + props->device_cap_flags = nesdev->nesadapter->device_cap_flags; + props->vendor_id = nesdev->nesadapter->vendor_id; + props->vendor_part_id = nesdev->nesadapter->vendor_part_id; + props->hw_ver = nesdev->nesadapter->hw_rev; + props->max_mr_size = 0x80000000; + props->max_qp = nesibdev->max_qp; + props->max_qp_wr = nesdev->nesadapter->max_qp_wr - 2; + props->max_sge = nesdev->nesadapter->max_sge; + props->max_cq = nesibdev->max_cq; + props->max_cqe = nesdev->nesadapter->max_cqe; + props->max_mr = nesibdev->max_mr; + props->max_mw = nesibdev->max_mr; + props->max_pd = nesibdev->max_pd; + props->max_sge_rd = 1; + switch (nesdev->nesadapter->max_irrq_wr) { + case 0: + props->max_qp_rd_atom = 2; + break; + case 1: + props->max_qp_rd_atom = 8; + break; + case 2: + props->max_qp_rd_atom = 32; + break; + case 3: + props->max_qp_rd_atom = 64; + break; + default: + props->max_qp_rd_atom = 0; + } + props->max_qp_init_rd_atom = props->max_qp_rd_atom; + props->atomic_cap = IB_ATOMIC_NONE; + props->max_map_per_fmr = 1; + + return 0; +} + + +/** + * nes_query_port + */ +static int nes_query_port(struct ib_device *ibdev, u8 port, struct ib_port_attr *props) +{ + struct nes_vnic *nesvnic = to_nesvnic(ibdev); + struct net_device *netdev = nesvnic->netdev; + + memset(props, 0, sizeof(*props)); + + props->max_mtu = IB_MTU_4096; + + if (netdev->mtu >= 4096) + props->active_mtu = IB_MTU_4096; + else if (netdev->mtu >= 2048) + props->active_mtu = IB_MTU_2048; + else if (netdev->mtu >= 1024) + props->active_mtu = IB_MTU_1024; + else if (netdev->mtu >= 512) + props->active_mtu = IB_MTU_512; + else + props->active_mtu = IB_MTU_256; + + props->lid = 1; + props->lmc = 0; + props->sm_lid = 0; + props->sm_sl = 0; + if (netif_queue_stopped(netdev)) + props->state = IB_PORT_DOWN; + else if (nesvnic->linkup) + props->state = IB_PORT_ACTIVE; + else + props->state = IB_PORT_DOWN; + props->phys_state = 0; + props->port_cap_flags = IB_PORT_CM_SUP | IB_PORT_REINIT_SUP | + IB_PORT_VENDOR_CLASS_SUP | IB_PORT_BOOT_MGMT_SUP; + props->gid_tbl_len = 1; + props->pkey_tbl_len = 1; + props->qkey_viol_cntr = 0; + props->active_width = IB_WIDTH_4X; + props->active_speed = IB_SPEED_SDR; + props->max_msg_sz = 0x80000000; + + return 0; +} + + +/** + * nes_query_pkey + */ +static int nes_query_pkey(struct ib_device *ibdev, u8 port, u16 index, u16 *pkey) +{ + *pkey = 0; + return 0; +} + + +/** + * nes_query_gid + */ +static int nes_query_gid(struct ib_device *ibdev, u8 port, + int index, union ib_gid *gid) +{ + struct nes_vnic *nesvnic = to_nesvnic(ibdev); + + memset(&(gid->raw[0]), 0, sizeof(gid->raw)); + memcpy(&(gid->raw[0]), nesvnic->netdev->dev_addr, 6); + + return 0; +} + + +/** + * nes_alloc_ucontext - Allocate the user context data structure. This keeps track + * of all objects associated with a particular user-mode client. + */ +static struct ib_ucontext *nes_alloc_ucontext(struct ib_device *ibdev, + struct ib_udata *udata) +{ + struct nes_vnic *nesvnic = to_nesvnic(ibdev); + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_adapter *nesadapter = nesdev->nesadapter; + struct nes_alloc_ucontext_req req; + struct nes_alloc_ucontext_resp uresp; + struct nes_ucontext *nes_ucontext; + struct nes_ib_device *nesibdev = nesvnic->nesibdev; + + + if (ib_copy_from_udata(&req, udata, sizeof(struct nes_alloc_ucontext_req))) { + printk(KERN_ERR PFX "Invalid structure size on allocate user context.\n"); + return ERR_PTR(-EINVAL); + } + + if (req.userspace_ver != NES_ABI_USERSPACE_VER) { + printk(KERN_ERR PFX "Invalid userspace driver version detected. Detected version %d, should be %d\n", + req.userspace_ver, NES_ABI_USERSPACE_VER); + return ERR_PTR(-EINVAL); + } + + + memset(&uresp, 0, sizeof uresp); + + uresp.max_qps = nesibdev->max_qp; + uresp.max_pds = nesibdev->max_pd; + uresp.wq_size = nesdev->nesadapter->max_qp_wr * 2; + uresp.virtwq = nesadapter->virtwq; + uresp.kernel_ver = NES_ABI_KERNEL_VER; + + nes_ucontext = kzalloc(sizeof *nes_ucontext, GFP_KERNEL); + if (!nes_ucontext) + return ERR_PTR(-ENOMEM); + + nes_ucontext->nesdev = nesdev; + nes_ucontext->mmap_wq_offset = uresp.max_pds; + nes_ucontext->mmap_cq_offset = nes_ucontext->mmap_wq_offset + + ((sizeof(struct nes_hw_qp_wqe) * uresp.max_qps * 2) + PAGE_SIZE-1) / + PAGE_SIZE; + + + if (ib_copy_to_udata(udata, &uresp, sizeof uresp)) { + kfree(nes_ucontext); + return ERR_PTR(-EFAULT); + } + + INIT_LIST_HEAD(&nes_ucontext->cq_reg_mem_list); + INIT_LIST_HEAD(&nes_ucontext->qp_reg_mem_list); + atomic_set(&nes_ucontext->usecnt, 1); + return &nes_ucontext->ibucontext; +} + + +/** + * nes_dealloc_ucontext + */ +static int nes_dealloc_ucontext(struct ib_ucontext *context) +{ + /* struct nes_vnic *nesvnic = to_nesvnic(context->device); */ + /* struct nes_device *nesdev = nesvnic->nesdev; */ + struct nes_ucontext *nes_ucontext = to_nesucontext(context); + + if (!atomic_dec_and_test(&nes_ucontext->usecnt)) + return 0; + kfree(nes_ucontext); + return 0; +} + + +/** + * nes_mmap + */ +static int nes_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) +{ + unsigned long index; + struct nes_vnic *nesvnic = to_nesvnic(context->device); + struct nes_device *nesdev = nesvnic->nesdev; + /* struct nes_adapter *nesadapter = nesdev->nesadapter; */ + struct nes_ucontext *nes_ucontext; + struct nes_qp *nesqp; + + nes_ucontext = to_nesucontext(context); + + + if (vma->vm_pgoff >= nes_ucontext->mmap_wq_offset) { + index = (vma->vm_pgoff - nes_ucontext->mmap_wq_offset) * PAGE_SIZE; + index /= ((sizeof(struct nes_hw_qp_wqe) * nesdev->nesadapter->max_qp_wr * 2) + + PAGE_SIZE-1) & (~(PAGE_SIZE-1)); + if (!test_bit(index, nes_ucontext->allocated_wqs)) { + nes_debug(NES_DBG_MMAP, "wq %lu not allocated\n", index); + return -EFAULT; + } + nesqp = nes_ucontext->mmap_nesqp[index]; + if (nesqp == NULL) { + nes_debug(NES_DBG_MMAP, "wq %lu has a NULL QP base.\n", index); + return -EFAULT; + } + if (remap_pfn_range(vma, vma->vm_start, + virt_to_phys(nesqp->hwqp.sq_vbase) >> PAGE_SHIFT, + vma->vm_end - vma->vm_start, + vma->vm_page_prot)) { + nes_debug(NES_DBG_MMAP, "remap_pfn_range failed.\n"); + return -EAGAIN; + } + vma->vm_private_data = nesqp; + return 0; + } else { + index = vma->vm_pgoff; + if (!test_bit(index, nes_ucontext->allocated_doorbells)) + return -EFAULT; + + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + if (io_remap_pfn_range(vma, vma->vm_start, + (nesdev->doorbell_start + + ((nes_ucontext->mmap_db_index[index] - nesdev->base_doorbell_index) * 4096)) + >> PAGE_SHIFT, PAGE_SIZE, vma->vm_page_prot)) + return -EAGAIN; + vma->vm_private_data = nes_ucontext; + return 0; + } + + return -ENOSYS; +} + + +/** + * nes_alloc_pd + */ +static struct ib_pd *nes_alloc_pd(struct ib_device *ibdev, + struct ib_ucontext *context, struct ib_udata *udata) +{ + struct nes_pd *nespd; + struct nes_vnic *nesvnic = to_nesvnic(ibdev); + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_adapter *nesadapter = nesdev->nesadapter; + struct nes_ucontext *nesucontext; + struct nes_alloc_pd_resp uresp; + u32 pd_num = 0; + int err; + + nes_debug(NES_DBG_PD, "nesvnic=%p, netdev=%p %s, ibdev=%p, context=%p, netdev refcnt=%u\n", + nesvnic, nesdev->netdev[0], nesdev->netdev[0]->name, ibdev, context, + netdev_refcnt_read(nesvnic->netdev)); + + err = nes_alloc_resource(nesadapter, nesadapter->allocated_pds, + nesadapter->max_pd, &pd_num, &nesadapter->next_pd, NES_RESOURCE_PD); + if (err) { + return ERR_PTR(err); + } + + nespd = kzalloc(sizeof (struct nes_pd), GFP_KERNEL); + if (!nespd) { + nes_free_resource(nesadapter, nesadapter->allocated_pds, pd_num); + return ERR_PTR(-ENOMEM); + } + + nes_debug(NES_DBG_PD, "Allocating PD (%p) for ib device %s\n", + nespd, nesvnic->nesibdev->ibdev.name); + + nespd->pd_id = (pd_num << (PAGE_SHIFT-12)) + nesadapter->base_pd; + + if (context) { + nesucontext = to_nesucontext(context); + nespd->mmap_db_index = find_next_zero_bit(nesucontext->allocated_doorbells, + NES_MAX_USER_DB_REGIONS, nesucontext->first_free_db); + nes_debug(NES_DBG_PD, "find_first_zero_biton doorbells returned %u, mapping pd_id %u.\n", + nespd->mmap_db_index, nespd->pd_id); + if (nespd->mmap_db_index >= NES_MAX_USER_DB_REGIONS) { + nes_debug(NES_DBG_PD, "mmap_db_index > MAX\n"); + nes_free_resource(nesadapter, nesadapter->allocated_pds, pd_num); + kfree(nespd); + return ERR_PTR(-ENOMEM); + } + + uresp.pd_id = nespd->pd_id; + uresp.mmap_db_index = nespd->mmap_db_index; + if (ib_copy_to_udata(udata, &uresp, sizeof (struct nes_alloc_pd_resp))) { + nes_free_resource(nesadapter, nesadapter->allocated_pds, pd_num); + kfree(nespd); + return ERR_PTR(-EFAULT); + } + + set_bit(nespd->mmap_db_index, nesucontext->allocated_doorbells); + nesucontext->mmap_db_index[nespd->mmap_db_index] = nespd->pd_id; + nesucontext->first_free_db = nespd->mmap_db_index + 1; + } + + nes_debug(NES_DBG_PD, "PD%u structure located @%p.\n", nespd->pd_id, nespd); + return &nespd->ibpd; +} + + +/** + * nes_dealloc_pd + */ +static int nes_dealloc_pd(struct ib_pd *ibpd) +{ + struct nes_ucontext *nesucontext; + struct nes_pd *nespd = to_nespd(ibpd); + struct nes_vnic *nesvnic = to_nesvnic(ibpd->device); + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_adapter *nesadapter = nesdev->nesadapter; + + if ((ibpd->uobject) && (ibpd->uobject->context)) { + nesucontext = to_nesucontext(ibpd->uobject->context); + nes_debug(NES_DBG_PD, "Clearing bit %u from allocated doorbells\n", + nespd->mmap_db_index); + clear_bit(nespd->mmap_db_index, nesucontext->allocated_doorbells); + nesucontext->mmap_db_index[nespd->mmap_db_index] = 0; + if (nesucontext->first_free_db > nespd->mmap_db_index) { + nesucontext->first_free_db = nespd->mmap_db_index; + } + } + + nes_debug(NES_DBG_PD, "Deallocating PD%u structure located @%p.\n", + nespd->pd_id, nespd); + nes_free_resource(nesadapter, nesadapter->allocated_pds, + (nespd->pd_id-nesadapter->base_pd)>>(PAGE_SHIFT-12)); + kfree(nespd); + + return 0; +} + + +/** + * nes_create_ah + */ +static struct ib_ah *nes_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr) +{ + return ERR_PTR(-ENOSYS); +} + + +/** + * nes_destroy_ah + */ +static int nes_destroy_ah(struct ib_ah *ah) +{ + return -ENOSYS; +} + + +/** + * nes_get_encoded_size + */ +static inline u8 nes_get_encoded_size(int *size) +{ + u8 encoded_size = 0; + if (*size <= 32) { + *size = 32; + encoded_size = 1; + } else if (*size <= 128) { + *size = 128; + encoded_size = 2; + } else if (*size <= 512) { + *size = 512; + encoded_size = 3; + } + return (encoded_size); +} + + + +/** + * nes_setup_virt_qp + */ +static int nes_setup_virt_qp(struct nes_qp *nesqp, struct nes_pbl *nespbl, + struct nes_vnic *nesvnic, int sq_size, int rq_size) +{ + unsigned long flags; + void *mem; + __le64 *pbl = NULL; + __le64 *tpbl; + __le64 *pblbuffer; + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_adapter *nesadapter = nesdev->nesadapter; + u32 pbl_entries; + u8 rq_pbl_entries; + u8 sq_pbl_entries; + + pbl_entries = nespbl->pbl_size >> 3; + nes_debug(NES_DBG_QP, "Userspace PBL, pbl_size=%u, pbl_entries = %d pbl_vbase=%p, pbl_pbase=%lx\n", + nespbl->pbl_size, pbl_entries, + (void *)nespbl->pbl_vbase, + (unsigned long) nespbl->pbl_pbase); + pbl = (__le64 *) nespbl->pbl_vbase; /* points to first pbl entry */ + /* now lets set the sq_vbase as well as rq_vbase addrs we will assign */ + /* the first pbl to be fro the rq_vbase... */ + rq_pbl_entries = (rq_size * sizeof(struct nes_hw_qp_wqe)) >> 12; + sq_pbl_entries = (sq_size * sizeof(struct nes_hw_qp_wqe)) >> 12; + nesqp->hwqp.sq_pbase = (le32_to_cpu(((__le32 *)pbl)[0])) | ((u64)((le32_to_cpu(((__le32 *)pbl)[1]))) << 32); + if (!nespbl->page) { + nes_debug(NES_DBG_QP, "QP nespbl->page is NULL \n"); + kfree(nespbl); + return -ENOMEM; + } + + nesqp->hwqp.sq_vbase = kmap(nespbl->page); + nesqp->page = nespbl->page; + if (!nesqp->hwqp.sq_vbase) { + nes_debug(NES_DBG_QP, "QP sq_vbase kmap failed\n"); + kfree(nespbl); + return -ENOMEM; + } + + /* Now to get to sq.. we need to calculate how many */ + /* PBL entries were used by the rq.. */ + pbl += sq_pbl_entries; + nesqp->hwqp.rq_pbase = (le32_to_cpu(((__le32 *)pbl)[0])) | ((u64)((le32_to_cpu(((__le32 *)pbl)[1]))) << 32); + /* nesqp->hwqp.rq_vbase = bus_to_virt(*pbl); */ + /*nesqp->hwqp.rq_vbase = phys_to_virt(*pbl); */ + + nes_debug(NES_DBG_QP, "QP sq_vbase= %p sq_pbase=%lx rq_vbase=%p rq_pbase=%lx\n", + nesqp->hwqp.sq_vbase, (unsigned long) nesqp->hwqp.sq_pbase, + nesqp->hwqp.rq_vbase, (unsigned long) nesqp->hwqp.rq_pbase); + spin_lock_irqsave(&nesadapter->pbl_lock, flags); + if (!nesadapter->free_256pbl) { + pci_free_consistent(nesdev->pcidev, nespbl->pbl_size, nespbl->pbl_vbase, + nespbl->pbl_pbase); + spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); + kunmap(nesqp->page); + kfree(nespbl); + return -ENOMEM; + } + nesadapter->free_256pbl--; + spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); + + nesqp->pbl_vbase = pci_alloc_consistent(nesdev->pcidev, 256, &nesqp->pbl_pbase); + pblbuffer = nesqp->pbl_vbase; + if (!nesqp->pbl_vbase) { + /* memory allocated during nes_reg_user_mr() */ + pci_free_consistent(nesdev->pcidev, nespbl->pbl_size, nespbl->pbl_vbase, + nespbl->pbl_pbase); + kfree(nespbl); + spin_lock_irqsave(&nesadapter->pbl_lock, flags); + nesadapter->free_256pbl++; + spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); + kunmap(nesqp->page); + return -ENOMEM; + } + memset(nesqp->pbl_vbase, 0, 256); + /* fill in the page address in the pbl buffer.. */ + tpbl = pblbuffer + 16; + pbl = (__le64 *)nespbl->pbl_vbase; + while (sq_pbl_entries--) + *tpbl++ = *pbl++; + tpbl = pblbuffer; + while (rq_pbl_entries--) + *tpbl++ = *pbl++; + + /* done with memory allocated during nes_reg_user_mr() */ + pci_free_consistent(nesdev->pcidev, nespbl->pbl_size, nespbl->pbl_vbase, + nespbl->pbl_pbase); + kfree(nespbl); + + nesqp->qp_mem_size = + max((u32)sizeof(struct nes_qp_context), ((u32)256)) + 256; /* this is Q2 */ + /* Round up to a multiple of a page */ + nesqp->qp_mem_size += PAGE_SIZE - 1; + nesqp->qp_mem_size &= ~(PAGE_SIZE - 1); + + mem = pci_alloc_consistent(nesdev->pcidev, nesqp->qp_mem_size, + &nesqp->hwqp.q2_pbase); + + if (!mem) { + pci_free_consistent(nesdev->pcidev, 256, nesqp->pbl_vbase, nesqp->pbl_pbase); + nesqp->pbl_vbase = NULL; + spin_lock_irqsave(&nesadapter->pbl_lock, flags); + nesadapter->free_256pbl++; + spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); + kunmap(nesqp->page); + return -ENOMEM; + } + nesqp->sq_kmapped = 1; + nesqp->hwqp.q2_vbase = mem; + mem += 256; + memset(nesqp->hwqp.q2_vbase, 0, 256); + nesqp->nesqp_context = mem; + memset(nesqp->nesqp_context, 0, sizeof(*nesqp->nesqp_context)); + nesqp->nesqp_context_pbase = nesqp->hwqp.q2_pbase + 256; + + return 0; +} + + +/** + * nes_setup_mmap_qp + */ +static int nes_setup_mmap_qp(struct nes_qp *nesqp, struct nes_vnic *nesvnic, + int sq_size, int rq_size) +{ + void *mem; + struct nes_device *nesdev = nesvnic->nesdev; + + nesqp->qp_mem_size = (sizeof(struct nes_hw_qp_wqe) * sq_size) + + (sizeof(struct nes_hw_qp_wqe) * rq_size) + + max((u32)sizeof(struct nes_qp_context), ((u32)256)) + + 256; /* this is Q2 */ + /* Round up to a multiple of a page */ + nesqp->qp_mem_size += PAGE_SIZE - 1; + nesqp->qp_mem_size &= ~(PAGE_SIZE - 1); + + mem = pci_alloc_consistent(nesdev->pcidev, nesqp->qp_mem_size, + &nesqp->hwqp.sq_pbase); + if (!mem) + return -ENOMEM; + nes_debug(NES_DBG_QP, "PCI consistent memory for " + "host descriptor rings located @ %p (pa = 0x%08lX.) size = %u.\n", + mem, (unsigned long)nesqp->hwqp.sq_pbase, nesqp->qp_mem_size); + + memset(mem, 0, nesqp->qp_mem_size); + + nesqp->hwqp.sq_vbase = mem; + mem += sizeof(struct nes_hw_qp_wqe) * sq_size; + + nesqp->hwqp.rq_vbase = mem; + nesqp->hwqp.rq_pbase = nesqp->hwqp.sq_pbase + + sizeof(struct nes_hw_qp_wqe) * sq_size; + mem += sizeof(struct nes_hw_qp_wqe) * rq_size; + + nesqp->hwqp.q2_vbase = mem; + nesqp->hwqp.q2_pbase = nesqp->hwqp.rq_pbase + + sizeof(struct nes_hw_qp_wqe) * rq_size; + mem += 256; + memset(nesqp->hwqp.q2_vbase, 0, 256); + + nesqp->nesqp_context = mem; + nesqp->nesqp_context_pbase = nesqp->hwqp.q2_pbase + 256; + memset(nesqp->nesqp_context, 0, sizeof(*nesqp->nesqp_context)); + return 0; +} + + +/** + * nes_free_qp_mem() is to free up the qp's pci_alloc_consistent() memory. + */ +static inline void nes_free_qp_mem(struct nes_device *nesdev, + struct nes_qp *nesqp, int virt_wqs) +{ + unsigned long flags; + struct nes_adapter *nesadapter = nesdev->nesadapter; + if (!virt_wqs) { + pci_free_consistent(nesdev->pcidev, nesqp->qp_mem_size, + nesqp->hwqp.sq_vbase, nesqp->hwqp.sq_pbase); + }else { + spin_lock_irqsave(&nesadapter->pbl_lock, flags); + nesadapter->free_256pbl++; + spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); + pci_free_consistent(nesdev->pcidev, nesqp->qp_mem_size, nesqp->hwqp.q2_vbase, nesqp->hwqp.q2_pbase); + pci_free_consistent(nesdev->pcidev, 256, nesqp->pbl_vbase, nesqp->pbl_pbase ); + nesqp->pbl_vbase = NULL; + if (nesqp->sq_kmapped) { + nesqp->sq_kmapped = 0; + kunmap(nesqp->page); + } + } +} + + +/** + * nes_create_qp + */ +static struct ib_qp *nes_create_qp(struct ib_pd *ibpd, + struct ib_qp_init_attr *init_attr, struct ib_udata *udata) +{ + u64 u64temp= 0; + u64 u64nesqp = 0; + struct nes_pd *nespd = to_nespd(ibpd); + struct nes_vnic *nesvnic = to_nesvnic(ibpd->device); + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_adapter *nesadapter = nesdev->nesadapter; + struct nes_qp *nesqp; + struct nes_cq *nescq; + struct nes_ucontext *nes_ucontext; + struct nes_hw_cqp_wqe *cqp_wqe; + struct nes_cqp_request *cqp_request; + struct nes_create_qp_req req; + struct nes_create_qp_resp uresp; + struct nes_pbl *nespbl = NULL; + u32 qp_num = 0; + u32 opcode = 0; + /* u32 counter = 0; */ + void *mem; + unsigned long flags; + int ret; + int err; + int virt_wqs = 0; + int sq_size; + int rq_size; + u8 sq_encoded_size; + u8 rq_encoded_size; + /* int counter; */ + + if (init_attr->create_flags) + return ERR_PTR(-EINVAL); + + atomic_inc(&qps_created); + switch (init_attr->qp_type) { + case IB_QPT_RC: + if (nes_drv_opt & NES_DRV_OPT_NO_INLINE_DATA) { + init_attr->cap.max_inline_data = 0; + } else { + init_attr->cap.max_inline_data = 64; + } + sq_size = init_attr->cap.max_send_wr; + rq_size = init_attr->cap.max_recv_wr; + + /* check if the encoded sizes are OK or not... */ + sq_encoded_size = nes_get_encoded_size(&sq_size); + rq_encoded_size = nes_get_encoded_size(&rq_size); + + if ((!sq_encoded_size) || (!rq_encoded_size)) { + nes_debug(NES_DBG_QP, "ERROR bad rq (%u) or sq (%u) size\n", + rq_size, sq_size); + return ERR_PTR(-EINVAL); + } + + init_attr->cap.max_send_wr = sq_size -2; + init_attr->cap.max_recv_wr = rq_size -1; + nes_debug(NES_DBG_QP, "RQ size=%u, SQ Size=%u\n", rq_size, sq_size); + + ret = nes_alloc_resource(nesadapter, nesadapter->allocated_qps, + nesadapter->max_qp, &qp_num, &nesadapter->next_qp, NES_RESOURCE_QP); + if (ret) { + return ERR_PTR(ret); + } + + /* Need 512 (actually now 1024) byte alignment on this structure */ + mem = kzalloc(sizeof(*nesqp)+NES_SW_CONTEXT_ALIGN-1, GFP_KERNEL); + if (!mem) { + nes_free_resource(nesadapter, nesadapter->allocated_qps, qp_num); + nes_debug(NES_DBG_QP, "Unable to allocate QP\n"); + return ERR_PTR(-ENOMEM); + } + u64nesqp = (unsigned long)mem; + u64nesqp += ((u64)NES_SW_CONTEXT_ALIGN) - 1; + u64temp = ((u64)NES_SW_CONTEXT_ALIGN) - 1; + u64nesqp &= ~u64temp; + nesqp = (struct nes_qp *)(unsigned long)u64nesqp; + /* nes_debug(NES_DBG_QP, "nesqp=%p, allocated buffer=%p. Rounded to closest %u\n", + nesqp, mem, NES_SW_CONTEXT_ALIGN); */ + nesqp->allocated_buffer = mem; + + if (udata) { + if (ib_copy_from_udata(&req, udata, sizeof(struct nes_create_qp_req))) { + nes_free_resource(nesadapter, nesadapter->allocated_qps, qp_num); + kfree(nesqp->allocated_buffer); + nes_debug(NES_DBG_QP, "ib_copy_from_udata() Failed \n"); + return ERR_PTR(-EFAULT); + } + if (req.user_wqe_buffers) { + virt_wqs = 1; + } + if (req.user_qp_buffer) + nesqp->nesuqp_addr = req.user_qp_buffer; + if ((ibpd->uobject) && (ibpd->uobject->context)) { + nesqp->user_mode = 1; + nes_ucontext = to_nesucontext(ibpd->uobject->context); + if (virt_wqs) { + err = 1; + list_for_each_entry(nespbl, &nes_ucontext->qp_reg_mem_list, list) { + if (nespbl->user_base == (unsigned long )req.user_wqe_buffers) { + list_del(&nespbl->list); + err = 0; + nes_debug(NES_DBG_QP, "Found PBL for virtual QP. nespbl=%p. user_base=0x%lx\n", + nespbl, nespbl->user_base); + break; + } + } + if (err) { + nes_debug(NES_DBG_QP, "Didn't Find PBL for virtual QP. address = %llx.\n", + (long long unsigned int)req.user_wqe_buffers); + nes_free_resource(nesadapter, nesadapter->allocated_qps, qp_num); + kfree(nesqp->allocated_buffer); + return ERR_PTR(-EFAULT); + } + } + + nes_ucontext = to_nesucontext(ibpd->uobject->context); + nesqp->mmap_sq_db_index = + find_next_zero_bit(nes_ucontext->allocated_wqs, + NES_MAX_USER_WQ_REGIONS, nes_ucontext->first_free_wq); + /* nes_debug(NES_DBG_QP, "find_first_zero_biton wqs returned %u\n", + nespd->mmap_db_index); */ + if (nesqp->mmap_sq_db_index >= NES_MAX_USER_WQ_REGIONS) { + nes_debug(NES_DBG_QP, + "db index > max user regions, failing create QP\n"); + nes_free_resource(nesadapter, nesadapter->allocated_qps, qp_num); + if (virt_wqs) { + pci_free_consistent(nesdev->pcidev, nespbl->pbl_size, nespbl->pbl_vbase, + nespbl->pbl_pbase); + kfree(nespbl); + } + kfree(nesqp->allocated_buffer); + return ERR_PTR(-ENOMEM); + } + set_bit(nesqp->mmap_sq_db_index, nes_ucontext->allocated_wqs); + nes_ucontext->mmap_nesqp[nesqp->mmap_sq_db_index] = nesqp; + nes_ucontext->first_free_wq = nesqp->mmap_sq_db_index + 1; + } else { + nes_free_resource(nesadapter, nesadapter->allocated_qps, qp_num); + kfree(nesqp->allocated_buffer); + return ERR_PTR(-EFAULT); + } + } + err = (!virt_wqs) ? nes_setup_mmap_qp(nesqp, nesvnic, sq_size, rq_size) : + nes_setup_virt_qp(nesqp, nespbl, nesvnic, sq_size, rq_size); + if (err) { + nes_debug(NES_DBG_QP, + "error geting qp mem code = %d\n", err); + nes_free_resource(nesadapter, nesadapter->allocated_qps, qp_num); + kfree(nesqp->allocated_buffer); + return ERR_PTR(-ENOMEM); + } + + nesqp->hwqp.sq_size = sq_size; + nesqp->hwqp.sq_encoded_size = sq_encoded_size; + nesqp->hwqp.sq_head = 1; + nesqp->hwqp.rq_size = rq_size; + nesqp->hwqp.rq_encoded_size = rq_encoded_size; + /* nes_debug(NES_DBG_QP, "nesqp->nesqp_context_pbase = %p\n", + (void *)nesqp->nesqp_context_pbase); + */ + nesqp->hwqp.qp_id = qp_num; + nesqp->ibqp.qp_num = nesqp->hwqp.qp_id; + nesqp->nespd = nespd; + + nescq = to_nescq(init_attr->send_cq); + nesqp->nesscq = nescq; + nescq = to_nescq(init_attr->recv_cq); + nesqp->nesrcq = nescq; + + nesqp->nesqp_context->misc |= cpu_to_le32((u32)PCI_FUNC(nesdev->pcidev->devfn) << + NES_QPCONTEXT_MISC_PCI_FCN_SHIFT); + nesqp->nesqp_context->misc |= cpu_to_le32((u32)nesqp->hwqp.rq_encoded_size << + NES_QPCONTEXT_MISC_RQ_SIZE_SHIFT); + nesqp->nesqp_context->misc |= cpu_to_le32((u32)nesqp->hwqp.sq_encoded_size << + NES_QPCONTEXT_MISC_SQ_SIZE_SHIFT); + if (!udata) { + nesqp->nesqp_context->misc |= cpu_to_le32(NES_QPCONTEXT_MISC_PRIV_EN); + nesqp->nesqp_context->misc |= cpu_to_le32(NES_QPCONTEXT_MISC_FAST_REGISTER_EN); + } + nesqp->nesqp_context->cqs = cpu_to_le32(nesqp->nesscq->hw_cq.cq_number + + ((u32)nesqp->nesrcq->hw_cq.cq_number << 16)); + u64temp = (u64)nesqp->hwqp.sq_pbase; + nesqp->nesqp_context->sq_addr_low = cpu_to_le32((u32)u64temp); + nesqp->nesqp_context->sq_addr_high = cpu_to_le32((u32)(u64temp >> 32)); + + + if (!virt_wqs) { + u64temp = (u64)nesqp->hwqp.sq_pbase; + nesqp->nesqp_context->sq_addr_low = cpu_to_le32((u32)u64temp); + nesqp->nesqp_context->sq_addr_high = cpu_to_le32((u32)(u64temp >> 32)); + u64temp = (u64)nesqp->hwqp.rq_pbase; + nesqp->nesqp_context->rq_addr_low = cpu_to_le32((u32)u64temp); + nesqp->nesqp_context->rq_addr_high = cpu_to_le32((u32)(u64temp >> 32)); + } else { + u64temp = (u64)nesqp->pbl_pbase; + nesqp->nesqp_context->rq_addr_low = cpu_to_le32((u32)u64temp); + nesqp->nesqp_context->rq_addr_high = cpu_to_le32((u32)(u64temp >> 32)); + } + + /* nes_debug(NES_DBG_QP, "next_qp_nic_index=%u, using nic_index=%d\n", + nesvnic->next_qp_nic_index, + nesvnic->qp_nic_index[nesvnic->next_qp_nic_index]); */ + spin_lock_irqsave(&nesdev->cqp.lock, flags); + nesqp->nesqp_context->misc2 |= cpu_to_le32( + (u32)nesvnic->qp_nic_index[nesvnic->next_qp_nic_index] << + NES_QPCONTEXT_MISC2_NIC_INDEX_SHIFT); + nesvnic->next_qp_nic_index++; + if ((nesvnic->next_qp_nic_index > 3) || + (nesvnic->qp_nic_index[nesvnic->next_qp_nic_index] == 0xf)) { + nesvnic->next_qp_nic_index = 0; + } + spin_unlock_irqrestore(&nesdev->cqp.lock, flags); + + nesqp->nesqp_context->pd_index_wscale |= cpu_to_le32((u32)nesqp->nespd->pd_id << 16); + u64temp = (u64)nesqp->hwqp.q2_pbase; + nesqp->nesqp_context->q2_addr_low = cpu_to_le32((u32)u64temp); + nesqp->nesqp_context->q2_addr_high = cpu_to_le32((u32)(u64temp >> 32)); + nesqp->nesqp_context->aeq_token_low = cpu_to_le32((u32)((unsigned long)(nesqp))); + nesqp->nesqp_context->aeq_token_high = cpu_to_le32((u32)(upper_32_bits((unsigned long)(nesqp)))); + nesqp->nesqp_context->ird_ord_sizes = cpu_to_le32(NES_QPCONTEXT_ORDIRD_ALSMM | + NES_QPCONTEXT_ORDIRD_AAH | + ((((u32)nesadapter->max_irrq_wr) << + NES_QPCONTEXT_ORDIRD_IRDSIZE_SHIFT) & NES_QPCONTEXT_ORDIRD_IRDSIZE_MASK)); + if (disable_mpa_crc) { + nes_debug(NES_DBG_QP, "Disabling MPA crc checking due to module option.\n"); + nesqp->nesqp_context->ird_ord_sizes |= cpu_to_le32(NES_QPCONTEXT_ORDIRD_RNMC); + } + + + /* Create the QP */ + cqp_request = nes_get_cqp_request(nesdev); + if (cqp_request == NULL) { + nes_debug(NES_DBG_QP, "Failed to get a cqp_request\n"); + nes_free_resource(nesadapter, nesadapter->allocated_qps, qp_num); + nes_free_qp_mem(nesdev, nesqp,virt_wqs); + kfree(nesqp->allocated_buffer); + return ERR_PTR(-ENOMEM); + } + cqp_request->waiting = 1; + cqp_wqe = &cqp_request->cqp_wqe; + + if (!virt_wqs) { + opcode = NES_CQP_CREATE_QP | NES_CQP_QP_TYPE_IWARP | + NES_CQP_QP_IWARP_STATE_IDLE; + } else { + opcode = NES_CQP_CREATE_QP | NES_CQP_QP_TYPE_IWARP | NES_CQP_QP_VIRT_WQS | + NES_CQP_QP_IWARP_STATE_IDLE; + } + opcode |= NES_CQP_QP_CQS_VALID; + nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, opcode); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX, nesqp->hwqp.qp_id); + + u64temp = (u64)nesqp->nesqp_context_pbase; + set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_QP_WQE_CONTEXT_LOW_IDX, u64temp); + + atomic_set(&cqp_request->refcount, 2); + nes_post_cqp_request(nesdev, cqp_request); + + /* Wait for CQP */ + nes_debug(NES_DBG_QP, "Waiting for create iWARP QP%u to complete.\n", + nesqp->hwqp.qp_id); + ret = wait_event_timeout(cqp_request->waitq, + (cqp_request->request_done != 0), NES_EVENT_TIMEOUT); + nes_debug(NES_DBG_QP, "Create iwarp QP%u completed, wait_event_timeout ret=%u," + " nesdev->cqp_head = %u, nesdev->cqp.sq_tail = %u," + " CQP Major:Minor codes = 0x%04X:0x%04X.\n", + nesqp->hwqp.qp_id, ret, nesdev->cqp.sq_head, nesdev->cqp.sq_tail, + cqp_request->major_code, cqp_request->minor_code); + if ((!ret) || (cqp_request->major_code)) { + nes_put_cqp_request(nesdev, cqp_request); + nes_free_resource(nesadapter, nesadapter->allocated_qps, qp_num); + nes_free_qp_mem(nesdev, nesqp,virt_wqs); + kfree(nesqp->allocated_buffer); + if (!ret) { + return ERR_PTR(-ETIME); + } else { + return ERR_PTR(-EIO); + } + } + + nes_put_cqp_request(nesdev, cqp_request); + + if (ibpd->uobject) { + uresp.mmap_sq_db_index = nesqp->mmap_sq_db_index; + uresp.mmap_rq_db_index = 0; + uresp.actual_sq_size = sq_size; + uresp.actual_rq_size = rq_size; + uresp.qp_id = nesqp->hwqp.qp_id; + uresp.nes_drv_opt = nes_drv_opt; + if (ib_copy_to_udata(udata, &uresp, sizeof uresp)) { + nes_free_resource(nesadapter, nesadapter->allocated_qps, qp_num); + nes_free_qp_mem(nesdev, nesqp,virt_wqs); + kfree(nesqp->allocated_buffer); + return ERR_PTR(-EFAULT); + } + } + + nes_debug(NES_DBG_QP, "QP%u structure located @%p.Size = %u.\n", + nesqp->hwqp.qp_id, nesqp, (u32)sizeof(*nesqp)); + spin_lock_init(&nesqp->lock); + nes_add_ref(&nesqp->ibqp); + break; + default: + nes_debug(NES_DBG_QP, "Invalid QP type: %d\n", init_attr->qp_type); + return ERR_PTR(-EINVAL); + } + + nesqp->sig_all = (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR); + init_timer(&nesqp->terminate_timer); + nesqp->terminate_timer.function = nes_terminate_timeout; + nesqp->terminate_timer.data = (unsigned long)nesqp; + + /* update the QP table */ + nesdev->nesadapter->qp_table[nesqp->hwqp.qp_id-NES_FIRST_QPN] = nesqp; + nes_debug(NES_DBG_QP, "netdev refcnt=%u\n", + netdev_refcnt_read(nesvnic->netdev)); + + return &nesqp->ibqp; +} + +/** + * nes_clean_cq + */ +static void nes_clean_cq(struct nes_qp *nesqp, struct nes_cq *nescq) +{ + u32 cq_head; + u32 lo; + u32 hi; + u64 u64temp; + unsigned long flags = 0; + + spin_lock_irqsave(&nescq->lock, flags); + + cq_head = nescq->hw_cq.cq_head; + while (le32_to_cpu(nescq->hw_cq.cq_vbase[cq_head].cqe_words[NES_CQE_OPCODE_IDX]) & NES_CQE_VALID) { + rmb(); + lo = le32_to_cpu(nescq->hw_cq.cq_vbase[cq_head].cqe_words[NES_CQE_COMP_COMP_CTX_LOW_IDX]); + hi = le32_to_cpu(nescq->hw_cq.cq_vbase[cq_head].cqe_words[NES_CQE_COMP_COMP_CTX_HIGH_IDX]); + u64temp = (((u64)hi) << 32) | ((u64)lo); + u64temp &= ~(NES_SW_CONTEXT_ALIGN-1); + if (u64temp == (u64)(unsigned long)nesqp) { + /* Zero the context value so cqe will be ignored */ + nescq->hw_cq.cq_vbase[cq_head].cqe_words[NES_CQE_COMP_COMP_CTX_LOW_IDX] = 0; + nescq->hw_cq.cq_vbase[cq_head].cqe_words[NES_CQE_COMP_COMP_CTX_HIGH_IDX] = 0; + } + + if (++cq_head >= nescq->hw_cq.cq_size) + cq_head = 0; + } + + spin_unlock_irqrestore(&nescq->lock, flags); +} + + +/** + * nes_destroy_qp + */ +static int nes_destroy_qp(struct ib_qp *ibqp) +{ + struct nes_qp *nesqp = to_nesqp(ibqp); + struct nes_ucontext *nes_ucontext; + struct ib_qp_attr attr; + struct iw_cm_id *cm_id; + struct iw_cm_event cm_event; + int ret = 0; + + atomic_inc(&sw_qps_destroyed); + nesqp->destroyed = 1; + + /* Blow away the connection if it exists. */ + if (nesqp->ibqp_state >= IB_QPS_INIT && nesqp->ibqp_state <= IB_QPS_RTS) { + /* if (nesqp->ibqp_state == IB_QPS_RTS) { */ + attr.qp_state = IB_QPS_ERR; + nes_modify_qp(&nesqp->ibqp, &attr, IB_QP_STATE, NULL); + } + + if (((nesqp->ibqp_state == IB_QPS_INIT) || + (nesqp->ibqp_state == IB_QPS_RTR)) && (nesqp->cm_id)) { + cm_id = nesqp->cm_id; + cm_event.event = IW_CM_EVENT_CONNECT_REPLY; + cm_event.status = -ETIMEDOUT; + cm_event.local_addr = cm_id->local_addr; + cm_event.remote_addr = cm_id->remote_addr; + cm_event.private_data = NULL; + cm_event.private_data_len = 0; + + nes_debug(NES_DBG_QP, "Generating a CM Timeout Event for " + "QP%u. cm_id = %p, refcount = %u. \n", + nesqp->hwqp.qp_id, cm_id, atomic_read(&nesqp->refcount)); + + cm_id->rem_ref(cm_id); + ret = cm_id->event_handler(cm_id, &cm_event); + if (ret) + nes_debug(NES_DBG_QP, "OFA CM event_handler returned, ret=%d\n", ret); + } + + if (nesqp->user_mode) { + if ((ibqp->uobject)&&(ibqp->uobject->context)) { + nes_ucontext = to_nesucontext(ibqp->uobject->context); + clear_bit(nesqp->mmap_sq_db_index, nes_ucontext->allocated_wqs); + nes_ucontext->mmap_nesqp[nesqp->mmap_sq_db_index] = NULL; + if (nes_ucontext->first_free_wq > nesqp->mmap_sq_db_index) { + nes_ucontext->first_free_wq = nesqp->mmap_sq_db_index; + } + } + if (nesqp->pbl_pbase && nesqp->sq_kmapped) { + nesqp->sq_kmapped = 0; + kunmap(nesqp->page); + } + } else { + /* Clean any pending completions from the cq(s) */ + if (nesqp->nesscq) + nes_clean_cq(nesqp, nesqp->nesscq); + + if ((nesqp->nesrcq) && (nesqp->nesrcq != nesqp->nesscq)) + nes_clean_cq(nesqp, nesqp->nesrcq); + } + nes_rem_ref(&nesqp->ibqp); + return 0; +} + + +/** + * nes_create_cq + */ +static struct ib_cq *nes_create_cq(struct ib_device *ibdev, int entries, + int comp_vector, + struct ib_ucontext *context, struct ib_udata *udata) +{ + u64 u64temp; + struct nes_vnic *nesvnic = to_nesvnic(ibdev); + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_adapter *nesadapter = nesdev->nesadapter; + struct nes_cq *nescq; + struct nes_ucontext *nes_ucontext = NULL; + struct nes_cqp_request *cqp_request; + void *mem = NULL; + struct nes_hw_cqp_wqe *cqp_wqe; + struct nes_pbl *nespbl = NULL; + struct nes_create_cq_req req; + struct nes_create_cq_resp resp; + u32 cq_num = 0; + u32 opcode = 0; + u32 pbl_entries = 1; + int err; + unsigned long flags; + int ret; + + if (entries > nesadapter->max_cqe) + return ERR_PTR(-EINVAL); + + err = nes_alloc_resource(nesadapter, nesadapter->allocated_cqs, + nesadapter->max_cq, &cq_num, &nesadapter->next_cq, NES_RESOURCE_CQ); + if (err) { + return ERR_PTR(err); + } + + nescq = kzalloc(sizeof(struct nes_cq), GFP_KERNEL); + if (!nescq) { + nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num); + nes_debug(NES_DBG_CQ, "Unable to allocate nes_cq struct\n"); + return ERR_PTR(-ENOMEM); + } + + nescq->hw_cq.cq_size = max(entries + 1, 5); + nescq->hw_cq.cq_number = cq_num; + nescq->ibcq.cqe = nescq->hw_cq.cq_size - 1; + + + if (context) { + nes_ucontext = to_nesucontext(context); + if (ib_copy_from_udata(&req, udata, sizeof (struct nes_create_cq_req))) { + nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num); + kfree(nescq); + return ERR_PTR(-EFAULT); + } + nesvnic->mcrq_ucontext = nes_ucontext; + nes_ucontext->mcrqf = req.mcrqf; + if (nes_ucontext->mcrqf) { + if (nes_ucontext->mcrqf & 0x80000000) + nescq->hw_cq.cq_number = nesvnic->nic.qp_id + 28 + 2 * ((nes_ucontext->mcrqf & 0xf) - 1); + else if (nes_ucontext->mcrqf & 0x40000000) + nescq->hw_cq.cq_number = nes_ucontext->mcrqf & 0xffff; + else + nescq->hw_cq.cq_number = nesvnic->mcrq_qp_id + nes_ucontext->mcrqf-1; + nescq->mcrqf = nes_ucontext->mcrqf; + nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num); + } + nes_debug(NES_DBG_CQ, "CQ Virtual Address = %08lX, size = %u.\n", + (unsigned long)req.user_cq_buffer, entries); + err = 1; + list_for_each_entry(nespbl, &nes_ucontext->cq_reg_mem_list, list) { + if (nespbl->user_base == (unsigned long )req.user_cq_buffer) { + list_del(&nespbl->list); + err = 0; + nes_debug(NES_DBG_CQ, "Found PBL for virtual CQ. nespbl=%p.\n", + nespbl); + break; + } + } + if (err) { + nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num); + kfree(nescq); + return ERR_PTR(-EFAULT); + } + + pbl_entries = nespbl->pbl_size >> 3; + nescq->cq_mem_size = 0; + } else { + nescq->cq_mem_size = nescq->hw_cq.cq_size * sizeof(struct nes_hw_cqe); + nes_debug(NES_DBG_CQ, "Attempting to allocate pci memory (%u entries, %u bytes) for CQ%u.\n", + entries, nescq->cq_mem_size, nescq->hw_cq.cq_number); + + /* allocate the physical buffer space */ + mem = pci_zalloc_consistent(nesdev->pcidev, nescq->cq_mem_size, + &nescq->hw_cq.cq_pbase); + if (!mem) { + printk(KERN_ERR PFX "Unable to allocate pci memory for cq\n"); + nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num); + kfree(nescq); + return ERR_PTR(-ENOMEM); + } + + nescq->hw_cq.cq_vbase = mem; + nescq->hw_cq.cq_head = 0; + nes_debug(NES_DBG_CQ, "CQ%u virtual address @ %p, phys = 0x%08X\n", + nescq->hw_cq.cq_number, nescq->hw_cq.cq_vbase, + (u32)nescq->hw_cq.cq_pbase); + } + + nescq->hw_cq.ce_handler = nes_iwarp_ce_handler; + spin_lock_init(&nescq->lock); + + /* send CreateCQ request to CQP */ + cqp_request = nes_get_cqp_request(nesdev); + if (cqp_request == NULL) { + nes_debug(NES_DBG_CQ, "Failed to get a cqp_request.\n"); + if (!context) + pci_free_consistent(nesdev->pcidev, nescq->cq_mem_size, mem, + nescq->hw_cq.cq_pbase); + else { + pci_free_consistent(nesdev->pcidev, nespbl->pbl_size, + nespbl->pbl_vbase, nespbl->pbl_pbase); + kfree(nespbl); + } + + nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num); + kfree(nescq); + return ERR_PTR(-ENOMEM); + } + cqp_request->waiting = 1; + cqp_wqe = &cqp_request->cqp_wqe; + + opcode = NES_CQP_CREATE_CQ | NES_CQP_CQ_CEQ_VALID | + NES_CQP_CQ_CHK_OVERFLOW | + NES_CQP_CQ_CEQE_MASK | ((u32)nescq->hw_cq.cq_size << 16); + + spin_lock_irqsave(&nesadapter->pbl_lock, flags); + + if (pbl_entries != 1) { + if (pbl_entries > 32) { + /* use 4k pbl */ + nes_debug(NES_DBG_CQ, "pbl_entries=%u, use a 4k PBL\n", pbl_entries); + if (nesadapter->free_4kpbl == 0) { + spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); + nes_free_cqp_request(nesdev, cqp_request); + if (!context) + pci_free_consistent(nesdev->pcidev, nescq->cq_mem_size, mem, + nescq->hw_cq.cq_pbase); + else { + pci_free_consistent(nesdev->pcidev, nespbl->pbl_size, + nespbl->pbl_vbase, nespbl->pbl_pbase); + kfree(nespbl); + } + nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num); + kfree(nescq); + return ERR_PTR(-ENOMEM); + } else { + opcode |= (NES_CQP_CQ_VIRT | NES_CQP_CQ_4KB_CHUNK); + nescq->virtual_cq = 2; + nesadapter->free_4kpbl--; + } + } else { + /* use 256 byte pbl */ + nes_debug(NES_DBG_CQ, "pbl_entries=%u, use a 256 byte PBL\n", pbl_entries); + if (nesadapter->free_256pbl == 0) { + spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); + nes_free_cqp_request(nesdev, cqp_request); + if (!context) + pci_free_consistent(nesdev->pcidev, nescq->cq_mem_size, mem, + nescq->hw_cq.cq_pbase); + else { + pci_free_consistent(nesdev->pcidev, nespbl->pbl_size, + nespbl->pbl_vbase, nespbl->pbl_pbase); + kfree(nespbl); + } + nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num); + kfree(nescq); + return ERR_PTR(-ENOMEM); + } else { + opcode |= NES_CQP_CQ_VIRT; + nescq->virtual_cq = 1; + nesadapter->free_256pbl--; + } + } + } + + spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); + + nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, opcode); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX, + (nescq->hw_cq.cq_number | ((u32)nesdev->ceq_index << 16))); + + if (context) { + if (pbl_entries != 1) + u64temp = (u64)nespbl->pbl_pbase; + else + u64temp = le64_to_cpu(nespbl->pbl_vbase[0]); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_CQ_WQE_DOORBELL_INDEX_HIGH_IDX, + nes_ucontext->mmap_db_index[0]); + } else { + u64temp = (u64)nescq->hw_cq.cq_pbase; + cqp_wqe->wqe_words[NES_CQP_CQ_WQE_DOORBELL_INDEX_HIGH_IDX] = 0; + } + set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_CQ_WQE_PBL_LOW_IDX, u64temp); + cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_HIGH_IDX] = 0; + u64temp = (u64)(unsigned long)&nescq->hw_cq; + cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_LOW_IDX] = + cpu_to_le32((u32)(u64temp >> 1)); + cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_HIGH_IDX] = + cpu_to_le32(((u32)((u64temp) >> 33)) & 0x7FFFFFFF); + + atomic_set(&cqp_request->refcount, 2); + nes_post_cqp_request(nesdev, cqp_request); + + /* Wait for CQP */ + nes_debug(NES_DBG_CQ, "Waiting for create iWARP CQ%u to complete.\n", + nescq->hw_cq.cq_number); + ret = wait_event_timeout(cqp_request->waitq, (0 != cqp_request->request_done), + NES_EVENT_TIMEOUT * 2); + nes_debug(NES_DBG_CQ, "Create iWARP CQ%u completed, wait_event_timeout ret = %d.\n", + nescq->hw_cq.cq_number, ret); + if ((!ret) || (cqp_request->major_code)) { + nes_put_cqp_request(nesdev, cqp_request); + if (!context) + pci_free_consistent(nesdev->pcidev, nescq->cq_mem_size, mem, + nescq->hw_cq.cq_pbase); + else { + pci_free_consistent(nesdev->pcidev, nespbl->pbl_size, + nespbl->pbl_vbase, nespbl->pbl_pbase); + kfree(nespbl); + } + nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num); + kfree(nescq); + return ERR_PTR(-EIO); + } + nes_put_cqp_request(nesdev, cqp_request); + + if (context) { + /* free the nespbl */ + pci_free_consistent(nesdev->pcidev, nespbl->pbl_size, nespbl->pbl_vbase, + nespbl->pbl_pbase); + kfree(nespbl); + resp.cq_id = nescq->hw_cq.cq_number; + resp.cq_size = nescq->hw_cq.cq_size; + resp.mmap_db_index = 0; + if (ib_copy_to_udata(udata, &resp, sizeof resp - sizeof resp.reserved)) { + nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num); + kfree(nescq); + return ERR_PTR(-EFAULT); + } + } + + return &nescq->ibcq; +} + + +/** + * nes_destroy_cq + */ +static int nes_destroy_cq(struct ib_cq *ib_cq) +{ + struct nes_cq *nescq; + struct nes_device *nesdev; + struct nes_vnic *nesvnic; + struct nes_adapter *nesadapter; + struct nes_hw_cqp_wqe *cqp_wqe; + struct nes_cqp_request *cqp_request; + unsigned long flags; + u32 opcode = 0; + int ret; + + if (ib_cq == NULL) + return 0; + + nescq = to_nescq(ib_cq); + nesvnic = to_nesvnic(ib_cq->device); + nesdev = nesvnic->nesdev; + nesadapter = nesdev->nesadapter; + + nes_debug(NES_DBG_CQ, "Destroy CQ%u\n", nescq->hw_cq.cq_number); + + /* Send DestroyCQ request to CQP */ + cqp_request = nes_get_cqp_request(nesdev); + if (cqp_request == NULL) { + nes_debug(NES_DBG_CQ, "Failed to get a cqp_request.\n"); + return -ENOMEM; + } + cqp_request->waiting = 1; + cqp_wqe = &cqp_request->cqp_wqe; + opcode = NES_CQP_DESTROY_CQ | (nescq->hw_cq.cq_size << 16); + spin_lock_irqsave(&nesadapter->pbl_lock, flags); + if (nescq->virtual_cq == 1) { + nesadapter->free_256pbl++; + if (nesadapter->free_256pbl > nesadapter->max_256pbl) { + printk(KERN_ERR PFX "%s: free 256B PBLs(%u) has exceeded the max(%u)\n", + __func__, nesadapter->free_256pbl, nesadapter->max_256pbl); + } + } else if (nescq->virtual_cq == 2) { + nesadapter->free_4kpbl++; + if (nesadapter->free_4kpbl > nesadapter->max_4kpbl) { + printk(KERN_ERR PFX "%s: free 4K PBLs(%u) has exceeded the max(%u)\n", + __func__, nesadapter->free_4kpbl, nesadapter->max_4kpbl); + } + opcode |= NES_CQP_CQ_4KB_CHUNK; + } + + spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); + + nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, opcode); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX, + (nescq->hw_cq.cq_number | ((u32)PCI_FUNC(nesdev->pcidev->devfn) << 16))); + if (!nescq->mcrqf) + nes_free_resource(nesadapter, nesadapter->allocated_cqs, nescq->hw_cq.cq_number); + + atomic_set(&cqp_request->refcount, 2); + nes_post_cqp_request(nesdev, cqp_request); + + /* Wait for CQP */ + nes_debug(NES_DBG_CQ, "Waiting for destroy iWARP CQ%u to complete.\n", + nescq->hw_cq.cq_number); + ret = wait_event_timeout(cqp_request->waitq, (0 != cqp_request->request_done), + NES_EVENT_TIMEOUT); + nes_debug(NES_DBG_CQ, "Destroy iWARP CQ%u completed, wait_event_timeout ret = %u," + " CQP Major:Minor codes = 0x%04X:0x%04X.\n", + nescq->hw_cq.cq_number, ret, cqp_request->major_code, + cqp_request->minor_code); + if (!ret) { + nes_debug(NES_DBG_CQ, "iWARP CQ%u destroy timeout expired\n", + nescq->hw_cq.cq_number); + ret = -ETIME; + } else if (cqp_request->major_code) { + nes_debug(NES_DBG_CQ, "iWARP CQ%u destroy failed\n", + nescq->hw_cq.cq_number); + ret = -EIO; + } else { + ret = 0; + } + nes_put_cqp_request(nesdev, cqp_request); + + if (nescq->cq_mem_size) + pci_free_consistent(nesdev->pcidev, nescq->cq_mem_size, + nescq->hw_cq.cq_vbase, nescq->hw_cq.cq_pbase); + kfree(nescq); + + return ret; +} + +/** + * root_256 + */ +static u32 root_256(struct nes_device *nesdev, + struct nes_root_vpbl *root_vpbl, + struct nes_root_vpbl *new_root, + u16 pbl_count_4k) +{ + u64 leaf_pbl; + int i, j, k; + + if (pbl_count_4k == 1) { + new_root->pbl_vbase = pci_alloc_consistent(nesdev->pcidev, + 512, &new_root->pbl_pbase); + + if (new_root->pbl_vbase == NULL) + return 0; + + leaf_pbl = (u64)root_vpbl->pbl_pbase; + for (i = 0; i < 16; i++) { + new_root->pbl_vbase[i].pa_low = + cpu_to_le32((u32)leaf_pbl); + new_root->pbl_vbase[i].pa_high = + cpu_to_le32((u32)((((u64)leaf_pbl) >> 32))); + leaf_pbl += 256; + } + } else { + for (i = 3; i >= 0; i--) { + j = i * 16; + root_vpbl->pbl_vbase[j] = root_vpbl->pbl_vbase[i]; + leaf_pbl = le32_to_cpu(root_vpbl->pbl_vbase[j].pa_low) + + (((u64)le32_to_cpu(root_vpbl->pbl_vbase[j].pa_high)) + << 32); + for (k = 1; k < 16; k++) { + leaf_pbl += 256; + root_vpbl->pbl_vbase[j + k].pa_low = + cpu_to_le32((u32)leaf_pbl); + root_vpbl->pbl_vbase[j + k].pa_high = + cpu_to_le32((u32)((((u64)leaf_pbl) >> 32))); + } + } + } + + return 1; +} + + +/** + * nes_reg_mr + */ +static int nes_reg_mr(struct nes_device *nesdev, struct nes_pd *nespd, + u32 stag, u64 region_length, struct nes_root_vpbl *root_vpbl, + dma_addr_t single_buffer, u16 pbl_count_4k, + u16 residual_page_count_4k, int acc, u64 *iova_start, + u16 *actual_pbl_cnt, u8 *used_4k_pbls) +{ + struct nes_hw_cqp_wqe *cqp_wqe; + struct nes_cqp_request *cqp_request; + unsigned long flags; + int ret; + struct nes_adapter *nesadapter = nesdev->nesadapter; + uint pg_cnt = 0; + u16 pbl_count_256 = 0; + u16 pbl_count = 0; + u8 use_256_pbls = 0; + u8 use_4k_pbls = 0; + u16 use_two_level = (pbl_count_4k > 1) ? 1 : 0; + struct nes_root_vpbl new_root = { 0, NULL, NULL }; + u32 opcode = 0; + u16 major_code; + + /* Register the region with the adapter */ + cqp_request = nes_get_cqp_request(nesdev); + if (cqp_request == NULL) { + nes_debug(NES_DBG_MR, "Failed to get a cqp_request.\n"); + return -ENOMEM; + } + cqp_request->waiting = 1; + cqp_wqe = &cqp_request->cqp_wqe; + + if (pbl_count_4k) { + spin_lock_irqsave(&nesadapter->pbl_lock, flags); + + pg_cnt = ((pbl_count_4k - 1) * 512) + residual_page_count_4k; + pbl_count_256 = (pg_cnt + 31) / 32; + if (pg_cnt <= 32) { + if (pbl_count_256 <= nesadapter->free_256pbl) + use_256_pbls = 1; + else if (pbl_count_4k <= nesadapter->free_4kpbl) + use_4k_pbls = 1; + } else if (pg_cnt <= 2048) { + if (((pbl_count_4k + use_two_level) <= nesadapter->free_4kpbl) && + (nesadapter->free_4kpbl > (nesadapter->max_4kpbl >> 1))) { + use_4k_pbls = 1; + } else if ((pbl_count_256 + 1) <= nesadapter->free_256pbl) { + use_256_pbls = 1; + use_two_level = 1; + } else if ((pbl_count_4k + use_two_level) <= nesadapter->free_4kpbl) { + use_4k_pbls = 1; + } + } else { + if ((pbl_count_4k + 1) <= nesadapter->free_4kpbl) + use_4k_pbls = 1; + } + + if (use_256_pbls) { + pbl_count = pbl_count_256; + nesadapter->free_256pbl -= pbl_count + use_two_level; + } else if (use_4k_pbls) { + pbl_count = pbl_count_4k; + nesadapter->free_4kpbl -= pbl_count + use_two_level; + } else { + spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); + nes_debug(NES_DBG_MR, "Out of Pbls\n"); + nes_free_cqp_request(nesdev, cqp_request); + return -ENOMEM; + } + + spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); + } + + if (use_256_pbls && use_two_level) { + if (root_256(nesdev, root_vpbl, &new_root, pbl_count_4k) == 1) { + if (new_root.pbl_pbase != 0) + root_vpbl = &new_root; + } else { + spin_lock_irqsave(&nesadapter->pbl_lock, flags); + nesadapter->free_256pbl += pbl_count_256 + use_two_level; + use_256_pbls = 0; + + if (pbl_count_4k == 1) + use_two_level = 0; + pbl_count = pbl_count_4k; + + if ((pbl_count_4k + use_two_level) <= nesadapter->free_4kpbl) { + nesadapter->free_4kpbl -= pbl_count + use_two_level; + use_4k_pbls = 1; + } + spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); + + if (use_4k_pbls == 0) + return -ENOMEM; + } + } + + opcode = NES_CQP_REGISTER_STAG | NES_CQP_STAG_RIGHTS_LOCAL_READ | + NES_CQP_STAG_VA_TO | NES_CQP_STAG_MR; + if (acc & IB_ACCESS_LOCAL_WRITE) + opcode |= NES_CQP_STAG_RIGHTS_LOCAL_WRITE; + if (acc & IB_ACCESS_REMOTE_WRITE) + opcode |= NES_CQP_STAG_RIGHTS_REMOTE_WRITE | NES_CQP_STAG_REM_ACC_EN; + if (acc & IB_ACCESS_REMOTE_READ) + opcode |= NES_CQP_STAG_RIGHTS_REMOTE_READ | NES_CQP_STAG_REM_ACC_EN; + if (acc & IB_ACCESS_MW_BIND) + opcode |= NES_CQP_STAG_RIGHTS_WINDOW_BIND | NES_CQP_STAG_REM_ACC_EN; + + nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, opcode); + set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_VA_LOW_IDX, *iova_start); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_LEN_LOW_IDX, region_length); + + cqp_wqe->wqe_words[NES_CQP_STAG_WQE_LEN_HIGH_PD_IDX] = + cpu_to_le32((u32)(region_length >> 8) & 0xff000000); + cqp_wqe->wqe_words[NES_CQP_STAG_WQE_LEN_HIGH_PD_IDX] |= + cpu_to_le32(nespd->pd_id & 0x00007fff); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_STAG_IDX, stag); + + if (pbl_count == 0) { + set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_PA_LOW_IDX, single_buffer); + } else { + set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_PA_LOW_IDX, root_vpbl->pbl_pbase); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_PBL_BLK_COUNT_IDX, pbl_count); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_PBL_LEN_IDX, (pg_cnt * 8)); + + if (use_4k_pbls) + cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] |= cpu_to_le32(NES_CQP_STAG_PBL_BLK_SIZE); + } + barrier(); + + atomic_set(&cqp_request->refcount, 2); + nes_post_cqp_request(nesdev, cqp_request); + + /* Wait for CQP */ + ret = wait_event_timeout(cqp_request->waitq, (0 != cqp_request->request_done), + NES_EVENT_TIMEOUT); + nes_debug(NES_DBG_MR, "Register STag 0x%08X completed, wait_event_timeout ret = %u," + " CQP Major:Minor codes = 0x%04X:0x%04X.\n", + stag, ret, cqp_request->major_code, cqp_request->minor_code); + major_code = cqp_request->major_code; + nes_put_cqp_request(nesdev, cqp_request); + + if ((!ret || major_code) && pbl_count != 0) { + spin_lock_irqsave(&nesadapter->pbl_lock, flags); + if (use_256_pbls) + nesadapter->free_256pbl += pbl_count + use_two_level; + else if (use_4k_pbls) + nesadapter->free_4kpbl += pbl_count + use_two_level; + spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); + } + if (new_root.pbl_pbase) + pci_free_consistent(nesdev->pcidev, 512, new_root.pbl_vbase, + new_root.pbl_pbase); + + if (!ret) + return -ETIME; + else if (major_code) + return -EIO; + + *actual_pbl_cnt = pbl_count + use_two_level; + *used_4k_pbls = use_4k_pbls; + return 0; +} + + +/** + * nes_reg_phys_mr + */ +static struct ib_mr *nes_reg_phys_mr(struct ib_pd *ib_pd, + struct ib_phys_buf *buffer_list, int num_phys_buf, int acc, + u64 * iova_start) +{ + u64 region_length; + struct nes_pd *nespd = to_nespd(ib_pd); + struct nes_vnic *nesvnic = to_nesvnic(ib_pd->device); + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_adapter *nesadapter = nesdev->nesadapter; + struct nes_mr *nesmr; + struct ib_mr *ibmr; + struct nes_vpbl vpbl; + struct nes_root_vpbl root_vpbl; + u32 stag; + u32 i; + unsigned long mask; + u32 stag_index = 0; + u32 next_stag_index = 0; + u32 driver_key = 0; + u32 root_pbl_index = 0; + u32 cur_pbl_index = 0; + int err = 0; + int ret = 0; + u16 pbl_count = 0; + u8 single_page = 1; + u8 stag_key = 0; + + region_length = 0; + vpbl.pbl_vbase = NULL; + root_vpbl.pbl_vbase = NULL; + root_vpbl.pbl_pbase = 0; + + get_random_bytes(&next_stag_index, sizeof(next_stag_index)); + stag_key = (u8)next_stag_index; + + driver_key = 0; + + next_stag_index >>= 8; + next_stag_index %= nesadapter->max_mr; + if (num_phys_buf > (1024*512)) { + return ERR_PTR(-E2BIG); + } + + if ((buffer_list[0].addr ^ *iova_start) & ~PAGE_MASK) + return ERR_PTR(-EINVAL); + + err = nes_alloc_resource(nesadapter, nesadapter->allocated_mrs, nesadapter->max_mr, + &stag_index, &next_stag_index, NES_RESOURCE_PHYS_MR); + if (err) { + return ERR_PTR(err); + } + + nesmr = kzalloc(sizeof(*nesmr), GFP_KERNEL); + if (!nesmr) { + nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index); + return ERR_PTR(-ENOMEM); + } + + for (i = 0; i < num_phys_buf; i++) { + + if ((i & 0x01FF) == 0) { + if (root_pbl_index == 1) { + /* Allocate the root PBL */ + root_vpbl.pbl_vbase = pci_alloc_consistent(nesdev->pcidev, 8192, + &root_vpbl.pbl_pbase); + nes_debug(NES_DBG_MR, "Allocating root PBL, va = %p, pa = 0x%08X\n", + root_vpbl.pbl_vbase, (unsigned int)root_vpbl.pbl_pbase); + if (!root_vpbl.pbl_vbase) { + pci_free_consistent(nesdev->pcidev, 4096, vpbl.pbl_vbase, + vpbl.pbl_pbase); + nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index); + kfree(nesmr); + return ERR_PTR(-ENOMEM); + } + root_vpbl.leaf_vpbl = kzalloc(sizeof(*root_vpbl.leaf_vpbl)*1024, GFP_KERNEL); + if (!root_vpbl.leaf_vpbl) { + pci_free_consistent(nesdev->pcidev, 8192, root_vpbl.pbl_vbase, + root_vpbl.pbl_pbase); + pci_free_consistent(nesdev->pcidev, 4096, vpbl.pbl_vbase, + vpbl.pbl_pbase); + nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index); + kfree(nesmr); + return ERR_PTR(-ENOMEM); + } + root_vpbl.pbl_vbase[0].pa_low = cpu_to_le32((u32)vpbl.pbl_pbase); + root_vpbl.pbl_vbase[0].pa_high = + cpu_to_le32((u32)((((u64)vpbl.pbl_pbase) >> 32))); + root_vpbl.leaf_vpbl[0] = vpbl; + } + /* Allocate a 4K buffer for the PBL */ + vpbl.pbl_vbase = pci_alloc_consistent(nesdev->pcidev, 4096, + &vpbl.pbl_pbase); + nes_debug(NES_DBG_MR, "Allocating leaf PBL, va = %p, pa = 0x%016lX\n", + vpbl.pbl_vbase, (unsigned long)vpbl.pbl_pbase); + if (!vpbl.pbl_vbase) { + nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index); + ibmr = ERR_PTR(-ENOMEM); + kfree(nesmr); + goto reg_phys_err; + } + /* Fill in the root table */ + if (1 <= root_pbl_index) { + root_vpbl.pbl_vbase[root_pbl_index].pa_low = + cpu_to_le32((u32)vpbl.pbl_pbase); + root_vpbl.pbl_vbase[root_pbl_index].pa_high = + cpu_to_le32((u32)((((u64)vpbl.pbl_pbase) >> 32))); + root_vpbl.leaf_vpbl[root_pbl_index] = vpbl; + } + root_pbl_index++; + cur_pbl_index = 0; + } + + mask = !buffer_list[i].size; + if (i != 0) + mask |= buffer_list[i].addr; + if (i != num_phys_buf - 1) + mask |= buffer_list[i].addr + buffer_list[i].size; + + if (mask & ~PAGE_MASK) { + nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index); + nes_debug(NES_DBG_MR, "Invalid buffer addr or size\n"); + ibmr = ERR_PTR(-EINVAL); + kfree(nesmr); + goto reg_phys_err; + } + + region_length += buffer_list[i].size; + if ((i != 0) && (single_page)) { + if ((buffer_list[i-1].addr+PAGE_SIZE) != buffer_list[i].addr) + single_page = 0; + } + vpbl.pbl_vbase[cur_pbl_index].pa_low = cpu_to_le32((u32)buffer_list[i].addr & PAGE_MASK); + vpbl.pbl_vbase[cur_pbl_index++].pa_high = + cpu_to_le32((u32)((((u64)buffer_list[i].addr) >> 32))); + } + + stag = stag_index << 8; + stag |= driver_key; + stag += (u32)stag_key; + + nes_debug(NES_DBG_MR, "Registering STag 0x%08X, VA = 0x%016lX," + " length = 0x%016lX, index = 0x%08X\n", + stag, (unsigned long)*iova_start, (unsigned long)region_length, stag_index); + + /* Make the leaf PBL the root if only one PBL */ + if (root_pbl_index == 1) { + root_vpbl.pbl_pbase = vpbl.pbl_pbase; + } + + if (single_page) { + pbl_count = 0; + } else { + pbl_count = root_pbl_index; + } + ret = nes_reg_mr(nesdev, nespd, stag, region_length, &root_vpbl, + buffer_list[0].addr, pbl_count, (u16)cur_pbl_index, acc, iova_start, + &nesmr->pbls_used, &nesmr->pbl_4k); + + if (ret == 0) { + nesmr->ibmr.rkey = stag; + nesmr->ibmr.lkey = stag; + nesmr->mode = IWNES_MEMREG_TYPE_MEM; + ibmr = &nesmr->ibmr; + } else { + kfree(nesmr); + ibmr = ERR_PTR(-ENOMEM); + } + + reg_phys_err: + /* free the resources */ + if (root_pbl_index == 1) { + /* single PBL case */ + pci_free_consistent(nesdev->pcidev, 4096, vpbl.pbl_vbase, vpbl.pbl_pbase); + } else { + for (i=0; ipcidev, 4096, root_vpbl.leaf_vpbl[i].pbl_vbase, + root_vpbl.leaf_vpbl[i].pbl_pbase); + } + kfree(root_vpbl.leaf_vpbl); + pci_free_consistent(nesdev->pcidev, 8192, root_vpbl.pbl_vbase, + root_vpbl.pbl_pbase); + } + + return ibmr; +} + + +/** + * nes_get_dma_mr + */ +static struct ib_mr *nes_get_dma_mr(struct ib_pd *pd, int acc) +{ + struct ib_phys_buf bl; + u64 kva = 0; + + nes_debug(NES_DBG_MR, "\n"); + + bl.size = (u64)0xffffffffffULL; + bl.addr = 0; + return nes_reg_phys_mr(pd, &bl, 1, acc, &kva); +} + + +/** + * nes_reg_user_mr + */ +static struct ib_mr *nes_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, + u64 virt, int acc, struct ib_udata *udata) +{ + u64 iova_start; + __le64 *pbl; + u64 region_length; + dma_addr_t last_dma_addr = 0; + dma_addr_t first_dma_addr = 0; + struct nes_pd *nespd = to_nespd(pd); + struct nes_vnic *nesvnic = to_nesvnic(pd->device); + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_adapter *nesadapter = nesdev->nesadapter; + struct ib_mr *ibmr = ERR_PTR(-EINVAL); + struct scatterlist *sg; + struct nes_ucontext *nes_ucontext; + struct nes_pbl *nespbl; + struct nes_mr *nesmr; + struct ib_umem *region; + struct nes_mem_reg_req req; + struct nes_vpbl vpbl; + struct nes_root_vpbl root_vpbl; + int entry, page_index; + int page_count = 0; + int err, pbl_depth = 0; + int chunk_pages; + int ret; + u32 stag; + u32 stag_index = 0; + u32 next_stag_index; + u32 driver_key; + u32 root_pbl_index = 0; + u32 cur_pbl_index = 0; + u32 skip_pages; + u16 pbl_count; + u8 single_page = 1; + u8 stag_key; + int first_page = 1; + + region = ib_umem_get(pd->uobject->context, start, length, acc, 0); + if (IS_ERR(region)) { + return (struct ib_mr *)region; + } + + nes_debug(NES_DBG_MR, "User base = 0x%lX, Virt base = 0x%lX, length = %u," + " offset = %u, page size = %u.\n", + (unsigned long int)start, (unsigned long int)virt, (u32)length, + region->offset, region->page_size); + + skip_pages = ((u32)region->offset) >> 12; + + if (ib_copy_from_udata(&req, udata, sizeof(req))) { + ib_umem_release(region); + return ERR_PTR(-EFAULT); + } + nes_debug(NES_DBG_MR, "Memory Registration type = %08X.\n", req.reg_type); + + switch (req.reg_type) { + case IWNES_MEMREG_TYPE_MEM: + pbl_depth = 0; + region_length = 0; + vpbl.pbl_vbase = NULL; + root_vpbl.pbl_vbase = NULL; + root_vpbl.pbl_pbase = 0; + + get_random_bytes(&next_stag_index, sizeof(next_stag_index)); + stag_key = (u8)next_stag_index; + + driver_key = next_stag_index & 0x70000000; + + next_stag_index >>= 8; + next_stag_index %= nesadapter->max_mr; + + err = nes_alloc_resource(nesadapter, nesadapter->allocated_mrs, + nesadapter->max_mr, &stag_index, &next_stag_index, NES_RESOURCE_USER_MR); + if (err) { + ib_umem_release(region); + return ERR_PTR(err); + } + + nesmr = kzalloc(sizeof(*nesmr), GFP_KERNEL); + if (!nesmr) { + ib_umem_release(region); + nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index); + return ERR_PTR(-ENOMEM); + } + nesmr->region = region; + + for_each_sg(region->sg_head.sgl, sg, region->nmap, entry) { + if (sg_dma_address(sg) & ~PAGE_MASK) { + ib_umem_release(region); + nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index); + nes_debug(NES_DBG_MR, "Unaligned Memory Buffer: 0x%x\n", + (unsigned int) sg_dma_address(sg)); + ibmr = ERR_PTR(-EINVAL); + kfree(nesmr); + goto reg_user_mr_err; + } + + if (!sg_dma_len(sg)) { + ib_umem_release(region); + nes_free_resource(nesadapter, nesadapter->allocated_mrs, + stag_index); + nes_debug(NES_DBG_MR, "Invalid Buffer Size\n"); + ibmr = ERR_PTR(-EINVAL); + kfree(nesmr); + goto reg_user_mr_err; + } + + region_length += sg_dma_len(sg); + chunk_pages = sg_dma_len(sg) >> 12; + region_length -= skip_pages << 12; + for (page_index = skip_pages; page_index < chunk_pages; page_index++) { + skip_pages = 0; + if ((page_count != 0) && (page_count<<12)-(region->offset&(4096-1)) >= region->length) + goto enough_pages; + if ((page_count&0x01FF) == 0) { + if (page_count >= 1024 * 512) { + ib_umem_release(region); + nes_free_resource(nesadapter, + nesadapter->allocated_mrs, stag_index); + kfree(nesmr); + ibmr = ERR_PTR(-E2BIG); + goto reg_user_mr_err; + } + if (root_pbl_index == 1) { + root_vpbl.pbl_vbase = pci_alloc_consistent(nesdev->pcidev, + 8192, &root_vpbl.pbl_pbase); + nes_debug(NES_DBG_MR, "Allocating root PBL, va = %p, pa = 0x%08X\n", + root_vpbl.pbl_vbase, (unsigned int)root_vpbl.pbl_pbase); + if (!root_vpbl.pbl_vbase) { + ib_umem_release(region); + pci_free_consistent(nesdev->pcidev, 4096, vpbl.pbl_vbase, + vpbl.pbl_pbase); + nes_free_resource(nesadapter, nesadapter->allocated_mrs, + stag_index); + kfree(nesmr); + ibmr = ERR_PTR(-ENOMEM); + goto reg_user_mr_err; + } + root_vpbl.leaf_vpbl = kzalloc(sizeof(*root_vpbl.leaf_vpbl)*1024, + GFP_KERNEL); + if (!root_vpbl.leaf_vpbl) { + ib_umem_release(region); + pci_free_consistent(nesdev->pcidev, 8192, root_vpbl.pbl_vbase, + root_vpbl.pbl_pbase); + pci_free_consistent(nesdev->pcidev, 4096, vpbl.pbl_vbase, + vpbl.pbl_pbase); + nes_free_resource(nesadapter, nesadapter->allocated_mrs, + stag_index); + kfree(nesmr); + ibmr = ERR_PTR(-ENOMEM); + goto reg_user_mr_err; + } + root_vpbl.pbl_vbase[0].pa_low = + cpu_to_le32((u32)vpbl.pbl_pbase); + root_vpbl.pbl_vbase[0].pa_high = + cpu_to_le32((u32)((((u64)vpbl.pbl_pbase) >> 32))); + root_vpbl.leaf_vpbl[0] = vpbl; + } + vpbl.pbl_vbase = pci_alloc_consistent(nesdev->pcidev, 4096, + &vpbl.pbl_pbase); + nes_debug(NES_DBG_MR, "Allocating leaf PBL, va = %p, pa = 0x%08X\n", + vpbl.pbl_vbase, (unsigned int)vpbl.pbl_pbase); + if (!vpbl.pbl_vbase) { + ib_umem_release(region); + nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index); + ibmr = ERR_PTR(-ENOMEM); + kfree(nesmr); + goto reg_user_mr_err; + } + if (1 <= root_pbl_index) { + root_vpbl.pbl_vbase[root_pbl_index].pa_low = + cpu_to_le32((u32)vpbl.pbl_pbase); + root_vpbl.pbl_vbase[root_pbl_index].pa_high = + cpu_to_le32((u32)((((u64)vpbl.pbl_pbase)>>32))); + root_vpbl.leaf_vpbl[root_pbl_index] = vpbl; + } + root_pbl_index++; + cur_pbl_index = 0; + } + if (single_page) { + if (page_count != 0) { + if ((last_dma_addr+4096) != + (sg_dma_address(sg)+ + (page_index*4096))) + single_page = 0; + last_dma_addr = sg_dma_address(sg)+ + (page_index*4096); + } else { + first_dma_addr = sg_dma_address(sg)+ + (page_index*4096); + last_dma_addr = first_dma_addr; + } + } + + vpbl.pbl_vbase[cur_pbl_index].pa_low = + cpu_to_le32((u32)(sg_dma_address(sg)+ + (page_index*4096))); + vpbl.pbl_vbase[cur_pbl_index].pa_high = + cpu_to_le32((u32)((((u64)(sg_dma_address(sg)+ + (page_index*4096))) >> 32))); + cur_pbl_index++; + page_count++; + } + } + + enough_pages: + nes_debug(NES_DBG_MR, "calculating stag, stag_index=0x%08x, driver_key=0x%08x," + " stag_key=0x%08x\n", + stag_index, driver_key, stag_key); + stag = stag_index << 8; + stag |= driver_key; + stag += (u32)stag_key; + + iova_start = virt; + /* Make the leaf PBL the root if only one PBL */ + if (root_pbl_index == 1) { + root_vpbl.pbl_pbase = vpbl.pbl_pbase; + } + + if (single_page) { + pbl_count = 0; + } else { + pbl_count = root_pbl_index; + first_dma_addr = 0; + } + nes_debug(NES_DBG_MR, "Registering STag 0x%08X, VA = 0x%08X, length = 0x%08X," + " index = 0x%08X, region->length=0x%08llx, pbl_count = %u\n", + stag, (unsigned int)iova_start, + (unsigned int)region_length, stag_index, + (unsigned long long)region->length, pbl_count); + ret = nes_reg_mr(nesdev, nespd, stag, region->length, &root_vpbl, + first_dma_addr, pbl_count, (u16)cur_pbl_index, acc, + &iova_start, &nesmr->pbls_used, &nesmr->pbl_4k); + + nes_debug(NES_DBG_MR, "ret=%d\n", ret); + + if (ret == 0) { + nesmr->ibmr.rkey = stag; + nesmr->ibmr.lkey = stag; + nesmr->mode = IWNES_MEMREG_TYPE_MEM; + ibmr = &nesmr->ibmr; + } else { + ib_umem_release(region); + kfree(nesmr); + ibmr = ERR_PTR(-ENOMEM); + } + + reg_user_mr_err: + /* free the resources */ + if (root_pbl_index == 1) { + pci_free_consistent(nesdev->pcidev, 4096, vpbl.pbl_vbase, + vpbl.pbl_pbase); + } else { + for (page_index=0; page_indexpcidev, 4096, + root_vpbl.leaf_vpbl[page_index].pbl_vbase, + root_vpbl.leaf_vpbl[page_index].pbl_pbase); + } + kfree(root_vpbl.leaf_vpbl); + pci_free_consistent(nesdev->pcidev, 8192, root_vpbl.pbl_vbase, + root_vpbl.pbl_pbase); + } + + nes_debug(NES_DBG_MR, "Leaving, ibmr=%p", ibmr); + + return ibmr; + case IWNES_MEMREG_TYPE_QP: + case IWNES_MEMREG_TYPE_CQ: + if (!region->length) { + nes_debug(NES_DBG_MR, "Unable to register zero length region for CQ\n"); + ib_umem_release(region); + return ERR_PTR(-EINVAL); + } + nespbl = kzalloc(sizeof(*nespbl), GFP_KERNEL); + if (!nespbl) { + nes_debug(NES_DBG_MR, "Unable to allocate PBL\n"); + ib_umem_release(region); + return ERR_PTR(-ENOMEM); + } + nesmr = kzalloc(sizeof(*nesmr), GFP_KERNEL); + if (!nesmr) { + ib_umem_release(region); + kfree(nespbl); + nes_debug(NES_DBG_MR, "Unable to allocate nesmr\n"); + return ERR_PTR(-ENOMEM); + } + nesmr->region = region; + nes_ucontext = to_nesucontext(pd->uobject->context); + pbl_depth = region->length >> 12; + pbl_depth += (region->length & (4096-1)) ? 1 : 0; + nespbl->pbl_size = pbl_depth*sizeof(u64); + if (req.reg_type == IWNES_MEMREG_TYPE_QP) { + nes_debug(NES_DBG_MR, "Attempting to allocate QP PBL memory"); + } else { + nes_debug(NES_DBG_MR, "Attempting to allocate CP PBL memory"); + } + + nes_debug(NES_DBG_MR, " %u bytes, %u entries.\n", + nespbl->pbl_size, pbl_depth); + pbl = pci_alloc_consistent(nesdev->pcidev, nespbl->pbl_size, + &nespbl->pbl_pbase); + if (!pbl) { + ib_umem_release(region); + kfree(nesmr); + kfree(nespbl); + nes_debug(NES_DBG_MR, "Unable to allocate PBL memory\n"); + return ERR_PTR(-ENOMEM); + } + + nespbl->pbl_vbase = (u64 *)pbl; + nespbl->user_base = start; + nes_debug(NES_DBG_MR, "Allocated PBL memory, %u bytes, pbl_pbase=%lx," + " pbl_vbase=%p user_base=0x%lx\n", + nespbl->pbl_size, (unsigned long) nespbl->pbl_pbase, + (void *) nespbl->pbl_vbase, nespbl->user_base); + + for_each_sg(region->sg_head.sgl, sg, region->nmap, entry) { + chunk_pages = sg_dma_len(sg) >> 12; + chunk_pages += (sg_dma_len(sg) & (4096-1)) ? 1 : 0; + if (first_page) { + nespbl->page = sg_page(sg); + first_page = 0; + } + + for (page_index = 0; page_index < chunk_pages; page_index++) { + ((__le32 *)pbl)[0] = cpu_to_le32((u32) + (sg_dma_address(sg)+ + (page_index*4096))); + ((__le32 *)pbl)[1] = cpu_to_le32(((u64) + (sg_dma_address(sg)+ + (page_index*4096)))>>32); + nes_debug(NES_DBG_MR, "pbl=%p, *pbl=0x%016llx, 0x%08x%08x\n", pbl, + (unsigned long long)*pbl, + le32_to_cpu(((__le32 *)pbl)[1]), le32_to_cpu(((__le32 *)pbl)[0])); + pbl++; + } + } + + if (req.reg_type == IWNES_MEMREG_TYPE_QP) { + list_add_tail(&nespbl->list, &nes_ucontext->qp_reg_mem_list); + } else { + list_add_tail(&nespbl->list, &nes_ucontext->cq_reg_mem_list); + } + nesmr->ibmr.rkey = -1; + nesmr->ibmr.lkey = -1; + nesmr->mode = req.reg_type; + return &nesmr->ibmr; + } + + ib_umem_release(region); + return ERR_PTR(-ENOSYS); +} + + +/** + * nes_dereg_mr + */ +static int nes_dereg_mr(struct ib_mr *ib_mr) +{ + struct nes_mr *nesmr = to_nesmr(ib_mr); + struct nes_vnic *nesvnic = to_nesvnic(ib_mr->device); + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_adapter *nesadapter = nesdev->nesadapter; + struct nes_hw_cqp_wqe *cqp_wqe; + struct nes_cqp_request *cqp_request; + unsigned long flags; + int ret; + u16 major_code; + u16 minor_code; + + if (nesmr->region) { + ib_umem_release(nesmr->region); + } + if (nesmr->mode != IWNES_MEMREG_TYPE_MEM) { + kfree(nesmr); + return 0; + } + + /* Deallocate the region with the adapter */ + + cqp_request = nes_get_cqp_request(nesdev); + if (cqp_request == NULL) { + nes_debug(NES_DBG_MR, "Failed to get a cqp_request.\n"); + return -ENOMEM; + } + cqp_request->waiting = 1; + cqp_wqe = &cqp_request->cqp_wqe; + + nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, + NES_CQP_DEALLOCATE_STAG | NES_CQP_STAG_VA_TO | + NES_CQP_STAG_DEALLOC_PBLS | NES_CQP_STAG_MR); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_STAG_IDX, ib_mr->rkey); + + atomic_set(&cqp_request->refcount, 2); + nes_post_cqp_request(nesdev, cqp_request); + + /* Wait for CQP */ + nes_debug(NES_DBG_MR, "Waiting for deallocate STag 0x%08X completed\n", ib_mr->rkey); + ret = wait_event_timeout(cqp_request->waitq, (cqp_request->request_done != 0), + NES_EVENT_TIMEOUT); + nes_debug(NES_DBG_MR, "Deallocate STag 0x%08X completed, wait_event_timeout ret = %u," + " CQP Major:Minor codes = 0x%04X:0x%04X\n", + ib_mr->rkey, ret, cqp_request->major_code, cqp_request->minor_code); + + major_code = cqp_request->major_code; + minor_code = cqp_request->minor_code; + + nes_put_cqp_request(nesdev, cqp_request); + + if (!ret) { + nes_debug(NES_DBG_MR, "Timeout waiting to destroy STag," + " ib_mr=%p, rkey = 0x%08X\n", + ib_mr, ib_mr->rkey); + return -ETIME; + } else if (major_code) { + nes_debug(NES_DBG_MR, "Error (0x%04X:0x%04X) while attempting" + " to destroy STag, ib_mr=%p, rkey = 0x%08X\n", + major_code, minor_code, ib_mr, ib_mr->rkey); + return -EIO; + } + + if (nesmr->pbls_used != 0) { + spin_lock_irqsave(&nesadapter->pbl_lock, flags); + if (nesmr->pbl_4k) { + nesadapter->free_4kpbl += nesmr->pbls_used; + if (nesadapter->free_4kpbl > nesadapter->max_4kpbl) + printk(KERN_ERR PFX "free 4KB PBLs(%u) has " + "exceeded the max(%u)\n", + nesadapter->free_4kpbl, + nesadapter->max_4kpbl); + } else { + nesadapter->free_256pbl += nesmr->pbls_used; + if (nesadapter->free_256pbl > nesadapter->max_256pbl) + printk(KERN_ERR PFX "free 256B PBLs(%u) has " + "exceeded the max(%u)\n", + nesadapter->free_256pbl, + nesadapter->max_256pbl); + } + spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); + } + nes_free_resource(nesadapter, nesadapter->allocated_mrs, + (ib_mr->rkey & 0x0fffff00) >> 8); + + kfree(nesmr); + + return 0; +} + + +/** + * show_rev + */ +static ssize_t show_rev(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct nes_ib_device *nesibdev = + container_of(dev, struct nes_ib_device, ibdev.dev); + struct nes_vnic *nesvnic = nesibdev->nesvnic; + + nes_debug(NES_DBG_INIT, "\n"); + return sprintf(buf, "%x\n", nesvnic->nesdev->nesadapter->hw_rev); +} + + +/** + * show_fw_ver + */ +static ssize_t show_fw_ver(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct nes_ib_device *nesibdev = + container_of(dev, struct nes_ib_device, ibdev.dev); + struct nes_vnic *nesvnic = nesibdev->nesvnic; + + nes_debug(NES_DBG_INIT, "\n"); + return sprintf(buf, "%u.%u\n", + (nesvnic->nesdev->nesadapter->firmware_version >> 16), + (nesvnic->nesdev->nesadapter->firmware_version & 0x000000ff)); +} + + +/** + * show_hca + */ +static ssize_t show_hca(struct device *dev, struct device_attribute *attr, + char *buf) +{ + nes_debug(NES_DBG_INIT, "\n"); + return sprintf(buf, "NES020\n"); +} + + +/** + * show_board + */ +static ssize_t show_board(struct device *dev, struct device_attribute *attr, + char *buf) +{ + nes_debug(NES_DBG_INIT, "\n"); + return sprintf(buf, "%.*s\n", 32, "NES020 Board ID"); +} + + +static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL); +static DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL); +static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL); +static DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL); + +static struct device_attribute *nes_dev_attributes[] = { + &dev_attr_hw_rev, + &dev_attr_fw_ver, + &dev_attr_hca_type, + &dev_attr_board_id +}; + + +/** + * nes_query_qp + */ +static int nes_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_qp_init_attr *init_attr) +{ + struct nes_qp *nesqp = to_nesqp(ibqp); + + nes_debug(NES_DBG_QP, "\n"); + + attr->qp_access_flags = 0; + attr->cap.max_send_wr = nesqp->hwqp.sq_size; + attr->cap.max_recv_wr = nesqp->hwqp.rq_size; + attr->cap.max_recv_sge = 1; + if (nes_drv_opt & NES_DRV_OPT_NO_INLINE_DATA) + attr->cap.max_inline_data = 0; + else + attr->cap.max_inline_data = 64; + + init_attr->event_handler = nesqp->ibqp.event_handler; + init_attr->qp_context = nesqp->ibqp.qp_context; + init_attr->send_cq = nesqp->ibqp.send_cq; + init_attr->recv_cq = nesqp->ibqp.recv_cq; + init_attr->srq = nesqp->ibqp.srq; + init_attr->cap = attr->cap; + + return 0; +} + + +/** + * nes_hw_modify_qp + */ +int nes_hw_modify_qp(struct nes_device *nesdev, struct nes_qp *nesqp, + u32 next_iwarp_state, u32 termlen, u32 wait_completion) +{ + struct nes_hw_cqp_wqe *cqp_wqe; + /* struct iw_cm_id *cm_id = nesqp->cm_id; */ + /* struct iw_cm_event cm_event; */ + struct nes_cqp_request *cqp_request; + int ret; + u16 major_code; + + nes_debug(NES_DBG_MOD_QP, "QP%u, refcount=%d\n", + nesqp->hwqp.qp_id, atomic_read(&nesqp->refcount)); + + cqp_request = nes_get_cqp_request(nesdev); + if (cqp_request == NULL) { + nes_debug(NES_DBG_MOD_QP, "Failed to get a cqp_request.\n"); + return -ENOMEM; + } + if (wait_completion) { + cqp_request->waiting = 1; + } else { + cqp_request->waiting = 0; + } + cqp_wqe = &cqp_request->cqp_wqe; + + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, + NES_CQP_MODIFY_QP | NES_CQP_QP_TYPE_IWARP | next_iwarp_state); + nes_debug(NES_DBG_MOD_QP, "using next_iwarp_state=%08x, wqe_words=%08x\n", + next_iwarp_state, le32_to_cpu(cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX])); + nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX, nesqp->hwqp.qp_id); + set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_QP_WQE_CONTEXT_LOW_IDX, (u64)nesqp->nesqp_context_pbase); + + /* If sending a terminate message, fill in the length (in words) */ + if (((next_iwarp_state & NES_CQP_QP_IWARP_STATE_MASK) == NES_CQP_QP_IWARP_STATE_TERMINATE) && + !(next_iwarp_state & NES_CQP_QP_TERM_DONT_SEND_TERM_MSG)) { + termlen = ((termlen + 3) >> 2) << NES_CQP_OP_TERMLEN_SHIFT; + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_QP_WQE_NEW_MSS_IDX, termlen); + } + + atomic_set(&cqp_request->refcount, 2); + nes_post_cqp_request(nesdev, cqp_request); + + /* Wait for CQP */ + if (wait_completion) { + /* nes_debug(NES_DBG_MOD_QP, "Waiting for modify iWARP QP%u to complete.\n", + nesqp->hwqp.qp_id); */ + ret = wait_event_timeout(cqp_request->waitq, (cqp_request->request_done != 0), + NES_EVENT_TIMEOUT); + nes_debug(NES_DBG_MOD_QP, "Modify iwarp QP%u completed, wait_event_timeout ret=%u, " + "CQP Major:Minor codes = 0x%04X:0x%04X.\n", + nesqp->hwqp.qp_id, ret, cqp_request->major_code, cqp_request->minor_code); + major_code = cqp_request->major_code; + if (major_code) { + nes_debug(NES_DBG_MOD_QP, "Modify iwarp QP%u failed" + "CQP Major:Minor codes = 0x%04X:0x%04X, intended next state = 0x%08X.\n", + nesqp->hwqp.qp_id, cqp_request->major_code, + cqp_request->minor_code, next_iwarp_state); + } + + nes_put_cqp_request(nesdev, cqp_request); + + if (!ret) + return -ETIME; + else if (major_code) + return -EIO; + else + return 0; + } else { + return 0; + } +} + + +/** + * nes_modify_qp + */ +int nes_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata) +{ + struct nes_qp *nesqp = to_nesqp(ibqp); + struct nes_vnic *nesvnic = to_nesvnic(ibqp->device); + struct nes_device *nesdev = nesvnic->nesdev; + /* u32 cqp_head; */ + /* u32 counter; */ + u32 next_iwarp_state = 0; + int err; + unsigned long qplockflags; + int ret; + u16 original_last_aeq; + u8 issue_modify_qp = 0; + u8 dont_wait = 0; + + nes_debug(NES_DBG_MOD_QP, "QP%u: QP State=%u, cur QP State=%u," + " iwarp_state=0x%X, refcount=%d\n", + nesqp->hwqp.qp_id, attr->qp_state, nesqp->ibqp_state, + nesqp->iwarp_state, atomic_read(&nesqp->refcount)); + + spin_lock_irqsave(&nesqp->lock, qplockflags); + + nes_debug(NES_DBG_MOD_QP, "QP%u: hw_iwarp_state=0x%X, hw_tcp_state=0x%X," + " QP Access Flags=0x%X, attr_mask = 0x%0x\n", + nesqp->hwqp.qp_id, nesqp->hw_iwarp_state, + nesqp->hw_tcp_state, attr->qp_access_flags, attr_mask); + + if (attr_mask & IB_QP_STATE) { + switch (attr->qp_state) { + case IB_QPS_INIT: + nes_debug(NES_DBG_MOD_QP, "QP%u: new state = init\n", + nesqp->hwqp.qp_id); + if (nesqp->iwarp_state > (u32)NES_CQP_QP_IWARP_STATE_IDLE) { + spin_unlock_irqrestore(&nesqp->lock, qplockflags); + return -EINVAL; + } + next_iwarp_state = NES_CQP_QP_IWARP_STATE_IDLE; + issue_modify_qp = 1; + break; + case IB_QPS_RTR: + nes_debug(NES_DBG_MOD_QP, "QP%u: new state = rtr\n", + nesqp->hwqp.qp_id); + if (nesqp->iwarp_state>(u32)NES_CQP_QP_IWARP_STATE_IDLE) { + spin_unlock_irqrestore(&nesqp->lock, qplockflags); + return -EINVAL; + } + next_iwarp_state = NES_CQP_QP_IWARP_STATE_IDLE; + issue_modify_qp = 1; + break; + case IB_QPS_RTS: + nes_debug(NES_DBG_MOD_QP, "QP%u: new state = rts\n", + nesqp->hwqp.qp_id); + if (nesqp->iwarp_state>(u32)NES_CQP_QP_IWARP_STATE_RTS) { + spin_unlock_irqrestore(&nesqp->lock, qplockflags); + return -EINVAL; + } + if (nesqp->cm_id == NULL) { + nes_debug(NES_DBG_MOD_QP, "QP%u: Failing attempt to move QP to RTS without a CM_ID. \n", + nesqp->hwqp.qp_id ); + spin_unlock_irqrestore(&nesqp->lock, qplockflags); + return -EINVAL; + } + next_iwarp_state = NES_CQP_QP_IWARP_STATE_RTS; + if (nesqp->iwarp_state != NES_CQP_QP_IWARP_STATE_RTS) + next_iwarp_state |= NES_CQP_QP_CONTEXT_VALID | + NES_CQP_QP_ARP_VALID | NES_CQP_QP_ORD_VALID; + issue_modify_qp = 1; + nesqp->hw_tcp_state = NES_AEQE_TCP_STATE_ESTABLISHED; + nesqp->hw_iwarp_state = NES_AEQE_IWARP_STATE_RTS; + nesqp->hte_added = 1; + break; + case IB_QPS_SQD: + issue_modify_qp = 1; + nes_debug(NES_DBG_MOD_QP, "QP%u: new state=closing. SQ head=%u, SQ tail=%u\n", + nesqp->hwqp.qp_id, nesqp->hwqp.sq_head, nesqp->hwqp.sq_tail); + if (nesqp->iwarp_state == (u32)NES_CQP_QP_IWARP_STATE_CLOSING) { + spin_unlock_irqrestore(&nesqp->lock, qplockflags); + return 0; + } else { + if (nesqp->iwarp_state > (u32)NES_CQP_QP_IWARP_STATE_CLOSING) { + nes_debug(NES_DBG_MOD_QP, "QP%u: State change to closing" + " ignored due to current iWARP state\n", + nesqp->hwqp.qp_id); + spin_unlock_irqrestore(&nesqp->lock, qplockflags); + return -EINVAL; + } + if (nesqp->hw_iwarp_state != NES_AEQE_IWARP_STATE_RTS) { + nes_debug(NES_DBG_MOD_QP, "QP%u: State change to closing" + " already done based on hw state.\n", + nesqp->hwqp.qp_id); + issue_modify_qp = 0; + } + switch (nesqp->hw_iwarp_state) { + case NES_AEQE_IWARP_STATE_CLOSING: + next_iwarp_state = NES_CQP_QP_IWARP_STATE_CLOSING; + break; + case NES_AEQE_IWARP_STATE_TERMINATE: + next_iwarp_state = NES_CQP_QP_IWARP_STATE_TERMINATE; + break; + case NES_AEQE_IWARP_STATE_ERROR: + next_iwarp_state = NES_CQP_QP_IWARP_STATE_ERROR; + break; + default: + next_iwarp_state = NES_CQP_QP_IWARP_STATE_CLOSING; + nesqp->hw_iwarp_state = NES_AEQE_IWARP_STATE_CLOSING; + break; + } + } + break; + case IB_QPS_SQE: + nes_debug(NES_DBG_MOD_QP, "QP%u: new state = terminate\n", + nesqp->hwqp.qp_id); + if (nesqp->iwarp_state>=(u32)NES_CQP_QP_IWARP_STATE_TERMINATE) { + spin_unlock_irqrestore(&nesqp->lock, qplockflags); + return -EINVAL; + } + /* next_iwarp_state = (NES_CQP_QP_IWARP_STATE_TERMINATE | 0x02000000); */ + next_iwarp_state = NES_CQP_QP_IWARP_STATE_TERMINATE; + nesqp->hw_iwarp_state = NES_AEQE_IWARP_STATE_TERMINATE; + issue_modify_qp = 1; + break; + case IB_QPS_ERR: + case IB_QPS_RESET: + if (nesqp->iwarp_state == (u32)NES_CQP_QP_IWARP_STATE_ERROR) { + spin_unlock_irqrestore(&nesqp->lock, qplockflags); + return -EINVAL; + } + nes_debug(NES_DBG_MOD_QP, "QP%u: new state = error\n", + nesqp->hwqp.qp_id); + if (nesqp->term_flags) + del_timer(&nesqp->terminate_timer); + + next_iwarp_state = NES_CQP_QP_IWARP_STATE_ERROR; + /* next_iwarp_state = (NES_CQP_QP_IWARP_STATE_TERMINATE | 0x02000000); */ + if (nesqp->hte_added) { + nes_debug(NES_DBG_MOD_QP, "set CQP_QP_DEL_HTE\n"); + next_iwarp_state |= NES_CQP_QP_DEL_HTE; + nesqp->hte_added = 0; + } + if ((nesqp->hw_tcp_state > NES_AEQE_TCP_STATE_CLOSED) && + (nesdev->iw_status) && + (nesqp->hw_tcp_state != NES_AEQE_TCP_STATE_TIME_WAIT)) { + next_iwarp_state |= NES_CQP_QP_RESET; + } else { + nes_debug(NES_DBG_MOD_QP, "QP%u NOT setting NES_CQP_QP_RESET since TCP state = %u\n", + nesqp->hwqp.qp_id, nesqp->hw_tcp_state); + dont_wait = 1; + } + issue_modify_qp = 1; + nesqp->hw_iwarp_state = NES_AEQE_IWARP_STATE_ERROR; + break; + default: + spin_unlock_irqrestore(&nesqp->lock, qplockflags); + return -EINVAL; + break; + } + + nesqp->ibqp_state = attr->qp_state; + nesqp->iwarp_state = next_iwarp_state & NES_CQP_QP_IWARP_STATE_MASK; + nes_debug(NES_DBG_MOD_QP, "Change nesqp->iwarp_state=%08x\n", + nesqp->iwarp_state); + } + + if (attr_mask & IB_QP_ACCESS_FLAGS) { + if (attr->qp_access_flags & IB_ACCESS_LOCAL_WRITE) { + nesqp->nesqp_context->misc |= cpu_to_le32(NES_QPCONTEXT_MISC_RDMA_WRITE_EN | + NES_QPCONTEXT_MISC_RDMA_READ_EN); + issue_modify_qp = 1; + } + if (attr->qp_access_flags & IB_ACCESS_REMOTE_WRITE) { + nesqp->nesqp_context->misc |= cpu_to_le32(NES_QPCONTEXT_MISC_RDMA_WRITE_EN); + issue_modify_qp = 1; + } + if (attr->qp_access_flags & IB_ACCESS_REMOTE_READ) { + nesqp->nesqp_context->misc |= cpu_to_le32(NES_QPCONTEXT_MISC_RDMA_READ_EN); + issue_modify_qp = 1; + } + if (attr->qp_access_flags & IB_ACCESS_MW_BIND) { + nesqp->nesqp_context->misc |= cpu_to_le32(NES_QPCONTEXT_MISC_WBIND_EN); + issue_modify_qp = 1; + } + + if (nesqp->user_mode) { + nesqp->nesqp_context->misc |= cpu_to_le32(NES_QPCONTEXT_MISC_RDMA_WRITE_EN | + NES_QPCONTEXT_MISC_RDMA_READ_EN); + issue_modify_qp = 1; + } + } + + original_last_aeq = nesqp->last_aeq; + spin_unlock_irqrestore(&nesqp->lock, qplockflags); + + nes_debug(NES_DBG_MOD_QP, "issue_modify_qp=%u\n", issue_modify_qp); + + ret = 0; + + + if (issue_modify_qp) { + nes_debug(NES_DBG_MOD_QP, "call nes_hw_modify_qp\n"); + ret = nes_hw_modify_qp(nesdev, nesqp, next_iwarp_state, 0, 1); + if (ret) + nes_debug(NES_DBG_MOD_QP, "nes_hw_modify_qp (next_iwarp_state = 0x%08X)" + " failed for QP%u.\n", + next_iwarp_state, nesqp->hwqp.qp_id); + + } + + if ((issue_modify_qp) && (nesqp->ibqp_state > IB_QPS_RTS)) { + nes_debug(NES_DBG_MOD_QP, "QP%u Issued ModifyQP refcount (%d)," + " original_last_aeq = 0x%04X. last_aeq = 0x%04X.\n", + nesqp->hwqp.qp_id, atomic_read(&nesqp->refcount), + original_last_aeq, nesqp->last_aeq); + if (!ret || original_last_aeq != NES_AEQE_AEID_RDMAP_ROE_BAD_LLP_CLOSE) { + if (dont_wait) { + if (nesqp->cm_id && nesqp->hw_tcp_state != 0) { + nes_debug(NES_DBG_MOD_QP, "QP%u Queuing fake disconnect for QP refcount (%d)," + " original_last_aeq = 0x%04X. last_aeq = 0x%04X.\n", + nesqp->hwqp.qp_id, atomic_read(&nesqp->refcount), + original_last_aeq, nesqp->last_aeq); + /* this one is for the cm_disconnect thread */ + spin_lock_irqsave(&nesqp->lock, qplockflags); + nesqp->hw_tcp_state = NES_AEQE_TCP_STATE_CLOSED; + nesqp->last_aeq = NES_AEQE_AEID_RESET_SENT; + spin_unlock_irqrestore(&nesqp->lock, qplockflags); + nes_cm_disconn(nesqp); + } else { + nes_debug(NES_DBG_MOD_QP, "QP%u No fake disconnect, QP refcount=%d\n", + nesqp->hwqp.qp_id, atomic_read(&nesqp->refcount)); + } + } else { + spin_lock_irqsave(&nesqp->lock, qplockflags); + if (nesqp->cm_id) { + /* These two are for the timer thread */ + if (atomic_inc_return(&nesqp->close_timer_started) == 1) { + nesqp->cm_id->add_ref(nesqp->cm_id); + nes_debug(NES_DBG_MOD_QP, "QP%u Not decrementing QP refcount (%d)," + " need ae to finish up, original_last_aeq = 0x%04X." + " last_aeq = 0x%04X, scheduling timer.\n", + nesqp->hwqp.qp_id, atomic_read(&nesqp->refcount), + original_last_aeq, nesqp->last_aeq); + schedule_nes_timer(nesqp->cm_node, (struct sk_buff *) nesqp, NES_TIMER_TYPE_CLOSE, 1, 0); + } + spin_unlock_irqrestore(&nesqp->lock, qplockflags); + } else { + spin_unlock_irqrestore(&nesqp->lock, qplockflags); + nes_debug(NES_DBG_MOD_QP, "QP%u Not decrementing QP refcount (%d)," + " need ae to finish up, original_last_aeq = 0x%04X." + " last_aeq = 0x%04X.\n", + nesqp->hwqp.qp_id, atomic_read(&nesqp->refcount), + original_last_aeq, nesqp->last_aeq); + } + } + } else { + nes_debug(NES_DBG_MOD_QP, "QP%u Decrementing QP refcount (%d), No ae to finish up," + " original_last_aeq = 0x%04X. last_aeq = 0x%04X.\n", + nesqp->hwqp.qp_id, atomic_read(&nesqp->refcount), + original_last_aeq, nesqp->last_aeq); + } + } else { + nes_debug(NES_DBG_MOD_QP, "QP%u Decrementing QP refcount (%d), No ae to finish up," + " original_last_aeq = 0x%04X. last_aeq = 0x%04X.\n", + nesqp->hwqp.qp_id, atomic_read(&nesqp->refcount), + original_last_aeq, nesqp->last_aeq); + } + + err = 0; + + nes_debug(NES_DBG_MOD_QP, "QP%u Leaving, refcount=%d\n", + nesqp->hwqp.qp_id, atomic_read(&nesqp->refcount)); + + return err; +} + + +/** + * nes_muticast_attach + */ +static int nes_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) +{ + nes_debug(NES_DBG_INIT, "\n"); + return -ENOSYS; +} + + +/** + * nes_multicast_detach + */ +static int nes_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) +{ + nes_debug(NES_DBG_INIT, "\n"); + return -ENOSYS; +} + + +/** + * nes_process_mad + */ +static int nes_process_mad(struct ib_device *ibdev, int mad_flags, + u8 port_num, struct ib_wc *in_wc, struct ib_grh *in_grh, + struct ib_mad *in_mad, struct ib_mad *out_mad) +{ + nes_debug(NES_DBG_INIT, "\n"); + return -ENOSYS; +} + +static inline void +fill_wqe_sg_send(struct nes_hw_qp_wqe *wqe, struct ib_send_wr *ib_wr, u32 uselkey) +{ + int sge_index; + int total_payload_length = 0; + for (sge_index = 0; sge_index < ib_wr->num_sge; sge_index++) { + set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_FRAG0_LOW_IDX+(sge_index*4), + ib_wr->sg_list[sge_index].addr); + set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_LENGTH0_IDX + (sge_index*4), + ib_wr->sg_list[sge_index].length); + if (uselkey) + set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_STAG0_IDX + (sge_index*4), + (ib_wr->sg_list[sge_index].lkey)); + else + set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_STAG0_IDX + (sge_index*4), 0); + + total_payload_length += ib_wr->sg_list[sge_index].length; + } + nes_debug(NES_DBG_IW_TX, "UC UC UC, sending total_payload_length=%u \n", + total_payload_length); + set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_TOTAL_PAYLOAD_IDX, + total_payload_length); +} + +/** + * nes_post_send + */ +static int nes_post_send(struct ib_qp *ibqp, struct ib_send_wr *ib_wr, + struct ib_send_wr **bad_wr) +{ + u64 u64temp; + unsigned long flags = 0; + struct nes_vnic *nesvnic = to_nesvnic(ibqp->device); + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_qp *nesqp = to_nesqp(ibqp); + struct nes_hw_qp_wqe *wqe; + int err = 0; + u32 qsize = nesqp->hwqp.sq_size; + u32 head; + u32 wqe_misc = 0; + u32 wqe_count = 0; + u32 counter; + + if (nesqp->ibqp_state > IB_QPS_RTS) { + err = -EINVAL; + goto out; + } + + spin_lock_irqsave(&nesqp->lock, flags); + + head = nesqp->hwqp.sq_head; + + while (ib_wr) { + /* Check for QP error */ + if (nesqp->term_flags) { + err = -EINVAL; + break; + } + + /* Check for SQ overflow */ + if (((head + (2 * qsize) - nesqp->hwqp.sq_tail) % qsize) == (qsize - 1)) { + err = -ENOMEM; + break; + } + + wqe = &nesqp->hwqp.sq_vbase[head]; + /* nes_debug(NES_DBG_IW_TX, "processing sq wqe for QP%u at %p, head = %u.\n", + nesqp->hwqp.qp_id, wqe, head); */ + nes_fill_init_qp_wqe(wqe, nesqp, head); + u64temp = (u64)(ib_wr->wr_id); + set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_COMP_SCRATCH_LOW_IDX, + u64temp); + switch (ib_wr->opcode) { + case IB_WR_SEND: + case IB_WR_SEND_WITH_INV: + if (IB_WR_SEND == ib_wr->opcode) { + if (ib_wr->send_flags & IB_SEND_SOLICITED) + wqe_misc = NES_IWARP_SQ_OP_SENDSE; + else + wqe_misc = NES_IWARP_SQ_OP_SEND; + } else { + if (ib_wr->send_flags & IB_SEND_SOLICITED) + wqe_misc = NES_IWARP_SQ_OP_SENDSEINV; + else + wqe_misc = NES_IWARP_SQ_OP_SENDINV; + + set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_INV_STAG_LOW_IDX, + ib_wr->ex.invalidate_rkey); + } + + if (ib_wr->num_sge > nesdev->nesadapter->max_sge) { + err = -EINVAL; + break; + } + + if (ib_wr->send_flags & IB_SEND_FENCE) + wqe_misc |= NES_IWARP_SQ_WQE_LOCAL_FENCE; + + if ((ib_wr->send_flags & IB_SEND_INLINE) && + ((nes_drv_opt & NES_DRV_OPT_NO_INLINE_DATA) == 0) && + (ib_wr->sg_list[0].length <= 64)) { + memcpy(&wqe->wqe_words[NES_IWARP_SQ_WQE_IMM_DATA_START_IDX], + (void *)(unsigned long)ib_wr->sg_list[0].addr, ib_wr->sg_list[0].length); + set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_TOTAL_PAYLOAD_IDX, + ib_wr->sg_list[0].length); + wqe_misc |= NES_IWARP_SQ_WQE_IMM_DATA; + } else { + fill_wqe_sg_send(wqe, ib_wr, 1); + } + + break; + case IB_WR_RDMA_WRITE: + wqe_misc = NES_IWARP_SQ_OP_RDMAW; + if (ib_wr->num_sge > nesdev->nesadapter->max_sge) { + nes_debug(NES_DBG_IW_TX, "Exceeded max sge, ib_wr=%u, max=%u\n", + ib_wr->num_sge, nesdev->nesadapter->max_sge); + err = -EINVAL; + break; + } + + if (ib_wr->send_flags & IB_SEND_FENCE) + wqe_misc |= NES_IWARP_SQ_WQE_LOCAL_FENCE; + + set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_RDMA_STAG_IDX, + ib_wr->wr.rdma.rkey); + set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_RDMA_TO_LOW_IDX, + ib_wr->wr.rdma.remote_addr); + + if ((ib_wr->send_flags & IB_SEND_INLINE) && + ((nes_drv_opt & NES_DRV_OPT_NO_INLINE_DATA) == 0) && + (ib_wr->sg_list[0].length <= 64)) { + memcpy(&wqe->wqe_words[NES_IWARP_SQ_WQE_IMM_DATA_START_IDX], + (void *)(unsigned long)ib_wr->sg_list[0].addr, ib_wr->sg_list[0].length); + set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_TOTAL_PAYLOAD_IDX, + ib_wr->sg_list[0].length); + wqe_misc |= NES_IWARP_SQ_WQE_IMM_DATA; + } else { + fill_wqe_sg_send(wqe, ib_wr, 1); + } + + wqe->wqe_words[NES_IWARP_SQ_WQE_RDMA_LENGTH_IDX] = + wqe->wqe_words[NES_IWARP_SQ_WQE_TOTAL_PAYLOAD_IDX]; + break; + case IB_WR_RDMA_READ: + case IB_WR_RDMA_READ_WITH_INV: + /* iWARP only supports 1 sge for RDMA reads */ + if (ib_wr->num_sge > 1) { + nes_debug(NES_DBG_IW_TX, "Exceeded max sge, ib_wr=%u, max=1\n", + ib_wr->num_sge); + err = -EINVAL; + break; + } + if (ib_wr->opcode == IB_WR_RDMA_READ) { + wqe_misc = NES_IWARP_SQ_OP_RDMAR; + } else { + wqe_misc = NES_IWARP_SQ_OP_RDMAR_LOCINV; + set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_INV_STAG_LOW_IDX, + ib_wr->ex.invalidate_rkey); + } + + set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_RDMA_TO_LOW_IDX, + ib_wr->wr.rdma.remote_addr); + set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_RDMA_STAG_IDX, + ib_wr->wr.rdma.rkey); + set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_RDMA_LENGTH_IDX, + ib_wr->sg_list->length); + set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_FRAG0_LOW_IDX, + ib_wr->sg_list->addr); + set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_STAG0_IDX, + ib_wr->sg_list->lkey); + break; + case IB_WR_LOCAL_INV: + wqe_misc = NES_IWARP_SQ_OP_LOCINV; + set_wqe_32bit_value(wqe->wqe_words, + NES_IWARP_SQ_LOCINV_WQE_INV_STAG_IDX, + ib_wr->ex.invalidate_rkey); + break; + case IB_WR_FAST_REG_MR: + { + int i; + int flags = ib_wr->wr.fast_reg.access_flags; + struct nes_ib_fast_reg_page_list *pnesfrpl = + container_of(ib_wr->wr.fast_reg.page_list, + struct nes_ib_fast_reg_page_list, + ibfrpl); + u64 *src_page_list = pnesfrpl->ibfrpl.page_list; + u64 *dst_page_list = pnesfrpl->nes_wqe_pbl.kva; + + if (ib_wr->wr.fast_reg.page_list_len > + (NES_4K_PBL_CHUNK_SIZE / sizeof(u64))) { + nes_debug(NES_DBG_IW_TX, "SQ_FMR: bad page_list_len\n"); + err = -EINVAL; + break; + } + wqe_misc = NES_IWARP_SQ_OP_FAST_REG; + set_wqe_64bit_value(wqe->wqe_words, + NES_IWARP_SQ_FMR_WQE_VA_FBO_LOW_IDX, + ib_wr->wr.fast_reg.iova_start); + set_wqe_32bit_value(wqe->wqe_words, + NES_IWARP_SQ_FMR_WQE_LENGTH_LOW_IDX, + ib_wr->wr.fast_reg.length); + set_wqe_32bit_value(wqe->wqe_words, + NES_IWARP_SQ_FMR_WQE_LENGTH_HIGH_IDX, 0); + set_wqe_32bit_value(wqe->wqe_words, + NES_IWARP_SQ_FMR_WQE_MR_STAG_IDX, + ib_wr->wr.fast_reg.rkey); + /* Set page size: */ + if (ib_wr->wr.fast_reg.page_shift == 12) { + wqe_misc |= NES_IWARP_SQ_FMR_WQE_PAGE_SIZE_4K; + } else if (ib_wr->wr.fast_reg.page_shift == 21) { + wqe_misc |= NES_IWARP_SQ_FMR_WQE_PAGE_SIZE_2M; + } else { + nes_debug(NES_DBG_IW_TX, "Invalid page shift," + " ib_wr=%u, max=1\n", ib_wr->num_sge); + err = -EINVAL; + break; + } + /* Set access_flags */ + wqe_misc |= NES_IWARP_SQ_FMR_WQE_RIGHTS_ENABLE_LOCAL_READ; + if (flags & IB_ACCESS_LOCAL_WRITE) + wqe_misc |= NES_IWARP_SQ_FMR_WQE_RIGHTS_ENABLE_LOCAL_WRITE; + + if (flags & IB_ACCESS_REMOTE_WRITE) + wqe_misc |= NES_IWARP_SQ_FMR_WQE_RIGHTS_ENABLE_REMOTE_WRITE; + + if (flags & IB_ACCESS_REMOTE_READ) + wqe_misc |= NES_IWARP_SQ_FMR_WQE_RIGHTS_ENABLE_REMOTE_READ; + + if (flags & IB_ACCESS_MW_BIND) + wqe_misc |= NES_IWARP_SQ_FMR_WQE_RIGHTS_ENABLE_WINDOW_BIND; + + /* Fill in PBL info: */ + if (ib_wr->wr.fast_reg.page_list_len > + pnesfrpl->ibfrpl.max_page_list_len) { + nes_debug(NES_DBG_IW_TX, "Invalid page list length," + " ib_wr=%p, value=%u, max=%u\n", + ib_wr, ib_wr->wr.fast_reg.page_list_len, + pnesfrpl->ibfrpl.max_page_list_len); + err = -EINVAL; + break; + } + + set_wqe_64bit_value(wqe->wqe_words, + NES_IWARP_SQ_FMR_WQE_PBL_ADDR_LOW_IDX, + pnesfrpl->nes_wqe_pbl.paddr); + + set_wqe_32bit_value(wqe->wqe_words, + NES_IWARP_SQ_FMR_WQE_PBL_LENGTH_IDX, + ib_wr->wr.fast_reg.page_list_len * 8); + + for (i = 0; i < ib_wr->wr.fast_reg.page_list_len; i++) + dst_page_list[i] = cpu_to_le64(src_page_list[i]); + + nes_debug(NES_DBG_IW_TX, "SQ_FMR: iova_start: %llx, " + "length: %d, rkey: %0x, pgl_paddr: %llx, " + "page_list_len: %u, wqe_misc: %x\n", + (unsigned long long) ib_wr->wr.fast_reg.iova_start, + ib_wr->wr.fast_reg.length, + ib_wr->wr.fast_reg.rkey, + (unsigned long long) pnesfrpl->nes_wqe_pbl.paddr, + ib_wr->wr.fast_reg.page_list_len, + wqe_misc); + break; + } + default: + /* error */ + err = -EINVAL; + break; + } + + if (err) + break; + + if ((ib_wr->send_flags & IB_SEND_SIGNALED) || nesqp->sig_all) + wqe_misc |= NES_IWARP_SQ_WQE_SIGNALED_COMPL; + + wqe->wqe_words[NES_IWARP_SQ_WQE_MISC_IDX] = cpu_to_le32(wqe_misc); + + ib_wr = ib_wr->next; + head++; + wqe_count++; + if (head >= qsize) + head = 0; + + } + + nesqp->hwqp.sq_head = head; + barrier(); + while (wqe_count) { + counter = min(wqe_count, ((u32)255)); + wqe_count -= counter; + nes_write32(nesdev->regs + NES_WQE_ALLOC, + (counter << 24) | 0x00800000 | nesqp->hwqp.qp_id); + } + + spin_unlock_irqrestore(&nesqp->lock, flags); + +out: + if (err) + *bad_wr = ib_wr; + return err; +} + + +/** + * nes_post_recv + */ +static int nes_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *ib_wr, + struct ib_recv_wr **bad_wr) +{ + u64 u64temp; + unsigned long flags = 0; + struct nes_vnic *nesvnic = to_nesvnic(ibqp->device); + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_qp *nesqp = to_nesqp(ibqp); + struct nes_hw_qp_wqe *wqe; + int err = 0; + int sge_index; + u32 qsize = nesqp->hwqp.rq_size; + u32 head; + u32 wqe_count = 0; + u32 counter; + u32 total_payload_length; + + if (nesqp->ibqp_state > IB_QPS_RTS) { + err = -EINVAL; + goto out; + } + + spin_lock_irqsave(&nesqp->lock, flags); + + head = nesqp->hwqp.rq_head; + + while (ib_wr) { + /* Check for QP error */ + if (nesqp->term_flags) { + err = -EINVAL; + break; + } + + if (ib_wr->num_sge > nesdev->nesadapter->max_sge) { + err = -EINVAL; + break; + } + /* Check for RQ overflow */ + if (((head + (2 * qsize) - nesqp->hwqp.rq_tail) % qsize) == (qsize - 1)) { + err = -ENOMEM; + break; + } + + nes_debug(NES_DBG_IW_RX, "ibwr sge count = %u.\n", ib_wr->num_sge); + wqe = &nesqp->hwqp.rq_vbase[head]; + + /* nes_debug(NES_DBG_IW_RX, "QP%u:processing rq wqe at %p, head = %u.\n", + nesqp->hwqp.qp_id, wqe, head); */ + nes_fill_init_qp_wqe(wqe, nesqp, head); + u64temp = (u64)(ib_wr->wr_id); + set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_COMP_SCRATCH_LOW_IDX, + u64temp); + total_payload_length = 0; + for (sge_index=0; sge_index < ib_wr->num_sge; sge_index++) { + set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_RQ_WQE_FRAG0_LOW_IDX+(sge_index*4), + ib_wr->sg_list[sge_index].addr); + set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_RQ_WQE_LENGTH0_IDX+(sge_index*4), + ib_wr->sg_list[sge_index].length); + set_wqe_32bit_value(wqe->wqe_words,NES_IWARP_RQ_WQE_STAG0_IDX+(sge_index*4), + ib_wr->sg_list[sge_index].lkey); + + total_payload_length += ib_wr->sg_list[sge_index].length; + } + set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_RQ_WQE_TOTAL_PAYLOAD_IDX, + total_payload_length); + + ib_wr = ib_wr->next; + head++; + wqe_count++; + if (head >= qsize) + head = 0; + } + + nesqp->hwqp.rq_head = head; + barrier(); + while (wqe_count) { + counter = min(wqe_count, ((u32)255)); + wqe_count -= counter; + nes_write32(nesdev->regs+NES_WQE_ALLOC, (counter<<24) | nesqp->hwqp.qp_id); + } + + spin_unlock_irqrestore(&nesqp->lock, flags); + +out: + if (err) + *bad_wr = ib_wr; + return err; +} + + +/** + * nes_poll_cq + */ +static int nes_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry) +{ + u64 u64temp; + u64 wrid; + unsigned long flags = 0; + struct nes_vnic *nesvnic = to_nesvnic(ibcq->device); + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_cq *nescq = to_nescq(ibcq); + struct nes_qp *nesqp; + struct nes_hw_cqe cqe; + u32 head; + u32 wq_tail = 0; + u32 cq_size; + u32 cqe_count = 0; + u32 wqe_index; + u32 u32temp; + u32 move_cq_head = 1; + u32 err_code; + + nes_debug(NES_DBG_CQ, "\n"); + + spin_lock_irqsave(&nescq->lock, flags); + + head = nescq->hw_cq.cq_head; + cq_size = nescq->hw_cq.cq_size; + + while (cqe_count < num_entries) { + if ((le32_to_cpu(nescq->hw_cq.cq_vbase[head].cqe_words[NES_CQE_OPCODE_IDX]) & + NES_CQE_VALID) == 0) + break; + + /* + * Make sure we read CQ entry contents *after* + * we've checked the valid bit. + */ + rmb(); + + cqe = nescq->hw_cq.cq_vbase[head]; + u32temp = le32_to_cpu(cqe.cqe_words[NES_CQE_COMP_COMP_CTX_LOW_IDX]); + wqe_index = u32temp & (nesdev->nesadapter->max_qp_wr - 1); + u32temp &= ~(NES_SW_CONTEXT_ALIGN-1); + /* parse CQE, get completion context from WQE (either rq or sq) */ + u64temp = (((u64)(le32_to_cpu(cqe.cqe_words[NES_CQE_COMP_COMP_CTX_HIGH_IDX])))<<32) | + ((u64)u32temp); + + if (u64temp) { + nesqp = (struct nes_qp *)(unsigned long)u64temp; + memset(entry, 0, sizeof *entry); + if (cqe.cqe_words[NES_CQE_ERROR_CODE_IDX] == 0) { + entry->status = IB_WC_SUCCESS; + } else { + err_code = le32_to_cpu(cqe.cqe_words[NES_CQE_ERROR_CODE_IDX]); + if (NES_IWARP_CQE_MAJOR_DRV == (err_code >> 16)) { + entry->status = err_code & 0x0000ffff; + + /* The rest of the cqe's will be marked as flushed */ + nescq->hw_cq.cq_vbase[head].cqe_words[NES_CQE_ERROR_CODE_IDX] = + cpu_to_le32((NES_IWARP_CQE_MAJOR_FLUSH << 16) | + NES_IWARP_CQE_MINOR_FLUSH); + } else + entry->status = IB_WC_WR_FLUSH_ERR; + } + + entry->qp = &nesqp->ibqp; + entry->src_qp = nesqp->hwqp.qp_id; + + if (le32_to_cpu(cqe.cqe_words[NES_CQE_OPCODE_IDX]) & NES_CQE_SQ) { + if (nesqp->skip_lsmm) { + nesqp->skip_lsmm = 0; + nesqp->hwqp.sq_tail++; + } + + /* Working on a SQ Completion*/ + wrid = (((u64)(cpu_to_le32((u32)nesqp->hwqp.sq_vbase[wqe_index]. + wqe_words[NES_IWARP_SQ_WQE_COMP_SCRATCH_HIGH_IDX]))) << 32) | + ((u64)(cpu_to_le32((u32)nesqp->hwqp.sq_vbase[wqe_index]. + wqe_words[NES_IWARP_SQ_WQE_COMP_SCRATCH_LOW_IDX]))); + entry->byte_len = le32_to_cpu(nesqp->hwqp.sq_vbase[wqe_index]. + wqe_words[NES_IWARP_SQ_WQE_TOTAL_PAYLOAD_IDX]); + + switch (le32_to_cpu(nesqp->hwqp.sq_vbase[wqe_index]. + wqe_words[NES_IWARP_SQ_WQE_MISC_IDX]) & 0x3f) { + case NES_IWARP_SQ_OP_RDMAW: + nes_debug(NES_DBG_CQ, "Operation = RDMA WRITE.\n"); + entry->opcode = IB_WC_RDMA_WRITE; + break; + case NES_IWARP_SQ_OP_RDMAR: + nes_debug(NES_DBG_CQ, "Operation = RDMA READ.\n"); + entry->opcode = IB_WC_RDMA_READ; + entry->byte_len = le32_to_cpu(nesqp->hwqp.sq_vbase[wqe_index]. + wqe_words[NES_IWARP_SQ_WQE_RDMA_LENGTH_IDX]); + break; + case NES_IWARP_SQ_OP_SENDINV: + case NES_IWARP_SQ_OP_SENDSEINV: + case NES_IWARP_SQ_OP_SEND: + case NES_IWARP_SQ_OP_SENDSE: + nes_debug(NES_DBG_CQ, "Operation = Send.\n"); + entry->opcode = IB_WC_SEND; + break; + case NES_IWARP_SQ_OP_LOCINV: + entry->opcode = IB_WC_LOCAL_INV; + break; + case NES_IWARP_SQ_OP_FAST_REG: + entry->opcode = IB_WC_FAST_REG_MR; + break; + } + + nesqp->hwqp.sq_tail = (wqe_index+1)&(nesqp->hwqp.sq_size - 1); + if ((entry->status != IB_WC_SUCCESS) && (nesqp->hwqp.sq_tail != nesqp->hwqp.sq_head)) { + move_cq_head = 0; + wq_tail = nesqp->hwqp.sq_tail; + } + } else { + /* Working on a RQ Completion*/ + entry->byte_len = le32_to_cpu(cqe.cqe_words[NES_CQE_PAYLOAD_LENGTH_IDX]); + wrid = ((u64)(le32_to_cpu(nesqp->hwqp.rq_vbase[wqe_index].wqe_words[NES_IWARP_RQ_WQE_COMP_SCRATCH_LOW_IDX]))) | + ((u64)(le32_to_cpu(nesqp->hwqp.rq_vbase[wqe_index].wqe_words[NES_IWARP_RQ_WQE_COMP_SCRATCH_HIGH_IDX]))<<32); + entry->opcode = IB_WC_RECV; + + nesqp->hwqp.rq_tail = (wqe_index+1)&(nesqp->hwqp.rq_size - 1); + if ((entry->status != IB_WC_SUCCESS) && (nesqp->hwqp.rq_tail != nesqp->hwqp.rq_head)) { + move_cq_head = 0; + wq_tail = nesqp->hwqp.rq_tail; + } + } + + entry->wr_id = wrid; + entry++; + cqe_count++; + } + + if (move_cq_head) { + nescq->hw_cq.cq_vbase[head].cqe_words[NES_CQE_OPCODE_IDX] = 0; + if (++head >= cq_size) + head = 0; + nescq->polled_completions++; + + if ((nescq->polled_completions > (cq_size / 2)) || + (nescq->polled_completions == 255)) { + nes_debug(NES_DBG_CQ, "CQ%u Issuing CQE Allocate since more than half of cqes" + " are pending %u of %u.\n", + nescq->hw_cq.cq_number, nescq->polled_completions, cq_size); + nes_write32(nesdev->regs+NES_CQE_ALLOC, + nescq->hw_cq.cq_number | (nescq->polled_completions << 16)); + nescq->polled_completions = 0; + } + } else { + /* Update the wqe index and set status to flush */ + wqe_index = le32_to_cpu(cqe.cqe_words[NES_CQE_COMP_COMP_CTX_LOW_IDX]); + wqe_index = (wqe_index & (~(nesdev->nesadapter->max_qp_wr - 1))) | wq_tail; + nescq->hw_cq.cq_vbase[head].cqe_words[NES_CQE_COMP_COMP_CTX_LOW_IDX] = + cpu_to_le32(wqe_index); + move_cq_head = 1; /* ready for next pass */ + } + } + + if (nescq->polled_completions) { + nes_write32(nesdev->regs+NES_CQE_ALLOC, + nescq->hw_cq.cq_number | (nescq->polled_completions << 16)); + nescq->polled_completions = 0; + } + + nescq->hw_cq.cq_head = head; + nes_debug(NES_DBG_CQ, "Reporting %u completions for CQ%u.\n", + cqe_count, nescq->hw_cq.cq_number); + + spin_unlock_irqrestore(&nescq->lock, flags); + + return cqe_count; +} + + +/** + * nes_req_notify_cq + */ +static int nes_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags notify_flags) + { + struct nes_vnic *nesvnic = to_nesvnic(ibcq->device); + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_cq *nescq = to_nescq(ibcq); + u32 cq_arm; + + nes_debug(NES_DBG_CQ, "Requesting notification for CQ%u.\n", + nescq->hw_cq.cq_number); + + cq_arm = nescq->hw_cq.cq_number; + if ((notify_flags & IB_CQ_SOLICITED_MASK) == IB_CQ_NEXT_COMP) + cq_arm |= NES_CQE_ALLOC_NOTIFY_NEXT; + else if ((notify_flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED) + cq_arm |= NES_CQE_ALLOC_NOTIFY_SE; + else + return -EINVAL; + + nes_write32(nesdev->regs+NES_CQE_ALLOC, cq_arm); + nes_read32(nesdev->regs+NES_CQE_ALLOC); + + return 0; +} + + +/** + * nes_init_ofa_device + */ +struct nes_ib_device *nes_init_ofa_device(struct net_device *netdev) +{ + struct nes_ib_device *nesibdev; + struct nes_vnic *nesvnic = netdev_priv(netdev); + struct nes_device *nesdev = nesvnic->nesdev; + + nesibdev = (struct nes_ib_device *)ib_alloc_device(sizeof(struct nes_ib_device)); + if (nesibdev == NULL) { + return NULL; + } + strlcpy(nesibdev->ibdev.name, "nes%d", IB_DEVICE_NAME_MAX); + nesibdev->ibdev.owner = THIS_MODULE; + + nesibdev->ibdev.node_type = RDMA_NODE_RNIC; + memset(&nesibdev->ibdev.node_guid, 0, sizeof(nesibdev->ibdev.node_guid)); + memcpy(&nesibdev->ibdev.node_guid, netdev->dev_addr, 6); + + nesibdev->ibdev.uverbs_cmd_mask = + (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | + (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) | + (1ull << IB_USER_VERBS_CMD_QUERY_PORT) | + (1ull << IB_USER_VERBS_CMD_ALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_REG_MR) | + (1ull << IB_USER_VERBS_CMD_DEREG_MR) | + (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | + (1ull << IB_USER_VERBS_CMD_CREATE_CQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) | + (1ull << IB_USER_VERBS_CMD_CREATE_AH) | + (1ull << IB_USER_VERBS_CMD_DESTROY_AH) | + (1ull << IB_USER_VERBS_CMD_REQ_NOTIFY_CQ) | + (1ull << IB_USER_VERBS_CMD_CREATE_QP) | + (1ull << IB_USER_VERBS_CMD_MODIFY_QP) | + (1ull << IB_USER_VERBS_CMD_POLL_CQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_QP) | + (1ull << IB_USER_VERBS_CMD_ALLOC_MW) | + (1ull << IB_USER_VERBS_CMD_BIND_MW) | + (1ull << IB_USER_VERBS_CMD_DEALLOC_MW) | + (1ull << IB_USER_VERBS_CMD_POST_RECV) | + (1ull << IB_USER_VERBS_CMD_POST_SEND); + + nesibdev->ibdev.phys_port_cnt = 1; + nesibdev->ibdev.num_comp_vectors = 1; + nesibdev->ibdev.dma_device = &nesdev->pcidev->dev; + nesibdev->ibdev.dev.parent = &nesdev->pcidev->dev; + nesibdev->ibdev.query_device = nes_query_device; + nesibdev->ibdev.query_port = nes_query_port; + nesibdev->ibdev.query_pkey = nes_query_pkey; + nesibdev->ibdev.query_gid = nes_query_gid; + nesibdev->ibdev.alloc_ucontext = nes_alloc_ucontext; + nesibdev->ibdev.dealloc_ucontext = nes_dealloc_ucontext; + nesibdev->ibdev.mmap = nes_mmap; + nesibdev->ibdev.alloc_pd = nes_alloc_pd; + nesibdev->ibdev.dealloc_pd = nes_dealloc_pd; + nesibdev->ibdev.create_ah = nes_create_ah; + nesibdev->ibdev.destroy_ah = nes_destroy_ah; + nesibdev->ibdev.create_qp = nes_create_qp; + nesibdev->ibdev.modify_qp = nes_modify_qp; + nesibdev->ibdev.query_qp = nes_query_qp; + nesibdev->ibdev.destroy_qp = nes_destroy_qp; + nesibdev->ibdev.create_cq = nes_create_cq; + nesibdev->ibdev.destroy_cq = nes_destroy_cq; + nesibdev->ibdev.poll_cq = nes_poll_cq; + nesibdev->ibdev.get_dma_mr = nes_get_dma_mr; + nesibdev->ibdev.reg_phys_mr = nes_reg_phys_mr; + nesibdev->ibdev.reg_user_mr = nes_reg_user_mr; + nesibdev->ibdev.dereg_mr = nes_dereg_mr; + nesibdev->ibdev.alloc_mw = nes_alloc_mw; + nesibdev->ibdev.dealloc_mw = nes_dealloc_mw; + nesibdev->ibdev.bind_mw = nes_bind_mw; + + nesibdev->ibdev.alloc_fast_reg_mr = nes_alloc_fast_reg_mr; + nesibdev->ibdev.alloc_fast_reg_page_list = nes_alloc_fast_reg_page_list; + nesibdev->ibdev.free_fast_reg_page_list = nes_free_fast_reg_page_list; + + nesibdev->ibdev.attach_mcast = nes_multicast_attach; + nesibdev->ibdev.detach_mcast = nes_multicast_detach; + nesibdev->ibdev.process_mad = nes_process_mad; + + nesibdev->ibdev.req_notify_cq = nes_req_notify_cq; + nesibdev->ibdev.post_send = nes_post_send; + nesibdev->ibdev.post_recv = nes_post_recv; + + nesibdev->ibdev.iwcm = kzalloc(sizeof(*nesibdev->ibdev.iwcm), GFP_KERNEL); + if (nesibdev->ibdev.iwcm == NULL) { + ib_dealloc_device(&nesibdev->ibdev); + return NULL; + } + nesibdev->ibdev.iwcm->add_ref = nes_add_ref; + nesibdev->ibdev.iwcm->rem_ref = nes_rem_ref; + nesibdev->ibdev.iwcm->get_qp = nes_get_qp; + nesibdev->ibdev.iwcm->connect = nes_connect; + nesibdev->ibdev.iwcm->accept = nes_accept; + nesibdev->ibdev.iwcm->reject = nes_reject; + nesibdev->ibdev.iwcm->create_listen = nes_create_listen; + nesibdev->ibdev.iwcm->destroy_listen = nes_destroy_listen; + + return nesibdev; +} + + +/** + * nes_handle_delayed_event + */ +static void nes_handle_delayed_event(unsigned long data) +{ + struct nes_vnic *nesvnic = (void *) data; + + if (nesvnic->delayed_event != nesvnic->last_dispatched_event) { + struct ib_event event; + + event.device = &nesvnic->nesibdev->ibdev; + if (!event.device) + goto stop_timer; + event.event = nesvnic->delayed_event; + event.element.port_num = nesvnic->logical_port + 1; + ib_dispatch_event(&event); + } + +stop_timer: + nesvnic->event_timer.function = NULL; +} + + +void nes_port_ibevent(struct nes_vnic *nesvnic) +{ + struct nes_ib_device *nesibdev = nesvnic->nesibdev; + struct nes_device *nesdev = nesvnic->nesdev; + struct ib_event event; + event.device = &nesibdev->ibdev; + event.element.port_num = nesvnic->logical_port + 1; + event.event = nesdev->iw_status ? IB_EVENT_PORT_ACTIVE : IB_EVENT_PORT_ERR; + + if (!nesvnic->event_timer.function) { + ib_dispatch_event(&event); + nesvnic->last_dispatched_event = event.event; + nesvnic->event_timer.function = nes_handle_delayed_event; + nesvnic->event_timer.data = (unsigned long) nesvnic; + nesvnic->event_timer.expires = jiffies + NES_EVENT_DELAY; + add_timer(&nesvnic->event_timer); + } else { + mod_timer(&nesvnic->event_timer, jiffies + NES_EVENT_DELAY); + } + nesvnic->delayed_event = event.event; +} + + +/** + * nes_destroy_ofa_device + */ +void nes_destroy_ofa_device(struct nes_ib_device *nesibdev) +{ + if (nesibdev == NULL) + return; + + nes_unregister_ofa_device(nesibdev); + + kfree(nesibdev->ibdev.iwcm); + ib_dealloc_device(&nesibdev->ibdev); +} + + +/** + * nes_register_ofa_device + */ +int nes_register_ofa_device(struct nes_ib_device *nesibdev) +{ + struct nes_vnic *nesvnic = nesibdev->nesvnic; + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_adapter *nesadapter = nesdev->nesadapter; + int i, ret; + + ret = ib_register_device(&nesvnic->nesibdev->ibdev, NULL); + if (ret) { + return ret; + } + + /* Get the resources allocated to this device */ + nesibdev->max_cq = (nesadapter->max_cq-NES_FIRST_QPN) / nesadapter->port_count; + nesibdev->max_mr = nesadapter->max_mr / nesadapter->port_count; + nesibdev->max_qp = (nesadapter->max_qp-NES_FIRST_QPN) / nesadapter->port_count; + nesibdev->max_pd = nesadapter->max_pd / nesadapter->port_count; + + for (i = 0; i < ARRAY_SIZE(nes_dev_attributes); ++i) { + ret = device_create_file(&nesibdev->ibdev.dev, nes_dev_attributes[i]); + if (ret) { + while (i > 0) { + i--; + device_remove_file(&nesibdev->ibdev.dev, + nes_dev_attributes[i]); + } + ib_unregister_device(&nesibdev->ibdev); + return ret; + } + } + + nesvnic->of_device_registered = 1; + + return 0; +} + + +/** + * nes_unregister_ofa_device + */ +static void nes_unregister_ofa_device(struct nes_ib_device *nesibdev) +{ + struct nes_vnic *nesvnic = nesibdev->nesvnic; + int i; + + for (i = 0; i < ARRAY_SIZE(nes_dev_attributes); ++i) { + device_remove_file(&nesibdev->ibdev.dev, nes_dev_attributes[i]); + } + + if (nesvnic->of_device_registered) { + ib_unregister_device(&nesibdev->ibdev); + } + + nesvnic->of_device_registered = 0; +} diff --git a/drivers/infiniband/hw/nes/nes_verbs.h b/drivers/infiniband/hw/nes/nes_verbs.h new file mode 100644 index 0000000..309b31c --- /dev/null +++ b/drivers/infiniband/hw/nes/nes_verbs.h @@ -0,0 +1,189 @@ +/* + * Copyright (c) 2006 - 2011 Intel Corporation. All rights reserved. + * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef NES_VERBS_H +#define NES_VERBS_H + +struct nes_device; + +#define NES_MAX_USER_DB_REGIONS 4096 +#define NES_MAX_USER_WQ_REGIONS 4096 + +#define NES_TERM_SENT 0x01 +#define NES_TERM_RCVD 0x02 +#define NES_TERM_DONE 0x04 + +struct nes_ucontext { + struct ib_ucontext ibucontext; + struct nes_device *nesdev; + unsigned long mmap_wq_offset; + unsigned long mmap_cq_offset; /* to be removed */ + int index; /* rnic index (minor) */ + unsigned long allocated_doorbells[BITS_TO_LONGS(NES_MAX_USER_DB_REGIONS)]; + u16 mmap_db_index[NES_MAX_USER_DB_REGIONS]; + u16 first_free_db; + unsigned long allocated_wqs[BITS_TO_LONGS(NES_MAX_USER_WQ_REGIONS)]; + struct nes_qp *mmap_nesqp[NES_MAX_USER_WQ_REGIONS]; + u16 first_free_wq; + struct list_head cq_reg_mem_list; + struct list_head qp_reg_mem_list; + u32 mcrqf; + atomic_t usecnt; +}; + +struct nes_pd { + struct ib_pd ibpd; + u16 pd_id; + atomic_t sqp_count; + u16 mmap_db_index; +}; + +struct nes_mr { + union { + struct ib_mr ibmr; + struct ib_mw ibmw; + struct ib_fmr ibfmr; + }; + struct ib_umem *region; + u16 pbls_used; + u8 mode; + u8 pbl_4k; +}; + +struct nes_hw_pb { + __le32 pa_low; + __le32 pa_high; +}; + +struct nes_vpbl { + dma_addr_t pbl_pbase; + struct nes_hw_pb *pbl_vbase; +}; + +struct nes_root_vpbl { + dma_addr_t pbl_pbase; + struct nes_hw_pb *pbl_vbase; + struct nes_vpbl *leaf_vpbl; +}; + +struct nes_fmr { + struct nes_mr nesmr; + u32 leaf_pbl_cnt; + struct nes_root_vpbl root_vpbl; + struct ib_qp *ib_qp; + int access_rights; + struct ib_fmr_attr attr; +}; + +struct nes_av; + +struct nes_cq { + struct ib_cq ibcq; + struct nes_hw_cq hw_cq; + u32 polled_completions; + u32 cq_mem_size; + spinlock_t lock; + u8 virtual_cq; + u8 pad[3]; + u32 mcrqf; +}; + +struct nes_wq { + spinlock_t lock; +}; + +struct disconn_work { + struct work_struct work; + struct nes_qp *nesqp; +}; + +struct iw_cm_id; +struct ietf_mpa_frame; + +struct nes_qp { + struct ib_qp ibqp; + void *allocated_buffer; + struct iw_cm_id *cm_id; + struct nes_cq *nesscq; + struct nes_cq *nesrcq; + struct nes_pd *nespd; + void *cm_node; /* handle of the node this QP is associated with */ + void *ietf_frame; + u8 ietf_frame_size; + dma_addr_t ietf_frame_pbase; + struct ib_mr *lsmm_mr; + struct nes_hw_qp hwqp; + struct work_struct work; + enum ib_qp_state ibqp_state; + u32 iwarp_state; + u32 hte_index; + u32 last_aeq; + u32 qp_mem_size; + atomic_t refcount; + atomic_t close_timer_started; + u32 mmap_sq_db_index; + u32 mmap_rq_db_index; + spinlock_t lock; + spinlock_t pau_lock; + struct nes_qp_context *nesqp_context; + dma_addr_t nesqp_context_pbase; + void *pbl_vbase; + dma_addr_t pbl_pbase; + struct page *page; + struct timer_list terminate_timer; + enum ib_event_type terminate_eventtype; + struct sk_buff_head pau_list; + u32 pau_rcv_nxt; + u16 active_conn:1; + u16 skip_lsmm:1; + u16 user_mode:1; + u16 hte_added:1; + u16 flush_issued:1; + u16 destroyed:1; + u16 sig_all:1; + u16 pau_mode:1; + u16 rsvd:8; + u16 private_data_len; + u16 term_sq_flush_code; + u16 term_rq_flush_code; + u8 hw_iwarp_state; + u8 hw_tcp_state; + u8 term_flags; + u8 sq_kmapped; + u8 pau_busy; + u8 pau_pending; + u8 pau_state; + __u64 nesuqp_addr; +}; +#endif /* NES_VERBS_H */ diff --git a/drivers/infiniband/hw/ocrdma/Kconfig b/drivers/infiniband/hw/ocrdma/Kconfig new file mode 100644 index 0000000..c0cddc0 --- /dev/null +++ b/drivers/infiniband/hw/ocrdma/Kconfig @@ -0,0 +1,8 @@ +config INFINIBAND_OCRDMA + tristate "Emulex One Connect HCA support" + depends on ETHERNET && NETDEVICES && PCI && INET && (IPV6 || IPV6=n) + select NET_VENDOR_EMULEX + select BE2NET + ---help--- + This driver provides low-level InfiniBand over Ethernet + support for Emulex One Connect host channel adapters (HCAs). diff --git a/drivers/infiniband/hw/ocrdma/Makefile b/drivers/infiniband/hw/ocrdma/Makefile new file mode 100644 index 0000000..d1bfd4f --- /dev/null +++ b/drivers/infiniband/hw/ocrdma/Makefile @@ -0,0 +1,5 @@ +ccflags-y := -Idrivers/net/ethernet/emulex/benet + +obj-$(CONFIG_INFINIBAND_OCRDMA) += ocrdma.o + +ocrdma-y := ocrdma_main.o ocrdma_verbs.o ocrdma_hw.o ocrdma_ah.o ocrdma_stats.o diff --git a/drivers/infiniband/hw/ocrdma/ocrdma.h b/drivers/infiniband/hw/ocrdma/ocrdma.h new file mode 100644 index 0000000..b43456a --- /dev/null +++ b/drivers/infiniband/hw/ocrdma/ocrdma.h @@ -0,0 +1,543 @@ +/******************************************************************* + * This file is part of the Emulex RoCE Device Driver for * + * RoCE (RDMA over Converged Ethernet) adapters. * + * Copyright (C) 2008-2012 Emulex. All rights reserved. * + * EMULEX and SLI are trademarks of Emulex. * + * www.emulex.com * + * * + * This program is free software; you can redistribute it and/or * + * modify it under the terms of version 2 of the GNU General * + * Public License as published by the Free Software Foundation. * + * This program is distributed in the hope that it will be useful. * + * ALL EXPRESS OR IMPLIED CONDITIONS, REPRESENTATIONS AND * + * WARRANTIES, INCLUDING ANY IMPLIED WARRANTY OF MERCHANTABILITY, * + * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT, ARE * + * DISCLAIMED, EXCEPT TO THE EXTENT THAT SUCH DISCLAIMERS ARE HELD * + * TO BE LEGALLY INVALID. See the GNU General Public License for * + * more details, a copy of which can be found in the file COPYING * + * included with this package. * + * + * Contact Information: + * linux-drivers@emulex.com + * + * Emulex + * 3333 Susan Street + * Costa Mesa, CA 92626 + *******************************************************************/ + +#ifndef __OCRDMA_H__ +#define __OCRDMA_H__ + +#include +#include +#include +#include + +#include +#include +#include + +#include +#include "ocrdma_sli.h" + +#define OCRDMA_ROCE_DRV_VERSION "10.2.287.0u" + +#define OCRDMA_ROCE_DRV_DESC "Emulex OneConnect RoCE Driver" +#define OCRDMA_NODE_DESC "Emulex OneConnect RoCE HCA" + +#define OC_NAME_SH OCRDMA_NODE_DESC "(Skyhawk)" +#define OC_NAME_UNKNOWN OCRDMA_NODE_DESC "(Unknown)" + +#define OC_SKH_DEVICE_PF 0x720 +#define OC_SKH_DEVICE_VF 0x728 +#define OCRDMA_MAX_AH 512 + +#define OCRDMA_UVERBS(CMD_NAME) (1ull << IB_USER_VERBS_CMD_##CMD_NAME) + +#define convert_to_64bit(lo, hi) ((u64)hi << 32 | (u64)lo) + +struct ocrdma_dev_attr { + u8 fw_ver[32]; + u32 vendor_id; + u32 device_id; + u16 max_pd; + u16 max_cq; + u16 max_cqe; + u16 max_qp; + u16 max_wqe; + u16 max_rqe; + u16 max_srq; + u32 max_inline_data; + int max_send_sge; + int max_recv_sge; + int max_srq_sge; + int max_rdma_sge; + int max_mr; + u64 max_mr_size; + u32 max_num_mr_pbl; + int max_mw; + int max_fmr; + int max_map_per_fmr; + int max_pages_per_frmr; + u16 max_ord_per_qp; + u16 max_ird_per_qp; + + int device_cap_flags; + u8 cq_overflow_detect; + u8 srq_supported; + + u32 wqe_size; + u32 rqe_size; + u32 ird_page_size; + u8 local_ca_ack_delay; + u8 ird; + u8 num_ird_pages; +}; + +struct ocrdma_dma_mem { + void *va; + dma_addr_t pa; + u32 size; +}; + +struct ocrdma_pbl { + void *va; + dma_addr_t pa; +}; + +struct ocrdma_queue_info { + void *va; + dma_addr_t dma; + u32 size; + u16 len; + u16 entry_size; /* Size of an element in the queue */ + u16 id; /* qid, where to ring the doorbell. */ + u16 head, tail; + bool created; +}; + +struct ocrdma_eq { + struct ocrdma_queue_info q; + u32 vector; + int cq_cnt; + struct ocrdma_dev *dev; + char irq_name[32]; +}; + +struct ocrdma_mq { + struct ocrdma_queue_info sq; + struct ocrdma_queue_info cq; + bool rearm_cq; +}; + +struct mqe_ctx { + struct mutex lock; /* for serializing mailbox commands on MQ */ + wait_queue_head_t cmd_wait; + u32 tag; + u16 cqe_status; + u16 ext_status; + bool cmd_done; + bool fw_error_state; +}; + +struct ocrdma_hw_mr { + u32 lkey; + u8 fr_mr; + u8 remote_atomic; + u8 remote_rd; + u8 remote_wr; + u8 local_rd; + u8 local_wr; + u8 mw_bind; + u8 rsvd; + u64 len; + struct ocrdma_pbl *pbl_table; + u32 num_pbls; + u32 num_pbes; + u32 pbl_size; + u32 pbe_size; + u64 fbo; + u64 va; +}; + +struct ocrdma_mr { + struct ib_mr ibmr; + struct ib_umem *umem; + struct ocrdma_hw_mr hwmr; +}; + +struct ocrdma_stats { + u8 type; + struct ocrdma_dev *dev; +}; + +struct stats_mem { + struct ocrdma_mqe mqe; + void *va; + dma_addr_t pa; + u32 size; + char *debugfs_mem; +}; + +struct phy_info { + u16 auto_speeds_supported; + u16 fixed_speeds_supported; + u16 phy_type; + u16 interface_type; +}; + +struct ocrdma_dev { + struct ib_device ibdev; + struct ocrdma_dev_attr attr; + + struct mutex dev_lock; /* provides syncronise access to device data */ + spinlock_t flush_q_lock ____cacheline_aligned; + + struct ocrdma_cq **cq_tbl; + struct ocrdma_qp **qp_tbl; + + struct ocrdma_eq *eq_tbl; + int eq_cnt; + u16 base_eqid; + u16 max_eq; + + union ib_gid *sgid_tbl; + /* provided synchronization to sgid table for + * updating gid entries triggered by notifier. + */ + spinlock_t sgid_lock; + + int gsi_qp_created; + struct ocrdma_cq *gsi_sqcq; + struct ocrdma_cq *gsi_rqcq; + + struct { + struct ocrdma_av *va; + dma_addr_t pa; + u32 size; + u32 num_ah; + /* provide synchronization for av + * entry allocations. + */ + spinlock_t lock; + u32 ahid; + struct ocrdma_pbl pbl; + } av_tbl; + + void *mbx_cmd; + struct ocrdma_mq mq; + struct mqe_ctx mqe_ctx; + + struct be_dev_info nic_info; + struct phy_info phy; + char model_number[32]; + u32 hba_port_num; + + struct list_head entry; + struct rcu_head rcu; + int id; + u64 *stag_arr; + u8 sl; /* service level */ + bool pfc_state; + atomic_t update_sl; + u16 pvid; + u32 asic_id; + + ulong last_stats_time; + struct mutex stats_lock; /* provide synch for debugfs operations */ + struct stats_mem stats_mem; + struct ocrdma_stats rsrc_stats; + struct ocrdma_stats rx_stats; + struct ocrdma_stats wqe_stats; + struct ocrdma_stats tx_stats; + struct ocrdma_stats db_err_stats; + struct ocrdma_stats tx_qp_err_stats; + struct ocrdma_stats rx_qp_err_stats; + struct ocrdma_stats tx_dbg_stats; + struct ocrdma_stats rx_dbg_stats; + struct dentry *dir; +}; + +struct ocrdma_cq { + struct ib_cq ibcq; + struct ocrdma_cqe *va; + u32 phase; + u32 getp; /* pointer to pending wrs to + * return to stack, wrap arounds + * at max_hw_cqe + */ + u32 max_hw_cqe; + bool phase_change; + bool deferred_arm, deferred_sol; + bool first_arm; + + spinlock_t cq_lock ____cacheline_aligned; /* provide synchronization + * to cq polling + */ + /* syncronizes cq completion handler invoked from multiple context */ + spinlock_t comp_handler_lock ____cacheline_aligned; + u16 id; + u16 eqn; + + struct ocrdma_ucontext *ucontext; + dma_addr_t pa; + u32 len; + u32 cqe_cnt; + + /* head of all qp's sq and rq for which cqes need to be flushed + * by the software. + */ + struct list_head sq_head, rq_head; +}; + +struct ocrdma_pd { + struct ib_pd ibpd; + struct ocrdma_ucontext *uctx; + u32 id; + int num_dpp_qp; + u32 dpp_page; + bool dpp_enabled; +}; + +struct ocrdma_ah { + struct ib_ah ibah; + struct ocrdma_av *av; + u16 sgid_index; + u32 id; +}; + +struct ocrdma_qp_hwq_info { + u8 *va; /* virtual address */ + u32 max_sges; + u32 head, tail; + u32 entry_size; + u32 max_cnt; + u32 max_wqe_idx; + u16 dbid; /* qid, where to ring the doorbell. */ + u32 len; + dma_addr_t pa; +}; + +struct ocrdma_srq { + struct ib_srq ibsrq; + u8 __iomem *db; + struct ocrdma_qp_hwq_info rq; + u64 *rqe_wr_id_tbl; + u32 *idx_bit_fields; + u32 bit_fields_len; + + /* provide synchronization to multiple context(s) posting rqe */ + spinlock_t q_lock ____cacheline_aligned; + + struct ocrdma_pd *pd; + u32 id; +}; + +struct ocrdma_qp { + struct ib_qp ibqp; + struct ocrdma_dev *dev; + + u8 __iomem *sq_db; + struct ocrdma_qp_hwq_info sq; + struct { + uint64_t wrid; + uint16_t dpp_wqe_idx; + uint16_t dpp_wqe; + uint8_t signaled; + uint8_t rsvd[3]; + } *wqe_wr_id_tbl; + u32 max_inline_data; + + /* provide synchronization to multiple context(s) posting wqe, rqe */ + spinlock_t q_lock ____cacheline_aligned; + struct ocrdma_cq *sq_cq; + /* list maintained per CQ to flush SQ errors */ + struct list_head sq_entry; + + u8 __iomem *rq_db; + struct ocrdma_qp_hwq_info rq; + u64 *rqe_wr_id_tbl; + struct ocrdma_cq *rq_cq; + struct ocrdma_srq *srq; + /* list maintained per CQ to flush RQ errors */ + struct list_head rq_entry; + + enum ocrdma_qp_state state; /* QP state */ + int cap_flags; + u32 max_ord, max_ird; + + u32 id; + struct ocrdma_pd *pd; + + enum ib_qp_type qp_type; + + int sgid_idx; + u32 qkey; + bool dpp_enabled; + u8 *ird_q_va; + bool signaled; +}; + +struct ocrdma_ucontext { + struct ib_ucontext ibucontext; + + struct list_head mm_head; + struct mutex mm_list_lock; /* protects list entries of mm type */ + struct ocrdma_pd *cntxt_pd; + int pd_in_use; + + struct { + u32 *va; + dma_addr_t pa; + u32 len; + } ah_tbl; +}; + +struct ocrdma_mm { + struct { + u64 phy_addr; + unsigned long len; + } key; + struct list_head entry; +}; + +static inline struct ocrdma_dev *get_ocrdma_dev(struct ib_device *ibdev) +{ + return container_of(ibdev, struct ocrdma_dev, ibdev); +} + +static inline struct ocrdma_ucontext *get_ocrdma_ucontext(struct ib_ucontext + *ibucontext) +{ + return container_of(ibucontext, struct ocrdma_ucontext, ibucontext); +} + +static inline struct ocrdma_pd *get_ocrdma_pd(struct ib_pd *ibpd) +{ + return container_of(ibpd, struct ocrdma_pd, ibpd); +} + +static inline struct ocrdma_cq *get_ocrdma_cq(struct ib_cq *ibcq) +{ + return container_of(ibcq, struct ocrdma_cq, ibcq); +} + +static inline struct ocrdma_qp *get_ocrdma_qp(struct ib_qp *ibqp) +{ + return container_of(ibqp, struct ocrdma_qp, ibqp); +} + +static inline struct ocrdma_mr *get_ocrdma_mr(struct ib_mr *ibmr) +{ + return container_of(ibmr, struct ocrdma_mr, ibmr); +} + +static inline struct ocrdma_ah *get_ocrdma_ah(struct ib_ah *ibah) +{ + return container_of(ibah, struct ocrdma_ah, ibah); +} + +static inline struct ocrdma_srq *get_ocrdma_srq(struct ib_srq *ibsrq) +{ + return container_of(ibsrq, struct ocrdma_srq, ibsrq); +} + +static inline int is_cqe_valid(struct ocrdma_cq *cq, struct ocrdma_cqe *cqe) +{ + int cqe_valid; + cqe_valid = le32_to_cpu(cqe->flags_status_srcqpn) & OCRDMA_CQE_VALID; + return (cqe_valid == cq->phase); +} + +static inline int is_cqe_for_sq(struct ocrdma_cqe *cqe) +{ + return (le32_to_cpu(cqe->flags_status_srcqpn) & + OCRDMA_CQE_QTYPE) ? 0 : 1; +} + +static inline int is_cqe_invalidated(struct ocrdma_cqe *cqe) +{ + return (le32_to_cpu(cqe->flags_status_srcqpn) & + OCRDMA_CQE_INVALIDATE) ? 1 : 0; +} + +static inline int is_cqe_imm(struct ocrdma_cqe *cqe) +{ + return (le32_to_cpu(cqe->flags_status_srcqpn) & + OCRDMA_CQE_IMM) ? 1 : 0; +} + +static inline int is_cqe_wr_imm(struct ocrdma_cqe *cqe) +{ + return (le32_to_cpu(cqe->flags_status_srcqpn) & + OCRDMA_CQE_WRITE_IMM) ? 1 : 0; +} + +static inline int ocrdma_resolve_dmac(struct ocrdma_dev *dev, + struct ib_ah_attr *ah_attr, u8 *mac_addr) +{ + struct in6_addr in6; + + memcpy(&in6, ah_attr->grh.dgid.raw, sizeof(in6)); + if (rdma_is_multicast_addr(&in6)) + rdma_get_mcast_mac(&in6, mac_addr); + else + memcpy(mac_addr, ah_attr->dmac, ETH_ALEN); + return 0; +} + +static inline char *hca_name(struct ocrdma_dev *dev) +{ + switch (dev->nic_info.pdev->device) { + case OC_SKH_DEVICE_PF: + case OC_SKH_DEVICE_VF: + return OC_NAME_SH; + default: + return OC_NAME_UNKNOWN; + } +} + +static inline int ocrdma_get_eq_table_index(struct ocrdma_dev *dev, + int eqid) +{ + int indx; + + for (indx = 0; indx < dev->eq_cnt; indx++) { + if (dev->eq_tbl[indx].q.id == eqid) + return indx; + } + + return -EINVAL; +} + +static inline u8 ocrdma_get_asic_type(struct ocrdma_dev *dev) +{ + if (dev->nic_info.dev_family == 0xF && !dev->asic_id) { + pci_read_config_dword( + dev->nic_info.pdev, + OCRDMA_SLI_ASIC_ID_OFFSET, &dev->asic_id); + } + + return (dev->asic_id & OCRDMA_SLI_ASIC_GEN_NUM_MASK) >> + OCRDMA_SLI_ASIC_GEN_NUM_SHIFT; +} + +static inline u8 ocrdma_get_pfc_prio(u8 *pfc, u8 prio) +{ + return *(pfc + prio); +} + +static inline u8 ocrdma_get_app_prio(u8 *app_prio, u8 prio) +{ + return *(app_prio + prio); +} + +static inline u8 ocrdma_is_enabled_and_synced(u32 state) +{ /* May also be used to interpret TC-state, QCN-state + * Appl-state and Logical-link-state in future. + */ + return (state & OCRDMA_STATE_FLAG_ENABLED) && + (state & OCRDMA_STATE_FLAG_SYNC); +} + +#endif diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_abi.h b/drivers/infiniband/hw/ocrdma/ocrdma_abi.h new file mode 100644 index 0000000..1554cca --- /dev/null +++ b/drivers/infiniband/hw/ocrdma/ocrdma_abi.h @@ -0,0 +1,134 @@ +/******************************************************************* + * This file is part of the Emulex RoCE Device Driver for * + * RoCE (RDMA over Converged Ethernet) adapters. * + * Copyright (C) 2008-2012 Emulex. All rights reserved. * + * EMULEX and SLI are trademarks of Emulex. * + * www.emulex.com * + * * + * This program is free software; you can redistribute it and/or * + * modify it under the terms of version 2 of the GNU General * + * Public License as published by the Free Software Foundation. * + * This program is distributed in the hope that it will be useful. * + * ALL EXPRESS OR IMPLIED CONDITIONS, REPRESENTATIONS AND * + * WARRANTIES, INCLUDING ANY IMPLIED WARRANTY OF MERCHANTABILITY, * + * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT, ARE * + * DISCLAIMED, EXCEPT TO THE EXTENT THAT SUCH DISCLAIMERS ARE HELD * + * TO BE LEGALLY INVALID. See the GNU General Public License for * + * more details, a copy of which can be found in the file COPYING * + * included with this package. * + * + * Contact Information: + * linux-drivers@emulex.com + * + * Emulex + * 3333 Susan Street + * Costa Mesa, CA 92626 + *******************************************************************/ + +#ifndef __OCRDMA_ABI_H__ +#define __OCRDMA_ABI_H__ + +#define OCRDMA_ABI_VERSION 2 +#define OCRDMA_BE_ROCE_ABI_VERSION 1 +/* user kernel communication data structures. */ + +struct ocrdma_alloc_ucontext_resp { + u32 dev_id; + u32 wqe_size; + u32 max_inline_data; + u32 dpp_wqe_size; + u64 ah_tbl_page; + u32 ah_tbl_len; + u32 rqe_size; + u8 fw_ver[32]; + /* for future use/new features in progress */ + u64 rsvd1; + u64 rsvd2; +}; + +struct ocrdma_alloc_pd_ureq { + u64 rsvd1; +}; + +struct ocrdma_alloc_pd_uresp { + u32 id; + u32 dpp_enabled; + u32 dpp_page_addr_hi; + u32 dpp_page_addr_lo; + u64 rsvd1; +}; + +struct ocrdma_create_cq_ureq { + u32 dpp_cq; + u32 rsvd; /* pad */ +}; + +#define MAX_CQ_PAGES 8 +struct ocrdma_create_cq_uresp { + u32 cq_id; + u32 page_size; + u32 num_pages; + u32 max_hw_cqe; + u64 page_addr[MAX_CQ_PAGES]; + u64 db_page_addr; + u32 db_page_size; + u32 phase_change; + /* for future use/new features in progress */ + u64 rsvd1; + u64 rsvd2; +}; + +#define MAX_QP_PAGES 8 +#define MAX_UD_AV_PAGES 8 + +struct ocrdma_create_qp_ureq { + u8 enable_dpp_cq; + u8 rsvd; + u16 dpp_cq_id; + u32 rsvd1; /* pad */ +}; + +struct ocrdma_create_qp_uresp { + u16 qp_id; + u16 sq_dbid; + u16 rq_dbid; + u16 resv0; /* pad */ + u32 sq_page_size; + u32 rq_page_size; + u32 num_sq_pages; + u32 num_rq_pages; + u64 sq_page_addr[MAX_QP_PAGES]; + u64 rq_page_addr[MAX_QP_PAGES]; + u64 db_page_addr; + u32 db_page_size; + u32 dpp_credit; + u32 dpp_offset; + u32 num_wqe_allocated; + u32 num_rqe_allocated; + u32 db_sq_offset; + u32 db_rq_offset; + u32 db_shift; + u64 rsvd[11]; +} __packed; + +struct ocrdma_create_srq_uresp { + u16 rq_dbid; + u16 resv0; /* pad */ + u32 resv1; + + u32 rq_page_size; + u32 num_rq_pages; + + u64 rq_page_addr[MAX_QP_PAGES]; + u64 db_page_addr; + + u32 db_page_size; + u32 num_rqe_allocated; + u32 db_rq_offset; + u32 db_shift; + + u64 rsvd2; + u64 rsvd3; +}; + +#endif /* __OCRDMA_ABI_H__ */ diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_ah.c b/drivers/infiniband/hw/ocrdma/ocrdma_ah.c new file mode 100644 index 0000000..ac02ce4 --- /dev/null +++ b/drivers/infiniband/hw/ocrdma/ocrdma_ah.c @@ -0,0 +1,198 @@ +/******************************************************************* + * This file is part of the Emulex RoCE Device Driver for * + * RoCE (RDMA over Converged Ethernet) adapters. * + * Copyright (C) 2008-2012 Emulex. All rights reserved. * + * EMULEX and SLI are trademarks of Emulex. * + * www.emulex.com * + * * + * This program is free software; you can redistribute it and/or * + * modify it under the terms of version 2 of the GNU General * + * Public License as published by the Free Software Foundation. * + * This program is distributed in the hope that it will be useful. * + * ALL EXPRESS OR IMPLIED CONDITIONS, REPRESENTATIONS AND * + * WARRANTIES, INCLUDING ANY IMPLIED WARRANTY OF MERCHANTABILITY, * + * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT, ARE * + * DISCLAIMED, EXCEPT TO THE EXTENT THAT SUCH DISCLAIMERS ARE HELD * + * TO BE LEGALLY INVALID. See the GNU General Public License for * + * more details, a copy of which can be found in the file COPYING * + * included with this package. * + * + * Contact Information: + * linux-drivers@emulex.com + * + * Emulex + * 3333 Susan Street + * Costa Mesa, CA 92626 + *******************************************************************/ + +#include +#include + +#include + +#include "ocrdma.h" +#include "ocrdma_verbs.h" +#include "ocrdma_ah.h" +#include "ocrdma_hw.h" + +#define OCRDMA_VID_PCP_SHIFT 0xD + +static inline int set_av_attr(struct ocrdma_dev *dev, struct ocrdma_ah *ah, + struct ib_ah_attr *attr, union ib_gid *sgid, int pdid) +{ + int status = 0; + u16 vlan_tag; bool vlan_enabled = false; + struct ocrdma_eth_vlan eth; + struct ocrdma_grh grh; + int eth_sz; + + memset(ð, 0, sizeof(eth)); + memset(&grh, 0, sizeof(grh)); + + /* VLAN */ + vlan_tag = attr->vlan_id; + if (!vlan_tag || (vlan_tag > 0xFFF)) + vlan_tag = dev->pvid; + if (vlan_tag && (vlan_tag < 0x1000)) { + eth.eth_type = cpu_to_be16(0x8100); + eth.roce_eth_type = cpu_to_be16(OCRDMA_ROCE_ETH_TYPE); + vlan_tag |= (dev->sl & 0x07) << OCRDMA_VID_PCP_SHIFT; + eth.vlan_tag = cpu_to_be16(vlan_tag); + eth_sz = sizeof(struct ocrdma_eth_vlan); + vlan_enabled = true; + } else { + eth.eth_type = cpu_to_be16(OCRDMA_ROCE_ETH_TYPE); + eth_sz = sizeof(struct ocrdma_eth_basic); + } + /* MAC */ + memcpy(ð.smac[0], &dev->nic_info.mac_addr[0], ETH_ALEN); + status = ocrdma_resolve_dmac(dev, attr, ð.dmac[0]); + if (status) + return status; + ah->sgid_index = attr->grh.sgid_index; + memcpy(&grh.sgid[0], sgid->raw, sizeof(union ib_gid)); + memcpy(&grh.dgid[0], attr->grh.dgid.raw, sizeof(attr->grh.dgid.raw)); + + grh.tclass_flow = cpu_to_be32((6 << 28) | + (attr->grh.traffic_class << 24) | + attr->grh.flow_label); + /* 0x1b is next header value in GRH */ + grh.pdid_hoplimit = cpu_to_be32((pdid << 16) | + (0x1b << 8) | attr->grh.hop_limit); + /* Eth HDR */ + memcpy(&ah->av->eth_hdr, ð, eth_sz); + memcpy((u8 *)ah->av + eth_sz, &grh, sizeof(struct ocrdma_grh)); + if (vlan_enabled) + ah->av->valid |= OCRDMA_AV_VLAN_VALID; + ah->av->valid = cpu_to_le32(ah->av->valid); + return status; +} + +struct ib_ah *ocrdma_create_ah(struct ib_pd *ibpd, struct ib_ah_attr *attr) +{ + u32 *ahid_addr; + int status; + struct ocrdma_ah *ah; + struct ocrdma_pd *pd = get_ocrdma_pd(ibpd); + struct ocrdma_dev *dev = get_ocrdma_dev(ibpd->device); + union ib_gid sgid; + u8 zmac[ETH_ALEN]; + + if (!(attr->ah_flags & IB_AH_GRH)) + return ERR_PTR(-EINVAL); + + if (atomic_cmpxchg(&dev->update_sl, 1, 0)) + ocrdma_init_service_level(dev); + ah = kzalloc(sizeof(*ah), GFP_ATOMIC); + if (!ah) + return ERR_PTR(-ENOMEM); + + status = ocrdma_alloc_av(dev, ah); + if (status) + goto av_err; + + status = ocrdma_query_gid(&dev->ibdev, 1, attr->grh.sgid_index, &sgid); + if (status) { + pr_err("%s(): Failed to query sgid, status = %d\n", + __func__, status); + goto av_conf_err; + } + + memset(&zmac, 0, ETH_ALEN); + if (pd->uctx && + memcmp(attr->dmac, &zmac, ETH_ALEN)) { + status = rdma_addr_find_dmac_by_grh(&sgid, &attr->grh.dgid, + attr->dmac, &attr->vlan_id); + if (status) { + pr_err("%s(): Failed to resolve dmac from gid." + "status = %d\n", __func__, status); + goto av_conf_err; + } + } + + status = set_av_attr(dev, ah, attr, &sgid, pd->id); + if (status) + goto av_conf_err; + + /* if pd is for the user process, pass the ah_id to user space */ + if ((pd->uctx) && (pd->uctx->ah_tbl.va)) { + ahid_addr = pd->uctx->ah_tbl.va + attr->dlid; + *ahid_addr = ah->id; + } + return &ah->ibah; + +av_conf_err: + ocrdma_free_av(dev, ah); +av_err: + kfree(ah); + return ERR_PTR(status); +} + +int ocrdma_destroy_ah(struct ib_ah *ibah) +{ + struct ocrdma_ah *ah = get_ocrdma_ah(ibah); + struct ocrdma_dev *dev = get_ocrdma_dev(ibah->device); + + ocrdma_free_av(dev, ah); + kfree(ah); + return 0; +} + +int ocrdma_query_ah(struct ib_ah *ibah, struct ib_ah_attr *attr) +{ + struct ocrdma_ah *ah = get_ocrdma_ah(ibah); + struct ocrdma_av *av = ah->av; + struct ocrdma_grh *grh; + attr->ah_flags |= IB_AH_GRH; + if (ah->av->valid & OCRDMA_AV_VALID) { + grh = (struct ocrdma_grh *)((u8 *)ah->av + + sizeof(struct ocrdma_eth_vlan)); + attr->sl = be16_to_cpu(av->eth_hdr.vlan_tag) >> 13; + } else { + grh = (struct ocrdma_grh *)((u8 *)ah->av + + sizeof(struct ocrdma_eth_basic)); + attr->sl = 0; + } + memcpy(&attr->grh.dgid.raw[0], &grh->dgid[0], sizeof(grh->dgid)); + attr->grh.sgid_index = ah->sgid_index; + attr->grh.hop_limit = be32_to_cpu(grh->pdid_hoplimit) & 0xff; + attr->grh.traffic_class = be32_to_cpu(grh->tclass_flow) >> 24; + attr->grh.flow_label = be32_to_cpu(grh->tclass_flow) & 0x00ffffffff; + return 0; +} + +int ocrdma_modify_ah(struct ib_ah *ibah, struct ib_ah_attr *attr) +{ + /* modify_ah is unsupported */ + return -ENOSYS; +} + +int ocrdma_process_mad(struct ib_device *ibdev, + int process_mad_flags, + u8 port_num, + struct ib_wc *in_wc, + struct ib_grh *in_grh, + struct ib_mad *in_mad, struct ib_mad *out_mad) +{ + return IB_MAD_RESULT_SUCCESS; +} diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_ah.h b/drivers/infiniband/hw/ocrdma/ocrdma_ah.h new file mode 100644 index 0000000..8ac49e7 --- /dev/null +++ b/drivers/infiniband/hw/ocrdma/ocrdma_ah.h @@ -0,0 +1,42 @@ +/******************************************************************* + * This file is part of the Emulex RoCE Device Driver for * + * RoCE (RDMA over Converged Ethernet) adapters. * + * Copyright (C) 2008-2012 Emulex. All rights reserved. * + * EMULEX and SLI are trademarks of Emulex. * + * www.emulex.com * + * * + * This program is free software; you can redistribute it and/or * + * modify it under the terms of version 2 of the GNU General * + * Public License as published by the Free Software Foundation. * + * This program is distributed in the hope that it will be useful. * + * ALL EXPRESS OR IMPLIED CONDITIONS, REPRESENTATIONS AND * + * WARRANTIES, INCLUDING ANY IMPLIED WARRANTY OF MERCHANTABILITY, * + * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT, ARE * + * DISCLAIMED, EXCEPT TO THE EXTENT THAT SUCH DISCLAIMERS ARE HELD * + * TO BE LEGALLY INVALID. See the GNU General Public License for * + * more details, a copy of which can be found in the file COPYING * + * included with this package. * + * + * Contact Information: + * linux-drivers@emulex.com + * + * Emulex + * 3333 Susan Street + * Costa Mesa, CA 92626 + *******************************************************************/ + +#ifndef __OCRDMA_AH_H__ +#define __OCRDMA_AH_H__ + +struct ib_ah *ocrdma_create_ah(struct ib_pd *, struct ib_ah_attr *); +int ocrdma_destroy_ah(struct ib_ah *); +int ocrdma_query_ah(struct ib_ah *, struct ib_ah_attr *); +int ocrdma_modify_ah(struct ib_ah *, struct ib_ah_attr *); + +int ocrdma_process_mad(struct ib_device *, + int process_mad_flags, + u8 port_num, + struct ib_wc *in_wc, + struct ib_grh *in_grh, + struct ib_mad *in_mad, struct ib_mad *out_mad); +#endif /* __OCRDMA_AH_H__ */ diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_hw.c b/drivers/infiniband/hw/ocrdma/ocrdma_hw.c new file mode 100644 index 0000000..638bff1 --- /dev/null +++ b/drivers/infiniband/hw/ocrdma/ocrdma_hw.c @@ -0,0 +1,2925 @@ +/******************************************************************* + * This file is part of the Emulex RoCE Device Driver for * + * RoCE (RDMA over Converged Ethernet) CNA Adapters. * + * Copyright (C) 2008-2012 Emulex. All rights reserved. * + * EMULEX and SLI are trademarks of Emulex. * + * www.emulex.com * + * * + * This program is free software; you can redistribute it and/or * + * modify it under the terms of version 2 of the GNU General * + * Public License as published by the Free Software Foundation. * + * This program is distributed in the hope that it will be useful. * + * ALL EXPRESS OR IMPLIED CONDITIONS, REPRESENTATIONS AND * + * WARRANTIES, INCLUDING ANY IMPLIED WARRANTY OF MERCHANTABILITY, * + * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT, ARE * + * DISCLAIMED, EXCEPT TO THE EXTENT THAT SUCH DISCLAIMERS ARE HELD * + * TO BE LEGALLY INVALID. See the GNU General Public License for * + * more details, a copy of which can be found in the file COPYING * + * included with this package. * + * + * Contact Information: + * linux-drivers@emulex.com + * + * Emulex + * 3333 Susan Street + * Costa Mesa, CA 92626 + *******************************************************************/ + +#include +#include +#include +#include + +#include +#include + +#include "ocrdma.h" +#include "ocrdma_hw.h" +#include "ocrdma_verbs.h" +#include "ocrdma_ah.h" + +enum mbx_status { + OCRDMA_MBX_STATUS_FAILED = 1, + OCRDMA_MBX_STATUS_ILLEGAL_FIELD = 3, + OCRDMA_MBX_STATUS_OOR = 100, + OCRDMA_MBX_STATUS_INVALID_PD = 101, + OCRDMA_MBX_STATUS_PD_INUSE = 102, + OCRDMA_MBX_STATUS_INVALID_CQ = 103, + OCRDMA_MBX_STATUS_INVALID_QP = 104, + OCRDMA_MBX_STATUS_INVALID_LKEY = 105, + OCRDMA_MBX_STATUS_ORD_EXCEEDS = 106, + OCRDMA_MBX_STATUS_IRD_EXCEEDS = 107, + OCRDMA_MBX_STATUS_SENDQ_WQE_EXCEEDS = 108, + OCRDMA_MBX_STATUS_RECVQ_RQE_EXCEEDS = 109, + OCRDMA_MBX_STATUS_SGE_SEND_EXCEEDS = 110, + OCRDMA_MBX_STATUS_SGE_WRITE_EXCEEDS = 111, + OCRDMA_MBX_STATUS_SGE_RECV_EXCEEDS = 112, + OCRDMA_MBX_STATUS_INVALID_STATE_CHANGE = 113, + OCRDMA_MBX_STATUS_MW_BOUND = 114, + OCRDMA_MBX_STATUS_INVALID_VA = 115, + OCRDMA_MBX_STATUS_INVALID_LENGTH = 116, + OCRDMA_MBX_STATUS_INVALID_FBO = 117, + OCRDMA_MBX_STATUS_INVALID_ACC_RIGHTS = 118, + OCRDMA_MBX_STATUS_INVALID_PBE_SIZE = 119, + OCRDMA_MBX_STATUS_INVALID_PBL_ENTRY = 120, + OCRDMA_MBX_STATUS_INVALID_PBL_SHIFT = 121, + OCRDMA_MBX_STATUS_INVALID_SRQ_ID = 129, + OCRDMA_MBX_STATUS_SRQ_ERROR = 133, + OCRDMA_MBX_STATUS_RQE_EXCEEDS = 134, + OCRDMA_MBX_STATUS_MTU_EXCEEDS = 135, + OCRDMA_MBX_STATUS_MAX_QP_EXCEEDS = 136, + OCRDMA_MBX_STATUS_SRQ_LIMIT_EXCEEDS = 137, + OCRDMA_MBX_STATUS_SRQ_SIZE_UNDERUNS = 138, + OCRDMA_MBX_STATUS_QP_BOUND = 130, + OCRDMA_MBX_STATUS_INVALID_CHANGE = 139, + OCRDMA_MBX_STATUS_ATOMIC_OPS_UNSUP = 140, + OCRDMA_MBX_STATUS_INVALID_RNR_NAK_TIMER = 141, + OCRDMA_MBX_STATUS_MW_STILL_BOUND = 142, + OCRDMA_MBX_STATUS_PKEY_INDEX_INVALID = 143, + OCRDMA_MBX_STATUS_PKEY_INDEX_EXCEEDS = 144 +}; + +enum additional_status { + OCRDMA_MBX_ADDI_STATUS_INSUFFICIENT_RESOURCES = 22 +}; + +enum cqe_status { + OCRDMA_MBX_CQE_STATUS_INSUFFICIENT_PRIVILEDGES = 1, + OCRDMA_MBX_CQE_STATUS_INVALID_PARAMETER = 2, + OCRDMA_MBX_CQE_STATUS_INSUFFICIENT_RESOURCES = 3, + OCRDMA_MBX_CQE_STATUS_QUEUE_FLUSHING = 4, + OCRDMA_MBX_CQE_STATUS_DMA_FAILED = 5 +}; + +static inline void *ocrdma_get_eqe(struct ocrdma_eq *eq) +{ + return eq->q.va + (eq->q.tail * sizeof(struct ocrdma_eqe)); +} + +static inline void ocrdma_eq_inc_tail(struct ocrdma_eq *eq) +{ + eq->q.tail = (eq->q.tail + 1) & (OCRDMA_EQ_LEN - 1); +} + +static inline void *ocrdma_get_mcqe(struct ocrdma_dev *dev) +{ + struct ocrdma_mcqe *cqe = (struct ocrdma_mcqe *) + (dev->mq.cq.va + (dev->mq.cq.tail * sizeof(struct ocrdma_mcqe))); + + if (!(le32_to_cpu(cqe->valid_ae_cmpl_cons) & OCRDMA_MCQE_VALID_MASK)) + return NULL; + return cqe; +} + +static inline void ocrdma_mcq_inc_tail(struct ocrdma_dev *dev) +{ + dev->mq.cq.tail = (dev->mq.cq.tail + 1) & (OCRDMA_MQ_CQ_LEN - 1); +} + +static inline struct ocrdma_mqe *ocrdma_get_mqe(struct ocrdma_dev *dev) +{ + return dev->mq.sq.va + (dev->mq.sq.head * sizeof(struct ocrdma_mqe)); +} + +static inline void ocrdma_mq_inc_head(struct ocrdma_dev *dev) +{ + dev->mq.sq.head = (dev->mq.sq.head + 1) & (OCRDMA_MQ_LEN - 1); +} + +static inline void *ocrdma_get_mqe_rsp(struct ocrdma_dev *dev) +{ + return dev->mq.sq.va + (dev->mqe_ctx.tag * sizeof(struct ocrdma_mqe)); +} + +enum ib_qp_state get_ibqp_state(enum ocrdma_qp_state qps) +{ + switch (qps) { + case OCRDMA_QPS_RST: + return IB_QPS_RESET; + case OCRDMA_QPS_INIT: + return IB_QPS_INIT; + case OCRDMA_QPS_RTR: + return IB_QPS_RTR; + case OCRDMA_QPS_RTS: + return IB_QPS_RTS; + case OCRDMA_QPS_SQD: + case OCRDMA_QPS_SQ_DRAINING: + return IB_QPS_SQD; + case OCRDMA_QPS_SQE: + return IB_QPS_SQE; + case OCRDMA_QPS_ERR: + return IB_QPS_ERR; + } + return IB_QPS_ERR; +} + +static enum ocrdma_qp_state get_ocrdma_qp_state(enum ib_qp_state qps) +{ + switch (qps) { + case IB_QPS_RESET: + return OCRDMA_QPS_RST; + case IB_QPS_INIT: + return OCRDMA_QPS_INIT; + case IB_QPS_RTR: + return OCRDMA_QPS_RTR; + case IB_QPS_RTS: + return OCRDMA_QPS_RTS; + case IB_QPS_SQD: + return OCRDMA_QPS_SQD; + case IB_QPS_SQE: + return OCRDMA_QPS_SQE; + case IB_QPS_ERR: + return OCRDMA_QPS_ERR; + } + return OCRDMA_QPS_ERR; +} + +static int ocrdma_get_mbx_errno(u32 status) +{ + int err_num; + u8 mbox_status = (status & OCRDMA_MBX_RSP_STATUS_MASK) >> + OCRDMA_MBX_RSP_STATUS_SHIFT; + u8 add_status = (status & OCRDMA_MBX_RSP_ASTATUS_MASK) >> + OCRDMA_MBX_RSP_ASTATUS_SHIFT; + + switch (mbox_status) { + case OCRDMA_MBX_STATUS_OOR: + case OCRDMA_MBX_STATUS_MAX_QP_EXCEEDS: + err_num = -EAGAIN; + break; + + case OCRDMA_MBX_STATUS_INVALID_PD: + case OCRDMA_MBX_STATUS_INVALID_CQ: + case OCRDMA_MBX_STATUS_INVALID_SRQ_ID: + case OCRDMA_MBX_STATUS_INVALID_QP: + case OCRDMA_MBX_STATUS_INVALID_CHANGE: + case OCRDMA_MBX_STATUS_MTU_EXCEEDS: + case OCRDMA_MBX_STATUS_INVALID_RNR_NAK_TIMER: + case OCRDMA_MBX_STATUS_PKEY_INDEX_INVALID: + case OCRDMA_MBX_STATUS_PKEY_INDEX_EXCEEDS: + case OCRDMA_MBX_STATUS_ILLEGAL_FIELD: + case OCRDMA_MBX_STATUS_INVALID_PBL_ENTRY: + case OCRDMA_MBX_STATUS_INVALID_LKEY: + case OCRDMA_MBX_STATUS_INVALID_VA: + case OCRDMA_MBX_STATUS_INVALID_LENGTH: + case OCRDMA_MBX_STATUS_INVALID_FBO: + case OCRDMA_MBX_STATUS_INVALID_ACC_RIGHTS: + case OCRDMA_MBX_STATUS_INVALID_PBE_SIZE: + case OCRDMA_MBX_STATUS_ATOMIC_OPS_UNSUP: + case OCRDMA_MBX_STATUS_SRQ_ERROR: + case OCRDMA_MBX_STATUS_SRQ_SIZE_UNDERUNS: + err_num = -EINVAL; + break; + + case OCRDMA_MBX_STATUS_PD_INUSE: + case OCRDMA_MBX_STATUS_QP_BOUND: + case OCRDMA_MBX_STATUS_MW_STILL_BOUND: + case OCRDMA_MBX_STATUS_MW_BOUND: + err_num = -EBUSY; + break; + + case OCRDMA_MBX_STATUS_RECVQ_RQE_EXCEEDS: + case OCRDMA_MBX_STATUS_SGE_RECV_EXCEEDS: + case OCRDMA_MBX_STATUS_RQE_EXCEEDS: + case OCRDMA_MBX_STATUS_SRQ_LIMIT_EXCEEDS: + case OCRDMA_MBX_STATUS_ORD_EXCEEDS: + case OCRDMA_MBX_STATUS_IRD_EXCEEDS: + case OCRDMA_MBX_STATUS_SENDQ_WQE_EXCEEDS: + case OCRDMA_MBX_STATUS_SGE_SEND_EXCEEDS: + case OCRDMA_MBX_STATUS_SGE_WRITE_EXCEEDS: + err_num = -ENOBUFS; + break; + + case OCRDMA_MBX_STATUS_FAILED: + switch (add_status) { + case OCRDMA_MBX_ADDI_STATUS_INSUFFICIENT_RESOURCES: + err_num = -EAGAIN; + break; + } + default: + err_num = -EFAULT; + } + return err_num; +} + +char *port_speed_string(struct ocrdma_dev *dev) +{ + char *str = ""; + u16 speeds_supported; + + speeds_supported = dev->phy.fixed_speeds_supported | + dev->phy.auto_speeds_supported; + if (speeds_supported & OCRDMA_PHY_SPEED_40GBPS) + str = "40Gbps "; + else if (speeds_supported & OCRDMA_PHY_SPEED_10GBPS) + str = "10Gbps "; + else if (speeds_supported & OCRDMA_PHY_SPEED_1GBPS) + str = "1Gbps "; + + return str; +} + +static int ocrdma_get_mbx_cqe_errno(u16 cqe_status) +{ + int err_num = -EINVAL; + + switch (cqe_status) { + case OCRDMA_MBX_CQE_STATUS_INSUFFICIENT_PRIVILEDGES: + err_num = -EPERM; + break; + case OCRDMA_MBX_CQE_STATUS_INVALID_PARAMETER: + err_num = -EINVAL; + break; + case OCRDMA_MBX_CQE_STATUS_INSUFFICIENT_RESOURCES: + case OCRDMA_MBX_CQE_STATUS_QUEUE_FLUSHING: + err_num = -EINVAL; + break; + case OCRDMA_MBX_CQE_STATUS_DMA_FAILED: + default: + err_num = -EINVAL; + break; + } + return err_num; +} + +void ocrdma_ring_cq_db(struct ocrdma_dev *dev, u16 cq_id, bool armed, + bool solicited, u16 cqe_popped) +{ + u32 val = cq_id & OCRDMA_DB_CQ_RING_ID_MASK; + + val |= ((cq_id & OCRDMA_DB_CQ_RING_ID_EXT_MASK) << + OCRDMA_DB_CQ_RING_ID_EXT_MASK_SHIFT); + + if (armed) + val |= (1 << OCRDMA_DB_CQ_REARM_SHIFT); + if (solicited) + val |= (1 << OCRDMA_DB_CQ_SOLICIT_SHIFT); + val |= (cqe_popped << OCRDMA_DB_CQ_NUM_POPPED_SHIFT); + iowrite32(val, dev->nic_info.db + OCRDMA_DB_CQ_OFFSET); +} + +static void ocrdma_ring_mq_db(struct ocrdma_dev *dev) +{ + u32 val = 0; + + val |= dev->mq.sq.id & OCRDMA_MQ_ID_MASK; + val |= 1 << OCRDMA_MQ_NUM_MQE_SHIFT; + iowrite32(val, dev->nic_info.db + OCRDMA_DB_MQ_OFFSET); +} + +static void ocrdma_ring_eq_db(struct ocrdma_dev *dev, u16 eq_id, + bool arm, bool clear_int, u16 num_eqe) +{ + u32 val = 0; + + val |= eq_id & OCRDMA_EQ_ID_MASK; + val |= ((eq_id & OCRDMA_EQ_ID_EXT_MASK) << OCRDMA_EQ_ID_EXT_MASK_SHIFT); + if (arm) + val |= (1 << OCRDMA_REARM_SHIFT); + if (clear_int) + val |= (1 << OCRDMA_EQ_CLR_SHIFT); + val |= (1 << OCRDMA_EQ_TYPE_SHIFT); + val |= (num_eqe << OCRDMA_NUM_EQE_SHIFT); + iowrite32(val, dev->nic_info.db + OCRDMA_DB_EQ_OFFSET); +} + +static void ocrdma_init_mch(struct ocrdma_mbx_hdr *cmd_hdr, + u8 opcode, u8 subsys, u32 cmd_len) +{ + cmd_hdr->subsys_op = (opcode | (subsys << OCRDMA_MCH_SUBSYS_SHIFT)); + cmd_hdr->timeout = 20; /* seconds */ + cmd_hdr->cmd_len = cmd_len - sizeof(struct ocrdma_mbx_hdr); +} + +static void *ocrdma_init_emb_mqe(u8 opcode, u32 cmd_len) +{ + struct ocrdma_mqe *mqe; + + mqe = kzalloc(sizeof(struct ocrdma_mqe), GFP_KERNEL); + if (!mqe) + return NULL; + mqe->hdr.spcl_sge_cnt_emb |= + (OCRDMA_MQE_EMBEDDED << OCRDMA_MQE_HDR_EMB_SHIFT) & + OCRDMA_MQE_HDR_EMB_MASK; + mqe->hdr.pyld_len = cmd_len - sizeof(struct ocrdma_mqe_hdr); + + ocrdma_init_mch(&mqe->u.emb_req.mch, opcode, OCRDMA_SUBSYS_ROCE, + mqe->hdr.pyld_len); + return mqe; +} + +static void ocrdma_free_q(struct ocrdma_dev *dev, struct ocrdma_queue_info *q) +{ + dma_free_coherent(&dev->nic_info.pdev->dev, q->size, q->va, q->dma); +} + +static int ocrdma_alloc_q(struct ocrdma_dev *dev, + struct ocrdma_queue_info *q, u16 len, u16 entry_size) +{ + memset(q, 0, sizeof(*q)); + q->len = len; + q->entry_size = entry_size; + q->size = len * entry_size; + q->va = dma_alloc_coherent(&dev->nic_info.pdev->dev, q->size, + &q->dma, GFP_KERNEL); + if (!q->va) + return -ENOMEM; + memset(q->va, 0, q->size); + return 0; +} + +static void ocrdma_build_q_pages(struct ocrdma_pa *q_pa, int cnt, + dma_addr_t host_pa, int hw_page_size) +{ + int i; + + for (i = 0; i < cnt; i++) { + q_pa[i].lo = (u32) (host_pa & 0xffffffff); + q_pa[i].hi = (u32) upper_32_bits(host_pa); + host_pa += hw_page_size; + } +} + +static int ocrdma_mbx_delete_q(struct ocrdma_dev *dev, + struct ocrdma_queue_info *q, int queue_type) +{ + u8 opcode = 0; + int status; + struct ocrdma_delete_q_req *cmd = dev->mbx_cmd; + + switch (queue_type) { + case QTYPE_MCCQ: + opcode = OCRDMA_CMD_DELETE_MQ; + break; + case QTYPE_CQ: + opcode = OCRDMA_CMD_DELETE_CQ; + break; + case QTYPE_EQ: + opcode = OCRDMA_CMD_DELETE_EQ; + break; + default: + BUG(); + } + memset(cmd, 0, sizeof(*cmd)); + ocrdma_init_mch(&cmd->req, opcode, OCRDMA_SUBSYS_COMMON, sizeof(*cmd)); + cmd->id = q->id; + + status = be_roce_mcc_cmd(dev->nic_info.netdev, + cmd, sizeof(*cmd), NULL, NULL); + if (!status) + q->created = false; + return status; +} + +static int ocrdma_mbx_create_eq(struct ocrdma_dev *dev, struct ocrdma_eq *eq) +{ + int status; + struct ocrdma_create_eq_req *cmd = dev->mbx_cmd; + struct ocrdma_create_eq_rsp *rsp = dev->mbx_cmd; + + memset(cmd, 0, sizeof(*cmd)); + ocrdma_init_mch(&cmd->req, OCRDMA_CMD_CREATE_EQ, OCRDMA_SUBSYS_COMMON, + sizeof(*cmd)); + + cmd->req.rsvd_version = 2; + cmd->num_pages = 4; + cmd->valid = OCRDMA_CREATE_EQ_VALID; + cmd->cnt = 4 << OCRDMA_CREATE_EQ_CNT_SHIFT; + + ocrdma_build_q_pages(&cmd->pa[0], cmd->num_pages, eq->q.dma, + PAGE_SIZE_4K); + status = be_roce_mcc_cmd(dev->nic_info.netdev, cmd, sizeof(*cmd), NULL, + NULL); + if (!status) { + eq->q.id = rsp->vector_eqid & 0xffff; + eq->vector = (rsp->vector_eqid >> 16) & 0xffff; + eq->q.created = true; + } + return status; +} + +static int ocrdma_create_eq(struct ocrdma_dev *dev, + struct ocrdma_eq *eq, u16 q_len) +{ + int status; + + status = ocrdma_alloc_q(dev, &eq->q, OCRDMA_EQ_LEN, + sizeof(struct ocrdma_eqe)); + if (status) + return status; + + status = ocrdma_mbx_create_eq(dev, eq); + if (status) + goto mbx_err; + eq->dev = dev; + ocrdma_ring_eq_db(dev, eq->q.id, true, true, 0); + + return 0; +mbx_err: + ocrdma_free_q(dev, &eq->q); + return status; +} + +int ocrdma_get_irq(struct ocrdma_dev *dev, struct ocrdma_eq *eq) +{ + int irq; + + if (dev->nic_info.intr_mode == BE_INTERRUPT_MODE_INTX) + irq = dev->nic_info.pdev->irq; + else + irq = dev->nic_info.msix.vector_list[eq->vector]; + return irq; +} + +static void _ocrdma_destroy_eq(struct ocrdma_dev *dev, struct ocrdma_eq *eq) +{ + if (eq->q.created) { + ocrdma_mbx_delete_q(dev, &eq->q, QTYPE_EQ); + ocrdma_free_q(dev, &eq->q); + } +} + +static void ocrdma_destroy_eq(struct ocrdma_dev *dev, struct ocrdma_eq *eq) +{ + int irq; + + /* disarm EQ so that interrupts are not generated + * during freeing and EQ delete is in progress. + */ + ocrdma_ring_eq_db(dev, eq->q.id, false, false, 0); + + irq = ocrdma_get_irq(dev, eq); + free_irq(irq, eq); + _ocrdma_destroy_eq(dev, eq); +} + +static void ocrdma_destroy_eqs(struct ocrdma_dev *dev) +{ + int i; + + for (i = 0; i < dev->eq_cnt; i++) + ocrdma_destroy_eq(dev, &dev->eq_tbl[i]); +} + +static int ocrdma_mbx_mq_cq_create(struct ocrdma_dev *dev, + struct ocrdma_queue_info *cq, + struct ocrdma_queue_info *eq) +{ + struct ocrdma_create_cq_cmd *cmd = dev->mbx_cmd; + struct ocrdma_create_cq_cmd_rsp *rsp = dev->mbx_cmd; + int status; + + memset(cmd, 0, sizeof(*cmd)); + ocrdma_init_mch(&cmd->req, OCRDMA_CMD_CREATE_CQ, + OCRDMA_SUBSYS_COMMON, sizeof(*cmd)); + + cmd->req.rsvd_version = OCRDMA_CREATE_CQ_VER2; + cmd->pgsz_pgcnt = (cq->size / OCRDMA_MIN_Q_PAGE_SIZE) << + OCRDMA_CREATE_CQ_PAGE_SIZE_SHIFT; + cmd->pgsz_pgcnt |= PAGES_4K_SPANNED(cq->va, cq->size); + + cmd->ev_cnt_flags = OCRDMA_CREATE_CQ_DEF_FLAGS; + cmd->eqn = eq->id; + cmd->pdid_cqecnt = cq->size / sizeof(struct ocrdma_mcqe); + + ocrdma_build_q_pages(&cmd->pa[0], cq->size / OCRDMA_MIN_Q_PAGE_SIZE, + cq->dma, PAGE_SIZE_4K); + status = be_roce_mcc_cmd(dev->nic_info.netdev, + cmd, sizeof(*cmd), NULL, NULL); + if (!status) { + cq->id = (u16) (rsp->cq_id & OCRDMA_CREATE_CQ_RSP_CQ_ID_MASK); + cq->created = true; + } + return status; +} + +static u32 ocrdma_encoded_q_len(int q_len) +{ + u32 len_encoded = fls(q_len); /* log2(len) + 1 */ + + if (len_encoded == 16) + len_encoded = 0; + return len_encoded; +} + +static int ocrdma_mbx_create_mq(struct ocrdma_dev *dev, + struct ocrdma_queue_info *mq, + struct ocrdma_queue_info *cq) +{ + int num_pages, status; + struct ocrdma_create_mq_req *cmd = dev->mbx_cmd; + struct ocrdma_create_mq_rsp *rsp = dev->mbx_cmd; + struct ocrdma_pa *pa; + + memset(cmd, 0, sizeof(*cmd)); + num_pages = PAGES_4K_SPANNED(mq->va, mq->size); + + ocrdma_init_mch(&cmd->req, OCRDMA_CMD_CREATE_MQ_EXT, + OCRDMA_SUBSYS_COMMON, sizeof(*cmd)); + cmd->req.rsvd_version = 1; + cmd->cqid_pages = num_pages; + cmd->cqid_pages |= (cq->id << OCRDMA_CREATE_MQ_CQ_ID_SHIFT); + cmd->async_cqid_valid = OCRDMA_CREATE_MQ_ASYNC_CQ_VALID; + + cmd->async_event_bitmap = BIT(OCRDMA_ASYNC_GRP5_EVE_CODE); + cmd->async_event_bitmap |= BIT(OCRDMA_ASYNC_RDMA_EVE_CODE); + + cmd->async_cqid_ringsize = cq->id; + cmd->async_cqid_ringsize |= (ocrdma_encoded_q_len(mq->len) << + OCRDMA_CREATE_MQ_RING_SIZE_SHIFT); + cmd->valid = OCRDMA_CREATE_MQ_VALID; + pa = &cmd->pa[0]; + + ocrdma_build_q_pages(pa, num_pages, mq->dma, PAGE_SIZE_4K); + status = be_roce_mcc_cmd(dev->nic_info.netdev, + cmd, sizeof(*cmd), NULL, NULL); + if (!status) { + mq->id = rsp->id; + mq->created = true; + } + return status; +} + +static int ocrdma_create_mq(struct ocrdma_dev *dev) +{ + int status; + + /* Alloc completion queue for Mailbox queue */ + status = ocrdma_alloc_q(dev, &dev->mq.cq, OCRDMA_MQ_CQ_LEN, + sizeof(struct ocrdma_mcqe)); + if (status) + goto alloc_err; + + dev->eq_tbl[0].cq_cnt++; + status = ocrdma_mbx_mq_cq_create(dev, &dev->mq.cq, &dev->eq_tbl[0].q); + if (status) + goto mbx_cq_free; + + memset(&dev->mqe_ctx, 0, sizeof(dev->mqe_ctx)); + init_waitqueue_head(&dev->mqe_ctx.cmd_wait); + mutex_init(&dev->mqe_ctx.lock); + + /* Alloc Mailbox queue */ + status = ocrdma_alloc_q(dev, &dev->mq.sq, OCRDMA_MQ_LEN, + sizeof(struct ocrdma_mqe)); + if (status) + goto mbx_cq_destroy; + status = ocrdma_mbx_create_mq(dev, &dev->mq.sq, &dev->mq.cq); + if (status) + goto mbx_q_free; + ocrdma_ring_cq_db(dev, dev->mq.cq.id, true, false, 0); + return 0; + +mbx_q_free: + ocrdma_free_q(dev, &dev->mq.sq); +mbx_cq_destroy: + ocrdma_mbx_delete_q(dev, &dev->mq.cq, QTYPE_CQ); +mbx_cq_free: + ocrdma_free_q(dev, &dev->mq.cq); +alloc_err: + return status; +} + +static void ocrdma_destroy_mq(struct ocrdma_dev *dev) +{ + struct ocrdma_queue_info *mbxq, *cq; + + /* mqe_ctx lock synchronizes with any other pending cmds. */ + mutex_lock(&dev->mqe_ctx.lock); + mbxq = &dev->mq.sq; + if (mbxq->created) { + ocrdma_mbx_delete_q(dev, mbxq, QTYPE_MCCQ); + ocrdma_free_q(dev, mbxq); + } + mutex_unlock(&dev->mqe_ctx.lock); + + cq = &dev->mq.cq; + if (cq->created) { + ocrdma_mbx_delete_q(dev, cq, QTYPE_CQ); + ocrdma_free_q(dev, cq); + } +} + +static void ocrdma_process_qpcat_error(struct ocrdma_dev *dev, + struct ocrdma_qp *qp) +{ + enum ib_qp_state new_ib_qps = IB_QPS_ERR; + enum ib_qp_state old_ib_qps; + + if (qp == NULL) + BUG(); + ocrdma_qp_state_change(qp, new_ib_qps, &old_ib_qps); +} + +static void ocrdma_dispatch_ibevent(struct ocrdma_dev *dev, + struct ocrdma_ae_mcqe *cqe) +{ + struct ocrdma_qp *qp = NULL; + struct ocrdma_cq *cq = NULL; + struct ib_event ib_evt; + int cq_event = 0; + int qp_event = 1; + int srq_event = 0; + int dev_event = 0; + int type = (cqe->valid_ae_event & OCRDMA_AE_MCQE_EVENT_TYPE_MASK) >> + OCRDMA_AE_MCQE_EVENT_TYPE_SHIFT; + + if (cqe->qpvalid_qpid & OCRDMA_AE_MCQE_QPVALID) + qp = dev->qp_tbl[cqe->qpvalid_qpid & OCRDMA_AE_MCQE_QPID_MASK]; + if (cqe->cqvalid_cqid & OCRDMA_AE_MCQE_CQVALID) + cq = dev->cq_tbl[cqe->cqvalid_cqid & OCRDMA_AE_MCQE_CQID_MASK]; + + memset(&ib_evt, 0, sizeof(ib_evt)); + + ib_evt.device = &dev->ibdev; + + switch (type) { + case OCRDMA_CQ_ERROR: + ib_evt.element.cq = &cq->ibcq; + ib_evt.event = IB_EVENT_CQ_ERR; + cq_event = 1; + qp_event = 0; + break; + case OCRDMA_CQ_OVERRUN_ERROR: + ib_evt.element.cq = &cq->ibcq; + ib_evt.event = IB_EVENT_CQ_ERR; + cq_event = 1; + qp_event = 0; + break; + case OCRDMA_CQ_QPCAT_ERROR: + ib_evt.element.qp = &qp->ibqp; + ib_evt.event = IB_EVENT_QP_FATAL; + ocrdma_process_qpcat_error(dev, qp); + break; + case OCRDMA_QP_ACCESS_ERROR: + ib_evt.element.qp = &qp->ibqp; + ib_evt.event = IB_EVENT_QP_ACCESS_ERR; + break; + case OCRDMA_QP_COMM_EST_EVENT: + ib_evt.element.qp = &qp->ibqp; + ib_evt.event = IB_EVENT_COMM_EST; + break; + case OCRDMA_SQ_DRAINED_EVENT: + ib_evt.element.qp = &qp->ibqp; + ib_evt.event = IB_EVENT_SQ_DRAINED; + break; + case OCRDMA_DEVICE_FATAL_EVENT: + ib_evt.element.port_num = 1; + ib_evt.event = IB_EVENT_DEVICE_FATAL; + qp_event = 0; + dev_event = 1; + break; + case OCRDMA_SRQCAT_ERROR: + ib_evt.element.srq = &qp->srq->ibsrq; + ib_evt.event = IB_EVENT_SRQ_ERR; + srq_event = 1; + qp_event = 0; + break; + case OCRDMA_SRQ_LIMIT_EVENT: + ib_evt.element.srq = &qp->srq->ibsrq; + ib_evt.event = IB_EVENT_SRQ_LIMIT_REACHED; + srq_event = 1; + qp_event = 0; + break; + case OCRDMA_QP_LAST_WQE_EVENT: + ib_evt.element.qp = &qp->ibqp; + ib_evt.event = IB_EVENT_QP_LAST_WQE_REACHED; + break; + default: + cq_event = 0; + qp_event = 0; + srq_event = 0; + dev_event = 0; + pr_err("%s() unknown type=0x%x\n", __func__, type); + break; + } + + if (qp_event) { + if (qp->ibqp.event_handler) + qp->ibqp.event_handler(&ib_evt, qp->ibqp.qp_context); + } else if (cq_event) { + if (cq->ibcq.event_handler) + cq->ibcq.event_handler(&ib_evt, cq->ibcq.cq_context); + } else if (srq_event) { + if (qp->srq->ibsrq.event_handler) + qp->srq->ibsrq.event_handler(&ib_evt, + qp->srq->ibsrq. + srq_context); + } else if (dev_event) { + pr_err("%s: Fatal event received\n", dev->ibdev.name); + ib_dispatch_event(&ib_evt); + } + +} + +static void ocrdma_process_grp5_aync(struct ocrdma_dev *dev, + struct ocrdma_ae_mcqe *cqe) +{ + struct ocrdma_ae_pvid_mcqe *evt; + int type = (cqe->valid_ae_event & OCRDMA_AE_MCQE_EVENT_TYPE_MASK) >> + OCRDMA_AE_MCQE_EVENT_TYPE_SHIFT; + + switch (type) { + case OCRDMA_ASYNC_EVENT_PVID_STATE: + evt = (struct ocrdma_ae_pvid_mcqe *)cqe; + if ((evt->tag_enabled & OCRDMA_AE_PVID_MCQE_ENABLED_MASK) >> + OCRDMA_AE_PVID_MCQE_ENABLED_SHIFT) + dev->pvid = ((evt->tag_enabled & + OCRDMA_AE_PVID_MCQE_TAG_MASK) >> + OCRDMA_AE_PVID_MCQE_TAG_SHIFT); + break; + + case OCRDMA_ASYNC_EVENT_COS_VALUE: + atomic_set(&dev->update_sl, 1); + break; + default: + /* Not interested evts. */ + break; + } +} + +static void ocrdma_process_acqe(struct ocrdma_dev *dev, void *ae_cqe) +{ + /* async CQE processing */ + struct ocrdma_ae_mcqe *cqe = ae_cqe; + u32 evt_code = (cqe->valid_ae_event & OCRDMA_AE_MCQE_EVENT_CODE_MASK) >> + OCRDMA_AE_MCQE_EVENT_CODE_SHIFT; + + if (evt_code == OCRDMA_ASYNC_RDMA_EVE_CODE) + ocrdma_dispatch_ibevent(dev, cqe); + else if (evt_code == OCRDMA_ASYNC_GRP5_EVE_CODE) + ocrdma_process_grp5_aync(dev, cqe); + else + pr_err("%s(%d) invalid evt code=0x%x\n", __func__, + dev->id, evt_code); +} + +static void ocrdma_process_mcqe(struct ocrdma_dev *dev, struct ocrdma_mcqe *cqe) +{ + if (dev->mqe_ctx.tag == cqe->tag_lo && dev->mqe_ctx.cmd_done == false) { + dev->mqe_ctx.cqe_status = (cqe->status & + OCRDMA_MCQE_STATUS_MASK) >> OCRDMA_MCQE_STATUS_SHIFT; + dev->mqe_ctx.ext_status = + (cqe->status & OCRDMA_MCQE_ESTATUS_MASK) + >> OCRDMA_MCQE_ESTATUS_SHIFT; + dev->mqe_ctx.cmd_done = true; + wake_up(&dev->mqe_ctx.cmd_wait); + } else + pr_err("%s() cqe for invalid tag0x%x.expected=0x%x\n", + __func__, cqe->tag_lo, dev->mqe_ctx.tag); +} + +static int ocrdma_mq_cq_handler(struct ocrdma_dev *dev, u16 cq_id) +{ + u16 cqe_popped = 0; + struct ocrdma_mcqe *cqe; + + while (1) { + cqe = ocrdma_get_mcqe(dev); + if (cqe == NULL) + break; + ocrdma_le32_to_cpu(cqe, sizeof(*cqe)); + cqe_popped += 1; + if (cqe->valid_ae_cmpl_cons & OCRDMA_MCQE_AE_MASK) + ocrdma_process_acqe(dev, cqe); + else if (cqe->valid_ae_cmpl_cons & OCRDMA_MCQE_CMPL_MASK) + ocrdma_process_mcqe(dev, cqe); + memset(cqe, 0, sizeof(struct ocrdma_mcqe)); + ocrdma_mcq_inc_tail(dev); + } + ocrdma_ring_cq_db(dev, dev->mq.cq.id, true, false, cqe_popped); + return 0; +} + +static void ocrdma_qp_buddy_cq_handler(struct ocrdma_dev *dev, + struct ocrdma_cq *cq) +{ + unsigned long flags; + struct ocrdma_qp *qp; + bool buddy_cq_found = false; + /* Go through list of QPs in error state which are using this CQ + * and invoke its callback handler to trigger CQE processing for + * error/flushed CQE. It is rare to find more than few entries in + * this list as most consumers stops after getting error CQE. + * List is traversed only once when a matching buddy cq found for a QP. + */ + spin_lock_irqsave(&dev->flush_q_lock, flags); + list_for_each_entry(qp, &cq->sq_head, sq_entry) { + if (qp->srq) + continue; + /* if wq and rq share the same cq, than comp_handler + * is already invoked. + */ + if (qp->sq_cq == qp->rq_cq) + continue; + /* if completion came on sq, rq's cq is buddy cq. + * if completion came on rq, sq's cq is buddy cq. + */ + if (qp->sq_cq == cq) + cq = qp->rq_cq; + else + cq = qp->sq_cq; + buddy_cq_found = true; + break; + } + spin_unlock_irqrestore(&dev->flush_q_lock, flags); + if (buddy_cq_found == false) + return; + if (cq->ibcq.comp_handler) { + spin_lock_irqsave(&cq->comp_handler_lock, flags); + (*cq->ibcq.comp_handler) (&cq->ibcq, cq->ibcq.cq_context); + spin_unlock_irqrestore(&cq->comp_handler_lock, flags); + } +} + +static void ocrdma_qp_cq_handler(struct ocrdma_dev *dev, u16 cq_idx) +{ + unsigned long flags; + struct ocrdma_cq *cq; + + if (cq_idx >= OCRDMA_MAX_CQ) + BUG(); + + cq = dev->cq_tbl[cq_idx]; + if (cq == NULL) + return; + + if (cq->ibcq.comp_handler) { + spin_lock_irqsave(&cq->comp_handler_lock, flags); + (*cq->ibcq.comp_handler) (&cq->ibcq, cq->ibcq.cq_context); + spin_unlock_irqrestore(&cq->comp_handler_lock, flags); + } + ocrdma_qp_buddy_cq_handler(dev, cq); +} + +static void ocrdma_cq_handler(struct ocrdma_dev *dev, u16 cq_id) +{ + /* process the MQ-CQE. */ + if (cq_id == dev->mq.cq.id) + ocrdma_mq_cq_handler(dev, cq_id); + else + ocrdma_qp_cq_handler(dev, cq_id); +} + +static irqreturn_t ocrdma_irq_handler(int irq, void *handle) +{ + struct ocrdma_eq *eq = handle; + struct ocrdma_dev *dev = eq->dev; + struct ocrdma_eqe eqe; + struct ocrdma_eqe *ptr; + u16 cq_id; + int budget = eq->cq_cnt; + + do { + ptr = ocrdma_get_eqe(eq); + eqe = *ptr; + ocrdma_le32_to_cpu(&eqe, sizeof(eqe)); + if ((eqe.id_valid & OCRDMA_EQE_VALID_MASK) == 0) + break; + + ptr->id_valid = 0; + /* ring eq doorbell as soon as its consumed. */ + ocrdma_ring_eq_db(dev, eq->q.id, false, true, 1); + /* check whether its CQE or not. */ + if ((eqe.id_valid & OCRDMA_EQE_FOR_CQE_MASK) == 0) { + cq_id = eqe.id_valid >> OCRDMA_EQE_RESOURCE_ID_SHIFT; + ocrdma_cq_handler(dev, cq_id); + } + ocrdma_eq_inc_tail(eq); + + /* There can be a stale EQE after the last bound CQ is + * destroyed. EQE valid and budget == 0 implies this. + */ + if (budget) + budget--; + + } while (budget); + + ocrdma_ring_eq_db(dev, eq->q.id, true, true, 0); + return IRQ_HANDLED; +} + +static void ocrdma_post_mqe(struct ocrdma_dev *dev, struct ocrdma_mqe *cmd) +{ + struct ocrdma_mqe *mqe; + + dev->mqe_ctx.tag = dev->mq.sq.head; + dev->mqe_ctx.cmd_done = false; + mqe = ocrdma_get_mqe(dev); + cmd->hdr.tag_lo = dev->mq.sq.head; + ocrdma_copy_cpu_to_le32(mqe, cmd, sizeof(*mqe)); + /* make sure descriptor is written before ringing doorbell */ + wmb(); + ocrdma_mq_inc_head(dev); + ocrdma_ring_mq_db(dev); +} + +static int ocrdma_wait_mqe_cmpl(struct ocrdma_dev *dev) +{ + long status; + /* 30 sec timeout */ + status = wait_event_timeout(dev->mqe_ctx.cmd_wait, + (dev->mqe_ctx.cmd_done != false), + msecs_to_jiffies(30000)); + if (status) + return 0; + else { + dev->mqe_ctx.fw_error_state = true; + pr_err("%s(%d) mailbox timeout: fw not responding\n", + __func__, dev->id); + return -1; + } +} + +/* issue a mailbox command on the MQ */ +static int ocrdma_mbx_cmd(struct ocrdma_dev *dev, struct ocrdma_mqe *mqe) +{ + int status = 0; + u16 cqe_status, ext_status; + struct ocrdma_mqe *rsp_mqe; + struct ocrdma_mbx_rsp *rsp = NULL; + + mutex_lock(&dev->mqe_ctx.lock); + if (dev->mqe_ctx.fw_error_state) + goto mbx_err; + ocrdma_post_mqe(dev, mqe); + status = ocrdma_wait_mqe_cmpl(dev); + if (status) + goto mbx_err; + cqe_status = dev->mqe_ctx.cqe_status; + ext_status = dev->mqe_ctx.ext_status; + rsp_mqe = ocrdma_get_mqe_rsp(dev); + ocrdma_copy_le32_to_cpu(mqe, rsp_mqe, (sizeof(*mqe))); + if ((mqe->hdr.spcl_sge_cnt_emb & OCRDMA_MQE_HDR_EMB_MASK) >> + OCRDMA_MQE_HDR_EMB_SHIFT) + rsp = &mqe->u.rsp; + + if (cqe_status || ext_status) { + pr_err("%s() cqe_status=0x%x, ext_status=0x%x,", + __func__, cqe_status, ext_status); + if (rsp) { + /* This is for embedded cmds. */ + pr_err("opcode=0x%x, subsystem=0x%x\n", + (rsp->subsys_op & OCRDMA_MBX_RSP_OPCODE_MASK) >> + OCRDMA_MBX_RSP_OPCODE_SHIFT, + (rsp->subsys_op & OCRDMA_MBX_RSP_SUBSYS_MASK) >> + OCRDMA_MBX_RSP_SUBSYS_SHIFT); + } + status = ocrdma_get_mbx_cqe_errno(cqe_status); + goto mbx_err; + } + /* For non embedded, rsp errors are handled in ocrdma_nonemb_mbx_cmd */ + if (rsp && (mqe->u.rsp.status & OCRDMA_MBX_RSP_STATUS_MASK)) + status = ocrdma_get_mbx_errno(mqe->u.rsp.status); +mbx_err: + mutex_unlock(&dev->mqe_ctx.lock); + return status; +} + +static int ocrdma_nonemb_mbx_cmd(struct ocrdma_dev *dev, struct ocrdma_mqe *mqe, + void *payload_va) +{ + int status = 0; + struct ocrdma_mbx_rsp *rsp = payload_va; + + if ((mqe->hdr.spcl_sge_cnt_emb & OCRDMA_MQE_HDR_EMB_MASK) >> + OCRDMA_MQE_HDR_EMB_SHIFT) + BUG(); + + status = ocrdma_mbx_cmd(dev, mqe); + if (!status) + /* For non embedded, only CQE failures are handled in + * ocrdma_mbx_cmd. We need to check for RSP errors. + */ + if (rsp->status & OCRDMA_MBX_RSP_STATUS_MASK) + status = ocrdma_get_mbx_errno(rsp->status); + + if (status) + pr_err("opcode=0x%x, subsystem=0x%x\n", + (rsp->subsys_op & OCRDMA_MBX_RSP_OPCODE_MASK) >> + OCRDMA_MBX_RSP_OPCODE_SHIFT, + (rsp->subsys_op & OCRDMA_MBX_RSP_SUBSYS_MASK) >> + OCRDMA_MBX_RSP_SUBSYS_SHIFT); + return status; +} + +static void ocrdma_get_attr(struct ocrdma_dev *dev, + struct ocrdma_dev_attr *attr, + struct ocrdma_mbx_query_config *rsp) +{ + attr->max_pd = + (rsp->max_pd_ca_ack_delay & OCRDMA_MBX_QUERY_CFG_MAX_PD_MASK) >> + OCRDMA_MBX_QUERY_CFG_MAX_PD_SHIFT; + attr->max_qp = + (rsp->qp_srq_cq_ird_ord & OCRDMA_MBX_QUERY_CFG_MAX_QP_MASK) >> + OCRDMA_MBX_QUERY_CFG_MAX_QP_SHIFT; + attr->max_srq = + (rsp->max_srq_rpir_qps & OCRDMA_MBX_QUERY_CFG_MAX_SRQ_MASK) >> + OCRDMA_MBX_QUERY_CFG_MAX_SRQ_OFFSET; + attr->max_send_sge = ((rsp->max_write_send_sge & + OCRDMA_MBX_QUERY_CFG_MAX_SEND_SGE_MASK) >> + OCRDMA_MBX_QUERY_CFG_MAX_SEND_SGE_SHIFT); + attr->max_recv_sge = (rsp->max_write_send_sge & + OCRDMA_MBX_QUERY_CFG_MAX_SEND_SGE_MASK) >> + OCRDMA_MBX_QUERY_CFG_MAX_SEND_SGE_SHIFT; + attr->max_srq_sge = (rsp->max_srq_rqe_sge & + OCRDMA_MBX_QUERY_CFG_MAX_SRQ_SGE_MASK) >> + OCRDMA_MBX_QUERY_CFG_MAX_SRQ_SGE_OFFSET; + attr->max_rdma_sge = (rsp->max_write_send_sge & + OCRDMA_MBX_QUERY_CFG_MAX_WRITE_SGE_MASK) >> + OCRDMA_MBX_QUERY_CFG_MAX_WRITE_SGE_SHIFT; + attr->max_ord_per_qp = (rsp->max_ird_ord_per_qp & + OCRDMA_MBX_QUERY_CFG_MAX_ORD_PER_QP_MASK) >> + OCRDMA_MBX_QUERY_CFG_MAX_ORD_PER_QP_SHIFT; + attr->max_ird_per_qp = (rsp->max_ird_ord_per_qp & + OCRDMA_MBX_QUERY_CFG_MAX_IRD_PER_QP_MASK) >> + OCRDMA_MBX_QUERY_CFG_MAX_IRD_PER_QP_SHIFT; + attr->cq_overflow_detect = (rsp->qp_srq_cq_ird_ord & + OCRDMA_MBX_QUERY_CFG_CQ_OVERFLOW_MASK) >> + OCRDMA_MBX_QUERY_CFG_CQ_OVERFLOW_SHIFT; + attr->srq_supported = (rsp->qp_srq_cq_ird_ord & + OCRDMA_MBX_QUERY_CFG_SRQ_SUPPORTED_MASK) >> + OCRDMA_MBX_QUERY_CFG_SRQ_SUPPORTED_SHIFT; + attr->local_ca_ack_delay = (rsp->max_pd_ca_ack_delay & + OCRDMA_MBX_QUERY_CFG_CA_ACK_DELAY_MASK) >> + OCRDMA_MBX_QUERY_CFG_CA_ACK_DELAY_SHIFT; + attr->max_mw = rsp->max_mw; + attr->max_mr = rsp->max_mr; + attr->max_mr_size = ((u64)rsp->max_mr_size_hi << 32) | + rsp->max_mr_size_lo; + attr->max_fmr = 0; + attr->max_pages_per_frmr = rsp->max_pages_per_frmr; + attr->max_num_mr_pbl = rsp->max_num_mr_pbl; + attr->max_cqe = rsp->max_cq_cqes_per_cq & + OCRDMA_MBX_QUERY_CFG_MAX_CQES_PER_CQ_MASK; + attr->max_cq = (rsp->max_cq_cqes_per_cq & + OCRDMA_MBX_QUERY_CFG_MAX_CQ_MASK) >> + OCRDMA_MBX_QUERY_CFG_MAX_CQ_OFFSET; + attr->wqe_size = ((rsp->wqe_rqe_stride_max_dpp_cqs & + OCRDMA_MBX_QUERY_CFG_MAX_WQE_SIZE_MASK) >> + OCRDMA_MBX_QUERY_CFG_MAX_WQE_SIZE_OFFSET) * + OCRDMA_WQE_STRIDE; + attr->rqe_size = ((rsp->wqe_rqe_stride_max_dpp_cqs & + OCRDMA_MBX_QUERY_CFG_MAX_RQE_SIZE_MASK) >> + OCRDMA_MBX_QUERY_CFG_MAX_RQE_SIZE_OFFSET) * + OCRDMA_WQE_STRIDE; + attr->max_inline_data = + attr->wqe_size - (sizeof(struct ocrdma_hdr_wqe) + + sizeof(struct ocrdma_sge)); + if (ocrdma_get_asic_type(dev) == OCRDMA_ASIC_GEN_SKH_R) { + attr->ird = 1; + attr->ird_page_size = OCRDMA_MIN_Q_PAGE_SIZE; + attr->num_ird_pages = MAX_OCRDMA_IRD_PAGES; + } + dev->attr.max_wqe = rsp->max_wqes_rqes_per_q >> + OCRDMA_MBX_QUERY_CFG_MAX_WQES_PER_WQ_OFFSET; + dev->attr.max_rqe = rsp->max_wqes_rqes_per_q & + OCRDMA_MBX_QUERY_CFG_MAX_RQES_PER_RQ_MASK; +} + +static int ocrdma_check_fw_config(struct ocrdma_dev *dev, + struct ocrdma_fw_conf_rsp *conf) +{ + u32 fn_mode; + + fn_mode = conf->fn_mode & OCRDMA_FN_MODE_RDMA; + if (fn_mode != OCRDMA_FN_MODE_RDMA) + return -EINVAL; + dev->base_eqid = conf->base_eqid; + dev->max_eq = conf->max_eq; + return 0; +} + +/* can be issued only during init time. */ +static int ocrdma_mbx_query_fw_ver(struct ocrdma_dev *dev) +{ + int status = -ENOMEM; + struct ocrdma_mqe *cmd; + struct ocrdma_fw_ver_rsp *rsp; + + cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_GET_FW_VER, sizeof(*cmd)); + if (!cmd) + return -ENOMEM; + ocrdma_init_mch((struct ocrdma_mbx_hdr *)&cmd->u.cmd[0], + OCRDMA_CMD_GET_FW_VER, + OCRDMA_SUBSYS_COMMON, sizeof(*cmd)); + + status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd); + if (status) + goto mbx_err; + rsp = (struct ocrdma_fw_ver_rsp *)cmd; + memset(&dev->attr.fw_ver[0], 0, sizeof(dev->attr.fw_ver)); + memcpy(&dev->attr.fw_ver[0], &rsp->running_ver[0], + sizeof(rsp->running_ver)); + ocrdma_le32_to_cpu(dev->attr.fw_ver, sizeof(rsp->running_ver)); +mbx_err: + kfree(cmd); + return status; +} + +/* can be issued only during init time. */ +static int ocrdma_mbx_query_fw_config(struct ocrdma_dev *dev) +{ + int status = -ENOMEM; + struct ocrdma_mqe *cmd; + struct ocrdma_fw_conf_rsp *rsp; + + cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_GET_FW_CONFIG, sizeof(*cmd)); + if (!cmd) + return -ENOMEM; + ocrdma_init_mch((struct ocrdma_mbx_hdr *)&cmd->u.cmd[0], + OCRDMA_CMD_GET_FW_CONFIG, + OCRDMA_SUBSYS_COMMON, sizeof(*cmd)); + status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd); + if (status) + goto mbx_err; + rsp = (struct ocrdma_fw_conf_rsp *)cmd; + status = ocrdma_check_fw_config(dev, rsp); +mbx_err: + kfree(cmd); + return status; +} + +int ocrdma_mbx_rdma_stats(struct ocrdma_dev *dev, bool reset) +{ + struct ocrdma_rdma_stats_req *req = dev->stats_mem.va; + struct ocrdma_mqe *mqe = &dev->stats_mem.mqe; + struct ocrdma_rdma_stats_resp *old_stats; + int status; + + old_stats = kmalloc(sizeof(*old_stats), GFP_KERNEL); + if (old_stats == NULL) + return -ENOMEM; + + memset(mqe, 0, sizeof(*mqe)); + mqe->hdr.pyld_len = dev->stats_mem.size; + mqe->hdr.spcl_sge_cnt_emb |= + (1 << OCRDMA_MQE_HDR_SGE_CNT_SHIFT) & + OCRDMA_MQE_HDR_SGE_CNT_MASK; + mqe->u.nonemb_req.sge[0].pa_lo = (u32) (dev->stats_mem.pa & 0xffffffff); + mqe->u.nonemb_req.sge[0].pa_hi = (u32) upper_32_bits(dev->stats_mem.pa); + mqe->u.nonemb_req.sge[0].len = dev->stats_mem.size; + + /* Cache the old stats */ + memcpy(old_stats, req, sizeof(struct ocrdma_rdma_stats_resp)); + memset(req, 0, dev->stats_mem.size); + + ocrdma_init_mch((struct ocrdma_mbx_hdr *)req, + OCRDMA_CMD_GET_RDMA_STATS, + OCRDMA_SUBSYS_ROCE, + dev->stats_mem.size); + if (reset) + req->reset_stats = reset; + + status = ocrdma_nonemb_mbx_cmd(dev, mqe, dev->stats_mem.va); + if (status) + /* Copy from cache, if mbox fails */ + memcpy(req, old_stats, sizeof(struct ocrdma_rdma_stats_resp)); + else + ocrdma_le32_to_cpu(req, dev->stats_mem.size); + + kfree(old_stats); + return status; +} + +static int ocrdma_mbx_get_ctrl_attribs(struct ocrdma_dev *dev) +{ + int status = -ENOMEM; + struct ocrdma_dma_mem dma; + struct ocrdma_mqe *mqe; + struct ocrdma_get_ctrl_attribs_rsp *ctrl_attr_rsp; + struct mgmt_hba_attribs *hba_attribs; + + mqe = kzalloc(sizeof(struct ocrdma_mqe), GFP_KERNEL); + if (!mqe) + return status; + + dma.size = sizeof(struct ocrdma_get_ctrl_attribs_rsp); + dma.va = dma_alloc_coherent(&dev->nic_info.pdev->dev, + dma.size, &dma.pa, GFP_KERNEL); + if (!dma.va) + goto free_mqe; + + mqe->hdr.pyld_len = dma.size; + mqe->hdr.spcl_sge_cnt_emb |= + (1 << OCRDMA_MQE_HDR_SGE_CNT_SHIFT) & + OCRDMA_MQE_HDR_SGE_CNT_MASK; + mqe->u.nonemb_req.sge[0].pa_lo = (u32) (dma.pa & 0xffffffff); + mqe->u.nonemb_req.sge[0].pa_hi = (u32) upper_32_bits(dma.pa); + mqe->u.nonemb_req.sge[0].len = dma.size; + + memset(dma.va, 0, dma.size); + ocrdma_init_mch((struct ocrdma_mbx_hdr *)dma.va, + OCRDMA_CMD_GET_CTRL_ATTRIBUTES, + OCRDMA_SUBSYS_COMMON, + dma.size); + + status = ocrdma_nonemb_mbx_cmd(dev, mqe, dma.va); + if (!status) { + ctrl_attr_rsp = (struct ocrdma_get_ctrl_attribs_rsp *)dma.va; + hba_attribs = &ctrl_attr_rsp->ctrl_attribs.hba_attribs; + + dev->hba_port_num = (hba_attribs->ptpnum_maxdoms_hbast_cv & + OCRDMA_HBA_ATTRB_PTNUM_MASK) + >> OCRDMA_HBA_ATTRB_PTNUM_SHIFT; + strncpy(dev->model_number, + hba_attribs->controller_model_number, 31); + } + dma_free_coherent(&dev->nic_info.pdev->dev, dma.size, dma.va, dma.pa); +free_mqe: + kfree(mqe); + return status; +} + +static int ocrdma_mbx_query_dev(struct ocrdma_dev *dev) +{ + int status = -ENOMEM; + struct ocrdma_mbx_query_config *rsp; + struct ocrdma_mqe *cmd; + + cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_QUERY_CONFIG, sizeof(*cmd)); + if (!cmd) + return status; + status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd); + if (status) + goto mbx_err; + rsp = (struct ocrdma_mbx_query_config *)cmd; + ocrdma_get_attr(dev, &dev->attr, rsp); +mbx_err: + kfree(cmd); + return status; +} + +int ocrdma_mbx_get_link_speed(struct ocrdma_dev *dev, u8 *lnk_speed) +{ + int status = -ENOMEM; + struct ocrdma_get_link_speed_rsp *rsp; + struct ocrdma_mqe *cmd; + + cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_QUERY_NTWK_LINK_CONFIG_V1, + sizeof(*cmd)); + if (!cmd) + return status; + ocrdma_init_mch((struct ocrdma_mbx_hdr *)&cmd->u.cmd[0], + OCRDMA_CMD_QUERY_NTWK_LINK_CONFIG_V1, + OCRDMA_SUBSYS_COMMON, sizeof(*cmd)); + + ((struct ocrdma_mbx_hdr *)cmd->u.cmd)->rsvd_version = 0x1; + + status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd); + if (status) + goto mbx_err; + + rsp = (struct ocrdma_get_link_speed_rsp *)cmd; + *lnk_speed = (rsp->pflt_pps_ld_pnum & OCRDMA_PHY_PS_MASK) + >> OCRDMA_PHY_PS_SHIFT; + +mbx_err: + kfree(cmd); + return status; +} + +static int ocrdma_mbx_get_phy_info(struct ocrdma_dev *dev) +{ + int status = -ENOMEM; + struct ocrdma_mqe *cmd; + struct ocrdma_get_phy_info_rsp *rsp; + + cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_PHY_DETAILS, sizeof(*cmd)); + if (!cmd) + return status; + + ocrdma_init_mch((struct ocrdma_mbx_hdr *)&cmd->u.cmd[0], + OCRDMA_CMD_PHY_DETAILS, OCRDMA_SUBSYS_COMMON, + sizeof(*cmd)); + + status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd); + if (status) + goto mbx_err; + + rsp = (struct ocrdma_get_phy_info_rsp *)cmd; + dev->phy.phy_type = + (rsp->ityp_ptyp & OCRDMA_PHY_TYPE_MASK); + dev->phy.interface_type = + (rsp->ityp_ptyp & OCRDMA_IF_TYPE_MASK) + >> OCRDMA_IF_TYPE_SHIFT; + dev->phy.auto_speeds_supported = + (rsp->fspeed_aspeed & OCRDMA_ASPEED_SUPP_MASK); + dev->phy.fixed_speeds_supported = + (rsp->fspeed_aspeed & OCRDMA_FSPEED_SUPP_MASK) + >> OCRDMA_FSPEED_SUPP_SHIFT; +mbx_err: + kfree(cmd); + return status; +} + +int ocrdma_mbx_alloc_pd(struct ocrdma_dev *dev, struct ocrdma_pd *pd) +{ + int status = -ENOMEM; + struct ocrdma_alloc_pd *cmd; + struct ocrdma_alloc_pd_rsp *rsp; + + cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_ALLOC_PD, sizeof(*cmd)); + if (!cmd) + return status; + if (pd->dpp_enabled) + cmd->enable_dpp_rsvd |= OCRDMA_ALLOC_PD_ENABLE_DPP; + status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd); + if (status) + goto mbx_err; + rsp = (struct ocrdma_alloc_pd_rsp *)cmd; + pd->id = rsp->dpp_page_pdid & OCRDMA_ALLOC_PD_RSP_PDID_MASK; + if (rsp->dpp_page_pdid & OCRDMA_ALLOC_PD_RSP_DPP) { + pd->dpp_enabled = true; + pd->dpp_page = rsp->dpp_page_pdid >> + OCRDMA_ALLOC_PD_RSP_DPP_PAGE_SHIFT; + } else { + pd->dpp_enabled = false; + pd->num_dpp_qp = 0; + } +mbx_err: + kfree(cmd); + return status; +} + +int ocrdma_mbx_dealloc_pd(struct ocrdma_dev *dev, struct ocrdma_pd *pd) +{ + int status = -ENOMEM; + struct ocrdma_dealloc_pd *cmd; + + cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_DEALLOC_PD, sizeof(*cmd)); + if (!cmd) + return status; + cmd->id = pd->id; + status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd); + kfree(cmd); + return status; +} + +static int ocrdma_build_q_conf(u32 *num_entries, int entry_size, + int *num_pages, int *page_size) +{ + int i; + int mem_size; + + *num_entries = roundup_pow_of_two(*num_entries); + mem_size = *num_entries * entry_size; + /* find the possible lowest possible multiplier */ + for (i = 0; i < OCRDMA_MAX_Q_PAGE_SIZE_CNT; i++) { + if (mem_size <= (OCRDMA_Q_PAGE_BASE_SIZE << i)) + break; + } + if (i >= OCRDMA_MAX_Q_PAGE_SIZE_CNT) + return -EINVAL; + mem_size = roundup(mem_size, + ((OCRDMA_Q_PAGE_BASE_SIZE << i) / OCRDMA_MAX_Q_PAGES)); + *num_pages = + mem_size / ((OCRDMA_Q_PAGE_BASE_SIZE << i) / OCRDMA_MAX_Q_PAGES); + *page_size = ((OCRDMA_Q_PAGE_BASE_SIZE << i) / OCRDMA_MAX_Q_PAGES); + *num_entries = mem_size / entry_size; + return 0; +} + +static int ocrdma_mbx_create_ah_tbl(struct ocrdma_dev *dev) +{ + int i; + int status = 0; + int max_ah; + struct ocrdma_create_ah_tbl *cmd; + struct ocrdma_create_ah_tbl_rsp *rsp; + struct pci_dev *pdev = dev->nic_info.pdev; + dma_addr_t pa; + struct ocrdma_pbe *pbes; + + cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_CREATE_AH_TBL, sizeof(*cmd)); + if (!cmd) + return status; + + max_ah = OCRDMA_MAX_AH; + dev->av_tbl.size = sizeof(struct ocrdma_av) * max_ah; + + /* number of PBEs in PBL */ + cmd->ah_conf = (OCRDMA_AH_TBL_PAGES << + OCRDMA_CREATE_AH_NUM_PAGES_SHIFT) & + OCRDMA_CREATE_AH_NUM_PAGES_MASK; + + /* page size */ + for (i = 0; i < OCRDMA_MAX_Q_PAGE_SIZE_CNT; i++) { + if (PAGE_SIZE == (OCRDMA_MIN_Q_PAGE_SIZE << i)) + break; + } + cmd->ah_conf |= (i << OCRDMA_CREATE_AH_PAGE_SIZE_SHIFT) & + OCRDMA_CREATE_AH_PAGE_SIZE_MASK; + + /* ah_entry size */ + cmd->ah_conf |= (sizeof(struct ocrdma_av) << + OCRDMA_CREATE_AH_ENTRY_SIZE_SHIFT) & + OCRDMA_CREATE_AH_ENTRY_SIZE_MASK; + + dev->av_tbl.pbl.va = dma_alloc_coherent(&pdev->dev, PAGE_SIZE, + &dev->av_tbl.pbl.pa, + GFP_KERNEL); + if (dev->av_tbl.pbl.va == NULL) + goto mem_err; + + dev->av_tbl.va = dma_alloc_coherent(&pdev->dev, dev->av_tbl.size, + &pa, GFP_KERNEL); + if (dev->av_tbl.va == NULL) + goto mem_err_ah; + dev->av_tbl.pa = pa; + dev->av_tbl.num_ah = max_ah; + memset(dev->av_tbl.va, 0, dev->av_tbl.size); + + pbes = (struct ocrdma_pbe *)dev->av_tbl.pbl.va; + for (i = 0; i < dev->av_tbl.size / OCRDMA_MIN_Q_PAGE_SIZE; i++) { + pbes[i].pa_lo = (u32)cpu_to_le32(pa & 0xffffffff); + pbes[i].pa_hi = (u32)cpu_to_le32(upper_32_bits(pa)); + pa += PAGE_SIZE; + } + cmd->tbl_addr[0].lo = (u32)(dev->av_tbl.pbl.pa & 0xFFFFFFFF); + cmd->tbl_addr[0].hi = (u32)upper_32_bits(dev->av_tbl.pbl.pa); + status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd); + if (status) + goto mbx_err; + rsp = (struct ocrdma_create_ah_tbl_rsp *)cmd; + dev->av_tbl.ahid = rsp->ahid & 0xFFFF; + kfree(cmd); + return 0; + +mbx_err: + dma_free_coherent(&pdev->dev, dev->av_tbl.size, dev->av_tbl.va, + dev->av_tbl.pa); + dev->av_tbl.va = NULL; +mem_err_ah: + dma_free_coherent(&pdev->dev, PAGE_SIZE, dev->av_tbl.pbl.va, + dev->av_tbl.pbl.pa); + dev->av_tbl.pbl.va = NULL; + dev->av_tbl.size = 0; +mem_err: + kfree(cmd); + return status; +} + +static void ocrdma_mbx_delete_ah_tbl(struct ocrdma_dev *dev) +{ + struct ocrdma_delete_ah_tbl *cmd; + struct pci_dev *pdev = dev->nic_info.pdev; + + if (dev->av_tbl.va == NULL) + return; + + cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_DELETE_AH_TBL, sizeof(*cmd)); + if (!cmd) + return; + cmd->ahid = dev->av_tbl.ahid; + + ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd); + dma_free_coherent(&pdev->dev, dev->av_tbl.size, dev->av_tbl.va, + dev->av_tbl.pa); + dev->av_tbl.va = NULL; + dma_free_coherent(&pdev->dev, PAGE_SIZE, dev->av_tbl.pbl.va, + dev->av_tbl.pbl.pa); + kfree(cmd); +} + +/* Multiple CQs uses the EQ. This routine returns least used + * EQ to associate with CQ. This will distributes the interrupt + * processing and CPU load to associated EQ, vector and so to that CPU. + */ +static u16 ocrdma_bind_eq(struct ocrdma_dev *dev) +{ + int i, selected_eq = 0, cq_cnt = 0; + u16 eq_id; + + mutex_lock(&dev->dev_lock); + cq_cnt = dev->eq_tbl[0].cq_cnt; + eq_id = dev->eq_tbl[0].q.id; + /* find the EQ which is has the least number of + * CQs associated with it. + */ + for (i = 0; i < dev->eq_cnt; i++) { + if (dev->eq_tbl[i].cq_cnt < cq_cnt) { + cq_cnt = dev->eq_tbl[i].cq_cnt; + eq_id = dev->eq_tbl[i].q.id; + selected_eq = i; + } + } + dev->eq_tbl[selected_eq].cq_cnt += 1; + mutex_unlock(&dev->dev_lock); + return eq_id; +} + +static void ocrdma_unbind_eq(struct ocrdma_dev *dev, u16 eq_id) +{ + int i; + + mutex_lock(&dev->dev_lock); + i = ocrdma_get_eq_table_index(dev, eq_id); + if (i == -EINVAL) + BUG(); + dev->eq_tbl[i].cq_cnt -= 1; + mutex_unlock(&dev->dev_lock); +} + +int ocrdma_mbx_create_cq(struct ocrdma_dev *dev, struct ocrdma_cq *cq, + int entries, int dpp_cq, u16 pd_id) +{ + int status = -ENOMEM; int max_hw_cqe; + struct pci_dev *pdev = dev->nic_info.pdev; + struct ocrdma_create_cq *cmd; + struct ocrdma_create_cq_rsp *rsp; + u32 hw_pages, cqe_size, page_size, cqe_count; + + if (entries > dev->attr.max_cqe) { + pr_err("%s(%d) max_cqe=0x%x, requester_cqe=0x%x\n", + __func__, dev->id, dev->attr.max_cqe, entries); + return -EINVAL; + } + if (dpp_cq && (ocrdma_get_asic_type(dev) != OCRDMA_ASIC_GEN_SKH_R)) + return -EINVAL; + + if (dpp_cq) { + cq->max_hw_cqe = 1; + max_hw_cqe = 1; + cqe_size = OCRDMA_DPP_CQE_SIZE; + hw_pages = 1; + } else { + cq->max_hw_cqe = dev->attr.max_cqe; + max_hw_cqe = dev->attr.max_cqe; + cqe_size = sizeof(struct ocrdma_cqe); + hw_pages = OCRDMA_CREATE_CQ_MAX_PAGES; + } + + cq->len = roundup(max_hw_cqe * cqe_size, OCRDMA_MIN_Q_PAGE_SIZE); + + cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_CREATE_CQ, sizeof(*cmd)); + if (!cmd) + return -ENOMEM; + ocrdma_init_mch(&cmd->cmd.req, OCRDMA_CMD_CREATE_CQ, + OCRDMA_SUBSYS_COMMON, sizeof(*cmd)); + cq->va = dma_alloc_coherent(&pdev->dev, cq->len, &cq->pa, GFP_KERNEL); + if (!cq->va) { + status = -ENOMEM; + goto mem_err; + } + memset(cq->va, 0, cq->len); + page_size = cq->len / hw_pages; + cmd->cmd.pgsz_pgcnt = (page_size / OCRDMA_MIN_Q_PAGE_SIZE) << + OCRDMA_CREATE_CQ_PAGE_SIZE_SHIFT; + cmd->cmd.pgsz_pgcnt |= hw_pages; + cmd->cmd.ev_cnt_flags = OCRDMA_CREATE_CQ_DEF_FLAGS; + + cq->eqn = ocrdma_bind_eq(dev); + cmd->cmd.req.rsvd_version = OCRDMA_CREATE_CQ_VER3; + cqe_count = cq->len / cqe_size; + cq->cqe_cnt = cqe_count; + if (cqe_count > 1024) { + /* Set cnt to 3 to indicate more than 1024 cq entries */ + cmd->cmd.ev_cnt_flags |= (0x3 << OCRDMA_CREATE_CQ_CNT_SHIFT); + } else { + u8 count = 0; + switch (cqe_count) { + case 256: + count = 0; + break; + case 512: + count = 1; + break; + case 1024: + count = 2; + break; + default: + goto mbx_err; + } + cmd->cmd.ev_cnt_flags |= (count << OCRDMA_CREATE_CQ_CNT_SHIFT); + } + /* shared eq between all the consumer cqs. */ + cmd->cmd.eqn = cq->eqn; + if (ocrdma_get_asic_type(dev) == OCRDMA_ASIC_GEN_SKH_R) { + if (dpp_cq) + cmd->cmd.pgsz_pgcnt |= OCRDMA_CREATE_CQ_DPP << + OCRDMA_CREATE_CQ_TYPE_SHIFT; + cq->phase_change = false; + cmd->cmd.pdid_cqecnt = (cq->len / cqe_size); + } else { + cmd->cmd.pdid_cqecnt = (cq->len / cqe_size) - 1; + cmd->cmd.ev_cnt_flags |= OCRDMA_CREATE_CQ_FLAGS_AUTO_VALID; + cq->phase_change = true; + } + + /* pd_id valid only for v3 */ + cmd->cmd.pdid_cqecnt |= (pd_id << + OCRDMA_CREATE_CQ_CMD_PDID_SHIFT); + ocrdma_build_q_pages(&cmd->cmd.pa[0], hw_pages, cq->pa, page_size); + status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd); + if (status) + goto mbx_err; + + rsp = (struct ocrdma_create_cq_rsp *)cmd; + cq->id = (u16) (rsp->rsp.cq_id & OCRDMA_CREATE_CQ_RSP_CQ_ID_MASK); + kfree(cmd); + return 0; +mbx_err: + ocrdma_unbind_eq(dev, cq->eqn); + dma_free_coherent(&pdev->dev, cq->len, cq->va, cq->pa); +mem_err: + kfree(cmd); + return status; +} + +int ocrdma_mbx_destroy_cq(struct ocrdma_dev *dev, struct ocrdma_cq *cq) +{ + int status = -ENOMEM; + struct ocrdma_destroy_cq *cmd; + + cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_DELETE_CQ, sizeof(*cmd)); + if (!cmd) + return status; + ocrdma_init_mch(&cmd->req, OCRDMA_CMD_DELETE_CQ, + OCRDMA_SUBSYS_COMMON, sizeof(*cmd)); + + cmd->bypass_flush_qid |= + (cq->id << OCRDMA_DESTROY_CQ_QID_SHIFT) & + OCRDMA_DESTROY_CQ_QID_MASK; + + status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd); + ocrdma_unbind_eq(dev, cq->eqn); + dma_free_coherent(&dev->nic_info.pdev->dev, cq->len, cq->va, cq->pa); + kfree(cmd); + return status; +} + +int ocrdma_mbx_alloc_lkey(struct ocrdma_dev *dev, struct ocrdma_hw_mr *hwmr, + u32 pdid, int addr_check) +{ + int status = -ENOMEM; + struct ocrdma_alloc_lkey *cmd; + struct ocrdma_alloc_lkey_rsp *rsp; + + cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_ALLOC_LKEY, sizeof(*cmd)); + if (!cmd) + return status; + cmd->pdid = pdid; + cmd->pbl_sz_flags |= addr_check; + cmd->pbl_sz_flags |= (hwmr->fr_mr << OCRDMA_ALLOC_LKEY_FMR_SHIFT); + cmd->pbl_sz_flags |= + (hwmr->remote_wr << OCRDMA_ALLOC_LKEY_REMOTE_WR_SHIFT); + cmd->pbl_sz_flags |= + (hwmr->remote_rd << OCRDMA_ALLOC_LKEY_REMOTE_RD_SHIFT); + cmd->pbl_sz_flags |= + (hwmr->local_wr << OCRDMA_ALLOC_LKEY_LOCAL_WR_SHIFT); + cmd->pbl_sz_flags |= + (hwmr->remote_atomic << OCRDMA_ALLOC_LKEY_REMOTE_ATOMIC_SHIFT); + cmd->pbl_sz_flags |= + (hwmr->num_pbls << OCRDMA_ALLOC_LKEY_PBL_SIZE_SHIFT); + + status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd); + if (status) + goto mbx_err; + rsp = (struct ocrdma_alloc_lkey_rsp *)cmd; + hwmr->lkey = rsp->lrkey; +mbx_err: + kfree(cmd); + return status; +} + +int ocrdma_mbx_dealloc_lkey(struct ocrdma_dev *dev, int fr_mr, u32 lkey) +{ + int status = -ENOMEM; + struct ocrdma_dealloc_lkey *cmd; + + cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_DEALLOC_LKEY, sizeof(*cmd)); + if (!cmd) + return -ENOMEM; + cmd->lkey = lkey; + cmd->rsvd_frmr = fr_mr ? 1 : 0; + status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd); + if (status) + goto mbx_err; +mbx_err: + kfree(cmd); + return status; +} + +static int ocrdma_mbx_reg_mr(struct ocrdma_dev *dev, struct ocrdma_hw_mr *hwmr, + u32 pdid, u32 pbl_cnt, u32 pbe_size, u32 last) +{ + int status = -ENOMEM; + int i; + struct ocrdma_reg_nsmr *cmd; + struct ocrdma_reg_nsmr_rsp *rsp; + + cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_REGISTER_NSMR, sizeof(*cmd)); + if (!cmd) + return -ENOMEM; + cmd->num_pbl_pdid = + pdid | (hwmr->num_pbls << OCRDMA_REG_NSMR_NUM_PBL_SHIFT); + cmd->fr_mr = hwmr->fr_mr; + + cmd->flags_hpage_pbe_sz |= (hwmr->remote_wr << + OCRDMA_REG_NSMR_REMOTE_WR_SHIFT); + cmd->flags_hpage_pbe_sz |= (hwmr->remote_rd << + OCRDMA_REG_NSMR_REMOTE_RD_SHIFT); + cmd->flags_hpage_pbe_sz |= (hwmr->local_wr << + OCRDMA_REG_NSMR_LOCAL_WR_SHIFT); + cmd->flags_hpage_pbe_sz |= (hwmr->remote_atomic << + OCRDMA_REG_NSMR_REMOTE_ATOMIC_SHIFT); + cmd->flags_hpage_pbe_sz |= (hwmr->mw_bind << + OCRDMA_REG_NSMR_BIND_MEMWIN_SHIFT); + cmd->flags_hpage_pbe_sz |= (last << OCRDMA_REG_NSMR_LAST_SHIFT); + + cmd->flags_hpage_pbe_sz |= (hwmr->pbe_size / OCRDMA_MIN_HPAGE_SIZE); + cmd->flags_hpage_pbe_sz |= (hwmr->pbl_size / OCRDMA_MIN_HPAGE_SIZE) << + OCRDMA_REG_NSMR_HPAGE_SIZE_SHIFT; + cmd->totlen_low = hwmr->len; + cmd->totlen_high = upper_32_bits(hwmr->len); + cmd->fbo_low = (u32) (hwmr->fbo & 0xffffffff); + cmd->fbo_high = (u32) upper_32_bits(hwmr->fbo); + cmd->va_loaddr = (u32) hwmr->va; + cmd->va_hiaddr = (u32) upper_32_bits(hwmr->va); + + for (i = 0; i < pbl_cnt; i++) { + cmd->pbl[i].lo = (u32) (hwmr->pbl_table[i].pa & 0xffffffff); + cmd->pbl[i].hi = upper_32_bits(hwmr->pbl_table[i].pa); + } + status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd); + if (status) + goto mbx_err; + rsp = (struct ocrdma_reg_nsmr_rsp *)cmd; + hwmr->lkey = rsp->lrkey; +mbx_err: + kfree(cmd); + return status; +} + +static int ocrdma_mbx_reg_mr_cont(struct ocrdma_dev *dev, + struct ocrdma_hw_mr *hwmr, u32 pbl_cnt, + u32 pbl_offset, u32 last) +{ + int status = -ENOMEM; + int i; + struct ocrdma_reg_nsmr_cont *cmd; + + cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_REGISTER_NSMR_CONT, sizeof(*cmd)); + if (!cmd) + return -ENOMEM; + cmd->lrkey = hwmr->lkey; + cmd->num_pbl_offset = (pbl_cnt << OCRDMA_REG_NSMR_CONT_NUM_PBL_SHIFT) | + (pbl_offset & OCRDMA_REG_NSMR_CONT_PBL_SHIFT_MASK); + cmd->last = last << OCRDMA_REG_NSMR_CONT_LAST_SHIFT; + + for (i = 0; i < pbl_cnt; i++) { + cmd->pbl[i].lo = + (u32) (hwmr->pbl_table[i + pbl_offset].pa & 0xffffffff); + cmd->pbl[i].hi = + upper_32_bits(hwmr->pbl_table[i + pbl_offset].pa); + } + status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd); + if (status) + goto mbx_err; +mbx_err: + kfree(cmd); + return status; +} + +int ocrdma_reg_mr(struct ocrdma_dev *dev, + struct ocrdma_hw_mr *hwmr, u32 pdid, int acc) +{ + int status; + u32 last = 0; + u32 cur_pbl_cnt, pbl_offset; + u32 pending_pbl_cnt = hwmr->num_pbls; + + pbl_offset = 0; + cur_pbl_cnt = min(pending_pbl_cnt, MAX_OCRDMA_NSMR_PBL); + if (cur_pbl_cnt == pending_pbl_cnt) + last = 1; + + status = ocrdma_mbx_reg_mr(dev, hwmr, pdid, + cur_pbl_cnt, hwmr->pbe_size, last); + if (status) { + pr_err("%s() status=%d\n", __func__, status); + return status; + } + /* if there is no more pbls to register then exit. */ + if (last) + return 0; + + while (!last) { + pbl_offset += cur_pbl_cnt; + pending_pbl_cnt -= cur_pbl_cnt; + cur_pbl_cnt = min(pending_pbl_cnt, MAX_OCRDMA_NSMR_PBL); + /* if we reach the end of the pbls, then need to set the last + * bit, indicating no more pbls to register for this memory key. + */ + if (cur_pbl_cnt == pending_pbl_cnt) + last = 1; + + status = ocrdma_mbx_reg_mr_cont(dev, hwmr, cur_pbl_cnt, + pbl_offset, last); + if (status) + break; + } + if (status) + pr_err("%s() err. status=%d\n", __func__, status); + + return status; +} + +bool ocrdma_is_qp_in_sq_flushlist(struct ocrdma_cq *cq, struct ocrdma_qp *qp) +{ + struct ocrdma_qp *tmp; + bool found = false; + list_for_each_entry(tmp, &cq->sq_head, sq_entry) { + if (qp == tmp) { + found = true; + break; + } + } + return found; +} + +bool ocrdma_is_qp_in_rq_flushlist(struct ocrdma_cq *cq, struct ocrdma_qp *qp) +{ + struct ocrdma_qp *tmp; + bool found = false; + list_for_each_entry(tmp, &cq->rq_head, rq_entry) { + if (qp == tmp) { + found = true; + break; + } + } + return found; +} + +void ocrdma_flush_qp(struct ocrdma_qp *qp) +{ + bool found; + unsigned long flags; + + spin_lock_irqsave(&qp->dev->flush_q_lock, flags); + found = ocrdma_is_qp_in_sq_flushlist(qp->sq_cq, qp); + if (!found) + list_add_tail(&qp->sq_entry, &qp->sq_cq->sq_head); + if (!qp->srq) { + found = ocrdma_is_qp_in_rq_flushlist(qp->rq_cq, qp); + if (!found) + list_add_tail(&qp->rq_entry, &qp->rq_cq->rq_head); + } + spin_unlock_irqrestore(&qp->dev->flush_q_lock, flags); +} + +static void ocrdma_init_hwq_ptr(struct ocrdma_qp *qp) +{ + qp->sq.head = 0; + qp->sq.tail = 0; + qp->rq.head = 0; + qp->rq.tail = 0; +} + +int ocrdma_qp_state_change(struct ocrdma_qp *qp, enum ib_qp_state new_ib_state, + enum ib_qp_state *old_ib_state) +{ + unsigned long flags; + int status = 0; + enum ocrdma_qp_state new_state; + new_state = get_ocrdma_qp_state(new_ib_state); + + /* sync with wqe and rqe posting */ + spin_lock_irqsave(&qp->q_lock, flags); + + if (old_ib_state) + *old_ib_state = get_ibqp_state(qp->state); + if (new_state == qp->state) { + spin_unlock_irqrestore(&qp->q_lock, flags); + return 1; + } + + + if (new_state == OCRDMA_QPS_INIT) { + ocrdma_init_hwq_ptr(qp); + ocrdma_del_flush_qp(qp); + } else if (new_state == OCRDMA_QPS_ERR) { + ocrdma_flush_qp(qp); + } + + qp->state = new_state; + + spin_unlock_irqrestore(&qp->q_lock, flags); + return status; +} + +static u32 ocrdma_set_create_qp_mbx_access_flags(struct ocrdma_qp *qp) +{ + u32 flags = 0; + if (qp->cap_flags & OCRDMA_QP_INB_RD) + flags |= OCRDMA_CREATE_QP_REQ_INB_RDEN_MASK; + if (qp->cap_flags & OCRDMA_QP_INB_WR) + flags |= OCRDMA_CREATE_QP_REQ_INB_WREN_MASK; + if (qp->cap_flags & OCRDMA_QP_MW_BIND) + flags |= OCRDMA_CREATE_QP_REQ_BIND_MEMWIN_MASK; + if (qp->cap_flags & OCRDMA_QP_LKEY0) + flags |= OCRDMA_CREATE_QP_REQ_ZERO_LKEYEN_MASK; + if (qp->cap_flags & OCRDMA_QP_FAST_REG) + flags |= OCRDMA_CREATE_QP_REQ_FMR_EN_MASK; + return flags; +} + +static int ocrdma_set_create_qp_sq_cmd(struct ocrdma_create_qp_req *cmd, + struct ib_qp_init_attr *attrs, + struct ocrdma_qp *qp) +{ + int status; + u32 len, hw_pages, hw_page_size; + dma_addr_t pa; + struct ocrdma_dev *dev = qp->dev; + struct pci_dev *pdev = dev->nic_info.pdev; + u32 max_wqe_allocated; + u32 max_sges = attrs->cap.max_send_sge; + + /* QP1 may exceed 127 */ + max_wqe_allocated = min_t(u32, attrs->cap.max_send_wr + 1, + dev->attr.max_wqe); + + status = ocrdma_build_q_conf(&max_wqe_allocated, + dev->attr.wqe_size, &hw_pages, &hw_page_size); + if (status) { + pr_err("%s() req. max_send_wr=0x%x\n", __func__, + max_wqe_allocated); + return -EINVAL; + } + qp->sq.max_cnt = max_wqe_allocated; + len = (hw_pages * hw_page_size); + + qp->sq.va = dma_alloc_coherent(&pdev->dev, len, &pa, GFP_KERNEL); + if (!qp->sq.va) + return -EINVAL; + memset(qp->sq.va, 0, len); + qp->sq.len = len; + qp->sq.pa = pa; + qp->sq.entry_size = dev->attr.wqe_size; + ocrdma_build_q_pages(&cmd->wq_addr[0], hw_pages, pa, hw_page_size); + + cmd->type_pgsz_pdn |= (ilog2(hw_page_size / OCRDMA_MIN_Q_PAGE_SIZE) + << OCRDMA_CREATE_QP_REQ_SQ_PAGE_SIZE_SHIFT); + cmd->num_wq_rq_pages |= (hw_pages << + OCRDMA_CREATE_QP_REQ_NUM_WQ_PAGES_SHIFT) & + OCRDMA_CREATE_QP_REQ_NUM_WQ_PAGES_MASK; + cmd->max_sge_send_write |= (max_sges << + OCRDMA_CREATE_QP_REQ_MAX_SGE_SEND_SHIFT) & + OCRDMA_CREATE_QP_REQ_MAX_SGE_SEND_MASK; + cmd->max_sge_send_write |= (max_sges << + OCRDMA_CREATE_QP_REQ_MAX_SGE_WRITE_SHIFT) & + OCRDMA_CREATE_QP_REQ_MAX_SGE_WRITE_MASK; + cmd->max_wqe_rqe |= (ilog2(qp->sq.max_cnt) << + OCRDMA_CREATE_QP_REQ_MAX_WQE_SHIFT) & + OCRDMA_CREATE_QP_REQ_MAX_WQE_MASK; + cmd->wqe_rqe_size |= (dev->attr.wqe_size << + OCRDMA_CREATE_QP_REQ_WQE_SIZE_SHIFT) & + OCRDMA_CREATE_QP_REQ_WQE_SIZE_MASK; + return 0; +} + +static int ocrdma_set_create_qp_rq_cmd(struct ocrdma_create_qp_req *cmd, + struct ib_qp_init_attr *attrs, + struct ocrdma_qp *qp) +{ + int status; + u32 len, hw_pages, hw_page_size; + dma_addr_t pa = 0; + struct ocrdma_dev *dev = qp->dev; + struct pci_dev *pdev = dev->nic_info.pdev; + u32 max_rqe_allocated = attrs->cap.max_recv_wr + 1; + + status = ocrdma_build_q_conf(&max_rqe_allocated, dev->attr.rqe_size, + &hw_pages, &hw_page_size); + if (status) { + pr_err("%s() req. max_recv_wr=0x%x\n", __func__, + attrs->cap.max_recv_wr + 1); + return status; + } + qp->rq.max_cnt = max_rqe_allocated; + len = (hw_pages * hw_page_size); + + qp->rq.va = dma_alloc_coherent(&pdev->dev, len, &pa, GFP_KERNEL); + if (!qp->rq.va) + return -ENOMEM; + memset(qp->rq.va, 0, len); + qp->rq.pa = pa; + qp->rq.len = len; + qp->rq.entry_size = dev->attr.rqe_size; + + ocrdma_build_q_pages(&cmd->rq_addr[0], hw_pages, pa, hw_page_size); + cmd->type_pgsz_pdn |= (ilog2(hw_page_size / OCRDMA_MIN_Q_PAGE_SIZE) << + OCRDMA_CREATE_QP_REQ_RQ_PAGE_SIZE_SHIFT); + cmd->num_wq_rq_pages |= + (hw_pages << OCRDMA_CREATE_QP_REQ_NUM_RQ_PAGES_SHIFT) & + OCRDMA_CREATE_QP_REQ_NUM_RQ_PAGES_MASK; + cmd->max_sge_recv_flags |= (attrs->cap.max_recv_sge << + OCRDMA_CREATE_QP_REQ_MAX_SGE_RECV_SHIFT) & + OCRDMA_CREATE_QP_REQ_MAX_SGE_RECV_MASK; + cmd->max_wqe_rqe |= (ilog2(qp->rq.max_cnt) << + OCRDMA_CREATE_QP_REQ_MAX_RQE_SHIFT) & + OCRDMA_CREATE_QP_REQ_MAX_RQE_MASK; + cmd->wqe_rqe_size |= (dev->attr.rqe_size << + OCRDMA_CREATE_QP_REQ_RQE_SIZE_SHIFT) & + OCRDMA_CREATE_QP_REQ_RQE_SIZE_MASK; + return 0; +} + +static void ocrdma_set_create_qp_dpp_cmd(struct ocrdma_create_qp_req *cmd, + struct ocrdma_pd *pd, + struct ocrdma_qp *qp, + u8 enable_dpp_cq, u16 dpp_cq_id) +{ + pd->num_dpp_qp--; + qp->dpp_enabled = true; + cmd->max_sge_recv_flags |= OCRDMA_CREATE_QP_REQ_ENABLE_DPP_MASK; + if (!enable_dpp_cq) + return; + cmd->max_sge_recv_flags |= OCRDMA_CREATE_QP_REQ_ENABLE_DPP_MASK; + cmd->dpp_credits_cqid = dpp_cq_id; + cmd->dpp_credits_cqid |= OCRDMA_CREATE_QP_REQ_DPP_CREDIT_LIMIT << + OCRDMA_CREATE_QP_REQ_DPP_CREDIT_SHIFT; +} + +static int ocrdma_set_create_qp_ird_cmd(struct ocrdma_create_qp_req *cmd, + struct ocrdma_qp *qp) +{ + struct ocrdma_dev *dev = qp->dev; + struct pci_dev *pdev = dev->nic_info.pdev; + dma_addr_t pa = 0; + int ird_page_size = dev->attr.ird_page_size; + int ird_q_len = dev->attr.num_ird_pages * ird_page_size; + struct ocrdma_hdr_wqe *rqe; + int i = 0; + + if (dev->attr.ird == 0) + return 0; + + qp->ird_q_va = dma_alloc_coherent(&pdev->dev, ird_q_len, + &pa, GFP_KERNEL); + if (!qp->ird_q_va) + return -ENOMEM; + memset(qp->ird_q_va, 0, ird_q_len); + ocrdma_build_q_pages(&cmd->ird_addr[0], dev->attr.num_ird_pages, + pa, ird_page_size); + for (; i < ird_q_len / dev->attr.rqe_size; i++) { + rqe = (struct ocrdma_hdr_wqe *)(qp->ird_q_va + + (i * dev->attr.rqe_size)); + rqe->cw = 0; + rqe->cw |= 2; + rqe->cw |= (OCRDMA_TYPE_LKEY << OCRDMA_WQE_TYPE_SHIFT); + rqe->cw |= (8 << OCRDMA_WQE_SIZE_SHIFT); + rqe->cw |= (8 << OCRDMA_WQE_NXT_WQE_SIZE_SHIFT); + } + return 0; +} + +static void ocrdma_get_create_qp_rsp(struct ocrdma_create_qp_rsp *rsp, + struct ocrdma_qp *qp, + struct ib_qp_init_attr *attrs, + u16 *dpp_offset, u16 *dpp_credit_lmt) +{ + u32 max_wqe_allocated, max_rqe_allocated; + qp->id = rsp->qp_id & OCRDMA_CREATE_QP_RSP_QP_ID_MASK; + qp->rq.dbid = rsp->sq_rq_id & OCRDMA_CREATE_QP_RSP_RQ_ID_MASK; + qp->sq.dbid = rsp->sq_rq_id >> OCRDMA_CREATE_QP_RSP_SQ_ID_SHIFT; + qp->max_ird = rsp->max_ord_ird & OCRDMA_CREATE_QP_RSP_MAX_IRD_MASK; + qp->max_ord = (rsp->max_ord_ird >> OCRDMA_CREATE_QP_RSP_MAX_ORD_SHIFT); + qp->dpp_enabled = false; + if (rsp->dpp_response & OCRDMA_CREATE_QP_RSP_DPP_ENABLED_MASK) { + qp->dpp_enabled = true; + *dpp_credit_lmt = (rsp->dpp_response & + OCRDMA_CREATE_QP_RSP_DPP_CREDITS_MASK) >> + OCRDMA_CREATE_QP_RSP_DPP_CREDITS_SHIFT; + *dpp_offset = (rsp->dpp_response & + OCRDMA_CREATE_QP_RSP_DPP_PAGE_OFFSET_MASK) >> + OCRDMA_CREATE_QP_RSP_DPP_PAGE_OFFSET_SHIFT; + } + max_wqe_allocated = + rsp->max_wqe_rqe >> OCRDMA_CREATE_QP_RSP_MAX_WQE_SHIFT; + max_wqe_allocated = 1 << max_wqe_allocated; + max_rqe_allocated = 1 << ((u16)rsp->max_wqe_rqe); + + qp->sq.max_cnt = max_wqe_allocated; + qp->sq.max_wqe_idx = max_wqe_allocated - 1; + + if (!attrs->srq) { + qp->rq.max_cnt = max_rqe_allocated; + qp->rq.max_wqe_idx = max_rqe_allocated - 1; + } +} + +int ocrdma_mbx_create_qp(struct ocrdma_qp *qp, struct ib_qp_init_attr *attrs, + u8 enable_dpp_cq, u16 dpp_cq_id, u16 *dpp_offset, + u16 *dpp_credit_lmt) +{ + int status = -ENOMEM; + u32 flags = 0; + struct ocrdma_dev *dev = qp->dev; + struct ocrdma_pd *pd = qp->pd; + struct pci_dev *pdev = dev->nic_info.pdev; + struct ocrdma_cq *cq; + struct ocrdma_create_qp_req *cmd; + struct ocrdma_create_qp_rsp *rsp; + int qptype; + + switch (attrs->qp_type) { + case IB_QPT_GSI: + qptype = OCRDMA_QPT_GSI; + break; + case IB_QPT_RC: + qptype = OCRDMA_QPT_RC; + break; + case IB_QPT_UD: + qptype = OCRDMA_QPT_UD; + break; + default: + return -EINVAL; + } + + cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_CREATE_QP, sizeof(*cmd)); + if (!cmd) + return status; + cmd->type_pgsz_pdn |= (qptype << OCRDMA_CREATE_QP_REQ_QPT_SHIFT) & + OCRDMA_CREATE_QP_REQ_QPT_MASK; + status = ocrdma_set_create_qp_sq_cmd(cmd, attrs, qp); + if (status) + goto sq_err; + + if (attrs->srq) { + struct ocrdma_srq *srq = get_ocrdma_srq(attrs->srq); + cmd->max_sge_recv_flags |= OCRDMA_CREATE_QP_REQ_USE_SRQ_MASK; + cmd->rq_addr[0].lo = srq->id; + qp->srq = srq; + } else { + status = ocrdma_set_create_qp_rq_cmd(cmd, attrs, qp); + if (status) + goto rq_err; + } + + status = ocrdma_set_create_qp_ird_cmd(cmd, qp); + if (status) + goto mbx_err; + + cmd->type_pgsz_pdn |= (pd->id << OCRDMA_CREATE_QP_REQ_PD_ID_SHIFT) & + OCRDMA_CREATE_QP_REQ_PD_ID_MASK; + + flags = ocrdma_set_create_qp_mbx_access_flags(qp); + + cmd->max_sge_recv_flags |= flags; + cmd->max_ord_ird |= (dev->attr.max_ord_per_qp << + OCRDMA_CREATE_QP_REQ_MAX_ORD_SHIFT) & + OCRDMA_CREATE_QP_REQ_MAX_ORD_MASK; + cmd->max_ord_ird |= (dev->attr.max_ird_per_qp << + OCRDMA_CREATE_QP_REQ_MAX_IRD_SHIFT) & + OCRDMA_CREATE_QP_REQ_MAX_IRD_MASK; + cq = get_ocrdma_cq(attrs->send_cq); + cmd->wq_rq_cqid |= (cq->id << OCRDMA_CREATE_QP_REQ_WQ_CQID_SHIFT) & + OCRDMA_CREATE_QP_REQ_WQ_CQID_MASK; + qp->sq_cq = cq; + cq = get_ocrdma_cq(attrs->recv_cq); + cmd->wq_rq_cqid |= (cq->id << OCRDMA_CREATE_QP_REQ_RQ_CQID_SHIFT) & + OCRDMA_CREATE_QP_REQ_RQ_CQID_MASK; + qp->rq_cq = cq; + + if (pd->dpp_enabled && attrs->cap.max_inline_data && pd->num_dpp_qp && + (attrs->cap.max_inline_data <= dev->attr.max_inline_data)) { + ocrdma_set_create_qp_dpp_cmd(cmd, pd, qp, enable_dpp_cq, + dpp_cq_id); + } + + status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd); + if (status) + goto mbx_err; + rsp = (struct ocrdma_create_qp_rsp *)cmd; + ocrdma_get_create_qp_rsp(rsp, qp, attrs, dpp_offset, dpp_credit_lmt); + qp->state = OCRDMA_QPS_RST; + kfree(cmd); + return 0; +mbx_err: + if (qp->rq.va) + dma_free_coherent(&pdev->dev, qp->rq.len, qp->rq.va, qp->rq.pa); +rq_err: + pr_err("%s(%d) rq_err\n", __func__, dev->id); + dma_free_coherent(&pdev->dev, qp->sq.len, qp->sq.va, qp->sq.pa); +sq_err: + pr_err("%s(%d) sq_err\n", __func__, dev->id); + kfree(cmd); + return status; +} + +int ocrdma_mbx_query_qp(struct ocrdma_dev *dev, struct ocrdma_qp *qp, + struct ocrdma_qp_params *param) +{ + int status = -ENOMEM; + struct ocrdma_query_qp *cmd; + struct ocrdma_query_qp_rsp *rsp; + + cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_QUERY_QP, sizeof(*cmd)); + if (!cmd) + return status; + cmd->qp_id = qp->id; + status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd); + if (status) + goto mbx_err; + rsp = (struct ocrdma_query_qp_rsp *)cmd; + memcpy(param, &rsp->params, sizeof(struct ocrdma_qp_params)); +mbx_err: + kfree(cmd); + return status; +} + +static int ocrdma_set_av_params(struct ocrdma_qp *qp, + struct ocrdma_modify_qp *cmd, + struct ib_qp_attr *attrs, + int attr_mask) +{ + int status; + struct ib_ah_attr *ah_attr = &attrs->ah_attr; + union ib_gid sgid, zgid; + u32 vlan_id; + u8 mac_addr[6]; + + if ((ah_attr->ah_flags & IB_AH_GRH) == 0) + return -EINVAL; + if (atomic_cmpxchg(&qp->dev->update_sl, 1, 0)) + ocrdma_init_service_level(qp->dev); + cmd->params.tclass_sq_psn |= + (ah_attr->grh.traffic_class << OCRDMA_QP_PARAMS_TCLASS_SHIFT); + cmd->params.rnt_rc_sl_fl |= + (ah_attr->grh.flow_label & OCRDMA_QP_PARAMS_FLOW_LABEL_MASK); + cmd->params.rnt_rc_sl_fl |= (ah_attr->sl << OCRDMA_QP_PARAMS_SL_SHIFT); + cmd->params.hop_lmt_rq_psn |= + (ah_attr->grh.hop_limit << OCRDMA_QP_PARAMS_HOP_LMT_SHIFT); + cmd->flags |= OCRDMA_QP_PARA_FLOW_LBL_VALID; + memcpy(&cmd->params.dgid[0], &ah_attr->grh.dgid.raw[0], + sizeof(cmd->params.dgid)); + status = ocrdma_query_gid(&qp->dev->ibdev, 1, + ah_attr->grh.sgid_index, &sgid); + if (status) + return status; + + memset(&zgid, 0, sizeof(zgid)); + if (!memcmp(&sgid, &zgid, sizeof(zgid))) + return -EINVAL; + + qp->sgid_idx = ah_attr->grh.sgid_index; + memcpy(&cmd->params.sgid[0], &sgid.raw[0], sizeof(cmd->params.sgid)); + ocrdma_resolve_dmac(qp->dev, ah_attr, &mac_addr[0]); + cmd->params.dmac_b0_to_b3 = mac_addr[0] | (mac_addr[1] << 8) | + (mac_addr[2] << 16) | (mac_addr[3] << 24); + /* convert them to LE format. */ + ocrdma_cpu_to_le32(&cmd->params.dgid[0], sizeof(cmd->params.dgid)); + ocrdma_cpu_to_le32(&cmd->params.sgid[0], sizeof(cmd->params.sgid)); + cmd->params.vlan_dmac_b4_to_b5 = mac_addr[4] | (mac_addr[5] << 8); + if (attr_mask & IB_QP_VID) { + vlan_id = attrs->vlan_id; + cmd->params.vlan_dmac_b4_to_b5 |= + vlan_id << OCRDMA_QP_PARAMS_VLAN_SHIFT; + cmd->flags |= OCRDMA_QP_PARA_VLAN_EN_VALID; + cmd->params.rnt_rc_sl_fl |= + (qp->dev->sl & 0x07) << OCRDMA_QP_PARAMS_SL_SHIFT; + } + return 0; +} + +static int ocrdma_set_qp_params(struct ocrdma_qp *qp, + struct ocrdma_modify_qp *cmd, + struct ib_qp_attr *attrs, int attr_mask) +{ + int status = 0; + + if (attr_mask & IB_QP_PKEY_INDEX) { + cmd->params.path_mtu_pkey_indx |= (attrs->pkey_index & + OCRDMA_QP_PARAMS_PKEY_INDEX_MASK); + cmd->flags |= OCRDMA_QP_PARA_PKEY_VALID; + } + if (attr_mask & IB_QP_QKEY) { + qp->qkey = attrs->qkey; + cmd->params.qkey = attrs->qkey; + cmd->flags |= OCRDMA_QP_PARA_QKEY_VALID; + } + if (attr_mask & IB_QP_AV) { + status = ocrdma_set_av_params(qp, cmd, attrs, attr_mask); + if (status) + return status; + } else if (qp->qp_type == IB_QPT_GSI || qp->qp_type == IB_QPT_UD) { + /* set the default mac address for UD, GSI QPs */ + cmd->params.dmac_b0_to_b3 = qp->dev->nic_info.mac_addr[0] | + (qp->dev->nic_info.mac_addr[1] << 8) | + (qp->dev->nic_info.mac_addr[2] << 16) | + (qp->dev->nic_info.mac_addr[3] << 24); + cmd->params.vlan_dmac_b4_to_b5 = qp->dev->nic_info.mac_addr[4] | + (qp->dev->nic_info.mac_addr[5] << 8); + } + if ((attr_mask & IB_QP_EN_SQD_ASYNC_NOTIFY) && + attrs->en_sqd_async_notify) { + cmd->params.max_sge_recv_flags |= + OCRDMA_QP_PARAMS_FLAGS_SQD_ASYNC; + cmd->flags |= OCRDMA_QP_PARA_DST_QPN_VALID; + } + if (attr_mask & IB_QP_DEST_QPN) { + cmd->params.ack_to_rnr_rtc_dest_qpn |= (attrs->dest_qp_num & + OCRDMA_QP_PARAMS_DEST_QPN_MASK); + cmd->flags |= OCRDMA_QP_PARA_DST_QPN_VALID; + } + if (attr_mask & IB_QP_PATH_MTU) { + if (attrs->path_mtu < IB_MTU_256 || + attrs->path_mtu > IB_MTU_4096) { + status = -EINVAL; + goto pmtu_err; + } + cmd->params.path_mtu_pkey_indx |= + (ib_mtu_enum_to_int(attrs->path_mtu) << + OCRDMA_QP_PARAMS_PATH_MTU_SHIFT) & + OCRDMA_QP_PARAMS_PATH_MTU_MASK; + cmd->flags |= OCRDMA_QP_PARA_PMTU_VALID; + } + if (attr_mask & IB_QP_TIMEOUT) { + cmd->params.ack_to_rnr_rtc_dest_qpn |= attrs->timeout << + OCRDMA_QP_PARAMS_ACK_TIMEOUT_SHIFT; + cmd->flags |= OCRDMA_QP_PARA_ACK_TO_VALID; + } + if (attr_mask & IB_QP_RETRY_CNT) { + cmd->params.rnt_rc_sl_fl |= (attrs->retry_cnt << + OCRDMA_QP_PARAMS_RETRY_CNT_SHIFT) & + OCRDMA_QP_PARAMS_RETRY_CNT_MASK; + cmd->flags |= OCRDMA_QP_PARA_RETRY_CNT_VALID; + } + if (attr_mask & IB_QP_MIN_RNR_TIMER) { + cmd->params.rnt_rc_sl_fl |= (attrs->min_rnr_timer << + OCRDMA_QP_PARAMS_RNR_NAK_TIMER_SHIFT) & + OCRDMA_QP_PARAMS_RNR_NAK_TIMER_MASK; + cmd->flags |= OCRDMA_QP_PARA_RNT_VALID; + } + if (attr_mask & IB_QP_RNR_RETRY) { + cmd->params.ack_to_rnr_rtc_dest_qpn |= (attrs->rnr_retry << + OCRDMA_QP_PARAMS_RNR_RETRY_CNT_SHIFT) + & OCRDMA_QP_PARAMS_RNR_RETRY_CNT_MASK; + cmd->flags |= OCRDMA_QP_PARA_RRC_VALID; + } + if (attr_mask & IB_QP_SQ_PSN) { + cmd->params.tclass_sq_psn |= (attrs->sq_psn & 0x00ffffff); + cmd->flags |= OCRDMA_QP_PARA_SQPSN_VALID; + } + if (attr_mask & IB_QP_RQ_PSN) { + cmd->params.hop_lmt_rq_psn |= (attrs->rq_psn & 0x00ffffff); + cmd->flags |= OCRDMA_QP_PARA_RQPSN_VALID; + } + if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC) { + if (attrs->max_rd_atomic > qp->dev->attr.max_ord_per_qp) { + status = -EINVAL; + goto pmtu_err; + } + qp->max_ord = attrs->max_rd_atomic; + cmd->flags |= OCRDMA_QP_PARA_MAX_ORD_VALID; + } + if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) { + if (attrs->max_dest_rd_atomic > qp->dev->attr.max_ird_per_qp) { + status = -EINVAL; + goto pmtu_err; + } + qp->max_ird = attrs->max_dest_rd_atomic; + cmd->flags |= OCRDMA_QP_PARA_MAX_IRD_VALID; + } + cmd->params.max_ord_ird = (qp->max_ord << + OCRDMA_QP_PARAMS_MAX_ORD_SHIFT) | + (qp->max_ird & OCRDMA_QP_PARAMS_MAX_IRD_MASK); +pmtu_err: + return status; +} + +int ocrdma_mbx_modify_qp(struct ocrdma_dev *dev, struct ocrdma_qp *qp, + struct ib_qp_attr *attrs, int attr_mask) +{ + int status = -ENOMEM; + struct ocrdma_modify_qp *cmd; + + cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_MODIFY_QP, sizeof(*cmd)); + if (!cmd) + return status; + + cmd->params.id = qp->id; + cmd->flags = 0; + if (attr_mask & IB_QP_STATE) { + cmd->params.max_sge_recv_flags |= + (get_ocrdma_qp_state(attrs->qp_state) << + OCRDMA_QP_PARAMS_STATE_SHIFT) & + OCRDMA_QP_PARAMS_STATE_MASK; + cmd->flags |= OCRDMA_QP_PARA_QPS_VALID; + } else { + cmd->params.max_sge_recv_flags |= + (qp->state << OCRDMA_QP_PARAMS_STATE_SHIFT) & + OCRDMA_QP_PARAMS_STATE_MASK; + } + + status = ocrdma_set_qp_params(qp, cmd, attrs, attr_mask); + if (status) + goto mbx_err; + status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd); + if (status) + goto mbx_err; + +mbx_err: + kfree(cmd); + return status; +} + +int ocrdma_mbx_destroy_qp(struct ocrdma_dev *dev, struct ocrdma_qp *qp) +{ + int status = -ENOMEM; + struct ocrdma_destroy_qp *cmd; + struct pci_dev *pdev = dev->nic_info.pdev; + + cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_DELETE_QP, sizeof(*cmd)); + if (!cmd) + return status; + cmd->qp_id = qp->id; + status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd); + if (status) + goto mbx_err; + +mbx_err: + kfree(cmd); + if (qp->sq.va) + dma_free_coherent(&pdev->dev, qp->sq.len, qp->sq.va, qp->sq.pa); + if (!qp->srq && qp->rq.va) + dma_free_coherent(&pdev->dev, qp->rq.len, qp->rq.va, qp->rq.pa); + if (qp->dpp_enabled) + qp->pd->num_dpp_qp++; + return status; +} + +int ocrdma_mbx_create_srq(struct ocrdma_dev *dev, struct ocrdma_srq *srq, + struct ib_srq_init_attr *srq_attr, + struct ocrdma_pd *pd) +{ + int status = -ENOMEM; + int hw_pages, hw_page_size; + int len; + struct ocrdma_create_srq_rsp *rsp; + struct ocrdma_create_srq *cmd; + dma_addr_t pa; + struct pci_dev *pdev = dev->nic_info.pdev; + u32 max_rqe_allocated; + + cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_CREATE_SRQ, sizeof(*cmd)); + if (!cmd) + return status; + + cmd->pgsz_pdid = pd->id & OCRDMA_CREATE_SRQ_PD_ID_MASK; + max_rqe_allocated = srq_attr->attr.max_wr + 1; + status = ocrdma_build_q_conf(&max_rqe_allocated, + dev->attr.rqe_size, + &hw_pages, &hw_page_size); + if (status) { + pr_err("%s() req. max_wr=0x%x\n", __func__, + srq_attr->attr.max_wr); + status = -EINVAL; + goto ret; + } + len = hw_pages * hw_page_size; + srq->rq.va = dma_alloc_coherent(&pdev->dev, len, &pa, GFP_KERNEL); + if (!srq->rq.va) { + status = -ENOMEM; + goto ret; + } + ocrdma_build_q_pages(&cmd->rq_addr[0], hw_pages, pa, hw_page_size); + + srq->rq.entry_size = dev->attr.rqe_size; + srq->rq.pa = pa; + srq->rq.len = len; + srq->rq.max_cnt = max_rqe_allocated; + + cmd->max_sge_rqe = ilog2(max_rqe_allocated); + cmd->max_sge_rqe |= srq_attr->attr.max_sge << + OCRDMA_CREATE_SRQ_MAX_SGE_RECV_SHIFT; + + cmd->pgsz_pdid |= (ilog2(hw_page_size / OCRDMA_MIN_Q_PAGE_SIZE) + << OCRDMA_CREATE_SRQ_PG_SZ_SHIFT); + cmd->pages_rqe_sz |= (dev->attr.rqe_size + << OCRDMA_CREATE_SRQ_RQE_SIZE_SHIFT) + & OCRDMA_CREATE_SRQ_RQE_SIZE_MASK; + cmd->pages_rqe_sz |= hw_pages << OCRDMA_CREATE_SRQ_NUM_RQ_PAGES_SHIFT; + + status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd); + if (status) + goto mbx_err; + rsp = (struct ocrdma_create_srq_rsp *)cmd; + srq->id = rsp->id; + srq->rq.dbid = rsp->id; + max_rqe_allocated = ((rsp->max_sge_rqe_allocated & + OCRDMA_CREATE_SRQ_RSP_MAX_RQE_ALLOCATED_MASK) >> + OCRDMA_CREATE_SRQ_RSP_MAX_RQE_ALLOCATED_SHIFT); + max_rqe_allocated = (1 << max_rqe_allocated); + srq->rq.max_cnt = max_rqe_allocated; + srq->rq.max_wqe_idx = max_rqe_allocated - 1; + srq->rq.max_sges = (rsp->max_sge_rqe_allocated & + OCRDMA_CREATE_SRQ_RSP_MAX_SGE_RECV_ALLOCATED_MASK) >> + OCRDMA_CREATE_SRQ_RSP_MAX_SGE_RECV_ALLOCATED_SHIFT; + goto ret; +mbx_err: + dma_free_coherent(&pdev->dev, srq->rq.len, srq->rq.va, pa); +ret: + kfree(cmd); + return status; +} + +int ocrdma_mbx_modify_srq(struct ocrdma_srq *srq, struct ib_srq_attr *srq_attr) +{ + int status = -ENOMEM; + struct ocrdma_modify_srq *cmd; + struct ocrdma_pd *pd = srq->pd; + struct ocrdma_dev *dev = get_ocrdma_dev(pd->ibpd.device); + + cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_MODIFY_SRQ, sizeof(*cmd)); + if (!cmd) + return status; + cmd->id = srq->id; + cmd->limit_max_rqe |= srq_attr->srq_limit << + OCRDMA_MODIFY_SRQ_LIMIT_SHIFT; + status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd); + kfree(cmd); + return status; +} + +int ocrdma_mbx_query_srq(struct ocrdma_srq *srq, struct ib_srq_attr *srq_attr) +{ + int status = -ENOMEM; + struct ocrdma_query_srq *cmd; + struct ocrdma_dev *dev = get_ocrdma_dev(srq->ibsrq.device); + + cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_QUERY_SRQ, sizeof(*cmd)); + if (!cmd) + return status; + cmd->id = srq->rq.dbid; + status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd); + if (status == 0) { + struct ocrdma_query_srq_rsp *rsp = + (struct ocrdma_query_srq_rsp *)cmd; + srq_attr->max_sge = + rsp->srq_lmt_max_sge & + OCRDMA_QUERY_SRQ_RSP_MAX_SGE_RECV_MASK; + srq_attr->max_wr = + rsp->max_rqe_pdid >> OCRDMA_QUERY_SRQ_RSP_MAX_RQE_SHIFT; + srq_attr->srq_limit = rsp->srq_lmt_max_sge >> + OCRDMA_QUERY_SRQ_RSP_SRQ_LIMIT_SHIFT; + } + kfree(cmd); + return status; +} + +int ocrdma_mbx_destroy_srq(struct ocrdma_dev *dev, struct ocrdma_srq *srq) +{ + int status = -ENOMEM; + struct ocrdma_destroy_srq *cmd; + struct pci_dev *pdev = dev->nic_info.pdev; + cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_DELETE_SRQ, sizeof(*cmd)); + if (!cmd) + return status; + cmd->id = srq->id; + status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd); + if (srq->rq.va) + dma_free_coherent(&pdev->dev, srq->rq.len, + srq->rq.va, srq->rq.pa); + kfree(cmd); + return status; +} + +static int ocrdma_mbx_get_dcbx_config(struct ocrdma_dev *dev, u32 ptype, + struct ocrdma_dcbx_cfg *dcbxcfg) +{ + int status = 0; + dma_addr_t pa; + struct ocrdma_mqe cmd; + + struct ocrdma_get_dcbx_cfg_req *req = NULL; + struct ocrdma_get_dcbx_cfg_rsp *rsp = NULL; + struct pci_dev *pdev = dev->nic_info.pdev; + struct ocrdma_mqe_sge *mqe_sge = cmd.u.nonemb_req.sge; + + memset(&cmd, 0, sizeof(struct ocrdma_mqe)); + cmd.hdr.pyld_len = max_t (u32, sizeof(struct ocrdma_get_dcbx_cfg_rsp), + sizeof(struct ocrdma_get_dcbx_cfg_req)); + req = dma_alloc_coherent(&pdev->dev, cmd.hdr.pyld_len, &pa, GFP_KERNEL); + if (!req) { + status = -ENOMEM; + goto mem_err; + } + + cmd.hdr.spcl_sge_cnt_emb |= (1 << OCRDMA_MQE_HDR_SGE_CNT_SHIFT) & + OCRDMA_MQE_HDR_SGE_CNT_MASK; + mqe_sge->pa_lo = (u32) (pa & 0xFFFFFFFFUL); + mqe_sge->pa_hi = (u32) upper_32_bits(pa); + mqe_sge->len = cmd.hdr.pyld_len; + + memset(req, 0, sizeof(struct ocrdma_get_dcbx_cfg_req)); + ocrdma_init_mch(&req->hdr, OCRDMA_CMD_GET_DCBX_CONFIG, + OCRDMA_SUBSYS_DCBX, cmd.hdr.pyld_len); + req->param_type = ptype; + + status = ocrdma_mbx_cmd(dev, &cmd); + if (status) + goto mbx_err; + + rsp = (struct ocrdma_get_dcbx_cfg_rsp *)req; + ocrdma_le32_to_cpu(rsp, sizeof(struct ocrdma_get_dcbx_cfg_rsp)); + memcpy(dcbxcfg, &rsp->cfg, sizeof(struct ocrdma_dcbx_cfg)); + +mbx_err: + dma_free_coherent(&pdev->dev, cmd.hdr.pyld_len, req, pa); +mem_err: + return status; +} + +#define OCRDMA_MAX_SERVICE_LEVEL_INDEX 0x08 +#define OCRDMA_DEFAULT_SERVICE_LEVEL 0x05 + +static int ocrdma_parse_dcbxcfg_rsp(struct ocrdma_dev *dev, int ptype, + struct ocrdma_dcbx_cfg *dcbxcfg, + u8 *srvc_lvl) +{ + int status = -EINVAL, indx, slindx; + int ventry_cnt; + struct ocrdma_app_parameter *app_param; + u8 valid, proto_sel; + u8 app_prio, pfc_prio; + u16 proto; + + if (!(dcbxcfg->tcv_aev_opv_st & OCRDMA_DCBX_STATE_MASK)) { + pr_info("%s ocrdma%d DCBX is disabled\n", + dev_name(&dev->nic_info.pdev->dev), dev->id); + goto out; + } + + if (!ocrdma_is_enabled_and_synced(dcbxcfg->pfc_state)) { + pr_info("%s ocrdma%d priority flow control(%s) is %s%s\n", + dev_name(&dev->nic_info.pdev->dev), dev->id, + (ptype > 0 ? "operational" : "admin"), + (dcbxcfg->pfc_state & OCRDMA_STATE_FLAG_ENABLED) ? + "enabled" : "disabled", + (dcbxcfg->pfc_state & OCRDMA_STATE_FLAG_SYNC) ? + "" : ", not sync'ed"); + goto out; + } else { + pr_info("%s ocrdma%d priority flow control is enabled and sync'ed\n", + dev_name(&dev->nic_info.pdev->dev), dev->id); + } + + ventry_cnt = (dcbxcfg->tcv_aev_opv_st >> + OCRDMA_DCBX_APP_ENTRY_SHIFT) + & OCRDMA_DCBX_STATE_MASK; + + for (indx = 0; indx < ventry_cnt; indx++) { + app_param = &dcbxcfg->app_param[indx]; + valid = (app_param->valid_proto_app >> + OCRDMA_APP_PARAM_VALID_SHIFT) + & OCRDMA_APP_PARAM_VALID_MASK; + proto_sel = (app_param->valid_proto_app + >> OCRDMA_APP_PARAM_PROTO_SEL_SHIFT) + & OCRDMA_APP_PARAM_PROTO_SEL_MASK; + proto = app_param->valid_proto_app & + OCRDMA_APP_PARAM_APP_PROTO_MASK; + + if ( + valid && proto == OCRDMA_APP_PROTO_ROCE && + proto_sel == OCRDMA_PROTO_SELECT_L2) { + for (slindx = 0; slindx < + OCRDMA_MAX_SERVICE_LEVEL_INDEX; slindx++) { + app_prio = ocrdma_get_app_prio( + (u8 *)app_param->app_prio, + slindx); + pfc_prio = ocrdma_get_pfc_prio( + (u8 *)dcbxcfg->pfc_prio, + slindx); + + if (app_prio && pfc_prio) { + *srvc_lvl = slindx; + status = 0; + goto out; + } + } + if (slindx == OCRDMA_MAX_SERVICE_LEVEL_INDEX) { + pr_info("%s ocrdma%d application priority not set for 0x%x protocol\n", + dev_name(&dev->nic_info.pdev->dev), + dev->id, proto); + } + } + } + +out: + return status; +} + +void ocrdma_init_service_level(struct ocrdma_dev *dev) +{ + int status = 0, indx; + struct ocrdma_dcbx_cfg dcbxcfg; + u8 srvc_lvl = OCRDMA_DEFAULT_SERVICE_LEVEL; + int ptype = OCRDMA_PARAMETER_TYPE_OPER; + + for (indx = 0; indx < 2; indx++) { + status = ocrdma_mbx_get_dcbx_config(dev, ptype, &dcbxcfg); + if (status) { + pr_err("%s(): status=%d\n", __func__, status); + ptype = OCRDMA_PARAMETER_TYPE_ADMIN; + continue; + } + + status = ocrdma_parse_dcbxcfg_rsp(dev, ptype, + &dcbxcfg, &srvc_lvl); + if (status) { + ptype = OCRDMA_PARAMETER_TYPE_ADMIN; + continue; + } + + break; + } + + if (status) + pr_info("%s ocrdma%d service level default\n", + dev_name(&dev->nic_info.pdev->dev), dev->id); + else + pr_info("%s ocrdma%d service level %d\n", + dev_name(&dev->nic_info.pdev->dev), dev->id, + srvc_lvl); + + dev->pfc_state = ocrdma_is_enabled_and_synced(dcbxcfg.pfc_state); + dev->sl = srvc_lvl; +} + +int ocrdma_alloc_av(struct ocrdma_dev *dev, struct ocrdma_ah *ah) +{ + int i; + int status = -EINVAL; + struct ocrdma_av *av; + unsigned long flags; + + av = dev->av_tbl.va; + spin_lock_irqsave(&dev->av_tbl.lock, flags); + for (i = 0; i < dev->av_tbl.num_ah; i++) { + if (av->valid == 0) { + av->valid = OCRDMA_AV_VALID; + ah->av = av; + ah->id = i; + status = 0; + break; + } + av++; + } + if (i == dev->av_tbl.num_ah) + status = -EAGAIN; + spin_unlock_irqrestore(&dev->av_tbl.lock, flags); + return status; +} + +int ocrdma_free_av(struct ocrdma_dev *dev, struct ocrdma_ah *ah) +{ + unsigned long flags; + spin_lock_irqsave(&dev->av_tbl.lock, flags); + ah->av->valid = 0; + spin_unlock_irqrestore(&dev->av_tbl.lock, flags); + return 0; +} + +static int ocrdma_create_eqs(struct ocrdma_dev *dev) +{ + int num_eq, i, status = 0; + int irq; + unsigned long flags = 0; + + num_eq = dev->nic_info.msix.num_vectors - + dev->nic_info.msix.start_vector; + if (dev->nic_info.intr_mode == BE_INTERRUPT_MODE_INTX) { + num_eq = 1; + flags = IRQF_SHARED; + } else { + num_eq = min_t(u32, num_eq, num_online_cpus()); + } + + if (!num_eq) + return -EINVAL; + + dev->eq_tbl = kzalloc(sizeof(struct ocrdma_eq) * num_eq, GFP_KERNEL); + if (!dev->eq_tbl) + return -ENOMEM; + + for (i = 0; i < num_eq; i++) { + status = ocrdma_create_eq(dev, &dev->eq_tbl[i], + OCRDMA_EQ_LEN); + if (status) { + status = -EINVAL; + break; + } + sprintf(dev->eq_tbl[i].irq_name, "ocrdma%d-%d", + dev->id, i); + irq = ocrdma_get_irq(dev, &dev->eq_tbl[i]); + status = request_irq(irq, ocrdma_irq_handler, flags, + dev->eq_tbl[i].irq_name, + &dev->eq_tbl[i]); + if (status) + goto done; + dev->eq_cnt += 1; + } + /* one eq is sufficient for data path to work */ + return 0; +done: + ocrdma_destroy_eqs(dev); + return status; +} + +int ocrdma_init_hw(struct ocrdma_dev *dev) +{ + int status; + + /* create the eqs */ + status = ocrdma_create_eqs(dev); + if (status) + goto qpeq_err; + status = ocrdma_create_mq(dev); + if (status) + goto mq_err; + status = ocrdma_mbx_query_fw_config(dev); + if (status) + goto conf_err; + status = ocrdma_mbx_query_dev(dev); + if (status) + goto conf_err; + status = ocrdma_mbx_query_fw_ver(dev); + if (status) + goto conf_err; + status = ocrdma_mbx_create_ah_tbl(dev); + if (status) + goto conf_err; + status = ocrdma_mbx_get_phy_info(dev); + if (status) + goto info_attrb_err; + status = ocrdma_mbx_get_ctrl_attribs(dev); + if (status) + goto info_attrb_err; + + return 0; + +info_attrb_err: + ocrdma_mbx_delete_ah_tbl(dev); +conf_err: + ocrdma_destroy_mq(dev); +mq_err: + ocrdma_destroy_eqs(dev); +qpeq_err: + pr_err("%s() status=%d\n", __func__, status); + return status; +} + +void ocrdma_cleanup_hw(struct ocrdma_dev *dev) +{ + ocrdma_mbx_delete_ah_tbl(dev); + + /* cleanup the eqs */ + ocrdma_destroy_eqs(dev); + + /* cleanup the control path */ + ocrdma_destroy_mq(dev); +} diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_hw.h b/drivers/infiniband/hw/ocrdma/ocrdma_hw.h new file mode 100644 index 0000000..6eed8f1 --- /dev/null +++ b/drivers/infiniband/hw/ocrdma/ocrdma_hw.h @@ -0,0 +1,140 @@ +/******************************************************************* + * This file is part of the Emulex RoCE Device Driver for * + * RoCE (RDMA over Converged Ethernet) CNA Adapters. * + * Copyright (C) 2008-2012 Emulex. All rights reserved. * + * EMULEX and SLI are trademarks of Emulex. * + * www.emulex.com * + * * + * This program is free software; you can redistribute it and/or * + * modify it under the terms of version 2 of the GNU General * + * Public License as published by the Free Software Foundation. * + * This program is distributed in the hope that it will be useful. * + * ALL EXPRESS OR IMPLIED CONDITIONS, REPRESENTATIONS AND * + * WARRANTIES, INCLUDING ANY IMPLIED WARRANTY OF MERCHANTABILITY, * + * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT, ARE * + * DISCLAIMED, EXCEPT TO THE EXTENT THAT SUCH DISCLAIMERS ARE HELD * + * TO BE LEGALLY INVALID. See the GNU General Public License for * + * more details, a copy of which can be found in the file COPYING * + * included with this package. * + * + * Contact Information: + * linux-drivers@emulex.com + * + * Emulex + * 3333 Susan Street + * Costa Mesa, CA 92626 + *******************************************************************/ + +#ifndef __OCRDMA_HW_H__ +#define __OCRDMA_HW_H__ + +#include "ocrdma_sli.h" + +static inline void ocrdma_cpu_to_le32(void *dst, u32 len) +{ +#ifdef __BIG_ENDIAN + int i = 0; + u32 *src_ptr = dst; + u32 *dst_ptr = dst; + for (; i < (len / 4); i++) + *(dst_ptr + i) = cpu_to_le32p(src_ptr + i); +#endif +} + +static inline void ocrdma_le32_to_cpu(void *dst, u32 len) +{ +#ifdef __BIG_ENDIAN + int i = 0; + u32 *src_ptr = dst; + u32 *dst_ptr = dst; + for (; i < (len / sizeof(u32)); i++) + *(dst_ptr + i) = le32_to_cpu(*(src_ptr + i)); +#endif +} + +static inline void ocrdma_copy_cpu_to_le32(void *dst, void *src, u32 len) +{ +#ifdef __BIG_ENDIAN + int i = 0; + u32 *src_ptr = src; + u32 *dst_ptr = dst; + for (; i < (len / sizeof(u32)); i++) + *(dst_ptr + i) = cpu_to_le32p(src_ptr + i); +#else + memcpy(dst, src, len); +#endif +} + +static inline void ocrdma_copy_le32_to_cpu(void *dst, void *src, u32 len) +{ +#ifdef __BIG_ENDIAN + int i = 0; + u32 *src_ptr = src; + u32 *dst_ptr = dst; + for (; i < len / sizeof(u32); i++) + *(dst_ptr + i) = le32_to_cpu(*(src_ptr + i)); +#else + memcpy(dst, src, len); +#endif +} + +static inline u64 ocrdma_get_db_addr(struct ocrdma_dev *dev, u32 pdid) +{ + return dev->nic_info.unmapped_db + (pdid * dev->nic_info.db_page_size); +} + +int ocrdma_init_hw(struct ocrdma_dev *); +void ocrdma_cleanup_hw(struct ocrdma_dev *); + +enum ib_qp_state get_ibqp_state(enum ocrdma_qp_state qps); +void ocrdma_ring_cq_db(struct ocrdma_dev *, u16 cq_id, bool armed, + bool solicited, u16 cqe_popped); + +/* verbs specific mailbox commands */ +int ocrdma_mbx_get_link_speed(struct ocrdma_dev *dev, u8 *lnk_speed); +int ocrdma_query_config(struct ocrdma_dev *, + struct ocrdma_mbx_query_config *config); + +int ocrdma_mbx_alloc_pd(struct ocrdma_dev *, struct ocrdma_pd *); +int ocrdma_mbx_dealloc_pd(struct ocrdma_dev *, struct ocrdma_pd *); + +int ocrdma_mbx_alloc_lkey(struct ocrdma_dev *, struct ocrdma_hw_mr *hwmr, + u32 pd_id, int addr_check); +int ocrdma_mbx_dealloc_lkey(struct ocrdma_dev *, int fmr, u32 lkey); + +int ocrdma_reg_mr(struct ocrdma_dev *, struct ocrdma_hw_mr *hwmr, + u32 pd_id, int acc); +int ocrdma_mbx_create_cq(struct ocrdma_dev *, struct ocrdma_cq *, + int entries, int dpp_cq, u16 pd_id); +int ocrdma_mbx_destroy_cq(struct ocrdma_dev *, struct ocrdma_cq *); + +int ocrdma_mbx_create_qp(struct ocrdma_qp *, struct ib_qp_init_attr *attrs, + u8 enable_dpp_cq, u16 dpp_cq_id, u16 *dpp_offset, + u16 *dpp_credit_lmt); +int ocrdma_mbx_modify_qp(struct ocrdma_dev *, struct ocrdma_qp *, + struct ib_qp_attr *attrs, int attr_mask); +int ocrdma_mbx_query_qp(struct ocrdma_dev *, struct ocrdma_qp *, + struct ocrdma_qp_params *param); +int ocrdma_mbx_destroy_qp(struct ocrdma_dev *, struct ocrdma_qp *); +int ocrdma_mbx_create_srq(struct ocrdma_dev *, struct ocrdma_srq *, + struct ib_srq_init_attr *, + struct ocrdma_pd *); +int ocrdma_mbx_modify_srq(struct ocrdma_srq *, struct ib_srq_attr *); +int ocrdma_mbx_query_srq(struct ocrdma_srq *, struct ib_srq_attr *); +int ocrdma_mbx_destroy_srq(struct ocrdma_dev *, struct ocrdma_srq *); + +int ocrdma_alloc_av(struct ocrdma_dev *, struct ocrdma_ah *); +int ocrdma_free_av(struct ocrdma_dev *, struct ocrdma_ah *); + +int ocrdma_qp_state_change(struct ocrdma_qp *, enum ib_qp_state new_state, + enum ib_qp_state *old_ib_state); +bool ocrdma_is_qp_in_sq_flushlist(struct ocrdma_cq *, struct ocrdma_qp *); +bool ocrdma_is_qp_in_rq_flushlist(struct ocrdma_cq *, struct ocrdma_qp *); +void ocrdma_flush_qp(struct ocrdma_qp *); +int ocrdma_get_irq(struct ocrdma_dev *dev, struct ocrdma_eq *eq); + +int ocrdma_mbx_rdma_stats(struct ocrdma_dev *, bool reset); +char *port_speed_string(struct ocrdma_dev *dev); +void ocrdma_init_service_level(struct ocrdma_dev *); + +#endif /* __OCRDMA_HW_H__ */ diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_main.c b/drivers/infiniband/hw/ocrdma/ocrdma_main.c new file mode 100644 index 0000000..b0b2257 --- /dev/null +++ b/drivers/infiniband/hw/ocrdma/ocrdma_main.c @@ -0,0 +1,676 @@ +/******************************************************************* + * This file is part of the Emulex RoCE Device Driver for * + * RoCE (RDMA over Converged Ethernet) adapters. * + * Copyright (C) 2008-2012 Emulex. All rights reserved. * + * EMULEX and SLI are trademarks of Emulex. * + * www.emulex.com * + * * + * This program is free software; you can redistribute it and/or * + * modify it under the terms of version 2 of the GNU General * + * Public License as published by the Free Software Foundation. * + * This program is distributed in the hope that it will be useful. * + * ALL EXPRESS OR IMPLIED CONDITIONS, REPRESENTATIONS AND * + * WARRANTIES, INCLUDING ANY IMPLIED WARRANTY OF MERCHANTABILITY, * + * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT, ARE * + * DISCLAIMED, EXCEPT TO THE EXTENT THAT SUCH DISCLAIMERS ARE HELD * + * TO BE LEGALLY INVALID. See the GNU General Public License for * + * more details, a copy of which can be found in the file COPYING * + * included with this package. * + * + * Contact Information: + * linux-drivers@emulex.com + * + * Emulex + * 3333 Susan Street + * Costa Mesa, CA 92626 + *******************************************************************/ + +#include +#include +#include +#include +#include + +#include +#include + +#include "ocrdma.h" +#include "ocrdma_verbs.h" +#include "ocrdma_ah.h" +#include "be_roce.h" +#include "ocrdma_hw.h" +#include "ocrdma_stats.h" +#include "ocrdma_abi.h" + +MODULE_VERSION(OCRDMA_ROCE_DRV_VERSION); +MODULE_DESCRIPTION(OCRDMA_ROCE_DRV_DESC " " OCRDMA_ROCE_DRV_VERSION); +MODULE_AUTHOR("Emulex Corporation"); +MODULE_LICENSE("GPL"); + +static LIST_HEAD(ocrdma_dev_list); +static DEFINE_SPINLOCK(ocrdma_devlist_lock); +static DEFINE_IDR(ocrdma_dev_id); + +static union ib_gid ocrdma_zero_sgid; + +void ocrdma_get_guid(struct ocrdma_dev *dev, u8 *guid) +{ + u8 mac_addr[6]; + + memcpy(&mac_addr[0], &dev->nic_info.mac_addr[0], ETH_ALEN); + guid[0] = mac_addr[0] ^ 2; + guid[1] = mac_addr[1]; + guid[2] = mac_addr[2]; + guid[3] = 0xff; + guid[4] = 0xfe; + guid[5] = mac_addr[3]; + guid[6] = mac_addr[4]; + guid[7] = mac_addr[5]; +} + +static bool ocrdma_add_sgid(struct ocrdma_dev *dev, union ib_gid *new_sgid) +{ + int i; + unsigned long flags; + + memset(&ocrdma_zero_sgid, 0, sizeof(union ib_gid)); + + + spin_lock_irqsave(&dev->sgid_lock, flags); + for (i = 0; i < OCRDMA_MAX_SGID; i++) { + if (!memcmp(&dev->sgid_tbl[i], &ocrdma_zero_sgid, + sizeof(union ib_gid))) { + /* found free entry */ + memcpy(&dev->sgid_tbl[i], new_sgid, + sizeof(union ib_gid)); + spin_unlock_irqrestore(&dev->sgid_lock, flags); + return true; + } else if (!memcmp(&dev->sgid_tbl[i], new_sgid, + sizeof(union ib_gid))) { + /* entry already present, no addition is required. */ + spin_unlock_irqrestore(&dev->sgid_lock, flags); + return false; + } + } + spin_unlock_irqrestore(&dev->sgid_lock, flags); + return false; +} + +static bool ocrdma_del_sgid(struct ocrdma_dev *dev, union ib_gid *sgid) +{ + int found = false; + int i; + unsigned long flags; + + + spin_lock_irqsave(&dev->sgid_lock, flags); + /* first is default sgid, which cannot be deleted. */ + for (i = 1; i < OCRDMA_MAX_SGID; i++) { + if (!memcmp(&dev->sgid_tbl[i], sgid, sizeof(union ib_gid))) { + /* found matching entry */ + memset(&dev->sgid_tbl[i], 0, sizeof(union ib_gid)); + found = true; + break; + } + } + spin_unlock_irqrestore(&dev->sgid_lock, flags); + return found; +} + +static int ocrdma_addr_event(unsigned long event, struct net_device *netdev, + union ib_gid *gid) +{ + struct ib_event gid_event; + struct ocrdma_dev *dev; + bool found = false; + bool updated = false; + bool is_vlan = false; + + is_vlan = netdev->priv_flags & IFF_802_1Q_VLAN; + if (is_vlan) + netdev = rdma_vlan_dev_real_dev(netdev); + + rcu_read_lock(); + list_for_each_entry_rcu(dev, &ocrdma_dev_list, entry) { + if (dev->nic_info.netdev == netdev) { + found = true; + break; + } + } + rcu_read_unlock(); + + if (!found) + return NOTIFY_DONE; + + mutex_lock(&dev->dev_lock); + switch (event) { + case NETDEV_UP: + updated = ocrdma_add_sgid(dev, gid); + break; + case NETDEV_DOWN: + updated = ocrdma_del_sgid(dev, gid); + break; + default: + break; + } + if (updated) { + /* GID table updated, notify the consumers about it */ + gid_event.device = &dev->ibdev; + gid_event.element.port_num = 1; + gid_event.event = IB_EVENT_GID_CHANGE; + ib_dispatch_event(&gid_event); + } + mutex_unlock(&dev->dev_lock); + return NOTIFY_OK; +} + +static int ocrdma_inetaddr_event(struct notifier_block *notifier, + unsigned long event, void *ptr) +{ + struct in_ifaddr *ifa = ptr; + union ib_gid gid; + struct net_device *netdev = ifa->ifa_dev->dev; + + ipv6_addr_set_v4mapped(ifa->ifa_address, (struct in6_addr *)&gid); + return ocrdma_addr_event(event, netdev, &gid); +} + +static struct notifier_block ocrdma_inetaddr_notifier = { + .notifier_call = ocrdma_inetaddr_event +}; + +#if IS_ENABLED(CONFIG_IPV6) + +static int ocrdma_inet6addr_event(struct notifier_block *notifier, + unsigned long event, void *ptr) +{ + struct inet6_ifaddr *ifa = (struct inet6_ifaddr *)ptr; + union ib_gid *gid = (union ib_gid *)&ifa->addr; + struct net_device *netdev = ifa->idev->dev; + return ocrdma_addr_event(event, netdev, gid); +} + +static struct notifier_block ocrdma_inet6addr_notifier = { + .notifier_call = ocrdma_inet6addr_event +}; + +#endif /* IPV6 and VLAN */ + +static enum rdma_link_layer ocrdma_link_layer(struct ib_device *device, + u8 port_num) +{ + return IB_LINK_LAYER_ETHERNET; +} + +static int ocrdma_register_device(struct ocrdma_dev *dev) +{ + strlcpy(dev->ibdev.name, "ocrdma%d", IB_DEVICE_NAME_MAX); + ocrdma_get_guid(dev, (u8 *)&dev->ibdev.node_guid); + memcpy(dev->ibdev.node_desc, OCRDMA_NODE_DESC, + sizeof(OCRDMA_NODE_DESC)); + dev->ibdev.owner = THIS_MODULE; + dev->ibdev.uverbs_abi_ver = OCRDMA_ABI_VERSION; + dev->ibdev.uverbs_cmd_mask = + OCRDMA_UVERBS(GET_CONTEXT) | + OCRDMA_UVERBS(QUERY_DEVICE) | + OCRDMA_UVERBS(QUERY_PORT) | + OCRDMA_UVERBS(ALLOC_PD) | + OCRDMA_UVERBS(DEALLOC_PD) | + OCRDMA_UVERBS(REG_MR) | + OCRDMA_UVERBS(DEREG_MR) | + OCRDMA_UVERBS(CREATE_COMP_CHANNEL) | + OCRDMA_UVERBS(CREATE_CQ) | + OCRDMA_UVERBS(RESIZE_CQ) | + OCRDMA_UVERBS(DESTROY_CQ) | + OCRDMA_UVERBS(REQ_NOTIFY_CQ) | + OCRDMA_UVERBS(CREATE_QP) | + OCRDMA_UVERBS(MODIFY_QP) | + OCRDMA_UVERBS(QUERY_QP) | + OCRDMA_UVERBS(DESTROY_QP) | + OCRDMA_UVERBS(POLL_CQ) | + OCRDMA_UVERBS(POST_SEND) | + OCRDMA_UVERBS(POST_RECV); + + dev->ibdev.uverbs_cmd_mask |= + OCRDMA_UVERBS(CREATE_AH) | + OCRDMA_UVERBS(MODIFY_AH) | + OCRDMA_UVERBS(QUERY_AH) | + OCRDMA_UVERBS(DESTROY_AH); + + dev->ibdev.node_type = RDMA_NODE_IB_CA; + dev->ibdev.phys_port_cnt = 1; + dev->ibdev.num_comp_vectors = 1; + + /* mandatory verbs. */ + dev->ibdev.query_device = ocrdma_query_device; + dev->ibdev.query_port = ocrdma_query_port; + dev->ibdev.modify_port = ocrdma_modify_port; + dev->ibdev.query_gid = ocrdma_query_gid; + dev->ibdev.get_link_layer = ocrdma_link_layer; + dev->ibdev.alloc_pd = ocrdma_alloc_pd; + dev->ibdev.dealloc_pd = ocrdma_dealloc_pd; + + dev->ibdev.create_cq = ocrdma_create_cq; + dev->ibdev.destroy_cq = ocrdma_destroy_cq; + dev->ibdev.resize_cq = ocrdma_resize_cq; + + dev->ibdev.create_qp = ocrdma_create_qp; + dev->ibdev.modify_qp = ocrdma_modify_qp; + dev->ibdev.query_qp = ocrdma_query_qp; + dev->ibdev.destroy_qp = ocrdma_destroy_qp; + + dev->ibdev.query_pkey = ocrdma_query_pkey; + dev->ibdev.create_ah = ocrdma_create_ah; + dev->ibdev.destroy_ah = ocrdma_destroy_ah; + dev->ibdev.query_ah = ocrdma_query_ah; + dev->ibdev.modify_ah = ocrdma_modify_ah; + + dev->ibdev.poll_cq = ocrdma_poll_cq; + dev->ibdev.post_send = ocrdma_post_send; + dev->ibdev.post_recv = ocrdma_post_recv; + dev->ibdev.req_notify_cq = ocrdma_arm_cq; + + dev->ibdev.get_dma_mr = ocrdma_get_dma_mr; + dev->ibdev.reg_phys_mr = ocrdma_reg_kernel_mr; + dev->ibdev.dereg_mr = ocrdma_dereg_mr; + dev->ibdev.reg_user_mr = ocrdma_reg_user_mr; + + dev->ibdev.alloc_fast_reg_mr = ocrdma_alloc_frmr; + dev->ibdev.alloc_fast_reg_page_list = ocrdma_alloc_frmr_page_list; + dev->ibdev.free_fast_reg_page_list = ocrdma_free_frmr_page_list; + + /* mandatory to support user space verbs consumer. */ + dev->ibdev.alloc_ucontext = ocrdma_alloc_ucontext; + dev->ibdev.dealloc_ucontext = ocrdma_dealloc_ucontext; + dev->ibdev.mmap = ocrdma_mmap; + dev->ibdev.dma_device = &dev->nic_info.pdev->dev; + + dev->ibdev.process_mad = ocrdma_process_mad; + + if (ocrdma_get_asic_type(dev) == OCRDMA_ASIC_GEN_SKH_R) { + dev->ibdev.uverbs_cmd_mask |= + OCRDMA_UVERBS(CREATE_SRQ) | + OCRDMA_UVERBS(MODIFY_SRQ) | + OCRDMA_UVERBS(QUERY_SRQ) | + OCRDMA_UVERBS(DESTROY_SRQ) | + OCRDMA_UVERBS(POST_SRQ_RECV); + + dev->ibdev.create_srq = ocrdma_create_srq; + dev->ibdev.modify_srq = ocrdma_modify_srq; + dev->ibdev.query_srq = ocrdma_query_srq; + dev->ibdev.destroy_srq = ocrdma_destroy_srq; + dev->ibdev.post_srq_recv = ocrdma_post_srq_recv; + } + return ib_register_device(&dev->ibdev, NULL); +} + +static int ocrdma_alloc_resources(struct ocrdma_dev *dev) +{ + mutex_init(&dev->dev_lock); + dev->sgid_tbl = kzalloc(sizeof(union ib_gid) * + OCRDMA_MAX_SGID, GFP_KERNEL); + if (!dev->sgid_tbl) + goto alloc_err; + spin_lock_init(&dev->sgid_lock); + + dev->cq_tbl = kzalloc(sizeof(struct ocrdma_cq *) * + OCRDMA_MAX_CQ, GFP_KERNEL); + if (!dev->cq_tbl) + goto alloc_err; + + if (dev->attr.max_qp) { + dev->qp_tbl = kzalloc(sizeof(struct ocrdma_qp *) * + OCRDMA_MAX_QP, GFP_KERNEL); + if (!dev->qp_tbl) + goto alloc_err; + } + + dev->stag_arr = kzalloc(sizeof(u64) * OCRDMA_MAX_STAG, GFP_KERNEL); + if (dev->stag_arr == NULL) + goto alloc_err; + + spin_lock_init(&dev->av_tbl.lock); + spin_lock_init(&dev->flush_q_lock); + return 0; +alloc_err: + pr_err("%s(%d) error.\n", __func__, dev->id); + return -ENOMEM; +} + +static void ocrdma_free_resources(struct ocrdma_dev *dev) +{ + kfree(dev->stag_arr); + kfree(dev->qp_tbl); + kfree(dev->cq_tbl); + kfree(dev->sgid_tbl); +} + +/* OCRDMA sysfs interface */ +static ssize_t show_rev(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct ocrdma_dev *dev = dev_get_drvdata(device); + + return scnprintf(buf, PAGE_SIZE, "0x%x\n", dev->nic_info.pdev->vendor); +} + +static ssize_t show_fw_ver(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct ocrdma_dev *dev = dev_get_drvdata(device); + + return scnprintf(buf, PAGE_SIZE, "%s\n", &dev->attr.fw_ver[0]); +} + +static ssize_t show_hca_type(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct ocrdma_dev *dev = dev_get_drvdata(device); + + return scnprintf(buf, PAGE_SIZE, "%s\n", &dev->model_number[0]); +} + +static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL); +static DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL); +static DEVICE_ATTR(hca_type, S_IRUGO, show_hca_type, NULL); + +static struct device_attribute *ocrdma_attributes[] = { + &dev_attr_hw_rev, + &dev_attr_fw_ver, + &dev_attr_hca_type +}; + +static void ocrdma_remove_sysfiles(struct ocrdma_dev *dev) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(ocrdma_attributes); i++) + device_remove_file(&dev->ibdev.dev, ocrdma_attributes[i]); +} + +static void ocrdma_add_default_sgid(struct ocrdma_dev *dev) +{ + /* GID Index 0 - Invariant manufacturer-assigned EUI-64 */ + union ib_gid *sgid = &dev->sgid_tbl[0]; + + sgid->global.subnet_prefix = cpu_to_be64(0xfe80000000000000LL); + ocrdma_get_guid(dev, &sgid->raw[8]); +} + +static void ocrdma_init_ipv4_gids(struct ocrdma_dev *dev, + struct net_device *net) +{ + struct in_device *in_dev; + union ib_gid gid; + in_dev = in_dev_get(net); + if (in_dev) { + for_ifa(in_dev) { + ipv6_addr_set_v4mapped(ifa->ifa_address, + (struct in6_addr *)&gid); + ocrdma_add_sgid(dev, &gid); + } + endfor_ifa(in_dev); + in_dev_put(in_dev); + } +} + +static void ocrdma_init_ipv6_gids(struct ocrdma_dev *dev, + struct net_device *net) +{ +#if IS_ENABLED(CONFIG_IPV6) + struct inet6_dev *in6_dev; + union ib_gid *pgid; + struct inet6_ifaddr *ifp; + in6_dev = in6_dev_get(net); + if (in6_dev) { + read_lock_bh(&in6_dev->lock); + list_for_each_entry(ifp, &in6_dev->addr_list, if_list) { + pgid = (union ib_gid *)&ifp->addr; + ocrdma_add_sgid(dev, pgid); + } + read_unlock_bh(&in6_dev->lock); + in6_dev_put(in6_dev); + } +#endif +} + +static void ocrdma_init_gid_table(struct ocrdma_dev *dev) +{ + struct net_device *net_dev; + + for_each_netdev(&init_net, net_dev) { + struct net_device *real_dev = rdma_vlan_dev_real_dev(net_dev) ? + rdma_vlan_dev_real_dev(net_dev) : net_dev; + + if (real_dev == dev->nic_info.netdev) { + ocrdma_add_default_sgid(dev); + ocrdma_init_ipv4_gids(dev, net_dev); + ocrdma_init_ipv6_gids(dev, net_dev); + } + } +} + +static struct ocrdma_dev *ocrdma_add(struct be_dev_info *dev_info) +{ + int status = 0, i; + struct ocrdma_dev *dev; + + dev = (struct ocrdma_dev *)ib_alloc_device(sizeof(struct ocrdma_dev)); + if (!dev) { + pr_err("Unable to allocate ib device\n"); + return NULL; + } + dev->mbx_cmd = kzalloc(sizeof(struct ocrdma_mqe_emb_cmd), GFP_KERNEL); + if (!dev->mbx_cmd) + goto idr_err; + + memcpy(&dev->nic_info, dev_info, sizeof(*dev_info)); + dev->id = idr_alloc(&ocrdma_dev_id, NULL, 0, 0, GFP_KERNEL); + if (dev->id < 0) + goto idr_err; + + status = ocrdma_init_hw(dev); + if (status) + goto init_err; + + status = ocrdma_alloc_resources(dev); + if (status) + goto alloc_err; + + ocrdma_init_service_level(dev); + ocrdma_init_gid_table(dev); + status = ocrdma_register_device(dev); + if (status) + goto alloc_err; + + for (i = 0; i < ARRAY_SIZE(ocrdma_attributes); i++) + if (device_create_file(&dev->ibdev.dev, ocrdma_attributes[i])) + goto sysfs_err; + spin_lock(&ocrdma_devlist_lock); + list_add_tail_rcu(&dev->entry, &ocrdma_dev_list); + spin_unlock(&ocrdma_devlist_lock); + /* Init stats */ + ocrdma_add_port_stats(dev); + + pr_info("%s %s: %s \"%s\" port %d\n", + dev_name(&dev->nic_info.pdev->dev), hca_name(dev), + port_speed_string(dev), dev->model_number, + dev->hba_port_num); + pr_info("%s ocrdma%d driver loaded successfully\n", + dev_name(&dev->nic_info.pdev->dev), dev->id); + return dev; + +sysfs_err: + ocrdma_remove_sysfiles(dev); +alloc_err: + ocrdma_free_resources(dev); + ocrdma_cleanup_hw(dev); +init_err: + idr_remove(&ocrdma_dev_id, dev->id); +idr_err: + kfree(dev->mbx_cmd); + ib_dealloc_device(&dev->ibdev); + pr_err("%s() leaving. ret=%d\n", __func__, status); + return NULL; +} + +static void ocrdma_remove_free(struct rcu_head *rcu) +{ + struct ocrdma_dev *dev = container_of(rcu, struct ocrdma_dev, rcu); + + idr_remove(&ocrdma_dev_id, dev->id); + kfree(dev->mbx_cmd); + ib_dealloc_device(&dev->ibdev); +} + +static void ocrdma_remove(struct ocrdma_dev *dev) +{ + /* first unregister with stack to stop all the active traffic + * of the registered clients. + */ + ocrdma_rem_port_stats(dev); + ocrdma_remove_sysfiles(dev); + + ib_unregister_device(&dev->ibdev); + + spin_lock(&ocrdma_devlist_lock); + list_del_rcu(&dev->entry); + spin_unlock(&ocrdma_devlist_lock); + + ocrdma_free_resources(dev); + ocrdma_cleanup_hw(dev); + + call_rcu(&dev->rcu, ocrdma_remove_free); +} + +static int ocrdma_open(struct ocrdma_dev *dev) +{ + struct ib_event port_event; + + port_event.event = IB_EVENT_PORT_ACTIVE; + port_event.element.port_num = 1; + port_event.device = &dev->ibdev; + ib_dispatch_event(&port_event); + return 0; +} + +static int ocrdma_close(struct ocrdma_dev *dev) +{ + int i; + struct ocrdma_qp *qp, **cur_qp; + struct ib_event err_event; + struct ib_qp_attr attrs; + int attr_mask = IB_QP_STATE; + + attrs.qp_state = IB_QPS_ERR; + mutex_lock(&dev->dev_lock); + if (dev->qp_tbl) { + cur_qp = dev->qp_tbl; + for (i = 0; i < OCRDMA_MAX_QP; i++) { + qp = cur_qp[i]; + if (qp && qp->ibqp.qp_type != IB_QPT_GSI) { + /* change the QP state to ERROR */ + _ocrdma_modify_qp(&qp->ibqp, &attrs, attr_mask); + + err_event.event = IB_EVENT_QP_FATAL; + err_event.element.qp = &qp->ibqp; + err_event.device = &dev->ibdev; + ib_dispatch_event(&err_event); + } + } + } + mutex_unlock(&dev->dev_lock); + + err_event.event = IB_EVENT_PORT_ERR; + err_event.element.port_num = 1; + err_event.device = &dev->ibdev; + ib_dispatch_event(&err_event); + return 0; +} + +static void ocrdma_shutdown(struct ocrdma_dev *dev) +{ + ocrdma_close(dev); + ocrdma_remove(dev); +} + +/* event handling via NIC driver ensures that all the NIC specific + * initialization done before RoCE driver notifies + * event to stack. + */ +static void ocrdma_event_handler(struct ocrdma_dev *dev, u32 event) +{ + switch (event) { + case BE_DEV_UP: + ocrdma_open(dev); + break; + case BE_DEV_DOWN: + ocrdma_close(dev); + break; + case BE_DEV_SHUTDOWN: + ocrdma_shutdown(dev); + break; + } +} + +static struct ocrdma_driver ocrdma_drv = { + .name = "ocrdma_driver", + .add = ocrdma_add, + .remove = ocrdma_remove, + .state_change_handler = ocrdma_event_handler, + .be_abi_version = OCRDMA_BE_ROCE_ABI_VERSION, +}; + +static void ocrdma_unregister_inet6addr_notifier(void) +{ +#if IS_ENABLED(CONFIG_IPV6) + unregister_inet6addr_notifier(&ocrdma_inet6addr_notifier); +#endif +} + +static void ocrdma_unregister_inetaddr_notifier(void) +{ + unregister_inetaddr_notifier(&ocrdma_inetaddr_notifier); +} + +static int __init ocrdma_init_module(void) +{ + int status; + + ocrdma_init_debugfs(); + + status = register_inetaddr_notifier(&ocrdma_inetaddr_notifier); + if (status) + return status; + +#if IS_ENABLED(CONFIG_IPV6) + status = register_inet6addr_notifier(&ocrdma_inet6addr_notifier); + if (status) + goto err_notifier6; +#endif + + status = be_roce_register_driver(&ocrdma_drv); + if (status) + goto err_be_reg; + + return 0; + +err_be_reg: +#if IS_ENABLED(CONFIG_IPV6) + ocrdma_unregister_inet6addr_notifier(); +err_notifier6: +#endif + ocrdma_unregister_inetaddr_notifier(); + return status; +} + +static void __exit ocrdma_exit_module(void) +{ + be_roce_unregister_driver(&ocrdma_drv); + ocrdma_unregister_inet6addr_notifier(); + ocrdma_unregister_inetaddr_notifier(); + ocrdma_rem_debugfs(); +} + +module_init(ocrdma_init_module); +module_exit(ocrdma_exit_module); diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_sli.h b/drivers/infiniband/hw/ocrdma/ocrdma_sli.h new file mode 100644 index 0000000..4e03648 --- /dev/null +++ b/drivers/infiniband/hw/ocrdma/ocrdma_sli.h @@ -0,0 +1,2102 @@ +/******************************************************************* + * This file is part of the Emulex RoCE Device Driver for * + * RoCE (RDMA over Converged Ethernet) adapters. * + * Copyright (C) 2008-2012 Emulex. All rights reserved. * + * EMULEX and SLI are trademarks of Emulex. * + * www.emulex.com * + * * + * This program is free software; you can redistribute it and/or * + * modify it under the terms of version 2 of the GNU General * + * Public License as published by the Free Software Foundation. * + * This program is distributed in the hope that it will be useful. * + * ALL EXPRESS OR IMPLIED CONDITIONS, REPRESENTATIONS AND * + * WARRANTIES, INCLUDING ANY IMPLIED WARRANTY OF MERCHANTABILITY, * + * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT, ARE * + * DISCLAIMED, EXCEPT TO THE EXTENT THAT SUCH DISCLAIMERS ARE HELD * + * TO BE LEGALLY INVALID. See the GNU General Public License for * + * more details, a copy of which can be found in the file COPYING * + * included with this package. * + * + * Contact Information: + * linux-drivers@emulex.com + * + * Emulex + * 3333 Susan Street + * Costa Mesa, CA 92626 + *******************************************************************/ + +#ifndef __OCRDMA_SLI_H__ +#define __OCRDMA_SLI_H__ + +enum { + OCRDMA_ASIC_GEN_SKH_R = 0x04, + OCRDMA_ASIC_GEN_LANCER = 0x0B +}; + +enum { + OCRDMA_ASIC_REV_A0 = 0x00, + OCRDMA_ASIC_REV_B0 = 0x10, + OCRDMA_ASIC_REV_C0 = 0x20 +}; + +#define OCRDMA_SUBSYS_ROCE 10 +enum { + OCRDMA_CMD_QUERY_CONFIG = 1, + OCRDMA_CMD_ALLOC_PD = 2, + OCRDMA_CMD_DEALLOC_PD = 3, + + OCRDMA_CMD_CREATE_AH_TBL = 4, + OCRDMA_CMD_DELETE_AH_TBL = 5, + + OCRDMA_CMD_CREATE_QP = 6, + OCRDMA_CMD_QUERY_QP = 7, + OCRDMA_CMD_MODIFY_QP = 8 , + OCRDMA_CMD_DELETE_QP = 9, + + OCRDMA_CMD_RSVD1 = 10, + OCRDMA_CMD_ALLOC_LKEY = 11, + OCRDMA_CMD_DEALLOC_LKEY = 12, + OCRDMA_CMD_REGISTER_NSMR = 13, + OCRDMA_CMD_REREGISTER_NSMR = 14, + OCRDMA_CMD_REGISTER_NSMR_CONT = 15, + OCRDMA_CMD_QUERY_NSMR = 16, + OCRDMA_CMD_ALLOC_MW = 17, + OCRDMA_CMD_QUERY_MW = 18, + + OCRDMA_CMD_CREATE_SRQ = 19, + OCRDMA_CMD_QUERY_SRQ = 20, + OCRDMA_CMD_MODIFY_SRQ = 21, + OCRDMA_CMD_DELETE_SRQ = 22, + + OCRDMA_CMD_ATTACH_MCAST = 23, + OCRDMA_CMD_DETACH_MCAST = 24, + + OCRDMA_CMD_CREATE_RBQ = 25, + OCRDMA_CMD_DESTROY_RBQ = 26, + + OCRDMA_CMD_GET_RDMA_STATS = 27, + + OCRDMA_CMD_MAX +}; + +#define OCRDMA_SUBSYS_COMMON 1 +enum { + OCRDMA_CMD_QUERY_NTWK_LINK_CONFIG_V1 = 5, + OCRDMA_CMD_CREATE_CQ = 12, + OCRDMA_CMD_CREATE_EQ = 13, + OCRDMA_CMD_CREATE_MQ = 21, + OCRDMA_CMD_GET_CTRL_ATTRIBUTES = 32, + OCRDMA_CMD_GET_FW_VER = 35, + OCRDMA_CMD_DELETE_MQ = 53, + OCRDMA_CMD_DELETE_CQ = 54, + OCRDMA_CMD_DELETE_EQ = 55, + OCRDMA_CMD_GET_FW_CONFIG = 58, + OCRDMA_CMD_CREATE_MQ_EXT = 90, + OCRDMA_CMD_PHY_DETAILS = 102 +}; + +enum { + QTYPE_EQ = 1, + QTYPE_CQ = 2, + QTYPE_MCCQ = 3 +}; + +#define OCRDMA_MAX_SGID 8 + +#define OCRDMA_MAX_QP 2048 +#define OCRDMA_MAX_CQ 2048 +#define OCRDMA_MAX_STAG 16384 + +enum { + OCRDMA_DB_RQ_OFFSET = 0xE0, + OCRDMA_DB_GEN2_RQ_OFFSET = 0x100, + OCRDMA_DB_SQ_OFFSET = 0x60, + OCRDMA_DB_GEN2_SQ_OFFSET = 0x1C0, + OCRDMA_DB_SRQ_OFFSET = OCRDMA_DB_RQ_OFFSET, + OCRDMA_DB_GEN2_SRQ_OFFSET = OCRDMA_DB_GEN2_RQ_OFFSET, + OCRDMA_DB_CQ_OFFSET = 0x120, + OCRDMA_DB_EQ_OFFSET = OCRDMA_DB_CQ_OFFSET, + OCRDMA_DB_MQ_OFFSET = 0x140, + + OCRDMA_DB_SQ_SHIFT = 16, + OCRDMA_DB_RQ_SHIFT = 24 +}; + +#define OCRDMA_DB_CQ_RING_ID_MASK 0x3FF /* bits 0 - 9 */ +#define OCRDMA_DB_CQ_RING_ID_EXT_MASK 0x0C00 /* bits 10-11 of qid at 12-11 */ +/* qid #2 msbits at 12-11 */ +#define OCRDMA_DB_CQ_RING_ID_EXT_MASK_SHIFT 0x1 +#define OCRDMA_DB_CQ_NUM_POPPED_SHIFT 16 /* bits 16 - 28 */ +/* Rearm bit */ +#define OCRDMA_DB_CQ_REARM_SHIFT 29 /* bit 29 */ +/* solicited bit */ +#define OCRDMA_DB_CQ_SOLICIT_SHIFT 31 /* bit 31 */ + +#define OCRDMA_EQ_ID_MASK 0x1FF /* bits 0 - 8 */ +#define OCRDMA_EQ_ID_EXT_MASK 0x3e00 /* bits 9-13 */ +#define OCRDMA_EQ_ID_EXT_MASK_SHIFT 2 /* qid bits 9-13 at 11-15 */ + +/* Clear the interrupt for this eq */ +#define OCRDMA_EQ_CLR_SHIFT 9 /* bit 9 */ +/* Must be 1 */ +#define OCRDMA_EQ_TYPE_SHIFT 10 /* bit 10 */ +/* Number of event entries processed */ +#define OCRDMA_NUM_EQE_SHIFT 16 /* bits 16 - 28 */ +/* Rearm bit */ +#define OCRDMA_REARM_SHIFT 29 /* bit 29 */ + +#define OCRDMA_MQ_ID_MASK 0x7FF /* bits 0 - 10 */ +/* Number of entries posted */ +#define OCRDMA_MQ_NUM_MQE_SHIFT 16 /* bits 16 - 29 */ + +#define OCRDMA_MIN_HPAGE_SIZE 4096 + +#define OCRDMA_MIN_Q_PAGE_SIZE 4096 +#define OCRDMA_MAX_Q_PAGES 8 + +#define OCRDMA_SLI_ASIC_ID_OFFSET 0x9C +#define OCRDMA_SLI_ASIC_REV_MASK 0x000000FF +#define OCRDMA_SLI_ASIC_GEN_NUM_MASK 0x0000FF00 +#define OCRDMA_SLI_ASIC_GEN_NUM_SHIFT 0x08 +/* +# 0: 4K Bytes +# 1: 8K Bytes +# 2: 16K Bytes +# 3: 32K Bytes +# 4: 64K Bytes +# 5: 128K Bytes +# 6: 256K Bytes +# 7: 512K Bytes +*/ +#define OCRDMA_MAX_Q_PAGE_SIZE_CNT 8 +#define OCRDMA_Q_PAGE_BASE_SIZE (OCRDMA_MIN_Q_PAGE_SIZE * OCRDMA_MAX_Q_PAGES) + +#define MAX_OCRDMA_QP_PAGES 8 +#define OCRDMA_MAX_WQE_MEM_SIZE (MAX_OCRDMA_QP_PAGES * OCRDMA_MIN_HQ_PAGE_SIZE) + +#define OCRDMA_CREATE_CQ_MAX_PAGES 4 +#define OCRDMA_DPP_CQE_SIZE 4 + +#define OCRDMA_GEN2_MAX_CQE 1024 +#define OCRDMA_GEN2_CQ_PAGE_SIZE 4096 +#define OCRDMA_GEN2_WQE_SIZE 256 +#define OCRDMA_MAX_CQE 4095 +#define OCRDMA_CQ_PAGE_SIZE 16384 +#define OCRDMA_WQE_SIZE 128 +#define OCRDMA_WQE_STRIDE 8 +#define OCRDMA_WQE_ALIGN_BYTES 16 + +#define MAX_OCRDMA_SRQ_PAGES MAX_OCRDMA_QP_PAGES + +enum { + OCRDMA_MCH_OPCODE_SHIFT = 0, + OCRDMA_MCH_OPCODE_MASK = 0xFF, + OCRDMA_MCH_SUBSYS_SHIFT = 8, + OCRDMA_MCH_SUBSYS_MASK = 0xFF00 +}; + +/* mailbox cmd header */ +struct ocrdma_mbx_hdr { + u32 subsys_op; + u32 timeout; /* in seconds */ + u32 cmd_len; + u32 rsvd_version; +}; + +enum { + OCRDMA_MBX_RSP_OPCODE_SHIFT = 0, + OCRDMA_MBX_RSP_OPCODE_MASK = 0xFF, + OCRDMA_MBX_RSP_SUBSYS_SHIFT = 8, + OCRDMA_MBX_RSP_SUBSYS_MASK = 0xFF << OCRDMA_MBX_RSP_SUBSYS_SHIFT, + + OCRDMA_MBX_RSP_STATUS_SHIFT = 0, + OCRDMA_MBX_RSP_STATUS_MASK = 0xFF, + OCRDMA_MBX_RSP_ASTATUS_SHIFT = 8, + OCRDMA_MBX_RSP_ASTATUS_MASK = 0xFF << OCRDMA_MBX_RSP_ASTATUS_SHIFT +}; + +/* mailbox cmd response */ +struct ocrdma_mbx_rsp { + u32 subsys_op; + u32 status; + u32 rsp_len; + u32 add_rsp_len; +}; + +enum { + OCRDMA_MQE_EMBEDDED = 1, + OCRDMA_MQE_NONEMBEDDED = 0 +}; + +struct ocrdma_mqe_sge { + u32 pa_lo; + u32 pa_hi; + u32 len; +}; + +enum { + OCRDMA_MQE_HDR_EMB_SHIFT = 0, + OCRDMA_MQE_HDR_EMB_MASK = BIT(0), + OCRDMA_MQE_HDR_SGE_CNT_SHIFT = 3, + OCRDMA_MQE_HDR_SGE_CNT_MASK = 0x1F << OCRDMA_MQE_HDR_SGE_CNT_SHIFT, + OCRDMA_MQE_HDR_SPECIAL_SHIFT = 24, + OCRDMA_MQE_HDR_SPECIAL_MASK = 0xFF << OCRDMA_MQE_HDR_SPECIAL_SHIFT +}; + +struct ocrdma_mqe_hdr { + u32 spcl_sge_cnt_emb; + u32 pyld_len; + u32 tag_lo; + u32 tag_hi; + u32 rsvd3; +}; + +struct ocrdma_mqe_emb_cmd { + struct ocrdma_mbx_hdr mch; + u8 pyld[220]; +}; + +struct ocrdma_mqe { + struct ocrdma_mqe_hdr hdr; + union { + struct ocrdma_mqe_emb_cmd emb_req; + struct { + struct ocrdma_mqe_sge sge[19]; + } nonemb_req; + u8 cmd[236]; + struct ocrdma_mbx_rsp rsp; + } u; +}; + +#define OCRDMA_EQ_LEN 4096 +#define OCRDMA_MQ_CQ_LEN 256 +#define OCRDMA_MQ_LEN 128 + +#define PAGE_SHIFT_4K 12 +#define PAGE_SIZE_4K (1 << PAGE_SHIFT_4K) + +/* Returns number of pages spanned by the data starting at the given addr */ +#define PAGES_4K_SPANNED(_address, size) \ + ((u32)((((size_t)(_address) & (PAGE_SIZE_4K - 1)) + \ + (size) + (PAGE_SIZE_4K - 1)) >> PAGE_SHIFT_4K)) + +struct ocrdma_delete_q_req { + struct ocrdma_mbx_hdr req; + u32 id; +}; + +struct ocrdma_pa { + u32 lo; + u32 hi; +}; + +#define MAX_OCRDMA_EQ_PAGES 8 +struct ocrdma_create_eq_req { + struct ocrdma_mbx_hdr req; + u32 num_pages; + u32 valid; + u32 cnt; + u32 delay; + u32 rsvd; + struct ocrdma_pa pa[MAX_OCRDMA_EQ_PAGES]; +}; + +enum { + OCRDMA_CREATE_EQ_VALID = BIT(29), + OCRDMA_CREATE_EQ_CNT_SHIFT = 26, + OCRDMA_CREATE_CQ_DELAY_SHIFT = 13, +}; + +struct ocrdma_create_eq_rsp { + struct ocrdma_mbx_rsp rsp; + u32 vector_eqid; +}; + +#define OCRDMA_EQ_MINOR_OTHER 0x1 + +enum { + OCRDMA_MCQE_STATUS_SHIFT = 0, + OCRDMA_MCQE_STATUS_MASK = 0xFFFF, + OCRDMA_MCQE_ESTATUS_SHIFT = 16, + OCRDMA_MCQE_ESTATUS_MASK = 0xFFFF << OCRDMA_MCQE_ESTATUS_SHIFT, + OCRDMA_MCQE_CONS_SHIFT = 27, + OCRDMA_MCQE_CONS_MASK = BIT(27), + OCRDMA_MCQE_CMPL_SHIFT = 28, + OCRDMA_MCQE_CMPL_MASK = BIT(28), + OCRDMA_MCQE_AE_SHIFT = 30, + OCRDMA_MCQE_AE_MASK = BIT(30), + OCRDMA_MCQE_VALID_SHIFT = 31, + OCRDMA_MCQE_VALID_MASK = BIT(31) +}; + +struct ocrdma_mcqe { + u32 status; + u32 tag_lo; + u32 tag_hi; + u32 valid_ae_cmpl_cons; +}; + +enum { + OCRDMA_AE_MCQE_QPVALID = BIT(31), + OCRDMA_AE_MCQE_QPID_MASK = 0xFFFF, + + OCRDMA_AE_MCQE_CQVALID = BIT(31), + OCRDMA_AE_MCQE_CQID_MASK = 0xFFFF, + OCRDMA_AE_MCQE_VALID = BIT(31), + OCRDMA_AE_MCQE_AE = BIT(30), + OCRDMA_AE_MCQE_EVENT_TYPE_SHIFT = 16, + OCRDMA_AE_MCQE_EVENT_TYPE_MASK = + 0xFF << OCRDMA_AE_MCQE_EVENT_TYPE_SHIFT, + OCRDMA_AE_MCQE_EVENT_CODE_SHIFT = 8, + OCRDMA_AE_MCQE_EVENT_CODE_MASK = + 0xFF << OCRDMA_AE_MCQE_EVENT_CODE_SHIFT +}; +struct ocrdma_ae_mcqe { + u32 qpvalid_qpid; + u32 cqvalid_cqid; + u32 evt_tag; + u32 valid_ae_event; +}; + +enum { + OCRDMA_AE_PVID_MCQE_ENABLED_SHIFT = 0, + OCRDMA_AE_PVID_MCQE_ENABLED_MASK = 0xFF, + OCRDMA_AE_PVID_MCQE_TAG_SHIFT = 16, + OCRDMA_AE_PVID_MCQE_TAG_MASK = 0xFFFF << OCRDMA_AE_PVID_MCQE_TAG_SHIFT +}; + +struct ocrdma_ae_pvid_mcqe { + u32 tag_enabled; + u32 event_tag; + u32 rsvd1; + u32 rsvd2; +}; + +enum { + OCRDMA_AE_MPA_MCQE_REQ_ID_SHIFT = 16, + OCRDMA_AE_MPA_MCQE_REQ_ID_MASK = 0xFFFF << + OCRDMA_AE_MPA_MCQE_REQ_ID_SHIFT, + + OCRDMA_AE_MPA_MCQE_EVENT_CODE_SHIFT = 8, + OCRDMA_AE_MPA_MCQE_EVENT_CODE_MASK = 0xFF << + OCRDMA_AE_MPA_MCQE_EVENT_CODE_SHIFT, + OCRDMA_AE_MPA_MCQE_EVENT_TYPE_SHIFT = 16, + OCRDMA_AE_MPA_MCQE_EVENT_TYPE_MASK = 0xFF << + OCRDMA_AE_MPA_MCQE_EVENT_TYPE_SHIFT, + OCRDMA_AE_MPA_MCQE_EVENT_AE_SHIFT = 30, + OCRDMA_AE_MPA_MCQE_EVENT_AE_MASK = BIT(30), + OCRDMA_AE_MPA_MCQE_EVENT_VALID_SHIFT = 31, + OCRDMA_AE_MPA_MCQE_EVENT_VALID_MASK = BIT(31) +}; + +struct ocrdma_ae_mpa_mcqe { + u32 req_id; + u32 w1; + u32 w2; + u32 valid_ae_event; +}; + +enum { + OCRDMA_AE_QP_MCQE_NEW_QP_STATE_SHIFT = 0, + OCRDMA_AE_QP_MCQE_NEW_QP_STATE_MASK = 0xFFFF, + OCRDMA_AE_QP_MCQE_QP_ID_SHIFT = 16, + OCRDMA_AE_QP_MCQE_QP_ID_MASK = 0xFFFF << + OCRDMA_AE_QP_MCQE_QP_ID_SHIFT, + + OCRDMA_AE_QP_MCQE_EVENT_CODE_SHIFT = 8, + OCRDMA_AE_QP_MCQE_EVENT_CODE_MASK = 0xFF << + OCRDMA_AE_QP_MCQE_EVENT_CODE_SHIFT, + OCRDMA_AE_QP_MCQE_EVENT_TYPE_SHIFT = 16, + OCRDMA_AE_QP_MCQE_EVENT_TYPE_MASK = 0xFF << + OCRDMA_AE_QP_MCQE_EVENT_TYPE_SHIFT, + OCRDMA_AE_QP_MCQE_EVENT_AE_SHIFT = 30, + OCRDMA_AE_QP_MCQE_EVENT_AE_MASK = BIT(30), + OCRDMA_AE_QP_MCQE_EVENT_VALID_SHIFT = 31, + OCRDMA_AE_QP_MCQE_EVENT_VALID_MASK = BIT(31) +}; + +struct ocrdma_ae_qp_mcqe { + u32 qp_id_state; + u32 w1; + u32 w2; + u32 valid_ae_event; +}; + +#define OCRDMA_ASYNC_RDMA_EVE_CODE 0x14 +#define OCRDMA_ASYNC_GRP5_EVE_CODE 0x5 + +enum ocrdma_async_grp5_events { + OCRDMA_ASYNC_EVENT_QOS_VALUE = 0x01, + OCRDMA_ASYNC_EVENT_COS_VALUE = 0x02, + OCRDMA_ASYNC_EVENT_PVID_STATE = 0x03 +}; + +enum OCRDMA_ASYNC_EVENT_TYPE { + OCRDMA_CQ_ERROR = 0x00, + OCRDMA_CQ_OVERRUN_ERROR = 0x01, + OCRDMA_CQ_QPCAT_ERROR = 0x02, + OCRDMA_QP_ACCESS_ERROR = 0x03, + OCRDMA_QP_COMM_EST_EVENT = 0x04, + OCRDMA_SQ_DRAINED_EVENT = 0x05, + OCRDMA_DEVICE_FATAL_EVENT = 0x08, + OCRDMA_SRQCAT_ERROR = 0x0E, + OCRDMA_SRQ_LIMIT_EVENT = 0x0F, + OCRDMA_QP_LAST_WQE_EVENT = 0x10 +}; + +/* mailbox command request and responses */ +enum { + OCRDMA_MBX_QUERY_CFG_CQ_OVERFLOW_SHIFT = 2, + OCRDMA_MBX_QUERY_CFG_CQ_OVERFLOW_MASK = BIT(2), + OCRDMA_MBX_QUERY_CFG_SRQ_SUPPORTED_SHIFT = 3, + OCRDMA_MBX_QUERY_CFG_SRQ_SUPPORTED_MASK = BIT(3), + OCRDMA_MBX_QUERY_CFG_MAX_QP_SHIFT = 8, + OCRDMA_MBX_QUERY_CFG_MAX_QP_MASK = 0xFFFFFF << + OCRDMA_MBX_QUERY_CFG_MAX_QP_SHIFT, + + OCRDMA_MBX_QUERY_CFG_MAX_PD_SHIFT = 16, + OCRDMA_MBX_QUERY_CFG_MAX_PD_MASK = 0xFFFF << + OCRDMA_MBX_QUERY_CFG_MAX_PD_SHIFT, + OCRDMA_MBX_QUERY_CFG_CA_ACK_DELAY_SHIFT = 8, + OCRDMA_MBX_QUERY_CFG_CA_ACK_DELAY_MASK = 0xFF << + OCRDMA_MBX_QUERY_CFG_CA_ACK_DELAY_SHIFT, + + OCRDMA_MBX_QUERY_CFG_MAX_SEND_SGE_SHIFT = 0, + OCRDMA_MBX_QUERY_CFG_MAX_SEND_SGE_MASK = 0xFFFF, + OCRDMA_MBX_QUERY_CFG_MAX_WRITE_SGE_SHIFT = 16, + OCRDMA_MBX_QUERY_CFG_MAX_WRITE_SGE_MASK = 0xFFFF << + OCRDMA_MBX_QUERY_CFG_MAX_WRITE_SGE_SHIFT, + + OCRDMA_MBX_QUERY_CFG_MAX_ORD_PER_QP_SHIFT = 0, + OCRDMA_MBX_QUERY_CFG_MAX_ORD_PER_QP_MASK = 0xFFFF, + OCRDMA_MBX_QUERY_CFG_MAX_IRD_PER_QP_SHIFT = 16, + OCRDMA_MBX_QUERY_CFG_MAX_IRD_PER_QP_MASK = 0xFFFF << + OCRDMA_MBX_QUERY_CFG_MAX_IRD_PER_QP_SHIFT, + + OCRDMA_MBX_QUERY_CFG_MAX_WQE_SIZE_OFFSET = 24, + OCRDMA_MBX_QUERY_CFG_MAX_WQE_SIZE_MASK = 0xFF << + OCRDMA_MBX_QUERY_CFG_MAX_WQE_SIZE_OFFSET, + OCRDMA_MBX_QUERY_CFG_MAX_RQE_SIZE_OFFSET = 16, + OCRDMA_MBX_QUERY_CFG_MAX_RQE_SIZE_MASK = 0xFF << + OCRDMA_MBX_QUERY_CFG_MAX_RQE_SIZE_OFFSET, + OCRDMA_MBX_QUERY_CFG_MAX_DPP_CQES_OFFSET = 0, + OCRDMA_MBX_QUERY_CFG_MAX_DPP_CQES_MASK = 0xFFFF << + OCRDMA_MBX_QUERY_CFG_MAX_DPP_CQES_OFFSET, + + OCRDMA_MBX_QUERY_CFG_MAX_SRQ_OFFSET = 16, + OCRDMA_MBX_QUERY_CFG_MAX_SRQ_MASK = 0xFFFF << + OCRDMA_MBX_QUERY_CFG_MAX_SRQ_OFFSET, + OCRDMA_MBX_QUERY_CFG_MAX_RPIR_QPS_OFFSET = 0, + OCRDMA_MBX_QUERY_CFG_MAX_RPIR_QPS_MASK = 0xFFFF << + OCRDMA_MBX_QUERY_CFG_MAX_RPIR_QPS_OFFSET, + + OCRDMA_MBX_QUERY_CFG_MAX_DPP_PDS_OFFSET = 16, + OCRDMA_MBX_QUERY_CFG_MAX_DPP_PDS_MASK = 0xFFFF << + OCRDMA_MBX_QUERY_CFG_MAX_DPP_PDS_OFFSET, + OCRDMA_MBX_QUERY_CFG_MAX_DPP_CREDITS_OFFSET = 0, + OCRDMA_MBX_QUERY_CFG_MAX_DPP_CREDITS_MASK = 0xFFFF << + OCRDMA_MBX_QUERY_CFG_MAX_DPP_CREDITS_OFFSET, + + OCRDMA_MBX_QUERY_CFG_MAX_DPP_QPS_OFFSET = 0, + OCRDMA_MBX_QUERY_CFG_MAX_DPP_QPS_MASK = 0xFFFF << + OCRDMA_MBX_QUERY_CFG_MAX_DPP_QPS_OFFSET, + + OCRDMA_MBX_QUERY_CFG_MAX_WQES_PER_WQ_OFFSET = 16, + OCRDMA_MBX_QUERY_CFG_MAX_WQES_PER_WQ_MASK = 0xFFFF << + OCRDMA_MBX_QUERY_CFG_MAX_WQES_PER_WQ_OFFSET, + OCRDMA_MBX_QUERY_CFG_MAX_RQES_PER_RQ_OFFSET = 0, + OCRDMA_MBX_QUERY_CFG_MAX_RQES_PER_RQ_MASK = 0xFFFF << + OCRDMA_MBX_QUERY_CFG_MAX_RQES_PER_RQ_OFFSET, + + OCRDMA_MBX_QUERY_CFG_MAX_CQ_OFFSET = 16, + OCRDMA_MBX_QUERY_CFG_MAX_CQ_MASK = 0xFFFF << + OCRDMA_MBX_QUERY_CFG_MAX_CQ_OFFSET, + OCRDMA_MBX_QUERY_CFG_MAX_CQES_PER_CQ_OFFSET = 0, + OCRDMA_MBX_QUERY_CFG_MAX_CQES_PER_CQ_MASK = 0xFFFF << + OCRDMA_MBX_QUERY_CFG_MAX_CQES_PER_CQ_OFFSET, + + OCRDMA_MBX_QUERY_CFG_MAX_SRQ_RQE_OFFSET = 16, + OCRDMA_MBX_QUERY_CFG_MAX_SRQ_RQE_MASK = 0xFFFF << + OCRDMA_MBX_QUERY_CFG_MAX_SRQ_RQE_OFFSET, + OCRDMA_MBX_QUERY_CFG_MAX_SRQ_SGE_OFFSET = 0, + OCRDMA_MBX_QUERY_CFG_MAX_SRQ_SGE_MASK = 0xFFFF << + OCRDMA_MBX_QUERY_CFG_MAX_SRQ_SGE_OFFSET, +}; + +struct ocrdma_mbx_query_config { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_rsp rsp; + u32 qp_srq_cq_ird_ord; + u32 max_pd_ca_ack_delay; + u32 max_write_send_sge; + u32 max_ird_ord_per_qp; + u32 max_shared_ird_ord; + u32 max_mr; + u32 max_mr_size_hi; + u32 max_mr_size_lo; + u32 max_num_mr_pbl; + u32 max_mw; + u32 max_fmr; + u32 max_pages_per_frmr; + u32 max_mcast_group; + u32 max_mcast_qp_attach; + u32 max_total_mcast_qp_attach; + u32 wqe_rqe_stride_max_dpp_cqs; + u32 max_srq_rpir_qps; + u32 max_dpp_pds_credits; + u32 max_dpp_credits_pds_per_pd; + u32 max_wqes_rqes_per_q; + u32 max_cq_cqes_per_cq; + u32 max_srq_rqe_sge; +}; + +struct ocrdma_fw_ver_rsp { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_rsp rsp; + + u8 running_ver[32]; +}; + +struct ocrdma_fw_conf_rsp { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_rsp rsp; + + u32 config_num; + u32 asic_revision; + u32 phy_port; + u32 fn_mode; + struct { + u32 mode; + u32 nic_wqid_base; + u32 nic_wq_tot; + u32 prot_wqid_base; + u32 prot_wq_tot; + u32 prot_rqid_base; + u32 prot_rqid_tot; + u32 rsvd[6]; + } ulp[2]; + u32 fn_capabilities; + u32 rsvd1; + u32 rsvd2; + u32 base_eqid; + u32 max_eq; + +}; + +enum { + OCRDMA_FN_MODE_RDMA = 0x4 +}; + +enum { + OCRDMA_IF_TYPE_MASK = 0xFFFF0000, + OCRDMA_IF_TYPE_SHIFT = 0x10, + OCRDMA_PHY_TYPE_MASK = 0x0000FFFF, + OCRDMA_FUTURE_DETAILS_MASK = 0xFFFF0000, + OCRDMA_FUTURE_DETAILS_SHIFT = 0x10, + OCRDMA_EX_PHY_DETAILS_MASK = 0x0000FFFF, + OCRDMA_FSPEED_SUPP_MASK = 0xFFFF0000, + OCRDMA_FSPEED_SUPP_SHIFT = 0x10, + OCRDMA_ASPEED_SUPP_MASK = 0x0000FFFF +}; + +struct ocrdma_get_phy_info_rsp { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_rsp rsp; + + u32 ityp_ptyp; + u32 misc_params; + u32 ftrdtl_exphydtl; + u32 fspeed_aspeed; + u32 future_use[2]; +}; + +enum { + OCRDMA_PHY_SPEED_ZERO = 0x0, + OCRDMA_PHY_SPEED_10MBPS = 0x1, + OCRDMA_PHY_SPEED_100MBPS = 0x2, + OCRDMA_PHY_SPEED_1GBPS = 0x4, + OCRDMA_PHY_SPEED_10GBPS = 0x8, + OCRDMA_PHY_SPEED_40GBPS = 0x20 +}; + +enum { + OCRDMA_PORT_NUM_MASK = 0x3F, + OCRDMA_PT_MASK = 0xC0, + OCRDMA_PT_SHIFT = 0x6, + OCRDMA_LINK_DUP_MASK = 0x0000FF00, + OCRDMA_LINK_DUP_SHIFT = 0x8, + OCRDMA_PHY_PS_MASK = 0x00FF0000, + OCRDMA_PHY_PS_SHIFT = 0x10, + OCRDMA_PHY_PFLT_MASK = 0xFF000000, + OCRDMA_PHY_PFLT_SHIFT = 0x18, + OCRDMA_QOS_LNKSP_MASK = 0xFFFF0000, + OCRDMA_QOS_LNKSP_SHIFT = 0x10, + OCRDMA_LLST_MASK = 0xFF, + OCRDMA_PLFC_MASK = 0x00000400, + OCRDMA_PLFC_SHIFT = 0x8, + OCRDMA_PLRFC_MASK = 0x00000200, + OCRDMA_PLRFC_SHIFT = 0x8, + OCRDMA_PLTFC_MASK = 0x00000100, + OCRDMA_PLTFC_SHIFT = 0x8 +}; + +struct ocrdma_get_link_speed_rsp { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_rsp rsp; + + u32 pflt_pps_ld_pnum; + u32 qos_lsp; + u32 res_lls; +}; + +enum { + OCRDMA_PHYS_LINK_SPEED_ZERO = 0x0, + OCRDMA_PHYS_LINK_SPEED_10MBPS = 0x1, + OCRDMA_PHYS_LINK_SPEED_100MBPS = 0x2, + OCRDMA_PHYS_LINK_SPEED_1GBPS = 0x3, + OCRDMA_PHYS_LINK_SPEED_10GBPS = 0x4, + OCRDMA_PHYS_LINK_SPEED_20GBPS = 0x5, + OCRDMA_PHYS_LINK_SPEED_25GBPS = 0x6, + OCRDMA_PHYS_LINK_SPEED_40GBPS = 0x7, + OCRDMA_PHYS_LINK_SPEED_100GBPS = 0x8 +}; + +enum { + OCRDMA_CREATE_CQ_VER2 = 2, + OCRDMA_CREATE_CQ_VER3 = 3, + + OCRDMA_CREATE_CQ_PAGE_CNT_MASK = 0xFFFF, + OCRDMA_CREATE_CQ_PAGE_SIZE_SHIFT = 16, + OCRDMA_CREATE_CQ_PAGE_SIZE_MASK = 0xFF, + + OCRDMA_CREATE_CQ_COALESCWM_SHIFT = 12, + OCRDMA_CREATE_CQ_COALESCWM_MASK = BIT(13) | BIT(12), + OCRDMA_CREATE_CQ_FLAGS_NODELAY = BIT(14), + OCRDMA_CREATE_CQ_FLAGS_AUTO_VALID = BIT(15), + + OCRDMA_CREATE_CQ_EQ_ID_MASK = 0xFFFF, + OCRDMA_CREATE_CQ_CQE_COUNT_MASK = 0xFFFF +}; + +enum { + OCRDMA_CREATE_CQ_VER0 = 0, + OCRDMA_CREATE_CQ_DPP = 1, + OCRDMA_CREATE_CQ_TYPE_SHIFT = 24, + OCRDMA_CREATE_CQ_EQID_SHIFT = 22, + + OCRDMA_CREATE_CQ_CNT_SHIFT = 27, + OCRDMA_CREATE_CQ_FLAGS_VALID = BIT(29), + OCRDMA_CREATE_CQ_FLAGS_EVENTABLE = BIT(31), + OCRDMA_CREATE_CQ_DEF_FLAGS = OCRDMA_CREATE_CQ_FLAGS_VALID | + OCRDMA_CREATE_CQ_FLAGS_EVENTABLE | + OCRDMA_CREATE_CQ_FLAGS_NODELAY +}; + +struct ocrdma_create_cq_cmd { + struct ocrdma_mbx_hdr req; + u32 pgsz_pgcnt; + u32 ev_cnt_flags; + u32 eqn; + u32 pdid_cqecnt; + u32 rsvd6; + struct ocrdma_pa pa[OCRDMA_CREATE_CQ_MAX_PAGES]; +}; + +struct ocrdma_create_cq { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_create_cq_cmd cmd; +}; + +enum { + OCRDMA_CREATE_CQ_CMD_PDID_SHIFT = 0x10 +}; + +enum { + OCRDMA_CREATE_CQ_RSP_CQ_ID_MASK = 0xFFFF +}; + +struct ocrdma_create_cq_cmd_rsp { + struct ocrdma_mbx_rsp rsp; + u32 cq_id; +}; + +struct ocrdma_create_cq_rsp { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_create_cq_cmd_rsp rsp; +}; + +enum { + OCRDMA_CREATE_MQ_V0_CQ_ID_SHIFT = 22, + OCRDMA_CREATE_MQ_CQ_ID_SHIFT = 16, + OCRDMA_CREATE_MQ_RING_SIZE_SHIFT = 16, + OCRDMA_CREATE_MQ_VALID = BIT(31), + OCRDMA_CREATE_MQ_ASYNC_CQ_VALID = BIT(0) +}; + +struct ocrdma_create_mq_req { + struct ocrdma_mbx_hdr req; + u32 cqid_pages; + u32 async_event_bitmap; + u32 async_cqid_ringsize; + u32 valid; + u32 async_cqid_valid; + u32 rsvd; + struct ocrdma_pa pa[8]; +}; + +struct ocrdma_create_mq_rsp { + struct ocrdma_mbx_rsp rsp; + u32 id; +}; + +enum { + OCRDMA_DESTROY_CQ_QID_SHIFT = 0, + OCRDMA_DESTROY_CQ_QID_MASK = 0xFFFF, + OCRDMA_DESTROY_CQ_QID_BYPASS_FLUSH_SHIFT = 16, + OCRDMA_DESTROY_CQ_QID_BYPASS_FLUSH_MASK = 0xFFFF << + OCRDMA_DESTROY_CQ_QID_BYPASS_FLUSH_SHIFT +}; + +struct ocrdma_destroy_cq { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_hdr req; + + u32 bypass_flush_qid; +}; + +struct ocrdma_destroy_cq_rsp { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_rsp rsp; +}; + +enum { + OCRDMA_QPT_GSI = 1, + OCRDMA_QPT_RC = 2, + OCRDMA_QPT_UD = 4, +}; + +enum { + OCRDMA_CREATE_QP_REQ_PD_ID_SHIFT = 0, + OCRDMA_CREATE_QP_REQ_PD_ID_MASK = 0xFFFF, + OCRDMA_CREATE_QP_REQ_SQ_PAGE_SIZE_SHIFT = 16, + OCRDMA_CREATE_QP_REQ_RQ_PAGE_SIZE_SHIFT = 19, + OCRDMA_CREATE_QP_REQ_QPT_SHIFT = 29, + OCRDMA_CREATE_QP_REQ_QPT_MASK = BIT(31) | BIT(30) | BIT(29), + + OCRDMA_CREATE_QP_REQ_MAX_RQE_SHIFT = 0, + OCRDMA_CREATE_QP_REQ_MAX_RQE_MASK = 0xFFFF, + OCRDMA_CREATE_QP_REQ_MAX_WQE_SHIFT = 16, + OCRDMA_CREATE_QP_REQ_MAX_WQE_MASK = 0xFFFF << + OCRDMA_CREATE_QP_REQ_MAX_WQE_SHIFT, + + OCRDMA_CREATE_QP_REQ_MAX_SGE_WRITE_SHIFT = 0, + OCRDMA_CREATE_QP_REQ_MAX_SGE_WRITE_MASK = 0xFFFF, + OCRDMA_CREATE_QP_REQ_MAX_SGE_SEND_SHIFT = 16, + OCRDMA_CREATE_QP_REQ_MAX_SGE_SEND_MASK = 0xFFFF << + OCRDMA_CREATE_QP_REQ_MAX_SGE_SEND_SHIFT, + + OCRDMA_CREATE_QP_REQ_FMR_EN_SHIFT = 0, + OCRDMA_CREATE_QP_REQ_FMR_EN_MASK = BIT(0), + OCRDMA_CREATE_QP_REQ_ZERO_LKEYEN_SHIFT = 1, + OCRDMA_CREATE_QP_REQ_ZERO_LKEYEN_MASK = BIT(1), + OCRDMA_CREATE_QP_REQ_BIND_MEMWIN_SHIFT = 2, + OCRDMA_CREATE_QP_REQ_BIND_MEMWIN_MASK = BIT(2), + OCRDMA_CREATE_QP_REQ_INB_WREN_SHIFT = 3, + OCRDMA_CREATE_QP_REQ_INB_WREN_MASK = BIT(3), + OCRDMA_CREATE_QP_REQ_INB_RDEN_SHIFT = 4, + OCRDMA_CREATE_QP_REQ_INB_RDEN_MASK = BIT(4), + OCRDMA_CREATE_QP_REQ_USE_SRQ_SHIFT = 5, + OCRDMA_CREATE_QP_REQ_USE_SRQ_MASK = BIT(5), + OCRDMA_CREATE_QP_REQ_ENABLE_RPIR_SHIFT = 6, + OCRDMA_CREATE_QP_REQ_ENABLE_RPIR_MASK = BIT(6), + OCRDMA_CREATE_QP_REQ_ENABLE_DPP_SHIFT = 7, + OCRDMA_CREATE_QP_REQ_ENABLE_DPP_MASK = BIT(7), + OCRDMA_CREATE_QP_REQ_ENABLE_DPP_CQ_SHIFT = 8, + OCRDMA_CREATE_QP_REQ_ENABLE_DPP_CQ_MASK = BIT(8), + OCRDMA_CREATE_QP_REQ_MAX_SGE_RECV_SHIFT = 16, + OCRDMA_CREATE_QP_REQ_MAX_SGE_RECV_MASK = 0xFFFF << + OCRDMA_CREATE_QP_REQ_MAX_SGE_RECV_SHIFT, + + OCRDMA_CREATE_QP_REQ_MAX_IRD_SHIFT = 0, + OCRDMA_CREATE_QP_REQ_MAX_IRD_MASK = 0xFFFF, + OCRDMA_CREATE_QP_REQ_MAX_ORD_SHIFT = 16, + OCRDMA_CREATE_QP_REQ_MAX_ORD_MASK = 0xFFFF << + OCRDMA_CREATE_QP_REQ_MAX_ORD_SHIFT, + + OCRDMA_CREATE_QP_REQ_NUM_RQ_PAGES_SHIFT = 0, + OCRDMA_CREATE_QP_REQ_NUM_RQ_PAGES_MASK = 0xFFFF, + OCRDMA_CREATE_QP_REQ_NUM_WQ_PAGES_SHIFT = 16, + OCRDMA_CREATE_QP_REQ_NUM_WQ_PAGES_MASK = 0xFFFF << + OCRDMA_CREATE_QP_REQ_NUM_WQ_PAGES_SHIFT, + + OCRDMA_CREATE_QP_REQ_RQE_SIZE_SHIFT = 0, + OCRDMA_CREATE_QP_REQ_RQE_SIZE_MASK = 0xFFFF, + OCRDMA_CREATE_QP_REQ_WQE_SIZE_SHIFT = 16, + OCRDMA_CREATE_QP_REQ_WQE_SIZE_MASK = 0xFFFF << + OCRDMA_CREATE_QP_REQ_WQE_SIZE_SHIFT, + + OCRDMA_CREATE_QP_REQ_RQ_CQID_SHIFT = 0, + OCRDMA_CREATE_QP_REQ_RQ_CQID_MASK = 0xFFFF, + OCRDMA_CREATE_QP_REQ_WQ_CQID_SHIFT = 16, + OCRDMA_CREATE_QP_REQ_WQ_CQID_MASK = 0xFFFF << + OCRDMA_CREATE_QP_REQ_WQ_CQID_SHIFT, + + OCRDMA_CREATE_QP_REQ_DPP_CQPID_SHIFT = 0, + OCRDMA_CREATE_QP_REQ_DPP_CQPID_MASK = 0xFFFF, + OCRDMA_CREATE_QP_REQ_DPP_CREDIT_SHIFT = 16, + OCRDMA_CREATE_QP_REQ_DPP_CREDIT_MASK = 0xFFFF << + OCRDMA_CREATE_QP_REQ_DPP_CREDIT_SHIFT +}; + +enum { + OCRDMA_CREATE_QP_REQ_DPP_CREDIT_LIMIT = 16, + OCRDMA_CREATE_QP_RSP_DPP_PAGE_SHIFT = 1 +}; + +#define MAX_OCRDMA_IRD_PAGES 4 + +enum ocrdma_qp_flags { + OCRDMA_QP_MW_BIND = 1, + OCRDMA_QP_LKEY0 = (1 << 1), + OCRDMA_QP_FAST_REG = (1 << 2), + OCRDMA_QP_INB_RD = (1 << 6), + OCRDMA_QP_INB_WR = (1 << 7), +}; + +enum ocrdma_qp_state { + OCRDMA_QPS_RST = 0, + OCRDMA_QPS_INIT = 1, + OCRDMA_QPS_RTR = 2, + OCRDMA_QPS_RTS = 3, + OCRDMA_QPS_SQE = 4, + OCRDMA_QPS_SQ_DRAINING = 5, + OCRDMA_QPS_ERR = 6, + OCRDMA_QPS_SQD = 7 +}; + +struct ocrdma_create_qp_req { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_hdr req; + + u32 type_pgsz_pdn; + u32 max_wqe_rqe; + u32 max_sge_send_write; + u32 max_sge_recv_flags; + u32 max_ord_ird; + u32 num_wq_rq_pages; + u32 wqe_rqe_size; + u32 wq_rq_cqid; + struct ocrdma_pa wq_addr[MAX_OCRDMA_QP_PAGES]; + struct ocrdma_pa rq_addr[MAX_OCRDMA_QP_PAGES]; + u32 dpp_credits_cqid; + u32 rpir_lkey; + struct ocrdma_pa ird_addr[MAX_OCRDMA_IRD_PAGES]; +}; + +enum { + OCRDMA_CREATE_QP_RSP_QP_ID_SHIFT = 0, + OCRDMA_CREATE_QP_RSP_QP_ID_MASK = 0xFFFF, + + OCRDMA_CREATE_QP_RSP_MAX_RQE_SHIFT = 0, + OCRDMA_CREATE_QP_RSP_MAX_RQE_MASK = 0xFFFF, + OCRDMA_CREATE_QP_RSP_MAX_WQE_SHIFT = 16, + OCRDMA_CREATE_QP_RSP_MAX_WQE_MASK = 0xFFFF << + OCRDMA_CREATE_QP_RSP_MAX_WQE_SHIFT, + + OCRDMA_CREATE_QP_RSP_MAX_SGE_WRITE_SHIFT = 0, + OCRDMA_CREATE_QP_RSP_MAX_SGE_WRITE_MASK = 0xFFFF, + OCRDMA_CREATE_QP_RSP_MAX_SGE_SEND_SHIFT = 16, + OCRDMA_CREATE_QP_RSP_MAX_SGE_SEND_MASK = 0xFFFF << + OCRDMA_CREATE_QP_RSP_MAX_SGE_SEND_SHIFT, + + OCRDMA_CREATE_QP_RSP_MAX_SGE_RECV_SHIFT = 16, + OCRDMA_CREATE_QP_RSP_MAX_SGE_RECV_MASK = 0xFFFF << + OCRDMA_CREATE_QP_RSP_MAX_SGE_RECV_SHIFT, + + OCRDMA_CREATE_QP_RSP_MAX_IRD_SHIFT = 0, + OCRDMA_CREATE_QP_RSP_MAX_IRD_MASK = 0xFFFF, + OCRDMA_CREATE_QP_RSP_MAX_ORD_SHIFT = 16, + OCRDMA_CREATE_QP_RSP_MAX_ORD_MASK = 0xFFFF << + OCRDMA_CREATE_QP_RSP_MAX_ORD_SHIFT, + + OCRDMA_CREATE_QP_RSP_RQ_ID_SHIFT = 0, + OCRDMA_CREATE_QP_RSP_RQ_ID_MASK = 0xFFFF, + OCRDMA_CREATE_QP_RSP_SQ_ID_SHIFT = 16, + OCRDMA_CREATE_QP_RSP_SQ_ID_MASK = 0xFFFF << + OCRDMA_CREATE_QP_RSP_SQ_ID_SHIFT, + + OCRDMA_CREATE_QP_RSP_DPP_ENABLED_MASK = BIT(0), + OCRDMA_CREATE_QP_RSP_DPP_PAGE_OFFSET_SHIFT = 1, + OCRDMA_CREATE_QP_RSP_DPP_PAGE_OFFSET_MASK = 0x7FFF << + OCRDMA_CREATE_QP_RSP_DPP_PAGE_OFFSET_SHIFT, + OCRDMA_CREATE_QP_RSP_DPP_CREDITS_SHIFT = 16, + OCRDMA_CREATE_QP_RSP_DPP_CREDITS_MASK = 0xFFFF << + OCRDMA_CREATE_QP_RSP_DPP_CREDITS_SHIFT, +}; + +struct ocrdma_create_qp_rsp { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_rsp rsp; + + u32 qp_id; + u32 max_wqe_rqe; + u32 max_sge_send_write; + u32 max_sge_recv; + u32 max_ord_ird; + u32 sq_rq_id; + u32 dpp_response; +}; + +struct ocrdma_destroy_qp { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_hdr req; + u32 qp_id; +}; + +struct ocrdma_destroy_qp_rsp { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_rsp rsp; +}; + +enum { + OCRDMA_MODIFY_QP_ID_SHIFT = 0, + OCRDMA_MODIFY_QP_ID_MASK = 0xFFFF, + + OCRDMA_QP_PARA_QPS_VALID = BIT(0), + OCRDMA_QP_PARA_SQD_ASYNC_VALID = BIT(1), + OCRDMA_QP_PARA_PKEY_VALID = BIT(2), + OCRDMA_QP_PARA_QKEY_VALID = BIT(3), + OCRDMA_QP_PARA_PMTU_VALID = BIT(4), + OCRDMA_QP_PARA_ACK_TO_VALID = BIT(5), + OCRDMA_QP_PARA_RETRY_CNT_VALID = BIT(6), + OCRDMA_QP_PARA_RRC_VALID = BIT(7), + OCRDMA_QP_PARA_RQPSN_VALID = BIT(8), + OCRDMA_QP_PARA_MAX_IRD_VALID = BIT(9), + OCRDMA_QP_PARA_MAX_ORD_VALID = BIT(10), + OCRDMA_QP_PARA_RNT_VALID = BIT(11), + OCRDMA_QP_PARA_SQPSN_VALID = BIT(12), + OCRDMA_QP_PARA_DST_QPN_VALID = BIT(13), + OCRDMA_QP_PARA_MAX_WQE_VALID = BIT(14), + OCRDMA_QP_PARA_MAX_RQE_VALID = BIT(15), + OCRDMA_QP_PARA_SGE_SEND_VALID = BIT(16), + OCRDMA_QP_PARA_SGE_RECV_VALID = BIT(17), + OCRDMA_QP_PARA_SGE_WR_VALID = BIT(18), + OCRDMA_QP_PARA_INB_RDEN_VALID = BIT(19), + OCRDMA_QP_PARA_INB_WREN_VALID = BIT(20), + OCRDMA_QP_PARA_FLOW_LBL_VALID = BIT(21), + OCRDMA_QP_PARA_BIND_EN_VALID = BIT(22), + OCRDMA_QP_PARA_ZLKEY_EN_VALID = BIT(23), + OCRDMA_QP_PARA_FMR_EN_VALID = BIT(24), + OCRDMA_QP_PARA_INBAT_EN_VALID = BIT(25), + OCRDMA_QP_PARA_VLAN_EN_VALID = BIT(26), + + OCRDMA_MODIFY_QP_FLAGS_RD = BIT(0), + OCRDMA_MODIFY_QP_FLAGS_WR = BIT(1), + OCRDMA_MODIFY_QP_FLAGS_SEND = BIT(2), + OCRDMA_MODIFY_QP_FLAGS_ATOMIC = BIT(3) +}; + +enum { + OCRDMA_QP_PARAMS_SRQ_ID_SHIFT = 0, + OCRDMA_QP_PARAMS_SRQ_ID_MASK = 0xFFFF, + + OCRDMA_QP_PARAMS_MAX_RQE_SHIFT = 0, + OCRDMA_QP_PARAMS_MAX_RQE_MASK = 0xFFFF, + OCRDMA_QP_PARAMS_MAX_WQE_SHIFT = 16, + OCRDMA_QP_PARAMS_MAX_WQE_MASK = 0xFFFF << + OCRDMA_QP_PARAMS_MAX_WQE_SHIFT, + + OCRDMA_QP_PARAMS_MAX_SGE_WRITE_SHIFT = 0, + OCRDMA_QP_PARAMS_MAX_SGE_WRITE_MASK = 0xFFFF, + OCRDMA_QP_PARAMS_MAX_SGE_SEND_SHIFT = 16, + OCRDMA_QP_PARAMS_MAX_SGE_SEND_MASK = 0xFFFF << + OCRDMA_QP_PARAMS_MAX_SGE_SEND_SHIFT, + + OCRDMA_QP_PARAMS_FLAGS_FMR_EN = BIT(0), + OCRDMA_QP_PARAMS_FLAGS_LKEY_0_EN = BIT(1), + OCRDMA_QP_PARAMS_FLAGS_BIND_MW_EN = BIT(2), + OCRDMA_QP_PARAMS_FLAGS_INBWR_EN = BIT(3), + OCRDMA_QP_PARAMS_FLAGS_INBRD_EN = BIT(4), + OCRDMA_QP_PARAMS_STATE_SHIFT = 5, + OCRDMA_QP_PARAMS_STATE_MASK = BIT(5) | BIT(6) | BIT(7), + OCRDMA_QP_PARAMS_FLAGS_SQD_ASYNC = BIT(8), + OCRDMA_QP_PARAMS_FLAGS_INB_ATEN = BIT(9), + OCRDMA_QP_PARAMS_MAX_SGE_RECV_SHIFT = 16, + OCRDMA_QP_PARAMS_MAX_SGE_RECV_MASK = 0xFFFF << + OCRDMA_QP_PARAMS_MAX_SGE_RECV_SHIFT, + + OCRDMA_QP_PARAMS_MAX_IRD_SHIFT = 0, + OCRDMA_QP_PARAMS_MAX_IRD_MASK = 0xFFFF, + OCRDMA_QP_PARAMS_MAX_ORD_SHIFT = 16, + OCRDMA_QP_PARAMS_MAX_ORD_MASK = 0xFFFF << + OCRDMA_QP_PARAMS_MAX_ORD_SHIFT, + + OCRDMA_QP_PARAMS_RQ_CQID_SHIFT = 0, + OCRDMA_QP_PARAMS_RQ_CQID_MASK = 0xFFFF, + OCRDMA_QP_PARAMS_WQ_CQID_SHIFT = 16, + OCRDMA_QP_PARAMS_WQ_CQID_MASK = 0xFFFF << + OCRDMA_QP_PARAMS_WQ_CQID_SHIFT, + + OCRDMA_QP_PARAMS_RQ_PSN_SHIFT = 0, + OCRDMA_QP_PARAMS_RQ_PSN_MASK = 0xFFFFFF, + OCRDMA_QP_PARAMS_HOP_LMT_SHIFT = 24, + OCRDMA_QP_PARAMS_HOP_LMT_MASK = 0xFF << + OCRDMA_QP_PARAMS_HOP_LMT_SHIFT, + + OCRDMA_QP_PARAMS_SQ_PSN_SHIFT = 0, + OCRDMA_QP_PARAMS_SQ_PSN_MASK = 0xFFFFFF, + OCRDMA_QP_PARAMS_TCLASS_SHIFT = 24, + OCRDMA_QP_PARAMS_TCLASS_MASK = 0xFF << + OCRDMA_QP_PARAMS_TCLASS_SHIFT, + + OCRDMA_QP_PARAMS_DEST_QPN_SHIFT = 0, + OCRDMA_QP_PARAMS_DEST_QPN_MASK = 0xFFFFFF, + OCRDMA_QP_PARAMS_RNR_RETRY_CNT_SHIFT = 24, + OCRDMA_QP_PARAMS_RNR_RETRY_CNT_MASK = 0x7 << + OCRDMA_QP_PARAMS_RNR_RETRY_CNT_SHIFT, + OCRDMA_QP_PARAMS_ACK_TIMEOUT_SHIFT = 27, + OCRDMA_QP_PARAMS_ACK_TIMEOUT_MASK = 0x1F << + OCRDMA_QP_PARAMS_ACK_TIMEOUT_SHIFT, + + OCRDMA_QP_PARAMS_PKEY_IDNEX_SHIFT = 0, + OCRDMA_QP_PARAMS_PKEY_INDEX_MASK = 0xFFFF, + OCRDMA_QP_PARAMS_PATH_MTU_SHIFT = 18, + OCRDMA_QP_PARAMS_PATH_MTU_MASK = 0x3FFF << + OCRDMA_QP_PARAMS_PATH_MTU_SHIFT, + + OCRDMA_QP_PARAMS_FLOW_LABEL_SHIFT = 0, + OCRDMA_QP_PARAMS_FLOW_LABEL_MASK = 0xFFFFF, + OCRDMA_QP_PARAMS_SL_SHIFT = 20, + OCRDMA_QP_PARAMS_SL_MASK = 0xF << + OCRDMA_QP_PARAMS_SL_SHIFT, + OCRDMA_QP_PARAMS_RETRY_CNT_SHIFT = 24, + OCRDMA_QP_PARAMS_RETRY_CNT_MASK = 0x7 << + OCRDMA_QP_PARAMS_RETRY_CNT_SHIFT, + OCRDMA_QP_PARAMS_RNR_NAK_TIMER_SHIFT = 27, + OCRDMA_QP_PARAMS_RNR_NAK_TIMER_MASK = 0x1F << + OCRDMA_QP_PARAMS_RNR_NAK_TIMER_SHIFT, + + OCRDMA_QP_PARAMS_DMAC_B4_TO_B5_SHIFT = 0, + OCRDMA_QP_PARAMS_DMAC_B4_TO_B5_MASK = 0xFFFF, + OCRDMA_QP_PARAMS_VLAN_SHIFT = 16, + OCRDMA_QP_PARAMS_VLAN_MASK = 0xFFFF << + OCRDMA_QP_PARAMS_VLAN_SHIFT +}; + +struct ocrdma_qp_params { + u32 id; + u32 max_wqe_rqe; + u32 max_sge_send_write; + u32 max_sge_recv_flags; + u32 max_ord_ird; + u32 wq_rq_cqid; + u32 hop_lmt_rq_psn; + u32 tclass_sq_psn; + u32 ack_to_rnr_rtc_dest_qpn; + u32 path_mtu_pkey_indx; + u32 rnt_rc_sl_fl; + u8 sgid[16]; + u8 dgid[16]; + u32 dmac_b0_to_b3; + u32 vlan_dmac_b4_to_b5; + u32 qkey; +}; + + +struct ocrdma_modify_qp { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_hdr req; + + struct ocrdma_qp_params params; + u32 flags; + u32 rdma_flags; + u32 num_outstanding_atomic_rd; +}; + +enum { + OCRDMA_MODIFY_QP_RSP_MAX_RQE_SHIFT = 0, + OCRDMA_MODIFY_QP_RSP_MAX_RQE_MASK = 0xFFFF, + OCRDMA_MODIFY_QP_RSP_MAX_WQE_SHIFT = 16, + OCRDMA_MODIFY_QP_RSP_MAX_WQE_MASK = 0xFFFF << + OCRDMA_MODIFY_QP_RSP_MAX_WQE_SHIFT, + + OCRDMA_MODIFY_QP_RSP_MAX_IRD_SHIFT = 0, + OCRDMA_MODIFY_QP_RSP_MAX_IRD_MASK = 0xFFFF, + OCRDMA_MODIFY_QP_RSP_MAX_ORD_SHIFT = 16, + OCRDMA_MODIFY_QP_RSP_MAX_ORD_MASK = 0xFFFF << + OCRDMA_MODIFY_QP_RSP_MAX_ORD_SHIFT +}; + +struct ocrdma_modify_qp_rsp { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_rsp rsp; + + u32 max_wqe_rqe; + u32 max_ord_ird; +}; + +struct ocrdma_query_qp { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_hdr req; + +#define OCRDMA_QUERY_UP_QP_ID_SHIFT 0 +#define OCRDMA_QUERY_UP_QP_ID_MASK 0xFFFFFF + u32 qp_id; +}; + +struct ocrdma_query_qp_rsp { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_rsp rsp; + struct ocrdma_qp_params params; +}; + +enum { + OCRDMA_CREATE_SRQ_PD_ID_SHIFT = 0, + OCRDMA_CREATE_SRQ_PD_ID_MASK = 0xFFFF, + OCRDMA_CREATE_SRQ_PG_SZ_SHIFT = 16, + OCRDMA_CREATE_SRQ_PG_SZ_MASK = 0x3 << + OCRDMA_CREATE_SRQ_PG_SZ_SHIFT, + + OCRDMA_CREATE_SRQ_MAX_RQE_SHIFT = 0, + OCRDMA_CREATE_SRQ_MAX_SGE_RECV_SHIFT = 16, + OCRDMA_CREATE_SRQ_MAX_SGE_RECV_MASK = 0xFFFF << + OCRDMA_CREATE_SRQ_MAX_SGE_RECV_SHIFT, + + OCRDMA_CREATE_SRQ_RQE_SIZE_SHIFT = 0, + OCRDMA_CREATE_SRQ_RQE_SIZE_MASK = 0xFFFF, + OCRDMA_CREATE_SRQ_NUM_RQ_PAGES_SHIFT = 16, + OCRDMA_CREATE_SRQ_NUM_RQ_PAGES_MASK = 0xFFFF << + OCRDMA_CREATE_SRQ_NUM_RQ_PAGES_SHIFT +}; + +struct ocrdma_create_srq { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_hdr req; + + u32 pgsz_pdid; + u32 max_sge_rqe; + u32 pages_rqe_sz; + struct ocrdma_pa rq_addr[MAX_OCRDMA_SRQ_PAGES]; +}; + +enum { + OCRDMA_CREATE_SRQ_RSP_SRQ_ID_SHIFT = 0, + OCRDMA_CREATE_SRQ_RSP_SRQ_ID_MASK = 0xFFFFFF, + + OCRDMA_CREATE_SRQ_RSP_MAX_RQE_ALLOCATED_SHIFT = 0, + OCRDMA_CREATE_SRQ_RSP_MAX_RQE_ALLOCATED_MASK = 0xFFFF, + OCRDMA_CREATE_SRQ_RSP_MAX_SGE_RECV_ALLOCATED_SHIFT = 16, + OCRDMA_CREATE_SRQ_RSP_MAX_SGE_RECV_ALLOCATED_MASK = 0xFFFF << + OCRDMA_CREATE_SRQ_RSP_MAX_SGE_RECV_ALLOCATED_SHIFT +}; + +struct ocrdma_create_srq_rsp { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_rsp rsp; + + u32 id; + u32 max_sge_rqe_allocated; +}; + +enum { + OCRDMA_MODIFY_SRQ_ID_SHIFT = 0, + OCRDMA_MODIFY_SRQ_ID_MASK = 0xFFFFFF, + + OCRDMA_MODIFY_SRQ_MAX_RQE_SHIFT = 0, + OCRDMA_MODIFY_SRQ_MAX_RQE_MASK = 0xFFFF, + OCRDMA_MODIFY_SRQ_LIMIT_SHIFT = 16, + OCRDMA_MODIFY_SRQ__LIMIT_MASK = 0xFFFF << + OCRDMA_MODIFY_SRQ_LIMIT_SHIFT +}; + +struct ocrdma_modify_srq { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_rsp rep; + + u32 id; + u32 limit_max_rqe; +}; + +enum { + OCRDMA_QUERY_SRQ_ID_SHIFT = 0, + OCRDMA_QUERY_SRQ_ID_MASK = 0xFFFFFF +}; + +struct ocrdma_query_srq { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_rsp req; + + u32 id; +}; + +enum { + OCRDMA_QUERY_SRQ_RSP_PD_ID_SHIFT = 0, + OCRDMA_QUERY_SRQ_RSP_PD_ID_MASK = 0xFFFF, + OCRDMA_QUERY_SRQ_RSP_MAX_RQE_SHIFT = 16, + OCRDMA_QUERY_SRQ_RSP_MAX_RQE_MASK = 0xFFFF << + OCRDMA_QUERY_SRQ_RSP_MAX_RQE_SHIFT, + + OCRDMA_QUERY_SRQ_RSP_MAX_SGE_RECV_SHIFT = 0, + OCRDMA_QUERY_SRQ_RSP_MAX_SGE_RECV_MASK = 0xFFFF, + OCRDMA_QUERY_SRQ_RSP_SRQ_LIMIT_SHIFT = 16, + OCRDMA_QUERY_SRQ_RSP_SRQ_LIMIT_MASK = 0xFFFF << + OCRDMA_QUERY_SRQ_RSP_SRQ_LIMIT_SHIFT +}; + +struct ocrdma_query_srq_rsp { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_rsp req; + + u32 max_rqe_pdid; + u32 srq_lmt_max_sge; +}; + +enum { + OCRDMA_DESTROY_SRQ_ID_SHIFT = 0, + OCRDMA_DESTROY_SRQ_ID_MASK = 0xFFFFFF +}; + +struct ocrdma_destroy_srq { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_rsp req; + + u32 id; +}; + +enum { + OCRDMA_ALLOC_PD_ENABLE_DPP = BIT(16), + OCRDMA_DPP_PAGE_SIZE = 4096 +}; + +struct ocrdma_alloc_pd { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_hdr req; + u32 enable_dpp_rsvd; +}; + +enum { + OCRDMA_ALLOC_PD_RSP_DPP = BIT(16), + OCRDMA_ALLOC_PD_RSP_DPP_PAGE_SHIFT = 20, + OCRDMA_ALLOC_PD_RSP_PDID_MASK = 0xFFFF, +}; + +struct ocrdma_alloc_pd_rsp { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_rsp rsp; + u32 dpp_page_pdid; +}; + +struct ocrdma_dealloc_pd { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_hdr req; + u32 id; +}; + +struct ocrdma_dealloc_pd_rsp { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_rsp rsp; +}; + +enum { + OCRDMA_ADDR_CHECK_ENABLE = 1, + OCRDMA_ADDR_CHECK_DISABLE = 0 +}; + +enum { + OCRDMA_ALLOC_LKEY_PD_ID_SHIFT = 0, + OCRDMA_ALLOC_LKEY_PD_ID_MASK = 0xFFFF, + + OCRDMA_ALLOC_LKEY_ADDR_CHECK_SHIFT = 0, + OCRDMA_ALLOC_LKEY_ADDR_CHECK_MASK = BIT(0), + OCRDMA_ALLOC_LKEY_FMR_SHIFT = 1, + OCRDMA_ALLOC_LKEY_FMR_MASK = BIT(1), + OCRDMA_ALLOC_LKEY_REMOTE_INV_SHIFT = 2, + OCRDMA_ALLOC_LKEY_REMOTE_INV_MASK = BIT(2), + OCRDMA_ALLOC_LKEY_REMOTE_WR_SHIFT = 3, + OCRDMA_ALLOC_LKEY_REMOTE_WR_MASK = BIT(3), + OCRDMA_ALLOC_LKEY_REMOTE_RD_SHIFT = 4, + OCRDMA_ALLOC_LKEY_REMOTE_RD_MASK = BIT(4), + OCRDMA_ALLOC_LKEY_LOCAL_WR_SHIFT = 5, + OCRDMA_ALLOC_LKEY_LOCAL_WR_MASK = BIT(5), + OCRDMA_ALLOC_LKEY_REMOTE_ATOMIC_MASK = BIT(6), + OCRDMA_ALLOC_LKEY_REMOTE_ATOMIC_SHIFT = 6, + OCRDMA_ALLOC_LKEY_PBL_SIZE_SHIFT = 16, + OCRDMA_ALLOC_LKEY_PBL_SIZE_MASK = 0xFFFF << + OCRDMA_ALLOC_LKEY_PBL_SIZE_SHIFT +}; + +struct ocrdma_alloc_lkey { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_hdr req; + + u32 pdid; + u32 pbl_sz_flags; +}; + +struct ocrdma_alloc_lkey_rsp { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_rsp rsp; + + u32 lrkey; + u32 num_pbl_rsvd; +}; + +struct ocrdma_dealloc_lkey { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_hdr req; + + u32 lkey; + u32 rsvd_frmr; +}; + +struct ocrdma_dealloc_lkey_rsp { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_rsp rsp; +}; + +#define MAX_OCRDMA_NSMR_PBL (u32)22 +#define MAX_OCRDMA_PBL_SIZE 65536 +#define MAX_OCRDMA_PBL_PER_LKEY 32767 + +enum { + OCRDMA_REG_NSMR_LRKEY_INDEX_SHIFT = 0, + OCRDMA_REG_NSMR_LRKEY_INDEX_MASK = 0xFFFFFF, + OCRDMA_REG_NSMR_LRKEY_SHIFT = 24, + OCRDMA_REG_NSMR_LRKEY_MASK = 0xFF << + OCRDMA_REG_NSMR_LRKEY_SHIFT, + + OCRDMA_REG_NSMR_PD_ID_SHIFT = 0, + OCRDMA_REG_NSMR_PD_ID_MASK = 0xFFFF, + OCRDMA_REG_NSMR_NUM_PBL_SHIFT = 16, + OCRDMA_REG_NSMR_NUM_PBL_MASK = 0xFFFF << + OCRDMA_REG_NSMR_NUM_PBL_SHIFT, + + OCRDMA_REG_NSMR_PBE_SIZE_SHIFT = 0, + OCRDMA_REG_NSMR_PBE_SIZE_MASK = 0xFFFF, + OCRDMA_REG_NSMR_HPAGE_SIZE_SHIFT = 16, + OCRDMA_REG_NSMR_HPAGE_SIZE_MASK = 0xFF << + OCRDMA_REG_NSMR_HPAGE_SIZE_SHIFT, + OCRDMA_REG_NSMR_BIND_MEMWIN_SHIFT = 24, + OCRDMA_REG_NSMR_BIND_MEMWIN_MASK = BIT(24), + OCRDMA_REG_NSMR_ZB_SHIFT = 25, + OCRDMA_REG_NSMR_ZB_SHIFT_MASK = BIT(25), + OCRDMA_REG_NSMR_REMOTE_INV_SHIFT = 26, + OCRDMA_REG_NSMR_REMOTE_INV_MASK = BIT(26), + OCRDMA_REG_NSMR_REMOTE_WR_SHIFT = 27, + OCRDMA_REG_NSMR_REMOTE_WR_MASK = BIT(27), + OCRDMA_REG_NSMR_REMOTE_RD_SHIFT = 28, + OCRDMA_REG_NSMR_REMOTE_RD_MASK = BIT(28), + OCRDMA_REG_NSMR_LOCAL_WR_SHIFT = 29, + OCRDMA_REG_NSMR_LOCAL_WR_MASK = BIT(29), + OCRDMA_REG_NSMR_REMOTE_ATOMIC_SHIFT = 30, + OCRDMA_REG_NSMR_REMOTE_ATOMIC_MASK = BIT(30), + OCRDMA_REG_NSMR_LAST_SHIFT = 31, + OCRDMA_REG_NSMR_LAST_MASK = BIT(31) +}; + +struct ocrdma_reg_nsmr { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_hdr cmd; + + u32 fr_mr; + u32 num_pbl_pdid; + u32 flags_hpage_pbe_sz; + u32 totlen_low; + u32 totlen_high; + u32 fbo_low; + u32 fbo_high; + u32 va_loaddr; + u32 va_hiaddr; + struct ocrdma_pa pbl[MAX_OCRDMA_NSMR_PBL]; +}; + +enum { + OCRDMA_REG_NSMR_CONT_PBL_SHIFT = 0, + OCRDMA_REG_NSMR_CONT_PBL_SHIFT_MASK = 0xFFFF, + OCRDMA_REG_NSMR_CONT_NUM_PBL_SHIFT = 16, + OCRDMA_REG_NSMR_CONT_NUM_PBL_MASK = 0xFFFF << + OCRDMA_REG_NSMR_CONT_NUM_PBL_SHIFT, + + OCRDMA_REG_NSMR_CONT_LAST_SHIFT = 31, + OCRDMA_REG_NSMR_CONT_LAST_MASK = BIT(31) +}; + +struct ocrdma_reg_nsmr_cont { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_hdr cmd; + + u32 lrkey; + u32 num_pbl_offset; + u32 last; + + struct ocrdma_pa pbl[MAX_OCRDMA_NSMR_PBL]; +}; + +struct ocrdma_pbe { + u32 pa_hi; + u32 pa_lo; +}; + +enum { + OCRDMA_REG_NSMR_RSP_NUM_PBL_SHIFT = 16, + OCRDMA_REG_NSMR_RSP_NUM_PBL_MASK = 0xFFFF0000 +}; +struct ocrdma_reg_nsmr_rsp { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_rsp rsp; + + u32 lrkey; + u32 num_pbl; +}; + +enum { + OCRDMA_REG_NSMR_CONT_RSP_LRKEY_INDEX_SHIFT = 0, + OCRDMA_REG_NSMR_CONT_RSP_LRKEY_INDEX_MASK = 0xFFFFFF, + OCRDMA_REG_NSMR_CONT_RSP_LRKEY_SHIFT = 24, + OCRDMA_REG_NSMR_CONT_RSP_LRKEY_MASK = 0xFF << + OCRDMA_REG_NSMR_CONT_RSP_LRKEY_SHIFT, + + OCRDMA_REG_NSMR_CONT_RSP_NUM_PBL_SHIFT = 16, + OCRDMA_REG_NSMR_CONT_RSP_NUM_PBL_MASK = 0xFFFF << + OCRDMA_REG_NSMR_CONT_RSP_NUM_PBL_SHIFT +}; + +struct ocrdma_reg_nsmr_cont_rsp { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_rsp rsp; + + u32 lrkey_key_index; + u32 num_pbl; +}; + +enum { + OCRDMA_ALLOC_MW_PD_ID_SHIFT = 0, + OCRDMA_ALLOC_MW_PD_ID_MASK = 0xFFFF +}; + +struct ocrdma_alloc_mw { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_hdr req; + + u32 pdid; +}; + +enum { + OCRDMA_ALLOC_MW_RSP_LRKEY_INDEX_SHIFT = 0, + OCRDMA_ALLOC_MW_RSP_LRKEY_INDEX_MASK = 0xFFFFFF +}; + +struct ocrdma_alloc_mw_rsp { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_rsp rsp; + + u32 lrkey_index; +}; + +struct ocrdma_attach_mcast { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_hdr req; + u32 qp_id; + u8 mgid[16]; + u32 mac_b0_to_b3; + u32 vlan_mac_b4_to_b5; +}; + +struct ocrdma_attach_mcast_rsp { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_rsp rsp; +}; + +struct ocrdma_detach_mcast { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_hdr req; + u32 qp_id; + u8 mgid[16]; + u32 mac_b0_to_b3; + u32 vlan_mac_b4_to_b5; +}; + +struct ocrdma_detach_mcast_rsp { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_rsp rsp; +}; + +enum { + OCRDMA_CREATE_AH_NUM_PAGES_SHIFT = 19, + OCRDMA_CREATE_AH_NUM_PAGES_MASK = 0xF << + OCRDMA_CREATE_AH_NUM_PAGES_SHIFT, + + OCRDMA_CREATE_AH_PAGE_SIZE_SHIFT = 16, + OCRDMA_CREATE_AH_PAGE_SIZE_MASK = 0x7 << + OCRDMA_CREATE_AH_PAGE_SIZE_SHIFT, + + OCRDMA_CREATE_AH_ENTRY_SIZE_SHIFT = 23, + OCRDMA_CREATE_AH_ENTRY_SIZE_MASK = 0x1FF << + OCRDMA_CREATE_AH_ENTRY_SIZE_SHIFT, +}; + +#define OCRDMA_AH_TBL_PAGES 8 + +struct ocrdma_create_ah_tbl { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_hdr req; + + u32 ah_conf; + struct ocrdma_pa tbl_addr[8]; +}; + +struct ocrdma_create_ah_tbl_rsp { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_rsp rsp; + u32 ahid; +}; + +struct ocrdma_delete_ah_tbl { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_hdr req; + u32 ahid; +}; + +struct ocrdma_delete_ah_tbl_rsp { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_rsp rsp; +}; + +enum { + OCRDMA_EQE_VALID_SHIFT = 0, + OCRDMA_EQE_VALID_MASK = BIT(0), + OCRDMA_EQE_FOR_CQE_MASK = 0xFFFE, + OCRDMA_EQE_RESOURCE_ID_SHIFT = 16, + OCRDMA_EQE_RESOURCE_ID_MASK = 0xFFFF << + OCRDMA_EQE_RESOURCE_ID_SHIFT, +}; + +struct ocrdma_eqe { + u32 id_valid; +}; + +enum OCRDMA_CQE_STATUS { + OCRDMA_CQE_SUCCESS = 0, + OCRDMA_CQE_LOC_LEN_ERR, + OCRDMA_CQE_LOC_QP_OP_ERR, + OCRDMA_CQE_LOC_EEC_OP_ERR, + OCRDMA_CQE_LOC_PROT_ERR, + OCRDMA_CQE_WR_FLUSH_ERR, + OCRDMA_CQE_MW_BIND_ERR, + OCRDMA_CQE_BAD_RESP_ERR, + OCRDMA_CQE_LOC_ACCESS_ERR, + OCRDMA_CQE_REM_INV_REQ_ERR, + OCRDMA_CQE_REM_ACCESS_ERR, + OCRDMA_CQE_REM_OP_ERR, + OCRDMA_CQE_RETRY_EXC_ERR, + OCRDMA_CQE_RNR_RETRY_EXC_ERR, + OCRDMA_CQE_LOC_RDD_VIOL_ERR, + OCRDMA_CQE_REM_INV_RD_REQ_ERR, + OCRDMA_CQE_REM_ABORT_ERR, + OCRDMA_CQE_INV_EECN_ERR, + OCRDMA_CQE_INV_EEC_STATE_ERR, + OCRDMA_CQE_FATAL_ERR, + OCRDMA_CQE_RESP_TIMEOUT_ERR, + OCRDMA_CQE_GENERAL_ERR +}; + +enum { + /* w0 */ + OCRDMA_CQE_WQEIDX_SHIFT = 0, + OCRDMA_CQE_WQEIDX_MASK = 0xFFFF, + + /* w1 */ + OCRDMA_CQE_UD_XFER_LEN_SHIFT = 16, + OCRDMA_CQE_PKEY_SHIFT = 0, + OCRDMA_CQE_PKEY_MASK = 0xFFFF, + + /* w2 */ + OCRDMA_CQE_QPN_SHIFT = 0, + OCRDMA_CQE_QPN_MASK = 0x0000FFFF, + + OCRDMA_CQE_BUFTAG_SHIFT = 16, + OCRDMA_CQE_BUFTAG_MASK = 0xFFFF << OCRDMA_CQE_BUFTAG_SHIFT, + + /* w3 */ + OCRDMA_CQE_UD_STATUS_SHIFT = 24, + OCRDMA_CQE_UD_STATUS_MASK = 0x7 << OCRDMA_CQE_UD_STATUS_SHIFT, + OCRDMA_CQE_STATUS_SHIFT = 16, + OCRDMA_CQE_STATUS_MASK = 0xFF << OCRDMA_CQE_STATUS_SHIFT, + OCRDMA_CQE_VALID = BIT(31), + OCRDMA_CQE_INVALIDATE = BIT(30), + OCRDMA_CQE_QTYPE = BIT(29), + OCRDMA_CQE_IMM = BIT(28), + OCRDMA_CQE_WRITE_IMM = BIT(27), + OCRDMA_CQE_QTYPE_SQ = 0, + OCRDMA_CQE_QTYPE_RQ = 1, + OCRDMA_CQE_SRCQP_MASK = 0xFFFFFF +}; + +struct ocrdma_cqe { + union { + /* w0 to w2 */ + struct { + u32 wqeidx; + u32 bytes_xfered; + u32 qpn; + } wq; + struct { + u32 lkey_immdt; + u32 rxlen; + u32 buftag_qpn; + } rq; + struct { + u32 lkey_immdt; + u32 rxlen_pkey; + u32 buftag_qpn; + } ud; + struct { + u32 word_0; + u32 word_1; + u32 qpn; + } cmn; + }; + u32 flags_status_srcqpn; /* w3 */ +}; + +struct ocrdma_sge { + u32 addr_hi; + u32 addr_lo; + u32 lrkey; + u32 len; +}; + +enum { + OCRDMA_FLAG_SIG = 0x1, + OCRDMA_FLAG_INV = 0x2, + OCRDMA_FLAG_FENCE_L = 0x4, + OCRDMA_FLAG_FENCE_R = 0x8, + OCRDMA_FLAG_SOLICIT = 0x10, + OCRDMA_FLAG_IMM = 0x20, + + /* Stag flags */ + OCRDMA_LKEY_FLAG_LOCAL_WR = 0x1, + OCRDMA_LKEY_FLAG_REMOTE_RD = 0x2, + OCRDMA_LKEY_FLAG_REMOTE_WR = 0x4, + OCRDMA_LKEY_FLAG_VATO = 0x8, +}; + +enum OCRDMA_WQE_OPCODE { + OCRDMA_WRITE = 0x06, + OCRDMA_READ = 0x0C, + OCRDMA_RESV0 = 0x02, + OCRDMA_SEND = 0x00, + OCRDMA_CMP_SWP = 0x14, + OCRDMA_BIND_MW = 0x10, + OCRDMA_FR_MR = 0x11, + OCRDMA_RESV1 = 0x0A, + OCRDMA_LKEY_INV = 0x15, + OCRDMA_FETCH_ADD = 0x13, + OCRDMA_POST_RQ = 0x12 +}; + +enum { + OCRDMA_TYPE_INLINE = 0x0, + OCRDMA_TYPE_LKEY = 0x1, +}; + +enum { + OCRDMA_WQE_OPCODE_SHIFT = 0, + OCRDMA_WQE_OPCODE_MASK = 0x0000001F, + OCRDMA_WQE_FLAGS_SHIFT = 5, + OCRDMA_WQE_TYPE_SHIFT = 16, + OCRDMA_WQE_TYPE_MASK = 0x00030000, + OCRDMA_WQE_SIZE_SHIFT = 18, + OCRDMA_WQE_SIZE_MASK = 0xFF, + OCRDMA_WQE_NXT_WQE_SIZE_SHIFT = 25, + + OCRDMA_WQE_LKEY_FLAGS_SHIFT = 0, + OCRDMA_WQE_LKEY_FLAGS_MASK = 0xF +}; + +/* header WQE for all the SQ and RQ operations */ +struct ocrdma_hdr_wqe { + u32 cw; + union { + u32 rsvd_tag; + u32 rsvd_lkey_flags; + }; + union { + u32 immdt; + u32 lkey; + }; + u32 total_len; +}; + +struct ocrdma_ewqe_ud_hdr { + u32 rsvd_dest_qpn; + u32 qkey; + u32 rsvd_ahid; + u32 rsvd; +}; + +/* extended wqe followed by hdr_wqe for Fast Memory register */ +struct ocrdma_ewqe_fr { + u32 va_hi; + u32 va_lo; + u32 fbo_hi; + u32 fbo_lo; + u32 size_sge; + u32 num_sges; + u32 rsvd; + u32 rsvd2; +}; + +struct ocrdma_eth_basic { + u8 dmac[6]; + u8 smac[6]; + __be16 eth_type; +} __packed; + +struct ocrdma_eth_vlan { + u8 dmac[6]; + u8 smac[6]; + __be16 eth_type; + __be16 vlan_tag; +#define OCRDMA_ROCE_ETH_TYPE 0x8915 + __be16 roce_eth_type; +} __packed; + +struct ocrdma_grh { + __be32 tclass_flow; + __be32 pdid_hoplimit; + u8 sgid[16]; + u8 dgid[16]; + u16 rsvd; +} __packed; + +#define OCRDMA_AV_VALID BIT(7) +#define OCRDMA_AV_VLAN_VALID BIT(1) + +struct ocrdma_av { + struct ocrdma_eth_vlan eth_hdr; + struct ocrdma_grh grh; + u32 valid; +} __packed; + +struct ocrdma_rsrc_stats { + u32 dpp_pds; + u32 non_dpp_pds; + u32 rc_dpp_qps; + u32 uc_dpp_qps; + u32 ud_dpp_qps; + u32 rc_non_dpp_qps; + u32 rsvd; + u32 uc_non_dpp_qps; + u32 ud_non_dpp_qps; + u32 rsvd1; + u32 srqs; + u32 rbqs; + u32 r64K_nsmr; + u32 r64K_to_2M_nsmr; + u32 r2M_to_44M_nsmr; + u32 r44M_to_1G_nsmr; + u32 r1G_to_4G_nsmr; + u32 nsmr_count_4G_to_32G; + u32 r32G_to_64G_nsmr; + u32 r64G_to_128G_nsmr; + u32 r128G_to_higher_nsmr; + u32 embedded_nsmr; + u32 frmr; + u32 prefetch_qps; + u32 ondemand_qps; + u32 phy_mr; + u32 mw; + u32 rsvd2[7]; +}; + +struct ocrdma_db_err_stats { + u32 sq_doorbell_errors; + u32 cq_doorbell_errors; + u32 rq_srq_doorbell_errors; + u32 cq_overflow_errors; + u32 rsvd[4]; +}; + +struct ocrdma_wqe_stats { + u32 large_send_rc_wqes_lo; + u32 large_send_rc_wqes_hi; + u32 large_write_rc_wqes_lo; + u32 large_write_rc_wqes_hi; + u32 rsvd[4]; + u32 read_wqes_lo; + u32 read_wqes_hi; + u32 frmr_wqes_lo; + u32 frmr_wqes_hi; + u32 mw_bind_wqes_lo; + u32 mw_bind_wqes_hi; + u32 invalidate_wqes_lo; + u32 invalidate_wqes_hi; + u32 rsvd1[2]; + u32 dpp_wqe_drops; + u32 rsvd2[5]; +}; + +struct ocrdma_tx_stats { + u32 send_pkts_lo; + u32 send_pkts_hi; + u32 write_pkts_lo; + u32 write_pkts_hi; + u32 read_pkts_lo; + u32 read_pkts_hi; + u32 read_rsp_pkts_lo; + u32 read_rsp_pkts_hi; + u32 ack_pkts_lo; + u32 ack_pkts_hi; + u32 send_bytes_lo; + u32 send_bytes_hi; + u32 write_bytes_lo; + u32 write_bytes_hi; + u32 read_req_bytes_lo; + u32 read_req_bytes_hi; + u32 read_rsp_bytes_lo; + u32 read_rsp_bytes_hi; + u32 ack_timeouts; + u32 rsvd[5]; +}; + + +struct ocrdma_tx_qp_err_stats { + u32 local_length_errors; + u32 local_protection_errors; + u32 local_qp_operation_errors; + u32 retry_count_exceeded_errors; + u32 rnr_retry_count_exceeded_errors; + u32 rsvd[3]; +}; + +struct ocrdma_rx_stats { + u32 roce_frame_bytes_lo; + u32 roce_frame_bytes_hi; + u32 roce_frame_icrc_drops; + u32 roce_frame_payload_len_drops; + u32 ud_drops; + u32 qp1_drops; + u32 psn_error_request_packets; + u32 psn_error_resp_packets; + u32 rnr_nak_timeouts; + u32 rnr_nak_receives; + u32 roce_frame_rxmt_drops; + u32 nak_count_psn_sequence_errors; + u32 rc_drop_count_lookup_errors; + u32 rq_rnr_naks; + u32 srq_rnr_naks; + u32 roce_frames_lo; + u32 roce_frames_hi; + u32 rsvd; +}; + +struct ocrdma_rx_qp_err_stats { + u32 nak_invalid_requst_errors; + u32 nak_remote_operation_errors; + u32 nak_count_remote_access_errors; + u32 local_length_errors; + u32 local_protection_errors; + u32 local_qp_operation_errors; + u32 rsvd[2]; +}; + +struct ocrdma_tx_dbg_stats { + u32 data[100]; +}; + +struct ocrdma_rx_dbg_stats { + u32 data[200]; +}; + +struct ocrdma_rdma_stats_req { + struct ocrdma_mbx_hdr hdr; + u8 reset_stats; + u8 rsvd[3]; +} __packed; + +struct ocrdma_rdma_stats_resp { + struct ocrdma_mbx_hdr hdr; + struct ocrdma_rsrc_stats act_rsrc_stats; + struct ocrdma_rsrc_stats th_rsrc_stats; + struct ocrdma_db_err_stats db_err_stats; + struct ocrdma_wqe_stats wqe_stats; + struct ocrdma_tx_stats tx_stats; + struct ocrdma_tx_qp_err_stats tx_qp_err_stats; + struct ocrdma_rx_stats rx_stats; + struct ocrdma_rx_qp_err_stats rx_qp_err_stats; + struct ocrdma_tx_dbg_stats tx_dbg_stats; + struct ocrdma_rx_dbg_stats rx_dbg_stats; +} __packed; + +enum { + OCRDMA_HBA_ATTRB_EPROM_VER_LO_MASK = 0xFF, + OCRDMA_HBA_ATTRB_EPROM_VER_HI_MASK = 0xFF00, + OCRDMA_HBA_ATTRB_EPROM_VER_HI_SHIFT = 0x08, + OCRDMA_HBA_ATTRB_CDBLEN_MASK = 0xFFFF, + OCRDMA_HBA_ATTRB_ASIC_REV_MASK = 0xFF0000, + OCRDMA_HBA_ATTRB_ASIC_REV_SHIFT = 0x10, + OCRDMA_HBA_ATTRB_GUID0_MASK = 0xFF000000, + OCRDMA_HBA_ATTRB_GUID0_SHIFT = 0x18, + OCRDMA_HBA_ATTRB_GUID13_MASK = 0xFF, + OCRDMA_HBA_ATTRB_GUID14_MASK = 0xFF00, + OCRDMA_HBA_ATTRB_GUID14_SHIFT = 0x08, + OCRDMA_HBA_ATTRB_GUID15_MASK = 0xFF0000, + OCRDMA_HBA_ATTRB_GUID15_SHIFT = 0x10, + OCRDMA_HBA_ATTRB_PCNT_MASK = 0xFF000000, + OCRDMA_HBA_ATTRB_PCNT_SHIFT = 0x18, + OCRDMA_HBA_ATTRB_LDTOUT_MASK = 0xFFFF, + OCRDMA_HBA_ATTRB_ISCSI_VER_MASK = 0xFF0000, + OCRDMA_HBA_ATTRB_ISCSI_VER_SHIFT = 0x10, + OCRDMA_HBA_ATTRB_MFUNC_DEV_MASK = 0xFF000000, + OCRDMA_HBA_ATTRB_MFUNC_DEV_SHIFT = 0x18, + OCRDMA_HBA_ATTRB_CV_MASK = 0xFF, + OCRDMA_HBA_ATTRB_HBA_ST_MASK = 0xFF00, + OCRDMA_HBA_ATTRB_HBA_ST_SHIFT = 0x08, + OCRDMA_HBA_ATTRB_MAX_DOMS_MASK = 0xFF0000, + OCRDMA_HBA_ATTRB_MAX_DOMS_SHIFT = 0x10, + OCRDMA_HBA_ATTRB_PTNUM_MASK = 0x3F000000, + OCRDMA_HBA_ATTRB_PTNUM_SHIFT = 0x18, + OCRDMA_HBA_ATTRB_PT_MASK = 0xC0000000, + OCRDMA_HBA_ATTRB_PT_SHIFT = 0x1E, + OCRDMA_HBA_ATTRB_ISCSI_FET_MASK = 0xFF, + OCRDMA_HBA_ATTRB_ASIC_GEN_MASK = 0xFF00, + OCRDMA_HBA_ATTRB_ASIC_GEN_SHIFT = 0x08, + OCRDMA_HBA_ATTRB_PCI_VID_MASK = 0xFFFF, + OCRDMA_HBA_ATTRB_PCI_DID_MASK = 0xFFFF0000, + OCRDMA_HBA_ATTRB_PCI_DID_SHIFT = 0x10, + OCRDMA_HBA_ATTRB_PCI_SVID_MASK = 0xFFFF, + OCRDMA_HBA_ATTRB_PCI_SSID_MASK = 0xFFFF0000, + OCRDMA_HBA_ATTRB_PCI_SSID_SHIFT = 0x10, + OCRDMA_HBA_ATTRB_PCI_BUSNUM_MASK = 0xFF, + OCRDMA_HBA_ATTRB_PCI_DEVNUM_MASK = 0xFF00, + OCRDMA_HBA_ATTRB_PCI_DEVNUM_SHIFT = 0x08, + OCRDMA_HBA_ATTRB_PCI_FUNCNUM_MASK = 0xFF0000, + OCRDMA_HBA_ATTRB_PCI_FUNCNUM_SHIFT = 0x10, + OCRDMA_HBA_ATTRB_IF_TYPE_MASK = 0xFF000000, + OCRDMA_HBA_ATTRB_IF_TYPE_SHIFT = 0x18, + OCRDMA_HBA_ATTRB_NETFIL_MASK =0xFF +}; + +struct mgmt_hba_attribs { + u8 flashrom_version_string[32]; + u8 manufacturer_name[32]; + u32 supported_modes; + u32 rsvd_eprom_verhi_verlo; + u32 mbx_ds_ver; + u32 epfw_ds_ver; + u8 ncsi_ver_string[12]; + u32 default_extended_timeout; + u8 controller_model_number[32]; + u8 controller_description[64]; + u8 controller_serial_number[32]; + u8 ip_version_string[32]; + u8 firmware_version_string[32]; + u8 bios_version_string[32]; + u8 redboot_version_string[32]; + u8 driver_version_string[32]; + u8 fw_on_flash_version_string[32]; + u32 functionalities_supported; + u32 guid0_asicrev_cdblen; + u8 generational_guid[12]; + u32 portcnt_guid15; + u32 mfuncdev_iscsi_ldtout; + u32 ptpnum_maxdoms_hbast_cv; + u32 firmware_post_status; + u32 hba_mtu[8]; + u32 res_asicgen_iscsi_feaures; + u32 rsvd1[3]; +}; + +struct mgmt_controller_attrib { + struct mgmt_hba_attribs hba_attribs; + u32 pci_did_vid; + u32 pci_ssid_svid; + u32 ityp_fnum_devnum_bnum; + u32 uid_hi; + u32 uid_lo; + u32 res_nnetfil; + u32 rsvd0[4]; +}; + +struct ocrdma_get_ctrl_attribs_rsp { + struct ocrdma_mbx_hdr hdr; + struct mgmt_controller_attrib ctrl_attribs; +}; + +#define OCRDMA_SUBSYS_DCBX 0x10 + +enum OCRDMA_DCBX_OPCODE { + OCRDMA_CMD_GET_DCBX_CONFIG = 0x01 +}; + +enum OCRDMA_DCBX_PARAM_TYPE { + OCRDMA_PARAMETER_TYPE_ADMIN = 0x00, + OCRDMA_PARAMETER_TYPE_OPER = 0x01, + OCRDMA_PARAMETER_TYPE_PEER = 0x02 +}; + +enum OCRDMA_DCBX_APP_PROTO { + OCRDMA_APP_PROTO_ROCE = 0x8915 +}; + +enum OCRDMA_DCBX_PROTO { + OCRDMA_PROTO_SELECT_L2 = 0x00, + OCRDMA_PROTO_SELECT_L4 = 0x01 +}; + +enum OCRDMA_DCBX_APP_PARAM { + OCRDMA_APP_PARAM_APP_PROTO_MASK = 0xFFFF, + OCRDMA_APP_PARAM_PROTO_SEL_MASK = 0xFF, + OCRDMA_APP_PARAM_PROTO_SEL_SHIFT = 0x10, + OCRDMA_APP_PARAM_VALID_MASK = 0xFF, + OCRDMA_APP_PARAM_VALID_SHIFT = 0x18 +}; + +enum OCRDMA_DCBX_STATE_FLAGS { + OCRDMA_STATE_FLAG_ENABLED = 0x01, + OCRDMA_STATE_FLAG_ADDVERTISED = 0x02, + OCRDMA_STATE_FLAG_WILLING = 0x04, + OCRDMA_STATE_FLAG_SYNC = 0x08, + OCRDMA_STATE_FLAG_UNSUPPORTED = 0x40000000, + OCRDMA_STATE_FLAG_NEG_FAILD = 0x80000000 +}; + +enum OCRDMA_TCV_AEV_OPV_ST { + OCRDMA_DCBX_TC_SUPPORT_MASK = 0xFF, + OCRDMA_DCBX_TC_SUPPORT_SHIFT = 0x18, + OCRDMA_DCBX_APP_ENTRY_SHIFT = 0x10, + OCRDMA_DCBX_OP_PARAM_SHIFT = 0x08, + OCRDMA_DCBX_STATE_MASK = 0xFF +}; + +struct ocrdma_app_parameter { + u32 valid_proto_app; + u32 oui; + u32 app_prio[2]; +}; + +struct ocrdma_dcbx_cfg { + u32 tcv_aev_opv_st; + u32 tc_state; + u32 pfc_state; + u32 qcn_state; + u32 appl_state; + u32 ll_state; + u32 tc_bw[2]; + u32 tc_prio[8]; + u32 pfc_prio[2]; + struct ocrdma_app_parameter app_param[15]; +}; + +struct ocrdma_get_dcbx_cfg_req { + struct ocrdma_mbx_hdr hdr; + u32 param_type; +} __packed; + +struct ocrdma_get_dcbx_cfg_rsp { + struct ocrdma_mbx_rsp hdr; + struct ocrdma_dcbx_cfg cfg; +} __packed; + +#endif /* __OCRDMA_SLI_H__ */ diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_stats.c b/drivers/infiniband/hw/ocrdma/ocrdma_stats.c new file mode 100644 index 0000000..41a9aec --- /dev/null +++ b/drivers/infiniband/hw/ocrdma/ocrdma_stats.c @@ -0,0 +1,616 @@ +/******************************************************************* + * This file is part of the Emulex RoCE Device Driver for * + * RoCE (RDMA over Converged Ethernet) adapters. * + * Copyright (C) 2008-2014 Emulex. All rights reserved. * + * EMULEX and SLI are trademarks of Emulex. * + * www.emulex.com * + * * + * This program is free software; you can redistribute it and/or * + * modify it under the terms of version 2 of the GNU General * + * Public License as published by the Free Software Foundation. * + * This program is distributed in the hope that it will be useful. * + * ALL EXPRESS OR IMPLIED CONDITIONS, REPRESENTATIONS AND * + * WARRANTIES, INCLUDING ANY IMPLIED WARRANTY OF MERCHANTABILITY, * + * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT, ARE * + * DISCLAIMED, EXCEPT TO THE EXTENT THAT SUCH DISCLAIMERS ARE HELD * + * TO BE LEGALLY INVALID. See the GNU General Public License for * + * more details, a copy of which can be found in the file COPYING * + * included with this package. * + * + * Contact Information: + * linux-drivers@emulex.com + * + * Emulex + * 3333 Susan Street + * Costa Mesa, CA 92626 + *******************************************************************/ + +#include +#include "ocrdma_stats.h" + +static struct dentry *ocrdma_dbgfs_dir; + +static int ocrdma_add_stat(char *start, char *pcur, + char *name, u64 count) +{ + char buff[128] = {0}; + int cpy_len = 0; + + snprintf(buff, 128, "%s: %llu\n", name, count); + cpy_len = strlen(buff); + + if (pcur + cpy_len > start + OCRDMA_MAX_DBGFS_MEM) { + pr_err("%s: No space in stats buff\n", __func__); + return 0; + } + + memcpy(pcur, buff, cpy_len); + return cpy_len; +} + +static bool ocrdma_alloc_stats_mem(struct ocrdma_dev *dev) +{ + struct stats_mem *mem = &dev->stats_mem; + + /* Alloc mbox command mem*/ + mem->size = max_t(u32, sizeof(struct ocrdma_rdma_stats_req), + sizeof(struct ocrdma_rdma_stats_resp)); + + mem->va = dma_alloc_coherent(&dev->nic_info.pdev->dev, mem->size, + &mem->pa, GFP_KERNEL); + if (!mem->va) { + pr_err("%s: stats mbox allocation failed\n", __func__); + return false; + } + + memset(mem->va, 0, mem->size); + + /* Alloc debugfs mem */ + mem->debugfs_mem = kzalloc(OCRDMA_MAX_DBGFS_MEM, GFP_KERNEL); + if (!mem->debugfs_mem) { + pr_err("%s: stats debugfs mem allocation failed\n", __func__); + return false; + } + + return true; +} + +static void ocrdma_release_stats_mem(struct ocrdma_dev *dev) +{ + struct stats_mem *mem = &dev->stats_mem; + + if (mem->va) + dma_free_coherent(&dev->nic_info.pdev->dev, mem->size, + mem->va, mem->pa); + kfree(mem->debugfs_mem); +} + +static char *ocrdma_resource_stats(struct ocrdma_dev *dev) +{ + char *stats = dev->stats_mem.debugfs_mem, *pcur; + struct ocrdma_rdma_stats_resp *rdma_stats = + (struct ocrdma_rdma_stats_resp *)dev->stats_mem.va; + struct ocrdma_rsrc_stats *rsrc_stats = &rdma_stats->act_rsrc_stats; + + memset(stats, 0, (OCRDMA_MAX_DBGFS_MEM)); + + pcur = stats; + pcur += ocrdma_add_stat(stats, pcur, "active_dpp_pds", + (u64)rsrc_stats->dpp_pds); + pcur += ocrdma_add_stat(stats, pcur, "active_non_dpp_pds", + (u64)rsrc_stats->non_dpp_pds); + pcur += ocrdma_add_stat(stats, pcur, "active_rc_dpp_qps", + (u64)rsrc_stats->rc_dpp_qps); + pcur += ocrdma_add_stat(stats, pcur, "active_uc_dpp_qps", + (u64)rsrc_stats->uc_dpp_qps); + pcur += ocrdma_add_stat(stats, pcur, "active_ud_dpp_qps", + (u64)rsrc_stats->ud_dpp_qps); + pcur += ocrdma_add_stat(stats, pcur, "active_rc_non_dpp_qps", + (u64)rsrc_stats->rc_non_dpp_qps); + pcur += ocrdma_add_stat(stats, pcur, "active_uc_non_dpp_qps", + (u64)rsrc_stats->uc_non_dpp_qps); + pcur += ocrdma_add_stat(stats, pcur, "active_ud_non_dpp_qps", + (u64)rsrc_stats->ud_non_dpp_qps); + pcur += ocrdma_add_stat(stats, pcur, "active_srqs", + (u64)rsrc_stats->srqs); + pcur += ocrdma_add_stat(stats, pcur, "active_rbqs", + (u64)rsrc_stats->rbqs); + pcur += ocrdma_add_stat(stats, pcur, "active_64K_nsmr", + (u64)rsrc_stats->r64K_nsmr); + pcur += ocrdma_add_stat(stats, pcur, "active_64K_to_2M_nsmr", + (u64)rsrc_stats->r64K_to_2M_nsmr); + pcur += ocrdma_add_stat(stats, pcur, "active_2M_to_44M_nsmr", + (u64)rsrc_stats->r2M_to_44M_nsmr); + pcur += ocrdma_add_stat(stats, pcur, "active_44M_to_1G_nsmr", + (u64)rsrc_stats->r44M_to_1G_nsmr); + pcur += ocrdma_add_stat(stats, pcur, "active_1G_to_4G_nsmr", + (u64)rsrc_stats->r1G_to_4G_nsmr); + pcur += ocrdma_add_stat(stats, pcur, "active_nsmr_count_4G_to_32G", + (u64)rsrc_stats->nsmr_count_4G_to_32G); + pcur += ocrdma_add_stat(stats, pcur, "active_32G_to_64G_nsmr", + (u64)rsrc_stats->r32G_to_64G_nsmr); + pcur += ocrdma_add_stat(stats, pcur, "active_64G_to_128G_nsmr", + (u64)rsrc_stats->r64G_to_128G_nsmr); + pcur += ocrdma_add_stat(stats, pcur, "active_128G_to_higher_nsmr", + (u64)rsrc_stats->r128G_to_higher_nsmr); + pcur += ocrdma_add_stat(stats, pcur, "active_embedded_nsmr", + (u64)rsrc_stats->embedded_nsmr); + pcur += ocrdma_add_stat(stats, pcur, "active_frmr", + (u64)rsrc_stats->frmr); + pcur += ocrdma_add_stat(stats, pcur, "active_prefetch_qps", + (u64)rsrc_stats->prefetch_qps); + pcur += ocrdma_add_stat(stats, pcur, "active_ondemand_qps", + (u64)rsrc_stats->ondemand_qps); + pcur += ocrdma_add_stat(stats, pcur, "active_phy_mr", + (u64)rsrc_stats->phy_mr); + pcur += ocrdma_add_stat(stats, pcur, "active_mw", + (u64)rsrc_stats->mw); + + /* Print the threshold stats */ + rsrc_stats = &rdma_stats->th_rsrc_stats; + + pcur += ocrdma_add_stat(stats, pcur, "threshold_dpp_pds", + (u64)rsrc_stats->dpp_pds); + pcur += ocrdma_add_stat(stats, pcur, "threshold_non_dpp_pds", + (u64)rsrc_stats->non_dpp_pds); + pcur += ocrdma_add_stat(stats, pcur, "threshold_rc_dpp_qps", + (u64)rsrc_stats->rc_dpp_qps); + pcur += ocrdma_add_stat(stats, pcur, "threshold_uc_dpp_qps", + (u64)rsrc_stats->uc_dpp_qps); + pcur += ocrdma_add_stat(stats, pcur, "threshold_ud_dpp_qps", + (u64)rsrc_stats->ud_dpp_qps); + pcur += ocrdma_add_stat(stats, pcur, "threshold_rc_non_dpp_qps", + (u64)rsrc_stats->rc_non_dpp_qps); + pcur += ocrdma_add_stat(stats, pcur, "threshold_uc_non_dpp_qps", + (u64)rsrc_stats->uc_non_dpp_qps); + pcur += ocrdma_add_stat(stats, pcur, "threshold_ud_non_dpp_qps", + (u64)rsrc_stats->ud_non_dpp_qps); + pcur += ocrdma_add_stat(stats, pcur, "threshold_srqs", + (u64)rsrc_stats->srqs); + pcur += ocrdma_add_stat(stats, pcur, "threshold_rbqs", + (u64)rsrc_stats->rbqs); + pcur += ocrdma_add_stat(stats, pcur, "threshold_64K_nsmr", + (u64)rsrc_stats->r64K_nsmr); + pcur += ocrdma_add_stat(stats, pcur, "threshold_64K_to_2M_nsmr", + (u64)rsrc_stats->r64K_to_2M_nsmr); + pcur += ocrdma_add_stat(stats, pcur, "threshold_2M_to_44M_nsmr", + (u64)rsrc_stats->r2M_to_44M_nsmr); + pcur += ocrdma_add_stat(stats, pcur, "threshold_44M_to_1G_nsmr", + (u64)rsrc_stats->r44M_to_1G_nsmr); + pcur += ocrdma_add_stat(stats, pcur, "threshold_1G_to_4G_nsmr", + (u64)rsrc_stats->r1G_to_4G_nsmr); + pcur += ocrdma_add_stat(stats, pcur, "threshold_nsmr_count_4G_to_32G", + (u64)rsrc_stats->nsmr_count_4G_to_32G); + pcur += ocrdma_add_stat(stats, pcur, "threshold_32G_to_64G_nsmr", + (u64)rsrc_stats->r32G_to_64G_nsmr); + pcur += ocrdma_add_stat(stats, pcur, "threshold_64G_to_128G_nsmr", + (u64)rsrc_stats->r64G_to_128G_nsmr); + pcur += ocrdma_add_stat(stats, pcur, "threshold_128G_to_higher_nsmr", + (u64)rsrc_stats->r128G_to_higher_nsmr); + pcur += ocrdma_add_stat(stats, pcur, "threshold_embedded_nsmr", + (u64)rsrc_stats->embedded_nsmr); + pcur += ocrdma_add_stat(stats, pcur, "threshold_frmr", + (u64)rsrc_stats->frmr); + pcur += ocrdma_add_stat(stats, pcur, "threshold_prefetch_qps", + (u64)rsrc_stats->prefetch_qps); + pcur += ocrdma_add_stat(stats, pcur, "threshold_ondemand_qps", + (u64)rsrc_stats->ondemand_qps); + pcur += ocrdma_add_stat(stats, pcur, "threshold_phy_mr", + (u64)rsrc_stats->phy_mr); + pcur += ocrdma_add_stat(stats, pcur, "threshold_mw", + (u64)rsrc_stats->mw); + return stats; +} + +static char *ocrdma_rx_stats(struct ocrdma_dev *dev) +{ + char *stats = dev->stats_mem.debugfs_mem, *pcur; + struct ocrdma_rdma_stats_resp *rdma_stats = + (struct ocrdma_rdma_stats_resp *)dev->stats_mem.va; + struct ocrdma_rx_stats *rx_stats = &rdma_stats->rx_stats; + + memset(stats, 0, (OCRDMA_MAX_DBGFS_MEM)); + + pcur = stats; + pcur += ocrdma_add_stat + (stats, pcur, "roce_frame_bytes", + convert_to_64bit(rx_stats->roce_frame_bytes_lo, + rx_stats->roce_frame_bytes_hi)); + pcur += ocrdma_add_stat(stats, pcur, "roce_frame_icrc_drops", + (u64)rx_stats->roce_frame_icrc_drops); + pcur += ocrdma_add_stat(stats, pcur, "roce_frame_payload_len_drops", + (u64)rx_stats->roce_frame_payload_len_drops); + pcur += ocrdma_add_stat(stats, pcur, "ud_drops", + (u64)rx_stats->ud_drops); + pcur += ocrdma_add_stat(stats, pcur, "qp1_drops", + (u64)rx_stats->qp1_drops); + pcur += ocrdma_add_stat(stats, pcur, "psn_error_request_packets", + (u64)rx_stats->psn_error_request_packets); + pcur += ocrdma_add_stat(stats, pcur, "psn_error_resp_packets", + (u64)rx_stats->psn_error_resp_packets); + pcur += ocrdma_add_stat(stats, pcur, "rnr_nak_timeouts", + (u64)rx_stats->rnr_nak_timeouts); + pcur += ocrdma_add_stat(stats, pcur, "rnr_nak_receives", + (u64)rx_stats->rnr_nak_receives); + pcur += ocrdma_add_stat(stats, pcur, "roce_frame_rxmt_drops", + (u64)rx_stats->roce_frame_rxmt_drops); + pcur += ocrdma_add_stat(stats, pcur, "nak_count_psn_sequence_errors", + (u64)rx_stats->nak_count_psn_sequence_errors); + pcur += ocrdma_add_stat(stats, pcur, "rc_drop_count_lookup_errors", + (u64)rx_stats->rc_drop_count_lookup_errors); + pcur += ocrdma_add_stat(stats, pcur, "rq_rnr_naks", + (u64)rx_stats->rq_rnr_naks); + pcur += ocrdma_add_stat(stats, pcur, "srq_rnr_naks", + (u64)rx_stats->srq_rnr_naks); + pcur += ocrdma_add_stat(stats, pcur, "roce_frames", + convert_to_64bit(rx_stats->roce_frames_lo, + rx_stats->roce_frames_hi)); + + return stats; +} + +static char *ocrdma_tx_stats(struct ocrdma_dev *dev) +{ + char *stats = dev->stats_mem.debugfs_mem, *pcur; + struct ocrdma_rdma_stats_resp *rdma_stats = + (struct ocrdma_rdma_stats_resp *)dev->stats_mem.va; + struct ocrdma_tx_stats *tx_stats = &rdma_stats->tx_stats; + + memset(stats, 0, (OCRDMA_MAX_DBGFS_MEM)); + + pcur = stats; + pcur += ocrdma_add_stat(stats, pcur, "send_pkts", + convert_to_64bit(tx_stats->send_pkts_lo, + tx_stats->send_pkts_hi)); + pcur += ocrdma_add_stat(stats, pcur, "write_pkts", + convert_to_64bit(tx_stats->write_pkts_lo, + tx_stats->write_pkts_hi)); + pcur += ocrdma_add_stat(stats, pcur, "read_pkts", + convert_to_64bit(tx_stats->read_pkts_lo, + tx_stats->read_pkts_hi)); + pcur += ocrdma_add_stat(stats, pcur, "read_rsp_pkts", + convert_to_64bit(tx_stats->read_rsp_pkts_lo, + tx_stats->read_rsp_pkts_hi)); + pcur += ocrdma_add_stat(stats, pcur, "ack_pkts", + convert_to_64bit(tx_stats->ack_pkts_lo, + tx_stats->ack_pkts_hi)); + pcur += ocrdma_add_stat(stats, pcur, "send_bytes", + convert_to_64bit(tx_stats->send_bytes_lo, + tx_stats->send_bytes_hi)); + pcur += ocrdma_add_stat(stats, pcur, "write_bytes", + convert_to_64bit(tx_stats->write_bytes_lo, + tx_stats->write_bytes_hi)); + pcur += ocrdma_add_stat(stats, pcur, "read_req_bytes", + convert_to_64bit(tx_stats->read_req_bytes_lo, + tx_stats->read_req_bytes_hi)); + pcur += ocrdma_add_stat(stats, pcur, "read_rsp_bytes", + convert_to_64bit(tx_stats->read_rsp_bytes_lo, + tx_stats->read_rsp_bytes_hi)); + pcur += ocrdma_add_stat(stats, pcur, "ack_timeouts", + (u64)tx_stats->ack_timeouts); + + return stats; +} + +static char *ocrdma_wqe_stats(struct ocrdma_dev *dev) +{ + char *stats = dev->stats_mem.debugfs_mem, *pcur; + struct ocrdma_rdma_stats_resp *rdma_stats = + (struct ocrdma_rdma_stats_resp *)dev->stats_mem.va; + struct ocrdma_wqe_stats *wqe_stats = &rdma_stats->wqe_stats; + + memset(stats, 0, (OCRDMA_MAX_DBGFS_MEM)); + + pcur = stats; + pcur += ocrdma_add_stat(stats, pcur, "large_send_rc_wqes", + convert_to_64bit(wqe_stats->large_send_rc_wqes_lo, + wqe_stats->large_send_rc_wqes_hi)); + pcur += ocrdma_add_stat(stats, pcur, "large_write_rc_wqes", + convert_to_64bit(wqe_stats->large_write_rc_wqes_lo, + wqe_stats->large_write_rc_wqes_hi)); + pcur += ocrdma_add_stat(stats, pcur, "read_wqes", + convert_to_64bit(wqe_stats->read_wqes_lo, + wqe_stats->read_wqes_hi)); + pcur += ocrdma_add_stat(stats, pcur, "frmr_wqes", + convert_to_64bit(wqe_stats->frmr_wqes_lo, + wqe_stats->frmr_wqes_hi)); + pcur += ocrdma_add_stat(stats, pcur, "mw_bind_wqes", + convert_to_64bit(wqe_stats->mw_bind_wqes_lo, + wqe_stats->mw_bind_wqes_hi)); + pcur += ocrdma_add_stat(stats, pcur, "invalidate_wqes", + convert_to_64bit(wqe_stats->invalidate_wqes_lo, + wqe_stats->invalidate_wqes_hi)); + pcur += ocrdma_add_stat(stats, pcur, "dpp_wqe_drops", + (u64)wqe_stats->dpp_wqe_drops); + return stats; +} + +static char *ocrdma_db_errstats(struct ocrdma_dev *dev) +{ + char *stats = dev->stats_mem.debugfs_mem, *pcur; + struct ocrdma_rdma_stats_resp *rdma_stats = + (struct ocrdma_rdma_stats_resp *)dev->stats_mem.va; + struct ocrdma_db_err_stats *db_err_stats = &rdma_stats->db_err_stats; + + memset(stats, 0, (OCRDMA_MAX_DBGFS_MEM)); + + pcur = stats; + pcur += ocrdma_add_stat(stats, pcur, "sq_doorbell_errors", + (u64)db_err_stats->sq_doorbell_errors); + pcur += ocrdma_add_stat(stats, pcur, "cq_doorbell_errors", + (u64)db_err_stats->cq_doorbell_errors); + pcur += ocrdma_add_stat(stats, pcur, "rq_srq_doorbell_errors", + (u64)db_err_stats->rq_srq_doorbell_errors); + pcur += ocrdma_add_stat(stats, pcur, "cq_overflow_errors", + (u64)db_err_stats->cq_overflow_errors); + return stats; +} + +static char *ocrdma_rxqp_errstats(struct ocrdma_dev *dev) +{ + char *stats = dev->stats_mem.debugfs_mem, *pcur; + struct ocrdma_rdma_stats_resp *rdma_stats = + (struct ocrdma_rdma_stats_resp *)dev->stats_mem.va; + struct ocrdma_rx_qp_err_stats *rx_qp_err_stats = + &rdma_stats->rx_qp_err_stats; + + memset(stats, 0, (OCRDMA_MAX_DBGFS_MEM)); + + pcur = stats; + pcur += ocrdma_add_stat(stats, pcur, "nak_invalid_requst_errors", + (u64)rx_qp_err_stats->nak_invalid_requst_errors); + pcur += ocrdma_add_stat(stats, pcur, "nak_remote_operation_errors", + (u64)rx_qp_err_stats->nak_remote_operation_errors); + pcur += ocrdma_add_stat(stats, pcur, "nak_count_remote_access_errors", + (u64)rx_qp_err_stats->nak_count_remote_access_errors); + pcur += ocrdma_add_stat(stats, pcur, "local_length_errors", + (u64)rx_qp_err_stats->local_length_errors); + pcur += ocrdma_add_stat(stats, pcur, "local_protection_errors", + (u64)rx_qp_err_stats->local_protection_errors); + pcur += ocrdma_add_stat(stats, pcur, "local_qp_operation_errors", + (u64)rx_qp_err_stats->local_qp_operation_errors); + return stats; +} + +static char *ocrdma_txqp_errstats(struct ocrdma_dev *dev) +{ + char *stats = dev->stats_mem.debugfs_mem, *pcur; + struct ocrdma_rdma_stats_resp *rdma_stats = + (struct ocrdma_rdma_stats_resp *)dev->stats_mem.va; + struct ocrdma_tx_qp_err_stats *tx_qp_err_stats = + &rdma_stats->tx_qp_err_stats; + + memset(stats, 0, (OCRDMA_MAX_DBGFS_MEM)); + + pcur = stats; + pcur += ocrdma_add_stat(stats, pcur, "local_length_errors", + (u64)tx_qp_err_stats->local_length_errors); + pcur += ocrdma_add_stat(stats, pcur, "local_protection_errors", + (u64)tx_qp_err_stats->local_protection_errors); + pcur += ocrdma_add_stat(stats, pcur, "local_qp_operation_errors", + (u64)tx_qp_err_stats->local_qp_operation_errors); + pcur += ocrdma_add_stat(stats, pcur, "retry_count_exceeded_errors", + (u64)tx_qp_err_stats->retry_count_exceeded_errors); + pcur += ocrdma_add_stat(stats, pcur, "rnr_retry_count_exceeded_errors", + (u64)tx_qp_err_stats->rnr_retry_count_exceeded_errors); + return stats; +} + +static char *ocrdma_tx_dbg_stats(struct ocrdma_dev *dev) +{ + int i; + char *pstats = dev->stats_mem.debugfs_mem; + struct ocrdma_rdma_stats_resp *rdma_stats = + (struct ocrdma_rdma_stats_resp *)dev->stats_mem.va; + struct ocrdma_tx_dbg_stats *tx_dbg_stats = + &rdma_stats->tx_dbg_stats; + + memset(pstats, 0, (OCRDMA_MAX_DBGFS_MEM)); + + for (i = 0; i < 100; i++) + pstats += snprintf(pstats, 80, "DW[%d] = 0x%x\n", i, + tx_dbg_stats->data[i]); + + return dev->stats_mem.debugfs_mem; +} + +static char *ocrdma_rx_dbg_stats(struct ocrdma_dev *dev) +{ + int i; + char *pstats = dev->stats_mem.debugfs_mem; + struct ocrdma_rdma_stats_resp *rdma_stats = + (struct ocrdma_rdma_stats_resp *)dev->stats_mem.va; + struct ocrdma_rx_dbg_stats *rx_dbg_stats = + &rdma_stats->rx_dbg_stats; + + memset(pstats, 0, (OCRDMA_MAX_DBGFS_MEM)); + + for (i = 0; i < 200; i++) + pstats += snprintf(pstats, 80, "DW[%d] = 0x%x\n", i, + rx_dbg_stats->data[i]); + + return dev->stats_mem.debugfs_mem; +} + +static void ocrdma_update_stats(struct ocrdma_dev *dev) +{ + ulong now = jiffies, secs; + int status = 0; + + secs = jiffies_to_msecs(now - dev->last_stats_time) / 1000U; + if (secs) { + /* update */ + status = ocrdma_mbx_rdma_stats(dev, false); + if (status) + pr_err("%s: stats mbox failed with status = %d\n", + __func__, status); + dev->last_stats_time = jiffies; + } +} + +static ssize_t ocrdma_dbgfs_ops_read(struct file *filp, char __user *buffer, + size_t usr_buf_len, loff_t *ppos) +{ + struct ocrdma_stats *pstats = filp->private_data; + struct ocrdma_dev *dev = pstats->dev; + ssize_t status = 0; + char *data = NULL; + + /* No partial reads */ + if (*ppos != 0) + return 0; + + mutex_lock(&dev->stats_lock); + + ocrdma_update_stats(dev); + + switch (pstats->type) { + case OCRDMA_RSRC_STATS: + data = ocrdma_resource_stats(dev); + break; + case OCRDMA_RXSTATS: + data = ocrdma_rx_stats(dev); + break; + case OCRDMA_WQESTATS: + data = ocrdma_wqe_stats(dev); + break; + case OCRDMA_TXSTATS: + data = ocrdma_tx_stats(dev); + break; + case OCRDMA_DB_ERRSTATS: + data = ocrdma_db_errstats(dev); + break; + case OCRDMA_RXQP_ERRSTATS: + data = ocrdma_rxqp_errstats(dev); + break; + case OCRDMA_TXQP_ERRSTATS: + data = ocrdma_txqp_errstats(dev); + break; + case OCRDMA_TX_DBG_STATS: + data = ocrdma_tx_dbg_stats(dev); + break; + case OCRDMA_RX_DBG_STATS: + data = ocrdma_rx_dbg_stats(dev); + break; + + default: + status = -EFAULT; + goto exit; + } + + if (usr_buf_len < strlen(data)) { + status = -ENOSPC; + goto exit; + } + + status = simple_read_from_buffer(buffer, usr_buf_len, ppos, data, + strlen(data)); +exit: + mutex_unlock(&dev->stats_lock); + return status; +} + +static const struct file_operations ocrdma_dbg_ops = { + .owner = THIS_MODULE, + .open = simple_open, + .read = ocrdma_dbgfs_ops_read, +}; + +void ocrdma_add_port_stats(struct ocrdma_dev *dev) +{ + if (!ocrdma_dbgfs_dir) + return; + + /* Create post stats base dir */ + dev->dir = debugfs_create_dir(dev->ibdev.name, ocrdma_dbgfs_dir); + if (!dev->dir) + goto err; + + dev->rsrc_stats.type = OCRDMA_RSRC_STATS; + dev->rsrc_stats.dev = dev; + if (!debugfs_create_file("resource_stats", S_IRUSR, dev->dir, + &dev->rsrc_stats, &ocrdma_dbg_ops)) + goto err; + + dev->rx_stats.type = OCRDMA_RXSTATS; + dev->rx_stats.dev = dev; + if (!debugfs_create_file("rx_stats", S_IRUSR, dev->dir, + &dev->rx_stats, &ocrdma_dbg_ops)) + goto err; + + dev->wqe_stats.type = OCRDMA_WQESTATS; + dev->wqe_stats.dev = dev; + if (!debugfs_create_file("wqe_stats", S_IRUSR, dev->dir, + &dev->wqe_stats, &ocrdma_dbg_ops)) + goto err; + + dev->tx_stats.type = OCRDMA_TXSTATS; + dev->tx_stats.dev = dev; + if (!debugfs_create_file("tx_stats", S_IRUSR, dev->dir, + &dev->tx_stats, &ocrdma_dbg_ops)) + goto err; + + dev->db_err_stats.type = OCRDMA_DB_ERRSTATS; + dev->db_err_stats.dev = dev; + if (!debugfs_create_file("db_err_stats", S_IRUSR, dev->dir, + &dev->db_err_stats, &ocrdma_dbg_ops)) + goto err; + + + dev->tx_qp_err_stats.type = OCRDMA_TXQP_ERRSTATS; + dev->tx_qp_err_stats.dev = dev; + if (!debugfs_create_file("tx_qp_err_stats", S_IRUSR, dev->dir, + &dev->tx_qp_err_stats, &ocrdma_dbg_ops)) + goto err; + + dev->rx_qp_err_stats.type = OCRDMA_RXQP_ERRSTATS; + dev->rx_qp_err_stats.dev = dev; + if (!debugfs_create_file("rx_qp_err_stats", S_IRUSR, dev->dir, + &dev->rx_qp_err_stats, &ocrdma_dbg_ops)) + goto err; + + + dev->tx_dbg_stats.type = OCRDMA_TX_DBG_STATS; + dev->tx_dbg_stats.dev = dev; + if (!debugfs_create_file("tx_dbg_stats", S_IRUSR, dev->dir, + &dev->tx_dbg_stats, &ocrdma_dbg_ops)) + goto err; + + dev->rx_dbg_stats.type = OCRDMA_RX_DBG_STATS; + dev->rx_dbg_stats.dev = dev; + if (!debugfs_create_file("rx_dbg_stats", S_IRUSR, dev->dir, + &dev->rx_dbg_stats, &ocrdma_dbg_ops)) + goto err; + + /* Now create dma_mem for stats mbx command */ + if (!ocrdma_alloc_stats_mem(dev)) + goto err; + + mutex_init(&dev->stats_lock); + + return; +err: + ocrdma_release_stats_mem(dev); + debugfs_remove_recursive(dev->dir); + dev->dir = NULL; +} + +void ocrdma_rem_port_stats(struct ocrdma_dev *dev) +{ + if (!dev->dir) + return; + mutex_destroy(&dev->stats_lock); + ocrdma_release_stats_mem(dev); + debugfs_remove(dev->dir); +} + +void ocrdma_init_debugfs(void) +{ + /* Create base dir in debugfs root dir */ + ocrdma_dbgfs_dir = debugfs_create_dir("ocrdma", NULL); +} + +void ocrdma_rem_debugfs(void) +{ + debugfs_remove_recursive(ocrdma_dbgfs_dir); +} diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_stats.h b/drivers/infiniband/hw/ocrdma/ocrdma_stats.h new file mode 100644 index 0000000..5f5e20c --- /dev/null +++ b/drivers/infiniband/hw/ocrdma/ocrdma_stats.h @@ -0,0 +1,54 @@ +/******************************************************************* + * This file is part of the Emulex RoCE Device Driver for * + * RoCE (RDMA over Converged Ethernet) adapters. * + * Copyright (C) 2008-2014 Emulex. All rights reserved. * + * EMULEX and SLI are trademarks of Emulex. * + * www.emulex.com * + * * + * This program is free software; you can redistribute it and/or * + * modify it under the terms of version 2 of the GNU General * + * Public License as published by the Free Software Foundation. * + * This program is distributed in the hope that it will be useful. * + * ALL EXPRESS OR IMPLIED CONDITIONS, REPRESENTATIONS AND * + * WARRANTIES, INCLUDING ANY IMPLIED WARRANTY OF MERCHANTABILITY, * + * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT, ARE * + * DISCLAIMED, EXCEPT TO THE EXTENT THAT SUCH DISCLAIMERS ARE HELD * + * TO BE LEGALLY INVALID. See the GNU General Public License for * + * more details, a copy of which can be found in the file COPYING * + * included with this package. * + * + * Contact Information: + * linux-drivers@emulex.com + * + * Emulex + * 3333 Susan Street + * Costa Mesa, CA 92626 + *******************************************************************/ + +#ifndef __OCRDMA_STATS_H__ +#define __OCRDMA_STATS_H__ + +#include +#include "ocrdma.h" +#include "ocrdma_hw.h" + +#define OCRDMA_MAX_DBGFS_MEM 4096 + +enum OCRDMA_STATS_TYPE { + OCRDMA_RSRC_STATS, + OCRDMA_RXSTATS, + OCRDMA_WQESTATS, + OCRDMA_TXSTATS, + OCRDMA_DB_ERRSTATS, + OCRDMA_RXQP_ERRSTATS, + OCRDMA_TXQP_ERRSTATS, + OCRDMA_TX_DBG_STATS, + OCRDMA_RX_DBG_STATS +}; + +void ocrdma_rem_debugfs(void); +void ocrdma_init_debugfs(void); +void ocrdma_rem_port_stats(struct ocrdma_dev *dev); +void ocrdma_add_port_stats(struct ocrdma_dev *dev); + +#endif /* __OCRDMA_STATS_H__ */ diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c new file mode 100644 index 0000000..4c68305 --- /dev/null +++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c @@ -0,0 +1,3080 @@ +/******************************************************************* + * This file is part of the Emulex RoCE Device Driver for * + * RoCE (RDMA over Converged Ethernet) adapters. * + * Copyright (C) 2008-2012 Emulex. All rights reserved. * + * EMULEX and SLI are trademarks of Emulex. * + * www.emulex.com * + * * + * This program is free software; you can redistribute it and/or * + * modify it under the terms of version 2 of the GNU General * + * Public License as published by the Free Software Foundation. * + * This program is distributed in the hope that it will be useful. * + * ALL EXPRESS OR IMPLIED CONDITIONS, REPRESENTATIONS AND * + * WARRANTIES, INCLUDING ANY IMPLIED WARRANTY OF MERCHANTABILITY, * + * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT, ARE * + * DISCLAIMED, EXCEPT TO THE EXTENT THAT SUCH DISCLAIMERS ARE HELD * + * TO BE LEGALLY INVALID. See the GNU General Public License for * + * more details, a copy of which can be found in the file COPYING * + * included with this package. * + * + * Contact Information: + * linux-drivers@emulex.com + * + * Emulex + * 3333 Susan Street + * Costa Mesa, CA 92626 + *******************************************************************/ + +#include +#include +#include +#include +#include +#include + +#include "ocrdma.h" +#include "ocrdma_hw.h" +#include "ocrdma_verbs.h" +#include "ocrdma_abi.h" + +int ocrdma_query_pkey(struct ib_device *ibdev, u8 port, u16 index, u16 *pkey) +{ + if (index > 1) + return -EINVAL; + + *pkey = 0xffff; + return 0; +} + +int ocrdma_query_gid(struct ib_device *ibdev, u8 port, + int index, union ib_gid *sgid) +{ + struct ocrdma_dev *dev; + + dev = get_ocrdma_dev(ibdev); + memset(sgid, 0, sizeof(*sgid)); + if (index > OCRDMA_MAX_SGID) + return -EINVAL; + + memcpy(sgid, &dev->sgid_tbl[index], sizeof(*sgid)); + + return 0; +} + +int ocrdma_query_device(struct ib_device *ibdev, struct ib_device_attr *attr) +{ + struct ocrdma_dev *dev = get_ocrdma_dev(ibdev); + + memset(attr, 0, sizeof *attr); + memcpy(&attr->fw_ver, &dev->attr.fw_ver[0], + min(sizeof(dev->attr.fw_ver), sizeof(attr->fw_ver))); + ocrdma_get_guid(dev, (u8 *)&attr->sys_image_guid); + attr->max_mr_size = dev->attr.max_mr_size; + attr->page_size_cap = 0xffff000; + attr->vendor_id = dev->nic_info.pdev->vendor; + attr->vendor_part_id = dev->nic_info.pdev->device; + attr->hw_ver = dev->asic_id; + attr->max_qp = dev->attr.max_qp; + attr->max_ah = OCRDMA_MAX_AH; + attr->max_qp_wr = dev->attr.max_wqe; + + attr->device_cap_flags = IB_DEVICE_CURR_QP_STATE_MOD | + IB_DEVICE_RC_RNR_NAK_GEN | + IB_DEVICE_SHUTDOWN_PORT | + IB_DEVICE_SYS_IMAGE_GUID | + IB_DEVICE_LOCAL_DMA_LKEY | + IB_DEVICE_MEM_MGT_EXTENSIONS; + attr->max_sge = min(dev->attr.max_send_sge, dev->attr.max_srq_sge); + attr->max_sge_rd = 0; + attr->max_cq = dev->attr.max_cq; + attr->max_cqe = dev->attr.max_cqe; + attr->max_mr = dev->attr.max_mr; + attr->max_mw = dev->attr.max_mw; + attr->max_pd = dev->attr.max_pd; + attr->atomic_cap = 0; + attr->max_fmr = 0; + attr->max_map_per_fmr = 0; + attr->max_qp_rd_atom = + min(dev->attr.max_ord_per_qp, dev->attr.max_ird_per_qp); + attr->max_qp_init_rd_atom = dev->attr.max_ord_per_qp; + attr->max_srq = dev->attr.max_srq; + attr->max_srq_sge = dev->attr.max_srq_sge; + attr->max_srq_wr = dev->attr.max_rqe; + attr->local_ca_ack_delay = dev->attr.local_ca_ack_delay; + attr->max_fast_reg_page_list_len = dev->attr.max_pages_per_frmr; + attr->max_pkeys = 1; + return 0; +} + +static inline void get_link_speed_and_width(struct ocrdma_dev *dev, + u8 *ib_speed, u8 *ib_width) +{ + int status; + u8 speed; + + status = ocrdma_mbx_get_link_speed(dev, &speed); + if (status) + speed = OCRDMA_PHYS_LINK_SPEED_ZERO; + + switch (speed) { + case OCRDMA_PHYS_LINK_SPEED_1GBPS: + *ib_speed = IB_SPEED_SDR; + *ib_width = IB_WIDTH_1X; + break; + + case OCRDMA_PHYS_LINK_SPEED_10GBPS: + *ib_speed = IB_SPEED_QDR; + *ib_width = IB_WIDTH_1X; + break; + + case OCRDMA_PHYS_LINK_SPEED_20GBPS: + *ib_speed = IB_SPEED_DDR; + *ib_width = IB_WIDTH_4X; + break; + + case OCRDMA_PHYS_LINK_SPEED_40GBPS: + *ib_speed = IB_SPEED_QDR; + *ib_width = IB_WIDTH_4X; + break; + + default: + /* Unsupported */ + *ib_speed = IB_SPEED_SDR; + *ib_width = IB_WIDTH_1X; + } +} + +int ocrdma_query_port(struct ib_device *ibdev, + u8 port, struct ib_port_attr *props) +{ + enum ib_port_state port_state; + struct ocrdma_dev *dev; + struct net_device *netdev; + + dev = get_ocrdma_dev(ibdev); + if (port > 1) { + pr_err("%s(%d) invalid_port=0x%x\n", __func__, + dev->id, port); + return -EINVAL; + } + netdev = dev->nic_info.netdev; + if (netif_running(netdev) && netif_oper_up(netdev)) { + port_state = IB_PORT_ACTIVE; + props->phys_state = 5; + } else { + port_state = IB_PORT_DOWN; + props->phys_state = 3; + } + props->max_mtu = IB_MTU_4096; + props->active_mtu = iboe_get_mtu(netdev->mtu); + props->lid = 0; + props->lmc = 0; + props->sm_lid = 0; + props->sm_sl = 0; + props->state = port_state; + props->port_cap_flags = + IB_PORT_CM_SUP | + IB_PORT_REINIT_SUP | + IB_PORT_DEVICE_MGMT_SUP | IB_PORT_VENDOR_CLASS_SUP | IB_PORT_IP_BASED_GIDS; + props->gid_tbl_len = OCRDMA_MAX_SGID; + props->pkey_tbl_len = 1; + props->bad_pkey_cntr = 0; + props->qkey_viol_cntr = 0; + get_link_speed_and_width(dev, &props->active_speed, + &props->active_width); + props->max_msg_sz = 0x80000000; + props->max_vl_num = 4; + return 0; +} + +int ocrdma_modify_port(struct ib_device *ibdev, u8 port, int mask, + struct ib_port_modify *props) +{ + struct ocrdma_dev *dev; + + dev = get_ocrdma_dev(ibdev); + if (port > 1) { + pr_err("%s(%d) invalid_port=0x%x\n", __func__, dev->id, port); + return -EINVAL; + } + return 0; +} + +static int ocrdma_add_mmap(struct ocrdma_ucontext *uctx, u64 phy_addr, + unsigned long len) +{ + struct ocrdma_mm *mm; + + mm = kzalloc(sizeof(*mm), GFP_KERNEL); + if (mm == NULL) + return -ENOMEM; + mm->key.phy_addr = phy_addr; + mm->key.len = len; + INIT_LIST_HEAD(&mm->entry); + + mutex_lock(&uctx->mm_list_lock); + list_add_tail(&mm->entry, &uctx->mm_head); + mutex_unlock(&uctx->mm_list_lock); + return 0; +} + +static void ocrdma_del_mmap(struct ocrdma_ucontext *uctx, u64 phy_addr, + unsigned long len) +{ + struct ocrdma_mm *mm, *tmp; + + mutex_lock(&uctx->mm_list_lock); + list_for_each_entry_safe(mm, tmp, &uctx->mm_head, entry) { + if (len != mm->key.len && phy_addr != mm->key.phy_addr) + continue; + + list_del(&mm->entry); + kfree(mm); + break; + } + mutex_unlock(&uctx->mm_list_lock); +} + +static bool ocrdma_search_mmap(struct ocrdma_ucontext *uctx, u64 phy_addr, + unsigned long len) +{ + bool found = false; + struct ocrdma_mm *mm; + + mutex_lock(&uctx->mm_list_lock); + list_for_each_entry(mm, &uctx->mm_head, entry) { + if (len != mm->key.len && phy_addr != mm->key.phy_addr) + continue; + + found = true; + break; + } + mutex_unlock(&uctx->mm_list_lock); + return found; +} + +static struct ocrdma_pd *_ocrdma_alloc_pd(struct ocrdma_dev *dev, + struct ocrdma_ucontext *uctx, + struct ib_udata *udata) +{ + struct ocrdma_pd *pd = NULL; + int status = 0; + + pd = kzalloc(sizeof(*pd), GFP_KERNEL); + if (!pd) + return ERR_PTR(-ENOMEM); + + if (udata && uctx) { + pd->dpp_enabled = + ocrdma_get_asic_type(dev) == OCRDMA_ASIC_GEN_SKH_R; + pd->num_dpp_qp = + pd->dpp_enabled ? (dev->nic_info.db_page_size / + dev->attr.wqe_size) : 0; + } + +retry: + status = ocrdma_mbx_alloc_pd(dev, pd); + if (status) { + if (pd->dpp_enabled) { + pd->dpp_enabled = false; + pd->num_dpp_qp = 0; + goto retry; + } else { + kfree(pd); + return ERR_PTR(status); + } + } + + return pd; +} + +static inline int is_ucontext_pd(struct ocrdma_ucontext *uctx, + struct ocrdma_pd *pd) +{ + return (uctx->cntxt_pd == pd ? true : false); +} + +static int _ocrdma_dealloc_pd(struct ocrdma_dev *dev, + struct ocrdma_pd *pd) +{ + int status = 0; + + status = ocrdma_mbx_dealloc_pd(dev, pd); + kfree(pd); + return status; +} + +static int ocrdma_alloc_ucontext_pd(struct ocrdma_dev *dev, + struct ocrdma_ucontext *uctx, + struct ib_udata *udata) +{ + int status = 0; + + uctx->cntxt_pd = _ocrdma_alloc_pd(dev, uctx, udata); + if (IS_ERR(uctx->cntxt_pd)) { + status = PTR_ERR(uctx->cntxt_pd); + uctx->cntxt_pd = NULL; + goto err; + } + + uctx->cntxt_pd->uctx = uctx; + uctx->cntxt_pd->ibpd.device = &dev->ibdev; +err: + return status; +} + +static int ocrdma_dealloc_ucontext_pd(struct ocrdma_ucontext *uctx) +{ + int status = 0; + struct ocrdma_pd *pd = uctx->cntxt_pd; + struct ocrdma_dev *dev = get_ocrdma_dev(pd->ibpd.device); + + if (uctx->pd_in_use) { + pr_err("%s(%d) Freeing in use pdid=0x%x.\n", + __func__, dev->id, pd->id); + } + uctx->cntxt_pd = NULL; + status = _ocrdma_dealloc_pd(dev, pd); + return status; +} + +static struct ocrdma_pd *ocrdma_get_ucontext_pd(struct ocrdma_ucontext *uctx) +{ + struct ocrdma_pd *pd = NULL; + + mutex_lock(&uctx->mm_list_lock); + if (!uctx->pd_in_use) { + uctx->pd_in_use = true; + pd = uctx->cntxt_pd; + } + mutex_unlock(&uctx->mm_list_lock); + + return pd; +} + +static void ocrdma_release_ucontext_pd(struct ocrdma_ucontext *uctx) +{ + mutex_lock(&uctx->mm_list_lock); + uctx->pd_in_use = false; + mutex_unlock(&uctx->mm_list_lock); +} + +struct ib_ucontext *ocrdma_alloc_ucontext(struct ib_device *ibdev, + struct ib_udata *udata) +{ + int status; + struct ocrdma_ucontext *ctx; + struct ocrdma_alloc_ucontext_resp resp; + struct ocrdma_dev *dev = get_ocrdma_dev(ibdev); + struct pci_dev *pdev = dev->nic_info.pdev; + u32 map_len = roundup(sizeof(u32) * 2048, PAGE_SIZE); + + if (!udata) + return ERR_PTR(-EFAULT); + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return ERR_PTR(-ENOMEM); + INIT_LIST_HEAD(&ctx->mm_head); + mutex_init(&ctx->mm_list_lock); + + ctx->ah_tbl.va = dma_alloc_coherent(&pdev->dev, map_len, + &ctx->ah_tbl.pa, GFP_KERNEL); + if (!ctx->ah_tbl.va) { + kfree(ctx); + return ERR_PTR(-ENOMEM); + } + memset(ctx->ah_tbl.va, 0, map_len); + ctx->ah_tbl.len = map_len; + + memset(&resp, 0, sizeof(resp)); + resp.ah_tbl_len = ctx->ah_tbl.len; + resp.ah_tbl_page = virt_to_phys(ctx->ah_tbl.va); + + status = ocrdma_add_mmap(ctx, resp.ah_tbl_page, resp.ah_tbl_len); + if (status) + goto map_err; + + status = ocrdma_alloc_ucontext_pd(dev, ctx, udata); + if (status) + goto pd_err; + + resp.dev_id = dev->id; + resp.max_inline_data = dev->attr.max_inline_data; + resp.wqe_size = dev->attr.wqe_size; + resp.rqe_size = dev->attr.rqe_size; + resp.dpp_wqe_size = dev->attr.wqe_size; + + memcpy(resp.fw_ver, dev->attr.fw_ver, sizeof(resp.fw_ver)); + status = ib_copy_to_udata(udata, &resp, sizeof(resp)); + if (status) + goto cpy_err; + return &ctx->ibucontext; + +cpy_err: +pd_err: + ocrdma_del_mmap(ctx, ctx->ah_tbl.pa, ctx->ah_tbl.len); +map_err: + dma_free_coherent(&pdev->dev, ctx->ah_tbl.len, ctx->ah_tbl.va, + ctx->ah_tbl.pa); + kfree(ctx); + return ERR_PTR(status); +} + +int ocrdma_dealloc_ucontext(struct ib_ucontext *ibctx) +{ + int status = 0; + struct ocrdma_mm *mm, *tmp; + struct ocrdma_ucontext *uctx = get_ocrdma_ucontext(ibctx); + struct ocrdma_dev *dev = get_ocrdma_dev(ibctx->device); + struct pci_dev *pdev = dev->nic_info.pdev; + + status = ocrdma_dealloc_ucontext_pd(uctx); + + ocrdma_del_mmap(uctx, uctx->ah_tbl.pa, uctx->ah_tbl.len); + dma_free_coherent(&pdev->dev, uctx->ah_tbl.len, uctx->ah_tbl.va, + uctx->ah_tbl.pa); + + list_for_each_entry_safe(mm, tmp, &uctx->mm_head, entry) { + list_del(&mm->entry); + kfree(mm); + } + kfree(uctx); + return status; +} + +int ocrdma_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) +{ + struct ocrdma_ucontext *ucontext = get_ocrdma_ucontext(context); + struct ocrdma_dev *dev = get_ocrdma_dev(context->device); + unsigned long vm_page = vma->vm_pgoff << PAGE_SHIFT; + u64 unmapped_db = (u64) dev->nic_info.unmapped_db; + unsigned long len = (vma->vm_end - vma->vm_start); + int status = 0; + bool found; + + if (vma->vm_start & (PAGE_SIZE - 1)) + return -EINVAL; + found = ocrdma_search_mmap(ucontext, vma->vm_pgoff << PAGE_SHIFT, len); + if (!found) + return -EINVAL; + + if ((vm_page >= unmapped_db) && (vm_page <= (unmapped_db + + dev->nic_info.db_total_size)) && + (len <= dev->nic_info.db_page_size)) { + if (vma->vm_flags & VM_READ) + return -EPERM; + + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + status = io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, + len, vma->vm_page_prot); + } else if (dev->nic_info.dpp_unmapped_len && + (vm_page >= (u64) dev->nic_info.dpp_unmapped_addr) && + (vm_page <= (u64) (dev->nic_info.dpp_unmapped_addr + + dev->nic_info.dpp_unmapped_len)) && + (len <= dev->nic_info.dpp_unmapped_len)) { + if (vma->vm_flags & VM_READ) + return -EPERM; + + vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); + status = io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, + len, vma->vm_page_prot); + } else { + status = remap_pfn_range(vma, vma->vm_start, + vma->vm_pgoff, len, vma->vm_page_prot); + } + return status; +} + +static int ocrdma_copy_pd_uresp(struct ocrdma_dev *dev, struct ocrdma_pd *pd, + struct ib_ucontext *ib_ctx, + struct ib_udata *udata) +{ + int status; + u64 db_page_addr; + u64 dpp_page_addr = 0; + u32 db_page_size; + struct ocrdma_alloc_pd_uresp rsp; + struct ocrdma_ucontext *uctx = get_ocrdma_ucontext(ib_ctx); + + memset(&rsp, 0, sizeof(rsp)); + rsp.id = pd->id; + rsp.dpp_enabled = pd->dpp_enabled; + db_page_addr = ocrdma_get_db_addr(dev, pd->id); + db_page_size = dev->nic_info.db_page_size; + + status = ocrdma_add_mmap(uctx, db_page_addr, db_page_size); + if (status) + return status; + + if (pd->dpp_enabled) { + dpp_page_addr = dev->nic_info.dpp_unmapped_addr + + (pd->id * PAGE_SIZE); + status = ocrdma_add_mmap(uctx, dpp_page_addr, + PAGE_SIZE); + if (status) + goto dpp_map_err; + rsp.dpp_page_addr_hi = upper_32_bits(dpp_page_addr); + rsp.dpp_page_addr_lo = dpp_page_addr; + } + + status = ib_copy_to_udata(udata, &rsp, sizeof(rsp)); + if (status) + goto ucopy_err; + + pd->uctx = uctx; + return 0; + +ucopy_err: + if (pd->dpp_enabled) + ocrdma_del_mmap(pd->uctx, dpp_page_addr, PAGE_SIZE); +dpp_map_err: + ocrdma_del_mmap(pd->uctx, db_page_addr, db_page_size); + return status; +} + +struct ib_pd *ocrdma_alloc_pd(struct ib_device *ibdev, + struct ib_ucontext *context, + struct ib_udata *udata) +{ + struct ocrdma_dev *dev = get_ocrdma_dev(ibdev); + struct ocrdma_pd *pd; + struct ocrdma_ucontext *uctx = NULL; + int status; + u8 is_uctx_pd = false; + + if (udata && context) { + uctx = get_ocrdma_ucontext(context); + pd = ocrdma_get_ucontext_pd(uctx); + if (pd) { + is_uctx_pd = true; + goto pd_mapping; + } + } + + pd = _ocrdma_alloc_pd(dev, uctx, udata); + if (IS_ERR(pd)) { + status = PTR_ERR(pd); + goto exit; + } + +pd_mapping: + if (udata && context) { + status = ocrdma_copy_pd_uresp(dev, pd, context, udata); + if (status) + goto err; + } + return &pd->ibpd; + +err: + if (is_uctx_pd) { + ocrdma_release_ucontext_pd(uctx); + } else { + status = ocrdma_mbx_dealloc_pd(dev, pd); + kfree(pd); + } +exit: + return ERR_PTR(status); +} + +int ocrdma_dealloc_pd(struct ib_pd *ibpd) +{ + struct ocrdma_pd *pd = get_ocrdma_pd(ibpd); + struct ocrdma_dev *dev = get_ocrdma_dev(ibpd->device); + struct ocrdma_ucontext *uctx = NULL; + int status = 0; + u64 usr_db; + + uctx = pd->uctx; + if (uctx) { + u64 dpp_db = dev->nic_info.dpp_unmapped_addr + + (pd->id * PAGE_SIZE); + if (pd->dpp_enabled) + ocrdma_del_mmap(pd->uctx, dpp_db, PAGE_SIZE); + usr_db = ocrdma_get_db_addr(dev, pd->id); + ocrdma_del_mmap(pd->uctx, usr_db, dev->nic_info.db_page_size); + + if (is_ucontext_pd(uctx, pd)) { + ocrdma_release_ucontext_pd(uctx); + return status; + } + } + status = _ocrdma_dealloc_pd(dev, pd); + return status; +} + +static int ocrdma_alloc_lkey(struct ocrdma_dev *dev, struct ocrdma_mr *mr, + u32 pdid, int acc, u32 num_pbls, u32 addr_check) +{ + int status; + + mr->hwmr.fr_mr = 0; + mr->hwmr.local_rd = 1; + mr->hwmr.remote_rd = (acc & IB_ACCESS_REMOTE_READ) ? 1 : 0; + mr->hwmr.remote_wr = (acc & IB_ACCESS_REMOTE_WRITE) ? 1 : 0; + mr->hwmr.local_wr = (acc & IB_ACCESS_LOCAL_WRITE) ? 1 : 0; + mr->hwmr.mw_bind = (acc & IB_ACCESS_MW_BIND) ? 1 : 0; + mr->hwmr.remote_atomic = (acc & IB_ACCESS_REMOTE_ATOMIC) ? 1 : 0; + mr->hwmr.num_pbls = num_pbls; + + status = ocrdma_mbx_alloc_lkey(dev, &mr->hwmr, pdid, addr_check); + if (status) + return status; + + mr->ibmr.lkey = mr->hwmr.lkey; + if (mr->hwmr.remote_wr || mr->hwmr.remote_rd) + mr->ibmr.rkey = mr->hwmr.lkey; + return 0; +} + +struct ib_mr *ocrdma_get_dma_mr(struct ib_pd *ibpd, int acc) +{ + int status; + struct ocrdma_mr *mr; + struct ocrdma_pd *pd = get_ocrdma_pd(ibpd); + struct ocrdma_dev *dev = get_ocrdma_dev(ibpd->device); + + if (acc & IB_ACCESS_REMOTE_WRITE && !(acc & IB_ACCESS_LOCAL_WRITE)) { + pr_err("%s err, invalid access rights\n", __func__); + return ERR_PTR(-EINVAL); + } + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + status = ocrdma_alloc_lkey(dev, mr, pd->id, acc, 0, + OCRDMA_ADDR_CHECK_DISABLE); + if (status) { + kfree(mr); + return ERR_PTR(status); + } + + return &mr->ibmr; +} + +static void ocrdma_free_mr_pbl_tbl(struct ocrdma_dev *dev, + struct ocrdma_hw_mr *mr) +{ + struct pci_dev *pdev = dev->nic_info.pdev; + int i = 0; + + if (mr->pbl_table) { + for (i = 0; i < mr->num_pbls; i++) { + if (!mr->pbl_table[i].va) + continue; + dma_free_coherent(&pdev->dev, mr->pbl_size, + mr->pbl_table[i].va, + mr->pbl_table[i].pa); + } + kfree(mr->pbl_table); + mr->pbl_table = NULL; + } +} + +static int ocrdma_get_pbl_info(struct ocrdma_dev *dev, struct ocrdma_mr *mr, + u32 num_pbes) +{ + u32 num_pbls = 0; + u32 idx = 0; + int status = 0; + u32 pbl_size; + + do { + pbl_size = OCRDMA_MIN_HPAGE_SIZE * (1 << idx); + if (pbl_size > MAX_OCRDMA_PBL_SIZE) { + status = -EFAULT; + break; + } + num_pbls = roundup(num_pbes, (pbl_size / sizeof(u64))); + num_pbls = num_pbls / (pbl_size / sizeof(u64)); + idx++; + } while (num_pbls >= dev->attr.max_num_mr_pbl); + + mr->hwmr.num_pbes = num_pbes; + mr->hwmr.num_pbls = num_pbls; + mr->hwmr.pbl_size = pbl_size; + return status; +} + +static int ocrdma_build_pbl_tbl(struct ocrdma_dev *dev, struct ocrdma_hw_mr *mr) +{ + int status = 0; + int i; + u32 dma_len = mr->pbl_size; + struct pci_dev *pdev = dev->nic_info.pdev; + void *va; + dma_addr_t pa; + + mr->pbl_table = kzalloc(sizeof(struct ocrdma_pbl) * + mr->num_pbls, GFP_KERNEL); + + if (!mr->pbl_table) + return -ENOMEM; + + for (i = 0; i < mr->num_pbls; i++) { + va = dma_alloc_coherent(&pdev->dev, dma_len, &pa, GFP_KERNEL); + if (!va) { + ocrdma_free_mr_pbl_tbl(dev, mr); + status = -ENOMEM; + break; + } + memset(va, 0, dma_len); + mr->pbl_table[i].va = va; + mr->pbl_table[i].pa = pa; + } + return status; +} + +static void build_user_pbes(struct ocrdma_dev *dev, struct ocrdma_mr *mr, + u32 num_pbes) +{ + struct ocrdma_pbe *pbe; + struct scatterlist *sg; + struct ocrdma_pbl *pbl_tbl = mr->hwmr.pbl_table; + struct ib_umem *umem = mr->umem; + int shift, pg_cnt, pages, pbe_cnt, entry, total_num_pbes = 0; + + if (!mr->hwmr.num_pbes) + return; + + pbe = (struct ocrdma_pbe *)pbl_tbl->va; + pbe_cnt = 0; + + shift = ilog2(umem->page_size); + + for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) { + pages = sg_dma_len(sg) >> shift; + for (pg_cnt = 0; pg_cnt < pages; pg_cnt++) { + /* store the page address in pbe */ + pbe->pa_lo = + cpu_to_le32(sg_dma_address + (sg) + + (umem->page_size * pg_cnt)); + pbe->pa_hi = + cpu_to_le32(upper_32_bits + ((sg_dma_address + (sg) + + umem->page_size * pg_cnt))); + pbe_cnt += 1; + total_num_pbes += 1; + pbe++; + + /* if done building pbes, issue the mbx cmd. */ + if (total_num_pbes == num_pbes) + return; + + /* if the given pbl is full storing the pbes, + * move to next pbl. + */ + if (pbe_cnt == + (mr->hwmr.pbl_size / sizeof(u64))) { + pbl_tbl++; + pbe = (struct ocrdma_pbe *)pbl_tbl->va; + pbe_cnt = 0; + } + + } + } +} + +struct ib_mr *ocrdma_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 len, + u64 usr_addr, int acc, struct ib_udata *udata) +{ + int status = -ENOMEM; + struct ocrdma_dev *dev = get_ocrdma_dev(ibpd->device); + struct ocrdma_mr *mr; + struct ocrdma_pd *pd; + u32 num_pbes; + + pd = get_ocrdma_pd(ibpd); + + if (acc & IB_ACCESS_REMOTE_WRITE && !(acc & IB_ACCESS_LOCAL_WRITE)) + return ERR_PTR(-EINVAL); + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(status); + mr->umem = ib_umem_get(ibpd->uobject->context, start, len, acc, 0); + if (IS_ERR(mr->umem)) { + status = -EFAULT; + goto umem_err; + } + num_pbes = ib_umem_page_count(mr->umem); + status = ocrdma_get_pbl_info(dev, mr, num_pbes); + if (status) + goto umem_err; + + mr->hwmr.pbe_size = mr->umem->page_size; + mr->hwmr.fbo = mr->umem->offset; + mr->hwmr.va = usr_addr; + mr->hwmr.len = len; + mr->hwmr.remote_wr = (acc & IB_ACCESS_REMOTE_WRITE) ? 1 : 0; + mr->hwmr.remote_rd = (acc & IB_ACCESS_REMOTE_READ) ? 1 : 0; + mr->hwmr.local_wr = (acc & IB_ACCESS_LOCAL_WRITE) ? 1 : 0; + mr->hwmr.local_rd = 1; + mr->hwmr.remote_atomic = (acc & IB_ACCESS_REMOTE_ATOMIC) ? 1 : 0; + status = ocrdma_build_pbl_tbl(dev, &mr->hwmr); + if (status) + goto umem_err; + build_user_pbes(dev, mr, num_pbes); + status = ocrdma_reg_mr(dev, &mr->hwmr, pd->id, acc); + if (status) + goto mbx_err; + mr->ibmr.lkey = mr->hwmr.lkey; + if (mr->hwmr.remote_wr || mr->hwmr.remote_rd) + mr->ibmr.rkey = mr->hwmr.lkey; + + return &mr->ibmr; + +mbx_err: + ocrdma_free_mr_pbl_tbl(dev, &mr->hwmr); +umem_err: + kfree(mr); + return ERR_PTR(status); +} + +int ocrdma_dereg_mr(struct ib_mr *ib_mr) +{ + struct ocrdma_mr *mr = get_ocrdma_mr(ib_mr); + struct ocrdma_dev *dev = get_ocrdma_dev(ib_mr->device); + int status; + + status = ocrdma_mbx_dealloc_lkey(dev, mr->hwmr.fr_mr, mr->hwmr.lkey); + + ocrdma_free_mr_pbl_tbl(dev, &mr->hwmr); + + /* it could be user registered memory. */ + if (mr->umem) + ib_umem_release(mr->umem); + kfree(mr); + + /* Don't stop cleanup, in case FW is unresponsive */ + if (dev->mqe_ctx.fw_error_state) { + status = 0; + pr_err("%s(%d) fw not responding.\n", + __func__, dev->id); + } + return status; +} + +static int ocrdma_copy_cq_uresp(struct ocrdma_dev *dev, struct ocrdma_cq *cq, + struct ib_udata *udata, + struct ib_ucontext *ib_ctx) +{ + int status; + struct ocrdma_ucontext *uctx = get_ocrdma_ucontext(ib_ctx); + struct ocrdma_create_cq_uresp uresp; + + memset(&uresp, 0, sizeof(uresp)); + uresp.cq_id = cq->id; + uresp.page_size = PAGE_ALIGN(cq->len); + uresp.num_pages = 1; + uresp.max_hw_cqe = cq->max_hw_cqe; + uresp.page_addr[0] = virt_to_phys(cq->va); + uresp.db_page_addr = ocrdma_get_db_addr(dev, uctx->cntxt_pd->id); + uresp.db_page_size = dev->nic_info.db_page_size; + uresp.phase_change = cq->phase_change ? 1 : 0; + status = ib_copy_to_udata(udata, &uresp, sizeof(uresp)); + if (status) { + pr_err("%s(%d) copy error cqid=0x%x.\n", + __func__, dev->id, cq->id); + goto err; + } + status = ocrdma_add_mmap(uctx, uresp.db_page_addr, uresp.db_page_size); + if (status) + goto err; + status = ocrdma_add_mmap(uctx, uresp.page_addr[0], uresp.page_size); + if (status) { + ocrdma_del_mmap(uctx, uresp.db_page_addr, uresp.db_page_size); + goto err; + } + cq->ucontext = uctx; +err: + return status; +} + +struct ib_cq *ocrdma_create_cq(struct ib_device *ibdev, int entries, int vector, + struct ib_ucontext *ib_ctx, + struct ib_udata *udata) +{ + struct ocrdma_cq *cq; + struct ocrdma_dev *dev = get_ocrdma_dev(ibdev); + struct ocrdma_ucontext *uctx = NULL; + u16 pd_id = 0; + int status; + struct ocrdma_create_cq_ureq ureq; + + if (udata) { + if (ib_copy_from_udata(&ureq, udata, sizeof(ureq))) + return ERR_PTR(-EFAULT); + } else + ureq.dpp_cq = 0; + cq = kzalloc(sizeof(*cq), GFP_KERNEL); + if (!cq) + return ERR_PTR(-ENOMEM); + + spin_lock_init(&cq->cq_lock); + spin_lock_init(&cq->comp_handler_lock); + INIT_LIST_HEAD(&cq->sq_head); + INIT_LIST_HEAD(&cq->rq_head); + cq->first_arm = true; + + if (ib_ctx) { + uctx = get_ocrdma_ucontext(ib_ctx); + pd_id = uctx->cntxt_pd->id; + } + + status = ocrdma_mbx_create_cq(dev, cq, entries, ureq.dpp_cq, pd_id); + if (status) { + kfree(cq); + return ERR_PTR(status); + } + if (ib_ctx) { + status = ocrdma_copy_cq_uresp(dev, cq, udata, ib_ctx); + if (status) + goto ctx_err; + } + cq->phase = OCRDMA_CQE_VALID; + dev->cq_tbl[cq->id] = cq; + return &cq->ibcq; + +ctx_err: + ocrdma_mbx_destroy_cq(dev, cq); + kfree(cq); + return ERR_PTR(status); +} + +int ocrdma_resize_cq(struct ib_cq *ibcq, int new_cnt, + struct ib_udata *udata) +{ + int status = 0; + struct ocrdma_cq *cq = get_ocrdma_cq(ibcq); + + if (new_cnt < 1 || new_cnt > cq->max_hw_cqe) { + status = -EINVAL; + return status; + } + ibcq->cqe = new_cnt; + return status; +} + +static void ocrdma_flush_cq(struct ocrdma_cq *cq) +{ + int cqe_cnt; + int valid_count = 0; + unsigned long flags; + + struct ocrdma_dev *dev = get_ocrdma_dev(cq->ibcq.device); + struct ocrdma_cqe *cqe = NULL; + + cqe = cq->va; + cqe_cnt = cq->cqe_cnt; + + /* Last irq might have scheduled a polling thread + * sync-up with it before hard flushing. + */ + spin_lock_irqsave(&cq->cq_lock, flags); + while (cqe_cnt) { + if (is_cqe_valid(cq, cqe)) + valid_count++; + cqe++; + cqe_cnt--; + } + ocrdma_ring_cq_db(dev, cq->id, false, false, valid_count); + spin_unlock_irqrestore(&cq->cq_lock, flags); +} + +int ocrdma_destroy_cq(struct ib_cq *ibcq) +{ + int status; + struct ocrdma_cq *cq = get_ocrdma_cq(ibcq); + struct ocrdma_eq *eq = NULL; + struct ocrdma_dev *dev = get_ocrdma_dev(ibcq->device); + int pdid = 0; + u32 irq, indx; + + dev->cq_tbl[cq->id] = NULL; + indx = ocrdma_get_eq_table_index(dev, cq->eqn); + if (indx == -EINVAL) + BUG(); + + eq = &dev->eq_tbl[indx]; + irq = ocrdma_get_irq(dev, eq); + synchronize_irq(irq); + ocrdma_flush_cq(cq); + + status = ocrdma_mbx_destroy_cq(dev, cq); + if (cq->ucontext) { + pdid = cq->ucontext->cntxt_pd->id; + ocrdma_del_mmap(cq->ucontext, (u64) cq->pa, + PAGE_ALIGN(cq->len)); + ocrdma_del_mmap(cq->ucontext, + ocrdma_get_db_addr(dev, pdid), + dev->nic_info.db_page_size); + } + + kfree(cq); + return status; +} + +static int ocrdma_add_qpn_map(struct ocrdma_dev *dev, struct ocrdma_qp *qp) +{ + int status = -EINVAL; + + if (qp->id < OCRDMA_MAX_QP && dev->qp_tbl[qp->id] == NULL) { + dev->qp_tbl[qp->id] = qp; + status = 0; + } + return status; +} + +static void ocrdma_del_qpn_map(struct ocrdma_dev *dev, struct ocrdma_qp *qp) +{ + dev->qp_tbl[qp->id] = NULL; +} + +static int ocrdma_check_qp_params(struct ib_pd *ibpd, struct ocrdma_dev *dev, + struct ib_qp_init_attr *attrs) +{ + if ((attrs->qp_type != IB_QPT_GSI) && + (attrs->qp_type != IB_QPT_RC) && + (attrs->qp_type != IB_QPT_UC) && + (attrs->qp_type != IB_QPT_UD)) { + pr_err("%s(%d) unsupported qp type=0x%x requested\n", + __func__, dev->id, attrs->qp_type); + return -EINVAL; + } + /* Skip the check for QP1 to support CM size of 128 */ + if ((attrs->qp_type != IB_QPT_GSI) && + (attrs->cap.max_send_wr > dev->attr.max_wqe)) { + pr_err("%s(%d) unsupported send_wr=0x%x requested\n", + __func__, dev->id, attrs->cap.max_send_wr); + pr_err("%s(%d) supported send_wr=0x%x\n", + __func__, dev->id, dev->attr.max_wqe); + return -EINVAL; + } + if (!attrs->srq && (attrs->cap.max_recv_wr > dev->attr.max_rqe)) { + pr_err("%s(%d) unsupported recv_wr=0x%x requested\n", + __func__, dev->id, attrs->cap.max_recv_wr); + pr_err("%s(%d) supported recv_wr=0x%x\n", + __func__, dev->id, dev->attr.max_rqe); + return -EINVAL; + } + if (attrs->cap.max_inline_data > dev->attr.max_inline_data) { + pr_err("%s(%d) unsupported inline data size=0x%x requested\n", + __func__, dev->id, attrs->cap.max_inline_data); + pr_err("%s(%d) supported inline data size=0x%x\n", + __func__, dev->id, dev->attr.max_inline_data); + return -EINVAL; + } + if (attrs->cap.max_send_sge > dev->attr.max_send_sge) { + pr_err("%s(%d) unsupported send_sge=0x%x requested\n", + __func__, dev->id, attrs->cap.max_send_sge); + pr_err("%s(%d) supported send_sge=0x%x\n", + __func__, dev->id, dev->attr.max_send_sge); + return -EINVAL; + } + if (attrs->cap.max_recv_sge > dev->attr.max_recv_sge) { + pr_err("%s(%d) unsupported recv_sge=0x%x requested\n", + __func__, dev->id, attrs->cap.max_recv_sge); + pr_err("%s(%d) supported recv_sge=0x%x\n", + __func__, dev->id, dev->attr.max_recv_sge); + return -EINVAL; + } + /* unprivileged user space cannot create special QP */ + if (ibpd->uobject && attrs->qp_type == IB_QPT_GSI) { + pr_err + ("%s(%d) Userspace can't create special QPs of type=0x%x\n", + __func__, dev->id, attrs->qp_type); + return -EINVAL; + } + /* allow creating only one GSI type of QP */ + if (attrs->qp_type == IB_QPT_GSI && dev->gsi_qp_created) { + pr_err("%s(%d) GSI special QPs already created.\n", + __func__, dev->id); + return -EINVAL; + } + /* verify consumer QPs are not trying to use GSI QP's CQ */ + if ((attrs->qp_type != IB_QPT_GSI) && (dev->gsi_qp_created)) { + if ((dev->gsi_sqcq == get_ocrdma_cq(attrs->send_cq)) || + (dev->gsi_rqcq == get_ocrdma_cq(attrs->recv_cq))) { + pr_err("%s(%d) Consumer QP cannot use GSI CQs.\n", + __func__, dev->id); + return -EINVAL; + } + } + return 0; +} + +static int ocrdma_copy_qp_uresp(struct ocrdma_qp *qp, + struct ib_udata *udata, int dpp_offset, + int dpp_credit_lmt, int srq) +{ + int status = 0; + u64 usr_db; + struct ocrdma_create_qp_uresp uresp; + struct ocrdma_dev *dev = qp->dev; + struct ocrdma_pd *pd = qp->pd; + + memset(&uresp, 0, sizeof(uresp)); + usr_db = dev->nic_info.unmapped_db + + (pd->id * dev->nic_info.db_page_size); + uresp.qp_id = qp->id; + uresp.sq_dbid = qp->sq.dbid; + uresp.num_sq_pages = 1; + uresp.sq_page_size = PAGE_ALIGN(qp->sq.len); + uresp.sq_page_addr[0] = virt_to_phys(qp->sq.va); + uresp.num_wqe_allocated = qp->sq.max_cnt; + if (!srq) { + uresp.rq_dbid = qp->rq.dbid; + uresp.num_rq_pages = 1; + uresp.rq_page_size = PAGE_ALIGN(qp->rq.len); + uresp.rq_page_addr[0] = virt_to_phys(qp->rq.va); + uresp.num_rqe_allocated = qp->rq.max_cnt; + } + uresp.db_page_addr = usr_db; + uresp.db_page_size = dev->nic_info.db_page_size; + uresp.db_sq_offset = OCRDMA_DB_GEN2_SQ_OFFSET; + uresp.db_rq_offset = OCRDMA_DB_GEN2_RQ_OFFSET; + uresp.db_shift = OCRDMA_DB_RQ_SHIFT; + + if (qp->dpp_enabled) { + uresp.dpp_credit = dpp_credit_lmt; + uresp.dpp_offset = dpp_offset; + } + status = ib_copy_to_udata(udata, &uresp, sizeof(uresp)); + if (status) { + pr_err("%s(%d) user copy error.\n", __func__, dev->id); + goto err; + } + status = ocrdma_add_mmap(pd->uctx, uresp.sq_page_addr[0], + uresp.sq_page_size); + if (status) + goto err; + + if (!srq) { + status = ocrdma_add_mmap(pd->uctx, uresp.rq_page_addr[0], + uresp.rq_page_size); + if (status) + goto rq_map_err; + } + return status; +rq_map_err: + ocrdma_del_mmap(pd->uctx, uresp.sq_page_addr[0], uresp.sq_page_size); +err: + return status; +} + +static void ocrdma_set_qp_db(struct ocrdma_dev *dev, struct ocrdma_qp *qp, + struct ocrdma_pd *pd) +{ + if (ocrdma_get_asic_type(dev) == OCRDMA_ASIC_GEN_SKH_R) { + qp->sq_db = dev->nic_info.db + + (pd->id * dev->nic_info.db_page_size) + + OCRDMA_DB_GEN2_SQ_OFFSET; + qp->rq_db = dev->nic_info.db + + (pd->id * dev->nic_info.db_page_size) + + OCRDMA_DB_GEN2_RQ_OFFSET; + } else { + qp->sq_db = dev->nic_info.db + + (pd->id * dev->nic_info.db_page_size) + + OCRDMA_DB_SQ_OFFSET; + qp->rq_db = dev->nic_info.db + + (pd->id * dev->nic_info.db_page_size) + + OCRDMA_DB_RQ_OFFSET; + } +} + +static int ocrdma_alloc_wr_id_tbl(struct ocrdma_qp *qp) +{ + qp->wqe_wr_id_tbl = + kzalloc(sizeof(*(qp->wqe_wr_id_tbl)) * qp->sq.max_cnt, + GFP_KERNEL); + if (qp->wqe_wr_id_tbl == NULL) + return -ENOMEM; + qp->rqe_wr_id_tbl = + kzalloc(sizeof(u64) * qp->rq.max_cnt, GFP_KERNEL); + if (qp->rqe_wr_id_tbl == NULL) + return -ENOMEM; + + return 0; +} + +static void ocrdma_set_qp_init_params(struct ocrdma_qp *qp, + struct ocrdma_pd *pd, + struct ib_qp_init_attr *attrs) +{ + qp->pd = pd; + spin_lock_init(&qp->q_lock); + INIT_LIST_HEAD(&qp->sq_entry); + INIT_LIST_HEAD(&qp->rq_entry); + + qp->qp_type = attrs->qp_type; + qp->cap_flags = OCRDMA_QP_INB_RD | OCRDMA_QP_INB_WR; + qp->max_inline_data = attrs->cap.max_inline_data; + qp->sq.max_sges = attrs->cap.max_send_sge; + qp->rq.max_sges = attrs->cap.max_recv_sge; + qp->state = OCRDMA_QPS_RST; + qp->signaled = (attrs->sq_sig_type == IB_SIGNAL_ALL_WR) ? true : false; +} + +static void ocrdma_store_gsi_qp_cq(struct ocrdma_dev *dev, + struct ib_qp_init_attr *attrs) +{ + if (attrs->qp_type == IB_QPT_GSI) { + dev->gsi_qp_created = 1; + dev->gsi_sqcq = get_ocrdma_cq(attrs->send_cq); + dev->gsi_rqcq = get_ocrdma_cq(attrs->recv_cq); + } +} + +struct ib_qp *ocrdma_create_qp(struct ib_pd *ibpd, + struct ib_qp_init_attr *attrs, + struct ib_udata *udata) +{ + int status; + struct ocrdma_pd *pd = get_ocrdma_pd(ibpd); + struct ocrdma_qp *qp; + struct ocrdma_dev *dev = get_ocrdma_dev(ibpd->device); + struct ocrdma_create_qp_ureq ureq; + u16 dpp_credit_lmt, dpp_offset; + + status = ocrdma_check_qp_params(ibpd, dev, attrs); + if (status) + goto gen_err; + + memset(&ureq, 0, sizeof(ureq)); + if (udata) { + if (ib_copy_from_udata(&ureq, udata, sizeof(ureq))) + return ERR_PTR(-EFAULT); + } + qp = kzalloc(sizeof(*qp), GFP_KERNEL); + if (!qp) { + status = -ENOMEM; + goto gen_err; + } + qp->dev = dev; + ocrdma_set_qp_init_params(qp, pd, attrs); + if (udata == NULL) + qp->cap_flags |= (OCRDMA_QP_MW_BIND | OCRDMA_QP_LKEY0 | + OCRDMA_QP_FAST_REG); + + mutex_lock(&dev->dev_lock); + status = ocrdma_mbx_create_qp(qp, attrs, ureq.enable_dpp_cq, + ureq.dpp_cq_id, + &dpp_offset, &dpp_credit_lmt); + if (status) + goto mbx_err; + + /* user space QP's wr_id table are managed in library */ + if (udata == NULL) { + status = ocrdma_alloc_wr_id_tbl(qp); + if (status) + goto map_err; + } + + status = ocrdma_add_qpn_map(dev, qp); + if (status) + goto map_err; + ocrdma_set_qp_db(dev, qp, pd); + if (udata) { + status = ocrdma_copy_qp_uresp(qp, udata, dpp_offset, + dpp_credit_lmt, + (attrs->srq != NULL)); + if (status) + goto cpy_err; + } + ocrdma_store_gsi_qp_cq(dev, attrs); + qp->ibqp.qp_num = qp->id; + mutex_unlock(&dev->dev_lock); + return &qp->ibqp; + +cpy_err: + ocrdma_del_qpn_map(dev, qp); +map_err: + ocrdma_mbx_destroy_qp(dev, qp); +mbx_err: + mutex_unlock(&dev->dev_lock); + kfree(qp->wqe_wr_id_tbl); + kfree(qp->rqe_wr_id_tbl); + kfree(qp); + pr_err("%s(%d) error=%d\n", __func__, dev->id, status); +gen_err: + return ERR_PTR(status); +} + +int _ocrdma_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask) +{ + int status = 0; + struct ocrdma_qp *qp; + struct ocrdma_dev *dev; + enum ib_qp_state old_qps; + + qp = get_ocrdma_qp(ibqp); + dev = qp->dev; + if (attr_mask & IB_QP_STATE) + status = ocrdma_qp_state_change(qp, attr->qp_state, &old_qps); + /* if new and previous states are same hw doesn't need to + * know about it. + */ + if (status < 0) + return status; + status = ocrdma_mbx_modify_qp(dev, qp, attr, attr_mask); + + return status; +} + +int ocrdma_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata) +{ + unsigned long flags; + int status = -EINVAL; + struct ocrdma_qp *qp; + struct ocrdma_dev *dev; + enum ib_qp_state old_qps, new_qps; + + qp = get_ocrdma_qp(ibqp); + dev = qp->dev; + + /* syncronize with multiple context trying to change, retrive qps */ + mutex_lock(&dev->dev_lock); + /* syncronize with wqe, rqe posting and cqe processing contexts */ + spin_lock_irqsave(&qp->q_lock, flags); + old_qps = get_ibqp_state(qp->state); + if (attr_mask & IB_QP_STATE) + new_qps = attr->qp_state; + else + new_qps = old_qps; + spin_unlock_irqrestore(&qp->q_lock, flags); + + if (!ib_modify_qp_is_ok(old_qps, new_qps, ibqp->qp_type, attr_mask, + IB_LINK_LAYER_ETHERNET)) { + pr_err("%s(%d) invalid attribute mask=0x%x specified for\n" + "qpn=0x%x of type=0x%x old_qps=0x%x, new_qps=0x%x\n", + __func__, dev->id, attr_mask, qp->id, ibqp->qp_type, + old_qps, new_qps); + goto param_err; + } + + status = _ocrdma_modify_qp(ibqp, attr, attr_mask); + if (status > 0) + status = 0; +param_err: + mutex_unlock(&dev->dev_lock); + return status; +} + +static enum ib_mtu ocrdma_mtu_int_to_enum(u16 mtu) +{ + switch (mtu) { + case 256: + return IB_MTU_256; + case 512: + return IB_MTU_512; + case 1024: + return IB_MTU_1024; + case 2048: + return IB_MTU_2048; + case 4096: + return IB_MTU_4096; + default: + return IB_MTU_1024; + } +} + +static int ocrdma_to_ib_qp_acc_flags(int qp_cap_flags) +{ + int ib_qp_acc_flags = 0; + + if (qp_cap_flags & OCRDMA_QP_INB_WR) + ib_qp_acc_flags |= IB_ACCESS_REMOTE_WRITE; + if (qp_cap_flags & OCRDMA_QP_INB_RD) + ib_qp_acc_flags |= IB_ACCESS_LOCAL_WRITE; + return ib_qp_acc_flags; +} + +int ocrdma_query_qp(struct ib_qp *ibqp, + struct ib_qp_attr *qp_attr, + int attr_mask, struct ib_qp_init_attr *qp_init_attr) +{ + int status; + u32 qp_state; + struct ocrdma_qp_params params; + struct ocrdma_qp *qp = get_ocrdma_qp(ibqp); + struct ocrdma_dev *dev = qp->dev; + + memset(¶ms, 0, sizeof(params)); + mutex_lock(&dev->dev_lock); + status = ocrdma_mbx_query_qp(dev, qp, ¶ms); + mutex_unlock(&dev->dev_lock); + if (status) + goto mbx_err; + qp_attr->qp_state = get_ibqp_state(IB_QPS_INIT); + qp_attr->cur_qp_state = get_ibqp_state(IB_QPS_INIT); + qp_attr->path_mtu = + ocrdma_mtu_int_to_enum(params.path_mtu_pkey_indx & + OCRDMA_QP_PARAMS_PATH_MTU_MASK) >> + OCRDMA_QP_PARAMS_PATH_MTU_SHIFT; + qp_attr->path_mig_state = IB_MIG_MIGRATED; + qp_attr->rq_psn = params.hop_lmt_rq_psn & OCRDMA_QP_PARAMS_RQ_PSN_MASK; + qp_attr->sq_psn = params.tclass_sq_psn & OCRDMA_QP_PARAMS_SQ_PSN_MASK; + qp_attr->dest_qp_num = + params.ack_to_rnr_rtc_dest_qpn & OCRDMA_QP_PARAMS_DEST_QPN_MASK; + + qp_attr->qp_access_flags = ocrdma_to_ib_qp_acc_flags(qp->cap_flags); + qp_attr->cap.max_send_wr = qp->sq.max_cnt - 1; + qp_attr->cap.max_recv_wr = qp->rq.max_cnt - 1; + qp_attr->cap.max_send_sge = qp->sq.max_sges; + qp_attr->cap.max_recv_sge = qp->rq.max_sges; + qp_attr->cap.max_inline_data = qp->max_inline_data; + qp_init_attr->cap = qp_attr->cap; + memcpy(&qp_attr->ah_attr.grh.dgid, ¶ms.dgid[0], + sizeof(params.dgid)); + qp_attr->ah_attr.grh.flow_label = params.rnt_rc_sl_fl & + OCRDMA_QP_PARAMS_FLOW_LABEL_MASK; + qp_attr->ah_attr.grh.sgid_index = qp->sgid_idx; + qp_attr->ah_attr.grh.hop_limit = (params.hop_lmt_rq_psn & + OCRDMA_QP_PARAMS_HOP_LMT_MASK) >> + OCRDMA_QP_PARAMS_HOP_LMT_SHIFT; + qp_attr->ah_attr.grh.traffic_class = (params.tclass_sq_psn & + OCRDMA_QP_PARAMS_TCLASS_MASK) >> + OCRDMA_QP_PARAMS_TCLASS_SHIFT; + + qp_attr->ah_attr.ah_flags = IB_AH_GRH; + qp_attr->ah_attr.port_num = 1; + qp_attr->ah_attr.sl = (params.rnt_rc_sl_fl & + OCRDMA_QP_PARAMS_SL_MASK) >> + OCRDMA_QP_PARAMS_SL_SHIFT; + qp_attr->timeout = (params.ack_to_rnr_rtc_dest_qpn & + OCRDMA_QP_PARAMS_ACK_TIMEOUT_MASK) >> + OCRDMA_QP_PARAMS_ACK_TIMEOUT_SHIFT; + qp_attr->rnr_retry = (params.ack_to_rnr_rtc_dest_qpn & + OCRDMA_QP_PARAMS_RNR_RETRY_CNT_MASK) >> + OCRDMA_QP_PARAMS_RNR_RETRY_CNT_SHIFT; + qp_attr->retry_cnt = + (params.rnt_rc_sl_fl & OCRDMA_QP_PARAMS_RETRY_CNT_MASK) >> + OCRDMA_QP_PARAMS_RETRY_CNT_SHIFT; + qp_attr->min_rnr_timer = 0; + qp_attr->pkey_index = 0; + qp_attr->port_num = 1; + qp_attr->ah_attr.src_path_bits = 0; + qp_attr->ah_attr.static_rate = 0; + qp_attr->alt_pkey_index = 0; + qp_attr->alt_port_num = 0; + qp_attr->alt_timeout = 0; + memset(&qp_attr->alt_ah_attr, 0, sizeof(qp_attr->alt_ah_attr)); + qp_state = (params.max_sge_recv_flags & OCRDMA_QP_PARAMS_STATE_MASK) >> + OCRDMA_QP_PARAMS_STATE_SHIFT; + qp_attr->sq_draining = (qp_state == OCRDMA_QPS_SQ_DRAINING) ? 1 : 0; + qp_attr->max_dest_rd_atomic = + params.max_ord_ird >> OCRDMA_QP_PARAMS_MAX_ORD_SHIFT; + qp_attr->max_rd_atomic = + params.max_ord_ird & OCRDMA_QP_PARAMS_MAX_IRD_MASK; + qp_attr->en_sqd_async_notify = (params.max_sge_recv_flags & + OCRDMA_QP_PARAMS_FLAGS_SQD_ASYNC) ? 1 : 0; +mbx_err: + return status; +} + +static void ocrdma_srq_toggle_bit(struct ocrdma_srq *srq, int idx) +{ + int i = idx / 32; + unsigned int mask = (1 << (idx % 32)); + + if (srq->idx_bit_fields[i] & mask) + srq->idx_bit_fields[i] &= ~mask; + else + srq->idx_bit_fields[i] |= mask; +} + +static int ocrdma_hwq_free_cnt(struct ocrdma_qp_hwq_info *q) +{ + return ((q->max_wqe_idx - q->head) + q->tail) % q->max_cnt; +} + +static int is_hw_sq_empty(struct ocrdma_qp *qp) +{ + return (qp->sq.tail == qp->sq.head); +} + +static int is_hw_rq_empty(struct ocrdma_qp *qp) +{ + return (qp->rq.tail == qp->rq.head); +} + +static void *ocrdma_hwq_head(struct ocrdma_qp_hwq_info *q) +{ + return q->va + (q->head * q->entry_size); +} + +static void *ocrdma_hwq_head_from_idx(struct ocrdma_qp_hwq_info *q, + u32 idx) +{ + return q->va + (idx * q->entry_size); +} + +static void ocrdma_hwq_inc_head(struct ocrdma_qp_hwq_info *q) +{ + q->head = (q->head + 1) & q->max_wqe_idx; +} + +static void ocrdma_hwq_inc_tail(struct ocrdma_qp_hwq_info *q) +{ + q->tail = (q->tail + 1) & q->max_wqe_idx; +} + +/* discard the cqe for a given QP */ +static void ocrdma_discard_cqes(struct ocrdma_qp *qp, struct ocrdma_cq *cq) +{ + unsigned long cq_flags; + unsigned long flags; + int discard_cnt = 0; + u32 cur_getp, stop_getp; + struct ocrdma_cqe *cqe; + u32 qpn = 0, wqe_idx = 0; + + spin_lock_irqsave(&cq->cq_lock, cq_flags); + + /* traverse through the CQEs in the hw CQ, + * find the matching CQE for a given qp, + * mark the matching one discarded by clearing qpn. + * ring the doorbell in the poll_cq() as + * we don't complete out of order cqe. + */ + + cur_getp = cq->getp; + /* find upto when do we reap the cq. */ + stop_getp = cur_getp; + do { + if (is_hw_sq_empty(qp) && (!qp->srq && is_hw_rq_empty(qp))) + break; + + cqe = cq->va + cur_getp; + /* if (a) done reaping whole hw cq, or + * (b) qp_xq becomes empty. + * then exit + */ + qpn = cqe->cmn.qpn & OCRDMA_CQE_QPN_MASK; + /* if previously discarded cqe found, skip that too. */ + /* check for matching qp */ + if (qpn == 0 || qpn != qp->id) + goto skip_cqe; + + if (is_cqe_for_sq(cqe)) { + ocrdma_hwq_inc_tail(&qp->sq); + } else { + if (qp->srq) { + wqe_idx = (le32_to_cpu(cqe->rq.buftag_qpn) >> + OCRDMA_CQE_BUFTAG_SHIFT) & + qp->srq->rq.max_wqe_idx; + if (wqe_idx < 1) + BUG(); + spin_lock_irqsave(&qp->srq->q_lock, flags); + ocrdma_hwq_inc_tail(&qp->srq->rq); + ocrdma_srq_toggle_bit(qp->srq, wqe_idx - 1); + spin_unlock_irqrestore(&qp->srq->q_lock, flags); + + } else { + ocrdma_hwq_inc_tail(&qp->rq); + } + } + /* mark cqe discarded so that it is not picked up later + * in the poll_cq(). + */ + discard_cnt += 1; + cqe->cmn.qpn = 0; +skip_cqe: + cur_getp = (cur_getp + 1) % cq->max_hw_cqe; + } while (cur_getp != stop_getp); + spin_unlock_irqrestore(&cq->cq_lock, cq_flags); +} + +void ocrdma_del_flush_qp(struct ocrdma_qp *qp) +{ + int found = false; + unsigned long flags; + struct ocrdma_dev *dev = qp->dev; + /* sync with any active CQ poll */ + + spin_lock_irqsave(&dev->flush_q_lock, flags); + found = ocrdma_is_qp_in_sq_flushlist(qp->sq_cq, qp); + if (found) + list_del(&qp->sq_entry); + if (!qp->srq) { + found = ocrdma_is_qp_in_rq_flushlist(qp->rq_cq, qp); + if (found) + list_del(&qp->rq_entry); + } + spin_unlock_irqrestore(&dev->flush_q_lock, flags); +} + +int ocrdma_destroy_qp(struct ib_qp *ibqp) +{ + int status; + struct ocrdma_pd *pd; + struct ocrdma_qp *qp; + struct ocrdma_dev *dev; + struct ib_qp_attr attrs; + int attr_mask = IB_QP_STATE; + unsigned long flags; + + qp = get_ocrdma_qp(ibqp); + dev = qp->dev; + + attrs.qp_state = IB_QPS_ERR; + pd = qp->pd; + + /* change the QP state to ERROR */ + _ocrdma_modify_qp(ibqp, &attrs, attr_mask); + + /* ensure that CQEs for newly created QP (whose id may be same with + * one which just getting destroyed are same), dont get + * discarded until the old CQEs are discarded. + */ + mutex_lock(&dev->dev_lock); + status = ocrdma_mbx_destroy_qp(dev, qp); + + /* + * acquire CQ lock while destroy is in progress, in order to + * protect against proessing in-flight CQEs for this QP. + */ + spin_lock_irqsave(&qp->sq_cq->cq_lock, flags); + if (qp->rq_cq && (qp->rq_cq != qp->sq_cq)) + spin_lock(&qp->rq_cq->cq_lock); + + ocrdma_del_qpn_map(dev, qp); + + if (qp->rq_cq && (qp->rq_cq != qp->sq_cq)) + spin_unlock(&qp->rq_cq->cq_lock); + spin_unlock_irqrestore(&qp->sq_cq->cq_lock, flags); + + if (!pd->uctx) { + ocrdma_discard_cqes(qp, qp->sq_cq); + ocrdma_discard_cqes(qp, qp->rq_cq); + } + mutex_unlock(&dev->dev_lock); + + if (pd->uctx) { + ocrdma_del_mmap(pd->uctx, (u64) qp->sq.pa, + PAGE_ALIGN(qp->sq.len)); + if (!qp->srq) + ocrdma_del_mmap(pd->uctx, (u64) qp->rq.pa, + PAGE_ALIGN(qp->rq.len)); + } + + ocrdma_del_flush_qp(qp); + + kfree(qp->wqe_wr_id_tbl); + kfree(qp->rqe_wr_id_tbl); + kfree(qp); + return status; +} + +static int ocrdma_copy_srq_uresp(struct ocrdma_dev *dev, struct ocrdma_srq *srq, + struct ib_udata *udata) +{ + int status; + struct ocrdma_create_srq_uresp uresp; + + memset(&uresp, 0, sizeof(uresp)); + uresp.rq_dbid = srq->rq.dbid; + uresp.num_rq_pages = 1; + uresp.rq_page_addr[0] = virt_to_phys(srq->rq.va); + uresp.rq_page_size = srq->rq.len; + uresp.db_page_addr = dev->nic_info.unmapped_db + + (srq->pd->id * dev->nic_info.db_page_size); + uresp.db_page_size = dev->nic_info.db_page_size; + uresp.num_rqe_allocated = srq->rq.max_cnt; + if (ocrdma_get_asic_type(dev) == OCRDMA_ASIC_GEN_SKH_R) { + uresp.db_rq_offset = OCRDMA_DB_GEN2_RQ_OFFSET; + uresp.db_shift = 24; + } else { + uresp.db_rq_offset = OCRDMA_DB_RQ_OFFSET; + uresp.db_shift = 16; + } + + status = ib_copy_to_udata(udata, &uresp, sizeof(uresp)); + if (status) + return status; + status = ocrdma_add_mmap(srq->pd->uctx, uresp.rq_page_addr[0], + uresp.rq_page_size); + if (status) + return status; + return status; +} + +struct ib_srq *ocrdma_create_srq(struct ib_pd *ibpd, + struct ib_srq_init_attr *init_attr, + struct ib_udata *udata) +{ + int status = -ENOMEM; + struct ocrdma_pd *pd = get_ocrdma_pd(ibpd); + struct ocrdma_dev *dev = get_ocrdma_dev(ibpd->device); + struct ocrdma_srq *srq; + + if (init_attr->attr.max_sge > dev->attr.max_recv_sge) + return ERR_PTR(-EINVAL); + if (init_attr->attr.max_wr > dev->attr.max_rqe) + return ERR_PTR(-EINVAL); + + srq = kzalloc(sizeof(*srq), GFP_KERNEL); + if (!srq) + return ERR_PTR(status); + + spin_lock_init(&srq->q_lock); + srq->pd = pd; + srq->db = dev->nic_info.db + (pd->id * dev->nic_info.db_page_size); + status = ocrdma_mbx_create_srq(dev, srq, init_attr, pd); + if (status) + goto err; + + if (udata == NULL) { + srq->rqe_wr_id_tbl = kzalloc(sizeof(u64) * srq->rq.max_cnt, + GFP_KERNEL); + if (srq->rqe_wr_id_tbl == NULL) + goto arm_err; + + srq->bit_fields_len = (srq->rq.max_cnt / 32) + + (srq->rq.max_cnt % 32 ? 1 : 0); + srq->idx_bit_fields = + kmalloc(srq->bit_fields_len * sizeof(u32), GFP_KERNEL); + if (srq->idx_bit_fields == NULL) + goto arm_err; + memset(srq->idx_bit_fields, 0xff, + srq->bit_fields_len * sizeof(u32)); + } + + if (init_attr->attr.srq_limit) { + status = ocrdma_mbx_modify_srq(srq, &init_attr->attr); + if (status) + goto arm_err; + } + + if (udata) { + status = ocrdma_copy_srq_uresp(dev, srq, udata); + if (status) + goto arm_err; + } + + return &srq->ibsrq; + +arm_err: + ocrdma_mbx_destroy_srq(dev, srq); +err: + kfree(srq->rqe_wr_id_tbl); + kfree(srq->idx_bit_fields); + kfree(srq); + return ERR_PTR(status); +} + +int ocrdma_modify_srq(struct ib_srq *ibsrq, + struct ib_srq_attr *srq_attr, + enum ib_srq_attr_mask srq_attr_mask, + struct ib_udata *udata) +{ + int status = 0; + struct ocrdma_srq *srq; + + srq = get_ocrdma_srq(ibsrq); + if (srq_attr_mask & IB_SRQ_MAX_WR) + status = -EINVAL; + else + status = ocrdma_mbx_modify_srq(srq, srq_attr); + return status; +} + +int ocrdma_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *srq_attr) +{ + int status; + struct ocrdma_srq *srq; + + srq = get_ocrdma_srq(ibsrq); + status = ocrdma_mbx_query_srq(srq, srq_attr); + return status; +} + +int ocrdma_destroy_srq(struct ib_srq *ibsrq) +{ + int status; + struct ocrdma_srq *srq; + struct ocrdma_dev *dev = get_ocrdma_dev(ibsrq->device); + + srq = get_ocrdma_srq(ibsrq); + + status = ocrdma_mbx_destroy_srq(dev, srq); + + if (srq->pd->uctx) + ocrdma_del_mmap(srq->pd->uctx, (u64) srq->rq.pa, + PAGE_ALIGN(srq->rq.len)); + + kfree(srq->idx_bit_fields); + kfree(srq->rqe_wr_id_tbl); + kfree(srq); + return status; +} + +/* unprivileged verbs and their support functions. */ +static void ocrdma_build_ud_hdr(struct ocrdma_qp *qp, + struct ocrdma_hdr_wqe *hdr, + struct ib_send_wr *wr) +{ + struct ocrdma_ewqe_ud_hdr *ud_hdr = + (struct ocrdma_ewqe_ud_hdr *)(hdr + 1); + struct ocrdma_ah *ah = get_ocrdma_ah(wr->wr.ud.ah); + + ud_hdr->rsvd_dest_qpn = wr->wr.ud.remote_qpn; + if (qp->qp_type == IB_QPT_GSI) + ud_hdr->qkey = qp->qkey; + else + ud_hdr->qkey = wr->wr.ud.remote_qkey; + ud_hdr->rsvd_ahid = ah->id; +} + +static void ocrdma_build_sges(struct ocrdma_hdr_wqe *hdr, + struct ocrdma_sge *sge, int num_sge, + struct ib_sge *sg_list) +{ + int i; + + for (i = 0; i < num_sge; i++) { + sge[i].lrkey = sg_list[i].lkey; + sge[i].addr_lo = sg_list[i].addr; + sge[i].addr_hi = upper_32_bits(sg_list[i].addr); + sge[i].len = sg_list[i].length; + hdr->total_len += sg_list[i].length; + } + if (num_sge == 0) + memset(sge, 0, sizeof(*sge)); +} + +static inline uint32_t ocrdma_sglist_len(struct ib_sge *sg_list, int num_sge) +{ + uint32_t total_len = 0, i; + + for (i = 0; i < num_sge; i++) + total_len += sg_list[i].length; + return total_len; +} + + +static int ocrdma_build_inline_sges(struct ocrdma_qp *qp, + struct ocrdma_hdr_wqe *hdr, + struct ocrdma_sge *sge, + struct ib_send_wr *wr, u32 wqe_size) +{ + int i; + char *dpp_addr; + + if (wr->send_flags & IB_SEND_INLINE && qp->qp_type != IB_QPT_UD) { + hdr->total_len = ocrdma_sglist_len(wr->sg_list, wr->num_sge); + if (unlikely(hdr->total_len > qp->max_inline_data)) { + pr_err("%s() supported_len=0x%x,\n" + " unsupported len req=0x%x\n", __func__, + qp->max_inline_data, hdr->total_len); + return -EINVAL; + } + dpp_addr = (char *)sge; + for (i = 0; i < wr->num_sge; i++) { + memcpy(dpp_addr, + (void *)(unsigned long)wr->sg_list[i].addr, + wr->sg_list[i].length); + dpp_addr += wr->sg_list[i].length; + } + + wqe_size += roundup(hdr->total_len, OCRDMA_WQE_ALIGN_BYTES); + if (0 == hdr->total_len) + wqe_size += sizeof(struct ocrdma_sge); + hdr->cw |= (OCRDMA_TYPE_INLINE << OCRDMA_WQE_TYPE_SHIFT); + } else { + ocrdma_build_sges(hdr, sge, wr->num_sge, wr->sg_list); + if (wr->num_sge) + wqe_size += (wr->num_sge * sizeof(struct ocrdma_sge)); + else + wqe_size += sizeof(struct ocrdma_sge); + hdr->cw |= (OCRDMA_TYPE_LKEY << OCRDMA_WQE_TYPE_SHIFT); + } + hdr->cw |= ((wqe_size / OCRDMA_WQE_STRIDE) << OCRDMA_WQE_SIZE_SHIFT); + return 0; +} + +static int ocrdma_build_send(struct ocrdma_qp *qp, struct ocrdma_hdr_wqe *hdr, + struct ib_send_wr *wr) +{ + int status; + struct ocrdma_sge *sge; + u32 wqe_size = sizeof(*hdr); + + if (qp->qp_type == IB_QPT_UD || qp->qp_type == IB_QPT_GSI) { + ocrdma_build_ud_hdr(qp, hdr, wr); + sge = (struct ocrdma_sge *)(hdr + 2); + wqe_size += sizeof(struct ocrdma_ewqe_ud_hdr); + } else { + sge = (struct ocrdma_sge *)(hdr + 1); + } + + status = ocrdma_build_inline_sges(qp, hdr, sge, wr, wqe_size); + return status; +} + +static int ocrdma_build_write(struct ocrdma_qp *qp, struct ocrdma_hdr_wqe *hdr, + struct ib_send_wr *wr) +{ + int status; + struct ocrdma_sge *ext_rw = (struct ocrdma_sge *)(hdr + 1); + struct ocrdma_sge *sge = ext_rw + 1; + u32 wqe_size = sizeof(*hdr) + sizeof(*ext_rw); + + status = ocrdma_build_inline_sges(qp, hdr, sge, wr, wqe_size); + if (status) + return status; + ext_rw->addr_lo = wr->wr.rdma.remote_addr; + ext_rw->addr_hi = upper_32_bits(wr->wr.rdma.remote_addr); + ext_rw->lrkey = wr->wr.rdma.rkey; + ext_rw->len = hdr->total_len; + return 0; +} + +static void ocrdma_build_read(struct ocrdma_qp *qp, struct ocrdma_hdr_wqe *hdr, + struct ib_send_wr *wr) +{ + struct ocrdma_sge *ext_rw = (struct ocrdma_sge *)(hdr + 1); + struct ocrdma_sge *sge = ext_rw + 1; + u32 wqe_size = ((wr->num_sge + 1) * sizeof(struct ocrdma_sge)) + + sizeof(struct ocrdma_hdr_wqe); + + ocrdma_build_sges(hdr, sge, wr->num_sge, wr->sg_list); + hdr->cw |= ((wqe_size / OCRDMA_WQE_STRIDE) << OCRDMA_WQE_SIZE_SHIFT); + hdr->cw |= (OCRDMA_READ << OCRDMA_WQE_OPCODE_SHIFT); + hdr->cw |= (OCRDMA_TYPE_LKEY << OCRDMA_WQE_TYPE_SHIFT); + + ext_rw->addr_lo = wr->wr.rdma.remote_addr; + ext_rw->addr_hi = upper_32_bits(wr->wr.rdma.remote_addr); + ext_rw->lrkey = wr->wr.rdma.rkey; + ext_rw->len = hdr->total_len; +} + +static void build_frmr_pbes(struct ib_send_wr *wr, struct ocrdma_pbl *pbl_tbl, + struct ocrdma_hw_mr *hwmr) +{ + int i; + u64 buf_addr = 0; + int num_pbes; + struct ocrdma_pbe *pbe; + + pbe = (struct ocrdma_pbe *)pbl_tbl->va; + num_pbes = 0; + + /* go through the OS phy regions & fill hw pbe entries into pbls. */ + for (i = 0; i < wr->wr.fast_reg.page_list_len; i++) { + /* number of pbes can be more for one OS buf, when + * buffers are of different sizes. + * split the ib_buf to one or more pbes. + */ + buf_addr = wr->wr.fast_reg.page_list->page_list[i]; + pbe->pa_lo = cpu_to_le32((u32) (buf_addr & PAGE_MASK)); + pbe->pa_hi = cpu_to_le32((u32) upper_32_bits(buf_addr)); + num_pbes += 1; + pbe++; + + /* if the pbl is full storing the pbes, + * move to next pbl. + */ + if (num_pbes == (hwmr->pbl_size/sizeof(u64))) { + pbl_tbl++; + pbe = (struct ocrdma_pbe *)pbl_tbl->va; + } + } + return; +} + +static int get_encoded_page_size(int pg_sz) +{ + /* Max size is 256M 4096 << 16 */ + int i = 0; + for (; i < 17; i++) + if (pg_sz == (4096 << i)) + break; + return i; +} + + +static int ocrdma_build_fr(struct ocrdma_qp *qp, struct ocrdma_hdr_wqe *hdr, + struct ib_send_wr *wr) +{ + u64 fbo; + struct ocrdma_ewqe_fr *fast_reg = (struct ocrdma_ewqe_fr *)(hdr + 1); + struct ocrdma_mr *mr; + u32 wqe_size = sizeof(*fast_reg) + sizeof(*hdr); + + wqe_size = roundup(wqe_size, OCRDMA_WQE_ALIGN_BYTES); + + if (wr->wr.fast_reg.page_list_len > qp->dev->attr.max_pages_per_frmr) + return -EINVAL; + + hdr->cw |= (OCRDMA_FR_MR << OCRDMA_WQE_OPCODE_SHIFT); + hdr->cw |= ((wqe_size / OCRDMA_WQE_STRIDE) << OCRDMA_WQE_SIZE_SHIFT); + + if (wr->wr.fast_reg.page_list_len == 0) + BUG(); + if (wr->wr.fast_reg.access_flags & IB_ACCESS_LOCAL_WRITE) + hdr->rsvd_lkey_flags |= OCRDMA_LKEY_FLAG_LOCAL_WR; + if (wr->wr.fast_reg.access_flags & IB_ACCESS_REMOTE_WRITE) + hdr->rsvd_lkey_flags |= OCRDMA_LKEY_FLAG_REMOTE_WR; + if (wr->wr.fast_reg.access_flags & IB_ACCESS_REMOTE_READ) + hdr->rsvd_lkey_flags |= OCRDMA_LKEY_FLAG_REMOTE_RD; + hdr->lkey = wr->wr.fast_reg.rkey; + hdr->total_len = wr->wr.fast_reg.length; + + fbo = wr->wr.fast_reg.iova_start - + (wr->wr.fast_reg.page_list->page_list[0] & PAGE_MASK); + + fast_reg->va_hi = upper_32_bits(wr->wr.fast_reg.iova_start); + fast_reg->va_lo = (u32) (wr->wr.fast_reg.iova_start & 0xffffffff); + fast_reg->fbo_hi = upper_32_bits(fbo); + fast_reg->fbo_lo = (u32) fbo & 0xffffffff; + fast_reg->num_sges = wr->wr.fast_reg.page_list_len; + fast_reg->size_sge = + get_encoded_page_size(1 << wr->wr.fast_reg.page_shift); + mr = (struct ocrdma_mr *) (unsigned long) + qp->dev->stag_arr[(hdr->lkey >> 8) & (OCRDMA_MAX_STAG - 1)]; + build_frmr_pbes(wr, mr->hwmr.pbl_table, &mr->hwmr); + return 0; +} + +static void ocrdma_ring_sq_db(struct ocrdma_qp *qp) +{ + u32 val = qp->sq.dbid | (1 << OCRDMA_DB_SQ_SHIFT); + + iowrite32(val, qp->sq_db); +} + +int ocrdma_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, + struct ib_send_wr **bad_wr) +{ + int status = 0; + struct ocrdma_qp *qp = get_ocrdma_qp(ibqp); + struct ocrdma_hdr_wqe *hdr; + unsigned long flags; + + spin_lock_irqsave(&qp->q_lock, flags); + if (qp->state != OCRDMA_QPS_RTS && qp->state != OCRDMA_QPS_SQD) { + spin_unlock_irqrestore(&qp->q_lock, flags); + *bad_wr = wr; + return -EINVAL; + } + + while (wr) { + if (qp->qp_type == IB_QPT_UD && + (wr->opcode != IB_WR_SEND && + wr->opcode != IB_WR_SEND_WITH_IMM)) { + *bad_wr = wr; + status = -EINVAL; + break; + } + if (ocrdma_hwq_free_cnt(&qp->sq) == 0 || + wr->num_sge > qp->sq.max_sges) { + *bad_wr = wr; + status = -ENOMEM; + break; + } + hdr = ocrdma_hwq_head(&qp->sq); + hdr->cw = 0; + if (wr->send_flags & IB_SEND_SIGNALED || qp->signaled) + hdr->cw |= (OCRDMA_FLAG_SIG << OCRDMA_WQE_FLAGS_SHIFT); + if (wr->send_flags & IB_SEND_FENCE) + hdr->cw |= + (OCRDMA_FLAG_FENCE_L << OCRDMA_WQE_FLAGS_SHIFT); + if (wr->send_flags & IB_SEND_SOLICITED) + hdr->cw |= + (OCRDMA_FLAG_SOLICIT << OCRDMA_WQE_FLAGS_SHIFT); + hdr->total_len = 0; + switch (wr->opcode) { + case IB_WR_SEND_WITH_IMM: + hdr->cw |= (OCRDMA_FLAG_IMM << OCRDMA_WQE_FLAGS_SHIFT); + hdr->immdt = ntohl(wr->ex.imm_data); + case IB_WR_SEND: + hdr->cw |= (OCRDMA_SEND << OCRDMA_WQE_OPCODE_SHIFT); + ocrdma_build_send(qp, hdr, wr); + break; + case IB_WR_SEND_WITH_INV: + hdr->cw |= (OCRDMA_FLAG_INV << OCRDMA_WQE_FLAGS_SHIFT); + hdr->cw |= (OCRDMA_SEND << OCRDMA_WQE_OPCODE_SHIFT); + hdr->lkey = wr->ex.invalidate_rkey; + status = ocrdma_build_send(qp, hdr, wr); + break; + case IB_WR_RDMA_WRITE_WITH_IMM: + hdr->cw |= (OCRDMA_FLAG_IMM << OCRDMA_WQE_FLAGS_SHIFT); + hdr->immdt = ntohl(wr->ex.imm_data); + case IB_WR_RDMA_WRITE: + hdr->cw |= (OCRDMA_WRITE << OCRDMA_WQE_OPCODE_SHIFT); + status = ocrdma_build_write(qp, hdr, wr); + break; + case IB_WR_RDMA_READ_WITH_INV: + hdr->cw |= (OCRDMA_FLAG_INV << OCRDMA_WQE_FLAGS_SHIFT); + case IB_WR_RDMA_READ: + ocrdma_build_read(qp, hdr, wr); + break; + case IB_WR_LOCAL_INV: + hdr->cw |= + (OCRDMA_LKEY_INV << OCRDMA_WQE_OPCODE_SHIFT); + hdr->cw |= ((sizeof(struct ocrdma_hdr_wqe) + + sizeof(struct ocrdma_sge)) / + OCRDMA_WQE_STRIDE) << OCRDMA_WQE_SIZE_SHIFT; + hdr->lkey = wr->ex.invalidate_rkey; + break; + case IB_WR_FAST_REG_MR: + status = ocrdma_build_fr(qp, hdr, wr); + break; + default: + status = -EINVAL; + break; + } + if (status) { + *bad_wr = wr; + break; + } + if (wr->send_flags & IB_SEND_SIGNALED || qp->signaled) + qp->wqe_wr_id_tbl[qp->sq.head].signaled = 1; + else + qp->wqe_wr_id_tbl[qp->sq.head].signaled = 0; + qp->wqe_wr_id_tbl[qp->sq.head].wrid = wr->wr_id; + ocrdma_cpu_to_le32(hdr, ((hdr->cw >> OCRDMA_WQE_SIZE_SHIFT) & + OCRDMA_WQE_SIZE_MASK) * OCRDMA_WQE_STRIDE); + /* make sure wqe is written before adapter can access it */ + wmb(); + /* inform hw to start processing it */ + ocrdma_ring_sq_db(qp); + + /* update pointer, counter for next wr */ + ocrdma_hwq_inc_head(&qp->sq); + wr = wr->next; + } + spin_unlock_irqrestore(&qp->q_lock, flags); + return status; +} + +static void ocrdma_ring_rq_db(struct ocrdma_qp *qp) +{ + u32 val = qp->rq.dbid | (1 << OCRDMA_DB_RQ_SHIFT); + + iowrite32(val, qp->rq_db); +} + +static void ocrdma_build_rqe(struct ocrdma_hdr_wqe *rqe, struct ib_recv_wr *wr, + u16 tag) +{ + u32 wqe_size = 0; + struct ocrdma_sge *sge; + if (wr->num_sge) + wqe_size = (wr->num_sge * sizeof(*sge)) + sizeof(*rqe); + else + wqe_size = sizeof(*sge) + sizeof(*rqe); + + rqe->cw = ((wqe_size / OCRDMA_WQE_STRIDE) << + OCRDMA_WQE_SIZE_SHIFT); + rqe->cw |= (OCRDMA_FLAG_SIG << OCRDMA_WQE_FLAGS_SHIFT); + rqe->cw |= (OCRDMA_TYPE_LKEY << OCRDMA_WQE_TYPE_SHIFT); + rqe->total_len = 0; + rqe->rsvd_tag = tag; + sge = (struct ocrdma_sge *)(rqe + 1); + ocrdma_build_sges(rqe, sge, wr->num_sge, wr->sg_list); + ocrdma_cpu_to_le32(rqe, wqe_size); +} + +int ocrdma_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr) +{ + int status = 0; + unsigned long flags; + struct ocrdma_qp *qp = get_ocrdma_qp(ibqp); + struct ocrdma_hdr_wqe *rqe; + + spin_lock_irqsave(&qp->q_lock, flags); + if (qp->state == OCRDMA_QPS_RST || qp->state == OCRDMA_QPS_ERR) { + spin_unlock_irqrestore(&qp->q_lock, flags); + *bad_wr = wr; + return -EINVAL; + } + while (wr) { + if (ocrdma_hwq_free_cnt(&qp->rq) == 0 || + wr->num_sge > qp->rq.max_sges) { + *bad_wr = wr; + status = -ENOMEM; + break; + } + rqe = ocrdma_hwq_head(&qp->rq); + ocrdma_build_rqe(rqe, wr, 0); + + qp->rqe_wr_id_tbl[qp->rq.head] = wr->wr_id; + /* make sure rqe is written before adapter can access it */ + wmb(); + + /* inform hw to start processing it */ + ocrdma_ring_rq_db(qp); + + /* update pointer, counter for next wr */ + ocrdma_hwq_inc_head(&qp->rq); + wr = wr->next; + } + spin_unlock_irqrestore(&qp->q_lock, flags); + return status; +} + +/* cqe for srq's rqe can potentially arrive out of order. + * index gives the entry in the shadow table where to store + * the wr_id. tag/index is returned in cqe to reference back + * for a given rqe. + */ +static int ocrdma_srq_get_idx(struct ocrdma_srq *srq) +{ + int row = 0; + int indx = 0; + + for (row = 0; row < srq->bit_fields_len; row++) { + if (srq->idx_bit_fields[row]) { + indx = ffs(srq->idx_bit_fields[row]); + indx = (row * 32) + (indx - 1); + if (indx >= srq->rq.max_cnt) + BUG(); + ocrdma_srq_toggle_bit(srq, indx); + break; + } + } + + if (row == srq->bit_fields_len) + BUG(); + return indx + 1; /* Use from index 1 */ +} + +static void ocrdma_ring_srq_db(struct ocrdma_srq *srq) +{ + u32 val = srq->rq.dbid | (1 << 16); + + iowrite32(val, srq->db + OCRDMA_DB_GEN2_SRQ_OFFSET); +} + +int ocrdma_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr) +{ + int status = 0; + unsigned long flags; + struct ocrdma_srq *srq; + struct ocrdma_hdr_wqe *rqe; + u16 tag; + + srq = get_ocrdma_srq(ibsrq); + + spin_lock_irqsave(&srq->q_lock, flags); + while (wr) { + if (ocrdma_hwq_free_cnt(&srq->rq) == 0 || + wr->num_sge > srq->rq.max_sges) { + status = -ENOMEM; + *bad_wr = wr; + break; + } + tag = ocrdma_srq_get_idx(srq); + rqe = ocrdma_hwq_head(&srq->rq); + ocrdma_build_rqe(rqe, wr, tag); + + srq->rqe_wr_id_tbl[tag] = wr->wr_id; + /* make sure rqe is written before adapter can perform DMA */ + wmb(); + /* inform hw to start processing it */ + ocrdma_ring_srq_db(srq); + /* update pointer, counter for next wr */ + ocrdma_hwq_inc_head(&srq->rq); + wr = wr->next; + } + spin_unlock_irqrestore(&srq->q_lock, flags); + return status; +} + +static enum ib_wc_status ocrdma_to_ibwc_err(u16 status) +{ + enum ib_wc_status ibwc_status; + + switch (status) { + case OCRDMA_CQE_GENERAL_ERR: + ibwc_status = IB_WC_GENERAL_ERR; + break; + case OCRDMA_CQE_LOC_LEN_ERR: + ibwc_status = IB_WC_LOC_LEN_ERR; + break; + case OCRDMA_CQE_LOC_QP_OP_ERR: + ibwc_status = IB_WC_LOC_QP_OP_ERR; + break; + case OCRDMA_CQE_LOC_EEC_OP_ERR: + ibwc_status = IB_WC_LOC_EEC_OP_ERR; + break; + case OCRDMA_CQE_LOC_PROT_ERR: + ibwc_status = IB_WC_LOC_PROT_ERR; + break; + case OCRDMA_CQE_WR_FLUSH_ERR: + ibwc_status = IB_WC_WR_FLUSH_ERR; + break; + case OCRDMA_CQE_MW_BIND_ERR: + ibwc_status = IB_WC_MW_BIND_ERR; + break; + case OCRDMA_CQE_BAD_RESP_ERR: + ibwc_status = IB_WC_BAD_RESP_ERR; + break; + case OCRDMA_CQE_LOC_ACCESS_ERR: + ibwc_status = IB_WC_LOC_ACCESS_ERR; + break; + case OCRDMA_CQE_REM_INV_REQ_ERR: + ibwc_status = IB_WC_REM_INV_REQ_ERR; + break; + case OCRDMA_CQE_REM_ACCESS_ERR: + ibwc_status = IB_WC_REM_ACCESS_ERR; + break; + case OCRDMA_CQE_REM_OP_ERR: + ibwc_status = IB_WC_REM_OP_ERR; + break; + case OCRDMA_CQE_RETRY_EXC_ERR: + ibwc_status = IB_WC_RETRY_EXC_ERR; + break; + case OCRDMA_CQE_RNR_RETRY_EXC_ERR: + ibwc_status = IB_WC_RNR_RETRY_EXC_ERR; + break; + case OCRDMA_CQE_LOC_RDD_VIOL_ERR: + ibwc_status = IB_WC_LOC_RDD_VIOL_ERR; + break; + case OCRDMA_CQE_REM_INV_RD_REQ_ERR: + ibwc_status = IB_WC_REM_INV_RD_REQ_ERR; + break; + case OCRDMA_CQE_REM_ABORT_ERR: + ibwc_status = IB_WC_REM_ABORT_ERR; + break; + case OCRDMA_CQE_INV_EECN_ERR: + ibwc_status = IB_WC_INV_EECN_ERR; + break; + case OCRDMA_CQE_INV_EEC_STATE_ERR: + ibwc_status = IB_WC_INV_EEC_STATE_ERR; + break; + case OCRDMA_CQE_FATAL_ERR: + ibwc_status = IB_WC_FATAL_ERR; + break; + case OCRDMA_CQE_RESP_TIMEOUT_ERR: + ibwc_status = IB_WC_RESP_TIMEOUT_ERR; + break; + default: + ibwc_status = IB_WC_GENERAL_ERR; + break; + } + return ibwc_status; +} + +static void ocrdma_update_wc(struct ocrdma_qp *qp, struct ib_wc *ibwc, + u32 wqe_idx) +{ + struct ocrdma_hdr_wqe *hdr; + struct ocrdma_sge *rw; + int opcode; + + hdr = ocrdma_hwq_head_from_idx(&qp->sq, wqe_idx); + + ibwc->wr_id = qp->wqe_wr_id_tbl[wqe_idx].wrid; + /* Undo the hdr->cw swap */ + opcode = le32_to_cpu(hdr->cw) & OCRDMA_WQE_OPCODE_MASK; + switch (opcode) { + case OCRDMA_WRITE: + ibwc->opcode = IB_WC_RDMA_WRITE; + break; + case OCRDMA_READ: + rw = (struct ocrdma_sge *)(hdr + 1); + ibwc->opcode = IB_WC_RDMA_READ; + ibwc->byte_len = rw->len; + break; + case OCRDMA_SEND: + ibwc->opcode = IB_WC_SEND; + break; + case OCRDMA_FR_MR: + ibwc->opcode = IB_WC_FAST_REG_MR; + break; + case OCRDMA_LKEY_INV: + ibwc->opcode = IB_WC_LOCAL_INV; + break; + default: + ibwc->status = IB_WC_GENERAL_ERR; + pr_err("%s() invalid opcode received = 0x%x\n", + __func__, hdr->cw & OCRDMA_WQE_OPCODE_MASK); + break; + } +} + +static void ocrdma_set_cqe_status_flushed(struct ocrdma_qp *qp, + struct ocrdma_cqe *cqe) +{ + if (is_cqe_for_sq(cqe)) { + cqe->flags_status_srcqpn = cpu_to_le32(le32_to_cpu( + cqe->flags_status_srcqpn) & + ~OCRDMA_CQE_STATUS_MASK); + cqe->flags_status_srcqpn = cpu_to_le32(le32_to_cpu( + cqe->flags_status_srcqpn) | + (OCRDMA_CQE_WR_FLUSH_ERR << + OCRDMA_CQE_STATUS_SHIFT)); + } else { + if (qp->qp_type == IB_QPT_UD || qp->qp_type == IB_QPT_GSI) { + cqe->flags_status_srcqpn = cpu_to_le32(le32_to_cpu( + cqe->flags_status_srcqpn) & + ~OCRDMA_CQE_UD_STATUS_MASK); + cqe->flags_status_srcqpn = cpu_to_le32(le32_to_cpu( + cqe->flags_status_srcqpn) | + (OCRDMA_CQE_WR_FLUSH_ERR << + OCRDMA_CQE_UD_STATUS_SHIFT)); + } else { + cqe->flags_status_srcqpn = cpu_to_le32(le32_to_cpu( + cqe->flags_status_srcqpn) & + ~OCRDMA_CQE_STATUS_MASK); + cqe->flags_status_srcqpn = cpu_to_le32(le32_to_cpu( + cqe->flags_status_srcqpn) | + (OCRDMA_CQE_WR_FLUSH_ERR << + OCRDMA_CQE_STATUS_SHIFT)); + } + } +} + +static bool ocrdma_update_err_cqe(struct ib_wc *ibwc, struct ocrdma_cqe *cqe, + struct ocrdma_qp *qp, int status) +{ + bool expand = false; + + ibwc->byte_len = 0; + ibwc->qp = &qp->ibqp; + ibwc->status = ocrdma_to_ibwc_err(status); + + ocrdma_flush_qp(qp); + ocrdma_qp_state_change(qp, IB_QPS_ERR, NULL); + + /* if wqe/rqe pending for which cqe needs to be returned, + * trigger inflating it. + */ + if (!is_hw_rq_empty(qp) || !is_hw_sq_empty(qp)) { + expand = true; + ocrdma_set_cqe_status_flushed(qp, cqe); + } + return expand; +} + +static int ocrdma_update_err_rcqe(struct ib_wc *ibwc, struct ocrdma_cqe *cqe, + struct ocrdma_qp *qp, int status) +{ + ibwc->opcode = IB_WC_RECV; + ibwc->wr_id = qp->rqe_wr_id_tbl[qp->rq.tail]; + ocrdma_hwq_inc_tail(&qp->rq); + + return ocrdma_update_err_cqe(ibwc, cqe, qp, status); +} + +static int ocrdma_update_err_scqe(struct ib_wc *ibwc, struct ocrdma_cqe *cqe, + struct ocrdma_qp *qp, int status) +{ + ocrdma_update_wc(qp, ibwc, qp->sq.tail); + ocrdma_hwq_inc_tail(&qp->sq); + + return ocrdma_update_err_cqe(ibwc, cqe, qp, status); +} + + +static bool ocrdma_poll_err_scqe(struct ocrdma_qp *qp, + struct ocrdma_cqe *cqe, struct ib_wc *ibwc, + bool *polled, bool *stop) +{ + bool expand; + int status = (le32_to_cpu(cqe->flags_status_srcqpn) & + OCRDMA_CQE_STATUS_MASK) >> OCRDMA_CQE_STATUS_SHIFT; + + /* when hw sq is empty, but rq is not empty, so we continue + * to keep the cqe in order to get the cq event again. + */ + if (is_hw_sq_empty(qp) && !is_hw_rq_empty(qp)) { + /* when cq for rq and sq is same, it is safe to return + * flush cqe for RQEs. + */ + if (!qp->srq && (qp->sq_cq == qp->rq_cq)) { + *polled = true; + status = OCRDMA_CQE_WR_FLUSH_ERR; + expand = ocrdma_update_err_rcqe(ibwc, cqe, qp, status); + } else { + /* stop processing further cqe as this cqe is used for + * triggering cq event on buddy cq of RQ. + * When QP is destroyed, this cqe will be removed + * from the cq's hardware q. + */ + *polled = false; + *stop = true; + expand = false; + } + } else if (is_hw_sq_empty(qp)) { + /* Do nothing */ + expand = false; + *polled = false; + *stop = false; + } else { + *polled = true; + expand = ocrdma_update_err_scqe(ibwc, cqe, qp, status); + } + return expand; +} + +static bool ocrdma_poll_success_scqe(struct ocrdma_qp *qp, + struct ocrdma_cqe *cqe, + struct ib_wc *ibwc, bool *polled) +{ + bool expand = false; + int tail = qp->sq.tail; + u32 wqe_idx; + + if (!qp->wqe_wr_id_tbl[tail].signaled) { + *polled = false; /* WC cannot be consumed yet */ + } else { + ibwc->status = IB_WC_SUCCESS; + ibwc->wc_flags = 0; + ibwc->qp = &qp->ibqp; + ocrdma_update_wc(qp, ibwc, tail); + *polled = true; + } + wqe_idx = (le32_to_cpu(cqe->wq.wqeidx) & + OCRDMA_CQE_WQEIDX_MASK) & qp->sq.max_wqe_idx; + if (tail != wqe_idx) + expand = true; /* Coalesced CQE can't be consumed yet */ + + ocrdma_hwq_inc_tail(&qp->sq); + return expand; +} + +static bool ocrdma_poll_scqe(struct ocrdma_qp *qp, struct ocrdma_cqe *cqe, + struct ib_wc *ibwc, bool *polled, bool *stop) +{ + int status; + bool expand; + + status = (le32_to_cpu(cqe->flags_status_srcqpn) & + OCRDMA_CQE_STATUS_MASK) >> OCRDMA_CQE_STATUS_SHIFT; + + if (status == OCRDMA_CQE_SUCCESS) + expand = ocrdma_poll_success_scqe(qp, cqe, ibwc, polled); + else + expand = ocrdma_poll_err_scqe(qp, cqe, ibwc, polled, stop); + return expand; +} + +static int ocrdma_update_ud_rcqe(struct ib_wc *ibwc, struct ocrdma_cqe *cqe) +{ + int status; + + status = (le32_to_cpu(cqe->flags_status_srcqpn) & + OCRDMA_CQE_UD_STATUS_MASK) >> OCRDMA_CQE_UD_STATUS_SHIFT; + ibwc->src_qp = le32_to_cpu(cqe->flags_status_srcqpn) & + OCRDMA_CQE_SRCQP_MASK; + ibwc->pkey_index = le32_to_cpu(cqe->ud.rxlen_pkey) & + OCRDMA_CQE_PKEY_MASK; + ibwc->wc_flags = IB_WC_GRH; + ibwc->byte_len = (le32_to_cpu(cqe->ud.rxlen_pkey) >> + OCRDMA_CQE_UD_XFER_LEN_SHIFT); + return status; +} + +static void ocrdma_update_free_srq_cqe(struct ib_wc *ibwc, + struct ocrdma_cqe *cqe, + struct ocrdma_qp *qp) +{ + unsigned long flags; + struct ocrdma_srq *srq; + u32 wqe_idx; + + srq = get_ocrdma_srq(qp->ibqp.srq); + wqe_idx = (le32_to_cpu(cqe->rq.buftag_qpn) >> + OCRDMA_CQE_BUFTAG_SHIFT) & srq->rq.max_wqe_idx; + if (wqe_idx < 1) + BUG(); + + ibwc->wr_id = srq->rqe_wr_id_tbl[wqe_idx]; + spin_lock_irqsave(&srq->q_lock, flags); + ocrdma_srq_toggle_bit(srq, wqe_idx - 1); + spin_unlock_irqrestore(&srq->q_lock, flags); + ocrdma_hwq_inc_tail(&srq->rq); +} + +static bool ocrdma_poll_err_rcqe(struct ocrdma_qp *qp, struct ocrdma_cqe *cqe, + struct ib_wc *ibwc, bool *polled, bool *stop, + int status) +{ + bool expand; + + /* when hw_rq is empty, but wq is not empty, so continue + * to keep the cqe to get the cq event again. + */ + if (is_hw_rq_empty(qp) && !is_hw_sq_empty(qp)) { + if (!qp->srq && (qp->sq_cq == qp->rq_cq)) { + *polled = true; + status = OCRDMA_CQE_WR_FLUSH_ERR; + expand = ocrdma_update_err_scqe(ibwc, cqe, qp, status); + } else { + *polled = false; + *stop = true; + expand = false; + } + } else if (is_hw_rq_empty(qp)) { + /* Do nothing */ + expand = false; + *polled = false; + *stop = false; + } else { + *polled = true; + expand = ocrdma_update_err_rcqe(ibwc, cqe, qp, status); + } + return expand; +} + +static void ocrdma_poll_success_rcqe(struct ocrdma_qp *qp, + struct ocrdma_cqe *cqe, struct ib_wc *ibwc) +{ + ibwc->opcode = IB_WC_RECV; + ibwc->qp = &qp->ibqp; + ibwc->status = IB_WC_SUCCESS; + + if (qp->qp_type == IB_QPT_UD || qp->qp_type == IB_QPT_GSI) + ocrdma_update_ud_rcqe(ibwc, cqe); + else + ibwc->byte_len = le32_to_cpu(cqe->rq.rxlen); + + if (is_cqe_imm(cqe)) { + ibwc->ex.imm_data = htonl(le32_to_cpu(cqe->rq.lkey_immdt)); + ibwc->wc_flags |= IB_WC_WITH_IMM; + } else if (is_cqe_wr_imm(cqe)) { + ibwc->opcode = IB_WC_RECV_RDMA_WITH_IMM; + ibwc->ex.imm_data = htonl(le32_to_cpu(cqe->rq.lkey_immdt)); + ibwc->wc_flags |= IB_WC_WITH_IMM; + } else if (is_cqe_invalidated(cqe)) { + ibwc->ex.invalidate_rkey = le32_to_cpu(cqe->rq.lkey_immdt); + ibwc->wc_flags |= IB_WC_WITH_INVALIDATE; + } + if (qp->ibqp.srq) { + ocrdma_update_free_srq_cqe(ibwc, cqe, qp); + } else { + ibwc->wr_id = qp->rqe_wr_id_tbl[qp->rq.tail]; + ocrdma_hwq_inc_tail(&qp->rq); + } +} + +static bool ocrdma_poll_rcqe(struct ocrdma_qp *qp, struct ocrdma_cqe *cqe, + struct ib_wc *ibwc, bool *polled, bool *stop) +{ + int status; + bool expand = false; + + ibwc->wc_flags = 0; + if (qp->qp_type == IB_QPT_UD || qp->qp_type == IB_QPT_GSI) { + status = (le32_to_cpu(cqe->flags_status_srcqpn) & + OCRDMA_CQE_UD_STATUS_MASK) >> + OCRDMA_CQE_UD_STATUS_SHIFT; + } else { + status = (le32_to_cpu(cqe->flags_status_srcqpn) & + OCRDMA_CQE_STATUS_MASK) >> OCRDMA_CQE_STATUS_SHIFT; + } + + if (status == OCRDMA_CQE_SUCCESS) { + *polled = true; + ocrdma_poll_success_rcqe(qp, cqe, ibwc); + } else { + expand = ocrdma_poll_err_rcqe(qp, cqe, ibwc, polled, stop, + status); + } + return expand; +} + +static void ocrdma_change_cq_phase(struct ocrdma_cq *cq, struct ocrdma_cqe *cqe, + u16 cur_getp) +{ + if (cq->phase_change) { + if (cur_getp == 0) + cq->phase = (~cq->phase & OCRDMA_CQE_VALID); + } else { + /* clear valid bit */ + cqe->flags_status_srcqpn = 0; + } +} + +static int ocrdma_poll_hwcq(struct ocrdma_cq *cq, int num_entries, + struct ib_wc *ibwc) +{ + u16 qpn = 0; + int i = 0; + bool expand = false; + int polled_hw_cqes = 0; + struct ocrdma_qp *qp = NULL; + struct ocrdma_dev *dev = get_ocrdma_dev(cq->ibcq.device); + struct ocrdma_cqe *cqe; + u16 cur_getp; bool polled = false; bool stop = false; + + cur_getp = cq->getp; + while (num_entries) { + cqe = cq->va + cur_getp; + /* check whether valid cqe or not */ + if (!is_cqe_valid(cq, cqe)) + break; + qpn = (le32_to_cpu(cqe->cmn.qpn) & OCRDMA_CQE_QPN_MASK); + /* ignore discarded cqe */ + if (qpn == 0) + goto skip_cqe; + qp = dev->qp_tbl[qpn]; + BUG_ON(qp == NULL); + + if (is_cqe_for_sq(cqe)) { + expand = ocrdma_poll_scqe(qp, cqe, ibwc, &polled, + &stop); + } else { + expand = ocrdma_poll_rcqe(qp, cqe, ibwc, &polled, + &stop); + } + if (expand) + goto expand_cqe; + if (stop) + goto stop_cqe; + /* clear qpn to avoid duplicate processing by discard_cqe() */ + cqe->cmn.qpn = 0; +skip_cqe: + polled_hw_cqes += 1; + cur_getp = (cur_getp + 1) % cq->max_hw_cqe; + ocrdma_change_cq_phase(cq, cqe, cur_getp); +expand_cqe: + if (polled) { + num_entries -= 1; + i += 1; + ibwc = ibwc + 1; + polled = false; + } + } +stop_cqe: + cq->getp = cur_getp; + if (cq->deferred_arm) { + ocrdma_ring_cq_db(dev, cq->id, true, cq->deferred_sol, + polled_hw_cqes); + cq->deferred_arm = false; + cq->deferred_sol = false; + } else { + /* We need to pop the CQE. No need to arm */ + ocrdma_ring_cq_db(dev, cq->id, false, cq->deferred_sol, + polled_hw_cqes); + cq->deferred_sol = false; + } + + return i; +} + +/* insert error cqe if the QP's SQ or RQ's CQ matches the CQ under poll. */ +static int ocrdma_add_err_cqe(struct ocrdma_cq *cq, int num_entries, + struct ocrdma_qp *qp, struct ib_wc *ibwc) +{ + int err_cqes = 0; + + while (num_entries) { + if (is_hw_sq_empty(qp) && is_hw_rq_empty(qp)) + break; + if (!is_hw_sq_empty(qp) && qp->sq_cq == cq) { + ocrdma_update_wc(qp, ibwc, qp->sq.tail); + ocrdma_hwq_inc_tail(&qp->sq); + } else if (!is_hw_rq_empty(qp) && qp->rq_cq == cq) { + ibwc->wr_id = qp->rqe_wr_id_tbl[qp->rq.tail]; + ocrdma_hwq_inc_tail(&qp->rq); + } else { + return err_cqes; + } + ibwc->byte_len = 0; + ibwc->status = IB_WC_WR_FLUSH_ERR; + ibwc = ibwc + 1; + err_cqes += 1; + num_entries -= 1; + } + return err_cqes; +} + +int ocrdma_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc) +{ + int cqes_to_poll = num_entries; + struct ocrdma_cq *cq = get_ocrdma_cq(ibcq); + struct ocrdma_dev *dev = get_ocrdma_dev(ibcq->device); + int num_os_cqe = 0, err_cqes = 0; + struct ocrdma_qp *qp; + unsigned long flags; + + /* poll cqes from adapter CQ */ + spin_lock_irqsave(&cq->cq_lock, flags); + num_os_cqe = ocrdma_poll_hwcq(cq, cqes_to_poll, wc); + spin_unlock_irqrestore(&cq->cq_lock, flags); + cqes_to_poll -= num_os_cqe; + + if (cqes_to_poll) { + wc = wc + num_os_cqe; + /* adapter returns single error cqe when qp moves to + * error state. So insert error cqes with wc_status as + * FLUSHED for pending WQEs and RQEs of QP's SQ and RQ + * respectively which uses this CQ. + */ + spin_lock_irqsave(&dev->flush_q_lock, flags); + list_for_each_entry(qp, &cq->sq_head, sq_entry) { + if (cqes_to_poll == 0) + break; + err_cqes = ocrdma_add_err_cqe(cq, cqes_to_poll, qp, wc); + cqes_to_poll -= err_cqes; + num_os_cqe += err_cqes; + wc = wc + err_cqes; + } + spin_unlock_irqrestore(&dev->flush_q_lock, flags); + } + return num_os_cqe; +} + +int ocrdma_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags cq_flags) +{ + struct ocrdma_cq *cq = get_ocrdma_cq(ibcq); + struct ocrdma_dev *dev = get_ocrdma_dev(ibcq->device); + u16 cq_id; + unsigned long flags; + bool arm_needed = false, sol_needed = false; + + cq_id = cq->id; + + spin_lock_irqsave(&cq->cq_lock, flags); + if (cq_flags & IB_CQ_NEXT_COMP || cq_flags & IB_CQ_SOLICITED) + arm_needed = true; + if (cq_flags & IB_CQ_SOLICITED) + sol_needed = true; + + if (cq->first_arm) { + ocrdma_ring_cq_db(dev, cq_id, arm_needed, sol_needed, 0); + cq->first_arm = false; + } + + cq->deferred_arm = true; + cq->deferred_sol = sol_needed; + spin_unlock_irqrestore(&cq->cq_lock, flags); + + return 0; +} + +struct ib_mr *ocrdma_alloc_frmr(struct ib_pd *ibpd, int max_page_list_len) +{ + int status; + struct ocrdma_mr *mr; + struct ocrdma_pd *pd = get_ocrdma_pd(ibpd); + struct ocrdma_dev *dev = get_ocrdma_dev(ibpd->device); + + if (max_page_list_len > dev->attr.max_pages_per_frmr) + return ERR_PTR(-EINVAL); + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + status = ocrdma_get_pbl_info(dev, mr, max_page_list_len); + if (status) + goto pbl_err; + mr->hwmr.fr_mr = 1; + mr->hwmr.remote_rd = 0; + mr->hwmr.remote_wr = 0; + mr->hwmr.local_rd = 0; + mr->hwmr.local_wr = 0; + mr->hwmr.mw_bind = 0; + status = ocrdma_build_pbl_tbl(dev, &mr->hwmr); + if (status) + goto pbl_err; + status = ocrdma_reg_mr(dev, &mr->hwmr, pd->id, 0); + if (status) + goto mbx_err; + mr->ibmr.rkey = mr->hwmr.lkey; + mr->ibmr.lkey = mr->hwmr.lkey; + dev->stag_arr[(mr->hwmr.lkey >> 8) & (OCRDMA_MAX_STAG - 1)] = + (unsigned long) mr; + return &mr->ibmr; +mbx_err: + ocrdma_free_mr_pbl_tbl(dev, &mr->hwmr); +pbl_err: + kfree(mr); + return ERR_PTR(-ENOMEM); +} + +struct ib_fast_reg_page_list *ocrdma_alloc_frmr_page_list(struct ib_device + *ibdev, + int page_list_len) +{ + struct ib_fast_reg_page_list *frmr_list; + int size; + + size = sizeof(*frmr_list) + (page_list_len * sizeof(u64)); + frmr_list = kzalloc(size, GFP_KERNEL); + if (!frmr_list) + return ERR_PTR(-ENOMEM); + frmr_list->page_list = (u64 *)(frmr_list + 1); + return frmr_list; +} + +void ocrdma_free_frmr_page_list(struct ib_fast_reg_page_list *page_list) +{ + kfree(page_list); +} + +#define MAX_KERNEL_PBE_SIZE 65536 +static inline int count_kernel_pbes(struct ib_phys_buf *buf_list, + int buf_cnt, u32 *pbe_size) +{ + u64 total_size = 0; + u64 buf_size = 0; + int i; + *pbe_size = roundup(buf_list[0].size, PAGE_SIZE); + *pbe_size = roundup_pow_of_two(*pbe_size); + + /* find the smallest PBE size that we can have */ + for (i = 0; i < buf_cnt; i++) { + /* first addr may not be page aligned, so ignore checking */ + if ((i != 0) && ((buf_list[i].addr & ~PAGE_MASK) || + (buf_list[i].size & ~PAGE_MASK))) { + return 0; + } + + /* if configured PBE size is greater then the chosen one, + * reduce the PBE size. + */ + buf_size = roundup(buf_list[i].size, PAGE_SIZE); + /* pbe_size has to be even multiple of 4K 1,2,4,8...*/ + buf_size = roundup_pow_of_two(buf_size); + if (*pbe_size > buf_size) + *pbe_size = buf_size; + + total_size += buf_size; + } + *pbe_size = *pbe_size > MAX_KERNEL_PBE_SIZE ? + (MAX_KERNEL_PBE_SIZE) : (*pbe_size); + + /* num_pbes = total_size / (*pbe_size); this is implemented below. */ + + return total_size >> ilog2(*pbe_size); +} + +static void build_kernel_pbes(struct ib_phys_buf *buf_list, int ib_buf_cnt, + u32 pbe_size, struct ocrdma_pbl *pbl_tbl, + struct ocrdma_hw_mr *hwmr) +{ + int i; + int idx; + int pbes_per_buf = 0; + u64 buf_addr = 0; + int num_pbes; + struct ocrdma_pbe *pbe; + int total_num_pbes = 0; + + if (!hwmr->num_pbes) + return; + + pbe = (struct ocrdma_pbe *)pbl_tbl->va; + num_pbes = 0; + + /* go through the OS phy regions & fill hw pbe entries into pbls. */ + for (i = 0; i < ib_buf_cnt; i++) { + buf_addr = buf_list[i].addr; + pbes_per_buf = + roundup_pow_of_two(roundup(buf_list[i].size, PAGE_SIZE)) / + pbe_size; + hwmr->len += buf_list[i].size; + /* number of pbes can be more for one OS buf, when + * buffers are of different sizes. + * split the ib_buf to one or more pbes. + */ + for (idx = 0; idx < pbes_per_buf; idx++) { + /* we program always page aligned addresses, + * first unaligned address is taken care by fbo. + */ + if (i == 0) { + /* for non zero fbo, assign the + * start of the page. + */ + pbe->pa_lo = + cpu_to_le32((u32) (buf_addr & PAGE_MASK)); + pbe->pa_hi = + cpu_to_le32((u32) upper_32_bits(buf_addr)); + } else { + pbe->pa_lo = + cpu_to_le32((u32) (buf_addr & 0xffffffff)); + pbe->pa_hi = + cpu_to_le32((u32) upper_32_bits(buf_addr)); + } + buf_addr += pbe_size; + num_pbes += 1; + total_num_pbes += 1; + pbe++; + + if (total_num_pbes == hwmr->num_pbes) + goto mr_tbl_done; + /* if the pbl is full storing the pbes, + * move to next pbl. + */ + if (num_pbes == (hwmr->pbl_size/sizeof(u64))) { + pbl_tbl++; + pbe = (struct ocrdma_pbe *)pbl_tbl->va; + num_pbes = 0; + } + } + } +mr_tbl_done: + return; +} + +struct ib_mr *ocrdma_reg_kernel_mr(struct ib_pd *ibpd, + struct ib_phys_buf *buf_list, + int buf_cnt, int acc, u64 *iova_start) +{ + int status = -ENOMEM; + struct ocrdma_mr *mr; + struct ocrdma_pd *pd = get_ocrdma_pd(ibpd); + struct ocrdma_dev *dev = get_ocrdma_dev(ibpd->device); + u32 num_pbes; + u32 pbe_size = 0; + + if ((acc & IB_ACCESS_REMOTE_WRITE) && !(acc & IB_ACCESS_LOCAL_WRITE)) + return ERR_PTR(-EINVAL); + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(status); + + num_pbes = count_kernel_pbes(buf_list, buf_cnt, &pbe_size); + if (num_pbes == 0) { + status = -EINVAL; + goto pbl_err; + } + status = ocrdma_get_pbl_info(dev, mr, num_pbes); + if (status) + goto pbl_err; + + mr->hwmr.pbe_size = pbe_size; + mr->hwmr.fbo = *iova_start - (buf_list[0].addr & PAGE_MASK); + mr->hwmr.va = *iova_start; + mr->hwmr.local_rd = 1; + mr->hwmr.remote_wr = (acc & IB_ACCESS_REMOTE_WRITE) ? 1 : 0; + mr->hwmr.remote_rd = (acc & IB_ACCESS_REMOTE_READ) ? 1 : 0; + mr->hwmr.local_wr = (acc & IB_ACCESS_LOCAL_WRITE) ? 1 : 0; + mr->hwmr.remote_atomic = (acc & IB_ACCESS_REMOTE_ATOMIC) ? 1 : 0; + mr->hwmr.mw_bind = (acc & IB_ACCESS_MW_BIND) ? 1 : 0; + + status = ocrdma_build_pbl_tbl(dev, &mr->hwmr); + if (status) + goto pbl_err; + build_kernel_pbes(buf_list, buf_cnt, pbe_size, mr->hwmr.pbl_table, + &mr->hwmr); + status = ocrdma_reg_mr(dev, &mr->hwmr, pd->id, acc); + if (status) + goto mbx_err; + + mr->ibmr.lkey = mr->hwmr.lkey; + if (mr->hwmr.remote_wr || mr->hwmr.remote_rd) + mr->ibmr.rkey = mr->hwmr.lkey; + return &mr->ibmr; + +mbx_err: + ocrdma_free_mr_pbl_tbl(dev, &mr->hwmr); +pbl_err: + kfree(mr); + return ERR_PTR(status); +} diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h new file mode 100644 index 0000000..b8f7853 --- /dev/null +++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h @@ -0,0 +1,99 @@ +/******************************************************************* + * This file is part of the Emulex RoCE Device Driver for * + * RoCE (RDMA over Converged Ethernet) adapters. * + * Copyright (C) 2008-2012 Emulex. All rights reserved. * + * EMULEX and SLI are trademarks of Emulex. * + * www.emulex.com * + * * + * This program is free software; you can redistribute it and/or * + * modify it under the terms of version 2 of the GNU General * + * Public License as published by the Free Software Foundation. * + * This program is distributed in the hope that it will be useful. * + * ALL EXPRESS OR IMPLIED CONDITIONS, REPRESENTATIONS AND * + * WARRANTIES, INCLUDING ANY IMPLIED WARRANTY OF MERCHANTABILITY, * + * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT, ARE * + * DISCLAIMED, EXCEPT TO THE EXTENT THAT SUCH DISCLAIMERS ARE HELD * + * TO BE LEGALLY INVALID. See the GNU General Public License for * + * more details, a copy of which can be found in the file COPYING * + * included with this package. * + * + * Contact Information: + * linux-drivers@emulex.com + * + * Emulex + * 3333 Susan Street + * Costa Mesa, CA 92626 + *******************************************************************/ + +#ifndef __OCRDMA_VERBS_H__ +#define __OCRDMA_VERBS_H__ + +int ocrdma_post_send(struct ib_qp *, struct ib_send_wr *, + struct ib_send_wr **bad_wr); +int ocrdma_post_recv(struct ib_qp *, struct ib_recv_wr *, + struct ib_recv_wr **bad_wr); + +int ocrdma_poll_cq(struct ib_cq *, int num_entries, struct ib_wc *wc); +int ocrdma_arm_cq(struct ib_cq *, enum ib_cq_notify_flags flags); + +int ocrdma_query_device(struct ib_device *, struct ib_device_attr *props); +int ocrdma_query_port(struct ib_device *, u8 port, struct ib_port_attr *props); +int ocrdma_modify_port(struct ib_device *, u8 port, int mask, + struct ib_port_modify *props); + +void ocrdma_get_guid(struct ocrdma_dev *, u8 *guid); +int ocrdma_query_gid(struct ib_device *, u8 port, + int index, union ib_gid *gid); +int ocrdma_query_pkey(struct ib_device *, u8 port, u16 index, u16 *pkey); + +struct ib_ucontext *ocrdma_alloc_ucontext(struct ib_device *, + struct ib_udata *); +int ocrdma_dealloc_ucontext(struct ib_ucontext *); + +int ocrdma_mmap(struct ib_ucontext *, struct vm_area_struct *vma); + +struct ib_pd *ocrdma_alloc_pd(struct ib_device *, + struct ib_ucontext *, struct ib_udata *); +int ocrdma_dealloc_pd(struct ib_pd *pd); + +struct ib_cq *ocrdma_create_cq(struct ib_device *, int entries, int vector, + struct ib_ucontext *, struct ib_udata *); +int ocrdma_resize_cq(struct ib_cq *, int cqe, struct ib_udata *); +int ocrdma_destroy_cq(struct ib_cq *); + +struct ib_qp *ocrdma_create_qp(struct ib_pd *, + struct ib_qp_init_attr *attrs, + struct ib_udata *); +int _ocrdma_modify_qp(struct ib_qp *, struct ib_qp_attr *attr, + int attr_mask); +int ocrdma_modify_qp(struct ib_qp *, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata); +int ocrdma_query_qp(struct ib_qp *, + struct ib_qp_attr *qp_attr, + int qp_attr_mask, struct ib_qp_init_attr *); +int ocrdma_destroy_qp(struct ib_qp *); +void ocrdma_del_flush_qp(struct ocrdma_qp *qp); + +struct ib_srq *ocrdma_create_srq(struct ib_pd *, struct ib_srq_init_attr *, + struct ib_udata *); +int ocrdma_modify_srq(struct ib_srq *, struct ib_srq_attr *, + enum ib_srq_attr_mask, struct ib_udata *); +int ocrdma_query_srq(struct ib_srq *, struct ib_srq_attr *); +int ocrdma_destroy_srq(struct ib_srq *); +int ocrdma_post_srq_recv(struct ib_srq *, struct ib_recv_wr *, + struct ib_recv_wr **bad_recv_wr); + +int ocrdma_dereg_mr(struct ib_mr *); +struct ib_mr *ocrdma_get_dma_mr(struct ib_pd *, int acc); +struct ib_mr *ocrdma_reg_kernel_mr(struct ib_pd *, + struct ib_phys_buf *buffer_list, + int num_phys_buf, int acc, u64 *iova_start); +struct ib_mr *ocrdma_reg_user_mr(struct ib_pd *, u64 start, u64 length, + u64 virt, int acc, struct ib_udata *); +struct ib_mr *ocrdma_alloc_frmr(struct ib_pd *pd, int max_page_list_len); +struct ib_fast_reg_page_list *ocrdma_alloc_frmr_page_list(struct ib_device + *ibdev, + int page_list_len); +void ocrdma_free_frmr_page_list(struct ib_fast_reg_page_list *page_list); + +#endif /* __OCRDMA_VERBS_H__ */ diff --git a/drivers/infiniband/hw/qib/Kconfig b/drivers/infiniband/hw/qib/Kconfig new file mode 100644 index 0000000..495be09 --- /dev/null +++ b/drivers/infiniband/hw/qib/Kconfig @@ -0,0 +1,15 @@ +config INFINIBAND_QIB + tristate "Intel PCIe HCA support" + depends on 64BIT + ---help--- + This is a low-level driver for Intel PCIe QLE InfiniBand host + channel adapters. This driver does not support the Intel + HyperTransport card (model QHT7140). + +config INFINIBAND_QIB_DCA + bool "QIB DCA support" + depends on INFINIBAND_QIB && DCA && SMP && !(INFINIBAND_QIB=y && DCA=m) + default y + ---help--- + Setting this enables DCA support on some Intel chip sets + with the iba7322 HCA. diff --git a/drivers/infiniband/hw/qib/Makefile b/drivers/infiniband/hw/qib/Makefile new file mode 100644 index 0000000..57f8103 --- /dev/null +++ b/drivers/infiniband/hw/qib/Makefile @@ -0,0 +1,16 @@ +obj-$(CONFIG_INFINIBAND_QIB) += ib_qib.o + +ib_qib-y := qib_cq.o qib_diag.o qib_dma.o qib_driver.o qib_eeprom.o \ + qib_file_ops.o qib_fs.o qib_init.o qib_intr.o qib_keys.o \ + qib_mad.o qib_mmap.o qib_mr.o qib_pcie.o qib_pio_copy.o \ + qib_qp.o qib_qsfp.o qib_rc.o qib_ruc.o qib_sdma.o qib_srq.o \ + qib_sysfs.o qib_twsi.o qib_tx.o qib_uc.o qib_ud.o \ + qib_user_pages.o qib_user_sdma.o qib_verbs_mcast.o qib_iba7220.o \ + qib_sd7220.o qib_iba7322.o qib_verbs.o + +# 6120 has no fallback if no MSI interrupts, others can do INTx +ib_qib-$(CONFIG_PCI_MSI) += qib_iba6120.o + +ib_qib-$(CONFIG_X86_64) += qib_wc_x86_64.o +ib_qib-$(CONFIG_PPC64) += qib_wc_ppc64.o +ib_qib-$(CONFIG_DEBUG_FS) += qib_debugfs.o diff --git a/drivers/infiniband/hw/qib/qib.h b/drivers/infiniband/hw/qib/qib.h new file mode 100644 index 0000000..c00ae09 --- /dev/null +++ b/drivers/infiniband/hw/qib/qib.h @@ -0,0 +1,1548 @@ +#ifndef _QIB_KERNEL_H +#define _QIB_KERNEL_H +/* + * Copyright (c) 2012, 2013 Intel Corporation. All rights reserved. + * Copyright (c) 2006 - 2012 QLogic Corporation. All rights reserved. + * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/* + * This header file is the base header file for qlogic_ib kernel code + * qib_user.h serves a similar purpose for user code. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "qib_common.h" +#include "qib_verbs.h" + +/* only s/w major version of QLogic_IB we can handle */ +#define QIB_CHIP_VERS_MAJ 2U + +/* don't care about this except printing */ +#define QIB_CHIP_VERS_MIN 0U + +/* The Organization Unique Identifier (Mfg code), and its position in GUID */ +#define QIB_OUI 0x001175 +#define QIB_OUI_LSB 40 + +/* + * per driver stats, either not device nor port-specific, or + * summed over all of the devices and ports. + * They are described by name via ipathfs filesystem, so layout + * and number of elements can change without breaking compatibility. + * If members are added or deleted qib_statnames[] in qib_fs.c must + * change to match. + */ +struct qlogic_ib_stats { + __u64 sps_ints; /* number of interrupts handled */ + __u64 sps_errints; /* number of error interrupts */ + __u64 sps_txerrs; /* tx-related packet errors */ + __u64 sps_rcverrs; /* non-crc rcv packet errors */ + __u64 sps_hwerrs; /* hardware errors reported (parity, etc.) */ + __u64 sps_nopiobufs; /* no pio bufs avail from kernel */ + __u64 sps_ctxts; /* number of contexts currently open */ + __u64 sps_lenerrs; /* number of kernel packets where RHF != LRH len */ + __u64 sps_buffull; + __u64 sps_hdrfull; +}; + +extern struct qlogic_ib_stats qib_stats; +extern const struct pci_error_handlers qib_pci_err_handler; + +#define QIB_CHIP_SWVERSION QIB_CHIP_VERS_MAJ +/* + * First-cut critierion for "device is active" is + * two thousand dwords combined Tx, Rx traffic per + * 5-second interval. SMA packets are 64 dwords, + * and occur "a few per second", presumably each way. + */ +#define QIB_TRAFFIC_ACTIVE_THRESHOLD (2000) + +/* + * Struct used to indicate which errors are logged in each of the + * error-counters that are logged to EEPROM. A counter is incremented + * _once_ (saturating at 255) for each event with any bits set in + * the error or hwerror register masks below. + */ +#define QIB_EEP_LOG_CNT (4) +struct qib_eep_log_mask { + u64 errs_to_log; + u64 hwerrs_to_log; +}; + +/* + * Below contains all data related to a single context (formerly called port). + */ + +#ifdef CONFIG_DEBUG_FS +struct qib_opcode_stats_perctx; +#endif + +struct qib_ctxtdata { + void **rcvegrbuf; + dma_addr_t *rcvegrbuf_phys; + /* rcvhdrq base, needs mmap before useful */ + void *rcvhdrq; + /* kernel virtual address where hdrqtail is updated */ + void *rcvhdrtail_kvaddr; + /* + * temp buffer for expected send setup, allocated at open, instead + * of each setup call + */ + void *tid_pg_list; + /* + * Shared page for kernel to signal user processes that send buffers + * need disarming. The process should call QIB_CMD_DISARM_BUFS + * or QIB_CMD_ACK_EVENT with IPATH_EVENT_DISARM_BUFS set. + */ + unsigned long *user_event_mask; + /* when waiting for rcv or pioavail */ + wait_queue_head_t wait; + /* + * rcvegr bufs base, physical, must fit + * in 44 bits so 32 bit programs mmap64 44 bit works) + */ + dma_addr_t rcvegr_phys; + /* mmap of hdrq, must fit in 44 bits */ + dma_addr_t rcvhdrq_phys; + dma_addr_t rcvhdrqtailaddr_phys; + + /* + * number of opens (including slave sub-contexts) on this instance + * (ignoring forks, dup, etc. for now) + */ + int cnt; + /* + * how much space to leave at start of eager TID entries for + * protocol use, on each TID + */ + /* instead of calculating it */ + unsigned ctxt; + /* local node of context */ + int node_id; + /* non-zero if ctxt is being shared. */ + u16 subctxt_cnt; + /* non-zero if ctxt is being shared. */ + u16 subctxt_id; + /* number of eager TID entries. */ + u16 rcvegrcnt; + /* index of first eager TID entry. */ + u16 rcvegr_tid_base; + /* number of pio bufs for this ctxt (all procs, if shared) */ + u32 piocnt; + /* first pio buffer for this ctxt */ + u32 pio_base; + /* chip offset of PIO buffers for this ctxt */ + u32 piobufs; + /* how many alloc_pages() chunks in rcvegrbuf_pages */ + u32 rcvegrbuf_chunks; + /* how many egrbufs per chunk */ + u16 rcvegrbufs_perchunk; + /* ilog2 of above */ + u16 rcvegrbufs_perchunk_shift; + /* order for rcvegrbuf_pages */ + size_t rcvegrbuf_size; + /* rcvhdrq size (for freeing) */ + size_t rcvhdrq_size; + /* per-context flags for fileops/intr communication */ + unsigned long flag; + /* next expected TID to check when looking for free */ + u32 tidcursor; + /* WAIT_RCV that timed out, no interrupt */ + u32 rcvwait_to; + /* WAIT_PIO that timed out, no interrupt */ + u32 piowait_to; + /* WAIT_RCV already happened, no wait */ + u32 rcvnowait; + /* WAIT_PIO already happened, no wait */ + u32 pionowait; + /* total number of polled urgent packets */ + u32 urgent; + /* saved total number of polled urgent packets for poll edge trigger */ + u32 urgent_poll; + /* pid of process using this ctxt */ + pid_t pid; + pid_t subpid[QLOGIC_IB_MAX_SUBCTXT]; + /* same size as task_struct .comm[], command that opened context */ + char comm[16]; + /* pkeys set by this use of this ctxt */ + u16 pkeys[4]; + /* so file ops can get at unit */ + struct qib_devdata *dd; + /* so funcs that need physical port can get it easily */ + struct qib_pportdata *ppd; + /* A page of memory for rcvhdrhead, rcvegrhead, rcvegrtail * N */ + void *subctxt_uregbase; + /* An array of pages for the eager receive buffers * N */ + void *subctxt_rcvegrbuf; + /* An array of pages for the eager header queue entries * N */ + void *subctxt_rcvhdr_base; + /* The version of the library which opened this ctxt */ + u32 userversion; + /* Bitmask of active slaves */ + u32 active_slaves; + /* Type of packets or conditions we want to poll for */ + u16 poll_type; + /* receive packet sequence counter */ + u8 seq_cnt; + u8 redirect_seq_cnt; + /* ctxt rcvhdrq head offset */ + u32 head; + /* lookaside fields */ + struct qib_qp *lookaside_qp; + u32 lookaside_qpn; + /* QPs waiting for context processing */ + struct list_head qp_wait_list; +#ifdef CONFIG_DEBUG_FS + /* verbs stats per CTX */ + struct qib_opcode_stats_perctx *opstats; +#endif +}; + +struct qib_sge_state; + +struct qib_sdma_txreq { + int flags; + int sg_count; + dma_addr_t addr; + void (*callback)(struct qib_sdma_txreq *, int); + u16 start_idx; /* sdma private */ + u16 next_descq_idx; /* sdma private */ + struct list_head list; /* sdma private */ +}; + +struct qib_sdma_desc { + __le64 qw[2]; +}; + +struct qib_verbs_txreq { + struct qib_sdma_txreq txreq; + struct qib_qp *qp; + struct qib_swqe *wqe; + u32 dwords; + u16 hdr_dwords; + u16 hdr_inx; + struct qib_pio_header *align_buf; + struct qib_mregion *mr; + struct qib_sge_state *ss; +}; + +#define QIB_SDMA_TXREQ_F_USELARGEBUF 0x1 +#define QIB_SDMA_TXREQ_F_HEADTOHOST 0x2 +#define QIB_SDMA_TXREQ_F_INTREQ 0x4 +#define QIB_SDMA_TXREQ_F_FREEBUF 0x8 +#define QIB_SDMA_TXREQ_F_FREEDESC 0x10 + +#define QIB_SDMA_TXREQ_S_OK 0 +#define QIB_SDMA_TXREQ_S_SENDERROR 1 +#define QIB_SDMA_TXREQ_S_ABORTED 2 +#define QIB_SDMA_TXREQ_S_SHUTDOWN 3 + +/* + * Get/Set IB link-level config parameters for f_get/set_ib_cfg() + * Mostly for MADs that set or query link parameters, also ipath + * config interfaces + */ +#define QIB_IB_CFG_LIDLMC 0 /* LID (LS16b) and Mask (MS16b) */ +#define QIB_IB_CFG_LWID_ENB 2 /* allowed Link-width */ +#define QIB_IB_CFG_LWID 3 /* currently active Link-width */ +#define QIB_IB_CFG_SPD_ENB 4 /* allowed Link speeds */ +#define QIB_IB_CFG_SPD 5 /* current Link spd */ +#define QIB_IB_CFG_RXPOL_ENB 6 /* Auto-RX-polarity enable */ +#define QIB_IB_CFG_LREV_ENB 7 /* Auto-Lane-reversal enable */ +#define QIB_IB_CFG_LINKLATENCY 8 /* Link Latency (IB1.2 only) */ +#define QIB_IB_CFG_HRTBT 9 /* IB heartbeat off/enable/auto; DDR/QDR only */ +#define QIB_IB_CFG_OP_VLS 10 /* operational VLs */ +#define QIB_IB_CFG_VL_HIGH_CAP 11 /* num of VL high priority weights */ +#define QIB_IB_CFG_VL_LOW_CAP 12 /* num of VL low priority weights */ +#define QIB_IB_CFG_OVERRUN_THRESH 13 /* IB overrun threshold */ +#define QIB_IB_CFG_PHYERR_THRESH 14 /* IB PHY error threshold */ +#define QIB_IB_CFG_LINKDEFAULT 15 /* IB link default (sleep/poll) */ +#define QIB_IB_CFG_PKEYS 16 /* update partition keys */ +#define QIB_IB_CFG_MTU 17 /* update MTU in IBC */ +#define QIB_IB_CFG_LSTATE 18 /* update linkcmd and linkinitcmd in IBC */ +#define QIB_IB_CFG_VL_HIGH_LIMIT 19 +#define QIB_IB_CFG_PMA_TICKS 20 /* PMA sample tick resolution */ +#define QIB_IB_CFG_PORT 21 /* switch port we are connected to */ + +/* + * for CFG_LSTATE: LINKCMD in upper 16 bits, LINKINITCMD in lower 16 + * IB_LINKINITCMD_POLL and SLEEP are also used as set/get values for + * QIB_IB_CFG_LINKDEFAULT cmd + */ +#define IB_LINKCMD_DOWN (0 << 16) +#define IB_LINKCMD_ARMED (1 << 16) +#define IB_LINKCMD_ACTIVE (2 << 16) +#define IB_LINKINITCMD_NOP 0 +#define IB_LINKINITCMD_POLL 1 +#define IB_LINKINITCMD_SLEEP 2 +#define IB_LINKINITCMD_DISABLE 3 + +/* + * valid states passed to qib_set_linkstate() user call + */ +#define QIB_IB_LINKDOWN 0 +#define QIB_IB_LINKARM 1 +#define QIB_IB_LINKACTIVE 2 +#define QIB_IB_LINKDOWN_ONLY 3 +#define QIB_IB_LINKDOWN_SLEEP 4 +#define QIB_IB_LINKDOWN_DISABLE 5 + +/* + * These 7 values (SDR, DDR, and QDR may be ORed for auto-speed + * negotiation) are used for the 3rd argument to path_f_set_ib_cfg + * with cmd QIB_IB_CFG_SPD_ENB, by direct calls or via sysfs. They + * are also the the possible values for qib_link_speed_enabled and active + * The values were chosen to match values used within the IB spec. + */ +#define QIB_IB_SDR 1 +#define QIB_IB_DDR 2 +#define QIB_IB_QDR 4 + +#define QIB_DEFAULT_MTU 4096 + +/* max number of IB ports supported per HCA */ +#define QIB_MAX_IB_PORTS 2 + +/* + * Possible IB config parameters for f_get/set_ib_table() + */ +#define QIB_IB_TBL_VL_HIGH_ARB 1 /* Get/set VL high priority weights */ +#define QIB_IB_TBL_VL_LOW_ARB 2 /* Get/set VL low priority weights */ + +/* + * Possible "operations" for f_rcvctrl(ppd, op, ctxt) + * these are bits so they can be combined, e.g. + * QIB_RCVCTRL_INTRAVAIL_ENB | QIB_RCVCTRL_CTXT_ENB + */ +#define QIB_RCVCTRL_TAILUPD_ENB 0x01 +#define QIB_RCVCTRL_TAILUPD_DIS 0x02 +#define QIB_RCVCTRL_CTXT_ENB 0x04 +#define QIB_RCVCTRL_CTXT_DIS 0x08 +#define QIB_RCVCTRL_INTRAVAIL_ENB 0x10 +#define QIB_RCVCTRL_INTRAVAIL_DIS 0x20 +#define QIB_RCVCTRL_PKEY_ENB 0x40 /* Note, default is enabled */ +#define QIB_RCVCTRL_PKEY_DIS 0x80 +#define QIB_RCVCTRL_BP_ENB 0x0100 +#define QIB_RCVCTRL_BP_DIS 0x0200 +#define QIB_RCVCTRL_TIDFLOW_ENB 0x0400 +#define QIB_RCVCTRL_TIDFLOW_DIS 0x0800 + +/* + * Possible "operations" for f_sendctrl(ppd, op, var) + * these are bits so they can be combined, e.g. + * QIB_SENDCTRL_BUFAVAIL_ENB | QIB_SENDCTRL_ENB + * Some operations (e.g. DISARM, ABORT) are known to + * be "one-shot", so do not modify shadow. + */ +#define QIB_SENDCTRL_DISARM (0x1000) +#define QIB_SENDCTRL_DISARM_BUF(bufn) ((bufn) | QIB_SENDCTRL_DISARM) + /* available (0x2000) */ +#define QIB_SENDCTRL_AVAIL_DIS (0x4000) +#define QIB_SENDCTRL_AVAIL_ENB (0x8000) +#define QIB_SENDCTRL_AVAIL_BLIP (0x10000) +#define QIB_SENDCTRL_SEND_DIS (0x20000) +#define QIB_SENDCTRL_SEND_ENB (0x40000) +#define QIB_SENDCTRL_FLUSH (0x80000) +#define QIB_SENDCTRL_CLEAR (0x100000) +#define QIB_SENDCTRL_DISARM_ALL (0x200000) + +/* + * These are the generic indices for requesting per-port + * counter values via the f_portcntr function. They + * are always returned as 64 bit values, although most + * are 32 bit counters. + */ +/* send-related counters */ +#define QIBPORTCNTR_PKTSEND 0U +#define QIBPORTCNTR_WORDSEND 1U +#define QIBPORTCNTR_PSXMITDATA 2U +#define QIBPORTCNTR_PSXMITPKTS 3U +#define QIBPORTCNTR_PSXMITWAIT 4U +#define QIBPORTCNTR_SENDSTALL 5U +/* receive-related counters */ +#define QIBPORTCNTR_PKTRCV 6U +#define QIBPORTCNTR_PSRCVDATA 7U +#define QIBPORTCNTR_PSRCVPKTS 8U +#define QIBPORTCNTR_RCVEBP 9U +#define QIBPORTCNTR_RCVOVFL 10U +#define QIBPORTCNTR_WORDRCV 11U +/* IB link related error counters */ +#define QIBPORTCNTR_RXLOCALPHYERR 12U +#define QIBPORTCNTR_RXVLERR 13U +#define QIBPORTCNTR_ERRICRC 14U +#define QIBPORTCNTR_ERRVCRC 15U +#define QIBPORTCNTR_ERRLPCRC 16U +#define QIBPORTCNTR_BADFORMAT 17U +#define QIBPORTCNTR_ERR_RLEN 18U +#define QIBPORTCNTR_IBSYMBOLERR 19U +#define QIBPORTCNTR_INVALIDRLEN 20U +#define QIBPORTCNTR_UNSUPVL 21U +#define QIBPORTCNTR_EXCESSBUFOVFL 22U +#define QIBPORTCNTR_ERRLINK 23U +#define QIBPORTCNTR_IBLINKDOWN 24U +#define QIBPORTCNTR_IBLINKERRRECOV 25U +#define QIBPORTCNTR_LLI 26U +/* other error counters */ +#define QIBPORTCNTR_RXDROPPKT 27U +#define QIBPORTCNTR_VL15PKTDROP 28U +#define QIBPORTCNTR_ERRPKEY 29U +#define QIBPORTCNTR_KHDROVFL 30U +/* sampling counters (these are actually control registers) */ +#define QIBPORTCNTR_PSINTERVAL 31U +#define QIBPORTCNTR_PSSTART 32U +#define QIBPORTCNTR_PSSTAT 33U + +/* how often we check for packet activity for "power on hours (in seconds) */ +#define ACTIVITY_TIMER 5 + +#define MAX_NAME_SIZE 64 + +#ifdef CONFIG_INFINIBAND_QIB_DCA +struct qib_irq_notify; +#endif + +struct qib_msix_entry { + struct msix_entry msix; + void *arg; +#ifdef CONFIG_INFINIBAND_QIB_DCA + int dca; + int rcv; + struct qib_irq_notify *notifier; +#endif + char name[MAX_NAME_SIZE]; + cpumask_var_t mask; +}; + +/* Below is an opaque struct. Each chip (device) can maintain + * private data needed for its operation, but not germane to the + * rest of the driver. For convenience, we define another that + * is chip-specific, per-port + */ +struct qib_chip_specific; +struct qib_chipport_specific; + +enum qib_sdma_states { + qib_sdma_state_s00_hw_down, + qib_sdma_state_s10_hw_start_up_wait, + qib_sdma_state_s20_idle, + qib_sdma_state_s30_sw_clean_up_wait, + qib_sdma_state_s40_hw_clean_up_wait, + qib_sdma_state_s50_hw_halt_wait, + qib_sdma_state_s99_running, +}; + +enum qib_sdma_events { + qib_sdma_event_e00_go_hw_down, + qib_sdma_event_e10_go_hw_start, + qib_sdma_event_e20_hw_started, + qib_sdma_event_e30_go_running, + qib_sdma_event_e40_sw_cleaned, + qib_sdma_event_e50_hw_cleaned, + qib_sdma_event_e60_hw_halted, + qib_sdma_event_e70_go_idle, + qib_sdma_event_e7220_err_halted, + qib_sdma_event_e7322_err_halted, + qib_sdma_event_e90_timer_tick, +}; + +extern char *qib_sdma_state_names[]; +extern char *qib_sdma_event_names[]; + +struct sdma_set_state_action { + unsigned op_enable:1; + unsigned op_intenable:1; + unsigned op_halt:1; + unsigned op_drain:1; + unsigned go_s99_running_tofalse:1; + unsigned go_s99_running_totrue:1; +}; + +struct qib_sdma_state { + struct kref kref; + struct completion comp; + enum qib_sdma_states current_state; + struct sdma_set_state_action *set_state_action; + unsigned current_op; + unsigned go_s99_running; + unsigned first_sendbuf; + unsigned last_sendbuf; /* really last +1 */ + /* debugging/devel */ + enum qib_sdma_states previous_state; + unsigned previous_op; + enum qib_sdma_events last_event; +}; + +struct xmit_wait { + struct timer_list timer; + u64 counter; + u8 flags; + struct cache { + u64 psxmitdata; + u64 psrcvdata; + u64 psxmitpkts; + u64 psrcvpkts; + u64 psxmitwait; + } counter_cache; +}; + +/* + * The structure below encapsulates data relevant to a physical IB Port. + * Current chips support only one such port, but the separation + * clarifies things a bit. Note that to conform to IB conventions, + * port-numbers are one-based. The first or only port is port1. + */ +struct qib_pportdata { + struct qib_ibport ibport_data; + + struct qib_devdata *dd; + struct qib_chippport_specific *cpspec; /* chip-specific per-port */ + struct kobject pport_kobj; + struct kobject pport_cc_kobj; + struct kobject sl2vl_kobj; + struct kobject diagc_kobj; + + /* GUID for this interface, in network order */ + __be64 guid; + + /* QIB_POLL, etc. link-state specific flags, per port */ + u32 lflags; + /* qib_lflags driver is waiting for */ + u32 state_wanted; + spinlock_t lflags_lock; + + /* ref count for each pkey */ + atomic_t pkeyrefs[4]; + + /* + * this address is mapped readonly into user processes so they can + * get status cheaply, whenever they want. One qword of status per port + */ + u64 *statusp; + + /* SendDMA related entries */ + + /* read mostly */ + struct qib_sdma_desc *sdma_descq; + struct workqueue_struct *qib_wq; + struct qib_sdma_state sdma_state; + dma_addr_t sdma_descq_phys; + volatile __le64 *sdma_head_dma; /* DMA'ed by chip */ + dma_addr_t sdma_head_phys; + u16 sdma_descq_cnt; + + /* read/write using lock */ + spinlock_t sdma_lock ____cacheline_aligned_in_smp; + struct list_head sdma_activelist; + struct list_head sdma_userpending; + u64 sdma_descq_added; + u64 sdma_descq_removed; + u16 sdma_descq_tail; + u16 sdma_descq_head; + u8 sdma_generation; + u8 sdma_intrequest; + + struct tasklet_struct sdma_sw_clean_up_task + ____cacheline_aligned_in_smp; + + wait_queue_head_t state_wait; /* for state_wanted */ + + /* HoL blocking for SMP replies */ + unsigned hol_state; + struct timer_list hol_timer; + + /* + * Shadow copies of registers; size indicates read access size. + * Most of them are readonly, but some are write-only register, + * where we manipulate the bits in the shadow copy, and then write + * the shadow copy to qlogic_ib. + * + * We deliberately make most of these 32 bits, since they have + * restricted range. For any that we read, we won't to generate 32 + * bit accesses, since Opteron will generate 2 separate 32 bit HT + * transactions for a 64 bit read, and we want to avoid unnecessary + * bus transactions. + */ + + /* This is the 64 bit group */ + /* last ibcstatus. opaque outside chip-specific code */ + u64 lastibcstat; + + /* these are the "32 bit" regs */ + + /* + * the following two are 32-bit bitmasks, but {test,clear,set}_bit + * all expect bit fields to be "unsigned long" + */ + unsigned long p_rcvctrl; /* shadow per-port rcvctrl */ + unsigned long p_sendctrl; /* shadow per-port sendctrl */ + + u32 ibmtu; /* The MTU programmed for this unit */ + /* + * Current max size IB packet (in bytes) including IB headers, that + * we can send. Changes when ibmtu changes. + */ + u32 ibmaxlen; + /* + * ibmaxlen at init time, limited by chip and by receive buffer + * size. Not changed after init. + */ + u32 init_ibmaxlen; + /* LID programmed for this instance */ + u16 lid; + /* list of pkeys programmed; 0 if not set */ + u16 pkeys[4]; + /* LID mask control */ + u8 lmc; + u8 link_width_supported; + u8 link_speed_supported; + u8 link_width_enabled; + u8 link_speed_enabled; + u8 link_width_active; + u8 link_speed_active; + u8 vls_supported; + u8 vls_operational; + /* Rx Polarity inversion (compensate for ~tx on partner) */ + u8 rx_pol_inv; + + u8 hw_pidx; /* physical port index */ + u8 port; /* IB port number and index into dd->pports - 1 */ + + u8 delay_mult; + + /* used to override LED behavior */ + u8 led_override; /* Substituted for normal value, if non-zero */ + u16 led_override_timeoff; /* delta to next timer event */ + u8 led_override_vals[2]; /* Alternates per blink-frame */ + u8 led_override_phase; /* Just counts, LSB picks from vals[] */ + atomic_t led_override_timer_active; + /* Used to flash LEDs in override mode */ + struct timer_list led_override_timer; + struct xmit_wait cong_stats; + struct timer_list symerr_clear_timer; + + /* Synchronize access between driver writes and sysfs reads */ + spinlock_t cc_shadow_lock + ____cacheline_aligned_in_smp; + + /* Shadow copy of the congestion control table */ + struct cc_table_shadow *ccti_entries_shadow; + + /* Shadow copy of the congestion control entries */ + struct ib_cc_congestion_setting_attr_shadow *congestion_entries_shadow; + + /* List of congestion control table entries */ + struct ib_cc_table_entry_shadow *ccti_entries; + + /* 16 congestion entries with each entry corresponding to a SL */ + struct ib_cc_congestion_entry_shadow *congestion_entries; + + /* Maximum number of congestion control entries that the agent expects + * the manager to send. + */ + u16 cc_supported_table_entries; + + /* Total number of congestion control table entries */ + u16 total_cct_entry; + + /* Bit map identifying service level */ + u16 cc_sl_control_map; + + /* maximum congestion control table index */ + u16 ccti_limit; + + /* CA's max number of 64 entry units in the congestion control table */ + u8 cc_max_table_entries; +}; + +/* Observers. Not to be taken lightly, possibly not to ship. */ +/* + * If a diag read or write is to (bottom <= offset <= top), + * the "hoook" is called, allowing, e.g. shadows to be + * updated in sync with the driver. struct diag_observer + * is the "visible" part. + */ +struct diag_observer; + +typedef int (*diag_hook) (struct qib_devdata *dd, + const struct diag_observer *op, + u32 offs, u64 *data, u64 mask, int only_32); + +struct diag_observer { + diag_hook hook; + u32 bottom; + u32 top; +}; + +extern int qib_register_observer(struct qib_devdata *dd, + const struct diag_observer *op); + +/* Only declared here, not defined. Private to diags */ +struct diag_observer_list_elt; + +/* device data struct now contains only "general per-device" info. + * fields related to a physical IB port are in a qib_pportdata struct, + * described above) while fields only used by a particular chip-type are in + * a qib_chipdata struct, whose contents are opaque to this file. + */ +struct qib_devdata { + struct qib_ibdev verbs_dev; /* must be first */ + struct list_head list; + /* pointers to related structs for this device */ + /* pci access data structure */ + struct pci_dev *pcidev; + struct cdev *user_cdev; + struct cdev *diag_cdev; + struct device *user_device; + struct device *diag_device; + + /* mem-mapped pointer to base of chip regs */ + u64 __iomem *kregbase; + /* end of mem-mapped chip space excluding sendbuf and user regs */ + u64 __iomem *kregend; + /* physical address of chip for io_remap, etc. */ + resource_size_t physaddr; + /* qib_cfgctxts pointers */ + struct qib_ctxtdata **rcd; /* Receive Context Data */ + + /* qib_pportdata, points to array of (physical) port-specific + * data structs, indexed by pidx (0..n-1) + */ + struct qib_pportdata *pport; + struct qib_chip_specific *cspec; /* chip-specific */ + + /* kvirt address of 1st 2k pio buffer */ + void __iomem *pio2kbase; + /* kvirt address of 1st 4k pio buffer */ + void __iomem *pio4kbase; + /* mem-mapped pointer to base of PIO buffers (if using WC PAT) */ + void __iomem *piobase; + /* mem-mapped pointer to base of user chip regs (if using WC PAT) */ + u64 __iomem *userbase; + void __iomem *piovl15base; /* base of VL15 buffers, if not WC */ + /* + * points to area where PIOavail registers will be DMA'ed. + * Has to be on a page of it's own, because the page will be + * mapped into user program space. This copy is *ONLY* ever + * written by DMA, not by the driver! Need a copy per device + * when we get to multiple devices + */ + volatile __le64 *pioavailregs_dma; /* DMA'ed by chip */ + /* physical address where updates occur */ + dma_addr_t pioavailregs_phys; + + /* device-specific implementations of functions needed by + * common code. Contrary to previous consensus, we can't + * really just point to a device-specific table, because we + * may need to "bend", e.g. *_f_put_tid + */ + /* fallback to alternate interrupt type if possible */ + int (*f_intr_fallback)(struct qib_devdata *); + /* hard reset chip */ + int (*f_reset)(struct qib_devdata *); + void (*f_quiet_serdes)(struct qib_pportdata *); + int (*f_bringup_serdes)(struct qib_pportdata *); + int (*f_early_init)(struct qib_devdata *); + void (*f_clear_tids)(struct qib_devdata *, struct qib_ctxtdata *); + void (*f_put_tid)(struct qib_devdata *, u64 __iomem*, + u32, unsigned long); + void (*f_cleanup)(struct qib_devdata *); + void (*f_setextled)(struct qib_pportdata *, u32); + /* fill out chip-specific fields */ + int (*f_get_base_info)(struct qib_ctxtdata *, struct qib_base_info *); + /* free irq */ + void (*f_free_irq)(struct qib_devdata *); + struct qib_message_header *(*f_get_msgheader) + (struct qib_devdata *, __le32 *); + void (*f_config_ctxts)(struct qib_devdata *); + int (*f_get_ib_cfg)(struct qib_pportdata *, int); + int (*f_set_ib_cfg)(struct qib_pportdata *, int, u32); + int (*f_set_ib_loopback)(struct qib_pportdata *, const char *); + int (*f_get_ib_table)(struct qib_pportdata *, int, void *); + int (*f_set_ib_table)(struct qib_pportdata *, int, void *); + u32 (*f_iblink_state)(u64); + u8 (*f_ibphys_portstate)(u64); + void (*f_xgxs_reset)(struct qib_pportdata *); + /* per chip actions needed for IB Link up/down changes */ + int (*f_ib_updown)(struct qib_pportdata *, int, u64); + u32 __iomem *(*f_getsendbuf)(struct qib_pportdata *, u64, u32 *); + /* Read/modify/write of GPIO pins (potentially chip-specific */ + int (*f_gpio_mod)(struct qib_devdata *dd, u32 out, u32 dir, + u32 mask); + /* Enable writes to config EEPROM (if supported) */ + int (*f_eeprom_wen)(struct qib_devdata *dd, int wen); + /* + * modify rcvctrl shadow[s] and write to appropriate chip-regs. + * see above QIB_RCVCTRL_xxx_ENB/DIS for operations. + * (ctxt == -1) means "all contexts", only meaningful for + * clearing. Could remove if chip_spec shutdown properly done. + */ + void (*f_rcvctrl)(struct qib_pportdata *, unsigned int op, + int ctxt); + /* Read/modify/write sendctrl appropriately for op and port. */ + void (*f_sendctrl)(struct qib_pportdata *, u32 op); + void (*f_set_intr_state)(struct qib_devdata *, u32); + void (*f_set_armlaunch)(struct qib_devdata *, u32); + void (*f_wantpiobuf_intr)(struct qib_devdata *, u32); + int (*f_late_initreg)(struct qib_devdata *); + int (*f_init_sdma_regs)(struct qib_pportdata *); + u16 (*f_sdma_gethead)(struct qib_pportdata *); + int (*f_sdma_busy)(struct qib_pportdata *); + void (*f_sdma_update_tail)(struct qib_pportdata *, u16); + void (*f_sdma_set_desc_cnt)(struct qib_pportdata *, unsigned); + void (*f_sdma_sendctrl)(struct qib_pportdata *, unsigned); + void (*f_sdma_hw_clean_up)(struct qib_pportdata *); + void (*f_sdma_hw_start_up)(struct qib_pportdata *); + void (*f_sdma_init_early)(struct qib_pportdata *); + void (*f_set_cntr_sample)(struct qib_pportdata *, u32, u32); + void (*f_update_usrhead)(struct qib_ctxtdata *, u64, u32, u32, u32); + u32 (*f_hdrqempty)(struct qib_ctxtdata *); + u64 (*f_portcntr)(struct qib_pportdata *, u32); + u32 (*f_read_cntrs)(struct qib_devdata *, loff_t, char **, + u64 **); + u32 (*f_read_portcntrs)(struct qib_devdata *, loff_t, u32, + char **, u64 **); + u32 (*f_setpbc_control)(struct qib_pportdata *, u32, u8, u8); + void (*f_initvl15_bufs)(struct qib_devdata *); + void (*f_init_ctxt)(struct qib_ctxtdata *); + void (*f_txchk_change)(struct qib_devdata *, u32, u32, u32, + struct qib_ctxtdata *); + void (*f_writescratch)(struct qib_devdata *, u32); + int (*f_tempsense_rd)(struct qib_devdata *, int regnum); +#ifdef CONFIG_INFINIBAND_QIB_DCA + int (*f_notify_dca)(struct qib_devdata *, unsigned long event); +#endif + + char *boardname; /* human readable board info */ + + /* template for writing TIDs */ + u64 tidtemplate; + /* value to write to free TIDs */ + u64 tidinvalid; + + /* number of registers used for pioavail */ + u32 pioavregs; + /* device (not port) flags, basically device capabilities */ + u32 flags; + /* last buffer for user use */ + u32 lastctxt_piobuf; + + /* reset value */ + u64 z_int_counter; + /* percpu intcounter */ + u64 __percpu *int_counter; + + /* pio bufs allocated per ctxt */ + u32 pbufsctxt; + /* if remainder on bufs/ctxt, ctxts < extrabuf get 1 extra */ + u32 ctxts_extrabuf; + /* + * number of ctxts configured as max; zero is set to number chip + * supports, less gives more pio bufs/ctxt, etc. + */ + u32 cfgctxts; + /* + * number of ctxts available for PSM open + */ + u32 freectxts; + + /* + * hint that we should update pioavailshadow before + * looking for a PIO buffer + */ + u32 upd_pio_shadow; + + /* internal debugging stats */ + u32 maxpkts_call; + u32 avgpkts_call; + u64 nopiobufs; + + /* PCI Vendor ID (here for NodeInfo) */ + u16 vendorid; + /* PCI Device ID (here for NodeInfo) */ + u16 deviceid; + /* for write combining settings */ + unsigned long wc_cookie; + unsigned long wc_base; + unsigned long wc_len; + + /* shadow copy of struct page *'s for exp tid pages */ + struct page **pageshadow; + /* shadow copy of dma handles for exp tid pages */ + dma_addr_t *physshadow; + u64 __iomem *egrtidbase; + spinlock_t sendctrl_lock; /* protect changes to sendctrl shadow */ + /* around rcd and (user ctxts) ctxt_cnt use (intr vs free) */ + spinlock_t uctxt_lock; /* rcd and user context changes */ + /* + * per unit status, see also portdata statusp + * mapped readonly into user processes so they can get unit and + * IB link status cheaply + */ + u64 *devstatusp; + char *freezemsg; /* freeze msg if hw error put chip in freeze */ + u32 freezelen; /* max length of freezemsg */ + /* timer used to prevent stats overflow, error throttling, etc. */ + struct timer_list stats_timer; + + /* timer to verify interrupts work, and fallback if possible */ + struct timer_list intrchk_timer; + unsigned long ureg_align; /* user register alignment */ + + /* + * Protects pioavailshadow, pioavailkernel, pio_need_disarm, and + * pio_writing. + */ + spinlock_t pioavail_lock; + /* + * index of last buffer to optimize search for next + */ + u32 last_pio; + /* + * min kernel pio buffer to optimize search + */ + u32 min_kernel_pio; + /* + * Shadow copies of registers; size indicates read access size. + * Most of them are readonly, but some are write-only register, + * where we manipulate the bits in the shadow copy, and then write + * the shadow copy to qlogic_ib. + * + * We deliberately make most of these 32 bits, since they have + * restricted range. For any that we read, we won't to generate 32 + * bit accesses, since Opteron will generate 2 separate 32 bit HT + * transactions for a 64 bit read, and we want to avoid unnecessary + * bus transactions. + */ + + /* This is the 64 bit group */ + + unsigned long pioavailshadow[6]; + /* bitmap of send buffers available for the kernel to use with PIO. */ + unsigned long pioavailkernel[6]; + /* bitmap of send buffers which need to be disarmed. */ + unsigned long pio_need_disarm[3]; + /* bitmap of send buffers which are being written to. */ + unsigned long pio_writing[3]; + /* kr_revision shadow */ + u64 revision; + /* Base GUID for device (from eeprom, network order) */ + __be64 base_guid; + + /* + * kr_sendpiobufbase value (chip offset of pio buffers), and the + * base of the 2KB buffer s(user processes only use 2K) + */ + u64 piobufbase; + u32 pio2k_bufbase; + + /* these are the "32 bit" regs */ + + /* number of GUIDs in the flash for this interface */ + u32 nguid; + /* + * the following two are 32-bit bitmasks, but {test,clear,set}_bit + * all expect bit fields to be "unsigned long" + */ + unsigned long rcvctrl; /* shadow per device rcvctrl */ + unsigned long sendctrl; /* shadow per device sendctrl */ + + /* value we put in kr_rcvhdrcnt */ + u32 rcvhdrcnt; + /* value we put in kr_rcvhdrsize */ + u32 rcvhdrsize; + /* value we put in kr_rcvhdrentsize */ + u32 rcvhdrentsize; + /* kr_ctxtcnt value */ + u32 ctxtcnt; + /* kr_pagealign value */ + u32 palign; + /* number of "2KB" PIO buffers */ + u32 piobcnt2k; + /* size in bytes of "2KB" PIO buffers */ + u32 piosize2k; + /* max usable size in dwords of a "2KB" PIO buffer before going "4KB" */ + u32 piosize2kmax_dwords; + /* number of "4KB" PIO buffers */ + u32 piobcnt4k; + /* size in bytes of "4KB" PIO buffers */ + u32 piosize4k; + /* kr_rcvegrbase value */ + u32 rcvegrbase; + /* kr_rcvtidbase value */ + u32 rcvtidbase; + /* kr_rcvtidcnt value */ + u32 rcvtidcnt; + /* kr_userregbase */ + u32 uregbase; + /* shadow the control register contents */ + u32 control; + + /* chip address space used by 4k pio buffers */ + u32 align4k; + /* size of each rcvegrbuffer */ + u16 rcvegrbufsize; + /* log2 of above */ + u16 rcvegrbufsize_shift; + /* localbus width (1, 2,4,8,16,32) from config space */ + u32 lbus_width; + /* localbus speed in MHz */ + u32 lbus_speed; + int unit; /* unit # of this chip */ + + /* start of CHIP_SPEC move to chipspec, but need code changes */ + /* low and high portions of MSI capability/vector */ + u32 msi_lo; + /* saved after PCIe init for restore after reset */ + u32 msi_hi; + /* MSI data (vector) saved for restore */ + u16 msi_data; + /* so we can rewrite it after a chip reset */ + u32 pcibar0; + /* so we can rewrite it after a chip reset */ + u32 pcibar1; + u64 rhdrhead_intr_off; + + /* + * ASCII serial number, from flash, large enough for original + * all digit strings, and longer QLogic serial number format + */ + u8 serial[16]; + /* human readable board version */ + u8 boardversion[96]; + u8 lbus_info[32]; /* human readable localbus info */ + /* chip major rev, from qib_revision */ + u8 majrev; + /* chip minor rev, from qib_revision */ + u8 minrev; + + /* Misc small ints */ + /* Number of physical ports available */ + u8 num_pports; + /* Lowest context number which can be used by user processes */ + u8 first_user_ctxt; + u8 n_krcv_queues; + u8 qpn_mask; + u8 skip_kctxt_mask; + + u16 rhf_offset; /* offset of RHF within receive header entry */ + + /* + * GPIO pins for twsi-connected devices, and device code for eeprom + */ + u8 gpio_sda_num; + u8 gpio_scl_num; + u8 twsi_eeprom_dev; + u8 board_atten; + + /* Support (including locks) for EEPROM logging of errors and time */ + /* control access to actual counters, timer */ + spinlock_t eep_st_lock; + /* control high-level access to EEPROM */ + struct mutex eep_lock; + uint64_t traffic_wds; + /* active time is kept in seconds, but logged in hours */ + atomic_t active_time; + /* Below are nominal shadow of EEPROM, new since last EEPROM update */ + uint8_t eep_st_errs[QIB_EEP_LOG_CNT]; + uint8_t eep_st_new_errs[QIB_EEP_LOG_CNT]; + uint16_t eep_hrs; + /* + * masks for which bits of errs, hwerrs that cause + * each of the counters to increment. + */ + struct qib_eep_log_mask eep_st_masks[QIB_EEP_LOG_CNT]; + struct qib_diag_client *diag_client; + spinlock_t qib_diag_trans_lock; /* protect diag observer ops */ + struct diag_observer_list_elt *diag_observer_list; + + u8 psxmitwait_supported; + /* cycle length of PS* counters in HW (in picoseconds) */ + u16 psxmitwait_check_rate; + /* high volume overflow errors defered to tasklet */ + struct tasklet_struct error_tasklet; + /* per device cq worker */ + struct kthread_worker *worker; + + int assigned_node_id; /* NUMA node closest to HCA */ +}; + +/* hol_state values */ +#define QIB_HOL_UP 0 +#define QIB_HOL_INIT 1 + +#define QIB_SDMA_SENDCTRL_OP_ENABLE (1U << 0) +#define QIB_SDMA_SENDCTRL_OP_INTENABLE (1U << 1) +#define QIB_SDMA_SENDCTRL_OP_HALT (1U << 2) +#define QIB_SDMA_SENDCTRL_OP_CLEANUP (1U << 3) +#define QIB_SDMA_SENDCTRL_OP_DRAIN (1U << 4) + +/* operation types for f_txchk_change() */ +#define TXCHK_CHG_TYPE_DIS1 3 +#define TXCHK_CHG_TYPE_ENAB1 2 +#define TXCHK_CHG_TYPE_KERN 1 +#define TXCHK_CHG_TYPE_USER 0 + +#define QIB_CHASE_TIME msecs_to_jiffies(145) +#define QIB_CHASE_DIS_TIME msecs_to_jiffies(160) + +/* Private data for file operations */ +struct qib_filedata { + struct qib_ctxtdata *rcd; + unsigned subctxt; + unsigned tidcursor; + struct qib_user_sdma_queue *pq; + int rec_cpu_num; /* for cpu affinity; -1 if none */ +}; + +extern struct list_head qib_dev_list; +extern spinlock_t qib_devs_lock; +extern struct qib_devdata *qib_lookup(int unit); +extern u32 qib_cpulist_count; +extern unsigned long *qib_cpulist; + +extern unsigned qib_wc_pat; +extern unsigned qib_cc_table_size; +int qib_init(struct qib_devdata *, int); +int init_chip_wc_pat(struct qib_devdata *dd, u32); +int qib_enable_wc(struct qib_devdata *dd); +void qib_disable_wc(struct qib_devdata *dd); +int qib_count_units(int *npresentp, int *nupp); +int qib_count_active_units(void); + +int qib_cdev_init(int minor, const char *name, + const struct file_operations *fops, + struct cdev **cdevp, struct device **devp); +void qib_cdev_cleanup(struct cdev **cdevp, struct device **devp); +int qib_dev_init(void); +void qib_dev_cleanup(void); + +int qib_diag_add(struct qib_devdata *); +void qib_diag_remove(struct qib_devdata *); +void qib_handle_e_ibstatuschanged(struct qib_pportdata *, u64); +void qib_sdma_update_tail(struct qib_pportdata *, u16); /* hold sdma_lock */ + +int qib_decode_err(struct qib_devdata *dd, char *buf, size_t blen, u64 err); +void qib_bad_intrstatus(struct qib_devdata *); +void qib_handle_urcv(struct qib_devdata *, u64); + +/* clean up any per-chip chip-specific stuff */ +void qib_chip_cleanup(struct qib_devdata *); +/* clean up any chip type-specific stuff */ +void qib_chip_done(void); + +/* check to see if we have to force ordering for write combining */ +int qib_unordered_wc(void); +void qib_pio_copy(void __iomem *to, const void *from, size_t count); + +void qib_disarm_piobufs(struct qib_devdata *, unsigned, unsigned); +int qib_disarm_piobufs_ifneeded(struct qib_ctxtdata *); +void qib_disarm_piobufs_set(struct qib_devdata *, unsigned long *, unsigned); +void qib_cancel_sends(struct qib_pportdata *); + +int qib_create_rcvhdrq(struct qib_devdata *, struct qib_ctxtdata *); +int qib_setup_eagerbufs(struct qib_ctxtdata *); +void qib_set_ctxtcnt(struct qib_devdata *); +int qib_create_ctxts(struct qib_devdata *dd); +struct qib_ctxtdata *qib_create_ctxtdata(struct qib_pportdata *, u32, int); +int qib_init_pportdata(struct qib_pportdata *, struct qib_devdata *, u8, u8); +void qib_free_ctxtdata(struct qib_devdata *, struct qib_ctxtdata *); + +u32 qib_kreceive(struct qib_ctxtdata *, u32 *, u32 *); +int qib_reset_device(int); +int qib_wait_linkstate(struct qib_pportdata *, u32, int); +int qib_set_linkstate(struct qib_pportdata *, u8); +int qib_set_mtu(struct qib_pportdata *, u16); +int qib_set_lid(struct qib_pportdata *, u32, u8); +void qib_hol_down(struct qib_pportdata *); +void qib_hol_init(struct qib_pportdata *); +void qib_hol_up(struct qib_pportdata *); +void qib_hol_event(unsigned long); +void qib_disable_after_error(struct qib_devdata *); +int qib_set_uevent_bits(struct qib_pportdata *, const int); + +/* for use in system calls, where we want to know device type, etc. */ +#define ctxt_fp(fp) \ + (((struct qib_filedata *)(fp)->private_data)->rcd) +#define subctxt_fp(fp) \ + (((struct qib_filedata *)(fp)->private_data)->subctxt) +#define tidcursor_fp(fp) \ + (((struct qib_filedata *)(fp)->private_data)->tidcursor) +#define user_sdma_queue_fp(fp) \ + (((struct qib_filedata *)(fp)->private_data)->pq) + +static inline struct qib_devdata *dd_from_ppd(struct qib_pportdata *ppd) +{ + return ppd->dd; +} + +static inline struct qib_devdata *dd_from_dev(struct qib_ibdev *dev) +{ + return container_of(dev, struct qib_devdata, verbs_dev); +} + +static inline struct qib_devdata *dd_from_ibdev(struct ib_device *ibdev) +{ + return dd_from_dev(to_idev(ibdev)); +} + +static inline struct qib_pportdata *ppd_from_ibp(struct qib_ibport *ibp) +{ + return container_of(ibp, struct qib_pportdata, ibport_data); +} + +static inline struct qib_ibport *to_iport(struct ib_device *ibdev, u8 port) +{ + struct qib_devdata *dd = dd_from_ibdev(ibdev); + unsigned pidx = port - 1; /* IB number port from 1, hdw from 0 */ + + WARN_ON(pidx >= dd->num_pports); + return &dd->pport[pidx].ibport_data; +} + +/* + * values for dd->flags (_device_ related flags) and + */ +#define QIB_HAS_LINK_LATENCY 0x1 /* supports link latency (IB 1.2) */ +#define QIB_INITTED 0x2 /* chip and driver up and initted */ +#define QIB_DOING_RESET 0x4 /* in the middle of doing chip reset */ +#define QIB_PRESENT 0x8 /* chip accesses can be done */ +#define QIB_PIO_FLUSH_WC 0x10 /* Needs Write combining flush for PIO */ +#define QIB_HAS_THRESH_UPDATE 0x40 +#define QIB_HAS_SDMA_TIMEOUT 0x80 +#define QIB_USE_SPCL_TRIG 0x100 /* SpecialTrigger launch enabled */ +#define QIB_NODMA_RTAIL 0x200 /* rcvhdrtail register DMA enabled */ +#define QIB_HAS_INTX 0x800 /* Supports INTx interrupts */ +#define QIB_HAS_SEND_DMA 0x1000 /* Supports Send DMA */ +#define QIB_HAS_VLSUPP 0x2000 /* Supports multiple VLs; PBC different */ +#define QIB_HAS_HDRSUPP 0x4000 /* Supports header suppression */ +#define QIB_BADINTR 0x8000 /* severe interrupt problems */ +#define QIB_DCA_ENABLED 0x10000 /* Direct Cache Access enabled */ +#define QIB_HAS_QSFP 0x20000 /* device (card instance) has QSFP */ + +/* + * values for ppd->lflags (_ib_port_ related flags) + */ +#define QIBL_LINKV 0x1 /* IB link state valid */ +#define QIBL_LINKDOWN 0x8 /* IB link is down */ +#define QIBL_LINKINIT 0x10 /* IB link level is up */ +#define QIBL_LINKARMED 0x20 /* IB link is ARMED */ +#define QIBL_LINKACTIVE 0x40 /* IB link is ACTIVE */ +/* leave a gap for more IB-link state */ +#define QIBL_IB_AUTONEG_INPROG 0x1000 /* non-IBTA DDR/QDR neg active */ +#define QIBL_IB_AUTONEG_FAILED 0x2000 /* non-IBTA DDR/QDR neg failed */ +#define QIBL_IB_LINK_DISABLED 0x4000 /* Linkdown-disable forced, + * Do not try to bring up */ +#define QIBL_IB_FORCE_NOTIFY 0x8000 /* force notify on next ib change */ + +/* IB dword length mask in PBC (lower 11 bits); same for all chips */ +#define QIB_PBC_LENGTH_MASK ((1 << 11) - 1) + + +/* ctxt_flag bit offsets */ + /* waiting for a packet to arrive */ +#define QIB_CTXT_WAITING_RCV 2 + /* master has not finished initializing */ +#define QIB_CTXT_MASTER_UNINIT 4 + /* waiting for an urgent packet to arrive */ +#define QIB_CTXT_WAITING_URG 5 + +/* free up any allocated data at closes */ +void qib_free_data(struct qib_ctxtdata *dd); +void qib_chg_pioavailkernel(struct qib_devdata *, unsigned, unsigned, + u32, struct qib_ctxtdata *); +struct qib_devdata *qib_init_iba7322_funcs(struct pci_dev *, + const struct pci_device_id *); +struct qib_devdata *qib_init_iba7220_funcs(struct pci_dev *, + const struct pci_device_id *); +struct qib_devdata *qib_init_iba6120_funcs(struct pci_dev *, + const struct pci_device_id *); +void qib_free_devdata(struct qib_devdata *); +struct qib_devdata *qib_alloc_devdata(struct pci_dev *pdev, size_t extra); + +#define QIB_TWSI_NO_DEV 0xFF +/* Below qib_twsi_ functions must be called with eep_lock held */ +int qib_twsi_reset(struct qib_devdata *dd); +int qib_twsi_blk_rd(struct qib_devdata *dd, int dev, int addr, void *buffer, + int len); +int qib_twsi_blk_wr(struct qib_devdata *dd, int dev, int addr, + const void *buffer, int len); +void qib_get_eeprom_info(struct qib_devdata *); +int qib_update_eeprom_log(struct qib_devdata *dd); +void qib_inc_eeprom_err(struct qib_devdata *dd, u32 eidx, u32 incr); +void qib_dump_lookup_output_queue(struct qib_devdata *); +void qib_force_pio_avail_update(struct qib_devdata *); +void qib_clear_symerror_on_linkup(unsigned long opaque); + +/* + * Set LED override, only the two LSBs have "public" meaning, but + * any non-zero value substitutes them for the Link and LinkTrain + * LED states. + */ +#define QIB_LED_PHYS 1 /* Physical (linktraining) GREEN LED */ +#define QIB_LED_LOG 2 /* Logical (link) YELLOW LED */ +void qib_set_led_override(struct qib_pportdata *ppd, unsigned int val); + +/* send dma routines */ +int qib_setup_sdma(struct qib_pportdata *); +void qib_teardown_sdma(struct qib_pportdata *); +void __qib_sdma_intr(struct qib_pportdata *); +void qib_sdma_intr(struct qib_pportdata *); +void qib_user_sdma_send_desc(struct qib_pportdata *dd, + struct list_head *pktlist); +int qib_sdma_verbs_send(struct qib_pportdata *, struct qib_sge_state *, + u32, struct qib_verbs_txreq *); +/* ppd->sdma_lock should be locked before calling this. */ +int qib_sdma_make_progress(struct qib_pportdata *dd); + +static inline int qib_sdma_empty(const struct qib_pportdata *ppd) +{ + return ppd->sdma_descq_added == ppd->sdma_descq_removed; +} + +/* must be called under qib_sdma_lock */ +static inline u16 qib_sdma_descq_freecnt(const struct qib_pportdata *ppd) +{ + return ppd->sdma_descq_cnt - + (ppd->sdma_descq_added - ppd->sdma_descq_removed) - 1; +} + +static inline int __qib_sdma_running(struct qib_pportdata *ppd) +{ + return ppd->sdma_state.current_state == qib_sdma_state_s99_running; +} +int qib_sdma_running(struct qib_pportdata *); +void dump_sdma_state(struct qib_pportdata *ppd); +void __qib_sdma_process_event(struct qib_pportdata *, enum qib_sdma_events); +void qib_sdma_process_event(struct qib_pportdata *, enum qib_sdma_events); + +/* + * number of words used for protocol header if not set by qib_userinit(); + */ +#define QIB_DFLT_RCVHDRSIZE 9 + +/* + * We need to be able to handle an IB header of at least 24 dwords. + * We need the rcvhdrq large enough to handle largest IB header, but + * still have room for a 2KB MTU standard IB packet. + * Additionally, some processor/memory controller combinations + * benefit quite strongly from having the DMA'ed data be cacheline + * aligned and a cacheline multiple, so we set the size to 32 dwords + * (2 64-byte primary cachelines for pretty much all processors of + * interest). The alignment hurts nothing, other than using somewhat + * more memory. + */ +#define QIB_RCVHDR_ENTSIZE 32 + +int qib_get_user_pages(unsigned long, size_t, struct page **); +void qib_release_user_pages(struct page **, size_t); +int qib_eeprom_read(struct qib_devdata *, u8, void *, int); +int qib_eeprom_write(struct qib_devdata *, u8, const void *, int); +u32 __iomem *qib_getsendbuf_range(struct qib_devdata *, u32 *, u32, u32); +void qib_sendbuf_done(struct qib_devdata *, unsigned); + +static inline void qib_clear_rcvhdrtail(const struct qib_ctxtdata *rcd) +{ + *((u64 *) rcd->rcvhdrtail_kvaddr) = 0ULL; +} + +static inline u32 qib_get_rcvhdrtail(const struct qib_ctxtdata *rcd) +{ + /* + * volatile because it's a DMA target from the chip, routine is + * inlined, and don't want register caching or reordering. + */ + return (u32) le64_to_cpu( + *((volatile __le64 *)rcd->rcvhdrtail_kvaddr)); /* DMA'ed */ +} + +static inline u32 qib_get_hdrqtail(const struct qib_ctxtdata *rcd) +{ + const struct qib_devdata *dd = rcd->dd; + u32 hdrqtail; + + if (dd->flags & QIB_NODMA_RTAIL) { + __le32 *rhf_addr; + u32 seq; + + rhf_addr = (__le32 *) rcd->rcvhdrq + + rcd->head + dd->rhf_offset; + seq = qib_hdrget_seq(rhf_addr); + hdrqtail = rcd->head; + if (seq == rcd->seq_cnt) + hdrqtail++; + } else + hdrqtail = qib_get_rcvhdrtail(rcd); + + return hdrqtail; +} + +/* + * sysfs interface. + */ + +extern const char ib_qib_version[]; + +int qib_device_create(struct qib_devdata *); +void qib_device_remove(struct qib_devdata *); + +int qib_create_port_files(struct ib_device *ibdev, u8 port_num, + struct kobject *kobj); +int qib_verbs_register_sysfs(struct qib_devdata *); +void qib_verbs_unregister_sysfs(struct qib_devdata *); +/* Hook for sysfs read of QSFP */ +extern int qib_qsfp_dump(struct qib_pportdata *ppd, char *buf, int len); + +int __init qib_init_qibfs(void); +int __exit qib_exit_qibfs(void); + +int qibfs_add(struct qib_devdata *); +int qibfs_remove(struct qib_devdata *); + +int qib_pcie_init(struct pci_dev *, const struct pci_device_id *); +int qib_pcie_ddinit(struct qib_devdata *, struct pci_dev *, + const struct pci_device_id *); +void qib_pcie_ddcleanup(struct qib_devdata *); +int qib_pcie_params(struct qib_devdata *, u32, u32 *, struct qib_msix_entry *); +int qib_reinit_intr(struct qib_devdata *); +void qib_enable_intx(struct pci_dev *); +void qib_nomsi(struct qib_devdata *); +void qib_nomsix(struct qib_devdata *); +void qib_pcie_getcmd(struct qib_devdata *, u16 *, u8 *, u8 *); +void qib_pcie_reenable(struct qib_devdata *, u16, u8, u8); +/* interrupts for device */ +u64 qib_int_counter(struct qib_devdata *); +/* interrupt for all devices */ +u64 qib_sps_ints(void); + +/* + * dma_addr wrappers - all 0's invalid for hw + */ +dma_addr_t qib_map_page(struct pci_dev *, struct page *, unsigned long, + size_t, int); +const char *qib_get_unit_name(int unit); + +/* + * Flush write combining store buffers (if present) and perform a write + * barrier. + */ +#if defined(CONFIG_X86_64) +#define qib_flush_wc() asm volatile("sfence" : : : "memory") +#else +#define qib_flush_wc() wmb() /* no reorder around wc flush */ +#endif + +/* global module parameter variables */ +extern unsigned qib_ibmtu; +extern ushort qib_cfgctxts; +extern ushort qib_num_cfg_vls; +extern ushort qib_mini_init; /* If set, do few (ideally 0) writes to chip */ +extern unsigned qib_n_krcv_queues; +extern unsigned qib_sdma_fetch_arb; +extern unsigned qib_compat_ddr_negotiate; +extern int qib_special_trigger; +extern unsigned qib_numa_aware; + +extern struct mutex qib_mutex; + +/* Number of seconds before our card status check... */ +#define STATUS_TIMEOUT 60 + +#define QIB_DRV_NAME "ib_qib" +#define QIB_USER_MINOR_BASE 0 +#define QIB_TRACE_MINOR 127 +#define QIB_DIAGPKT_MINOR 128 +#define QIB_DIAG_MINOR_BASE 129 +#define QIB_NMINORS 255 + +#define PCI_VENDOR_ID_PATHSCALE 0x1fc1 +#define PCI_VENDOR_ID_QLOGIC 0x1077 +#define PCI_DEVICE_ID_QLOGIC_IB_6120 0x10 +#define PCI_DEVICE_ID_QLOGIC_IB_7220 0x7220 +#define PCI_DEVICE_ID_QLOGIC_IB_7322 0x7322 + +/* + * qib_early_err is used (only!) to print early errors before devdata is + * allocated, or when dd->pcidev may not be valid, and at the tail end of + * cleanup when devdata may have been freed, etc. qib_dev_porterr is + * the same as qib_dev_err, but is used when the message really needs + * the IB port# to be definitive as to what's happening.. + * All of these go to the trace log, and the trace log entry is done + * first to avoid possible serial port delays from printk. + */ +#define qib_early_err(dev, fmt, ...) \ + dev_err(dev, fmt, ##__VA_ARGS__) + +#define qib_dev_err(dd, fmt, ...) \ + dev_err(&(dd)->pcidev->dev, "%s: " fmt, \ + qib_get_unit_name((dd)->unit), ##__VA_ARGS__) + +#define qib_dev_warn(dd, fmt, ...) \ + dev_warn(&(dd)->pcidev->dev, "%s: " fmt, \ + qib_get_unit_name((dd)->unit), ##__VA_ARGS__) + +#define qib_dev_porterr(dd, port, fmt, ...) \ + dev_err(&(dd)->pcidev->dev, "%s: IB%u:%u " fmt, \ + qib_get_unit_name((dd)->unit), (dd)->unit, (port), \ + ##__VA_ARGS__) + +#define qib_devinfo(pcidev, fmt, ...) \ + dev_info(&(pcidev)->dev, fmt, ##__VA_ARGS__) + +/* + * this is used for formatting hw error messages... + */ +struct qib_hwerror_msgs { + u64 mask; + const char *msg; + size_t sz; +}; + +#define QLOGIC_IB_HWE_MSG(a, b) { .mask = a, .msg = b } + +/* in qib_intr.c... */ +void qib_format_hwerrors(u64 hwerrs, + const struct qib_hwerror_msgs *hwerrmsgs, + size_t nhwerrmsgs, char *msg, size_t lmsg); +#endif /* _QIB_KERNEL_H */ diff --git a/drivers/infiniband/hw/qib/qib_6120_regs.h b/drivers/infiniband/hw/qib/qib_6120_regs.h new file mode 100644 index 0000000..e16cb6f --- /dev/null +++ b/drivers/infiniband/hw/qib/qib_6120_regs.h @@ -0,0 +1,977 @@ +/* + * Copyright (c) 2008, 2009, 2010 QLogic Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/* This file is mechanically generated from RTL. Any hand-edits will be lost! */ + +#define QIB_6120_Revision_OFFS 0x0 +#define QIB_6120_Revision_R_Simulator_LSB 0x3F +#define QIB_6120_Revision_R_Simulator_RMASK 0x1 +#define QIB_6120_Revision_Reserved_LSB 0x28 +#define QIB_6120_Revision_Reserved_RMASK 0x7FFFFF +#define QIB_6120_Revision_BoardID_LSB 0x20 +#define QIB_6120_Revision_BoardID_RMASK 0xFF +#define QIB_6120_Revision_R_SW_LSB 0x18 +#define QIB_6120_Revision_R_SW_RMASK 0xFF +#define QIB_6120_Revision_R_Arch_LSB 0x10 +#define QIB_6120_Revision_R_Arch_RMASK 0xFF +#define QIB_6120_Revision_R_ChipRevMajor_LSB 0x8 +#define QIB_6120_Revision_R_ChipRevMajor_RMASK 0xFF +#define QIB_6120_Revision_R_ChipRevMinor_LSB 0x0 +#define QIB_6120_Revision_R_ChipRevMinor_RMASK 0xFF + +#define QIB_6120_Control_OFFS 0x8 +#define QIB_6120_Control_TxLatency_LSB 0x4 +#define QIB_6120_Control_TxLatency_RMASK 0x1 +#define QIB_6120_Control_PCIERetryBufDiagEn_LSB 0x3 +#define QIB_6120_Control_PCIERetryBufDiagEn_RMASK 0x1 +#define QIB_6120_Control_LinkEn_LSB 0x2 +#define QIB_6120_Control_LinkEn_RMASK 0x1 +#define QIB_6120_Control_FreezeMode_LSB 0x1 +#define QIB_6120_Control_FreezeMode_RMASK 0x1 +#define QIB_6120_Control_SyncReset_LSB 0x0 +#define QIB_6120_Control_SyncReset_RMASK 0x1 + +#define QIB_6120_PageAlign_OFFS 0x10 + +#define QIB_6120_PortCnt_OFFS 0x18 + +#define QIB_6120_SendRegBase_OFFS 0x30 + +#define QIB_6120_UserRegBase_OFFS 0x38 + +#define QIB_6120_CntrRegBase_OFFS 0x40 + +#define QIB_6120_Scratch_OFFS 0x48 +#define QIB_6120_Scratch_TopHalf_LSB 0x20 +#define QIB_6120_Scratch_TopHalf_RMASK 0xFFFFFFFF +#define QIB_6120_Scratch_BottomHalf_LSB 0x0 +#define QIB_6120_Scratch_BottomHalf_RMASK 0xFFFFFFFF + +#define QIB_6120_IntBlocked_OFFS 0x60 +#define QIB_6120_IntBlocked_ErrorIntBlocked_LSB 0x1F +#define QIB_6120_IntBlocked_ErrorIntBlocked_RMASK 0x1 +#define QIB_6120_IntBlocked_PioSetIntBlocked_LSB 0x1E +#define QIB_6120_IntBlocked_PioSetIntBlocked_RMASK 0x1 +#define QIB_6120_IntBlocked_PioBufAvailIntBlocked_LSB 0x1D +#define QIB_6120_IntBlocked_PioBufAvailIntBlocked_RMASK 0x1 +#define QIB_6120_IntBlocked_assertGPIOIntBlocked_LSB 0x1C +#define QIB_6120_IntBlocked_assertGPIOIntBlocked_RMASK 0x1 +#define QIB_6120_IntBlocked_Reserved_LSB 0xF +#define QIB_6120_IntBlocked_Reserved_RMASK 0x1FFF +#define QIB_6120_IntBlocked_RcvAvail4IntBlocked_LSB 0x10 +#define QIB_6120_IntBlocked_RcvAvail4IntBlocked_RMASK 0x1 +#define QIB_6120_IntBlocked_RcvAvail3IntBlocked_LSB 0xF +#define QIB_6120_IntBlocked_RcvAvail3IntBlocked_RMASK 0x1 +#define QIB_6120_IntBlocked_RcvAvail2IntBlocked_LSB 0xE +#define QIB_6120_IntBlocked_RcvAvail2IntBlocked_RMASK 0x1 +#define QIB_6120_IntBlocked_RcvAvail1IntBlocked_LSB 0xD +#define QIB_6120_IntBlocked_RcvAvail1IntBlocked_RMASK 0x1 +#define QIB_6120_IntBlocked_RcvAvail0IntBlocked_LSB 0xC +#define QIB_6120_IntBlocked_RcvAvail0IntBlocked_RMASK 0x1 +#define QIB_6120_IntBlocked_Reserved1_LSB 0x5 +#define QIB_6120_IntBlocked_Reserved1_RMASK 0x7F +#define QIB_6120_IntBlocked_RcvUrg4IntBlocked_LSB 0x4 +#define QIB_6120_IntBlocked_RcvUrg4IntBlocked_RMASK 0x1 +#define QIB_6120_IntBlocked_RcvUrg3IntBlocked_LSB 0x3 +#define QIB_6120_IntBlocked_RcvUrg3IntBlocked_RMASK 0x1 +#define QIB_6120_IntBlocked_RcvUrg2IntBlocked_LSB 0x2 +#define QIB_6120_IntBlocked_RcvUrg2IntBlocked_RMASK 0x1 +#define QIB_6120_IntBlocked_RcvUrg1IntBlocked_LSB 0x1 +#define QIB_6120_IntBlocked_RcvUrg1IntBlocked_RMASK 0x1 +#define QIB_6120_IntBlocked_RcvUrg0IntBlocked_LSB 0x0 +#define QIB_6120_IntBlocked_RcvUrg0IntBlocked_RMASK 0x1 + +#define QIB_6120_IntMask_OFFS 0x68 +#define QIB_6120_IntMask_ErrorIntMask_LSB 0x1F +#define QIB_6120_IntMask_ErrorIntMask_RMASK 0x1 +#define QIB_6120_IntMask_PioSetIntMask_LSB 0x1E +#define QIB_6120_IntMask_PioSetIntMask_RMASK 0x1 +#define QIB_6120_IntMask_PioBufAvailIntMask_LSB 0x1D +#define QIB_6120_IntMask_PioBufAvailIntMask_RMASK 0x1 +#define QIB_6120_IntMask_assertGPIOIntMask_LSB 0x1C +#define QIB_6120_IntMask_assertGPIOIntMask_RMASK 0x1 +#define QIB_6120_IntMask_Reserved_LSB 0x11 +#define QIB_6120_IntMask_Reserved_RMASK 0x7FF +#define QIB_6120_IntMask_RcvAvail4IntMask_LSB 0x10 +#define QIB_6120_IntMask_RcvAvail4IntMask_RMASK 0x1 +#define QIB_6120_IntMask_RcvAvail3IntMask_LSB 0xF +#define QIB_6120_IntMask_RcvAvail3IntMask_RMASK 0x1 +#define QIB_6120_IntMask_RcvAvail2IntMask_LSB 0xE +#define QIB_6120_IntMask_RcvAvail2IntMask_RMASK 0x1 +#define QIB_6120_IntMask_RcvAvail1IntMask_LSB 0xD +#define QIB_6120_IntMask_RcvAvail1IntMask_RMASK 0x1 +#define QIB_6120_IntMask_RcvAvail0IntMask_LSB 0xC +#define QIB_6120_IntMask_RcvAvail0IntMask_RMASK 0x1 +#define QIB_6120_IntMask_Reserved1_LSB 0x5 +#define QIB_6120_IntMask_Reserved1_RMASK 0x7F +#define QIB_6120_IntMask_RcvUrg4IntMask_LSB 0x4 +#define QIB_6120_IntMask_RcvUrg4IntMask_RMASK 0x1 +#define QIB_6120_IntMask_RcvUrg3IntMask_LSB 0x3 +#define QIB_6120_IntMask_RcvUrg3IntMask_RMASK 0x1 +#define QIB_6120_IntMask_RcvUrg2IntMask_LSB 0x2 +#define QIB_6120_IntMask_RcvUrg2IntMask_RMASK 0x1 +#define QIB_6120_IntMask_RcvUrg1IntMask_LSB 0x1 +#define QIB_6120_IntMask_RcvUrg1IntMask_RMASK 0x1 +#define QIB_6120_IntMask_RcvUrg0IntMask_LSB 0x0 +#define QIB_6120_IntMask_RcvUrg0IntMask_RMASK 0x1 + +#define QIB_6120_IntStatus_OFFS 0x70 +#define QIB_6120_IntStatus_Error_LSB 0x1F +#define QIB_6120_IntStatus_Error_RMASK 0x1 +#define QIB_6120_IntStatus_PioSent_LSB 0x1E +#define QIB_6120_IntStatus_PioSent_RMASK 0x1 +#define QIB_6120_IntStatus_PioBufAvail_LSB 0x1D +#define QIB_6120_IntStatus_PioBufAvail_RMASK 0x1 +#define QIB_6120_IntStatus_assertGPIO_LSB 0x1C +#define QIB_6120_IntStatus_assertGPIO_RMASK 0x1 +#define QIB_6120_IntStatus_Reserved_LSB 0xF +#define QIB_6120_IntStatus_Reserved_RMASK 0x1FFF +#define QIB_6120_IntStatus_RcvAvail4_LSB 0x10 +#define QIB_6120_IntStatus_RcvAvail4_RMASK 0x1 +#define QIB_6120_IntStatus_RcvAvail3_LSB 0xF +#define QIB_6120_IntStatus_RcvAvail3_RMASK 0x1 +#define QIB_6120_IntStatus_RcvAvail2_LSB 0xE +#define QIB_6120_IntStatus_RcvAvail2_RMASK 0x1 +#define QIB_6120_IntStatus_RcvAvail1_LSB 0xD +#define QIB_6120_IntStatus_RcvAvail1_RMASK 0x1 +#define QIB_6120_IntStatus_RcvAvail0_LSB 0xC +#define QIB_6120_IntStatus_RcvAvail0_RMASK 0x1 +#define QIB_6120_IntStatus_Reserved1_LSB 0x5 +#define QIB_6120_IntStatus_Reserved1_RMASK 0x7F +#define QIB_6120_IntStatus_RcvUrg4_LSB 0x4 +#define QIB_6120_IntStatus_RcvUrg4_RMASK 0x1 +#define QIB_6120_IntStatus_RcvUrg3_LSB 0x3 +#define QIB_6120_IntStatus_RcvUrg3_RMASK 0x1 +#define QIB_6120_IntStatus_RcvUrg2_LSB 0x2 +#define QIB_6120_IntStatus_RcvUrg2_RMASK 0x1 +#define QIB_6120_IntStatus_RcvUrg1_LSB 0x1 +#define QIB_6120_IntStatus_RcvUrg1_RMASK 0x1 +#define QIB_6120_IntStatus_RcvUrg0_LSB 0x0 +#define QIB_6120_IntStatus_RcvUrg0_RMASK 0x1 + +#define QIB_6120_IntClear_OFFS 0x78 +#define QIB_6120_IntClear_ErrorIntClear_LSB 0x1F +#define QIB_6120_IntClear_ErrorIntClear_RMASK 0x1 +#define QIB_6120_IntClear_PioSetIntClear_LSB 0x1E +#define QIB_6120_IntClear_PioSetIntClear_RMASK 0x1 +#define QIB_6120_IntClear_PioBufAvailIntClear_LSB 0x1D +#define QIB_6120_IntClear_PioBufAvailIntClear_RMASK 0x1 +#define QIB_6120_IntClear_assertGPIOIntClear_LSB 0x1C +#define QIB_6120_IntClear_assertGPIOIntClear_RMASK 0x1 +#define QIB_6120_IntClear_Reserved_LSB 0xF +#define QIB_6120_IntClear_Reserved_RMASK 0x1FFF +#define QIB_6120_IntClear_RcvAvail4IntClear_LSB 0x10 +#define QIB_6120_IntClear_RcvAvail4IntClear_RMASK 0x1 +#define QIB_6120_IntClear_RcvAvail3IntClear_LSB 0xF +#define QIB_6120_IntClear_RcvAvail3IntClear_RMASK 0x1 +#define QIB_6120_IntClear_RcvAvail2IntClear_LSB 0xE +#define QIB_6120_IntClear_RcvAvail2IntClear_RMASK 0x1 +#define QIB_6120_IntClear_RcvAvail1IntClear_LSB 0xD +#define QIB_6120_IntClear_RcvAvail1IntClear_RMASK 0x1 +#define QIB_6120_IntClear_RcvAvail0IntClear_LSB 0xC +#define QIB_6120_IntClear_RcvAvail0IntClear_RMASK 0x1 +#define QIB_6120_IntClear_Reserved1_LSB 0x5 +#define QIB_6120_IntClear_Reserved1_RMASK 0x7F +#define QIB_6120_IntClear_RcvUrg4IntClear_LSB 0x4 +#define QIB_6120_IntClear_RcvUrg4IntClear_RMASK 0x1 +#define QIB_6120_IntClear_RcvUrg3IntClear_LSB 0x3 +#define QIB_6120_IntClear_RcvUrg3IntClear_RMASK 0x1 +#define QIB_6120_IntClear_RcvUrg2IntClear_LSB 0x2 +#define QIB_6120_IntClear_RcvUrg2IntClear_RMASK 0x1 +#define QIB_6120_IntClear_RcvUrg1IntClear_LSB 0x1 +#define QIB_6120_IntClear_RcvUrg1IntClear_RMASK 0x1 +#define QIB_6120_IntClear_RcvUrg0IntClear_LSB 0x0 +#define QIB_6120_IntClear_RcvUrg0IntClear_RMASK 0x1 + +#define QIB_6120_ErrMask_OFFS 0x80 +#define QIB_6120_ErrMask_Reserved_LSB 0x34 +#define QIB_6120_ErrMask_Reserved_RMASK 0xFFF +#define QIB_6120_ErrMask_HardwareErrMask_LSB 0x33 +#define QIB_6120_ErrMask_HardwareErrMask_RMASK 0x1 +#define QIB_6120_ErrMask_ResetNegatedMask_LSB 0x32 +#define QIB_6120_ErrMask_ResetNegatedMask_RMASK 0x1 +#define QIB_6120_ErrMask_InvalidAddrErrMask_LSB 0x31 +#define QIB_6120_ErrMask_InvalidAddrErrMask_RMASK 0x1 +#define QIB_6120_ErrMask_IBStatusChangedMask_LSB 0x30 +#define QIB_6120_ErrMask_IBStatusChangedMask_RMASK 0x1 +#define QIB_6120_ErrMask_Reserved1_LSB 0x26 +#define QIB_6120_ErrMask_Reserved1_RMASK 0x3FF +#define QIB_6120_ErrMask_SendUnsupportedVLErrMask_LSB 0x25 +#define QIB_6120_ErrMask_SendUnsupportedVLErrMask_RMASK 0x1 +#define QIB_6120_ErrMask_SendUnexpectedPktNumErrMask_LSB 0x24 +#define QIB_6120_ErrMask_SendUnexpectedPktNumErrMask_RMASK 0x1 +#define QIB_6120_ErrMask_SendPioArmLaunchErrMask_LSB 0x23 +#define QIB_6120_ErrMask_SendPioArmLaunchErrMask_RMASK 0x1 +#define QIB_6120_ErrMask_SendDroppedDataPktErrMask_LSB 0x22 +#define QIB_6120_ErrMask_SendDroppedDataPktErrMask_RMASK 0x1 +#define QIB_6120_ErrMask_SendDroppedSmpPktErrMask_LSB 0x21 +#define QIB_6120_ErrMask_SendDroppedSmpPktErrMask_RMASK 0x1 +#define QIB_6120_ErrMask_SendPktLenErrMask_LSB 0x20 +#define QIB_6120_ErrMask_SendPktLenErrMask_RMASK 0x1 +#define QIB_6120_ErrMask_SendUnderRunErrMask_LSB 0x1F +#define QIB_6120_ErrMask_SendUnderRunErrMask_RMASK 0x1 +#define QIB_6120_ErrMask_SendMaxPktLenErrMask_LSB 0x1E +#define QIB_6120_ErrMask_SendMaxPktLenErrMask_RMASK 0x1 +#define QIB_6120_ErrMask_SendMinPktLenErrMask_LSB 0x1D +#define QIB_6120_ErrMask_SendMinPktLenErrMask_RMASK 0x1 +#define QIB_6120_ErrMask_Reserved2_LSB 0x12 +#define QIB_6120_ErrMask_Reserved2_RMASK 0x7FF +#define QIB_6120_ErrMask_RcvIBLostLinkErrMask_LSB 0x11 +#define QIB_6120_ErrMask_RcvIBLostLinkErrMask_RMASK 0x1 +#define QIB_6120_ErrMask_RcvHdrErrMask_LSB 0x10 +#define QIB_6120_ErrMask_RcvHdrErrMask_RMASK 0x1 +#define QIB_6120_ErrMask_RcvHdrLenErrMask_LSB 0xF +#define QIB_6120_ErrMask_RcvHdrLenErrMask_RMASK 0x1 +#define QIB_6120_ErrMask_RcvBadTidErrMask_LSB 0xE +#define QIB_6120_ErrMask_RcvBadTidErrMask_RMASK 0x1 +#define QIB_6120_ErrMask_RcvHdrFullErrMask_LSB 0xD +#define QIB_6120_ErrMask_RcvHdrFullErrMask_RMASK 0x1 +#define QIB_6120_ErrMask_RcvEgrFullErrMask_LSB 0xC +#define QIB_6120_ErrMask_RcvEgrFullErrMask_RMASK 0x1 +#define QIB_6120_ErrMask_RcvBadVersionErrMask_LSB 0xB +#define QIB_6120_ErrMask_RcvBadVersionErrMask_RMASK 0x1 +#define QIB_6120_ErrMask_RcvIBFlowErrMask_LSB 0xA +#define QIB_6120_ErrMask_RcvIBFlowErrMask_RMASK 0x1 +#define QIB_6120_ErrMask_RcvEBPErrMask_LSB 0x9 +#define QIB_6120_ErrMask_RcvEBPErrMask_RMASK 0x1 +#define QIB_6120_ErrMask_RcvUnsupportedVLErrMask_LSB 0x8 +#define QIB_6120_ErrMask_RcvUnsupportedVLErrMask_RMASK 0x1 +#define QIB_6120_ErrMask_RcvUnexpectedCharErrMask_LSB 0x7 +#define QIB_6120_ErrMask_RcvUnexpectedCharErrMask_RMASK 0x1 +#define QIB_6120_ErrMask_RcvShortPktLenErrMask_LSB 0x6 +#define QIB_6120_ErrMask_RcvShortPktLenErrMask_RMASK 0x1 +#define QIB_6120_ErrMask_RcvLongPktLenErrMask_LSB 0x5 +#define QIB_6120_ErrMask_RcvLongPktLenErrMask_RMASK 0x1 +#define QIB_6120_ErrMask_RcvMaxPktLenErrMask_LSB 0x4 +#define QIB_6120_ErrMask_RcvMaxPktLenErrMask_RMASK 0x1 +#define QIB_6120_ErrMask_RcvMinPktLenErrMask_LSB 0x3 +#define QIB_6120_ErrMask_RcvMinPktLenErrMask_RMASK 0x1 +#define QIB_6120_ErrMask_RcvICRCErrMask_LSB 0x2 +#define QIB_6120_ErrMask_RcvICRCErrMask_RMASK 0x1 +#define QIB_6120_ErrMask_RcvVCRCErrMask_LSB 0x1 +#define QIB_6120_ErrMask_RcvVCRCErrMask_RMASK 0x1 +#define QIB_6120_ErrMask_RcvFormatErrMask_LSB 0x0 +#define QIB_6120_ErrMask_RcvFormatErrMask_RMASK 0x1 + +#define QIB_6120_ErrStatus_OFFS 0x88 +#define QIB_6120_ErrStatus_Reserved_LSB 0x34 +#define QIB_6120_ErrStatus_Reserved_RMASK 0xFFF +#define QIB_6120_ErrStatus_HardwareErr_LSB 0x33 +#define QIB_6120_ErrStatus_HardwareErr_RMASK 0x1 +#define QIB_6120_ErrStatus_ResetNegated_LSB 0x32 +#define QIB_6120_ErrStatus_ResetNegated_RMASK 0x1 +#define QIB_6120_ErrStatus_InvalidAddrErr_LSB 0x31 +#define QIB_6120_ErrStatus_InvalidAddrErr_RMASK 0x1 +#define QIB_6120_ErrStatus_IBStatusChanged_LSB 0x30 +#define QIB_6120_ErrStatus_IBStatusChanged_RMASK 0x1 +#define QIB_6120_ErrStatus_Reserved1_LSB 0x26 +#define QIB_6120_ErrStatus_Reserved1_RMASK 0x3FF +#define QIB_6120_ErrStatus_SendUnsupportedVLErr_LSB 0x25 +#define QIB_6120_ErrStatus_SendUnsupportedVLErr_RMASK 0x1 +#define QIB_6120_ErrStatus_SendUnexpectedPktNumErr_LSB 0x24 +#define QIB_6120_ErrStatus_SendUnexpectedPktNumErr_RMASK 0x1 +#define QIB_6120_ErrStatus_SendPioArmLaunchErr_LSB 0x23 +#define QIB_6120_ErrStatus_SendPioArmLaunchErr_RMASK 0x1 +#define QIB_6120_ErrStatus_SendDroppedDataPktErr_LSB 0x22 +#define QIB_6120_ErrStatus_SendDroppedDataPktErr_RMASK 0x1 +#define QIB_6120_ErrStatus_SendDroppedSmpPktErr_LSB 0x21 +#define QIB_6120_ErrStatus_SendDroppedSmpPktErr_RMASK 0x1 +#define QIB_6120_ErrStatus_SendPktLenErr_LSB 0x20 +#define QIB_6120_ErrStatus_SendPktLenErr_RMASK 0x1 +#define QIB_6120_ErrStatus_SendUnderRunErr_LSB 0x1F +#define QIB_6120_ErrStatus_SendUnderRunErr_RMASK 0x1 +#define QIB_6120_ErrStatus_SendMaxPktLenErr_LSB 0x1E +#define QIB_6120_ErrStatus_SendMaxPktLenErr_RMASK 0x1 +#define QIB_6120_ErrStatus_SendMinPktLenErr_LSB 0x1D +#define QIB_6120_ErrStatus_SendMinPktLenErr_RMASK 0x1 +#define QIB_6120_ErrStatus_Reserved2_LSB 0x12 +#define QIB_6120_ErrStatus_Reserved2_RMASK 0x7FF +#define QIB_6120_ErrStatus_RcvIBLostLinkErr_LSB 0x11 +#define QIB_6120_ErrStatus_RcvIBLostLinkErr_RMASK 0x1 +#define QIB_6120_ErrStatus_RcvHdrErr_LSB 0x10 +#define QIB_6120_ErrStatus_RcvHdrErr_RMASK 0x1 +#define QIB_6120_ErrStatus_RcvHdrLenErr_LSB 0xF +#define QIB_6120_ErrStatus_RcvHdrLenErr_RMASK 0x1 +#define QIB_6120_ErrStatus_RcvBadTidErr_LSB 0xE +#define QIB_6120_ErrStatus_RcvBadTidErr_RMASK 0x1 +#define QIB_6120_ErrStatus_RcvHdrFullErr_LSB 0xD +#define QIB_6120_ErrStatus_RcvHdrFullErr_RMASK 0x1 +#define QIB_6120_ErrStatus_RcvEgrFullErr_LSB 0xC +#define QIB_6120_ErrStatus_RcvEgrFullErr_RMASK 0x1 +#define QIB_6120_ErrStatus_RcvBadVersionErr_LSB 0xB +#define QIB_6120_ErrStatus_RcvBadVersionErr_RMASK 0x1 +#define QIB_6120_ErrStatus_RcvIBFlowErr_LSB 0xA +#define QIB_6120_ErrStatus_RcvIBFlowErr_RMASK 0x1 +#define QIB_6120_ErrStatus_RcvEBPErr_LSB 0x9 +#define QIB_6120_ErrStatus_RcvEBPErr_RMASK 0x1 +#define QIB_6120_ErrStatus_RcvUnsupportedVLErr_LSB 0x8 +#define QIB_6120_ErrStatus_RcvUnsupportedVLErr_RMASK 0x1 +#define QIB_6120_ErrStatus_RcvUnexpectedCharErr_LSB 0x7 +#define QIB_6120_ErrStatus_RcvUnexpectedCharErr_RMASK 0x1 +#define QIB_6120_ErrStatus_RcvShortPktLenErr_LSB 0x6 +#define QIB_6120_ErrStatus_RcvShortPktLenErr_RMASK 0x1 +#define QIB_6120_ErrStatus_RcvLongPktLenErr_LSB 0x5 +#define QIB_6120_ErrStatus_RcvLongPktLenErr_RMASK 0x1 +#define QIB_6120_ErrStatus_RcvMaxPktLenErr_LSB 0x4 +#define QIB_6120_ErrStatus_RcvMaxPktLenErr_RMASK 0x1 +#define QIB_6120_ErrStatus_RcvMinPktLenErr_LSB 0x3 +#define QIB_6120_ErrStatus_RcvMinPktLenErr_RMASK 0x1 +#define QIB_6120_ErrStatus_RcvICRCErr_LSB 0x2 +#define QIB_6120_ErrStatus_RcvICRCErr_RMASK 0x1 +#define QIB_6120_ErrStatus_RcvVCRCErr_LSB 0x1 +#define QIB_6120_ErrStatus_RcvVCRCErr_RMASK 0x1 +#define QIB_6120_ErrStatus_RcvFormatErr_LSB 0x0 +#define QIB_6120_ErrStatus_RcvFormatErr_RMASK 0x1 + +#define QIB_6120_ErrClear_OFFS 0x90 +#define QIB_6120_ErrClear_Reserved_LSB 0x34 +#define QIB_6120_ErrClear_Reserved_RMASK 0xFFF +#define QIB_6120_ErrClear_HardwareErrClear_LSB 0x33 +#define QIB_6120_ErrClear_HardwareErrClear_RMASK 0x1 +#define QIB_6120_ErrClear_ResetNegatedClear_LSB 0x32 +#define QIB_6120_ErrClear_ResetNegatedClear_RMASK 0x1 +#define QIB_6120_ErrClear_InvalidAddrErrClear_LSB 0x31 +#define QIB_6120_ErrClear_InvalidAddrErrClear_RMASK 0x1 +#define QIB_6120_ErrClear_IBStatusChangedClear_LSB 0x30 +#define QIB_6120_ErrClear_IBStatusChangedClear_RMASK 0x1 +#define QIB_6120_ErrClear_Reserved1_LSB 0x26 +#define QIB_6120_ErrClear_Reserved1_RMASK 0x3FF +#define QIB_6120_ErrClear_SendUnsupportedVLErrClear_LSB 0x25 +#define QIB_6120_ErrClear_SendUnsupportedVLErrClear_RMASK 0x1 +#define QIB_6120_ErrClear_SendUnexpectedPktNumErrClear_LSB 0x24 +#define QIB_6120_ErrClear_SendUnexpectedPktNumErrClear_RMASK 0x1 +#define QIB_6120_ErrClear_SendPioArmLaunchErrClear_LSB 0x23 +#define QIB_6120_ErrClear_SendPioArmLaunchErrClear_RMASK 0x1 +#define QIB_6120_ErrClear_SendDroppedDataPktErrClear_LSB 0x22 +#define QIB_6120_ErrClear_SendDroppedDataPktErrClear_RMASK 0x1 +#define QIB_6120_ErrClear_SendDroppedSmpPktErrClear_LSB 0x21 +#define QIB_6120_ErrClear_SendDroppedSmpPktErrClear_RMASK 0x1 +#define QIB_6120_ErrClear_SendPktLenErrClear_LSB 0x20 +#define QIB_6120_ErrClear_SendPktLenErrClear_RMASK 0x1 +#define QIB_6120_ErrClear_SendUnderRunErrClear_LSB 0x1F +#define QIB_6120_ErrClear_SendUnderRunErrClear_RMASK 0x1 +#define QIB_6120_ErrClear_SendMaxPktLenErrClear_LSB 0x1E +#define QIB_6120_ErrClear_SendMaxPktLenErrClear_RMASK 0x1 +#define QIB_6120_ErrClear_SendMinPktLenErrClear_LSB 0x1D +#define QIB_6120_ErrClear_SendMinPktLenErrClear_RMASK 0x1 +#define QIB_6120_ErrClear_Reserved2_LSB 0x12 +#define QIB_6120_ErrClear_Reserved2_RMASK 0x7FF +#define QIB_6120_ErrClear_RcvIBLostLinkErrClear_LSB 0x11 +#define QIB_6120_ErrClear_RcvIBLostLinkErrClear_RMASK 0x1 +#define QIB_6120_ErrClear_RcvHdrErrClear_LSB 0x10 +#define QIB_6120_ErrClear_RcvHdrErrClear_RMASK 0x1 +#define QIB_6120_ErrClear_RcvHdrLenErrClear_LSB 0xF +#define QIB_6120_ErrClear_RcvHdrLenErrClear_RMASK 0x1 +#define QIB_6120_ErrClear_RcvBadTidErrClear_LSB 0xE +#define QIB_6120_ErrClear_RcvBadTidErrClear_RMASK 0x1 +#define QIB_6120_ErrClear_RcvHdrFullErrClear_LSB 0xD +#define QIB_6120_ErrClear_RcvHdrFullErrClear_RMASK 0x1 +#define QIB_6120_ErrClear_RcvEgrFullErrClear_LSB 0xC +#define QIB_6120_ErrClear_RcvEgrFullErrClear_RMASK 0x1 +#define QIB_6120_ErrClear_RcvBadVersionErrClear_LSB 0xB +#define QIB_6120_ErrClear_RcvBadVersionErrClear_RMASK 0x1 +#define QIB_6120_ErrClear_RcvIBFlowErrClear_LSB 0xA +#define QIB_6120_ErrClear_RcvIBFlowErrClear_RMASK 0x1 +#define QIB_6120_ErrClear_RcvEBPErrClear_LSB 0x9 +#define QIB_6120_ErrClear_RcvEBPErrClear_RMASK 0x1 +#define QIB_6120_ErrClear_RcvUnsupportedVLErrClear_LSB 0x8 +#define QIB_6120_ErrClear_RcvUnsupportedVLErrClear_RMASK 0x1 +#define QIB_6120_ErrClear_RcvUnexpectedCharErrClear_LSB 0x7 +#define QIB_6120_ErrClear_RcvUnexpectedCharErrClear_RMASK 0x1 +#define QIB_6120_ErrClear_RcvShortPktLenErrClear_LSB 0x6 +#define QIB_6120_ErrClear_RcvShortPktLenErrClear_RMASK 0x1 +#define QIB_6120_ErrClear_RcvLongPktLenErrClear_LSB 0x5 +#define QIB_6120_ErrClear_RcvLongPktLenErrClear_RMASK 0x1 +#define QIB_6120_ErrClear_RcvMaxPktLenErrClear_LSB 0x4 +#define QIB_6120_ErrClear_RcvMaxPktLenErrClear_RMASK 0x1 +#define QIB_6120_ErrClear_RcvMinPktLenErrClear_LSB 0x3 +#define QIB_6120_ErrClear_RcvMinPktLenErrClear_RMASK 0x1 +#define QIB_6120_ErrClear_RcvICRCErrClear_LSB 0x2 +#define QIB_6120_ErrClear_RcvICRCErrClear_RMASK 0x1 +#define QIB_6120_ErrClear_RcvVCRCErrClear_LSB 0x1 +#define QIB_6120_ErrClear_RcvVCRCErrClear_RMASK 0x1 +#define QIB_6120_ErrClear_RcvFormatErrClear_LSB 0x0 +#define QIB_6120_ErrClear_RcvFormatErrClear_RMASK 0x1 + +#define QIB_6120_HwErrMask_OFFS 0x98 +#define QIB_6120_HwErrMask_IBCBusFromSPCParityErrMask_LSB 0x3F +#define QIB_6120_HwErrMask_IBCBusFromSPCParityErrMask_RMASK 0x1 +#define QIB_6120_HwErrMask_IBCBusToSPCParityErrMask_LSB 0x3E +#define QIB_6120_HwErrMask_IBCBusToSPCParityErrMask_RMASK 0x1 +#define QIB_6120_HwErrMask_Reserved_LSB 0x3D +#define QIB_6120_HwErrMask_Reserved_RMASK 0x1 +#define QIB_6120_HwErrMask_IBSerdesPClkNotDetectMask_LSB 0x3C +#define QIB_6120_HwErrMask_IBSerdesPClkNotDetectMask_RMASK 0x1 +#define QIB_6120_HwErrMask_PCIESerdesQ0PClkNotDetectMask_LSB 0x3B +#define QIB_6120_HwErrMask_PCIESerdesQ0PClkNotDetectMask_RMASK 0x1 +#define QIB_6120_HwErrMask_PCIESerdesQ1PClkNotDetectMask_LSB 0x3A +#define QIB_6120_HwErrMask_PCIESerdesQ1PClkNotDetectMask_RMASK 0x1 +#define QIB_6120_HwErrMask_Reserved1_LSB 0x39 +#define QIB_6120_HwErrMask_Reserved1_RMASK 0x1 +#define QIB_6120_HwErrMask_IBPLLrfSlipMask_LSB 0x38 +#define QIB_6120_HwErrMask_IBPLLrfSlipMask_RMASK 0x1 +#define QIB_6120_HwErrMask_IBPLLfbSlipMask_LSB 0x37 +#define QIB_6120_HwErrMask_IBPLLfbSlipMask_RMASK 0x1 +#define QIB_6120_HwErrMask_PowerOnBISTFailedMask_LSB 0x36 +#define QIB_6120_HwErrMask_PowerOnBISTFailedMask_RMASK 0x1 +#define QIB_6120_HwErrMask_Reserved2_LSB 0x33 +#define QIB_6120_HwErrMask_Reserved2_RMASK 0x7 +#define QIB_6120_HwErrMask_RXEMemParityErrMask_LSB 0x2C +#define QIB_6120_HwErrMask_RXEMemParityErrMask_RMASK 0x7F +#define QIB_6120_HwErrMask_TXEMemParityErrMask_LSB 0x28 +#define QIB_6120_HwErrMask_TXEMemParityErrMask_RMASK 0xF +#define QIB_6120_HwErrMask_Reserved3_LSB 0x22 +#define QIB_6120_HwErrMask_Reserved3_RMASK 0x3F +#define QIB_6120_HwErrMask_PCIeBusParityErrMask_LSB 0x1F +#define QIB_6120_HwErrMask_PCIeBusParityErrMask_RMASK 0x7 +#define QIB_6120_HwErrMask_PcieCplTimeoutMask_LSB 0x1E +#define QIB_6120_HwErrMask_PcieCplTimeoutMask_RMASK 0x1 +#define QIB_6120_HwErrMask_PoisonedTLPMask_LSB 0x1D +#define QIB_6120_HwErrMask_PoisonedTLPMask_RMASK 0x1 +#define QIB_6120_HwErrMask_Reserved4_LSB 0x6 +#define QIB_6120_HwErrMask_Reserved4_RMASK 0x7FFFFF +#define QIB_6120_HwErrMask_PCIeMemParityErrMask_LSB 0x0 +#define QIB_6120_HwErrMask_PCIeMemParityErrMask_RMASK 0x3F + +#define QIB_6120_HwErrStatus_OFFS 0xA0 +#define QIB_6120_HwErrStatus_IBCBusFromSPCParityErr_LSB 0x3F +#define QIB_6120_HwErrStatus_IBCBusFromSPCParityErr_RMASK 0x1 +#define QIB_6120_HwErrStatus_IBCBusToSPCParityErr_LSB 0x3E +#define QIB_6120_HwErrStatus_IBCBusToSPCParityErr_RMASK 0x1 +#define QIB_6120_HwErrStatus_Reserved_LSB 0x3D +#define QIB_6120_HwErrStatus_Reserved_RMASK 0x1 +#define QIB_6120_HwErrStatus_IBSerdesPClkNotDetect_LSB 0x3C +#define QIB_6120_HwErrStatus_IBSerdesPClkNotDetect_RMASK 0x1 +#define QIB_6120_HwErrStatus_PCIESerdesQ0PClkNotDetect_LSB 0x3B +#define QIB_6120_HwErrStatus_PCIESerdesQ0PClkNotDetect_RMASK 0x1 +#define QIB_6120_HwErrStatus_PCIESerdesQ1PClkNotDetect_LSB 0x3A +#define QIB_6120_HwErrStatus_PCIESerdesQ1PClkNotDetect_RMASK 0x1 +#define QIB_6120_HwErrStatus_Reserved1_LSB 0x39 +#define QIB_6120_HwErrStatus_Reserved1_RMASK 0x1 +#define QIB_6120_HwErrStatus_IBPLLrfSlip_LSB 0x38 +#define QIB_6120_HwErrStatus_IBPLLrfSlip_RMASK 0x1 +#define QIB_6120_HwErrStatus_IBPLLfbSlip_LSB 0x37 +#define QIB_6120_HwErrStatus_IBPLLfbSlip_RMASK 0x1 +#define QIB_6120_HwErrStatus_PowerOnBISTFailed_LSB 0x36 +#define QIB_6120_HwErrStatus_PowerOnBISTFailed_RMASK 0x1 +#define QIB_6120_HwErrStatus_Reserved2_LSB 0x33 +#define QIB_6120_HwErrStatus_Reserved2_RMASK 0x7 +#define QIB_6120_HwErrStatus_RXEMemParity_LSB 0x2C +#define QIB_6120_HwErrStatus_RXEMemParity_RMASK 0x7F +#define QIB_6120_HwErrStatus_TXEMemParity_LSB 0x28 +#define QIB_6120_HwErrStatus_TXEMemParity_RMASK 0xF +#define QIB_6120_HwErrStatus_Reserved3_LSB 0x22 +#define QIB_6120_HwErrStatus_Reserved3_RMASK 0x3F +#define QIB_6120_HwErrStatus_PCIeBusParity_LSB 0x1F +#define QIB_6120_HwErrStatus_PCIeBusParity_RMASK 0x7 +#define QIB_6120_HwErrStatus_PcieCplTimeout_LSB 0x1E +#define QIB_6120_HwErrStatus_PcieCplTimeout_RMASK 0x1 +#define QIB_6120_HwErrStatus_PoisenedTLP_LSB 0x1D +#define QIB_6120_HwErrStatus_PoisenedTLP_RMASK 0x1 +#define QIB_6120_HwErrStatus_Reserved4_LSB 0x6 +#define QIB_6120_HwErrStatus_Reserved4_RMASK 0x7FFFFF +#define QIB_6120_HwErrStatus_PCIeMemParity_LSB 0x0 +#define QIB_6120_HwErrStatus_PCIeMemParity_RMASK 0x3F + +#define QIB_6120_HwErrClear_OFFS 0xA8 +#define QIB_6120_HwErrClear_IBCBusFromSPCParityErrClear_LSB 0x3F +#define QIB_6120_HwErrClear_IBCBusFromSPCParityErrClear_RMASK 0x1 +#define QIB_6120_HwErrClear_IBCBusToSPCparityErrClear_LSB 0x3E +#define QIB_6120_HwErrClear_IBCBusToSPCparityErrClear_RMASK 0x1 +#define QIB_6120_HwErrClear_Reserved_LSB 0x3D +#define QIB_6120_HwErrClear_Reserved_RMASK 0x1 +#define QIB_6120_HwErrClear_IBSerdesPClkNotDetectClear_LSB 0x3C +#define QIB_6120_HwErrClear_IBSerdesPClkNotDetectClear_RMASK 0x1 +#define QIB_6120_HwErrClear_PCIESerdesQ0PClkNotDetectClear_LSB 0x3B +#define QIB_6120_HwErrClear_PCIESerdesQ0PClkNotDetectClear_RMASK 0x1 +#define QIB_6120_HwErrClear_PCIESerdesQ1PClkNotDetectClear_LSB 0x3A +#define QIB_6120_HwErrClear_PCIESerdesQ1PClkNotDetectClear_RMASK 0x1 +#define QIB_6120_HwErrClear_Reserved1_LSB 0x39 +#define QIB_6120_HwErrClear_Reserved1_RMASK 0x1 +#define QIB_6120_HwErrClear_IBPLLrfSlipClear_LSB 0x38 +#define QIB_6120_HwErrClear_IBPLLrfSlipClear_RMASK 0x1 +#define QIB_6120_HwErrClear_IBPLLfbSlipClear_LSB 0x37 +#define QIB_6120_HwErrClear_IBPLLfbSlipClear_RMASK 0x1 +#define QIB_6120_HwErrClear_PowerOnBISTFailedClear_LSB 0x36 +#define QIB_6120_HwErrClear_PowerOnBISTFailedClear_RMASK 0x1 +#define QIB_6120_HwErrClear_Reserved2_LSB 0x33 +#define QIB_6120_HwErrClear_Reserved2_RMASK 0x7 +#define QIB_6120_HwErrClear_RXEMemParityClear_LSB 0x2C +#define QIB_6120_HwErrClear_RXEMemParityClear_RMASK 0x7F +#define QIB_6120_HwErrClear_TXEMemParityClear_LSB 0x28 +#define QIB_6120_HwErrClear_TXEMemParityClear_RMASK 0xF +#define QIB_6120_HwErrClear_Reserved3_LSB 0x22 +#define QIB_6120_HwErrClear_Reserved3_RMASK 0x3F +#define QIB_6120_HwErrClear_PCIeBusParityClr_LSB 0x1F +#define QIB_6120_HwErrClear_PCIeBusParityClr_RMASK 0x7 +#define QIB_6120_HwErrClear_PcieCplTimeoutClear_LSB 0x1E +#define QIB_6120_HwErrClear_PcieCplTimeoutClear_RMASK 0x1 +#define QIB_6120_HwErrClear_PoisonedTLPClear_LSB 0x1D +#define QIB_6120_HwErrClear_PoisonedTLPClear_RMASK 0x1 +#define QIB_6120_HwErrClear_Reserved4_LSB 0x6 +#define QIB_6120_HwErrClear_Reserved4_RMASK 0x7FFFFF +#define QIB_6120_HwErrClear_PCIeMemParityClr_LSB 0x0 +#define QIB_6120_HwErrClear_PCIeMemParityClr_RMASK 0x3F + +#define QIB_6120_HwDiagCtrl_OFFS 0xB0 +#define QIB_6120_HwDiagCtrl_ForceIBCBusFromSPCParityErr_LSB 0x3F +#define QIB_6120_HwDiagCtrl_ForceIBCBusFromSPCParityErr_RMASK 0x1 +#define QIB_6120_HwDiagCtrl_ForceIBCBusToSPCParityErr_LSB 0x3E +#define QIB_6120_HwDiagCtrl_ForceIBCBusToSPCParityErr_RMASK 0x1 +#define QIB_6120_HwDiagCtrl_CounterWrEnable_LSB 0x3D +#define QIB_6120_HwDiagCtrl_CounterWrEnable_RMASK 0x1 +#define QIB_6120_HwDiagCtrl_CounterDisable_LSB 0x3C +#define QIB_6120_HwDiagCtrl_CounterDisable_RMASK 0x1 +#define QIB_6120_HwDiagCtrl_Reserved_LSB 0x33 +#define QIB_6120_HwDiagCtrl_Reserved_RMASK 0x1FF +#define QIB_6120_HwDiagCtrl_ForceRxMemParityErr_LSB 0x2C +#define QIB_6120_HwDiagCtrl_ForceRxMemParityErr_RMASK 0x7F +#define QIB_6120_HwDiagCtrl_ForceTxMemparityErr_LSB 0x28 +#define QIB_6120_HwDiagCtrl_ForceTxMemparityErr_RMASK 0xF +#define QIB_6120_HwDiagCtrl_Reserved1_LSB 0x23 +#define QIB_6120_HwDiagCtrl_Reserved1_RMASK 0x1F +#define QIB_6120_HwDiagCtrl_forcePCIeBusParity_LSB 0x1F +#define QIB_6120_HwDiagCtrl_forcePCIeBusParity_RMASK 0xF +#define QIB_6120_HwDiagCtrl_Reserved2_LSB 0x6 +#define QIB_6120_HwDiagCtrl_Reserved2_RMASK 0x1FFFFFF +#define QIB_6120_HwDiagCtrl_forcePCIeMemParity_LSB 0x0 +#define QIB_6120_HwDiagCtrl_forcePCIeMemParity_RMASK 0x3F + +#define QIB_6120_IBCStatus_OFFS 0xC0 +#define QIB_6120_IBCStatus_TxCreditOk_LSB 0x1F +#define QIB_6120_IBCStatus_TxCreditOk_RMASK 0x1 +#define QIB_6120_IBCStatus_TxReady_LSB 0x1E +#define QIB_6120_IBCStatus_TxReady_RMASK 0x1 +#define QIB_6120_IBCStatus_Reserved_LSB 0x7 +#define QIB_6120_IBCStatus_Reserved_RMASK 0x7FFFFF +#define QIB_6120_IBCStatus_LinkState_LSB 0x4 +#define QIB_6120_IBCStatus_LinkState_RMASK 0x7 +#define QIB_6120_IBCStatus_LinkTrainingState_LSB 0x0 +#define QIB_6120_IBCStatus_LinkTrainingState_RMASK 0xF + +#define QIB_6120_IBCCtrl_OFFS 0xC8 +#define QIB_6120_IBCCtrl_Loopback_LSB 0x3F +#define QIB_6120_IBCCtrl_Loopback_RMASK 0x1 +#define QIB_6120_IBCCtrl_LinkDownDefaultState_LSB 0x3E +#define QIB_6120_IBCCtrl_LinkDownDefaultState_RMASK 0x1 +#define QIB_6120_IBCCtrl_Reserved_LSB 0x2B +#define QIB_6120_IBCCtrl_Reserved_RMASK 0x7FFFF +#define QIB_6120_IBCCtrl_CreditScale_LSB 0x28 +#define QIB_6120_IBCCtrl_CreditScale_RMASK 0x7 +#define QIB_6120_IBCCtrl_OverrunThreshold_LSB 0x24 +#define QIB_6120_IBCCtrl_OverrunThreshold_RMASK 0xF +#define QIB_6120_IBCCtrl_PhyerrThreshold_LSB 0x20 +#define QIB_6120_IBCCtrl_PhyerrThreshold_RMASK 0xF +#define QIB_6120_IBCCtrl_Reserved1_LSB 0x1F +#define QIB_6120_IBCCtrl_Reserved1_RMASK 0x1 +#define QIB_6120_IBCCtrl_MaxPktLen_LSB 0x14 +#define QIB_6120_IBCCtrl_MaxPktLen_RMASK 0x7FF +#define QIB_6120_IBCCtrl_LinkCmd_LSB 0x12 +#define QIB_6120_IBCCtrl_LinkCmd_RMASK 0x3 +#define QIB_6120_IBCCtrl_LinkInitCmd_LSB 0x10 +#define QIB_6120_IBCCtrl_LinkInitCmd_RMASK 0x3 +#define QIB_6120_IBCCtrl_FlowCtrlWaterMark_LSB 0x8 +#define QIB_6120_IBCCtrl_FlowCtrlWaterMark_RMASK 0xFF +#define QIB_6120_IBCCtrl_FlowCtrlPeriod_LSB 0x0 +#define QIB_6120_IBCCtrl_FlowCtrlPeriod_RMASK 0xFF + +#define QIB_6120_EXTStatus_OFFS 0xD0 +#define QIB_6120_EXTStatus_GPIOIn_LSB 0x30 +#define QIB_6120_EXTStatus_GPIOIn_RMASK 0xFFFF +#define QIB_6120_EXTStatus_Reserved_LSB 0x20 +#define QIB_6120_EXTStatus_Reserved_RMASK 0xFFFF +#define QIB_6120_EXTStatus_Reserved1_LSB 0x10 +#define QIB_6120_EXTStatus_Reserved1_RMASK 0xFFFF +#define QIB_6120_EXTStatus_MemBISTFoundErr_LSB 0xF +#define QIB_6120_EXTStatus_MemBISTFoundErr_RMASK 0x1 +#define QIB_6120_EXTStatus_MemBISTEndTest_LSB 0xE +#define QIB_6120_EXTStatus_MemBISTEndTest_RMASK 0x1 +#define QIB_6120_EXTStatus_Reserved2_LSB 0x0 +#define QIB_6120_EXTStatus_Reserved2_RMASK 0x3FFF + +#define QIB_6120_EXTCtrl_OFFS 0xD8 +#define QIB_6120_EXTCtrl_GPIOOe_LSB 0x30 +#define QIB_6120_EXTCtrl_GPIOOe_RMASK 0xFFFF +#define QIB_6120_EXTCtrl_GPIOInvert_LSB 0x20 +#define QIB_6120_EXTCtrl_GPIOInvert_RMASK 0xFFFF +#define QIB_6120_EXTCtrl_Reserved_LSB 0x4 +#define QIB_6120_EXTCtrl_Reserved_RMASK 0xFFFFFFF +#define QIB_6120_EXTCtrl_LEDPriPortGreenOn_LSB 0x3 +#define QIB_6120_EXTCtrl_LEDPriPortGreenOn_RMASK 0x1 +#define QIB_6120_EXTCtrl_LEDPriPortYellowOn_LSB 0x2 +#define QIB_6120_EXTCtrl_LEDPriPortYellowOn_RMASK 0x1 +#define QIB_6120_EXTCtrl_LEDGblOkGreenOn_LSB 0x1 +#define QIB_6120_EXTCtrl_LEDGblOkGreenOn_RMASK 0x1 +#define QIB_6120_EXTCtrl_LEDGblErrRedOff_LSB 0x0 +#define QIB_6120_EXTCtrl_LEDGblErrRedOff_RMASK 0x1 + +#define QIB_6120_GPIOOut_OFFS 0xE0 + +#define QIB_6120_GPIOMask_OFFS 0xE8 + +#define QIB_6120_GPIOStatus_OFFS 0xF0 + +#define QIB_6120_GPIOClear_OFFS 0xF8 + +#define QIB_6120_RcvCtrl_OFFS 0x100 +#define QIB_6120_RcvCtrl_TailUpd_LSB 0x1F +#define QIB_6120_RcvCtrl_TailUpd_RMASK 0x1 +#define QIB_6120_RcvCtrl_RcvPartitionKeyDisable_LSB 0x1E +#define QIB_6120_RcvCtrl_RcvPartitionKeyDisable_RMASK 0x1 +#define QIB_6120_RcvCtrl_Reserved_LSB 0x15 +#define QIB_6120_RcvCtrl_Reserved_RMASK 0x1FF +#define QIB_6120_RcvCtrl_IntrAvail_LSB 0x10 +#define QIB_6120_RcvCtrl_IntrAvail_RMASK 0x1F +#define QIB_6120_RcvCtrl_Reserved1_LSB 0x9 +#define QIB_6120_RcvCtrl_Reserved1_RMASK 0x7F +#define QIB_6120_RcvCtrl_Reserved2_LSB 0x5 +#define QIB_6120_RcvCtrl_Reserved2_RMASK 0xF +#define QIB_6120_RcvCtrl_PortEnable_LSB 0x0 +#define QIB_6120_RcvCtrl_PortEnable_RMASK 0x1F + +#define QIB_6120_RcvBTHQP_OFFS 0x108 +#define QIB_6120_RcvBTHQP_BTHQP_Mask_LSB 0x1E +#define QIB_6120_RcvBTHQP_BTHQP_Mask_RMASK 0x3 +#define QIB_6120_RcvBTHQP_Reserved_LSB 0x18 +#define QIB_6120_RcvBTHQP_Reserved_RMASK 0x3F +#define QIB_6120_RcvBTHQP_RcvBTHQP_LSB 0x0 +#define QIB_6120_RcvBTHQP_RcvBTHQP_RMASK 0xFFFFFF + +#define QIB_6120_RcvHdrSize_OFFS 0x110 + +#define QIB_6120_RcvHdrCnt_OFFS 0x118 + +#define QIB_6120_RcvHdrEntSize_OFFS 0x120 + +#define QIB_6120_RcvTIDBase_OFFS 0x128 + +#define QIB_6120_RcvTIDCnt_OFFS 0x130 + +#define QIB_6120_RcvEgrBase_OFFS 0x138 + +#define QIB_6120_RcvEgrCnt_OFFS 0x140 + +#define QIB_6120_RcvBufBase_OFFS 0x148 + +#define QIB_6120_RcvBufSize_OFFS 0x150 + +#define QIB_6120_RxIntMemBase_OFFS 0x158 + +#define QIB_6120_RxIntMemSize_OFFS 0x160 + +#define QIB_6120_RcvPartitionKey_OFFS 0x168 + +#define QIB_6120_RcvPktLEDCnt_OFFS 0x178 +#define QIB_6120_RcvPktLEDCnt_ONperiod_LSB 0x20 +#define QIB_6120_RcvPktLEDCnt_ONperiod_RMASK 0xFFFFFFFF +#define QIB_6120_RcvPktLEDCnt_OFFperiod_LSB 0x0 +#define QIB_6120_RcvPktLEDCnt_OFFperiod_RMASK 0xFFFFFFFF + +#define QIB_6120_SendCtrl_OFFS 0x1C0 +#define QIB_6120_SendCtrl_Disarm_LSB 0x1F +#define QIB_6120_SendCtrl_Disarm_RMASK 0x1 +#define QIB_6120_SendCtrl_Reserved_LSB 0x17 +#define QIB_6120_SendCtrl_Reserved_RMASK 0xFF +#define QIB_6120_SendCtrl_DisarmPIOBuf_LSB 0x10 +#define QIB_6120_SendCtrl_DisarmPIOBuf_RMASK 0x7F +#define QIB_6120_SendCtrl_Reserved1_LSB 0x4 +#define QIB_6120_SendCtrl_Reserved1_RMASK 0xFFF +#define QIB_6120_SendCtrl_PIOEnable_LSB 0x3 +#define QIB_6120_SendCtrl_PIOEnable_RMASK 0x1 +#define QIB_6120_SendCtrl_PIOBufAvailUpd_LSB 0x2 +#define QIB_6120_SendCtrl_PIOBufAvailUpd_RMASK 0x1 +#define QIB_6120_SendCtrl_PIOIntBufAvail_LSB 0x1 +#define QIB_6120_SendCtrl_PIOIntBufAvail_RMASK 0x1 +#define QIB_6120_SendCtrl_Abort_LSB 0x0 +#define QIB_6120_SendCtrl_Abort_RMASK 0x1 + +#define QIB_6120_SendPIOBufBase_OFFS 0x1C8 +#define QIB_6120_SendPIOBufBase_Reserved_LSB 0x35 +#define QIB_6120_SendPIOBufBase_Reserved_RMASK 0x7FF +#define QIB_6120_SendPIOBufBase_BaseAddr_LargePIO_LSB 0x20 +#define QIB_6120_SendPIOBufBase_BaseAddr_LargePIO_RMASK 0x1FFFFF +#define QIB_6120_SendPIOBufBase_Reserved1_LSB 0x15 +#define QIB_6120_SendPIOBufBase_Reserved1_RMASK 0x7FF +#define QIB_6120_SendPIOBufBase_BaseAddr_SmallPIO_LSB 0x0 +#define QIB_6120_SendPIOBufBase_BaseAddr_SmallPIO_RMASK 0x1FFFFF + +#define QIB_6120_SendPIOSize_OFFS 0x1D0 +#define QIB_6120_SendPIOSize_Reserved_LSB 0x2D +#define QIB_6120_SendPIOSize_Reserved_RMASK 0xFFFFF +#define QIB_6120_SendPIOSize_Size_LargePIO_LSB 0x20 +#define QIB_6120_SendPIOSize_Size_LargePIO_RMASK 0x1FFF +#define QIB_6120_SendPIOSize_Reserved1_LSB 0xC +#define QIB_6120_SendPIOSize_Reserved1_RMASK 0xFFFFF +#define QIB_6120_SendPIOSize_Size_SmallPIO_LSB 0x0 +#define QIB_6120_SendPIOSize_Size_SmallPIO_RMASK 0xFFF + +#define QIB_6120_SendPIOBufCnt_OFFS 0x1D8 +#define QIB_6120_SendPIOBufCnt_Reserved_LSB 0x24 +#define QIB_6120_SendPIOBufCnt_Reserved_RMASK 0xFFFFFFF +#define QIB_6120_SendPIOBufCnt_Num_LargePIO_LSB 0x20 +#define QIB_6120_SendPIOBufCnt_Num_LargePIO_RMASK 0xF +#define QIB_6120_SendPIOBufCnt_Reserved1_LSB 0x9 +#define QIB_6120_SendPIOBufCnt_Reserved1_RMASK 0x7FFFFF +#define QIB_6120_SendPIOBufCnt_Num_SmallPIO_LSB 0x0 +#define QIB_6120_SendPIOBufCnt_Num_SmallPIO_RMASK 0x1FF + +#define QIB_6120_SendPIOAvailAddr_OFFS 0x1E0 +#define QIB_6120_SendPIOAvailAddr_SendPIOAvailAddr_LSB 0x6 +#define QIB_6120_SendPIOAvailAddr_SendPIOAvailAddr_RMASK 0x3FFFFFFFF +#define QIB_6120_SendPIOAvailAddr_Reserved_LSB 0x0 +#define QIB_6120_SendPIOAvailAddr_Reserved_RMASK 0x3F + +#define QIB_6120_SendBufErr0_OFFS 0x240 +#define QIB_6120_SendBufErr0_SendBufErrPIO_63_0_LSB 0x0 +#define QIB_6120_SendBufErr0_SendBufErrPIO_63_0_RMASK 0x0 + +#define QIB_6120_RcvHdrAddr0_OFFS 0x280 +#define QIB_6120_RcvHdrAddr0_RcvHdrAddr0_LSB 0x2 +#define QIB_6120_RcvHdrAddr0_RcvHdrAddr0_RMASK 0x3FFFFFFFFF +#define QIB_6120_RcvHdrAddr0_Reserved_LSB 0x0 +#define QIB_6120_RcvHdrAddr0_Reserved_RMASK 0x3 + +#define QIB_6120_RcvHdrTailAddr0_OFFS 0x300 +#define QIB_6120_RcvHdrTailAddr0_RcvHdrTailAddr0_LSB 0x2 +#define QIB_6120_RcvHdrTailAddr0_RcvHdrTailAddr0_RMASK 0x3FFFFFFFFF +#define QIB_6120_RcvHdrTailAddr0_Reserved_LSB 0x0 +#define QIB_6120_RcvHdrTailAddr0_Reserved_RMASK 0x3 + +#define QIB_6120_SerdesCfg0_OFFS 0x3C0 +#define QIB_6120_SerdesCfg0_DisableIBTxIdleDetect_LSB 0x3F +#define QIB_6120_SerdesCfg0_DisableIBTxIdleDetect_RMASK 0x1 +#define QIB_6120_SerdesCfg0_Reserved_LSB 0x38 +#define QIB_6120_SerdesCfg0_Reserved_RMASK 0x7F +#define QIB_6120_SerdesCfg0_RxEqCtl_LSB 0x36 +#define QIB_6120_SerdesCfg0_RxEqCtl_RMASK 0x3 +#define QIB_6120_SerdesCfg0_TxTermAdj_LSB 0x34 +#define QIB_6120_SerdesCfg0_TxTermAdj_RMASK 0x3 +#define QIB_6120_SerdesCfg0_RxTermAdj_LSB 0x32 +#define QIB_6120_SerdesCfg0_RxTermAdj_RMASK 0x3 +#define QIB_6120_SerdesCfg0_TermAdj1_LSB 0x31 +#define QIB_6120_SerdesCfg0_TermAdj1_RMASK 0x1 +#define QIB_6120_SerdesCfg0_TermAdj0_LSB 0x30 +#define QIB_6120_SerdesCfg0_TermAdj0_RMASK 0x1 +#define QIB_6120_SerdesCfg0_LPBKA_LSB 0x2F +#define QIB_6120_SerdesCfg0_LPBKA_RMASK 0x1 +#define QIB_6120_SerdesCfg0_LPBKB_LSB 0x2E +#define QIB_6120_SerdesCfg0_LPBKB_RMASK 0x1 +#define QIB_6120_SerdesCfg0_LPBKC_LSB 0x2D +#define QIB_6120_SerdesCfg0_LPBKC_RMASK 0x1 +#define QIB_6120_SerdesCfg0_LPBKD_LSB 0x2C +#define QIB_6120_SerdesCfg0_LPBKD_RMASK 0x1 +#define QIB_6120_SerdesCfg0_PW_LSB 0x2B +#define QIB_6120_SerdesCfg0_PW_RMASK 0x1 +#define QIB_6120_SerdesCfg0_RefSel_LSB 0x29 +#define QIB_6120_SerdesCfg0_RefSel_RMASK 0x3 +#define QIB_6120_SerdesCfg0_ParReset_LSB 0x28 +#define QIB_6120_SerdesCfg0_ParReset_RMASK 0x1 +#define QIB_6120_SerdesCfg0_ParLPBK_LSB 0x27 +#define QIB_6120_SerdesCfg0_ParLPBK_RMASK 0x1 +#define QIB_6120_SerdesCfg0_OffsetEn_LSB 0x26 +#define QIB_6120_SerdesCfg0_OffsetEn_RMASK 0x1 +#define QIB_6120_SerdesCfg0_Offset_LSB 0x1E +#define QIB_6120_SerdesCfg0_Offset_RMASK 0xFF +#define QIB_6120_SerdesCfg0_L2PwrDn_LSB 0x1D +#define QIB_6120_SerdesCfg0_L2PwrDn_RMASK 0x1 +#define QIB_6120_SerdesCfg0_ResetPLL_LSB 0x1C +#define QIB_6120_SerdesCfg0_ResetPLL_RMASK 0x1 +#define QIB_6120_SerdesCfg0_RxTermEnX_LSB 0x18 +#define QIB_6120_SerdesCfg0_RxTermEnX_RMASK 0xF +#define QIB_6120_SerdesCfg0_BeaconTxEnX_LSB 0x14 +#define QIB_6120_SerdesCfg0_BeaconTxEnX_RMASK 0xF +#define QIB_6120_SerdesCfg0_RxDetEnX_LSB 0x10 +#define QIB_6120_SerdesCfg0_RxDetEnX_RMASK 0xF +#define QIB_6120_SerdesCfg0_TxIdeEnX_LSB 0xC +#define QIB_6120_SerdesCfg0_TxIdeEnX_RMASK 0xF +#define QIB_6120_SerdesCfg0_RxIdleEnX_LSB 0x8 +#define QIB_6120_SerdesCfg0_RxIdleEnX_RMASK 0xF +#define QIB_6120_SerdesCfg0_L1PwrDnA_LSB 0x7 +#define QIB_6120_SerdesCfg0_L1PwrDnA_RMASK 0x1 +#define QIB_6120_SerdesCfg0_L1PwrDnB_LSB 0x6 +#define QIB_6120_SerdesCfg0_L1PwrDnB_RMASK 0x1 +#define QIB_6120_SerdesCfg0_L1PwrDnC_LSB 0x5 +#define QIB_6120_SerdesCfg0_L1PwrDnC_RMASK 0x1 +#define QIB_6120_SerdesCfg0_L1PwrDnD_LSB 0x4 +#define QIB_6120_SerdesCfg0_L1PwrDnD_RMASK 0x1 +#define QIB_6120_SerdesCfg0_ResetA_LSB 0x3 +#define QIB_6120_SerdesCfg0_ResetA_RMASK 0x1 +#define QIB_6120_SerdesCfg0_ResetB_LSB 0x2 +#define QIB_6120_SerdesCfg0_ResetB_RMASK 0x1 +#define QIB_6120_SerdesCfg0_ResetC_LSB 0x1 +#define QIB_6120_SerdesCfg0_ResetC_RMASK 0x1 +#define QIB_6120_SerdesCfg0_ResetD_LSB 0x0 +#define QIB_6120_SerdesCfg0_ResetD_RMASK 0x1 + +#define QIB_6120_SerdesStat_OFFS 0x3D0 +#define QIB_6120_SerdesStat_Reserved_LSB 0xC +#define QIB_6120_SerdesStat_Reserved_RMASK 0xFFFFFFFFFFFFF +#define QIB_6120_SerdesStat_BeaconDetA_LSB 0xB +#define QIB_6120_SerdesStat_BeaconDetA_RMASK 0x1 +#define QIB_6120_SerdesStat_BeaconDetB_LSB 0xA +#define QIB_6120_SerdesStat_BeaconDetB_RMASK 0x1 +#define QIB_6120_SerdesStat_BeaconDetC_LSB 0x9 +#define QIB_6120_SerdesStat_BeaconDetC_RMASK 0x1 +#define QIB_6120_SerdesStat_BeaconDetD_LSB 0x8 +#define QIB_6120_SerdesStat_BeaconDetD_RMASK 0x1 +#define QIB_6120_SerdesStat_RxDetA_LSB 0x7 +#define QIB_6120_SerdesStat_RxDetA_RMASK 0x1 +#define QIB_6120_SerdesStat_RxDetB_LSB 0x6 +#define QIB_6120_SerdesStat_RxDetB_RMASK 0x1 +#define QIB_6120_SerdesStat_RxDetC_LSB 0x5 +#define QIB_6120_SerdesStat_RxDetC_RMASK 0x1 +#define QIB_6120_SerdesStat_RxDetD_LSB 0x4 +#define QIB_6120_SerdesStat_RxDetD_RMASK 0x1 +#define QIB_6120_SerdesStat_TxIdleDetA_LSB 0x3 +#define QIB_6120_SerdesStat_TxIdleDetA_RMASK 0x1 +#define QIB_6120_SerdesStat_TxIdleDetB_LSB 0x2 +#define QIB_6120_SerdesStat_TxIdleDetB_RMASK 0x1 +#define QIB_6120_SerdesStat_TxIdleDetC_LSB 0x1 +#define QIB_6120_SerdesStat_TxIdleDetC_RMASK 0x1 +#define QIB_6120_SerdesStat_TxIdleDetD_LSB 0x0 +#define QIB_6120_SerdesStat_TxIdleDetD_RMASK 0x1 + +#define QIB_6120_XGXSCfg_OFFS 0x3D8 +#define QIB_6120_XGXSCfg_ArmLaunchErrorDisable_LSB 0x3F +#define QIB_6120_XGXSCfg_ArmLaunchErrorDisable_RMASK 0x1 +#define QIB_6120_XGXSCfg_Reserved_LSB 0x17 +#define QIB_6120_XGXSCfg_Reserved_RMASK 0xFFFFFFFFFF +#define QIB_6120_XGXSCfg_polarity_inv_LSB 0x13 +#define QIB_6120_XGXSCfg_polarity_inv_RMASK 0xF +#define QIB_6120_XGXSCfg_link_sync_mask_LSB 0x9 +#define QIB_6120_XGXSCfg_link_sync_mask_RMASK 0x3FF +#define QIB_6120_XGXSCfg_port_addr_LSB 0x4 +#define QIB_6120_XGXSCfg_port_addr_RMASK 0x1F +#define QIB_6120_XGXSCfg_mdd_30_LSB 0x3 +#define QIB_6120_XGXSCfg_mdd_30_RMASK 0x1 +#define QIB_6120_XGXSCfg_xcv_resetn_LSB 0x2 +#define QIB_6120_XGXSCfg_xcv_resetn_RMASK 0x1 +#define QIB_6120_XGXSCfg_Reserved1_LSB 0x1 +#define QIB_6120_XGXSCfg_Reserved1_RMASK 0x1 +#define QIB_6120_XGXSCfg_tx_rx_resetn_LSB 0x0 +#define QIB_6120_XGXSCfg_tx_rx_resetn_RMASK 0x1 + +#define QIB_6120_LBIntCnt_OFFS 0x12000 + +#define QIB_6120_LBFlowStallCnt_OFFS 0x12008 + +#define QIB_6120_TxUnsupVLErrCnt_OFFS 0x12018 + +#define QIB_6120_TxDataPktCnt_OFFS 0x12020 + +#define QIB_6120_TxFlowPktCnt_OFFS 0x12028 + +#define QIB_6120_TxDwordCnt_OFFS 0x12030 + +#define QIB_6120_TxLenErrCnt_OFFS 0x12038 + +#define QIB_6120_TxMaxMinLenErrCnt_OFFS 0x12040 + +#define QIB_6120_TxUnderrunCnt_OFFS 0x12048 + +#define QIB_6120_TxFlowStallCnt_OFFS 0x12050 + +#define QIB_6120_TxDroppedPktCnt_OFFS 0x12058 + +#define QIB_6120_RxDroppedPktCnt_OFFS 0x12060 + +#define QIB_6120_RxDataPktCnt_OFFS 0x12068 + +#define QIB_6120_RxFlowPktCnt_OFFS 0x12070 + +#define QIB_6120_RxDwordCnt_OFFS 0x12078 + +#define QIB_6120_RxLenErrCnt_OFFS 0x12080 + +#define QIB_6120_RxMaxMinLenErrCnt_OFFS 0x12088 + +#define QIB_6120_RxICRCErrCnt_OFFS 0x12090 + +#define QIB_6120_RxVCRCErrCnt_OFFS 0x12098 + +#define QIB_6120_RxFlowCtrlErrCnt_OFFS 0x120A0 + +#define QIB_6120_RxBadFormatCnt_OFFS 0x120A8 + +#define QIB_6120_RxLinkProblemCnt_OFFS 0x120B0 + +#define QIB_6120_RxEBPCnt_OFFS 0x120B8 + +#define QIB_6120_RxLPCRCErrCnt_OFFS 0x120C0 + +#define QIB_6120_RxBufOvflCnt_OFFS 0x120C8 + +#define QIB_6120_RxTIDFullErrCnt_OFFS 0x120D0 + +#define QIB_6120_RxTIDValidErrCnt_OFFS 0x120D8 + +#define QIB_6120_RxPKeyMismatchCnt_OFFS 0x120E0 + +#define QIB_6120_RxP0HdrEgrOvflCnt_OFFS 0x120E8 + +#define QIB_6120_IBStatusChangeCnt_OFFS 0x12140 + +#define QIB_6120_IBLinkErrRecoveryCnt_OFFS 0x12148 + +#define QIB_6120_IBLinkDownedCnt_OFFS 0x12150 + +#define QIB_6120_IBSymbolErrCnt_OFFS 0x12158 + +#define QIB_6120_PcieRetryBufDiagQwordCnt_OFFS 0x12170 + +#define QIB_6120_RcvEgrArray0_OFFS 0x14000 + +#define QIB_6120_RcvTIDArray0_OFFS 0x54000 + +#define QIB_6120_PIOLaunchFIFO_OFFS 0x64000 + +#define QIB_6120_SendPIOpbcCache_OFFS 0x64800 + +#define QIB_6120_RcvBuf1_OFFS 0x72000 + +#define QIB_6120_RcvBuf2_OFFS 0x75000 + +#define QIB_6120_RcvFlags_OFFS 0x77000 + +#define QIB_6120_RcvLookupBuf1_OFFS 0x79000 + +#define QIB_6120_RcvDMABuf_OFFS 0x7B000 + +#define QIB_6120_MiscRXEIntMem_OFFS 0x7C000 + +#define QIB_6120_PCIERcvBuf_OFFS 0x80000 + +#define QIB_6120_PCIERetryBuf_OFFS 0x82000 + +#define QIB_6120_PCIERcvBufRdToWrAddr_OFFS 0x84000 + +#define QIB_6120_PIOBuf0_MA_OFFS 0x100000 diff --git a/drivers/infiniband/hw/qib/qib_7220.h b/drivers/infiniband/hw/qib/qib_7220.h new file mode 100644 index 0000000..a5356cb --- /dev/null +++ b/drivers/infiniband/hw/qib/qib_7220.h @@ -0,0 +1,149 @@ +#ifndef _QIB_7220_H +#define _QIB_7220_H +/* + * Copyright (c) 2007, 2009, 2010 QLogic Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/* grab register-defs auto-generated by HW */ +#include "qib_7220_regs.h" + +/* The number of eager receive TIDs for context zero. */ +#define IBA7220_KRCVEGRCNT 2048U + +#define IB_7220_LT_STATE_CFGRCVFCFG 0x09 +#define IB_7220_LT_STATE_CFGWAITRMT 0x0a +#define IB_7220_LT_STATE_TXREVLANES 0x0d +#define IB_7220_LT_STATE_CFGENH 0x10 + +struct qib_chip_specific { + u64 __iomem *cregbase; + u64 *cntrs; + u64 *portcntrs; + spinlock_t sdepb_lock; /* serdes EPB bus */ + spinlock_t rcvmod_lock; /* protect rcvctrl shadow changes */ + spinlock_t gpio_lock; /* RMW of shadows/regs for ExtCtrl and GPIO */ + u64 hwerrmask; + u64 errormask; + u64 gpio_out; /* shadow of kr_gpio_out, for rmw ops */ + u64 gpio_mask; /* shadow the gpio mask register */ + u64 extctrl; /* shadow the gpio output enable, etc... */ + u32 ncntrs; + u32 nportcntrs; + u32 cntrnamelen; + u32 portcntrnamelen; + u32 numctxts; + u32 rcvegrcnt; + u32 autoneg_tries; + u32 serdes_first_init_done; + u32 sdmabufcnt; + u32 lastbuf_for_pio; + u32 updthresh; /* current AvailUpdThld */ + u32 updthresh_dflt; /* default AvailUpdThld */ + int irq; + u8 presets_needed; + u8 relock_timer_active; + char emsgbuf[128]; + char sdmamsgbuf[192]; + char bitsmsgbuf[64]; + struct timer_list relock_timer; + unsigned int relock_interval; /* in jiffies */ +}; + +struct qib_chippport_specific { + struct qib_pportdata pportdata; + wait_queue_head_t autoneg_wait; + struct delayed_work autoneg_work; + struct timer_list chase_timer; + /* + * these 5 fields are used to establish deltas for IB symbol + * errors and linkrecovery errors. They can be reported on + * some chips during link negotiation prior to INIT, and with + * DDR when faking DDR negotiations with non-IBTA switches. + * The chip counters are adjusted at driver unload if there is + * a non-zero delta. + */ + u64 ibdeltainprog; + u64 ibsymdelta; + u64 ibsymsnap; + u64 iblnkerrdelta; + u64 iblnkerrsnap; + u64 ibcctrl; /* kr_ibcctrl shadow */ + u64 ibcddrctrl; /* kr_ibcddrctrl shadow */ + unsigned long chase_end; + u32 last_delay_mult; +}; + +/* + * This header file provides the declarations and common definitions + * for (mostly) manipulation of the SerDes blocks within the IBA7220. + * the functions declared should only be called from within other + * 7220-related files such as qib_iba7220.c or qib_sd7220.c. + */ +int qib_sd7220_presets(struct qib_devdata *dd); +int qib_sd7220_init(struct qib_devdata *dd); +void qib_sd7220_clr_ibpar(struct qib_devdata *); +/* + * Below used for sdnum parameter, selecting one of the two sections + * used for PCIe, or the single SerDes used for IB, which is the + * only one currently used + */ +#define IB_7220_SERDES 2 + +static inline u32 qib_read_kreg32(const struct qib_devdata *dd, + const u16 regno) +{ + if (!dd->kregbase || !(dd->flags & QIB_PRESENT)) + return -1; + return readl((u32 __iomem *)&dd->kregbase[regno]); +} + +static inline u64 qib_read_kreg64(const struct qib_devdata *dd, + const u16 regno) +{ + if (!dd->kregbase || !(dd->flags & QIB_PRESENT)) + return -1; + + return readq(&dd->kregbase[regno]); +} + +static inline void qib_write_kreg(const struct qib_devdata *dd, + const u16 regno, u64 value) +{ + if (dd->kregbase) + writeq(value, &dd->kregbase[regno]); +} + +void set_7220_relock_poll(struct qib_devdata *, int); +void shutdown_7220_relock_poll(struct qib_devdata *); +void toggle_7220_rclkrls(struct qib_devdata *); + + +#endif /* _QIB_7220_H */ diff --git a/drivers/infiniband/hw/qib/qib_7220_regs.h b/drivers/infiniband/hw/qib/qib_7220_regs.h new file mode 100644 index 0000000..0da5bb7 --- /dev/null +++ b/drivers/infiniband/hw/qib/qib_7220_regs.h @@ -0,0 +1,1496 @@ +/* + * Copyright (c) 2008, 2009, 2010 QLogic Corporation. All rights reserved. + * + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +/* This file is mechanically generated from RTL. Any hand-edits will be lost! */ + +#define QIB_7220_Revision_OFFS 0x0 +#define QIB_7220_Revision_R_Simulator_LSB 0x3F +#define QIB_7220_Revision_R_Simulator_RMASK 0x1 +#define QIB_7220_Revision_R_Emulation_LSB 0x3E +#define QIB_7220_Revision_R_Emulation_RMASK 0x1 +#define QIB_7220_Revision_R_Emulation_Revcode_LSB 0x28 +#define QIB_7220_Revision_R_Emulation_Revcode_RMASK 0x3FFFFF +#define QIB_7220_Revision_BoardID_LSB 0x20 +#define QIB_7220_Revision_BoardID_RMASK 0xFF +#define QIB_7220_Revision_R_SW_LSB 0x18 +#define QIB_7220_Revision_R_SW_RMASK 0xFF +#define QIB_7220_Revision_R_Arch_LSB 0x10 +#define QIB_7220_Revision_R_Arch_RMASK 0xFF +#define QIB_7220_Revision_R_ChipRevMajor_LSB 0x8 +#define QIB_7220_Revision_R_ChipRevMajor_RMASK 0xFF +#define QIB_7220_Revision_R_ChipRevMinor_LSB 0x0 +#define QIB_7220_Revision_R_ChipRevMinor_RMASK 0xFF + +#define QIB_7220_Control_OFFS 0x8 +#define QIB_7220_Control_SyncResetExceptPcieIRAMRST_LSB 0x7 +#define QIB_7220_Control_SyncResetExceptPcieIRAMRST_RMASK 0x1 +#define QIB_7220_Control_PCIECplQDiagEn_LSB 0x6 +#define QIB_7220_Control_PCIECplQDiagEn_RMASK 0x1 +#define QIB_7220_Control_Reserved_LSB 0x5 +#define QIB_7220_Control_Reserved_RMASK 0x1 +#define QIB_7220_Control_TxLatency_LSB 0x4 +#define QIB_7220_Control_TxLatency_RMASK 0x1 +#define QIB_7220_Control_PCIERetryBufDiagEn_LSB 0x3 +#define QIB_7220_Control_PCIERetryBufDiagEn_RMASK 0x1 +#define QIB_7220_Control_LinkEn_LSB 0x2 +#define QIB_7220_Control_LinkEn_RMASK 0x1 +#define QIB_7220_Control_FreezeMode_LSB 0x1 +#define QIB_7220_Control_FreezeMode_RMASK 0x1 +#define QIB_7220_Control_SyncReset_LSB 0x0 +#define QIB_7220_Control_SyncReset_RMASK 0x1 + +#define QIB_7220_PageAlign_OFFS 0x10 + +#define QIB_7220_PortCnt_OFFS 0x18 + +#define QIB_7220_SendRegBase_OFFS 0x30 + +#define QIB_7220_UserRegBase_OFFS 0x38 + +#define QIB_7220_CntrRegBase_OFFS 0x40 + +#define QIB_7220_Scratch_OFFS 0x48 + +#define QIB_7220_IntMask_OFFS 0x68 +#define QIB_7220_IntMask_SDmaIntMask_LSB 0x3F +#define QIB_7220_IntMask_SDmaIntMask_RMASK 0x1 +#define QIB_7220_IntMask_SDmaDisabledMasked_LSB 0x3E +#define QIB_7220_IntMask_SDmaDisabledMasked_RMASK 0x1 +#define QIB_7220_IntMask_Reserved_LSB 0x31 +#define QIB_7220_IntMask_Reserved_RMASK 0x1FFF +#define QIB_7220_IntMask_RcvUrg16IntMask_LSB 0x30 +#define QIB_7220_IntMask_RcvUrg16IntMask_RMASK 0x1 +#define QIB_7220_IntMask_RcvUrg15IntMask_LSB 0x2F +#define QIB_7220_IntMask_RcvUrg15IntMask_RMASK 0x1 +#define QIB_7220_IntMask_RcvUrg14IntMask_LSB 0x2E +#define QIB_7220_IntMask_RcvUrg14IntMask_RMASK 0x1 +#define QIB_7220_IntMask_RcvUrg13IntMask_LSB 0x2D +#define QIB_7220_IntMask_RcvUrg13IntMask_RMASK 0x1 +#define QIB_7220_IntMask_RcvUrg12IntMask_LSB 0x2C +#define QIB_7220_IntMask_RcvUrg12IntMask_RMASK 0x1 +#define QIB_7220_IntMask_RcvUrg11IntMask_LSB 0x2B +#define QIB_7220_IntMask_RcvUrg11IntMask_RMASK 0x1 +#define QIB_7220_IntMask_RcvUrg10IntMask_LSB 0x2A +#define QIB_7220_IntMask_RcvUrg10IntMask_RMASK 0x1 +#define QIB_7220_IntMask_RcvUrg9IntMask_LSB 0x29 +#define QIB_7220_IntMask_RcvUrg9IntMask_RMASK 0x1 +#define QIB_7220_IntMask_RcvUrg8IntMask_LSB 0x28 +#define QIB_7220_IntMask_RcvUrg8IntMask_RMASK 0x1 +#define QIB_7220_IntMask_RcvUrg7IntMask_LSB 0x27 +#define QIB_7220_IntMask_RcvUrg7IntMask_RMASK 0x1 +#define QIB_7220_IntMask_RcvUrg6IntMask_LSB 0x26 +#define QIB_7220_IntMask_RcvUrg6IntMask_RMASK 0x1 +#define QIB_7220_IntMask_RcvUrg5IntMask_LSB 0x25 +#define QIB_7220_IntMask_RcvUrg5IntMask_RMASK 0x1 +#define QIB_7220_IntMask_RcvUrg4IntMask_LSB 0x24 +#define QIB_7220_IntMask_RcvUrg4IntMask_RMASK 0x1 +#define QIB_7220_IntMask_RcvUrg3IntMask_LSB 0x23 +#define QIB_7220_IntMask_RcvUrg3IntMask_RMASK 0x1 +#define QIB_7220_IntMask_RcvUrg2IntMask_LSB 0x22 +#define QIB_7220_IntMask_RcvUrg2IntMask_RMASK 0x1 +#define QIB_7220_IntMask_RcvUrg1IntMask_LSB 0x21 +#define QIB_7220_IntMask_RcvUrg1IntMask_RMASK 0x1 +#define QIB_7220_IntMask_RcvUrg0IntMask_LSB 0x20 +#define QIB_7220_IntMask_RcvUrg0IntMask_RMASK 0x1 +#define QIB_7220_IntMask_ErrorIntMask_LSB 0x1F +#define QIB_7220_IntMask_ErrorIntMask_RMASK 0x1 +#define QIB_7220_IntMask_PioSetIntMask_LSB 0x1E +#define QIB_7220_IntMask_PioSetIntMask_RMASK 0x1 +#define QIB_7220_IntMask_PioBufAvailIntMask_LSB 0x1D +#define QIB_7220_IntMask_PioBufAvailIntMask_RMASK 0x1 +#define QIB_7220_IntMask_assertGPIOIntMask_LSB 0x1C +#define QIB_7220_IntMask_assertGPIOIntMask_RMASK 0x1 +#define QIB_7220_IntMask_IBSerdesTrimDoneIntMask_LSB 0x1B +#define QIB_7220_IntMask_IBSerdesTrimDoneIntMask_RMASK 0x1 +#define QIB_7220_IntMask_JIntMask_LSB 0x1A +#define QIB_7220_IntMask_JIntMask_RMASK 0x1 +#define QIB_7220_IntMask_Reserved1_LSB 0x11 +#define QIB_7220_IntMask_Reserved1_RMASK 0x1FF +#define QIB_7220_IntMask_RcvAvail16IntMask_LSB 0x10 +#define QIB_7220_IntMask_RcvAvail16IntMask_RMASK 0x1 +#define QIB_7220_IntMask_RcvAvail15IntMask_LSB 0xF +#define QIB_7220_IntMask_RcvAvail15IntMask_RMASK 0x1 +#define QIB_7220_IntMask_RcvAvail14IntMask_LSB 0xE +#define QIB_7220_IntMask_RcvAvail14IntMask_RMASK 0x1 +#define QIB_7220_IntMask_RcvAvail13IntMask_LSB 0xD +#define QIB_7220_IntMask_RcvAvail13IntMask_RMASK 0x1 +#define QIB_7220_IntMask_RcvAvail12IntMask_LSB 0xC +#define QIB_7220_IntMask_RcvAvail12IntMask_RMASK 0x1 +#define QIB_7220_IntMask_RcvAvail11IntMask_LSB 0xB +#define QIB_7220_IntMask_RcvAvail11IntMask_RMASK 0x1 +#define QIB_7220_IntMask_RcvAvail10IntMask_LSB 0xA +#define QIB_7220_IntMask_RcvAvail10IntMask_RMASK 0x1 +#define QIB_7220_IntMask_RcvAvail9IntMask_LSB 0x9 +#define QIB_7220_IntMask_RcvAvail9IntMask_RMASK 0x1 +#define QIB_7220_IntMask_RcvAvail8IntMask_LSB 0x8 +#define QIB_7220_IntMask_RcvAvail8IntMask_RMASK 0x1 +#define QIB_7220_IntMask_RcvAvail7IntMask_LSB 0x7 +#define QIB_7220_IntMask_RcvAvail7IntMask_RMASK 0x1 +#define QIB_7220_IntMask_RcvAvail6IntMask_LSB 0x6 +#define QIB_7220_IntMask_RcvAvail6IntMask_RMASK 0x1 +#define QIB_7220_IntMask_RcvAvail5IntMask_LSB 0x5 +#define QIB_7220_IntMask_RcvAvail5IntMask_RMASK 0x1 +#define QIB_7220_IntMask_RcvAvail4IntMask_LSB 0x4 +#define QIB_7220_IntMask_RcvAvail4IntMask_RMASK 0x1 +#define QIB_7220_IntMask_RcvAvail3IntMask_LSB 0x3 +#define QIB_7220_IntMask_RcvAvail3IntMask_RMASK 0x1 +#define QIB_7220_IntMask_RcvAvail2IntMask_LSB 0x2 +#define QIB_7220_IntMask_RcvAvail2IntMask_RMASK 0x1 +#define QIB_7220_IntMask_RcvAvail1IntMask_LSB 0x1 +#define QIB_7220_IntMask_RcvAvail1IntMask_RMASK 0x1 +#define QIB_7220_IntMask_RcvAvail0IntMask_LSB 0x0 +#define QIB_7220_IntMask_RcvAvail0IntMask_RMASK 0x1 + +#define QIB_7220_IntStatus_OFFS 0x70 +#define QIB_7220_IntStatus_SDmaInt_LSB 0x3F +#define QIB_7220_IntStatus_SDmaInt_RMASK 0x1 +#define QIB_7220_IntStatus_SDmaDisabled_LSB 0x3E +#define QIB_7220_IntStatus_SDmaDisabled_RMASK 0x1 +#define QIB_7220_IntStatus_Reserved_LSB 0x31 +#define QIB_7220_IntStatus_Reserved_RMASK 0x1FFF +#define QIB_7220_IntStatus_RcvUrg16_LSB 0x30 +#define QIB_7220_IntStatus_RcvUrg16_RMASK 0x1 +#define QIB_7220_IntStatus_RcvUrg15_LSB 0x2F +#define QIB_7220_IntStatus_RcvUrg15_RMASK 0x1 +#define QIB_7220_IntStatus_RcvUrg14_LSB 0x2E +#define QIB_7220_IntStatus_RcvUrg14_RMASK 0x1 +#define QIB_7220_IntStatus_RcvUrg13_LSB 0x2D +#define QIB_7220_IntStatus_RcvUrg13_RMASK 0x1 +#define QIB_7220_IntStatus_RcvUrg12_LSB 0x2C +#define QIB_7220_IntStatus_RcvUrg12_RMASK 0x1 +#define QIB_7220_IntStatus_RcvUrg11_LSB 0x2B +#define QIB_7220_IntStatus_RcvUrg11_RMASK 0x1 +#define QIB_7220_IntStatus_RcvUrg10_LSB 0x2A +#define QIB_7220_IntStatus_RcvUrg10_RMASK 0x1 +#define QIB_7220_IntStatus_RcvUrg9_LSB 0x29 +#define QIB_7220_IntStatus_RcvUrg9_RMASK 0x1 +#define QIB_7220_IntStatus_RcvUrg8_LSB 0x28 +#define QIB_7220_IntStatus_RcvUrg8_RMASK 0x1 +#define QIB_7220_IntStatus_RcvUrg7_LSB 0x27 +#define QIB_7220_IntStatus_RcvUrg7_RMASK 0x1 +#define QIB_7220_IntStatus_RcvUrg6_LSB 0x26 +#define QIB_7220_IntStatus_RcvUrg6_RMASK 0x1 +#define QIB_7220_IntStatus_RcvUrg5_LSB 0x25 +#define QIB_7220_IntStatus_RcvUrg5_RMASK 0x1 +#define QIB_7220_IntStatus_RcvUrg4_LSB 0x24 +#define QIB_7220_IntStatus_RcvUrg4_RMASK 0x1 +#define QIB_7220_IntStatus_RcvUrg3_LSB 0x23 +#define QIB_7220_IntStatus_RcvUrg3_RMASK 0x1 +#define QIB_7220_IntStatus_RcvUrg2_LSB 0x22 +#define QIB_7220_IntStatus_RcvUrg2_RMASK 0x1 +#define QIB_7220_IntStatus_RcvUrg1_LSB 0x21 +#define QIB_7220_IntStatus_RcvUrg1_RMASK 0x1 +#define QIB_7220_IntStatus_RcvUrg0_LSB 0x20 +#define QIB_7220_IntStatus_RcvUrg0_RMASK 0x1 +#define QIB_7220_IntStatus_Error_LSB 0x1F +#define QIB_7220_IntStatus_Error_RMASK 0x1 +#define QIB_7220_IntStatus_PioSent_LSB 0x1E +#define QIB_7220_IntStatus_PioSent_RMASK 0x1 +#define QIB_7220_IntStatus_PioBufAvail_LSB 0x1D +#define QIB_7220_IntStatus_PioBufAvail_RMASK 0x1 +#define QIB_7220_IntStatus_assertGPIO_LSB 0x1C +#define QIB_7220_IntStatus_assertGPIO_RMASK 0x1 +#define QIB_7220_IntStatus_IBSerdesTrimDone_LSB 0x1B +#define QIB_7220_IntStatus_IBSerdesTrimDone_RMASK 0x1 +#define QIB_7220_IntStatus_JInt_LSB 0x1A +#define QIB_7220_IntStatus_JInt_RMASK 0x1 +#define QIB_7220_IntStatus_Reserved1_LSB 0x11 +#define QIB_7220_IntStatus_Reserved1_RMASK 0x1FF +#define QIB_7220_IntStatus_RcvAvail16_LSB 0x10 +#define QIB_7220_IntStatus_RcvAvail16_RMASK 0x1 +#define QIB_7220_IntStatus_RcvAvail15_LSB 0xF +#define QIB_7220_IntStatus_RcvAvail15_RMASK 0x1 +#define QIB_7220_IntStatus_RcvAvail14_LSB 0xE +#define QIB_7220_IntStatus_RcvAvail14_RMASK 0x1 +#define QIB_7220_IntStatus_RcvAvail13_LSB 0xD +#define QIB_7220_IntStatus_RcvAvail13_RMASK 0x1 +#define QIB_7220_IntStatus_RcvAvail12_LSB 0xC +#define QIB_7220_IntStatus_RcvAvail12_RMASK 0x1 +#define QIB_7220_IntStatus_RcvAvail11_LSB 0xB +#define QIB_7220_IntStatus_RcvAvail11_RMASK 0x1 +#define QIB_7220_IntStatus_RcvAvail10_LSB 0xA +#define QIB_7220_IntStatus_RcvAvail10_RMASK 0x1 +#define QIB_7220_IntStatus_RcvAvail9_LSB 0x9 +#define QIB_7220_IntStatus_RcvAvail9_RMASK 0x1 +#define QIB_7220_IntStatus_RcvAvail8_LSB 0x8 +#define QIB_7220_IntStatus_RcvAvail8_RMASK 0x1 +#define QIB_7220_IntStatus_RcvAvail7_LSB 0x7 +#define QIB_7220_IntStatus_RcvAvail7_RMASK 0x1 +#define QIB_7220_IntStatus_RcvAvail6_LSB 0x6 +#define QIB_7220_IntStatus_RcvAvail6_RMASK 0x1 +#define QIB_7220_IntStatus_RcvAvail5_LSB 0x5 +#define QIB_7220_IntStatus_RcvAvail5_RMASK 0x1 +#define QIB_7220_IntStatus_RcvAvail4_LSB 0x4 +#define QIB_7220_IntStatus_RcvAvail4_RMASK 0x1 +#define QIB_7220_IntStatus_RcvAvail3_LSB 0x3 +#define QIB_7220_IntStatus_RcvAvail3_RMASK 0x1 +#define QIB_7220_IntStatus_RcvAvail2_LSB 0x2 +#define QIB_7220_IntStatus_RcvAvail2_RMASK 0x1 +#define QIB_7220_IntStatus_RcvAvail1_LSB 0x1 +#define QIB_7220_IntStatus_RcvAvail1_RMASK 0x1 +#define QIB_7220_IntStatus_RcvAvail0_LSB 0x0 +#define QIB_7220_IntStatus_RcvAvail0_RMASK 0x1 + +#define QIB_7220_IntClear_OFFS 0x78 +#define QIB_7220_IntClear_SDmaIntClear_LSB 0x3F +#define QIB_7220_IntClear_SDmaIntClear_RMASK 0x1 +#define QIB_7220_IntClear_SDmaDisabledClear_LSB 0x3E +#define QIB_7220_IntClear_SDmaDisabledClear_RMASK 0x1 +#define QIB_7220_IntClear_Reserved_LSB 0x31 +#define QIB_7220_IntClear_Reserved_RMASK 0x1FFF +#define QIB_7220_IntClear_RcvUrg16IntClear_LSB 0x30 +#define QIB_7220_IntClear_RcvUrg16IntClear_RMASK 0x1 +#define QIB_7220_IntClear_RcvUrg15IntClear_LSB 0x2F +#define QIB_7220_IntClear_RcvUrg15IntClear_RMASK 0x1 +#define QIB_7220_IntClear_RcvUrg14IntClear_LSB 0x2E +#define QIB_7220_IntClear_RcvUrg14IntClear_RMASK 0x1 +#define QIB_7220_IntClear_RcvUrg13IntClear_LSB 0x2D +#define QIB_7220_IntClear_RcvUrg13IntClear_RMASK 0x1 +#define QIB_7220_IntClear_RcvUrg12IntClear_LSB 0x2C +#define QIB_7220_IntClear_RcvUrg12IntClear_RMASK 0x1 +#define QIB_7220_IntClear_RcvUrg11IntClear_LSB 0x2B +#define QIB_7220_IntClear_RcvUrg11IntClear_RMASK 0x1 +#define QIB_7220_IntClear_RcvUrg10IntClear_LSB 0x2A +#define QIB_7220_IntClear_RcvUrg10IntClear_RMASK 0x1 +#define QIB_7220_IntClear_RcvUrg9IntClear_LSB 0x29 +#define QIB_7220_IntClear_RcvUrg9IntClear_RMASK 0x1 +#define QIB_7220_IntClear_RcvUrg8IntClear_LSB 0x28 +#define QIB_7220_IntClear_RcvUrg8IntClear_RMASK 0x1 +#define QIB_7220_IntClear_RcvUrg7IntClear_LSB 0x27 +#define QIB_7220_IntClear_RcvUrg7IntClear_RMASK 0x1 +#define QIB_7220_IntClear_RcvUrg6IntClear_LSB 0x26 +#define QIB_7220_IntClear_RcvUrg6IntClear_RMASK 0x1 +#define QIB_7220_IntClear_RcvUrg5IntClear_LSB 0x25 +#define QIB_7220_IntClear_RcvUrg5IntClear_RMASK 0x1 +#define QIB_7220_IntClear_RcvUrg4IntClear_LSB 0x24 +#define QIB_7220_IntClear_RcvUrg4IntClear_RMASK 0x1 +#define QIB_7220_IntClear_RcvUrg3IntClear_LSB 0x23 +#define QIB_7220_IntClear_RcvUrg3IntClear_RMASK 0x1 +#define QIB_7220_IntClear_RcvUrg2IntClear_LSB 0x22 +#define QIB_7220_IntClear_RcvUrg2IntClear_RMASK 0x1 +#define QIB_7220_IntClear_RcvUrg1IntClear_LSB 0x21 +#define QIB_7220_IntClear_RcvUrg1IntClear_RMASK 0x1 +#define QIB_7220_IntClear_RcvUrg0IntClear_LSB 0x20 +#define QIB_7220_IntClear_RcvUrg0IntClear_RMASK 0x1 +#define QIB_7220_IntClear_ErrorIntClear_LSB 0x1F +#define QIB_7220_IntClear_ErrorIntClear_RMASK 0x1 +#define QIB_7220_IntClear_PioSetIntClear_LSB 0x1E +#define QIB_7220_IntClear_PioSetIntClear_RMASK 0x1 +#define QIB_7220_IntClear_PioBufAvailIntClear_LSB 0x1D +#define QIB_7220_IntClear_PioBufAvailIntClear_RMASK 0x1 +#define QIB_7220_IntClear_assertGPIOIntClear_LSB 0x1C +#define QIB_7220_IntClear_assertGPIOIntClear_RMASK 0x1 +#define QIB_7220_IntClear_IBSerdesTrimDoneClear_LSB 0x1B +#define QIB_7220_IntClear_IBSerdesTrimDoneClear_RMASK 0x1 +#define QIB_7220_IntClear_JIntClear_LSB 0x1A +#define QIB_7220_IntClear_JIntClear_RMASK 0x1 +#define QIB_7220_IntClear_Reserved1_LSB 0x11 +#define QIB_7220_IntClear_Reserved1_RMASK 0x1FF +#define QIB_7220_IntClear_RcvAvail16IntClear_LSB 0x10 +#define QIB_7220_IntClear_RcvAvail16IntClear_RMASK 0x1 +#define QIB_7220_IntClear_RcvAvail15IntClear_LSB 0xF +#define QIB_7220_IntClear_RcvAvail15IntClear_RMASK 0x1 +#define QIB_7220_IntClear_RcvAvail14IntClear_LSB 0xE +#define QIB_7220_IntClear_RcvAvail14IntClear_RMASK 0x1 +#define QIB_7220_IntClear_RcvAvail13IntClear_LSB 0xD +#define QIB_7220_IntClear_RcvAvail13IntClear_RMASK 0x1 +#define QIB_7220_IntClear_RcvAvail12IntClear_LSB 0xC +#define QIB_7220_IntClear_RcvAvail12IntClear_RMASK 0x1 +#define QIB_7220_IntClear_RcvAvail11IntClear_LSB 0xB +#define QIB_7220_IntClear_RcvAvail11IntClear_RMASK 0x1 +#define QIB_7220_IntClear_RcvAvail10IntClear_LSB 0xA +#define QIB_7220_IntClear_RcvAvail10IntClear_RMASK 0x1 +#define QIB_7220_IntClear_RcvAvail9IntClear_LSB 0x9 +#define QIB_7220_IntClear_RcvAvail9IntClear_RMASK 0x1 +#define QIB_7220_IntClear_RcvAvail8IntClear_LSB 0x8 +#define QIB_7220_IntClear_RcvAvail8IntClear_RMASK 0x1 +#define QIB_7220_IntClear_RcvAvail7IntClear_LSB 0x7 +#define QIB_7220_IntClear_RcvAvail7IntClear_RMASK 0x1 +#define QIB_7220_IntClear_RcvAvail6IntClear_LSB 0x6 +#define QIB_7220_IntClear_RcvAvail6IntClear_RMASK 0x1 +#define QIB_7220_IntClear_RcvAvail5IntClear_LSB 0x5 +#define QIB_7220_IntClear_RcvAvail5IntClear_RMASK 0x1 +#define QIB_7220_IntClear_RcvAvail4IntClear_LSB 0x4 +#define QIB_7220_IntClear_RcvAvail4IntClear_RMASK 0x1 +#define QIB_7220_IntClear_RcvAvail3IntClear_LSB 0x3 +#define QIB_7220_IntClear_RcvAvail3IntClear_RMASK 0x1 +#define QIB_7220_IntClear_RcvAvail2IntClear_LSB 0x2 +#define QIB_7220_IntClear_RcvAvail2IntClear_RMASK 0x1 +#define QIB_7220_IntClear_RcvAvail1IntClear_LSB 0x1 +#define QIB_7220_IntClear_RcvAvail1IntClear_RMASK 0x1 +#define QIB_7220_IntClear_RcvAvail0IntClear_LSB 0x0 +#define QIB_7220_IntClear_RcvAvail0IntClear_RMASK 0x1 + +#define QIB_7220_ErrMask_OFFS 0x80 +#define QIB_7220_ErrMask_Reserved_LSB 0x36 +#define QIB_7220_ErrMask_Reserved_RMASK 0x3FF +#define QIB_7220_ErrMask_InvalidEEPCmdMask_LSB 0x35 +#define QIB_7220_ErrMask_InvalidEEPCmdMask_RMASK 0x1 +#define QIB_7220_ErrMask_SDmaDescAddrMisalignErrMask_LSB 0x34 +#define QIB_7220_ErrMask_SDmaDescAddrMisalignErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_HardwareErrMask_LSB 0x33 +#define QIB_7220_ErrMask_HardwareErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_ResetNegatedMask_LSB 0x32 +#define QIB_7220_ErrMask_ResetNegatedMask_RMASK 0x1 +#define QIB_7220_ErrMask_InvalidAddrErrMask_LSB 0x31 +#define QIB_7220_ErrMask_InvalidAddrErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_IBStatusChangedMask_LSB 0x30 +#define QIB_7220_ErrMask_IBStatusChangedMask_RMASK 0x1 +#define QIB_7220_ErrMask_SDmaUnexpDataErrMask_LSB 0x2F +#define QIB_7220_ErrMask_SDmaUnexpDataErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_SDmaMissingDwErrMask_LSB 0x2E +#define QIB_7220_ErrMask_SDmaMissingDwErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_SDmaDwEnErrMask_LSB 0x2D +#define QIB_7220_ErrMask_SDmaDwEnErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_SDmaRpyTagErrMask_LSB 0x2C +#define QIB_7220_ErrMask_SDmaRpyTagErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_SDma1stDescErrMask_LSB 0x2B +#define QIB_7220_ErrMask_SDma1stDescErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_SDmaBaseErrMask_LSB 0x2A +#define QIB_7220_ErrMask_SDmaBaseErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_SDmaTailOutOfBoundErrMask_LSB 0x29 +#define QIB_7220_ErrMask_SDmaTailOutOfBoundErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_SDmaOutOfBoundErrMask_LSB 0x28 +#define QIB_7220_ErrMask_SDmaOutOfBoundErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_SDmaGenMismatchErrMask_LSB 0x27 +#define QIB_7220_ErrMask_SDmaGenMismatchErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_SendBufMisuseErrMask_LSB 0x26 +#define QIB_7220_ErrMask_SendBufMisuseErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_SendUnsupportedVLErrMask_LSB 0x25 +#define QIB_7220_ErrMask_SendUnsupportedVLErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_SendUnexpectedPktNumErrMask_LSB 0x24 +#define QIB_7220_ErrMask_SendUnexpectedPktNumErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_SendPioArmLaunchErrMask_LSB 0x23 +#define QIB_7220_ErrMask_SendPioArmLaunchErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_SendDroppedDataPktErrMask_LSB 0x22 +#define QIB_7220_ErrMask_SendDroppedDataPktErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_SendDroppedSmpPktErrMask_LSB 0x21 +#define QIB_7220_ErrMask_SendDroppedSmpPktErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_SendPktLenErrMask_LSB 0x20 +#define QIB_7220_ErrMask_SendPktLenErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_SendUnderRunErrMask_LSB 0x1F +#define QIB_7220_ErrMask_SendUnderRunErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_SendMaxPktLenErrMask_LSB 0x1E +#define QIB_7220_ErrMask_SendMaxPktLenErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_SendMinPktLenErrMask_LSB 0x1D +#define QIB_7220_ErrMask_SendMinPktLenErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_SDmaDisabledErrMask_LSB 0x1C +#define QIB_7220_ErrMask_SDmaDisabledErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_SendSpecialTriggerErrMask_LSB 0x1B +#define QIB_7220_ErrMask_SendSpecialTriggerErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_Reserved1_LSB 0x12 +#define QIB_7220_ErrMask_Reserved1_RMASK 0x1FF +#define QIB_7220_ErrMask_RcvIBLostLinkErrMask_LSB 0x11 +#define QIB_7220_ErrMask_RcvIBLostLinkErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_RcvHdrErrMask_LSB 0x10 +#define QIB_7220_ErrMask_RcvHdrErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_RcvHdrLenErrMask_LSB 0xF +#define QIB_7220_ErrMask_RcvHdrLenErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_RcvBadTidErrMask_LSB 0xE +#define QIB_7220_ErrMask_RcvBadTidErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_RcvHdrFullErrMask_LSB 0xD +#define QIB_7220_ErrMask_RcvHdrFullErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_RcvEgrFullErrMask_LSB 0xC +#define QIB_7220_ErrMask_RcvEgrFullErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_RcvBadVersionErrMask_LSB 0xB +#define QIB_7220_ErrMask_RcvBadVersionErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_RcvIBFlowErrMask_LSB 0xA +#define QIB_7220_ErrMask_RcvIBFlowErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_RcvEBPErrMask_LSB 0x9 +#define QIB_7220_ErrMask_RcvEBPErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_RcvUnsupportedVLErrMask_LSB 0x8 +#define QIB_7220_ErrMask_RcvUnsupportedVLErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_RcvUnexpectedCharErrMask_LSB 0x7 +#define QIB_7220_ErrMask_RcvUnexpectedCharErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_RcvShortPktLenErrMask_LSB 0x6 +#define QIB_7220_ErrMask_RcvShortPktLenErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_RcvLongPktLenErrMask_LSB 0x5 +#define QIB_7220_ErrMask_RcvLongPktLenErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_RcvMaxPktLenErrMask_LSB 0x4 +#define QIB_7220_ErrMask_RcvMaxPktLenErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_RcvMinPktLenErrMask_LSB 0x3 +#define QIB_7220_ErrMask_RcvMinPktLenErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_RcvICRCErrMask_LSB 0x2 +#define QIB_7220_ErrMask_RcvICRCErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_RcvVCRCErrMask_LSB 0x1 +#define QIB_7220_ErrMask_RcvVCRCErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_RcvFormatErrMask_LSB 0x0 +#define QIB_7220_ErrMask_RcvFormatErrMask_RMASK 0x1 + +#define QIB_7220_ErrStatus_OFFS 0x88 +#define QIB_7220_ErrStatus_Reserved_LSB 0x36 +#define QIB_7220_ErrStatus_Reserved_RMASK 0x3FF +#define QIB_7220_ErrStatus_InvalidEEPCmdErr_LSB 0x35 +#define QIB_7220_ErrStatus_InvalidEEPCmdErr_RMASK 0x1 +#define QIB_7220_ErrStatus_SDmaDescAddrMisalignErr_LSB 0x34 +#define QIB_7220_ErrStatus_SDmaDescAddrMisalignErr_RMASK 0x1 +#define QIB_7220_ErrStatus_HardwareErr_LSB 0x33 +#define QIB_7220_ErrStatus_HardwareErr_RMASK 0x1 +#define QIB_7220_ErrStatus_ResetNegated_LSB 0x32 +#define QIB_7220_ErrStatus_ResetNegated_RMASK 0x1 +#define QIB_7220_ErrStatus_InvalidAddrErr_LSB 0x31 +#define QIB_7220_ErrStatus_InvalidAddrErr_RMASK 0x1 +#define QIB_7220_ErrStatus_IBStatusChanged_LSB 0x30 +#define QIB_7220_ErrStatus_IBStatusChanged_RMASK 0x1 +#define QIB_7220_ErrStatus_SDmaUnexpDataErr_LSB 0x2F +#define QIB_7220_ErrStatus_SDmaUnexpDataErr_RMASK 0x1 +#define QIB_7220_ErrStatus_SDmaMissingDwErr_LSB 0x2E +#define QIB_7220_ErrStatus_SDmaMissingDwErr_RMASK 0x1 +#define QIB_7220_ErrStatus_SDmaDwEnErr_LSB 0x2D +#define QIB_7220_ErrStatus_SDmaDwEnErr_RMASK 0x1 +#define QIB_7220_ErrStatus_SDmaRpyTagErr_LSB 0x2C +#define QIB_7220_ErrStatus_SDmaRpyTagErr_RMASK 0x1 +#define QIB_7220_ErrStatus_SDma1stDescErr_LSB 0x2B +#define QIB_7220_ErrStatus_SDma1stDescErr_RMASK 0x1 +#define QIB_7220_ErrStatus_SDmaBaseErr_LSB 0x2A +#define QIB_7220_ErrStatus_SDmaBaseErr_RMASK 0x1 +#define QIB_7220_ErrStatus_SDmaTailOutOfBoundErr_LSB 0x29 +#define QIB_7220_ErrStatus_SDmaTailOutOfBoundErr_RMASK 0x1 +#define QIB_7220_ErrStatus_SDmaOutOfBoundErr_LSB 0x28 +#define QIB_7220_ErrStatus_SDmaOutOfBoundErr_RMASK 0x1 +#define QIB_7220_ErrStatus_SDmaGenMismatchErr_LSB 0x27 +#define QIB_7220_ErrStatus_SDmaGenMismatchErr_RMASK 0x1 +#define QIB_7220_ErrStatus_SendBufMisuseErr_LSB 0x26 +#define QIB_7220_ErrStatus_SendBufMisuseErr_RMASK 0x1 +#define QIB_7220_ErrStatus_SendUnsupportedVLErr_LSB 0x25 +#define QIB_7220_ErrStatus_SendUnsupportedVLErr_RMASK 0x1 +#define QIB_7220_ErrStatus_SendUnexpectedPktNumErr_LSB 0x24 +#define QIB_7220_ErrStatus_SendUnexpectedPktNumErr_RMASK 0x1 +#define QIB_7220_ErrStatus_SendPioArmLaunchErr_LSB 0x23 +#define QIB_7220_ErrStatus_SendPioArmLaunchErr_RMASK 0x1 +#define QIB_7220_ErrStatus_SendDroppedDataPktErr_LSB 0x22 +#define QIB_7220_ErrStatus_SendDroppedDataPktErr_RMASK 0x1 +#define QIB_7220_ErrStatus_SendDroppedSmpPktErr_LSB 0x21 +#define QIB_7220_ErrStatus_SendDroppedSmpPktErr_RMASK 0x1 +#define QIB_7220_ErrStatus_SendPktLenErr_LSB 0x20 +#define QIB_7220_ErrStatus_SendPktLenErr_RMASK 0x1 +#define QIB_7220_ErrStatus_SendUnderRunErr_LSB 0x1F +#define QIB_7220_ErrStatus_SendUnderRunErr_RMASK 0x1 +#define QIB_7220_ErrStatus_SendMaxPktLenErr_LSB 0x1E +#define QIB_7220_ErrStatus_SendMaxPktLenErr_RMASK 0x1 +#define QIB_7220_ErrStatus_SendMinPktLenErr_LSB 0x1D +#define QIB_7220_ErrStatus_SendMinPktLenErr_RMASK 0x1 +#define QIB_7220_ErrStatus_SDmaDisabledErr_LSB 0x1C +#define QIB_7220_ErrStatus_SDmaDisabledErr_RMASK 0x1 +#define QIB_7220_ErrStatus_SendSpecialTriggerErr_LSB 0x1B +#define QIB_7220_ErrStatus_SendSpecialTriggerErr_RMASK 0x1 +#define QIB_7220_ErrStatus_Reserved1_LSB 0x12 +#define QIB_7220_ErrStatus_Reserved1_RMASK 0x1FF +#define QIB_7220_ErrStatus_RcvIBLostLinkErr_LSB 0x11 +#define QIB_7220_ErrStatus_RcvIBLostLinkErr_RMASK 0x1 +#define QIB_7220_ErrStatus_RcvHdrErr_LSB 0x10 +#define QIB_7220_ErrStatus_RcvHdrErr_RMASK 0x1 +#define QIB_7220_ErrStatus_RcvHdrLenErr_LSB 0xF +#define QIB_7220_ErrStatus_RcvHdrLenErr_RMASK 0x1 +#define QIB_7220_ErrStatus_RcvBadTidErr_LSB 0xE +#define QIB_7220_ErrStatus_RcvBadTidErr_RMASK 0x1 +#define QIB_7220_ErrStatus_RcvHdrFullErr_LSB 0xD +#define QIB_7220_ErrStatus_RcvHdrFullErr_RMASK 0x1 +#define QIB_7220_ErrStatus_RcvEgrFullErr_LSB 0xC +#define QIB_7220_ErrStatus_RcvEgrFullErr_RMASK 0x1 +#define QIB_7220_ErrStatus_RcvBadVersionErr_LSB 0xB +#define QIB_7220_ErrStatus_RcvBadVersionErr_RMASK 0x1 +#define QIB_7220_ErrStatus_RcvIBFlowErr_LSB 0xA +#define QIB_7220_ErrStatus_RcvIBFlowErr_RMASK 0x1 +#define QIB_7220_ErrStatus_RcvEBPErr_LSB 0x9 +#define QIB_7220_ErrStatus_RcvEBPErr_RMASK 0x1 +#define QIB_7220_ErrStatus_RcvUnsupportedVLErr_LSB 0x8 +#define QIB_7220_ErrStatus_RcvUnsupportedVLErr_RMASK 0x1 +#define QIB_7220_ErrStatus_RcvUnexpectedCharErr_LSB 0x7 +#define QIB_7220_ErrStatus_RcvUnexpectedCharErr_RMASK 0x1 +#define QIB_7220_ErrStatus_RcvShortPktLenErr_LSB 0x6 +#define QIB_7220_ErrStatus_RcvShortPktLenErr_RMASK 0x1 +#define QIB_7220_ErrStatus_RcvLongPktLenErr_LSB 0x5 +#define QIB_7220_ErrStatus_RcvLongPktLenErr_RMASK 0x1 +#define QIB_7220_ErrStatus_RcvMaxPktLenErr_LSB 0x4 +#define QIB_7220_ErrStatus_RcvMaxPktLenErr_RMASK 0x1 +#define QIB_7220_ErrStatus_RcvMinPktLenErr_LSB 0x3 +#define QIB_7220_ErrStatus_RcvMinPktLenErr_RMASK 0x1 +#define QIB_7220_ErrStatus_RcvICRCErr_LSB 0x2 +#define QIB_7220_ErrStatus_RcvICRCErr_RMASK 0x1 +#define QIB_7220_ErrStatus_RcvVCRCErr_LSB 0x1 +#define QIB_7220_ErrStatus_RcvVCRCErr_RMASK 0x1 +#define QIB_7220_ErrStatus_RcvFormatErr_LSB 0x0 +#define QIB_7220_ErrStatus_RcvFormatErr_RMASK 0x1 + +#define QIB_7220_ErrClear_OFFS 0x90 +#define QIB_7220_ErrClear_Reserved_LSB 0x36 +#define QIB_7220_ErrClear_Reserved_RMASK 0x3FF +#define QIB_7220_ErrClear_InvalidEEPCmdErrClear_LSB 0x35 +#define QIB_7220_ErrClear_InvalidEEPCmdErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_SDmaDescAddrMisalignErrClear_LSB 0x34 +#define QIB_7220_ErrClear_SDmaDescAddrMisalignErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_HardwareErrClear_LSB 0x33 +#define QIB_7220_ErrClear_HardwareErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_ResetNegatedClear_LSB 0x32 +#define QIB_7220_ErrClear_ResetNegatedClear_RMASK 0x1 +#define QIB_7220_ErrClear_InvalidAddrErrClear_LSB 0x31 +#define QIB_7220_ErrClear_InvalidAddrErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_IBStatusChangedClear_LSB 0x30 +#define QIB_7220_ErrClear_IBStatusChangedClear_RMASK 0x1 +#define QIB_7220_ErrClear_SDmaUnexpDataErrClear_LSB 0x2F +#define QIB_7220_ErrClear_SDmaUnexpDataErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_SDmaMissingDwErrClear_LSB 0x2E +#define QIB_7220_ErrClear_SDmaMissingDwErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_SDmaDwEnErrClear_LSB 0x2D +#define QIB_7220_ErrClear_SDmaDwEnErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_SDmaRpyTagErrClear_LSB 0x2C +#define QIB_7220_ErrClear_SDmaRpyTagErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_SDma1stDescErrClear_LSB 0x2B +#define QIB_7220_ErrClear_SDma1stDescErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_SDmaBaseErrClear_LSB 0x2A +#define QIB_7220_ErrClear_SDmaBaseErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_SDmaTailOutOfBoundErrClear_LSB 0x29 +#define QIB_7220_ErrClear_SDmaTailOutOfBoundErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_SDmaOutOfBoundErrClear_LSB 0x28 +#define QIB_7220_ErrClear_SDmaOutOfBoundErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_SDmaGenMismatchErrClear_LSB 0x27 +#define QIB_7220_ErrClear_SDmaGenMismatchErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_SendBufMisuseErrClear_LSB 0x26 +#define QIB_7220_ErrClear_SendBufMisuseErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_SendUnsupportedVLErrClear_LSB 0x25 +#define QIB_7220_ErrClear_SendUnsupportedVLErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_SendUnexpectedPktNumErrClear_LSB 0x24 +#define QIB_7220_ErrClear_SendUnexpectedPktNumErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_SendPioArmLaunchErrClear_LSB 0x23 +#define QIB_7220_ErrClear_SendPioArmLaunchErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_SendDroppedDataPktErrClear_LSB 0x22 +#define QIB_7220_ErrClear_SendDroppedDataPktErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_SendDroppedSmpPktErrClear_LSB 0x21 +#define QIB_7220_ErrClear_SendDroppedSmpPktErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_SendPktLenErrClear_LSB 0x20 +#define QIB_7220_ErrClear_SendPktLenErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_SendUnderRunErrClear_LSB 0x1F +#define QIB_7220_ErrClear_SendUnderRunErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_SendMaxPktLenErrClear_LSB 0x1E +#define QIB_7220_ErrClear_SendMaxPktLenErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_SendMinPktLenErrClear_LSB 0x1D +#define QIB_7220_ErrClear_SendMinPktLenErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_SDmaDisabledErrClear_LSB 0x1C +#define QIB_7220_ErrClear_SDmaDisabledErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_SendSpecialTriggerErrClear_LSB 0x1B +#define QIB_7220_ErrClear_SendSpecialTriggerErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_Reserved1_LSB 0x12 +#define QIB_7220_ErrClear_Reserved1_RMASK 0x1FF +#define QIB_7220_ErrClear_RcvIBLostLinkErrClear_LSB 0x11 +#define QIB_7220_ErrClear_RcvIBLostLinkErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_RcvHdrErrClear_LSB 0x10 +#define QIB_7220_ErrClear_RcvHdrErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_RcvHdrLenErrClear_LSB 0xF +#define QIB_7220_ErrClear_RcvHdrLenErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_RcvBadTidErrClear_LSB 0xE +#define QIB_7220_ErrClear_RcvBadTidErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_RcvHdrFullErrClear_LSB 0xD +#define QIB_7220_ErrClear_RcvHdrFullErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_RcvEgrFullErrClear_LSB 0xC +#define QIB_7220_ErrClear_RcvEgrFullErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_RcvBadVersionErrClear_LSB 0xB +#define QIB_7220_ErrClear_RcvBadVersionErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_RcvIBFlowErrClear_LSB 0xA +#define QIB_7220_ErrClear_RcvIBFlowErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_RcvEBPErrClear_LSB 0x9 +#define QIB_7220_ErrClear_RcvEBPErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_RcvUnsupportedVLErrClear_LSB 0x8 +#define QIB_7220_ErrClear_RcvUnsupportedVLErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_RcvUnexpectedCharErrClear_LSB 0x7 +#define QIB_7220_ErrClear_RcvUnexpectedCharErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_RcvShortPktLenErrClear_LSB 0x6 +#define QIB_7220_ErrClear_RcvShortPktLenErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_RcvLongPktLenErrClear_LSB 0x5 +#define QIB_7220_ErrClear_RcvLongPktLenErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_RcvMaxPktLenErrClear_LSB 0x4 +#define QIB_7220_ErrClear_RcvMaxPktLenErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_RcvMinPktLenErrClear_LSB 0x3 +#define QIB_7220_ErrClear_RcvMinPktLenErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_RcvICRCErrClear_LSB 0x2 +#define QIB_7220_ErrClear_RcvICRCErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_RcvVCRCErrClear_LSB 0x1 +#define QIB_7220_ErrClear_RcvVCRCErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_RcvFormatErrClear_LSB 0x0 +#define QIB_7220_ErrClear_RcvFormatErrClear_RMASK 0x1 + +#define QIB_7220_HwErrMask_OFFS 0x98 +#define QIB_7220_HwErrMask_IBCBusFromSPCParityErrMask_LSB 0x3F +#define QIB_7220_HwErrMask_IBCBusFromSPCParityErrMask_RMASK 0x1 +#define QIB_7220_HwErrMask_IBCBusToSPCParityErrMask_LSB 0x3E +#define QIB_7220_HwErrMask_IBCBusToSPCParityErrMask_RMASK 0x1 +#define QIB_7220_HwErrMask_Clk_uC_PLLNotLockedMask_LSB 0x3D +#define QIB_7220_HwErrMask_Clk_uC_PLLNotLockedMask_RMASK 0x1 +#define QIB_7220_HwErrMask_IBSerdesPClkNotDetectMask_LSB 0x3C +#define QIB_7220_HwErrMask_IBSerdesPClkNotDetectMask_RMASK 0x1 +#define QIB_7220_HwErrMask_PCIESerdesQ3PClkNotDetectMask_LSB 0x3B +#define QIB_7220_HwErrMask_PCIESerdesQ3PClkNotDetectMask_RMASK 0x1 +#define QIB_7220_HwErrMask_PCIESerdesQ2PClkNotDetectMask_LSB 0x3A +#define QIB_7220_HwErrMask_PCIESerdesQ2PClkNotDetectMask_RMASK 0x1 +#define QIB_7220_HwErrMask_PCIESerdesQ1PClkNotDetectMask_LSB 0x39 +#define QIB_7220_HwErrMask_PCIESerdesQ1PClkNotDetectMask_RMASK 0x1 +#define QIB_7220_HwErrMask_PCIESerdesQ0PClkNotDetectMask_LSB 0x38 +#define QIB_7220_HwErrMask_PCIESerdesQ0PClkNotDetectMask_RMASK 0x1 +#define QIB_7220_HwErrMask_Reserved_LSB 0x37 +#define QIB_7220_HwErrMask_Reserved_RMASK 0x1 +#define QIB_7220_HwErrMask_PowerOnBISTFailedMask_LSB 0x36 +#define QIB_7220_HwErrMask_PowerOnBISTFailedMask_RMASK 0x1 +#define QIB_7220_HwErrMask_Reserved1_LSB 0x33 +#define QIB_7220_HwErrMask_Reserved1_RMASK 0x7 +#define QIB_7220_HwErrMask_RXEMemParityErrMask_LSB 0x2C +#define QIB_7220_HwErrMask_RXEMemParityErrMask_RMASK 0x7F +#define QIB_7220_HwErrMask_TXEMemParityErrMask_LSB 0x28 +#define QIB_7220_HwErrMask_TXEMemParityErrMask_RMASK 0xF +#define QIB_7220_HwErrMask_DDSRXEQMemoryParityErrMask_LSB 0x27 +#define QIB_7220_HwErrMask_DDSRXEQMemoryParityErrMask_RMASK 0x1 +#define QIB_7220_HwErrMask_IB_uC_MemoryParityErrMask_LSB 0x26 +#define QIB_7220_HwErrMask_IB_uC_MemoryParityErrMask_RMASK 0x1 +#define QIB_7220_HwErrMask_PCIEOct1_uC_MemoryParityErrMask_LSB 0x25 +#define QIB_7220_HwErrMask_PCIEOct1_uC_MemoryParityErrMask_RMASK 0x1 +#define QIB_7220_HwErrMask_PCIEOct0_uC_MemoryParityErrMask_LSB 0x24 +#define QIB_7220_HwErrMask_PCIEOct0_uC_MemoryParityErrMask_RMASK 0x1 +#define QIB_7220_HwErrMask_Reserved2_LSB 0x22 +#define QIB_7220_HwErrMask_Reserved2_RMASK 0x3 +#define QIB_7220_HwErrMask_PCIeBusParityErrMask_LSB 0x1F +#define QIB_7220_HwErrMask_PCIeBusParityErrMask_RMASK 0x7 +#define QIB_7220_HwErrMask_PcieCplTimeoutMask_LSB 0x1E +#define QIB_7220_HwErrMask_PcieCplTimeoutMask_RMASK 0x1 +#define QIB_7220_HwErrMask_PoisonedTLPMask_LSB 0x1D +#define QIB_7220_HwErrMask_PoisonedTLPMask_RMASK 0x1 +#define QIB_7220_HwErrMask_SDmaMemReadErrMask_LSB 0x1C +#define QIB_7220_HwErrMask_SDmaMemReadErrMask_RMASK 0x1 +#define QIB_7220_HwErrMask_Reserved3_LSB 0x8 +#define QIB_7220_HwErrMask_Reserved3_RMASK 0xFFFFF +#define QIB_7220_HwErrMask_PCIeMemParityErrMask_LSB 0x0 +#define QIB_7220_HwErrMask_PCIeMemParityErrMask_RMASK 0xFF + +#define QIB_7220_HwErrStatus_OFFS 0xA0 +#define QIB_7220_HwErrStatus_IBCBusFromSPCParityErr_LSB 0x3F +#define QIB_7220_HwErrStatus_IBCBusFromSPCParityErr_RMASK 0x1 +#define QIB_7220_HwErrStatus_IBCBusToSPCParityErr_LSB 0x3E +#define QIB_7220_HwErrStatus_IBCBusToSPCParityErr_RMASK 0x1 +#define QIB_7220_HwErrStatus_Clk_uC_PLLNotLocked_LSB 0x3D +#define QIB_7220_HwErrStatus_Clk_uC_PLLNotLocked_RMASK 0x1 +#define QIB_7220_HwErrStatus_IBSerdesPClkNotDetect_LSB 0x3C +#define QIB_7220_HwErrStatus_IBSerdesPClkNotDetect_RMASK 0x1 +#define QIB_7220_HwErrStatus_PCIESerdesQ3PClkNotDetect_LSB 0x3B +#define QIB_7220_HwErrStatus_PCIESerdesQ3PClkNotDetect_RMASK 0x1 +#define QIB_7220_HwErrStatus_PCIESerdesQ2PClkNotDetect_LSB 0x3A +#define QIB_7220_HwErrStatus_PCIESerdesQ2PClkNotDetect_RMASK 0x1 +#define QIB_7220_HwErrStatus_PCIESerdesQ1PClkNotDetect_LSB 0x39 +#define QIB_7220_HwErrStatus_PCIESerdesQ1PClkNotDetect_RMASK 0x1 +#define QIB_7220_HwErrStatus_PCIESerdesQ0PClkNotDetect_LSB 0x38 +#define QIB_7220_HwErrStatus_PCIESerdesQ0PClkNotDetect_RMASK 0x1 +#define QIB_7220_HwErrStatus_Reserved_LSB 0x37 +#define QIB_7220_HwErrStatus_Reserved_RMASK 0x1 +#define QIB_7220_HwErrStatus_PowerOnBISTFailed_LSB 0x36 +#define QIB_7220_HwErrStatus_PowerOnBISTFailed_RMASK 0x1 +#define QIB_7220_HwErrStatus_Reserved1_LSB 0x33 +#define QIB_7220_HwErrStatus_Reserved1_RMASK 0x7 +#define QIB_7220_HwErrStatus_RXEMemParity_LSB 0x2C +#define QIB_7220_HwErrStatus_RXEMemParity_RMASK 0x7F +#define QIB_7220_HwErrStatus_TXEMemParity_LSB 0x28 +#define QIB_7220_HwErrStatus_TXEMemParity_RMASK 0xF +#define QIB_7220_HwErrStatus_DDSRXEQMemoryParityErr_LSB 0x27 +#define QIB_7220_HwErrStatus_DDSRXEQMemoryParityErr_RMASK 0x1 +#define QIB_7220_HwErrStatus_IB_uC_MemoryParityErr_LSB 0x26 +#define QIB_7220_HwErrStatus_IB_uC_MemoryParityErr_RMASK 0x1 +#define QIB_7220_HwErrStatus_PCIE_uC_Oct1MemoryParityErr_LSB 0x25 +#define QIB_7220_HwErrStatus_PCIE_uC_Oct1MemoryParityErr_RMASK 0x1 +#define QIB_7220_HwErrStatus_PCIE_uC_Oct0MemoryParityErr_LSB 0x24 +#define QIB_7220_HwErrStatus_PCIE_uC_Oct0MemoryParityErr_RMASK 0x1 +#define QIB_7220_HwErrStatus_Reserved2_LSB 0x22 +#define QIB_7220_HwErrStatus_Reserved2_RMASK 0x3 +#define QIB_7220_HwErrStatus_PCIeBusParity_LSB 0x1F +#define QIB_7220_HwErrStatus_PCIeBusParity_RMASK 0x7 +#define QIB_7220_HwErrStatus_PcieCplTimeout_LSB 0x1E +#define QIB_7220_HwErrStatus_PcieCplTimeout_RMASK 0x1 +#define QIB_7220_HwErrStatus_PoisenedTLP_LSB 0x1D +#define QIB_7220_HwErrStatus_PoisenedTLP_RMASK 0x1 +#define QIB_7220_HwErrStatus_SDmaMemReadErr_LSB 0x1C +#define QIB_7220_HwErrStatus_SDmaMemReadErr_RMASK 0x1 +#define QIB_7220_HwErrStatus_Reserved3_LSB 0x8 +#define QIB_7220_HwErrStatus_Reserved3_RMASK 0xFFFFF +#define QIB_7220_HwErrStatus_PCIeMemParity_LSB 0x0 +#define QIB_7220_HwErrStatus_PCIeMemParity_RMASK 0xFF + +#define QIB_7220_HwErrClear_OFFS 0xA8 +#define QIB_7220_HwErrClear_IBCBusFromSPCParityErrClear_LSB 0x3F +#define QIB_7220_HwErrClear_IBCBusFromSPCParityErrClear_RMASK 0x1 +#define QIB_7220_HwErrClear_IBCBusToSPCparityErrClear_LSB 0x3E +#define QIB_7220_HwErrClear_IBCBusToSPCparityErrClear_RMASK 0x1 +#define QIB_7220_HwErrClear_Clk_uC_PLLNotLockedClear_LSB 0x3D +#define QIB_7220_HwErrClear_Clk_uC_PLLNotLockedClear_RMASK 0x1 +#define QIB_7220_HwErrClear_IBSerdesPClkNotDetectClear_LSB 0x3C +#define QIB_7220_HwErrClear_IBSerdesPClkNotDetectClear_RMASK 0x1 +#define QIB_7220_HwErrClear_PCIESerdesQ3PClkNotDetectClear_LSB 0x3B +#define QIB_7220_HwErrClear_PCIESerdesQ3PClkNotDetectClear_RMASK 0x1 +#define QIB_7220_HwErrClear_PCIESerdesQ2PClkNotDetectClear_LSB 0x3A +#define QIB_7220_HwErrClear_PCIESerdesQ2PClkNotDetectClear_RMASK 0x1 +#define QIB_7220_HwErrClear_PCIESerdesQ1PClkNotDetectClear_LSB 0x39 +#define QIB_7220_HwErrClear_PCIESerdesQ1PClkNotDetectClear_RMASK 0x1 +#define QIB_7220_HwErrClear_PCIESerdesQ0PClkNotDetectClear_LSB 0x38 +#define QIB_7220_HwErrClear_PCIESerdesQ0PClkNotDetectClear_RMASK 0x1 +#define QIB_7220_HwErrClear_Reserved_LSB 0x37 +#define QIB_7220_HwErrClear_Reserved_RMASK 0x1 +#define QIB_7220_HwErrClear_PowerOnBISTFailedClear_LSB 0x36 +#define QIB_7220_HwErrClear_PowerOnBISTFailedClear_RMASK 0x1 +#define QIB_7220_HwErrClear_Reserved1_LSB 0x33 +#define QIB_7220_HwErrClear_Reserved1_RMASK 0x7 +#define QIB_7220_HwErrClear_RXEMemParityClear_LSB 0x2C +#define QIB_7220_HwErrClear_RXEMemParityClear_RMASK 0x7F +#define QIB_7220_HwErrClear_TXEMemParityClear_LSB 0x28 +#define QIB_7220_HwErrClear_TXEMemParityClear_RMASK 0xF +#define QIB_7220_HwErrClear_DDSRXEQMemoryParityErrClear_LSB 0x27 +#define QIB_7220_HwErrClear_DDSRXEQMemoryParityErrClear_RMASK 0x1 +#define QIB_7220_HwErrClear_IB_uC_MemoryParityErrClear_LSB 0x26 +#define QIB_7220_HwErrClear_IB_uC_MemoryParityErrClear_RMASK 0x1 +#define QIB_7220_HwErrClear_PCIE_uC_Oct1MemoryParityErrClear_LSB 0x25 +#define QIB_7220_HwErrClear_PCIE_uC_Oct1MemoryParityErrClear_RMASK 0x1 +#define QIB_7220_HwErrClear_PCIE_uC_Oct0MemoryParityErrClear_LSB 0x24 +#define QIB_7220_HwErrClear_PCIE_uC_Oct0MemoryParityErrClear_RMASK 0x1 +#define QIB_7220_HwErrClear_Reserved2_LSB 0x22 +#define QIB_7220_HwErrClear_Reserved2_RMASK 0x3 +#define QIB_7220_HwErrClear_PCIeBusParityClr_LSB 0x1F +#define QIB_7220_HwErrClear_PCIeBusParityClr_RMASK 0x7 +#define QIB_7220_HwErrClear_PcieCplTimeoutClear_LSB 0x1E +#define QIB_7220_HwErrClear_PcieCplTimeoutClear_RMASK 0x1 +#define QIB_7220_HwErrClear_PoisonedTLPClear_LSB 0x1D +#define QIB_7220_HwErrClear_PoisonedTLPClear_RMASK 0x1 +#define QIB_7220_HwErrClear_SDmaMemReadErrClear_LSB 0x1C +#define QIB_7220_HwErrClear_SDmaMemReadErrClear_RMASK 0x1 +#define QIB_7220_HwErrClear_Reserved3_LSB 0x8 +#define QIB_7220_HwErrClear_Reserved3_RMASK 0xFFFFF +#define QIB_7220_HwErrClear_PCIeMemParityClr_LSB 0x0 +#define QIB_7220_HwErrClear_PCIeMemParityClr_RMASK 0xFF + +#define QIB_7220_HwDiagCtrl_OFFS 0xB0 +#define QIB_7220_HwDiagCtrl_ForceIBCBusFromSPCParityErr_LSB 0x3F +#define QIB_7220_HwDiagCtrl_ForceIBCBusFromSPCParityErr_RMASK 0x1 +#define QIB_7220_HwDiagCtrl_ForceIBCBusToSPCParityErr_LSB 0x3E +#define QIB_7220_HwDiagCtrl_ForceIBCBusToSPCParityErr_RMASK 0x1 +#define QIB_7220_HwDiagCtrl_CounterWrEnable_LSB 0x3D +#define QIB_7220_HwDiagCtrl_CounterWrEnable_RMASK 0x1 +#define QIB_7220_HwDiagCtrl_CounterDisable_LSB 0x3C +#define QIB_7220_HwDiagCtrl_CounterDisable_RMASK 0x1 +#define QIB_7220_HwDiagCtrl_Reserved_LSB 0x33 +#define QIB_7220_HwDiagCtrl_Reserved_RMASK 0x1FF +#define QIB_7220_HwDiagCtrl_ForceRxMemParityErr_LSB 0x2C +#define QIB_7220_HwDiagCtrl_ForceRxMemParityErr_RMASK 0x7F +#define QIB_7220_HwDiagCtrl_ForceTxMemparityErr_LSB 0x28 +#define QIB_7220_HwDiagCtrl_ForceTxMemparityErr_RMASK 0xF +#define QIB_7220_HwDiagCtrl_ForceDDSRXEQMemoryParityErr_LSB 0x27 +#define QIB_7220_HwDiagCtrl_ForceDDSRXEQMemoryParityErr_RMASK 0x1 +#define QIB_7220_HwDiagCtrl_ForceIB_uC_MemoryParityErr_LSB 0x26 +#define QIB_7220_HwDiagCtrl_ForceIB_uC_MemoryParityErr_RMASK 0x1 +#define QIB_7220_HwDiagCtrl_ForcePCIE_uC_Oct1MemoryParityErr_LSB 0x25 +#define QIB_7220_HwDiagCtrl_ForcePCIE_uC_Oct1MemoryParityErr_RMASK 0x1 +#define QIB_7220_HwDiagCtrl_ForcePCIE_uC_Oct0MemoryParityErr_LSB 0x24 +#define QIB_7220_HwDiagCtrl_ForcePCIE_uC_Oct0MemoryParityErr_RMASK 0x1 +#define QIB_7220_HwDiagCtrl_Reserved1_LSB 0x23 +#define QIB_7220_HwDiagCtrl_Reserved1_RMASK 0x1 +#define QIB_7220_HwDiagCtrl_forcePCIeBusParity_LSB 0x1F +#define QIB_7220_HwDiagCtrl_forcePCIeBusParity_RMASK 0xF +#define QIB_7220_HwDiagCtrl_Reserved2_LSB 0x8 +#define QIB_7220_HwDiagCtrl_Reserved2_RMASK 0x7FFFFF +#define QIB_7220_HwDiagCtrl_forcePCIeMemParity_LSB 0x0 +#define QIB_7220_HwDiagCtrl_forcePCIeMemParity_RMASK 0xFF + +#define QIB_7220_REG_0000B8_OFFS 0xB8 + +#define QIB_7220_IBCStatus_OFFS 0xC0 +#define QIB_7220_IBCStatus_TxCreditOk_LSB 0x1F +#define QIB_7220_IBCStatus_TxCreditOk_RMASK 0x1 +#define QIB_7220_IBCStatus_TxReady_LSB 0x1E +#define QIB_7220_IBCStatus_TxReady_RMASK 0x1 +#define QIB_7220_IBCStatus_Reserved_LSB 0xE +#define QIB_7220_IBCStatus_Reserved_RMASK 0xFFFF +#define QIB_7220_IBCStatus_IBTxLaneReversed_LSB 0xD +#define QIB_7220_IBCStatus_IBTxLaneReversed_RMASK 0x1 +#define QIB_7220_IBCStatus_IBRxLaneReversed_LSB 0xC +#define QIB_7220_IBCStatus_IBRxLaneReversed_RMASK 0x1 +#define QIB_7220_IBCStatus_IB_SERDES_TRIM_DONE_LSB 0xB +#define QIB_7220_IBCStatus_IB_SERDES_TRIM_DONE_RMASK 0x1 +#define QIB_7220_IBCStatus_DDS_RXEQ_FAIL_LSB 0xA +#define QIB_7220_IBCStatus_DDS_RXEQ_FAIL_RMASK 0x1 +#define QIB_7220_IBCStatus_LinkWidthActive_LSB 0x9 +#define QIB_7220_IBCStatus_LinkWidthActive_RMASK 0x1 +#define QIB_7220_IBCStatus_LinkSpeedActive_LSB 0x8 +#define QIB_7220_IBCStatus_LinkSpeedActive_RMASK 0x1 +#define QIB_7220_IBCStatus_LinkState_LSB 0x5 +#define QIB_7220_IBCStatus_LinkState_RMASK 0x7 +#define QIB_7220_IBCStatus_LinkTrainingState_LSB 0x0 +#define QIB_7220_IBCStatus_LinkTrainingState_RMASK 0x1F + +#define QIB_7220_IBCCtrl_OFFS 0xC8 +#define QIB_7220_IBCCtrl_Loopback_LSB 0x3F +#define QIB_7220_IBCCtrl_Loopback_RMASK 0x1 +#define QIB_7220_IBCCtrl_LinkDownDefaultState_LSB 0x3E +#define QIB_7220_IBCCtrl_LinkDownDefaultState_RMASK 0x1 +#define QIB_7220_IBCCtrl_Reserved_LSB 0x2B +#define QIB_7220_IBCCtrl_Reserved_RMASK 0x7FFFF +#define QIB_7220_IBCCtrl_CreditScale_LSB 0x28 +#define QIB_7220_IBCCtrl_CreditScale_RMASK 0x7 +#define QIB_7220_IBCCtrl_OverrunThreshold_LSB 0x24 +#define QIB_7220_IBCCtrl_OverrunThreshold_RMASK 0xF +#define QIB_7220_IBCCtrl_PhyerrThreshold_LSB 0x20 +#define QIB_7220_IBCCtrl_PhyerrThreshold_RMASK 0xF +#define QIB_7220_IBCCtrl_MaxPktLen_LSB 0x15 +#define QIB_7220_IBCCtrl_MaxPktLen_RMASK 0x7FF +#define QIB_7220_IBCCtrl_LinkCmd_LSB 0x13 +#define QIB_7220_IBCCtrl_LinkCmd_RMASK 0x3 +#define QIB_7220_IBCCtrl_LinkInitCmd_LSB 0x10 +#define QIB_7220_IBCCtrl_LinkInitCmd_RMASK 0x7 +#define QIB_7220_IBCCtrl_FlowCtrlWaterMark_LSB 0x8 +#define QIB_7220_IBCCtrl_FlowCtrlWaterMark_RMASK 0xFF +#define QIB_7220_IBCCtrl_FlowCtrlPeriod_LSB 0x0 +#define QIB_7220_IBCCtrl_FlowCtrlPeriod_RMASK 0xFF + +#define QIB_7220_EXTStatus_OFFS 0xD0 +#define QIB_7220_EXTStatus_GPIOIn_LSB 0x30 +#define QIB_7220_EXTStatus_GPIOIn_RMASK 0xFFFF +#define QIB_7220_EXTStatus_Reserved_LSB 0x20 +#define QIB_7220_EXTStatus_Reserved_RMASK 0xFFFF +#define QIB_7220_EXTStatus_Reserved1_LSB 0x10 +#define QIB_7220_EXTStatus_Reserved1_RMASK 0xFFFF +#define QIB_7220_EXTStatus_MemBISTDisabled_LSB 0xF +#define QIB_7220_EXTStatus_MemBISTDisabled_RMASK 0x1 +#define QIB_7220_EXTStatus_MemBISTEndTest_LSB 0xE +#define QIB_7220_EXTStatus_MemBISTEndTest_RMASK 0x1 +#define QIB_7220_EXTStatus_Reserved2_LSB 0x0 +#define QIB_7220_EXTStatus_Reserved2_RMASK 0x3FFF + +#define QIB_7220_EXTCtrl_OFFS 0xD8 +#define QIB_7220_EXTCtrl_GPIOOe_LSB 0x30 +#define QIB_7220_EXTCtrl_GPIOOe_RMASK 0xFFFF +#define QIB_7220_EXTCtrl_GPIOInvert_LSB 0x20 +#define QIB_7220_EXTCtrl_GPIOInvert_RMASK 0xFFFF +#define QIB_7220_EXTCtrl_Reserved_LSB 0x4 +#define QIB_7220_EXTCtrl_Reserved_RMASK 0xFFFFFFF +#define QIB_7220_EXTCtrl_LEDPriPortGreenOn_LSB 0x3 +#define QIB_7220_EXTCtrl_LEDPriPortGreenOn_RMASK 0x1 +#define QIB_7220_EXTCtrl_LEDPriPortYellowOn_LSB 0x2 +#define QIB_7220_EXTCtrl_LEDPriPortYellowOn_RMASK 0x1 +#define QIB_7220_EXTCtrl_LEDGblOkGreenOn_LSB 0x1 +#define QIB_7220_EXTCtrl_LEDGblOkGreenOn_RMASK 0x1 +#define QIB_7220_EXTCtrl_LEDGblErrRedOff_LSB 0x0 +#define QIB_7220_EXTCtrl_LEDGblErrRedOff_RMASK 0x1 + +#define QIB_7220_GPIOOut_OFFS 0xE0 + +#define QIB_7220_GPIOMask_OFFS 0xE8 + +#define QIB_7220_GPIOStatus_OFFS 0xF0 + +#define QIB_7220_GPIOClear_OFFS 0xF8 + +#define QIB_7220_RcvCtrl_OFFS 0x100 +#define QIB_7220_RcvCtrl_Reserved_LSB 0x27 +#define QIB_7220_RcvCtrl_Reserved_RMASK 0x1FFFFFF +#define QIB_7220_RcvCtrl_RcvQPMapEnable_LSB 0x26 +#define QIB_7220_RcvCtrl_RcvQPMapEnable_RMASK 0x1 +#define QIB_7220_RcvCtrl_PortCfg_LSB 0x24 +#define QIB_7220_RcvCtrl_PortCfg_RMASK 0x3 +#define QIB_7220_RcvCtrl_TailUpd_LSB 0x23 +#define QIB_7220_RcvCtrl_TailUpd_RMASK 0x1 +#define QIB_7220_RcvCtrl_RcvPartitionKeyDisable_LSB 0x22 +#define QIB_7220_RcvCtrl_RcvPartitionKeyDisable_RMASK 0x1 +#define QIB_7220_RcvCtrl_IntrAvail_LSB 0x11 +#define QIB_7220_RcvCtrl_IntrAvail_RMASK 0x1FFFF +#define QIB_7220_RcvCtrl_PortEnable_LSB 0x0 +#define QIB_7220_RcvCtrl_PortEnable_RMASK 0x1FFFF + +#define QIB_7220_RcvBTHQP_OFFS 0x108 +#define QIB_7220_RcvBTHQP_Reserved_LSB 0x18 +#define QIB_7220_RcvBTHQP_Reserved_RMASK 0xFF +#define QIB_7220_RcvBTHQP_RcvBTHQP_LSB 0x0 +#define QIB_7220_RcvBTHQP_RcvBTHQP_RMASK 0xFFFFFF + +#define QIB_7220_RcvHdrSize_OFFS 0x110 + +#define QIB_7220_RcvHdrCnt_OFFS 0x118 + +#define QIB_7220_RcvHdrEntSize_OFFS 0x120 + +#define QIB_7220_RcvTIDBase_OFFS 0x128 + +#define QIB_7220_RcvTIDCnt_OFFS 0x130 + +#define QIB_7220_RcvEgrBase_OFFS 0x138 + +#define QIB_7220_RcvEgrCnt_OFFS 0x140 + +#define QIB_7220_RcvBufBase_OFFS 0x148 + +#define QIB_7220_RcvBufSize_OFFS 0x150 + +#define QIB_7220_RxIntMemBase_OFFS 0x158 + +#define QIB_7220_RxIntMemSize_OFFS 0x160 + +#define QIB_7220_RcvPartitionKey_OFFS 0x168 + +#define QIB_7220_RcvQPMulticastPort_OFFS 0x170 +#define QIB_7220_RcvQPMulticastPort_Reserved_LSB 0x5 +#define QIB_7220_RcvQPMulticastPort_Reserved_RMASK 0x7FFFFFFFFFFFFFF +#define QIB_7220_RcvQPMulticastPort_RcvQpMcPort_LSB 0x0 +#define QIB_7220_RcvQPMulticastPort_RcvQpMcPort_RMASK 0x1F + +#define QIB_7220_RcvPktLEDCnt_OFFS 0x178 +#define QIB_7220_RcvPktLEDCnt_ONperiod_LSB 0x20 +#define QIB_7220_RcvPktLEDCnt_ONperiod_RMASK 0xFFFFFFFF +#define QIB_7220_RcvPktLEDCnt_OFFperiod_LSB 0x0 +#define QIB_7220_RcvPktLEDCnt_OFFperiod_RMASK 0xFFFFFFFF + +#define QIB_7220_IBCDDRCtrl_OFFS 0x180 +#define QIB_7220_IBCDDRCtrl_IB_DLID_MASK_LSB 0x30 +#define QIB_7220_IBCDDRCtrl_IB_DLID_MASK_RMASK 0xFFFF +#define QIB_7220_IBCDDRCtrl_IB_DLID_LSB 0x20 +#define QIB_7220_IBCDDRCtrl_IB_DLID_RMASK 0xFFFF +#define QIB_7220_IBCDDRCtrl_Reserved_LSB 0x1B +#define QIB_7220_IBCDDRCtrl_Reserved_RMASK 0x1F +#define QIB_7220_IBCDDRCtrl_HRTBT_REQ_LSB 0x1A +#define QIB_7220_IBCDDRCtrl_HRTBT_REQ_RMASK 0x1 +#define QIB_7220_IBCDDRCtrl_HRTBT_PORT_LSB 0x12 +#define QIB_7220_IBCDDRCtrl_HRTBT_PORT_RMASK 0xFF +#define QIB_7220_IBCDDRCtrl_HRTBT_AUTO_LSB 0x11 +#define QIB_7220_IBCDDRCtrl_HRTBT_AUTO_RMASK 0x1 +#define QIB_7220_IBCDDRCtrl_HRTBT_ENB_LSB 0x10 +#define QIB_7220_IBCDDRCtrl_HRTBT_ENB_RMASK 0x1 +#define QIB_7220_IBCDDRCtrl_SD_DDS_LSB 0xC +#define QIB_7220_IBCDDRCtrl_SD_DDS_RMASK 0xF +#define QIB_7220_IBCDDRCtrl_SD_DDSV_LSB 0xB +#define QIB_7220_IBCDDRCtrl_SD_DDSV_RMASK 0x1 +#define QIB_7220_IBCDDRCtrl_SD_ADD_ENB_LSB 0xA +#define QIB_7220_IBCDDRCtrl_SD_ADD_ENB_RMASK 0x1 +#define QIB_7220_IBCDDRCtrl_SD_RX_EQUAL_ENABLE_LSB 0x9 +#define QIB_7220_IBCDDRCtrl_SD_RX_EQUAL_ENABLE_RMASK 0x1 +#define QIB_7220_IBCDDRCtrl_IB_LANE_REV_SUPPORTED_LSB 0x8 +#define QIB_7220_IBCDDRCtrl_IB_LANE_REV_SUPPORTED_RMASK 0x1 +#define QIB_7220_IBCDDRCtrl_IB_POLARITY_REV_SUPP_LSB 0x7 +#define QIB_7220_IBCDDRCtrl_IB_POLARITY_REV_SUPP_RMASK 0x1 +#define QIB_7220_IBCDDRCtrl_IB_NUM_CHANNELS_LSB 0x5 +#define QIB_7220_IBCDDRCtrl_IB_NUM_CHANNELS_RMASK 0x3 +#define QIB_7220_IBCDDRCtrl_SD_SPEED_QDR_LSB 0x4 +#define QIB_7220_IBCDDRCtrl_SD_SPEED_QDR_RMASK 0x1 +#define QIB_7220_IBCDDRCtrl_SD_SPEED_DDR_LSB 0x3 +#define QIB_7220_IBCDDRCtrl_SD_SPEED_DDR_RMASK 0x1 +#define QIB_7220_IBCDDRCtrl_SD_SPEED_SDR_LSB 0x2 +#define QIB_7220_IBCDDRCtrl_SD_SPEED_SDR_RMASK 0x1 +#define QIB_7220_IBCDDRCtrl_SD_SPEED_LSB 0x1 +#define QIB_7220_IBCDDRCtrl_SD_SPEED_RMASK 0x1 +#define QIB_7220_IBCDDRCtrl_IB_ENHANCED_MODE_LSB 0x0 +#define QIB_7220_IBCDDRCtrl_IB_ENHANCED_MODE_RMASK 0x1 + +#define QIB_7220_HRTBT_GUID_OFFS 0x188 + +#define QIB_7220_IBCDDRCtrl2_OFFS 0x1A0 +#define QIB_7220_IBCDDRCtrl2_IB_BACK_PORCH_LSB 0x5 +#define QIB_7220_IBCDDRCtrl2_IB_BACK_PORCH_RMASK 0x1F +#define QIB_7220_IBCDDRCtrl2_IB_FRONT_PORCH_LSB 0x0 +#define QIB_7220_IBCDDRCtrl2_IB_FRONT_PORCH_RMASK 0x1F + +#define QIB_7220_IBCDDRStatus_OFFS 0x1A8 +#define QIB_7220_IBCDDRStatus_heartbeat_timed_out_LSB 0x24 +#define QIB_7220_IBCDDRStatus_heartbeat_timed_out_RMASK 0x1 +#define QIB_7220_IBCDDRStatus_heartbeat_crosstalk_LSB 0x20 +#define QIB_7220_IBCDDRStatus_heartbeat_crosstalk_RMASK 0xF +#define QIB_7220_IBCDDRStatus_RxEqLocalDevice_LSB 0x1E +#define QIB_7220_IBCDDRStatus_RxEqLocalDevice_RMASK 0x3 +#define QIB_7220_IBCDDRStatus_ReqDDSLocalFromRmt_LSB 0x1A +#define QIB_7220_IBCDDRStatus_ReqDDSLocalFromRmt_RMASK 0xF +#define QIB_7220_IBCDDRStatus_LinkRoundTripLatency_LSB 0x0 +#define QIB_7220_IBCDDRStatus_LinkRoundTripLatency_RMASK 0x3FFFFFF + +#define QIB_7220_JIntReload_OFFS 0x1B0 +#define QIB_7220_JIntReload_J_limit_reload_LSB 0x10 +#define QIB_7220_JIntReload_J_limit_reload_RMASK 0xFFFF +#define QIB_7220_JIntReload_J_reload_LSB 0x0 +#define QIB_7220_JIntReload_J_reload_RMASK 0xFFFF + +#define QIB_7220_IBNCModeCtrl_OFFS 0x1B8 +#define QIB_7220_IBNCModeCtrl_Reserved_LSB 0x1A +#define QIB_7220_IBNCModeCtrl_Reserved_RMASK 0x3FFFFFFFFF +#define QIB_7220_IBNCModeCtrl_TSMCode_TS2_LSB 0x11 +#define QIB_7220_IBNCModeCtrl_TSMCode_TS2_RMASK 0x1FF +#define QIB_7220_IBNCModeCtrl_TSMCode_TS1_LSB 0x8 +#define QIB_7220_IBNCModeCtrl_TSMCode_TS1_RMASK 0x1FF +#define QIB_7220_IBNCModeCtrl_Reserved1_LSB 0x3 +#define QIB_7220_IBNCModeCtrl_Reserved1_RMASK 0x1F +#define QIB_7220_IBNCModeCtrl_TSMEnable_ignore_TSM_on_rx_LSB 0x2 +#define QIB_7220_IBNCModeCtrl_TSMEnable_ignore_TSM_on_rx_RMASK 0x1 +#define QIB_7220_IBNCModeCtrl_TSMEnable_send_TS2_LSB 0x1 +#define QIB_7220_IBNCModeCtrl_TSMEnable_send_TS2_RMASK 0x1 +#define QIB_7220_IBNCModeCtrl_TSMEnable_send_TS1_LSB 0x0 +#define QIB_7220_IBNCModeCtrl_TSMEnable_send_TS1_RMASK 0x1 + +#define QIB_7220_SendCtrl_OFFS 0x1C0 +#define QIB_7220_SendCtrl_Disarm_LSB 0x1F +#define QIB_7220_SendCtrl_Disarm_RMASK 0x1 +#define QIB_7220_SendCtrl_Reserved_LSB 0x1D +#define QIB_7220_SendCtrl_Reserved_RMASK 0x3 +#define QIB_7220_SendCtrl_AvailUpdThld_LSB 0x18 +#define QIB_7220_SendCtrl_AvailUpdThld_RMASK 0x1F +#define QIB_7220_SendCtrl_DisarmPIOBuf_LSB 0x10 +#define QIB_7220_SendCtrl_DisarmPIOBuf_RMASK 0xFF +#define QIB_7220_SendCtrl_Reserved1_LSB 0xD +#define QIB_7220_SendCtrl_Reserved1_RMASK 0x7 +#define QIB_7220_SendCtrl_SDmaHalt_LSB 0xC +#define QIB_7220_SendCtrl_SDmaHalt_RMASK 0x1 +#define QIB_7220_SendCtrl_SDmaEnable_LSB 0xB +#define QIB_7220_SendCtrl_SDmaEnable_RMASK 0x1 +#define QIB_7220_SendCtrl_SDmaSingleDescriptor_LSB 0xA +#define QIB_7220_SendCtrl_SDmaSingleDescriptor_RMASK 0x1 +#define QIB_7220_SendCtrl_SDmaIntEnable_LSB 0x9 +#define QIB_7220_SendCtrl_SDmaIntEnable_RMASK 0x1 +#define QIB_7220_SendCtrl_Reserved2_LSB 0x5 +#define QIB_7220_SendCtrl_Reserved2_RMASK 0xF +#define QIB_7220_SendCtrl_SSpecialTriggerEn_LSB 0x4 +#define QIB_7220_SendCtrl_SSpecialTriggerEn_RMASK 0x1 +#define QIB_7220_SendCtrl_SPioEnable_LSB 0x3 +#define QIB_7220_SendCtrl_SPioEnable_RMASK 0x1 +#define QIB_7220_SendCtrl_SendBufAvailUpd_LSB 0x2 +#define QIB_7220_SendCtrl_SendBufAvailUpd_RMASK 0x1 +#define QIB_7220_SendCtrl_SendIntBufAvail_LSB 0x1 +#define QIB_7220_SendCtrl_SendIntBufAvail_RMASK 0x1 +#define QIB_7220_SendCtrl_Abort_LSB 0x0 +#define QIB_7220_SendCtrl_Abort_RMASK 0x1 + +#define QIB_7220_SendBufBase_OFFS 0x1C8 +#define QIB_7220_SendBufBase_Reserved_LSB 0x35 +#define QIB_7220_SendBufBase_Reserved_RMASK 0x7FF +#define QIB_7220_SendBufBase_BaseAddr_LargePIO_LSB 0x20 +#define QIB_7220_SendBufBase_BaseAddr_LargePIO_RMASK 0x1FFFFF +#define QIB_7220_SendBufBase_Reserved1_LSB 0x15 +#define QIB_7220_SendBufBase_Reserved1_RMASK 0x7FF +#define QIB_7220_SendBufBase_BaseAddr_SmallPIO_LSB 0x0 +#define QIB_7220_SendBufBase_BaseAddr_SmallPIO_RMASK 0x1FFFFF + +#define QIB_7220_SendBufSize_OFFS 0x1D0 +#define QIB_7220_SendBufSize_Reserved_LSB 0x2D +#define QIB_7220_SendBufSize_Reserved_RMASK 0xFFFFF +#define QIB_7220_SendBufSize_Size_LargePIO_LSB 0x20 +#define QIB_7220_SendBufSize_Size_LargePIO_RMASK 0x1FFF +#define QIB_7220_SendBufSize_Reserved1_LSB 0xC +#define QIB_7220_SendBufSize_Reserved1_RMASK 0xFFFFF +#define QIB_7220_SendBufSize_Size_SmallPIO_LSB 0x0 +#define QIB_7220_SendBufSize_Size_SmallPIO_RMASK 0xFFF + +#define QIB_7220_SendBufCnt_OFFS 0x1D8 +#define QIB_7220_SendBufCnt_Reserved_LSB 0x24 +#define QIB_7220_SendBufCnt_Reserved_RMASK 0xFFFFFFF +#define QIB_7220_SendBufCnt_Num_LargeBuffers_LSB 0x20 +#define QIB_7220_SendBufCnt_Num_LargeBuffers_RMASK 0xF +#define QIB_7220_SendBufCnt_Reserved1_LSB 0x9 +#define QIB_7220_SendBufCnt_Reserved1_RMASK 0x7FFFFF +#define QIB_7220_SendBufCnt_Num_SmallBuffers_LSB 0x0 +#define QIB_7220_SendBufCnt_Num_SmallBuffers_RMASK 0x1FF + +#define QIB_7220_SendBufAvailAddr_OFFS 0x1E0 +#define QIB_7220_SendBufAvailAddr_SendBufAvailAddr_LSB 0x6 +#define QIB_7220_SendBufAvailAddr_SendBufAvailAddr_RMASK 0x3FFFFFFFF +#define QIB_7220_SendBufAvailAddr_Reserved_LSB 0x0 +#define QIB_7220_SendBufAvailAddr_Reserved_RMASK 0x3F + +#define QIB_7220_TxIntMemBase_OFFS 0x1E8 + +#define QIB_7220_TxIntMemSize_OFFS 0x1F0 + +#define QIB_7220_SendDmaBase_OFFS 0x1F8 +#define QIB_7220_SendDmaBase_Reserved_LSB 0x30 +#define QIB_7220_SendDmaBase_Reserved_RMASK 0xFFFF +#define QIB_7220_SendDmaBase_SendDmaBase_LSB 0x0 +#define QIB_7220_SendDmaBase_SendDmaBase_RMASK 0xFFFFFFFFFFFF + +#define QIB_7220_SendDmaLenGen_OFFS 0x200 +#define QIB_7220_SendDmaLenGen_Reserved_LSB 0x13 +#define QIB_7220_SendDmaLenGen_Reserved_RMASK 0x1FFFFFFFFFFF +#define QIB_7220_SendDmaLenGen_Generation_LSB 0x10 +#define QIB_7220_SendDmaLenGen_Generation_MSB 0x12 +#define QIB_7220_SendDmaLenGen_Generation_RMASK 0x7 +#define QIB_7220_SendDmaLenGen_Length_LSB 0x0 +#define QIB_7220_SendDmaLenGen_Length_RMASK 0xFFFF + +#define QIB_7220_SendDmaTail_OFFS 0x208 +#define QIB_7220_SendDmaTail_Reserved_LSB 0x10 +#define QIB_7220_SendDmaTail_Reserved_RMASK 0xFFFFFFFFFFFF +#define QIB_7220_SendDmaTail_SendDmaTail_LSB 0x0 +#define QIB_7220_SendDmaTail_SendDmaTail_RMASK 0xFFFF + +#define QIB_7220_SendDmaHead_OFFS 0x210 +#define QIB_7220_SendDmaHead_Reserved_LSB 0x30 +#define QIB_7220_SendDmaHead_Reserved_RMASK 0xFFFF +#define QIB_7220_SendDmaHead_InternalSendDmaHead_LSB 0x20 +#define QIB_7220_SendDmaHead_InternalSendDmaHead_RMASK 0xFFFF +#define QIB_7220_SendDmaHead_Reserved1_LSB 0x10 +#define QIB_7220_SendDmaHead_Reserved1_RMASK 0xFFFF +#define QIB_7220_SendDmaHead_SendDmaHead_LSB 0x0 +#define QIB_7220_SendDmaHead_SendDmaHead_RMASK 0xFFFF + +#define QIB_7220_SendDmaHeadAddr_OFFS 0x218 +#define QIB_7220_SendDmaHeadAddr_Reserved_LSB 0x30 +#define QIB_7220_SendDmaHeadAddr_Reserved_RMASK 0xFFFF +#define QIB_7220_SendDmaHeadAddr_SendDmaHeadAddr_LSB 0x0 +#define QIB_7220_SendDmaHeadAddr_SendDmaHeadAddr_RMASK 0xFFFFFFFFFFFF + +#define QIB_7220_SendDmaBufMask0_OFFS 0x220 +#define QIB_7220_SendDmaBufMask0_BufMask_63_0_LSB 0x0 +#define QIB_7220_SendDmaBufMask0_BufMask_63_0_RMASK 0x0 + +#define QIB_7220_SendDmaStatus_OFFS 0x238 +#define QIB_7220_SendDmaStatus_ScoreBoardDrainInProg_LSB 0x3F +#define QIB_7220_SendDmaStatus_ScoreBoardDrainInProg_RMASK 0x1 +#define QIB_7220_SendDmaStatus_AbortInProg_LSB 0x3E +#define QIB_7220_SendDmaStatus_AbortInProg_RMASK 0x1 +#define QIB_7220_SendDmaStatus_InternalSDmaEnable_LSB 0x3D +#define QIB_7220_SendDmaStatus_InternalSDmaEnable_RMASK 0x1 +#define QIB_7220_SendDmaStatus_ScbDescIndex_13_0_LSB 0x2F +#define QIB_7220_SendDmaStatus_ScbDescIndex_13_0_RMASK 0x3FFF +#define QIB_7220_SendDmaStatus_RpyLowAddr_6_0_LSB 0x28 +#define QIB_7220_SendDmaStatus_RpyLowAddr_6_0_RMASK 0x7F +#define QIB_7220_SendDmaStatus_RpyTag_7_0_LSB 0x20 +#define QIB_7220_SendDmaStatus_RpyTag_7_0_RMASK 0xFF +#define QIB_7220_SendDmaStatus_ScbFull_LSB 0x1F +#define QIB_7220_SendDmaStatus_ScbFull_RMASK 0x1 +#define QIB_7220_SendDmaStatus_ScbEmpty_LSB 0x1E +#define QIB_7220_SendDmaStatus_ScbEmpty_RMASK 0x1 +#define QIB_7220_SendDmaStatus_ScbEntryValid_LSB 0x1D +#define QIB_7220_SendDmaStatus_ScbEntryValid_RMASK 0x1 +#define QIB_7220_SendDmaStatus_ScbFetchDescFlag_LSB 0x1C +#define QIB_7220_SendDmaStatus_ScbFetchDescFlag_RMASK 0x1 +#define QIB_7220_SendDmaStatus_SplFifoReadyToGo_LSB 0x1B +#define QIB_7220_SendDmaStatus_SplFifoReadyToGo_RMASK 0x1 +#define QIB_7220_SendDmaStatus_SplFifoDisarmed_LSB 0x1A +#define QIB_7220_SendDmaStatus_SplFifoDisarmed_RMASK 0x1 +#define QIB_7220_SendDmaStatus_SplFifoEmpty_LSB 0x19 +#define QIB_7220_SendDmaStatus_SplFifoEmpty_RMASK 0x1 +#define QIB_7220_SendDmaStatus_SplFifoFull_LSB 0x18 +#define QIB_7220_SendDmaStatus_SplFifoFull_RMASK 0x1 +#define QIB_7220_SendDmaStatus_SplFifoBufNum_LSB 0x10 +#define QIB_7220_SendDmaStatus_SplFifoBufNum_RMASK 0xFF +#define QIB_7220_SendDmaStatus_SplFifoDescIndex_LSB 0x0 +#define QIB_7220_SendDmaStatus_SplFifoDescIndex_RMASK 0xFFFF + +#define QIB_7220_SendBufErr0_OFFS 0x240 +#define QIB_7220_SendBufErr0_SendBufErr_63_0_LSB 0x0 +#define QIB_7220_SendBufErr0_SendBufErr_63_0_RMASK 0x0 + +#define QIB_7220_RcvHdrAddr0_OFFS 0x270 +#define QIB_7220_RcvHdrAddr0_RcvHdrAddr0_LSB 0x2 +#define QIB_7220_RcvHdrAddr0_RcvHdrAddr0_RMASK 0x3FFFFFFFFF +#define QIB_7220_RcvHdrAddr0_Reserved_LSB 0x0 +#define QIB_7220_RcvHdrAddr0_Reserved_RMASK 0x3 + +#define QIB_7220_RcvHdrTailAddr0_OFFS 0x300 +#define QIB_7220_RcvHdrTailAddr0_RcvHdrTailAddr0_LSB 0x2 +#define QIB_7220_RcvHdrTailAddr0_RcvHdrTailAddr0_RMASK 0x3FFFFFFFFF +#define QIB_7220_RcvHdrTailAddr0_Reserved_LSB 0x0 +#define QIB_7220_RcvHdrTailAddr0_Reserved_RMASK 0x3 + +#define QIB_7220_ibsd_epb_access_ctrl_OFFS 0x3C0 +#define QIB_7220_ibsd_epb_access_ctrl_sw_ib_epb_req_granted_LSB 0x8 +#define QIB_7220_ibsd_epb_access_ctrl_sw_ib_epb_req_granted_RMASK 0x1 +#define QIB_7220_ibsd_epb_access_ctrl_Reserved_LSB 0x1 +#define QIB_7220_ibsd_epb_access_ctrl_Reserved_RMASK 0x7F +#define QIB_7220_ibsd_epb_access_ctrl_sw_ib_epb_req_LSB 0x0 +#define QIB_7220_ibsd_epb_access_ctrl_sw_ib_epb_req_RMASK 0x1 + +#define QIB_7220_ibsd_epb_transaction_reg_OFFS 0x3C8 +#define QIB_7220_ibsd_epb_transaction_reg_ib_epb_rdy_LSB 0x1F +#define QIB_7220_ibsd_epb_transaction_reg_ib_epb_rdy_RMASK 0x1 +#define QIB_7220_ibsd_epb_transaction_reg_ib_epb_req_error_LSB 0x1E +#define QIB_7220_ibsd_epb_transaction_reg_ib_epb_req_error_RMASK 0x1 +#define QIB_7220_ibsd_epb_transaction_reg_Reserved_LSB 0x1D +#define QIB_7220_ibsd_epb_transaction_reg_Reserved_RMASK 0x1 +#define QIB_7220_ibsd_epb_transaction_reg_mem_data_parity_LSB 0x1C +#define QIB_7220_ibsd_epb_transaction_reg_mem_data_parity_RMASK 0x1 +#define QIB_7220_ibsd_epb_transaction_reg_Reserved1_LSB 0x1B +#define QIB_7220_ibsd_epb_transaction_reg_Reserved1_RMASK 0x1 +#define QIB_7220_ibsd_epb_transaction_reg_ib_epb_cs_LSB 0x19 +#define QIB_7220_ibsd_epb_transaction_reg_ib_epb_cs_RMASK 0x3 +#define QIB_7220_ibsd_epb_transaction_reg_ib_epb_read_write_LSB 0x18 +#define QIB_7220_ibsd_epb_transaction_reg_ib_epb_read_write_RMASK 0x1 +#define QIB_7220_ibsd_epb_transaction_reg_Reserved2_LSB 0x17 +#define QIB_7220_ibsd_epb_transaction_reg_Reserved2_RMASK 0x1 +#define QIB_7220_ibsd_epb_transaction_reg_ib_epb_address_LSB 0x8 +#define QIB_7220_ibsd_epb_transaction_reg_ib_epb_address_RMASK 0x7FFF +#define QIB_7220_ibsd_epb_transaction_reg_ib_epb_data_LSB 0x0 +#define QIB_7220_ibsd_epb_transaction_reg_ib_epb_data_RMASK 0xFF + +#define QIB_7220_XGXSCfg_OFFS 0x3D8 +#define QIB_7220_XGXSCfg_sel_link_down_for_fctrl_lane_sync_reset_LSB 0x3F +#define QIB_7220_XGXSCfg_sel_link_down_for_fctrl_lane_sync_reset_RMASK 0x1 +#define QIB_7220_XGXSCfg_Reserved_LSB 0x13 +#define QIB_7220_XGXSCfg_Reserved_RMASK 0xFFFFFFFFFFF +#define QIB_7220_XGXSCfg_link_sync_mask_LSB 0x9 +#define QIB_7220_XGXSCfg_link_sync_mask_RMASK 0x3FF +#define QIB_7220_XGXSCfg_Reserved1_LSB 0x3 +#define QIB_7220_XGXSCfg_Reserved1_RMASK 0x3F +#define QIB_7220_XGXSCfg_xcv_reset_LSB 0x2 +#define QIB_7220_XGXSCfg_xcv_reset_RMASK 0x1 +#define QIB_7220_XGXSCfg_Reserved2_LSB 0x1 +#define QIB_7220_XGXSCfg_Reserved2_RMASK 0x1 +#define QIB_7220_XGXSCfg_tx_rx_reset_LSB 0x0 +#define QIB_7220_XGXSCfg_tx_rx_reset_RMASK 0x1 + +#define QIB_7220_IBSerDesCtrl_OFFS 0x3E0 +#define QIB_7220_IBSerDesCtrl_Reserved_LSB 0x2D +#define QIB_7220_IBSerDesCtrl_Reserved_RMASK 0x7FFFF +#define QIB_7220_IBSerDesCtrl_INT_uC_LSB 0x2C +#define QIB_7220_IBSerDesCtrl_INT_uC_RMASK 0x1 +#define QIB_7220_IBSerDesCtrl_CKSEL_uC_LSB 0x2A +#define QIB_7220_IBSerDesCtrl_CKSEL_uC_RMASK 0x3 +#define QIB_7220_IBSerDesCtrl_PLLN_LSB 0x28 +#define QIB_7220_IBSerDesCtrl_PLLN_RMASK 0x3 +#define QIB_7220_IBSerDesCtrl_PLLM_LSB 0x25 +#define QIB_7220_IBSerDesCtrl_PLLM_RMASK 0x7 +#define QIB_7220_IBSerDesCtrl_TXOBPD_LSB 0x24 +#define QIB_7220_IBSerDesCtrl_TXOBPD_RMASK 0x1 +#define QIB_7220_IBSerDesCtrl_TWC_LSB 0x23 +#define QIB_7220_IBSerDesCtrl_TWC_RMASK 0x1 +#define QIB_7220_IBSerDesCtrl_RXIDLE_LSB 0x22 +#define QIB_7220_IBSerDesCtrl_RXIDLE_RMASK 0x1 +#define QIB_7220_IBSerDesCtrl_RXINV_LSB 0x21 +#define QIB_7220_IBSerDesCtrl_RXINV_RMASK 0x1 +#define QIB_7220_IBSerDesCtrl_TXINV_LSB 0x20 +#define QIB_7220_IBSerDesCtrl_TXINV_RMASK 0x1 +#define QIB_7220_IBSerDesCtrl_Reserved1_LSB 0x12 +#define QIB_7220_IBSerDesCtrl_Reserved1_RMASK 0x3FFF +#define QIB_7220_IBSerDesCtrl_NumSerDesRegsToWrForRXEQ_LSB 0xD +#define QIB_7220_IBSerDesCtrl_NumSerDesRegsToWrForRXEQ_RMASK 0x1F +#define QIB_7220_IBSerDesCtrl_NumSerDesRegsToWrForDDS_LSB 0x8 +#define QIB_7220_IBSerDesCtrl_NumSerDesRegsToWrForDDS_RMASK 0x1F +#define QIB_7220_IBSerDesCtrl_Reserved2_LSB 0x1 +#define QIB_7220_IBSerDesCtrl_Reserved2_RMASK 0x7F +#define QIB_7220_IBSerDesCtrl_ResetIB_uC_Core_LSB 0x0 +#define QIB_7220_IBSerDesCtrl_ResetIB_uC_Core_RMASK 0x1 + +#define QIB_7220_pciesd_epb_access_ctrl_OFFS 0x400 +#define QIB_7220_pciesd_epb_access_ctrl_sw_pcie_epb_req_granted_LSB 0x8 +#define QIB_7220_pciesd_epb_access_ctrl_sw_pcie_epb_req_granted_RMASK 0x1 +#define QIB_7220_pciesd_epb_access_ctrl_Reserved_LSB 0x3 +#define QIB_7220_pciesd_epb_access_ctrl_Reserved_RMASK 0x1F +#define QIB_7220_pciesd_epb_access_ctrl_sw_pcieepb_star_en_LSB 0x1 +#define QIB_7220_pciesd_epb_access_ctrl_sw_pcieepb_star_en_RMASK 0x3 +#define QIB_7220_pciesd_epb_access_ctrl_sw_pcie_epb_req_LSB 0x0 +#define QIB_7220_pciesd_epb_access_ctrl_sw_pcie_epb_req_RMASK 0x1 + +#define QIB_7220_pciesd_epb_transaction_reg_OFFS 0x408 +#define QIB_7220_pciesd_epb_transaction_reg_pcie_epb_rdy_LSB 0x1F +#define QIB_7220_pciesd_epb_transaction_reg_pcie_epb_rdy_RMASK 0x1 +#define QIB_7220_pciesd_epb_transaction_reg_pcie_epb_req_error_LSB 0x1E +#define QIB_7220_pciesd_epb_transaction_reg_pcie_epb_req_error_RMASK 0x1 +#define QIB_7220_pciesd_epb_transaction_reg_Reserved_LSB 0x1D +#define QIB_7220_pciesd_epb_transaction_reg_Reserved_RMASK 0x1 +#define QIB_7220_pciesd_epb_transaction_reg_mem_data_parity_LSB 0x1C +#define QIB_7220_pciesd_epb_transaction_reg_mem_data_parity_RMASK 0x1 +#define QIB_7220_pciesd_epb_transaction_reg_pcie_epb_cs_LSB 0x19 +#define QIB_7220_pciesd_epb_transaction_reg_pcie_epb_cs_RMASK 0x7 +#define QIB_7220_pciesd_epb_transaction_reg_pcie_epb_read_write_LSB 0x18 +#define QIB_7220_pciesd_epb_transaction_reg_pcie_epb_read_write_RMASK 0x1 +#define QIB_7220_pciesd_epb_transaction_reg_Reserved1_LSB 0x17 +#define QIB_7220_pciesd_epb_transaction_reg_Reserved1_RMASK 0x1 +#define QIB_7220_pciesd_epb_transaction_reg_pcie_epb_address_LSB 0x8 +#define QIB_7220_pciesd_epb_transaction_reg_pcie_epb_address_RMASK 0x7FFF +#define QIB_7220_pciesd_epb_transaction_reg_pcie_epb_data_LSB 0x0 +#define QIB_7220_pciesd_epb_transaction_reg_pcie_epb_data_RMASK 0xFF + +#define QIB_7220_SerDes_DDSRXEQ0_OFFS 0x500 +#define QIB_7220_SerDes_DDSRXEQ0_reg_addr_LSB 0x4 +#define QIB_7220_SerDes_DDSRXEQ0_reg_addr_RMASK 0x3F +#define QIB_7220_SerDes_DDSRXEQ0_element_num_LSB 0x0 +#define QIB_7220_SerDes_DDSRXEQ0_element_num_RMASK 0xF + +#define QIB_7220_LBIntCnt_OFFS 0x13000 + +#define QIB_7220_LBFlowStallCnt_OFFS 0x13008 + +#define QIB_7220_TxSDmaDescCnt_OFFS 0x13010 + +#define QIB_7220_TxUnsupVLErrCnt_OFFS 0x13018 + +#define QIB_7220_TxDataPktCnt_OFFS 0x13020 + +#define QIB_7220_TxFlowPktCnt_OFFS 0x13028 + +#define QIB_7220_TxDwordCnt_OFFS 0x13030 + +#define QIB_7220_TxLenErrCnt_OFFS 0x13038 + +#define QIB_7220_TxMaxMinLenErrCnt_OFFS 0x13040 + +#define QIB_7220_TxUnderrunCnt_OFFS 0x13048 + +#define QIB_7220_TxFlowStallCnt_OFFS 0x13050 + +#define QIB_7220_TxDroppedPktCnt_OFFS 0x13058 + +#define QIB_7220_RxDroppedPktCnt_OFFS 0x13060 + +#define QIB_7220_RxDataPktCnt_OFFS 0x13068 + +#define QIB_7220_RxFlowPktCnt_OFFS 0x13070 + +#define QIB_7220_RxDwordCnt_OFFS 0x13078 + +#define QIB_7220_RxLenErrCnt_OFFS 0x13080 + +#define QIB_7220_RxMaxMinLenErrCnt_OFFS 0x13088 + +#define QIB_7220_RxICRCErrCnt_OFFS 0x13090 + +#define QIB_7220_RxVCRCErrCnt_OFFS 0x13098 + +#define QIB_7220_RxFlowCtrlViolCnt_OFFS 0x130A0 + +#define QIB_7220_RxVersionErrCnt_OFFS 0x130A8 + +#define QIB_7220_RxLinkMalformCnt_OFFS 0x130B0 + +#define QIB_7220_RxEBPCnt_OFFS 0x130B8 + +#define QIB_7220_RxLPCRCErrCnt_OFFS 0x130C0 + +#define QIB_7220_RxBufOvflCnt_OFFS 0x130C8 + +#define QIB_7220_RxTIDFullErrCnt_OFFS 0x130D0 + +#define QIB_7220_RxTIDValidErrCnt_OFFS 0x130D8 + +#define QIB_7220_RxPKeyMismatchCnt_OFFS 0x130E0 + +#define QIB_7220_RxP0HdrEgrOvflCnt_OFFS 0x130E8 + +#define QIB_7220_IBStatusChangeCnt_OFFS 0x13170 + +#define QIB_7220_IBLinkErrRecoveryCnt_OFFS 0x13178 + +#define QIB_7220_IBLinkDownedCnt_OFFS 0x13180 + +#define QIB_7220_IBSymbolErrCnt_OFFS 0x13188 + +#define QIB_7220_RxVL15DroppedPktCnt_OFFS 0x13190 + +#define QIB_7220_RxOtherLocalPhyErrCnt_OFFS 0x13198 + +#define QIB_7220_PcieRetryBufDiagQwordCnt_OFFS 0x131A0 + +#define QIB_7220_ExcessBufferOvflCnt_OFFS 0x131A8 + +#define QIB_7220_LocalLinkIntegrityErrCnt_OFFS 0x131B0 + +#define QIB_7220_RxVlErrCnt_OFFS 0x131B8 + +#define QIB_7220_RxDlidFltrCnt_OFFS 0x131C0 + +#define QIB_7220_CNT_0131C8_OFFS 0x131C8 + +#define QIB_7220_PSStat_OFFS 0x13200 + +#define QIB_7220_PSStart_OFFS 0x13208 + +#define QIB_7220_PSInterval_OFFS 0x13210 + +#define QIB_7220_PSRcvDataCount_OFFS 0x13218 + +#define QIB_7220_PSRcvPktsCount_OFFS 0x13220 + +#define QIB_7220_PSXmitDataCount_OFFS 0x13228 + +#define QIB_7220_PSXmitPktsCount_OFFS 0x13230 + +#define QIB_7220_PSXmitWaitCount_OFFS 0x13238 + +#define QIB_7220_CNT_013240_OFFS 0x13240 + +#define QIB_7220_RcvEgrArray_OFFS 0x14000 + +#define QIB_7220_MEM_038000_OFFS 0x38000 + +#define QIB_7220_RcvTIDArray0_OFFS 0x53000 + +#define QIB_7220_PIOLaunchFIFO_OFFS 0x64000 + +#define QIB_7220_MEM_064480_OFFS 0x64480 + +#define QIB_7220_SendPIOpbcCache_OFFS 0x64800 + +#define QIB_7220_MEM_064C80_OFFS 0x64C80 + +#define QIB_7220_PreLaunchFIFO_OFFS 0x65000 + +#define QIB_7220_MEM_065080_OFFS 0x65080 + +#define QIB_7220_ScoreBoard_OFFS 0x65400 + +#define QIB_7220_MEM_065440_OFFS 0x65440 + +#define QIB_7220_DescriptorFIFO_OFFS 0x65800 + +#define QIB_7220_MEM_065880_OFFS 0x65880 + +#define QIB_7220_RcvBuf1_OFFS 0x72000 + +#define QIB_7220_MEM_074800_OFFS 0x74800 + +#define QIB_7220_RcvBuf2_OFFS 0x75000 + +#define QIB_7220_MEM_076400_OFFS 0x76400 + +#define QIB_7220_RcvFlags_OFFS 0x77000 + +#define QIB_7220_MEM_078400_OFFS 0x78400 + +#define QIB_7220_RcvLookupBuf1_OFFS 0x79000 + +#define QIB_7220_MEM_07A400_OFFS 0x7A400 + +#define QIB_7220_RcvDMADatBuf_OFFS 0x7B000 + +#define QIB_7220_RcvDMAHdrBuf_OFFS 0x7B800 + +#define QIB_7220_MiscRXEIntMem_OFFS 0x7C000 + +#define QIB_7220_MEM_07D400_OFFS 0x7D400 + +#define QIB_7220_PCIERcvBuf_OFFS 0x80000 + +#define QIB_7220_PCIERetryBuf_OFFS 0x84000 + +#define QIB_7220_PCIERcvBufRdToWrAddr_OFFS 0x88000 + +#define QIB_7220_PCIECplBuf_OFFS 0x90000 + +#define QIB_7220_IBSerDesMappTable_OFFS 0x94000 + +#define QIB_7220_MEM_095000_OFFS 0x95000 + +#define QIB_7220_SendBuf0_MA_OFFS 0x100000 + +#define QIB_7220_MEM_1A0000_OFFS 0x1A0000 diff --git a/drivers/infiniband/hw/qib/qib_7322_regs.h b/drivers/infiniband/hw/qib/qib_7322_regs.h new file mode 100644 index 0000000..32dc81f --- /dev/null +++ b/drivers/infiniband/hw/qib/qib_7322_regs.h @@ -0,0 +1,3163 @@ +/* + * Copyright (c) 2008, 2009, 2010 QLogic Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/* This file is mechanically generated from RTL. Any hand-edits will be lost! */ + +#define QIB_7322_Revision_OFFS 0x0 +#define QIB_7322_Revision_DEF 0x0000000002010601 +#define QIB_7322_Revision_R_Simulator_LSB 0x3F +#define QIB_7322_Revision_R_Simulator_MSB 0x3F +#define QIB_7322_Revision_R_Simulator_RMASK 0x1 +#define QIB_7322_Revision_R_Emulation_LSB 0x3E +#define QIB_7322_Revision_R_Emulation_MSB 0x3E +#define QIB_7322_Revision_R_Emulation_RMASK 0x1 +#define QIB_7322_Revision_R_Emulation_Revcode_LSB 0x28 +#define QIB_7322_Revision_R_Emulation_Revcode_MSB 0x3D +#define QIB_7322_Revision_R_Emulation_Revcode_RMASK 0x3FFFFF +#define QIB_7322_Revision_BoardID_LSB 0x20 +#define QIB_7322_Revision_BoardID_MSB 0x27 +#define QIB_7322_Revision_BoardID_RMASK 0xFF +#define QIB_7322_Revision_R_SW_LSB 0x18 +#define QIB_7322_Revision_R_SW_MSB 0x1F +#define QIB_7322_Revision_R_SW_RMASK 0xFF +#define QIB_7322_Revision_R_Arch_LSB 0x10 +#define QIB_7322_Revision_R_Arch_MSB 0x17 +#define QIB_7322_Revision_R_Arch_RMASK 0xFF +#define QIB_7322_Revision_R_ChipRevMajor_LSB 0x8 +#define QIB_7322_Revision_R_ChipRevMajor_MSB 0xF +#define QIB_7322_Revision_R_ChipRevMajor_RMASK 0xFF +#define QIB_7322_Revision_R_ChipRevMinor_LSB 0x0 +#define QIB_7322_Revision_R_ChipRevMinor_MSB 0x7 +#define QIB_7322_Revision_R_ChipRevMinor_RMASK 0xFF + +#define QIB_7322_Control_OFFS 0x8 +#define QIB_7322_Control_DEF 0x0000000000000000 +#define QIB_7322_Control_PCIECplQDiagEn_LSB 0x6 +#define QIB_7322_Control_PCIECplQDiagEn_MSB 0x6 +#define QIB_7322_Control_PCIECplQDiagEn_RMASK 0x1 +#define QIB_7322_Control_PCIEPostQDiagEn_LSB 0x5 +#define QIB_7322_Control_PCIEPostQDiagEn_MSB 0x5 +#define QIB_7322_Control_PCIEPostQDiagEn_RMASK 0x1 +#define QIB_7322_Control_SDmaDescFetchPriorityEn_LSB 0x4 +#define QIB_7322_Control_SDmaDescFetchPriorityEn_MSB 0x4 +#define QIB_7322_Control_SDmaDescFetchPriorityEn_RMASK 0x1 +#define QIB_7322_Control_PCIERetryBufDiagEn_LSB 0x3 +#define QIB_7322_Control_PCIERetryBufDiagEn_MSB 0x3 +#define QIB_7322_Control_PCIERetryBufDiagEn_RMASK 0x1 +#define QIB_7322_Control_FreezeMode_LSB 0x1 +#define QIB_7322_Control_FreezeMode_MSB 0x1 +#define QIB_7322_Control_FreezeMode_RMASK 0x1 +#define QIB_7322_Control_SyncReset_LSB 0x0 +#define QIB_7322_Control_SyncReset_MSB 0x0 +#define QIB_7322_Control_SyncReset_RMASK 0x1 + +#define QIB_7322_PageAlign_OFFS 0x10 +#define QIB_7322_PageAlign_DEF 0x0000000000001000 + +#define QIB_7322_ContextCnt_OFFS 0x18 +#define QIB_7322_ContextCnt_DEF 0x0000000000000012 + +#define QIB_7322_Scratch_OFFS 0x20 +#define QIB_7322_Scratch_DEF 0x0000000000000000 + +#define QIB_7322_CntrRegBase_OFFS 0x28 +#define QIB_7322_CntrRegBase_DEF 0x0000000000011000 + +#define QIB_7322_SendRegBase_OFFS 0x30 +#define QIB_7322_SendRegBase_DEF 0x0000000000003000 + +#define QIB_7322_UserRegBase_OFFS 0x38 +#define QIB_7322_UserRegBase_DEF 0x0000000000200000 + +#define QIB_7322_IntMask_OFFS 0x68 +#define QIB_7322_IntMask_DEF 0x0000000000000000 +#define QIB_7322_IntMask_SDmaIntMask_1_LSB 0x3F +#define QIB_7322_IntMask_SDmaIntMask_1_MSB 0x3F +#define QIB_7322_IntMask_SDmaIntMask_1_RMASK 0x1 +#define QIB_7322_IntMask_SDmaIntMask_0_LSB 0x3E +#define QIB_7322_IntMask_SDmaIntMask_0_MSB 0x3E +#define QIB_7322_IntMask_SDmaIntMask_0_RMASK 0x1 +#define QIB_7322_IntMask_SDmaProgressIntMask_1_LSB 0x3D +#define QIB_7322_IntMask_SDmaProgressIntMask_1_MSB 0x3D +#define QIB_7322_IntMask_SDmaProgressIntMask_1_RMASK 0x1 +#define QIB_7322_IntMask_SDmaProgressIntMask_0_LSB 0x3C +#define QIB_7322_IntMask_SDmaProgressIntMask_0_MSB 0x3C +#define QIB_7322_IntMask_SDmaProgressIntMask_0_RMASK 0x1 +#define QIB_7322_IntMask_SDmaIdleIntMask_1_LSB 0x3B +#define QIB_7322_IntMask_SDmaIdleIntMask_1_MSB 0x3B +#define QIB_7322_IntMask_SDmaIdleIntMask_1_RMASK 0x1 +#define QIB_7322_IntMask_SDmaIdleIntMask_0_LSB 0x3A +#define QIB_7322_IntMask_SDmaIdleIntMask_0_MSB 0x3A +#define QIB_7322_IntMask_SDmaIdleIntMask_0_RMASK 0x1 +#define QIB_7322_IntMask_SDmaCleanupDoneMask_1_LSB 0x39 +#define QIB_7322_IntMask_SDmaCleanupDoneMask_1_MSB 0x39 +#define QIB_7322_IntMask_SDmaCleanupDoneMask_1_RMASK 0x1 +#define QIB_7322_IntMask_SDmaCleanupDoneMask_0_LSB 0x38 +#define QIB_7322_IntMask_SDmaCleanupDoneMask_0_MSB 0x38 +#define QIB_7322_IntMask_SDmaCleanupDoneMask_0_RMASK 0x1 +#define QIB_7322_IntMask_RcvUrg17IntMask_LSB 0x31 +#define QIB_7322_IntMask_RcvUrg17IntMask_MSB 0x31 +#define QIB_7322_IntMask_RcvUrg17IntMask_RMASK 0x1 +#define QIB_7322_IntMask_RcvUrg16IntMask_LSB 0x30 +#define QIB_7322_IntMask_RcvUrg16IntMask_MSB 0x30 +#define QIB_7322_IntMask_RcvUrg16IntMask_RMASK 0x1 +#define QIB_7322_IntMask_RcvUrg15IntMask_LSB 0x2F +#define QIB_7322_IntMask_RcvUrg15IntMask_MSB 0x2F +#define QIB_7322_IntMask_RcvUrg15IntMask_RMASK 0x1 +#define QIB_7322_IntMask_RcvUrg14IntMask_LSB 0x2E +#define QIB_7322_IntMask_RcvUrg14IntMask_MSB 0x2E +#define QIB_7322_IntMask_RcvUrg14IntMask_RMASK 0x1 +#define QIB_7322_IntMask_RcvUrg13IntMask_LSB 0x2D +#define QIB_7322_IntMask_RcvUrg13IntMask_MSB 0x2D +#define QIB_7322_IntMask_RcvUrg13IntMask_RMASK 0x1 +#define QIB_7322_IntMask_RcvUrg12IntMask_LSB 0x2C +#define QIB_7322_IntMask_RcvUrg12IntMask_MSB 0x2C +#define QIB_7322_IntMask_RcvUrg12IntMask_RMASK 0x1 +#define QIB_7322_IntMask_RcvUrg11IntMask_LSB 0x2B +#define QIB_7322_IntMask_RcvUrg11IntMask_MSB 0x2B +#define QIB_7322_IntMask_RcvUrg11IntMask_RMASK 0x1 +#define QIB_7322_IntMask_RcvUrg10IntMask_LSB 0x2A +#define QIB_7322_IntMask_RcvUrg10IntMask_MSB 0x2A +#define QIB_7322_IntMask_RcvUrg10IntMask_RMASK 0x1 +#define QIB_7322_IntMask_RcvUrg9IntMask_LSB 0x29 +#define QIB_7322_IntMask_RcvUrg9IntMask_MSB 0x29 +#define QIB_7322_IntMask_RcvUrg9IntMask_RMASK 0x1 +#define QIB_7322_IntMask_RcvUrg8IntMask_LSB 0x28 +#define QIB_7322_IntMask_RcvUrg8IntMask_MSB 0x28 +#define QIB_7322_IntMask_RcvUrg8IntMask_RMASK 0x1 +#define QIB_7322_IntMask_RcvUrg7IntMask_LSB 0x27 +#define QIB_7322_IntMask_RcvUrg7IntMask_MSB 0x27 +#define QIB_7322_IntMask_RcvUrg7IntMask_RMASK 0x1 +#define QIB_7322_IntMask_RcvUrg6IntMask_LSB 0x26 +#define QIB_7322_IntMask_RcvUrg6IntMask_MSB 0x26 +#define QIB_7322_IntMask_RcvUrg6IntMask_RMASK 0x1 +#define QIB_7322_IntMask_RcvUrg5IntMask_LSB 0x25 +#define QIB_7322_IntMask_RcvUrg5IntMask_MSB 0x25 +#define QIB_7322_IntMask_RcvUrg5IntMask_RMASK 0x1 +#define QIB_7322_IntMask_RcvUrg4IntMask_LSB 0x24 +#define QIB_7322_IntMask_RcvUrg4IntMask_MSB 0x24 +#define QIB_7322_IntMask_RcvUrg4IntMask_RMASK 0x1 +#define QIB_7322_IntMask_RcvUrg3IntMask_LSB 0x23 +#define QIB_7322_IntMask_RcvUrg3IntMask_MSB 0x23 +#define QIB_7322_IntMask_RcvUrg3IntMask_RMASK 0x1 +#define QIB_7322_IntMask_RcvUrg2IntMask_LSB 0x22 +#define QIB_7322_IntMask_RcvUrg2IntMask_MSB 0x22 +#define QIB_7322_IntMask_RcvUrg2IntMask_RMASK 0x1 +#define QIB_7322_IntMask_RcvUrg1IntMask_LSB 0x21 +#define QIB_7322_IntMask_RcvUrg1IntMask_MSB 0x21 +#define QIB_7322_IntMask_RcvUrg1IntMask_RMASK 0x1 +#define QIB_7322_IntMask_RcvUrg0IntMask_LSB 0x20 +#define QIB_7322_IntMask_RcvUrg0IntMask_MSB 0x20 +#define QIB_7322_IntMask_RcvUrg0IntMask_RMASK 0x1 +#define QIB_7322_IntMask_ErrIntMask_1_LSB 0x1F +#define QIB_7322_IntMask_ErrIntMask_1_MSB 0x1F +#define QIB_7322_IntMask_ErrIntMask_1_RMASK 0x1 +#define QIB_7322_IntMask_ErrIntMask_0_LSB 0x1E +#define QIB_7322_IntMask_ErrIntMask_0_MSB 0x1E +#define QIB_7322_IntMask_ErrIntMask_0_RMASK 0x1 +#define QIB_7322_IntMask_ErrIntMask_LSB 0x1D +#define QIB_7322_IntMask_ErrIntMask_MSB 0x1D +#define QIB_7322_IntMask_ErrIntMask_RMASK 0x1 +#define QIB_7322_IntMask_AssertGPIOIntMask_LSB 0x1C +#define QIB_7322_IntMask_AssertGPIOIntMask_MSB 0x1C +#define QIB_7322_IntMask_AssertGPIOIntMask_RMASK 0x1 +#define QIB_7322_IntMask_SendDoneIntMask_1_LSB 0x19 +#define QIB_7322_IntMask_SendDoneIntMask_1_MSB 0x19 +#define QIB_7322_IntMask_SendDoneIntMask_1_RMASK 0x1 +#define QIB_7322_IntMask_SendDoneIntMask_0_LSB 0x18 +#define QIB_7322_IntMask_SendDoneIntMask_0_MSB 0x18 +#define QIB_7322_IntMask_SendDoneIntMask_0_RMASK 0x1 +#define QIB_7322_IntMask_SendBufAvailIntMask_LSB 0x17 +#define QIB_7322_IntMask_SendBufAvailIntMask_MSB 0x17 +#define QIB_7322_IntMask_SendBufAvailIntMask_RMASK 0x1 +#define QIB_7322_IntMask_RcvAvail17IntMask_LSB 0x11 +#define QIB_7322_IntMask_RcvAvail17IntMask_MSB 0x11 +#define QIB_7322_IntMask_RcvAvail17IntMask_RMASK 0x1 +#define QIB_7322_IntMask_RcvAvail16IntMask_LSB 0x10 +#define QIB_7322_IntMask_RcvAvail16IntMask_MSB 0x10 +#define QIB_7322_IntMask_RcvAvail16IntMask_RMASK 0x1 +#define QIB_7322_IntMask_RcvAvail15IntMask_LSB 0xF +#define QIB_7322_IntMask_RcvAvail15IntMask_MSB 0xF +#define QIB_7322_IntMask_RcvAvail15IntMask_RMASK 0x1 +#define QIB_7322_IntMask_RcvAvail14IntMask_LSB 0xE +#define QIB_7322_IntMask_RcvAvail14IntMask_MSB 0xE +#define QIB_7322_IntMask_RcvAvail14IntMask_RMASK 0x1 +#define QIB_7322_IntMask_RcvAvail13IntMask_LSB 0xD +#define QIB_7322_IntMask_RcvAvail13IntMask_MSB 0xD +#define QIB_7322_IntMask_RcvAvail13IntMask_RMASK 0x1 +#define QIB_7322_IntMask_RcvAvail12IntMask_LSB 0xC +#define QIB_7322_IntMask_RcvAvail12IntMask_MSB 0xC +#define QIB_7322_IntMask_RcvAvail12IntMask_RMASK 0x1 +#define QIB_7322_IntMask_RcvAvail11IntMask_LSB 0xB +#define QIB_7322_IntMask_RcvAvail11IntMask_MSB 0xB +#define QIB_7322_IntMask_RcvAvail11IntMask_RMASK 0x1 +#define QIB_7322_IntMask_RcvAvail10IntMask_LSB 0xA +#define QIB_7322_IntMask_RcvAvail10IntMask_MSB 0xA +#define QIB_7322_IntMask_RcvAvail10IntMask_RMASK 0x1 +#define QIB_7322_IntMask_RcvAvail9IntMask_LSB 0x9 +#define QIB_7322_IntMask_RcvAvail9IntMask_MSB 0x9 +#define QIB_7322_IntMask_RcvAvail9IntMask_RMASK 0x1 +#define QIB_7322_IntMask_RcvAvail8IntMask_LSB 0x8 +#define QIB_7322_IntMask_RcvAvail8IntMask_MSB 0x8 +#define QIB_7322_IntMask_RcvAvail8IntMask_RMASK 0x1 +#define QIB_7322_IntMask_RcvAvail7IntMask_LSB 0x7 +#define QIB_7322_IntMask_RcvAvail7IntMask_MSB 0x7 +#define QIB_7322_IntMask_RcvAvail7IntMask_RMASK 0x1 +#define QIB_7322_IntMask_RcvAvail6IntMask_LSB 0x6 +#define QIB_7322_IntMask_RcvAvail6IntMask_MSB 0x6 +#define QIB_7322_IntMask_RcvAvail6IntMask_RMASK 0x1 +#define QIB_7322_IntMask_RcvAvail5IntMask_LSB 0x5 +#define QIB_7322_IntMask_RcvAvail5IntMask_MSB 0x5 +#define QIB_7322_IntMask_RcvAvail5IntMask_RMASK 0x1 +#define QIB_7322_IntMask_RcvAvail4IntMask_LSB 0x4 +#define QIB_7322_IntMask_RcvAvail4IntMask_MSB 0x4 +#define QIB_7322_IntMask_RcvAvail4IntMask_RMASK 0x1 +#define QIB_7322_IntMask_RcvAvail3IntMask_LSB 0x3 +#define QIB_7322_IntMask_RcvAvail3IntMask_MSB 0x3 +#define QIB_7322_IntMask_RcvAvail3IntMask_RMASK 0x1 +#define QIB_7322_IntMask_RcvAvail2IntMask_LSB 0x2 +#define QIB_7322_IntMask_RcvAvail2IntMask_MSB 0x2 +#define QIB_7322_IntMask_RcvAvail2IntMask_RMASK 0x1 +#define QIB_7322_IntMask_RcvAvail1IntMask_LSB 0x1 +#define QIB_7322_IntMask_RcvAvail1IntMask_MSB 0x1 +#define QIB_7322_IntMask_RcvAvail1IntMask_RMASK 0x1 +#define QIB_7322_IntMask_RcvAvail0IntMask_LSB 0x0 +#define QIB_7322_IntMask_RcvAvail0IntMask_MSB 0x0 +#define QIB_7322_IntMask_RcvAvail0IntMask_RMASK 0x1 + +#define QIB_7322_IntStatus_OFFS 0x70 +#define QIB_7322_IntStatus_DEF 0x0000000000000000 +#define QIB_7322_IntStatus_SDmaInt_1_LSB 0x3F +#define QIB_7322_IntStatus_SDmaInt_1_MSB 0x3F +#define QIB_7322_IntStatus_SDmaInt_1_RMASK 0x1 +#define QIB_7322_IntStatus_SDmaInt_0_LSB 0x3E +#define QIB_7322_IntStatus_SDmaInt_0_MSB 0x3E +#define QIB_7322_IntStatus_SDmaInt_0_RMASK 0x1 +#define QIB_7322_IntStatus_SDmaProgressInt_1_LSB 0x3D +#define QIB_7322_IntStatus_SDmaProgressInt_1_MSB 0x3D +#define QIB_7322_IntStatus_SDmaProgressInt_1_RMASK 0x1 +#define QIB_7322_IntStatus_SDmaProgressInt_0_LSB 0x3C +#define QIB_7322_IntStatus_SDmaProgressInt_0_MSB 0x3C +#define QIB_7322_IntStatus_SDmaProgressInt_0_RMASK 0x1 +#define QIB_7322_IntStatus_SDmaIdleInt_1_LSB 0x3B +#define QIB_7322_IntStatus_SDmaIdleInt_1_MSB 0x3B +#define QIB_7322_IntStatus_SDmaIdleInt_1_RMASK 0x1 +#define QIB_7322_IntStatus_SDmaIdleInt_0_LSB 0x3A +#define QIB_7322_IntStatus_SDmaIdleInt_0_MSB 0x3A +#define QIB_7322_IntStatus_SDmaIdleInt_0_RMASK 0x1 +#define QIB_7322_IntStatus_SDmaCleanupDone_1_LSB 0x39 +#define QIB_7322_IntStatus_SDmaCleanupDone_1_MSB 0x39 +#define QIB_7322_IntStatus_SDmaCleanupDone_1_RMASK 0x1 +#define QIB_7322_IntStatus_SDmaCleanupDone_0_LSB 0x38 +#define QIB_7322_IntStatus_SDmaCleanupDone_0_MSB 0x38 +#define QIB_7322_IntStatus_SDmaCleanupDone_0_RMASK 0x1 +#define QIB_7322_IntStatus_RcvUrg17_LSB 0x31 +#define QIB_7322_IntStatus_RcvUrg17_MSB 0x31 +#define QIB_7322_IntStatus_RcvUrg17_RMASK 0x1 +#define QIB_7322_IntStatus_RcvUrg16_LSB 0x30 +#define QIB_7322_IntStatus_RcvUrg16_MSB 0x30 +#define QIB_7322_IntStatus_RcvUrg16_RMASK 0x1 +#define QIB_7322_IntStatus_RcvUrg15_LSB 0x2F +#define QIB_7322_IntStatus_RcvUrg15_MSB 0x2F +#define QIB_7322_IntStatus_RcvUrg15_RMASK 0x1 +#define QIB_7322_IntStatus_RcvUrg14_LSB 0x2E +#define QIB_7322_IntStatus_RcvUrg14_MSB 0x2E +#define QIB_7322_IntStatus_RcvUrg14_RMASK 0x1 +#define QIB_7322_IntStatus_RcvUrg13_LSB 0x2D +#define QIB_7322_IntStatus_RcvUrg13_MSB 0x2D +#define QIB_7322_IntStatus_RcvUrg13_RMASK 0x1 +#define QIB_7322_IntStatus_RcvUrg12_LSB 0x2C +#define QIB_7322_IntStatus_RcvUrg12_MSB 0x2C +#define QIB_7322_IntStatus_RcvUrg12_RMASK 0x1 +#define QIB_7322_IntStatus_RcvUrg11_LSB 0x2B +#define QIB_7322_IntStatus_RcvUrg11_MSB 0x2B +#define QIB_7322_IntStatus_RcvUrg11_RMASK 0x1 +#define QIB_7322_IntStatus_RcvUrg10_LSB 0x2A +#define QIB_7322_IntStatus_RcvUrg10_MSB 0x2A +#define QIB_7322_IntStatus_RcvUrg10_RMASK 0x1 +#define QIB_7322_IntStatus_RcvUrg9_LSB 0x29 +#define QIB_7322_IntStatus_RcvUrg9_MSB 0x29 +#define QIB_7322_IntStatus_RcvUrg9_RMASK 0x1 +#define QIB_7322_IntStatus_RcvUrg8_LSB 0x28 +#define QIB_7322_IntStatus_RcvUrg8_MSB 0x28 +#define QIB_7322_IntStatus_RcvUrg8_RMASK 0x1 +#define QIB_7322_IntStatus_RcvUrg7_LSB 0x27 +#define QIB_7322_IntStatus_RcvUrg7_MSB 0x27 +#define QIB_7322_IntStatus_RcvUrg7_RMASK 0x1 +#define QIB_7322_IntStatus_RcvUrg6_LSB 0x26 +#define QIB_7322_IntStatus_RcvUrg6_MSB 0x26 +#define QIB_7322_IntStatus_RcvUrg6_RMASK 0x1 +#define QIB_7322_IntStatus_RcvUrg5_LSB 0x25 +#define QIB_7322_IntStatus_RcvUrg5_MSB 0x25 +#define QIB_7322_IntStatus_RcvUrg5_RMASK 0x1 +#define QIB_7322_IntStatus_RcvUrg4_LSB 0x24 +#define QIB_7322_IntStatus_RcvUrg4_MSB 0x24 +#define QIB_7322_IntStatus_RcvUrg4_RMASK 0x1 +#define QIB_7322_IntStatus_RcvUrg3_LSB 0x23 +#define QIB_7322_IntStatus_RcvUrg3_MSB 0x23 +#define QIB_7322_IntStatus_RcvUrg3_RMASK 0x1 +#define QIB_7322_IntStatus_RcvUrg2_LSB 0x22 +#define QIB_7322_IntStatus_RcvUrg2_MSB 0x22 +#define QIB_7322_IntStatus_RcvUrg2_RMASK 0x1 +#define QIB_7322_IntStatus_RcvUrg1_LSB 0x21 +#define QIB_7322_IntStatus_RcvUrg1_MSB 0x21 +#define QIB_7322_IntStatus_RcvUrg1_RMASK 0x1 +#define QIB_7322_IntStatus_RcvUrg0_LSB 0x20 +#define QIB_7322_IntStatus_RcvUrg0_MSB 0x20 +#define QIB_7322_IntStatus_RcvUrg0_RMASK 0x1 +#define QIB_7322_IntStatus_Err_1_LSB 0x1F +#define QIB_7322_IntStatus_Err_1_MSB 0x1F +#define QIB_7322_IntStatus_Err_1_RMASK 0x1 +#define QIB_7322_IntStatus_Err_0_LSB 0x1E +#define QIB_7322_IntStatus_Err_0_MSB 0x1E +#define QIB_7322_IntStatus_Err_0_RMASK 0x1 +#define QIB_7322_IntStatus_Err_LSB 0x1D +#define QIB_7322_IntStatus_Err_MSB 0x1D +#define QIB_7322_IntStatus_Err_RMASK 0x1 +#define QIB_7322_IntStatus_AssertGPIO_LSB 0x1C +#define QIB_7322_IntStatus_AssertGPIO_MSB 0x1C +#define QIB_7322_IntStatus_AssertGPIO_RMASK 0x1 +#define QIB_7322_IntStatus_SendDone_1_LSB 0x19 +#define QIB_7322_IntStatus_SendDone_1_MSB 0x19 +#define QIB_7322_IntStatus_SendDone_1_RMASK 0x1 +#define QIB_7322_IntStatus_SendDone_0_LSB 0x18 +#define QIB_7322_IntStatus_SendDone_0_MSB 0x18 +#define QIB_7322_IntStatus_SendDone_0_RMASK 0x1 +#define QIB_7322_IntStatus_SendBufAvail_LSB 0x17 +#define QIB_7322_IntStatus_SendBufAvail_MSB 0x17 +#define QIB_7322_IntStatus_SendBufAvail_RMASK 0x1 +#define QIB_7322_IntStatus_RcvAvail17_LSB 0x11 +#define QIB_7322_IntStatus_RcvAvail17_MSB 0x11 +#define QIB_7322_IntStatus_RcvAvail17_RMASK 0x1 +#define QIB_7322_IntStatus_RcvAvail16_LSB 0x10 +#define QIB_7322_IntStatus_RcvAvail16_MSB 0x10 +#define QIB_7322_IntStatus_RcvAvail16_RMASK 0x1 +#define QIB_7322_IntStatus_RcvAvail15_LSB 0xF +#define QIB_7322_IntStatus_RcvAvail15_MSB 0xF +#define QIB_7322_IntStatus_RcvAvail15_RMASK 0x1 +#define QIB_7322_IntStatus_RcvAvail14_LSB 0xE +#define QIB_7322_IntStatus_RcvAvail14_MSB 0xE +#define QIB_7322_IntStatus_RcvAvail14_RMASK 0x1 +#define QIB_7322_IntStatus_RcvAvail13_LSB 0xD +#define QIB_7322_IntStatus_RcvAvail13_MSB 0xD +#define QIB_7322_IntStatus_RcvAvail13_RMASK 0x1 +#define QIB_7322_IntStatus_RcvAvail12_LSB 0xC +#define QIB_7322_IntStatus_RcvAvail12_MSB 0xC +#define QIB_7322_IntStatus_RcvAvail12_RMASK 0x1 +#define QIB_7322_IntStatus_RcvAvail11_LSB 0xB +#define QIB_7322_IntStatus_RcvAvail11_MSB 0xB +#define QIB_7322_IntStatus_RcvAvail11_RMASK 0x1 +#define QIB_7322_IntStatus_RcvAvail10_LSB 0xA +#define QIB_7322_IntStatus_RcvAvail10_MSB 0xA +#define QIB_7322_IntStatus_RcvAvail10_RMASK 0x1 +#define QIB_7322_IntStatus_RcvAvail9_LSB 0x9 +#define QIB_7322_IntStatus_RcvAvail9_MSB 0x9 +#define QIB_7322_IntStatus_RcvAvail9_RMASK 0x1 +#define QIB_7322_IntStatus_RcvAvail8_LSB 0x8 +#define QIB_7322_IntStatus_RcvAvail8_MSB 0x8 +#define QIB_7322_IntStatus_RcvAvail8_RMASK 0x1 +#define QIB_7322_IntStatus_RcvAvail7_LSB 0x7 +#define QIB_7322_IntStatus_RcvAvail7_MSB 0x7 +#define QIB_7322_IntStatus_RcvAvail7_RMASK 0x1 +#define QIB_7322_IntStatus_RcvAvail6_LSB 0x6 +#define QIB_7322_IntStatus_RcvAvail6_MSB 0x6 +#define QIB_7322_IntStatus_RcvAvail6_RMASK 0x1 +#define QIB_7322_IntStatus_RcvAvail5_LSB 0x5 +#define QIB_7322_IntStatus_RcvAvail5_MSB 0x5 +#define QIB_7322_IntStatus_RcvAvail5_RMASK 0x1 +#define QIB_7322_IntStatus_RcvAvail4_LSB 0x4 +#define QIB_7322_IntStatus_RcvAvail4_MSB 0x4 +#define QIB_7322_IntStatus_RcvAvail4_RMASK 0x1 +#define QIB_7322_IntStatus_RcvAvail3_LSB 0x3 +#define QIB_7322_IntStatus_RcvAvail3_MSB 0x3 +#define QIB_7322_IntStatus_RcvAvail3_RMASK 0x1 +#define QIB_7322_IntStatus_RcvAvail2_LSB 0x2 +#define QIB_7322_IntStatus_RcvAvail2_MSB 0x2 +#define QIB_7322_IntStatus_RcvAvail2_RMASK 0x1 +#define QIB_7322_IntStatus_RcvAvail1_LSB 0x1 +#define QIB_7322_IntStatus_RcvAvail1_MSB 0x1 +#define QIB_7322_IntStatus_RcvAvail1_RMASK 0x1 +#define QIB_7322_IntStatus_RcvAvail0_LSB 0x0 +#define QIB_7322_IntStatus_RcvAvail0_MSB 0x0 +#define QIB_7322_IntStatus_RcvAvail0_RMASK 0x1 + +#define QIB_7322_IntClear_OFFS 0x78 +#define QIB_7322_IntClear_DEF 0x0000000000000000 +#define QIB_7322_IntClear_SDmaIntClear_1_LSB 0x3F +#define QIB_7322_IntClear_SDmaIntClear_1_MSB 0x3F +#define QIB_7322_IntClear_SDmaIntClear_1_RMASK 0x1 +#define QIB_7322_IntClear_SDmaIntClear_0_LSB 0x3E +#define QIB_7322_IntClear_SDmaIntClear_0_MSB 0x3E +#define QIB_7322_IntClear_SDmaIntClear_0_RMASK 0x1 +#define QIB_7322_IntClear_SDmaProgressIntClear_1_LSB 0x3D +#define QIB_7322_IntClear_SDmaProgressIntClear_1_MSB 0x3D +#define QIB_7322_IntClear_SDmaProgressIntClear_1_RMASK 0x1 +#define QIB_7322_IntClear_SDmaProgressIntClear_0_LSB 0x3C +#define QIB_7322_IntClear_SDmaProgressIntClear_0_MSB 0x3C +#define QIB_7322_IntClear_SDmaProgressIntClear_0_RMASK 0x1 +#define QIB_7322_IntClear_SDmaIdleIntClear_1_LSB 0x3B +#define QIB_7322_IntClear_SDmaIdleIntClear_1_MSB 0x3B +#define QIB_7322_IntClear_SDmaIdleIntClear_1_RMASK 0x1 +#define QIB_7322_IntClear_SDmaIdleIntClear_0_LSB 0x3A +#define QIB_7322_IntClear_SDmaIdleIntClear_0_MSB 0x3A +#define QIB_7322_IntClear_SDmaIdleIntClear_0_RMASK 0x1 +#define QIB_7322_IntClear_SDmaCleanupDoneClear_1_LSB 0x39 +#define QIB_7322_IntClear_SDmaCleanupDoneClear_1_MSB 0x39 +#define QIB_7322_IntClear_SDmaCleanupDoneClear_1_RMASK 0x1 +#define QIB_7322_IntClear_SDmaCleanupDoneClear_0_LSB 0x38 +#define QIB_7322_IntClear_SDmaCleanupDoneClear_0_MSB 0x38 +#define QIB_7322_IntClear_SDmaCleanupDoneClear_0_RMASK 0x1 +#define QIB_7322_IntClear_RcvUrg17IntClear_LSB 0x31 +#define QIB_7322_IntClear_RcvUrg17IntClear_MSB 0x31 +#define QIB_7322_IntClear_RcvUrg17IntClear_RMASK 0x1 +#define QIB_7322_IntClear_RcvUrg16IntClear_LSB 0x30 +#define QIB_7322_IntClear_RcvUrg16IntClear_MSB 0x30 +#define QIB_7322_IntClear_RcvUrg16IntClear_RMASK 0x1 +#define QIB_7322_IntClear_RcvUrg15IntClear_LSB 0x2F +#define QIB_7322_IntClear_RcvUrg15IntClear_MSB 0x2F +#define QIB_7322_IntClear_RcvUrg15IntClear_RMASK 0x1 +#define QIB_7322_IntClear_RcvUrg14IntClear_LSB 0x2E +#define QIB_7322_IntClear_RcvUrg14IntClear_MSB 0x2E +#define QIB_7322_IntClear_RcvUrg14IntClear_RMASK 0x1 +#define QIB_7322_IntClear_RcvUrg13IntClear_LSB 0x2D +#define QIB_7322_IntClear_RcvUrg13IntClear_MSB 0x2D +#define QIB_7322_IntClear_RcvUrg13IntClear_RMASK 0x1 +#define QIB_7322_IntClear_RcvUrg12IntClear_LSB 0x2C +#define QIB_7322_IntClear_RcvUrg12IntClear_MSB 0x2C +#define QIB_7322_IntClear_RcvUrg12IntClear_RMASK 0x1 +#define QIB_7322_IntClear_RcvUrg11IntClear_LSB 0x2B +#define QIB_7322_IntClear_RcvUrg11IntClear_MSB 0x2B +#define QIB_7322_IntClear_RcvUrg11IntClear_RMASK 0x1 +#define QIB_7322_IntClear_RcvUrg10IntClear_LSB 0x2A +#define QIB_7322_IntClear_RcvUrg10IntClear_MSB 0x2A +#define QIB_7322_IntClear_RcvUrg10IntClear_RMASK 0x1 +#define QIB_7322_IntClear_RcvUrg9IntClear_LSB 0x29 +#define QIB_7322_IntClear_RcvUrg9IntClear_MSB 0x29 +#define QIB_7322_IntClear_RcvUrg9IntClear_RMASK 0x1 +#define QIB_7322_IntClear_RcvUrg8IntClear_LSB 0x28 +#define QIB_7322_IntClear_RcvUrg8IntClear_MSB 0x28 +#define QIB_7322_IntClear_RcvUrg8IntClear_RMASK 0x1 +#define QIB_7322_IntClear_RcvUrg7IntClear_LSB 0x27 +#define QIB_7322_IntClear_RcvUrg7IntClear_MSB 0x27 +#define QIB_7322_IntClear_RcvUrg7IntClear_RMASK 0x1 +#define QIB_7322_IntClear_RcvUrg6IntClear_LSB 0x26 +#define QIB_7322_IntClear_RcvUrg6IntClear_MSB 0x26 +#define QIB_7322_IntClear_RcvUrg6IntClear_RMASK 0x1 +#define QIB_7322_IntClear_RcvUrg5IntClear_LSB 0x25 +#define QIB_7322_IntClear_RcvUrg5IntClear_MSB 0x25 +#define QIB_7322_IntClear_RcvUrg5IntClear_RMASK 0x1 +#define QIB_7322_IntClear_RcvUrg4IntClear_LSB 0x24 +#define QIB_7322_IntClear_RcvUrg4IntClear_MSB 0x24 +#define QIB_7322_IntClear_RcvUrg4IntClear_RMASK 0x1 +#define QIB_7322_IntClear_RcvUrg3IntClear_LSB 0x23 +#define QIB_7322_IntClear_RcvUrg3IntClear_MSB 0x23 +#define QIB_7322_IntClear_RcvUrg3IntClear_RMASK 0x1 +#define QIB_7322_IntClear_RcvUrg2IntClear_LSB 0x22 +#define QIB_7322_IntClear_RcvUrg2IntClear_MSB 0x22 +#define QIB_7322_IntClear_RcvUrg2IntClear_RMASK 0x1 +#define QIB_7322_IntClear_RcvUrg1IntClear_LSB 0x21 +#define QIB_7322_IntClear_RcvUrg1IntClear_MSB 0x21 +#define QIB_7322_IntClear_RcvUrg1IntClear_RMASK 0x1 +#define QIB_7322_IntClear_RcvUrg0IntClear_LSB 0x20 +#define QIB_7322_IntClear_RcvUrg0IntClear_MSB 0x20 +#define QIB_7322_IntClear_RcvUrg0IntClear_RMASK 0x1 +#define QIB_7322_IntClear_ErrIntClear_1_LSB 0x1F +#define QIB_7322_IntClear_ErrIntClear_1_MSB 0x1F +#define QIB_7322_IntClear_ErrIntClear_1_RMASK 0x1 +#define QIB_7322_IntClear_ErrIntClear_0_LSB 0x1E +#define QIB_7322_IntClear_ErrIntClear_0_MSB 0x1E +#define QIB_7322_IntClear_ErrIntClear_0_RMASK 0x1 +#define QIB_7322_IntClear_ErrIntClear_LSB 0x1D +#define QIB_7322_IntClear_ErrIntClear_MSB 0x1D +#define QIB_7322_IntClear_ErrIntClear_RMASK 0x1 +#define QIB_7322_IntClear_AssertGPIOIntClear_LSB 0x1C +#define QIB_7322_IntClear_AssertGPIOIntClear_MSB 0x1C +#define QIB_7322_IntClear_AssertGPIOIntClear_RMASK 0x1 +#define QIB_7322_IntClear_SendDoneIntClear_1_LSB 0x19 +#define QIB_7322_IntClear_SendDoneIntClear_1_MSB 0x19 +#define QIB_7322_IntClear_SendDoneIntClear_1_RMASK 0x1 +#define QIB_7322_IntClear_SendDoneIntClear_0_LSB 0x18 +#define QIB_7322_IntClear_SendDoneIntClear_0_MSB 0x18 +#define QIB_7322_IntClear_SendDoneIntClear_0_RMASK 0x1 +#define QIB_7322_IntClear_SendBufAvailIntClear_LSB 0x17 +#define QIB_7322_IntClear_SendBufAvailIntClear_MSB 0x17 +#define QIB_7322_IntClear_SendBufAvailIntClear_RMASK 0x1 +#define QIB_7322_IntClear_RcvAvail17IntClear_LSB 0x11 +#define QIB_7322_IntClear_RcvAvail17IntClear_MSB 0x11 +#define QIB_7322_IntClear_RcvAvail17IntClear_RMASK 0x1 +#define QIB_7322_IntClear_RcvAvail16IntClear_LSB 0x10 +#define QIB_7322_IntClear_RcvAvail16IntClear_MSB 0x10 +#define QIB_7322_IntClear_RcvAvail16IntClear_RMASK 0x1 +#define QIB_7322_IntClear_RcvAvail15IntClear_LSB 0xF +#define QIB_7322_IntClear_RcvAvail15IntClear_MSB 0xF +#define QIB_7322_IntClear_RcvAvail15IntClear_RMASK 0x1 +#define QIB_7322_IntClear_RcvAvail14IntClear_LSB 0xE +#define QIB_7322_IntClear_RcvAvail14IntClear_MSB 0xE +#define QIB_7322_IntClear_RcvAvail14IntClear_RMASK 0x1 +#define QIB_7322_IntClear_RcvAvail13IntClear_LSB 0xD +#define QIB_7322_IntClear_RcvAvail13IntClear_MSB 0xD +#define QIB_7322_IntClear_RcvAvail13IntClear_RMASK 0x1 +#define QIB_7322_IntClear_RcvAvail12IntClear_LSB 0xC +#define QIB_7322_IntClear_RcvAvail12IntClear_MSB 0xC +#define QIB_7322_IntClear_RcvAvail12IntClear_RMASK 0x1 +#define QIB_7322_IntClear_RcvAvail11IntClear_LSB 0xB +#define QIB_7322_IntClear_RcvAvail11IntClear_MSB 0xB +#define QIB_7322_IntClear_RcvAvail11IntClear_RMASK 0x1 +#define QIB_7322_IntClear_RcvAvail10IntClear_LSB 0xA +#define QIB_7322_IntClear_RcvAvail10IntClear_MSB 0xA +#define QIB_7322_IntClear_RcvAvail10IntClear_RMASK 0x1 +#define QIB_7322_IntClear_RcvAvail9IntClear_LSB 0x9 +#define QIB_7322_IntClear_RcvAvail9IntClear_MSB 0x9 +#define QIB_7322_IntClear_RcvAvail9IntClear_RMASK 0x1 +#define QIB_7322_IntClear_RcvAvail8IntClear_LSB 0x8 +#define QIB_7322_IntClear_RcvAvail8IntClear_MSB 0x8 +#define QIB_7322_IntClear_RcvAvail8IntClear_RMASK 0x1 +#define QIB_7322_IntClear_RcvAvail7IntClear_LSB 0x7 +#define QIB_7322_IntClear_RcvAvail7IntClear_MSB 0x7 +#define QIB_7322_IntClear_RcvAvail7IntClear_RMASK 0x1 +#define QIB_7322_IntClear_RcvAvail6IntClear_LSB 0x6 +#define QIB_7322_IntClear_RcvAvail6IntClear_MSB 0x6 +#define QIB_7322_IntClear_RcvAvail6IntClear_RMASK 0x1 +#define QIB_7322_IntClear_RcvAvail5IntClear_LSB 0x5 +#define QIB_7322_IntClear_RcvAvail5IntClear_MSB 0x5 +#define QIB_7322_IntClear_RcvAvail5IntClear_RMASK 0x1 +#define QIB_7322_IntClear_RcvAvail4IntClear_LSB 0x4 +#define QIB_7322_IntClear_RcvAvail4IntClear_MSB 0x4 +#define QIB_7322_IntClear_RcvAvail4IntClear_RMASK 0x1 +#define QIB_7322_IntClear_RcvAvail3IntClear_LSB 0x3 +#define QIB_7322_IntClear_RcvAvail3IntClear_MSB 0x3 +#define QIB_7322_IntClear_RcvAvail3IntClear_RMASK 0x1 +#define QIB_7322_IntClear_RcvAvail2IntClear_LSB 0x2 +#define QIB_7322_IntClear_RcvAvail2IntClear_MSB 0x2 +#define QIB_7322_IntClear_RcvAvail2IntClear_RMASK 0x1 +#define QIB_7322_IntClear_RcvAvail1IntClear_LSB 0x1 +#define QIB_7322_IntClear_RcvAvail1IntClear_MSB 0x1 +#define QIB_7322_IntClear_RcvAvail1IntClear_RMASK 0x1 +#define QIB_7322_IntClear_RcvAvail0IntClear_LSB 0x0 +#define QIB_7322_IntClear_RcvAvail0IntClear_MSB 0x0 +#define QIB_7322_IntClear_RcvAvail0IntClear_RMASK 0x1 + +#define QIB_7322_ErrMask_OFFS 0x80 +#define QIB_7322_ErrMask_DEF 0x0000000000000000 +#define QIB_7322_ErrMask_ResetNegatedMask_LSB 0x3F +#define QIB_7322_ErrMask_ResetNegatedMask_MSB 0x3F +#define QIB_7322_ErrMask_ResetNegatedMask_RMASK 0x1 +#define QIB_7322_ErrMask_HardwareErrMask_LSB 0x3E +#define QIB_7322_ErrMask_HardwareErrMask_MSB 0x3E +#define QIB_7322_ErrMask_HardwareErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_InvalidAddrErrMask_LSB 0x3D +#define QIB_7322_ErrMask_InvalidAddrErrMask_MSB 0x3D +#define QIB_7322_ErrMask_InvalidAddrErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_SDmaVL15ErrMask_LSB 0x38 +#define QIB_7322_ErrMask_SDmaVL15ErrMask_MSB 0x38 +#define QIB_7322_ErrMask_SDmaVL15ErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_SBufVL15MisUseErrMask_LSB 0x37 +#define QIB_7322_ErrMask_SBufVL15MisUseErrMask_MSB 0x37 +#define QIB_7322_ErrMask_SBufVL15MisUseErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_InvalidEEPCmdMask_LSB 0x35 +#define QIB_7322_ErrMask_InvalidEEPCmdMask_MSB 0x35 +#define QIB_7322_ErrMask_InvalidEEPCmdMask_RMASK 0x1 +#define QIB_7322_ErrMask_RcvContextShareErrMask_LSB 0x34 +#define QIB_7322_ErrMask_RcvContextShareErrMask_MSB 0x34 +#define QIB_7322_ErrMask_RcvContextShareErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_SendVLMismatchErrMask_LSB 0x24 +#define QIB_7322_ErrMask_SendVLMismatchErrMask_MSB 0x24 +#define QIB_7322_ErrMask_SendVLMismatchErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_SendArmLaunchErrMask_LSB 0x23 +#define QIB_7322_ErrMask_SendArmLaunchErrMask_MSB 0x23 +#define QIB_7322_ErrMask_SendArmLaunchErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_SendSpecialTriggerErrMask_LSB 0x1B +#define QIB_7322_ErrMask_SendSpecialTriggerErrMask_MSB 0x1B +#define QIB_7322_ErrMask_SendSpecialTriggerErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_SDmaWrongPortErrMask_LSB 0x1A +#define QIB_7322_ErrMask_SDmaWrongPortErrMask_MSB 0x1A +#define QIB_7322_ErrMask_SDmaWrongPortErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_SDmaBufMaskDuplicateErrMask_LSB 0x19 +#define QIB_7322_ErrMask_SDmaBufMaskDuplicateErrMask_MSB 0x19 +#define QIB_7322_ErrMask_SDmaBufMaskDuplicateErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_RcvHdrFullErrMask_LSB 0xD +#define QIB_7322_ErrMask_RcvHdrFullErrMask_MSB 0xD +#define QIB_7322_ErrMask_RcvHdrFullErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_RcvEgrFullErrMask_LSB 0xC +#define QIB_7322_ErrMask_RcvEgrFullErrMask_MSB 0xC +#define QIB_7322_ErrMask_RcvEgrFullErrMask_RMASK 0x1 + +#define QIB_7322_ErrStatus_OFFS 0x88 +#define QIB_7322_ErrStatus_DEF 0x0000000000000000 +#define QIB_7322_ErrStatus_ResetNegated_LSB 0x3F +#define QIB_7322_ErrStatus_ResetNegated_MSB 0x3F +#define QIB_7322_ErrStatus_ResetNegated_RMASK 0x1 +#define QIB_7322_ErrStatus_HardwareErr_LSB 0x3E +#define QIB_7322_ErrStatus_HardwareErr_MSB 0x3E +#define QIB_7322_ErrStatus_HardwareErr_RMASK 0x1 +#define QIB_7322_ErrStatus_InvalidAddrErr_LSB 0x3D +#define QIB_7322_ErrStatus_InvalidAddrErr_MSB 0x3D +#define QIB_7322_ErrStatus_InvalidAddrErr_RMASK 0x1 +#define QIB_7322_ErrStatus_SDmaVL15Err_LSB 0x38 +#define QIB_7322_ErrStatus_SDmaVL15Err_MSB 0x38 +#define QIB_7322_ErrStatus_SDmaVL15Err_RMASK 0x1 +#define QIB_7322_ErrStatus_SBufVL15MisUseErr_LSB 0x37 +#define QIB_7322_ErrStatus_SBufVL15MisUseErr_MSB 0x37 +#define QIB_7322_ErrStatus_SBufVL15MisUseErr_RMASK 0x1 +#define QIB_7322_ErrStatus_InvalidEEPCmdErr_LSB 0x35 +#define QIB_7322_ErrStatus_InvalidEEPCmdErr_MSB 0x35 +#define QIB_7322_ErrStatus_InvalidEEPCmdErr_RMASK 0x1 +#define QIB_7322_ErrStatus_RcvContextShareErr_LSB 0x34 +#define QIB_7322_ErrStatus_RcvContextShareErr_MSB 0x34 +#define QIB_7322_ErrStatus_RcvContextShareErr_RMASK 0x1 +#define QIB_7322_ErrStatus_SendVLMismatchErr_LSB 0x24 +#define QIB_7322_ErrStatus_SendVLMismatchErr_MSB 0x24 +#define QIB_7322_ErrStatus_SendVLMismatchErr_RMASK 0x1 +#define QIB_7322_ErrStatus_SendArmLaunchErr_LSB 0x23 +#define QIB_7322_ErrStatus_SendArmLaunchErr_MSB 0x23 +#define QIB_7322_ErrStatus_SendArmLaunchErr_RMASK 0x1 +#define QIB_7322_ErrStatus_SendSpecialTriggerErr_LSB 0x1B +#define QIB_7322_ErrStatus_SendSpecialTriggerErr_MSB 0x1B +#define QIB_7322_ErrStatus_SendSpecialTriggerErr_RMASK 0x1 +#define QIB_7322_ErrStatus_SDmaWrongPortErr_LSB 0x1A +#define QIB_7322_ErrStatus_SDmaWrongPortErr_MSB 0x1A +#define QIB_7322_ErrStatus_SDmaWrongPortErr_RMASK 0x1 +#define QIB_7322_ErrStatus_SDmaBufMaskDuplicateErr_LSB 0x19 +#define QIB_7322_ErrStatus_SDmaBufMaskDuplicateErr_MSB 0x19 +#define QIB_7322_ErrStatus_SDmaBufMaskDuplicateErr_RMASK 0x1 +#define QIB_7322_ErrStatus_RcvHdrFullErr_LSB 0xD +#define QIB_7322_ErrStatus_RcvHdrFullErr_MSB 0xD +#define QIB_7322_ErrStatus_RcvHdrFullErr_RMASK 0x1 +#define QIB_7322_ErrStatus_RcvEgrFullErr_LSB 0xC +#define QIB_7322_ErrStatus_RcvEgrFullErr_MSB 0xC +#define QIB_7322_ErrStatus_RcvEgrFullErr_RMASK 0x1 + +#define QIB_7322_ErrClear_OFFS 0x90 +#define QIB_7322_ErrClear_DEF 0x0000000000000000 +#define QIB_7322_ErrClear_ResetNegatedClear_LSB 0x3F +#define QIB_7322_ErrClear_ResetNegatedClear_MSB 0x3F +#define QIB_7322_ErrClear_ResetNegatedClear_RMASK 0x1 +#define QIB_7322_ErrClear_HardwareErrClear_LSB 0x3E +#define QIB_7322_ErrClear_HardwareErrClear_MSB 0x3E +#define QIB_7322_ErrClear_HardwareErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_InvalidAddrErrClear_LSB 0x3D +#define QIB_7322_ErrClear_InvalidAddrErrClear_MSB 0x3D +#define QIB_7322_ErrClear_InvalidAddrErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_SDmaVL15ErrClear_LSB 0x38 +#define QIB_7322_ErrClear_SDmaVL15ErrClear_MSB 0x38 +#define QIB_7322_ErrClear_SDmaVL15ErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_SBufVL15MisUseErrClear_LSB 0x37 +#define QIB_7322_ErrClear_SBufVL15MisUseErrClear_MSB 0x37 +#define QIB_7322_ErrClear_SBufVL15MisUseErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_InvalidEEPCmdErrClear_LSB 0x35 +#define QIB_7322_ErrClear_InvalidEEPCmdErrClear_MSB 0x35 +#define QIB_7322_ErrClear_InvalidEEPCmdErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_RcvContextShareErrClear_LSB 0x34 +#define QIB_7322_ErrClear_RcvContextShareErrClear_MSB 0x34 +#define QIB_7322_ErrClear_RcvContextShareErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_SendVLMismatchErrMask_LSB 0x24 +#define QIB_7322_ErrClear_SendVLMismatchErrMask_MSB 0x24 +#define QIB_7322_ErrClear_SendVLMismatchErrMask_RMASK 0x1 +#define QIB_7322_ErrClear_SendArmLaunchErrClear_LSB 0x23 +#define QIB_7322_ErrClear_SendArmLaunchErrClear_MSB 0x23 +#define QIB_7322_ErrClear_SendArmLaunchErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_SendSpecialTriggerErrClear_LSB 0x1B +#define QIB_7322_ErrClear_SendSpecialTriggerErrClear_MSB 0x1B +#define QIB_7322_ErrClear_SendSpecialTriggerErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_SDmaWrongPortErrClear_LSB 0x1A +#define QIB_7322_ErrClear_SDmaWrongPortErrClear_MSB 0x1A +#define QIB_7322_ErrClear_SDmaWrongPortErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_SDmaBufMaskDuplicateErrClear_LSB 0x19 +#define QIB_7322_ErrClear_SDmaBufMaskDuplicateErrClear_MSB 0x19 +#define QIB_7322_ErrClear_SDmaBufMaskDuplicateErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_RcvHdrFullErrClear_LSB 0xD +#define QIB_7322_ErrClear_RcvHdrFullErrClear_MSB 0xD +#define QIB_7322_ErrClear_RcvHdrFullErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_RcvEgrFullErrClear_LSB 0xC +#define QIB_7322_ErrClear_RcvEgrFullErrClear_MSB 0xC +#define QIB_7322_ErrClear_RcvEgrFullErrClear_RMASK 0x1 + +#define QIB_7322_HwErrMask_OFFS 0x98 +#define QIB_7322_HwErrMask_DEF 0x0000000000000000 +#define QIB_7322_HwErrMask_IBSerdesPClkNotDetectMask_1_LSB 0x3F +#define QIB_7322_HwErrMask_IBSerdesPClkNotDetectMask_1_MSB 0x3F +#define QIB_7322_HwErrMask_IBSerdesPClkNotDetectMask_1_RMASK 0x1 +#define QIB_7322_HwErrMask_IBSerdesPClkNotDetectMask_0_LSB 0x3E +#define QIB_7322_HwErrMask_IBSerdesPClkNotDetectMask_0_MSB 0x3E +#define QIB_7322_HwErrMask_IBSerdesPClkNotDetectMask_0_RMASK 0x1 +#define QIB_7322_HwErrMask_PCIESerdesPClkNotDetectMask_LSB 0x37 +#define QIB_7322_HwErrMask_PCIESerdesPClkNotDetectMask_MSB 0x37 +#define QIB_7322_HwErrMask_PCIESerdesPClkNotDetectMask_RMASK 0x1 +#define QIB_7322_HwErrMask_PowerOnBISTFailedMask_LSB 0x36 +#define QIB_7322_HwErrMask_PowerOnBISTFailedMask_MSB 0x36 +#define QIB_7322_HwErrMask_PowerOnBISTFailedMask_RMASK 0x1 +#define QIB_7322_HwErrMask_TempsenseTholdReachedMask_LSB 0x35 +#define QIB_7322_HwErrMask_TempsenseTholdReachedMask_MSB 0x35 +#define QIB_7322_HwErrMask_TempsenseTholdReachedMask_RMASK 0x1 +#define QIB_7322_HwErrMask_MemoryErrMask_LSB 0x30 +#define QIB_7322_HwErrMask_MemoryErrMask_MSB 0x30 +#define QIB_7322_HwErrMask_MemoryErrMask_RMASK 0x1 +#define QIB_7322_HwErrMask_pcie_phy_txParityErr_LSB 0x22 +#define QIB_7322_HwErrMask_pcie_phy_txParityErr_MSB 0x22 +#define QIB_7322_HwErrMask_pcie_phy_txParityErr_RMASK 0x1 +#define QIB_7322_HwErrMask_PCIeBusParityErrMask_LSB 0x1F +#define QIB_7322_HwErrMask_PCIeBusParityErrMask_MSB 0x21 +#define QIB_7322_HwErrMask_PCIeBusParityErrMask_RMASK 0x7 +#define QIB_7322_HwErrMask_PcieCplTimeoutMask_LSB 0x1E +#define QIB_7322_HwErrMask_PcieCplTimeoutMask_MSB 0x1E +#define QIB_7322_HwErrMask_PcieCplTimeoutMask_RMASK 0x1 +#define QIB_7322_HwErrMask_PciePoisonedTLPMask_LSB 0x1D +#define QIB_7322_HwErrMask_PciePoisonedTLPMask_MSB 0x1D +#define QIB_7322_HwErrMask_PciePoisonedTLPMask_RMASK 0x1 +#define QIB_7322_HwErrMask_SDmaMemReadErrMask_1_LSB 0x1C +#define QIB_7322_HwErrMask_SDmaMemReadErrMask_1_MSB 0x1C +#define QIB_7322_HwErrMask_SDmaMemReadErrMask_1_RMASK 0x1 +#define QIB_7322_HwErrMask_SDmaMemReadErrMask_0_LSB 0x1B +#define QIB_7322_HwErrMask_SDmaMemReadErrMask_0_MSB 0x1B +#define QIB_7322_HwErrMask_SDmaMemReadErrMask_0_RMASK 0x1 +#define QIB_7322_HwErrMask_IBCBusFromSPCParityErrMask_1_LSB 0xF +#define QIB_7322_HwErrMask_IBCBusFromSPCParityErrMask_1_MSB 0xF +#define QIB_7322_HwErrMask_IBCBusFromSPCParityErrMask_1_RMASK 0x1 +#define QIB_7322_HwErrMask_IBCBusToSPCParityErrMask_1_LSB 0xE +#define QIB_7322_HwErrMask_IBCBusToSPCParityErrMask_1_MSB 0xE +#define QIB_7322_HwErrMask_IBCBusToSPCParityErrMask_1_RMASK 0x1 +#define QIB_7322_HwErrMask_IBCBusFromSPCParityErrMask_0_LSB 0xD +#define QIB_7322_HwErrMask_IBCBusFromSPCParityErrMask_0_MSB 0xD +#define QIB_7322_HwErrMask_IBCBusFromSPCParityErrMask_0_RMASK 0x1 +#define QIB_7322_HwErrMask_statusValidNoEopMask_LSB 0xC +#define QIB_7322_HwErrMask_statusValidNoEopMask_MSB 0xC +#define QIB_7322_HwErrMask_statusValidNoEopMask_RMASK 0x1 +#define QIB_7322_HwErrMask_LATriggeredMask_LSB 0xB +#define QIB_7322_HwErrMask_LATriggeredMask_MSB 0xB +#define QIB_7322_HwErrMask_LATriggeredMask_RMASK 0x1 + +#define QIB_7322_HwErrStatus_OFFS 0xA0 +#define QIB_7322_HwErrStatus_DEF 0x0000000000000000 +#define QIB_7322_HwErrStatus_IBSerdesPClkNotDetect_1_LSB 0x3F +#define QIB_7322_HwErrStatus_IBSerdesPClkNotDetect_1_MSB 0x3F +#define QIB_7322_HwErrStatus_IBSerdesPClkNotDetect_1_RMASK 0x1 +#define QIB_7322_HwErrStatus_IBSerdesPClkNotDetect_0_LSB 0x3E +#define QIB_7322_HwErrStatus_IBSerdesPClkNotDetect_0_MSB 0x3E +#define QIB_7322_HwErrStatus_IBSerdesPClkNotDetect_0_RMASK 0x1 +#define QIB_7322_HwErrStatus_PCIESerdesPClkNotDetect_LSB 0x37 +#define QIB_7322_HwErrStatus_PCIESerdesPClkNotDetect_MSB 0x37 +#define QIB_7322_HwErrStatus_PCIESerdesPClkNotDetect_RMASK 0x1 +#define QIB_7322_HwErrStatus_PowerOnBISTFailed_LSB 0x36 +#define QIB_7322_HwErrStatus_PowerOnBISTFailed_MSB 0x36 +#define QIB_7322_HwErrStatus_PowerOnBISTFailed_RMASK 0x1 +#define QIB_7322_HwErrStatus_TempsenseTholdReached_LSB 0x35 +#define QIB_7322_HwErrStatus_TempsenseTholdReached_MSB 0x35 +#define QIB_7322_HwErrStatus_TempsenseTholdReached_RMASK 0x1 +#define QIB_7322_HwErrStatus_MemoryErr_LSB 0x30 +#define QIB_7322_HwErrStatus_MemoryErr_MSB 0x30 +#define QIB_7322_HwErrStatus_MemoryErr_RMASK 0x1 +#define QIB_7322_HwErrStatus_pcie_phy_txParityErr_LSB 0x22 +#define QIB_7322_HwErrStatus_pcie_phy_txParityErr_MSB 0x22 +#define QIB_7322_HwErrStatus_pcie_phy_txParityErr_RMASK 0x1 +#define QIB_7322_HwErrStatus_PCIeBusParity_LSB 0x1F +#define QIB_7322_HwErrStatus_PCIeBusParity_MSB 0x21 +#define QIB_7322_HwErrStatus_PCIeBusParity_RMASK 0x7 +#define QIB_7322_HwErrStatus_PcieCplTimeout_LSB 0x1E +#define QIB_7322_HwErrStatus_PcieCplTimeout_MSB 0x1E +#define QIB_7322_HwErrStatus_PcieCplTimeout_RMASK 0x1 +#define QIB_7322_HwErrStatus_PciePoisonedTLP_LSB 0x1D +#define QIB_7322_HwErrStatus_PciePoisonedTLP_MSB 0x1D +#define QIB_7322_HwErrStatus_PciePoisonedTLP_RMASK 0x1 +#define QIB_7322_HwErrStatus_SDmaMemReadErr_1_LSB 0x1C +#define QIB_7322_HwErrStatus_SDmaMemReadErr_1_MSB 0x1C +#define QIB_7322_HwErrStatus_SDmaMemReadErr_1_RMASK 0x1 +#define QIB_7322_HwErrStatus_SDmaMemReadErr_0_LSB 0x1B +#define QIB_7322_HwErrStatus_SDmaMemReadErr_0_MSB 0x1B +#define QIB_7322_HwErrStatus_SDmaMemReadErr_0_RMASK 0x1 +#define QIB_7322_HwErrStatus_IBCBusFromSPCParityErr_1_LSB 0xF +#define QIB_7322_HwErrStatus_IBCBusFromSPCParityErr_1_MSB 0xF +#define QIB_7322_HwErrStatus_IBCBusFromSPCParityErr_1_RMASK 0x1 +#define QIB_7322_HwErrStatus_IBCBusToSPCParityErr_1_LSB 0xE +#define QIB_7322_HwErrStatus_IBCBusToSPCParityErr_1_MSB 0xE +#define QIB_7322_HwErrStatus_IBCBusToSPCParityErr_1_RMASK 0x1 +#define QIB_7322_HwErrStatus_IBCBusFromSPCParityErr_0_LSB 0xD +#define QIB_7322_HwErrStatus_IBCBusFromSPCParityErr_0_MSB 0xD +#define QIB_7322_HwErrStatus_IBCBusFromSPCParityErr_0_RMASK 0x1 +#define QIB_7322_HwErrStatus_statusValidNoEop_LSB 0xC +#define QIB_7322_HwErrStatus_statusValidNoEop_MSB 0xC +#define QIB_7322_HwErrStatus_statusValidNoEop_RMASK 0x1 +#define QIB_7322_HwErrStatus_LATriggered_LSB 0xB +#define QIB_7322_HwErrStatus_LATriggered_MSB 0xB +#define QIB_7322_HwErrStatus_LATriggered_RMASK 0x1 + +#define QIB_7322_HwErrClear_OFFS 0xA8 +#define QIB_7322_HwErrClear_DEF 0x0000000000000000 +#define QIB_7322_HwErrClear_IBSerdesPClkNotDetectClear_1_LSB 0x3F +#define QIB_7322_HwErrClear_IBSerdesPClkNotDetectClear_1_MSB 0x3F +#define QIB_7322_HwErrClear_IBSerdesPClkNotDetectClear_1_RMASK 0x1 +#define QIB_7322_HwErrClear_IBSerdesPClkNotDetectClear_0_LSB 0x3E +#define QIB_7322_HwErrClear_IBSerdesPClkNotDetectClear_0_MSB 0x3E +#define QIB_7322_HwErrClear_IBSerdesPClkNotDetectClear_0_RMASK 0x1 +#define QIB_7322_HwErrClear_PCIESerdesPClkNotDetectClear_LSB 0x37 +#define QIB_7322_HwErrClear_PCIESerdesPClkNotDetectClear_MSB 0x37 +#define QIB_7322_HwErrClear_PCIESerdesPClkNotDetectClear_RMASK 0x1 +#define QIB_7322_HwErrClear_PowerOnBISTFailedClear_LSB 0x36 +#define QIB_7322_HwErrClear_PowerOnBISTFailedClear_MSB 0x36 +#define QIB_7322_HwErrClear_PowerOnBISTFailedClear_RMASK 0x1 +#define QIB_7322_HwErrClear_TempsenseTholdReachedClear_LSB 0x35 +#define QIB_7322_HwErrClear_TempsenseTholdReachedClear_MSB 0x35 +#define QIB_7322_HwErrClear_TempsenseTholdReachedClear_RMASK 0x1 +#define QIB_7322_HwErrClear_MemoryErrClear_LSB 0x30 +#define QIB_7322_HwErrClear_MemoryErrClear_MSB 0x30 +#define QIB_7322_HwErrClear_MemoryErrClear_RMASK 0x1 +#define QIB_7322_HwErrClear_pcie_phy_txParityErr_LSB 0x22 +#define QIB_7322_HwErrClear_pcie_phy_txParityErr_MSB 0x22 +#define QIB_7322_HwErrClear_pcie_phy_txParityErr_RMASK 0x1 +#define QIB_7322_HwErrClear_PCIeBusParityClear_LSB 0x1F +#define QIB_7322_HwErrClear_PCIeBusParityClear_MSB 0x21 +#define QIB_7322_HwErrClear_PCIeBusParityClear_RMASK 0x7 +#define QIB_7322_HwErrClear_PcieCplTimeoutClear_LSB 0x1E +#define QIB_7322_HwErrClear_PcieCplTimeoutClear_MSB 0x1E +#define QIB_7322_HwErrClear_PcieCplTimeoutClear_RMASK 0x1 +#define QIB_7322_HwErrClear_PciePoisonedTLPClear_LSB 0x1D +#define QIB_7322_HwErrClear_PciePoisonedTLPClear_MSB 0x1D +#define QIB_7322_HwErrClear_PciePoisonedTLPClear_RMASK 0x1 +#define QIB_7322_HwErrClear_SDmaMemReadErrClear_1_LSB 0x1C +#define QIB_7322_HwErrClear_SDmaMemReadErrClear_1_MSB 0x1C +#define QIB_7322_HwErrClear_SDmaMemReadErrClear_1_RMASK 0x1 +#define QIB_7322_HwErrClear_SDmaMemReadErrClear_0_LSB 0x1B +#define QIB_7322_HwErrClear_SDmaMemReadErrClear_0_MSB 0x1B +#define QIB_7322_HwErrClear_SDmaMemReadErrClear_0_RMASK 0x1 +#define QIB_7322_HwErrClear_IBCBusFromSPCParityErrClear_1_LSB 0xF +#define QIB_7322_HwErrClear_IBCBusFromSPCParityErrClear_1_MSB 0xF +#define QIB_7322_HwErrClear_IBCBusFromSPCParityErrClear_1_RMASK 0x1 +#define QIB_7322_HwErrClear_IBCBusToSPCParityErrClear_1_LSB 0xE +#define QIB_7322_HwErrClear_IBCBusToSPCParityErrClear_1_MSB 0xE +#define QIB_7322_HwErrClear_IBCBusToSPCParityErrClear_1_RMASK 0x1 +#define QIB_7322_HwErrClear_IBCBusFromSPCParityErrClear_0_LSB 0xD +#define QIB_7322_HwErrClear_IBCBusFromSPCParityErrClear_0_MSB 0xD +#define QIB_7322_HwErrClear_IBCBusFromSPCParityErrClear_0_RMASK 0x1 +#define QIB_7322_HwErrClear_statusValidNoEopClear_LSB 0xC +#define QIB_7322_HwErrClear_statusValidNoEopClear_MSB 0xC +#define QIB_7322_HwErrClear_statusValidNoEopClear_RMASK 0x1 +#define QIB_7322_HwErrClear_LATriggeredClear_LSB 0xB +#define QIB_7322_HwErrClear_LATriggeredClear_MSB 0xB +#define QIB_7322_HwErrClear_LATriggeredClear_RMASK 0x1 + +#define QIB_7322_HwDiagCtrl_OFFS 0xB0 +#define QIB_7322_HwDiagCtrl_DEF 0x0000000000000000 +#define QIB_7322_HwDiagCtrl_Diagnostic_LSB 0x3F +#define QIB_7322_HwDiagCtrl_Diagnostic_MSB 0x3F +#define QIB_7322_HwDiagCtrl_Diagnostic_RMASK 0x1 +#define QIB_7322_HwDiagCtrl_CounterWrEnable_LSB 0x3D +#define QIB_7322_HwDiagCtrl_CounterWrEnable_MSB 0x3D +#define QIB_7322_HwDiagCtrl_CounterWrEnable_RMASK 0x1 +#define QIB_7322_HwDiagCtrl_CounterDisable_LSB 0x3C +#define QIB_7322_HwDiagCtrl_CounterDisable_MSB 0x3C +#define QIB_7322_HwDiagCtrl_CounterDisable_RMASK 0x1 +#define QIB_7322_HwDiagCtrl_forcePCIeBusParity_LSB 0x1F +#define QIB_7322_HwDiagCtrl_forcePCIeBusParity_MSB 0x22 +#define QIB_7322_HwDiagCtrl_forcePCIeBusParity_RMASK 0xF +#define QIB_7322_HwDiagCtrl_ForceIBCBusFromSPCParityErr_1_LSB 0xF +#define QIB_7322_HwDiagCtrl_ForceIBCBusFromSPCParityErr_1_MSB 0xF +#define QIB_7322_HwDiagCtrl_ForceIBCBusFromSPCParityErr_1_RMASK 0x1 +#define QIB_7322_HwDiagCtrl_ForceIBCBusToSPCParityErr_1_LSB 0xE +#define QIB_7322_HwDiagCtrl_ForceIBCBusToSPCParityErr_1_MSB 0xE +#define QIB_7322_HwDiagCtrl_ForceIBCBusToSPCParityErr_1_RMASK 0x1 +#define QIB_7322_HwDiagCtrl_ForceIBCBusFromSPCParityErr_0_LSB 0xD +#define QIB_7322_HwDiagCtrl_ForceIBCBusFromSPCParityErr_0_MSB 0xD +#define QIB_7322_HwDiagCtrl_ForceIBCBusFromSPCParityErr_0_RMASK 0x1 +#define QIB_7322_HwDiagCtrl_ForceIBCBusToSPCParityErr_0_LSB 0xC +#define QIB_7322_HwDiagCtrl_ForceIBCBusToSPCParityErr_0_MSB 0xC +#define QIB_7322_HwDiagCtrl_ForceIBCBusToSPCParityErr_0_RMASK 0x1 + +#define QIB_7322_EXTStatus_OFFS 0xC0 +#define QIB_7322_EXTStatus_DEF 0x000000000000X000 +#define QIB_7322_EXTStatus_GPIOIn_LSB 0x30 +#define QIB_7322_EXTStatus_GPIOIn_MSB 0x3F +#define QIB_7322_EXTStatus_GPIOIn_RMASK 0xFFFF +#define QIB_7322_EXTStatus_MemBISTDisabled_LSB 0xF +#define QIB_7322_EXTStatus_MemBISTDisabled_MSB 0xF +#define QIB_7322_EXTStatus_MemBISTDisabled_RMASK 0x1 +#define QIB_7322_EXTStatus_MemBISTEndTest_LSB 0xE +#define QIB_7322_EXTStatus_MemBISTEndTest_MSB 0xE +#define QIB_7322_EXTStatus_MemBISTEndTest_RMASK 0x1 + +#define QIB_7322_EXTCtrl_OFFS 0xC8 +#define QIB_7322_EXTCtrl_DEF 0x0000000000000000 +#define QIB_7322_EXTCtrl_GPIOOe_LSB 0x30 +#define QIB_7322_EXTCtrl_GPIOOe_MSB 0x3F +#define QIB_7322_EXTCtrl_GPIOOe_RMASK 0xFFFF +#define QIB_7322_EXTCtrl_GPIOInvert_LSB 0x20 +#define QIB_7322_EXTCtrl_GPIOInvert_MSB 0x2F +#define QIB_7322_EXTCtrl_GPIOInvert_RMASK 0xFFFF +#define QIB_7322_EXTCtrl_LEDPort1GreenOn_LSB 0x3 +#define QIB_7322_EXTCtrl_LEDPort1GreenOn_MSB 0x3 +#define QIB_7322_EXTCtrl_LEDPort1GreenOn_RMASK 0x1 +#define QIB_7322_EXTCtrl_LEDPort1YellowOn_LSB 0x2 +#define QIB_7322_EXTCtrl_LEDPort1YellowOn_MSB 0x2 +#define QIB_7322_EXTCtrl_LEDPort1YellowOn_RMASK 0x1 +#define QIB_7322_EXTCtrl_LEDPort0GreenOn_LSB 0x1 +#define QIB_7322_EXTCtrl_LEDPort0GreenOn_MSB 0x1 +#define QIB_7322_EXTCtrl_LEDPort0GreenOn_RMASK 0x1 +#define QIB_7322_EXTCtrl_LEDPort0YellowOn_LSB 0x0 +#define QIB_7322_EXTCtrl_LEDPort0YellowOn_MSB 0x0 +#define QIB_7322_EXTCtrl_LEDPort0YellowOn_RMASK 0x1 + +#define QIB_7322_GPIOOut_OFFS 0xE0 +#define QIB_7322_GPIOOut_DEF 0x0000000000000000 + +#define QIB_7322_GPIOMask_OFFS 0xE8 +#define QIB_7322_GPIOMask_DEF 0x0000000000000000 + +#define QIB_7322_GPIOStatus_OFFS 0xF0 +#define QIB_7322_GPIOStatus_DEF 0x0000000000000000 + +#define QIB_7322_GPIOClear_OFFS 0xF8 +#define QIB_7322_GPIOClear_DEF 0x0000000000000000 + +#define QIB_7322_RcvCtrl_OFFS 0x100 +#define QIB_7322_RcvCtrl_DEF 0x0000000000000000 +#define QIB_7322_RcvCtrl_TidReDirect_LSB 0x30 +#define QIB_7322_RcvCtrl_TidReDirect_MSB 0x3F +#define QIB_7322_RcvCtrl_TidReDirect_RMASK 0xFFFF +#define QIB_7322_RcvCtrl_TailUpd_LSB 0x2F +#define QIB_7322_RcvCtrl_TailUpd_MSB 0x2F +#define QIB_7322_RcvCtrl_TailUpd_RMASK 0x1 +#define QIB_7322_RcvCtrl_XrcTypeCode_LSB 0x2C +#define QIB_7322_RcvCtrl_XrcTypeCode_MSB 0x2E +#define QIB_7322_RcvCtrl_XrcTypeCode_RMASK 0x7 +#define QIB_7322_RcvCtrl_TidFlowEnable_LSB 0x2B +#define QIB_7322_RcvCtrl_TidFlowEnable_MSB 0x2B +#define QIB_7322_RcvCtrl_TidFlowEnable_RMASK 0x1 +#define QIB_7322_RcvCtrl_ContextCfg_LSB 0x29 +#define QIB_7322_RcvCtrl_ContextCfg_MSB 0x2A +#define QIB_7322_RcvCtrl_ContextCfg_RMASK 0x3 +#define QIB_7322_RcvCtrl_IntrAvail_LSB 0x14 +#define QIB_7322_RcvCtrl_IntrAvail_MSB 0x25 +#define QIB_7322_RcvCtrl_IntrAvail_RMASK 0x3FFFF +#define QIB_7322_RcvCtrl_dontDropRHQFull_LSB 0x0 +#define QIB_7322_RcvCtrl_dontDropRHQFull_MSB 0x11 +#define QIB_7322_RcvCtrl_dontDropRHQFull_RMASK 0x3FFFF + +#define QIB_7322_RcvHdrSize_OFFS 0x110 +#define QIB_7322_RcvHdrSize_DEF 0x0000000000000000 + +#define QIB_7322_RcvHdrCnt_OFFS 0x118 +#define QIB_7322_RcvHdrCnt_DEF 0x0000000000000000 + +#define QIB_7322_RcvHdrEntSize_OFFS 0x120 +#define QIB_7322_RcvHdrEntSize_DEF 0x0000000000000000 + +#define QIB_7322_RcvTIDBase_OFFS 0x128 +#define QIB_7322_RcvTIDBase_DEF 0x0000000000050000 + +#define QIB_7322_RcvTIDCnt_OFFS 0x130 +#define QIB_7322_RcvTIDCnt_DEF 0x0000000000000200 + +#define QIB_7322_RcvEgrBase_OFFS 0x138 +#define QIB_7322_RcvEgrBase_DEF 0x0000000000014000 + +#define QIB_7322_RcvEgrCnt_OFFS 0x140 +#define QIB_7322_RcvEgrCnt_DEF 0x0000000000001000 + +#define QIB_7322_RcvBufBase_OFFS 0x148 +#define QIB_7322_RcvBufBase_DEF 0x0000000000080000 + +#define QIB_7322_RcvBufSize_OFFS 0x150 +#define QIB_7322_RcvBufSize_DEF 0x0000000000005000 + +#define QIB_7322_RxIntMemBase_OFFS 0x158 +#define QIB_7322_RxIntMemBase_DEF 0x0000000000077000 + +#define QIB_7322_RxIntMemSize_OFFS 0x160 +#define QIB_7322_RxIntMemSize_DEF 0x0000000000007000 + +#define QIB_7322_feature_mask_OFFS 0x190 +#define QIB_7322_feature_mask_DEF 0x00000000000000XX + +#define QIB_7322_active_feature_mask_OFFS 0x198 +#define QIB_7322_active_feature_mask_DEF 0x00000000000000XX +#define QIB_7322_active_feature_mask_Port1_QDR_Enabled_LSB 0x5 +#define QIB_7322_active_feature_mask_Port1_QDR_Enabled_MSB 0x5 +#define QIB_7322_active_feature_mask_Port1_QDR_Enabled_RMASK 0x1 +#define QIB_7322_active_feature_mask_Port1_DDR_Enabled_LSB 0x4 +#define QIB_7322_active_feature_mask_Port1_DDR_Enabled_MSB 0x4 +#define QIB_7322_active_feature_mask_Port1_DDR_Enabled_RMASK 0x1 +#define QIB_7322_active_feature_mask_Port1_SDR_Enabled_LSB 0x3 +#define QIB_7322_active_feature_mask_Port1_SDR_Enabled_MSB 0x3 +#define QIB_7322_active_feature_mask_Port1_SDR_Enabled_RMASK 0x1 +#define QIB_7322_active_feature_mask_Port0_QDR_Enabled_LSB 0x2 +#define QIB_7322_active_feature_mask_Port0_QDR_Enabled_MSB 0x2 +#define QIB_7322_active_feature_mask_Port0_QDR_Enabled_RMASK 0x1 +#define QIB_7322_active_feature_mask_Port0_DDR_Enabled_LSB 0x1 +#define QIB_7322_active_feature_mask_Port0_DDR_Enabled_MSB 0x1 +#define QIB_7322_active_feature_mask_Port0_DDR_Enabled_RMASK 0x1 +#define QIB_7322_active_feature_mask_Port0_SDR_Enabled_LSB 0x0 +#define QIB_7322_active_feature_mask_Port0_SDR_Enabled_MSB 0x0 +#define QIB_7322_active_feature_mask_Port0_SDR_Enabled_RMASK 0x1 + +#define QIB_7322_SendCtrl_OFFS 0x1C0 +#define QIB_7322_SendCtrl_DEF 0x0000000000000000 +#define QIB_7322_SendCtrl_Disarm_LSB 0x1F +#define QIB_7322_SendCtrl_Disarm_MSB 0x1F +#define QIB_7322_SendCtrl_Disarm_RMASK 0x1 +#define QIB_7322_SendCtrl_SendBufAvailPad64Byte_LSB 0x1D +#define QIB_7322_SendCtrl_SendBufAvailPad64Byte_MSB 0x1D +#define QIB_7322_SendCtrl_SendBufAvailPad64Byte_RMASK 0x1 +#define QIB_7322_SendCtrl_AvailUpdThld_LSB 0x18 +#define QIB_7322_SendCtrl_AvailUpdThld_MSB 0x1C +#define QIB_7322_SendCtrl_AvailUpdThld_RMASK 0x1F +#define QIB_7322_SendCtrl_DisarmSendBuf_LSB 0x10 +#define QIB_7322_SendCtrl_DisarmSendBuf_MSB 0x17 +#define QIB_7322_SendCtrl_DisarmSendBuf_RMASK 0xFF +#define QIB_7322_SendCtrl_SpecialTriggerEn_LSB 0x4 +#define QIB_7322_SendCtrl_SpecialTriggerEn_MSB 0x4 +#define QIB_7322_SendCtrl_SpecialTriggerEn_RMASK 0x1 +#define QIB_7322_SendCtrl_SendBufAvailUpd_LSB 0x2 +#define QIB_7322_SendCtrl_SendBufAvailUpd_MSB 0x2 +#define QIB_7322_SendCtrl_SendBufAvailUpd_RMASK 0x1 +#define QIB_7322_SendCtrl_SendIntBufAvail_LSB 0x1 +#define QIB_7322_SendCtrl_SendIntBufAvail_MSB 0x1 +#define QIB_7322_SendCtrl_SendIntBufAvail_RMASK 0x1 + +#define QIB_7322_SendBufBase_OFFS 0x1C8 +#define QIB_7322_SendBufBase_DEF 0x0018000000100000 +#define QIB_7322_SendBufBase_BaseAddr_LargePIO_LSB 0x20 +#define QIB_7322_SendBufBase_BaseAddr_LargePIO_MSB 0x34 +#define QIB_7322_SendBufBase_BaseAddr_LargePIO_RMASK 0x1FFFFF +#define QIB_7322_SendBufBase_BaseAddr_SmallPIO_LSB 0x0 +#define QIB_7322_SendBufBase_BaseAddr_SmallPIO_MSB 0x14 +#define QIB_7322_SendBufBase_BaseAddr_SmallPIO_RMASK 0x1FFFFF + +#define QIB_7322_SendBufSize_OFFS 0x1D0 +#define QIB_7322_SendBufSize_DEF 0x0000108000000880 +#define QIB_7322_SendBufSize_Size_LargePIO_LSB 0x20 +#define QIB_7322_SendBufSize_Size_LargePIO_MSB 0x2C +#define QIB_7322_SendBufSize_Size_LargePIO_RMASK 0x1FFF +#define QIB_7322_SendBufSize_Size_SmallPIO_LSB 0x0 +#define QIB_7322_SendBufSize_Size_SmallPIO_MSB 0xB +#define QIB_7322_SendBufSize_Size_SmallPIO_RMASK 0xFFF + +#define QIB_7322_SendBufCnt_OFFS 0x1D8 +#define QIB_7322_SendBufCnt_DEF 0x0000002000000080 +#define QIB_7322_SendBufCnt_Num_LargeBuffers_LSB 0x20 +#define QIB_7322_SendBufCnt_Num_LargeBuffers_MSB 0x25 +#define QIB_7322_SendBufCnt_Num_LargeBuffers_RMASK 0x3F +#define QIB_7322_SendBufCnt_Num_SmallBuffers_LSB 0x0 +#define QIB_7322_SendBufCnt_Num_SmallBuffers_MSB 0x8 +#define QIB_7322_SendBufCnt_Num_SmallBuffers_RMASK 0x1FF + +#define QIB_7322_SendBufAvailAddr_OFFS 0x1E0 +#define QIB_7322_SendBufAvailAddr_DEF 0x0000000000000000 +#define QIB_7322_SendBufAvailAddr_SendBufAvailAddr_LSB 0x6 +#define QIB_7322_SendBufAvailAddr_SendBufAvailAddr_MSB 0x27 +#define QIB_7322_SendBufAvailAddr_SendBufAvailAddr_RMASK 0x3FFFFFFFF + +#define QIB_7322_SendBufErr0_OFFS 0x240 +#define QIB_7322_SendBufErr0_DEF 0x0000000000000000 +#define QIB_7322_SendBufErr0_SendBufErr_63_0_LSB 0x0 +#define QIB_7322_SendBufErr0_SendBufErr_63_0_MSB 0x3F +#define QIB_7322_SendBufErr0_SendBufErr_63_0_RMASK 0x0 + +#define QIB_7322_AvailUpdCount_OFFS 0x268 +#define QIB_7322_AvailUpdCount_DEF 0x0000000000000000 +#define QIB_7322_AvailUpdCount_AvailUpdCount_LSB 0x0 +#define QIB_7322_AvailUpdCount_AvailUpdCount_MSB 0x4 +#define QIB_7322_AvailUpdCount_AvailUpdCount_RMASK 0x1F + +#define QIB_7322_RcvHdrAddr0_OFFS 0x280 +#define QIB_7322_RcvHdrAddr0_DEF 0x0000000000000000 +#define QIB_7322_RcvHdrAddr0_RcvHdrAddr_LSB 0x2 +#define QIB_7322_RcvHdrAddr0_RcvHdrAddr_MSB 0x27 +#define QIB_7322_RcvHdrAddr0_RcvHdrAddr_RMASK 0x3FFFFFFFFF + +#define QIB_7322_RcvHdrTailAddr0_OFFS 0x340 +#define QIB_7322_RcvHdrTailAddr0_DEF 0x0000000000000000 +#define QIB_7322_RcvHdrTailAddr0_RcvHdrTailAddr_LSB 0x2 +#define QIB_7322_RcvHdrTailAddr0_RcvHdrTailAddr_MSB 0x27 +#define QIB_7322_RcvHdrTailAddr0_RcvHdrTailAddr_RMASK 0x3FFFFFFFFF + +#define QIB_7322_ahb_access_ctrl_OFFS 0x460 +#define QIB_7322_ahb_access_ctrl_DEF 0x0000000000000000 +#define QIB_7322_ahb_access_ctrl_sw_sel_ahb_trgt_LSB 0x1 +#define QIB_7322_ahb_access_ctrl_sw_sel_ahb_trgt_MSB 0x2 +#define QIB_7322_ahb_access_ctrl_sw_sel_ahb_trgt_RMASK 0x3 +#define QIB_7322_ahb_access_ctrl_sw_ahb_sel_LSB 0x0 +#define QIB_7322_ahb_access_ctrl_sw_ahb_sel_MSB 0x0 +#define QIB_7322_ahb_access_ctrl_sw_ahb_sel_RMASK 0x1 + +#define QIB_7322_ahb_transaction_reg_OFFS 0x468 +#define QIB_7322_ahb_transaction_reg_DEF 0x0000000080000000 +#define QIB_7322_ahb_transaction_reg_ahb_data_LSB 0x20 +#define QIB_7322_ahb_transaction_reg_ahb_data_MSB 0x3F +#define QIB_7322_ahb_transaction_reg_ahb_data_RMASK 0xFFFFFFFF +#define QIB_7322_ahb_transaction_reg_ahb_rdy_LSB 0x1F +#define QIB_7322_ahb_transaction_reg_ahb_rdy_MSB 0x1F +#define QIB_7322_ahb_transaction_reg_ahb_rdy_RMASK 0x1 +#define QIB_7322_ahb_transaction_reg_ahb_req_err_LSB 0x1E +#define QIB_7322_ahb_transaction_reg_ahb_req_err_MSB 0x1E +#define QIB_7322_ahb_transaction_reg_ahb_req_err_RMASK 0x1 +#define QIB_7322_ahb_transaction_reg_write_not_read_LSB 0x1B +#define QIB_7322_ahb_transaction_reg_write_not_read_MSB 0x1B +#define QIB_7322_ahb_transaction_reg_write_not_read_RMASK 0x1 +#define QIB_7322_ahb_transaction_reg_ahb_address_LSB 0x10 +#define QIB_7322_ahb_transaction_reg_ahb_address_MSB 0x1A +#define QIB_7322_ahb_transaction_reg_ahb_address_RMASK 0x7FF + +#define QIB_7322_SPC_JTAG_ACCESS_REG_OFFS 0x470 +#define QIB_7322_SPC_JTAG_ACCESS_REG_DEF 0x0000000000000001 +#define QIB_7322_SPC_JTAG_ACCESS_REG_SPC_JTAG_ACCESS_EN_LSB 0xA +#define QIB_7322_SPC_JTAG_ACCESS_REG_SPC_JTAG_ACCESS_EN_MSB 0xA +#define QIB_7322_SPC_JTAG_ACCESS_REG_SPC_JTAG_ACCESS_EN_RMASK 0x1 +#define QIB_7322_SPC_JTAG_ACCESS_REG_bist_en_LSB 0x5 +#define QIB_7322_SPC_JTAG_ACCESS_REG_bist_en_MSB 0x9 +#define QIB_7322_SPC_JTAG_ACCESS_REG_bist_en_RMASK 0x1F +#define QIB_7322_SPC_JTAG_ACCESS_REG_opcode_LSB 0x3 +#define QIB_7322_SPC_JTAG_ACCESS_REG_opcode_MSB 0x4 +#define QIB_7322_SPC_JTAG_ACCESS_REG_opcode_RMASK 0x3 +#define QIB_7322_SPC_JTAG_ACCESS_REG_tdi_LSB 0x2 +#define QIB_7322_SPC_JTAG_ACCESS_REG_tdi_MSB 0x2 +#define QIB_7322_SPC_JTAG_ACCESS_REG_tdi_RMASK 0x1 +#define QIB_7322_SPC_JTAG_ACCESS_REG_tdo_LSB 0x1 +#define QIB_7322_SPC_JTAG_ACCESS_REG_tdo_MSB 0x1 +#define QIB_7322_SPC_JTAG_ACCESS_REG_tdo_RMASK 0x1 +#define QIB_7322_SPC_JTAG_ACCESS_REG_rdy_LSB 0x0 +#define QIB_7322_SPC_JTAG_ACCESS_REG_rdy_MSB 0x0 +#define QIB_7322_SPC_JTAG_ACCESS_REG_rdy_RMASK 0x1 + +#define QIB_7322_SendCheckMask0_OFFS 0x4C0 +#define QIB_7322_SendCheckMask0_DEF 0x0000000000000000 +#define QIB_7322_SendCheckMask0_SendCheckMask_63_32_LSB 0x0 +#define QIB_7322_SendCheckMask0_SendCheckMask_63_32_MSB 0x3F +#define QIB_7322_SendCheckMask0_SendCheckMask_63_32_RMASK 0x0 + +#define QIB_7322_SendGRHCheckMask0_OFFS 0x4E0 +#define QIB_7322_SendGRHCheckMask0_DEF 0x0000000000000000 +#define QIB_7322_SendGRHCheckMask0_SendGRHCheckMask_63_32_LSB 0x0 +#define QIB_7322_SendGRHCheckMask0_SendGRHCheckMask_63_32_MSB 0x3F +#define QIB_7322_SendGRHCheckMask0_SendGRHCheckMask_63_32_RMASK 0x0 + +#define QIB_7322_SendIBPacketMask0_OFFS 0x500 +#define QIB_7322_SendIBPacketMask0_DEF 0x0000000000000000 +#define QIB_7322_SendIBPacketMask0_SendIBPacketMask_63_32_LSB 0x0 +#define QIB_7322_SendIBPacketMask0_SendIBPacketMask_63_32_MSB 0x3F +#define QIB_7322_SendIBPacketMask0_SendIBPacketMask_63_32_RMASK 0x0 + +#define QIB_7322_IntRedirect0_OFFS 0x540 +#define QIB_7322_IntRedirect0_DEF 0x0000000000000000 +#define QIB_7322_IntRedirect0_vec11_LSB 0x37 +#define QIB_7322_IntRedirect0_vec11_MSB 0x3B +#define QIB_7322_IntRedirect0_vec11_RMASK 0x1F +#define QIB_7322_IntRedirect0_vec10_LSB 0x32 +#define QIB_7322_IntRedirect0_vec10_MSB 0x36 +#define QIB_7322_IntRedirect0_vec10_RMASK 0x1F +#define QIB_7322_IntRedirect0_vec9_LSB 0x2D +#define QIB_7322_IntRedirect0_vec9_MSB 0x31 +#define QIB_7322_IntRedirect0_vec9_RMASK 0x1F +#define QIB_7322_IntRedirect0_vec8_LSB 0x28 +#define QIB_7322_IntRedirect0_vec8_MSB 0x2C +#define QIB_7322_IntRedirect0_vec8_RMASK 0x1F +#define QIB_7322_IntRedirect0_vec7_LSB 0x23 +#define QIB_7322_IntRedirect0_vec7_MSB 0x27 +#define QIB_7322_IntRedirect0_vec7_RMASK 0x1F +#define QIB_7322_IntRedirect0_vec6_LSB 0x1E +#define QIB_7322_IntRedirect0_vec6_MSB 0x22 +#define QIB_7322_IntRedirect0_vec6_RMASK 0x1F +#define QIB_7322_IntRedirect0_vec5_LSB 0x19 +#define QIB_7322_IntRedirect0_vec5_MSB 0x1D +#define QIB_7322_IntRedirect0_vec5_RMASK 0x1F +#define QIB_7322_IntRedirect0_vec4_LSB 0x14 +#define QIB_7322_IntRedirect0_vec4_MSB 0x18 +#define QIB_7322_IntRedirect0_vec4_RMASK 0x1F +#define QIB_7322_IntRedirect0_vec3_LSB 0xF +#define QIB_7322_IntRedirect0_vec3_MSB 0x13 +#define QIB_7322_IntRedirect0_vec3_RMASK 0x1F +#define QIB_7322_IntRedirect0_vec2_LSB 0xA +#define QIB_7322_IntRedirect0_vec2_MSB 0xE +#define QIB_7322_IntRedirect0_vec2_RMASK 0x1F +#define QIB_7322_IntRedirect0_vec1_LSB 0x5 +#define QIB_7322_IntRedirect0_vec1_MSB 0x9 +#define QIB_7322_IntRedirect0_vec1_RMASK 0x1F +#define QIB_7322_IntRedirect0_vec0_LSB 0x0 +#define QIB_7322_IntRedirect0_vec0_MSB 0x4 +#define QIB_7322_IntRedirect0_vec0_RMASK 0x1F + +#define QIB_7322_Int_Granted_OFFS 0x570 +#define QIB_7322_Int_Granted_DEF 0x0000000000000000 + +#define QIB_7322_vec_clr_without_int_OFFS 0x578 +#define QIB_7322_vec_clr_without_int_DEF 0x0000000000000000 + +#define QIB_7322_DCACtrlA_OFFS 0x580 +#define QIB_7322_DCACtrlA_DEF 0x0000000000000000 +#define QIB_7322_DCACtrlA_SendDMAHead1DCAEnable_LSB 0x4 +#define QIB_7322_DCACtrlA_SendDMAHead1DCAEnable_MSB 0x4 +#define QIB_7322_DCACtrlA_SendDMAHead1DCAEnable_RMASK 0x1 +#define QIB_7322_DCACtrlA_SendDMAHead0DCAEnable_LSB 0x3 +#define QIB_7322_DCACtrlA_SendDMAHead0DCAEnable_MSB 0x3 +#define QIB_7322_DCACtrlA_SendDMAHead0DCAEnable_RMASK 0x1 +#define QIB_7322_DCACtrlA_RcvTailUpdDCAEnable_LSB 0x2 +#define QIB_7322_DCACtrlA_RcvTailUpdDCAEnable_MSB 0x2 +#define QIB_7322_DCACtrlA_RcvTailUpdDCAEnable_RMASK 0x1 +#define QIB_7322_DCACtrlA_EagerDCAEnable_LSB 0x1 +#define QIB_7322_DCACtrlA_EagerDCAEnable_MSB 0x1 +#define QIB_7322_DCACtrlA_EagerDCAEnable_RMASK 0x1 +#define QIB_7322_DCACtrlA_RcvHdrqDCAEnable_LSB 0x0 +#define QIB_7322_DCACtrlA_RcvHdrqDCAEnable_MSB 0x0 +#define QIB_7322_DCACtrlA_RcvHdrqDCAEnable_RMASK 0x1 + +#define QIB_7322_DCACtrlB_OFFS 0x588 +#define QIB_7322_DCACtrlB_DEF 0x0000000000000000 +#define QIB_7322_DCACtrlB_RcvHdrq3DCAXfrCnt_LSB 0x36 +#define QIB_7322_DCACtrlB_RcvHdrq3DCAXfrCnt_MSB 0x3B +#define QIB_7322_DCACtrlB_RcvHdrq3DCAXfrCnt_RMASK 0x3F +#define QIB_7322_DCACtrlB_RcvHdrq3DCAOPH_LSB 0x2E +#define QIB_7322_DCACtrlB_RcvHdrq3DCAOPH_MSB 0x35 +#define QIB_7322_DCACtrlB_RcvHdrq3DCAOPH_RMASK 0xFF +#define QIB_7322_DCACtrlB_RcvHdrq2DCAXfrCnt_LSB 0x28 +#define QIB_7322_DCACtrlB_RcvHdrq2DCAXfrCnt_MSB 0x2D +#define QIB_7322_DCACtrlB_RcvHdrq2DCAXfrCnt_RMASK 0x3F +#define QIB_7322_DCACtrlB_RcvHdrq2DCAOPH_LSB 0x20 +#define QIB_7322_DCACtrlB_RcvHdrq2DCAOPH_MSB 0x27 +#define QIB_7322_DCACtrlB_RcvHdrq2DCAOPH_RMASK 0xFF +#define QIB_7322_DCACtrlB_RcvHdrq1DCAXfrCnt_LSB 0x16 +#define QIB_7322_DCACtrlB_RcvHdrq1DCAXfrCnt_MSB 0x1B +#define QIB_7322_DCACtrlB_RcvHdrq1DCAXfrCnt_RMASK 0x3F +#define QIB_7322_DCACtrlB_RcvHdrq1DCAOPH_LSB 0xE +#define QIB_7322_DCACtrlB_RcvHdrq1DCAOPH_MSB 0x15 +#define QIB_7322_DCACtrlB_RcvHdrq1DCAOPH_RMASK 0xFF +#define QIB_7322_DCACtrlB_RcvHdrq0DCAXfrCnt_LSB 0x8 +#define QIB_7322_DCACtrlB_RcvHdrq0DCAXfrCnt_MSB 0xD +#define QIB_7322_DCACtrlB_RcvHdrq0DCAXfrCnt_RMASK 0x3F +#define QIB_7322_DCACtrlB_RcvHdrq0DCAOPH_LSB 0x0 +#define QIB_7322_DCACtrlB_RcvHdrq0DCAOPH_MSB 0x7 +#define QIB_7322_DCACtrlB_RcvHdrq0DCAOPH_RMASK 0xFF + +#define QIB_7322_DCACtrlC_OFFS 0x590 +#define QIB_7322_DCACtrlC_DEF 0x0000000000000000 +#define QIB_7322_DCACtrlC_RcvHdrq7DCAXfrCnt_LSB 0x36 +#define QIB_7322_DCACtrlC_RcvHdrq7DCAXfrCnt_MSB 0x3B +#define QIB_7322_DCACtrlC_RcvHdrq7DCAXfrCnt_RMASK 0x3F +#define QIB_7322_DCACtrlC_RcvHdrq7DCAOPH_LSB 0x2E +#define QIB_7322_DCACtrlC_RcvHdrq7DCAOPH_MSB 0x35 +#define QIB_7322_DCACtrlC_RcvHdrq7DCAOPH_RMASK 0xFF +#define QIB_7322_DCACtrlC_RcvHdrq6DCAXfrCnt_LSB 0x28 +#define QIB_7322_DCACtrlC_RcvHdrq6DCAXfrCnt_MSB 0x2D +#define QIB_7322_DCACtrlC_RcvHdrq6DCAXfrCnt_RMASK 0x3F +#define QIB_7322_DCACtrlC_RcvHdrq6DCAOPH_LSB 0x20 +#define QIB_7322_DCACtrlC_RcvHdrq6DCAOPH_MSB 0x27 +#define QIB_7322_DCACtrlC_RcvHdrq6DCAOPH_RMASK 0xFF +#define QIB_7322_DCACtrlC_RcvHdrq5DCAXfrCnt_LSB 0x16 +#define QIB_7322_DCACtrlC_RcvHdrq5DCAXfrCnt_MSB 0x1B +#define QIB_7322_DCACtrlC_RcvHdrq5DCAXfrCnt_RMASK 0x3F +#define QIB_7322_DCACtrlC_RcvHdrq5DCAOPH_LSB 0xE +#define QIB_7322_DCACtrlC_RcvHdrq5DCAOPH_MSB 0x15 +#define QIB_7322_DCACtrlC_RcvHdrq5DCAOPH_RMASK 0xFF +#define QIB_7322_DCACtrlC_RcvHdrq4DCAXfrCnt_LSB 0x8 +#define QIB_7322_DCACtrlC_RcvHdrq4DCAXfrCnt_MSB 0xD +#define QIB_7322_DCACtrlC_RcvHdrq4DCAXfrCnt_RMASK 0x3F +#define QIB_7322_DCACtrlC_RcvHdrq4DCAOPH_LSB 0x0 +#define QIB_7322_DCACtrlC_RcvHdrq4DCAOPH_MSB 0x7 +#define QIB_7322_DCACtrlC_RcvHdrq4DCAOPH_RMASK 0xFF + +#define QIB_7322_DCACtrlD_OFFS 0x598 +#define QIB_7322_DCACtrlD_DEF 0x0000000000000000 +#define QIB_7322_DCACtrlD_RcvHdrq11DCAXfrCnt_LSB 0x36 +#define QIB_7322_DCACtrlD_RcvHdrq11DCAXfrCnt_MSB 0x3B +#define QIB_7322_DCACtrlD_RcvHdrq11DCAXfrCnt_RMASK 0x3F +#define QIB_7322_DCACtrlD_RcvHdrq11DCAOPH_LSB 0x2E +#define QIB_7322_DCACtrlD_RcvHdrq11DCAOPH_MSB 0x35 +#define QIB_7322_DCACtrlD_RcvHdrq11DCAOPH_RMASK 0xFF +#define QIB_7322_DCACtrlD_RcvHdrq10DCAXfrCnt_LSB 0x28 +#define QIB_7322_DCACtrlD_RcvHdrq10DCAXfrCnt_MSB 0x2D +#define QIB_7322_DCACtrlD_RcvHdrq10DCAXfrCnt_RMASK 0x3F +#define QIB_7322_DCACtrlD_RcvHdrq10DCAOPH_LSB 0x20 +#define QIB_7322_DCACtrlD_RcvHdrq10DCAOPH_MSB 0x27 +#define QIB_7322_DCACtrlD_RcvHdrq10DCAOPH_RMASK 0xFF +#define QIB_7322_DCACtrlD_RcvHdrq9DCAXfrCnt_LSB 0x16 +#define QIB_7322_DCACtrlD_RcvHdrq9DCAXfrCnt_MSB 0x1B +#define QIB_7322_DCACtrlD_RcvHdrq9DCAXfrCnt_RMASK 0x3F +#define QIB_7322_DCACtrlD_RcvHdrq9DCAOPH_LSB 0xE +#define QIB_7322_DCACtrlD_RcvHdrq9DCAOPH_MSB 0x15 +#define QIB_7322_DCACtrlD_RcvHdrq9DCAOPH_RMASK 0xFF +#define QIB_7322_DCACtrlD_RcvHdrq8DCAXfrCnt_LSB 0x8 +#define QIB_7322_DCACtrlD_RcvHdrq8DCAXfrCnt_MSB 0xD +#define QIB_7322_DCACtrlD_RcvHdrq8DCAXfrCnt_RMASK 0x3F +#define QIB_7322_DCACtrlD_RcvHdrq8DCAOPH_LSB 0x0 +#define QIB_7322_DCACtrlD_RcvHdrq8DCAOPH_MSB 0x7 +#define QIB_7322_DCACtrlD_RcvHdrq8DCAOPH_RMASK 0xFF + +#define QIB_7322_DCACtrlE_OFFS 0x5A0 +#define QIB_7322_DCACtrlE_DEF 0x0000000000000000 +#define QIB_7322_DCACtrlE_RcvHdrq15DCAXfrCnt_LSB 0x36 +#define QIB_7322_DCACtrlE_RcvHdrq15DCAXfrCnt_MSB 0x3B +#define QIB_7322_DCACtrlE_RcvHdrq15DCAXfrCnt_RMASK 0x3F +#define QIB_7322_DCACtrlE_RcvHdrq15DCAOPH_LSB 0x2E +#define QIB_7322_DCACtrlE_RcvHdrq15DCAOPH_MSB 0x35 +#define QIB_7322_DCACtrlE_RcvHdrq15DCAOPH_RMASK 0xFF +#define QIB_7322_DCACtrlE_RcvHdrq14DCAXfrCnt_LSB 0x28 +#define QIB_7322_DCACtrlE_RcvHdrq14DCAXfrCnt_MSB 0x2D +#define QIB_7322_DCACtrlE_RcvHdrq14DCAXfrCnt_RMASK 0x3F +#define QIB_7322_DCACtrlE_RcvHdrq14DCAOPH_LSB 0x20 +#define QIB_7322_DCACtrlE_RcvHdrq14DCAOPH_MSB 0x27 +#define QIB_7322_DCACtrlE_RcvHdrq14DCAOPH_RMASK 0xFF +#define QIB_7322_DCACtrlE_RcvHdrq13DCAXfrCnt_LSB 0x16 +#define QIB_7322_DCACtrlE_RcvHdrq13DCAXfrCnt_MSB 0x1B +#define QIB_7322_DCACtrlE_RcvHdrq13DCAXfrCnt_RMASK 0x3F +#define QIB_7322_DCACtrlE_RcvHdrq13DCAOPH_LSB 0xE +#define QIB_7322_DCACtrlE_RcvHdrq13DCAOPH_MSB 0x15 +#define QIB_7322_DCACtrlE_RcvHdrq13DCAOPH_RMASK 0xFF +#define QIB_7322_DCACtrlE_RcvHdrq12DCAXfrCnt_LSB 0x8 +#define QIB_7322_DCACtrlE_RcvHdrq12DCAXfrCnt_MSB 0xD +#define QIB_7322_DCACtrlE_RcvHdrq12DCAXfrCnt_RMASK 0x3F +#define QIB_7322_DCACtrlE_RcvHdrq12DCAOPH_LSB 0x0 +#define QIB_7322_DCACtrlE_RcvHdrq12DCAOPH_MSB 0x7 +#define QIB_7322_DCACtrlE_RcvHdrq12DCAOPH_RMASK 0xFF + +#define QIB_7322_DCACtrlF_OFFS 0x5A8 +#define QIB_7322_DCACtrlF_DEF 0x0000000000000000 +#define QIB_7322_DCACtrlF_SendDma1DCAOPH_LSB 0x28 +#define QIB_7322_DCACtrlF_SendDma1DCAOPH_MSB 0x2F +#define QIB_7322_DCACtrlF_SendDma1DCAOPH_RMASK 0xFF +#define QIB_7322_DCACtrlF_SendDma0DCAOPH_LSB 0x20 +#define QIB_7322_DCACtrlF_SendDma0DCAOPH_MSB 0x27 +#define QIB_7322_DCACtrlF_SendDma0DCAOPH_RMASK 0xFF +#define QIB_7322_DCACtrlF_RcvHdrq17DCAXfrCnt_LSB 0x16 +#define QIB_7322_DCACtrlF_RcvHdrq17DCAXfrCnt_MSB 0x1B +#define QIB_7322_DCACtrlF_RcvHdrq17DCAXfrCnt_RMASK 0x3F +#define QIB_7322_DCACtrlF_RcvHdrq17DCAOPH_LSB 0xE +#define QIB_7322_DCACtrlF_RcvHdrq17DCAOPH_MSB 0x15 +#define QIB_7322_DCACtrlF_RcvHdrq17DCAOPH_RMASK 0xFF +#define QIB_7322_DCACtrlF_RcvHdrq16DCAXfrCnt_LSB 0x8 +#define QIB_7322_DCACtrlF_RcvHdrq16DCAXfrCnt_MSB 0xD +#define QIB_7322_DCACtrlF_RcvHdrq16DCAXfrCnt_RMASK 0x3F +#define QIB_7322_DCACtrlF_RcvHdrq16DCAOPH_LSB 0x0 +#define QIB_7322_DCACtrlF_RcvHdrq16DCAOPH_MSB 0x7 +#define QIB_7322_DCACtrlF_RcvHdrq16DCAOPH_RMASK 0xFF + +#define QIB_7322_RcvAvailTimeOut0_OFFS 0xC00 +#define QIB_7322_RcvAvailTimeOut0_DEF 0x0000000000000000 +#define QIB_7322_RcvAvailTimeOut0_RcvAvailTOCount_LSB 0x10 +#define QIB_7322_RcvAvailTimeOut0_RcvAvailTOCount_MSB 0x1F +#define QIB_7322_RcvAvailTimeOut0_RcvAvailTOCount_RMASK 0xFFFF +#define QIB_7322_RcvAvailTimeOut0_RcvAvailTOReload_LSB 0x0 +#define QIB_7322_RcvAvailTimeOut0_RcvAvailTOReload_MSB 0xF +#define QIB_7322_RcvAvailTimeOut0_RcvAvailTOReload_RMASK 0xFFFF + +#define QIB_7322_CntrRegBase_0_OFFS 0x1028 +#define QIB_7322_CntrRegBase_0_DEF 0x0000000000012000 + +#define QIB_7322_ErrMask_0_OFFS 0x1080 +#define QIB_7322_ErrMask_0_DEF 0x0000000000000000 +#define QIB_7322_ErrMask_0_IBStatusChangedMask_LSB 0x3A +#define QIB_7322_ErrMask_0_IBStatusChangedMask_MSB 0x3A +#define QIB_7322_ErrMask_0_IBStatusChangedMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_SHeadersErrMask_LSB 0x39 +#define QIB_7322_ErrMask_0_SHeadersErrMask_MSB 0x39 +#define QIB_7322_ErrMask_0_SHeadersErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_VL15BufMisuseErrMask_LSB 0x36 +#define QIB_7322_ErrMask_0_VL15BufMisuseErrMask_MSB 0x36 +#define QIB_7322_ErrMask_0_VL15BufMisuseErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_SDmaHaltErrMask_LSB 0x31 +#define QIB_7322_ErrMask_0_SDmaHaltErrMask_MSB 0x31 +#define QIB_7322_ErrMask_0_SDmaHaltErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_SDmaDescAddrMisalignErrMask_LSB 0x30 +#define QIB_7322_ErrMask_0_SDmaDescAddrMisalignErrMask_MSB 0x30 +#define QIB_7322_ErrMask_0_SDmaDescAddrMisalignErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_SDmaUnexpDataErrMask_LSB 0x2F +#define QIB_7322_ErrMask_0_SDmaUnexpDataErrMask_MSB 0x2F +#define QIB_7322_ErrMask_0_SDmaUnexpDataErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_SDmaMissingDwErrMask_LSB 0x2E +#define QIB_7322_ErrMask_0_SDmaMissingDwErrMask_MSB 0x2E +#define QIB_7322_ErrMask_0_SDmaMissingDwErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_SDmaDwEnErrMask_LSB 0x2D +#define QIB_7322_ErrMask_0_SDmaDwEnErrMask_MSB 0x2D +#define QIB_7322_ErrMask_0_SDmaDwEnErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_SDmaRpyTagErrMask_LSB 0x2C +#define QIB_7322_ErrMask_0_SDmaRpyTagErrMask_MSB 0x2C +#define QIB_7322_ErrMask_0_SDmaRpyTagErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_SDma1stDescErrMask_LSB 0x2B +#define QIB_7322_ErrMask_0_SDma1stDescErrMask_MSB 0x2B +#define QIB_7322_ErrMask_0_SDma1stDescErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_SDmaBaseErrMask_LSB 0x2A +#define QIB_7322_ErrMask_0_SDmaBaseErrMask_MSB 0x2A +#define QIB_7322_ErrMask_0_SDmaBaseErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_SDmaTailOutOfBoundErrMask_LSB 0x29 +#define QIB_7322_ErrMask_0_SDmaTailOutOfBoundErrMask_MSB 0x29 +#define QIB_7322_ErrMask_0_SDmaTailOutOfBoundErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_SDmaOutOfBoundErrMask_LSB 0x28 +#define QIB_7322_ErrMask_0_SDmaOutOfBoundErrMask_MSB 0x28 +#define QIB_7322_ErrMask_0_SDmaOutOfBoundErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_SDmaGenMismatchErrMask_LSB 0x27 +#define QIB_7322_ErrMask_0_SDmaGenMismatchErrMask_MSB 0x27 +#define QIB_7322_ErrMask_0_SDmaGenMismatchErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_SendBufMisuseErrMask_LSB 0x26 +#define QIB_7322_ErrMask_0_SendBufMisuseErrMask_MSB 0x26 +#define QIB_7322_ErrMask_0_SendBufMisuseErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_SendUnsupportedVLErrMask_LSB 0x25 +#define QIB_7322_ErrMask_0_SendUnsupportedVLErrMask_MSB 0x25 +#define QIB_7322_ErrMask_0_SendUnsupportedVLErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_SendUnexpectedPktNumErrMask_LSB 0x24 +#define QIB_7322_ErrMask_0_SendUnexpectedPktNumErrMask_MSB 0x24 +#define QIB_7322_ErrMask_0_SendUnexpectedPktNumErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_SendDroppedDataPktErrMask_LSB 0x22 +#define QIB_7322_ErrMask_0_SendDroppedDataPktErrMask_MSB 0x22 +#define QIB_7322_ErrMask_0_SendDroppedDataPktErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_SendDroppedSmpPktErrMask_LSB 0x21 +#define QIB_7322_ErrMask_0_SendDroppedSmpPktErrMask_MSB 0x21 +#define QIB_7322_ErrMask_0_SendDroppedSmpPktErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_SendPktLenErrMask_LSB 0x20 +#define QIB_7322_ErrMask_0_SendPktLenErrMask_MSB 0x20 +#define QIB_7322_ErrMask_0_SendPktLenErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_SendUnderRunErrMask_LSB 0x1F +#define QIB_7322_ErrMask_0_SendUnderRunErrMask_MSB 0x1F +#define QIB_7322_ErrMask_0_SendUnderRunErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_SendMaxPktLenErrMask_LSB 0x1E +#define QIB_7322_ErrMask_0_SendMaxPktLenErrMask_MSB 0x1E +#define QIB_7322_ErrMask_0_SendMaxPktLenErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_SendMinPktLenErrMask_LSB 0x1D +#define QIB_7322_ErrMask_0_SendMinPktLenErrMask_MSB 0x1D +#define QIB_7322_ErrMask_0_SendMinPktLenErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_RcvIBLostLinkErrMask_LSB 0x11 +#define QIB_7322_ErrMask_0_RcvIBLostLinkErrMask_MSB 0x11 +#define QIB_7322_ErrMask_0_RcvIBLostLinkErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_RcvHdrErrMask_LSB 0x10 +#define QIB_7322_ErrMask_0_RcvHdrErrMask_MSB 0x10 +#define QIB_7322_ErrMask_0_RcvHdrErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_RcvHdrLenErrMask_LSB 0xF +#define QIB_7322_ErrMask_0_RcvHdrLenErrMask_MSB 0xF +#define QIB_7322_ErrMask_0_RcvHdrLenErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_RcvBadTidErrMask_LSB 0xE +#define QIB_7322_ErrMask_0_RcvBadTidErrMask_MSB 0xE +#define QIB_7322_ErrMask_0_RcvBadTidErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_RcvBadVersionErrMask_LSB 0xB +#define QIB_7322_ErrMask_0_RcvBadVersionErrMask_MSB 0xB +#define QIB_7322_ErrMask_0_RcvBadVersionErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_RcvIBFlowErrMask_LSB 0xA +#define QIB_7322_ErrMask_0_RcvIBFlowErrMask_MSB 0xA +#define QIB_7322_ErrMask_0_RcvIBFlowErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_RcvEBPErrMask_LSB 0x9 +#define QIB_7322_ErrMask_0_RcvEBPErrMask_MSB 0x9 +#define QIB_7322_ErrMask_0_RcvEBPErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_RcvUnsupportedVLErrMask_LSB 0x8 +#define QIB_7322_ErrMask_0_RcvUnsupportedVLErrMask_MSB 0x8 +#define QIB_7322_ErrMask_0_RcvUnsupportedVLErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_RcvUnexpectedCharErrMask_LSB 0x7 +#define QIB_7322_ErrMask_0_RcvUnexpectedCharErrMask_MSB 0x7 +#define QIB_7322_ErrMask_0_RcvUnexpectedCharErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_RcvShortPktLenErrMask_LSB 0x6 +#define QIB_7322_ErrMask_0_RcvShortPktLenErrMask_MSB 0x6 +#define QIB_7322_ErrMask_0_RcvShortPktLenErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_RcvLongPktLenErrMask_LSB 0x5 +#define QIB_7322_ErrMask_0_RcvLongPktLenErrMask_MSB 0x5 +#define QIB_7322_ErrMask_0_RcvLongPktLenErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_RcvMaxPktLenErrMask_LSB 0x4 +#define QIB_7322_ErrMask_0_RcvMaxPktLenErrMask_MSB 0x4 +#define QIB_7322_ErrMask_0_RcvMaxPktLenErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_RcvMinPktLenErrMask_LSB 0x3 +#define QIB_7322_ErrMask_0_RcvMinPktLenErrMask_MSB 0x3 +#define QIB_7322_ErrMask_0_RcvMinPktLenErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_RcvICRCErrMask_LSB 0x2 +#define QIB_7322_ErrMask_0_RcvICRCErrMask_MSB 0x2 +#define QIB_7322_ErrMask_0_RcvICRCErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_RcvVCRCErrMask_LSB 0x1 +#define QIB_7322_ErrMask_0_RcvVCRCErrMask_MSB 0x1 +#define QIB_7322_ErrMask_0_RcvVCRCErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_RcvFormatErrMask_LSB 0x0 +#define QIB_7322_ErrMask_0_RcvFormatErrMask_MSB 0x0 +#define QIB_7322_ErrMask_0_RcvFormatErrMask_RMASK 0x1 + +#define QIB_7322_ErrStatus_0_OFFS 0x1088 +#define QIB_7322_ErrStatus_0_DEF 0x0000000000000000 +#define QIB_7322_ErrStatus_0_IBStatusChanged_LSB 0x3A +#define QIB_7322_ErrStatus_0_IBStatusChanged_MSB 0x3A +#define QIB_7322_ErrStatus_0_IBStatusChanged_RMASK 0x1 +#define QIB_7322_ErrStatus_0_SHeadersErr_LSB 0x39 +#define QIB_7322_ErrStatus_0_SHeadersErr_MSB 0x39 +#define QIB_7322_ErrStatus_0_SHeadersErr_RMASK 0x1 +#define QIB_7322_ErrStatus_0_VL15BufMisuseErr_LSB 0x36 +#define QIB_7322_ErrStatus_0_VL15BufMisuseErr_MSB 0x36 +#define QIB_7322_ErrStatus_0_VL15BufMisuseErr_RMASK 0x1 +#define QIB_7322_ErrStatus_0_SDmaHaltErr_LSB 0x31 +#define QIB_7322_ErrStatus_0_SDmaHaltErr_MSB 0x31 +#define QIB_7322_ErrStatus_0_SDmaHaltErr_RMASK 0x1 +#define QIB_7322_ErrStatus_0_SDmaDescAddrMisalignErr_LSB 0x30 +#define QIB_7322_ErrStatus_0_SDmaDescAddrMisalignErr_MSB 0x30 +#define QIB_7322_ErrStatus_0_SDmaDescAddrMisalignErr_RMASK 0x1 +#define QIB_7322_ErrStatus_0_SDmaUnexpDataErr_LSB 0x2F +#define QIB_7322_ErrStatus_0_SDmaUnexpDataErr_MSB 0x2F +#define QIB_7322_ErrStatus_0_SDmaUnexpDataErr_RMASK 0x1 +#define QIB_7322_ErrStatus_0_SDmaMissingDwErr_LSB 0x2E +#define QIB_7322_ErrStatus_0_SDmaMissingDwErr_MSB 0x2E +#define QIB_7322_ErrStatus_0_SDmaMissingDwErr_RMASK 0x1 +#define QIB_7322_ErrStatus_0_SDmaDwEnErr_LSB 0x2D +#define QIB_7322_ErrStatus_0_SDmaDwEnErr_MSB 0x2D +#define QIB_7322_ErrStatus_0_SDmaDwEnErr_RMASK 0x1 +#define QIB_7322_ErrStatus_0_SDmaRpyTagErr_LSB 0x2C +#define QIB_7322_ErrStatus_0_SDmaRpyTagErr_MSB 0x2C +#define QIB_7322_ErrStatus_0_SDmaRpyTagErr_RMASK 0x1 +#define QIB_7322_ErrStatus_0_SDma1stDescErr_LSB 0x2B +#define QIB_7322_ErrStatus_0_SDma1stDescErr_MSB 0x2B +#define QIB_7322_ErrStatus_0_SDma1stDescErr_RMASK 0x1 +#define QIB_7322_ErrStatus_0_SDmaBaseErr_LSB 0x2A +#define QIB_7322_ErrStatus_0_SDmaBaseErr_MSB 0x2A +#define QIB_7322_ErrStatus_0_SDmaBaseErr_RMASK 0x1 +#define QIB_7322_ErrStatus_0_SDmaTailOutOfBoundErr_LSB 0x29 +#define QIB_7322_ErrStatus_0_SDmaTailOutOfBoundErr_MSB 0x29 +#define QIB_7322_ErrStatus_0_SDmaTailOutOfBoundErr_RMASK 0x1 +#define QIB_7322_ErrStatus_0_SDmaOutOfBoundErr_LSB 0x28 +#define QIB_7322_ErrStatus_0_SDmaOutOfBoundErr_MSB 0x28 +#define QIB_7322_ErrStatus_0_SDmaOutOfBoundErr_RMASK 0x1 +#define QIB_7322_ErrStatus_0_SDmaGenMismatchErr_LSB 0x27 +#define QIB_7322_ErrStatus_0_SDmaGenMismatchErr_MSB 0x27 +#define QIB_7322_ErrStatus_0_SDmaGenMismatchErr_RMASK 0x1 +#define QIB_7322_ErrStatus_0_SendBufMisuseErr_LSB 0x26 +#define QIB_7322_ErrStatus_0_SendBufMisuseErr_MSB 0x26 +#define QIB_7322_ErrStatus_0_SendBufMisuseErr_RMASK 0x1 +#define QIB_7322_ErrStatus_0_SendUnsupportedVLErr_LSB 0x25 +#define QIB_7322_ErrStatus_0_SendUnsupportedVLErr_MSB 0x25 +#define QIB_7322_ErrStatus_0_SendUnsupportedVLErr_RMASK 0x1 +#define QIB_7322_ErrStatus_0_SendUnexpectedPktNumErr_LSB 0x24 +#define QIB_7322_ErrStatus_0_SendUnexpectedPktNumErr_MSB 0x24 +#define QIB_7322_ErrStatus_0_SendUnexpectedPktNumErr_RMASK 0x1 +#define QIB_7322_ErrStatus_0_SendDroppedDataPktErr_LSB 0x22 +#define QIB_7322_ErrStatus_0_SendDroppedDataPktErr_MSB 0x22 +#define QIB_7322_ErrStatus_0_SendDroppedDataPktErr_RMASK 0x1 +#define QIB_7322_ErrStatus_0_SendDroppedSmpPktErr_LSB 0x21 +#define QIB_7322_ErrStatus_0_SendDroppedSmpPktErr_MSB 0x21 +#define QIB_7322_ErrStatus_0_SendDroppedSmpPktErr_RMASK 0x1 +#define QIB_7322_ErrStatus_0_SendPktLenErr_LSB 0x20 +#define QIB_7322_ErrStatus_0_SendPktLenErr_MSB 0x20 +#define QIB_7322_ErrStatus_0_SendPktLenErr_RMASK 0x1 +#define QIB_7322_ErrStatus_0_SendUnderRunErr_LSB 0x1F +#define QIB_7322_ErrStatus_0_SendUnderRunErr_MSB 0x1F +#define QIB_7322_ErrStatus_0_SendUnderRunErr_RMASK 0x1 +#define QIB_7322_ErrStatus_0_SendMaxPktLenErr_LSB 0x1E +#define QIB_7322_ErrStatus_0_SendMaxPktLenErr_MSB 0x1E +#define QIB_7322_ErrStatus_0_SendMaxPktLenErr_RMASK 0x1 +#define QIB_7322_ErrStatus_0_SendMinPktLenErr_LSB 0x1D +#define QIB_7322_ErrStatus_0_SendMinPktLenErr_MSB 0x1D +#define QIB_7322_ErrStatus_0_SendMinPktLenErr_RMASK 0x1 +#define QIB_7322_ErrStatus_0_RcvIBLostLinkErr_LSB 0x11 +#define QIB_7322_ErrStatus_0_RcvIBLostLinkErr_MSB 0x11 +#define QIB_7322_ErrStatus_0_RcvIBLostLinkErr_RMASK 0x1 +#define QIB_7322_ErrStatus_0_RcvHdrErr_LSB 0x10 +#define QIB_7322_ErrStatus_0_RcvHdrErr_MSB 0x10 +#define QIB_7322_ErrStatus_0_RcvHdrErr_RMASK 0x1 +#define QIB_7322_ErrStatus_0_RcvHdrLenErr_LSB 0xF +#define QIB_7322_ErrStatus_0_RcvHdrLenErr_MSB 0xF +#define QIB_7322_ErrStatus_0_RcvHdrLenErr_RMASK 0x1 +#define QIB_7322_ErrStatus_0_RcvBadTidErr_LSB 0xE +#define QIB_7322_ErrStatus_0_RcvBadTidErr_MSB 0xE +#define QIB_7322_ErrStatus_0_RcvBadTidErr_RMASK 0x1 +#define QIB_7322_ErrStatus_0_RcvBadVersionErr_LSB 0xB +#define QIB_7322_ErrStatus_0_RcvBadVersionErr_MSB 0xB +#define QIB_7322_ErrStatus_0_RcvBadVersionErr_RMASK 0x1 +#define QIB_7322_ErrStatus_0_RcvIBFlowErr_LSB 0xA +#define QIB_7322_ErrStatus_0_RcvIBFlowErr_MSB 0xA +#define QIB_7322_ErrStatus_0_RcvIBFlowErr_RMASK 0x1 +#define QIB_7322_ErrStatus_0_RcvEBPErr_LSB 0x9 +#define QIB_7322_ErrStatus_0_RcvEBPErr_MSB 0x9 +#define QIB_7322_ErrStatus_0_RcvEBPErr_RMASK 0x1 +#define QIB_7322_ErrStatus_0_RcvUnsupportedVLErr_LSB 0x8 +#define QIB_7322_ErrStatus_0_RcvUnsupportedVLErr_MSB 0x8 +#define QIB_7322_ErrStatus_0_RcvUnsupportedVLErr_RMASK 0x1 +#define QIB_7322_ErrStatus_0_RcvUnexpectedCharErr_LSB 0x7 +#define QIB_7322_ErrStatus_0_RcvUnexpectedCharErr_MSB 0x7 +#define QIB_7322_ErrStatus_0_RcvUnexpectedCharErr_RMASK 0x1 +#define QIB_7322_ErrStatus_0_RcvShortPktLenErr_LSB 0x6 +#define QIB_7322_ErrStatus_0_RcvShortPktLenErr_MSB 0x6 +#define QIB_7322_ErrStatus_0_RcvShortPktLenErr_RMASK 0x1 +#define QIB_7322_ErrStatus_0_RcvLongPktLenErr_LSB 0x5 +#define QIB_7322_ErrStatus_0_RcvLongPktLenErr_MSB 0x5 +#define QIB_7322_ErrStatus_0_RcvLongPktLenErr_RMASK 0x1 +#define QIB_7322_ErrStatus_0_RcvMaxPktLenErr_LSB 0x4 +#define QIB_7322_ErrStatus_0_RcvMaxPktLenErr_MSB 0x4 +#define QIB_7322_ErrStatus_0_RcvMaxPktLenErr_RMASK 0x1 +#define QIB_7322_ErrStatus_0_RcvMinPktLenErr_LSB 0x3 +#define QIB_7322_ErrStatus_0_RcvMinPktLenErr_MSB 0x3 +#define QIB_7322_ErrStatus_0_RcvMinPktLenErr_RMASK 0x1 +#define QIB_7322_ErrStatus_0_RcvICRCErr_LSB 0x2 +#define QIB_7322_ErrStatus_0_RcvICRCErr_MSB 0x2 +#define QIB_7322_ErrStatus_0_RcvICRCErr_RMASK 0x1 +#define QIB_7322_ErrStatus_0_RcvVCRCErr_LSB 0x1 +#define QIB_7322_ErrStatus_0_RcvVCRCErr_MSB 0x1 +#define QIB_7322_ErrStatus_0_RcvVCRCErr_RMASK 0x1 +#define QIB_7322_ErrStatus_0_RcvFormatErr_LSB 0x0 +#define QIB_7322_ErrStatus_0_RcvFormatErr_MSB 0x0 +#define QIB_7322_ErrStatus_0_RcvFormatErr_RMASK 0x1 + +#define QIB_7322_ErrClear_0_OFFS 0x1090 +#define QIB_7322_ErrClear_0_DEF 0x0000000000000000 +#define QIB_7322_ErrClear_0_IBStatusChangedClear_LSB 0x3A +#define QIB_7322_ErrClear_0_IBStatusChangedClear_MSB 0x3A +#define QIB_7322_ErrClear_0_IBStatusChangedClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_SHeadersErrClear_LSB 0x39 +#define QIB_7322_ErrClear_0_SHeadersErrClear_MSB 0x39 +#define QIB_7322_ErrClear_0_SHeadersErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_VL15BufMisuseErrClear_LSB 0x36 +#define QIB_7322_ErrClear_0_VL15BufMisuseErrClear_MSB 0x36 +#define QIB_7322_ErrClear_0_VL15BufMisuseErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_SDmaHaltErrClear_LSB 0x31 +#define QIB_7322_ErrClear_0_SDmaHaltErrClear_MSB 0x31 +#define QIB_7322_ErrClear_0_SDmaHaltErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_SDmaDescAddrMisalignErrClear_LSB 0x30 +#define QIB_7322_ErrClear_0_SDmaDescAddrMisalignErrClear_MSB 0x30 +#define QIB_7322_ErrClear_0_SDmaDescAddrMisalignErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_SDmaUnexpDataErrClear_LSB 0x2F +#define QIB_7322_ErrClear_0_SDmaUnexpDataErrClear_MSB 0x2F +#define QIB_7322_ErrClear_0_SDmaUnexpDataErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_SDmaMissingDwErrClear_LSB 0x2E +#define QIB_7322_ErrClear_0_SDmaMissingDwErrClear_MSB 0x2E +#define QIB_7322_ErrClear_0_SDmaMissingDwErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_SDmaDwEnErrClear_LSB 0x2D +#define QIB_7322_ErrClear_0_SDmaDwEnErrClear_MSB 0x2D +#define QIB_7322_ErrClear_0_SDmaDwEnErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_SDmaRpyTagErrClear_LSB 0x2C +#define QIB_7322_ErrClear_0_SDmaRpyTagErrClear_MSB 0x2C +#define QIB_7322_ErrClear_0_SDmaRpyTagErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_SDma1stDescErrClear_LSB 0x2B +#define QIB_7322_ErrClear_0_SDma1stDescErrClear_MSB 0x2B +#define QIB_7322_ErrClear_0_SDma1stDescErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_SDmaBaseErrClear_LSB 0x2A +#define QIB_7322_ErrClear_0_SDmaBaseErrClear_MSB 0x2A +#define QIB_7322_ErrClear_0_SDmaBaseErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_SDmaTailOutOfBoundErrClear_LSB 0x29 +#define QIB_7322_ErrClear_0_SDmaTailOutOfBoundErrClear_MSB 0x29 +#define QIB_7322_ErrClear_0_SDmaTailOutOfBoundErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_SDmaOutOfBoundErrClear_LSB 0x28 +#define QIB_7322_ErrClear_0_SDmaOutOfBoundErrClear_MSB 0x28 +#define QIB_7322_ErrClear_0_SDmaOutOfBoundErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_SDmaGenMismatchErrClear_LSB 0x27 +#define QIB_7322_ErrClear_0_SDmaGenMismatchErrClear_MSB 0x27 +#define QIB_7322_ErrClear_0_SDmaGenMismatchErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_SendBufMisuseErrClear_LSB 0x26 +#define QIB_7322_ErrClear_0_SendBufMisuseErrClear_MSB 0x26 +#define QIB_7322_ErrClear_0_SendBufMisuseErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_SendUnsupportedVLErrClear_LSB 0x25 +#define QIB_7322_ErrClear_0_SendUnsupportedVLErrClear_MSB 0x25 +#define QIB_7322_ErrClear_0_SendUnsupportedVLErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_SendUnexpectedPktNumErrClear_LSB 0x24 +#define QIB_7322_ErrClear_0_SendUnexpectedPktNumErrClear_MSB 0x24 +#define QIB_7322_ErrClear_0_SendUnexpectedPktNumErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_SendDroppedDataPktErrClear_LSB 0x22 +#define QIB_7322_ErrClear_0_SendDroppedDataPktErrClear_MSB 0x22 +#define QIB_7322_ErrClear_0_SendDroppedDataPktErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_SendDroppedSmpPktErrClear_LSB 0x21 +#define QIB_7322_ErrClear_0_SendDroppedSmpPktErrClear_MSB 0x21 +#define QIB_7322_ErrClear_0_SendDroppedSmpPktErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_SendPktLenErrClear_LSB 0x20 +#define QIB_7322_ErrClear_0_SendPktLenErrClear_MSB 0x20 +#define QIB_7322_ErrClear_0_SendPktLenErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_SendUnderRunErrClear_LSB 0x1F +#define QIB_7322_ErrClear_0_SendUnderRunErrClear_MSB 0x1F +#define QIB_7322_ErrClear_0_SendUnderRunErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_SendMaxPktLenErrClear_LSB 0x1E +#define QIB_7322_ErrClear_0_SendMaxPktLenErrClear_MSB 0x1E +#define QIB_7322_ErrClear_0_SendMaxPktLenErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_SendMinPktLenErrClear_LSB 0x1D +#define QIB_7322_ErrClear_0_SendMinPktLenErrClear_MSB 0x1D +#define QIB_7322_ErrClear_0_SendMinPktLenErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_RcvIBLostLinkErrClear_LSB 0x11 +#define QIB_7322_ErrClear_0_RcvIBLostLinkErrClear_MSB 0x11 +#define QIB_7322_ErrClear_0_RcvIBLostLinkErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_RcvHdrErrClear_LSB 0x10 +#define QIB_7322_ErrClear_0_RcvHdrErrClear_MSB 0x10 +#define QIB_7322_ErrClear_0_RcvHdrErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_RcvHdrLenErrClear_LSB 0xF +#define QIB_7322_ErrClear_0_RcvHdrLenErrClear_MSB 0xF +#define QIB_7322_ErrClear_0_RcvHdrLenErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_RcvBadTidErrClear_LSB 0xE +#define QIB_7322_ErrClear_0_RcvBadTidErrClear_MSB 0xE +#define QIB_7322_ErrClear_0_RcvBadTidErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_RcvBadVersionErrClear_LSB 0xB +#define QIB_7322_ErrClear_0_RcvBadVersionErrClear_MSB 0xB +#define QIB_7322_ErrClear_0_RcvBadVersionErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_RcvIBFlowErrClear_LSB 0xA +#define QIB_7322_ErrClear_0_RcvIBFlowErrClear_MSB 0xA +#define QIB_7322_ErrClear_0_RcvIBFlowErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_RcvEBPErrClear_LSB 0x9 +#define QIB_7322_ErrClear_0_RcvEBPErrClear_MSB 0x9 +#define QIB_7322_ErrClear_0_RcvEBPErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_RcvUnsupportedVLErrClear_LSB 0x8 +#define QIB_7322_ErrClear_0_RcvUnsupportedVLErrClear_MSB 0x8 +#define QIB_7322_ErrClear_0_RcvUnsupportedVLErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_RcvUnexpectedCharErrClear_LSB 0x7 +#define QIB_7322_ErrClear_0_RcvUnexpectedCharErrClear_MSB 0x7 +#define QIB_7322_ErrClear_0_RcvUnexpectedCharErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_RcvShortPktLenErrClear_LSB 0x6 +#define QIB_7322_ErrClear_0_RcvShortPktLenErrClear_MSB 0x6 +#define QIB_7322_ErrClear_0_RcvShortPktLenErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_RcvLongPktLenErrClear_LSB 0x5 +#define QIB_7322_ErrClear_0_RcvLongPktLenErrClear_MSB 0x5 +#define QIB_7322_ErrClear_0_RcvLongPktLenErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_RcvMaxPktLenErrClear_LSB 0x4 +#define QIB_7322_ErrClear_0_RcvMaxPktLenErrClear_MSB 0x4 +#define QIB_7322_ErrClear_0_RcvMaxPktLenErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_RcvMinPktLenErrClear_LSB 0x3 +#define QIB_7322_ErrClear_0_RcvMinPktLenErrClear_MSB 0x3 +#define QIB_7322_ErrClear_0_RcvMinPktLenErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_RcvICRCErrClear_LSB 0x2 +#define QIB_7322_ErrClear_0_RcvICRCErrClear_MSB 0x2 +#define QIB_7322_ErrClear_0_RcvICRCErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_RcvVCRCErrClear_LSB 0x1 +#define QIB_7322_ErrClear_0_RcvVCRCErrClear_MSB 0x1 +#define QIB_7322_ErrClear_0_RcvVCRCErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_RcvFormatErrClear_LSB 0x0 +#define QIB_7322_ErrClear_0_RcvFormatErrClear_MSB 0x0 +#define QIB_7322_ErrClear_0_RcvFormatErrClear_RMASK 0x1 + +#define QIB_7322_TXEStatus_0_OFFS 0x10B8 +#define QIB_7322_TXEStatus_0_DEF 0x0000000XC00080FF +#define QIB_7322_TXEStatus_0_TXE_IBC_Idle_LSB 0x1F +#define QIB_7322_TXEStatus_0_TXE_IBC_Idle_MSB 0x1F +#define QIB_7322_TXEStatus_0_TXE_IBC_Idle_RMASK 0x1 +#define QIB_7322_TXEStatus_0_RmFifoEmpty_LSB 0x1E +#define QIB_7322_TXEStatus_0_RmFifoEmpty_MSB 0x1E +#define QIB_7322_TXEStatus_0_RmFifoEmpty_RMASK 0x1 +#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL15_LSB 0xF +#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL15_MSB 0xF +#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL15_RMASK 0x1 +#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL7_LSB 0x7 +#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL7_MSB 0x7 +#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL7_RMASK 0x1 +#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL6_LSB 0x6 +#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL6_MSB 0x6 +#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL6_RMASK 0x1 +#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL5_LSB 0x5 +#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL5_MSB 0x5 +#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL5_RMASK 0x1 +#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL4_LSB 0x4 +#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL4_MSB 0x4 +#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL4_RMASK 0x1 +#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL3_LSB 0x3 +#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL3_MSB 0x3 +#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL3_RMASK 0x1 +#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL2_LSB 0x2 +#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL2_MSB 0x2 +#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL2_RMASK 0x1 +#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL1_LSB 0x1 +#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL1_MSB 0x1 +#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL1_RMASK 0x1 +#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL0_LSB 0x0 +#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL0_MSB 0x0 +#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL0_RMASK 0x1 + +#define QIB_7322_RcvCtrl_0_OFFS 0x1100 +#define QIB_7322_RcvCtrl_0_DEF 0x0000000000000000 +#define QIB_7322_RcvCtrl_0_RcvResetCredit_LSB 0x2A +#define QIB_7322_RcvCtrl_0_RcvResetCredit_MSB 0x2A +#define QIB_7322_RcvCtrl_0_RcvResetCredit_RMASK 0x1 +#define QIB_7322_RcvCtrl_0_RcvPartitionKeyDisable_LSB 0x29 +#define QIB_7322_RcvCtrl_0_RcvPartitionKeyDisable_MSB 0x29 +#define QIB_7322_RcvCtrl_0_RcvPartitionKeyDisable_RMASK 0x1 +#define QIB_7322_RcvCtrl_0_RcvQPMapEnable_LSB 0x28 +#define QIB_7322_RcvCtrl_0_RcvQPMapEnable_MSB 0x28 +#define QIB_7322_RcvCtrl_0_RcvQPMapEnable_RMASK 0x1 +#define QIB_7322_RcvCtrl_0_RcvIBPortEnable_LSB 0x27 +#define QIB_7322_RcvCtrl_0_RcvIBPortEnable_MSB 0x27 +#define QIB_7322_RcvCtrl_0_RcvIBPortEnable_RMASK 0x1 +#define QIB_7322_RcvCtrl_0_ContextEnableUser_LSB 0x2 +#define QIB_7322_RcvCtrl_0_ContextEnableUser_MSB 0x11 +#define QIB_7322_RcvCtrl_0_ContextEnableUser_RMASK 0xFFFF +#define QIB_7322_RcvCtrl_0_ContextEnableKernel_LSB 0x0 +#define QIB_7322_RcvCtrl_0_ContextEnableKernel_MSB 0x0 +#define QIB_7322_RcvCtrl_0_ContextEnableKernel_RMASK 0x1 + +#define QIB_7322_RcvBTHQP_0_OFFS 0x1108 +#define QIB_7322_RcvBTHQP_0_DEF 0x0000000000000000 +#define QIB_7322_RcvBTHQP_0_RcvBTHQP_LSB 0x0 +#define QIB_7322_RcvBTHQP_0_RcvBTHQP_MSB 0x17 +#define QIB_7322_RcvBTHQP_0_RcvBTHQP_RMASK 0xFFFFFF + +#define QIB_7322_RcvQPMapTableA_0_OFFS 0x1110 +#define QIB_7322_RcvQPMapTableA_0_DEF 0x0000000000000000 +#define QIB_7322_RcvQPMapTableA_0_RcvQPMapContext5_LSB 0x19 +#define QIB_7322_RcvQPMapTableA_0_RcvQPMapContext5_MSB 0x1D +#define QIB_7322_RcvQPMapTableA_0_RcvQPMapContext5_RMASK 0x1F +#define QIB_7322_RcvQPMapTableA_0_RcvQPMapContext4_LSB 0x14 +#define QIB_7322_RcvQPMapTableA_0_RcvQPMapContext4_MSB 0x18 +#define QIB_7322_RcvQPMapTableA_0_RcvQPMapContext4_RMASK 0x1F +#define QIB_7322_RcvQPMapTableA_0_RcvQPMapContext3_LSB 0xF +#define QIB_7322_RcvQPMapTableA_0_RcvQPMapContext3_MSB 0x13 +#define QIB_7322_RcvQPMapTableA_0_RcvQPMapContext3_RMASK 0x1F +#define QIB_7322_RcvQPMapTableA_0_RcvQPMapContext2_LSB 0xA +#define QIB_7322_RcvQPMapTableA_0_RcvQPMapContext2_MSB 0xE +#define QIB_7322_RcvQPMapTableA_0_RcvQPMapContext2_RMASK 0x1F +#define QIB_7322_RcvQPMapTableA_0_RcvQPMapContext1_LSB 0x5 +#define QIB_7322_RcvQPMapTableA_0_RcvQPMapContext1_MSB 0x9 +#define QIB_7322_RcvQPMapTableA_0_RcvQPMapContext1_RMASK 0x1F +#define QIB_7322_RcvQPMapTableA_0_RcvQPMapContext0_LSB 0x0 +#define QIB_7322_RcvQPMapTableA_0_RcvQPMapContext0_MSB 0x4 +#define QIB_7322_RcvQPMapTableA_0_RcvQPMapContext0_RMASK 0x1F + +#define QIB_7322_RcvQPMapTableB_0_OFFS 0x1118 +#define QIB_7322_RcvQPMapTableB_0_DEF 0x0000000000000000 +#define QIB_7322_RcvQPMapTableB_0_RcvQPMapContext11_LSB 0x19 +#define QIB_7322_RcvQPMapTableB_0_RcvQPMapContext11_MSB 0x1D +#define QIB_7322_RcvQPMapTableB_0_RcvQPMapContext11_RMASK 0x1F +#define QIB_7322_RcvQPMapTableB_0_RcvQPMapContext10_LSB 0x14 +#define QIB_7322_RcvQPMapTableB_0_RcvQPMapContext10_MSB 0x18 +#define QIB_7322_RcvQPMapTableB_0_RcvQPMapContext10_RMASK 0x1F +#define QIB_7322_RcvQPMapTableB_0_RcvQPMapContext9_LSB 0xF +#define QIB_7322_RcvQPMapTableB_0_RcvQPMapContext9_MSB 0x13 +#define QIB_7322_RcvQPMapTableB_0_RcvQPMapContext9_RMASK 0x1F +#define QIB_7322_RcvQPMapTableB_0_RcvQPMapContext8_LSB 0xA +#define QIB_7322_RcvQPMapTableB_0_RcvQPMapContext8_MSB 0xE +#define QIB_7322_RcvQPMapTableB_0_RcvQPMapContext8_RMASK 0x1F +#define QIB_7322_RcvQPMapTableB_0_RcvQPMapContext7_LSB 0x5 +#define QIB_7322_RcvQPMapTableB_0_RcvQPMapContext7_MSB 0x9 +#define QIB_7322_RcvQPMapTableB_0_RcvQPMapContext7_RMASK 0x1F +#define QIB_7322_RcvQPMapTableB_0_RcvQPMapContext6_LSB 0x0 +#define QIB_7322_RcvQPMapTableB_0_RcvQPMapContext6_MSB 0x4 +#define QIB_7322_RcvQPMapTableB_0_RcvQPMapContext6_RMASK 0x1F + +#define QIB_7322_RcvQPMapTableC_0_OFFS 0x1120 +#define QIB_7322_RcvQPMapTableC_0_DEF 0x0000000000000000 +#define QIB_7322_RcvQPMapTableC_0_RcvQPMapContext17_LSB 0x19 +#define QIB_7322_RcvQPMapTableC_0_RcvQPMapContext17_MSB 0x1D +#define QIB_7322_RcvQPMapTableC_0_RcvQPMapContext17_RMASK 0x1F +#define QIB_7322_RcvQPMapTableC_0_RcvQPMapContext16_LSB 0x14 +#define QIB_7322_RcvQPMapTableC_0_RcvQPMapContext16_MSB 0x18 +#define QIB_7322_RcvQPMapTableC_0_RcvQPMapContext16_RMASK 0x1F +#define QIB_7322_RcvQPMapTableC_0_RcvQPMapContext15_LSB 0xF +#define QIB_7322_RcvQPMapTableC_0_RcvQPMapContext15_MSB 0x13 +#define QIB_7322_RcvQPMapTableC_0_RcvQPMapContext15_RMASK 0x1F +#define QIB_7322_RcvQPMapTableC_0_RcvQPMapContext14_LSB 0xA +#define QIB_7322_RcvQPMapTableC_0_RcvQPMapContext14_MSB 0xE +#define QIB_7322_RcvQPMapTableC_0_RcvQPMapContext14_RMASK 0x1F +#define QIB_7322_RcvQPMapTableC_0_RcvQPMapContext13_LSB 0x5 +#define QIB_7322_RcvQPMapTableC_0_RcvQPMapContext13_MSB 0x9 +#define QIB_7322_RcvQPMapTableC_0_RcvQPMapContext13_RMASK 0x1F +#define QIB_7322_RcvQPMapTableC_0_RcvQPMapContext12_LSB 0x0 +#define QIB_7322_RcvQPMapTableC_0_RcvQPMapContext12_MSB 0x4 +#define QIB_7322_RcvQPMapTableC_0_RcvQPMapContext12_RMASK 0x1F + +#define QIB_7322_RcvQPMapTableD_0_OFFS 0x1128 +#define QIB_7322_RcvQPMapTableD_0_DEF 0x0000000000000000 +#define QIB_7322_RcvQPMapTableD_0_RcvQPMapContext23_LSB 0x19 +#define QIB_7322_RcvQPMapTableD_0_RcvQPMapContext23_MSB 0x1D +#define QIB_7322_RcvQPMapTableD_0_RcvQPMapContext23_RMASK 0x1F +#define QIB_7322_RcvQPMapTableD_0_RcvQPMapContext22_LSB 0x14 +#define QIB_7322_RcvQPMapTableD_0_RcvQPMapContext22_MSB 0x18 +#define QIB_7322_RcvQPMapTableD_0_RcvQPMapContext22_RMASK 0x1F +#define QIB_7322_RcvQPMapTableD_0_RcvQPMapContext21_LSB 0xF +#define QIB_7322_RcvQPMapTableD_0_RcvQPMapContext21_MSB 0x13 +#define QIB_7322_RcvQPMapTableD_0_RcvQPMapContext21_RMASK 0x1F +#define QIB_7322_RcvQPMapTableD_0_RcvQPMapContext20_LSB 0xA +#define QIB_7322_RcvQPMapTableD_0_RcvQPMapContext20_MSB 0xE +#define QIB_7322_RcvQPMapTableD_0_RcvQPMapContext20_RMASK 0x1F +#define QIB_7322_RcvQPMapTableD_0_RcvQPMapContext19_LSB 0x5 +#define QIB_7322_RcvQPMapTableD_0_RcvQPMapContext19_MSB 0x9 +#define QIB_7322_RcvQPMapTableD_0_RcvQPMapContext19_RMASK 0x1F +#define QIB_7322_RcvQPMapTableD_0_RcvQPMapContext18_LSB 0x0 +#define QIB_7322_RcvQPMapTableD_0_RcvQPMapContext18_MSB 0x4 +#define QIB_7322_RcvQPMapTableD_0_RcvQPMapContext18_RMASK 0x1F + +#define QIB_7322_RcvQPMapTableE_0_OFFS 0x1130 +#define QIB_7322_RcvQPMapTableE_0_DEF 0x0000000000000000 +#define QIB_7322_RcvQPMapTableE_0_RcvQPMapContext29_LSB 0x19 +#define QIB_7322_RcvQPMapTableE_0_RcvQPMapContext29_MSB 0x1D +#define QIB_7322_RcvQPMapTableE_0_RcvQPMapContext29_RMASK 0x1F +#define QIB_7322_RcvQPMapTableE_0_RcvQPMapContext28_LSB 0x14 +#define QIB_7322_RcvQPMapTableE_0_RcvQPMapContext28_MSB 0x18 +#define QIB_7322_RcvQPMapTableE_0_RcvQPMapContext28_RMASK 0x1F +#define QIB_7322_RcvQPMapTableE_0_RcvQPMapContext27_LSB 0xF +#define QIB_7322_RcvQPMapTableE_0_RcvQPMapContext27_MSB 0x13 +#define QIB_7322_RcvQPMapTableE_0_RcvQPMapContext27_RMASK 0x1F +#define QIB_7322_RcvQPMapTableE_0_RcvQPMapContext26_LSB 0xA +#define QIB_7322_RcvQPMapTableE_0_RcvQPMapContext26_MSB 0xE +#define QIB_7322_RcvQPMapTableE_0_RcvQPMapContext26_RMASK 0x1F +#define QIB_7322_RcvQPMapTableE_0_RcvQPMapContext25_LSB 0x5 +#define QIB_7322_RcvQPMapTableE_0_RcvQPMapContext25_MSB 0x9 +#define QIB_7322_RcvQPMapTableE_0_RcvQPMapContext25_RMASK 0x1F +#define QIB_7322_RcvQPMapTableE_0_RcvQPMapContext24_LSB 0x0 +#define QIB_7322_RcvQPMapTableE_0_RcvQPMapContext24_MSB 0x4 +#define QIB_7322_RcvQPMapTableE_0_RcvQPMapContext24_RMASK 0x1F + +#define QIB_7322_RcvQPMapTableF_0_OFFS 0x1138 +#define QIB_7322_RcvQPMapTableF_0_DEF 0x0000000000000000 +#define QIB_7322_RcvQPMapTableF_0_RcvQPMapContext31_LSB 0x5 +#define QIB_7322_RcvQPMapTableF_0_RcvQPMapContext31_MSB 0x9 +#define QIB_7322_RcvQPMapTableF_0_RcvQPMapContext31_RMASK 0x1F +#define QIB_7322_RcvQPMapTableF_0_RcvQPMapContext30_LSB 0x0 +#define QIB_7322_RcvQPMapTableF_0_RcvQPMapContext30_MSB 0x4 +#define QIB_7322_RcvQPMapTableF_0_RcvQPMapContext30_RMASK 0x1F + +#define QIB_7322_PSStat_0_OFFS 0x1140 +#define QIB_7322_PSStat_0_DEF 0x0000000000000000 + +#define QIB_7322_PSStart_0_OFFS 0x1148 +#define QIB_7322_PSStart_0_DEF 0x0000000000000000 + +#define QIB_7322_PSInterval_0_OFFS 0x1150 +#define QIB_7322_PSInterval_0_DEF 0x0000000000000000 + +#define QIB_7322_RcvStatus_0_OFFS 0x1160 +#define QIB_7322_RcvStatus_0_DEF 0x0000000000000000 +#define QIB_7322_RcvStatus_0_DmaeqBlockingContext_LSB 0x1 +#define QIB_7322_RcvStatus_0_DmaeqBlockingContext_MSB 0x5 +#define QIB_7322_RcvStatus_0_DmaeqBlockingContext_RMASK 0x1F +#define QIB_7322_RcvStatus_0_RxPktInProgress_LSB 0x0 +#define QIB_7322_RcvStatus_0_RxPktInProgress_MSB 0x0 +#define QIB_7322_RcvStatus_0_RxPktInProgress_RMASK 0x1 + +#define QIB_7322_RcvPartitionKey_0_OFFS 0x1168 +#define QIB_7322_RcvPartitionKey_0_DEF 0x0000000000000000 + +#define QIB_7322_RcvQPMulticastContext_0_OFFS 0x1170 +#define QIB_7322_RcvQPMulticastContext_0_DEF 0x0000000000000000 +#define QIB_7322_RcvQPMulticastContext_0_RcvQpMcContext_LSB 0x0 +#define QIB_7322_RcvQPMulticastContext_0_RcvQpMcContext_MSB 0x4 +#define QIB_7322_RcvQPMulticastContext_0_RcvQpMcContext_RMASK 0x1F + +#define QIB_7322_RcvPktLEDCnt_0_OFFS 0x1178 +#define QIB_7322_RcvPktLEDCnt_0_DEF 0x0000000000000000 +#define QIB_7322_RcvPktLEDCnt_0_ONperiod_LSB 0x20 +#define QIB_7322_RcvPktLEDCnt_0_ONperiod_MSB 0x3F +#define QIB_7322_RcvPktLEDCnt_0_ONperiod_RMASK 0xFFFFFFFF +#define QIB_7322_RcvPktLEDCnt_0_OFFperiod_LSB 0x0 +#define QIB_7322_RcvPktLEDCnt_0_OFFperiod_MSB 0x1F +#define QIB_7322_RcvPktLEDCnt_0_OFFperiod_RMASK 0xFFFFFFFF + +#define QIB_7322_SendDmaIdleCnt_0_OFFS 0x1180 +#define QIB_7322_SendDmaIdleCnt_0_DEF 0x0000000000000000 +#define QIB_7322_SendDmaIdleCnt_0_SendDmaIdleCnt_LSB 0x0 +#define QIB_7322_SendDmaIdleCnt_0_SendDmaIdleCnt_MSB 0xF +#define QIB_7322_SendDmaIdleCnt_0_SendDmaIdleCnt_RMASK 0xFFFF + +#define QIB_7322_SendDmaReloadCnt_0_OFFS 0x1188 +#define QIB_7322_SendDmaReloadCnt_0_DEF 0x0000000000000000 +#define QIB_7322_SendDmaReloadCnt_0_SendDmaReloadCnt_LSB 0x0 +#define QIB_7322_SendDmaReloadCnt_0_SendDmaReloadCnt_MSB 0xF +#define QIB_7322_SendDmaReloadCnt_0_SendDmaReloadCnt_RMASK 0xFFFF + +#define QIB_7322_SendDmaDescCnt_0_OFFS 0x1190 +#define QIB_7322_SendDmaDescCnt_0_DEF 0x0000000000000000 +#define QIB_7322_SendDmaDescCnt_0_SendDmaDescCnt_LSB 0x0 +#define QIB_7322_SendDmaDescCnt_0_SendDmaDescCnt_MSB 0xF +#define QIB_7322_SendDmaDescCnt_0_SendDmaDescCnt_RMASK 0xFFFF + +#define QIB_7322_SendCtrl_0_OFFS 0x11C0 +#define QIB_7322_SendCtrl_0_DEF 0x0000000000000000 +#define QIB_7322_SendCtrl_0_IBVLArbiterEn_LSB 0xF +#define QIB_7322_SendCtrl_0_IBVLArbiterEn_MSB 0xF +#define QIB_7322_SendCtrl_0_IBVLArbiterEn_RMASK 0x1 +#define QIB_7322_SendCtrl_0_TxeDrainRmFifo_LSB 0xE +#define QIB_7322_SendCtrl_0_TxeDrainRmFifo_MSB 0xE +#define QIB_7322_SendCtrl_0_TxeDrainRmFifo_RMASK 0x1 +#define QIB_7322_SendCtrl_0_TxeDrainLaFifo_LSB 0xD +#define QIB_7322_SendCtrl_0_TxeDrainLaFifo_MSB 0xD +#define QIB_7322_SendCtrl_0_TxeDrainLaFifo_RMASK 0x1 +#define QIB_7322_SendCtrl_0_SDmaHalt_LSB 0xC +#define QIB_7322_SendCtrl_0_SDmaHalt_MSB 0xC +#define QIB_7322_SendCtrl_0_SDmaHalt_RMASK 0x1 +#define QIB_7322_SendCtrl_0_SDmaEnable_LSB 0xB +#define QIB_7322_SendCtrl_0_SDmaEnable_MSB 0xB +#define QIB_7322_SendCtrl_0_SDmaEnable_RMASK 0x1 +#define QIB_7322_SendCtrl_0_SDmaSingleDescriptor_LSB 0xA +#define QIB_7322_SendCtrl_0_SDmaSingleDescriptor_MSB 0xA +#define QIB_7322_SendCtrl_0_SDmaSingleDescriptor_RMASK 0x1 +#define QIB_7322_SendCtrl_0_SDmaIntEnable_LSB 0x9 +#define QIB_7322_SendCtrl_0_SDmaIntEnable_MSB 0x9 +#define QIB_7322_SendCtrl_0_SDmaIntEnable_RMASK 0x1 +#define QIB_7322_SendCtrl_0_SDmaCleanup_LSB 0x8 +#define QIB_7322_SendCtrl_0_SDmaCleanup_MSB 0x8 +#define QIB_7322_SendCtrl_0_SDmaCleanup_RMASK 0x1 +#define QIB_7322_SendCtrl_0_ForceCreditUpToDate_LSB 0x7 +#define QIB_7322_SendCtrl_0_ForceCreditUpToDate_MSB 0x7 +#define QIB_7322_SendCtrl_0_ForceCreditUpToDate_RMASK 0x1 +#define QIB_7322_SendCtrl_0_SendEnable_LSB 0x3 +#define QIB_7322_SendCtrl_0_SendEnable_MSB 0x3 +#define QIB_7322_SendCtrl_0_SendEnable_RMASK 0x1 +#define QIB_7322_SendCtrl_0_TxeBypassIbc_LSB 0x1 +#define QIB_7322_SendCtrl_0_TxeBypassIbc_MSB 0x1 +#define QIB_7322_SendCtrl_0_TxeBypassIbc_RMASK 0x1 +#define QIB_7322_SendCtrl_0_TxeAbortIbc_LSB 0x0 +#define QIB_7322_SendCtrl_0_TxeAbortIbc_MSB 0x0 +#define QIB_7322_SendCtrl_0_TxeAbortIbc_RMASK 0x1 + +#define QIB_7322_SendDmaBase_0_OFFS 0x11F8 +#define QIB_7322_SendDmaBase_0_DEF 0x0000000000000000 +#define QIB_7322_SendDmaBase_0_SendDmaBase_LSB 0x0 +#define QIB_7322_SendDmaBase_0_SendDmaBase_MSB 0x2F +#define QIB_7322_SendDmaBase_0_SendDmaBase_RMASK 0xFFFFFFFFFFFF + +#define QIB_7322_SendDmaLenGen_0_OFFS 0x1200 +#define QIB_7322_SendDmaLenGen_0_DEF 0x0000000000000000 +#define QIB_7322_SendDmaLenGen_0_Generation_LSB 0x10 +#define QIB_7322_SendDmaLenGen_0_Generation_MSB 0x12 +#define QIB_7322_SendDmaLenGen_0_Generation_RMASK 0x7 +#define QIB_7322_SendDmaLenGen_0_Length_LSB 0x0 +#define QIB_7322_SendDmaLenGen_0_Length_MSB 0xF +#define QIB_7322_SendDmaLenGen_0_Length_RMASK 0xFFFF + +#define QIB_7322_SendDmaTail_0_OFFS 0x1208 +#define QIB_7322_SendDmaTail_0_DEF 0x0000000000000000 +#define QIB_7322_SendDmaTail_0_SendDmaTail_LSB 0x0 +#define QIB_7322_SendDmaTail_0_SendDmaTail_MSB 0xF +#define QIB_7322_SendDmaTail_0_SendDmaTail_RMASK 0xFFFF + +#define QIB_7322_SendDmaHead_0_OFFS 0x1210 +#define QIB_7322_SendDmaHead_0_DEF 0x0000000000000000 +#define QIB_7322_SendDmaHead_0_InternalSendDmaHead_LSB 0x20 +#define QIB_7322_SendDmaHead_0_InternalSendDmaHead_MSB 0x2F +#define QIB_7322_SendDmaHead_0_InternalSendDmaHead_RMASK 0xFFFF +#define QIB_7322_SendDmaHead_0_SendDmaHead_LSB 0x0 +#define QIB_7322_SendDmaHead_0_SendDmaHead_MSB 0xF +#define QIB_7322_SendDmaHead_0_SendDmaHead_RMASK 0xFFFF + +#define QIB_7322_SendDmaHeadAddr_0_OFFS 0x1218 +#define QIB_7322_SendDmaHeadAddr_0_DEF 0x0000000000000000 +#define QIB_7322_SendDmaHeadAddr_0_SendDmaHeadAddr_LSB 0x0 +#define QIB_7322_SendDmaHeadAddr_0_SendDmaHeadAddr_MSB 0x2F +#define QIB_7322_SendDmaHeadAddr_0_SendDmaHeadAddr_RMASK 0xFFFFFFFFFFFF + +#define QIB_7322_SendDmaBufMask0_0_OFFS 0x1220 +#define QIB_7322_SendDmaBufMask0_0_DEF 0x0000000000000000 +#define QIB_7322_SendDmaBufMask0_0_BufMask_63_0_LSB 0x0 +#define QIB_7322_SendDmaBufMask0_0_BufMask_63_0_MSB 0x3F +#define QIB_7322_SendDmaBufMask0_0_BufMask_63_0_RMASK 0x0 + +#define QIB_7322_SendDmaStatus_0_OFFS 0x1238 +#define QIB_7322_SendDmaStatus_0_DEF 0x0000000042000000 +#define QIB_7322_SendDmaStatus_0_ScoreBoardDrainInProg_LSB 0x3F +#define QIB_7322_SendDmaStatus_0_ScoreBoardDrainInProg_MSB 0x3F +#define QIB_7322_SendDmaStatus_0_ScoreBoardDrainInProg_RMASK 0x1 +#define QIB_7322_SendDmaStatus_0_HaltInProg_LSB 0x3E +#define QIB_7322_SendDmaStatus_0_HaltInProg_MSB 0x3E +#define QIB_7322_SendDmaStatus_0_HaltInProg_RMASK 0x1 +#define QIB_7322_SendDmaStatus_0_InternalSDmaHalt_LSB 0x3D +#define QIB_7322_SendDmaStatus_0_InternalSDmaHalt_MSB 0x3D +#define QIB_7322_SendDmaStatus_0_InternalSDmaHalt_RMASK 0x1 +#define QIB_7322_SendDmaStatus_0_ScbDescIndex_13_0_LSB 0x2F +#define QIB_7322_SendDmaStatus_0_ScbDescIndex_13_0_MSB 0x3C +#define QIB_7322_SendDmaStatus_0_ScbDescIndex_13_0_RMASK 0x3FFF +#define QIB_7322_SendDmaStatus_0_RpyLowAddr_6_0_LSB 0x28 +#define QIB_7322_SendDmaStatus_0_RpyLowAddr_6_0_MSB 0x2E +#define QIB_7322_SendDmaStatus_0_RpyLowAddr_6_0_RMASK 0x7F +#define QIB_7322_SendDmaStatus_0_RpyTag_7_0_LSB 0x20 +#define QIB_7322_SendDmaStatus_0_RpyTag_7_0_MSB 0x27 +#define QIB_7322_SendDmaStatus_0_RpyTag_7_0_RMASK 0xFF +#define QIB_7322_SendDmaStatus_0_ScbFull_LSB 0x1F +#define QIB_7322_SendDmaStatus_0_ScbFull_MSB 0x1F +#define QIB_7322_SendDmaStatus_0_ScbFull_RMASK 0x1 +#define QIB_7322_SendDmaStatus_0_ScbEmpty_LSB 0x1E +#define QIB_7322_SendDmaStatus_0_ScbEmpty_MSB 0x1E +#define QIB_7322_SendDmaStatus_0_ScbEmpty_RMASK 0x1 +#define QIB_7322_SendDmaStatus_0_ScbEntryValid_LSB 0x1D +#define QIB_7322_SendDmaStatus_0_ScbEntryValid_MSB 0x1D +#define QIB_7322_SendDmaStatus_0_ScbEntryValid_RMASK 0x1 +#define QIB_7322_SendDmaStatus_0_ScbFetchDescFlag_LSB 0x1C +#define QIB_7322_SendDmaStatus_0_ScbFetchDescFlag_MSB 0x1C +#define QIB_7322_SendDmaStatus_0_ScbFetchDescFlag_RMASK 0x1 +#define QIB_7322_SendDmaStatus_0_SplFifoReadyToGo_LSB 0x1B +#define QIB_7322_SendDmaStatus_0_SplFifoReadyToGo_MSB 0x1B +#define QIB_7322_SendDmaStatus_0_SplFifoReadyToGo_RMASK 0x1 +#define QIB_7322_SendDmaStatus_0_SplFifoDisarmed_LSB 0x1A +#define QIB_7322_SendDmaStatus_0_SplFifoDisarmed_MSB 0x1A +#define QIB_7322_SendDmaStatus_0_SplFifoDisarmed_RMASK 0x1 +#define QIB_7322_SendDmaStatus_0_SplFifoEmpty_LSB 0x19 +#define QIB_7322_SendDmaStatus_0_SplFifoEmpty_MSB 0x19 +#define QIB_7322_SendDmaStatus_0_SplFifoEmpty_RMASK 0x1 +#define QIB_7322_SendDmaStatus_0_SplFifoFull_LSB 0x18 +#define QIB_7322_SendDmaStatus_0_SplFifoFull_MSB 0x18 +#define QIB_7322_SendDmaStatus_0_SplFifoFull_RMASK 0x1 +#define QIB_7322_SendDmaStatus_0_SplFifoBufNum_LSB 0x10 +#define QIB_7322_SendDmaStatus_0_SplFifoBufNum_MSB 0x17 +#define QIB_7322_SendDmaStatus_0_SplFifoBufNum_RMASK 0xFF +#define QIB_7322_SendDmaStatus_0_SplFifoDescIndex_LSB 0x0 +#define QIB_7322_SendDmaStatus_0_SplFifoDescIndex_MSB 0xF +#define QIB_7322_SendDmaStatus_0_SplFifoDescIndex_RMASK 0xFFFF + +#define QIB_7322_SendDmaPriorityThld_0_OFFS 0x1258 +#define QIB_7322_SendDmaPriorityThld_0_DEF 0x0000000000000000 +#define QIB_7322_SendDmaPriorityThld_0_PriorityThreshold_LSB 0x0 +#define QIB_7322_SendDmaPriorityThld_0_PriorityThreshold_MSB 0x3 +#define QIB_7322_SendDmaPriorityThld_0_PriorityThreshold_RMASK 0xF + +#define QIB_7322_SendHdrErrSymptom_0_OFFS 0x1260 +#define QIB_7322_SendHdrErrSymptom_0_DEF 0x0000000000000000 +#define QIB_7322_SendHdrErrSymptom_0_NonKeyPacket_LSB 0x6 +#define QIB_7322_SendHdrErrSymptom_0_NonKeyPacket_MSB 0x6 +#define QIB_7322_SendHdrErrSymptom_0_NonKeyPacket_RMASK 0x1 +#define QIB_7322_SendHdrErrSymptom_0_GRHFail_LSB 0x5 +#define QIB_7322_SendHdrErrSymptom_0_GRHFail_MSB 0x5 +#define QIB_7322_SendHdrErrSymptom_0_GRHFail_RMASK 0x1 +#define QIB_7322_SendHdrErrSymptom_0_PkeyFail_LSB 0x4 +#define QIB_7322_SendHdrErrSymptom_0_PkeyFail_MSB 0x4 +#define QIB_7322_SendHdrErrSymptom_0_PkeyFail_RMASK 0x1 +#define QIB_7322_SendHdrErrSymptom_0_QPFail_LSB 0x3 +#define QIB_7322_SendHdrErrSymptom_0_QPFail_MSB 0x3 +#define QIB_7322_SendHdrErrSymptom_0_QPFail_RMASK 0x1 +#define QIB_7322_SendHdrErrSymptom_0_SLIDFail_LSB 0x2 +#define QIB_7322_SendHdrErrSymptom_0_SLIDFail_MSB 0x2 +#define QIB_7322_SendHdrErrSymptom_0_SLIDFail_RMASK 0x1 +#define QIB_7322_SendHdrErrSymptom_0_RawIPV6_LSB 0x1 +#define QIB_7322_SendHdrErrSymptom_0_RawIPV6_MSB 0x1 +#define QIB_7322_SendHdrErrSymptom_0_RawIPV6_RMASK 0x1 +#define QIB_7322_SendHdrErrSymptom_0_PacketTooSmall_LSB 0x0 +#define QIB_7322_SendHdrErrSymptom_0_PacketTooSmall_MSB 0x0 +#define QIB_7322_SendHdrErrSymptom_0_PacketTooSmall_RMASK 0x1 + +#define QIB_7322_RxCreditVL0_0_OFFS 0x1280 +#define QIB_7322_RxCreditVL0_0_DEF 0x0000000000000000 +#define QIB_7322_RxCreditVL0_0_RxBufrConsumedVL_LSB 0x10 +#define QIB_7322_RxCreditVL0_0_RxBufrConsumedVL_MSB 0x1B +#define QIB_7322_RxCreditVL0_0_RxBufrConsumedVL_RMASK 0xFFF +#define QIB_7322_RxCreditVL0_0_RxMaxCreditVL_LSB 0x0 +#define QIB_7322_RxCreditVL0_0_RxMaxCreditVL_MSB 0xB +#define QIB_7322_RxCreditVL0_0_RxMaxCreditVL_RMASK 0xFFF + +#define QIB_7322_SendDmaBufUsed0_0_OFFS 0x1480 +#define QIB_7322_SendDmaBufUsed0_0_DEF 0x0000000000000000 +#define QIB_7322_SendDmaBufUsed0_0_BufUsed_63_0_LSB 0x0 +#define QIB_7322_SendDmaBufUsed0_0_BufUsed_63_0_MSB 0x3F +#define QIB_7322_SendDmaBufUsed0_0_BufUsed_63_0_RMASK 0x0 + +#define QIB_7322_SendCheckControl_0_OFFS 0x14A8 +#define QIB_7322_SendCheckControl_0_DEF 0x0000000000000000 +#define QIB_7322_SendCheckControl_0_PKey_En_LSB 0x4 +#define QIB_7322_SendCheckControl_0_PKey_En_MSB 0x4 +#define QIB_7322_SendCheckControl_0_PKey_En_RMASK 0x1 +#define QIB_7322_SendCheckControl_0_BTHQP_En_LSB 0x3 +#define QIB_7322_SendCheckControl_0_BTHQP_En_MSB 0x3 +#define QIB_7322_SendCheckControl_0_BTHQP_En_RMASK 0x1 +#define QIB_7322_SendCheckControl_0_SLID_En_LSB 0x2 +#define QIB_7322_SendCheckControl_0_SLID_En_MSB 0x2 +#define QIB_7322_SendCheckControl_0_SLID_En_RMASK 0x1 +#define QIB_7322_SendCheckControl_0_RawIPV6_En_LSB 0x1 +#define QIB_7322_SendCheckControl_0_RawIPV6_En_MSB 0x1 +#define QIB_7322_SendCheckControl_0_RawIPV6_En_RMASK 0x1 +#define QIB_7322_SendCheckControl_0_PacketTooSmall_En_LSB 0x0 +#define QIB_7322_SendCheckControl_0_PacketTooSmall_En_MSB 0x0 +#define QIB_7322_SendCheckControl_0_PacketTooSmall_En_RMASK 0x1 + +#define QIB_7322_SendIBSLIDMask_0_OFFS 0x14B0 +#define QIB_7322_SendIBSLIDMask_0_DEF 0x0000000000000000 +#define QIB_7322_SendIBSLIDMask_0_SendIBSLIDMask_15_0_LSB 0x0 +#define QIB_7322_SendIBSLIDMask_0_SendIBSLIDMask_15_0_MSB 0xF +#define QIB_7322_SendIBSLIDMask_0_SendIBSLIDMask_15_0_RMASK 0xFFFF + +#define QIB_7322_SendIBSLIDAssign_0_OFFS 0x14B8 +#define QIB_7322_SendIBSLIDAssign_0_DEF 0x0000000000000000 +#define QIB_7322_SendIBSLIDAssign_0_SendIBSLIDAssign_15_0_LSB 0x0 +#define QIB_7322_SendIBSLIDAssign_0_SendIBSLIDAssign_15_0_MSB 0xF +#define QIB_7322_SendIBSLIDAssign_0_SendIBSLIDAssign_15_0_RMASK 0xFFFF + +#define QIB_7322_IBCStatusA_0_OFFS 0x1540 +#define QIB_7322_IBCStatusA_0_DEF 0x0000000000000X02 +#define QIB_7322_IBCStatusA_0_TxCreditOk_VL7_LSB 0x27 +#define QIB_7322_IBCStatusA_0_TxCreditOk_VL7_MSB 0x27 +#define QIB_7322_IBCStatusA_0_TxCreditOk_VL7_RMASK 0x1 +#define QIB_7322_IBCStatusA_0_TxCreditOk_VL6_LSB 0x26 +#define QIB_7322_IBCStatusA_0_TxCreditOk_VL6_MSB 0x26 +#define QIB_7322_IBCStatusA_0_TxCreditOk_VL6_RMASK 0x1 +#define QIB_7322_IBCStatusA_0_TxCreditOk_VL5_LSB 0x25 +#define QIB_7322_IBCStatusA_0_TxCreditOk_VL5_MSB 0x25 +#define QIB_7322_IBCStatusA_0_TxCreditOk_VL5_RMASK 0x1 +#define QIB_7322_IBCStatusA_0_TxCreditOk_VL4_LSB 0x24 +#define QIB_7322_IBCStatusA_0_TxCreditOk_VL4_MSB 0x24 +#define QIB_7322_IBCStatusA_0_TxCreditOk_VL4_RMASK 0x1 +#define QIB_7322_IBCStatusA_0_TxCreditOk_VL3_LSB 0x23 +#define QIB_7322_IBCStatusA_0_TxCreditOk_VL3_MSB 0x23 +#define QIB_7322_IBCStatusA_0_TxCreditOk_VL3_RMASK 0x1 +#define QIB_7322_IBCStatusA_0_TxCreditOk_VL2_LSB 0x22 +#define QIB_7322_IBCStatusA_0_TxCreditOk_VL2_MSB 0x22 +#define QIB_7322_IBCStatusA_0_TxCreditOk_VL2_RMASK 0x1 +#define QIB_7322_IBCStatusA_0_TxCreditOk_VL1_LSB 0x21 +#define QIB_7322_IBCStatusA_0_TxCreditOk_VL1_MSB 0x21 +#define QIB_7322_IBCStatusA_0_TxCreditOk_VL1_RMASK 0x1 +#define QIB_7322_IBCStatusA_0_TxCreditOk_VL0_LSB 0x20 +#define QIB_7322_IBCStatusA_0_TxCreditOk_VL0_MSB 0x20 +#define QIB_7322_IBCStatusA_0_TxCreditOk_VL0_RMASK 0x1 +#define QIB_7322_IBCStatusA_0_TxReady_LSB 0x1E +#define QIB_7322_IBCStatusA_0_TxReady_MSB 0x1E +#define QIB_7322_IBCStatusA_0_TxReady_RMASK 0x1 +#define QIB_7322_IBCStatusA_0_LinkSpeedQDR_LSB 0x1D +#define QIB_7322_IBCStatusA_0_LinkSpeedQDR_MSB 0x1D +#define QIB_7322_IBCStatusA_0_LinkSpeedQDR_RMASK 0x1 +#define QIB_7322_IBCStatusA_0_ScrambleCapRemote_LSB 0xF +#define QIB_7322_IBCStatusA_0_ScrambleCapRemote_MSB 0xF +#define QIB_7322_IBCStatusA_0_ScrambleCapRemote_RMASK 0x1 +#define QIB_7322_IBCStatusA_0_ScrambleEn_LSB 0xE +#define QIB_7322_IBCStatusA_0_ScrambleEn_MSB 0xE +#define QIB_7322_IBCStatusA_0_ScrambleEn_RMASK 0x1 +#define QIB_7322_IBCStatusA_0_IBTxLaneReversed_LSB 0xD +#define QIB_7322_IBCStatusA_0_IBTxLaneReversed_MSB 0xD +#define QIB_7322_IBCStatusA_0_IBTxLaneReversed_RMASK 0x1 +#define QIB_7322_IBCStatusA_0_IBRxLaneReversed_LSB 0xC +#define QIB_7322_IBCStatusA_0_IBRxLaneReversed_MSB 0xC +#define QIB_7322_IBCStatusA_0_IBRxLaneReversed_RMASK 0x1 +#define QIB_7322_IBCStatusA_0_DDS_RXEQ_FAIL_LSB 0xA +#define QIB_7322_IBCStatusA_0_DDS_RXEQ_FAIL_MSB 0xA +#define QIB_7322_IBCStatusA_0_DDS_RXEQ_FAIL_RMASK 0x1 +#define QIB_7322_IBCStatusA_0_LinkWidthActive_LSB 0x9 +#define QIB_7322_IBCStatusA_0_LinkWidthActive_MSB 0x9 +#define QIB_7322_IBCStatusA_0_LinkWidthActive_RMASK 0x1 +#define QIB_7322_IBCStatusA_0_LinkSpeedActive_LSB 0x8 +#define QIB_7322_IBCStatusA_0_LinkSpeedActive_MSB 0x8 +#define QIB_7322_IBCStatusA_0_LinkSpeedActive_RMASK 0x1 +#define QIB_7322_IBCStatusA_0_LinkState_LSB 0x5 +#define QIB_7322_IBCStatusA_0_LinkState_MSB 0x7 +#define QIB_7322_IBCStatusA_0_LinkState_RMASK 0x7 +#define QIB_7322_IBCStatusA_0_LinkTrainingState_LSB 0x0 +#define QIB_7322_IBCStatusA_0_LinkTrainingState_MSB 0x4 +#define QIB_7322_IBCStatusA_0_LinkTrainingState_RMASK 0x1F + +#define QIB_7322_IBCStatusB_0_OFFS 0x1548 +#define QIB_7322_IBCStatusB_0_DEF 0x00000000XXXXXXXX +#define QIB_7322_IBCStatusB_0_ibsd_adaptation_timer_debug_LSB 0x27 +#define QIB_7322_IBCStatusB_0_ibsd_adaptation_timer_debug_MSB 0x27 +#define QIB_7322_IBCStatusB_0_ibsd_adaptation_timer_debug_RMASK 0x1 +#define QIB_7322_IBCStatusB_0_ibsd_adaptation_timer_reached_threshold_LSB 0x26 +#define QIB_7322_IBCStatusB_0_ibsd_adaptation_timer_reached_threshold_MSB 0x26 +#define QIB_7322_IBCStatusB_0_ibsd_adaptation_timer_reached_threshold_RMASK 0x1 +#define QIB_7322_IBCStatusB_0_ibsd_adaptation_timer_started_LSB 0x25 +#define QIB_7322_IBCStatusB_0_ibsd_adaptation_timer_started_MSB 0x25 +#define QIB_7322_IBCStatusB_0_ibsd_adaptation_timer_started_RMASK 0x1 +#define QIB_7322_IBCStatusB_0_heartbeat_timed_out_LSB 0x24 +#define QIB_7322_IBCStatusB_0_heartbeat_timed_out_MSB 0x24 +#define QIB_7322_IBCStatusB_0_heartbeat_timed_out_RMASK 0x1 +#define QIB_7322_IBCStatusB_0_heartbeat_crosstalk_LSB 0x20 +#define QIB_7322_IBCStatusB_0_heartbeat_crosstalk_MSB 0x23 +#define QIB_7322_IBCStatusB_0_heartbeat_crosstalk_RMASK 0xF +#define QIB_7322_IBCStatusB_0_RxEqLocalDevice_LSB 0x1E +#define QIB_7322_IBCStatusB_0_RxEqLocalDevice_MSB 0x1F +#define QIB_7322_IBCStatusB_0_RxEqLocalDevice_RMASK 0x3 +#define QIB_7322_IBCStatusB_0_ReqDDSLocalFromRmt_LSB 0x1A +#define QIB_7322_IBCStatusB_0_ReqDDSLocalFromRmt_MSB 0x1D +#define QIB_7322_IBCStatusB_0_ReqDDSLocalFromRmt_RMASK 0xF +#define QIB_7322_IBCStatusB_0_LinkRoundTripLatency_LSB 0x0 +#define QIB_7322_IBCStatusB_0_LinkRoundTripLatency_MSB 0x19 +#define QIB_7322_IBCStatusB_0_LinkRoundTripLatency_RMASK 0x3FFFFFF + +#define QIB_7322_IBCCtrlA_0_OFFS 0x1560 +#define QIB_7322_IBCCtrlA_0_DEF 0x0000000000000000 +#define QIB_7322_IBCCtrlA_0_Loopback_LSB 0x3F +#define QIB_7322_IBCCtrlA_0_Loopback_MSB 0x3F +#define QIB_7322_IBCCtrlA_0_Loopback_RMASK 0x1 +#define QIB_7322_IBCCtrlA_0_LinkDownDefaultState_LSB 0x3E +#define QIB_7322_IBCCtrlA_0_LinkDownDefaultState_MSB 0x3E +#define QIB_7322_IBCCtrlA_0_LinkDownDefaultState_RMASK 0x1 +#define QIB_7322_IBCCtrlA_0_IBLinkEn_LSB 0x3D +#define QIB_7322_IBCCtrlA_0_IBLinkEn_MSB 0x3D +#define QIB_7322_IBCCtrlA_0_IBLinkEn_RMASK 0x1 +#define QIB_7322_IBCCtrlA_0_IBStatIntReductionEn_LSB 0x3C +#define QIB_7322_IBCCtrlA_0_IBStatIntReductionEn_MSB 0x3C +#define QIB_7322_IBCCtrlA_0_IBStatIntReductionEn_RMASK 0x1 +#define QIB_7322_IBCCtrlA_0_NumVLane_LSB 0x30 +#define QIB_7322_IBCCtrlA_0_NumVLane_MSB 0x32 +#define QIB_7322_IBCCtrlA_0_NumVLane_RMASK 0x7 +#define QIB_7322_IBCCtrlA_0_OverrunThreshold_LSB 0x24 +#define QIB_7322_IBCCtrlA_0_OverrunThreshold_MSB 0x27 +#define QIB_7322_IBCCtrlA_0_OverrunThreshold_RMASK 0xF +#define QIB_7322_IBCCtrlA_0_PhyerrThreshold_LSB 0x20 +#define QIB_7322_IBCCtrlA_0_PhyerrThreshold_MSB 0x23 +#define QIB_7322_IBCCtrlA_0_PhyerrThreshold_RMASK 0xF +#define QIB_7322_IBCCtrlA_0_MaxPktLen_LSB 0x15 +#define QIB_7322_IBCCtrlA_0_MaxPktLen_MSB 0x1F +#define QIB_7322_IBCCtrlA_0_MaxPktLen_RMASK 0x7FF +#define QIB_7322_IBCCtrlA_0_LinkCmd_LSB 0x13 +#define QIB_7322_IBCCtrlA_0_LinkCmd_MSB 0x14 +#define QIB_7322_IBCCtrlA_0_LinkCmd_RMASK 0x3 +#define QIB_7322_IBCCtrlA_0_LinkInitCmd_LSB 0x10 +#define QIB_7322_IBCCtrlA_0_LinkInitCmd_MSB 0x12 +#define QIB_7322_IBCCtrlA_0_LinkInitCmd_RMASK 0x7 +#define QIB_7322_IBCCtrlA_0_FlowCtrlWaterMark_LSB 0x8 +#define QIB_7322_IBCCtrlA_0_FlowCtrlWaterMark_MSB 0xF +#define QIB_7322_IBCCtrlA_0_FlowCtrlWaterMark_RMASK 0xFF +#define QIB_7322_IBCCtrlA_0_FlowCtrlPeriod_LSB 0x0 +#define QIB_7322_IBCCtrlA_0_FlowCtrlPeriod_MSB 0x7 +#define QIB_7322_IBCCtrlA_0_FlowCtrlPeriod_RMASK 0xFF + +#define QIB_7322_IBCCtrlB_0_OFFS 0x1568 +#define QIB_7322_IBCCtrlB_0_DEF 0x00000000000305FF +#define QIB_7322_IBCCtrlB_0_IB_DLID_MASK_LSB 0x30 +#define QIB_7322_IBCCtrlB_0_IB_DLID_MASK_MSB 0x3F +#define QIB_7322_IBCCtrlB_0_IB_DLID_MASK_RMASK 0xFFFF +#define QIB_7322_IBCCtrlB_0_IB_DLID_LSB 0x20 +#define QIB_7322_IBCCtrlB_0_IB_DLID_MSB 0x2F +#define QIB_7322_IBCCtrlB_0_IB_DLID_RMASK 0xFFFF +#define QIB_7322_IBCCtrlB_0_IB_ENABLE_FILT_DPKT_LSB 0x1B +#define QIB_7322_IBCCtrlB_0_IB_ENABLE_FILT_DPKT_MSB 0x1B +#define QIB_7322_IBCCtrlB_0_IB_ENABLE_FILT_DPKT_RMASK 0x1 +#define QIB_7322_IBCCtrlB_0_HRTBT_REQ_LSB 0x1A +#define QIB_7322_IBCCtrlB_0_HRTBT_REQ_MSB 0x1A +#define QIB_7322_IBCCtrlB_0_HRTBT_REQ_RMASK 0x1 +#define QIB_7322_IBCCtrlB_0_HRTBT_PORT_LSB 0x12 +#define QIB_7322_IBCCtrlB_0_HRTBT_PORT_MSB 0x19 +#define QIB_7322_IBCCtrlB_0_HRTBT_PORT_RMASK 0xFF +#define QIB_7322_IBCCtrlB_0_HRTBT_AUTO_LSB 0x11 +#define QIB_7322_IBCCtrlB_0_HRTBT_AUTO_MSB 0x11 +#define QIB_7322_IBCCtrlB_0_HRTBT_AUTO_RMASK 0x1 +#define QIB_7322_IBCCtrlB_0_HRTBT_ENB_LSB 0x10 +#define QIB_7322_IBCCtrlB_0_HRTBT_ENB_MSB 0x10 +#define QIB_7322_IBCCtrlB_0_HRTBT_ENB_RMASK 0x1 +#define QIB_7322_IBCCtrlB_0_SD_DDS_LSB 0xC +#define QIB_7322_IBCCtrlB_0_SD_DDS_MSB 0xF +#define QIB_7322_IBCCtrlB_0_SD_DDS_RMASK 0xF +#define QIB_7322_IBCCtrlB_0_SD_DDSV_LSB 0xB +#define QIB_7322_IBCCtrlB_0_SD_DDSV_MSB 0xB +#define QIB_7322_IBCCtrlB_0_SD_DDSV_RMASK 0x1 +#define QIB_7322_IBCCtrlB_0_SD_ADD_ENB_LSB 0xA +#define QIB_7322_IBCCtrlB_0_SD_ADD_ENB_MSB 0xA +#define QIB_7322_IBCCtrlB_0_SD_ADD_ENB_RMASK 0x1 +#define QIB_7322_IBCCtrlB_0_SD_RX_EQUAL_ENABLE_LSB 0x9 +#define QIB_7322_IBCCtrlB_0_SD_RX_EQUAL_ENABLE_MSB 0x9 +#define QIB_7322_IBCCtrlB_0_SD_RX_EQUAL_ENABLE_RMASK 0x1 +#define QIB_7322_IBCCtrlB_0_IB_LANE_REV_SUPPORTED_LSB 0x8 +#define QIB_7322_IBCCtrlB_0_IB_LANE_REV_SUPPORTED_MSB 0x8 +#define QIB_7322_IBCCtrlB_0_IB_LANE_REV_SUPPORTED_RMASK 0x1 +#define QIB_7322_IBCCtrlB_0_IB_POLARITY_REV_SUPP_LSB 0x7 +#define QIB_7322_IBCCtrlB_0_IB_POLARITY_REV_SUPP_MSB 0x7 +#define QIB_7322_IBCCtrlB_0_IB_POLARITY_REV_SUPP_RMASK 0x1 +#define QIB_7322_IBCCtrlB_0_IB_NUM_CHANNELS_LSB 0x5 +#define QIB_7322_IBCCtrlB_0_IB_NUM_CHANNELS_MSB 0x6 +#define QIB_7322_IBCCtrlB_0_IB_NUM_CHANNELS_RMASK 0x3 +#define QIB_7322_IBCCtrlB_0_SD_SPEED_QDR_LSB 0x4 +#define QIB_7322_IBCCtrlB_0_SD_SPEED_QDR_MSB 0x4 +#define QIB_7322_IBCCtrlB_0_SD_SPEED_QDR_RMASK 0x1 +#define QIB_7322_IBCCtrlB_0_SD_SPEED_DDR_LSB 0x3 +#define QIB_7322_IBCCtrlB_0_SD_SPEED_DDR_MSB 0x3 +#define QIB_7322_IBCCtrlB_0_SD_SPEED_DDR_RMASK 0x1 +#define QIB_7322_IBCCtrlB_0_SD_SPEED_SDR_LSB 0x2 +#define QIB_7322_IBCCtrlB_0_SD_SPEED_SDR_MSB 0x2 +#define QIB_7322_IBCCtrlB_0_SD_SPEED_SDR_RMASK 0x1 +#define QIB_7322_IBCCtrlB_0_SD_SPEED_LSB 0x1 +#define QIB_7322_IBCCtrlB_0_SD_SPEED_MSB 0x1 +#define QIB_7322_IBCCtrlB_0_SD_SPEED_RMASK 0x1 +#define QIB_7322_IBCCtrlB_0_IB_ENHANCED_MODE_LSB 0x0 +#define QIB_7322_IBCCtrlB_0_IB_ENHANCED_MODE_MSB 0x0 +#define QIB_7322_IBCCtrlB_0_IB_ENHANCED_MODE_RMASK 0x1 + +#define QIB_7322_IBCCtrlC_0_OFFS 0x1570 +#define QIB_7322_IBCCtrlC_0_DEF 0x0000000000000301 +#define QIB_7322_IBCCtrlC_0_IB_BACK_PORCH_LSB 0x5 +#define QIB_7322_IBCCtrlC_0_IB_BACK_PORCH_MSB 0x9 +#define QIB_7322_IBCCtrlC_0_IB_BACK_PORCH_RMASK 0x1F +#define QIB_7322_IBCCtrlC_0_IB_FRONT_PORCH_LSB 0x0 +#define QIB_7322_IBCCtrlC_0_IB_FRONT_PORCH_MSB 0x4 +#define QIB_7322_IBCCtrlC_0_IB_FRONT_PORCH_RMASK 0x1F + +#define QIB_7322_HRTBT_GUID_0_OFFS 0x1588 +#define QIB_7322_HRTBT_GUID_0_DEF 0x0000000000000000 + +#define QIB_7322_IB_SDTEST_IF_TX_0_OFFS 0x1590 +#define QIB_7322_IB_SDTEST_IF_TX_0_DEF 0x0000000000000000 +#define QIB_7322_IB_SDTEST_IF_TX_0_TS_TX_RX_CFG_LSB 0x30 +#define QIB_7322_IB_SDTEST_IF_TX_0_TS_TX_RX_CFG_MSB 0x3F +#define QIB_7322_IB_SDTEST_IF_TX_0_TS_TX_RX_CFG_RMASK 0xFFFF +#define QIB_7322_IB_SDTEST_IF_TX_0_TS_TX_TX_CFG_LSB 0x20 +#define QIB_7322_IB_SDTEST_IF_TX_0_TS_TX_TX_CFG_MSB 0x2F +#define QIB_7322_IB_SDTEST_IF_TX_0_TS_TX_TX_CFG_RMASK 0xFFFF +#define QIB_7322_IB_SDTEST_IF_TX_0_TS_TX_SPEED_LSB 0xD +#define QIB_7322_IB_SDTEST_IF_TX_0_TS_TX_SPEED_MSB 0xF +#define QIB_7322_IB_SDTEST_IF_TX_0_TS_TX_SPEED_RMASK 0x7 +#define QIB_7322_IB_SDTEST_IF_TX_0_TS_TX_OPCODE_LSB 0xB +#define QIB_7322_IB_SDTEST_IF_TX_0_TS_TX_OPCODE_MSB 0xC +#define QIB_7322_IB_SDTEST_IF_TX_0_TS_TX_OPCODE_RMASK 0x3 +#define QIB_7322_IB_SDTEST_IF_TX_0_CREDIT_CHANGE_LSB 0x4 +#define QIB_7322_IB_SDTEST_IF_TX_0_CREDIT_CHANGE_MSB 0x4 +#define QIB_7322_IB_SDTEST_IF_TX_0_CREDIT_CHANGE_RMASK 0x1 +#define QIB_7322_IB_SDTEST_IF_TX_0_VL_CAP_LSB 0x2 +#define QIB_7322_IB_SDTEST_IF_TX_0_VL_CAP_MSB 0x3 +#define QIB_7322_IB_SDTEST_IF_TX_0_VL_CAP_RMASK 0x3 +#define QIB_7322_IB_SDTEST_IF_TX_0_TS_3_TX_VALID_LSB 0x1 +#define QIB_7322_IB_SDTEST_IF_TX_0_TS_3_TX_VALID_MSB 0x1 +#define QIB_7322_IB_SDTEST_IF_TX_0_TS_3_TX_VALID_RMASK 0x1 +#define QIB_7322_IB_SDTEST_IF_TX_0_TS_T_TX_VALID_LSB 0x0 +#define QIB_7322_IB_SDTEST_IF_TX_0_TS_T_TX_VALID_MSB 0x0 +#define QIB_7322_IB_SDTEST_IF_TX_0_TS_T_TX_VALID_RMASK 0x1 + +#define QIB_7322_IB_SDTEST_IF_RX_0_OFFS 0x1598 +#define QIB_7322_IB_SDTEST_IF_RX_0_DEF 0x0000000000000000 +#define QIB_7322_IB_SDTEST_IF_RX_0_TS_RX_RX_CFG_LSB 0x30 +#define QIB_7322_IB_SDTEST_IF_RX_0_TS_RX_RX_CFG_MSB 0x3F +#define QIB_7322_IB_SDTEST_IF_RX_0_TS_RX_RX_CFG_RMASK 0xFFFF +#define QIB_7322_IB_SDTEST_IF_RX_0_TS_RX_TX_CFG_LSB 0x20 +#define QIB_7322_IB_SDTEST_IF_RX_0_TS_RX_TX_CFG_MSB 0x2F +#define QIB_7322_IB_SDTEST_IF_RX_0_TS_RX_TX_CFG_RMASK 0xFFFF +#define QIB_7322_IB_SDTEST_IF_RX_0_TS_RX_B_LSB 0x18 +#define QIB_7322_IB_SDTEST_IF_RX_0_TS_RX_B_MSB 0x1F +#define QIB_7322_IB_SDTEST_IF_RX_0_TS_RX_B_RMASK 0xFF +#define QIB_7322_IB_SDTEST_IF_RX_0_TS_RX_A_LSB 0x10 +#define QIB_7322_IB_SDTEST_IF_RX_0_TS_RX_A_MSB 0x17 +#define QIB_7322_IB_SDTEST_IF_RX_0_TS_RX_A_RMASK 0xFF +#define QIB_7322_IB_SDTEST_IF_RX_0_TS_3_RX_VALID_LSB 0x1 +#define QIB_7322_IB_SDTEST_IF_RX_0_TS_3_RX_VALID_MSB 0x1 +#define QIB_7322_IB_SDTEST_IF_RX_0_TS_3_RX_VALID_RMASK 0x1 +#define QIB_7322_IB_SDTEST_IF_RX_0_TS_T_RX_VALID_LSB 0x0 +#define QIB_7322_IB_SDTEST_IF_RX_0_TS_T_RX_VALID_MSB 0x0 +#define QIB_7322_IB_SDTEST_IF_RX_0_TS_T_RX_VALID_RMASK 0x1 + +#define QIB_7322_IBNCModeCtrl_0_OFFS 0x15B8 +#define QIB_7322_IBNCModeCtrl_0_DEF 0x0000000000000000 +#define QIB_7322_IBNCModeCtrl_0_ScrambleCapRemoteForce_LSB 0x22 +#define QIB_7322_IBNCModeCtrl_0_ScrambleCapRemoteForce_MSB 0x22 +#define QIB_7322_IBNCModeCtrl_0_ScrambleCapRemoteForce_RMASK 0x1 +#define QIB_7322_IBNCModeCtrl_0_ScrambleCapRemoteMask_LSB 0x21 +#define QIB_7322_IBNCModeCtrl_0_ScrambleCapRemoteMask_MSB 0x21 +#define QIB_7322_IBNCModeCtrl_0_ScrambleCapRemoteMask_RMASK 0x1 +#define QIB_7322_IBNCModeCtrl_0_ScrambleCapLocal_LSB 0x20 +#define QIB_7322_IBNCModeCtrl_0_ScrambleCapLocal_MSB 0x20 +#define QIB_7322_IBNCModeCtrl_0_ScrambleCapLocal_RMASK 0x1 +#define QIB_7322_IBNCModeCtrl_0_TSMCode_TS2_LSB 0x11 +#define QIB_7322_IBNCModeCtrl_0_TSMCode_TS2_MSB 0x19 +#define QIB_7322_IBNCModeCtrl_0_TSMCode_TS2_RMASK 0x1FF +#define QIB_7322_IBNCModeCtrl_0_TSMCode_TS1_LSB 0x8 +#define QIB_7322_IBNCModeCtrl_0_TSMCode_TS1_MSB 0x10 +#define QIB_7322_IBNCModeCtrl_0_TSMCode_TS1_RMASK 0x1FF +#define QIB_7322_IBNCModeCtrl_0_TSMEnable_ignore_TSM_on_rx_LSB 0x2 +#define QIB_7322_IBNCModeCtrl_0_TSMEnable_ignore_TSM_on_rx_MSB 0x2 +#define QIB_7322_IBNCModeCtrl_0_TSMEnable_ignore_TSM_on_rx_RMASK 0x1 +#define QIB_7322_IBNCModeCtrl_0_TSMEnable_send_TS2_LSB 0x1 +#define QIB_7322_IBNCModeCtrl_0_TSMEnable_send_TS2_MSB 0x1 +#define QIB_7322_IBNCModeCtrl_0_TSMEnable_send_TS2_RMASK 0x1 +#define QIB_7322_IBNCModeCtrl_0_TSMEnable_send_TS1_LSB 0x0 +#define QIB_7322_IBNCModeCtrl_0_TSMEnable_send_TS1_MSB 0x0 +#define QIB_7322_IBNCModeCtrl_0_TSMEnable_send_TS1_RMASK 0x1 + +#define QIB_7322_IBSerdesStatus_0_OFFS 0x15D0 +#define QIB_7322_IBSerdesStatus_0_DEF 0x0000000000000000 + +#define QIB_7322_IBPCSConfig_0_OFFS 0x15D8 +#define QIB_7322_IBPCSConfig_0_DEF 0x0000000000000007 +#define QIB_7322_IBPCSConfig_0_link_sync_mask_LSB 0x9 +#define QIB_7322_IBPCSConfig_0_link_sync_mask_MSB 0x12 +#define QIB_7322_IBPCSConfig_0_link_sync_mask_RMASK 0x3FF +#define QIB_7322_IBPCSConfig_0_xcv_rreset_LSB 0x2 +#define QIB_7322_IBPCSConfig_0_xcv_rreset_MSB 0x2 +#define QIB_7322_IBPCSConfig_0_xcv_rreset_RMASK 0x1 +#define QIB_7322_IBPCSConfig_0_xcv_treset_LSB 0x1 +#define QIB_7322_IBPCSConfig_0_xcv_treset_MSB 0x1 +#define QIB_7322_IBPCSConfig_0_xcv_treset_RMASK 0x1 +#define QIB_7322_IBPCSConfig_0_tx_rx_reset_LSB 0x0 +#define QIB_7322_IBPCSConfig_0_tx_rx_reset_MSB 0x0 +#define QIB_7322_IBPCSConfig_0_tx_rx_reset_RMASK 0x1 + +#define QIB_7322_IBSerdesCtrl_0_OFFS 0x15E0 +#define QIB_7322_IBSerdesCtrl_0_DEF 0x0000000000FFA00F +#define QIB_7322_IBSerdesCtrl_0_DISABLE_RXLATOFF_QDR_LSB 0x1A +#define QIB_7322_IBSerdesCtrl_0_DISABLE_RXLATOFF_QDR_MSB 0x1A +#define QIB_7322_IBSerdesCtrl_0_DISABLE_RXLATOFF_QDR_RMASK 0x1 +#define QIB_7322_IBSerdesCtrl_0_DISABLE_RXLATOFF_DDR_LSB 0x19 +#define QIB_7322_IBSerdesCtrl_0_DISABLE_RXLATOFF_DDR_MSB 0x19 +#define QIB_7322_IBSerdesCtrl_0_DISABLE_RXLATOFF_DDR_RMASK 0x1 +#define QIB_7322_IBSerdesCtrl_0_DISABLE_RXLATOFF_SDR_LSB 0x18 +#define QIB_7322_IBSerdesCtrl_0_DISABLE_RXLATOFF_SDR_MSB 0x18 +#define QIB_7322_IBSerdesCtrl_0_DISABLE_RXLATOFF_SDR_RMASK 0x1 +#define QIB_7322_IBSerdesCtrl_0_CHANNEL_RESET_N_LSB 0x14 +#define QIB_7322_IBSerdesCtrl_0_CHANNEL_RESET_N_MSB 0x17 +#define QIB_7322_IBSerdesCtrl_0_CHANNEL_RESET_N_RMASK 0xF +#define QIB_7322_IBSerdesCtrl_0_CGMODE_LSB 0x10 +#define QIB_7322_IBSerdesCtrl_0_CGMODE_MSB 0x13 +#define QIB_7322_IBSerdesCtrl_0_CGMODE_RMASK 0xF +#define QIB_7322_IBSerdesCtrl_0_IB_LAT_MODE_LSB 0xF +#define QIB_7322_IBSerdesCtrl_0_IB_LAT_MODE_MSB 0xF +#define QIB_7322_IBSerdesCtrl_0_IB_LAT_MODE_RMASK 0x1 +#define QIB_7322_IBSerdesCtrl_0_RXLOSEN_LSB 0xD +#define QIB_7322_IBSerdesCtrl_0_RXLOSEN_MSB 0xD +#define QIB_7322_IBSerdesCtrl_0_RXLOSEN_RMASK 0x1 +#define QIB_7322_IBSerdesCtrl_0_LPEN_LSB 0xC +#define QIB_7322_IBSerdesCtrl_0_LPEN_MSB 0xC +#define QIB_7322_IBSerdesCtrl_0_LPEN_RMASK 0x1 +#define QIB_7322_IBSerdesCtrl_0_PLLPD_LSB 0xB +#define QIB_7322_IBSerdesCtrl_0_PLLPD_MSB 0xB +#define QIB_7322_IBSerdesCtrl_0_PLLPD_RMASK 0x1 +#define QIB_7322_IBSerdesCtrl_0_TXPD_LSB 0xA +#define QIB_7322_IBSerdesCtrl_0_TXPD_MSB 0xA +#define QIB_7322_IBSerdesCtrl_0_TXPD_RMASK 0x1 +#define QIB_7322_IBSerdesCtrl_0_RXPD_LSB 0x9 +#define QIB_7322_IBSerdesCtrl_0_RXPD_MSB 0x9 +#define QIB_7322_IBSerdesCtrl_0_RXPD_RMASK 0x1 +#define QIB_7322_IBSerdesCtrl_0_TXIDLE_LSB 0x8 +#define QIB_7322_IBSerdesCtrl_0_TXIDLE_MSB 0x8 +#define QIB_7322_IBSerdesCtrl_0_TXIDLE_RMASK 0x1 +#define QIB_7322_IBSerdesCtrl_0_CMODE_LSB 0x0 +#define QIB_7322_IBSerdesCtrl_0_CMODE_MSB 0x6 +#define QIB_7322_IBSerdesCtrl_0_CMODE_RMASK 0x7F + +#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_OFFS 0x1600 +#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_DEF 0x0000000000000000 +#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_tx_override_deemphasis_select_LSB 0x1F +#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_tx_override_deemphasis_select_MSB 0x1F +#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_tx_override_deemphasis_select_RMASK 0x1 +#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_reset_tx_deemphasis_override_LSB 0x1E +#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_reset_tx_deemphasis_override_MSB 0x1E +#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_reset_tx_deemphasis_override_RMASK 0x1 +#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_txampcntl_d2a_LSB 0xE +#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_txampcntl_d2a_MSB 0x11 +#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_txampcntl_d2a_RMASK 0xF +#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_txc0_ena_LSB 0x9 +#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_txc0_ena_MSB 0xD +#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_txc0_ena_RMASK 0x1F +#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_txcp1_ena_LSB 0x5 +#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_txcp1_ena_MSB 0x8 +#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_txcp1_ena_RMASK 0xF +#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_txcn1_xtra_emph0_LSB 0x3 +#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_txcn1_xtra_emph0_MSB 0x4 +#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_txcn1_xtra_emph0_RMASK 0x3 +#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_txcn1_ena_LSB 0x0 +#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_txcn1_ena_MSB 0x2 +#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_txcn1_ena_RMASK 0x7 + +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_OFFS 0x1640 +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_DEF 0x0000000000000000 +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenagain_sdr_ch3_LSB 0x27 +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenagain_sdr_ch3_MSB 0x27 +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenagain_sdr_ch3_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenagain_sdr_ch2_LSB 0x26 +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenagain_sdr_ch2_MSB 0x26 +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenagain_sdr_ch2_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenagain_sdr_ch1_LSB 0x25 +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenagain_sdr_ch1_MSB 0x25 +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenagain_sdr_ch1_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenagain_sdr_ch0_LSB 0x24 +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenagain_sdr_ch0_MSB 0x24 +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenagain_sdr_ch0_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenale_sdr_ch3_LSB 0x23 +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenale_sdr_ch3_MSB 0x23 +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenale_sdr_ch3_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenale_sdr_ch2_LSB 0x22 +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenale_sdr_ch2_MSB 0x22 +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenale_sdr_ch2_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenale_sdr_ch1_LSB 0x21 +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenale_sdr_ch1_MSB 0x21 +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenale_sdr_ch1_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenale_sdr_ch0_LSB 0x20 +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenale_sdr_ch0_MSB 0x20 +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenale_sdr_ch0_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenadfe_sdr_ch3_LSB 0x18 +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenadfe_sdr_ch3_MSB 0x1F +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenadfe_sdr_ch3_RMASK 0xFF +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenadfe_sdr_ch2_LSB 0x10 +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenadfe_sdr_ch2_MSB 0x17 +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenadfe_sdr_ch2_RMASK 0xFF +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenadfe_sdr_ch1_LSB 0x8 +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenadfe_sdr_ch1_MSB 0xF +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenadfe_sdr_ch1_RMASK 0xFF +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenadfe_sdr_ch0_LSB 0x0 +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenadfe_sdr_ch0_MSB 0x7 +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenadfe_sdr_ch0_RMASK 0xFF + +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_OFFS 0x1648 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_DEF 0x0000000000000000 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenagain_sdr_ch3_LSB 0x27 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenagain_sdr_ch3_MSB 0x27 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenagain_sdr_ch3_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenagain_sdr_ch2_LSB 0x26 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenagain_sdr_ch2_MSB 0x26 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenagain_sdr_ch2_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenagain_sdr_ch1_LSB 0x25 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenagain_sdr_ch1_MSB 0x25 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenagain_sdr_ch1_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenagain_sdr_ch0_LSB 0x24 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenagain_sdr_ch0_MSB 0x24 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenagain_sdr_ch0_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenale_sdr_ch3_LSB 0x23 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenale_sdr_ch3_MSB 0x23 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenale_sdr_ch3_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenale_sdr_ch2_LSB 0x22 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenale_sdr_ch2_MSB 0x22 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenale_sdr_ch2_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenale_sdr_ch1_LSB 0x21 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenale_sdr_ch1_MSB 0x21 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenale_sdr_ch1_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenale_sdr_ch0_LSB 0x20 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenale_sdr_ch0_MSB 0x20 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenale_sdr_ch0_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenadfe_sdr_ch3_LSB 0x18 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenadfe_sdr_ch3_MSB 0x1F +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenadfe_sdr_ch3_RMASK 0xFF +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenadfe_sdr_ch2_LSB 0x10 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenadfe_sdr_ch2_MSB 0x17 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenadfe_sdr_ch2_RMASK 0xFF +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenadfe_sdr_ch1_LSB 0x8 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenadfe_sdr_ch1_MSB 0xF +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenadfe_sdr_ch1_RMASK 0xFF +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenadfe_sdr_ch0_LSB 0x0 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenadfe_sdr_ch0_MSB 0x7 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenadfe_sdr_ch0_RMASK 0xFF + +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_OFFS 0x1650 +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_DEF 0x0000000000000000 +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenagain_ddr_ch3_LSB 0x27 +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenagain_ddr_ch3_MSB 0x27 +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenagain_ddr_ch3_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenagain_ddr_ch2_LSB 0x26 +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenagain_ddr_ch2_MSB 0x26 +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenagain_ddr_ch2_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenagain_ddr_ch1_LSB 0x25 +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenagain_ddr_ch1_MSB 0x25 +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenagain_ddr_ch1_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenagain_ddr_ch0_LSB 0x24 +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenagain_ddr_ch0_MSB 0x24 +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenagain_ddr_ch0_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenale_ddr_ch3_LSB 0x23 +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenale_ddr_ch3_MSB 0x23 +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenale_ddr_ch3_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenale_ddr_ch2_LSB 0x22 +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenale_ddr_ch2_MSB 0x22 +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenale_ddr_ch2_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenale_ddr_ch1_LSB 0x21 +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenale_ddr_ch1_MSB 0x21 +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenale_ddr_ch1_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenale_ddr_ch0_LSB 0x20 +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenale_ddr_ch0_MSB 0x20 +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenale_ddr_ch0_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenadfe_ddr_ch3_LSB 0x18 +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenadfe_ddr_ch3_MSB 0x1F +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenadfe_ddr_ch3_RMASK 0xFF +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenadfe_ddr_ch2_LSB 0x10 +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenadfe_ddr_ch2_MSB 0x17 +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenadfe_ddr_ch2_RMASK 0xFF +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenadfe_ddr_ch1_LSB 0x8 +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenadfe_ddr_ch1_MSB 0xF +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenadfe_ddr_ch1_RMASK 0xFF +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenadfe_ddr_ch0_LSB 0x0 +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenadfe_ddr_ch0_MSB 0x7 +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenadfe_ddr_ch0_RMASK 0xFF + +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_OFFS 0x1658 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_DEF 0x0000000000000000 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenagain_ddr_ch3_LSB 0x27 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenagain_ddr_ch3_MSB 0x27 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenagain_ddr_ch3_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenagain_ddr_ch2_LSB 0x26 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenagain_ddr_ch2_MSB 0x26 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenagain_ddr_ch2_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenagain_ddr_ch1_LSB 0x25 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenagain_ddr_ch1_MSB 0x25 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenagain_ddr_ch1_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenagain_ddr_ch0_LSB 0x24 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenagain_ddr_ch0_MSB 0x24 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenagain_ddr_ch0_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenale_ddr_ch3_LSB 0x23 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenale_ddr_ch3_MSB 0x23 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenale_ddr_ch3_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenale_ddr_ch2_LSB 0x22 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenale_ddr_ch2_MSB 0x22 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenale_ddr_ch2_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenale_ddr_ch1_LSB 0x21 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenale_ddr_ch1_MSB 0x21 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenale_ddr_ch1_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenale_ddr_ch0_LSB 0x20 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenale_ddr_ch0_MSB 0x20 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenale_ddr_ch0_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenadfe_ddr_ch3_LSB 0x18 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenadfe_ddr_ch3_MSB 0x1F +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenadfe_ddr_ch3_RMASK 0xFF +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenadfe_ddr_ch2_LSB 0x10 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenadfe_ddr_ch2_MSB 0x17 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenadfe_ddr_ch2_RMASK 0xFF +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenadfe_ddr_ch1_LSB 0x8 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenadfe_ddr_ch1_MSB 0xF +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenadfe_ddr_ch1_RMASK 0xFF +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenadfe_ddr_ch0_LSB 0x0 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenadfe_ddr_ch0_MSB 0x7 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenadfe_ddr_ch0_RMASK 0xFF + +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_OFFS 0x1660 +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_DEF 0x0000000000000000 +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenagain_qdr_ch3_LSB 0x27 +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenagain_qdr_ch3_MSB 0x27 +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenagain_qdr_ch3_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenagain_qdr_ch2_LSB 0x26 +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenagain_qdr_ch2_MSB 0x26 +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenagain_qdr_ch2_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenagain_qdr_ch1_LSB 0x25 +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenagain_qdr_ch1_MSB 0x25 +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenagain_qdr_ch1_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenagain_qdr_ch0_LSB 0x24 +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenagain_qdr_ch0_MSB 0x24 +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenagain_qdr_ch0_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenale_qdr_ch3_LSB 0x23 +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenale_qdr_ch3_MSB 0x23 +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenale_qdr_ch3_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenale_qdr_ch2_LSB 0x22 +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenale_qdr_ch2_MSB 0x22 +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenale_qdr_ch2_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenale_qdr_ch1_LSB 0x21 +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenale_qdr_ch1_MSB 0x21 +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenale_qdr_ch1_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenale_qdr_ch0_LSB 0x20 +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenale_qdr_ch0_MSB 0x20 +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenale_qdr_ch0_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenadfe_qdr_ch3_LSB 0x18 +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenadfe_qdr_ch3_MSB 0x1F +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenadfe_qdr_ch3_RMASK 0xFF +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenadfe_qdr_ch2_LSB 0x10 +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenadfe_qdr_ch2_MSB 0x17 +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenadfe_qdr_ch2_RMASK 0xFF +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenadfe_qdr_ch1_LSB 0x8 +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenadfe_qdr_ch1_MSB 0xF +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenadfe_qdr_ch1_RMASK 0xFF +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenadfe_qdr_ch0_LSB 0x0 +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenadfe_qdr_ch0_MSB 0x7 +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenadfe_qdr_ch0_RMASK 0xFF + +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_OFFS 0x1668 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_DEF 0x0000000000000000 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenagain_qdr_ch3_LSB 0x27 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenagain_qdr_ch3_MSB 0x27 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenagain_qdr_ch3_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenagain_qdr_ch2_LSB 0x26 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenagain_qdr_ch2_MSB 0x26 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenagain_qdr_ch2_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenagain_qdr_ch1_LSB 0x25 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenagain_qdr_ch1_MSB 0x25 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenagain_qdr_ch1_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenagain_qdr_ch0_LSB 0x24 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenagain_qdr_ch0_MSB 0x24 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenagain_qdr_ch0_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenale_qdr_ch3_LSB 0x23 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenale_qdr_ch3_MSB 0x23 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenale_qdr_ch3_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenale_qdr_ch2_LSB 0x22 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenale_qdr_ch2_MSB 0x22 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenale_qdr_ch2_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenale_qdr_ch1_LSB 0x21 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenale_qdr_ch1_MSB 0x21 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenale_qdr_ch1_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenale_qdr_ch0_LSB 0x20 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenale_qdr_ch0_MSB 0x20 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenale_qdr_ch0_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenadfe_qdr_ch3_LSB 0x18 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenadfe_qdr_ch3_MSB 0x1F +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenadfe_qdr_ch3_RMASK 0xFF +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenadfe_qdr_ch2_LSB 0x10 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenadfe_qdr_ch2_MSB 0x17 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenadfe_qdr_ch2_RMASK 0xFF +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenadfe_qdr_ch1_LSB 0x8 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenadfe_qdr_ch1_MSB 0xF +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenadfe_qdr_ch1_RMASK 0xFF +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenadfe_qdr_ch0_LSB 0x0 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenadfe_qdr_ch0_MSB 0x7 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenadfe_qdr_ch0_RMASK 0xFF + +#define QIB_7322_ADAPT_DISABLE_TIMER_THRESHOLD_0_OFFS 0x1670 +#define QIB_7322_ADAPT_DISABLE_TIMER_THRESHOLD_0_DEF 0x0000000000000000 + +#define QIB_7322_HighPriorityLimit_0_OFFS 0x1BC0 +#define QIB_7322_HighPriorityLimit_0_DEF 0x0000000000000000 +#define QIB_7322_HighPriorityLimit_0_Limit_LSB 0x0 +#define QIB_7322_HighPriorityLimit_0_Limit_MSB 0x7 +#define QIB_7322_HighPriorityLimit_0_Limit_RMASK 0xFF + +#define QIB_7322_LowPriority0_0_OFFS 0x1C00 +#define QIB_7322_LowPriority0_0_DEF 0x0000000000000000 +#define QIB_7322_LowPriority0_0_VirtualLane_LSB 0x10 +#define QIB_7322_LowPriority0_0_VirtualLane_MSB 0x12 +#define QIB_7322_LowPriority0_0_VirtualLane_RMASK 0x7 +#define QIB_7322_LowPriority0_0_Weight_LSB 0x0 +#define QIB_7322_LowPriority0_0_Weight_MSB 0x7 +#define QIB_7322_LowPriority0_0_Weight_RMASK 0xFF + +#define QIB_7322_HighPriority0_0_OFFS 0x1E00 +#define QIB_7322_HighPriority0_0_DEF 0x0000000000000000 +#define QIB_7322_HighPriority0_0_VirtualLane_LSB 0x10 +#define QIB_7322_HighPriority0_0_VirtualLane_MSB 0x12 +#define QIB_7322_HighPriority0_0_VirtualLane_RMASK 0x7 +#define QIB_7322_HighPriority0_0_Weight_LSB 0x0 +#define QIB_7322_HighPriority0_0_Weight_MSB 0x7 +#define QIB_7322_HighPriority0_0_Weight_RMASK 0xFF + +#define QIB_7322_CntrRegBase_1_OFFS 0x2028 +#define QIB_7322_CntrRegBase_1_DEF 0x0000000000013000 + +#define QIB_7322_RcvQPMulticastContext_1_OFFS 0x2170 + +#define QIB_7322_SendCtrl_1_OFFS 0x21C0 + +#define QIB_7322_SendBufAvail0_OFFS 0x3000 +#define QIB_7322_SendBufAvail0_DEF 0x0000000000000000 +#define QIB_7322_SendBufAvail0_SendBuf_31_0_LSB 0x0 +#define QIB_7322_SendBufAvail0_SendBuf_31_0_MSB 0x3F +#define QIB_7322_SendBufAvail0_SendBuf_31_0_RMASK 0x0 + +#define QIB_7322_MsixTable_OFFS 0x8000 +#define QIB_7322_MsixTable_DEF 0x0000000000000000 + +#define QIB_7322_MsixPba_OFFS 0x9000 +#define QIB_7322_MsixPba_DEF 0x0000000000000000 + +#define QIB_7322_LAMemory_OFFS 0xA000 +#define QIB_7322_LAMemory_DEF 0x0000000000000000 + +#define QIB_7322_LBIntCnt_OFFS 0x11000 +#define QIB_7322_LBIntCnt_DEF 0x0000000000000000 + +#define QIB_7322_LBFlowStallCnt_OFFS 0x11008 +#define QIB_7322_LBFlowStallCnt_DEF 0x0000000000000000 + +#define QIB_7322_RxTIDFullErrCnt_OFFS 0x110D0 +#define QIB_7322_RxTIDFullErrCnt_DEF 0x0000000000000000 + +#define QIB_7322_RxTIDValidErrCnt_OFFS 0x110D8 +#define QIB_7322_RxTIDValidErrCnt_DEF 0x0000000000000000 + +#define QIB_7322_RxP0HdrEgrOvflCnt_OFFS 0x110E8 +#define QIB_7322_RxP0HdrEgrOvflCnt_DEF 0x0000000000000000 + +#define QIB_7322_PcieRetryBufDiagQwordCnt_OFFS 0x111A0 +#define QIB_7322_PcieRetryBufDiagQwordCnt_DEF 0x0000000000000000 + +#define QIB_7322_RxTidFlowDropCnt_OFFS 0x111E0 +#define QIB_7322_RxTidFlowDropCnt_DEF 0x0000000000000000 + +#define QIB_7322_LBIntCnt_0_OFFS 0x12000 +#define QIB_7322_LBIntCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_TxCreditUpToDateTimeOut_0_OFFS 0x12008 +#define QIB_7322_TxCreditUpToDateTimeOut_0_DEF 0x0000000000000000 + +#define QIB_7322_TxSDmaDescCnt_0_OFFS 0x12010 +#define QIB_7322_TxSDmaDescCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_TxUnsupVLErrCnt_0_OFFS 0x12018 +#define QIB_7322_TxUnsupVLErrCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_TxDataPktCnt_0_OFFS 0x12020 +#define QIB_7322_TxDataPktCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_TxFlowPktCnt_0_OFFS 0x12028 +#define QIB_7322_TxFlowPktCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_TxDwordCnt_0_OFFS 0x12030 +#define QIB_7322_TxDwordCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_TxLenErrCnt_0_OFFS 0x12038 +#define QIB_7322_TxLenErrCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_TxMaxMinLenErrCnt_0_OFFS 0x12040 +#define QIB_7322_TxMaxMinLenErrCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_TxUnderrunCnt_0_OFFS 0x12048 +#define QIB_7322_TxUnderrunCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_TxFlowStallCnt_0_OFFS 0x12050 +#define QIB_7322_TxFlowStallCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_TxDroppedPktCnt_0_OFFS 0x12058 +#define QIB_7322_TxDroppedPktCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_RxDroppedPktCnt_0_OFFS 0x12060 +#define QIB_7322_RxDroppedPktCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_RxDataPktCnt_0_OFFS 0x12068 +#define QIB_7322_RxDataPktCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_RxFlowPktCnt_0_OFFS 0x12070 +#define QIB_7322_RxFlowPktCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_RxDwordCnt_0_OFFS 0x12078 +#define QIB_7322_RxDwordCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_RxLenErrCnt_0_OFFS 0x12080 +#define QIB_7322_RxLenErrCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_RxMaxMinLenErrCnt_0_OFFS 0x12088 +#define QIB_7322_RxMaxMinLenErrCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_RxICRCErrCnt_0_OFFS 0x12090 +#define QIB_7322_RxICRCErrCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_RxVCRCErrCnt_0_OFFS 0x12098 +#define QIB_7322_RxVCRCErrCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_RxFlowCtrlViolCnt_0_OFFS 0x120A0 +#define QIB_7322_RxFlowCtrlViolCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_RxVersionErrCnt_0_OFFS 0x120A8 +#define QIB_7322_RxVersionErrCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_RxLinkMalformCnt_0_OFFS 0x120B0 +#define QIB_7322_RxLinkMalformCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_RxEBPCnt_0_OFFS 0x120B8 +#define QIB_7322_RxEBPCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_RxLPCRCErrCnt_0_OFFS 0x120C0 +#define QIB_7322_RxLPCRCErrCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_RxBufOvflCnt_0_OFFS 0x120C8 +#define QIB_7322_RxBufOvflCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_RxLenTruncateCnt_0_OFFS 0x120D0 +#define QIB_7322_RxLenTruncateCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_RxPKeyMismatchCnt_0_OFFS 0x120E0 +#define QIB_7322_RxPKeyMismatchCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_IBLinkDownedCnt_0_OFFS 0x12180 +#define QIB_7322_IBLinkDownedCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_IBSymbolErrCnt_0_OFFS 0x12188 +#define QIB_7322_IBSymbolErrCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_IBStatusChangeCnt_0_OFFS 0x12190 +#define QIB_7322_IBStatusChangeCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_IBLinkErrRecoveryCnt_0_OFFS 0x12198 +#define QIB_7322_IBLinkErrRecoveryCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_ExcessBufferOvflCnt_0_OFFS 0x121A8 +#define QIB_7322_ExcessBufferOvflCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_LocalLinkIntegrityErrCnt_0_OFFS 0x121B0 +#define QIB_7322_LocalLinkIntegrityErrCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_RxVlErrCnt_0_OFFS 0x121B8 +#define QIB_7322_RxVlErrCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_RxDlidFltrCnt_0_OFFS 0x121C0 +#define QIB_7322_RxDlidFltrCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_RxVL15DroppedPktCnt_0_OFFS 0x121C8 +#define QIB_7322_RxVL15DroppedPktCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_RxOtherLocalPhyErrCnt_0_OFFS 0x121D0 +#define QIB_7322_RxOtherLocalPhyErrCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_RxQPInvalidContextCnt_0_OFFS 0x121D8 +#define QIB_7322_RxQPInvalidContextCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_TxHeadersErrCnt_0_OFFS 0x121F8 +#define QIB_7322_TxHeadersErrCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_PSRcvDataCount_0_OFFS 0x12218 +#define QIB_7322_PSRcvDataCount_0_DEF 0x0000000000000000 + +#define QIB_7322_PSRcvPktsCount_0_OFFS 0x12220 +#define QIB_7322_PSRcvPktsCount_0_DEF 0x0000000000000000 + +#define QIB_7322_PSXmitDataCount_0_OFFS 0x12228 +#define QIB_7322_PSXmitDataCount_0_DEF 0x0000000000000000 + +#define QIB_7322_PSXmitPktsCount_0_OFFS 0x12230 +#define QIB_7322_PSXmitPktsCount_0_DEF 0x0000000000000000 + +#define QIB_7322_PSXmitWaitCount_0_OFFS 0x12238 +#define QIB_7322_PSXmitWaitCount_0_DEF 0x0000000000000000 + +#define QIB_7322_LBIntCnt_1_OFFS 0x13000 +#define QIB_7322_LBIntCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_TxCreditUpToDateTimeOut_1_OFFS 0x13008 +#define QIB_7322_TxCreditUpToDateTimeOut_1_DEF 0x0000000000000000 + +#define QIB_7322_TxSDmaDescCnt_1_OFFS 0x13010 +#define QIB_7322_TxSDmaDescCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_TxUnsupVLErrCnt_1_OFFS 0x13018 +#define QIB_7322_TxUnsupVLErrCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_TxDataPktCnt_1_OFFS 0x13020 +#define QIB_7322_TxDataPktCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_TxFlowPktCnt_1_OFFS 0x13028 +#define QIB_7322_TxFlowPktCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_TxDwordCnt_1_OFFS 0x13030 +#define QIB_7322_TxDwordCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_TxLenErrCnt_1_OFFS 0x13038 +#define QIB_7322_TxLenErrCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_TxMaxMinLenErrCnt_1_OFFS 0x13040 +#define QIB_7322_TxMaxMinLenErrCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_TxUnderrunCnt_1_OFFS 0x13048 +#define QIB_7322_TxUnderrunCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_TxFlowStallCnt_1_OFFS 0x13050 +#define QIB_7322_TxFlowStallCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_TxDroppedPktCnt_1_OFFS 0x13058 +#define QIB_7322_TxDroppedPktCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_RxDroppedPktCnt_1_OFFS 0x13060 +#define QIB_7322_RxDroppedPktCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_RxDataPktCnt_1_OFFS 0x13068 +#define QIB_7322_RxDataPktCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_RxFlowPktCnt_1_OFFS 0x13070 +#define QIB_7322_RxFlowPktCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_RxDwordCnt_1_OFFS 0x13078 +#define QIB_7322_RxDwordCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_RxLenErrCnt_1_OFFS 0x13080 +#define QIB_7322_RxLenErrCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_RxMaxMinLenErrCnt_1_OFFS 0x13088 +#define QIB_7322_RxMaxMinLenErrCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_RxICRCErrCnt_1_OFFS 0x13090 +#define QIB_7322_RxICRCErrCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_RxVCRCErrCnt_1_OFFS 0x13098 +#define QIB_7322_RxVCRCErrCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_RxFlowCtrlViolCnt_1_OFFS 0x130A0 +#define QIB_7322_RxFlowCtrlViolCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_RxVersionErrCnt_1_OFFS 0x130A8 +#define QIB_7322_RxVersionErrCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_RxLinkMalformCnt_1_OFFS 0x130B0 +#define QIB_7322_RxLinkMalformCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_RxEBPCnt_1_OFFS 0x130B8 +#define QIB_7322_RxEBPCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_RxLPCRCErrCnt_1_OFFS 0x130C0 +#define QIB_7322_RxLPCRCErrCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_RxBufOvflCnt_1_OFFS 0x130C8 +#define QIB_7322_RxBufOvflCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_RxLenTruncateCnt_1_OFFS 0x130D0 +#define QIB_7322_RxLenTruncateCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_RxPKeyMismatchCnt_1_OFFS 0x130E0 +#define QIB_7322_RxPKeyMismatchCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_IBLinkDownedCnt_1_OFFS 0x13180 +#define QIB_7322_IBLinkDownedCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_IBSymbolErrCnt_1_OFFS 0x13188 +#define QIB_7322_IBSymbolErrCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_IBStatusChangeCnt_1_OFFS 0x13190 +#define QIB_7322_IBStatusChangeCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_IBLinkErrRecoveryCnt_1_OFFS 0x13198 +#define QIB_7322_IBLinkErrRecoveryCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_ExcessBufferOvflCnt_1_OFFS 0x131A8 +#define QIB_7322_ExcessBufferOvflCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_LocalLinkIntegrityErrCnt_1_OFFS 0x131B0 +#define QIB_7322_LocalLinkIntegrityErrCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_RxVlErrCnt_1_OFFS 0x131B8 +#define QIB_7322_RxVlErrCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_RxDlidFltrCnt_1_OFFS 0x131C0 +#define QIB_7322_RxDlidFltrCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_RxVL15DroppedPktCnt_1_OFFS 0x131C8 +#define QIB_7322_RxVL15DroppedPktCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_RxOtherLocalPhyErrCnt_1_OFFS 0x131D0 +#define QIB_7322_RxOtherLocalPhyErrCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_RxQPInvalidContextCnt_1_OFFS 0x131D8 +#define QIB_7322_RxQPInvalidContextCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_TxHeadersErrCnt_1_OFFS 0x131F8 +#define QIB_7322_TxHeadersErrCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_PSRcvDataCount_1_OFFS 0x13218 +#define QIB_7322_PSRcvDataCount_1_DEF 0x0000000000000000 + +#define QIB_7322_PSRcvPktsCount_1_OFFS 0x13220 +#define QIB_7322_PSRcvPktsCount_1_DEF 0x0000000000000000 + +#define QIB_7322_PSXmitDataCount_1_OFFS 0x13228 +#define QIB_7322_PSXmitDataCount_1_DEF 0x0000000000000000 + +#define QIB_7322_PSXmitPktsCount_1_OFFS 0x13230 +#define QIB_7322_PSXmitPktsCount_1_DEF 0x0000000000000000 + +#define QIB_7322_PSXmitWaitCount_1_OFFS 0x13238 +#define QIB_7322_PSXmitWaitCount_1_DEF 0x0000000000000000 + +#define QIB_7322_RcvEgrArray_OFFS 0x14000 +#define QIB_7322_RcvEgrArray_DEF 0x0000000000000000 +#define QIB_7322_RcvEgrArray_RT_BufSize_LSB 0x25 +#define QIB_7322_RcvEgrArray_RT_BufSize_MSB 0x27 +#define QIB_7322_RcvEgrArray_RT_BufSize_RMASK 0x7 +#define QIB_7322_RcvEgrArray_RT_Addr_LSB 0x0 +#define QIB_7322_RcvEgrArray_RT_Addr_MSB 0x24 +#define QIB_7322_RcvEgrArray_RT_Addr_RMASK 0x1FFFFFFFFF + +#define QIB_7322_RcvTIDArray0_OFFS 0x50000 +#define QIB_7322_RcvTIDArray0_DEF 0x0000000000000000 +#define QIB_7322_RcvTIDArray0_RT_BufSize_LSB 0x25 +#define QIB_7322_RcvTIDArray0_RT_BufSize_MSB 0x27 +#define QIB_7322_RcvTIDArray0_RT_BufSize_RMASK 0x7 +#define QIB_7322_RcvTIDArray0_RT_Addr_LSB 0x0 +#define QIB_7322_RcvTIDArray0_RT_Addr_MSB 0x24 +#define QIB_7322_RcvTIDArray0_RT_Addr_RMASK 0x1FFFFFFFFF + +#define QIB_7322_IBSD_DDS_MAP_TABLE_0_OFFS 0xD0000 +#define QIB_7322_IBSD_DDS_MAP_TABLE_0_DEF 0x0000000000000000 + +#define QIB_7322_RcvHdrTail0_OFFS 0x200000 +#define QIB_7322_RcvHdrTail0_DEF 0x0000000000000000 + +#define QIB_7322_RcvHdrHead0_OFFS 0x200008 +#define QIB_7322_RcvHdrHead0_DEF 0x0000000000000000 +#define QIB_7322_RcvHdrHead0_counter_LSB 0x20 +#define QIB_7322_RcvHdrHead0_counter_MSB 0x2F +#define QIB_7322_RcvHdrHead0_counter_RMASK 0xFFFF +#define QIB_7322_RcvHdrHead0_RcvHeadPointer_LSB 0x0 +#define QIB_7322_RcvHdrHead0_RcvHeadPointer_MSB 0x1F +#define QIB_7322_RcvHdrHead0_RcvHeadPointer_RMASK 0xFFFFFFFF + +#define QIB_7322_RcvEgrIndexTail0_OFFS 0x200010 +#define QIB_7322_RcvEgrIndexTail0_DEF 0x0000000000000000 + +#define QIB_7322_RcvEgrIndexHead0_OFFS 0x200018 +#define QIB_7322_RcvEgrIndexHead0_DEF 0x0000000000000000 + +#define QIB_7322_RcvTIDFlowTable0_OFFS 0x201000 +#define QIB_7322_RcvTIDFlowTable0_DEF 0x0000000000000000 +#define QIB_7322_RcvTIDFlowTable0_GenMismatch_LSB 0x1C +#define QIB_7322_RcvTIDFlowTable0_GenMismatch_MSB 0x1C +#define QIB_7322_RcvTIDFlowTable0_GenMismatch_RMASK 0x1 +#define QIB_7322_RcvTIDFlowTable0_SeqMismatch_LSB 0x1B +#define QIB_7322_RcvTIDFlowTable0_SeqMismatch_MSB 0x1B +#define QIB_7322_RcvTIDFlowTable0_SeqMismatch_RMASK 0x1 +#define QIB_7322_RcvTIDFlowTable0_KeepOnGenErr_LSB 0x16 +#define QIB_7322_RcvTIDFlowTable0_KeepOnGenErr_MSB 0x16 +#define QIB_7322_RcvTIDFlowTable0_KeepOnGenErr_RMASK 0x1 +#define QIB_7322_RcvTIDFlowTable0_KeepAfterSeqErr_LSB 0x15 +#define QIB_7322_RcvTIDFlowTable0_KeepAfterSeqErr_MSB 0x15 +#define QIB_7322_RcvTIDFlowTable0_KeepAfterSeqErr_RMASK 0x1 +#define QIB_7322_RcvTIDFlowTable0_HdrSuppEnabled_LSB 0x14 +#define QIB_7322_RcvTIDFlowTable0_HdrSuppEnabled_MSB 0x14 +#define QIB_7322_RcvTIDFlowTable0_HdrSuppEnabled_RMASK 0x1 +#define QIB_7322_RcvTIDFlowTable0_FlowValid_LSB 0x13 +#define QIB_7322_RcvTIDFlowTable0_FlowValid_MSB 0x13 +#define QIB_7322_RcvTIDFlowTable0_FlowValid_RMASK 0x1 +#define QIB_7322_RcvTIDFlowTable0_GenVal_LSB 0xB +#define QIB_7322_RcvTIDFlowTable0_GenVal_MSB 0x12 +#define QIB_7322_RcvTIDFlowTable0_GenVal_RMASK 0xFF +#define QIB_7322_RcvTIDFlowTable0_SeqNum_LSB 0x0 +#define QIB_7322_RcvTIDFlowTable0_SeqNum_MSB 0xA +#define QIB_7322_RcvTIDFlowTable0_SeqNum_RMASK 0x7FF diff --git a/drivers/infiniband/hw/qib/qib_common.h b/drivers/infiniband/hw/qib/qib_common.h new file mode 100644 index 0000000..5670ace --- /dev/null +++ b/drivers/infiniband/hw/qib/qib_common.h @@ -0,0 +1,812 @@ +/* + * Copyright (c) 2006, 2007, 2008, 2009, 2010 QLogic Corporation. + * All rights reserved. + * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _QIB_COMMON_H +#define _QIB_COMMON_H + +/* + * This file contains defines, structures, etc. that are used + * to communicate between kernel and user code. + */ + +/* This is the IEEE-assigned OUI for QLogic Inc. QLogic_IB */ +#define QIB_SRC_OUI_1 0x00 +#define QIB_SRC_OUI_2 0x11 +#define QIB_SRC_OUI_3 0x75 + +/* version of protocol header (known to chip also). In the long run, + * we should be able to generate and accept a range of version numbers; + * for now we only accept one, and it's compiled in. + */ +#define IPS_PROTO_VERSION 2 + +/* + * These are compile time constants that you may want to enable or disable + * if you are trying to debug problems with code or performance. + * QIB_VERBOSE_TRACING define as 1 if you want additional tracing in + * fastpath code + * QIB_TRACE_REGWRITES define as 1 if you want register writes to be + * traced in faspath code + * _QIB_TRACING define as 0 if you want to remove all tracing in a + * compilation unit + */ + +/* + * The value in the BTH QP field that QLogic_IB uses to differentiate + * an qlogic_ib protocol IB packet vs standard IB transport + * This it needs to be even (0x656b78), because the LSB is sometimes + * used for the MSB of context. The change may cause a problem + * interoperating with older software. + */ +#define QIB_KD_QP 0x656b78 + +/* + * These are the status bits readable (in ascii form, 64bit value) + * from the "status" sysfs file. For binary compatibility, values + * must remain as is; removed states can be reused for different + * purposes. + */ +#define QIB_STATUS_INITTED 0x1 /* basic initialization done */ +/* Chip has been found and initted */ +#define QIB_STATUS_CHIP_PRESENT 0x20 +/* IB link is at ACTIVE, usable for data traffic */ +#define QIB_STATUS_IB_READY 0x40 +/* link is configured, LID, MTU, etc. have been set */ +#define QIB_STATUS_IB_CONF 0x80 +/* A Fatal hardware error has occurred. */ +#define QIB_STATUS_HWERROR 0x200 + +/* + * The list of usermode accessible registers. Also see Reg_* later in file. + */ +enum qib_ureg { + /* (RO) DMA RcvHdr to be used next. */ + ur_rcvhdrtail = 0, + /* (RW) RcvHdr entry to be processed next by host. */ + ur_rcvhdrhead = 1, + /* (RO) Index of next Eager index to use. */ + ur_rcvegrindextail = 2, + /* (RW) Eager TID to be processed next */ + ur_rcvegrindexhead = 3, + /* For internal use only; max register number. */ + _QIB_UregMax +}; + +/* bit values for spi_runtime_flags */ +#define QIB_RUNTIME_PCIE 0x0002 +#define QIB_RUNTIME_FORCE_WC_ORDER 0x0004 +#define QIB_RUNTIME_RCVHDR_COPY 0x0008 +#define QIB_RUNTIME_MASTER 0x0010 +#define QIB_RUNTIME_RCHK 0x0020 +#define QIB_RUNTIME_NODMA_RTAIL 0x0080 +#define QIB_RUNTIME_SPECIAL_TRIGGER 0x0100 +#define QIB_RUNTIME_SDMA 0x0200 +#define QIB_RUNTIME_FORCE_PIOAVAIL 0x0400 +#define QIB_RUNTIME_PIO_REGSWAPPED 0x0800 +#define QIB_RUNTIME_CTXT_MSB_IN_QP 0x1000 +#define QIB_RUNTIME_CTXT_REDIRECT 0x2000 +#define QIB_RUNTIME_HDRSUPP 0x4000 + +/* + * This structure is returned by qib_userinit() immediately after + * open to get implementation-specific info, and info specific to this + * instance. + * + * This struct must have explict pad fields where type sizes + * may result in different alignments between 32 and 64 bit + * programs, since the 64 bit * bit kernel requires the user code + * to have matching offsets + */ +struct qib_base_info { + /* version of hardware, for feature checking. */ + __u32 spi_hw_version; + /* version of software, for feature checking. */ + __u32 spi_sw_version; + /* QLogic_IB context assigned, goes into sent packets */ + __u16 spi_ctxt; + __u16 spi_subctxt; + /* + * IB MTU, packets IB data must be less than this. + * The MTU is in bytes, and will be a multiple of 4 bytes. + */ + __u32 spi_mtu; + /* + * Size of a PIO buffer. Any given packet's total size must be less + * than this (in words). Included is the starting control word, so + * if 513 is returned, then total pkt size is 512 words or less. + */ + __u32 spi_piosize; + /* size of the TID cache in qlogic_ib, in entries */ + __u32 spi_tidcnt; + /* size of the TID Eager list in qlogic_ib, in entries */ + __u32 spi_tidegrcnt; + /* size of a single receive header queue entry in words. */ + __u32 spi_rcvhdrent_size; + /* + * Count of receive header queue entries allocated. + * This may be less than the spu_rcvhdrcnt passed in!. + */ + __u32 spi_rcvhdr_cnt; + + /* per-chip and other runtime features bitmap (QIB_RUNTIME_*) */ + __u32 spi_runtime_flags; + + /* address where hardware receive header queue is mapped */ + __u64 spi_rcvhdr_base; + + /* user program. */ + + /* base address of eager TID receive buffers used by hardware. */ + __u64 spi_rcv_egrbufs; + + /* Allocated by initialization code, not by protocol. */ + + /* + * Size of each TID buffer in host memory, starting at + * spi_rcv_egrbufs. The buffers are virtually contiguous. + */ + __u32 spi_rcv_egrbufsize; + /* + * The special QP (queue pair) value that identifies an qlogic_ib + * protocol packet from standard IB packets. More, probably much + * more, to be added. + */ + __u32 spi_qpair; + + /* + * User register base for init code, not to be used directly by + * protocol or applications. Always points to chip registers, + * for normal or shared context. + */ + __u64 spi_uregbase; + /* + * Maximum buffer size in bytes that can be used in a single TID + * entry (assuming the buffer is aligned to this boundary). This is + * the minimum of what the hardware and software support Guaranteed + * to be a power of 2. + */ + __u32 spi_tid_maxsize; + /* + * alignment of each pio send buffer (byte count + * to add to spi_piobufbase to get to second buffer) + */ + __u32 spi_pioalign; + /* + * The index of the first pio buffer available to this process; + * needed to do lookup in spi_pioavailaddr; not added to + * spi_piobufbase. + */ + __u32 spi_pioindex; + /* number of buffers mapped for this process */ + __u32 spi_piocnt; + + /* + * Base address of writeonly pio buffers for this process. + * Each buffer has spi_piosize words, and is aligned on spi_pioalign + * boundaries. spi_piocnt buffers are mapped from this address + */ + __u64 spi_piobufbase; + + /* + * Base address of readonly memory copy of the pioavail registers. + * There are 2 bits for each buffer. + */ + __u64 spi_pioavailaddr; + + /* + * Address where driver updates a copy of the interface and driver + * status (QIB_STATUS_*) as a 64 bit value. It's followed by a + * link status qword (formerly combined with driver status), then a + * string indicating hardware error, if there was one. + */ + __u64 spi_status; + + /* number of chip ctxts available to user processes */ + __u32 spi_nctxts; + __u16 spi_unit; /* unit number of chip we are using */ + __u16 spi_port; /* IB port number we are using */ + /* num bufs in each contiguous set */ + __u32 spi_rcv_egrperchunk; + /* size in bytes of each contiguous set */ + __u32 spi_rcv_egrchunksize; + /* total size of mmap to cover full rcvegrbuffers */ + __u32 spi_rcv_egrbuftotlen; + __u32 spi_rhf_offset; /* dword offset in hdrqent for rcvhdr flags */ + /* address of readonly memory copy of the rcvhdrq tail register. */ + __u64 spi_rcvhdr_tailaddr; + + /* + * shared memory pages for subctxts if ctxt is shared; these cover + * all the processes in the group sharing a single context. + * all have enough space for the num_subcontexts value on this job. + */ + __u64 spi_subctxt_uregbase; + __u64 spi_subctxt_rcvegrbuf; + __u64 spi_subctxt_rcvhdr_base; + + /* shared memory page for send buffer disarm status */ + __u64 spi_sendbuf_status; +} __attribute__ ((aligned(8))); + +/* + * This version number is given to the driver by the user code during + * initialization in the spu_userversion field of qib_user_info, so + * the driver can check for compatibility with user code. + * + * The major version changes when data structures + * change in an incompatible way. The driver must be the same or higher + * for initialization to succeed. In some cases, a higher version + * driver will not interoperate with older software, and initialization + * will return an error. + */ +#define QIB_USER_SWMAJOR 1 + +/* + * Minor version differences are always compatible + * a within a major version, however if user software is larger + * than driver software, some new features and/or structure fields + * may not be implemented; the user code must deal with this if it + * cares, or it must abort after initialization reports the difference. + */ +#define QIB_USER_SWMINOR 13 + +#define QIB_USER_SWVERSION ((QIB_USER_SWMAJOR << 16) | QIB_USER_SWMINOR) + +#ifndef QIB_KERN_TYPE +#define QIB_KERN_TYPE 0 +#endif + +/* + * Similarly, this is the kernel version going back to the user. It's + * slightly different, in that we want to tell if the driver was built as + * part of a QLogic release, or from the driver from openfabrics.org, + * kernel.org, or a standard distribution, for support reasons. + * The high bit is 0 for non-QLogic and 1 for QLogic-built/supplied. + * + * It's returned by the driver to the user code during initialization in the + * spi_sw_version field of qib_base_info, so the user code can in turn + * check for compatibility with the kernel. +*/ +#define QIB_KERN_SWVERSION ((QIB_KERN_TYPE << 31) | QIB_USER_SWVERSION) + +/* + * Define the driver version number. This is something that refers only + * to the driver itself, not the software interfaces it supports. + */ +#define QIB_DRIVER_VERSION_BASE "1.11" + +/* create the final driver version string */ +#ifdef QIB_IDSTR +#define QIB_DRIVER_VERSION QIB_DRIVER_VERSION_BASE " " QIB_IDSTR +#else +#define QIB_DRIVER_VERSION QIB_DRIVER_VERSION_BASE +#endif + +/* + * If the unit is specified via open, HCA choice is fixed. If port is + * specified, it's also fixed. Otherwise we try to spread contexts + * across ports and HCAs, using different algorithims. WITHIN is + * the old default, prior to this mechanism. + */ +#define QIB_PORT_ALG_ACROSS 0 /* round robin contexts across HCAs, then + * ports; this is the default */ +#define QIB_PORT_ALG_WITHIN 1 /* use all contexts on an HCA (round robin + * active ports within), then next HCA */ +#define QIB_PORT_ALG_COUNT 2 /* number of algorithm choices */ + +/* + * This structure is passed to qib_userinit() to tell the driver where + * user code buffers are, sizes, etc. The offsets and sizes of the + * fields must remain unchanged, for binary compatibility. It can + * be extended, if userversion is changed so user code can tell, if needed + */ +struct qib_user_info { + /* + * version of user software, to detect compatibility issues. + * Should be set to QIB_USER_SWVERSION. + */ + __u32 spu_userversion; + + __u32 _spu_unused2; + + /* size of struct base_info to write to */ + __u32 spu_base_info_size; + + __u32 spu_port_alg; /* which QIB_PORT_ALG_*; unused user minor < 11 */ + + /* + * If two or more processes wish to share a context, each process + * must set the spu_subctxt_cnt and spu_subctxt_id to the same + * values. The only restriction on the spu_subctxt_id is that + * it be unique for a given node. + */ + __u16 spu_subctxt_cnt; + __u16 spu_subctxt_id; + + __u32 spu_port; /* IB port requested by user if > 0 */ + + /* + * address of struct base_info to write to + */ + __u64 spu_base_info; + +} __attribute__ ((aligned(8))); + +/* User commands. */ + +/* 16 available, was: old set up userspace (for old user code) */ +#define QIB_CMD_CTXT_INFO 17 /* find out what resources we got */ +#define QIB_CMD_RECV_CTRL 18 /* control receipt of packets */ +#define QIB_CMD_TID_UPDATE 19 /* update expected TID entries */ +#define QIB_CMD_TID_FREE 20 /* free expected TID entries */ +#define QIB_CMD_SET_PART_KEY 21 /* add partition key */ +/* 22 available, was: return info on slave processes (for old user code) */ +#define QIB_CMD_ASSIGN_CTXT 23 /* allocate HCA and ctxt */ +#define QIB_CMD_USER_INIT 24 /* set up userspace */ +#define QIB_CMD_UNUSED_1 25 +#define QIB_CMD_UNUSED_2 26 +#define QIB_CMD_PIOAVAILUPD 27 /* force an update of PIOAvail reg */ +#define QIB_CMD_POLL_TYPE 28 /* set the kind of polling we want */ +#define QIB_CMD_ARMLAUNCH_CTRL 29 /* armlaunch detection control */ +/* 30 is unused */ +#define QIB_CMD_SDMA_INFLIGHT 31 /* sdma inflight counter request */ +#define QIB_CMD_SDMA_COMPLETE 32 /* sdma completion counter request */ +/* 33 available, was a testing feature */ +#define QIB_CMD_DISARM_BUFS 34 /* disarm send buffers w/ errors */ +#define QIB_CMD_ACK_EVENT 35 /* ack & clear bits */ +#define QIB_CMD_CPUS_LIST 36 /* list of cpus allocated, for pinned + * processes: qib_cpus_list */ + +/* + * QIB_CMD_ACK_EVENT obsoletes QIB_CMD_DISARM_BUFS, but we keep it for + * compatibility with libraries from previous release. The ACK_EVENT + * will take appropriate driver action (if any, just DISARM for now), + * then clear the bits passed in as part of the mask. These bits are + * in the first 64bit word at spi_sendbuf_status, and are passed to + * the driver in the event_mask union as well. + */ +#define _QIB_EVENT_DISARM_BUFS_BIT 0 +#define _QIB_EVENT_LINKDOWN_BIT 1 +#define _QIB_EVENT_LID_CHANGE_BIT 2 +#define _QIB_EVENT_LMC_CHANGE_BIT 3 +#define _QIB_EVENT_SL2VL_CHANGE_BIT 4 +#define _QIB_MAX_EVENT_BIT _QIB_EVENT_SL2VL_CHANGE_BIT + +#define QIB_EVENT_DISARM_BUFS_BIT (1UL << _QIB_EVENT_DISARM_BUFS_BIT) +#define QIB_EVENT_LINKDOWN_BIT (1UL << _QIB_EVENT_LINKDOWN_BIT) +#define QIB_EVENT_LID_CHANGE_BIT (1UL << _QIB_EVENT_LID_CHANGE_BIT) +#define QIB_EVENT_LMC_CHANGE_BIT (1UL << _QIB_EVENT_LMC_CHANGE_BIT) +#define QIB_EVENT_SL2VL_CHANGE_BIT (1UL << _QIB_EVENT_SL2VL_CHANGE_BIT) + + +/* + * Poll types + */ +#define QIB_POLL_TYPE_ANYRCV 0x0 +#define QIB_POLL_TYPE_URGENT 0x1 + +struct qib_ctxt_info { + __u16 num_active; /* number of active units */ + __u16 unit; /* unit (chip) assigned to caller */ + __u16 port; /* IB port assigned to caller (1-based) */ + __u16 ctxt; /* ctxt on unit assigned to caller */ + __u16 subctxt; /* subctxt on unit assigned to caller */ + __u16 num_ctxts; /* number of ctxts available on unit */ + __u16 num_subctxts; /* number of subctxts opened on ctxt */ + __u16 rec_cpu; /* cpu # for affinity (ffff if none) */ +}; + +struct qib_tid_info { + __u32 tidcnt; + /* make structure same size in 32 and 64 bit */ + __u32 tid__unused; + /* virtual address of first page in transfer */ + __u64 tidvaddr; + /* pointer (same size 32/64 bit) to __u16 tid array */ + __u64 tidlist; + + /* + * pointer (same size 32/64 bit) to bitmap of TIDs used + * for this call; checked for being large enough at open + */ + __u64 tidmap; +}; + +struct qib_cmd { + __u32 type; /* command type */ + union { + struct qib_tid_info tid_info; + struct qib_user_info user_info; + + /* + * address in userspace where we should put the sdma + * inflight counter + */ + __u64 sdma_inflight; + /* + * address in userspace where we should put the sdma + * completion counter + */ + __u64 sdma_complete; + /* address in userspace of struct qib_ctxt_info to + write result to */ + __u64 ctxt_info; + /* enable/disable receipt of packets */ + __u32 recv_ctrl; + /* enable/disable armlaunch errors (non-zero to enable) */ + __u32 armlaunch_ctrl; + /* partition key to set */ + __u16 part_key; + /* user address of __u32 bitmask of active slaves */ + __u64 slave_mask_addr; + /* type of polling we want */ + __u16 poll_type; + /* back pressure enable bit for one particular context */ + __u8 ctxt_bp; + /* qib_user_event_ack(), IPATH_EVENT_* bits */ + __u64 event_mask; + } cmd; +}; + +struct qib_iovec { + /* Pointer to data, but same size 32 and 64 bit */ + __u64 iov_base; + + /* + * Length of data; don't need 64 bits, but want + * qib_sendpkt to remain same size as before 32 bit changes, so... + */ + __u64 iov_len; +}; + +/* + * Describes a single packet for send. Each packet can have one or more + * buffers, but the total length (exclusive of IB headers) must be less + * than the MTU, and if using the PIO method, entire packet length, + * including IB headers, must be less than the qib_piosize value (words). + * Use of this necessitates including sys/uio.h + */ +struct __qib_sendpkt { + __u32 sps_flags; /* flags for packet (TBD) */ + __u32 sps_cnt; /* number of entries to use in sps_iov */ + /* array of iov's describing packet. TEMPORARY */ + struct qib_iovec sps_iov[4]; +}; + +/* + * Diagnostics can send a packet by "writing" the following + * structs to the diag data special file. + * This allows a custom + * pbc (+ static rate) qword, so that special modes and deliberate + * changes to CRCs can be used. The elements were also re-ordered + * for better alignment and to avoid padding issues. + */ +#define _DIAG_XPKT_VERS 3 +struct qib_diag_xpkt { + __u16 version; + __u16 unit; + __u16 port; + __u16 len; + __u64 data; + __u64 pbc_wd; +}; + +/* + * Data layout in I2C flash (for GUID, etc.) + * All fields are little-endian binary unless otherwise stated + */ +#define QIB_FLASH_VERSION 2 +struct qib_flash { + /* flash layout version (QIB_FLASH_VERSION) */ + __u8 if_fversion; + /* checksum protecting if_length bytes */ + __u8 if_csum; + /* + * valid length (in use, protected by if_csum), including + * if_fversion and if_csum themselves) + */ + __u8 if_length; + /* the GUID, in network order */ + __u8 if_guid[8]; + /* number of GUIDs to use, starting from if_guid */ + __u8 if_numguid; + /* the (last 10 characters of) board serial number, in ASCII */ + char if_serial[12]; + /* board mfg date (YYYYMMDD ASCII) */ + char if_mfgdate[8]; + /* last board rework/test date (YYYYMMDD ASCII) */ + char if_testdate[8]; + /* logging of error counts, TBD */ + __u8 if_errcntp[4]; + /* powered on hours, updated at driver unload */ + __u8 if_powerhour[2]; + /* ASCII free-form comment field */ + char if_comment[32]; + /* Backwards compatible prefix for longer QLogic Serial Numbers */ + char if_sprefix[4]; + /* 82 bytes used, min flash size is 128 bytes */ + __u8 if_future[46]; +}; + +/* + * These are the counters implemented in the chip, and are listed in order. + * The InterCaps naming is taken straight from the chip spec. + */ +struct qlogic_ib_counters { + __u64 LBIntCnt; + __u64 LBFlowStallCnt; + __u64 TxSDmaDescCnt; /* was Reserved1 */ + __u64 TxUnsupVLErrCnt; + __u64 TxDataPktCnt; + __u64 TxFlowPktCnt; + __u64 TxDwordCnt; + __u64 TxLenErrCnt; + __u64 TxMaxMinLenErrCnt; + __u64 TxUnderrunCnt; + __u64 TxFlowStallCnt; + __u64 TxDroppedPktCnt; + __u64 RxDroppedPktCnt; + __u64 RxDataPktCnt; + __u64 RxFlowPktCnt; + __u64 RxDwordCnt; + __u64 RxLenErrCnt; + __u64 RxMaxMinLenErrCnt; + __u64 RxICRCErrCnt; + __u64 RxVCRCErrCnt; + __u64 RxFlowCtrlErrCnt; + __u64 RxBadFormatCnt; + __u64 RxLinkProblemCnt; + __u64 RxEBPCnt; + __u64 RxLPCRCErrCnt; + __u64 RxBufOvflCnt; + __u64 RxTIDFullErrCnt; + __u64 RxTIDValidErrCnt; + __u64 RxPKeyMismatchCnt; + __u64 RxP0HdrEgrOvflCnt; + __u64 RxP1HdrEgrOvflCnt; + __u64 RxP2HdrEgrOvflCnt; + __u64 RxP3HdrEgrOvflCnt; + __u64 RxP4HdrEgrOvflCnt; + __u64 RxP5HdrEgrOvflCnt; + __u64 RxP6HdrEgrOvflCnt; + __u64 RxP7HdrEgrOvflCnt; + __u64 RxP8HdrEgrOvflCnt; + __u64 RxP9HdrEgrOvflCnt; + __u64 RxP10HdrEgrOvflCnt; + __u64 RxP11HdrEgrOvflCnt; + __u64 RxP12HdrEgrOvflCnt; + __u64 RxP13HdrEgrOvflCnt; + __u64 RxP14HdrEgrOvflCnt; + __u64 RxP15HdrEgrOvflCnt; + __u64 RxP16HdrEgrOvflCnt; + __u64 IBStatusChangeCnt; + __u64 IBLinkErrRecoveryCnt; + __u64 IBLinkDownedCnt; + __u64 IBSymbolErrCnt; + __u64 RxVL15DroppedPktCnt; + __u64 RxOtherLocalPhyErrCnt; + __u64 PcieRetryBufDiagQwordCnt; + __u64 ExcessBufferOvflCnt; + __u64 LocalLinkIntegrityErrCnt; + __u64 RxVlErrCnt; + __u64 RxDlidFltrCnt; +}; + +/* + * The next set of defines are for packet headers, and chip register + * and memory bits that are visible to and/or used by user-mode software. + */ + +/* RcvHdrFlags bits */ +#define QLOGIC_IB_RHF_LENGTH_MASK 0x7FF +#define QLOGIC_IB_RHF_LENGTH_SHIFT 0 +#define QLOGIC_IB_RHF_RCVTYPE_MASK 0x7 +#define QLOGIC_IB_RHF_RCVTYPE_SHIFT 11 +#define QLOGIC_IB_RHF_EGRINDEX_MASK 0xFFF +#define QLOGIC_IB_RHF_EGRINDEX_SHIFT 16 +#define QLOGIC_IB_RHF_SEQ_MASK 0xF +#define QLOGIC_IB_RHF_SEQ_SHIFT 0 +#define QLOGIC_IB_RHF_HDRQ_OFFSET_MASK 0x7FF +#define QLOGIC_IB_RHF_HDRQ_OFFSET_SHIFT 4 +#define QLOGIC_IB_RHF_H_ICRCERR 0x80000000 +#define QLOGIC_IB_RHF_H_VCRCERR 0x40000000 +#define QLOGIC_IB_RHF_H_PARITYERR 0x20000000 +#define QLOGIC_IB_RHF_H_LENERR 0x10000000 +#define QLOGIC_IB_RHF_H_MTUERR 0x08000000 +#define QLOGIC_IB_RHF_H_IHDRERR 0x04000000 +#define QLOGIC_IB_RHF_H_TIDERR 0x02000000 +#define QLOGIC_IB_RHF_H_MKERR 0x01000000 +#define QLOGIC_IB_RHF_H_IBERR 0x00800000 +#define QLOGIC_IB_RHF_H_ERR_MASK 0xFF800000 +#define QLOGIC_IB_RHF_L_USE_EGR 0x80000000 +#define QLOGIC_IB_RHF_L_SWA 0x00008000 +#define QLOGIC_IB_RHF_L_SWB 0x00004000 + +/* qlogic_ib header fields */ +#define QLOGIC_IB_I_VERS_MASK 0xF +#define QLOGIC_IB_I_VERS_SHIFT 28 +#define QLOGIC_IB_I_CTXT_MASK 0xF +#define QLOGIC_IB_I_CTXT_SHIFT 24 +#define QLOGIC_IB_I_TID_MASK 0x7FF +#define QLOGIC_IB_I_TID_SHIFT 13 +#define QLOGIC_IB_I_OFFSET_MASK 0x1FFF +#define QLOGIC_IB_I_OFFSET_SHIFT 0 + +/* K_PktFlags bits */ +#define QLOGIC_IB_KPF_INTR 0x1 +#define QLOGIC_IB_KPF_SUBCTXT_MASK 0x3 +#define QLOGIC_IB_KPF_SUBCTXT_SHIFT 1 + +#define QLOGIC_IB_MAX_SUBCTXT 4 + +/* SendPIO per-buffer control */ +#define QLOGIC_IB_SP_TEST 0x40 +#define QLOGIC_IB_SP_TESTEBP 0x20 +#define QLOGIC_IB_SP_TRIGGER_SHIFT 15 + +/* SendPIOAvail bits */ +#define QLOGIC_IB_SENDPIOAVAIL_BUSY_SHIFT 1 +#define QLOGIC_IB_SENDPIOAVAIL_CHECK_SHIFT 0 + +/* qlogic_ib header format */ +struct qib_header { + /* + * Version - 4 bits, Context - 4 bits, TID - 10 bits and Offset - + * 14 bits before ECO change ~28 Dec 03. After that, Vers 4, + * Context 4, TID 11, offset 13. + */ + __le32 ver_ctxt_tid_offset; + __le16 chksum; + __le16 pkt_flags; +}; + +/* + * qlogic_ib user message header format. + * This structure contains the first 4 fields common to all protocols + * that employ qlogic_ib. + */ +struct qib_message_header { + __be16 lrh[4]; + __be32 bth[3]; + /* fields below this point are in host byte order */ + struct qib_header iph; + /* fields below are simplified, but should match PSM */ + /* some are accessed by driver when packet spliting is needed */ + __u8 sub_opcode; + __u8 flags; + __u16 commidx; + __u32 ack_seq_num; + __u8 flowid; + __u8 hdr_dlen; + __u16 mqhdr; + __u32 uwords[4]; +}; + +/* sequence number bits for message */ +union qib_seqnum { + struct { + __u32 seq:11; + __u32 gen:8; + __u32 flow:5; + }; + struct { + __u32 pkt:16; + __u32 msg:8; + }; + __u32 val; +}; + +/* qib receiving-dma tid-session-member */ +struct qib_tid_session_member { + __u16 tid; + __u16 offset; + __u16 length; +}; + +/* IB - LRH header consts */ +#define QIB_LRH_GRH 0x0003 /* 1. word of IB LRH - next header: GRH */ +#define QIB_LRH_BTH 0x0002 /* 1. word of IB LRH - next header: BTH */ + +/* misc. */ +#define SIZE_OF_CRC 1 + +#define QIB_DEFAULT_P_KEY 0xFFFF +#define QIB_PERMISSIVE_LID 0xFFFF +#define QIB_AETH_CREDIT_SHIFT 24 +#define QIB_AETH_CREDIT_MASK 0x1F +#define QIB_AETH_CREDIT_INVAL 0x1F +#define QIB_PSN_MASK 0xFFFFFF +#define QIB_MSN_MASK 0xFFFFFF +#define QIB_QPN_MASK 0xFFFFFF +#define QIB_MULTICAST_LID_BASE 0xC000 +#define QIB_EAGER_TID_ID QLOGIC_IB_I_TID_MASK +#define QIB_MULTICAST_QPN 0xFFFFFF + +/* Receive Header Queue: receive type (from qlogic_ib) */ +#define RCVHQ_RCV_TYPE_EXPECTED 0 +#define RCVHQ_RCV_TYPE_EAGER 1 +#define RCVHQ_RCV_TYPE_NON_KD 2 +#define RCVHQ_RCV_TYPE_ERROR 3 + +#define QIB_HEADER_QUEUE_WORDS 9 + +/* functions for extracting fields from rcvhdrq entries for the driver. + */ +static inline __u32 qib_hdrget_err_flags(const __le32 *rbuf) +{ + return __le32_to_cpu(rbuf[1]) & QLOGIC_IB_RHF_H_ERR_MASK; +} + +static inline __u32 qib_hdrget_rcv_type(const __le32 *rbuf) +{ + return (__le32_to_cpu(rbuf[0]) >> QLOGIC_IB_RHF_RCVTYPE_SHIFT) & + QLOGIC_IB_RHF_RCVTYPE_MASK; +} + +static inline __u32 qib_hdrget_length_in_bytes(const __le32 *rbuf) +{ + return ((__le32_to_cpu(rbuf[0]) >> QLOGIC_IB_RHF_LENGTH_SHIFT) & + QLOGIC_IB_RHF_LENGTH_MASK) << 2; +} + +static inline __u32 qib_hdrget_index(const __le32 *rbuf) +{ + return (__le32_to_cpu(rbuf[0]) >> QLOGIC_IB_RHF_EGRINDEX_SHIFT) & + QLOGIC_IB_RHF_EGRINDEX_MASK; +} + +static inline __u32 qib_hdrget_seq(const __le32 *rbuf) +{ + return (__le32_to_cpu(rbuf[1]) >> QLOGIC_IB_RHF_SEQ_SHIFT) & + QLOGIC_IB_RHF_SEQ_MASK; +} + +static inline __u32 qib_hdrget_offset(const __le32 *rbuf) +{ + return (__le32_to_cpu(rbuf[1]) >> QLOGIC_IB_RHF_HDRQ_OFFSET_SHIFT) & + QLOGIC_IB_RHF_HDRQ_OFFSET_MASK; +} + +static inline __u32 qib_hdrget_use_egr_buf(const __le32 *rbuf) +{ + return __le32_to_cpu(rbuf[0]) & QLOGIC_IB_RHF_L_USE_EGR; +} + +static inline __u32 qib_hdrget_qib_ver(__le32 hdrword) +{ + return (__le32_to_cpu(hdrword) >> QLOGIC_IB_I_VERS_SHIFT) & + QLOGIC_IB_I_VERS_MASK; +} + +#endif /* _QIB_COMMON_H */ diff --git a/drivers/infiniband/hw/qib/qib_cq.c b/drivers/infiniband/hw/qib/qib_cq.c new file mode 100644 index 0000000..ab4e11c --- /dev/null +++ b/drivers/infiniband/hw/qib/qib_cq.c @@ -0,0 +1,540 @@ +/* + * Copyright (c) 2013 Intel Corporation. All rights reserved. + * Copyright (c) 2006, 2007, 2008, 2010 QLogic Corporation. All rights reserved. + * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include + +#include "qib_verbs.h" +#include "qib.h" + +/** + * qib_cq_enter - add a new entry to the completion queue + * @cq: completion queue + * @entry: work completion entry to add + * @sig: true if @entry is a solicitated entry + * + * This may be called with qp->s_lock held. + */ +void qib_cq_enter(struct qib_cq *cq, struct ib_wc *entry, int solicited) +{ + struct qib_cq_wc *wc; + unsigned long flags; + u32 head; + u32 next; + + spin_lock_irqsave(&cq->lock, flags); + + /* + * Note that the head pointer might be writable by user processes. + * Take care to verify it is a sane value. + */ + wc = cq->queue; + head = wc->head; + if (head >= (unsigned) cq->ibcq.cqe) { + head = cq->ibcq.cqe; + next = 0; + } else + next = head + 1; + if (unlikely(next == wc->tail)) { + spin_unlock_irqrestore(&cq->lock, flags); + if (cq->ibcq.event_handler) { + struct ib_event ev; + + ev.device = cq->ibcq.device; + ev.element.cq = &cq->ibcq; + ev.event = IB_EVENT_CQ_ERR; + cq->ibcq.event_handler(&ev, cq->ibcq.cq_context); + } + return; + } + if (cq->ip) { + wc->uqueue[head].wr_id = entry->wr_id; + wc->uqueue[head].status = entry->status; + wc->uqueue[head].opcode = entry->opcode; + wc->uqueue[head].vendor_err = entry->vendor_err; + wc->uqueue[head].byte_len = entry->byte_len; + wc->uqueue[head].ex.imm_data = + (__u32 __force)entry->ex.imm_data; + wc->uqueue[head].qp_num = entry->qp->qp_num; + wc->uqueue[head].src_qp = entry->src_qp; + wc->uqueue[head].wc_flags = entry->wc_flags; + wc->uqueue[head].pkey_index = entry->pkey_index; + wc->uqueue[head].slid = entry->slid; + wc->uqueue[head].sl = entry->sl; + wc->uqueue[head].dlid_path_bits = entry->dlid_path_bits; + wc->uqueue[head].port_num = entry->port_num; + /* Make sure entry is written before the head index. */ + smp_wmb(); + } else + wc->kqueue[head] = *entry; + wc->head = next; + + if (cq->notify == IB_CQ_NEXT_COMP || + (cq->notify == IB_CQ_SOLICITED && + (solicited || entry->status != IB_WC_SUCCESS))) { + struct kthread_worker *worker; + /* + * This will cause send_complete() to be called in + * another thread. + */ + smp_rmb(); + worker = cq->dd->worker; + if (likely(worker)) { + cq->notify = IB_CQ_NONE; + cq->triggered++; + queue_kthread_work(worker, &cq->comptask); + } + } + + spin_unlock_irqrestore(&cq->lock, flags); +} + +/** + * qib_poll_cq - poll for work completion entries + * @ibcq: the completion queue to poll + * @num_entries: the maximum number of entries to return + * @entry: pointer to array where work completions are placed + * + * Returns the number of completion entries polled. + * + * This may be called from interrupt context. Also called by ib_poll_cq() + * in the generic verbs code. + */ +int qib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry) +{ + struct qib_cq *cq = to_icq(ibcq); + struct qib_cq_wc *wc; + unsigned long flags; + int npolled; + u32 tail; + + /* The kernel can only poll a kernel completion queue */ + if (cq->ip) { + npolled = -EINVAL; + goto bail; + } + + spin_lock_irqsave(&cq->lock, flags); + + wc = cq->queue; + tail = wc->tail; + if (tail > (u32) cq->ibcq.cqe) + tail = (u32) cq->ibcq.cqe; + for (npolled = 0; npolled < num_entries; ++npolled, ++entry) { + if (tail == wc->head) + break; + /* The kernel doesn't need a RMB since it has the lock. */ + *entry = wc->kqueue[tail]; + if (tail >= cq->ibcq.cqe) + tail = 0; + else + tail++; + } + wc->tail = tail; + + spin_unlock_irqrestore(&cq->lock, flags); + +bail: + return npolled; +} + +static void send_complete(struct kthread_work *work) +{ + struct qib_cq *cq = container_of(work, struct qib_cq, comptask); + + /* + * The completion handler will most likely rearm the notification + * and poll for all pending entries. If a new completion entry + * is added while we are in this routine, queue_work() + * won't call us again until we return so we check triggered to + * see if we need to call the handler again. + */ + for (;;) { + u8 triggered = cq->triggered; + + /* + * IPoIB connected mode assumes the callback is from a + * soft IRQ. We simulate this by blocking "bottom halves". + * See the implementation for ipoib_cm_handle_tx_wc(), + * netif_tx_lock_bh() and netif_tx_lock(). + */ + local_bh_disable(); + cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context); + local_bh_enable(); + + if (cq->triggered == triggered) + return; + } +} + +/** + * qib_create_cq - create a completion queue + * @ibdev: the device this completion queue is attached to + * @entries: the minimum size of the completion queue + * @context: unused by the QLogic_IB driver + * @udata: user data for libibverbs.so + * + * Returns a pointer to the completion queue or negative errno values + * for failure. + * + * Called by ib_create_cq() in the generic verbs code. + */ +struct ib_cq *qib_create_cq(struct ib_device *ibdev, int entries, + int comp_vector, struct ib_ucontext *context, + struct ib_udata *udata) +{ + struct qib_ibdev *dev = to_idev(ibdev); + struct qib_cq *cq; + struct qib_cq_wc *wc; + struct ib_cq *ret; + u32 sz; + + if (entries < 1 || entries > ib_qib_max_cqes) { + ret = ERR_PTR(-EINVAL); + goto done; + } + + /* Allocate the completion queue structure. */ + cq = kmalloc(sizeof(*cq), GFP_KERNEL); + if (!cq) { + ret = ERR_PTR(-ENOMEM); + goto done; + } + + /* + * Allocate the completion queue entries and head/tail pointers. + * This is allocated separately so that it can be resized and + * also mapped into user space. + * We need to use vmalloc() in order to support mmap and large + * numbers of entries. + */ + sz = sizeof(*wc); + if (udata && udata->outlen >= sizeof(__u64)) + sz += sizeof(struct ib_uverbs_wc) * (entries + 1); + else + sz += sizeof(struct ib_wc) * (entries + 1); + wc = vmalloc_user(sz); + if (!wc) { + ret = ERR_PTR(-ENOMEM); + goto bail_cq; + } + + /* + * Return the address of the WC as the offset to mmap. + * See qib_mmap() for details. + */ + if (udata && udata->outlen >= sizeof(__u64)) { + int err; + + cq->ip = qib_create_mmap_info(dev, sz, context, wc); + if (!cq->ip) { + ret = ERR_PTR(-ENOMEM); + goto bail_wc; + } + + err = ib_copy_to_udata(udata, &cq->ip->offset, + sizeof(cq->ip->offset)); + if (err) { + ret = ERR_PTR(err); + goto bail_ip; + } + } else + cq->ip = NULL; + + spin_lock(&dev->n_cqs_lock); + if (dev->n_cqs_allocated == ib_qib_max_cqs) { + spin_unlock(&dev->n_cqs_lock); + ret = ERR_PTR(-ENOMEM); + goto bail_ip; + } + + dev->n_cqs_allocated++; + spin_unlock(&dev->n_cqs_lock); + + if (cq->ip) { + spin_lock_irq(&dev->pending_lock); + list_add(&cq->ip->pending_mmaps, &dev->pending_mmaps); + spin_unlock_irq(&dev->pending_lock); + } + + /* + * ib_create_cq() will initialize cq->ibcq except for cq->ibcq.cqe. + * The number of entries should be >= the number requested or return + * an error. + */ + cq->dd = dd_from_dev(dev); + cq->ibcq.cqe = entries; + cq->notify = IB_CQ_NONE; + cq->triggered = 0; + spin_lock_init(&cq->lock); + init_kthread_work(&cq->comptask, send_complete); + wc->head = 0; + wc->tail = 0; + cq->queue = wc; + + ret = &cq->ibcq; + + goto done; + +bail_ip: + kfree(cq->ip); +bail_wc: + vfree(wc); +bail_cq: + kfree(cq); +done: + return ret; +} + +/** + * qib_destroy_cq - destroy a completion queue + * @ibcq: the completion queue to destroy. + * + * Returns 0 for success. + * + * Called by ib_destroy_cq() in the generic verbs code. + */ +int qib_destroy_cq(struct ib_cq *ibcq) +{ + struct qib_ibdev *dev = to_idev(ibcq->device); + struct qib_cq *cq = to_icq(ibcq); + + flush_kthread_work(&cq->comptask); + spin_lock(&dev->n_cqs_lock); + dev->n_cqs_allocated--; + spin_unlock(&dev->n_cqs_lock); + if (cq->ip) + kref_put(&cq->ip->ref, qib_release_mmap_info); + else + vfree(cq->queue); + kfree(cq); + + return 0; +} + +/** + * qib_req_notify_cq - change the notification type for a completion queue + * @ibcq: the completion queue + * @notify_flags: the type of notification to request + * + * Returns 0 for success. + * + * This may be called from interrupt context. Also called by + * ib_req_notify_cq() in the generic verbs code. + */ +int qib_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags notify_flags) +{ + struct qib_cq *cq = to_icq(ibcq); + unsigned long flags; + int ret = 0; + + spin_lock_irqsave(&cq->lock, flags); + /* + * Don't change IB_CQ_NEXT_COMP to IB_CQ_SOLICITED but allow + * any other transitions (see C11-31 and C11-32 in ch. 11.4.2.2). + */ + if (cq->notify != IB_CQ_NEXT_COMP) + cq->notify = notify_flags & IB_CQ_SOLICITED_MASK; + + if ((notify_flags & IB_CQ_REPORT_MISSED_EVENTS) && + cq->queue->head != cq->queue->tail) + ret = 1; + + spin_unlock_irqrestore(&cq->lock, flags); + + return ret; +} + +/** + * qib_resize_cq - change the size of the CQ + * @ibcq: the completion queue + * + * Returns 0 for success. + */ +int qib_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata) +{ + struct qib_cq *cq = to_icq(ibcq); + struct qib_cq_wc *old_wc; + struct qib_cq_wc *wc; + u32 head, tail, n; + int ret; + u32 sz; + + if (cqe < 1 || cqe > ib_qib_max_cqes) { + ret = -EINVAL; + goto bail; + } + + /* + * Need to use vmalloc() if we want to support large #s of entries. + */ + sz = sizeof(*wc); + if (udata && udata->outlen >= sizeof(__u64)) + sz += sizeof(struct ib_uverbs_wc) * (cqe + 1); + else + sz += sizeof(struct ib_wc) * (cqe + 1); + wc = vmalloc_user(sz); + if (!wc) { + ret = -ENOMEM; + goto bail; + } + + /* Check that we can write the offset to mmap. */ + if (udata && udata->outlen >= sizeof(__u64)) { + __u64 offset = 0; + + ret = ib_copy_to_udata(udata, &offset, sizeof(offset)); + if (ret) + goto bail_free; + } + + spin_lock_irq(&cq->lock); + /* + * Make sure head and tail are sane since they + * might be user writable. + */ + old_wc = cq->queue; + head = old_wc->head; + if (head > (u32) cq->ibcq.cqe) + head = (u32) cq->ibcq.cqe; + tail = old_wc->tail; + if (tail > (u32) cq->ibcq.cqe) + tail = (u32) cq->ibcq.cqe; + if (head < tail) + n = cq->ibcq.cqe + 1 + head - tail; + else + n = head - tail; + if (unlikely((u32)cqe < n)) { + ret = -EINVAL; + goto bail_unlock; + } + for (n = 0; tail != head; n++) { + if (cq->ip) + wc->uqueue[n] = old_wc->uqueue[tail]; + else + wc->kqueue[n] = old_wc->kqueue[tail]; + if (tail == (u32) cq->ibcq.cqe) + tail = 0; + else + tail++; + } + cq->ibcq.cqe = cqe; + wc->head = n; + wc->tail = 0; + cq->queue = wc; + spin_unlock_irq(&cq->lock); + + vfree(old_wc); + + if (cq->ip) { + struct qib_ibdev *dev = to_idev(ibcq->device); + struct qib_mmap_info *ip = cq->ip; + + qib_update_mmap_info(dev, ip, sz, wc); + + /* + * Return the offset to mmap. + * See qib_mmap() for details. + */ + if (udata && udata->outlen >= sizeof(__u64)) { + ret = ib_copy_to_udata(udata, &ip->offset, + sizeof(ip->offset)); + if (ret) + goto bail; + } + + spin_lock_irq(&dev->pending_lock); + if (list_empty(&ip->pending_mmaps)) + list_add(&ip->pending_mmaps, &dev->pending_mmaps); + spin_unlock_irq(&dev->pending_lock); + } + + ret = 0; + goto bail; + +bail_unlock: + spin_unlock_irq(&cq->lock); +bail_free: + vfree(wc); +bail: + return ret; +} + +int qib_cq_init(struct qib_devdata *dd) +{ + int ret = 0; + int cpu; + struct task_struct *task; + + if (dd->worker) + return 0; + dd->worker = kzalloc(sizeof(*dd->worker), GFP_KERNEL); + if (!dd->worker) + return -ENOMEM; + init_kthread_worker(dd->worker); + task = kthread_create_on_node( + kthread_worker_fn, + dd->worker, + dd->assigned_node_id, + "qib_cq%d", dd->unit); + if (IS_ERR(task)) + goto task_fail; + cpu = cpumask_first(cpumask_of_node(dd->assigned_node_id)); + kthread_bind(task, cpu); + wake_up_process(task); +out: + return ret; +task_fail: + ret = PTR_ERR(task); + kfree(dd->worker); + dd->worker = NULL; + goto out; +} + +void qib_cq_exit(struct qib_devdata *dd) +{ + struct kthread_worker *worker; + + worker = dd->worker; + if (!worker) + return; + /* blocks future queuing from send_complete() */ + dd->worker = NULL; + smp_wmb(); + flush_kthread_worker(worker); + kthread_stop(worker->task); + kfree(worker); +} diff --git a/drivers/infiniband/hw/qib/qib_debugfs.c b/drivers/infiniband/hw/qib/qib_debugfs.c new file mode 100644 index 0000000..6abd3ed --- /dev/null +++ b/drivers/infiniband/hw/qib/qib_debugfs.c @@ -0,0 +1,284 @@ +#ifdef CONFIG_DEBUG_FS +/* + * Copyright (c) 2013 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include +#include +#include +#include + +#include "qib.h" +#include "qib_verbs.h" +#include "qib_debugfs.h" + +static struct dentry *qib_dbg_root; + +#define DEBUGFS_FILE(name) \ +static const struct seq_operations _##name##_seq_ops = { \ + .start = _##name##_seq_start, \ + .next = _##name##_seq_next, \ + .stop = _##name##_seq_stop, \ + .show = _##name##_seq_show \ +}; \ +static int _##name##_open(struct inode *inode, struct file *s) \ +{ \ + struct seq_file *seq; \ + int ret; \ + ret = seq_open(s, &_##name##_seq_ops); \ + if (ret) \ + return ret; \ + seq = s->private_data; \ + seq->private = inode->i_private; \ + return 0; \ +} \ +static const struct file_operations _##name##_file_ops = { \ + .owner = THIS_MODULE, \ + .open = _##name##_open, \ + .read = seq_read, \ + .llseek = seq_lseek, \ + .release = seq_release \ +}; + +#define DEBUGFS_FILE_CREATE(name) \ +do { \ + struct dentry *ent; \ + ent = debugfs_create_file(#name , 0400, ibd->qib_ibdev_dbg, \ + ibd, &_##name##_file_ops); \ + if (!ent) \ + pr_warn("create of " #name " failed\n"); \ +} while (0) + +static void *_opcode_stats_seq_start(struct seq_file *s, loff_t *pos) +{ + struct qib_opcode_stats_perctx *opstats; + + if (*pos >= ARRAY_SIZE(opstats->stats)) + return NULL; + return pos; +} + +static void *_opcode_stats_seq_next(struct seq_file *s, void *v, loff_t *pos) +{ + struct qib_opcode_stats_perctx *opstats; + + ++*pos; + if (*pos >= ARRAY_SIZE(opstats->stats)) + return NULL; + return pos; +} + + +static void _opcode_stats_seq_stop(struct seq_file *s, void *v) +{ + /* nothing allocated */ +} + +static int _opcode_stats_seq_show(struct seq_file *s, void *v) +{ + loff_t *spos = v; + loff_t i = *spos, j; + u64 n_packets = 0, n_bytes = 0; + struct qib_ibdev *ibd = (struct qib_ibdev *)s->private; + struct qib_devdata *dd = dd_from_dev(ibd); + + for (j = 0; j < dd->first_user_ctxt; j++) { + if (!dd->rcd[j]) + continue; + n_packets += dd->rcd[j]->opstats->stats[i].n_packets; + n_bytes += dd->rcd[j]->opstats->stats[i].n_bytes; + } + if (!n_packets && !n_bytes) + return SEQ_SKIP; + seq_printf(s, "%02llx %llu/%llu\n", i, + (unsigned long long) n_packets, + (unsigned long long) n_bytes); + + return 0; +} + +DEBUGFS_FILE(opcode_stats) + +static void *_ctx_stats_seq_start(struct seq_file *s, loff_t *pos) +{ + struct qib_ibdev *ibd = (struct qib_ibdev *)s->private; + struct qib_devdata *dd = dd_from_dev(ibd); + + if (!*pos) + return SEQ_START_TOKEN; + if (*pos >= dd->first_user_ctxt) + return NULL; + return pos; +} + +static void *_ctx_stats_seq_next(struct seq_file *s, void *v, loff_t *pos) +{ + struct qib_ibdev *ibd = (struct qib_ibdev *)s->private; + struct qib_devdata *dd = dd_from_dev(ibd); + + if (v == SEQ_START_TOKEN) + return pos; + + ++*pos; + if (*pos >= dd->first_user_ctxt) + return NULL; + return pos; +} + +static void _ctx_stats_seq_stop(struct seq_file *s, void *v) +{ + /* nothing allocated */ +} + +static int _ctx_stats_seq_show(struct seq_file *s, void *v) +{ + loff_t *spos; + loff_t i, j; + u64 n_packets = 0; + struct qib_ibdev *ibd = (struct qib_ibdev *)s->private; + struct qib_devdata *dd = dd_from_dev(ibd); + + if (v == SEQ_START_TOKEN) { + seq_puts(s, "Ctx:npkts\n"); + return 0; + } + + spos = v; + i = *spos; + + if (!dd->rcd[i]) + return SEQ_SKIP; + + for (j = 0; j < ARRAY_SIZE(dd->rcd[i]->opstats->stats); j++) + n_packets += dd->rcd[i]->opstats->stats[j].n_packets; + + if (!n_packets) + return SEQ_SKIP; + + seq_printf(s, " %llu:%llu\n", i, n_packets); + return 0; +} + +DEBUGFS_FILE(ctx_stats) + +static void *_qp_stats_seq_start(struct seq_file *s, loff_t *pos) +{ + struct qib_qp_iter *iter; + loff_t n = *pos; + + rcu_read_lock(); + iter = qib_qp_iter_init(s->private); + if (!iter) + return NULL; + + while (n--) { + if (qib_qp_iter_next(iter)) { + kfree(iter); + return NULL; + } + } + + return iter; +} + +static void *_qp_stats_seq_next(struct seq_file *s, void *iter_ptr, + loff_t *pos) +{ + struct qib_qp_iter *iter = iter_ptr; + + (*pos)++; + + if (qib_qp_iter_next(iter)) { + kfree(iter); + return NULL; + } + + return iter; +} + +static void _qp_stats_seq_stop(struct seq_file *s, void *iter_ptr) +{ + rcu_read_unlock(); +} + +static int _qp_stats_seq_show(struct seq_file *s, void *iter_ptr) +{ + struct qib_qp_iter *iter = iter_ptr; + + if (!iter) + return 0; + + qib_qp_iter_print(s, iter); + + return 0; +} + +DEBUGFS_FILE(qp_stats) + +void qib_dbg_ibdev_init(struct qib_ibdev *ibd) +{ + char name[10]; + + snprintf(name, sizeof(name), "qib%d", dd_from_dev(ibd)->unit); + ibd->qib_ibdev_dbg = debugfs_create_dir(name, qib_dbg_root); + if (!ibd->qib_ibdev_dbg) { + pr_warn("create of %s failed\n", name); + return; + } + DEBUGFS_FILE_CREATE(opcode_stats); + DEBUGFS_FILE_CREATE(ctx_stats); + DEBUGFS_FILE_CREATE(qp_stats); + return; +} + +void qib_dbg_ibdev_exit(struct qib_ibdev *ibd) +{ + if (!qib_dbg_root) + goto out; + debugfs_remove_recursive(ibd->qib_ibdev_dbg); +out: + ibd->qib_ibdev_dbg = NULL; +} + +void qib_dbg_init(void) +{ + qib_dbg_root = debugfs_create_dir(QIB_DRV_NAME, NULL); + if (!qib_dbg_root) + pr_warn("init of debugfs failed\n"); +} + +void qib_dbg_exit(void) +{ + debugfs_remove_recursive(qib_dbg_root); + qib_dbg_root = NULL; +} + +#endif + diff --git a/drivers/infiniband/hw/qib/qib_debugfs.h b/drivers/infiniband/hw/qib/qib_debugfs.h new file mode 100644 index 0000000..7ae983a --- /dev/null +++ b/drivers/infiniband/hw/qib/qib_debugfs.h @@ -0,0 +1,45 @@ +#ifndef _QIB_DEBUGFS_H +#define _QIB_DEBUGFS_H + +#ifdef CONFIG_DEBUG_FS +/* + * Copyright (c) 2013 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +struct qib_ibdev; +void qib_dbg_ibdev_init(struct qib_ibdev *ibd); +void qib_dbg_ibdev_exit(struct qib_ibdev *ibd); +void qib_dbg_init(void); +void qib_dbg_exit(void); + +#endif + +#endif /* _QIB_DEBUGFS_H */ diff --git a/drivers/infiniband/hw/qib/qib_diag.c b/drivers/infiniband/hw/qib/qib_diag.c new file mode 100644 index 0000000..5dfda4c --- /dev/null +++ b/drivers/infiniband/hw/qib/qib_diag.c @@ -0,0 +1,911 @@ +/* + * Copyright (c) 2012 Intel Corporation. All rights reserved. + * Copyright (c) 2006 - 2012 QLogic Corporation. All rights reserved. + * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/* + * This file contains support for diagnostic functions. It is accessed by + * opening the qib_diag device, normally minor number 129. Diagnostic use + * of the QLogic_IB chip may render the chip or board unusable until the + * driver is unloaded, or in some cases, until the system is rebooted. + * + * Accesses to the chip through this interface are not similar to going + * through the /sys/bus/pci resource mmap interface. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "qib.h" +#include "qib_common.h" + +#undef pr_fmt +#define pr_fmt(fmt) QIB_DRV_NAME ": " fmt + +/* + * Each client that opens the diag device must read then write + * offset 0, to prevent lossage from random cat or od. diag_state + * sequences this "handshake". + */ +enum diag_state { UNUSED = 0, OPENED, INIT, READY }; + +/* State for an individual client. PID so children cannot abuse handshake */ +static struct qib_diag_client { + struct qib_diag_client *next; + struct qib_devdata *dd; + pid_t pid; + enum diag_state state; +} *client_pool; + +/* + * Get a client struct. Recycled if possible, else kmalloc. + * Must be called with qib_mutex held + */ +static struct qib_diag_client *get_client(struct qib_devdata *dd) +{ + struct qib_diag_client *dc; + + dc = client_pool; + if (dc) + /* got from pool remove it and use */ + client_pool = dc->next; + else + /* None in pool, alloc and init */ + dc = kmalloc(sizeof *dc, GFP_KERNEL); + + if (dc) { + dc->next = NULL; + dc->dd = dd; + dc->pid = current->pid; + dc->state = OPENED; + } + return dc; +} + +/* + * Return to pool. Must be called with qib_mutex held + */ +static void return_client(struct qib_diag_client *dc) +{ + struct qib_devdata *dd = dc->dd; + struct qib_diag_client *tdc, *rdc; + + rdc = NULL; + if (dc == dd->diag_client) { + dd->diag_client = dc->next; + rdc = dc; + } else { + tdc = dc->dd->diag_client; + while (tdc) { + if (dc == tdc->next) { + tdc->next = dc->next; + rdc = dc; + break; + } + tdc = tdc->next; + } + } + if (rdc) { + rdc->state = UNUSED; + rdc->dd = NULL; + rdc->pid = 0; + rdc->next = client_pool; + client_pool = rdc; + } +} + +static int qib_diag_open(struct inode *in, struct file *fp); +static int qib_diag_release(struct inode *in, struct file *fp); +static ssize_t qib_diag_read(struct file *fp, char __user *data, + size_t count, loff_t *off); +static ssize_t qib_diag_write(struct file *fp, const char __user *data, + size_t count, loff_t *off); + +static const struct file_operations diag_file_ops = { + .owner = THIS_MODULE, + .write = qib_diag_write, + .read = qib_diag_read, + .open = qib_diag_open, + .release = qib_diag_release, + .llseek = default_llseek, +}; + +static atomic_t diagpkt_count = ATOMIC_INIT(0); +static struct cdev *diagpkt_cdev; +static struct device *diagpkt_device; + +static ssize_t qib_diagpkt_write(struct file *fp, const char __user *data, + size_t count, loff_t *off); + +static const struct file_operations diagpkt_file_ops = { + .owner = THIS_MODULE, + .write = qib_diagpkt_write, + .llseek = noop_llseek, +}; + +int qib_diag_add(struct qib_devdata *dd) +{ + char name[16]; + int ret = 0; + + if (atomic_inc_return(&diagpkt_count) == 1) { + ret = qib_cdev_init(QIB_DIAGPKT_MINOR, "ipath_diagpkt", + &diagpkt_file_ops, &diagpkt_cdev, + &diagpkt_device); + if (ret) + goto done; + } + + snprintf(name, sizeof(name), "ipath_diag%d", dd->unit); + ret = qib_cdev_init(QIB_DIAG_MINOR_BASE + dd->unit, name, + &diag_file_ops, &dd->diag_cdev, + &dd->diag_device); +done: + return ret; +} + +static void qib_unregister_observers(struct qib_devdata *dd); + +void qib_diag_remove(struct qib_devdata *dd) +{ + struct qib_diag_client *dc; + + if (atomic_dec_and_test(&diagpkt_count)) + qib_cdev_cleanup(&diagpkt_cdev, &diagpkt_device); + + qib_cdev_cleanup(&dd->diag_cdev, &dd->diag_device); + + /* + * Return all diag_clients of this device. There should be none, + * as we are "guaranteed" that no clients are still open + */ + while (dd->diag_client) + return_client(dd->diag_client); + + /* Now clean up all unused client structs */ + while (client_pool) { + dc = client_pool; + client_pool = dc->next; + kfree(dc); + } + /* Clean up observer list */ + qib_unregister_observers(dd); +} + +/* qib_remap_ioaddr32 - remap an offset into chip address space to __iomem * + * + * @dd: the qlogic_ib device + * @offs: the offset in chip-space + * @cntp: Pointer to max (byte) count for transfer starting at offset + * This returns a u32 __iomem * so it can be used for both 64 and 32-bit + * mapping. It is needed because with the use of PAT for control of + * write-combining, the logically contiguous address-space of the chip + * may be split into virtually non-contiguous spaces, with different + * attributes, which are them mapped to contiguous physical space + * based from the first BAR. + * + * The code below makes the same assumptions as were made in + * init_chip_wc_pat() (qib_init.c), copied here: + * Assumes chip address space looks like: + * - kregs + sregs + cregs + uregs (in any order) + * - piobufs (2K and 4K bufs in either order) + * or: + * - kregs + sregs + cregs (in any order) + * - piobufs (2K and 4K bufs in either order) + * - uregs + * + * If cntp is non-NULL, returns how many bytes from offset can be accessed + * Returns 0 if the offset is not mapped. + */ +static u32 __iomem *qib_remap_ioaddr32(struct qib_devdata *dd, u32 offset, + u32 *cntp) +{ + u32 kreglen; + u32 snd_bottom, snd_lim = 0; + u32 __iomem *krb32 = (u32 __iomem *)dd->kregbase; + u32 __iomem *map = NULL; + u32 cnt = 0; + u32 tot4k, offs4k; + + /* First, simplest case, offset is within the first map. */ + kreglen = (dd->kregend - dd->kregbase) * sizeof(u64); + if (offset < kreglen) { + map = krb32 + (offset / sizeof(u32)); + cnt = kreglen - offset; + goto mapped; + } + + /* + * Next check for user regs, the next most common case, + * and a cheap check because if they are not in the first map + * they are last in chip. + */ + if (dd->userbase) { + /* If user regs mapped, they are after send, so set limit. */ + u32 ulim = (dd->cfgctxts * dd->ureg_align) + dd->uregbase; + if (!dd->piovl15base) + snd_lim = dd->uregbase; + krb32 = (u32 __iomem *)dd->userbase; + if (offset >= dd->uregbase && offset < ulim) { + map = krb32 + (offset - dd->uregbase) / sizeof(u32); + cnt = ulim - offset; + goto mapped; + } + } + + /* + * Lastly, check for offset within Send Buffers. + * This is gnarly because struct devdata is deliberately vague + * about things like 7322 VL15 buffers, and we are not in + * chip-specific code here, so should not make many assumptions. + * The one we _do_ make is that the only chip that has more sndbufs + * than we admit is the 7322, and it has userregs above that, so + * we know the snd_lim. + */ + /* Assume 2K buffers are first. */ + snd_bottom = dd->pio2k_bufbase; + if (snd_lim == 0) { + u32 tot2k = dd->piobcnt2k * ALIGN(dd->piosize2k, dd->palign); + snd_lim = snd_bottom + tot2k; + } + /* If 4k buffers exist, account for them by bumping + * appropriate limit. + */ + tot4k = dd->piobcnt4k * dd->align4k; + offs4k = dd->piobufbase >> 32; + if (dd->piobcnt4k) { + if (snd_bottom > offs4k) + snd_bottom = offs4k; + else { + /* 4k above 2k. Bump snd_lim, if needed*/ + if (!dd->userbase || dd->piovl15base) + snd_lim = offs4k + tot4k; + } + } + /* + * Judgement call: can we ignore the space between SendBuffs and + * UserRegs, where we would like to see vl15 buffs, but not more? + */ + if (offset >= snd_bottom && offset < snd_lim) { + offset -= snd_bottom; + map = (u32 __iomem *)dd->piobase + (offset / sizeof(u32)); + cnt = snd_lim - offset; + } + + if (!map && offs4k && dd->piovl15base) { + snd_lim = offs4k + tot4k + 2 * dd->align4k; + if (offset >= (offs4k + tot4k) && offset < snd_lim) { + map = (u32 __iomem *)dd->piovl15base + + ((offset - (offs4k + tot4k)) / sizeof(u32)); + cnt = snd_lim - offset; + } + } + +mapped: + if (cntp) + *cntp = cnt; + return map; +} + +/* + * qib_read_umem64 - read a 64-bit quantity from the chip into user space + * @dd: the qlogic_ib device + * @uaddr: the location to store the data in user memory + * @regoffs: the offset from BAR0 (_NOT_ full pointer, anymore) + * @count: number of bytes to copy (multiple of 32 bits) + * + * This function also localizes all chip memory accesses. + * The copy should be written such that we read full cacheline packets + * from the chip. This is usually used for a single qword + * + * NOTE: This assumes the chip address is 64-bit aligned. + */ +static int qib_read_umem64(struct qib_devdata *dd, void __user *uaddr, + u32 regoffs, size_t count) +{ + const u64 __iomem *reg_addr; + const u64 __iomem *reg_end; + u32 limit; + int ret; + + reg_addr = (const u64 __iomem *)qib_remap_ioaddr32(dd, regoffs, &limit); + if (reg_addr == NULL || limit == 0 || !(dd->flags & QIB_PRESENT)) { + ret = -EINVAL; + goto bail; + } + if (count >= limit) + count = limit; + reg_end = reg_addr + (count / sizeof(u64)); + + /* not very efficient, but it works for now */ + while (reg_addr < reg_end) { + u64 data = readq(reg_addr); + + if (copy_to_user(uaddr, &data, sizeof(u64))) { + ret = -EFAULT; + goto bail; + } + reg_addr++; + uaddr += sizeof(u64); + } + ret = 0; +bail: + return ret; +} + +/* + * qib_write_umem64 - write a 64-bit quantity to the chip from user space + * @dd: the qlogic_ib device + * @regoffs: the offset from BAR0 (_NOT_ full pointer, anymore) + * @uaddr: the source of the data in user memory + * @count: the number of bytes to copy (multiple of 32 bits) + * + * This is usually used for a single qword + * NOTE: This assumes the chip address is 64-bit aligned. + */ + +static int qib_write_umem64(struct qib_devdata *dd, u32 regoffs, + const void __user *uaddr, size_t count) +{ + u64 __iomem *reg_addr; + const u64 __iomem *reg_end; + u32 limit; + int ret; + + reg_addr = (u64 __iomem *)qib_remap_ioaddr32(dd, regoffs, &limit); + if (reg_addr == NULL || limit == 0 || !(dd->flags & QIB_PRESENT)) { + ret = -EINVAL; + goto bail; + } + if (count >= limit) + count = limit; + reg_end = reg_addr + (count / sizeof(u64)); + + /* not very efficient, but it works for now */ + while (reg_addr < reg_end) { + u64 data; + if (copy_from_user(&data, uaddr, sizeof(data))) { + ret = -EFAULT; + goto bail; + } + writeq(data, reg_addr); + + reg_addr++; + uaddr += sizeof(u64); + } + ret = 0; +bail: + return ret; +} + +/* + * qib_read_umem32 - read a 32-bit quantity from the chip into user space + * @dd: the qlogic_ib device + * @uaddr: the location to store the data in user memory + * @regoffs: the offset from BAR0 (_NOT_ full pointer, anymore) + * @count: number of bytes to copy + * + * read 32 bit values, not 64 bit; for memories that only + * support 32 bit reads; usually a single dword. + */ +static int qib_read_umem32(struct qib_devdata *dd, void __user *uaddr, + u32 regoffs, size_t count) +{ + const u32 __iomem *reg_addr; + const u32 __iomem *reg_end; + u32 limit; + int ret; + + reg_addr = qib_remap_ioaddr32(dd, regoffs, &limit); + if (reg_addr == NULL || limit == 0 || !(dd->flags & QIB_PRESENT)) { + ret = -EINVAL; + goto bail; + } + if (count >= limit) + count = limit; + reg_end = reg_addr + (count / sizeof(u32)); + + /* not very efficient, but it works for now */ + while (reg_addr < reg_end) { + u32 data = readl(reg_addr); + + if (copy_to_user(uaddr, &data, sizeof(data))) { + ret = -EFAULT; + goto bail; + } + + reg_addr++; + uaddr += sizeof(u32); + + } + ret = 0; +bail: + return ret; +} + +/* + * qib_write_umem32 - write a 32-bit quantity to the chip from user space + * @dd: the qlogic_ib device + * @regoffs: the offset from BAR0 (_NOT_ full pointer, anymore) + * @uaddr: the source of the data in user memory + * @count: number of bytes to copy + * + * write 32 bit values, not 64 bit; for memories that only + * support 32 bit write; usually a single dword. + */ + +static int qib_write_umem32(struct qib_devdata *dd, u32 regoffs, + const void __user *uaddr, size_t count) +{ + u32 __iomem *reg_addr; + const u32 __iomem *reg_end; + u32 limit; + int ret; + + reg_addr = qib_remap_ioaddr32(dd, regoffs, &limit); + if (reg_addr == NULL || limit == 0 || !(dd->flags & QIB_PRESENT)) { + ret = -EINVAL; + goto bail; + } + if (count >= limit) + count = limit; + reg_end = reg_addr + (count / sizeof(u32)); + + while (reg_addr < reg_end) { + u32 data; + + if (copy_from_user(&data, uaddr, sizeof(data))) { + ret = -EFAULT; + goto bail; + } + writel(data, reg_addr); + + reg_addr++; + uaddr += sizeof(u32); + } + ret = 0; +bail: + return ret; +} + +static int qib_diag_open(struct inode *in, struct file *fp) +{ + int unit = iminor(in) - QIB_DIAG_MINOR_BASE; + struct qib_devdata *dd; + struct qib_diag_client *dc; + int ret; + + mutex_lock(&qib_mutex); + + dd = qib_lookup(unit); + + if (dd == NULL || !(dd->flags & QIB_PRESENT) || + !dd->kregbase) { + ret = -ENODEV; + goto bail; + } + + dc = get_client(dd); + if (!dc) { + ret = -ENOMEM; + goto bail; + } + dc->next = dd->diag_client; + dd->diag_client = dc; + fp->private_data = dc; + ret = 0; +bail: + mutex_unlock(&qib_mutex); + + return ret; +} + +/** + * qib_diagpkt_write - write an IB packet + * @fp: the diag data device file pointer + * @data: qib_diag_pkt structure saying where to get the packet + * @count: size of data to write + * @off: unused by this code + */ +static ssize_t qib_diagpkt_write(struct file *fp, + const char __user *data, + size_t count, loff_t *off) +{ + u32 __iomem *piobuf; + u32 plen, pbufn, maxlen_reserve; + struct qib_diag_xpkt dp; + u32 *tmpbuf = NULL; + struct qib_devdata *dd; + struct qib_pportdata *ppd; + ssize_t ret = 0; + + if (count != sizeof(dp)) { + ret = -EINVAL; + goto bail; + } + if (copy_from_user(&dp, data, sizeof(dp))) { + ret = -EFAULT; + goto bail; + } + + dd = qib_lookup(dp.unit); + if (!dd || !(dd->flags & QIB_PRESENT) || !dd->kregbase) { + ret = -ENODEV; + goto bail; + } + if (!(dd->flags & QIB_INITTED)) { + /* no hardware, freeze, etc. */ + ret = -ENODEV; + goto bail; + } + + if (dp.version != _DIAG_XPKT_VERS) { + qib_dev_err(dd, "Invalid version %u for diagpkt_write\n", + dp.version); + ret = -EINVAL; + goto bail; + } + /* send count must be an exact number of dwords */ + if (dp.len & 3) { + ret = -EINVAL; + goto bail; + } + if (!dp.port || dp.port > dd->num_pports) { + ret = -EINVAL; + goto bail; + } + ppd = &dd->pport[dp.port - 1]; + + /* + * need total length before first word written, plus 2 Dwords. One Dword + * is for padding so we get the full user data when not aligned on + * a word boundary. The other Dword is to make sure we have room for the + * ICRC which gets tacked on later. + */ + maxlen_reserve = 2 * sizeof(u32); + if (dp.len > ppd->ibmaxlen - maxlen_reserve) { + ret = -EINVAL; + goto bail; + } + + plen = sizeof(u32) + dp.len; + + tmpbuf = vmalloc(plen); + if (!tmpbuf) { + qib_devinfo(dd->pcidev, + "Unable to allocate tmp buffer, failing\n"); + ret = -ENOMEM; + goto bail; + } + + if (copy_from_user(tmpbuf, + (const void __user *) (unsigned long) dp.data, + dp.len)) { + ret = -EFAULT; + goto bail; + } + + plen >>= 2; /* in dwords */ + + if (dp.pbc_wd == 0) + dp.pbc_wd = plen; + + piobuf = dd->f_getsendbuf(ppd, dp.pbc_wd, &pbufn); + if (!piobuf) { + ret = -EBUSY; + goto bail; + } + /* disarm it just to be extra sure */ + dd->f_sendctrl(dd->pport, QIB_SENDCTRL_DISARM_BUF(pbufn)); + + /* disable header check on pbufn for this packet */ + dd->f_txchk_change(dd, pbufn, 1, TXCHK_CHG_TYPE_DIS1, NULL); + + writeq(dp.pbc_wd, piobuf); + /* + * Copy all but the trigger word, then flush, so it's written + * to chip before trigger word, then write trigger word, then + * flush again, so packet is sent. + */ + if (dd->flags & QIB_PIO_FLUSH_WC) { + qib_flush_wc(); + qib_pio_copy(piobuf + 2, tmpbuf, plen - 1); + qib_flush_wc(); + __raw_writel(tmpbuf[plen - 1], piobuf + plen + 1); + } else + qib_pio_copy(piobuf + 2, tmpbuf, plen); + + if (dd->flags & QIB_USE_SPCL_TRIG) { + u32 spcl_off = (pbufn >= dd->piobcnt2k) ? 2047 : 1023; + + qib_flush_wc(); + __raw_writel(0xaebecede, piobuf + spcl_off); + } + + /* + * Ensure buffer is written to the chip, then re-enable + * header checks (if supported by chip). The txchk + * code will ensure seen by chip before returning. + */ + qib_flush_wc(); + qib_sendbuf_done(dd, pbufn); + dd->f_txchk_change(dd, pbufn, 1, TXCHK_CHG_TYPE_ENAB1, NULL); + + ret = sizeof(dp); + +bail: + vfree(tmpbuf); + return ret; +} + +static int qib_diag_release(struct inode *in, struct file *fp) +{ + mutex_lock(&qib_mutex); + return_client(fp->private_data); + fp->private_data = NULL; + mutex_unlock(&qib_mutex); + return 0; +} + +/* + * Chip-specific code calls to register its interest in + * a specific range. + */ +struct diag_observer_list_elt { + struct diag_observer_list_elt *next; + const struct diag_observer *op; +}; + +int qib_register_observer(struct qib_devdata *dd, + const struct diag_observer *op) +{ + struct diag_observer_list_elt *olp; + unsigned long flags; + + if (!dd || !op) + return -EINVAL; + olp = vmalloc(sizeof *olp); + if (!olp) { + pr_err("vmalloc for observer failed\n"); + return -ENOMEM; + } + + spin_lock_irqsave(&dd->qib_diag_trans_lock, flags); + olp->op = op; + olp->next = dd->diag_observer_list; + dd->diag_observer_list = olp; + spin_unlock_irqrestore(&dd->qib_diag_trans_lock, flags); + + return 0; +} + +/* Remove all registered observers when device is closed */ +static void qib_unregister_observers(struct qib_devdata *dd) +{ + struct diag_observer_list_elt *olp; + unsigned long flags; + + spin_lock_irqsave(&dd->qib_diag_trans_lock, flags); + olp = dd->diag_observer_list; + while (olp) { + /* Pop one observer, let go of lock */ + dd->diag_observer_list = olp->next; + spin_unlock_irqrestore(&dd->qib_diag_trans_lock, flags); + vfree(olp); + /* try again. */ + spin_lock_irqsave(&dd->qib_diag_trans_lock, flags); + olp = dd->diag_observer_list; + } + spin_unlock_irqrestore(&dd->qib_diag_trans_lock, flags); +} + +/* + * Find the observer, if any, for the specified address. Initial implementation + * is simple stack of observers. This must be called with diag transaction + * lock held. + */ +static const struct diag_observer *diag_get_observer(struct qib_devdata *dd, + u32 addr) +{ + struct diag_observer_list_elt *olp; + const struct diag_observer *op = NULL; + + olp = dd->diag_observer_list; + while (olp) { + op = olp->op; + if (addr >= op->bottom && addr <= op->top) + break; + olp = olp->next; + } + if (!olp) + op = NULL; + + return op; +} + +static ssize_t qib_diag_read(struct file *fp, char __user *data, + size_t count, loff_t *off) +{ + struct qib_diag_client *dc = fp->private_data; + struct qib_devdata *dd = dc->dd; + void __iomem *kreg_base; + ssize_t ret; + + if (dc->pid != current->pid) { + ret = -EPERM; + goto bail; + } + + kreg_base = dd->kregbase; + + if (count == 0) + ret = 0; + else if ((count % 4) || (*off % 4)) + /* address or length is not 32-bit aligned, hence invalid */ + ret = -EINVAL; + else if (dc->state < READY && (*off || count != 8)) + ret = -EINVAL; /* prevent cat /dev/qib_diag* */ + else { + unsigned long flags; + u64 data64 = 0; + int use_32; + const struct diag_observer *op; + + use_32 = (count % 8) || (*off % 8); + ret = -1; + spin_lock_irqsave(&dd->qib_diag_trans_lock, flags); + /* + * Check for observer on this address range. + * we only support a single 32 or 64-bit read + * via observer, currently. + */ + op = diag_get_observer(dd, *off); + if (op) { + u32 offset = *off; + ret = op->hook(dd, op, offset, &data64, 0, use_32); + } + /* + * We need to release lock before any copy_to_user(), + * whether implicit in qib_read_umem* or explicit below. + */ + spin_unlock_irqrestore(&dd->qib_diag_trans_lock, flags); + if (!op) { + if (use_32) + /* + * Address or length is not 64-bit aligned; + * do 32-bit rd + */ + ret = qib_read_umem32(dd, data, (u32) *off, + count); + else + ret = qib_read_umem64(dd, data, (u32) *off, + count); + } else if (ret == count) { + /* Below finishes case where observer existed */ + ret = copy_to_user(data, &data64, use_32 ? + sizeof(u32) : sizeof(u64)); + if (ret) + ret = -EFAULT; + } + } + + if (ret >= 0) { + *off += count; + ret = count; + if (dc->state == OPENED) + dc->state = INIT; + } +bail: + return ret; +} + +static ssize_t qib_diag_write(struct file *fp, const char __user *data, + size_t count, loff_t *off) +{ + struct qib_diag_client *dc = fp->private_data; + struct qib_devdata *dd = dc->dd; + void __iomem *kreg_base; + ssize_t ret; + + if (dc->pid != current->pid) { + ret = -EPERM; + goto bail; + } + + kreg_base = dd->kregbase; + + if (count == 0) + ret = 0; + else if ((count % 4) || (*off % 4)) + /* address or length is not 32-bit aligned, hence invalid */ + ret = -EINVAL; + else if (dc->state < READY && + ((*off || count != 8) || dc->state != INIT)) + /* No writes except second-step of init seq */ + ret = -EINVAL; /* before any other write allowed */ + else { + unsigned long flags; + const struct diag_observer *op = NULL; + int use_32 = (count % 8) || (*off % 8); + + /* + * Check for observer on this address range. + * We only support a single 32 or 64-bit write + * via observer, currently. This helps, because + * we would otherwise have to jump through hoops + * to make "diag transaction" meaningful when we + * cannot do a copy_from_user while holding the lock. + */ + if (count == 4 || count == 8) { + u64 data64; + u32 offset = *off; + ret = copy_from_user(&data64, data, count); + if (ret) { + ret = -EFAULT; + goto bail; + } + spin_lock_irqsave(&dd->qib_diag_trans_lock, flags); + op = diag_get_observer(dd, *off); + if (op) + ret = op->hook(dd, op, offset, &data64, ~0Ull, + use_32); + spin_unlock_irqrestore(&dd->qib_diag_trans_lock, flags); + } + + if (!op) { + if (use_32) + /* + * Address or length is not 64-bit aligned; + * do 32-bit write + */ + ret = qib_write_umem32(dd, (u32) *off, data, + count); + else + ret = qib_write_umem64(dd, (u32) *off, data, + count); + } + } + + if (ret >= 0) { + *off += count; + ret = count; + if (dc->state == INIT) + dc->state = READY; /* all read/write OK now */ + } +bail: + return ret; +} diff --git a/drivers/infiniband/hw/qib/qib_dma.c b/drivers/infiniband/hw/qib/qib_dma.c new file mode 100644 index 0000000..59fe092 --- /dev/null +++ b/drivers/infiniband/hw/qib/qib_dma.c @@ -0,0 +1,169 @@ +/* + * Copyright (c) 2006, 2009, 2010 QLogic, Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include +#include + +#include "qib_verbs.h" + +#define BAD_DMA_ADDRESS ((u64) 0) + +/* + * The following functions implement driver specific replacements + * for the ib_dma_*() functions. + * + * These functions return kernel virtual addresses instead of + * device bus addresses since the driver uses the CPU to copy + * data instead of using hardware DMA. + */ + +static int qib_mapping_error(struct ib_device *dev, u64 dma_addr) +{ + return dma_addr == BAD_DMA_ADDRESS; +} + +static u64 qib_dma_map_single(struct ib_device *dev, void *cpu_addr, + size_t size, enum dma_data_direction direction) +{ + BUG_ON(!valid_dma_direction(direction)); + return (u64) cpu_addr; +} + +static void qib_dma_unmap_single(struct ib_device *dev, u64 addr, size_t size, + enum dma_data_direction direction) +{ + BUG_ON(!valid_dma_direction(direction)); +} + +static u64 qib_dma_map_page(struct ib_device *dev, struct page *page, + unsigned long offset, size_t size, + enum dma_data_direction direction) +{ + u64 addr; + + BUG_ON(!valid_dma_direction(direction)); + + if (offset + size > PAGE_SIZE) { + addr = BAD_DMA_ADDRESS; + goto done; + } + + addr = (u64) page_address(page); + if (addr) + addr += offset; + /* TODO: handle highmem pages */ + +done: + return addr; +} + +static void qib_dma_unmap_page(struct ib_device *dev, u64 addr, size_t size, + enum dma_data_direction direction) +{ + BUG_ON(!valid_dma_direction(direction)); +} + +static int qib_map_sg(struct ib_device *dev, struct scatterlist *sgl, + int nents, enum dma_data_direction direction) +{ + struct scatterlist *sg; + u64 addr; + int i; + int ret = nents; + + BUG_ON(!valid_dma_direction(direction)); + + for_each_sg(sgl, sg, nents, i) { + addr = (u64) page_address(sg_page(sg)); + /* TODO: handle highmem pages */ + if (!addr) { + ret = 0; + break; + } + sg->dma_address = addr + sg->offset; +#ifdef CONFIG_NEED_SG_DMA_LENGTH + sg->dma_length = sg->length; +#endif + } + return ret; +} + +static void qib_unmap_sg(struct ib_device *dev, + struct scatterlist *sg, int nents, + enum dma_data_direction direction) +{ + BUG_ON(!valid_dma_direction(direction)); +} + +static void qib_sync_single_for_cpu(struct ib_device *dev, u64 addr, + size_t size, enum dma_data_direction dir) +{ +} + +static void qib_sync_single_for_device(struct ib_device *dev, u64 addr, + size_t size, + enum dma_data_direction dir) +{ +} + +static void *qib_dma_alloc_coherent(struct ib_device *dev, size_t size, + u64 *dma_handle, gfp_t flag) +{ + struct page *p; + void *addr = NULL; + + p = alloc_pages(flag, get_order(size)); + if (p) + addr = page_address(p); + if (dma_handle) + *dma_handle = (u64) addr; + return addr; +} + +static void qib_dma_free_coherent(struct ib_device *dev, size_t size, + void *cpu_addr, u64 dma_handle) +{ + free_pages((unsigned long) cpu_addr, get_order(size)); +} + +struct ib_dma_mapping_ops qib_dma_mapping_ops = { + .mapping_error = qib_mapping_error, + .map_single = qib_dma_map_single, + .unmap_single = qib_dma_unmap_single, + .map_page = qib_dma_map_page, + .unmap_page = qib_dma_unmap_page, + .map_sg = qib_map_sg, + .unmap_sg = qib_unmap_sg, + .sync_single_for_cpu = qib_sync_single_for_cpu, + .sync_single_for_device = qib_sync_single_for_device, + .alloc_coherent = qib_dma_alloc_coherent, + .free_coherent = qib_dma_free_coherent +}; diff --git a/drivers/infiniband/hw/qib/qib_driver.c b/drivers/infiniband/hw/qib/qib_driver.c new file mode 100644 index 0000000..5bee08f --- /dev/null +++ b/drivers/infiniband/hw/qib/qib_driver.c @@ -0,0 +1,817 @@ +/* + * Copyright (c) 2013 Intel Corporation. All rights reserved. + * Copyright (c) 2006, 2007, 2008, 2009 QLogic Corporation. All rights reserved. + * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "qib.h" + +/* + * The size has to be longer than this string, so we can append + * board/chip information to it in the init code. + */ +const char ib_qib_version[] = QIB_DRIVER_VERSION "\n"; + +DEFINE_SPINLOCK(qib_devs_lock); +LIST_HEAD(qib_dev_list); +DEFINE_MUTEX(qib_mutex); /* general driver use */ + +unsigned qib_ibmtu; +module_param_named(ibmtu, qib_ibmtu, uint, S_IRUGO); +MODULE_PARM_DESC(ibmtu, "Set max IB MTU (0=2KB, 1=256, 2=512, ... 5=4096"); + +unsigned qib_compat_ddr_negotiate = 1; +module_param_named(compat_ddr_negotiate, qib_compat_ddr_negotiate, uint, + S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(compat_ddr_negotiate, + "Attempt pre-IBTA 1.2 DDR speed negotiation"); + +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_AUTHOR("Intel "); +MODULE_DESCRIPTION("Intel IB driver"); +MODULE_VERSION(QIB_DRIVER_VERSION); + +/* + * QIB_PIO_MAXIBHDR is the max IB header size allowed for in our + * PIO send buffers. This is well beyond anything currently + * defined in the InfiniBand spec. + */ +#define QIB_PIO_MAXIBHDR 128 + +/* + * QIB_MAX_PKT_RCV is the max # if packets processed per receive interrupt. + */ +#define QIB_MAX_PKT_RECV 64 + +struct qlogic_ib_stats qib_stats; + +const char *qib_get_unit_name(int unit) +{ + static char iname[16]; + + snprintf(iname, sizeof iname, "infinipath%u", unit); + return iname; +} + +/* + * Return count of units with at least one port ACTIVE. + */ +int qib_count_active_units(void) +{ + struct qib_devdata *dd; + struct qib_pportdata *ppd; + unsigned long flags; + int pidx, nunits_active = 0; + + spin_lock_irqsave(&qib_devs_lock, flags); + list_for_each_entry(dd, &qib_dev_list, list) { + if (!(dd->flags & QIB_PRESENT) || !dd->kregbase) + continue; + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + ppd = dd->pport + pidx; + if (ppd->lid && (ppd->lflags & (QIBL_LINKINIT | + QIBL_LINKARMED | QIBL_LINKACTIVE))) { + nunits_active++; + break; + } + } + } + spin_unlock_irqrestore(&qib_devs_lock, flags); + return nunits_active; +} + +/* + * Return count of all units, optionally return in arguments + * the number of usable (present) units, and the number of + * ports that are up. + */ +int qib_count_units(int *npresentp, int *nupp) +{ + int nunits = 0, npresent = 0, nup = 0; + struct qib_devdata *dd; + unsigned long flags; + int pidx; + struct qib_pportdata *ppd; + + spin_lock_irqsave(&qib_devs_lock, flags); + + list_for_each_entry(dd, &qib_dev_list, list) { + nunits++; + if ((dd->flags & QIB_PRESENT) && dd->kregbase) + npresent++; + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + ppd = dd->pport + pidx; + if (ppd->lid && (ppd->lflags & (QIBL_LINKINIT | + QIBL_LINKARMED | QIBL_LINKACTIVE))) + nup++; + } + } + + spin_unlock_irqrestore(&qib_devs_lock, flags); + + if (npresentp) + *npresentp = npresent; + if (nupp) + *nupp = nup; + + return nunits; +} + +/** + * qib_wait_linkstate - wait for an IB link state change to occur + * @dd: the qlogic_ib device + * @state: the state to wait for + * @msecs: the number of milliseconds to wait + * + * wait up to msecs milliseconds for IB link state change to occur for + * now, take the easy polling route. Currently used only by + * qib_set_linkstate. Returns 0 if state reached, otherwise + * -ETIMEDOUT state can have multiple states set, for any of several + * transitions. + */ +int qib_wait_linkstate(struct qib_pportdata *ppd, u32 state, int msecs) +{ + int ret; + unsigned long flags; + + spin_lock_irqsave(&ppd->lflags_lock, flags); + if (ppd->state_wanted) { + spin_unlock_irqrestore(&ppd->lflags_lock, flags); + ret = -EBUSY; + goto bail; + } + ppd->state_wanted = state; + spin_unlock_irqrestore(&ppd->lflags_lock, flags); + wait_event_interruptible_timeout(ppd->state_wait, + (ppd->lflags & state), + msecs_to_jiffies(msecs)); + spin_lock_irqsave(&ppd->lflags_lock, flags); + ppd->state_wanted = 0; + spin_unlock_irqrestore(&ppd->lflags_lock, flags); + + if (!(ppd->lflags & state)) + ret = -ETIMEDOUT; + else + ret = 0; +bail: + return ret; +} + +int qib_set_linkstate(struct qib_pportdata *ppd, u8 newstate) +{ + u32 lstate; + int ret; + struct qib_devdata *dd = ppd->dd; + unsigned long flags; + + switch (newstate) { + case QIB_IB_LINKDOWN_ONLY: + dd->f_set_ib_cfg(ppd, QIB_IB_CFG_LSTATE, + IB_LINKCMD_DOWN | IB_LINKINITCMD_NOP); + /* don't wait */ + ret = 0; + goto bail; + + case QIB_IB_LINKDOWN: + dd->f_set_ib_cfg(ppd, QIB_IB_CFG_LSTATE, + IB_LINKCMD_DOWN | IB_LINKINITCMD_POLL); + /* don't wait */ + ret = 0; + goto bail; + + case QIB_IB_LINKDOWN_SLEEP: + dd->f_set_ib_cfg(ppd, QIB_IB_CFG_LSTATE, + IB_LINKCMD_DOWN | IB_LINKINITCMD_SLEEP); + /* don't wait */ + ret = 0; + goto bail; + + case QIB_IB_LINKDOWN_DISABLE: + dd->f_set_ib_cfg(ppd, QIB_IB_CFG_LSTATE, + IB_LINKCMD_DOWN | IB_LINKINITCMD_DISABLE); + /* don't wait */ + ret = 0; + goto bail; + + case QIB_IB_LINKARM: + if (ppd->lflags & QIBL_LINKARMED) { + ret = 0; + goto bail; + } + if (!(ppd->lflags & (QIBL_LINKINIT | QIBL_LINKACTIVE))) { + ret = -EINVAL; + goto bail; + } + /* + * Since the port can be ACTIVE when we ask for ARMED, + * clear QIBL_LINKV so we can wait for a transition. + * If the link isn't ARMED, then something else happened + * and there is no point waiting for ARMED. + */ + spin_lock_irqsave(&ppd->lflags_lock, flags); + ppd->lflags &= ~QIBL_LINKV; + spin_unlock_irqrestore(&ppd->lflags_lock, flags); + dd->f_set_ib_cfg(ppd, QIB_IB_CFG_LSTATE, + IB_LINKCMD_ARMED | IB_LINKINITCMD_NOP); + lstate = QIBL_LINKV; + break; + + case QIB_IB_LINKACTIVE: + if (ppd->lflags & QIBL_LINKACTIVE) { + ret = 0; + goto bail; + } + if (!(ppd->lflags & QIBL_LINKARMED)) { + ret = -EINVAL; + goto bail; + } + dd->f_set_ib_cfg(ppd, QIB_IB_CFG_LSTATE, + IB_LINKCMD_ACTIVE | IB_LINKINITCMD_NOP); + lstate = QIBL_LINKACTIVE; + break; + + default: + ret = -EINVAL; + goto bail; + } + ret = qib_wait_linkstate(ppd, lstate, 10); + +bail: + return ret; +} + +/* + * Get address of eager buffer from it's index (allocated in chunks, not + * contiguous). + */ +static inline void *qib_get_egrbuf(const struct qib_ctxtdata *rcd, u32 etail) +{ + const u32 chunk = etail >> rcd->rcvegrbufs_perchunk_shift; + const u32 idx = etail & ((u32)rcd->rcvegrbufs_perchunk - 1); + + return rcd->rcvegrbuf[chunk] + (idx << rcd->dd->rcvegrbufsize_shift); +} + +/* + * Returns 1 if error was a CRC, else 0. + * Needed for some chip's synthesized error counters. + */ +static u32 qib_rcv_hdrerr(struct qib_ctxtdata *rcd, struct qib_pportdata *ppd, + u32 ctxt, u32 eflags, u32 l, u32 etail, + __le32 *rhf_addr, struct qib_message_header *rhdr) +{ + u32 ret = 0; + + if (eflags & (QLOGIC_IB_RHF_H_ICRCERR | QLOGIC_IB_RHF_H_VCRCERR)) + ret = 1; + else if (eflags == QLOGIC_IB_RHF_H_TIDERR) { + /* For TIDERR and RC QPs premptively schedule a NAK */ + struct qib_ib_header *hdr = (struct qib_ib_header *) rhdr; + struct qib_other_headers *ohdr = NULL; + struct qib_ibport *ibp = &ppd->ibport_data; + struct qib_qp *qp = NULL; + u32 tlen = qib_hdrget_length_in_bytes(rhf_addr); + u16 lid = be16_to_cpu(hdr->lrh[1]); + int lnh = be16_to_cpu(hdr->lrh[0]) & 3; + u32 qp_num; + u32 opcode; + u32 psn; + int diff; + + /* Sanity check packet */ + if (tlen < 24) + goto drop; + + if (lid < QIB_MULTICAST_LID_BASE) { + lid &= ~((1 << ppd->lmc) - 1); + if (unlikely(lid != ppd->lid)) + goto drop; + } + + /* Check for GRH */ + if (lnh == QIB_LRH_BTH) + ohdr = &hdr->u.oth; + else if (lnh == QIB_LRH_GRH) { + u32 vtf; + + ohdr = &hdr->u.l.oth; + if (hdr->u.l.grh.next_hdr != IB_GRH_NEXT_HDR) + goto drop; + vtf = be32_to_cpu(hdr->u.l.grh.version_tclass_flow); + if ((vtf >> IB_GRH_VERSION_SHIFT) != IB_GRH_VERSION) + goto drop; + } else + goto drop; + + /* Get opcode and PSN from packet */ + opcode = be32_to_cpu(ohdr->bth[0]); + opcode >>= 24; + psn = be32_to_cpu(ohdr->bth[2]); + + /* Get the destination QP number. */ + qp_num = be32_to_cpu(ohdr->bth[1]) & QIB_QPN_MASK; + if (qp_num != QIB_MULTICAST_QPN) { + int ruc_res; + qp = qib_lookup_qpn(ibp, qp_num); + if (!qp) + goto drop; + + /* + * Handle only RC QPs - for other QP types drop error + * packet. + */ + spin_lock(&qp->r_lock); + + /* Check for valid receive state. */ + if (!(ib_qib_state_ops[qp->state] & + QIB_PROCESS_RECV_OK)) { + ibp->n_pkt_drops++; + goto unlock; + } + + switch (qp->ibqp.qp_type) { + case IB_QPT_RC: + ruc_res = + qib_ruc_check_hdr( + ibp, hdr, + lnh == QIB_LRH_GRH, + qp, + be32_to_cpu(ohdr->bth[0])); + if (ruc_res) + goto unlock; + + /* Only deal with RDMA Writes for now */ + if (opcode < + IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST) { + diff = qib_cmp24(psn, qp->r_psn); + if (!qp->r_nak_state && diff >= 0) { + ibp->n_rc_seqnak++; + qp->r_nak_state = + IB_NAK_PSN_ERROR; + /* Use the expected PSN. */ + qp->r_ack_psn = qp->r_psn; + /* + * Wait to send the sequence + * NAK until all packets + * in the receive queue have + * been processed. + * Otherwise, we end up + * propagating congestion. + */ + if (list_empty(&qp->rspwait)) { + qp->r_flags |= + QIB_R_RSP_NAK; + atomic_inc( + &qp->refcount); + list_add_tail( + &qp->rspwait, + &rcd->qp_wait_list); + } + } /* Out of sequence NAK */ + } /* QP Request NAKs */ + break; + case IB_QPT_SMI: + case IB_QPT_GSI: + case IB_QPT_UD: + case IB_QPT_UC: + default: + /* For now don't handle any other QP types */ + break; + } + +unlock: + spin_unlock(&qp->r_lock); + /* + * Notify qib_destroy_qp() if it is waiting + * for us to finish. + */ + if (atomic_dec_and_test(&qp->refcount)) + wake_up(&qp->wait); + } /* Unicast QP */ + } /* Valid packet with TIDErr */ + +drop: + return ret; +} + +/* + * qib_kreceive - receive a packet + * @rcd: the qlogic_ib context + * @llic: gets count of good packets needed to clear lli, + * (used with chips that need need to track crcs for lli) + * + * called from interrupt handler for errors or receive interrupt + * Returns number of CRC error packets, needed by some chips for + * local link integrity tracking. crcs are adjusted down by following + * good packets, if any, and count of good packets is also tracked. + */ +u32 qib_kreceive(struct qib_ctxtdata *rcd, u32 *llic, u32 *npkts) +{ + struct qib_devdata *dd = rcd->dd; + struct qib_pportdata *ppd = rcd->ppd; + __le32 *rhf_addr; + void *ebuf; + const u32 rsize = dd->rcvhdrentsize; /* words */ + const u32 maxcnt = dd->rcvhdrcnt * rsize; /* words */ + u32 etail = -1, l, hdrqtail; + struct qib_message_header *hdr; + u32 eflags, etype, tlen, i = 0, updegr = 0, crcs = 0; + int last; + u64 lval; + struct qib_qp *qp, *nqp; + + l = rcd->head; + rhf_addr = (__le32 *) rcd->rcvhdrq + l + dd->rhf_offset; + if (dd->flags & QIB_NODMA_RTAIL) { + u32 seq = qib_hdrget_seq(rhf_addr); + if (seq != rcd->seq_cnt) + goto bail; + hdrqtail = 0; + } else { + hdrqtail = qib_get_rcvhdrtail(rcd); + if (l == hdrqtail) + goto bail; + smp_rmb(); /* prevent speculative reads of dma'ed hdrq */ + } + + for (last = 0, i = 1; !last; i += !last) { + hdr = dd->f_get_msgheader(dd, rhf_addr); + eflags = qib_hdrget_err_flags(rhf_addr); + etype = qib_hdrget_rcv_type(rhf_addr); + /* total length */ + tlen = qib_hdrget_length_in_bytes(rhf_addr); + ebuf = NULL; + if ((dd->flags & QIB_NODMA_RTAIL) ? + qib_hdrget_use_egr_buf(rhf_addr) : + (etype != RCVHQ_RCV_TYPE_EXPECTED)) { + etail = qib_hdrget_index(rhf_addr); + updegr = 1; + if (tlen > sizeof(*hdr) || + etype >= RCVHQ_RCV_TYPE_NON_KD) { + ebuf = qib_get_egrbuf(rcd, etail); + prefetch_range(ebuf, tlen - sizeof(*hdr)); + } + } + if (!eflags) { + u16 lrh_len = be16_to_cpu(hdr->lrh[2]) << 2; + + if (lrh_len != tlen) { + qib_stats.sps_lenerrs++; + goto move_along; + } + } + if (etype == RCVHQ_RCV_TYPE_NON_KD && !eflags && + ebuf == NULL && + tlen > (dd->rcvhdrentsize - 2 + 1 - + qib_hdrget_offset(rhf_addr)) << 2) { + goto move_along; + } + + /* + * Both tiderr and qibhdrerr are set for all plain IB + * packets; only qibhdrerr should be set. + */ + if (unlikely(eflags)) + crcs += qib_rcv_hdrerr(rcd, ppd, rcd->ctxt, eflags, l, + etail, rhf_addr, hdr); + else if (etype == RCVHQ_RCV_TYPE_NON_KD) { + qib_ib_rcv(rcd, hdr, ebuf, tlen); + if (crcs) + crcs--; + else if (llic && *llic) + --*llic; + } +move_along: + l += rsize; + if (l >= maxcnt) + l = 0; + if (i == QIB_MAX_PKT_RECV) + last = 1; + + rhf_addr = (__le32 *) rcd->rcvhdrq + l + dd->rhf_offset; + if (dd->flags & QIB_NODMA_RTAIL) { + u32 seq = qib_hdrget_seq(rhf_addr); + + if (++rcd->seq_cnt > 13) + rcd->seq_cnt = 1; + if (seq != rcd->seq_cnt) + last = 1; + } else if (l == hdrqtail) + last = 1; + /* + * Update head regs etc., every 16 packets, if not last pkt, + * to help prevent rcvhdrq overflows, when many packets + * are processed and queue is nearly full. + * Don't request an interrupt for intermediate updates. + */ + lval = l; + if (!last && !(i & 0xf)) { + dd->f_update_usrhead(rcd, lval, updegr, etail, i); + updegr = 0; + } + } + /* + * Notify qib_destroy_qp() if it is waiting + * for lookaside_qp to finish. + */ + if (rcd->lookaside_qp) { + if (atomic_dec_and_test(&rcd->lookaside_qp->refcount)) + wake_up(&rcd->lookaside_qp->wait); + rcd->lookaside_qp = NULL; + } + + rcd->head = l; + + /* + * Iterate over all QPs waiting to respond. + * The list won't change since the IRQ is only run on one CPU. + */ + list_for_each_entry_safe(qp, nqp, &rcd->qp_wait_list, rspwait) { + list_del_init(&qp->rspwait); + if (qp->r_flags & QIB_R_RSP_NAK) { + qp->r_flags &= ~QIB_R_RSP_NAK; + qib_send_rc_ack(qp); + } + if (qp->r_flags & QIB_R_RSP_SEND) { + unsigned long flags; + + qp->r_flags &= ~QIB_R_RSP_SEND; + spin_lock_irqsave(&qp->s_lock, flags); + if (ib_qib_state_ops[qp->state] & + QIB_PROCESS_OR_FLUSH_SEND) + qib_schedule_send(qp); + spin_unlock_irqrestore(&qp->s_lock, flags); + } + if (atomic_dec_and_test(&qp->refcount)) + wake_up(&qp->wait); + } + +bail: + /* Report number of packets consumed */ + if (npkts) + *npkts = i; + + /* + * Always write head at end, and setup rcv interrupt, even + * if no packets were processed. + */ + lval = (u64)rcd->head | dd->rhdrhead_intr_off; + dd->f_update_usrhead(rcd, lval, updegr, etail, i); + return crcs; +} + +/** + * qib_set_mtu - set the MTU + * @ppd: the perport data + * @arg: the new MTU + * + * We can handle "any" incoming size, the issue here is whether we + * need to restrict our outgoing size. For now, we don't do any + * sanity checking on this, and we don't deal with what happens to + * programs that are already running when the size changes. + * NOTE: changing the MTU will usually cause the IBC to go back to + * link INIT state... + */ +int qib_set_mtu(struct qib_pportdata *ppd, u16 arg) +{ + u32 piosize; + int ret, chk; + + if (arg != 256 && arg != 512 && arg != 1024 && arg != 2048 && + arg != 4096) { + ret = -EINVAL; + goto bail; + } + chk = ib_mtu_enum_to_int(qib_ibmtu); + if (chk > 0 && arg > chk) { + ret = -EINVAL; + goto bail; + } + + piosize = ppd->ibmaxlen; + ppd->ibmtu = arg; + + if (arg >= (piosize - QIB_PIO_MAXIBHDR)) { + /* Only if it's not the initial value (or reset to it) */ + if (piosize != ppd->init_ibmaxlen) { + if (arg > piosize && arg <= ppd->init_ibmaxlen) + piosize = ppd->init_ibmaxlen - 2 * sizeof(u32); + ppd->ibmaxlen = piosize; + } + } else if ((arg + QIB_PIO_MAXIBHDR) != ppd->ibmaxlen) { + piosize = arg + QIB_PIO_MAXIBHDR - 2 * sizeof(u32); + ppd->ibmaxlen = piosize; + } + + ppd->dd->f_set_ib_cfg(ppd, QIB_IB_CFG_MTU, 0); + + ret = 0; + +bail: + return ret; +} + +int qib_set_lid(struct qib_pportdata *ppd, u32 lid, u8 lmc) +{ + struct qib_devdata *dd = ppd->dd; + ppd->lid = lid; + ppd->lmc = lmc; + + dd->f_set_ib_cfg(ppd, QIB_IB_CFG_LIDLMC, + lid | (~((1U << lmc) - 1)) << 16); + + qib_devinfo(dd->pcidev, "IB%u:%u got a lid: 0x%x\n", + dd->unit, ppd->port, lid); + + return 0; +} + +/* + * Following deal with the "obviously simple" task of overriding the state + * of the LEDS, which normally indicate link physical and logical status. + * The complications arise in dealing with different hardware mappings + * and the board-dependent routine being called from interrupts. + * and then there's the requirement to _flash_ them. + */ +#define LED_OVER_FREQ_SHIFT 8 +#define LED_OVER_FREQ_MASK (0xFF<dd; + int timeoff; + int ph_idx; + + if (!(dd->flags & QIB_INITTED)) + return; + + ph_idx = ppd->led_override_phase++ & 1; + ppd->led_override = ppd->led_override_vals[ph_idx]; + timeoff = ppd->led_override_timeoff; + + dd->f_setextled(ppd, 1); + /* + * don't re-fire the timer if user asked for it to be off; we let + * it fire one more time after they turn it off to simplify + */ + if (ppd->led_override_vals[0] || ppd->led_override_vals[1]) + mod_timer(&ppd->led_override_timer, jiffies + timeoff); +} + +void qib_set_led_override(struct qib_pportdata *ppd, unsigned int val) +{ + struct qib_devdata *dd = ppd->dd; + int timeoff, freq; + + if (!(dd->flags & QIB_INITTED)) + return; + + /* First check if we are blinking. If not, use 1HZ polling */ + timeoff = HZ; + freq = (val & LED_OVER_FREQ_MASK) >> LED_OVER_FREQ_SHIFT; + + if (freq) { + /* For blink, set each phase from one nybble of val */ + ppd->led_override_vals[0] = val & 0xF; + ppd->led_override_vals[1] = (val >> 4) & 0xF; + timeoff = (HZ << 4)/freq; + } else { + /* Non-blink set both phases the same. */ + ppd->led_override_vals[0] = val & 0xF; + ppd->led_override_vals[1] = val & 0xF; + } + ppd->led_override_timeoff = timeoff; + + /* + * If the timer has not already been started, do so. Use a "quick" + * timeout so the function will be called soon, to look at our request. + */ + if (atomic_inc_return(&ppd->led_override_timer_active) == 1) { + /* Need to start timer */ + init_timer(&ppd->led_override_timer); + ppd->led_override_timer.function = qib_run_led_override; + ppd->led_override_timer.data = (unsigned long) ppd; + ppd->led_override_timer.expires = jiffies + 1; + add_timer(&ppd->led_override_timer); + } else { + if (ppd->led_override_vals[0] || ppd->led_override_vals[1]) + mod_timer(&ppd->led_override_timer, jiffies + 1); + atomic_dec(&ppd->led_override_timer_active); + } +} + +/** + * qib_reset_device - reset the chip if possible + * @unit: the device to reset + * + * Whether or not reset is successful, we attempt to re-initialize the chip + * (that is, much like a driver unload/reload). We clear the INITTED flag + * so that the various entry points will fail until we reinitialize. For + * now, we only allow this if no user contexts are open that use chip resources + */ +int qib_reset_device(int unit) +{ + int ret, i; + struct qib_devdata *dd = qib_lookup(unit); + struct qib_pportdata *ppd; + unsigned long flags; + int pidx; + + if (!dd) { + ret = -ENODEV; + goto bail; + } + + qib_devinfo(dd->pcidev, "Reset on unit %u requested\n", unit); + + if (!dd->kregbase || !(dd->flags & QIB_PRESENT)) { + qib_devinfo(dd->pcidev, + "Invalid unit number %u or not initialized or not present\n", + unit); + ret = -ENXIO; + goto bail; + } + + spin_lock_irqsave(&dd->uctxt_lock, flags); + if (dd->rcd) + for (i = dd->first_user_ctxt; i < dd->cfgctxts; i++) { + if (!dd->rcd[i] || !dd->rcd[i]->cnt) + continue; + spin_unlock_irqrestore(&dd->uctxt_lock, flags); + ret = -EBUSY; + goto bail; + } + spin_unlock_irqrestore(&dd->uctxt_lock, flags); + + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + ppd = dd->pport + pidx; + if (atomic_read(&ppd->led_override_timer_active)) { + /* Need to stop LED timer, _then_ shut off LEDs */ + del_timer_sync(&ppd->led_override_timer); + atomic_set(&ppd->led_override_timer_active, 0); + } + + /* Shut off LEDs after we are sure timer is not running */ + ppd->led_override = LED_OVER_BOTH_OFF; + dd->f_setextled(ppd, 0); + if (dd->flags & QIB_HAS_SEND_DMA) + qib_teardown_sdma(ppd); + } + + ret = dd->f_reset(dd); + if (ret == 1) + ret = qib_init(dd, 1); + else + ret = -EAGAIN; + if (ret) + qib_dev_err(dd, + "Reinitialize unit %u after reset failed with %d\n", + unit, ret); + else + qib_devinfo(dd->pcidev, + "Reinitialized unit %u after resetting\n", + unit); + +bail: + return ret; +} diff --git a/drivers/infiniband/hw/qib/qib_eeprom.c b/drivers/infiniband/hw/qib/qib_eeprom.c new file mode 100644 index 0000000..4d5d71a --- /dev/null +++ b/drivers/infiniband/hw/qib/qib_eeprom.c @@ -0,0 +1,456 @@ +/* + * Copyright (c) 2012 Intel Corporation. All rights reserved. + * Copyright (c) 2006 - 2012 QLogic Corporation. All rights reserved. + * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include + +#include "qib.h" + +/* + * Functions specific to the serial EEPROM on cards handled by ib_qib. + * The actual serail interface code is in qib_twsi.c. This file is a client + */ + +/** + * qib_eeprom_read - receives bytes from the eeprom via I2C + * @dd: the qlogic_ib device + * @eeprom_offset: address to read from + * @buffer: where to store result + * @len: number of bytes to receive + */ +int qib_eeprom_read(struct qib_devdata *dd, u8 eeprom_offset, + void *buff, int len) +{ + int ret; + + ret = mutex_lock_interruptible(&dd->eep_lock); + if (!ret) { + ret = qib_twsi_reset(dd); + if (ret) + qib_dev_err(dd, "EEPROM Reset for read failed\n"); + else + ret = qib_twsi_blk_rd(dd, dd->twsi_eeprom_dev, + eeprom_offset, buff, len); + mutex_unlock(&dd->eep_lock); + } + + return ret; +} + +/* + * Actually update the eeprom, first doing write enable if + * needed, then restoring write enable state. + * Must be called with eep_lock held + */ +static int eeprom_write_with_enable(struct qib_devdata *dd, u8 offset, + const void *buf, int len) +{ + int ret, pwen; + + pwen = dd->f_eeprom_wen(dd, 1); + ret = qib_twsi_reset(dd); + if (ret) + qib_dev_err(dd, "EEPROM Reset for write failed\n"); + else + ret = qib_twsi_blk_wr(dd, dd->twsi_eeprom_dev, + offset, buf, len); + dd->f_eeprom_wen(dd, pwen); + return ret; +} + +/** + * qib_eeprom_write - writes data to the eeprom via I2C + * @dd: the qlogic_ib device + * @eeprom_offset: where to place data + * @buffer: data to write + * @len: number of bytes to write + */ +int qib_eeprom_write(struct qib_devdata *dd, u8 eeprom_offset, + const void *buff, int len) +{ + int ret; + + ret = mutex_lock_interruptible(&dd->eep_lock); + if (!ret) { + ret = eeprom_write_with_enable(dd, eeprom_offset, buff, len); + mutex_unlock(&dd->eep_lock); + } + + return ret; +} + +static u8 flash_csum(struct qib_flash *ifp, int adjust) +{ + u8 *ip = (u8 *) ifp; + u8 csum = 0, len; + + /* + * Limit length checksummed to max length of actual data. + * Checksum of erased eeprom will still be bad, but we avoid + * reading past the end of the buffer we were passed. + */ + len = ifp->if_length; + if (len > sizeof(struct qib_flash)) + len = sizeof(struct qib_flash); + while (len--) + csum += *ip++; + csum -= ifp->if_csum; + csum = ~csum; + if (adjust) + ifp->if_csum = csum; + + return csum; +} + +/** + * qib_get_eeprom_info- get the GUID et al. from the TSWI EEPROM device + * @dd: the qlogic_ib device + * + * We have the capability to use the nguid field, and get + * the guid from the first chip's flash, to use for all of them. + */ +void qib_get_eeprom_info(struct qib_devdata *dd) +{ + void *buf; + struct qib_flash *ifp; + __be64 guid; + int len, eep_stat; + u8 csum, *bguid; + int t = dd->unit; + struct qib_devdata *dd0 = qib_lookup(0); + + if (t && dd0->nguid > 1 && t <= dd0->nguid) { + u8 oguid; + dd->base_guid = dd0->base_guid; + bguid = (u8 *) &dd->base_guid; + + oguid = bguid[7]; + bguid[7] += t; + if (oguid > bguid[7]) { + if (bguid[6] == 0xff) { + if (bguid[5] == 0xff) { + qib_dev_err(dd, + "Can't set %s GUID from base, wraps to OUI!\n", + qib_get_unit_name(t)); + dd->base_guid = 0; + goto bail; + } + bguid[5]++; + } + bguid[6]++; + } + dd->nguid = 1; + goto bail; + } + + /* + * Read full flash, not just currently used part, since it may have + * been written with a newer definition. + * */ + len = sizeof(struct qib_flash); + buf = vmalloc(len); + if (!buf) { + qib_dev_err(dd, + "Couldn't allocate memory to read %u bytes from eeprom for GUID\n", + len); + goto bail; + } + + /* + * Use "public" eeprom read function, which does locking and + * figures out device. This will migrate to chip-specific. + */ + eep_stat = qib_eeprom_read(dd, 0, buf, len); + + if (eep_stat) { + qib_dev_err(dd, "Failed reading GUID from eeprom\n"); + goto done; + } + ifp = (struct qib_flash *)buf; + + csum = flash_csum(ifp, 0); + if (csum != ifp->if_csum) { + qib_devinfo(dd->pcidev, + "Bad I2C flash checksum: 0x%x, not 0x%x\n", + csum, ifp->if_csum); + goto done; + } + if (*(__be64 *) ifp->if_guid == cpu_to_be64(0) || + *(__be64 *) ifp->if_guid == ~cpu_to_be64(0)) { + qib_dev_err(dd, + "Invalid GUID %llx from flash; ignoring\n", + *(unsigned long long *) ifp->if_guid); + /* don't allow GUID if all 0 or all 1's */ + goto done; + } + + /* complain, but allow it */ + if (*(u64 *) ifp->if_guid == 0x100007511000000ULL) + qib_devinfo(dd->pcidev, + "Warning, GUID %llx is default, probably not correct!\n", + *(unsigned long long *) ifp->if_guid); + + bguid = ifp->if_guid; + if (!bguid[0] && !bguid[1] && !bguid[2]) { + /* + * Original incorrect GUID format in flash; fix in + * core copy, by shifting up 2 octets; don't need to + * change top octet, since both it and shifted are 0. + */ + bguid[1] = bguid[3]; + bguid[2] = bguid[4]; + bguid[3] = 0; + bguid[4] = 0; + guid = *(__be64 *) ifp->if_guid; + } else + guid = *(__be64 *) ifp->if_guid; + dd->base_guid = guid; + dd->nguid = ifp->if_numguid; + /* + * Things are slightly complicated by the desire to transparently + * support both the Pathscale 10-digit serial number and the QLogic + * 13-character version. + */ + if ((ifp->if_fversion > 1) && ifp->if_sprefix[0] && + ((u8 *) ifp->if_sprefix)[0] != 0xFF) { + char *snp = dd->serial; + + /* + * This board has a Serial-prefix, which is stored + * elsewhere for backward-compatibility. + */ + memcpy(snp, ifp->if_sprefix, sizeof ifp->if_sprefix); + snp[sizeof ifp->if_sprefix] = '\0'; + len = strlen(snp); + snp += len; + len = (sizeof dd->serial) - len; + if (len > sizeof ifp->if_serial) + len = sizeof ifp->if_serial; + memcpy(snp, ifp->if_serial, len); + } else + memcpy(dd->serial, ifp->if_serial, + sizeof ifp->if_serial); + if (!strstr(ifp->if_comment, "Tested successfully")) + qib_dev_err(dd, + "Board SN %s did not pass functional test: %s\n", + dd->serial, ifp->if_comment); + + memcpy(&dd->eep_st_errs, &ifp->if_errcntp, QIB_EEP_LOG_CNT); + /* + * Power-on (actually "active") hours are kept as little-endian value + * in EEPROM, but as seconds in a (possibly as small as 24-bit) + * atomic_t while running. + */ + atomic_set(&dd->active_time, 0); + dd->eep_hrs = ifp->if_powerhour[0] | (ifp->if_powerhour[1] << 8); + +done: + vfree(buf); + +bail:; +} + +/** + * qib_update_eeprom_log - copy active-time and error counters to eeprom + * @dd: the qlogic_ib device + * + * Although the time is kept as seconds in the qib_devdata struct, it is + * rounded to hours for re-write, as we have only 16 bits in EEPROM. + * First-cut code reads whole (expected) struct qib_flash, modifies, + * re-writes. Future direction: read/write only what we need, assuming + * that the EEPROM had to have been "good enough" for driver init, and + * if not, we aren't making it worse. + * + */ +int qib_update_eeprom_log(struct qib_devdata *dd) +{ + void *buf; + struct qib_flash *ifp; + int len, hi_water; + uint32_t new_time, new_hrs; + u8 csum; + int ret, idx; + unsigned long flags; + + /* first, check if we actually need to do anything. */ + ret = 0; + for (idx = 0; idx < QIB_EEP_LOG_CNT; ++idx) { + if (dd->eep_st_new_errs[idx]) { + ret = 1; + break; + } + } + new_time = atomic_read(&dd->active_time); + + if (ret == 0 && new_time < 3600) + goto bail; + + /* + * The quick-check above determined that there is something worthy + * of logging, so get current contents and do a more detailed idea. + * read full flash, not just currently used part, since it may have + * been written with a newer definition + */ + len = sizeof(struct qib_flash); + buf = vmalloc(len); + ret = 1; + if (!buf) { + qib_dev_err(dd, + "Couldn't allocate memory to read %u bytes from eeprom for logging\n", + len); + goto bail; + } + + /* Grab semaphore and read current EEPROM. If we get an + * error, let go, but if not, keep it until we finish write. + */ + ret = mutex_lock_interruptible(&dd->eep_lock); + if (ret) { + qib_dev_err(dd, "Unable to acquire EEPROM for logging\n"); + goto free_bail; + } + ret = qib_twsi_blk_rd(dd, dd->twsi_eeprom_dev, 0, buf, len); + if (ret) { + mutex_unlock(&dd->eep_lock); + qib_dev_err(dd, "Unable read EEPROM for logging\n"); + goto free_bail; + } + ifp = (struct qib_flash *)buf; + + csum = flash_csum(ifp, 0); + if (csum != ifp->if_csum) { + mutex_unlock(&dd->eep_lock); + qib_dev_err(dd, "EEPROM cks err (0x%02X, S/B 0x%02X)\n", + csum, ifp->if_csum); + ret = 1; + goto free_bail; + } + hi_water = 0; + spin_lock_irqsave(&dd->eep_st_lock, flags); + for (idx = 0; idx < QIB_EEP_LOG_CNT; ++idx) { + int new_val = dd->eep_st_new_errs[idx]; + if (new_val) { + /* + * If we have seen any errors, add to EEPROM values + * We need to saturate at 0xFF (255) and we also + * would need to adjust the checksum if we were + * trying to minimize EEPROM traffic + * Note that we add to actual current count in EEPROM, + * in case it was altered while we were running. + */ + new_val += ifp->if_errcntp[idx]; + if (new_val > 0xFF) + new_val = 0xFF; + if (ifp->if_errcntp[idx] != new_val) { + ifp->if_errcntp[idx] = new_val; + hi_water = offsetof(struct qib_flash, + if_errcntp) + idx; + } + /* + * update our shadow (used to minimize EEPROM + * traffic), to match what we are about to write. + */ + dd->eep_st_errs[idx] = new_val; + dd->eep_st_new_errs[idx] = 0; + } + } + /* + * Now update active-time. We would like to round to the nearest hour + * but unless atomic_t are sure to be proper signed ints we cannot, + * because we need to account for what we "transfer" to EEPROM and + * if we log an hour at 31 minutes, then we would need to set + * active_time to -29 to accurately count the _next_ hour. + */ + if (new_time >= 3600) { + new_hrs = new_time / 3600; + atomic_sub((new_hrs * 3600), &dd->active_time); + new_hrs += dd->eep_hrs; + if (new_hrs > 0xFFFF) + new_hrs = 0xFFFF; + dd->eep_hrs = new_hrs; + if ((new_hrs & 0xFF) != ifp->if_powerhour[0]) { + ifp->if_powerhour[0] = new_hrs & 0xFF; + hi_water = offsetof(struct qib_flash, if_powerhour); + } + if ((new_hrs >> 8) != ifp->if_powerhour[1]) { + ifp->if_powerhour[1] = new_hrs >> 8; + hi_water = offsetof(struct qib_flash, if_powerhour) + 1; + } + } + /* + * There is a tiny possibility that we could somehow fail to write + * the EEPROM after updating our shadows, but problems from holding + * the spinlock too long are a much bigger issue. + */ + spin_unlock_irqrestore(&dd->eep_st_lock, flags); + if (hi_water) { + /* we made some change to the data, uopdate cksum and write */ + csum = flash_csum(ifp, 1); + ret = eeprom_write_with_enable(dd, 0, buf, hi_water + 1); + } + mutex_unlock(&dd->eep_lock); + if (ret) + qib_dev_err(dd, "Failed updating EEPROM\n"); + +free_bail: + vfree(buf); +bail: + return ret; +} + +/** + * qib_inc_eeprom_err - increment one of the four error counters + * that are logged to EEPROM. + * @dd: the qlogic_ib device + * @eidx: 0..3, the counter to increment + * @incr: how much to add + * + * Each counter is 8-bits, and saturates at 255 (0xFF). They + * are copied to the EEPROM (aka flash) whenever qib_update_eeprom_log() + * is called, but it can only be called in a context that allows sleep. + * This function can be called even at interrupt level. + */ +void qib_inc_eeprom_err(struct qib_devdata *dd, u32 eidx, u32 incr) +{ + uint new_val; + unsigned long flags; + + spin_lock_irqsave(&dd->eep_st_lock, flags); + new_val = dd->eep_st_new_errs[eidx] + incr; + if (new_val > 255) + new_val = 255; + dd->eep_st_new_errs[eidx] = new_val; + spin_unlock_irqrestore(&dd->eep_st_lock, flags); +} diff --git a/drivers/infiniband/hw/qib/qib_file_ops.c b/drivers/infiniband/hw/qib/qib_file_ops.c new file mode 100644 index 0000000..b15e34e --- /dev/null +++ b/drivers/infiniband/hw/qib/qib_file_ops.c @@ -0,0 +1,2410 @@ +/* + * Copyright (c) 2012, 2013 Intel Corporation. All rights reserved. + * Copyright (c) 2006 - 2012 QLogic Corporation. All rights reserved. + * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "qib.h" +#include "qib_common.h" +#include "qib_user_sdma.h" + +#undef pr_fmt +#define pr_fmt(fmt) QIB_DRV_NAME ": " fmt + +static int qib_open(struct inode *, struct file *); +static int qib_close(struct inode *, struct file *); +static ssize_t qib_write(struct file *, const char __user *, size_t, loff_t *); +static ssize_t qib_aio_write(struct kiocb *, const struct iovec *, + unsigned long, loff_t); +static unsigned int qib_poll(struct file *, struct poll_table_struct *); +static int qib_mmapf(struct file *, struct vm_area_struct *); + +static const struct file_operations qib_file_ops = { + .owner = THIS_MODULE, + .write = qib_write, + .aio_write = qib_aio_write, + .open = qib_open, + .release = qib_close, + .poll = qib_poll, + .mmap = qib_mmapf, + .llseek = noop_llseek, +}; + +/* + * Convert kernel virtual addresses to physical addresses so they don't + * potentially conflict with the chip addresses used as mmap offsets. + * It doesn't really matter what mmap offset we use as long as we can + * interpret it correctly. + */ +static u64 cvt_kvaddr(void *p) +{ + struct page *page; + u64 paddr = 0; + + page = vmalloc_to_page(p); + if (page) + paddr = page_to_pfn(page) << PAGE_SHIFT; + + return paddr; +} + +static int qib_get_base_info(struct file *fp, void __user *ubase, + size_t ubase_size) +{ + struct qib_ctxtdata *rcd = ctxt_fp(fp); + int ret = 0; + struct qib_base_info *kinfo = NULL; + struct qib_devdata *dd = rcd->dd; + struct qib_pportdata *ppd = rcd->ppd; + unsigned subctxt_cnt; + int shared, master; + size_t sz; + + subctxt_cnt = rcd->subctxt_cnt; + if (!subctxt_cnt) { + shared = 0; + master = 0; + subctxt_cnt = 1; + } else { + shared = 1; + master = !subctxt_fp(fp); + } + + sz = sizeof(*kinfo); + /* If context sharing is not requested, allow the old size structure */ + if (!shared) + sz -= 7 * sizeof(u64); + if (ubase_size < sz) { + ret = -EINVAL; + goto bail; + } + + kinfo = kzalloc(sizeof(*kinfo), GFP_KERNEL); + if (kinfo == NULL) { + ret = -ENOMEM; + goto bail; + } + + ret = dd->f_get_base_info(rcd, kinfo); + if (ret < 0) + goto bail; + + kinfo->spi_rcvhdr_cnt = dd->rcvhdrcnt; + kinfo->spi_rcvhdrent_size = dd->rcvhdrentsize; + kinfo->spi_tidegrcnt = rcd->rcvegrcnt; + kinfo->spi_rcv_egrbufsize = dd->rcvegrbufsize; + /* + * have to mmap whole thing + */ + kinfo->spi_rcv_egrbuftotlen = + rcd->rcvegrbuf_chunks * rcd->rcvegrbuf_size; + kinfo->spi_rcv_egrperchunk = rcd->rcvegrbufs_perchunk; + kinfo->spi_rcv_egrchunksize = kinfo->spi_rcv_egrbuftotlen / + rcd->rcvegrbuf_chunks; + kinfo->spi_tidcnt = dd->rcvtidcnt / subctxt_cnt; + if (master) + kinfo->spi_tidcnt += dd->rcvtidcnt % subctxt_cnt; + /* + * for this use, may be cfgctxts summed over all chips that + * are are configured and present + */ + kinfo->spi_nctxts = dd->cfgctxts; + /* unit (chip/board) our context is on */ + kinfo->spi_unit = dd->unit; + kinfo->spi_port = ppd->port; + /* for now, only a single page */ + kinfo->spi_tid_maxsize = PAGE_SIZE; + + /* + * Doing this per context, and based on the skip value, etc. This has + * to be the actual buffer size, since the protocol code treats it + * as an array. + * + * These have to be set to user addresses in the user code via mmap. + * These values are used on return to user code for the mmap target + * addresses only. For 32 bit, same 44 bit address problem, so use + * the physical address, not virtual. Before 2.6.11, using the + * page_address() macro worked, but in 2.6.11, even that returns the + * full 64 bit address (upper bits all 1's). So far, using the + * physical addresses (or chip offsets, for chip mapping) works, but + * no doubt some future kernel release will change that, and we'll be + * on to yet another method of dealing with this. + * Normally only one of rcvhdr_tailaddr or rhf_offset is useful + * since the chips with non-zero rhf_offset don't normally + * enable tail register updates to host memory, but for testing, + * both can be enabled and used. + */ + kinfo->spi_rcvhdr_base = (u64) rcd->rcvhdrq_phys; + kinfo->spi_rcvhdr_tailaddr = (u64) rcd->rcvhdrqtailaddr_phys; + kinfo->spi_rhf_offset = dd->rhf_offset; + kinfo->spi_rcv_egrbufs = (u64) rcd->rcvegr_phys; + kinfo->spi_pioavailaddr = (u64) dd->pioavailregs_phys; + /* setup per-unit (not port) status area for user programs */ + kinfo->spi_status = (u64) kinfo->spi_pioavailaddr + + (char *) ppd->statusp - + (char *) dd->pioavailregs_dma; + kinfo->spi_uregbase = (u64) dd->uregbase + dd->ureg_align * rcd->ctxt; + if (!shared) { + kinfo->spi_piocnt = rcd->piocnt; + kinfo->spi_piobufbase = (u64) rcd->piobufs; + kinfo->spi_sendbuf_status = cvt_kvaddr(rcd->user_event_mask); + } else if (master) { + kinfo->spi_piocnt = (rcd->piocnt / subctxt_cnt) + + (rcd->piocnt % subctxt_cnt); + /* Master's PIO buffers are after all the slave's */ + kinfo->spi_piobufbase = (u64) rcd->piobufs + + dd->palign * + (rcd->piocnt - kinfo->spi_piocnt); + } else { + unsigned slave = subctxt_fp(fp) - 1; + + kinfo->spi_piocnt = rcd->piocnt / subctxt_cnt; + kinfo->spi_piobufbase = (u64) rcd->piobufs + + dd->palign * kinfo->spi_piocnt * slave; + } + + if (shared) { + kinfo->spi_sendbuf_status = + cvt_kvaddr(&rcd->user_event_mask[subctxt_fp(fp)]); + /* only spi_subctxt_* fields should be set in this block! */ + kinfo->spi_subctxt_uregbase = cvt_kvaddr(rcd->subctxt_uregbase); + + kinfo->spi_subctxt_rcvegrbuf = + cvt_kvaddr(rcd->subctxt_rcvegrbuf); + kinfo->spi_subctxt_rcvhdr_base = + cvt_kvaddr(rcd->subctxt_rcvhdr_base); + } + + /* + * All user buffers are 2KB buffers. If we ever support + * giving 4KB buffers to user processes, this will need some + * work. Can't use piobufbase directly, because it has + * both 2K and 4K buffer base values. + */ + kinfo->spi_pioindex = (kinfo->spi_piobufbase - dd->pio2k_bufbase) / + dd->palign; + kinfo->spi_pioalign = dd->palign; + kinfo->spi_qpair = QIB_KD_QP; + /* + * user mode PIO buffers are always 2KB, even when 4KB can + * be received, and sent via the kernel; this is ibmaxlen + * for 2K MTU. + */ + kinfo->spi_piosize = dd->piosize2k - 2 * sizeof(u32); + kinfo->spi_mtu = ppd->ibmaxlen; /* maxlen, not ibmtu */ + kinfo->spi_ctxt = rcd->ctxt; + kinfo->spi_subctxt = subctxt_fp(fp); + kinfo->spi_sw_version = QIB_KERN_SWVERSION; + kinfo->spi_sw_version |= 1U << 31; /* QLogic-built, not kernel.org */ + kinfo->spi_hw_version = dd->revision; + + if (master) + kinfo->spi_runtime_flags |= QIB_RUNTIME_MASTER; + + sz = (ubase_size < sizeof(*kinfo)) ? ubase_size : sizeof(*kinfo); + if (copy_to_user(ubase, kinfo, sz)) + ret = -EFAULT; +bail: + kfree(kinfo); + return ret; +} + +/** + * qib_tid_update - update a context TID + * @rcd: the context + * @fp: the qib device file + * @ti: the TID information + * + * The new implementation as of Oct 2004 is that the driver assigns + * the tid and returns it to the caller. To reduce search time, we + * keep a cursor for each context, walking the shadow tid array to find + * one that's not in use. + * + * For now, if we can't allocate the full list, we fail, although + * in the long run, we'll allocate as many as we can, and the + * caller will deal with that by trying the remaining pages later. + * That means that when we fail, we have to mark the tids as not in + * use again, in our shadow copy. + * + * It's up to the caller to free the tids when they are done. + * We'll unlock the pages as they free them. + * + * Also, right now we are locking one page at a time, but since + * the intended use of this routine is for a single group of + * virtually contiguous pages, that should change to improve + * performance. + */ +static int qib_tid_update(struct qib_ctxtdata *rcd, struct file *fp, + const struct qib_tid_info *ti) +{ + int ret = 0, ntids; + u32 tid, ctxttid, cnt, i, tidcnt, tidoff; + u16 *tidlist; + struct qib_devdata *dd = rcd->dd; + u64 physaddr; + unsigned long vaddr; + u64 __iomem *tidbase; + unsigned long tidmap[8]; + struct page **pagep = NULL; + unsigned subctxt = subctxt_fp(fp); + + if (!dd->pageshadow) { + ret = -ENOMEM; + goto done; + } + + cnt = ti->tidcnt; + if (!cnt) { + ret = -EFAULT; + goto done; + } + ctxttid = rcd->ctxt * dd->rcvtidcnt; + if (!rcd->subctxt_cnt) { + tidcnt = dd->rcvtidcnt; + tid = rcd->tidcursor; + tidoff = 0; + } else if (!subctxt) { + tidcnt = (dd->rcvtidcnt / rcd->subctxt_cnt) + + (dd->rcvtidcnt % rcd->subctxt_cnt); + tidoff = dd->rcvtidcnt - tidcnt; + ctxttid += tidoff; + tid = tidcursor_fp(fp); + } else { + tidcnt = dd->rcvtidcnt / rcd->subctxt_cnt; + tidoff = tidcnt * (subctxt - 1); + ctxttid += tidoff; + tid = tidcursor_fp(fp); + } + if (cnt > tidcnt) { + /* make sure it all fits in tid_pg_list */ + qib_devinfo(dd->pcidev, + "Process tried to allocate %u TIDs, only trying max (%u)\n", + cnt, tidcnt); + cnt = tidcnt; + } + pagep = (struct page **) rcd->tid_pg_list; + tidlist = (u16 *) &pagep[dd->rcvtidcnt]; + pagep += tidoff; + tidlist += tidoff; + + memset(tidmap, 0, sizeof(tidmap)); + /* before decrement; chip actual # */ + ntids = tidcnt; + tidbase = (u64 __iomem *) (((char __iomem *) dd->kregbase) + + dd->rcvtidbase + + ctxttid * sizeof(*tidbase)); + + /* virtual address of first page in transfer */ + vaddr = ti->tidvaddr; + if (!access_ok(VERIFY_WRITE, (void __user *) vaddr, + cnt * PAGE_SIZE)) { + ret = -EFAULT; + goto done; + } + ret = qib_get_user_pages(vaddr, cnt, pagep); + if (ret) { + /* + * if (ret == -EBUSY) + * We can't continue because the pagep array won't be + * initialized. This should never happen, + * unless perhaps the user has mpin'ed the pages + * themselves. + */ + qib_devinfo(dd->pcidev, + "Failed to lock addr %p, %u pages: " + "errno %d\n", (void *) vaddr, cnt, -ret); + goto done; + } + for (i = 0; i < cnt; i++, vaddr += PAGE_SIZE) { + for (; ntids--; tid++) { + if (tid == tidcnt) + tid = 0; + if (!dd->pageshadow[ctxttid + tid]) + break; + } + if (ntids < 0) { + /* + * Oops, wrapped all the way through their TIDs, + * and didn't have enough free; see comments at + * start of routine + */ + i--; /* last tidlist[i] not filled in */ + ret = -ENOMEM; + break; + } + tidlist[i] = tid + tidoff; + /* we "know" system pages and TID pages are same size */ + dd->pageshadow[ctxttid + tid] = pagep[i]; + dd->physshadow[ctxttid + tid] = + qib_map_page(dd->pcidev, pagep[i], 0, PAGE_SIZE, + PCI_DMA_FROMDEVICE); + /* + * don't need atomic or it's overhead + */ + __set_bit(tid, tidmap); + physaddr = dd->physshadow[ctxttid + tid]; + /* PERFORMANCE: below should almost certainly be cached */ + dd->f_put_tid(dd, &tidbase[tid], + RCVHQ_RCV_TYPE_EXPECTED, physaddr); + /* + * don't check this tid in qib_ctxtshadow, since we + * just filled it in; start with the next one. + */ + tid++; + } + + if (ret) { + u32 limit; +cleanup: + /* jump here if copy out of updated info failed... */ + /* same code that's in qib_free_tid() */ + limit = sizeof(tidmap) * BITS_PER_BYTE; + if (limit > tidcnt) + /* just in case size changes in future */ + limit = tidcnt; + tid = find_first_bit((const unsigned long *)tidmap, limit); + for (; tid < limit; tid++) { + if (!test_bit(tid, tidmap)) + continue; + if (dd->pageshadow[ctxttid + tid]) { + dma_addr_t phys; + + phys = dd->physshadow[ctxttid + tid]; + dd->physshadow[ctxttid + tid] = dd->tidinvalid; + /* PERFORMANCE: below should almost certainly + * be cached + */ + dd->f_put_tid(dd, &tidbase[tid], + RCVHQ_RCV_TYPE_EXPECTED, + dd->tidinvalid); + pci_unmap_page(dd->pcidev, phys, PAGE_SIZE, + PCI_DMA_FROMDEVICE); + dd->pageshadow[ctxttid + tid] = NULL; + } + } + qib_release_user_pages(pagep, cnt); + } else { + /* + * Copy the updated array, with qib_tid's filled in, back + * to user. Since we did the copy in already, this "should + * never fail" If it does, we have to clean up... + */ + if (copy_to_user((void __user *) + (unsigned long) ti->tidlist, + tidlist, cnt * sizeof(*tidlist))) { + ret = -EFAULT; + goto cleanup; + } + if (copy_to_user((void __user *) (unsigned long) ti->tidmap, + tidmap, sizeof tidmap)) { + ret = -EFAULT; + goto cleanup; + } + if (tid == tidcnt) + tid = 0; + if (!rcd->subctxt_cnt) + rcd->tidcursor = tid; + else + tidcursor_fp(fp) = tid; + } + +done: + return ret; +} + +/** + * qib_tid_free - free a context TID + * @rcd: the context + * @subctxt: the subcontext + * @ti: the TID info + * + * right now we are unlocking one page at a time, but since + * the intended use of this routine is for a single group of + * virtually contiguous pages, that should change to improve + * performance. We check that the TID is in range for this context + * but otherwise don't check validity; if user has an error and + * frees the wrong tid, it's only their own data that can thereby + * be corrupted. We do check that the TID was in use, for sanity + * We always use our idea of the saved address, not the address that + * they pass in to us. + */ +static int qib_tid_free(struct qib_ctxtdata *rcd, unsigned subctxt, + const struct qib_tid_info *ti) +{ + int ret = 0; + u32 tid, ctxttid, cnt, limit, tidcnt; + struct qib_devdata *dd = rcd->dd; + u64 __iomem *tidbase; + unsigned long tidmap[8]; + + if (!dd->pageshadow) { + ret = -ENOMEM; + goto done; + } + + if (copy_from_user(tidmap, (void __user *)(unsigned long)ti->tidmap, + sizeof tidmap)) { + ret = -EFAULT; + goto done; + } + + ctxttid = rcd->ctxt * dd->rcvtidcnt; + if (!rcd->subctxt_cnt) + tidcnt = dd->rcvtidcnt; + else if (!subctxt) { + tidcnt = (dd->rcvtidcnt / rcd->subctxt_cnt) + + (dd->rcvtidcnt % rcd->subctxt_cnt); + ctxttid += dd->rcvtidcnt - tidcnt; + } else { + tidcnt = dd->rcvtidcnt / rcd->subctxt_cnt; + ctxttid += tidcnt * (subctxt - 1); + } + tidbase = (u64 __iomem *) ((char __iomem *)(dd->kregbase) + + dd->rcvtidbase + + ctxttid * sizeof(*tidbase)); + + limit = sizeof(tidmap) * BITS_PER_BYTE; + if (limit > tidcnt) + /* just in case size changes in future */ + limit = tidcnt; + tid = find_first_bit(tidmap, limit); + for (cnt = 0; tid < limit; tid++) { + /* + * small optimization; if we detect a run of 3 or so without + * any set, use find_first_bit again. That's mainly to + * accelerate the case where we wrapped, so we have some at + * the beginning, and some at the end, and a big gap + * in the middle. + */ + if (!test_bit(tid, tidmap)) + continue; + cnt++; + if (dd->pageshadow[ctxttid + tid]) { + struct page *p; + dma_addr_t phys; + + p = dd->pageshadow[ctxttid + tid]; + dd->pageshadow[ctxttid + tid] = NULL; + phys = dd->physshadow[ctxttid + tid]; + dd->physshadow[ctxttid + tid] = dd->tidinvalid; + /* PERFORMANCE: below should almost certainly be + * cached + */ + dd->f_put_tid(dd, &tidbase[tid], + RCVHQ_RCV_TYPE_EXPECTED, dd->tidinvalid); + pci_unmap_page(dd->pcidev, phys, PAGE_SIZE, + PCI_DMA_FROMDEVICE); + qib_release_user_pages(&p, 1); + } + } +done: + return ret; +} + +/** + * qib_set_part_key - set a partition key + * @rcd: the context + * @key: the key + * + * We can have up to 4 active at a time (other than the default, which is + * always allowed). This is somewhat tricky, since multiple contexts may set + * the same key, so we reference count them, and clean up at exit. All 4 + * partition keys are packed into a single qlogic_ib register. It's an + * error for a process to set the same pkey multiple times. We provide no + * mechanism to de-allocate a pkey at this time, we may eventually need to + * do that. I've used the atomic operations, and no locking, and only make + * a single pass through what's available. This should be more than + * adequate for some time. I'll think about spinlocks or the like if and as + * it's necessary. + */ +static int qib_set_part_key(struct qib_ctxtdata *rcd, u16 key) +{ + struct qib_pportdata *ppd = rcd->ppd; + int i, any = 0, pidx = -1; + u16 lkey = key & 0x7FFF; + int ret; + + if (lkey == (QIB_DEFAULT_P_KEY & 0x7FFF)) { + /* nothing to do; this key always valid */ + ret = 0; + goto bail; + } + + if (!lkey) { + ret = -EINVAL; + goto bail; + } + + /* + * Set the full membership bit, because it has to be + * set in the register or the packet, and it seems + * cleaner to set in the register than to force all + * callers to set it. + */ + key |= 0x8000; + + for (i = 0; i < ARRAY_SIZE(rcd->pkeys); i++) { + if (!rcd->pkeys[i] && pidx == -1) + pidx = i; + if (rcd->pkeys[i] == key) { + ret = -EEXIST; + goto bail; + } + } + if (pidx == -1) { + ret = -EBUSY; + goto bail; + } + for (any = i = 0; i < ARRAY_SIZE(ppd->pkeys); i++) { + if (!ppd->pkeys[i]) { + any++; + continue; + } + if (ppd->pkeys[i] == key) { + atomic_t *pkrefs = &ppd->pkeyrefs[i]; + + if (atomic_inc_return(pkrefs) > 1) { + rcd->pkeys[pidx] = key; + ret = 0; + goto bail; + } else { + /* + * lost race, decrement count, catch below + */ + atomic_dec(pkrefs); + any++; + } + } + if ((ppd->pkeys[i] & 0x7FFF) == lkey) { + /* + * It makes no sense to have both the limited and + * full membership PKEY set at the same time since + * the unlimited one will disable the limited one. + */ + ret = -EEXIST; + goto bail; + } + } + if (!any) { + ret = -EBUSY; + goto bail; + } + for (any = i = 0; i < ARRAY_SIZE(ppd->pkeys); i++) { + if (!ppd->pkeys[i] && + atomic_inc_return(&ppd->pkeyrefs[i]) == 1) { + rcd->pkeys[pidx] = key; + ppd->pkeys[i] = key; + (void) ppd->dd->f_set_ib_cfg(ppd, QIB_IB_CFG_PKEYS, 0); + ret = 0; + goto bail; + } + } + ret = -EBUSY; + +bail: + return ret; +} + +/** + * qib_manage_rcvq - manage a context's receive queue + * @rcd: the context + * @subctxt: the subcontext + * @start_stop: action to carry out + * + * start_stop == 0 disables receive on the context, for use in queue + * overflow conditions. start_stop==1 re-enables, to be used to + * re-init the software copy of the head register + */ +static int qib_manage_rcvq(struct qib_ctxtdata *rcd, unsigned subctxt, + int start_stop) +{ + struct qib_devdata *dd = rcd->dd; + unsigned int rcvctrl_op; + + if (subctxt) + goto bail; + /* atomically clear receive enable ctxt. */ + if (start_stop) { + /* + * On enable, force in-memory copy of the tail register to + * 0, so that protocol code doesn't have to worry about + * whether or not the chip has yet updated the in-memory + * copy or not on return from the system call. The chip + * always resets it's tail register back to 0 on a + * transition from disabled to enabled. + */ + if (rcd->rcvhdrtail_kvaddr) + qib_clear_rcvhdrtail(rcd); + rcvctrl_op = QIB_RCVCTRL_CTXT_ENB; + } else + rcvctrl_op = QIB_RCVCTRL_CTXT_DIS; + dd->f_rcvctrl(rcd->ppd, rcvctrl_op, rcd->ctxt); + /* always; new head should be equal to new tail; see above */ +bail: + return 0; +} + +static void qib_clean_part_key(struct qib_ctxtdata *rcd, + struct qib_devdata *dd) +{ + int i, j, pchanged = 0; + u64 oldpkey; + struct qib_pportdata *ppd = rcd->ppd; + + /* for debugging only */ + oldpkey = (u64) ppd->pkeys[0] | + ((u64) ppd->pkeys[1] << 16) | + ((u64) ppd->pkeys[2] << 32) | + ((u64) ppd->pkeys[3] << 48); + + for (i = 0; i < ARRAY_SIZE(rcd->pkeys); i++) { + if (!rcd->pkeys[i]) + continue; + for (j = 0; j < ARRAY_SIZE(ppd->pkeys); j++) { + /* check for match independent of the global bit */ + if ((ppd->pkeys[j] & 0x7fff) != + (rcd->pkeys[i] & 0x7fff)) + continue; + if (atomic_dec_and_test(&ppd->pkeyrefs[j])) { + ppd->pkeys[j] = 0; + pchanged++; + } + break; + } + rcd->pkeys[i] = 0; + } + if (pchanged) + (void) ppd->dd->f_set_ib_cfg(ppd, QIB_IB_CFG_PKEYS, 0); +} + +/* common code for the mappings on dma_alloc_coherent mem */ +static int qib_mmap_mem(struct vm_area_struct *vma, struct qib_ctxtdata *rcd, + unsigned len, void *kvaddr, u32 write_ok, char *what) +{ + struct qib_devdata *dd = rcd->dd; + unsigned long pfn; + int ret; + + if ((vma->vm_end - vma->vm_start) > len) { + qib_devinfo(dd->pcidev, + "FAIL on %s: len %lx > %x\n", what, + vma->vm_end - vma->vm_start, len); + ret = -EFAULT; + goto bail; + } + + /* + * shared context user code requires rcvhdrq mapped r/w, others + * only allowed readonly mapping. + */ + if (!write_ok) { + if (vma->vm_flags & VM_WRITE) { + qib_devinfo(dd->pcidev, + "%s must be mapped readonly\n", what); + ret = -EPERM; + goto bail; + } + + /* don't allow them to later change with mprotect */ + vma->vm_flags &= ~VM_MAYWRITE; + } + + pfn = virt_to_phys(kvaddr) >> PAGE_SHIFT; + ret = remap_pfn_range(vma, vma->vm_start, pfn, + len, vma->vm_page_prot); + if (ret) + qib_devinfo(dd->pcidev, + "%s ctxt%u mmap of %lx, %x bytes failed: %d\n", + what, rcd->ctxt, pfn, len, ret); +bail: + return ret; +} + +static int mmap_ureg(struct vm_area_struct *vma, struct qib_devdata *dd, + u64 ureg) +{ + unsigned long phys; + unsigned long sz; + int ret; + + /* + * This is real hardware, so use io_remap. This is the mechanism + * for the user process to update the head registers for their ctxt + * in the chip. + */ + sz = dd->flags & QIB_HAS_HDRSUPP ? 2 * PAGE_SIZE : PAGE_SIZE; + if ((vma->vm_end - vma->vm_start) > sz) { + qib_devinfo(dd->pcidev, + "FAIL mmap userreg: reqlen %lx > PAGE\n", + vma->vm_end - vma->vm_start); + ret = -EFAULT; + } else { + phys = dd->physaddr + ureg; + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + + vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND; + ret = io_remap_pfn_range(vma, vma->vm_start, + phys >> PAGE_SHIFT, + vma->vm_end - vma->vm_start, + vma->vm_page_prot); + } + return ret; +} + +static int mmap_piobufs(struct vm_area_struct *vma, + struct qib_devdata *dd, + struct qib_ctxtdata *rcd, + unsigned piobufs, unsigned piocnt) +{ + unsigned long phys; + int ret; + + /* + * When we map the PIO buffers in the chip, we want to map them as + * writeonly, no read possible; unfortunately, x86 doesn't allow + * for this in hardware, but we still prevent users from asking + * for it. + */ + if ((vma->vm_end - vma->vm_start) > (piocnt * dd->palign)) { + qib_devinfo(dd->pcidev, + "FAIL mmap piobufs: reqlen %lx > PAGE\n", + vma->vm_end - vma->vm_start); + ret = -EINVAL; + goto bail; + } + + phys = dd->physaddr + piobufs; + +#if defined(__powerpc__) + /* There isn't a generic way to specify writethrough mappings */ + pgprot_val(vma->vm_page_prot) |= _PAGE_NO_CACHE; + pgprot_val(vma->vm_page_prot) |= _PAGE_WRITETHRU; + pgprot_val(vma->vm_page_prot) &= ~_PAGE_GUARDED; +#endif + + /* + * don't allow them to later change to readable with mprotect (for when + * not initially mapped readable, as is normally the case) + */ + vma->vm_flags &= ~VM_MAYREAD; + vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND; + + if (qib_wc_pat) + vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); + + ret = io_remap_pfn_range(vma, vma->vm_start, phys >> PAGE_SHIFT, + vma->vm_end - vma->vm_start, + vma->vm_page_prot); +bail: + return ret; +} + +static int mmap_rcvegrbufs(struct vm_area_struct *vma, + struct qib_ctxtdata *rcd) +{ + struct qib_devdata *dd = rcd->dd; + unsigned long start, size; + size_t total_size, i; + unsigned long pfn; + int ret; + + size = rcd->rcvegrbuf_size; + total_size = rcd->rcvegrbuf_chunks * size; + if ((vma->vm_end - vma->vm_start) > total_size) { + qib_devinfo(dd->pcidev, + "FAIL on egr bufs: reqlen %lx > actual %lx\n", + vma->vm_end - vma->vm_start, + (unsigned long) total_size); + ret = -EINVAL; + goto bail; + } + + if (vma->vm_flags & VM_WRITE) { + qib_devinfo(dd->pcidev, + "Can't map eager buffers as writable (flags=%lx)\n", + vma->vm_flags); + ret = -EPERM; + goto bail; + } + /* don't allow them to later change to writeable with mprotect */ + vma->vm_flags &= ~VM_MAYWRITE; + + start = vma->vm_start; + + for (i = 0; i < rcd->rcvegrbuf_chunks; i++, start += size) { + pfn = virt_to_phys(rcd->rcvegrbuf[i]) >> PAGE_SHIFT; + ret = remap_pfn_range(vma, start, pfn, size, + vma->vm_page_prot); + if (ret < 0) + goto bail; + } + ret = 0; + +bail: + return ret; +} + +/* + * qib_file_vma_fault - handle a VMA page fault. + */ +static int qib_file_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct page *page; + + page = vmalloc_to_page((void *)(vmf->pgoff << PAGE_SHIFT)); + if (!page) + return VM_FAULT_SIGBUS; + + get_page(page); + vmf->page = page; + + return 0; +} + +static struct vm_operations_struct qib_file_vm_ops = { + .fault = qib_file_vma_fault, +}; + +static int mmap_kvaddr(struct vm_area_struct *vma, u64 pgaddr, + struct qib_ctxtdata *rcd, unsigned subctxt) +{ + struct qib_devdata *dd = rcd->dd; + unsigned subctxt_cnt; + unsigned long len; + void *addr; + size_t size; + int ret = 0; + + subctxt_cnt = rcd->subctxt_cnt; + size = rcd->rcvegrbuf_chunks * rcd->rcvegrbuf_size; + + /* + * Each process has all the subctxt uregbase, rcvhdrq, and + * rcvegrbufs mmapped - as an array for all the processes, + * and also separately for this process. + */ + if (pgaddr == cvt_kvaddr(rcd->subctxt_uregbase)) { + addr = rcd->subctxt_uregbase; + size = PAGE_SIZE * subctxt_cnt; + } else if (pgaddr == cvt_kvaddr(rcd->subctxt_rcvhdr_base)) { + addr = rcd->subctxt_rcvhdr_base; + size = rcd->rcvhdrq_size * subctxt_cnt; + } else if (pgaddr == cvt_kvaddr(rcd->subctxt_rcvegrbuf)) { + addr = rcd->subctxt_rcvegrbuf; + size *= subctxt_cnt; + } else if (pgaddr == cvt_kvaddr(rcd->subctxt_uregbase + + PAGE_SIZE * subctxt)) { + addr = rcd->subctxt_uregbase + PAGE_SIZE * subctxt; + size = PAGE_SIZE; + } else if (pgaddr == cvt_kvaddr(rcd->subctxt_rcvhdr_base + + rcd->rcvhdrq_size * subctxt)) { + addr = rcd->subctxt_rcvhdr_base + + rcd->rcvhdrq_size * subctxt; + size = rcd->rcvhdrq_size; + } else if (pgaddr == cvt_kvaddr(&rcd->user_event_mask[subctxt])) { + addr = rcd->user_event_mask; + size = PAGE_SIZE; + } else if (pgaddr == cvt_kvaddr(rcd->subctxt_rcvegrbuf + + size * subctxt)) { + addr = rcd->subctxt_rcvegrbuf + size * subctxt; + /* rcvegrbufs are read-only on the slave */ + if (vma->vm_flags & VM_WRITE) { + qib_devinfo(dd->pcidev, + "Can't map eager buffers as " + "writable (flags=%lx)\n", vma->vm_flags); + ret = -EPERM; + goto bail; + } + /* + * Don't allow permission to later change to writeable + * with mprotect. + */ + vma->vm_flags &= ~VM_MAYWRITE; + } else + goto bail; + len = vma->vm_end - vma->vm_start; + if (len > size) { + ret = -EINVAL; + goto bail; + } + + vma->vm_pgoff = (unsigned long) addr >> PAGE_SHIFT; + vma->vm_ops = &qib_file_vm_ops; + vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + ret = 1; + +bail: + return ret; +} + +/** + * qib_mmapf - mmap various structures into user space + * @fp: the file pointer + * @vma: the VM area + * + * We use this to have a shared buffer between the kernel and the user code + * for the rcvhdr queue, egr buffers, and the per-context user regs and pio + * buffers in the chip. We have the open and close entries so we can bump + * the ref count and keep the driver from being unloaded while still mapped. + */ +static int qib_mmapf(struct file *fp, struct vm_area_struct *vma) +{ + struct qib_ctxtdata *rcd; + struct qib_devdata *dd; + u64 pgaddr, ureg; + unsigned piobufs, piocnt; + int ret, match = 1; + + rcd = ctxt_fp(fp); + if (!rcd || !(vma->vm_flags & VM_SHARED)) { + ret = -EINVAL; + goto bail; + } + dd = rcd->dd; + + /* + * This is the qib_do_user_init() code, mapping the shared buffers + * and per-context user registers into the user process. The address + * referred to by vm_pgoff is the file offset passed via mmap(). + * For shared contexts, this is the kernel vmalloc() address of the + * pages to share with the master. + * For non-shared or master ctxts, this is a physical address. + * We only do one mmap for each space mapped. + */ + pgaddr = vma->vm_pgoff << PAGE_SHIFT; + + /* + * Check for 0 in case one of the allocations failed, but user + * called mmap anyway. + */ + if (!pgaddr) { + ret = -EINVAL; + goto bail; + } + + /* + * Physical addresses must fit in 40 bits for our hardware. + * Check for kernel virtual addresses first, anything else must + * match a HW or memory address. + */ + ret = mmap_kvaddr(vma, pgaddr, rcd, subctxt_fp(fp)); + if (ret) { + if (ret > 0) + ret = 0; + goto bail; + } + + ureg = dd->uregbase + dd->ureg_align * rcd->ctxt; + if (!rcd->subctxt_cnt) { + /* ctxt is not shared */ + piocnt = rcd->piocnt; + piobufs = rcd->piobufs; + } else if (!subctxt_fp(fp)) { + /* caller is the master */ + piocnt = (rcd->piocnt / rcd->subctxt_cnt) + + (rcd->piocnt % rcd->subctxt_cnt); + piobufs = rcd->piobufs + + dd->palign * (rcd->piocnt - piocnt); + } else { + unsigned slave = subctxt_fp(fp) - 1; + + /* caller is a slave */ + piocnt = rcd->piocnt / rcd->subctxt_cnt; + piobufs = rcd->piobufs + dd->palign * piocnt * slave; + } + + if (pgaddr == ureg) + ret = mmap_ureg(vma, dd, ureg); + else if (pgaddr == piobufs) + ret = mmap_piobufs(vma, dd, rcd, piobufs, piocnt); + else if (pgaddr == dd->pioavailregs_phys) + /* in-memory copy of pioavail registers */ + ret = qib_mmap_mem(vma, rcd, PAGE_SIZE, + (void *) dd->pioavailregs_dma, 0, + "pioavail registers"); + else if (pgaddr == rcd->rcvegr_phys) + ret = mmap_rcvegrbufs(vma, rcd); + else if (pgaddr == (u64) rcd->rcvhdrq_phys) + /* + * The rcvhdrq itself; multiple pages, contiguous + * from an i/o perspective. Shared contexts need + * to map r/w, so we allow writing. + */ + ret = qib_mmap_mem(vma, rcd, rcd->rcvhdrq_size, + rcd->rcvhdrq, 1, "rcvhdrq"); + else if (pgaddr == (u64) rcd->rcvhdrqtailaddr_phys) + /* in-memory copy of rcvhdrq tail register */ + ret = qib_mmap_mem(vma, rcd, PAGE_SIZE, + rcd->rcvhdrtail_kvaddr, 0, + "rcvhdrq tail"); + else + match = 0; + if (!match) + ret = -EINVAL; + + vma->vm_private_data = NULL; + + if (ret < 0) + qib_devinfo(dd->pcidev, + "mmap Failure %d: off %llx len %lx\n", + -ret, (unsigned long long)pgaddr, + vma->vm_end - vma->vm_start); +bail: + return ret; +} + +static unsigned int qib_poll_urgent(struct qib_ctxtdata *rcd, + struct file *fp, + struct poll_table_struct *pt) +{ + struct qib_devdata *dd = rcd->dd; + unsigned pollflag; + + poll_wait(fp, &rcd->wait, pt); + + spin_lock_irq(&dd->uctxt_lock); + if (rcd->urgent != rcd->urgent_poll) { + pollflag = POLLIN | POLLRDNORM; + rcd->urgent_poll = rcd->urgent; + } else { + pollflag = 0; + set_bit(QIB_CTXT_WAITING_URG, &rcd->flag); + } + spin_unlock_irq(&dd->uctxt_lock); + + return pollflag; +} + +static unsigned int qib_poll_next(struct qib_ctxtdata *rcd, + struct file *fp, + struct poll_table_struct *pt) +{ + struct qib_devdata *dd = rcd->dd; + unsigned pollflag; + + poll_wait(fp, &rcd->wait, pt); + + spin_lock_irq(&dd->uctxt_lock); + if (dd->f_hdrqempty(rcd)) { + set_bit(QIB_CTXT_WAITING_RCV, &rcd->flag); + dd->f_rcvctrl(rcd->ppd, QIB_RCVCTRL_INTRAVAIL_ENB, rcd->ctxt); + pollflag = 0; + } else + pollflag = POLLIN | POLLRDNORM; + spin_unlock_irq(&dd->uctxt_lock); + + return pollflag; +} + +static unsigned int qib_poll(struct file *fp, struct poll_table_struct *pt) +{ + struct qib_ctxtdata *rcd; + unsigned pollflag; + + rcd = ctxt_fp(fp); + if (!rcd) + pollflag = POLLERR; + else if (rcd->poll_type == QIB_POLL_TYPE_URGENT) + pollflag = qib_poll_urgent(rcd, fp, pt); + else if (rcd->poll_type == QIB_POLL_TYPE_ANYRCV) + pollflag = qib_poll_next(rcd, fp, pt); + else /* invalid */ + pollflag = POLLERR; + + return pollflag; +} + +static void assign_ctxt_affinity(struct file *fp, struct qib_devdata *dd) +{ + struct qib_filedata *fd = fp->private_data; + const unsigned int weight = cpumask_weight(¤t->cpus_allowed); + const struct cpumask *local_mask = cpumask_of_pcibus(dd->pcidev->bus); + int local_cpu; + + /* + * If process has NOT already set it's affinity, select and + * reserve a processor for it on the local NUMA node. + */ + if ((weight >= qib_cpulist_count) && + (cpumask_weight(local_mask) <= qib_cpulist_count)) { + for_each_cpu(local_cpu, local_mask) + if (!test_and_set_bit(local_cpu, qib_cpulist)) { + fd->rec_cpu_num = local_cpu; + return; + } + } + + /* + * If process has NOT already set it's affinity, select and + * reserve a processor for it, as a rendevous for all + * users of the driver. If they don't actually later + * set affinity to this cpu, or set it to some other cpu, + * it just means that sooner or later we don't recommend + * a cpu, and let the scheduler do it's best. + */ + if (weight >= qib_cpulist_count) { + int cpu; + cpu = find_first_zero_bit(qib_cpulist, + qib_cpulist_count); + if (cpu == qib_cpulist_count) + qib_dev_err(dd, + "no cpus avail for affinity PID %u\n", + current->pid); + else { + __set_bit(cpu, qib_cpulist); + fd->rec_cpu_num = cpu; + } + } +} + +/* + * Check that userland and driver are compatible for subcontexts. + */ +static int qib_compatible_subctxts(int user_swmajor, int user_swminor) +{ + /* this code is written long-hand for clarity */ + if (QIB_USER_SWMAJOR != user_swmajor) { + /* no promise of compatibility if major mismatch */ + return 0; + } + if (QIB_USER_SWMAJOR == 1) { + switch (QIB_USER_SWMINOR) { + case 0: + case 1: + case 2: + /* no subctxt implementation so cannot be compatible */ + return 0; + case 3: + /* 3 is only compatible with itself */ + return user_swminor == 3; + default: + /* >= 4 are compatible (or are expected to be) */ + return user_swminor <= QIB_USER_SWMINOR; + } + } + /* make no promises yet for future major versions */ + return 0; +} + +static int init_subctxts(struct qib_devdata *dd, + struct qib_ctxtdata *rcd, + const struct qib_user_info *uinfo) +{ + int ret = 0; + unsigned num_subctxts; + size_t size; + + /* + * If the user is requesting zero subctxts, + * skip the subctxt allocation. + */ + if (uinfo->spu_subctxt_cnt <= 0) + goto bail; + num_subctxts = uinfo->spu_subctxt_cnt; + + /* Check for subctxt compatibility */ + if (!qib_compatible_subctxts(uinfo->spu_userversion >> 16, + uinfo->spu_userversion & 0xffff)) { + qib_devinfo(dd->pcidev, + "Mismatched user version (%d.%d) and driver " + "version (%d.%d) while context sharing. Ensure " + "that driver and library are from the same " + "release.\n", + (int) (uinfo->spu_userversion >> 16), + (int) (uinfo->spu_userversion & 0xffff), + QIB_USER_SWMAJOR, QIB_USER_SWMINOR); + goto bail; + } + if (num_subctxts > QLOGIC_IB_MAX_SUBCTXT) { + ret = -EINVAL; + goto bail; + } + + rcd->subctxt_uregbase = vmalloc_user(PAGE_SIZE * num_subctxts); + if (!rcd->subctxt_uregbase) { + ret = -ENOMEM; + goto bail; + } + /* Note: rcd->rcvhdrq_size isn't initialized yet. */ + size = ALIGN(dd->rcvhdrcnt * dd->rcvhdrentsize * + sizeof(u32), PAGE_SIZE) * num_subctxts; + rcd->subctxt_rcvhdr_base = vmalloc_user(size); + if (!rcd->subctxt_rcvhdr_base) { + ret = -ENOMEM; + goto bail_ureg; + } + + rcd->subctxt_rcvegrbuf = vmalloc_user(rcd->rcvegrbuf_chunks * + rcd->rcvegrbuf_size * + num_subctxts); + if (!rcd->subctxt_rcvegrbuf) { + ret = -ENOMEM; + goto bail_rhdr; + } + + rcd->subctxt_cnt = uinfo->spu_subctxt_cnt; + rcd->subctxt_id = uinfo->spu_subctxt_id; + rcd->active_slaves = 1; + rcd->redirect_seq_cnt = 1; + set_bit(QIB_CTXT_MASTER_UNINIT, &rcd->flag); + goto bail; + +bail_rhdr: + vfree(rcd->subctxt_rcvhdr_base); +bail_ureg: + vfree(rcd->subctxt_uregbase); + rcd->subctxt_uregbase = NULL; +bail: + return ret; +} + +static int setup_ctxt(struct qib_pportdata *ppd, int ctxt, + struct file *fp, const struct qib_user_info *uinfo) +{ + struct qib_filedata *fd = fp->private_data; + struct qib_devdata *dd = ppd->dd; + struct qib_ctxtdata *rcd; + void *ptmp = NULL; + int ret; + int numa_id; + + assign_ctxt_affinity(fp, dd); + + numa_id = qib_numa_aware ? ((fd->rec_cpu_num != -1) ? + cpu_to_node(fd->rec_cpu_num) : + numa_node_id()) : dd->assigned_node_id; + + rcd = qib_create_ctxtdata(ppd, ctxt, numa_id); + + /* + * Allocate memory for use in qib_tid_update() at open to + * reduce cost of expected send setup per message segment + */ + if (rcd) + ptmp = kmalloc(dd->rcvtidcnt * sizeof(u16) + + dd->rcvtidcnt * sizeof(struct page **), + GFP_KERNEL); + + if (!rcd || !ptmp) { + qib_dev_err(dd, + "Unable to allocate ctxtdata memory, failing open\n"); + ret = -ENOMEM; + goto bailerr; + } + rcd->userversion = uinfo->spu_userversion; + ret = init_subctxts(dd, rcd, uinfo); + if (ret) + goto bailerr; + rcd->tid_pg_list = ptmp; + rcd->pid = current->pid; + init_waitqueue_head(&dd->rcd[ctxt]->wait); + strlcpy(rcd->comm, current->comm, sizeof(rcd->comm)); + ctxt_fp(fp) = rcd; + qib_stats.sps_ctxts++; + dd->freectxts--; + ret = 0; + goto bail; + +bailerr: + if (fd->rec_cpu_num != -1) + __clear_bit(fd->rec_cpu_num, qib_cpulist); + + dd->rcd[ctxt] = NULL; + kfree(rcd); + kfree(ptmp); +bail: + return ret; +} + +static inline int usable(struct qib_pportdata *ppd) +{ + struct qib_devdata *dd = ppd->dd; + + return dd && (dd->flags & QIB_PRESENT) && dd->kregbase && ppd->lid && + (ppd->lflags & QIBL_LINKACTIVE); +} + +/* + * Select a context on the given device, either using a requested port + * or the port based on the context number. + */ +static int choose_port_ctxt(struct file *fp, struct qib_devdata *dd, u32 port, + const struct qib_user_info *uinfo) +{ + struct qib_pportdata *ppd = NULL; + int ret, ctxt; + + if (port) { + if (!usable(dd->pport + port - 1)) { + ret = -ENETDOWN; + goto done; + } else + ppd = dd->pport + port - 1; + } + for (ctxt = dd->first_user_ctxt; ctxt < dd->cfgctxts && dd->rcd[ctxt]; + ctxt++) + ; + if (ctxt == dd->cfgctxts) { + ret = -EBUSY; + goto done; + } + if (!ppd) { + u32 pidx = ctxt % dd->num_pports; + if (usable(dd->pport + pidx)) + ppd = dd->pport + pidx; + else { + for (pidx = 0; pidx < dd->num_pports && !ppd; + pidx++) + if (usable(dd->pport + pidx)) + ppd = dd->pport + pidx; + } + } + ret = ppd ? setup_ctxt(ppd, ctxt, fp, uinfo) : -ENETDOWN; +done: + return ret; +} + +static int find_free_ctxt(int unit, struct file *fp, + const struct qib_user_info *uinfo) +{ + struct qib_devdata *dd = qib_lookup(unit); + int ret; + + if (!dd || (uinfo->spu_port && uinfo->spu_port > dd->num_pports)) + ret = -ENODEV; + else + ret = choose_port_ctxt(fp, dd, uinfo->spu_port, uinfo); + + return ret; +} + +static int get_a_ctxt(struct file *fp, const struct qib_user_info *uinfo, + unsigned alg) +{ + struct qib_devdata *udd = NULL; + int ret = 0, devmax, npresent, nup, ndev, dusable = 0, i; + u32 port = uinfo->spu_port, ctxt; + + devmax = qib_count_units(&npresent, &nup); + if (!npresent) { + ret = -ENXIO; + goto done; + } + if (nup == 0) { + ret = -ENETDOWN; + goto done; + } + + if (alg == QIB_PORT_ALG_ACROSS) { + unsigned inuse = ~0U; + /* find device (with ACTIVE ports) with fewest ctxts in use */ + for (ndev = 0; ndev < devmax; ndev++) { + struct qib_devdata *dd = qib_lookup(ndev); + unsigned cused = 0, cfree = 0, pusable = 0; + if (!dd) + continue; + if (port && port <= dd->num_pports && + usable(dd->pport + port - 1)) + pusable = 1; + else + for (i = 0; i < dd->num_pports; i++) + if (usable(dd->pport + i)) + pusable++; + if (!pusable) + continue; + for (ctxt = dd->first_user_ctxt; ctxt < dd->cfgctxts; + ctxt++) + if (dd->rcd[ctxt]) + cused++; + else + cfree++; + if (cfree && cused < inuse) { + udd = dd; + inuse = cused; + } + } + if (udd) { + ret = choose_port_ctxt(fp, udd, port, uinfo); + goto done; + } + } else { + for (ndev = 0; ndev < devmax; ndev++) { + struct qib_devdata *dd = qib_lookup(ndev); + if (dd) { + ret = choose_port_ctxt(fp, dd, port, uinfo); + if (!ret) + goto done; + if (ret == -EBUSY) + dusable++; + } + } + } + ret = dusable ? -EBUSY : -ENETDOWN; + +done: + return ret; +} + +static int find_shared_ctxt(struct file *fp, + const struct qib_user_info *uinfo) +{ + int devmax, ndev, i; + int ret = 0; + + devmax = qib_count_units(NULL, NULL); + + for (ndev = 0; ndev < devmax; ndev++) { + struct qib_devdata *dd = qib_lookup(ndev); + + /* device portion of usable() */ + if (!(dd && (dd->flags & QIB_PRESENT) && dd->kregbase)) + continue; + for (i = dd->first_user_ctxt; i < dd->cfgctxts; i++) { + struct qib_ctxtdata *rcd = dd->rcd[i]; + + /* Skip ctxts which are not yet open */ + if (!rcd || !rcd->cnt) + continue; + /* Skip ctxt if it doesn't match the requested one */ + if (rcd->subctxt_id != uinfo->spu_subctxt_id) + continue; + /* Verify the sharing process matches the master */ + if (rcd->subctxt_cnt != uinfo->spu_subctxt_cnt || + rcd->userversion != uinfo->spu_userversion || + rcd->cnt >= rcd->subctxt_cnt) { + ret = -EINVAL; + goto done; + } + ctxt_fp(fp) = rcd; + subctxt_fp(fp) = rcd->cnt++; + rcd->subpid[subctxt_fp(fp)] = current->pid; + tidcursor_fp(fp) = 0; + rcd->active_slaves |= 1 << subctxt_fp(fp); + ret = 1; + goto done; + } + } + +done: + return ret; +} + +static int qib_open(struct inode *in, struct file *fp) +{ + /* The real work is performed later in qib_assign_ctxt() */ + fp->private_data = kzalloc(sizeof(struct qib_filedata), GFP_KERNEL); + if (fp->private_data) /* no cpu affinity by default */ + ((struct qib_filedata *)fp->private_data)->rec_cpu_num = -1; + return fp->private_data ? 0 : -ENOMEM; +} + +static int find_hca(unsigned int cpu, int *unit) +{ + int ret = 0, devmax, npresent, nup, ndev; + + *unit = -1; + + devmax = qib_count_units(&npresent, &nup); + if (!npresent) { + ret = -ENXIO; + goto done; + } + if (!nup) { + ret = -ENETDOWN; + goto done; + } + for (ndev = 0; ndev < devmax; ndev++) { + struct qib_devdata *dd = qib_lookup(ndev); + if (dd) { + if (pcibus_to_node(dd->pcidev->bus) < 0) { + ret = -EINVAL; + goto done; + } + if (cpu_to_node(cpu) == + pcibus_to_node(dd->pcidev->bus)) { + *unit = ndev; + goto done; + } + } + } +done: + return ret; +} + +static int do_qib_user_sdma_queue_create(struct file *fp) +{ + struct qib_filedata *fd = fp->private_data; + struct qib_ctxtdata *rcd = fd->rcd; + struct qib_devdata *dd = rcd->dd; + + if (dd->flags & QIB_HAS_SEND_DMA) { + + fd->pq = qib_user_sdma_queue_create(&dd->pcidev->dev, + dd->unit, + rcd->ctxt, + fd->subctxt); + if (!fd->pq) + return -ENOMEM; + } + + return 0; +} + +/* + * Get ctxt early, so can set affinity prior to memory allocation. + */ +static int qib_assign_ctxt(struct file *fp, const struct qib_user_info *uinfo) +{ + int ret; + int i_minor; + unsigned swmajor, swminor, alg = QIB_PORT_ALG_ACROSS; + + /* Check to be sure we haven't already initialized this file */ + if (ctxt_fp(fp)) { + ret = -EINVAL; + goto done; + } + + /* for now, if major version is different, bail */ + swmajor = uinfo->spu_userversion >> 16; + if (swmajor != QIB_USER_SWMAJOR) { + ret = -ENODEV; + goto done; + } + + swminor = uinfo->spu_userversion & 0xffff; + + if (swminor >= 11 && uinfo->spu_port_alg < QIB_PORT_ALG_COUNT) + alg = uinfo->spu_port_alg; + + mutex_lock(&qib_mutex); + + if (qib_compatible_subctxts(swmajor, swminor) && + uinfo->spu_subctxt_cnt) { + ret = find_shared_ctxt(fp, uinfo); + if (ret > 0) { + ret = do_qib_user_sdma_queue_create(fp); + if (!ret) + assign_ctxt_affinity(fp, (ctxt_fp(fp))->dd); + goto done_ok; + } + } + + i_minor = iminor(file_inode(fp)) - QIB_USER_MINOR_BASE; + if (i_minor) + ret = find_free_ctxt(i_minor - 1, fp, uinfo); + else { + int unit; + const unsigned int cpu = cpumask_first(¤t->cpus_allowed); + const unsigned int weight = + cpumask_weight(¤t->cpus_allowed); + + if (weight == 1 && !test_bit(cpu, qib_cpulist)) + if (!find_hca(cpu, &unit) && unit >= 0) + if (!find_free_ctxt(unit, fp, uinfo)) { + ret = 0; + goto done_chk_sdma; + } + ret = get_a_ctxt(fp, uinfo, alg); + } + +done_chk_sdma: + if (!ret) + ret = do_qib_user_sdma_queue_create(fp); +done_ok: + mutex_unlock(&qib_mutex); + +done: + return ret; +} + + +static int qib_do_user_init(struct file *fp, + const struct qib_user_info *uinfo) +{ + int ret; + struct qib_ctxtdata *rcd = ctxt_fp(fp); + struct qib_devdata *dd; + unsigned uctxt; + + /* Subctxts don't need to initialize anything since master did it. */ + if (subctxt_fp(fp)) { + ret = wait_event_interruptible(rcd->wait, + !test_bit(QIB_CTXT_MASTER_UNINIT, &rcd->flag)); + goto bail; + } + + dd = rcd->dd; + + /* some ctxts may get extra buffers, calculate that here */ + uctxt = rcd->ctxt - dd->first_user_ctxt; + if (uctxt < dd->ctxts_extrabuf) { + rcd->piocnt = dd->pbufsctxt + 1; + rcd->pio_base = rcd->piocnt * uctxt; + } else { + rcd->piocnt = dd->pbufsctxt; + rcd->pio_base = rcd->piocnt * uctxt + + dd->ctxts_extrabuf; + } + + /* + * All user buffers are 2KB buffers. If we ever support + * giving 4KB buffers to user processes, this will need some + * work. Can't use piobufbase directly, because it has + * both 2K and 4K buffer base values. So check and handle. + */ + if ((rcd->pio_base + rcd->piocnt) > dd->piobcnt2k) { + if (rcd->pio_base >= dd->piobcnt2k) { + qib_dev_err(dd, + "%u:ctxt%u: no 2KB buffers available\n", + dd->unit, rcd->ctxt); + ret = -ENOBUFS; + goto bail; + } + rcd->piocnt = dd->piobcnt2k - rcd->pio_base; + qib_dev_err(dd, "Ctxt%u: would use 4KB bufs, using %u\n", + rcd->ctxt, rcd->piocnt); + } + + rcd->piobufs = dd->pio2k_bufbase + rcd->pio_base * dd->palign; + qib_chg_pioavailkernel(dd, rcd->pio_base, rcd->piocnt, + TXCHK_CHG_TYPE_USER, rcd); + /* + * try to ensure that processes start up with consistent avail update + * for their own range, at least. If system very quiet, it might + * have the in-memory copy out of date at startup for this range of + * buffers, when a context gets re-used. Do after the chg_pioavail + * and before the rest of setup, so it's "almost certain" the dma + * will have occurred (can't 100% guarantee, but should be many + * decimals of 9s, with this ordering), given how much else happens + * after this. + */ + dd->f_sendctrl(dd->pport, QIB_SENDCTRL_AVAIL_BLIP); + + /* + * Now allocate the rcvhdr Q and eager TIDs; skip the TID + * array for time being. If rcd->ctxt > chip-supported, + * we need to do extra stuff here to handle by handling overflow + * through ctxt 0, someday + */ + ret = qib_create_rcvhdrq(dd, rcd); + if (!ret) + ret = qib_setup_eagerbufs(rcd); + if (ret) + goto bail_pio; + + rcd->tidcursor = 0; /* start at beginning after open */ + + /* initialize poll variables... */ + rcd->urgent = 0; + rcd->urgent_poll = 0; + + /* + * Now enable the ctxt for receive. + * For chips that are set to DMA the tail register to memory + * when they change (and when the update bit transitions from + * 0 to 1. So for those chips, we turn it off and then back on. + * This will (very briefly) affect any other open ctxts, but the + * duration is very short, and therefore isn't an issue. We + * explicitly set the in-memory tail copy to 0 beforehand, so we + * don't have to wait to be sure the DMA update has happened + * (chip resets head/tail to 0 on transition to enable). + */ + if (rcd->rcvhdrtail_kvaddr) + qib_clear_rcvhdrtail(rcd); + + dd->f_rcvctrl(rcd->ppd, QIB_RCVCTRL_CTXT_ENB | QIB_RCVCTRL_TIDFLOW_ENB, + rcd->ctxt); + + /* Notify any waiting slaves */ + if (rcd->subctxt_cnt) { + clear_bit(QIB_CTXT_MASTER_UNINIT, &rcd->flag); + wake_up(&rcd->wait); + } + return 0; + +bail_pio: + qib_chg_pioavailkernel(dd, rcd->pio_base, rcd->piocnt, + TXCHK_CHG_TYPE_KERN, rcd); +bail: + return ret; +} + +/** + * unlock_exptid - unlock any expected TID entries context still had in use + * @rcd: ctxt + * + * We don't actually update the chip here, because we do a bulk update + * below, using f_clear_tids. + */ +static void unlock_expected_tids(struct qib_ctxtdata *rcd) +{ + struct qib_devdata *dd = rcd->dd; + int ctxt_tidbase = rcd->ctxt * dd->rcvtidcnt; + int i, cnt = 0, maxtid = ctxt_tidbase + dd->rcvtidcnt; + + for (i = ctxt_tidbase; i < maxtid; i++) { + struct page *p = dd->pageshadow[i]; + dma_addr_t phys; + + if (!p) + continue; + + phys = dd->physshadow[i]; + dd->physshadow[i] = dd->tidinvalid; + dd->pageshadow[i] = NULL; + pci_unmap_page(dd->pcidev, phys, PAGE_SIZE, + PCI_DMA_FROMDEVICE); + qib_release_user_pages(&p, 1); + cnt++; + } +} + +static int qib_close(struct inode *in, struct file *fp) +{ + int ret = 0; + struct qib_filedata *fd; + struct qib_ctxtdata *rcd; + struct qib_devdata *dd; + unsigned long flags; + unsigned ctxt; + pid_t pid; + + mutex_lock(&qib_mutex); + + fd = fp->private_data; + fp->private_data = NULL; + rcd = fd->rcd; + if (!rcd) { + mutex_unlock(&qib_mutex); + goto bail; + } + + dd = rcd->dd; + + /* ensure all pio buffer writes in progress are flushed */ + qib_flush_wc(); + + /* drain user sdma queue */ + if (fd->pq) { + qib_user_sdma_queue_drain(rcd->ppd, fd->pq); + qib_user_sdma_queue_destroy(fd->pq); + } + + if (fd->rec_cpu_num != -1) + __clear_bit(fd->rec_cpu_num, qib_cpulist); + + if (--rcd->cnt) { + /* + * XXX If the master closes the context before the slave(s), + * revoke the mmap for the eager receive queue so + * the slave(s) don't wait for receive data forever. + */ + rcd->active_slaves &= ~(1 << fd->subctxt); + rcd->subpid[fd->subctxt] = 0; + mutex_unlock(&qib_mutex); + goto bail; + } + + /* early; no interrupt users after this */ + spin_lock_irqsave(&dd->uctxt_lock, flags); + ctxt = rcd->ctxt; + dd->rcd[ctxt] = NULL; + pid = rcd->pid; + rcd->pid = 0; + spin_unlock_irqrestore(&dd->uctxt_lock, flags); + + if (rcd->rcvwait_to || rcd->piowait_to || + rcd->rcvnowait || rcd->pionowait) { + rcd->rcvwait_to = 0; + rcd->piowait_to = 0; + rcd->rcvnowait = 0; + rcd->pionowait = 0; + } + if (rcd->flag) + rcd->flag = 0; + + if (dd->kregbase) { + /* atomically clear receive enable ctxt and intr avail. */ + dd->f_rcvctrl(rcd->ppd, QIB_RCVCTRL_CTXT_DIS | + QIB_RCVCTRL_INTRAVAIL_DIS, ctxt); + + /* clean up the pkeys for this ctxt user */ + qib_clean_part_key(rcd, dd); + qib_disarm_piobufs(dd, rcd->pio_base, rcd->piocnt); + qib_chg_pioavailkernel(dd, rcd->pio_base, + rcd->piocnt, TXCHK_CHG_TYPE_KERN, NULL); + + dd->f_clear_tids(dd, rcd); + + if (dd->pageshadow) + unlock_expected_tids(rcd); + qib_stats.sps_ctxts--; + dd->freectxts++; + } + + mutex_unlock(&qib_mutex); + qib_free_ctxtdata(dd, rcd); /* after releasing the mutex */ + +bail: + kfree(fd); + return ret; +} + +static int qib_ctxt_info(struct file *fp, struct qib_ctxt_info __user *uinfo) +{ + struct qib_ctxt_info info; + int ret; + size_t sz; + struct qib_ctxtdata *rcd = ctxt_fp(fp); + struct qib_filedata *fd; + + fd = fp->private_data; + + info.num_active = qib_count_active_units(); + info.unit = rcd->dd->unit; + info.port = rcd->ppd->port; + info.ctxt = rcd->ctxt; + info.subctxt = subctxt_fp(fp); + /* Number of user ctxts available for this device. */ + info.num_ctxts = rcd->dd->cfgctxts - rcd->dd->first_user_ctxt; + info.num_subctxts = rcd->subctxt_cnt; + info.rec_cpu = fd->rec_cpu_num; + sz = sizeof(info); + + if (copy_to_user(uinfo, &info, sz)) { + ret = -EFAULT; + goto bail; + } + ret = 0; + +bail: + return ret; +} + +static int qib_sdma_get_inflight(struct qib_user_sdma_queue *pq, + u32 __user *inflightp) +{ + const u32 val = qib_user_sdma_inflight_counter(pq); + + if (put_user(val, inflightp)) + return -EFAULT; + + return 0; +} + +static int qib_sdma_get_complete(struct qib_pportdata *ppd, + struct qib_user_sdma_queue *pq, + u32 __user *completep) +{ + u32 val; + int err; + + if (!pq) + return -EINVAL; + + err = qib_user_sdma_make_progress(ppd, pq); + if (err < 0) + return err; + + val = qib_user_sdma_complete_counter(pq); + if (put_user(val, completep)) + return -EFAULT; + + return 0; +} + +static int disarm_req_delay(struct qib_ctxtdata *rcd) +{ + int ret = 0; + + if (!usable(rcd->ppd)) { + int i; + /* + * if link is down, or otherwise not usable, delay + * the caller up to 30 seconds, so we don't thrash + * in trying to get the chip back to ACTIVE, and + * set flag so they make the call again. + */ + if (rcd->user_event_mask) { + /* + * subctxt_cnt is 0 if not shared, so do base + * separately, first, then remaining subctxt, if any + */ + set_bit(_QIB_EVENT_DISARM_BUFS_BIT, + &rcd->user_event_mask[0]); + for (i = 1; i < rcd->subctxt_cnt; i++) + set_bit(_QIB_EVENT_DISARM_BUFS_BIT, + &rcd->user_event_mask[i]); + } + for (i = 0; !usable(rcd->ppd) && i < 300; i++) + msleep(100); + ret = -ENETDOWN; + } + return ret; +} + +/* + * Find all user contexts in use, and set the specified bit in their + * event mask. + * See also find_ctxt() for a similar use, that is specific to send buffers. + */ +int qib_set_uevent_bits(struct qib_pportdata *ppd, const int evtbit) +{ + struct qib_ctxtdata *rcd; + unsigned ctxt; + int ret = 0; + unsigned long flags; + + spin_lock_irqsave(&ppd->dd->uctxt_lock, flags); + for (ctxt = ppd->dd->first_user_ctxt; ctxt < ppd->dd->cfgctxts; + ctxt++) { + rcd = ppd->dd->rcd[ctxt]; + if (!rcd) + continue; + if (rcd->user_event_mask) { + int i; + /* + * subctxt_cnt is 0 if not shared, so do base + * separately, first, then remaining subctxt, if any + */ + set_bit(evtbit, &rcd->user_event_mask[0]); + for (i = 1; i < rcd->subctxt_cnt; i++) + set_bit(evtbit, &rcd->user_event_mask[i]); + } + ret = 1; + break; + } + spin_unlock_irqrestore(&ppd->dd->uctxt_lock, flags); + + return ret; +} + +/* + * clear the event notifier events for this context. + * For the DISARM_BUFS case, we also take action (this obsoletes + * the older QIB_CMD_DISARM_BUFS, but we keep it for backwards + * compatibility. + * Other bits don't currently require actions, just atomically clear. + * User process then performs actions appropriate to bit having been + * set, if desired, and checks again in future. + */ +static int qib_user_event_ack(struct qib_ctxtdata *rcd, int subctxt, + unsigned long events) +{ + int ret = 0, i; + + for (i = 0; i <= _QIB_MAX_EVENT_BIT; i++) { + if (!test_bit(i, &events)) + continue; + if (i == _QIB_EVENT_DISARM_BUFS_BIT) { + (void)qib_disarm_piobufs_ifneeded(rcd); + ret = disarm_req_delay(rcd); + } else + clear_bit(i, &rcd->user_event_mask[subctxt]); + } + return ret; +} + +static ssize_t qib_write(struct file *fp, const char __user *data, + size_t count, loff_t *off) +{ + const struct qib_cmd __user *ucmd; + struct qib_ctxtdata *rcd; + const void __user *src; + size_t consumed, copy = 0; + struct qib_cmd cmd; + ssize_t ret = 0; + void *dest; + + if (count < sizeof(cmd.type)) { + ret = -EINVAL; + goto bail; + } + + ucmd = (const struct qib_cmd __user *) data; + + if (copy_from_user(&cmd.type, &ucmd->type, sizeof(cmd.type))) { + ret = -EFAULT; + goto bail; + } + + consumed = sizeof(cmd.type); + + switch (cmd.type) { + case QIB_CMD_ASSIGN_CTXT: + case QIB_CMD_USER_INIT: + copy = sizeof(cmd.cmd.user_info); + dest = &cmd.cmd.user_info; + src = &ucmd->cmd.user_info; + break; + + case QIB_CMD_RECV_CTRL: + copy = sizeof(cmd.cmd.recv_ctrl); + dest = &cmd.cmd.recv_ctrl; + src = &ucmd->cmd.recv_ctrl; + break; + + case QIB_CMD_CTXT_INFO: + copy = sizeof(cmd.cmd.ctxt_info); + dest = &cmd.cmd.ctxt_info; + src = &ucmd->cmd.ctxt_info; + break; + + case QIB_CMD_TID_UPDATE: + case QIB_CMD_TID_FREE: + copy = sizeof(cmd.cmd.tid_info); + dest = &cmd.cmd.tid_info; + src = &ucmd->cmd.tid_info; + break; + + case QIB_CMD_SET_PART_KEY: + copy = sizeof(cmd.cmd.part_key); + dest = &cmd.cmd.part_key; + src = &ucmd->cmd.part_key; + break; + + case QIB_CMD_DISARM_BUFS: + case QIB_CMD_PIOAVAILUPD: /* force an update of PIOAvail reg */ + copy = 0; + src = NULL; + dest = NULL; + break; + + case QIB_CMD_POLL_TYPE: + copy = sizeof(cmd.cmd.poll_type); + dest = &cmd.cmd.poll_type; + src = &ucmd->cmd.poll_type; + break; + + case QIB_CMD_ARMLAUNCH_CTRL: + copy = sizeof(cmd.cmd.armlaunch_ctrl); + dest = &cmd.cmd.armlaunch_ctrl; + src = &ucmd->cmd.armlaunch_ctrl; + break; + + case QIB_CMD_SDMA_INFLIGHT: + copy = sizeof(cmd.cmd.sdma_inflight); + dest = &cmd.cmd.sdma_inflight; + src = &ucmd->cmd.sdma_inflight; + break; + + case QIB_CMD_SDMA_COMPLETE: + copy = sizeof(cmd.cmd.sdma_complete); + dest = &cmd.cmd.sdma_complete; + src = &ucmd->cmd.sdma_complete; + break; + + case QIB_CMD_ACK_EVENT: + copy = sizeof(cmd.cmd.event_mask); + dest = &cmd.cmd.event_mask; + src = &ucmd->cmd.event_mask; + break; + + default: + ret = -EINVAL; + goto bail; + } + + if (copy) { + if ((count - consumed) < copy) { + ret = -EINVAL; + goto bail; + } + if (copy_from_user(dest, src, copy)) { + ret = -EFAULT; + goto bail; + } + consumed += copy; + } + + rcd = ctxt_fp(fp); + if (!rcd && cmd.type != QIB_CMD_ASSIGN_CTXT) { + ret = -EINVAL; + goto bail; + } + + switch (cmd.type) { + case QIB_CMD_ASSIGN_CTXT: + ret = qib_assign_ctxt(fp, &cmd.cmd.user_info); + if (ret) + goto bail; + break; + + case QIB_CMD_USER_INIT: + ret = qib_do_user_init(fp, &cmd.cmd.user_info); + if (ret) + goto bail; + ret = qib_get_base_info(fp, (void __user *) (unsigned long) + cmd.cmd.user_info.spu_base_info, + cmd.cmd.user_info.spu_base_info_size); + break; + + case QIB_CMD_RECV_CTRL: + ret = qib_manage_rcvq(rcd, subctxt_fp(fp), cmd.cmd.recv_ctrl); + break; + + case QIB_CMD_CTXT_INFO: + ret = qib_ctxt_info(fp, (struct qib_ctxt_info __user *) + (unsigned long) cmd.cmd.ctxt_info); + break; + + case QIB_CMD_TID_UPDATE: + ret = qib_tid_update(rcd, fp, &cmd.cmd.tid_info); + break; + + case QIB_CMD_TID_FREE: + ret = qib_tid_free(rcd, subctxt_fp(fp), &cmd.cmd.tid_info); + break; + + case QIB_CMD_SET_PART_KEY: + ret = qib_set_part_key(rcd, cmd.cmd.part_key); + break; + + case QIB_CMD_DISARM_BUFS: + (void)qib_disarm_piobufs_ifneeded(rcd); + ret = disarm_req_delay(rcd); + break; + + case QIB_CMD_PIOAVAILUPD: + qib_force_pio_avail_update(rcd->dd); + break; + + case QIB_CMD_POLL_TYPE: + rcd->poll_type = cmd.cmd.poll_type; + break; + + case QIB_CMD_ARMLAUNCH_CTRL: + rcd->dd->f_set_armlaunch(rcd->dd, cmd.cmd.armlaunch_ctrl); + break; + + case QIB_CMD_SDMA_INFLIGHT: + ret = qib_sdma_get_inflight(user_sdma_queue_fp(fp), + (u32 __user *) (unsigned long) + cmd.cmd.sdma_inflight); + break; + + case QIB_CMD_SDMA_COMPLETE: + ret = qib_sdma_get_complete(rcd->ppd, + user_sdma_queue_fp(fp), + (u32 __user *) (unsigned long) + cmd.cmd.sdma_complete); + break; + + case QIB_CMD_ACK_EVENT: + ret = qib_user_event_ack(rcd, subctxt_fp(fp), + cmd.cmd.event_mask); + break; + } + + if (ret >= 0) + ret = consumed; + +bail: + return ret; +} + +static ssize_t qib_aio_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long dim, loff_t off) +{ + struct qib_filedata *fp = iocb->ki_filp->private_data; + struct qib_ctxtdata *rcd = ctxt_fp(iocb->ki_filp); + struct qib_user_sdma_queue *pq = fp->pq; + + if (!dim || !pq) + return -EINVAL; + + return qib_user_sdma_writev(rcd, pq, iov, dim); +} + +static struct class *qib_class; +static dev_t qib_dev; + +int qib_cdev_init(int minor, const char *name, + const struct file_operations *fops, + struct cdev **cdevp, struct device **devp) +{ + const dev_t dev = MKDEV(MAJOR(qib_dev), minor); + struct cdev *cdev; + struct device *device = NULL; + int ret; + + cdev = cdev_alloc(); + if (!cdev) { + pr_err("Could not allocate cdev for minor %d, %s\n", + minor, name); + ret = -ENOMEM; + goto done; + } + + cdev->owner = THIS_MODULE; + cdev->ops = fops; + kobject_set_name(&cdev->kobj, name); + + ret = cdev_add(cdev, dev, 1); + if (ret < 0) { + pr_err("Could not add cdev for minor %d, %s (err %d)\n", + minor, name, -ret); + goto err_cdev; + } + + device = device_create(qib_class, NULL, dev, NULL, "%s", name); + if (!IS_ERR(device)) + goto done; + ret = PTR_ERR(device); + device = NULL; + pr_err("Could not create device for minor %d, %s (err %d)\n", + minor, name, -ret); +err_cdev: + cdev_del(cdev); + cdev = NULL; +done: + *cdevp = cdev; + *devp = device; + return ret; +} + +void qib_cdev_cleanup(struct cdev **cdevp, struct device **devp) +{ + struct device *device = *devp; + + if (device) { + device_unregister(device); + *devp = NULL; + } + + if (*cdevp) { + cdev_del(*cdevp); + *cdevp = NULL; + } +} + +static struct cdev *wildcard_cdev; +static struct device *wildcard_device; + +int __init qib_dev_init(void) +{ + int ret; + + ret = alloc_chrdev_region(&qib_dev, 0, QIB_NMINORS, QIB_DRV_NAME); + if (ret < 0) { + pr_err("Could not allocate chrdev region (err %d)\n", -ret); + goto done; + } + + qib_class = class_create(THIS_MODULE, "ipath"); + if (IS_ERR(qib_class)) { + ret = PTR_ERR(qib_class); + pr_err("Could not create device class (err %d)\n", -ret); + unregister_chrdev_region(qib_dev, QIB_NMINORS); + } + +done: + return ret; +} + +void qib_dev_cleanup(void) +{ + if (qib_class) { + class_destroy(qib_class); + qib_class = NULL; + } + + unregister_chrdev_region(qib_dev, QIB_NMINORS); +} + +static atomic_t user_count = ATOMIC_INIT(0); + +static void qib_user_remove(struct qib_devdata *dd) +{ + if (atomic_dec_return(&user_count) == 0) + qib_cdev_cleanup(&wildcard_cdev, &wildcard_device); + + qib_cdev_cleanup(&dd->user_cdev, &dd->user_device); +} + +static int qib_user_add(struct qib_devdata *dd) +{ + char name[10]; + int ret; + + if (atomic_inc_return(&user_count) == 1) { + ret = qib_cdev_init(0, "ipath", &qib_file_ops, + &wildcard_cdev, &wildcard_device); + if (ret) + goto done; + } + + snprintf(name, sizeof(name), "ipath%d", dd->unit); + ret = qib_cdev_init(dd->unit + 1, name, &qib_file_ops, + &dd->user_cdev, &dd->user_device); + if (ret) + qib_user_remove(dd); +done: + return ret; +} + +/* + * Create per-unit files in /dev + */ +int qib_device_create(struct qib_devdata *dd) +{ + int r, ret; + + r = qib_user_add(dd); + ret = qib_diag_add(dd); + if (r && !ret) + ret = r; + return ret; +} + +/* + * Remove per-unit files in /dev + * void, core kernel returns no errors for this stuff + */ +void qib_device_remove(struct qib_devdata *dd) +{ + qib_user_remove(dd); + qib_diag_remove(dd); +} diff --git a/drivers/infiniband/hw/qib/qib_fs.c b/drivers/infiniband/hw/qib/qib_fs.c new file mode 100644 index 0000000..8185458 --- /dev/null +++ b/drivers/infiniband/hw/qib/qib_fs.c @@ -0,0 +1,620 @@ +/* + * Copyright (c) 2012 Intel Corporation. All rights reserved. + * Copyright (c) 2006 - 2012 QLogic Corporation. All rights reserved. + * Copyright (c) 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include + +#include "qib.h" + +#define QIBFS_MAGIC 0x726a77 + +static struct super_block *qib_super; + +#define private2dd(file) (file_inode(file)->i_private) + +static int qibfs_mknod(struct inode *dir, struct dentry *dentry, + umode_t mode, const struct file_operations *fops, + void *data) +{ + int error; + struct inode *inode = new_inode(dir->i_sb); + + if (!inode) { + error = -EPERM; + goto bail; + } + + inode->i_ino = get_next_ino(); + inode->i_mode = mode; + inode->i_uid = GLOBAL_ROOT_UID; + inode->i_gid = GLOBAL_ROOT_GID; + inode->i_blocks = 0; + inode->i_atime = CURRENT_TIME; + inode->i_mtime = inode->i_atime; + inode->i_ctime = inode->i_atime; + inode->i_private = data; + if (S_ISDIR(mode)) { + inode->i_op = &simple_dir_inode_operations; + inc_nlink(inode); + inc_nlink(dir); + } + + inode->i_fop = fops; + + d_instantiate(dentry, inode); + error = 0; + +bail: + return error; +} + +static int create_file(const char *name, umode_t mode, + struct dentry *parent, struct dentry **dentry, + const struct file_operations *fops, void *data) +{ + int error; + + mutex_lock(&parent->d_inode->i_mutex); + *dentry = lookup_one_len(name, parent, strlen(name)); + if (!IS_ERR(*dentry)) + error = qibfs_mknod(parent->d_inode, *dentry, + mode, fops, data); + else + error = PTR_ERR(*dentry); + mutex_unlock(&parent->d_inode->i_mutex); + + return error; +} + +static ssize_t driver_stats_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + qib_stats.sps_ints = qib_sps_ints(); + return simple_read_from_buffer(buf, count, ppos, &qib_stats, + sizeof qib_stats); +} + +/* + * driver stats field names, one line per stat, single string. Used by + * programs like ipathstats to print the stats in a way which works for + * different versions of drivers, without changing program source. + * if qlogic_ib_stats changes, this needs to change. Names need to be + * 12 chars or less (w/o newline), for proper display by ipathstats utility. + */ +static const char qib_statnames[] = + "KernIntr\n" + "ErrorIntr\n" + "Tx_Errs\n" + "Rcv_Errs\n" + "H/W_Errs\n" + "NoPIOBufs\n" + "CtxtsOpen\n" + "RcvLen_Errs\n" + "EgrBufFull\n" + "EgrHdrFull\n" + ; + +static ssize_t driver_names_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + return simple_read_from_buffer(buf, count, ppos, qib_statnames, + sizeof qib_statnames - 1); /* no null */ +} + +static const struct file_operations driver_ops[] = { + { .read = driver_stats_read, .llseek = generic_file_llseek, }, + { .read = driver_names_read, .llseek = generic_file_llseek, }, +}; + +/* read the per-device counters */ +static ssize_t dev_counters_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + u64 *counters; + size_t avail; + struct qib_devdata *dd = private2dd(file); + + avail = dd->f_read_cntrs(dd, *ppos, NULL, &counters); + return simple_read_from_buffer(buf, count, ppos, counters, avail); +} + +/* read the per-device counters */ +static ssize_t dev_names_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + char *names; + size_t avail; + struct qib_devdata *dd = private2dd(file); + + avail = dd->f_read_cntrs(dd, *ppos, &names, NULL); + return simple_read_from_buffer(buf, count, ppos, names, avail); +} + +static const struct file_operations cntr_ops[] = { + { .read = dev_counters_read, .llseek = generic_file_llseek, }, + { .read = dev_names_read, .llseek = generic_file_llseek, }, +}; + +/* + * Could use file_inode(file)->i_ino to figure out which file, + * instead of separate routine for each, but for now, this works... + */ + +/* read the per-port names (same for each port) */ +static ssize_t portnames_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + char *names; + size_t avail; + struct qib_devdata *dd = private2dd(file); + + avail = dd->f_read_portcntrs(dd, *ppos, 0, &names, NULL); + return simple_read_from_buffer(buf, count, ppos, names, avail); +} + +/* read the per-port counters for port 1 (pidx 0) */ +static ssize_t portcntrs_1_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + u64 *counters; + size_t avail; + struct qib_devdata *dd = private2dd(file); + + avail = dd->f_read_portcntrs(dd, *ppos, 0, NULL, &counters); + return simple_read_from_buffer(buf, count, ppos, counters, avail); +} + +/* read the per-port counters for port 2 (pidx 1) */ +static ssize_t portcntrs_2_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + u64 *counters; + size_t avail; + struct qib_devdata *dd = private2dd(file); + + avail = dd->f_read_portcntrs(dd, *ppos, 1, NULL, &counters); + return simple_read_from_buffer(buf, count, ppos, counters, avail); +} + +static const struct file_operations portcntr_ops[] = { + { .read = portnames_read, .llseek = generic_file_llseek, }, + { .read = portcntrs_1_read, .llseek = generic_file_llseek, }, + { .read = portcntrs_2_read, .llseek = generic_file_llseek, }, +}; + +/* + * read the per-port QSFP data for port 1 (pidx 0) + */ +static ssize_t qsfp_1_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct qib_devdata *dd = private2dd(file); + char *tmp; + int ret; + + tmp = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!tmp) + return -ENOMEM; + + ret = qib_qsfp_dump(dd->pport, tmp, PAGE_SIZE); + if (ret > 0) + ret = simple_read_from_buffer(buf, count, ppos, tmp, ret); + kfree(tmp); + return ret; +} + +/* + * read the per-port QSFP data for port 2 (pidx 1) + */ +static ssize_t qsfp_2_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct qib_devdata *dd = private2dd(file); + char *tmp; + int ret; + + if (dd->num_pports < 2) + return -ENODEV; + + tmp = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!tmp) + return -ENOMEM; + + ret = qib_qsfp_dump(dd->pport + 1, tmp, PAGE_SIZE); + if (ret > 0) + ret = simple_read_from_buffer(buf, count, ppos, tmp, ret); + kfree(tmp); + return ret; +} + +static const struct file_operations qsfp_ops[] = { + { .read = qsfp_1_read, .llseek = generic_file_llseek, }, + { .read = qsfp_2_read, .llseek = generic_file_llseek, }, +}; + +static ssize_t flash_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct qib_devdata *dd; + ssize_t ret; + loff_t pos; + char *tmp; + + pos = *ppos; + + if (pos < 0) { + ret = -EINVAL; + goto bail; + } + + if (pos >= sizeof(struct qib_flash)) { + ret = 0; + goto bail; + } + + if (count > sizeof(struct qib_flash) - pos) + count = sizeof(struct qib_flash) - pos; + + tmp = kmalloc(count, GFP_KERNEL); + if (!tmp) { + ret = -ENOMEM; + goto bail; + } + + dd = private2dd(file); + if (qib_eeprom_read(dd, pos, tmp, count)) { + qib_dev_err(dd, "failed to read from flash\n"); + ret = -ENXIO; + goto bail_tmp; + } + + if (copy_to_user(buf, tmp, count)) { + ret = -EFAULT; + goto bail_tmp; + } + + *ppos = pos + count; + ret = count; + +bail_tmp: + kfree(tmp); + +bail: + return ret; +} + +static ssize_t flash_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct qib_devdata *dd; + ssize_t ret; + loff_t pos; + char *tmp; + + pos = *ppos; + + if (pos != 0) { + ret = -EINVAL; + goto bail; + } + + if (count != sizeof(struct qib_flash)) { + ret = -EINVAL; + goto bail; + } + + tmp = kmalloc(count, GFP_KERNEL); + if (!tmp) { + ret = -ENOMEM; + goto bail; + } + + if (copy_from_user(tmp, buf, count)) { + ret = -EFAULT; + goto bail_tmp; + } + + dd = private2dd(file); + if (qib_eeprom_write(dd, pos, tmp, count)) { + ret = -ENXIO; + qib_dev_err(dd, "failed to write to flash\n"); + goto bail_tmp; + } + + *ppos = pos + count; + ret = count; + +bail_tmp: + kfree(tmp); + +bail: + return ret; +} + +static const struct file_operations flash_ops = { + .read = flash_read, + .write = flash_write, + .llseek = default_llseek, +}; + +static int add_cntr_files(struct super_block *sb, struct qib_devdata *dd) +{ + struct dentry *dir, *tmp; + char unit[10]; + int ret, i; + + /* create the per-unit directory */ + snprintf(unit, sizeof unit, "%u", dd->unit); + ret = create_file(unit, S_IFDIR|S_IRUGO|S_IXUGO, sb->s_root, &dir, + &simple_dir_operations, dd); + if (ret) { + pr_err("create_file(%s) failed: %d\n", unit, ret); + goto bail; + } + + /* create the files in the new directory */ + ret = create_file("counters", S_IFREG|S_IRUGO, dir, &tmp, + &cntr_ops[0], dd); + if (ret) { + pr_err("create_file(%s/counters) failed: %d\n", + unit, ret); + goto bail; + } + ret = create_file("counter_names", S_IFREG|S_IRUGO, dir, &tmp, + &cntr_ops[1], dd); + if (ret) { + pr_err("create_file(%s/counter_names) failed: %d\n", + unit, ret); + goto bail; + } + ret = create_file("portcounter_names", S_IFREG|S_IRUGO, dir, &tmp, + &portcntr_ops[0], dd); + if (ret) { + pr_err("create_file(%s/%s) failed: %d\n", + unit, "portcounter_names", ret); + goto bail; + } + for (i = 1; i <= dd->num_pports; i++) { + char fname[24]; + + sprintf(fname, "port%dcounters", i); + /* create the files in the new directory */ + ret = create_file(fname, S_IFREG|S_IRUGO, dir, &tmp, + &portcntr_ops[i], dd); + if (ret) { + pr_err("create_file(%s/%s) failed: %d\n", + unit, fname, ret); + goto bail; + } + if (!(dd->flags & QIB_HAS_QSFP)) + continue; + sprintf(fname, "qsfp%d", i); + ret = create_file(fname, S_IFREG|S_IRUGO, dir, &tmp, + &qsfp_ops[i - 1], dd); + if (ret) { + pr_err("create_file(%s/%s) failed: %d\n", + unit, fname, ret); + goto bail; + } + } + + ret = create_file("flash", S_IFREG|S_IWUSR|S_IRUGO, dir, &tmp, + &flash_ops, dd); + if (ret) + pr_err("create_file(%s/flash) failed: %d\n", + unit, ret); +bail: + return ret; +} + +static int remove_file(struct dentry *parent, char *name) +{ + struct dentry *tmp; + int ret; + + tmp = lookup_one_len(name, parent, strlen(name)); + + if (IS_ERR(tmp)) { + ret = PTR_ERR(tmp); + goto bail; + } + + spin_lock(&tmp->d_lock); + if (!(d_unhashed(tmp) && tmp->d_inode)) { + __d_drop(tmp); + spin_unlock(&tmp->d_lock); + simple_unlink(parent->d_inode, tmp); + } else { + spin_unlock(&tmp->d_lock); + } + dput(tmp); + + ret = 0; +bail: + /* + * We don't expect clients to care about the return value, but + * it's there if they need it. + */ + return ret; +} + +static int remove_device_files(struct super_block *sb, + struct qib_devdata *dd) +{ + struct dentry *dir, *root; + char unit[10]; + int ret, i; + + root = dget(sb->s_root); + mutex_lock(&root->d_inode->i_mutex); + snprintf(unit, sizeof unit, "%u", dd->unit); + dir = lookup_one_len(unit, root, strlen(unit)); + + if (IS_ERR(dir)) { + ret = PTR_ERR(dir); + pr_err("Lookup of %s failed\n", unit); + goto bail; + } + + mutex_lock(&dir->d_inode->i_mutex); + remove_file(dir, "counters"); + remove_file(dir, "counter_names"); + remove_file(dir, "portcounter_names"); + for (i = 0; i < dd->num_pports; i++) { + char fname[24]; + + sprintf(fname, "port%dcounters", i + 1); + remove_file(dir, fname); + if (dd->flags & QIB_HAS_QSFP) { + sprintf(fname, "qsfp%d", i + 1); + remove_file(dir, fname); + } + } + remove_file(dir, "flash"); + mutex_unlock(&dir->d_inode->i_mutex); + ret = simple_rmdir(root->d_inode, dir); + d_delete(dir); + dput(dir); + +bail: + mutex_unlock(&root->d_inode->i_mutex); + dput(root); + return ret; +} + +/* + * This fills everything in when the fs is mounted, to handle umount/mount + * after device init. The direct add_cntr_files() call handles adding + * them from the init code, when the fs is already mounted. + */ +static int qibfs_fill_super(struct super_block *sb, void *data, int silent) +{ + struct qib_devdata *dd, *tmp; + unsigned long flags; + int ret; + + static struct tree_descr files[] = { + [2] = {"driver_stats", &driver_ops[0], S_IRUGO}, + [3] = {"driver_stats_names", &driver_ops[1], S_IRUGO}, + {""}, + }; + + ret = simple_fill_super(sb, QIBFS_MAGIC, files); + if (ret) { + pr_err("simple_fill_super failed: %d\n", ret); + goto bail; + } + + spin_lock_irqsave(&qib_devs_lock, flags); + + list_for_each_entry_safe(dd, tmp, &qib_dev_list, list) { + spin_unlock_irqrestore(&qib_devs_lock, flags); + ret = add_cntr_files(sb, dd); + if (ret) + goto bail; + spin_lock_irqsave(&qib_devs_lock, flags); + } + + spin_unlock_irqrestore(&qib_devs_lock, flags); + +bail: + return ret; +} + +static struct dentry *qibfs_mount(struct file_system_type *fs_type, int flags, + const char *dev_name, void *data) +{ + struct dentry *ret; + ret = mount_single(fs_type, flags, data, qibfs_fill_super); + if (!IS_ERR(ret)) + qib_super = ret->d_sb; + return ret; +} + +static void qibfs_kill_super(struct super_block *s) +{ + kill_litter_super(s); + qib_super = NULL; +} + +int qibfs_add(struct qib_devdata *dd) +{ + int ret; + + /* + * On first unit initialized, qib_super will not yet exist + * because nobody has yet tried to mount the filesystem, so + * we can't consider that to be an error; if an error occurs + * during the mount, that will get a complaint, so this is OK. + * add_cntr_files() for all units is done at mount from + * qibfs_fill_super(), so one way or another, everything works. + */ + if (qib_super == NULL) + ret = 0; + else + ret = add_cntr_files(qib_super, dd); + return ret; +} + +int qibfs_remove(struct qib_devdata *dd) +{ + int ret = 0; + + if (qib_super) + ret = remove_device_files(qib_super, dd); + + return ret; +} + +static struct file_system_type qibfs_fs_type = { + .owner = THIS_MODULE, + .name = "ipathfs", + .mount = qibfs_mount, + .kill_sb = qibfs_kill_super, +}; +MODULE_ALIAS_FS("ipathfs"); + +int __init qib_init_qibfs(void) +{ + return register_filesystem(&qibfs_fs_type); +} + +int __exit qib_exit_qibfs(void) +{ + return unregister_filesystem(&qibfs_fs_type); +} diff --git a/drivers/infiniband/hw/qib/qib_iba6120.c b/drivers/infiniband/hw/qib/qib_iba6120.c new file mode 100644 index 0000000..d68266a --- /dev/null +++ b/drivers/infiniband/hw/qib/qib_iba6120.c @@ -0,0 +1,3599 @@ +/* + * Copyright (c) 2013 Intel Corporation. All rights reserved. + * Copyright (c) 2006, 2007, 2008, 2009, 2010 QLogic Corporation. + * All rights reserved. + * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +/* + * This file contains all of the code that is specific to the + * QLogic_IB 6120 PCIe chip. + */ + +#include +#include +#include +#include + +#include "qib.h" +#include "qib_6120_regs.h" + +static void qib_6120_setup_setextled(struct qib_pportdata *, u32); +static void sendctrl_6120_mod(struct qib_pportdata *ppd, u32 op); +static u8 qib_6120_phys_portstate(u64); +static u32 qib_6120_iblink_state(u64); + +/* + * This file contains all the chip-specific register information and + * access functions for the Intel Intel_IB PCI-Express chip. + * + */ + +/* KREG_IDX uses machine-generated #defines */ +#define KREG_IDX(regname) (QIB_6120_##regname##_OFFS / sizeof(u64)) + +/* Use defines to tie machine-generated names to lower-case names */ +#define kr_extctrl KREG_IDX(EXTCtrl) +#define kr_extstatus KREG_IDX(EXTStatus) +#define kr_gpio_clear KREG_IDX(GPIOClear) +#define kr_gpio_mask KREG_IDX(GPIOMask) +#define kr_gpio_out KREG_IDX(GPIOOut) +#define kr_gpio_status KREG_IDX(GPIOStatus) +#define kr_rcvctrl KREG_IDX(RcvCtrl) +#define kr_sendctrl KREG_IDX(SendCtrl) +#define kr_partitionkey KREG_IDX(RcvPartitionKey) +#define kr_hwdiagctrl KREG_IDX(HwDiagCtrl) +#define kr_ibcstatus KREG_IDX(IBCStatus) +#define kr_ibcctrl KREG_IDX(IBCCtrl) +#define kr_sendbuffererror KREG_IDX(SendBufErr0) +#define kr_rcvbthqp KREG_IDX(RcvBTHQP) +#define kr_counterregbase KREG_IDX(CntrRegBase) +#define kr_palign KREG_IDX(PageAlign) +#define kr_rcvegrbase KREG_IDX(RcvEgrBase) +#define kr_rcvegrcnt KREG_IDX(RcvEgrCnt) +#define kr_rcvhdrcnt KREG_IDX(RcvHdrCnt) +#define kr_rcvhdrentsize KREG_IDX(RcvHdrEntSize) +#define kr_rcvhdrsize KREG_IDX(RcvHdrSize) +#define kr_rcvtidbase KREG_IDX(RcvTIDBase) +#define kr_rcvtidcnt KREG_IDX(RcvTIDCnt) +#define kr_scratch KREG_IDX(Scratch) +#define kr_sendctrl KREG_IDX(SendCtrl) +#define kr_sendpioavailaddr KREG_IDX(SendPIOAvailAddr) +#define kr_sendpiobufbase KREG_IDX(SendPIOBufBase) +#define kr_sendpiobufcnt KREG_IDX(SendPIOBufCnt) +#define kr_sendpiosize KREG_IDX(SendPIOSize) +#define kr_sendregbase KREG_IDX(SendRegBase) +#define kr_userregbase KREG_IDX(UserRegBase) +#define kr_control KREG_IDX(Control) +#define kr_intclear KREG_IDX(IntClear) +#define kr_intmask KREG_IDX(IntMask) +#define kr_intstatus KREG_IDX(IntStatus) +#define kr_errclear KREG_IDX(ErrClear) +#define kr_errmask KREG_IDX(ErrMask) +#define kr_errstatus KREG_IDX(ErrStatus) +#define kr_hwerrclear KREG_IDX(HwErrClear) +#define kr_hwerrmask KREG_IDX(HwErrMask) +#define kr_hwerrstatus KREG_IDX(HwErrStatus) +#define kr_revision KREG_IDX(Revision) +#define kr_portcnt KREG_IDX(PortCnt) +#define kr_serdes_cfg0 KREG_IDX(SerdesCfg0) +#define kr_serdes_cfg1 (kr_serdes_cfg0 + 1) +#define kr_serdes_stat KREG_IDX(SerdesStat) +#define kr_xgxs_cfg KREG_IDX(XGXSCfg) + +/* These must only be written via qib_write_kreg_ctxt() */ +#define kr_rcvhdraddr KREG_IDX(RcvHdrAddr0) +#define kr_rcvhdrtailaddr KREG_IDX(RcvHdrTailAddr0) + +#define CREG_IDX(regname) ((QIB_6120_##regname##_OFFS - \ + QIB_6120_LBIntCnt_OFFS) / sizeof(u64)) + +#define cr_badformat CREG_IDX(RxBadFormatCnt) +#define cr_erricrc CREG_IDX(RxICRCErrCnt) +#define cr_errlink CREG_IDX(RxLinkProblemCnt) +#define cr_errlpcrc CREG_IDX(RxLPCRCErrCnt) +#define cr_errpkey CREG_IDX(RxPKeyMismatchCnt) +#define cr_rcvflowctrl_err CREG_IDX(RxFlowCtrlErrCnt) +#define cr_err_rlen CREG_IDX(RxLenErrCnt) +#define cr_errslen CREG_IDX(TxLenErrCnt) +#define cr_errtidfull CREG_IDX(RxTIDFullErrCnt) +#define cr_errtidvalid CREG_IDX(RxTIDValidErrCnt) +#define cr_errvcrc CREG_IDX(RxVCRCErrCnt) +#define cr_ibstatuschange CREG_IDX(IBStatusChangeCnt) +#define cr_lbint CREG_IDX(LBIntCnt) +#define cr_invalidrlen CREG_IDX(RxMaxMinLenErrCnt) +#define cr_invalidslen CREG_IDX(TxMaxMinLenErrCnt) +#define cr_lbflowstall CREG_IDX(LBFlowStallCnt) +#define cr_pktrcv CREG_IDX(RxDataPktCnt) +#define cr_pktrcvflowctrl CREG_IDX(RxFlowPktCnt) +#define cr_pktsend CREG_IDX(TxDataPktCnt) +#define cr_pktsendflow CREG_IDX(TxFlowPktCnt) +#define cr_portovfl CREG_IDX(RxP0HdrEgrOvflCnt) +#define cr_rcvebp CREG_IDX(RxEBPCnt) +#define cr_rcvovfl CREG_IDX(RxBufOvflCnt) +#define cr_senddropped CREG_IDX(TxDroppedPktCnt) +#define cr_sendstall CREG_IDX(TxFlowStallCnt) +#define cr_sendunderrun CREG_IDX(TxUnderrunCnt) +#define cr_wordrcv CREG_IDX(RxDwordCnt) +#define cr_wordsend CREG_IDX(TxDwordCnt) +#define cr_txunsupvl CREG_IDX(TxUnsupVLErrCnt) +#define cr_rxdroppkt CREG_IDX(RxDroppedPktCnt) +#define cr_iblinkerrrecov CREG_IDX(IBLinkErrRecoveryCnt) +#define cr_iblinkdown CREG_IDX(IBLinkDownedCnt) +#define cr_ibsymbolerr CREG_IDX(IBSymbolErrCnt) + +#define SYM_RMASK(regname, fldname) ((u64) \ + QIB_6120_##regname##_##fldname##_RMASK) +#define SYM_MASK(regname, fldname) ((u64) \ + QIB_6120_##regname##_##fldname##_RMASK << \ + QIB_6120_##regname##_##fldname##_LSB) +#define SYM_LSB(regname, fldname) (QIB_6120_##regname##_##fldname##_LSB) + +#define SYM_FIELD(value, regname, fldname) ((u64) \ + (((value) >> SYM_LSB(regname, fldname)) & \ + SYM_RMASK(regname, fldname))) +#define ERR_MASK(fldname) SYM_MASK(ErrMask, fldname##Mask) +#define HWE_MASK(fldname) SYM_MASK(HwErrMask, fldname##Mask) + +/* link training states, from IBC */ +#define IB_6120_LT_STATE_DISABLED 0x00 +#define IB_6120_LT_STATE_LINKUP 0x01 +#define IB_6120_LT_STATE_POLLACTIVE 0x02 +#define IB_6120_LT_STATE_POLLQUIET 0x03 +#define IB_6120_LT_STATE_SLEEPDELAY 0x04 +#define IB_6120_LT_STATE_SLEEPQUIET 0x05 +#define IB_6120_LT_STATE_CFGDEBOUNCE 0x08 +#define IB_6120_LT_STATE_CFGRCVFCFG 0x09 +#define IB_6120_LT_STATE_CFGWAITRMT 0x0a +#define IB_6120_LT_STATE_CFGIDLE 0x0b +#define IB_6120_LT_STATE_RECOVERRETRAIN 0x0c +#define IB_6120_LT_STATE_RECOVERWAITRMT 0x0e +#define IB_6120_LT_STATE_RECOVERIDLE 0x0f + +/* link state machine states from IBC */ +#define IB_6120_L_STATE_DOWN 0x0 +#define IB_6120_L_STATE_INIT 0x1 +#define IB_6120_L_STATE_ARM 0x2 +#define IB_6120_L_STATE_ACTIVE 0x3 +#define IB_6120_L_STATE_ACT_DEFER 0x4 + +static const u8 qib_6120_physportstate[0x20] = { + [IB_6120_LT_STATE_DISABLED] = IB_PHYSPORTSTATE_DISABLED, + [IB_6120_LT_STATE_LINKUP] = IB_PHYSPORTSTATE_LINKUP, + [IB_6120_LT_STATE_POLLACTIVE] = IB_PHYSPORTSTATE_POLL, + [IB_6120_LT_STATE_POLLQUIET] = IB_PHYSPORTSTATE_POLL, + [IB_6120_LT_STATE_SLEEPDELAY] = IB_PHYSPORTSTATE_SLEEP, + [IB_6120_LT_STATE_SLEEPQUIET] = IB_PHYSPORTSTATE_SLEEP, + [IB_6120_LT_STATE_CFGDEBOUNCE] = + IB_PHYSPORTSTATE_CFG_TRAIN, + [IB_6120_LT_STATE_CFGRCVFCFG] = + IB_PHYSPORTSTATE_CFG_TRAIN, + [IB_6120_LT_STATE_CFGWAITRMT] = + IB_PHYSPORTSTATE_CFG_TRAIN, + [IB_6120_LT_STATE_CFGIDLE] = IB_PHYSPORTSTATE_CFG_TRAIN, + [IB_6120_LT_STATE_RECOVERRETRAIN] = + IB_PHYSPORTSTATE_LINK_ERR_RECOVER, + [IB_6120_LT_STATE_RECOVERWAITRMT] = + IB_PHYSPORTSTATE_LINK_ERR_RECOVER, + [IB_6120_LT_STATE_RECOVERIDLE] = + IB_PHYSPORTSTATE_LINK_ERR_RECOVER, + [0x10] = IB_PHYSPORTSTATE_CFG_TRAIN, + [0x11] = IB_PHYSPORTSTATE_CFG_TRAIN, + [0x12] = IB_PHYSPORTSTATE_CFG_TRAIN, + [0x13] = IB_PHYSPORTSTATE_CFG_TRAIN, + [0x14] = IB_PHYSPORTSTATE_CFG_TRAIN, + [0x15] = IB_PHYSPORTSTATE_CFG_TRAIN, + [0x16] = IB_PHYSPORTSTATE_CFG_TRAIN, + [0x17] = IB_PHYSPORTSTATE_CFG_TRAIN +}; + + +struct qib_chip_specific { + u64 __iomem *cregbase; + u64 *cntrs; + u64 *portcntrs; + void *dummy_hdrq; /* used after ctxt close */ + dma_addr_t dummy_hdrq_phys; + spinlock_t kernel_tid_lock; /* no back to back kernel TID writes */ + spinlock_t user_tid_lock; /* no back to back user TID writes */ + spinlock_t rcvmod_lock; /* protect rcvctrl shadow changes */ + spinlock_t gpio_lock; /* RMW of shadows/regs for ExtCtrl and GPIO */ + u64 hwerrmask; + u64 errormask; + u64 gpio_out; /* shadow of kr_gpio_out, for rmw ops */ + u64 gpio_mask; /* shadow the gpio mask register */ + u64 extctrl; /* shadow the gpio output enable, etc... */ + /* + * these 5 fields are used to establish deltas for IB symbol + * errors and linkrecovery errors. They can be reported on + * some chips during link negotiation prior to INIT, and with + * DDR when faking DDR negotiations with non-IBTA switches. + * The chip counters are adjusted at driver unload if there is + * a non-zero delta. + */ + u64 ibdeltainprog; + u64 ibsymdelta; + u64 ibsymsnap; + u64 iblnkerrdelta; + u64 iblnkerrsnap; + u64 ibcctrl; /* shadow for kr_ibcctrl */ + u32 lastlinkrecov; /* link recovery issue */ + int irq; + u32 cntrnamelen; + u32 portcntrnamelen; + u32 ncntrs; + u32 nportcntrs; + /* used with gpio interrupts to implement IB counters */ + u32 rxfc_unsupvl_errs; + u32 overrun_thresh_errs; + /* + * these count only cases where _successive_ LocalLinkIntegrity + * errors were seen in the receive headers of IB standard packets + */ + u32 lli_errs; + u32 lli_counter; + u64 lli_thresh; + u64 sword; /* total dwords sent (sample result) */ + u64 rword; /* total dwords received (sample result) */ + u64 spkts; /* total packets sent (sample result) */ + u64 rpkts; /* total packets received (sample result) */ + u64 xmit_wait; /* # of ticks no data sent (sample result) */ + struct timer_list pma_timer; + char emsgbuf[128]; + char bitsmsgbuf[64]; + u8 pma_sample_status; +}; + +/* ibcctrl bits */ +#define QLOGIC_IB_IBCC_LINKINITCMD_DISABLE 1 +/* cycle through TS1/TS2 till OK */ +#define QLOGIC_IB_IBCC_LINKINITCMD_POLL 2 +/* wait for TS1, then go on */ +#define QLOGIC_IB_IBCC_LINKINITCMD_SLEEP 3 +#define QLOGIC_IB_IBCC_LINKINITCMD_SHIFT 16 + +#define QLOGIC_IB_IBCC_LINKCMD_DOWN 1 /* move to 0x11 */ +#define QLOGIC_IB_IBCC_LINKCMD_ARMED 2 /* move to 0x21 */ +#define QLOGIC_IB_IBCC_LINKCMD_ACTIVE 3 /* move to 0x31 */ +#define QLOGIC_IB_IBCC_LINKCMD_SHIFT 18 + +/* + * We could have a single register get/put routine, that takes a group type, + * but this is somewhat clearer and cleaner. It also gives us some error + * checking. 64 bit register reads should always work, but are inefficient + * on opteron (the northbridge always generates 2 separate HT 32 bit reads), + * so we use kreg32 wherever possible. User register and counter register + * reads are always 32 bit reads, so only one form of those routines. + */ + +/** + * qib_read_ureg32 - read 32-bit virtualized per-context register + * @dd: device + * @regno: register number + * @ctxt: context number + * + * Return the contents of a register that is virtualized to be per context. + * Returns -1 on errors (not distinguishable from valid contents at + * runtime; we may add a separate error variable at some point). + */ +static inline u32 qib_read_ureg32(const struct qib_devdata *dd, + enum qib_ureg regno, int ctxt) +{ + if (!dd->kregbase || !(dd->flags & QIB_PRESENT)) + return 0; + + if (dd->userbase) + return readl(regno + (u64 __iomem *) + ((char __iomem *)dd->userbase + + dd->ureg_align * ctxt)); + else + return readl(regno + (u64 __iomem *) + (dd->uregbase + + (char __iomem *)dd->kregbase + + dd->ureg_align * ctxt)); +} + +/** + * qib_write_ureg - write 32-bit virtualized per-context register + * @dd: device + * @regno: register number + * @value: value + * @ctxt: context + * + * Write the contents of a register that is virtualized to be per context. + */ +static inline void qib_write_ureg(const struct qib_devdata *dd, + enum qib_ureg regno, u64 value, int ctxt) +{ + u64 __iomem *ubase; + if (dd->userbase) + ubase = (u64 __iomem *) + ((char __iomem *) dd->userbase + + dd->ureg_align * ctxt); + else + ubase = (u64 __iomem *) + (dd->uregbase + + (char __iomem *) dd->kregbase + + dd->ureg_align * ctxt); + + if (dd->kregbase && (dd->flags & QIB_PRESENT)) + writeq(value, &ubase[regno]); +} + +static inline u32 qib_read_kreg32(const struct qib_devdata *dd, + const u16 regno) +{ + if (!dd->kregbase || !(dd->flags & QIB_PRESENT)) + return -1; + return readl((u32 __iomem *)&dd->kregbase[regno]); +} + +static inline u64 qib_read_kreg64(const struct qib_devdata *dd, + const u16 regno) +{ + if (!dd->kregbase || !(dd->flags & QIB_PRESENT)) + return -1; + + return readq(&dd->kregbase[regno]); +} + +static inline void qib_write_kreg(const struct qib_devdata *dd, + const u16 regno, u64 value) +{ + if (dd->kregbase && (dd->flags & QIB_PRESENT)) + writeq(value, &dd->kregbase[regno]); +} + +/** + * qib_write_kreg_ctxt - write a device's per-ctxt 64-bit kernel register + * @dd: the qlogic_ib device + * @regno: the register number to write + * @ctxt: the context containing the register + * @value: the value to write + */ +static inline void qib_write_kreg_ctxt(const struct qib_devdata *dd, + const u16 regno, unsigned ctxt, + u64 value) +{ + qib_write_kreg(dd, regno + ctxt, value); +} + +static inline void write_6120_creg(const struct qib_devdata *dd, + u16 regno, u64 value) +{ + if (dd->cspec->cregbase && (dd->flags & QIB_PRESENT)) + writeq(value, &dd->cspec->cregbase[regno]); +} + +static inline u64 read_6120_creg(const struct qib_devdata *dd, u16 regno) +{ + if (!dd->cspec->cregbase || !(dd->flags & QIB_PRESENT)) + return 0; + return readq(&dd->cspec->cregbase[regno]); +} + +static inline u32 read_6120_creg32(const struct qib_devdata *dd, u16 regno) +{ + if (!dd->cspec->cregbase || !(dd->flags & QIB_PRESENT)) + return 0; + return readl(&dd->cspec->cregbase[regno]); +} + +/* kr_control bits */ +#define QLOGIC_IB_C_RESET 1U + +/* kr_intstatus, kr_intclear, kr_intmask bits */ +#define QLOGIC_IB_I_RCVURG_MASK ((1U << 5) - 1) +#define QLOGIC_IB_I_RCVURG_SHIFT 0 +#define QLOGIC_IB_I_RCVAVAIL_MASK ((1U << 5) - 1) +#define QLOGIC_IB_I_RCVAVAIL_SHIFT 12 + +#define QLOGIC_IB_C_FREEZEMODE 0x00000002 +#define QLOGIC_IB_C_LINKENABLE 0x00000004 +#define QLOGIC_IB_I_ERROR 0x0000000080000000ULL +#define QLOGIC_IB_I_SPIOSENT 0x0000000040000000ULL +#define QLOGIC_IB_I_SPIOBUFAVAIL 0x0000000020000000ULL +#define QLOGIC_IB_I_GPIO 0x0000000010000000ULL +#define QLOGIC_IB_I_BITSEXTANT \ + ((QLOGIC_IB_I_RCVURG_MASK << QLOGIC_IB_I_RCVURG_SHIFT) | \ + (QLOGIC_IB_I_RCVAVAIL_MASK << \ + QLOGIC_IB_I_RCVAVAIL_SHIFT) | \ + QLOGIC_IB_I_ERROR | QLOGIC_IB_I_SPIOSENT | \ + QLOGIC_IB_I_SPIOBUFAVAIL | QLOGIC_IB_I_GPIO) + +/* kr_hwerrclear, kr_hwerrmask, kr_hwerrstatus, bits */ +#define QLOGIC_IB_HWE_PCIEMEMPARITYERR_MASK 0x000000000000003fULL +#define QLOGIC_IB_HWE_PCIEMEMPARITYERR_SHIFT 0 +#define QLOGIC_IB_HWE_PCIEPOISONEDTLP 0x0000000010000000ULL +#define QLOGIC_IB_HWE_PCIECPLTIMEOUT 0x0000000020000000ULL +#define QLOGIC_IB_HWE_PCIEBUSPARITYXTLH 0x0000000040000000ULL +#define QLOGIC_IB_HWE_PCIEBUSPARITYXADM 0x0000000080000000ULL +#define QLOGIC_IB_HWE_PCIEBUSPARITYRADM 0x0000000100000000ULL +#define QLOGIC_IB_HWE_COREPLL_FBSLIP 0x0080000000000000ULL +#define QLOGIC_IB_HWE_COREPLL_RFSLIP 0x0100000000000000ULL +#define QLOGIC_IB_HWE_PCIE1PLLFAILED 0x0400000000000000ULL +#define QLOGIC_IB_HWE_PCIE0PLLFAILED 0x0800000000000000ULL +#define QLOGIC_IB_HWE_SERDESPLLFAILED 0x1000000000000000ULL + + +/* kr_extstatus bits */ +#define QLOGIC_IB_EXTS_FREQSEL 0x2 +#define QLOGIC_IB_EXTS_SERDESSEL 0x4 +#define QLOGIC_IB_EXTS_MEMBIST_ENDTEST 0x0000000000004000 +#define QLOGIC_IB_EXTS_MEMBIST_FOUND 0x0000000000008000 + +/* kr_xgxsconfig bits */ +#define QLOGIC_IB_XGXS_RESET 0x5ULL + +#define _QIB_GPIO_SDA_NUM 1 +#define _QIB_GPIO_SCL_NUM 0 + +/* Bits in GPIO for the added IB link interrupts */ +#define GPIO_RXUVL_BIT 3 +#define GPIO_OVRUN_BIT 4 +#define GPIO_LLI_BIT 5 +#define GPIO_ERRINTR_MASK 0x38 + + +#define QLOGIC_IB_RT_BUFSIZE_MASK 0xe0000000ULL +#define QLOGIC_IB_RT_BUFSIZE_SHIFTVAL(tid) \ + ((((tid) & QLOGIC_IB_RT_BUFSIZE_MASK) >> 29) + 11 - 1) +#define QLOGIC_IB_RT_BUFSIZE(tid) (1 << QLOGIC_IB_RT_BUFSIZE_SHIFTVAL(tid)) +#define QLOGIC_IB_RT_IS_VALID(tid) \ + (((tid) & QLOGIC_IB_RT_BUFSIZE_MASK) && \ + ((((tid) & QLOGIC_IB_RT_BUFSIZE_MASK) != QLOGIC_IB_RT_BUFSIZE_MASK))) +#define QLOGIC_IB_RT_ADDR_MASK 0x1FFFFFFFULL /* 29 bits valid */ +#define QLOGIC_IB_RT_ADDR_SHIFT 10 + +#define QLOGIC_IB_R_INTRAVAIL_SHIFT 16 +#define QLOGIC_IB_R_TAILUPD_SHIFT 31 +#define IBA6120_R_PKEY_DIS_SHIFT 30 + +#define PBC_6120_VL15_SEND_CTRL (1ULL << 31) /* pbc; VL15; link_buf only */ + +#define IBCBUSFRSPCPARITYERR HWE_MASK(IBCBusFromSPCParityErr) +#define IBCBUSTOSPCPARITYERR HWE_MASK(IBCBusToSPCParityErr) + +#define SYM_MASK_BIT(regname, fldname, bit) ((u64) \ + ((1ULL << (SYM_LSB(regname, fldname) + (bit))))) + +#define TXEMEMPARITYERR_PIOBUF \ + SYM_MASK_BIT(HwErrMask, TXEMemParityErrMask, 0) +#define TXEMEMPARITYERR_PIOPBC \ + SYM_MASK_BIT(HwErrMask, TXEMemParityErrMask, 1) +#define TXEMEMPARITYERR_PIOLAUNCHFIFO \ + SYM_MASK_BIT(HwErrMask, TXEMemParityErrMask, 2) + +#define RXEMEMPARITYERR_RCVBUF \ + SYM_MASK_BIT(HwErrMask, RXEMemParityErrMask, 0) +#define RXEMEMPARITYERR_LOOKUPQ \ + SYM_MASK_BIT(HwErrMask, RXEMemParityErrMask, 1) +#define RXEMEMPARITYERR_EXPTID \ + SYM_MASK_BIT(HwErrMask, RXEMemParityErrMask, 2) +#define RXEMEMPARITYERR_EAGERTID \ + SYM_MASK_BIT(HwErrMask, RXEMemParityErrMask, 3) +#define RXEMEMPARITYERR_FLAGBUF \ + SYM_MASK_BIT(HwErrMask, RXEMemParityErrMask, 4) +#define RXEMEMPARITYERR_DATAINFO \ + SYM_MASK_BIT(HwErrMask, RXEMemParityErrMask, 5) +#define RXEMEMPARITYERR_HDRINFO \ + SYM_MASK_BIT(HwErrMask, RXEMemParityErrMask, 6) + +/* 6120 specific hardware errors... */ +static const struct qib_hwerror_msgs qib_6120_hwerror_msgs[] = { + /* generic hardware errors */ + QLOGIC_IB_HWE_MSG(IBCBUSFRSPCPARITYERR, "QIB2IB Parity"), + QLOGIC_IB_HWE_MSG(IBCBUSTOSPCPARITYERR, "IB2QIB Parity"), + + QLOGIC_IB_HWE_MSG(TXEMEMPARITYERR_PIOBUF, + "TXE PIOBUF Memory Parity"), + QLOGIC_IB_HWE_MSG(TXEMEMPARITYERR_PIOPBC, + "TXE PIOPBC Memory Parity"), + QLOGIC_IB_HWE_MSG(TXEMEMPARITYERR_PIOLAUNCHFIFO, + "TXE PIOLAUNCHFIFO Memory Parity"), + + QLOGIC_IB_HWE_MSG(RXEMEMPARITYERR_RCVBUF, + "RXE RCVBUF Memory Parity"), + QLOGIC_IB_HWE_MSG(RXEMEMPARITYERR_LOOKUPQ, + "RXE LOOKUPQ Memory Parity"), + QLOGIC_IB_HWE_MSG(RXEMEMPARITYERR_EAGERTID, + "RXE EAGERTID Memory Parity"), + QLOGIC_IB_HWE_MSG(RXEMEMPARITYERR_EXPTID, + "RXE EXPTID Memory Parity"), + QLOGIC_IB_HWE_MSG(RXEMEMPARITYERR_FLAGBUF, + "RXE FLAGBUF Memory Parity"), + QLOGIC_IB_HWE_MSG(RXEMEMPARITYERR_DATAINFO, + "RXE DATAINFO Memory Parity"), + QLOGIC_IB_HWE_MSG(RXEMEMPARITYERR_HDRINFO, + "RXE HDRINFO Memory Parity"), + + /* chip-specific hardware errors */ + QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_PCIEPOISONEDTLP, + "PCIe Poisoned TLP"), + QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_PCIECPLTIMEOUT, + "PCIe completion timeout"), + /* + * In practice, it's unlikely wthat we'll see PCIe PLL, or bus + * parity or memory parity error failures, because most likely we + * won't be able to talk to the core of the chip. Nonetheless, we + * might see them, if they are in parts of the PCIe core that aren't + * essential. + */ + QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_PCIE1PLLFAILED, + "PCIePLL1"), + QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_PCIE0PLLFAILED, + "PCIePLL0"), + QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_PCIEBUSPARITYXTLH, + "PCIe XTLH core parity"), + QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_PCIEBUSPARITYXADM, + "PCIe ADM TX core parity"), + QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_PCIEBUSPARITYRADM, + "PCIe ADM RX core parity"), + QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_SERDESPLLFAILED, + "SerDes PLL"), +}; + +#define TXE_PIO_PARITY (TXEMEMPARITYERR_PIOBUF | TXEMEMPARITYERR_PIOPBC) +#define _QIB_PLL_FAIL (QLOGIC_IB_HWE_COREPLL_FBSLIP | \ + QLOGIC_IB_HWE_COREPLL_RFSLIP) + + /* variables for sanity checking interrupt and errors */ +#define IB_HWE_BITSEXTANT \ + (HWE_MASK(RXEMemParityErr) | \ + HWE_MASK(TXEMemParityErr) | \ + (QLOGIC_IB_HWE_PCIEMEMPARITYERR_MASK << \ + QLOGIC_IB_HWE_PCIEMEMPARITYERR_SHIFT) | \ + QLOGIC_IB_HWE_PCIE1PLLFAILED | \ + QLOGIC_IB_HWE_PCIE0PLLFAILED | \ + QLOGIC_IB_HWE_PCIEPOISONEDTLP | \ + QLOGIC_IB_HWE_PCIECPLTIMEOUT | \ + QLOGIC_IB_HWE_PCIEBUSPARITYXTLH | \ + QLOGIC_IB_HWE_PCIEBUSPARITYXADM | \ + QLOGIC_IB_HWE_PCIEBUSPARITYRADM | \ + HWE_MASK(PowerOnBISTFailed) | \ + QLOGIC_IB_HWE_COREPLL_FBSLIP | \ + QLOGIC_IB_HWE_COREPLL_RFSLIP | \ + QLOGIC_IB_HWE_SERDESPLLFAILED | \ + HWE_MASK(IBCBusToSPCParityErr) | \ + HWE_MASK(IBCBusFromSPCParityErr)) + +#define IB_E_BITSEXTANT \ + (ERR_MASK(RcvFormatErr) | ERR_MASK(RcvVCRCErr) | \ + ERR_MASK(RcvICRCErr) | ERR_MASK(RcvMinPktLenErr) | \ + ERR_MASK(RcvMaxPktLenErr) | ERR_MASK(RcvLongPktLenErr) | \ + ERR_MASK(RcvShortPktLenErr) | ERR_MASK(RcvUnexpectedCharErr) | \ + ERR_MASK(RcvUnsupportedVLErr) | ERR_MASK(RcvEBPErr) | \ + ERR_MASK(RcvIBFlowErr) | ERR_MASK(RcvBadVersionErr) | \ + ERR_MASK(RcvEgrFullErr) | ERR_MASK(RcvHdrFullErr) | \ + ERR_MASK(RcvBadTidErr) | ERR_MASK(RcvHdrLenErr) | \ + ERR_MASK(RcvHdrErr) | ERR_MASK(RcvIBLostLinkErr) | \ + ERR_MASK(SendMinPktLenErr) | ERR_MASK(SendMaxPktLenErr) | \ + ERR_MASK(SendUnderRunErr) | ERR_MASK(SendPktLenErr) | \ + ERR_MASK(SendDroppedSmpPktErr) | \ + ERR_MASK(SendDroppedDataPktErr) | \ + ERR_MASK(SendPioArmLaunchErr) | \ + ERR_MASK(SendUnexpectedPktNumErr) | \ + ERR_MASK(SendUnsupportedVLErr) | ERR_MASK(IBStatusChanged) | \ + ERR_MASK(InvalidAddrErr) | ERR_MASK(ResetNegated) | \ + ERR_MASK(HardwareErr)) + +#define QLOGIC_IB_E_PKTERRS ( \ + ERR_MASK(SendPktLenErr) | \ + ERR_MASK(SendDroppedDataPktErr) | \ + ERR_MASK(RcvVCRCErr) | \ + ERR_MASK(RcvICRCErr) | \ + ERR_MASK(RcvShortPktLenErr) | \ + ERR_MASK(RcvEBPErr)) + +/* These are all rcv-related errors which we want to count for stats */ +#define E_SUM_PKTERRS \ + (ERR_MASK(RcvHdrLenErr) | ERR_MASK(RcvBadTidErr) | \ + ERR_MASK(RcvBadVersionErr) | ERR_MASK(RcvHdrErr) | \ + ERR_MASK(RcvLongPktLenErr) | ERR_MASK(RcvShortPktLenErr) | \ + ERR_MASK(RcvMaxPktLenErr) | ERR_MASK(RcvMinPktLenErr) | \ + ERR_MASK(RcvFormatErr) | ERR_MASK(RcvUnsupportedVLErr) | \ + ERR_MASK(RcvUnexpectedCharErr) | ERR_MASK(RcvEBPErr)) + +/* These are all send-related errors which we want to count for stats */ +#define E_SUM_ERRS \ + (ERR_MASK(SendPioArmLaunchErr) | \ + ERR_MASK(SendUnexpectedPktNumErr) | \ + ERR_MASK(SendDroppedDataPktErr) | \ + ERR_MASK(SendDroppedSmpPktErr) | \ + ERR_MASK(SendMaxPktLenErr) | ERR_MASK(SendUnsupportedVLErr) | \ + ERR_MASK(SendMinPktLenErr) | ERR_MASK(SendPktLenErr) | \ + ERR_MASK(InvalidAddrErr)) + +/* + * this is similar to E_SUM_ERRS, but can't ignore armlaunch, don't ignore + * errors not related to freeze and cancelling buffers. Can't ignore + * armlaunch because could get more while still cleaning up, and need + * to cancel those as they happen. + */ +#define E_SPKT_ERRS_IGNORE \ + (ERR_MASK(SendDroppedDataPktErr) | \ + ERR_MASK(SendDroppedSmpPktErr) | \ + ERR_MASK(SendMaxPktLenErr) | ERR_MASK(SendMinPktLenErr) | \ + ERR_MASK(SendPktLenErr)) + +/* + * these are errors that can occur when the link changes state while + * a packet is being sent or received. This doesn't cover things + * like EBP or VCRC that can be the result of a sending having the + * link change state, so we receive a "known bad" packet. + */ +#define E_SUM_LINK_PKTERRS \ + (ERR_MASK(SendDroppedDataPktErr) | \ + ERR_MASK(SendDroppedSmpPktErr) | \ + ERR_MASK(SendMinPktLenErr) | ERR_MASK(SendPktLenErr) | \ + ERR_MASK(RcvShortPktLenErr) | ERR_MASK(RcvMinPktLenErr) | \ + ERR_MASK(RcvUnexpectedCharErr)) + +static void qib_6120_put_tid_2(struct qib_devdata *, u64 __iomem *, + u32, unsigned long); + +/* + * On platforms using this chip, and not having ordered WC stores, we + * can get TXE parity errors due to speculative reads to the PIO buffers, + * and this, due to a chip issue can result in (many) false parity error + * reports. So it's a debug print on those, and an info print on systems + * where the speculative reads don't occur. + */ +static void qib_6120_txe_recover(struct qib_devdata *dd) +{ + if (!qib_unordered_wc()) + qib_devinfo(dd->pcidev, + "Recovering from TXE PIO parity error\n"); +} + +/* enable/disable chip from delivering interrupts */ +static void qib_6120_set_intr_state(struct qib_devdata *dd, u32 enable) +{ + if (enable) { + if (dd->flags & QIB_BADINTR) + return; + qib_write_kreg(dd, kr_intmask, ~0ULL); + /* force re-interrupt of any pending interrupts. */ + qib_write_kreg(dd, kr_intclear, 0ULL); + } else + qib_write_kreg(dd, kr_intmask, 0ULL); +} + +/* + * Try to cleanup as much as possible for anything that might have gone + * wrong while in freeze mode, such as pio buffers being written by user + * processes (causing armlaunch), send errors due to going into freeze mode, + * etc., and try to avoid causing extra interrupts while doing so. + * Forcibly update the in-memory pioavail register copies after cleanup + * because the chip won't do it while in freeze mode (the register values + * themselves are kept correct). + * Make sure that we don't lose any important interrupts by using the chip + * feature that says that writing 0 to a bit in *clear that is set in + * *status will cause an interrupt to be generated again (if allowed by + * the *mask value). + * This is in chip-specific code because of all of the register accesses, + * even though the details are similar on most chips + */ +static void qib_6120_clear_freeze(struct qib_devdata *dd) +{ + /* disable error interrupts, to avoid confusion */ + qib_write_kreg(dd, kr_errmask, 0ULL); + + /* also disable interrupts; errormask is sometimes overwriten */ + qib_6120_set_intr_state(dd, 0); + + qib_cancel_sends(dd->pport); + + /* clear the freeze, and be sure chip saw it */ + qib_write_kreg(dd, kr_control, dd->control); + qib_read_kreg32(dd, kr_scratch); + + /* force in-memory update now we are out of freeze */ + qib_force_pio_avail_update(dd); + + /* + * force new interrupt if any hwerr, error or interrupt bits are + * still set, and clear "safe" send packet errors related to freeze + * and cancelling sends. Re-enable error interrupts before possible + * force of re-interrupt on pending interrupts. + */ + qib_write_kreg(dd, kr_hwerrclear, 0ULL); + qib_write_kreg(dd, kr_errclear, E_SPKT_ERRS_IGNORE); + qib_write_kreg(dd, kr_errmask, dd->cspec->errormask); + qib_6120_set_intr_state(dd, 1); +} + +/** + * qib_handle_6120_hwerrors - display hardware errors. + * @dd: the qlogic_ib device + * @msg: the output buffer + * @msgl: the size of the output buffer + * + * Use same msg buffer as regular errors to avoid excessive stack + * use. Most hardware errors are catastrophic, but for right now, + * we'll print them and continue. Reuse the same message buffer as + * handle_6120_errors() to avoid excessive stack usage. + */ +static void qib_handle_6120_hwerrors(struct qib_devdata *dd, char *msg, + size_t msgl) +{ + u64 hwerrs; + u32 bits, ctrl; + int isfatal = 0; + char *bitsmsg; + int log_idx; + + hwerrs = qib_read_kreg64(dd, kr_hwerrstatus); + if (!hwerrs) + return; + if (hwerrs == ~0ULL) { + qib_dev_err(dd, + "Read of hardware error status failed (all bits set); ignoring\n"); + return; + } + qib_stats.sps_hwerrs++; + + /* Always clear the error status register, except MEMBISTFAIL, + * regardless of whether we continue or stop using the chip. + * We want that set so we know it failed, even across driver reload. + * We'll still ignore it in the hwerrmask. We do this partly for + * diagnostics, but also for support */ + qib_write_kreg(dd, kr_hwerrclear, + hwerrs & ~HWE_MASK(PowerOnBISTFailed)); + + hwerrs &= dd->cspec->hwerrmask; + + /* We log some errors to EEPROM, check if we have any of those. */ + for (log_idx = 0; log_idx < QIB_EEP_LOG_CNT; ++log_idx) + if (hwerrs & dd->eep_st_masks[log_idx].hwerrs_to_log) + qib_inc_eeprom_err(dd, log_idx, 1); + + /* + * Make sure we get this much out, unless told to be quiet, + * or it's occurred within the last 5 seconds. + */ + if (hwerrs & ~(TXE_PIO_PARITY | RXEMEMPARITYERR_EAGERTID)) + qib_devinfo(dd->pcidev, + "Hardware error: hwerr=0x%llx (cleared)\n", + (unsigned long long) hwerrs); + + if (hwerrs & ~IB_HWE_BITSEXTANT) + qib_dev_err(dd, + "hwerror interrupt with unknown errors %llx set\n", + (unsigned long long)(hwerrs & ~IB_HWE_BITSEXTANT)); + + ctrl = qib_read_kreg32(dd, kr_control); + if ((ctrl & QLOGIC_IB_C_FREEZEMODE) && !dd->diag_client) { + /* + * Parity errors in send memory are recoverable, + * just cancel the send (if indicated in * sendbuffererror), + * count the occurrence, unfreeze (if no other handled + * hardware error bits are set), and continue. They can + * occur if a processor speculative read is done to the PIO + * buffer while we are sending a packet, for example. + */ + if (hwerrs & TXE_PIO_PARITY) { + qib_6120_txe_recover(dd); + hwerrs &= ~TXE_PIO_PARITY; + } + + if (!hwerrs) { + static u32 freeze_cnt; + + freeze_cnt++; + qib_6120_clear_freeze(dd); + } else + isfatal = 1; + } + + *msg = '\0'; + + if (hwerrs & HWE_MASK(PowerOnBISTFailed)) { + isfatal = 1; + strlcat(msg, + "[Memory BIST test failed, InfiniPath hardware unusable]", + msgl); + /* ignore from now on, so disable until driver reloaded */ + dd->cspec->hwerrmask &= ~HWE_MASK(PowerOnBISTFailed); + qib_write_kreg(dd, kr_hwerrmask, dd->cspec->hwerrmask); + } + + qib_format_hwerrors(hwerrs, qib_6120_hwerror_msgs, + ARRAY_SIZE(qib_6120_hwerror_msgs), msg, msgl); + + bitsmsg = dd->cspec->bitsmsgbuf; + if (hwerrs & (QLOGIC_IB_HWE_PCIEMEMPARITYERR_MASK << + QLOGIC_IB_HWE_PCIEMEMPARITYERR_SHIFT)) { + bits = (u32) ((hwerrs >> + QLOGIC_IB_HWE_PCIEMEMPARITYERR_SHIFT) & + QLOGIC_IB_HWE_PCIEMEMPARITYERR_MASK); + snprintf(bitsmsg, sizeof dd->cspec->bitsmsgbuf, + "[PCIe Mem Parity Errs %x] ", bits); + strlcat(msg, bitsmsg, msgl); + } + + if (hwerrs & _QIB_PLL_FAIL) { + isfatal = 1; + snprintf(bitsmsg, sizeof dd->cspec->bitsmsgbuf, + "[PLL failed (%llx), InfiniPath hardware unusable]", + (unsigned long long) hwerrs & _QIB_PLL_FAIL); + strlcat(msg, bitsmsg, msgl); + /* ignore from now on, so disable until driver reloaded */ + dd->cspec->hwerrmask &= ~(hwerrs & _QIB_PLL_FAIL); + qib_write_kreg(dd, kr_hwerrmask, dd->cspec->hwerrmask); + } + + if (hwerrs & QLOGIC_IB_HWE_SERDESPLLFAILED) { + /* + * If it occurs, it is left masked since the external + * interface is unused + */ + dd->cspec->hwerrmask &= ~QLOGIC_IB_HWE_SERDESPLLFAILED; + qib_write_kreg(dd, kr_hwerrmask, dd->cspec->hwerrmask); + } + + if (hwerrs) + /* + * if any set that we aren't ignoring; only + * make the complaint once, in case it's stuck + * or recurring, and we get here multiple + * times. + */ + qib_dev_err(dd, "%s hardware error\n", msg); + else + *msg = 0; /* recovered from all of them */ + + if (isfatal && !dd->diag_client) { + qib_dev_err(dd, + "Fatal Hardware Error, no longer usable, SN %.16s\n", + dd->serial); + /* + * for /sys status file and user programs to print; if no + * trailing brace is copied, we'll know it was truncated. + */ + if (dd->freezemsg) + snprintf(dd->freezemsg, dd->freezelen, + "{%s}", msg); + qib_disable_after_error(dd); + } +} + +/* + * Decode the error status into strings, deciding whether to always + * print * it or not depending on "normal packet errors" vs everything + * else. Return 1 if "real" errors, otherwise 0 if only packet + * errors, so caller can decide what to print with the string. + */ +static int qib_decode_6120_err(struct qib_devdata *dd, char *buf, size_t blen, + u64 err) +{ + int iserr = 1; + + *buf = '\0'; + if (err & QLOGIC_IB_E_PKTERRS) { + if (!(err & ~QLOGIC_IB_E_PKTERRS)) + iserr = 0; + if ((err & ERR_MASK(RcvICRCErr)) && + !(err&(ERR_MASK(RcvVCRCErr)|ERR_MASK(RcvEBPErr)))) + strlcat(buf, "CRC ", blen); + if (!iserr) + goto done; + } + if (err & ERR_MASK(RcvHdrLenErr)) + strlcat(buf, "rhdrlen ", blen); + if (err & ERR_MASK(RcvBadTidErr)) + strlcat(buf, "rbadtid ", blen); + if (err & ERR_MASK(RcvBadVersionErr)) + strlcat(buf, "rbadversion ", blen); + if (err & ERR_MASK(RcvHdrErr)) + strlcat(buf, "rhdr ", blen); + if (err & ERR_MASK(RcvLongPktLenErr)) + strlcat(buf, "rlongpktlen ", blen); + if (err & ERR_MASK(RcvMaxPktLenErr)) + strlcat(buf, "rmaxpktlen ", blen); + if (err & ERR_MASK(RcvMinPktLenErr)) + strlcat(buf, "rminpktlen ", blen); + if (err & ERR_MASK(SendMinPktLenErr)) + strlcat(buf, "sminpktlen ", blen); + if (err & ERR_MASK(RcvFormatErr)) + strlcat(buf, "rformaterr ", blen); + if (err & ERR_MASK(RcvUnsupportedVLErr)) + strlcat(buf, "runsupvl ", blen); + if (err & ERR_MASK(RcvUnexpectedCharErr)) + strlcat(buf, "runexpchar ", blen); + if (err & ERR_MASK(RcvIBFlowErr)) + strlcat(buf, "ribflow ", blen); + if (err & ERR_MASK(SendUnderRunErr)) + strlcat(buf, "sunderrun ", blen); + if (err & ERR_MASK(SendPioArmLaunchErr)) + strlcat(buf, "spioarmlaunch ", blen); + if (err & ERR_MASK(SendUnexpectedPktNumErr)) + strlcat(buf, "sunexperrpktnum ", blen); + if (err & ERR_MASK(SendDroppedSmpPktErr)) + strlcat(buf, "sdroppedsmppkt ", blen); + if (err & ERR_MASK(SendMaxPktLenErr)) + strlcat(buf, "smaxpktlen ", blen); + if (err & ERR_MASK(SendUnsupportedVLErr)) + strlcat(buf, "sunsupVL ", blen); + if (err & ERR_MASK(InvalidAddrErr)) + strlcat(buf, "invalidaddr ", blen); + if (err & ERR_MASK(RcvEgrFullErr)) + strlcat(buf, "rcvegrfull ", blen); + if (err & ERR_MASK(RcvHdrFullErr)) + strlcat(buf, "rcvhdrfull ", blen); + if (err & ERR_MASK(IBStatusChanged)) + strlcat(buf, "ibcstatuschg ", blen); + if (err & ERR_MASK(RcvIBLostLinkErr)) + strlcat(buf, "riblostlink ", blen); + if (err & ERR_MASK(HardwareErr)) + strlcat(buf, "hardware ", blen); + if (err & ERR_MASK(ResetNegated)) + strlcat(buf, "reset ", blen); +done: + return iserr; +} + +/* + * Called when we might have an error that is specific to a particular + * PIO buffer, and may need to cancel that buffer, so it can be re-used. + */ +static void qib_disarm_6120_senderrbufs(struct qib_pportdata *ppd) +{ + unsigned long sbuf[2]; + struct qib_devdata *dd = ppd->dd; + + /* + * It's possible that sendbuffererror could have bits set; might + * have already done this as a result of hardware error handling. + */ + sbuf[0] = qib_read_kreg64(dd, kr_sendbuffererror); + sbuf[1] = qib_read_kreg64(dd, kr_sendbuffererror + 1); + + if (sbuf[0] || sbuf[1]) + qib_disarm_piobufs_set(dd, sbuf, + dd->piobcnt2k + dd->piobcnt4k); +} + +static int chk_6120_linkrecovery(struct qib_devdata *dd, u64 ibcs) +{ + int ret = 1; + u32 ibstate = qib_6120_iblink_state(ibcs); + u32 linkrecov = read_6120_creg32(dd, cr_iblinkerrrecov); + + if (linkrecov != dd->cspec->lastlinkrecov) { + /* and no more until active again */ + dd->cspec->lastlinkrecov = 0; + qib_set_linkstate(dd->pport, QIB_IB_LINKDOWN); + ret = 0; + } + if (ibstate == IB_PORT_ACTIVE) + dd->cspec->lastlinkrecov = + read_6120_creg32(dd, cr_iblinkerrrecov); + return ret; +} + +static void handle_6120_errors(struct qib_devdata *dd, u64 errs) +{ + char *msg; + u64 ignore_this_time = 0; + u64 iserr = 0; + int log_idx; + struct qib_pportdata *ppd = dd->pport; + u64 mask; + + /* don't report errors that are masked */ + errs &= dd->cspec->errormask; + msg = dd->cspec->emsgbuf; + + /* do these first, they are most important */ + if (errs & ERR_MASK(HardwareErr)) + qib_handle_6120_hwerrors(dd, msg, sizeof dd->cspec->emsgbuf); + else + for (log_idx = 0; log_idx < QIB_EEP_LOG_CNT; ++log_idx) + if (errs & dd->eep_st_masks[log_idx].errs_to_log) + qib_inc_eeprom_err(dd, log_idx, 1); + + if (errs & ~IB_E_BITSEXTANT) + qib_dev_err(dd, + "error interrupt with unknown errors %llx set\n", + (unsigned long long) (errs & ~IB_E_BITSEXTANT)); + + if (errs & E_SUM_ERRS) { + qib_disarm_6120_senderrbufs(ppd); + if ((errs & E_SUM_LINK_PKTERRS) && + !(ppd->lflags & QIBL_LINKACTIVE)) { + /* + * This can happen when trying to bring the link + * up, but the IB link changes state at the "wrong" + * time. The IB logic then complains that the packet + * isn't valid. We don't want to confuse people, so + * we just don't print them, except at debug + */ + ignore_this_time = errs & E_SUM_LINK_PKTERRS; + } + } else if ((errs & E_SUM_LINK_PKTERRS) && + !(ppd->lflags & QIBL_LINKACTIVE)) { + /* + * This can happen when SMA is trying to bring the link + * up, but the IB link changes state at the "wrong" time. + * The IB logic then complains that the packet isn't + * valid. We don't want to confuse people, so we just + * don't print them, except at debug + */ + ignore_this_time = errs & E_SUM_LINK_PKTERRS; + } + + qib_write_kreg(dd, kr_errclear, errs); + + errs &= ~ignore_this_time; + if (!errs) + goto done; + + /* + * The ones we mask off are handled specially below + * or above. + */ + mask = ERR_MASK(IBStatusChanged) | ERR_MASK(RcvEgrFullErr) | + ERR_MASK(RcvHdrFullErr) | ERR_MASK(HardwareErr); + qib_decode_6120_err(dd, msg, sizeof dd->cspec->emsgbuf, errs & ~mask); + + if (errs & E_SUM_PKTERRS) + qib_stats.sps_rcverrs++; + if (errs & E_SUM_ERRS) + qib_stats.sps_txerrs++; + + iserr = errs & ~(E_SUM_PKTERRS | QLOGIC_IB_E_PKTERRS); + + if (errs & ERR_MASK(IBStatusChanged)) { + u64 ibcs = qib_read_kreg64(dd, kr_ibcstatus); + u32 ibstate = qib_6120_iblink_state(ibcs); + int handle = 1; + + if (ibstate != IB_PORT_INIT && dd->cspec->lastlinkrecov) + handle = chk_6120_linkrecovery(dd, ibcs); + /* + * Since going into a recovery state causes the link state + * to go down and since recovery is transitory, it is better + * if we "miss" ever seeing the link training state go into + * recovery (i.e., ignore this transition for link state + * special handling purposes) without updating lastibcstat. + */ + if (handle && qib_6120_phys_portstate(ibcs) == + IB_PHYSPORTSTATE_LINK_ERR_RECOVER) + handle = 0; + if (handle) + qib_handle_e_ibstatuschanged(ppd, ibcs); + } + + if (errs & ERR_MASK(ResetNegated)) { + qib_dev_err(dd, + "Got reset, requires re-init (unload and reload driver)\n"); + dd->flags &= ~QIB_INITTED; /* needs re-init */ + /* mark as having had error */ + *dd->devstatusp |= QIB_STATUS_HWERROR; + *dd->pport->statusp &= ~QIB_STATUS_IB_CONF; + } + + if (*msg && iserr) + qib_dev_porterr(dd, ppd->port, "%s error\n", msg); + + if (ppd->state_wanted & ppd->lflags) + wake_up_interruptible(&ppd->state_wait); + + /* + * If there were hdrq or egrfull errors, wake up any processes + * waiting in poll. We used to try to check which contexts had + * the overflow, but given the cost of that and the chip reads + * to support it, it's better to just wake everybody up if we + * get an overflow; waiters can poll again if it's not them. + */ + if (errs & (ERR_MASK(RcvEgrFullErr) | ERR_MASK(RcvHdrFullErr))) { + qib_handle_urcv(dd, ~0U); + if (errs & ERR_MASK(RcvEgrFullErr)) + qib_stats.sps_buffull++; + else + qib_stats.sps_hdrfull++; + } +done: + return; +} + +/** + * qib_6120_init_hwerrors - enable hardware errors + * @dd: the qlogic_ib device + * + * now that we have finished initializing everything that might reasonably + * cause a hardware error, and cleared those errors bits as they occur, + * we can enable hardware errors in the mask (potentially enabling + * freeze mode), and enable hardware errors as errors (along with + * everything else) in errormask + */ +static void qib_6120_init_hwerrors(struct qib_devdata *dd) +{ + u64 val; + u64 extsval; + + extsval = qib_read_kreg64(dd, kr_extstatus); + + if (!(extsval & QLOGIC_IB_EXTS_MEMBIST_ENDTEST)) + qib_dev_err(dd, "MemBIST did not complete!\n"); + + /* init so all hwerrors interrupt, and enter freeze, ajdust below */ + val = ~0ULL; + if (dd->minrev < 2) { + /* + * Avoid problem with internal interface bus parity + * checking. Fixed in Rev2. + */ + val &= ~QLOGIC_IB_HWE_PCIEBUSPARITYRADM; + } + /* avoid some intel cpu's speculative read freeze mode issue */ + val &= ~TXEMEMPARITYERR_PIOBUF; + + dd->cspec->hwerrmask = val; + + qib_write_kreg(dd, kr_hwerrclear, ~HWE_MASK(PowerOnBISTFailed)); + qib_write_kreg(dd, kr_hwerrmask, dd->cspec->hwerrmask); + + /* clear all */ + qib_write_kreg(dd, kr_errclear, ~0ULL); + /* enable errors that are masked, at least this first time. */ + qib_write_kreg(dd, kr_errmask, ~0ULL); + dd->cspec->errormask = qib_read_kreg64(dd, kr_errmask); + /* clear any interrupts up to this point (ints still not enabled) */ + qib_write_kreg(dd, kr_intclear, ~0ULL); + + qib_write_kreg(dd, kr_rcvbthqp, + dd->qpn_mask << (QIB_6120_RcvBTHQP_BTHQP_Mask_LSB - 1) | + QIB_KD_QP); +} + +/* + * Disable and enable the armlaunch error. Used for PIO bandwidth testing + * on chips that are count-based, rather than trigger-based. There is no + * reference counting, but that's also fine, given the intended use. + * Only chip-specific because it's all register accesses + */ +static void qib_set_6120_armlaunch(struct qib_devdata *dd, u32 enable) +{ + if (enable) { + qib_write_kreg(dd, kr_errclear, + ERR_MASK(SendPioArmLaunchErr)); + dd->cspec->errormask |= ERR_MASK(SendPioArmLaunchErr); + } else + dd->cspec->errormask &= ~ERR_MASK(SendPioArmLaunchErr); + qib_write_kreg(dd, kr_errmask, dd->cspec->errormask); +} + +/* + * Formerly took parameter in pre-shifted, + * pre-merged form with LinkCmd and LinkInitCmd + * together, and assuming the zero was NOP. + */ +static void qib_set_ib_6120_lstate(struct qib_pportdata *ppd, u16 linkcmd, + u16 linitcmd) +{ + u64 mod_wd; + struct qib_devdata *dd = ppd->dd; + unsigned long flags; + + if (linitcmd == QLOGIC_IB_IBCC_LINKINITCMD_DISABLE) { + /* + * If we are told to disable, note that so link-recovery + * code does not attempt to bring us back up. + */ + spin_lock_irqsave(&ppd->lflags_lock, flags); + ppd->lflags |= QIBL_IB_LINK_DISABLED; + spin_unlock_irqrestore(&ppd->lflags_lock, flags); + } else if (linitcmd || linkcmd == QLOGIC_IB_IBCC_LINKCMD_DOWN) { + /* + * Any other linkinitcmd will lead to LINKDOWN and then + * to INIT (if all is well), so clear flag to let + * link-recovery code attempt to bring us back up. + */ + spin_lock_irqsave(&ppd->lflags_lock, flags); + ppd->lflags &= ~QIBL_IB_LINK_DISABLED; + spin_unlock_irqrestore(&ppd->lflags_lock, flags); + } + + mod_wd = (linkcmd << QLOGIC_IB_IBCC_LINKCMD_SHIFT) | + (linitcmd << QLOGIC_IB_IBCC_LINKINITCMD_SHIFT); + + qib_write_kreg(dd, kr_ibcctrl, dd->cspec->ibcctrl | mod_wd); + /* write to chip to prevent back-to-back writes of control reg */ + qib_write_kreg(dd, kr_scratch, 0); +} + +/** + * qib_6120_bringup_serdes - bring up the serdes + * @dd: the qlogic_ib device + */ +static int qib_6120_bringup_serdes(struct qib_pportdata *ppd) +{ + struct qib_devdata *dd = ppd->dd; + u64 val, config1, prev_val, hwstat, ibc; + + /* Put IBC in reset, sends disabled */ + dd->control &= ~QLOGIC_IB_C_LINKENABLE; + qib_write_kreg(dd, kr_control, 0ULL); + + dd->cspec->ibdeltainprog = 1; + dd->cspec->ibsymsnap = read_6120_creg32(dd, cr_ibsymbolerr); + dd->cspec->iblnkerrsnap = read_6120_creg32(dd, cr_iblinkerrrecov); + + /* flowcontrolwatermark is in units of KBytes */ + ibc = 0x5ULL << SYM_LSB(IBCCtrl, FlowCtrlWaterMark); + /* + * How often flowctrl sent. More or less in usecs; balance against + * watermark value, so that in theory senders always get a flow + * control update in time to not let the IB link go idle. + */ + ibc |= 0x3ULL << SYM_LSB(IBCCtrl, FlowCtrlPeriod); + /* max error tolerance */ + dd->cspec->lli_thresh = 0xf; + ibc |= (u64) dd->cspec->lli_thresh << SYM_LSB(IBCCtrl, PhyerrThreshold); + /* use "real" buffer space for */ + ibc |= 4ULL << SYM_LSB(IBCCtrl, CreditScale); + /* IB credit flow control. */ + ibc |= 0xfULL << SYM_LSB(IBCCtrl, OverrunThreshold); + /* + * set initial max size pkt IBC will send, including ICRC; it's the + * PIO buffer size in dwords, less 1; also see qib_set_mtu() + */ + ibc |= ((u64)(ppd->ibmaxlen >> 2) + 1) << SYM_LSB(IBCCtrl, MaxPktLen); + dd->cspec->ibcctrl = ibc; /* without linkcmd or linkinitcmd! */ + + /* initially come up waiting for TS1, without sending anything. */ + val = dd->cspec->ibcctrl | (QLOGIC_IB_IBCC_LINKINITCMD_DISABLE << + QLOGIC_IB_IBCC_LINKINITCMD_SHIFT); + qib_write_kreg(dd, kr_ibcctrl, val); + + val = qib_read_kreg64(dd, kr_serdes_cfg0); + config1 = qib_read_kreg64(dd, kr_serdes_cfg1); + + /* + * Force reset on, also set rxdetect enable. Must do before reading + * serdesstatus at least for simulation, or some of the bits in + * serdes status will come back as undefined and cause simulation + * failures + */ + val |= SYM_MASK(SerdesCfg0, ResetPLL) | + SYM_MASK(SerdesCfg0, RxDetEnX) | + (SYM_MASK(SerdesCfg0, L1PwrDnA) | + SYM_MASK(SerdesCfg0, L1PwrDnB) | + SYM_MASK(SerdesCfg0, L1PwrDnC) | + SYM_MASK(SerdesCfg0, L1PwrDnD)); + qib_write_kreg(dd, kr_serdes_cfg0, val); + /* be sure chip saw it */ + qib_read_kreg64(dd, kr_scratch); + udelay(5); /* need pll reset set at least for a bit */ + /* + * after PLL is reset, set the per-lane Resets and TxIdle and + * clear the PLL reset and rxdetect (to get falling edge). + * Leave L1PWR bits set (permanently) + */ + val &= ~(SYM_MASK(SerdesCfg0, RxDetEnX) | + SYM_MASK(SerdesCfg0, ResetPLL) | + (SYM_MASK(SerdesCfg0, L1PwrDnA) | + SYM_MASK(SerdesCfg0, L1PwrDnB) | + SYM_MASK(SerdesCfg0, L1PwrDnC) | + SYM_MASK(SerdesCfg0, L1PwrDnD))); + val |= (SYM_MASK(SerdesCfg0, ResetA) | + SYM_MASK(SerdesCfg0, ResetB) | + SYM_MASK(SerdesCfg0, ResetC) | + SYM_MASK(SerdesCfg0, ResetD)) | + SYM_MASK(SerdesCfg0, TxIdeEnX); + qib_write_kreg(dd, kr_serdes_cfg0, val); + /* be sure chip saw it */ + (void) qib_read_kreg64(dd, kr_scratch); + /* need PLL reset clear for at least 11 usec before lane + * resets cleared; give it a few more to be sure */ + udelay(15); + val &= ~((SYM_MASK(SerdesCfg0, ResetA) | + SYM_MASK(SerdesCfg0, ResetB) | + SYM_MASK(SerdesCfg0, ResetC) | + SYM_MASK(SerdesCfg0, ResetD)) | + SYM_MASK(SerdesCfg0, TxIdeEnX)); + + qib_write_kreg(dd, kr_serdes_cfg0, val); + /* be sure chip saw it */ + (void) qib_read_kreg64(dd, kr_scratch); + + val = qib_read_kreg64(dd, kr_xgxs_cfg); + prev_val = val; + if (val & QLOGIC_IB_XGXS_RESET) + val &= ~QLOGIC_IB_XGXS_RESET; + if (SYM_FIELD(val, XGXSCfg, polarity_inv) != ppd->rx_pol_inv) { + /* need to compensate for Tx inversion in partner */ + val &= ~SYM_MASK(XGXSCfg, polarity_inv); + val |= (u64)ppd->rx_pol_inv << SYM_LSB(XGXSCfg, polarity_inv); + } + if (val != prev_val) + qib_write_kreg(dd, kr_xgxs_cfg, val); + + val = qib_read_kreg64(dd, kr_serdes_cfg0); + + /* clear current and de-emphasis bits */ + config1 &= ~0x0ffffffff00ULL; + /* set current to 20ma */ + config1 |= 0x00000000000ULL; + /* set de-emphasis to -5.68dB */ + config1 |= 0x0cccc000000ULL; + qib_write_kreg(dd, kr_serdes_cfg1, config1); + + /* base and port guid same for single port */ + ppd->guid = dd->base_guid; + + /* + * the process of setting and un-resetting the serdes normally + * causes a serdes PLL error, so check for that and clear it + * here. Also clearr hwerr bit in errstatus, but not others. + */ + hwstat = qib_read_kreg64(dd, kr_hwerrstatus); + if (hwstat) { + /* should just have PLL, clear all set, in an case */ + qib_write_kreg(dd, kr_hwerrclear, hwstat); + qib_write_kreg(dd, kr_errclear, ERR_MASK(HardwareErr)); + } + + dd->control |= QLOGIC_IB_C_LINKENABLE; + dd->control &= ~QLOGIC_IB_C_FREEZEMODE; + qib_write_kreg(dd, kr_control, dd->control); + + return 0; +} + +/** + * qib_6120_quiet_serdes - set serdes to txidle + * @ppd: physical port of the qlogic_ib device + * Called when driver is being unloaded + */ +static void qib_6120_quiet_serdes(struct qib_pportdata *ppd) +{ + struct qib_devdata *dd = ppd->dd; + u64 val; + + qib_set_ib_6120_lstate(ppd, 0, QLOGIC_IB_IBCC_LINKINITCMD_DISABLE); + + /* disable IBC */ + dd->control &= ~QLOGIC_IB_C_LINKENABLE; + qib_write_kreg(dd, kr_control, + dd->control | QLOGIC_IB_C_FREEZEMODE); + + if (dd->cspec->ibsymdelta || dd->cspec->iblnkerrdelta || + dd->cspec->ibdeltainprog) { + u64 diagc; + + /* enable counter writes */ + diagc = qib_read_kreg64(dd, kr_hwdiagctrl); + qib_write_kreg(dd, kr_hwdiagctrl, + diagc | SYM_MASK(HwDiagCtrl, CounterWrEnable)); + + if (dd->cspec->ibsymdelta || dd->cspec->ibdeltainprog) { + val = read_6120_creg32(dd, cr_ibsymbolerr); + if (dd->cspec->ibdeltainprog) + val -= val - dd->cspec->ibsymsnap; + val -= dd->cspec->ibsymdelta; + write_6120_creg(dd, cr_ibsymbolerr, val); + } + if (dd->cspec->iblnkerrdelta || dd->cspec->ibdeltainprog) { + val = read_6120_creg32(dd, cr_iblinkerrrecov); + if (dd->cspec->ibdeltainprog) + val -= val - dd->cspec->iblnkerrsnap; + val -= dd->cspec->iblnkerrdelta; + write_6120_creg(dd, cr_iblinkerrrecov, val); + } + + /* and disable counter writes */ + qib_write_kreg(dd, kr_hwdiagctrl, diagc); + } + + val = qib_read_kreg64(dd, kr_serdes_cfg0); + val |= SYM_MASK(SerdesCfg0, TxIdeEnX); + qib_write_kreg(dd, kr_serdes_cfg0, val); +} + +/** + * qib_6120_setup_setextled - set the state of the two external LEDs + * @dd: the qlogic_ib device + * @on: whether the link is up or not + * + * The exact combo of LEDs if on is true is determined by looking + * at the ibcstatus. + + * These LEDs indicate the physical and logical state of IB link. + * For this chip (at least with recommended board pinouts), LED1 + * is Yellow (logical state) and LED2 is Green (physical state), + * + * Note: We try to match the Mellanox HCA LED behavior as best + * we can. Green indicates physical link state is OK (something is + * plugged in, and we can train). + * Amber indicates the link is logically up (ACTIVE). + * Mellanox further blinks the amber LED to indicate data packet + * activity, but we have no hardware support for that, so it would + * require waking up every 10-20 msecs and checking the counters + * on the chip, and then turning the LED off if appropriate. That's + * visible overhead, so not something we will do. + * + */ +static void qib_6120_setup_setextled(struct qib_pportdata *ppd, u32 on) +{ + u64 extctl, val, lst, ltst; + unsigned long flags; + struct qib_devdata *dd = ppd->dd; + + /* + * The diags use the LED to indicate diag info, so we leave + * the external LED alone when the diags are running. + */ + if (dd->diag_client) + return; + + /* Allow override of LED display for, e.g. Locating system in rack */ + if (ppd->led_override) { + ltst = (ppd->led_override & QIB_LED_PHYS) ? + IB_PHYSPORTSTATE_LINKUP : IB_PHYSPORTSTATE_DISABLED, + lst = (ppd->led_override & QIB_LED_LOG) ? + IB_PORT_ACTIVE : IB_PORT_DOWN; + } else if (on) { + val = qib_read_kreg64(dd, kr_ibcstatus); + ltst = qib_6120_phys_portstate(val); + lst = qib_6120_iblink_state(val); + } else { + ltst = 0; + lst = 0; + } + + spin_lock_irqsave(&dd->cspec->gpio_lock, flags); + extctl = dd->cspec->extctrl & ~(SYM_MASK(EXTCtrl, LEDPriPortGreenOn) | + SYM_MASK(EXTCtrl, LEDPriPortYellowOn)); + + if (ltst == IB_PHYSPORTSTATE_LINKUP) + extctl |= SYM_MASK(EXTCtrl, LEDPriPortYellowOn); + if (lst == IB_PORT_ACTIVE) + extctl |= SYM_MASK(EXTCtrl, LEDPriPortGreenOn); + dd->cspec->extctrl = extctl; + qib_write_kreg(dd, kr_extctrl, extctl); + spin_unlock_irqrestore(&dd->cspec->gpio_lock, flags); +} + +static void qib_6120_free_irq(struct qib_devdata *dd) +{ + if (dd->cspec->irq) { + free_irq(dd->cspec->irq, dd); + dd->cspec->irq = 0; + } + qib_nomsi(dd); +} + +/** + * qib_6120_setup_cleanup - clean up any per-chip chip-specific stuff + * @dd: the qlogic_ib device + * + * This is called during driver unload. +*/ +static void qib_6120_setup_cleanup(struct qib_devdata *dd) +{ + qib_6120_free_irq(dd); + kfree(dd->cspec->cntrs); + kfree(dd->cspec->portcntrs); + if (dd->cspec->dummy_hdrq) { + dma_free_coherent(&dd->pcidev->dev, + ALIGN(dd->rcvhdrcnt * + dd->rcvhdrentsize * + sizeof(u32), PAGE_SIZE), + dd->cspec->dummy_hdrq, + dd->cspec->dummy_hdrq_phys); + dd->cspec->dummy_hdrq = NULL; + } +} + +static void qib_wantpiobuf_6120_intr(struct qib_devdata *dd, u32 needint) +{ + unsigned long flags; + + spin_lock_irqsave(&dd->sendctrl_lock, flags); + if (needint) + dd->sendctrl |= SYM_MASK(SendCtrl, PIOIntBufAvail); + else + dd->sendctrl &= ~SYM_MASK(SendCtrl, PIOIntBufAvail); + qib_write_kreg(dd, kr_sendctrl, dd->sendctrl); + qib_write_kreg(dd, kr_scratch, 0ULL); + spin_unlock_irqrestore(&dd->sendctrl_lock, flags); +} + +/* + * handle errors and unusual events first, separate function + * to improve cache hits for fast path interrupt handling + */ +static noinline void unlikely_6120_intr(struct qib_devdata *dd, u64 istat) +{ + if (unlikely(istat & ~QLOGIC_IB_I_BITSEXTANT)) + qib_dev_err(dd, "interrupt with unknown interrupts %Lx set\n", + istat & ~QLOGIC_IB_I_BITSEXTANT); + + if (istat & QLOGIC_IB_I_ERROR) { + u64 estat = 0; + + qib_stats.sps_errints++; + estat = qib_read_kreg64(dd, kr_errstatus); + if (!estat) + qib_devinfo(dd->pcidev, + "error interrupt (%Lx), but no error bits set!\n", + istat); + handle_6120_errors(dd, estat); + } + + if (istat & QLOGIC_IB_I_GPIO) { + u32 gpiostatus; + u32 to_clear = 0; + + /* + * GPIO_3..5 on IBA6120 Rev2 chips indicate + * errors that we need to count. + */ + gpiostatus = qib_read_kreg32(dd, kr_gpio_status); + /* First the error-counter case. */ + if (gpiostatus & GPIO_ERRINTR_MASK) { + /* want to clear the bits we see asserted. */ + to_clear |= (gpiostatus & GPIO_ERRINTR_MASK); + + /* + * Count appropriately, clear bits out of our copy, + * as they have been "handled". + */ + if (gpiostatus & (1 << GPIO_RXUVL_BIT)) + dd->cspec->rxfc_unsupvl_errs++; + if (gpiostatus & (1 << GPIO_OVRUN_BIT)) + dd->cspec->overrun_thresh_errs++; + if (gpiostatus & (1 << GPIO_LLI_BIT)) + dd->cspec->lli_errs++; + gpiostatus &= ~GPIO_ERRINTR_MASK; + } + if (gpiostatus) { + /* + * Some unexpected bits remain. If they could have + * caused the interrupt, complain and clear. + * To avoid repetition of this condition, also clear + * the mask. It is almost certainly due to error. + */ + const u32 mask = qib_read_kreg32(dd, kr_gpio_mask); + + /* + * Also check that the chip reflects our shadow, + * and report issues, If they caused the interrupt. + * we will suppress by refreshing from the shadow. + */ + if (mask & gpiostatus) { + to_clear |= (gpiostatus & mask); + dd->cspec->gpio_mask &= ~(gpiostatus & mask); + qib_write_kreg(dd, kr_gpio_mask, + dd->cspec->gpio_mask); + } + } + if (to_clear) + qib_write_kreg(dd, kr_gpio_clear, (u64) to_clear); + } +} + +static irqreturn_t qib_6120intr(int irq, void *data) +{ + struct qib_devdata *dd = data; + irqreturn_t ret; + u32 istat, ctxtrbits, rmask, crcs = 0; + unsigned i; + + if ((dd->flags & (QIB_PRESENT | QIB_BADINTR)) != QIB_PRESENT) { + /* + * This return value is not great, but we do not want the + * interrupt core code to remove our interrupt handler + * because we don't appear to be handling an interrupt + * during a chip reset. + */ + ret = IRQ_HANDLED; + goto bail; + } + + istat = qib_read_kreg32(dd, kr_intstatus); + + if (unlikely(!istat)) { + ret = IRQ_NONE; /* not our interrupt, or already handled */ + goto bail; + } + if (unlikely(istat == -1)) { + qib_bad_intrstatus(dd); + /* don't know if it was our interrupt or not */ + ret = IRQ_NONE; + goto bail; + } + + this_cpu_inc(*dd->int_counter); + + if (unlikely(istat & (~QLOGIC_IB_I_BITSEXTANT | + QLOGIC_IB_I_GPIO | QLOGIC_IB_I_ERROR))) + unlikely_6120_intr(dd, istat); + + /* + * Clear the interrupt bits we found set, relatively early, so we + * "know" know the chip will have seen this by the time we process + * the queue, and will re-interrupt if necessary. The processor + * itself won't take the interrupt again until we return. + */ + qib_write_kreg(dd, kr_intclear, istat); + + /* + * Handle kernel receive queues before checking for pio buffers + * available since receives can overflow; piobuf waiters can afford + * a few extra cycles, since they were waiting anyway. + */ + ctxtrbits = istat & + ((QLOGIC_IB_I_RCVAVAIL_MASK << QLOGIC_IB_I_RCVAVAIL_SHIFT) | + (QLOGIC_IB_I_RCVURG_MASK << QLOGIC_IB_I_RCVURG_SHIFT)); + if (ctxtrbits) { + rmask = (1U << QLOGIC_IB_I_RCVAVAIL_SHIFT) | + (1U << QLOGIC_IB_I_RCVURG_SHIFT); + for (i = 0; i < dd->first_user_ctxt; i++) { + if (ctxtrbits & rmask) { + ctxtrbits &= ~rmask; + crcs += qib_kreceive(dd->rcd[i], + &dd->cspec->lli_counter, + NULL); + } + rmask <<= 1; + } + if (crcs) { + u32 cntr = dd->cspec->lli_counter; + cntr += crcs; + if (cntr) { + if (cntr > dd->cspec->lli_thresh) { + dd->cspec->lli_counter = 0; + dd->cspec->lli_errs++; + } else + dd->cspec->lli_counter += cntr; + } + } + + + if (ctxtrbits) { + ctxtrbits = + (ctxtrbits >> QLOGIC_IB_I_RCVAVAIL_SHIFT) | + (ctxtrbits >> QLOGIC_IB_I_RCVURG_SHIFT); + qib_handle_urcv(dd, ctxtrbits); + } + } + + if ((istat & QLOGIC_IB_I_SPIOBUFAVAIL) && (dd->flags & QIB_INITTED)) + qib_ib_piobufavail(dd); + + ret = IRQ_HANDLED; +bail: + return ret; +} + +/* + * Set up our chip-specific interrupt handler + * The interrupt type has already been setup, so + * we just need to do the registration and error checking. + */ +static void qib_setup_6120_interrupt(struct qib_devdata *dd) +{ + /* + * If the chip supports added error indication via GPIO pins, + * enable interrupts on those bits so the interrupt routine + * can count the events. Also set flag so interrupt routine + * can know they are expected. + */ + if (SYM_FIELD(dd->revision, Revision_R, + ChipRevMinor) > 1) { + /* Rev2+ reports extra errors via internal GPIO pins */ + dd->cspec->gpio_mask |= GPIO_ERRINTR_MASK; + qib_write_kreg(dd, kr_gpio_mask, dd->cspec->gpio_mask); + } + + if (!dd->cspec->irq) + qib_dev_err(dd, + "irq is 0, BIOS error? Interrupts won't work\n"); + else { + int ret; + ret = request_irq(dd->cspec->irq, qib_6120intr, 0, + QIB_DRV_NAME, dd); + if (ret) + qib_dev_err(dd, + "Couldn't setup interrupt (irq=%d): %d\n", + dd->cspec->irq, ret); + } +} + +/** + * pe_boardname - fill in the board name + * @dd: the qlogic_ib device + * + * info is based on the board revision register + */ +static void pe_boardname(struct qib_devdata *dd) +{ + char *n; + u32 boardid, namelen; + + boardid = SYM_FIELD(dd->revision, Revision, + BoardID); + + switch (boardid) { + case 2: + n = "InfiniPath_QLE7140"; + break; + default: + qib_dev_err(dd, "Unknown 6120 board with ID %u\n", boardid); + n = "Unknown_InfiniPath_6120"; + break; + } + namelen = strlen(n) + 1; + dd->boardname = kmalloc(namelen, GFP_KERNEL); + if (!dd->boardname) + qib_dev_err(dd, "Failed allocation for board name: %s\n", n); + else + snprintf(dd->boardname, namelen, "%s", n); + + if (dd->majrev != 4 || !dd->minrev || dd->minrev > 2) + qib_dev_err(dd, + "Unsupported InfiniPath hardware revision %u.%u!\n", + dd->majrev, dd->minrev); + + snprintf(dd->boardversion, sizeof(dd->boardversion), + "ChipABI %u.%u, %s, InfiniPath%u %u.%u, SW Compat %u\n", + QIB_CHIP_VERS_MAJ, QIB_CHIP_VERS_MIN, dd->boardname, + (unsigned)SYM_FIELD(dd->revision, Revision_R, Arch), + dd->majrev, dd->minrev, + (unsigned)SYM_FIELD(dd->revision, Revision_R, SW)); + +} + +/* + * This routine sleeps, so it can only be called from user context, not + * from interrupt context. If we need interrupt context, we can split + * it into two routines. + */ +static int qib_6120_setup_reset(struct qib_devdata *dd) +{ + u64 val; + int i; + int ret; + u16 cmdval; + u8 int_line, clinesz; + + qib_pcie_getcmd(dd, &cmdval, &int_line, &clinesz); + + /* Use ERROR so it shows up in logs, etc. */ + qib_dev_err(dd, "Resetting InfiniPath unit %u\n", dd->unit); + + /* no interrupts till re-initted */ + qib_6120_set_intr_state(dd, 0); + + dd->cspec->ibdeltainprog = 0; + dd->cspec->ibsymdelta = 0; + dd->cspec->iblnkerrdelta = 0; + + /* + * Keep chip from being accessed until we are ready. Use + * writeq() directly, to allow the write even though QIB_PRESENT + * isn't set. + */ + dd->flags &= ~(QIB_INITTED | QIB_PRESENT); + /* so we check interrupts work again */ + dd->z_int_counter = qib_int_counter(dd); + val = dd->control | QLOGIC_IB_C_RESET; + writeq(val, &dd->kregbase[kr_control]); + mb(); /* prevent compiler re-ordering around actual reset */ + + for (i = 1; i <= 5; i++) { + /* + * Allow MBIST, etc. to complete; longer on each retry. + * We sometimes get machine checks from bus timeout if no + * response, so for now, make it *really* long. + */ + msleep(1000 + (1 + i) * 2000); + + qib_pcie_reenable(dd, cmdval, int_line, clinesz); + + /* + * Use readq directly, so we don't need to mark it as PRESENT + * until we get a successful indication that all is well. + */ + val = readq(&dd->kregbase[kr_revision]); + if (val == dd->revision) { + dd->flags |= QIB_PRESENT; /* it's back */ + ret = qib_reinit_intr(dd); + goto bail; + } + } + ret = 0; /* failed */ + +bail: + if (ret) { + if (qib_pcie_params(dd, dd->lbus_width, NULL, NULL)) + qib_dev_err(dd, + "Reset failed to setup PCIe or interrupts; continuing anyway\n"); + /* clear the reset error, init error/hwerror mask */ + qib_6120_init_hwerrors(dd); + /* for Rev2 error interrupts; nop for rev 1 */ + qib_write_kreg(dd, kr_gpio_mask, dd->cspec->gpio_mask); + /* clear the reset error, init error/hwerror mask */ + qib_6120_init_hwerrors(dd); + } + return ret; +} + +/** + * qib_6120_put_tid - write a TID in chip + * @dd: the qlogic_ib device + * @tidptr: pointer to the expected TID (in chip) to update + * @tidtype: RCVHQ_RCV_TYPE_EAGER (1) for eager, RCVHQ_RCV_TYPE_EXPECTED (0) + * for expected + * @pa: physical address of in memory buffer; tidinvalid if freeing + * + * This exists as a separate routine to allow for special locking etc. + * It's used for both the full cleanup on exit, as well as the normal + * setup and teardown. + */ +static void qib_6120_put_tid(struct qib_devdata *dd, u64 __iomem *tidptr, + u32 type, unsigned long pa) +{ + u32 __iomem *tidp32 = (u32 __iomem *)tidptr; + unsigned long flags; + int tidx; + spinlock_t *tidlockp; /* select appropriate spinlock */ + + if (!dd->kregbase) + return; + + if (pa != dd->tidinvalid) { + if (pa & ((1U << 11) - 1)) { + qib_dev_err(dd, "Physaddr %lx not 2KB aligned!\n", + pa); + return; + } + pa >>= 11; + if (pa & ~QLOGIC_IB_RT_ADDR_MASK) { + qib_dev_err(dd, + "Physical page address 0x%lx larger than supported\n", + pa); + return; + } + + if (type == RCVHQ_RCV_TYPE_EAGER) + pa |= dd->tidtemplate; + else /* for now, always full 4KB page */ + pa |= 2 << 29; + } + + /* + * Avoid chip issue by writing the scratch register + * before and after the TID, and with an io write barrier. + * We use a spinlock around the writes, so they can't intermix + * with other TID (eager or expected) writes (the chip problem + * is triggered by back to back TID writes). Unfortunately, this + * call can be done from interrupt level for the ctxt 0 eager TIDs, + * so we have to use irqsave locks. + */ + /* + * Assumes tidptr always > egrtidbase + * if type == RCVHQ_RCV_TYPE_EAGER. + */ + tidx = tidptr - dd->egrtidbase; + + tidlockp = (type == RCVHQ_RCV_TYPE_EAGER && tidx < dd->rcvhdrcnt) + ? &dd->cspec->kernel_tid_lock : &dd->cspec->user_tid_lock; + spin_lock_irqsave(tidlockp, flags); + qib_write_kreg(dd, kr_scratch, 0xfeeddeaf); + writel(pa, tidp32); + qib_write_kreg(dd, kr_scratch, 0xdeadbeef); + mmiowb(); + spin_unlock_irqrestore(tidlockp, flags); +} + +/** + * qib_6120_put_tid_2 - write a TID in chip, Revision 2 or higher + * @dd: the qlogic_ib device + * @tidptr: pointer to the expected TID (in chip) to update + * @tidtype: RCVHQ_RCV_TYPE_EAGER (1) for eager, RCVHQ_RCV_TYPE_EXPECTED (0) + * for expected + * @pa: physical address of in memory buffer; tidinvalid if freeing + * + * This exists as a separate routine to allow for selection of the + * appropriate "flavor". The static calls in cleanup just use the + * revision-agnostic form, as they are not performance critical. + */ +static void qib_6120_put_tid_2(struct qib_devdata *dd, u64 __iomem *tidptr, + u32 type, unsigned long pa) +{ + u32 __iomem *tidp32 = (u32 __iomem *)tidptr; + u32 tidx; + + if (!dd->kregbase) + return; + + if (pa != dd->tidinvalid) { + if (pa & ((1U << 11) - 1)) { + qib_dev_err(dd, "Physaddr %lx not 2KB aligned!\n", + pa); + return; + } + pa >>= 11; + if (pa & ~QLOGIC_IB_RT_ADDR_MASK) { + qib_dev_err(dd, + "Physical page address 0x%lx larger than supported\n", + pa); + return; + } + + if (type == RCVHQ_RCV_TYPE_EAGER) + pa |= dd->tidtemplate; + else /* for now, always full 4KB page */ + pa |= 2 << 29; + } + tidx = tidptr - dd->egrtidbase; + writel(pa, tidp32); + mmiowb(); +} + + +/** + * qib_6120_clear_tids - clear all TID entries for a context, expected and eager + * @dd: the qlogic_ib device + * @ctxt: the context + * + * clear all TID entries for a context, expected and eager. + * Used from qib_close(). On this chip, TIDs are only 32 bits, + * not 64, but they are still on 64 bit boundaries, so tidbase + * is declared as u64 * for the pointer math, even though we write 32 bits + */ +static void qib_6120_clear_tids(struct qib_devdata *dd, + struct qib_ctxtdata *rcd) +{ + u64 __iomem *tidbase; + unsigned long tidinv; + u32 ctxt; + int i; + + if (!dd->kregbase || !rcd) + return; + + ctxt = rcd->ctxt; + + tidinv = dd->tidinvalid; + tidbase = (u64 __iomem *) + ((char __iomem *)(dd->kregbase) + + dd->rcvtidbase + + ctxt * dd->rcvtidcnt * sizeof(*tidbase)); + + for (i = 0; i < dd->rcvtidcnt; i++) + /* use func pointer because could be one of two funcs */ + dd->f_put_tid(dd, &tidbase[i], RCVHQ_RCV_TYPE_EXPECTED, + tidinv); + + tidbase = (u64 __iomem *) + ((char __iomem *)(dd->kregbase) + + dd->rcvegrbase + + rcd->rcvegr_tid_base * sizeof(*tidbase)); + + for (i = 0; i < rcd->rcvegrcnt; i++) + /* use func pointer because could be one of two funcs */ + dd->f_put_tid(dd, &tidbase[i], RCVHQ_RCV_TYPE_EAGER, + tidinv); +} + +/** + * qib_6120_tidtemplate - setup constants for TID updates + * @dd: the qlogic_ib device + * + * We setup stuff that we use a lot, to avoid calculating each time + */ +static void qib_6120_tidtemplate(struct qib_devdata *dd) +{ + u32 egrsize = dd->rcvegrbufsize; + + /* + * For now, we always allocate 4KB buffers (at init) so we can + * receive max size packets. We may want a module parameter to + * specify 2KB or 4KB and/or make be per ctxt instead of per device + * for those who want to reduce memory footprint. Note that the + * rcvhdrentsize size must be large enough to hold the largest + * IB header (currently 96 bytes) that we expect to handle (plus of + * course the 2 dwords of RHF). + */ + if (egrsize == 2048) + dd->tidtemplate = 1U << 29; + else if (egrsize == 4096) + dd->tidtemplate = 2U << 29; + dd->tidinvalid = 0; +} + +int __attribute__((weak)) qib_unordered_wc(void) +{ + return 0; +} + +/** + * qib_6120_get_base_info - set chip-specific flags for user code + * @rcd: the qlogic_ib ctxt + * @kbase: qib_base_info pointer + * + * We set the PCIE flag because the lower bandwidth on PCIe vs + * HyperTransport can affect some user packet algorithms. + */ +static int qib_6120_get_base_info(struct qib_ctxtdata *rcd, + struct qib_base_info *kinfo) +{ + if (qib_unordered_wc()) + kinfo->spi_runtime_flags |= QIB_RUNTIME_FORCE_WC_ORDER; + + kinfo->spi_runtime_flags |= QIB_RUNTIME_PCIE | + QIB_RUNTIME_FORCE_PIOAVAIL | QIB_RUNTIME_PIO_REGSWAPPED; + return 0; +} + + +static struct qib_message_header * +qib_6120_get_msgheader(struct qib_devdata *dd, __le32 *rhf_addr) +{ + return (struct qib_message_header *) + &rhf_addr[sizeof(u64) / sizeof(u32)]; +} + +static void qib_6120_config_ctxts(struct qib_devdata *dd) +{ + dd->ctxtcnt = qib_read_kreg32(dd, kr_portcnt); + if (qib_n_krcv_queues > 1) { + dd->first_user_ctxt = qib_n_krcv_queues * dd->num_pports; + if (dd->first_user_ctxt > dd->ctxtcnt) + dd->first_user_ctxt = dd->ctxtcnt; + dd->qpn_mask = dd->first_user_ctxt <= 2 ? 2 : 6; + } else + dd->first_user_ctxt = dd->num_pports; + dd->n_krcv_queues = dd->first_user_ctxt; +} + +static void qib_update_6120_usrhead(struct qib_ctxtdata *rcd, u64 hd, + u32 updegr, u32 egrhd, u32 npkts) +{ + if (updegr) + qib_write_ureg(rcd->dd, ur_rcvegrindexhead, egrhd, rcd->ctxt); + mmiowb(); + qib_write_ureg(rcd->dd, ur_rcvhdrhead, hd, rcd->ctxt); + mmiowb(); +} + +static u32 qib_6120_hdrqempty(struct qib_ctxtdata *rcd) +{ + u32 head, tail; + + head = qib_read_ureg32(rcd->dd, ur_rcvhdrhead, rcd->ctxt); + if (rcd->rcvhdrtail_kvaddr) + tail = qib_get_rcvhdrtail(rcd); + else + tail = qib_read_ureg32(rcd->dd, ur_rcvhdrtail, rcd->ctxt); + return head == tail; +} + +/* + * Used when we close any ctxt, for DMA already in flight + * at close. Can't be done until we know hdrq size, so not + * early in chip init. + */ +static void alloc_dummy_hdrq(struct qib_devdata *dd) +{ + dd->cspec->dummy_hdrq = dma_alloc_coherent(&dd->pcidev->dev, + dd->rcd[0]->rcvhdrq_size, + &dd->cspec->dummy_hdrq_phys, + GFP_ATOMIC | __GFP_COMP); + if (!dd->cspec->dummy_hdrq) { + qib_devinfo(dd->pcidev, "Couldn't allocate dummy hdrq\n"); + /* fallback to just 0'ing */ + dd->cspec->dummy_hdrq_phys = 0UL; + } +} + +/* + * Modify the RCVCTRL register in chip-specific way. This + * is a function because bit positions and (future) register + * location is chip-specific, but the needed operations are + * generic. is a bit-mask because we often want to + * do multiple modifications. + */ +static void rcvctrl_6120_mod(struct qib_pportdata *ppd, unsigned int op, + int ctxt) +{ + struct qib_devdata *dd = ppd->dd; + u64 mask, val; + unsigned long flags; + + spin_lock_irqsave(&dd->cspec->rcvmod_lock, flags); + + if (op & QIB_RCVCTRL_TAILUPD_ENB) + dd->rcvctrl |= (1ULL << QLOGIC_IB_R_TAILUPD_SHIFT); + if (op & QIB_RCVCTRL_TAILUPD_DIS) + dd->rcvctrl &= ~(1ULL << QLOGIC_IB_R_TAILUPD_SHIFT); + if (op & QIB_RCVCTRL_PKEY_ENB) + dd->rcvctrl &= ~(1ULL << IBA6120_R_PKEY_DIS_SHIFT); + if (op & QIB_RCVCTRL_PKEY_DIS) + dd->rcvctrl |= (1ULL << IBA6120_R_PKEY_DIS_SHIFT); + if (ctxt < 0) + mask = (1ULL << dd->ctxtcnt) - 1; + else + mask = (1ULL << ctxt); + if (op & QIB_RCVCTRL_CTXT_ENB) { + /* always done for specific ctxt */ + dd->rcvctrl |= (mask << SYM_LSB(RcvCtrl, PortEnable)); + if (!(dd->flags & QIB_NODMA_RTAIL)) + dd->rcvctrl |= 1ULL << QLOGIC_IB_R_TAILUPD_SHIFT; + /* Write these registers before the context is enabled. */ + qib_write_kreg_ctxt(dd, kr_rcvhdrtailaddr, ctxt, + dd->rcd[ctxt]->rcvhdrqtailaddr_phys); + qib_write_kreg_ctxt(dd, kr_rcvhdraddr, ctxt, + dd->rcd[ctxt]->rcvhdrq_phys); + + if (ctxt == 0 && !dd->cspec->dummy_hdrq) + alloc_dummy_hdrq(dd); + } + if (op & QIB_RCVCTRL_CTXT_DIS) + dd->rcvctrl &= ~(mask << SYM_LSB(RcvCtrl, PortEnable)); + if (op & QIB_RCVCTRL_INTRAVAIL_ENB) + dd->rcvctrl |= (mask << QLOGIC_IB_R_INTRAVAIL_SHIFT); + if (op & QIB_RCVCTRL_INTRAVAIL_DIS) + dd->rcvctrl &= ~(mask << QLOGIC_IB_R_INTRAVAIL_SHIFT); + qib_write_kreg(dd, kr_rcvctrl, dd->rcvctrl); + if ((op & QIB_RCVCTRL_INTRAVAIL_ENB) && dd->rhdrhead_intr_off) { + /* arm rcv interrupt */ + val = qib_read_ureg32(dd, ur_rcvhdrhead, ctxt) | + dd->rhdrhead_intr_off; + qib_write_ureg(dd, ur_rcvhdrhead, val, ctxt); + } + if (op & QIB_RCVCTRL_CTXT_ENB) { + /* + * Init the context registers also; if we were + * disabled, tail and head should both be zero + * already from the enable, but since we don't + * know, we have to do it explicitly. + */ + val = qib_read_ureg32(dd, ur_rcvegrindextail, ctxt); + qib_write_ureg(dd, ur_rcvegrindexhead, val, ctxt); + + val = qib_read_ureg32(dd, ur_rcvhdrtail, ctxt); + dd->rcd[ctxt]->head = val; + /* If kctxt, interrupt on next receive. */ + if (ctxt < dd->first_user_ctxt) + val |= dd->rhdrhead_intr_off; + qib_write_ureg(dd, ur_rcvhdrhead, val, ctxt); + } + if (op & QIB_RCVCTRL_CTXT_DIS) { + /* + * Be paranoid, and never write 0's to these, just use an + * unused page. Of course, + * rcvhdraddr points to a large chunk of memory, so this + * could still trash things, but at least it won't trash + * page 0, and by disabling the ctxt, it should stop "soon", + * even if a packet or two is in already in flight after we + * disabled the ctxt. Only 6120 has this issue. + */ + if (ctxt >= 0) { + qib_write_kreg_ctxt(dd, kr_rcvhdrtailaddr, ctxt, + dd->cspec->dummy_hdrq_phys); + qib_write_kreg_ctxt(dd, kr_rcvhdraddr, ctxt, + dd->cspec->dummy_hdrq_phys); + } else { + unsigned i; + + for (i = 0; i < dd->cfgctxts; i++) { + qib_write_kreg_ctxt(dd, kr_rcvhdrtailaddr, + i, dd->cspec->dummy_hdrq_phys); + qib_write_kreg_ctxt(dd, kr_rcvhdraddr, + i, dd->cspec->dummy_hdrq_phys); + } + } + } + spin_unlock_irqrestore(&dd->cspec->rcvmod_lock, flags); +} + +/* + * Modify the SENDCTRL register in chip-specific way. This + * is a function there may be multiple such registers with + * slightly different layouts. Only operations actually used + * are implemented yet. + * Chip requires no back-back sendctrl writes, so write + * scratch register after writing sendctrl + */ +static void sendctrl_6120_mod(struct qib_pportdata *ppd, u32 op) +{ + struct qib_devdata *dd = ppd->dd; + u64 tmp_dd_sendctrl; + unsigned long flags; + + spin_lock_irqsave(&dd->sendctrl_lock, flags); + + /* First the ones that are "sticky", saved in shadow */ + if (op & QIB_SENDCTRL_CLEAR) + dd->sendctrl = 0; + if (op & QIB_SENDCTRL_SEND_DIS) + dd->sendctrl &= ~SYM_MASK(SendCtrl, PIOEnable); + else if (op & QIB_SENDCTRL_SEND_ENB) + dd->sendctrl |= SYM_MASK(SendCtrl, PIOEnable); + if (op & QIB_SENDCTRL_AVAIL_DIS) + dd->sendctrl &= ~SYM_MASK(SendCtrl, PIOBufAvailUpd); + else if (op & QIB_SENDCTRL_AVAIL_ENB) + dd->sendctrl |= SYM_MASK(SendCtrl, PIOBufAvailUpd); + + if (op & QIB_SENDCTRL_DISARM_ALL) { + u32 i, last; + + tmp_dd_sendctrl = dd->sendctrl; + /* + * disarm any that are not yet launched, disabling sends + * and updates until done. + */ + last = dd->piobcnt2k + dd->piobcnt4k; + tmp_dd_sendctrl &= + ~(SYM_MASK(SendCtrl, PIOEnable) | + SYM_MASK(SendCtrl, PIOBufAvailUpd)); + for (i = 0; i < last; i++) { + qib_write_kreg(dd, kr_sendctrl, tmp_dd_sendctrl | + SYM_MASK(SendCtrl, Disarm) | i); + qib_write_kreg(dd, kr_scratch, 0); + } + } + + tmp_dd_sendctrl = dd->sendctrl; + + if (op & QIB_SENDCTRL_FLUSH) + tmp_dd_sendctrl |= SYM_MASK(SendCtrl, Abort); + if (op & QIB_SENDCTRL_DISARM) + tmp_dd_sendctrl |= SYM_MASK(SendCtrl, Disarm) | + ((op & QIB_6120_SendCtrl_DisarmPIOBuf_RMASK) << + SYM_LSB(SendCtrl, DisarmPIOBuf)); + if (op & QIB_SENDCTRL_AVAIL_BLIP) + tmp_dd_sendctrl &= ~SYM_MASK(SendCtrl, PIOBufAvailUpd); + + qib_write_kreg(dd, kr_sendctrl, tmp_dd_sendctrl); + qib_write_kreg(dd, kr_scratch, 0); + + if (op & QIB_SENDCTRL_AVAIL_BLIP) { + qib_write_kreg(dd, kr_sendctrl, dd->sendctrl); + qib_write_kreg(dd, kr_scratch, 0); + } + + spin_unlock_irqrestore(&dd->sendctrl_lock, flags); + + if (op & QIB_SENDCTRL_FLUSH) { + u32 v; + /* + * ensure writes have hit chip, then do a few + * more reads, to allow DMA of pioavail registers + * to occur, so in-memory copy is in sync with + * the chip. Not always safe to sleep. + */ + v = qib_read_kreg32(dd, kr_scratch); + qib_write_kreg(dd, kr_scratch, v); + v = qib_read_kreg32(dd, kr_scratch); + qib_write_kreg(dd, kr_scratch, v); + qib_read_kreg32(dd, kr_scratch); + } +} + +/** + * qib_portcntr_6120 - read a per-port counter + * @dd: the qlogic_ib device + * @creg: the counter to snapshot + */ +static u64 qib_portcntr_6120(struct qib_pportdata *ppd, u32 reg) +{ + u64 ret = 0ULL; + struct qib_devdata *dd = ppd->dd; + u16 creg; + /* 0xffff for unimplemented or synthesized counters */ + static const u16 xlator[] = { + [QIBPORTCNTR_PKTSEND] = cr_pktsend, + [QIBPORTCNTR_WORDSEND] = cr_wordsend, + [QIBPORTCNTR_PSXMITDATA] = 0xffff, + [QIBPORTCNTR_PSXMITPKTS] = 0xffff, + [QIBPORTCNTR_PSXMITWAIT] = 0xffff, + [QIBPORTCNTR_SENDSTALL] = cr_sendstall, + [QIBPORTCNTR_PKTRCV] = cr_pktrcv, + [QIBPORTCNTR_PSRCVDATA] = 0xffff, + [QIBPORTCNTR_PSRCVPKTS] = 0xffff, + [QIBPORTCNTR_RCVEBP] = cr_rcvebp, + [QIBPORTCNTR_RCVOVFL] = cr_rcvovfl, + [QIBPORTCNTR_WORDRCV] = cr_wordrcv, + [QIBPORTCNTR_RXDROPPKT] = cr_rxdroppkt, + [QIBPORTCNTR_RXLOCALPHYERR] = 0xffff, + [QIBPORTCNTR_RXVLERR] = 0xffff, + [QIBPORTCNTR_ERRICRC] = cr_erricrc, + [QIBPORTCNTR_ERRVCRC] = cr_errvcrc, + [QIBPORTCNTR_ERRLPCRC] = cr_errlpcrc, + [QIBPORTCNTR_BADFORMAT] = cr_badformat, + [QIBPORTCNTR_ERR_RLEN] = cr_err_rlen, + [QIBPORTCNTR_IBSYMBOLERR] = cr_ibsymbolerr, + [QIBPORTCNTR_INVALIDRLEN] = cr_invalidrlen, + [QIBPORTCNTR_UNSUPVL] = cr_txunsupvl, + [QIBPORTCNTR_EXCESSBUFOVFL] = 0xffff, + [QIBPORTCNTR_ERRLINK] = cr_errlink, + [QIBPORTCNTR_IBLINKDOWN] = cr_iblinkdown, + [QIBPORTCNTR_IBLINKERRRECOV] = cr_iblinkerrrecov, + [QIBPORTCNTR_LLI] = 0xffff, + [QIBPORTCNTR_PSINTERVAL] = 0xffff, + [QIBPORTCNTR_PSSTART] = 0xffff, + [QIBPORTCNTR_PSSTAT] = 0xffff, + [QIBPORTCNTR_VL15PKTDROP] = 0xffff, + [QIBPORTCNTR_ERRPKEY] = cr_errpkey, + [QIBPORTCNTR_KHDROVFL] = 0xffff, + }; + + if (reg >= ARRAY_SIZE(xlator)) { + qib_devinfo(ppd->dd->pcidev, + "Unimplemented portcounter %u\n", reg); + goto done; + } + creg = xlator[reg]; + + /* handle counters requests not implemented as chip counters */ + if (reg == QIBPORTCNTR_LLI) + ret = dd->cspec->lli_errs; + else if (reg == QIBPORTCNTR_EXCESSBUFOVFL) + ret = dd->cspec->overrun_thresh_errs; + else if (reg == QIBPORTCNTR_KHDROVFL) { + int i; + + /* sum over all kernel contexts */ + for (i = 0; i < dd->first_user_ctxt; i++) + ret += read_6120_creg32(dd, cr_portovfl + i); + } else if (reg == QIBPORTCNTR_PSSTAT) + ret = dd->cspec->pma_sample_status; + if (creg == 0xffff) + goto done; + + /* + * only fast incrementing counters are 64bit; use 32 bit reads to + * avoid two independent reads when on opteron + */ + if (creg == cr_wordsend || creg == cr_wordrcv || + creg == cr_pktsend || creg == cr_pktrcv) + ret = read_6120_creg(dd, creg); + else + ret = read_6120_creg32(dd, creg); + if (creg == cr_ibsymbolerr) { + if (dd->cspec->ibdeltainprog) + ret -= ret - dd->cspec->ibsymsnap; + ret -= dd->cspec->ibsymdelta; + } else if (creg == cr_iblinkerrrecov) { + if (dd->cspec->ibdeltainprog) + ret -= ret - dd->cspec->iblnkerrsnap; + ret -= dd->cspec->iblnkerrdelta; + } + if (reg == QIBPORTCNTR_RXDROPPKT) /* add special cased count */ + ret += dd->cspec->rxfc_unsupvl_errs; + +done: + return ret; +} + +/* + * Device counter names (not port-specific), one line per stat, + * single string. Used by utilities like ipathstats to print the stats + * in a way which works for different versions of drivers, without changing + * the utility. Names need to be 12 chars or less (w/o newline), for proper + * display by utility. + * Non-error counters are first. + * Start of "error" conters is indicated by a leading "E " on the first + * "error" counter, and doesn't count in label length. + * The EgrOvfl list needs to be last so we truncate them at the configured + * context count for the device. + * cntr6120indices contains the corresponding register indices. + */ +static const char cntr6120names[] = + "Interrupts\n" + "HostBusStall\n" + "E RxTIDFull\n" + "RxTIDInvalid\n" + "Ctxt0EgrOvfl\n" + "Ctxt1EgrOvfl\n" + "Ctxt2EgrOvfl\n" + "Ctxt3EgrOvfl\n" + "Ctxt4EgrOvfl\n"; + +static const size_t cntr6120indices[] = { + cr_lbint, + cr_lbflowstall, + cr_errtidfull, + cr_errtidvalid, + cr_portovfl + 0, + cr_portovfl + 1, + cr_portovfl + 2, + cr_portovfl + 3, + cr_portovfl + 4, +}; + +/* + * same as cntr6120names and cntr6120indices, but for port-specific counters. + * portcntr6120indices is somewhat complicated by some registers needing + * adjustments of various kinds, and those are ORed with _PORT_VIRT_FLAG + */ +static const char portcntr6120names[] = + "TxPkt\n" + "TxFlowPkt\n" + "TxWords\n" + "RxPkt\n" + "RxFlowPkt\n" + "RxWords\n" + "TxFlowStall\n" + "E IBStatusChng\n" + "IBLinkDown\n" + "IBLnkRecov\n" + "IBRxLinkErr\n" + "IBSymbolErr\n" + "RxLLIErr\n" + "RxBadFormat\n" + "RxBadLen\n" + "RxBufOvrfl\n" + "RxEBP\n" + "RxFlowCtlErr\n" + "RxICRCerr\n" + "RxLPCRCerr\n" + "RxVCRCerr\n" + "RxInvalLen\n" + "RxInvalPKey\n" + "RxPktDropped\n" + "TxBadLength\n" + "TxDropped\n" + "TxInvalLen\n" + "TxUnderrun\n" + "TxUnsupVL\n" + ; + +#define _PORT_VIRT_FLAG 0x8000 /* "virtual", need adjustments */ +static const size_t portcntr6120indices[] = { + QIBPORTCNTR_PKTSEND | _PORT_VIRT_FLAG, + cr_pktsendflow, + QIBPORTCNTR_WORDSEND | _PORT_VIRT_FLAG, + QIBPORTCNTR_PKTRCV | _PORT_VIRT_FLAG, + cr_pktrcvflowctrl, + QIBPORTCNTR_WORDRCV | _PORT_VIRT_FLAG, + QIBPORTCNTR_SENDSTALL | _PORT_VIRT_FLAG, + cr_ibstatuschange, + QIBPORTCNTR_IBLINKDOWN | _PORT_VIRT_FLAG, + QIBPORTCNTR_IBLINKERRRECOV | _PORT_VIRT_FLAG, + QIBPORTCNTR_ERRLINK | _PORT_VIRT_FLAG, + QIBPORTCNTR_IBSYMBOLERR | _PORT_VIRT_FLAG, + QIBPORTCNTR_LLI | _PORT_VIRT_FLAG, + QIBPORTCNTR_BADFORMAT | _PORT_VIRT_FLAG, + QIBPORTCNTR_ERR_RLEN | _PORT_VIRT_FLAG, + QIBPORTCNTR_RCVOVFL | _PORT_VIRT_FLAG, + QIBPORTCNTR_RCVEBP | _PORT_VIRT_FLAG, + cr_rcvflowctrl_err, + QIBPORTCNTR_ERRICRC | _PORT_VIRT_FLAG, + QIBPORTCNTR_ERRLPCRC | _PORT_VIRT_FLAG, + QIBPORTCNTR_ERRVCRC | _PORT_VIRT_FLAG, + QIBPORTCNTR_INVALIDRLEN | _PORT_VIRT_FLAG, + QIBPORTCNTR_ERRPKEY | _PORT_VIRT_FLAG, + QIBPORTCNTR_RXDROPPKT | _PORT_VIRT_FLAG, + cr_invalidslen, + cr_senddropped, + cr_errslen, + cr_sendunderrun, + cr_txunsupvl, +}; + +/* do all the setup to make the counter reads efficient later */ +static void init_6120_cntrnames(struct qib_devdata *dd) +{ + int i, j = 0; + char *s; + + for (i = 0, s = (char *)cntr6120names; s && j <= dd->cfgctxts; + i++) { + /* we always have at least one counter before the egrovfl */ + if (!j && !strncmp("Ctxt0EgrOvfl", s + 1, 12)) + j = 1; + s = strchr(s + 1, '\n'); + if (s && j) + j++; + } + dd->cspec->ncntrs = i; + if (!s) + /* full list; size is without terminating null */ + dd->cspec->cntrnamelen = sizeof(cntr6120names) - 1; + else + dd->cspec->cntrnamelen = 1 + s - cntr6120names; + dd->cspec->cntrs = kmalloc(dd->cspec->ncntrs + * sizeof(u64), GFP_KERNEL); + if (!dd->cspec->cntrs) + qib_dev_err(dd, "Failed allocation for counters\n"); + + for (i = 0, s = (char *)portcntr6120names; s; i++) + s = strchr(s + 1, '\n'); + dd->cspec->nportcntrs = i - 1; + dd->cspec->portcntrnamelen = sizeof(portcntr6120names) - 1; + dd->cspec->portcntrs = kmalloc(dd->cspec->nportcntrs + * sizeof(u64), GFP_KERNEL); + if (!dd->cspec->portcntrs) + qib_dev_err(dd, "Failed allocation for portcounters\n"); +} + +static u32 qib_read_6120cntrs(struct qib_devdata *dd, loff_t pos, char **namep, + u64 **cntrp) +{ + u32 ret; + + if (namep) { + ret = dd->cspec->cntrnamelen; + if (pos >= ret) + ret = 0; /* final read after getting everything */ + else + *namep = (char *)cntr6120names; + } else { + u64 *cntr = dd->cspec->cntrs; + int i; + + ret = dd->cspec->ncntrs * sizeof(u64); + if (!cntr || pos >= ret) { + /* everything read, or couldn't get memory */ + ret = 0; + goto done; + } + if (pos >= ret) { + ret = 0; /* final read after getting everything */ + goto done; + } + *cntrp = cntr; + for (i = 0; i < dd->cspec->ncntrs; i++) + *cntr++ = read_6120_creg32(dd, cntr6120indices[i]); + } +done: + return ret; +} + +static u32 qib_read_6120portcntrs(struct qib_devdata *dd, loff_t pos, u32 port, + char **namep, u64 **cntrp) +{ + u32 ret; + + if (namep) { + ret = dd->cspec->portcntrnamelen; + if (pos >= ret) + ret = 0; /* final read after getting everything */ + else + *namep = (char *)portcntr6120names; + } else { + u64 *cntr = dd->cspec->portcntrs; + struct qib_pportdata *ppd = &dd->pport[port]; + int i; + + ret = dd->cspec->nportcntrs * sizeof(u64); + if (!cntr || pos >= ret) { + /* everything read, or couldn't get memory */ + ret = 0; + goto done; + } + *cntrp = cntr; + for (i = 0; i < dd->cspec->nportcntrs; i++) { + if (portcntr6120indices[i] & _PORT_VIRT_FLAG) + *cntr++ = qib_portcntr_6120(ppd, + portcntr6120indices[i] & + ~_PORT_VIRT_FLAG); + else + *cntr++ = read_6120_creg32(dd, + portcntr6120indices[i]); + } + } +done: + return ret; +} + +static void qib_chk_6120_errormask(struct qib_devdata *dd) +{ + static u32 fixed; + u32 ctrl; + unsigned long errormask; + unsigned long hwerrs; + + if (!dd->cspec->errormask || !(dd->flags & QIB_INITTED)) + return; + + errormask = qib_read_kreg64(dd, kr_errmask); + + if (errormask == dd->cspec->errormask) + return; + fixed++; + + hwerrs = qib_read_kreg64(dd, kr_hwerrstatus); + ctrl = qib_read_kreg32(dd, kr_control); + + qib_write_kreg(dd, kr_errmask, + dd->cspec->errormask); + + if ((hwerrs & dd->cspec->hwerrmask) || + (ctrl & QLOGIC_IB_C_FREEZEMODE)) { + qib_write_kreg(dd, kr_hwerrclear, 0ULL); + qib_write_kreg(dd, kr_errclear, 0ULL); + /* force re-interrupt of pending events, just in case */ + qib_write_kreg(dd, kr_intclear, 0ULL); + qib_devinfo(dd->pcidev, + "errormask fixed(%u) %lx->%lx, ctrl %x hwerr %lx\n", + fixed, errormask, (unsigned long)dd->cspec->errormask, + ctrl, hwerrs); + } +} + +/** + * qib_get_faststats - get word counters from chip before they overflow + * @opaque - contains a pointer to the qlogic_ib device qib_devdata + * + * This needs more work; in particular, decision on whether we really + * need traffic_wds done the way it is + * called from add_timer + */ +static void qib_get_6120_faststats(unsigned long opaque) +{ + struct qib_devdata *dd = (struct qib_devdata *) opaque; + struct qib_pportdata *ppd = dd->pport; + unsigned long flags; + u64 traffic_wds; + + /* + * don't access the chip while running diags, or memory diags can + * fail + */ + if (!(dd->flags & QIB_INITTED) || dd->diag_client) + /* but re-arm the timer, for diags case; won't hurt other */ + goto done; + + /* + * We now try to maintain an activity timer, based on traffic + * exceeding a threshold, so we need to check the word-counts + * even if they are 64-bit. + */ + traffic_wds = qib_portcntr_6120(ppd, cr_wordsend) + + qib_portcntr_6120(ppd, cr_wordrcv); + spin_lock_irqsave(&dd->eep_st_lock, flags); + traffic_wds -= dd->traffic_wds; + dd->traffic_wds += traffic_wds; + if (traffic_wds >= QIB_TRAFFIC_ACTIVE_THRESHOLD) + atomic_add(5, &dd->active_time); /* S/B #define */ + spin_unlock_irqrestore(&dd->eep_st_lock, flags); + + qib_chk_6120_errormask(dd); +done: + mod_timer(&dd->stats_timer, jiffies + HZ * ACTIVITY_TIMER); +} + +/* no interrupt fallback for these chips */ +static int qib_6120_nointr_fallback(struct qib_devdata *dd) +{ + return 0; +} + +/* + * reset the XGXS (between serdes and IBC). Slightly less intrusive + * than resetting the IBC or external link state, and useful in some + * cases to cause some retraining. To do this right, we reset IBC + * as well. + */ +static void qib_6120_xgxs_reset(struct qib_pportdata *ppd) +{ + u64 val, prev_val; + struct qib_devdata *dd = ppd->dd; + + prev_val = qib_read_kreg64(dd, kr_xgxs_cfg); + val = prev_val | QLOGIC_IB_XGXS_RESET; + prev_val &= ~QLOGIC_IB_XGXS_RESET; /* be sure */ + qib_write_kreg(dd, kr_control, + dd->control & ~QLOGIC_IB_C_LINKENABLE); + qib_write_kreg(dd, kr_xgxs_cfg, val); + qib_read_kreg32(dd, kr_scratch); + qib_write_kreg(dd, kr_xgxs_cfg, prev_val); + qib_write_kreg(dd, kr_control, dd->control); +} + +static int qib_6120_get_ib_cfg(struct qib_pportdata *ppd, int which) +{ + int ret; + + switch (which) { + case QIB_IB_CFG_LWID: + ret = ppd->link_width_active; + break; + + case QIB_IB_CFG_SPD: + ret = ppd->link_speed_active; + break; + + case QIB_IB_CFG_LWID_ENB: + ret = ppd->link_width_enabled; + break; + + case QIB_IB_CFG_SPD_ENB: + ret = ppd->link_speed_enabled; + break; + + case QIB_IB_CFG_OP_VLS: + ret = ppd->vls_operational; + break; + + case QIB_IB_CFG_VL_HIGH_CAP: + ret = 0; + break; + + case QIB_IB_CFG_VL_LOW_CAP: + ret = 0; + break; + + case QIB_IB_CFG_OVERRUN_THRESH: /* IB overrun threshold */ + ret = SYM_FIELD(ppd->dd->cspec->ibcctrl, IBCCtrl, + OverrunThreshold); + break; + + case QIB_IB_CFG_PHYERR_THRESH: /* IB PHY error threshold */ + ret = SYM_FIELD(ppd->dd->cspec->ibcctrl, IBCCtrl, + PhyerrThreshold); + break; + + case QIB_IB_CFG_LINKDEFAULT: /* IB link default (sleep/poll) */ + /* will only take effect when the link state changes */ + ret = (ppd->dd->cspec->ibcctrl & + SYM_MASK(IBCCtrl, LinkDownDefaultState)) ? + IB_LINKINITCMD_SLEEP : IB_LINKINITCMD_POLL; + break; + + case QIB_IB_CFG_HRTBT: /* Get Heartbeat off/enable/auto */ + ret = 0; /* no heartbeat on this chip */ + break; + + case QIB_IB_CFG_PMA_TICKS: + ret = 250; /* 1 usec. */ + break; + + default: + ret = -EINVAL; + break; + } + return ret; +} + +/* + * We assume range checking is already done, if needed. + */ +static int qib_6120_set_ib_cfg(struct qib_pportdata *ppd, int which, u32 val) +{ + struct qib_devdata *dd = ppd->dd; + int ret = 0; + u64 val64; + u16 lcmd, licmd; + + switch (which) { + case QIB_IB_CFG_LWID_ENB: + ppd->link_width_enabled = val; + break; + + case QIB_IB_CFG_SPD_ENB: + ppd->link_speed_enabled = val; + break; + + case QIB_IB_CFG_OVERRUN_THRESH: /* IB overrun threshold */ + val64 = SYM_FIELD(dd->cspec->ibcctrl, IBCCtrl, + OverrunThreshold); + if (val64 != val) { + dd->cspec->ibcctrl &= + ~SYM_MASK(IBCCtrl, OverrunThreshold); + dd->cspec->ibcctrl |= (u64) val << + SYM_LSB(IBCCtrl, OverrunThreshold); + qib_write_kreg(dd, kr_ibcctrl, dd->cspec->ibcctrl); + qib_write_kreg(dd, kr_scratch, 0); + } + break; + + case QIB_IB_CFG_PHYERR_THRESH: /* IB PHY error threshold */ + val64 = SYM_FIELD(dd->cspec->ibcctrl, IBCCtrl, + PhyerrThreshold); + if (val64 != val) { + dd->cspec->ibcctrl &= + ~SYM_MASK(IBCCtrl, PhyerrThreshold); + dd->cspec->ibcctrl |= (u64) val << + SYM_LSB(IBCCtrl, PhyerrThreshold); + qib_write_kreg(dd, kr_ibcctrl, dd->cspec->ibcctrl); + qib_write_kreg(dd, kr_scratch, 0); + } + break; + + case QIB_IB_CFG_PKEYS: /* update pkeys */ + val64 = (u64) ppd->pkeys[0] | ((u64) ppd->pkeys[1] << 16) | + ((u64) ppd->pkeys[2] << 32) | + ((u64) ppd->pkeys[3] << 48); + qib_write_kreg(dd, kr_partitionkey, val64); + break; + + case QIB_IB_CFG_LINKDEFAULT: /* IB link default (sleep/poll) */ + /* will only take effect when the link state changes */ + if (val == IB_LINKINITCMD_POLL) + dd->cspec->ibcctrl &= + ~SYM_MASK(IBCCtrl, LinkDownDefaultState); + else /* SLEEP */ + dd->cspec->ibcctrl |= + SYM_MASK(IBCCtrl, LinkDownDefaultState); + qib_write_kreg(dd, kr_ibcctrl, dd->cspec->ibcctrl); + qib_write_kreg(dd, kr_scratch, 0); + break; + + case QIB_IB_CFG_MTU: /* update the MTU in IBC */ + /* + * Update our housekeeping variables, and set IBC max + * size, same as init code; max IBC is max we allow in + * buffer, less the qword pbc, plus 1 for ICRC, in dwords + * Set even if it's unchanged, print debug message only + * on changes. + */ + val = (ppd->ibmaxlen >> 2) + 1; + dd->cspec->ibcctrl &= ~SYM_MASK(IBCCtrl, MaxPktLen); + dd->cspec->ibcctrl |= (u64)val << + SYM_LSB(IBCCtrl, MaxPktLen); + qib_write_kreg(dd, kr_ibcctrl, dd->cspec->ibcctrl); + qib_write_kreg(dd, kr_scratch, 0); + break; + + case QIB_IB_CFG_LSTATE: /* set the IB link state */ + switch (val & 0xffff0000) { + case IB_LINKCMD_DOWN: + lcmd = QLOGIC_IB_IBCC_LINKCMD_DOWN; + if (!dd->cspec->ibdeltainprog) { + dd->cspec->ibdeltainprog = 1; + dd->cspec->ibsymsnap = + read_6120_creg32(dd, cr_ibsymbolerr); + dd->cspec->iblnkerrsnap = + read_6120_creg32(dd, cr_iblinkerrrecov); + } + break; + + case IB_LINKCMD_ARMED: + lcmd = QLOGIC_IB_IBCC_LINKCMD_ARMED; + break; + + case IB_LINKCMD_ACTIVE: + lcmd = QLOGIC_IB_IBCC_LINKCMD_ACTIVE; + break; + + default: + ret = -EINVAL; + qib_dev_err(dd, "bad linkcmd req 0x%x\n", val >> 16); + goto bail; + } + switch (val & 0xffff) { + case IB_LINKINITCMD_NOP: + licmd = 0; + break; + + case IB_LINKINITCMD_POLL: + licmd = QLOGIC_IB_IBCC_LINKINITCMD_POLL; + break; + + case IB_LINKINITCMD_SLEEP: + licmd = QLOGIC_IB_IBCC_LINKINITCMD_SLEEP; + break; + + case IB_LINKINITCMD_DISABLE: + licmd = QLOGIC_IB_IBCC_LINKINITCMD_DISABLE; + break; + + default: + ret = -EINVAL; + qib_dev_err(dd, "bad linkinitcmd req 0x%x\n", + val & 0xffff); + goto bail; + } + qib_set_ib_6120_lstate(ppd, lcmd, licmd); + goto bail; + + case QIB_IB_CFG_HRTBT: + ret = -EINVAL; + break; + + default: + ret = -EINVAL; + } +bail: + return ret; +} + +static int qib_6120_set_loopback(struct qib_pportdata *ppd, const char *what) +{ + int ret = 0; + if (!strncmp(what, "ibc", 3)) { + ppd->dd->cspec->ibcctrl |= SYM_MASK(IBCCtrl, Loopback); + qib_devinfo(ppd->dd->pcidev, "Enabling IB%u:%u IBC loopback\n", + ppd->dd->unit, ppd->port); + } else if (!strncmp(what, "off", 3)) { + ppd->dd->cspec->ibcctrl &= ~SYM_MASK(IBCCtrl, Loopback); + qib_devinfo(ppd->dd->pcidev, + "Disabling IB%u:%u IBC loopback (normal)\n", + ppd->dd->unit, ppd->port); + } else + ret = -EINVAL; + if (!ret) { + qib_write_kreg(ppd->dd, kr_ibcctrl, ppd->dd->cspec->ibcctrl); + qib_write_kreg(ppd->dd, kr_scratch, 0); + } + return ret; +} + +static void pma_6120_timer(unsigned long data) +{ + struct qib_pportdata *ppd = (struct qib_pportdata *)data; + struct qib_chip_specific *cs = ppd->dd->cspec; + struct qib_ibport *ibp = &ppd->ibport_data; + unsigned long flags; + + spin_lock_irqsave(&ibp->lock, flags); + if (cs->pma_sample_status == IB_PMA_SAMPLE_STATUS_STARTED) { + cs->pma_sample_status = IB_PMA_SAMPLE_STATUS_RUNNING; + qib_snapshot_counters(ppd, &cs->sword, &cs->rword, + &cs->spkts, &cs->rpkts, &cs->xmit_wait); + mod_timer(&cs->pma_timer, + jiffies + usecs_to_jiffies(ibp->pma_sample_interval)); + } else if (cs->pma_sample_status == IB_PMA_SAMPLE_STATUS_RUNNING) { + u64 ta, tb, tc, td, te; + + cs->pma_sample_status = IB_PMA_SAMPLE_STATUS_DONE; + qib_snapshot_counters(ppd, &ta, &tb, &tc, &td, &te); + + cs->sword = ta - cs->sword; + cs->rword = tb - cs->rword; + cs->spkts = tc - cs->spkts; + cs->rpkts = td - cs->rpkts; + cs->xmit_wait = te - cs->xmit_wait; + } + spin_unlock_irqrestore(&ibp->lock, flags); +} + +/* + * Note that the caller has the ibp->lock held. + */ +static void qib_set_cntr_6120_sample(struct qib_pportdata *ppd, u32 intv, + u32 start) +{ + struct qib_chip_specific *cs = ppd->dd->cspec; + + if (start && intv) { + cs->pma_sample_status = IB_PMA_SAMPLE_STATUS_STARTED; + mod_timer(&cs->pma_timer, jiffies + usecs_to_jiffies(start)); + } else if (intv) { + cs->pma_sample_status = IB_PMA_SAMPLE_STATUS_RUNNING; + qib_snapshot_counters(ppd, &cs->sword, &cs->rword, + &cs->spkts, &cs->rpkts, &cs->xmit_wait); + mod_timer(&cs->pma_timer, jiffies + usecs_to_jiffies(intv)); + } else { + cs->pma_sample_status = IB_PMA_SAMPLE_STATUS_DONE; + cs->sword = 0; + cs->rword = 0; + cs->spkts = 0; + cs->rpkts = 0; + cs->xmit_wait = 0; + } +} + +static u32 qib_6120_iblink_state(u64 ibcs) +{ + u32 state = (u32)SYM_FIELD(ibcs, IBCStatus, LinkState); + + switch (state) { + case IB_6120_L_STATE_INIT: + state = IB_PORT_INIT; + break; + case IB_6120_L_STATE_ARM: + state = IB_PORT_ARMED; + break; + case IB_6120_L_STATE_ACTIVE: + /* fall through */ + case IB_6120_L_STATE_ACT_DEFER: + state = IB_PORT_ACTIVE; + break; + default: /* fall through */ + case IB_6120_L_STATE_DOWN: + state = IB_PORT_DOWN; + break; + } + return state; +} + +/* returns the IBTA port state, rather than the IBC link training state */ +static u8 qib_6120_phys_portstate(u64 ibcs) +{ + u8 state = (u8)SYM_FIELD(ibcs, IBCStatus, LinkTrainingState); + return qib_6120_physportstate[state]; +} + +static int qib_6120_ib_updown(struct qib_pportdata *ppd, int ibup, u64 ibcs) +{ + unsigned long flags; + + spin_lock_irqsave(&ppd->lflags_lock, flags); + ppd->lflags &= ~QIBL_IB_FORCE_NOTIFY; + spin_unlock_irqrestore(&ppd->lflags_lock, flags); + + if (ibup) { + if (ppd->dd->cspec->ibdeltainprog) { + ppd->dd->cspec->ibdeltainprog = 0; + ppd->dd->cspec->ibsymdelta += + read_6120_creg32(ppd->dd, cr_ibsymbolerr) - + ppd->dd->cspec->ibsymsnap; + ppd->dd->cspec->iblnkerrdelta += + read_6120_creg32(ppd->dd, cr_iblinkerrrecov) - + ppd->dd->cspec->iblnkerrsnap; + } + qib_hol_init(ppd); + } else { + ppd->dd->cspec->lli_counter = 0; + if (!ppd->dd->cspec->ibdeltainprog) { + ppd->dd->cspec->ibdeltainprog = 1; + ppd->dd->cspec->ibsymsnap = + read_6120_creg32(ppd->dd, cr_ibsymbolerr); + ppd->dd->cspec->iblnkerrsnap = + read_6120_creg32(ppd->dd, cr_iblinkerrrecov); + } + qib_hol_down(ppd); + } + + qib_6120_setup_setextled(ppd, ibup); + + return 0; +} + +/* Does read/modify/write to appropriate registers to + * set output and direction bits selected by mask. + * these are in their canonical postions (e.g. lsb of + * dir will end up in D48 of extctrl on existing chips). + * returns contents of GP Inputs. + */ +static int gpio_6120_mod(struct qib_devdata *dd, u32 out, u32 dir, u32 mask) +{ + u64 read_val, new_out; + unsigned long flags; + + if (mask) { + /* some bits being written, lock access to GPIO */ + dir &= mask; + out &= mask; + spin_lock_irqsave(&dd->cspec->gpio_lock, flags); + dd->cspec->extctrl &= ~((u64)mask << SYM_LSB(EXTCtrl, GPIOOe)); + dd->cspec->extctrl |= ((u64) dir << SYM_LSB(EXTCtrl, GPIOOe)); + new_out = (dd->cspec->gpio_out & ~mask) | out; + + qib_write_kreg(dd, kr_extctrl, dd->cspec->extctrl); + qib_write_kreg(dd, kr_gpio_out, new_out); + dd->cspec->gpio_out = new_out; + spin_unlock_irqrestore(&dd->cspec->gpio_lock, flags); + } + /* + * It is unlikely that a read at this time would get valid + * data on a pin whose direction line was set in the same + * call to this function. We include the read here because + * that allows us to potentially combine a change on one pin with + * a read on another, and because the old code did something like + * this. + */ + read_val = qib_read_kreg64(dd, kr_extstatus); + return SYM_FIELD(read_val, EXTStatus, GPIOIn); +} + +/* + * Read fundamental info we need to use the chip. These are + * the registers that describe chip capabilities, and are + * saved in shadow registers. + */ +static void get_6120_chip_params(struct qib_devdata *dd) +{ + u64 val; + u32 piobufs; + int mtu; + + dd->uregbase = qib_read_kreg32(dd, kr_userregbase); + + dd->rcvtidcnt = qib_read_kreg32(dd, kr_rcvtidcnt); + dd->rcvtidbase = qib_read_kreg32(dd, kr_rcvtidbase); + dd->rcvegrbase = qib_read_kreg32(dd, kr_rcvegrbase); + dd->palign = qib_read_kreg32(dd, kr_palign); + dd->piobufbase = qib_read_kreg64(dd, kr_sendpiobufbase); + dd->pio2k_bufbase = dd->piobufbase & 0xffffffff; + + dd->rcvhdrcnt = qib_read_kreg32(dd, kr_rcvegrcnt); + + val = qib_read_kreg64(dd, kr_sendpiosize); + dd->piosize2k = val & ~0U; + dd->piosize4k = val >> 32; + + mtu = ib_mtu_enum_to_int(qib_ibmtu); + if (mtu == -1) + mtu = QIB_DEFAULT_MTU; + dd->pport->ibmtu = (u32)mtu; + + val = qib_read_kreg64(dd, kr_sendpiobufcnt); + dd->piobcnt2k = val & ~0U; + dd->piobcnt4k = val >> 32; + dd->last_pio = dd->piobcnt4k + dd->piobcnt2k - 1; + /* these may be adjusted in init_chip_wc_pat() */ + dd->pio2kbase = (u32 __iomem *) + (((char __iomem *)dd->kregbase) + dd->pio2k_bufbase); + if (dd->piobcnt4k) { + dd->pio4kbase = (u32 __iomem *) + (((char __iomem *) dd->kregbase) + + (dd->piobufbase >> 32)); + /* + * 4K buffers take 2 pages; we use roundup just to be + * paranoid; we calculate it once here, rather than on + * ever buf allocate + */ + dd->align4k = ALIGN(dd->piosize4k, dd->palign); + } + + piobufs = dd->piobcnt4k + dd->piobcnt2k; + + dd->pioavregs = ALIGN(piobufs, sizeof(u64) * BITS_PER_BYTE / 2) / + (sizeof(u64) * BITS_PER_BYTE / 2); +} + +/* + * The chip base addresses in cspec and cpspec have to be set + * after possible init_chip_wc_pat(), rather than in + * get_6120_chip_params(), so split out as separate function + */ +static void set_6120_baseaddrs(struct qib_devdata *dd) +{ + u32 cregbase; + cregbase = qib_read_kreg32(dd, kr_counterregbase); + dd->cspec->cregbase = (u64 __iomem *) + ((char __iomem *) dd->kregbase + cregbase); + + dd->egrtidbase = (u64 __iomem *) + ((char __iomem *) dd->kregbase + dd->rcvegrbase); +} + +/* + * Write the final few registers that depend on some of the + * init setup. Done late in init, just before bringing up + * the serdes. + */ +static int qib_late_6120_initreg(struct qib_devdata *dd) +{ + int ret = 0; + u64 val; + + qib_write_kreg(dd, kr_rcvhdrentsize, dd->rcvhdrentsize); + qib_write_kreg(dd, kr_rcvhdrsize, dd->rcvhdrsize); + qib_write_kreg(dd, kr_rcvhdrcnt, dd->rcvhdrcnt); + qib_write_kreg(dd, kr_sendpioavailaddr, dd->pioavailregs_phys); + val = qib_read_kreg64(dd, kr_sendpioavailaddr); + if (val != dd->pioavailregs_phys) { + qib_dev_err(dd, + "Catastrophic software error, SendPIOAvailAddr written as %lx, read back as %llx\n", + (unsigned long) dd->pioavailregs_phys, + (unsigned long long) val); + ret = -EINVAL; + } + return ret; +} + +static int init_6120_variables(struct qib_devdata *dd) +{ + int ret = 0; + struct qib_pportdata *ppd; + u32 sbufs; + + ppd = (struct qib_pportdata *)(dd + 1); + dd->pport = ppd; + dd->num_pports = 1; + + dd->cspec = (struct qib_chip_specific *)(ppd + dd->num_pports); + ppd->cpspec = NULL; /* not used in this chip */ + + spin_lock_init(&dd->cspec->kernel_tid_lock); + spin_lock_init(&dd->cspec->user_tid_lock); + spin_lock_init(&dd->cspec->rcvmod_lock); + spin_lock_init(&dd->cspec->gpio_lock); + + /* we haven't yet set QIB_PRESENT, so use read directly */ + dd->revision = readq(&dd->kregbase[kr_revision]); + + if ((dd->revision & 0xffffffffU) == 0xffffffffU) { + qib_dev_err(dd, + "Revision register read failure, giving up initialization\n"); + ret = -ENODEV; + goto bail; + } + dd->flags |= QIB_PRESENT; /* now register routines work */ + + dd->majrev = (u8) SYM_FIELD(dd->revision, Revision_R, + ChipRevMajor); + dd->minrev = (u8) SYM_FIELD(dd->revision, Revision_R, + ChipRevMinor); + + get_6120_chip_params(dd); + pe_boardname(dd); /* fill in boardname */ + + /* + * GPIO bits for TWSI data and clock, + * used for serial EEPROM. + */ + dd->gpio_sda_num = _QIB_GPIO_SDA_NUM; + dd->gpio_scl_num = _QIB_GPIO_SCL_NUM; + dd->twsi_eeprom_dev = QIB_TWSI_NO_DEV; + + if (qib_unordered_wc()) + dd->flags |= QIB_PIO_FLUSH_WC; + + /* + * EEPROM error log 0 is TXE Parity errors. 1 is RXE Parity. + * 2 is Some Misc, 3 is reserved for future. + */ + dd->eep_st_masks[0].hwerrs_to_log = HWE_MASK(TXEMemParityErr); + + /* Ignore errors in PIO/PBC on systems with unordered write-combining */ + if (qib_unordered_wc()) + dd->eep_st_masks[0].hwerrs_to_log &= ~TXE_PIO_PARITY; + + dd->eep_st_masks[1].hwerrs_to_log = HWE_MASK(RXEMemParityErr); + + dd->eep_st_masks[2].errs_to_log = ERR_MASK(ResetNegated); + + ret = qib_init_pportdata(ppd, dd, 0, 1); + if (ret) + goto bail; + ppd->link_width_supported = IB_WIDTH_1X | IB_WIDTH_4X; + ppd->link_speed_supported = QIB_IB_SDR; + ppd->link_width_enabled = IB_WIDTH_4X; + ppd->link_speed_enabled = ppd->link_speed_supported; + /* these can't change for this chip, so set once */ + ppd->link_width_active = ppd->link_width_enabled; + ppd->link_speed_active = ppd->link_speed_enabled; + ppd->vls_supported = IB_VL_VL0; + ppd->vls_operational = ppd->vls_supported; + + dd->rcvhdrentsize = QIB_RCVHDR_ENTSIZE; + dd->rcvhdrsize = QIB_DFLT_RCVHDRSIZE; + dd->rhf_offset = 0; + + /* we always allocate at least 2048 bytes for eager buffers */ + ret = ib_mtu_enum_to_int(qib_ibmtu); + dd->rcvegrbufsize = ret != -1 ? max(ret, 2048) : QIB_DEFAULT_MTU; + BUG_ON(!is_power_of_2(dd->rcvegrbufsize)); + dd->rcvegrbufsize_shift = ilog2(dd->rcvegrbufsize); + + qib_6120_tidtemplate(dd); + + /* + * We can request a receive interrupt for 1 or + * more packets from current offset. For now, we set this + * up for a single packet. + */ + dd->rhdrhead_intr_off = 1ULL << 32; + + /* setup the stats timer; the add_timer is done at end of init */ + init_timer(&dd->stats_timer); + dd->stats_timer.function = qib_get_6120_faststats; + dd->stats_timer.data = (unsigned long) dd; + + init_timer(&dd->cspec->pma_timer); + dd->cspec->pma_timer.function = pma_6120_timer; + dd->cspec->pma_timer.data = (unsigned long) ppd; + + dd->ureg_align = qib_read_kreg32(dd, kr_palign); + + dd->piosize2kmax_dwords = dd->piosize2k >> 2; + qib_6120_config_ctxts(dd); + qib_set_ctxtcnt(dd); + + if (qib_wc_pat) { + ret = init_chip_wc_pat(dd, 0); + if (ret) + goto bail; + } + set_6120_baseaddrs(dd); /* set chip access pointers now */ + + ret = 0; + if (qib_mini_init) + goto bail; + + qib_num_cfg_vls = 1; /* if any 6120's, only one VL */ + + ret = qib_create_ctxts(dd); + init_6120_cntrnames(dd); + + /* use all of 4KB buffers for the kernel, otherwise 16 */ + sbufs = dd->piobcnt4k ? dd->piobcnt4k : 16; + + dd->lastctxt_piobuf = dd->piobcnt2k + dd->piobcnt4k - sbufs; + dd->pbufsctxt = dd->lastctxt_piobuf / + (dd->cfgctxts - dd->first_user_ctxt); + + if (ret) + goto bail; +bail: + return ret; +} + +/* + * For this chip, we want to use the same buffer every time + * when we are trying to bring the link up (they are always VL15 + * packets). At that link state the packet should always go out immediately + * (or at least be discarded at the tx interface if the link is down). + * If it doesn't, and the buffer isn't available, that means some other + * sender has gotten ahead of us, and is preventing our packet from going + * out. In that case, we flush all packets, and try again. If that still + * fails, we fail the request, and hope things work the next time around. + * + * We don't need very complicated heuristics on whether the packet had + * time to go out or not, since even at SDR 1X, it goes out in very short + * time periods, covered by the chip reads done here and as part of the + * flush. + */ +static u32 __iomem *get_6120_link_buf(struct qib_pportdata *ppd, u32 *bnum) +{ + u32 __iomem *buf; + u32 lbuf = ppd->dd->piobcnt2k + ppd->dd->piobcnt4k - 1; + + /* + * always blip to get avail list updated, since it's almost + * always needed, and is fairly cheap. + */ + sendctrl_6120_mod(ppd->dd->pport, QIB_SENDCTRL_AVAIL_BLIP); + qib_read_kreg64(ppd->dd, kr_scratch); /* extra chip flush */ + buf = qib_getsendbuf_range(ppd->dd, bnum, lbuf, lbuf); + if (buf) + goto done; + + sendctrl_6120_mod(ppd, QIB_SENDCTRL_DISARM_ALL | QIB_SENDCTRL_FLUSH | + QIB_SENDCTRL_AVAIL_BLIP); + ppd->dd->upd_pio_shadow = 1; /* update our idea of what's busy */ + qib_read_kreg64(ppd->dd, kr_scratch); /* extra chip flush */ + buf = qib_getsendbuf_range(ppd->dd, bnum, lbuf, lbuf); +done: + return buf; +} + +static u32 __iomem *qib_6120_getsendbuf(struct qib_pportdata *ppd, u64 pbc, + u32 *pbufnum) +{ + u32 first, last, plen = pbc & QIB_PBC_LENGTH_MASK; + struct qib_devdata *dd = ppd->dd; + u32 __iomem *buf; + + if (((pbc >> 32) & PBC_6120_VL15_SEND_CTRL) && + !(ppd->lflags & (QIBL_IB_AUTONEG_INPROG | QIBL_LINKACTIVE))) + buf = get_6120_link_buf(ppd, pbufnum); + else { + + if ((plen + 1) > dd->piosize2kmax_dwords) + first = dd->piobcnt2k; + else + first = 0; + /* try 4k if all 2k busy, so same last for both sizes */ + last = dd->piobcnt2k + dd->piobcnt4k - 1; + buf = qib_getsendbuf_range(dd, pbufnum, first, last); + } + return buf; +} + +static int init_sdma_6120_regs(struct qib_pportdata *ppd) +{ + return -ENODEV; +} + +static u16 qib_sdma_6120_gethead(struct qib_pportdata *ppd) +{ + return 0; +} + +static int qib_sdma_6120_busy(struct qib_pportdata *ppd) +{ + return 0; +} + +static void qib_sdma_update_6120_tail(struct qib_pportdata *ppd, u16 tail) +{ +} + +static void qib_6120_sdma_sendctrl(struct qib_pportdata *ppd, unsigned op) +{ +} + +static void qib_sdma_set_6120_desc_cnt(struct qib_pportdata *ppd, unsigned cnt) +{ +} + +/* + * the pbc doesn't need a VL15 indicator, but we need it for link_buf. + * The chip ignores the bit if set. + */ +static u32 qib_6120_setpbc_control(struct qib_pportdata *ppd, u32 plen, + u8 srate, u8 vl) +{ + return vl == 15 ? PBC_6120_VL15_SEND_CTRL : 0; +} + +static void qib_6120_initvl15_bufs(struct qib_devdata *dd) +{ +} + +static void qib_6120_init_ctxt(struct qib_ctxtdata *rcd) +{ + rcd->rcvegrcnt = rcd->dd->rcvhdrcnt; + rcd->rcvegr_tid_base = rcd->ctxt * rcd->rcvegrcnt; +} + +static void qib_6120_txchk_change(struct qib_devdata *dd, u32 start, + u32 len, u32 avail, struct qib_ctxtdata *rcd) +{ +} + +static void writescratch(struct qib_devdata *dd, u32 val) +{ + (void) qib_write_kreg(dd, kr_scratch, val); +} + +static int qib_6120_tempsense_rd(struct qib_devdata *dd, int regnum) +{ + return -ENXIO; +} + +#ifdef CONFIG_INFINIBAND_QIB_DCA +static int qib_6120_notify_dca(struct qib_devdata *dd, unsigned long event) +{ + return 0; +} +#endif + +/* Dummy function, as 6120 boards never disable EEPROM Write */ +static int qib_6120_eeprom_wen(struct qib_devdata *dd, int wen) +{ + return 1; +} + +/** + * qib_init_iba6120_funcs - set up the chip-specific function pointers + * @pdev: pci_dev of the qlogic_ib device + * @ent: pci_device_id matching this chip + * + * This is global, and is called directly at init to set up the + * chip-specific function pointers for later use. + * + * It also allocates/partially-inits the qib_devdata struct for + * this device. + */ +struct qib_devdata *qib_init_iba6120_funcs(struct pci_dev *pdev, + const struct pci_device_id *ent) +{ + struct qib_devdata *dd; + int ret; + + dd = qib_alloc_devdata(pdev, sizeof(struct qib_pportdata) + + sizeof(struct qib_chip_specific)); + if (IS_ERR(dd)) + goto bail; + + dd->f_bringup_serdes = qib_6120_bringup_serdes; + dd->f_cleanup = qib_6120_setup_cleanup; + dd->f_clear_tids = qib_6120_clear_tids; + dd->f_free_irq = qib_6120_free_irq; + dd->f_get_base_info = qib_6120_get_base_info; + dd->f_get_msgheader = qib_6120_get_msgheader; + dd->f_getsendbuf = qib_6120_getsendbuf; + dd->f_gpio_mod = gpio_6120_mod; + dd->f_eeprom_wen = qib_6120_eeprom_wen; + dd->f_hdrqempty = qib_6120_hdrqempty; + dd->f_ib_updown = qib_6120_ib_updown; + dd->f_init_ctxt = qib_6120_init_ctxt; + dd->f_initvl15_bufs = qib_6120_initvl15_bufs; + dd->f_intr_fallback = qib_6120_nointr_fallback; + dd->f_late_initreg = qib_late_6120_initreg; + dd->f_setpbc_control = qib_6120_setpbc_control; + dd->f_portcntr = qib_portcntr_6120; + dd->f_put_tid = (dd->minrev >= 2) ? + qib_6120_put_tid_2 : + qib_6120_put_tid; + dd->f_quiet_serdes = qib_6120_quiet_serdes; + dd->f_rcvctrl = rcvctrl_6120_mod; + dd->f_read_cntrs = qib_read_6120cntrs; + dd->f_read_portcntrs = qib_read_6120portcntrs; + dd->f_reset = qib_6120_setup_reset; + dd->f_init_sdma_regs = init_sdma_6120_regs; + dd->f_sdma_busy = qib_sdma_6120_busy; + dd->f_sdma_gethead = qib_sdma_6120_gethead; + dd->f_sdma_sendctrl = qib_6120_sdma_sendctrl; + dd->f_sdma_set_desc_cnt = qib_sdma_set_6120_desc_cnt; + dd->f_sdma_update_tail = qib_sdma_update_6120_tail; + dd->f_sendctrl = sendctrl_6120_mod; + dd->f_set_armlaunch = qib_set_6120_armlaunch; + dd->f_set_cntr_sample = qib_set_cntr_6120_sample; + dd->f_iblink_state = qib_6120_iblink_state; + dd->f_ibphys_portstate = qib_6120_phys_portstate; + dd->f_get_ib_cfg = qib_6120_get_ib_cfg; + dd->f_set_ib_cfg = qib_6120_set_ib_cfg; + dd->f_set_ib_loopback = qib_6120_set_loopback; + dd->f_set_intr_state = qib_6120_set_intr_state; + dd->f_setextled = qib_6120_setup_setextled; + dd->f_txchk_change = qib_6120_txchk_change; + dd->f_update_usrhead = qib_update_6120_usrhead; + dd->f_wantpiobuf_intr = qib_wantpiobuf_6120_intr; + dd->f_xgxs_reset = qib_6120_xgxs_reset; + dd->f_writescratch = writescratch; + dd->f_tempsense_rd = qib_6120_tempsense_rd; +#ifdef CONFIG_INFINIBAND_QIB_DCA + dd->f_notify_dca = qib_6120_notify_dca; +#endif + /* + * Do remaining pcie setup and save pcie values in dd. + * Any error printing is already done by the init code. + * On return, we have the chip mapped and accessible, + * but chip registers are not set up until start of + * init_6120_variables. + */ + ret = qib_pcie_ddinit(dd, pdev, ent); + if (ret < 0) + goto bail_free; + + /* initialize chip-specific variables */ + ret = init_6120_variables(dd); + if (ret) + goto bail_cleanup; + + if (qib_mini_init) + goto bail; + + if (qib_pcie_params(dd, 8, NULL, NULL)) + qib_dev_err(dd, + "Failed to setup PCIe or interrupts; continuing anyway\n"); + dd->cspec->irq = pdev->irq; /* save IRQ */ + + /* clear diagctrl register, in case diags were running and crashed */ + qib_write_kreg(dd, kr_hwdiagctrl, 0); + + if (qib_read_kreg64(dd, kr_hwerrstatus) & + QLOGIC_IB_HWE_SERDESPLLFAILED) + qib_write_kreg(dd, kr_hwerrclear, + QLOGIC_IB_HWE_SERDESPLLFAILED); + + /* setup interrupt handler (interrupt type handled above) */ + qib_setup_6120_interrupt(dd); + /* Note that qpn_mask is set by qib_6120_config_ctxts() first */ + qib_6120_init_hwerrors(dd); + + goto bail; + +bail_cleanup: + qib_pcie_ddcleanup(dd); +bail_free: + qib_free_devdata(dd); + dd = ERR_PTR(ret); +bail: + return dd; +} diff --git a/drivers/infiniband/hw/qib/qib_iba7220.c b/drivers/infiniband/hw/qib/qib_iba7220.c new file mode 100644 index 0000000..7dec89f --- /dev/null +++ b/drivers/infiniband/hw/qib/qib_iba7220.c @@ -0,0 +1,4659 @@ +/* + * Copyright (c) 2006, 2007, 2008, 2009, 2010 QLogic Corporation. + * All rights reserved. + * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +/* + * This file contains all of the code that is specific to the + * QLogic_IB 7220 chip (except that specific to the SerDes) + */ + +#include +#include +#include +#include +#include +#include + +#include "qib.h" +#include "qib_7220.h" + +static void qib_setup_7220_setextled(struct qib_pportdata *, u32); +static void qib_7220_handle_hwerrors(struct qib_devdata *, char *, size_t); +static void sendctrl_7220_mod(struct qib_pportdata *ppd, u32 op); +static u32 qib_7220_iblink_state(u64); +static u8 qib_7220_phys_portstate(u64); +static void qib_sdma_update_7220_tail(struct qib_pportdata *, u16); +static void qib_set_ib_7220_lstate(struct qib_pportdata *, u16, u16); + +/* + * This file contains almost all the chip-specific register information and + * access functions for the QLogic QLogic_IB 7220 PCI-Express chip, with the + * exception of SerDes support, which in in qib_sd7220.c. + */ + +/* Below uses machine-generated qib_chipnum_regs.h file */ +#define KREG_IDX(regname) (QIB_7220_##regname##_OFFS / sizeof(u64)) + +/* Use defines to tie machine-generated names to lower-case names */ +#define kr_control KREG_IDX(Control) +#define kr_counterregbase KREG_IDX(CntrRegBase) +#define kr_errclear KREG_IDX(ErrClear) +#define kr_errmask KREG_IDX(ErrMask) +#define kr_errstatus KREG_IDX(ErrStatus) +#define kr_extctrl KREG_IDX(EXTCtrl) +#define kr_extstatus KREG_IDX(EXTStatus) +#define kr_gpio_clear KREG_IDX(GPIOClear) +#define kr_gpio_mask KREG_IDX(GPIOMask) +#define kr_gpio_out KREG_IDX(GPIOOut) +#define kr_gpio_status KREG_IDX(GPIOStatus) +#define kr_hrtbt_guid KREG_IDX(HRTBT_GUID) +#define kr_hwdiagctrl KREG_IDX(HwDiagCtrl) +#define kr_hwerrclear KREG_IDX(HwErrClear) +#define kr_hwerrmask KREG_IDX(HwErrMask) +#define kr_hwerrstatus KREG_IDX(HwErrStatus) +#define kr_ibcctrl KREG_IDX(IBCCtrl) +#define kr_ibcddrctrl KREG_IDX(IBCDDRCtrl) +#define kr_ibcddrstatus KREG_IDX(IBCDDRStatus) +#define kr_ibcstatus KREG_IDX(IBCStatus) +#define kr_ibserdesctrl KREG_IDX(IBSerDesCtrl) +#define kr_intclear KREG_IDX(IntClear) +#define kr_intmask KREG_IDX(IntMask) +#define kr_intstatus KREG_IDX(IntStatus) +#define kr_ncmodectrl KREG_IDX(IBNCModeCtrl) +#define kr_palign KREG_IDX(PageAlign) +#define kr_partitionkey KREG_IDX(RcvPartitionKey) +#define kr_portcnt KREG_IDX(PortCnt) +#define kr_rcvbthqp KREG_IDX(RcvBTHQP) +#define kr_rcvctrl KREG_IDX(RcvCtrl) +#define kr_rcvegrbase KREG_IDX(RcvEgrBase) +#define kr_rcvegrcnt KREG_IDX(RcvEgrCnt) +#define kr_rcvhdrcnt KREG_IDX(RcvHdrCnt) +#define kr_rcvhdrentsize KREG_IDX(RcvHdrEntSize) +#define kr_rcvhdrsize KREG_IDX(RcvHdrSize) +#define kr_rcvpktledcnt KREG_IDX(RcvPktLEDCnt) +#define kr_rcvtidbase KREG_IDX(RcvTIDBase) +#define kr_rcvtidcnt KREG_IDX(RcvTIDCnt) +#define kr_revision KREG_IDX(Revision) +#define kr_scratch KREG_IDX(Scratch) +#define kr_sendbuffererror KREG_IDX(SendBufErr0) +#define kr_sendctrl KREG_IDX(SendCtrl) +#define kr_senddmabase KREG_IDX(SendDmaBase) +#define kr_senddmabufmask0 KREG_IDX(SendDmaBufMask0) +#define kr_senddmabufmask1 (KREG_IDX(SendDmaBufMask0) + 1) +#define kr_senddmabufmask2 (KREG_IDX(SendDmaBufMask0) + 2) +#define kr_senddmahead KREG_IDX(SendDmaHead) +#define kr_senddmaheadaddr KREG_IDX(SendDmaHeadAddr) +#define kr_senddmalengen KREG_IDX(SendDmaLenGen) +#define kr_senddmastatus KREG_IDX(SendDmaStatus) +#define kr_senddmatail KREG_IDX(SendDmaTail) +#define kr_sendpioavailaddr KREG_IDX(SendBufAvailAddr) +#define kr_sendpiobufbase KREG_IDX(SendBufBase) +#define kr_sendpiobufcnt KREG_IDX(SendBufCnt) +#define kr_sendpiosize KREG_IDX(SendBufSize) +#define kr_sendregbase KREG_IDX(SendRegBase) +#define kr_userregbase KREG_IDX(UserRegBase) +#define kr_xgxs_cfg KREG_IDX(XGXSCfg) + +/* These must only be written via qib_write_kreg_ctxt() */ +#define kr_rcvhdraddr KREG_IDX(RcvHdrAddr0) +#define kr_rcvhdrtailaddr KREG_IDX(RcvHdrTailAddr0) + + +#define CREG_IDX(regname) ((QIB_7220_##regname##_OFFS - \ + QIB_7220_LBIntCnt_OFFS) / sizeof(u64)) + +#define cr_badformat CREG_IDX(RxVersionErrCnt) +#define cr_erricrc CREG_IDX(RxICRCErrCnt) +#define cr_errlink CREG_IDX(RxLinkMalformCnt) +#define cr_errlpcrc CREG_IDX(RxLPCRCErrCnt) +#define cr_errpkey CREG_IDX(RxPKeyMismatchCnt) +#define cr_rcvflowctrl_err CREG_IDX(RxFlowCtrlViolCnt) +#define cr_err_rlen CREG_IDX(RxLenErrCnt) +#define cr_errslen CREG_IDX(TxLenErrCnt) +#define cr_errtidfull CREG_IDX(RxTIDFullErrCnt) +#define cr_errtidvalid CREG_IDX(RxTIDValidErrCnt) +#define cr_errvcrc CREG_IDX(RxVCRCErrCnt) +#define cr_ibstatuschange CREG_IDX(IBStatusChangeCnt) +#define cr_lbint CREG_IDX(LBIntCnt) +#define cr_invalidrlen CREG_IDX(RxMaxMinLenErrCnt) +#define cr_invalidslen CREG_IDX(TxMaxMinLenErrCnt) +#define cr_lbflowstall CREG_IDX(LBFlowStallCnt) +#define cr_pktrcv CREG_IDX(RxDataPktCnt) +#define cr_pktrcvflowctrl CREG_IDX(RxFlowPktCnt) +#define cr_pktsend CREG_IDX(TxDataPktCnt) +#define cr_pktsendflow CREG_IDX(TxFlowPktCnt) +#define cr_portovfl CREG_IDX(RxP0HdrEgrOvflCnt) +#define cr_rcvebp CREG_IDX(RxEBPCnt) +#define cr_rcvovfl CREG_IDX(RxBufOvflCnt) +#define cr_senddropped CREG_IDX(TxDroppedPktCnt) +#define cr_sendstall CREG_IDX(TxFlowStallCnt) +#define cr_sendunderrun CREG_IDX(TxUnderrunCnt) +#define cr_wordrcv CREG_IDX(RxDwordCnt) +#define cr_wordsend CREG_IDX(TxDwordCnt) +#define cr_txunsupvl CREG_IDX(TxUnsupVLErrCnt) +#define cr_rxdroppkt CREG_IDX(RxDroppedPktCnt) +#define cr_iblinkerrrecov CREG_IDX(IBLinkErrRecoveryCnt) +#define cr_iblinkdown CREG_IDX(IBLinkDownedCnt) +#define cr_ibsymbolerr CREG_IDX(IBSymbolErrCnt) +#define cr_vl15droppedpkt CREG_IDX(RxVL15DroppedPktCnt) +#define cr_rxotherlocalphyerr CREG_IDX(RxOtherLocalPhyErrCnt) +#define cr_excessbufferovfl CREG_IDX(ExcessBufferOvflCnt) +#define cr_locallinkintegrityerr CREG_IDX(LocalLinkIntegrityErrCnt) +#define cr_rxvlerr CREG_IDX(RxVlErrCnt) +#define cr_rxdlidfltr CREG_IDX(RxDlidFltrCnt) +#define cr_psstat CREG_IDX(PSStat) +#define cr_psstart CREG_IDX(PSStart) +#define cr_psinterval CREG_IDX(PSInterval) +#define cr_psrcvdatacount CREG_IDX(PSRcvDataCount) +#define cr_psrcvpktscount CREG_IDX(PSRcvPktsCount) +#define cr_psxmitdatacount CREG_IDX(PSXmitDataCount) +#define cr_psxmitpktscount CREG_IDX(PSXmitPktsCount) +#define cr_psxmitwaitcount CREG_IDX(PSXmitWaitCount) +#define cr_txsdmadesc CREG_IDX(TxSDmaDescCnt) +#define cr_pcieretrydiag CREG_IDX(PcieRetryBufDiagQwordCnt) + +#define SYM_RMASK(regname, fldname) ((u64) \ + QIB_7220_##regname##_##fldname##_RMASK) +#define SYM_MASK(regname, fldname) ((u64) \ + QIB_7220_##regname##_##fldname##_RMASK << \ + QIB_7220_##regname##_##fldname##_LSB) +#define SYM_LSB(regname, fldname) (QIB_7220_##regname##_##fldname##_LSB) +#define SYM_FIELD(value, regname, fldname) ((u64) \ + (((value) >> SYM_LSB(regname, fldname)) & \ + SYM_RMASK(regname, fldname))) +#define ERR_MASK(fldname) SYM_MASK(ErrMask, fldname##Mask) +#define HWE_MASK(fldname) SYM_MASK(HwErrMask, fldname##Mask) + +/* ibcctrl bits */ +#define QLOGIC_IB_IBCC_LINKINITCMD_DISABLE 1 +/* cycle through TS1/TS2 till OK */ +#define QLOGIC_IB_IBCC_LINKINITCMD_POLL 2 +/* wait for TS1, then go on */ +#define QLOGIC_IB_IBCC_LINKINITCMD_SLEEP 3 +#define QLOGIC_IB_IBCC_LINKINITCMD_SHIFT 16 + +#define QLOGIC_IB_IBCC_LINKCMD_DOWN 1 /* move to 0x11 */ +#define QLOGIC_IB_IBCC_LINKCMD_ARMED 2 /* move to 0x21 */ +#define QLOGIC_IB_IBCC_LINKCMD_ACTIVE 3 /* move to 0x31 */ + +#define BLOB_7220_IBCHG 0x81 + +/* + * We could have a single register get/put routine, that takes a group type, + * but this is somewhat clearer and cleaner. It also gives us some error + * checking. 64 bit register reads should always work, but are inefficient + * on opteron (the northbridge always generates 2 separate HT 32 bit reads), + * so we use kreg32 wherever possible. User register and counter register + * reads are always 32 bit reads, so only one form of those routines. + */ + +/** + * qib_read_ureg32 - read 32-bit virtualized per-context register + * @dd: device + * @regno: register number + * @ctxt: context number + * + * Return the contents of a register that is virtualized to be per context. + * Returns -1 on errors (not distinguishable from valid contents at + * runtime; we may add a separate error variable at some point). + */ +static inline u32 qib_read_ureg32(const struct qib_devdata *dd, + enum qib_ureg regno, int ctxt) +{ + if (!dd->kregbase || !(dd->flags & QIB_PRESENT)) + return 0; + + if (dd->userbase) + return readl(regno + (u64 __iomem *) + ((char __iomem *)dd->userbase + + dd->ureg_align * ctxt)); + else + return readl(regno + (u64 __iomem *) + (dd->uregbase + + (char __iomem *)dd->kregbase + + dd->ureg_align * ctxt)); +} + +/** + * qib_write_ureg - write 32-bit virtualized per-context register + * @dd: device + * @regno: register number + * @value: value + * @ctxt: context + * + * Write the contents of a register that is virtualized to be per context. + */ +static inline void qib_write_ureg(const struct qib_devdata *dd, + enum qib_ureg regno, u64 value, int ctxt) +{ + u64 __iomem *ubase; + + if (dd->userbase) + ubase = (u64 __iomem *) + ((char __iomem *) dd->userbase + + dd->ureg_align * ctxt); + else + ubase = (u64 __iomem *) + (dd->uregbase + + (char __iomem *) dd->kregbase + + dd->ureg_align * ctxt); + + if (dd->kregbase && (dd->flags & QIB_PRESENT)) + writeq(value, &ubase[regno]); +} + +/** + * qib_write_kreg_ctxt - write a device's per-ctxt 64-bit kernel register + * @dd: the qlogic_ib device + * @regno: the register number to write + * @ctxt: the context containing the register + * @value: the value to write + */ +static inline void qib_write_kreg_ctxt(const struct qib_devdata *dd, + const u16 regno, unsigned ctxt, + u64 value) +{ + qib_write_kreg(dd, regno + ctxt, value); +} + +static inline void write_7220_creg(const struct qib_devdata *dd, + u16 regno, u64 value) +{ + if (dd->cspec->cregbase && (dd->flags & QIB_PRESENT)) + writeq(value, &dd->cspec->cregbase[regno]); +} + +static inline u64 read_7220_creg(const struct qib_devdata *dd, u16 regno) +{ + if (!dd->cspec->cregbase || !(dd->flags & QIB_PRESENT)) + return 0; + return readq(&dd->cspec->cregbase[regno]); +} + +static inline u32 read_7220_creg32(const struct qib_devdata *dd, u16 regno) +{ + if (!dd->cspec->cregbase || !(dd->flags & QIB_PRESENT)) + return 0; + return readl(&dd->cspec->cregbase[regno]); +} + +/* kr_revision bits */ +#define QLOGIC_IB_R_EMULATORREV_MASK ((1ULL << 22) - 1) +#define QLOGIC_IB_R_EMULATORREV_SHIFT 40 + +/* kr_control bits */ +#define QLOGIC_IB_C_RESET (1U << 7) + +/* kr_intstatus, kr_intclear, kr_intmask bits */ +#define QLOGIC_IB_I_RCVURG_MASK ((1ULL << 17) - 1) +#define QLOGIC_IB_I_RCVURG_SHIFT 32 +#define QLOGIC_IB_I_RCVAVAIL_MASK ((1ULL << 17) - 1) +#define QLOGIC_IB_I_RCVAVAIL_SHIFT 0 +#define QLOGIC_IB_I_SERDESTRIMDONE (1ULL << 27) + +#define QLOGIC_IB_C_FREEZEMODE 0x00000002 +#define QLOGIC_IB_C_LINKENABLE 0x00000004 + +#define QLOGIC_IB_I_SDMAINT 0x8000000000000000ULL +#define QLOGIC_IB_I_SDMADISABLED 0x4000000000000000ULL +#define QLOGIC_IB_I_ERROR 0x0000000080000000ULL +#define QLOGIC_IB_I_SPIOSENT 0x0000000040000000ULL +#define QLOGIC_IB_I_SPIOBUFAVAIL 0x0000000020000000ULL +#define QLOGIC_IB_I_GPIO 0x0000000010000000ULL + +/* variables for sanity checking interrupt and errors */ +#define QLOGIC_IB_I_BITSEXTANT \ + (QLOGIC_IB_I_SDMAINT | QLOGIC_IB_I_SDMADISABLED | \ + (QLOGIC_IB_I_RCVURG_MASK << QLOGIC_IB_I_RCVURG_SHIFT) | \ + (QLOGIC_IB_I_RCVAVAIL_MASK << \ + QLOGIC_IB_I_RCVAVAIL_SHIFT) | \ + QLOGIC_IB_I_ERROR | QLOGIC_IB_I_SPIOSENT | \ + QLOGIC_IB_I_SPIOBUFAVAIL | QLOGIC_IB_I_GPIO | \ + QLOGIC_IB_I_SERDESTRIMDONE) + +#define IB_HWE_BITSEXTANT \ + (HWE_MASK(RXEMemParityErr) | \ + HWE_MASK(TXEMemParityErr) | \ + (QLOGIC_IB_HWE_PCIEMEMPARITYERR_MASK << \ + QLOGIC_IB_HWE_PCIEMEMPARITYERR_SHIFT) | \ + QLOGIC_IB_HWE_PCIE1PLLFAILED | \ + QLOGIC_IB_HWE_PCIE0PLLFAILED | \ + QLOGIC_IB_HWE_PCIEPOISONEDTLP | \ + QLOGIC_IB_HWE_PCIECPLTIMEOUT | \ + QLOGIC_IB_HWE_PCIEBUSPARITYXTLH | \ + QLOGIC_IB_HWE_PCIEBUSPARITYXADM | \ + QLOGIC_IB_HWE_PCIEBUSPARITYRADM | \ + HWE_MASK(PowerOnBISTFailed) | \ + QLOGIC_IB_HWE_COREPLL_FBSLIP | \ + QLOGIC_IB_HWE_COREPLL_RFSLIP | \ + QLOGIC_IB_HWE_SERDESPLLFAILED | \ + HWE_MASK(IBCBusToSPCParityErr) | \ + HWE_MASK(IBCBusFromSPCParityErr) | \ + QLOGIC_IB_HWE_PCIECPLDATAQUEUEERR | \ + QLOGIC_IB_HWE_PCIECPLHDRQUEUEERR | \ + QLOGIC_IB_HWE_SDMAMEMREADERR | \ + QLOGIC_IB_HWE_CLK_UC_PLLNOTLOCKED | \ + QLOGIC_IB_HWE_PCIESERDESQ0PCLKNOTDETECT | \ + QLOGIC_IB_HWE_PCIESERDESQ1PCLKNOTDETECT | \ + QLOGIC_IB_HWE_PCIESERDESQ2PCLKNOTDETECT | \ + QLOGIC_IB_HWE_PCIESERDESQ3PCLKNOTDETECT | \ + QLOGIC_IB_HWE_DDSRXEQMEMORYPARITYERR | \ + QLOGIC_IB_HWE_IB_UC_MEMORYPARITYERR | \ + QLOGIC_IB_HWE_PCIE_UC_OCT0MEMORYPARITYERR | \ + QLOGIC_IB_HWE_PCIE_UC_OCT1MEMORYPARITYERR) + +#define IB_E_BITSEXTANT \ + (ERR_MASK(RcvFormatErr) | ERR_MASK(RcvVCRCErr) | \ + ERR_MASK(RcvICRCErr) | ERR_MASK(RcvMinPktLenErr) | \ + ERR_MASK(RcvMaxPktLenErr) | ERR_MASK(RcvLongPktLenErr) | \ + ERR_MASK(RcvShortPktLenErr) | ERR_MASK(RcvUnexpectedCharErr) | \ + ERR_MASK(RcvUnsupportedVLErr) | ERR_MASK(RcvEBPErr) | \ + ERR_MASK(RcvIBFlowErr) | ERR_MASK(RcvBadVersionErr) | \ + ERR_MASK(RcvEgrFullErr) | ERR_MASK(RcvHdrFullErr) | \ + ERR_MASK(RcvBadTidErr) | ERR_MASK(RcvHdrLenErr) | \ + ERR_MASK(RcvHdrErr) | ERR_MASK(RcvIBLostLinkErr) | \ + ERR_MASK(SendSpecialTriggerErr) | \ + ERR_MASK(SDmaDisabledErr) | ERR_MASK(SendMinPktLenErr) | \ + ERR_MASK(SendMaxPktLenErr) | ERR_MASK(SendUnderRunErr) | \ + ERR_MASK(SendPktLenErr) | ERR_MASK(SendDroppedSmpPktErr) | \ + ERR_MASK(SendDroppedDataPktErr) | \ + ERR_MASK(SendPioArmLaunchErr) | \ + ERR_MASK(SendUnexpectedPktNumErr) | \ + ERR_MASK(SendUnsupportedVLErr) | ERR_MASK(SendBufMisuseErr) | \ + ERR_MASK(SDmaGenMismatchErr) | ERR_MASK(SDmaOutOfBoundErr) | \ + ERR_MASK(SDmaTailOutOfBoundErr) | ERR_MASK(SDmaBaseErr) | \ + ERR_MASK(SDma1stDescErr) | ERR_MASK(SDmaRpyTagErr) | \ + ERR_MASK(SDmaDwEnErr) | ERR_MASK(SDmaMissingDwErr) | \ + ERR_MASK(SDmaUnexpDataErr) | \ + ERR_MASK(IBStatusChanged) | ERR_MASK(InvalidAddrErr) | \ + ERR_MASK(ResetNegated) | ERR_MASK(HardwareErr) | \ + ERR_MASK(SDmaDescAddrMisalignErr) | \ + ERR_MASK(InvalidEEPCmd)) + +/* kr_hwerrclear, kr_hwerrmask, kr_hwerrstatus, bits */ +#define QLOGIC_IB_HWE_PCIEMEMPARITYERR_MASK 0x00000000000000ffULL +#define QLOGIC_IB_HWE_PCIEMEMPARITYERR_SHIFT 0 +#define QLOGIC_IB_HWE_PCIEPOISONEDTLP 0x0000000010000000ULL +#define QLOGIC_IB_HWE_PCIECPLTIMEOUT 0x0000000020000000ULL +#define QLOGIC_IB_HWE_PCIEBUSPARITYXTLH 0x0000000040000000ULL +#define QLOGIC_IB_HWE_PCIEBUSPARITYXADM 0x0000000080000000ULL +#define QLOGIC_IB_HWE_PCIEBUSPARITYRADM 0x0000000100000000ULL +#define QLOGIC_IB_HWE_COREPLL_FBSLIP 0x0080000000000000ULL +#define QLOGIC_IB_HWE_COREPLL_RFSLIP 0x0100000000000000ULL +#define QLOGIC_IB_HWE_PCIE1PLLFAILED 0x0400000000000000ULL +#define QLOGIC_IB_HWE_PCIE0PLLFAILED 0x0800000000000000ULL +#define QLOGIC_IB_HWE_SERDESPLLFAILED 0x1000000000000000ULL +/* specific to this chip */ +#define QLOGIC_IB_HWE_PCIECPLDATAQUEUEERR 0x0000000000000040ULL +#define QLOGIC_IB_HWE_PCIECPLHDRQUEUEERR 0x0000000000000080ULL +#define QLOGIC_IB_HWE_SDMAMEMREADERR 0x0000000010000000ULL +#define QLOGIC_IB_HWE_CLK_UC_PLLNOTLOCKED 0x2000000000000000ULL +#define QLOGIC_IB_HWE_PCIESERDESQ0PCLKNOTDETECT 0x0100000000000000ULL +#define QLOGIC_IB_HWE_PCIESERDESQ1PCLKNOTDETECT 0x0200000000000000ULL +#define QLOGIC_IB_HWE_PCIESERDESQ2PCLKNOTDETECT 0x0400000000000000ULL +#define QLOGIC_IB_HWE_PCIESERDESQ3PCLKNOTDETECT 0x0800000000000000ULL +#define QLOGIC_IB_HWE_DDSRXEQMEMORYPARITYERR 0x0000008000000000ULL +#define QLOGIC_IB_HWE_IB_UC_MEMORYPARITYERR 0x0000004000000000ULL +#define QLOGIC_IB_HWE_PCIE_UC_OCT0MEMORYPARITYERR 0x0000001000000000ULL +#define QLOGIC_IB_HWE_PCIE_UC_OCT1MEMORYPARITYERR 0x0000002000000000ULL + +#define IBA7220_IBCC_LINKCMD_SHIFT 19 + +/* kr_ibcddrctrl bits */ +#define IBA7220_IBC_DLIDLMC_MASK 0xFFFFFFFFUL +#define IBA7220_IBC_DLIDLMC_SHIFT 32 + +#define IBA7220_IBC_HRTBT_MASK (SYM_RMASK(IBCDDRCtrl, HRTBT_AUTO) | \ + SYM_RMASK(IBCDDRCtrl, HRTBT_ENB)) +#define IBA7220_IBC_HRTBT_SHIFT SYM_LSB(IBCDDRCtrl, HRTBT_ENB) + +#define IBA7220_IBC_LANE_REV_SUPPORTED (1<<8) +#define IBA7220_IBC_LREV_MASK 1 +#define IBA7220_IBC_LREV_SHIFT 8 +#define IBA7220_IBC_RXPOL_MASK 1 +#define IBA7220_IBC_RXPOL_SHIFT 7 +#define IBA7220_IBC_WIDTH_SHIFT 5 +#define IBA7220_IBC_WIDTH_MASK 0x3 +#define IBA7220_IBC_WIDTH_1X_ONLY (0 << IBA7220_IBC_WIDTH_SHIFT) +#define IBA7220_IBC_WIDTH_4X_ONLY (1 << IBA7220_IBC_WIDTH_SHIFT) +#define IBA7220_IBC_WIDTH_AUTONEG (2 << IBA7220_IBC_WIDTH_SHIFT) +#define IBA7220_IBC_SPEED_AUTONEG (1 << 1) +#define IBA7220_IBC_SPEED_SDR (1 << 2) +#define IBA7220_IBC_SPEED_DDR (1 << 3) +#define IBA7220_IBC_SPEED_AUTONEG_MASK (0x7 << 1) +#define IBA7220_IBC_IBTA_1_2_MASK (1) + +/* kr_ibcddrstatus */ +/* link latency shift is 0, don't bother defining */ +#define IBA7220_DDRSTAT_LINKLAT_MASK 0x3ffffff + +/* kr_extstatus bits */ +#define QLOGIC_IB_EXTS_FREQSEL 0x2 +#define QLOGIC_IB_EXTS_SERDESSEL 0x4 +#define QLOGIC_IB_EXTS_MEMBIST_ENDTEST 0x0000000000004000 +#define QLOGIC_IB_EXTS_MEMBIST_DISABLED 0x0000000000008000 + +/* kr_xgxsconfig bits */ +#define QLOGIC_IB_XGXS_RESET 0x5ULL +#define QLOGIC_IB_XGXS_FC_SAFE (1ULL << 63) + +/* kr_rcvpktledcnt */ +#define IBA7220_LEDBLINK_ON_SHIFT 32 /* 4ns period on after packet */ +#define IBA7220_LEDBLINK_OFF_SHIFT 0 /* 4ns period off before next on */ + +#define _QIB_GPIO_SDA_NUM 1 +#define _QIB_GPIO_SCL_NUM 0 +#define QIB_TWSI_EEPROM_DEV 0xA2 /* All Production 7220 cards. */ +#define QIB_TWSI_TEMP_DEV 0x98 + +/* HW counter clock is at 4nsec */ +#define QIB_7220_PSXMITWAIT_CHECK_RATE 4000 + +#define IBA7220_R_INTRAVAIL_SHIFT 17 +#define IBA7220_R_PKEY_DIS_SHIFT 34 +#define IBA7220_R_TAILUPD_SHIFT 35 +#define IBA7220_R_CTXTCFG_SHIFT 36 + +#define IBA7220_HDRHEAD_PKTINT_SHIFT 32 /* interrupt cnt in upper 32 bits */ + +/* + * the size bits give us 2^N, in KB units. 0 marks as invalid, + * and 7 is reserved. We currently use only 2KB and 4KB + */ +#define IBA7220_TID_SZ_SHIFT 37 /* shift to 3bit size selector */ +#define IBA7220_TID_SZ_2K (1UL << IBA7220_TID_SZ_SHIFT) /* 2KB */ +#define IBA7220_TID_SZ_4K (2UL << IBA7220_TID_SZ_SHIFT) /* 4KB */ +#define IBA7220_TID_PA_SHIFT 11U /* TID addr in chip stored w/o low bits */ +#define PBC_7220_VL15_SEND (1ULL << 63) /* pbc; VL15, no credit check */ +#define PBC_7220_VL15_SEND_CTRL (1ULL << 31) /* control version of same */ + +#define AUTONEG_TRIES 5 /* sequential retries to negotiate DDR */ + +/* packet rate matching delay multiplier */ +static u8 rate_to_delay[2][2] = { + /* 1x, 4x */ + { 8, 2 }, /* SDR */ + { 4, 1 } /* DDR */ +}; + +static u8 ib_rate_to_delay[IB_RATE_120_GBPS + 1] = { + [IB_RATE_2_5_GBPS] = 8, + [IB_RATE_5_GBPS] = 4, + [IB_RATE_10_GBPS] = 2, + [IB_RATE_20_GBPS] = 1 +}; + +#define IBA7220_LINKSPEED_SHIFT SYM_LSB(IBCStatus, LinkSpeedActive) +#define IBA7220_LINKWIDTH_SHIFT SYM_LSB(IBCStatus, LinkWidthActive) + +/* link training states, from IBC */ +#define IB_7220_LT_STATE_DISABLED 0x00 +#define IB_7220_LT_STATE_LINKUP 0x01 +#define IB_7220_LT_STATE_POLLACTIVE 0x02 +#define IB_7220_LT_STATE_POLLQUIET 0x03 +#define IB_7220_LT_STATE_SLEEPDELAY 0x04 +#define IB_7220_LT_STATE_SLEEPQUIET 0x05 +#define IB_7220_LT_STATE_CFGDEBOUNCE 0x08 +#define IB_7220_LT_STATE_CFGRCVFCFG 0x09 +#define IB_7220_LT_STATE_CFGWAITRMT 0x0a +#define IB_7220_LT_STATE_CFGIDLE 0x0b +#define IB_7220_LT_STATE_RECOVERRETRAIN 0x0c +#define IB_7220_LT_STATE_RECOVERWAITRMT 0x0e +#define IB_7220_LT_STATE_RECOVERIDLE 0x0f + +/* link state machine states from IBC */ +#define IB_7220_L_STATE_DOWN 0x0 +#define IB_7220_L_STATE_INIT 0x1 +#define IB_7220_L_STATE_ARM 0x2 +#define IB_7220_L_STATE_ACTIVE 0x3 +#define IB_7220_L_STATE_ACT_DEFER 0x4 + +static const u8 qib_7220_physportstate[0x20] = { + [IB_7220_LT_STATE_DISABLED] = IB_PHYSPORTSTATE_DISABLED, + [IB_7220_LT_STATE_LINKUP] = IB_PHYSPORTSTATE_LINKUP, + [IB_7220_LT_STATE_POLLACTIVE] = IB_PHYSPORTSTATE_POLL, + [IB_7220_LT_STATE_POLLQUIET] = IB_PHYSPORTSTATE_POLL, + [IB_7220_LT_STATE_SLEEPDELAY] = IB_PHYSPORTSTATE_SLEEP, + [IB_7220_LT_STATE_SLEEPQUIET] = IB_PHYSPORTSTATE_SLEEP, + [IB_7220_LT_STATE_CFGDEBOUNCE] = + IB_PHYSPORTSTATE_CFG_TRAIN, + [IB_7220_LT_STATE_CFGRCVFCFG] = + IB_PHYSPORTSTATE_CFG_TRAIN, + [IB_7220_LT_STATE_CFGWAITRMT] = + IB_PHYSPORTSTATE_CFG_TRAIN, + [IB_7220_LT_STATE_CFGIDLE] = IB_PHYSPORTSTATE_CFG_TRAIN, + [IB_7220_LT_STATE_RECOVERRETRAIN] = + IB_PHYSPORTSTATE_LINK_ERR_RECOVER, + [IB_7220_LT_STATE_RECOVERWAITRMT] = + IB_PHYSPORTSTATE_LINK_ERR_RECOVER, + [IB_7220_LT_STATE_RECOVERIDLE] = + IB_PHYSPORTSTATE_LINK_ERR_RECOVER, + [0x10] = IB_PHYSPORTSTATE_CFG_TRAIN, + [0x11] = IB_PHYSPORTSTATE_CFG_TRAIN, + [0x12] = IB_PHYSPORTSTATE_CFG_TRAIN, + [0x13] = IB_PHYSPORTSTATE_CFG_TRAIN, + [0x14] = IB_PHYSPORTSTATE_CFG_TRAIN, + [0x15] = IB_PHYSPORTSTATE_CFG_TRAIN, + [0x16] = IB_PHYSPORTSTATE_CFG_TRAIN, + [0x17] = IB_PHYSPORTSTATE_CFG_TRAIN +}; + +int qib_special_trigger; +module_param_named(special_trigger, qib_special_trigger, int, S_IRUGO); +MODULE_PARM_DESC(special_trigger, "Enable SpecialTrigger arm/launch"); + +#define IBCBUSFRSPCPARITYERR HWE_MASK(IBCBusFromSPCParityErr) +#define IBCBUSTOSPCPARITYERR HWE_MASK(IBCBusToSPCParityErr) + +#define SYM_MASK_BIT(regname, fldname, bit) ((u64) \ + (1ULL << (SYM_LSB(regname, fldname) + (bit)))) + +#define TXEMEMPARITYERR_PIOBUF \ + SYM_MASK_BIT(HwErrMask, TXEMemParityErrMask, 0) +#define TXEMEMPARITYERR_PIOPBC \ + SYM_MASK_BIT(HwErrMask, TXEMemParityErrMask, 1) +#define TXEMEMPARITYERR_PIOLAUNCHFIFO \ + SYM_MASK_BIT(HwErrMask, TXEMemParityErrMask, 2) + +#define RXEMEMPARITYERR_RCVBUF \ + SYM_MASK_BIT(HwErrMask, RXEMemParityErrMask, 0) +#define RXEMEMPARITYERR_LOOKUPQ \ + SYM_MASK_BIT(HwErrMask, RXEMemParityErrMask, 1) +#define RXEMEMPARITYERR_EXPTID \ + SYM_MASK_BIT(HwErrMask, RXEMemParityErrMask, 2) +#define RXEMEMPARITYERR_EAGERTID \ + SYM_MASK_BIT(HwErrMask, RXEMemParityErrMask, 3) +#define RXEMEMPARITYERR_FLAGBUF \ + SYM_MASK_BIT(HwErrMask, RXEMemParityErrMask, 4) +#define RXEMEMPARITYERR_DATAINFO \ + SYM_MASK_BIT(HwErrMask, RXEMemParityErrMask, 5) +#define RXEMEMPARITYERR_HDRINFO \ + SYM_MASK_BIT(HwErrMask, RXEMemParityErrMask, 6) + +/* 7220 specific hardware errors... */ +static const struct qib_hwerror_msgs qib_7220_hwerror_msgs[] = { + /* generic hardware errors */ + QLOGIC_IB_HWE_MSG(IBCBUSFRSPCPARITYERR, "QIB2IB Parity"), + QLOGIC_IB_HWE_MSG(IBCBUSTOSPCPARITYERR, "IB2QIB Parity"), + + QLOGIC_IB_HWE_MSG(TXEMEMPARITYERR_PIOBUF, + "TXE PIOBUF Memory Parity"), + QLOGIC_IB_HWE_MSG(TXEMEMPARITYERR_PIOPBC, + "TXE PIOPBC Memory Parity"), + QLOGIC_IB_HWE_MSG(TXEMEMPARITYERR_PIOLAUNCHFIFO, + "TXE PIOLAUNCHFIFO Memory Parity"), + + QLOGIC_IB_HWE_MSG(RXEMEMPARITYERR_RCVBUF, + "RXE RCVBUF Memory Parity"), + QLOGIC_IB_HWE_MSG(RXEMEMPARITYERR_LOOKUPQ, + "RXE LOOKUPQ Memory Parity"), + QLOGIC_IB_HWE_MSG(RXEMEMPARITYERR_EAGERTID, + "RXE EAGERTID Memory Parity"), + QLOGIC_IB_HWE_MSG(RXEMEMPARITYERR_EXPTID, + "RXE EXPTID Memory Parity"), + QLOGIC_IB_HWE_MSG(RXEMEMPARITYERR_FLAGBUF, + "RXE FLAGBUF Memory Parity"), + QLOGIC_IB_HWE_MSG(RXEMEMPARITYERR_DATAINFO, + "RXE DATAINFO Memory Parity"), + QLOGIC_IB_HWE_MSG(RXEMEMPARITYERR_HDRINFO, + "RXE HDRINFO Memory Parity"), + + /* chip-specific hardware errors */ + QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_PCIEPOISONEDTLP, + "PCIe Poisoned TLP"), + QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_PCIECPLTIMEOUT, + "PCIe completion timeout"), + /* + * In practice, it's unlikely wthat we'll see PCIe PLL, or bus + * parity or memory parity error failures, because most likely we + * won't be able to talk to the core of the chip. Nonetheless, we + * might see them, if they are in parts of the PCIe core that aren't + * essential. + */ + QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_PCIE1PLLFAILED, + "PCIePLL1"), + QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_PCIE0PLLFAILED, + "PCIePLL0"), + QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_PCIEBUSPARITYXTLH, + "PCIe XTLH core parity"), + QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_PCIEBUSPARITYXADM, + "PCIe ADM TX core parity"), + QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_PCIEBUSPARITYRADM, + "PCIe ADM RX core parity"), + QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_SERDESPLLFAILED, + "SerDes PLL"), + QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_PCIECPLDATAQUEUEERR, + "PCIe cpl header queue"), + QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_PCIECPLHDRQUEUEERR, + "PCIe cpl data queue"), + QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_SDMAMEMREADERR, + "Send DMA memory read"), + QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_CLK_UC_PLLNOTLOCKED, + "uC PLL clock not locked"), + QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_PCIESERDESQ0PCLKNOTDETECT, + "PCIe serdes Q0 no clock"), + QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_PCIESERDESQ1PCLKNOTDETECT, + "PCIe serdes Q1 no clock"), + QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_PCIESERDESQ2PCLKNOTDETECT, + "PCIe serdes Q2 no clock"), + QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_PCIESERDESQ3PCLKNOTDETECT, + "PCIe serdes Q3 no clock"), + QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_DDSRXEQMEMORYPARITYERR, + "DDS RXEQ memory parity"), + QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_IB_UC_MEMORYPARITYERR, + "IB uC memory parity"), + QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_PCIE_UC_OCT0MEMORYPARITYERR, + "PCIe uC oct0 memory parity"), + QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_PCIE_UC_OCT1MEMORYPARITYERR, + "PCIe uC oct1 memory parity"), +}; + +#define RXE_PARITY (RXEMEMPARITYERR_EAGERTID|RXEMEMPARITYERR_EXPTID) + +#define QLOGIC_IB_E_PKTERRS (\ + ERR_MASK(SendPktLenErr) | \ + ERR_MASK(SendDroppedDataPktErr) | \ + ERR_MASK(RcvVCRCErr) | \ + ERR_MASK(RcvICRCErr) | \ + ERR_MASK(RcvShortPktLenErr) | \ + ERR_MASK(RcvEBPErr)) + +/* Convenience for decoding Send DMA errors */ +#define QLOGIC_IB_E_SDMAERRS ( \ + ERR_MASK(SDmaGenMismatchErr) | \ + ERR_MASK(SDmaOutOfBoundErr) | \ + ERR_MASK(SDmaTailOutOfBoundErr) | ERR_MASK(SDmaBaseErr) | \ + ERR_MASK(SDma1stDescErr) | ERR_MASK(SDmaRpyTagErr) | \ + ERR_MASK(SDmaDwEnErr) | ERR_MASK(SDmaMissingDwErr) | \ + ERR_MASK(SDmaUnexpDataErr) | \ + ERR_MASK(SDmaDescAddrMisalignErr) | \ + ERR_MASK(SDmaDisabledErr) | \ + ERR_MASK(SendBufMisuseErr)) + +/* These are all rcv-related errors which we want to count for stats */ +#define E_SUM_PKTERRS \ + (ERR_MASK(RcvHdrLenErr) | ERR_MASK(RcvBadTidErr) | \ + ERR_MASK(RcvBadVersionErr) | ERR_MASK(RcvHdrErr) | \ + ERR_MASK(RcvLongPktLenErr) | ERR_MASK(RcvShortPktLenErr) | \ + ERR_MASK(RcvMaxPktLenErr) | ERR_MASK(RcvMinPktLenErr) | \ + ERR_MASK(RcvFormatErr) | ERR_MASK(RcvUnsupportedVLErr) | \ + ERR_MASK(RcvUnexpectedCharErr) | ERR_MASK(RcvEBPErr)) + +/* These are all send-related errors which we want to count for stats */ +#define E_SUM_ERRS \ + (ERR_MASK(SendPioArmLaunchErr) | ERR_MASK(SendUnexpectedPktNumErr) | \ + ERR_MASK(SendDroppedDataPktErr) | ERR_MASK(SendDroppedSmpPktErr) | \ + ERR_MASK(SendMaxPktLenErr) | ERR_MASK(SendUnsupportedVLErr) | \ + ERR_MASK(SendMinPktLenErr) | ERR_MASK(SendPktLenErr) | \ + ERR_MASK(InvalidAddrErr)) + +/* + * this is similar to E_SUM_ERRS, but can't ignore armlaunch, don't ignore + * errors not related to freeze and cancelling buffers. Can't ignore + * armlaunch because could get more while still cleaning up, and need + * to cancel those as they happen. + */ +#define E_SPKT_ERRS_IGNORE \ + (ERR_MASK(SendDroppedDataPktErr) | ERR_MASK(SendDroppedSmpPktErr) | \ + ERR_MASK(SendMaxPktLenErr) | ERR_MASK(SendMinPktLenErr) | \ + ERR_MASK(SendPktLenErr)) + +/* + * these are errors that can occur when the link changes state while + * a packet is being sent or received. This doesn't cover things + * like EBP or VCRC that can be the result of a sending having the + * link change state, so we receive a "known bad" packet. + */ +#define E_SUM_LINK_PKTERRS \ + (ERR_MASK(SendDroppedDataPktErr) | ERR_MASK(SendDroppedSmpPktErr) | \ + ERR_MASK(SendMinPktLenErr) | ERR_MASK(SendPktLenErr) | \ + ERR_MASK(RcvShortPktLenErr) | ERR_MASK(RcvMinPktLenErr) | \ + ERR_MASK(RcvUnexpectedCharErr)) + +static void autoneg_7220_work(struct work_struct *); +static u32 __iomem *qib_7220_getsendbuf(struct qib_pportdata *, u64, u32 *); + +/* + * Called when we might have an error that is specific to a particular + * PIO buffer, and may need to cancel that buffer, so it can be re-used. + * because we don't need to force the update of pioavail. + */ +static void qib_disarm_7220_senderrbufs(struct qib_pportdata *ppd) +{ + unsigned long sbuf[3]; + struct qib_devdata *dd = ppd->dd; + + /* + * It's possible that sendbuffererror could have bits set; might + * have already done this as a result of hardware error handling. + */ + /* read these before writing errorclear */ + sbuf[0] = qib_read_kreg64(dd, kr_sendbuffererror); + sbuf[1] = qib_read_kreg64(dd, kr_sendbuffererror + 1); + sbuf[2] = qib_read_kreg64(dd, kr_sendbuffererror + 2); + + if (sbuf[0] || sbuf[1] || sbuf[2]) + qib_disarm_piobufs_set(dd, sbuf, + dd->piobcnt2k + dd->piobcnt4k); +} + +static void qib_7220_txe_recover(struct qib_devdata *dd) +{ + qib_devinfo(dd->pcidev, "Recovering from TXE PIO parity error\n"); + qib_disarm_7220_senderrbufs(dd->pport); +} + +/* + * This is called with interrupts disabled and sdma_lock held. + */ +static void qib_7220_sdma_sendctrl(struct qib_pportdata *ppd, unsigned op) +{ + struct qib_devdata *dd = ppd->dd; + u64 set_sendctrl = 0; + u64 clr_sendctrl = 0; + + if (op & QIB_SDMA_SENDCTRL_OP_ENABLE) + set_sendctrl |= SYM_MASK(SendCtrl, SDmaEnable); + else + clr_sendctrl |= SYM_MASK(SendCtrl, SDmaEnable); + + if (op & QIB_SDMA_SENDCTRL_OP_INTENABLE) + set_sendctrl |= SYM_MASK(SendCtrl, SDmaIntEnable); + else + clr_sendctrl |= SYM_MASK(SendCtrl, SDmaIntEnable); + + if (op & QIB_SDMA_SENDCTRL_OP_HALT) + set_sendctrl |= SYM_MASK(SendCtrl, SDmaHalt); + else + clr_sendctrl |= SYM_MASK(SendCtrl, SDmaHalt); + + spin_lock(&dd->sendctrl_lock); + + dd->sendctrl |= set_sendctrl; + dd->sendctrl &= ~clr_sendctrl; + + qib_write_kreg(dd, kr_sendctrl, dd->sendctrl); + qib_write_kreg(dd, kr_scratch, 0); + + spin_unlock(&dd->sendctrl_lock); +} + +static void qib_decode_7220_sdma_errs(struct qib_pportdata *ppd, + u64 err, char *buf, size_t blen) +{ + static const struct { + u64 err; + const char *msg; + } errs[] = { + { ERR_MASK(SDmaGenMismatchErr), + "SDmaGenMismatch" }, + { ERR_MASK(SDmaOutOfBoundErr), + "SDmaOutOfBound" }, + { ERR_MASK(SDmaTailOutOfBoundErr), + "SDmaTailOutOfBound" }, + { ERR_MASK(SDmaBaseErr), + "SDmaBase" }, + { ERR_MASK(SDma1stDescErr), + "SDma1stDesc" }, + { ERR_MASK(SDmaRpyTagErr), + "SDmaRpyTag" }, + { ERR_MASK(SDmaDwEnErr), + "SDmaDwEn" }, + { ERR_MASK(SDmaMissingDwErr), + "SDmaMissingDw" }, + { ERR_MASK(SDmaUnexpDataErr), + "SDmaUnexpData" }, + { ERR_MASK(SDmaDescAddrMisalignErr), + "SDmaDescAddrMisalign" }, + { ERR_MASK(SendBufMisuseErr), + "SendBufMisuse" }, + { ERR_MASK(SDmaDisabledErr), + "SDmaDisabled" }, + }; + int i; + size_t bidx = 0; + + for (i = 0; i < ARRAY_SIZE(errs); i++) { + if (err & errs[i].err) + bidx += scnprintf(buf + bidx, blen - bidx, + "%s ", errs[i].msg); + } +} + +/* + * This is called as part of link down clean up so disarm and flush + * all send buffers so that SMP packets can be sent. + */ +static void qib_7220_sdma_hw_clean_up(struct qib_pportdata *ppd) +{ + /* This will trigger the Abort interrupt */ + sendctrl_7220_mod(ppd, QIB_SENDCTRL_DISARM_ALL | QIB_SENDCTRL_FLUSH | + QIB_SENDCTRL_AVAIL_BLIP); + ppd->dd->upd_pio_shadow = 1; /* update our idea of what's busy */ +} + +static void qib_sdma_7220_setlengen(struct qib_pportdata *ppd) +{ + /* + * Set SendDmaLenGen and clear and set + * the MSB of the generation count to enable generation checking + * and load the internal generation counter. + */ + qib_write_kreg(ppd->dd, kr_senddmalengen, ppd->sdma_descq_cnt); + qib_write_kreg(ppd->dd, kr_senddmalengen, + ppd->sdma_descq_cnt | + (1ULL << QIB_7220_SendDmaLenGen_Generation_MSB)); +} + +static void qib_7220_sdma_hw_start_up(struct qib_pportdata *ppd) +{ + qib_sdma_7220_setlengen(ppd); + qib_sdma_update_7220_tail(ppd, 0); /* Set SendDmaTail */ + ppd->sdma_head_dma[0] = 0; +} + +#define DISABLES_SDMA ( \ + ERR_MASK(SDmaDisabledErr) | \ + ERR_MASK(SDmaBaseErr) | \ + ERR_MASK(SDmaTailOutOfBoundErr) | \ + ERR_MASK(SDmaOutOfBoundErr) | \ + ERR_MASK(SDma1stDescErr) | \ + ERR_MASK(SDmaRpyTagErr) | \ + ERR_MASK(SDmaGenMismatchErr) | \ + ERR_MASK(SDmaDescAddrMisalignErr) | \ + ERR_MASK(SDmaMissingDwErr) | \ + ERR_MASK(SDmaDwEnErr)) + +static void sdma_7220_errors(struct qib_pportdata *ppd, u64 errs) +{ + unsigned long flags; + struct qib_devdata *dd = ppd->dd; + char *msg; + + errs &= QLOGIC_IB_E_SDMAERRS; + + msg = dd->cspec->sdmamsgbuf; + qib_decode_7220_sdma_errs(ppd, errs, msg, sizeof dd->cspec->sdmamsgbuf); + spin_lock_irqsave(&ppd->sdma_lock, flags); + + if (errs & ERR_MASK(SendBufMisuseErr)) { + unsigned long sbuf[3]; + + sbuf[0] = qib_read_kreg64(dd, kr_sendbuffererror); + sbuf[1] = qib_read_kreg64(dd, kr_sendbuffererror + 1); + sbuf[2] = qib_read_kreg64(dd, kr_sendbuffererror + 2); + + qib_dev_err(ppd->dd, + "IB%u:%u SendBufMisuse: %04lx %016lx %016lx\n", + ppd->dd->unit, ppd->port, sbuf[2], sbuf[1], + sbuf[0]); + } + + if (errs & ERR_MASK(SDmaUnexpDataErr)) + qib_dev_err(dd, "IB%u:%u SDmaUnexpData\n", ppd->dd->unit, + ppd->port); + + switch (ppd->sdma_state.current_state) { + case qib_sdma_state_s00_hw_down: + /* not expecting any interrupts */ + break; + + case qib_sdma_state_s10_hw_start_up_wait: + /* handled in intr path */ + break; + + case qib_sdma_state_s20_idle: + /* not expecting any interrupts */ + break; + + case qib_sdma_state_s30_sw_clean_up_wait: + /* not expecting any interrupts */ + break; + + case qib_sdma_state_s40_hw_clean_up_wait: + if (errs & ERR_MASK(SDmaDisabledErr)) + __qib_sdma_process_event(ppd, + qib_sdma_event_e50_hw_cleaned); + break; + + case qib_sdma_state_s50_hw_halt_wait: + /* handled in intr path */ + break; + + case qib_sdma_state_s99_running: + if (errs & DISABLES_SDMA) + __qib_sdma_process_event(ppd, + qib_sdma_event_e7220_err_halted); + break; + } + + spin_unlock_irqrestore(&ppd->sdma_lock, flags); +} + +/* + * Decode the error status into strings, deciding whether to always + * print * it or not depending on "normal packet errors" vs everything + * else. Return 1 if "real" errors, otherwise 0 if only packet + * errors, so caller can decide what to print with the string. + */ +static int qib_decode_7220_err(struct qib_devdata *dd, char *buf, size_t blen, + u64 err) +{ + int iserr = 1; + + *buf = '\0'; + if (err & QLOGIC_IB_E_PKTERRS) { + if (!(err & ~QLOGIC_IB_E_PKTERRS)) + iserr = 0; + if ((err & ERR_MASK(RcvICRCErr)) && + !(err & (ERR_MASK(RcvVCRCErr) | ERR_MASK(RcvEBPErr)))) + strlcat(buf, "CRC ", blen); + if (!iserr) + goto done; + } + if (err & ERR_MASK(RcvHdrLenErr)) + strlcat(buf, "rhdrlen ", blen); + if (err & ERR_MASK(RcvBadTidErr)) + strlcat(buf, "rbadtid ", blen); + if (err & ERR_MASK(RcvBadVersionErr)) + strlcat(buf, "rbadversion ", blen); + if (err & ERR_MASK(RcvHdrErr)) + strlcat(buf, "rhdr ", blen); + if (err & ERR_MASK(SendSpecialTriggerErr)) + strlcat(buf, "sendspecialtrigger ", blen); + if (err & ERR_MASK(RcvLongPktLenErr)) + strlcat(buf, "rlongpktlen ", blen); + if (err & ERR_MASK(RcvMaxPktLenErr)) + strlcat(buf, "rmaxpktlen ", blen); + if (err & ERR_MASK(RcvMinPktLenErr)) + strlcat(buf, "rminpktlen ", blen); + if (err & ERR_MASK(SendMinPktLenErr)) + strlcat(buf, "sminpktlen ", blen); + if (err & ERR_MASK(RcvFormatErr)) + strlcat(buf, "rformaterr ", blen); + if (err & ERR_MASK(RcvUnsupportedVLErr)) + strlcat(buf, "runsupvl ", blen); + if (err & ERR_MASK(RcvUnexpectedCharErr)) + strlcat(buf, "runexpchar ", blen); + if (err & ERR_MASK(RcvIBFlowErr)) + strlcat(buf, "ribflow ", blen); + if (err & ERR_MASK(SendUnderRunErr)) + strlcat(buf, "sunderrun ", blen); + if (err & ERR_MASK(SendPioArmLaunchErr)) + strlcat(buf, "spioarmlaunch ", blen); + if (err & ERR_MASK(SendUnexpectedPktNumErr)) + strlcat(buf, "sunexperrpktnum ", blen); + if (err & ERR_MASK(SendDroppedSmpPktErr)) + strlcat(buf, "sdroppedsmppkt ", blen); + if (err & ERR_MASK(SendMaxPktLenErr)) + strlcat(buf, "smaxpktlen ", blen); + if (err & ERR_MASK(SendUnsupportedVLErr)) + strlcat(buf, "sunsupVL ", blen); + if (err & ERR_MASK(InvalidAddrErr)) + strlcat(buf, "invalidaddr ", blen); + if (err & ERR_MASK(RcvEgrFullErr)) + strlcat(buf, "rcvegrfull ", blen); + if (err & ERR_MASK(RcvHdrFullErr)) + strlcat(buf, "rcvhdrfull ", blen); + if (err & ERR_MASK(IBStatusChanged)) + strlcat(buf, "ibcstatuschg ", blen); + if (err & ERR_MASK(RcvIBLostLinkErr)) + strlcat(buf, "riblostlink ", blen); + if (err & ERR_MASK(HardwareErr)) + strlcat(buf, "hardware ", blen); + if (err & ERR_MASK(ResetNegated)) + strlcat(buf, "reset ", blen); + if (err & QLOGIC_IB_E_SDMAERRS) + qib_decode_7220_sdma_errs(dd->pport, err, buf, blen); + if (err & ERR_MASK(InvalidEEPCmd)) + strlcat(buf, "invalideepromcmd ", blen); +done: + return iserr; +} + +static void reenable_7220_chase(unsigned long opaque) +{ + struct qib_pportdata *ppd = (struct qib_pportdata *)opaque; + ppd->cpspec->chase_timer.expires = 0; + qib_set_ib_7220_lstate(ppd, QLOGIC_IB_IBCC_LINKCMD_DOWN, + QLOGIC_IB_IBCC_LINKINITCMD_POLL); +} + +static void handle_7220_chase(struct qib_pportdata *ppd, u64 ibcst) +{ + u8 ibclt; + unsigned long tnow; + + ibclt = (u8)SYM_FIELD(ibcst, IBCStatus, LinkTrainingState); + + /* + * Detect and handle the state chase issue, where we can + * get stuck if we are unlucky on timing on both sides of + * the link. If we are, we disable, set a timer, and + * then re-enable. + */ + switch (ibclt) { + case IB_7220_LT_STATE_CFGRCVFCFG: + case IB_7220_LT_STATE_CFGWAITRMT: + case IB_7220_LT_STATE_TXREVLANES: + case IB_7220_LT_STATE_CFGENH: + tnow = jiffies; + if (ppd->cpspec->chase_end && + time_after(tnow, ppd->cpspec->chase_end)) { + ppd->cpspec->chase_end = 0; + qib_set_ib_7220_lstate(ppd, + QLOGIC_IB_IBCC_LINKCMD_DOWN, + QLOGIC_IB_IBCC_LINKINITCMD_DISABLE); + ppd->cpspec->chase_timer.expires = jiffies + + QIB_CHASE_DIS_TIME; + add_timer(&ppd->cpspec->chase_timer); + } else if (!ppd->cpspec->chase_end) + ppd->cpspec->chase_end = tnow + QIB_CHASE_TIME; + break; + + default: + ppd->cpspec->chase_end = 0; + break; + } +} + +static void handle_7220_errors(struct qib_devdata *dd, u64 errs) +{ + char *msg; + u64 ignore_this_time = 0; + u64 iserr = 0; + int log_idx; + struct qib_pportdata *ppd = dd->pport; + u64 mask; + + /* don't report errors that are masked */ + errs &= dd->cspec->errormask; + msg = dd->cspec->emsgbuf; + + /* do these first, they are most important */ + if (errs & ERR_MASK(HardwareErr)) + qib_7220_handle_hwerrors(dd, msg, sizeof dd->cspec->emsgbuf); + else + for (log_idx = 0; log_idx < QIB_EEP_LOG_CNT; ++log_idx) + if (errs & dd->eep_st_masks[log_idx].errs_to_log) + qib_inc_eeprom_err(dd, log_idx, 1); + + if (errs & QLOGIC_IB_E_SDMAERRS) + sdma_7220_errors(ppd, errs); + + if (errs & ~IB_E_BITSEXTANT) + qib_dev_err(dd, + "error interrupt with unknown errors %llx set\n", + (unsigned long long) (errs & ~IB_E_BITSEXTANT)); + + if (errs & E_SUM_ERRS) { + qib_disarm_7220_senderrbufs(ppd); + if ((errs & E_SUM_LINK_PKTERRS) && + !(ppd->lflags & QIBL_LINKACTIVE)) { + /* + * This can happen when trying to bring the link + * up, but the IB link changes state at the "wrong" + * time. The IB logic then complains that the packet + * isn't valid. We don't want to confuse people, so + * we just don't print them, except at debug + */ + ignore_this_time = errs & E_SUM_LINK_PKTERRS; + } + } else if ((errs & E_SUM_LINK_PKTERRS) && + !(ppd->lflags & QIBL_LINKACTIVE)) { + /* + * This can happen when SMA is trying to bring the link + * up, but the IB link changes state at the "wrong" time. + * The IB logic then complains that the packet isn't + * valid. We don't want to confuse people, so we just + * don't print them, except at debug + */ + ignore_this_time = errs & E_SUM_LINK_PKTERRS; + } + + qib_write_kreg(dd, kr_errclear, errs); + + errs &= ~ignore_this_time; + if (!errs) + goto done; + + /* + * The ones we mask off are handled specially below + * or above. Also mask SDMADISABLED by default as it + * is too chatty. + */ + mask = ERR_MASK(IBStatusChanged) | + ERR_MASK(RcvEgrFullErr) | ERR_MASK(RcvHdrFullErr) | + ERR_MASK(HardwareErr) | ERR_MASK(SDmaDisabledErr); + + qib_decode_7220_err(dd, msg, sizeof dd->cspec->emsgbuf, errs & ~mask); + + if (errs & E_SUM_PKTERRS) + qib_stats.sps_rcverrs++; + if (errs & E_SUM_ERRS) + qib_stats.sps_txerrs++; + iserr = errs & ~(E_SUM_PKTERRS | QLOGIC_IB_E_PKTERRS | + ERR_MASK(SDmaDisabledErr)); + + if (errs & ERR_MASK(IBStatusChanged)) { + u64 ibcs; + + ibcs = qib_read_kreg64(dd, kr_ibcstatus); + if (!(ppd->lflags & QIBL_IB_AUTONEG_INPROG)) + handle_7220_chase(ppd, ibcs); + + /* Update our picture of width and speed from chip */ + ppd->link_width_active = + ((ibcs >> IBA7220_LINKWIDTH_SHIFT) & 1) ? + IB_WIDTH_4X : IB_WIDTH_1X; + ppd->link_speed_active = + ((ibcs >> IBA7220_LINKSPEED_SHIFT) & 1) ? + QIB_IB_DDR : QIB_IB_SDR; + + /* + * Since going into a recovery state causes the link state + * to go down and since recovery is transitory, it is better + * if we "miss" ever seeing the link training state go into + * recovery (i.e., ignore this transition for link state + * special handling purposes) without updating lastibcstat. + */ + if (qib_7220_phys_portstate(ibcs) != + IB_PHYSPORTSTATE_LINK_ERR_RECOVER) + qib_handle_e_ibstatuschanged(ppd, ibcs); + } + + if (errs & ERR_MASK(ResetNegated)) { + qib_dev_err(dd, + "Got reset, requires re-init (unload and reload driver)\n"); + dd->flags &= ~QIB_INITTED; /* needs re-init */ + /* mark as having had error */ + *dd->devstatusp |= QIB_STATUS_HWERROR; + *dd->pport->statusp &= ~QIB_STATUS_IB_CONF; + } + + if (*msg && iserr) + qib_dev_porterr(dd, ppd->port, "%s error\n", msg); + + if (ppd->state_wanted & ppd->lflags) + wake_up_interruptible(&ppd->state_wait); + + /* + * If there were hdrq or egrfull errors, wake up any processes + * waiting in poll. We used to try to check which contexts had + * the overflow, but given the cost of that and the chip reads + * to support it, it's better to just wake everybody up if we + * get an overflow; waiters can poll again if it's not them. + */ + if (errs & (ERR_MASK(RcvEgrFullErr) | ERR_MASK(RcvHdrFullErr))) { + qib_handle_urcv(dd, ~0U); + if (errs & ERR_MASK(RcvEgrFullErr)) + qib_stats.sps_buffull++; + else + qib_stats.sps_hdrfull++; + } +done: + return; +} + +/* enable/disable chip from delivering interrupts */ +static void qib_7220_set_intr_state(struct qib_devdata *dd, u32 enable) +{ + if (enable) { + if (dd->flags & QIB_BADINTR) + return; + qib_write_kreg(dd, kr_intmask, ~0ULL); + /* force re-interrupt of any pending interrupts. */ + qib_write_kreg(dd, kr_intclear, 0ULL); + } else + qib_write_kreg(dd, kr_intmask, 0ULL); +} + +/* + * Try to cleanup as much as possible for anything that might have gone + * wrong while in freeze mode, such as pio buffers being written by user + * processes (causing armlaunch), send errors due to going into freeze mode, + * etc., and try to avoid causing extra interrupts while doing so. + * Forcibly update the in-memory pioavail register copies after cleanup + * because the chip won't do it while in freeze mode (the register values + * themselves are kept correct). + * Make sure that we don't lose any important interrupts by using the chip + * feature that says that writing 0 to a bit in *clear that is set in + * *status will cause an interrupt to be generated again (if allowed by + * the *mask value). + * This is in chip-specific code because of all of the register accesses, + * even though the details are similar on most chips. + */ +static void qib_7220_clear_freeze(struct qib_devdata *dd) +{ + /* disable error interrupts, to avoid confusion */ + qib_write_kreg(dd, kr_errmask, 0ULL); + + /* also disable interrupts; errormask is sometimes overwriten */ + qib_7220_set_intr_state(dd, 0); + + qib_cancel_sends(dd->pport); + + /* clear the freeze, and be sure chip saw it */ + qib_write_kreg(dd, kr_control, dd->control); + qib_read_kreg32(dd, kr_scratch); + + /* force in-memory update now we are out of freeze */ + qib_force_pio_avail_update(dd); + + /* + * force new interrupt if any hwerr, error or interrupt bits are + * still set, and clear "safe" send packet errors related to freeze + * and cancelling sends. Re-enable error interrupts before possible + * force of re-interrupt on pending interrupts. + */ + qib_write_kreg(dd, kr_hwerrclear, 0ULL); + qib_write_kreg(dd, kr_errclear, E_SPKT_ERRS_IGNORE); + qib_write_kreg(dd, kr_errmask, dd->cspec->errormask); + qib_7220_set_intr_state(dd, 1); +} + +/** + * qib_7220_handle_hwerrors - display hardware errors. + * @dd: the qlogic_ib device + * @msg: the output buffer + * @msgl: the size of the output buffer + * + * Use same msg buffer as regular errors to avoid excessive stack + * use. Most hardware errors are catastrophic, but for right now, + * we'll print them and continue. We reuse the same message buffer as + * handle_7220_errors() to avoid excessive stack usage. + */ +static void qib_7220_handle_hwerrors(struct qib_devdata *dd, char *msg, + size_t msgl) +{ + u64 hwerrs; + u32 bits, ctrl; + int isfatal = 0; + char *bitsmsg; + int log_idx; + + hwerrs = qib_read_kreg64(dd, kr_hwerrstatus); + if (!hwerrs) + goto bail; + if (hwerrs == ~0ULL) { + qib_dev_err(dd, + "Read of hardware error status failed (all bits set); ignoring\n"); + goto bail; + } + qib_stats.sps_hwerrs++; + + /* + * Always clear the error status register, except MEMBISTFAIL, + * regardless of whether we continue or stop using the chip. + * We want that set so we know it failed, even across driver reload. + * We'll still ignore it in the hwerrmask. We do this partly for + * diagnostics, but also for support. + */ + qib_write_kreg(dd, kr_hwerrclear, + hwerrs & ~HWE_MASK(PowerOnBISTFailed)); + + hwerrs &= dd->cspec->hwerrmask; + + /* We log some errors to EEPROM, check if we have any of those. */ + for (log_idx = 0; log_idx < QIB_EEP_LOG_CNT; ++log_idx) + if (hwerrs & dd->eep_st_masks[log_idx].hwerrs_to_log) + qib_inc_eeprom_err(dd, log_idx, 1); + if (hwerrs & ~(TXEMEMPARITYERR_PIOBUF | TXEMEMPARITYERR_PIOPBC | + RXE_PARITY)) + qib_devinfo(dd->pcidev, + "Hardware error: hwerr=0x%llx (cleared)\n", + (unsigned long long) hwerrs); + + if (hwerrs & ~IB_HWE_BITSEXTANT) + qib_dev_err(dd, + "hwerror interrupt with unknown errors %llx set\n", + (unsigned long long) (hwerrs & ~IB_HWE_BITSEXTANT)); + + if (hwerrs & QLOGIC_IB_HWE_IB_UC_MEMORYPARITYERR) + qib_sd7220_clr_ibpar(dd); + + ctrl = qib_read_kreg32(dd, kr_control); + if ((ctrl & QLOGIC_IB_C_FREEZEMODE) && !dd->diag_client) { + /* + * Parity errors in send memory are recoverable by h/w + * just do housekeeping, exit freeze mode and continue. + */ + if (hwerrs & (TXEMEMPARITYERR_PIOBUF | + TXEMEMPARITYERR_PIOPBC)) { + qib_7220_txe_recover(dd); + hwerrs &= ~(TXEMEMPARITYERR_PIOBUF | + TXEMEMPARITYERR_PIOPBC); + } + if (hwerrs) + isfatal = 1; + else + qib_7220_clear_freeze(dd); + } + + *msg = '\0'; + + if (hwerrs & HWE_MASK(PowerOnBISTFailed)) { + isfatal = 1; + strlcat(msg, + "[Memory BIST test failed, InfiniPath hardware unusable]", + msgl); + /* ignore from now on, so disable until driver reloaded */ + dd->cspec->hwerrmask &= ~HWE_MASK(PowerOnBISTFailed); + qib_write_kreg(dd, kr_hwerrmask, dd->cspec->hwerrmask); + } + + qib_format_hwerrors(hwerrs, qib_7220_hwerror_msgs, + ARRAY_SIZE(qib_7220_hwerror_msgs), msg, msgl); + + bitsmsg = dd->cspec->bitsmsgbuf; + if (hwerrs & (QLOGIC_IB_HWE_PCIEMEMPARITYERR_MASK << + QLOGIC_IB_HWE_PCIEMEMPARITYERR_SHIFT)) { + bits = (u32) ((hwerrs >> + QLOGIC_IB_HWE_PCIEMEMPARITYERR_SHIFT) & + QLOGIC_IB_HWE_PCIEMEMPARITYERR_MASK); + snprintf(bitsmsg, sizeof dd->cspec->bitsmsgbuf, + "[PCIe Mem Parity Errs %x] ", bits); + strlcat(msg, bitsmsg, msgl); + } + +#define _QIB_PLL_FAIL (QLOGIC_IB_HWE_COREPLL_FBSLIP | \ + QLOGIC_IB_HWE_COREPLL_RFSLIP) + + if (hwerrs & _QIB_PLL_FAIL) { + isfatal = 1; + snprintf(bitsmsg, sizeof dd->cspec->bitsmsgbuf, + "[PLL failed (%llx), InfiniPath hardware unusable]", + (unsigned long long) hwerrs & _QIB_PLL_FAIL); + strlcat(msg, bitsmsg, msgl); + /* ignore from now on, so disable until driver reloaded */ + dd->cspec->hwerrmask &= ~(hwerrs & _QIB_PLL_FAIL); + qib_write_kreg(dd, kr_hwerrmask, dd->cspec->hwerrmask); + } + + if (hwerrs & QLOGIC_IB_HWE_SERDESPLLFAILED) { + /* + * If it occurs, it is left masked since the eternal + * interface is unused. + */ + dd->cspec->hwerrmask &= ~QLOGIC_IB_HWE_SERDESPLLFAILED; + qib_write_kreg(dd, kr_hwerrmask, dd->cspec->hwerrmask); + } + + qib_dev_err(dd, "%s hardware error\n", msg); + + if (isfatal && !dd->diag_client) { + qib_dev_err(dd, + "Fatal Hardware Error, no longer usable, SN %.16s\n", + dd->serial); + /* + * For /sys status file and user programs to print; if no + * trailing brace is copied, we'll know it was truncated. + */ + if (dd->freezemsg) + snprintf(dd->freezemsg, dd->freezelen, + "{%s}", msg); + qib_disable_after_error(dd); + } +bail:; +} + +/** + * qib_7220_init_hwerrors - enable hardware errors + * @dd: the qlogic_ib device + * + * now that we have finished initializing everything that might reasonably + * cause a hardware error, and cleared those errors bits as they occur, + * we can enable hardware errors in the mask (potentially enabling + * freeze mode), and enable hardware errors as errors (along with + * everything else) in errormask + */ +static void qib_7220_init_hwerrors(struct qib_devdata *dd) +{ + u64 val; + u64 extsval; + + extsval = qib_read_kreg64(dd, kr_extstatus); + + if (!(extsval & (QLOGIC_IB_EXTS_MEMBIST_ENDTEST | + QLOGIC_IB_EXTS_MEMBIST_DISABLED))) + qib_dev_err(dd, "MemBIST did not complete!\n"); + if (extsval & QLOGIC_IB_EXTS_MEMBIST_DISABLED) + qib_devinfo(dd->pcidev, "MemBIST is disabled.\n"); + + val = ~0ULL; /* default to all hwerrors become interrupts, */ + + val &= ~QLOGIC_IB_HWE_IB_UC_MEMORYPARITYERR; + dd->cspec->hwerrmask = val; + + qib_write_kreg(dd, kr_hwerrclear, ~HWE_MASK(PowerOnBISTFailed)); + qib_write_kreg(dd, kr_hwerrmask, dd->cspec->hwerrmask); + + /* clear all */ + qib_write_kreg(dd, kr_errclear, ~0ULL); + /* enable errors that are masked, at least this first time. */ + qib_write_kreg(dd, kr_errmask, ~0ULL); + dd->cspec->errormask = qib_read_kreg64(dd, kr_errmask); + /* clear any interrupts up to this point (ints still not enabled) */ + qib_write_kreg(dd, kr_intclear, ~0ULL); +} + +/* + * Disable and enable the armlaunch error. Used for PIO bandwidth testing + * on chips that are count-based, rather than trigger-based. There is no + * reference counting, but that's also fine, given the intended use. + * Only chip-specific because it's all register accesses + */ +static void qib_set_7220_armlaunch(struct qib_devdata *dd, u32 enable) +{ + if (enable) { + qib_write_kreg(dd, kr_errclear, ERR_MASK(SendPioArmLaunchErr)); + dd->cspec->errormask |= ERR_MASK(SendPioArmLaunchErr); + } else + dd->cspec->errormask &= ~ERR_MASK(SendPioArmLaunchErr); + qib_write_kreg(dd, kr_errmask, dd->cspec->errormask); +} + +/* + * Formerly took parameter in pre-shifted, + * pre-merged form with LinkCmd and LinkInitCmd + * together, and assuming the zero was NOP. + */ +static void qib_set_ib_7220_lstate(struct qib_pportdata *ppd, u16 linkcmd, + u16 linitcmd) +{ + u64 mod_wd; + struct qib_devdata *dd = ppd->dd; + unsigned long flags; + + if (linitcmd == QLOGIC_IB_IBCC_LINKINITCMD_DISABLE) { + /* + * If we are told to disable, note that so link-recovery + * code does not attempt to bring us back up. + */ + spin_lock_irqsave(&ppd->lflags_lock, flags); + ppd->lflags |= QIBL_IB_LINK_DISABLED; + spin_unlock_irqrestore(&ppd->lflags_lock, flags); + } else if (linitcmd || linkcmd == QLOGIC_IB_IBCC_LINKCMD_DOWN) { + /* + * Any other linkinitcmd will lead to LINKDOWN and then + * to INIT (if all is well), so clear flag to let + * link-recovery code attempt to bring us back up. + */ + spin_lock_irqsave(&ppd->lflags_lock, flags); + ppd->lflags &= ~QIBL_IB_LINK_DISABLED; + spin_unlock_irqrestore(&ppd->lflags_lock, flags); + } + + mod_wd = (linkcmd << IBA7220_IBCC_LINKCMD_SHIFT) | + (linitcmd << QLOGIC_IB_IBCC_LINKINITCMD_SHIFT); + + qib_write_kreg(dd, kr_ibcctrl, ppd->cpspec->ibcctrl | mod_wd); + /* write to chip to prevent back-to-back writes of ibc reg */ + qib_write_kreg(dd, kr_scratch, 0); +} + +/* + * All detailed interaction with the SerDes has been moved to qib_sd7220.c + * + * The portion of IBA7220-specific bringup_serdes() that actually deals with + * registers and memory within the SerDes itself is qib_sd7220_init(). + */ + +/** + * qib_7220_bringup_serdes - bring up the serdes + * @ppd: physical port on the qlogic_ib device + */ +static int qib_7220_bringup_serdes(struct qib_pportdata *ppd) +{ + struct qib_devdata *dd = ppd->dd; + u64 val, prev_val, guid, ibc; + int ret = 0; + + /* Put IBC in reset, sends disabled */ + dd->control &= ~QLOGIC_IB_C_LINKENABLE; + qib_write_kreg(dd, kr_control, 0ULL); + + if (qib_compat_ddr_negotiate) { + ppd->cpspec->ibdeltainprog = 1; + ppd->cpspec->ibsymsnap = read_7220_creg32(dd, cr_ibsymbolerr); + ppd->cpspec->iblnkerrsnap = + read_7220_creg32(dd, cr_iblinkerrrecov); + } + + /* flowcontrolwatermark is in units of KBytes */ + ibc = 0x5ULL << SYM_LSB(IBCCtrl, FlowCtrlWaterMark); + /* + * How often flowctrl sent. More or less in usecs; balance against + * watermark value, so that in theory senders always get a flow + * control update in time to not let the IB link go idle. + */ + ibc |= 0x3ULL << SYM_LSB(IBCCtrl, FlowCtrlPeriod); + /* max error tolerance */ + ibc |= 0xfULL << SYM_LSB(IBCCtrl, PhyerrThreshold); + /* use "real" buffer space for */ + ibc |= 4ULL << SYM_LSB(IBCCtrl, CreditScale); + /* IB credit flow control. */ + ibc |= 0xfULL << SYM_LSB(IBCCtrl, OverrunThreshold); + /* + * set initial max size pkt IBC will send, including ICRC; it's the + * PIO buffer size in dwords, less 1; also see qib_set_mtu() + */ + ibc |= ((u64)(ppd->ibmaxlen >> 2) + 1) << SYM_LSB(IBCCtrl, MaxPktLen); + ppd->cpspec->ibcctrl = ibc; /* without linkcmd or linkinitcmd! */ + + /* initially come up waiting for TS1, without sending anything. */ + val = ppd->cpspec->ibcctrl | (QLOGIC_IB_IBCC_LINKINITCMD_DISABLE << + QLOGIC_IB_IBCC_LINKINITCMD_SHIFT); + qib_write_kreg(dd, kr_ibcctrl, val); + + if (!ppd->cpspec->ibcddrctrl) { + /* not on re-init after reset */ + ppd->cpspec->ibcddrctrl = qib_read_kreg64(dd, kr_ibcddrctrl); + + if (ppd->link_speed_enabled == (QIB_IB_SDR | QIB_IB_DDR)) + ppd->cpspec->ibcddrctrl |= + IBA7220_IBC_SPEED_AUTONEG_MASK | + IBA7220_IBC_IBTA_1_2_MASK; + else + ppd->cpspec->ibcddrctrl |= + ppd->link_speed_enabled == QIB_IB_DDR ? + IBA7220_IBC_SPEED_DDR : IBA7220_IBC_SPEED_SDR; + if ((ppd->link_width_enabled & (IB_WIDTH_1X | IB_WIDTH_4X)) == + (IB_WIDTH_1X | IB_WIDTH_4X)) + ppd->cpspec->ibcddrctrl |= IBA7220_IBC_WIDTH_AUTONEG; + else + ppd->cpspec->ibcddrctrl |= + ppd->link_width_enabled == IB_WIDTH_4X ? + IBA7220_IBC_WIDTH_4X_ONLY : + IBA7220_IBC_WIDTH_1X_ONLY; + + /* always enable these on driver reload, not sticky */ + ppd->cpspec->ibcddrctrl |= + IBA7220_IBC_RXPOL_MASK << IBA7220_IBC_RXPOL_SHIFT; + ppd->cpspec->ibcddrctrl |= + IBA7220_IBC_HRTBT_MASK << IBA7220_IBC_HRTBT_SHIFT; + + /* enable automatic lane reversal detection for receive */ + ppd->cpspec->ibcddrctrl |= IBA7220_IBC_LANE_REV_SUPPORTED; + } else + /* write to chip to prevent back-to-back writes of ibc reg */ + qib_write_kreg(dd, kr_scratch, 0); + + qib_write_kreg(dd, kr_ibcddrctrl, ppd->cpspec->ibcddrctrl); + qib_write_kreg(dd, kr_scratch, 0); + + qib_write_kreg(dd, kr_ncmodectrl, 0Ull); + qib_write_kreg(dd, kr_scratch, 0); + + ret = qib_sd7220_init(dd); + + val = qib_read_kreg64(dd, kr_xgxs_cfg); + prev_val = val; + val |= QLOGIC_IB_XGXS_FC_SAFE; + if (val != prev_val) { + qib_write_kreg(dd, kr_xgxs_cfg, val); + qib_read_kreg32(dd, kr_scratch); + } + if (val & QLOGIC_IB_XGXS_RESET) + val &= ~QLOGIC_IB_XGXS_RESET; + if (val != prev_val) + qib_write_kreg(dd, kr_xgxs_cfg, val); + + /* first time through, set port guid */ + if (!ppd->guid) + ppd->guid = dd->base_guid; + guid = be64_to_cpu(ppd->guid); + + qib_write_kreg(dd, kr_hrtbt_guid, guid); + if (!ret) { + dd->control |= QLOGIC_IB_C_LINKENABLE; + qib_write_kreg(dd, kr_control, dd->control); + } else + /* write to chip to prevent back-to-back writes of ibc reg */ + qib_write_kreg(dd, kr_scratch, 0); + return ret; +} + +/** + * qib_7220_quiet_serdes - set serdes to txidle + * @ppd: physical port of the qlogic_ib device + * Called when driver is being unloaded + */ +static void qib_7220_quiet_serdes(struct qib_pportdata *ppd) +{ + u64 val; + struct qib_devdata *dd = ppd->dd; + unsigned long flags; + + /* disable IBC */ + dd->control &= ~QLOGIC_IB_C_LINKENABLE; + qib_write_kreg(dd, kr_control, + dd->control | QLOGIC_IB_C_FREEZEMODE); + + ppd->cpspec->chase_end = 0; + if (ppd->cpspec->chase_timer.data) /* if initted */ + del_timer_sync(&ppd->cpspec->chase_timer); + + if (ppd->cpspec->ibsymdelta || ppd->cpspec->iblnkerrdelta || + ppd->cpspec->ibdeltainprog) { + u64 diagc; + + /* enable counter writes */ + diagc = qib_read_kreg64(dd, kr_hwdiagctrl); + qib_write_kreg(dd, kr_hwdiagctrl, + diagc | SYM_MASK(HwDiagCtrl, CounterWrEnable)); + + if (ppd->cpspec->ibsymdelta || ppd->cpspec->ibdeltainprog) { + val = read_7220_creg32(dd, cr_ibsymbolerr); + if (ppd->cpspec->ibdeltainprog) + val -= val - ppd->cpspec->ibsymsnap; + val -= ppd->cpspec->ibsymdelta; + write_7220_creg(dd, cr_ibsymbolerr, val); + } + if (ppd->cpspec->iblnkerrdelta || ppd->cpspec->ibdeltainprog) { + val = read_7220_creg32(dd, cr_iblinkerrrecov); + if (ppd->cpspec->ibdeltainprog) + val -= val - ppd->cpspec->iblnkerrsnap; + val -= ppd->cpspec->iblnkerrdelta; + write_7220_creg(dd, cr_iblinkerrrecov, val); + } + + /* and disable counter writes */ + qib_write_kreg(dd, kr_hwdiagctrl, diagc); + } + qib_set_ib_7220_lstate(ppd, 0, QLOGIC_IB_IBCC_LINKINITCMD_DISABLE); + + spin_lock_irqsave(&ppd->lflags_lock, flags); + ppd->lflags &= ~QIBL_IB_AUTONEG_INPROG; + spin_unlock_irqrestore(&ppd->lflags_lock, flags); + wake_up(&ppd->cpspec->autoneg_wait); + cancel_delayed_work_sync(&ppd->cpspec->autoneg_work); + + shutdown_7220_relock_poll(ppd->dd); + val = qib_read_kreg64(ppd->dd, kr_xgxs_cfg); + val |= QLOGIC_IB_XGXS_RESET; + qib_write_kreg(ppd->dd, kr_xgxs_cfg, val); +} + +/** + * qib_setup_7220_setextled - set the state of the two external LEDs + * @dd: the qlogic_ib device + * @on: whether the link is up or not + * + * The exact combo of LEDs if on is true is determined by looking + * at the ibcstatus. + * + * These LEDs indicate the physical and logical state of IB link. + * For this chip (at least with recommended board pinouts), LED1 + * is Yellow (logical state) and LED2 is Green (physical state), + * + * Note: We try to match the Mellanox HCA LED behavior as best + * we can. Green indicates physical link state is OK (something is + * plugged in, and we can train). + * Amber indicates the link is logically up (ACTIVE). + * Mellanox further blinks the amber LED to indicate data packet + * activity, but we have no hardware support for that, so it would + * require waking up every 10-20 msecs and checking the counters + * on the chip, and then turning the LED off if appropriate. That's + * visible overhead, so not something we will do. + * + */ +static void qib_setup_7220_setextled(struct qib_pportdata *ppd, u32 on) +{ + struct qib_devdata *dd = ppd->dd; + u64 extctl, ledblink = 0, val, lst, ltst; + unsigned long flags; + + /* + * The diags use the LED to indicate diag info, so we leave + * the external LED alone when the diags are running. + */ + if (dd->diag_client) + return; + + if (ppd->led_override) { + ltst = (ppd->led_override & QIB_LED_PHYS) ? + IB_PHYSPORTSTATE_LINKUP : IB_PHYSPORTSTATE_DISABLED, + lst = (ppd->led_override & QIB_LED_LOG) ? + IB_PORT_ACTIVE : IB_PORT_DOWN; + } else if (on) { + val = qib_read_kreg64(dd, kr_ibcstatus); + ltst = qib_7220_phys_portstate(val); + lst = qib_7220_iblink_state(val); + } else { + ltst = 0; + lst = 0; + } + + spin_lock_irqsave(&dd->cspec->gpio_lock, flags); + extctl = dd->cspec->extctrl & ~(SYM_MASK(EXTCtrl, LEDPriPortGreenOn) | + SYM_MASK(EXTCtrl, LEDPriPortYellowOn)); + if (ltst == IB_PHYSPORTSTATE_LINKUP) { + extctl |= SYM_MASK(EXTCtrl, LEDPriPortGreenOn); + /* + * counts are in chip clock (4ns) periods. + * This is 1/16 sec (66.6ms) on, + * 3/16 sec (187.5 ms) off, with packets rcvd + */ + ledblink = ((66600 * 1000UL / 4) << IBA7220_LEDBLINK_ON_SHIFT) + | ((187500 * 1000UL / 4) << IBA7220_LEDBLINK_OFF_SHIFT); + } + if (lst == IB_PORT_ACTIVE) + extctl |= SYM_MASK(EXTCtrl, LEDPriPortYellowOn); + dd->cspec->extctrl = extctl; + qib_write_kreg(dd, kr_extctrl, extctl); + spin_unlock_irqrestore(&dd->cspec->gpio_lock, flags); + + if (ledblink) /* blink the LED on packet receive */ + qib_write_kreg(dd, kr_rcvpktledcnt, ledblink); +} + +static void qib_7220_free_irq(struct qib_devdata *dd) +{ + if (dd->cspec->irq) { + free_irq(dd->cspec->irq, dd); + dd->cspec->irq = 0; + } + qib_nomsi(dd); +} + +/* + * qib_setup_7220_cleanup - clean up any per-chip chip-specific stuff + * @dd: the qlogic_ib device + * + * This is called during driver unload. + * + */ +static void qib_setup_7220_cleanup(struct qib_devdata *dd) +{ + qib_7220_free_irq(dd); + kfree(dd->cspec->cntrs); + kfree(dd->cspec->portcntrs); +} + +/* + * This is only called for SDmaInt. + * SDmaDisabled is handled on the error path. + */ +static void sdma_7220_intr(struct qib_pportdata *ppd, u64 istat) +{ + unsigned long flags; + + spin_lock_irqsave(&ppd->sdma_lock, flags); + + switch (ppd->sdma_state.current_state) { + case qib_sdma_state_s00_hw_down: + break; + + case qib_sdma_state_s10_hw_start_up_wait: + __qib_sdma_process_event(ppd, qib_sdma_event_e20_hw_started); + break; + + case qib_sdma_state_s20_idle: + break; + + case qib_sdma_state_s30_sw_clean_up_wait: + break; + + case qib_sdma_state_s40_hw_clean_up_wait: + break; + + case qib_sdma_state_s50_hw_halt_wait: + __qib_sdma_process_event(ppd, qib_sdma_event_e60_hw_halted); + break; + + case qib_sdma_state_s99_running: + /* too chatty to print here */ + __qib_sdma_intr(ppd); + break; + } + spin_unlock_irqrestore(&ppd->sdma_lock, flags); +} + +static void qib_wantpiobuf_7220_intr(struct qib_devdata *dd, u32 needint) +{ + unsigned long flags; + + spin_lock_irqsave(&dd->sendctrl_lock, flags); + if (needint) { + if (!(dd->sendctrl & SYM_MASK(SendCtrl, SendBufAvailUpd))) + goto done; + /* + * blip the availupd off, next write will be on, so + * we ensure an avail update, regardless of threshold or + * buffers becoming free, whenever we want an interrupt + */ + qib_write_kreg(dd, kr_sendctrl, dd->sendctrl & + ~SYM_MASK(SendCtrl, SendBufAvailUpd)); + qib_write_kreg(dd, kr_scratch, 0ULL); + dd->sendctrl |= SYM_MASK(SendCtrl, SendIntBufAvail); + } else + dd->sendctrl &= ~SYM_MASK(SendCtrl, SendIntBufAvail); + qib_write_kreg(dd, kr_sendctrl, dd->sendctrl); + qib_write_kreg(dd, kr_scratch, 0ULL); +done: + spin_unlock_irqrestore(&dd->sendctrl_lock, flags); +} + +/* + * Handle errors and unusual events first, separate function + * to improve cache hits for fast path interrupt handling. + */ +static noinline void unlikely_7220_intr(struct qib_devdata *dd, u64 istat) +{ + if (unlikely(istat & ~QLOGIC_IB_I_BITSEXTANT)) + qib_dev_err(dd, + "interrupt with unknown interrupts %Lx set\n", + istat & ~QLOGIC_IB_I_BITSEXTANT); + + if (istat & QLOGIC_IB_I_GPIO) { + u32 gpiostatus; + + /* + * Boards for this chip currently don't use GPIO interrupts, + * so clear by writing GPIOstatus to GPIOclear, and complain + * to alert developer. To avoid endless repeats, clear + * the bits in the mask, since there is some kind of + * programming error or chip problem. + */ + gpiostatus = qib_read_kreg32(dd, kr_gpio_status); + /* + * In theory, writing GPIOstatus to GPIOclear could + * have a bad side-effect on some diagnostic that wanted + * to poll for a status-change, but the various shadows + * make that problematic at best. Diags will just suppress + * all GPIO interrupts during such tests. + */ + qib_write_kreg(dd, kr_gpio_clear, gpiostatus); + + if (gpiostatus) { + const u32 mask = qib_read_kreg32(dd, kr_gpio_mask); + u32 gpio_irq = mask & gpiostatus; + + /* + * A bit set in status and (chip) Mask register + * would cause an interrupt. Since we are not + * expecting any, report it. Also check that the + * chip reflects our shadow, report issues, + * and refresh from the shadow. + */ + /* + * Clear any troublemakers, and update chip + * from shadow + */ + dd->cspec->gpio_mask &= ~gpio_irq; + qib_write_kreg(dd, kr_gpio_mask, dd->cspec->gpio_mask); + } + } + + if (istat & QLOGIC_IB_I_ERROR) { + u64 estat; + + qib_stats.sps_errints++; + estat = qib_read_kreg64(dd, kr_errstatus); + if (!estat) + qib_devinfo(dd->pcidev, + "error interrupt (%Lx), but no error bits set!\n", + istat); + else + handle_7220_errors(dd, estat); + } +} + +static irqreturn_t qib_7220intr(int irq, void *data) +{ + struct qib_devdata *dd = data; + irqreturn_t ret; + u64 istat; + u64 ctxtrbits; + u64 rmask; + unsigned i; + + if ((dd->flags & (QIB_PRESENT | QIB_BADINTR)) != QIB_PRESENT) { + /* + * This return value is not great, but we do not want the + * interrupt core code to remove our interrupt handler + * because we don't appear to be handling an interrupt + * during a chip reset. + */ + ret = IRQ_HANDLED; + goto bail; + } + + istat = qib_read_kreg64(dd, kr_intstatus); + + if (unlikely(!istat)) { + ret = IRQ_NONE; /* not our interrupt, or already handled */ + goto bail; + } + if (unlikely(istat == -1)) { + qib_bad_intrstatus(dd); + /* don't know if it was our interrupt or not */ + ret = IRQ_NONE; + goto bail; + } + + this_cpu_inc(*dd->int_counter); + if (unlikely(istat & (~QLOGIC_IB_I_BITSEXTANT | + QLOGIC_IB_I_GPIO | QLOGIC_IB_I_ERROR))) + unlikely_7220_intr(dd, istat); + + /* + * Clear the interrupt bits we found set, relatively early, so we + * "know" know the chip will have seen this by the time we process + * the queue, and will re-interrupt if necessary. The processor + * itself won't take the interrupt again until we return. + */ + qib_write_kreg(dd, kr_intclear, istat); + + /* + * Handle kernel receive queues before checking for pio buffers + * available since receives can overflow; piobuf waiters can afford + * a few extra cycles, since they were waiting anyway. + */ + ctxtrbits = istat & + ((QLOGIC_IB_I_RCVAVAIL_MASK << QLOGIC_IB_I_RCVAVAIL_SHIFT) | + (QLOGIC_IB_I_RCVURG_MASK << QLOGIC_IB_I_RCVURG_SHIFT)); + if (ctxtrbits) { + rmask = (1ULL << QLOGIC_IB_I_RCVAVAIL_SHIFT) | + (1ULL << QLOGIC_IB_I_RCVURG_SHIFT); + for (i = 0; i < dd->first_user_ctxt; i++) { + if (ctxtrbits & rmask) { + ctxtrbits &= ~rmask; + qib_kreceive(dd->rcd[i], NULL, NULL); + } + rmask <<= 1; + } + if (ctxtrbits) { + ctxtrbits = + (ctxtrbits >> QLOGIC_IB_I_RCVAVAIL_SHIFT) | + (ctxtrbits >> QLOGIC_IB_I_RCVURG_SHIFT); + qib_handle_urcv(dd, ctxtrbits); + } + } + + /* only call for SDmaInt */ + if (istat & QLOGIC_IB_I_SDMAINT) + sdma_7220_intr(dd->pport, istat); + + if ((istat & QLOGIC_IB_I_SPIOBUFAVAIL) && (dd->flags & QIB_INITTED)) + qib_ib_piobufavail(dd); + + ret = IRQ_HANDLED; +bail: + return ret; +} + +/* + * Set up our chip-specific interrupt handler. + * The interrupt type has already been setup, so + * we just need to do the registration and error checking. + * If we are using MSI interrupts, we may fall back to + * INTx later, if the interrupt handler doesn't get called + * within 1/2 second (see verify_interrupt()). + */ +static void qib_setup_7220_interrupt(struct qib_devdata *dd) +{ + if (!dd->cspec->irq) + qib_dev_err(dd, + "irq is 0, BIOS error? Interrupts won't work\n"); + else { + int ret = request_irq(dd->cspec->irq, qib_7220intr, + dd->msi_lo ? 0 : IRQF_SHARED, + QIB_DRV_NAME, dd); + + if (ret) + qib_dev_err(dd, + "Couldn't setup %s interrupt (irq=%d): %d\n", + dd->msi_lo ? "MSI" : "INTx", + dd->cspec->irq, ret); + } +} + +/** + * qib_7220_boardname - fill in the board name + * @dd: the qlogic_ib device + * + * info is based on the board revision register + */ +static void qib_7220_boardname(struct qib_devdata *dd) +{ + char *n; + u32 boardid, namelen; + + boardid = SYM_FIELD(dd->revision, Revision, + BoardID); + + switch (boardid) { + case 1: + n = "InfiniPath_QLE7240"; + break; + case 2: + n = "InfiniPath_QLE7280"; + break; + default: + qib_dev_err(dd, "Unknown 7220 board with ID %u\n", boardid); + n = "Unknown_InfiniPath_7220"; + break; + } + + namelen = strlen(n) + 1; + dd->boardname = kmalloc(namelen, GFP_KERNEL); + if (!dd->boardname) + qib_dev_err(dd, "Failed allocation for board name: %s\n", n); + else + snprintf(dd->boardname, namelen, "%s", n); + + if (dd->majrev != 5 || !dd->minrev || dd->minrev > 2) + qib_dev_err(dd, + "Unsupported InfiniPath hardware revision %u.%u!\n", + dd->majrev, dd->minrev); + + snprintf(dd->boardversion, sizeof(dd->boardversion), + "ChipABI %u.%u, %s, InfiniPath%u %u.%u, SW Compat %u\n", + QIB_CHIP_VERS_MAJ, QIB_CHIP_VERS_MIN, dd->boardname, + (unsigned)SYM_FIELD(dd->revision, Revision_R, Arch), + dd->majrev, dd->minrev, + (unsigned)SYM_FIELD(dd->revision, Revision_R, SW)); +} + +/* + * This routine sleeps, so it can only be called from user context, not + * from interrupt context. + */ +static int qib_setup_7220_reset(struct qib_devdata *dd) +{ + u64 val; + int i; + int ret; + u16 cmdval; + u8 int_line, clinesz; + unsigned long flags; + + qib_pcie_getcmd(dd, &cmdval, &int_line, &clinesz); + + /* Use dev_err so it shows up in logs, etc. */ + qib_dev_err(dd, "Resetting InfiniPath unit %u\n", dd->unit); + + /* no interrupts till re-initted */ + qib_7220_set_intr_state(dd, 0); + + dd->pport->cpspec->ibdeltainprog = 0; + dd->pport->cpspec->ibsymdelta = 0; + dd->pport->cpspec->iblnkerrdelta = 0; + + /* + * Keep chip from being accessed until we are ready. Use + * writeq() directly, to allow the write even though QIB_PRESENT + * isn't set. + */ + dd->flags &= ~(QIB_INITTED | QIB_PRESENT); + /* so we check interrupts work again */ + dd->z_int_counter = qib_int_counter(dd); + val = dd->control | QLOGIC_IB_C_RESET; + writeq(val, &dd->kregbase[kr_control]); + mb(); /* prevent compiler reordering around actual reset */ + + for (i = 1; i <= 5; i++) { + /* + * Allow MBIST, etc. to complete; longer on each retry. + * We sometimes get machine checks from bus timeout if no + * response, so for now, make it *really* long. + */ + msleep(1000 + (1 + i) * 2000); + + qib_pcie_reenable(dd, cmdval, int_line, clinesz); + + /* + * Use readq directly, so we don't need to mark it as PRESENT + * until we get a successful indication that all is well. + */ + val = readq(&dd->kregbase[kr_revision]); + if (val == dd->revision) { + dd->flags |= QIB_PRESENT; /* it's back */ + ret = qib_reinit_intr(dd); + goto bail; + } + } + ret = 0; /* failed */ + +bail: + if (ret) { + if (qib_pcie_params(dd, dd->lbus_width, NULL, NULL)) + qib_dev_err(dd, + "Reset failed to setup PCIe or interrupts; continuing anyway\n"); + + /* hold IBC in reset, no sends, etc till later */ + qib_write_kreg(dd, kr_control, 0ULL); + + /* clear the reset error, init error/hwerror mask */ + qib_7220_init_hwerrors(dd); + + /* do setup similar to speed or link-width changes */ + if (dd->pport->cpspec->ibcddrctrl & IBA7220_IBC_IBTA_1_2_MASK) + dd->cspec->presets_needed = 1; + spin_lock_irqsave(&dd->pport->lflags_lock, flags); + dd->pport->lflags |= QIBL_IB_FORCE_NOTIFY; + dd->pport->lflags &= ~QIBL_IB_AUTONEG_FAILED; + spin_unlock_irqrestore(&dd->pport->lflags_lock, flags); + } + + return ret; +} + +/** + * qib_7220_put_tid - write a TID to the chip + * @dd: the qlogic_ib device + * @tidptr: pointer to the expected TID (in chip) to update + * @tidtype: 0 for eager, 1 for expected + * @pa: physical address of in memory buffer; tidinvalid if freeing + */ +static void qib_7220_put_tid(struct qib_devdata *dd, u64 __iomem *tidptr, + u32 type, unsigned long pa) +{ + if (pa != dd->tidinvalid) { + u64 chippa = pa >> IBA7220_TID_PA_SHIFT; + + /* paranoia checks */ + if (pa != (chippa << IBA7220_TID_PA_SHIFT)) { + qib_dev_err(dd, "Physaddr %lx not 2KB aligned!\n", + pa); + return; + } + if (chippa >= (1UL << IBA7220_TID_SZ_SHIFT)) { + qib_dev_err(dd, + "Physical page address 0x%lx larger than supported\n", + pa); + return; + } + + if (type == RCVHQ_RCV_TYPE_EAGER) + chippa |= dd->tidtemplate; + else /* for now, always full 4KB page */ + chippa |= IBA7220_TID_SZ_4K; + pa = chippa; + } + writeq(pa, tidptr); + mmiowb(); +} + +/** + * qib_7220_clear_tids - clear all TID entries for a ctxt, expected and eager + * @dd: the qlogic_ib device + * @ctxt: the ctxt + * + * clear all TID entries for a ctxt, expected and eager. + * Used from qib_close(). On this chip, TIDs are only 32 bits, + * not 64, but they are still on 64 bit boundaries, so tidbase + * is declared as u64 * for the pointer math, even though we write 32 bits + */ +static void qib_7220_clear_tids(struct qib_devdata *dd, + struct qib_ctxtdata *rcd) +{ + u64 __iomem *tidbase; + unsigned long tidinv; + u32 ctxt; + int i; + + if (!dd->kregbase || !rcd) + return; + + ctxt = rcd->ctxt; + + tidinv = dd->tidinvalid; + tidbase = (u64 __iomem *) + ((char __iomem *)(dd->kregbase) + + dd->rcvtidbase + + ctxt * dd->rcvtidcnt * sizeof(*tidbase)); + + for (i = 0; i < dd->rcvtidcnt; i++) + qib_7220_put_tid(dd, &tidbase[i], RCVHQ_RCV_TYPE_EXPECTED, + tidinv); + + tidbase = (u64 __iomem *) + ((char __iomem *)(dd->kregbase) + + dd->rcvegrbase + + rcd->rcvegr_tid_base * sizeof(*tidbase)); + + for (i = 0; i < rcd->rcvegrcnt; i++) + qib_7220_put_tid(dd, &tidbase[i], RCVHQ_RCV_TYPE_EAGER, + tidinv); +} + +/** + * qib_7220_tidtemplate - setup constants for TID updates + * @dd: the qlogic_ib device + * + * We setup stuff that we use a lot, to avoid calculating each time + */ +static void qib_7220_tidtemplate(struct qib_devdata *dd) +{ + if (dd->rcvegrbufsize == 2048) + dd->tidtemplate = IBA7220_TID_SZ_2K; + else if (dd->rcvegrbufsize == 4096) + dd->tidtemplate = IBA7220_TID_SZ_4K; + dd->tidinvalid = 0; +} + +/** + * qib_init_7220_get_base_info - set chip-specific flags for user code + * @rcd: the qlogic_ib ctxt + * @kbase: qib_base_info pointer + * + * We set the PCIE flag because the lower bandwidth on PCIe vs + * HyperTransport can affect some user packet algorithims. + */ +static int qib_7220_get_base_info(struct qib_ctxtdata *rcd, + struct qib_base_info *kinfo) +{ + kinfo->spi_runtime_flags |= QIB_RUNTIME_PCIE | + QIB_RUNTIME_NODMA_RTAIL | QIB_RUNTIME_SDMA; + + if (rcd->dd->flags & QIB_USE_SPCL_TRIG) + kinfo->spi_runtime_flags |= QIB_RUNTIME_SPECIAL_TRIGGER; + + return 0; +} + +static struct qib_message_header * +qib_7220_get_msgheader(struct qib_devdata *dd, __le32 *rhf_addr) +{ + u32 offset = qib_hdrget_offset(rhf_addr); + + return (struct qib_message_header *) + (rhf_addr - dd->rhf_offset + offset); +} + +static void qib_7220_config_ctxts(struct qib_devdata *dd) +{ + unsigned long flags; + u32 nchipctxts; + + nchipctxts = qib_read_kreg32(dd, kr_portcnt); + dd->cspec->numctxts = nchipctxts; + if (qib_n_krcv_queues > 1) { + dd->qpn_mask = 0x3e; + dd->first_user_ctxt = qib_n_krcv_queues * dd->num_pports; + if (dd->first_user_ctxt > nchipctxts) + dd->first_user_ctxt = nchipctxts; + } else + dd->first_user_ctxt = dd->num_pports; + dd->n_krcv_queues = dd->first_user_ctxt; + + if (!qib_cfgctxts) { + int nctxts = dd->first_user_ctxt + num_online_cpus(); + + if (nctxts <= 5) + dd->ctxtcnt = 5; + else if (nctxts <= 9) + dd->ctxtcnt = 9; + else if (nctxts <= nchipctxts) + dd->ctxtcnt = nchipctxts; + } else if (qib_cfgctxts <= nchipctxts) + dd->ctxtcnt = qib_cfgctxts; + if (!dd->ctxtcnt) /* none of the above, set to max */ + dd->ctxtcnt = nchipctxts; + + /* + * Chip can be configured for 5, 9, or 17 ctxts, and choice + * affects number of eager TIDs per ctxt (1K, 2K, 4K). + * Lock to be paranoid about later motion, etc. + */ + spin_lock_irqsave(&dd->cspec->rcvmod_lock, flags); + if (dd->ctxtcnt > 9) + dd->rcvctrl |= 2ULL << IBA7220_R_CTXTCFG_SHIFT; + else if (dd->ctxtcnt > 5) + dd->rcvctrl |= 1ULL << IBA7220_R_CTXTCFG_SHIFT; + /* else configure for default 5 receive ctxts */ + if (dd->qpn_mask) + dd->rcvctrl |= 1ULL << QIB_7220_RcvCtrl_RcvQPMapEnable_LSB; + qib_write_kreg(dd, kr_rcvctrl, dd->rcvctrl); + spin_unlock_irqrestore(&dd->cspec->rcvmod_lock, flags); + + /* kr_rcvegrcnt changes based on the number of contexts enabled */ + dd->cspec->rcvegrcnt = qib_read_kreg32(dd, kr_rcvegrcnt); + dd->rcvhdrcnt = max(dd->cspec->rcvegrcnt, IBA7220_KRCVEGRCNT); +} + +static int qib_7220_get_ib_cfg(struct qib_pportdata *ppd, int which) +{ + int lsb, ret = 0; + u64 maskr; /* right-justified mask */ + + switch (which) { + case QIB_IB_CFG_LWID_ENB: /* Get allowed Link-width */ + ret = ppd->link_width_enabled; + goto done; + + case QIB_IB_CFG_LWID: /* Get currently active Link-width */ + ret = ppd->link_width_active; + goto done; + + case QIB_IB_CFG_SPD_ENB: /* Get allowed Link speeds */ + ret = ppd->link_speed_enabled; + goto done; + + case QIB_IB_CFG_SPD: /* Get current Link spd */ + ret = ppd->link_speed_active; + goto done; + + case QIB_IB_CFG_RXPOL_ENB: /* Get Auto-RX-polarity enable */ + lsb = IBA7220_IBC_RXPOL_SHIFT; + maskr = IBA7220_IBC_RXPOL_MASK; + break; + + case QIB_IB_CFG_LREV_ENB: /* Get Auto-Lane-reversal enable */ + lsb = IBA7220_IBC_LREV_SHIFT; + maskr = IBA7220_IBC_LREV_MASK; + break; + + case QIB_IB_CFG_LINKLATENCY: + ret = qib_read_kreg64(ppd->dd, kr_ibcddrstatus) + & IBA7220_DDRSTAT_LINKLAT_MASK; + goto done; + + case QIB_IB_CFG_OP_VLS: + ret = ppd->vls_operational; + goto done; + + case QIB_IB_CFG_VL_HIGH_CAP: + ret = 0; + goto done; + + case QIB_IB_CFG_VL_LOW_CAP: + ret = 0; + goto done; + + case QIB_IB_CFG_OVERRUN_THRESH: /* IB overrun threshold */ + ret = SYM_FIELD(ppd->cpspec->ibcctrl, IBCCtrl, + OverrunThreshold); + goto done; + + case QIB_IB_CFG_PHYERR_THRESH: /* IB PHY error threshold */ + ret = SYM_FIELD(ppd->cpspec->ibcctrl, IBCCtrl, + PhyerrThreshold); + goto done; + + case QIB_IB_CFG_LINKDEFAULT: /* IB link default (sleep/poll) */ + /* will only take effect when the link state changes */ + ret = (ppd->cpspec->ibcctrl & + SYM_MASK(IBCCtrl, LinkDownDefaultState)) ? + IB_LINKINITCMD_SLEEP : IB_LINKINITCMD_POLL; + goto done; + + case QIB_IB_CFG_HRTBT: /* Get Heartbeat off/enable/auto */ + lsb = IBA7220_IBC_HRTBT_SHIFT; + maskr = IBA7220_IBC_HRTBT_MASK; + break; + + case QIB_IB_CFG_PMA_TICKS: + /* + * 0x00 = 10x link transfer rate or 4 nsec. for 2.5Gbs + * Since the clock is always 250MHz, the value is 1 or 0. + */ + ret = (ppd->link_speed_active == QIB_IB_DDR); + goto done; + + default: + ret = -EINVAL; + goto done; + } + ret = (int)((ppd->cpspec->ibcddrctrl >> lsb) & maskr); +done: + return ret; +} + +static int qib_7220_set_ib_cfg(struct qib_pportdata *ppd, int which, u32 val) +{ + struct qib_devdata *dd = ppd->dd; + u64 maskr; /* right-justified mask */ + int lsb, ret = 0, setforce = 0; + u16 lcmd, licmd; + unsigned long flags; + u32 tmp = 0; + + switch (which) { + case QIB_IB_CFG_LIDLMC: + /* + * Set LID and LMC. Combined to avoid possible hazard + * caller puts LMC in 16MSbits, DLID in 16LSbits of val + */ + lsb = IBA7220_IBC_DLIDLMC_SHIFT; + maskr = IBA7220_IBC_DLIDLMC_MASK; + break; + + case QIB_IB_CFG_LWID_ENB: /* set allowed Link-width */ + /* + * As with speed, only write the actual register if + * the link is currently down, otherwise takes effect + * on next link change. + */ + ppd->link_width_enabled = val; + if (!(ppd->lflags & QIBL_LINKDOWN)) + goto bail; + /* + * We set the QIBL_IB_FORCE_NOTIFY bit so updown + * will get called because we want update + * link_width_active, and the change may not take + * effect for some time (if we are in POLL), so this + * flag will force the updown routine to be called + * on the next ibstatuschange down interrupt, even + * if it's not an down->up transition. + */ + val--; /* convert from IB to chip */ + maskr = IBA7220_IBC_WIDTH_MASK; + lsb = IBA7220_IBC_WIDTH_SHIFT; + setforce = 1; + break; + + case QIB_IB_CFG_SPD_ENB: /* set allowed Link speeds */ + /* + * If we turn off IB1.2, need to preset SerDes defaults, + * but not right now. Set a flag for the next time + * we command the link down. As with width, only write the + * actual register if the link is currently down, otherwise + * takes effect on next link change. Since setting is being + * explicitly requested (via MAD or sysfs), clear autoneg + * failure status if speed autoneg is enabled. + */ + ppd->link_speed_enabled = val; + if ((ppd->cpspec->ibcddrctrl & IBA7220_IBC_IBTA_1_2_MASK) && + !(val & (val - 1))) + dd->cspec->presets_needed = 1; + if (!(ppd->lflags & QIBL_LINKDOWN)) + goto bail; + /* + * We set the QIBL_IB_FORCE_NOTIFY bit so updown + * will get called because we want update + * link_speed_active, and the change may not take + * effect for some time (if we are in POLL), so this + * flag will force the updown routine to be called + * on the next ibstatuschange down interrupt, even + * if it's not an down->up transition. + */ + if (val == (QIB_IB_SDR | QIB_IB_DDR)) { + val = IBA7220_IBC_SPEED_AUTONEG_MASK | + IBA7220_IBC_IBTA_1_2_MASK; + spin_lock_irqsave(&ppd->lflags_lock, flags); + ppd->lflags &= ~QIBL_IB_AUTONEG_FAILED; + spin_unlock_irqrestore(&ppd->lflags_lock, flags); + } else + val = val == QIB_IB_DDR ? + IBA7220_IBC_SPEED_DDR : IBA7220_IBC_SPEED_SDR; + maskr = IBA7220_IBC_SPEED_AUTONEG_MASK | + IBA7220_IBC_IBTA_1_2_MASK; + /* IBTA 1.2 mode + speed bits are contiguous */ + lsb = SYM_LSB(IBCDDRCtrl, IB_ENHANCED_MODE); + setforce = 1; + break; + + case QIB_IB_CFG_RXPOL_ENB: /* set Auto-RX-polarity enable */ + lsb = IBA7220_IBC_RXPOL_SHIFT; + maskr = IBA7220_IBC_RXPOL_MASK; + break; + + case QIB_IB_CFG_LREV_ENB: /* set Auto-Lane-reversal enable */ + lsb = IBA7220_IBC_LREV_SHIFT; + maskr = IBA7220_IBC_LREV_MASK; + break; + + case QIB_IB_CFG_OVERRUN_THRESH: /* IB overrun threshold */ + maskr = SYM_FIELD(ppd->cpspec->ibcctrl, IBCCtrl, + OverrunThreshold); + if (maskr != val) { + ppd->cpspec->ibcctrl &= + ~SYM_MASK(IBCCtrl, OverrunThreshold); + ppd->cpspec->ibcctrl |= (u64) val << + SYM_LSB(IBCCtrl, OverrunThreshold); + qib_write_kreg(dd, kr_ibcctrl, ppd->cpspec->ibcctrl); + qib_write_kreg(dd, kr_scratch, 0); + } + goto bail; + + case QIB_IB_CFG_PHYERR_THRESH: /* IB PHY error threshold */ + maskr = SYM_FIELD(ppd->cpspec->ibcctrl, IBCCtrl, + PhyerrThreshold); + if (maskr != val) { + ppd->cpspec->ibcctrl &= + ~SYM_MASK(IBCCtrl, PhyerrThreshold); + ppd->cpspec->ibcctrl |= (u64) val << + SYM_LSB(IBCCtrl, PhyerrThreshold); + qib_write_kreg(dd, kr_ibcctrl, ppd->cpspec->ibcctrl); + qib_write_kreg(dd, kr_scratch, 0); + } + goto bail; + + case QIB_IB_CFG_PKEYS: /* update pkeys */ + maskr = (u64) ppd->pkeys[0] | ((u64) ppd->pkeys[1] << 16) | + ((u64) ppd->pkeys[2] << 32) | + ((u64) ppd->pkeys[3] << 48); + qib_write_kreg(dd, kr_partitionkey, maskr); + goto bail; + + case QIB_IB_CFG_LINKDEFAULT: /* IB link default (sleep/poll) */ + /* will only take effect when the link state changes */ + if (val == IB_LINKINITCMD_POLL) + ppd->cpspec->ibcctrl &= + ~SYM_MASK(IBCCtrl, LinkDownDefaultState); + else /* SLEEP */ + ppd->cpspec->ibcctrl |= + SYM_MASK(IBCCtrl, LinkDownDefaultState); + qib_write_kreg(dd, kr_ibcctrl, ppd->cpspec->ibcctrl); + qib_write_kreg(dd, kr_scratch, 0); + goto bail; + + case QIB_IB_CFG_MTU: /* update the MTU in IBC */ + /* + * Update our housekeeping variables, and set IBC max + * size, same as init code; max IBC is max we allow in + * buffer, less the qword pbc, plus 1 for ICRC, in dwords + * Set even if it's unchanged, print debug message only + * on changes. + */ + val = (ppd->ibmaxlen >> 2) + 1; + ppd->cpspec->ibcctrl &= ~SYM_MASK(IBCCtrl, MaxPktLen); + ppd->cpspec->ibcctrl |= (u64)val << SYM_LSB(IBCCtrl, MaxPktLen); + qib_write_kreg(dd, kr_ibcctrl, ppd->cpspec->ibcctrl); + qib_write_kreg(dd, kr_scratch, 0); + goto bail; + + case QIB_IB_CFG_LSTATE: /* set the IB link state */ + switch (val & 0xffff0000) { + case IB_LINKCMD_DOWN: + lcmd = QLOGIC_IB_IBCC_LINKCMD_DOWN; + if (!ppd->cpspec->ibdeltainprog && + qib_compat_ddr_negotiate) { + ppd->cpspec->ibdeltainprog = 1; + ppd->cpspec->ibsymsnap = + read_7220_creg32(dd, cr_ibsymbolerr); + ppd->cpspec->iblnkerrsnap = + read_7220_creg32(dd, cr_iblinkerrrecov); + } + break; + + case IB_LINKCMD_ARMED: + lcmd = QLOGIC_IB_IBCC_LINKCMD_ARMED; + break; + + case IB_LINKCMD_ACTIVE: + lcmd = QLOGIC_IB_IBCC_LINKCMD_ACTIVE; + break; + + default: + ret = -EINVAL; + qib_dev_err(dd, "bad linkcmd req 0x%x\n", val >> 16); + goto bail; + } + switch (val & 0xffff) { + case IB_LINKINITCMD_NOP: + licmd = 0; + break; + + case IB_LINKINITCMD_POLL: + licmd = QLOGIC_IB_IBCC_LINKINITCMD_POLL; + break; + + case IB_LINKINITCMD_SLEEP: + licmd = QLOGIC_IB_IBCC_LINKINITCMD_SLEEP; + break; + + case IB_LINKINITCMD_DISABLE: + licmd = QLOGIC_IB_IBCC_LINKINITCMD_DISABLE; + ppd->cpspec->chase_end = 0; + /* + * stop state chase counter and timer, if running. + * wait forpending timer, but don't clear .data (ppd)! + */ + if (ppd->cpspec->chase_timer.expires) { + del_timer_sync(&ppd->cpspec->chase_timer); + ppd->cpspec->chase_timer.expires = 0; + } + break; + + default: + ret = -EINVAL; + qib_dev_err(dd, "bad linkinitcmd req 0x%x\n", + val & 0xffff); + goto bail; + } + qib_set_ib_7220_lstate(ppd, lcmd, licmd); + + maskr = IBA7220_IBC_WIDTH_MASK; + lsb = IBA7220_IBC_WIDTH_SHIFT; + tmp = (ppd->cpspec->ibcddrctrl >> lsb) & maskr; + /* If the width active on the chip does not match the + * width in the shadow register, write the new active + * width to the chip. + * We don't have to worry about speed as the speed is taken + * care of by set_7220_ibspeed_fast called by ib_updown. + */ + if (ppd->link_width_enabled-1 != tmp) { + ppd->cpspec->ibcddrctrl &= ~(maskr << lsb); + ppd->cpspec->ibcddrctrl |= + (((u64)(ppd->link_width_enabled-1) & maskr) << + lsb); + qib_write_kreg(dd, kr_ibcddrctrl, + ppd->cpspec->ibcddrctrl); + qib_write_kreg(dd, kr_scratch, 0); + spin_lock_irqsave(&ppd->lflags_lock, flags); + ppd->lflags |= QIBL_IB_FORCE_NOTIFY; + spin_unlock_irqrestore(&ppd->lflags_lock, flags); + } + goto bail; + + case QIB_IB_CFG_HRTBT: /* set Heartbeat off/enable/auto */ + if (val > IBA7220_IBC_HRTBT_MASK) { + ret = -EINVAL; + goto bail; + } + lsb = IBA7220_IBC_HRTBT_SHIFT; + maskr = IBA7220_IBC_HRTBT_MASK; + break; + + default: + ret = -EINVAL; + goto bail; + } + ppd->cpspec->ibcddrctrl &= ~(maskr << lsb); + ppd->cpspec->ibcddrctrl |= (((u64) val & maskr) << lsb); + qib_write_kreg(dd, kr_ibcddrctrl, ppd->cpspec->ibcddrctrl); + qib_write_kreg(dd, kr_scratch, 0); + if (setforce) { + spin_lock_irqsave(&ppd->lflags_lock, flags); + ppd->lflags |= QIBL_IB_FORCE_NOTIFY; + spin_unlock_irqrestore(&ppd->lflags_lock, flags); + } +bail: + return ret; +} + +static int qib_7220_set_loopback(struct qib_pportdata *ppd, const char *what) +{ + int ret = 0; + u64 val, ddr; + + if (!strncmp(what, "ibc", 3)) { + ppd->cpspec->ibcctrl |= SYM_MASK(IBCCtrl, Loopback); + val = 0; /* disable heart beat, so link will come up */ + qib_devinfo(ppd->dd->pcidev, "Enabling IB%u:%u IBC loopback\n", + ppd->dd->unit, ppd->port); + } else if (!strncmp(what, "off", 3)) { + ppd->cpspec->ibcctrl &= ~SYM_MASK(IBCCtrl, Loopback); + /* enable heart beat again */ + val = IBA7220_IBC_HRTBT_MASK << IBA7220_IBC_HRTBT_SHIFT; + qib_devinfo(ppd->dd->pcidev, + "Disabling IB%u:%u IBC loopback (normal)\n", + ppd->dd->unit, ppd->port); + } else + ret = -EINVAL; + if (!ret) { + qib_write_kreg(ppd->dd, kr_ibcctrl, ppd->cpspec->ibcctrl); + ddr = ppd->cpspec->ibcddrctrl & ~(IBA7220_IBC_HRTBT_MASK + << IBA7220_IBC_HRTBT_SHIFT); + ppd->cpspec->ibcddrctrl = ddr | val; + qib_write_kreg(ppd->dd, kr_ibcddrctrl, + ppd->cpspec->ibcddrctrl); + qib_write_kreg(ppd->dd, kr_scratch, 0); + } + return ret; +} + +static void qib_update_7220_usrhead(struct qib_ctxtdata *rcd, u64 hd, + u32 updegr, u32 egrhd, u32 npkts) +{ + if (updegr) + qib_write_ureg(rcd->dd, ur_rcvegrindexhead, egrhd, rcd->ctxt); + mmiowb(); + qib_write_ureg(rcd->dd, ur_rcvhdrhead, hd, rcd->ctxt); + mmiowb(); +} + +static u32 qib_7220_hdrqempty(struct qib_ctxtdata *rcd) +{ + u32 head, tail; + + head = qib_read_ureg32(rcd->dd, ur_rcvhdrhead, rcd->ctxt); + if (rcd->rcvhdrtail_kvaddr) + tail = qib_get_rcvhdrtail(rcd); + else + tail = qib_read_ureg32(rcd->dd, ur_rcvhdrtail, rcd->ctxt); + return head == tail; +} + +/* + * Modify the RCVCTRL register in chip-specific way. This + * is a function because bit positions and (future) register + * location is chip-specifc, but the needed operations are + * generic. is a bit-mask because we often want to + * do multiple modifications. + */ +static void rcvctrl_7220_mod(struct qib_pportdata *ppd, unsigned int op, + int ctxt) +{ + struct qib_devdata *dd = ppd->dd; + u64 mask, val; + unsigned long flags; + + spin_lock_irqsave(&dd->cspec->rcvmod_lock, flags); + if (op & QIB_RCVCTRL_TAILUPD_ENB) + dd->rcvctrl |= (1ULL << IBA7220_R_TAILUPD_SHIFT); + if (op & QIB_RCVCTRL_TAILUPD_DIS) + dd->rcvctrl &= ~(1ULL << IBA7220_R_TAILUPD_SHIFT); + if (op & QIB_RCVCTRL_PKEY_ENB) + dd->rcvctrl &= ~(1ULL << IBA7220_R_PKEY_DIS_SHIFT); + if (op & QIB_RCVCTRL_PKEY_DIS) + dd->rcvctrl |= (1ULL << IBA7220_R_PKEY_DIS_SHIFT); + if (ctxt < 0) + mask = (1ULL << dd->ctxtcnt) - 1; + else + mask = (1ULL << ctxt); + if (op & QIB_RCVCTRL_CTXT_ENB) { + /* always done for specific ctxt */ + dd->rcvctrl |= (mask << SYM_LSB(RcvCtrl, PortEnable)); + if (!(dd->flags & QIB_NODMA_RTAIL)) + dd->rcvctrl |= 1ULL << IBA7220_R_TAILUPD_SHIFT; + /* Write these registers before the context is enabled. */ + qib_write_kreg_ctxt(dd, kr_rcvhdrtailaddr, ctxt, + dd->rcd[ctxt]->rcvhdrqtailaddr_phys); + qib_write_kreg_ctxt(dd, kr_rcvhdraddr, ctxt, + dd->rcd[ctxt]->rcvhdrq_phys); + dd->rcd[ctxt]->seq_cnt = 1; + } + if (op & QIB_RCVCTRL_CTXT_DIS) + dd->rcvctrl &= ~(mask << SYM_LSB(RcvCtrl, PortEnable)); + if (op & QIB_RCVCTRL_INTRAVAIL_ENB) + dd->rcvctrl |= (mask << IBA7220_R_INTRAVAIL_SHIFT); + if (op & QIB_RCVCTRL_INTRAVAIL_DIS) + dd->rcvctrl &= ~(mask << IBA7220_R_INTRAVAIL_SHIFT); + qib_write_kreg(dd, kr_rcvctrl, dd->rcvctrl); + if ((op & QIB_RCVCTRL_INTRAVAIL_ENB) && dd->rhdrhead_intr_off) { + /* arm rcv interrupt */ + val = qib_read_ureg32(dd, ur_rcvhdrhead, ctxt) | + dd->rhdrhead_intr_off; + qib_write_ureg(dd, ur_rcvhdrhead, val, ctxt); + } + if (op & QIB_RCVCTRL_CTXT_ENB) { + /* + * Init the context registers also; if we were + * disabled, tail and head should both be zero + * already from the enable, but since we don't + * know, we have to do it explicitly. + */ + val = qib_read_ureg32(dd, ur_rcvegrindextail, ctxt); + qib_write_ureg(dd, ur_rcvegrindexhead, val, ctxt); + + val = qib_read_ureg32(dd, ur_rcvhdrtail, ctxt); + dd->rcd[ctxt]->head = val; + /* If kctxt, interrupt on next receive. */ + if (ctxt < dd->first_user_ctxt) + val |= dd->rhdrhead_intr_off; + qib_write_ureg(dd, ur_rcvhdrhead, val, ctxt); + } + if (op & QIB_RCVCTRL_CTXT_DIS) { + if (ctxt >= 0) { + qib_write_kreg_ctxt(dd, kr_rcvhdrtailaddr, ctxt, 0); + qib_write_kreg_ctxt(dd, kr_rcvhdraddr, ctxt, 0); + } else { + unsigned i; + + for (i = 0; i < dd->cfgctxts; i++) { + qib_write_kreg_ctxt(dd, kr_rcvhdrtailaddr, + i, 0); + qib_write_kreg_ctxt(dd, kr_rcvhdraddr, i, 0); + } + } + } + spin_unlock_irqrestore(&dd->cspec->rcvmod_lock, flags); +} + +/* + * Modify the SENDCTRL register in chip-specific way. This + * is a function there may be multiple such registers with + * slightly different layouts. To start, we assume the + * "canonical" register layout of the first chips. + * Chip requires no back-back sendctrl writes, so write + * scratch register after writing sendctrl + */ +static void sendctrl_7220_mod(struct qib_pportdata *ppd, u32 op) +{ + struct qib_devdata *dd = ppd->dd; + u64 tmp_dd_sendctrl; + unsigned long flags; + + spin_lock_irqsave(&dd->sendctrl_lock, flags); + + /* First the ones that are "sticky", saved in shadow */ + if (op & QIB_SENDCTRL_CLEAR) + dd->sendctrl = 0; + if (op & QIB_SENDCTRL_SEND_DIS) + dd->sendctrl &= ~SYM_MASK(SendCtrl, SPioEnable); + else if (op & QIB_SENDCTRL_SEND_ENB) { + dd->sendctrl |= SYM_MASK(SendCtrl, SPioEnable); + if (dd->flags & QIB_USE_SPCL_TRIG) + dd->sendctrl |= SYM_MASK(SendCtrl, + SSpecialTriggerEn); + } + if (op & QIB_SENDCTRL_AVAIL_DIS) + dd->sendctrl &= ~SYM_MASK(SendCtrl, SendBufAvailUpd); + else if (op & QIB_SENDCTRL_AVAIL_ENB) + dd->sendctrl |= SYM_MASK(SendCtrl, SendBufAvailUpd); + + if (op & QIB_SENDCTRL_DISARM_ALL) { + u32 i, last; + + tmp_dd_sendctrl = dd->sendctrl; + /* + * disarm any that are not yet launched, disabling sends + * and updates until done. + */ + last = dd->piobcnt2k + dd->piobcnt4k; + tmp_dd_sendctrl &= + ~(SYM_MASK(SendCtrl, SPioEnable) | + SYM_MASK(SendCtrl, SendBufAvailUpd)); + for (i = 0; i < last; i++) { + qib_write_kreg(dd, kr_sendctrl, + tmp_dd_sendctrl | + SYM_MASK(SendCtrl, Disarm) | i); + qib_write_kreg(dd, kr_scratch, 0); + } + } + + tmp_dd_sendctrl = dd->sendctrl; + + if (op & QIB_SENDCTRL_FLUSH) + tmp_dd_sendctrl |= SYM_MASK(SendCtrl, Abort); + if (op & QIB_SENDCTRL_DISARM) + tmp_dd_sendctrl |= SYM_MASK(SendCtrl, Disarm) | + ((op & QIB_7220_SendCtrl_DisarmPIOBuf_RMASK) << + SYM_LSB(SendCtrl, DisarmPIOBuf)); + if ((op & QIB_SENDCTRL_AVAIL_BLIP) && + (dd->sendctrl & SYM_MASK(SendCtrl, SendBufAvailUpd))) + tmp_dd_sendctrl &= ~SYM_MASK(SendCtrl, SendBufAvailUpd); + + qib_write_kreg(dd, kr_sendctrl, tmp_dd_sendctrl); + qib_write_kreg(dd, kr_scratch, 0); + + if (op & QIB_SENDCTRL_AVAIL_BLIP) { + qib_write_kreg(dd, kr_sendctrl, dd->sendctrl); + qib_write_kreg(dd, kr_scratch, 0); + } + + spin_unlock_irqrestore(&dd->sendctrl_lock, flags); + + if (op & QIB_SENDCTRL_FLUSH) { + u32 v; + /* + * ensure writes have hit chip, then do a few + * more reads, to allow DMA of pioavail registers + * to occur, so in-memory copy is in sync with + * the chip. Not always safe to sleep. + */ + v = qib_read_kreg32(dd, kr_scratch); + qib_write_kreg(dd, kr_scratch, v); + v = qib_read_kreg32(dd, kr_scratch); + qib_write_kreg(dd, kr_scratch, v); + qib_read_kreg32(dd, kr_scratch); + } +} + +/** + * qib_portcntr_7220 - read a per-port counter + * @dd: the qlogic_ib device + * @creg: the counter to snapshot + */ +static u64 qib_portcntr_7220(struct qib_pportdata *ppd, u32 reg) +{ + u64 ret = 0ULL; + struct qib_devdata *dd = ppd->dd; + u16 creg; + /* 0xffff for unimplemented or synthesized counters */ + static const u16 xlator[] = { + [QIBPORTCNTR_PKTSEND] = cr_pktsend, + [QIBPORTCNTR_WORDSEND] = cr_wordsend, + [QIBPORTCNTR_PSXMITDATA] = cr_psxmitdatacount, + [QIBPORTCNTR_PSXMITPKTS] = cr_psxmitpktscount, + [QIBPORTCNTR_PSXMITWAIT] = cr_psxmitwaitcount, + [QIBPORTCNTR_SENDSTALL] = cr_sendstall, + [QIBPORTCNTR_PKTRCV] = cr_pktrcv, + [QIBPORTCNTR_PSRCVDATA] = cr_psrcvdatacount, + [QIBPORTCNTR_PSRCVPKTS] = cr_psrcvpktscount, + [QIBPORTCNTR_RCVEBP] = cr_rcvebp, + [QIBPORTCNTR_RCVOVFL] = cr_rcvovfl, + [QIBPORTCNTR_WORDRCV] = cr_wordrcv, + [QIBPORTCNTR_RXDROPPKT] = cr_rxdroppkt, + [QIBPORTCNTR_RXLOCALPHYERR] = cr_rxotherlocalphyerr, + [QIBPORTCNTR_RXVLERR] = cr_rxvlerr, + [QIBPORTCNTR_ERRICRC] = cr_erricrc, + [QIBPORTCNTR_ERRVCRC] = cr_errvcrc, + [QIBPORTCNTR_ERRLPCRC] = cr_errlpcrc, + [QIBPORTCNTR_BADFORMAT] = cr_badformat, + [QIBPORTCNTR_ERR_RLEN] = cr_err_rlen, + [QIBPORTCNTR_IBSYMBOLERR] = cr_ibsymbolerr, + [QIBPORTCNTR_INVALIDRLEN] = cr_invalidrlen, + [QIBPORTCNTR_UNSUPVL] = cr_txunsupvl, + [QIBPORTCNTR_EXCESSBUFOVFL] = cr_excessbufferovfl, + [QIBPORTCNTR_ERRLINK] = cr_errlink, + [QIBPORTCNTR_IBLINKDOWN] = cr_iblinkdown, + [QIBPORTCNTR_IBLINKERRRECOV] = cr_iblinkerrrecov, + [QIBPORTCNTR_LLI] = cr_locallinkintegrityerr, + [QIBPORTCNTR_PSINTERVAL] = cr_psinterval, + [QIBPORTCNTR_PSSTART] = cr_psstart, + [QIBPORTCNTR_PSSTAT] = cr_psstat, + [QIBPORTCNTR_VL15PKTDROP] = cr_vl15droppedpkt, + [QIBPORTCNTR_ERRPKEY] = cr_errpkey, + [QIBPORTCNTR_KHDROVFL] = 0xffff, + }; + + if (reg >= ARRAY_SIZE(xlator)) { + qib_devinfo(ppd->dd->pcidev, + "Unimplemented portcounter %u\n", reg); + goto done; + } + creg = xlator[reg]; + + if (reg == QIBPORTCNTR_KHDROVFL) { + int i; + + /* sum over all kernel contexts */ + for (i = 0; i < dd->first_user_ctxt; i++) + ret += read_7220_creg32(dd, cr_portovfl + i); + } + if (creg == 0xffff) + goto done; + + /* + * only fast incrementing counters are 64bit; use 32 bit reads to + * avoid two independent reads when on opteron + */ + if ((creg == cr_wordsend || creg == cr_wordrcv || + creg == cr_pktsend || creg == cr_pktrcv)) + ret = read_7220_creg(dd, creg); + else + ret = read_7220_creg32(dd, creg); + if (creg == cr_ibsymbolerr) { + if (dd->pport->cpspec->ibdeltainprog) + ret -= ret - ppd->cpspec->ibsymsnap; + ret -= dd->pport->cpspec->ibsymdelta; + } else if (creg == cr_iblinkerrrecov) { + if (dd->pport->cpspec->ibdeltainprog) + ret -= ret - ppd->cpspec->iblnkerrsnap; + ret -= dd->pport->cpspec->iblnkerrdelta; + } +done: + return ret; +} + +/* + * Device counter names (not port-specific), one line per stat, + * single string. Used by utilities like ipathstats to print the stats + * in a way which works for different versions of drivers, without changing + * the utility. Names need to be 12 chars or less (w/o newline), for proper + * display by utility. + * Non-error counters are first. + * Start of "error" conters is indicated by a leading "E " on the first + * "error" counter, and doesn't count in label length. + * The EgrOvfl list needs to be last so we truncate them at the configured + * context count for the device. + * cntr7220indices contains the corresponding register indices. + */ +static const char cntr7220names[] = + "Interrupts\n" + "HostBusStall\n" + "E RxTIDFull\n" + "RxTIDInvalid\n" + "Ctxt0EgrOvfl\n" + "Ctxt1EgrOvfl\n" + "Ctxt2EgrOvfl\n" + "Ctxt3EgrOvfl\n" + "Ctxt4EgrOvfl\n" + "Ctxt5EgrOvfl\n" + "Ctxt6EgrOvfl\n" + "Ctxt7EgrOvfl\n" + "Ctxt8EgrOvfl\n" + "Ctxt9EgrOvfl\n" + "Ctx10EgrOvfl\n" + "Ctx11EgrOvfl\n" + "Ctx12EgrOvfl\n" + "Ctx13EgrOvfl\n" + "Ctx14EgrOvfl\n" + "Ctx15EgrOvfl\n" + "Ctx16EgrOvfl\n"; + +static const size_t cntr7220indices[] = { + cr_lbint, + cr_lbflowstall, + cr_errtidfull, + cr_errtidvalid, + cr_portovfl + 0, + cr_portovfl + 1, + cr_portovfl + 2, + cr_portovfl + 3, + cr_portovfl + 4, + cr_portovfl + 5, + cr_portovfl + 6, + cr_portovfl + 7, + cr_portovfl + 8, + cr_portovfl + 9, + cr_portovfl + 10, + cr_portovfl + 11, + cr_portovfl + 12, + cr_portovfl + 13, + cr_portovfl + 14, + cr_portovfl + 15, + cr_portovfl + 16, +}; + +/* + * same as cntr7220names and cntr7220indices, but for port-specific counters. + * portcntr7220indices is somewhat complicated by some registers needing + * adjustments of various kinds, and those are ORed with _PORT_VIRT_FLAG + */ +static const char portcntr7220names[] = + "TxPkt\n" + "TxFlowPkt\n" + "TxWords\n" + "RxPkt\n" + "RxFlowPkt\n" + "RxWords\n" + "TxFlowStall\n" + "TxDmaDesc\n" /* 7220 and 7322-only */ + "E RxDlidFltr\n" /* 7220 and 7322-only */ + "IBStatusChng\n" + "IBLinkDown\n" + "IBLnkRecov\n" + "IBRxLinkErr\n" + "IBSymbolErr\n" + "RxLLIErr\n" + "RxBadFormat\n" + "RxBadLen\n" + "RxBufOvrfl\n" + "RxEBP\n" + "RxFlowCtlErr\n" + "RxICRCerr\n" + "RxLPCRCerr\n" + "RxVCRCerr\n" + "RxInvalLen\n" + "RxInvalPKey\n" + "RxPktDropped\n" + "TxBadLength\n" + "TxDropped\n" + "TxInvalLen\n" + "TxUnderrun\n" + "TxUnsupVL\n" + "RxLclPhyErr\n" /* 7220 and 7322-only */ + "RxVL15Drop\n" /* 7220 and 7322-only */ + "RxVlErr\n" /* 7220 and 7322-only */ + "XcessBufOvfl\n" /* 7220 and 7322-only */ + ; + +#define _PORT_VIRT_FLAG 0x8000 /* "virtual", need adjustments */ +static const size_t portcntr7220indices[] = { + QIBPORTCNTR_PKTSEND | _PORT_VIRT_FLAG, + cr_pktsendflow, + QIBPORTCNTR_WORDSEND | _PORT_VIRT_FLAG, + QIBPORTCNTR_PKTRCV | _PORT_VIRT_FLAG, + cr_pktrcvflowctrl, + QIBPORTCNTR_WORDRCV | _PORT_VIRT_FLAG, + QIBPORTCNTR_SENDSTALL | _PORT_VIRT_FLAG, + cr_txsdmadesc, + cr_rxdlidfltr, + cr_ibstatuschange, + QIBPORTCNTR_IBLINKDOWN | _PORT_VIRT_FLAG, + QIBPORTCNTR_IBLINKERRRECOV | _PORT_VIRT_FLAG, + QIBPORTCNTR_ERRLINK | _PORT_VIRT_FLAG, + QIBPORTCNTR_IBSYMBOLERR | _PORT_VIRT_FLAG, + QIBPORTCNTR_LLI | _PORT_VIRT_FLAG, + QIBPORTCNTR_BADFORMAT | _PORT_VIRT_FLAG, + QIBPORTCNTR_ERR_RLEN | _PORT_VIRT_FLAG, + QIBPORTCNTR_RCVOVFL | _PORT_VIRT_FLAG, + QIBPORTCNTR_RCVEBP | _PORT_VIRT_FLAG, + cr_rcvflowctrl_err, + QIBPORTCNTR_ERRICRC | _PORT_VIRT_FLAG, + QIBPORTCNTR_ERRLPCRC | _PORT_VIRT_FLAG, + QIBPORTCNTR_ERRVCRC | _PORT_VIRT_FLAG, + QIBPORTCNTR_INVALIDRLEN | _PORT_VIRT_FLAG, + QIBPORTCNTR_ERRPKEY | _PORT_VIRT_FLAG, + QIBPORTCNTR_RXDROPPKT | _PORT_VIRT_FLAG, + cr_invalidslen, + cr_senddropped, + cr_errslen, + cr_sendunderrun, + cr_txunsupvl, + QIBPORTCNTR_RXLOCALPHYERR | _PORT_VIRT_FLAG, + QIBPORTCNTR_VL15PKTDROP | _PORT_VIRT_FLAG, + QIBPORTCNTR_RXVLERR | _PORT_VIRT_FLAG, + QIBPORTCNTR_EXCESSBUFOVFL | _PORT_VIRT_FLAG, +}; + +/* do all the setup to make the counter reads efficient later */ +static void init_7220_cntrnames(struct qib_devdata *dd) +{ + int i, j = 0; + char *s; + + for (i = 0, s = (char *)cntr7220names; s && j <= dd->cfgctxts; + i++) { + /* we always have at least one counter before the egrovfl */ + if (!j && !strncmp("Ctxt0EgrOvfl", s + 1, 12)) + j = 1; + s = strchr(s + 1, '\n'); + if (s && j) + j++; + } + dd->cspec->ncntrs = i; + if (!s) + /* full list; size is without terminating null */ + dd->cspec->cntrnamelen = sizeof(cntr7220names) - 1; + else + dd->cspec->cntrnamelen = 1 + s - cntr7220names; + dd->cspec->cntrs = kmalloc(dd->cspec->ncntrs + * sizeof(u64), GFP_KERNEL); + if (!dd->cspec->cntrs) + qib_dev_err(dd, "Failed allocation for counters\n"); + + for (i = 0, s = (char *)portcntr7220names; s; i++) + s = strchr(s + 1, '\n'); + dd->cspec->nportcntrs = i - 1; + dd->cspec->portcntrnamelen = sizeof(portcntr7220names) - 1; + dd->cspec->portcntrs = kmalloc(dd->cspec->nportcntrs + * sizeof(u64), GFP_KERNEL); + if (!dd->cspec->portcntrs) + qib_dev_err(dd, "Failed allocation for portcounters\n"); +} + +static u32 qib_read_7220cntrs(struct qib_devdata *dd, loff_t pos, char **namep, + u64 **cntrp) +{ + u32 ret; + + if (!dd->cspec->cntrs) { + ret = 0; + goto done; + } + + if (namep) { + *namep = (char *)cntr7220names; + ret = dd->cspec->cntrnamelen; + if (pos >= ret) + ret = 0; /* final read after getting everything */ + } else { + u64 *cntr = dd->cspec->cntrs; + int i; + + ret = dd->cspec->ncntrs * sizeof(u64); + if (!cntr || pos >= ret) { + /* everything read, or couldn't get memory */ + ret = 0; + goto done; + } + + *cntrp = cntr; + for (i = 0; i < dd->cspec->ncntrs; i++) + *cntr++ = read_7220_creg32(dd, cntr7220indices[i]); + } +done: + return ret; +} + +static u32 qib_read_7220portcntrs(struct qib_devdata *dd, loff_t pos, u32 port, + char **namep, u64 **cntrp) +{ + u32 ret; + + if (!dd->cspec->portcntrs) { + ret = 0; + goto done; + } + if (namep) { + *namep = (char *)portcntr7220names; + ret = dd->cspec->portcntrnamelen; + if (pos >= ret) + ret = 0; /* final read after getting everything */ + } else { + u64 *cntr = dd->cspec->portcntrs; + struct qib_pportdata *ppd = &dd->pport[port]; + int i; + + ret = dd->cspec->nportcntrs * sizeof(u64); + if (!cntr || pos >= ret) { + /* everything read, or couldn't get memory */ + ret = 0; + goto done; + } + *cntrp = cntr; + for (i = 0; i < dd->cspec->nportcntrs; i++) { + if (portcntr7220indices[i] & _PORT_VIRT_FLAG) + *cntr++ = qib_portcntr_7220(ppd, + portcntr7220indices[i] & + ~_PORT_VIRT_FLAG); + else + *cntr++ = read_7220_creg32(dd, + portcntr7220indices[i]); + } + } +done: + return ret; +} + +/** + * qib_get_7220_faststats - get word counters from chip before they overflow + * @opaque - contains a pointer to the qlogic_ib device qib_devdata + * + * This needs more work; in particular, decision on whether we really + * need traffic_wds done the way it is + * called from add_timer + */ +static void qib_get_7220_faststats(unsigned long opaque) +{ + struct qib_devdata *dd = (struct qib_devdata *) opaque; + struct qib_pportdata *ppd = dd->pport; + unsigned long flags; + u64 traffic_wds; + + /* + * don't access the chip while running diags, or memory diags can + * fail + */ + if (!(dd->flags & QIB_INITTED) || dd->diag_client) + /* but re-arm the timer, for diags case; won't hurt other */ + goto done; + + /* + * We now try to maintain an activity timer, based on traffic + * exceeding a threshold, so we need to check the word-counts + * even if they are 64-bit. + */ + traffic_wds = qib_portcntr_7220(ppd, cr_wordsend) + + qib_portcntr_7220(ppd, cr_wordrcv); + spin_lock_irqsave(&dd->eep_st_lock, flags); + traffic_wds -= dd->traffic_wds; + dd->traffic_wds += traffic_wds; + if (traffic_wds >= QIB_TRAFFIC_ACTIVE_THRESHOLD) + atomic_add(5, &dd->active_time); /* S/B #define */ + spin_unlock_irqrestore(&dd->eep_st_lock, flags); +done: + mod_timer(&dd->stats_timer, jiffies + HZ * ACTIVITY_TIMER); +} + +/* + * If we are using MSI, try to fallback to INTx. + */ +static int qib_7220_intr_fallback(struct qib_devdata *dd) +{ + if (!dd->msi_lo) + return 0; + + qib_devinfo(dd->pcidev, + "MSI interrupt not detected, trying INTx interrupts\n"); + qib_7220_free_irq(dd); + qib_enable_intx(dd->pcidev); + /* + * Some newer kernels require free_irq before disable_msi, + * and irq can be changed during disable and INTx enable + * and we need to therefore use the pcidev->irq value, + * not our saved MSI value. + */ + dd->cspec->irq = dd->pcidev->irq; + qib_setup_7220_interrupt(dd); + return 1; +} + +/* + * Reset the XGXS (between serdes and IBC). Slightly less intrusive + * than resetting the IBC or external link state, and useful in some + * cases to cause some retraining. To do this right, we reset IBC + * as well. + */ +static void qib_7220_xgxs_reset(struct qib_pportdata *ppd) +{ + u64 val, prev_val; + struct qib_devdata *dd = ppd->dd; + + prev_val = qib_read_kreg64(dd, kr_xgxs_cfg); + val = prev_val | QLOGIC_IB_XGXS_RESET; + prev_val &= ~QLOGIC_IB_XGXS_RESET; /* be sure */ + qib_write_kreg(dd, kr_control, + dd->control & ~QLOGIC_IB_C_LINKENABLE); + qib_write_kreg(dd, kr_xgxs_cfg, val); + qib_read_kreg32(dd, kr_scratch); + qib_write_kreg(dd, kr_xgxs_cfg, prev_val); + qib_write_kreg(dd, kr_control, dd->control); +} + +/* + * For this chip, we want to use the same buffer every time + * when we are trying to bring the link up (they are always VL15 + * packets). At that link state the packet should always go out immediately + * (or at least be discarded at the tx interface if the link is down). + * If it doesn't, and the buffer isn't available, that means some other + * sender has gotten ahead of us, and is preventing our packet from going + * out. In that case, we flush all packets, and try again. If that still + * fails, we fail the request, and hope things work the next time around. + * + * We don't need very complicated heuristics on whether the packet had + * time to go out or not, since even at SDR 1X, it goes out in very short + * time periods, covered by the chip reads done here and as part of the + * flush. + */ +static u32 __iomem *get_7220_link_buf(struct qib_pportdata *ppd, u32 *bnum) +{ + u32 __iomem *buf; + u32 lbuf = ppd->dd->cspec->lastbuf_for_pio; + int do_cleanup; + unsigned long flags; + + /* + * always blip to get avail list updated, since it's almost + * always needed, and is fairly cheap. + */ + sendctrl_7220_mod(ppd->dd->pport, QIB_SENDCTRL_AVAIL_BLIP); + qib_read_kreg64(ppd->dd, kr_scratch); /* extra chip flush */ + buf = qib_getsendbuf_range(ppd->dd, bnum, lbuf, lbuf); + if (buf) + goto done; + + spin_lock_irqsave(&ppd->sdma_lock, flags); + if (ppd->sdma_state.current_state == qib_sdma_state_s20_idle && + ppd->sdma_state.current_state != qib_sdma_state_s00_hw_down) { + __qib_sdma_process_event(ppd, qib_sdma_event_e00_go_hw_down); + do_cleanup = 0; + } else { + do_cleanup = 1; + qib_7220_sdma_hw_clean_up(ppd); + } + spin_unlock_irqrestore(&ppd->sdma_lock, flags); + + if (do_cleanup) { + qib_read_kreg64(ppd->dd, kr_scratch); /* extra chip flush */ + buf = qib_getsendbuf_range(ppd->dd, bnum, lbuf, lbuf); + } +done: + return buf; +} + +/* + * This code for non-IBTA-compliant IB speed negotiation is only known to + * work for the SDR to DDR transition, and only between an HCA and a switch + * with recent firmware. It is based on observed heuristics, rather than + * actual knowledge of the non-compliant speed negotiation. + * It has a number of hard-coded fields, since the hope is to rewrite this + * when a spec is available on how the negoation is intended to work. + */ +static void autoneg_7220_sendpkt(struct qib_pportdata *ppd, u32 *hdr, + u32 dcnt, u32 *data) +{ + int i; + u64 pbc; + u32 __iomem *piobuf; + u32 pnum; + struct qib_devdata *dd = ppd->dd; + + i = 0; + pbc = 7 + dcnt + 1; /* 7 dword header, dword data, icrc */ + pbc |= PBC_7220_VL15_SEND; + while (!(piobuf = get_7220_link_buf(ppd, &pnum))) { + if (i++ > 5) + return; + udelay(2); + } + sendctrl_7220_mod(dd->pport, QIB_SENDCTRL_DISARM_BUF(pnum)); + writeq(pbc, piobuf); + qib_flush_wc(); + qib_pio_copy(piobuf + 2, hdr, 7); + qib_pio_copy(piobuf + 9, data, dcnt); + if (dd->flags & QIB_USE_SPCL_TRIG) { + u32 spcl_off = (pnum >= dd->piobcnt2k) ? 2047 : 1023; + + qib_flush_wc(); + __raw_writel(0xaebecede, piobuf + spcl_off); + } + qib_flush_wc(); + qib_sendbuf_done(dd, pnum); +} + +/* + * _start packet gets sent twice at start, _done gets sent twice at end + */ +static void autoneg_7220_send(struct qib_pportdata *ppd, int which) +{ + struct qib_devdata *dd = ppd->dd; + static u32 swapped; + u32 dw, i, hcnt, dcnt, *data; + static u32 hdr[7] = { 0xf002ffff, 0x48ffff, 0x6400abba }; + static u32 madpayload_start[0x40] = { + 0x1810103, 0x1, 0x0, 0x0, 0x2c90000, 0x2c9, 0x0, 0x0, + 0xffffffff, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x1388, 0x15e, 0x1, /* rest 0's */ + }; + static u32 madpayload_done[0x40] = { + 0x1810103, 0x1, 0x0, 0x0, 0x2c90000, 0x2c9, 0x0, 0x0, + 0xffffffff, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x40000001, 0x1388, 0x15e, /* rest 0's */ + }; + + dcnt = ARRAY_SIZE(madpayload_start); + hcnt = ARRAY_SIZE(hdr); + if (!swapped) { + /* for maintainability, do it at runtime */ + for (i = 0; i < hcnt; i++) { + dw = (__force u32) cpu_to_be32(hdr[i]); + hdr[i] = dw; + } + for (i = 0; i < dcnt; i++) { + dw = (__force u32) cpu_to_be32(madpayload_start[i]); + madpayload_start[i] = dw; + dw = (__force u32) cpu_to_be32(madpayload_done[i]); + madpayload_done[i] = dw; + } + swapped = 1; + } + + data = which ? madpayload_done : madpayload_start; + + autoneg_7220_sendpkt(ppd, hdr, dcnt, data); + qib_read_kreg64(dd, kr_scratch); + udelay(2); + autoneg_7220_sendpkt(ppd, hdr, dcnt, data); + qib_read_kreg64(dd, kr_scratch); + udelay(2); +} + +/* + * Do the absolute minimum to cause an IB speed change, and make it + * ready, but don't actually trigger the change. The caller will + * do that when ready (if link is in Polling training state, it will + * happen immediately, otherwise when link next goes down) + * + * This routine should only be used as part of the DDR autonegotation + * code for devices that are not compliant with IB 1.2 (or code that + * fixes things up for same). + * + * When link has gone down, and autoneg enabled, or autoneg has + * failed and we give up until next time we set both speeds, and + * then we want IBTA enabled as well as "use max enabled speed. + */ +static void set_7220_ibspeed_fast(struct qib_pportdata *ppd, u32 speed) +{ + ppd->cpspec->ibcddrctrl &= ~(IBA7220_IBC_SPEED_AUTONEG_MASK | + IBA7220_IBC_IBTA_1_2_MASK); + + if (speed == (QIB_IB_SDR | QIB_IB_DDR)) + ppd->cpspec->ibcddrctrl |= IBA7220_IBC_SPEED_AUTONEG_MASK | + IBA7220_IBC_IBTA_1_2_MASK; + else + ppd->cpspec->ibcddrctrl |= speed == QIB_IB_DDR ? + IBA7220_IBC_SPEED_DDR : IBA7220_IBC_SPEED_SDR; + + qib_write_kreg(ppd->dd, kr_ibcddrctrl, ppd->cpspec->ibcddrctrl); + qib_write_kreg(ppd->dd, kr_scratch, 0); +} + +/* + * This routine is only used when we are not talking to another + * IB 1.2-compliant device that we think can do DDR. + * (This includes all existing switch chips as of Oct 2007.) + * 1.2-compliant devices go directly to DDR prior to reaching INIT + */ +static void try_7220_autoneg(struct qib_pportdata *ppd) +{ + unsigned long flags; + + /* + * Required for older non-IB1.2 DDR switches. Newer + * non-IB-compliant switches don't need it, but so far, + * aren't bothered by it either. "Magic constant" + */ + qib_write_kreg(ppd->dd, kr_ncmodectrl, 0x3b9dc07); + + spin_lock_irqsave(&ppd->lflags_lock, flags); + ppd->lflags |= QIBL_IB_AUTONEG_INPROG; + spin_unlock_irqrestore(&ppd->lflags_lock, flags); + autoneg_7220_send(ppd, 0); + set_7220_ibspeed_fast(ppd, QIB_IB_DDR); + + toggle_7220_rclkrls(ppd->dd); + /* 2 msec is minimum length of a poll cycle */ + queue_delayed_work(ib_wq, &ppd->cpspec->autoneg_work, + msecs_to_jiffies(2)); +} + +/* + * Handle the empirically determined mechanism for auto-negotiation + * of DDR speed with switches. + */ +static void autoneg_7220_work(struct work_struct *work) +{ + struct qib_pportdata *ppd; + struct qib_devdata *dd; + u64 startms; + u32 i; + unsigned long flags; + + ppd = &container_of(work, struct qib_chippport_specific, + autoneg_work.work)->pportdata; + dd = ppd->dd; + + startms = jiffies_to_msecs(jiffies); + + /* + * Busy wait for this first part, it should be at most a + * few hundred usec, since we scheduled ourselves for 2msec. + */ + for (i = 0; i < 25; i++) { + if (SYM_FIELD(ppd->lastibcstat, IBCStatus, LinkTrainingState) + == IB_7220_LT_STATE_POLLQUIET) { + qib_set_linkstate(ppd, QIB_IB_LINKDOWN_DISABLE); + break; + } + udelay(100); + } + + if (!(ppd->lflags & QIBL_IB_AUTONEG_INPROG)) + goto done; /* we got there early or told to stop */ + + /* we expect this to timeout */ + if (wait_event_timeout(ppd->cpspec->autoneg_wait, + !(ppd->lflags & QIBL_IB_AUTONEG_INPROG), + msecs_to_jiffies(90))) + goto done; + + toggle_7220_rclkrls(dd); + + /* we expect this to timeout */ + if (wait_event_timeout(ppd->cpspec->autoneg_wait, + !(ppd->lflags & QIBL_IB_AUTONEG_INPROG), + msecs_to_jiffies(1700))) + goto done; + + set_7220_ibspeed_fast(ppd, QIB_IB_SDR); + toggle_7220_rclkrls(dd); + + /* + * Wait up to 250 msec for link to train and get to INIT at DDR; + * this should terminate early. + */ + wait_event_timeout(ppd->cpspec->autoneg_wait, + !(ppd->lflags & QIBL_IB_AUTONEG_INPROG), + msecs_to_jiffies(250)); +done: + if (ppd->lflags & QIBL_IB_AUTONEG_INPROG) { + spin_lock_irqsave(&ppd->lflags_lock, flags); + ppd->lflags &= ~QIBL_IB_AUTONEG_INPROG; + if (dd->cspec->autoneg_tries == AUTONEG_TRIES) { + ppd->lflags |= QIBL_IB_AUTONEG_FAILED; + dd->cspec->autoneg_tries = 0; + } + spin_unlock_irqrestore(&ppd->lflags_lock, flags); + set_7220_ibspeed_fast(ppd, ppd->link_speed_enabled); + } +} + +static u32 qib_7220_iblink_state(u64 ibcs) +{ + u32 state = (u32)SYM_FIELD(ibcs, IBCStatus, LinkState); + + switch (state) { + case IB_7220_L_STATE_INIT: + state = IB_PORT_INIT; + break; + case IB_7220_L_STATE_ARM: + state = IB_PORT_ARMED; + break; + case IB_7220_L_STATE_ACTIVE: + /* fall through */ + case IB_7220_L_STATE_ACT_DEFER: + state = IB_PORT_ACTIVE; + break; + default: /* fall through */ + case IB_7220_L_STATE_DOWN: + state = IB_PORT_DOWN; + break; + } + return state; +} + +/* returns the IBTA port state, rather than the IBC link training state */ +static u8 qib_7220_phys_portstate(u64 ibcs) +{ + u8 state = (u8)SYM_FIELD(ibcs, IBCStatus, LinkTrainingState); + return qib_7220_physportstate[state]; +} + +static int qib_7220_ib_updown(struct qib_pportdata *ppd, int ibup, u64 ibcs) +{ + int ret = 0, symadj = 0; + struct qib_devdata *dd = ppd->dd; + unsigned long flags; + + spin_lock_irqsave(&ppd->lflags_lock, flags); + ppd->lflags &= ~QIBL_IB_FORCE_NOTIFY; + spin_unlock_irqrestore(&ppd->lflags_lock, flags); + + if (!ibup) { + /* + * When the link goes down we don't want AEQ running, so it + * won't interfere with IBC training, etc., and we need + * to go back to the static SerDes preset values. + */ + if (!(ppd->lflags & (QIBL_IB_AUTONEG_FAILED | + QIBL_IB_AUTONEG_INPROG))) + set_7220_ibspeed_fast(ppd, ppd->link_speed_enabled); + if (!(ppd->lflags & QIBL_IB_AUTONEG_INPROG)) { + qib_sd7220_presets(dd); + qib_cancel_sends(ppd); /* initial disarm, etc. */ + spin_lock_irqsave(&ppd->sdma_lock, flags); + if (__qib_sdma_running(ppd)) + __qib_sdma_process_event(ppd, + qib_sdma_event_e70_go_idle); + spin_unlock_irqrestore(&ppd->sdma_lock, flags); + } + /* this might better in qib_sd7220_presets() */ + set_7220_relock_poll(dd, ibup); + } else { + if (qib_compat_ddr_negotiate && + !(ppd->lflags & (QIBL_IB_AUTONEG_FAILED | + QIBL_IB_AUTONEG_INPROG)) && + ppd->link_speed_active == QIB_IB_SDR && + (ppd->link_speed_enabled & (QIB_IB_DDR | QIB_IB_SDR)) == + (QIB_IB_DDR | QIB_IB_SDR) && + dd->cspec->autoneg_tries < AUTONEG_TRIES) { + /* we are SDR, and DDR auto-negotiation enabled */ + ++dd->cspec->autoneg_tries; + if (!ppd->cpspec->ibdeltainprog) { + ppd->cpspec->ibdeltainprog = 1; + ppd->cpspec->ibsymsnap = read_7220_creg32(dd, + cr_ibsymbolerr); + ppd->cpspec->iblnkerrsnap = read_7220_creg32(dd, + cr_iblinkerrrecov); + } + try_7220_autoneg(ppd); + ret = 1; /* no other IB status change processing */ + } else if ((ppd->lflags & QIBL_IB_AUTONEG_INPROG) && + ppd->link_speed_active == QIB_IB_SDR) { + autoneg_7220_send(ppd, 1); + set_7220_ibspeed_fast(ppd, QIB_IB_DDR); + udelay(2); + toggle_7220_rclkrls(dd); + ret = 1; /* no other IB status change processing */ + } else { + if ((ppd->lflags & QIBL_IB_AUTONEG_INPROG) && + (ppd->link_speed_active & QIB_IB_DDR)) { + spin_lock_irqsave(&ppd->lflags_lock, flags); + ppd->lflags &= ~(QIBL_IB_AUTONEG_INPROG | + QIBL_IB_AUTONEG_FAILED); + spin_unlock_irqrestore(&ppd->lflags_lock, + flags); + dd->cspec->autoneg_tries = 0; + /* re-enable SDR, for next link down */ + set_7220_ibspeed_fast(ppd, + ppd->link_speed_enabled); + wake_up(&ppd->cpspec->autoneg_wait); + symadj = 1; + } else if (ppd->lflags & QIBL_IB_AUTONEG_FAILED) { + /* + * Clear autoneg failure flag, and do setup + * so we'll try next time link goes down and + * back to INIT (possibly connected to a + * different device). + */ + spin_lock_irqsave(&ppd->lflags_lock, flags); + ppd->lflags &= ~QIBL_IB_AUTONEG_FAILED; + spin_unlock_irqrestore(&ppd->lflags_lock, + flags); + ppd->cpspec->ibcddrctrl |= + IBA7220_IBC_IBTA_1_2_MASK; + qib_write_kreg(dd, kr_ncmodectrl, 0); + symadj = 1; + } + } + + if (!(ppd->lflags & QIBL_IB_AUTONEG_INPROG)) + symadj = 1; + + if (!ret) { + ppd->delay_mult = rate_to_delay + [(ibcs >> IBA7220_LINKSPEED_SHIFT) & 1] + [(ibcs >> IBA7220_LINKWIDTH_SHIFT) & 1]; + + set_7220_relock_poll(dd, ibup); + spin_lock_irqsave(&ppd->sdma_lock, flags); + /* + * Unlike 7322, the 7220 needs this, due to lack of + * interrupt in some cases when we have sdma active + * when the link goes down. + */ + if (ppd->sdma_state.current_state != + qib_sdma_state_s20_idle) + __qib_sdma_process_event(ppd, + qib_sdma_event_e00_go_hw_down); + spin_unlock_irqrestore(&ppd->sdma_lock, flags); + } + } + + if (symadj) { + if (ppd->cpspec->ibdeltainprog) { + ppd->cpspec->ibdeltainprog = 0; + ppd->cpspec->ibsymdelta += read_7220_creg32(ppd->dd, + cr_ibsymbolerr) - ppd->cpspec->ibsymsnap; + ppd->cpspec->iblnkerrdelta += read_7220_creg32(ppd->dd, + cr_iblinkerrrecov) - ppd->cpspec->iblnkerrsnap; + } + } else if (!ibup && qib_compat_ddr_negotiate && + !ppd->cpspec->ibdeltainprog && + !(ppd->lflags & QIBL_IB_AUTONEG_INPROG)) { + ppd->cpspec->ibdeltainprog = 1; + ppd->cpspec->ibsymsnap = read_7220_creg32(ppd->dd, + cr_ibsymbolerr); + ppd->cpspec->iblnkerrsnap = read_7220_creg32(ppd->dd, + cr_iblinkerrrecov); + } + + if (!ret) + qib_setup_7220_setextled(ppd, ibup); + return ret; +} + +/* + * Does read/modify/write to appropriate registers to + * set output and direction bits selected by mask. + * these are in their canonical postions (e.g. lsb of + * dir will end up in D48 of extctrl on existing chips). + * returns contents of GP Inputs. + */ +static int gpio_7220_mod(struct qib_devdata *dd, u32 out, u32 dir, u32 mask) +{ + u64 read_val, new_out; + unsigned long flags; + + if (mask) { + /* some bits being written, lock access to GPIO */ + dir &= mask; + out &= mask; + spin_lock_irqsave(&dd->cspec->gpio_lock, flags); + dd->cspec->extctrl &= ~((u64)mask << SYM_LSB(EXTCtrl, GPIOOe)); + dd->cspec->extctrl |= ((u64) dir << SYM_LSB(EXTCtrl, GPIOOe)); + new_out = (dd->cspec->gpio_out & ~mask) | out; + + qib_write_kreg(dd, kr_extctrl, dd->cspec->extctrl); + qib_write_kreg(dd, kr_gpio_out, new_out); + dd->cspec->gpio_out = new_out; + spin_unlock_irqrestore(&dd->cspec->gpio_lock, flags); + } + /* + * It is unlikely that a read at this time would get valid + * data on a pin whose direction line was set in the same + * call to this function. We include the read here because + * that allows us to potentially combine a change on one pin with + * a read on another, and because the old code did something like + * this. + */ + read_val = qib_read_kreg64(dd, kr_extstatus); + return SYM_FIELD(read_val, EXTStatus, GPIOIn); +} + +/* + * Read fundamental info we need to use the chip. These are + * the registers that describe chip capabilities, and are + * saved in shadow registers. + */ +static void get_7220_chip_params(struct qib_devdata *dd) +{ + u64 val; + u32 piobufs; + int mtu; + + dd->uregbase = qib_read_kreg32(dd, kr_userregbase); + + dd->rcvtidcnt = qib_read_kreg32(dd, kr_rcvtidcnt); + dd->rcvtidbase = qib_read_kreg32(dd, kr_rcvtidbase); + dd->rcvegrbase = qib_read_kreg32(dd, kr_rcvegrbase); + dd->palign = qib_read_kreg32(dd, kr_palign); + dd->piobufbase = qib_read_kreg64(dd, kr_sendpiobufbase); + dd->pio2k_bufbase = dd->piobufbase & 0xffffffff; + + val = qib_read_kreg64(dd, kr_sendpiosize); + dd->piosize2k = val & ~0U; + dd->piosize4k = val >> 32; + + mtu = ib_mtu_enum_to_int(qib_ibmtu); + if (mtu == -1) + mtu = QIB_DEFAULT_MTU; + dd->pport->ibmtu = (u32)mtu; + + val = qib_read_kreg64(dd, kr_sendpiobufcnt); + dd->piobcnt2k = val & ~0U; + dd->piobcnt4k = val >> 32; + /* these may be adjusted in init_chip_wc_pat() */ + dd->pio2kbase = (u32 __iomem *) + ((char __iomem *) dd->kregbase + dd->pio2k_bufbase); + if (dd->piobcnt4k) { + dd->pio4kbase = (u32 __iomem *) + ((char __iomem *) dd->kregbase + + (dd->piobufbase >> 32)); + /* + * 4K buffers take 2 pages; we use roundup just to be + * paranoid; we calculate it once here, rather than on + * ever buf allocate + */ + dd->align4k = ALIGN(dd->piosize4k, dd->palign); + } + + piobufs = dd->piobcnt4k + dd->piobcnt2k; + + dd->pioavregs = ALIGN(piobufs, sizeof(u64) * BITS_PER_BYTE / 2) / + (sizeof(u64) * BITS_PER_BYTE / 2); +} + +/* + * The chip base addresses in cspec and cpspec have to be set + * after possible init_chip_wc_pat(), rather than in + * qib_get_7220_chip_params(), so split out as separate function + */ +static void set_7220_baseaddrs(struct qib_devdata *dd) +{ + u32 cregbase; + /* init after possible re-map in init_chip_wc_pat() */ + cregbase = qib_read_kreg32(dd, kr_counterregbase); + dd->cspec->cregbase = (u64 __iomem *) + ((char __iomem *) dd->kregbase + cregbase); + + dd->egrtidbase = (u64 __iomem *) + ((char __iomem *) dd->kregbase + dd->rcvegrbase); +} + + +#define SENDCTRL_SHADOWED (SYM_MASK(SendCtrl, SendIntBufAvail) | \ + SYM_MASK(SendCtrl, SPioEnable) | \ + SYM_MASK(SendCtrl, SSpecialTriggerEn) | \ + SYM_MASK(SendCtrl, SendBufAvailUpd) | \ + SYM_MASK(SendCtrl, AvailUpdThld) | \ + SYM_MASK(SendCtrl, SDmaEnable) | \ + SYM_MASK(SendCtrl, SDmaIntEnable) | \ + SYM_MASK(SendCtrl, SDmaHalt) | \ + SYM_MASK(SendCtrl, SDmaSingleDescriptor)) + +static int sendctrl_hook(struct qib_devdata *dd, + const struct diag_observer *op, + u32 offs, u64 *data, u64 mask, int only_32) +{ + unsigned long flags; + unsigned idx = offs / sizeof(u64); + u64 local_data, all_bits; + + if (idx != kr_sendctrl) { + qib_dev_err(dd, "SendCtrl Hook called with offs %X, %s-bit\n", + offs, only_32 ? "32" : "64"); + return 0; + } + + all_bits = ~0ULL; + if (only_32) + all_bits >>= 32; + spin_lock_irqsave(&dd->sendctrl_lock, flags); + if ((mask & all_bits) != all_bits) { + /* + * At least some mask bits are zero, so we need + * to read. The judgement call is whether from + * reg or shadow. First-cut: read reg, and complain + * if any bits which should be shadowed are different + * from their shadowed value. + */ + if (only_32) + local_data = (u64)qib_read_kreg32(dd, idx); + else + local_data = qib_read_kreg64(dd, idx); + qib_dev_err(dd, "Sendctrl -> %X, Shad -> %X\n", + (u32)local_data, (u32)dd->sendctrl); + if ((local_data & SENDCTRL_SHADOWED) != + (dd->sendctrl & SENDCTRL_SHADOWED)) + qib_dev_err(dd, "Sendctrl read: %X shadow is %X\n", + (u32)local_data, (u32) dd->sendctrl); + *data = (local_data & ~mask) | (*data & mask); + } + if (mask) { + /* + * At least some mask bits are one, so we need + * to write, but only shadow some bits. + */ + u64 sval, tval; /* Shadowed, transient */ + + /* + * New shadow val is bits we don't want to touch, + * ORed with bits we do, that are intended for shadow. + */ + sval = (dd->sendctrl & ~mask); + sval |= *data & SENDCTRL_SHADOWED & mask; + dd->sendctrl = sval; + tval = sval | (*data & ~SENDCTRL_SHADOWED & mask); + qib_dev_err(dd, "Sendctrl <- %X, Shad <- %X\n", + (u32)tval, (u32)sval); + qib_write_kreg(dd, kr_sendctrl, tval); + qib_write_kreg(dd, kr_scratch, 0Ull); + } + spin_unlock_irqrestore(&dd->sendctrl_lock, flags); + + return only_32 ? 4 : 8; +} + +static const struct diag_observer sendctrl_observer = { + sendctrl_hook, kr_sendctrl * sizeof(u64), + kr_sendctrl * sizeof(u64) +}; + +/* + * write the final few registers that depend on some of the + * init setup. Done late in init, just before bringing up + * the serdes. + */ +static int qib_late_7220_initreg(struct qib_devdata *dd) +{ + int ret = 0; + u64 val; + + qib_write_kreg(dd, kr_rcvhdrentsize, dd->rcvhdrentsize); + qib_write_kreg(dd, kr_rcvhdrsize, dd->rcvhdrsize); + qib_write_kreg(dd, kr_rcvhdrcnt, dd->rcvhdrcnt); + qib_write_kreg(dd, kr_sendpioavailaddr, dd->pioavailregs_phys); + val = qib_read_kreg64(dd, kr_sendpioavailaddr); + if (val != dd->pioavailregs_phys) { + qib_dev_err(dd, + "Catastrophic software error, SendPIOAvailAddr written as %lx, read back as %llx\n", + (unsigned long) dd->pioavailregs_phys, + (unsigned long long) val); + ret = -EINVAL; + } + qib_register_observer(dd, &sendctrl_observer); + return ret; +} + +static int qib_init_7220_variables(struct qib_devdata *dd) +{ + struct qib_chippport_specific *cpspec; + struct qib_pportdata *ppd; + int ret = 0; + u32 sbufs, updthresh; + + cpspec = (struct qib_chippport_specific *)(dd + 1); + ppd = &cpspec->pportdata; + dd->pport = ppd; + dd->num_pports = 1; + + dd->cspec = (struct qib_chip_specific *)(cpspec + dd->num_pports); + ppd->cpspec = cpspec; + + spin_lock_init(&dd->cspec->sdepb_lock); + spin_lock_init(&dd->cspec->rcvmod_lock); + spin_lock_init(&dd->cspec->gpio_lock); + + /* we haven't yet set QIB_PRESENT, so use read directly */ + dd->revision = readq(&dd->kregbase[kr_revision]); + + if ((dd->revision & 0xffffffffU) == 0xffffffffU) { + qib_dev_err(dd, + "Revision register read failure, giving up initialization\n"); + ret = -ENODEV; + goto bail; + } + dd->flags |= QIB_PRESENT; /* now register routines work */ + + dd->majrev = (u8) SYM_FIELD(dd->revision, Revision_R, + ChipRevMajor); + dd->minrev = (u8) SYM_FIELD(dd->revision, Revision_R, + ChipRevMinor); + + get_7220_chip_params(dd); + qib_7220_boardname(dd); + + /* + * GPIO bits for TWSI data and clock, + * used for serial EEPROM. + */ + dd->gpio_sda_num = _QIB_GPIO_SDA_NUM; + dd->gpio_scl_num = _QIB_GPIO_SCL_NUM; + dd->twsi_eeprom_dev = QIB_TWSI_EEPROM_DEV; + + dd->flags |= QIB_HAS_INTX | QIB_HAS_LINK_LATENCY | + QIB_NODMA_RTAIL | QIB_HAS_THRESH_UPDATE; + dd->flags |= qib_special_trigger ? + QIB_USE_SPCL_TRIG : QIB_HAS_SEND_DMA; + + /* + * EEPROM error log 0 is TXE Parity errors. 1 is RXE Parity. + * 2 is Some Misc, 3 is reserved for future. + */ + dd->eep_st_masks[0].hwerrs_to_log = HWE_MASK(TXEMemParityErr); + + dd->eep_st_masks[1].hwerrs_to_log = HWE_MASK(RXEMemParityErr); + + dd->eep_st_masks[2].errs_to_log = ERR_MASK(ResetNegated); + + init_waitqueue_head(&cpspec->autoneg_wait); + INIT_DELAYED_WORK(&cpspec->autoneg_work, autoneg_7220_work); + + ret = qib_init_pportdata(ppd, dd, 0, 1); + if (ret) + goto bail; + ppd->link_width_supported = IB_WIDTH_1X | IB_WIDTH_4X; + ppd->link_speed_supported = QIB_IB_SDR | QIB_IB_DDR; + + ppd->link_width_enabled = ppd->link_width_supported; + ppd->link_speed_enabled = ppd->link_speed_supported; + /* + * Set the initial values to reasonable default, will be set + * for real when link is up. + */ + ppd->link_width_active = IB_WIDTH_4X; + ppd->link_speed_active = QIB_IB_SDR; + ppd->delay_mult = rate_to_delay[0][1]; + ppd->vls_supported = IB_VL_VL0; + ppd->vls_operational = ppd->vls_supported; + + if (!qib_mini_init) + qib_write_kreg(dd, kr_rcvbthqp, QIB_KD_QP); + + init_timer(&ppd->cpspec->chase_timer); + ppd->cpspec->chase_timer.function = reenable_7220_chase; + ppd->cpspec->chase_timer.data = (unsigned long)ppd; + + qib_num_cfg_vls = 1; /* if any 7220's, only one VL */ + + dd->rcvhdrentsize = QIB_RCVHDR_ENTSIZE; + dd->rcvhdrsize = QIB_DFLT_RCVHDRSIZE; + dd->rhf_offset = + dd->rcvhdrentsize - sizeof(u64) / sizeof(u32); + + /* we always allocate at least 2048 bytes for eager buffers */ + ret = ib_mtu_enum_to_int(qib_ibmtu); + dd->rcvegrbufsize = ret != -1 ? max(ret, 2048) : QIB_DEFAULT_MTU; + BUG_ON(!is_power_of_2(dd->rcvegrbufsize)); + dd->rcvegrbufsize_shift = ilog2(dd->rcvegrbufsize); + + qib_7220_tidtemplate(dd); + + /* + * We can request a receive interrupt for 1 or + * more packets from current offset. For now, we set this + * up for a single packet. + */ + dd->rhdrhead_intr_off = 1ULL << 32; + + /* setup the stats timer; the add_timer is done at end of init */ + init_timer(&dd->stats_timer); + dd->stats_timer.function = qib_get_7220_faststats; + dd->stats_timer.data = (unsigned long) dd; + dd->stats_timer.expires = jiffies + ACTIVITY_TIMER * HZ; + + /* + * Control[4] has been added to change the arbitration within + * the SDMA engine between favoring data fetches over descriptor + * fetches. qib_sdma_fetch_arb==0 gives data fetches priority. + */ + if (qib_sdma_fetch_arb) + dd->control |= 1 << 4; + + dd->ureg_align = 0x10000; /* 64KB alignment */ + + dd->piosize2kmax_dwords = (dd->piosize2k >> 2)-1; + qib_7220_config_ctxts(dd); + qib_set_ctxtcnt(dd); /* needed for PAT setup */ + + if (qib_wc_pat) { + ret = init_chip_wc_pat(dd, 0); + if (ret) + goto bail; + } + set_7220_baseaddrs(dd); /* set chip access pointers now */ + + ret = 0; + if (qib_mini_init) + goto bail; + + ret = qib_create_ctxts(dd); + init_7220_cntrnames(dd); + + /* use all of 4KB buffers for the kernel SDMA, zero if !SDMA. + * reserve the update threshold amount for other kernel use, such + * as sending SMI, MAD, and ACKs, or 3, whichever is greater, + * unless we aren't enabling SDMA, in which case we want to use + * all the 4k bufs for the kernel. + * if this was less than the update threshold, we could wait + * a long time for an update. Coded this way because we + * sometimes change the update threshold for various reasons, + * and we want this to remain robust. + */ + updthresh = 8U; /* update threshold */ + if (dd->flags & QIB_HAS_SEND_DMA) { + dd->cspec->sdmabufcnt = dd->piobcnt4k; + sbufs = updthresh > 3 ? updthresh : 3; + } else { + dd->cspec->sdmabufcnt = 0; + sbufs = dd->piobcnt4k; + } + + dd->cspec->lastbuf_for_pio = dd->piobcnt2k + dd->piobcnt4k - + dd->cspec->sdmabufcnt; + dd->lastctxt_piobuf = dd->cspec->lastbuf_for_pio - sbufs; + dd->cspec->lastbuf_for_pio--; /* range is <= , not < */ + dd->last_pio = dd->cspec->lastbuf_for_pio; + dd->pbufsctxt = dd->lastctxt_piobuf / + (dd->cfgctxts - dd->first_user_ctxt); + + /* + * if we are at 16 user contexts, we will have one 7 sbufs + * per context, so drop the update threshold to match. We + * want to update before we actually run out, at low pbufs/ctxt + * so give ourselves some margin + */ + if ((dd->pbufsctxt - 2) < updthresh) + updthresh = dd->pbufsctxt - 2; + + dd->cspec->updthresh_dflt = updthresh; + dd->cspec->updthresh = updthresh; + + /* before full enable, no interrupts, no locking needed */ + dd->sendctrl |= (updthresh & SYM_RMASK(SendCtrl, AvailUpdThld)) + << SYM_LSB(SendCtrl, AvailUpdThld); + + dd->psxmitwait_supported = 1; + dd->psxmitwait_check_rate = QIB_7220_PSXMITWAIT_CHECK_RATE; +bail: + return ret; +} + +static u32 __iomem *qib_7220_getsendbuf(struct qib_pportdata *ppd, u64 pbc, + u32 *pbufnum) +{ + u32 first, last, plen = pbc & QIB_PBC_LENGTH_MASK; + struct qib_devdata *dd = ppd->dd; + u32 __iomem *buf; + + if (((pbc >> 32) & PBC_7220_VL15_SEND_CTRL) && + !(ppd->lflags & (QIBL_IB_AUTONEG_INPROG | QIBL_LINKACTIVE))) + buf = get_7220_link_buf(ppd, pbufnum); + else { + if ((plen + 1) > dd->piosize2kmax_dwords) + first = dd->piobcnt2k; + else + first = 0; + /* try 4k if all 2k busy, so same last for both sizes */ + last = dd->cspec->lastbuf_for_pio; + buf = qib_getsendbuf_range(dd, pbufnum, first, last); + } + return buf; +} + +/* these 2 "counters" are really control registers, and are always RW */ +static void qib_set_cntr_7220_sample(struct qib_pportdata *ppd, u32 intv, + u32 start) +{ + write_7220_creg(ppd->dd, cr_psinterval, intv); + write_7220_creg(ppd->dd, cr_psstart, start); +} + +/* + * NOTE: no real attempt is made to generalize the SDMA stuff. + * At some point "soon" we will have a new more generalized + * set of sdma interface, and then we'll clean this up. + */ + +/* Must be called with sdma_lock held, or before init finished */ +static void qib_sdma_update_7220_tail(struct qib_pportdata *ppd, u16 tail) +{ + /* Commit writes to memory and advance the tail on the chip */ + wmb(); + ppd->sdma_descq_tail = tail; + qib_write_kreg(ppd->dd, kr_senddmatail, tail); +} + +static void qib_sdma_set_7220_desc_cnt(struct qib_pportdata *ppd, unsigned cnt) +{ +} + +static struct sdma_set_state_action sdma_7220_action_table[] = { + [qib_sdma_state_s00_hw_down] = { + .op_enable = 0, + .op_intenable = 0, + .op_halt = 0, + .go_s99_running_tofalse = 1, + }, + [qib_sdma_state_s10_hw_start_up_wait] = { + .op_enable = 1, + .op_intenable = 1, + .op_halt = 1, + }, + [qib_sdma_state_s20_idle] = { + .op_enable = 1, + .op_intenable = 1, + .op_halt = 1, + }, + [qib_sdma_state_s30_sw_clean_up_wait] = { + .op_enable = 0, + .op_intenable = 1, + .op_halt = 0, + }, + [qib_sdma_state_s40_hw_clean_up_wait] = { + .op_enable = 1, + .op_intenable = 1, + .op_halt = 1, + }, + [qib_sdma_state_s50_hw_halt_wait] = { + .op_enable = 1, + .op_intenable = 1, + .op_halt = 1, + }, + [qib_sdma_state_s99_running] = { + .op_enable = 1, + .op_intenable = 1, + .op_halt = 0, + .go_s99_running_totrue = 1, + }, +}; + +static void qib_7220_sdma_init_early(struct qib_pportdata *ppd) +{ + ppd->sdma_state.set_state_action = sdma_7220_action_table; +} + +static int init_sdma_7220_regs(struct qib_pportdata *ppd) +{ + struct qib_devdata *dd = ppd->dd; + unsigned i, n; + u64 senddmabufmask[3] = { 0 }; + + /* Set SendDmaBase */ + qib_write_kreg(dd, kr_senddmabase, ppd->sdma_descq_phys); + qib_sdma_7220_setlengen(ppd); + qib_sdma_update_7220_tail(ppd, 0); /* Set SendDmaTail */ + /* Set SendDmaHeadAddr */ + qib_write_kreg(dd, kr_senddmaheadaddr, ppd->sdma_head_phys); + + /* + * Reserve all the former "kernel" piobufs, using high number range + * so we get as many 4K buffers as possible + */ + n = dd->piobcnt2k + dd->piobcnt4k; + i = n - dd->cspec->sdmabufcnt; + + for (; i < n; ++i) { + unsigned word = i / 64; + unsigned bit = i & 63; + + BUG_ON(word >= 3); + senddmabufmask[word] |= 1ULL << bit; + } + qib_write_kreg(dd, kr_senddmabufmask0, senddmabufmask[0]); + qib_write_kreg(dd, kr_senddmabufmask1, senddmabufmask[1]); + qib_write_kreg(dd, kr_senddmabufmask2, senddmabufmask[2]); + + ppd->sdma_state.first_sendbuf = i; + ppd->sdma_state.last_sendbuf = n; + + return 0; +} + +/* sdma_lock must be held */ +static u16 qib_sdma_7220_gethead(struct qib_pportdata *ppd) +{ + struct qib_devdata *dd = ppd->dd; + int sane; + int use_dmahead; + u16 swhead; + u16 swtail; + u16 cnt; + u16 hwhead; + + use_dmahead = __qib_sdma_running(ppd) && + (dd->flags & QIB_HAS_SDMA_TIMEOUT); +retry: + hwhead = use_dmahead ? + (u16)le64_to_cpu(*ppd->sdma_head_dma) : + (u16)qib_read_kreg32(dd, kr_senddmahead); + + swhead = ppd->sdma_descq_head; + swtail = ppd->sdma_descq_tail; + cnt = ppd->sdma_descq_cnt; + + if (swhead < swtail) { + /* not wrapped */ + sane = (hwhead >= swhead) & (hwhead <= swtail); + } else if (swhead > swtail) { + /* wrapped around */ + sane = ((hwhead >= swhead) && (hwhead < cnt)) || + (hwhead <= swtail); + } else { + /* empty */ + sane = (hwhead == swhead); + } + + if (unlikely(!sane)) { + if (use_dmahead) { + /* try one more time, directly from the register */ + use_dmahead = 0; + goto retry; + } + /* assume no progress */ + hwhead = swhead; + } + + return hwhead; +} + +static int qib_sdma_7220_busy(struct qib_pportdata *ppd) +{ + u64 hwstatus = qib_read_kreg64(ppd->dd, kr_senddmastatus); + + return (hwstatus & SYM_MASK(SendDmaStatus, ScoreBoardDrainInProg)) || + (hwstatus & SYM_MASK(SendDmaStatus, AbortInProg)) || + (hwstatus & SYM_MASK(SendDmaStatus, InternalSDmaEnable)) || + !(hwstatus & SYM_MASK(SendDmaStatus, ScbEmpty)); +} + +/* + * Compute the amount of delay before sending the next packet if the + * port's send rate differs from the static rate set for the QP. + * Since the delay affects this packet but the amount of the delay is + * based on the length of the previous packet, use the last delay computed + * and save the delay count for this packet to be used next time + * we get here. + */ +static u32 qib_7220_setpbc_control(struct qib_pportdata *ppd, u32 plen, + u8 srate, u8 vl) +{ + u8 snd_mult = ppd->delay_mult; + u8 rcv_mult = ib_rate_to_delay[srate]; + u32 ret = ppd->cpspec->last_delay_mult; + + ppd->cpspec->last_delay_mult = (rcv_mult > snd_mult) ? + (plen * (rcv_mult - snd_mult) + 1) >> 1 : 0; + + /* Indicate VL15, if necessary */ + if (vl == 15) + ret |= PBC_7220_VL15_SEND_CTRL; + return ret; +} + +static void qib_7220_initvl15_bufs(struct qib_devdata *dd) +{ +} + +static void qib_7220_init_ctxt(struct qib_ctxtdata *rcd) +{ + if (!rcd->ctxt) { + rcd->rcvegrcnt = IBA7220_KRCVEGRCNT; + rcd->rcvegr_tid_base = 0; + } else { + rcd->rcvegrcnt = rcd->dd->cspec->rcvegrcnt; + rcd->rcvegr_tid_base = IBA7220_KRCVEGRCNT + + (rcd->ctxt - 1) * rcd->rcvegrcnt; + } +} + +static void qib_7220_txchk_change(struct qib_devdata *dd, u32 start, + u32 len, u32 which, struct qib_ctxtdata *rcd) +{ + int i; + unsigned long flags; + + switch (which) { + case TXCHK_CHG_TYPE_KERN: + /* see if we need to raise avail update threshold */ + spin_lock_irqsave(&dd->uctxt_lock, flags); + for (i = dd->first_user_ctxt; + dd->cspec->updthresh != dd->cspec->updthresh_dflt + && i < dd->cfgctxts; i++) + if (dd->rcd[i] && dd->rcd[i]->subctxt_cnt && + ((dd->rcd[i]->piocnt / dd->rcd[i]->subctxt_cnt) - 1) + < dd->cspec->updthresh_dflt) + break; + spin_unlock_irqrestore(&dd->uctxt_lock, flags); + if (i == dd->cfgctxts) { + spin_lock_irqsave(&dd->sendctrl_lock, flags); + dd->cspec->updthresh = dd->cspec->updthresh_dflt; + dd->sendctrl &= ~SYM_MASK(SendCtrl, AvailUpdThld); + dd->sendctrl |= (dd->cspec->updthresh & + SYM_RMASK(SendCtrl, AvailUpdThld)) << + SYM_LSB(SendCtrl, AvailUpdThld); + spin_unlock_irqrestore(&dd->sendctrl_lock, flags); + sendctrl_7220_mod(dd->pport, QIB_SENDCTRL_AVAIL_BLIP); + } + break; + case TXCHK_CHG_TYPE_USER: + spin_lock_irqsave(&dd->sendctrl_lock, flags); + if (rcd && rcd->subctxt_cnt && ((rcd->piocnt + / rcd->subctxt_cnt) - 1) < dd->cspec->updthresh) { + dd->cspec->updthresh = (rcd->piocnt / + rcd->subctxt_cnt) - 1; + dd->sendctrl &= ~SYM_MASK(SendCtrl, AvailUpdThld); + dd->sendctrl |= (dd->cspec->updthresh & + SYM_RMASK(SendCtrl, AvailUpdThld)) + << SYM_LSB(SendCtrl, AvailUpdThld); + spin_unlock_irqrestore(&dd->sendctrl_lock, flags); + sendctrl_7220_mod(dd->pport, QIB_SENDCTRL_AVAIL_BLIP); + } else + spin_unlock_irqrestore(&dd->sendctrl_lock, flags); + break; + } +} + +static void writescratch(struct qib_devdata *dd, u32 val) +{ + qib_write_kreg(dd, kr_scratch, val); +} + +#define VALID_TS_RD_REG_MASK 0xBF +/** + * qib_7220_tempsense_read - read register of temp sensor via TWSI + * @dd: the qlogic_ib device + * @regnum: register to read from + * + * returns reg contents (0..255) or < 0 for error + */ +static int qib_7220_tempsense_rd(struct qib_devdata *dd, int regnum) +{ + int ret; + u8 rdata; + + if (regnum > 7) { + ret = -EINVAL; + goto bail; + } + + /* return a bogus value for (the one) register we do not have */ + if (!((1 << regnum) & VALID_TS_RD_REG_MASK)) { + ret = 0; + goto bail; + } + + ret = mutex_lock_interruptible(&dd->eep_lock); + if (ret) + goto bail; + + ret = qib_twsi_blk_rd(dd, QIB_TWSI_TEMP_DEV, regnum, &rdata, 1); + if (!ret) + ret = rdata; + + mutex_unlock(&dd->eep_lock); + + /* + * There are three possibilities here: + * ret is actual value (0..255) + * ret is -ENXIO or -EINVAL from twsi code or this file + * ret is -EINTR from mutex_lock_interruptible. + */ +bail: + return ret; +} + +#ifdef CONFIG_INFINIBAND_QIB_DCA +static int qib_7220_notify_dca(struct qib_devdata *dd, unsigned long event) +{ + return 0; +} +#endif + +/* Dummy function, as 7220 boards never disable EEPROM Write */ +static int qib_7220_eeprom_wen(struct qib_devdata *dd, int wen) +{ + return 1; +} + +/** + * qib_init_iba7220_funcs - set up the chip-specific function pointers + * @dev: the pci_dev for qlogic_ib device + * @ent: pci_device_id struct for this dev + * + * This is global, and is called directly at init to set up the + * chip-specific function pointers for later use. + */ +struct qib_devdata *qib_init_iba7220_funcs(struct pci_dev *pdev, + const struct pci_device_id *ent) +{ + struct qib_devdata *dd; + int ret; + u32 boardid, minwidth; + + dd = qib_alloc_devdata(pdev, sizeof(struct qib_chip_specific) + + sizeof(struct qib_chippport_specific)); + if (IS_ERR(dd)) + goto bail; + + dd->f_bringup_serdes = qib_7220_bringup_serdes; + dd->f_cleanup = qib_setup_7220_cleanup; + dd->f_clear_tids = qib_7220_clear_tids; + dd->f_free_irq = qib_7220_free_irq; + dd->f_get_base_info = qib_7220_get_base_info; + dd->f_get_msgheader = qib_7220_get_msgheader; + dd->f_getsendbuf = qib_7220_getsendbuf; + dd->f_gpio_mod = gpio_7220_mod; + dd->f_eeprom_wen = qib_7220_eeprom_wen; + dd->f_hdrqempty = qib_7220_hdrqempty; + dd->f_ib_updown = qib_7220_ib_updown; + dd->f_init_ctxt = qib_7220_init_ctxt; + dd->f_initvl15_bufs = qib_7220_initvl15_bufs; + dd->f_intr_fallback = qib_7220_intr_fallback; + dd->f_late_initreg = qib_late_7220_initreg; + dd->f_setpbc_control = qib_7220_setpbc_control; + dd->f_portcntr = qib_portcntr_7220; + dd->f_put_tid = qib_7220_put_tid; + dd->f_quiet_serdes = qib_7220_quiet_serdes; + dd->f_rcvctrl = rcvctrl_7220_mod; + dd->f_read_cntrs = qib_read_7220cntrs; + dd->f_read_portcntrs = qib_read_7220portcntrs; + dd->f_reset = qib_setup_7220_reset; + dd->f_init_sdma_regs = init_sdma_7220_regs; + dd->f_sdma_busy = qib_sdma_7220_busy; + dd->f_sdma_gethead = qib_sdma_7220_gethead; + dd->f_sdma_sendctrl = qib_7220_sdma_sendctrl; + dd->f_sdma_set_desc_cnt = qib_sdma_set_7220_desc_cnt; + dd->f_sdma_update_tail = qib_sdma_update_7220_tail; + dd->f_sdma_hw_clean_up = qib_7220_sdma_hw_clean_up; + dd->f_sdma_hw_start_up = qib_7220_sdma_hw_start_up; + dd->f_sdma_init_early = qib_7220_sdma_init_early; + dd->f_sendctrl = sendctrl_7220_mod; + dd->f_set_armlaunch = qib_set_7220_armlaunch; + dd->f_set_cntr_sample = qib_set_cntr_7220_sample; + dd->f_iblink_state = qib_7220_iblink_state; + dd->f_ibphys_portstate = qib_7220_phys_portstate; + dd->f_get_ib_cfg = qib_7220_get_ib_cfg; + dd->f_set_ib_cfg = qib_7220_set_ib_cfg; + dd->f_set_ib_loopback = qib_7220_set_loopback; + dd->f_set_intr_state = qib_7220_set_intr_state; + dd->f_setextled = qib_setup_7220_setextled; + dd->f_txchk_change = qib_7220_txchk_change; + dd->f_update_usrhead = qib_update_7220_usrhead; + dd->f_wantpiobuf_intr = qib_wantpiobuf_7220_intr; + dd->f_xgxs_reset = qib_7220_xgxs_reset; + dd->f_writescratch = writescratch; + dd->f_tempsense_rd = qib_7220_tempsense_rd; +#ifdef CONFIG_INFINIBAND_QIB_DCA + dd->f_notify_dca = qib_7220_notify_dca; +#endif + /* + * Do remaining pcie setup and save pcie values in dd. + * Any error printing is already done by the init code. + * On return, we have the chip mapped, but chip registers + * are not set up until start of qib_init_7220_variables. + */ + ret = qib_pcie_ddinit(dd, pdev, ent); + if (ret < 0) + goto bail_free; + + /* initialize chip-specific variables */ + ret = qib_init_7220_variables(dd); + if (ret) + goto bail_cleanup; + + if (qib_mini_init) + goto bail; + + boardid = SYM_FIELD(dd->revision, Revision, + BoardID); + switch (boardid) { + case 0: + case 2: + case 10: + case 12: + minwidth = 16; /* x16 capable boards */ + break; + default: + minwidth = 8; /* x8 capable boards */ + break; + } + if (qib_pcie_params(dd, minwidth, NULL, NULL)) + qib_dev_err(dd, + "Failed to setup PCIe or interrupts; continuing anyway\n"); + + /* save IRQ for possible later use */ + dd->cspec->irq = pdev->irq; + + if (qib_read_kreg64(dd, kr_hwerrstatus) & + QLOGIC_IB_HWE_SERDESPLLFAILED) + qib_write_kreg(dd, kr_hwerrclear, + QLOGIC_IB_HWE_SERDESPLLFAILED); + + /* setup interrupt handler (interrupt type handled above) */ + qib_setup_7220_interrupt(dd); + qib_7220_init_hwerrors(dd); + + /* clear diagctrl register, in case diags were running and crashed */ + qib_write_kreg(dd, kr_hwdiagctrl, 0); + + goto bail; + +bail_cleanup: + qib_pcie_ddcleanup(dd); +bail_free: + qib_free_devdata(dd); + dd = ERR_PTR(ret); +bail: + return dd; +} diff --git a/drivers/infiniband/hw/qib/qib_iba7322.c b/drivers/infiniband/hw/qib/qib_iba7322.c new file mode 100644 index 0000000..a7eb325 --- /dev/null +++ b/drivers/infiniband/hw/qib/qib_iba7322.c @@ -0,0 +1,8554 @@ +/* + * Copyright (c) 2012 Intel Corporation. All rights reserved. + * Copyright (c) 2008 - 2012 QLogic Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/* + * This file contains all of the code that is specific to the + * InfiniPath 7322 chip + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef CONFIG_INFINIBAND_QIB_DCA +#include +#endif + +#include "qib.h" +#include "qib_7322_regs.h" +#include "qib_qsfp.h" + +#include "qib_mad.h" +#include "qib_verbs.h" + +#undef pr_fmt +#define pr_fmt(fmt) QIB_DRV_NAME " " fmt + +static void qib_setup_7322_setextled(struct qib_pportdata *, u32); +static void qib_7322_handle_hwerrors(struct qib_devdata *, char *, size_t); +static void sendctrl_7322_mod(struct qib_pportdata *ppd, u32 op); +static irqreturn_t qib_7322intr(int irq, void *data); +static irqreturn_t qib_7322bufavail(int irq, void *data); +static irqreturn_t sdma_intr(int irq, void *data); +static irqreturn_t sdma_idle_intr(int irq, void *data); +static irqreturn_t sdma_progress_intr(int irq, void *data); +static irqreturn_t sdma_cleanup_intr(int irq, void *data); +static void qib_7322_txchk_change(struct qib_devdata *, u32, u32, u32, + struct qib_ctxtdata *rcd); +static u8 qib_7322_phys_portstate(u64); +static u32 qib_7322_iblink_state(u64); +static void qib_set_ib_7322_lstate(struct qib_pportdata *ppd, u16 linkcmd, + u16 linitcmd); +static void force_h1(struct qib_pportdata *); +static void adj_tx_serdes(struct qib_pportdata *); +static u32 qib_7322_setpbc_control(struct qib_pportdata *, u32, u8, u8); +static void qib_7322_mini_pcs_reset(struct qib_pportdata *); + +static u32 ahb_mod(struct qib_devdata *, int, int, int, u32, u32); +static void ibsd_wr_allchans(struct qib_pportdata *, int, unsigned, unsigned); +static void serdes_7322_los_enable(struct qib_pportdata *, int); +static int serdes_7322_init_old(struct qib_pportdata *); +static int serdes_7322_init_new(struct qib_pportdata *); +static void dump_sdma_7322_state(struct qib_pportdata *); + +#define BMASK(msb, lsb) (((1 << ((msb) + 1 - (lsb))) - 1) << (lsb)) + +/* LE2 serdes values for different cases */ +#define LE2_DEFAULT 5 +#define LE2_5m 4 +#define LE2_QME 0 + +/* Below is special-purpose, so only really works for the IB SerDes blocks. */ +#define IBSD(hw_pidx) (hw_pidx + 2) + +/* these are variables for documentation and experimentation purposes */ +static const unsigned rcv_int_timeout = 375; +static const unsigned rcv_int_count = 16; +static const unsigned sdma_idle_cnt = 64; + +/* Time to stop altering Rx Equalization parameters, after link up. */ +#define RXEQ_DISABLE_MSECS 2500 + +/* + * Number of VLs we are configured to use (to allow for more + * credits per vl, etc.) + */ +ushort qib_num_cfg_vls = 2; +module_param_named(num_vls, qib_num_cfg_vls, ushort, S_IRUGO); +MODULE_PARM_DESC(num_vls, "Set number of Virtual Lanes to use (1-8)"); + +static ushort qib_chase = 1; +module_param_named(chase, qib_chase, ushort, S_IRUGO); +MODULE_PARM_DESC(chase, "Enable state chase handling"); + +static ushort qib_long_atten = 10; /* 10 dB ~= 5m length */ +module_param_named(long_attenuation, qib_long_atten, ushort, S_IRUGO); +MODULE_PARM_DESC(long_attenuation, \ + "attenuation cutoff (dB) for long copper cable setup"); + +static ushort qib_singleport; +module_param_named(singleport, qib_singleport, ushort, S_IRUGO); +MODULE_PARM_DESC(singleport, "Use only IB port 1; more per-port buffer space"); + +static ushort qib_krcvq01_no_msi; +module_param_named(krcvq01_no_msi, qib_krcvq01_no_msi, ushort, S_IRUGO); +MODULE_PARM_DESC(krcvq01_no_msi, "No MSI for kctx < 2"); + +/* + * Receive header queue sizes + */ +static unsigned qib_rcvhdrcnt; +module_param_named(rcvhdrcnt, qib_rcvhdrcnt, uint, S_IRUGO); +MODULE_PARM_DESC(rcvhdrcnt, "receive header count"); + +static unsigned qib_rcvhdrsize; +module_param_named(rcvhdrsize, qib_rcvhdrsize, uint, S_IRUGO); +MODULE_PARM_DESC(rcvhdrsize, "receive header size in 32-bit words"); + +static unsigned qib_rcvhdrentsize; +module_param_named(rcvhdrentsize, qib_rcvhdrentsize, uint, S_IRUGO); +MODULE_PARM_DESC(rcvhdrentsize, "receive header entry size in 32-bit words"); + +#define MAX_ATTEN_LEN 64 /* plenty for any real system */ +/* for read back, default index is ~5m copper cable */ +static char txselect_list[MAX_ATTEN_LEN] = "10"; +static struct kparam_string kp_txselect = { + .string = txselect_list, + .maxlen = MAX_ATTEN_LEN +}; +static int setup_txselect(const char *, struct kernel_param *); +module_param_call(txselect, setup_txselect, param_get_string, + &kp_txselect, S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(txselect, \ + "Tx serdes indices (for no QSFP or invalid QSFP data)"); + +#define BOARD_QME7342 5 +#define BOARD_QMH7342 6 +#define IS_QMH(dd) (SYM_FIELD((dd)->revision, Revision, BoardID) == \ + BOARD_QMH7342) +#define IS_QME(dd) (SYM_FIELD((dd)->revision, Revision, BoardID) == \ + BOARD_QME7342) + +#define KREG_IDX(regname) (QIB_7322_##regname##_OFFS / sizeof(u64)) + +#define KREG_IBPORT_IDX(regname) ((QIB_7322_##regname##_0_OFFS / sizeof(u64))) + +#define MASK_ACROSS(lsb, msb) \ + (((1ULL << ((msb) + 1 - (lsb))) - 1) << (lsb)) + +#define SYM_RMASK(regname, fldname) ((u64) \ + QIB_7322_##regname##_##fldname##_RMASK) + +#define SYM_MASK(regname, fldname) ((u64) \ + QIB_7322_##regname##_##fldname##_RMASK << \ + QIB_7322_##regname##_##fldname##_LSB) + +#define SYM_FIELD(value, regname, fldname) ((u64) \ + (((value) >> SYM_LSB(regname, fldname)) & \ + SYM_RMASK(regname, fldname))) + +/* useful for things like LaFifoEmpty_0...7, TxCreditOK_0...7, etc. */ +#define SYM_FIELD_ACROSS(value, regname, fldname, nbits) \ + (((value) >> SYM_LSB(regname, fldname)) & MASK_ACROSS(0, nbits)) + +#define HWE_MASK(fldname) SYM_MASK(HwErrMask, fldname##Mask) +#define ERR_MASK(fldname) SYM_MASK(ErrMask, fldname##Mask) +#define ERR_MASK_N(fldname) SYM_MASK(ErrMask_0, fldname##Mask) +#define INT_MASK(fldname) SYM_MASK(IntMask, fldname##IntMask) +#define INT_MASK_P(fldname, port) SYM_MASK(IntMask, fldname##IntMask##_##port) +/* Below because most, but not all, fields of IntMask have that full suffix */ +#define INT_MASK_PM(fldname, port) SYM_MASK(IntMask, fldname##Mask##_##port) + + +#define SYM_LSB(regname, fldname) (QIB_7322_##regname##_##fldname##_LSB) + +/* + * the size bits give us 2^N, in KB units. 0 marks as invalid, + * and 7 is reserved. We currently use only 2KB and 4KB + */ +#define IBA7322_TID_SZ_SHIFT QIB_7322_RcvTIDArray0_RT_BufSize_LSB +#define IBA7322_TID_SZ_2K (1UL<kregbase || !(dd->flags & QIB_PRESENT)) + return 0; + return readl(regno + (u64 __iomem *)( + (dd->ureg_align * ctxt) + (dd->userbase ? + (char __iomem *)dd->userbase : + (char __iomem *)dd->kregbase + dd->uregbase))); +} + +/** + * qib_read_ureg - read virtualized per-context register + * @dd: device + * @regno: register number + * @ctxt: context number + * + * Return the contents of a register that is virtualized to be per context. + * Returns -1 on errors (not distinguishable from valid contents at + * runtime; we may add a separate error variable at some point). + */ +static inline u64 qib_read_ureg(const struct qib_devdata *dd, + enum qib_ureg regno, int ctxt) +{ + + if (!dd->kregbase || !(dd->flags & QIB_PRESENT)) + return 0; + return readq(regno + (u64 __iomem *)( + (dd->ureg_align * ctxt) + (dd->userbase ? + (char __iomem *)dd->userbase : + (char __iomem *)dd->kregbase + dd->uregbase))); +} + +/** + * qib_write_ureg - write virtualized per-context register + * @dd: device + * @regno: register number + * @value: value + * @ctxt: context + * + * Write the contents of a register that is virtualized to be per context. + */ +static inline void qib_write_ureg(const struct qib_devdata *dd, + enum qib_ureg regno, u64 value, int ctxt) +{ + u64 __iomem *ubase; + if (dd->userbase) + ubase = (u64 __iomem *) + ((char __iomem *) dd->userbase + + dd->ureg_align * ctxt); + else + ubase = (u64 __iomem *) + (dd->uregbase + + (char __iomem *) dd->kregbase + + dd->ureg_align * ctxt); + + if (dd->kregbase && (dd->flags & QIB_PRESENT)) + writeq(value, &ubase[regno]); +} + +static inline u32 qib_read_kreg32(const struct qib_devdata *dd, + const u32 regno) +{ + if (!dd->kregbase || !(dd->flags & QIB_PRESENT)) + return -1; + return readl((u32 __iomem *) &dd->kregbase[regno]); +} + +static inline u64 qib_read_kreg64(const struct qib_devdata *dd, + const u32 regno) +{ + if (!dd->kregbase || !(dd->flags & QIB_PRESENT)) + return -1; + return readq(&dd->kregbase[regno]); +} + +static inline void qib_write_kreg(const struct qib_devdata *dd, + const u32 regno, u64 value) +{ + if (dd->kregbase && (dd->flags & QIB_PRESENT)) + writeq(value, &dd->kregbase[regno]); +} + +/* + * not many sanity checks for the port-specific kernel register routines, + * since they are only used when it's known to be safe. +*/ +static inline u64 qib_read_kreg_port(const struct qib_pportdata *ppd, + const u16 regno) +{ + if (!ppd->cpspec->kpregbase || !(ppd->dd->flags & QIB_PRESENT)) + return 0ULL; + return readq(&ppd->cpspec->kpregbase[regno]); +} + +static inline void qib_write_kreg_port(const struct qib_pportdata *ppd, + const u16 regno, u64 value) +{ + if (ppd->cpspec && ppd->dd && ppd->cpspec->kpregbase && + (ppd->dd->flags & QIB_PRESENT)) + writeq(value, &ppd->cpspec->kpregbase[regno]); +} + +/** + * qib_write_kreg_ctxt - write a device's per-ctxt 64-bit kernel register + * @dd: the qlogic_ib device + * @regno: the register number to write + * @ctxt: the context containing the register + * @value: the value to write + */ +static inline void qib_write_kreg_ctxt(const struct qib_devdata *dd, + const u16 regno, unsigned ctxt, + u64 value) +{ + qib_write_kreg(dd, regno + ctxt, value); +} + +static inline u64 read_7322_creg(const struct qib_devdata *dd, u16 regno) +{ + if (!dd->cspec->cregbase || !(dd->flags & QIB_PRESENT)) + return 0; + return readq(&dd->cspec->cregbase[regno]); + + +} + +static inline u32 read_7322_creg32(const struct qib_devdata *dd, u16 regno) +{ + if (!dd->cspec->cregbase || !(dd->flags & QIB_PRESENT)) + return 0; + return readl(&dd->cspec->cregbase[regno]); + + +} + +static inline void write_7322_creg_port(const struct qib_pportdata *ppd, + u16 regno, u64 value) +{ + if (ppd->cpspec && ppd->cpspec->cpregbase && + (ppd->dd->flags & QIB_PRESENT)) + writeq(value, &ppd->cpspec->cpregbase[regno]); +} + +static inline u64 read_7322_creg_port(const struct qib_pportdata *ppd, + u16 regno) +{ + if (!ppd->cpspec || !ppd->cpspec->cpregbase || + !(ppd->dd->flags & QIB_PRESENT)) + return 0; + return readq(&ppd->cpspec->cpregbase[regno]); +} + +static inline u32 read_7322_creg32_port(const struct qib_pportdata *ppd, + u16 regno) +{ + if (!ppd->cpspec || !ppd->cpspec->cpregbase || + !(ppd->dd->flags & QIB_PRESENT)) + return 0; + return readl(&ppd->cpspec->cpregbase[regno]); +} + +/* bits in Control register */ +#define QLOGIC_IB_C_RESET SYM_MASK(Control, SyncReset) +#define QLOGIC_IB_C_SDMAFETCHPRIOEN SYM_MASK(Control, SDmaDescFetchPriorityEn) + +/* bits in general interrupt regs */ +#define QIB_I_RCVURG_LSB SYM_LSB(IntMask, RcvUrg0IntMask) +#define QIB_I_RCVURG_RMASK MASK_ACROSS(0, 17) +#define QIB_I_RCVURG_MASK (QIB_I_RCVURG_RMASK << QIB_I_RCVURG_LSB) +#define QIB_I_RCVAVAIL_LSB SYM_LSB(IntMask, RcvAvail0IntMask) +#define QIB_I_RCVAVAIL_RMASK MASK_ACROSS(0, 17) +#define QIB_I_RCVAVAIL_MASK (QIB_I_RCVAVAIL_RMASK << QIB_I_RCVAVAIL_LSB) +#define QIB_I_C_ERROR INT_MASK(Err) + +#define QIB_I_SPIOSENT (INT_MASK_P(SendDone, 0) | INT_MASK_P(SendDone, 1)) +#define QIB_I_SPIOBUFAVAIL INT_MASK(SendBufAvail) +#define QIB_I_GPIO INT_MASK(AssertGPIO) +#define QIB_I_P_SDMAINT(pidx) \ + (INT_MASK_P(SDma, pidx) | INT_MASK_P(SDmaIdle, pidx) | \ + INT_MASK_P(SDmaProgress, pidx) | \ + INT_MASK_PM(SDmaCleanupDone, pidx)) + +/* Interrupt bits that are "per port" */ +#define QIB_I_P_BITSEXTANT(pidx) \ + (INT_MASK_P(Err, pidx) | INT_MASK_P(SendDone, pidx) | \ + INT_MASK_P(SDma, pidx) | INT_MASK_P(SDmaIdle, pidx) | \ + INT_MASK_P(SDmaProgress, pidx) | \ + INT_MASK_PM(SDmaCleanupDone, pidx)) + +/* Interrupt bits that are common to a device */ +/* currently unused: QIB_I_SPIOSENT */ +#define QIB_I_C_BITSEXTANT \ + (QIB_I_RCVURG_MASK | QIB_I_RCVAVAIL_MASK | \ + QIB_I_SPIOSENT | \ + QIB_I_C_ERROR | QIB_I_SPIOBUFAVAIL | QIB_I_GPIO) + +#define QIB_I_BITSEXTANT (QIB_I_C_BITSEXTANT | \ + QIB_I_P_BITSEXTANT(0) | QIB_I_P_BITSEXTANT(1)) + +/* + * Error bits that are "per port". + */ +#define QIB_E_P_IBSTATUSCHANGED ERR_MASK_N(IBStatusChanged) +#define QIB_E_P_SHDR ERR_MASK_N(SHeadersErr) +#define QIB_E_P_VL15_BUF_MISUSE ERR_MASK_N(VL15BufMisuseErr) +#define QIB_E_P_SND_BUF_MISUSE ERR_MASK_N(SendBufMisuseErr) +#define QIB_E_P_SUNSUPVL ERR_MASK_N(SendUnsupportedVLErr) +#define QIB_E_P_SUNEXP_PKTNUM ERR_MASK_N(SendUnexpectedPktNumErr) +#define QIB_E_P_SDROP_DATA ERR_MASK_N(SendDroppedDataPktErr) +#define QIB_E_P_SDROP_SMP ERR_MASK_N(SendDroppedSmpPktErr) +#define QIB_E_P_SPKTLEN ERR_MASK_N(SendPktLenErr) +#define QIB_E_P_SUNDERRUN ERR_MASK_N(SendUnderRunErr) +#define QIB_E_P_SMAXPKTLEN ERR_MASK_N(SendMaxPktLenErr) +#define QIB_E_P_SMINPKTLEN ERR_MASK_N(SendMinPktLenErr) +#define QIB_E_P_RIBLOSTLINK ERR_MASK_N(RcvIBLostLinkErr) +#define QIB_E_P_RHDR ERR_MASK_N(RcvHdrErr) +#define QIB_E_P_RHDRLEN ERR_MASK_N(RcvHdrLenErr) +#define QIB_E_P_RBADTID ERR_MASK_N(RcvBadTidErr) +#define QIB_E_P_RBADVERSION ERR_MASK_N(RcvBadVersionErr) +#define QIB_E_P_RIBFLOW ERR_MASK_N(RcvIBFlowErr) +#define QIB_E_P_REBP ERR_MASK_N(RcvEBPErr) +#define QIB_E_P_RUNSUPVL ERR_MASK_N(RcvUnsupportedVLErr) +#define QIB_E_P_RUNEXPCHAR ERR_MASK_N(RcvUnexpectedCharErr) +#define QIB_E_P_RSHORTPKTLEN ERR_MASK_N(RcvShortPktLenErr) +#define QIB_E_P_RLONGPKTLEN ERR_MASK_N(RcvLongPktLenErr) +#define QIB_E_P_RMAXPKTLEN ERR_MASK_N(RcvMaxPktLenErr) +#define QIB_E_P_RMINPKTLEN ERR_MASK_N(RcvMinPktLenErr) +#define QIB_E_P_RICRC ERR_MASK_N(RcvICRCErr) +#define QIB_E_P_RVCRC ERR_MASK_N(RcvVCRCErr) +#define QIB_E_P_RFORMATERR ERR_MASK_N(RcvFormatErr) + +#define QIB_E_P_SDMA1STDESC ERR_MASK_N(SDma1stDescErr) +#define QIB_E_P_SDMABASE ERR_MASK_N(SDmaBaseErr) +#define QIB_E_P_SDMADESCADDRMISALIGN ERR_MASK_N(SDmaDescAddrMisalignErr) +#define QIB_E_P_SDMADWEN ERR_MASK_N(SDmaDwEnErr) +#define QIB_E_P_SDMAGENMISMATCH ERR_MASK_N(SDmaGenMismatchErr) +#define QIB_E_P_SDMAHALT ERR_MASK_N(SDmaHaltErr) +#define QIB_E_P_SDMAMISSINGDW ERR_MASK_N(SDmaMissingDwErr) +#define QIB_E_P_SDMAOUTOFBOUND ERR_MASK_N(SDmaOutOfBoundErr) +#define QIB_E_P_SDMARPYTAG ERR_MASK_N(SDmaRpyTagErr) +#define QIB_E_P_SDMATAILOUTOFBOUND ERR_MASK_N(SDmaTailOutOfBoundErr) +#define QIB_E_P_SDMAUNEXPDATA ERR_MASK_N(SDmaUnexpDataErr) + +/* Error bits that are common to a device */ +#define QIB_E_RESET ERR_MASK(ResetNegated) +#define QIB_E_HARDWARE ERR_MASK(HardwareErr) +#define QIB_E_INVALIDADDR ERR_MASK(InvalidAddrErr) + + +/* + * Per chip (rather than per-port) errors. Most either do + * nothing but trigger a print (because they self-recover, or + * always occur in tandem with other errors that handle the + * issue), or because they indicate errors with no recovery, + * but we want to know that they happened. + */ +#define QIB_E_SBUF_VL15_MISUSE ERR_MASK(SBufVL15MisUseErr) +#define QIB_E_BADEEP ERR_MASK(InvalidEEPCmd) +#define QIB_E_VLMISMATCH ERR_MASK(SendVLMismatchErr) +#define QIB_E_ARMLAUNCH ERR_MASK(SendArmLaunchErr) +#define QIB_E_SPCLTRIG ERR_MASK(SendSpecialTriggerErr) +#define QIB_E_RRCVHDRFULL ERR_MASK(RcvHdrFullErr) +#define QIB_E_RRCVEGRFULL ERR_MASK(RcvEgrFullErr) +#define QIB_E_RCVCTXTSHARE ERR_MASK(RcvContextShareErr) + +/* SDMA chip errors (not per port) + * QIB_E_SDMA_BUF_DUP needs no special handling, because we will also get + * the SDMAHALT error immediately, so we just print the dup error via the + * E_AUTO mechanism. This is true of most of the per-port fatal errors + * as well, but since this is port-independent, by definition, it's + * handled a bit differently. SDMA_VL15 and SDMA_WRONG_PORT are per + * packet send errors, and so are handled in the same manner as other + * per-packet errors. + */ +#define QIB_E_SDMA_VL15 ERR_MASK(SDmaVL15Err) +#define QIB_E_SDMA_WRONG_PORT ERR_MASK(SDmaWrongPortErr) +#define QIB_E_SDMA_BUF_DUP ERR_MASK(SDmaBufMaskDuplicateErr) + +/* + * Below functionally equivalent to legacy QLOGIC_IB_E_PKTERRS + * it is used to print "common" packet errors. + */ +#define QIB_E_P_PKTERRS (QIB_E_P_SPKTLEN |\ + QIB_E_P_SDROP_DATA | QIB_E_P_RVCRC |\ + QIB_E_P_RICRC | QIB_E_P_RSHORTPKTLEN |\ + QIB_E_P_VL15_BUF_MISUSE | QIB_E_P_SHDR | \ + QIB_E_P_REBP) + +/* Error Bits that Packet-related (Receive, per-port) */ +#define QIB_E_P_RPKTERRS (\ + QIB_E_P_RHDRLEN | QIB_E_P_RBADTID | \ + QIB_E_P_RBADVERSION | QIB_E_P_RHDR | \ + QIB_E_P_RLONGPKTLEN | QIB_E_P_RSHORTPKTLEN |\ + QIB_E_P_RMAXPKTLEN | QIB_E_P_RMINPKTLEN | \ + QIB_E_P_RFORMATERR | QIB_E_P_RUNSUPVL | \ + QIB_E_P_RUNEXPCHAR | QIB_E_P_RIBFLOW | QIB_E_P_REBP) + +/* + * Error bits that are Send-related (per port) + * (ARMLAUNCH excluded from E_SPKTERRS because it gets special handling). + * All of these potentially need to have a buffer disarmed + */ +#define QIB_E_P_SPKTERRS (\ + QIB_E_P_SUNEXP_PKTNUM |\ + QIB_E_P_SDROP_DATA | QIB_E_P_SDROP_SMP |\ + QIB_E_P_SMAXPKTLEN |\ + QIB_E_P_VL15_BUF_MISUSE | QIB_E_P_SHDR | \ + QIB_E_P_SMINPKTLEN | QIB_E_P_SPKTLEN | \ + QIB_E_P_SND_BUF_MISUSE | QIB_E_P_SUNSUPVL) + +#define QIB_E_SPKTERRS ( \ + QIB_E_SBUF_VL15_MISUSE | QIB_E_VLMISMATCH | \ + ERR_MASK_N(SendUnsupportedVLErr) | \ + QIB_E_SPCLTRIG | QIB_E_SDMA_VL15 | QIB_E_SDMA_WRONG_PORT) + +#define QIB_E_P_SDMAERRS ( \ + QIB_E_P_SDMAHALT | \ + QIB_E_P_SDMADESCADDRMISALIGN | \ + QIB_E_P_SDMAUNEXPDATA | \ + QIB_E_P_SDMAMISSINGDW | \ + QIB_E_P_SDMADWEN | \ + QIB_E_P_SDMARPYTAG | \ + QIB_E_P_SDMA1STDESC | \ + QIB_E_P_SDMABASE | \ + QIB_E_P_SDMATAILOUTOFBOUND | \ + QIB_E_P_SDMAOUTOFBOUND | \ + QIB_E_P_SDMAGENMISMATCH) + +/* + * This sets some bits more than once, but makes it more obvious which + * bits are not handled under other categories, and the repeat definition + * is not a problem. + */ +#define QIB_E_P_BITSEXTANT ( \ + QIB_E_P_SPKTERRS | QIB_E_P_PKTERRS | QIB_E_P_RPKTERRS | \ + QIB_E_P_RIBLOSTLINK | QIB_E_P_IBSTATUSCHANGED | \ + QIB_E_P_SND_BUF_MISUSE | QIB_E_P_SUNDERRUN | \ + QIB_E_P_SHDR | QIB_E_P_VL15_BUF_MISUSE | QIB_E_P_SDMAERRS \ + ) + +/* + * These are errors that can occur when the link + * changes state while a packet is being sent or received. This doesn't + * cover things like EBP or VCRC that can be the result of a sending + * having the link change state, so we receive a "known bad" packet. + * All of these are "per port", so renamed: + */ +#define QIB_E_P_LINK_PKTERRS (\ + QIB_E_P_SDROP_DATA | QIB_E_P_SDROP_SMP |\ + QIB_E_P_SMINPKTLEN | QIB_E_P_SPKTLEN |\ + QIB_E_P_RSHORTPKTLEN | QIB_E_P_RMINPKTLEN |\ + QIB_E_P_RUNEXPCHAR) + +/* + * This sets some bits more than once, but makes it more obvious which + * bits are not handled under other categories (such as QIB_E_SPKTERRS), + * and the repeat definition is not a problem. + */ +#define QIB_E_C_BITSEXTANT (\ + QIB_E_HARDWARE | QIB_E_INVALIDADDR | QIB_E_BADEEP |\ + QIB_E_ARMLAUNCH | QIB_E_VLMISMATCH | QIB_E_RRCVHDRFULL |\ + QIB_E_RRCVEGRFULL | QIB_E_RESET | QIB_E_SBUF_VL15_MISUSE) + +/* Likewise Neuter E_SPKT_ERRS_IGNORE */ +#define E_SPKT_ERRS_IGNORE 0 + +#define QIB_EXTS_MEMBIST_DISABLED \ + SYM_MASK(EXTStatus, MemBISTDisabled) +#define QIB_EXTS_MEMBIST_ENDTEST \ + SYM_MASK(EXTStatus, MemBISTEndTest) + +#define QIB_E_SPIOARMLAUNCH \ + ERR_MASK(SendArmLaunchErr) + +#define IBA7322_IBCC_LINKINITCMD_MASK SYM_RMASK(IBCCtrlA_0, LinkInitCmd) +#define IBA7322_IBCC_LINKCMD_SHIFT SYM_LSB(IBCCtrlA_0, LinkCmd) + +/* + * IBTA_1_2 is set when multiple speeds are enabled (normal), + * and also if forced QDR (only QDR enabled). It's enabled for the + * forced QDR case so that scrambling will be enabled by the TS3 + * exchange, when supported by both sides of the link. + */ +#define IBA7322_IBC_IBTA_1_2_MASK SYM_MASK(IBCCtrlB_0, IB_ENHANCED_MODE) +#define IBA7322_IBC_MAX_SPEED_MASK SYM_MASK(IBCCtrlB_0, SD_SPEED) +#define IBA7322_IBC_SPEED_QDR SYM_MASK(IBCCtrlB_0, SD_SPEED_QDR) +#define IBA7322_IBC_SPEED_DDR SYM_MASK(IBCCtrlB_0, SD_SPEED_DDR) +#define IBA7322_IBC_SPEED_SDR SYM_MASK(IBCCtrlB_0, SD_SPEED_SDR) +#define IBA7322_IBC_SPEED_MASK (SYM_MASK(IBCCtrlB_0, SD_SPEED_SDR) | \ + SYM_MASK(IBCCtrlB_0, SD_SPEED_DDR) | SYM_MASK(IBCCtrlB_0, SD_SPEED_QDR)) +#define IBA7322_IBC_SPEED_LSB SYM_LSB(IBCCtrlB_0, SD_SPEED_SDR) + +#define IBA7322_LEDBLINK_OFF_SHIFT SYM_LSB(RcvPktLEDCnt_0, OFFperiod) +#define IBA7322_LEDBLINK_ON_SHIFT SYM_LSB(RcvPktLEDCnt_0, ONperiod) + +#define IBA7322_IBC_WIDTH_AUTONEG SYM_MASK(IBCCtrlB_0, IB_NUM_CHANNELS) +#define IBA7322_IBC_WIDTH_4X_ONLY (1<> \ + SYM_LSB(IBCCtrlB_0, HRTBT_ENB)) +#define IBA7322_IBC_HRTBT_LSB SYM_LSB(IBCCtrlB_0, HRTBT_ENB) + +#define IBA7322_REDIRECT_VEC_PER_REG 12 + +#define IBA7322_SENDCHK_PKEY SYM_MASK(SendCheckControl_0, PKey_En) +#define IBA7322_SENDCHK_BTHQP SYM_MASK(SendCheckControl_0, BTHQP_En) +#define IBA7322_SENDCHK_SLID SYM_MASK(SendCheckControl_0, SLID_En) +#define IBA7322_SENDCHK_RAW_IPV6 SYM_MASK(SendCheckControl_0, RawIPV6_En) +#define IBA7322_SENDCHK_MINSZ SYM_MASK(SendCheckControl_0, PacketTooSmall_En) + +#define AUTONEG_TRIES 3 /* sequential retries to negotiate DDR */ + +#define HWE_AUTO(fldname) { .mask = SYM_MASK(HwErrMask, fldname##Mask), \ + .msg = #fldname , .sz = sizeof(#fldname) } +#define HWE_AUTO_P(fldname, port) { .mask = SYM_MASK(HwErrMask, \ + fldname##Mask##_##port), .msg = #fldname , .sz = sizeof(#fldname) } +static const struct qib_hwerror_msgs qib_7322_hwerror_msgs[] = { + HWE_AUTO_P(IBSerdesPClkNotDetect, 1), + HWE_AUTO_P(IBSerdesPClkNotDetect, 0), + HWE_AUTO(PCIESerdesPClkNotDetect), + HWE_AUTO(PowerOnBISTFailed), + HWE_AUTO(TempsenseTholdReached), + HWE_AUTO(MemoryErr), + HWE_AUTO(PCIeBusParityErr), + HWE_AUTO(PcieCplTimeout), + HWE_AUTO(PciePoisonedTLP), + HWE_AUTO_P(SDmaMemReadErr, 1), + HWE_AUTO_P(SDmaMemReadErr, 0), + HWE_AUTO_P(IBCBusFromSPCParityErr, 1), + HWE_AUTO_P(IBCBusToSPCParityErr, 1), + HWE_AUTO_P(IBCBusFromSPCParityErr, 0), + HWE_AUTO(statusValidNoEop), + HWE_AUTO(LATriggered), + { .mask = 0, .sz = 0 } +}; + +#define E_AUTO(fldname) { .mask = SYM_MASK(ErrMask, fldname##Mask), \ + .msg = #fldname, .sz = sizeof(#fldname) } +#define E_P_AUTO(fldname) { .mask = SYM_MASK(ErrMask_0, fldname##Mask), \ + .msg = #fldname, .sz = sizeof(#fldname) } +static const struct qib_hwerror_msgs qib_7322error_msgs[] = { + E_AUTO(RcvEgrFullErr), + E_AUTO(RcvHdrFullErr), + E_AUTO(ResetNegated), + E_AUTO(HardwareErr), + E_AUTO(InvalidAddrErr), + E_AUTO(SDmaVL15Err), + E_AUTO(SBufVL15MisUseErr), + E_AUTO(InvalidEEPCmd), + E_AUTO(RcvContextShareErr), + E_AUTO(SendVLMismatchErr), + E_AUTO(SendArmLaunchErr), + E_AUTO(SendSpecialTriggerErr), + E_AUTO(SDmaWrongPortErr), + E_AUTO(SDmaBufMaskDuplicateErr), + { .mask = 0, .sz = 0 } +}; + +static const struct qib_hwerror_msgs qib_7322p_error_msgs[] = { + E_P_AUTO(IBStatusChanged), + E_P_AUTO(SHeadersErr), + E_P_AUTO(VL15BufMisuseErr), + /* + * SDmaHaltErr is not really an error, make it clearer; + */ + {.mask = SYM_MASK(ErrMask_0, SDmaHaltErrMask), .msg = "SDmaHalted", + .sz = 11}, + E_P_AUTO(SDmaDescAddrMisalignErr), + E_P_AUTO(SDmaUnexpDataErr), + E_P_AUTO(SDmaMissingDwErr), + E_P_AUTO(SDmaDwEnErr), + E_P_AUTO(SDmaRpyTagErr), + E_P_AUTO(SDma1stDescErr), + E_P_AUTO(SDmaBaseErr), + E_P_AUTO(SDmaTailOutOfBoundErr), + E_P_AUTO(SDmaOutOfBoundErr), + E_P_AUTO(SDmaGenMismatchErr), + E_P_AUTO(SendBufMisuseErr), + E_P_AUTO(SendUnsupportedVLErr), + E_P_AUTO(SendUnexpectedPktNumErr), + E_P_AUTO(SendDroppedDataPktErr), + E_P_AUTO(SendDroppedSmpPktErr), + E_P_AUTO(SendPktLenErr), + E_P_AUTO(SendUnderRunErr), + E_P_AUTO(SendMaxPktLenErr), + E_P_AUTO(SendMinPktLenErr), + E_P_AUTO(RcvIBLostLinkErr), + E_P_AUTO(RcvHdrErr), + E_P_AUTO(RcvHdrLenErr), + E_P_AUTO(RcvBadTidErr), + E_P_AUTO(RcvBadVersionErr), + E_P_AUTO(RcvIBFlowErr), + E_P_AUTO(RcvEBPErr), + E_P_AUTO(RcvUnsupportedVLErr), + E_P_AUTO(RcvUnexpectedCharErr), + E_P_AUTO(RcvShortPktLenErr), + E_P_AUTO(RcvLongPktLenErr), + E_P_AUTO(RcvMaxPktLenErr), + E_P_AUTO(RcvMinPktLenErr), + E_P_AUTO(RcvICRCErr), + E_P_AUTO(RcvVCRCErr), + E_P_AUTO(RcvFormatErr), + { .mask = 0, .sz = 0 } +}; + +/* + * Below generates "auto-message" for interrupts not specific to any port or + * context + */ +#define INTR_AUTO(fldname) { .mask = SYM_MASK(IntMask, fldname##Mask), \ + .msg = #fldname, .sz = sizeof(#fldname) } +/* Below generates "auto-message" for interrupts specific to a port */ +#define INTR_AUTO_P(fldname) { .mask = MASK_ACROSS(\ + SYM_LSB(IntMask, fldname##Mask##_0), \ + SYM_LSB(IntMask, fldname##Mask##_1)), \ + .msg = #fldname "_P", .sz = sizeof(#fldname "_P") } +/* For some reason, the SerDesTrimDone bits are reversed */ +#define INTR_AUTO_PI(fldname) { .mask = MASK_ACROSS(\ + SYM_LSB(IntMask, fldname##Mask##_1), \ + SYM_LSB(IntMask, fldname##Mask##_0)), \ + .msg = #fldname "_P", .sz = sizeof(#fldname "_P") } +/* + * Below generates "auto-message" for interrupts specific to a context, + * with ctxt-number appended + */ +#define INTR_AUTO_C(fldname) { .mask = MASK_ACROSS(\ + SYM_LSB(IntMask, fldname##0IntMask), \ + SYM_LSB(IntMask, fldname##17IntMask)), \ + .msg = #fldname "_C", .sz = sizeof(#fldname "_C") } + +static const struct qib_hwerror_msgs qib_7322_intr_msgs[] = { + INTR_AUTO_P(SDmaInt), + INTR_AUTO_P(SDmaProgressInt), + INTR_AUTO_P(SDmaIdleInt), + INTR_AUTO_P(SDmaCleanupDone), + INTR_AUTO_C(RcvUrg), + INTR_AUTO_P(ErrInt), + INTR_AUTO(ErrInt), /* non-port-specific errs */ + INTR_AUTO(AssertGPIOInt), + INTR_AUTO_P(SendDoneInt), + INTR_AUTO(SendBufAvailInt), + INTR_AUTO_C(RcvAvail), + { .mask = 0, .sz = 0 } +}; + +#define TXSYMPTOM_AUTO_P(fldname) \ + { .mask = SYM_MASK(SendHdrErrSymptom_0, fldname), \ + .msg = #fldname, .sz = sizeof(#fldname) } +static const struct qib_hwerror_msgs hdrchk_msgs[] = { + TXSYMPTOM_AUTO_P(NonKeyPacket), + TXSYMPTOM_AUTO_P(GRHFail), + TXSYMPTOM_AUTO_P(PkeyFail), + TXSYMPTOM_AUTO_P(QPFail), + TXSYMPTOM_AUTO_P(SLIDFail), + TXSYMPTOM_AUTO_P(RawIPV6), + TXSYMPTOM_AUTO_P(PacketTooSmall), + { .mask = 0, .sz = 0 } +}; + +#define IBA7322_HDRHEAD_PKTINT_SHIFT 32 /* interrupt cnt in upper 32 bits */ + +/* + * Called when we might have an error that is specific to a particular + * PIO buffer, and may need to cancel that buffer, so it can be re-used, + * because we don't need to force the update of pioavail + */ +static void qib_disarm_7322_senderrbufs(struct qib_pportdata *ppd) +{ + struct qib_devdata *dd = ppd->dd; + u32 i; + int any; + u32 piobcnt = dd->piobcnt2k + dd->piobcnt4k + NUM_VL15_BUFS; + u32 regcnt = (piobcnt + BITS_PER_LONG - 1) / BITS_PER_LONG; + unsigned long sbuf[4]; + + /* + * It's possible that sendbuffererror could have bits set; might + * have already done this as a result of hardware error handling. + */ + any = 0; + for (i = 0; i < regcnt; ++i) { + sbuf[i] = qib_read_kreg64(dd, kr_sendbuffererror + i); + if (sbuf[i]) { + any = 1; + qib_write_kreg(dd, kr_sendbuffererror + i, sbuf[i]); + } + } + + if (any) + qib_disarm_piobufs_set(dd, sbuf, piobcnt); +} + +/* No txe_recover yet, if ever */ + +/* No decode__errors yet */ +static void err_decode(char *msg, size_t len, u64 errs, + const struct qib_hwerror_msgs *msp) +{ + u64 these, lmask; + int took, multi, n = 0; + + while (errs && msp && msp->mask) { + multi = (msp->mask & (msp->mask - 1)); + while (errs & msp->mask) { + these = (errs & msp->mask); + lmask = (these & (these - 1)) ^ these; + if (len) { + if (n++) { + /* separate the strings */ + *msg++ = ','; + len--; + } + BUG_ON(!msp->sz); + /* msp->sz counts the nul */ + took = min_t(size_t, msp->sz - (size_t)1, len); + memcpy(msg, msp->msg, took); + len -= took; + msg += took; + if (len) + *msg = '\0'; + } + errs &= ~lmask; + if (len && multi) { + /* More than one bit this mask */ + int idx = -1; + + while (lmask & msp->mask) { + ++idx; + lmask >>= 1; + } + took = scnprintf(msg, len, "_%d", idx); + len -= took; + msg += took; + } + } + ++msp; + } + /* If some bits are left, show in hex. */ + if (len && errs) + snprintf(msg, len, "%sMORE:%llX", n ? "," : "", + (unsigned long long) errs); +} + +/* only called if r1 set */ +static void flush_fifo(struct qib_pportdata *ppd) +{ + struct qib_devdata *dd = ppd->dd; + u32 __iomem *piobuf; + u32 bufn; + u32 *hdr; + u64 pbc; + const unsigned hdrwords = 7; + static struct qib_ib_header ibhdr = { + .lrh[0] = cpu_to_be16(0xF000 | QIB_LRH_BTH), + .lrh[1] = IB_LID_PERMISSIVE, + .lrh[2] = cpu_to_be16(hdrwords + SIZE_OF_CRC), + .lrh[3] = IB_LID_PERMISSIVE, + .u.oth.bth[0] = cpu_to_be32( + (IB_OPCODE_UD_SEND_ONLY << 24) | QIB_DEFAULT_P_KEY), + .u.oth.bth[1] = cpu_to_be32(0), + .u.oth.bth[2] = cpu_to_be32(0), + .u.oth.u.ud.deth[0] = cpu_to_be32(0), + .u.oth.u.ud.deth[1] = cpu_to_be32(0), + }; + + /* + * Send a dummy VL15 packet to flush the launch FIFO. + * This will not actually be sent since the TxeBypassIbc bit is set. + */ + pbc = PBC_7322_VL15_SEND | + (((u64)ppd->hw_pidx) << (PBC_PORT_SEL_LSB + 32)) | + (hdrwords + SIZE_OF_CRC); + piobuf = qib_7322_getsendbuf(ppd, pbc, &bufn); + if (!piobuf) + return; + writeq(pbc, piobuf); + hdr = (u32 *) &ibhdr; + if (dd->flags & QIB_PIO_FLUSH_WC) { + qib_flush_wc(); + qib_pio_copy(piobuf + 2, hdr, hdrwords - 1); + qib_flush_wc(); + __raw_writel(hdr[hdrwords - 1], piobuf + hdrwords + 1); + qib_flush_wc(); + } else + qib_pio_copy(piobuf + 2, hdr, hdrwords); + qib_sendbuf_done(dd, bufn); +} + +/* + * This is called with interrupts disabled and sdma_lock held. + */ +static void qib_7322_sdma_sendctrl(struct qib_pportdata *ppd, unsigned op) +{ + struct qib_devdata *dd = ppd->dd; + u64 set_sendctrl = 0; + u64 clr_sendctrl = 0; + + if (op & QIB_SDMA_SENDCTRL_OP_ENABLE) + set_sendctrl |= SYM_MASK(SendCtrl_0, SDmaEnable); + else + clr_sendctrl |= SYM_MASK(SendCtrl_0, SDmaEnable); + + if (op & QIB_SDMA_SENDCTRL_OP_INTENABLE) + set_sendctrl |= SYM_MASK(SendCtrl_0, SDmaIntEnable); + else + clr_sendctrl |= SYM_MASK(SendCtrl_0, SDmaIntEnable); + + if (op & QIB_SDMA_SENDCTRL_OP_HALT) + set_sendctrl |= SYM_MASK(SendCtrl_0, SDmaHalt); + else + clr_sendctrl |= SYM_MASK(SendCtrl_0, SDmaHalt); + + if (op & QIB_SDMA_SENDCTRL_OP_DRAIN) + set_sendctrl |= SYM_MASK(SendCtrl_0, TxeBypassIbc) | + SYM_MASK(SendCtrl_0, TxeAbortIbc) | + SYM_MASK(SendCtrl_0, TxeDrainRmFifo); + else + clr_sendctrl |= SYM_MASK(SendCtrl_0, TxeBypassIbc) | + SYM_MASK(SendCtrl_0, TxeAbortIbc) | + SYM_MASK(SendCtrl_0, TxeDrainRmFifo); + + spin_lock(&dd->sendctrl_lock); + + /* If we are draining everything, block sends first */ + if (op & QIB_SDMA_SENDCTRL_OP_DRAIN) { + ppd->p_sendctrl &= ~SYM_MASK(SendCtrl_0, SendEnable); + qib_write_kreg_port(ppd, krp_sendctrl, ppd->p_sendctrl); + qib_write_kreg(dd, kr_scratch, 0); + } + + ppd->p_sendctrl |= set_sendctrl; + ppd->p_sendctrl &= ~clr_sendctrl; + + if (op & QIB_SDMA_SENDCTRL_OP_CLEANUP) + qib_write_kreg_port(ppd, krp_sendctrl, + ppd->p_sendctrl | + SYM_MASK(SendCtrl_0, SDmaCleanup)); + else + qib_write_kreg_port(ppd, krp_sendctrl, ppd->p_sendctrl); + qib_write_kreg(dd, kr_scratch, 0); + + if (op & QIB_SDMA_SENDCTRL_OP_DRAIN) { + ppd->p_sendctrl |= SYM_MASK(SendCtrl_0, SendEnable); + qib_write_kreg_port(ppd, krp_sendctrl, ppd->p_sendctrl); + qib_write_kreg(dd, kr_scratch, 0); + } + + spin_unlock(&dd->sendctrl_lock); + + if ((op & QIB_SDMA_SENDCTRL_OP_DRAIN) && ppd->dd->cspec->r1) + flush_fifo(ppd); +} + +static void qib_7322_sdma_hw_clean_up(struct qib_pportdata *ppd) +{ + __qib_sdma_process_event(ppd, qib_sdma_event_e50_hw_cleaned); +} + +static void qib_sdma_7322_setlengen(struct qib_pportdata *ppd) +{ + /* + * Set SendDmaLenGen and clear and set + * the MSB of the generation count to enable generation checking + * and load the internal generation counter. + */ + qib_write_kreg_port(ppd, krp_senddmalengen, ppd->sdma_descq_cnt); + qib_write_kreg_port(ppd, krp_senddmalengen, + ppd->sdma_descq_cnt | + (1ULL << QIB_7322_SendDmaLenGen_0_Generation_MSB)); +} + +/* + * Must be called with sdma_lock held, or before init finished. + */ +static void qib_sdma_update_7322_tail(struct qib_pportdata *ppd, u16 tail) +{ + /* Commit writes to memory and advance the tail on the chip */ + wmb(); + ppd->sdma_descq_tail = tail; + qib_write_kreg_port(ppd, krp_senddmatail, tail); +} + +/* + * This is called with interrupts disabled and sdma_lock held. + */ +static void qib_7322_sdma_hw_start_up(struct qib_pportdata *ppd) +{ + /* + * Drain all FIFOs. + * The hardware doesn't require this but we do it so that verbs + * and user applications don't wait for link active to send stale + * data. + */ + sendctrl_7322_mod(ppd, QIB_SENDCTRL_FLUSH); + + qib_sdma_7322_setlengen(ppd); + qib_sdma_update_7322_tail(ppd, 0); /* Set SendDmaTail */ + ppd->sdma_head_dma[0] = 0; + qib_7322_sdma_sendctrl(ppd, + ppd->sdma_state.current_op | QIB_SDMA_SENDCTRL_OP_CLEANUP); +} + +#define DISABLES_SDMA ( \ + QIB_E_P_SDMAHALT | \ + QIB_E_P_SDMADESCADDRMISALIGN | \ + QIB_E_P_SDMAMISSINGDW | \ + QIB_E_P_SDMADWEN | \ + QIB_E_P_SDMARPYTAG | \ + QIB_E_P_SDMA1STDESC | \ + QIB_E_P_SDMABASE | \ + QIB_E_P_SDMATAILOUTOFBOUND | \ + QIB_E_P_SDMAOUTOFBOUND | \ + QIB_E_P_SDMAGENMISMATCH) + +static void sdma_7322_p_errors(struct qib_pportdata *ppd, u64 errs) +{ + unsigned long flags; + struct qib_devdata *dd = ppd->dd; + + errs &= QIB_E_P_SDMAERRS; + err_decode(ppd->cpspec->sdmamsgbuf, sizeof(ppd->cpspec->sdmamsgbuf), + errs, qib_7322p_error_msgs); + + if (errs & QIB_E_P_SDMAUNEXPDATA) + qib_dev_err(dd, "IB%u:%u SDmaUnexpData\n", dd->unit, + ppd->port); + + spin_lock_irqsave(&ppd->sdma_lock, flags); + + if (errs != QIB_E_P_SDMAHALT) { + /* SDMA errors have QIB_E_P_SDMAHALT and another bit set */ + qib_dev_porterr(dd, ppd->port, + "SDMA %s 0x%016llx %s\n", + qib_sdma_state_names[ppd->sdma_state.current_state], + errs, ppd->cpspec->sdmamsgbuf); + dump_sdma_7322_state(ppd); + } + + switch (ppd->sdma_state.current_state) { + case qib_sdma_state_s00_hw_down: + break; + + case qib_sdma_state_s10_hw_start_up_wait: + if (errs & QIB_E_P_SDMAHALT) + __qib_sdma_process_event(ppd, + qib_sdma_event_e20_hw_started); + break; + + case qib_sdma_state_s20_idle: + break; + + case qib_sdma_state_s30_sw_clean_up_wait: + break; + + case qib_sdma_state_s40_hw_clean_up_wait: + if (errs & QIB_E_P_SDMAHALT) + __qib_sdma_process_event(ppd, + qib_sdma_event_e50_hw_cleaned); + break; + + case qib_sdma_state_s50_hw_halt_wait: + if (errs & QIB_E_P_SDMAHALT) + __qib_sdma_process_event(ppd, + qib_sdma_event_e60_hw_halted); + break; + + case qib_sdma_state_s99_running: + __qib_sdma_process_event(ppd, qib_sdma_event_e7322_err_halted); + __qib_sdma_process_event(ppd, qib_sdma_event_e60_hw_halted); + break; + } + + spin_unlock_irqrestore(&ppd->sdma_lock, flags); +} + +/* + * handle per-device errors (not per-port errors) + */ +static noinline void handle_7322_errors(struct qib_devdata *dd) +{ + char *msg; + u64 iserr = 0; + u64 errs; + u64 mask; + int log_idx; + + qib_stats.sps_errints++; + errs = qib_read_kreg64(dd, kr_errstatus); + if (!errs) { + qib_devinfo(dd->pcidev, + "device error interrupt, but no error bits set!\n"); + goto done; + } + + /* don't report errors that are masked */ + errs &= dd->cspec->errormask; + msg = dd->cspec->emsgbuf; + + /* do these first, they are most important */ + if (errs & QIB_E_HARDWARE) { + *msg = '\0'; + qib_7322_handle_hwerrors(dd, msg, sizeof dd->cspec->emsgbuf); + } else + for (log_idx = 0; log_idx < QIB_EEP_LOG_CNT; ++log_idx) + if (errs & dd->eep_st_masks[log_idx].errs_to_log) + qib_inc_eeprom_err(dd, log_idx, 1); + + if (errs & QIB_E_SPKTERRS) { + qib_disarm_7322_senderrbufs(dd->pport); + qib_stats.sps_txerrs++; + } else if (errs & QIB_E_INVALIDADDR) + qib_stats.sps_txerrs++; + else if (errs & QIB_E_ARMLAUNCH) { + qib_stats.sps_txerrs++; + qib_disarm_7322_senderrbufs(dd->pport); + } + qib_write_kreg(dd, kr_errclear, errs); + + /* + * The ones we mask off are handled specially below + * or above. Also mask SDMADISABLED by default as it + * is too chatty. + */ + mask = QIB_E_HARDWARE; + *msg = '\0'; + + err_decode(msg, sizeof dd->cspec->emsgbuf, errs & ~mask, + qib_7322error_msgs); + + /* + * Getting reset is a tragedy for all ports. Mark the device + * _and_ the ports as "offline" in way meaningful to each. + */ + if (errs & QIB_E_RESET) { + int pidx; + + qib_dev_err(dd, + "Got reset, requires re-init (unload and reload driver)\n"); + dd->flags &= ~QIB_INITTED; /* needs re-init */ + /* mark as having had error */ + *dd->devstatusp |= QIB_STATUS_HWERROR; + for (pidx = 0; pidx < dd->num_pports; ++pidx) + if (dd->pport[pidx].link_speed_supported) + *dd->pport[pidx].statusp &= ~QIB_STATUS_IB_CONF; + } + + if (*msg && iserr) + qib_dev_err(dd, "%s error\n", msg); + + /* + * If there were hdrq or egrfull errors, wake up any processes + * waiting in poll. We used to try to check which contexts had + * the overflow, but given the cost of that and the chip reads + * to support it, it's better to just wake everybody up if we + * get an overflow; waiters can poll again if it's not them. + */ + if (errs & (ERR_MASK(RcvEgrFullErr) | ERR_MASK(RcvHdrFullErr))) { + qib_handle_urcv(dd, ~0U); + if (errs & ERR_MASK(RcvEgrFullErr)) + qib_stats.sps_buffull++; + else + qib_stats.sps_hdrfull++; + } + +done: + return; +} + +static void qib_error_tasklet(unsigned long data) +{ + struct qib_devdata *dd = (struct qib_devdata *)data; + + handle_7322_errors(dd); + qib_write_kreg(dd, kr_errmask, dd->cspec->errormask); +} + +static void reenable_chase(unsigned long opaque) +{ + struct qib_pportdata *ppd = (struct qib_pportdata *)opaque; + + ppd->cpspec->chase_timer.expires = 0; + qib_set_ib_7322_lstate(ppd, QLOGIC_IB_IBCC_LINKCMD_DOWN, + QLOGIC_IB_IBCC_LINKINITCMD_POLL); +} + +static void disable_chase(struct qib_pportdata *ppd, unsigned long tnow, + u8 ibclt) +{ + ppd->cpspec->chase_end = 0; + + if (!qib_chase) + return; + + qib_set_ib_7322_lstate(ppd, QLOGIC_IB_IBCC_LINKCMD_DOWN, + QLOGIC_IB_IBCC_LINKINITCMD_DISABLE); + ppd->cpspec->chase_timer.expires = jiffies + QIB_CHASE_DIS_TIME; + add_timer(&ppd->cpspec->chase_timer); +} + +static void handle_serdes_issues(struct qib_pportdata *ppd, u64 ibcst) +{ + u8 ibclt; + unsigned long tnow; + + ibclt = (u8)SYM_FIELD(ibcst, IBCStatusA_0, LinkTrainingState); + + /* + * Detect and handle the state chase issue, where we can + * get stuck if we are unlucky on timing on both sides of + * the link. If we are, we disable, set a timer, and + * then re-enable. + */ + switch (ibclt) { + case IB_7322_LT_STATE_CFGRCVFCFG: + case IB_7322_LT_STATE_CFGWAITRMT: + case IB_7322_LT_STATE_TXREVLANES: + case IB_7322_LT_STATE_CFGENH: + tnow = jiffies; + if (ppd->cpspec->chase_end && + time_after(tnow, ppd->cpspec->chase_end)) + disable_chase(ppd, tnow, ibclt); + else if (!ppd->cpspec->chase_end) + ppd->cpspec->chase_end = tnow + QIB_CHASE_TIME; + break; + default: + ppd->cpspec->chase_end = 0; + break; + } + + if (((ibclt >= IB_7322_LT_STATE_CFGTEST && + ibclt <= IB_7322_LT_STATE_CFGWAITENH) || + ibclt == IB_7322_LT_STATE_LINKUP) && + (ibcst & SYM_MASK(IBCStatusA_0, LinkSpeedQDR))) { + force_h1(ppd); + ppd->cpspec->qdr_reforce = 1; + if (!ppd->dd->cspec->r1) + serdes_7322_los_enable(ppd, 0); + } else if (ppd->cpspec->qdr_reforce && + (ibcst & SYM_MASK(IBCStatusA_0, LinkSpeedQDR)) && + (ibclt == IB_7322_LT_STATE_CFGENH || + ibclt == IB_7322_LT_STATE_CFGIDLE || + ibclt == IB_7322_LT_STATE_LINKUP)) + force_h1(ppd); + + if ((IS_QMH(ppd->dd) || IS_QME(ppd->dd)) && + ppd->link_speed_enabled == QIB_IB_QDR && + (ibclt == IB_7322_LT_STATE_CFGTEST || + ibclt == IB_7322_LT_STATE_CFGENH || + (ibclt >= IB_7322_LT_STATE_POLLACTIVE && + ibclt <= IB_7322_LT_STATE_SLEEPQUIET))) + adj_tx_serdes(ppd); + + if (ibclt != IB_7322_LT_STATE_LINKUP) { + u8 ltstate = qib_7322_phys_portstate(ibcst); + u8 pibclt = (u8)SYM_FIELD(ppd->lastibcstat, IBCStatusA_0, + LinkTrainingState); + if (!ppd->dd->cspec->r1 && + pibclt == IB_7322_LT_STATE_LINKUP && + ltstate != IB_PHYSPORTSTATE_LINK_ERR_RECOVER && + ltstate != IB_PHYSPORTSTATE_RECOVERY_RETRAIN && + ltstate != IB_PHYSPORTSTATE_RECOVERY_WAITRMT && + ltstate != IB_PHYSPORTSTATE_RECOVERY_IDLE) + /* If the link went down (but no into recovery, + * turn LOS back on */ + serdes_7322_los_enable(ppd, 1); + if (!ppd->cpspec->qdr_dfe_on && + ibclt <= IB_7322_LT_STATE_SLEEPQUIET) { + ppd->cpspec->qdr_dfe_on = 1; + ppd->cpspec->qdr_dfe_time = 0; + /* On link down, reenable QDR adaptation */ + qib_write_kreg_port(ppd, krp_static_adapt_dis(2), + ppd->dd->cspec->r1 ? + QDR_STATIC_ADAPT_DOWN_R1 : + QDR_STATIC_ADAPT_DOWN); + pr_info( + "IB%u:%u re-enabled QDR adaptation ibclt %x\n", + ppd->dd->unit, ppd->port, ibclt); + } + } +} + +static int qib_7322_set_ib_cfg(struct qib_pportdata *, int, u32); + +/* + * This is per-pport error handling. + * will likely get it's own MSIx interrupt (one for each port, + * although just a single handler). + */ +static noinline void handle_7322_p_errors(struct qib_pportdata *ppd) +{ + char *msg; + u64 ignore_this_time = 0, iserr = 0, errs, fmask; + struct qib_devdata *dd = ppd->dd; + + /* do this as soon as possible */ + fmask = qib_read_kreg64(dd, kr_act_fmask); + if (!fmask) + check_7322_rxe_status(ppd); + + errs = qib_read_kreg_port(ppd, krp_errstatus); + if (!errs) + qib_devinfo(dd->pcidev, + "Port%d error interrupt, but no error bits set!\n", + ppd->port); + if (!fmask) + errs &= ~QIB_E_P_IBSTATUSCHANGED; + if (!errs) + goto done; + + msg = ppd->cpspec->epmsgbuf; + *msg = '\0'; + + if (errs & ~QIB_E_P_BITSEXTANT) { + err_decode(msg, sizeof ppd->cpspec->epmsgbuf, + errs & ~QIB_E_P_BITSEXTANT, qib_7322p_error_msgs); + if (!*msg) + snprintf(msg, sizeof ppd->cpspec->epmsgbuf, + "no others"); + qib_dev_porterr(dd, ppd->port, + "error interrupt with unknown errors 0x%016Lx set (and %s)\n", + (errs & ~QIB_E_P_BITSEXTANT), msg); + *msg = '\0'; + } + + if (errs & QIB_E_P_SHDR) { + u64 symptom; + + /* determine cause, then write to clear */ + symptom = qib_read_kreg_port(ppd, krp_sendhdrsymptom); + qib_write_kreg_port(ppd, krp_sendhdrsymptom, 0); + err_decode(msg, sizeof ppd->cpspec->epmsgbuf, symptom, + hdrchk_msgs); + *msg = '\0'; + /* senderrbuf cleared in SPKTERRS below */ + } + + if (errs & QIB_E_P_SPKTERRS) { + if ((errs & QIB_E_P_LINK_PKTERRS) && + !(ppd->lflags & QIBL_LINKACTIVE)) { + /* + * This can happen when trying to bring the link + * up, but the IB link changes state at the "wrong" + * time. The IB logic then complains that the packet + * isn't valid. We don't want to confuse people, so + * we just don't print them, except at debug + */ + err_decode(msg, sizeof ppd->cpspec->epmsgbuf, + (errs & QIB_E_P_LINK_PKTERRS), + qib_7322p_error_msgs); + *msg = '\0'; + ignore_this_time = errs & QIB_E_P_LINK_PKTERRS; + } + qib_disarm_7322_senderrbufs(ppd); + } else if ((errs & QIB_E_P_LINK_PKTERRS) && + !(ppd->lflags & QIBL_LINKACTIVE)) { + /* + * This can happen when SMA is trying to bring the link + * up, but the IB link changes state at the "wrong" time. + * The IB logic then complains that the packet isn't + * valid. We don't want to confuse people, so we just + * don't print them, except at debug + */ + err_decode(msg, sizeof ppd->cpspec->epmsgbuf, errs, + qib_7322p_error_msgs); + ignore_this_time = errs & QIB_E_P_LINK_PKTERRS; + *msg = '\0'; + } + + qib_write_kreg_port(ppd, krp_errclear, errs); + + errs &= ~ignore_this_time; + if (!errs) + goto done; + + if (errs & QIB_E_P_RPKTERRS) + qib_stats.sps_rcverrs++; + if (errs & QIB_E_P_SPKTERRS) + qib_stats.sps_txerrs++; + + iserr = errs & ~(QIB_E_P_RPKTERRS | QIB_E_P_PKTERRS); + + if (errs & QIB_E_P_SDMAERRS) + sdma_7322_p_errors(ppd, errs); + + if (errs & QIB_E_P_IBSTATUSCHANGED) { + u64 ibcs; + u8 ltstate; + + ibcs = qib_read_kreg_port(ppd, krp_ibcstatus_a); + ltstate = qib_7322_phys_portstate(ibcs); + + if (!(ppd->lflags & QIBL_IB_AUTONEG_INPROG)) + handle_serdes_issues(ppd, ibcs); + if (!(ppd->cpspec->ibcctrl_a & + SYM_MASK(IBCCtrlA_0, IBStatIntReductionEn))) { + /* + * We got our interrupt, so init code should be + * happy and not try alternatives. Now squelch + * other "chatter" from link-negotiation (pre Init) + */ + ppd->cpspec->ibcctrl_a |= + SYM_MASK(IBCCtrlA_0, IBStatIntReductionEn); + qib_write_kreg_port(ppd, krp_ibcctrl_a, + ppd->cpspec->ibcctrl_a); + } + + /* Update our picture of width and speed from chip */ + ppd->link_width_active = + (ibcs & SYM_MASK(IBCStatusA_0, LinkWidthActive)) ? + IB_WIDTH_4X : IB_WIDTH_1X; + ppd->link_speed_active = (ibcs & SYM_MASK(IBCStatusA_0, + LinkSpeedQDR)) ? QIB_IB_QDR : (ibcs & + SYM_MASK(IBCStatusA_0, LinkSpeedActive)) ? + QIB_IB_DDR : QIB_IB_SDR; + + if ((ppd->lflags & QIBL_IB_LINK_DISABLED) && ltstate != + IB_PHYSPORTSTATE_DISABLED) + qib_set_ib_7322_lstate(ppd, 0, + QLOGIC_IB_IBCC_LINKINITCMD_DISABLE); + else + /* + * Since going into a recovery state causes the link + * state to go down and since recovery is transitory, + * it is better if we "miss" ever seeing the link + * training state go into recovery (i.e., ignore this + * transition for link state special handling purposes) + * without updating lastibcstat. + */ + if (ltstate != IB_PHYSPORTSTATE_LINK_ERR_RECOVER && + ltstate != IB_PHYSPORTSTATE_RECOVERY_RETRAIN && + ltstate != IB_PHYSPORTSTATE_RECOVERY_WAITRMT && + ltstate != IB_PHYSPORTSTATE_RECOVERY_IDLE) + qib_handle_e_ibstatuschanged(ppd, ibcs); + } + if (*msg && iserr) + qib_dev_porterr(dd, ppd->port, "%s error\n", msg); + + if (ppd->state_wanted & ppd->lflags) + wake_up_interruptible(&ppd->state_wait); +done: + return; +} + +/* enable/disable chip from delivering interrupts */ +static void qib_7322_set_intr_state(struct qib_devdata *dd, u32 enable) +{ + if (enable) { + if (dd->flags & QIB_BADINTR) + return; + qib_write_kreg(dd, kr_intmask, dd->cspec->int_enable_mask); + /* cause any pending enabled interrupts to be re-delivered */ + qib_write_kreg(dd, kr_intclear, 0ULL); + if (dd->cspec->num_msix_entries) { + /* and same for MSIx */ + u64 val = qib_read_kreg64(dd, kr_intgranted); + if (val) + qib_write_kreg(dd, kr_intgranted, val); + } + } else + qib_write_kreg(dd, kr_intmask, 0ULL); +} + +/* + * Try to cleanup as much as possible for anything that might have gone + * wrong while in freeze mode, such as pio buffers being written by user + * processes (causing armlaunch), send errors due to going into freeze mode, + * etc., and try to avoid causing extra interrupts while doing so. + * Forcibly update the in-memory pioavail register copies after cleanup + * because the chip won't do it while in freeze mode (the register values + * themselves are kept correct). + * Make sure that we don't lose any important interrupts by using the chip + * feature that says that writing 0 to a bit in *clear that is set in + * *status will cause an interrupt to be generated again (if allowed by + * the *mask value). + * This is in chip-specific code because of all of the register accesses, + * even though the details are similar on most chips. + */ +static void qib_7322_clear_freeze(struct qib_devdata *dd) +{ + int pidx; + + /* disable error interrupts, to avoid confusion */ + qib_write_kreg(dd, kr_errmask, 0ULL); + + for (pidx = 0; pidx < dd->num_pports; ++pidx) + if (dd->pport[pidx].link_speed_supported) + qib_write_kreg_port(dd->pport + pidx, krp_errmask, + 0ULL); + + /* also disable interrupts; errormask is sometimes overwriten */ + qib_7322_set_intr_state(dd, 0); + + /* clear the freeze, and be sure chip saw it */ + qib_write_kreg(dd, kr_control, dd->control); + qib_read_kreg32(dd, kr_scratch); + + /* + * Force new interrupt if any hwerr, error or interrupt bits are + * still set, and clear "safe" send packet errors related to freeze + * and cancelling sends. Re-enable error interrupts before possible + * force of re-interrupt on pending interrupts. + */ + qib_write_kreg(dd, kr_hwerrclear, 0ULL); + qib_write_kreg(dd, kr_errclear, E_SPKT_ERRS_IGNORE); + qib_write_kreg(dd, kr_errmask, dd->cspec->errormask); + /* We need to purge per-port errs and reset mask, too */ + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + if (!dd->pport[pidx].link_speed_supported) + continue; + qib_write_kreg_port(dd->pport + pidx, krp_errclear, ~0Ull); + qib_write_kreg_port(dd->pport + pidx, krp_errmask, ~0Ull); + } + qib_7322_set_intr_state(dd, 1); +} + +/* no error handling to speak of */ +/** + * qib_7322_handle_hwerrors - display hardware errors. + * @dd: the qlogic_ib device + * @msg: the output buffer + * @msgl: the size of the output buffer + * + * Use same msg buffer as regular errors to avoid excessive stack + * use. Most hardware errors are catastrophic, but for right now, + * we'll print them and continue. We reuse the same message buffer as + * qib_handle_errors() to avoid excessive stack usage. + */ +static void qib_7322_handle_hwerrors(struct qib_devdata *dd, char *msg, + size_t msgl) +{ + u64 hwerrs; + u32 ctrl; + int isfatal = 0; + + hwerrs = qib_read_kreg64(dd, kr_hwerrstatus); + if (!hwerrs) + goto bail; + if (hwerrs == ~0ULL) { + qib_dev_err(dd, + "Read of hardware error status failed (all bits set); ignoring\n"); + goto bail; + } + qib_stats.sps_hwerrs++; + + /* Always clear the error status register, except BIST fail */ + qib_write_kreg(dd, kr_hwerrclear, hwerrs & + ~HWE_MASK(PowerOnBISTFailed)); + + hwerrs &= dd->cspec->hwerrmask; + + /* no EEPROM logging, yet */ + + if (hwerrs) + qib_devinfo(dd->pcidev, + "Hardware error: hwerr=0x%llx (cleared)\n", + (unsigned long long) hwerrs); + + ctrl = qib_read_kreg32(dd, kr_control); + if ((ctrl & SYM_MASK(Control, FreezeMode)) && !dd->diag_client) { + /* + * No recovery yet... + */ + if ((hwerrs & ~HWE_MASK(LATriggered)) || + dd->cspec->stay_in_freeze) { + /* + * If any set that we aren't ignoring only make the + * complaint once, in case it's stuck or recurring, + * and we get here multiple times + * Force link down, so switch knows, and + * LEDs are turned off. + */ + if (dd->flags & QIB_INITTED) + isfatal = 1; + } else + qib_7322_clear_freeze(dd); + } + + if (hwerrs & HWE_MASK(PowerOnBISTFailed)) { + isfatal = 1; + strlcpy(msg, + "[Memory BIST test failed, InfiniPath hardware unusable]", + msgl); + /* ignore from now on, so disable until driver reloaded */ + dd->cspec->hwerrmask &= ~HWE_MASK(PowerOnBISTFailed); + qib_write_kreg(dd, kr_hwerrmask, dd->cspec->hwerrmask); + } + + err_decode(msg, msgl, hwerrs, qib_7322_hwerror_msgs); + + /* Ignore esoteric PLL failures et al. */ + + qib_dev_err(dd, "%s hardware error\n", msg); + + if (hwerrs & + (SYM_MASK(HwErrMask, SDmaMemReadErrMask_0) | + SYM_MASK(HwErrMask, SDmaMemReadErrMask_1))) { + int pidx = 0; + int err; + unsigned long flags; + struct qib_pportdata *ppd = dd->pport; + for (; pidx < dd->num_pports; ++pidx, ppd++) { + err = 0; + if (pidx == 0 && (hwerrs & + SYM_MASK(HwErrMask, SDmaMemReadErrMask_0))) + err++; + if (pidx == 1 && (hwerrs & + SYM_MASK(HwErrMask, SDmaMemReadErrMask_1))) + err++; + if (err) { + spin_lock_irqsave(&ppd->sdma_lock, flags); + dump_sdma_7322_state(ppd); + spin_unlock_irqrestore(&ppd->sdma_lock, flags); + } + } + } + + if (isfatal && !dd->diag_client) { + qib_dev_err(dd, + "Fatal Hardware Error, no longer usable, SN %.16s\n", + dd->serial); + /* + * for /sys status file and user programs to print; if no + * trailing brace is copied, we'll know it was truncated. + */ + if (dd->freezemsg) + snprintf(dd->freezemsg, dd->freezelen, + "{%s}", msg); + qib_disable_after_error(dd); + } +bail:; +} + +/** + * qib_7322_init_hwerrors - enable hardware errors + * @dd: the qlogic_ib device + * + * now that we have finished initializing everything that might reasonably + * cause a hardware error, and cleared those errors bits as they occur, + * we can enable hardware errors in the mask (potentially enabling + * freeze mode), and enable hardware errors as errors (along with + * everything else) in errormask + */ +static void qib_7322_init_hwerrors(struct qib_devdata *dd) +{ + int pidx; + u64 extsval; + + extsval = qib_read_kreg64(dd, kr_extstatus); + if (!(extsval & (QIB_EXTS_MEMBIST_DISABLED | + QIB_EXTS_MEMBIST_ENDTEST))) + qib_dev_err(dd, "MemBIST did not complete!\n"); + + /* never clear BIST failure, so reported on each driver load */ + qib_write_kreg(dd, kr_hwerrclear, ~HWE_MASK(PowerOnBISTFailed)); + qib_write_kreg(dd, kr_hwerrmask, dd->cspec->hwerrmask); + + /* clear all */ + qib_write_kreg(dd, kr_errclear, ~0ULL); + /* enable errors that are masked, at least this first time. */ + qib_write_kreg(dd, kr_errmask, ~0ULL); + dd->cspec->errormask = qib_read_kreg64(dd, kr_errmask); + for (pidx = 0; pidx < dd->num_pports; ++pidx) + if (dd->pport[pidx].link_speed_supported) + qib_write_kreg_port(dd->pport + pidx, krp_errmask, + ~0ULL); +} + +/* + * Disable and enable the armlaunch error. Used for PIO bandwidth testing + * on chips that are count-based, rather than trigger-based. There is no + * reference counting, but that's also fine, given the intended use. + * Only chip-specific because it's all register accesses + */ +static void qib_set_7322_armlaunch(struct qib_devdata *dd, u32 enable) +{ + if (enable) { + qib_write_kreg(dd, kr_errclear, QIB_E_SPIOARMLAUNCH); + dd->cspec->errormask |= QIB_E_SPIOARMLAUNCH; + } else + dd->cspec->errormask &= ~QIB_E_SPIOARMLAUNCH; + qib_write_kreg(dd, kr_errmask, dd->cspec->errormask); +} + +/* + * Formerly took parameter in pre-shifted, + * pre-merged form with LinkCmd and LinkInitCmd + * together, and assuming the zero was NOP. + */ +static void qib_set_ib_7322_lstate(struct qib_pportdata *ppd, u16 linkcmd, + u16 linitcmd) +{ + u64 mod_wd; + struct qib_devdata *dd = ppd->dd; + unsigned long flags; + + if (linitcmd == QLOGIC_IB_IBCC_LINKINITCMD_DISABLE) { + /* + * If we are told to disable, note that so link-recovery + * code does not attempt to bring us back up. + * Also reset everything that we can, so we start + * completely clean when re-enabled (before we + * actually issue the disable to the IBC) + */ + qib_7322_mini_pcs_reset(ppd); + spin_lock_irqsave(&ppd->lflags_lock, flags); + ppd->lflags |= QIBL_IB_LINK_DISABLED; + spin_unlock_irqrestore(&ppd->lflags_lock, flags); + } else if (linitcmd || linkcmd == QLOGIC_IB_IBCC_LINKCMD_DOWN) { + /* + * Any other linkinitcmd will lead to LINKDOWN and then + * to INIT (if all is well), so clear flag to let + * link-recovery code attempt to bring us back up. + */ + spin_lock_irqsave(&ppd->lflags_lock, flags); + ppd->lflags &= ~QIBL_IB_LINK_DISABLED; + spin_unlock_irqrestore(&ppd->lflags_lock, flags); + /* + * Clear status change interrupt reduction so the + * new state is seen. + */ + ppd->cpspec->ibcctrl_a &= + ~SYM_MASK(IBCCtrlA_0, IBStatIntReductionEn); + } + + mod_wd = (linkcmd << IBA7322_IBCC_LINKCMD_SHIFT) | + (linitcmd << QLOGIC_IB_IBCC_LINKINITCMD_SHIFT); + + qib_write_kreg_port(ppd, krp_ibcctrl_a, ppd->cpspec->ibcctrl_a | + mod_wd); + /* write to chip to prevent back-to-back writes of ibc reg */ + qib_write_kreg(dd, kr_scratch, 0); + +} + +/* + * The total RCV buffer memory is 64KB, used for both ports, and is + * in units of 64 bytes (same as IB flow control credit unit). + * The consumedVL unit in the same registers are in 32 byte units! + * So, a VL15 packet needs 4.50 IB credits, and 9 rx buffer chunks, + * and we can therefore allocate just 9 IB credits for 2 VL15 packets + * in krp_rxcreditvl15, rather than 10. + */ +#define RCV_BUF_UNITSZ 64 +#define NUM_RCV_BUF_UNITS(dd) ((64 * 1024) / (RCV_BUF_UNITSZ * dd->num_pports)) + +static void set_vls(struct qib_pportdata *ppd) +{ + int i, numvls, totcred, cred_vl, vl0extra; + struct qib_devdata *dd = ppd->dd; + u64 val; + + numvls = qib_num_vls(ppd->vls_operational); + + /* + * Set up per-VL credits. Below is kluge based on these assumptions: + * 1) port is disabled at the time early_init is called. + * 2) give VL15 17 credits, for two max-plausible packets. + * 3) Give VL0-N the rest, with any rounding excess used for VL0 + */ + /* 2 VL15 packets @ 288 bytes each (including IB headers) */ + totcred = NUM_RCV_BUF_UNITS(dd); + cred_vl = (2 * 288 + RCV_BUF_UNITSZ - 1) / RCV_BUF_UNITSZ; + totcred -= cred_vl; + qib_write_kreg_port(ppd, krp_rxcreditvl15, (u64) cred_vl); + cred_vl = totcred / numvls; + vl0extra = totcred - cred_vl * numvls; + qib_write_kreg_port(ppd, krp_rxcreditvl0, cred_vl + vl0extra); + for (i = 1; i < numvls; i++) + qib_write_kreg_port(ppd, krp_rxcreditvl0 + i, cred_vl); + for (; i < 8; i++) /* no buffer space for other VLs */ + qib_write_kreg_port(ppd, krp_rxcreditvl0 + i, 0); + + /* Notify IBC that credits need to be recalculated */ + val = qib_read_kreg_port(ppd, krp_ibsdtestiftx); + val |= SYM_MASK(IB_SDTEST_IF_TX_0, CREDIT_CHANGE); + qib_write_kreg_port(ppd, krp_ibsdtestiftx, val); + qib_write_kreg(dd, kr_scratch, 0ULL); + val &= ~SYM_MASK(IB_SDTEST_IF_TX_0, CREDIT_CHANGE); + qib_write_kreg_port(ppd, krp_ibsdtestiftx, val); + + for (i = 0; i < numvls; i++) + val = qib_read_kreg_port(ppd, krp_rxcreditvl0 + i); + val = qib_read_kreg_port(ppd, krp_rxcreditvl15); + + /* Change the number of operational VLs */ + ppd->cpspec->ibcctrl_a = (ppd->cpspec->ibcctrl_a & + ~SYM_MASK(IBCCtrlA_0, NumVLane)) | + ((u64)(numvls - 1) << SYM_LSB(IBCCtrlA_0, NumVLane)); + qib_write_kreg_port(ppd, krp_ibcctrl_a, ppd->cpspec->ibcctrl_a); + qib_write_kreg(dd, kr_scratch, 0ULL); +} + +/* + * The code that deals with actual SerDes is in serdes_7322_init(). + * Compared to the code for iba7220, it is minimal. + */ +static int serdes_7322_init(struct qib_pportdata *ppd); + +/** + * qib_7322_bringup_serdes - bring up the serdes + * @ppd: physical port on the qlogic_ib device + */ +static int qib_7322_bringup_serdes(struct qib_pportdata *ppd) +{ + struct qib_devdata *dd = ppd->dd; + u64 val, guid, ibc; + unsigned long flags; + int ret = 0; + + /* + * SerDes model not in Pd, but still need to + * set up much of IBCCtrl and IBCDDRCtrl; move elsewhere + * eventually. + */ + /* Put IBC in reset, sends disabled (should be in reset already) */ + ppd->cpspec->ibcctrl_a &= ~SYM_MASK(IBCCtrlA_0, IBLinkEn); + qib_write_kreg_port(ppd, krp_ibcctrl_a, ppd->cpspec->ibcctrl_a); + qib_write_kreg(dd, kr_scratch, 0ULL); + + /* ensure previous Tx parameters are not still forced */ + qib_write_kreg_port(ppd, krp_tx_deemph_override, + SYM_MASK(IBSD_TX_DEEMPHASIS_OVERRIDE_0, + reset_tx_deemphasis_override)); + + if (qib_compat_ddr_negotiate) { + ppd->cpspec->ibdeltainprog = 1; + ppd->cpspec->ibsymsnap = read_7322_creg32_port(ppd, + crp_ibsymbolerr); + ppd->cpspec->iblnkerrsnap = read_7322_creg32_port(ppd, + crp_iblinkerrrecov); + } + + /* flowcontrolwatermark is in units of KBytes */ + ibc = 0x5ULL << SYM_LSB(IBCCtrlA_0, FlowCtrlWaterMark); + /* + * Flow control is sent this often, even if no changes in + * buffer space occur. Units are 128ns for this chip. + * Set to 3usec. + */ + ibc |= 24ULL << SYM_LSB(IBCCtrlA_0, FlowCtrlPeriod); + /* max error tolerance */ + ibc |= 0xfULL << SYM_LSB(IBCCtrlA_0, PhyerrThreshold); + /* IB credit flow control. */ + ibc |= 0xfULL << SYM_LSB(IBCCtrlA_0, OverrunThreshold); + /* + * set initial max size pkt IBC will send, including ICRC; it's the + * PIO buffer size in dwords, less 1; also see qib_set_mtu() + */ + ibc |= ((u64)(ppd->ibmaxlen >> 2) + 1) << + SYM_LSB(IBCCtrlA_0, MaxPktLen); + ppd->cpspec->ibcctrl_a = ibc; /* without linkcmd or linkinitcmd! */ + + /* + * Reset the PCS interface to the serdes (and also ibc, which is still + * in reset from above). Writes new value of ibcctrl_a as last step. + */ + qib_7322_mini_pcs_reset(ppd); + + if (!ppd->cpspec->ibcctrl_b) { + unsigned lse = ppd->link_speed_enabled; + + /* + * Not on re-init after reset, establish shadow + * and force initial config. + */ + ppd->cpspec->ibcctrl_b = qib_read_kreg_port(ppd, + krp_ibcctrl_b); + ppd->cpspec->ibcctrl_b &= ~(IBA7322_IBC_SPEED_QDR | + IBA7322_IBC_SPEED_DDR | + IBA7322_IBC_SPEED_SDR | + IBA7322_IBC_WIDTH_AUTONEG | + SYM_MASK(IBCCtrlB_0, IB_LANE_REV_SUPPORTED)); + if (lse & (lse - 1)) /* Muliple speeds enabled */ + ppd->cpspec->ibcctrl_b |= + (lse << IBA7322_IBC_SPEED_LSB) | + IBA7322_IBC_IBTA_1_2_MASK | + IBA7322_IBC_MAX_SPEED_MASK; + else + ppd->cpspec->ibcctrl_b |= (lse == QIB_IB_QDR) ? + IBA7322_IBC_SPEED_QDR | + IBA7322_IBC_IBTA_1_2_MASK : + (lse == QIB_IB_DDR) ? + IBA7322_IBC_SPEED_DDR : + IBA7322_IBC_SPEED_SDR; + if ((ppd->link_width_enabled & (IB_WIDTH_1X | IB_WIDTH_4X)) == + (IB_WIDTH_1X | IB_WIDTH_4X)) + ppd->cpspec->ibcctrl_b |= IBA7322_IBC_WIDTH_AUTONEG; + else + ppd->cpspec->ibcctrl_b |= + ppd->link_width_enabled == IB_WIDTH_4X ? + IBA7322_IBC_WIDTH_4X_ONLY : + IBA7322_IBC_WIDTH_1X_ONLY; + + /* always enable these on driver reload, not sticky */ + ppd->cpspec->ibcctrl_b |= (IBA7322_IBC_RXPOL_MASK | + IBA7322_IBC_HRTBT_MASK); + } + qib_write_kreg_port(ppd, krp_ibcctrl_b, ppd->cpspec->ibcctrl_b); + + /* setup so we have more time at CFGTEST to change H1 */ + val = qib_read_kreg_port(ppd, krp_ibcctrl_c); + val &= ~SYM_MASK(IBCCtrlC_0, IB_FRONT_PORCH); + val |= 0xfULL << SYM_LSB(IBCCtrlC_0, IB_FRONT_PORCH); + qib_write_kreg_port(ppd, krp_ibcctrl_c, val); + + serdes_7322_init(ppd); + + guid = be64_to_cpu(ppd->guid); + if (!guid) { + if (dd->base_guid) + guid = be64_to_cpu(dd->base_guid) + ppd->port - 1; + ppd->guid = cpu_to_be64(guid); + } + + qib_write_kreg_port(ppd, krp_hrtbt_guid, guid); + /* write to chip to prevent back-to-back writes of ibc reg */ + qib_write_kreg(dd, kr_scratch, 0); + + /* Enable port */ + ppd->cpspec->ibcctrl_a |= SYM_MASK(IBCCtrlA_0, IBLinkEn); + set_vls(ppd); + + /* initially come up DISABLED, without sending anything. */ + val = ppd->cpspec->ibcctrl_a | (QLOGIC_IB_IBCC_LINKINITCMD_DISABLE << + QLOGIC_IB_IBCC_LINKINITCMD_SHIFT); + qib_write_kreg_port(ppd, krp_ibcctrl_a, val); + qib_write_kreg(dd, kr_scratch, 0ULL); + /* clear the linkinit cmds */ + ppd->cpspec->ibcctrl_a = val & ~SYM_MASK(IBCCtrlA_0, LinkInitCmd); + + /* be paranoid against later code motion, etc. */ + spin_lock_irqsave(&dd->cspec->rcvmod_lock, flags); + ppd->p_rcvctrl |= SYM_MASK(RcvCtrl_0, RcvIBPortEnable); + qib_write_kreg_port(ppd, krp_rcvctrl, ppd->p_rcvctrl); + spin_unlock_irqrestore(&dd->cspec->rcvmod_lock, flags); + + /* Also enable IBSTATUSCHG interrupt. */ + val = qib_read_kreg_port(ppd, krp_errmask); + qib_write_kreg_port(ppd, krp_errmask, + val | ERR_MASK_N(IBStatusChanged)); + + /* Always zero until we start messing with SerDes for real */ + return ret; +} + +/** + * qib_7322_quiet_serdes - set serdes to txidle + * @dd: the qlogic_ib device + * Called when driver is being unloaded + */ +static void qib_7322_mini_quiet_serdes(struct qib_pportdata *ppd) +{ + u64 val; + unsigned long flags; + + qib_set_ib_7322_lstate(ppd, 0, QLOGIC_IB_IBCC_LINKINITCMD_DISABLE); + + spin_lock_irqsave(&ppd->lflags_lock, flags); + ppd->lflags &= ~QIBL_IB_AUTONEG_INPROG; + spin_unlock_irqrestore(&ppd->lflags_lock, flags); + wake_up(&ppd->cpspec->autoneg_wait); + cancel_delayed_work_sync(&ppd->cpspec->autoneg_work); + if (ppd->dd->cspec->r1) + cancel_delayed_work_sync(&ppd->cpspec->ipg_work); + + ppd->cpspec->chase_end = 0; + if (ppd->cpspec->chase_timer.data) /* if initted */ + del_timer_sync(&ppd->cpspec->chase_timer); + + /* + * Despite the name, actually disables IBC as well. Do it when + * we are as sure as possible that no more packets can be + * received, following the down and the PCS reset. + * The actual disabling happens in qib_7322_mini_pci_reset(), + * along with the PCS being reset. + */ + ppd->cpspec->ibcctrl_a &= ~SYM_MASK(IBCCtrlA_0, IBLinkEn); + qib_7322_mini_pcs_reset(ppd); + + /* + * Update the adjusted counters so the adjustment persists + * across driver reload. + */ + if (ppd->cpspec->ibsymdelta || ppd->cpspec->iblnkerrdelta || + ppd->cpspec->ibdeltainprog || ppd->cpspec->iblnkdowndelta) { + struct qib_devdata *dd = ppd->dd; + u64 diagc; + + /* enable counter writes */ + diagc = qib_read_kreg64(dd, kr_hwdiagctrl); + qib_write_kreg(dd, kr_hwdiagctrl, + diagc | SYM_MASK(HwDiagCtrl, CounterWrEnable)); + + if (ppd->cpspec->ibsymdelta || ppd->cpspec->ibdeltainprog) { + val = read_7322_creg32_port(ppd, crp_ibsymbolerr); + if (ppd->cpspec->ibdeltainprog) + val -= val - ppd->cpspec->ibsymsnap; + val -= ppd->cpspec->ibsymdelta; + write_7322_creg_port(ppd, crp_ibsymbolerr, val); + } + if (ppd->cpspec->iblnkerrdelta || ppd->cpspec->ibdeltainprog) { + val = read_7322_creg32_port(ppd, crp_iblinkerrrecov); + if (ppd->cpspec->ibdeltainprog) + val -= val - ppd->cpspec->iblnkerrsnap; + val -= ppd->cpspec->iblnkerrdelta; + write_7322_creg_port(ppd, crp_iblinkerrrecov, val); + } + if (ppd->cpspec->iblnkdowndelta) { + val = read_7322_creg32_port(ppd, crp_iblinkdown); + val += ppd->cpspec->iblnkdowndelta; + write_7322_creg_port(ppd, crp_iblinkdown, val); + } + /* + * No need to save ibmalfdelta since IB perfcounters + * are cleared on driver reload. + */ + + /* and disable counter writes */ + qib_write_kreg(dd, kr_hwdiagctrl, diagc); + } +} + +/** + * qib_setup_7322_setextled - set the state of the two external LEDs + * @ppd: physical port on the qlogic_ib device + * @on: whether the link is up or not + * + * The exact combo of LEDs if on is true is determined by looking + * at the ibcstatus. + * + * These LEDs indicate the physical and logical state of IB link. + * For this chip (at least with recommended board pinouts), LED1 + * is Yellow (logical state) and LED2 is Green (physical state), + * + * Note: We try to match the Mellanox HCA LED behavior as best + * we can. Green indicates physical link state is OK (something is + * plugged in, and we can train). + * Amber indicates the link is logically up (ACTIVE). + * Mellanox further blinks the amber LED to indicate data packet + * activity, but we have no hardware support for that, so it would + * require waking up every 10-20 msecs and checking the counters + * on the chip, and then turning the LED off if appropriate. That's + * visible overhead, so not something we will do. + */ +static void qib_setup_7322_setextled(struct qib_pportdata *ppd, u32 on) +{ + struct qib_devdata *dd = ppd->dd; + u64 extctl, ledblink = 0, val; + unsigned long flags; + int yel, grn; + + /* + * The diags use the LED to indicate diag info, so we leave + * the external LED alone when the diags are running. + */ + if (dd->diag_client) + return; + + /* Allow override of LED display for, e.g. Locating system in rack */ + if (ppd->led_override) { + grn = (ppd->led_override & QIB_LED_PHYS); + yel = (ppd->led_override & QIB_LED_LOG); + } else if (on) { + val = qib_read_kreg_port(ppd, krp_ibcstatus_a); + grn = qib_7322_phys_portstate(val) == + IB_PHYSPORTSTATE_LINKUP; + yel = qib_7322_iblink_state(val) == IB_PORT_ACTIVE; + } else { + grn = 0; + yel = 0; + } + + spin_lock_irqsave(&dd->cspec->gpio_lock, flags); + extctl = dd->cspec->extctrl & (ppd->port == 1 ? + ~ExtLED_IB1_MASK : ~ExtLED_IB2_MASK); + if (grn) { + extctl |= ppd->port == 1 ? ExtLED_IB1_GRN : ExtLED_IB2_GRN; + /* + * Counts are in chip clock (4ns) periods. + * This is 1/16 sec (66.6ms) on, + * 3/16 sec (187.5 ms) off, with packets rcvd. + */ + ledblink = ((66600 * 1000UL / 4) << IBA7322_LEDBLINK_ON_SHIFT) | + ((187500 * 1000UL / 4) << IBA7322_LEDBLINK_OFF_SHIFT); + } + if (yel) + extctl |= ppd->port == 1 ? ExtLED_IB1_YEL : ExtLED_IB2_YEL; + dd->cspec->extctrl = extctl; + qib_write_kreg(dd, kr_extctrl, dd->cspec->extctrl); + spin_unlock_irqrestore(&dd->cspec->gpio_lock, flags); + + if (ledblink) /* blink the LED on packet receive */ + qib_write_kreg_port(ppd, krp_rcvpktledcnt, ledblink); +} + +#ifdef CONFIG_INFINIBAND_QIB_DCA + +static int qib_7322_notify_dca(struct qib_devdata *dd, unsigned long event) +{ + switch (event) { + case DCA_PROVIDER_ADD: + if (dd->flags & QIB_DCA_ENABLED) + break; + if (!dca_add_requester(&dd->pcidev->dev)) { + qib_devinfo(dd->pcidev, "DCA enabled\n"); + dd->flags |= QIB_DCA_ENABLED; + qib_setup_dca(dd); + } + break; + case DCA_PROVIDER_REMOVE: + if (dd->flags & QIB_DCA_ENABLED) { + dca_remove_requester(&dd->pcidev->dev); + dd->flags &= ~QIB_DCA_ENABLED; + dd->cspec->dca_ctrl = 0; + qib_write_kreg(dd, KREG_IDX(DCACtrlA), + dd->cspec->dca_ctrl); + } + break; + } + return 0; +} + +static void qib_update_rhdrq_dca(struct qib_ctxtdata *rcd, int cpu) +{ + struct qib_devdata *dd = rcd->dd; + struct qib_chip_specific *cspec = dd->cspec; + + if (!(dd->flags & QIB_DCA_ENABLED)) + return; + if (cspec->rhdr_cpu[rcd->ctxt] != cpu) { + const struct dca_reg_map *rmp; + + cspec->rhdr_cpu[rcd->ctxt] = cpu; + rmp = &dca_rcvhdr_reg_map[rcd->ctxt]; + cspec->dca_rcvhdr_ctrl[rmp->shadow_inx] &= rmp->mask; + cspec->dca_rcvhdr_ctrl[rmp->shadow_inx] |= + (u64) dca3_get_tag(&dd->pcidev->dev, cpu) << rmp->lsb; + qib_devinfo(dd->pcidev, + "Ctxt %d cpu %d dca %llx\n", rcd->ctxt, cpu, + (long long) cspec->dca_rcvhdr_ctrl[rmp->shadow_inx]); + qib_write_kreg(dd, rmp->regno, + cspec->dca_rcvhdr_ctrl[rmp->shadow_inx]); + cspec->dca_ctrl |= SYM_MASK(DCACtrlA, RcvHdrqDCAEnable); + qib_write_kreg(dd, KREG_IDX(DCACtrlA), cspec->dca_ctrl); + } +} + +static void qib_update_sdma_dca(struct qib_pportdata *ppd, int cpu) +{ + struct qib_devdata *dd = ppd->dd; + struct qib_chip_specific *cspec = dd->cspec; + unsigned pidx = ppd->port - 1; + + if (!(dd->flags & QIB_DCA_ENABLED)) + return; + if (cspec->sdma_cpu[pidx] != cpu) { + cspec->sdma_cpu[pidx] = cpu; + cspec->dca_rcvhdr_ctrl[4] &= ~(ppd->hw_pidx ? + SYM_MASK(DCACtrlF, SendDma1DCAOPH) : + SYM_MASK(DCACtrlF, SendDma0DCAOPH)); + cspec->dca_rcvhdr_ctrl[4] |= + (u64) dca3_get_tag(&dd->pcidev->dev, cpu) << + (ppd->hw_pidx ? + SYM_LSB(DCACtrlF, SendDma1DCAOPH) : + SYM_LSB(DCACtrlF, SendDma0DCAOPH)); + qib_devinfo(dd->pcidev, + "sdma %d cpu %d dca %llx\n", ppd->hw_pidx, cpu, + (long long) cspec->dca_rcvhdr_ctrl[4]); + qib_write_kreg(dd, KREG_IDX(DCACtrlF), + cspec->dca_rcvhdr_ctrl[4]); + cspec->dca_ctrl |= ppd->hw_pidx ? + SYM_MASK(DCACtrlA, SendDMAHead1DCAEnable) : + SYM_MASK(DCACtrlA, SendDMAHead0DCAEnable); + qib_write_kreg(dd, KREG_IDX(DCACtrlA), cspec->dca_ctrl); + } +} + +static void qib_setup_dca(struct qib_devdata *dd) +{ + struct qib_chip_specific *cspec = dd->cspec; + int i; + + for (i = 0; i < ARRAY_SIZE(cspec->rhdr_cpu); i++) + cspec->rhdr_cpu[i] = -1; + for (i = 0; i < ARRAY_SIZE(cspec->sdma_cpu); i++) + cspec->sdma_cpu[i] = -1; + cspec->dca_rcvhdr_ctrl[0] = + (1ULL << SYM_LSB(DCACtrlB, RcvHdrq0DCAXfrCnt)) | + (1ULL << SYM_LSB(DCACtrlB, RcvHdrq1DCAXfrCnt)) | + (1ULL << SYM_LSB(DCACtrlB, RcvHdrq2DCAXfrCnt)) | + (1ULL << SYM_LSB(DCACtrlB, RcvHdrq3DCAXfrCnt)); + cspec->dca_rcvhdr_ctrl[1] = + (1ULL << SYM_LSB(DCACtrlC, RcvHdrq4DCAXfrCnt)) | + (1ULL << SYM_LSB(DCACtrlC, RcvHdrq5DCAXfrCnt)) | + (1ULL << SYM_LSB(DCACtrlC, RcvHdrq6DCAXfrCnt)) | + (1ULL << SYM_LSB(DCACtrlC, RcvHdrq7DCAXfrCnt)); + cspec->dca_rcvhdr_ctrl[2] = + (1ULL << SYM_LSB(DCACtrlD, RcvHdrq8DCAXfrCnt)) | + (1ULL << SYM_LSB(DCACtrlD, RcvHdrq9DCAXfrCnt)) | + (1ULL << SYM_LSB(DCACtrlD, RcvHdrq10DCAXfrCnt)) | + (1ULL << SYM_LSB(DCACtrlD, RcvHdrq11DCAXfrCnt)); + cspec->dca_rcvhdr_ctrl[3] = + (1ULL << SYM_LSB(DCACtrlE, RcvHdrq12DCAXfrCnt)) | + (1ULL << SYM_LSB(DCACtrlE, RcvHdrq13DCAXfrCnt)) | + (1ULL << SYM_LSB(DCACtrlE, RcvHdrq14DCAXfrCnt)) | + (1ULL << SYM_LSB(DCACtrlE, RcvHdrq15DCAXfrCnt)); + cspec->dca_rcvhdr_ctrl[4] = + (1ULL << SYM_LSB(DCACtrlF, RcvHdrq16DCAXfrCnt)) | + (1ULL << SYM_LSB(DCACtrlF, RcvHdrq17DCAXfrCnt)); + for (i = 0; i < ARRAY_SIZE(cspec->sdma_cpu); i++) + qib_write_kreg(dd, KREG_IDX(DCACtrlB) + i, + cspec->dca_rcvhdr_ctrl[i]); + for (i = 0; i < cspec->num_msix_entries; i++) + setup_dca_notifier(dd, &cspec->msix_entries[i]); +} + +static void qib_irq_notifier_notify(struct irq_affinity_notify *notify, + const cpumask_t *mask) +{ + struct qib_irq_notify *n = + container_of(notify, struct qib_irq_notify, notify); + int cpu = cpumask_first(mask); + + if (n->rcv) { + struct qib_ctxtdata *rcd = (struct qib_ctxtdata *)n->arg; + qib_update_rhdrq_dca(rcd, cpu); + } else { + struct qib_pportdata *ppd = (struct qib_pportdata *)n->arg; + qib_update_sdma_dca(ppd, cpu); + } +} + +static void qib_irq_notifier_release(struct kref *ref) +{ + struct qib_irq_notify *n = + container_of(ref, struct qib_irq_notify, notify.kref); + struct qib_devdata *dd; + + if (n->rcv) { + struct qib_ctxtdata *rcd = (struct qib_ctxtdata *)n->arg; + dd = rcd->dd; + } else { + struct qib_pportdata *ppd = (struct qib_pportdata *)n->arg; + dd = ppd->dd; + } + qib_devinfo(dd->pcidev, + "release on HCA notify 0x%p n 0x%p\n", ref, n); + kfree(n); +} +#endif + +/* + * Disable MSIx interrupt if enabled, call generic MSIx code + * to cleanup, and clear pending MSIx interrupts. + * Used for fallback to INTx, after reset, and when MSIx setup fails. + */ +static void qib_7322_nomsix(struct qib_devdata *dd) +{ + u64 intgranted; + int n; + + dd->cspec->main_int_mask = ~0ULL; + n = dd->cspec->num_msix_entries; + if (n) { + int i; + + dd->cspec->num_msix_entries = 0; + for (i = 0; i < n; i++) { +#ifdef CONFIG_INFINIBAND_QIB_DCA + reset_dca_notifier(dd, &dd->cspec->msix_entries[i]); +#endif + irq_set_affinity_hint( + dd->cspec->msix_entries[i].msix.vector, NULL); + free_cpumask_var(dd->cspec->msix_entries[i].mask); + free_irq(dd->cspec->msix_entries[i].msix.vector, + dd->cspec->msix_entries[i].arg); + } + qib_nomsix(dd); + } + /* make sure no MSIx interrupts are left pending */ + intgranted = qib_read_kreg64(dd, kr_intgranted); + if (intgranted) + qib_write_kreg(dd, kr_intgranted, intgranted); +} + +static void qib_7322_free_irq(struct qib_devdata *dd) +{ + if (dd->cspec->irq) { + free_irq(dd->cspec->irq, dd); + dd->cspec->irq = 0; + } + qib_7322_nomsix(dd); +} + +static void qib_setup_7322_cleanup(struct qib_devdata *dd) +{ + int i; + +#ifdef CONFIG_INFINIBAND_QIB_DCA + if (dd->flags & QIB_DCA_ENABLED) { + dca_remove_requester(&dd->pcidev->dev); + dd->flags &= ~QIB_DCA_ENABLED; + dd->cspec->dca_ctrl = 0; + qib_write_kreg(dd, KREG_IDX(DCACtrlA), dd->cspec->dca_ctrl); + } +#endif + + qib_7322_free_irq(dd); + kfree(dd->cspec->cntrs); + kfree(dd->cspec->sendchkenable); + kfree(dd->cspec->sendgrhchk); + kfree(dd->cspec->sendibchk); + kfree(dd->cspec->msix_entries); + for (i = 0; i < dd->num_pports; i++) { + unsigned long flags; + u32 mask = QSFP_GPIO_MOD_PRS_N | + (QSFP_GPIO_MOD_PRS_N << QSFP_GPIO_PORT2_SHIFT); + + kfree(dd->pport[i].cpspec->portcntrs); + if (dd->flags & QIB_HAS_QSFP) { + spin_lock_irqsave(&dd->cspec->gpio_lock, flags); + dd->cspec->gpio_mask &= ~mask; + qib_write_kreg(dd, kr_gpio_mask, dd->cspec->gpio_mask); + spin_unlock_irqrestore(&dd->cspec->gpio_lock, flags); + qib_qsfp_deinit(&dd->pport[i].cpspec->qsfp_data); + } + if (dd->pport[i].ibport_data.smi_ah) + ib_destroy_ah(&dd->pport[i].ibport_data.smi_ah->ibah); + } +} + +/* handle SDMA interrupts */ +static void sdma_7322_intr(struct qib_devdata *dd, u64 istat) +{ + struct qib_pportdata *ppd0 = &dd->pport[0]; + struct qib_pportdata *ppd1 = &dd->pport[1]; + u64 intr0 = istat & (INT_MASK_P(SDma, 0) | + INT_MASK_P(SDmaIdle, 0) | INT_MASK_P(SDmaProgress, 0)); + u64 intr1 = istat & (INT_MASK_P(SDma, 1) | + INT_MASK_P(SDmaIdle, 1) | INT_MASK_P(SDmaProgress, 1)); + + if (intr0) + qib_sdma_intr(ppd0); + if (intr1) + qib_sdma_intr(ppd1); + + if (istat & INT_MASK_PM(SDmaCleanupDone, 0)) + qib_sdma_process_event(ppd0, qib_sdma_event_e20_hw_started); + if (istat & INT_MASK_PM(SDmaCleanupDone, 1)) + qib_sdma_process_event(ppd1, qib_sdma_event_e20_hw_started); +} + +/* + * Set or clear the Send buffer available interrupt enable bit. + */ +static void qib_wantpiobuf_7322_intr(struct qib_devdata *dd, u32 needint) +{ + unsigned long flags; + + spin_lock_irqsave(&dd->sendctrl_lock, flags); + if (needint) + dd->sendctrl |= SYM_MASK(SendCtrl, SendIntBufAvail); + else + dd->sendctrl &= ~SYM_MASK(SendCtrl, SendIntBufAvail); + qib_write_kreg(dd, kr_sendctrl, dd->sendctrl); + qib_write_kreg(dd, kr_scratch, 0ULL); + spin_unlock_irqrestore(&dd->sendctrl_lock, flags); +} + +/* + * Somehow got an interrupt with reserved bits set in interrupt status. + * Print a message so we know it happened, then clear them. + * keep mainline interrupt handler cache-friendly + */ +static noinline void unknown_7322_ibits(struct qib_devdata *dd, u64 istat) +{ + u64 kills; + char msg[128]; + + kills = istat & ~QIB_I_BITSEXTANT; + qib_dev_err(dd, + "Clearing reserved interrupt(s) 0x%016llx: %s\n", + (unsigned long long) kills, msg); + qib_write_kreg(dd, kr_intmask, (dd->cspec->int_enable_mask & ~kills)); +} + +/* keep mainline interrupt handler cache-friendly */ +static noinline void unknown_7322_gpio_intr(struct qib_devdata *dd) +{ + u32 gpiostatus; + int handled = 0; + int pidx; + + /* + * Boards for this chip currently don't use GPIO interrupts, + * so clear by writing GPIOstatus to GPIOclear, and complain + * to developer. To avoid endless repeats, clear + * the bits in the mask, since there is some kind of + * programming error or chip problem. + */ + gpiostatus = qib_read_kreg32(dd, kr_gpio_status); + /* + * In theory, writing GPIOstatus to GPIOclear could + * have a bad side-effect on some diagnostic that wanted + * to poll for a status-change, but the various shadows + * make that problematic at best. Diags will just suppress + * all GPIO interrupts during such tests. + */ + qib_write_kreg(dd, kr_gpio_clear, gpiostatus); + /* + * Check for QSFP MOD_PRS changes + * only works for single port if IB1 != pidx1 + */ + for (pidx = 0; pidx < dd->num_pports && (dd->flags & QIB_HAS_QSFP); + ++pidx) { + struct qib_pportdata *ppd; + struct qib_qsfp_data *qd; + u32 mask; + if (!dd->pport[pidx].link_speed_supported) + continue; + mask = QSFP_GPIO_MOD_PRS_N; + ppd = dd->pport + pidx; + mask <<= (QSFP_GPIO_PORT2_SHIFT * ppd->hw_pidx); + if (gpiostatus & dd->cspec->gpio_mask & mask) { + u64 pins; + qd = &ppd->cpspec->qsfp_data; + gpiostatus &= ~mask; + pins = qib_read_kreg64(dd, kr_extstatus); + pins >>= SYM_LSB(EXTStatus, GPIOIn); + if (!(pins & mask)) { + ++handled; + qd->t_insert = jiffies; + queue_work(ib_wq, &qd->work); + } + } + } + + if (gpiostatus && !handled) { + const u32 mask = qib_read_kreg32(dd, kr_gpio_mask); + u32 gpio_irq = mask & gpiostatus; + + /* + * Clear any troublemakers, and update chip from shadow + */ + dd->cspec->gpio_mask &= ~gpio_irq; + qib_write_kreg(dd, kr_gpio_mask, dd->cspec->gpio_mask); + } +} + +/* + * Handle errors and unusual events first, separate function + * to improve cache hits for fast path interrupt handling. + */ +static noinline void unlikely_7322_intr(struct qib_devdata *dd, u64 istat) +{ + if (istat & ~QIB_I_BITSEXTANT) + unknown_7322_ibits(dd, istat); + if (istat & QIB_I_GPIO) + unknown_7322_gpio_intr(dd); + if (istat & QIB_I_C_ERROR) { + qib_write_kreg(dd, kr_errmask, 0ULL); + tasklet_schedule(&dd->error_tasklet); + } + if (istat & INT_MASK_P(Err, 0) && dd->rcd[0]) + handle_7322_p_errors(dd->rcd[0]->ppd); + if (istat & INT_MASK_P(Err, 1) && dd->rcd[1]) + handle_7322_p_errors(dd->rcd[1]->ppd); +} + +/* + * Dynamically adjust the rcv int timeout for a context based on incoming + * packet rate. + */ +static void adjust_rcv_timeout(struct qib_ctxtdata *rcd, int npkts) +{ + struct qib_devdata *dd = rcd->dd; + u32 timeout = dd->cspec->rcvavail_timeout[rcd->ctxt]; + + /* + * Dynamically adjust idle timeout on chip + * based on number of packets processed. + */ + if (npkts < rcv_int_count && timeout > 2) + timeout >>= 1; + else if (npkts >= rcv_int_count && timeout < rcv_int_timeout) + timeout = min(timeout << 1, rcv_int_timeout); + else + return; + + dd->cspec->rcvavail_timeout[rcd->ctxt] = timeout; + qib_write_kreg(dd, kr_rcvavailtimeout + rcd->ctxt, timeout); +} + +/* + * This is the main interrupt handler. + * It will normally only be used for low frequency interrupts but may + * have to handle all interrupts if INTx is enabled or fewer than normal + * MSIx interrupts were allocated. + * This routine should ignore the interrupt bits for any of the + * dedicated MSIx handlers. + */ +static irqreturn_t qib_7322intr(int irq, void *data) +{ + struct qib_devdata *dd = data; + irqreturn_t ret; + u64 istat; + u64 ctxtrbits; + u64 rmask; + unsigned i; + u32 npkts; + + if ((dd->flags & (QIB_PRESENT | QIB_BADINTR)) != QIB_PRESENT) { + /* + * This return value is not great, but we do not want the + * interrupt core code to remove our interrupt handler + * because we don't appear to be handling an interrupt + * during a chip reset. + */ + ret = IRQ_HANDLED; + goto bail; + } + + istat = qib_read_kreg64(dd, kr_intstatus); + + if (unlikely(istat == ~0ULL)) { + qib_bad_intrstatus(dd); + qib_dev_err(dd, "Interrupt status all f's, skipping\n"); + /* don't know if it was our interrupt or not */ + ret = IRQ_NONE; + goto bail; + } + + istat &= dd->cspec->main_int_mask; + if (unlikely(!istat)) { + /* already handled, or shared and not us */ + ret = IRQ_NONE; + goto bail; + } + + this_cpu_inc(*dd->int_counter); + + /* handle "errors" of various kinds first, device ahead of port */ + if (unlikely(istat & (~QIB_I_BITSEXTANT | QIB_I_GPIO | + QIB_I_C_ERROR | INT_MASK_P(Err, 0) | + INT_MASK_P(Err, 1)))) + unlikely_7322_intr(dd, istat); + + /* + * Clear the interrupt bits we found set, relatively early, so we + * "know" know the chip will have seen this by the time we process + * the queue, and will re-interrupt if necessary. The processor + * itself won't take the interrupt again until we return. + */ + qib_write_kreg(dd, kr_intclear, istat); + + /* + * Handle kernel receive queues before checking for pio buffers + * available since receives can overflow; piobuf waiters can afford + * a few extra cycles, since they were waiting anyway. + */ + ctxtrbits = istat & (QIB_I_RCVAVAIL_MASK | QIB_I_RCVURG_MASK); + if (ctxtrbits) { + rmask = (1ULL << QIB_I_RCVAVAIL_LSB) | + (1ULL << QIB_I_RCVURG_LSB); + for (i = 0; i < dd->first_user_ctxt; i++) { + if (ctxtrbits & rmask) { + ctxtrbits &= ~rmask; + if (dd->rcd[i]) + qib_kreceive(dd->rcd[i], NULL, &npkts); + } + rmask <<= 1; + } + if (ctxtrbits) { + ctxtrbits = (ctxtrbits >> QIB_I_RCVAVAIL_LSB) | + (ctxtrbits >> QIB_I_RCVURG_LSB); + qib_handle_urcv(dd, ctxtrbits); + } + } + + if (istat & (QIB_I_P_SDMAINT(0) | QIB_I_P_SDMAINT(1))) + sdma_7322_intr(dd, istat); + + if ((istat & QIB_I_SPIOBUFAVAIL) && (dd->flags & QIB_INITTED)) + qib_ib_piobufavail(dd); + + ret = IRQ_HANDLED; +bail: + return ret; +} + +/* + * Dedicated receive packet available interrupt handler. + */ +static irqreturn_t qib_7322pintr(int irq, void *data) +{ + struct qib_ctxtdata *rcd = data; + struct qib_devdata *dd = rcd->dd; + u32 npkts; + + if ((dd->flags & (QIB_PRESENT | QIB_BADINTR)) != QIB_PRESENT) + /* + * This return value is not great, but we do not want the + * interrupt core code to remove our interrupt handler + * because we don't appear to be handling an interrupt + * during a chip reset. + */ + return IRQ_HANDLED; + + this_cpu_inc(*dd->int_counter); + + /* Clear the interrupt bit we expect to be set. */ + qib_write_kreg(dd, kr_intclear, ((1ULL << QIB_I_RCVAVAIL_LSB) | + (1ULL << QIB_I_RCVURG_LSB)) << rcd->ctxt); + + qib_kreceive(rcd, NULL, &npkts); + + return IRQ_HANDLED; +} + +/* + * Dedicated Send buffer available interrupt handler. + */ +static irqreturn_t qib_7322bufavail(int irq, void *data) +{ + struct qib_devdata *dd = data; + + if ((dd->flags & (QIB_PRESENT | QIB_BADINTR)) != QIB_PRESENT) + /* + * This return value is not great, but we do not want the + * interrupt core code to remove our interrupt handler + * because we don't appear to be handling an interrupt + * during a chip reset. + */ + return IRQ_HANDLED; + + this_cpu_inc(*dd->int_counter); + + /* Clear the interrupt bit we expect to be set. */ + qib_write_kreg(dd, kr_intclear, QIB_I_SPIOBUFAVAIL); + + /* qib_ib_piobufavail() will clear the want PIO interrupt if needed */ + if (dd->flags & QIB_INITTED) + qib_ib_piobufavail(dd); + else + qib_wantpiobuf_7322_intr(dd, 0); + + return IRQ_HANDLED; +} + +/* + * Dedicated Send DMA interrupt handler. + */ +static irqreturn_t sdma_intr(int irq, void *data) +{ + struct qib_pportdata *ppd = data; + struct qib_devdata *dd = ppd->dd; + + if ((dd->flags & (QIB_PRESENT | QIB_BADINTR)) != QIB_PRESENT) + /* + * This return value is not great, but we do not want the + * interrupt core code to remove our interrupt handler + * because we don't appear to be handling an interrupt + * during a chip reset. + */ + return IRQ_HANDLED; + + this_cpu_inc(*dd->int_counter); + + /* Clear the interrupt bit we expect to be set. */ + qib_write_kreg(dd, kr_intclear, ppd->hw_pidx ? + INT_MASK_P(SDma, 1) : INT_MASK_P(SDma, 0)); + qib_sdma_intr(ppd); + + return IRQ_HANDLED; +} + +/* + * Dedicated Send DMA idle interrupt handler. + */ +static irqreturn_t sdma_idle_intr(int irq, void *data) +{ + struct qib_pportdata *ppd = data; + struct qib_devdata *dd = ppd->dd; + + if ((dd->flags & (QIB_PRESENT | QIB_BADINTR)) != QIB_PRESENT) + /* + * This return value is not great, but we do not want the + * interrupt core code to remove our interrupt handler + * because we don't appear to be handling an interrupt + * during a chip reset. + */ + return IRQ_HANDLED; + + this_cpu_inc(*dd->int_counter); + + /* Clear the interrupt bit we expect to be set. */ + qib_write_kreg(dd, kr_intclear, ppd->hw_pidx ? + INT_MASK_P(SDmaIdle, 1) : INT_MASK_P(SDmaIdle, 0)); + qib_sdma_intr(ppd); + + return IRQ_HANDLED; +} + +/* + * Dedicated Send DMA progress interrupt handler. + */ +static irqreturn_t sdma_progress_intr(int irq, void *data) +{ + struct qib_pportdata *ppd = data; + struct qib_devdata *dd = ppd->dd; + + if ((dd->flags & (QIB_PRESENT | QIB_BADINTR)) != QIB_PRESENT) + /* + * This return value is not great, but we do not want the + * interrupt core code to remove our interrupt handler + * because we don't appear to be handling an interrupt + * during a chip reset. + */ + return IRQ_HANDLED; + + this_cpu_inc(*dd->int_counter); + + /* Clear the interrupt bit we expect to be set. */ + qib_write_kreg(dd, kr_intclear, ppd->hw_pidx ? + INT_MASK_P(SDmaProgress, 1) : + INT_MASK_P(SDmaProgress, 0)); + qib_sdma_intr(ppd); + + return IRQ_HANDLED; +} + +/* + * Dedicated Send DMA cleanup interrupt handler. + */ +static irqreturn_t sdma_cleanup_intr(int irq, void *data) +{ + struct qib_pportdata *ppd = data; + struct qib_devdata *dd = ppd->dd; + + if ((dd->flags & (QIB_PRESENT | QIB_BADINTR)) != QIB_PRESENT) + /* + * This return value is not great, but we do not want the + * interrupt core code to remove our interrupt handler + * because we don't appear to be handling an interrupt + * during a chip reset. + */ + return IRQ_HANDLED; + + this_cpu_inc(*dd->int_counter); + + /* Clear the interrupt bit we expect to be set. */ + qib_write_kreg(dd, kr_intclear, ppd->hw_pidx ? + INT_MASK_PM(SDmaCleanupDone, 1) : + INT_MASK_PM(SDmaCleanupDone, 0)); + qib_sdma_process_event(ppd, qib_sdma_event_e20_hw_started); + + return IRQ_HANDLED; +} + +#ifdef CONFIG_INFINIBAND_QIB_DCA + +static void reset_dca_notifier(struct qib_devdata *dd, struct qib_msix_entry *m) +{ + if (!m->dca) + return; + qib_devinfo(dd->pcidev, + "Disabling notifier on HCA %d irq %d\n", + dd->unit, + m->msix.vector); + irq_set_affinity_notifier( + m->msix.vector, + NULL); + m->notifier = NULL; +} + +static void setup_dca_notifier(struct qib_devdata *dd, struct qib_msix_entry *m) +{ + struct qib_irq_notify *n; + + if (!m->dca) + return; + n = kzalloc(sizeof(*n), GFP_KERNEL); + if (n) { + int ret; + + m->notifier = n; + n->notify.irq = m->msix.vector; + n->notify.notify = qib_irq_notifier_notify; + n->notify.release = qib_irq_notifier_release; + n->arg = m->arg; + n->rcv = m->rcv; + qib_devinfo(dd->pcidev, + "set notifier irq %d rcv %d notify %p\n", + n->notify.irq, n->rcv, &n->notify); + ret = irq_set_affinity_notifier( + n->notify.irq, + &n->notify); + if (ret) { + m->notifier = NULL; + kfree(n); + } + } +} + +#endif + +/* + * Set up our chip-specific interrupt handler. + * The interrupt type has already been setup, so + * we just need to do the registration and error checking. + * If we are using MSIx interrupts, we may fall back to + * INTx later, if the interrupt handler doesn't get called + * within 1/2 second (see verify_interrupt()). + */ +static void qib_setup_7322_interrupt(struct qib_devdata *dd, int clearpend) +{ + int ret, i, msixnum; + u64 redirect[6]; + u64 mask; + const struct cpumask *local_mask; + int firstcpu, secondcpu = 0, currrcvcpu = 0; + + if (!dd->num_pports) + return; + + if (clearpend) { + /* + * if not switching interrupt types, be sure interrupts are + * disabled, and then clear anything pending at this point, + * because we are starting clean. + */ + qib_7322_set_intr_state(dd, 0); + + /* clear the reset error, init error/hwerror mask */ + qib_7322_init_hwerrors(dd); + + /* clear any interrupt bits that might be set */ + qib_write_kreg(dd, kr_intclear, ~0ULL); + + /* make sure no pending MSIx intr, and clear diag reg */ + qib_write_kreg(dd, kr_intgranted, ~0ULL); + qib_write_kreg(dd, kr_vecclr_wo_int, ~0ULL); + } + + if (!dd->cspec->num_msix_entries) { + /* Try to get INTx interrupt */ +try_intx: + if (!dd->pcidev->irq) { + qib_dev_err(dd, + "irq is 0, BIOS error? Interrupts won't work\n"); + goto bail; + } + ret = request_irq(dd->pcidev->irq, qib_7322intr, + IRQF_SHARED, QIB_DRV_NAME, dd); + if (ret) { + qib_dev_err(dd, + "Couldn't setup INTx interrupt (irq=%d): %d\n", + dd->pcidev->irq, ret); + goto bail; + } + dd->cspec->irq = dd->pcidev->irq; + dd->cspec->main_int_mask = ~0ULL; + goto bail; + } + + /* Try to get MSIx interrupts */ + memset(redirect, 0, sizeof redirect); + mask = ~0ULL; + msixnum = 0; + local_mask = cpumask_of_pcibus(dd->pcidev->bus); + firstcpu = cpumask_first(local_mask); + if (firstcpu >= nr_cpu_ids || + cpumask_weight(local_mask) == num_online_cpus()) { + local_mask = topology_core_cpumask(0); + firstcpu = cpumask_first(local_mask); + } + if (firstcpu < nr_cpu_ids) { + secondcpu = cpumask_next(firstcpu, local_mask); + if (secondcpu >= nr_cpu_ids) + secondcpu = firstcpu; + currrcvcpu = secondcpu; + } + for (i = 0; msixnum < dd->cspec->num_msix_entries; i++) { + irq_handler_t handler; + void *arg; + u64 val; + int lsb, reg, sh; +#ifdef CONFIG_INFINIBAND_QIB_DCA + int dca = 0; +#endif + + dd->cspec->msix_entries[msixnum]. + name[sizeof(dd->cspec->msix_entries[msixnum].name) - 1] + = '\0'; + if (i < ARRAY_SIZE(irq_table)) { + if (irq_table[i].port) { + /* skip if for a non-configured port */ + if (irq_table[i].port > dd->num_pports) + continue; + arg = dd->pport + irq_table[i].port - 1; + } else + arg = dd; +#ifdef CONFIG_INFINIBAND_QIB_DCA + dca = irq_table[i].dca; +#endif + lsb = irq_table[i].lsb; + handler = irq_table[i].handler; + snprintf(dd->cspec->msix_entries[msixnum].name, + sizeof(dd->cspec->msix_entries[msixnum].name) + - 1, + QIB_DRV_NAME "%d%s", dd->unit, + irq_table[i].name); + } else { + unsigned ctxt; + + ctxt = i - ARRAY_SIZE(irq_table); + /* per krcvq context receive interrupt */ + arg = dd->rcd[ctxt]; + if (!arg) + continue; + if (qib_krcvq01_no_msi && ctxt < 2) + continue; +#ifdef CONFIG_INFINIBAND_QIB_DCA + dca = 1; +#endif + lsb = QIB_I_RCVAVAIL_LSB + ctxt; + handler = qib_7322pintr; + snprintf(dd->cspec->msix_entries[msixnum].name, + sizeof(dd->cspec->msix_entries[msixnum].name) + - 1, + QIB_DRV_NAME "%d (kctx)", dd->unit); + } + ret = request_irq( + dd->cspec->msix_entries[msixnum].msix.vector, + handler, 0, dd->cspec->msix_entries[msixnum].name, + arg); + if (ret) { + /* + * Shouldn't happen since the enable said we could + * have as many as we are trying to setup here. + */ + qib_dev_err(dd, + "Couldn't setup MSIx interrupt (vec=%d, irq=%d): %d\n", + msixnum, + dd->cspec->msix_entries[msixnum].msix.vector, + ret); + qib_7322_nomsix(dd); + goto try_intx; + } + dd->cspec->msix_entries[msixnum].arg = arg; +#ifdef CONFIG_INFINIBAND_QIB_DCA + dd->cspec->msix_entries[msixnum].dca = dca; + dd->cspec->msix_entries[msixnum].rcv = + handler == qib_7322pintr; +#endif + if (lsb >= 0) { + reg = lsb / IBA7322_REDIRECT_VEC_PER_REG; + sh = (lsb % IBA7322_REDIRECT_VEC_PER_REG) * + SYM_LSB(IntRedirect0, vec1); + mask &= ~(1ULL << lsb); + redirect[reg] |= ((u64) msixnum) << sh; + } + val = qib_read_kreg64(dd, 2 * msixnum + 1 + + (QIB_7322_MsixTable_OFFS / sizeof(u64))); + if (firstcpu < nr_cpu_ids && + zalloc_cpumask_var( + &dd->cspec->msix_entries[msixnum].mask, + GFP_KERNEL)) { + if (handler == qib_7322pintr) { + cpumask_set_cpu(currrcvcpu, + dd->cspec->msix_entries[msixnum].mask); + currrcvcpu = cpumask_next(currrcvcpu, + local_mask); + if (currrcvcpu >= nr_cpu_ids) + currrcvcpu = secondcpu; + } else { + cpumask_set_cpu(firstcpu, + dd->cspec->msix_entries[msixnum].mask); + } + irq_set_affinity_hint( + dd->cspec->msix_entries[msixnum].msix.vector, + dd->cspec->msix_entries[msixnum].mask); + } + msixnum++; + } + /* Initialize the vector mapping */ + for (i = 0; i < ARRAY_SIZE(redirect); i++) + qib_write_kreg(dd, kr_intredirect + i, redirect[i]); + dd->cspec->main_int_mask = mask; + tasklet_init(&dd->error_tasklet, qib_error_tasklet, + (unsigned long)dd); +bail:; +} + +/** + * qib_7322_boardname - fill in the board name and note features + * @dd: the qlogic_ib device + * + * info will be based on the board revision register + */ +static unsigned qib_7322_boardname(struct qib_devdata *dd) +{ + /* Will need enumeration of board-types here */ + char *n; + u32 boardid, namelen; + unsigned features = DUAL_PORT_CAP; + + boardid = SYM_FIELD(dd->revision, Revision, BoardID); + + switch (boardid) { + case 0: + n = "InfiniPath_QLE7342_Emulation"; + break; + case 1: + n = "InfiniPath_QLE7340"; + dd->flags |= QIB_HAS_QSFP; + features = PORT_SPD_CAP; + break; + case 2: + n = "InfiniPath_QLE7342"; + dd->flags |= QIB_HAS_QSFP; + break; + case 3: + n = "InfiniPath_QMI7342"; + break; + case 4: + n = "InfiniPath_Unsupported7342"; + qib_dev_err(dd, "Unsupported version of QMH7342\n"); + features = 0; + break; + case BOARD_QMH7342: + n = "InfiniPath_QMH7342"; + features = 0x24; + break; + case BOARD_QME7342: + n = "InfiniPath_QME7342"; + break; + case 8: + n = "InfiniPath_QME7362"; + dd->flags |= QIB_HAS_QSFP; + break; + case 15: + n = "InfiniPath_QLE7342_TEST"; + dd->flags |= QIB_HAS_QSFP; + break; + default: + n = "InfiniPath_QLE73xy_UNKNOWN"; + qib_dev_err(dd, "Unknown 7322 board type %u\n", boardid); + break; + } + dd->board_atten = 1; /* index into txdds_Xdr */ + + namelen = strlen(n) + 1; + dd->boardname = kmalloc(namelen, GFP_KERNEL); + if (!dd->boardname) + qib_dev_err(dd, "Failed allocation for board name: %s\n", n); + else + snprintf(dd->boardname, namelen, "%s", n); + + snprintf(dd->boardversion, sizeof(dd->boardversion), + "ChipABI %u.%u, %s, InfiniPath%u %u.%u, SW Compat %u\n", + QIB_CHIP_VERS_MAJ, QIB_CHIP_VERS_MIN, dd->boardname, + (unsigned)SYM_FIELD(dd->revision, Revision_R, Arch), + dd->majrev, dd->minrev, + (unsigned)SYM_FIELD(dd->revision, Revision_R, SW)); + + if (qib_singleport && (features >> PORT_SPD_CAP_SHIFT) & PORT_SPD_CAP) { + qib_devinfo(dd->pcidev, + "IB%u: Forced to single port mode by module parameter\n", + dd->unit); + features &= PORT_SPD_CAP; + } + + return features; +} + +/* + * This routine sleeps, so it can only be called from user context, not + * from interrupt context. + */ +static int qib_do_7322_reset(struct qib_devdata *dd) +{ + u64 val; + u64 *msix_vecsave; + int i, msix_entries, ret = 1; + u16 cmdval; + u8 int_line, clinesz; + unsigned long flags; + + /* Use dev_err so it shows up in logs, etc. */ + qib_dev_err(dd, "Resetting InfiniPath unit %u\n", dd->unit); + + qib_pcie_getcmd(dd, &cmdval, &int_line, &clinesz); + + msix_entries = dd->cspec->num_msix_entries; + + /* no interrupts till re-initted */ + qib_7322_set_intr_state(dd, 0); + + if (msix_entries) { + qib_7322_nomsix(dd); + /* can be up to 512 bytes, too big for stack */ + msix_vecsave = kmalloc(2 * dd->cspec->num_msix_entries * + sizeof(u64), GFP_KERNEL); + if (!msix_vecsave) + qib_dev_err(dd, "No mem to save MSIx data\n"); + } else + msix_vecsave = NULL; + + /* + * Core PCI (as of 2.6.18) doesn't save or rewrite the full vector + * info that is set up by the BIOS, so we have to save and restore + * it ourselves. There is some risk something could change it, + * after we save it, but since we have disabled the MSIx, it + * shouldn't be touched... + */ + for (i = 0; i < msix_entries; i++) { + u64 vecaddr, vecdata; + vecaddr = qib_read_kreg64(dd, 2 * i + + (QIB_7322_MsixTable_OFFS / sizeof(u64))); + vecdata = qib_read_kreg64(dd, 1 + 2 * i + + (QIB_7322_MsixTable_OFFS / sizeof(u64))); + if (msix_vecsave) { + msix_vecsave[2 * i] = vecaddr; + /* save it without the masked bit set */ + msix_vecsave[1 + 2 * i] = vecdata & ~0x100000000ULL; + } + } + + dd->pport->cpspec->ibdeltainprog = 0; + dd->pport->cpspec->ibsymdelta = 0; + dd->pport->cpspec->iblnkerrdelta = 0; + dd->pport->cpspec->ibmalfdelta = 0; + /* so we check interrupts work again */ + dd->z_int_counter = qib_int_counter(dd); + + /* + * Keep chip from being accessed until we are ready. Use + * writeq() directly, to allow the write even though QIB_PRESENT + * isn't set. + */ + dd->flags &= ~(QIB_INITTED | QIB_PRESENT | QIB_BADINTR); + dd->flags |= QIB_DOING_RESET; + val = dd->control | QLOGIC_IB_C_RESET; + writeq(val, &dd->kregbase[kr_control]); + + for (i = 1; i <= 5; i++) { + /* + * Allow MBIST, etc. to complete; longer on each retry. + * We sometimes get machine checks from bus timeout if no + * response, so for now, make it *really* long. + */ + msleep(1000 + (1 + i) * 3000); + + qib_pcie_reenable(dd, cmdval, int_line, clinesz); + + /* + * Use readq directly, so we don't need to mark it as PRESENT + * until we get a successful indication that all is well. + */ + val = readq(&dd->kregbase[kr_revision]); + if (val == dd->revision) + break; + if (i == 5) { + qib_dev_err(dd, + "Failed to initialize after reset, unusable\n"); + ret = 0; + goto bail; + } + } + + dd->flags |= QIB_PRESENT; /* it's back */ + + if (msix_entries) { + /* restore the MSIx vector address and data if saved above */ + for (i = 0; i < msix_entries; i++) { + dd->cspec->msix_entries[i].msix.entry = i; + if (!msix_vecsave || !msix_vecsave[2 * i]) + continue; + qib_write_kreg(dd, 2 * i + + (QIB_7322_MsixTable_OFFS / sizeof(u64)), + msix_vecsave[2 * i]); + qib_write_kreg(dd, 1 + 2 * i + + (QIB_7322_MsixTable_OFFS / sizeof(u64)), + msix_vecsave[1 + 2 * i]); + } + } + + /* initialize the remaining registers. */ + for (i = 0; i < dd->num_pports; ++i) + write_7322_init_portregs(&dd->pport[i]); + write_7322_initregs(dd); + + if (qib_pcie_params(dd, dd->lbus_width, + &dd->cspec->num_msix_entries, + dd->cspec->msix_entries)) + qib_dev_err(dd, + "Reset failed to setup PCIe or interrupts; continuing anyway\n"); + + qib_setup_7322_interrupt(dd, 1); + + for (i = 0; i < dd->num_pports; ++i) { + struct qib_pportdata *ppd = &dd->pport[i]; + + spin_lock_irqsave(&ppd->lflags_lock, flags); + ppd->lflags |= QIBL_IB_FORCE_NOTIFY; + ppd->lflags &= ~QIBL_IB_AUTONEG_FAILED; + spin_unlock_irqrestore(&ppd->lflags_lock, flags); + } + +bail: + dd->flags &= ~QIB_DOING_RESET; /* OK or not, no longer resetting */ + kfree(msix_vecsave); + return ret; +} + +/** + * qib_7322_put_tid - write a TID to the chip + * @dd: the qlogic_ib device + * @tidptr: pointer to the expected TID (in chip) to update + * @tidtype: 0 for eager, 1 for expected + * @pa: physical address of in memory buffer; tidinvalid if freeing + */ +static void qib_7322_put_tid(struct qib_devdata *dd, u64 __iomem *tidptr, + u32 type, unsigned long pa) +{ + if (!(dd->flags & QIB_PRESENT)) + return; + if (pa != dd->tidinvalid) { + u64 chippa = pa >> IBA7322_TID_PA_SHIFT; + + /* paranoia checks */ + if (pa != (chippa << IBA7322_TID_PA_SHIFT)) { + qib_dev_err(dd, "Physaddr %lx not 2KB aligned!\n", + pa); + return; + } + if (chippa >= (1UL << IBA7322_TID_SZ_SHIFT)) { + qib_dev_err(dd, + "Physical page address 0x%lx larger than supported\n", + pa); + return; + } + + if (type == RCVHQ_RCV_TYPE_EAGER) + chippa |= dd->tidtemplate; + else /* for now, always full 4KB page */ + chippa |= IBA7322_TID_SZ_4K; + pa = chippa; + } + writeq(pa, tidptr); + mmiowb(); +} + +/** + * qib_7322_clear_tids - clear all TID entries for a ctxt, expected and eager + * @dd: the qlogic_ib device + * @ctxt: the ctxt + * + * clear all TID entries for a ctxt, expected and eager. + * Used from qib_close(). + */ +static void qib_7322_clear_tids(struct qib_devdata *dd, + struct qib_ctxtdata *rcd) +{ + u64 __iomem *tidbase; + unsigned long tidinv; + u32 ctxt; + int i; + + if (!dd->kregbase || !rcd) + return; + + ctxt = rcd->ctxt; + + tidinv = dd->tidinvalid; + tidbase = (u64 __iomem *) + ((char __iomem *) dd->kregbase + + dd->rcvtidbase + + ctxt * dd->rcvtidcnt * sizeof(*tidbase)); + + for (i = 0; i < dd->rcvtidcnt; i++) + qib_7322_put_tid(dd, &tidbase[i], RCVHQ_RCV_TYPE_EXPECTED, + tidinv); + + tidbase = (u64 __iomem *) + ((char __iomem *) dd->kregbase + + dd->rcvegrbase + + rcd->rcvegr_tid_base * sizeof(*tidbase)); + + for (i = 0; i < rcd->rcvegrcnt; i++) + qib_7322_put_tid(dd, &tidbase[i], RCVHQ_RCV_TYPE_EAGER, + tidinv); +} + +/** + * qib_7322_tidtemplate - setup constants for TID updates + * @dd: the qlogic_ib device + * + * We setup stuff that we use a lot, to avoid calculating each time + */ +static void qib_7322_tidtemplate(struct qib_devdata *dd) +{ + /* + * For now, we always allocate 4KB buffers (at init) so we can + * receive max size packets. We may want a module parameter to + * specify 2KB or 4KB and/or make it per port instead of per device + * for those who want to reduce memory footprint. Note that the + * rcvhdrentsize size must be large enough to hold the largest + * IB header (currently 96 bytes) that we expect to handle (plus of + * course the 2 dwords of RHF). + */ + if (dd->rcvegrbufsize == 2048) + dd->tidtemplate = IBA7322_TID_SZ_2K; + else if (dd->rcvegrbufsize == 4096) + dd->tidtemplate = IBA7322_TID_SZ_4K; + dd->tidinvalid = 0; +} + +/** + * qib_init_7322_get_base_info - set chip-specific flags for user code + * @rcd: the qlogic_ib ctxt + * @kbase: qib_base_info pointer + * + * We set the PCIE flag because the lower bandwidth on PCIe vs + * HyperTransport can affect some user packet algorithims. + */ + +static int qib_7322_get_base_info(struct qib_ctxtdata *rcd, + struct qib_base_info *kinfo) +{ + kinfo->spi_runtime_flags |= QIB_RUNTIME_CTXT_MSB_IN_QP | + QIB_RUNTIME_PCIE | QIB_RUNTIME_NODMA_RTAIL | + QIB_RUNTIME_HDRSUPP | QIB_RUNTIME_SDMA; + if (rcd->dd->cspec->r1) + kinfo->spi_runtime_flags |= QIB_RUNTIME_RCHK; + if (rcd->dd->flags & QIB_USE_SPCL_TRIG) + kinfo->spi_runtime_flags |= QIB_RUNTIME_SPECIAL_TRIGGER; + + return 0; +} + +static struct qib_message_header * +qib_7322_get_msgheader(struct qib_devdata *dd, __le32 *rhf_addr) +{ + u32 offset = qib_hdrget_offset(rhf_addr); + + return (struct qib_message_header *) + (rhf_addr - dd->rhf_offset + offset); +} + +/* + * Configure number of contexts. + */ +static void qib_7322_config_ctxts(struct qib_devdata *dd) +{ + unsigned long flags; + u32 nchipctxts; + + nchipctxts = qib_read_kreg32(dd, kr_contextcnt); + dd->cspec->numctxts = nchipctxts; + if (qib_n_krcv_queues > 1 && dd->num_pports) { + dd->first_user_ctxt = NUM_IB_PORTS + + (qib_n_krcv_queues - 1) * dd->num_pports; + if (dd->first_user_ctxt > nchipctxts) + dd->first_user_ctxt = nchipctxts; + dd->n_krcv_queues = dd->first_user_ctxt / dd->num_pports; + } else { + dd->first_user_ctxt = NUM_IB_PORTS; + dd->n_krcv_queues = 1; + } + + if (!qib_cfgctxts) { + int nctxts = dd->first_user_ctxt + num_online_cpus(); + + if (nctxts <= 6) + dd->ctxtcnt = 6; + else if (nctxts <= 10) + dd->ctxtcnt = 10; + else if (nctxts <= nchipctxts) + dd->ctxtcnt = nchipctxts; + } else if (qib_cfgctxts < dd->num_pports) + dd->ctxtcnt = dd->num_pports; + else if (qib_cfgctxts <= nchipctxts) + dd->ctxtcnt = qib_cfgctxts; + if (!dd->ctxtcnt) /* none of the above, set to max */ + dd->ctxtcnt = nchipctxts; + + /* + * Chip can be configured for 6, 10, or 18 ctxts, and choice + * affects number of eager TIDs per ctxt (1K, 2K, 4K). + * Lock to be paranoid about later motion, etc. + */ + spin_lock_irqsave(&dd->cspec->rcvmod_lock, flags); + if (dd->ctxtcnt > 10) + dd->rcvctrl |= 2ULL << SYM_LSB(RcvCtrl, ContextCfg); + else if (dd->ctxtcnt > 6) + dd->rcvctrl |= 1ULL << SYM_LSB(RcvCtrl, ContextCfg); + /* else configure for default 6 receive ctxts */ + + /* The XRC opcode is 5. */ + dd->rcvctrl |= 5ULL << SYM_LSB(RcvCtrl, XrcTypeCode); + + /* + * RcvCtrl *must* be written here so that the + * chip understands how to change rcvegrcnt below. + */ + qib_write_kreg(dd, kr_rcvctrl, dd->rcvctrl); + spin_unlock_irqrestore(&dd->cspec->rcvmod_lock, flags); + + /* kr_rcvegrcnt changes based on the number of contexts enabled */ + dd->cspec->rcvegrcnt = qib_read_kreg32(dd, kr_rcvegrcnt); + if (qib_rcvhdrcnt) + dd->rcvhdrcnt = max(dd->cspec->rcvegrcnt, qib_rcvhdrcnt); + else + dd->rcvhdrcnt = 2 * max(dd->cspec->rcvegrcnt, + dd->num_pports > 1 ? 1024U : 2048U); +} + +static int qib_7322_get_ib_cfg(struct qib_pportdata *ppd, int which) +{ + + int lsb, ret = 0; + u64 maskr; /* right-justified mask */ + + switch (which) { + + case QIB_IB_CFG_LWID_ENB: /* Get allowed Link-width */ + ret = ppd->link_width_enabled; + goto done; + + case QIB_IB_CFG_LWID: /* Get currently active Link-width */ + ret = ppd->link_width_active; + goto done; + + case QIB_IB_CFG_SPD_ENB: /* Get allowed Link speeds */ + ret = ppd->link_speed_enabled; + goto done; + + case QIB_IB_CFG_SPD: /* Get current Link spd */ + ret = ppd->link_speed_active; + goto done; + + case QIB_IB_CFG_RXPOL_ENB: /* Get Auto-RX-polarity enable */ + lsb = SYM_LSB(IBCCtrlB_0, IB_POLARITY_REV_SUPP); + maskr = SYM_RMASK(IBCCtrlB_0, IB_POLARITY_REV_SUPP); + break; + + case QIB_IB_CFG_LREV_ENB: /* Get Auto-Lane-reversal enable */ + lsb = SYM_LSB(IBCCtrlB_0, IB_LANE_REV_SUPPORTED); + maskr = SYM_RMASK(IBCCtrlB_0, IB_LANE_REV_SUPPORTED); + break; + + case QIB_IB_CFG_LINKLATENCY: + ret = qib_read_kreg_port(ppd, krp_ibcstatus_b) & + SYM_MASK(IBCStatusB_0, LinkRoundTripLatency); + goto done; + + case QIB_IB_CFG_OP_VLS: + ret = ppd->vls_operational; + goto done; + + case QIB_IB_CFG_VL_HIGH_CAP: + ret = 16; + goto done; + + case QIB_IB_CFG_VL_LOW_CAP: + ret = 16; + goto done; + + case QIB_IB_CFG_OVERRUN_THRESH: /* IB overrun threshold */ + ret = SYM_FIELD(ppd->cpspec->ibcctrl_a, IBCCtrlA_0, + OverrunThreshold); + goto done; + + case QIB_IB_CFG_PHYERR_THRESH: /* IB PHY error threshold */ + ret = SYM_FIELD(ppd->cpspec->ibcctrl_a, IBCCtrlA_0, + PhyerrThreshold); + goto done; + + case QIB_IB_CFG_LINKDEFAULT: /* IB link default (sleep/poll) */ + /* will only take effect when the link state changes */ + ret = (ppd->cpspec->ibcctrl_a & + SYM_MASK(IBCCtrlA_0, LinkDownDefaultState)) ? + IB_LINKINITCMD_SLEEP : IB_LINKINITCMD_POLL; + goto done; + + case QIB_IB_CFG_HRTBT: /* Get Heartbeat off/enable/auto */ + lsb = IBA7322_IBC_HRTBT_LSB; + maskr = IBA7322_IBC_HRTBT_RMASK; /* OR of AUTO and ENB */ + break; + + case QIB_IB_CFG_PMA_TICKS: + /* + * 0x00 = 10x link transfer rate or 4 nsec. for 2.5Gbs + * Since the clock is always 250MHz, the value is 3, 1 or 0. + */ + if (ppd->link_speed_active == QIB_IB_QDR) + ret = 3; + else if (ppd->link_speed_active == QIB_IB_DDR) + ret = 1; + else + ret = 0; + goto done; + + default: + ret = -EINVAL; + goto done; + } + ret = (int)((ppd->cpspec->ibcctrl_b >> lsb) & maskr); +done: + return ret; +} + +/* + * Below again cribbed liberally from older version. Do not lean + * heavily on it. + */ +#define IBA7322_IBC_DLIDLMC_SHIFT QIB_7322_IBCCtrlB_0_IB_DLID_LSB +#define IBA7322_IBC_DLIDLMC_MASK (QIB_7322_IBCCtrlB_0_IB_DLID_RMASK \ + | (QIB_7322_IBCCtrlB_0_IB_DLID_MASK_RMASK << 16)) + +static int qib_7322_set_ib_cfg(struct qib_pportdata *ppd, int which, u32 val) +{ + struct qib_devdata *dd = ppd->dd; + u64 maskr; /* right-justified mask */ + int lsb, ret = 0; + u16 lcmd, licmd; + unsigned long flags; + + switch (which) { + case QIB_IB_CFG_LIDLMC: + /* + * Set LID and LMC. Combined to avoid possible hazard + * caller puts LMC in 16MSbits, DLID in 16LSbits of val + */ + lsb = IBA7322_IBC_DLIDLMC_SHIFT; + maskr = IBA7322_IBC_DLIDLMC_MASK; + /* + * For header-checking, the SLID in the packet will + * be masked with SendIBSLMCMask, and compared + * with SendIBSLIDAssignMask. Make sure we do not + * set any bits not covered by the mask, or we get + * false-positives. + */ + qib_write_kreg_port(ppd, krp_sendslid, + val & (val >> 16) & SendIBSLIDAssignMask); + qib_write_kreg_port(ppd, krp_sendslidmask, + (val >> 16) & SendIBSLMCMask); + break; + + case QIB_IB_CFG_LWID_ENB: /* set allowed Link-width */ + ppd->link_width_enabled = val; + /* convert IB value to chip register value */ + if (val == IB_WIDTH_1X) + val = 0; + else if (val == IB_WIDTH_4X) + val = 1; + else + val = 3; + maskr = SYM_RMASK(IBCCtrlB_0, IB_NUM_CHANNELS); + lsb = SYM_LSB(IBCCtrlB_0, IB_NUM_CHANNELS); + break; + + case QIB_IB_CFG_SPD_ENB: /* set allowed Link speeds */ + /* + * As with width, only write the actual register if the + * link is currently down, otherwise takes effect on next + * link change. Since setting is being explicitly requested + * (via MAD or sysfs), clear autoneg failure status if speed + * autoneg is enabled. + */ + ppd->link_speed_enabled = val; + val <<= IBA7322_IBC_SPEED_LSB; + maskr = IBA7322_IBC_SPEED_MASK | IBA7322_IBC_IBTA_1_2_MASK | + IBA7322_IBC_MAX_SPEED_MASK; + if (val & (val - 1)) { + /* Muliple speeds enabled */ + val |= IBA7322_IBC_IBTA_1_2_MASK | + IBA7322_IBC_MAX_SPEED_MASK; + spin_lock_irqsave(&ppd->lflags_lock, flags); + ppd->lflags &= ~QIBL_IB_AUTONEG_FAILED; + spin_unlock_irqrestore(&ppd->lflags_lock, flags); + } else if (val & IBA7322_IBC_SPEED_QDR) + val |= IBA7322_IBC_IBTA_1_2_MASK; + /* IBTA 1.2 mode + min/max + speed bits are contiguous */ + lsb = SYM_LSB(IBCCtrlB_0, IB_ENHANCED_MODE); + break; + + case QIB_IB_CFG_RXPOL_ENB: /* set Auto-RX-polarity enable */ + lsb = SYM_LSB(IBCCtrlB_0, IB_POLARITY_REV_SUPP); + maskr = SYM_RMASK(IBCCtrlB_0, IB_POLARITY_REV_SUPP); + break; + + case QIB_IB_CFG_LREV_ENB: /* set Auto-Lane-reversal enable */ + lsb = SYM_LSB(IBCCtrlB_0, IB_LANE_REV_SUPPORTED); + maskr = SYM_RMASK(IBCCtrlB_0, IB_LANE_REV_SUPPORTED); + break; + + case QIB_IB_CFG_OVERRUN_THRESH: /* IB overrun threshold */ + maskr = SYM_FIELD(ppd->cpspec->ibcctrl_a, IBCCtrlA_0, + OverrunThreshold); + if (maskr != val) { + ppd->cpspec->ibcctrl_a &= + ~SYM_MASK(IBCCtrlA_0, OverrunThreshold); + ppd->cpspec->ibcctrl_a |= (u64) val << + SYM_LSB(IBCCtrlA_0, OverrunThreshold); + qib_write_kreg_port(ppd, krp_ibcctrl_a, + ppd->cpspec->ibcctrl_a); + qib_write_kreg(dd, kr_scratch, 0ULL); + } + goto bail; + + case QIB_IB_CFG_PHYERR_THRESH: /* IB PHY error threshold */ + maskr = SYM_FIELD(ppd->cpspec->ibcctrl_a, IBCCtrlA_0, + PhyerrThreshold); + if (maskr != val) { + ppd->cpspec->ibcctrl_a &= + ~SYM_MASK(IBCCtrlA_0, PhyerrThreshold); + ppd->cpspec->ibcctrl_a |= (u64) val << + SYM_LSB(IBCCtrlA_0, PhyerrThreshold); + qib_write_kreg_port(ppd, krp_ibcctrl_a, + ppd->cpspec->ibcctrl_a); + qib_write_kreg(dd, kr_scratch, 0ULL); + } + goto bail; + + case QIB_IB_CFG_PKEYS: /* update pkeys */ + maskr = (u64) ppd->pkeys[0] | ((u64) ppd->pkeys[1] << 16) | + ((u64) ppd->pkeys[2] << 32) | + ((u64) ppd->pkeys[3] << 48); + qib_write_kreg_port(ppd, krp_partitionkey, maskr); + goto bail; + + case QIB_IB_CFG_LINKDEFAULT: /* IB link default (sleep/poll) */ + /* will only take effect when the link state changes */ + if (val == IB_LINKINITCMD_POLL) + ppd->cpspec->ibcctrl_a &= + ~SYM_MASK(IBCCtrlA_0, LinkDownDefaultState); + else /* SLEEP */ + ppd->cpspec->ibcctrl_a |= + SYM_MASK(IBCCtrlA_0, LinkDownDefaultState); + qib_write_kreg_port(ppd, krp_ibcctrl_a, ppd->cpspec->ibcctrl_a); + qib_write_kreg(dd, kr_scratch, 0ULL); + goto bail; + + case QIB_IB_CFG_MTU: /* update the MTU in IBC */ + /* + * Update our housekeeping variables, and set IBC max + * size, same as init code; max IBC is max we allow in + * buffer, less the qword pbc, plus 1 for ICRC, in dwords + * Set even if it's unchanged, print debug message only + * on changes. + */ + val = (ppd->ibmaxlen >> 2) + 1; + ppd->cpspec->ibcctrl_a &= ~SYM_MASK(IBCCtrlA_0, MaxPktLen); + ppd->cpspec->ibcctrl_a |= (u64)val << + SYM_LSB(IBCCtrlA_0, MaxPktLen); + qib_write_kreg_port(ppd, krp_ibcctrl_a, + ppd->cpspec->ibcctrl_a); + qib_write_kreg(dd, kr_scratch, 0ULL); + goto bail; + + case QIB_IB_CFG_LSTATE: /* set the IB link state */ + switch (val & 0xffff0000) { + case IB_LINKCMD_DOWN: + lcmd = QLOGIC_IB_IBCC_LINKCMD_DOWN; + ppd->cpspec->ibmalfusesnap = 1; + ppd->cpspec->ibmalfsnap = read_7322_creg32_port(ppd, + crp_errlink); + if (!ppd->cpspec->ibdeltainprog && + qib_compat_ddr_negotiate) { + ppd->cpspec->ibdeltainprog = 1; + ppd->cpspec->ibsymsnap = + read_7322_creg32_port(ppd, + crp_ibsymbolerr); + ppd->cpspec->iblnkerrsnap = + read_7322_creg32_port(ppd, + crp_iblinkerrrecov); + } + break; + + case IB_LINKCMD_ARMED: + lcmd = QLOGIC_IB_IBCC_LINKCMD_ARMED; + if (ppd->cpspec->ibmalfusesnap) { + ppd->cpspec->ibmalfusesnap = 0; + ppd->cpspec->ibmalfdelta += + read_7322_creg32_port(ppd, + crp_errlink) - + ppd->cpspec->ibmalfsnap; + } + break; + + case IB_LINKCMD_ACTIVE: + lcmd = QLOGIC_IB_IBCC_LINKCMD_ACTIVE; + break; + + default: + ret = -EINVAL; + qib_dev_err(dd, "bad linkcmd req 0x%x\n", val >> 16); + goto bail; + } + switch (val & 0xffff) { + case IB_LINKINITCMD_NOP: + licmd = 0; + break; + + case IB_LINKINITCMD_POLL: + licmd = QLOGIC_IB_IBCC_LINKINITCMD_POLL; + break; + + case IB_LINKINITCMD_SLEEP: + licmd = QLOGIC_IB_IBCC_LINKINITCMD_SLEEP; + break; + + case IB_LINKINITCMD_DISABLE: + licmd = QLOGIC_IB_IBCC_LINKINITCMD_DISABLE; + ppd->cpspec->chase_end = 0; + /* + * stop state chase counter and timer, if running. + * wait forpending timer, but don't clear .data (ppd)! + */ + if (ppd->cpspec->chase_timer.expires) { + del_timer_sync(&ppd->cpspec->chase_timer); + ppd->cpspec->chase_timer.expires = 0; + } + break; + + default: + ret = -EINVAL; + qib_dev_err(dd, "bad linkinitcmd req 0x%x\n", + val & 0xffff); + goto bail; + } + qib_set_ib_7322_lstate(ppd, lcmd, licmd); + goto bail; + + case QIB_IB_CFG_OP_VLS: + if (ppd->vls_operational != val) { + ppd->vls_operational = val; + set_vls(ppd); + } + goto bail; + + case QIB_IB_CFG_VL_HIGH_LIMIT: + qib_write_kreg_port(ppd, krp_highprio_limit, val); + goto bail; + + case QIB_IB_CFG_HRTBT: /* set Heartbeat off/enable/auto */ + if (val > 3) { + ret = -EINVAL; + goto bail; + } + lsb = IBA7322_IBC_HRTBT_LSB; + maskr = IBA7322_IBC_HRTBT_RMASK; /* OR of AUTO and ENB */ + break; + + case QIB_IB_CFG_PORT: + /* val is the port number of the switch we are connected to. */ + if (ppd->dd->cspec->r1) { + cancel_delayed_work(&ppd->cpspec->ipg_work); + ppd->cpspec->ipg_tries = 0; + } + goto bail; + + default: + ret = -EINVAL; + goto bail; + } + ppd->cpspec->ibcctrl_b &= ~(maskr << lsb); + ppd->cpspec->ibcctrl_b |= (((u64) val & maskr) << lsb); + qib_write_kreg_port(ppd, krp_ibcctrl_b, ppd->cpspec->ibcctrl_b); + qib_write_kreg(dd, kr_scratch, 0); +bail: + return ret; +} + +static int qib_7322_set_loopback(struct qib_pportdata *ppd, const char *what) +{ + int ret = 0; + u64 val, ctrlb; + + /* only IBC loopback, may add serdes and xgxs loopbacks later */ + if (!strncmp(what, "ibc", 3)) { + ppd->cpspec->ibcctrl_a |= SYM_MASK(IBCCtrlA_0, + Loopback); + val = 0; /* disable heart beat, so link will come up */ + qib_devinfo(ppd->dd->pcidev, "Enabling IB%u:%u IBC loopback\n", + ppd->dd->unit, ppd->port); + } else if (!strncmp(what, "off", 3)) { + ppd->cpspec->ibcctrl_a &= ~SYM_MASK(IBCCtrlA_0, + Loopback); + /* enable heart beat again */ + val = IBA7322_IBC_HRTBT_RMASK << IBA7322_IBC_HRTBT_LSB; + qib_devinfo(ppd->dd->pcidev, + "Disabling IB%u:%u IBC loopback (normal)\n", + ppd->dd->unit, ppd->port); + } else + ret = -EINVAL; + if (!ret) { + qib_write_kreg_port(ppd, krp_ibcctrl_a, + ppd->cpspec->ibcctrl_a); + ctrlb = ppd->cpspec->ibcctrl_b & ~(IBA7322_IBC_HRTBT_MASK + << IBA7322_IBC_HRTBT_LSB); + ppd->cpspec->ibcctrl_b = ctrlb | val; + qib_write_kreg_port(ppd, krp_ibcctrl_b, + ppd->cpspec->ibcctrl_b); + qib_write_kreg(ppd->dd, kr_scratch, 0); + } + return ret; +} + +static void get_vl_weights(struct qib_pportdata *ppd, unsigned regno, + struct ib_vl_weight_elem *vl) +{ + unsigned i; + + for (i = 0; i < 16; i++, regno++, vl++) { + u32 val = qib_read_kreg_port(ppd, regno); + + vl->vl = (val >> SYM_LSB(LowPriority0_0, VirtualLane)) & + SYM_RMASK(LowPriority0_0, VirtualLane); + vl->weight = (val >> SYM_LSB(LowPriority0_0, Weight)) & + SYM_RMASK(LowPriority0_0, Weight); + } +} + +static void set_vl_weights(struct qib_pportdata *ppd, unsigned regno, + struct ib_vl_weight_elem *vl) +{ + unsigned i; + + for (i = 0; i < 16; i++, regno++, vl++) { + u64 val; + + val = ((vl->vl & SYM_RMASK(LowPriority0_0, VirtualLane)) << + SYM_LSB(LowPriority0_0, VirtualLane)) | + ((vl->weight & SYM_RMASK(LowPriority0_0, Weight)) << + SYM_LSB(LowPriority0_0, Weight)); + qib_write_kreg_port(ppd, regno, val); + } + if (!(ppd->p_sendctrl & SYM_MASK(SendCtrl_0, IBVLArbiterEn))) { + struct qib_devdata *dd = ppd->dd; + unsigned long flags; + + spin_lock_irqsave(&dd->sendctrl_lock, flags); + ppd->p_sendctrl |= SYM_MASK(SendCtrl_0, IBVLArbiterEn); + qib_write_kreg_port(ppd, krp_sendctrl, ppd->p_sendctrl); + qib_write_kreg(dd, kr_scratch, 0); + spin_unlock_irqrestore(&dd->sendctrl_lock, flags); + } +} + +static int qib_7322_get_ib_table(struct qib_pportdata *ppd, int which, void *t) +{ + switch (which) { + case QIB_IB_TBL_VL_HIGH_ARB: + get_vl_weights(ppd, krp_highprio_0, t); + break; + + case QIB_IB_TBL_VL_LOW_ARB: + get_vl_weights(ppd, krp_lowprio_0, t); + break; + + default: + return -EINVAL; + } + return 0; +} + +static int qib_7322_set_ib_table(struct qib_pportdata *ppd, int which, void *t) +{ + switch (which) { + case QIB_IB_TBL_VL_HIGH_ARB: + set_vl_weights(ppd, krp_highprio_0, t); + break; + + case QIB_IB_TBL_VL_LOW_ARB: + set_vl_weights(ppd, krp_lowprio_0, t); + break; + + default: + return -EINVAL; + } + return 0; +} + +static void qib_update_7322_usrhead(struct qib_ctxtdata *rcd, u64 hd, + u32 updegr, u32 egrhd, u32 npkts) +{ + /* + * Need to write timeout register before updating rcvhdrhead to ensure + * that the timer is enabled on reception of a packet. + */ + if (hd >> IBA7322_HDRHEAD_PKTINT_SHIFT) + adjust_rcv_timeout(rcd, npkts); + if (updegr) + qib_write_ureg(rcd->dd, ur_rcvegrindexhead, egrhd, rcd->ctxt); + mmiowb(); + qib_write_ureg(rcd->dd, ur_rcvhdrhead, hd, rcd->ctxt); + qib_write_ureg(rcd->dd, ur_rcvhdrhead, hd, rcd->ctxt); + mmiowb(); +} + +static u32 qib_7322_hdrqempty(struct qib_ctxtdata *rcd) +{ + u32 head, tail; + + head = qib_read_ureg32(rcd->dd, ur_rcvhdrhead, rcd->ctxt); + if (rcd->rcvhdrtail_kvaddr) + tail = qib_get_rcvhdrtail(rcd); + else + tail = qib_read_ureg32(rcd->dd, ur_rcvhdrtail, rcd->ctxt); + return head == tail; +} + +#define RCVCTRL_COMMON_MODS (QIB_RCVCTRL_CTXT_ENB | \ + QIB_RCVCTRL_CTXT_DIS | \ + QIB_RCVCTRL_TIDFLOW_ENB | \ + QIB_RCVCTRL_TIDFLOW_DIS | \ + QIB_RCVCTRL_TAILUPD_ENB | \ + QIB_RCVCTRL_TAILUPD_DIS | \ + QIB_RCVCTRL_INTRAVAIL_ENB | \ + QIB_RCVCTRL_INTRAVAIL_DIS | \ + QIB_RCVCTRL_BP_ENB | \ + QIB_RCVCTRL_BP_DIS) + +#define RCVCTRL_PORT_MODS (QIB_RCVCTRL_CTXT_ENB | \ + QIB_RCVCTRL_CTXT_DIS | \ + QIB_RCVCTRL_PKEY_DIS | \ + QIB_RCVCTRL_PKEY_ENB) + +/* + * Modify the RCVCTRL register in chip-specific way. This + * is a function because bit positions and (future) register + * location is chip-specifc, but the needed operations are + * generic. is a bit-mask because we often want to + * do multiple modifications. + */ +static void rcvctrl_7322_mod(struct qib_pportdata *ppd, unsigned int op, + int ctxt) +{ + struct qib_devdata *dd = ppd->dd; + struct qib_ctxtdata *rcd; + u64 mask, val; + unsigned long flags; + + spin_lock_irqsave(&dd->cspec->rcvmod_lock, flags); + + if (op & QIB_RCVCTRL_TIDFLOW_ENB) + dd->rcvctrl |= SYM_MASK(RcvCtrl, TidFlowEnable); + if (op & QIB_RCVCTRL_TIDFLOW_DIS) + dd->rcvctrl &= ~SYM_MASK(RcvCtrl, TidFlowEnable); + if (op & QIB_RCVCTRL_TAILUPD_ENB) + dd->rcvctrl |= SYM_MASK(RcvCtrl, TailUpd); + if (op & QIB_RCVCTRL_TAILUPD_DIS) + dd->rcvctrl &= ~SYM_MASK(RcvCtrl, TailUpd); + if (op & QIB_RCVCTRL_PKEY_ENB) + ppd->p_rcvctrl &= ~SYM_MASK(RcvCtrl_0, RcvPartitionKeyDisable); + if (op & QIB_RCVCTRL_PKEY_DIS) + ppd->p_rcvctrl |= SYM_MASK(RcvCtrl_0, RcvPartitionKeyDisable); + if (ctxt < 0) { + mask = (1ULL << dd->ctxtcnt) - 1; + rcd = NULL; + } else { + mask = (1ULL << ctxt); + rcd = dd->rcd[ctxt]; + } + if ((op & QIB_RCVCTRL_CTXT_ENB) && rcd) { + ppd->p_rcvctrl |= + (mask << SYM_LSB(RcvCtrl_0, ContextEnableKernel)); + if (!(dd->flags & QIB_NODMA_RTAIL)) { + op |= QIB_RCVCTRL_TAILUPD_ENB; /* need reg write */ + dd->rcvctrl |= SYM_MASK(RcvCtrl, TailUpd); + } + /* Write these registers before the context is enabled. */ + qib_write_kreg_ctxt(dd, krc_rcvhdrtailaddr, ctxt, + rcd->rcvhdrqtailaddr_phys); + qib_write_kreg_ctxt(dd, krc_rcvhdraddr, ctxt, + rcd->rcvhdrq_phys); + rcd->seq_cnt = 1; + } + if (op & QIB_RCVCTRL_CTXT_DIS) + ppd->p_rcvctrl &= + ~(mask << SYM_LSB(RcvCtrl_0, ContextEnableKernel)); + if (op & QIB_RCVCTRL_BP_ENB) + dd->rcvctrl |= mask << SYM_LSB(RcvCtrl, dontDropRHQFull); + if (op & QIB_RCVCTRL_BP_DIS) + dd->rcvctrl &= ~(mask << SYM_LSB(RcvCtrl, dontDropRHQFull)); + if (op & QIB_RCVCTRL_INTRAVAIL_ENB) + dd->rcvctrl |= (mask << SYM_LSB(RcvCtrl, IntrAvail)); + if (op & QIB_RCVCTRL_INTRAVAIL_DIS) + dd->rcvctrl &= ~(mask << SYM_LSB(RcvCtrl, IntrAvail)); + /* + * Decide which registers to write depending on the ops enabled. + * Special case is "flush" (no bits set at all) + * which needs to write both. + */ + if (op == 0 || (op & RCVCTRL_COMMON_MODS)) + qib_write_kreg(dd, kr_rcvctrl, dd->rcvctrl); + if (op == 0 || (op & RCVCTRL_PORT_MODS)) + qib_write_kreg_port(ppd, krp_rcvctrl, ppd->p_rcvctrl); + if ((op & QIB_RCVCTRL_CTXT_ENB) && dd->rcd[ctxt]) { + /* + * Init the context registers also; if we were + * disabled, tail and head should both be zero + * already from the enable, but since we don't + * know, we have to do it explicitly. + */ + val = qib_read_ureg32(dd, ur_rcvegrindextail, ctxt); + qib_write_ureg(dd, ur_rcvegrindexhead, val, ctxt); + + /* be sure enabling write seen; hd/tl should be 0 */ + (void) qib_read_kreg32(dd, kr_scratch); + val = qib_read_ureg32(dd, ur_rcvhdrtail, ctxt); + dd->rcd[ctxt]->head = val; + /* If kctxt, interrupt on next receive. */ + if (ctxt < dd->first_user_ctxt) + val |= dd->rhdrhead_intr_off; + qib_write_ureg(dd, ur_rcvhdrhead, val, ctxt); + } else if ((op & QIB_RCVCTRL_INTRAVAIL_ENB) && + dd->rcd[ctxt] && dd->rhdrhead_intr_off) { + /* arm rcv interrupt */ + val = dd->rcd[ctxt]->head | dd->rhdrhead_intr_off; + qib_write_ureg(dd, ur_rcvhdrhead, val, ctxt); + } + if (op & QIB_RCVCTRL_CTXT_DIS) { + unsigned f; + + /* Now that the context is disabled, clear these registers. */ + if (ctxt >= 0) { + qib_write_kreg_ctxt(dd, krc_rcvhdrtailaddr, ctxt, 0); + qib_write_kreg_ctxt(dd, krc_rcvhdraddr, ctxt, 0); + for (f = 0; f < NUM_TIDFLOWS_CTXT; f++) + qib_write_ureg(dd, ur_rcvflowtable + f, + TIDFLOW_ERRBITS, ctxt); + } else { + unsigned i; + + for (i = 0; i < dd->cfgctxts; i++) { + qib_write_kreg_ctxt(dd, krc_rcvhdrtailaddr, + i, 0); + qib_write_kreg_ctxt(dd, krc_rcvhdraddr, i, 0); + for (f = 0; f < NUM_TIDFLOWS_CTXT; f++) + qib_write_ureg(dd, ur_rcvflowtable + f, + TIDFLOW_ERRBITS, i); + } + } + } + spin_unlock_irqrestore(&dd->cspec->rcvmod_lock, flags); +} + +/* + * Modify the SENDCTRL register in chip-specific way. This + * is a function where there are multiple such registers with + * slightly different layouts. + * The chip doesn't allow back-to-back sendctrl writes, so write + * the scratch register after writing sendctrl. + * + * Which register is written depends on the operation. + * Most operate on the common register, while + * SEND_ENB and SEND_DIS operate on the per-port ones. + * SEND_ENB is included in common because it can change SPCL_TRIG + */ +#define SENDCTRL_COMMON_MODS (\ + QIB_SENDCTRL_CLEAR | \ + QIB_SENDCTRL_AVAIL_DIS | \ + QIB_SENDCTRL_AVAIL_ENB | \ + QIB_SENDCTRL_AVAIL_BLIP | \ + QIB_SENDCTRL_DISARM | \ + QIB_SENDCTRL_DISARM_ALL | \ + QIB_SENDCTRL_SEND_ENB) + +#define SENDCTRL_PORT_MODS (\ + QIB_SENDCTRL_CLEAR | \ + QIB_SENDCTRL_SEND_ENB | \ + QIB_SENDCTRL_SEND_DIS | \ + QIB_SENDCTRL_FLUSH) + +static void sendctrl_7322_mod(struct qib_pportdata *ppd, u32 op) +{ + struct qib_devdata *dd = ppd->dd; + u64 tmp_dd_sendctrl; + unsigned long flags; + + spin_lock_irqsave(&dd->sendctrl_lock, flags); + + /* First the dd ones that are "sticky", saved in shadow */ + if (op & QIB_SENDCTRL_CLEAR) + dd->sendctrl = 0; + if (op & QIB_SENDCTRL_AVAIL_DIS) + dd->sendctrl &= ~SYM_MASK(SendCtrl, SendBufAvailUpd); + else if (op & QIB_SENDCTRL_AVAIL_ENB) { + dd->sendctrl |= SYM_MASK(SendCtrl, SendBufAvailUpd); + if (dd->flags & QIB_USE_SPCL_TRIG) + dd->sendctrl |= SYM_MASK(SendCtrl, SpecialTriggerEn); + } + + /* Then the ppd ones that are "sticky", saved in shadow */ + if (op & QIB_SENDCTRL_SEND_DIS) + ppd->p_sendctrl &= ~SYM_MASK(SendCtrl_0, SendEnable); + else if (op & QIB_SENDCTRL_SEND_ENB) + ppd->p_sendctrl |= SYM_MASK(SendCtrl_0, SendEnable); + + if (op & QIB_SENDCTRL_DISARM_ALL) { + u32 i, last; + + tmp_dd_sendctrl = dd->sendctrl; + last = dd->piobcnt2k + dd->piobcnt4k + NUM_VL15_BUFS; + /* + * Disarm any buffers that are not yet launched, + * disabling updates until done. + */ + tmp_dd_sendctrl &= ~SYM_MASK(SendCtrl, SendBufAvailUpd); + for (i = 0; i < last; i++) { + qib_write_kreg(dd, kr_sendctrl, + tmp_dd_sendctrl | + SYM_MASK(SendCtrl, Disarm) | i); + qib_write_kreg(dd, kr_scratch, 0); + } + } + + if (op & QIB_SENDCTRL_FLUSH) { + u64 tmp_ppd_sendctrl = ppd->p_sendctrl; + + /* + * Now drain all the fifos. The Abort bit should never be + * needed, so for now, at least, we don't use it. + */ + tmp_ppd_sendctrl |= + SYM_MASK(SendCtrl_0, TxeDrainRmFifo) | + SYM_MASK(SendCtrl_0, TxeDrainLaFifo) | + SYM_MASK(SendCtrl_0, TxeBypassIbc); + qib_write_kreg_port(ppd, krp_sendctrl, tmp_ppd_sendctrl); + qib_write_kreg(dd, kr_scratch, 0); + } + + tmp_dd_sendctrl = dd->sendctrl; + + if (op & QIB_SENDCTRL_DISARM) + tmp_dd_sendctrl |= SYM_MASK(SendCtrl, Disarm) | + ((op & QIB_7322_SendCtrl_DisarmSendBuf_RMASK) << + SYM_LSB(SendCtrl, DisarmSendBuf)); + if ((op & QIB_SENDCTRL_AVAIL_BLIP) && + (dd->sendctrl & SYM_MASK(SendCtrl, SendBufAvailUpd))) + tmp_dd_sendctrl &= ~SYM_MASK(SendCtrl, SendBufAvailUpd); + + if (op == 0 || (op & SENDCTRL_COMMON_MODS)) { + qib_write_kreg(dd, kr_sendctrl, tmp_dd_sendctrl); + qib_write_kreg(dd, kr_scratch, 0); + } + + if (op == 0 || (op & SENDCTRL_PORT_MODS)) { + qib_write_kreg_port(ppd, krp_sendctrl, ppd->p_sendctrl); + qib_write_kreg(dd, kr_scratch, 0); + } + + if (op & QIB_SENDCTRL_AVAIL_BLIP) { + qib_write_kreg(dd, kr_sendctrl, dd->sendctrl); + qib_write_kreg(dd, kr_scratch, 0); + } + + spin_unlock_irqrestore(&dd->sendctrl_lock, flags); + + if (op & QIB_SENDCTRL_FLUSH) { + u32 v; + /* + * ensure writes have hit chip, then do a few + * more reads, to allow DMA of pioavail registers + * to occur, so in-memory copy is in sync with + * the chip. Not always safe to sleep. + */ + v = qib_read_kreg32(dd, kr_scratch); + qib_write_kreg(dd, kr_scratch, v); + v = qib_read_kreg32(dd, kr_scratch); + qib_write_kreg(dd, kr_scratch, v); + qib_read_kreg32(dd, kr_scratch); + } +} + +#define _PORT_VIRT_FLAG 0x8000U /* "virtual", need adjustments */ +#define _PORT_64BIT_FLAG 0x10000U /* not "virtual", but 64bit */ +#define _PORT_CNTR_IDXMASK 0x7fffU /* mask off flags above */ + +/** + * qib_portcntr_7322 - read a per-port chip counter + * @ppd: the qlogic_ib pport + * @creg: the counter to read (not a chip offset) + */ +static u64 qib_portcntr_7322(struct qib_pportdata *ppd, u32 reg) +{ + struct qib_devdata *dd = ppd->dd; + u64 ret = 0ULL; + u16 creg; + /* 0xffff for unimplemented or synthesized counters */ + static const u32 xlator[] = { + [QIBPORTCNTR_PKTSEND] = crp_pktsend | _PORT_64BIT_FLAG, + [QIBPORTCNTR_WORDSEND] = crp_wordsend | _PORT_64BIT_FLAG, + [QIBPORTCNTR_PSXMITDATA] = crp_psxmitdatacount, + [QIBPORTCNTR_PSXMITPKTS] = crp_psxmitpktscount, + [QIBPORTCNTR_PSXMITWAIT] = crp_psxmitwaitcount, + [QIBPORTCNTR_SENDSTALL] = crp_sendstall, + [QIBPORTCNTR_PKTRCV] = crp_pktrcv | _PORT_64BIT_FLAG, + [QIBPORTCNTR_PSRCVDATA] = crp_psrcvdatacount, + [QIBPORTCNTR_PSRCVPKTS] = crp_psrcvpktscount, + [QIBPORTCNTR_RCVEBP] = crp_rcvebp, + [QIBPORTCNTR_RCVOVFL] = crp_rcvovfl, + [QIBPORTCNTR_WORDRCV] = crp_wordrcv | _PORT_64BIT_FLAG, + [QIBPORTCNTR_RXDROPPKT] = 0xffff, /* not needed for 7322 */ + [QIBPORTCNTR_RXLOCALPHYERR] = crp_rxotherlocalphyerr, + [QIBPORTCNTR_RXVLERR] = crp_rxvlerr, + [QIBPORTCNTR_ERRICRC] = crp_erricrc, + [QIBPORTCNTR_ERRVCRC] = crp_errvcrc, + [QIBPORTCNTR_ERRLPCRC] = crp_errlpcrc, + [QIBPORTCNTR_BADFORMAT] = crp_badformat, + [QIBPORTCNTR_ERR_RLEN] = crp_err_rlen, + [QIBPORTCNTR_IBSYMBOLERR] = crp_ibsymbolerr, + [QIBPORTCNTR_INVALIDRLEN] = crp_invalidrlen, + [QIBPORTCNTR_UNSUPVL] = crp_txunsupvl, + [QIBPORTCNTR_EXCESSBUFOVFL] = crp_excessbufferovfl, + [QIBPORTCNTR_ERRLINK] = crp_errlink, + [QIBPORTCNTR_IBLINKDOWN] = crp_iblinkdown, + [QIBPORTCNTR_IBLINKERRRECOV] = crp_iblinkerrrecov, + [QIBPORTCNTR_LLI] = crp_locallinkintegrityerr, + [QIBPORTCNTR_VL15PKTDROP] = crp_vl15droppedpkt, + [QIBPORTCNTR_ERRPKEY] = crp_errpkey, + /* + * the next 3 aren't really counters, but were implemented + * as counters in older chips, so still get accessed as + * though they were counters from this code. + */ + [QIBPORTCNTR_PSINTERVAL] = krp_psinterval, + [QIBPORTCNTR_PSSTART] = krp_psstart, + [QIBPORTCNTR_PSSTAT] = krp_psstat, + /* pseudo-counter, summed for all ports */ + [QIBPORTCNTR_KHDROVFL] = 0xffff, + }; + + if (reg >= ARRAY_SIZE(xlator)) { + qib_devinfo(ppd->dd->pcidev, + "Unimplemented portcounter %u\n", reg); + goto done; + } + creg = xlator[reg] & _PORT_CNTR_IDXMASK; + + /* handle non-counters and special cases first */ + if (reg == QIBPORTCNTR_KHDROVFL) { + int i; + + /* sum over all kernel contexts (skip if mini_init) */ + for (i = 0; dd->rcd && i < dd->first_user_ctxt; i++) { + struct qib_ctxtdata *rcd = dd->rcd[i]; + + if (!rcd || rcd->ppd != ppd) + continue; + ret += read_7322_creg32(dd, cr_base_egrovfl + i); + } + goto done; + } else if (reg == QIBPORTCNTR_RXDROPPKT) { + /* + * Used as part of the synthesis of port_rcv_errors + * in the verbs code for IBTA counters. Not needed for 7322, + * because all the errors are already counted by other cntrs. + */ + goto done; + } else if (reg == QIBPORTCNTR_PSINTERVAL || + reg == QIBPORTCNTR_PSSTART || reg == QIBPORTCNTR_PSSTAT) { + /* were counters in older chips, now per-port kernel regs */ + ret = qib_read_kreg_port(ppd, creg); + goto done; + } + + /* + * Only fast increment counters are 64 bits; use 32 bit reads to + * avoid two independent reads when on Opteron. + */ + if (xlator[reg] & _PORT_64BIT_FLAG) + ret = read_7322_creg_port(ppd, creg); + else + ret = read_7322_creg32_port(ppd, creg); + if (creg == crp_ibsymbolerr) { + if (ppd->cpspec->ibdeltainprog) + ret -= ret - ppd->cpspec->ibsymsnap; + ret -= ppd->cpspec->ibsymdelta; + } else if (creg == crp_iblinkerrrecov) { + if (ppd->cpspec->ibdeltainprog) + ret -= ret - ppd->cpspec->iblnkerrsnap; + ret -= ppd->cpspec->iblnkerrdelta; + } else if (creg == crp_errlink) + ret -= ppd->cpspec->ibmalfdelta; + else if (creg == crp_iblinkdown) + ret += ppd->cpspec->iblnkdowndelta; +done: + return ret; +} + +/* + * Device counter names (not port-specific), one line per stat, + * single string. Used by utilities like ipathstats to print the stats + * in a way which works for different versions of drivers, without changing + * the utility. Names need to be 12 chars or less (w/o newline), for proper + * display by utility. + * Non-error counters are first. + * Start of "error" conters is indicated by a leading "E " on the first + * "error" counter, and doesn't count in label length. + * The EgrOvfl list needs to be last so we truncate them at the configured + * context count for the device. + * cntr7322indices contains the corresponding register indices. + */ +static const char cntr7322names[] = + "Interrupts\n" + "HostBusStall\n" + "E RxTIDFull\n" + "RxTIDInvalid\n" + "RxTIDFloDrop\n" /* 7322 only */ + "Ctxt0EgrOvfl\n" + "Ctxt1EgrOvfl\n" + "Ctxt2EgrOvfl\n" + "Ctxt3EgrOvfl\n" + "Ctxt4EgrOvfl\n" + "Ctxt5EgrOvfl\n" + "Ctxt6EgrOvfl\n" + "Ctxt7EgrOvfl\n" + "Ctxt8EgrOvfl\n" + "Ctxt9EgrOvfl\n" + "Ctx10EgrOvfl\n" + "Ctx11EgrOvfl\n" + "Ctx12EgrOvfl\n" + "Ctx13EgrOvfl\n" + "Ctx14EgrOvfl\n" + "Ctx15EgrOvfl\n" + "Ctx16EgrOvfl\n" + "Ctx17EgrOvfl\n" + ; + +static const u32 cntr7322indices[] = { + cr_lbint | _PORT_64BIT_FLAG, + cr_lbstall | _PORT_64BIT_FLAG, + cr_tidfull, + cr_tidinvalid, + cr_rxtidflowdrop, + cr_base_egrovfl + 0, + cr_base_egrovfl + 1, + cr_base_egrovfl + 2, + cr_base_egrovfl + 3, + cr_base_egrovfl + 4, + cr_base_egrovfl + 5, + cr_base_egrovfl + 6, + cr_base_egrovfl + 7, + cr_base_egrovfl + 8, + cr_base_egrovfl + 9, + cr_base_egrovfl + 10, + cr_base_egrovfl + 11, + cr_base_egrovfl + 12, + cr_base_egrovfl + 13, + cr_base_egrovfl + 14, + cr_base_egrovfl + 15, + cr_base_egrovfl + 16, + cr_base_egrovfl + 17, +}; + +/* + * same as cntr7322names and cntr7322indices, but for port-specific counters. + * portcntr7322indices is somewhat complicated by some registers needing + * adjustments of various kinds, and those are ORed with _PORT_VIRT_FLAG + */ +static const char portcntr7322names[] = + "TxPkt\n" + "TxFlowPkt\n" + "TxWords\n" + "RxPkt\n" + "RxFlowPkt\n" + "RxWords\n" + "TxFlowStall\n" + "TxDmaDesc\n" /* 7220 and 7322-only */ + "E RxDlidFltr\n" /* 7220 and 7322-only */ + "IBStatusChng\n" + "IBLinkDown\n" + "IBLnkRecov\n" + "IBRxLinkErr\n" + "IBSymbolErr\n" + "RxLLIErr\n" + "RxBadFormat\n" + "RxBadLen\n" + "RxBufOvrfl\n" + "RxEBP\n" + "RxFlowCtlErr\n" + "RxICRCerr\n" + "RxLPCRCerr\n" + "RxVCRCerr\n" + "RxInvalLen\n" + "RxInvalPKey\n" + "RxPktDropped\n" + "TxBadLength\n" + "TxDropped\n" + "TxInvalLen\n" + "TxUnderrun\n" + "TxUnsupVL\n" + "RxLclPhyErr\n" /* 7220 and 7322-only from here down */ + "RxVL15Drop\n" + "RxVlErr\n" + "XcessBufOvfl\n" + "RxQPBadCtxt\n" /* 7322-only from here down */ + "TXBadHeader\n" + ; + +static const u32 portcntr7322indices[] = { + QIBPORTCNTR_PKTSEND | _PORT_VIRT_FLAG, + crp_pktsendflow, + QIBPORTCNTR_WORDSEND | _PORT_VIRT_FLAG, + QIBPORTCNTR_PKTRCV | _PORT_VIRT_FLAG, + crp_pktrcvflowctrl, + QIBPORTCNTR_WORDRCV | _PORT_VIRT_FLAG, + QIBPORTCNTR_SENDSTALL | _PORT_VIRT_FLAG, + crp_txsdmadesc | _PORT_64BIT_FLAG, + crp_rxdlidfltr, + crp_ibstatuschange, + QIBPORTCNTR_IBLINKDOWN | _PORT_VIRT_FLAG, + QIBPORTCNTR_IBLINKERRRECOV | _PORT_VIRT_FLAG, + QIBPORTCNTR_ERRLINK | _PORT_VIRT_FLAG, + QIBPORTCNTR_IBSYMBOLERR | _PORT_VIRT_FLAG, + QIBPORTCNTR_LLI | _PORT_VIRT_FLAG, + QIBPORTCNTR_BADFORMAT | _PORT_VIRT_FLAG, + QIBPORTCNTR_ERR_RLEN | _PORT_VIRT_FLAG, + QIBPORTCNTR_RCVOVFL | _PORT_VIRT_FLAG, + QIBPORTCNTR_RCVEBP | _PORT_VIRT_FLAG, + crp_rcvflowctrlviol, + QIBPORTCNTR_ERRICRC | _PORT_VIRT_FLAG, + QIBPORTCNTR_ERRLPCRC | _PORT_VIRT_FLAG, + QIBPORTCNTR_ERRVCRC | _PORT_VIRT_FLAG, + QIBPORTCNTR_INVALIDRLEN | _PORT_VIRT_FLAG, + QIBPORTCNTR_ERRPKEY | _PORT_VIRT_FLAG, + QIBPORTCNTR_RXDROPPKT | _PORT_VIRT_FLAG, + crp_txminmaxlenerr, + crp_txdroppedpkt, + crp_txlenerr, + crp_txunderrun, + crp_txunsupvl, + QIBPORTCNTR_RXLOCALPHYERR | _PORT_VIRT_FLAG, + QIBPORTCNTR_VL15PKTDROP | _PORT_VIRT_FLAG, + QIBPORTCNTR_RXVLERR | _PORT_VIRT_FLAG, + QIBPORTCNTR_EXCESSBUFOVFL | _PORT_VIRT_FLAG, + crp_rxqpinvalidctxt, + crp_txhdrerr, +}; + +/* do all the setup to make the counter reads efficient later */ +static void init_7322_cntrnames(struct qib_devdata *dd) +{ + int i, j = 0; + char *s; + + for (i = 0, s = (char *)cntr7322names; s && j <= dd->cfgctxts; + i++) { + /* we always have at least one counter before the egrovfl */ + if (!j && !strncmp("Ctxt0EgrOvfl", s + 1, 12)) + j = 1; + s = strchr(s + 1, '\n'); + if (s && j) + j++; + } + dd->cspec->ncntrs = i; + if (!s) + /* full list; size is without terminating null */ + dd->cspec->cntrnamelen = sizeof(cntr7322names) - 1; + else + dd->cspec->cntrnamelen = 1 + s - cntr7322names; + dd->cspec->cntrs = kmalloc(dd->cspec->ncntrs + * sizeof(u64), GFP_KERNEL); + if (!dd->cspec->cntrs) + qib_dev_err(dd, "Failed allocation for counters\n"); + + for (i = 0, s = (char *)portcntr7322names; s; i++) + s = strchr(s + 1, '\n'); + dd->cspec->nportcntrs = i - 1; + dd->cspec->portcntrnamelen = sizeof(portcntr7322names) - 1; + for (i = 0; i < dd->num_pports; ++i) { + dd->pport[i].cpspec->portcntrs = kmalloc(dd->cspec->nportcntrs + * sizeof(u64), GFP_KERNEL); + if (!dd->pport[i].cpspec->portcntrs) + qib_dev_err(dd, + "Failed allocation for portcounters\n"); + } +} + +static u32 qib_read_7322cntrs(struct qib_devdata *dd, loff_t pos, char **namep, + u64 **cntrp) +{ + u32 ret; + + if (namep) { + ret = dd->cspec->cntrnamelen; + if (pos >= ret) + ret = 0; /* final read after getting everything */ + else + *namep = (char *) cntr7322names; + } else { + u64 *cntr = dd->cspec->cntrs; + int i; + + ret = dd->cspec->ncntrs * sizeof(u64); + if (!cntr || pos >= ret) { + /* everything read, or couldn't get memory */ + ret = 0; + goto done; + } + *cntrp = cntr; + for (i = 0; i < dd->cspec->ncntrs; i++) + if (cntr7322indices[i] & _PORT_64BIT_FLAG) + *cntr++ = read_7322_creg(dd, + cntr7322indices[i] & + _PORT_CNTR_IDXMASK); + else + *cntr++ = read_7322_creg32(dd, + cntr7322indices[i]); + } +done: + return ret; +} + +static u32 qib_read_7322portcntrs(struct qib_devdata *dd, loff_t pos, u32 port, + char **namep, u64 **cntrp) +{ + u32 ret; + + if (namep) { + ret = dd->cspec->portcntrnamelen; + if (pos >= ret) + ret = 0; /* final read after getting everything */ + else + *namep = (char *)portcntr7322names; + } else { + struct qib_pportdata *ppd = &dd->pport[port]; + u64 *cntr = ppd->cpspec->portcntrs; + int i; + + ret = dd->cspec->nportcntrs * sizeof(u64); + if (!cntr || pos >= ret) { + /* everything read, or couldn't get memory */ + ret = 0; + goto done; + } + *cntrp = cntr; + for (i = 0; i < dd->cspec->nportcntrs; i++) { + if (portcntr7322indices[i] & _PORT_VIRT_FLAG) + *cntr++ = qib_portcntr_7322(ppd, + portcntr7322indices[i] & + _PORT_CNTR_IDXMASK); + else if (portcntr7322indices[i] & _PORT_64BIT_FLAG) + *cntr++ = read_7322_creg_port(ppd, + portcntr7322indices[i] & + _PORT_CNTR_IDXMASK); + else + *cntr++ = read_7322_creg32_port(ppd, + portcntr7322indices[i]); + } + } +done: + return ret; +} + +/** + * qib_get_7322_faststats - get word counters from chip before they overflow + * @opaque - contains a pointer to the qlogic_ib device qib_devdata + * + * VESTIGIAL IBA7322 has no "small fast counters", so the only + * real purpose of this function is to maintain the notion of + * "active time", which in turn is only logged into the eeprom, + * which we don;t have, yet, for 7322-based boards. + * + * called from add_timer + */ +static void qib_get_7322_faststats(unsigned long opaque) +{ + struct qib_devdata *dd = (struct qib_devdata *) opaque; + struct qib_pportdata *ppd; + unsigned long flags; + u64 traffic_wds; + int pidx; + + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + ppd = dd->pport + pidx; + + /* + * If port isn't enabled or not operational ports, or + * diags is running (can cause memory diags to fail) + * skip this port this time. + */ + if (!ppd->link_speed_supported || !(dd->flags & QIB_INITTED) + || dd->diag_client) + continue; + + /* + * Maintain an activity timer, based on traffic + * exceeding a threshold, so we need to check the word-counts + * even if they are 64-bit. + */ + traffic_wds = qib_portcntr_7322(ppd, QIBPORTCNTR_WORDRCV) + + qib_portcntr_7322(ppd, QIBPORTCNTR_WORDSEND); + spin_lock_irqsave(&ppd->dd->eep_st_lock, flags); + traffic_wds -= ppd->dd->traffic_wds; + ppd->dd->traffic_wds += traffic_wds; + if (traffic_wds >= QIB_TRAFFIC_ACTIVE_THRESHOLD) + atomic_add(ACTIVITY_TIMER, &ppd->dd->active_time); + spin_unlock_irqrestore(&ppd->dd->eep_st_lock, flags); + if (ppd->cpspec->qdr_dfe_on && (ppd->link_speed_active & + QIB_IB_QDR) && + (ppd->lflags & (QIBL_LINKINIT | QIBL_LINKARMED | + QIBL_LINKACTIVE)) && + ppd->cpspec->qdr_dfe_time && + time_is_before_jiffies(ppd->cpspec->qdr_dfe_time)) { + ppd->cpspec->qdr_dfe_on = 0; + + qib_write_kreg_port(ppd, krp_static_adapt_dis(2), + ppd->dd->cspec->r1 ? + QDR_STATIC_ADAPT_INIT_R1 : + QDR_STATIC_ADAPT_INIT); + force_h1(ppd); + } + } + mod_timer(&dd->stats_timer, jiffies + HZ * ACTIVITY_TIMER); +} + +/* + * If we were using MSIx, try to fallback to INTx. + */ +static int qib_7322_intr_fallback(struct qib_devdata *dd) +{ + if (!dd->cspec->num_msix_entries) + return 0; /* already using INTx */ + + qib_devinfo(dd->pcidev, + "MSIx interrupt not detected, trying INTx interrupts\n"); + qib_7322_nomsix(dd); + qib_enable_intx(dd->pcidev); + qib_setup_7322_interrupt(dd, 0); + return 1; +} + +/* + * Reset the XGXS (between serdes and IBC). Slightly less intrusive + * than resetting the IBC or external link state, and useful in some + * cases to cause some retraining. To do this right, we reset IBC + * as well, then return to previous state (which may be still in reset) + * NOTE: some callers of this "know" this writes the current value + * of cpspec->ibcctrl_a as part of it's operation, so if that changes, + * check all callers. + */ +static void qib_7322_mini_pcs_reset(struct qib_pportdata *ppd) +{ + u64 val; + struct qib_devdata *dd = ppd->dd; + const u64 reset_bits = SYM_MASK(IBPCSConfig_0, xcv_rreset) | + SYM_MASK(IBPCSConfig_0, xcv_treset) | + SYM_MASK(IBPCSConfig_0, tx_rx_reset); + + val = qib_read_kreg_port(ppd, krp_ib_pcsconfig); + qib_write_kreg(dd, kr_hwerrmask, + dd->cspec->hwerrmask & ~HWE_MASK(statusValidNoEop)); + qib_write_kreg_port(ppd, krp_ibcctrl_a, + ppd->cpspec->ibcctrl_a & + ~SYM_MASK(IBCCtrlA_0, IBLinkEn)); + + qib_write_kreg_port(ppd, krp_ib_pcsconfig, val | reset_bits); + qib_read_kreg32(dd, kr_scratch); + qib_write_kreg_port(ppd, krp_ib_pcsconfig, val & ~reset_bits); + qib_write_kreg_port(ppd, krp_ibcctrl_a, ppd->cpspec->ibcctrl_a); + qib_write_kreg(dd, kr_scratch, 0ULL); + qib_write_kreg(dd, kr_hwerrclear, + SYM_MASK(HwErrClear, statusValidNoEopClear)); + qib_write_kreg(dd, kr_hwerrmask, dd->cspec->hwerrmask); +} + +/* + * This code for non-IBTA-compliant IB speed negotiation is only known to + * work for the SDR to DDR transition, and only between an HCA and a switch + * with recent firmware. It is based on observed heuristics, rather than + * actual knowledge of the non-compliant speed negotiation. + * It has a number of hard-coded fields, since the hope is to rewrite this + * when a spec is available on how the negoation is intended to work. + */ +static void autoneg_7322_sendpkt(struct qib_pportdata *ppd, u32 *hdr, + u32 dcnt, u32 *data) +{ + int i; + u64 pbc; + u32 __iomem *piobuf; + u32 pnum, control, len; + struct qib_devdata *dd = ppd->dd; + + i = 0; + len = 7 + dcnt + 1; /* 7 dword header, dword data, icrc */ + control = qib_7322_setpbc_control(ppd, len, 0, 15); + pbc = ((u64) control << 32) | len; + while (!(piobuf = qib_7322_getsendbuf(ppd, pbc, &pnum))) { + if (i++ > 15) + return; + udelay(2); + } + /* disable header check on this packet, since it can't be valid */ + dd->f_txchk_change(dd, pnum, 1, TXCHK_CHG_TYPE_DIS1, NULL); + writeq(pbc, piobuf); + qib_flush_wc(); + qib_pio_copy(piobuf + 2, hdr, 7); + qib_pio_copy(piobuf + 9, data, dcnt); + if (dd->flags & QIB_USE_SPCL_TRIG) { + u32 spcl_off = (pnum >= dd->piobcnt2k) ? 2047 : 1023; + + qib_flush_wc(); + __raw_writel(0xaebecede, piobuf + spcl_off); + } + qib_flush_wc(); + qib_sendbuf_done(dd, pnum); + /* and re-enable hdr check */ + dd->f_txchk_change(dd, pnum, 1, TXCHK_CHG_TYPE_ENAB1, NULL); +} + +/* + * _start packet gets sent twice at start, _done gets sent twice at end + */ +static void qib_autoneg_7322_send(struct qib_pportdata *ppd, int which) +{ + struct qib_devdata *dd = ppd->dd; + static u32 swapped; + u32 dw, i, hcnt, dcnt, *data; + static u32 hdr[7] = { 0xf002ffff, 0x48ffff, 0x6400abba }; + static u32 madpayload_start[0x40] = { + 0x1810103, 0x1, 0x0, 0x0, 0x2c90000, 0x2c9, 0x0, 0x0, + 0xffffffff, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x1388, 0x15e, 0x1, /* rest 0's */ + }; + static u32 madpayload_done[0x40] = { + 0x1810103, 0x1, 0x0, 0x0, 0x2c90000, 0x2c9, 0x0, 0x0, + 0xffffffff, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x40000001, 0x1388, 0x15e, /* rest 0's */ + }; + + dcnt = ARRAY_SIZE(madpayload_start); + hcnt = ARRAY_SIZE(hdr); + if (!swapped) { + /* for maintainability, do it at runtime */ + for (i = 0; i < hcnt; i++) { + dw = (__force u32) cpu_to_be32(hdr[i]); + hdr[i] = dw; + } + for (i = 0; i < dcnt; i++) { + dw = (__force u32) cpu_to_be32(madpayload_start[i]); + madpayload_start[i] = dw; + dw = (__force u32) cpu_to_be32(madpayload_done[i]); + madpayload_done[i] = dw; + } + swapped = 1; + } + + data = which ? madpayload_done : madpayload_start; + + autoneg_7322_sendpkt(ppd, hdr, dcnt, data); + qib_read_kreg64(dd, kr_scratch); + udelay(2); + autoneg_7322_sendpkt(ppd, hdr, dcnt, data); + qib_read_kreg64(dd, kr_scratch); + udelay(2); +} + +/* + * Do the absolute minimum to cause an IB speed change, and make it + * ready, but don't actually trigger the change. The caller will + * do that when ready (if link is in Polling training state, it will + * happen immediately, otherwise when link next goes down) + * + * This routine should only be used as part of the DDR autonegotation + * code for devices that are not compliant with IB 1.2 (or code that + * fixes things up for same). + * + * When link has gone down, and autoneg enabled, or autoneg has + * failed and we give up until next time we set both speeds, and + * then we want IBTA enabled as well as "use max enabled speed. + */ +static void set_7322_ibspeed_fast(struct qib_pportdata *ppd, u32 speed) +{ + u64 newctrlb; + newctrlb = ppd->cpspec->ibcctrl_b & ~(IBA7322_IBC_SPEED_MASK | + IBA7322_IBC_IBTA_1_2_MASK | + IBA7322_IBC_MAX_SPEED_MASK); + + if (speed & (speed - 1)) /* multiple speeds */ + newctrlb |= (speed << IBA7322_IBC_SPEED_LSB) | + IBA7322_IBC_IBTA_1_2_MASK | + IBA7322_IBC_MAX_SPEED_MASK; + else + newctrlb |= speed == QIB_IB_QDR ? + IBA7322_IBC_SPEED_QDR | IBA7322_IBC_IBTA_1_2_MASK : + ((speed == QIB_IB_DDR ? + IBA7322_IBC_SPEED_DDR : IBA7322_IBC_SPEED_SDR)); + + if (newctrlb == ppd->cpspec->ibcctrl_b) + return; + + ppd->cpspec->ibcctrl_b = newctrlb; + qib_write_kreg_port(ppd, krp_ibcctrl_b, ppd->cpspec->ibcctrl_b); + qib_write_kreg(ppd->dd, kr_scratch, 0); +} + +/* + * This routine is only used when we are not talking to another + * IB 1.2-compliant device that we think can do DDR. + * (This includes all existing switch chips as of Oct 2007.) + * 1.2-compliant devices go directly to DDR prior to reaching INIT + */ +static void try_7322_autoneg(struct qib_pportdata *ppd) +{ + unsigned long flags; + + spin_lock_irqsave(&ppd->lflags_lock, flags); + ppd->lflags |= QIBL_IB_AUTONEG_INPROG; + spin_unlock_irqrestore(&ppd->lflags_lock, flags); + qib_autoneg_7322_send(ppd, 0); + set_7322_ibspeed_fast(ppd, QIB_IB_DDR); + qib_7322_mini_pcs_reset(ppd); + /* 2 msec is minimum length of a poll cycle */ + queue_delayed_work(ib_wq, &ppd->cpspec->autoneg_work, + msecs_to_jiffies(2)); +} + +/* + * Handle the empirically determined mechanism for auto-negotiation + * of DDR speed with switches. + */ +static void autoneg_7322_work(struct work_struct *work) +{ + struct qib_pportdata *ppd; + struct qib_devdata *dd; + u64 startms; + u32 i; + unsigned long flags; + + ppd = container_of(work, struct qib_chippport_specific, + autoneg_work.work)->ppd; + dd = ppd->dd; + + startms = jiffies_to_msecs(jiffies); + + /* + * Busy wait for this first part, it should be at most a + * few hundred usec, since we scheduled ourselves for 2msec. + */ + for (i = 0; i < 25; i++) { + if (SYM_FIELD(ppd->lastibcstat, IBCStatusA_0, LinkState) + == IB_7322_LT_STATE_POLLQUIET) { + qib_set_linkstate(ppd, QIB_IB_LINKDOWN_DISABLE); + break; + } + udelay(100); + } + + if (!(ppd->lflags & QIBL_IB_AUTONEG_INPROG)) + goto done; /* we got there early or told to stop */ + + /* we expect this to timeout */ + if (wait_event_timeout(ppd->cpspec->autoneg_wait, + !(ppd->lflags & QIBL_IB_AUTONEG_INPROG), + msecs_to_jiffies(90))) + goto done; + qib_7322_mini_pcs_reset(ppd); + + /* we expect this to timeout */ + if (wait_event_timeout(ppd->cpspec->autoneg_wait, + !(ppd->lflags & QIBL_IB_AUTONEG_INPROG), + msecs_to_jiffies(1700))) + goto done; + qib_7322_mini_pcs_reset(ppd); + + set_7322_ibspeed_fast(ppd, QIB_IB_SDR); + + /* + * Wait up to 250 msec for link to train and get to INIT at DDR; + * this should terminate early. + */ + wait_event_timeout(ppd->cpspec->autoneg_wait, + !(ppd->lflags & QIBL_IB_AUTONEG_INPROG), + msecs_to_jiffies(250)); +done: + if (ppd->lflags & QIBL_IB_AUTONEG_INPROG) { + spin_lock_irqsave(&ppd->lflags_lock, flags); + ppd->lflags &= ~QIBL_IB_AUTONEG_INPROG; + if (ppd->cpspec->autoneg_tries == AUTONEG_TRIES) { + ppd->lflags |= QIBL_IB_AUTONEG_FAILED; + ppd->cpspec->autoneg_tries = 0; + } + spin_unlock_irqrestore(&ppd->lflags_lock, flags); + set_7322_ibspeed_fast(ppd, ppd->link_speed_enabled); + } +} + +/* + * This routine is used to request IPG set in the QLogic switch. + * Only called if r1. + */ +static void try_7322_ipg(struct qib_pportdata *ppd) +{ + struct qib_ibport *ibp = &ppd->ibport_data; + struct ib_mad_send_buf *send_buf; + struct ib_mad_agent *agent; + struct ib_smp *smp; + unsigned delay; + int ret; + + agent = ibp->send_agent; + if (!agent) + goto retry; + + send_buf = ib_create_send_mad(agent, 0, 0, 0, IB_MGMT_MAD_HDR, + IB_MGMT_MAD_DATA, GFP_ATOMIC); + if (IS_ERR(send_buf)) + goto retry; + + if (!ibp->smi_ah) { + struct ib_ah *ah; + + ah = qib_create_qp0_ah(ibp, be16_to_cpu(IB_LID_PERMISSIVE)); + if (IS_ERR(ah)) + ret = PTR_ERR(ah); + else { + send_buf->ah = ah; + ibp->smi_ah = to_iah(ah); + ret = 0; + } + } else { + send_buf->ah = &ibp->smi_ah->ibah; + ret = 0; + } + + smp = send_buf->mad; + smp->base_version = IB_MGMT_BASE_VERSION; + smp->mgmt_class = IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE; + smp->class_version = 1; + smp->method = IB_MGMT_METHOD_SEND; + smp->hop_cnt = 1; + smp->attr_id = QIB_VENDOR_IPG; + smp->attr_mod = 0; + + if (!ret) + ret = ib_post_send_mad(send_buf, NULL); + if (ret) + ib_free_send_mad(send_buf); +retry: + delay = 2 << ppd->cpspec->ipg_tries; + queue_delayed_work(ib_wq, &ppd->cpspec->ipg_work, + msecs_to_jiffies(delay)); +} + +/* + * Timeout handler for setting IPG. + * Only called if r1. + */ +static void ipg_7322_work(struct work_struct *work) +{ + struct qib_pportdata *ppd; + + ppd = container_of(work, struct qib_chippport_specific, + ipg_work.work)->ppd; + if ((ppd->lflags & (QIBL_LINKINIT | QIBL_LINKARMED | QIBL_LINKACTIVE)) + && ++ppd->cpspec->ipg_tries <= 10) + try_7322_ipg(ppd); +} + +static u32 qib_7322_iblink_state(u64 ibcs) +{ + u32 state = (u32)SYM_FIELD(ibcs, IBCStatusA_0, LinkState); + + switch (state) { + case IB_7322_L_STATE_INIT: + state = IB_PORT_INIT; + break; + case IB_7322_L_STATE_ARM: + state = IB_PORT_ARMED; + break; + case IB_7322_L_STATE_ACTIVE: + /* fall through */ + case IB_7322_L_STATE_ACT_DEFER: + state = IB_PORT_ACTIVE; + break; + default: /* fall through */ + case IB_7322_L_STATE_DOWN: + state = IB_PORT_DOWN; + break; + } + return state; +} + +/* returns the IBTA port state, rather than the IBC link training state */ +static u8 qib_7322_phys_portstate(u64 ibcs) +{ + u8 state = (u8)SYM_FIELD(ibcs, IBCStatusA_0, LinkTrainingState); + return qib_7322_physportstate[state]; +} + +static int qib_7322_ib_updown(struct qib_pportdata *ppd, int ibup, u64 ibcs) +{ + int ret = 0, symadj = 0; + unsigned long flags; + int mult; + + spin_lock_irqsave(&ppd->lflags_lock, flags); + ppd->lflags &= ~QIBL_IB_FORCE_NOTIFY; + spin_unlock_irqrestore(&ppd->lflags_lock, flags); + + /* Update our picture of width and speed from chip */ + if (ibcs & SYM_MASK(IBCStatusA_0, LinkSpeedQDR)) { + ppd->link_speed_active = QIB_IB_QDR; + mult = 4; + } else if (ibcs & SYM_MASK(IBCStatusA_0, LinkSpeedActive)) { + ppd->link_speed_active = QIB_IB_DDR; + mult = 2; + } else { + ppd->link_speed_active = QIB_IB_SDR; + mult = 1; + } + if (ibcs & SYM_MASK(IBCStatusA_0, LinkWidthActive)) { + ppd->link_width_active = IB_WIDTH_4X; + mult *= 4; + } else + ppd->link_width_active = IB_WIDTH_1X; + ppd->delay_mult = ib_rate_to_delay[mult_to_ib_rate(mult)]; + + if (!ibup) { + u64 clr; + + /* Link went down. */ + /* do IPG MAD again after linkdown, even if last time failed */ + ppd->cpspec->ipg_tries = 0; + clr = qib_read_kreg_port(ppd, krp_ibcstatus_b) & + (SYM_MASK(IBCStatusB_0, heartbeat_timed_out) | + SYM_MASK(IBCStatusB_0, heartbeat_crosstalk)); + if (clr) + qib_write_kreg_port(ppd, krp_ibcstatus_b, clr); + if (!(ppd->lflags & (QIBL_IB_AUTONEG_FAILED | + QIBL_IB_AUTONEG_INPROG))) + set_7322_ibspeed_fast(ppd, ppd->link_speed_enabled); + if (!(ppd->lflags & QIBL_IB_AUTONEG_INPROG)) { + struct qib_qsfp_data *qd = + &ppd->cpspec->qsfp_data; + /* unlock the Tx settings, speed may change */ + qib_write_kreg_port(ppd, krp_tx_deemph_override, + SYM_MASK(IBSD_TX_DEEMPHASIS_OVERRIDE_0, + reset_tx_deemphasis_override)); + qib_cancel_sends(ppd); + /* on link down, ensure sane pcs state */ + qib_7322_mini_pcs_reset(ppd); + /* schedule the qsfp refresh which should turn the link + off */ + if (ppd->dd->flags & QIB_HAS_QSFP) { + qd->t_insert = jiffies; + queue_work(ib_wq, &qd->work); + } + spin_lock_irqsave(&ppd->sdma_lock, flags); + if (__qib_sdma_running(ppd)) + __qib_sdma_process_event(ppd, + qib_sdma_event_e70_go_idle); + spin_unlock_irqrestore(&ppd->sdma_lock, flags); + } + clr = read_7322_creg32_port(ppd, crp_iblinkdown); + if (clr == ppd->cpspec->iblnkdownsnap) + ppd->cpspec->iblnkdowndelta++; + } else { + if (qib_compat_ddr_negotiate && + !(ppd->lflags & (QIBL_IB_AUTONEG_FAILED | + QIBL_IB_AUTONEG_INPROG)) && + ppd->link_speed_active == QIB_IB_SDR && + (ppd->link_speed_enabled & QIB_IB_DDR) + && ppd->cpspec->autoneg_tries < AUTONEG_TRIES) { + /* we are SDR, and auto-negotiation enabled */ + ++ppd->cpspec->autoneg_tries; + if (!ppd->cpspec->ibdeltainprog) { + ppd->cpspec->ibdeltainprog = 1; + ppd->cpspec->ibsymdelta += + read_7322_creg32_port(ppd, + crp_ibsymbolerr) - + ppd->cpspec->ibsymsnap; + ppd->cpspec->iblnkerrdelta += + read_7322_creg32_port(ppd, + crp_iblinkerrrecov) - + ppd->cpspec->iblnkerrsnap; + } + try_7322_autoneg(ppd); + ret = 1; /* no other IB status change processing */ + } else if ((ppd->lflags & QIBL_IB_AUTONEG_INPROG) && + ppd->link_speed_active == QIB_IB_SDR) { + qib_autoneg_7322_send(ppd, 1); + set_7322_ibspeed_fast(ppd, QIB_IB_DDR); + qib_7322_mini_pcs_reset(ppd); + udelay(2); + ret = 1; /* no other IB status change processing */ + } else if ((ppd->lflags & QIBL_IB_AUTONEG_INPROG) && + (ppd->link_speed_active & QIB_IB_DDR)) { + spin_lock_irqsave(&ppd->lflags_lock, flags); + ppd->lflags &= ~(QIBL_IB_AUTONEG_INPROG | + QIBL_IB_AUTONEG_FAILED); + spin_unlock_irqrestore(&ppd->lflags_lock, flags); + ppd->cpspec->autoneg_tries = 0; + /* re-enable SDR, for next link down */ + set_7322_ibspeed_fast(ppd, ppd->link_speed_enabled); + wake_up(&ppd->cpspec->autoneg_wait); + symadj = 1; + } else if (ppd->lflags & QIBL_IB_AUTONEG_FAILED) { + /* + * Clear autoneg failure flag, and do setup + * so we'll try next time link goes down and + * back to INIT (possibly connected to a + * different device). + */ + spin_lock_irqsave(&ppd->lflags_lock, flags); + ppd->lflags &= ~QIBL_IB_AUTONEG_FAILED; + spin_unlock_irqrestore(&ppd->lflags_lock, flags); + ppd->cpspec->ibcctrl_b |= IBA7322_IBC_IBTA_1_2_MASK; + symadj = 1; + } + if (!(ppd->lflags & QIBL_IB_AUTONEG_INPROG)) { + symadj = 1; + if (ppd->dd->cspec->r1 && ppd->cpspec->ipg_tries <= 10) + try_7322_ipg(ppd); + if (!ppd->cpspec->recovery_init) + setup_7322_link_recovery(ppd, 0); + ppd->cpspec->qdr_dfe_time = jiffies + + msecs_to_jiffies(QDR_DFE_DISABLE_DELAY); + } + ppd->cpspec->ibmalfusesnap = 0; + ppd->cpspec->ibmalfsnap = read_7322_creg32_port(ppd, + crp_errlink); + } + if (symadj) { + ppd->cpspec->iblnkdownsnap = + read_7322_creg32_port(ppd, crp_iblinkdown); + if (ppd->cpspec->ibdeltainprog) { + ppd->cpspec->ibdeltainprog = 0; + ppd->cpspec->ibsymdelta += read_7322_creg32_port(ppd, + crp_ibsymbolerr) - ppd->cpspec->ibsymsnap; + ppd->cpspec->iblnkerrdelta += read_7322_creg32_port(ppd, + crp_iblinkerrrecov) - ppd->cpspec->iblnkerrsnap; + } + } else if (!ibup && qib_compat_ddr_negotiate && + !ppd->cpspec->ibdeltainprog && + !(ppd->lflags & QIBL_IB_AUTONEG_INPROG)) { + ppd->cpspec->ibdeltainprog = 1; + ppd->cpspec->ibsymsnap = read_7322_creg32_port(ppd, + crp_ibsymbolerr); + ppd->cpspec->iblnkerrsnap = read_7322_creg32_port(ppd, + crp_iblinkerrrecov); + } + + if (!ret) + qib_setup_7322_setextled(ppd, ibup); + return ret; +} + +/* + * Does read/modify/write to appropriate registers to + * set output and direction bits selected by mask. + * these are in their canonical postions (e.g. lsb of + * dir will end up in D48 of extctrl on existing chips). + * returns contents of GP Inputs. + */ +static int gpio_7322_mod(struct qib_devdata *dd, u32 out, u32 dir, u32 mask) +{ + u64 read_val, new_out; + unsigned long flags; + + if (mask) { + /* some bits being written, lock access to GPIO */ + dir &= mask; + out &= mask; + spin_lock_irqsave(&dd->cspec->gpio_lock, flags); + dd->cspec->extctrl &= ~((u64)mask << SYM_LSB(EXTCtrl, GPIOOe)); + dd->cspec->extctrl |= ((u64) dir << SYM_LSB(EXTCtrl, GPIOOe)); + new_out = (dd->cspec->gpio_out & ~mask) | out; + + qib_write_kreg(dd, kr_extctrl, dd->cspec->extctrl); + qib_write_kreg(dd, kr_gpio_out, new_out); + dd->cspec->gpio_out = new_out; + spin_unlock_irqrestore(&dd->cspec->gpio_lock, flags); + } + /* + * It is unlikely that a read at this time would get valid + * data on a pin whose direction line was set in the same + * call to this function. We include the read here because + * that allows us to potentially combine a change on one pin with + * a read on another, and because the old code did something like + * this. + */ + read_val = qib_read_kreg64(dd, kr_extstatus); + return SYM_FIELD(read_val, EXTStatus, GPIOIn); +} + +/* Enable writes to config EEPROM, if possible. Returns previous state */ +static int qib_7322_eeprom_wen(struct qib_devdata *dd, int wen) +{ + int prev_wen; + u32 mask; + + mask = 1 << QIB_EEPROM_WEN_NUM; + prev_wen = ~gpio_7322_mod(dd, 0, 0, 0) >> QIB_EEPROM_WEN_NUM; + gpio_7322_mod(dd, wen ? 0 : mask, mask, mask); + + return prev_wen & 1; +} + +/* + * Read fundamental info we need to use the chip. These are + * the registers that describe chip capabilities, and are + * saved in shadow registers. + */ +static void get_7322_chip_params(struct qib_devdata *dd) +{ + u64 val; + u32 piobufs; + int mtu; + + dd->palign = qib_read_kreg32(dd, kr_pagealign); + + dd->uregbase = qib_read_kreg32(dd, kr_userregbase); + + dd->rcvtidcnt = qib_read_kreg32(dd, kr_rcvtidcnt); + dd->rcvtidbase = qib_read_kreg32(dd, kr_rcvtidbase); + dd->rcvegrbase = qib_read_kreg32(dd, kr_rcvegrbase); + dd->piobufbase = qib_read_kreg64(dd, kr_sendpiobufbase); + dd->pio2k_bufbase = dd->piobufbase & 0xffffffff; + + val = qib_read_kreg64(dd, kr_sendpiobufcnt); + dd->piobcnt2k = val & ~0U; + dd->piobcnt4k = val >> 32; + val = qib_read_kreg64(dd, kr_sendpiosize); + dd->piosize2k = val & ~0U; + dd->piosize4k = val >> 32; + + mtu = ib_mtu_enum_to_int(qib_ibmtu); + if (mtu == -1) + mtu = QIB_DEFAULT_MTU; + dd->pport[0].ibmtu = (u32)mtu; + dd->pport[1].ibmtu = (u32)mtu; + + /* these may be adjusted in init_chip_wc_pat() */ + dd->pio2kbase = (u32 __iomem *) + ((char __iomem *) dd->kregbase + dd->pio2k_bufbase); + dd->pio4kbase = (u32 __iomem *) + ((char __iomem *) dd->kregbase + + (dd->piobufbase >> 32)); + /* + * 4K buffers take 2 pages; we use roundup just to be + * paranoid; we calculate it once here, rather than on + * ever buf allocate + */ + dd->align4k = ALIGN(dd->piosize4k, dd->palign); + + piobufs = dd->piobcnt4k + dd->piobcnt2k + NUM_VL15_BUFS; + + dd->pioavregs = ALIGN(piobufs, sizeof(u64) * BITS_PER_BYTE / 2) / + (sizeof(u64) * BITS_PER_BYTE / 2); +} + +/* + * The chip base addresses in cspec and cpspec have to be set + * after possible init_chip_wc_pat(), rather than in + * get_7322_chip_params(), so split out as separate function + */ +static void qib_7322_set_baseaddrs(struct qib_devdata *dd) +{ + u32 cregbase; + cregbase = qib_read_kreg32(dd, kr_counterregbase); + + dd->cspec->cregbase = (u64 __iomem *)(cregbase + + (char __iomem *)dd->kregbase); + + dd->egrtidbase = (u64 __iomem *) + ((char __iomem *) dd->kregbase + dd->rcvegrbase); + + /* port registers are defined as relative to base of chip */ + dd->pport[0].cpspec->kpregbase = + (u64 __iomem *)((char __iomem *)dd->kregbase); + dd->pport[1].cpspec->kpregbase = + (u64 __iomem *)(dd->palign + + (char __iomem *)dd->kregbase); + dd->pport[0].cpspec->cpregbase = + (u64 __iomem *)(qib_read_kreg_port(&dd->pport[0], + kr_counterregbase) + (char __iomem *)dd->kregbase); + dd->pport[1].cpspec->cpregbase = + (u64 __iomem *)(qib_read_kreg_port(&dd->pport[1], + kr_counterregbase) + (char __iomem *)dd->kregbase); +} + +/* + * This is a fairly special-purpose observer, so we only support + * the port-specific parts of SendCtrl + */ + +#define SENDCTRL_SHADOWED (SYM_MASK(SendCtrl_0, SendEnable) | \ + SYM_MASK(SendCtrl_0, SDmaEnable) | \ + SYM_MASK(SendCtrl_0, SDmaIntEnable) | \ + SYM_MASK(SendCtrl_0, SDmaSingleDescriptor) | \ + SYM_MASK(SendCtrl_0, SDmaHalt) | \ + SYM_MASK(SendCtrl_0, IBVLArbiterEn) | \ + SYM_MASK(SendCtrl_0, ForceCreditUpToDate)) + +static int sendctrl_hook(struct qib_devdata *dd, + const struct diag_observer *op, u32 offs, + u64 *data, u64 mask, int only_32) +{ + unsigned long flags; + unsigned idx; + unsigned pidx; + struct qib_pportdata *ppd = NULL; + u64 local_data, all_bits; + + /* + * The fixed correspondence between Physical ports and pports is + * severed. We need to hunt for the ppd that corresponds + * to the offset we got. And we have to do that without admitting + * we know the stride, apparently. + */ + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + u64 __iomem *psptr; + u32 psoffs; + + ppd = dd->pport + pidx; + if (!ppd->cpspec->kpregbase) + continue; + + psptr = ppd->cpspec->kpregbase + krp_sendctrl; + psoffs = (u32) (psptr - dd->kregbase) * sizeof(*psptr); + if (psoffs == offs) + break; + } + + /* If pport is not being managed by driver, just avoid shadows. */ + if (pidx >= dd->num_pports) + ppd = NULL; + + /* In any case, "idx" is flat index in kreg space */ + idx = offs / sizeof(u64); + + all_bits = ~0ULL; + if (only_32) + all_bits >>= 32; + + spin_lock_irqsave(&dd->sendctrl_lock, flags); + if (!ppd || (mask & all_bits) != all_bits) { + /* + * At least some mask bits are zero, so we need + * to read. The judgement call is whether from + * reg or shadow. First-cut: read reg, and complain + * if any bits which should be shadowed are different + * from their shadowed value. + */ + if (only_32) + local_data = (u64)qib_read_kreg32(dd, idx); + else + local_data = qib_read_kreg64(dd, idx); + *data = (local_data & ~mask) | (*data & mask); + } + if (mask) { + /* + * At least some mask bits are one, so we need + * to write, but only shadow some bits. + */ + u64 sval, tval; /* Shadowed, transient */ + + /* + * New shadow val is bits we don't want to touch, + * ORed with bits we do, that are intended for shadow. + */ + if (ppd) { + sval = ppd->p_sendctrl & ~mask; + sval |= *data & SENDCTRL_SHADOWED & mask; + ppd->p_sendctrl = sval; + } else + sval = *data & SENDCTRL_SHADOWED & mask; + tval = sval | (*data & ~SENDCTRL_SHADOWED & mask); + qib_write_kreg(dd, idx, tval); + qib_write_kreg(dd, kr_scratch, 0Ull); + } + spin_unlock_irqrestore(&dd->sendctrl_lock, flags); + return only_32 ? 4 : 8; +} + +static const struct diag_observer sendctrl_0_observer = { + sendctrl_hook, KREG_IDX(SendCtrl_0) * sizeof(u64), + KREG_IDX(SendCtrl_0) * sizeof(u64) +}; + +static const struct diag_observer sendctrl_1_observer = { + sendctrl_hook, KREG_IDX(SendCtrl_1) * sizeof(u64), + KREG_IDX(SendCtrl_1) * sizeof(u64) +}; + +static ushort sdma_fetch_prio = 8; +module_param_named(sdma_fetch_prio, sdma_fetch_prio, ushort, S_IRUGO); +MODULE_PARM_DESC(sdma_fetch_prio, "SDMA descriptor fetch priority"); + +/* Besides logging QSFP events, we set appropriate TxDDS values */ +static void init_txdds_table(struct qib_pportdata *ppd, int override); + +static void qsfp_7322_event(struct work_struct *work) +{ + struct qib_qsfp_data *qd; + struct qib_pportdata *ppd; + unsigned long pwrup; + unsigned long flags; + int ret; + u32 le2; + + qd = container_of(work, struct qib_qsfp_data, work); + ppd = qd->ppd; + pwrup = qd->t_insert + + msecs_to_jiffies(QSFP_PWR_LAG_MSEC - QSFP_MODPRS_LAG_MSEC); + + /* Delay for 20 msecs to allow ModPrs resistor to setup */ + mdelay(QSFP_MODPRS_LAG_MSEC); + + if (!qib_qsfp_mod_present(ppd)) { + ppd->cpspec->qsfp_data.modpresent = 0; + /* Set the physical link to disabled */ + qib_set_ib_7322_lstate(ppd, 0, + QLOGIC_IB_IBCC_LINKINITCMD_DISABLE); + spin_lock_irqsave(&ppd->lflags_lock, flags); + ppd->lflags &= ~QIBL_LINKV; + spin_unlock_irqrestore(&ppd->lflags_lock, flags); + } else { + /* + * Some QSFP's not only do not respond until the full power-up + * time, but may behave badly if we try. So hold off responding + * to insertion. + */ + while (1) { + if (time_is_before_jiffies(pwrup)) + break; + msleep(20); + } + + ret = qib_refresh_qsfp_cache(ppd, &qd->cache); + + /* + * Need to change LE2 back to defaults if we couldn't + * read the cable type (to handle cable swaps), so do this + * even on failure to read cable information. We don't + * get here for QME, so IS_QME check not needed here. + */ + if (!ret && !ppd->dd->cspec->r1) { + if (QSFP_IS_ACTIVE_FAR(qd->cache.tech)) + le2 = LE2_QME; + else if (qd->cache.atten[1] >= qib_long_atten && + QSFP_IS_CU(qd->cache.tech)) + le2 = LE2_5m; + else + le2 = LE2_DEFAULT; + } else + le2 = LE2_DEFAULT; + ibsd_wr_allchans(ppd, 13, (le2 << 7), BMASK(9, 7)); + /* + * We always change parameteters, since we can choose + * values for cables without eeproms, and the cable may have + * changed from a cable with full or partial eeprom content + * to one with partial or no content. + */ + init_txdds_table(ppd, 0); + /* The physical link is being re-enabled only when the + * previous state was DISABLED and the VALID bit is not + * set. This should only happen when the cable has been + * physically pulled. */ + if (!ppd->cpspec->qsfp_data.modpresent && + (ppd->lflags & (QIBL_LINKV | QIBL_IB_LINK_DISABLED))) { + ppd->cpspec->qsfp_data.modpresent = 1; + qib_set_ib_7322_lstate(ppd, 0, + QLOGIC_IB_IBCC_LINKINITCMD_SLEEP); + spin_lock_irqsave(&ppd->lflags_lock, flags); + ppd->lflags |= QIBL_LINKV; + spin_unlock_irqrestore(&ppd->lflags_lock, flags); + } + } +} + +/* + * There is little we can do but complain to the user if QSFP + * initialization fails. + */ +static void qib_init_7322_qsfp(struct qib_pportdata *ppd) +{ + unsigned long flags; + struct qib_qsfp_data *qd = &ppd->cpspec->qsfp_data; + struct qib_devdata *dd = ppd->dd; + u64 mod_prs_bit = QSFP_GPIO_MOD_PRS_N; + + mod_prs_bit <<= (QSFP_GPIO_PORT2_SHIFT * ppd->hw_pidx); + qd->ppd = ppd; + qib_qsfp_init(qd, qsfp_7322_event); + spin_lock_irqsave(&dd->cspec->gpio_lock, flags); + dd->cspec->extctrl |= (mod_prs_bit << SYM_LSB(EXTCtrl, GPIOInvert)); + dd->cspec->gpio_mask |= mod_prs_bit; + qib_write_kreg(dd, kr_extctrl, dd->cspec->extctrl); + qib_write_kreg(dd, kr_gpio_mask, dd->cspec->gpio_mask); + spin_unlock_irqrestore(&dd->cspec->gpio_lock, flags); +} + +/* + * called at device initialization time, and also if the txselect + * module parameter is changed. This is used for cables that don't + * have valid QSFP EEPROMs (not present, or attenuation is zero). + * We initialize to the default, then if there is a specific + * unit,port match, we use that (and set it immediately, for the + * current speed, if the link is at INIT or better). + * String format is "default# unit#,port#=# ... u,p=#", separators must + * be a SPACE character. A newline terminates. The u,p=# tuples may + * optionally have "u,p=#,#", where the final # is the H1 value + * The last specific match is used (actually, all are used, but last + * one is the one that winds up set); if none at all, fall back on default. + */ +static void set_no_qsfp_atten(struct qib_devdata *dd, int change) +{ + char *nxt, *str; + u32 pidx, unit, port, deflt, h1; + unsigned long val; + int any = 0, seth1; + int txdds_size; + + str = txselect_list; + + /* default number is validated in setup_txselect() */ + deflt = simple_strtoul(str, &nxt, 0); + for (pidx = 0; pidx < dd->num_pports; ++pidx) + dd->pport[pidx].cpspec->no_eep = deflt; + + txdds_size = TXDDS_TABLE_SZ + TXDDS_EXTRA_SZ; + if (IS_QME(dd) || IS_QMH(dd)) + txdds_size += TXDDS_MFG_SZ; + + while (*nxt && nxt[1]) { + str = ++nxt; + unit = simple_strtoul(str, &nxt, 0); + if (nxt == str || !*nxt || *nxt != ',') { + while (*nxt && *nxt++ != ' ') /* skip to next, if any */ + ; + continue; + } + str = ++nxt; + port = simple_strtoul(str, &nxt, 0); + if (nxt == str || *nxt != '=') { + while (*nxt && *nxt++ != ' ') /* skip to next, if any */ + ; + continue; + } + str = ++nxt; + val = simple_strtoul(str, &nxt, 0); + if (nxt == str) { + while (*nxt && *nxt++ != ' ') /* skip to next, if any */ + ; + continue; + } + if (val >= txdds_size) + continue; + seth1 = 0; + h1 = 0; /* gcc thinks it might be used uninitted */ + if (*nxt == ',' && nxt[1]) { + str = ++nxt; + h1 = (u32)simple_strtoul(str, &nxt, 0); + if (nxt == str) + while (*nxt && *nxt++ != ' ') /* skip */ + ; + else + seth1 = 1; + } + for (pidx = 0; dd->unit == unit && pidx < dd->num_pports; + ++pidx) { + struct qib_pportdata *ppd = &dd->pport[pidx]; + + if (ppd->port != port || !ppd->link_speed_supported) + continue; + ppd->cpspec->no_eep = val; + if (seth1) + ppd->cpspec->h1_val = h1; + /* now change the IBC and serdes, overriding generic */ + init_txdds_table(ppd, 1); + /* Re-enable the physical state machine on mezz boards + * now that the correct settings have been set. + * QSFP boards are handles by the QSFP event handler */ + if (IS_QMH(dd) || IS_QME(dd)) + qib_set_ib_7322_lstate(ppd, 0, + QLOGIC_IB_IBCC_LINKINITCMD_SLEEP); + any++; + } + if (*nxt == '\n') + break; /* done */ + } + if (change && !any) { + /* no specific setting, use the default. + * Change the IBC and serdes, but since it's + * general, don't override specific settings. + */ + for (pidx = 0; pidx < dd->num_pports; ++pidx) + if (dd->pport[pidx].link_speed_supported) + init_txdds_table(&dd->pport[pidx], 0); + } +} + +/* handle the txselect parameter changing */ +static int setup_txselect(const char *str, struct kernel_param *kp) +{ + struct qib_devdata *dd; + unsigned long val; + char *n; + if (strlen(str) >= MAX_ATTEN_LEN) { + pr_info("txselect_values string too long\n"); + return -ENOSPC; + } + val = simple_strtoul(str, &n, 0); + if (n == str || val >= (TXDDS_TABLE_SZ + TXDDS_EXTRA_SZ + + TXDDS_MFG_SZ)) { + pr_info("txselect_values must start with a number < %d\n", + TXDDS_TABLE_SZ + TXDDS_EXTRA_SZ + TXDDS_MFG_SZ); + return -EINVAL; + } + strcpy(txselect_list, str); + + list_for_each_entry(dd, &qib_dev_list, list) + if (dd->deviceid == PCI_DEVICE_ID_QLOGIC_IB_7322) + set_no_qsfp_atten(dd, 1); + return 0; +} + +/* + * Write the final few registers that depend on some of the + * init setup. Done late in init, just before bringing up + * the serdes. + */ +static int qib_late_7322_initreg(struct qib_devdata *dd) +{ + int ret = 0, n; + u64 val; + + qib_write_kreg(dd, kr_rcvhdrentsize, dd->rcvhdrentsize); + qib_write_kreg(dd, kr_rcvhdrsize, dd->rcvhdrsize); + qib_write_kreg(dd, kr_rcvhdrcnt, dd->rcvhdrcnt); + qib_write_kreg(dd, kr_sendpioavailaddr, dd->pioavailregs_phys); + val = qib_read_kreg64(dd, kr_sendpioavailaddr); + if (val != dd->pioavailregs_phys) { + qib_dev_err(dd, + "Catastrophic software error, SendPIOAvailAddr written as %lx, read back as %llx\n", + (unsigned long) dd->pioavailregs_phys, + (unsigned long long) val); + ret = -EINVAL; + } + + n = dd->piobcnt2k + dd->piobcnt4k + NUM_VL15_BUFS; + qib_7322_txchk_change(dd, 0, n, TXCHK_CHG_TYPE_KERN, NULL); + /* driver sends get pkey, lid, etc. checking also, to catch bugs */ + qib_7322_txchk_change(dd, 0, n, TXCHK_CHG_TYPE_ENAB1, NULL); + + qib_register_observer(dd, &sendctrl_0_observer); + qib_register_observer(dd, &sendctrl_1_observer); + + dd->control &= ~QLOGIC_IB_C_SDMAFETCHPRIOEN; + qib_write_kreg(dd, kr_control, dd->control); + /* + * Set SendDmaFetchPriority and init Tx params, including + * QSFP handler on boards that have QSFP. + * First set our default attenuation entry for cables that + * don't have valid attenuation. + */ + set_no_qsfp_atten(dd, 0); + for (n = 0; n < dd->num_pports; ++n) { + struct qib_pportdata *ppd = dd->pport + n; + + qib_write_kreg_port(ppd, krp_senddmaprioritythld, + sdma_fetch_prio & 0xf); + /* Initialize qsfp if present on board. */ + if (dd->flags & QIB_HAS_QSFP) + qib_init_7322_qsfp(ppd); + } + dd->control |= QLOGIC_IB_C_SDMAFETCHPRIOEN; + qib_write_kreg(dd, kr_control, dd->control); + + return ret; +} + +/* per IB port errors. */ +#define SENDCTRL_PIBP (MASK_ACROSS(0, 1) | MASK_ACROSS(3, 3) | \ + MASK_ACROSS(8, 15)) +#define RCVCTRL_PIBP (MASK_ACROSS(0, 17) | MASK_ACROSS(39, 41)) +#define ERRS_PIBP (MASK_ACROSS(57, 58) | MASK_ACROSS(54, 54) | \ + MASK_ACROSS(36, 49) | MASK_ACROSS(29, 34) | MASK_ACROSS(14, 17) | \ + MASK_ACROSS(0, 11)) + +/* + * Write the initialization per-port registers that need to be done at + * driver load and after reset completes (i.e., that aren't done as part + * of other init procedures called from qib_init.c). + * Some of these should be redundant on reset, but play safe. + */ +static void write_7322_init_portregs(struct qib_pportdata *ppd) +{ + u64 val; + int i; + + if (!ppd->link_speed_supported) { + /* no buffer credits for this port */ + for (i = 1; i < 8; i++) + qib_write_kreg_port(ppd, krp_rxcreditvl0 + i, 0); + qib_write_kreg_port(ppd, krp_ibcctrl_b, 0); + qib_write_kreg(ppd->dd, kr_scratch, 0); + return; + } + + /* + * Set the number of supported virtual lanes in IBC, + * for flow control packet handling on unsupported VLs + */ + val = qib_read_kreg_port(ppd, krp_ibsdtestiftx); + val &= ~SYM_MASK(IB_SDTEST_IF_TX_0, VL_CAP); + val |= (u64)(ppd->vls_supported - 1) << + SYM_LSB(IB_SDTEST_IF_TX_0, VL_CAP); + qib_write_kreg_port(ppd, krp_ibsdtestiftx, val); + + qib_write_kreg_port(ppd, krp_rcvbthqp, QIB_KD_QP); + + /* enable tx header checking */ + qib_write_kreg_port(ppd, krp_sendcheckcontrol, IBA7322_SENDCHK_PKEY | + IBA7322_SENDCHK_BTHQP | IBA7322_SENDCHK_SLID | + IBA7322_SENDCHK_RAW_IPV6 | IBA7322_SENDCHK_MINSZ); + + qib_write_kreg_port(ppd, krp_ncmodectrl, + SYM_MASK(IBNCModeCtrl_0, ScrambleCapLocal)); + + /* + * Unconditionally clear the bufmask bits. If SDMA is + * enabled, we'll set them appropriately later. + */ + qib_write_kreg_port(ppd, krp_senddmabufmask0, 0); + qib_write_kreg_port(ppd, krp_senddmabufmask1, 0); + qib_write_kreg_port(ppd, krp_senddmabufmask2, 0); + if (ppd->dd->cspec->r1) + ppd->p_sendctrl |= SYM_MASK(SendCtrl_0, ForceCreditUpToDate); +} + +/* + * Write the initialization per-device registers that need to be done at + * driver load and after reset completes (i.e., that aren't done as part + * of other init procedures called from qib_init.c). Also write per-port + * registers that are affected by overall device config, such as QP mapping + * Some of these should be redundant on reset, but play safe. + */ +static void write_7322_initregs(struct qib_devdata *dd) +{ + struct qib_pportdata *ppd; + int i, pidx; + u64 val; + + /* Set Multicast QPs received by port 2 to map to context one. */ + qib_write_kreg(dd, KREG_IDX(RcvQPMulticastContext_1), 1); + + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + unsigned n, regno; + unsigned long flags; + + if (dd->n_krcv_queues < 2 || + !dd->pport[pidx].link_speed_supported) + continue; + + ppd = &dd->pport[pidx]; + + /* be paranoid against later code motion, etc. */ + spin_lock_irqsave(&dd->cspec->rcvmod_lock, flags); + ppd->p_rcvctrl |= SYM_MASK(RcvCtrl_0, RcvQPMapEnable); + spin_unlock_irqrestore(&dd->cspec->rcvmod_lock, flags); + + /* Initialize QP to context mapping */ + regno = krp_rcvqpmaptable; + val = 0; + if (dd->num_pports > 1) + n = dd->first_user_ctxt / dd->num_pports; + else + n = dd->first_user_ctxt - 1; + for (i = 0; i < 32; ) { + unsigned ctxt; + + if (dd->num_pports > 1) + ctxt = (i % n) * dd->num_pports + pidx; + else if (i % n) + ctxt = (i % n) + 1; + else + ctxt = ppd->hw_pidx; + val |= ctxt << (5 * (i % 6)); + i++; + if (i % 6 == 0) { + qib_write_kreg_port(ppd, regno, val); + val = 0; + regno++; + } + } + qib_write_kreg_port(ppd, regno, val); + } + + /* + * Setup up interrupt mitigation for kernel contexts, but + * not user contexts (user contexts use interrupts when + * stalled waiting for any packet, so want those interrupts + * right away). + */ + for (i = 0; i < dd->first_user_ctxt; i++) { + dd->cspec->rcvavail_timeout[i] = rcv_int_timeout; + qib_write_kreg(dd, kr_rcvavailtimeout + i, rcv_int_timeout); + } + + /* + * Initialize as (disabled) rcvflow tables. Application code + * will setup each flow as it uses the flow. + * Doesn't clear any of the error bits that might be set. + */ + val = TIDFLOW_ERRBITS; /* these are W1C */ + for (i = 0; i < dd->cfgctxts; i++) { + int flow; + for (flow = 0; flow < NUM_TIDFLOWS_CTXT; flow++) + qib_write_ureg(dd, ur_rcvflowtable+flow, val, i); + } + + /* + * dual cards init to dual port recovery, single port cards to + * the one port. Dual port cards may later adjust to 1 port, + * and then back to dual port if both ports are connected + * */ + if (dd->num_pports) + setup_7322_link_recovery(dd->pport, dd->num_pports > 1); +} + +static int qib_init_7322_variables(struct qib_devdata *dd) +{ + struct qib_pportdata *ppd; + unsigned features, pidx, sbufcnt; + int ret, mtu; + u32 sbufs, updthresh; + + /* pport structs are contiguous, allocated after devdata */ + ppd = (struct qib_pportdata *)(dd + 1); + dd->pport = ppd; + ppd[0].dd = dd; + ppd[1].dd = dd; + + dd->cspec = (struct qib_chip_specific *)(ppd + 2); + + ppd[0].cpspec = (struct qib_chippport_specific *)(dd->cspec + 1); + ppd[1].cpspec = &ppd[0].cpspec[1]; + ppd[0].cpspec->ppd = &ppd[0]; /* for autoneg_7322_work() */ + ppd[1].cpspec->ppd = &ppd[1]; /* for autoneg_7322_work() */ + + spin_lock_init(&dd->cspec->rcvmod_lock); + spin_lock_init(&dd->cspec->gpio_lock); + + /* we haven't yet set QIB_PRESENT, so use read directly */ + dd->revision = readq(&dd->kregbase[kr_revision]); + + if ((dd->revision & 0xffffffffU) == 0xffffffffU) { + qib_dev_err(dd, + "Revision register read failure, giving up initialization\n"); + ret = -ENODEV; + goto bail; + } + dd->flags |= QIB_PRESENT; /* now register routines work */ + + dd->majrev = (u8) SYM_FIELD(dd->revision, Revision_R, ChipRevMajor); + dd->minrev = (u8) SYM_FIELD(dd->revision, Revision_R, ChipRevMinor); + dd->cspec->r1 = dd->minrev == 1; + + get_7322_chip_params(dd); + features = qib_7322_boardname(dd); + + /* now that piobcnt2k and 4k set, we can allocate these */ + sbufcnt = dd->piobcnt2k + dd->piobcnt4k + + NUM_VL15_BUFS + BITS_PER_LONG - 1; + sbufcnt /= BITS_PER_LONG; + dd->cspec->sendchkenable = kmalloc(sbufcnt * + sizeof(*dd->cspec->sendchkenable), GFP_KERNEL); + dd->cspec->sendgrhchk = kmalloc(sbufcnt * + sizeof(*dd->cspec->sendgrhchk), GFP_KERNEL); + dd->cspec->sendibchk = kmalloc(sbufcnt * + sizeof(*dd->cspec->sendibchk), GFP_KERNEL); + if (!dd->cspec->sendchkenable || !dd->cspec->sendgrhchk || + !dd->cspec->sendibchk) { + qib_dev_err(dd, "Failed allocation for hdrchk bitmaps\n"); + ret = -ENOMEM; + goto bail; + } + + ppd = dd->pport; + + /* + * GPIO bits for TWSI data and clock, + * used for serial EEPROM. + */ + dd->gpio_sda_num = _QIB_GPIO_SDA_NUM; + dd->gpio_scl_num = _QIB_GPIO_SCL_NUM; + dd->twsi_eeprom_dev = QIB_TWSI_EEPROM_DEV; + + dd->flags |= QIB_HAS_INTX | QIB_HAS_LINK_LATENCY | + QIB_NODMA_RTAIL | QIB_HAS_VLSUPP | QIB_HAS_HDRSUPP | + QIB_HAS_THRESH_UPDATE | + (sdma_idle_cnt ? QIB_HAS_SDMA_TIMEOUT : 0); + dd->flags |= qib_special_trigger ? + QIB_USE_SPCL_TRIG : QIB_HAS_SEND_DMA; + + /* + * Setup initial values. These may change when PAT is enabled, but + * we need these to do initial chip register accesses. + */ + qib_7322_set_baseaddrs(dd); + + mtu = ib_mtu_enum_to_int(qib_ibmtu); + if (mtu == -1) + mtu = QIB_DEFAULT_MTU; + + dd->cspec->int_enable_mask = QIB_I_BITSEXTANT; + /* all hwerrors become interrupts, unless special purposed */ + dd->cspec->hwerrmask = ~0ULL; + /* link_recovery setup causes these errors, so ignore them, + * other than clearing them when they occur */ + dd->cspec->hwerrmask &= + ~(SYM_MASK(HwErrMask, IBSerdesPClkNotDetectMask_0) | + SYM_MASK(HwErrMask, IBSerdesPClkNotDetectMask_1) | + HWE_MASK(LATriggered)); + + for (pidx = 0; pidx < NUM_IB_PORTS; ++pidx) { + struct qib_chippport_specific *cp = ppd->cpspec; + ppd->link_speed_supported = features & PORT_SPD_CAP; + features >>= PORT_SPD_CAP_SHIFT; + if (!ppd->link_speed_supported) { + /* single port mode (7340, or configured) */ + dd->skip_kctxt_mask |= 1 << pidx; + if (pidx == 0) { + /* Make sure port is disabled. */ + qib_write_kreg_port(ppd, krp_rcvctrl, 0); + qib_write_kreg_port(ppd, krp_ibcctrl_a, 0); + ppd[0] = ppd[1]; + dd->cspec->hwerrmask &= ~(SYM_MASK(HwErrMask, + IBSerdesPClkNotDetectMask_0) + | SYM_MASK(HwErrMask, + SDmaMemReadErrMask_0)); + dd->cspec->int_enable_mask &= ~( + SYM_MASK(IntMask, SDmaCleanupDoneMask_0) | + SYM_MASK(IntMask, SDmaIdleIntMask_0) | + SYM_MASK(IntMask, SDmaProgressIntMask_0) | + SYM_MASK(IntMask, SDmaIntMask_0) | + SYM_MASK(IntMask, ErrIntMask_0) | + SYM_MASK(IntMask, SendDoneIntMask_0)); + } else { + /* Make sure port is disabled. */ + qib_write_kreg_port(ppd, krp_rcvctrl, 0); + qib_write_kreg_port(ppd, krp_ibcctrl_a, 0); + dd->cspec->hwerrmask &= ~(SYM_MASK(HwErrMask, + IBSerdesPClkNotDetectMask_1) + | SYM_MASK(HwErrMask, + SDmaMemReadErrMask_1)); + dd->cspec->int_enable_mask &= ~( + SYM_MASK(IntMask, SDmaCleanupDoneMask_1) | + SYM_MASK(IntMask, SDmaIdleIntMask_1) | + SYM_MASK(IntMask, SDmaProgressIntMask_1) | + SYM_MASK(IntMask, SDmaIntMask_1) | + SYM_MASK(IntMask, ErrIntMask_1) | + SYM_MASK(IntMask, SendDoneIntMask_1)); + } + continue; + } + + dd->num_pports++; + ret = qib_init_pportdata(ppd, dd, pidx, dd->num_pports); + if (ret) { + dd->num_pports--; + goto bail; + } + + ppd->link_width_supported = IB_WIDTH_1X | IB_WIDTH_4X; + ppd->link_width_enabled = IB_WIDTH_4X; + ppd->link_speed_enabled = ppd->link_speed_supported; + /* + * Set the initial values to reasonable default, will be set + * for real when link is up. + */ + ppd->link_width_active = IB_WIDTH_4X; + ppd->link_speed_active = QIB_IB_SDR; + ppd->delay_mult = ib_rate_to_delay[IB_RATE_10_GBPS]; + switch (qib_num_cfg_vls) { + case 1: + ppd->vls_supported = IB_VL_VL0; + break; + case 2: + ppd->vls_supported = IB_VL_VL0_1; + break; + default: + qib_devinfo(dd->pcidev, + "Invalid num_vls %u, using 4 VLs\n", + qib_num_cfg_vls); + qib_num_cfg_vls = 4; + /* fall through */ + case 4: + ppd->vls_supported = IB_VL_VL0_3; + break; + case 8: + if (mtu <= 2048) + ppd->vls_supported = IB_VL_VL0_7; + else { + qib_devinfo(dd->pcidev, + "Invalid num_vls %u for MTU %d " + ", using 4 VLs\n", + qib_num_cfg_vls, mtu); + ppd->vls_supported = IB_VL_VL0_3; + qib_num_cfg_vls = 4; + } + break; + } + ppd->vls_operational = ppd->vls_supported; + + init_waitqueue_head(&cp->autoneg_wait); + INIT_DELAYED_WORK(&cp->autoneg_work, + autoneg_7322_work); + if (ppd->dd->cspec->r1) + INIT_DELAYED_WORK(&cp->ipg_work, ipg_7322_work); + + /* + * For Mez and similar cards, no qsfp info, so do + * the "cable info" setup here. Can be overridden + * in adapter-specific routines. + */ + if (!(dd->flags & QIB_HAS_QSFP)) { + if (!IS_QMH(dd) && !IS_QME(dd)) + qib_devinfo(dd->pcidev, + "IB%u:%u: Unknown mezzanine card type\n", + dd->unit, ppd->port); + cp->h1_val = IS_QMH(dd) ? H1_FORCE_QMH : H1_FORCE_QME; + /* + * Choose center value as default tx serdes setting + * until changed through module parameter. + */ + ppd->cpspec->no_eep = IS_QMH(dd) ? + TXDDS_TABLE_SZ + 2 : TXDDS_TABLE_SZ + 4; + } else + cp->h1_val = H1_FORCE_VAL; + + /* Avoid writes to chip for mini_init */ + if (!qib_mini_init) + write_7322_init_portregs(ppd); + + init_timer(&cp->chase_timer); + cp->chase_timer.function = reenable_chase; + cp->chase_timer.data = (unsigned long)ppd; + + ppd++; + } + + dd->rcvhdrentsize = qib_rcvhdrentsize ? + qib_rcvhdrentsize : QIB_RCVHDR_ENTSIZE; + dd->rcvhdrsize = qib_rcvhdrsize ? + qib_rcvhdrsize : QIB_DFLT_RCVHDRSIZE; + dd->rhf_offset = dd->rcvhdrentsize - sizeof(u64) / sizeof(u32); + + /* we always allocate at least 2048 bytes for eager buffers */ + dd->rcvegrbufsize = max(mtu, 2048); + BUG_ON(!is_power_of_2(dd->rcvegrbufsize)); + dd->rcvegrbufsize_shift = ilog2(dd->rcvegrbufsize); + + qib_7322_tidtemplate(dd); + + /* + * We can request a receive interrupt for 1 or + * more packets from current offset. + */ + dd->rhdrhead_intr_off = + (u64) rcv_int_count << IBA7322_HDRHEAD_PKTINT_SHIFT; + + /* setup the stats timer; the add_timer is done at end of init */ + init_timer(&dd->stats_timer); + dd->stats_timer.function = qib_get_7322_faststats; + dd->stats_timer.data = (unsigned long) dd; + + dd->ureg_align = 0x10000; /* 64KB alignment */ + + dd->piosize2kmax_dwords = dd->piosize2k >> 2; + + qib_7322_config_ctxts(dd); + qib_set_ctxtcnt(dd); + + if (qib_wc_pat) { + resource_size_t vl15off; + /* + * We do not set WC on the VL15 buffers to avoid + * a rare problem with unaligned writes from + * interrupt-flushed store buffers, so we need + * to map those separately here. We can't solve + * this for the rarely used mtrr case. + */ + ret = init_chip_wc_pat(dd, 0); + if (ret) + goto bail; + + /* vl15 buffers start just after the 4k buffers */ + vl15off = dd->physaddr + (dd->piobufbase >> 32) + + dd->piobcnt4k * dd->align4k; + dd->piovl15base = ioremap_nocache(vl15off, + NUM_VL15_BUFS * dd->align4k); + if (!dd->piovl15base) { + ret = -ENOMEM; + goto bail; + } + } + qib_7322_set_baseaddrs(dd); /* set chip access pointers now */ + + ret = 0; + if (qib_mini_init) + goto bail; + if (!dd->num_pports) { + qib_dev_err(dd, "No ports enabled, giving up initialization\n"); + goto bail; /* no error, so can still figure out why err */ + } + + write_7322_initregs(dd); + ret = qib_create_ctxts(dd); + init_7322_cntrnames(dd); + + updthresh = 8U; /* update threshold */ + + /* use all of 4KB buffers for the kernel SDMA, zero if !SDMA. + * reserve the update threshold amount for other kernel use, such + * as sending SMI, MAD, and ACKs, or 3, whichever is greater, + * unless we aren't enabling SDMA, in which case we want to use + * all the 4k bufs for the kernel. + * if this was less than the update threshold, we could wait + * a long time for an update. Coded this way because we + * sometimes change the update threshold for various reasons, + * and we want this to remain robust. + */ + if (dd->flags & QIB_HAS_SEND_DMA) { + dd->cspec->sdmabufcnt = dd->piobcnt4k; + sbufs = updthresh > 3 ? updthresh : 3; + } else { + dd->cspec->sdmabufcnt = 0; + sbufs = dd->piobcnt4k; + } + dd->cspec->lastbuf_for_pio = dd->piobcnt2k + dd->piobcnt4k - + dd->cspec->sdmabufcnt; + dd->lastctxt_piobuf = dd->cspec->lastbuf_for_pio - sbufs; + dd->cspec->lastbuf_for_pio--; /* range is <= , not < */ + dd->last_pio = dd->cspec->lastbuf_for_pio; + dd->pbufsctxt = (dd->cfgctxts > dd->first_user_ctxt) ? + dd->lastctxt_piobuf / (dd->cfgctxts - dd->first_user_ctxt) : 0; + + /* + * If we have 16 user contexts, we will have 7 sbufs + * per context, so reduce the update threshold to match. We + * want to update before we actually run out, at low pbufs/ctxt + * so give ourselves some margin. + */ + if (dd->pbufsctxt >= 2 && dd->pbufsctxt - 2 < updthresh) + updthresh = dd->pbufsctxt - 2; + dd->cspec->updthresh_dflt = updthresh; + dd->cspec->updthresh = updthresh; + + /* before full enable, no interrupts, no locking needed */ + dd->sendctrl |= ((updthresh & SYM_RMASK(SendCtrl, AvailUpdThld)) + << SYM_LSB(SendCtrl, AvailUpdThld)) | + SYM_MASK(SendCtrl, SendBufAvailPad64Byte); + + dd->psxmitwait_supported = 1; + dd->psxmitwait_check_rate = QIB_7322_PSXMITWAIT_CHECK_RATE; +bail: + if (!dd->ctxtcnt) + dd->ctxtcnt = 1; /* for other initialization code */ + + return ret; +} + +static u32 __iomem *qib_7322_getsendbuf(struct qib_pportdata *ppd, u64 pbc, + u32 *pbufnum) +{ + u32 first, last, plen = pbc & QIB_PBC_LENGTH_MASK; + struct qib_devdata *dd = ppd->dd; + + /* last is same for 2k and 4k, because we use 4k if all 2k busy */ + if (pbc & PBC_7322_VL15_SEND) { + first = dd->piobcnt2k + dd->piobcnt4k + ppd->hw_pidx; + last = first; + } else { + if ((plen + 1) > dd->piosize2kmax_dwords) + first = dd->piobcnt2k; + else + first = 0; + last = dd->cspec->lastbuf_for_pio; + } + return qib_getsendbuf_range(dd, pbufnum, first, last); +} + +static void qib_set_cntr_7322_sample(struct qib_pportdata *ppd, u32 intv, + u32 start) +{ + qib_write_kreg_port(ppd, krp_psinterval, intv); + qib_write_kreg_port(ppd, krp_psstart, start); +} + +/* + * Must be called with sdma_lock held, or before init finished. + */ +static void qib_sdma_set_7322_desc_cnt(struct qib_pportdata *ppd, unsigned cnt) +{ + qib_write_kreg_port(ppd, krp_senddmadesccnt, cnt); +} + +/* + * sdma_lock should be acquired before calling this routine + */ +static void dump_sdma_7322_state(struct qib_pportdata *ppd) +{ + u64 reg, reg1, reg2; + + reg = qib_read_kreg_port(ppd, krp_senddmastatus); + qib_dev_porterr(ppd->dd, ppd->port, + "SDMA senddmastatus: 0x%016llx\n", reg); + + reg = qib_read_kreg_port(ppd, krp_sendctrl); + qib_dev_porterr(ppd->dd, ppd->port, + "SDMA sendctrl: 0x%016llx\n", reg); + + reg = qib_read_kreg_port(ppd, krp_senddmabase); + qib_dev_porterr(ppd->dd, ppd->port, + "SDMA senddmabase: 0x%016llx\n", reg); + + reg = qib_read_kreg_port(ppd, krp_senddmabufmask0); + reg1 = qib_read_kreg_port(ppd, krp_senddmabufmask1); + reg2 = qib_read_kreg_port(ppd, krp_senddmabufmask2); + qib_dev_porterr(ppd->dd, ppd->port, + "SDMA senddmabufmask 0:%llx 1:%llx 2:%llx\n", + reg, reg1, reg2); + + /* get bufuse bits, clear them, and print them again if non-zero */ + reg = qib_read_kreg_port(ppd, krp_senddmabuf_use0); + qib_write_kreg_port(ppd, krp_senddmabuf_use0, reg); + reg1 = qib_read_kreg_port(ppd, krp_senddmabuf_use1); + qib_write_kreg_port(ppd, krp_senddmabuf_use0, reg1); + reg2 = qib_read_kreg_port(ppd, krp_senddmabuf_use2); + qib_write_kreg_port(ppd, krp_senddmabuf_use0, reg2); + /* 0 and 1 should always be zero, so print as short form */ + qib_dev_porterr(ppd->dd, ppd->port, + "SDMA current senddmabuf_use 0:%llx 1:%llx 2:%llx\n", + reg, reg1, reg2); + reg = qib_read_kreg_port(ppd, krp_senddmabuf_use0); + reg1 = qib_read_kreg_port(ppd, krp_senddmabuf_use1); + reg2 = qib_read_kreg_port(ppd, krp_senddmabuf_use2); + /* 0 and 1 should always be zero, so print as short form */ + qib_dev_porterr(ppd->dd, ppd->port, + "SDMA cleared senddmabuf_use 0:%llx 1:%llx 2:%llx\n", + reg, reg1, reg2); + + reg = qib_read_kreg_port(ppd, krp_senddmatail); + qib_dev_porterr(ppd->dd, ppd->port, + "SDMA senddmatail: 0x%016llx\n", reg); + + reg = qib_read_kreg_port(ppd, krp_senddmahead); + qib_dev_porterr(ppd->dd, ppd->port, + "SDMA senddmahead: 0x%016llx\n", reg); + + reg = qib_read_kreg_port(ppd, krp_senddmaheadaddr); + qib_dev_porterr(ppd->dd, ppd->port, + "SDMA senddmaheadaddr: 0x%016llx\n", reg); + + reg = qib_read_kreg_port(ppd, krp_senddmalengen); + qib_dev_porterr(ppd->dd, ppd->port, + "SDMA senddmalengen: 0x%016llx\n", reg); + + reg = qib_read_kreg_port(ppd, krp_senddmadesccnt); + qib_dev_porterr(ppd->dd, ppd->port, + "SDMA senddmadesccnt: 0x%016llx\n", reg); + + reg = qib_read_kreg_port(ppd, krp_senddmaidlecnt); + qib_dev_porterr(ppd->dd, ppd->port, + "SDMA senddmaidlecnt: 0x%016llx\n", reg); + + reg = qib_read_kreg_port(ppd, krp_senddmaprioritythld); + qib_dev_porterr(ppd->dd, ppd->port, + "SDMA senddmapriorityhld: 0x%016llx\n", reg); + + reg = qib_read_kreg_port(ppd, krp_senddmareloadcnt); + qib_dev_porterr(ppd->dd, ppd->port, + "SDMA senddmareloadcnt: 0x%016llx\n", reg); + + dump_sdma_state(ppd); +} + +static struct sdma_set_state_action sdma_7322_action_table[] = { + [qib_sdma_state_s00_hw_down] = { + .go_s99_running_tofalse = 1, + .op_enable = 0, + .op_intenable = 0, + .op_halt = 0, + .op_drain = 0, + }, + [qib_sdma_state_s10_hw_start_up_wait] = { + .op_enable = 0, + .op_intenable = 1, + .op_halt = 1, + .op_drain = 0, + }, + [qib_sdma_state_s20_idle] = { + .op_enable = 1, + .op_intenable = 1, + .op_halt = 1, + .op_drain = 0, + }, + [qib_sdma_state_s30_sw_clean_up_wait] = { + .op_enable = 0, + .op_intenable = 1, + .op_halt = 1, + .op_drain = 0, + }, + [qib_sdma_state_s40_hw_clean_up_wait] = { + .op_enable = 1, + .op_intenable = 1, + .op_halt = 1, + .op_drain = 0, + }, + [qib_sdma_state_s50_hw_halt_wait] = { + .op_enable = 1, + .op_intenable = 1, + .op_halt = 1, + .op_drain = 1, + }, + [qib_sdma_state_s99_running] = { + .op_enable = 1, + .op_intenable = 1, + .op_halt = 0, + .op_drain = 0, + .go_s99_running_totrue = 1, + }, +}; + +static void qib_7322_sdma_init_early(struct qib_pportdata *ppd) +{ + ppd->sdma_state.set_state_action = sdma_7322_action_table; +} + +static int init_sdma_7322_regs(struct qib_pportdata *ppd) +{ + struct qib_devdata *dd = ppd->dd; + unsigned lastbuf, erstbuf; + u64 senddmabufmask[3] = { 0 }; + int n, ret = 0; + + qib_write_kreg_port(ppd, krp_senddmabase, ppd->sdma_descq_phys); + qib_sdma_7322_setlengen(ppd); + qib_sdma_update_7322_tail(ppd, 0); /* Set SendDmaTail */ + qib_write_kreg_port(ppd, krp_senddmareloadcnt, sdma_idle_cnt); + qib_write_kreg_port(ppd, krp_senddmadesccnt, 0); + qib_write_kreg_port(ppd, krp_senddmaheadaddr, ppd->sdma_head_phys); + + if (dd->num_pports) + n = dd->cspec->sdmabufcnt / dd->num_pports; /* no remainder */ + else + n = dd->cspec->sdmabufcnt; /* failsafe for init */ + erstbuf = (dd->piobcnt2k + dd->piobcnt4k) - + ((dd->num_pports == 1 || ppd->port == 2) ? n : + dd->cspec->sdmabufcnt); + lastbuf = erstbuf + n; + + ppd->sdma_state.first_sendbuf = erstbuf; + ppd->sdma_state.last_sendbuf = lastbuf; + for (; erstbuf < lastbuf; ++erstbuf) { + unsigned word = erstbuf / BITS_PER_LONG; + unsigned bit = erstbuf & (BITS_PER_LONG - 1); + + BUG_ON(word >= 3); + senddmabufmask[word] |= 1ULL << bit; + } + qib_write_kreg_port(ppd, krp_senddmabufmask0, senddmabufmask[0]); + qib_write_kreg_port(ppd, krp_senddmabufmask1, senddmabufmask[1]); + qib_write_kreg_port(ppd, krp_senddmabufmask2, senddmabufmask[2]); + return ret; +} + +/* sdma_lock must be held */ +static u16 qib_sdma_7322_gethead(struct qib_pportdata *ppd) +{ + struct qib_devdata *dd = ppd->dd; + int sane; + int use_dmahead; + u16 swhead; + u16 swtail; + u16 cnt; + u16 hwhead; + + use_dmahead = __qib_sdma_running(ppd) && + (dd->flags & QIB_HAS_SDMA_TIMEOUT); +retry: + hwhead = use_dmahead ? + (u16) le64_to_cpu(*ppd->sdma_head_dma) : + (u16) qib_read_kreg_port(ppd, krp_senddmahead); + + swhead = ppd->sdma_descq_head; + swtail = ppd->sdma_descq_tail; + cnt = ppd->sdma_descq_cnt; + + if (swhead < swtail) + /* not wrapped */ + sane = (hwhead >= swhead) & (hwhead <= swtail); + else if (swhead > swtail) + /* wrapped around */ + sane = ((hwhead >= swhead) && (hwhead < cnt)) || + (hwhead <= swtail); + else + /* empty */ + sane = (hwhead == swhead); + + if (unlikely(!sane)) { + if (use_dmahead) { + /* try one more time, directly from the register */ + use_dmahead = 0; + goto retry; + } + /* proceed as if no progress */ + hwhead = swhead; + } + + return hwhead; +} + +static int qib_sdma_7322_busy(struct qib_pportdata *ppd) +{ + u64 hwstatus = qib_read_kreg_port(ppd, krp_senddmastatus); + + return (hwstatus & SYM_MASK(SendDmaStatus_0, ScoreBoardDrainInProg)) || + (hwstatus & SYM_MASK(SendDmaStatus_0, HaltInProg)) || + !(hwstatus & SYM_MASK(SendDmaStatus_0, InternalSDmaHalt)) || + !(hwstatus & SYM_MASK(SendDmaStatus_0, ScbEmpty)); +} + +/* + * Compute the amount of delay before sending the next packet if the + * port's send rate differs from the static rate set for the QP. + * The delay affects the next packet and the amount of the delay is + * based on the length of the this packet. + */ +static u32 qib_7322_setpbc_control(struct qib_pportdata *ppd, u32 plen, + u8 srate, u8 vl) +{ + u8 snd_mult = ppd->delay_mult; + u8 rcv_mult = ib_rate_to_delay[srate]; + u32 ret; + + ret = rcv_mult > snd_mult ? ((plen + 1) >> 1) * snd_mult : 0; + + /* Indicate VL15, else set the VL in the control word */ + if (vl == 15) + ret |= PBC_7322_VL15_SEND_CTRL; + else + ret |= vl << PBC_VL_NUM_LSB; + ret |= ((u32)(ppd->hw_pidx)) << PBC_PORT_SEL_LSB; + + return ret; +} + +/* + * Enable the per-port VL15 send buffers for use. + * They follow the rest of the buffers, without a config parameter. + * This was in initregs, but that is done before the shadow + * is set up, and this has to be done after the shadow is + * set up. + */ +static void qib_7322_initvl15_bufs(struct qib_devdata *dd) +{ + unsigned vl15bufs; + + vl15bufs = dd->piobcnt2k + dd->piobcnt4k; + qib_chg_pioavailkernel(dd, vl15bufs, NUM_VL15_BUFS, + TXCHK_CHG_TYPE_KERN, NULL); +} + +static void qib_7322_init_ctxt(struct qib_ctxtdata *rcd) +{ + if (rcd->ctxt < NUM_IB_PORTS) { + if (rcd->dd->num_pports > 1) { + rcd->rcvegrcnt = KCTXT0_EGRCNT / 2; + rcd->rcvegr_tid_base = rcd->ctxt ? rcd->rcvegrcnt : 0; + } else { + rcd->rcvegrcnt = KCTXT0_EGRCNT; + rcd->rcvegr_tid_base = 0; + } + } else { + rcd->rcvegrcnt = rcd->dd->cspec->rcvegrcnt; + rcd->rcvegr_tid_base = KCTXT0_EGRCNT + + (rcd->ctxt - NUM_IB_PORTS) * rcd->rcvegrcnt; + } +} + +#define QTXSLEEPS 5000 +static void qib_7322_txchk_change(struct qib_devdata *dd, u32 start, + u32 len, u32 which, struct qib_ctxtdata *rcd) +{ + int i; + const int last = start + len - 1; + const int lastr = last / BITS_PER_LONG; + u32 sleeps = 0; + int wait = rcd != NULL; + unsigned long flags; + + while (wait) { + unsigned long shadow; + int cstart, previ = -1; + + /* + * when flipping from kernel to user, we can't change + * the checking type if the buffer is allocated to the + * driver. It's OK the other direction, because it's + * from close, and we have just disarm'ed all the + * buffers. All the kernel to kernel changes are also + * OK. + */ + for (cstart = start; cstart <= last; cstart++) { + i = ((2 * cstart) + QLOGIC_IB_SENDPIOAVAIL_BUSY_SHIFT) + / BITS_PER_LONG; + if (i != previ) { + shadow = (unsigned long) + le64_to_cpu(dd->pioavailregs_dma[i]); + previ = i; + } + if (test_bit(((2 * cstart) + + QLOGIC_IB_SENDPIOAVAIL_BUSY_SHIFT) + % BITS_PER_LONG, &shadow)) + break; + } + + if (cstart > last) + break; + + if (sleeps == QTXSLEEPS) + break; + /* make sure we see an updated copy next time around */ + sendctrl_7322_mod(dd->pport, QIB_SENDCTRL_AVAIL_BLIP); + sleeps++; + msleep(20); + } + + switch (which) { + case TXCHK_CHG_TYPE_DIS1: + /* + * disable checking on a range; used by diags; just + * one buffer, but still written generically + */ + for (i = start; i <= last; i++) + clear_bit(i, dd->cspec->sendchkenable); + break; + + case TXCHK_CHG_TYPE_ENAB1: + /* + * (re)enable checking on a range; used by diags; just + * one buffer, but still written generically; read + * scratch to be sure buffer actually triggered, not + * just flushed from processor. + */ + qib_read_kreg32(dd, kr_scratch); + for (i = start; i <= last; i++) + set_bit(i, dd->cspec->sendchkenable); + break; + + case TXCHK_CHG_TYPE_KERN: + /* usable by kernel */ + for (i = start; i <= last; i++) { + set_bit(i, dd->cspec->sendibchk); + clear_bit(i, dd->cspec->sendgrhchk); + } + spin_lock_irqsave(&dd->uctxt_lock, flags); + /* see if we need to raise avail update threshold */ + for (i = dd->first_user_ctxt; + dd->cspec->updthresh != dd->cspec->updthresh_dflt + && i < dd->cfgctxts; i++) + if (dd->rcd[i] && dd->rcd[i]->subctxt_cnt && + ((dd->rcd[i]->piocnt / dd->rcd[i]->subctxt_cnt) - 1) + < dd->cspec->updthresh_dflt) + break; + spin_unlock_irqrestore(&dd->uctxt_lock, flags); + if (i == dd->cfgctxts) { + spin_lock_irqsave(&dd->sendctrl_lock, flags); + dd->cspec->updthresh = dd->cspec->updthresh_dflt; + dd->sendctrl &= ~SYM_MASK(SendCtrl, AvailUpdThld); + dd->sendctrl |= (dd->cspec->updthresh & + SYM_RMASK(SendCtrl, AvailUpdThld)) << + SYM_LSB(SendCtrl, AvailUpdThld); + spin_unlock_irqrestore(&dd->sendctrl_lock, flags); + sendctrl_7322_mod(dd->pport, QIB_SENDCTRL_AVAIL_BLIP); + } + break; + + case TXCHK_CHG_TYPE_USER: + /* for user process */ + for (i = start; i <= last; i++) { + clear_bit(i, dd->cspec->sendibchk); + set_bit(i, dd->cspec->sendgrhchk); + } + spin_lock_irqsave(&dd->sendctrl_lock, flags); + if (rcd && rcd->subctxt_cnt && ((rcd->piocnt + / rcd->subctxt_cnt) - 1) < dd->cspec->updthresh) { + dd->cspec->updthresh = (rcd->piocnt / + rcd->subctxt_cnt) - 1; + dd->sendctrl &= ~SYM_MASK(SendCtrl, AvailUpdThld); + dd->sendctrl |= (dd->cspec->updthresh & + SYM_RMASK(SendCtrl, AvailUpdThld)) + << SYM_LSB(SendCtrl, AvailUpdThld); + spin_unlock_irqrestore(&dd->sendctrl_lock, flags); + sendctrl_7322_mod(dd->pport, QIB_SENDCTRL_AVAIL_BLIP); + } else + spin_unlock_irqrestore(&dd->sendctrl_lock, flags); + break; + + default: + break; + } + + for (i = start / BITS_PER_LONG; which >= 2 && i <= lastr; ++i) + qib_write_kreg(dd, kr_sendcheckmask + i, + dd->cspec->sendchkenable[i]); + + for (i = start / BITS_PER_LONG; which < 2 && i <= lastr; ++i) { + qib_write_kreg(dd, kr_sendgrhcheckmask + i, + dd->cspec->sendgrhchk[i]); + qib_write_kreg(dd, kr_sendibpktmask + i, + dd->cspec->sendibchk[i]); + } + + /* + * Be sure whatever we did was seen by the chip and acted upon, + * before we return. Mostly important for which >= 2. + */ + qib_read_kreg32(dd, kr_scratch); +} + + +/* useful for trigger analyzers, etc. */ +static void writescratch(struct qib_devdata *dd, u32 val) +{ + qib_write_kreg(dd, kr_scratch, val); +} + +/* Dummy for now, use chip regs soon */ +static int qib_7322_tempsense_rd(struct qib_devdata *dd, int regnum) +{ + return -ENXIO; +} + +/** + * qib_init_iba7322_funcs - set up the chip-specific function pointers + * @dev: the pci_dev for qlogic_ib device + * @ent: pci_device_id struct for this dev + * + * Also allocates, inits, and returns the devdata struct for this + * device instance + * + * This is global, and is called directly at init to set up the + * chip-specific function pointers for later use. + */ +struct qib_devdata *qib_init_iba7322_funcs(struct pci_dev *pdev, + const struct pci_device_id *ent) +{ + struct qib_devdata *dd; + int ret, i; + u32 tabsize, actual_cnt = 0; + + dd = qib_alloc_devdata(pdev, + NUM_IB_PORTS * sizeof(struct qib_pportdata) + + sizeof(struct qib_chip_specific) + + NUM_IB_PORTS * sizeof(struct qib_chippport_specific)); + if (IS_ERR(dd)) + goto bail; + + dd->f_bringup_serdes = qib_7322_bringup_serdes; + dd->f_cleanup = qib_setup_7322_cleanup; + dd->f_clear_tids = qib_7322_clear_tids; + dd->f_free_irq = qib_7322_free_irq; + dd->f_get_base_info = qib_7322_get_base_info; + dd->f_get_msgheader = qib_7322_get_msgheader; + dd->f_getsendbuf = qib_7322_getsendbuf; + dd->f_gpio_mod = gpio_7322_mod; + dd->f_eeprom_wen = qib_7322_eeprom_wen; + dd->f_hdrqempty = qib_7322_hdrqempty; + dd->f_ib_updown = qib_7322_ib_updown; + dd->f_init_ctxt = qib_7322_init_ctxt; + dd->f_initvl15_bufs = qib_7322_initvl15_bufs; + dd->f_intr_fallback = qib_7322_intr_fallback; + dd->f_late_initreg = qib_late_7322_initreg; + dd->f_setpbc_control = qib_7322_setpbc_control; + dd->f_portcntr = qib_portcntr_7322; + dd->f_put_tid = qib_7322_put_tid; + dd->f_quiet_serdes = qib_7322_mini_quiet_serdes; + dd->f_rcvctrl = rcvctrl_7322_mod; + dd->f_read_cntrs = qib_read_7322cntrs; + dd->f_read_portcntrs = qib_read_7322portcntrs; + dd->f_reset = qib_do_7322_reset; + dd->f_init_sdma_regs = init_sdma_7322_regs; + dd->f_sdma_busy = qib_sdma_7322_busy; + dd->f_sdma_gethead = qib_sdma_7322_gethead; + dd->f_sdma_sendctrl = qib_7322_sdma_sendctrl; + dd->f_sdma_set_desc_cnt = qib_sdma_set_7322_desc_cnt; + dd->f_sdma_update_tail = qib_sdma_update_7322_tail; + dd->f_sendctrl = sendctrl_7322_mod; + dd->f_set_armlaunch = qib_set_7322_armlaunch; + dd->f_set_cntr_sample = qib_set_cntr_7322_sample; + dd->f_iblink_state = qib_7322_iblink_state; + dd->f_ibphys_portstate = qib_7322_phys_portstate; + dd->f_get_ib_cfg = qib_7322_get_ib_cfg; + dd->f_set_ib_cfg = qib_7322_set_ib_cfg; + dd->f_set_ib_loopback = qib_7322_set_loopback; + dd->f_get_ib_table = qib_7322_get_ib_table; + dd->f_set_ib_table = qib_7322_set_ib_table; + dd->f_set_intr_state = qib_7322_set_intr_state; + dd->f_setextled = qib_setup_7322_setextled; + dd->f_txchk_change = qib_7322_txchk_change; + dd->f_update_usrhead = qib_update_7322_usrhead; + dd->f_wantpiobuf_intr = qib_wantpiobuf_7322_intr; + dd->f_xgxs_reset = qib_7322_mini_pcs_reset; + dd->f_sdma_hw_clean_up = qib_7322_sdma_hw_clean_up; + dd->f_sdma_hw_start_up = qib_7322_sdma_hw_start_up; + dd->f_sdma_init_early = qib_7322_sdma_init_early; + dd->f_writescratch = writescratch; + dd->f_tempsense_rd = qib_7322_tempsense_rd; +#ifdef CONFIG_INFINIBAND_QIB_DCA + dd->f_notify_dca = qib_7322_notify_dca; +#endif + /* + * Do remaining PCIe setup and save PCIe values in dd. + * Any error printing is already done by the init code. + * On return, we have the chip mapped, but chip registers + * are not set up until start of qib_init_7322_variables. + */ + ret = qib_pcie_ddinit(dd, pdev, ent); + if (ret < 0) + goto bail_free; + + /* initialize chip-specific variables */ + ret = qib_init_7322_variables(dd); + if (ret) + goto bail_cleanup; + + if (qib_mini_init || !dd->num_pports) + goto bail; + + /* + * Determine number of vectors we want; depends on port count + * and number of configured kernel receive queues actually used. + * Should also depend on whether sdma is enabled or not, but + * that's such a rare testing case it's not worth worrying about. + */ + tabsize = dd->first_user_ctxt + ARRAY_SIZE(irq_table); + for (i = 0; i < tabsize; i++) + if ((i < ARRAY_SIZE(irq_table) && + irq_table[i].port <= dd->num_pports) || + (i >= ARRAY_SIZE(irq_table) && + dd->rcd[i - ARRAY_SIZE(irq_table)])) + actual_cnt++; + /* reduce by ctxt's < 2 */ + if (qib_krcvq01_no_msi) + actual_cnt -= dd->num_pports; + + tabsize = actual_cnt; + dd->cspec->msix_entries = kzalloc(tabsize * + sizeof(struct qib_msix_entry), GFP_KERNEL); + if (!dd->cspec->msix_entries) { + qib_dev_err(dd, "No memory for MSIx table\n"); + tabsize = 0; + } + for (i = 0; i < tabsize; i++) + dd->cspec->msix_entries[i].msix.entry = i; + + if (qib_pcie_params(dd, 8, &tabsize, dd->cspec->msix_entries)) + qib_dev_err(dd, + "Failed to setup PCIe or interrupts; continuing anyway\n"); + /* may be less than we wanted, if not enough available */ + dd->cspec->num_msix_entries = tabsize; + + /* setup interrupt handler */ + qib_setup_7322_interrupt(dd, 1); + + /* clear diagctrl register, in case diags were running and crashed */ + qib_write_kreg(dd, kr_hwdiagctrl, 0); +#ifdef CONFIG_INFINIBAND_QIB_DCA + if (!dca_add_requester(&pdev->dev)) { + qib_devinfo(dd->pcidev, "DCA enabled\n"); + dd->flags |= QIB_DCA_ENABLED; + qib_setup_dca(dd); + } +#endif + goto bail; + +bail_cleanup: + qib_pcie_ddcleanup(dd); +bail_free: + qib_free_devdata(dd); + dd = ERR_PTR(ret); +bail: + return dd; +} + +/* + * Set the table entry at the specified index from the table specifed. + * There are 3 * TXDDS_TABLE_SZ entries in all per port, with the first + * TXDDS_TABLE_SZ for SDR, the next for DDR, and the last for QDR. + * 'idx' below addresses the correct entry, while its 4 LSBs select the + * corresponding entry (one of TXDDS_TABLE_SZ) from the selected table. + */ +#define DDS_ENT_AMP_LSB 14 +#define DDS_ENT_MAIN_LSB 9 +#define DDS_ENT_POST_LSB 5 +#define DDS_ENT_PRE_XTRA_LSB 3 +#define DDS_ENT_PRE_LSB 0 + +/* + * Set one entry in the TxDDS table for spec'd port + * ridx picks one of the entries, while tp points + * to the appropriate table entry. + */ +static void set_txdds(struct qib_pportdata *ppd, int ridx, + const struct txdds_ent *tp) +{ + struct qib_devdata *dd = ppd->dd; + u32 pack_ent; + int regidx; + + /* Get correct offset in chip-space, and in source table */ + regidx = KREG_IBPORT_IDX(IBSD_DDS_MAP_TABLE) + ridx; + /* + * We do not use qib_write_kreg_port() because it was intended + * only for registers in the lower "port specific" pages. + * So do index calculation by hand. + */ + if (ppd->hw_pidx) + regidx += (dd->palign / sizeof(u64)); + + pack_ent = tp->amp << DDS_ENT_AMP_LSB; + pack_ent |= tp->main << DDS_ENT_MAIN_LSB; + pack_ent |= tp->pre << DDS_ENT_PRE_LSB; + pack_ent |= tp->post << DDS_ENT_POST_LSB; + qib_write_kreg(dd, regidx, pack_ent); + /* Prevent back-to-back writes by hitting scratch */ + qib_write_kreg(ppd->dd, kr_scratch, 0); +} + +static const struct vendor_txdds_ent vendor_txdds[] = { + { /* Amphenol 1m 30awg NoEq */ + { 0x41, 0x50, 0x48 }, "584470002 ", + { 10, 0, 0, 5 }, { 10, 0, 0, 9 }, { 7, 1, 0, 13 }, + }, + { /* Amphenol 3m 28awg NoEq */ + { 0x41, 0x50, 0x48 }, "584470004 ", + { 0, 0, 0, 8 }, { 0, 0, 0, 11 }, { 0, 1, 7, 15 }, + }, + { /* Finisar 3m OM2 Optical */ + { 0x00, 0x90, 0x65 }, "FCBG410QB1C03-QL", + { 0, 0, 0, 3 }, { 0, 0, 0, 4 }, { 0, 0, 0, 13 }, + }, + { /* Finisar 30m OM2 Optical */ + { 0x00, 0x90, 0x65 }, "FCBG410QB1C30-QL", + { 0, 0, 0, 1 }, { 0, 0, 0, 5 }, { 0, 0, 0, 11 }, + }, + { /* Finisar Default OM2 Optical */ + { 0x00, 0x90, 0x65 }, NULL, + { 0, 0, 0, 2 }, { 0, 0, 0, 5 }, { 0, 0, 0, 12 }, + }, + { /* Gore 1m 30awg NoEq */ + { 0x00, 0x21, 0x77 }, "QSN3300-1 ", + { 0, 0, 0, 6 }, { 0, 0, 0, 9 }, { 0, 1, 0, 15 }, + }, + { /* Gore 2m 30awg NoEq */ + { 0x00, 0x21, 0x77 }, "QSN3300-2 ", + { 0, 0, 0, 8 }, { 0, 0, 0, 10 }, { 0, 1, 7, 15 }, + }, + { /* Gore 1m 28awg NoEq */ + { 0x00, 0x21, 0x77 }, "QSN3800-1 ", + { 0, 0, 0, 6 }, { 0, 0, 0, 8 }, { 0, 1, 0, 15 }, + }, + { /* Gore 3m 28awg NoEq */ + { 0x00, 0x21, 0x77 }, "QSN3800-3 ", + { 0, 0, 0, 9 }, { 0, 0, 0, 13 }, { 0, 1, 7, 15 }, + }, + { /* Gore 5m 24awg Eq */ + { 0x00, 0x21, 0x77 }, "QSN7000-5 ", + { 0, 0, 0, 7 }, { 0, 0, 0, 9 }, { 0, 1, 3, 15 }, + }, + { /* Gore 7m 24awg Eq */ + { 0x00, 0x21, 0x77 }, "QSN7000-7 ", + { 0, 0, 0, 9 }, { 0, 0, 0, 11 }, { 0, 2, 6, 15 }, + }, + { /* Gore 5m 26awg Eq */ + { 0x00, 0x21, 0x77 }, "QSN7600-5 ", + { 0, 0, 0, 8 }, { 0, 0, 0, 11 }, { 0, 1, 9, 13 }, + }, + { /* Gore 7m 26awg Eq */ + { 0x00, 0x21, 0x77 }, "QSN7600-7 ", + { 0, 0, 0, 8 }, { 0, 0, 0, 11 }, { 10, 1, 8, 15 }, + }, + { /* Intersil 12m 24awg Active */ + { 0x00, 0x30, 0xB4 }, "QLX4000CQSFP1224", + { 0, 0, 0, 2 }, { 0, 0, 0, 5 }, { 0, 3, 0, 9 }, + }, + { /* Intersil 10m 28awg Active */ + { 0x00, 0x30, 0xB4 }, "QLX4000CQSFP1028", + { 0, 0, 0, 6 }, { 0, 0, 0, 4 }, { 0, 2, 0, 2 }, + }, + { /* Intersil 7m 30awg Active */ + { 0x00, 0x30, 0xB4 }, "QLX4000CQSFP0730", + { 0, 0, 0, 6 }, { 0, 0, 0, 4 }, { 0, 1, 0, 3 }, + }, + { /* Intersil 5m 32awg Active */ + { 0x00, 0x30, 0xB4 }, "QLX4000CQSFP0532", + { 0, 0, 0, 6 }, { 0, 0, 0, 6 }, { 0, 2, 0, 8 }, + }, + { /* Intersil Default Active */ + { 0x00, 0x30, 0xB4 }, NULL, + { 0, 0, 0, 6 }, { 0, 0, 0, 5 }, { 0, 2, 0, 5 }, + }, + { /* Luxtera 20m Active Optical */ + { 0x00, 0x25, 0x63 }, NULL, + { 0, 0, 0, 5 }, { 0, 0, 0, 8 }, { 0, 2, 0, 12 }, + }, + { /* Molex 1M Cu loopback */ + { 0x00, 0x09, 0x3A }, "74763-0025 ", + { 2, 2, 6, 15 }, { 2, 2, 6, 15 }, { 2, 2, 6, 15 }, + }, + { /* Molex 2m 28awg NoEq */ + { 0x00, 0x09, 0x3A }, "74757-2201 ", + { 0, 0, 0, 6 }, { 0, 0, 0, 9 }, { 0, 1, 1, 15 }, + }, +}; + +static const struct txdds_ent txdds_sdr[TXDDS_TABLE_SZ] = { + /* amp, pre, main, post */ + { 2, 2, 15, 6 }, /* Loopback */ + { 0, 0, 0, 1 }, /* 2 dB */ + { 0, 0, 0, 2 }, /* 3 dB */ + { 0, 0, 0, 3 }, /* 4 dB */ + { 0, 0, 0, 4 }, /* 5 dB */ + { 0, 0, 0, 5 }, /* 6 dB */ + { 0, 0, 0, 6 }, /* 7 dB */ + { 0, 0, 0, 7 }, /* 8 dB */ + { 0, 0, 0, 8 }, /* 9 dB */ + { 0, 0, 0, 9 }, /* 10 dB */ + { 0, 0, 0, 10 }, /* 11 dB */ + { 0, 0, 0, 11 }, /* 12 dB */ + { 0, 0, 0, 12 }, /* 13 dB */ + { 0, 0, 0, 13 }, /* 14 dB */ + { 0, 0, 0, 14 }, /* 15 dB */ + { 0, 0, 0, 15 }, /* 16 dB */ +}; + +static const struct txdds_ent txdds_ddr[TXDDS_TABLE_SZ] = { + /* amp, pre, main, post */ + { 2, 2, 15, 6 }, /* Loopback */ + { 0, 0, 0, 8 }, /* 2 dB */ + { 0, 0, 0, 8 }, /* 3 dB */ + { 0, 0, 0, 9 }, /* 4 dB */ + { 0, 0, 0, 9 }, /* 5 dB */ + { 0, 0, 0, 10 }, /* 6 dB */ + { 0, 0, 0, 10 }, /* 7 dB */ + { 0, 0, 0, 11 }, /* 8 dB */ + { 0, 0, 0, 11 }, /* 9 dB */ + { 0, 0, 0, 12 }, /* 10 dB */ + { 0, 0, 0, 12 }, /* 11 dB */ + { 0, 0, 0, 13 }, /* 12 dB */ + { 0, 0, 0, 13 }, /* 13 dB */ + { 0, 0, 0, 14 }, /* 14 dB */ + { 0, 0, 0, 14 }, /* 15 dB */ + { 0, 0, 0, 15 }, /* 16 dB */ +}; + +static const struct txdds_ent txdds_qdr[TXDDS_TABLE_SZ] = { + /* amp, pre, main, post */ + { 2, 2, 15, 6 }, /* Loopback */ + { 0, 1, 0, 7 }, /* 2 dB (also QMH7342) */ + { 0, 1, 0, 9 }, /* 3 dB (also QMH7342) */ + { 0, 1, 0, 11 }, /* 4 dB */ + { 0, 1, 0, 13 }, /* 5 dB */ + { 0, 1, 0, 15 }, /* 6 dB */ + { 0, 1, 3, 15 }, /* 7 dB */ + { 0, 1, 7, 15 }, /* 8 dB */ + { 0, 1, 7, 15 }, /* 9 dB */ + { 0, 1, 8, 15 }, /* 10 dB */ + { 0, 1, 9, 15 }, /* 11 dB */ + { 0, 1, 10, 15 }, /* 12 dB */ + { 0, 2, 6, 15 }, /* 13 dB */ + { 0, 2, 7, 15 }, /* 14 dB */ + { 0, 2, 8, 15 }, /* 15 dB */ + { 0, 2, 9, 15 }, /* 16 dB */ +}; + +/* + * extra entries for use with txselect, for indices >= TXDDS_TABLE_SZ. + * These are mostly used for mez cards going through connectors + * and backplane traces, but can be used to add other "unusual" + * table values as well. + */ +static const struct txdds_ent txdds_extra_sdr[TXDDS_EXTRA_SZ] = { + /* amp, pre, main, post */ + { 0, 0, 0, 1 }, /* QMH7342 backplane settings */ + { 0, 0, 0, 1 }, /* QMH7342 backplane settings */ + { 0, 0, 0, 2 }, /* QMH7342 backplane settings */ + { 0, 0, 0, 2 }, /* QMH7342 backplane settings */ + { 0, 0, 0, 3 }, /* QMH7342 backplane settings */ + { 0, 0, 0, 4 }, /* QMH7342 backplane settings */ + { 0, 1, 4, 15 }, /* QME7342 backplane settings 1.0 */ + { 0, 1, 3, 15 }, /* QME7342 backplane settings 1.0 */ + { 0, 1, 0, 12 }, /* QME7342 backplane settings 1.0 */ + { 0, 1, 0, 11 }, /* QME7342 backplane settings 1.0 */ + { 0, 1, 0, 9 }, /* QME7342 backplane settings 1.0 */ + { 0, 1, 0, 14 }, /* QME7342 backplane settings 1.0 */ + { 0, 1, 2, 15 }, /* QME7342 backplane settings 1.0 */ + { 0, 1, 0, 11 }, /* QME7342 backplane settings 1.1 */ + { 0, 1, 0, 7 }, /* QME7342 backplane settings 1.1 */ + { 0, 1, 0, 9 }, /* QME7342 backplane settings 1.1 */ + { 0, 1, 0, 6 }, /* QME7342 backplane settings 1.1 */ + { 0, 1, 0, 8 }, /* QME7342 backplane settings 1.1 */ +}; + +static const struct txdds_ent txdds_extra_ddr[TXDDS_EXTRA_SZ] = { + /* amp, pre, main, post */ + { 0, 0, 0, 7 }, /* QMH7342 backplane settings */ + { 0, 0, 0, 7 }, /* QMH7342 backplane settings */ + { 0, 0, 0, 8 }, /* QMH7342 backplane settings */ + { 0, 0, 0, 8 }, /* QMH7342 backplane settings */ + { 0, 0, 0, 9 }, /* QMH7342 backplane settings */ + { 0, 0, 0, 10 }, /* QMH7342 backplane settings */ + { 0, 1, 4, 15 }, /* QME7342 backplane settings 1.0 */ + { 0, 1, 3, 15 }, /* QME7342 backplane settings 1.0 */ + { 0, 1, 0, 12 }, /* QME7342 backplane settings 1.0 */ + { 0, 1, 0, 11 }, /* QME7342 backplane settings 1.0 */ + { 0, 1, 0, 9 }, /* QME7342 backplane settings 1.0 */ + { 0, 1, 0, 14 }, /* QME7342 backplane settings 1.0 */ + { 0, 1, 2, 15 }, /* QME7342 backplane settings 1.0 */ + { 0, 1, 0, 11 }, /* QME7342 backplane settings 1.1 */ + { 0, 1, 0, 7 }, /* QME7342 backplane settings 1.1 */ + { 0, 1, 0, 9 }, /* QME7342 backplane settings 1.1 */ + { 0, 1, 0, 6 }, /* QME7342 backplane settings 1.1 */ + { 0, 1, 0, 8 }, /* QME7342 backplane settings 1.1 */ +}; + +static const struct txdds_ent txdds_extra_qdr[TXDDS_EXTRA_SZ] = { + /* amp, pre, main, post */ + { 0, 1, 0, 4 }, /* QMH7342 backplane settings */ + { 0, 1, 0, 5 }, /* QMH7342 backplane settings */ + { 0, 1, 0, 6 }, /* QMH7342 backplane settings */ + { 0, 1, 0, 8 }, /* QMH7342 backplane settings */ + { 0, 1, 0, 10 }, /* QMH7342 backplane settings */ + { 0, 1, 0, 12 }, /* QMH7342 backplane settings */ + { 0, 1, 4, 15 }, /* QME7342 backplane settings 1.0 */ + { 0, 1, 3, 15 }, /* QME7342 backplane settings 1.0 */ + { 0, 1, 0, 12 }, /* QME7342 backplane settings 1.0 */ + { 0, 1, 0, 11 }, /* QME7342 backplane settings 1.0 */ + { 0, 1, 0, 9 }, /* QME7342 backplane settings 1.0 */ + { 0, 1, 0, 14 }, /* QME7342 backplane settings 1.0 */ + { 0, 1, 2, 15 }, /* QME7342 backplane settings 1.0 */ + { 0, 1, 0, 11 }, /* QME7342 backplane settings 1.1 */ + { 0, 1, 0, 7 }, /* QME7342 backplane settings 1.1 */ + { 0, 1, 0, 9 }, /* QME7342 backplane settings 1.1 */ + { 0, 1, 0, 6 }, /* QME7342 backplane settings 1.1 */ + { 0, 1, 0, 8 }, /* QME7342 backplane settings 1.1 */ +}; + +static const struct txdds_ent txdds_extra_mfg[TXDDS_MFG_SZ] = { + /* amp, pre, main, post */ + { 0, 0, 0, 0 }, /* QME7342 mfg settings */ + { 0, 0, 0, 6 }, /* QME7342 P2 mfg settings */ +}; + +static const struct txdds_ent *get_atten_table(const struct txdds_ent *txdds, + unsigned atten) +{ + /* + * The attenuation table starts at 2dB for entry 1, + * with entry 0 being the loopback entry. + */ + if (atten <= 2) + atten = 1; + else if (atten > TXDDS_TABLE_SZ) + atten = TXDDS_TABLE_SZ - 1; + else + atten--; + return txdds + atten; +} + +/* + * if override is set, the module parameter txselect has a value + * for this specific port, so use it, rather than our normal mechanism. + */ +static void find_best_ent(struct qib_pportdata *ppd, + const struct txdds_ent **sdr_dds, + const struct txdds_ent **ddr_dds, + const struct txdds_ent **qdr_dds, int override) +{ + struct qib_qsfp_cache *qd = &ppd->cpspec->qsfp_data.cache; + int idx; + + /* Search table of known cables */ + for (idx = 0; !override && idx < ARRAY_SIZE(vendor_txdds); ++idx) { + const struct vendor_txdds_ent *v = vendor_txdds + idx; + + if (!memcmp(v->oui, qd->oui, QSFP_VOUI_LEN) && + (!v->partnum || + !memcmp(v->partnum, qd->partnum, QSFP_PN_LEN))) { + *sdr_dds = &v->sdr; + *ddr_dds = &v->ddr; + *qdr_dds = &v->qdr; + return; + } + } + + /* Active cables don't have attenuation so we only set SERDES + * settings to account for the attenuation of the board traces. */ + if (!override && QSFP_IS_ACTIVE(qd->tech)) { + *sdr_dds = txdds_sdr + ppd->dd->board_atten; + *ddr_dds = txdds_ddr + ppd->dd->board_atten; + *qdr_dds = txdds_qdr + ppd->dd->board_atten; + return; + } + + if (!override && QSFP_HAS_ATTEN(qd->tech) && (qd->atten[0] || + qd->atten[1])) { + *sdr_dds = get_atten_table(txdds_sdr, qd->atten[0]); + *ddr_dds = get_atten_table(txdds_ddr, qd->atten[0]); + *qdr_dds = get_atten_table(txdds_qdr, qd->atten[1]); + return; + } else if (ppd->cpspec->no_eep < TXDDS_TABLE_SZ) { + /* + * If we have no (or incomplete) data from the cable + * EEPROM, or no QSFP, or override is set, use the + * module parameter value to index into the attentuation + * table. + */ + idx = ppd->cpspec->no_eep; + *sdr_dds = &txdds_sdr[idx]; + *ddr_dds = &txdds_ddr[idx]; + *qdr_dds = &txdds_qdr[idx]; + } else if (ppd->cpspec->no_eep < (TXDDS_TABLE_SZ + TXDDS_EXTRA_SZ)) { + /* similar to above, but index into the "extra" table. */ + idx = ppd->cpspec->no_eep - TXDDS_TABLE_SZ; + *sdr_dds = &txdds_extra_sdr[idx]; + *ddr_dds = &txdds_extra_ddr[idx]; + *qdr_dds = &txdds_extra_qdr[idx]; + } else if ((IS_QME(ppd->dd) || IS_QMH(ppd->dd)) && + ppd->cpspec->no_eep < (TXDDS_TABLE_SZ + TXDDS_EXTRA_SZ + + TXDDS_MFG_SZ)) { + idx = ppd->cpspec->no_eep - (TXDDS_TABLE_SZ + TXDDS_EXTRA_SZ); + pr_info("IB%u:%u use idx %u into txdds_mfg\n", + ppd->dd->unit, ppd->port, idx); + *sdr_dds = &txdds_extra_mfg[idx]; + *ddr_dds = &txdds_extra_mfg[idx]; + *qdr_dds = &txdds_extra_mfg[idx]; + } else { + /* this shouldn't happen, it's range checked */ + *sdr_dds = txdds_sdr + qib_long_atten; + *ddr_dds = txdds_ddr + qib_long_atten; + *qdr_dds = txdds_qdr + qib_long_atten; + } +} + +static void init_txdds_table(struct qib_pportdata *ppd, int override) +{ + const struct txdds_ent *sdr_dds, *ddr_dds, *qdr_dds; + struct txdds_ent *dds; + int idx; + int single_ent = 0; + + find_best_ent(ppd, &sdr_dds, &ddr_dds, &qdr_dds, override); + + /* for mez cards or override, use the selected value for all entries */ + if (!(ppd->dd->flags & QIB_HAS_QSFP) || override) + single_ent = 1; + + /* Fill in the first entry with the best entry found. */ + set_txdds(ppd, 0, sdr_dds); + set_txdds(ppd, TXDDS_TABLE_SZ, ddr_dds); + set_txdds(ppd, 2 * TXDDS_TABLE_SZ, qdr_dds); + if (ppd->lflags & (QIBL_LINKINIT | QIBL_LINKARMED | + QIBL_LINKACTIVE)) { + dds = (struct txdds_ent *)(ppd->link_speed_active == + QIB_IB_QDR ? qdr_dds : + (ppd->link_speed_active == + QIB_IB_DDR ? ddr_dds : sdr_dds)); + write_tx_serdes_param(ppd, dds); + } + + /* Fill in the remaining entries with the default table values. */ + for (idx = 1; idx < ARRAY_SIZE(txdds_sdr); ++idx) { + set_txdds(ppd, idx, single_ent ? sdr_dds : txdds_sdr + idx); + set_txdds(ppd, idx + TXDDS_TABLE_SZ, + single_ent ? ddr_dds : txdds_ddr + idx); + set_txdds(ppd, idx + 2 * TXDDS_TABLE_SZ, + single_ent ? qdr_dds : txdds_qdr + idx); + } +} + +#define KR_AHB_ACC KREG_IDX(ahb_access_ctrl) +#define KR_AHB_TRANS KREG_IDX(ahb_transaction_reg) +#define AHB_TRANS_RDY SYM_MASK(ahb_transaction_reg, ahb_rdy) +#define AHB_ADDR_LSB SYM_LSB(ahb_transaction_reg, ahb_address) +#define AHB_DATA_LSB SYM_LSB(ahb_transaction_reg, ahb_data) +#define AHB_WR SYM_MASK(ahb_transaction_reg, write_not_read) +#define AHB_TRANS_TRIES 10 + +/* + * The chan argument is 0=chan0, 1=chan1, 2=pll, 3=chan2, 4=chan4, + * 5=subsystem which is why most calls have "chan + chan >> 1" + * for the channel argument. + */ +static u32 ahb_mod(struct qib_devdata *dd, int quad, int chan, int addr, + u32 data, u32 mask) +{ + u32 rd_data, wr_data, sz_mask; + u64 trans, acc, prev_acc; + u32 ret = 0xBAD0BAD; + int tries; + + prev_acc = qib_read_kreg64(dd, KR_AHB_ACC); + /* From this point on, make sure we return access */ + acc = (quad << 1) | 1; + qib_write_kreg(dd, KR_AHB_ACC, acc); + + for (tries = 1; tries < AHB_TRANS_TRIES; ++tries) { + trans = qib_read_kreg64(dd, KR_AHB_TRANS); + if (trans & AHB_TRANS_RDY) + break; + } + if (tries >= AHB_TRANS_TRIES) { + qib_dev_err(dd, "No ahb_rdy in %d tries\n", AHB_TRANS_TRIES); + goto bail; + } + + /* If mask is not all 1s, we need to read, but different SerDes + * entities have different sizes + */ + sz_mask = (1UL << ((quad == 1) ? 32 : 16)) - 1; + wr_data = data & mask & sz_mask; + if ((~mask & sz_mask) != 0) { + trans = ((chan << 6) | addr) << (AHB_ADDR_LSB + 1); + qib_write_kreg(dd, KR_AHB_TRANS, trans); + + for (tries = 1; tries < AHB_TRANS_TRIES; ++tries) { + trans = qib_read_kreg64(dd, KR_AHB_TRANS); + if (trans & AHB_TRANS_RDY) + break; + } + if (tries >= AHB_TRANS_TRIES) { + qib_dev_err(dd, "No Rd ahb_rdy in %d tries\n", + AHB_TRANS_TRIES); + goto bail; + } + /* Re-read in case host split reads and read data first */ + trans = qib_read_kreg64(dd, KR_AHB_TRANS); + rd_data = (uint32_t)(trans >> AHB_DATA_LSB); + wr_data |= (rd_data & ~mask & sz_mask); + } + + /* If mask is not zero, we need to write. */ + if (mask & sz_mask) { + trans = ((chan << 6) | addr) << (AHB_ADDR_LSB + 1); + trans |= ((uint64_t)wr_data << AHB_DATA_LSB); + trans |= AHB_WR; + qib_write_kreg(dd, KR_AHB_TRANS, trans); + + for (tries = 1; tries < AHB_TRANS_TRIES; ++tries) { + trans = qib_read_kreg64(dd, KR_AHB_TRANS); + if (trans & AHB_TRANS_RDY) + break; + } + if (tries >= AHB_TRANS_TRIES) { + qib_dev_err(dd, "No Wr ahb_rdy in %d tries\n", + AHB_TRANS_TRIES); + goto bail; + } + } + ret = wr_data; +bail: + qib_write_kreg(dd, KR_AHB_ACC, prev_acc); + return ret; +} + +static void ibsd_wr_allchans(struct qib_pportdata *ppd, int addr, unsigned data, + unsigned mask) +{ + struct qib_devdata *dd = ppd->dd; + int chan; + u32 rbc; + + for (chan = 0; chan < SERDES_CHANS; ++chan) { + ahb_mod(dd, IBSD(ppd->hw_pidx), (chan + (chan >> 1)), addr, + data, mask); + rbc = ahb_mod(dd, IBSD(ppd->hw_pidx), (chan + (chan >> 1)), + addr, 0, 0); + } +} + +static void serdes_7322_los_enable(struct qib_pportdata *ppd, int enable) +{ + u64 data = qib_read_kreg_port(ppd, krp_serdesctrl); + u8 state = SYM_FIELD(data, IBSerdesCtrl_0, RXLOSEN); + + if (enable && !state) { + pr_info("IB%u:%u Turning LOS on\n", + ppd->dd->unit, ppd->port); + data |= SYM_MASK(IBSerdesCtrl_0, RXLOSEN); + } else if (!enable && state) { + pr_info("IB%u:%u Turning LOS off\n", + ppd->dd->unit, ppd->port); + data &= ~SYM_MASK(IBSerdesCtrl_0, RXLOSEN); + } + qib_write_kreg_port(ppd, krp_serdesctrl, data); +} + +static int serdes_7322_init(struct qib_pportdata *ppd) +{ + int ret = 0; + if (ppd->dd->cspec->r1) + ret = serdes_7322_init_old(ppd); + else + ret = serdes_7322_init_new(ppd); + return ret; +} + +static int serdes_7322_init_old(struct qib_pportdata *ppd) +{ + u32 le_val; + + /* + * Initialize the Tx DDS tables. Also done every QSFP event, + * for adapters with QSFP + */ + init_txdds_table(ppd, 0); + + /* ensure no tx overrides from earlier driver loads */ + qib_write_kreg_port(ppd, krp_tx_deemph_override, + SYM_MASK(IBSD_TX_DEEMPHASIS_OVERRIDE_0, + reset_tx_deemphasis_override)); + + /* Patch some SerDes defaults to "Better for IB" */ + /* Timing Loop Bandwidth: cdr_timing[11:9] = 0 */ + ibsd_wr_allchans(ppd, 2, 0, BMASK(11, 9)); + + /* Termination: rxtermctrl_r2d addr 11 bits [12:11] = 1 */ + ibsd_wr_allchans(ppd, 11, (1 << 11), BMASK(12, 11)); + /* Enable LE2: rxle2en_r2a addr 13 bit [6] = 1 */ + ibsd_wr_allchans(ppd, 13, (1 << 6), (1 << 6)); + + /* May be overridden in qsfp_7322_event */ + le_val = IS_QME(ppd->dd) ? LE2_QME : LE2_DEFAULT; + ibsd_wr_allchans(ppd, 13, (le_val << 7), BMASK(9, 7)); + + /* enable LE1 adaptation for all but QME, which is disabled */ + le_val = IS_QME(ppd->dd) ? 0 : 1; + ibsd_wr_allchans(ppd, 13, (le_val << 5), (1 << 5)); + + /* Clear cmode-override, may be set from older driver */ + ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), 5, 10, 0 << 14, 1 << 14); + + /* Timing Recovery: rxtapsel addr 5 bits [9:8] = 0 */ + ibsd_wr_allchans(ppd, 5, (0 << 8), BMASK(9, 8)); + + /* setup LoS params; these are subsystem, so chan == 5 */ + /* LoS filter threshold_count on, ch 0-3, set to 8 */ + ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), 5, 5, 8 << 11, BMASK(14, 11)); + ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), 5, 7, 8 << 4, BMASK(7, 4)); + ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), 5, 8, 8 << 11, BMASK(14, 11)); + ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), 5, 10, 8 << 4, BMASK(7, 4)); + + /* LoS filter threshold_count off, ch 0-3, set to 4 */ + ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), 5, 6, 4 << 0, BMASK(3, 0)); + ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), 5, 7, 4 << 8, BMASK(11, 8)); + ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), 5, 9, 4 << 0, BMASK(3, 0)); + ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), 5, 10, 4 << 8, BMASK(11, 8)); + + /* LoS filter select enabled */ + ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), 5, 9, 1 << 15, 1 << 15); + + /* LoS target data: SDR=4, DDR=2, QDR=1 */ + ibsd_wr_allchans(ppd, 14, (1 << 3), BMASK(5, 3)); /* QDR */ + ibsd_wr_allchans(ppd, 20, (2 << 10), BMASK(12, 10)); /* DDR */ + ibsd_wr_allchans(ppd, 20, (4 << 13), BMASK(15, 13)); /* SDR */ + + serdes_7322_los_enable(ppd, 1); + + /* rxbistena; set 0 to avoid effects of it switch later */ + ibsd_wr_allchans(ppd, 9, 0 << 15, 1 << 15); + + /* Configure 4 DFE taps, and only they adapt */ + ibsd_wr_allchans(ppd, 16, 0 << 0, BMASK(1, 0)); + + /* gain hi stop 32 (22) (6:1) lo stop 7 (10:7) target 22 (13) (15:11) */ + le_val = (ppd->dd->cspec->r1 || IS_QME(ppd->dd)) ? 0xb6c0 : 0x6bac; + ibsd_wr_allchans(ppd, 21, le_val, 0xfffe); + + /* + * Set receive adaptation mode. SDR and DDR adaptation are + * always on, and QDR is initially enabled; later disabled. + */ + qib_write_kreg_port(ppd, krp_static_adapt_dis(0), 0ULL); + qib_write_kreg_port(ppd, krp_static_adapt_dis(1), 0ULL); + qib_write_kreg_port(ppd, krp_static_adapt_dis(2), + ppd->dd->cspec->r1 ? + QDR_STATIC_ADAPT_DOWN_R1 : QDR_STATIC_ADAPT_DOWN); + ppd->cpspec->qdr_dfe_on = 1; + + /* FLoop LOS gate: PPM filter enabled */ + ibsd_wr_allchans(ppd, 38, 0 << 10, 1 << 10); + + /* rx offset center enabled */ + ibsd_wr_allchans(ppd, 12, 1 << 4, 1 << 4); + + if (!ppd->dd->cspec->r1) { + ibsd_wr_allchans(ppd, 12, 1 << 12, 1 << 12); + ibsd_wr_allchans(ppd, 12, 2 << 8, 0x0f << 8); + } + + /* Set the frequency loop bandwidth to 15 */ + ibsd_wr_allchans(ppd, 2, 15 << 5, BMASK(8, 5)); + + return 0; +} + +static int serdes_7322_init_new(struct qib_pportdata *ppd) +{ + unsigned long tend; + u32 le_val, rxcaldone; + int chan, chan_done = (1 << SERDES_CHANS) - 1; + + /* Clear cmode-override, may be set from older driver */ + ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), 5, 10, 0 << 14, 1 << 14); + + /* ensure no tx overrides from earlier driver loads */ + qib_write_kreg_port(ppd, krp_tx_deemph_override, + SYM_MASK(IBSD_TX_DEEMPHASIS_OVERRIDE_0, + reset_tx_deemphasis_override)); + + /* START OF LSI SUGGESTED SERDES BRINGUP */ + /* Reset - Calibration Setup */ + /* Stop DFE adaptaion */ + ibsd_wr_allchans(ppd, 1, 0, BMASK(9, 1)); + /* Disable LE1 */ + ibsd_wr_allchans(ppd, 13, 0, BMASK(5, 5)); + /* Disable autoadapt for LE1 */ + ibsd_wr_allchans(ppd, 1, 0, BMASK(15, 15)); + /* Disable LE2 */ + ibsd_wr_allchans(ppd, 13, 0, BMASK(6, 6)); + /* Disable VGA */ + ibsd_wr_allchans(ppd, 5, 0, BMASK(0, 0)); + /* Disable AFE Offset Cancel */ + ibsd_wr_allchans(ppd, 12, 0, BMASK(12, 12)); + /* Disable Timing Loop */ + ibsd_wr_allchans(ppd, 2, 0, BMASK(3, 3)); + /* Disable Frequency Loop */ + ibsd_wr_allchans(ppd, 2, 0, BMASK(4, 4)); + /* Disable Baseline Wander Correction */ + ibsd_wr_allchans(ppd, 13, 0, BMASK(13, 13)); + /* Disable RX Calibration */ + ibsd_wr_allchans(ppd, 4, 0, BMASK(10, 10)); + /* Disable RX Offset Calibration */ + ibsd_wr_allchans(ppd, 12, 0, BMASK(4, 4)); + /* Select BB CDR */ + ibsd_wr_allchans(ppd, 2, (1 << 15), BMASK(15, 15)); + /* CDR Step Size */ + ibsd_wr_allchans(ppd, 5, 0, BMASK(9, 8)); + /* Enable phase Calibration */ + ibsd_wr_allchans(ppd, 12, (1 << 5), BMASK(5, 5)); + /* DFE Bandwidth [2:14-12] */ + ibsd_wr_allchans(ppd, 2, (4 << 12), BMASK(14, 12)); + /* DFE Config (4 taps only) */ + ibsd_wr_allchans(ppd, 16, 0, BMASK(1, 0)); + /* Gain Loop Bandwidth */ + if (!ppd->dd->cspec->r1) { + ibsd_wr_allchans(ppd, 12, 1 << 12, BMASK(12, 12)); + ibsd_wr_allchans(ppd, 12, 2 << 8, BMASK(11, 8)); + } else { + ibsd_wr_allchans(ppd, 19, (3 << 11), BMASK(13, 11)); + } + /* Baseline Wander Correction Gain [13:4-0] (leave as default) */ + /* Baseline Wander Correction Gain [3:7-5] (leave as default) */ + /* Data Rate Select [5:7-6] (leave as default) */ + /* RX Parallel Word Width [3:10-8] (leave as default) */ + + /* RX REST */ + /* Single- or Multi-channel reset */ + /* RX Analog reset */ + /* RX Digital reset */ + ibsd_wr_allchans(ppd, 0, 0, BMASK(15, 13)); + msleep(20); + /* RX Analog reset */ + ibsd_wr_allchans(ppd, 0, (1 << 14), BMASK(14, 14)); + msleep(20); + /* RX Digital reset */ + ibsd_wr_allchans(ppd, 0, (1 << 13), BMASK(13, 13)); + msleep(20); + + /* setup LoS params; these are subsystem, so chan == 5 */ + /* LoS filter threshold_count on, ch 0-3, set to 8 */ + ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), 5, 5, 8 << 11, BMASK(14, 11)); + ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), 5, 7, 8 << 4, BMASK(7, 4)); + ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), 5, 8, 8 << 11, BMASK(14, 11)); + ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), 5, 10, 8 << 4, BMASK(7, 4)); + + /* LoS filter threshold_count off, ch 0-3, set to 4 */ + ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), 5, 6, 4 << 0, BMASK(3, 0)); + ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), 5, 7, 4 << 8, BMASK(11, 8)); + ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), 5, 9, 4 << 0, BMASK(3, 0)); + ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), 5, 10, 4 << 8, BMASK(11, 8)); + + /* LoS filter select enabled */ + ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), 5, 9, 1 << 15, 1 << 15); + + /* LoS target data: SDR=4, DDR=2, QDR=1 */ + ibsd_wr_allchans(ppd, 14, (1 << 3), BMASK(5, 3)); /* QDR */ + ibsd_wr_allchans(ppd, 20, (2 << 10), BMASK(12, 10)); /* DDR */ + ibsd_wr_allchans(ppd, 20, (4 << 13), BMASK(15, 13)); /* SDR */ + + /* Turn on LOS on initial SERDES init */ + serdes_7322_los_enable(ppd, 1); + /* FLoop LOS gate: PPM filter enabled */ + ibsd_wr_allchans(ppd, 38, 0 << 10, 1 << 10); + + /* RX LATCH CALIBRATION */ + /* Enable Eyefinder Phase Calibration latch */ + ibsd_wr_allchans(ppd, 15, 1, BMASK(0, 0)); + /* Enable RX Offset Calibration latch */ + ibsd_wr_allchans(ppd, 12, (1 << 4), BMASK(4, 4)); + msleep(20); + /* Start Calibration */ + ibsd_wr_allchans(ppd, 4, (1 << 10), BMASK(10, 10)); + tend = jiffies + msecs_to_jiffies(500); + while (chan_done && !time_is_before_jiffies(tend)) { + msleep(20); + for (chan = 0; chan < SERDES_CHANS; ++chan) { + rxcaldone = ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), + (chan + (chan >> 1)), + 25, 0, 0); + if ((~rxcaldone & (u32)BMASK(9, 9)) == 0 && + (~chan_done & (1 << chan)) == 0) + chan_done &= ~(1 << chan); + } + } + if (chan_done) { + pr_info("Serdes %d calibration not done after .5 sec: 0x%x\n", + IBSD(ppd->hw_pidx), chan_done); + } else { + for (chan = 0; chan < SERDES_CHANS; ++chan) { + rxcaldone = ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), + (chan + (chan >> 1)), + 25, 0, 0); + if ((~rxcaldone & (u32)BMASK(10, 10)) == 0) + pr_info("Serdes %d chan %d calibration failed\n", + IBSD(ppd->hw_pidx), chan); + } + } + + /* Turn off Calibration */ + ibsd_wr_allchans(ppd, 4, 0, BMASK(10, 10)); + msleep(20); + + /* BRING RX UP */ + /* Set LE2 value (May be overridden in qsfp_7322_event) */ + le_val = IS_QME(ppd->dd) ? LE2_QME : LE2_DEFAULT; + ibsd_wr_allchans(ppd, 13, (le_val << 7), BMASK(9, 7)); + /* Set LE2 Loop bandwidth */ + ibsd_wr_allchans(ppd, 3, (7 << 5), BMASK(7, 5)); + /* Enable LE2 */ + ibsd_wr_allchans(ppd, 13, (1 << 6), BMASK(6, 6)); + msleep(20); + /* Enable H0 only */ + ibsd_wr_allchans(ppd, 1, 1, BMASK(9, 1)); + /* gain hi stop 32 (22) (6:1) lo stop 7 (10:7) target 22 (13) (15:11) */ + le_val = (ppd->dd->cspec->r1 || IS_QME(ppd->dd)) ? 0xb6c0 : 0x6bac; + ibsd_wr_allchans(ppd, 21, le_val, 0xfffe); + /* Enable VGA */ + ibsd_wr_allchans(ppd, 5, 0, BMASK(0, 0)); + msleep(20); + /* Set Frequency Loop Bandwidth */ + ibsd_wr_allchans(ppd, 2, (15 << 5), BMASK(8, 5)); + /* Enable Frequency Loop */ + ibsd_wr_allchans(ppd, 2, (1 << 4), BMASK(4, 4)); + /* Set Timing Loop Bandwidth */ + ibsd_wr_allchans(ppd, 2, 0, BMASK(11, 9)); + /* Enable Timing Loop */ + ibsd_wr_allchans(ppd, 2, (1 << 3), BMASK(3, 3)); + msleep(50); + /* Enable DFE + * Set receive adaptation mode. SDR and DDR adaptation are + * always on, and QDR is initially enabled; later disabled. + */ + qib_write_kreg_port(ppd, krp_static_adapt_dis(0), 0ULL); + qib_write_kreg_port(ppd, krp_static_adapt_dis(1), 0ULL); + qib_write_kreg_port(ppd, krp_static_adapt_dis(2), + ppd->dd->cspec->r1 ? + QDR_STATIC_ADAPT_DOWN_R1 : QDR_STATIC_ADAPT_DOWN); + ppd->cpspec->qdr_dfe_on = 1; + /* Disable LE1 */ + ibsd_wr_allchans(ppd, 13, (0 << 5), (1 << 5)); + /* Disable auto adapt for LE1 */ + ibsd_wr_allchans(ppd, 1, (0 << 15), BMASK(15, 15)); + msleep(20); + /* Enable AFE Offset Cancel */ + ibsd_wr_allchans(ppd, 12, (1 << 12), BMASK(12, 12)); + /* Enable Baseline Wander Correction */ + ibsd_wr_allchans(ppd, 12, (1 << 13), BMASK(13, 13)); + /* Termination: rxtermctrl_r2d addr 11 bits [12:11] = 1 */ + ibsd_wr_allchans(ppd, 11, (1 << 11), BMASK(12, 11)); + /* VGA output common mode */ + ibsd_wr_allchans(ppd, 12, (3 << 2), BMASK(3, 2)); + + /* + * Initialize the Tx DDS tables. Also done every QSFP event, + * for adapters with QSFP + */ + init_txdds_table(ppd, 0); + + return 0; +} + +/* start adjust QMH serdes parameters */ + +static void set_man_code(struct qib_pportdata *ppd, int chan, int code) +{ + ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), (chan + (chan >> 1)), + 9, code << 9, 0x3f << 9); +} + +static void set_man_mode_h1(struct qib_pportdata *ppd, int chan, + int enable, u32 tapenable) +{ + if (enable) + ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), (chan + (chan >> 1)), + 1, 3 << 10, 0x1f << 10); + else + ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), (chan + (chan >> 1)), + 1, 0, 0x1f << 10); +} + +/* Set clock to 1, 0, 1, 0 */ +static void clock_man(struct qib_pportdata *ppd, int chan) +{ + ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), (chan + (chan >> 1)), + 4, 0x4000, 0x4000); + ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), (chan + (chan >> 1)), + 4, 0, 0x4000); + ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), (chan + (chan >> 1)), + 4, 0x4000, 0x4000); + ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), (chan + (chan >> 1)), + 4, 0, 0x4000); +} + +/* + * write the current Tx serdes pre,post,main,amp settings into the serdes. + * The caller must pass the settings appropriate for the current speed, + * or not care if they are correct for the current speed. + */ +static void write_tx_serdes_param(struct qib_pportdata *ppd, + struct txdds_ent *txdds) +{ + u64 deemph; + + deemph = qib_read_kreg_port(ppd, krp_tx_deemph_override); + /* field names for amp, main, post, pre, respectively */ + deemph &= ~(SYM_MASK(IBSD_TX_DEEMPHASIS_OVERRIDE_0, txampcntl_d2a) | + SYM_MASK(IBSD_TX_DEEMPHASIS_OVERRIDE_0, txc0_ena) | + SYM_MASK(IBSD_TX_DEEMPHASIS_OVERRIDE_0, txcp1_ena) | + SYM_MASK(IBSD_TX_DEEMPHASIS_OVERRIDE_0, txcn1_ena)); + + deemph |= SYM_MASK(IBSD_TX_DEEMPHASIS_OVERRIDE_0, + tx_override_deemphasis_select); + deemph |= (txdds->amp & SYM_RMASK(IBSD_TX_DEEMPHASIS_OVERRIDE_0, + txampcntl_d2a)) << SYM_LSB(IBSD_TX_DEEMPHASIS_OVERRIDE_0, + txampcntl_d2a); + deemph |= (txdds->main & SYM_RMASK(IBSD_TX_DEEMPHASIS_OVERRIDE_0, + txc0_ena)) << SYM_LSB(IBSD_TX_DEEMPHASIS_OVERRIDE_0, + txc0_ena); + deemph |= (txdds->post & SYM_RMASK(IBSD_TX_DEEMPHASIS_OVERRIDE_0, + txcp1_ena)) << SYM_LSB(IBSD_TX_DEEMPHASIS_OVERRIDE_0, + txcp1_ena); + deemph |= (txdds->pre & SYM_RMASK(IBSD_TX_DEEMPHASIS_OVERRIDE_0, + txcn1_ena)) << SYM_LSB(IBSD_TX_DEEMPHASIS_OVERRIDE_0, + txcn1_ena); + qib_write_kreg_port(ppd, krp_tx_deemph_override, deemph); +} + +/* + * Set the parameters for mez cards on link bounce, so they are + * always exactly what was requested. Similar logic to init_txdds + * but does just the serdes. + */ +static void adj_tx_serdes(struct qib_pportdata *ppd) +{ + const struct txdds_ent *sdr_dds, *ddr_dds, *qdr_dds; + struct txdds_ent *dds; + + find_best_ent(ppd, &sdr_dds, &ddr_dds, &qdr_dds, 1); + dds = (struct txdds_ent *)(ppd->link_speed_active == QIB_IB_QDR ? + qdr_dds : (ppd->link_speed_active == QIB_IB_DDR ? + ddr_dds : sdr_dds)); + write_tx_serdes_param(ppd, dds); +} + +/* set QDR forced value for H1, if needed */ +static void force_h1(struct qib_pportdata *ppd) +{ + int chan; + + ppd->cpspec->qdr_reforce = 0; + if (!ppd->dd->cspec->r1) + return; + + for (chan = 0; chan < SERDES_CHANS; chan++) { + set_man_mode_h1(ppd, chan, 1, 0); + set_man_code(ppd, chan, ppd->cpspec->h1_val); + clock_man(ppd, chan); + set_man_mode_h1(ppd, chan, 0, 0); + } +} + +#define SJA_EN SYM_MASK(SPC_JTAG_ACCESS_REG, SPC_JTAG_ACCESS_EN) +#define BISTEN_LSB SYM_LSB(SPC_JTAG_ACCESS_REG, bist_en) + +#define R_OPCODE_LSB 3 +#define R_OP_NOP 0 +#define R_OP_SHIFT 2 +#define R_OP_UPDATE 3 +#define R_TDI_LSB 2 +#define R_TDO_LSB 1 +#define R_RDY 1 + +static int qib_r_grab(struct qib_devdata *dd) +{ + u64 val; + val = SJA_EN; + qib_write_kreg(dd, kr_r_access, val); + qib_read_kreg32(dd, kr_scratch); + return 0; +} + +/* qib_r_wait_for_rdy() not only waits for the ready bit, it + * returns the current state of R_TDO + */ +static int qib_r_wait_for_rdy(struct qib_devdata *dd) +{ + u64 val; + int timeout; + for (timeout = 0; timeout < 100 ; ++timeout) { + val = qib_read_kreg32(dd, kr_r_access); + if (val & R_RDY) + return (val >> R_TDO_LSB) & 1; + } + return -1; +} + +static int qib_r_shift(struct qib_devdata *dd, int bisten, + int len, u8 *inp, u8 *outp) +{ + u64 valbase, val; + int ret, pos; + + valbase = SJA_EN | (bisten << BISTEN_LSB) | + (R_OP_SHIFT << R_OPCODE_LSB); + ret = qib_r_wait_for_rdy(dd); + if (ret < 0) + goto bail; + for (pos = 0; pos < len; ++pos) { + val = valbase; + if (outp) { + outp[pos >> 3] &= ~(1 << (pos & 7)); + outp[pos >> 3] |= (ret << (pos & 7)); + } + if (inp) { + int tdi = inp[pos >> 3] >> (pos & 7); + val |= ((tdi & 1) << R_TDI_LSB); + } + qib_write_kreg(dd, kr_r_access, val); + qib_read_kreg32(dd, kr_scratch); + ret = qib_r_wait_for_rdy(dd); + if (ret < 0) + break; + } + /* Restore to NOP between operations. */ + val = SJA_EN | (bisten << BISTEN_LSB); + qib_write_kreg(dd, kr_r_access, val); + qib_read_kreg32(dd, kr_scratch); + ret = qib_r_wait_for_rdy(dd); + + if (ret >= 0) + ret = pos; +bail: + return ret; +} + +static int qib_r_update(struct qib_devdata *dd, int bisten) +{ + u64 val; + int ret; + + val = SJA_EN | (bisten << BISTEN_LSB) | (R_OP_UPDATE << R_OPCODE_LSB); + ret = qib_r_wait_for_rdy(dd); + if (ret >= 0) { + qib_write_kreg(dd, kr_r_access, val); + qib_read_kreg32(dd, kr_scratch); + } + return ret; +} + +#define BISTEN_PORT_SEL 15 +#define LEN_PORT_SEL 625 +#define BISTEN_AT 17 +#define LEN_AT 156 +#define BISTEN_ETM 16 +#define LEN_ETM 632 + +#define BIT2BYTE(x) (((x) + BITS_PER_BYTE - 1) / BITS_PER_BYTE) + +/* these are common for all IB port use cases. */ +static u8 reset_at[BIT2BYTE(LEN_AT)] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x20, 0x00, +}; +static u8 reset_atetm[BIT2BYTE(LEN_ETM)] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x80, 0xe3, 0x81, 0x73, 0x3c, 0x70, 0x8e, + 0x07, 0xce, 0xf1, 0xc0, 0x39, 0x1e, 0x38, 0xc7, 0x03, 0xe7, + 0x78, 0xe0, 0x1c, 0x0f, 0x9c, 0x7f, 0x80, 0x73, 0x0f, 0x70, + 0xde, 0x01, 0xce, 0x39, 0xc0, 0xf9, 0x06, 0x38, 0xd7, 0x00, + 0xe7, 0x19, 0xe0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, +}; +static u8 at[BIT2BYTE(LEN_AT)] = { + 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x20, 0x00, +}; + +/* used for IB1 or IB2, only one in use */ +static u8 atetm_1port[BIT2BYTE(LEN_ETM)] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x10, 0xf2, 0x80, 0x83, 0x1e, 0x38, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x50, 0xf4, 0x41, 0x00, 0x18, 0x78, 0xc8, 0x03, + 0x07, 0x7b, 0xa0, 0x3e, 0x00, 0x02, 0x00, 0x00, 0x18, 0x00, + 0x18, 0x00, 0x00, 0x00, 0x00, 0x4b, 0x00, 0x00, 0x00, +}; + +/* used when both IB1 and IB2 are in use */ +static u8 atetm_2port[BIT2BYTE(LEN_ETM)] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x79, + 0xc0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0xf8, 0x80, 0x83, 0x1e, 0x38, 0xe0, 0x03, 0x05, + 0x7b, 0xa0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, + 0xa2, 0x0f, 0x50, 0xf4, 0x41, 0x00, 0x18, 0x78, 0xd1, 0x07, + 0x02, 0x7c, 0x80, 0x3e, 0x00, 0x02, 0x00, 0x00, 0x3e, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x00, 0x64, 0x00, 0x00, 0x00, +}; + +/* used when only IB1 is in use */ +static u8 portsel_port1[BIT2BYTE(LEN_PORT_SEL)] = { + 0x32, 0x65, 0xa4, 0x7b, 0x10, 0x98, 0xdc, 0xfe, 0x13, 0x13, + 0x13, 0x13, 0x13, 0x13, 0x13, 0x13, 0x73, 0x0c, 0x0c, 0x0c, + 0x0c, 0x0c, 0x13, 0x13, 0x13, 0x13, 0x13, 0x13, 0x13, 0x13, + 0x13, 0x78, 0x78, 0x13, 0x13, 0x13, 0x13, 0x13, 0x13, 0x13, + 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x74, 0x32, + 0x32, 0x32, 0x32, 0x32, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, + 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, + 0x14, 0x14, 0x9f, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, +}; + +/* used when only IB2 is in use */ +static u8 portsel_port2[BIT2BYTE(LEN_PORT_SEL)] = { + 0x32, 0x65, 0xa4, 0x7b, 0x10, 0x98, 0xdc, 0xfe, 0x39, 0x39, + 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x73, 0x32, 0x32, 0x32, + 0x32, 0x32, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, + 0x39, 0x78, 0x78, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, + 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x74, 0x32, + 0x32, 0x32, 0x32, 0x32, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, + 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, + 0x3a, 0x3a, 0x9f, 0x01, 0x00, 0x00, 0x00, 0x00, 0x01, +}; + +/* used when both IB1 and IB2 are in use */ +static u8 portsel_2port[BIT2BYTE(LEN_PORT_SEL)] = { + 0x32, 0xba, 0x54, 0x76, 0x10, 0x98, 0xdc, 0xfe, 0x13, 0x13, + 0x13, 0x13, 0x13, 0x13, 0x13, 0x13, 0x73, 0x0c, 0x0c, 0x0c, + 0x0c, 0x0c, 0x13, 0x13, 0x13, 0x13, 0x13, 0x13, 0x13, 0x13, + 0x13, 0x13, 0x13, 0x13, 0x13, 0x13, 0x13, 0x13, 0x13, 0x13, + 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x74, 0x32, + 0x32, 0x32, 0x32, 0x32, 0x14, 0x14, 0x14, 0x14, 0x14, 0x3a, + 0x3a, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, + 0x14, 0x14, 0x9f, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, +}; + +/* + * Do setup to properly handle IB link recovery; if port is zero, we + * are initializing to cover both ports; otherwise we are initializing + * to cover a single port card, or the port has reached INIT and we may + * need to switch coverage types. + */ +static void setup_7322_link_recovery(struct qib_pportdata *ppd, u32 both) +{ + u8 *portsel, *etm; + struct qib_devdata *dd = ppd->dd; + + if (!ppd->dd->cspec->r1) + return; + if (!both) { + dd->cspec->recovery_ports_initted++; + ppd->cpspec->recovery_init = 1; + } + if (!both && dd->cspec->recovery_ports_initted == 1) { + portsel = ppd->port == 1 ? portsel_port1 : portsel_port2; + etm = atetm_1port; + } else { + portsel = portsel_2port; + etm = atetm_2port; + } + + if (qib_r_grab(dd) < 0 || + qib_r_shift(dd, BISTEN_ETM, LEN_ETM, reset_atetm, NULL) < 0 || + qib_r_update(dd, BISTEN_ETM) < 0 || + qib_r_shift(dd, BISTEN_AT, LEN_AT, reset_at, NULL) < 0 || + qib_r_update(dd, BISTEN_AT) < 0 || + qib_r_shift(dd, BISTEN_PORT_SEL, LEN_PORT_SEL, + portsel, NULL) < 0 || + qib_r_update(dd, BISTEN_PORT_SEL) < 0 || + qib_r_shift(dd, BISTEN_AT, LEN_AT, at, NULL) < 0 || + qib_r_update(dd, BISTEN_AT) < 0 || + qib_r_shift(dd, BISTEN_ETM, LEN_ETM, etm, NULL) < 0 || + qib_r_update(dd, BISTEN_ETM) < 0) + qib_dev_err(dd, "Failed IB link recovery setup\n"); +} + +static void check_7322_rxe_status(struct qib_pportdata *ppd) +{ + struct qib_devdata *dd = ppd->dd; + u64 fmask; + + if (dd->cspec->recovery_ports_initted != 1) + return; /* rest doesn't apply to dualport */ + qib_write_kreg(dd, kr_control, dd->control | + SYM_MASK(Control, FreezeMode)); + (void)qib_read_kreg64(dd, kr_scratch); + udelay(3); /* ibcreset asserted 400ns, be sure that's over */ + fmask = qib_read_kreg64(dd, kr_act_fmask); + if (!fmask) { + /* + * require a powercycle before we'll work again, and make + * sure we get no more interrupts, and don't turn off + * freeze. + */ + ppd->dd->cspec->stay_in_freeze = 1; + qib_7322_set_intr_state(ppd->dd, 0); + qib_write_kreg(dd, kr_fmask, 0ULL); + qib_dev_err(dd, "HCA unusable until powercycled\n"); + return; /* eventually reset */ + } + + qib_write_kreg(ppd->dd, kr_hwerrclear, + SYM_MASK(HwErrClear, IBSerdesPClkNotDetectClear_1)); + + /* don't do the full clear_freeze(), not needed for this */ + qib_write_kreg(dd, kr_control, dd->control); + qib_read_kreg32(dd, kr_scratch); + /* take IBC out of reset */ + if (ppd->link_speed_supported) { + ppd->cpspec->ibcctrl_a &= + ~SYM_MASK(IBCCtrlA_0, IBStatIntReductionEn); + qib_write_kreg_port(ppd, krp_ibcctrl_a, + ppd->cpspec->ibcctrl_a); + qib_read_kreg32(dd, kr_scratch); + if (ppd->lflags & QIBL_IB_LINK_DISABLED) + qib_set_ib_7322_lstate(ppd, 0, + QLOGIC_IB_IBCC_LINKINITCMD_DISABLE); + } +} diff --git a/drivers/infiniband/hw/qib/qib_init.c b/drivers/infiniband/hw/qib/qib_init.c new file mode 100644 index 0000000..729da39 --- /dev/null +++ b/drivers/infiniband/hw/qib/qib_init.c @@ -0,0 +1,1857 @@ +/* + * Copyright (c) 2012, 2013 Intel Corporation. All rights reserved. + * Copyright (c) 2006 - 2012 QLogic Corporation. All rights reserved. + * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#ifdef CONFIG_INFINIBAND_QIB_DCA +#include +#endif + +#include "qib.h" +#include "qib_common.h" +#include "qib_mad.h" +#ifdef CONFIG_DEBUG_FS +#include "qib_debugfs.h" +#include "qib_verbs.h" +#endif + +#undef pr_fmt +#define pr_fmt(fmt) QIB_DRV_NAME ": " fmt + +/* + * min buffers we want to have per context, after driver + */ +#define QIB_MIN_USER_CTXT_BUFCNT 7 + +#define QLOGIC_IB_R_SOFTWARE_MASK 0xFF +#define QLOGIC_IB_R_SOFTWARE_SHIFT 24 +#define QLOGIC_IB_R_EMULATOR_MASK (1ULL<<62) + +/* + * Number of ctxts we are configured to use (to allow for more pio + * buffers per ctxt, etc.) Zero means use chip value. + */ +ushort qib_cfgctxts; +module_param_named(cfgctxts, qib_cfgctxts, ushort, S_IRUGO); +MODULE_PARM_DESC(cfgctxts, "Set max number of contexts to use"); + +unsigned qib_numa_aware; +module_param_named(numa_aware, qib_numa_aware, uint, S_IRUGO); +MODULE_PARM_DESC(numa_aware, + "0 -> PSM allocation close to HCA, 1 -> PSM allocation local to process"); + +/* + * If set, do not write to any regs if avoidable, hack to allow + * check for deranged default register values. + */ +ushort qib_mini_init; +module_param_named(mini_init, qib_mini_init, ushort, S_IRUGO); +MODULE_PARM_DESC(mini_init, "If set, do minimal diag init"); + +unsigned qib_n_krcv_queues; +module_param_named(krcvqs, qib_n_krcv_queues, uint, S_IRUGO); +MODULE_PARM_DESC(krcvqs, "number of kernel receive queues per IB port"); + +unsigned qib_cc_table_size; +module_param_named(cc_table_size, qib_cc_table_size, uint, S_IRUGO); +MODULE_PARM_DESC(cc_table_size, "Congestion control table entries 0 (CCA disabled - default), min = 128, max = 1984"); +/* + * qib_wc_pat parameter: + * 0 is WC via MTRR + * 1 is WC via PAT + * If PAT initialization fails, code reverts back to MTRR + */ +unsigned qib_wc_pat = 1; /* default (1) is to use PAT, not MTRR */ +module_param_named(wc_pat, qib_wc_pat, uint, S_IRUGO); +MODULE_PARM_DESC(wc_pat, "enable write-combining via PAT mechanism"); + +static void verify_interrupt(unsigned long); + +static struct idr qib_unit_table; +u32 qib_cpulist_count; +unsigned long *qib_cpulist; + +/* set number of contexts we'll actually use */ +void qib_set_ctxtcnt(struct qib_devdata *dd) +{ + if (!qib_cfgctxts) { + dd->cfgctxts = dd->first_user_ctxt + num_online_cpus(); + if (dd->cfgctxts > dd->ctxtcnt) + dd->cfgctxts = dd->ctxtcnt; + } else if (qib_cfgctxts < dd->num_pports) + dd->cfgctxts = dd->ctxtcnt; + else if (qib_cfgctxts <= dd->ctxtcnt) + dd->cfgctxts = qib_cfgctxts; + else + dd->cfgctxts = dd->ctxtcnt; + dd->freectxts = (dd->first_user_ctxt > dd->cfgctxts) ? 0 : + dd->cfgctxts - dd->first_user_ctxt; +} + +/* + * Common code for creating the receive context array. + */ +int qib_create_ctxts(struct qib_devdata *dd) +{ + unsigned i; + int local_node_id = pcibus_to_node(dd->pcidev->bus); + + if (local_node_id < 0) + local_node_id = numa_node_id(); + dd->assigned_node_id = local_node_id; + + /* + * Allocate full ctxtcnt array, rather than just cfgctxts, because + * cleanup iterates across all possible ctxts. + */ + dd->rcd = kzalloc(sizeof(*dd->rcd) * dd->ctxtcnt, GFP_KERNEL); + if (!dd->rcd) { + qib_dev_err(dd, + "Unable to allocate ctxtdata array, failing\n"); + return -ENOMEM; + } + + /* create (one or more) kctxt */ + for (i = 0; i < dd->first_user_ctxt; ++i) { + struct qib_pportdata *ppd; + struct qib_ctxtdata *rcd; + + if (dd->skip_kctxt_mask & (1 << i)) + continue; + + ppd = dd->pport + (i % dd->num_pports); + + rcd = qib_create_ctxtdata(ppd, i, dd->assigned_node_id); + if (!rcd) { + qib_dev_err(dd, + "Unable to allocate ctxtdata for Kernel ctxt, failing\n"); + kfree(dd->rcd); + dd->rcd = NULL; + return -ENOMEM; + } + rcd->pkeys[0] = QIB_DEFAULT_P_KEY; + rcd->seq_cnt = 1; + } + return 0; +} + +/* + * Common code for user and kernel context setup. + */ +struct qib_ctxtdata *qib_create_ctxtdata(struct qib_pportdata *ppd, u32 ctxt, + int node_id) +{ + struct qib_devdata *dd = ppd->dd; + struct qib_ctxtdata *rcd; + + rcd = kzalloc_node(sizeof(*rcd), GFP_KERNEL, node_id); + if (rcd) { + INIT_LIST_HEAD(&rcd->qp_wait_list); + rcd->node_id = node_id; + rcd->ppd = ppd; + rcd->dd = dd; + rcd->cnt = 1; + rcd->ctxt = ctxt; + dd->rcd[ctxt] = rcd; +#ifdef CONFIG_DEBUG_FS + if (ctxt < dd->first_user_ctxt) { /* N/A for PSM contexts */ + rcd->opstats = kzalloc_node(sizeof(*rcd->opstats), + GFP_KERNEL, node_id); + if (!rcd->opstats) { + kfree(rcd); + qib_dev_err(dd, + "Unable to allocate per ctxt stats buffer\n"); + return NULL; + } + } +#endif + dd->f_init_ctxt(rcd); + + /* + * To avoid wasting a lot of memory, we allocate 32KB chunks + * of physically contiguous memory, advance through it until + * used up and then allocate more. Of course, we need + * memory to store those extra pointers, now. 32KB seems to + * be the most that is "safe" under memory pressure + * (creating large files and then copying them over + * NFS while doing lots of MPI jobs). The OOM killer can + * get invoked, even though we say we can sleep and this can + * cause significant system problems.... + */ + rcd->rcvegrbuf_size = 0x8000; + rcd->rcvegrbufs_perchunk = + rcd->rcvegrbuf_size / dd->rcvegrbufsize; + rcd->rcvegrbuf_chunks = (rcd->rcvegrcnt + + rcd->rcvegrbufs_perchunk - 1) / + rcd->rcvegrbufs_perchunk; + BUG_ON(!is_power_of_2(rcd->rcvegrbufs_perchunk)); + rcd->rcvegrbufs_perchunk_shift = + ilog2(rcd->rcvegrbufs_perchunk); + } + return rcd; +} + +/* + * Common code for initializing the physical port structure. + */ +int qib_init_pportdata(struct qib_pportdata *ppd, struct qib_devdata *dd, + u8 hw_pidx, u8 port) +{ + int size; + ppd->dd = dd; + ppd->hw_pidx = hw_pidx; + ppd->port = port; /* IB port number, not index */ + + spin_lock_init(&ppd->sdma_lock); + spin_lock_init(&ppd->lflags_lock); + spin_lock_init(&ppd->cc_shadow_lock); + init_waitqueue_head(&ppd->state_wait); + + init_timer(&ppd->symerr_clear_timer); + ppd->symerr_clear_timer.function = qib_clear_symerror_on_linkup; + ppd->symerr_clear_timer.data = (unsigned long)ppd; + + ppd->qib_wq = NULL; + ppd->ibport_data.pmastats = + alloc_percpu(struct qib_pma_counters); + if (!ppd->ibport_data.pmastats) + return -ENOMEM; + + if (qib_cc_table_size < IB_CCT_MIN_ENTRIES) + goto bail; + + ppd->cc_supported_table_entries = min(max_t(int, qib_cc_table_size, + IB_CCT_MIN_ENTRIES), IB_CCT_ENTRIES*IB_CC_TABLE_CAP_DEFAULT); + + ppd->cc_max_table_entries = + ppd->cc_supported_table_entries/IB_CCT_ENTRIES; + + size = IB_CC_TABLE_CAP_DEFAULT * sizeof(struct ib_cc_table_entry) + * IB_CCT_ENTRIES; + ppd->ccti_entries = kzalloc(size, GFP_KERNEL); + if (!ppd->ccti_entries) { + qib_dev_err(dd, + "failed to allocate congestion control table for port %d!\n", + port); + goto bail; + } + + size = IB_CC_CCS_ENTRIES * sizeof(struct ib_cc_congestion_entry); + ppd->congestion_entries = kzalloc(size, GFP_KERNEL); + if (!ppd->congestion_entries) { + qib_dev_err(dd, + "failed to allocate congestion setting list for port %d!\n", + port); + goto bail_1; + } + + size = sizeof(struct cc_table_shadow); + ppd->ccti_entries_shadow = kzalloc(size, GFP_KERNEL); + if (!ppd->ccti_entries_shadow) { + qib_dev_err(dd, + "failed to allocate shadow ccti list for port %d!\n", + port); + goto bail_2; + } + + size = sizeof(struct ib_cc_congestion_setting_attr); + ppd->congestion_entries_shadow = kzalloc(size, GFP_KERNEL); + if (!ppd->congestion_entries_shadow) { + qib_dev_err(dd, + "failed to allocate shadow congestion setting list for port %d!\n", + port); + goto bail_3; + } + + return 0; + +bail_3: + kfree(ppd->ccti_entries_shadow); + ppd->ccti_entries_shadow = NULL; +bail_2: + kfree(ppd->congestion_entries); + ppd->congestion_entries = NULL; +bail_1: + kfree(ppd->ccti_entries); + ppd->ccti_entries = NULL; +bail: + /* User is intentionally disabling the congestion control agent */ + if (!qib_cc_table_size) + return 0; + + if (qib_cc_table_size < IB_CCT_MIN_ENTRIES) { + qib_cc_table_size = 0; + qib_dev_err(dd, + "Congestion Control table size %d less than minimum %d for port %d\n", + qib_cc_table_size, IB_CCT_MIN_ENTRIES, port); + } + + qib_dev_err(dd, "Congestion Control Agent disabled for port %d\n", + port); + return 0; +} + +static int init_pioavailregs(struct qib_devdata *dd) +{ + int ret, pidx; + u64 *status_page; + + dd->pioavailregs_dma = dma_alloc_coherent( + &dd->pcidev->dev, PAGE_SIZE, &dd->pioavailregs_phys, + GFP_KERNEL); + if (!dd->pioavailregs_dma) { + qib_dev_err(dd, + "failed to allocate PIOavail reg area in memory\n"); + ret = -ENOMEM; + goto done; + } + + /* + * We really want L2 cache aligned, but for current CPUs of + * interest, they are the same. + */ + status_page = (u64 *) + ((char *) dd->pioavailregs_dma + + ((2 * L1_CACHE_BYTES + + dd->pioavregs * sizeof(u64)) & ~L1_CACHE_BYTES)); + /* device status comes first, for backwards compatibility */ + dd->devstatusp = status_page; + *status_page++ = 0; + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + dd->pport[pidx].statusp = status_page; + *status_page++ = 0; + } + + /* + * Setup buffer to hold freeze and other messages, accessible to + * apps, following statusp. This is per-unit, not per port. + */ + dd->freezemsg = (char *) status_page; + *dd->freezemsg = 0; + /* length of msg buffer is "whatever is left" */ + ret = (char *) status_page - (char *) dd->pioavailregs_dma; + dd->freezelen = PAGE_SIZE - ret; + + ret = 0; + +done: + return ret; +} + +/** + * init_shadow_tids - allocate the shadow TID array + * @dd: the qlogic_ib device + * + * allocate the shadow TID array, so we can qib_munlock previous + * entries. It may make more sense to move the pageshadow to the + * ctxt data structure, so we only allocate memory for ctxts actually + * in use, since we at 8k per ctxt, now. + * We don't want failures here to prevent use of the driver/chip, + * so no return value. + */ +static void init_shadow_tids(struct qib_devdata *dd) +{ + struct page **pages; + dma_addr_t *addrs; + + pages = vzalloc(dd->cfgctxts * dd->rcvtidcnt * sizeof(struct page *)); + if (!pages) { + qib_dev_err(dd, + "failed to allocate shadow page * array, no expected sends!\n"); + goto bail; + } + + addrs = vzalloc(dd->cfgctxts * dd->rcvtidcnt * sizeof(dma_addr_t)); + if (!addrs) { + qib_dev_err(dd, + "failed to allocate shadow dma handle array, no expected sends!\n"); + goto bail_free; + } + + dd->pageshadow = pages; + dd->physshadow = addrs; + return; + +bail_free: + vfree(pages); +bail: + dd->pageshadow = NULL; +} + +/* + * Do initialization for device that is only needed on + * first detect, not on resets. + */ +static int loadtime_init(struct qib_devdata *dd) +{ + int ret = 0; + + if (((dd->revision >> QLOGIC_IB_R_SOFTWARE_SHIFT) & + QLOGIC_IB_R_SOFTWARE_MASK) != QIB_CHIP_SWVERSION) { + qib_dev_err(dd, + "Driver only handles version %d, chip swversion is %d (%llx), failng\n", + QIB_CHIP_SWVERSION, + (int)(dd->revision >> + QLOGIC_IB_R_SOFTWARE_SHIFT) & + QLOGIC_IB_R_SOFTWARE_MASK, + (unsigned long long) dd->revision); + ret = -ENOSYS; + goto done; + } + + if (dd->revision & QLOGIC_IB_R_EMULATOR_MASK) + qib_devinfo(dd->pcidev, "%s", dd->boardversion); + + spin_lock_init(&dd->pioavail_lock); + spin_lock_init(&dd->sendctrl_lock); + spin_lock_init(&dd->uctxt_lock); + spin_lock_init(&dd->qib_diag_trans_lock); + spin_lock_init(&dd->eep_st_lock); + mutex_init(&dd->eep_lock); + + if (qib_mini_init) + goto done; + + ret = init_pioavailregs(dd); + init_shadow_tids(dd); + + qib_get_eeprom_info(dd); + + /* setup time (don't start yet) to verify we got interrupt */ + init_timer(&dd->intrchk_timer); + dd->intrchk_timer.function = verify_interrupt; + dd->intrchk_timer.data = (unsigned long) dd; + + ret = qib_cq_init(dd); +done: + return ret; +} + +/** + * init_after_reset - re-initialize after a reset + * @dd: the qlogic_ib device + * + * sanity check at least some of the values after reset, and + * ensure no receive or transmit (explicitly, in case reset + * failed + */ +static int init_after_reset(struct qib_devdata *dd) +{ + int i; + + /* + * Ensure chip does no sends or receives, tail updates, or + * pioavail updates while we re-initialize. This is mostly + * for the driver data structures, not chip registers. + */ + for (i = 0; i < dd->num_pports; ++i) { + /* + * ctxt == -1 means "all contexts". Only really safe for + * _dis_abling things, as here. + */ + dd->f_rcvctrl(dd->pport + i, QIB_RCVCTRL_CTXT_DIS | + QIB_RCVCTRL_INTRAVAIL_DIS | + QIB_RCVCTRL_TAILUPD_DIS, -1); + /* Redundant across ports for some, but no big deal. */ + dd->f_sendctrl(dd->pport + i, QIB_SENDCTRL_SEND_DIS | + QIB_SENDCTRL_AVAIL_DIS); + } + + return 0; +} + +static void enable_chip(struct qib_devdata *dd) +{ + u64 rcvmask; + int i; + + /* + * Enable PIO send, and update of PIOavail regs to memory. + */ + for (i = 0; i < dd->num_pports; ++i) + dd->f_sendctrl(dd->pport + i, QIB_SENDCTRL_SEND_ENB | + QIB_SENDCTRL_AVAIL_ENB); + /* + * Enable kernel ctxts' receive and receive interrupt. + * Other ctxts done as user opens and inits them. + */ + rcvmask = QIB_RCVCTRL_CTXT_ENB | QIB_RCVCTRL_INTRAVAIL_ENB; + rcvmask |= (dd->flags & QIB_NODMA_RTAIL) ? + QIB_RCVCTRL_TAILUPD_DIS : QIB_RCVCTRL_TAILUPD_ENB; + for (i = 0; dd->rcd && i < dd->first_user_ctxt; ++i) { + struct qib_ctxtdata *rcd = dd->rcd[i]; + + if (rcd) + dd->f_rcvctrl(rcd->ppd, rcvmask, i); + } +} + +static void verify_interrupt(unsigned long opaque) +{ + struct qib_devdata *dd = (struct qib_devdata *) opaque; + u64 int_counter; + + if (!dd) + return; /* being torn down */ + + /* + * If we don't have a lid or any interrupts, let the user know and + * don't bother checking again. + */ + int_counter = qib_int_counter(dd) - dd->z_int_counter; + if (int_counter == 0) { + if (!dd->f_intr_fallback(dd)) + dev_err(&dd->pcidev->dev, + "No interrupts detected, not usable.\n"); + else /* re-arm the timer to see if fallback works */ + mod_timer(&dd->intrchk_timer, jiffies + HZ/2); + } +} + +static void init_piobuf_state(struct qib_devdata *dd) +{ + int i, pidx; + u32 uctxts; + + /* + * Ensure all buffers are free, and fifos empty. Buffers + * are common, so only do once for port 0. + * + * After enable and qib_chg_pioavailkernel so we can safely + * enable pioavail updates and PIOENABLE. After this, packets + * are ready and able to go out. + */ + dd->f_sendctrl(dd->pport, QIB_SENDCTRL_DISARM_ALL); + for (pidx = 0; pidx < dd->num_pports; ++pidx) + dd->f_sendctrl(dd->pport + pidx, QIB_SENDCTRL_FLUSH); + + /* + * If not all sendbufs are used, add the one to each of the lower + * numbered contexts. pbufsctxt and lastctxt_piobuf are + * calculated in chip-specific code because it may cause some + * chip-specific adjustments to be made. + */ + uctxts = dd->cfgctxts - dd->first_user_ctxt; + dd->ctxts_extrabuf = dd->pbufsctxt ? + dd->lastctxt_piobuf - (dd->pbufsctxt * uctxts) : 0; + + /* + * Set up the shadow copies of the piobufavail registers, + * which we compare against the chip registers for now, and + * the in memory DMA'ed copies of the registers. + * By now pioavail updates to memory should have occurred, so + * copy them into our working/shadow registers; this is in + * case something went wrong with abort, but mostly to get the + * initial values of the generation bit correct. + */ + for (i = 0; i < dd->pioavregs; i++) { + __le64 tmp; + + tmp = dd->pioavailregs_dma[i]; + /* + * Don't need to worry about pioavailkernel here + * because we will call qib_chg_pioavailkernel() later + * in initialization, to busy out buffers as needed. + */ + dd->pioavailshadow[i] = le64_to_cpu(tmp); + } + while (i < ARRAY_SIZE(dd->pioavailshadow)) + dd->pioavailshadow[i++] = 0; /* for debugging sanity */ + + /* after pioavailshadow is setup */ + qib_chg_pioavailkernel(dd, 0, dd->piobcnt2k + dd->piobcnt4k, + TXCHK_CHG_TYPE_KERN, NULL); + dd->f_initvl15_bufs(dd); +} + +/** + * qib_create_workqueues - create per port workqueues + * @dd: the qlogic_ib device + */ +static int qib_create_workqueues(struct qib_devdata *dd) +{ + int pidx; + struct qib_pportdata *ppd; + + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + ppd = dd->pport + pidx; + if (!ppd->qib_wq) { + char wq_name[8]; /* 3 + 2 + 1 + 1 + 1 */ + snprintf(wq_name, sizeof(wq_name), "qib%d_%d", + dd->unit, pidx); + ppd->qib_wq = + create_singlethread_workqueue(wq_name); + if (!ppd->qib_wq) + goto wq_error; + } + } + return 0; +wq_error: + pr_err("create_singlethread_workqueue failed for port %d\n", + pidx + 1); + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + ppd = dd->pport + pidx; + if (ppd->qib_wq) { + destroy_workqueue(ppd->qib_wq); + ppd->qib_wq = NULL; + } + } + return -ENOMEM; +} + +static void qib_free_pportdata(struct qib_pportdata *ppd) +{ + free_percpu(ppd->ibport_data.pmastats); + ppd->ibport_data.pmastats = NULL; +} + +/** + * qib_init - do the actual initialization sequence on the chip + * @dd: the qlogic_ib device + * @reinit: reinitializing, so don't allocate new memory + * + * Do the actual initialization sequence on the chip. This is done + * both from the init routine called from the PCI infrastructure, and + * when we reset the chip, or detect that it was reset internally, + * or it's administratively re-enabled. + * + * Memory allocation here and in called routines is only done in + * the first case (reinit == 0). We have to be careful, because even + * without memory allocation, we need to re-write all the chip registers + * TIDs, etc. after the reset or enable has completed. + */ +int qib_init(struct qib_devdata *dd, int reinit) +{ + int ret = 0, pidx, lastfail = 0; + u32 portok = 0; + unsigned i; + struct qib_ctxtdata *rcd; + struct qib_pportdata *ppd; + unsigned long flags; + + /* Set linkstate to unknown, so we can watch for a transition. */ + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + ppd = dd->pport + pidx; + spin_lock_irqsave(&ppd->lflags_lock, flags); + ppd->lflags &= ~(QIBL_LINKACTIVE | QIBL_LINKARMED | + QIBL_LINKDOWN | QIBL_LINKINIT | + QIBL_LINKV); + spin_unlock_irqrestore(&ppd->lflags_lock, flags); + } + + if (reinit) + ret = init_after_reset(dd); + else + ret = loadtime_init(dd); + if (ret) + goto done; + + /* Bypass most chip-init, to get to device creation */ + if (qib_mini_init) + return 0; + + ret = dd->f_late_initreg(dd); + if (ret) + goto done; + + /* dd->rcd can be NULL if early init failed */ + for (i = 0; dd->rcd && i < dd->first_user_ctxt; ++i) { + /* + * Set up the (kernel) rcvhdr queue and egr TIDs. If doing + * re-init, the simplest way to handle this is to free + * existing, and re-allocate. + * Need to re-create rest of ctxt 0 ctxtdata as well. + */ + rcd = dd->rcd[i]; + if (!rcd) + continue; + + lastfail = qib_create_rcvhdrq(dd, rcd); + if (!lastfail) + lastfail = qib_setup_eagerbufs(rcd); + if (lastfail) { + qib_dev_err(dd, + "failed to allocate kernel ctxt's rcvhdrq and/or egr bufs\n"); + continue; + } + } + + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + int mtu; + if (lastfail) + ret = lastfail; + ppd = dd->pport + pidx; + mtu = ib_mtu_enum_to_int(qib_ibmtu); + if (mtu == -1) { + mtu = QIB_DEFAULT_MTU; + qib_ibmtu = 0; /* don't leave invalid value */ + } + /* set max we can ever have for this driver load */ + ppd->init_ibmaxlen = min(mtu > 2048 ? + dd->piosize4k : dd->piosize2k, + dd->rcvegrbufsize + + (dd->rcvhdrentsize << 2)); + /* + * Have to initialize ibmaxlen, but this will normally + * change immediately in qib_set_mtu(). + */ + ppd->ibmaxlen = ppd->init_ibmaxlen; + qib_set_mtu(ppd, mtu); + + spin_lock_irqsave(&ppd->lflags_lock, flags); + ppd->lflags |= QIBL_IB_LINK_DISABLED; + spin_unlock_irqrestore(&ppd->lflags_lock, flags); + + lastfail = dd->f_bringup_serdes(ppd); + if (lastfail) { + qib_devinfo(dd->pcidev, + "Failed to bringup IB port %u\n", ppd->port); + lastfail = -ENETDOWN; + continue; + } + + portok++; + } + + if (!portok) { + /* none of the ports initialized */ + if (!ret && lastfail) + ret = lastfail; + else if (!ret) + ret = -ENETDOWN; + /* but continue on, so we can debug cause */ + } + + enable_chip(dd); + + init_piobuf_state(dd); + +done: + if (!ret) { + /* chip is OK for user apps; mark it as initialized */ + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + ppd = dd->pport + pidx; + /* + * Set status even if port serdes is not initialized + * so that diags will work. + */ + *ppd->statusp |= QIB_STATUS_CHIP_PRESENT | + QIB_STATUS_INITTED; + if (!ppd->link_speed_enabled) + continue; + if (dd->flags & QIB_HAS_SEND_DMA) + ret = qib_setup_sdma(ppd); + init_timer(&ppd->hol_timer); + ppd->hol_timer.function = qib_hol_event; + ppd->hol_timer.data = (unsigned long)ppd; + ppd->hol_state = QIB_HOL_UP; + } + + /* now we can enable all interrupts from the chip */ + dd->f_set_intr_state(dd, 1); + + /* + * Setup to verify we get an interrupt, and fallback + * to an alternate if necessary and possible. + */ + mod_timer(&dd->intrchk_timer, jiffies + HZ/2); + /* start stats retrieval timer */ + mod_timer(&dd->stats_timer, jiffies + HZ * ACTIVITY_TIMER); + } + + /* if ret is non-zero, we probably should do some cleanup here... */ + return ret; +} + +/* + * These next two routines are placeholders in case we don't have per-arch + * code for controlling write combining. If explicit control of write + * combining is not available, performance will probably be awful. + */ + +int __attribute__((weak)) qib_enable_wc(struct qib_devdata *dd) +{ + return -EOPNOTSUPP; +} + +void __attribute__((weak)) qib_disable_wc(struct qib_devdata *dd) +{ +} + +static inline struct qib_devdata *__qib_lookup(int unit) +{ + return idr_find(&qib_unit_table, unit); +} + +struct qib_devdata *qib_lookup(int unit) +{ + struct qib_devdata *dd; + unsigned long flags; + + spin_lock_irqsave(&qib_devs_lock, flags); + dd = __qib_lookup(unit); + spin_unlock_irqrestore(&qib_devs_lock, flags); + + return dd; +} + +/* + * Stop the timers during unit shutdown, or after an error late + * in initialization. + */ +static void qib_stop_timers(struct qib_devdata *dd) +{ + struct qib_pportdata *ppd; + int pidx; + + if (dd->stats_timer.data) { + del_timer_sync(&dd->stats_timer); + dd->stats_timer.data = 0; + } + if (dd->intrchk_timer.data) { + del_timer_sync(&dd->intrchk_timer); + dd->intrchk_timer.data = 0; + } + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + ppd = dd->pport + pidx; + if (ppd->hol_timer.data) + del_timer_sync(&ppd->hol_timer); + if (ppd->led_override_timer.data) { + del_timer_sync(&ppd->led_override_timer); + atomic_set(&ppd->led_override_timer_active, 0); + } + if (ppd->symerr_clear_timer.data) + del_timer_sync(&ppd->symerr_clear_timer); + } +} + +/** + * qib_shutdown_device - shut down a device + * @dd: the qlogic_ib device + * + * This is called to make the device quiet when we are about to + * unload the driver, and also when the device is administratively + * disabled. It does not free any data structures. + * Everything it does has to be setup again by qib_init(dd, 1) + */ +static void qib_shutdown_device(struct qib_devdata *dd) +{ + struct qib_pportdata *ppd; + unsigned pidx; + + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + ppd = dd->pport + pidx; + + spin_lock_irq(&ppd->lflags_lock); + ppd->lflags &= ~(QIBL_LINKDOWN | QIBL_LINKINIT | + QIBL_LINKARMED | QIBL_LINKACTIVE | + QIBL_LINKV); + spin_unlock_irq(&ppd->lflags_lock); + *ppd->statusp &= ~(QIB_STATUS_IB_CONF | QIB_STATUS_IB_READY); + } + dd->flags &= ~QIB_INITTED; + + /* mask interrupts, but not errors */ + dd->f_set_intr_state(dd, 0); + + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + ppd = dd->pport + pidx; + dd->f_rcvctrl(ppd, QIB_RCVCTRL_TAILUPD_DIS | + QIB_RCVCTRL_CTXT_DIS | + QIB_RCVCTRL_INTRAVAIL_DIS | + QIB_RCVCTRL_PKEY_ENB, -1); + /* + * Gracefully stop all sends allowing any in progress to + * trickle out first. + */ + dd->f_sendctrl(ppd, QIB_SENDCTRL_CLEAR); + } + + /* + * Enough for anything that's going to trickle out to have actually + * done so. + */ + udelay(20); + + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + ppd = dd->pport + pidx; + dd->f_setextled(ppd, 0); /* make sure LEDs are off */ + + if (dd->flags & QIB_HAS_SEND_DMA) + qib_teardown_sdma(ppd); + + dd->f_sendctrl(ppd, QIB_SENDCTRL_AVAIL_DIS | + QIB_SENDCTRL_SEND_DIS); + /* + * Clear SerdesEnable. + * We can't count on interrupts since we are stopping. + */ + dd->f_quiet_serdes(ppd); + + if (ppd->qib_wq) { + destroy_workqueue(ppd->qib_wq); + ppd->qib_wq = NULL; + } + qib_free_pportdata(ppd); + } + + qib_update_eeprom_log(dd); +} + +/** + * qib_free_ctxtdata - free a context's allocated data + * @dd: the qlogic_ib device + * @rcd: the ctxtdata structure + * + * free up any allocated data for a context + * This should not touch anything that would affect a simultaneous + * re-allocation of context data, because it is called after qib_mutex + * is released (and can be called from reinit as well). + * It should never change any chip state, or global driver state. + */ +void qib_free_ctxtdata(struct qib_devdata *dd, struct qib_ctxtdata *rcd) +{ + if (!rcd) + return; + + if (rcd->rcvhdrq) { + dma_free_coherent(&dd->pcidev->dev, rcd->rcvhdrq_size, + rcd->rcvhdrq, rcd->rcvhdrq_phys); + rcd->rcvhdrq = NULL; + if (rcd->rcvhdrtail_kvaddr) { + dma_free_coherent(&dd->pcidev->dev, PAGE_SIZE, + rcd->rcvhdrtail_kvaddr, + rcd->rcvhdrqtailaddr_phys); + rcd->rcvhdrtail_kvaddr = NULL; + } + } + if (rcd->rcvegrbuf) { + unsigned e; + + for (e = 0; e < rcd->rcvegrbuf_chunks; e++) { + void *base = rcd->rcvegrbuf[e]; + size_t size = rcd->rcvegrbuf_size; + + dma_free_coherent(&dd->pcidev->dev, size, + base, rcd->rcvegrbuf_phys[e]); + } + kfree(rcd->rcvegrbuf); + rcd->rcvegrbuf = NULL; + kfree(rcd->rcvegrbuf_phys); + rcd->rcvegrbuf_phys = NULL; + rcd->rcvegrbuf_chunks = 0; + } + + kfree(rcd->tid_pg_list); + vfree(rcd->user_event_mask); + vfree(rcd->subctxt_uregbase); + vfree(rcd->subctxt_rcvegrbuf); + vfree(rcd->subctxt_rcvhdr_base); +#ifdef CONFIG_DEBUG_FS + kfree(rcd->opstats); + rcd->opstats = NULL; +#endif + kfree(rcd); +} + +/* + * Perform a PIO buffer bandwidth write test, to verify proper system + * configuration. Even when all the setup calls work, occasionally + * BIOS or other issues can prevent write combining from working, or + * can cause other bandwidth problems to the chip. + * + * This test simply writes the same buffer over and over again, and + * measures close to the peak bandwidth to the chip (not testing + * data bandwidth to the wire). On chips that use an address-based + * trigger to send packets to the wire, this is easy. On chips that + * use a count to trigger, we want to make sure that the packet doesn't + * go out on the wire, or trigger flow control checks. + */ +static void qib_verify_pioperf(struct qib_devdata *dd) +{ + u32 pbnum, cnt, lcnt; + u32 __iomem *piobuf; + u32 *addr; + u64 msecs, emsecs; + + piobuf = dd->f_getsendbuf(dd->pport, 0ULL, &pbnum); + if (!piobuf) { + qib_devinfo(dd->pcidev, + "No PIObufs for checking perf, skipping\n"); + return; + } + + /* + * Enough to give us a reasonable test, less than piobuf size, and + * likely multiple of store buffer length. + */ + cnt = 1024; + + addr = vmalloc(cnt); + if (!addr) { + qib_devinfo(dd->pcidev, + "Couldn't get memory for checking PIO perf," + " skipping\n"); + goto done; + } + + preempt_disable(); /* we want reasonably accurate elapsed time */ + msecs = 1 + jiffies_to_msecs(jiffies); + for (lcnt = 0; lcnt < 10000U; lcnt++) { + /* wait until we cross msec boundary */ + if (jiffies_to_msecs(jiffies) >= msecs) + break; + udelay(1); + } + + dd->f_set_armlaunch(dd, 0); + + /* + * length 0, no dwords actually sent + */ + writeq(0, piobuf); + qib_flush_wc(); + + /* + * This is only roughly accurate, since even with preempt we + * still take interrupts that could take a while. Running for + * >= 5 msec seems to get us "close enough" to accurate values. + */ + msecs = jiffies_to_msecs(jiffies); + for (emsecs = lcnt = 0; emsecs <= 5UL; lcnt++) { + qib_pio_copy(piobuf + 64, addr, cnt >> 2); + emsecs = jiffies_to_msecs(jiffies) - msecs; + } + + /* 1 GiB/sec, slightly over IB SDR line rate */ + if (lcnt < (emsecs * 1024U)) + qib_dev_err(dd, + "Performance problem: bandwidth to PIO buffers is only %u MiB/sec\n", + lcnt / (u32) emsecs); + + preempt_enable(); + + vfree(addr); + +done: + /* disarm piobuf, so it's available again */ + dd->f_sendctrl(dd->pport, QIB_SENDCTRL_DISARM_BUF(pbnum)); + qib_sendbuf_done(dd, pbnum); + dd->f_set_armlaunch(dd, 1); +} + +void qib_free_devdata(struct qib_devdata *dd) +{ + unsigned long flags; + + spin_lock_irqsave(&qib_devs_lock, flags); + idr_remove(&qib_unit_table, dd->unit); + list_del(&dd->list); + spin_unlock_irqrestore(&qib_devs_lock, flags); + +#ifdef CONFIG_DEBUG_FS + qib_dbg_ibdev_exit(&dd->verbs_dev); +#endif + free_percpu(dd->int_counter); + ib_dealloc_device(&dd->verbs_dev.ibdev); +} + +u64 qib_int_counter(struct qib_devdata *dd) +{ + int cpu; + u64 int_counter = 0; + + for_each_possible_cpu(cpu) + int_counter += *per_cpu_ptr(dd->int_counter, cpu); + return int_counter; +} + +u64 qib_sps_ints(void) +{ + unsigned long flags; + struct qib_devdata *dd; + u64 sps_ints = 0; + + spin_lock_irqsave(&qib_devs_lock, flags); + list_for_each_entry(dd, &qib_dev_list, list) { + sps_ints += qib_int_counter(dd); + } + spin_unlock_irqrestore(&qib_devs_lock, flags); + return sps_ints; +} + +/* + * Allocate our primary per-unit data structure. Must be done via verbs + * allocator, because the verbs cleanup process both does cleanup and + * free of the data structure. + * "extra" is for chip-specific data. + * + * Use the idr mechanism to get a unit number for this unit. + */ +struct qib_devdata *qib_alloc_devdata(struct pci_dev *pdev, size_t extra) +{ + unsigned long flags; + struct qib_devdata *dd; + int ret; + + dd = (struct qib_devdata *) ib_alloc_device(sizeof(*dd) + extra); + if (!dd) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD(&dd->list); + + idr_preload(GFP_KERNEL); + spin_lock_irqsave(&qib_devs_lock, flags); + + ret = idr_alloc(&qib_unit_table, dd, 0, 0, GFP_NOWAIT); + if (ret >= 0) { + dd->unit = ret; + list_add(&dd->list, &qib_dev_list); + } + + spin_unlock_irqrestore(&qib_devs_lock, flags); + idr_preload_end(); + + if (ret < 0) { + qib_early_err(&pdev->dev, + "Could not allocate unit ID: error %d\n", -ret); + goto bail; + } + dd->int_counter = alloc_percpu(u64); + if (!dd->int_counter) { + ret = -ENOMEM; + qib_early_err(&pdev->dev, + "Could not allocate per-cpu int_counter\n"); + goto bail; + } + + if (!qib_cpulist_count) { + u32 count = num_online_cpus(); + qib_cpulist = kzalloc(BITS_TO_LONGS(count) * + sizeof(long), GFP_KERNEL); + if (qib_cpulist) + qib_cpulist_count = count; + else + qib_early_err(&pdev->dev, + "Could not alloc cpulist info, cpu affinity might be wrong\n"); + } +#ifdef CONFIG_DEBUG_FS + qib_dbg_ibdev_init(&dd->verbs_dev); +#endif + return dd; +bail: + if (!list_empty(&dd->list)) + list_del_init(&dd->list); + ib_dealloc_device(&dd->verbs_dev.ibdev); + return ERR_PTR(ret);; +} + +/* + * Called from freeze mode handlers, and from PCI error + * reporting code. Should be paranoid about state of + * system and data structures. + */ +void qib_disable_after_error(struct qib_devdata *dd) +{ + if (dd->flags & QIB_INITTED) { + u32 pidx; + + dd->flags &= ~QIB_INITTED; + if (dd->pport) + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + struct qib_pportdata *ppd; + + ppd = dd->pport + pidx; + if (dd->flags & QIB_PRESENT) { + qib_set_linkstate(ppd, + QIB_IB_LINKDOWN_DISABLE); + dd->f_setextled(ppd, 0); + } + *ppd->statusp &= ~QIB_STATUS_IB_READY; + } + } + + /* + * Mark as having had an error for driver, and also + * for /sys and status word mapped to user programs. + * This marks unit as not usable, until reset. + */ + if (dd->devstatusp) + *dd->devstatusp |= QIB_STATUS_HWERROR; +} + +static void qib_remove_one(struct pci_dev *); +static int qib_init_one(struct pci_dev *, const struct pci_device_id *); + +#define DRIVER_LOAD_MSG "Intel " QIB_DRV_NAME " loaded: " +#define PFX QIB_DRV_NAME ": " + +static const struct pci_device_id qib_pci_tbl[] = { + { PCI_DEVICE(PCI_VENDOR_ID_PATHSCALE, PCI_DEVICE_ID_QLOGIC_IB_6120) }, + { PCI_DEVICE(PCI_VENDOR_ID_QLOGIC, PCI_DEVICE_ID_QLOGIC_IB_7220) }, + { PCI_DEVICE(PCI_VENDOR_ID_QLOGIC, PCI_DEVICE_ID_QLOGIC_IB_7322) }, + { 0, } +}; + +MODULE_DEVICE_TABLE(pci, qib_pci_tbl); + +static struct pci_driver qib_driver = { + .name = QIB_DRV_NAME, + .probe = qib_init_one, + .remove = qib_remove_one, + .id_table = qib_pci_tbl, + .err_handler = &qib_pci_err_handler, +}; + +#ifdef CONFIG_INFINIBAND_QIB_DCA + +static int qib_notify_dca(struct notifier_block *, unsigned long, void *); +static struct notifier_block dca_notifier = { + .notifier_call = qib_notify_dca, + .next = NULL, + .priority = 0 +}; + +static int qib_notify_dca_device(struct device *device, void *data) +{ + struct qib_devdata *dd = dev_get_drvdata(device); + unsigned long event = *(unsigned long *)data; + + return dd->f_notify_dca(dd, event); +} + +static int qib_notify_dca(struct notifier_block *nb, unsigned long event, + void *p) +{ + int rval; + + rval = driver_for_each_device(&qib_driver.driver, NULL, + &event, qib_notify_dca_device); + return rval ? NOTIFY_BAD : NOTIFY_DONE; +} + +#endif + +/* + * Do all the generic driver unit- and chip-independent memory + * allocation and initialization. + */ +static int __init qib_ib_init(void) +{ + int ret; + + ret = qib_dev_init(); + if (ret) + goto bail; + + /* + * These must be called before the driver is registered with + * the PCI subsystem. + */ + idr_init(&qib_unit_table); + +#ifdef CONFIG_INFINIBAND_QIB_DCA + dca_register_notify(&dca_notifier); +#endif +#ifdef CONFIG_DEBUG_FS + qib_dbg_init(); +#endif + ret = pci_register_driver(&qib_driver); + if (ret < 0) { + pr_err("Unable to register driver: error %d\n", -ret); + goto bail_dev; + } + + /* not fatal if it doesn't work */ + if (qib_init_qibfs()) + pr_err("Unable to register ipathfs\n"); + goto bail; /* all OK */ + +bail_dev: +#ifdef CONFIG_INFINIBAND_QIB_DCA + dca_unregister_notify(&dca_notifier); +#endif +#ifdef CONFIG_DEBUG_FS + qib_dbg_exit(); +#endif + idr_destroy(&qib_unit_table); + qib_dev_cleanup(); +bail: + return ret; +} + +module_init(qib_ib_init); + +/* + * Do the non-unit driver cleanup, memory free, etc. at unload. + */ +static void __exit qib_ib_cleanup(void) +{ + int ret; + + ret = qib_exit_qibfs(); + if (ret) + pr_err( + "Unable to cleanup counter filesystem: error %d\n", + -ret); + +#ifdef CONFIG_INFINIBAND_QIB_DCA + dca_unregister_notify(&dca_notifier); +#endif + pci_unregister_driver(&qib_driver); +#ifdef CONFIG_DEBUG_FS + qib_dbg_exit(); +#endif + + qib_cpulist_count = 0; + kfree(qib_cpulist); + + idr_destroy(&qib_unit_table); + qib_dev_cleanup(); +} + +module_exit(qib_ib_cleanup); + +/* this can only be called after a successful initialization */ +static void cleanup_device_data(struct qib_devdata *dd) +{ + int ctxt; + int pidx; + struct qib_ctxtdata **tmp; + unsigned long flags; + + /* users can't do anything more with chip */ + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + if (dd->pport[pidx].statusp) + *dd->pport[pidx].statusp &= ~QIB_STATUS_CHIP_PRESENT; + + spin_lock(&dd->pport[pidx].cc_shadow_lock); + + kfree(dd->pport[pidx].congestion_entries); + dd->pport[pidx].congestion_entries = NULL; + kfree(dd->pport[pidx].ccti_entries); + dd->pport[pidx].ccti_entries = NULL; + kfree(dd->pport[pidx].ccti_entries_shadow); + dd->pport[pidx].ccti_entries_shadow = NULL; + kfree(dd->pport[pidx].congestion_entries_shadow); + dd->pport[pidx].congestion_entries_shadow = NULL; + + spin_unlock(&dd->pport[pidx].cc_shadow_lock); + } + + if (!qib_wc_pat) + qib_disable_wc(dd); + + if (dd->pioavailregs_dma) { + dma_free_coherent(&dd->pcidev->dev, PAGE_SIZE, + (void *) dd->pioavailregs_dma, + dd->pioavailregs_phys); + dd->pioavailregs_dma = NULL; + } + + if (dd->pageshadow) { + struct page **tmpp = dd->pageshadow; + dma_addr_t *tmpd = dd->physshadow; + int i; + + for (ctxt = 0; ctxt < dd->cfgctxts; ctxt++) { + int ctxt_tidbase = ctxt * dd->rcvtidcnt; + int maxtid = ctxt_tidbase + dd->rcvtidcnt; + + for (i = ctxt_tidbase; i < maxtid; i++) { + if (!tmpp[i]) + continue; + pci_unmap_page(dd->pcidev, tmpd[i], + PAGE_SIZE, PCI_DMA_FROMDEVICE); + qib_release_user_pages(&tmpp[i], 1); + tmpp[i] = NULL; + } + } + + dd->pageshadow = NULL; + vfree(tmpp); + dd->physshadow = NULL; + vfree(tmpd); + } + + /* + * Free any resources still in use (usually just kernel contexts) + * at unload; we do for ctxtcnt, because that's what we allocate. + * We acquire lock to be really paranoid that rcd isn't being + * accessed from some interrupt-related code (that should not happen, + * but best to be sure). + */ + spin_lock_irqsave(&dd->uctxt_lock, flags); + tmp = dd->rcd; + dd->rcd = NULL; + spin_unlock_irqrestore(&dd->uctxt_lock, flags); + for (ctxt = 0; tmp && ctxt < dd->ctxtcnt; ctxt++) { + struct qib_ctxtdata *rcd = tmp[ctxt]; + + tmp[ctxt] = NULL; /* debugging paranoia */ + qib_free_ctxtdata(dd, rcd); + } + kfree(tmp); + kfree(dd->boardname); + qib_cq_exit(dd); +} + +/* + * Clean up on unit shutdown, or error during unit load after + * successful initialization. + */ +static void qib_postinit_cleanup(struct qib_devdata *dd) +{ + /* + * Clean up chip-specific stuff. + * We check for NULL here, because it's outside + * the kregbase check, and we need to call it + * after the free_irq. Thus it's possible that + * the function pointers were never initialized. + */ + if (dd->f_cleanup) + dd->f_cleanup(dd); + + qib_pcie_ddcleanup(dd); + + cleanup_device_data(dd); + + qib_free_devdata(dd); +} + +static int qib_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) +{ + int ret, j, pidx, initfail; + struct qib_devdata *dd = NULL; + + ret = qib_pcie_init(pdev, ent); + if (ret) + goto bail; + + /* + * Do device-specific initialiation, function table setup, dd + * allocation, etc. + */ + switch (ent->device) { + case PCI_DEVICE_ID_QLOGIC_IB_6120: +#ifdef CONFIG_PCI_MSI + dd = qib_init_iba6120_funcs(pdev, ent); +#else + qib_early_err(&pdev->dev, + "Intel PCIE device 0x%x cannot work if CONFIG_PCI_MSI is not enabled\n", + ent->device); + dd = ERR_PTR(-ENODEV); +#endif + break; + + case PCI_DEVICE_ID_QLOGIC_IB_7220: + dd = qib_init_iba7220_funcs(pdev, ent); + break; + + case PCI_DEVICE_ID_QLOGIC_IB_7322: + dd = qib_init_iba7322_funcs(pdev, ent); + break; + + default: + qib_early_err(&pdev->dev, + "Failing on unknown Intel deviceid 0x%x\n", + ent->device); + ret = -ENODEV; + } + + if (IS_ERR(dd)) + ret = PTR_ERR(dd); + if (ret) + goto bail; /* error already printed */ + + ret = qib_create_workqueues(dd); + if (ret) + goto bail; + + /* do the generic initialization */ + initfail = qib_init(dd, 0); + + ret = qib_register_ib_device(dd); + + /* + * Now ready for use. this should be cleared whenever we + * detect a reset, or initiate one. If earlier failure, + * we still create devices, so diags, etc. can be used + * to determine cause of problem. + */ + if (!qib_mini_init && !initfail && !ret) + dd->flags |= QIB_INITTED; + + j = qib_device_create(dd); + if (j) + qib_dev_err(dd, "Failed to create /dev devices: %d\n", -j); + j = qibfs_add(dd); + if (j) + qib_dev_err(dd, "Failed filesystem setup for counters: %d\n", + -j); + + if (qib_mini_init || initfail || ret) { + qib_stop_timers(dd); + flush_workqueue(ib_wq); + for (pidx = 0; pidx < dd->num_pports; ++pidx) + dd->f_quiet_serdes(dd->pport + pidx); + if (qib_mini_init) + goto bail; + if (!j) { + (void) qibfs_remove(dd); + qib_device_remove(dd); + } + if (!ret) + qib_unregister_ib_device(dd); + qib_postinit_cleanup(dd); + if (initfail) + ret = initfail; + goto bail; + } + + if (!qib_wc_pat) { + ret = qib_enable_wc(dd); + if (ret) { + qib_dev_err(dd, + "Write combining not enabled (err %d): performance may be poor\n", + -ret); + ret = 0; + } + } + + qib_verify_pioperf(dd); +bail: + return ret; +} + +static void qib_remove_one(struct pci_dev *pdev) +{ + struct qib_devdata *dd = pci_get_drvdata(pdev); + int ret; + + /* unregister from IB core */ + qib_unregister_ib_device(dd); + + /* + * Disable the IB link, disable interrupts on the device, + * clear dma engines, etc. + */ + if (!qib_mini_init) + qib_shutdown_device(dd); + + qib_stop_timers(dd); + + /* wait until all of our (qsfp) queue_work() calls complete */ + flush_workqueue(ib_wq); + + ret = qibfs_remove(dd); + if (ret) + qib_dev_err(dd, "Failed counters filesystem cleanup: %d\n", + -ret); + + qib_device_remove(dd); + + qib_postinit_cleanup(dd); +} + +/** + * qib_create_rcvhdrq - create a receive header queue + * @dd: the qlogic_ib device + * @rcd: the context data + * + * This must be contiguous memory (from an i/o perspective), and must be + * DMA'able (which means for some systems, it will go through an IOMMU, + * or be forced into a low address range). + */ +int qib_create_rcvhdrq(struct qib_devdata *dd, struct qib_ctxtdata *rcd) +{ + unsigned amt; + int old_node_id; + + if (!rcd->rcvhdrq) { + dma_addr_t phys_hdrqtail; + gfp_t gfp_flags; + + amt = ALIGN(dd->rcvhdrcnt * dd->rcvhdrentsize * + sizeof(u32), PAGE_SIZE); + gfp_flags = (rcd->ctxt >= dd->first_user_ctxt) ? + GFP_USER : GFP_KERNEL; + + old_node_id = dev_to_node(&dd->pcidev->dev); + set_dev_node(&dd->pcidev->dev, rcd->node_id); + rcd->rcvhdrq = dma_alloc_coherent( + &dd->pcidev->dev, amt, &rcd->rcvhdrq_phys, + gfp_flags | __GFP_COMP); + set_dev_node(&dd->pcidev->dev, old_node_id); + + if (!rcd->rcvhdrq) { + qib_dev_err(dd, + "attempt to allocate %d bytes for ctxt %u rcvhdrq failed\n", + amt, rcd->ctxt); + goto bail; + } + + if (rcd->ctxt >= dd->first_user_ctxt) { + rcd->user_event_mask = vmalloc_user(PAGE_SIZE); + if (!rcd->user_event_mask) + goto bail_free_hdrq; + } + + if (!(dd->flags & QIB_NODMA_RTAIL)) { + set_dev_node(&dd->pcidev->dev, rcd->node_id); + rcd->rcvhdrtail_kvaddr = dma_alloc_coherent( + &dd->pcidev->dev, PAGE_SIZE, &phys_hdrqtail, + gfp_flags); + set_dev_node(&dd->pcidev->dev, old_node_id); + if (!rcd->rcvhdrtail_kvaddr) + goto bail_free; + rcd->rcvhdrqtailaddr_phys = phys_hdrqtail; + } + + rcd->rcvhdrq_size = amt; + } + + /* clear for security and sanity on each use */ + memset(rcd->rcvhdrq, 0, rcd->rcvhdrq_size); + if (rcd->rcvhdrtail_kvaddr) + memset(rcd->rcvhdrtail_kvaddr, 0, PAGE_SIZE); + return 0; + +bail_free: + qib_dev_err(dd, + "attempt to allocate 1 page for ctxt %u rcvhdrqtailaddr failed\n", + rcd->ctxt); + vfree(rcd->user_event_mask); + rcd->user_event_mask = NULL; +bail_free_hdrq: + dma_free_coherent(&dd->pcidev->dev, amt, rcd->rcvhdrq, + rcd->rcvhdrq_phys); + rcd->rcvhdrq = NULL; +bail: + return -ENOMEM; +} + +/** + * allocate eager buffers, both kernel and user contexts. + * @rcd: the context we are setting up. + * + * Allocate the eager TID buffers and program them into hip. + * They are no longer completely contiguous, we do multiple allocation + * calls. Otherwise we get the OOM code involved, by asking for too + * much per call, with disastrous results on some kernels. + */ +int qib_setup_eagerbufs(struct qib_ctxtdata *rcd) +{ + struct qib_devdata *dd = rcd->dd; + unsigned e, egrcnt, egrperchunk, chunk, egrsize, egroff; + size_t size; + gfp_t gfp_flags; + int old_node_id; + + /* + * GFP_USER, but without GFP_FS, so buffer cache can be + * coalesced (we hope); otherwise, even at order 4, + * heavy filesystem activity makes these fail, and we can + * use compound pages. + */ + gfp_flags = __GFP_WAIT | __GFP_IO | __GFP_COMP; + + egrcnt = rcd->rcvegrcnt; + egroff = rcd->rcvegr_tid_base; + egrsize = dd->rcvegrbufsize; + + chunk = rcd->rcvegrbuf_chunks; + egrperchunk = rcd->rcvegrbufs_perchunk; + size = rcd->rcvegrbuf_size; + if (!rcd->rcvegrbuf) { + rcd->rcvegrbuf = + kzalloc_node(chunk * sizeof(rcd->rcvegrbuf[0]), + GFP_KERNEL, rcd->node_id); + if (!rcd->rcvegrbuf) + goto bail; + } + if (!rcd->rcvegrbuf_phys) { + rcd->rcvegrbuf_phys = + kmalloc_node(chunk * sizeof(rcd->rcvegrbuf_phys[0]), + GFP_KERNEL, rcd->node_id); + if (!rcd->rcvegrbuf_phys) + goto bail_rcvegrbuf; + } + for (e = 0; e < rcd->rcvegrbuf_chunks; e++) { + if (rcd->rcvegrbuf[e]) + continue; + + old_node_id = dev_to_node(&dd->pcidev->dev); + set_dev_node(&dd->pcidev->dev, rcd->node_id); + rcd->rcvegrbuf[e] = + dma_alloc_coherent(&dd->pcidev->dev, size, + &rcd->rcvegrbuf_phys[e], + gfp_flags); + set_dev_node(&dd->pcidev->dev, old_node_id); + if (!rcd->rcvegrbuf[e]) + goto bail_rcvegrbuf_phys; + } + + rcd->rcvegr_phys = rcd->rcvegrbuf_phys[0]; + + for (e = chunk = 0; chunk < rcd->rcvegrbuf_chunks; chunk++) { + dma_addr_t pa = rcd->rcvegrbuf_phys[chunk]; + unsigned i; + + /* clear for security and sanity on each use */ + memset(rcd->rcvegrbuf[chunk], 0, size); + + for (i = 0; e < egrcnt && i < egrperchunk; e++, i++) { + dd->f_put_tid(dd, e + egroff + + (u64 __iomem *) + ((char __iomem *) + dd->kregbase + + dd->rcvegrbase), + RCVHQ_RCV_TYPE_EAGER, pa); + pa += egrsize; + } + cond_resched(); /* don't hog the cpu */ + } + + return 0; + +bail_rcvegrbuf_phys: + for (e = 0; e < rcd->rcvegrbuf_chunks && rcd->rcvegrbuf[e]; e++) + dma_free_coherent(&dd->pcidev->dev, size, + rcd->rcvegrbuf[e], rcd->rcvegrbuf_phys[e]); + kfree(rcd->rcvegrbuf_phys); + rcd->rcvegrbuf_phys = NULL; +bail_rcvegrbuf: + kfree(rcd->rcvegrbuf); + rcd->rcvegrbuf = NULL; +bail: + return -ENOMEM; +} + +/* + * Note: Changes to this routine should be mirrored + * for the diagnostics routine qib_remap_ioaddr32(). + * There is also related code for VL15 buffers in qib_init_7322_variables(). + * The teardown code that unmaps is in qib_pcie_ddcleanup() + */ +int init_chip_wc_pat(struct qib_devdata *dd, u32 vl15buflen) +{ + u64 __iomem *qib_kregbase = NULL; + void __iomem *qib_piobase = NULL; + u64 __iomem *qib_userbase = NULL; + u64 qib_kreglen; + u64 qib_pio2koffset = dd->piobufbase & 0xffffffff; + u64 qib_pio4koffset = dd->piobufbase >> 32; + u64 qib_pio2klen = dd->piobcnt2k * dd->palign; + u64 qib_pio4klen = dd->piobcnt4k * dd->align4k; + u64 qib_physaddr = dd->physaddr; + u64 qib_piolen; + u64 qib_userlen = 0; + + /* + * Free the old mapping because the kernel will try to reuse the + * old mapping and not create a new mapping with the + * write combining attribute. + */ + iounmap(dd->kregbase); + dd->kregbase = NULL; + + /* + * Assumes chip address space looks like: + * - kregs + sregs + cregs + uregs (in any order) + * - piobufs (2K and 4K bufs in either order) + * or: + * - kregs + sregs + cregs (in any order) + * - piobufs (2K and 4K bufs in either order) + * - uregs + */ + if (dd->piobcnt4k == 0) { + qib_kreglen = qib_pio2koffset; + qib_piolen = qib_pio2klen; + } else if (qib_pio2koffset < qib_pio4koffset) { + qib_kreglen = qib_pio2koffset; + qib_piolen = qib_pio4koffset + qib_pio4klen - qib_kreglen; + } else { + qib_kreglen = qib_pio4koffset; + qib_piolen = qib_pio2koffset + qib_pio2klen - qib_kreglen; + } + qib_piolen += vl15buflen; + /* Map just the configured ports (not all hw ports) */ + if (dd->uregbase > qib_kreglen) + qib_userlen = dd->ureg_align * dd->cfgctxts; + + /* Sanity checks passed, now create the new mappings */ + qib_kregbase = ioremap_nocache(qib_physaddr, qib_kreglen); + if (!qib_kregbase) + goto bail; + + qib_piobase = ioremap_wc(qib_physaddr + qib_kreglen, qib_piolen); + if (!qib_piobase) + goto bail_kregbase; + + if (qib_userlen) { + qib_userbase = ioremap_nocache(qib_physaddr + dd->uregbase, + qib_userlen); + if (!qib_userbase) + goto bail_piobase; + } + + dd->kregbase = qib_kregbase; + dd->kregend = (u64 __iomem *) + ((char __iomem *) qib_kregbase + qib_kreglen); + dd->piobase = qib_piobase; + dd->pio2kbase = (void __iomem *) + (((char __iomem *) dd->piobase) + + qib_pio2koffset - qib_kreglen); + if (dd->piobcnt4k) + dd->pio4kbase = (void __iomem *) + (((char __iomem *) dd->piobase) + + qib_pio4koffset - qib_kreglen); + if (qib_userlen) + /* ureg will now be accessed relative to dd->userbase */ + dd->userbase = qib_userbase; + return 0; + +bail_piobase: + iounmap(qib_piobase); +bail_kregbase: + iounmap(qib_kregbase); +bail: + return -ENOMEM; +} diff --git a/drivers/infiniband/hw/qib/qib_intr.c b/drivers/infiniband/hw/qib/qib_intr.c new file mode 100644 index 0000000..f4918f2 --- /dev/null +++ b/drivers/infiniband/hw/qib/qib_intr.c @@ -0,0 +1,241 @@ +/* + * Copyright (c) 2006, 2007, 2008, 2009, 2010 QLogic Corporation. + * All rights reserved. + * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include + +#include "qib.h" +#include "qib_common.h" + +/** + * qib_format_hwmsg - format a single hwerror message + * @msg message buffer + * @msgl length of message buffer + * @hwmsg message to add to message buffer + */ +static void qib_format_hwmsg(char *msg, size_t msgl, const char *hwmsg) +{ + strlcat(msg, "[", msgl); + strlcat(msg, hwmsg, msgl); + strlcat(msg, "]", msgl); +} + +/** + * qib_format_hwerrors - format hardware error messages for display + * @hwerrs hardware errors bit vector + * @hwerrmsgs hardware error descriptions + * @nhwerrmsgs number of hwerrmsgs + * @msg message buffer + * @msgl message buffer length + */ +void qib_format_hwerrors(u64 hwerrs, const struct qib_hwerror_msgs *hwerrmsgs, + size_t nhwerrmsgs, char *msg, size_t msgl) +{ + int i; + + for (i = 0; i < nhwerrmsgs; i++) + if (hwerrs & hwerrmsgs[i].mask) + qib_format_hwmsg(msg, msgl, hwerrmsgs[i].msg); +} + +static void signal_ib_event(struct qib_pportdata *ppd, enum ib_event_type ev) +{ + struct ib_event event; + struct qib_devdata *dd = ppd->dd; + + event.device = &dd->verbs_dev.ibdev; + event.element.port_num = ppd->port; + event.event = ev; + ib_dispatch_event(&event); +} + +void qib_handle_e_ibstatuschanged(struct qib_pportdata *ppd, u64 ibcs) +{ + struct qib_devdata *dd = ppd->dd; + unsigned long flags; + u32 lstate; + u8 ltstate; + enum ib_event_type ev = 0; + + lstate = dd->f_iblink_state(ibcs); /* linkstate */ + ltstate = dd->f_ibphys_portstate(ibcs); + + /* + * If linkstate transitions into INIT from any of the various down + * states, or if it transitions from any of the up (INIT or better) + * states into any of the down states (except link recovery), then + * call the chip-specific code to take appropriate actions. + * + * ppd->lflags could be 0 if this is the first time the interrupt + * handlers has been called but the link is already up. + */ + if (lstate >= IB_PORT_INIT && + (!ppd->lflags || (ppd->lflags & QIBL_LINKDOWN)) && + ltstate == IB_PHYSPORTSTATE_LINKUP) { + /* transitioned to UP */ + if (dd->f_ib_updown(ppd, 1, ibcs)) + goto skip_ibchange; /* chip-code handled */ + } else if (ppd->lflags & (QIBL_LINKINIT | QIBL_LINKARMED | + QIBL_LINKACTIVE | QIBL_IB_FORCE_NOTIFY)) { + if (ltstate != IB_PHYSPORTSTATE_LINKUP && + ltstate <= IB_PHYSPORTSTATE_CFG_TRAIN && + dd->f_ib_updown(ppd, 0, ibcs)) + goto skip_ibchange; /* chip-code handled */ + qib_set_uevent_bits(ppd, _QIB_EVENT_LINKDOWN_BIT); + } + + if (lstate != IB_PORT_DOWN) { + /* lstate is INIT, ARMED, or ACTIVE */ + if (lstate != IB_PORT_ACTIVE) { + *ppd->statusp &= ~QIB_STATUS_IB_READY; + if (ppd->lflags & QIBL_LINKACTIVE) + ev = IB_EVENT_PORT_ERR; + spin_lock_irqsave(&ppd->lflags_lock, flags); + if (lstate == IB_PORT_ARMED) { + ppd->lflags |= QIBL_LINKARMED | QIBL_LINKV; + ppd->lflags &= ~(QIBL_LINKINIT | + QIBL_LINKDOWN | QIBL_LINKACTIVE); + } else { + ppd->lflags |= QIBL_LINKINIT | QIBL_LINKV; + ppd->lflags &= ~(QIBL_LINKARMED | + QIBL_LINKDOWN | QIBL_LINKACTIVE); + } + spin_unlock_irqrestore(&ppd->lflags_lock, flags); + /* start a 75msec timer to clear symbol errors */ + mod_timer(&ppd->symerr_clear_timer, + msecs_to_jiffies(75)); + } else if (ltstate == IB_PHYSPORTSTATE_LINKUP && + !(ppd->lflags & QIBL_LINKACTIVE)) { + /* active, but not active defered */ + qib_hol_up(ppd); /* useful only for 6120 now */ + *ppd->statusp |= + QIB_STATUS_IB_READY | QIB_STATUS_IB_CONF; + qib_clear_symerror_on_linkup((unsigned long)ppd); + spin_lock_irqsave(&ppd->lflags_lock, flags); + ppd->lflags |= QIBL_LINKACTIVE | QIBL_LINKV; + ppd->lflags &= ~(QIBL_LINKINIT | + QIBL_LINKDOWN | QIBL_LINKARMED); + spin_unlock_irqrestore(&ppd->lflags_lock, flags); + if (dd->flags & QIB_HAS_SEND_DMA) + qib_sdma_process_event(ppd, + qib_sdma_event_e30_go_running); + ev = IB_EVENT_PORT_ACTIVE; + dd->f_setextled(ppd, 1); + } + } else { /* down */ + if (ppd->lflags & QIBL_LINKACTIVE) + ev = IB_EVENT_PORT_ERR; + spin_lock_irqsave(&ppd->lflags_lock, flags); + ppd->lflags |= QIBL_LINKDOWN | QIBL_LINKV; + ppd->lflags &= ~(QIBL_LINKINIT | + QIBL_LINKACTIVE | QIBL_LINKARMED); + spin_unlock_irqrestore(&ppd->lflags_lock, flags); + *ppd->statusp &= ~QIB_STATUS_IB_READY; + } + +skip_ibchange: + ppd->lastibcstat = ibcs; + if (ev) + signal_ib_event(ppd, ev); + return; +} + +void qib_clear_symerror_on_linkup(unsigned long opaque) +{ + struct qib_pportdata *ppd = (struct qib_pportdata *)opaque; + + if (ppd->lflags & QIBL_LINKACTIVE) + return; + + ppd->ibport_data.z_symbol_error_counter = + ppd->dd->f_portcntr(ppd, QIBPORTCNTR_IBSYMBOLERR); +} + +/* + * Handle receive interrupts for user ctxts; this means a user + * process was waiting for a packet to arrive, and didn't want + * to poll. + */ +void qib_handle_urcv(struct qib_devdata *dd, u64 ctxtr) +{ + struct qib_ctxtdata *rcd; + unsigned long flags; + int i; + + spin_lock_irqsave(&dd->uctxt_lock, flags); + for (i = dd->first_user_ctxt; dd->rcd && i < dd->cfgctxts; i++) { + if (!(ctxtr & (1ULL << i))) + continue; + rcd = dd->rcd[i]; + if (!rcd || !rcd->cnt) + continue; + + if (test_and_clear_bit(QIB_CTXT_WAITING_RCV, &rcd->flag)) { + wake_up_interruptible(&rcd->wait); + dd->f_rcvctrl(rcd->ppd, QIB_RCVCTRL_INTRAVAIL_DIS, + rcd->ctxt); + } else if (test_and_clear_bit(QIB_CTXT_WAITING_URG, + &rcd->flag)) { + rcd->urgent++; + wake_up_interruptible(&rcd->wait); + } + } + spin_unlock_irqrestore(&dd->uctxt_lock, flags); +} + +void qib_bad_intrstatus(struct qib_devdata *dd) +{ + static int allbits; + + /* separate routine, for better optimization of qib_intr() */ + + /* + * We print the message and disable interrupts, in hope of + * having a better chance of debugging the problem. + */ + qib_dev_err(dd, + "Read of chip interrupt status failed disabling interrupts\n"); + if (allbits++) { + /* disable interrupt delivery, something is very wrong */ + if (allbits == 2) + dd->f_set_intr_state(dd, 0); + if (allbits == 3) { + qib_dev_err(dd, + "2nd bad interrupt status, unregistering interrupts\n"); + dd->flags |= QIB_BADINTR; + dd->flags &= ~QIB_INITTED; + dd->f_free_irq(dd); + } + } +} diff --git a/drivers/infiniband/hw/qib/qib_keys.c b/drivers/infiniband/hw/qib/qib_keys.c new file mode 100644 index 0000000..3b9afcc --- /dev/null +++ b/drivers/infiniband/hw/qib/qib_keys.c @@ -0,0 +1,387 @@ +/* + * Copyright (c) 2006, 2007, 2009 QLogic Corporation. All rights reserved. + * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "qib.h" + +/** + * qib_alloc_lkey - allocate an lkey + * @mr: memory region that this lkey protects + * @dma_region: 0->normal key, 1->restricted DMA key + * + * Returns 0 if successful, otherwise returns -errno. + * + * Increments mr reference count as required. + * + * Sets the lkey field mr for non-dma regions. + * + */ + +int qib_alloc_lkey(struct qib_mregion *mr, int dma_region) +{ + unsigned long flags; + u32 r; + u32 n; + int ret = 0; + struct qib_ibdev *dev = to_idev(mr->pd->device); + struct qib_lkey_table *rkt = &dev->lk_table; + + spin_lock_irqsave(&rkt->lock, flags); + + /* special case for dma_mr lkey == 0 */ + if (dma_region) { + struct qib_mregion *tmr; + + tmr = rcu_access_pointer(dev->dma_mr); + if (!tmr) { + qib_get_mr(mr); + rcu_assign_pointer(dev->dma_mr, mr); + mr->lkey_published = 1; + } + goto success; + } + + /* Find the next available LKEY */ + r = rkt->next; + n = r; + for (;;) { + if (rkt->table[r] == NULL) + break; + r = (r + 1) & (rkt->max - 1); + if (r == n) + goto bail; + } + rkt->next = (r + 1) & (rkt->max - 1); + /* + * Make sure lkey is never zero which is reserved to indicate an + * unrestricted LKEY. + */ + rkt->gen++; + mr->lkey = (r << (32 - ib_qib_lkey_table_size)) | + ((((1 << (24 - ib_qib_lkey_table_size)) - 1) & rkt->gen) + << 8); + if (mr->lkey == 0) { + mr->lkey |= 1 << 8; + rkt->gen++; + } + qib_get_mr(mr); + rcu_assign_pointer(rkt->table[r], mr); + mr->lkey_published = 1; +success: + spin_unlock_irqrestore(&rkt->lock, flags); +out: + return ret; +bail: + spin_unlock_irqrestore(&rkt->lock, flags); + ret = -ENOMEM; + goto out; +} + +/** + * qib_free_lkey - free an lkey + * @mr: mr to free from tables + */ +void qib_free_lkey(struct qib_mregion *mr) +{ + unsigned long flags; + u32 lkey = mr->lkey; + u32 r; + struct qib_ibdev *dev = to_idev(mr->pd->device); + struct qib_lkey_table *rkt = &dev->lk_table; + + spin_lock_irqsave(&rkt->lock, flags); + if (!mr->lkey_published) + goto out; + if (lkey == 0) + rcu_assign_pointer(dev->dma_mr, NULL); + else { + r = lkey >> (32 - ib_qib_lkey_table_size); + rcu_assign_pointer(rkt->table[r], NULL); + } + qib_put_mr(mr); + mr->lkey_published = 0; +out: + spin_unlock_irqrestore(&rkt->lock, flags); +} + +/** + * qib_lkey_ok - check IB SGE for validity and initialize + * @rkt: table containing lkey to check SGE against + * @pd: protection domain + * @isge: outgoing internal SGE + * @sge: SGE to check + * @acc: access flags + * + * Return 1 if valid and successful, otherwise returns 0. + * + * increments the reference count upon success + * + * Check the IB SGE for validity and initialize our internal version + * of it. + */ +int qib_lkey_ok(struct qib_lkey_table *rkt, struct qib_pd *pd, + struct qib_sge *isge, struct ib_sge *sge, int acc) +{ + struct qib_mregion *mr; + unsigned n, m; + size_t off; + + /* + * We use LKEY == zero for kernel virtual addresses + * (see qib_get_dma_mr and qib_dma.c). + */ + rcu_read_lock(); + if (sge->lkey == 0) { + struct qib_ibdev *dev = to_idev(pd->ibpd.device); + + if (pd->user) + goto bail; + mr = rcu_dereference(dev->dma_mr); + if (!mr) + goto bail; + if (unlikely(!atomic_inc_not_zero(&mr->refcount))) + goto bail; + rcu_read_unlock(); + + isge->mr = mr; + isge->vaddr = (void *) sge->addr; + isge->length = sge->length; + isge->sge_length = sge->length; + isge->m = 0; + isge->n = 0; + goto ok; + } + mr = rcu_dereference( + rkt->table[(sge->lkey >> (32 - ib_qib_lkey_table_size))]); + if (unlikely(!mr || mr->lkey != sge->lkey || mr->pd != &pd->ibpd)) + goto bail; + + off = sge->addr - mr->user_base; + if (unlikely(sge->addr < mr->user_base || + off + sge->length > mr->length || + (mr->access_flags & acc) != acc)) + goto bail; + if (unlikely(!atomic_inc_not_zero(&mr->refcount))) + goto bail; + rcu_read_unlock(); + + off += mr->offset; + if (mr->page_shift) { + /* + page sizes are uniform power of 2 so no loop is necessary + entries_spanned_by_off is the number of times the loop below + would have executed. + */ + size_t entries_spanned_by_off; + + entries_spanned_by_off = off >> mr->page_shift; + off -= (entries_spanned_by_off << mr->page_shift); + m = entries_spanned_by_off/QIB_SEGSZ; + n = entries_spanned_by_off%QIB_SEGSZ; + } else { + m = 0; + n = 0; + while (off >= mr->map[m]->segs[n].length) { + off -= mr->map[m]->segs[n].length; + n++; + if (n >= QIB_SEGSZ) { + m++; + n = 0; + } + } + } + isge->mr = mr; + isge->vaddr = mr->map[m]->segs[n].vaddr + off; + isge->length = mr->map[m]->segs[n].length - off; + isge->sge_length = sge->length; + isge->m = m; + isge->n = n; +ok: + return 1; +bail: + rcu_read_unlock(); + return 0; +} + +/** + * qib_rkey_ok - check the IB virtual address, length, and RKEY + * @qp: qp for validation + * @sge: SGE state + * @len: length of data + * @vaddr: virtual address to place data + * @rkey: rkey to check + * @acc: access flags + * + * Return 1 if successful, otherwise 0. + * + * increments the reference count upon success + */ +int qib_rkey_ok(struct qib_qp *qp, struct qib_sge *sge, + u32 len, u64 vaddr, u32 rkey, int acc) +{ + struct qib_lkey_table *rkt = &to_idev(qp->ibqp.device)->lk_table; + struct qib_mregion *mr; + unsigned n, m; + size_t off; + + /* + * We use RKEY == zero for kernel virtual addresses + * (see qib_get_dma_mr and qib_dma.c). + */ + rcu_read_lock(); + if (rkey == 0) { + struct qib_pd *pd = to_ipd(qp->ibqp.pd); + struct qib_ibdev *dev = to_idev(pd->ibpd.device); + + if (pd->user) + goto bail; + mr = rcu_dereference(dev->dma_mr); + if (!mr) + goto bail; + if (unlikely(!atomic_inc_not_zero(&mr->refcount))) + goto bail; + rcu_read_unlock(); + + sge->mr = mr; + sge->vaddr = (void *) vaddr; + sge->length = len; + sge->sge_length = len; + sge->m = 0; + sge->n = 0; + goto ok; + } + + mr = rcu_dereference( + rkt->table[(rkey >> (32 - ib_qib_lkey_table_size))]); + if (unlikely(!mr || mr->lkey != rkey || qp->ibqp.pd != mr->pd)) + goto bail; + + off = vaddr - mr->iova; + if (unlikely(vaddr < mr->iova || off + len > mr->length || + (mr->access_flags & acc) == 0)) + goto bail; + if (unlikely(!atomic_inc_not_zero(&mr->refcount))) + goto bail; + rcu_read_unlock(); + + off += mr->offset; + if (mr->page_shift) { + /* + page sizes are uniform power of 2 so no loop is necessary + entries_spanned_by_off is the number of times the loop below + would have executed. + */ + size_t entries_spanned_by_off; + + entries_spanned_by_off = off >> mr->page_shift; + off -= (entries_spanned_by_off << mr->page_shift); + m = entries_spanned_by_off/QIB_SEGSZ; + n = entries_spanned_by_off%QIB_SEGSZ; + } else { + m = 0; + n = 0; + while (off >= mr->map[m]->segs[n].length) { + off -= mr->map[m]->segs[n].length; + n++; + if (n >= QIB_SEGSZ) { + m++; + n = 0; + } + } + } + sge->mr = mr; + sge->vaddr = mr->map[m]->segs[n].vaddr + off; + sge->length = mr->map[m]->segs[n].length - off; + sge->sge_length = len; + sge->m = m; + sge->n = n; +ok: + return 1; +bail: + rcu_read_unlock(); + return 0; +} + +/* + * Initialize the memory region specified by the work reqeust. + */ +int qib_fast_reg_mr(struct qib_qp *qp, struct ib_send_wr *wr) +{ + struct qib_lkey_table *rkt = &to_idev(qp->ibqp.device)->lk_table; + struct qib_pd *pd = to_ipd(qp->ibqp.pd); + struct qib_mregion *mr; + u32 rkey = wr->wr.fast_reg.rkey; + unsigned i, n, m; + int ret = -EINVAL; + unsigned long flags; + u64 *page_list; + size_t ps; + + spin_lock_irqsave(&rkt->lock, flags); + if (pd->user || rkey == 0) + goto bail; + + mr = rcu_dereference_protected( + rkt->table[(rkey >> (32 - ib_qib_lkey_table_size))], + lockdep_is_held(&rkt->lock)); + if (unlikely(mr == NULL || qp->ibqp.pd != mr->pd)) + goto bail; + + if (wr->wr.fast_reg.page_list_len > mr->max_segs) + goto bail; + + ps = 1UL << wr->wr.fast_reg.page_shift; + if (wr->wr.fast_reg.length > ps * wr->wr.fast_reg.page_list_len) + goto bail; + + mr->user_base = wr->wr.fast_reg.iova_start; + mr->iova = wr->wr.fast_reg.iova_start; + mr->lkey = rkey; + mr->length = wr->wr.fast_reg.length; + mr->access_flags = wr->wr.fast_reg.access_flags; + page_list = wr->wr.fast_reg.page_list->page_list; + m = 0; + n = 0; + for (i = 0; i < wr->wr.fast_reg.page_list_len; i++) { + mr->map[m]->segs[n].vaddr = (void *) page_list[i]; + mr->map[m]->segs[n].length = ps; + if (++n == QIB_SEGSZ) { + m++; + n = 0; + } + } + + ret = 0; +bail: + spin_unlock_irqrestore(&rkt->lock, flags); + return ret; +} diff --git a/drivers/infiniband/hw/qib/qib_mad.c b/drivers/infiniband/hw/qib/qib_mad.c new file mode 100644 index 0000000..636be11 --- /dev/null +++ b/drivers/infiniband/hw/qib/qib_mad.c @@ -0,0 +1,2533 @@ +/* + * Copyright (c) 2012 Intel Corporation. All rights reserved. + * Copyright (c) 2006 - 2012 QLogic Corporation. All rights reserved. + * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include + +#include "qib.h" +#include "qib_mad.h" + +static int reply(struct ib_smp *smp) +{ + /* + * The verbs framework will handle the directed/LID route + * packet changes. + */ + smp->method = IB_MGMT_METHOD_GET_RESP; + if (smp->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) + smp->status |= IB_SMP_DIRECTION; + return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY; +} + +static int reply_failure(struct ib_smp *smp) +{ + /* + * The verbs framework will handle the directed/LID route + * packet changes. + */ + smp->method = IB_MGMT_METHOD_GET_RESP; + if (smp->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) + smp->status |= IB_SMP_DIRECTION; + return IB_MAD_RESULT_FAILURE | IB_MAD_RESULT_REPLY; +} + +static void qib_send_trap(struct qib_ibport *ibp, void *data, unsigned len) +{ + struct ib_mad_send_buf *send_buf; + struct ib_mad_agent *agent; + struct ib_smp *smp; + int ret; + unsigned long flags; + unsigned long timeout; + + agent = ibp->send_agent; + if (!agent) + return; + + /* o14-3.2.1 */ + if (!(ppd_from_ibp(ibp)->lflags & QIBL_LINKACTIVE)) + return; + + /* o14-2 */ + if (ibp->trap_timeout && time_before(jiffies, ibp->trap_timeout)) + return; + + send_buf = ib_create_send_mad(agent, 0, 0, 0, IB_MGMT_MAD_HDR, + IB_MGMT_MAD_DATA, GFP_ATOMIC); + if (IS_ERR(send_buf)) + return; + + smp = send_buf->mad; + smp->base_version = IB_MGMT_BASE_VERSION; + smp->mgmt_class = IB_MGMT_CLASS_SUBN_LID_ROUTED; + smp->class_version = 1; + smp->method = IB_MGMT_METHOD_TRAP; + ibp->tid++; + smp->tid = cpu_to_be64(ibp->tid); + smp->attr_id = IB_SMP_ATTR_NOTICE; + /* o14-1: smp->mkey = 0; */ + memcpy(smp->data, data, len); + + spin_lock_irqsave(&ibp->lock, flags); + if (!ibp->sm_ah) { + if (ibp->sm_lid != be16_to_cpu(IB_LID_PERMISSIVE)) { + struct ib_ah *ah; + + ah = qib_create_qp0_ah(ibp, ibp->sm_lid); + if (IS_ERR(ah)) + ret = PTR_ERR(ah); + else { + send_buf->ah = ah; + ibp->sm_ah = to_iah(ah); + ret = 0; + } + } else + ret = -EINVAL; + } else { + send_buf->ah = &ibp->sm_ah->ibah; + ret = 0; + } + spin_unlock_irqrestore(&ibp->lock, flags); + + if (!ret) + ret = ib_post_send_mad(send_buf, NULL); + if (!ret) { + /* 4.096 usec. */ + timeout = (4096 * (1UL << ibp->subnet_timeout)) / 1000; + ibp->trap_timeout = jiffies + usecs_to_jiffies(timeout); + } else { + ib_free_send_mad(send_buf); + ibp->trap_timeout = 0; + } +} + +/* + * Send a bad [PQ]_Key trap (ch. 14.3.8). + */ +void qib_bad_pqkey(struct qib_ibport *ibp, __be16 trap_num, u32 key, u32 sl, + u32 qp1, u32 qp2, __be16 lid1, __be16 lid2) +{ + struct ib_mad_notice_attr data; + + if (trap_num == IB_NOTICE_TRAP_BAD_PKEY) + ibp->pkey_violations++; + else + ibp->qkey_violations++; + ibp->n_pkt_drops++; + + /* Send violation trap */ + data.generic_type = IB_NOTICE_TYPE_SECURITY; + data.prod_type_msb = 0; + data.prod_type_lsb = IB_NOTICE_PROD_CA; + data.trap_num = trap_num; + data.issuer_lid = cpu_to_be16(ppd_from_ibp(ibp)->lid); + data.toggle_count = 0; + memset(&data.details, 0, sizeof data.details); + data.details.ntc_257_258.lid1 = lid1; + data.details.ntc_257_258.lid2 = lid2; + data.details.ntc_257_258.key = cpu_to_be32(key); + data.details.ntc_257_258.sl_qp1 = cpu_to_be32((sl << 28) | qp1); + data.details.ntc_257_258.qp2 = cpu_to_be32(qp2); + + qib_send_trap(ibp, &data, sizeof data); +} + +/* + * Send a bad M_Key trap (ch. 14.3.9). + */ +static void qib_bad_mkey(struct qib_ibport *ibp, struct ib_smp *smp) +{ + struct ib_mad_notice_attr data; + + /* Send violation trap */ + data.generic_type = IB_NOTICE_TYPE_SECURITY; + data.prod_type_msb = 0; + data.prod_type_lsb = IB_NOTICE_PROD_CA; + data.trap_num = IB_NOTICE_TRAP_BAD_MKEY; + data.issuer_lid = cpu_to_be16(ppd_from_ibp(ibp)->lid); + data.toggle_count = 0; + memset(&data.details, 0, sizeof data.details); + data.details.ntc_256.lid = data.issuer_lid; + data.details.ntc_256.method = smp->method; + data.details.ntc_256.attr_id = smp->attr_id; + data.details.ntc_256.attr_mod = smp->attr_mod; + data.details.ntc_256.mkey = smp->mkey; + if (smp->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) { + u8 hop_cnt; + + data.details.ntc_256.dr_slid = smp->dr_slid; + data.details.ntc_256.dr_trunc_hop = IB_NOTICE_TRAP_DR_NOTICE; + hop_cnt = smp->hop_cnt; + if (hop_cnt > ARRAY_SIZE(data.details.ntc_256.dr_rtn_path)) { + data.details.ntc_256.dr_trunc_hop |= + IB_NOTICE_TRAP_DR_TRUNC; + hop_cnt = ARRAY_SIZE(data.details.ntc_256.dr_rtn_path); + } + data.details.ntc_256.dr_trunc_hop |= hop_cnt; + memcpy(data.details.ntc_256.dr_rtn_path, smp->return_path, + hop_cnt); + } + + qib_send_trap(ibp, &data, sizeof data); +} + +/* + * Send a Port Capability Mask Changed trap (ch. 14.3.11). + */ +void qib_cap_mask_chg(struct qib_ibport *ibp) +{ + struct ib_mad_notice_attr data; + + data.generic_type = IB_NOTICE_TYPE_INFO; + data.prod_type_msb = 0; + data.prod_type_lsb = IB_NOTICE_PROD_CA; + data.trap_num = IB_NOTICE_TRAP_CAP_MASK_CHG; + data.issuer_lid = cpu_to_be16(ppd_from_ibp(ibp)->lid); + data.toggle_count = 0; + memset(&data.details, 0, sizeof data.details); + data.details.ntc_144.lid = data.issuer_lid; + data.details.ntc_144.new_cap_mask = cpu_to_be32(ibp->port_cap_flags); + + qib_send_trap(ibp, &data, sizeof data); +} + +/* + * Send a System Image GUID Changed trap (ch. 14.3.12). + */ +void qib_sys_guid_chg(struct qib_ibport *ibp) +{ + struct ib_mad_notice_attr data; + + data.generic_type = IB_NOTICE_TYPE_INFO; + data.prod_type_msb = 0; + data.prod_type_lsb = IB_NOTICE_PROD_CA; + data.trap_num = IB_NOTICE_TRAP_SYS_GUID_CHG; + data.issuer_lid = cpu_to_be16(ppd_from_ibp(ibp)->lid); + data.toggle_count = 0; + memset(&data.details, 0, sizeof data.details); + data.details.ntc_145.lid = data.issuer_lid; + data.details.ntc_145.new_sys_guid = ib_qib_sys_image_guid; + + qib_send_trap(ibp, &data, sizeof data); +} + +/* + * Send a Node Description Changed trap (ch. 14.3.13). + */ +void qib_node_desc_chg(struct qib_ibport *ibp) +{ + struct ib_mad_notice_attr data; + + data.generic_type = IB_NOTICE_TYPE_INFO; + data.prod_type_msb = 0; + data.prod_type_lsb = IB_NOTICE_PROD_CA; + data.trap_num = IB_NOTICE_TRAP_CAP_MASK_CHG; + data.issuer_lid = cpu_to_be16(ppd_from_ibp(ibp)->lid); + data.toggle_count = 0; + memset(&data.details, 0, sizeof data.details); + data.details.ntc_144.lid = data.issuer_lid; + data.details.ntc_144.local_changes = 1; + data.details.ntc_144.change_flags = IB_NOTICE_TRAP_NODE_DESC_CHG; + + qib_send_trap(ibp, &data, sizeof data); +} + +static int subn_get_nodedescription(struct ib_smp *smp, + struct ib_device *ibdev) +{ + if (smp->attr_mod) + smp->status |= IB_SMP_INVALID_FIELD; + + memcpy(smp->data, ibdev->node_desc, sizeof(smp->data)); + + return reply(smp); +} + +static int subn_get_nodeinfo(struct ib_smp *smp, struct ib_device *ibdev, + u8 port) +{ + struct ib_node_info *nip = (struct ib_node_info *)&smp->data; + struct qib_devdata *dd = dd_from_ibdev(ibdev); + u32 vendor, majrev, minrev; + unsigned pidx = port - 1; /* IB number port from 1, hdw from 0 */ + + /* GUID 0 is illegal */ + if (smp->attr_mod || pidx >= dd->num_pports || + dd->pport[pidx].guid == 0) + smp->status |= IB_SMP_INVALID_FIELD; + else + nip->port_guid = dd->pport[pidx].guid; + + nip->base_version = 1; + nip->class_version = 1; + nip->node_type = 1; /* channel adapter */ + nip->num_ports = ibdev->phys_port_cnt; + /* This is already in network order */ + nip->sys_guid = ib_qib_sys_image_guid; + nip->node_guid = dd->pport->guid; /* Use first-port GUID as node */ + nip->partition_cap = cpu_to_be16(qib_get_npkeys(dd)); + nip->device_id = cpu_to_be16(dd->deviceid); + majrev = dd->majrev; + minrev = dd->minrev; + nip->revision = cpu_to_be32((majrev << 16) | minrev); + nip->local_port_num = port; + vendor = dd->vendorid; + nip->vendor_id[0] = QIB_SRC_OUI_1; + nip->vendor_id[1] = QIB_SRC_OUI_2; + nip->vendor_id[2] = QIB_SRC_OUI_3; + + return reply(smp); +} + +static int subn_get_guidinfo(struct ib_smp *smp, struct ib_device *ibdev, + u8 port) +{ + struct qib_devdata *dd = dd_from_ibdev(ibdev); + u32 startgx = 8 * be32_to_cpu(smp->attr_mod); + __be64 *p = (__be64 *) smp->data; + unsigned pidx = port - 1; /* IB number port from 1, hdw from 0 */ + + /* 32 blocks of 8 64-bit GUIDs per block */ + + memset(smp->data, 0, sizeof(smp->data)); + + if (startgx == 0 && pidx < dd->num_pports) { + struct qib_pportdata *ppd = dd->pport + pidx; + struct qib_ibport *ibp = &ppd->ibport_data; + __be64 g = ppd->guid; + unsigned i; + + /* GUID 0 is illegal */ + if (g == 0) + smp->status |= IB_SMP_INVALID_FIELD; + else { + /* The first is a copy of the read-only HW GUID. */ + p[0] = g; + for (i = 1; i < QIB_GUIDS_PER_PORT; i++) + p[i] = ibp->guids[i - 1]; + } + } else + smp->status |= IB_SMP_INVALID_FIELD; + + return reply(smp); +} + +static void set_link_width_enabled(struct qib_pportdata *ppd, u32 w) +{ + (void) ppd->dd->f_set_ib_cfg(ppd, QIB_IB_CFG_LWID_ENB, w); +} + +static void set_link_speed_enabled(struct qib_pportdata *ppd, u32 s) +{ + (void) ppd->dd->f_set_ib_cfg(ppd, QIB_IB_CFG_SPD_ENB, s); +} + +static int get_overrunthreshold(struct qib_pportdata *ppd) +{ + return ppd->dd->f_get_ib_cfg(ppd, QIB_IB_CFG_OVERRUN_THRESH); +} + +/** + * set_overrunthreshold - set the overrun threshold + * @ppd: the physical port data + * @n: the new threshold + * + * Note that this will only take effect when the link state changes. + */ +static int set_overrunthreshold(struct qib_pportdata *ppd, unsigned n) +{ + (void) ppd->dd->f_set_ib_cfg(ppd, QIB_IB_CFG_OVERRUN_THRESH, + (u32)n); + return 0; +} + +static int get_phyerrthreshold(struct qib_pportdata *ppd) +{ + return ppd->dd->f_get_ib_cfg(ppd, QIB_IB_CFG_PHYERR_THRESH); +} + +/** + * set_phyerrthreshold - set the physical error threshold + * @ppd: the physical port data + * @n: the new threshold + * + * Note that this will only take effect when the link state changes. + */ +static int set_phyerrthreshold(struct qib_pportdata *ppd, unsigned n) +{ + (void) ppd->dd->f_set_ib_cfg(ppd, QIB_IB_CFG_PHYERR_THRESH, + (u32)n); + return 0; +} + +/** + * get_linkdowndefaultstate - get the default linkdown state + * @ppd: the physical port data + * + * Returns zero if the default is POLL, 1 if the default is SLEEP. + */ +static int get_linkdowndefaultstate(struct qib_pportdata *ppd) +{ + return ppd->dd->f_get_ib_cfg(ppd, QIB_IB_CFG_LINKDEFAULT) == + IB_LINKINITCMD_SLEEP; +} + +static int check_mkey(struct qib_ibport *ibp, struct ib_smp *smp, int mad_flags) +{ + int valid_mkey = 0; + int ret = 0; + + /* Is the mkey in the process of expiring? */ + if (ibp->mkey_lease_timeout && + time_after_eq(jiffies, ibp->mkey_lease_timeout)) { + /* Clear timeout and mkey protection field. */ + ibp->mkey_lease_timeout = 0; + ibp->mkeyprot = 0; + } + + if ((mad_flags & IB_MAD_IGNORE_MKEY) || ibp->mkey == 0 || + ibp->mkey == smp->mkey) + valid_mkey = 1; + + /* Unset lease timeout on any valid Get/Set/TrapRepress */ + if (valid_mkey && ibp->mkey_lease_timeout && + (smp->method == IB_MGMT_METHOD_GET || + smp->method == IB_MGMT_METHOD_SET || + smp->method == IB_MGMT_METHOD_TRAP_REPRESS)) + ibp->mkey_lease_timeout = 0; + + if (!valid_mkey) { + switch (smp->method) { + case IB_MGMT_METHOD_GET: + /* Bad mkey not a violation below level 2 */ + if (ibp->mkeyprot < 2) + break; + case IB_MGMT_METHOD_SET: + case IB_MGMT_METHOD_TRAP_REPRESS: + if (ibp->mkey_violations != 0xFFFF) + ++ibp->mkey_violations; + if (!ibp->mkey_lease_timeout && ibp->mkey_lease_period) + ibp->mkey_lease_timeout = jiffies + + ibp->mkey_lease_period * HZ; + /* Generate a trap notice. */ + qib_bad_mkey(ibp, smp); + ret = 1; + } + } + + return ret; +} + +static int subn_get_portinfo(struct ib_smp *smp, struct ib_device *ibdev, + u8 port) +{ + struct qib_devdata *dd; + struct qib_pportdata *ppd; + struct qib_ibport *ibp; + struct ib_port_info *pip = (struct ib_port_info *)smp->data; + u8 mtu; + int ret; + u32 state; + u32 port_num = be32_to_cpu(smp->attr_mod); + + if (port_num == 0) + port_num = port; + else { + if (port_num > ibdev->phys_port_cnt) { + smp->status |= IB_SMP_INVALID_FIELD; + ret = reply(smp); + goto bail; + } + if (port_num != port) { + ibp = to_iport(ibdev, port_num); + ret = check_mkey(ibp, smp, 0); + if (ret) { + ret = IB_MAD_RESULT_FAILURE; + goto bail; + } + } + } + + dd = dd_from_ibdev(ibdev); + /* IB numbers ports from 1, hdw from 0 */ + ppd = dd->pport + (port_num - 1); + ibp = &ppd->ibport_data; + + /* Clear all fields. Only set the non-zero fields. */ + memset(smp->data, 0, sizeof(smp->data)); + + /* Only return the mkey if the protection field allows it. */ + if (!(smp->method == IB_MGMT_METHOD_GET && + ibp->mkey != smp->mkey && + ibp->mkeyprot == 1)) + pip->mkey = ibp->mkey; + pip->gid_prefix = ibp->gid_prefix; + pip->lid = cpu_to_be16(ppd->lid); + pip->sm_lid = cpu_to_be16(ibp->sm_lid); + pip->cap_mask = cpu_to_be32(ibp->port_cap_flags); + /* pip->diag_code; */ + pip->mkey_lease_period = cpu_to_be16(ibp->mkey_lease_period); + pip->local_port_num = port; + pip->link_width_enabled = ppd->link_width_enabled; + pip->link_width_supported = ppd->link_width_supported; + pip->link_width_active = ppd->link_width_active; + state = dd->f_iblink_state(ppd->lastibcstat); + pip->linkspeed_portstate = ppd->link_speed_supported << 4 | state; + + pip->portphysstate_linkdown = + (dd->f_ibphys_portstate(ppd->lastibcstat) << 4) | + (get_linkdowndefaultstate(ppd) ? 1 : 2); + pip->mkeyprot_resv_lmc = (ibp->mkeyprot << 6) | ppd->lmc; + pip->linkspeedactive_enabled = (ppd->link_speed_active << 4) | + ppd->link_speed_enabled; + switch (ppd->ibmtu) { + default: /* something is wrong; fall through */ + case 4096: + mtu = IB_MTU_4096; + break; + case 2048: + mtu = IB_MTU_2048; + break; + case 1024: + mtu = IB_MTU_1024; + break; + case 512: + mtu = IB_MTU_512; + break; + case 256: + mtu = IB_MTU_256; + break; + } + pip->neighbormtu_mastersmsl = (mtu << 4) | ibp->sm_sl; + pip->vlcap_inittype = ppd->vls_supported << 4; /* InitType = 0 */ + pip->vl_high_limit = ibp->vl_high_limit; + pip->vl_arb_high_cap = + dd->f_get_ib_cfg(ppd, QIB_IB_CFG_VL_HIGH_CAP); + pip->vl_arb_low_cap = + dd->f_get_ib_cfg(ppd, QIB_IB_CFG_VL_LOW_CAP); + /* InitTypeReply = 0 */ + pip->inittypereply_mtucap = qib_ibmtu ? qib_ibmtu : IB_MTU_4096; + /* HCAs ignore VLStallCount and HOQLife */ + /* pip->vlstallcnt_hoqlife; */ + pip->operationalvl_pei_peo_fpi_fpo = + dd->f_get_ib_cfg(ppd, QIB_IB_CFG_OP_VLS) << 4; + pip->mkey_violations = cpu_to_be16(ibp->mkey_violations); + /* P_KeyViolations are counted by hardware. */ + pip->pkey_violations = cpu_to_be16(ibp->pkey_violations); + pip->qkey_violations = cpu_to_be16(ibp->qkey_violations); + /* Only the hardware GUID is supported for now */ + pip->guid_cap = QIB_GUIDS_PER_PORT; + pip->clientrereg_resv_subnetto = ibp->subnet_timeout; + /* 32.768 usec. response time (guessing) */ + pip->resv_resptimevalue = 3; + pip->localphyerrors_overrunerrors = + (get_phyerrthreshold(ppd) << 4) | + get_overrunthreshold(ppd); + /* pip->max_credit_hint; */ + if (ibp->port_cap_flags & IB_PORT_LINK_LATENCY_SUP) { + u32 v; + + v = dd->f_get_ib_cfg(ppd, QIB_IB_CFG_LINKLATENCY); + pip->link_roundtrip_latency[0] = v >> 16; + pip->link_roundtrip_latency[1] = v >> 8; + pip->link_roundtrip_latency[2] = v; + } + + ret = reply(smp); + +bail: + return ret; +} + +/** + * get_pkeys - return the PKEY table + * @dd: the qlogic_ib device + * @port: the IB port number + * @pkeys: the pkey table is placed here + */ +static int get_pkeys(struct qib_devdata *dd, u8 port, u16 *pkeys) +{ + struct qib_pportdata *ppd = dd->pport + port - 1; + /* + * always a kernel context, no locking needed. + * If we get here with ppd setup, no need to check + * that pd is valid. + */ + struct qib_ctxtdata *rcd = dd->rcd[ppd->hw_pidx]; + + memcpy(pkeys, rcd->pkeys, sizeof(rcd->pkeys)); + + return 0; +} + +static int subn_get_pkeytable(struct ib_smp *smp, struct ib_device *ibdev, + u8 port) +{ + u32 startpx = 32 * (be32_to_cpu(smp->attr_mod) & 0xffff); + u16 *p = (u16 *) smp->data; + __be16 *q = (__be16 *) smp->data; + + /* 64 blocks of 32 16-bit P_Key entries */ + + memset(smp->data, 0, sizeof(smp->data)); + if (startpx == 0) { + struct qib_devdata *dd = dd_from_ibdev(ibdev); + unsigned i, n = qib_get_npkeys(dd); + + get_pkeys(dd, port, p); + + for (i = 0; i < n; i++) + q[i] = cpu_to_be16(p[i]); + } else + smp->status |= IB_SMP_INVALID_FIELD; + + return reply(smp); +} + +static int subn_set_guidinfo(struct ib_smp *smp, struct ib_device *ibdev, + u8 port) +{ + struct qib_devdata *dd = dd_from_ibdev(ibdev); + u32 startgx = 8 * be32_to_cpu(smp->attr_mod); + __be64 *p = (__be64 *) smp->data; + unsigned pidx = port - 1; /* IB number port from 1, hdw from 0 */ + + /* 32 blocks of 8 64-bit GUIDs per block */ + + if (startgx == 0 && pidx < dd->num_pports) { + struct qib_pportdata *ppd = dd->pport + pidx; + struct qib_ibport *ibp = &ppd->ibport_data; + unsigned i; + + /* The first entry is read-only. */ + for (i = 1; i < QIB_GUIDS_PER_PORT; i++) + ibp->guids[i - 1] = p[i]; + } else + smp->status |= IB_SMP_INVALID_FIELD; + + /* The only GUID we support is the first read-only entry. */ + return subn_get_guidinfo(smp, ibdev, port); +} + +/** + * subn_set_portinfo - set port information + * @smp: the incoming SM packet + * @ibdev: the infiniband device + * @port: the port on the device + * + * Set Portinfo (see ch. 14.2.5.6). + */ +static int subn_set_portinfo(struct ib_smp *smp, struct ib_device *ibdev, + u8 port) +{ + struct ib_port_info *pip = (struct ib_port_info *)smp->data; + struct ib_event event; + struct qib_devdata *dd; + struct qib_pportdata *ppd; + struct qib_ibport *ibp; + u8 clientrereg = (pip->clientrereg_resv_subnetto & 0x80); + unsigned long flags; + u16 lid, smlid; + u8 lwe; + u8 lse; + u8 state; + u8 vls; + u8 msl; + u16 lstate; + int ret, ore, mtu; + u32 port_num = be32_to_cpu(smp->attr_mod); + + if (port_num == 0) + port_num = port; + else { + if (port_num > ibdev->phys_port_cnt) + goto err; + /* Port attributes can only be set on the receiving port */ + if (port_num != port) + goto get_only; + } + + dd = dd_from_ibdev(ibdev); + /* IB numbers ports from 1, hdw from 0 */ + ppd = dd->pport + (port_num - 1); + ibp = &ppd->ibport_data; + event.device = ibdev; + event.element.port_num = port; + + ibp->mkey = pip->mkey; + ibp->gid_prefix = pip->gid_prefix; + ibp->mkey_lease_period = be16_to_cpu(pip->mkey_lease_period); + + lid = be16_to_cpu(pip->lid); + /* Must be a valid unicast LID address. */ + if (lid == 0 || lid >= QIB_MULTICAST_LID_BASE) + smp->status |= IB_SMP_INVALID_FIELD; + else if (ppd->lid != lid || ppd->lmc != (pip->mkeyprot_resv_lmc & 7)) { + if (ppd->lid != lid) + qib_set_uevent_bits(ppd, _QIB_EVENT_LID_CHANGE_BIT); + if (ppd->lmc != (pip->mkeyprot_resv_lmc & 7)) + qib_set_uevent_bits(ppd, _QIB_EVENT_LMC_CHANGE_BIT); + qib_set_lid(ppd, lid, pip->mkeyprot_resv_lmc & 7); + event.event = IB_EVENT_LID_CHANGE; + ib_dispatch_event(&event); + } + + smlid = be16_to_cpu(pip->sm_lid); + msl = pip->neighbormtu_mastersmsl & 0xF; + /* Must be a valid unicast LID address. */ + if (smlid == 0 || smlid >= QIB_MULTICAST_LID_BASE) + smp->status |= IB_SMP_INVALID_FIELD; + else if (smlid != ibp->sm_lid || msl != ibp->sm_sl) { + spin_lock_irqsave(&ibp->lock, flags); + if (ibp->sm_ah) { + if (smlid != ibp->sm_lid) + ibp->sm_ah->attr.dlid = smlid; + if (msl != ibp->sm_sl) + ibp->sm_ah->attr.sl = msl; + } + spin_unlock_irqrestore(&ibp->lock, flags); + if (smlid != ibp->sm_lid) + ibp->sm_lid = smlid; + if (msl != ibp->sm_sl) + ibp->sm_sl = msl; + event.event = IB_EVENT_SM_CHANGE; + ib_dispatch_event(&event); + } + + /* Allow 1x or 4x to be set (see 14.2.6.6). */ + lwe = pip->link_width_enabled; + if (lwe) { + if (lwe == 0xFF) + set_link_width_enabled(ppd, ppd->link_width_supported); + else if (lwe >= 16 || (lwe & ~ppd->link_width_supported)) + smp->status |= IB_SMP_INVALID_FIELD; + else if (lwe != ppd->link_width_enabled) + set_link_width_enabled(ppd, lwe); + } + + lse = pip->linkspeedactive_enabled & 0xF; + if (lse) { + /* + * The IB 1.2 spec. only allows link speed values + * 1, 3, 5, 7, 15. 1.2.1 extended to allow specific + * speeds. + */ + if (lse == 15) + set_link_speed_enabled(ppd, + ppd->link_speed_supported); + else if (lse >= 8 || (lse & ~ppd->link_speed_supported)) + smp->status |= IB_SMP_INVALID_FIELD; + else if (lse != ppd->link_speed_enabled) + set_link_speed_enabled(ppd, lse); + } + + /* Set link down default state. */ + switch (pip->portphysstate_linkdown & 0xF) { + case 0: /* NOP */ + break; + case 1: /* SLEEP */ + (void) dd->f_set_ib_cfg(ppd, QIB_IB_CFG_LINKDEFAULT, + IB_LINKINITCMD_SLEEP); + break; + case 2: /* POLL */ + (void) dd->f_set_ib_cfg(ppd, QIB_IB_CFG_LINKDEFAULT, + IB_LINKINITCMD_POLL); + break; + default: + smp->status |= IB_SMP_INVALID_FIELD; + } + + ibp->mkeyprot = pip->mkeyprot_resv_lmc >> 6; + ibp->vl_high_limit = pip->vl_high_limit; + (void) dd->f_set_ib_cfg(ppd, QIB_IB_CFG_VL_HIGH_LIMIT, + ibp->vl_high_limit); + + mtu = ib_mtu_enum_to_int((pip->neighbormtu_mastersmsl >> 4) & 0xF); + if (mtu == -1) + smp->status |= IB_SMP_INVALID_FIELD; + else + qib_set_mtu(ppd, mtu); + + /* Set operational VLs */ + vls = (pip->operationalvl_pei_peo_fpi_fpo >> 4) & 0xF; + if (vls) { + if (vls > ppd->vls_supported) + smp->status |= IB_SMP_INVALID_FIELD; + else + (void) dd->f_set_ib_cfg(ppd, QIB_IB_CFG_OP_VLS, vls); + } + + if (pip->mkey_violations == 0) + ibp->mkey_violations = 0; + + if (pip->pkey_violations == 0) + ibp->pkey_violations = 0; + + if (pip->qkey_violations == 0) + ibp->qkey_violations = 0; + + ore = pip->localphyerrors_overrunerrors; + if (set_phyerrthreshold(ppd, (ore >> 4) & 0xF)) + smp->status |= IB_SMP_INVALID_FIELD; + + if (set_overrunthreshold(ppd, (ore & 0xF))) + smp->status |= IB_SMP_INVALID_FIELD; + + ibp->subnet_timeout = pip->clientrereg_resv_subnetto & 0x1F; + + /* + * Do the port state change now that the other link parameters + * have been set. + * Changing the port physical state only makes sense if the link + * is down or is being set to down. + */ + state = pip->linkspeed_portstate & 0xF; + lstate = (pip->portphysstate_linkdown >> 4) & 0xF; + if (lstate && !(state == IB_PORT_DOWN || state == IB_PORT_NOP)) + smp->status |= IB_SMP_INVALID_FIELD; + + /* + * Only state changes of DOWN, ARM, and ACTIVE are valid + * and must be in the correct state to take effect (see 7.2.6). + */ + switch (state) { + case IB_PORT_NOP: + if (lstate == 0) + break; + /* FALLTHROUGH */ + case IB_PORT_DOWN: + if (lstate == 0) + lstate = QIB_IB_LINKDOWN_ONLY; + else if (lstate == 1) + lstate = QIB_IB_LINKDOWN_SLEEP; + else if (lstate == 2) + lstate = QIB_IB_LINKDOWN; + else if (lstate == 3) + lstate = QIB_IB_LINKDOWN_DISABLE; + else { + smp->status |= IB_SMP_INVALID_FIELD; + break; + } + spin_lock_irqsave(&ppd->lflags_lock, flags); + ppd->lflags &= ~QIBL_LINKV; + spin_unlock_irqrestore(&ppd->lflags_lock, flags); + qib_set_linkstate(ppd, lstate); + /* + * Don't send a reply if the response would be sent + * through the disabled port. + */ + if (lstate == QIB_IB_LINKDOWN_DISABLE && smp->hop_cnt) { + ret = IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED; + goto done; + } + qib_wait_linkstate(ppd, QIBL_LINKV, 10); + break; + case IB_PORT_ARMED: + qib_set_linkstate(ppd, QIB_IB_LINKARM); + break; + case IB_PORT_ACTIVE: + qib_set_linkstate(ppd, QIB_IB_LINKACTIVE); + break; + default: + smp->status |= IB_SMP_INVALID_FIELD; + } + + if (clientrereg) { + event.event = IB_EVENT_CLIENT_REREGISTER; + ib_dispatch_event(&event); + } + + ret = subn_get_portinfo(smp, ibdev, port); + + /* restore re-reg bit per o14-12.2.1 */ + pip->clientrereg_resv_subnetto |= clientrereg; + + goto get_only; + +err: + smp->status |= IB_SMP_INVALID_FIELD; +get_only: + ret = subn_get_portinfo(smp, ibdev, port); +done: + return ret; +} + +/** + * rm_pkey - decrecment the reference count for the given PKEY + * @dd: the qlogic_ib device + * @key: the PKEY index + * + * Return true if this was the last reference and the hardware table entry + * needs to be changed. + */ +static int rm_pkey(struct qib_pportdata *ppd, u16 key) +{ + int i; + int ret; + + for (i = 0; i < ARRAY_SIZE(ppd->pkeys); i++) { + if (ppd->pkeys[i] != key) + continue; + if (atomic_dec_and_test(&ppd->pkeyrefs[i])) { + ppd->pkeys[i] = 0; + ret = 1; + goto bail; + } + break; + } + + ret = 0; + +bail: + return ret; +} + +/** + * add_pkey - add the given PKEY to the hardware table + * @dd: the qlogic_ib device + * @key: the PKEY + * + * Return an error code if unable to add the entry, zero if no change, + * or 1 if the hardware PKEY register needs to be updated. + */ +static int add_pkey(struct qib_pportdata *ppd, u16 key) +{ + int i; + u16 lkey = key & 0x7FFF; + int any = 0; + int ret; + + if (lkey == 0x7FFF) { + ret = 0; + goto bail; + } + + /* Look for an empty slot or a matching PKEY. */ + for (i = 0; i < ARRAY_SIZE(ppd->pkeys); i++) { + if (!ppd->pkeys[i]) { + any++; + continue; + } + /* If it matches exactly, try to increment the ref count */ + if (ppd->pkeys[i] == key) { + if (atomic_inc_return(&ppd->pkeyrefs[i]) > 1) { + ret = 0; + goto bail; + } + /* Lost the race. Look for an empty slot below. */ + atomic_dec(&ppd->pkeyrefs[i]); + any++; + } + /* + * It makes no sense to have both the limited and unlimited + * PKEY set at the same time since the unlimited one will + * disable the limited one. + */ + if ((ppd->pkeys[i] & 0x7FFF) == lkey) { + ret = -EEXIST; + goto bail; + } + } + if (!any) { + ret = -EBUSY; + goto bail; + } + for (i = 0; i < ARRAY_SIZE(ppd->pkeys); i++) { + if (!ppd->pkeys[i] && + atomic_inc_return(&ppd->pkeyrefs[i]) == 1) { + /* for qibstats, etc. */ + ppd->pkeys[i] = key; + ret = 1; + goto bail; + } + } + ret = -EBUSY; + +bail: + return ret; +} + +/** + * set_pkeys - set the PKEY table for ctxt 0 + * @dd: the qlogic_ib device + * @port: the IB port number + * @pkeys: the PKEY table + */ +static int set_pkeys(struct qib_devdata *dd, u8 port, u16 *pkeys) +{ + struct qib_pportdata *ppd; + struct qib_ctxtdata *rcd; + int i; + int changed = 0; + + /* + * IB port one/two always maps to context zero/one, + * always a kernel context, no locking needed + * If we get here with ppd setup, no need to check + * that rcd is valid. + */ + ppd = dd->pport + (port - 1); + rcd = dd->rcd[ppd->hw_pidx]; + + for (i = 0; i < ARRAY_SIZE(rcd->pkeys); i++) { + u16 key = pkeys[i]; + u16 okey = rcd->pkeys[i]; + + if (key == okey) + continue; + /* + * The value of this PKEY table entry is changing. + * Remove the old entry in the hardware's array of PKEYs. + */ + if (okey & 0x7FFF) + changed |= rm_pkey(ppd, okey); + if (key & 0x7FFF) { + int ret = add_pkey(ppd, key); + + if (ret < 0) + key = 0; + else + changed |= ret; + } + rcd->pkeys[i] = key; + } + if (changed) { + struct ib_event event; + + (void) dd->f_set_ib_cfg(ppd, QIB_IB_CFG_PKEYS, 0); + + event.event = IB_EVENT_PKEY_CHANGE; + event.device = &dd->verbs_dev.ibdev; + event.element.port_num = port; + ib_dispatch_event(&event); + } + return 0; +} + +static int subn_set_pkeytable(struct ib_smp *smp, struct ib_device *ibdev, + u8 port) +{ + u32 startpx = 32 * (be32_to_cpu(smp->attr_mod) & 0xffff); + __be16 *p = (__be16 *) smp->data; + u16 *q = (u16 *) smp->data; + struct qib_devdata *dd = dd_from_ibdev(ibdev); + unsigned i, n = qib_get_npkeys(dd); + + for (i = 0; i < n; i++) + q[i] = be16_to_cpu(p[i]); + + if (startpx != 0 || set_pkeys(dd, port, q) != 0) + smp->status |= IB_SMP_INVALID_FIELD; + + return subn_get_pkeytable(smp, ibdev, port); +} + +static int subn_get_sl_to_vl(struct ib_smp *smp, struct ib_device *ibdev, + u8 port) +{ + struct qib_ibport *ibp = to_iport(ibdev, port); + u8 *p = (u8 *) smp->data; + unsigned i; + + memset(smp->data, 0, sizeof(smp->data)); + + if (!(ibp->port_cap_flags & IB_PORT_SL_MAP_SUP)) + smp->status |= IB_SMP_UNSUP_METHOD; + else + for (i = 0; i < ARRAY_SIZE(ibp->sl_to_vl); i += 2) + *p++ = (ibp->sl_to_vl[i] << 4) | ibp->sl_to_vl[i + 1]; + + return reply(smp); +} + +static int subn_set_sl_to_vl(struct ib_smp *smp, struct ib_device *ibdev, + u8 port) +{ + struct qib_ibport *ibp = to_iport(ibdev, port); + u8 *p = (u8 *) smp->data; + unsigned i; + + if (!(ibp->port_cap_flags & IB_PORT_SL_MAP_SUP)) { + smp->status |= IB_SMP_UNSUP_METHOD; + return reply(smp); + } + + for (i = 0; i < ARRAY_SIZE(ibp->sl_to_vl); i += 2, p++) { + ibp->sl_to_vl[i] = *p >> 4; + ibp->sl_to_vl[i + 1] = *p & 0xF; + } + qib_set_uevent_bits(ppd_from_ibp(to_iport(ibdev, port)), + _QIB_EVENT_SL2VL_CHANGE_BIT); + + return subn_get_sl_to_vl(smp, ibdev, port); +} + +static int subn_get_vl_arb(struct ib_smp *smp, struct ib_device *ibdev, + u8 port) +{ + unsigned which = be32_to_cpu(smp->attr_mod) >> 16; + struct qib_pportdata *ppd = ppd_from_ibp(to_iport(ibdev, port)); + + memset(smp->data, 0, sizeof(smp->data)); + + if (ppd->vls_supported == IB_VL_VL0) + smp->status |= IB_SMP_UNSUP_METHOD; + else if (which == IB_VLARB_LOWPRI_0_31) + (void) ppd->dd->f_get_ib_table(ppd, QIB_IB_TBL_VL_LOW_ARB, + smp->data); + else if (which == IB_VLARB_HIGHPRI_0_31) + (void) ppd->dd->f_get_ib_table(ppd, QIB_IB_TBL_VL_HIGH_ARB, + smp->data); + else + smp->status |= IB_SMP_INVALID_FIELD; + + return reply(smp); +} + +static int subn_set_vl_arb(struct ib_smp *smp, struct ib_device *ibdev, + u8 port) +{ + unsigned which = be32_to_cpu(smp->attr_mod) >> 16; + struct qib_pportdata *ppd = ppd_from_ibp(to_iport(ibdev, port)); + + if (ppd->vls_supported == IB_VL_VL0) + smp->status |= IB_SMP_UNSUP_METHOD; + else if (which == IB_VLARB_LOWPRI_0_31) + (void) ppd->dd->f_set_ib_table(ppd, QIB_IB_TBL_VL_LOW_ARB, + smp->data); + else if (which == IB_VLARB_HIGHPRI_0_31) + (void) ppd->dd->f_set_ib_table(ppd, QIB_IB_TBL_VL_HIGH_ARB, + smp->data); + else + smp->status |= IB_SMP_INVALID_FIELD; + + return subn_get_vl_arb(smp, ibdev, port); +} + +static int subn_trap_repress(struct ib_smp *smp, struct ib_device *ibdev, + u8 port) +{ + /* + * For now, we only send the trap once so no need to process this. + * o13-6, o13-7, + * o14-3.a4 The SMA shall not send any message in response to a valid + * SubnTrapRepress() message. + */ + return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED; +} + +static int pma_get_classportinfo(struct ib_pma_mad *pmp, + struct ib_device *ibdev) +{ + struct ib_class_port_info *p = + (struct ib_class_port_info *)pmp->data; + struct qib_devdata *dd = dd_from_ibdev(ibdev); + + memset(pmp->data, 0, sizeof(pmp->data)); + + if (pmp->mad_hdr.attr_mod != 0) + pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; + + /* Note that AllPortSelect is not valid */ + p->base_version = 1; + p->class_version = 1; + p->capability_mask = IB_PMA_CLASS_CAP_EXT_WIDTH; + /* + * Set the most significant bit of CM2 to indicate support for + * congestion statistics + */ + p->reserved[0] = dd->psxmitwait_supported << 7; + /* + * Expected response time is 4.096 usec. * 2^18 == 1.073741824 sec. + */ + p->resp_time_value = 18; + + return reply((struct ib_smp *) pmp); +} + +static int pma_get_portsamplescontrol(struct ib_pma_mad *pmp, + struct ib_device *ibdev, u8 port) +{ + struct ib_pma_portsamplescontrol *p = + (struct ib_pma_portsamplescontrol *)pmp->data; + struct qib_ibdev *dev = to_idev(ibdev); + struct qib_devdata *dd = dd_from_dev(dev); + struct qib_ibport *ibp = to_iport(ibdev, port); + struct qib_pportdata *ppd = ppd_from_ibp(ibp); + unsigned long flags; + u8 port_select = p->port_select; + + memset(pmp->data, 0, sizeof(pmp->data)); + + p->port_select = port_select; + if (pmp->mad_hdr.attr_mod != 0 || port_select != port) { + pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; + goto bail; + } + spin_lock_irqsave(&ibp->lock, flags); + p->tick = dd->f_get_ib_cfg(ppd, QIB_IB_CFG_PMA_TICKS); + p->sample_status = dd->f_portcntr(ppd, QIBPORTCNTR_PSSTAT); + p->counter_width = 4; /* 32 bit counters */ + p->counter_mask0_9 = COUNTER_MASK0_9; + p->sample_start = cpu_to_be32(ibp->pma_sample_start); + p->sample_interval = cpu_to_be32(ibp->pma_sample_interval); + p->tag = cpu_to_be16(ibp->pma_tag); + p->counter_select[0] = ibp->pma_counter_select[0]; + p->counter_select[1] = ibp->pma_counter_select[1]; + p->counter_select[2] = ibp->pma_counter_select[2]; + p->counter_select[3] = ibp->pma_counter_select[3]; + p->counter_select[4] = ibp->pma_counter_select[4]; + spin_unlock_irqrestore(&ibp->lock, flags); + +bail: + return reply((struct ib_smp *) pmp); +} + +static int pma_set_portsamplescontrol(struct ib_pma_mad *pmp, + struct ib_device *ibdev, u8 port) +{ + struct ib_pma_portsamplescontrol *p = + (struct ib_pma_portsamplescontrol *)pmp->data; + struct qib_ibdev *dev = to_idev(ibdev); + struct qib_devdata *dd = dd_from_dev(dev); + struct qib_ibport *ibp = to_iport(ibdev, port); + struct qib_pportdata *ppd = ppd_from_ibp(ibp); + unsigned long flags; + u8 status, xmit_flags; + int ret; + + if (pmp->mad_hdr.attr_mod != 0 || p->port_select != port) { + pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; + ret = reply((struct ib_smp *) pmp); + goto bail; + } + + spin_lock_irqsave(&ibp->lock, flags); + + /* Port Sampling code owns the PS* HW counters */ + xmit_flags = ppd->cong_stats.flags; + ppd->cong_stats.flags = IB_PMA_CONG_HW_CONTROL_SAMPLE; + status = dd->f_portcntr(ppd, QIBPORTCNTR_PSSTAT); + if (status == IB_PMA_SAMPLE_STATUS_DONE || + (status == IB_PMA_SAMPLE_STATUS_RUNNING && + xmit_flags == IB_PMA_CONG_HW_CONTROL_TIMER)) { + ibp->pma_sample_start = be32_to_cpu(p->sample_start); + ibp->pma_sample_interval = be32_to_cpu(p->sample_interval); + ibp->pma_tag = be16_to_cpu(p->tag); + ibp->pma_counter_select[0] = p->counter_select[0]; + ibp->pma_counter_select[1] = p->counter_select[1]; + ibp->pma_counter_select[2] = p->counter_select[2]; + ibp->pma_counter_select[3] = p->counter_select[3]; + ibp->pma_counter_select[4] = p->counter_select[4]; + dd->f_set_cntr_sample(ppd, ibp->pma_sample_interval, + ibp->pma_sample_start); + } + spin_unlock_irqrestore(&ibp->lock, flags); + + ret = pma_get_portsamplescontrol(pmp, ibdev, port); + +bail: + return ret; +} + +static u64 get_counter(struct qib_ibport *ibp, struct qib_pportdata *ppd, + __be16 sel) +{ + u64 ret; + + switch (sel) { + case IB_PMA_PORT_XMIT_DATA: + ret = ppd->dd->f_portcntr(ppd, QIBPORTCNTR_PSXMITDATA); + break; + case IB_PMA_PORT_RCV_DATA: + ret = ppd->dd->f_portcntr(ppd, QIBPORTCNTR_PSRCVDATA); + break; + case IB_PMA_PORT_XMIT_PKTS: + ret = ppd->dd->f_portcntr(ppd, QIBPORTCNTR_PSXMITPKTS); + break; + case IB_PMA_PORT_RCV_PKTS: + ret = ppd->dd->f_portcntr(ppd, QIBPORTCNTR_PSRCVPKTS); + break; + case IB_PMA_PORT_XMIT_WAIT: + ret = ppd->dd->f_portcntr(ppd, QIBPORTCNTR_PSXMITWAIT); + break; + default: + ret = 0; + } + + return ret; +} + +/* This function assumes that the xmit_wait lock is already held */ +static u64 xmit_wait_get_value_delta(struct qib_pportdata *ppd) +{ + u32 delta; + + delta = get_counter(&ppd->ibport_data, ppd, + IB_PMA_PORT_XMIT_WAIT); + return ppd->cong_stats.counter + delta; +} + +static void cache_hw_sample_counters(struct qib_pportdata *ppd) +{ + struct qib_ibport *ibp = &ppd->ibport_data; + + ppd->cong_stats.counter_cache.psxmitdata = + get_counter(ibp, ppd, IB_PMA_PORT_XMIT_DATA); + ppd->cong_stats.counter_cache.psrcvdata = + get_counter(ibp, ppd, IB_PMA_PORT_RCV_DATA); + ppd->cong_stats.counter_cache.psxmitpkts = + get_counter(ibp, ppd, IB_PMA_PORT_XMIT_PKTS); + ppd->cong_stats.counter_cache.psrcvpkts = + get_counter(ibp, ppd, IB_PMA_PORT_RCV_PKTS); + ppd->cong_stats.counter_cache.psxmitwait = + get_counter(ibp, ppd, IB_PMA_PORT_XMIT_WAIT); +} + +static u64 get_cache_hw_sample_counters(struct qib_pportdata *ppd, + __be16 sel) +{ + u64 ret; + + switch (sel) { + case IB_PMA_PORT_XMIT_DATA: + ret = ppd->cong_stats.counter_cache.psxmitdata; + break; + case IB_PMA_PORT_RCV_DATA: + ret = ppd->cong_stats.counter_cache.psrcvdata; + break; + case IB_PMA_PORT_XMIT_PKTS: + ret = ppd->cong_stats.counter_cache.psxmitpkts; + break; + case IB_PMA_PORT_RCV_PKTS: + ret = ppd->cong_stats.counter_cache.psrcvpkts; + break; + case IB_PMA_PORT_XMIT_WAIT: + ret = ppd->cong_stats.counter_cache.psxmitwait; + break; + default: + ret = 0; + } + + return ret; +} + +static int pma_get_portsamplesresult(struct ib_pma_mad *pmp, + struct ib_device *ibdev, u8 port) +{ + struct ib_pma_portsamplesresult *p = + (struct ib_pma_portsamplesresult *)pmp->data; + struct qib_ibdev *dev = to_idev(ibdev); + struct qib_devdata *dd = dd_from_dev(dev); + struct qib_ibport *ibp = to_iport(ibdev, port); + struct qib_pportdata *ppd = ppd_from_ibp(ibp); + unsigned long flags; + u8 status; + int i; + + memset(pmp->data, 0, sizeof(pmp->data)); + spin_lock_irqsave(&ibp->lock, flags); + p->tag = cpu_to_be16(ibp->pma_tag); + if (ppd->cong_stats.flags == IB_PMA_CONG_HW_CONTROL_TIMER) + p->sample_status = IB_PMA_SAMPLE_STATUS_DONE; + else { + status = dd->f_portcntr(ppd, QIBPORTCNTR_PSSTAT); + p->sample_status = cpu_to_be16(status); + if (status == IB_PMA_SAMPLE_STATUS_DONE) { + cache_hw_sample_counters(ppd); + ppd->cong_stats.counter = + xmit_wait_get_value_delta(ppd); + dd->f_set_cntr_sample(ppd, + QIB_CONG_TIMER_PSINTERVAL, 0); + ppd->cong_stats.flags = IB_PMA_CONG_HW_CONTROL_TIMER; + } + } + for (i = 0; i < ARRAY_SIZE(ibp->pma_counter_select); i++) + p->counter[i] = cpu_to_be32( + get_cache_hw_sample_counters( + ppd, ibp->pma_counter_select[i])); + spin_unlock_irqrestore(&ibp->lock, flags); + + return reply((struct ib_smp *) pmp); +} + +static int pma_get_portsamplesresult_ext(struct ib_pma_mad *pmp, + struct ib_device *ibdev, u8 port) +{ + struct ib_pma_portsamplesresult_ext *p = + (struct ib_pma_portsamplesresult_ext *)pmp->data; + struct qib_ibdev *dev = to_idev(ibdev); + struct qib_devdata *dd = dd_from_dev(dev); + struct qib_ibport *ibp = to_iport(ibdev, port); + struct qib_pportdata *ppd = ppd_from_ibp(ibp); + unsigned long flags; + u8 status; + int i; + + /* Port Sampling code owns the PS* HW counters */ + memset(pmp->data, 0, sizeof(pmp->data)); + spin_lock_irqsave(&ibp->lock, flags); + p->tag = cpu_to_be16(ibp->pma_tag); + if (ppd->cong_stats.flags == IB_PMA_CONG_HW_CONTROL_TIMER) + p->sample_status = IB_PMA_SAMPLE_STATUS_DONE; + else { + status = dd->f_portcntr(ppd, QIBPORTCNTR_PSSTAT); + p->sample_status = cpu_to_be16(status); + /* 64 bits */ + p->extended_width = cpu_to_be32(0x80000000); + if (status == IB_PMA_SAMPLE_STATUS_DONE) { + cache_hw_sample_counters(ppd); + ppd->cong_stats.counter = + xmit_wait_get_value_delta(ppd); + dd->f_set_cntr_sample(ppd, + QIB_CONG_TIMER_PSINTERVAL, 0); + ppd->cong_stats.flags = IB_PMA_CONG_HW_CONTROL_TIMER; + } + } + for (i = 0; i < ARRAY_SIZE(ibp->pma_counter_select); i++) + p->counter[i] = cpu_to_be64( + get_cache_hw_sample_counters( + ppd, ibp->pma_counter_select[i])); + spin_unlock_irqrestore(&ibp->lock, flags); + + return reply((struct ib_smp *) pmp); +} + +static int pma_get_portcounters(struct ib_pma_mad *pmp, + struct ib_device *ibdev, u8 port) +{ + struct ib_pma_portcounters *p = (struct ib_pma_portcounters *) + pmp->data; + struct qib_ibport *ibp = to_iport(ibdev, port); + struct qib_pportdata *ppd = ppd_from_ibp(ibp); + struct qib_verbs_counters cntrs; + u8 port_select = p->port_select; + + qib_get_counters(ppd, &cntrs); + + /* Adjust counters for any resets done. */ + cntrs.symbol_error_counter -= ibp->z_symbol_error_counter; + cntrs.link_error_recovery_counter -= + ibp->z_link_error_recovery_counter; + cntrs.link_downed_counter -= ibp->z_link_downed_counter; + cntrs.port_rcv_errors -= ibp->z_port_rcv_errors; + cntrs.port_rcv_remphys_errors -= ibp->z_port_rcv_remphys_errors; + cntrs.port_xmit_discards -= ibp->z_port_xmit_discards; + cntrs.port_xmit_data -= ibp->z_port_xmit_data; + cntrs.port_rcv_data -= ibp->z_port_rcv_data; + cntrs.port_xmit_packets -= ibp->z_port_xmit_packets; + cntrs.port_rcv_packets -= ibp->z_port_rcv_packets; + cntrs.local_link_integrity_errors -= + ibp->z_local_link_integrity_errors; + cntrs.excessive_buffer_overrun_errors -= + ibp->z_excessive_buffer_overrun_errors; + cntrs.vl15_dropped -= ibp->z_vl15_dropped; + cntrs.vl15_dropped += ibp->n_vl15_dropped; + + memset(pmp->data, 0, sizeof(pmp->data)); + + p->port_select = port_select; + if (pmp->mad_hdr.attr_mod != 0 || port_select != port) + pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; + + if (cntrs.symbol_error_counter > 0xFFFFUL) + p->symbol_error_counter = cpu_to_be16(0xFFFF); + else + p->symbol_error_counter = + cpu_to_be16((u16)cntrs.symbol_error_counter); + if (cntrs.link_error_recovery_counter > 0xFFUL) + p->link_error_recovery_counter = 0xFF; + else + p->link_error_recovery_counter = + (u8)cntrs.link_error_recovery_counter; + if (cntrs.link_downed_counter > 0xFFUL) + p->link_downed_counter = 0xFF; + else + p->link_downed_counter = (u8)cntrs.link_downed_counter; + if (cntrs.port_rcv_errors > 0xFFFFUL) + p->port_rcv_errors = cpu_to_be16(0xFFFF); + else + p->port_rcv_errors = + cpu_to_be16((u16) cntrs.port_rcv_errors); + if (cntrs.port_rcv_remphys_errors > 0xFFFFUL) + p->port_rcv_remphys_errors = cpu_to_be16(0xFFFF); + else + p->port_rcv_remphys_errors = + cpu_to_be16((u16)cntrs.port_rcv_remphys_errors); + if (cntrs.port_xmit_discards > 0xFFFFUL) + p->port_xmit_discards = cpu_to_be16(0xFFFF); + else + p->port_xmit_discards = + cpu_to_be16((u16)cntrs.port_xmit_discards); + if (cntrs.local_link_integrity_errors > 0xFUL) + cntrs.local_link_integrity_errors = 0xFUL; + if (cntrs.excessive_buffer_overrun_errors > 0xFUL) + cntrs.excessive_buffer_overrun_errors = 0xFUL; + p->link_overrun_errors = (cntrs.local_link_integrity_errors << 4) | + cntrs.excessive_buffer_overrun_errors; + if (cntrs.vl15_dropped > 0xFFFFUL) + p->vl15_dropped = cpu_to_be16(0xFFFF); + else + p->vl15_dropped = cpu_to_be16((u16)cntrs.vl15_dropped); + if (cntrs.port_xmit_data > 0xFFFFFFFFUL) + p->port_xmit_data = cpu_to_be32(0xFFFFFFFF); + else + p->port_xmit_data = cpu_to_be32((u32)cntrs.port_xmit_data); + if (cntrs.port_rcv_data > 0xFFFFFFFFUL) + p->port_rcv_data = cpu_to_be32(0xFFFFFFFF); + else + p->port_rcv_data = cpu_to_be32((u32)cntrs.port_rcv_data); + if (cntrs.port_xmit_packets > 0xFFFFFFFFUL) + p->port_xmit_packets = cpu_to_be32(0xFFFFFFFF); + else + p->port_xmit_packets = + cpu_to_be32((u32)cntrs.port_xmit_packets); + if (cntrs.port_rcv_packets > 0xFFFFFFFFUL) + p->port_rcv_packets = cpu_to_be32(0xFFFFFFFF); + else + p->port_rcv_packets = + cpu_to_be32((u32) cntrs.port_rcv_packets); + + return reply((struct ib_smp *) pmp); +} + +static int pma_get_portcounters_cong(struct ib_pma_mad *pmp, + struct ib_device *ibdev, u8 port) +{ + /* Congestion PMA packets start at offset 24 not 64 */ + struct ib_pma_portcounters_cong *p = + (struct ib_pma_portcounters_cong *)pmp->reserved; + struct qib_verbs_counters cntrs; + struct qib_ibport *ibp = to_iport(ibdev, port); + struct qib_pportdata *ppd = ppd_from_ibp(ibp); + struct qib_devdata *dd = dd_from_ppd(ppd); + u32 port_select = be32_to_cpu(pmp->mad_hdr.attr_mod) & 0xFF; + u64 xmit_wait_counter; + unsigned long flags; + + /* + * This check is performed only in the GET method because the + * SET method ends up calling this anyway. + */ + if (!dd->psxmitwait_supported) + pmp->mad_hdr.status |= IB_SMP_UNSUP_METH_ATTR; + if (port_select != port) + pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; + + qib_get_counters(ppd, &cntrs); + spin_lock_irqsave(&ppd->ibport_data.lock, flags); + xmit_wait_counter = xmit_wait_get_value_delta(ppd); + spin_unlock_irqrestore(&ppd->ibport_data.lock, flags); + + /* Adjust counters for any resets done. */ + cntrs.symbol_error_counter -= ibp->z_symbol_error_counter; + cntrs.link_error_recovery_counter -= + ibp->z_link_error_recovery_counter; + cntrs.link_downed_counter -= ibp->z_link_downed_counter; + cntrs.port_rcv_errors -= ibp->z_port_rcv_errors; + cntrs.port_rcv_remphys_errors -= + ibp->z_port_rcv_remphys_errors; + cntrs.port_xmit_discards -= ibp->z_port_xmit_discards; + cntrs.local_link_integrity_errors -= + ibp->z_local_link_integrity_errors; + cntrs.excessive_buffer_overrun_errors -= + ibp->z_excessive_buffer_overrun_errors; + cntrs.vl15_dropped -= ibp->z_vl15_dropped; + cntrs.vl15_dropped += ibp->n_vl15_dropped; + cntrs.port_xmit_data -= ibp->z_port_xmit_data; + cntrs.port_rcv_data -= ibp->z_port_rcv_data; + cntrs.port_xmit_packets -= ibp->z_port_xmit_packets; + cntrs.port_rcv_packets -= ibp->z_port_rcv_packets; + + memset(pmp->reserved, 0, sizeof(pmp->reserved) + + sizeof(pmp->data)); + + /* + * Set top 3 bits to indicate interval in picoseconds in + * remaining bits. + */ + p->port_check_rate = + cpu_to_be16((QIB_XMIT_RATE_PICO << 13) | + (dd->psxmitwait_check_rate & + ~(QIB_XMIT_RATE_PICO << 13))); + p->port_adr_events = cpu_to_be64(0); + p->port_xmit_wait = cpu_to_be64(xmit_wait_counter); + p->port_xmit_data = cpu_to_be64(cntrs.port_xmit_data); + p->port_rcv_data = cpu_to_be64(cntrs.port_rcv_data); + p->port_xmit_packets = + cpu_to_be64(cntrs.port_xmit_packets); + p->port_rcv_packets = + cpu_to_be64(cntrs.port_rcv_packets); + if (cntrs.symbol_error_counter > 0xFFFFUL) + p->symbol_error_counter = cpu_to_be16(0xFFFF); + else + p->symbol_error_counter = + cpu_to_be16( + (u16)cntrs.symbol_error_counter); + if (cntrs.link_error_recovery_counter > 0xFFUL) + p->link_error_recovery_counter = 0xFF; + else + p->link_error_recovery_counter = + (u8)cntrs.link_error_recovery_counter; + if (cntrs.link_downed_counter > 0xFFUL) + p->link_downed_counter = 0xFF; + else + p->link_downed_counter = + (u8)cntrs.link_downed_counter; + if (cntrs.port_rcv_errors > 0xFFFFUL) + p->port_rcv_errors = cpu_to_be16(0xFFFF); + else + p->port_rcv_errors = + cpu_to_be16((u16) cntrs.port_rcv_errors); + if (cntrs.port_rcv_remphys_errors > 0xFFFFUL) + p->port_rcv_remphys_errors = cpu_to_be16(0xFFFF); + else + p->port_rcv_remphys_errors = + cpu_to_be16( + (u16)cntrs.port_rcv_remphys_errors); + if (cntrs.port_xmit_discards > 0xFFFFUL) + p->port_xmit_discards = cpu_to_be16(0xFFFF); + else + p->port_xmit_discards = + cpu_to_be16((u16)cntrs.port_xmit_discards); + if (cntrs.local_link_integrity_errors > 0xFUL) + cntrs.local_link_integrity_errors = 0xFUL; + if (cntrs.excessive_buffer_overrun_errors > 0xFUL) + cntrs.excessive_buffer_overrun_errors = 0xFUL; + p->link_overrun_errors = (cntrs.local_link_integrity_errors << 4) | + cntrs.excessive_buffer_overrun_errors; + if (cntrs.vl15_dropped > 0xFFFFUL) + p->vl15_dropped = cpu_to_be16(0xFFFF); + else + p->vl15_dropped = cpu_to_be16((u16)cntrs.vl15_dropped); + + return reply((struct ib_smp *)pmp); +} + +static void qib_snapshot_pmacounters( + struct qib_ibport *ibp, + struct qib_pma_counters *pmacounters) +{ + struct qib_pma_counters *p; + int cpu; + + memset(pmacounters, 0, sizeof(*pmacounters)); + for_each_possible_cpu(cpu) { + p = per_cpu_ptr(ibp->pmastats, cpu); + pmacounters->n_unicast_xmit += p->n_unicast_xmit; + pmacounters->n_unicast_rcv += p->n_unicast_rcv; + pmacounters->n_multicast_xmit += p->n_multicast_xmit; + pmacounters->n_multicast_rcv += p->n_multicast_rcv; + } +} + +static int pma_get_portcounters_ext(struct ib_pma_mad *pmp, + struct ib_device *ibdev, u8 port) +{ + struct ib_pma_portcounters_ext *p = + (struct ib_pma_portcounters_ext *)pmp->data; + struct qib_ibport *ibp = to_iport(ibdev, port); + struct qib_pportdata *ppd = ppd_from_ibp(ibp); + u64 swords, rwords, spkts, rpkts, xwait; + struct qib_pma_counters pma; + u8 port_select = p->port_select; + + memset(pmp->data, 0, sizeof(pmp->data)); + + p->port_select = port_select; + if (pmp->mad_hdr.attr_mod != 0 || port_select != port) { + pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; + goto bail; + } + + qib_snapshot_counters(ppd, &swords, &rwords, &spkts, &rpkts, &xwait); + + /* Adjust counters for any resets done. */ + swords -= ibp->z_port_xmit_data; + rwords -= ibp->z_port_rcv_data; + spkts -= ibp->z_port_xmit_packets; + rpkts -= ibp->z_port_rcv_packets; + + p->port_xmit_data = cpu_to_be64(swords); + p->port_rcv_data = cpu_to_be64(rwords); + p->port_xmit_packets = cpu_to_be64(spkts); + p->port_rcv_packets = cpu_to_be64(rpkts); + + qib_snapshot_pmacounters(ibp, &pma); + + p->port_unicast_xmit_packets = cpu_to_be64(pma.n_unicast_xmit + - ibp->z_unicast_xmit); + p->port_unicast_rcv_packets = cpu_to_be64(pma.n_unicast_rcv + - ibp->z_unicast_rcv); + p->port_multicast_xmit_packets = cpu_to_be64(pma.n_multicast_xmit + - ibp->z_multicast_xmit); + p->port_multicast_rcv_packets = cpu_to_be64(pma.n_multicast_rcv + - ibp->z_multicast_rcv); + +bail: + return reply((struct ib_smp *) pmp); +} + +static int pma_set_portcounters(struct ib_pma_mad *pmp, + struct ib_device *ibdev, u8 port) +{ + struct ib_pma_portcounters *p = (struct ib_pma_portcounters *) + pmp->data; + struct qib_ibport *ibp = to_iport(ibdev, port); + struct qib_pportdata *ppd = ppd_from_ibp(ibp); + struct qib_verbs_counters cntrs; + + /* + * Since the HW doesn't support clearing counters, we save the + * current count and subtract it from future responses. + */ + qib_get_counters(ppd, &cntrs); + + if (p->counter_select & IB_PMA_SEL_SYMBOL_ERROR) + ibp->z_symbol_error_counter = cntrs.symbol_error_counter; + + if (p->counter_select & IB_PMA_SEL_LINK_ERROR_RECOVERY) + ibp->z_link_error_recovery_counter = + cntrs.link_error_recovery_counter; + + if (p->counter_select & IB_PMA_SEL_LINK_DOWNED) + ibp->z_link_downed_counter = cntrs.link_downed_counter; + + if (p->counter_select & IB_PMA_SEL_PORT_RCV_ERRORS) + ibp->z_port_rcv_errors = cntrs.port_rcv_errors; + + if (p->counter_select & IB_PMA_SEL_PORT_RCV_REMPHYS_ERRORS) + ibp->z_port_rcv_remphys_errors = + cntrs.port_rcv_remphys_errors; + + if (p->counter_select & IB_PMA_SEL_PORT_XMIT_DISCARDS) + ibp->z_port_xmit_discards = cntrs.port_xmit_discards; + + if (p->counter_select & IB_PMA_SEL_LOCAL_LINK_INTEGRITY_ERRORS) + ibp->z_local_link_integrity_errors = + cntrs.local_link_integrity_errors; + + if (p->counter_select & IB_PMA_SEL_EXCESSIVE_BUFFER_OVERRUNS) + ibp->z_excessive_buffer_overrun_errors = + cntrs.excessive_buffer_overrun_errors; + + if (p->counter_select & IB_PMA_SEL_PORT_VL15_DROPPED) { + ibp->n_vl15_dropped = 0; + ibp->z_vl15_dropped = cntrs.vl15_dropped; + } + + if (p->counter_select & IB_PMA_SEL_PORT_XMIT_DATA) + ibp->z_port_xmit_data = cntrs.port_xmit_data; + + if (p->counter_select & IB_PMA_SEL_PORT_RCV_DATA) + ibp->z_port_rcv_data = cntrs.port_rcv_data; + + if (p->counter_select & IB_PMA_SEL_PORT_XMIT_PACKETS) + ibp->z_port_xmit_packets = cntrs.port_xmit_packets; + + if (p->counter_select & IB_PMA_SEL_PORT_RCV_PACKETS) + ibp->z_port_rcv_packets = cntrs.port_rcv_packets; + + return pma_get_portcounters(pmp, ibdev, port); +} + +static int pma_set_portcounters_cong(struct ib_pma_mad *pmp, + struct ib_device *ibdev, u8 port) +{ + struct qib_ibport *ibp = to_iport(ibdev, port); + struct qib_pportdata *ppd = ppd_from_ibp(ibp); + struct qib_devdata *dd = dd_from_ppd(ppd); + struct qib_verbs_counters cntrs; + u32 counter_select = (be32_to_cpu(pmp->mad_hdr.attr_mod) >> 24) & 0xFF; + int ret = 0; + unsigned long flags; + + qib_get_counters(ppd, &cntrs); + /* Get counter values before we save them */ + ret = pma_get_portcounters_cong(pmp, ibdev, port); + + if (counter_select & IB_PMA_SEL_CONG_XMIT) { + spin_lock_irqsave(&ppd->ibport_data.lock, flags); + ppd->cong_stats.counter = 0; + dd->f_set_cntr_sample(ppd, QIB_CONG_TIMER_PSINTERVAL, + 0x0); + spin_unlock_irqrestore(&ppd->ibport_data.lock, flags); + } + if (counter_select & IB_PMA_SEL_CONG_PORT_DATA) { + ibp->z_port_xmit_data = cntrs.port_xmit_data; + ibp->z_port_rcv_data = cntrs.port_rcv_data; + ibp->z_port_xmit_packets = cntrs.port_xmit_packets; + ibp->z_port_rcv_packets = cntrs.port_rcv_packets; + } + if (counter_select & IB_PMA_SEL_CONG_ALL) { + ibp->z_symbol_error_counter = + cntrs.symbol_error_counter; + ibp->z_link_error_recovery_counter = + cntrs.link_error_recovery_counter; + ibp->z_link_downed_counter = + cntrs.link_downed_counter; + ibp->z_port_rcv_errors = cntrs.port_rcv_errors; + ibp->z_port_rcv_remphys_errors = + cntrs.port_rcv_remphys_errors; + ibp->z_port_xmit_discards = + cntrs.port_xmit_discards; + ibp->z_local_link_integrity_errors = + cntrs.local_link_integrity_errors; + ibp->z_excessive_buffer_overrun_errors = + cntrs.excessive_buffer_overrun_errors; + ibp->n_vl15_dropped = 0; + ibp->z_vl15_dropped = cntrs.vl15_dropped; + } + + return ret; +} + +static int pma_set_portcounters_ext(struct ib_pma_mad *pmp, + struct ib_device *ibdev, u8 port) +{ + struct ib_pma_portcounters *p = (struct ib_pma_portcounters *) + pmp->data; + struct qib_ibport *ibp = to_iport(ibdev, port); + struct qib_pportdata *ppd = ppd_from_ibp(ibp); + u64 swords, rwords, spkts, rpkts, xwait; + struct qib_pma_counters pma; + + qib_snapshot_counters(ppd, &swords, &rwords, &spkts, &rpkts, &xwait); + + if (p->counter_select & IB_PMA_SELX_PORT_XMIT_DATA) + ibp->z_port_xmit_data = swords; + + if (p->counter_select & IB_PMA_SELX_PORT_RCV_DATA) + ibp->z_port_rcv_data = rwords; + + if (p->counter_select & IB_PMA_SELX_PORT_XMIT_PACKETS) + ibp->z_port_xmit_packets = spkts; + + if (p->counter_select & IB_PMA_SELX_PORT_RCV_PACKETS) + ibp->z_port_rcv_packets = rpkts; + + qib_snapshot_pmacounters(ibp, &pma); + + if (p->counter_select & IB_PMA_SELX_PORT_UNI_XMIT_PACKETS) + ibp->z_unicast_xmit = pma.n_unicast_xmit; + + if (p->counter_select & IB_PMA_SELX_PORT_UNI_RCV_PACKETS) + ibp->z_unicast_rcv = pma.n_unicast_rcv; + + if (p->counter_select & IB_PMA_SELX_PORT_MULTI_XMIT_PACKETS) + ibp->z_multicast_xmit = pma.n_multicast_xmit; + + if (p->counter_select & IB_PMA_SELX_PORT_MULTI_RCV_PACKETS) + ibp->z_multicast_rcv = pma.n_multicast_rcv; + + return pma_get_portcounters_ext(pmp, ibdev, port); +} + +static int process_subn(struct ib_device *ibdev, int mad_flags, + u8 port, struct ib_mad *in_mad, + struct ib_mad *out_mad) +{ + struct ib_smp *smp = (struct ib_smp *)out_mad; + struct qib_ibport *ibp = to_iport(ibdev, port); + struct qib_pportdata *ppd = ppd_from_ibp(ibp); + int ret; + + *out_mad = *in_mad; + if (smp->class_version != 1) { + smp->status |= IB_SMP_UNSUP_VERSION; + ret = reply(smp); + goto bail; + } + + ret = check_mkey(ibp, smp, mad_flags); + if (ret) { + u32 port_num = be32_to_cpu(smp->attr_mod); + + /* + * If this is a get/set portinfo, we already check the + * M_Key if the MAD is for another port and the M_Key + * is OK on the receiving port. This check is needed + * to increment the error counters when the M_Key + * fails to match on *both* ports. + */ + if (in_mad->mad_hdr.attr_id == IB_SMP_ATTR_PORT_INFO && + (smp->method == IB_MGMT_METHOD_GET || + smp->method == IB_MGMT_METHOD_SET) && + port_num && port_num <= ibdev->phys_port_cnt && + port != port_num) + (void) check_mkey(to_iport(ibdev, port_num), smp, 0); + ret = IB_MAD_RESULT_FAILURE; + goto bail; + } + + switch (smp->method) { + case IB_MGMT_METHOD_GET: + switch (smp->attr_id) { + case IB_SMP_ATTR_NODE_DESC: + ret = subn_get_nodedescription(smp, ibdev); + goto bail; + case IB_SMP_ATTR_NODE_INFO: + ret = subn_get_nodeinfo(smp, ibdev, port); + goto bail; + case IB_SMP_ATTR_GUID_INFO: + ret = subn_get_guidinfo(smp, ibdev, port); + goto bail; + case IB_SMP_ATTR_PORT_INFO: + ret = subn_get_portinfo(smp, ibdev, port); + goto bail; + case IB_SMP_ATTR_PKEY_TABLE: + ret = subn_get_pkeytable(smp, ibdev, port); + goto bail; + case IB_SMP_ATTR_SL_TO_VL_TABLE: + ret = subn_get_sl_to_vl(smp, ibdev, port); + goto bail; + case IB_SMP_ATTR_VL_ARB_TABLE: + ret = subn_get_vl_arb(smp, ibdev, port); + goto bail; + case IB_SMP_ATTR_SM_INFO: + if (ibp->port_cap_flags & IB_PORT_SM_DISABLED) { + ret = IB_MAD_RESULT_SUCCESS | + IB_MAD_RESULT_CONSUMED; + goto bail; + } + if (ibp->port_cap_flags & IB_PORT_SM) { + ret = IB_MAD_RESULT_SUCCESS; + goto bail; + } + /* FALLTHROUGH */ + default: + smp->status |= IB_SMP_UNSUP_METH_ATTR; + ret = reply(smp); + goto bail; + } + + case IB_MGMT_METHOD_SET: + switch (smp->attr_id) { + case IB_SMP_ATTR_GUID_INFO: + ret = subn_set_guidinfo(smp, ibdev, port); + goto bail; + case IB_SMP_ATTR_PORT_INFO: + ret = subn_set_portinfo(smp, ibdev, port); + goto bail; + case IB_SMP_ATTR_PKEY_TABLE: + ret = subn_set_pkeytable(smp, ibdev, port); + goto bail; + case IB_SMP_ATTR_SL_TO_VL_TABLE: + ret = subn_set_sl_to_vl(smp, ibdev, port); + goto bail; + case IB_SMP_ATTR_VL_ARB_TABLE: + ret = subn_set_vl_arb(smp, ibdev, port); + goto bail; + case IB_SMP_ATTR_SM_INFO: + if (ibp->port_cap_flags & IB_PORT_SM_DISABLED) { + ret = IB_MAD_RESULT_SUCCESS | + IB_MAD_RESULT_CONSUMED; + goto bail; + } + if (ibp->port_cap_flags & IB_PORT_SM) { + ret = IB_MAD_RESULT_SUCCESS; + goto bail; + } + /* FALLTHROUGH */ + default: + smp->status |= IB_SMP_UNSUP_METH_ATTR; + ret = reply(smp); + goto bail; + } + + case IB_MGMT_METHOD_TRAP_REPRESS: + if (smp->attr_id == IB_SMP_ATTR_NOTICE) + ret = subn_trap_repress(smp, ibdev, port); + else { + smp->status |= IB_SMP_UNSUP_METH_ATTR; + ret = reply(smp); + } + goto bail; + + case IB_MGMT_METHOD_TRAP: + case IB_MGMT_METHOD_REPORT: + case IB_MGMT_METHOD_REPORT_RESP: + case IB_MGMT_METHOD_GET_RESP: + /* + * The ib_mad module will call us to process responses + * before checking for other consumers. + * Just tell the caller to process it normally. + */ + ret = IB_MAD_RESULT_SUCCESS; + goto bail; + + case IB_MGMT_METHOD_SEND: + if (ib_get_smp_direction(smp) && + smp->attr_id == QIB_VENDOR_IPG) { + ppd->dd->f_set_ib_cfg(ppd, QIB_IB_CFG_PORT, + smp->data[0]); + ret = IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED; + } else + ret = IB_MAD_RESULT_SUCCESS; + goto bail; + + default: + smp->status |= IB_SMP_UNSUP_METHOD; + ret = reply(smp); + } + +bail: + return ret; +} + +static int process_perf(struct ib_device *ibdev, u8 port, + struct ib_mad *in_mad, + struct ib_mad *out_mad) +{ + struct ib_pma_mad *pmp = (struct ib_pma_mad *)out_mad; + int ret; + + *out_mad = *in_mad; + if (pmp->mad_hdr.class_version != 1) { + pmp->mad_hdr.status |= IB_SMP_UNSUP_VERSION; + ret = reply((struct ib_smp *) pmp); + goto bail; + } + + switch (pmp->mad_hdr.method) { + case IB_MGMT_METHOD_GET: + switch (pmp->mad_hdr.attr_id) { + case IB_PMA_CLASS_PORT_INFO: + ret = pma_get_classportinfo(pmp, ibdev); + goto bail; + case IB_PMA_PORT_SAMPLES_CONTROL: + ret = pma_get_portsamplescontrol(pmp, ibdev, port); + goto bail; + case IB_PMA_PORT_SAMPLES_RESULT: + ret = pma_get_portsamplesresult(pmp, ibdev, port); + goto bail; + case IB_PMA_PORT_SAMPLES_RESULT_EXT: + ret = pma_get_portsamplesresult_ext(pmp, ibdev, port); + goto bail; + case IB_PMA_PORT_COUNTERS: + ret = pma_get_portcounters(pmp, ibdev, port); + goto bail; + case IB_PMA_PORT_COUNTERS_EXT: + ret = pma_get_portcounters_ext(pmp, ibdev, port); + goto bail; + case IB_PMA_PORT_COUNTERS_CONG: + ret = pma_get_portcounters_cong(pmp, ibdev, port); + goto bail; + default: + pmp->mad_hdr.status |= IB_SMP_UNSUP_METH_ATTR; + ret = reply((struct ib_smp *) pmp); + goto bail; + } + + case IB_MGMT_METHOD_SET: + switch (pmp->mad_hdr.attr_id) { + case IB_PMA_PORT_SAMPLES_CONTROL: + ret = pma_set_portsamplescontrol(pmp, ibdev, port); + goto bail; + case IB_PMA_PORT_COUNTERS: + ret = pma_set_portcounters(pmp, ibdev, port); + goto bail; + case IB_PMA_PORT_COUNTERS_EXT: + ret = pma_set_portcounters_ext(pmp, ibdev, port); + goto bail; + case IB_PMA_PORT_COUNTERS_CONG: + ret = pma_set_portcounters_cong(pmp, ibdev, port); + goto bail; + default: + pmp->mad_hdr.status |= IB_SMP_UNSUP_METH_ATTR; + ret = reply((struct ib_smp *) pmp); + goto bail; + } + + case IB_MGMT_METHOD_TRAP: + case IB_MGMT_METHOD_GET_RESP: + /* + * The ib_mad module will call us to process responses + * before checking for other consumers. + * Just tell the caller to process it normally. + */ + ret = IB_MAD_RESULT_SUCCESS; + goto bail; + + default: + pmp->mad_hdr.status |= IB_SMP_UNSUP_METHOD; + ret = reply((struct ib_smp *) pmp); + } + +bail: + return ret; +} + +static int cc_get_classportinfo(struct ib_cc_mad *ccp, + struct ib_device *ibdev) +{ + struct ib_cc_classportinfo_attr *p = + (struct ib_cc_classportinfo_attr *)ccp->mgmt_data; + + memset(ccp->mgmt_data, 0, sizeof(ccp->mgmt_data)); + + p->base_version = 1; + p->class_version = 1; + p->cap_mask = 0; + + /* + * Expected response time is 4.096 usec. * 2^18 == 1.073741824 sec. + */ + p->resp_time_value = 18; + + return reply((struct ib_smp *) ccp); +} + +static int cc_get_congestion_info(struct ib_cc_mad *ccp, + struct ib_device *ibdev, u8 port) +{ + struct ib_cc_info_attr *p = + (struct ib_cc_info_attr *)ccp->mgmt_data; + struct qib_ibport *ibp = to_iport(ibdev, port); + struct qib_pportdata *ppd = ppd_from_ibp(ibp); + + memset(ccp->mgmt_data, 0, sizeof(ccp->mgmt_data)); + + p->congestion_info = 0; + p->control_table_cap = ppd->cc_max_table_entries; + + return reply((struct ib_smp *) ccp); +} + +static int cc_get_congestion_setting(struct ib_cc_mad *ccp, + struct ib_device *ibdev, u8 port) +{ + int i; + struct ib_cc_congestion_setting_attr *p = + (struct ib_cc_congestion_setting_attr *)ccp->mgmt_data; + struct qib_ibport *ibp = to_iport(ibdev, port); + struct qib_pportdata *ppd = ppd_from_ibp(ibp); + struct ib_cc_congestion_entry_shadow *entries; + + memset(ccp->mgmt_data, 0, sizeof(ccp->mgmt_data)); + + spin_lock(&ppd->cc_shadow_lock); + + entries = ppd->congestion_entries_shadow->entries; + p->port_control = cpu_to_be16( + ppd->congestion_entries_shadow->port_control); + p->control_map = cpu_to_be16( + ppd->congestion_entries_shadow->control_map); + for (i = 0; i < IB_CC_CCS_ENTRIES; i++) { + p->entries[i].ccti_increase = entries[i].ccti_increase; + p->entries[i].ccti_timer = cpu_to_be16(entries[i].ccti_timer); + p->entries[i].trigger_threshold = entries[i].trigger_threshold; + p->entries[i].ccti_min = entries[i].ccti_min; + } + + spin_unlock(&ppd->cc_shadow_lock); + + return reply((struct ib_smp *) ccp); +} + +static int cc_get_congestion_control_table(struct ib_cc_mad *ccp, + struct ib_device *ibdev, u8 port) +{ + struct ib_cc_table_attr *p = + (struct ib_cc_table_attr *)ccp->mgmt_data; + struct qib_ibport *ibp = to_iport(ibdev, port); + struct qib_pportdata *ppd = ppd_from_ibp(ibp); + u32 cct_block_index = be32_to_cpu(ccp->attr_mod); + u32 max_cct_block; + u32 cct_entry; + struct ib_cc_table_entry_shadow *entries; + int i; + + /* Is the table index more than what is supported? */ + if (cct_block_index > IB_CC_TABLE_CAP_DEFAULT - 1) + goto bail; + + memset(ccp->mgmt_data, 0, sizeof(ccp->mgmt_data)); + + spin_lock(&ppd->cc_shadow_lock); + + max_cct_block = + (ppd->ccti_entries_shadow->ccti_last_entry + 1)/IB_CCT_ENTRIES; + max_cct_block = max_cct_block ? max_cct_block - 1 : 0; + + if (cct_block_index > max_cct_block) { + spin_unlock(&ppd->cc_shadow_lock); + goto bail; + } + + ccp->attr_mod = cpu_to_be32(cct_block_index); + + cct_entry = IB_CCT_ENTRIES * (cct_block_index + 1); + + cct_entry--; + + p->ccti_limit = cpu_to_be16(cct_entry); + + entries = &ppd->ccti_entries_shadow-> + entries[IB_CCT_ENTRIES * cct_block_index]; + cct_entry %= IB_CCT_ENTRIES; + + for (i = 0; i <= cct_entry; i++) + p->ccti_entries[i].entry = cpu_to_be16(entries[i].entry); + + spin_unlock(&ppd->cc_shadow_lock); + + return reply((struct ib_smp *) ccp); + +bail: + return reply_failure((struct ib_smp *) ccp); +} + +static int cc_set_congestion_setting(struct ib_cc_mad *ccp, + struct ib_device *ibdev, u8 port) +{ + struct ib_cc_congestion_setting_attr *p = + (struct ib_cc_congestion_setting_attr *)ccp->mgmt_data; + struct qib_ibport *ibp = to_iport(ibdev, port); + struct qib_pportdata *ppd = ppd_from_ibp(ibp); + int i; + + ppd->cc_sl_control_map = be16_to_cpu(p->control_map); + + for (i = 0; i < IB_CC_CCS_ENTRIES; i++) { + ppd->congestion_entries[i].ccti_increase = + p->entries[i].ccti_increase; + + ppd->congestion_entries[i].ccti_timer = + be16_to_cpu(p->entries[i].ccti_timer); + + ppd->congestion_entries[i].trigger_threshold = + p->entries[i].trigger_threshold; + + ppd->congestion_entries[i].ccti_min = + p->entries[i].ccti_min; + } + + return reply((struct ib_smp *) ccp); +} + +static int cc_set_congestion_control_table(struct ib_cc_mad *ccp, + struct ib_device *ibdev, u8 port) +{ + struct ib_cc_table_attr *p = + (struct ib_cc_table_attr *)ccp->mgmt_data; + struct qib_ibport *ibp = to_iport(ibdev, port); + struct qib_pportdata *ppd = ppd_from_ibp(ibp); + u32 cct_block_index = be32_to_cpu(ccp->attr_mod); + u32 cct_entry; + struct ib_cc_table_entry_shadow *entries; + int i; + + /* Is the table index more than what is supported? */ + if (cct_block_index > IB_CC_TABLE_CAP_DEFAULT - 1) + goto bail; + + /* If this packet is the first in the sequence then + * zero the total table entry count. + */ + if (be16_to_cpu(p->ccti_limit) < IB_CCT_ENTRIES) + ppd->total_cct_entry = 0; + + cct_entry = (be16_to_cpu(p->ccti_limit))%IB_CCT_ENTRIES; + + /* ccti_limit is 0 to 63 */ + ppd->total_cct_entry += (cct_entry + 1); + + if (ppd->total_cct_entry > ppd->cc_supported_table_entries) + goto bail; + + ppd->ccti_limit = be16_to_cpu(p->ccti_limit); + + entries = ppd->ccti_entries + (IB_CCT_ENTRIES * cct_block_index); + + for (i = 0; i <= cct_entry; i++) + entries[i].entry = be16_to_cpu(p->ccti_entries[i].entry); + + spin_lock(&ppd->cc_shadow_lock); + + ppd->ccti_entries_shadow->ccti_last_entry = ppd->total_cct_entry - 1; + memcpy(ppd->ccti_entries_shadow->entries, ppd->ccti_entries, + (ppd->total_cct_entry * sizeof(struct ib_cc_table_entry))); + + ppd->congestion_entries_shadow->port_control = IB_CC_CCS_PC_SL_BASED; + ppd->congestion_entries_shadow->control_map = ppd->cc_sl_control_map; + memcpy(ppd->congestion_entries_shadow->entries, ppd->congestion_entries, + IB_CC_CCS_ENTRIES * sizeof(struct ib_cc_congestion_entry)); + + spin_unlock(&ppd->cc_shadow_lock); + + return reply((struct ib_smp *) ccp); + +bail: + return reply_failure((struct ib_smp *) ccp); +} + +static int check_cc_key(struct qib_ibport *ibp, + struct ib_cc_mad *ccp, int mad_flags) +{ + return 0; +} + +static int process_cc(struct ib_device *ibdev, int mad_flags, + u8 port, struct ib_mad *in_mad, + struct ib_mad *out_mad) +{ + struct ib_cc_mad *ccp = (struct ib_cc_mad *)out_mad; + struct qib_ibport *ibp = to_iport(ibdev, port); + int ret; + + *out_mad = *in_mad; + + if (ccp->class_version != 2) { + ccp->status |= IB_SMP_UNSUP_VERSION; + ret = reply((struct ib_smp *)ccp); + goto bail; + } + + ret = check_cc_key(ibp, ccp, mad_flags); + if (ret) + goto bail; + + switch (ccp->method) { + case IB_MGMT_METHOD_GET: + switch (ccp->attr_id) { + case IB_CC_ATTR_CLASSPORTINFO: + ret = cc_get_classportinfo(ccp, ibdev); + goto bail; + + case IB_CC_ATTR_CONGESTION_INFO: + ret = cc_get_congestion_info(ccp, ibdev, port); + goto bail; + + case IB_CC_ATTR_CA_CONGESTION_SETTING: + ret = cc_get_congestion_setting(ccp, ibdev, port); + goto bail; + + case IB_CC_ATTR_CONGESTION_CONTROL_TABLE: + ret = cc_get_congestion_control_table(ccp, ibdev, port); + goto bail; + + /* FALLTHROUGH */ + default: + ccp->status |= IB_SMP_UNSUP_METH_ATTR; + ret = reply((struct ib_smp *) ccp); + goto bail; + } + + case IB_MGMT_METHOD_SET: + switch (ccp->attr_id) { + case IB_CC_ATTR_CA_CONGESTION_SETTING: + ret = cc_set_congestion_setting(ccp, ibdev, port); + goto bail; + + case IB_CC_ATTR_CONGESTION_CONTROL_TABLE: + ret = cc_set_congestion_control_table(ccp, ibdev, port); + goto bail; + + /* FALLTHROUGH */ + default: + ccp->status |= IB_SMP_UNSUP_METH_ATTR; + ret = reply((struct ib_smp *) ccp); + goto bail; + } + + case IB_MGMT_METHOD_GET_RESP: + /* + * The ib_mad module will call us to process responses + * before checking for other consumers. + * Just tell the caller to process it normally. + */ + ret = IB_MAD_RESULT_SUCCESS; + goto bail; + + case IB_MGMT_METHOD_TRAP: + default: + ccp->status |= IB_SMP_UNSUP_METHOD; + ret = reply((struct ib_smp *) ccp); + } + +bail: + return ret; +} + +/** + * qib_process_mad - process an incoming MAD packet + * @ibdev: the infiniband device this packet came in on + * @mad_flags: MAD flags + * @port: the port number this packet came in on + * @in_wc: the work completion entry for this packet + * @in_grh: the global route header for this packet + * @in_mad: the incoming MAD + * @out_mad: any outgoing MAD reply + * + * Returns IB_MAD_RESULT_SUCCESS if this is a MAD that we are not + * interested in processing. + * + * Note that the verbs framework has already done the MAD sanity checks, + * and hop count/pointer updating for IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE + * MADs. + * + * This is called by the ib_mad module. + */ +int qib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port, + struct ib_wc *in_wc, struct ib_grh *in_grh, + struct ib_mad *in_mad, struct ib_mad *out_mad) +{ + int ret; + struct qib_ibport *ibp = to_iport(ibdev, port); + struct qib_pportdata *ppd = ppd_from_ibp(ibp); + + switch (in_mad->mad_hdr.mgmt_class) { + case IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE: + case IB_MGMT_CLASS_SUBN_LID_ROUTED: + ret = process_subn(ibdev, mad_flags, port, in_mad, out_mad); + goto bail; + + case IB_MGMT_CLASS_PERF_MGMT: + ret = process_perf(ibdev, port, in_mad, out_mad); + goto bail; + + case IB_MGMT_CLASS_CONG_MGMT: + if (!ppd->congestion_entries_shadow || + !qib_cc_table_size) { + ret = IB_MAD_RESULT_SUCCESS; + goto bail; + } + ret = process_cc(ibdev, mad_flags, port, in_mad, out_mad); + goto bail; + + default: + ret = IB_MAD_RESULT_SUCCESS; + } + +bail: + return ret; +} + +static void send_handler(struct ib_mad_agent *agent, + struct ib_mad_send_wc *mad_send_wc) +{ + ib_free_send_mad(mad_send_wc->send_buf); +} + +static void xmit_wait_timer_func(unsigned long opaque) +{ + struct qib_pportdata *ppd = (struct qib_pportdata *)opaque; + struct qib_devdata *dd = dd_from_ppd(ppd); + unsigned long flags; + u8 status; + + spin_lock_irqsave(&ppd->ibport_data.lock, flags); + if (ppd->cong_stats.flags == IB_PMA_CONG_HW_CONTROL_SAMPLE) { + status = dd->f_portcntr(ppd, QIBPORTCNTR_PSSTAT); + if (status == IB_PMA_SAMPLE_STATUS_DONE) { + /* save counter cache */ + cache_hw_sample_counters(ppd); + ppd->cong_stats.flags = IB_PMA_CONG_HW_CONTROL_TIMER; + } else + goto done; + } + ppd->cong_stats.counter = xmit_wait_get_value_delta(ppd); + dd->f_set_cntr_sample(ppd, QIB_CONG_TIMER_PSINTERVAL, 0x0); +done: + spin_unlock_irqrestore(&ppd->ibport_data.lock, flags); + mod_timer(&ppd->cong_stats.timer, jiffies + HZ); +} + +int qib_create_agents(struct qib_ibdev *dev) +{ + struct qib_devdata *dd = dd_from_dev(dev); + struct ib_mad_agent *agent; + struct qib_ibport *ibp; + int p; + int ret; + + for (p = 0; p < dd->num_pports; p++) { + ibp = &dd->pport[p].ibport_data; + agent = ib_register_mad_agent(&dev->ibdev, p + 1, IB_QPT_SMI, + NULL, 0, send_handler, + NULL, NULL, 0); + if (IS_ERR(agent)) { + ret = PTR_ERR(agent); + goto err; + } + + /* Initialize xmit_wait structure */ + dd->pport[p].cong_stats.counter = 0; + init_timer(&dd->pport[p].cong_stats.timer); + dd->pport[p].cong_stats.timer.function = xmit_wait_timer_func; + dd->pport[p].cong_stats.timer.data = + (unsigned long)(&dd->pport[p]); + dd->pport[p].cong_stats.timer.expires = 0; + add_timer(&dd->pport[p].cong_stats.timer); + + ibp->send_agent = agent; + } + + return 0; + +err: + for (p = 0; p < dd->num_pports; p++) { + ibp = &dd->pport[p].ibport_data; + if (ibp->send_agent) { + agent = ibp->send_agent; + ibp->send_agent = NULL; + ib_unregister_mad_agent(agent); + } + } + + return ret; +} + +void qib_free_agents(struct qib_ibdev *dev) +{ + struct qib_devdata *dd = dd_from_dev(dev); + struct ib_mad_agent *agent; + struct qib_ibport *ibp; + int p; + + for (p = 0; p < dd->num_pports; p++) { + ibp = &dd->pport[p].ibport_data; + if (ibp->send_agent) { + agent = ibp->send_agent; + ibp->send_agent = NULL; + ib_unregister_mad_agent(agent); + } + if (ibp->sm_ah) { + ib_destroy_ah(&ibp->sm_ah->ibah); + ibp->sm_ah = NULL; + } + if (dd->pport[p].cong_stats.timer.data) + del_timer_sync(&dd->pport[p].cong_stats.timer); + } +} diff --git a/drivers/infiniband/hw/qib/qib_mad.h b/drivers/infiniband/hw/qib/qib_mad.h new file mode 100644 index 0000000..941d4d5 --- /dev/null +++ b/drivers/infiniband/hw/qib/qib_mad.h @@ -0,0 +1,431 @@ +/* + * Copyright (c) 2012 Intel Corporation. All rights reserved. + * Copyright (c) 2006 - 2012 QLogic Corporation. All rights reserved. + * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef _QIB_MAD_H +#define _QIB_MAD_H + +#include + +#define IB_SMP_UNSUP_VERSION cpu_to_be16(0x0004) +#define IB_SMP_UNSUP_METHOD cpu_to_be16(0x0008) +#define IB_SMP_UNSUP_METH_ATTR cpu_to_be16(0x000C) +#define IB_SMP_INVALID_FIELD cpu_to_be16(0x001C) + +struct ib_node_info { + u8 base_version; + u8 class_version; + u8 node_type; + u8 num_ports; + __be64 sys_guid; + __be64 node_guid; + __be64 port_guid; + __be16 partition_cap; + __be16 device_id; + __be32 revision; + u8 local_port_num; + u8 vendor_id[3]; +} __packed; + +struct ib_mad_notice_attr { + u8 generic_type; + u8 prod_type_msb; + __be16 prod_type_lsb; + __be16 trap_num; + __be16 issuer_lid; + __be16 toggle_count; + + union { + struct { + u8 details[54]; + } raw_data; + + struct { + __be16 reserved; + __be16 lid; /* where violation happened */ + u8 port_num; /* where violation happened */ + } __packed ntc_129_131; + + struct { + __be16 reserved; + __be16 lid; /* LID where change occurred */ + u8 reserved2; + u8 local_changes; /* low bit - local changes */ + __be32 new_cap_mask; /* new capability mask */ + u8 reserved3; + u8 change_flags; /* low 3 bits only */ + } __packed ntc_144; + + struct { + __be16 reserved; + __be16 lid; /* lid where sys guid changed */ + __be16 reserved2; + __be64 new_sys_guid; + } __packed ntc_145; + + struct { + __be16 reserved; + __be16 lid; + __be16 dr_slid; + u8 method; + u8 reserved2; + __be16 attr_id; + __be32 attr_mod; + __be64 mkey; + u8 reserved3; + u8 dr_trunc_hop; + u8 dr_rtn_path[30]; + } __packed ntc_256; + + struct { + __be16 reserved; + __be16 lid1; + __be16 lid2; + __be32 key; + __be32 sl_qp1; /* SL: high 4 bits */ + __be32 qp2; /* high 8 bits reserved */ + union ib_gid gid1; + union ib_gid gid2; + } __packed ntc_257_258; + + } details; +}; + +/* + * Generic trap/notice types + */ +#define IB_NOTICE_TYPE_FATAL 0x80 +#define IB_NOTICE_TYPE_URGENT 0x81 +#define IB_NOTICE_TYPE_SECURITY 0x82 +#define IB_NOTICE_TYPE_SM 0x83 +#define IB_NOTICE_TYPE_INFO 0x84 + +/* + * Generic trap/notice producers + */ +#define IB_NOTICE_PROD_CA cpu_to_be16(1) +#define IB_NOTICE_PROD_SWITCH cpu_to_be16(2) +#define IB_NOTICE_PROD_ROUTER cpu_to_be16(3) +#define IB_NOTICE_PROD_CLASS_MGR cpu_to_be16(4) + +/* + * Generic trap/notice numbers + */ +#define IB_NOTICE_TRAP_LLI_THRESH cpu_to_be16(129) +#define IB_NOTICE_TRAP_EBO_THRESH cpu_to_be16(130) +#define IB_NOTICE_TRAP_FLOW_UPDATE cpu_to_be16(131) +#define IB_NOTICE_TRAP_CAP_MASK_CHG cpu_to_be16(144) +#define IB_NOTICE_TRAP_SYS_GUID_CHG cpu_to_be16(145) +#define IB_NOTICE_TRAP_BAD_MKEY cpu_to_be16(256) +#define IB_NOTICE_TRAP_BAD_PKEY cpu_to_be16(257) +#define IB_NOTICE_TRAP_BAD_QKEY cpu_to_be16(258) + +/* + * Repress trap/notice flags + */ +#define IB_NOTICE_REPRESS_LLI_THRESH (1 << 0) +#define IB_NOTICE_REPRESS_EBO_THRESH (1 << 1) +#define IB_NOTICE_REPRESS_FLOW_UPDATE (1 << 2) +#define IB_NOTICE_REPRESS_CAP_MASK_CHG (1 << 3) +#define IB_NOTICE_REPRESS_SYS_GUID_CHG (1 << 4) +#define IB_NOTICE_REPRESS_BAD_MKEY (1 << 5) +#define IB_NOTICE_REPRESS_BAD_PKEY (1 << 6) +#define IB_NOTICE_REPRESS_BAD_QKEY (1 << 7) + +/* + * Generic trap/notice other local changes flags (trap 144). + */ +#define IB_NOTICE_TRAP_LSE_CHG 0x04 /* Link Speed Enable changed */ +#define IB_NOTICE_TRAP_LWE_CHG 0x02 /* Link Width Enable changed */ +#define IB_NOTICE_TRAP_NODE_DESC_CHG 0x01 + +/* + * Generic trap/notice M_Key volation flags in dr_trunc_hop (trap 256). + */ +#define IB_NOTICE_TRAP_DR_NOTICE 0x80 +#define IB_NOTICE_TRAP_DR_TRUNC 0x40 + +struct ib_vl_weight_elem { + u8 vl; /* Only low 4 bits, upper 4 bits reserved */ + u8 weight; +}; + +#define IB_VLARB_LOWPRI_0_31 1 +#define IB_VLARB_LOWPRI_32_63 2 +#define IB_VLARB_HIGHPRI_0_31 3 +#define IB_VLARB_HIGHPRI_32_63 4 + +#define IB_PMA_PORT_COUNTERS_CONG cpu_to_be16(0xFF00) + +struct ib_pma_portcounters_cong { + u8 reserved; + u8 reserved1; + __be16 port_check_rate; + __be16 symbol_error_counter; + u8 link_error_recovery_counter; + u8 link_downed_counter; + __be16 port_rcv_errors; + __be16 port_rcv_remphys_errors; + __be16 port_rcv_switch_relay_errors; + __be16 port_xmit_discards; + u8 port_xmit_constraint_errors; + u8 port_rcv_constraint_errors; + u8 reserved2; + u8 link_overrun_errors; /* LocalLink: 7:4, BufferOverrun: 3:0 */ + __be16 reserved3; + __be16 vl15_dropped; + __be64 port_xmit_data; + __be64 port_rcv_data; + __be64 port_xmit_packets; + __be64 port_rcv_packets; + __be64 port_xmit_wait; + __be64 port_adr_events; +} __packed; + +#define IB_PMA_CONG_HW_CONTROL_TIMER 0x00 +#define IB_PMA_CONG_HW_CONTROL_SAMPLE 0x01 + +#define QIB_XMIT_RATE_UNSUPPORTED 0x0 +#define QIB_XMIT_RATE_PICO 0x7 +/* number of 4nsec cycles equaling 2secs */ +#define QIB_CONG_TIMER_PSINTERVAL 0x1DCD64EC + +#define IB_PMA_SEL_CONG_ALL 0x01 +#define IB_PMA_SEL_CONG_PORT_DATA 0x02 +#define IB_PMA_SEL_CONG_XMIT 0x04 +#define IB_PMA_SEL_CONG_ROUTING 0x08 + +/* + * Congestion control class attributes + */ +#define IB_CC_ATTR_CLASSPORTINFO cpu_to_be16(0x0001) +#define IB_CC_ATTR_NOTICE cpu_to_be16(0x0002) +#define IB_CC_ATTR_CONGESTION_INFO cpu_to_be16(0x0011) +#define IB_CC_ATTR_CONGESTION_KEY_INFO cpu_to_be16(0x0012) +#define IB_CC_ATTR_CONGESTION_LOG cpu_to_be16(0x0013) +#define IB_CC_ATTR_SWITCH_CONGESTION_SETTING cpu_to_be16(0x0014) +#define IB_CC_ATTR_SWITCH_PORT_CONGESTION_SETTING cpu_to_be16(0x0015) +#define IB_CC_ATTR_CA_CONGESTION_SETTING cpu_to_be16(0x0016) +#define IB_CC_ATTR_CONGESTION_CONTROL_TABLE cpu_to_be16(0x0017) +#define IB_CC_ATTR_TIME_STAMP cpu_to_be16(0x0018) + +/* generalizations for threshold values */ +#define IB_CC_THRESHOLD_NONE 0x0 +#define IB_CC_THRESHOLD_MIN 0x1 +#define IB_CC_THRESHOLD_MAX 0xf + +/* CCA MAD header constants */ +#define IB_CC_MAD_LOGDATA_LEN 32 +#define IB_CC_MAD_MGMTDATA_LEN 192 + +struct ib_cc_mad { + u8 base_version; + u8 mgmt_class; + u8 class_version; + u8 method; + __be16 status; + __be16 class_specific; + __be64 tid; + __be16 attr_id; + __be16 resv; + __be32 attr_mod; + __be64 cckey; + + /* For CongestionLog attribute only */ + u8 log_data[IB_CC_MAD_LOGDATA_LEN]; + + u8 mgmt_data[IB_CC_MAD_MGMTDATA_LEN]; +} __packed; + +/* + * Congestion Control class portinfo capability mask bits + */ +#define IB_CC_CPI_CM_TRAP_GEN cpu_to_be16(1 << 0) +#define IB_CC_CPI_CM_GET_SET_NOTICE cpu_to_be16(1 << 1) +#define IB_CC_CPI_CM_CAP2 cpu_to_be16(1 << 2) +#define IB_CC_CPI_CM_ENHANCEDPORT0_CC cpu_to_be16(1 << 8) + +struct ib_cc_classportinfo_attr { + u8 base_version; + u8 class_version; + __be16 cap_mask; + u8 reserved[3]; + u8 resp_time_value; /* only lower 5 bits */ + union ib_gid redirect_gid; + __be32 redirect_tc_sl_fl; /* 8, 4, 20 bits respectively */ + __be16 redirect_lid; + __be16 redirect_pkey; + __be32 redirect_qp; /* only lower 24 bits */ + __be32 redirect_qkey; + union ib_gid trap_gid; + __be32 trap_tc_sl_fl; /* 8, 4, 20 bits respectively */ + __be16 trap_lid; + __be16 trap_pkey; + __be32 trap_hl_qp; /* 8, 24 bits respectively */ + __be32 trap_qkey; +} __packed; + +/* Congestion control traps */ +#define IB_CC_TRAP_KEY_VIOLATION 0x0000 + +struct ib_cc_trap_key_violation_attr { + __be16 source_lid; + u8 method; + u8 reserved1; + __be16 attrib_id; + __be32 attrib_mod; + __be32 qp; + __be64 cckey; + u8 sgid[16]; + u8 padding[24]; +} __packed; + +/* Congestion info flags */ +#define IB_CC_CI_FLAGS_CREDIT_STARVATION 0x1 +#define IB_CC_TABLE_CAP_DEFAULT 31 + +struct ib_cc_info_attr { + __be16 congestion_info; + u8 control_table_cap; /* Multiple of 64 entry unit CCTs */ +} __packed; + +struct ib_cc_key_info_attr { + __be64 cckey; + u8 protect; + __be16 lease_period; + __be16 violations; +} __packed; + +#define IB_CC_CL_CA_LOGEVENTS_LEN 208 + +struct ib_cc_log_attr { + u8 log_type; + u8 congestion_flags; + __be16 threshold_event_counter; + __be16 threshold_congestion_event_map; + __be16 current_time_stamp; + u8 log_events[IB_CC_CL_CA_LOGEVENTS_LEN]; +} __packed; + +#define IB_CC_CLEC_SERVICETYPE_RC 0x0 +#define IB_CC_CLEC_SERVICETYPE_UC 0x1 +#define IB_CC_CLEC_SERVICETYPE_RD 0x2 +#define IB_CC_CLEC_SERVICETYPE_UD 0x3 + +struct ib_cc_log_event { + u8 local_qp_cn_entry; + u8 remote_qp_number_cn_entry[3]; + u8 sl_cn_entry:4; + u8 service_type_cn_entry:4; + __be32 remote_lid_cn_entry; + __be32 timestamp_cn_entry; +} __packed; + +/* Sixteen congestion entries */ +#define IB_CC_CCS_ENTRIES 16 + +/* Port control flags */ +#define IB_CC_CCS_PC_SL_BASED 0x01 + +struct ib_cc_congestion_entry { + u8 ccti_increase; + __be16 ccti_timer; + u8 trigger_threshold; + u8 ccti_min; /* min CCTI for cc table */ +} __packed; + +struct ib_cc_congestion_entry_shadow { + u8 ccti_increase; + u16 ccti_timer; + u8 trigger_threshold; + u8 ccti_min; /* min CCTI for cc table */ +} __packed; + +struct ib_cc_congestion_setting_attr { + __be16 port_control; + __be16 control_map; + struct ib_cc_congestion_entry entries[IB_CC_CCS_ENTRIES]; +} __packed; + +struct ib_cc_congestion_setting_attr_shadow { + u16 port_control; + u16 control_map; + struct ib_cc_congestion_entry_shadow entries[IB_CC_CCS_ENTRIES]; +} __packed; + +#define IB_CC_TABLE_ENTRY_INCREASE_DEFAULT 1 +#define IB_CC_TABLE_ENTRY_TIMER_DEFAULT 1 + +/* 64 Congestion Control table entries in a single MAD */ +#define IB_CCT_ENTRIES 64 +#define IB_CCT_MIN_ENTRIES (IB_CCT_ENTRIES * 2) + +struct ib_cc_table_entry { + __be16 entry; /* shift:2, multiplier:14 */ +}; + +struct ib_cc_table_entry_shadow { + u16 entry; /* shift:2, multiplier:14 */ +}; + +struct ib_cc_table_attr { + __be16 ccti_limit; /* max CCTI for cc table */ + struct ib_cc_table_entry ccti_entries[IB_CCT_ENTRIES]; +} __packed; + +struct ib_cc_table_attr_shadow { + u16 ccti_limit; /* max CCTI for cc table */ + struct ib_cc_table_entry_shadow ccti_entries[IB_CCT_ENTRIES]; +} __packed; + +#define CC_TABLE_SHADOW_MAX \ + (IB_CC_TABLE_CAP_DEFAULT * IB_CCT_ENTRIES) + +struct cc_table_shadow { + u16 ccti_last_entry; + struct ib_cc_table_entry_shadow entries[CC_TABLE_SHADOW_MAX]; +} __packed; + +/* + * The PortSamplesControl.CounterMasks field is an array of 3 bit fields + * which specify the N'th counter's capabilities. See ch. 16.1.3.2. + * We support 5 counters which only count the mandatory quantities. + */ +#define COUNTER_MASK(q, n) (q << ((9 - n) * 3)) +#define COUNTER_MASK0_9 \ + cpu_to_be32(COUNTER_MASK(1, 0) | \ + COUNTER_MASK(1, 1) | \ + COUNTER_MASK(1, 2) | \ + COUNTER_MASK(1, 3) | \ + COUNTER_MASK(1, 4)) + +#endif /* _QIB_MAD_H */ diff --git a/drivers/infiniband/hw/qib/qib_mmap.c b/drivers/infiniband/hw/qib/qib_mmap.c new file mode 100644 index 0000000..8b73a11 --- /dev/null +++ b/drivers/infiniband/hw/qib/qib_mmap.c @@ -0,0 +1,174 @@ +/* + * Copyright (c) 2006, 2007, 2008, 2009 QLogic Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include + +#include "qib_verbs.h" + +/** + * qib_release_mmap_info - free mmap info structure + * @ref: a pointer to the kref within struct qib_mmap_info + */ +void qib_release_mmap_info(struct kref *ref) +{ + struct qib_mmap_info *ip = + container_of(ref, struct qib_mmap_info, ref); + struct qib_ibdev *dev = to_idev(ip->context->device); + + spin_lock_irq(&dev->pending_lock); + list_del(&ip->pending_mmaps); + spin_unlock_irq(&dev->pending_lock); + + vfree(ip->obj); + kfree(ip); +} + +/* + * open and close keep track of how many times the CQ is mapped, + * to avoid releasing it. + */ +static void qib_vma_open(struct vm_area_struct *vma) +{ + struct qib_mmap_info *ip = vma->vm_private_data; + + kref_get(&ip->ref); +} + +static void qib_vma_close(struct vm_area_struct *vma) +{ + struct qib_mmap_info *ip = vma->vm_private_data; + + kref_put(&ip->ref, qib_release_mmap_info); +} + +static struct vm_operations_struct qib_vm_ops = { + .open = qib_vma_open, + .close = qib_vma_close, +}; + +/** + * qib_mmap - create a new mmap region + * @context: the IB user context of the process making the mmap() call + * @vma: the VMA to be initialized + * Return zero if the mmap is OK. Otherwise, return an errno. + */ +int qib_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) +{ + struct qib_ibdev *dev = to_idev(context->device); + unsigned long offset = vma->vm_pgoff << PAGE_SHIFT; + unsigned long size = vma->vm_end - vma->vm_start; + struct qib_mmap_info *ip, *pp; + int ret = -EINVAL; + + /* + * Search the device's list of objects waiting for a mmap call. + * Normally, this list is very short since a call to create a + * CQ, QP, or SRQ is soon followed by a call to mmap(). + */ + spin_lock_irq(&dev->pending_lock); + list_for_each_entry_safe(ip, pp, &dev->pending_mmaps, + pending_mmaps) { + /* Only the creator is allowed to mmap the object */ + if (context != ip->context || (__u64) offset != ip->offset) + continue; + /* Don't allow a mmap larger than the object. */ + if (size > ip->size) + break; + + list_del_init(&ip->pending_mmaps); + spin_unlock_irq(&dev->pending_lock); + + ret = remap_vmalloc_range(vma, ip->obj, 0); + if (ret) + goto done; + vma->vm_ops = &qib_vm_ops; + vma->vm_private_data = ip; + qib_vma_open(vma); + goto done; + } + spin_unlock_irq(&dev->pending_lock); +done: + return ret; +} + +/* + * Allocate information for qib_mmap + */ +struct qib_mmap_info *qib_create_mmap_info(struct qib_ibdev *dev, + u32 size, + struct ib_ucontext *context, + void *obj) { + struct qib_mmap_info *ip; + + ip = kmalloc(sizeof *ip, GFP_KERNEL); + if (!ip) + goto bail; + + size = PAGE_ALIGN(size); + + spin_lock_irq(&dev->mmap_offset_lock); + if (dev->mmap_offset == 0) + dev->mmap_offset = PAGE_SIZE; + ip->offset = dev->mmap_offset; + dev->mmap_offset += size; + spin_unlock_irq(&dev->mmap_offset_lock); + + INIT_LIST_HEAD(&ip->pending_mmaps); + ip->size = size; + ip->context = context; + ip->obj = obj; + kref_init(&ip->ref); + +bail: + return ip; +} + +void qib_update_mmap_info(struct qib_ibdev *dev, struct qib_mmap_info *ip, + u32 size, void *obj) +{ + size = PAGE_ALIGN(size); + + spin_lock_irq(&dev->mmap_offset_lock); + if (dev->mmap_offset == 0) + dev->mmap_offset = PAGE_SIZE; + ip->offset = dev->mmap_offset; + dev->mmap_offset += size; + spin_unlock_irq(&dev->mmap_offset_lock); + + ip->size = size; + ip->obj = obj; +} diff --git a/drivers/infiniband/hw/qib/qib_mr.c b/drivers/infiniband/hw/qib/qib_mr.c new file mode 100644 index 0000000..9bbb553 --- /dev/null +++ b/drivers/infiniband/hw/qib/qib_mr.c @@ -0,0 +1,532 @@ +/* + * Copyright (c) 2006, 2007, 2008, 2009 QLogic Corporation. All rights reserved. + * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include + +#include "qib.h" + +/* Fast memory region */ +struct qib_fmr { + struct ib_fmr ibfmr; + struct qib_mregion mr; /* must be last */ +}; + +static inline struct qib_fmr *to_ifmr(struct ib_fmr *ibfmr) +{ + return container_of(ibfmr, struct qib_fmr, ibfmr); +} + +static int init_qib_mregion(struct qib_mregion *mr, struct ib_pd *pd, + int count) +{ + int m, i = 0; + int rval = 0; + + m = (count + QIB_SEGSZ - 1) / QIB_SEGSZ; + for (; i < m; i++) { + mr->map[i] = kzalloc(sizeof *mr->map[0], GFP_KERNEL); + if (!mr->map[i]) + goto bail; + } + mr->mapsz = m; + init_completion(&mr->comp); + /* count returning the ptr to user */ + atomic_set(&mr->refcount, 1); + mr->pd = pd; + mr->max_segs = count; +out: + return rval; +bail: + while (i) + kfree(mr->map[--i]); + rval = -ENOMEM; + goto out; +} + +static void deinit_qib_mregion(struct qib_mregion *mr) +{ + int i = mr->mapsz; + + mr->mapsz = 0; + while (i) + kfree(mr->map[--i]); +} + + +/** + * qib_get_dma_mr - get a DMA memory region + * @pd: protection domain for this memory region + * @acc: access flags + * + * Returns the memory region on success, otherwise returns an errno. + * Note that all DMA addresses should be created via the + * struct ib_dma_mapping_ops functions (see qib_dma.c). + */ +struct ib_mr *qib_get_dma_mr(struct ib_pd *pd, int acc) +{ + struct qib_mr *mr = NULL; + struct ib_mr *ret; + int rval; + + if (to_ipd(pd)->user) { + ret = ERR_PTR(-EPERM); + goto bail; + } + + mr = kzalloc(sizeof *mr, GFP_KERNEL); + if (!mr) { + ret = ERR_PTR(-ENOMEM); + goto bail; + } + + rval = init_qib_mregion(&mr->mr, pd, 0); + if (rval) { + ret = ERR_PTR(rval); + goto bail; + } + + + rval = qib_alloc_lkey(&mr->mr, 1); + if (rval) { + ret = ERR_PTR(rval); + goto bail_mregion; + } + + mr->mr.access_flags = acc; + ret = &mr->ibmr; +done: + return ret; + +bail_mregion: + deinit_qib_mregion(&mr->mr); +bail: + kfree(mr); + goto done; +} + +static struct qib_mr *alloc_mr(int count, struct ib_pd *pd) +{ + struct qib_mr *mr; + int rval = -ENOMEM; + int m; + + /* Allocate struct plus pointers to first level page tables. */ + m = (count + QIB_SEGSZ - 1) / QIB_SEGSZ; + mr = kzalloc(sizeof *mr + m * sizeof mr->mr.map[0], GFP_KERNEL); + if (!mr) + goto bail; + + rval = init_qib_mregion(&mr->mr, pd, count); + if (rval) + goto bail; + /* + * ib_reg_phys_mr() will initialize mr->ibmr except for + * lkey and rkey. + */ + rval = qib_alloc_lkey(&mr->mr, 0); + if (rval) + goto bail_mregion; + mr->ibmr.lkey = mr->mr.lkey; + mr->ibmr.rkey = mr->mr.lkey; +done: + return mr; + +bail_mregion: + deinit_qib_mregion(&mr->mr); +bail: + kfree(mr); + mr = ERR_PTR(rval); + goto done; +} + +/** + * qib_reg_phys_mr - register a physical memory region + * @pd: protection domain for this memory region + * @buffer_list: pointer to the list of physical buffers to register + * @num_phys_buf: the number of physical buffers to register + * @iova_start: the starting address passed over IB which maps to this MR + * + * Returns the memory region on success, otherwise returns an errno. + */ +struct ib_mr *qib_reg_phys_mr(struct ib_pd *pd, + struct ib_phys_buf *buffer_list, + int num_phys_buf, int acc, u64 *iova_start) +{ + struct qib_mr *mr; + int n, m, i; + struct ib_mr *ret; + + mr = alloc_mr(num_phys_buf, pd); + if (IS_ERR(mr)) { + ret = (struct ib_mr *)mr; + goto bail; + } + + mr->mr.user_base = *iova_start; + mr->mr.iova = *iova_start; + mr->mr.access_flags = acc; + + m = 0; + n = 0; + for (i = 0; i < num_phys_buf; i++) { + mr->mr.map[m]->segs[n].vaddr = (void *) buffer_list[i].addr; + mr->mr.map[m]->segs[n].length = buffer_list[i].size; + mr->mr.length += buffer_list[i].size; + n++; + if (n == QIB_SEGSZ) { + m++; + n = 0; + } + } + + ret = &mr->ibmr; + +bail: + return ret; +} + +/** + * qib_reg_user_mr - register a userspace memory region + * @pd: protection domain for this memory region + * @start: starting userspace address + * @length: length of region to register + * @mr_access_flags: access flags for this memory region + * @udata: unused by the QLogic_IB driver + * + * Returns the memory region on success, otherwise returns an errno. + */ +struct ib_mr *qib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, + u64 virt_addr, int mr_access_flags, + struct ib_udata *udata) +{ + struct qib_mr *mr; + struct ib_umem *umem; + struct scatterlist *sg; + int n, m, entry; + struct ib_mr *ret; + + if (length == 0) { + ret = ERR_PTR(-EINVAL); + goto bail; + } + + umem = ib_umem_get(pd->uobject->context, start, length, + mr_access_flags, 0); + if (IS_ERR(umem)) + return (void *) umem; + + n = umem->nmap; + + mr = alloc_mr(n, pd); + if (IS_ERR(mr)) { + ret = (struct ib_mr *)mr; + ib_umem_release(umem); + goto bail; + } + + mr->mr.user_base = start; + mr->mr.iova = virt_addr; + mr->mr.length = length; + mr->mr.offset = umem->offset; + mr->mr.access_flags = mr_access_flags; + mr->umem = umem; + + if (is_power_of_2(umem->page_size)) + mr->mr.page_shift = ilog2(umem->page_size); + m = 0; + n = 0; + for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) { + void *vaddr; + + vaddr = page_address(sg_page(sg)); + if (!vaddr) { + ret = ERR_PTR(-EINVAL); + goto bail; + } + mr->mr.map[m]->segs[n].vaddr = vaddr; + mr->mr.map[m]->segs[n].length = umem->page_size; + n++; + if (n == QIB_SEGSZ) { + m++; + n = 0; + } + } + ret = &mr->ibmr; + +bail: + return ret; +} + +/** + * qib_dereg_mr - unregister and free a memory region + * @ibmr: the memory region to free + * + * Returns 0 on success. + * + * Note that this is called to free MRs created by qib_get_dma_mr() + * or qib_reg_user_mr(). + */ +int qib_dereg_mr(struct ib_mr *ibmr) +{ + struct qib_mr *mr = to_imr(ibmr); + int ret = 0; + unsigned long timeout; + + qib_free_lkey(&mr->mr); + + qib_put_mr(&mr->mr); /* will set completion if last */ + timeout = wait_for_completion_timeout(&mr->mr.comp, + 5 * HZ); + if (!timeout) { + qib_get_mr(&mr->mr); + ret = -EBUSY; + goto out; + } + deinit_qib_mregion(&mr->mr); + if (mr->umem) + ib_umem_release(mr->umem); + kfree(mr); +out: + return ret; +} + +/* + * Allocate a memory region usable with the + * IB_WR_FAST_REG_MR send work request. + * + * Return the memory region on success, otherwise return an errno. + */ +struct ib_mr *qib_alloc_fast_reg_mr(struct ib_pd *pd, int max_page_list_len) +{ + struct qib_mr *mr; + + mr = alloc_mr(max_page_list_len, pd); + if (IS_ERR(mr)) + return (struct ib_mr *)mr; + + return &mr->ibmr; +} + +struct ib_fast_reg_page_list * +qib_alloc_fast_reg_page_list(struct ib_device *ibdev, int page_list_len) +{ + unsigned size = page_list_len * sizeof(u64); + struct ib_fast_reg_page_list *pl; + + if (size > PAGE_SIZE) + return ERR_PTR(-EINVAL); + + pl = kzalloc(sizeof *pl, GFP_KERNEL); + if (!pl) + return ERR_PTR(-ENOMEM); + + pl->page_list = kzalloc(size, GFP_KERNEL); + if (!pl->page_list) + goto err_free; + + return pl; + +err_free: + kfree(pl); + return ERR_PTR(-ENOMEM); +} + +void qib_free_fast_reg_page_list(struct ib_fast_reg_page_list *pl) +{ + kfree(pl->page_list); + kfree(pl); +} + +/** + * qib_alloc_fmr - allocate a fast memory region + * @pd: the protection domain for this memory region + * @mr_access_flags: access flags for this memory region + * @fmr_attr: fast memory region attributes + * + * Returns the memory region on success, otherwise returns an errno. + */ +struct ib_fmr *qib_alloc_fmr(struct ib_pd *pd, int mr_access_flags, + struct ib_fmr_attr *fmr_attr) +{ + struct qib_fmr *fmr; + int m; + struct ib_fmr *ret; + int rval = -ENOMEM; + + /* Allocate struct plus pointers to first level page tables. */ + m = (fmr_attr->max_pages + QIB_SEGSZ - 1) / QIB_SEGSZ; + fmr = kzalloc(sizeof *fmr + m * sizeof fmr->mr.map[0], GFP_KERNEL); + if (!fmr) + goto bail; + + rval = init_qib_mregion(&fmr->mr, pd, fmr_attr->max_pages); + if (rval) + goto bail; + + /* + * ib_alloc_fmr() will initialize fmr->ibfmr except for lkey & + * rkey. + */ + rval = qib_alloc_lkey(&fmr->mr, 0); + if (rval) + goto bail_mregion; + fmr->ibfmr.rkey = fmr->mr.lkey; + fmr->ibfmr.lkey = fmr->mr.lkey; + /* + * Resources are allocated but no valid mapping (RKEY can't be + * used). + */ + fmr->mr.access_flags = mr_access_flags; + fmr->mr.max_segs = fmr_attr->max_pages; + fmr->mr.page_shift = fmr_attr->page_shift; + + ret = &fmr->ibfmr; +done: + return ret; + +bail_mregion: + deinit_qib_mregion(&fmr->mr); +bail: + kfree(fmr); + ret = ERR_PTR(rval); + goto done; +} + +/** + * qib_map_phys_fmr - set up a fast memory region + * @ibmfr: the fast memory region to set up + * @page_list: the list of pages to associate with the fast memory region + * @list_len: the number of pages to associate with the fast memory region + * @iova: the virtual address of the start of the fast memory region + * + * This may be called from interrupt context. + */ + +int qib_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list, + int list_len, u64 iova) +{ + struct qib_fmr *fmr = to_ifmr(ibfmr); + struct qib_lkey_table *rkt; + unsigned long flags; + int m, n, i; + u32 ps; + int ret; + + i = atomic_read(&fmr->mr.refcount); + if (i > 2) + return -EBUSY; + + if (list_len > fmr->mr.max_segs) { + ret = -EINVAL; + goto bail; + } + rkt = &to_idev(ibfmr->device)->lk_table; + spin_lock_irqsave(&rkt->lock, flags); + fmr->mr.user_base = iova; + fmr->mr.iova = iova; + ps = 1 << fmr->mr.page_shift; + fmr->mr.length = list_len * ps; + m = 0; + n = 0; + for (i = 0; i < list_len; i++) { + fmr->mr.map[m]->segs[n].vaddr = (void *) page_list[i]; + fmr->mr.map[m]->segs[n].length = ps; + if (++n == QIB_SEGSZ) { + m++; + n = 0; + } + } + spin_unlock_irqrestore(&rkt->lock, flags); + ret = 0; + +bail: + return ret; +} + +/** + * qib_unmap_fmr - unmap fast memory regions + * @fmr_list: the list of fast memory regions to unmap + * + * Returns 0 on success. + */ +int qib_unmap_fmr(struct list_head *fmr_list) +{ + struct qib_fmr *fmr; + struct qib_lkey_table *rkt; + unsigned long flags; + + list_for_each_entry(fmr, fmr_list, ibfmr.list) { + rkt = &to_idev(fmr->ibfmr.device)->lk_table; + spin_lock_irqsave(&rkt->lock, flags); + fmr->mr.user_base = 0; + fmr->mr.iova = 0; + fmr->mr.length = 0; + spin_unlock_irqrestore(&rkt->lock, flags); + } + return 0; +} + +/** + * qib_dealloc_fmr - deallocate a fast memory region + * @ibfmr: the fast memory region to deallocate + * + * Returns 0 on success. + */ +int qib_dealloc_fmr(struct ib_fmr *ibfmr) +{ + struct qib_fmr *fmr = to_ifmr(ibfmr); + int ret = 0; + unsigned long timeout; + + qib_free_lkey(&fmr->mr); + qib_put_mr(&fmr->mr); /* will set completion if last */ + timeout = wait_for_completion_timeout(&fmr->mr.comp, + 5 * HZ); + if (!timeout) { + qib_get_mr(&fmr->mr); + ret = -EBUSY; + goto out; + } + deinit_qib_mregion(&fmr->mr); + kfree(fmr); +out: + return ret; +} + +void mr_rcu_callback(struct rcu_head *list) +{ + struct qib_mregion *mr = container_of(list, struct qib_mregion, list); + + complete(&mr->comp); +} diff --git a/drivers/infiniband/hw/qib/qib_pcie.c b/drivers/infiniband/hw/qib/qib_pcie.c new file mode 100644 index 0000000..61a0046 --- /dev/null +++ b/drivers/infiniband/hw/qib/qib_pcie.c @@ -0,0 +1,715 @@ +/* + * Copyright (c) 2008, 2009 QLogic Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include + +#include "qib.h" + +/* + * This file contains PCIe utility routines that are common to the + * various QLogic InfiniPath adapters + */ + +/* + * Code to adjust PCIe capabilities. + * To minimize the change footprint, we call it + * from qib_pcie_params, which every chip-specific + * file calls, even though this violates some + * expectations of harmlessness. + */ +static void qib_tune_pcie_caps(struct qib_devdata *); +static void qib_tune_pcie_coalesce(struct qib_devdata *); + +/* + * Do all the common PCIe setup and initialization. + * devdata is not yet allocated, and is not allocated until after this + * routine returns success. Therefore qib_dev_err() can't be used for error + * printing. + */ +int qib_pcie_init(struct pci_dev *pdev, const struct pci_device_id *ent) +{ + int ret; + + ret = pci_enable_device(pdev); + if (ret) { + /* + * This can happen (in theory) iff: + * We did a chip reset, and then failed to reprogram the + * BAR, or the chip reset due to an internal error. We then + * unloaded the driver and reloaded it. + * + * Both reset cases set the BAR back to initial state. For + * the latter case, the AER sticky error bit at offset 0x718 + * should be set, but the Linux kernel doesn't yet know + * about that, it appears. If the original BAR was retained + * in the kernel data structures, this may be OK. + */ + qib_early_err(&pdev->dev, "pci enable failed: error %d\n", + -ret); + goto done; + } + + ret = pci_request_regions(pdev, QIB_DRV_NAME); + if (ret) { + qib_devinfo(pdev, "pci_request_regions fails: err %d\n", -ret); + goto bail; + } + + ret = pci_set_dma_mask(pdev, DMA_BIT_MASK(64)); + if (ret) { + /* + * If the 64 bit setup fails, try 32 bit. Some systems + * do not setup 64 bit maps on systems with 2GB or less + * memory installed. + */ + ret = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); + if (ret) { + qib_devinfo(pdev, "Unable to set DMA mask: %d\n", ret); + goto bail; + } + ret = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32)); + } else + ret = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64)); + if (ret) { + qib_early_err(&pdev->dev, + "Unable to set DMA consistent mask: %d\n", ret); + goto bail; + } + + pci_set_master(pdev); + ret = pci_enable_pcie_error_reporting(pdev); + if (ret) { + qib_early_err(&pdev->dev, + "Unable to enable pcie error reporting: %d\n", + ret); + ret = 0; + } + goto done; + +bail: + pci_disable_device(pdev); + pci_release_regions(pdev); +done: + return ret; +} + +/* + * Do remaining PCIe setup, once dd is allocated, and save away + * fields required to re-initialize after a chip reset, or for + * various other purposes + */ +int qib_pcie_ddinit(struct qib_devdata *dd, struct pci_dev *pdev, + const struct pci_device_id *ent) +{ + unsigned long len; + resource_size_t addr; + + dd->pcidev = pdev; + pci_set_drvdata(pdev, dd); + + addr = pci_resource_start(pdev, 0); + len = pci_resource_len(pdev, 0); + +#if defined(__powerpc__) + /* There isn't a generic way to specify writethrough mappings */ + dd->kregbase = __ioremap(addr, len, _PAGE_NO_CACHE | _PAGE_WRITETHRU); +#else + dd->kregbase = ioremap_nocache(addr, len); +#endif + + if (!dd->kregbase) + return -ENOMEM; + + dd->kregend = (u64 __iomem *)((void __iomem *) dd->kregbase + len); + dd->physaddr = addr; /* used for io_remap, etc. */ + + /* + * Save BARs to rewrite after device reset. Save all 64 bits of + * BAR, just in case. + */ + dd->pcibar0 = addr; + dd->pcibar1 = addr >> 32; + dd->deviceid = ent->device; /* save for later use */ + dd->vendorid = ent->vendor; + + return 0; +} + +/* + * Do PCIe cleanup, after chip-specific cleanup, etc. Just prior + * to releasing the dd memory. + * void because none of the core pcie cleanup returns are void + */ +void qib_pcie_ddcleanup(struct qib_devdata *dd) +{ + u64 __iomem *base = (void __iomem *) dd->kregbase; + + dd->kregbase = NULL; + iounmap(base); + if (dd->piobase) + iounmap(dd->piobase); + if (dd->userbase) + iounmap(dd->userbase); + if (dd->piovl15base) + iounmap(dd->piovl15base); + + pci_disable_device(dd->pcidev); + pci_release_regions(dd->pcidev); + + pci_set_drvdata(dd->pcidev, NULL); +} + +static void qib_msix_setup(struct qib_devdata *dd, int pos, u32 *msixcnt, + struct qib_msix_entry *qib_msix_entry) +{ + int ret; + int nvec = *msixcnt; + struct msix_entry *msix_entry; + int i; + + ret = pci_msix_vec_count(dd->pcidev); + if (ret < 0) + goto do_intx; + + nvec = min(nvec, ret); + + /* We can't pass qib_msix_entry array to qib_msix_setup + * so use a dummy msix_entry array and copy the allocated + * irq back to the qib_msix_entry array. */ + msix_entry = kmalloc(nvec * sizeof(*msix_entry), GFP_KERNEL); + if (!msix_entry) + goto do_intx; + + for (i = 0; i < nvec; i++) + msix_entry[i] = qib_msix_entry[i].msix; + + ret = pci_enable_msix_range(dd->pcidev, msix_entry, 1, nvec); + if (ret < 0) + goto free_msix_entry; + else + nvec = ret; + + for (i = 0; i < nvec; i++) + qib_msix_entry[i].msix = msix_entry[i]; + + kfree(msix_entry); + *msixcnt = nvec; + return; + +free_msix_entry: + kfree(msix_entry); + +do_intx: + qib_dev_err(dd, "pci_enable_msix_range %d vectors failed: %d, " + "falling back to INTx\n", nvec, ret); + *msixcnt = 0; + qib_enable_intx(dd->pcidev); +} + +/** + * We save the msi lo and hi values, so we can restore them after + * chip reset (the kernel PCI infrastructure doesn't yet handle that + * correctly. + */ +static int qib_msi_setup(struct qib_devdata *dd, int pos) +{ + struct pci_dev *pdev = dd->pcidev; + u16 control; + int ret; + + ret = pci_enable_msi(pdev); + if (ret) + qib_dev_err(dd, + "pci_enable_msi failed: %d, interrupts may not work\n", + ret); + /* continue even if it fails, we may still be OK... */ + + pci_read_config_dword(pdev, pos + PCI_MSI_ADDRESS_LO, + &dd->msi_lo); + pci_read_config_dword(pdev, pos + PCI_MSI_ADDRESS_HI, + &dd->msi_hi); + pci_read_config_word(pdev, pos + PCI_MSI_FLAGS, &control); + /* now save the data (vector) info */ + pci_read_config_word(pdev, pos + ((control & PCI_MSI_FLAGS_64BIT) + ? 12 : 8), + &dd->msi_data); + return ret; +} + +int qib_pcie_params(struct qib_devdata *dd, u32 minw, u32 *nent, + struct qib_msix_entry *entry) +{ + u16 linkstat, speed; + int pos = 0, ret = 1; + + if (!pci_is_pcie(dd->pcidev)) { + qib_dev_err(dd, "Can't find PCI Express capability!\n"); + /* set up something... */ + dd->lbus_width = 1; + dd->lbus_speed = 2500; /* Gen1, 2.5GHz */ + goto bail; + } + + pos = dd->pcidev->msix_cap; + if (nent && *nent && pos) { + qib_msix_setup(dd, pos, nent, entry); + ret = 0; /* did it, either MSIx or INTx */ + } else { + pos = dd->pcidev->msi_cap; + if (pos) + ret = qib_msi_setup(dd, pos); + else + qib_dev_err(dd, "No PCI MSI or MSIx capability!\n"); + } + if (!pos) + qib_enable_intx(dd->pcidev); + + pcie_capability_read_word(dd->pcidev, PCI_EXP_LNKSTA, &linkstat); + /* + * speed is bits 0-3, linkwidth is bits 4-8 + * no defines for them in headers + */ + speed = linkstat & 0xf; + linkstat >>= 4; + linkstat &= 0x1f; + dd->lbus_width = linkstat; + + switch (speed) { + case 1: + dd->lbus_speed = 2500; /* Gen1, 2.5GHz */ + break; + case 2: + dd->lbus_speed = 5000; /* Gen1, 5GHz */ + break; + default: /* not defined, assume gen1 */ + dd->lbus_speed = 2500; + break; + } + + /* + * Check against expected pcie width and complain if "wrong" + * on first initialization, not afterwards (i.e., reset). + */ + if (minw && linkstat < minw) + qib_dev_err(dd, + "PCIe width %u (x%u HCA), performance reduced\n", + linkstat, minw); + + qib_tune_pcie_caps(dd); + + qib_tune_pcie_coalesce(dd); + +bail: + /* fill in string, even on errors */ + snprintf(dd->lbus_info, sizeof(dd->lbus_info), + "PCIe,%uMHz,x%u\n", dd->lbus_speed, dd->lbus_width); + return ret; +} + +/* + * Setup pcie interrupt stuff again after a reset. I'd like to just call + * pci_enable_msi() again for msi, but when I do that, + * the MSI enable bit doesn't get set in the command word, and + * we switch to to a different interrupt vector, which is confusing, + * so I instead just do it all inline. Perhaps somehow can tie this + * into the PCIe hotplug support at some point + */ +int qib_reinit_intr(struct qib_devdata *dd) +{ + int pos; + u16 control; + int ret = 0; + + /* If we aren't using MSI, don't restore it */ + if (!dd->msi_lo) + goto bail; + + pos = dd->pcidev->msi_cap; + if (!pos) { + qib_dev_err(dd, + "Can't find MSI capability, can't restore MSI settings\n"); + ret = 0; + /* nothing special for MSIx, just MSI */ + goto bail; + } + pci_write_config_dword(dd->pcidev, pos + PCI_MSI_ADDRESS_LO, + dd->msi_lo); + pci_write_config_dword(dd->pcidev, pos + PCI_MSI_ADDRESS_HI, + dd->msi_hi); + pci_read_config_word(dd->pcidev, pos + PCI_MSI_FLAGS, &control); + if (!(control & PCI_MSI_FLAGS_ENABLE)) { + control |= PCI_MSI_FLAGS_ENABLE; + pci_write_config_word(dd->pcidev, pos + PCI_MSI_FLAGS, + control); + } + /* now rewrite the data (vector) info */ + pci_write_config_word(dd->pcidev, pos + + ((control & PCI_MSI_FLAGS_64BIT) ? 12 : 8), + dd->msi_data); + ret = 1; +bail: + if (!ret && (dd->flags & QIB_HAS_INTX)) { + qib_enable_intx(dd->pcidev); + ret = 1; + } + + /* and now set the pci master bit again */ + pci_set_master(dd->pcidev); + + return ret; +} + +/* + * Disable msi interrupt if enabled, and clear msi_lo. + * This is used primarily for the fallback to INTx, but + * is also used in reinit after reset, and during cleanup. + */ +void qib_nomsi(struct qib_devdata *dd) +{ + dd->msi_lo = 0; + pci_disable_msi(dd->pcidev); +} + +/* + * Same as qib_nosmi, but for MSIx. + */ +void qib_nomsix(struct qib_devdata *dd) +{ + pci_disable_msix(dd->pcidev); +} + +/* + * Similar to pci_intx(pdev, 1), except that we make sure + * msi(x) is off. + */ +void qib_enable_intx(struct pci_dev *pdev) +{ + u16 cw, new; + int pos; + + /* first, turn on INTx */ + pci_read_config_word(pdev, PCI_COMMAND, &cw); + new = cw & ~PCI_COMMAND_INTX_DISABLE; + if (new != cw) + pci_write_config_word(pdev, PCI_COMMAND, new); + + pos = pdev->msi_cap; + if (pos) { + /* then turn off MSI */ + pci_read_config_word(pdev, pos + PCI_MSI_FLAGS, &cw); + new = cw & ~PCI_MSI_FLAGS_ENABLE; + if (new != cw) + pci_write_config_word(pdev, pos + PCI_MSI_FLAGS, new); + } + pos = pdev->msix_cap; + if (pos) { + /* then turn off MSIx */ + pci_read_config_word(pdev, pos + PCI_MSIX_FLAGS, &cw); + new = cw & ~PCI_MSIX_FLAGS_ENABLE; + if (new != cw) + pci_write_config_word(pdev, pos + PCI_MSIX_FLAGS, new); + } +} + +/* + * These two routines are helper routines for the device reset code + * to move all the pcie code out of the chip-specific driver code. + */ +void qib_pcie_getcmd(struct qib_devdata *dd, u16 *cmd, u8 *iline, u8 *cline) +{ + pci_read_config_word(dd->pcidev, PCI_COMMAND, cmd); + pci_read_config_byte(dd->pcidev, PCI_INTERRUPT_LINE, iline); + pci_read_config_byte(dd->pcidev, PCI_CACHE_LINE_SIZE, cline); +} + +void qib_pcie_reenable(struct qib_devdata *dd, u16 cmd, u8 iline, u8 cline) +{ + int r; + r = pci_write_config_dword(dd->pcidev, PCI_BASE_ADDRESS_0, + dd->pcibar0); + if (r) + qib_dev_err(dd, "rewrite of BAR0 failed: %d\n", r); + r = pci_write_config_dword(dd->pcidev, PCI_BASE_ADDRESS_1, + dd->pcibar1); + if (r) + qib_dev_err(dd, "rewrite of BAR1 failed: %d\n", r); + /* now re-enable memory access, and restore cosmetic settings */ + pci_write_config_word(dd->pcidev, PCI_COMMAND, cmd); + pci_write_config_byte(dd->pcidev, PCI_INTERRUPT_LINE, iline); + pci_write_config_byte(dd->pcidev, PCI_CACHE_LINE_SIZE, cline); + r = pci_enable_device(dd->pcidev); + if (r) + qib_dev_err(dd, + "pci_enable_device failed after reset: %d\n", r); +} + + +static int qib_pcie_coalesce; +module_param_named(pcie_coalesce, qib_pcie_coalesce, int, S_IRUGO); +MODULE_PARM_DESC(pcie_coalesce, "tune PCIe colescing on some Intel chipsets"); + +/* + * Enable PCIe completion and data coalescing, on Intel 5x00 and 7300 + * chipsets. This is known to be unsafe for some revisions of some + * of these chipsets, with some BIOS settings, and enabling it on those + * systems may result in the system crashing, and/or data corruption. + */ +static void qib_tune_pcie_coalesce(struct qib_devdata *dd) +{ + int r; + struct pci_dev *parent; + u16 devid; + u32 mask, bits, val; + + if (!qib_pcie_coalesce) + return; + + /* Find out supported and configured values for parent (root) */ + parent = dd->pcidev->bus->self; + if (parent->bus->parent) { + qib_devinfo(dd->pcidev, "Parent not root\n"); + return; + } + if (!pci_is_pcie(parent)) + return; + if (parent->vendor != 0x8086) + return; + + /* + * - bit 12: Max_rdcmp_Imt_EN: need to set to 1 + * - bit 11: COALESCE_FORCE: need to set to 0 + * - bit 10: COALESCE_EN: need to set to 1 + * (but limitations on some on some chipsets) + * + * On the Intel 5000, 5100, and 7300 chipsets, there is + * also: - bit 25:24: COALESCE_MODE, need to set to 0 + */ + devid = parent->device; + if (devid >= 0x25e2 && devid <= 0x25fa) { + /* 5000 P/V/X/Z */ + if (parent->revision <= 0xb2) + bits = 1U << 10; + else + bits = 7U << 10; + mask = (3U << 24) | (7U << 10); + } else if (devid >= 0x65e2 && devid <= 0x65fa) { + /* 5100 */ + bits = 1U << 10; + mask = (3U << 24) | (7U << 10); + } else if (devid >= 0x4021 && devid <= 0x402e) { + /* 5400 */ + bits = 7U << 10; + mask = 7U << 10; + } else if (devid >= 0x3604 && devid <= 0x360a) { + /* 7300 */ + bits = 7U << 10; + mask = (3U << 24) | (7U << 10); + } else { + /* not one of the chipsets that we know about */ + return; + } + pci_read_config_dword(parent, 0x48, &val); + val &= ~mask; + val |= bits; + r = pci_write_config_dword(parent, 0x48, val); +} + +/* + * BIOS may not set PCIe bus-utilization parameters for best performance. + * Check and optionally adjust them to maximize our throughput. + */ +static int qib_pcie_caps; +module_param_named(pcie_caps, qib_pcie_caps, int, S_IRUGO); +MODULE_PARM_DESC(pcie_caps, "Max PCIe tuning: Payload (0..3), ReadReq (4..7)"); + +static void qib_tune_pcie_caps(struct qib_devdata *dd) +{ + struct pci_dev *parent; + u16 rc_mpss, rc_mps, ep_mpss, ep_mps; + u16 rc_mrrs, ep_mrrs, max_mrrs; + + /* Find out supported and configured values for parent (root) */ + parent = dd->pcidev->bus->self; + if (!pci_is_root_bus(parent->bus)) { + qib_devinfo(dd->pcidev, "Parent not root\n"); + return; + } + + if (!pci_is_pcie(parent) || !pci_is_pcie(dd->pcidev)) + return; + + rc_mpss = parent->pcie_mpss; + rc_mps = ffs(pcie_get_mps(parent)) - 8; + /* Find out supported and configured values for endpoint (us) */ + ep_mpss = dd->pcidev->pcie_mpss; + ep_mps = ffs(pcie_get_mps(dd->pcidev)) - 8; + + /* Find max payload supported by root, endpoint */ + if (rc_mpss > ep_mpss) + rc_mpss = ep_mpss; + + /* If Supported greater than limit in module param, limit it */ + if (rc_mpss > (qib_pcie_caps & 7)) + rc_mpss = qib_pcie_caps & 7; + /* If less than (allowed, supported), bump root payload */ + if (rc_mpss > rc_mps) { + rc_mps = rc_mpss; + pcie_set_mps(parent, 128 << rc_mps); + } + /* If less than (allowed, supported), bump endpoint payload */ + if (rc_mpss > ep_mps) { + ep_mps = rc_mpss; + pcie_set_mps(dd->pcidev, 128 << ep_mps); + } + + /* + * Now the Read Request size. + * No field for max supported, but PCIe spec limits it to 4096, + * which is code '5' (log2(4096) - 7) + */ + max_mrrs = 5; + if (max_mrrs > ((qib_pcie_caps >> 4) & 7)) + max_mrrs = (qib_pcie_caps >> 4) & 7; + + max_mrrs = 128 << max_mrrs; + rc_mrrs = pcie_get_readrq(parent); + ep_mrrs = pcie_get_readrq(dd->pcidev); + + if (max_mrrs > rc_mrrs) { + rc_mrrs = max_mrrs; + pcie_set_readrq(parent, rc_mrrs); + } + if (max_mrrs > ep_mrrs) { + ep_mrrs = max_mrrs; + pcie_set_readrq(dd->pcidev, ep_mrrs); + } +} +/* End of PCIe capability tuning */ + +/* + * From here through qib_pci_err_handler definition is invoked via + * PCI error infrastructure, registered via pci + */ +static pci_ers_result_t +qib_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state) +{ + struct qib_devdata *dd = pci_get_drvdata(pdev); + pci_ers_result_t ret = PCI_ERS_RESULT_RECOVERED; + + switch (state) { + case pci_channel_io_normal: + qib_devinfo(pdev, "State Normal, ignoring\n"); + break; + + case pci_channel_io_frozen: + qib_devinfo(pdev, "State Frozen, requesting reset\n"); + pci_disable_device(pdev); + ret = PCI_ERS_RESULT_NEED_RESET; + break; + + case pci_channel_io_perm_failure: + qib_devinfo(pdev, "State Permanent Failure, disabling\n"); + if (dd) { + /* no more register accesses! */ + dd->flags &= ~QIB_PRESENT; + qib_disable_after_error(dd); + } + /* else early, or other problem */ + ret = PCI_ERS_RESULT_DISCONNECT; + break; + + default: /* shouldn't happen */ + qib_devinfo(pdev, "QIB PCI errors detected (state %d)\n", + state); + break; + } + return ret; +} + +static pci_ers_result_t +qib_pci_mmio_enabled(struct pci_dev *pdev) +{ + u64 words = 0U; + struct qib_devdata *dd = pci_get_drvdata(pdev); + pci_ers_result_t ret = PCI_ERS_RESULT_RECOVERED; + + if (dd && dd->pport) { + words = dd->f_portcntr(dd->pport, QIBPORTCNTR_WORDRCV); + if (words == ~0ULL) + ret = PCI_ERS_RESULT_NEED_RESET; + } + qib_devinfo(pdev, + "QIB mmio_enabled function called, read wordscntr %Lx, returning %d\n", + words, ret); + return ret; +} + +static pci_ers_result_t +qib_pci_slot_reset(struct pci_dev *pdev) +{ + qib_devinfo(pdev, "QIB slot_reset function called, ignored\n"); + return PCI_ERS_RESULT_CAN_RECOVER; +} + +static pci_ers_result_t +qib_pci_link_reset(struct pci_dev *pdev) +{ + qib_devinfo(pdev, "QIB link_reset function called, ignored\n"); + return PCI_ERS_RESULT_CAN_RECOVER; +} + +static void +qib_pci_resume(struct pci_dev *pdev) +{ + struct qib_devdata *dd = pci_get_drvdata(pdev); + qib_devinfo(pdev, "QIB resume function called\n"); + pci_cleanup_aer_uncorrect_error_status(pdev); + /* + * Running jobs will fail, since it's asynchronous + * unlike sysfs-requested reset. Better than + * doing nothing. + */ + qib_init(dd, 1); /* same as re-init after reset */ +} + +const struct pci_error_handlers qib_pci_err_handler = { + .error_detected = qib_pci_error_detected, + .mmio_enabled = qib_pci_mmio_enabled, + .link_reset = qib_pci_link_reset, + .slot_reset = qib_pci_slot_reset, + .resume = qib_pci_resume, +}; diff --git a/drivers/infiniband/hw/qib/qib_pio_copy.c b/drivers/infiniband/hw/qib/qib_pio_copy.c new file mode 100644 index 0000000..10b8c44 --- /dev/null +++ b/drivers/infiniband/hw/qib/qib_pio_copy.c @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2009 QLogic Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "qib.h" + +/** + * qib_pio_copy - copy data to MMIO space, in multiples of 32-bits + * @to: destination, in MMIO space (must be 64-bit aligned) + * @from: source (must be 64-bit aligned) + * @count: number of 32-bit quantities to copy + * + * Copy data from kernel space to MMIO space, in multiples of 32 bits at a + * time. Order of access is not guaranteed, nor is a memory barrier + * performed afterwards. + */ +void qib_pio_copy(void __iomem *to, const void *from, size_t count) +{ +#ifdef CONFIG_64BIT + u64 __iomem *dst = to; + const u64 *src = from; + const u64 *end = src + (count >> 1); + + while (src < end) + __raw_writeq(*src++, dst++); + if (count & 1) + __raw_writel(*(const u32 *)src, dst); +#else + u32 __iomem *dst = to; + const u32 *src = from; + const u32 *end = src + count; + + while (src < end) + __raw_writel(*src++, dst++); +#endif +} diff --git a/drivers/infiniband/hw/qib/qib_qp.c b/drivers/infiniband/hw/qib/qib_qp.c new file mode 100644 index 0000000..6ddc026 --- /dev/null +++ b/drivers/infiniband/hw/qib/qib_qp.c @@ -0,0 +1,1376 @@ +/* + * Copyright (c) 2012, 2013 Intel Corporation. All rights reserved. + * Copyright (c) 2006 - 2012 QLogic Corporation. * All rights reserved. + * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#ifdef CONFIG_DEBUG_FS +#include +#endif + +#include "qib.h" + +#define BITS_PER_PAGE (PAGE_SIZE*BITS_PER_BYTE) +#define BITS_PER_PAGE_MASK (BITS_PER_PAGE-1) + +static inline unsigned mk_qpn(struct qib_qpn_table *qpt, + struct qpn_map *map, unsigned off) +{ + return (map - qpt->map) * BITS_PER_PAGE + off; +} + +static inline unsigned find_next_offset(struct qib_qpn_table *qpt, + struct qpn_map *map, unsigned off, + unsigned n) +{ + if (qpt->mask) { + off++; + if (((off & qpt->mask) >> 1) >= n) + off = (off | qpt->mask) + 2; + } else + off = find_next_zero_bit(map->page, BITS_PER_PAGE, off); + return off; +} + +/* + * Convert the AETH credit code into the number of credits. + */ +static u32 credit_table[31] = { + 0, /* 0 */ + 1, /* 1 */ + 2, /* 2 */ + 3, /* 3 */ + 4, /* 4 */ + 6, /* 5 */ + 8, /* 6 */ + 12, /* 7 */ + 16, /* 8 */ + 24, /* 9 */ + 32, /* A */ + 48, /* B */ + 64, /* C */ + 96, /* D */ + 128, /* E */ + 192, /* F */ + 256, /* 10 */ + 384, /* 11 */ + 512, /* 12 */ + 768, /* 13 */ + 1024, /* 14 */ + 1536, /* 15 */ + 2048, /* 16 */ + 3072, /* 17 */ + 4096, /* 18 */ + 6144, /* 19 */ + 8192, /* 1A */ + 12288, /* 1B */ + 16384, /* 1C */ + 24576, /* 1D */ + 32768 /* 1E */ +}; + +static void get_map_page(struct qib_qpn_table *qpt, struct qpn_map *map) +{ + unsigned long page = get_zeroed_page(GFP_KERNEL); + + /* + * Free the page if someone raced with us installing it. + */ + + spin_lock(&qpt->lock); + if (map->page) + free_page(page); + else + map->page = (void *)page; + spin_unlock(&qpt->lock); +} + +/* + * Allocate the next available QPN or + * zero/one for QP type IB_QPT_SMI/IB_QPT_GSI. + */ +static int alloc_qpn(struct qib_devdata *dd, struct qib_qpn_table *qpt, + enum ib_qp_type type, u8 port) +{ + u32 i, offset, max_scan, qpn; + struct qpn_map *map; + u32 ret; + + if (type == IB_QPT_SMI || type == IB_QPT_GSI) { + unsigned n; + + ret = type == IB_QPT_GSI; + n = 1 << (ret + 2 * (port - 1)); + spin_lock(&qpt->lock); + if (qpt->flags & n) + ret = -EINVAL; + else + qpt->flags |= n; + spin_unlock(&qpt->lock); + goto bail; + } + + qpn = qpt->last + 2; + if (qpn >= QPN_MAX) + qpn = 2; + if (qpt->mask && ((qpn & qpt->mask) >> 1) >= dd->n_krcv_queues) + qpn = (qpn | qpt->mask) + 2; + offset = qpn & BITS_PER_PAGE_MASK; + map = &qpt->map[qpn / BITS_PER_PAGE]; + max_scan = qpt->nmaps - !offset; + for (i = 0;;) { + if (unlikely(!map->page)) { + get_map_page(qpt, map); + if (unlikely(!map->page)) + break; + } + do { + if (!test_and_set_bit(offset, map->page)) { + qpt->last = qpn; + ret = qpn; + goto bail; + } + offset = find_next_offset(qpt, map, offset, + dd->n_krcv_queues); + qpn = mk_qpn(qpt, map, offset); + /* + * This test differs from alloc_pidmap(). + * If find_next_offset() does find a zero + * bit, we don't need to check for QPN + * wrapping around past our starting QPN. + * We just need to be sure we don't loop + * forever. + */ + } while (offset < BITS_PER_PAGE && qpn < QPN_MAX); + /* + * In order to keep the number of pages allocated to a + * minimum, we scan the all existing pages before increasing + * the size of the bitmap table. + */ + if (++i > max_scan) { + if (qpt->nmaps == QPNMAP_ENTRIES) + break; + map = &qpt->map[qpt->nmaps++]; + offset = 0; + } else if (map < &qpt->map[qpt->nmaps]) { + ++map; + offset = 0; + } else { + map = &qpt->map[0]; + offset = 2; + } + qpn = mk_qpn(qpt, map, offset); + } + + ret = -ENOMEM; + +bail: + return ret; +} + +static void free_qpn(struct qib_qpn_table *qpt, u32 qpn) +{ + struct qpn_map *map; + + map = qpt->map + qpn / BITS_PER_PAGE; + if (map->page) + clear_bit(qpn & BITS_PER_PAGE_MASK, map->page); +} + +static inline unsigned qpn_hash(struct qib_ibdev *dev, u32 qpn) +{ + return jhash_1word(qpn, dev->qp_rnd) & + (dev->qp_table_size - 1); +} + + +/* + * Put the QP into the hash table. + * The hash table holds a reference to the QP. + */ +static void insert_qp(struct qib_ibdev *dev, struct qib_qp *qp) +{ + struct qib_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); + unsigned long flags; + unsigned n = qpn_hash(dev, qp->ibqp.qp_num); + + atomic_inc(&qp->refcount); + spin_lock_irqsave(&dev->qpt_lock, flags); + + if (qp->ibqp.qp_num == 0) + rcu_assign_pointer(ibp->qp0, qp); + else if (qp->ibqp.qp_num == 1) + rcu_assign_pointer(ibp->qp1, qp); + else { + qp->next = dev->qp_table[n]; + rcu_assign_pointer(dev->qp_table[n], qp); + } + + spin_unlock_irqrestore(&dev->qpt_lock, flags); +} + +/* + * Remove the QP from the table so it can't be found asynchronously by + * the receive interrupt routine. + */ +static void remove_qp(struct qib_ibdev *dev, struct qib_qp *qp) +{ + struct qib_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); + unsigned n = qpn_hash(dev, qp->ibqp.qp_num); + unsigned long flags; + int removed = 1; + + spin_lock_irqsave(&dev->qpt_lock, flags); + + if (rcu_dereference_protected(ibp->qp0, + lockdep_is_held(&dev->qpt_lock)) == qp) { + rcu_assign_pointer(ibp->qp0, NULL); + } else if (rcu_dereference_protected(ibp->qp1, + lockdep_is_held(&dev->qpt_lock)) == qp) { + rcu_assign_pointer(ibp->qp1, NULL); + } else { + struct qib_qp *q; + struct qib_qp __rcu **qpp; + + removed = 0; + qpp = &dev->qp_table[n]; + for (; (q = rcu_dereference_protected(*qpp, + lockdep_is_held(&dev->qpt_lock))) != NULL; + qpp = &q->next) + if (q == qp) { + rcu_assign_pointer(*qpp, + rcu_dereference_protected(qp->next, + lockdep_is_held(&dev->qpt_lock))); + removed = 1; + break; + } + } + + spin_unlock_irqrestore(&dev->qpt_lock, flags); + if (removed) { + synchronize_rcu(); + atomic_dec(&qp->refcount); + } +} + +/** + * qib_free_all_qps - check for QPs still in use + * @qpt: the QP table to empty + * + * There should not be any QPs still in use. + * Free memory for table. + */ +unsigned qib_free_all_qps(struct qib_devdata *dd) +{ + struct qib_ibdev *dev = &dd->verbs_dev; + unsigned long flags; + struct qib_qp *qp; + unsigned n, qp_inuse = 0; + + for (n = 0; n < dd->num_pports; n++) { + struct qib_ibport *ibp = &dd->pport[n].ibport_data; + + if (!qib_mcast_tree_empty(ibp)) + qp_inuse++; + rcu_read_lock(); + if (rcu_dereference(ibp->qp0)) + qp_inuse++; + if (rcu_dereference(ibp->qp1)) + qp_inuse++; + rcu_read_unlock(); + } + + spin_lock_irqsave(&dev->qpt_lock, flags); + for (n = 0; n < dev->qp_table_size; n++) { + qp = rcu_dereference_protected(dev->qp_table[n], + lockdep_is_held(&dev->qpt_lock)); + rcu_assign_pointer(dev->qp_table[n], NULL); + + for (; qp; qp = rcu_dereference_protected(qp->next, + lockdep_is_held(&dev->qpt_lock))) + qp_inuse++; + } + spin_unlock_irqrestore(&dev->qpt_lock, flags); + synchronize_rcu(); + + return qp_inuse; +} + +/** + * qib_lookup_qpn - return the QP with the given QPN + * @qpt: the QP table + * @qpn: the QP number to look up + * + * The caller is responsible for decrementing the QP reference count + * when done. + */ +struct qib_qp *qib_lookup_qpn(struct qib_ibport *ibp, u32 qpn) +{ + struct qib_qp *qp = NULL; + + rcu_read_lock(); + if (unlikely(qpn <= 1)) { + if (qpn == 0) + qp = rcu_dereference(ibp->qp0); + else + qp = rcu_dereference(ibp->qp1); + if (qp) + atomic_inc(&qp->refcount); + } else { + struct qib_ibdev *dev = &ppd_from_ibp(ibp)->dd->verbs_dev; + unsigned n = qpn_hash(dev, qpn); + + for (qp = rcu_dereference(dev->qp_table[n]); qp; + qp = rcu_dereference(qp->next)) + if (qp->ibqp.qp_num == qpn) { + atomic_inc(&qp->refcount); + break; + } + } + rcu_read_unlock(); + return qp; +} + +/** + * qib_reset_qp - initialize the QP state to the reset state + * @qp: the QP to reset + * @type: the QP type + */ +static void qib_reset_qp(struct qib_qp *qp, enum ib_qp_type type) +{ + qp->remote_qpn = 0; + qp->qkey = 0; + qp->qp_access_flags = 0; + atomic_set(&qp->s_dma_busy, 0); + qp->s_flags &= QIB_S_SIGNAL_REQ_WR; + qp->s_hdrwords = 0; + qp->s_wqe = NULL; + qp->s_draining = 0; + qp->s_next_psn = 0; + qp->s_last_psn = 0; + qp->s_sending_psn = 0; + qp->s_sending_hpsn = 0; + qp->s_psn = 0; + qp->r_psn = 0; + qp->r_msn = 0; + if (type == IB_QPT_RC) { + qp->s_state = IB_OPCODE_RC_SEND_LAST; + qp->r_state = IB_OPCODE_RC_SEND_LAST; + } else { + qp->s_state = IB_OPCODE_UC_SEND_LAST; + qp->r_state = IB_OPCODE_UC_SEND_LAST; + } + qp->s_ack_state = IB_OPCODE_RC_ACKNOWLEDGE; + qp->r_nak_state = 0; + qp->r_aflags = 0; + qp->r_flags = 0; + qp->s_head = 0; + qp->s_tail = 0; + qp->s_cur = 0; + qp->s_acked = 0; + qp->s_last = 0; + qp->s_ssn = 1; + qp->s_lsn = 0; + qp->s_mig_state = IB_MIG_MIGRATED; + memset(qp->s_ack_queue, 0, sizeof(qp->s_ack_queue)); + qp->r_head_ack_queue = 0; + qp->s_tail_ack_queue = 0; + qp->s_num_rd_atomic = 0; + if (qp->r_rq.wq) { + qp->r_rq.wq->head = 0; + qp->r_rq.wq->tail = 0; + } + qp->r_sge.num_sge = 0; +} + +static void clear_mr_refs(struct qib_qp *qp, int clr_sends) +{ + unsigned n; + + if (test_and_clear_bit(QIB_R_REWIND_SGE, &qp->r_aflags)) + qib_put_ss(&qp->s_rdma_read_sge); + + qib_put_ss(&qp->r_sge); + + if (clr_sends) { + while (qp->s_last != qp->s_head) { + struct qib_swqe *wqe = get_swqe_ptr(qp, qp->s_last); + unsigned i; + + for (i = 0; i < wqe->wr.num_sge; i++) { + struct qib_sge *sge = &wqe->sg_list[i]; + + qib_put_mr(sge->mr); + } + if (qp->ibqp.qp_type == IB_QPT_UD || + qp->ibqp.qp_type == IB_QPT_SMI || + qp->ibqp.qp_type == IB_QPT_GSI) + atomic_dec(&to_iah(wqe->wr.wr.ud.ah)->refcount); + if (++qp->s_last >= qp->s_size) + qp->s_last = 0; + } + if (qp->s_rdma_mr) { + qib_put_mr(qp->s_rdma_mr); + qp->s_rdma_mr = NULL; + } + } + + if (qp->ibqp.qp_type != IB_QPT_RC) + return; + + for (n = 0; n < ARRAY_SIZE(qp->s_ack_queue); n++) { + struct qib_ack_entry *e = &qp->s_ack_queue[n]; + + if (e->opcode == IB_OPCODE_RC_RDMA_READ_REQUEST && + e->rdma_sge.mr) { + qib_put_mr(e->rdma_sge.mr); + e->rdma_sge.mr = NULL; + } + } +} + +/** + * qib_error_qp - put a QP into the error state + * @qp: the QP to put into the error state + * @err: the receive completion error to signal if a RWQE is active + * + * Flushes both send and receive work queues. + * Returns true if last WQE event should be generated. + * The QP r_lock and s_lock should be held and interrupts disabled. + * If we are already in error state, just return. + */ +int qib_error_qp(struct qib_qp *qp, enum ib_wc_status err) +{ + struct qib_ibdev *dev = to_idev(qp->ibqp.device); + struct ib_wc wc; + int ret = 0; + + if (qp->state == IB_QPS_ERR || qp->state == IB_QPS_RESET) + goto bail; + + qp->state = IB_QPS_ERR; + + if (qp->s_flags & (QIB_S_TIMER | QIB_S_WAIT_RNR)) { + qp->s_flags &= ~(QIB_S_TIMER | QIB_S_WAIT_RNR); + del_timer(&qp->s_timer); + } + + if (qp->s_flags & QIB_S_ANY_WAIT_SEND) + qp->s_flags &= ~QIB_S_ANY_WAIT_SEND; + + spin_lock(&dev->pending_lock); + if (!list_empty(&qp->iowait) && !(qp->s_flags & QIB_S_BUSY)) { + qp->s_flags &= ~QIB_S_ANY_WAIT_IO; + list_del_init(&qp->iowait); + } + spin_unlock(&dev->pending_lock); + + if (!(qp->s_flags & QIB_S_BUSY)) { + qp->s_hdrwords = 0; + if (qp->s_rdma_mr) { + qib_put_mr(qp->s_rdma_mr); + qp->s_rdma_mr = NULL; + } + if (qp->s_tx) { + qib_put_txreq(qp->s_tx); + qp->s_tx = NULL; + } + } + + /* Schedule the sending tasklet to drain the send work queue. */ + if (qp->s_last != qp->s_head) + qib_schedule_send(qp); + + clear_mr_refs(qp, 0); + + memset(&wc, 0, sizeof(wc)); + wc.qp = &qp->ibqp; + wc.opcode = IB_WC_RECV; + + if (test_and_clear_bit(QIB_R_WRID_VALID, &qp->r_aflags)) { + wc.wr_id = qp->r_wr_id; + wc.status = err; + qib_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, 1); + } + wc.status = IB_WC_WR_FLUSH_ERR; + + if (qp->r_rq.wq) { + struct qib_rwq *wq; + u32 head; + u32 tail; + + spin_lock(&qp->r_rq.lock); + + /* sanity check pointers before trusting them */ + wq = qp->r_rq.wq; + head = wq->head; + if (head >= qp->r_rq.size) + head = 0; + tail = wq->tail; + if (tail >= qp->r_rq.size) + tail = 0; + while (tail != head) { + wc.wr_id = get_rwqe_ptr(&qp->r_rq, tail)->wr_id; + if (++tail >= qp->r_rq.size) + tail = 0; + qib_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, 1); + } + wq->tail = tail; + + spin_unlock(&qp->r_rq.lock); + } else if (qp->ibqp.event_handler) + ret = 1; + +bail: + return ret; +} + +/** + * qib_modify_qp - modify the attributes of a queue pair + * @ibqp: the queue pair who's attributes we're modifying + * @attr: the new attributes + * @attr_mask: the mask of attributes to modify + * @udata: user data for libibverbs.so + * + * Returns 0 on success, otherwise returns an errno. + */ +int qib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata) +{ + struct qib_ibdev *dev = to_idev(ibqp->device); + struct qib_qp *qp = to_iqp(ibqp); + enum ib_qp_state cur_state, new_state; + struct ib_event ev; + int lastwqe = 0; + int mig = 0; + int ret; + u32 pmtu = 0; /* for gcc warning only */ + + spin_lock_irq(&qp->r_lock); + spin_lock(&qp->s_lock); + + cur_state = attr_mask & IB_QP_CUR_STATE ? + attr->cur_qp_state : qp->state; + new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state; + + if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, + attr_mask, IB_LINK_LAYER_UNSPECIFIED)) + goto inval; + + if (attr_mask & IB_QP_AV) { + if (attr->ah_attr.dlid >= QIB_MULTICAST_LID_BASE) + goto inval; + if (qib_check_ah(qp->ibqp.device, &attr->ah_attr)) + goto inval; + } + + if (attr_mask & IB_QP_ALT_PATH) { + if (attr->alt_ah_attr.dlid >= QIB_MULTICAST_LID_BASE) + goto inval; + if (qib_check_ah(qp->ibqp.device, &attr->alt_ah_attr)) + goto inval; + if (attr->alt_pkey_index >= qib_get_npkeys(dd_from_dev(dev))) + goto inval; + } + + if (attr_mask & IB_QP_PKEY_INDEX) + if (attr->pkey_index >= qib_get_npkeys(dd_from_dev(dev))) + goto inval; + + if (attr_mask & IB_QP_MIN_RNR_TIMER) + if (attr->min_rnr_timer > 31) + goto inval; + + if (attr_mask & IB_QP_PORT) + if (qp->ibqp.qp_type == IB_QPT_SMI || + qp->ibqp.qp_type == IB_QPT_GSI || + attr->port_num == 0 || + attr->port_num > ibqp->device->phys_port_cnt) + goto inval; + + if (attr_mask & IB_QP_DEST_QPN) + if (attr->dest_qp_num > QIB_QPN_MASK) + goto inval; + + if (attr_mask & IB_QP_RETRY_CNT) + if (attr->retry_cnt > 7) + goto inval; + + if (attr_mask & IB_QP_RNR_RETRY) + if (attr->rnr_retry > 7) + goto inval; + + /* + * Don't allow invalid path_mtu values. OK to set greater + * than the active mtu (or even the max_cap, if we have tuned + * that to a small mtu. We'll set qp->path_mtu + * to the lesser of requested attribute mtu and active, + * for packetizing messages. + * Note that the QP port has to be set in INIT and MTU in RTR. + */ + if (attr_mask & IB_QP_PATH_MTU) { + struct qib_devdata *dd = dd_from_dev(dev); + int mtu, pidx = qp->port_num - 1; + + mtu = ib_mtu_enum_to_int(attr->path_mtu); + if (mtu == -1) + goto inval; + if (mtu > dd->pport[pidx].ibmtu) { + switch (dd->pport[pidx].ibmtu) { + case 4096: + pmtu = IB_MTU_4096; + break; + case 2048: + pmtu = IB_MTU_2048; + break; + case 1024: + pmtu = IB_MTU_1024; + break; + case 512: + pmtu = IB_MTU_512; + break; + case 256: + pmtu = IB_MTU_256; + break; + default: + pmtu = IB_MTU_2048; + } + } else + pmtu = attr->path_mtu; + } + + if (attr_mask & IB_QP_PATH_MIG_STATE) { + if (attr->path_mig_state == IB_MIG_REARM) { + if (qp->s_mig_state == IB_MIG_ARMED) + goto inval; + if (new_state != IB_QPS_RTS) + goto inval; + } else if (attr->path_mig_state == IB_MIG_MIGRATED) { + if (qp->s_mig_state == IB_MIG_REARM) + goto inval; + if (new_state != IB_QPS_RTS && new_state != IB_QPS_SQD) + goto inval; + if (qp->s_mig_state == IB_MIG_ARMED) + mig = 1; + } else + goto inval; + } + + if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) + if (attr->max_dest_rd_atomic > QIB_MAX_RDMA_ATOMIC) + goto inval; + + switch (new_state) { + case IB_QPS_RESET: + if (qp->state != IB_QPS_RESET) { + qp->state = IB_QPS_RESET; + spin_lock(&dev->pending_lock); + if (!list_empty(&qp->iowait)) + list_del_init(&qp->iowait); + spin_unlock(&dev->pending_lock); + qp->s_flags &= ~(QIB_S_TIMER | QIB_S_ANY_WAIT); + spin_unlock(&qp->s_lock); + spin_unlock_irq(&qp->r_lock); + /* Stop the sending work queue and retry timer */ + cancel_work_sync(&qp->s_work); + del_timer_sync(&qp->s_timer); + wait_event(qp->wait_dma, !atomic_read(&qp->s_dma_busy)); + if (qp->s_tx) { + qib_put_txreq(qp->s_tx); + qp->s_tx = NULL; + } + remove_qp(dev, qp); + wait_event(qp->wait, !atomic_read(&qp->refcount)); + spin_lock_irq(&qp->r_lock); + spin_lock(&qp->s_lock); + clear_mr_refs(qp, 1); + qib_reset_qp(qp, ibqp->qp_type); + } + break; + + case IB_QPS_RTR: + /* Allow event to retrigger if QP set to RTR more than once */ + qp->r_flags &= ~QIB_R_COMM_EST; + qp->state = new_state; + break; + + case IB_QPS_SQD: + qp->s_draining = qp->s_last != qp->s_cur; + qp->state = new_state; + break; + + case IB_QPS_SQE: + if (qp->ibqp.qp_type == IB_QPT_RC) + goto inval; + qp->state = new_state; + break; + + case IB_QPS_ERR: + lastwqe = qib_error_qp(qp, IB_WC_WR_FLUSH_ERR); + break; + + default: + qp->state = new_state; + break; + } + + if (attr_mask & IB_QP_PKEY_INDEX) + qp->s_pkey_index = attr->pkey_index; + + if (attr_mask & IB_QP_PORT) + qp->port_num = attr->port_num; + + if (attr_mask & IB_QP_DEST_QPN) + qp->remote_qpn = attr->dest_qp_num; + + if (attr_mask & IB_QP_SQ_PSN) { + qp->s_next_psn = attr->sq_psn & QIB_PSN_MASK; + qp->s_psn = qp->s_next_psn; + qp->s_sending_psn = qp->s_next_psn; + qp->s_last_psn = qp->s_next_psn - 1; + qp->s_sending_hpsn = qp->s_last_psn; + } + + if (attr_mask & IB_QP_RQ_PSN) + qp->r_psn = attr->rq_psn & QIB_PSN_MASK; + + if (attr_mask & IB_QP_ACCESS_FLAGS) + qp->qp_access_flags = attr->qp_access_flags; + + if (attr_mask & IB_QP_AV) { + qp->remote_ah_attr = attr->ah_attr; + qp->s_srate = attr->ah_attr.static_rate; + } + + if (attr_mask & IB_QP_ALT_PATH) { + qp->alt_ah_attr = attr->alt_ah_attr; + qp->s_alt_pkey_index = attr->alt_pkey_index; + } + + if (attr_mask & IB_QP_PATH_MIG_STATE) { + qp->s_mig_state = attr->path_mig_state; + if (mig) { + qp->remote_ah_attr = qp->alt_ah_attr; + qp->port_num = qp->alt_ah_attr.port_num; + qp->s_pkey_index = qp->s_alt_pkey_index; + } + } + + if (attr_mask & IB_QP_PATH_MTU) { + qp->path_mtu = pmtu; + qp->pmtu = ib_mtu_enum_to_int(pmtu); + } + + if (attr_mask & IB_QP_RETRY_CNT) { + qp->s_retry_cnt = attr->retry_cnt; + qp->s_retry = attr->retry_cnt; + } + + if (attr_mask & IB_QP_RNR_RETRY) { + qp->s_rnr_retry_cnt = attr->rnr_retry; + qp->s_rnr_retry = attr->rnr_retry; + } + + if (attr_mask & IB_QP_MIN_RNR_TIMER) + qp->r_min_rnr_timer = attr->min_rnr_timer; + + if (attr_mask & IB_QP_TIMEOUT) { + qp->timeout = attr->timeout; + qp->timeout_jiffies = + usecs_to_jiffies((4096UL * (1UL << qp->timeout)) / + 1000UL); + } + + if (attr_mask & IB_QP_QKEY) + qp->qkey = attr->qkey; + + if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) + qp->r_max_rd_atomic = attr->max_dest_rd_atomic; + + if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC) + qp->s_max_rd_atomic = attr->max_rd_atomic; + + spin_unlock(&qp->s_lock); + spin_unlock_irq(&qp->r_lock); + + if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) + insert_qp(dev, qp); + + if (lastwqe) { + ev.device = qp->ibqp.device; + ev.element.qp = &qp->ibqp; + ev.event = IB_EVENT_QP_LAST_WQE_REACHED; + qp->ibqp.event_handler(&ev, qp->ibqp.qp_context); + } + if (mig) { + ev.device = qp->ibqp.device; + ev.element.qp = &qp->ibqp; + ev.event = IB_EVENT_PATH_MIG; + qp->ibqp.event_handler(&ev, qp->ibqp.qp_context); + } + ret = 0; + goto bail; + +inval: + spin_unlock(&qp->s_lock); + spin_unlock_irq(&qp->r_lock); + ret = -EINVAL; + +bail: + return ret; +} + +int qib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_qp_init_attr *init_attr) +{ + struct qib_qp *qp = to_iqp(ibqp); + + attr->qp_state = qp->state; + attr->cur_qp_state = attr->qp_state; + attr->path_mtu = qp->path_mtu; + attr->path_mig_state = qp->s_mig_state; + attr->qkey = qp->qkey; + attr->rq_psn = qp->r_psn & QIB_PSN_MASK; + attr->sq_psn = qp->s_next_psn & QIB_PSN_MASK; + attr->dest_qp_num = qp->remote_qpn; + attr->qp_access_flags = qp->qp_access_flags; + attr->cap.max_send_wr = qp->s_size - 1; + attr->cap.max_recv_wr = qp->ibqp.srq ? 0 : qp->r_rq.size - 1; + attr->cap.max_send_sge = qp->s_max_sge; + attr->cap.max_recv_sge = qp->r_rq.max_sge; + attr->cap.max_inline_data = 0; + attr->ah_attr = qp->remote_ah_attr; + attr->alt_ah_attr = qp->alt_ah_attr; + attr->pkey_index = qp->s_pkey_index; + attr->alt_pkey_index = qp->s_alt_pkey_index; + attr->en_sqd_async_notify = 0; + attr->sq_draining = qp->s_draining; + attr->max_rd_atomic = qp->s_max_rd_atomic; + attr->max_dest_rd_atomic = qp->r_max_rd_atomic; + attr->min_rnr_timer = qp->r_min_rnr_timer; + attr->port_num = qp->port_num; + attr->timeout = qp->timeout; + attr->retry_cnt = qp->s_retry_cnt; + attr->rnr_retry = qp->s_rnr_retry_cnt; + attr->alt_port_num = qp->alt_ah_attr.port_num; + attr->alt_timeout = qp->alt_timeout; + + init_attr->event_handler = qp->ibqp.event_handler; + init_attr->qp_context = qp->ibqp.qp_context; + init_attr->send_cq = qp->ibqp.send_cq; + init_attr->recv_cq = qp->ibqp.recv_cq; + init_attr->srq = qp->ibqp.srq; + init_attr->cap = attr->cap; + if (qp->s_flags & QIB_S_SIGNAL_REQ_WR) + init_attr->sq_sig_type = IB_SIGNAL_REQ_WR; + else + init_attr->sq_sig_type = IB_SIGNAL_ALL_WR; + init_attr->qp_type = qp->ibqp.qp_type; + init_attr->port_num = qp->port_num; + return 0; +} + +/** + * qib_compute_aeth - compute the AETH (syndrome + MSN) + * @qp: the queue pair to compute the AETH for + * + * Returns the AETH. + */ +__be32 qib_compute_aeth(struct qib_qp *qp) +{ + u32 aeth = qp->r_msn & QIB_MSN_MASK; + + if (qp->ibqp.srq) { + /* + * Shared receive queues don't generate credits. + * Set the credit field to the invalid value. + */ + aeth |= QIB_AETH_CREDIT_INVAL << QIB_AETH_CREDIT_SHIFT; + } else { + u32 min, max, x; + u32 credits; + struct qib_rwq *wq = qp->r_rq.wq; + u32 head; + u32 tail; + + /* sanity check pointers before trusting them */ + head = wq->head; + if (head >= qp->r_rq.size) + head = 0; + tail = wq->tail; + if (tail >= qp->r_rq.size) + tail = 0; + /* + * Compute the number of credits available (RWQEs). + * XXX Not holding the r_rq.lock here so there is a small + * chance that the pair of reads are not atomic. + */ + credits = head - tail; + if ((int)credits < 0) + credits += qp->r_rq.size; + /* + * Binary search the credit table to find the code to + * use. + */ + min = 0; + max = 31; + for (;;) { + x = (min + max) / 2; + if (credit_table[x] == credits) + break; + if (credit_table[x] > credits) + max = x; + else if (min == x) + break; + else + min = x; + } + aeth |= x << QIB_AETH_CREDIT_SHIFT; + } + return cpu_to_be32(aeth); +} + +/** + * qib_create_qp - create a queue pair for a device + * @ibpd: the protection domain who's device we create the queue pair for + * @init_attr: the attributes of the queue pair + * @udata: user data for libibverbs.so + * + * Returns the queue pair on success, otherwise returns an errno. + * + * Called by the ib_create_qp() core verbs function. + */ +struct ib_qp *qib_create_qp(struct ib_pd *ibpd, + struct ib_qp_init_attr *init_attr, + struct ib_udata *udata) +{ + struct qib_qp *qp; + int err; + struct qib_swqe *swq = NULL; + struct qib_ibdev *dev; + struct qib_devdata *dd; + size_t sz; + size_t sg_list_sz; + struct ib_qp *ret; + + if (init_attr->cap.max_send_sge > ib_qib_max_sges || + init_attr->cap.max_send_wr > ib_qib_max_qp_wrs || + init_attr->create_flags) { + ret = ERR_PTR(-EINVAL); + goto bail; + } + + /* Check receive queue parameters if no SRQ is specified. */ + if (!init_attr->srq) { + if (init_attr->cap.max_recv_sge > ib_qib_max_sges || + init_attr->cap.max_recv_wr > ib_qib_max_qp_wrs) { + ret = ERR_PTR(-EINVAL); + goto bail; + } + if (init_attr->cap.max_send_sge + + init_attr->cap.max_send_wr + + init_attr->cap.max_recv_sge + + init_attr->cap.max_recv_wr == 0) { + ret = ERR_PTR(-EINVAL); + goto bail; + } + } + + switch (init_attr->qp_type) { + case IB_QPT_SMI: + case IB_QPT_GSI: + if (init_attr->port_num == 0 || + init_attr->port_num > ibpd->device->phys_port_cnt) { + ret = ERR_PTR(-EINVAL); + goto bail; + } + case IB_QPT_UC: + case IB_QPT_RC: + case IB_QPT_UD: + sz = sizeof(struct qib_sge) * + init_attr->cap.max_send_sge + + sizeof(struct qib_swqe); + swq = vmalloc((init_attr->cap.max_send_wr + 1) * sz); + if (swq == NULL) { + ret = ERR_PTR(-ENOMEM); + goto bail; + } + sz = sizeof(*qp); + sg_list_sz = 0; + if (init_attr->srq) { + struct qib_srq *srq = to_isrq(init_attr->srq); + + if (srq->rq.max_sge > 1) + sg_list_sz = sizeof(*qp->r_sg_list) * + (srq->rq.max_sge - 1); + } else if (init_attr->cap.max_recv_sge > 1) + sg_list_sz = sizeof(*qp->r_sg_list) * + (init_attr->cap.max_recv_sge - 1); + qp = kzalloc(sz + sg_list_sz, GFP_KERNEL); + if (!qp) { + ret = ERR_PTR(-ENOMEM); + goto bail_swq; + } + RCU_INIT_POINTER(qp->next, NULL); + qp->s_hdr = kzalloc(sizeof(*qp->s_hdr), GFP_KERNEL); + if (!qp->s_hdr) { + ret = ERR_PTR(-ENOMEM); + goto bail_qp; + } + qp->timeout_jiffies = + usecs_to_jiffies((4096UL * (1UL << qp->timeout)) / + 1000UL); + if (init_attr->srq) + sz = 0; + else { + qp->r_rq.size = init_attr->cap.max_recv_wr + 1; + qp->r_rq.max_sge = init_attr->cap.max_recv_sge; + sz = (sizeof(struct ib_sge) * qp->r_rq.max_sge) + + sizeof(struct qib_rwqe); + qp->r_rq.wq = vmalloc_user(sizeof(struct qib_rwq) + + qp->r_rq.size * sz); + if (!qp->r_rq.wq) { + ret = ERR_PTR(-ENOMEM); + goto bail_qp; + } + } + + /* + * ib_create_qp() will initialize qp->ibqp + * except for qp->ibqp.qp_num. + */ + spin_lock_init(&qp->r_lock); + spin_lock_init(&qp->s_lock); + spin_lock_init(&qp->r_rq.lock); + atomic_set(&qp->refcount, 0); + init_waitqueue_head(&qp->wait); + init_waitqueue_head(&qp->wait_dma); + init_timer(&qp->s_timer); + qp->s_timer.data = (unsigned long)qp; + INIT_WORK(&qp->s_work, qib_do_send); + INIT_LIST_HEAD(&qp->iowait); + INIT_LIST_HEAD(&qp->rspwait); + qp->state = IB_QPS_RESET; + qp->s_wq = swq; + qp->s_size = init_attr->cap.max_send_wr + 1; + qp->s_max_sge = init_attr->cap.max_send_sge; + if (init_attr->sq_sig_type == IB_SIGNAL_REQ_WR) + qp->s_flags = QIB_S_SIGNAL_REQ_WR; + dev = to_idev(ibpd->device); + dd = dd_from_dev(dev); + err = alloc_qpn(dd, &dev->qpn_table, init_attr->qp_type, + init_attr->port_num); + if (err < 0) { + ret = ERR_PTR(err); + vfree(qp->r_rq.wq); + goto bail_qp; + } + qp->ibqp.qp_num = err; + qp->port_num = init_attr->port_num; + qib_reset_qp(qp, init_attr->qp_type); + break; + + default: + /* Don't support raw QPs */ + ret = ERR_PTR(-ENOSYS); + goto bail; + } + + init_attr->cap.max_inline_data = 0; + + /* + * Return the address of the RWQ as the offset to mmap. + * See qib_mmap() for details. + */ + if (udata && udata->outlen >= sizeof(__u64)) { + if (!qp->r_rq.wq) { + __u64 offset = 0; + + err = ib_copy_to_udata(udata, &offset, + sizeof(offset)); + if (err) { + ret = ERR_PTR(err); + goto bail_ip; + } + } else { + u32 s = sizeof(struct qib_rwq) + qp->r_rq.size * sz; + + qp->ip = qib_create_mmap_info(dev, s, + ibpd->uobject->context, + qp->r_rq.wq); + if (!qp->ip) { + ret = ERR_PTR(-ENOMEM); + goto bail_ip; + } + + err = ib_copy_to_udata(udata, &(qp->ip->offset), + sizeof(qp->ip->offset)); + if (err) { + ret = ERR_PTR(err); + goto bail_ip; + } + } + } + + spin_lock(&dev->n_qps_lock); + if (dev->n_qps_allocated == ib_qib_max_qps) { + spin_unlock(&dev->n_qps_lock); + ret = ERR_PTR(-ENOMEM); + goto bail_ip; + } + + dev->n_qps_allocated++; + spin_unlock(&dev->n_qps_lock); + + if (qp->ip) { + spin_lock_irq(&dev->pending_lock); + list_add(&qp->ip->pending_mmaps, &dev->pending_mmaps); + spin_unlock_irq(&dev->pending_lock); + } + + ret = &qp->ibqp; + goto bail; + +bail_ip: + if (qp->ip) + kref_put(&qp->ip->ref, qib_release_mmap_info); + else + vfree(qp->r_rq.wq); + free_qpn(&dev->qpn_table, qp->ibqp.qp_num); +bail_qp: + kfree(qp->s_hdr); + kfree(qp); +bail_swq: + vfree(swq); +bail: + return ret; +} + +/** + * qib_destroy_qp - destroy a queue pair + * @ibqp: the queue pair to destroy + * + * Returns 0 on success. + * + * Note that this can be called while the QP is actively sending or + * receiving! + */ +int qib_destroy_qp(struct ib_qp *ibqp) +{ + struct qib_qp *qp = to_iqp(ibqp); + struct qib_ibdev *dev = to_idev(ibqp->device); + + /* Make sure HW and driver activity is stopped. */ + spin_lock_irq(&qp->s_lock); + if (qp->state != IB_QPS_RESET) { + qp->state = IB_QPS_RESET; + spin_lock(&dev->pending_lock); + if (!list_empty(&qp->iowait)) + list_del_init(&qp->iowait); + spin_unlock(&dev->pending_lock); + qp->s_flags &= ~(QIB_S_TIMER | QIB_S_ANY_WAIT); + spin_unlock_irq(&qp->s_lock); + cancel_work_sync(&qp->s_work); + del_timer_sync(&qp->s_timer); + wait_event(qp->wait_dma, !atomic_read(&qp->s_dma_busy)); + if (qp->s_tx) { + qib_put_txreq(qp->s_tx); + qp->s_tx = NULL; + } + remove_qp(dev, qp); + wait_event(qp->wait, !atomic_read(&qp->refcount)); + clear_mr_refs(qp, 1); + } else + spin_unlock_irq(&qp->s_lock); + + /* all user's cleaned up, mark it available */ + free_qpn(&dev->qpn_table, qp->ibqp.qp_num); + spin_lock(&dev->n_qps_lock); + dev->n_qps_allocated--; + spin_unlock(&dev->n_qps_lock); + + if (qp->ip) + kref_put(&qp->ip->ref, qib_release_mmap_info); + else + vfree(qp->r_rq.wq); + vfree(qp->s_wq); + kfree(qp->s_hdr); + kfree(qp); + return 0; +} + +/** + * qib_init_qpn_table - initialize the QP number table for a device + * @qpt: the QPN table + */ +void qib_init_qpn_table(struct qib_devdata *dd, struct qib_qpn_table *qpt) +{ + spin_lock_init(&qpt->lock); + qpt->last = 1; /* start with QPN 2 */ + qpt->nmaps = 1; + qpt->mask = dd->qpn_mask; +} + +/** + * qib_free_qpn_table - free the QP number table for a device + * @qpt: the QPN table + */ +void qib_free_qpn_table(struct qib_qpn_table *qpt) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(qpt->map); i++) + if (qpt->map[i].page) + free_page((unsigned long) qpt->map[i].page); +} + +/** + * qib_get_credit - flush the send work queue of a QP + * @qp: the qp who's send work queue to flush + * @aeth: the Acknowledge Extended Transport Header + * + * The QP s_lock should be held. + */ +void qib_get_credit(struct qib_qp *qp, u32 aeth) +{ + u32 credit = (aeth >> QIB_AETH_CREDIT_SHIFT) & QIB_AETH_CREDIT_MASK; + + /* + * If the credit is invalid, we can send + * as many packets as we like. Otherwise, we have to + * honor the credit field. + */ + if (credit == QIB_AETH_CREDIT_INVAL) { + if (!(qp->s_flags & QIB_S_UNLIMITED_CREDIT)) { + qp->s_flags |= QIB_S_UNLIMITED_CREDIT; + if (qp->s_flags & QIB_S_WAIT_SSN_CREDIT) { + qp->s_flags &= ~QIB_S_WAIT_SSN_CREDIT; + qib_schedule_send(qp); + } + } + } else if (!(qp->s_flags & QIB_S_UNLIMITED_CREDIT)) { + /* Compute new LSN (i.e., MSN + credit) */ + credit = (aeth + credit_table[credit]) & QIB_MSN_MASK; + if (qib_cmp24(credit, qp->s_lsn) > 0) { + qp->s_lsn = credit; + if (qp->s_flags & QIB_S_WAIT_SSN_CREDIT) { + qp->s_flags &= ~QIB_S_WAIT_SSN_CREDIT; + qib_schedule_send(qp); + } + } + } +} + +#ifdef CONFIG_DEBUG_FS + +struct qib_qp_iter { + struct qib_ibdev *dev; + struct qib_qp *qp; + int n; +}; + +struct qib_qp_iter *qib_qp_iter_init(struct qib_ibdev *dev) +{ + struct qib_qp_iter *iter; + + iter = kzalloc(sizeof(*iter), GFP_KERNEL); + if (!iter) + return NULL; + + iter->dev = dev; + if (qib_qp_iter_next(iter)) { + kfree(iter); + return NULL; + } + + return iter; +} + +int qib_qp_iter_next(struct qib_qp_iter *iter) +{ + struct qib_ibdev *dev = iter->dev; + int n = iter->n; + int ret = 1; + struct qib_qp *pqp = iter->qp; + struct qib_qp *qp; + + for (; n < dev->qp_table_size; n++) { + if (pqp) + qp = rcu_dereference(pqp->next); + else + qp = rcu_dereference(dev->qp_table[n]); + pqp = qp; + if (qp) { + iter->qp = qp; + iter->n = n; + return 0; + } + } + return ret; +} + +static const char * const qp_type_str[] = { + "SMI", "GSI", "RC", "UC", "UD", +}; + +void qib_qp_iter_print(struct seq_file *s, struct qib_qp_iter *iter) +{ + struct qib_swqe *wqe; + struct qib_qp *qp = iter->qp; + + wqe = get_swqe_ptr(qp, qp->s_last); + seq_printf(s, + "N %d QP%u %s %u %u %u f=%x %u %u %u %u %u PSN %x %x %x %x %x (%u %u %u %u %u %u) QP%u LID %x\n", + iter->n, + qp->ibqp.qp_num, + qp_type_str[qp->ibqp.qp_type], + qp->state, + wqe->wr.opcode, + qp->s_hdrwords, + qp->s_flags, + atomic_read(&qp->s_dma_busy), + !list_empty(&qp->iowait), + qp->timeout, + wqe->ssn, + qp->s_lsn, + qp->s_last_psn, + qp->s_psn, qp->s_next_psn, + qp->s_sending_psn, qp->s_sending_hpsn, + qp->s_last, qp->s_acked, qp->s_cur, + qp->s_tail, qp->s_head, qp->s_size, + qp->remote_qpn, + qp->remote_ah_attr.dlid); +} + +#endif diff --git a/drivers/infiniband/hw/qib/qib_qsfp.c b/drivers/infiniband/hw/qib/qib_qsfp.c new file mode 100644 index 0000000..fa71b1e --- /dev/null +++ b/drivers/infiniband/hw/qib/qib_qsfp.c @@ -0,0 +1,556 @@ +/* + * Copyright (c) 2006, 2007, 2008, 2009 QLogic Corporation. All rights reserved. + * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include + +#include "qib.h" +#include "qib_qsfp.h" + +/* + * QSFP support for ib_qib driver, using "Two Wire Serial Interface" driver + * in qib_twsi.c + */ +#define QSFP_MAX_RETRY 4 + +static int qsfp_read(struct qib_pportdata *ppd, int addr, void *bp, int len) +{ + struct qib_devdata *dd = ppd->dd; + u32 out, mask; + int ret, cnt, pass = 0; + int stuck = 0; + u8 *buff = bp; + + ret = mutex_lock_interruptible(&dd->eep_lock); + if (ret) + goto no_unlock; + + if (dd->twsi_eeprom_dev == QIB_TWSI_NO_DEV) { + ret = -ENXIO; + goto bail; + } + + /* + * We presume, if we are called at all, that this board has + * QSFP. This is on the same i2c chain as the legacy parts, + * but only responds if the module is selected via GPIO pins. + * Further, there are very long setup and hold requirements + * on MODSEL. + */ + mask = QSFP_GPIO_MOD_SEL_N | QSFP_GPIO_MOD_RST_N | QSFP_GPIO_LP_MODE; + out = QSFP_GPIO_MOD_RST_N | QSFP_GPIO_LP_MODE; + if (ppd->hw_pidx) { + mask <<= QSFP_GPIO_PORT2_SHIFT; + out <<= QSFP_GPIO_PORT2_SHIFT; + } + + dd->f_gpio_mod(dd, out, mask, mask); + + /* + * Module could take up to 2 Msec to respond to MOD_SEL, and there + * is no way to tell if it is ready, so we must wait. + */ + msleep(2); + + /* Make sure TWSI bus is in sane state. */ + ret = qib_twsi_reset(dd); + if (ret) { + qib_dev_porterr(dd, ppd->port, + "QSFP interface Reset for read failed\n"); + ret = -EIO; + stuck = 1; + goto deselect; + } + + /* All QSFP modules are at A0 */ + + cnt = 0; + while (cnt < len) { + unsigned in_page; + int wlen = len - cnt; + in_page = addr % QSFP_PAGESIZE; + if ((in_page + wlen) > QSFP_PAGESIZE) + wlen = QSFP_PAGESIZE - in_page; + ret = qib_twsi_blk_rd(dd, QSFP_DEV, addr, buff + cnt, wlen); + /* Some QSFP's fail first try. Retry as experiment */ + if (ret && cnt == 0 && ++pass < QSFP_MAX_RETRY) + continue; + if (ret) { + /* qib_twsi_blk_rd() 1 for error, else 0 */ + ret = -EIO; + goto deselect; + } + addr += wlen; + cnt += wlen; + } + ret = cnt; + +deselect: + /* + * Module could take up to 10 uSec after transfer before + * ready to respond to MOD_SEL negation, and there is no way + * to tell if it is ready, so we must wait. + */ + udelay(10); + /* set QSFP MODSEL, RST. LP all high */ + dd->f_gpio_mod(dd, mask, mask, mask); + + /* + * Module could take up to 2 Msec to respond to MOD_SEL + * going away, and there is no way to tell if it is ready. + * so we must wait. + */ + if (stuck) + qib_dev_err(dd, "QSFP interface bus stuck non-idle\n"); + + if (pass >= QSFP_MAX_RETRY && ret) + qib_dev_porterr(dd, ppd->port, "QSFP failed even retrying\n"); + else if (pass) + qib_dev_porterr(dd, ppd->port, "QSFP retries: %d\n", pass); + + msleep(2); + +bail: + mutex_unlock(&dd->eep_lock); + +no_unlock: + return ret; +} + +/* + * qsfp_write + * We do not ordinarily write the QSFP, but this is needed to select + * the page on non-flat QSFPs, and possibly later unusual cases + */ +static int qib_qsfp_write(struct qib_pportdata *ppd, int addr, void *bp, + int len) +{ + struct qib_devdata *dd = ppd->dd; + u32 out, mask; + int ret, cnt; + u8 *buff = bp; + + ret = mutex_lock_interruptible(&dd->eep_lock); + if (ret) + goto no_unlock; + + if (dd->twsi_eeprom_dev == QIB_TWSI_NO_DEV) { + ret = -ENXIO; + goto bail; + } + + /* + * We presume, if we are called at all, that this board has + * QSFP. This is on the same i2c chain as the legacy parts, + * but only responds if the module is selected via GPIO pins. + * Further, there are very long setup and hold requirements + * on MODSEL. + */ + mask = QSFP_GPIO_MOD_SEL_N | QSFP_GPIO_MOD_RST_N | QSFP_GPIO_LP_MODE; + out = QSFP_GPIO_MOD_RST_N | QSFP_GPIO_LP_MODE; + if (ppd->hw_pidx) { + mask <<= QSFP_GPIO_PORT2_SHIFT; + out <<= QSFP_GPIO_PORT2_SHIFT; + } + dd->f_gpio_mod(dd, out, mask, mask); + + /* + * Module could take up to 2 Msec to respond to MOD_SEL, + * and there is no way to tell if it is ready, so we must wait. + */ + msleep(2); + + /* Make sure TWSI bus is in sane state. */ + ret = qib_twsi_reset(dd); + if (ret) { + qib_dev_porterr(dd, ppd->port, + "QSFP interface Reset for write failed\n"); + ret = -EIO; + goto deselect; + } + + /* All QSFP modules are at A0 */ + + cnt = 0; + while (cnt < len) { + unsigned in_page; + int wlen = len - cnt; + in_page = addr % QSFP_PAGESIZE; + if ((in_page + wlen) > QSFP_PAGESIZE) + wlen = QSFP_PAGESIZE - in_page; + ret = qib_twsi_blk_wr(dd, QSFP_DEV, addr, buff + cnt, wlen); + if (ret) { + /* qib_twsi_blk_wr() 1 for error, else 0 */ + ret = -EIO; + goto deselect; + } + addr += wlen; + cnt += wlen; + } + ret = cnt; + +deselect: + /* + * Module could take up to 10 uSec after transfer before + * ready to respond to MOD_SEL negation, and there is no way + * to tell if it is ready, so we must wait. + */ + udelay(10); + /* set QSFP MODSEL, RST, LP high */ + dd->f_gpio_mod(dd, mask, mask, mask); + /* + * Module could take up to 2 Msec to respond to MOD_SEL + * going away, and there is no way to tell if it is ready. + * so we must wait. + */ + msleep(2); + +bail: + mutex_unlock(&dd->eep_lock); + +no_unlock: + return ret; +} + +/* + * For validation, we want to check the checksums, even of the + * fields we do not otherwise use. This function reads the bytes from + * to and returns the 8lsbs of the sum, or <0 for errors + */ +static int qsfp_cks(struct qib_pportdata *ppd, int first, int next) +{ + int ret; + u16 cks; + u8 bval; + + cks = 0; + while (first < next) { + ret = qsfp_read(ppd, first, &bval, 1); + if (ret < 0) + goto bail; + cks += bval; + ++first; + } + ret = cks & 0xFF; +bail: + return ret; + +} + +int qib_refresh_qsfp_cache(struct qib_pportdata *ppd, struct qib_qsfp_cache *cp) +{ + int ret; + int idx; + u16 cks; + u8 peek[4]; + + /* ensure sane contents on invalid reads, for cable swaps */ + memset(cp, 0, sizeof(*cp)); + + if (!qib_qsfp_mod_present(ppd)) { + ret = -ENODEV; + goto bail; + } + + ret = qsfp_read(ppd, 0, peek, 3); + if (ret < 0) + goto bail; + if ((peek[0] & 0xFE) != 0x0C) + qib_dev_porterr(ppd->dd, ppd->port, + "QSFP byte0 is 0x%02X, S/B 0x0C/D\n", peek[0]); + + if ((peek[2] & 2) == 0) { + /* + * If cable is paged, rather than "flat memory", we need to + * set the page to zero, Even if it already appears to be zero. + */ + u8 poke = 0; + ret = qib_qsfp_write(ppd, 127, &poke, 1); + udelay(50); + if (ret != 1) { + qib_dev_porterr(ppd->dd, ppd->port, + "Failed QSFP Page set\n"); + goto bail; + } + } + + ret = qsfp_read(ppd, QSFP_MOD_ID_OFFS, &cp->id, 1); + if (ret < 0) + goto bail; + if ((cp->id & 0xFE) != 0x0C) + qib_dev_porterr(ppd->dd, ppd->port, + "QSFP ID byte is 0x%02X, S/B 0x0C/D\n", cp->id); + cks = cp->id; + + ret = qsfp_read(ppd, QSFP_MOD_PWR_OFFS, &cp->pwr, 1); + if (ret < 0) + goto bail; + cks += cp->pwr; + + ret = qsfp_cks(ppd, QSFP_MOD_PWR_OFFS + 1, QSFP_MOD_LEN_OFFS); + if (ret < 0) + goto bail; + cks += ret; + + ret = qsfp_read(ppd, QSFP_MOD_LEN_OFFS, &cp->len, 1); + if (ret < 0) + goto bail; + cks += cp->len; + + ret = qsfp_read(ppd, QSFP_MOD_TECH_OFFS, &cp->tech, 1); + if (ret < 0) + goto bail; + cks += cp->tech; + + ret = qsfp_read(ppd, QSFP_VEND_OFFS, &cp->vendor, QSFP_VEND_LEN); + if (ret < 0) + goto bail; + for (idx = 0; idx < QSFP_VEND_LEN; ++idx) + cks += cp->vendor[idx]; + + ret = qsfp_read(ppd, QSFP_IBXCV_OFFS, &cp->xt_xcv, 1); + if (ret < 0) + goto bail; + cks += cp->xt_xcv; + + ret = qsfp_read(ppd, QSFP_VOUI_OFFS, &cp->oui, QSFP_VOUI_LEN); + if (ret < 0) + goto bail; + for (idx = 0; idx < QSFP_VOUI_LEN; ++idx) + cks += cp->oui[idx]; + + ret = qsfp_read(ppd, QSFP_PN_OFFS, &cp->partnum, QSFP_PN_LEN); + if (ret < 0) + goto bail; + for (idx = 0; idx < QSFP_PN_LEN; ++idx) + cks += cp->partnum[idx]; + + ret = qsfp_read(ppd, QSFP_REV_OFFS, &cp->rev, QSFP_REV_LEN); + if (ret < 0) + goto bail; + for (idx = 0; idx < QSFP_REV_LEN; ++idx) + cks += cp->rev[idx]; + + ret = qsfp_read(ppd, QSFP_ATTEN_OFFS, &cp->atten, QSFP_ATTEN_LEN); + if (ret < 0) + goto bail; + for (idx = 0; idx < QSFP_ATTEN_LEN; ++idx) + cks += cp->atten[idx]; + + ret = qsfp_cks(ppd, QSFP_ATTEN_OFFS + QSFP_ATTEN_LEN, QSFP_CC_OFFS); + if (ret < 0) + goto bail; + cks += ret; + + cks &= 0xFF; + ret = qsfp_read(ppd, QSFP_CC_OFFS, &cp->cks1, 1); + if (ret < 0) + goto bail; + if (cks != cp->cks1) + qib_dev_porterr(ppd->dd, ppd->port, + "QSFP cks1 is %02X, computed %02X\n", cp->cks1, + cks); + + /* Second checksum covers 192 to (serial, date, lot) */ + ret = qsfp_cks(ppd, QSFP_CC_OFFS + 1, QSFP_SN_OFFS); + if (ret < 0) + goto bail; + cks = ret; + + ret = qsfp_read(ppd, QSFP_SN_OFFS, &cp->serial, QSFP_SN_LEN); + if (ret < 0) + goto bail; + for (idx = 0; idx < QSFP_SN_LEN; ++idx) + cks += cp->serial[idx]; + + ret = qsfp_read(ppd, QSFP_DATE_OFFS, &cp->date, QSFP_DATE_LEN); + if (ret < 0) + goto bail; + for (idx = 0; idx < QSFP_DATE_LEN; ++idx) + cks += cp->date[idx]; + + ret = qsfp_read(ppd, QSFP_LOT_OFFS, &cp->lot, QSFP_LOT_LEN); + if (ret < 0) + goto bail; + for (idx = 0; idx < QSFP_LOT_LEN; ++idx) + cks += cp->lot[idx]; + + ret = qsfp_cks(ppd, QSFP_LOT_OFFS + QSFP_LOT_LEN, QSFP_CC_EXT_OFFS); + if (ret < 0) + goto bail; + cks += ret; + + ret = qsfp_read(ppd, QSFP_CC_EXT_OFFS, &cp->cks2, 1); + if (ret < 0) + goto bail; + cks &= 0xFF; + if (cks != cp->cks2) + qib_dev_porterr(ppd->dd, ppd->port, + "QSFP cks2 is %02X, computed %02X\n", cp->cks2, + cks); + return 0; + +bail: + cp->id = 0; + return ret; +} + +const char * const qib_qsfp_devtech[16] = { + "850nm VCSEL", "1310nm VCSEL", "1550nm VCSEL", "1310nm FP", + "1310nm DFB", "1550nm DFB", "1310nm EML", "1550nm EML", + "Cu Misc", "1490nm DFB", "Cu NoEq", "Cu Eq", + "Undef", "Cu Active BothEq", "Cu FarEq", "Cu NearEq" +}; + +#define QSFP_DUMP_CHUNK 16 /* Holds longest string */ +#define QSFP_DEFAULT_HDR_CNT 224 + +static const char *pwr_codes = "1.5W2.0W2.5W3.5W"; + +int qib_qsfp_mod_present(struct qib_pportdata *ppd) +{ + u32 mask; + int ret; + + mask = QSFP_GPIO_MOD_PRS_N << + (ppd->hw_pidx * QSFP_GPIO_PORT2_SHIFT); + ret = ppd->dd->f_gpio_mod(ppd->dd, 0, 0, 0); + + return !((ret & mask) >> + ((ppd->hw_pidx * QSFP_GPIO_PORT2_SHIFT) + 3)); +} + +/* + * Initialize structures that control access to QSFP. Called once per port + * on cards that support QSFP. + */ +void qib_qsfp_init(struct qib_qsfp_data *qd, + void (*fevent)(struct work_struct *)) +{ + u32 mask, highs; + + struct qib_devdata *dd = qd->ppd->dd; + + /* Initialize work struct for later QSFP events */ + INIT_WORK(&qd->work, fevent); + + /* + * Later, we may want more validation. For now, just set up pins and + * blip reset. If module is present, call qib_refresh_qsfp_cache(), + * to do further init. + */ + mask = QSFP_GPIO_MOD_SEL_N | QSFP_GPIO_MOD_RST_N | QSFP_GPIO_LP_MODE; + highs = mask - QSFP_GPIO_MOD_RST_N; + if (qd->ppd->hw_pidx) { + mask <<= QSFP_GPIO_PORT2_SHIFT; + highs <<= QSFP_GPIO_PORT2_SHIFT; + } + dd->f_gpio_mod(dd, highs, mask, mask); + udelay(20); /* Generous RST dwell */ + + dd->f_gpio_mod(dd, mask, mask, mask); + return; +} + +void qib_qsfp_deinit(struct qib_qsfp_data *qd) +{ + /* + * There is nothing to do here for now. our work is scheduled + * with queue_work(), and flush_workqueue() from remove_one + * will block until all work setup with queue_work() + * completes. + */ +} + +int qib_qsfp_dump(struct qib_pportdata *ppd, char *buf, int len) +{ + struct qib_qsfp_cache cd; + u8 bin_buff[QSFP_DUMP_CHUNK]; + char lenstr[6]; + int sofar, ret; + int bidx = 0; + + sofar = 0; + ret = qib_refresh_qsfp_cache(ppd, &cd); + if (ret < 0) + goto bail; + + lenstr[0] = ' '; + lenstr[1] = '\0'; + if (QSFP_IS_CU(cd.tech)) + sprintf(lenstr, "%dM ", cd.len); + + sofar += scnprintf(buf + sofar, len - sofar, "PWR:%.3sW\n", pwr_codes + + (QSFP_PWR(cd.pwr) * 4)); + + sofar += scnprintf(buf + sofar, len - sofar, "TECH:%s%s\n", lenstr, + qib_qsfp_devtech[cd.tech >> 4]); + + sofar += scnprintf(buf + sofar, len - sofar, "Vendor:%.*s\n", + QSFP_VEND_LEN, cd.vendor); + + sofar += scnprintf(buf + sofar, len - sofar, "OUI:%06X\n", + QSFP_OUI(cd.oui)); + + sofar += scnprintf(buf + sofar, len - sofar, "Part#:%.*s\n", + QSFP_PN_LEN, cd.partnum); + sofar += scnprintf(buf + sofar, len - sofar, "Rev:%.*s\n", + QSFP_REV_LEN, cd.rev); + if (QSFP_IS_CU(cd.tech)) + sofar += scnprintf(buf + sofar, len - sofar, "Atten:%d, %d\n", + QSFP_ATTEN_SDR(cd.atten), + QSFP_ATTEN_DDR(cd.atten)); + sofar += scnprintf(buf + sofar, len - sofar, "Serial:%.*s\n", + QSFP_SN_LEN, cd.serial); + sofar += scnprintf(buf + sofar, len - sofar, "Date:%.*s\n", + QSFP_DATE_LEN, cd.date); + sofar += scnprintf(buf + sofar, len - sofar, "Lot:%.*s\n", + QSFP_LOT_LEN, cd.date); + + while (bidx < QSFP_DEFAULT_HDR_CNT) { + int iidx; + ret = qsfp_read(ppd, bidx, bin_buff, QSFP_DUMP_CHUNK); + if (ret < 0) + goto bail; + for (iidx = 0; iidx < ret; ++iidx) { + sofar += scnprintf(buf + sofar, len-sofar, " %02X", + bin_buff[iidx]); + } + sofar += scnprintf(buf + sofar, len - sofar, "\n"); + bidx += QSFP_DUMP_CHUNK; + } + ret = sofar; +bail: + return ret; +} diff --git a/drivers/infiniband/hw/qib/qib_qsfp.h b/drivers/infiniband/hw/qib/qib_qsfp.h new file mode 100644 index 0000000..91908f5 --- /dev/null +++ b/drivers/infiniband/hw/qib/qib_qsfp.h @@ -0,0 +1,189 @@ +/* + * Copyright (c) 2006, 2007, 2008, 2009 QLogic Corporation. All rights reserved. + * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +/* QSFP support common definitions, for ib_qib driver */ + +#define QSFP_DEV 0xA0 +#define QSFP_PWR_LAG_MSEC 2000 +#define QSFP_MODPRS_LAG_MSEC 20 + +/* + * Below are masks for various QSFP signals, for Port 1. + * Port2 equivalents are shifted by QSFP_GPIO_PORT2_SHIFT. + * _N means asserted low + */ +#define QSFP_GPIO_MOD_SEL_N (4) +#define QSFP_GPIO_MOD_PRS_N (8) +#define QSFP_GPIO_INT_N (0x10) +#define QSFP_GPIO_MOD_RST_N (0x20) +#define QSFP_GPIO_LP_MODE (0x40) +#define QSFP_GPIO_PORT2_SHIFT 5 + +#define QSFP_PAGESIZE 128 +/* Defined fields that QLogic requires of qualified cables */ +/* Byte 0 is Identifier, not checked */ +/* Byte 1 is reserved "status MSB" */ +/* Byte 2 is "status LSB" We only care that D2 "Flat Mem" is set. */ +/* + * Rest of first 128 not used, although 127 is reserved for page select + * if module is not "Flat memory". + */ +/* Byte 128 is Identifier: must be 0x0c for QSFP, or 0x0d for QSFP+ */ +#define QSFP_MOD_ID_OFFS 128 +/* + * Byte 129 is "Extended Identifier". We only care about D7,D6: Power class + * 0:1.5W, 1:2.0W, 2:2.5W, 3:3.5W + */ +#define QSFP_MOD_PWR_OFFS 129 +/* Byte 130 is Connector type. Not QLogic req'd */ +/* Bytes 131..138 are Transceiver types, bit maps for various tech, none IB */ +/* Byte 139 is encoding. code 0x01 is 8b10b. Not QLogic req'd */ +/* byte 140 is nominal bit-rate, in units of 100Mbits/sec Not QLogic req'd */ +/* Byte 141 is Extended Rate Select. Not QLogic req'd */ +/* Bytes 142..145 are lengths for various fiber types. Not QLogic req'd */ +/* Byte 146 is length for Copper. Units of 1 meter */ +#define QSFP_MOD_LEN_OFFS 146 +/* + * Byte 147 is Device technology. D0..3 not Qlogc req'd + * D4..7 select from 15 choices, translated by table: + */ +#define QSFP_MOD_TECH_OFFS 147 +extern const char *const qib_qsfp_devtech[16]; +/* Active Equalization includes fiber, copper full EQ, and copper near Eq */ +#define QSFP_IS_ACTIVE(tech) ((0xA2FF >> ((tech) >> 4)) & 1) +/* Active Equalization includes fiber, copper full EQ, and copper far Eq */ +#define QSFP_IS_ACTIVE_FAR(tech) ((0x32FF >> ((tech) >> 4)) & 1) +/* Attenuation should be valid for copper other than full/near Eq */ +#define QSFP_HAS_ATTEN(tech) ((0x4D00 >> ((tech) >> 4)) & 1) +/* Length is only valid if technology is "copper" */ +#define QSFP_IS_CU(tech) ((0xED00 >> ((tech) >> 4)) & 1) +#define QSFP_TECH_1490 9 + +#define QSFP_OUI(oui) (((unsigned)oui[0] << 16) | ((unsigned)oui[1] << 8) | \ + oui[2]) +#define QSFP_OUI_AMPHENOL 0x415048 +#define QSFP_OUI_FINISAR 0x009065 +#define QSFP_OUI_GORE 0x002177 + +/* Bytes 148..163 are Vendor Name, Left-justified Blank-filled */ +#define QSFP_VEND_OFFS 148 +#define QSFP_VEND_LEN 16 +/* Byte 164 is IB Extended tranceiver codes Bits D0..3 are SDR,DDR,QDR,EDR */ +#define QSFP_IBXCV_OFFS 164 +/* Bytes 165..167 are Vendor OUI number */ +#define QSFP_VOUI_OFFS 165 +#define QSFP_VOUI_LEN 3 +/* Bytes 168..183 are Vendor Part Number, string */ +#define QSFP_PN_OFFS 168 +#define QSFP_PN_LEN 16 +/* Bytes 184,185 are Vendor Rev. Left Justified, Blank-filled */ +#define QSFP_REV_OFFS 184 +#define QSFP_REV_LEN 2 +/* + * Bytes 186,187 are Wavelength, if Optical. Not Qlogic req'd + * If copper, they are attenuation in dB: + * Byte 186 is at 2.5Gb/sec (SDR), Byte 187 at 5.0Gb/sec (DDR) + */ +#define QSFP_ATTEN_OFFS 186 +#define QSFP_ATTEN_LEN 2 +/* Bytes 188,189 are Wavelength tolerance, not QLogic req'd */ +/* Byte 190 is Max Case Temp. Not QLogic req'd */ +/* Byte 191 is LSB of sum of bytes 128..190. Not QLogic req'd */ +#define QSFP_CC_OFFS 191 +/* Bytes 192..195 are Options implemented in qsfp. Not Qlogic req'd */ +/* Bytes 196..211 are Serial Number, String */ +#define QSFP_SN_OFFS 196 +#define QSFP_SN_LEN 16 +/* Bytes 212..219 are date-code YYMMDD (MM==1 for Jan) */ +#define QSFP_DATE_OFFS 212 +#define QSFP_DATE_LEN 6 +/* Bytes 218,219 are optional lot-code, string */ +#define QSFP_LOT_OFFS 218 +#define QSFP_LOT_LEN 2 +/* Bytes 220, 221 indicate monitoring options, Not QLogic req'd */ +/* Byte 223 is LSB of sum of bytes 192..222 */ +#define QSFP_CC_EXT_OFFS 223 + +/* + * struct qib_qsfp_data encapsulates state of QSFP device for one port. + * it will be part of port-chip-specific data if a board supports QSFP. + * + * Since multiple board-types use QSFP, and their pport_data structs + * differ (in the chip-specific section), we need a pointer to its head. + * + * Avoiding premature optimization, we will have one work_struct per port, + * and let the (increasingly inaccurately named) eep_lock arbitrate + * access to common resources. + * + */ + +/* + * Hold the parts of the onboard EEPROM that we care about, so we aren't + * coonstantly bit-boffing + */ +struct qib_qsfp_cache { + u8 id; /* must be 0x0C or 0x0D; 0 indicates invalid EEPROM read */ + u8 pwr; /* in D6,7 */ + u8 len; /* in meters, Cu only */ + u8 tech; + char vendor[QSFP_VEND_LEN]; + u8 xt_xcv; /* Ext. tranceiver codes, 4 lsbs are IB speed supported */ + u8 oui[QSFP_VOUI_LEN]; + u8 partnum[QSFP_PN_LEN]; + u8 rev[QSFP_REV_LEN]; + u8 atten[QSFP_ATTEN_LEN]; + u8 cks1; /* Checksum of bytes 128..190 */ + u8 serial[QSFP_SN_LEN]; + u8 date[QSFP_DATE_LEN]; + u8 lot[QSFP_LOT_LEN]; + u8 cks2; /* Checsum of bytes 192..222 */ +}; + +#define QSFP_PWR(pbyte) (((pbyte) >> 6) & 3) +#define QSFP_ATTEN_SDR(attenarray) (attenarray[0]) +#define QSFP_ATTEN_DDR(attenarray) (attenarray[1]) + +struct qib_qsfp_data { + /* Helps to find our way */ + struct qib_pportdata *ppd; + struct work_struct work; + struct qib_qsfp_cache cache; + unsigned long t_insert; + u8 modpresent; +}; + +extern int qib_refresh_qsfp_cache(struct qib_pportdata *ppd, + struct qib_qsfp_cache *cp); +extern int qib_qsfp_mod_present(struct qib_pportdata *ppd); +extern void qib_qsfp_init(struct qib_qsfp_data *qd, + void (*fevent)(struct work_struct *)); +extern void qib_qsfp_deinit(struct qib_qsfp_data *qd); diff --git a/drivers/infiniband/hw/qib/qib_rc.c b/drivers/infiniband/hw/qib/qib_rc.c new file mode 100644 index 0000000..2f25018 --- /dev/null +++ b/drivers/infiniband/hw/qib/qib_rc.c @@ -0,0 +1,2290 @@ +/* + * Copyright (c) 2006, 2007, 2008, 2009 QLogic Corporation. All rights reserved. + * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include + +#include "qib.h" + +/* cut down ridiculously long IB macro names */ +#define OP(x) IB_OPCODE_RC_##x + +static void rc_timeout(unsigned long arg); + +static u32 restart_sge(struct qib_sge_state *ss, struct qib_swqe *wqe, + u32 psn, u32 pmtu) +{ + u32 len; + + len = ((psn - wqe->psn) & QIB_PSN_MASK) * pmtu; + ss->sge = wqe->sg_list[0]; + ss->sg_list = wqe->sg_list + 1; + ss->num_sge = wqe->wr.num_sge; + ss->total_len = wqe->length; + qib_skip_sge(ss, len, 0); + return wqe->length - len; +} + +static void start_timer(struct qib_qp *qp) +{ + qp->s_flags |= QIB_S_TIMER; + qp->s_timer.function = rc_timeout; + /* 4.096 usec. * (1 << qp->timeout) */ + qp->s_timer.expires = jiffies + qp->timeout_jiffies; + add_timer(&qp->s_timer); +} + +/** + * qib_make_rc_ack - construct a response packet (ACK, NAK, or RDMA read) + * @dev: the device for this QP + * @qp: a pointer to the QP + * @ohdr: a pointer to the IB header being constructed + * @pmtu: the path MTU + * + * Return 1 if constructed; otherwise, return 0. + * Note that we are in the responder's side of the QP context. + * Note the QP s_lock must be held. + */ +static int qib_make_rc_ack(struct qib_ibdev *dev, struct qib_qp *qp, + struct qib_other_headers *ohdr, u32 pmtu) +{ + struct qib_ack_entry *e; + u32 hwords; + u32 len; + u32 bth0; + u32 bth2; + + /* Don't send an ACK if we aren't supposed to. */ + if (!(ib_qib_state_ops[qp->state] & QIB_PROCESS_RECV_OK)) + goto bail; + + /* header size in 32-bit words LRH+BTH = (8+12)/4. */ + hwords = 5; + + switch (qp->s_ack_state) { + case OP(RDMA_READ_RESPONSE_LAST): + case OP(RDMA_READ_RESPONSE_ONLY): + e = &qp->s_ack_queue[qp->s_tail_ack_queue]; + if (e->rdma_sge.mr) { + qib_put_mr(e->rdma_sge.mr); + e->rdma_sge.mr = NULL; + } + /* FALLTHROUGH */ + case OP(ATOMIC_ACKNOWLEDGE): + /* + * We can increment the tail pointer now that the last + * response has been sent instead of only being + * constructed. + */ + if (++qp->s_tail_ack_queue > QIB_MAX_RDMA_ATOMIC) + qp->s_tail_ack_queue = 0; + /* FALLTHROUGH */ + case OP(SEND_ONLY): + case OP(ACKNOWLEDGE): + /* Check for no next entry in the queue. */ + if (qp->r_head_ack_queue == qp->s_tail_ack_queue) { + if (qp->s_flags & QIB_S_ACK_PENDING) + goto normal; + goto bail; + } + + e = &qp->s_ack_queue[qp->s_tail_ack_queue]; + if (e->opcode == OP(RDMA_READ_REQUEST)) { + /* + * If a RDMA read response is being resent and + * we haven't seen the duplicate request yet, + * then stop sending the remaining responses the + * responder has seen until the requester resends it. + */ + len = e->rdma_sge.sge_length; + if (len && !e->rdma_sge.mr) { + qp->s_tail_ack_queue = qp->r_head_ack_queue; + goto bail; + } + /* Copy SGE state in case we need to resend */ + qp->s_rdma_mr = e->rdma_sge.mr; + if (qp->s_rdma_mr) + qib_get_mr(qp->s_rdma_mr); + qp->s_ack_rdma_sge.sge = e->rdma_sge; + qp->s_ack_rdma_sge.num_sge = 1; + qp->s_cur_sge = &qp->s_ack_rdma_sge; + if (len > pmtu) { + len = pmtu; + qp->s_ack_state = OP(RDMA_READ_RESPONSE_FIRST); + } else { + qp->s_ack_state = OP(RDMA_READ_RESPONSE_ONLY); + e->sent = 1; + } + ohdr->u.aeth = qib_compute_aeth(qp); + hwords++; + qp->s_ack_rdma_psn = e->psn; + bth2 = qp->s_ack_rdma_psn++ & QIB_PSN_MASK; + } else { + /* COMPARE_SWAP or FETCH_ADD */ + qp->s_cur_sge = NULL; + len = 0; + qp->s_ack_state = OP(ATOMIC_ACKNOWLEDGE); + ohdr->u.at.aeth = qib_compute_aeth(qp); + ohdr->u.at.atomic_ack_eth[0] = + cpu_to_be32(e->atomic_data >> 32); + ohdr->u.at.atomic_ack_eth[1] = + cpu_to_be32(e->atomic_data); + hwords += sizeof(ohdr->u.at) / sizeof(u32); + bth2 = e->psn & QIB_PSN_MASK; + e->sent = 1; + } + bth0 = qp->s_ack_state << 24; + break; + + case OP(RDMA_READ_RESPONSE_FIRST): + qp->s_ack_state = OP(RDMA_READ_RESPONSE_MIDDLE); + /* FALLTHROUGH */ + case OP(RDMA_READ_RESPONSE_MIDDLE): + qp->s_cur_sge = &qp->s_ack_rdma_sge; + qp->s_rdma_mr = qp->s_ack_rdma_sge.sge.mr; + if (qp->s_rdma_mr) + qib_get_mr(qp->s_rdma_mr); + len = qp->s_ack_rdma_sge.sge.sge_length; + if (len > pmtu) + len = pmtu; + else { + ohdr->u.aeth = qib_compute_aeth(qp); + hwords++; + qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST); + e = &qp->s_ack_queue[qp->s_tail_ack_queue]; + e->sent = 1; + } + bth0 = qp->s_ack_state << 24; + bth2 = qp->s_ack_rdma_psn++ & QIB_PSN_MASK; + break; + + default: +normal: + /* + * Send a regular ACK. + * Set the s_ack_state so we wait until after sending + * the ACK before setting s_ack_state to ACKNOWLEDGE + * (see above). + */ + qp->s_ack_state = OP(SEND_ONLY); + qp->s_flags &= ~QIB_S_ACK_PENDING; + qp->s_cur_sge = NULL; + if (qp->s_nak_state) + ohdr->u.aeth = + cpu_to_be32((qp->r_msn & QIB_MSN_MASK) | + (qp->s_nak_state << + QIB_AETH_CREDIT_SHIFT)); + else + ohdr->u.aeth = qib_compute_aeth(qp); + hwords++; + len = 0; + bth0 = OP(ACKNOWLEDGE) << 24; + bth2 = qp->s_ack_psn & QIB_PSN_MASK; + } + qp->s_rdma_ack_cnt++; + qp->s_hdrwords = hwords; + qp->s_cur_size = len; + qib_make_ruc_header(qp, ohdr, bth0, bth2); + return 1; + +bail: + qp->s_ack_state = OP(ACKNOWLEDGE); + qp->s_flags &= ~(QIB_S_RESP_PENDING | QIB_S_ACK_PENDING); + return 0; +} + +/** + * qib_make_rc_req - construct a request packet (SEND, RDMA r/w, ATOMIC) + * @qp: a pointer to the QP + * + * Return 1 if constructed; otherwise, return 0. + */ +int qib_make_rc_req(struct qib_qp *qp) +{ + struct qib_ibdev *dev = to_idev(qp->ibqp.device); + struct qib_other_headers *ohdr; + struct qib_sge_state *ss; + struct qib_swqe *wqe; + u32 hwords; + u32 len; + u32 bth0; + u32 bth2; + u32 pmtu = qp->pmtu; + char newreq; + unsigned long flags; + int ret = 0; + int delta; + + ohdr = &qp->s_hdr->u.oth; + if (qp->remote_ah_attr.ah_flags & IB_AH_GRH) + ohdr = &qp->s_hdr->u.l.oth; + + /* + * The lock is needed to synchronize between the sending tasklet, + * the receive interrupt handler, and timeout resends. + */ + spin_lock_irqsave(&qp->s_lock, flags); + + /* Sending responses has higher priority over sending requests. */ + if ((qp->s_flags & QIB_S_RESP_PENDING) && + qib_make_rc_ack(dev, qp, ohdr, pmtu)) + goto done; + + if (!(ib_qib_state_ops[qp->state] & QIB_PROCESS_SEND_OK)) { + if (!(ib_qib_state_ops[qp->state] & QIB_FLUSH_SEND)) + goto bail; + /* We are in the error state, flush the work request. */ + if (qp->s_last == qp->s_head) + goto bail; + /* If DMAs are in progress, we can't flush immediately. */ + if (atomic_read(&qp->s_dma_busy)) { + qp->s_flags |= QIB_S_WAIT_DMA; + goto bail; + } + wqe = get_swqe_ptr(qp, qp->s_last); + qib_send_complete(qp, wqe, qp->s_last != qp->s_acked ? + IB_WC_SUCCESS : IB_WC_WR_FLUSH_ERR); + /* will get called again */ + goto done; + } + + if (qp->s_flags & (QIB_S_WAIT_RNR | QIB_S_WAIT_ACK)) + goto bail; + + if (qib_cmp24(qp->s_psn, qp->s_sending_hpsn) <= 0) { + if (qib_cmp24(qp->s_sending_psn, qp->s_sending_hpsn) <= 0) { + qp->s_flags |= QIB_S_WAIT_PSN; + goto bail; + } + qp->s_sending_psn = qp->s_psn; + qp->s_sending_hpsn = qp->s_psn - 1; + } + + /* header size in 32-bit words LRH+BTH = (8+12)/4. */ + hwords = 5; + bth0 = 0; + + /* Send a request. */ + wqe = get_swqe_ptr(qp, qp->s_cur); + switch (qp->s_state) { + default: + if (!(ib_qib_state_ops[qp->state] & QIB_PROCESS_NEXT_SEND_OK)) + goto bail; + /* + * Resend an old request or start a new one. + * + * We keep track of the current SWQE so that + * we don't reset the "furthest progress" state + * if we need to back up. + */ + newreq = 0; + if (qp->s_cur == qp->s_tail) { + /* Check if send work queue is empty. */ + if (qp->s_tail == qp->s_head) + goto bail; + /* + * If a fence is requested, wait for previous + * RDMA read and atomic operations to finish. + */ + if ((wqe->wr.send_flags & IB_SEND_FENCE) && + qp->s_num_rd_atomic) { + qp->s_flags |= QIB_S_WAIT_FENCE; + goto bail; + } + wqe->psn = qp->s_next_psn; + newreq = 1; + } + /* + * Note that we have to be careful not to modify the + * original work request since we may need to resend + * it. + */ + len = wqe->length; + ss = &qp->s_sge; + bth2 = qp->s_psn & QIB_PSN_MASK; + switch (wqe->wr.opcode) { + case IB_WR_SEND: + case IB_WR_SEND_WITH_IMM: + /* If no credit, return. */ + if (!(qp->s_flags & QIB_S_UNLIMITED_CREDIT) && + qib_cmp24(wqe->ssn, qp->s_lsn + 1) > 0) { + qp->s_flags |= QIB_S_WAIT_SSN_CREDIT; + goto bail; + } + wqe->lpsn = wqe->psn; + if (len > pmtu) { + wqe->lpsn += (len - 1) / pmtu; + qp->s_state = OP(SEND_FIRST); + len = pmtu; + break; + } + if (wqe->wr.opcode == IB_WR_SEND) + qp->s_state = OP(SEND_ONLY); + else { + qp->s_state = OP(SEND_ONLY_WITH_IMMEDIATE); + /* Immediate data comes after the BTH */ + ohdr->u.imm_data = wqe->wr.ex.imm_data; + hwords += 1; + } + if (wqe->wr.send_flags & IB_SEND_SOLICITED) + bth0 |= IB_BTH_SOLICITED; + bth2 |= IB_BTH_REQ_ACK; + if (++qp->s_cur == qp->s_size) + qp->s_cur = 0; + break; + + case IB_WR_RDMA_WRITE: + if (newreq && !(qp->s_flags & QIB_S_UNLIMITED_CREDIT)) + qp->s_lsn++; + /* FALLTHROUGH */ + case IB_WR_RDMA_WRITE_WITH_IMM: + /* If no credit, return. */ + if (!(qp->s_flags & QIB_S_UNLIMITED_CREDIT) && + qib_cmp24(wqe->ssn, qp->s_lsn + 1) > 0) { + qp->s_flags |= QIB_S_WAIT_SSN_CREDIT; + goto bail; + } + ohdr->u.rc.reth.vaddr = + cpu_to_be64(wqe->wr.wr.rdma.remote_addr); + ohdr->u.rc.reth.rkey = + cpu_to_be32(wqe->wr.wr.rdma.rkey); + ohdr->u.rc.reth.length = cpu_to_be32(len); + hwords += sizeof(struct ib_reth) / sizeof(u32); + wqe->lpsn = wqe->psn; + if (len > pmtu) { + wqe->lpsn += (len - 1) / pmtu; + qp->s_state = OP(RDMA_WRITE_FIRST); + len = pmtu; + break; + } + if (wqe->wr.opcode == IB_WR_RDMA_WRITE) + qp->s_state = OP(RDMA_WRITE_ONLY); + else { + qp->s_state = + OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE); + /* Immediate data comes after RETH */ + ohdr->u.rc.imm_data = wqe->wr.ex.imm_data; + hwords += 1; + if (wqe->wr.send_flags & IB_SEND_SOLICITED) + bth0 |= IB_BTH_SOLICITED; + } + bth2 |= IB_BTH_REQ_ACK; + if (++qp->s_cur == qp->s_size) + qp->s_cur = 0; + break; + + case IB_WR_RDMA_READ: + /* + * Don't allow more operations to be started + * than the QP limits allow. + */ + if (newreq) { + if (qp->s_num_rd_atomic >= + qp->s_max_rd_atomic) { + qp->s_flags |= QIB_S_WAIT_RDMAR; + goto bail; + } + qp->s_num_rd_atomic++; + if (!(qp->s_flags & QIB_S_UNLIMITED_CREDIT)) + qp->s_lsn++; + /* + * Adjust s_next_psn to count the + * expected number of responses. + */ + if (len > pmtu) + qp->s_next_psn += (len - 1) / pmtu; + wqe->lpsn = qp->s_next_psn++; + } + ohdr->u.rc.reth.vaddr = + cpu_to_be64(wqe->wr.wr.rdma.remote_addr); + ohdr->u.rc.reth.rkey = + cpu_to_be32(wqe->wr.wr.rdma.rkey); + ohdr->u.rc.reth.length = cpu_to_be32(len); + qp->s_state = OP(RDMA_READ_REQUEST); + hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32); + ss = NULL; + len = 0; + bth2 |= IB_BTH_REQ_ACK; + if (++qp->s_cur == qp->s_size) + qp->s_cur = 0; + break; + + case IB_WR_ATOMIC_CMP_AND_SWP: + case IB_WR_ATOMIC_FETCH_AND_ADD: + /* + * Don't allow more operations to be started + * than the QP limits allow. + */ + if (newreq) { + if (qp->s_num_rd_atomic >= + qp->s_max_rd_atomic) { + qp->s_flags |= QIB_S_WAIT_RDMAR; + goto bail; + } + qp->s_num_rd_atomic++; + if (!(qp->s_flags & QIB_S_UNLIMITED_CREDIT)) + qp->s_lsn++; + wqe->lpsn = wqe->psn; + } + if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP) { + qp->s_state = OP(COMPARE_SWAP); + ohdr->u.atomic_eth.swap_data = cpu_to_be64( + wqe->wr.wr.atomic.swap); + ohdr->u.atomic_eth.compare_data = cpu_to_be64( + wqe->wr.wr.atomic.compare_add); + } else { + qp->s_state = OP(FETCH_ADD); + ohdr->u.atomic_eth.swap_data = cpu_to_be64( + wqe->wr.wr.atomic.compare_add); + ohdr->u.atomic_eth.compare_data = 0; + } + ohdr->u.atomic_eth.vaddr[0] = cpu_to_be32( + wqe->wr.wr.atomic.remote_addr >> 32); + ohdr->u.atomic_eth.vaddr[1] = cpu_to_be32( + wqe->wr.wr.atomic.remote_addr); + ohdr->u.atomic_eth.rkey = cpu_to_be32( + wqe->wr.wr.atomic.rkey); + hwords += sizeof(struct ib_atomic_eth) / sizeof(u32); + ss = NULL; + len = 0; + bth2 |= IB_BTH_REQ_ACK; + if (++qp->s_cur == qp->s_size) + qp->s_cur = 0; + break; + + default: + goto bail; + } + qp->s_sge.sge = wqe->sg_list[0]; + qp->s_sge.sg_list = wqe->sg_list + 1; + qp->s_sge.num_sge = wqe->wr.num_sge; + qp->s_sge.total_len = wqe->length; + qp->s_len = wqe->length; + if (newreq) { + qp->s_tail++; + if (qp->s_tail >= qp->s_size) + qp->s_tail = 0; + } + if (wqe->wr.opcode == IB_WR_RDMA_READ) + qp->s_psn = wqe->lpsn + 1; + else { + qp->s_psn++; + if (qib_cmp24(qp->s_psn, qp->s_next_psn) > 0) + qp->s_next_psn = qp->s_psn; + } + break; + + case OP(RDMA_READ_RESPONSE_FIRST): + /* + * qp->s_state is normally set to the opcode of the + * last packet constructed for new requests and therefore + * is never set to RDMA read response. + * RDMA_READ_RESPONSE_FIRST is used by the ACK processing + * thread to indicate a SEND needs to be restarted from an + * earlier PSN without interferring with the sending thread. + * See qib_restart_rc(). + */ + qp->s_len = restart_sge(&qp->s_sge, wqe, qp->s_psn, pmtu); + /* FALLTHROUGH */ + case OP(SEND_FIRST): + qp->s_state = OP(SEND_MIDDLE); + /* FALLTHROUGH */ + case OP(SEND_MIDDLE): + bth2 = qp->s_psn++ & QIB_PSN_MASK; + if (qib_cmp24(qp->s_psn, qp->s_next_psn) > 0) + qp->s_next_psn = qp->s_psn; + ss = &qp->s_sge; + len = qp->s_len; + if (len > pmtu) { + len = pmtu; + break; + } + if (wqe->wr.opcode == IB_WR_SEND) + qp->s_state = OP(SEND_LAST); + else { + qp->s_state = OP(SEND_LAST_WITH_IMMEDIATE); + /* Immediate data comes after the BTH */ + ohdr->u.imm_data = wqe->wr.ex.imm_data; + hwords += 1; + } + if (wqe->wr.send_flags & IB_SEND_SOLICITED) + bth0 |= IB_BTH_SOLICITED; + bth2 |= IB_BTH_REQ_ACK; + qp->s_cur++; + if (qp->s_cur >= qp->s_size) + qp->s_cur = 0; + break; + + case OP(RDMA_READ_RESPONSE_LAST): + /* + * qp->s_state is normally set to the opcode of the + * last packet constructed for new requests and therefore + * is never set to RDMA read response. + * RDMA_READ_RESPONSE_LAST is used by the ACK processing + * thread to indicate a RDMA write needs to be restarted from + * an earlier PSN without interferring with the sending thread. + * See qib_restart_rc(). + */ + qp->s_len = restart_sge(&qp->s_sge, wqe, qp->s_psn, pmtu); + /* FALLTHROUGH */ + case OP(RDMA_WRITE_FIRST): + qp->s_state = OP(RDMA_WRITE_MIDDLE); + /* FALLTHROUGH */ + case OP(RDMA_WRITE_MIDDLE): + bth2 = qp->s_psn++ & QIB_PSN_MASK; + if (qib_cmp24(qp->s_psn, qp->s_next_psn) > 0) + qp->s_next_psn = qp->s_psn; + ss = &qp->s_sge; + len = qp->s_len; + if (len > pmtu) { + len = pmtu; + break; + } + if (wqe->wr.opcode == IB_WR_RDMA_WRITE) + qp->s_state = OP(RDMA_WRITE_LAST); + else { + qp->s_state = OP(RDMA_WRITE_LAST_WITH_IMMEDIATE); + /* Immediate data comes after the BTH */ + ohdr->u.imm_data = wqe->wr.ex.imm_data; + hwords += 1; + if (wqe->wr.send_flags & IB_SEND_SOLICITED) + bth0 |= IB_BTH_SOLICITED; + } + bth2 |= IB_BTH_REQ_ACK; + qp->s_cur++; + if (qp->s_cur >= qp->s_size) + qp->s_cur = 0; + break; + + case OP(RDMA_READ_RESPONSE_MIDDLE): + /* + * qp->s_state is normally set to the opcode of the + * last packet constructed for new requests and therefore + * is never set to RDMA read response. + * RDMA_READ_RESPONSE_MIDDLE is used by the ACK processing + * thread to indicate a RDMA read needs to be restarted from + * an earlier PSN without interferring with the sending thread. + * See qib_restart_rc(). + */ + len = ((qp->s_psn - wqe->psn) & QIB_PSN_MASK) * pmtu; + ohdr->u.rc.reth.vaddr = + cpu_to_be64(wqe->wr.wr.rdma.remote_addr + len); + ohdr->u.rc.reth.rkey = + cpu_to_be32(wqe->wr.wr.rdma.rkey); + ohdr->u.rc.reth.length = cpu_to_be32(wqe->length - len); + qp->s_state = OP(RDMA_READ_REQUEST); + hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32); + bth2 = (qp->s_psn & QIB_PSN_MASK) | IB_BTH_REQ_ACK; + qp->s_psn = wqe->lpsn + 1; + ss = NULL; + len = 0; + qp->s_cur++; + if (qp->s_cur == qp->s_size) + qp->s_cur = 0; + break; + } + qp->s_sending_hpsn = bth2; + delta = (((int) bth2 - (int) wqe->psn) << 8) >> 8; + if (delta && delta % QIB_PSN_CREDIT == 0) + bth2 |= IB_BTH_REQ_ACK; + if (qp->s_flags & QIB_S_SEND_ONE) { + qp->s_flags &= ~QIB_S_SEND_ONE; + qp->s_flags |= QIB_S_WAIT_ACK; + bth2 |= IB_BTH_REQ_ACK; + } + qp->s_len -= len; + qp->s_hdrwords = hwords; + qp->s_cur_sge = ss; + qp->s_cur_size = len; + qib_make_ruc_header(qp, ohdr, bth0 | (qp->s_state << 24), bth2); +done: + ret = 1; + goto unlock; + +bail: + qp->s_flags &= ~QIB_S_BUSY; +unlock: + spin_unlock_irqrestore(&qp->s_lock, flags); + return ret; +} + +/** + * qib_send_rc_ack - Construct an ACK packet and send it + * @qp: a pointer to the QP + * + * This is called from qib_rc_rcv() and qib_kreceive(). + * Note that RDMA reads and atomics are handled in the + * send side QP state and tasklet. + */ +void qib_send_rc_ack(struct qib_qp *qp) +{ + struct qib_devdata *dd = dd_from_ibdev(qp->ibqp.device); + struct qib_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); + struct qib_pportdata *ppd = ppd_from_ibp(ibp); + u64 pbc; + u16 lrh0; + u32 bth0; + u32 hwords; + u32 pbufn; + u32 __iomem *piobuf; + struct qib_ib_header hdr; + struct qib_other_headers *ohdr; + u32 control; + unsigned long flags; + + spin_lock_irqsave(&qp->s_lock, flags); + + if (!(ib_qib_state_ops[qp->state] & QIB_PROCESS_RECV_OK)) + goto unlock; + + /* Don't send ACK or NAK if a RDMA read or atomic is pending. */ + if ((qp->s_flags & QIB_S_RESP_PENDING) || qp->s_rdma_ack_cnt) + goto queue_ack; + + /* Construct the header with s_lock held so APM doesn't change it. */ + ohdr = &hdr.u.oth; + lrh0 = QIB_LRH_BTH; + /* header size in 32-bit words LRH+BTH+AETH = (8+12+4)/4. */ + hwords = 6; + if (unlikely(qp->remote_ah_attr.ah_flags & IB_AH_GRH)) { + hwords += qib_make_grh(ibp, &hdr.u.l.grh, + &qp->remote_ah_attr.grh, hwords, 0); + ohdr = &hdr.u.l.oth; + lrh0 = QIB_LRH_GRH; + } + /* read pkey_index w/o lock (its atomic) */ + bth0 = qib_get_pkey(ibp, qp->s_pkey_index) | (OP(ACKNOWLEDGE) << 24); + if (qp->s_mig_state == IB_MIG_MIGRATED) + bth0 |= IB_BTH_MIG_REQ; + if (qp->r_nak_state) + ohdr->u.aeth = cpu_to_be32((qp->r_msn & QIB_MSN_MASK) | + (qp->r_nak_state << + QIB_AETH_CREDIT_SHIFT)); + else + ohdr->u.aeth = qib_compute_aeth(qp); + lrh0 |= ibp->sl_to_vl[qp->remote_ah_attr.sl] << 12 | + qp->remote_ah_attr.sl << 4; + hdr.lrh[0] = cpu_to_be16(lrh0); + hdr.lrh[1] = cpu_to_be16(qp->remote_ah_attr.dlid); + hdr.lrh[2] = cpu_to_be16(hwords + SIZE_OF_CRC); + hdr.lrh[3] = cpu_to_be16(ppd->lid | qp->remote_ah_attr.src_path_bits); + ohdr->bth[0] = cpu_to_be32(bth0); + ohdr->bth[1] = cpu_to_be32(qp->remote_qpn); + ohdr->bth[2] = cpu_to_be32(qp->r_ack_psn & QIB_PSN_MASK); + + spin_unlock_irqrestore(&qp->s_lock, flags); + + /* Don't try to send ACKs if the link isn't ACTIVE */ + if (!(ppd->lflags & QIBL_LINKACTIVE)) + goto done; + + control = dd->f_setpbc_control(ppd, hwords + SIZE_OF_CRC, + qp->s_srate, lrh0 >> 12); + /* length is + 1 for the control dword */ + pbc = ((u64) control << 32) | (hwords + 1); + + piobuf = dd->f_getsendbuf(ppd, pbc, &pbufn); + if (!piobuf) { + /* + * We are out of PIO buffers at the moment. + * Pass responsibility for sending the ACK to the + * send tasklet so that when a PIO buffer becomes + * available, the ACK is sent ahead of other outgoing + * packets. + */ + spin_lock_irqsave(&qp->s_lock, flags); + goto queue_ack; + } + + /* + * Write the pbc. + * We have to flush after the PBC for correctness + * on some cpus or WC buffer can be written out of order. + */ + writeq(pbc, piobuf); + + if (dd->flags & QIB_PIO_FLUSH_WC) { + u32 *hdrp = (u32 *) &hdr; + + qib_flush_wc(); + qib_pio_copy(piobuf + 2, hdrp, hwords - 1); + qib_flush_wc(); + __raw_writel(hdrp[hwords - 1], piobuf + hwords + 1); + } else + qib_pio_copy(piobuf + 2, (u32 *) &hdr, hwords); + + if (dd->flags & QIB_USE_SPCL_TRIG) { + u32 spcl_off = (pbufn >= dd->piobcnt2k) ? 2047 : 1023; + + qib_flush_wc(); + __raw_writel(0xaebecede, piobuf + spcl_off); + } + + qib_flush_wc(); + qib_sendbuf_done(dd, pbufn); + + this_cpu_inc(ibp->pmastats->n_unicast_xmit); + goto done; + +queue_ack: + if (ib_qib_state_ops[qp->state] & QIB_PROCESS_RECV_OK) { + ibp->n_rc_qacks++; + qp->s_flags |= QIB_S_ACK_PENDING | QIB_S_RESP_PENDING; + qp->s_nak_state = qp->r_nak_state; + qp->s_ack_psn = qp->r_ack_psn; + + /* Schedule the send tasklet. */ + qib_schedule_send(qp); + } +unlock: + spin_unlock_irqrestore(&qp->s_lock, flags); +done: + return; +} + +/** + * reset_psn - reset the QP state to send starting from PSN + * @qp: the QP + * @psn: the packet sequence number to restart at + * + * This is called from qib_rc_rcv() to process an incoming RC ACK + * for the given QP. + * Called at interrupt level with the QP s_lock held. + */ +static void reset_psn(struct qib_qp *qp, u32 psn) +{ + u32 n = qp->s_acked; + struct qib_swqe *wqe = get_swqe_ptr(qp, n); + u32 opcode; + + qp->s_cur = n; + + /* + * If we are starting the request from the beginning, + * let the normal send code handle initialization. + */ + if (qib_cmp24(psn, wqe->psn) <= 0) { + qp->s_state = OP(SEND_LAST); + goto done; + } + + /* Find the work request opcode corresponding to the given PSN. */ + opcode = wqe->wr.opcode; + for (;;) { + int diff; + + if (++n == qp->s_size) + n = 0; + if (n == qp->s_tail) + break; + wqe = get_swqe_ptr(qp, n); + diff = qib_cmp24(psn, wqe->psn); + if (diff < 0) + break; + qp->s_cur = n; + /* + * If we are starting the request from the beginning, + * let the normal send code handle initialization. + */ + if (diff == 0) { + qp->s_state = OP(SEND_LAST); + goto done; + } + opcode = wqe->wr.opcode; + } + + /* + * Set the state to restart in the middle of a request. + * Don't change the s_sge, s_cur_sge, or s_cur_size. + * See qib_make_rc_req(). + */ + switch (opcode) { + case IB_WR_SEND: + case IB_WR_SEND_WITH_IMM: + qp->s_state = OP(RDMA_READ_RESPONSE_FIRST); + break; + + case IB_WR_RDMA_WRITE: + case IB_WR_RDMA_WRITE_WITH_IMM: + qp->s_state = OP(RDMA_READ_RESPONSE_LAST); + break; + + case IB_WR_RDMA_READ: + qp->s_state = OP(RDMA_READ_RESPONSE_MIDDLE); + break; + + default: + /* + * This case shouldn't happen since its only + * one PSN per req. + */ + qp->s_state = OP(SEND_LAST); + } +done: + qp->s_psn = psn; + /* + * Set QIB_S_WAIT_PSN as qib_rc_complete() may start the timer + * asynchronously before the send tasklet can get scheduled. + * Doing it in qib_make_rc_req() is too late. + */ + if ((qib_cmp24(qp->s_psn, qp->s_sending_hpsn) <= 0) && + (qib_cmp24(qp->s_sending_psn, qp->s_sending_hpsn) <= 0)) + qp->s_flags |= QIB_S_WAIT_PSN; +} + +/* + * Back up requester to resend the last un-ACKed request. + * The QP r_lock and s_lock should be held and interrupts disabled. + */ +static void qib_restart_rc(struct qib_qp *qp, u32 psn, int wait) +{ + struct qib_swqe *wqe = get_swqe_ptr(qp, qp->s_acked); + struct qib_ibport *ibp; + + if (qp->s_retry == 0) { + if (qp->s_mig_state == IB_MIG_ARMED) { + qib_migrate_qp(qp); + qp->s_retry = qp->s_retry_cnt; + } else if (qp->s_last == qp->s_acked) { + qib_send_complete(qp, wqe, IB_WC_RETRY_EXC_ERR); + qib_error_qp(qp, IB_WC_WR_FLUSH_ERR); + return; + } else /* XXX need to handle delayed completion */ + return; + } else + qp->s_retry--; + + ibp = to_iport(qp->ibqp.device, qp->port_num); + if (wqe->wr.opcode == IB_WR_RDMA_READ) + ibp->n_rc_resends++; + else + ibp->n_rc_resends += (qp->s_psn - psn) & QIB_PSN_MASK; + + qp->s_flags &= ~(QIB_S_WAIT_FENCE | QIB_S_WAIT_RDMAR | + QIB_S_WAIT_SSN_CREDIT | QIB_S_WAIT_PSN | + QIB_S_WAIT_ACK); + if (wait) + qp->s_flags |= QIB_S_SEND_ONE; + reset_psn(qp, psn); +} + +/* + * This is called from s_timer for missing responses. + */ +static void rc_timeout(unsigned long arg) +{ + struct qib_qp *qp = (struct qib_qp *)arg; + struct qib_ibport *ibp; + unsigned long flags; + + spin_lock_irqsave(&qp->r_lock, flags); + spin_lock(&qp->s_lock); + if (qp->s_flags & QIB_S_TIMER) { + ibp = to_iport(qp->ibqp.device, qp->port_num); + ibp->n_rc_timeouts++; + qp->s_flags &= ~QIB_S_TIMER; + del_timer(&qp->s_timer); + qib_restart_rc(qp, qp->s_last_psn + 1, 1); + qib_schedule_send(qp); + } + spin_unlock(&qp->s_lock); + spin_unlock_irqrestore(&qp->r_lock, flags); +} + +/* + * This is called from s_timer for RNR timeouts. + */ +void qib_rc_rnr_retry(unsigned long arg) +{ + struct qib_qp *qp = (struct qib_qp *)arg; + unsigned long flags; + + spin_lock_irqsave(&qp->s_lock, flags); + if (qp->s_flags & QIB_S_WAIT_RNR) { + qp->s_flags &= ~QIB_S_WAIT_RNR; + del_timer(&qp->s_timer); + qib_schedule_send(qp); + } + spin_unlock_irqrestore(&qp->s_lock, flags); +} + +/* + * Set qp->s_sending_psn to the next PSN after the given one. + * This would be psn+1 except when RDMA reads are present. + */ +static void reset_sending_psn(struct qib_qp *qp, u32 psn) +{ + struct qib_swqe *wqe; + u32 n = qp->s_last; + + /* Find the work request corresponding to the given PSN. */ + for (;;) { + wqe = get_swqe_ptr(qp, n); + if (qib_cmp24(psn, wqe->lpsn) <= 0) { + if (wqe->wr.opcode == IB_WR_RDMA_READ) + qp->s_sending_psn = wqe->lpsn + 1; + else + qp->s_sending_psn = psn + 1; + break; + } + if (++n == qp->s_size) + n = 0; + if (n == qp->s_tail) + break; + } +} + +/* + * This should be called with the QP s_lock held and interrupts disabled. + */ +void qib_rc_send_complete(struct qib_qp *qp, struct qib_ib_header *hdr) +{ + struct qib_other_headers *ohdr; + struct qib_swqe *wqe; + struct ib_wc wc; + unsigned i; + u32 opcode; + u32 psn; + + if (!(ib_qib_state_ops[qp->state] & QIB_PROCESS_OR_FLUSH_SEND)) + return; + + /* Find out where the BTH is */ + if ((be16_to_cpu(hdr->lrh[0]) & 3) == QIB_LRH_BTH) + ohdr = &hdr->u.oth; + else + ohdr = &hdr->u.l.oth; + + opcode = be32_to_cpu(ohdr->bth[0]) >> 24; + if (opcode >= OP(RDMA_READ_RESPONSE_FIRST) && + opcode <= OP(ATOMIC_ACKNOWLEDGE)) { + WARN_ON(!qp->s_rdma_ack_cnt); + qp->s_rdma_ack_cnt--; + return; + } + + psn = be32_to_cpu(ohdr->bth[2]); + reset_sending_psn(qp, psn); + + /* + * Start timer after a packet requesting an ACK has been sent and + * there are still requests that haven't been acked. + */ + if ((psn & IB_BTH_REQ_ACK) && qp->s_acked != qp->s_tail && + !(qp->s_flags & (QIB_S_TIMER | QIB_S_WAIT_RNR | QIB_S_WAIT_PSN)) && + (ib_qib_state_ops[qp->state] & QIB_PROCESS_RECV_OK)) + start_timer(qp); + + while (qp->s_last != qp->s_acked) { + wqe = get_swqe_ptr(qp, qp->s_last); + if (qib_cmp24(wqe->lpsn, qp->s_sending_psn) >= 0 && + qib_cmp24(qp->s_sending_psn, qp->s_sending_hpsn) <= 0) + break; + for (i = 0; i < wqe->wr.num_sge; i++) { + struct qib_sge *sge = &wqe->sg_list[i]; + + qib_put_mr(sge->mr); + } + /* Post a send completion queue entry if requested. */ + if (!(qp->s_flags & QIB_S_SIGNAL_REQ_WR) || + (wqe->wr.send_flags & IB_SEND_SIGNALED)) { + memset(&wc, 0, sizeof wc); + wc.wr_id = wqe->wr.wr_id; + wc.status = IB_WC_SUCCESS; + wc.opcode = ib_qib_wc_opcode[wqe->wr.opcode]; + wc.byte_len = wqe->length; + wc.qp = &qp->ibqp; + qib_cq_enter(to_icq(qp->ibqp.send_cq), &wc, 0); + } + if (++qp->s_last >= qp->s_size) + qp->s_last = 0; + } + /* + * If we were waiting for sends to complete before resending, + * and they are now complete, restart sending. + */ + if (qp->s_flags & QIB_S_WAIT_PSN && + qib_cmp24(qp->s_sending_psn, qp->s_sending_hpsn) > 0) { + qp->s_flags &= ~QIB_S_WAIT_PSN; + qp->s_sending_psn = qp->s_psn; + qp->s_sending_hpsn = qp->s_psn - 1; + qib_schedule_send(qp); + } +} + +static inline void update_last_psn(struct qib_qp *qp, u32 psn) +{ + qp->s_last_psn = psn; +} + +/* + * Generate a SWQE completion. + * This is similar to qib_send_complete but has to check to be sure + * that the SGEs are not being referenced if the SWQE is being resent. + */ +static struct qib_swqe *do_rc_completion(struct qib_qp *qp, + struct qib_swqe *wqe, + struct qib_ibport *ibp) +{ + struct ib_wc wc; + unsigned i; + + /* + * Don't decrement refcount and don't generate a + * completion if the SWQE is being resent until the send + * is finished. + */ + if (qib_cmp24(wqe->lpsn, qp->s_sending_psn) < 0 || + qib_cmp24(qp->s_sending_psn, qp->s_sending_hpsn) > 0) { + for (i = 0; i < wqe->wr.num_sge; i++) { + struct qib_sge *sge = &wqe->sg_list[i]; + + qib_put_mr(sge->mr); + } + /* Post a send completion queue entry if requested. */ + if (!(qp->s_flags & QIB_S_SIGNAL_REQ_WR) || + (wqe->wr.send_flags & IB_SEND_SIGNALED)) { + memset(&wc, 0, sizeof wc); + wc.wr_id = wqe->wr.wr_id; + wc.status = IB_WC_SUCCESS; + wc.opcode = ib_qib_wc_opcode[wqe->wr.opcode]; + wc.byte_len = wqe->length; + wc.qp = &qp->ibqp; + qib_cq_enter(to_icq(qp->ibqp.send_cq), &wc, 0); + } + if (++qp->s_last >= qp->s_size) + qp->s_last = 0; + } else + ibp->n_rc_delayed_comp++; + + qp->s_retry = qp->s_retry_cnt; + update_last_psn(qp, wqe->lpsn); + + /* + * If we are completing a request which is in the process of + * being resent, we can stop resending it since we know the + * responder has already seen it. + */ + if (qp->s_acked == qp->s_cur) { + if (++qp->s_cur >= qp->s_size) + qp->s_cur = 0; + qp->s_acked = qp->s_cur; + wqe = get_swqe_ptr(qp, qp->s_cur); + if (qp->s_acked != qp->s_tail) { + qp->s_state = OP(SEND_LAST); + qp->s_psn = wqe->psn; + } + } else { + if (++qp->s_acked >= qp->s_size) + qp->s_acked = 0; + if (qp->state == IB_QPS_SQD && qp->s_acked == qp->s_cur) + qp->s_draining = 0; + wqe = get_swqe_ptr(qp, qp->s_acked); + } + return wqe; +} + +/** + * do_rc_ack - process an incoming RC ACK + * @qp: the QP the ACK came in on + * @psn: the packet sequence number of the ACK + * @opcode: the opcode of the request that resulted in the ACK + * + * This is called from qib_rc_rcv_resp() to process an incoming RC ACK + * for the given QP. + * Called at interrupt level with the QP s_lock held. + * Returns 1 if OK, 0 if current operation should be aborted (NAK). + */ +static int do_rc_ack(struct qib_qp *qp, u32 aeth, u32 psn, int opcode, + u64 val, struct qib_ctxtdata *rcd) +{ + struct qib_ibport *ibp; + enum ib_wc_status status; + struct qib_swqe *wqe; + int ret = 0; + u32 ack_psn; + int diff; + + /* Remove QP from retry timer */ + if (qp->s_flags & (QIB_S_TIMER | QIB_S_WAIT_RNR)) { + qp->s_flags &= ~(QIB_S_TIMER | QIB_S_WAIT_RNR); + del_timer(&qp->s_timer); + } + + /* + * Note that NAKs implicitly ACK outstanding SEND and RDMA write + * requests and implicitly NAK RDMA read and atomic requests issued + * before the NAK'ed request. The MSN won't include the NAK'ed + * request but will include an ACK'ed request(s). + */ + ack_psn = psn; + if (aeth >> 29) + ack_psn--; + wqe = get_swqe_ptr(qp, qp->s_acked); + ibp = to_iport(qp->ibqp.device, qp->port_num); + + /* + * The MSN might be for a later WQE than the PSN indicates so + * only complete WQEs that the PSN finishes. + */ + while ((diff = qib_cmp24(ack_psn, wqe->lpsn)) >= 0) { + /* + * RDMA_READ_RESPONSE_ONLY is a special case since + * we want to generate completion events for everything + * before the RDMA read, copy the data, then generate + * the completion for the read. + */ + if (wqe->wr.opcode == IB_WR_RDMA_READ && + opcode == OP(RDMA_READ_RESPONSE_ONLY) && + diff == 0) { + ret = 1; + goto bail; + } + /* + * If this request is a RDMA read or atomic, and the ACK is + * for a later operation, this ACK NAKs the RDMA read or + * atomic. In other words, only a RDMA_READ_LAST or ONLY + * can ACK a RDMA read and likewise for atomic ops. Note + * that the NAK case can only happen if relaxed ordering is + * used and requests are sent after an RDMA read or atomic + * is sent but before the response is received. + */ + if ((wqe->wr.opcode == IB_WR_RDMA_READ && + (opcode != OP(RDMA_READ_RESPONSE_LAST) || diff != 0)) || + ((wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP || + wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) && + (opcode != OP(ATOMIC_ACKNOWLEDGE) || diff != 0))) { + /* Retry this request. */ + if (!(qp->r_flags & QIB_R_RDMAR_SEQ)) { + qp->r_flags |= QIB_R_RDMAR_SEQ; + qib_restart_rc(qp, qp->s_last_psn + 1, 0); + if (list_empty(&qp->rspwait)) { + qp->r_flags |= QIB_R_RSP_SEND; + atomic_inc(&qp->refcount); + list_add_tail(&qp->rspwait, + &rcd->qp_wait_list); + } + } + /* + * No need to process the ACK/NAK since we are + * restarting an earlier request. + */ + goto bail; + } + if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP || + wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) { + u64 *vaddr = wqe->sg_list[0].vaddr; + *vaddr = val; + } + if (qp->s_num_rd_atomic && + (wqe->wr.opcode == IB_WR_RDMA_READ || + wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP || + wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)) { + qp->s_num_rd_atomic--; + /* Restart sending task if fence is complete */ + if ((qp->s_flags & QIB_S_WAIT_FENCE) && + !qp->s_num_rd_atomic) { + qp->s_flags &= ~(QIB_S_WAIT_FENCE | + QIB_S_WAIT_ACK); + qib_schedule_send(qp); + } else if (qp->s_flags & QIB_S_WAIT_RDMAR) { + qp->s_flags &= ~(QIB_S_WAIT_RDMAR | + QIB_S_WAIT_ACK); + qib_schedule_send(qp); + } + } + wqe = do_rc_completion(qp, wqe, ibp); + if (qp->s_acked == qp->s_tail) + break; + } + + switch (aeth >> 29) { + case 0: /* ACK */ + ibp->n_rc_acks++; + if (qp->s_acked != qp->s_tail) { + /* + * We are expecting more ACKs so + * reset the retransmit timer. + */ + start_timer(qp); + /* + * We can stop resending the earlier packets and + * continue with the next packet the receiver wants. + */ + if (qib_cmp24(qp->s_psn, psn) <= 0) + reset_psn(qp, psn + 1); + } else if (qib_cmp24(qp->s_psn, psn) <= 0) { + qp->s_state = OP(SEND_LAST); + qp->s_psn = psn + 1; + } + if (qp->s_flags & QIB_S_WAIT_ACK) { + qp->s_flags &= ~QIB_S_WAIT_ACK; + qib_schedule_send(qp); + } + qib_get_credit(qp, aeth); + qp->s_rnr_retry = qp->s_rnr_retry_cnt; + qp->s_retry = qp->s_retry_cnt; + update_last_psn(qp, psn); + ret = 1; + goto bail; + + case 1: /* RNR NAK */ + ibp->n_rnr_naks++; + if (qp->s_acked == qp->s_tail) + goto bail; + if (qp->s_flags & QIB_S_WAIT_RNR) + goto bail; + if (qp->s_rnr_retry == 0) { + status = IB_WC_RNR_RETRY_EXC_ERR; + goto class_b; + } + if (qp->s_rnr_retry_cnt < 7) + qp->s_rnr_retry--; + + /* The last valid PSN is the previous PSN. */ + update_last_psn(qp, psn - 1); + + ibp->n_rc_resends += (qp->s_psn - psn) & QIB_PSN_MASK; + + reset_psn(qp, psn); + + qp->s_flags &= ~(QIB_S_WAIT_SSN_CREDIT | QIB_S_WAIT_ACK); + qp->s_flags |= QIB_S_WAIT_RNR; + qp->s_timer.function = qib_rc_rnr_retry; + qp->s_timer.expires = jiffies + usecs_to_jiffies( + ib_qib_rnr_table[(aeth >> QIB_AETH_CREDIT_SHIFT) & + QIB_AETH_CREDIT_MASK]); + add_timer(&qp->s_timer); + goto bail; + + case 3: /* NAK */ + if (qp->s_acked == qp->s_tail) + goto bail; + /* The last valid PSN is the previous PSN. */ + update_last_psn(qp, psn - 1); + switch ((aeth >> QIB_AETH_CREDIT_SHIFT) & + QIB_AETH_CREDIT_MASK) { + case 0: /* PSN sequence error */ + ibp->n_seq_naks++; + /* + * Back up to the responder's expected PSN. + * Note that we might get a NAK in the middle of an + * RDMA READ response which terminates the RDMA + * READ. + */ + qib_restart_rc(qp, psn, 0); + qib_schedule_send(qp); + break; + + case 1: /* Invalid Request */ + status = IB_WC_REM_INV_REQ_ERR; + ibp->n_other_naks++; + goto class_b; + + case 2: /* Remote Access Error */ + status = IB_WC_REM_ACCESS_ERR; + ibp->n_other_naks++; + goto class_b; + + case 3: /* Remote Operation Error */ + status = IB_WC_REM_OP_ERR; + ibp->n_other_naks++; +class_b: + if (qp->s_last == qp->s_acked) { + qib_send_complete(qp, wqe, status); + qib_error_qp(qp, IB_WC_WR_FLUSH_ERR); + } + break; + + default: + /* Ignore other reserved NAK error codes */ + goto reserved; + } + qp->s_retry = qp->s_retry_cnt; + qp->s_rnr_retry = qp->s_rnr_retry_cnt; + goto bail; + + default: /* 2: reserved */ +reserved: + /* Ignore reserved NAK codes. */ + goto bail; + } + +bail: + return ret; +} + +/* + * We have seen an out of sequence RDMA read middle or last packet. + * This ACKs SENDs and RDMA writes up to the first RDMA read or atomic SWQE. + */ +static void rdma_seq_err(struct qib_qp *qp, struct qib_ibport *ibp, u32 psn, + struct qib_ctxtdata *rcd) +{ + struct qib_swqe *wqe; + + /* Remove QP from retry timer */ + if (qp->s_flags & (QIB_S_TIMER | QIB_S_WAIT_RNR)) { + qp->s_flags &= ~(QIB_S_TIMER | QIB_S_WAIT_RNR); + del_timer(&qp->s_timer); + } + + wqe = get_swqe_ptr(qp, qp->s_acked); + + while (qib_cmp24(psn, wqe->lpsn) > 0) { + if (wqe->wr.opcode == IB_WR_RDMA_READ || + wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP || + wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) + break; + wqe = do_rc_completion(qp, wqe, ibp); + } + + ibp->n_rdma_seq++; + qp->r_flags |= QIB_R_RDMAR_SEQ; + qib_restart_rc(qp, qp->s_last_psn + 1, 0); + if (list_empty(&qp->rspwait)) { + qp->r_flags |= QIB_R_RSP_SEND; + atomic_inc(&qp->refcount); + list_add_tail(&qp->rspwait, &rcd->qp_wait_list); + } +} + +/** + * qib_rc_rcv_resp - process an incoming RC response packet + * @ibp: the port this packet came in on + * @ohdr: the other headers for this packet + * @data: the packet data + * @tlen: the packet length + * @qp: the QP for this packet + * @opcode: the opcode for this packet + * @psn: the packet sequence number for this packet + * @hdrsize: the header length + * @pmtu: the path MTU + * + * This is called from qib_rc_rcv() to process an incoming RC response + * packet for the given QP. + * Called at interrupt level. + */ +static void qib_rc_rcv_resp(struct qib_ibport *ibp, + struct qib_other_headers *ohdr, + void *data, u32 tlen, + struct qib_qp *qp, + u32 opcode, + u32 psn, u32 hdrsize, u32 pmtu, + struct qib_ctxtdata *rcd) +{ + struct qib_swqe *wqe; + struct qib_pportdata *ppd = ppd_from_ibp(ibp); + enum ib_wc_status status; + unsigned long flags; + int diff; + u32 pad; + u32 aeth; + u64 val; + + if (opcode != OP(RDMA_READ_RESPONSE_MIDDLE)) { + /* + * If ACK'd PSN on SDMA busy list try to make progress to + * reclaim SDMA credits. + */ + if ((qib_cmp24(psn, qp->s_sending_psn) >= 0) && + (qib_cmp24(qp->s_sending_psn, qp->s_sending_hpsn) <= 0)) { + + /* + * If send tasklet not running attempt to progress + * SDMA queue. + */ + if (!(qp->s_flags & QIB_S_BUSY)) { + /* Acquire SDMA Lock */ + spin_lock_irqsave(&ppd->sdma_lock, flags); + /* Invoke sdma make progress */ + qib_sdma_make_progress(ppd); + /* Release SDMA Lock */ + spin_unlock_irqrestore(&ppd->sdma_lock, flags); + } + } + } + + spin_lock_irqsave(&qp->s_lock, flags); + if (!(ib_qib_state_ops[qp->state] & QIB_PROCESS_RECV_OK)) + goto ack_done; + + /* Ignore invalid responses. */ + if (qib_cmp24(psn, qp->s_next_psn) >= 0) + goto ack_done; + + /* Ignore duplicate responses. */ + diff = qib_cmp24(psn, qp->s_last_psn); + if (unlikely(diff <= 0)) { + /* Update credits for "ghost" ACKs */ + if (diff == 0 && opcode == OP(ACKNOWLEDGE)) { + aeth = be32_to_cpu(ohdr->u.aeth); + if ((aeth >> 29) == 0) + qib_get_credit(qp, aeth); + } + goto ack_done; + } + + /* + * Skip everything other than the PSN we expect, if we are waiting + * for a reply to a restarted RDMA read or atomic op. + */ + if (qp->r_flags & QIB_R_RDMAR_SEQ) { + if (qib_cmp24(psn, qp->s_last_psn + 1) != 0) + goto ack_done; + qp->r_flags &= ~QIB_R_RDMAR_SEQ; + } + + if (unlikely(qp->s_acked == qp->s_tail)) + goto ack_done; + wqe = get_swqe_ptr(qp, qp->s_acked); + status = IB_WC_SUCCESS; + + switch (opcode) { + case OP(ACKNOWLEDGE): + case OP(ATOMIC_ACKNOWLEDGE): + case OP(RDMA_READ_RESPONSE_FIRST): + aeth = be32_to_cpu(ohdr->u.aeth); + if (opcode == OP(ATOMIC_ACKNOWLEDGE)) { + __be32 *p = ohdr->u.at.atomic_ack_eth; + + val = ((u64) be32_to_cpu(p[0]) << 32) | + be32_to_cpu(p[1]); + } else + val = 0; + if (!do_rc_ack(qp, aeth, psn, opcode, val, rcd) || + opcode != OP(RDMA_READ_RESPONSE_FIRST)) + goto ack_done; + hdrsize += 4; + wqe = get_swqe_ptr(qp, qp->s_acked); + if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ)) + goto ack_op_err; + /* + * If this is a response to a resent RDMA read, we + * have to be careful to copy the data to the right + * location. + */ + qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge, + wqe, psn, pmtu); + goto read_middle; + + case OP(RDMA_READ_RESPONSE_MIDDLE): + /* no AETH, no ACK */ + if (unlikely(qib_cmp24(psn, qp->s_last_psn + 1))) + goto ack_seq_err; + if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ)) + goto ack_op_err; +read_middle: + if (unlikely(tlen != (hdrsize + pmtu + 4))) + goto ack_len_err; + if (unlikely(pmtu >= qp->s_rdma_read_len)) + goto ack_len_err; + + /* + * We got a response so update the timeout. + * 4.096 usec. * (1 << qp->timeout) + */ + qp->s_flags |= QIB_S_TIMER; + mod_timer(&qp->s_timer, jiffies + qp->timeout_jiffies); + if (qp->s_flags & QIB_S_WAIT_ACK) { + qp->s_flags &= ~QIB_S_WAIT_ACK; + qib_schedule_send(qp); + } + + if (opcode == OP(RDMA_READ_RESPONSE_MIDDLE)) + qp->s_retry = qp->s_retry_cnt; + + /* + * Update the RDMA receive state but do the copy w/o + * holding the locks and blocking interrupts. + */ + qp->s_rdma_read_len -= pmtu; + update_last_psn(qp, psn); + spin_unlock_irqrestore(&qp->s_lock, flags); + qib_copy_sge(&qp->s_rdma_read_sge, data, pmtu, 0); + goto bail; + + case OP(RDMA_READ_RESPONSE_ONLY): + aeth = be32_to_cpu(ohdr->u.aeth); + if (!do_rc_ack(qp, aeth, psn, opcode, 0, rcd)) + goto ack_done; + /* Get the number of bytes the message was padded by. */ + pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3; + /* + * Check that the data size is >= 0 && <= pmtu. + * Remember to account for the AETH header (4) and + * ICRC (4). + */ + if (unlikely(tlen < (hdrsize + pad + 8))) + goto ack_len_err; + /* + * If this is a response to a resent RDMA read, we + * have to be careful to copy the data to the right + * location. + */ + wqe = get_swqe_ptr(qp, qp->s_acked); + qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge, + wqe, psn, pmtu); + goto read_last; + + case OP(RDMA_READ_RESPONSE_LAST): + /* ACKs READ req. */ + if (unlikely(qib_cmp24(psn, qp->s_last_psn + 1))) + goto ack_seq_err; + if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ)) + goto ack_op_err; + /* Get the number of bytes the message was padded by. */ + pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3; + /* + * Check that the data size is >= 1 && <= pmtu. + * Remember to account for the AETH header (4) and + * ICRC (4). + */ + if (unlikely(tlen <= (hdrsize + pad + 8))) + goto ack_len_err; +read_last: + tlen -= hdrsize + pad + 8; + if (unlikely(tlen != qp->s_rdma_read_len)) + goto ack_len_err; + aeth = be32_to_cpu(ohdr->u.aeth); + qib_copy_sge(&qp->s_rdma_read_sge, data, tlen, 0); + WARN_ON(qp->s_rdma_read_sge.num_sge); + (void) do_rc_ack(qp, aeth, psn, + OP(RDMA_READ_RESPONSE_LAST), 0, rcd); + goto ack_done; + } + +ack_op_err: + status = IB_WC_LOC_QP_OP_ERR; + goto ack_err; + +ack_seq_err: + rdma_seq_err(qp, ibp, psn, rcd); + goto ack_done; + +ack_len_err: + status = IB_WC_LOC_LEN_ERR; +ack_err: + if (qp->s_last == qp->s_acked) { + qib_send_complete(qp, wqe, status); + qib_error_qp(qp, IB_WC_WR_FLUSH_ERR); + } +ack_done: + spin_unlock_irqrestore(&qp->s_lock, flags); +bail: + return; +} + +/** + * qib_rc_rcv_error - process an incoming duplicate or error RC packet + * @ohdr: the other headers for this packet + * @data: the packet data + * @qp: the QP for this packet + * @opcode: the opcode for this packet + * @psn: the packet sequence number for this packet + * @diff: the difference between the PSN and the expected PSN + * + * This is called from qib_rc_rcv() to process an unexpected + * incoming RC packet for the given QP. + * Called at interrupt level. + * Return 1 if no more processing is needed; otherwise return 0 to + * schedule a response to be sent. + */ +static int qib_rc_rcv_error(struct qib_other_headers *ohdr, + void *data, + struct qib_qp *qp, + u32 opcode, + u32 psn, + int diff, + struct qib_ctxtdata *rcd) +{ + struct qib_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); + struct qib_ack_entry *e; + unsigned long flags; + u8 i, prev; + int old_req; + + if (diff > 0) { + /* + * Packet sequence error. + * A NAK will ACK earlier sends and RDMA writes. + * Don't queue the NAK if we already sent one. + */ + if (!qp->r_nak_state) { + ibp->n_rc_seqnak++; + qp->r_nak_state = IB_NAK_PSN_ERROR; + /* Use the expected PSN. */ + qp->r_ack_psn = qp->r_psn; + /* + * Wait to send the sequence NAK until all packets + * in the receive queue have been processed. + * Otherwise, we end up propagating congestion. + */ + if (list_empty(&qp->rspwait)) { + qp->r_flags |= QIB_R_RSP_NAK; + atomic_inc(&qp->refcount); + list_add_tail(&qp->rspwait, &rcd->qp_wait_list); + } + } + goto done; + } + + /* + * Handle a duplicate request. Don't re-execute SEND, RDMA + * write or atomic op. Don't NAK errors, just silently drop + * the duplicate request. Note that r_sge, r_len, and + * r_rcv_len may be in use so don't modify them. + * + * We are supposed to ACK the earliest duplicate PSN but we + * can coalesce an outstanding duplicate ACK. We have to + * send the earliest so that RDMA reads can be restarted at + * the requester's expected PSN. + * + * First, find where this duplicate PSN falls within the + * ACKs previously sent. + * old_req is true if there is an older response that is scheduled + * to be sent before sending this one. + */ + e = NULL; + old_req = 1; + ibp->n_rc_dupreq++; + + spin_lock_irqsave(&qp->s_lock, flags); + + for (i = qp->r_head_ack_queue; ; i = prev) { + if (i == qp->s_tail_ack_queue) + old_req = 0; + if (i) + prev = i - 1; + else + prev = QIB_MAX_RDMA_ATOMIC; + if (prev == qp->r_head_ack_queue) { + e = NULL; + break; + } + e = &qp->s_ack_queue[prev]; + if (!e->opcode) { + e = NULL; + break; + } + if (qib_cmp24(psn, e->psn) >= 0) { + if (prev == qp->s_tail_ack_queue && + qib_cmp24(psn, e->lpsn) <= 0) + old_req = 0; + break; + } + } + switch (opcode) { + case OP(RDMA_READ_REQUEST): { + struct ib_reth *reth; + u32 offset; + u32 len; + + /* + * If we didn't find the RDMA read request in the ack queue, + * we can ignore this request. + */ + if (!e || e->opcode != OP(RDMA_READ_REQUEST)) + goto unlock_done; + /* RETH comes after BTH */ + reth = &ohdr->u.rc.reth; + /* + * Address range must be a subset of the original + * request and start on pmtu boundaries. + * We reuse the old ack_queue slot since the requester + * should not back up and request an earlier PSN for the + * same request. + */ + offset = ((psn - e->psn) & QIB_PSN_MASK) * + qp->pmtu; + len = be32_to_cpu(reth->length); + if (unlikely(offset + len != e->rdma_sge.sge_length)) + goto unlock_done; + if (e->rdma_sge.mr) { + qib_put_mr(e->rdma_sge.mr); + e->rdma_sge.mr = NULL; + } + if (len != 0) { + u32 rkey = be32_to_cpu(reth->rkey); + u64 vaddr = be64_to_cpu(reth->vaddr); + int ok; + + ok = qib_rkey_ok(qp, &e->rdma_sge, len, vaddr, rkey, + IB_ACCESS_REMOTE_READ); + if (unlikely(!ok)) + goto unlock_done; + } else { + e->rdma_sge.vaddr = NULL; + e->rdma_sge.length = 0; + e->rdma_sge.sge_length = 0; + } + e->psn = psn; + if (old_req) + goto unlock_done; + qp->s_tail_ack_queue = prev; + break; + } + + case OP(COMPARE_SWAP): + case OP(FETCH_ADD): { + /* + * If we didn't find the atomic request in the ack queue + * or the send tasklet is already backed up to send an + * earlier entry, we can ignore this request. + */ + if (!e || e->opcode != (u8) opcode || old_req) + goto unlock_done; + qp->s_tail_ack_queue = prev; + break; + } + + default: + /* + * Ignore this operation if it doesn't request an ACK + * or an earlier RDMA read or atomic is going to be resent. + */ + if (!(psn & IB_BTH_REQ_ACK) || old_req) + goto unlock_done; + /* + * Resend the most recent ACK if this request is + * after all the previous RDMA reads and atomics. + */ + if (i == qp->r_head_ack_queue) { + spin_unlock_irqrestore(&qp->s_lock, flags); + qp->r_nak_state = 0; + qp->r_ack_psn = qp->r_psn - 1; + goto send_ack; + } + /* + * Try to send a simple ACK to work around a Mellanox bug + * which doesn't accept a RDMA read response or atomic + * response as an ACK for earlier SENDs or RDMA writes. + */ + if (!(qp->s_flags & QIB_S_RESP_PENDING)) { + spin_unlock_irqrestore(&qp->s_lock, flags); + qp->r_nak_state = 0; + qp->r_ack_psn = qp->s_ack_queue[i].psn - 1; + goto send_ack; + } + /* + * Resend the RDMA read or atomic op which + * ACKs this duplicate request. + */ + qp->s_tail_ack_queue = i; + break; + } + qp->s_ack_state = OP(ACKNOWLEDGE); + qp->s_flags |= QIB_S_RESP_PENDING; + qp->r_nak_state = 0; + qib_schedule_send(qp); + +unlock_done: + spin_unlock_irqrestore(&qp->s_lock, flags); +done: + return 1; + +send_ack: + return 0; +} + +void qib_rc_error(struct qib_qp *qp, enum ib_wc_status err) +{ + unsigned long flags; + int lastwqe; + + spin_lock_irqsave(&qp->s_lock, flags); + lastwqe = qib_error_qp(qp, err); + spin_unlock_irqrestore(&qp->s_lock, flags); + + if (lastwqe) { + struct ib_event ev; + + ev.device = qp->ibqp.device; + ev.element.qp = &qp->ibqp; + ev.event = IB_EVENT_QP_LAST_WQE_REACHED; + qp->ibqp.event_handler(&ev, qp->ibqp.qp_context); + } +} + +static inline void qib_update_ack_queue(struct qib_qp *qp, unsigned n) +{ + unsigned next; + + next = n + 1; + if (next > QIB_MAX_RDMA_ATOMIC) + next = 0; + qp->s_tail_ack_queue = next; + qp->s_ack_state = OP(ACKNOWLEDGE); +} + +/** + * qib_rc_rcv - process an incoming RC packet + * @rcd: the context pointer + * @hdr: the header of this packet + * @has_grh: true if the header has a GRH + * @data: the packet data + * @tlen: the packet length + * @qp: the QP for this packet + * + * This is called from qib_qp_rcv() to process an incoming RC packet + * for the given QP. + * Called at interrupt level. + */ +void qib_rc_rcv(struct qib_ctxtdata *rcd, struct qib_ib_header *hdr, + int has_grh, void *data, u32 tlen, struct qib_qp *qp) +{ + struct qib_ibport *ibp = &rcd->ppd->ibport_data; + struct qib_other_headers *ohdr; + u32 opcode; + u32 hdrsize; + u32 psn; + u32 pad; + struct ib_wc wc; + u32 pmtu = qp->pmtu; + int diff; + struct ib_reth *reth; + unsigned long flags; + int ret; + + /* Check for GRH */ + if (!has_grh) { + ohdr = &hdr->u.oth; + hdrsize = 8 + 12; /* LRH + BTH */ + } else { + ohdr = &hdr->u.l.oth; + hdrsize = 8 + 40 + 12; /* LRH + GRH + BTH */ + } + + opcode = be32_to_cpu(ohdr->bth[0]); + if (qib_ruc_check_hdr(ibp, hdr, has_grh, qp, opcode)) + return; + + psn = be32_to_cpu(ohdr->bth[2]); + opcode >>= 24; + + /* + * Process responses (ACKs) before anything else. Note that the + * packet sequence number will be for something in the send work + * queue rather than the expected receive packet sequence number. + * In other words, this QP is the requester. + */ + if (opcode >= OP(RDMA_READ_RESPONSE_FIRST) && + opcode <= OP(ATOMIC_ACKNOWLEDGE)) { + qib_rc_rcv_resp(ibp, ohdr, data, tlen, qp, opcode, psn, + hdrsize, pmtu, rcd); + return; + } + + /* Compute 24 bits worth of difference. */ + diff = qib_cmp24(psn, qp->r_psn); + if (unlikely(diff)) { + if (qib_rc_rcv_error(ohdr, data, qp, opcode, psn, diff, rcd)) + return; + goto send_ack; + } + + /* Check for opcode sequence errors. */ + switch (qp->r_state) { + case OP(SEND_FIRST): + case OP(SEND_MIDDLE): + if (opcode == OP(SEND_MIDDLE) || + opcode == OP(SEND_LAST) || + opcode == OP(SEND_LAST_WITH_IMMEDIATE)) + break; + goto nack_inv; + + case OP(RDMA_WRITE_FIRST): + case OP(RDMA_WRITE_MIDDLE): + if (opcode == OP(RDMA_WRITE_MIDDLE) || + opcode == OP(RDMA_WRITE_LAST) || + opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE)) + break; + goto nack_inv; + + default: + if (opcode == OP(SEND_MIDDLE) || + opcode == OP(SEND_LAST) || + opcode == OP(SEND_LAST_WITH_IMMEDIATE) || + opcode == OP(RDMA_WRITE_MIDDLE) || + opcode == OP(RDMA_WRITE_LAST) || + opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE)) + goto nack_inv; + /* + * Note that it is up to the requester to not send a new + * RDMA read or atomic operation before receiving an ACK + * for the previous operation. + */ + break; + } + + if (qp->state == IB_QPS_RTR && !(qp->r_flags & QIB_R_COMM_EST)) { + qp->r_flags |= QIB_R_COMM_EST; + if (qp->ibqp.event_handler) { + struct ib_event ev; + + ev.device = qp->ibqp.device; + ev.element.qp = &qp->ibqp; + ev.event = IB_EVENT_COMM_EST; + qp->ibqp.event_handler(&ev, qp->ibqp.qp_context); + } + } + + /* OK, process the packet. */ + switch (opcode) { + case OP(SEND_FIRST): + ret = qib_get_rwqe(qp, 0); + if (ret < 0) + goto nack_op_err; + if (!ret) + goto rnr_nak; + qp->r_rcv_len = 0; + /* FALLTHROUGH */ + case OP(SEND_MIDDLE): + case OP(RDMA_WRITE_MIDDLE): +send_middle: + /* Check for invalid length PMTU or posted rwqe len. */ + if (unlikely(tlen != (hdrsize + pmtu + 4))) + goto nack_inv; + qp->r_rcv_len += pmtu; + if (unlikely(qp->r_rcv_len > qp->r_len)) + goto nack_inv; + qib_copy_sge(&qp->r_sge, data, pmtu, 1); + break; + + case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE): + /* consume RWQE */ + ret = qib_get_rwqe(qp, 1); + if (ret < 0) + goto nack_op_err; + if (!ret) + goto rnr_nak; + goto send_last_imm; + + case OP(SEND_ONLY): + case OP(SEND_ONLY_WITH_IMMEDIATE): + ret = qib_get_rwqe(qp, 0); + if (ret < 0) + goto nack_op_err; + if (!ret) + goto rnr_nak; + qp->r_rcv_len = 0; + if (opcode == OP(SEND_ONLY)) + goto no_immediate_data; + /* FALLTHROUGH for SEND_ONLY_WITH_IMMEDIATE */ + case OP(SEND_LAST_WITH_IMMEDIATE): +send_last_imm: + wc.ex.imm_data = ohdr->u.imm_data; + hdrsize += 4; + wc.wc_flags = IB_WC_WITH_IMM; + goto send_last; + case OP(SEND_LAST): + case OP(RDMA_WRITE_LAST): +no_immediate_data: + wc.wc_flags = 0; + wc.ex.imm_data = 0; +send_last: + /* Get the number of bytes the message was padded by. */ + pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3; + /* Check for invalid length. */ + /* XXX LAST len should be >= 1 */ + if (unlikely(tlen < (hdrsize + pad + 4))) + goto nack_inv; + /* Don't count the CRC. */ + tlen -= (hdrsize + pad + 4); + wc.byte_len = tlen + qp->r_rcv_len; + if (unlikely(wc.byte_len > qp->r_len)) + goto nack_inv; + qib_copy_sge(&qp->r_sge, data, tlen, 1); + qib_put_ss(&qp->r_sge); + qp->r_msn++; + if (!test_and_clear_bit(QIB_R_WRID_VALID, &qp->r_aflags)) + break; + wc.wr_id = qp->r_wr_id; + wc.status = IB_WC_SUCCESS; + if (opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE) || + opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE)) + wc.opcode = IB_WC_RECV_RDMA_WITH_IMM; + else + wc.opcode = IB_WC_RECV; + wc.qp = &qp->ibqp; + wc.src_qp = qp->remote_qpn; + wc.slid = qp->remote_ah_attr.dlid; + wc.sl = qp->remote_ah_attr.sl; + /* zero fields that are N/A */ + wc.vendor_err = 0; + wc.pkey_index = 0; + wc.dlid_path_bits = 0; + wc.port_num = 0; + /* Signal completion event if the solicited bit is set. */ + qib_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, + (ohdr->bth[0] & + cpu_to_be32(IB_BTH_SOLICITED)) != 0); + break; + + case OP(RDMA_WRITE_FIRST): + case OP(RDMA_WRITE_ONLY): + case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE): + if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE))) + goto nack_inv; + /* consume RWQE */ + reth = &ohdr->u.rc.reth; + hdrsize += sizeof(*reth); + qp->r_len = be32_to_cpu(reth->length); + qp->r_rcv_len = 0; + qp->r_sge.sg_list = NULL; + if (qp->r_len != 0) { + u32 rkey = be32_to_cpu(reth->rkey); + u64 vaddr = be64_to_cpu(reth->vaddr); + int ok; + + /* Check rkey & NAK */ + ok = qib_rkey_ok(qp, &qp->r_sge.sge, qp->r_len, vaddr, + rkey, IB_ACCESS_REMOTE_WRITE); + if (unlikely(!ok)) + goto nack_acc; + qp->r_sge.num_sge = 1; + } else { + qp->r_sge.num_sge = 0; + qp->r_sge.sge.mr = NULL; + qp->r_sge.sge.vaddr = NULL; + qp->r_sge.sge.length = 0; + qp->r_sge.sge.sge_length = 0; + } + if (opcode == OP(RDMA_WRITE_FIRST)) + goto send_middle; + else if (opcode == OP(RDMA_WRITE_ONLY)) + goto no_immediate_data; + ret = qib_get_rwqe(qp, 1); + if (ret < 0) + goto nack_op_err; + if (!ret) + goto rnr_nak; + wc.ex.imm_data = ohdr->u.rc.imm_data; + hdrsize += 4; + wc.wc_flags = IB_WC_WITH_IMM; + goto send_last; + + case OP(RDMA_READ_REQUEST): { + struct qib_ack_entry *e; + u32 len; + u8 next; + + if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ))) + goto nack_inv; + next = qp->r_head_ack_queue + 1; + /* s_ack_queue is size QIB_MAX_RDMA_ATOMIC+1 so use > not >= */ + if (next > QIB_MAX_RDMA_ATOMIC) + next = 0; + spin_lock_irqsave(&qp->s_lock, flags); + if (unlikely(next == qp->s_tail_ack_queue)) { + if (!qp->s_ack_queue[next].sent) + goto nack_inv_unlck; + qib_update_ack_queue(qp, next); + } + e = &qp->s_ack_queue[qp->r_head_ack_queue]; + if (e->opcode == OP(RDMA_READ_REQUEST) && e->rdma_sge.mr) { + qib_put_mr(e->rdma_sge.mr); + e->rdma_sge.mr = NULL; + } + reth = &ohdr->u.rc.reth; + len = be32_to_cpu(reth->length); + if (len) { + u32 rkey = be32_to_cpu(reth->rkey); + u64 vaddr = be64_to_cpu(reth->vaddr); + int ok; + + /* Check rkey & NAK */ + ok = qib_rkey_ok(qp, &e->rdma_sge, len, vaddr, + rkey, IB_ACCESS_REMOTE_READ); + if (unlikely(!ok)) + goto nack_acc_unlck; + /* + * Update the next expected PSN. We add 1 later + * below, so only add the remainder here. + */ + if (len > pmtu) + qp->r_psn += (len - 1) / pmtu; + } else { + e->rdma_sge.mr = NULL; + e->rdma_sge.vaddr = NULL; + e->rdma_sge.length = 0; + e->rdma_sge.sge_length = 0; + } + e->opcode = opcode; + e->sent = 0; + e->psn = psn; + e->lpsn = qp->r_psn; + /* + * We need to increment the MSN here instead of when we + * finish sending the result since a duplicate request would + * increment it more than once. + */ + qp->r_msn++; + qp->r_psn++; + qp->r_state = opcode; + qp->r_nak_state = 0; + qp->r_head_ack_queue = next; + + /* Schedule the send tasklet. */ + qp->s_flags |= QIB_S_RESP_PENDING; + qib_schedule_send(qp); + + goto sunlock; + } + + case OP(COMPARE_SWAP): + case OP(FETCH_ADD): { + struct ib_atomic_eth *ateth; + struct qib_ack_entry *e; + u64 vaddr; + atomic64_t *maddr; + u64 sdata; + u32 rkey; + u8 next; + + if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC))) + goto nack_inv; + next = qp->r_head_ack_queue + 1; + if (next > QIB_MAX_RDMA_ATOMIC) + next = 0; + spin_lock_irqsave(&qp->s_lock, flags); + if (unlikely(next == qp->s_tail_ack_queue)) { + if (!qp->s_ack_queue[next].sent) + goto nack_inv_unlck; + qib_update_ack_queue(qp, next); + } + e = &qp->s_ack_queue[qp->r_head_ack_queue]; + if (e->opcode == OP(RDMA_READ_REQUEST) && e->rdma_sge.mr) { + qib_put_mr(e->rdma_sge.mr); + e->rdma_sge.mr = NULL; + } + ateth = &ohdr->u.atomic_eth; + vaddr = ((u64) be32_to_cpu(ateth->vaddr[0]) << 32) | + be32_to_cpu(ateth->vaddr[1]); + if (unlikely(vaddr & (sizeof(u64) - 1))) + goto nack_inv_unlck; + rkey = be32_to_cpu(ateth->rkey); + /* Check rkey & NAK */ + if (unlikely(!qib_rkey_ok(qp, &qp->r_sge.sge, sizeof(u64), + vaddr, rkey, + IB_ACCESS_REMOTE_ATOMIC))) + goto nack_acc_unlck; + /* Perform atomic OP and save result. */ + maddr = (atomic64_t *) qp->r_sge.sge.vaddr; + sdata = be64_to_cpu(ateth->swap_data); + e->atomic_data = (opcode == OP(FETCH_ADD)) ? + (u64) atomic64_add_return(sdata, maddr) - sdata : + (u64) cmpxchg((u64 *) qp->r_sge.sge.vaddr, + be64_to_cpu(ateth->compare_data), + sdata); + qib_put_mr(qp->r_sge.sge.mr); + qp->r_sge.num_sge = 0; + e->opcode = opcode; + e->sent = 0; + e->psn = psn; + e->lpsn = psn; + qp->r_msn++; + qp->r_psn++; + qp->r_state = opcode; + qp->r_nak_state = 0; + qp->r_head_ack_queue = next; + + /* Schedule the send tasklet. */ + qp->s_flags |= QIB_S_RESP_PENDING; + qib_schedule_send(qp); + + goto sunlock; + } + + default: + /* NAK unknown opcodes. */ + goto nack_inv; + } + qp->r_psn++; + qp->r_state = opcode; + qp->r_ack_psn = psn; + qp->r_nak_state = 0; + /* Send an ACK if requested or required. */ + if (psn & (1 << 31)) + goto send_ack; + return; + +rnr_nak: + qp->r_nak_state = IB_RNR_NAK | qp->r_min_rnr_timer; + qp->r_ack_psn = qp->r_psn; + /* Queue RNR NAK for later */ + if (list_empty(&qp->rspwait)) { + qp->r_flags |= QIB_R_RSP_NAK; + atomic_inc(&qp->refcount); + list_add_tail(&qp->rspwait, &rcd->qp_wait_list); + } + return; + +nack_op_err: + qib_rc_error(qp, IB_WC_LOC_QP_OP_ERR); + qp->r_nak_state = IB_NAK_REMOTE_OPERATIONAL_ERROR; + qp->r_ack_psn = qp->r_psn; + /* Queue NAK for later */ + if (list_empty(&qp->rspwait)) { + qp->r_flags |= QIB_R_RSP_NAK; + atomic_inc(&qp->refcount); + list_add_tail(&qp->rspwait, &rcd->qp_wait_list); + } + return; + +nack_inv_unlck: + spin_unlock_irqrestore(&qp->s_lock, flags); +nack_inv: + qib_rc_error(qp, IB_WC_LOC_QP_OP_ERR); + qp->r_nak_state = IB_NAK_INVALID_REQUEST; + qp->r_ack_psn = qp->r_psn; + /* Queue NAK for later */ + if (list_empty(&qp->rspwait)) { + qp->r_flags |= QIB_R_RSP_NAK; + atomic_inc(&qp->refcount); + list_add_tail(&qp->rspwait, &rcd->qp_wait_list); + } + return; + +nack_acc_unlck: + spin_unlock_irqrestore(&qp->s_lock, flags); +nack_acc: + qib_rc_error(qp, IB_WC_LOC_PROT_ERR); + qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR; + qp->r_ack_psn = qp->r_psn; +send_ack: + qib_send_rc_ack(qp); + return; + +sunlock: + spin_unlock_irqrestore(&qp->s_lock, flags); +} diff --git a/drivers/infiniband/hw/qib/qib_ruc.c b/drivers/infiniband/hw/qib/qib_ruc.c new file mode 100644 index 0000000..4c07a8b --- /dev/null +++ b/drivers/infiniband/hw/qib/qib_ruc.c @@ -0,0 +1,819 @@ +/* + * Copyright (c) 2006, 2007, 2008, 2009 QLogic Corporation. All rights reserved. + * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include + +#include "qib.h" +#include "qib_mad.h" + +/* + * Convert the AETH RNR timeout code into the number of microseconds. + */ +const u32 ib_qib_rnr_table[32] = { + 655360, /* 00: 655.36 */ + 10, /* 01: .01 */ + 20, /* 02 .02 */ + 30, /* 03: .03 */ + 40, /* 04: .04 */ + 60, /* 05: .06 */ + 80, /* 06: .08 */ + 120, /* 07: .12 */ + 160, /* 08: .16 */ + 240, /* 09: .24 */ + 320, /* 0A: .32 */ + 480, /* 0B: .48 */ + 640, /* 0C: .64 */ + 960, /* 0D: .96 */ + 1280, /* 0E: 1.28 */ + 1920, /* 0F: 1.92 */ + 2560, /* 10: 2.56 */ + 3840, /* 11: 3.84 */ + 5120, /* 12: 5.12 */ + 7680, /* 13: 7.68 */ + 10240, /* 14: 10.24 */ + 15360, /* 15: 15.36 */ + 20480, /* 16: 20.48 */ + 30720, /* 17: 30.72 */ + 40960, /* 18: 40.96 */ + 61440, /* 19: 61.44 */ + 81920, /* 1A: 81.92 */ + 122880, /* 1B: 122.88 */ + 163840, /* 1C: 163.84 */ + 245760, /* 1D: 245.76 */ + 327680, /* 1E: 327.68 */ + 491520 /* 1F: 491.52 */ +}; + +/* + * Validate a RWQE and fill in the SGE state. + * Return 1 if OK. + */ +static int qib_init_sge(struct qib_qp *qp, struct qib_rwqe *wqe) +{ + int i, j, ret; + struct ib_wc wc; + struct qib_lkey_table *rkt; + struct qib_pd *pd; + struct qib_sge_state *ss; + + rkt = &to_idev(qp->ibqp.device)->lk_table; + pd = to_ipd(qp->ibqp.srq ? qp->ibqp.srq->pd : qp->ibqp.pd); + ss = &qp->r_sge; + ss->sg_list = qp->r_sg_list; + qp->r_len = 0; + for (i = j = 0; i < wqe->num_sge; i++) { + if (wqe->sg_list[i].length == 0) + continue; + /* Check LKEY */ + if (!qib_lkey_ok(rkt, pd, j ? &ss->sg_list[j - 1] : &ss->sge, + &wqe->sg_list[i], IB_ACCESS_LOCAL_WRITE)) + goto bad_lkey; + qp->r_len += wqe->sg_list[i].length; + j++; + } + ss->num_sge = j; + ss->total_len = qp->r_len; + ret = 1; + goto bail; + +bad_lkey: + while (j) { + struct qib_sge *sge = --j ? &ss->sg_list[j - 1] : &ss->sge; + + qib_put_mr(sge->mr); + } + ss->num_sge = 0; + memset(&wc, 0, sizeof(wc)); + wc.wr_id = wqe->wr_id; + wc.status = IB_WC_LOC_PROT_ERR; + wc.opcode = IB_WC_RECV; + wc.qp = &qp->ibqp; + /* Signal solicited completion event. */ + qib_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, 1); + ret = 0; +bail: + return ret; +} + +/** + * qib_get_rwqe - copy the next RWQE into the QP's RWQE + * @qp: the QP + * @wr_id_only: update qp->r_wr_id only, not qp->r_sge + * + * Return -1 if there is a local error, 0 if no RWQE is available, + * otherwise return 1. + * + * Can be called from interrupt level. + */ +int qib_get_rwqe(struct qib_qp *qp, int wr_id_only) +{ + unsigned long flags; + struct qib_rq *rq; + struct qib_rwq *wq; + struct qib_srq *srq; + struct qib_rwqe *wqe; + void (*handler)(struct ib_event *, void *); + u32 tail; + int ret; + + if (qp->ibqp.srq) { + srq = to_isrq(qp->ibqp.srq); + handler = srq->ibsrq.event_handler; + rq = &srq->rq; + } else { + srq = NULL; + handler = NULL; + rq = &qp->r_rq; + } + + spin_lock_irqsave(&rq->lock, flags); + if (!(ib_qib_state_ops[qp->state] & QIB_PROCESS_RECV_OK)) { + ret = 0; + goto unlock; + } + + wq = rq->wq; + tail = wq->tail; + /* Validate tail before using it since it is user writable. */ + if (tail >= rq->size) + tail = 0; + if (unlikely(tail == wq->head)) { + ret = 0; + goto unlock; + } + /* Make sure entry is read after head index is read. */ + smp_rmb(); + wqe = get_rwqe_ptr(rq, tail); + /* + * Even though we update the tail index in memory, the verbs + * consumer is not supposed to post more entries until a + * completion is generated. + */ + if (++tail >= rq->size) + tail = 0; + wq->tail = tail; + if (!wr_id_only && !qib_init_sge(qp, wqe)) { + ret = -1; + goto unlock; + } + qp->r_wr_id = wqe->wr_id; + + ret = 1; + set_bit(QIB_R_WRID_VALID, &qp->r_aflags); + if (handler) { + u32 n; + + /* + * Validate head pointer value and compute + * the number of remaining WQEs. + */ + n = wq->head; + if (n >= rq->size) + n = 0; + if (n < tail) + n += rq->size - tail; + else + n -= tail; + if (n < srq->limit) { + struct ib_event ev; + + srq->limit = 0; + spin_unlock_irqrestore(&rq->lock, flags); + ev.device = qp->ibqp.device; + ev.element.srq = qp->ibqp.srq; + ev.event = IB_EVENT_SRQ_LIMIT_REACHED; + handler(&ev, srq->ibsrq.srq_context); + goto bail; + } + } +unlock: + spin_unlock_irqrestore(&rq->lock, flags); +bail: + return ret; +} + +/* + * Switch to alternate path. + * The QP s_lock should be held and interrupts disabled. + */ +void qib_migrate_qp(struct qib_qp *qp) +{ + struct ib_event ev; + + qp->s_mig_state = IB_MIG_MIGRATED; + qp->remote_ah_attr = qp->alt_ah_attr; + qp->port_num = qp->alt_ah_attr.port_num; + qp->s_pkey_index = qp->s_alt_pkey_index; + + ev.device = qp->ibqp.device; + ev.element.qp = &qp->ibqp; + ev.event = IB_EVENT_PATH_MIG; + qp->ibqp.event_handler(&ev, qp->ibqp.qp_context); +} + +static __be64 get_sguid(struct qib_ibport *ibp, unsigned index) +{ + if (!index) { + struct qib_pportdata *ppd = ppd_from_ibp(ibp); + + return ppd->guid; + } else + return ibp->guids[index - 1]; +} + +static int gid_ok(union ib_gid *gid, __be64 gid_prefix, __be64 id) +{ + return (gid->global.interface_id == id && + (gid->global.subnet_prefix == gid_prefix || + gid->global.subnet_prefix == IB_DEFAULT_GID_PREFIX)); +} + +/* + * + * This should be called with the QP r_lock held. + * + * The s_lock will be acquired around the qib_migrate_qp() call. + */ +int qib_ruc_check_hdr(struct qib_ibport *ibp, struct qib_ib_header *hdr, + int has_grh, struct qib_qp *qp, u32 bth0) +{ + __be64 guid; + unsigned long flags; + + if (qp->s_mig_state == IB_MIG_ARMED && (bth0 & IB_BTH_MIG_REQ)) { + if (!has_grh) { + if (qp->alt_ah_attr.ah_flags & IB_AH_GRH) + goto err; + } else { + if (!(qp->alt_ah_attr.ah_flags & IB_AH_GRH)) + goto err; + guid = get_sguid(ibp, qp->alt_ah_attr.grh.sgid_index); + if (!gid_ok(&hdr->u.l.grh.dgid, ibp->gid_prefix, guid)) + goto err; + if (!gid_ok(&hdr->u.l.grh.sgid, + qp->alt_ah_attr.grh.dgid.global.subnet_prefix, + qp->alt_ah_attr.grh.dgid.global.interface_id)) + goto err; + } + if (!qib_pkey_ok((u16)bth0, + qib_get_pkey(ibp, qp->s_alt_pkey_index))) { + qib_bad_pqkey(ibp, IB_NOTICE_TRAP_BAD_PKEY, + (u16)bth0, + (be16_to_cpu(hdr->lrh[0]) >> 4) & 0xF, + 0, qp->ibqp.qp_num, + hdr->lrh[3], hdr->lrh[1]); + goto err; + } + /* Validate the SLID. See Ch. 9.6.1.5 and 17.2.8 */ + if (be16_to_cpu(hdr->lrh[3]) != qp->alt_ah_attr.dlid || + ppd_from_ibp(ibp)->port != qp->alt_ah_attr.port_num) + goto err; + spin_lock_irqsave(&qp->s_lock, flags); + qib_migrate_qp(qp); + spin_unlock_irqrestore(&qp->s_lock, flags); + } else { + if (!has_grh) { + if (qp->remote_ah_attr.ah_flags & IB_AH_GRH) + goto err; + } else { + if (!(qp->remote_ah_attr.ah_flags & IB_AH_GRH)) + goto err; + guid = get_sguid(ibp, + qp->remote_ah_attr.grh.sgid_index); + if (!gid_ok(&hdr->u.l.grh.dgid, ibp->gid_prefix, guid)) + goto err; + if (!gid_ok(&hdr->u.l.grh.sgid, + qp->remote_ah_attr.grh.dgid.global.subnet_prefix, + qp->remote_ah_attr.grh.dgid.global.interface_id)) + goto err; + } + if (!qib_pkey_ok((u16)bth0, + qib_get_pkey(ibp, qp->s_pkey_index))) { + qib_bad_pqkey(ibp, IB_NOTICE_TRAP_BAD_PKEY, + (u16)bth0, + (be16_to_cpu(hdr->lrh[0]) >> 4) & 0xF, + 0, qp->ibqp.qp_num, + hdr->lrh[3], hdr->lrh[1]); + goto err; + } + /* Validate the SLID. See Ch. 9.6.1.5 */ + if (be16_to_cpu(hdr->lrh[3]) != qp->remote_ah_attr.dlid || + ppd_from_ibp(ibp)->port != qp->port_num) + goto err; + if (qp->s_mig_state == IB_MIG_REARM && + !(bth0 & IB_BTH_MIG_REQ)) + qp->s_mig_state = IB_MIG_ARMED; + } + + return 0; + +err: + return 1; +} + +/** + * qib_ruc_loopback - handle UC and RC lookback requests + * @sqp: the sending QP + * + * This is called from qib_do_send() to + * forward a WQE addressed to the same HCA. + * Note that although we are single threaded due to the tasklet, we still + * have to protect against post_send(). We don't have to worry about + * receive interrupts since this is a connected protocol and all packets + * will pass through here. + */ +static void qib_ruc_loopback(struct qib_qp *sqp) +{ + struct qib_ibport *ibp = to_iport(sqp->ibqp.device, sqp->port_num); + struct qib_qp *qp; + struct qib_swqe *wqe; + struct qib_sge *sge; + unsigned long flags; + struct ib_wc wc; + u64 sdata; + atomic64_t *maddr; + enum ib_wc_status send_status; + int release; + int ret; + + /* + * Note that we check the responder QP state after + * checking the requester's state. + */ + qp = qib_lookup_qpn(ibp, sqp->remote_qpn); + + spin_lock_irqsave(&sqp->s_lock, flags); + + /* Return if we are already busy processing a work request. */ + if ((sqp->s_flags & (QIB_S_BUSY | QIB_S_ANY_WAIT)) || + !(ib_qib_state_ops[sqp->state] & QIB_PROCESS_OR_FLUSH_SEND)) + goto unlock; + + sqp->s_flags |= QIB_S_BUSY; + +again: + if (sqp->s_last == sqp->s_head) + goto clr_busy; + wqe = get_swqe_ptr(sqp, sqp->s_last); + + /* Return if it is not OK to start a new work reqeust. */ + if (!(ib_qib_state_ops[sqp->state] & QIB_PROCESS_NEXT_SEND_OK)) { + if (!(ib_qib_state_ops[sqp->state] & QIB_FLUSH_SEND)) + goto clr_busy; + /* We are in the error state, flush the work request. */ + send_status = IB_WC_WR_FLUSH_ERR; + goto flush_send; + } + + /* + * We can rely on the entry not changing without the s_lock + * being held until we update s_last. + * We increment s_cur to indicate s_last is in progress. + */ + if (sqp->s_last == sqp->s_cur) { + if (++sqp->s_cur >= sqp->s_size) + sqp->s_cur = 0; + } + spin_unlock_irqrestore(&sqp->s_lock, flags); + + if (!qp || !(ib_qib_state_ops[qp->state] & QIB_PROCESS_RECV_OK) || + qp->ibqp.qp_type != sqp->ibqp.qp_type) { + ibp->n_pkt_drops++; + /* + * For RC, the requester would timeout and retry so + * shortcut the timeouts and just signal too many retries. + */ + if (sqp->ibqp.qp_type == IB_QPT_RC) + send_status = IB_WC_RETRY_EXC_ERR; + else + send_status = IB_WC_SUCCESS; + goto serr; + } + + memset(&wc, 0, sizeof wc); + send_status = IB_WC_SUCCESS; + + release = 1; + sqp->s_sge.sge = wqe->sg_list[0]; + sqp->s_sge.sg_list = wqe->sg_list + 1; + sqp->s_sge.num_sge = wqe->wr.num_sge; + sqp->s_len = wqe->length; + switch (wqe->wr.opcode) { + case IB_WR_SEND_WITH_IMM: + wc.wc_flags = IB_WC_WITH_IMM; + wc.ex.imm_data = wqe->wr.ex.imm_data; + /* FALLTHROUGH */ + case IB_WR_SEND: + ret = qib_get_rwqe(qp, 0); + if (ret < 0) + goto op_err; + if (!ret) + goto rnr_nak; + break; + + case IB_WR_RDMA_WRITE_WITH_IMM: + if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE))) + goto inv_err; + wc.wc_flags = IB_WC_WITH_IMM; + wc.ex.imm_data = wqe->wr.ex.imm_data; + ret = qib_get_rwqe(qp, 1); + if (ret < 0) + goto op_err; + if (!ret) + goto rnr_nak; + /* FALLTHROUGH */ + case IB_WR_RDMA_WRITE: + if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE))) + goto inv_err; + if (wqe->length == 0) + break; + if (unlikely(!qib_rkey_ok(qp, &qp->r_sge.sge, wqe->length, + wqe->wr.wr.rdma.remote_addr, + wqe->wr.wr.rdma.rkey, + IB_ACCESS_REMOTE_WRITE))) + goto acc_err; + qp->r_sge.sg_list = NULL; + qp->r_sge.num_sge = 1; + qp->r_sge.total_len = wqe->length; + break; + + case IB_WR_RDMA_READ: + if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ))) + goto inv_err; + if (unlikely(!qib_rkey_ok(qp, &sqp->s_sge.sge, wqe->length, + wqe->wr.wr.rdma.remote_addr, + wqe->wr.wr.rdma.rkey, + IB_ACCESS_REMOTE_READ))) + goto acc_err; + release = 0; + sqp->s_sge.sg_list = NULL; + sqp->s_sge.num_sge = 1; + qp->r_sge.sge = wqe->sg_list[0]; + qp->r_sge.sg_list = wqe->sg_list + 1; + qp->r_sge.num_sge = wqe->wr.num_sge; + qp->r_sge.total_len = wqe->length; + break; + + case IB_WR_ATOMIC_CMP_AND_SWP: + case IB_WR_ATOMIC_FETCH_AND_ADD: + if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC))) + goto inv_err; + if (unlikely(!qib_rkey_ok(qp, &qp->r_sge.sge, sizeof(u64), + wqe->wr.wr.atomic.remote_addr, + wqe->wr.wr.atomic.rkey, + IB_ACCESS_REMOTE_ATOMIC))) + goto acc_err; + /* Perform atomic OP and save result. */ + maddr = (atomic64_t *) qp->r_sge.sge.vaddr; + sdata = wqe->wr.wr.atomic.compare_add; + *(u64 *) sqp->s_sge.sge.vaddr = + (wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) ? + (u64) atomic64_add_return(sdata, maddr) - sdata : + (u64) cmpxchg((u64 *) qp->r_sge.sge.vaddr, + sdata, wqe->wr.wr.atomic.swap); + qib_put_mr(qp->r_sge.sge.mr); + qp->r_sge.num_sge = 0; + goto send_comp; + + default: + send_status = IB_WC_LOC_QP_OP_ERR; + goto serr; + } + + sge = &sqp->s_sge.sge; + while (sqp->s_len) { + u32 len = sqp->s_len; + + if (len > sge->length) + len = sge->length; + if (len > sge->sge_length) + len = sge->sge_length; + BUG_ON(len == 0); + qib_copy_sge(&qp->r_sge, sge->vaddr, len, release); + sge->vaddr += len; + sge->length -= len; + sge->sge_length -= len; + if (sge->sge_length == 0) { + if (!release) + qib_put_mr(sge->mr); + if (--sqp->s_sge.num_sge) + *sge = *sqp->s_sge.sg_list++; + } else if (sge->length == 0 && sge->mr->lkey) { + if (++sge->n >= QIB_SEGSZ) { + if (++sge->m >= sge->mr->mapsz) + break; + sge->n = 0; + } + sge->vaddr = + sge->mr->map[sge->m]->segs[sge->n].vaddr; + sge->length = + sge->mr->map[sge->m]->segs[sge->n].length; + } + sqp->s_len -= len; + } + if (release) + qib_put_ss(&qp->r_sge); + + if (!test_and_clear_bit(QIB_R_WRID_VALID, &qp->r_aflags)) + goto send_comp; + + if (wqe->wr.opcode == IB_WR_RDMA_WRITE_WITH_IMM) + wc.opcode = IB_WC_RECV_RDMA_WITH_IMM; + else + wc.opcode = IB_WC_RECV; + wc.wr_id = qp->r_wr_id; + wc.status = IB_WC_SUCCESS; + wc.byte_len = wqe->length; + wc.qp = &qp->ibqp; + wc.src_qp = qp->remote_qpn; + wc.slid = qp->remote_ah_attr.dlid; + wc.sl = qp->remote_ah_attr.sl; + wc.port_num = 1; + /* Signal completion event if the solicited bit is set. */ + qib_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, + wqe->wr.send_flags & IB_SEND_SOLICITED); + +send_comp: + spin_lock_irqsave(&sqp->s_lock, flags); + ibp->n_loop_pkts++; +flush_send: + sqp->s_rnr_retry = sqp->s_rnr_retry_cnt; + qib_send_complete(sqp, wqe, send_status); + goto again; + +rnr_nak: + /* Handle RNR NAK */ + if (qp->ibqp.qp_type == IB_QPT_UC) + goto send_comp; + ibp->n_rnr_naks++; + /* + * Note: we don't need the s_lock held since the BUSY flag + * makes this single threaded. + */ + if (sqp->s_rnr_retry == 0) { + send_status = IB_WC_RNR_RETRY_EXC_ERR; + goto serr; + } + if (sqp->s_rnr_retry_cnt < 7) + sqp->s_rnr_retry--; + spin_lock_irqsave(&sqp->s_lock, flags); + if (!(ib_qib_state_ops[sqp->state] & QIB_PROCESS_RECV_OK)) + goto clr_busy; + sqp->s_flags |= QIB_S_WAIT_RNR; + sqp->s_timer.function = qib_rc_rnr_retry; + sqp->s_timer.expires = jiffies + + usecs_to_jiffies(ib_qib_rnr_table[qp->r_min_rnr_timer]); + add_timer(&sqp->s_timer); + goto clr_busy; + +op_err: + send_status = IB_WC_REM_OP_ERR; + wc.status = IB_WC_LOC_QP_OP_ERR; + goto err; + +inv_err: + send_status = IB_WC_REM_INV_REQ_ERR; + wc.status = IB_WC_LOC_QP_OP_ERR; + goto err; + +acc_err: + send_status = IB_WC_REM_ACCESS_ERR; + wc.status = IB_WC_LOC_PROT_ERR; +err: + /* responder goes to error state */ + qib_rc_error(qp, wc.status); + +serr: + spin_lock_irqsave(&sqp->s_lock, flags); + qib_send_complete(sqp, wqe, send_status); + if (sqp->ibqp.qp_type == IB_QPT_RC) { + int lastwqe = qib_error_qp(sqp, IB_WC_WR_FLUSH_ERR); + + sqp->s_flags &= ~QIB_S_BUSY; + spin_unlock_irqrestore(&sqp->s_lock, flags); + if (lastwqe) { + struct ib_event ev; + + ev.device = sqp->ibqp.device; + ev.element.qp = &sqp->ibqp; + ev.event = IB_EVENT_QP_LAST_WQE_REACHED; + sqp->ibqp.event_handler(&ev, sqp->ibqp.qp_context); + } + goto done; + } +clr_busy: + sqp->s_flags &= ~QIB_S_BUSY; +unlock: + spin_unlock_irqrestore(&sqp->s_lock, flags); +done: + if (qp && atomic_dec_and_test(&qp->refcount)) + wake_up(&qp->wait); +} + +/** + * qib_make_grh - construct a GRH header + * @ibp: a pointer to the IB port + * @hdr: a pointer to the GRH header being constructed + * @grh: the global route address to send to + * @hwords: the number of 32 bit words of header being sent + * @nwords: the number of 32 bit words of data being sent + * + * Return the size of the header in 32 bit words. + */ +u32 qib_make_grh(struct qib_ibport *ibp, struct ib_grh *hdr, + struct ib_global_route *grh, u32 hwords, u32 nwords) +{ + hdr->version_tclass_flow = + cpu_to_be32((IB_GRH_VERSION << IB_GRH_VERSION_SHIFT) | + (grh->traffic_class << IB_GRH_TCLASS_SHIFT) | + (grh->flow_label << IB_GRH_FLOW_SHIFT)); + hdr->paylen = cpu_to_be16((hwords - 2 + nwords + SIZE_OF_CRC) << 2); + /* next_hdr is defined by C8-7 in ch. 8.4.1 */ + hdr->next_hdr = IB_GRH_NEXT_HDR; + hdr->hop_limit = grh->hop_limit; + /* The SGID is 32-bit aligned. */ + hdr->sgid.global.subnet_prefix = ibp->gid_prefix; + hdr->sgid.global.interface_id = grh->sgid_index ? + ibp->guids[grh->sgid_index - 1] : ppd_from_ibp(ibp)->guid; + hdr->dgid = grh->dgid; + + /* GRH header size in 32-bit words. */ + return sizeof(struct ib_grh) / sizeof(u32); +} + +void qib_make_ruc_header(struct qib_qp *qp, struct qib_other_headers *ohdr, + u32 bth0, u32 bth2) +{ + struct qib_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); + u16 lrh0; + u32 nwords; + u32 extra_bytes; + + /* Construct the header. */ + extra_bytes = -qp->s_cur_size & 3; + nwords = (qp->s_cur_size + extra_bytes) >> 2; + lrh0 = QIB_LRH_BTH; + if (unlikely(qp->remote_ah_attr.ah_flags & IB_AH_GRH)) { + qp->s_hdrwords += qib_make_grh(ibp, &qp->s_hdr->u.l.grh, + &qp->remote_ah_attr.grh, + qp->s_hdrwords, nwords); + lrh0 = QIB_LRH_GRH; + } + lrh0 |= ibp->sl_to_vl[qp->remote_ah_attr.sl] << 12 | + qp->remote_ah_attr.sl << 4; + qp->s_hdr->lrh[0] = cpu_to_be16(lrh0); + qp->s_hdr->lrh[1] = cpu_to_be16(qp->remote_ah_attr.dlid); + qp->s_hdr->lrh[2] = cpu_to_be16(qp->s_hdrwords + nwords + SIZE_OF_CRC); + qp->s_hdr->lrh[3] = cpu_to_be16(ppd_from_ibp(ibp)->lid | + qp->remote_ah_attr.src_path_bits); + bth0 |= qib_get_pkey(ibp, qp->s_pkey_index); + bth0 |= extra_bytes << 20; + if (qp->s_mig_state == IB_MIG_MIGRATED) + bth0 |= IB_BTH_MIG_REQ; + ohdr->bth[0] = cpu_to_be32(bth0); + ohdr->bth[1] = cpu_to_be32(qp->remote_qpn); + ohdr->bth[2] = cpu_to_be32(bth2); + this_cpu_inc(ibp->pmastats->n_unicast_xmit); +} + +/** + * qib_do_send - perform a send on a QP + * @work: contains a pointer to the QP + * + * Process entries in the send work queue until credit or queue is + * exhausted. Only allow one CPU to send a packet per QP (tasklet). + * Otherwise, two threads could send packets out of order. + */ +void qib_do_send(struct work_struct *work) +{ + struct qib_qp *qp = container_of(work, struct qib_qp, s_work); + struct qib_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); + struct qib_pportdata *ppd = ppd_from_ibp(ibp); + int (*make_req)(struct qib_qp *qp); + unsigned long flags; + + if ((qp->ibqp.qp_type == IB_QPT_RC || + qp->ibqp.qp_type == IB_QPT_UC) && + (qp->remote_ah_attr.dlid & ~((1 << ppd->lmc) - 1)) == ppd->lid) { + qib_ruc_loopback(qp); + return; + } + + if (qp->ibqp.qp_type == IB_QPT_RC) + make_req = qib_make_rc_req; + else if (qp->ibqp.qp_type == IB_QPT_UC) + make_req = qib_make_uc_req; + else + make_req = qib_make_ud_req; + + spin_lock_irqsave(&qp->s_lock, flags); + + /* Return if we are already busy processing a work request. */ + if (!qib_send_ok(qp)) { + spin_unlock_irqrestore(&qp->s_lock, flags); + return; + } + + qp->s_flags |= QIB_S_BUSY; + + spin_unlock_irqrestore(&qp->s_lock, flags); + + do { + /* Check for a constructed packet to be sent. */ + if (qp->s_hdrwords != 0) { + /* + * If the packet cannot be sent now, return and + * the send tasklet will be woken up later. + */ + if (qib_verbs_send(qp, qp->s_hdr, qp->s_hdrwords, + qp->s_cur_sge, qp->s_cur_size)) + break; + /* Record that s_hdr is empty. */ + qp->s_hdrwords = 0; + } + } while (make_req(qp)); +} + +/* + * This should be called with s_lock held. + */ +void qib_send_complete(struct qib_qp *qp, struct qib_swqe *wqe, + enum ib_wc_status status) +{ + u32 old_last, last; + unsigned i; + + if (!(ib_qib_state_ops[qp->state] & QIB_PROCESS_OR_FLUSH_SEND)) + return; + + for (i = 0; i < wqe->wr.num_sge; i++) { + struct qib_sge *sge = &wqe->sg_list[i]; + + qib_put_mr(sge->mr); + } + if (qp->ibqp.qp_type == IB_QPT_UD || + qp->ibqp.qp_type == IB_QPT_SMI || + qp->ibqp.qp_type == IB_QPT_GSI) + atomic_dec(&to_iah(wqe->wr.wr.ud.ah)->refcount); + + /* See ch. 11.2.4.1 and 10.7.3.1 */ + if (!(qp->s_flags & QIB_S_SIGNAL_REQ_WR) || + (wqe->wr.send_flags & IB_SEND_SIGNALED) || + status != IB_WC_SUCCESS) { + struct ib_wc wc; + + memset(&wc, 0, sizeof wc); + wc.wr_id = wqe->wr.wr_id; + wc.status = status; + wc.opcode = ib_qib_wc_opcode[wqe->wr.opcode]; + wc.qp = &qp->ibqp; + if (status == IB_WC_SUCCESS) + wc.byte_len = wqe->length; + qib_cq_enter(to_icq(qp->ibqp.send_cq), &wc, + status != IB_WC_SUCCESS); + } + + last = qp->s_last; + old_last = last; + if (++last >= qp->s_size) + last = 0; + qp->s_last = last; + if (qp->s_acked == old_last) + qp->s_acked = last; + if (qp->s_cur == old_last) + qp->s_cur = last; + if (qp->s_tail == old_last) + qp->s_tail = last; + if (qp->state == IB_QPS_SQD && last == qp->s_cur) + qp->s_draining = 0; +} diff --git a/drivers/infiniband/hw/qib/qib_sd7220.c b/drivers/infiniband/hw/qib/qib_sd7220.c new file mode 100644 index 0000000..911205d --- /dev/null +++ b/drivers/infiniband/hw/qib/qib_sd7220.c @@ -0,0 +1,1449 @@ +/* + * Copyright (c) 2013 Intel Corporation. All rights reserved. + * Copyright (c) 2006 - 2012 QLogic Corporation. All rights reserved. + * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +/* + * This file contains all of the code that is specific to the SerDes + * on the QLogic_IB 7220 chip. + */ + +#include +#include +#include +#include + +#include "qib.h" +#include "qib_7220.h" + +#define SD7220_FW_NAME "qlogic/sd7220.fw" +MODULE_FIRMWARE(SD7220_FW_NAME); + +/* + * Same as in qib_iba7220.c, but just the registers needed here. + * Could move whole set to qib_7220.h, but decided better to keep + * local. + */ +#define KREG_IDX(regname) (QIB_7220_##regname##_OFFS / sizeof(u64)) +#define kr_hwerrclear KREG_IDX(HwErrClear) +#define kr_hwerrmask KREG_IDX(HwErrMask) +#define kr_hwerrstatus KREG_IDX(HwErrStatus) +#define kr_ibcstatus KREG_IDX(IBCStatus) +#define kr_ibserdesctrl KREG_IDX(IBSerDesCtrl) +#define kr_scratch KREG_IDX(Scratch) +#define kr_xgxs_cfg KREG_IDX(XGXSCfg) +/* these are used only here, not in qib_iba7220.c */ +#define kr_ibsd_epb_access_ctrl KREG_IDX(ibsd_epb_access_ctrl) +#define kr_ibsd_epb_transaction_reg KREG_IDX(ibsd_epb_transaction_reg) +#define kr_pciesd_epb_transaction_reg KREG_IDX(pciesd_epb_transaction_reg) +#define kr_pciesd_epb_access_ctrl KREG_IDX(pciesd_epb_access_ctrl) +#define kr_serdes_ddsrxeq0 KREG_IDX(SerDes_DDSRXEQ0) + +/* + * The IBSerDesMappTable is a memory that holds values to be stored in + * various SerDes registers by IBC. + */ +#define kr_serdes_maptable KREG_IDX(IBSerDesMappTable) + +/* + * Below used for sdnum parameter, selecting one of the two sections + * used for PCIe, or the single SerDes used for IB. + */ +#define PCIE_SERDES0 0 +#define PCIE_SERDES1 1 + +/* + * The EPB requires addressing in a particular form. EPB_LOC() is intended + * to make #definitions a little more readable. + */ +#define EPB_ADDR_SHF 8 +#define EPB_LOC(chn, elt, reg) \ + (((elt & 0xf) | ((chn & 7) << 4) | ((reg & 0x3f) << 9)) << \ + EPB_ADDR_SHF) +#define EPB_IB_QUAD0_CS_SHF (25) +#define EPB_IB_QUAD0_CS (1U << EPB_IB_QUAD0_CS_SHF) +#define EPB_IB_UC_CS_SHF (26) +#define EPB_PCIE_UC_CS_SHF (27) +#define EPB_GLOBAL_WR (1U << (EPB_ADDR_SHF + 8)) + +/* Forward declarations. */ +static int qib_sd7220_reg_mod(struct qib_devdata *dd, int sdnum, u32 loc, + u32 data, u32 mask); +static int ibsd_mod_allchnls(struct qib_devdata *dd, int loc, int val, + int mask); +static int qib_sd_trimdone_poll(struct qib_devdata *dd); +static void qib_sd_trimdone_monitor(struct qib_devdata *dd, const char *where); +static int qib_sd_setvals(struct qib_devdata *dd); +static int qib_sd_early(struct qib_devdata *dd); +static int qib_sd_dactrim(struct qib_devdata *dd); +static int qib_internal_presets(struct qib_devdata *dd); +/* Tweak the register (CMUCTRL5) that contains the TRIMSELF controls */ +static int qib_sd_trimself(struct qib_devdata *dd, int val); +static int epb_access(struct qib_devdata *dd, int sdnum, int claim); +static int qib_sd7220_ib_load(struct qib_devdata *dd, + const struct firmware *fw); +static int qib_sd7220_ib_vfy(struct qib_devdata *dd, + const struct firmware *fw); + +/* + * Below keeps track of whether the "once per power-on" initialization has + * been done, because uC code Version 1.32.17 or higher allows the uC to + * be reset at will, and Automatic Equalization may require it. So the + * state of the reset "pin", is no longer valid. Instead, we check for the + * actual uC code having been loaded. + */ +static int qib_ibsd_ucode_loaded(struct qib_pportdata *ppd, + const struct firmware *fw) +{ + struct qib_devdata *dd = ppd->dd; + + if (!dd->cspec->serdes_first_init_done && + qib_sd7220_ib_vfy(dd, fw) > 0) + dd->cspec->serdes_first_init_done = 1; + return dd->cspec->serdes_first_init_done; +} + +/* repeat #define for local use. "Real" #define is in qib_iba7220.c */ +#define QLOGIC_IB_HWE_IB_UC_MEMORYPARITYERR 0x0000004000000000ULL +#define IB_MPREG5 (EPB_LOC(6, 0, 0xE) | (1L << EPB_IB_UC_CS_SHF)) +#define IB_MPREG6 (EPB_LOC(6, 0, 0xF) | (1U << EPB_IB_UC_CS_SHF)) +#define UC_PAR_CLR_D 8 +#define UC_PAR_CLR_M 0xC +#define IB_CTRL2(chn) (EPB_LOC(chn, 7, 3) | EPB_IB_QUAD0_CS) +#define START_EQ1(chan) EPB_LOC(chan, 7, 0x27) + +void qib_sd7220_clr_ibpar(struct qib_devdata *dd) +{ + int ret; + + /* clear, then re-enable parity errs */ + ret = qib_sd7220_reg_mod(dd, IB_7220_SERDES, IB_MPREG6, + UC_PAR_CLR_D, UC_PAR_CLR_M); + if (ret < 0) { + qib_dev_err(dd, "Failed clearing IBSerDes Parity err\n"); + goto bail; + } + ret = qib_sd7220_reg_mod(dd, IB_7220_SERDES, IB_MPREG6, 0, + UC_PAR_CLR_M); + + qib_read_kreg32(dd, kr_scratch); + udelay(4); + qib_write_kreg(dd, kr_hwerrclear, + QLOGIC_IB_HWE_IB_UC_MEMORYPARITYERR); + qib_read_kreg32(dd, kr_scratch); +bail: + return; +} + +/* + * After a reset or other unusual event, the epb interface may need + * to be re-synchronized, between the host and the uC. + * returns <0 for failure to resync within IBSD_RESYNC_TRIES (not expected) + */ +#define IBSD_RESYNC_TRIES 3 +#define IB_PGUDP(chn) (EPB_LOC((chn), 2, 1) | EPB_IB_QUAD0_CS) +#define IB_CMUDONE(chn) (EPB_LOC((chn), 7, 0xF) | EPB_IB_QUAD0_CS) + +static int qib_resync_ibepb(struct qib_devdata *dd) +{ + int ret, pat, tries, chn; + u32 loc; + + ret = -1; + chn = 0; + for (tries = 0; tries < (4 * IBSD_RESYNC_TRIES); ++tries) { + loc = IB_PGUDP(chn); + ret = qib_sd7220_reg_mod(dd, IB_7220_SERDES, loc, 0, 0); + if (ret < 0) { + qib_dev_err(dd, "Failed read in resync\n"); + continue; + } + if (ret != 0xF0 && ret != 0x55 && tries == 0) + qib_dev_err(dd, "unexpected pattern in resync\n"); + pat = ret ^ 0xA5; /* alternate F0 and 55 */ + ret = qib_sd7220_reg_mod(dd, IB_7220_SERDES, loc, pat, 0xFF); + if (ret < 0) { + qib_dev_err(dd, "Failed write in resync\n"); + continue; + } + ret = qib_sd7220_reg_mod(dd, IB_7220_SERDES, loc, 0, 0); + if (ret < 0) { + qib_dev_err(dd, "Failed re-read in resync\n"); + continue; + } + if (ret != pat) { + qib_dev_err(dd, "Failed compare1 in resync\n"); + continue; + } + loc = IB_CMUDONE(chn); + ret = qib_sd7220_reg_mod(dd, IB_7220_SERDES, loc, 0, 0); + if (ret < 0) { + qib_dev_err(dd, "Failed CMUDONE rd in resync\n"); + continue; + } + if ((ret & 0x70) != ((chn << 4) | 0x40)) { + qib_dev_err(dd, "Bad CMUDONE value %02X, chn %d\n", + ret, chn); + continue; + } + if (++chn == 4) + break; /* Success */ + } + return (ret > 0) ? 0 : ret; +} + +/* + * Localize the stuff that should be done to change IB uC reset + * returns <0 for errors. + */ +static int qib_ibsd_reset(struct qib_devdata *dd, int assert_rst) +{ + u64 rst_val; + int ret = 0; + unsigned long flags; + + rst_val = qib_read_kreg64(dd, kr_ibserdesctrl); + if (assert_rst) { + /* + * Vendor recommends "interrupting" uC before reset, to + * minimize possible glitches. + */ + spin_lock_irqsave(&dd->cspec->sdepb_lock, flags); + epb_access(dd, IB_7220_SERDES, 1); + rst_val |= 1ULL; + /* Squelch possible parity error from _asserting_ reset */ + qib_write_kreg(dd, kr_hwerrmask, + dd->cspec->hwerrmask & + ~QLOGIC_IB_HWE_IB_UC_MEMORYPARITYERR); + qib_write_kreg(dd, kr_ibserdesctrl, rst_val); + /* flush write, delay to ensure it took effect */ + qib_read_kreg32(dd, kr_scratch); + udelay(2); + /* once it's reset, can remove interrupt */ + epb_access(dd, IB_7220_SERDES, -1); + spin_unlock_irqrestore(&dd->cspec->sdepb_lock, flags); + } else { + /* + * Before we de-assert reset, we need to deal with + * possible glitch on the Parity-error line. + * Suppress it around the reset, both in chip-level + * hwerrmask and in IB uC control reg. uC will allow + * it again during startup. + */ + u64 val; + rst_val &= ~(1ULL); + qib_write_kreg(dd, kr_hwerrmask, + dd->cspec->hwerrmask & + ~QLOGIC_IB_HWE_IB_UC_MEMORYPARITYERR); + + ret = qib_resync_ibepb(dd); + if (ret < 0) + qib_dev_err(dd, "unable to re-sync IB EPB\n"); + + /* set uC control regs to suppress parity errs */ + ret = qib_sd7220_reg_mod(dd, IB_7220_SERDES, IB_MPREG5, 1, 1); + if (ret < 0) + goto bail; + /* IB uC code past Version 1.32.17 allow suppression of wdog */ + ret = qib_sd7220_reg_mod(dd, IB_7220_SERDES, IB_MPREG6, 0x80, + 0x80); + if (ret < 0) { + qib_dev_err(dd, "Failed to set WDOG disable\n"); + goto bail; + } + qib_write_kreg(dd, kr_ibserdesctrl, rst_val); + /* flush write, delay for startup */ + qib_read_kreg32(dd, kr_scratch); + udelay(1); + /* clear, then re-enable parity errs */ + qib_sd7220_clr_ibpar(dd); + val = qib_read_kreg64(dd, kr_hwerrstatus); + if (val & QLOGIC_IB_HWE_IB_UC_MEMORYPARITYERR) { + qib_dev_err(dd, "IBUC Parity still set after RST\n"); + dd->cspec->hwerrmask &= + ~QLOGIC_IB_HWE_IB_UC_MEMORYPARITYERR; + } + qib_write_kreg(dd, kr_hwerrmask, + dd->cspec->hwerrmask); + } + +bail: + return ret; +} + +static void qib_sd_trimdone_monitor(struct qib_devdata *dd, + const char *where) +{ + int ret, chn, baduns; + u64 val; + + if (!where) + where = "?"; + + /* give time for reset to settle out in EPB */ + udelay(2); + + ret = qib_resync_ibepb(dd); + if (ret < 0) + qib_dev_err(dd, "not able to re-sync IB EPB (%s)\n", where); + + /* Do "sacrificial read" to get EPB in sane state after reset */ + ret = qib_sd7220_reg_mod(dd, IB_7220_SERDES, IB_CTRL2(0), 0, 0); + if (ret < 0) + qib_dev_err(dd, "Failed TRIMDONE 1st read, (%s)\n", where); + + /* Check/show "summary" Trim-done bit in IBCStatus */ + val = qib_read_kreg64(dd, kr_ibcstatus); + if (!(val & (1ULL << 11))) + qib_dev_err(dd, "IBCS TRIMDONE clear (%s)\n", where); + /* + * Do "dummy read/mod/wr" to get EPB in sane state after reset + * The default value for MPREG6 is 0. + */ + udelay(2); + + ret = qib_sd7220_reg_mod(dd, IB_7220_SERDES, IB_MPREG6, 0x80, 0x80); + if (ret < 0) + qib_dev_err(dd, "Failed Dummy RMW, (%s)\n", where); + udelay(10); + + baduns = 0; + + for (chn = 3; chn >= 0; --chn) { + /* Read CTRL reg for each channel to check TRIMDONE */ + ret = qib_sd7220_reg_mod(dd, IB_7220_SERDES, + IB_CTRL2(chn), 0, 0); + if (ret < 0) + qib_dev_err(dd, + "Failed checking TRIMDONE, chn %d (%s)\n", + chn, where); + + if (!(ret & 0x10)) { + int probe; + + baduns |= (1 << chn); + qib_dev_err(dd, + "TRIMDONE cleared on chn %d (%02X). (%s)\n", + chn, ret, where); + probe = qib_sd7220_reg_mod(dd, IB_7220_SERDES, + IB_PGUDP(0), 0, 0); + qib_dev_err(dd, "probe is %d (%02X)\n", + probe, probe); + probe = qib_sd7220_reg_mod(dd, IB_7220_SERDES, + IB_CTRL2(chn), 0, 0); + qib_dev_err(dd, "re-read: %d (%02X)\n", + probe, probe); + ret = qib_sd7220_reg_mod(dd, IB_7220_SERDES, + IB_CTRL2(chn), 0x10, 0x10); + if (ret < 0) + qib_dev_err(dd, + "Err on TRIMDONE rewrite1\n"); + } + } + for (chn = 3; chn >= 0; --chn) { + /* Read CTRL reg for each channel to check TRIMDONE */ + if (baduns & (1 << chn)) { + qib_dev_err(dd, + "Resetting TRIMDONE on chn %d (%s)\n", + chn, where); + ret = qib_sd7220_reg_mod(dd, IB_7220_SERDES, + IB_CTRL2(chn), 0x10, 0x10); + if (ret < 0) + qib_dev_err(dd, + "Failed re-setting TRIMDONE, chn %d (%s)\n", + chn, where); + } + } +} + +/* + * Below is portion of IBA7220-specific bringup_serdes() that actually + * deals with registers and memory within the SerDes itself. + * Post IB uC code version 1.32.17, was_reset being 1 is not really + * informative, so we double-check. + */ +int qib_sd7220_init(struct qib_devdata *dd) +{ + const struct firmware *fw; + int ret = 1; /* default to failure */ + int first_reset, was_reset; + + /* SERDES MPU reset recorded in D0 */ + was_reset = (qib_read_kreg64(dd, kr_ibserdesctrl) & 1); + if (!was_reset) { + /* entered with reset not asserted, we need to do it */ + qib_ibsd_reset(dd, 1); + qib_sd_trimdone_monitor(dd, "Driver-reload"); + } + + ret = request_firmware(&fw, SD7220_FW_NAME, &dd->pcidev->dev); + if (ret) { + qib_dev_err(dd, "Failed to load IB SERDES image\n"); + goto done; + } + + /* Substitute our deduced value for was_reset */ + ret = qib_ibsd_ucode_loaded(dd->pport, fw); + if (ret < 0) + goto bail; + + first_reset = !ret; /* First reset if IBSD uCode not yet loaded */ + /* + * Alter some regs per vendor latest doc, reset-defaults + * are not right for IB. + */ + ret = qib_sd_early(dd); + if (ret < 0) { + qib_dev_err(dd, "Failed to set IB SERDES early defaults\n"); + goto bail; + } + /* + * Set DAC manual trim IB. + * We only do this once after chip has been reset (usually + * same as once per system boot). + */ + if (first_reset) { + ret = qib_sd_dactrim(dd); + if (ret < 0) { + qib_dev_err(dd, "Failed IB SERDES DAC trim\n"); + goto bail; + } + } + /* + * Set various registers (DDS and RXEQ) that will be + * controlled by IBC (in 1.2 mode) to reasonable preset values + * Calling the "internal" version avoids the "check for needed" + * and "trimdone monitor" that might be counter-productive. + */ + ret = qib_internal_presets(dd); + if (ret < 0) { + qib_dev_err(dd, "Failed to set IB SERDES presets\n"); + goto bail; + } + ret = qib_sd_trimself(dd, 0x80); + if (ret < 0) { + qib_dev_err(dd, "Failed to set IB SERDES TRIMSELF\n"); + goto bail; + } + + /* Load image, then try to verify */ + ret = 0; /* Assume success */ + if (first_reset) { + int vfy; + int trim_done; + + ret = qib_sd7220_ib_load(dd, fw); + if (ret < 0) { + qib_dev_err(dd, "Failed to load IB SERDES image\n"); + goto bail; + } else { + /* Loaded image, try to verify */ + vfy = qib_sd7220_ib_vfy(dd, fw); + if (vfy != ret) { + qib_dev_err(dd, "SERDES PRAM VFY failed\n"); + goto bail; + } /* end if verified */ + } /* end if loaded */ + + /* + * Loaded and verified. Almost good... + * hold "success" in ret + */ + ret = 0; + /* + * Prev steps all worked, continue bringup + * De-assert RESET to uC, only in first reset, to allow + * trimming. + * + * Since our default setup sets START_EQ1 to + * PRESET, we need to clear that for this very first run. + */ + ret = ibsd_mod_allchnls(dd, START_EQ1(0), 0, 0x38); + if (ret < 0) { + qib_dev_err(dd, "Failed clearing START_EQ1\n"); + goto bail; + } + + qib_ibsd_reset(dd, 0); + /* + * If this is not the first reset, trimdone should be set + * already. We may need to check about this. + */ + trim_done = qib_sd_trimdone_poll(dd); + /* + * Whether or not trimdone succeeded, we need to put the + * uC back into reset to avoid a possible fight with the + * IBC state-machine. + */ + qib_ibsd_reset(dd, 1); + + if (!trim_done) { + qib_dev_err(dd, "No TRIMDONE seen\n"); + goto bail; + } + /* + * DEBUG: check each time we reset if trimdone bits have + * gotten cleared, and re-set them. + */ + qib_sd_trimdone_monitor(dd, "First-reset"); + /* Remember so we do not re-do the load, dactrim, etc. */ + dd->cspec->serdes_first_init_done = 1; + } + /* + * setup for channel training and load values for + * RxEq and DDS in tables used by IBC in IB1.2 mode + */ + ret = 0; + if (qib_sd_setvals(dd) >= 0) + goto done; +bail: + ret = 1; +done: + /* start relock timer regardless, but start at 1 second */ + set_7220_relock_poll(dd, -1); + + release_firmware(fw); + return ret; +} + +#define EPB_ACC_REQ 1 +#define EPB_ACC_GNT 0x100 +#define EPB_DATA_MASK 0xFF +#define EPB_RD (1ULL << 24) +#define EPB_TRANS_RDY (1ULL << 31) +#define EPB_TRANS_ERR (1ULL << 30) +#define EPB_TRANS_TRIES 5 + +/* + * query, claim, release ownership of the EPB (External Parallel Bus) + * for a specified SERDES. + * the "claim" parameter is >0 to claim, <0 to release, 0 to query. + * Returns <0 for errors, >0 if we had ownership, else 0. + */ +static int epb_access(struct qib_devdata *dd, int sdnum, int claim) +{ + u16 acc; + u64 accval; + int owned = 0; + u64 oct_sel = 0; + + switch (sdnum) { + case IB_7220_SERDES: + /* + * The IB SERDES "ownership" is fairly simple. A single each + * request/grant. + */ + acc = kr_ibsd_epb_access_ctrl; + break; + + case PCIE_SERDES0: + case PCIE_SERDES1: + /* PCIe SERDES has two "octants", need to select which */ + acc = kr_pciesd_epb_access_ctrl; + oct_sel = (2 << (sdnum - PCIE_SERDES0)); + break; + + default: + return 0; + } + + /* Make sure any outstanding transaction was seen */ + qib_read_kreg32(dd, kr_scratch); + udelay(15); + + accval = qib_read_kreg32(dd, acc); + + owned = !!(accval & EPB_ACC_GNT); + if (claim < 0) { + /* Need to release */ + u64 pollval; + /* + * The only writeable bits are the request and CS. + * Both should be clear + */ + u64 newval = 0; + qib_write_kreg(dd, acc, newval); + /* First read after write is not trustworthy */ + pollval = qib_read_kreg32(dd, acc); + udelay(5); + pollval = qib_read_kreg32(dd, acc); + if (pollval & EPB_ACC_GNT) + owned = -1; + } else if (claim > 0) { + /* Need to claim */ + u64 pollval; + u64 newval = EPB_ACC_REQ | oct_sel; + qib_write_kreg(dd, acc, newval); + /* First read after write is not trustworthy */ + pollval = qib_read_kreg32(dd, acc); + udelay(5); + pollval = qib_read_kreg32(dd, acc); + if (!(pollval & EPB_ACC_GNT)) + owned = -1; + } + return owned; +} + +/* + * Lemma to deal with race condition of write..read to epb regs + */ +static int epb_trans(struct qib_devdata *dd, u16 reg, u64 i_val, u64 *o_vp) +{ + int tries; + u64 transval; + + qib_write_kreg(dd, reg, i_val); + /* Throw away first read, as RDY bit may be stale */ + transval = qib_read_kreg64(dd, reg); + + for (tries = EPB_TRANS_TRIES; tries; --tries) { + transval = qib_read_kreg32(dd, reg); + if (transval & EPB_TRANS_RDY) + break; + udelay(5); + } + if (transval & EPB_TRANS_ERR) + return -1; + if (tries > 0 && o_vp) + *o_vp = transval; + return tries; +} + +/** + * qib_sd7220_reg_mod - modify SERDES register + * @dd: the qlogic_ib device + * @sdnum: which SERDES to access + * @loc: location - channel, element, register, as packed by EPB_LOC() macro. + * @wd: Write Data - value to set in register + * @mask: ones where data should be spliced into reg. + * + * Basic register read/modify/write, with un-needed acesses elided. That is, + * a mask of zero will prevent write, while a mask of 0xFF will prevent read. + * returns current (presumed, if a write was done) contents of selected + * register, or <0 if errors. + */ +static int qib_sd7220_reg_mod(struct qib_devdata *dd, int sdnum, u32 loc, + u32 wd, u32 mask) +{ + u16 trans; + u64 transval; + int owned; + int tries, ret; + unsigned long flags; + + switch (sdnum) { + case IB_7220_SERDES: + trans = kr_ibsd_epb_transaction_reg; + break; + + case PCIE_SERDES0: + case PCIE_SERDES1: + trans = kr_pciesd_epb_transaction_reg; + break; + + default: + return -1; + } + + /* + * All access is locked in software (vs other host threads) and + * hardware (vs uC access). + */ + spin_lock_irqsave(&dd->cspec->sdepb_lock, flags); + + owned = epb_access(dd, sdnum, 1); + if (owned < 0) { + spin_unlock_irqrestore(&dd->cspec->sdepb_lock, flags); + return -1; + } + ret = 0; + for (tries = EPB_TRANS_TRIES; tries; --tries) { + transval = qib_read_kreg32(dd, trans); + if (transval & EPB_TRANS_RDY) + break; + udelay(5); + } + + if (tries > 0) { + tries = 1; /* to make read-skip work */ + if (mask != 0xFF) { + /* + * Not a pure write, so need to read. + * loc encodes chip-select as well as address + */ + transval = loc | EPB_RD; + tries = epb_trans(dd, trans, transval, &transval); + } + if (tries > 0 && mask != 0) { + /* + * Not a pure read, so need to write. + */ + wd = (wd & mask) | (transval & ~mask); + transval = loc | (wd & EPB_DATA_MASK); + tries = epb_trans(dd, trans, transval, &transval); + } + } + /* else, failed to see ready, what error-handling? */ + + /* + * Release bus. Failure is an error. + */ + if (epb_access(dd, sdnum, -1) < 0) + ret = -1; + else + ret = transval & EPB_DATA_MASK; + + spin_unlock_irqrestore(&dd->cspec->sdepb_lock, flags); + if (tries <= 0) + ret = -1; + return ret; +} + +#define EPB_ROM_R (2) +#define EPB_ROM_W (1) +/* + * Below, all uC-related, use appropriate UC_CS, depending + * on which SerDes is used. + */ +#define EPB_UC_CTL EPB_LOC(6, 0, 0) +#define EPB_MADDRL EPB_LOC(6, 0, 2) +#define EPB_MADDRH EPB_LOC(6, 0, 3) +#define EPB_ROMDATA EPB_LOC(6, 0, 4) +#define EPB_RAMDATA EPB_LOC(6, 0, 5) + +/* Transfer date to/from uC Program RAM of IB or PCIe SerDes */ +static int qib_sd7220_ram_xfer(struct qib_devdata *dd, int sdnum, u32 loc, + u8 *buf, int cnt, int rd_notwr) +{ + u16 trans; + u64 transval; + u64 csbit; + int owned; + int tries; + int sofar; + int addr; + int ret; + unsigned long flags; + const char *op; + + /* Pick appropriate transaction reg and "Chip select" for this serdes */ + switch (sdnum) { + case IB_7220_SERDES: + csbit = 1ULL << EPB_IB_UC_CS_SHF; + trans = kr_ibsd_epb_transaction_reg; + break; + + case PCIE_SERDES0: + case PCIE_SERDES1: + /* PCIe SERDES has uC "chip select" in different bit, too */ + csbit = 1ULL << EPB_PCIE_UC_CS_SHF; + trans = kr_pciesd_epb_transaction_reg; + break; + + default: + return -1; + } + + op = rd_notwr ? "Rd" : "Wr"; + spin_lock_irqsave(&dd->cspec->sdepb_lock, flags); + + owned = epb_access(dd, sdnum, 1); + if (owned < 0) { + spin_unlock_irqrestore(&dd->cspec->sdepb_lock, flags); + return -1; + } + + /* + * In future code, we may need to distinguish several address ranges, + * and select various memories based on this. For now, just trim + * "loc" (location including address and memory select) to + * "addr" (address within memory). we will only support PRAM + * The memory is 8KB. + */ + addr = loc & 0x1FFF; + for (tries = EPB_TRANS_TRIES; tries; --tries) { + transval = qib_read_kreg32(dd, trans); + if (transval & EPB_TRANS_RDY) + break; + udelay(5); + } + + sofar = 0; + if (tries > 0) { + /* + * Every "memory" access is doubly-indirect. + * We set two bytes of address, then read/write + * one or mores bytes of data. + */ + + /* First, we set control to "Read" or "Write" */ + transval = csbit | EPB_UC_CTL | + (rd_notwr ? EPB_ROM_R : EPB_ROM_W); + tries = epb_trans(dd, trans, transval, &transval); + while (tries > 0 && sofar < cnt) { + if (!sofar) { + /* Only set address at start of chunk */ + int addrbyte = (addr + sofar) >> 8; + transval = csbit | EPB_MADDRH | addrbyte; + tries = epb_trans(dd, trans, transval, + &transval); + if (tries <= 0) + break; + addrbyte = (addr + sofar) & 0xFF; + transval = csbit | EPB_MADDRL | addrbyte; + tries = epb_trans(dd, trans, transval, + &transval); + if (tries <= 0) + break; + } + + if (rd_notwr) + transval = csbit | EPB_ROMDATA | EPB_RD; + else + transval = csbit | EPB_ROMDATA | buf[sofar]; + tries = epb_trans(dd, trans, transval, &transval); + if (tries <= 0) + break; + if (rd_notwr) + buf[sofar] = transval & EPB_DATA_MASK; + ++sofar; + } + /* Finally, clear control-bit for Read or Write */ + transval = csbit | EPB_UC_CTL; + tries = epb_trans(dd, trans, transval, &transval); + } + + ret = sofar; + /* Release bus. Failure is an error */ + if (epb_access(dd, sdnum, -1) < 0) + ret = -1; + + spin_unlock_irqrestore(&dd->cspec->sdepb_lock, flags); + if (tries <= 0) + ret = -1; + return ret; +} + +#define PROG_CHUNK 64 + +static int qib_sd7220_prog_ld(struct qib_devdata *dd, int sdnum, + const u8 *img, int len, int offset) +{ + int cnt, sofar, req; + + sofar = 0; + while (sofar < len) { + req = len - sofar; + if (req > PROG_CHUNK) + req = PROG_CHUNK; + cnt = qib_sd7220_ram_xfer(dd, sdnum, offset + sofar, + (u8 *)img + sofar, req, 0); + if (cnt < req) { + sofar = -1; + break; + } + sofar += req; + } + return sofar; +} + +#define VFY_CHUNK 64 +#define SD_PRAM_ERROR_LIMIT 42 + +static int qib_sd7220_prog_vfy(struct qib_devdata *dd, int sdnum, + const u8 *img, int len, int offset) +{ + int cnt, sofar, req, idx, errors; + unsigned char readback[VFY_CHUNK]; + + errors = 0; + sofar = 0; + while (sofar < len) { + req = len - sofar; + if (req > VFY_CHUNK) + req = VFY_CHUNK; + cnt = qib_sd7220_ram_xfer(dd, sdnum, sofar + offset, + readback, req, 1); + if (cnt < req) { + /* failed in read itself */ + sofar = -1; + break; + } + for (idx = 0; idx < cnt; ++idx) { + if (readback[idx] != img[idx+sofar]) + ++errors; + } + sofar += cnt; + } + return errors ? -errors : sofar; +} + +static int +qib_sd7220_ib_load(struct qib_devdata *dd, const struct firmware *fw) +{ + return qib_sd7220_prog_ld(dd, IB_7220_SERDES, fw->data, fw->size, 0); +} + +static int +qib_sd7220_ib_vfy(struct qib_devdata *dd, const struct firmware *fw) +{ + return qib_sd7220_prog_vfy(dd, IB_7220_SERDES, fw->data, fw->size, 0); +} + +/* + * IRQ not set up at this point in init, so we poll. + */ +#define IB_SERDES_TRIM_DONE (1ULL << 11) +#define TRIM_TMO (30) + +static int qib_sd_trimdone_poll(struct qib_devdata *dd) +{ + int trim_tmo, ret; + uint64_t val; + + /* + * Default to failure, so IBC will not start + * without IB_SERDES_TRIM_DONE. + */ + ret = 0; + for (trim_tmo = 0; trim_tmo < TRIM_TMO; ++trim_tmo) { + val = qib_read_kreg64(dd, kr_ibcstatus); + if (val & IB_SERDES_TRIM_DONE) { + ret = 1; + break; + } + msleep(10); + } + if (trim_tmo >= TRIM_TMO) { + qib_dev_err(dd, "No TRIMDONE in %d tries\n", trim_tmo); + ret = 0; + } + return ret; +} + +#define TX_FAST_ELT (9) + +/* + * Set the "negotiation" values for SERDES. These are used by the IB1.2 + * link negotiation. Macros below are attempt to keep the values a + * little more human-editable. + * First, values related to Drive De-emphasis Settings. + */ + +#define NUM_DDS_REGS 6 +#define DDS_REG_MAP 0x76A910 /* LSB-first list of regs (in elt 9) to mod */ + +#define DDS_VAL(amp_d, main_d, ipst_d, ipre_d, amp_s, main_s, ipst_s, ipre_s) \ + { { ((amp_d & 0x1F) << 1) | 1, ((amp_s & 0x1F) << 1) | 1, \ + (main_d << 3) | 4 | (ipre_d >> 2), \ + (main_s << 3) | 4 | (ipre_s >> 2), \ + ((ipst_d & 0xF) << 1) | ((ipre_d & 3) << 6) | 0x21, \ + ((ipst_s & 0xF) << 1) | ((ipre_s & 3) << 6) | 0x21 } } + +static struct dds_init { + uint8_t reg_vals[NUM_DDS_REGS]; +} dds_init_vals[] = { + /* DDR(FDR) SDR(HDR) */ + /* Vendor recommends below for 3m cable */ +#define DDS_3M 0 + DDS_VAL(31, 19, 12, 0, 29, 22, 9, 0), + DDS_VAL(31, 12, 15, 4, 31, 15, 15, 1), + DDS_VAL(31, 13, 15, 3, 31, 16, 15, 0), + DDS_VAL(31, 14, 15, 2, 31, 17, 14, 0), + DDS_VAL(31, 15, 15, 1, 31, 18, 13, 0), + DDS_VAL(31, 16, 15, 0, 31, 19, 12, 0), + DDS_VAL(31, 17, 14, 0, 31, 20, 11, 0), + DDS_VAL(31, 18, 13, 0, 30, 21, 10, 0), + DDS_VAL(31, 20, 11, 0, 28, 23, 8, 0), + DDS_VAL(31, 21, 10, 0, 27, 24, 7, 0), + DDS_VAL(31, 22, 9, 0, 26, 25, 6, 0), + DDS_VAL(30, 23, 8, 0, 25, 26, 5, 0), + DDS_VAL(29, 24, 7, 0, 23, 27, 4, 0), + /* Vendor recommends below for 1m cable */ +#define DDS_1M 13 + DDS_VAL(28, 25, 6, 0, 21, 28, 3, 0), + DDS_VAL(27, 26, 5, 0, 19, 29, 2, 0), + DDS_VAL(25, 27, 4, 0, 17, 30, 1, 0) +}; + +/* + * Now the RXEQ section of the table. + */ +/* Hardware packs an element number and register address thus: */ +#define RXEQ_INIT_RDESC(elt, addr) (((elt) & 0xF) | ((addr) << 4)) +#define RXEQ_VAL(elt, adr, val0, val1, val2, val3) \ + {RXEQ_INIT_RDESC((elt), (adr)), {(val0), (val1), (val2), (val3)} } + +#define RXEQ_VAL_ALL(elt, adr, val) \ + {RXEQ_INIT_RDESC((elt), (adr)), {(val), (val), (val), (val)} } + +#define RXEQ_SDR_DFELTH 0 +#define RXEQ_SDR_TLTH 0 +#define RXEQ_SDR_G1CNT_Z1CNT 0x11 +#define RXEQ_SDR_ZCNT 23 + +static struct rxeq_init { + u16 rdesc; /* in form used in SerDesDDSRXEQ */ + u8 rdata[4]; +} rxeq_init_vals[] = { + /* Set Rcv Eq. to Preset node */ + RXEQ_VAL_ALL(7, 0x27, 0x10), + /* Set DFELTHFDR/HDR thresholds */ + RXEQ_VAL(7, 8, 0, 0, 0, 0), /* FDR, was 0, 1, 2, 3 */ + RXEQ_VAL(7, 0x21, 0, 0, 0, 0), /* HDR */ + /* Set TLTHFDR/HDR theshold */ + RXEQ_VAL(7, 9, 2, 2, 2, 2), /* FDR, was 0, 2, 4, 6 */ + RXEQ_VAL(7, 0x23, 2, 2, 2, 2), /* HDR, was 0, 1, 2, 3 */ + /* Set Preamp setting 2 (ZFR/ZCNT) */ + RXEQ_VAL(7, 0x1B, 12, 12, 12, 12), /* FDR, was 12, 16, 20, 24 */ + RXEQ_VAL(7, 0x1C, 12, 12, 12, 12), /* HDR, was 12, 16, 20, 24 */ + /* Set Preamp DC gain and Setting 1 (GFR/GHR) */ + RXEQ_VAL(7, 0x1E, 16, 16, 16, 16), /* FDR, was 16, 17, 18, 20 */ + RXEQ_VAL(7, 0x1F, 16, 16, 16, 16), /* HDR, was 16, 17, 18, 20 */ + /* Toggle RELOCK (in VCDL_CTRL0) to lock to data */ + RXEQ_VAL_ALL(6, 6, 0x20), /* Set D5 High */ + RXEQ_VAL_ALL(6, 6, 0), /* Set D5 Low */ +}; + +/* There are 17 values from vendor, but IBC only accesses the first 16 */ +#define DDS_ROWS (16) +#define RXEQ_ROWS ARRAY_SIZE(rxeq_init_vals) + +static int qib_sd_setvals(struct qib_devdata *dd) +{ + int idx, midx; + int min_idx; /* Minimum index for this portion of table */ + uint32_t dds_reg_map; + u64 __iomem *taddr, *iaddr; + uint64_t data; + uint64_t sdctl; + + taddr = dd->kregbase + kr_serdes_maptable; + iaddr = dd->kregbase + kr_serdes_ddsrxeq0; + + /* + * Init the DDS section of the table. + * Each "row" of the table provokes NUM_DDS_REG writes, to the + * registers indicated in DDS_REG_MAP. + */ + sdctl = qib_read_kreg64(dd, kr_ibserdesctrl); + sdctl = (sdctl & ~(0x1f << 8)) | (NUM_DDS_REGS << 8); + sdctl = (sdctl & ~(0x1f << 13)) | (RXEQ_ROWS << 13); + qib_write_kreg(dd, kr_ibserdesctrl, sdctl); + + /* + * Iterate down table within loop for each register to store. + */ + dds_reg_map = DDS_REG_MAP; + for (idx = 0; idx < NUM_DDS_REGS; ++idx) { + data = ((dds_reg_map & 0xF) << 4) | TX_FAST_ELT; + writeq(data, iaddr + idx); + mmiowb(); + qib_read_kreg32(dd, kr_scratch); + dds_reg_map >>= 4; + for (midx = 0; midx < DDS_ROWS; ++midx) { + u64 __iomem *daddr = taddr + ((midx << 4) + idx); + data = dds_init_vals[midx].reg_vals[idx]; + writeq(data, daddr); + mmiowb(); + qib_read_kreg32(dd, kr_scratch); + } /* End inner for (vals for this reg, each row) */ + } /* end outer for (regs to be stored) */ + + /* + * Init the RXEQ section of the table. + * This runs in a different order, as the pattern of + * register references is more complex, but there are only + * four "data" values per register. + */ + min_idx = idx; /* RXEQ indices pick up where DDS left off */ + taddr += 0x100; /* RXEQ data is in second half of table */ + /* Iterate through RXEQ register addresses */ + for (idx = 0; idx < RXEQ_ROWS; ++idx) { + int didx; /* "destination" */ + int vidx; + + /* didx is offset by min_idx to address RXEQ range of regs */ + didx = idx + min_idx; + /* Store the next RXEQ register address */ + writeq(rxeq_init_vals[idx].rdesc, iaddr + didx); + mmiowb(); + qib_read_kreg32(dd, kr_scratch); + /* Iterate through RXEQ values */ + for (vidx = 0; vidx < 4; vidx++) { + data = rxeq_init_vals[idx].rdata[vidx]; + writeq(data, taddr + (vidx << 6) + idx); + mmiowb(); + qib_read_kreg32(dd, kr_scratch); + } + } /* end outer for (Reg-writes for RXEQ) */ + return 0; +} + +#define CMUCTRL5 EPB_LOC(7, 0, 0x15) +#define RXHSCTRL0(chan) EPB_LOC(chan, 6, 0) +#define VCDL_DAC2(chan) EPB_LOC(chan, 6, 5) +#define VCDL_CTRL0(chan) EPB_LOC(chan, 6, 6) +#define VCDL_CTRL2(chan) EPB_LOC(chan, 6, 8) +#define START_EQ2(chan) EPB_LOC(chan, 7, 0x28) + +/* + * Repeat a "store" across all channels of the IB SerDes. + * Although nominally it inherits the "read value" of the last + * channel it modified, the only really useful return is <0 for + * failure, >= 0 for success. The parameter 'loc' is assumed to + * be the location in some channel of the register to be modified + * The caller can specify use of the "gang write" option of EPB, + * in which case we use the specified channel data for any fields + * not explicitely written. + */ +static int ibsd_mod_allchnls(struct qib_devdata *dd, int loc, int val, + int mask) +{ + int ret = -1; + int chnl; + + if (loc & EPB_GLOBAL_WR) { + /* + * Our caller has assured us that we can set all four + * channels at once. Trust that. If mask is not 0xFF, + * we will read the _specified_ channel for our starting + * value. + */ + loc |= (1U << EPB_IB_QUAD0_CS_SHF); + chnl = (loc >> (4 + EPB_ADDR_SHF)) & 7; + if (mask != 0xFF) { + ret = qib_sd7220_reg_mod(dd, IB_7220_SERDES, + loc & ~EPB_GLOBAL_WR, 0, 0); + if (ret < 0) { + int sloc = loc >> EPB_ADDR_SHF; + + qib_dev_err(dd, + "pre-read failed: elt %d, addr 0x%X, chnl %d\n", + (sloc & 0xF), + (sloc >> 9) & 0x3f, chnl); + return ret; + } + val = (ret & ~mask) | (val & mask); + } + loc &= ~(7 << (4+EPB_ADDR_SHF)); + ret = qib_sd7220_reg_mod(dd, IB_7220_SERDES, loc, val, 0xFF); + if (ret < 0) { + int sloc = loc >> EPB_ADDR_SHF; + + qib_dev_err(dd, + "Global WR failed: elt %d, addr 0x%X, val %02X\n", + (sloc & 0xF), (sloc >> 9) & 0x3f, val); + } + return ret; + } + /* Clear "channel" and set CS so we can simply iterate */ + loc &= ~(7 << (4+EPB_ADDR_SHF)); + loc |= (1U << EPB_IB_QUAD0_CS_SHF); + for (chnl = 0; chnl < 4; ++chnl) { + int cloc = loc | (chnl << (4+EPB_ADDR_SHF)); + + ret = qib_sd7220_reg_mod(dd, IB_7220_SERDES, cloc, val, mask); + if (ret < 0) { + int sloc = loc >> EPB_ADDR_SHF; + + qib_dev_err(dd, + "Write failed: elt %d, addr 0x%X, chnl %d, val 0x%02X, mask 0x%02X\n", + (sloc & 0xF), (sloc >> 9) & 0x3f, chnl, + val & 0xFF, mask & 0xFF); + break; + } + } + return ret; +} + +/* + * Set the Tx values normally modified by IBC in IB1.2 mode to default + * values, as gotten from first row of init table. + */ +static int set_dds_vals(struct qib_devdata *dd, struct dds_init *ddi) +{ + int ret; + int idx, reg, data; + uint32_t regmap; + + regmap = DDS_REG_MAP; + for (idx = 0; idx < NUM_DDS_REGS; ++idx) { + reg = (regmap & 0xF); + regmap >>= 4; + data = ddi->reg_vals[idx]; + /* Vendor says RMW not needed for these regs, use 0xFF mask */ + ret = ibsd_mod_allchnls(dd, EPB_LOC(0, 9, reg), data, 0xFF); + if (ret < 0) + break; + } + return ret; +} + +/* + * Set the Rx values normally modified by IBC in IB1.2 mode to default + * values, as gotten from selected column of init table. + */ +static int set_rxeq_vals(struct qib_devdata *dd, int vsel) +{ + int ret; + int ridx; + int cnt = ARRAY_SIZE(rxeq_init_vals); + + for (ridx = 0; ridx < cnt; ++ridx) { + int elt, reg, val, loc; + + elt = rxeq_init_vals[ridx].rdesc & 0xF; + reg = rxeq_init_vals[ridx].rdesc >> 4; + loc = EPB_LOC(0, elt, reg); + val = rxeq_init_vals[ridx].rdata[vsel]; + /* mask of 0xFF, because hardware does full-byte store. */ + ret = ibsd_mod_allchnls(dd, loc, val, 0xFF); + if (ret < 0) + break; + } + return ret; +} + +/* + * Set the default values (row 0) for DDR Driver Demphasis. + * we do this initially and whenever we turn off IB-1.2 + * + * The "default" values for Rx equalization are also stored to + * SerDes registers. Formerly (and still default), we used set 2. + * For experimenting with cables and link-partners, we allow changing + * that via a module parameter. + */ +static unsigned qib_rxeq_set = 2; +module_param_named(rxeq_default_set, qib_rxeq_set, uint, + S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(rxeq_default_set, + "Which set [0..3] of Rx Equalization values is default"); + +static int qib_internal_presets(struct qib_devdata *dd) +{ + int ret = 0; + + ret = set_dds_vals(dd, dds_init_vals + DDS_3M); + + if (ret < 0) + qib_dev_err(dd, "Failed to set default DDS values\n"); + ret = set_rxeq_vals(dd, qib_rxeq_set & 3); + if (ret < 0) + qib_dev_err(dd, "Failed to set default RXEQ values\n"); + return ret; +} + +int qib_sd7220_presets(struct qib_devdata *dd) +{ + int ret = 0; + + if (!dd->cspec->presets_needed) + return ret; + dd->cspec->presets_needed = 0; + /* Assert uC reset, so we don't clash with it. */ + qib_ibsd_reset(dd, 1); + udelay(2); + qib_sd_trimdone_monitor(dd, "link-down"); + + ret = qib_internal_presets(dd); + return ret; +} + +static int qib_sd_trimself(struct qib_devdata *dd, int val) +{ + int loc = CMUCTRL5 | (1U << EPB_IB_QUAD0_CS_SHF); + + return qib_sd7220_reg_mod(dd, IB_7220_SERDES, loc, val, 0xFF); +} + +static int qib_sd_early(struct qib_devdata *dd) +{ + int ret; + + ret = ibsd_mod_allchnls(dd, RXHSCTRL0(0) | EPB_GLOBAL_WR, 0xD4, 0xFF); + if (ret < 0) + goto bail; + ret = ibsd_mod_allchnls(dd, START_EQ1(0) | EPB_GLOBAL_WR, 0x10, 0xFF); + if (ret < 0) + goto bail; + ret = ibsd_mod_allchnls(dd, START_EQ2(0) | EPB_GLOBAL_WR, 0x30, 0xFF); +bail: + return ret; +} + +#define BACTRL(chnl) EPB_LOC(chnl, 6, 0x0E) +#define LDOUTCTRL1(chnl) EPB_LOC(chnl, 7, 6) +#define RXHSSTATUS(chnl) EPB_LOC(chnl, 6, 0xF) + +static int qib_sd_dactrim(struct qib_devdata *dd) +{ + int ret; + + ret = ibsd_mod_allchnls(dd, VCDL_DAC2(0) | EPB_GLOBAL_WR, 0x2D, 0xFF); + if (ret < 0) + goto bail; + + /* more fine-tuning of what will be default */ + ret = ibsd_mod_allchnls(dd, VCDL_CTRL2(0), 3, 0xF); + if (ret < 0) + goto bail; + + ret = ibsd_mod_allchnls(dd, BACTRL(0) | EPB_GLOBAL_WR, 0x40, 0xFF); + if (ret < 0) + goto bail; + + ret = ibsd_mod_allchnls(dd, LDOUTCTRL1(0) | EPB_GLOBAL_WR, 0x04, 0xFF); + if (ret < 0) + goto bail; + + ret = ibsd_mod_allchnls(dd, RXHSSTATUS(0) | EPB_GLOBAL_WR, 0x04, 0xFF); + if (ret < 0) + goto bail; + + /* + * Delay for max possible number of steps, with slop. + * Each step is about 4usec. + */ + udelay(415); + + ret = ibsd_mod_allchnls(dd, LDOUTCTRL1(0) | EPB_GLOBAL_WR, 0x00, 0xFF); + +bail: + return ret; +} + +#define RELOCK_FIRST_MS 3 +#define RXLSPPM(chan) EPB_LOC(chan, 0, 2) +void toggle_7220_rclkrls(struct qib_devdata *dd) +{ + int loc = RXLSPPM(0) | EPB_GLOBAL_WR; + int ret; + + ret = ibsd_mod_allchnls(dd, loc, 0, 0x80); + if (ret < 0) + qib_dev_err(dd, "RCLKRLS failed to clear D7\n"); + else { + udelay(1); + ibsd_mod_allchnls(dd, loc, 0x80, 0x80); + } + /* And again for good measure */ + udelay(1); + ret = ibsd_mod_allchnls(dd, loc, 0, 0x80); + if (ret < 0) + qib_dev_err(dd, "RCLKRLS failed to clear D7\n"); + else { + udelay(1); + ibsd_mod_allchnls(dd, loc, 0x80, 0x80); + } + /* Now reset xgxs and IBC to complete the recovery */ + dd->f_xgxs_reset(dd->pport); +} + +/* + * Shut down the timer that polls for relock occasions, if needed + * this is "hooked" from qib_7220_quiet_serdes(), which is called + * just before qib_shutdown_device() in qib_driver.c shuts down all + * the other timers + */ +void shutdown_7220_relock_poll(struct qib_devdata *dd) +{ + if (dd->cspec->relock_timer_active) + del_timer_sync(&dd->cspec->relock_timer); +} + +static unsigned qib_relock_by_timer = 1; +module_param_named(relock_by_timer, qib_relock_by_timer, uint, + S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(relock_by_timer, "Allow relock attempt if link not up"); + +static void qib_run_relock(unsigned long opaque) +{ + struct qib_devdata *dd = (struct qib_devdata *)opaque; + struct qib_pportdata *ppd = dd->pport; + struct qib_chip_specific *cs = dd->cspec; + int timeoff; + + /* + * Check link-training state for "stuck" state, when down. + * if found, try relock and schedule another try at + * exponentially growing delay, maxed at one second. + * if not stuck, our work is done. + */ + if ((dd->flags & QIB_INITTED) && !(ppd->lflags & + (QIBL_IB_AUTONEG_INPROG | QIBL_LINKINIT | QIBL_LINKARMED | + QIBL_LINKACTIVE))) { + if (qib_relock_by_timer) { + if (!(ppd->lflags & QIBL_IB_LINK_DISABLED)) + toggle_7220_rclkrls(dd); + } + /* re-set timer for next check */ + timeoff = cs->relock_interval << 1; + if (timeoff > HZ) + timeoff = HZ; + cs->relock_interval = timeoff; + } else + timeoff = HZ; + mod_timer(&cs->relock_timer, jiffies + timeoff); +} + +void set_7220_relock_poll(struct qib_devdata *dd, int ibup) +{ + struct qib_chip_specific *cs = dd->cspec; + + if (ibup) { + /* We are now up, relax timer to 1 second interval */ + if (cs->relock_timer_active) { + cs->relock_interval = HZ; + mod_timer(&cs->relock_timer, jiffies + HZ); + } + } else { + /* Transition to down, (re-)set timer to short interval. */ + unsigned int timeout; + + timeout = msecs_to_jiffies(RELOCK_FIRST_MS); + if (timeout == 0) + timeout = 1; + /* If timer has not yet been started, do so. */ + if (!cs->relock_timer_active) { + cs->relock_timer_active = 1; + init_timer(&cs->relock_timer); + cs->relock_timer.function = qib_run_relock; + cs->relock_timer.data = (unsigned long) dd; + cs->relock_interval = timeout; + cs->relock_timer.expires = jiffies + timeout; + add_timer(&cs->relock_timer); + } else { + cs->relock_interval = timeout; + mod_timer(&cs->relock_timer, jiffies + timeout); + } + } +} diff --git a/drivers/infiniband/hw/qib/qib_sdma.c b/drivers/infiniband/hw/qib/qib_sdma.c new file mode 100644 index 0000000..c6d6a54 --- /dev/null +++ b/drivers/infiniband/hw/qib/qib_sdma.c @@ -0,0 +1,1039 @@ +/* + * Copyright (c) 2012 Intel Corporation. All rights reserved. + * Copyright (c) 2007 - 2012 QLogic Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include + +#include "qib.h" +#include "qib_common.h" + +/* default pio off, sdma on */ +static ushort sdma_descq_cnt = 256; +module_param_named(sdma_descq_cnt, sdma_descq_cnt, ushort, S_IRUGO); +MODULE_PARM_DESC(sdma_descq_cnt, "Number of SDMA descq entries"); + +/* + * Bits defined in the send DMA descriptor. + */ +#define SDMA_DESC_LAST (1ULL << 11) +#define SDMA_DESC_FIRST (1ULL << 12) +#define SDMA_DESC_DMA_HEAD (1ULL << 13) +#define SDMA_DESC_USE_LARGE_BUF (1ULL << 14) +#define SDMA_DESC_INTR (1ULL << 15) +#define SDMA_DESC_COUNT_LSB 16 +#define SDMA_DESC_GEN_LSB 30 + +char *qib_sdma_state_names[] = { + [qib_sdma_state_s00_hw_down] = "s00_HwDown", + [qib_sdma_state_s10_hw_start_up_wait] = "s10_HwStartUpWait", + [qib_sdma_state_s20_idle] = "s20_Idle", + [qib_sdma_state_s30_sw_clean_up_wait] = "s30_SwCleanUpWait", + [qib_sdma_state_s40_hw_clean_up_wait] = "s40_HwCleanUpWait", + [qib_sdma_state_s50_hw_halt_wait] = "s50_HwHaltWait", + [qib_sdma_state_s99_running] = "s99_Running", +}; + +char *qib_sdma_event_names[] = { + [qib_sdma_event_e00_go_hw_down] = "e00_GoHwDown", + [qib_sdma_event_e10_go_hw_start] = "e10_GoHwStart", + [qib_sdma_event_e20_hw_started] = "e20_HwStarted", + [qib_sdma_event_e30_go_running] = "e30_GoRunning", + [qib_sdma_event_e40_sw_cleaned] = "e40_SwCleaned", + [qib_sdma_event_e50_hw_cleaned] = "e50_HwCleaned", + [qib_sdma_event_e60_hw_halted] = "e60_HwHalted", + [qib_sdma_event_e70_go_idle] = "e70_GoIdle", + [qib_sdma_event_e7220_err_halted] = "e7220_ErrHalted", + [qib_sdma_event_e7322_err_halted] = "e7322_ErrHalted", + [qib_sdma_event_e90_timer_tick] = "e90_TimerTick", +}; + +/* declare all statics here rather than keep sorting */ +static int alloc_sdma(struct qib_pportdata *); +static void sdma_complete(struct kref *); +static void sdma_finalput(struct qib_sdma_state *); +static void sdma_get(struct qib_sdma_state *); +static void sdma_put(struct qib_sdma_state *); +static void sdma_set_state(struct qib_pportdata *, enum qib_sdma_states); +static void sdma_start_sw_clean_up(struct qib_pportdata *); +static void sdma_sw_clean_up_task(unsigned long); +static void unmap_desc(struct qib_pportdata *, unsigned); + +static void sdma_get(struct qib_sdma_state *ss) +{ + kref_get(&ss->kref); +} + +static void sdma_complete(struct kref *kref) +{ + struct qib_sdma_state *ss = + container_of(kref, struct qib_sdma_state, kref); + + complete(&ss->comp); +} + +static void sdma_put(struct qib_sdma_state *ss) +{ + kref_put(&ss->kref, sdma_complete); +} + +static void sdma_finalput(struct qib_sdma_state *ss) +{ + sdma_put(ss); + wait_for_completion(&ss->comp); +} + +/* + * Complete all the sdma requests on the active list, in the correct + * order, and with appropriate processing. Called when cleaning up + * after sdma shutdown, and when new sdma requests are submitted for + * a link that is down. This matches what is done for requests + * that complete normally, it's just the full list. + * + * Must be called with sdma_lock held + */ +static void clear_sdma_activelist(struct qib_pportdata *ppd) +{ + struct qib_sdma_txreq *txp, *txp_next; + + list_for_each_entry_safe(txp, txp_next, &ppd->sdma_activelist, list) { + list_del_init(&txp->list); + if (txp->flags & QIB_SDMA_TXREQ_F_FREEDESC) { + unsigned idx; + + idx = txp->start_idx; + while (idx != txp->next_descq_idx) { + unmap_desc(ppd, idx); + if (++idx == ppd->sdma_descq_cnt) + idx = 0; + } + } + if (txp->callback) + (*txp->callback)(txp, QIB_SDMA_TXREQ_S_ABORTED); + } +} + +static void sdma_sw_clean_up_task(unsigned long opaque) +{ + struct qib_pportdata *ppd = (struct qib_pportdata *) opaque; + unsigned long flags; + + spin_lock_irqsave(&ppd->sdma_lock, flags); + + /* + * At this point, the following should always be true: + * - We are halted, so no more descriptors are getting retired. + * - We are not running, so no one is submitting new work. + * - Only we can send the e40_sw_cleaned, so we can't start + * running again until we say so. So, the active list and + * descq are ours to play with. + */ + + /* Process all retired requests. */ + qib_sdma_make_progress(ppd); + + clear_sdma_activelist(ppd); + + /* + * Resync count of added and removed. It is VERY important that + * sdma_descq_removed NEVER decrement - user_sdma depends on it. + */ + ppd->sdma_descq_removed = ppd->sdma_descq_added; + + /* + * Reset our notion of head and tail. + * Note that the HW registers will be reset when switching states + * due to calling __qib_sdma_process_event() below. + */ + ppd->sdma_descq_tail = 0; + ppd->sdma_descq_head = 0; + ppd->sdma_head_dma[0] = 0; + ppd->sdma_generation = 0; + + __qib_sdma_process_event(ppd, qib_sdma_event_e40_sw_cleaned); + + spin_unlock_irqrestore(&ppd->sdma_lock, flags); +} + +/* + * This is called when changing to state qib_sdma_state_s10_hw_start_up_wait + * as a result of send buffer errors or send DMA descriptor errors. + * We want to disarm the buffers in these cases. + */ +static void sdma_hw_start_up(struct qib_pportdata *ppd) +{ + struct qib_sdma_state *ss = &ppd->sdma_state; + unsigned bufno; + + for (bufno = ss->first_sendbuf; bufno < ss->last_sendbuf; ++bufno) + ppd->dd->f_sendctrl(ppd, QIB_SENDCTRL_DISARM_BUF(bufno)); + + ppd->dd->f_sdma_hw_start_up(ppd); +} + +static void sdma_sw_tear_down(struct qib_pportdata *ppd) +{ + struct qib_sdma_state *ss = &ppd->sdma_state; + + /* Releasing this reference means the state machine has stopped. */ + sdma_put(ss); +} + +static void sdma_start_sw_clean_up(struct qib_pportdata *ppd) +{ + tasklet_hi_schedule(&ppd->sdma_sw_clean_up_task); +} + +static void sdma_set_state(struct qib_pportdata *ppd, + enum qib_sdma_states next_state) +{ + struct qib_sdma_state *ss = &ppd->sdma_state; + struct sdma_set_state_action *action = ss->set_state_action; + unsigned op = 0; + + /* debugging bookkeeping */ + ss->previous_state = ss->current_state; + ss->previous_op = ss->current_op; + + ss->current_state = next_state; + + if (action[next_state].op_enable) + op |= QIB_SDMA_SENDCTRL_OP_ENABLE; + + if (action[next_state].op_intenable) + op |= QIB_SDMA_SENDCTRL_OP_INTENABLE; + + if (action[next_state].op_halt) + op |= QIB_SDMA_SENDCTRL_OP_HALT; + + if (action[next_state].op_drain) + op |= QIB_SDMA_SENDCTRL_OP_DRAIN; + + if (action[next_state].go_s99_running_tofalse) + ss->go_s99_running = 0; + + if (action[next_state].go_s99_running_totrue) + ss->go_s99_running = 1; + + ss->current_op = op; + + ppd->dd->f_sdma_sendctrl(ppd, ss->current_op); +} + +static void unmap_desc(struct qib_pportdata *ppd, unsigned head) +{ + __le64 *descqp = &ppd->sdma_descq[head].qw[0]; + u64 desc[2]; + dma_addr_t addr; + size_t len; + + desc[0] = le64_to_cpu(descqp[0]); + desc[1] = le64_to_cpu(descqp[1]); + + addr = (desc[1] << 32) | (desc[0] >> 32); + len = (desc[0] >> 14) & (0x7ffULL << 2); + dma_unmap_single(&ppd->dd->pcidev->dev, addr, len, DMA_TO_DEVICE); +} + +static int alloc_sdma(struct qib_pportdata *ppd) +{ + ppd->sdma_descq_cnt = sdma_descq_cnt; + if (!ppd->sdma_descq_cnt) + ppd->sdma_descq_cnt = 256; + + /* Allocate memory for SendDMA descriptor FIFO */ + ppd->sdma_descq = dma_alloc_coherent(&ppd->dd->pcidev->dev, + ppd->sdma_descq_cnt * sizeof(u64[2]), &ppd->sdma_descq_phys, + GFP_KERNEL); + + if (!ppd->sdma_descq) { + qib_dev_err(ppd->dd, + "failed to allocate SendDMA descriptor FIFO memory\n"); + goto bail; + } + + /* Allocate memory for DMA of head register to memory */ + ppd->sdma_head_dma = dma_alloc_coherent(&ppd->dd->pcidev->dev, + PAGE_SIZE, &ppd->sdma_head_phys, GFP_KERNEL); + if (!ppd->sdma_head_dma) { + qib_dev_err(ppd->dd, + "failed to allocate SendDMA head memory\n"); + goto cleanup_descq; + } + ppd->sdma_head_dma[0] = 0; + return 0; + +cleanup_descq: + dma_free_coherent(&ppd->dd->pcidev->dev, + ppd->sdma_descq_cnt * sizeof(u64[2]), (void *)ppd->sdma_descq, + ppd->sdma_descq_phys); + ppd->sdma_descq = NULL; + ppd->sdma_descq_phys = 0; +bail: + ppd->sdma_descq_cnt = 0; + return -ENOMEM; +} + +static void free_sdma(struct qib_pportdata *ppd) +{ + struct qib_devdata *dd = ppd->dd; + + if (ppd->sdma_head_dma) { + dma_free_coherent(&dd->pcidev->dev, PAGE_SIZE, + (void *)ppd->sdma_head_dma, + ppd->sdma_head_phys); + ppd->sdma_head_dma = NULL; + ppd->sdma_head_phys = 0; + } + + if (ppd->sdma_descq) { + dma_free_coherent(&dd->pcidev->dev, + ppd->sdma_descq_cnt * sizeof(u64[2]), + ppd->sdma_descq, ppd->sdma_descq_phys); + ppd->sdma_descq = NULL; + ppd->sdma_descq_phys = 0; + } +} + +static inline void make_sdma_desc(struct qib_pportdata *ppd, + u64 *sdmadesc, u64 addr, u64 dwlen, + u64 dwoffset) +{ + + WARN_ON(addr & 3); + /* SDmaPhyAddr[47:32] */ + sdmadesc[1] = addr >> 32; + /* SDmaPhyAddr[31:0] */ + sdmadesc[0] = (addr & 0xfffffffcULL) << 32; + /* SDmaGeneration[1:0] */ + sdmadesc[0] |= (ppd->sdma_generation & 3ULL) << + SDMA_DESC_GEN_LSB; + /* SDmaDwordCount[10:0] */ + sdmadesc[0] |= (dwlen & 0x7ffULL) << SDMA_DESC_COUNT_LSB; + /* SDmaBufOffset[12:2] */ + sdmadesc[0] |= dwoffset & 0x7ffULL; +} + +/* sdma_lock must be held */ +int qib_sdma_make_progress(struct qib_pportdata *ppd) +{ + struct list_head *lp = NULL; + struct qib_sdma_txreq *txp = NULL; + struct qib_devdata *dd = ppd->dd; + int progress = 0; + u16 hwhead; + u16 idx = 0; + + hwhead = dd->f_sdma_gethead(ppd); + + /* The reason for some of the complexity of this code is that + * not all descriptors have corresponding txps. So, we have to + * be able to skip over descs until we wander into the range of + * the next txp on the list. + */ + + if (!list_empty(&ppd->sdma_activelist)) { + lp = ppd->sdma_activelist.next; + txp = list_entry(lp, struct qib_sdma_txreq, list); + idx = txp->start_idx; + } + + while (ppd->sdma_descq_head != hwhead) { + /* if desc is part of this txp, unmap if needed */ + if (txp && (txp->flags & QIB_SDMA_TXREQ_F_FREEDESC) && + (idx == ppd->sdma_descq_head)) { + unmap_desc(ppd, ppd->sdma_descq_head); + if (++idx == ppd->sdma_descq_cnt) + idx = 0; + } + + /* increment dequed desc count */ + ppd->sdma_descq_removed++; + + /* advance head, wrap if needed */ + if (++ppd->sdma_descq_head == ppd->sdma_descq_cnt) + ppd->sdma_descq_head = 0; + + /* if now past this txp's descs, do the callback */ + if (txp && txp->next_descq_idx == ppd->sdma_descq_head) { + /* remove from active list */ + list_del_init(&txp->list); + if (txp->callback) + (*txp->callback)(txp, QIB_SDMA_TXREQ_S_OK); + /* see if there is another txp */ + if (list_empty(&ppd->sdma_activelist)) + txp = NULL; + else { + lp = ppd->sdma_activelist.next; + txp = list_entry(lp, struct qib_sdma_txreq, + list); + idx = txp->start_idx; + } + } + progress = 1; + } + if (progress) + qib_verbs_sdma_desc_avail(ppd, qib_sdma_descq_freecnt(ppd)); + return progress; +} + +/* + * This is called from interrupt context. + */ +void qib_sdma_intr(struct qib_pportdata *ppd) +{ + unsigned long flags; + + spin_lock_irqsave(&ppd->sdma_lock, flags); + + __qib_sdma_intr(ppd); + + spin_unlock_irqrestore(&ppd->sdma_lock, flags); +} + +void __qib_sdma_intr(struct qib_pportdata *ppd) +{ + if (__qib_sdma_running(ppd)) { + qib_sdma_make_progress(ppd); + if (!list_empty(&ppd->sdma_userpending)) + qib_user_sdma_send_desc(ppd, &ppd->sdma_userpending); + } +} + +int qib_setup_sdma(struct qib_pportdata *ppd) +{ + struct qib_devdata *dd = ppd->dd; + unsigned long flags; + int ret = 0; + + ret = alloc_sdma(ppd); + if (ret) + goto bail; + + /* set consistent sdma state */ + ppd->dd->f_sdma_init_early(ppd); + spin_lock_irqsave(&ppd->sdma_lock, flags); + sdma_set_state(ppd, qib_sdma_state_s00_hw_down); + spin_unlock_irqrestore(&ppd->sdma_lock, flags); + + /* set up reference counting */ + kref_init(&ppd->sdma_state.kref); + init_completion(&ppd->sdma_state.comp); + + ppd->sdma_generation = 0; + ppd->sdma_descq_head = 0; + ppd->sdma_descq_removed = 0; + ppd->sdma_descq_added = 0; + + ppd->sdma_intrequest = 0; + INIT_LIST_HEAD(&ppd->sdma_userpending); + + INIT_LIST_HEAD(&ppd->sdma_activelist); + + tasklet_init(&ppd->sdma_sw_clean_up_task, sdma_sw_clean_up_task, + (unsigned long)ppd); + + ret = dd->f_init_sdma_regs(ppd); + if (ret) + goto bail_alloc; + + qib_sdma_process_event(ppd, qib_sdma_event_e10_go_hw_start); + + return 0; + +bail_alloc: + qib_teardown_sdma(ppd); +bail: + return ret; +} + +void qib_teardown_sdma(struct qib_pportdata *ppd) +{ + qib_sdma_process_event(ppd, qib_sdma_event_e00_go_hw_down); + + /* + * This waits for the state machine to exit so it is not + * necessary to kill the sdma_sw_clean_up_task to make sure + * it is not running. + */ + sdma_finalput(&ppd->sdma_state); + + free_sdma(ppd); +} + +int qib_sdma_running(struct qib_pportdata *ppd) +{ + unsigned long flags; + int ret; + + spin_lock_irqsave(&ppd->sdma_lock, flags); + ret = __qib_sdma_running(ppd); + spin_unlock_irqrestore(&ppd->sdma_lock, flags); + + return ret; +} + +/* + * Complete a request when sdma not running; likely only request + * but to simplify the code, always queue it, then process the full + * activelist. We process the entire list to ensure that this particular + * request does get it's callback, but in the correct order. + * Must be called with sdma_lock held + */ +static void complete_sdma_err_req(struct qib_pportdata *ppd, + struct qib_verbs_txreq *tx) +{ + atomic_inc(&tx->qp->s_dma_busy); + /* no sdma descriptors, so no unmap_desc */ + tx->txreq.start_idx = 0; + tx->txreq.next_descq_idx = 0; + list_add_tail(&tx->txreq.list, &ppd->sdma_activelist); + clear_sdma_activelist(ppd); +} + +/* + * This function queues one IB packet onto the send DMA queue per call. + * The caller is responsible for checking: + * 1) The number of send DMA descriptor entries is less than the size of + * the descriptor queue. + * 2) The IB SGE addresses and lengths are 32-bit aligned + * (except possibly the last SGE's length) + * 3) The SGE addresses are suitable for passing to dma_map_single(). + */ +int qib_sdma_verbs_send(struct qib_pportdata *ppd, + struct qib_sge_state *ss, u32 dwords, + struct qib_verbs_txreq *tx) +{ + unsigned long flags; + struct qib_sge *sge; + struct qib_qp *qp; + int ret = 0; + u16 tail; + __le64 *descqp; + u64 sdmadesc[2]; + u32 dwoffset; + dma_addr_t addr; + + spin_lock_irqsave(&ppd->sdma_lock, flags); + +retry: + if (unlikely(!__qib_sdma_running(ppd))) { + complete_sdma_err_req(ppd, tx); + goto unlock; + } + + if (tx->txreq.sg_count > qib_sdma_descq_freecnt(ppd)) { + if (qib_sdma_make_progress(ppd)) + goto retry; + if (ppd->dd->flags & QIB_HAS_SDMA_TIMEOUT) + ppd->dd->f_sdma_set_desc_cnt(ppd, + ppd->sdma_descq_cnt / 2); + goto busy; + } + + dwoffset = tx->hdr_dwords; + make_sdma_desc(ppd, sdmadesc, (u64) tx->txreq.addr, dwoffset, 0); + + sdmadesc[0] |= SDMA_DESC_FIRST; + if (tx->txreq.flags & QIB_SDMA_TXREQ_F_USELARGEBUF) + sdmadesc[0] |= SDMA_DESC_USE_LARGE_BUF; + + /* write to the descq */ + tail = ppd->sdma_descq_tail; + descqp = &ppd->sdma_descq[tail].qw[0]; + *descqp++ = cpu_to_le64(sdmadesc[0]); + *descqp++ = cpu_to_le64(sdmadesc[1]); + + /* increment the tail */ + if (++tail == ppd->sdma_descq_cnt) { + tail = 0; + descqp = &ppd->sdma_descq[0].qw[0]; + ++ppd->sdma_generation; + } + + tx->txreq.start_idx = tail; + + sge = &ss->sge; + while (dwords) { + u32 dw; + u32 len; + + len = dwords << 2; + if (len > sge->length) + len = sge->length; + if (len > sge->sge_length) + len = sge->sge_length; + BUG_ON(len == 0); + dw = (len + 3) >> 2; + addr = dma_map_single(&ppd->dd->pcidev->dev, sge->vaddr, + dw << 2, DMA_TO_DEVICE); + if (dma_mapping_error(&ppd->dd->pcidev->dev, addr)) + goto unmap; + sdmadesc[0] = 0; + make_sdma_desc(ppd, sdmadesc, (u64) addr, dw, dwoffset); + /* SDmaUseLargeBuf has to be set in every descriptor */ + if (tx->txreq.flags & QIB_SDMA_TXREQ_F_USELARGEBUF) + sdmadesc[0] |= SDMA_DESC_USE_LARGE_BUF; + /* write to the descq */ + *descqp++ = cpu_to_le64(sdmadesc[0]); + *descqp++ = cpu_to_le64(sdmadesc[1]); + + /* increment the tail */ + if (++tail == ppd->sdma_descq_cnt) { + tail = 0; + descqp = &ppd->sdma_descq[0].qw[0]; + ++ppd->sdma_generation; + } + sge->vaddr += len; + sge->length -= len; + sge->sge_length -= len; + if (sge->sge_length == 0) { + if (--ss->num_sge) + *sge = *ss->sg_list++; + } else if (sge->length == 0 && sge->mr->lkey) { + if (++sge->n >= QIB_SEGSZ) { + if (++sge->m >= sge->mr->mapsz) + break; + sge->n = 0; + } + sge->vaddr = + sge->mr->map[sge->m]->segs[sge->n].vaddr; + sge->length = + sge->mr->map[sge->m]->segs[sge->n].length; + } + + dwoffset += dw; + dwords -= dw; + } + + if (!tail) + descqp = &ppd->sdma_descq[ppd->sdma_descq_cnt].qw[0]; + descqp -= 2; + descqp[0] |= cpu_to_le64(SDMA_DESC_LAST); + if (tx->txreq.flags & QIB_SDMA_TXREQ_F_HEADTOHOST) + descqp[0] |= cpu_to_le64(SDMA_DESC_DMA_HEAD); + if (tx->txreq.flags & QIB_SDMA_TXREQ_F_INTREQ) + descqp[0] |= cpu_to_le64(SDMA_DESC_INTR); + + atomic_inc(&tx->qp->s_dma_busy); + tx->txreq.next_descq_idx = tail; + ppd->dd->f_sdma_update_tail(ppd, tail); + ppd->sdma_descq_added += tx->txreq.sg_count; + list_add_tail(&tx->txreq.list, &ppd->sdma_activelist); + goto unlock; + +unmap: + for (;;) { + if (!tail) + tail = ppd->sdma_descq_cnt - 1; + else + tail--; + if (tail == ppd->sdma_descq_tail) + break; + unmap_desc(ppd, tail); + } + qp = tx->qp; + qib_put_txreq(tx); + spin_lock(&qp->r_lock); + spin_lock(&qp->s_lock); + if (qp->ibqp.qp_type == IB_QPT_RC) { + /* XXX what about error sending RDMA read responses? */ + if (ib_qib_state_ops[qp->state] & QIB_PROCESS_RECV_OK) + qib_error_qp(qp, IB_WC_GENERAL_ERR); + } else if (qp->s_wqe) + qib_send_complete(qp, qp->s_wqe, IB_WC_GENERAL_ERR); + spin_unlock(&qp->s_lock); + spin_unlock(&qp->r_lock); + /* return zero to process the next send work request */ + goto unlock; + +busy: + qp = tx->qp; + spin_lock(&qp->s_lock); + if (ib_qib_state_ops[qp->state] & QIB_PROCESS_RECV_OK) { + struct qib_ibdev *dev; + + /* + * If we couldn't queue the DMA request, save the info + * and try again later rather than destroying the + * buffer and undoing the side effects of the copy. + */ + tx->ss = ss; + tx->dwords = dwords; + qp->s_tx = tx; + dev = &ppd->dd->verbs_dev; + spin_lock(&dev->pending_lock); + if (list_empty(&qp->iowait)) { + struct qib_ibport *ibp; + + ibp = &ppd->ibport_data; + ibp->n_dmawait++; + qp->s_flags |= QIB_S_WAIT_DMA_DESC; + list_add_tail(&qp->iowait, &dev->dmawait); + } + spin_unlock(&dev->pending_lock); + qp->s_flags &= ~QIB_S_BUSY; + spin_unlock(&qp->s_lock); + ret = -EBUSY; + } else { + spin_unlock(&qp->s_lock); + qib_put_txreq(tx); + } +unlock: + spin_unlock_irqrestore(&ppd->sdma_lock, flags); + return ret; +} + +/* + * sdma_lock should be acquired before calling this routine + */ +void dump_sdma_state(struct qib_pportdata *ppd) +{ + struct qib_sdma_desc *descq; + struct qib_sdma_txreq *txp, *txpnext; + __le64 *descqp; + u64 desc[2]; + u64 addr; + u16 gen, dwlen, dwoffset; + u16 head, tail, cnt; + + head = ppd->sdma_descq_head; + tail = ppd->sdma_descq_tail; + cnt = qib_sdma_descq_freecnt(ppd); + descq = ppd->sdma_descq; + + qib_dev_porterr(ppd->dd, ppd->port, + "SDMA ppd->sdma_descq_head: %u\n", head); + qib_dev_porterr(ppd->dd, ppd->port, + "SDMA ppd->sdma_descq_tail: %u\n", tail); + qib_dev_porterr(ppd->dd, ppd->port, + "SDMA sdma_descq_freecnt: %u\n", cnt); + + /* print info for each entry in the descriptor queue */ + while (head != tail) { + char flags[6] = { 'x', 'x', 'x', 'x', 'x', 0 }; + + descqp = &descq[head].qw[0]; + desc[0] = le64_to_cpu(descqp[0]); + desc[1] = le64_to_cpu(descqp[1]); + flags[0] = (desc[0] & 1<<15) ? 'I' : '-'; + flags[1] = (desc[0] & 1<<14) ? 'L' : 'S'; + flags[2] = (desc[0] & 1<<13) ? 'H' : '-'; + flags[3] = (desc[0] & 1<<12) ? 'F' : '-'; + flags[4] = (desc[0] & 1<<11) ? 'L' : '-'; + addr = (desc[1] << 32) | ((desc[0] >> 32) & 0xfffffffcULL); + gen = (desc[0] >> 30) & 3ULL; + dwlen = (desc[0] >> 14) & (0x7ffULL << 2); + dwoffset = (desc[0] & 0x7ffULL) << 2; + qib_dev_porterr(ppd->dd, ppd->port, + "SDMA sdmadesc[%u]: flags:%s addr:0x%016llx gen:%u len:%u bytes offset:%u bytes\n", + head, flags, addr, gen, dwlen, dwoffset); + if (++head == ppd->sdma_descq_cnt) + head = 0; + } + + /* print dma descriptor indices from the TX requests */ + list_for_each_entry_safe(txp, txpnext, &ppd->sdma_activelist, + list) + qib_dev_porterr(ppd->dd, ppd->port, + "SDMA txp->start_idx: %u txp->next_descq_idx: %u\n", + txp->start_idx, txp->next_descq_idx); +} + +void qib_sdma_process_event(struct qib_pportdata *ppd, + enum qib_sdma_events event) +{ + unsigned long flags; + + spin_lock_irqsave(&ppd->sdma_lock, flags); + + __qib_sdma_process_event(ppd, event); + + if (ppd->sdma_state.current_state == qib_sdma_state_s99_running) + qib_verbs_sdma_desc_avail(ppd, qib_sdma_descq_freecnt(ppd)); + + spin_unlock_irqrestore(&ppd->sdma_lock, flags); +} + +void __qib_sdma_process_event(struct qib_pportdata *ppd, + enum qib_sdma_events event) +{ + struct qib_sdma_state *ss = &ppd->sdma_state; + + switch (ss->current_state) { + case qib_sdma_state_s00_hw_down: + switch (event) { + case qib_sdma_event_e00_go_hw_down: + break; + case qib_sdma_event_e30_go_running: + /* + * If down, but running requested (usually result + * of link up, then we need to start up. + * This can happen when hw down is requested while + * bringing the link up with traffic active on + * 7220, e.g. */ + ss->go_s99_running = 1; + /* fall through and start dma engine */ + case qib_sdma_event_e10_go_hw_start: + /* This reference means the state machine is started */ + sdma_get(&ppd->sdma_state); + sdma_set_state(ppd, + qib_sdma_state_s10_hw_start_up_wait); + break; + case qib_sdma_event_e20_hw_started: + break; + case qib_sdma_event_e40_sw_cleaned: + sdma_sw_tear_down(ppd); + break; + case qib_sdma_event_e50_hw_cleaned: + break; + case qib_sdma_event_e60_hw_halted: + break; + case qib_sdma_event_e70_go_idle: + break; + case qib_sdma_event_e7220_err_halted: + break; + case qib_sdma_event_e7322_err_halted: + break; + case qib_sdma_event_e90_timer_tick: + break; + } + break; + + case qib_sdma_state_s10_hw_start_up_wait: + switch (event) { + case qib_sdma_event_e00_go_hw_down: + sdma_set_state(ppd, qib_sdma_state_s00_hw_down); + sdma_sw_tear_down(ppd); + break; + case qib_sdma_event_e10_go_hw_start: + break; + case qib_sdma_event_e20_hw_started: + sdma_set_state(ppd, ss->go_s99_running ? + qib_sdma_state_s99_running : + qib_sdma_state_s20_idle); + break; + case qib_sdma_event_e30_go_running: + ss->go_s99_running = 1; + break; + case qib_sdma_event_e40_sw_cleaned: + break; + case qib_sdma_event_e50_hw_cleaned: + break; + case qib_sdma_event_e60_hw_halted: + break; + case qib_sdma_event_e70_go_idle: + ss->go_s99_running = 0; + break; + case qib_sdma_event_e7220_err_halted: + break; + case qib_sdma_event_e7322_err_halted: + break; + case qib_sdma_event_e90_timer_tick: + break; + } + break; + + case qib_sdma_state_s20_idle: + switch (event) { + case qib_sdma_event_e00_go_hw_down: + sdma_set_state(ppd, qib_sdma_state_s00_hw_down); + sdma_sw_tear_down(ppd); + break; + case qib_sdma_event_e10_go_hw_start: + break; + case qib_sdma_event_e20_hw_started: + break; + case qib_sdma_event_e30_go_running: + sdma_set_state(ppd, qib_sdma_state_s99_running); + ss->go_s99_running = 1; + break; + case qib_sdma_event_e40_sw_cleaned: + break; + case qib_sdma_event_e50_hw_cleaned: + break; + case qib_sdma_event_e60_hw_halted: + break; + case qib_sdma_event_e70_go_idle: + break; + case qib_sdma_event_e7220_err_halted: + break; + case qib_sdma_event_e7322_err_halted: + break; + case qib_sdma_event_e90_timer_tick: + break; + } + break; + + case qib_sdma_state_s30_sw_clean_up_wait: + switch (event) { + case qib_sdma_event_e00_go_hw_down: + sdma_set_state(ppd, qib_sdma_state_s00_hw_down); + break; + case qib_sdma_event_e10_go_hw_start: + break; + case qib_sdma_event_e20_hw_started: + break; + case qib_sdma_event_e30_go_running: + ss->go_s99_running = 1; + break; + case qib_sdma_event_e40_sw_cleaned: + sdma_set_state(ppd, + qib_sdma_state_s10_hw_start_up_wait); + sdma_hw_start_up(ppd); + break; + case qib_sdma_event_e50_hw_cleaned: + break; + case qib_sdma_event_e60_hw_halted: + break; + case qib_sdma_event_e70_go_idle: + ss->go_s99_running = 0; + break; + case qib_sdma_event_e7220_err_halted: + break; + case qib_sdma_event_e7322_err_halted: + break; + case qib_sdma_event_e90_timer_tick: + break; + } + break; + + case qib_sdma_state_s40_hw_clean_up_wait: + switch (event) { + case qib_sdma_event_e00_go_hw_down: + sdma_set_state(ppd, qib_sdma_state_s00_hw_down); + sdma_start_sw_clean_up(ppd); + break; + case qib_sdma_event_e10_go_hw_start: + break; + case qib_sdma_event_e20_hw_started: + break; + case qib_sdma_event_e30_go_running: + ss->go_s99_running = 1; + break; + case qib_sdma_event_e40_sw_cleaned: + break; + case qib_sdma_event_e50_hw_cleaned: + sdma_set_state(ppd, + qib_sdma_state_s30_sw_clean_up_wait); + sdma_start_sw_clean_up(ppd); + break; + case qib_sdma_event_e60_hw_halted: + break; + case qib_sdma_event_e70_go_idle: + ss->go_s99_running = 0; + break; + case qib_sdma_event_e7220_err_halted: + break; + case qib_sdma_event_e7322_err_halted: + break; + case qib_sdma_event_e90_timer_tick: + break; + } + break; + + case qib_sdma_state_s50_hw_halt_wait: + switch (event) { + case qib_sdma_event_e00_go_hw_down: + sdma_set_state(ppd, qib_sdma_state_s00_hw_down); + sdma_start_sw_clean_up(ppd); + break; + case qib_sdma_event_e10_go_hw_start: + break; + case qib_sdma_event_e20_hw_started: + break; + case qib_sdma_event_e30_go_running: + ss->go_s99_running = 1; + break; + case qib_sdma_event_e40_sw_cleaned: + break; + case qib_sdma_event_e50_hw_cleaned: + break; + case qib_sdma_event_e60_hw_halted: + sdma_set_state(ppd, + qib_sdma_state_s40_hw_clean_up_wait); + ppd->dd->f_sdma_hw_clean_up(ppd); + break; + case qib_sdma_event_e70_go_idle: + ss->go_s99_running = 0; + break; + case qib_sdma_event_e7220_err_halted: + break; + case qib_sdma_event_e7322_err_halted: + break; + case qib_sdma_event_e90_timer_tick: + break; + } + break; + + case qib_sdma_state_s99_running: + switch (event) { + case qib_sdma_event_e00_go_hw_down: + sdma_set_state(ppd, qib_sdma_state_s00_hw_down); + sdma_start_sw_clean_up(ppd); + break; + case qib_sdma_event_e10_go_hw_start: + break; + case qib_sdma_event_e20_hw_started: + break; + case qib_sdma_event_e30_go_running: + break; + case qib_sdma_event_e40_sw_cleaned: + break; + case qib_sdma_event_e50_hw_cleaned: + break; + case qib_sdma_event_e60_hw_halted: + sdma_set_state(ppd, + qib_sdma_state_s30_sw_clean_up_wait); + sdma_start_sw_clean_up(ppd); + break; + case qib_sdma_event_e70_go_idle: + sdma_set_state(ppd, qib_sdma_state_s50_hw_halt_wait); + ss->go_s99_running = 0; + break; + case qib_sdma_event_e7220_err_halted: + sdma_set_state(ppd, + qib_sdma_state_s30_sw_clean_up_wait); + sdma_start_sw_clean_up(ppd); + break; + case qib_sdma_event_e7322_err_halted: + sdma_set_state(ppd, qib_sdma_state_s50_hw_halt_wait); + break; + case qib_sdma_event_e90_timer_tick: + break; + } + break; + } + + ss->last_event = event; +} diff --git a/drivers/infiniband/hw/qib/qib_srq.c b/drivers/infiniband/hw/qib/qib_srq.c new file mode 100644 index 0000000..d623593 --- /dev/null +++ b/drivers/infiniband/hw/qib/qib_srq.c @@ -0,0 +1,380 @@ +/* + * Copyright (c) 2006, 2007, 2008, 2009 QLogic Corporation. All rights reserved. + * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include + +#include "qib_verbs.h" + +/** + * qib_post_srq_receive - post a receive on a shared receive queue + * @ibsrq: the SRQ to post the receive on + * @wr: the list of work requests to post + * @bad_wr: A pointer to the first WR to cause a problem is put here + * + * This may be called from interrupt context. + */ +int qib_post_srq_receive(struct ib_srq *ibsrq, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr) +{ + struct qib_srq *srq = to_isrq(ibsrq); + struct qib_rwq *wq; + unsigned long flags; + int ret; + + for (; wr; wr = wr->next) { + struct qib_rwqe *wqe; + u32 next; + int i; + + if ((unsigned) wr->num_sge > srq->rq.max_sge) { + *bad_wr = wr; + ret = -EINVAL; + goto bail; + } + + spin_lock_irqsave(&srq->rq.lock, flags); + wq = srq->rq.wq; + next = wq->head + 1; + if (next >= srq->rq.size) + next = 0; + if (next == wq->tail) { + spin_unlock_irqrestore(&srq->rq.lock, flags); + *bad_wr = wr; + ret = -ENOMEM; + goto bail; + } + + wqe = get_rwqe_ptr(&srq->rq, wq->head); + wqe->wr_id = wr->wr_id; + wqe->num_sge = wr->num_sge; + for (i = 0; i < wr->num_sge; i++) + wqe->sg_list[i] = wr->sg_list[i]; + /* Make sure queue entry is written before the head index. */ + smp_wmb(); + wq->head = next; + spin_unlock_irqrestore(&srq->rq.lock, flags); + } + ret = 0; + +bail: + return ret; +} + +/** + * qib_create_srq - create a shared receive queue + * @ibpd: the protection domain of the SRQ to create + * @srq_init_attr: the attributes of the SRQ + * @udata: data from libibverbs when creating a user SRQ + */ +struct ib_srq *qib_create_srq(struct ib_pd *ibpd, + struct ib_srq_init_attr *srq_init_attr, + struct ib_udata *udata) +{ + struct qib_ibdev *dev = to_idev(ibpd->device); + struct qib_srq *srq; + u32 sz; + struct ib_srq *ret; + + if (srq_init_attr->srq_type != IB_SRQT_BASIC) { + ret = ERR_PTR(-ENOSYS); + goto done; + } + + if (srq_init_attr->attr.max_sge == 0 || + srq_init_attr->attr.max_sge > ib_qib_max_srq_sges || + srq_init_attr->attr.max_wr == 0 || + srq_init_attr->attr.max_wr > ib_qib_max_srq_wrs) { + ret = ERR_PTR(-EINVAL); + goto done; + } + + srq = kmalloc(sizeof(*srq), GFP_KERNEL); + if (!srq) { + ret = ERR_PTR(-ENOMEM); + goto done; + } + + /* + * Need to use vmalloc() if we want to support large #s of entries. + */ + srq->rq.size = srq_init_attr->attr.max_wr + 1; + srq->rq.max_sge = srq_init_attr->attr.max_sge; + sz = sizeof(struct ib_sge) * srq->rq.max_sge + + sizeof(struct qib_rwqe); + srq->rq.wq = vmalloc_user(sizeof(struct qib_rwq) + srq->rq.size * sz); + if (!srq->rq.wq) { + ret = ERR_PTR(-ENOMEM); + goto bail_srq; + } + + /* + * Return the address of the RWQ as the offset to mmap. + * See qib_mmap() for details. + */ + if (udata && udata->outlen >= sizeof(__u64)) { + int err; + u32 s = sizeof(struct qib_rwq) + srq->rq.size * sz; + + srq->ip = + qib_create_mmap_info(dev, s, ibpd->uobject->context, + srq->rq.wq); + if (!srq->ip) { + ret = ERR_PTR(-ENOMEM); + goto bail_wq; + } + + err = ib_copy_to_udata(udata, &srq->ip->offset, + sizeof(srq->ip->offset)); + if (err) { + ret = ERR_PTR(err); + goto bail_ip; + } + } else + srq->ip = NULL; + + /* + * ib_create_srq() will initialize srq->ibsrq. + */ + spin_lock_init(&srq->rq.lock); + srq->rq.wq->head = 0; + srq->rq.wq->tail = 0; + srq->limit = srq_init_attr->attr.srq_limit; + + spin_lock(&dev->n_srqs_lock); + if (dev->n_srqs_allocated == ib_qib_max_srqs) { + spin_unlock(&dev->n_srqs_lock); + ret = ERR_PTR(-ENOMEM); + goto bail_ip; + } + + dev->n_srqs_allocated++; + spin_unlock(&dev->n_srqs_lock); + + if (srq->ip) { + spin_lock_irq(&dev->pending_lock); + list_add(&srq->ip->pending_mmaps, &dev->pending_mmaps); + spin_unlock_irq(&dev->pending_lock); + } + + ret = &srq->ibsrq; + goto done; + +bail_ip: + kfree(srq->ip); +bail_wq: + vfree(srq->rq.wq); +bail_srq: + kfree(srq); +done: + return ret; +} + +/** + * qib_modify_srq - modify a shared receive queue + * @ibsrq: the SRQ to modify + * @attr: the new attributes of the SRQ + * @attr_mask: indicates which attributes to modify + * @udata: user data for libibverbs.so + */ +int qib_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, + enum ib_srq_attr_mask attr_mask, + struct ib_udata *udata) +{ + struct qib_srq *srq = to_isrq(ibsrq); + struct qib_rwq *wq; + int ret = 0; + + if (attr_mask & IB_SRQ_MAX_WR) { + struct qib_rwq *owq; + struct qib_rwqe *p; + u32 sz, size, n, head, tail; + + /* Check that the requested sizes are below the limits. */ + if ((attr->max_wr > ib_qib_max_srq_wrs) || + ((attr_mask & IB_SRQ_LIMIT) ? + attr->srq_limit : srq->limit) > attr->max_wr) { + ret = -EINVAL; + goto bail; + } + + sz = sizeof(struct qib_rwqe) + + srq->rq.max_sge * sizeof(struct ib_sge); + size = attr->max_wr + 1; + wq = vmalloc_user(sizeof(struct qib_rwq) + size * sz); + if (!wq) { + ret = -ENOMEM; + goto bail; + } + + /* Check that we can write the offset to mmap. */ + if (udata && udata->inlen >= sizeof(__u64)) { + __u64 offset_addr; + __u64 offset = 0; + + ret = ib_copy_from_udata(&offset_addr, udata, + sizeof(offset_addr)); + if (ret) + goto bail_free; + udata->outbuf = + (void __user *) (unsigned long) offset_addr; + ret = ib_copy_to_udata(udata, &offset, + sizeof(offset)); + if (ret) + goto bail_free; + } + + spin_lock_irq(&srq->rq.lock); + /* + * validate head and tail pointer values and compute + * the number of remaining WQEs. + */ + owq = srq->rq.wq; + head = owq->head; + tail = owq->tail; + if (head >= srq->rq.size || tail >= srq->rq.size) { + ret = -EINVAL; + goto bail_unlock; + } + n = head; + if (n < tail) + n += srq->rq.size - tail; + else + n -= tail; + if (size <= n) { + ret = -EINVAL; + goto bail_unlock; + } + n = 0; + p = wq->wq; + while (tail != head) { + struct qib_rwqe *wqe; + int i; + + wqe = get_rwqe_ptr(&srq->rq, tail); + p->wr_id = wqe->wr_id; + p->num_sge = wqe->num_sge; + for (i = 0; i < wqe->num_sge; i++) + p->sg_list[i] = wqe->sg_list[i]; + n++; + p = (struct qib_rwqe *)((char *) p + sz); + if (++tail >= srq->rq.size) + tail = 0; + } + srq->rq.wq = wq; + srq->rq.size = size; + wq->head = n; + wq->tail = 0; + if (attr_mask & IB_SRQ_LIMIT) + srq->limit = attr->srq_limit; + spin_unlock_irq(&srq->rq.lock); + + vfree(owq); + + if (srq->ip) { + struct qib_mmap_info *ip = srq->ip; + struct qib_ibdev *dev = to_idev(srq->ibsrq.device); + u32 s = sizeof(struct qib_rwq) + size * sz; + + qib_update_mmap_info(dev, ip, s, wq); + + /* + * Return the offset to mmap. + * See qib_mmap() for details. + */ + if (udata && udata->inlen >= sizeof(__u64)) { + ret = ib_copy_to_udata(udata, &ip->offset, + sizeof(ip->offset)); + if (ret) + goto bail; + } + + /* + * Put user mapping info onto the pending list + * unless it already is on the list. + */ + spin_lock_irq(&dev->pending_lock); + if (list_empty(&ip->pending_mmaps)) + list_add(&ip->pending_mmaps, + &dev->pending_mmaps); + spin_unlock_irq(&dev->pending_lock); + } + } else if (attr_mask & IB_SRQ_LIMIT) { + spin_lock_irq(&srq->rq.lock); + if (attr->srq_limit >= srq->rq.size) + ret = -EINVAL; + else + srq->limit = attr->srq_limit; + spin_unlock_irq(&srq->rq.lock); + } + goto bail; + +bail_unlock: + spin_unlock_irq(&srq->rq.lock); +bail_free: + vfree(wq); +bail: + return ret; +} + +int qib_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr) +{ + struct qib_srq *srq = to_isrq(ibsrq); + + attr->max_wr = srq->rq.size - 1; + attr->max_sge = srq->rq.max_sge; + attr->srq_limit = srq->limit; + return 0; +} + +/** + * qib_destroy_srq - destroy a shared receive queue + * @ibsrq: the SRQ to destroy + */ +int qib_destroy_srq(struct ib_srq *ibsrq) +{ + struct qib_srq *srq = to_isrq(ibsrq); + struct qib_ibdev *dev = to_idev(ibsrq->device); + + spin_lock(&dev->n_srqs_lock); + dev->n_srqs_allocated--; + spin_unlock(&dev->n_srqs_lock); + if (srq->ip) + kref_put(&srq->ip->ref, qib_release_mmap_info); + else + vfree(srq->rq.wq); + kfree(srq); + + return 0; +} diff --git a/drivers/infiniband/hw/qib/qib_sysfs.c b/drivers/infiniband/hw/qib/qib_sysfs.c new file mode 100644 index 0000000..3c8e4e3 --- /dev/null +++ b/drivers/infiniband/hw/qib/qib_sysfs.c @@ -0,0 +1,842 @@ +/* + * Copyright (c) 2012 Intel Corporation. All rights reserved. + * Copyright (c) 2006 - 2012 QLogic Corporation. All rights reserved. + * Copyright (c) 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include + +#include "qib.h" +#include "qib_mad.h" + +/* start of per-port functions */ +/* + * Get/Set heartbeat enable. OR of 1=enabled, 2=auto + */ +static ssize_t show_hrtbt_enb(struct qib_pportdata *ppd, char *buf) +{ + struct qib_devdata *dd = ppd->dd; + int ret; + + ret = dd->f_get_ib_cfg(ppd, QIB_IB_CFG_HRTBT); + ret = scnprintf(buf, PAGE_SIZE, "%d\n", ret); + return ret; +} + +static ssize_t store_hrtbt_enb(struct qib_pportdata *ppd, const char *buf, + size_t count) +{ + struct qib_devdata *dd = ppd->dd; + int ret; + u16 val; + + ret = kstrtou16(buf, 0, &val); + if (ret) { + qib_dev_err(dd, "attempt to set invalid Heartbeat enable\n"); + return ret; + } + + /* + * Set the "intentional" heartbeat enable per either of + * "Enable" and "Auto", as these are normally set together. + * This bit is consulted when leaving loopback mode, + * because entering loopback mode overrides it and automatically + * disables heartbeat. + */ + ret = dd->f_set_ib_cfg(ppd, QIB_IB_CFG_HRTBT, val); + return ret < 0 ? ret : count; +} + +static ssize_t store_loopback(struct qib_pportdata *ppd, const char *buf, + size_t count) +{ + struct qib_devdata *dd = ppd->dd; + int ret = count, r; + + r = dd->f_set_ib_loopback(ppd, buf); + if (r < 0) + ret = r; + + return ret; +} + +static ssize_t store_led_override(struct qib_pportdata *ppd, const char *buf, + size_t count) +{ + struct qib_devdata *dd = ppd->dd; + int ret; + u16 val; + + ret = kstrtou16(buf, 0, &val); + if (ret) { + qib_dev_err(dd, "attempt to set invalid LED override\n"); + return ret; + } + + qib_set_led_override(ppd, val); + return count; +} + +static ssize_t show_status(struct qib_pportdata *ppd, char *buf) +{ + ssize_t ret; + + if (!ppd->statusp) + ret = -EINVAL; + else + ret = scnprintf(buf, PAGE_SIZE, "0x%llx\n", + (unsigned long long) *(ppd->statusp)); + return ret; +} + +/* + * For userland compatibility, these offsets must remain fixed. + * They are strings for QIB_STATUS_* + */ +static const char * const qib_status_str[] = { + "Initted", + "", + "", + "", + "", + "Present", + "IB_link_up", + "IB_configured", + "", + "Fatal_Hardware_Error", + NULL, +}; + +static ssize_t show_status_str(struct qib_pportdata *ppd, char *buf) +{ + int i, any; + u64 s; + ssize_t ret; + + if (!ppd->statusp) { + ret = -EINVAL; + goto bail; + } + + s = *(ppd->statusp); + *buf = '\0'; + for (any = i = 0; s && qib_status_str[i]; i++) { + if (s & 1) { + /* if overflow */ + if (any && strlcat(buf, " ", PAGE_SIZE) >= PAGE_SIZE) + break; + if (strlcat(buf, qib_status_str[i], PAGE_SIZE) >= + PAGE_SIZE) + break; + any = 1; + } + s >>= 1; + } + if (any) + strlcat(buf, "\n", PAGE_SIZE); + + ret = strlen(buf); + +bail: + return ret; +} + +/* end of per-port functions */ + +/* + * Start of per-port file structures and support code + * Because we are fitting into other infrastructure, we have to supply the + * full set of kobject/sysfs_ops structures and routines. + */ +#define QIB_PORT_ATTR(name, mode, show, store) \ + static struct qib_port_attr qib_port_attr_##name = \ + __ATTR(name, mode, show, store) + +struct qib_port_attr { + struct attribute attr; + ssize_t (*show)(struct qib_pportdata *, char *); + ssize_t (*store)(struct qib_pportdata *, const char *, size_t); +}; + +QIB_PORT_ATTR(loopback, S_IWUSR, NULL, store_loopback); +QIB_PORT_ATTR(led_override, S_IWUSR, NULL, store_led_override); +QIB_PORT_ATTR(hrtbt_enable, S_IWUSR | S_IRUGO, show_hrtbt_enb, + store_hrtbt_enb); +QIB_PORT_ATTR(status, S_IRUGO, show_status, NULL); +QIB_PORT_ATTR(status_str, S_IRUGO, show_status_str, NULL); + +static struct attribute *port_default_attributes[] = { + &qib_port_attr_loopback.attr, + &qib_port_attr_led_override.attr, + &qib_port_attr_hrtbt_enable.attr, + &qib_port_attr_status.attr, + &qib_port_attr_status_str.attr, + NULL +}; + +/* + * Start of per-port congestion control structures and support code + */ + +/* + * Congestion control table size followed by table entries + */ +static ssize_t read_cc_table_bin(struct file *filp, struct kobject *kobj, + struct bin_attribute *bin_attr, + char *buf, loff_t pos, size_t count) +{ + int ret; + struct qib_pportdata *ppd = + container_of(kobj, struct qib_pportdata, pport_cc_kobj); + + if (!qib_cc_table_size || !ppd->ccti_entries_shadow) + return -EINVAL; + + ret = ppd->total_cct_entry * sizeof(struct ib_cc_table_entry_shadow) + + sizeof(__be16); + + if (pos > ret) + return -EINVAL; + + if (count > ret - pos) + count = ret - pos; + + if (!count) + return count; + + spin_lock(&ppd->cc_shadow_lock); + memcpy(buf, ppd->ccti_entries_shadow, count); + spin_unlock(&ppd->cc_shadow_lock); + + return count; +} + +static void qib_port_release(struct kobject *kobj) +{ + /* nothing to do since memory is freed by qib_free_devdata() */ +} + +static struct kobj_type qib_port_cc_ktype = { + .release = qib_port_release, +}; + +static struct bin_attribute cc_table_bin_attr = { + .attr = {.name = "cc_table_bin", .mode = 0444}, + .read = read_cc_table_bin, + .size = PAGE_SIZE, +}; + +/* + * Congestion settings: port control, control map and an array of 16 + * entries for the congestion entries - increase, timer, event log + * trigger threshold and the minimum injection rate delay. + */ +static ssize_t read_cc_setting_bin(struct file *filp, struct kobject *kobj, + struct bin_attribute *bin_attr, + char *buf, loff_t pos, size_t count) +{ + int ret; + struct qib_pportdata *ppd = + container_of(kobj, struct qib_pportdata, pport_cc_kobj); + + if (!qib_cc_table_size || !ppd->congestion_entries_shadow) + return -EINVAL; + + ret = sizeof(struct ib_cc_congestion_setting_attr_shadow); + + if (pos > ret) + return -EINVAL; + if (count > ret - pos) + count = ret - pos; + + if (!count) + return count; + + spin_lock(&ppd->cc_shadow_lock); + memcpy(buf, ppd->congestion_entries_shadow, count); + spin_unlock(&ppd->cc_shadow_lock); + + return count; +} + +static struct bin_attribute cc_setting_bin_attr = { + .attr = {.name = "cc_settings_bin", .mode = 0444}, + .read = read_cc_setting_bin, + .size = PAGE_SIZE, +}; + + +static ssize_t qib_portattr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct qib_port_attr *pattr = + container_of(attr, struct qib_port_attr, attr); + struct qib_pportdata *ppd = + container_of(kobj, struct qib_pportdata, pport_kobj); + + return pattr->show(ppd, buf); +} + +static ssize_t qib_portattr_store(struct kobject *kobj, + struct attribute *attr, const char *buf, size_t len) +{ + struct qib_port_attr *pattr = + container_of(attr, struct qib_port_attr, attr); + struct qib_pportdata *ppd = + container_of(kobj, struct qib_pportdata, pport_kobj); + + return pattr->store(ppd, buf, len); +} + + +static const struct sysfs_ops qib_port_ops = { + .show = qib_portattr_show, + .store = qib_portattr_store, +}; + +static struct kobj_type qib_port_ktype = { + .release = qib_port_release, + .sysfs_ops = &qib_port_ops, + .default_attrs = port_default_attributes +}; + +/* Start sl2vl */ + +#define QIB_SL2VL_ATTR(N) \ + static struct qib_sl2vl_attr qib_sl2vl_attr_##N = { \ + .attr = { .name = __stringify(N), .mode = 0444 }, \ + .sl = N \ + } + +struct qib_sl2vl_attr { + struct attribute attr; + int sl; +}; + +QIB_SL2VL_ATTR(0); +QIB_SL2VL_ATTR(1); +QIB_SL2VL_ATTR(2); +QIB_SL2VL_ATTR(3); +QIB_SL2VL_ATTR(4); +QIB_SL2VL_ATTR(5); +QIB_SL2VL_ATTR(6); +QIB_SL2VL_ATTR(7); +QIB_SL2VL_ATTR(8); +QIB_SL2VL_ATTR(9); +QIB_SL2VL_ATTR(10); +QIB_SL2VL_ATTR(11); +QIB_SL2VL_ATTR(12); +QIB_SL2VL_ATTR(13); +QIB_SL2VL_ATTR(14); +QIB_SL2VL_ATTR(15); + +static struct attribute *sl2vl_default_attributes[] = { + &qib_sl2vl_attr_0.attr, + &qib_sl2vl_attr_1.attr, + &qib_sl2vl_attr_2.attr, + &qib_sl2vl_attr_3.attr, + &qib_sl2vl_attr_4.attr, + &qib_sl2vl_attr_5.attr, + &qib_sl2vl_attr_6.attr, + &qib_sl2vl_attr_7.attr, + &qib_sl2vl_attr_8.attr, + &qib_sl2vl_attr_9.attr, + &qib_sl2vl_attr_10.attr, + &qib_sl2vl_attr_11.attr, + &qib_sl2vl_attr_12.attr, + &qib_sl2vl_attr_13.attr, + &qib_sl2vl_attr_14.attr, + &qib_sl2vl_attr_15.attr, + NULL +}; + +static ssize_t sl2vl_attr_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct qib_sl2vl_attr *sattr = + container_of(attr, struct qib_sl2vl_attr, attr); + struct qib_pportdata *ppd = + container_of(kobj, struct qib_pportdata, sl2vl_kobj); + struct qib_ibport *qibp = &ppd->ibport_data; + + return sprintf(buf, "%u\n", qibp->sl_to_vl[sattr->sl]); +} + +static const struct sysfs_ops qib_sl2vl_ops = { + .show = sl2vl_attr_show, +}; + +static struct kobj_type qib_sl2vl_ktype = { + .release = qib_port_release, + .sysfs_ops = &qib_sl2vl_ops, + .default_attrs = sl2vl_default_attributes +}; + +/* End sl2vl */ + +/* Start diag_counters */ + +#define QIB_DIAGC_ATTR(N) \ + static struct qib_diagc_attr qib_diagc_attr_##N = { \ + .attr = { .name = __stringify(N), .mode = 0664 }, \ + .counter = offsetof(struct qib_ibport, n_##N) \ + } + +struct qib_diagc_attr { + struct attribute attr; + size_t counter; +}; + +QIB_DIAGC_ATTR(rc_resends); +QIB_DIAGC_ATTR(rc_acks); +QIB_DIAGC_ATTR(rc_qacks); +QIB_DIAGC_ATTR(rc_delayed_comp); +QIB_DIAGC_ATTR(seq_naks); +QIB_DIAGC_ATTR(rdma_seq); +QIB_DIAGC_ATTR(rnr_naks); +QIB_DIAGC_ATTR(other_naks); +QIB_DIAGC_ATTR(rc_timeouts); +QIB_DIAGC_ATTR(loop_pkts); +QIB_DIAGC_ATTR(pkt_drops); +QIB_DIAGC_ATTR(dmawait); +QIB_DIAGC_ATTR(unaligned); +QIB_DIAGC_ATTR(rc_dupreq); +QIB_DIAGC_ATTR(rc_seqnak); + +static struct attribute *diagc_default_attributes[] = { + &qib_diagc_attr_rc_resends.attr, + &qib_diagc_attr_rc_acks.attr, + &qib_diagc_attr_rc_qacks.attr, + &qib_diagc_attr_rc_delayed_comp.attr, + &qib_diagc_attr_seq_naks.attr, + &qib_diagc_attr_rdma_seq.attr, + &qib_diagc_attr_rnr_naks.attr, + &qib_diagc_attr_other_naks.attr, + &qib_diagc_attr_rc_timeouts.attr, + &qib_diagc_attr_loop_pkts.attr, + &qib_diagc_attr_pkt_drops.attr, + &qib_diagc_attr_dmawait.attr, + &qib_diagc_attr_unaligned.attr, + &qib_diagc_attr_rc_dupreq.attr, + &qib_diagc_attr_rc_seqnak.attr, + NULL +}; + +static ssize_t diagc_attr_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct qib_diagc_attr *dattr = + container_of(attr, struct qib_diagc_attr, attr); + struct qib_pportdata *ppd = + container_of(kobj, struct qib_pportdata, diagc_kobj); + struct qib_ibport *qibp = &ppd->ibport_data; + + return sprintf(buf, "%u\n", *(u32 *)((char *)qibp + dattr->counter)); +} + +static ssize_t diagc_attr_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t size) +{ + struct qib_diagc_attr *dattr = + container_of(attr, struct qib_diagc_attr, attr); + struct qib_pportdata *ppd = + container_of(kobj, struct qib_pportdata, diagc_kobj); + struct qib_ibport *qibp = &ppd->ibport_data; + u32 val; + int ret; + + ret = kstrtou32(buf, 0, &val); + if (ret) + return ret; + *(u32 *)((char *) qibp + dattr->counter) = val; + return size; +} + +static const struct sysfs_ops qib_diagc_ops = { + .show = diagc_attr_show, + .store = diagc_attr_store, +}; + +static struct kobj_type qib_diagc_ktype = { + .release = qib_port_release, + .sysfs_ops = &qib_diagc_ops, + .default_attrs = diagc_default_attributes +}; + +/* End diag_counters */ + +/* end of per-port file structures and support code */ + +/* + * Start of per-unit (or driver, in some cases, but replicated + * per unit) functions (these get a device *) + */ +static ssize_t show_rev(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct qib_ibdev *dev = + container_of(device, struct qib_ibdev, ibdev.dev); + + return sprintf(buf, "%x\n", dd_from_dev(dev)->minrev); +} + +static ssize_t show_hca(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct qib_ibdev *dev = + container_of(device, struct qib_ibdev, ibdev.dev); + struct qib_devdata *dd = dd_from_dev(dev); + int ret; + + if (!dd->boardname) + ret = -EINVAL; + else + ret = scnprintf(buf, PAGE_SIZE, "%s\n", dd->boardname); + return ret; +} + +static ssize_t show_version(struct device *device, + struct device_attribute *attr, char *buf) +{ + /* The string printed here is already newline-terminated. */ + return scnprintf(buf, PAGE_SIZE, "%s", (char *)ib_qib_version); +} + +static ssize_t show_boardversion(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct qib_ibdev *dev = + container_of(device, struct qib_ibdev, ibdev.dev); + struct qib_devdata *dd = dd_from_dev(dev); + + /* The string printed here is already newline-terminated. */ + return scnprintf(buf, PAGE_SIZE, "%s", dd->boardversion); +} + + +static ssize_t show_localbus_info(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct qib_ibdev *dev = + container_of(device, struct qib_ibdev, ibdev.dev); + struct qib_devdata *dd = dd_from_dev(dev); + + /* The string printed here is already newline-terminated. */ + return scnprintf(buf, PAGE_SIZE, "%s", dd->lbus_info); +} + + +static ssize_t show_nctxts(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct qib_ibdev *dev = + container_of(device, struct qib_ibdev, ibdev.dev); + struct qib_devdata *dd = dd_from_dev(dev); + + /* Return the number of user ports (contexts) available. */ + /* The calculation below deals with a special case where + * cfgctxts is set to 1 on a single-port board. */ + return scnprintf(buf, PAGE_SIZE, "%u\n", + (dd->first_user_ctxt > dd->cfgctxts) ? 0 : + (dd->cfgctxts - dd->first_user_ctxt)); +} + +static ssize_t show_nfreectxts(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct qib_ibdev *dev = + container_of(device, struct qib_ibdev, ibdev.dev); + struct qib_devdata *dd = dd_from_dev(dev); + + /* Return the number of free user ports (contexts) available. */ + return scnprintf(buf, PAGE_SIZE, "%u\n", dd->freectxts); +} + +static ssize_t show_serial(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct qib_ibdev *dev = + container_of(device, struct qib_ibdev, ibdev.dev); + struct qib_devdata *dd = dd_from_dev(dev); + + buf[sizeof dd->serial] = '\0'; + memcpy(buf, dd->serial, sizeof dd->serial); + strcat(buf, "\n"); + return strlen(buf); +} + +static ssize_t store_chip_reset(struct device *device, + struct device_attribute *attr, const char *buf, + size_t count) +{ + struct qib_ibdev *dev = + container_of(device, struct qib_ibdev, ibdev.dev); + struct qib_devdata *dd = dd_from_dev(dev); + int ret; + + if (count < 5 || memcmp(buf, "reset", 5) || !dd->diag_client) { + ret = -EINVAL; + goto bail; + } + + ret = qib_reset_device(dd->unit); +bail: + return ret < 0 ? ret : count; +} + +static ssize_t show_logged_errs(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct qib_ibdev *dev = + container_of(device, struct qib_ibdev, ibdev.dev); + struct qib_devdata *dd = dd_from_dev(dev); + int idx, count; + + /* force consistency with actual EEPROM */ + if (qib_update_eeprom_log(dd) != 0) + return -ENXIO; + + count = 0; + for (idx = 0; idx < QIB_EEP_LOG_CNT; ++idx) { + count += scnprintf(buf + count, PAGE_SIZE - count, "%d%c", + dd->eep_st_errs[idx], + idx == (QIB_EEP_LOG_CNT - 1) ? '\n' : ' '); + } + + return count; +} + +/* + * Dump tempsense regs. in decimal, to ease shell-scripts. + */ +static ssize_t show_tempsense(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct qib_ibdev *dev = + container_of(device, struct qib_ibdev, ibdev.dev); + struct qib_devdata *dd = dd_from_dev(dev); + int ret; + int idx; + u8 regvals[8]; + + ret = -ENXIO; + for (idx = 0; idx < 8; ++idx) { + if (idx == 6) + continue; + ret = dd->f_tempsense_rd(dd, idx); + if (ret < 0) + break; + regvals[idx] = ret; + } + if (idx == 8) + ret = scnprintf(buf, PAGE_SIZE, "%d %d %02X %02X %d %d\n", + *(signed char *)(regvals), + *(signed char *)(regvals + 1), + regvals[2], regvals[3], + *(signed char *)(regvals + 5), + *(signed char *)(regvals + 7)); + return ret; +} + +/* + * end of per-unit (or driver, in some cases, but replicated + * per unit) functions + */ + +/* start of per-unit file structures and support code */ +static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL); +static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL); +static DEVICE_ATTR(board_id, S_IRUGO, show_hca, NULL); +static DEVICE_ATTR(version, S_IRUGO, show_version, NULL); +static DEVICE_ATTR(nctxts, S_IRUGO, show_nctxts, NULL); +static DEVICE_ATTR(nfreectxts, S_IRUGO, show_nfreectxts, NULL); +static DEVICE_ATTR(serial, S_IRUGO, show_serial, NULL); +static DEVICE_ATTR(boardversion, S_IRUGO, show_boardversion, NULL); +static DEVICE_ATTR(logged_errors, S_IRUGO, show_logged_errs, NULL); +static DEVICE_ATTR(tempsense, S_IRUGO, show_tempsense, NULL); +static DEVICE_ATTR(localbus_info, S_IRUGO, show_localbus_info, NULL); +static DEVICE_ATTR(chip_reset, S_IWUSR, NULL, store_chip_reset); + +static struct device_attribute *qib_attributes[] = { + &dev_attr_hw_rev, + &dev_attr_hca_type, + &dev_attr_board_id, + &dev_attr_version, + &dev_attr_nctxts, + &dev_attr_nfreectxts, + &dev_attr_serial, + &dev_attr_boardversion, + &dev_attr_logged_errors, + &dev_attr_tempsense, + &dev_attr_localbus_info, + &dev_attr_chip_reset, +}; + +int qib_create_port_files(struct ib_device *ibdev, u8 port_num, + struct kobject *kobj) +{ + struct qib_pportdata *ppd; + struct qib_devdata *dd = dd_from_ibdev(ibdev); + int ret; + + if (!port_num || port_num > dd->num_pports) { + qib_dev_err(dd, + "Skipping infiniband class with invalid port %u\n", + port_num); + ret = -ENODEV; + goto bail; + } + ppd = &dd->pport[port_num - 1]; + + ret = kobject_init_and_add(&ppd->pport_kobj, &qib_port_ktype, kobj, + "linkcontrol"); + if (ret) { + qib_dev_err(dd, + "Skipping linkcontrol sysfs info, (err %d) port %u\n", + ret, port_num); + goto bail; + } + kobject_uevent(&ppd->pport_kobj, KOBJ_ADD); + + ret = kobject_init_and_add(&ppd->sl2vl_kobj, &qib_sl2vl_ktype, kobj, + "sl2vl"); + if (ret) { + qib_dev_err(dd, + "Skipping sl2vl sysfs info, (err %d) port %u\n", + ret, port_num); + goto bail_link; + } + kobject_uevent(&ppd->sl2vl_kobj, KOBJ_ADD); + + ret = kobject_init_and_add(&ppd->diagc_kobj, &qib_diagc_ktype, kobj, + "diag_counters"); + if (ret) { + qib_dev_err(dd, + "Skipping diag_counters sysfs info, (err %d) port %u\n", + ret, port_num); + goto bail_sl; + } + kobject_uevent(&ppd->diagc_kobj, KOBJ_ADD); + + if (!qib_cc_table_size || !ppd->congestion_entries_shadow) + return 0; + + ret = kobject_init_and_add(&ppd->pport_cc_kobj, &qib_port_cc_ktype, + kobj, "CCMgtA"); + if (ret) { + qib_dev_err(dd, + "Skipping Congestion Control sysfs info, (err %d) port %u\n", + ret, port_num); + goto bail_diagc; + } + + kobject_uevent(&ppd->pport_cc_kobj, KOBJ_ADD); + + ret = sysfs_create_bin_file(&ppd->pport_cc_kobj, + &cc_setting_bin_attr); + if (ret) { + qib_dev_err(dd, + "Skipping Congestion Control setting sysfs info, (err %d) port %u\n", + ret, port_num); + goto bail_cc; + } + + ret = sysfs_create_bin_file(&ppd->pport_cc_kobj, + &cc_table_bin_attr); + if (ret) { + qib_dev_err(dd, + "Skipping Congestion Control table sysfs info, (err %d) port %u\n", + ret, port_num); + goto bail_cc_entry_bin; + } + + qib_devinfo(dd->pcidev, + "IB%u: Congestion Control Agent enabled for port %d\n", + dd->unit, port_num); + + return 0; + +bail_cc_entry_bin: + sysfs_remove_bin_file(&ppd->pport_cc_kobj, &cc_setting_bin_attr); +bail_cc: + kobject_put(&ppd->pport_cc_kobj); +bail_diagc: + kobject_put(&ppd->diagc_kobj); +bail_sl: + kobject_put(&ppd->sl2vl_kobj); +bail_link: + kobject_put(&ppd->pport_kobj); +bail: + return ret; +} + +/* + * Register and create our files in /sys/class/infiniband. + */ +int qib_verbs_register_sysfs(struct qib_devdata *dd) +{ + struct ib_device *dev = &dd->verbs_dev.ibdev; + int i, ret; + + for (i = 0; i < ARRAY_SIZE(qib_attributes); ++i) { + ret = device_create_file(&dev->dev, qib_attributes[i]); + if (ret) + goto bail; + } + + return 0; +bail: + for (i = 0; i < ARRAY_SIZE(qib_attributes); ++i) + device_remove_file(&dev->dev, qib_attributes[i]); + return ret; +} + +/* + * Unregister and remove our files in /sys/class/infiniband. + */ +void qib_verbs_unregister_sysfs(struct qib_devdata *dd) +{ + struct qib_pportdata *ppd; + int i; + + for (i = 0; i < dd->num_pports; i++) { + ppd = &dd->pport[i]; + if (qib_cc_table_size && + ppd->congestion_entries_shadow) { + sysfs_remove_bin_file(&ppd->pport_cc_kobj, + &cc_setting_bin_attr); + sysfs_remove_bin_file(&ppd->pport_cc_kobj, + &cc_table_bin_attr); + kobject_put(&ppd->pport_cc_kobj); + } + kobject_put(&ppd->sl2vl_kobj); + kobject_put(&ppd->pport_kobj); + } +} diff --git a/drivers/infiniband/hw/qib/qib_twsi.c b/drivers/infiniband/hw/qib/qib_twsi.c new file mode 100644 index 0000000..647f7be --- /dev/null +++ b/drivers/infiniband/hw/qib/qib_twsi.c @@ -0,0 +1,500 @@ +/* + * Copyright (c) 2012 Intel Corporation. All rights reserved. + * Copyright (c) 2006 - 2012 QLogic Corporation. All rights reserved. + * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include + +#include "qib.h" + +/* + * QLogic_IB "Two Wire Serial Interface" driver. + * Originally written for a not-quite-i2c serial eeprom, which is + * still used on some supported boards. Later boards have added a + * variety of other uses, most board-specific, so the bit-boffing + * part has been split off to this file, while the other parts + * have been moved to chip-specific files. + * + * We have also dropped all pretense of fully generic (e.g. pretend + * we don't know whether '1' is the higher voltage) interface, as + * the restrictions of the generic i2c interface (e.g. no access from + * driver itself) make it unsuitable for this use. + */ + +#define READ_CMD 1 +#define WRITE_CMD 0 + +/** + * i2c_wait_for_writes - wait for a write + * @dd: the qlogic_ib device + * + * We use this instead of udelay directly, so we can make sure + * that previous register writes have been flushed all the way + * to the chip. Since we are delaying anyway, the cost doesn't + * hurt, and makes the bit twiddling more regular + */ +static void i2c_wait_for_writes(struct qib_devdata *dd) +{ + /* + * implicit read of EXTStatus is as good as explicit + * read of scratch, if all we want to do is flush + * writes. + */ + dd->f_gpio_mod(dd, 0, 0, 0); + rmb(); /* inlined, so prevent compiler reordering */ +} + +/* + * QSFP modules are allowed to hold SCL low for 500uSec. Allow twice that + * for "almost compliant" modules + */ +#define SCL_WAIT_USEC 1000 + +/* BUF_WAIT is time bus must be free between STOP or ACK and to next START. + * Should be 20, but some chips need more. + */ +#define TWSI_BUF_WAIT_USEC 60 + +static void scl_out(struct qib_devdata *dd, u8 bit) +{ + u32 mask; + + udelay(1); + + mask = 1UL << dd->gpio_scl_num; + + /* SCL is meant to be bare-drain, so never set "OUT", just DIR */ + dd->f_gpio_mod(dd, 0, bit ? 0 : mask, mask); + + /* + * Allow for slow slaves by simple + * delay for falling edge, sampling on rise. + */ + if (!bit) + udelay(2); + else { + int rise_usec; + for (rise_usec = SCL_WAIT_USEC; rise_usec > 0; rise_usec -= 2) { + if (mask & dd->f_gpio_mod(dd, 0, 0, 0)) + break; + udelay(2); + } + if (rise_usec <= 0) + qib_dev_err(dd, "SCL interface stuck low > %d uSec\n", + SCL_WAIT_USEC); + } + i2c_wait_for_writes(dd); +} + +static void sda_out(struct qib_devdata *dd, u8 bit) +{ + u32 mask; + + mask = 1UL << dd->gpio_sda_num; + + /* SDA is meant to be bare-drain, so never set "OUT", just DIR */ + dd->f_gpio_mod(dd, 0, bit ? 0 : mask, mask); + + i2c_wait_for_writes(dd); + udelay(2); +} + +static u8 sda_in(struct qib_devdata *dd, int wait) +{ + int bnum; + u32 read_val, mask; + + bnum = dd->gpio_sda_num; + mask = (1UL << bnum); + /* SDA is meant to be bare-drain, so never set "OUT", just DIR */ + dd->f_gpio_mod(dd, 0, 0, mask); + read_val = dd->f_gpio_mod(dd, 0, 0, 0); + if (wait) + i2c_wait_for_writes(dd); + return (read_val & mask) >> bnum; +} + +/** + * i2c_ackrcv - see if ack following write is true + * @dd: the qlogic_ib device + */ +static int i2c_ackrcv(struct qib_devdata *dd) +{ + u8 ack_received; + + /* AT ENTRY SCL = LOW */ + /* change direction, ignore data */ + ack_received = sda_in(dd, 1); + scl_out(dd, 1); + ack_received = sda_in(dd, 1) == 0; + scl_out(dd, 0); + return ack_received; +} + +static void stop_cmd(struct qib_devdata *dd); + +/** + * rd_byte - read a byte, sending STOP on last, else ACK + * @dd: the qlogic_ib device + * + * Returns byte shifted out of device + */ +static int rd_byte(struct qib_devdata *dd, int last) +{ + int bit_cntr, data; + + data = 0; + + for (bit_cntr = 7; bit_cntr >= 0; --bit_cntr) { + data <<= 1; + scl_out(dd, 1); + data |= sda_in(dd, 0); + scl_out(dd, 0); + } + if (last) { + scl_out(dd, 1); + stop_cmd(dd); + } else { + sda_out(dd, 0); + scl_out(dd, 1); + scl_out(dd, 0); + sda_out(dd, 1); + } + return data; +} + +/** + * wr_byte - write a byte, one bit at a time + * @dd: the qlogic_ib device + * @data: the byte to write + * + * Returns 0 if we got the following ack, otherwise 1 + */ +static int wr_byte(struct qib_devdata *dd, u8 data) +{ + int bit_cntr; + u8 bit; + + for (bit_cntr = 7; bit_cntr >= 0; bit_cntr--) { + bit = (data >> bit_cntr) & 1; + sda_out(dd, bit); + scl_out(dd, 1); + scl_out(dd, 0); + } + return (!i2c_ackrcv(dd)) ? 1 : 0; +} + +/* + * issue TWSI start sequence: + * (both clock/data high, clock high, data low while clock is high) + */ +static void start_seq(struct qib_devdata *dd) +{ + sda_out(dd, 1); + scl_out(dd, 1); + sda_out(dd, 0); + udelay(1); + scl_out(dd, 0); +} + +/** + * stop_seq - transmit the stop sequence + * @dd: the qlogic_ib device + * + * (both clock/data low, clock high, data high while clock is high) + */ +static void stop_seq(struct qib_devdata *dd) +{ + scl_out(dd, 0); + sda_out(dd, 0); + scl_out(dd, 1); + sda_out(dd, 1); +} + +/** + * stop_cmd - transmit the stop condition + * @dd: the qlogic_ib device + * + * (both clock/data low, clock high, data high while clock is high) + */ +static void stop_cmd(struct qib_devdata *dd) +{ + stop_seq(dd); + udelay(TWSI_BUF_WAIT_USEC); +} + +/** + * qib_twsi_reset - reset I2C communication + * @dd: the qlogic_ib device + */ + +int qib_twsi_reset(struct qib_devdata *dd) +{ + int clock_cycles_left = 9; + int was_high = 0; + u32 pins, mask; + + /* Both SCL and SDA should be high. If not, there + * is something wrong. + */ + mask = (1UL << dd->gpio_scl_num) | (1UL << dd->gpio_sda_num); + + /* + * Force pins to desired innocuous state. + * This is the default power-on state with out=0 and dir=0, + * So tri-stated and should be floating high (barring HW problems) + */ + dd->f_gpio_mod(dd, 0, 0, mask); + + /* + * Clock nine times to get all listeners into a sane state. + * If SDA does not go high at any point, we are wedged. + * One vendor recommends then issuing START followed by STOP. + * we cannot use our "normal" functions to do that, because + * if SCL drops between them, another vendor's part will + * wedge, dropping SDA and keeping it low forever, at the end of + * the next transaction (even if it was not the device addressed). + * So our START and STOP take place with SCL held high. + */ + while (clock_cycles_left--) { + scl_out(dd, 0); + scl_out(dd, 1); + /* Note if SDA is high, but keep clocking to sync slave */ + was_high |= sda_in(dd, 0); + } + + if (was_high) { + /* + * We saw a high, which we hope means the slave is sync'd. + * Issue START, STOP, pause for T_BUF. + */ + + pins = dd->f_gpio_mod(dd, 0, 0, 0); + if ((pins & mask) != mask) + qib_dev_err(dd, "GPIO pins not at rest: %d\n", + pins & mask); + /* Drop SDA to issue START */ + udelay(1); /* Guarantee .6 uSec setup */ + sda_out(dd, 0); + udelay(1); /* Guarantee .6 uSec hold */ + /* At this point, SCL is high, SDA low. Raise SDA for STOP */ + sda_out(dd, 1); + udelay(TWSI_BUF_WAIT_USEC); + } + + return !was_high; +} + +#define QIB_TWSI_START 0x100 +#define QIB_TWSI_STOP 0x200 + +/* Write byte to TWSI, optionally prefixed with START or suffixed with + * STOP. + * returns 0 if OK (ACK received), else != 0 + */ +static int qib_twsi_wr(struct qib_devdata *dd, int data, int flags) +{ + int ret = 1; + if (flags & QIB_TWSI_START) + start_seq(dd); + + ret = wr_byte(dd, data); /* Leaves SCL low (from i2c_ackrcv()) */ + + if (flags & QIB_TWSI_STOP) + stop_cmd(dd); + return ret; +} + +/* Added functionality for IBA7220-based cards */ +#define QIB_TEMP_DEV 0x98 + +/* + * qib_twsi_blk_rd + * Formerly called qib_eeprom_internal_read, and only used for eeprom, + * but now the general interface for data transfer from twsi devices. + * One vestige of its former role is that it recognizes a device + * QIB_TWSI_NO_DEV and does the correct operation for the legacy part, + * which responded to all TWSI device codes, interpreting them as + * address within device. On all other devices found on board handled by + * this driver, the device is followed by a one-byte "address" which selects + * the "register" or "offset" within the device from which data should + * be read. + */ +int qib_twsi_blk_rd(struct qib_devdata *dd, int dev, int addr, + void *buffer, int len) +{ + int ret; + u8 *bp = buffer; + + ret = 1; + + if (dev == QIB_TWSI_NO_DEV) { + /* legacy not-really-I2C */ + addr = (addr << 1) | READ_CMD; + ret = qib_twsi_wr(dd, addr, QIB_TWSI_START); + } else { + /* Actual I2C */ + ret = qib_twsi_wr(dd, dev | WRITE_CMD, QIB_TWSI_START); + if (ret) { + stop_cmd(dd); + ret = 1; + goto bail; + } + /* + * SFF spec claims we do _not_ stop after the addr + * but simply issue a start with the "read" dev-addr. + * Since we are implicitely waiting for ACK here, + * we need t_buf (nominally 20uSec) before that start, + * and cannot rely on the delay built in to the STOP + */ + ret = qib_twsi_wr(dd, addr, 0); + udelay(TWSI_BUF_WAIT_USEC); + + if (ret) { + qib_dev_err(dd, + "Failed to write interface read addr %02X\n", + addr); + ret = 1; + goto bail; + } + ret = qib_twsi_wr(dd, dev | READ_CMD, QIB_TWSI_START); + } + if (ret) { + stop_cmd(dd); + ret = 1; + goto bail; + } + + /* + * block devices keeps clocking data out as long as we ack, + * automatically incrementing the address. Some have "pages" + * whose boundaries will not be crossed, but the handling + * of these is left to the caller, who is in a better + * position to know. + */ + while (len-- > 0) { + /* + * Get and store data, sending ACK if length remaining, + * else STOP + */ + *bp++ = rd_byte(dd, !len); + } + + ret = 0; + +bail: + return ret; +} + +/* + * qib_twsi_blk_wr + * Formerly called qib_eeprom_internal_write, and only used for eeprom, + * but now the general interface for data transfer to twsi devices. + * One vestige of its former role is that it recognizes a device + * QIB_TWSI_NO_DEV and does the correct operation for the legacy part, + * which responded to all TWSI device codes, interpreting them as + * address within device. On all other devices found on board handled by + * this driver, the device is followed by a one-byte "address" which selects + * the "register" or "offset" within the device to which data should + * be written. + */ +int qib_twsi_blk_wr(struct qib_devdata *dd, int dev, int addr, + const void *buffer, int len) +{ + int sub_len; + const u8 *bp = buffer; + int max_wait_time, i; + int ret; + ret = 1; + + while (len > 0) { + if (dev == QIB_TWSI_NO_DEV) { + if (qib_twsi_wr(dd, (addr << 1) | WRITE_CMD, + QIB_TWSI_START)) { + goto failed_write; + } + } else { + /* Real I2C */ + if (qib_twsi_wr(dd, dev | WRITE_CMD, QIB_TWSI_START)) + goto failed_write; + ret = qib_twsi_wr(dd, addr, 0); + if (ret) { + qib_dev_err(dd, + "Failed to write interface write addr %02X\n", + addr); + goto failed_write; + } + } + + sub_len = min(len, 4); + addr += sub_len; + len -= sub_len; + + for (i = 0; i < sub_len; i++) + if (qib_twsi_wr(dd, *bp++, 0)) + goto failed_write; + + stop_cmd(dd); + + /* + * Wait for write complete by waiting for a successful + * read (the chip replies with a zero after the write + * cmd completes, and before it writes to the eeprom. + * The startcmd for the read will fail the ack until + * the writes have completed. We do this inline to avoid + * the debug prints that are in the real read routine + * if the startcmd fails. + * We also use the proper device address, so it doesn't matter + * whether we have real eeprom_dev. Legacy likes any address. + */ + max_wait_time = 100; + while (qib_twsi_wr(dd, dev | READ_CMD, QIB_TWSI_START)) { + stop_cmd(dd); + if (!--max_wait_time) + goto failed_write; + } + /* now read (and ignore) the resulting byte */ + rd_byte(dd, 1); + } + + ret = 0; + goto bail; + +failed_write: + stop_cmd(dd); + ret = 1; + +bail: + return ret; +} diff --git a/drivers/infiniband/hw/qib/qib_tx.c b/drivers/infiniband/hw/qib/qib_tx.c new file mode 100644 index 0000000..31d3561 --- /dev/null +++ b/drivers/infiniband/hw/qib/qib_tx.c @@ -0,0 +1,571 @@ +/* + * Copyright (c) 2008, 2009, 2010 QLogic Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "qib.h" + +static unsigned qib_hol_timeout_ms = 3000; +module_param_named(hol_timeout_ms, qib_hol_timeout_ms, uint, S_IRUGO); +MODULE_PARM_DESC(hol_timeout_ms, + "duration of user app suspension after link failure"); + +unsigned qib_sdma_fetch_arb = 1; +module_param_named(fetch_arb, qib_sdma_fetch_arb, uint, S_IRUGO); +MODULE_PARM_DESC(fetch_arb, "IBA7220: change SDMA descriptor arbitration"); + +/** + * qib_disarm_piobufs - cancel a range of PIO buffers + * @dd: the qlogic_ib device + * @first: the first PIO buffer to cancel + * @cnt: the number of PIO buffers to cancel + * + * Cancel a range of PIO buffers. Used at user process close, + * in case it died while writing to a PIO buffer. + */ +void qib_disarm_piobufs(struct qib_devdata *dd, unsigned first, unsigned cnt) +{ + unsigned long flags; + unsigned i; + unsigned last; + + last = first + cnt; + spin_lock_irqsave(&dd->pioavail_lock, flags); + for (i = first; i < last; i++) { + __clear_bit(i, dd->pio_need_disarm); + dd->f_sendctrl(dd->pport, QIB_SENDCTRL_DISARM_BUF(i)); + } + spin_unlock_irqrestore(&dd->pioavail_lock, flags); +} + +/* + * This is called by a user process when it sees the DISARM_BUFS event + * bit is set. + */ +int qib_disarm_piobufs_ifneeded(struct qib_ctxtdata *rcd) +{ + struct qib_devdata *dd = rcd->dd; + unsigned i; + unsigned last; + unsigned n = 0; + + last = rcd->pio_base + rcd->piocnt; + /* + * Don't need uctxt_lock here, since user has called in to us. + * Clear at start in case more interrupts set bits while we + * are disarming + */ + if (rcd->user_event_mask) { + /* + * subctxt_cnt is 0 if not shared, so do base + * separately, first, then remaining subctxt, if any + */ + clear_bit(_QIB_EVENT_DISARM_BUFS_BIT, &rcd->user_event_mask[0]); + for (i = 1; i < rcd->subctxt_cnt; i++) + clear_bit(_QIB_EVENT_DISARM_BUFS_BIT, + &rcd->user_event_mask[i]); + } + spin_lock_irq(&dd->pioavail_lock); + for (i = rcd->pio_base; i < last; i++) { + if (__test_and_clear_bit(i, dd->pio_need_disarm)) { + n++; + dd->f_sendctrl(rcd->ppd, QIB_SENDCTRL_DISARM_BUF(i)); + } + } + spin_unlock_irq(&dd->pioavail_lock); + return 0; +} + +static struct qib_pportdata *is_sdma_buf(struct qib_devdata *dd, unsigned i) +{ + struct qib_pportdata *ppd; + unsigned pidx; + + for (pidx = 0; pidx < dd->num_pports; pidx++) { + ppd = dd->pport + pidx; + if (i >= ppd->sdma_state.first_sendbuf && + i < ppd->sdma_state.last_sendbuf) + return ppd; + } + return NULL; +} + +/* + * Return true if send buffer is being used by a user context. + * Sets _QIB_EVENT_DISARM_BUFS_BIT in user_event_mask as a side effect + */ +static int find_ctxt(struct qib_devdata *dd, unsigned bufn) +{ + struct qib_ctxtdata *rcd; + unsigned ctxt; + int ret = 0; + + spin_lock(&dd->uctxt_lock); + for (ctxt = dd->first_user_ctxt; ctxt < dd->cfgctxts; ctxt++) { + rcd = dd->rcd[ctxt]; + if (!rcd || bufn < rcd->pio_base || + bufn >= rcd->pio_base + rcd->piocnt) + continue; + if (rcd->user_event_mask) { + int i; + /* + * subctxt_cnt is 0 if not shared, so do base + * separately, first, then remaining subctxt, if any + */ + set_bit(_QIB_EVENT_DISARM_BUFS_BIT, + &rcd->user_event_mask[0]); + for (i = 1; i < rcd->subctxt_cnt; i++) + set_bit(_QIB_EVENT_DISARM_BUFS_BIT, + &rcd->user_event_mask[i]); + } + ret = 1; + break; + } + spin_unlock(&dd->uctxt_lock); + + return ret; +} + +/* + * Disarm a set of send buffers. If the buffer might be actively being + * written to, mark the buffer to be disarmed later when it is not being + * written to. + * + * This should only be called from the IRQ error handler. + */ +void qib_disarm_piobufs_set(struct qib_devdata *dd, unsigned long *mask, + unsigned cnt) +{ + struct qib_pportdata *ppd, *pppd[QIB_MAX_IB_PORTS]; + unsigned i; + unsigned long flags; + + for (i = 0; i < dd->num_pports; i++) + pppd[i] = NULL; + + for (i = 0; i < cnt; i++) { + int which; + if (!test_bit(i, mask)) + continue; + /* + * If the buffer is owned by the DMA hardware, + * reset the DMA engine. + */ + ppd = is_sdma_buf(dd, i); + if (ppd) { + pppd[ppd->port] = ppd; + continue; + } + /* + * If the kernel is writing the buffer or the buffer is + * owned by a user process, we can't clear it yet. + */ + spin_lock_irqsave(&dd->pioavail_lock, flags); + if (test_bit(i, dd->pio_writing) || + (!test_bit(i << 1, dd->pioavailkernel) && + find_ctxt(dd, i))) { + __set_bit(i, dd->pio_need_disarm); + which = 0; + } else { + which = 1; + dd->f_sendctrl(dd->pport, QIB_SENDCTRL_DISARM_BUF(i)); + } + spin_unlock_irqrestore(&dd->pioavail_lock, flags); + } + + /* do cancel_sends once per port that had sdma piobufs in error */ + for (i = 0; i < dd->num_pports; i++) + if (pppd[i]) + qib_cancel_sends(pppd[i]); +} + +/** + * update_send_bufs - update shadow copy of the PIO availability map + * @dd: the qlogic_ib device + * + * called whenever our local copy indicates we have run out of send buffers + */ +static void update_send_bufs(struct qib_devdata *dd) +{ + unsigned long flags; + unsigned i; + const unsigned piobregs = dd->pioavregs; + + /* + * If the generation (check) bits have changed, then we update the + * busy bit for the corresponding PIO buffer. This algorithm will + * modify positions to the value they already have in some cases + * (i.e., no change), but it's faster than changing only the bits + * that have changed. + * + * We would like to do this atomicly, to avoid spinlocks in the + * critical send path, but that's not really possible, given the + * type of changes, and that this routine could be called on + * multiple cpu's simultaneously, so we lock in this routine only, + * to avoid conflicting updates; all we change is the shadow, and + * it's a single 64 bit memory location, so by definition the update + * is atomic in terms of what other cpu's can see in testing the + * bits. The spin_lock overhead isn't too bad, since it only + * happens when all buffers are in use, so only cpu overhead, not + * latency or bandwidth is affected. + */ + if (!dd->pioavailregs_dma) + return; + spin_lock_irqsave(&dd->pioavail_lock, flags); + for (i = 0; i < piobregs; i++) { + u64 pchbusy, pchg, piov, pnew; + + piov = le64_to_cpu(dd->pioavailregs_dma[i]); + pchg = dd->pioavailkernel[i] & + ~(dd->pioavailshadow[i] ^ piov); + pchbusy = pchg << QLOGIC_IB_SENDPIOAVAIL_BUSY_SHIFT; + if (pchg && (pchbusy & dd->pioavailshadow[i])) { + pnew = dd->pioavailshadow[i] & ~pchbusy; + pnew |= piov & pchbusy; + dd->pioavailshadow[i] = pnew; + } + } + spin_unlock_irqrestore(&dd->pioavail_lock, flags); +} + +/* + * Debugging code and stats updates if no pio buffers available. + */ +static noinline void no_send_bufs(struct qib_devdata *dd) +{ + dd->upd_pio_shadow = 1; + + /* not atomic, but if we lose a stat count in a while, that's OK */ + qib_stats.sps_nopiobufs++; +} + +/* + * Common code for normal driver send buffer allocation, and reserved + * allocation. + * + * Do appropriate marking as busy, etc. + * Returns buffer pointer if one is found, otherwise NULL. + */ +u32 __iomem *qib_getsendbuf_range(struct qib_devdata *dd, u32 *pbufnum, + u32 first, u32 last) +{ + unsigned i, j, updated = 0; + unsigned nbufs; + unsigned long flags; + unsigned long *shadow = dd->pioavailshadow; + u32 __iomem *buf; + + if (!(dd->flags & QIB_PRESENT)) + return NULL; + + nbufs = last - first + 1; /* number in range to check */ + if (dd->upd_pio_shadow) { +update_shadow: + /* + * Minor optimization. If we had no buffers on last call, + * start out by doing the update; continue and do scan even + * if no buffers were updated, to be paranoid. + */ + update_send_bufs(dd); + updated++; + } + i = first; + /* + * While test_and_set_bit() is atomic, we do that and then the + * change_bit(), and the pair is not. See if this is the cause + * of the remaining armlaunch errors. + */ + spin_lock_irqsave(&dd->pioavail_lock, flags); + if (dd->last_pio >= first && dd->last_pio <= last) + i = dd->last_pio + 1; + if (!first) + /* adjust to min possible */ + nbufs = last - dd->min_kernel_pio + 1; + for (j = 0; j < nbufs; j++, i++) { + if (i > last) + i = !first ? dd->min_kernel_pio : first; + if (__test_and_set_bit((2 * i) + 1, shadow)) + continue; + /* flip generation bit */ + __change_bit(2 * i, shadow); + /* remember that the buffer can be written to now */ + __set_bit(i, dd->pio_writing); + if (!first && first != last) /* first == last on VL15, avoid */ + dd->last_pio = i; + break; + } + spin_unlock_irqrestore(&dd->pioavail_lock, flags); + + if (j == nbufs) { + if (!updated) + /* + * First time through; shadow exhausted, but may be + * buffers available, try an update and then rescan. + */ + goto update_shadow; + no_send_bufs(dd); + buf = NULL; + } else { + if (i < dd->piobcnt2k) + buf = (u32 __iomem *)(dd->pio2kbase + + i * dd->palign); + else if (i < dd->piobcnt2k + dd->piobcnt4k || !dd->piovl15base) + buf = (u32 __iomem *)(dd->pio4kbase + + (i - dd->piobcnt2k) * dd->align4k); + else + buf = (u32 __iomem *)(dd->piovl15base + + (i - (dd->piobcnt2k + dd->piobcnt4k)) * + dd->align4k); + if (pbufnum) + *pbufnum = i; + dd->upd_pio_shadow = 0; + } + + return buf; +} + +/* + * Record that the caller is finished writing to the buffer so we don't + * disarm it while it is being written and disarm it now if needed. + */ +void qib_sendbuf_done(struct qib_devdata *dd, unsigned n) +{ + unsigned long flags; + + spin_lock_irqsave(&dd->pioavail_lock, flags); + __clear_bit(n, dd->pio_writing); + if (__test_and_clear_bit(n, dd->pio_need_disarm)) + dd->f_sendctrl(dd->pport, QIB_SENDCTRL_DISARM_BUF(n)); + spin_unlock_irqrestore(&dd->pioavail_lock, flags); +} + +/** + * qib_chg_pioavailkernel - change which send buffers are available for kernel + * @dd: the qlogic_ib device + * @start: the starting send buffer number + * @len: the number of send buffers + * @avail: true if the buffers are available for kernel use, false otherwise + */ +void qib_chg_pioavailkernel(struct qib_devdata *dd, unsigned start, + unsigned len, u32 avail, struct qib_ctxtdata *rcd) +{ + unsigned long flags; + unsigned end; + unsigned ostart = start; + + /* There are two bits per send buffer (busy and generation) */ + start *= 2; + end = start + len * 2; + + spin_lock_irqsave(&dd->pioavail_lock, flags); + /* Set or clear the busy bit in the shadow. */ + while (start < end) { + if (avail) { + unsigned long dma; + int i; + + /* + * The BUSY bit will never be set, because we disarm + * the user buffers before we hand them back to the + * kernel. We do have to make sure the generation + * bit is set correctly in shadow, since it could + * have changed many times while allocated to user. + * We can't use the bitmap functions on the full + * dma array because it is always little-endian, so + * we have to flip to host-order first. + * BITS_PER_LONG is slightly wrong, since it's + * always 64 bits per register in chip... + * We only work on 64 bit kernels, so that's OK. + */ + i = start / BITS_PER_LONG; + __clear_bit(QLOGIC_IB_SENDPIOAVAIL_BUSY_SHIFT + start, + dd->pioavailshadow); + dma = (unsigned long) + le64_to_cpu(dd->pioavailregs_dma[i]); + if (test_bit((QLOGIC_IB_SENDPIOAVAIL_CHECK_SHIFT + + start) % BITS_PER_LONG, &dma)) + __set_bit(QLOGIC_IB_SENDPIOAVAIL_CHECK_SHIFT + + start, dd->pioavailshadow); + else + __clear_bit(QLOGIC_IB_SENDPIOAVAIL_CHECK_SHIFT + + start, dd->pioavailshadow); + __set_bit(start, dd->pioavailkernel); + if ((start >> 1) < dd->min_kernel_pio) + dd->min_kernel_pio = start >> 1; + } else { + __set_bit(start + QLOGIC_IB_SENDPIOAVAIL_BUSY_SHIFT, + dd->pioavailshadow); + __clear_bit(start, dd->pioavailkernel); + if ((start >> 1) > dd->min_kernel_pio) + dd->min_kernel_pio = start >> 1; + } + start += 2; + } + + if (dd->min_kernel_pio > 0 && dd->last_pio < dd->min_kernel_pio - 1) + dd->last_pio = dd->min_kernel_pio - 1; + spin_unlock_irqrestore(&dd->pioavail_lock, flags); + + dd->f_txchk_change(dd, ostart, len, avail, rcd); +} + +/* + * Flush all sends that might be in the ready to send state, as well as any + * that are in the process of being sent. Used whenever we need to be + * sure the send side is idle. Cleans up all buffer state by canceling + * all pio buffers, and issuing an abort, which cleans up anything in the + * launch fifo. The cancel is superfluous on some chip versions, but + * it's safer to always do it. + * PIOAvail bits are updated by the chip as if a normal send had happened. + */ +void qib_cancel_sends(struct qib_pportdata *ppd) +{ + struct qib_devdata *dd = ppd->dd; + struct qib_ctxtdata *rcd; + unsigned long flags; + unsigned ctxt; + unsigned i; + unsigned last; + + /* + * Tell PSM to disarm buffers again before trying to reuse them. + * We need to be sure the rcd doesn't change out from under us + * while we do so. We hold the two locks sequentially. We might + * needlessly set some need_disarm bits as a result, if the + * context is closed after we release the uctxt_lock, but that's + * fairly benign, and safer than nesting the locks. + */ + for (ctxt = dd->first_user_ctxt; ctxt < dd->cfgctxts; ctxt++) { + spin_lock_irqsave(&dd->uctxt_lock, flags); + rcd = dd->rcd[ctxt]; + if (rcd && rcd->ppd == ppd) { + last = rcd->pio_base + rcd->piocnt; + if (rcd->user_event_mask) { + /* + * subctxt_cnt is 0 if not shared, so do base + * separately, first, then remaining subctxt, + * if any + */ + set_bit(_QIB_EVENT_DISARM_BUFS_BIT, + &rcd->user_event_mask[0]); + for (i = 1; i < rcd->subctxt_cnt; i++) + set_bit(_QIB_EVENT_DISARM_BUFS_BIT, + &rcd->user_event_mask[i]); + } + i = rcd->pio_base; + spin_unlock_irqrestore(&dd->uctxt_lock, flags); + spin_lock_irqsave(&dd->pioavail_lock, flags); + for (; i < last; i++) + __set_bit(i, dd->pio_need_disarm); + spin_unlock_irqrestore(&dd->pioavail_lock, flags); + } else + spin_unlock_irqrestore(&dd->uctxt_lock, flags); + } + + if (!(dd->flags & QIB_HAS_SEND_DMA)) + dd->f_sendctrl(ppd, QIB_SENDCTRL_DISARM_ALL | + QIB_SENDCTRL_FLUSH); +} + +/* + * Force an update of in-memory copy of the pioavail registers, when + * needed for any of a variety of reasons. + * If already off, this routine is a nop, on the assumption that the + * caller (or set of callers) will "do the right thing". + * This is a per-device operation, so just the first port. + */ +void qib_force_pio_avail_update(struct qib_devdata *dd) +{ + dd->f_sendctrl(dd->pport, QIB_SENDCTRL_AVAIL_BLIP); +} + +void qib_hol_down(struct qib_pportdata *ppd) +{ + /* + * Cancel sends when the link goes DOWN so that we aren't doing it + * at INIT when we might be trying to send SMI packets. + */ + if (!(ppd->lflags & QIBL_IB_AUTONEG_INPROG)) + qib_cancel_sends(ppd); +} + +/* + * Link is at INIT. + * We start the HoL timer so we can detect stuck packets blocking SMP replies. + * Timer may already be running, so use mod_timer, not add_timer. + */ +void qib_hol_init(struct qib_pportdata *ppd) +{ + if (ppd->hol_state != QIB_HOL_INIT) { + ppd->hol_state = QIB_HOL_INIT; + mod_timer(&ppd->hol_timer, + jiffies + msecs_to_jiffies(qib_hol_timeout_ms)); + } +} + +/* + * Link is up, continue any user processes, and ensure timer + * is a nop, if running. Let timer keep running, if set; it + * will nop when it sees the link is up. + */ +void qib_hol_up(struct qib_pportdata *ppd) +{ + ppd->hol_state = QIB_HOL_UP; +} + +/* + * This is only called via the timer. + */ +void qib_hol_event(unsigned long opaque) +{ + struct qib_pportdata *ppd = (struct qib_pportdata *)opaque; + + /* If hardware error, etc, skip. */ + if (!(ppd->dd->flags & QIB_INITTED)) + return; + + if (ppd->hol_state != QIB_HOL_UP) { + /* + * Try to flush sends in case a stuck packet is blocking + * SMP replies. + */ + qib_hol_down(ppd); + mod_timer(&ppd->hol_timer, + jiffies + msecs_to_jiffies(qib_hol_timeout_ms)); + } +} diff --git a/drivers/infiniband/hw/qib/qib_uc.c b/drivers/infiniband/hw/qib/qib_uc.c new file mode 100644 index 0000000..aa3a803 --- /dev/null +++ b/drivers/infiniband/hw/qib/qib_uc.c @@ -0,0 +1,536 @@ +/* + * Copyright (c) 2006, 2007, 2008, 2009, 2010 QLogic Corporation. + * All rights reserved. + * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "qib.h" + +/* cut down ridiculously long IB macro names */ +#define OP(x) IB_OPCODE_UC_##x + +/** + * qib_make_uc_req - construct a request packet (SEND, RDMA write) + * @qp: a pointer to the QP + * + * Return 1 if constructed; otherwise, return 0. + */ +int qib_make_uc_req(struct qib_qp *qp) +{ + struct qib_other_headers *ohdr; + struct qib_swqe *wqe; + unsigned long flags; + u32 hwords; + u32 bth0; + u32 len; + u32 pmtu = qp->pmtu; + int ret = 0; + + spin_lock_irqsave(&qp->s_lock, flags); + + if (!(ib_qib_state_ops[qp->state] & QIB_PROCESS_SEND_OK)) { + if (!(ib_qib_state_ops[qp->state] & QIB_FLUSH_SEND)) + goto bail; + /* We are in the error state, flush the work request. */ + if (qp->s_last == qp->s_head) + goto bail; + /* If DMAs are in progress, we can't flush immediately. */ + if (atomic_read(&qp->s_dma_busy)) { + qp->s_flags |= QIB_S_WAIT_DMA; + goto bail; + } + wqe = get_swqe_ptr(qp, qp->s_last); + qib_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR); + goto done; + } + + ohdr = &qp->s_hdr->u.oth; + if (qp->remote_ah_attr.ah_flags & IB_AH_GRH) + ohdr = &qp->s_hdr->u.l.oth; + + /* header size in 32-bit words LRH+BTH = (8+12)/4. */ + hwords = 5; + bth0 = 0; + + /* Get the next send request. */ + wqe = get_swqe_ptr(qp, qp->s_cur); + qp->s_wqe = NULL; + switch (qp->s_state) { + default: + if (!(ib_qib_state_ops[qp->state] & + QIB_PROCESS_NEXT_SEND_OK)) + goto bail; + /* Check if send work queue is empty. */ + if (qp->s_cur == qp->s_head) + goto bail; + /* + * Start a new request. + */ + wqe->psn = qp->s_next_psn; + qp->s_psn = qp->s_next_psn; + qp->s_sge.sge = wqe->sg_list[0]; + qp->s_sge.sg_list = wqe->sg_list + 1; + qp->s_sge.num_sge = wqe->wr.num_sge; + qp->s_sge.total_len = wqe->length; + len = wqe->length; + qp->s_len = len; + switch (wqe->wr.opcode) { + case IB_WR_SEND: + case IB_WR_SEND_WITH_IMM: + if (len > pmtu) { + qp->s_state = OP(SEND_FIRST); + len = pmtu; + break; + } + if (wqe->wr.opcode == IB_WR_SEND) + qp->s_state = OP(SEND_ONLY); + else { + qp->s_state = + OP(SEND_ONLY_WITH_IMMEDIATE); + /* Immediate data comes after the BTH */ + ohdr->u.imm_data = wqe->wr.ex.imm_data; + hwords += 1; + } + if (wqe->wr.send_flags & IB_SEND_SOLICITED) + bth0 |= IB_BTH_SOLICITED; + qp->s_wqe = wqe; + if (++qp->s_cur >= qp->s_size) + qp->s_cur = 0; + break; + + case IB_WR_RDMA_WRITE: + case IB_WR_RDMA_WRITE_WITH_IMM: + ohdr->u.rc.reth.vaddr = + cpu_to_be64(wqe->wr.wr.rdma.remote_addr); + ohdr->u.rc.reth.rkey = + cpu_to_be32(wqe->wr.wr.rdma.rkey); + ohdr->u.rc.reth.length = cpu_to_be32(len); + hwords += sizeof(struct ib_reth) / 4; + if (len > pmtu) { + qp->s_state = OP(RDMA_WRITE_FIRST); + len = pmtu; + break; + } + if (wqe->wr.opcode == IB_WR_RDMA_WRITE) + qp->s_state = OP(RDMA_WRITE_ONLY); + else { + qp->s_state = + OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE); + /* Immediate data comes after the RETH */ + ohdr->u.rc.imm_data = wqe->wr.ex.imm_data; + hwords += 1; + if (wqe->wr.send_flags & IB_SEND_SOLICITED) + bth0 |= IB_BTH_SOLICITED; + } + qp->s_wqe = wqe; + if (++qp->s_cur >= qp->s_size) + qp->s_cur = 0; + break; + + default: + goto bail; + } + break; + + case OP(SEND_FIRST): + qp->s_state = OP(SEND_MIDDLE); + /* FALLTHROUGH */ + case OP(SEND_MIDDLE): + len = qp->s_len; + if (len > pmtu) { + len = pmtu; + break; + } + if (wqe->wr.opcode == IB_WR_SEND) + qp->s_state = OP(SEND_LAST); + else { + qp->s_state = OP(SEND_LAST_WITH_IMMEDIATE); + /* Immediate data comes after the BTH */ + ohdr->u.imm_data = wqe->wr.ex.imm_data; + hwords += 1; + } + if (wqe->wr.send_flags & IB_SEND_SOLICITED) + bth0 |= IB_BTH_SOLICITED; + qp->s_wqe = wqe; + if (++qp->s_cur >= qp->s_size) + qp->s_cur = 0; + break; + + case OP(RDMA_WRITE_FIRST): + qp->s_state = OP(RDMA_WRITE_MIDDLE); + /* FALLTHROUGH */ + case OP(RDMA_WRITE_MIDDLE): + len = qp->s_len; + if (len > pmtu) { + len = pmtu; + break; + } + if (wqe->wr.opcode == IB_WR_RDMA_WRITE) + qp->s_state = OP(RDMA_WRITE_LAST); + else { + qp->s_state = + OP(RDMA_WRITE_LAST_WITH_IMMEDIATE); + /* Immediate data comes after the BTH */ + ohdr->u.imm_data = wqe->wr.ex.imm_data; + hwords += 1; + if (wqe->wr.send_flags & IB_SEND_SOLICITED) + bth0 |= IB_BTH_SOLICITED; + } + qp->s_wqe = wqe; + if (++qp->s_cur >= qp->s_size) + qp->s_cur = 0; + break; + } + qp->s_len -= len; + qp->s_hdrwords = hwords; + qp->s_cur_sge = &qp->s_sge; + qp->s_cur_size = len; + qib_make_ruc_header(qp, ohdr, bth0 | (qp->s_state << 24), + qp->s_next_psn++ & QIB_PSN_MASK); +done: + ret = 1; + goto unlock; + +bail: + qp->s_flags &= ~QIB_S_BUSY; +unlock: + spin_unlock_irqrestore(&qp->s_lock, flags); + return ret; +} + +/** + * qib_uc_rcv - handle an incoming UC packet + * @ibp: the port the packet came in on + * @hdr: the header of the packet + * @has_grh: true if the packet has a GRH + * @data: the packet data + * @tlen: the length of the packet + * @qp: the QP for this packet. + * + * This is called from qib_qp_rcv() to process an incoming UC packet + * for the given QP. + * Called at interrupt level. + */ +void qib_uc_rcv(struct qib_ibport *ibp, struct qib_ib_header *hdr, + int has_grh, void *data, u32 tlen, struct qib_qp *qp) +{ + struct qib_other_headers *ohdr; + u32 opcode; + u32 hdrsize; + u32 psn; + u32 pad; + struct ib_wc wc; + u32 pmtu = qp->pmtu; + struct ib_reth *reth; + int ret; + + /* Check for GRH */ + if (!has_grh) { + ohdr = &hdr->u.oth; + hdrsize = 8 + 12; /* LRH + BTH */ + } else { + ohdr = &hdr->u.l.oth; + hdrsize = 8 + 40 + 12; /* LRH + GRH + BTH */ + } + + opcode = be32_to_cpu(ohdr->bth[0]); + if (qib_ruc_check_hdr(ibp, hdr, has_grh, qp, opcode)) + return; + + psn = be32_to_cpu(ohdr->bth[2]); + opcode >>= 24; + + /* Compare the PSN verses the expected PSN. */ + if (unlikely(qib_cmp24(psn, qp->r_psn) != 0)) { + /* + * Handle a sequence error. + * Silently drop any current message. + */ + qp->r_psn = psn; +inv: + if (qp->r_state == OP(SEND_FIRST) || + qp->r_state == OP(SEND_MIDDLE)) { + set_bit(QIB_R_REWIND_SGE, &qp->r_aflags); + qp->r_sge.num_sge = 0; + } else + qib_put_ss(&qp->r_sge); + qp->r_state = OP(SEND_LAST); + switch (opcode) { + case OP(SEND_FIRST): + case OP(SEND_ONLY): + case OP(SEND_ONLY_WITH_IMMEDIATE): + goto send_first; + + case OP(RDMA_WRITE_FIRST): + case OP(RDMA_WRITE_ONLY): + case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE): + goto rdma_first; + + default: + goto drop; + } + } + + /* Check for opcode sequence errors. */ + switch (qp->r_state) { + case OP(SEND_FIRST): + case OP(SEND_MIDDLE): + if (opcode == OP(SEND_MIDDLE) || + opcode == OP(SEND_LAST) || + opcode == OP(SEND_LAST_WITH_IMMEDIATE)) + break; + goto inv; + + case OP(RDMA_WRITE_FIRST): + case OP(RDMA_WRITE_MIDDLE): + if (opcode == OP(RDMA_WRITE_MIDDLE) || + opcode == OP(RDMA_WRITE_LAST) || + opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE)) + break; + goto inv; + + default: + if (opcode == OP(SEND_FIRST) || + opcode == OP(SEND_ONLY) || + opcode == OP(SEND_ONLY_WITH_IMMEDIATE) || + opcode == OP(RDMA_WRITE_FIRST) || + opcode == OP(RDMA_WRITE_ONLY) || + opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE)) + break; + goto inv; + } + + if (qp->state == IB_QPS_RTR && !(qp->r_flags & QIB_R_COMM_EST)) { + qp->r_flags |= QIB_R_COMM_EST; + if (qp->ibqp.event_handler) { + struct ib_event ev; + + ev.device = qp->ibqp.device; + ev.element.qp = &qp->ibqp; + ev.event = IB_EVENT_COMM_EST; + qp->ibqp.event_handler(&ev, qp->ibqp.qp_context); + } + } + + /* OK, process the packet. */ + switch (opcode) { + case OP(SEND_FIRST): + case OP(SEND_ONLY): + case OP(SEND_ONLY_WITH_IMMEDIATE): +send_first: + if (test_and_clear_bit(QIB_R_REWIND_SGE, &qp->r_aflags)) + qp->r_sge = qp->s_rdma_read_sge; + else { + ret = qib_get_rwqe(qp, 0); + if (ret < 0) + goto op_err; + if (!ret) + goto drop; + /* + * qp->s_rdma_read_sge will be the owner + * of the mr references. + */ + qp->s_rdma_read_sge = qp->r_sge; + } + qp->r_rcv_len = 0; + if (opcode == OP(SEND_ONLY)) + goto no_immediate_data; + else if (opcode == OP(SEND_ONLY_WITH_IMMEDIATE)) + goto send_last_imm; + /* FALLTHROUGH */ + case OP(SEND_MIDDLE): + /* Check for invalid length PMTU or posted rwqe len. */ + if (unlikely(tlen != (hdrsize + pmtu + 4))) + goto rewind; + qp->r_rcv_len += pmtu; + if (unlikely(qp->r_rcv_len > qp->r_len)) + goto rewind; + qib_copy_sge(&qp->r_sge, data, pmtu, 0); + break; + + case OP(SEND_LAST_WITH_IMMEDIATE): +send_last_imm: + wc.ex.imm_data = ohdr->u.imm_data; + hdrsize += 4; + wc.wc_flags = IB_WC_WITH_IMM; + goto send_last; + case OP(SEND_LAST): +no_immediate_data: + wc.ex.imm_data = 0; + wc.wc_flags = 0; +send_last: + /* Get the number of bytes the message was padded by. */ + pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3; + /* Check for invalid length. */ + /* XXX LAST len should be >= 1 */ + if (unlikely(tlen < (hdrsize + pad + 4))) + goto rewind; + /* Don't count the CRC. */ + tlen -= (hdrsize + pad + 4); + wc.byte_len = tlen + qp->r_rcv_len; + if (unlikely(wc.byte_len > qp->r_len)) + goto rewind; + wc.opcode = IB_WC_RECV; + qib_copy_sge(&qp->r_sge, data, tlen, 0); + qib_put_ss(&qp->s_rdma_read_sge); +last_imm: + wc.wr_id = qp->r_wr_id; + wc.status = IB_WC_SUCCESS; + wc.qp = &qp->ibqp; + wc.src_qp = qp->remote_qpn; + wc.slid = qp->remote_ah_attr.dlid; + wc.sl = qp->remote_ah_attr.sl; + /* zero fields that are N/A */ + wc.vendor_err = 0; + wc.pkey_index = 0; + wc.dlid_path_bits = 0; + wc.port_num = 0; + /* Signal completion event if the solicited bit is set. */ + qib_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, + (ohdr->bth[0] & + cpu_to_be32(IB_BTH_SOLICITED)) != 0); + break; + + case OP(RDMA_WRITE_FIRST): + case OP(RDMA_WRITE_ONLY): + case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE): /* consume RWQE */ +rdma_first: + if (unlikely(!(qp->qp_access_flags & + IB_ACCESS_REMOTE_WRITE))) { + goto drop; + } + reth = &ohdr->u.rc.reth; + hdrsize += sizeof(*reth); + qp->r_len = be32_to_cpu(reth->length); + qp->r_rcv_len = 0; + qp->r_sge.sg_list = NULL; + if (qp->r_len != 0) { + u32 rkey = be32_to_cpu(reth->rkey); + u64 vaddr = be64_to_cpu(reth->vaddr); + int ok; + + /* Check rkey */ + ok = qib_rkey_ok(qp, &qp->r_sge.sge, qp->r_len, + vaddr, rkey, IB_ACCESS_REMOTE_WRITE); + if (unlikely(!ok)) + goto drop; + qp->r_sge.num_sge = 1; + } else { + qp->r_sge.num_sge = 0; + qp->r_sge.sge.mr = NULL; + qp->r_sge.sge.vaddr = NULL; + qp->r_sge.sge.length = 0; + qp->r_sge.sge.sge_length = 0; + } + if (opcode == OP(RDMA_WRITE_ONLY)) + goto rdma_last; + else if (opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE)) { + wc.ex.imm_data = ohdr->u.rc.imm_data; + goto rdma_last_imm; + } + /* FALLTHROUGH */ + case OP(RDMA_WRITE_MIDDLE): + /* Check for invalid length PMTU or posted rwqe len. */ + if (unlikely(tlen != (hdrsize + pmtu + 4))) + goto drop; + qp->r_rcv_len += pmtu; + if (unlikely(qp->r_rcv_len > qp->r_len)) + goto drop; + qib_copy_sge(&qp->r_sge, data, pmtu, 1); + break; + + case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE): + wc.ex.imm_data = ohdr->u.imm_data; +rdma_last_imm: + hdrsize += 4; + wc.wc_flags = IB_WC_WITH_IMM; + + /* Get the number of bytes the message was padded by. */ + pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3; + /* Check for invalid length. */ + /* XXX LAST len should be >= 1 */ + if (unlikely(tlen < (hdrsize + pad + 4))) + goto drop; + /* Don't count the CRC. */ + tlen -= (hdrsize + pad + 4); + if (unlikely(tlen + qp->r_rcv_len != qp->r_len)) + goto drop; + if (test_and_clear_bit(QIB_R_REWIND_SGE, &qp->r_aflags)) + qib_put_ss(&qp->s_rdma_read_sge); + else { + ret = qib_get_rwqe(qp, 1); + if (ret < 0) + goto op_err; + if (!ret) + goto drop; + } + wc.byte_len = qp->r_len; + wc.opcode = IB_WC_RECV_RDMA_WITH_IMM; + qib_copy_sge(&qp->r_sge, data, tlen, 1); + qib_put_ss(&qp->r_sge); + goto last_imm; + + case OP(RDMA_WRITE_LAST): +rdma_last: + /* Get the number of bytes the message was padded by. */ + pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3; + /* Check for invalid length. */ + /* XXX LAST len should be >= 1 */ + if (unlikely(tlen < (hdrsize + pad + 4))) + goto drop; + /* Don't count the CRC. */ + tlen -= (hdrsize + pad + 4); + if (unlikely(tlen + qp->r_rcv_len != qp->r_len)) + goto drop; + qib_copy_sge(&qp->r_sge, data, tlen, 1); + qib_put_ss(&qp->r_sge); + break; + + default: + /* Drop packet for unknown opcodes. */ + goto drop; + } + qp->r_psn++; + qp->r_state = opcode; + return; + +rewind: + set_bit(QIB_R_REWIND_SGE, &qp->r_aflags); + qp->r_sge.num_sge = 0; +drop: + ibp->n_pkt_drops++; + return; + +op_err: + qib_rc_error(qp, IB_WC_LOC_QP_OP_ERR); + return; + +} diff --git a/drivers/infiniband/hw/qib/qib_ud.c b/drivers/infiniband/hw/qib/qib_ud.c new file mode 100644 index 0000000..aaf7039 --- /dev/null +++ b/drivers/infiniband/hw/qib/qib_ud.c @@ -0,0 +1,590 @@ +/* + * Copyright (c) 2006, 2007, 2008, 2009 QLogic Corporation. All rights reserved. + * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include + +#include "qib.h" +#include "qib_mad.h" + +/** + * qib_ud_loopback - handle send on loopback QPs + * @sqp: the sending QP + * @swqe: the send work request + * + * This is called from qib_make_ud_req() to forward a WQE addressed + * to the same HCA. + * Note that the receive interrupt handler may be calling qib_ud_rcv() + * while this is being called. + */ +static void qib_ud_loopback(struct qib_qp *sqp, struct qib_swqe *swqe) +{ + struct qib_ibport *ibp = to_iport(sqp->ibqp.device, sqp->port_num); + struct qib_pportdata *ppd; + struct qib_qp *qp; + struct ib_ah_attr *ah_attr; + unsigned long flags; + struct qib_sge_state ssge; + struct qib_sge *sge; + struct ib_wc wc; + u32 length; + enum ib_qp_type sqptype, dqptype; + + qp = qib_lookup_qpn(ibp, swqe->wr.wr.ud.remote_qpn); + if (!qp) { + ibp->n_pkt_drops++; + return; + } + + sqptype = sqp->ibqp.qp_type == IB_QPT_GSI ? + IB_QPT_UD : sqp->ibqp.qp_type; + dqptype = qp->ibqp.qp_type == IB_QPT_GSI ? + IB_QPT_UD : qp->ibqp.qp_type; + + if (dqptype != sqptype || + !(ib_qib_state_ops[qp->state] & QIB_PROCESS_RECV_OK)) { + ibp->n_pkt_drops++; + goto drop; + } + + ah_attr = &to_iah(swqe->wr.wr.ud.ah)->attr; + ppd = ppd_from_ibp(ibp); + + if (qp->ibqp.qp_num > 1) { + u16 pkey1; + u16 pkey2; + u16 lid; + + pkey1 = qib_get_pkey(ibp, sqp->s_pkey_index); + pkey2 = qib_get_pkey(ibp, qp->s_pkey_index); + if (unlikely(!qib_pkey_ok(pkey1, pkey2))) { + lid = ppd->lid | (ah_attr->src_path_bits & + ((1 << ppd->lmc) - 1)); + qib_bad_pqkey(ibp, IB_NOTICE_TRAP_BAD_PKEY, pkey1, + ah_attr->sl, + sqp->ibqp.qp_num, qp->ibqp.qp_num, + cpu_to_be16(lid), + cpu_to_be16(ah_attr->dlid)); + goto drop; + } + } + + /* + * Check that the qkey matches (except for QP0, see 9.6.1.4.1). + * Qkeys with the high order bit set mean use the + * qkey from the QP context instead of the WR (see 10.2.5). + */ + if (qp->ibqp.qp_num) { + u32 qkey; + + qkey = (int)swqe->wr.wr.ud.remote_qkey < 0 ? + sqp->qkey : swqe->wr.wr.ud.remote_qkey; + if (unlikely(qkey != qp->qkey)) { + u16 lid; + + lid = ppd->lid | (ah_attr->src_path_bits & + ((1 << ppd->lmc) - 1)); + qib_bad_pqkey(ibp, IB_NOTICE_TRAP_BAD_QKEY, qkey, + ah_attr->sl, + sqp->ibqp.qp_num, qp->ibqp.qp_num, + cpu_to_be16(lid), + cpu_to_be16(ah_attr->dlid)); + goto drop; + } + } + + /* + * A GRH is expected to precede the data even if not + * present on the wire. + */ + length = swqe->length; + memset(&wc, 0, sizeof wc); + wc.byte_len = length + sizeof(struct ib_grh); + + if (swqe->wr.opcode == IB_WR_SEND_WITH_IMM) { + wc.wc_flags = IB_WC_WITH_IMM; + wc.ex.imm_data = swqe->wr.ex.imm_data; + } + + spin_lock_irqsave(&qp->r_lock, flags); + + /* + * Get the next work request entry to find where to put the data. + */ + if (qp->r_flags & QIB_R_REUSE_SGE) + qp->r_flags &= ~QIB_R_REUSE_SGE; + else { + int ret; + + ret = qib_get_rwqe(qp, 0); + if (ret < 0) { + qib_rc_error(qp, IB_WC_LOC_QP_OP_ERR); + goto bail_unlock; + } + if (!ret) { + if (qp->ibqp.qp_num == 0) + ibp->n_vl15_dropped++; + goto bail_unlock; + } + } + /* Silently drop packets which are too big. */ + if (unlikely(wc.byte_len > qp->r_len)) { + qp->r_flags |= QIB_R_REUSE_SGE; + ibp->n_pkt_drops++; + goto bail_unlock; + } + + if (ah_attr->ah_flags & IB_AH_GRH) { + qib_copy_sge(&qp->r_sge, &ah_attr->grh, + sizeof(struct ib_grh), 1); + wc.wc_flags |= IB_WC_GRH; + } else + qib_skip_sge(&qp->r_sge, sizeof(struct ib_grh), 1); + ssge.sg_list = swqe->sg_list + 1; + ssge.sge = *swqe->sg_list; + ssge.num_sge = swqe->wr.num_sge; + sge = &ssge.sge; + while (length) { + u32 len = sge->length; + + if (len > length) + len = length; + if (len > sge->sge_length) + len = sge->sge_length; + BUG_ON(len == 0); + qib_copy_sge(&qp->r_sge, sge->vaddr, len, 1); + sge->vaddr += len; + sge->length -= len; + sge->sge_length -= len; + if (sge->sge_length == 0) { + if (--ssge.num_sge) + *sge = *ssge.sg_list++; + } else if (sge->length == 0 && sge->mr->lkey) { + if (++sge->n >= QIB_SEGSZ) { + if (++sge->m >= sge->mr->mapsz) + break; + sge->n = 0; + } + sge->vaddr = + sge->mr->map[sge->m]->segs[sge->n].vaddr; + sge->length = + sge->mr->map[sge->m]->segs[sge->n].length; + } + length -= len; + } + qib_put_ss(&qp->r_sge); + if (!test_and_clear_bit(QIB_R_WRID_VALID, &qp->r_aflags)) + goto bail_unlock; + wc.wr_id = qp->r_wr_id; + wc.status = IB_WC_SUCCESS; + wc.opcode = IB_WC_RECV; + wc.qp = &qp->ibqp; + wc.src_qp = sqp->ibqp.qp_num; + wc.pkey_index = qp->ibqp.qp_type == IB_QPT_GSI ? + swqe->wr.wr.ud.pkey_index : 0; + wc.slid = ppd->lid | (ah_attr->src_path_bits & ((1 << ppd->lmc) - 1)); + wc.sl = ah_attr->sl; + wc.dlid_path_bits = ah_attr->dlid & ((1 << ppd->lmc) - 1); + wc.port_num = qp->port_num; + /* Signal completion event if the solicited bit is set. */ + qib_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, + swqe->wr.send_flags & IB_SEND_SOLICITED); + ibp->n_loop_pkts++; +bail_unlock: + spin_unlock_irqrestore(&qp->r_lock, flags); +drop: + if (atomic_dec_and_test(&qp->refcount)) + wake_up(&qp->wait); +} + +/** + * qib_make_ud_req - construct a UD request packet + * @qp: the QP + * + * Return 1 if constructed; otherwise, return 0. + */ +int qib_make_ud_req(struct qib_qp *qp) +{ + struct qib_other_headers *ohdr; + struct ib_ah_attr *ah_attr; + struct qib_pportdata *ppd; + struct qib_ibport *ibp; + struct qib_swqe *wqe; + unsigned long flags; + u32 nwords; + u32 extra_bytes; + u32 bth0; + u16 lrh0; + u16 lid; + int ret = 0; + int next_cur; + + spin_lock_irqsave(&qp->s_lock, flags); + + if (!(ib_qib_state_ops[qp->state] & QIB_PROCESS_NEXT_SEND_OK)) { + if (!(ib_qib_state_ops[qp->state] & QIB_FLUSH_SEND)) + goto bail; + /* We are in the error state, flush the work request. */ + if (qp->s_last == qp->s_head) + goto bail; + /* If DMAs are in progress, we can't flush immediately. */ + if (atomic_read(&qp->s_dma_busy)) { + qp->s_flags |= QIB_S_WAIT_DMA; + goto bail; + } + wqe = get_swqe_ptr(qp, qp->s_last); + qib_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR); + goto done; + } + + if (qp->s_cur == qp->s_head) + goto bail; + + wqe = get_swqe_ptr(qp, qp->s_cur); + next_cur = qp->s_cur + 1; + if (next_cur >= qp->s_size) + next_cur = 0; + + /* Construct the header. */ + ibp = to_iport(qp->ibqp.device, qp->port_num); + ppd = ppd_from_ibp(ibp); + ah_attr = &to_iah(wqe->wr.wr.ud.ah)->attr; + if (ah_attr->dlid >= QIB_MULTICAST_LID_BASE) { + if (ah_attr->dlid != QIB_PERMISSIVE_LID) + this_cpu_inc(ibp->pmastats->n_multicast_xmit); + else + this_cpu_inc(ibp->pmastats->n_unicast_xmit); + } else { + this_cpu_inc(ibp->pmastats->n_unicast_xmit); + lid = ah_attr->dlid & ~((1 << ppd->lmc) - 1); + if (unlikely(lid == ppd->lid)) { + /* + * If DMAs are in progress, we can't generate + * a completion for the loopback packet since + * it would be out of order. + * XXX Instead of waiting, we could queue a + * zero length descriptor so we get a callback. + */ + if (atomic_read(&qp->s_dma_busy)) { + qp->s_flags |= QIB_S_WAIT_DMA; + goto bail; + } + qp->s_cur = next_cur; + spin_unlock_irqrestore(&qp->s_lock, flags); + qib_ud_loopback(qp, wqe); + spin_lock_irqsave(&qp->s_lock, flags); + qib_send_complete(qp, wqe, IB_WC_SUCCESS); + goto done; + } + } + + qp->s_cur = next_cur; + extra_bytes = -wqe->length & 3; + nwords = (wqe->length + extra_bytes) >> 2; + + /* header size in 32-bit words LRH+BTH+DETH = (8+12+8)/4. */ + qp->s_hdrwords = 7; + qp->s_cur_size = wqe->length; + qp->s_cur_sge = &qp->s_sge; + qp->s_srate = ah_attr->static_rate; + qp->s_wqe = wqe; + qp->s_sge.sge = wqe->sg_list[0]; + qp->s_sge.sg_list = wqe->sg_list + 1; + qp->s_sge.num_sge = wqe->wr.num_sge; + qp->s_sge.total_len = wqe->length; + + if (ah_attr->ah_flags & IB_AH_GRH) { + /* Header size in 32-bit words. */ + qp->s_hdrwords += qib_make_grh(ibp, &qp->s_hdr->u.l.grh, + &ah_attr->grh, + qp->s_hdrwords, nwords); + lrh0 = QIB_LRH_GRH; + ohdr = &qp->s_hdr->u.l.oth; + /* + * Don't worry about sending to locally attached multicast + * QPs. It is unspecified by the spec. what happens. + */ + } else { + /* Header size in 32-bit words. */ + lrh0 = QIB_LRH_BTH; + ohdr = &qp->s_hdr->u.oth; + } + if (wqe->wr.opcode == IB_WR_SEND_WITH_IMM) { + qp->s_hdrwords++; + ohdr->u.ud.imm_data = wqe->wr.ex.imm_data; + bth0 = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE << 24; + } else + bth0 = IB_OPCODE_UD_SEND_ONLY << 24; + lrh0 |= ah_attr->sl << 4; + if (qp->ibqp.qp_type == IB_QPT_SMI) + lrh0 |= 0xF000; /* Set VL (see ch. 13.5.3.1) */ + else + lrh0 |= ibp->sl_to_vl[ah_attr->sl] << 12; + qp->s_hdr->lrh[0] = cpu_to_be16(lrh0); + qp->s_hdr->lrh[1] = cpu_to_be16(ah_attr->dlid); /* DEST LID */ + qp->s_hdr->lrh[2] = cpu_to_be16(qp->s_hdrwords + nwords + SIZE_OF_CRC); + lid = ppd->lid; + if (lid) { + lid |= ah_attr->src_path_bits & ((1 << ppd->lmc) - 1); + qp->s_hdr->lrh[3] = cpu_to_be16(lid); + } else + qp->s_hdr->lrh[3] = IB_LID_PERMISSIVE; + if (wqe->wr.send_flags & IB_SEND_SOLICITED) + bth0 |= IB_BTH_SOLICITED; + bth0 |= extra_bytes << 20; + bth0 |= qp->ibqp.qp_type == IB_QPT_SMI ? QIB_DEFAULT_P_KEY : + qib_get_pkey(ibp, qp->ibqp.qp_type == IB_QPT_GSI ? + wqe->wr.wr.ud.pkey_index : qp->s_pkey_index); + ohdr->bth[0] = cpu_to_be32(bth0); + /* + * Use the multicast QP if the destination LID is a multicast LID. + */ + ohdr->bth[1] = ah_attr->dlid >= QIB_MULTICAST_LID_BASE && + ah_attr->dlid != QIB_PERMISSIVE_LID ? + cpu_to_be32(QIB_MULTICAST_QPN) : + cpu_to_be32(wqe->wr.wr.ud.remote_qpn); + ohdr->bth[2] = cpu_to_be32(qp->s_next_psn++ & QIB_PSN_MASK); + /* + * Qkeys with the high order bit set mean use the + * qkey from the QP context instead of the WR (see 10.2.5). + */ + ohdr->u.ud.deth[0] = cpu_to_be32((int)wqe->wr.wr.ud.remote_qkey < 0 ? + qp->qkey : wqe->wr.wr.ud.remote_qkey); + ohdr->u.ud.deth[1] = cpu_to_be32(qp->ibqp.qp_num); + +done: + ret = 1; + goto unlock; + +bail: + qp->s_flags &= ~QIB_S_BUSY; +unlock: + spin_unlock_irqrestore(&qp->s_lock, flags); + return ret; +} + +static unsigned qib_lookup_pkey(struct qib_ibport *ibp, u16 pkey) +{ + struct qib_pportdata *ppd = ppd_from_ibp(ibp); + struct qib_devdata *dd = ppd->dd; + unsigned ctxt = ppd->hw_pidx; + unsigned i; + + pkey &= 0x7fff; /* remove limited/full membership bit */ + + for (i = 0; i < ARRAY_SIZE(dd->rcd[ctxt]->pkeys); ++i) + if ((dd->rcd[ctxt]->pkeys[i] & 0x7fff) == pkey) + return i; + + /* + * Should not get here, this means hardware failed to validate pkeys. + * Punt and return index 0. + */ + return 0; +} + +/** + * qib_ud_rcv - receive an incoming UD packet + * @ibp: the port the packet came in on + * @hdr: the packet header + * @has_grh: true if the packet has a GRH + * @data: the packet data + * @tlen: the packet length + * @qp: the QP the packet came on + * + * This is called from qib_qp_rcv() to process an incoming UD packet + * for the given QP. + * Called at interrupt level. + */ +void qib_ud_rcv(struct qib_ibport *ibp, struct qib_ib_header *hdr, + int has_grh, void *data, u32 tlen, struct qib_qp *qp) +{ + struct qib_other_headers *ohdr; + int opcode; + u32 hdrsize; + u32 pad; + struct ib_wc wc; + u32 qkey; + u32 src_qp; + u16 dlid; + + /* Check for GRH */ + if (!has_grh) { + ohdr = &hdr->u.oth; + hdrsize = 8 + 12 + 8; /* LRH + BTH + DETH */ + } else { + ohdr = &hdr->u.l.oth; + hdrsize = 8 + 40 + 12 + 8; /* LRH + GRH + BTH + DETH */ + } + qkey = be32_to_cpu(ohdr->u.ud.deth[0]); + src_qp = be32_to_cpu(ohdr->u.ud.deth[1]) & QIB_QPN_MASK; + + /* + * Get the number of bytes the message was padded by + * and drop incomplete packets. + */ + pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3; + if (unlikely(tlen < (hdrsize + pad + 4))) + goto drop; + + tlen -= hdrsize + pad + 4; + + /* + * Check that the permissive LID is only used on QP0 + * and the QKEY matches (see 9.6.1.4.1 and 9.6.1.5.1). + */ + if (qp->ibqp.qp_num) { + if (unlikely(hdr->lrh[1] == IB_LID_PERMISSIVE || + hdr->lrh[3] == IB_LID_PERMISSIVE)) + goto drop; + if (qp->ibqp.qp_num > 1) { + u16 pkey1, pkey2; + + pkey1 = be32_to_cpu(ohdr->bth[0]); + pkey2 = qib_get_pkey(ibp, qp->s_pkey_index); + if (unlikely(!qib_pkey_ok(pkey1, pkey2))) { + qib_bad_pqkey(ibp, IB_NOTICE_TRAP_BAD_PKEY, + pkey1, + (be16_to_cpu(hdr->lrh[0]) >> 4) & + 0xF, + src_qp, qp->ibqp.qp_num, + hdr->lrh[3], hdr->lrh[1]); + return; + } + } + if (unlikely(qkey != qp->qkey)) { + qib_bad_pqkey(ibp, IB_NOTICE_TRAP_BAD_QKEY, qkey, + (be16_to_cpu(hdr->lrh[0]) >> 4) & 0xF, + src_qp, qp->ibqp.qp_num, + hdr->lrh[3], hdr->lrh[1]); + return; + } + /* Drop invalid MAD packets (see 13.5.3.1). */ + if (unlikely(qp->ibqp.qp_num == 1 && + (tlen != 256 || + (be16_to_cpu(hdr->lrh[0]) >> 12) == 15))) + goto drop; + } else { + struct ib_smp *smp; + + /* Drop invalid MAD packets (see 13.5.3.1). */ + if (tlen != 256 || (be16_to_cpu(hdr->lrh[0]) >> 12) != 15) + goto drop; + smp = (struct ib_smp *) data; + if ((hdr->lrh[1] == IB_LID_PERMISSIVE || + hdr->lrh[3] == IB_LID_PERMISSIVE) && + smp->mgmt_class != IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) + goto drop; + } + + /* + * The opcode is in the low byte when its in network order + * (top byte when in host order). + */ + opcode = be32_to_cpu(ohdr->bth[0]) >> 24; + if (qp->ibqp.qp_num > 1 && + opcode == IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE) { + wc.ex.imm_data = ohdr->u.ud.imm_data; + wc.wc_flags = IB_WC_WITH_IMM; + tlen -= sizeof(u32); + } else if (opcode == IB_OPCODE_UD_SEND_ONLY) { + wc.ex.imm_data = 0; + wc.wc_flags = 0; + } else + goto drop; + + /* + * A GRH is expected to precede the data even if not + * present on the wire. + */ + wc.byte_len = tlen + sizeof(struct ib_grh); + + /* + * Get the next work request entry to find where to put the data. + */ + if (qp->r_flags & QIB_R_REUSE_SGE) + qp->r_flags &= ~QIB_R_REUSE_SGE; + else { + int ret; + + ret = qib_get_rwqe(qp, 0); + if (ret < 0) { + qib_rc_error(qp, IB_WC_LOC_QP_OP_ERR); + return; + } + if (!ret) { + if (qp->ibqp.qp_num == 0) + ibp->n_vl15_dropped++; + return; + } + } + /* Silently drop packets which are too big. */ + if (unlikely(wc.byte_len > qp->r_len)) { + qp->r_flags |= QIB_R_REUSE_SGE; + goto drop; + } + if (has_grh) { + qib_copy_sge(&qp->r_sge, &hdr->u.l.grh, + sizeof(struct ib_grh), 1); + wc.wc_flags |= IB_WC_GRH; + } else + qib_skip_sge(&qp->r_sge, sizeof(struct ib_grh), 1); + qib_copy_sge(&qp->r_sge, data, wc.byte_len - sizeof(struct ib_grh), 1); + qib_put_ss(&qp->r_sge); + if (!test_and_clear_bit(QIB_R_WRID_VALID, &qp->r_aflags)) + return; + wc.wr_id = qp->r_wr_id; + wc.status = IB_WC_SUCCESS; + wc.opcode = IB_WC_RECV; + wc.vendor_err = 0; + wc.qp = &qp->ibqp; + wc.src_qp = src_qp; + wc.pkey_index = qp->ibqp.qp_type == IB_QPT_GSI ? + qib_lookup_pkey(ibp, be32_to_cpu(ohdr->bth[0])) : 0; + wc.slid = be16_to_cpu(hdr->lrh[3]); + wc.sl = (be16_to_cpu(hdr->lrh[0]) >> 4) & 0xF; + dlid = be16_to_cpu(hdr->lrh[1]); + /* + * Save the LMC lower bits if the destination LID is a unicast LID. + */ + wc.dlid_path_bits = dlid >= QIB_MULTICAST_LID_BASE ? 0 : + dlid & ((1 << ppd_from_ibp(ibp)->lmc) - 1); + wc.port_num = qp->port_num; + /* Signal completion event if the solicited bit is set. */ + qib_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, + (ohdr->bth[0] & + cpu_to_be32(IB_BTH_SOLICITED)) != 0); + return; + +drop: + ibp->n_pkt_drops++; +} diff --git a/drivers/infiniband/hw/qib/qib_user_pages.c b/drivers/infiniband/hw/qib/qib_user_pages.c new file mode 100644 index 0000000..74f90b2 --- /dev/null +++ b/drivers/infiniband/hw/qib/qib_user_pages.c @@ -0,0 +1,157 @@ +/* + * Copyright (c) 2006, 2007, 2008, 2009 QLogic Corporation. All rights reserved. + * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include + +#include "qib.h" + +static void __qib_release_user_pages(struct page **p, size_t num_pages, + int dirty) +{ + size_t i; + + for (i = 0; i < num_pages; i++) { + if (dirty) + set_page_dirty_lock(p[i]); + put_page(p[i]); + } +} + +/* + * Call with current->mm->mmap_sem held. + */ +static int __qib_get_user_pages(unsigned long start_page, size_t num_pages, + struct page **p) +{ + unsigned long lock_limit; + size_t got; + int ret; + + lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + + if (num_pages > lock_limit && !capable(CAP_IPC_LOCK)) { + ret = -ENOMEM; + goto bail; + } + + for (got = 0; got < num_pages; got += ret) { + ret = get_user_pages(current, current->mm, + start_page + got * PAGE_SIZE, + num_pages - got, 1, 1, + p + got, NULL); + if (ret < 0) + goto bail_release; + } + + current->mm->pinned_vm += num_pages; + + ret = 0; + goto bail; + +bail_release: + __qib_release_user_pages(p, got, 0); +bail: + return ret; +} + +/** + * qib_map_page - a safety wrapper around pci_map_page() + * + * A dma_addr of all 0's is interpreted by the chip as "disabled". + * Unfortunately, it can also be a valid dma_addr returned on some + * architectures. + * + * The powerpc iommu assigns dma_addrs in ascending order, so we don't + * have to bother with retries or mapping a dummy page to insure we + * don't just get the same mapping again. + * + * I'm sure we won't be so lucky with other iommu's, so FIXME. + */ +dma_addr_t qib_map_page(struct pci_dev *hwdev, struct page *page, + unsigned long offset, size_t size, int direction) +{ + dma_addr_t phys; + + phys = pci_map_page(hwdev, page, offset, size, direction); + + if (phys == 0) { + pci_unmap_page(hwdev, phys, size, direction); + phys = pci_map_page(hwdev, page, offset, size, direction); + /* + * FIXME: If we get 0 again, we should keep this page, + * map another, then free the 0 page. + */ + } + + return phys; +} + +/** + * qib_get_user_pages - lock user pages into memory + * @start_page: the start page + * @num_pages: the number of pages + * @p: the output page structures + * + * This function takes a given start page (page aligned user virtual + * address) and pins it and the following specified number of pages. For + * now, num_pages is always 1, but that will probably change at some point + * (because caller is doing expected sends on a single virtually contiguous + * buffer, so we can do all pages at once). + */ +int qib_get_user_pages(unsigned long start_page, size_t num_pages, + struct page **p) +{ + int ret; + + down_write(¤t->mm->mmap_sem); + + ret = __qib_get_user_pages(start_page, num_pages, p); + + up_write(¤t->mm->mmap_sem); + + return ret; +} + +void qib_release_user_pages(struct page **p, size_t num_pages) +{ + if (current->mm) /* during close after signal, mm can be NULL */ + down_write(¤t->mm->mmap_sem); + + __qib_release_user_pages(p, num_pages, 1); + + if (current->mm) { + current->mm->pinned_vm -= num_pages; + up_write(¤t->mm->mmap_sem); + } +} diff --git a/drivers/infiniband/hw/qib/qib_user_sdma.c b/drivers/infiniband/hw/qib/qib_user_sdma.c new file mode 100644 index 0000000..d2806ca --- /dev/null +++ b/drivers/infiniband/hw/qib/qib_user_sdma.c @@ -0,0 +1,1465 @@ +/* + * Copyright (c) 2007, 2008, 2009 QLogic Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "qib.h" +#include "qib_user_sdma.h" + +/* minimum size of header */ +#define QIB_USER_SDMA_MIN_HEADER_LENGTH 64 +/* expected size of headers (for dma_pool) */ +#define QIB_USER_SDMA_EXP_HEADER_LENGTH 64 +/* attempt to drain the queue for 5secs */ +#define QIB_USER_SDMA_DRAIN_TIMEOUT 500 + +/* + * track how many times a process open this driver. + */ +static struct rb_root qib_user_sdma_rb_root = RB_ROOT; + +struct qib_user_sdma_rb_node { + struct rb_node node; + int refcount; + pid_t pid; +}; + +struct qib_user_sdma_pkt { + struct list_head list; /* list element */ + + u8 tiddma; /* if this is NEW tid-sdma */ + u8 largepkt; /* this is large pkt from kmalloc */ + u16 frag_size; /* frag size used by PSM */ + u16 index; /* last header index or push index */ + u16 naddr; /* dimension of addr (1..3) ... */ + u16 addrlimit; /* addr array size */ + u16 tidsmidx; /* current tidsm index */ + u16 tidsmcount; /* tidsm array item count */ + u16 payload_size; /* payload size so far for header */ + u32 bytes_togo; /* bytes for processing */ + u32 counter; /* sdma pkts queued counter for this entry */ + struct qib_tid_session_member *tidsm; /* tid session member array */ + struct qib_user_sdma_queue *pq; /* which pq this pkt belongs to */ + u64 added; /* global descq number of entries */ + + struct { + u16 offset; /* offset for kvaddr, addr */ + u16 length; /* length in page */ + u16 first_desc; /* first desc */ + u16 last_desc; /* last desc */ + u16 put_page; /* should we put_page? */ + u16 dma_mapped; /* is page dma_mapped? */ + u16 dma_length; /* for dma_unmap_page() */ + u16 padding; + struct page *page; /* may be NULL (coherent mem) */ + void *kvaddr; /* FIXME: only for pio hack */ + dma_addr_t addr; + } addr[4]; /* max pages, any more and we coalesce */ +}; + +struct qib_user_sdma_queue { + /* + * pkts sent to dma engine are queued on this + * list head. the type of the elements of this + * list are struct qib_user_sdma_pkt... + */ + struct list_head sent; + + /* + * Because above list will be accessed by both process and + * signal handler, we need a spinlock for it. + */ + spinlock_t sent_lock ____cacheline_aligned_in_smp; + + /* headers with expected length are allocated from here... */ + char header_cache_name[64]; + struct dma_pool *header_cache; + + /* packets are allocated from the slab cache... */ + char pkt_slab_name[64]; + struct kmem_cache *pkt_slab; + + /* as packets go on the queued queue, they are counted... */ + u32 counter; + u32 sent_counter; + /* pending packets, not sending yet */ + u32 num_pending; + /* sending packets, not complete yet */ + u32 num_sending; + /* global descq number of entry of last sending packet */ + u64 added; + + /* dma page table */ + struct rb_root dma_pages_root; + + struct qib_user_sdma_rb_node *sdma_rb_node; + + /* protect everything above... */ + struct mutex lock; +}; + +static struct qib_user_sdma_rb_node * +qib_user_sdma_rb_search(struct rb_root *root, pid_t pid) +{ + struct qib_user_sdma_rb_node *sdma_rb_node; + struct rb_node *node = root->rb_node; + + while (node) { + sdma_rb_node = container_of(node, + struct qib_user_sdma_rb_node, node); + if (pid < sdma_rb_node->pid) + node = node->rb_left; + else if (pid > sdma_rb_node->pid) + node = node->rb_right; + else + return sdma_rb_node; + } + return NULL; +} + +static int +qib_user_sdma_rb_insert(struct rb_root *root, struct qib_user_sdma_rb_node *new) +{ + struct rb_node **node = &(root->rb_node); + struct rb_node *parent = NULL; + struct qib_user_sdma_rb_node *got; + + while (*node) { + got = container_of(*node, struct qib_user_sdma_rb_node, node); + parent = *node; + if (new->pid < got->pid) + node = &((*node)->rb_left); + else if (new->pid > got->pid) + node = &((*node)->rb_right); + else + return 0; + } + + rb_link_node(&new->node, parent, node); + rb_insert_color(&new->node, root); + return 1; +} + +struct qib_user_sdma_queue * +qib_user_sdma_queue_create(struct device *dev, int unit, int ctxt, int sctxt) +{ + struct qib_user_sdma_queue *pq = + kmalloc(sizeof(struct qib_user_sdma_queue), GFP_KERNEL); + struct qib_user_sdma_rb_node *sdma_rb_node; + + if (!pq) + goto done; + + pq->counter = 0; + pq->sent_counter = 0; + pq->num_pending = 0; + pq->num_sending = 0; + pq->added = 0; + pq->sdma_rb_node = NULL; + + INIT_LIST_HEAD(&pq->sent); + spin_lock_init(&pq->sent_lock); + mutex_init(&pq->lock); + + snprintf(pq->pkt_slab_name, sizeof(pq->pkt_slab_name), + "qib-user-sdma-pkts-%u-%02u.%02u", unit, ctxt, sctxt); + pq->pkt_slab = kmem_cache_create(pq->pkt_slab_name, + sizeof(struct qib_user_sdma_pkt), + 0, 0, NULL); + + if (!pq->pkt_slab) + goto err_kfree; + + snprintf(pq->header_cache_name, sizeof(pq->header_cache_name), + "qib-user-sdma-headers-%u-%02u.%02u", unit, ctxt, sctxt); + pq->header_cache = dma_pool_create(pq->header_cache_name, + dev, + QIB_USER_SDMA_EXP_HEADER_LENGTH, + 4, 0); + if (!pq->header_cache) + goto err_slab; + + pq->dma_pages_root = RB_ROOT; + + sdma_rb_node = qib_user_sdma_rb_search(&qib_user_sdma_rb_root, + current->pid); + if (sdma_rb_node) { + sdma_rb_node->refcount++; + } else { + int ret; + sdma_rb_node = kmalloc(sizeof( + struct qib_user_sdma_rb_node), GFP_KERNEL); + if (!sdma_rb_node) + goto err_rb; + + sdma_rb_node->refcount = 1; + sdma_rb_node->pid = current->pid; + + ret = qib_user_sdma_rb_insert(&qib_user_sdma_rb_root, + sdma_rb_node); + BUG_ON(ret == 0); + } + pq->sdma_rb_node = sdma_rb_node; + + goto done; + +err_rb: + dma_pool_destroy(pq->header_cache); +err_slab: + kmem_cache_destroy(pq->pkt_slab); +err_kfree: + kfree(pq); + pq = NULL; + +done: + return pq; +} + +static void qib_user_sdma_init_frag(struct qib_user_sdma_pkt *pkt, + int i, u16 offset, u16 len, + u16 first_desc, u16 last_desc, + u16 put_page, u16 dma_mapped, + struct page *page, void *kvaddr, + dma_addr_t dma_addr, u16 dma_length) +{ + pkt->addr[i].offset = offset; + pkt->addr[i].length = len; + pkt->addr[i].first_desc = first_desc; + pkt->addr[i].last_desc = last_desc; + pkt->addr[i].put_page = put_page; + pkt->addr[i].dma_mapped = dma_mapped; + pkt->addr[i].page = page; + pkt->addr[i].kvaddr = kvaddr; + pkt->addr[i].addr = dma_addr; + pkt->addr[i].dma_length = dma_length; +} + +static void *qib_user_sdma_alloc_header(struct qib_user_sdma_queue *pq, + size_t len, dma_addr_t *dma_addr) +{ + void *hdr; + + if (len == QIB_USER_SDMA_EXP_HEADER_LENGTH) + hdr = dma_pool_alloc(pq->header_cache, GFP_KERNEL, + dma_addr); + else + hdr = NULL; + + if (!hdr) { + hdr = kmalloc(len, GFP_KERNEL); + if (!hdr) + return NULL; + + *dma_addr = 0; + } + + return hdr; +} + +static int qib_user_sdma_page_to_frags(const struct qib_devdata *dd, + struct qib_user_sdma_queue *pq, + struct qib_user_sdma_pkt *pkt, + struct page *page, u16 put, + u16 offset, u16 len, void *kvaddr) +{ + __le16 *pbc16; + void *pbcvaddr; + struct qib_message_header *hdr; + u16 newlen, pbclen, lastdesc, dma_mapped; + u32 vcto; + union qib_seqnum seqnum; + dma_addr_t pbcdaddr; + dma_addr_t dma_addr = + dma_map_page(&dd->pcidev->dev, + page, offset, len, DMA_TO_DEVICE); + int ret = 0; + + if (dma_mapping_error(&dd->pcidev->dev, dma_addr)) { + /* + * dma mapping error, pkt has not managed + * this page yet, return the page here so + * the caller can ignore this page. + */ + if (put) { + put_page(page); + } else { + /* coalesce case */ + kunmap(page); + __free_page(page); + } + ret = -ENOMEM; + goto done; + } + offset = 0; + dma_mapped = 1; + + +next_fragment: + + /* + * In tid-sdma, the transfer length is restricted by + * receiver side current tid page length. + */ + if (pkt->tiddma && len > pkt->tidsm[pkt->tidsmidx].length) + newlen = pkt->tidsm[pkt->tidsmidx].length; + else + newlen = len; + + /* + * Then the transfer length is restricted by MTU. + * the last descriptor flag is determined by: + * 1. the current packet is at frag size length. + * 2. the current tid page is done if tid-sdma. + * 3. there is no more byte togo if sdma. + */ + lastdesc = 0; + if ((pkt->payload_size + newlen) >= pkt->frag_size) { + newlen = pkt->frag_size - pkt->payload_size; + lastdesc = 1; + } else if (pkt->tiddma) { + if (newlen == pkt->tidsm[pkt->tidsmidx].length) + lastdesc = 1; + } else { + if (newlen == pkt->bytes_togo) + lastdesc = 1; + } + + /* fill the next fragment in this page */ + qib_user_sdma_init_frag(pkt, pkt->naddr, /* index */ + offset, newlen, /* offset, len */ + 0, lastdesc, /* first last desc */ + put, dma_mapped, /* put page, dma mapped */ + page, kvaddr, /* struct page, virt addr */ + dma_addr, len); /* dma addr, dma length */ + pkt->bytes_togo -= newlen; + pkt->payload_size += newlen; + pkt->naddr++; + if (pkt->naddr == pkt->addrlimit) { + ret = -EFAULT; + goto done; + } + + /* If there is no more byte togo. (lastdesc==1) */ + if (pkt->bytes_togo == 0) { + /* The packet is done, header is not dma mapped yet. + * it should be from kmalloc */ + if (!pkt->addr[pkt->index].addr) { + pkt->addr[pkt->index].addr = + dma_map_single(&dd->pcidev->dev, + pkt->addr[pkt->index].kvaddr, + pkt->addr[pkt->index].dma_length, + DMA_TO_DEVICE); + if (dma_mapping_error(&dd->pcidev->dev, + pkt->addr[pkt->index].addr)) { + ret = -ENOMEM; + goto done; + } + pkt->addr[pkt->index].dma_mapped = 1; + } + + goto done; + } + + /* If tid-sdma, advance tid info. */ + if (pkt->tiddma) { + pkt->tidsm[pkt->tidsmidx].length -= newlen; + if (pkt->tidsm[pkt->tidsmidx].length) { + pkt->tidsm[pkt->tidsmidx].offset += newlen; + } else { + pkt->tidsmidx++; + if (pkt->tidsmidx == pkt->tidsmcount) { + ret = -EFAULT; + goto done; + } + } + } + + /* + * If this is NOT the last descriptor. (newlen==len) + * the current packet is not done yet, but the current + * send side page is done. + */ + if (lastdesc == 0) + goto done; + + /* + * If running this driver under PSM with message size + * fitting into one transfer unit, it is not possible + * to pass this line. otherwise, it is a buggggg. + */ + + /* + * Since the current packet is done, and there are more + * bytes togo, we need to create a new sdma header, copying + * from previous sdma header and modify both. + */ + pbclen = pkt->addr[pkt->index].length; + pbcvaddr = qib_user_sdma_alloc_header(pq, pbclen, &pbcdaddr); + if (!pbcvaddr) { + ret = -ENOMEM; + goto done; + } + /* Copy the previous sdma header to new sdma header */ + pbc16 = (__le16 *)pkt->addr[pkt->index].kvaddr; + memcpy(pbcvaddr, pbc16, pbclen); + + /* Modify the previous sdma header */ + hdr = (struct qib_message_header *)&pbc16[4]; + + /* New pbc length */ + pbc16[0] = cpu_to_le16(le16_to_cpu(pbc16[0])-(pkt->bytes_togo>>2)); + + /* New packet length */ + hdr->lrh[2] = cpu_to_be16(le16_to_cpu(pbc16[0])); + + if (pkt->tiddma) { + /* turn on the header suppression */ + hdr->iph.pkt_flags = + cpu_to_le16(le16_to_cpu(hdr->iph.pkt_flags)|0x2); + /* turn off ACK_REQ: 0x04 and EXPECTED_DONE: 0x20 */ + hdr->flags &= ~(0x04|0x20); + } else { + /* turn off extra bytes: 20-21 bits */ + hdr->bth[0] = cpu_to_be32(be32_to_cpu(hdr->bth[0])&0xFFCFFFFF); + /* turn off ACK_REQ: 0x04 */ + hdr->flags &= ~(0x04); + } + + /* New kdeth checksum */ + vcto = le32_to_cpu(hdr->iph.ver_ctxt_tid_offset); + hdr->iph.chksum = cpu_to_le16(QIB_LRH_BTH + + be16_to_cpu(hdr->lrh[2]) - + ((vcto>>16)&0xFFFF) - (vcto&0xFFFF) - + le16_to_cpu(hdr->iph.pkt_flags)); + + /* The packet is done, header is not dma mapped yet. + * it should be from kmalloc */ + if (!pkt->addr[pkt->index].addr) { + pkt->addr[pkt->index].addr = + dma_map_single(&dd->pcidev->dev, + pkt->addr[pkt->index].kvaddr, + pkt->addr[pkt->index].dma_length, + DMA_TO_DEVICE); + if (dma_mapping_error(&dd->pcidev->dev, + pkt->addr[pkt->index].addr)) { + ret = -ENOMEM; + goto done; + } + pkt->addr[pkt->index].dma_mapped = 1; + } + + /* Modify the new sdma header */ + pbc16 = (__le16 *)pbcvaddr; + hdr = (struct qib_message_header *)&pbc16[4]; + + /* New pbc length */ + pbc16[0] = cpu_to_le16(le16_to_cpu(pbc16[0])-(pkt->payload_size>>2)); + + /* New packet length */ + hdr->lrh[2] = cpu_to_be16(le16_to_cpu(pbc16[0])); + + if (pkt->tiddma) { + /* Set new tid and offset for new sdma header */ + hdr->iph.ver_ctxt_tid_offset = cpu_to_le32( + (le32_to_cpu(hdr->iph.ver_ctxt_tid_offset)&0xFF000000) + + (pkt->tidsm[pkt->tidsmidx].tid<tidsm[pkt->tidsmidx].offset>>2)); + } else { + /* Middle protocol new packet offset */ + hdr->uwords[2] += pkt->payload_size; + } + + /* New kdeth checksum */ + vcto = le32_to_cpu(hdr->iph.ver_ctxt_tid_offset); + hdr->iph.chksum = cpu_to_le16(QIB_LRH_BTH + + be16_to_cpu(hdr->lrh[2]) - + ((vcto>>16)&0xFFFF) - (vcto&0xFFFF) - + le16_to_cpu(hdr->iph.pkt_flags)); + + /* Next sequence number in new sdma header */ + seqnum.val = be32_to_cpu(hdr->bth[2]); + if (pkt->tiddma) + seqnum.seq++; + else + seqnum.pkt++; + hdr->bth[2] = cpu_to_be32(seqnum.val); + + /* Init new sdma header. */ + qib_user_sdma_init_frag(pkt, pkt->naddr, /* index */ + 0, pbclen, /* offset, len */ + 1, 0, /* first last desc */ + 0, 0, /* put page, dma mapped */ + NULL, pbcvaddr, /* struct page, virt addr */ + pbcdaddr, pbclen); /* dma addr, dma length */ + pkt->index = pkt->naddr; + pkt->payload_size = 0; + pkt->naddr++; + if (pkt->naddr == pkt->addrlimit) { + ret = -EFAULT; + goto done; + } + + /* Prepare for next fragment in this page */ + if (newlen != len) { + if (dma_mapped) { + put = 0; + dma_mapped = 0; + page = NULL; + kvaddr = NULL; + } + len -= newlen; + offset += newlen; + + goto next_fragment; + } + +done: + return ret; +} + +/* we've too many pages in the iovec, coalesce to a single page */ +static int qib_user_sdma_coalesce(const struct qib_devdata *dd, + struct qib_user_sdma_queue *pq, + struct qib_user_sdma_pkt *pkt, + const struct iovec *iov, + unsigned long niov) +{ + int ret = 0; + struct page *page = alloc_page(GFP_KERNEL); + void *mpage_save; + char *mpage; + int i; + int len = 0; + + if (!page) { + ret = -ENOMEM; + goto done; + } + + mpage = kmap(page); + mpage_save = mpage; + for (i = 0; i < niov; i++) { + int cfur; + + cfur = copy_from_user(mpage, + iov[i].iov_base, iov[i].iov_len); + if (cfur) { + ret = -EFAULT; + goto free_unmap; + } + + mpage += iov[i].iov_len; + len += iov[i].iov_len; + } + + ret = qib_user_sdma_page_to_frags(dd, pq, pkt, + page, 0, 0, len, mpage_save); + goto done; + +free_unmap: + kunmap(page); + __free_page(page); +done: + return ret; +} + +/* + * How many pages in this iovec element? + */ +static int qib_user_sdma_num_pages(const struct iovec *iov) +{ + const unsigned long addr = (unsigned long) iov->iov_base; + const unsigned long len = iov->iov_len; + const unsigned long spage = addr & PAGE_MASK; + const unsigned long epage = (addr + len - 1) & PAGE_MASK; + + return 1 + ((epage - spage) >> PAGE_SHIFT); +} + +static void qib_user_sdma_free_pkt_frag(struct device *dev, + struct qib_user_sdma_queue *pq, + struct qib_user_sdma_pkt *pkt, + int frag) +{ + const int i = frag; + + if (pkt->addr[i].page) { + /* only user data has page */ + if (pkt->addr[i].dma_mapped) + dma_unmap_page(dev, + pkt->addr[i].addr, + pkt->addr[i].dma_length, + DMA_TO_DEVICE); + + if (pkt->addr[i].kvaddr) + kunmap(pkt->addr[i].page); + + if (pkt->addr[i].put_page) + put_page(pkt->addr[i].page); + else + __free_page(pkt->addr[i].page); + } else if (pkt->addr[i].kvaddr) { + /* for headers */ + if (pkt->addr[i].dma_mapped) { + /* from kmalloc & dma mapped */ + dma_unmap_single(dev, + pkt->addr[i].addr, + pkt->addr[i].dma_length, + DMA_TO_DEVICE); + kfree(pkt->addr[i].kvaddr); + } else if (pkt->addr[i].addr) { + /* free coherent mem from cache... */ + dma_pool_free(pq->header_cache, + pkt->addr[i].kvaddr, pkt->addr[i].addr); + } else { + /* from kmalloc but not dma mapped */ + kfree(pkt->addr[i].kvaddr); + } + } +} + +/* return number of pages pinned... */ +static int qib_user_sdma_pin_pages(const struct qib_devdata *dd, + struct qib_user_sdma_queue *pq, + struct qib_user_sdma_pkt *pkt, + unsigned long addr, int tlen, int npages) +{ + struct page *pages[8]; + int i, j; + int ret = 0; + + while (npages) { + if (npages > 8) + j = 8; + else + j = npages; + + ret = get_user_pages_fast(addr, j, 0, pages); + if (ret != j) { + i = 0; + j = ret; + ret = -ENOMEM; + goto free_pages; + } + + for (i = 0; i < j; i++) { + /* map the pages... */ + unsigned long fofs = addr & ~PAGE_MASK; + int flen = ((fofs + tlen) > PAGE_SIZE) ? + (PAGE_SIZE - fofs) : tlen; + + ret = qib_user_sdma_page_to_frags(dd, pq, pkt, + pages[i], 1, fofs, flen, NULL); + if (ret < 0) { + /* current page has beed taken + * care of inside above call. + */ + i++; + goto free_pages; + } + + addr += flen; + tlen -= flen; + } + + npages -= j; + } + + goto done; + + /* if error, return all pages not managed by pkt */ +free_pages: + while (i < j) + put_page(pages[i++]); + +done: + return ret; +} + +static int qib_user_sdma_pin_pkt(const struct qib_devdata *dd, + struct qib_user_sdma_queue *pq, + struct qib_user_sdma_pkt *pkt, + const struct iovec *iov, + unsigned long niov) +{ + int ret = 0; + unsigned long idx; + + for (idx = 0; idx < niov; idx++) { + const int npages = qib_user_sdma_num_pages(iov + idx); + const unsigned long addr = (unsigned long) iov[idx].iov_base; + + ret = qib_user_sdma_pin_pages(dd, pq, pkt, addr, + iov[idx].iov_len, npages); + if (ret < 0) + goto free_pkt; + } + + goto done; + +free_pkt: + /* we need to ignore the first entry here */ + for (idx = 1; idx < pkt->naddr; idx++) + qib_user_sdma_free_pkt_frag(&dd->pcidev->dev, pq, pkt, idx); + + /* need to dma unmap the first entry, this is to restore to + * the original state so that caller can free the memory in + * error condition. Caller does not know if dma mapped or not*/ + if (pkt->addr[0].dma_mapped) { + dma_unmap_single(&dd->pcidev->dev, + pkt->addr[0].addr, + pkt->addr[0].dma_length, + DMA_TO_DEVICE); + pkt->addr[0].addr = 0; + pkt->addr[0].dma_mapped = 0; + } + +done: + return ret; +} + +static int qib_user_sdma_init_payload(const struct qib_devdata *dd, + struct qib_user_sdma_queue *pq, + struct qib_user_sdma_pkt *pkt, + const struct iovec *iov, + unsigned long niov, int npages) +{ + int ret = 0; + + if (pkt->frag_size == pkt->bytes_togo && + npages >= ARRAY_SIZE(pkt->addr)) + ret = qib_user_sdma_coalesce(dd, pq, pkt, iov, niov); + else + ret = qib_user_sdma_pin_pkt(dd, pq, pkt, iov, niov); + + return ret; +} + +/* free a packet list -- return counter value of last packet */ +static void qib_user_sdma_free_pkt_list(struct device *dev, + struct qib_user_sdma_queue *pq, + struct list_head *list) +{ + struct qib_user_sdma_pkt *pkt, *pkt_next; + + list_for_each_entry_safe(pkt, pkt_next, list, list) { + int i; + + for (i = 0; i < pkt->naddr; i++) + qib_user_sdma_free_pkt_frag(dev, pq, pkt, i); + + if (pkt->largepkt) + kfree(pkt); + else + kmem_cache_free(pq->pkt_slab, pkt); + } + INIT_LIST_HEAD(list); +} + +/* + * copy headers, coalesce etc -- pq->lock must be held + * + * we queue all the packets to list, returning the + * number of bytes total. list must be empty initially, + * as, if there is an error we clean it... + */ +static int qib_user_sdma_queue_pkts(const struct qib_devdata *dd, + struct qib_pportdata *ppd, + struct qib_user_sdma_queue *pq, + const struct iovec *iov, + unsigned long niov, + struct list_head *list, + int *maxpkts, int *ndesc) +{ + unsigned long idx = 0; + int ret = 0; + int npkts = 0; + __le32 *pbc; + dma_addr_t dma_addr; + struct qib_user_sdma_pkt *pkt = NULL; + size_t len; + size_t nw; + u32 counter = pq->counter; + u16 frag_size; + + while (idx < niov && npkts < *maxpkts) { + const unsigned long addr = (unsigned long) iov[idx].iov_base; + const unsigned long idx_save = idx; + unsigned pktnw; + unsigned pktnwc; + int nfrags = 0; + int npages = 0; + int bytes_togo = 0; + int tiddma = 0; + int cfur; + + len = iov[idx].iov_len; + nw = len >> 2; + + if (len < QIB_USER_SDMA_MIN_HEADER_LENGTH || + len > PAGE_SIZE || len & 3 || addr & 3) { + ret = -EINVAL; + goto free_list; + } + + pbc = qib_user_sdma_alloc_header(pq, len, &dma_addr); + if (!pbc) { + ret = -ENOMEM; + goto free_list; + } + + cfur = copy_from_user(pbc, iov[idx].iov_base, len); + if (cfur) { + ret = -EFAULT; + goto free_pbc; + } + + /* + * This assignment is a bit strange. it's because the + * the pbc counts the number of 32 bit words in the full + * packet _except_ the first word of the pbc itself... + */ + pktnwc = nw - 1; + + /* + * pktnw computation yields the number of 32 bit words + * that the caller has indicated in the PBC. note that + * this is one less than the total number of words that + * goes to the send DMA engine as the first 32 bit word + * of the PBC itself is not counted. Armed with this count, + * we can verify that the packet is consistent with the + * iovec lengths. + */ + pktnw = le32_to_cpu(*pbc) & 0xFFFF; + if (pktnw < pktnwc) { + ret = -EINVAL; + goto free_pbc; + } + + idx++; + while (pktnwc < pktnw && idx < niov) { + const size_t slen = iov[idx].iov_len; + const unsigned long faddr = + (unsigned long) iov[idx].iov_base; + + if (slen & 3 || faddr & 3 || !slen) { + ret = -EINVAL; + goto free_pbc; + } + + npages += qib_user_sdma_num_pages(&iov[idx]); + + bytes_togo += slen; + pktnwc += slen >> 2; + idx++; + nfrags++; + } + + if (pktnwc != pktnw) { + ret = -EINVAL; + goto free_pbc; + } + + frag_size = ((le32_to_cpu(*pbc))>>16) & 0xFFFF; + if (((frag_size ? frag_size : bytes_togo) + len) > + ppd->ibmaxlen) { + ret = -EINVAL; + goto free_pbc; + } + + if (frag_size) { + int pktsize, tidsmsize, n; + + n = npages*((2*PAGE_SIZE/frag_size)+1); + pktsize = sizeof(*pkt) + sizeof(pkt->addr[0])*n; + + /* + * Determine if this is tid-sdma or just sdma. + */ + tiddma = (((le32_to_cpu(pbc[7])>> + QLOGIC_IB_I_TID_SHIFT)& + QLOGIC_IB_I_TID_MASK) != + QLOGIC_IB_I_TID_MASK); + + if (tiddma) + tidsmsize = iov[idx].iov_len; + else + tidsmsize = 0; + + pkt = kmalloc(pktsize+tidsmsize, GFP_KERNEL); + if (!pkt) { + ret = -ENOMEM; + goto free_pbc; + } + pkt->largepkt = 1; + pkt->frag_size = frag_size; + pkt->addrlimit = n + ARRAY_SIZE(pkt->addr); + + if (tiddma) { + char *tidsm = (char *)pkt + pktsize; + cfur = copy_from_user(tidsm, + iov[idx].iov_base, tidsmsize); + if (cfur) { + ret = -EFAULT; + goto free_pkt; + } + pkt->tidsm = + (struct qib_tid_session_member *)tidsm; + pkt->tidsmcount = tidsmsize/ + sizeof(struct qib_tid_session_member); + pkt->tidsmidx = 0; + idx++; + } + + /* + * pbc 'fill1' field is borrowed to pass frag size, + * we need to clear it after picking frag size, the + * hardware requires this field to be zero. + */ + *pbc = cpu_to_le32(le32_to_cpu(*pbc) & 0x0000FFFF); + } else { + pkt = kmem_cache_alloc(pq->pkt_slab, GFP_KERNEL); + if (!pkt) { + ret = -ENOMEM; + goto free_pbc; + } + pkt->largepkt = 0; + pkt->frag_size = bytes_togo; + pkt->addrlimit = ARRAY_SIZE(pkt->addr); + } + pkt->bytes_togo = bytes_togo; + pkt->payload_size = 0; + pkt->counter = counter; + pkt->tiddma = tiddma; + + /* setup the first header */ + qib_user_sdma_init_frag(pkt, 0, /* index */ + 0, len, /* offset, len */ + 1, 0, /* first last desc */ + 0, 0, /* put page, dma mapped */ + NULL, pbc, /* struct page, virt addr */ + dma_addr, len); /* dma addr, dma length */ + pkt->index = 0; + pkt->naddr = 1; + + if (nfrags) { + ret = qib_user_sdma_init_payload(dd, pq, pkt, + iov + idx_save + 1, + nfrags, npages); + if (ret < 0) + goto free_pkt; + } else { + /* since there is no payload, mark the + * header as the last desc. */ + pkt->addr[0].last_desc = 1; + + if (dma_addr == 0) { + /* + * the header is not dma mapped yet. + * it should be from kmalloc. + */ + dma_addr = dma_map_single(&dd->pcidev->dev, + pbc, len, DMA_TO_DEVICE); + if (dma_mapping_error(&dd->pcidev->dev, + dma_addr)) { + ret = -ENOMEM; + goto free_pkt; + } + pkt->addr[0].addr = dma_addr; + pkt->addr[0].dma_mapped = 1; + } + } + + counter++; + npkts++; + pkt->pq = pq; + pkt->index = 0; /* reset index for push on hw */ + *ndesc += pkt->naddr; + + list_add_tail(&pkt->list, list); + } + + *maxpkts = npkts; + ret = idx; + goto done; + +free_pkt: + if (pkt->largepkt) + kfree(pkt); + else + kmem_cache_free(pq->pkt_slab, pkt); +free_pbc: + if (dma_addr) + dma_pool_free(pq->header_cache, pbc, dma_addr); + else + kfree(pbc); +free_list: + qib_user_sdma_free_pkt_list(&dd->pcidev->dev, pq, list); +done: + return ret; +} + +static void qib_user_sdma_set_complete_counter(struct qib_user_sdma_queue *pq, + u32 c) +{ + pq->sent_counter = c; +} + +/* try to clean out queue -- needs pq->lock */ +static int qib_user_sdma_queue_clean(struct qib_pportdata *ppd, + struct qib_user_sdma_queue *pq) +{ + struct qib_devdata *dd = ppd->dd; + struct list_head free_list; + struct qib_user_sdma_pkt *pkt; + struct qib_user_sdma_pkt *pkt_prev; + unsigned long flags; + int ret = 0; + + if (!pq->num_sending) + return 0; + + INIT_LIST_HEAD(&free_list); + + /* + * We need this spin lock here because interrupt handler + * might modify this list in qib_user_sdma_send_desc(), also + * we can not get interrupted, otherwise it is a deadlock. + */ + spin_lock_irqsave(&pq->sent_lock, flags); + list_for_each_entry_safe(pkt, pkt_prev, &pq->sent, list) { + s64 descd = ppd->sdma_descq_removed - pkt->added; + + if (descd < 0) + break; + + list_move_tail(&pkt->list, &free_list); + + /* one more packet cleaned */ + ret++; + pq->num_sending--; + } + spin_unlock_irqrestore(&pq->sent_lock, flags); + + if (!list_empty(&free_list)) { + u32 counter; + + pkt = list_entry(free_list.prev, + struct qib_user_sdma_pkt, list); + counter = pkt->counter; + + qib_user_sdma_free_pkt_list(&dd->pcidev->dev, pq, &free_list); + qib_user_sdma_set_complete_counter(pq, counter); + } + + return ret; +} + +void qib_user_sdma_queue_destroy(struct qib_user_sdma_queue *pq) +{ + if (!pq) + return; + + pq->sdma_rb_node->refcount--; + if (pq->sdma_rb_node->refcount == 0) { + rb_erase(&pq->sdma_rb_node->node, &qib_user_sdma_rb_root); + kfree(pq->sdma_rb_node); + } + dma_pool_destroy(pq->header_cache); + kmem_cache_destroy(pq->pkt_slab); + kfree(pq); +} + +/* clean descriptor queue, returns > 0 if some elements cleaned */ +static int qib_user_sdma_hwqueue_clean(struct qib_pportdata *ppd) +{ + int ret; + unsigned long flags; + + spin_lock_irqsave(&ppd->sdma_lock, flags); + ret = qib_sdma_make_progress(ppd); + spin_unlock_irqrestore(&ppd->sdma_lock, flags); + + return ret; +} + +/* we're in close, drain packets so that we can cleanup successfully... */ +void qib_user_sdma_queue_drain(struct qib_pportdata *ppd, + struct qib_user_sdma_queue *pq) +{ + struct qib_devdata *dd = ppd->dd; + unsigned long flags; + int i; + + if (!pq) + return; + + for (i = 0; i < QIB_USER_SDMA_DRAIN_TIMEOUT; i++) { + mutex_lock(&pq->lock); + if (!pq->num_pending && !pq->num_sending) { + mutex_unlock(&pq->lock); + break; + } + qib_user_sdma_hwqueue_clean(ppd); + qib_user_sdma_queue_clean(ppd, pq); + mutex_unlock(&pq->lock); + msleep(10); + } + + if (pq->num_pending || pq->num_sending) { + struct qib_user_sdma_pkt *pkt; + struct qib_user_sdma_pkt *pkt_prev; + struct list_head free_list; + + mutex_lock(&pq->lock); + spin_lock_irqsave(&ppd->sdma_lock, flags); + /* + * Since we hold sdma_lock, it is safe without sent_lock. + */ + if (pq->num_pending) { + list_for_each_entry_safe(pkt, pkt_prev, + &ppd->sdma_userpending, list) { + if (pkt->pq == pq) { + list_move_tail(&pkt->list, &pq->sent); + pq->num_pending--; + pq->num_sending++; + } + } + } + spin_unlock_irqrestore(&ppd->sdma_lock, flags); + + qib_dev_err(dd, "user sdma lists not empty: forcing!\n"); + INIT_LIST_HEAD(&free_list); + list_splice_init(&pq->sent, &free_list); + pq->num_sending = 0; + qib_user_sdma_free_pkt_list(&dd->pcidev->dev, pq, &free_list); + mutex_unlock(&pq->lock); + } +} + +static inline __le64 qib_sdma_make_desc0(u8 gen, + u64 addr, u64 dwlen, u64 dwoffset) +{ + return cpu_to_le64(/* SDmaPhyAddr[31:0] */ + ((addr & 0xfffffffcULL) << 32) | + /* SDmaGeneration[1:0] */ + ((gen & 3ULL) << 30) | + /* SDmaDwordCount[10:0] */ + ((dwlen & 0x7ffULL) << 16) | + /* SDmaBufOffset[12:2] */ + (dwoffset & 0x7ffULL)); +} + +static inline __le64 qib_sdma_make_first_desc0(__le64 descq) +{ + return descq | cpu_to_le64(1ULL << 12); +} + +static inline __le64 qib_sdma_make_last_desc0(__le64 descq) +{ + /* last */ /* dma head */ + return descq | cpu_to_le64(1ULL << 11 | 1ULL << 13); +} + +static inline __le64 qib_sdma_make_desc1(u64 addr) +{ + /* SDmaPhyAddr[47:32] */ + return cpu_to_le64(addr >> 32); +} + +static void qib_user_sdma_send_frag(struct qib_pportdata *ppd, + struct qib_user_sdma_pkt *pkt, int idx, + unsigned ofs, u16 tail, u8 gen) +{ + const u64 addr = (u64) pkt->addr[idx].addr + + (u64) pkt->addr[idx].offset; + const u64 dwlen = (u64) pkt->addr[idx].length / 4; + __le64 *descqp; + __le64 descq0; + + descqp = &ppd->sdma_descq[tail].qw[0]; + + descq0 = qib_sdma_make_desc0(gen, addr, dwlen, ofs); + if (pkt->addr[idx].first_desc) + descq0 = qib_sdma_make_first_desc0(descq0); + if (pkt->addr[idx].last_desc) { + descq0 = qib_sdma_make_last_desc0(descq0); + if (ppd->sdma_intrequest) { + descq0 |= cpu_to_le64(1ULL << 15); + ppd->sdma_intrequest = 0; + } + } + + descqp[0] = descq0; + descqp[1] = qib_sdma_make_desc1(addr); +} + +void qib_user_sdma_send_desc(struct qib_pportdata *ppd, + struct list_head *pktlist) +{ + struct qib_devdata *dd = ppd->dd; + u16 nfree, nsent; + u16 tail, tail_c; + u8 gen, gen_c; + + nfree = qib_sdma_descq_freecnt(ppd); + if (!nfree) + return; + +retry: + nsent = 0; + tail_c = tail = ppd->sdma_descq_tail; + gen_c = gen = ppd->sdma_generation; + while (!list_empty(pktlist)) { + struct qib_user_sdma_pkt *pkt = + list_entry(pktlist->next, struct qib_user_sdma_pkt, + list); + int i, j, c = 0; + unsigned ofs = 0; + u16 dtail = tail; + + for (i = pkt->index; i < pkt->naddr && nfree; i++) { + qib_user_sdma_send_frag(ppd, pkt, i, ofs, tail, gen); + ofs += pkt->addr[i].length >> 2; + + if (++tail == ppd->sdma_descq_cnt) { + tail = 0; + ++gen; + ppd->sdma_intrequest = 1; + } else if (tail == (ppd->sdma_descq_cnt>>1)) { + ppd->sdma_intrequest = 1; + } + nfree--; + if (pkt->addr[i].last_desc == 0) + continue; + + /* + * If the packet is >= 2KB mtu equivalent, we + * have to use the large buffers, and have to + * mark each descriptor as part of a large + * buffer packet. + */ + if (ofs > dd->piosize2kmax_dwords) { + for (j = pkt->index; j <= i; j++) { + ppd->sdma_descq[dtail].qw[0] |= + cpu_to_le64(1ULL << 14); + if (++dtail == ppd->sdma_descq_cnt) + dtail = 0; + } + } + c += i + 1 - pkt->index; + pkt->index = i + 1; /* index for next first */ + tail_c = dtail = tail; + gen_c = gen; + ofs = 0; /* reset for next packet */ + } + + ppd->sdma_descq_added += c; + nsent += c; + if (pkt->index == pkt->naddr) { + pkt->added = ppd->sdma_descq_added; + pkt->pq->added = pkt->added; + pkt->pq->num_pending--; + spin_lock(&pkt->pq->sent_lock); + pkt->pq->num_sending++; + list_move_tail(&pkt->list, &pkt->pq->sent); + spin_unlock(&pkt->pq->sent_lock); + } + if (!nfree || (nsent<<2) > ppd->sdma_descq_cnt) + break; + } + + /* advance the tail on the chip if necessary */ + if (ppd->sdma_descq_tail != tail_c) { + ppd->sdma_generation = gen_c; + dd->f_sdma_update_tail(ppd, tail_c); + } + + if (nfree && !list_empty(pktlist)) + goto retry; + + return; +} + +/* pq->lock must be held, get packets on the wire... */ +static int qib_user_sdma_push_pkts(struct qib_pportdata *ppd, + struct qib_user_sdma_queue *pq, + struct list_head *pktlist, int count) +{ + unsigned long flags; + + if (unlikely(!(ppd->lflags & QIBL_LINKACTIVE))) + return -ECOMM; + + /* non-blocking mode */ + if (pq->sdma_rb_node->refcount > 1) { + spin_lock_irqsave(&ppd->sdma_lock, flags); + if (unlikely(!__qib_sdma_running(ppd))) { + spin_unlock_irqrestore(&ppd->sdma_lock, flags); + return -ECOMM; + } + pq->num_pending += count; + list_splice_tail_init(pktlist, &ppd->sdma_userpending); + qib_user_sdma_send_desc(ppd, &ppd->sdma_userpending); + spin_unlock_irqrestore(&ppd->sdma_lock, flags); + return 0; + } + + /* In this case, descriptors from this process are not + * linked to ppd pending queue, interrupt handler + * won't update this process, it is OK to directly + * modify without sdma lock. + */ + + + pq->num_pending += count; + /* + * Blocking mode for single rail process, we must + * release/regain sdma_lock to give other process + * chance to make progress. This is important for + * performance. + */ + do { + spin_lock_irqsave(&ppd->sdma_lock, flags); + if (unlikely(!__qib_sdma_running(ppd))) { + spin_unlock_irqrestore(&ppd->sdma_lock, flags); + return -ECOMM; + } + qib_user_sdma_send_desc(ppd, pktlist); + if (!list_empty(pktlist)) + qib_sdma_make_progress(ppd); + spin_unlock_irqrestore(&ppd->sdma_lock, flags); + } while (!list_empty(pktlist)); + + return 0; +} + +int qib_user_sdma_writev(struct qib_ctxtdata *rcd, + struct qib_user_sdma_queue *pq, + const struct iovec *iov, + unsigned long dim) +{ + struct qib_devdata *dd = rcd->dd; + struct qib_pportdata *ppd = rcd->ppd; + int ret = 0; + struct list_head list; + int npkts = 0; + + INIT_LIST_HEAD(&list); + + mutex_lock(&pq->lock); + + /* why not -ECOMM like qib_user_sdma_push_pkts() below? */ + if (!qib_sdma_running(ppd)) + goto done_unlock; + + /* if I have packets not complete yet */ + if (pq->added > ppd->sdma_descq_removed) + qib_user_sdma_hwqueue_clean(ppd); + /* if I have complete packets to be freed */ + if (pq->num_sending) + qib_user_sdma_queue_clean(ppd, pq); + + while (dim) { + int mxp = 1; + int ndesc = 0; + + ret = qib_user_sdma_queue_pkts(dd, ppd, pq, + iov, dim, &list, &mxp, &ndesc); + if (ret < 0) + goto done_unlock; + else { + dim -= ret; + iov += ret; + } + + /* force packets onto the sdma hw queue... */ + if (!list_empty(&list)) { + /* + * Lazily clean hw queue. + */ + if (qib_sdma_descq_freecnt(ppd) < ndesc) { + qib_user_sdma_hwqueue_clean(ppd); + if (pq->num_sending) + qib_user_sdma_queue_clean(ppd, pq); + } + + ret = qib_user_sdma_push_pkts(ppd, pq, &list, mxp); + if (ret < 0) + goto done_unlock; + else { + npkts += mxp; + pq->counter += mxp; + } + } + } + +done_unlock: + if (!list_empty(&list)) + qib_user_sdma_free_pkt_list(&dd->pcidev->dev, pq, &list); + mutex_unlock(&pq->lock); + + return (ret < 0) ? ret : npkts; +} + +int qib_user_sdma_make_progress(struct qib_pportdata *ppd, + struct qib_user_sdma_queue *pq) +{ + int ret = 0; + + mutex_lock(&pq->lock); + qib_user_sdma_hwqueue_clean(ppd); + ret = qib_user_sdma_queue_clean(ppd, pq); + mutex_unlock(&pq->lock); + + return ret; +} + +u32 qib_user_sdma_complete_counter(const struct qib_user_sdma_queue *pq) +{ + return pq ? pq->sent_counter : 0; +} + +u32 qib_user_sdma_inflight_counter(struct qib_user_sdma_queue *pq) +{ + return pq ? pq->counter : 0; +} diff --git a/drivers/infiniband/hw/qib/qib_user_sdma.h b/drivers/infiniband/hw/qib/qib_user_sdma.h new file mode 100644 index 0000000..ce8cbaf --- /dev/null +++ b/drivers/infiniband/hw/qib/qib_user_sdma.h @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2007, 2008 QLogic Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include + +struct qib_user_sdma_queue; + +struct qib_user_sdma_queue * +qib_user_sdma_queue_create(struct device *dev, int unit, int port, int sport); +void qib_user_sdma_queue_destroy(struct qib_user_sdma_queue *pq); + +int qib_user_sdma_writev(struct qib_ctxtdata *pd, + struct qib_user_sdma_queue *pq, + const struct iovec *iov, + unsigned long dim); + +int qib_user_sdma_make_progress(struct qib_pportdata *ppd, + struct qib_user_sdma_queue *pq); + +void qib_user_sdma_queue_drain(struct qib_pportdata *ppd, + struct qib_user_sdma_queue *pq); + +u32 qib_user_sdma_complete_counter(const struct qib_user_sdma_queue *pq); +u32 qib_user_sdma_inflight_counter(struct qib_user_sdma_queue *pq); diff --git a/drivers/infiniband/hw/qib/qib_verbs.c b/drivers/infiniband/hw/qib/qib_verbs.c new file mode 100644 index 0000000..9bcfbd8 --- /dev/null +++ b/drivers/infiniband/hw/qib/qib_verbs.c @@ -0,0 +1,2336 @@ +/* + * Copyright (c) 2012, 2013 Intel Corporation. All rights reserved. + * Copyright (c) 2006 - 2012 QLogic Corporation. All rights reserved. + * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "qib.h" +#include "qib_common.h" + +static unsigned int ib_qib_qp_table_size = 256; +module_param_named(qp_table_size, ib_qib_qp_table_size, uint, S_IRUGO); +MODULE_PARM_DESC(qp_table_size, "QP table size"); + +unsigned int ib_qib_lkey_table_size = 16; +module_param_named(lkey_table_size, ib_qib_lkey_table_size, uint, + S_IRUGO); +MODULE_PARM_DESC(lkey_table_size, + "LKEY table size in bits (2^n, 1 <= n <= 23)"); + +static unsigned int ib_qib_max_pds = 0xFFFF; +module_param_named(max_pds, ib_qib_max_pds, uint, S_IRUGO); +MODULE_PARM_DESC(max_pds, + "Maximum number of protection domains to support"); + +static unsigned int ib_qib_max_ahs = 0xFFFF; +module_param_named(max_ahs, ib_qib_max_ahs, uint, S_IRUGO); +MODULE_PARM_DESC(max_ahs, "Maximum number of address handles to support"); + +unsigned int ib_qib_max_cqes = 0x2FFFF; +module_param_named(max_cqes, ib_qib_max_cqes, uint, S_IRUGO); +MODULE_PARM_DESC(max_cqes, + "Maximum number of completion queue entries to support"); + +unsigned int ib_qib_max_cqs = 0x1FFFF; +module_param_named(max_cqs, ib_qib_max_cqs, uint, S_IRUGO); +MODULE_PARM_DESC(max_cqs, "Maximum number of completion queues to support"); + +unsigned int ib_qib_max_qp_wrs = 0x3FFF; +module_param_named(max_qp_wrs, ib_qib_max_qp_wrs, uint, S_IRUGO); +MODULE_PARM_DESC(max_qp_wrs, "Maximum number of QP WRs to support"); + +unsigned int ib_qib_max_qps = 16384; +module_param_named(max_qps, ib_qib_max_qps, uint, S_IRUGO); +MODULE_PARM_DESC(max_qps, "Maximum number of QPs to support"); + +unsigned int ib_qib_max_sges = 0x60; +module_param_named(max_sges, ib_qib_max_sges, uint, S_IRUGO); +MODULE_PARM_DESC(max_sges, "Maximum number of SGEs to support"); + +unsigned int ib_qib_max_mcast_grps = 16384; +module_param_named(max_mcast_grps, ib_qib_max_mcast_grps, uint, S_IRUGO); +MODULE_PARM_DESC(max_mcast_grps, + "Maximum number of multicast groups to support"); + +unsigned int ib_qib_max_mcast_qp_attached = 16; +module_param_named(max_mcast_qp_attached, ib_qib_max_mcast_qp_attached, + uint, S_IRUGO); +MODULE_PARM_DESC(max_mcast_qp_attached, + "Maximum number of attached QPs to support"); + +unsigned int ib_qib_max_srqs = 1024; +module_param_named(max_srqs, ib_qib_max_srqs, uint, S_IRUGO); +MODULE_PARM_DESC(max_srqs, "Maximum number of SRQs to support"); + +unsigned int ib_qib_max_srq_sges = 128; +module_param_named(max_srq_sges, ib_qib_max_srq_sges, uint, S_IRUGO); +MODULE_PARM_DESC(max_srq_sges, "Maximum number of SRQ SGEs to support"); + +unsigned int ib_qib_max_srq_wrs = 0x1FFFF; +module_param_named(max_srq_wrs, ib_qib_max_srq_wrs, uint, S_IRUGO); +MODULE_PARM_DESC(max_srq_wrs, "Maximum number of SRQ WRs support"); + +static unsigned int ib_qib_disable_sma; +module_param_named(disable_sma, ib_qib_disable_sma, uint, S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(disable_sma, "Disable the SMA"); + +/* + * Note that it is OK to post send work requests in the SQE and ERR + * states; qib_do_send() will process them and generate error + * completions as per IB 1.2 C10-96. + */ +const int ib_qib_state_ops[IB_QPS_ERR + 1] = { + [IB_QPS_RESET] = 0, + [IB_QPS_INIT] = QIB_POST_RECV_OK, + [IB_QPS_RTR] = QIB_POST_RECV_OK | QIB_PROCESS_RECV_OK, + [IB_QPS_RTS] = QIB_POST_RECV_OK | QIB_PROCESS_RECV_OK | + QIB_POST_SEND_OK | QIB_PROCESS_SEND_OK | + QIB_PROCESS_NEXT_SEND_OK, + [IB_QPS_SQD] = QIB_POST_RECV_OK | QIB_PROCESS_RECV_OK | + QIB_POST_SEND_OK | QIB_PROCESS_SEND_OK, + [IB_QPS_SQE] = QIB_POST_RECV_OK | QIB_PROCESS_RECV_OK | + QIB_POST_SEND_OK | QIB_FLUSH_SEND, + [IB_QPS_ERR] = QIB_POST_RECV_OK | QIB_FLUSH_RECV | + QIB_POST_SEND_OK | QIB_FLUSH_SEND, +}; + +struct qib_ucontext { + struct ib_ucontext ibucontext; +}; + +static inline struct qib_ucontext *to_iucontext(struct ib_ucontext + *ibucontext) +{ + return container_of(ibucontext, struct qib_ucontext, ibucontext); +} + +/* + * Translate ib_wr_opcode into ib_wc_opcode. + */ +const enum ib_wc_opcode ib_qib_wc_opcode[] = { + [IB_WR_RDMA_WRITE] = IB_WC_RDMA_WRITE, + [IB_WR_RDMA_WRITE_WITH_IMM] = IB_WC_RDMA_WRITE, + [IB_WR_SEND] = IB_WC_SEND, + [IB_WR_SEND_WITH_IMM] = IB_WC_SEND, + [IB_WR_RDMA_READ] = IB_WC_RDMA_READ, + [IB_WR_ATOMIC_CMP_AND_SWP] = IB_WC_COMP_SWAP, + [IB_WR_ATOMIC_FETCH_AND_ADD] = IB_WC_FETCH_ADD +}; + +/* + * System image GUID. + */ +__be64 ib_qib_sys_image_guid; + +/** + * qib_copy_sge - copy data to SGE memory + * @ss: the SGE state + * @data: the data to copy + * @length: the length of the data + */ +void qib_copy_sge(struct qib_sge_state *ss, void *data, u32 length, int release) +{ + struct qib_sge *sge = &ss->sge; + + while (length) { + u32 len = sge->length; + + if (len > length) + len = length; + if (len > sge->sge_length) + len = sge->sge_length; + BUG_ON(len == 0); + memcpy(sge->vaddr, data, len); + sge->vaddr += len; + sge->length -= len; + sge->sge_length -= len; + if (sge->sge_length == 0) { + if (release) + qib_put_mr(sge->mr); + if (--ss->num_sge) + *sge = *ss->sg_list++; + } else if (sge->length == 0 && sge->mr->lkey) { + if (++sge->n >= QIB_SEGSZ) { + if (++sge->m >= sge->mr->mapsz) + break; + sge->n = 0; + } + sge->vaddr = + sge->mr->map[sge->m]->segs[sge->n].vaddr; + sge->length = + sge->mr->map[sge->m]->segs[sge->n].length; + } + data += len; + length -= len; + } +} + +/** + * qib_skip_sge - skip over SGE memory - XXX almost dup of prev func + * @ss: the SGE state + * @length: the number of bytes to skip + */ +void qib_skip_sge(struct qib_sge_state *ss, u32 length, int release) +{ + struct qib_sge *sge = &ss->sge; + + while (length) { + u32 len = sge->length; + + if (len > length) + len = length; + if (len > sge->sge_length) + len = sge->sge_length; + BUG_ON(len == 0); + sge->vaddr += len; + sge->length -= len; + sge->sge_length -= len; + if (sge->sge_length == 0) { + if (release) + qib_put_mr(sge->mr); + if (--ss->num_sge) + *sge = *ss->sg_list++; + } else if (sge->length == 0 && sge->mr->lkey) { + if (++sge->n >= QIB_SEGSZ) { + if (++sge->m >= sge->mr->mapsz) + break; + sge->n = 0; + } + sge->vaddr = + sge->mr->map[sge->m]->segs[sge->n].vaddr; + sge->length = + sge->mr->map[sge->m]->segs[sge->n].length; + } + length -= len; + } +} + +/* + * Count the number of DMA descriptors needed to send length bytes of data. + * Don't modify the qib_sge_state to get the count. + * Return zero if any of the segments is not aligned. + */ +static u32 qib_count_sge(struct qib_sge_state *ss, u32 length) +{ + struct qib_sge *sg_list = ss->sg_list; + struct qib_sge sge = ss->sge; + u8 num_sge = ss->num_sge; + u32 ndesc = 1; /* count the header */ + + while (length) { + u32 len = sge.length; + + if (len > length) + len = length; + if (len > sge.sge_length) + len = sge.sge_length; + BUG_ON(len == 0); + if (((long) sge.vaddr & (sizeof(u32) - 1)) || + (len != length && (len & (sizeof(u32) - 1)))) { + ndesc = 0; + break; + } + ndesc++; + sge.vaddr += len; + sge.length -= len; + sge.sge_length -= len; + if (sge.sge_length == 0) { + if (--num_sge) + sge = *sg_list++; + } else if (sge.length == 0 && sge.mr->lkey) { + if (++sge.n >= QIB_SEGSZ) { + if (++sge.m >= sge.mr->mapsz) + break; + sge.n = 0; + } + sge.vaddr = + sge.mr->map[sge.m]->segs[sge.n].vaddr; + sge.length = + sge.mr->map[sge.m]->segs[sge.n].length; + } + length -= len; + } + return ndesc; +} + +/* + * Copy from the SGEs to the data buffer. + */ +static void qib_copy_from_sge(void *data, struct qib_sge_state *ss, u32 length) +{ + struct qib_sge *sge = &ss->sge; + + while (length) { + u32 len = sge->length; + + if (len > length) + len = length; + if (len > sge->sge_length) + len = sge->sge_length; + BUG_ON(len == 0); + memcpy(data, sge->vaddr, len); + sge->vaddr += len; + sge->length -= len; + sge->sge_length -= len; + if (sge->sge_length == 0) { + if (--ss->num_sge) + *sge = *ss->sg_list++; + } else if (sge->length == 0 && sge->mr->lkey) { + if (++sge->n >= QIB_SEGSZ) { + if (++sge->m >= sge->mr->mapsz) + break; + sge->n = 0; + } + sge->vaddr = + sge->mr->map[sge->m]->segs[sge->n].vaddr; + sge->length = + sge->mr->map[sge->m]->segs[sge->n].length; + } + data += len; + length -= len; + } +} + +/** + * qib_post_one_send - post one RC, UC, or UD send work request + * @qp: the QP to post on + * @wr: the work request to send + */ +static int qib_post_one_send(struct qib_qp *qp, struct ib_send_wr *wr, + int *scheduled) +{ + struct qib_swqe *wqe; + u32 next; + int i; + int j; + int acc; + int ret; + unsigned long flags; + struct qib_lkey_table *rkt; + struct qib_pd *pd; + + spin_lock_irqsave(&qp->s_lock, flags); + + /* Check that state is OK to post send. */ + if (unlikely(!(ib_qib_state_ops[qp->state] & QIB_POST_SEND_OK))) + goto bail_inval; + + /* IB spec says that num_sge == 0 is OK. */ + if (wr->num_sge > qp->s_max_sge) + goto bail_inval; + + /* + * Don't allow RDMA reads or atomic operations on UC or + * undefined operations. + * Make sure buffer is large enough to hold the result for atomics. + */ + if (wr->opcode == IB_WR_FAST_REG_MR) { + if (qib_fast_reg_mr(qp, wr)) + goto bail_inval; + } else if (qp->ibqp.qp_type == IB_QPT_UC) { + if ((unsigned) wr->opcode >= IB_WR_RDMA_READ) + goto bail_inval; + } else if (qp->ibqp.qp_type != IB_QPT_RC) { + /* Check IB_QPT_SMI, IB_QPT_GSI, IB_QPT_UD opcode */ + if (wr->opcode != IB_WR_SEND && + wr->opcode != IB_WR_SEND_WITH_IMM) + goto bail_inval; + /* Check UD destination address PD */ + if (qp->ibqp.pd != wr->wr.ud.ah->pd) + goto bail_inval; + } else if ((unsigned) wr->opcode > IB_WR_ATOMIC_FETCH_AND_ADD) + goto bail_inval; + else if (wr->opcode >= IB_WR_ATOMIC_CMP_AND_SWP && + (wr->num_sge == 0 || + wr->sg_list[0].length < sizeof(u64) || + wr->sg_list[0].addr & (sizeof(u64) - 1))) + goto bail_inval; + else if (wr->opcode >= IB_WR_RDMA_READ && !qp->s_max_rd_atomic) + goto bail_inval; + + next = qp->s_head + 1; + if (next >= qp->s_size) + next = 0; + if (next == qp->s_last) { + ret = -ENOMEM; + goto bail; + } + + rkt = &to_idev(qp->ibqp.device)->lk_table; + pd = to_ipd(qp->ibqp.pd); + wqe = get_swqe_ptr(qp, qp->s_head); + wqe->wr = *wr; + wqe->length = 0; + j = 0; + if (wr->num_sge) { + acc = wr->opcode >= IB_WR_RDMA_READ ? + IB_ACCESS_LOCAL_WRITE : 0; + for (i = 0; i < wr->num_sge; i++) { + u32 length = wr->sg_list[i].length; + int ok; + + if (length == 0) + continue; + ok = qib_lkey_ok(rkt, pd, &wqe->sg_list[j], + &wr->sg_list[i], acc); + if (!ok) + goto bail_inval_free; + wqe->length += length; + j++; + } + wqe->wr.num_sge = j; + } + if (qp->ibqp.qp_type == IB_QPT_UC || + qp->ibqp.qp_type == IB_QPT_RC) { + if (wqe->length > 0x80000000U) + goto bail_inval_free; + } else if (wqe->length > (dd_from_ibdev(qp->ibqp.device)->pport + + qp->port_num - 1)->ibmtu) + goto bail_inval_free; + else + atomic_inc(&to_iah(wr->wr.ud.ah)->refcount); + wqe->ssn = qp->s_ssn++; + qp->s_head = next; + + ret = 0; + goto bail; + +bail_inval_free: + while (j) { + struct qib_sge *sge = &wqe->sg_list[--j]; + + qib_put_mr(sge->mr); + } +bail_inval: + ret = -EINVAL; +bail: + if (!ret && !wr->next && + !qib_sdma_empty( + dd_from_ibdev(qp->ibqp.device)->pport + qp->port_num - 1)) { + qib_schedule_send(qp); + *scheduled = 1; + } + spin_unlock_irqrestore(&qp->s_lock, flags); + return ret; +} + +/** + * qib_post_send - post a send on a QP + * @ibqp: the QP to post the send on + * @wr: the list of work requests to post + * @bad_wr: the first bad WR is put here + * + * This may be called from interrupt context. + */ +static int qib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, + struct ib_send_wr **bad_wr) +{ + struct qib_qp *qp = to_iqp(ibqp); + int err = 0; + int scheduled = 0; + + for (; wr; wr = wr->next) { + err = qib_post_one_send(qp, wr, &scheduled); + if (err) { + *bad_wr = wr; + goto bail; + } + } + + /* Try to do the send work in the caller's context. */ + if (!scheduled) + qib_do_send(&qp->s_work); + +bail: + return err; +} + +/** + * qib_post_receive - post a receive on a QP + * @ibqp: the QP to post the receive on + * @wr: the WR to post + * @bad_wr: the first bad WR is put here + * + * This may be called from interrupt context. + */ +static int qib_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr) +{ + struct qib_qp *qp = to_iqp(ibqp); + struct qib_rwq *wq = qp->r_rq.wq; + unsigned long flags; + int ret; + + /* Check that state is OK to post receive. */ + if (!(ib_qib_state_ops[qp->state] & QIB_POST_RECV_OK) || !wq) { + *bad_wr = wr; + ret = -EINVAL; + goto bail; + } + + for (; wr; wr = wr->next) { + struct qib_rwqe *wqe; + u32 next; + int i; + + if ((unsigned) wr->num_sge > qp->r_rq.max_sge) { + *bad_wr = wr; + ret = -EINVAL; + goto bail; + } + + spin_lock_irqsave(&qp->r_rq.lock, flags); + next = wq->head + 1; + if (next >= qp->r_rq.size) + next = 0; + if (next == wq->tail) { + spin_unlock_irqrestore(&qp->r_rq.lock, flags); + *bad_wr = wr; + ret = -ENOMEM; + goto bail; + } + + wqe = get_rwqe_ptr(&qp->r_rq, wq->head); + wqe->wr_id = wr->wr_id; + wqe->num_sge = wr->num_sge; + for (i = 0; i < wr->num_sge; i++) + wqe->sg_list[i] = wr->sg_list[i]; + /* Make sure queue entry is written before the head index. */ + smp_wmb(); + wq->head = next; + spin_unlock_irqrestore(&qp->r_rq.lock, flags); + } + ret = 0; + +bail: + return ret; +} + +/** + * qib_qp_rcv - processing an incoming packet on a QP + * @rcd: the context pointer + * @hdr: the packet header + * @has_grh: true if the packet has a GRH + * @data: the packet data + * @tlen: the packet length + * @qp: the QP the packet came on + * + * This is called from qib_ib_rcv() to process an incoming packet + * for the given QP. + * Called at interrupt level. + */ +static void qib_qp_rcv(struct qib_ctxtdata *rcd, struct qib_ib_header *hdr, + int has_grh, void *data, u32 tlen, struct qib_qp *qp) +{ + struct qib_ibport *ibp = &rcd->ppd->ibport_data; + + spin_lock(&qp->r_lock); + + /* Check for valid receive state. */ + if (!(ib_qib_state_ops[qp->state] & QIB_PROCESS_RECV_OK)) { + ibp->n_pkt_drops++; + goto unlock; + } + + switch (qp->ibqp.qp_type) { + case IB_QPT_SMI: + case IB_QPT_GSI: + if (ib_qib_disable_sma) + break; + /* FALLTHROUGH */ + case IB_QPT_UD: + qib_ud_rcv(ibp, hdr, has_grh, data, tlen, qp); + break; + + case IB_QPT_RC: + qib_rc_rcv(rcd, hdr, has_grh, data, tlen, qp); + break; + + case IB_QPT_UC: + qib_uc_rcv(ibp, hdr, has_grh, data, tlen, qp); + break; + + default: + break; + } + +unlock: + spin_unlock(&qp->r_lock); +} + +/** + * qib_ib_rcv - process an incoming packet + * @rcd: the context pointer + * @rhdr: the header of the packet + * @data: the packet payload + * @tlen: the packet length + * + * This is called from qib_kreceive() to process an incoming packet at + * interrupt level. Tlen is the length of the header + data + CRC in bytes. + */ +void qib_ib_rcv(struct qib_ctxtdata *rcd, void *rhdr, void *data, u32 tlen) +{ + struct qib_pportdata *ppd = rcd->ppd; + struct qib_ibport *ibp = &ppd->ibport_data; + struct qib_ib_header *hdr = rhdr; + struct qib_other_headers *ohdr; + struct qib_qp *qp; + u32 qp_num; + int lnh; + u8 opcode; + u16 lid; + + /* 24 == LRH+BTH+CRC */ + if (unlikely(tlen < 24)) + goto drop; + + /* Check for a valid destination LID (see ch. 7.11.1). */ + lid = be16_to_cpu(hdr->lrh[1]); + if (lid < QIB_MULTICAST_LID_BASE) { + lid &= ~((1 << ppd->lmc) - 1); + if (unlikely(lid != ppd->lid)) + goto drop; + } + + /* Check for GRH */ + lnh = be16_to_cpu(hdr->lrh[0]) & 3; + if (lnh == QIB_LRH_BTH) + ohdr = &hdr->u.oth; + else if (lnh == QIB_LRH_GRH) { + u32 vtf; + + ohdr = &hdr->u.l.oth; + if (hdr->u.l.grh.next_hdr != IB_GRH_NEXT_HDR) + goto drop; + vtf = be32_to_cpu(hdr->u.l.grh.version_tclass_flow); + if ((vtf >> IB_GRH_VERSION_SHIFT) != IB_GRH_VERSION) + goto drop; + } else + goto drop; + + opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0x7f; +#ifdef CONFIG_DEBUG_FS + rcd->opstats->stats[opcode].n_bytes += tlen; + rcd->opstats->stats[opcode].n_packets++; +#endif + + /* Get the destination QP number. */ + qp_num = be32_to_cpu(ohdr->bth[1]) & QIB_QPN_MASK; + if (qp_num == QIB_MULTICAST_QPN) { + struct qib_mcast *mcast; + struct qib_mcast_qp *p; + + if (lnh != QIB_LRH_GRH) + goto drop; + mcast = qib_mcast_find(ibp, &hdr->u.l.grh.dgid); + if (mcast == NULL) + goto drop; + this_cpu_inc(ibp->pmastats->n_multicast_rcv); + list_for_each_entry_rcu(p, &mcast->qp_list, list) + qib_qp_rcv(rcd, hdr, 1, data, tlen, p->qp); + /* + * Notify qib_multicast_detach() if it is waiting for us + * to finish. + */ + if (atomic_dec_return(&mcast->refcount) <= 1) + wake_up(&mcast->wait); + } else { + if (rcd->lookaside_qp) { + if (rcd->lookaside_qpn != qp_num) { + if (atomic_dec_and_test( + &rcd->lookaside_qp->refcount)) + wake_up( + &rcd->lookaside_qp->wait); + rcd->lookaside_qp = NULL; + } + } + if (!rcd->lookaside_qp) { + qp = qib_lookup_qpn(ibp, qp_num); + if (!qp) + goto drop; + rcd->lookaside_qp = qp; + rcd->lookaside_qpn = qp_num; + } else + qp = rcd->lookaside_qp; + this_cpu_inc(ibp->pmastats->n_unicast_rcv); + qib_qp_rcv(rcd, hdr, lnh == QIB_LRH_GRH, data, tlen, qp); + } + return; + +drop: + ibp->n_pkt_drops++; +} + +/* + * This is called from a timer to check for QPs + * which need kernel memory in order to send a packet. + */ +static void mem_timer(unsigned long data) +{ + struct qib_ibdev *dev = (struct qib_ibdev *) data; + struct list_head *list = &dev->memwait; + struct qib_qp *qp = NULL; + unsigned long flags; + + spin_lock_irqsave(&dev->pending_lock, flags); + if (!list_empty(list)) { + qp = list_entry(list->next, struct qib_qp, iowait); + list_del_init(&qp->iowait); + atomic_inc(&qp->refcount); + if (!list_empty(list)) + mod_timer(&dev->mem_timer, jiffies + 1); + } + spin_unlock_irqrestore(&dev->pending_lock, flags); + + if (qp) { + spin_lock_irqsave(&qp->s_lock, flags); + if (qp->s_flags & QIB_S_WAIT_KMEM) { + qp->s_flags &= ~QIB_S_WAIT_KMEM; + qib_schedule_send(qp); + } + spin_unlock_irqrestore(&qp->s_lock, flags); + if (atomic_dec_and_test(&qp->refcount)) + wake_up(&qp->wait); + } +} + +static void update_sge(struct qib_sge_state *ss, u32 length) +{ + struct qib_sge *sge = &ss->sge; + + sge->vaddr += length; + sge->length -= length; + sge->sge_length -= length; + if (sge->sge_length == 0) { + if (--ss->num_sge) + *sge = *ss->sg_list++; + } else if (sge->length == 0 && sge->mr->lkey) { + if (++sge->n >= QIB_SEGSZ) { + if (++sge->m >= sge->mr->mapsz) + return; + sge->n = 0; + } + sge->vaddr = sge->mr->map[sge->m]->segs[sge->n].vaddr; + sge->length = sge->mr->map[sge->m]->segs[sge->n].length; + } +} + +#ifdef __LITTLE_ENDIAN +static inline u32 get_upper_bits(u32 data, u32 shift) +{ + return data >> shift; +} + +static inline u32 set_upper_bits(u32 data, u32 shift) +{ + return data << shift; +} + +static inline u32 clear_upper_bytes(u32 data, u32 n, u32 off) +{ + data <<= ((sizeof(u32) - n) * BITS_PER_BYTE); + data >>= ((sizeof(u32) - n - off) * BITS_PER_BYTE); + return data; +} +#else +static inline u32 get_upper_bits(u32 data, u32 shift) +{ + return data << shift; +} + +static inline u32 set_upper_bits(u32 data, u32 shift) +{ + return data >> shift; +} + +static inline u32 clear_upper_bytes(u32 data, u32 n, u32 off) +{ + data >>= ((sizeof(u32) - n) * BITS_PER_BYTE); + data <<= ((sizeof(u32) - n - off) * BITS_PER_BYTE); + return data; +} +#endif + +static void copy_io(u32 __iomem *piobuf, struct qib_sge_state *ss, + u32 length, unsigned flush_wc) +{ + u32 extra = 0; + u32 data = 0; + u32 last; + + while (1) { + u32 len = ss->sge.length; + u32 off; + + if (len > length) + len = length; + if (len > ss->sge.sge_length) + len = ss->sge.sge_length; + BUG_ON(len == 0); + /* If the source address is not aligned, try to align it. */ + off = (unsigned long)ss->sge.vaddr & (sizeof(u32) - 1); + if (off) { + u32 *addr = (u32 *)((unsigned long)ss->sge.vaddr & + ~(sizeof(u32) - 1)); + u32 v = get_upper_bits(*addr, off * BITS_PER_BYTE); + u32 y; + + y = sizeof(u32) - off; + if (len > y) + len = y; + if (len + extra >= sizeof(u32)) { + data |= set_upper_bits(v, extra * + BITS_PER_BYTE); + len = sizeof(u32) - extra; + if (len == length) { + last = data; + break; + } + __raw_writel(data, piobuf); + piobuf++; + extra = 0; + data = 0; + } else { + /* Clear unused upper bytes */ + data |= clear_upper_bytes(v, len, extra); + if (len == length) { + last = data; + break; + } + extra += len; + } + } else if (extra) { + /* Source address is aligned. */ + u32 *addr = (u32 *) ss->sge.vaddr; + int shift = extra * BITS_PER_BYTE; + int ushift = 32 - shift; + u32 l = len; + + while (l >= sizeof(u32)) { + u32 v = *addr; + + data |= set_upper_bits(v, shift); + __raw_writel(data, piobuf); + data = get_upper_bits(v, ushift); + piobuf++; + addr++; + l -= sizeof(u32); + } + /* + * We still have 'extra' number of bytes leftover. + */ + if (l) { + u32 v = *addr; + + if (l + extra >= sizeof(u32)) { + data |= set_upper_bits(v, shift); + len -= l + extra - sizeof(u32); + if (len == length) { + last = data; + break; + } + __raw_writel(data, piobuf); + piobuf++; + extra = 0; + data = 0; + } else { + /* Clear unused upper bytes */ + data |= clear_upper_bytes(v, l, extra); + if (len == length) { + last = data; + break; + } + extra += l; + } + } else if (len == length) { + last = data; + break; + } + } else if (len == length) { + u32 w; + + /* + * Need to round up for the last dword in the + * packet. + */ + w = (len + 3) >> 2; + qib_pio_copy(piobuf, ss->sge.vaddr, w - 1); + piobuf += w - 1; + last = ((u32 *) ss->sge.vaddr)[w - 1]; + break; + } else { + u32 w = len >> 2; + + qib_pio_copy(piobuf, ss->sge.vaddr, w); + piobuf += w; + + extra = len & (sizeof(u32) - 1); + if (extra) { + u32 v = ((u32 *) ss->sge.vaddr)[w]; + + /* Clear unused upper bytes */ + data = clear_upper_bytes(v, extra, 0); + } + } + update_sge(ss, len); + length -= len; + } + /* Update address before sending packet. */ + update_sge(ss, length); + if (flush_wc) { + /* must flush early everything before trigger word */ + qib_flush_wc(); + __raw_writel(last, piobuf); + /* be sure trigger word is written */ + qib_flush_wc(); + } else + __raw_writel(last, piobuf); +} + +static noinline struct qib_verbs_txreq *__get_txreq(struct qib_ibdev *dev, + struct qib_qp *qp) +{ + struct qib_verbs_txreq *tx; + unsigned long flags; + + spin_lock_irqsave(&qp->s_lock, flags); + spin_lock(&dev->pending_lock); + + if (!list_empty(&dev->txreq_free)) { + struct list_head *l = dev->txreq_free.next; + + list_del(l); + spin_unlock(&dev->pending_lock); + spin_unlock_irqrestore(&qp->s_lock, flags); + tx = list_entry(l, struct qib_verbs_txreq, txreq.list); + } else { + if (ib_qib_state_ops[qp->state] & QIB_PROCESS_RECV_OK && + list_empty(&qp->iowait)) { + dev->n_txwait++; + qp->s_flags |= QIB_S_WAIT_TX; + list_add_tail(&qp->iowait, &dev->txwait); + } + qp->s_flags &= ~QIB_S_BUSY; + spin_unlock(&dev->pending_lock); + spin_unlock_irqrestore(&qp->s_lock, flags); + tx = ERR_PTR(-EBUSY); + } + return tx; +} + +static inline struct qib_verbs_txreq *get_txreq(struct qib_ibdev *dev, + struct qib_qp *qp) +{ + struct qib_verbs_txreq *tx; + unsigned long flags; + + spin_lock_irqsave(&dev->pending_lock, flags); + /* assume the list non empty */ + if (likely(!list_empty(&dev->txreq_free))) { + struct list_head *l = dev->txreq_free.next; + + list_del(l); + spin_unlock_irqrestore(&dev->pending_lock, flags); + tx = list_entry(l, struct qib_verbs_txreq, txreq.list); + } else { + /* call slow path to get the extra lock */ + spin_unlock_irqrestore(&dev->pending_lock, flags); + tx = __get_txreq(dev, qp); + } + return tx; +} + +void qib_put_txreq(struct qib_verbs_txreq *tx) +{ + struct qib_ibdev *dev; + struct qib_qp *qp; + unsigned long flags; + + qp = tx->qp; + dev = to_idev(qp->ibqp.device); + + if (atomic_dec_and_test(&qp->refcount)) + wake_up(&qp->wait); + if (tx->mr) { + qib_put_mr(tx->mr); + tx->mr = NULL; + } + if (tx->txreq.flags & QIB_SDMA_TXREQ_F_FREEBUF) { + tx->txreq.flags &= ~QIB_SDMA_TXREQ_F_FREEBUF; + dma_unmap_single(&dd_from_dev(dev)->pcidev->dev, + tx->txreq.addr, tx->hdr_dwords << 2, + DMA_TO_DEVICE); + kfree(tx->align_buf); + } + + spin_lock_irqsave(&dev->pending_lock, flags); + + /* Put struct back on free list */ + list_add(&tx->txreq.list, &dev->txreq_free); + + if (!list_empty(&dev->txwait)) { + /* Wake up first QP wanting a free struct */ + qp = list_entry(dev->txwait.next, struct qib_qp, iowait); + list_del_init(&qp->iowait); + atomic_inc(&qp->refcount); + spin_unlock_irqrestore(&dev->pending_lock, flags); + + spin_lock_irqsave(&qp->s_lock, flags); + if (qp->s_flags & QIB_S_WAIT_TX) { + qp->s_flags &= ~QIB_S_WAIT_TX; + qib_schedule_send(qp); + } + spin_unlock_irqrestore(&qp->s_lock, flags); + + if (atomic_dec_and_test(&qp->refcount)) + wake_up(&qp->wait); + } else + spin_unlock_irqrestore(&dev->pending_lock, flags); +} + +/* + * This is called when there are send DMA descriptors that might be + * available. + * + * This is called with ppd->sdma_lock held. + */ +void qib_verbs_sdma_desc_avail(struct qib_pportdata *ppd, unsigned avail) +{ + struct qib_qp *qp, *nqp; + struct qib_qp *qps[20]; + struct qib_ibdev *dev; + unsigned i, n; + + n = 0; + dev = &ppd->dd->verbs_dev; + spin_lock(&dev->pending_lock); + + /* Search wait list for first QP wanting DMA descriptors. */ + list_for_each_entry_safe(qp, nqp, &dev->dmawait, iowait) { + if (qp->port_num != ppd->port) + continue; + if (n == ARRAY_SIZE(qps)) + break; + if (qp->s_tx->txreq.sg_count > avail) + break; + avail -= qp->s_tx->txreq.sg_count; + list_del_init(&qp->iowait); + atomic_inc(&qp->refcount); + qps[n++] = qp; + } + + spin_unlock(&dev->pending_lock); + + for (i = 0; i < n; i++) { + qp = qps[i]; + spin_lock(&qp->s_lock); + if (qp->s_flags & QIB_S_WAIT_DMA_DESC) { + qp->s_flags &= ~QIB_S_WAIT_DMA_DESC; + qib_schedule_send(qp); + } + spin_unlock(&qp->s_lock); + if (atomic_dec_and_test(&qp->refcount)) + wake_up(&qp->wait); + } +} + +/* + * This is called with ppd->sdma_lock held. + */ +static void sdma_complete(struct qib_sdma_txreq *cookie, int status) +{ + struct qib_verbs_txreq *tx = + container_of(cookie, struct qib_verbs_txreq, txreq); + struct qib_qp *qp = tx->qp; + + spin_lock(&qp->s_lock); + if (tx->wqe) + qib_send_complete(qp, tx->wqe, IB_WC_SUCCESS); + else if (qp->ibqp.qp_type == IB_QPT_RC) { + struct qib_ib_header *hdr; + + if (tx->txreq.flags & QIB_SDMA_TXREQ_F_FREEBUF) + hdr = &tx->align_buf->hdr; + else { + struct qib_ibdev *dev = to_idev(qp->ibqp.device); + + hdr = &dev->pio_hdrs[tx->hdr_inx].hdr; + } + qib_rc_send_complete(qp, hdr); + } + if (atomic_dec_and_test(&qp->s_dma_busy)) { + if (qp->state == IB_QPS_RESET) + wake_up(&qp->wait_dma); + else if (qp->s_flags & QIB_S_WAIT_DMA) { + qp->s_flags &= ~QIB_S_WAIT_DMA; + qib_schedule_send(qp); + } + } + spin_unlock(&qp->s_lock); + + qib_put_txreq(tx); +} + +static int wait_kmem(struct qib_ibdev *dev, struct qib_qp *qp) +{ + unsigned long flags; + int ret = 0; + + spin_lock_irqsave(&qp->s_lock, flags); + if (ib_qib_state_ops[qp->state] & QIB_PROCESS_RECV_OK) { + spin_lock(&dev->pending_lock); + if (list_empty(&qp->iowait)) { + if (list_empty(&dev->memwait)) + mod_timer(&dev->mem_timer, jiffies + 1); + qp->s_flags |= QIB_S_WAIT_KMEM; + list_add_tail(&qp->iowait, &dev->memwait); + } + spin_unlock(&dev->pending_lock); + qp->s_flags &= ~QIB_S_BUSY; + ret = -EBUSY; + } + spin_unlock_irqrestore(&qp->s_lock, flags); + + return ret; +} + +static int qib_verbs_send_dma(struct qib_qp *qp, struct qib_ib_header *hdr, + u32 hdrwords, struct qib_sge_state *ss, u32 len, + u32 plen, u32 dwords) +{ + struct qib_ibdev *dev = to_idev(qp->ibqp.device); + struct qib_devdata *dd = dd_from_dev(dev); + struct qib_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); + struct qib_pportdata *ppd = ppd_from_ibp(ibp); + struct qib_verbs_txreq *tx; + struct qib_pio_header *phdr; + u32 control; + u32 ndesc; + int ret; + + tx = qp->s_tx; + if (tx) { + qp->s_tx = NULL; + /* resend previously constructed packet */ + ret = qib_sdma_verbs_send(ppd, tx->ss, tx->dwords, tx); + goto bail; + } + + tx = get_txreq(dev, qp); + if (IS_ERR(tx)) + goto bail_tx; + + control = dd->f_setpbc_control(ppd, plen, qp->s_srate, + be16_to_cpu(hdr->lrh[0]) >> 12); + tx->qp = qp; + atomic_inc(&qp->refcount); + tx->wqe = qp->s_wqe; + tx->mr = qp->s_rdma_mr; + if (qp->s_rdma_mr) + qp->s_rdma_mr = NULL; + tx->txreq.callback = sdma_complete; + if (dd->flags & QIB_HAS_SDMA_TIMEOUT) + tx->txreq.flags = QIB_SDMA_TXREQ_F_HEADTOHOST; + else + tx->txreq.flags = QIB_SDMA_TXREQ_F_INTREQ; + if (plen + 1 > dd->piosize2kmax_dwords) + tx->txreq.flags |= QIB_SDMA_TXREQ_F_USELARGEBUF; + + if (len) { + /* + * Don't try to DMA if it takes more descriptors than + * the queue holds. + */ + ndesc = qib_count_sge(ss, len); + if (ndesc >= ppd->sdma_descq_cnt) + ndesc = 0; + } else + ndesc = 1; + if (ndesc) { + phdr = &dev->pio_hdrs[tx->hdr_inx]; + phdr->pbc[0] = cpu_to_le32(plen); + phdr->pbc[1] = cpu_to_le32(control); + memcpy(&phdr->hdr, hdr, hdrwords << 2); + tx->txreq.flags |= QIB_SDMA_TXREQ_F_FREEDESC; + tx->txreq.sg_count = ndesc; + tx->txreq.addr = dev->pio_hdrs_phys + + tx->hdr_inx * sizeof(struct qib_pio_header); + tx->hdr_dwords = hdrwords + 2; /* add PBC length */ + ret = qib_sdma_verbs_send(ppd, ss, dwords, tx); + goto bail; + } + + /* Allocate a buffer and copy the header and payload to it. */ + tx->hdr_dwords = plen + 1; + phdr = kmalloc(tx->hdr_dwords << 2, GFP_ATOMIC); + if (!phdr) + goto err_tx; + phdr->pbc[0] = cpu_to_le32(plen); + phdr->pbc[1] = cpu_to_le32(control); + memcpy(&phdr->hdr, hdr, hdrwords << 2); + qib_copy_from_sge((u32 *) &phdr->hdr + hdrwords, ss, len); + + tx->txreq.addr = dma_map_single(&dd->pcidev->dev, phdr, + tx->hdr_dwords << 2, DMA_TO_DEVICE); + if (dma_mapping_error(&dd->pcidev->dev, tx->txreq.addr)) + goto map_err; + tx->align_buf = phdr; + tx->txreq.flags |= QIB_SDMA_TXREQ_F_FREEBUF; + tx->txreq.sg_count = 1; + ret = qib_sdma_verbs_send(ppd, NULL, 0, tx); + goto unaligned; + +map_err: + kfree(phdr); +err_tx: + qib_put_txreq(tx); + ret = wait_kmem(dev, qp); +unaligned: + ibp->n_unaligned++; +bail: + return ret; +bail_tx: + ret = PTR_ERR(tx); + goto bail; +} + +/* + * If we are now in the error state, return zero to flush the + * send work request. + */ +static int no_bufs_available(struct qib_qp *qp) +{ + struct qib_ibdev *dev = to_idev(qp->ibqp.device); + struct qib_devdata *dd; + unsigned long flags; + int ret = 0; + + /* + * Note that as soon as want_buffer() is called and + * possibly before it returns, qib_ib_piobufavail() + * could be called. Therefore, put QP on the I/O wait list before + * enabling the PIO avail interrupt. + */ + spin_lock_irqsave(&qp->s_lock, flags); + if (ib_qib_state_ops[qp->state] & QIB_PROCESS_RECV_OK) { + spin_lock(&dev->pending_lock); + if (list_empty(&qp->iowait)) { + dev->n_piowait++; + qp->s_flags |= QIB_S_WAIT_PIO; + list_add_tail(&qp->iowait, &dev->piowait); + dd = dd_from_dev(dev); + dd->f_wantpiobuf_intr(dd, 1); + } + spin_unlock(&dev->pending_lock); + qp->s_flags &= ~QIB_S_BUSY; + ret = -EBUSY; + } + spin_unlock_irqrestore(&qp->s_lock, flags); + return ret; +} + +static int qib_verbs_send_pio(struct qib_qp *qp, struct qib_ib_header *ibhdr, + u32 hdrwords, struct qib_sge_state *ss, u32 len, + u32 plen, u32 dwords) +{ + struct qib_devdata *dd = dd_from_ibdev(qp->ibqp.device); + struct qib_pportdata *ppd = dd->pport + qp->port_num - 1; + u32 *hdr = (u32 *) ibhdr; + u32 __iomem *piobuf_orig; + u32 __iomem *piobuf; + u64 pbc; + unsigned long flags; + unsigned flush_wc; + u32 control; + u32 pbufn; + + control = dd->f_setpbc_control(ppd, plen, qp->s_srate, + be16_to_cpu(ibhdr->lrh[0]) >> 12); + pbc = ((u64) control << 32) | plen; + piobuf = dd->f_getsendbuf(ppd, pbc, &pbufn); + if (unlikely(piobuf == NULL)) + return no_bufs_available(qp); + + /* + * Write the pbc. + * We have to flush after the PBC for correctness on some cpus + * or WC buffer can be written out of order. + */ + writeq(pbc, piobuf); + piobuf_orig = piobuf; + piobuf += 2; + + flush_wc = dd->flags & QIB_PIO_FLUSH_WC; + if (len == 0) { + /* + * If there is just the header portion, must flush before + * writing last word of header for correctness, and after + * the last header word (trigger word). + */ + if (flush_wc) { + qib_flush_wc(); + qib_pio_copy(piobuf, hdr, hdrwords - 1); + qib_flush_wc(); + __raw_writel(hdr[hdrwords - 1], piobuf + hdrwords - 1); + qib_flush_wc(); + } else + qib_pio_copy(piobuf, hdr, hdrwords); + goto done; + } + + if (flush_wc) + qib_flush_wc(); + qib_pio_copy(piobuf, hdr, hdrwords); + piobuf += hdrwords; + + /* The common case is aligned and contained in one segment. */ + if (likely(ss->num_sge == 1 && len <= ss->sge.length && + !((unsigned long)ss->sge.vaddr & (sizeof(u32) - 1)))) { + u32 *addr = (u32 *) ss->sge.vaddr; + + /* Update address before sending packet. */ + update_sge(ss, len); + if (flush_wc) { + qib_pio_copy(piobuf, addr, dwords - 1); + /* must flush early everything before trigger word */ + qib_flush_wc(); + __raw_writel(addr[dwords - 1], piobuf + dwords - 1); + /* be sure trigger word is written */ + qib_flush_wc(); + } else + qib_pio_copy(piobuf, addr, dwords); + goto done; + } + copy_io(piobuf, ss, len, flush_wc); +done: + if (dd->flags & QIB_USE_SPCL_TRIG) { + u32 spcl_off = (pbufn >= dd->piobcnt2k) ? 2047 : 1023; + qib_flush_wc(); + __raw_writel(0xaebecede, piobuf_orig + spcl_off); + } + qib_sendbuf_done(dd, pbufn); + if (qp->s_rdma_mr) { + qib_put_mr(qp->s_rdma_mr); + qp->s_rdma_mr = NULL; + } + if (qp->s_wqe) { + spin_lock_irqsave(&qp->s_lock, flags); + qib_send_complete(qp, qp->s_wqe, IB_WC_SUCCESS); + spin_unlock_irqrestore(&qp->s_lock, flags); + } else if (qp->ibqp.qp_type == IB_QPT_RC) { + spin_lock_irqsave(&qp->s_lock, flags); + qib_rc_send_complete(qp, ibhdr); + spin_unlock_irqrestore(&qp->s_lock, flags); + } + return 0; +} + +/** + * qib_verbs_send - send a packet + * @qp: the QP to send on + * @hdr: the packet header + * @hdrwords: the number of 32-bit words in the header + * @ss: the SGE to send + * @len: the length of the packet in bytes + * + * Return zero if packet is sent or queued OK. + * Return non-zero and clear qp->s_flags QIB_S_BUSY otherwise. + */ +int qib_verbs_send(struct qib_qp *qp, struct qib_ib_header *hdr, + u32 hdrwords, struct qib_sge_state *ss, u32 len) +{ + struct qib_devdata *dd = dd_from_ibdev(qp->ibqp.device); + u32 plen; + int ret; + u32 dwords = (len + 3) >> 2; + + /* + * Calculate the send buffer trigger address. + * The +1 counts for the pbc control dword following the pbc length. + */ + plen = hdrwords + dwords + 1; + + /* + * VL15 packets (IB_QPT_SMI) will always use PIO, so we + * can defer SDMA restart until link goes ACTIVE without + * worrying about just how we got there. + */ + if (qp->ibqp.qp_type == IB_QPT_SMI || + !(dd->flags & QIB_HAS_SEND_DMA)) + ret = qib_verbs_send_pio(qp, hdr, hdrwords, ss, len, + plen, dwords); + else + ret = qib_verbs_send_dma(qp, hdr, hdrwords, ss, len, + plen, dwords); + + return ret; +} + +int qib_snapshot_counters(struct qib_pportdata *ppd, u64 *swords, + u64 *rwords, u64 *spkts, u64 *rpkts, + u64 *xmit_wait) +{ + int ret; + struct qib_devdata *dd = ppd->dd; + + if (!(dd->flags & QIB_PRESENT)) { + /* no hardware, freeze, etc. */ + ret = -EINVAL; + goto bail; + } + *swords = dd->f_portcntr(ppd, QIBPORTCNTR_WORDSEND); + *rwords = dd->f_portcntr(ppd, QIBPORTCNTR_WORDRCV); + *spkts = dd->f_portcntr(ppd, QIBPORTCNTR_PKTSEND); + *rpkts = dd->f_portcntr(ppd, QIBPORTCNTR_PKTRCV); + *xmit_wait = dd->f_portcntr(ppd, QIBPORTCNTR_SENDSTALL); + + ret = 0; + +bail: + return ret; +} + +/** + * qib_get_counters - get various chip counters + * @dd: the qlogic_ib device + * @cntrs: counters are placed here + * + * Return the counters needed by recv_pma_get_portcounters(). + */ +int qib_get_counters(struct qib_pportdata *ppd, + struct qib_verbs_counters *cntrs) +{ + int ret; + + if (!(ppd->dd->flags & QIB_PRESENT)) { + /* no hardware, freeze, etc. */ + ret = -EINVAL; + goto bail; + } + cntrs->symbol_error_counter = + ppd->dd->f_portcntr(ppd, QIBPORTCNTR_IBSYMBOLERR); + cntrs->link_error_recovery_counter = + ppd->dd->f_portcntr(ppd, QIBPORTCNTR_IBLINKERRRECOV); + /* + * The link downed counter counts when the other side downs the + * connection. We add in the number of times we downed the link + * due to local link integrity errors to compensate. + */ + cntrs->link_downed_counter = + ppd->dd->f_portcntr(ppd, QIBPORTCNTR_IBLINKDOWN); + cntrs->port_rcv_errors = + ppd->dd->f_portcntr(ppd, QIBPORTCNTR_RXDROPPKT) + + ppd->dd->f_portcntr(ppd, QIBPORTCNTR_RCVOVFL) + + ppd->dd->f_portcntr(ppd, QIBPORTCNTR_ERR_RLEN) + + ppd->dd->f_portcntr(ppd, QIBPORTCNTR_INVALIDRLEN) + + ppd->dd->f_portcntr(ppd, QIBPORTCNTR_ERRLINK) + + ppd->dd->f_portcntr(ppd, QIBPORTCNTR_ERRICRC) + + ppd->dd->f_portcntr(ppd, QIBPORTCNTR_ERRVCRC) + + ppd->dd->f_portcntr(ppd, QIBPORTCNTR_ERRLPCRC) + + ppd->dd->f_portcntr(ppd, QIBPORTCNTR_BADFORMAT); + cntrs->port_rcv_errors += + ppd->dd->f_portcntr(ppd, QIBPORTCNTR_RXLOCALPHYERR); + cntrs->port_rcv_errors += + ppd->dd->f_portcntr(ppd, QIBPORTCNTR_RXVLERR); + cntrs->port_rcv_remphys_errors = + ppd->dd->f_portcntr(ppd, QIBPORTCNTR_RCVEBP); + cntrs->port_xmit_discards = + ppd->dd->f_portcntr(ppd, QIBPORTCNTR_UNSUPVL); + cntrs->port_xmit_data = ppd->dd->f_portcntr(ppd, + QIBPORTCNTR_WORDSEND); + cntrs->port_rcv_data = ppd->dd->f_portcntr(ppd, + QIBPORTCNTR_WORDRCV); + cntrs->port_xmit_packets = ppd->dd->f_portcntr(ppd, + QIBPORTCNTR_PKTSEND); + cntrs->port_rcv_packets = ppd->dd->f_portcntr(ppd, + QIBPORTCNTR_PKTRCV); + cntrs->local_link_integrity_errors = + ppd->dd->f_portcntr(ppd, QIBPORTCNTR_LLI); + cntrs->excessive_buffer_overrun_errors = + ppd->dd->f_portcntr(ppd, QIBPORTCNTR_EXCESSBUFOVFL); + cntrs->vl15_dropped = + ppd->dd->f_portcntr(ppd, QIBPORTCNTR_VL15PKTDROP); + + ret = 0; + +bail: + return ret; +} + +/** + * qib_ib_piobufavail - callback when a PIO buffer is available + * @dd: the device pointer + * + * This is called from qib_intr() at interrupt level when a PIO buffer is + * available after qib_verbs_send() returned an error that no buffers were + * available. Disable the interrupt if there are no more QPs waiting. + */ +void qib_ib_piobufavail(struct qib_devdata *dd) +{ + struct qib_ibdev *dev = &dd->verbs_dev; + struct list_head *list; + struct qib_qp *qps[5]; + struct qib_qp *qp; + unsigned long flags; + unsigned i, n; + + list = &dev->piowait; + n = 0; + + /* + * Note: checking that the piowait list is empty and clearing + * the buffer available interrupt needs to be atomic or we + * could end up with QPs on the wait list with the interrupt + * disabled. + */ + spin_lock_irqsave(&dev->pending_lock, flags); + while (!list_empty(list)) { + if (n == ARRAY_SIZE(qps)) + goto full; + qp = list_entry(list->next, struct qib_qp, iowait); + list_del_init(&qp->iowait); + atomic_inc(&qp->refcount); + qps[n++] = qp; + } + dd->f_wantpiobuf_intr(dd, 0); +full: + spin_unlock_irqrestore(&dev->pending_lock, flags); + + for (i = 0; i < n; i++) { + qp = qps[i]; + + spin_lock_irqsave(&qp->s_lock, flags); + if (qp->s_flags & QIB_S_WAIT_PIO) { + qp->s_flags &= ~QIB_S_WAIT_PIO; + qib_schedule_send(qp); + } + spin_unlock_irqrestore(&qp->s_lock, flags); + + /* Notify qib_destroy_qp() if it is waiting. */ + if (atomic_dec_and_test(&qp->refcount)) + wake_up(&qp->wait); + } +} + +static int qib_query_device(struct ib_device *ibdev, + struct ib_device_attr *props) +{ + struct qib_devdata *dd = dd_from_ibdev(ibdev); + struct qib_ibdev *dev = to_idev(ibdev); + + memset(props, 0, sizeof(*props)); + + props->device_cap_flags = IB_DEVICE_BAD_PKEY_CNTR | + IB_DEVICE_BAD_QKEY_CNTR | IB_DEVICE_SHUTDOWN_PORT | + IB_DEVICE_SYS_IMAGE_GUID | IB_DEVICE_RC_RNR_NAK_GEN | + IB_DEVICE_PORT_ACTIVE_EVENT | IB_DEVICE_SRQ_RESIZE; + props->page_size_cap = PAGE_SIZE; + props->vendor_id = + QIB_SRC_OUI_1 << 16 | QIB_SRC_OUI_2 << 8 | QIB_SRC_OUI_3; + props->vendor_part_id = dd->deviceid; + props->hw_ver = dd->minrev; + props->sys_image_guid = ib_qib_sys_image_guid; + props->max_mr_size = ~0ULL; + props->max_qp = ib_qib_max_qps; + props->max_qp_wr = ib_qib_max_qp_wrs; + props->max_sge = ib_qib_max_sges; + props->max_cq = ib_qib_max_cqs; + props->max_ah = ib_qib_max_ahs; + props->max_cqe = ib_qib_max_cqes; + props->max_mr = dev->lk_table.max; + props->max_fmr = dev->lk_table.max; + props->max_map_per_fmr = 32767; + props->max_pd = ib_qib_max_pds; + props->max_qp_rd_atom = QIB_MAX_RDMA_ATOMIC; + props->max_qp_init_rd_atom = 255; + /* props->max_res_rd_atom */ + props->max_srq = ib_qib_max_srqs; + props->max_srq_wr = ib_qib_max_srq_wrs; + props->max_srq_sge = ib_qib_max_srq_sges; + /* props->local_ca_ack_delay */ + props->atomic_cap = IB_ATOMIC_GLOB; + props->max_pkeys = qib_get_npkeys(dd); + props->max_mcast_grp = ib_qib_max_mcast_grps; + props->max_mcast_qp_attach = ib_qib_max_mcast_qp_attached; + props->max_total_mcast_qp_attach = props->max_mcast_qp_attach * + props->max_mcast_grp; + + return 0; +} + +static int qib_query_port(struct ib_device *ibdev, u8 port, + struct ib_port_attr *props) +{ + struct qib_devdata *dd = dd_from_ibdev(ibdev); + struct qib_ibport *ibp = to_iport(ibdev, port); + struct qib_pportdata *ppd = ppd_from_ibp(ibp); + enum ib_mtu mtu; + u16 lid = ppd->lid; + + memset(props, 0, sizeof(*props)); + props->lid = lid ? lid : be16_to_cpu(IB_LID_PERMISSIVE); + props->lmc = ppd->lmc; + props->sm_lid = ibp->sm_lid; + props->sm_sl = ibp->sm_sl; + props->state = dd->f_iblink_state(ppd->lastibcstat); + props->phys_state = dd->f_ibphys_portstate(ppd->lastibcstat); + props->port_cap_flags = ibp->port_cap_flags; + props->gid_tbl_len = QIB_GUIDS_PER_PORT; + props->max_msg_sz = 0x80000000; + props->pkey_tbl_len = qib_get_npkeys(dd); + props->bad_pkey_cntr = ibp->pkey_violations; + props->qkey_viol_cntr = ibp->qkey_violations; + props->active_width = ppd->link_width_active; + /* See rate_show() */ + props->active_speed = ppd->link_speed_active; + props->max_vl_num = qib_num_vls(ppd->vls_supported); + props->init_type_reply = 0; + + props->max_mtu = qib_ibmtu ? qib_ibmtu : IB_MTU_4096; + switch (ppd->ibmtu) { + case 4096: + mtu = IB_MTU_4096; + break; + case 2048: + mtu = IB_MTU_2048; + break; + case 1024: + mtu = IB_MTU_1024; + break; + case 512: + mtu = IB_MTU_512; + break; + case 256: + mtu = IB_MTU_256; + break; + default: + mtu = IB_MTU_2048; + } + props->active_mtu = mtu; + props->subnet_timeout = ibp->subnet_timeout; + + return 0; +} + +static int qib_modify_device(struct ib_device *device, + int device_modify_mask, + struct ib_device_modify *device_modify) +{ + struct qib_devdata *dd = dd_from_ibdev(device); + unsigned i; + int ret; + + if (device_modify_mask & ~(IB_DEVICE_MODIFY_SYS_IMAGE_GUID | + IB_DEVICE_MODIFY_NODE_DESC)) { + ret = -EOPNOTSUPP; + goto bail; + } + + if (device_modify_mask & IB_DEVICE_MODIFY_NODE_DESC) { + memcpy(device->node_desc, device_modify->node_desc, 64); + for (i = 0; i < dd->num_pports; i++) { + struct qib_ibport *ibp = &dd->pport[i].ibport_data; + + qib_node_desc_chg(ibp); + } + } + + if (device_modify_mask & IB_DEVICE_MODIFY_SYS_IMAGE_GUID) { + ib_qib_sys_image_guid = + cpu_to_be64(device_modify->sys_image_guid); + for (i = 0; i < dd->num_pports; i++) { + struct qib_ibport *ibp = &dd->pport[i].ibport_data; + + qib_sys_guid_chg(ibp); + } + } + + ret = 0; + +bail: + return ret; +} + +static int qib_modify_port(struct ib_device *ibdev, u8 port, + int port_modify_mask, struct ib_port_modify *props) +{ + struct qib_ibport *ibp = to_iport(ibdev, port); + struct qib_pportdata *ppd = ppd_from_ibp(ibp); + + ibp->port_cap_flags |= props->set_port_cap_mask; + ibp->port_cap_flags &= ~props->clr_port_cap_mask; + if (props->set_port_cap_mask || props->clr_port_cap_mask) + qib_cap_mask_chg(ibp); + if (port_modify_mask & IB_PORT_SHUTDOWN) + qib_set_linkstate(ppd, QIB_IB_LINKDOWN); + if (port_modify_mask & IB_PORT_RESET_QKEY_CNTR) + ibp->qkey_violations = 0; + return 0; +} + +static int qib_query_gid(struct ib_device *ibdev, u8 port, + int index, union ib_gid *gid) +{ + struct qib_devdata *dd = dd_from_ibdev(ibdev); + int ret = 0; + + if (!port || port > dd->num_pports) + ret = -EINVAL; + else { + struct qib_ibport *ibp = to_iport(ibdev, port); + struct qib_pportdata *ppd = ppd_from_ibp(ibp); + + gid->global.subnet_prefix = ibp->gid_prefix; + if (index == 0) + gid->global.interface_id = ppd->guid; + else if (index < QIB_GUIDS_PER_PORT) + gid->global.interface_id = ibp->guids[index - 1]; + else + ret = -EINVAL; + } + + return ret; +} + +static struct ib_pd *qib_alloc_pd(struct ib_device *ibdev, + struct ib_ucontext *context, + struct ib_udata *udata) +{ + struct qib_ibdev *dev = to_idev(ibdev); + struct qib_pd *pd; + struct ib_pd *ret; + + /* + * This is actually totally arbitrary. Some correctness tests + * assume there's a maximum number of PDs that can be allocated. + * We don't actually have this limit, but we fail the test if + * we allow allocations of more than we report for this value. + */ + + pd = kmalloc(sizeof *pd, GFP_KERNEL); + if (!pd) { + ret = ERR_PTR(-ENOMEM); + goto bail; + } + + spin_lock(&dev->n_pds_lock); + if (dev->n_pds_allocated == ib_qib_max_pds) { + spin_unlock(&dev->n_pds_lock); + kfree(pd); + ret = ERR_PTR(-ENOMEM); + goto bail; + } + + dev->n_pds_allocated++; + spin_unlock(&dev->n_pds_lock); + + /* ib_alloc_pd() will initialize pd->ibpd. */ + pd->user = udata != NULL; + + ret = &pd->ibpd; + +bail: + return ret; +} + +static int qib_dealloc_pd(struct ib_pd *ibpd) +{ + struct qib_pd *pd = to_ipd(ibpd); + struct qib_ibdev *dev = to_idev(ibpd->device); + + spin_lock(&dev->n_pds_lock); + dev->n_pds_allocated--; + spin_unlock(&dev->n_pds_lock); + + kfree(pd); + + return 0; +} + +int qib_check_ah(struct ib_device *ibdev, struct ib_ah_attr *ah_attr) +{ + /* A multicast address requires a GRH (see ch. 8.4.1). */ + if (ah_attr->dlid >= QIB_MULTICAST_LID_BASE && + ah_attr->dlid != QIB_PERMISSIVE_LID && + !(ah_attr->ah_flags & IB_AH_GRH)) + goto bail; + if ((ah_attr->ah_flags & IB_AH_GRH) && + ah_attr->grh.sgid_index >= QIB_GUIDS_PER_PORT) + goto bail; + if (ah_attr->dlid == 0) + goto bail; + if (ah_attr->port_num < 1 || + ah_attr->port_num > ibdev->phys_port_cnt) + goto bail; + if (ah_attr->static_rate != IB_RATE_PORT_CURRENT && + ib_rate_to_mult(ah_attr->static_rate) < 0) + goto bail; + if (ah_attr->sl > 15) + goto bail; + return 0; +bail: + return -EINVAL; +} + +/** + * qib_create_ah - create an address handle + * @pd: the protection domain + * @ah_attr: the attributes of the AH + * + * This may be called from interrupt context. + */ +static struct ib_ah *qib_create_ah(struct ib_pd *pd, + struct ib_ah_attr *ah_attr) +{ + struct qib_ah *ah; + struct ib_ah *ret; + struct qib_ibdev *dev = to_idev(pd->device); + unsigned long flags; + + if (qib_check_ah(pd->device, ah_attr)) { + ret = ERR_PTR(-EINVAL); + goto bail; + } + + ah = kmalloc(sizeof *ah, GFP_ATOMIC); + if (!ah) { + ret = ERR_PTR(-ENOMEM); + goto bail; + } + + spin_lock_irqsave(&dev->n_ahs_lock, flags); + if (dev->n_ahs_allocated == ib_qib_max_ahs) { + spin_unlock_irqrestore(&dev->n_ahs_lock, flags); + kfree(ah); + ret = ERR_PTR(-ENOMEM); + goto bail; + } + + dev->n_ahs_allocated++; + spin_unlock_irqrestore(&dev->n_ahs_lock, flags); + + /* ib_create_ah() will initialize ah->ibah. */ + ah->attr = *ah_attr; + atomic_set(&ah->refcount, 0); + + ret = &ah->ibah; + +bail: + return ret; +} + +struct ib_ah *qib_create_qp0_ah(struct qib_ibport *ibp, u16 dlid) +{ + struct ib_ah_attr attr; + struct ib_ah *ah = ERR_PTR(-EINVAL); + struct qib_qp *qp0; + + memset(&attr, 0, sizeof attr); + attr.dlid = dlid; + attr.port_num = ppd_from_ibp(ibp)->port; + rcu_read_lock(); + qp0 = rcu_dereference(ibp->qp0); + if (qp0) + ah = ib_create_ah(qp0->ibqp.pd, &attr); + rcu_read_unlock(); + return ah; +} + +/** + * qib_destroy_ah - destroy an address handle + * @ibah: the AH to destroy + * + * This may be called from interrupt context. + */ +static int qib_destroy_ah(struct ib_ah *ibah) +{ + struct qib_ibdev *dev = to_idev(ibah->device); + struct qib_ah *ah = to_iah(ibah); + unsigned long flags; + + if (atomic_read(&ah->refcount) != 0) + return -EBUSY; + + spin_lock_irqsave(&dev->n_ahs_lock, flags); + dev->n_ahs_allocated--; + spin_unlock_irqrestore(&dev->n_ahs_lock, flags); + + kfree(ah); + + return 0; +} + +static int qib_modify_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr) +{ + struct qib_ah *ah = to_iah(ibah); + + if (qib_check_ah(ibah->device, ah_attr)) + return -EINVAL; + + ah->attr = *ah_attr; + + return 0; +} + +static int qib_query_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr) +{ + struct qib_ah *ah = to_iah(ibah); + + *ah_attr = ah->attr; + + return 0; +} + +/** + * qib_get_npkeys - return the size of the PKEY table for context 0 + * @dd: the qlogic_ib device + */ +unsigned qib_get_npkeys(struct qib_devdata *dd) +{ + return ARRAY_SIZE(dd->rcd[0]->pkeys); +} + +/* + * Return the indexed PKEY from the port PKEY table. + * No need to validate rcd[ctxt]; the port is setup if we are here. + */ +unsigned qib_get_pkey(struct qib_ibport *ibp, unsigned index) +{ + struct qib_pportdata *ppd = ppd_from_ibp(ibp); + struct qib_devdata *dd = ppd->dd; + unsigned ctxt = ppd->hw_pidx; + unsigned ret; + + /* dd->rcd null if mini_init or some init failures */ + if (!dd->rcd || index >= ARRAY_SIZE(dd->rcd[ctxt]->pkeys)) + ret = 0; + else + ret = dd->rcd[ctxt]->pkeys[index]; + + return ret; +} + +static int qib_query_pkey(struct ib_device *ibdev, u8 port, u16 index, + u16 *pkey) +{ + struct qib_devdata *dd = dd_from_ibdev(ibdev); + int ret; + + if (index >= qib_get_npkeys(dd)) { + ret = -EINVAL; + goto bail; + } + + *pkey = qib_get_pkey(to_iport(ibdev, port), index); + ret = 0; + +bail: + return ret; +} + +/** + * qib_alloc_ucontext - allocate a ucontest + * @ibdev: the infiniband device + * @udata: not used by the QLogic_IB driver + */ + +static struct ib_ucontext *qib_alloc_ucontext(struct ib_device *ibdev, + struct ib_udata *udata) +{ + struct qib_ucontext *context; + struct ib_ucontext *ret; + + context = kmalloc(sizeof *context, GFP_KERNEL); + if (!context) { + ret = ERR_PTR(-ENOMEM); + goto bail; + } + + ret = &context->ibucontext; + +bail: + return ret; +} + +static int qib_dealloc_ucontext(struct ib_ucontext *context) +{ + kfree(to_iucontext(context)); + return 0; +} + +static void init_ibport(struct qib_pportdata *ppd) +{ + struct qib_verbs_counters cntrs; + struct qib_ibport *ibp = &ppd->ibport_data; + + spin_lock_init(&ibp->lock); + /* Set the prefix to the default value (see ch. 4.1.1) */ + ibp->gid_prefix = IB_DEFAULT_GID_PREFIX; + ibp->sm_lid = be16_to_cpu(IB_LID_PERMISSIVE); + ibp->port_cap_flags = IB_PORT_SYS_IMAGE_GUID_SUP | + IB_PORT_CLIENT_REG_SUP | IB_PORT_SL_MAP_SUP | + IB_PORT_TRAP_SUP | IB_PORT_AUTO_MIGR_SUP | + IB_PORT_DR_NOTICE_SUP | IB_PORT_CAP_MASK_NOTICE_SUP | + IB_PORT_OTHER_LOCAL_CHANGES_SUP; + if (ppd->dd->flags & QIB_HAS_LINK_LATENCY) + ibp->port_cap_flags |= IB_PORT_LINK_LATENCY_SUP; + ibp->pma_counter_select[0] = IB_PMA_PORT_XMIT_DATA; + ibp->pma_counter_select[1] = IB_PMA_PORT_RCV_DATA; + ibp->pma_counter_select[2] = IB_PMA_PORT_XMIT_PKTS; + ibp->pma_counter_select[3] = IB_PMA_PORT_RCV_PKTS; + ibp->pma_counter_select[4] = IB_PMA_PORT_XMIT_WAIT; + + /* Snapshot current HW counters to "clear" them. */ + qib_get_counters(ppd, &cntrs); + ibp->z_symbol_error_counter = cntrs.symbol_error_counter; + ibp->z_link_error_recovery_counter = + cntrs.link_error_recovery_counter; + ibp->z_link_downed_counter = cntrs.link_downed_counter; + ibp->z_port_rcv_errors = cntrs.port_rcv_errors; + ibp->z_port_rcv_remphys_errors = cntrs.port_rcv_remphys_errors; + ibp->z_port_xmit_discards = cntrs.port_xmit_discards; + ibp->z_port_xmit_data = cntrs.port_xmit_data; + ibp->z_port_rcv_data = cntrs.port_rcv_data; + ibp->z_port_xmit_packets = cntrs.port_xmit_packets; + ibp->z_port_rcv_packets = cntrs.port_rcv_packets; + ibp->z_local_link_integrity_errors = + cntrs.local_link_integrity_errors; + ibp->z_excessive_buffer_overrun_errors = + cntrs.excessive_buffer_overrun_errors; + ibp->z_vl15_dropped = cntrs.vl15_dropped; + RCU_INIT_POINTER(ibp->qp0, NULL); + RCU_INIT_POINTER(ibp->qp1, NULL); +} + +/** + * qib_register_ib_device - register our device with the infiniband core + * @dd: the device data structure + * Return the allocated qib_ibdev pointer or NULL on error. + */ +int qib_register_ib_device(struct qib_devdata *dd) +{ + struct qib_ibdev *dev = &dd->verbs_dev; + struct ib_device *ibdev = &dev->ibdev; + struct qib_pportdata *ppd = dd->pport; + unsigned i, lk_tab_size; + int ret; + + dev->qp_table_size = ib_qib_qp_table_size; + get_random_bytes(&dev->qp_rnd, sizeof(dev->qp_rnd)); + dev->qp_table = kmalloc(dev->qp_table_size * sizeof *dev->qp_table, + GFP_KERNEL); + if (!dev->qp_table) { + ret = -ENOMEM; + goto err_qpt; + } + for (i = 0; i < dev->qp_table_size; i++) + RCU_INIT_POINTER(dev->qp_table[i], NULL); + + for (i = 0; i < dd->num_pports; i++) + init_ibport(ppd + i); + + /* Only need to initialize non-zero fields. */ + spin_lock_init(&dev->qpt_lock); + spin_lock_init(&dev->n_pds_lock); + spin_lock_init(&dev->n_ahs_lock); + spin_lock_init(&dev->n_cqs_lock); + spin_lock_init(&dev->n_qps_lock); + spin_lock_init(&dev->n_srqs_lock); + spin_lock_init(&dev->n_mcast_grps_lock); + init_timer(&dev->mem_timer); + dev->mem_timer.function = mem_timer; + dev->mem_timer.data = (unsigned long) dev; + + qib_init_qpn_table(dd, &dev->qpn_table); + + /* + * The top ib_qib_lkey_table_size bits are used to index the + * table. The lower 8 bits can be owned by the user (copied from + * the LKEY). The remaining bits act as a generation number or tag. + */ + spin_lock_init(&dev->lk_table.lock); + dev->lk_table.max = 1 << ib_qib_lkey_table_size; + lk_tab_size = dev->lk_table.max * sizeof(*dev->lk_table.table); + dev->lk_table.table = (struct qib_mregion __rcu **) + __get_free_pages(GFP_KERNEL, get_order(lk_tab_size)); + if (dev->lk_table.table == NULL) { + ret = -ENOMEM; + goto err_lk; + } + RCU_INIT_POINTER(dev->dma_mr, NULL); + for (i = 0; i < dev->lk_table.max; i++) + RCU_INIT_POINTER(dev->lk_table.table[i], NULL); + INIT_LIST_HEAD(&dev->pending_mmaps); + spin_lock_init(&dev->pending_lock); + dev->mmap_offset = PAGE_SIZE; + spin_lock_init(&dev->mmap_offset_lock); + INIT_LIST_HEAD(&dev->piowait); + INIT_LIST_HEAD(&dev->dmawait); + INIT_LIST_HEAD(&dev->txwait); + INIT_LIST_HEAD(&dev->memwait); + INIT_LIST_HEAD(&dev->txreq_free); + + if (ppd->sdma_descq_cnt) { + dev->pio_hdrs = dma_alloc_coherent(&dd->pcidev->dev, + ppd->sdma_descq_cnt * + sizeof(struct qib_pio_header), + &dev->pio_hdrs_phys, + GFP_KERNEL); + if (!dev->pio_hdrs) { + ret = -ENOMEM; + goto err_hdrs; + } + } + + for (i = 0; i < ppd->sdma_descq_cnt; i++) { + struct qib_verbs_txreq *tx; + + tx = kzalloc(sizeof *tx, GFP_KERNEL); + if (!tx) { + ret = -ENOMEM; + goto err_tx; + } + tx->hdr_inx = i; + list_add(&tx->txreq.list, &dev->txreq_free); + } + + /* + * The system image GUID is supposed to be the same for all + * IB HCAs in a single system but since there can be other + * device types in the system, we can't be sure this is unique. + */ + if (!ib_qib_sys_image_guid) + ib_qib_sys_image_guid = ppd->guid; + + strlcpy(ibdev->name, "qib%d", IB_DEVICE_NAME_MAX); + ibdev->owner = THIS_MODULE; + ibdev->node_guid = ppd->guid; + ibdev->uverbs_abi_ver = QIB_UVERBS_ABI_VERSION; + ibdev->uverbs_cmd_mask = + (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | + (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) | + (1ull << IB_USER_VERBS_CMD_QUERY_PORT) | + (1ull << IB_USER_VERBS_CMD_ALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_CREATE_AH) | + (1ull << IB_USER_VERBS_CMD_MODIFY_AH) | + (1ull << IB_USER_VERBS_CMD_QUERY_AH) | + (1ull << IB_USER_VERBS_CMD_DESTROY_AH) | + (1ull << IB_USER_VERBS_CMD_REG_MR) | + (1ull << IB_USER_VERBS_CMD_DEREG_MR) | + (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | + (1ull << IB_USER_VERBS_CMD_CREATE_CQ) | + (1ull << IB_USER_VERBS_CMD_RESIZE_CQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) | + (1ull << IB_USER_VERBS_CMD_POLL_CQ) | + (1ull << IB_USER_VERBS_CMD_REQ_NOTIFY_CQ) | + (1ull << IB_USER_VERBS_CMD_CREATE_QP) | + (1ull << IB_USER_VERBS_CMD_QUERY_QP) | + (1ull << IB_USER_VERBS_CMD_MODIFY_QP) | + (1ull << IB_USER_VERBS_CMD_DESTROY_QP) | + (1ull << IB_USER_VERBS_CMD_POST_SEND) | + (1ull << IB_USER_VERBS_CMD_POST_RECV) | + (1ull << IB_USER_VERBS_CMD_ATTACH_MCAST) | + (1ull << IB_USER_VERBS_CMD_DETACH_MCAST) | + (1ull << IB_USER_VERBS_CMD_CREATE_SRQ) | + (1ull << IB_USER_VERBS_CMD_MODIFY_SRQ) | + (1ull << IB_USER_VERBS_CMD_QUERY_SRQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ) | + (1ull << IB_USER_VERBS_CMD_POST_SRQ_RECV); + ibdev->node_type = RDMA_NODE_IB_CA; + ibdev->phys_port_cnt = dd->num_pports; + ibdev->num_comp_vectors = 1; + ibdev->dma_device = &dd->pcidev->dev; + ibdev->query_device = qib_query_device; + ibdev->modify_device = qib_modify_device; + ibdev->query_port = qib_query_port; + ibdev->modify_port = qib_modify_port; + ibdev->query_pkey = qib_query_pkey; + ibdev->query_gid = qib_query_gid; + ibdev->alloc_ucontext = qib_alloc_ucontext; + ibdev->dealloc_ucontext = qib_dealloc_ucontext; + ibdev->alloc_pd = qib_alloc_pd; + ibdev->dealloc_pd = qib_dealloc_pd; + ibdev->create_ah = qib_create_ah; + ibdev->destroy_ah = qib_destroy_ah; + ibdev->modify_ah = qib_modify_ah; + ibdev->query_ah = qib_query_ah; + ibdev->create_srq = qib_create_srq; + ibdev->modify_srq = qib_modify_srq; + ibdev->query_srq = qib_query_srq; + ibdev->destroy_srq = qib_destroy_srq; + ibdev->create_qp = qib_create_qp; + ibdev->modify_qp = qib_modify_qp; + ibdev->query_qp = qib_query_qp; + ibdev->destroy_qp = qib_destroy_qp; + ibdev->post_send = qib_post_send; + ibdev->post_recv = qib_post_receive; + ibdev->post_srq_recv = qib_post_srq_receive; + ibdev->create_cq = qib_create_cq; + ibdev->destroy_cq = qib_destroy_cq; + ibdev->resize_cq = qib_resize_cq; + ibdev->poll_cq = qib_poll_cq; + ibdev->req_notify_cq = qib_req_notify_cq; + ibdev->get_dma_mr = qib_get_dma_mr; + ibdev->reg_phys_mr = qib_reg_phys_mr; + ibdev->reg_user_mr = qib_reg_user_mr; + ibdev->dereg_mr = qib_dereg_mr; + ibdev->alloc_fast_reg_mr = qib_alloc_fast_reg_mr; + ibdev->alloc_fast_reg_page_list = qib_alloc_fast_reg_page_list; + ibdev->free_fast_reg_page_list = qib_free_fast_reg_page_list; + ibdev->alloc_fmr = qib_alloc_fmr; + ibdev->map_phys_fmr = qib_map_phys_fmr; + ibdev->unmap_fmr = qib_unmap_fmr; + ibdev->dealloc_fmr = qib_dealloc_fmr; + ibdev->attach_mcast = qib_multicast_attach; + ibdev->detach_mcast = qib_multicast_detach; + ibdev->process_mad = qib_process_mad; + ibdev->mmap = qib_mmap; + ibdev->dma_ops = &qib_dma_mapping_ops; + + snprintf(ibdev->node_desc, sizeof(ibdev->node_desc), + "Intel Infiniband HCA %s", init_utsname()->nodename); + + ret = ib_register_device(ibdev, qib_create_port_files); + if (ret) + goto err_reg; + + ret = qib_create_agents(dev); + if (ret) + goto err_agents; + + ret = qib_verbs_register_sysfs(dd); + if (ret) + goto err_class; + + goto bail; + +err_class: + qib_free_agents(dev); +err_agents: + ib_unregister_device(ibdev); +err_reg: +err_tx: + while (!list_empty(&dev->txreq_free)) { + struct list_head *l = dev->txreq_free.next; + struct qib_verbs_txreq *tx; + + list_del(l); + tx = list_entry(l, struct qib_verbs_txreq, txreq.list); + kfree(tx); + } + if (ppd->sdma_descq_cnt) + dma_free_coherent(&dd->pcidev->dev, + ppd->sdma_descq_cnt * + sizeof(struct qib_pio_header), + dev->pio_hdrs, dev->pio_hdrs_phys); +err_hdrs: + free_pages((unsigned long) dev->lk_table.table, get_order(lk_tab_size)); +err_lk: + kfree(dev->qp_table); +err_qpt: + qib_dev_err(dd, "cannot register verbs: %d!\n", -ret); +bail: + return ret; +} + +void qib_unregister_ib_device(struct qib_devdata *dd) +{ + struct qib_ibdev *dev = &dd->verbs_dev; + struct ib_device *ibdev = &dev->ibdev; + u32 qps_inuse; + unsigned lk_tab_size; + + qib_verbs_unregister_sysfs(dd); + + qib_free_agents(dev); + + ib_unregister_device(ibdev); + + if (!list_empty(&dev->piowait)) + qib_dev_err(dd, "piowait list not empty!\n"); + if (!list_empty(&dev->dmawait)) + qib_dev_err(dd, "dmawait list not empty!\n"); + if (!list_empty(&dev->txwait)) + qib_dev_err(dd, "txwait list not empty!\n"); + if (!list_empty(&dev->memwait)) + qib_dev_err(dd, "memwait list not empty!\n"); + if (dev->dma_mr) + qib_dev_err(dd, "DMA MR not NULL!\n"); + + qps_inuse = qib_free_all_qps(dd); + if (qps_inuse) + qib_dev_err(dd, "QP memory leak! %u still in use\n", + qps_inuse); + + del_timer_sync(&dev->mem_timer); + qib_free_qpn_table(&dev->qpn_table); + while (!list_empty(&dev->txreq_free)) { + struct list_head *l = dev->txreq_free.next; + struct qib_verbs_txreq *tx; + + list_del(l); + tx = list_entry(l, struct qib_verbs_txreq, txreq.list); + kfree(tx); + } + if (dd->pport->sdma_descq_cnt) + dma_free_coherent(&dd->pcidev->dev, + dd->pport->sdma_descq_cnt * + sizeof(struct qib_pio_header), + dev->pio_hdrs, dev->pio_hdrs_phys); + lk_tab_size = dev->lk_table.max * sizeof(*dev->lk_table.table); + free_pages((unsigned long) dev->lk_table.table, + get_order(lk_tab_size)); + kfree(dev->qp_table); +} + +/* + * This must be called with s_lock held. + */ +void qib_schedule_send(struct qib_qp *qp) +{ + if (qib_send_ok(qp)) { + struct qib_ibport *ibp = + to_iport(qp->ibqp.device, qp->port_num); + struct qib_pportdata *ppd = ppd_from_ibp(ibp); + + queue_work(ppd->qib_wq, &qp->s_work); + } +} diff --git a/drivers/infiniband/hw/qib/qib_verbs.h b/drivers/infiniband/hw/qib/qib_verbs.h new file mode 100644 index 0000000..bfc8948 --- /dev/null +++ b/drivers/infiniband/hw/qib/qib_verbs.h @@ -0,0 +1,1173 @@ +/* + * Copyright (c) 2012, 2013 Intel Corporation. All rights reserved. + * Copyright (c) 2006 - 2012 QLogic Corporation. All rights reserved. + * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef QIB_VERBS_H +#define QIB_VERBS_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct qib_ctxtdata; +struct qib_pportdata; +struct qib_devdata; +struct qib_verbs_txreq; + +#define QIB_MAX_RDMA_ATOMIC 16 +#define QIB_GUIDS_PER_PORT 5 + +#define QPN_MAX (1 << 24) +#define QPNMAP_ENTRIES (QPN_MAX / PAGE_SIZE / BITS_PER_BYTE) + +/* + * Increment this value if any changes that break userspace ABI + * compatibility are made. + */ +#define QIB_UVERBS_ABI_VERSION 2 + +/* + * Define an ib_cq_notify value that is not valid so we know when CQ + * notifications are armed. + */ +#define IB_CQ_NONE (IB_CQ_NEXT_COMP + 1) + +#define IB_SEQ_NAK (3 << 29) + +/* AETH NAK opcode values */ +#define IB_RNR_NAK 0x20 +#define IB_NAK_PSN_ERROR 0x60 +#define IB_NAK_INVALID_REQUEST 0x61 +#define IB_NAK_REMOTE_ACCESS_ERROR 0x62 +#define IB_NAK_REMOTE_OPERATIONAL_ERROR 0x63 +#define IB_NAK_INVALID_RD_REQUEST 0x64 + +/* Flags for checking QP state (see ib_qib_state_ops[]) */ +#define QIB_POST_SEND_OK 0x01 +#define QIB_POST_RECV_OK 0x02 +#define QIB_PROCESS_RECV_OK 0x04 +#define QIB_PROCESS_SEND_OK 0x08 +#define QIB_PROCESS_NEXT_SEND_OK 0x10 +#define QIB_FLUSH_SEND 0x20 +#define QIB_FLUSH_RECV 0x40 +#define QIB_PROCESS_OR_FLUSH_SEND \ + (QIB_PROCESS_SEND_OK | QIB_FLUSH_SEND) + +/* IB Performance Manager status values */ +#define IB_PMA_SAMPLE_STATUS_DONE 0x00 +#define IB_PMA_SAMPLE_STATUS_STARTED 0x01 +#define IB_PMA_SAMPLE_STATUS_RUNNING 0x02 + +/* Mandatory IB performance counter select values. */ +#define IB_PMA_PORT_XMIT_DATA cpu_to_be16(0x0001) +#define IB_PMA_PORT_RCV_DATA cpu_to_be16(0x0002) +#define IB_PMA_PORT_XMIT_PKTS cpu_to_be16(0x0003) +#define IB_PMA_PORT_RCV_PKTS cpu_to_be16(0x0004) +#define IB_PMA_PORT_XMIT_WAIT cpu_to_be16(0x0005) + +#define QIB_VENDOR_IPG cpu_to_be16(0xFFA0) + +#define IB_BTH_REQ_ACK (1 << 31) +#define IB_BTH_SOLICITED (1 << 23) +#define IB_BTH_MIG_REQ (1 << 22) + +/* XXX Should be defined in ib_verbs.h enum ib_port_cap_flags */ +#define IB_PORT_OTHER_LOCAL_CHANGES_SUP (1 << 26) + +#define IB_GRH_VERSION 6 +#define IB_GRH_VERSION_MASK 0xF +#define IB_GRH_VERSION_SHIFT 28 +#define IB_GRH_TCLASS_MASK 0xFF +#define IB_GRH_TCLASS_SHIFT 20 +#define IB_GRH_FLOW_MASK 0xFFFFF +#define IB_GRH_FLOW_SHIFT 0 +#define IB_GRH_NEXT_HDR 0x1B + +#define IB_DEFAULT_GID_PREFIX cpu_to_be64(0xfe80000000000000ULL) + +/* Values for set/get portinfo VLCap OperationalVLs */ +#define IB_VL_VL0 1 +#define IB_VL_VL0_1 2 +#define IB_VL_VL0_3 3 +#define IB_VL_VL0_7 4 +#define IB_VL_VL0_14 5 + +static inline int qib_num_vls(int vls) +{ + switch (vls) { + default: + case IB_VL_VL0: + return 1; + case IB_VL_VL0_1: + return 2; + case IB_VL_VL0_3: + return 4; + case IB_VL_VL0_7: + return 8; + case IB_VL_VL0_14: + return 15; + } +} + +struct ib_reth { + __be64 vaddr; + __be32 rkey; + __be32 length; +} __packed; + +struct ib_atomic_eth { + __be32 vaddr[2]; /* unaligned so access as 2 32-bit words */ + __be32 rkey; + __be64 swap_data; + __be64 compare_data; +} __packed; + +struct qib_other_headers { + __be32 bth[3]; + union { + struct { + __be32 deth[2]; + __be32 imm_data; + } ud; + struct { + struct ib_reth reth; + __be32 imm_data; + } rc; + struct { + __be32 aeth; + __be32 atomic_ack_eth[2]; + } at; + __be32 imm_data; + __be32 aeth; + struct ib_atomic_eth atomic_eth; + } u; +} __packed; + +/* + * Note that UD packets with a GRH header are 8+40+12+8 = 68 bytes + * long (72 w/ imm_data). Only the first 56 bytes of the IB header + * will be in the eager header buffer. The remaining 12 or 16 bytes + * are in the data buffer. + */ +struct qib_ib_header { + __be16 lrh[4]; + union { + struct { + struct ib_grh grh; + struct qib_other_headers oth; + } l; + struct qib_other_headers oth; + } u; +} __packed; + +struct qib_pio_header { + __le32 pbc[2]; + struct qib_ib_header hdr; +} __packed; + +/* + * There is one struct qib_mcast for each multicast GID. + * All attached QPs are then stored as a list of + * struct qib_mcast_qp. + */ +struct qib_mcast_qp { + struct list_head list; + struct qib_qp *qp; +}; + +struct qib_mcast { + struct rb_node rb_node; + union ib_gid mgid; + struct list_head qp_list; + wait_queue_head_t wait; + atomic_t refcount; + int n_attached; +}; + +/* Protection domain */ +struct qib_pd { + struct ib_pd ibpd; + int user; /* non-zero if created from user space */ +}; + +/* Address Handle */ +struct qib_ah { + struct ib_ah ibah; + struct ib_ah_attr attr; + atomic_t refcount; +}; + +/* + * This structure is used by qib_mmap() to validate an offset + * when an mmap() request is made. The vm_area_struct then uses + * this as its vm_private_data. + */ +struct qib_mmap_info { + struct list_head pending_mmaps; + struct ib_ucontext *context; + void *obj; + __u64 offset; + struct kref ref; + unsigned size; +}; + +/* + * This structure is used to contain the head pointer, tail pointer, + * and completion queue entries as a single memory allocation so + * it can be mmap'ed into user space. + */ +struct qib_cq_wc { + u32 head; /* index of next entry to fill */ + u32 tail; /* index of next ib_poll_cq() entry */ + union { + /* these are actually size ibcq.cqe + 1 */ + struct ib_uverbs_wc uqueue[0]; + struct ib_wc kqueue[0]; + }; +}; + +/* + * The completion queue structure. + */ +struct qib_cq { + struct ib_cq ibcq; + struct kthread_work comptask; + struct qib_devdata *dd; + spinlock_t lock; /* protect changes in this struct */ + u8 notify; + u8 triggered; + struct qib_cq_wc *queue; + struct qib_mmap_info *ip; +}; + +/* + * A segment is a linear region of low physical memory. + * XXX Maybe we should use phys addr here and kmap()/kunmap(). + * Used by the verbs layer. + */ +struct qib_seg { + void *vaddr; + size_t length; +}; + +/* The number of qib_segs that fit in a page. */ +#define QIB_SEGSZ (PAGE_SIZE / sizeof(struct qib_seg)) + +struct qib_segarray { + struct qib_seg segs[QIB_SEGSZ]; +}; + +struct qib_mregion { + struct ib_pd *pd; /* shares refcnt of ibmr.pd */ + u64 user_base; /* User's address for this region */ + u64 iova; /* IB start address of this region */ + size_t length; + u32 lkey; + u32 offset; /* offset (bytes) to start of region */ + int access_flags; + u32 max_segs; /* number of qib_segs in all the arrays */ + u32 mapsz; /* size of the map array */ + u8 page_shift; /* 0 - non unform/non powerof2 sizes */ + u8 lkey_published; /* in global table */ + struct completion comp; /* complete when refcount goes to zero */ + struct rcu_head list; + atomic_t refcount; + struct qib_segarray *map[0]; /* the segments */ +}; + +/* + * These keep track of the copy progress within a memory region. + * Used by the verbs layer. + */ +struct qib_sge { + struct qib_mregion *mr; + void *vaddr; /* kernel virtual address of segment */ + u32 sge_length; /* length of the SGE */ + u32 length; /* remaining length of the segment */ + u16 m; /* current index: mr->map[m] */ + u16 n; /* current index: mr->map[m]->segs[n] */ +}; + +/* Memory region */ +struct qib_mr { + struct ib_mr ibmr; + struct ib_umem *umem; + struct qib_mregion mr; /* must be last */ +}; + +/* + * Send work request queue entry. + * The size of the sg_list is determined when the QP is created and stored + * in qp->s_max_sge. + */ +struct qib_swqe { + struct ib_send_wr wr; /* don't use wr.sg_list */ + u32 psn; /* first packet sequence number */ + u32 lpsn; /* last packet sequence number */ + u32 ssn; /* send sequence number */ + u32 length; /* total length of data in sg_list */ + struct qib_sge sg_list[0]; +}; + +/* + * Receive work request queue entry. + * The size of the sg_list is determined when the QP (or SRQ) is created + * and stored in qp->r_rq.max_sge (or srq->rq.max_sge). + */ +struct qib_rwqe { + u64 wr_id; + u8 num_sge; + struct ib_sge sg_list[0]; +}; + +/* + * This structure is used to contain the head pointer, tail pointer, + * and receive work queue entries as a single memory allocation so + * it can be mmap'ed into user space. + * Note that the wq array elements are variable size so you can't + * just index into the array to get the N'th element; + * use get_rwqe_ptr() instead. + */ +struct qib_rwq { + u32 head; /* new work requests posted to the head */ + u32 tail; /* receives pull requests from here. */ + struct qib_rwqe wq[0]; +}; + +struct qib_rq { + struct qib_rwq *wq; + u32 size; /* size of RWQE array */ + u8 max_sge; + spinlock_t lock /* protect changes in this struct */ + ____cacheline_aligned_in_smp; +}; + +struct qib_srq { + struct ib_srq ibsrq; + struct qib_rq rq; + struct qib_mmap_info *ip; + /* send signal when number of RWQEs < limit */ + u32 limit; +}; + +struct qib_sge_state { + struct qib_sge *sg_list; /* next SGE to be used if any */ + struct qib_sge sge; /* progress state for the current SGE */ + u32 total_len; + u8 num_sge; +}; + +/* + * This structure holds the information that the send tasklet needs + * to send a RDMA read response or atomic operation. + */ +struct qib_ack_entry { + u8 opcode; + u8 sent; + u32 psn; + u32 lpsn; + union { + struct qib_sge rdma_sge; + u64 atomic_data; + }; +}; + +/* + * Variables prefixed with s_ are for the requester (sender). + * Variables prefixed with r_ are for the responder (receiver). + * Variables prefixed with ack_ are for responder replies. + * + * Common variables are protected by both r_rq.lock and s_lock in that order + * which only happens in modify_qp() or changing the QP 'state'. + */ +struct qib_qp { + struct ib_qp ibqp; + /* read mostly fields above and below */ + struct ib_ah_attr remote_ah_attr; + struct ib_ah_attr alt_ah_attr; + struct qib_qp __rcu *next; /* link list for QPN hash table */ + struct qib_swqe *s_wq; /* send work queue */ + struct qib_mmap_info *ip; + struct qib_ib_header *s_hdr; /* next packet header to send */ + unsigned long timeout_jiffies; /* computed from timeout */ + + enum ib_mtu path_mtu; + u32 remote_qpn; + u32 pmtu; /* decoded from path_mtu */ + u32 qkey; /* QKEY for this QP (for UD or RD) */ + u32 s_size; /* send work queue size */ + u32 s_rnr_timeout; /* number of milliseconds for RNR timeout */ + + u8 state; /* QP state */ + u8 qp_access_flags; + u8 alt_timeout; /* Alternate path timeout for this QP */ + u8 timeout; /* Timeout for this QP */ + u8 s_srate; + u8 s_mig_state; + u8 port_num; + u8 s_pkey_index; /* PKEY index to use */ + u8 s_alt_pkey_index; /* Alternate path PKEY index to use */ + u8 r_max_rd_atomic; /* max number of RDMA read/atomic to receive */ + u8 s_max_rd_atomic; /* max number of RDMA read/atomic to send */ + u8 s_retry_cnt; /* number of times to retry */ + u8 s_rnr_retry_cnt; + u8 r_min_rnr_timer; /* retry timeout value for RNR NAKs */ + u8 s_max_sge; /* size of s_wq->sg_list */ + u8 s_draining; + + /* start of read/write fields */ + + atomic_t refcount ____cacheline_aligned_in_smp; + wait_queue_head_t wait; + + + struct qib_ack_entry s_ack_queue[QIB_MAX_RDMA_ATOMIC + 1] + ____cacheline_aligned_in_smp; + struct qib_sge_state s_rdma_read_sge; + + spinlock_t r_lock ____cacheline_aligned_in_smp; /* used for APM */ + unsigned long r_aflags; + u64 r_wr_id; /* ID for current receive WQE */ + u32 r_ack_psn; /* PSN for next ACK or atomic ACK */ + u32 r_len; /* total length of r_sge */ + u32 r_rcv_len; /* receive data len processed */ + u32 r_psn; /* expected rcv packet sequence number */ + u32 r_msn; /* message sequence number */ + + u8 r_state; /* opcode of last packet received */ + u8 r_flags; + u8 r_head_ack_queue; /* index into s_ack_queue[] */ + + struct list_head rspwait; /* link for waititing to respond */ + + struct qib_sge_state r_sge; /* current receive data */ + struct qib_rq r_rq; /* receive work queue */ + + spinlock_t s_lock ____cacheline_aligned_in_smp; + struct qib_sge_state *s_cur_sge; + u32 s_flags; + struct qib_verbs_txreq *s_tx; + struct qib_swqe *s_wqe; + struct qib_sge_state s_sge; /* current send request data */ + struct qib_mregion *s_rdma_mr; + atomic_t s_dma_busy; + u32 s_cur_size; /* size of send packet in bytes */ + u32 s_len; /* total length of s_sge */ + u32 s_rdma_read_len; /* total length of s_rdma_read_sge */ + u32 s_next_psn; /* PSN for next request */ + u32 s_last_psn; /* last response PSN processed */ + u32 s_sending_psn; /* lowest PSN that is being sent */ + u32 s_sending_hpsn; /* highest PSN that is being sent */ + u32 s_psn; /* current packet sequence number */ + u32 s_ack_rdma_psn; /* PSN for sending RDMA read responses */ + u32 s_ack_psn; /* PSN for acking sends and RDMA writes */ + u32 s_head; /* new entries added here */ + u32 s_tail; /* next entry to process */ + u32 s_cur; /* current work queue entry */ + u32 s_acked; /* last un-ACK'ed entry */ + u32 s_last; /* last completed entry */ + u32 s_ssn; /* SSN of tail entry */ + u32 s_lsn; /* limit sequence number (credit) */ + u16 s_hdrwords; /* size of s_hdr in 32 bit words */ + u16 s_rdma_ack_cnt; + u8 s_state; /* opcode of last packet sent */ + u8 s_ack_state; /* opcode of packet to ACK */ + u8 s_nak_state; /* non-zero if NAK is pending */ + u8 r_nak_state; /* non-zero if NAK is pending */ + u8 s_retry; /* requester retry counter */ + u8 s_rnr_retry; /* requester RNR retry counter */ + u8 s_num_rd_atomic; /* number of RDMA read/atomic pending */ + u8 s_tail_ack_queue; /* index into s_ack_queue[] */ + + struct qib_sge_state s_ack_rdma_sge; + struct timer_list s_timer; + struct list_head iowait; /* link for wait PIO buf */ + + struct work_struct s_work; + + wait_queue_head_t wait_dma; + + struct qib_sge r_sg_list[0] /* verified SGEs */ + ____cacheline_aligned_in_smp; +}; + +/* + * Atomic bit definitions for r_aflags. + */ +#define QIB_R_WRID_VALID 0 +#define QIB_R_REWIND_SGE 1 + +/* + * Bit definitions for r_flags. + */ +#define QIB_R_REUSE_SGE 0x01 +#define QIB_R_RDMAR_SEQ 0x02 +#define QIB_R_RSP_NAK 0x04 +#define QIB_R_RSP_SEND 0x08 +#define QIB_R_COMM_EST 0x10 + +/* + * Bit definitions for s_flags. + * + * QIB_S_SIGNAL_REQ_WR - set if QP send WRs contain completion signaled + * QIB_S_BUSY - send tasklet is processing the QP + * QIB_S_TIMER - the RC retry timer is active + * QIB_S_ACK_PENDING - an ACK is waiting to be sent after RDMA read/atomics + * QIB_S_WAIT_FENCE - waiting for all prior RDMA read or atomic SWQEs + * before processing the next SWQE + * QIB_S_WAIT_RDMAR - waiting for a RDMA read or atomic SWQE to complete + * before processing the next SWQE + * QIB_S_WAIT_RNR - waiting for RNR timeout + * QIB_S_WAIT_SSN_CREDIT - waiting for RC credits to process next SWQE + * QIB_S_WAIT_DMA - waiting for send DMA queue to drain before generating + * next send completion entry not via send DMA + * QIB_S_WAIT_PIO - waiting for a send buffer to be available + * QIB_S_WAIT_TX - waiting for a struct qib_verbs_txreq to be available + * QIB_S_WAIT_DMA_DESC - waiting for DMA descriptors to be available + * QIB_S_WAIT_KMEM - waiting for kernel memory to be available + * QIB_S_WAIT_PSN - waiting for a packet to exit the send DMA queue + * QIB_S_WAIT_ACK - waiting for an ACK packet before sending more requests + * QIB_S_SEND_ONE - send one packet, request ACK, then wait for ACK + */ +#define QIB_S_SIGNAL_REQ_WR 0x0001 +#define QIB_S_BUSY 0x0002 +#define QIB_S_TIMER 0x0004 +#define QIB_S_RESP_PENDING 0x0008 +#define QIB_S_ACK_PENDING 0x0010 +#define QIB_S_WAIT_FENCE 0x0020 +#define QIB_S_WAIT_RDMAR 0x0040 +#define QIB_S_WAIT_RNR 0x0080 +#define QIB_S_WAIT_SSN_CREDIT 0x0100 +#define QIB_S_WAIT_DMA 0x0200 +#define QIB_S_WAIT_PIO 0x0400 +#define QIB_S_WAIT_TX 0x0800 +#define QIB_S_WAIT_DMA_DESC 0x1000 +#define QIB_S_WAIT_KMEM 0x2000 +#define QIB_S_WAIT_PSN 0x4000 +#define QIB_S_WAIT_ACK 0x8000 +#define QIB_S_SEND_ONE 0x10000 +#define QIB_S_UNLIMITED_CREDIT 0x20000 + +/* + * Wait flags that would prevent any packet type from being sent. + */ +#define QIB_S_ANY_WAIT_IO (QIB_S_WAIT_PIO | QIB_S_WAIT_TX | \ + QIB_S_WAIT_DMA_DESC | QIB_S_WAIT_KMEM) + +/* + * Wait flags that would prevent send work requests from making progress. + */ +#define QIB_S_ANY_WAIT_SEND (QIB_S_WAIT_FENCE | QIB_S_WAIT_RDMAR | \ + QIB_S_WAIT_RNR | QIB_S_WAIT_SSN_CREDIT | QIB_S_WAIT_DMA | \ + QIB_S_WAIT_PSN | QIB_S_WAIT_ACK) + +#define QIB_S_ANY_WAIT (QIB_S_ANY_WAIT_IO | QIB_S_ANY_WAIT_SEND) + +#define QIB_PSN_CREDIT 16 + +/* + * Since struct qib_swqe is not a fixed size, we can't simply index into + * struct qib_qp.s_wq. This function does the array index computation. + */ +static inline struct qib_swqe *get_swqe_ptr(struct qib_qp *qp, + unsigned n) +{ + return (struct qib_swqe *)((char *)qp->s_wq + + (sizeof(struct qib_swqe) + + qp->s_max_sge * + sizeof(struct qib_sge)) * n); +} + +/* + * Since struct qib_rwqe is not a fixed size, we can't simply index into + * struct qib_rwq.wq. This function does the array index computation. + */ +static inline struct qib_rwqe *get_rwqe_ptr(struct qib_rq *rq, unsigned n) +{ + return (struct qib_rwqe *) + ((char *) rq->wq->wq + + (sizeof(struct qib_rwqe) + + rq->max_sge * sizeof(struct ib_sge)) * n); +} + +/* + * QPN-map pages start out as NULL, they get allocated upon + * first use and are never deallocated. This way, + * large bitmaps are not allocated unless large numbers of QPs are used. + */ +struct qpn_map { + void *page; +}; + +struct qib_qpn_table { + spinlock_t lock; /* protect changes in this struct */ + unsigned flags; /* flags for QP0/1 allocated for each port */ + u32 last; /* last QP number allocated */ + u32 nmaps; /* size of the map table */ + u16 limit; + u16 mask; + /* bit map of free QP numbers other than 0/1 */ + struct qpn_map map[QPNMAP_ENTRIES]; +}; + +struct qib_lkey_table { + spinlock_t lock; /* protect changes in this struct */ + u32 next; /* next unused index (speeds search) */ + u32 gen; /* generation count */ + u32 max; /* size of the table */ + struct qib_mregion __rcu **table; +}; + +struct qib_opcode_stats { + u64 n_packets; /* number of packets */ + u64 n_bytes; /* total number of bytes */ +}; + +struct qib_opcode_stats_perctx { + struct qib_opcode_stats stats[128]; +}; + +struct qib_pma_counters { + u64 n_unicast_xmit; /* total unicast packets sent */ + u64 n_unicast_rcv; /* total unicast packets received */ + u64 n_multicast_xmit; /* total multicast packets sent */ + u64 n_multicast_rcv; /* total multicast packets received */ +}; + +struct qib_ibport { + struct qib_qp __rcu *qp0; + struct qib_qp __rcu *qp1; + struct ib_mad_agent *send_agent; /* agent for SMI (traps) */ + struct qib_ah *sm_ah; + struct qib_ah *smi_ah; + struct rb_root mcast_tree; + spinlock_t lock; /* protect changes in this struct */ + + /* non-zero when timer is set */ + unsigned long mkey_lease_timeout; + unsigned long trap_timeout; + __be64 gid_prefix; /* in network order */ + __be64 mkey; + __be64 guids[QIB_GUIDS_PER_PORT - 1]; /* writable GUIDs */ + u64 tid; /* TID for traps */ + struct qib_pma_counters __percpu *pmastats; + u64 z_unicast_xmit; /* starting count for PMA */ + u64 z_unicast_rcv; /* starting count for PMA */ + u64 z_multicast_xmit; /* starting count for PMA */ + u64 z_multicast_rcv; /* starting count for PMA */ + u64 z_symbol_error_counter; /* starting count for PMA */ + u64 z_link_error_recovery_counter; /* starting count for PMA */ + u64 z_link_downed_counter; /* starting count for PMA */ + u64 z_port_rcv_errors; /* starting count for PMA */ + u64 z_port_rcv_remphys_errors; /* starting count for PMA */ + u64 z_port_xmit_discards; /* starting count for PMA */ + u64 z_port_xmit_data; /* starting count for PMA */ + u64 z_port_rcv_data; /* starting count for PMA */ + u64 z_port_xmit_packets; /* starting count for PMA */ + u64 z_port_rcv_packets; /* starting count for PMA */ + u32 z_local_link_integrity_errors; /* starting count for PMA */ + u32 z_excessive_buffer_overrun_errors; /* starting count for PMA */ + u32 z_vl15_dropped; /* starting count for PMA */ + u32 n_rc_resends; + u32 n_rc_acks; + u32 n_rc_qacks; + u32 n_rc_delayed_comp; + u32 n_seq_naks; + u32 n_rdma_seq; + u32 n_rnr_naks; + u32 n_other_naks; + u32 n_loop_pkts; + u32 n_pkt_drops; + u32 n_vl15_dropped; + u32 n_rc_timeouts; + u32 n_dmawait; + u32 n_unaligned; + u32 n_rc_dupreq; + u32 n_rc_seqnak; + u32 port_cap_flags; + u32 pma_sample_start; + u32 pma_sample_interval; + __be16 pma_counter_select[5]; + u16 pma_tag; + u16 pkey_violations; + u16 qkey_violations; + u16 mkey_violations; + u16 mkey_lease_period; + u16 sm_lid; + u16 repress_traps; + u8 sm_sl; + u8 mkeyprot; + u8 subnet_timeout; + u8 vl_high_limit; + u8 sl_to_vl[16]; + +}; + + +struct qib_ibdev { + struct ib_device ibdev; + struct list_head pending_mmaps; + spinlock_t mmap_offset_lock; /* protect mmap_offset */ + u32 mmap_offset; + struct qib_mregion __rcu *dma_mr; + + /* QP numbers are shared by all IB ports */ + struct qib_qpn_table qpn_table; + struct qib_lkey_table lk_table; + struct list_head piowait; /* list for wait PIO buf */ + struct list_head dmawait; /* list for wait DMA */ + struct list_head txwait; /* list for wait qib_verbs_txreq */ + struct list_head memwait; /* list for wait kernel memory */ + struct list_head txreq_free; + struct timer_list mem_timer; + struct qib_qp __rcu **qp_table; + struct qib_pio_header *pio_hdrs; + dma_addr_t pio_hdrs_phys; + /* list of QPs waiting for RNR timer */ + spinlock_t pending_lock; /* protect wait lists, PMA counters, etc. */ + u32 qp_table_size; /* size of the hash table */ + u32 qp_rnd; /* random bytes for hash */ + spinlock_t qpt_lock; + + u32 n_piowait; + u32 n_txwait; + + u32 n_pds_allocated; /* number of PDs allocated for device */ + spinlock_t n_pds_lock; + u32 n_ahs_allocated; /* number of AHs allocated for device */ + spinlock_t n_ahs_lock; + u32 n_cqs_allocated; /* number of CQs allocated for device */ + spinlock_t n_cqs_lock; + u32 n_qps_allocated; /* number of QPs allocated for device */ + spinlock_t n_qps_lock; + u32 n_srqs_allocated; /* number of SRQs allocated for device */ + spinlock_t n_srqs_lock; + u32 n_mcast_grps_allocated; /* number of mcast groups allocated */ + spinlock_t n_mcast_grps_lock; +#ifdef CONFIG_DEBUG_FS + /* per HCA debugfs */ + struct dentry *qib_ibdev_dbg; +#endif +}; + +struct qib_verbs_counters { + u64 symbol_error_counter; + u64 link_error_recovery_counter; + u64 link_downed_counter; + u64 port_rcv_errors; + u64 port_rcv_remphys_errors; + u64 port_xmit_discards; + u64 port_xmit_data; + u64 port_rcv_data; + u64 port_xmit_packets; + u64 port_rcv_packets; + u32 local_link_integrity_errors; + u32 excessive_buffer_overrun_errors; + u32 vl15_dropped; +}; + +static inline struct qib_mr *to_imr(struct ib_mr *ibmr) +{ + return container_of(ibmr, struct qib_mr, ibmr); +} + +static inline struct qib_pd *to_ipd(struct ib_pd *ibpd) +{ + return container_of(ibpd, struct qib_pd, ibpd); +} + +static inline struct qib_ah *to_iah(struct ib_ah *ibah) +{ + return container_of(ibah, struct qib_ah, ibah); +} + +static inline struct qib_cq *to_icq(struct ib_cq *ibcq) +{ + return container_of(ibcq, struct qib_cq, ibcq); +} + +static inline struct qib_srq *to_isrq(struct ib_srq *ibsrq) +{ + return container_of(ibsrq, struct qib_srq, ibsrq); +} + +static inline struct qib_qp *to_iqp(struct ib_qp *ibqp) +{ + return container_of(ibqp, struct qib_qp, ibqp); +} + +static inline struct qib_ibdev *to_idev(struct ib_device *ibdev) +{ + return container_of(ibdev, struct qib_ibdev, ibdev); +} + +/* + * Send if not busy or waiting for I/O and either + * a RC response is pending or we can process send work requests. + */ +static inline int qib_send_ok(struct qib_qp *qp) +{ + return !(qp->s_flags & (QIB_S_BUSY | QIB_S_ANY_WAIT_IO)) && + (qp->s_hdrwords || (qp->s_flags & QIB_S_RESP_PENDING) || + !(qp->s_flags & QIB_S_ANY_WAIT_SEND)); +} + +/* + * This must be called with s_lock held. + */ +void qib_schedule_send(struct qib_qp *qp); + +static inline int qib_pkey_ok(u16 pkey1, u16 pkey2) +{ + u16 p1 = pkey1 & 0x7FFF; + u16 p2 = pkey2 & 0x7FFF; + + /* + * Low 15 bits must be non-zero and match, and + * one of the two must be a full member. + */ + return p1 && p1 == p2 && ((__s16)pkey1 < 0 || (__s16)pkey2 < 0); +} + +void qib_bad_pqkey(struct qib_ibport *ibp, __be16 trap_num, u32 key, u32 sl, + u32 qp1, u32 qp2, __be16 lid1, __be16 lid2); +void qib_cap_mask_chg(struct qib_ibport *ibp); +void qib_sys_guid_chg(struct qib_ibport *ibp); +void qib_node_desc_chg(struct qib_ibport *ibp); +int qib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, + struct ib_wc *in_wc, struct ib_grh *in_grh, + struct ib_mad *in_mad, struct ib_mad *out_mad); +int qib_create_agents(struct qib_ibdev *dev); +void qib_free_agents(struct qib_ibdev *dev); + +/* + * Compare the lower 24 bits of the two values. + * Returns an integer <, ==, or > than zero. + */ +static inline int qib_cmp24(u32 a, u32 b) +{ + return (((int) a) - ((int) b)) << 8; +} + +struct qib_mcast *qib_mcast_find(struct qib_ibport *ibp, union ib_gid *mgid); + +int qib_snapshot_counters(struct qib_pportdata *ppd, u64 *swords, + u64 *rwords, u64 *spkts, u64 *rpkts, + u64 *xmit_wait); + +int qib_get_counters(struct qib_pportdata *ppd, + struct qib_verbs_counters *cntrs); + +int qib_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid); + +int qib_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid); + +int qib_mcast_tree_empty(struct qib_ibport *ibp); + +__be32 qib_compute_aeth(struct qib_qp *qp); + +struct qib_qp *qib_lookup_qpn(struct qib_ibport *ibp, u32 qpn); + +struct ib_qp *qib_create_qp(struct ib_pd *ibpd, + struct ib_qp_init_attr *init_attr, + struct ib_udata *udata); + +int qib_destroy_qp(struct ib_qp *ibqp); + +int qib_error_qp(struct qib_qp *qp, enum ib_wc_status err); + +int qib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata); + +int qib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_qp_init_attr *init_attr); + +unsigned qib_free_all_qps(struct qib_devdata *dd); + +void qib_init_qpn_table(struct qib_devdata *dd, struct qib_qpn_table *qpt); + +void qib_free_qpn_table(struct qib_qpn_table *qpt); + +#ifdef CONFIG_DEBUG_FS + +struct qib_qp_iter; + +struct qib_qp_iter *qib_qp_iter_init(struct qib_ibdev *dev); + +int qib_qp_iter_next(struct qib_qp_iter *iter); + +void qib_qp_iter_print(struct seq_file *s, struct qib_qp_iter *iter); + +#endif + +void qib_get_credit(struct qib_qp *qp, u32 aeth); + +unsigned qib_pkt_delay(u32 plen, u8 snd_mult, u8 rcv_mult); + +void qib_verbs_sdma_desc_avail(struct qib_pportdata *ppd, unsigned avail); + +void qib_put_txreq(struct qib_verbs_txreq *tx); + +int qib_verbs_send(struct qib_qp *qp, struct qib_ib_header *hdr, + u32 hdrwords, struct qib_sge_state *ss, u32 len); + +void qib_copy_sge(struct qib_sge_state *ss, void *data, u32 length, + int release); + +void qib_skip_sge(struct qib_sge_state *ss, u32 length, int release); + +void qib_uc_rcv(struct qib_ibport *ibp, struct qib_ib_header *hdr, + int has_grh, void *data, u32 tlen, struct qib_qp *qp); + +void qib_rc_rcv(struct qib_ctxtdata *rcd, struct qib_ib_header *hdr, + int has_grh, void *data, u32 tlen, struct qib_qp *qp); + +int qib_check_ah(struct ib_device *ibdev, struct ib_ah_attr *ah_attr); + +struct ib_ah *qib_create_qp0_ah(struct qib_ibport *ibp, u16 dlid); + +void qib_rc_rnr_retry(unsigned long arg); + +void qib_rc_send_complete(struct qib_qp *qp, struct qib_ib_header *hdr); + +void qib_rc_error(struct qib_qp *qp, enum ib_wc_status err); + +int qib_post_ud_send(struct qib_qp *qp, struct ib_send_wr *wr); + +void qib_ud_rcv(struct qib_ibport *ibp, struct qib_ib_header *hdr, + int has_grh, void *data, u32 tlen, struct qib_qp *qp); + +int qib_alloc_lkey(struct qib_mregion *mr, int dma_region); + +void qib_free_lkey(struct qib_mregion *mr); + +int qib_lkey_ok(struct qib_lkey_table *rkt, struct qib_pd *pd, + struct qib_sge *isge, struct ib_sge *sge, int acc); + +int qib_rkey_ok(struct qib_qp *qp, struct qib_sge *sge, + u32 len, u64 vaddr, u32 rkey, int acc); + +int qib_post_srq_receive(struct ib_srq *ibsrq, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr); + +struct ib_srq *qib_create_srq(struct ib_pd *ibpd, + struct ib_srq_init_attr *srq_init_attr, + struct ib_udata *udata); + +int qib_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, + enum ib_srq_attr_mask attr_mask, + struct ib_udata *udata); + +int qib_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr); + +int qib_destroy_srq(struct ib_srq *ibsrq); + +int qib_cq_init(struct qib_devdata *dd); + +void qib_cq_exit(struct qib_devdata *dd); + +void qib_cq_enter(struct qib_cq *cq, struct ib_wc *entry, int sig); + +int qib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry); + +struct ib_cq *qib_create_cq(struct ib_device *ibdev, int entries, + int comp_vector, struct ib_ucontext *context, + struct ib_udata *udata); + +int qib_destroy_cq(struct ib_cq *ibcq); + +int qib_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags notify_flags); + +int qib_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata); + +struct ib_mr *qib_get_dma_mr(struct ib_pd *pd, int acc); + +struct ib_mr *qib_reg_phys_mr(struct ib_pd *pd, + struct ib_phys_buf *buffer_list, + int num_phys_buf, int acc, u64 *iova_start); + +struct ib_mr *qib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, + u64 virt_addr, int mr_access_flags, + struct ib_udata *udata); + +int qib_dereg_mr(struct ib_mr *ibmr); + +struct ib_mr *qib_alloc_fast_reg_mr(struct ib_pd *pd, int max_page_list_len); + +struct ib_fast_reg_page_list *qib_alloc_fast_reg_page_list( + struct ib_device *ibdev, int page_list_len); + +void qib_free_fast_reg_page_list(struct ib_fast_reg_page_list *pl); + +int qib_fast_reg_mr(struct qib_qp *qp, struct ib_send_wr *wr); + +struct ib_fmr *qib_alloc_fmr(struct ib_pd *pd, int mr_access_flags, + struct ib_fmr_attr *fmr_attr); + +int qib_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list, + int list_len, u64 iova); + +int qib_unmap_fmr(struct list_head *fmr_list); + +int qib_dealloc_fmr(struct ib_fmr *ibfmr); + +static inline void qib_get_mr(struct qib_mregion *mr) +{ + atomic_inc(&mr->refcount); +} + +void mr_rcu_callback(struct rcu_head *list); + +static inline void qib_put_mr(struct qib_mregion *mr) +{ + if (unlikely(atomic_dec_and_test(&mr->refcount))) + call_rcu(&mr->list, mr_rcu_callback); +} + +static inline void qib_put_ss(struct qib_sge_state *ss) +{ + while (ss->num_sge) { + qib_put_mr(ss->sge.mr); + if (--ss->num_sge) + ss->sge = *ss->sg_list++; + } +} + + +void qib_release_mmap_info(struct kref *ref); + +struct qib_mmap_info *qib_create_mmap_info(struct qib_ibdev *dev, u32 size, + struct ib_ucontext *context, + void *obj); + +void qib_update_mmap_info(struct qib_ibdev *dev, struct qib_mmap_info *ip, + u32 size, void *obj); + +int qib_mmap(struct ib_ucontext *context, struct vm_area_struct *vma); + +int qib_get_rwqe(struct qib_qp *qp, int wr_id_only); + +void qib_migrate_qp(struct qib_qp *qp); + +int qib_ruc_check_hdr(struct qib_ibport *ibp, struct qib_ib_header *hdr, + int has_grh, struct qib_qp *qp, u32 bth0); + +u32 qib_make_grh(struct qib_ibport *ibp, struct ib_grh *hdr, + struct ib_global_route *grh, u32 hwords, u32 nwords); + +void qib_make_ruc_header(struct qib_qp *qp, struct qib_other_headers *ohdr, + u32 bth0, u32 bth2); + +void qib_do_send(struct work_struct *work); + +void qib_send_complete(struct qib_qp *qp, struct qib_swqe *wqe, + enum ib_wc_status status); + +void qib_send_rc_ack(struct qib_qp *qp); + +int qib_make_rc_req(struct qib_qp *qp); + +int qib_make_uc_req(struct qib_qp *qp); + +int qib_make_ud_req(struct qib_qp *qp); + +int qib_register_ib_device(struct qib_devdata *); + +void qib_unregister_ib_device(struct qib_devdata *); + +void qib_ib_rcv(struct qib_ctxtdata *, void *, void *, u32); + +void qib_ib_piobufavail(struct qib_devdata *); + +unsigned qib_get_npkeys(struct qib_devdata *); + +unsigned qib_get_pkey(struct qib_ibport *, unsigned); + +extern const enum ib_wc_opcode ib_qib_wc_opcode[]; + +/* + * Below HCA-independent IB PhysPortState values, returned + * by the f_ibphys_portstate() routine. + */ +#define IB_PHYSPORTSTATE_SLEEP 1 +#define IB_PHYSPORTSTATE_POLL 2 +#define IB_PHYSPORTSTATE_DISABLED 3 +#define IB_PHYSPORTSTATE_CFG_TRAIN 4 +#define IB_PHYSPORTSTATE_LINKUP 5 +#define IB_PHYSPORTSTATE_LINK_ERR_RECOVER 6 +#define IB_PHYSPORTSTATE_CFG_DEBOUNCE 8 +#define IB_PHYSPORTSTATE_CFG_IDLE 0xB +#define IB_PHYSPORTSTATE_RECOVERY_RETRAIN 0xC +#define IB_PHYSPORTSTATE_RECOVERY_WAITRMT 0xE +#define IB_PHYSPORTSTATE_RECOVERY_IDLE 0xF +#define IB_PHYSPORTSTATE_CFG_ENH 0x10 +#define IB_PHYSPORTSTATE_CFG_WAIT_ENH 0x13 + +extern const int ib_qib_state_ops[]; + +extern __be64 ib_qib_sys_image_guid; /* in network order */ + +extern unsigned int ib_qib_lkey_table_size; + +extern unsigned int ib_qib_max_cqes; + +extern unsigned int ib_qib_max_cqs; + +extern unsigned int ib_qib_max_qp_wrs; + +extern unsigned int ib_qib_max_qps; + +extern unsigned int ib_qib_max_sges; + +extern unsigned int ib_qib_max_mcast_grps; + +extern unsigned int ib_qib_max_mcast_qp_attached; + +extern unsigned int ib_qib_max_srqs; + +extern unsigned int ib_qib_max_srq_sges; + +extern unsigned int ib_qib_max_srq_wrs; + +extern const u32 ib_qib_rnr_table[]; + +extern struct ib_dma_mapping_ops qib_dma_mapping_ops; + +#endif /* QIB_VERBS_H */ diff --git a/drivers/infiniband/hw/qib/qib_verbs_mcast.c b/drivers/infiniband/hw/qib/qib_verbs_mcast.c new file mode 100644 index 0000000..dabb697 --- /dev/null +++ b/drivers/infiniband/hw/qib/qib_verbs_mcast.c @@ -0,0 +1,368 @@ +/* + * Copyright (c) 2006, 2007, 2008, 2009 QLogic Corporation. All rights reserved. + * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include + +#include "qib.h" + +/** + * qib_mcast_qp_alloc - alloc a struct to link a QP to mcast GID struct + * @qp: the QP to link + */ +static struct qib_mcast_qp *qib_mcast_qp_alloc(struct qib_qp *qp) +{ + struct qib_mcast_qp *mqp; + + mqp = kmalloc(sizeof *mqp, GFP_KERNEL); + if (!mqp) + goto bail; + + mqp->qp = qp; + atomic_inc(&qp->refcount); + +bail: + return mqp; +} + +static void qib_mcast_qp_free(struct qib_mcast_qp *mqp) +{ + struct qib_qp *qp = mqp->qp; + + /* Notify qib_destroy_qp() if it is waiting. */ + if (atomic_dec_and_test(&qp->refcount)) + wake_up(&qp->wait); + + kfree(mqp); +} + +/** + * qib_mcast_alloc - allocate the multicast GID structure + * @mgid: the multicast GID + * + * A list of QPs will be attached to this structure. + */ +static struct qib_mcast *qib_mcast_alloc(union ib_gid *mgid) +{ + struct qib_mcast *mcast; + + mcast = kmalloc(sizeof *mcast, GFP_KERNEL); + if (!mcast) + goto bail; + + mcast->mgid = *mgid; + INIT_LIST_HEAD(&mcast->qp_list); + init_waitqueue_head(&mcast->wait); + atomic_set(&mcast->refcount, 0); + mcast->n_attached = 0; + +bail: + return mcast; +} + +static void qib_mcast_free(struct qib_mcast *mcast) +{ + struct qib_mcast_qp *p, *tmp; + + list_for_each_entry_safe(p, tmp, &mcast->qp_list, list) + qib_mcast_qp_free(p); + + kfree(mcast); +} + +/** + * qib_mcast_find - search the global table for the given multicast GID + * @ibp: the IB port structure + * @mgid: the multicast GID to search for + * + * Returns NULL if not found. + * + * The caller is responsible for decrementing the reference count if found. + */ +struct qib_mcast *qib_mcast_find(struct qib_ibport *ibp, union ib_gid *mgid) +{ + struct rb_node *n; + unsigned long flags; + struct qib_mcast *mcast; + + spin_lock_irqsave(&ibp->lock, flags); + n = ibp->mcast_tree.rb_node; + while (n) { + int ret; + + mcast = rb_entry(n, struct qib_mcast, rb_node); + + ret = memcmp(mgid->raw, mcast->mgid.raw, + sizeof(union ib_gid)); + if (ret < 0) + n = n->rb_left; + else if (ret > 0) + n = n->rb_right; + else { + atomic_inc(&mcast->refcount); + spin_unlock_irqrestore(&ibp->lock, flags); + goto bail; + } + } + spin_unlock_irqrestore(&ibp->lock, flags); + + mcast = NULL; + +bail: + return mcast; +} + +/** + * qib_mcast_add - insert mcast GID into table and attach QP struct + * @mcast: the mcast GID table + * @mqp: the QP to attach + * + * Return zero if both were added. Return EEXIST if the GID was already in + * the table but the QP was added. Return ESRCH if the QP was already + * attached and neither structure was added. + */ +static int qib_mcast_add(struct qib_ibdev *dev, struct qib_ibport *ibp, + struct qib_mcast *mcast, struct qib_mcast_qp *mqp) +{ + struct rb_node **n = &ibp->mcast_tree.rb_node; + struct rb_node *pn = NULL; + int ret; + + spin_lock_irq(&ibp->lock); + + while (*n) { + struct qib_mcast *tmcast; + struct qib_mcast_qp *p; + + pn = *n; + tmcast = rb_entry(pn, struct qib_mcast, rb_node); + + ret = memcmp(mcast->mgid.raw, tmcast->mgid.raw, + sizeof(union ib_gid)); + if (ret < 0) { + n = &pn->rb_left; + continue; + } + if (ret > 0) { + n = &pn->rb_right; + continue; + } + + /* Search the QP list to see if this is already there. */ + list_for_each_entry_rcu(p, &tmcast->qp_list, list) { + if (p->qp == mqp->qp) { + ret = ESRCH; + goto bail; + } + } + if (tmcast->n_attached == ib_qib_max_mcast_qp_attached) { + ret = ENOMEM; + goto bail; + } + + tmcast->n_attached++; + + list_add_tail_rcu(&mqp->list, &tmcast->qp_list); + ret = EEXIST; + goto bail; + } + + spin_lock(&dev->n_mcast_grps_lock); + if (dev->n_mcast_grps_allocated == ib_qib_max_mcast_grps) { + spin_unlock(&dev->n_mcast_grps_lock); + ret = ENOMEM; + goto bail; + } + + dev->n_mcast_grps_allocated++; + spin_unlock(&dev->n_mcast_grps_lock); + + mcast->n_attached++; + + list_add_tail_rcu(&mqp->list, &mcast->qp_list); + + atomic_inc(&mcast->refcount); + rb_link_node(&mcast->rb_node, pn, n); + rb_insert_color(&mcast->rb_node, &ibp->mcast_tree); + + ret = 0; + +bail: + spin_unlock_irq(&ibp->lock); + + return ret; +} + +int qib_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) +{ + struct qib_qp *qp = to_iqp(ibqp); + struct qib_ibdev *dev = to_idev(ibqp->device); + struct qib_ibport *ibp; + struct qib_mcast *mcast; + struct qib_mcast_qp *mqp; + int ret; + + if (ibqp->qp_num <= 1 || qp->state == IB_QPS_RESET) { + ret = -EINVAL; + goto bail; + } + + /* + * Allocate data structures since its better to do this outside of + * spin locks and it will most likely be needed. + */ + mcast = qib_mcast_alloc(gid); + if (mcast == NULL) { + ret = -ENOMEM; + goto bail; + } + mqp = qib_mcast_qp_alloc(qp); + if (mqp == NULL) { + qib_mcast_free(mcast); + ret = -ENOMEM; + goto bail; + } + ibp = to_iport(ibqp->device, qp->port_num); + switch (qib_mcast_add(dev, ibp, mcast, mqp)) { + case ESRCH: + /* Neither was used: OK to attach the same QP twice. */ + qib_mcast_qp_free(mqp); + qib_mcast_free(mcast); + break; + + case EEXIST: /* The mcast wasn't used */ + qib_mcast_free(mcast); + break; + + case ENOMEM: + /* Exceeded the maximum number of mcast groups. */ + qib_mcast_qp_free(mqp); + qib_mcast_free(mcast); + ret = -ENOMEM; + goto bail; + + default: + break; + } + + ret = 0; + +bail: + return ret; +} + +int qib_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) +{ + struct qib_qp *qp = to_iqp(ibqp); + struct qib_ibdev *dev = to_idev(ibqp->device); + struct qib_ibport *ibp = to_iport(ibqp->device, qp->port_num); + struct qib_mcast *mcast = NULL; + struct qib_mcast_qp *p, *tmp; + struct rb_node *n; + int last = 0; + int ret; + + if (ibqp->qp_num <= 1 || qp->state == IB_QPS_RESET) { + ret = -EINVAL; + goto bail; + } + + spin_lock_irq(&ibp->lock); + + /* Find the GID in the mcast table. */ + n = ibp->mcast_tree.rb_node; + while (1) { + if (n == NULL) { + spin_unlock_irq(&ibp->lock); + ret = -EINVAL; + goto bail; + } + + mcast = rb_entry(n, struct qib_mcast, rb_node); + ret = memcmp(gid->raw, mcast->mgid.raw, + sizeof(union ib_gid)); + if (ret < 0) + n = n->rb_left; + else if (ret > 0) + n = n->rb_right; + else + break; + } + + /* Search the QP list. */ + list_for_each_entry_safe(p, tmp, &mcast->qp_list, list) { + if (p->qp != qp) + continue; + /* + * We found it, so remove it, but don't poison the forward + * link until we are sure there are no list walkers. + */ + list_del_rcu(&p->list); + mcast->n_attached--; + + /* If this was the last attached QP, remove the GID too. */ + if (list_empty(&mcast->qp_list)) { + rb_erase(&mcast->rb_node, &ibp->mcast_tree); + last = 1; + } + break; + } + + spin_unlock_irq(&ibp->lock); + + if (p) { + /* + * Wait for any list walkers to finish before freeing the + * list element. + */ + wait_event(mcast->wait, atomic_read(&mcast->refcount) <= 1); + qib_mcast_qp_free(p); + } + if (last) { + atomic_dec(&mcast->refcount); + wait_event(mcast->wait, !atomic_read(&mcast->refcount)); + qib_mcast_free(mcast); + spin_lock_irq(&dev->n_mcast_grps_lock); + dev->n_mcast_grps_allocated--; + spin_unlock_irq(&dev->n_mcast_grps_lock); + } + + ret = 0; + +bail: + return ret; +} + +int qib_mcast_tree_empty(struct qib_ibport *ibp) +{ + return ibp->mcast_tree.rb_node == NULL; +} diff --git a/drivers/infiniband/hw/qib/qib_wc_ppc64.c b/drivers/infiniband/hw/qib/qib_wc_ppc64.c new file mode 100644 index 0000000..673cf4c --- /dev/null +++ b/drivers/infiniband/hw/qib/qib_wc_ppc64.c @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/* + * This file is conditionally built on PowerPC only. Otherwise weak symbol + * versions of the functions exported from here are used. + */ + +#include "qib.h" + +/** + * qib_enable_wc - enable write combining for MMIO writes to the device + * @dd: qlogic_ib device + * + * Nothing to do on PowerPC, so just return without error. + */ +int qib_enable_wc(struct qib_devdata *dd) +{ + return 0; +} + +/** + * qib_unordered_wc - indicate whether write combining is unordered + * + * Because our performance depends on our ability to do write + * combining mmio writes in the most efficient way, we need to + * know if we are on a processor that may reorder stores when + * write combining. + */ +int qib_unordered_wc(void) +{ + return 1; +} diff --git a/drivers/infiniband/hw/qib/qib_wc_x86_64.c b/drivers/infiniband/hw/qib/qib_wc_x86_64.c new file mode 100644 index 0000000..1d7281c --- /dev/null +++ b/drivers/infiniband/hw/qib/qib_wc_x86_64.c @@ -0,0 +1,171 @@ +/* + * Copyright (c) 2012 Intel Corporation. All rights reserved. + * Copyright (c) 2006 - 2012 QLogic Corporation. All rights reserved. + * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/* + * This file is conditionally built on x86_64 only. Otherwise weak symbol + * versions of the functions exported from here are used. + */ + +#include +#include +#include + +#include "qib.h" + +/** + * qib_enable_wc - enable write combining for MMIO writes to the device + * @dd: qlogic_ib device + * + * This routine is x86_64-specific; it twiddles the CPU's MTRRs to enable + * write combining. + */ +int qib_enable_wc(struct qib_devdata *dd) +{ + int ret = 0; + u64 pioaddr, piolen; + unsigned bits; + const unsigned long addr = pci_resource_start(dd->pcidev, 0); + const size_t len = pci_resource_len(dd->pcidev, 0); + + /* + * Set the PIO buffers to be WCCOMB, so we get HT bursts to the + * chip. Linux (possibly the hardware) requires it to be on a power + * of 2 address matching the length (which has to be a power of 2). + * For rev1, that means the base address, for rev2, it will be just + * the PIO buffers themselves. + * For chips with two sets of buffers, the calculations are + * somewhat more complicated; we need to sum, and the piobufbase + * register has both offsets, 2K in low 32 bits, 4K in high 32 bits. + * The buffers are still packed, so a single range covers both. + */ + if (dd->piobcnt2k && dd->piobcnt4k) { + /* 2 sizes for chip */ + unsigned long pio2kbase, pio4kbase; + pio2kbase = dd->piobufbase & 0xffffffffUL; + pio4kbase = (dd->piobufbase >> 32) & 0xffffffffUL; + if (pio2kbase < pio4kbase) { + /* all current chips */ + pioaddr = addr + pio2kbase; + piolen = pio4kbase - pio2kbase + + dd->piobcnt4k * dd->align4k; + } else { + pioaddr = addr + pio4kbase; + piolen = pio2kbase - pio4kbase + + dd->piobcnt2k * dd->palign; + } + } else { /* single buffer size (2K, currently) */ + pioaddr = addr + dd->piobufbase; + piolen = dd->piobcnt2k * dd->palign + + dd->piobcnt4k * dd->align4k; + } + + for (bits = 0; !(piolen & (1ULL << bits)); bits++) + /* do nothing */ ; + + if (piolen != (1ULL << bits)) { + piolen >>= bits; + while (piolen >>= 1) + bits++; + piolen = 1ULL << (bits + 1); + } + if (pioaddr & (piolen - 1)) { + u64 atmp; + atmp = pioaddr & ~(piolen - 1); + if (atmp < addr || (atmp + piolen) > (addr + len)) { + qib_dev_err(dd, + "No way to align address/size (%llx/%llx), no WC mtrr\n", + (unsigned long long) atmp, + (unsigned long long) piolen << 1); + ret = -ENODEV; + } else { + pioaddr = atmp; + piolen <<= 1; + } + } + + if (!ret) { + int cookie; + + cookie = mtrr_add(pioaddr, piolen, MTRR_TYPE_WRCOMB, 0); + if (cookie < 0) { + { + qib_devinfo(dd->pcidev, + "mtrr_add() WC for PIO bufs failed (%d)\n", + cookie); + ret = -EINVAL; + } + } else { + dd->wc_cookie = cookie; + dd->wc_base = (unsigned long) pioaddr; + dd->wc_len = (unsigned long) piolen; + } + } + + return ret; +} + +/** + * qib_disable_wc - disable write combining for MMIO writes to the device + * @dd: qlogic_ib device + */ +void qib_disable_wc(struct qib_devdata *dd) +{ + if (dd->wc_cookie) { + int r; + + r = mtrr_del(dd->wc_cookie, dd->wc_base, + dd->wc_len); + if (r < 0) + qib_devinfo(dd->pcidev, + "mtrr_del(%lx, %lx, %lx) failed: %d\n", + dd->wc_cookie, dd->wc_base, + dd->wc_len, r); + dd->wc_cookie = 0; /* even on failure */ + } +} + +/** + * qib_unordered_wc - indicate whether write combining is ordered + * + * Because our performance depends on our ability to do write combining mmio + * writes in the most efficient way, we need to know if we are on an Intel + * or AMD x86_64 processor. AMD x86_64 processors flush WC buffers out in + * the order completed, and so no special flushing is required to get + * correct ordering. Intel processors, however, will flush write buffers + * out in "random" orders, and so explicit ordering is needed at times. + */ +int qib_unordered_wc(void) +{ + return boot_cpu_data.x86_vendor != X86_VENDOR_AMD; +} diff --git a/drivers/infiniband/hw/usnic/Kconfig b/drivers/infiniband/hw/usnic/Kconfig new file mode 100644 index 0000000..29ab11c --- /dev/null +++ b/drivers/infiniband/hw/usnic/Kconfig @@ -0,0 +1,10 @@ +config INFINIBAND_USNIC + tristate "Verbs support for Cisco VIC" + depends on NETDEVICES && ETHERNET && INET && PCI && INTEL_IOMMU + select ENIC + select NET_VENDOR_CISCO + select PCI_IOV + select INFINIBAND_USER_ACCESS + ---help--- + This is a low-level driver for Cisco's Virtual Interface + Cards (VICs), including the VIC 1240 and 1280 cards. diff --git a/drivers/infiniband/hw/usnic/Makefile b/drivers/infiniband/hw/usnic/Makefile new file mode 100644 index 0000000..99fb2db --- /dev/null +++ b/drivers/infiniband/hw/usnic/Makefile @@ -0,0 +1,15 @@ +ccflags-y := -Idrivers/net/ethernet/cisco/enic + +obj-$(CONFIG_INFINIBAND_USNIC)+= usnic_verbs.o + +usnic_verbs-y=\ +usnic_fwd.o \ +usnic_transport.o \ +usnic_uiom.o \ +usnic_uiom_interval_tree.o \ +usnic_vnic.o \ +usnic_ib_main.o \ +usnic_ib_qp_grp.o \ +usnic_ib_sysfs.o \ +usnic_ib_verbs.o \ +usnic_debugfs.o \ diff --git a/drivers/infiniband/hw/usnic/usnic.h b/drivers/infiniband/hw/usnic/usnic.h new file mode 100644 index 0000000..5be13d8 --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic.h @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef USNIC_H_ +#define USNIC_H_ + +#define DRV_NAME "usnic_verbs" + +#define PCI_DEVICE_ID_CISCO_VIC_USPACE_NIC 0x00cf /* User space NIC */ + +#define DRV_VERSION "1.0.3" +#define DRV_RELDATE "December 19, 2013" + +#endif /* USNIC_H_ */ diff --git a/drivers/infiniband/hw/usnic/usnic_abi.h b/drivers/infiniband/hw/usnic/usnic_abi.h new file mode 100644 index 0000000..04a6622 --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_abi.h @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + + +#ifndef USNIC_ABI_H +#define USNIC_ABI_H + +/* ABI between userspace and kernel */ +#define USNIC_UVERBS_ABI_VERSION 4 + +#define USNIC_QP_GRP_MAX_WQS 8 +#define USNIC_QP_GRP_MAX_RQS 8 +#define USNIC_QP_GRP_MAX_CQS 16 + +enum usnic_transport_type { + USNIC_TRANSPORT_UNKNOWN = 0, + USNIC_TRANSPORT_ROCE_CUSTOM = 1, + USNIC_TRANSPORT_IPV4_UDP = 2, + USNIC_TRANSPORT_MAX = 3, +}; + +struct usnic_transport_spec { + enum usnic_transport_type trans_type; + union { + struct { + uint16_t port_num; + } usnic_roce; + struct { + uint32_t sock_fd; + } udp; + }; +}; + +struct usnic_ib_create_qp_cmd { + struct usnic_transport_spec spec; +}; + +/*TODO: Future - usnic_modify_qp needs to pass in generic filters */ +struct usnic_ib_create_qp_resp { + u32 vfid; + u32 qp_grp_id; + u64 bar_bus_addr; + u32 bar_len; +/* + * WQ, RQ, CQ are explicity specified bc exposing a generic resources inteface + * expands the scope of ABI to many files. + */ + u32 wq_cnt; + u32 rq_cnt; + u32 cq_cnt; + u32 wq_idx[USNIC_QP_GRP_MAX_WQS]; + u32 rq_idx[USNIC_QP_GRP_MAX_RQS]; + u32 cq_idx[USNIC_QP_GRP_MAX_CQS]; + u32 transport; + u32 reserved[9]; +}; + +#endif /* USNIC_ABI_H */ diff --git a/drivers/infiniband/hw/usnic/usnic_common_pkt_hdr.h b/drivers/infiniband/hw/usnic/usnic_common_pkt_hdr.h new file mode 100644 index 0000000..3935672 --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_common_pkt_hdr.h @@ -0,0 +1,27 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef USNIC_CMN_PKT_HDR_H +#define USNIC_CMN_PKT_HDR_H + +#define USNIC_ROCE_ETHERTYPE (0x8915) +#define USNIC_ROCE_GRH_VER (8) +#define USNIC_PROTO_VER (1) +#define USNIC_ROCE_GRH_VER_SHIFT (4) + +#endif /* USNIC_COMMON_PKT_HDR_H */ diff --git a/drivers/infiniband/hw/usnic/usnic_common_util.h b/drivers/infiniband/hw/usnic/usnic_common_util.h new file mode 100644 index 0000000..9d737ed --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_common_util.h @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef USNIC_CMN_UTIL_H +#define USNIC_CMN_UTIL_H + +static inline void +usnic_mac_to_gid(const char *const mac, char *raw_gid) +{ + raw_gid[0] = 0xfe; + raw_gid[1] = 0x80; + memset(&raw_gid[2], 0, 6); + raw_gid[8] = mac[0]^2; + raw_gid[9] = mac[1]; + raw_gid[10] = mac[2]; + raw_gid[11] = 0xff; + raw_gid[12] = 0xfe; + raw_gid[13] = mac[3]; + raw_gid[14] = mac[4]; + raw_gid[15] = mac[5]; +} + +static inline void +usnic_mac_ip_to_gid(const char *const mac, const __be32 inaddr, char *raw_gid) +{ + raw_gid[0] = 0xfe; + raw_gid[1] = 0x80; + memset(&raw_gid[2], 0, 2); + memcpy(&raw_gid[4], &inaddr, 4); + raw_gid[8] = mac[0]^2; + raw_gid[9] = mac[1]; + raw_gid[10] = mac[2]; + raw_gid[11] = 0xff; + raw_gid[12] = 0xfe; + raw_gid[13] = mac[3]; + raw_gid[14] = mac[4]; + raw_gid[15] = mac[5]; +} + +static inline void +usnic_write_gid_if_id_from_mac(char *mac, char *raw_gid) +{ + raw_gid[8] = mac[0]^2; + raw_gid[9] = mac[1]; + raw_gid[10] = mac[2]; + raw_gid[11] = 0xff; + raw_gid[12] = 0xfe; + raw_gid[13] = mac[3]; + raw_gid[14] = mac[4]; + raw_gid[15] = mac[5]; +} + +#endif /* USNIC_COMMON_UTIL_H */ diff --git a/drivers/infiniband/hw/usnic/usnic_debugfs.c b/drivers/infiniband/hw/usnic/usnic_debugfs.c new file mode 100644 index 0000000..5d13860 --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_debugfs.c @@ -0,0 +1,154 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include +#include + +#include "usnic.h" +#include "usnic_log.h" +#include "usnic_debugfs.h" +#include "usnic_ib_qp_grp.h" +#include "usnic_transport.h" + +static struct dentry *debugfs_root; +static struct dentry *flows_dentry; + +static ssize_t usnic_debugfs_buildinfo_read(struct file *f, char __user *data, + size_t count, loff_t *ppos) +{ + char buf[500]; + int res; + + if (*ppos > 0) + return 0; + + res = scnprintf(buf, sizeof(buf), + "version: %s\n" + "build date: %s\n", + DRV_VERSION, DRV_RELDATE); + + return simple_read_from_buffer(data, count, ppos, buf, res); +} + +static const struct file_operations usnic_debugfs_buildinfo_ops = { + .owner = THIS_MODULE, + .open = simple_open, + .read = usnic_debugfs_buildinfo_read +}; + +static ssize_t flowinfo_read(struct file *f, char __user *data, + size_t count, loff_t *ppos) +{ + struct usnic_ib_qp_grp_flow *qp_flow; + int n; + int left; + char *ptr; + char buf[512]; + + qp_flow = f->private_data; + ptr = buf; + left = count; + + if (*ppos > 0) + return 0; + + spin_lock(&qp_flow->qp_grp->lock); + n = scnprintf(ptr, left, + "QP Grp ID: %d Transport: %s ", + qp_flow->qp_grp->grp_id, + usnic_transport_to_str(qp_flow->trans_type)); + UPDATE_PTR_LEFT(n, ptr, left); + if (qp_flow->trans_type == USNIC_TRANSPORT_ROCE_CUSTOM) { + n = scnprintf(ptr, left, "Port_Num:%hu\n", + qp_flow->usnic_roce.port_num); + UPDATE_PTR_LEFT(n, ptr, left); + } else if (qp_flow->trans_type == USNIC_TRANSPORT_IPV4_UDP) { + n = usnic_transport_sock_to_str(ptr, left, + qp_flow->udp.sock); + UPDATE_PTR_LEFT(n, ptr, left); + n = scnprintf(ptr, left, "\n"); + UPDATE_PTR_LEFT(n, ptr, left); + } + spin_unlock(&qp_flow->qp_grp->lock); + + return simple_read_from_buffer(data, count, ppos, buf, ptr - buf); +} + +static const struct file_operations flowinfo_ops = { + .owner = THIS_MODULE, + .open = simple_open, + .read = flowinfo_read, +}; + +void usnic_debugfs_init(void) +{ + debugfs_root = debugfs_create_dir(DRV_NAME, NULL); + if (IS_ERR(debugfs_root)) { + usnic_err("Failed to create debugfs root dir, check if debugfs is enabled in kernel configuration\n"); + goto out_clear_root; + } + + flows_dentry = debugfs_create_dir("flows", debugfs_root); + if (IS_ERR_OR_NULL(flows_dentry)) { + usnic_err("Failed to create debugfs flow dir with err %ld\n", + PTR_ERR(flows_dentry)); + goto out_free_root; + } + + debugfs_create_file("build-info", S_IRUGO, debugfs_root, + NULL, &usnic_debugfs_buildinfo_ops); + return; + +out_free_root: + debugfs_remove_recursive(debugfs_root); +out_clear_root: + debugfs_root = NULL; +} + +void usnic_debugfs_exit(void) +{ + if (!debugfs_root) + return; + + debugfs_remove_recursive(debugfs_root); + debugfs_root = NULL; +} + +void usnic_debugfs_flow_add(struct usnic_ib_qp_grp_flow *qp_flow) +{ + if (IS_ERR_OR_NULL(flows_dentry)) + return; + + scnprintf(qp_flow->dentry_name, sizeof(qp_flow->dentry_name), + "%u", qp_flow->flow->flow_id); + qp_flow->dbgfs_dentry = debugfs_create_file(qp_flow->dentry_name, + S_IRUGO, + flows_dentry, + qp_flow, + &flowinfo_ops); + if (IS_ERR_OR_NULL(qp_flow->dbgfs_dentry)) { + usnic_err("Failed to create dbg fs entry for flow %u\n", + qp_flow->flow->flow_id); + } +} + +void usnic_debugfs_flow_remove(struct usnic_ib_qp_grp_flow *qp_flow) +{ + if (!IS_ERR_OR_NULL(qp_flow->dbgfs_dentry)) + debugfs_remove(qp_flow->dbgfs_dentry); +} diff --git a/drivers/infiniband/hw/usnic/usnic_debugfs.h b/drivers/infiniband/hw/usnic/usnic_debugfs.h new file mode 100644 index 0000000..4087d24 --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_debugfs.h @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#ifndef USNIC_DEBUGFS_H_ +#define USNIC_DEBUGFS_H_ + +#include "usnic_ib_qp_grp.h" + +void usnic_debugfs_init(void); + +void usnic_debugfs_exit(void); +void usnic_debugfs_flow_add(struct usnic_ib_qp_grp_flow *qp_flow); +void usnic_debugfs_flow_remove(struct usnic_ib_qp_grp_flow *qp_flow); + +#endif /*!USNIC_DEBUGFS_H_ */ diff --git a/drivers/infiniband/hw/usnic/usnic_fwd.c b/drivers/infiniband/hw/usnic/usnic_fwd.c new file mode 100644 index 0000000..e3c9bd9 --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_fwd.c @@ -0,0 +1,350 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include +#include + +#include "enic_api.h" +#include "usnic_common_pkt_hdr.h" +#include "usnic_fwd.h" +#include "usnic_log.h" + +static int usnic_fwd_devcmd_locked(struct usnic_fwd_dev *ufdev, int vnic_idx, + enum vnic_devcmd_cmd cmd, u64 *a0, + u64 *a1) +{ + int status; + struct net_device *netdev = ufdev->netdev; + + lockdep_assert_held(&ufdev->lock); + + status = enic_api_devcmd_proxy_by_index(netdev, + vnic_idx, + cmd, + a0, a1, + 1000); + if (status) { + if (status == ERR_EINVAL && cmd == CMD_DEL_FILTER) { + usnic_dbg("Dev %s vnic idx %u cmd %u already deleted", + ufdev->name, vnic_idx, cmd); + } else { + usnic_err("Dev %s vnic idx %u cmd %u failed with status %d\n", + ufdev->name, vnic_idx, cmd, + status); + } + } else { + usnic_dbg("Dev %s vnic idx %u cmd %u success", + ufdev->name, vnic_idx, cmd); + } + + return status; +} + +static int usnic_fwd_devcmd(struct usnic_fwd_dev *ufdev, int vnic_idx, + enum vnic_devcmd_cmd cmd, u64 *a0, u64 *a1) +{ + int status; + + spin_lock(&ufdev->lock); + status = usnic_fwd_devcmd_locked(ufdev, vnic_idx, cmd, a0, a1); + spin_unlock(&ufdev->lock); + + return status; +} + +struct usnic_fwd_dev *usnic_fwd_dev_alloc(struct pci_dev *pdev) +{ + struct usnic_fwd_dev *ufdev; + + ufdev = kzalloc(sizeof(*ufdev), GFP_KERNEL); + if (!ufdev) + return NULL; + + ufdev->pdev = pdev; + ufdev->netdev = pci_get_drvdata(pdev); + spin_lock_init(&ufdev->lock); + strncpy(ufdev->name, netdev_name(ufdev->netdev), + sizeof(ufdev->name) - 1); + + return ufdev; +} + +void usnic_fwd_dev_free(struct usnic_fwd_dev *ufdev) +{ + kfree(ufdev); +} + +void usnic_fwd_set_mac(struct usnic_fwd_dev *ufdev, char mac[ETH_ALEN]) +{ + spin_lock(&ufdev->lock); + memcpy(&ufdev->mac, mac, sizeof(ufdev->mac)); + spin_unlock(&ufdev->lock); +} + +int usnic_fwd_add_ipaddr(struct usnic_fwd_dev *ufdev, __be32 inaddr) +{ + int status; + + spin_lock(&ufdev->lock); + if (ufdev->inaddr == 0) { + ufdev->inaddr = inaddr; + status = 0; + } else { + status = -EFAULT; + } + spin_unlock(&ufdev->lock); + + return status; +} + +void usnic_fwd_del_ipaddr(struct usnic_fwd_dev *ufdev) +{ + spin_lock(&ufdev->lock); + ufdev->inaddr = 0; + spin_unlock(&ufdev->lock); +} + +void usnic_fwd_carrier_up(struct usnic_fwd_dev *ufdev) +{ + spin_lock(&ufdev->lock); + ufdev->link_up = 1; + spin_unlock(&ufdev->lock); +} + +void usnic_fwd_carrier_down(struct usnic_fwd_dev *ufdev) +{ + spin_lock(&ufdev->lock); + ufdev->link_up = 0; + spin_unlock(&ufdev->lock); +} + +void usnic_fwd_set_mtu(struct usnic_fwd_dev *ufdev, unsigned int mtu) +{ + spin_lock(&ufdev->lock); + ufdev->mtu = mtu; + spin_unlock(&ufdev->lock); +} + +static int usnic_fwd_dev_ready_locked(struct usnic_fwd_dev *ufdev) +{ + lockdep_assert_held(&ufdev->lock); + + if (!ufdev->link_up) + return -EPERM; + + return 0; +} + +static int validate_filter_locked(struct usnic_fwd_dev *ufdev, + struct filter *filter) +{ + + lockdep_assert_held(&ufdev->lock); + + if (filter->type == FILTER_IPV4_5TUPLE) { + if (!(filter->u.ipv4.flags & FILTER_FIELD_5TUP_DST_AD)) + return -EACCES; + if (!(filter->u.ipv4.flags & FILTER_FIELD_5TUP_DST_PT)) + return -EBUSY; + else if (ufdev->inaddr == 0) + return -EINVAL; + else if (filter->u.ipv4.dst_port == 0) + return -ERANGE; + else if (ntohl(ufdev->inaddr) != filter->u.ipv4.dst_addr) + return -EFAULT; + else + return 0; + } + + return 0; +} + +static void fill_tlv(struct filter_tlv *tlv, struct filter *filter, + struct filter_action *action) +{ + tlv->type = CLSF_TLV_FILTER; + tlv->length = sizeof(struct filter); + *((struct filter *)&tlv->val) = *filter; + + tlv = (struct filter_tlv *)((char *)tlv + sizeof(struct filter_tlv) + + sizeof(struct filter)); + tlv->type = CLSF_TLV_ACTION; + tlv->length = sizeof(struct filter_action); + *((struct filter_action *)&tlv->val) = *action; +} + +struct usnic_fwd_flow* +usnic_fwd_alloc_flow(struct usnic_fwd_dev *ufdev, struct filter *filter, + struct usnic_filter_action *uaction) +{ + struct filter_tlv *tlv; + struct pci_dev *pdev; + struct usnic_fwd_flow *flow; + uint64_t a0, a1; + uint64_t tlv_size; + dma_addr_t tlv_pa; + int status; + + pdev = ufdev->pdev; + tlv_size = (2*sizeof(struct filter_tlv) + sizeof(struct filter) + + sizeof(struct filter_action)); + + flow = kzalloc(sizeof(*flow), GFP_ATOMIC); + if (!flow) + return ERR_PTR(-ENOMEM); + + tlv = pci_alloc_consistent(pdev, tlv_size, &tlv_pa); + if (!tlv) { + usnic_err("Failed to allocate memory\n"); + status = -ENOMEM; + goto out_free_flow; + } + + fill_tlv(tlv, filter, &uaction->action); + + spin_lock(&ufdev->lock); + status = usnic_fwd_dev_ready_locked(ufdev); + if (status) { + usnic_err("Forwarding dev %s not ready with status %d\n", + ufdev->name, status); + goto out_free_tlv; + } + + status = validate_filter_locked(ufdev, filter); + if (status) { + usnic_err("Failed to validate filter with status %d\n", + status); + goto out_free_tlv; + } + + /* Issue Devcmd */ + a0 = tlv_pa; + a1 = tlv_size; + status = usnic_fwd_devcmd_locked(ufdev, uaction->vnic_idx, + CMD_ADD_FILTER, &a0, &a1); + if (status) { + usnic_err("VF %s Filter add failed with status:%d", + ufdev->name, status); + status = -EFAULT; + goto out_free_tlv; + } else { + usnic_dbg("VF %s FILTER ID:%llu", ufdev->name, a0); + } + + flow->flow_id = (uint32_t) a0; + flow->vnic_idx = uaction->vnic_idx; + flow->ufdev = ufdev; + +out_free_tlv: + spin_unlock(&ufdev->lock); + pci_free_consistent(pdev, tlv_size, tlv, tlv_pa); + if (!status) + return flow; +out_free_flow: + kfree(flow); + return ERR_PTR(status); +} + +int usnic_fwd_dealloc_flow(struct usnic_fwd_flow *flow) +{ + int status; + u64 a0, a1; + + a0 = flow->flow_id; + + status = usnic_fwd_devcmd(flow->ufdev, flow->vnic_idx, + CMD_DEL_FILTER, &a0, &a1); + if (status) { + if (status == ERR_EINVAL) { + usnic_dbg("Filter %u already deleted for VF Idx %u pf: %s status: %d", + flow->flow_id, flow->vnic_idx, + flow->ufdev->name, status); + } else { + usnic_err("PF %s VF Idx %u Filter: %u FILTER DELETE failed with status %d", + flow->ufdev->name, flow->vnic_idx, + flow->flow_id, status); + } + status = 0; + /* + * Log the error and fake success to the caller because if + * a flow fails to be deleted in the firmware, it is an + * unrecoverable error. + */ + } else { + usnic_dbg("PF %s VF Idx %u Filter: %u FILTER DELETED", + flow->ufdev->name, flow->vnic_idx, + flow->flow_id); + } + + kfree(flow); + return status; +} + +int usnic_fwd_enable_qp(struct usnic_fwd_dev *ufdev, int vnic_idx, int qp_idx) +{ + int status; + struct net_device *pf_netdev; + u64 a0, a1; + + pf_netdev = ufdev->netdev; + a0 = qp_idx; + a1 = CMD_QP_RQWQ; + + status = usnic_fwd_devcmd(ufdev, vnic_idx, CMD_QP_ENABLE, + &a0, &a1); + if (status) { + usnic_err("PF %s VNIC Index %u RQ Index: %u ENABLE Failed with status %d", + netdev_name(pf_netdev), + vnic_idx, + qp_idx, + status); + } else { + usnic_dbg("PF %s VNIC Index %u RQ Index: %u ENABLED", + netdev_name(pf_netdev), + vnic_idx, qp_idx); + } + + return status; +} + +int usnic_fwd_disable_qp(struct usnic_fwd_dev *ufdev, int vnic_idx, int qp_idx) +{ + int status; + u64 a0, a1; + struct net_device *pf_netdev; + + pf_netdev = ufdev->netdev; + a0 = qp_idx; + a1 = CMD_QP_RQWQ; + + status = usnic_fwd_devcmd(ufdev, vnic_idx, CMD_QP_DISABLE, + &a0, &a1); + if (status) { + usnic_err("PF %s VNIC Index %u RQ Index: %u DISABLE Failed with status %d", + netdev_name(pf_netdev), + vnic_idx, + qp_idx, + status); + } else { + usnic_dbg("PF %s VNIC Index %u RQ Index: %u DISABLED", + netdev_name(pf_netdev), + vnic_idx, + qp_idx); + } + + return status; +} diff --git a/drivers/infiniband/hw/usnic/usnic_fwd.h b/drivers/infiniband/hw/usnic/usnic_fwd.h new file mode 100644 index 0000000..93713a2 --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_fwd.h @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef USNIC_FWD_H_ +#define USNIC_FWD_H_ + +#include +#include +#include +#include + +#include "usnic_abi.h" +#include "usnic_common_pkt_hdr.h" +#include "vnic_devcmd.h" + +struct usnic_fwd_dev { + struct pci_dev *pdev; + struct net_device *netdev; + spinlock_t lock; + /* + * The following fields can be read directly off the device. + * However, they should be set by a accessor function, except name, + * which cannot be changed. + */ + bool link_up; + char mac[ETH_ALEN]; + unsigned int mtu; + __be32 inaddr; + char name[IFNAMSIZ+1]; +}; + +struct usnic_fwd_flow { + uint32_t flow_id; + struct usnic_fwd_dev *ufdev; + unsigned int vnic_idx; +}; + +struct usnic_filter_action { + int vnic_idx; + struct filter_action action; +}; + +struct usnic_fwd_dev *usnic_fwd_dev_alloc(struct pci_dev *pdev); +void usnic_fwd_dev_free(struct usnic_fwd_dev *ufdev); + +void usnic_fwd_set_mac(struct usnic_fwd_dev *ufdev, char mac[ETH_ALEN]); +int usnic_fwd_add_ipaddr(struct usnic_fwd_dev *ufdev, __be32 inaddr); +void usnic_fwd_del_ipaddr(struct usnic_fwd_dev *ufdev); +void usnic_fwd_carrier_up(struct usnic_fwd_dev *ufdev); +void usnic_fwd_carrier_down(struct usnic_fwd_dev *ufdev); +void usnic_fwd_set_mtu(struct usnic_fwd_dev *ufdev, unsigned int mtu); + +/* + * Allocate a flow on this forwarding device. Whoever calls this function, + * must monitor netdev events on ufdev's netdevice. If NETDEV_REBOOT or + * NETDEV_DOWN is seen, flow will no longer function and must be + * immediately freed by calling usnic_dealloc_flow. + */ +struct usnic_fwd_flow* +usnic_fwd_alloc_flow(struct usnic_fwd_dev *ufdev, struct filter *filter, + struct usnic_filter_action *action); +int usnic_fwd_dealloc_flow(struct usnic_fwd_flow *flow); +int usnic_fwd_enable_qp(struct usnic_fwd_dev *ufdev, int vnic_idx, int qp_idx); +int usnic_fwd_disable_qp(struct usnic_fwd_dev *ufdev, int vnic_idx, int qp_idx); + +static inline void usnic_fwd_init_usnic_filter(struct filter *filter, + uint32_t usnic_id) +{ + filter->type = FILTER_USNIC_ID; + filter->u.usnic.ethtype = USNIC_ROCE_ETHERTYPE; + filter->u.usnic.flags = FILTER_FIELD_USNIC_ETHTYPE | + FILTER_FIELD_USNIC_ID | + FILTER_FIELD_USNIC_PROTO; + filter->u.usnic.proto_version = (USNIC_ROCE_GRH_VER << + USNIC_ROCE_GRH_VER_SHIFT) | + USNIC_PROTO_VER; + filter->u.usnic.usnic_id = usnic_id; +} + +static inline void usnic_fwd_init_udp_filter(struct filter *filter, + uint32_t daddr, uint16_t dport) +{ + filter->type = FILTER_IPV4_5TUPLE; + filter->u.ipv4.flags = FILTER_FIELD_5TUP_PROTO; + filter->u.ipv4.protocol = PROTO_UDP; + + if (daddr) { + filter->u.ipv4.flags |= FILTER_FIELD_5TUP_DST_AD; + filter->u.ipv4.dst_addr = daddr; + } + + if (dport) { + filter->u.ipv4.flags |= FILTER_FIELD_5TUP_DST_PT; + filter->u.ipv4.dst_port = dport; + } +} + +#endif /* !USNIC_FWD_H_ */ diff --git a/drivers/infiniband/hw/usnic/usnic_ib.h b/drivers/infiniband/hw/usnic/usnic_ib.h new file mode 100644 index 0000000..e5a9297 --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_ib.h @@ -0,0 +1,118 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef USNIC_IB_H_ +#define USNIC_IB_H_ + +#include +#include + +#include + + +#include "usnic.h" +#include "usnic_abi.h" +#include "usnic_vnic.h" + +#define USNIC_IB_PORT_CNT 1 +#define USNIC_IB_NUM_COMP_VECTORS 1 + +extern unsigned int usnic_ib_share_vf; + +struct usnic_ib_ucontext { + struct ib_ucontext ibucontext; + /* Protected by usnic_ib_dev->usdev_lock */ + struct list_head qp_grp_list; + struct list_head link; +}; + +struct usnic_ib_pd { + struct ib_pd ibpd; + struct usnic_uiom_pd *umem_pd; +}; + +struct usnic_ib_mr { + struct ib_mr ibmr; + struct usnic_uiom_reg *umem; +}; + +struct usnic_ib_dev { + struct ib_device ib_dev; + struct pci_dev *pdev; + struct net_device *netdev; + struct usnic_fwd_dev *ufdev; + struct list_head ib_dev_link; + struct list_head vf_dev_list; + struct list_head ctx_list; + struct mutex usdev_lock; + + /* provisioning information */ + struct kref vf_cnt; + unsigned int vf_res_cnt[USNIC_VNIC_RES_TYPE_MAX]; + + /* sysfs vars for QPN reporting */ + struct kobject *qpn_kobj; +}; + +struct usnic_ib_vf { + struct usnic_ib_dev *pf; + spinlock_t lock; + struct usnic_vnic *vnic; + unsigned int qp_grp_ref_cnt; + struct usnic_ib_pd *pd; + struct list_head link; +}; + +static inline +struct usnic_ib_dev *to_usdev(struct ib_device *ibdev) +{ + return container_of(ibdev, struct usnic_ib_dev, ib_dev); +} + +static inline +struct usnic_ib_ucontext *to_ucontext(struct ib_ucontext *ibucontext) +{ + return container_of(ibucontext, struct usnic_ib_ucontext, ibucontext); +} + +static inline +struct usnic_ib_pd *to_upd(struct ib_pd *ibpd) +{ + return container_of(ibpd, struct usnic_ib_pd, ibpd); +} + +static inline +struct usnic_ib_ucontext *to_uucontext(struct ib_ucontext *ibucontext) +{ + return container_of(ibucontext, struct usnic_ib_ucontext, ibucontext); +} + +static inline +struct usnic_ib_mr *to_umr(struct ib_mr *ibmr) +{ + return container_of(ibmr, struct usnic_ib_mr, ibmr); +} +void usnic_ib_log_vf(struct usnic_ib_vf *vf); + +#define UPDATE_PTR_LEFT(N, P, L) \ +do { \ + L -= (N); \ + P += (N); \ +} while (0) + +#endif /* USNIC_IB_H_ */ diff --git a/drivers/infiniband/hw/usnic/usnic_ib_main.c b/drivers/infiniband/hw/usnic/usnic_ib_main.c new file mode 100644 index 0000000..0d0f986 --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_ib_main.c @@ -0,0 +1,682 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Author: Upinder Malhi + * Author: Anant Deepak + * Author: Cesare Cantu' + * Author: Jeff Squyres + * Author: Kiran Thirumalai + * Author: Xuyang Wang + * Author: Reese Faucette + * + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "usnic_abi.h" +#include "usnic_common_util.h" +#include "usnic_ib.h" +#include "usnic_ib_qp_grp.h" +#include "usnic_log.h" +#include "usnic_fwd.h" +#include "usnic_debugfs.h" +#include "usnic_ib_verbs.h" +#include "usnic_transport.h" +#include "usnic_uiom.h" +#include "usnic_ib_sysfs.h" + +unsigned int usnic_log_lvl = USNIC_LOG_LVL_ERR; +unsigned int usnic_ib_share_vf = 1; + +static const char usnic_version[] = + DRV_NAME ": Cisco VIC (USNIC) Verbs Driver v" + DRV_VERSION " (" DRV_RELDATE ")\n"; + +static DEFINE_MUTEX(usnic_ib_ibdev_list_lock); +static LIST_HEAD(usnic_ib_ibdev_list); + +/* Callback dump funcs */ +static int usnic_ib_dump_vf_hdr(void *obj, char *buf, int buf_sz) +{ + struct usnic_ib_vf *vf = obj; + return scnprintf(buf, buf_sz, "PF: %s ", vf->pf->ib_dev.name); +} +/* End callback dump funcs */ + +static void usnic_ib_dump_vf(struct usnic_ib_vf *vf, char *buf, int buf_sz) +{ + usnic_vnic_dump(vf->vnic, buf, buf_sz, vf, + usnic_ib_dump_vf_hdr, + usnic_ib_qp_grp_dump_hdr, usnic_ib_qp_grp_dump_rows); +} + +void usnic_ib_log_vf(struct usnic_ib_vf *vf) +{ + char buf[1000]; + usnic_ib_dump_vf(vf, buf, sizeof(buf)); + usnic_dbg("%s\n", buf); +} + +/* Start of netdev section */ +static inline const char *usnic_ib_netdev_event_to_string(unsigned long event) +{ + const char *event2str[] = {"NETDEV_NONE", "NETDEV_UP", "NETDEV_DOWN", + "NETDEV_REBOOT", "NETDEV_CHANGE", + "NETDEV_REGISTER", "NETDEV_UNREGISTER", "NETDEV_CHANGEMTU", + "NETDEV_CHANGEADDR", "NETDEV_GOING_DOWN", "NETDEV_FEAT_CHANGE", + "NETDEV_BONDING_FAILOVER", "NETDEV_PRE_UP", + "NETDEV_PRE_TYPE_CHANGE", "NETDEV_POST_TYPE_CHANGE", + "NETDEV_POST_INT", "NETDEV_UNREGISTER_FINAL", "NETDEV_RELEASE", + "NETDEV_NOTIFY_PEERS", "NETDEV_JOIN" + }; + + if (event >= ARRAY_SIZE(event2str)) + return "UNKNOWN_NETDEV_EVENT"; + else + return event2str[event]; +} + +static void usnic_ib_qp_grp_modify_active_to_err(struct usnic_ib_dev *us_ibdev) +{ + struct usnic_ib_ucontext *ctx; + struct usnic_ib_qp_grp *qp_grp; + enum ib_qp_state cur_state; + int status; + + BUG_ON(!mutex_is_locked(&us_ibdev->usdev_lock)); + + list_for_each_entry(ctx, &us_ibdev->ctx_list, link) { + list_for_each_entry(qp_grp, &ctx->qp_grp_list, link) { + cur_state = qp_grp->state; + if (cur_state == IB_QPS_INIT || + cur_state == IB_QPS_RTR || + cur_state == IB_QPS_RTS) { + status = usnic_ib_qp_grp_modify(qp_grp, + IB_QPS_ERR, + NULL); + if (status) { + usnic_err("Failed to transistion qp grp %u from %s to %s\n", + qp_grp->grp_id, + usnic_ib_qp_grp_state_to_string + (cur_state), + usnic_ib_qp_grp_state_to_string + (IB_QPS_ERR)); + } + } + } + } +} + +static void usnic_ib_handle_usdev_event(struct usnic_ib_dev *us_ibdev, + unsigned long event) +{ + struct net_device *netdev; + struct ib_event ib_event; + + memset(&ib_event, 0, sizeof(ib_event)); + + mutex_lock(&us_ibdev->usdev_lock); + netdev = us_ibdev->netdev; + switch (event) { + case NETDEV_REBOOT: + usnic_info("PF Reset on %s\n", us_ibdev->ib_dev.name); + usnic_ib_qp_grp_modify_active_to_err(us_ibdev); + ib_event.event = IB_EVENT_PORT_ERR; + ib_event.device = &us_ibdev->ib_dev; + ib_event.element.port_num = 1; + ib_dispatch_event(&ib_event); + break; + case NETDEV_UP: + case NETDEV_DOWN: + case NETDEV_CHANGE: + if (!us_ibdev->ufdev->link_up && + netif_carrier_ok(netdev)) { + usnic_fwd_carrier_up(us_ibdev->ufdev); + usnic_info("Link UP on %s\n", us_ibdev->ib_dev.name); + ib_event.event = IB_EVENT_PORT_ACTIVE; + ib_event.device = &us_ibdev->ib_dev; + ib_event.element.port_num = 1; + ib_dispatch_event(&ib_event); + } else if (us_ibdev->ufdev->link_up && + !netif_carrier_ok(netdev)) { + usnic_fwd_carrier_down(us_ibdev->ufdev); + usnic_info("Link DOWN on %s\n", us_ibdev->ib_dev.name); + usnic_ib_qp_grp_modify_active_to_err(us_ibdev); + ib_event.event = IB_EVENT_PORT_ERR; + ib_event.device = &us_ibdev->ib_dev; + ib_event.element.port_num = 1; + ib_dispatch_event(&ib_event); + } else { + usnic_dbg("Ignoring %s on %s\n", + usnic_ib_netdev_event_to_string(event), + us_ibdev->ib_dev.name); + } + break; + case NETDEV_CHANGEADDR: + if (!memcmp(us_ibdev->ufdev->mac, netdev->dev_addr, + sizeof(us_ibdev->ufdev->mac))) { + usnic_dbg("Ignoring addr change on %s\n", + us_ibdev->ib_dev.name); + } else { + usnic_info(" %s old mac: %pM new mac: %pM\n", + us_ibdev->ib_dev.name, + us_ibdev->ufdev->mac, + netdev->dev_addr); + usnic_fwd_set_mac(us_ibdev->ufdev, netdev->dev_addr); + usnic_ib_qp_grp_modify_active_to_err(us_ibdev); + ib_event.event = IB_EVENT_GID_CHANGE; + ib_event.device = &us_ibdev->ib_dev; + ib_event.element.port_num = 1; + ib_dispatch_event(&ib_event); + } + + break; + case NETDEV_CHANGEMTU: + if (us_ibdev->ufdev->mtu != netdev->mtu) { + usnic_info("MTU Change on %s old: %u new: %u\n", + us_ibdev->ib_dev.name, + us_ibdev->ufdev->mtu, netdev->mtu); + usnic_fwd_set_mtu(us_ibdev->ufdev, netdev->mtu); + usnic_ib_qp_grp_modify_active_to_err(us_ibdev); + } else { + usnic_dbg("Ignoring MTU change on %s\n", + us_ibdev->ib_dev.name); + } + break; + default: + usnic_dbg("Ignoring event %s on %s", + usnic_ib_netdev_event_to_string(event), + us_ibdev->ib_dev.name); + } + mutex_unlock(&us_ibdev->usdev_lock); +} + +static int usnic_ib_netdevice_event(struct notifier_block *notifier, + unsigned long event, void *ptr) +{ + struct usnic_ib_dev *us_ibdev; + + struct net_device *netdev = netdev_notifier_info_to_dev(ptr); + + mutex_lock(&usnic_ib_ibdev_list_lock); + list_for_each_entry(us_ibdev, &usnic_ib_ibdev_list, ib_dev_link) { + if (us_ibdev->netdev == netdev) { + usnic_ib_handle_usdev_event(us_ibdev, event); + break; + } + } + mutex_unlock(&usnic_ib_ibdev_list_lock); + + return NOTIFY_DONE; +} + +static struct notifier_block usnic_ib_netdevice_notifier = { + .notifier_call = usnic_ib_netdevice_event +}; +/* End of netdev section */ + +/* Start of inet section */ +static int usnic_ib_handle_inet_event(struct usnic_ib_dev *us_ibdev, + unsigned long event, void *ptr) +{ + struct in_ifaddr *ifa = ptr; + struct ib_event ib_event; + + mutex_lock(&us_ibdev->usdev_lock); + + switch (event) { + case NETDEV_DOWN: + usnic_info("%s via ip notifiers", + usnic_ib_netdev_event_to_string(event)); + usnic_fwd_del_ipaddr(us_ibdev->ufdev); + usnic_ib_qp_grp_modify_active_to_err(us_ibdev); + ib_event.event = IB_EVENT_GID_CHANGE; + ib_event.device = &us_ibdev->ib_dev; + ib_event.element.port_num = 1; + ib_dispatch_event(&ib_event); + break; + case NETDEV_UP: + usnic_fwd_add_ipaddr(us_ibdev->ufdev, ifa->ifa_address); + usnic_info("%s via ip notifiers: ip %pI4", + usnic_ib_netdev_event_to_string(event), + &us_ibdev->ufdev->inaddr); + ib_event.event = IB_EVENT_GID_CHANGE; + ib_event.device = &us_ibdev->ib_dev; + ib_event.element.port_num = 1; + ib_dispatch_event(&ib_event); + break; + default: + usnic_info("Ignoring event %s on %s", + usnic_ib_netdev_event_to_string(event), + us_ibdev->ib_dev.name); + } + mutex_unlock(&us_ibdev->usdev_lock); + + return NOTIFY_DONE; +} + +static int usnic_ib_inetaddr_event(struct notifier_block *notifier, + unsigned long event, void *ptr) +{ + struct usnic_ib_dev *us_ibdev; + struct in_ifaddr *ifa = ptr; + struct net_device *netdev = ifa->ifa_dev->dev; + + mutex_lock(&usnic_ib_ibdev_list_lock); + list_for_each_entry(us_ibdev, &usnic_ib_ibdev_list, ib_dev_link) { + if (us_ibdev->netdev == netdev) { + usnic_ib_handle_inet_event(us_ibdev, event, ptr); + break; + } + } + mutex_unlock(&usnic_ib_ibdev_list_lock); + + return NOTIFY_DONE; +} +static struct notifier_block usnic_ib_inetaddr_notifier = { + .notifier_call = usnic_ib_inetaddr_event +}; +/* End of inet section*/ + +/* Start of PF discovery section */ +static void *usnic_ib_device_add(struct pci_dev *dev) +{ + struct usnic_ib_dev *us_ibdev; + union ib_gid gid; + struct in_ifaddr *in; + struct net_device *netdev; + + usnic_dbg("\n"); + netdev = pci_get_drvdata(dev); + + us_ibdev = (struct usnic_ib_dev *)ib_alloc_device(sizeof(*us_ibdev)); + if (IS_ERR_OR_NULL(us_ibdev)) { + usnic_err("Device %s context alloc failed\n", + netdev_name(pci_get_drvdata(dev))); + return ERR_PTR(us_ibdev ? PTR_ERR(us_ibdev) : -EFAULT); + } + + us_ibdev->ufdev = usnic_fwd_dev_alloc(dev); + if (IS_ERR_OR_NULL(us_ibdev->ufdev)) { + usnic_err("Failed to alloc ufdev for %s with err %ld\n", + pci_name(dev), PTR_ERR(us_ibdev->ufdev)); + goto err_dealloc; + } + + mutex_init(&us_ibdev->usdev_lock); + INIT_LIST_HEAD(&us_ibdev->vf_dev_list); + INIT_LIST_HEAD(&us_ibdev->ctx_list); + + us_ibdev->pdev = dev; + us_ibdev->netdev = pci_get_drvdata(dev); + us_ibdev->ib_dev.owner = THIS_MODULE; + us_ibdev->ib_dev.node_type = RDMA_NODE_USNIC_UDP; + us_ibdev->ib_dev.phys_port_cnt = USNIC_IB_PORT_CNT; + us_ibdev->ib_dev.num_comp_vectors = USNIC_IB_NUM_COMP_VECTORS; + us_ibdev->ib_dev.dma_device = &dev->dev; + us_ibdev->ib_dev.uverbs_abi_ver = USNIC_UVERBS_ABI_VERSION; + strlcpy(us_ibdev->ib_dev.name, "usnic_%d", IB_DEVICE_NAME_MAX); + + us_ibdev->ib_dev.uverbs_cmd_mask = + (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | + (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) | + (1ull << IB_USER_VERBS_CMD_QUERY_PORT) | + (1ull << IB_USER_VERBS_CMD_ALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_REG_MR) | + (1ull << IB_USER_VERBS_CMD_DEREG_MR) | + (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | + (1ull << IB_USER_VERBS_CMD_CREATE_CQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) | + (1ull << IB_USER_VERBS_CMD_CREATE_QP) | + (1ull << IB_USER_VERBS_CMD_MODIFY_QP) | + (1ull << IB_USER_VERBS_CMD_QUERY_QP) | + (1ull << IB_USER_VERBS_CMD_DESTROY_QP) | + (1ull << IB_USER_VERBS_CMD_ATTACH_MCAST) | + (1ull << IB_USER_VERBS_CMD_DETACH_MCAST) | + (1ull << IB_USER_VERBS_CMD_OPEN_QP); + + us_ibdev->ib_dev.query_device = usnic_ib_query_device; + us_ibdev->ib_dev.query_port = usnic_ib_query_port; + us_ibdev->ib_dev.query_pkey = usnic_ib_query_pkey; + us_ibdev->ib_dev.query_gid = usnic_ib_query_gid; + us_ibdev->ib_dev.get_link_layer = usnic_ib_port_link_layer; + us_ibdev->ib_dev.alloc_pd = usnic_ib_alloc_pd; + us_ibdev->ib_dev.dealloc_pd = usnic_ib_dealloc_pd; + us_ibdev->ib_dev.create_qp = usnic_ib_create_qp; + us_ibdev->ib_dev.modify_qp = usnic_ib_modify_qp; + us_ibdev->ib_dev.query_qp = usnic_ib_query_qp; + us_ibdev->ib_dev.destroy_qp = usnic_ib_destroy_qp; + us_ibdev->ib_dev.create_cq = usnic_ib_create_cq; + us_ibdev->ib_dev.destroy_cq = usnic_ib_destroy_cq; + us_ibdev->ib_dev.reg_user_mr = usnic_ib_reg_mr; + us_ibdev->ib_dev.dereg_mr = usnic_ib_dereg_mr; + us_ibdev->ib_dev.alloc_ucontext = usnic_ib_alloc_ucontext; + us_ibdev->ib_dev.dealloc_ucontext = usnic_ib_dealloc_ucontext; + us_ibdev->ib_dev.mmap = usnic_ib_mmap; + us_ibdev->ib_dev.create_ah = usnic_ib_create_ah; + us_ibdev->ib_dev.destroy_ah = usnic_ib_destroy_ah; + us_ibdev->ib_dev.post_send = usnic_ib_post_send; + us_ibdev->ib_dev.post_recv = usnic_ib_post_recv; + us_ibdev->ib_dev.poll_cq = usnic_ib_poll_cq; + us_ibdev->ib_dev.req_notify_cq = usnic_ib_req_notify_cq; + us_ibdev->ib_dev.get_dma_mr = usnic_ib_get_dma_mr; + + + if (ib_register_device(&us_ibdev->ib_dev, NULL)) + goto err_fwd_dealloc; + + usnic_fwd_set_mtu(us_ibdev->ufdev, us_ibdev->netdev->mtu); + usnic_fwd_set_mac(us_ibdev->ufdev, us_ibdev->netdev->dev_addr); + if (netif_carrier_ok(us_ibdev->netdev)) + usnic_fwd_carrier_up(us_ibdev->ufdev); + + in = ((struct in_device *)(netdev->ip_ptr))->ifa_list; + if (in != NULL) + usnic_fwd_add_ipaddr(us_ibdev->ufdev, in->ifa_address); + + usnic_mac_ip_to_gid(us_ibdev->netdev->perm_addr, + us_ibdev->ufdev->inaddr, &gid.raw[0]); + memcpy(&us_ibdev->ib_dev.node_guid, &gid.global.interface_id, + sizeof(gid.global.interface_id)); + kref_init(&us_ibdev->vf_cnt); + + usnic_info("Added ibdev: %s netdev: %s with mac %pM Link: %u MTU: %u\n", + us_ibdev->ib_dev.name, netdev_name(us_ibdev->netdev), + us_ibdev->ufdev->mac, us_ibdev->ufdev->link_up, + us_ibdev->ufdev->mtu); + return us_ibdev; + +err_fwd_dealloc: + usnic_fwd_dev_free(us_ibdev->ufdev); +err_dealloc: + usnic_err("failed -- deallocing device\n"); + ib_dealloc_device(&us_ibdev->ib_dev); + return NULL; +} + +static void usnic_ib_device_remove(struct usnic_ib_dev *us_ibdev) +{ + usnic_info("Unregistering %s\n", us_ibdev->ib_dev.name); + usnic_ib_sysfs_unregister_usdev(us_ibdev); + usnic_fwd_dev_free(us_ibdev->ufdev); + ib_unregister_device(&us_ibdev->ib_dev); + ib_dealloc_device(&us_ibdev->ib_dev); +} + +static void usnic_ib_undiscover_pf(struct kref *kref) +{ + struct usnic_ib_dev *us_ibdev, *tmp; + struct pci_dev *dev; + bool found = false; + + dev = container_of(kref, struct usnic_ib_dev, vf_cnt)->pdev; + mutex_lock(&usnic_ib_ibdev_list_lock); + list_for_each_entry_safe(us_ibdev, tmp, + &usnic_ib_ibdev_list, ib_dev_link) { + if (us_ibdev->pdev == dev) { + list_del(&us_ibdev->ib_dev_link); + usnic_ib_device_remove(us_ibdev); + found = true; + break; + } + } + + WARN(!found, "Failed to remove PF %s\n", pci_name(dev)); + + mutex_unlock(&usnic_ib_ibdev_list_lock); +} + +static struct usnic_ib_dev *usnic_ib_discover_pf(struct usnic_vnic *vnic) +{ + struct usnic_ib_dev *us_ibdev; + struct pci_dev *parent_pci, *vf_pci; + int err; + + vf_pci = usnic_vnic_get_pdev(vnic); + parent_pci = pci_physfn(vf_pci); + + BUG_ON(!parent_pci); + + mutex_lock(&usnic_ib_ibdev_list_lock); + list_for_each_entry(us_ibdev, &usnic_ib_ibdev_list, ib_dev_link) { + if (us_ibdev->pdev == parent_pci) { + kref_get(&us_ibdev->vf_cnt); + goto out; + } + } + + us_ibdev = usnic_ib_device_add(parent_pci); + if (IS_ERR_OR_NULL(us_ibdev)) { + us_ibdev = us_ibdev ? us_ibdev : ERR_PTR(-EFAULT); + goto out; + } + + err = usnic_ib_sysfs_register_usdev(us_ibdev); + if (err) { + usnic_ib_device_remove(us_ibdev); + us_ibdev = ERR_PTR(err); + goto out; + } + + list_add(&us_ibdev->ib_dev_link, &usnic_ib_ibdev_list); +out: + mutex_unlock(&usnic_ib_ibdev_list_lock); + return us_ibdev; +} +/* End of PF discovery section */ + +/* Start of PCI section */ + +static const struct pci_device_id usnic_ib_pci_ids[] = { + {PCI_DEVICE(PCI_VENDOR_ID_CISCO, PCI_DEVICE_ID_CISCO_VIC_USPACE_NIC)}, + {0,} +}; + +static int usnic_ib_pci_probe(struct pci_dev *pdev, + const struct pci_device_id *id) +{ + int err; + struct usnic_ib_dev *pf; + struct usnic_ib_vf *vf; + enum usnic_vnic_res_type res_type; + + vf = kzalloc(sizeof(*vf), GFP_KERNEL); + if (!vf) + return -ENOMEM; + + err = pci_enable_device(pdev); + if (err) { + usnic_err("Failed to enable %s with err %d\n", + pci_name(pdev), err); + goto out_clean_vf; + } + + err = pci_request_regions(pdev, DRV_NAME); + if (err) { + usnic_err("Failed to request region for %s with err %d\n", + pci_name(pdev), err); + goto out_disable_device; + } + + pci_set_master(pdev); + pci_set_drvdata(pdev, vf); + + vf->vnic = usnic_vnic_alloc(pdev); + if (IS_ERR_OR_NULL(vf->vnic)) { + err = vf->vnic ? PTR_ERR(vf->vnic) : -ENOMEM; + usnic_err("Failed to alloc vnic for %s with err %d\n", + pci_name(pdev), err); + goto out_release_regions; + } + + pf = usnic_ib_discover_pf(vf->vnic); + if (IS_ERR_OR_NULL(pf)) { + usnic_err("Failed to discover pf of vnic %s with err%ld\n", + pci_name(pdev), PTR_ERR(pf)); + err = pf ? PTR_ERR(pf) : -EFAULT; + goto out_clean_vnic; + } + + vf->pf = pf; + spin_lock_init(&vf->lock); + mutex_lock(&pf->usdev_lock); + list_add_tail(&vf->link, &pf->vf_dev_list); + /* + * Save max settings (will be same for each VF, easier to re-write than + * to say "if (!set) { set_values(); set=1; } + */ + for (res_type = USNIC_VNIC_RES_TYPE_EOL+1; + res_type < USNIC_VNIC_RES_TYPE_MAX; + res_type++) { + pf->vf_res_cnt[res_type] = usnic_vnic_res_cnt(vf->vnic, + res_type); + } + + mutex_unlock(&pf->usdev_lock); + + usnic_info("Registering usnic VF %s into PF %s\n", pci_name(pdev), + pf->ib_dev.name); + usnic_ib_log_vf(vf); + return 0; + +out_clean_vnic: + usnic_vnic_free(vf->vnic); +out_release_regions: + pci_set_drvdata(pdev, NULL); + pci_clear_master(pdev); + pci_release_regions(pdev); +out_disable_device: + pci_disable_device(pdev); +out_clean_vf: + kfree(vf); + return err; +} + +static void usnic_ib_pci_remove(struct pci_dev *pdev) +{ + struct usnic_ib_vf *vf = pci_get_drvdata(pdev); + struct usnic_ib_dev *pf = vf->pf; + + mutex_lock(&pf->usdev_lock); + list_del(&vf->link); + mutex_unlock(&pf->usdev_lock); + + kref_put(&pf->vf_cnt, usnic_ib_undiscover_pf); + usnic_vnic_free(vf->vnic); + pci_set_drvdata(pdev, NULL); + pci_clear_master(pdev); + pci_release_regions(pdev); + pci_disable_device(pdev); + kfree(vf); + + usnic_info("Removed VF %s\n", pci_name(pdev)); +} + +/* PCI driver entry points */ +static struct pci_driver usnic_ib_pci_driver = { + .name = DRV_NAME, + .id_table = usnic_ib_pci_ids, + .probe = usnic_ib_pci_probe, + .remove = usnic_ib_pci_remove, +}; +/* End of PCI section */ + +/* Start of module section */ +static int __init usnic_ib_init(void) +{ + int err; + + printk_once(KERN_INFO "%s", usnic_version); + + err = usnic_uiom_init(DRV_NAME); + if (err) { + usnic_err("Unable to initalize umem with err %d\n", err); + return err; + } + + if (pci_register_driver(&usnic_ib_pci_driver)) { + usnic_err("Unable to register with PCI\n"); + goto out_umem_fini; + } + + err = register_netdevice_notifier(&usnic_ib_netdevice_notifier); + if (err) { + usnic_err("Failed to register netdev notifier\n"); + goto out_pci_unreg; + } + + err = register_inetaddr_notifier(&usnic_ib_inetaddr_notifier); + if (err) { + usnic_err("Failed to register inet addr notifier\n"); + goto out_unreg_netdev_notifier; + } + + err = usnic_transport_init(); + if (err) { + usnic_err("Failed to initialize transport\n"); + goto out_unreg_inetaddr_notifier; + } + + usnic_debugfs_init(); + + return 0; + +out_unreg_inetaddr_notifier: + unregister_inetaddr_notifier(&usnic_ib_inetaddr_notifier); +out_unreg_netdev_notifier: + unregister_netdevice_notifier(&usnic_ib_netdevice_notifier); +out_pci_unreg: + pci_unregister_driver(&usnic_ib_pci_driver); +out_umem_fini: + usnic_uiom_fini(); + + return err; +} + +static void __exit usnic_ib_destroy(void) +{ + usnic_dbg("\n"); + usnic_debugfs_exit(); + usnic_transport_fini(); + unregister_inetaddr_notifier(&usnic_ib_inetaddr_notifier); + unregister_netdevice_notifier(&usnic_ib_netdevice_notifier); + pci_unregister_driver(&usnic_ib_pci_driver); + usnic_uiom_fini(); +} + +MODULE_DESCRIPTION("Cisco VIC (usNIC) Verbs Driver"); +MODULE_AUTHOR("Upinder Malhi "); +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_VERSION(DRV_VERSION); +module_param(usnic_log_lvl, uint, S_IRUGO | S_IWUSR); +module_param(usnic_ib_share_vf, uint, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(usnic_log_lvl, " Off=0, Err=1, Info=2, Debug=3"); +MODULE_PARM_DESC(usnic_ib_share_vf, "Off=0, On=1 VF sharing amongst QPs"); +MODULE_DEVICE_TABLE(pci, usnic_ib_pci_ids); + +module_init(usnic_ib_init); +module_exit(usnic_ib_destroy); +/* End of module section */ diff --git a/drivers/infiniband/hw/usnic/usnic_ib_qp_grp.c b/drivers/infiniband/hw/usnic/usnic_ib_qp_grp.c new file mode 100644 index 0000000..db3588d --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_ib_qp_grp.c @@ -0,0 +1,761 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include +#include +#include +#include + +#include "usnic_log.h" +#include "usnic_vnic.h" +#include "usnic_fwd.h" +#include "usnic_uiom.h" +#include "usnic_debugfs.h" +#include "usnic_ib_qp_grp.h" +#include "usnic_ib_sysfs.h" +#include "usnic_transport.h" + +#define DFLT_RQ_IDX 0 + +const char *usnic_ib_qp_grp_state_to_string(enum ib_qp_state state) +{ + switch (state) { + case IB_QPS_RESET: + return "Rst"; + case IB_QPS_INIT: + return "Init"; + case IB_QPS_RTR: + return "RTR"; + case IB_QPS_RTS: + return "RTS"; + case IB_QPS_SQD: + return "SQD"; + case IB_QPS_SQE: + return "SQE"; + case IB_QPS_ERR: + return "ERR"; + default: + return "UNKOWN STATE"; + + } +} + +int usnic_ib_qp_grp_dump_hdr(char *buf, int buf_sz) +{ + return scnprintf(buf, buf_sz, "|QPN\t|State\t|PID\t|VF Idx\t|Fil ID"); +} + +int usnic_ib_qp_grp_dump_rows(void *obj, char *buf, int buf_sz) +{ + struct usnic_ib_qp_grp *qp_grp = obj; + struct usnic_ib_qp_grp_flow *default_flow; + if (obj) { + default_flow = list_first_entry(&qp_grp->flows_lst, + struct usnic_ib_qp_grp_flow, link); + return scnprintf(buf, buf_sz, "|%d\t|%s\t|%d\t|%hu\t|%d", + qp_grp->ibqp.qp_num, + usnic_ib_qp_grp_state_to_string( + qp_grp->state), + qp_grp->owner_pid, + usnic_vnic_get_index(qp_grp->vf->vnic), + default_flow->flow->flow_id); + } else { + return scnprintf(buf, buf_sz, "|N/A\t|N/A\t|N/A\t|N/A\t|N/A"); + } +} + +static struct usnic_vnic_res_chunk * +get_qp_res_chunk(struct usnic_ib_qp_grp *qp_grp) +{ + lockdep_assert_held(&qp_grp->lock); + /* + * The QP res chunk, used to derive qp indices, + * are just indices of the RQs + */ + return usnic_ib_qp_grp_get_chunk(qp_grp, USNIC_VNIC_RES_TYPE_RQ); +} + +static int enable_qp_grp(struct usnic_ib_qp_grp *qp_grp) +{ + + int status; + int i, vnic_idx; + struct usnic_vnic_res_chunk *res_chunk; + struct usnic_vnic_res *res; + + lockdep_assert_held(&qp_grp->lock); + + vnic_idx = usnic_vnic_get_index(qp_grp->vf->vnic); + + res_chunk = get_qp_res_chunk(qp_grp); + if (IS_ERR_OR_NULL(res_chunk)) { + usnic_err("Unable to get qp res with err %ld\n", + PTR_ERR(res_chunk)); + return res_chunk ? PTR_ERR(res_chunk) : -ENOMEM; + } + + for (i = 0; i < res_chunk->cnt; i++) { + res = res_chunk->res[i]; + status = usnic_fwd_enable_qp(qp_grp->ufdev, vnic_idx, + res->vnic_idx); + if (status) { + usnic_err("Failed to enable qp %d of %s:%d\n with err %d\n", + res->vnic_idx, qp_grp->ufdev->name, + vnic_idx, status); + goto out_err; + } + } + + return 0; + +out_err: + for (i--; i >= 0; i--) { + res = res_chunk->res[i]; + usnic_fwd_disable_qp(qp_grp->ufdev, vnic_idx, + res->vnic_idx); + } + + return status; +} + +static int disable_qp_grp(struct usnic_ib_qp_grp *qp_grp) +{ + int i, vnic_idx; + struct usnic_vnic_res_chunk *res_chunk; + struct usnic_vnic_res *res; + int status = 0; + + lockdep_assert_held(&qp_grp->lock); + vnic_idx = usnic_vnic_get_index(qp_grp->vf->vnic); + + res_chunk = get_qp_res_chunk(qp_grp); + if (IS_ERR_OR_NULL(res_chunk)) { + usnic_err("Unable to get qp res with err %ld\n", + PTR_ERR(res_chunk)); + return res_chunk ? PTR_ERR(res_chunk) : -ENOMEM; + } + + for (i = 0; i < res_chunk->cnt; i++) { + res = res_chunk->res[i]; + status = usnic_fwd_disable_qp(qp_grp->ufdev, vnic_idx, + res->vnic_idx); + if (status) { + usnic_err("Failed to disable rq %d of %s:%d\n with err %d\n", + res->vnic_idx, + qp_grp->ufdev->name, + vnic_idx, status); + } + } + + return status; + +} + +static int init_filter_action(struct usnic_ib_qp_grp *qp_grp, + struct usnic_filter_action *uaction) +{ + struct usnic_vnic_res_chunk *res_chunk; + + res_chunk = usnic_ib_qp_grp_get_chunk(qp_grp, USNIC_VNIC_RES_TYPE_RQ); + if (IS_ERR_OR_NULL(res_chunk)) { + usnic_err("Unable to get %s with err %ld\n", + usnic_vnic_res_type_to_str(USNIC_VNIC_RES_TYPE_RQ), + PTR_ERR(res_chunk)); + return res_chunk ? PTR_ERR(res_chunk) : -ENOMEM; + } + + uaction->vnic_idx = usnic_vnic_get_index(qp_grp->vf->vnic); + uaction->action.type = FILTER_ACTION_RQ_STEERING; + uaction->action.u.rq_idx = res_chunk->res[DFLT_RQ_IDX]->vnic_idx; + + return 0; +} + +static struct usnic_ib_qp_grp_flow* +create_roce_custom_flow(struct usnic_ib_qp_grp *qp_grp, + struct usnic_transport_spec *trans_spec) +{ + uint16_t port_num; + int err; + struct filter filter; + struct usnic_filter_action uaction; + struct usnic_ib_qp_grp_flow *qp_flow; + struct usnic_fwd_flow *flow; + enum usnic_transport_type trans_type; + + trans_type = trans_spec->trans_type; + port_num = trans_spec->usnic_roce.port_num; + + /* Reserve Port */ + port_num = usnic_transport_rsrv_port(trans_type, port_num); + if (port_num == 0) + return ERR_PTR(-EINVAL); + + /* Create Flow */ + usnic_fwd_init_usnic_filter(&filter, port_num); + err = init_filter_action(qp_grp, &uaction); + if (err) + goto out_unreserve_port; + + flow = usnic_fwd_alloc_flow(qp_grp->ufdev, &filter, &uaction); + if (IS_ERR_OR_NULL(flow)) { + usnic_err("Unable to alloc flow failed with err %ld\n", + PTR_ERR(flow)); + err = flow ? PTR_ERR(flow) : -EFAULT; + goto out_unreserve_port; + } + + /* Create Flow Handle */ + qp_flow = kzalloc(sizeof(*qp_flow), GFP_ATOMIC); + if (IS_ERR_OR_NULL(qp_flow)) { + err = qp_flow ? PTR_ERR(qp_flow) : -ENOMEM; + goto out_dealloc_flow; + } + qp_flow->flow = flow; + qp_flow->trans_type = trans_type; + qp_flow->usnic_roce.port_num = port_num; + qp_flow->qp_grp = qp_grp; + return qp_flow; + +out_dealloc_flow: + usnic_fwd_dealloc_flow(flow); +out_unreserve_port: + usnic_transport_unrsrv_port(trans_type, port_num); + return ERR_PTR(err); +} + +static void release_roce_custom_flow(struct usnic_ib_qp_grp_flow *qp_flow) +{ + usnic_fwd_dealloc_flow(qp_flow->flow); + usnic_transport_unrsrv_port(qp_flow->trans_type, + qp_flow->usnic_roce.port_num); + kfree(qp_flow); +} + +static struct usnic_ib_qp_grp_flow* +create_udp_flow(struct usnic_ib_qp_grp *qp_grp, + struct usnic_transport_spec *trans_spec) +{ + struct socket *sock; + int sock_fd; + int err; + struct filter filter; + struct usnic_filter_action uaction; + struct usnic_ib_qp_grp_flow *qp_flow; + struct usnic_fwd_flow *flow; + enum usnic_transport_type trans_type; + uint32_t addr; + uint16_t port_num; + int proto; + + trans_type = trans_spec->trans_type; + sock_fd = trans_spec->udp.sock_fd; + + /* Get and check socket */ + sock = usnic_transport_get_socket(sock_fd); + if (IS_ERR_OR_NULL(sock)) + return ERR_CAST(sock); + + err = usnic_transport_sock_get_addr(sock, &proto, &addr, &port_num); + if (err) + goto out_put_sock; + + if (proto != IPPROTO_UDP) { + usnic_err("Protocol for fd %d is not UDP", sock_fd); + err = -EPERM; + goto out_put_sock; + } + + /* Create flow */ + usnic_fwd_init_udp_filter(&filter, addr, port_num); + err = init_filter_action(qp_grp, &uaction); + if (err) + goto out_put_sock; + + flow = usnic_fwd_alloc_flow(qp_grp->ufdev, &filter, &uaction); + if (IS_ERR_OR_NULL(flow)) { + usnic_err("Unable to alloc flow failed with err %ld\n", + PTR_ERR(flow)); + err = flow ? PTR_ERR(flow) : -EFAULT; + goto out_put_sock; + } + + /* Create qp_flow */ + qp_flow = kzalloc(sizeof(*qp_flow), GFP_ATOMIC); + if (IS_ERR_OR_NULL(qp_flow)) { + err = qp_flow ? PTR_ERR(qp_flow) : -ENOMEM; + goto out_dealloc_flow; + } + qp_flow->flow = flow; + qp_flow->trans_type = trans_type; + qp_flow->udp.sock = sock; + qp_flow->qp_grp = qp_grp; + return qp_flow; + +out_dealloc_flow: + usnic_fwd_dealloc_flow(flow); +out_put_sock: + usnic_transport_put_socket(sock); + return ERR_PTR(err); +} + +static void release_udp_flow(struct usnic_ib_qp_grp_flow *qp_flow) +{ + usnic_fwd_dealloc_flow(qp_flow->flow); + usnic_transport_put_socket(qp_flow->udp.sock); + kfree(qp_flow); +} + +static struct usnic_ib_qp_grp_flow* +create_and_add_flow(struct usnic_ib_qp_grp *qp_grp, + struct usnic_transport_spec *trans_spec) +{ + struct usnic_ib_qp_grp_flow *qp_flow; + enum usnic_transport_type trans_type; + + trans_type = trans_spec->trans_type; + switch (trans_type) { + case USNIC_TRANSPORT_ROCE_CUSTOM: + qp_flow = create_roce_custom_flow(qp_grp, trans_spec); + break; + case USNIC_TRANSPORT_IPV4_UDP: + qp_flow = create_udp_flow(qp_grp, trans_spec); + break; + default: + usnic_err("Unsupported transport %u\n", + trans_spec->trans_type); + return ERR_PTR(-EINVAL); + } + + if (!IS_ERR_OR_NULL(qp_flow)) { + list_add_tail(&qp_flow->link, &qp_grp->flows_lst); + usnic_debugfs_flow_add(qp_flow); + } + + + return qp_flow; +} + +static void release_and_remove_flow(struct usnic_ib_qp_grp_flow *qp_flow) +{ + usnic_debugfs_flow_remove(qp_flow); + list_del(&qp_flow->link); + + switch (qp_flow->trans_type) { + case USNIC_TRANSPORT_ROCE_CUSTOM: + release_roce_custom_flow(qp_flow); + break; + case USNIC_TRANSPORT_IPV4_UDP: + release_udp_flow(qp_flow); + break; + default: + WARN(1, "Unsupported transport %u\n", + qp_flow->trans_type); + break; + } +} + +static void release_and_remove_all_flows(struct usnic_ib_qp_grp *qp_grp) +{ + struct usnic_ib_qp_grp_flow *qp_flow, *tmp; + list_for_each_entry_safe(qp_flow, tmp, &qp_grp->flows_lst, link) + release_and_remove_flow(qp_flow); +} + +int usnic_ib_qp_grp_modify(struct usnic_ib_qp_grp *qp_grp, + enum ib_qp_state new_state, + void *data) +{ + int status = 0; + int vnic_idx; + struct ib_event ib_event; + enum ib_qp_state old_state; + struct usnic_transport_spec *trans_spec; + struct usnic_ib_qp_grp_flow *qp_flow; + + old_state = qp_grp->state; + vnic_idx = usnic_vnic_get_index(qp_grp->vf->vnic); + trans_spec = (struct usnic_transport_spec *) data; + + spin_lock(&qp_grp->lock); + switch (new_state) { + case IB_QPS_RESET: + switch (old_state) { + case IB_QPS_RESET: + /* NO-OP */ + break; + case IB_QPS_INIT: + release_and_remove_all_flows(qp_grp); + status = 0; + break; + case IB_QPS_RTR: + case IB_QPS_RTS: + case IB_QPS_ERR: + status = disable_qp_grp(qp_grp); + release_and_remove_all_flows(qp_grp); + break; + default: + status = -EINVAL; + } + break; + case IB_QPS_INIT: + switch (old_state) { + case IB_QPS_RESET: + if (trans_spec) { + qp_flow = create_and_add_flow(qp_grp, + trans_spec); + if (IS_ERR_OR_NULL(qp_flow)) { + status = qp_flow ? PTR_ERR(qp_flow) : -EFAULT; + break; + } + } else { + /* + * Optional to specify filters. + */ + status = 0; + } + break; + case IB_QPS_INIT: + if (trans_spec) { + qp_flow = create_and_add_flow(qp_grp, + trans_spec); + if (IS_ERR_OR_NULL(qp_flow)) { + status = qp_flow ? PTR_ERR(qp_flow) : -EFAULT; + break; + } + } else { + /* + * Doesn't make sense to go into INIT state + * from INIT state w/o adding filters. + */ + status = -EINVAL; + } + break; + case IB_QPS_RTR: + status = disable_qp_grp(qp_grp); + break; + case IB_QPS_RTS: + status = disable_qp_grp(qp_grp); + break; + default: + status = -EINVAL; + } + break; + case IB_QPS_RTR: + switch (old_state) { + case IB_QPS_INIT: + status = enable_qp_grp(qp_grp); + break; + default: + status = -EINVAL; + } + break; + case IB_QPS_RTS: + switch (old_state) { + case IB_QPS_RTR: + /* NO-OP FOR NOW */ + break; + default: + status = -EINVAL; + } + break; + case IB_QPS_ERR: + ib_event.device = &qp_grp->vf->pf->ib_dev; + ib_event.element.qp = &qp_grp->ibqp; + ib_event.event = IB_EVENT_QP_FATAL; + + switch (old_state) { + case IB_QPS_RESET: + qp_grp->ibqp.event_handler(&ib_event, + qp_grp->ibqp.qp_context); + break; + case IB_QPS_INIT: + release_and_remove_all_flows(qp_grp); + qp_grp->ibqp.event_handler(&ib_event, + qp_grp->ibqp.qp_context); + break; + case IB_QPS_RTR: + case IB_QPS_RTS: + status = disable_qp_grp(qp_grp); + release_and_remove_all_flows(qp_grp); + qp_grp->ibqp.event_handler(&ib_event, + qp_grp->ibqp.qp_context); + break; + default: + status = -EINVAL; + } + break; + default: + status = -EINVAL; + } + spin_unlock(&qp_grp->lock); + + if (!status) { + qp_grp->state = new_state; + usnic_info("Transistioned %u from %s to %s", + qp_grp->grp_id, + usnic_ib_qp_grp_state_to_string(old_state), + usnic_ib_qp_grp_state_to_string(new_state)); + } else { + usnic_err("Failed to transition %u from %s to %s", + qp_grp->grp_id, + usnic_ib_qp_grp_state_to_string(old_state), + usnic_ib_qp_grp_state_to_string(new_state)); + } + + return status; +} + +static struct usnic_vnic_res_chunk** +alloc_res_chunk_list(struct usnic_vnic *vnic, + struct usnic_vnic_res_spec *res_spec, void *owner_obj) +{ + enum usnic_vnic_res_type res_type; + struct usnic_vnic_res_chunk **res_chunk_list; + int err, i, res_cnt, res_lst_sz; + + for (res_lst_sz = 0; + res_spec->resources[res_lst_sz].type != USNIC_VNIC_RES_TYPE_EOL; + res_lst_sz++) { + /* Do Nothing */ + } + + res_chunk_list = kzalloc(sizeof(*res_chunk_list)*(res_lst_sz+1), + GFP_ATOMIC); + if (!res_chunk_list) + return ERR_PTR(-ENOMEM); + + for (i = 0; res_spec->resources[i].type != USNIC_VNIC_RES_TYPE_EOL; + i++) { + res_type = res_spec->resources[i].type; + res_cnt = res_spec->resources[i].cnt; + + res_chunk_list[i] = usnic_vnic_get_resources(vnic, res_type, + res_cnt, owner_obj); + if (IS_ERR_OR_NULL(res_chunk_list[i])) { + err = res_chunk_list[i] ? + PTR_ERR(res_chunk_list[i]) : -ENOMEM; + usnic_err("Failed to get %s from %s with err %d\n", + usnic_vnic_res_type_to_str(res_type), + usnic_vnic_pci_name(vnic), + err); + goto out_free_res; + } + } + + return res_chunk_list; + +out_free_res: + for (i--; i > 0; i--) + usnic_vnic_put_resources(res_chunk_list[i]); + kfree(res_chunk_list); + return ERR_PTR(err); +} + +static void free_qp_grp_res(struct usnic_vnic_res_chunk **res_chunk_list) +{ + int i; + for (i = 0; res_chunk_list[i]; i++) + usnic_vnic_put_resources(res_chunk_list[i]); + kfree(res_chunk_list); +} + +static int qp_grp_and_vf_bind(struct usnic_ib_vf *vf, + struct usnic_ib_pd *pd, + struct usnic_ib_qp_grp *qp_grp) +{ + int err; + struct pci_dev *pdev; + + lockdep_assert_held(&vf->lock); + + pdev = usnic_vnic_get_pdev(vf->vnic); + if (vf->qp_grp_ref_cnt == 0) { + err = usnic_uiom_attach_dev_to_pd(pd->umem_pd, &pdev->dev); + if (err) { + usnic_err("Failed to attach %s to domain\n", + pci_name(pdev)); + return err; + } + vf->pd = pd; + } + vf->qp_grp_ref_cnt++; + + WARN_ON(vf->pd != pd); + qp_grp->vf = vf; + + return 0; +} + +static void qp_grp_and_vf_unbind(struct usnic_ib_qp_grp *qp_grp) +{ + struct pci_dev *pdev; + struct usnic_ib_pd *pd; + + lockdep_assert_held(&qp_grp->vf->lock); + + pd = qp_grp->vf->pd; + pdev = usnic_vnic_get_pdev(qp_grp->vf->vnic); + if (--qp_grp->vf->qp_grp_ref_cnt == 0) { + qp_grp->vf->pd = NULL; + usnic_uiom_detach_dev_from_pd(pd->umem_pd, &pdev->dev); + } + qp_grp->vf = NULL; +} + +static void log_spec(struct usnic_vnic_res_spec *res_spec) +{ + char buf[512]; + usnic_vnic_spec_dump(buf, sizeof(buf), res_spec); + usnic_dbg("%s\n", buf); +} + +static int qp_grp_id_from_flow(struct usnic_ib_qp_grp_flow *qp_flow, + uint32_t *id) +{ + enum usnic_transport_type trans_type = qp_flow->trans_type; + int err; + uint16_t port_num = 0; + + switch (trans_type) { + case USNIC_TRANSPORT_ROCE_CUSTOM: + *id = qp_flow->usnic_roce.port_num; + break; + case USNIC_TRANSPORT_IPV4_UDP: + err = usnic_transport_sock_get_addr(qp_flow->udp.sock, + NULL, NULL, + &port_num); + if (err) + return err; + /* + * Copy port_num to stack first and then to *id, + * so that the short to int cast works for little + * and big endian systems. + */ + *id = port_num; + break; + default: + usnic_err("Unsupported transport %u\n", trans_type); + return -EINVAL; + } + + return 0; +} + +struct usnic_ib_qp_grp * +usnic_ib_qp_grp_create(struct usnic_fwd_dev *ufdev, struct usnic_ib_vf *vf, + struct usnic_ib_pd *pd, + struct usnic_vnic_res_spec *res_spec, + struct usnic_transport_spec *transport_spec) +{ + struct usnic_ib_qp_grp *qp_grp; + int err; + enum usnic_transport_type transport = transport_spec->trans_type; + struct usnic_ib_qp_grp_flow *qp_flow; + + lockdep_assert_held(&vf->lock); + + err = usnic_vnic_res_spec_satisfied(&min_transport_spec[transport], + res_spec); + if (err) { + usnic_err("Spec does not meet miniumum req for transport %d\n", + transport); + log_spec(res_spec); + return ERR_PTR(err); + } + + qp_grp = kzalloc(sizeof(*qp_grp), GFP_ATOMIC); + if (!qp_grp) { + usnic_err("Unable to alloc qp_grp - Out of memory\n"); + return NULL; + } + + qp_grp->res_chunk_list = alloc_res_chunk_list(vf->vnic, res_spec, + qp_grp); + if (IS_ERR_OR_NULL(qp_grp->res_chunk_list)) { + err = qp_grp->res_chunk_list ? + PTR_ERR(qp_grp->res_chunk_list) : -ENOMEM; + usnic_err("Unable to alloc res for %d with err %d\n", + qp_grp->grp_id, err); + goto out_free_qp_grp; + } + + err = qp_grp_and_vf_bind(vf, pd, qp_grp); + if (err) + goto out_free_res; + + INIT_LIST_HEAD(&qp_grp->flows_lst); + spin_lock_init(&qp_grp->lock); + qp_grp->ufdev = ufdev; + qp_grp->state = IB_QPS_RESET; + qp_grp->owner_pid = current->pid; + + qp_flow = create_and_add_flow(qp_grp, transport_spec); + if (IS_ERR_OR_NULL(qp_flow)) { + usnic_err("Unable to create and add flow with err %ld\n", + PTR_ERR(qp_flow)); + err = qp_flow ? PTR_ERR(qp_flow) : -EFAULT; + goto out_qp_grp_vf_unbind; + } + + err = qp_grp_id_from_flow(qp_flow, &qp_grp->grp_id); + if (err) + goto out_release_flow; + qp_grp->ibqp.qp_num = qp_grp->grp_id; + + usnic_ib_sysfs_qpn_add(qp_grp); + + return qp_grp; + +out_release_flow: + release_and_remove_flow(qp_flow); +out_qp_grp_vf_unbind: + qp_grp_and_vf_unbind(qp_grp); +out_free_res: + free_qp_grp_res(qp_grp->res_chunk_list); +out_free_qp_grp: + kfree(qp_grp); + + return ERR_PTR(err); +} + +void usnic_ib_qp_grp_destroy(struct usnic_ib_qp_grp *qp_grp) +{ + + WARN_ON(qp_grp->state != IB_QPS_RESET); + lockdep_assert_held(&qp_grp->vf->lock); + + release_and_remove_all_flows(qp_grp); + usnic_ib_sysfs_qpn_remove(qp_grp); + qp_grp_and_vf_unbind(qp_grp); + free_qp_grp_res(qp_grp->res_chunk_list); + kfree(qp_grp); +} + +struct usnic_vnic_res_chunk* +usnic_ib_qp_grp_get_chunk(struct usnic_ib_qp_grp *qp_grp, + enum usnic_vnic_res_type res_type) +{ + int i; + + for (i = 0; qp_grp->res_chunk_list[i]; i++) { + if (qp_grp->res_chunk_list[i]->type == res_type) + return qp_grp->res_chunk_list[i]; + } + + return ERR_PTR(-EINVAL); +} diff --git a/drivers/infiniband/hw/usnic/usnic_ib_qp_grp.h b/drivers/infiniband/hw/usnic/usnic_ib_qp_grp.h new file mode 100644 index 0000000..b0aafe8 --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_ib_qp_grp.h @@ -0,0 +1,117 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef USNIC_IB_QP_GRP_H_ +#define USNIC_IB_QP_GRP_H_ + +#include +#include + +#include "usnic_ib.h" +#include "usnic_abi.h" +#include "usnic_fwd.h" +#include "usnic_vnic.h" + +/* + * The qp group struct represents all the hw resources needed to present a ib_qp + */ +struct usnic_ib_qp_grp { + struct ib_qp ibqp; + enum ib_qp_state state; + int grp_id; + + struct usnic_fwd_dev *ufdev; + struct usnic_ib_ucontext *ctx; + struct list_head flows_lst; + + struct usnic_vnic_res_chunk **res_chunk_list; + + pid_t owner_pid; + struct usnic_ib_vf *vf; + struct list_head link; + + spinlock_t lock; + + struct kobject kobj; +}; + +struct usnic_ib_qp_grp_flow { + struct usnic_fwd_flow *flow; + enum usnic_transport_type trans_type; + union { + struct { + uint16_t port_num; + } usnic_roce; + struct { + struct socket *sock; + } udp; + }; + struct usnic_ib_qp_grp *qp_grp; + struct list_head link; + + /* Debug FS */ + struct dentry *dbgfs_dentry; + char dentry_name[32]; +}; + +static const struct +usnic_vnic_res_spec min_transport_spec[USNIC_TRANSPORT_MAX] = { + { /*USNIC_TRANSPORT_UNKNOWN*/ + .resources = { + {.type = USNIC_VNIC_RES_TYPE_EOL, .cnt = 0,}, + }, + }, + { /*USNIC_TRANSPORT_ROCE_CUSTOM*/ + .resources = { + {.type = USNIC_VNIC_RES_TYPE_WQ, .cnt = 1,}, + {.type = USNIC_VNIC_RES_TYPE_RQ, .cnt = 1,}, + {.type = USNIC_VNIC_RES_TYPE_CQ, .cnt = 1,}, + {.type = USNIC_VNIC_RES_TYPE_EOL, .cnt = 0,}, + }, + }, + { /*USNIC_TRANSPORT_IPV4_UDP*/ + .resources = { + {.type = USNIC_VNIC_RES_TYPE_WQ, .cnt = 1,}, + {.type = USNIC_VNIC_RES_TYPE_RQ, .cnt = 1,}, + {.type = USNIC_VNIC_RES_TYPE_CQ, .cnt = 1,}, + {.type = USNIC_VNIC_RES_TYPE_EOL, .cnt = 0,}, + }, + }, +}; + +const char *usnic_ib_qp_grp_state_to_string(enum ib_qp_state state); +int usnic_ib_qp_grp_dump_hdr(char *buf, int buf_sz); +int usnic_ib_qp_grp_dump_rows(void *obj, char *buf, int buf_sz); +struct usnic_ib_qp_grp * +usnic_ib_qp_grp_create(struct usnic_fwd_dev *ufdev, struct usnic_ib_vf *vf, + struct usnic_ib_pd *pd, + struct usnic_vnic_res_spec *res_spec, + struct usnic_transport_spec *trans_spec); +void usnic_ib_qp_grp_destroy(struct usnic_ib_qp_grp *qp_grp); +int usnic_ib_qp_grp_modify(struct usnic_ib_qp_grp *qp_grp, + enum ib_qp_state new_state, + void *data); +struct usnic_vnic_res_chunk +*usnic_ib_qp_grp_get_chunk(struct usnic_ib_qp_grp *qp_grp, + enum usnic_vnic_res_type type); +static inline +struct usnic_ib_qp_grp *to_uqp_grp(struct ib_qp *ibqp) +{ + return container_of(ibqp, struct usnic_ib_qp_grp, ibqp); +} +#endif /* USNIC_IB_QP_GRP_H_ */ diff --git a/drivers/infiniband/hw/usnic/usnic_ib_sysfs.c b/drivers/infiniband/hw/usnic/usnic_ib_sysfs.c new file mode 100644 index 0000000..27dc67c --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_ib_sysfs.c @@ -0,0 +1,341 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include +#include +#include + +#include +#include + +#include "usnic_common_util.h" +#include "usnic_ib.h" +#include "usnic_ib_qp_grp.h" +#include "usnic_vnic.h" +#include "usnic_ib_verbs.h" +#include "usnic_log.h" + +static ssize_t usnic_ib_show_fw_ver(struct device *device, + struct device_attribute *attr, + char *buf) +{ + struct usnic_ib_dev *us_ibdev = + container_of(device, struct usnic_ib_dev, ib_dev.dev); + struct ethtool_drvinfo info; + + mutex_lock(&us_ibdev->usdev_lock); + us_ibdev->netdev->ethtool_ops->get_drvinfo(us_ibdev->netdev, &info); + mutex_unlock(&us_ibdev->usdev_lock); + + return scnprintf(buf, PAGE_SIZE, "%s\n", info.fw_version); +} + +static ssize_t usnic_ib_show_board(struct device *device, + struct device_attribute *attr, + char *buf) +{ + struct usnic_ib_dev *us_ibdev = + container_of(device, struct usnic_ib_dev, ib_dev.dev); + unsigned short subsystem_device_id; + + mutex_lock(&us_ibdev->usdev_lock); + subsystem_device_id = us_ibdev->pdev->subsystem_device; + mutex_unlock(&us_ibdev->usdev_lock); + + return scnprintf(buf, PAGE_SIZE, "%hu\n", subsystem_device_id); +} + +/* + * Report the configuration for this PF + */ +static ssize_t +usnic_ib_show_config(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct usnic_ib_dev *us_ibdev; + char *ptr; + unsigned left; + unsigned n; + enum usnic_vnic_res_type res_type; + + us_ibdev = container_of(device, struct usnic_ib_dev, ib_dev.dev); + + /* Buffer space limit is 1 page */ + ptr = buf; + left = PAGE_SIZE; + + mutex_lock(&us_ibdev->usdev_lock); + if (atomic_read(&us_ibdev->vf_cnt.refcount) > 0) { + char *busname; + + /* + * bus name seems to come with annoying prefix. + * Remove it if it is predictable + */ + busname = us_ibdev->pdev->bus->name; + if (strncmp(busname, "PCI Bus ", 8) == 0) + busname += 8; + + n = scnprintf(ptr, left, + "%s: %s:%d.%d, %s, %pM, %u VFs\n Per VF:", + us_ibdev->ib_dev.name, + busname, + PCI_SLOT(us_ibdev->pdev->devfn), + PCI_FUNC(us_ibdev->pdev->devfn), + netdev_name(us_ibdev->netdev), + us_ibdev->ufdev->mac, + atomic_read(&us_ibdev->vf_cnt.refcount)); + UPDATE_PTR_LEFT(n, ptr, left); + + for (res_type = USNIC_VNIC_RES_TYPE_EOL; + res_type < USNIC_VNIC_RES_TYPE_MAX; + res_type++) { + if (us_ibdev->vf_res_cnt[res_type] == 0) + continue; + n = scnprintf(ptr, left, " %d %s%s", + us_ibdev->vf_res_cnt[res_type], + usnic_vnic_res_type_to_str(res_type), + (res_type < (USNIC_VNIC_RES_TYPE_MAX - 1)) ? + "," : ""); + UPDATE_PTR_LEFT(n, ptr, left); + } + n = scnprintf(ptr, left, "\n"); + UPDATE_PTR_LEFT(n, ptr, left); + } else { + n = scnprintf(ptr, left, "%s: no VFs\n", + us_ibdev->ib_dev.name); + UPDATE_PTR_LEFT(n, ptr, left); + } + mutex_unlock(&us_ibdev->usdev_lock); + + return ptr - buf; +} + +static ssize_t +usnic_ib_show_iface(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct usnic_ib_dev *us_ibdev; + + us_ibdev = container_of(device, struct usnic_ib_dev, ib_dev.dev); + + return scnprintf(buf, PAGE_SIZE, "%s\n", + netdev_name(us_ibdev->netdev)); +} + +static ssize_t +usnic_ib_show_max_vf(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct usnic_ib_dev *us_ibdev; + + us_ibdev = container_of(device, struct usnic_ib_dev, ib_dev.dev); + + return scnprintf(buf, PAGE_SIZE, "%u\n", + atomic_read(&us_ibdev->vf_cnt.refcount)); +} + +static ssize_t +usnic_ib_show_qp_per_vf(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct usnic_ib_dev *us_ibdev; + int qp_per_vf; + + us_ibdev = container_of(device, struct usnic_ib_dev, ib_dev.dev); + qp_per_vf = max(us_ibdev->vf_res_cnt[USNIC_VNIC_RES_TYPE_WQ], + us_ibdev->vf_res_cnt[USNIC_VNIC_RES_TYPE_RQ]); + + return scnprintf(buf, PAGE_SIZE, + "%d\n", qp_per_vf); +} + +static ssize_t +usnic_ib_show_cq_per_vf(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct usnic_ib_dev *us_ibdev; + + us_ibdev = container_of(device, struct usnic_ib_dev, ib_dev.dev); + + return scnprintf(buf, PAGE_SIZE, "%d\n", + us_ibdev->vf_res_cnt[USNIC_VNIC_RES_TYPE_CQ]); +} + +static DEVICE_ATTR(fw_ver, S_IRUGO, usnic_ib_show_fw_ver, NULL); +static DEVICE_ATTR(board_id, S_IRUGO, usnic_ib_show_board, NULL); +static DEVICE_ATTR(config, S_IRUGO, usnic_ib_show_config, NULL); +static DEVICE_ATTR(iface, S_IRUGO, usnic_ib_show_iface, NULL); +static DEVICE_ATTR(max_vf, S_IRUGO, usnic_ib_show_max_vf, NULL); +static DEVICE_ATTR(qp_per_vf, S_IRUGO, usnic_ib_show_qp_per_vf, NULL); +static DEVICE_ATTR(cq_per_vf, S_IRUGO, usnic_ib_show_cq_per_vf, NULL); + +static struct device_attribute *usnic_class_attributes[] = { + &dev_attr_fw_ver, + &dev_attr_board_id, + &dev_attr_config, + &dev_attr_iface, + &dev_attr_max_vf, + &dev_attr_qp_per_vf, + &dev_attr_cq_per_vf, +}; + +struct qpn_attribute { + struct attribute attr; + ssize_t (*show)(struct usnic_ib_qp_grp *, char *buf); +}; + +/* + * Definitions for supporting QPN entries in sysfs + */ +static ssize_t +usnic_ib_qpn_attr_show(struct kobject *kobj, struct attribute *attr, char *buf) +{ + struct usnic_ib_qp_grp *qp_grp; + struct qpn_attribute *qpn_attr; + + qp_grp = container_of(kobj, struct usnic_ib_qp_grp, kobj); + qpn_attr = container_of(attr, struct qpn_attribute, attr); + + return qpn_attr->show(qp_grp, buf); +} + +static const struct sysfs_ops usnic_ib_qpn_sysfs_ops = { + .show = usnic_ib_qpn_attr_show +}; + +#define QPN_ATTR_RO(NAME) \ +struct qpn_attribute qpn_attr_##NAME = __ATTR_RO(NAME) + +static ssize_t context_show(struct usnic_ib_qp_grp *qp_grp, char *buf) +{ + return scnprintf(buf, PAGE_SIZE, "0x%p\n", qp_grp->ctx); +} + +static ssize_t summary_show(struct usnic_ib_qp_grp *qp_grp, char *buf) +{ + int i, j, n; + int left; + char *ptr; + struct usnic_vnic_res_chunk *res_chunk; + struct usnic_vnic_res *vnic_res; + + left = PAGE_SIZE; + ptr = buf; + + n = scnprintf(ptr, left, + "QPN: %d State: (%s) PID: %u VF Idx: %hu ", + qp_grp->ibqp.qp_num, + usnic_ib_qp_grp_state_to_string(qp_grp->state), + qp_grp->owner_pid, + usnic_vnic_get_index(qp_grp->vf->vnic)); + UPDATE_PTR_LEFT(n, ptr, left); + + for (i = 0; qp_grp->res_chunk_list[i]; i++) { + res_chunk = qp_grp->res_chunk_list[i]; + for (j = 0; j < res_chunk->cnt; j++) { + vnic_res = res_chunk->res[j]; + n = scnprintf(ptr, left, "%s[%d] ", + usnic_vnic_res_type_to_str(vnic_res->type), + vnic_res->vnic_idx); + UPDATE_PTR_LEFT(n, ptr, left); + } + } + + n = scnprintf(ptr, left, "\n"); + UPDATE_PTR_LEFT(n, ptr, left); + + return ptr - buf; +} + +static QPN_ATTR_RO(context); +static QPN_ATTR_RO(summary); + +static struct attribute *usnic_ib_qpn_default_attrs[] = { + &qpn_attr_context.attr, + &qpn_attr_summary.attr, + NULL +}; + +static struct kobj_type usnic_ib_qpn_type = { + .sysfs_ops = &usnic_ib_qpn_sysfs_ops, + .default_attrs = usnic_ib_qpn_default_attrs +}; + +int usnic_ib_sysfs_register_usdev(struct usnic_ib_dev *us_ibdev) +{ + int i; + int err; + for (i = 0; i < ARRAY_SIZE(usnic_class_attributes); ++i) { + err = device_create_file(&us_ibdev->ib_dev.dev, + usnic_class_attributes[i]); + if (err) { + usnic_err("Failed to create device file %d for %s eith err %d", + i, us_ibdev->ib_dev.name, err); + return -EINVAL; + } + } + + /* create kernel object for looking at individual QPs */ + kobject_get(&us_ibdev->ib_dev.dev.kobj); + us_ibdev->qpn_kobj = kobject_create_and_add("qpn", + &us_ibdev->ib_dev.dev.kobj); + if (us_ibdev->qpn_kobj == NULL) { + kobject_put(&us_ibdev->ib_dev.dev.kobj); + return -ENOMEM; + } + + return 0; +} + +void usnic_ib_sysfs_unregister_usdev(struct usnic_ib_dev *us_ibdev) +{ + int i; + for (i = 0; i < ARRAY_SIZE(usnic_class_attributes); ++i) { + device_remove_file(&us_ibdev->ib_dev.dev, + usnic_class_attributes[i]); + } + + kobject_put(us_ibdev->qpn_kobj); +} + +void usnic_ib_sysfs_qpn_add(struct usnic_ib_qp_grp *qp_grp) +{ + struct usnic_ib_dev *us_ibdev; + int err; + + us_ibdev = qp_grp->vf->pf; + + err = kobject_init_and_add(&qp_grp->kobj, &usnic_ib_qpn_type, + kobject_get(us_ibdev->qpn_kobj), + "%d", qp_grp->grp_id); + if (err) { + kobject_put(us_ibdev->qpn_kobj); + return; + } +} + +void usnic_ib_sysfs_qpn_remove(struct usnic_ib_qp_grp *qp_grp) +{ + struct usnic_ib_dev *us_ibdev; + + us_ibdev = qp_grp->vf->pf; + + kobject_put(&qp_grp->kobj); + kobject_put(us_ibdev->qpn_kobj); +} diff --git a/drivers/infiniband/hw/usnic/usnic_ib_sysfs.h b/drivers/infiniband/hw/usnic/usnic_ib_sysfs.h new file mode 100644 index 0000000..0d09b49 --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_ib_sysfs.h @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef USNIC_IB_SYSFS_H_ +#define USNIC_IB_SYSFS_H_ + +#include "usnic_ib.h" + +int usnic_ib_sysfs_register_usdev(struct usnic_ib_dev *us_ibdev); +void usnic_ib_sysfs_unregister_usdev(struct usnic_ib_dev *us_ibdev); +void usnic_ib_sysfs_qpn_add(struct usnic_ib_qp_grp *qp_grp); +void usnic_ib_sysfs_qpn_remove(struct usnic_ib_qp_grp *qp_grp); + +#endif /* !USNIC_IB_SYSFS_H_ */ diff --git a/drivers/infiniband/hw/usnic/usnic_ib_verbs.c b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c new file mode 100644 index 0000000..53bd6a2 --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c @@ -0,0 +1,768 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include +#include +#include +#include + +#include +#include + +#include "usnic_abi.h" +#include "usnic_ib.h" +#include "usnic_common_util.h" +#include "usnic_ib_qp_grp.h" +#include "usnic_fwd.h" +#include "usnic_log.h" +#include "usnic_uiom.h" +#include "usnic_transport.h" + +#define USNIC_DEFAULT_TRANSPORT USNIC_TRANSPORT_ROCE_CUSTOM + +static void usnic_ib_fw_string_to_u64(char *fw_ver_str, u64 *fw_ver) +{ + *fw_ver = (u64) *fw_ver_str; +} + +static int usnic_ib_fill_create_qp_resp(struct usnic_ib_qp_grp *qp_grp, + struct ib_udata *udata) +{ + struct usnic_ib_dev *us_ibdev; + struct usnic_ib_create_qp_resp resp; + struct pci_dev *pdev; + struct vnic_dev_bar *bar; + struct usnic_vnic_res_chunk *chunk; + struct usnic_ib_qp_grp_flow *default_flow; + int i, err; + + memset(&resp, 0, sizeof(resp)); + + us_ibdev = qp_grp->vf->pf; + pdev = usnic_vnic_get_pdev(qp_grp->vf->vnic); + if (!pdev) { + usnic_err("Failed to get pdev of qp_grp %d\n", + qp_grp->grp_id); + return -EFAULT; + } + + bar = usnic_vnic_get_bar(qp_grp->vf->vnic, 0); + if (!bar) { + usnic_err("Failed to get bar0 of qp_grp %d vf %s", + qp_grp->grp_id, pci_name(pdev)); + return -EFAULT; + } + + resp.vfid = usnic_vnic_get_index(qp_grp->vf->vnic); + resp.bar_bus_addr = bar->bus_addr; + resp.bar_len = bar->len; + + chunk = usnic_ib_qp_grp_get_chunk(qp_grp, USNIC_VNIC_RES_TYPE_RQ); + if (IS_ERR_OR_NULL(chunk)) { + usnic_err("Failed to get chunk %s for qp_grp %d with err %ld\n", + usnic_vnic_res_type_to_str(USNIC_VNIC_RES_TYPE_RQ), + qp_grp->grp_id, + PTR_ERR(chunk)); + return chunk ? PTR_ERR(chunk) : -ENOMEM; + } + + WARN_ON(chunk->type != USNIC_VNIC_RES_TYPE_RQ); + resp.rq_cnt = chunk->cnt; + for (i = 0; i < chunk->cnt; i++) + resp.rq_idx[i] = chunk->res[i]->vnic_idx; + + chunk = usnic_ib_qp_grp_get_chunk(qp_grp, USNIC_VNIC_RES_TYPE_WQ); + if (IS_ERR_OR_NULL(chunk)) { + usnic_err("Failed to get chunk %s for qp_grp %d with err %ld\n", + usnic_vnic_res_type_to_str(USNIC_VNIC_RES_TYPE_WQ), + qp_grp->grp_id, + PTR_ERR(chunk)); + return chunk ? PTR_ERR(chunk) : -ENOMEM; + } + + WARN_ON(chunk->type != USNIC_VNIC_RES_TYPE_WQ); + resp.wq_cnt = chunk->cnt; + for (i = 0; i < chunk->cnt; i++) + resp.wq_idx[i] = chunk->res[i]->vnic_idx; + + chunk = usnic_ib_qp_grp_get_chunk(qp_grp, USNIC_VNIC_RES_TYPE_CQ); + if (IS_ERR_OR_NULL(chunk)) { + usnic_err("Failed to get chunk %s for qp_grp %d with err %ld\n", + usnic_vnic_res_type_to_str(USNIC_VNIC_RES_TYPE_CQ), + qp_grp->grp_id, + PTR_ERR(chunk)); + return chunk ? PTR_ERR(chunk) : -ENOMEM; + } + + WARN_ON(chunk->type != USNIC_VNIC_RES_TYPE_CQ); + resp.cq_cnt = chunk->cnt; + for (i = 0; i < chunk->cnt; i++) + resp.cq_idx[i] = chunk->res[i]->vnic_idx; + + default_flow = list_first_entry(&qp_grp->flows_lst, + struct usnic_ib_qp_grp_flow, link); + resp.transport = default_flow->trans_type; + + err = ib_copy_to_udata(udata, &resp, sizeof(resp)); + if (err) { + usnic_err("Failed to copy udata for %s", us_ibdev->ib_dev.name); + return err; + } + + return 0; +} + +static struct usnic_ib_qp_grp* +find_free_vf_and_create_qp_grp(struct usnic_ib_dev *us_ibdev, + struct usnic_ib_pd *pd, + struct usnic_transport_spec *trans_spec, + struct usnic_vnic_res_spec *res_spec) +{ + struct usnic_ib_vf *vf; + struct usnic_vnic *vnic; + struct usnic_ib_qp_grp *qp_grp; + struct device *dev, **dev_list; + int i, found = 0; + + BUG_ON(!mutex_is_locked(&us_ibdev->usdev_lock)); + + if (list_empty(&us_ibdev->vf_dev_list)) { + usnic_info("No vfs to allocate\n"); + return NULL; + } + + if (usnic_ib_share_vf) { + /* Try to find resouces on a used vf which is in pd */ + dev_list = usnic_uiom_get_dev_list(pd->umem_pd); + for (i = 0; dev_list[i]; i++) { + dev = dev_list[i]; + vf = pci_get_drvdata(to_pci_dev(dev)); + spin_lock(&vf->lock); + vnic = vf->vnic; + if (!usnic_vnic_check_room(vnic, res_spec)) { + usnic_dbg("Found used vnic %s from %s\n", + us_ibdev->ib_dev.name, + pci_name(usnic_vnic_get_pdev( + vnic))); + found = 1; + break; + } + spin_unlock(&vf->lock); + + } + usnic_uiom_free_dev_list(dev_list); + } + + if (!found) { + /* Try to find resources on an unused vf */ + list_for_each_entry(vf, &us_ibdev->vf_dev_list, link) { + spin_lock(&vf->lock); + vnic = vf->vnic; + if (vf->qp_grp_ref_cnt == 0 && + usnic_vnic_check_room(vnic, res_spec) == 0) { + found = 1; + break; + } + spin_unlock(&vf->lock); + } + } + + if (!found) { + usnic_info("No free qp grp found on %s\n", + us_ibdev->ib_dev.name); + return ERR_PTR(-ENOMEM); + } + + qp_grp = usnic_ib_qp_grp_create(us_ibdev->ufdev, vf, pd, res_spec, + trans_spec); + spin_unlock(&vf->lock); + if (IS_ERR_OR_NULL(qp_grp)) { + usnic_err("Failed to allocate qp_grp\n"); + return ERR_PTR(qp_grp ? PTR_ERR(qp_grp) : -ENOMEM); + } + + return qp_grp; +} + +static void qp_grp_destroy(struct usnic_ib_qp_grp *qp_grp) +{ + struct usnic_ib_vf *vf = qp_grp->vf; + + WARN_ON(qp_grp->state != IB_QPS_RESET); + + spin_lock(&vf->lock); + usnic_ib_qp_grp_destroy(qp_grp); + spin_unlock(&vf->lock); +} + +static void eth_speed_to_ib_speed(int speed, u8 *active_speed, + u8 *active_width) +{ + if (speed <= 10000) { + *active_width = IB_WIDTH_1X; + *active_speed = IB_SPEED_FDR10; + } else if (speed <= 20000) { + *active_width = IB_WIDTH_4X; + *active_speed = IB_SPEED_DDR; + } else if (speed <= 30000) { + *active_width = IB_WIDTH_4X; + *active_speed = IB_SPEED_QDR; + } else if (speed <= 40000) { + *active_width = IB_WIDTH_4X; + *active_speed = IB_SPEED_FDR10; + } else { + *active_width = IB_WIDTH_4X; + *active_speed = IB_SPEED_EDR; + } +} + +static int create_qp_validate_user_data(struct usnic_ib_create_qp_cmd cmd) +{ + if (cmd.spec.trans_type <= USNIC_TRANSPORT_UNKNOWN || + cmd.spec.trans_type >= USNIC_TRANSPORT_MAX) + return -EINVAL; + + return 0; +} + +/* Start of ib callback functions */ + +enum rdma_link_layer usnic_ib_port_link_layer(struct ib_device *device, + u8 port_num) +{ + return IB_LINK_LAYER_ETHERNET; +} + +int usnic_ib_query_device(struct ib_device *ibdev, + struct ib_device_attr *props) +{ + struct usnic_ib_dev *us_ibdev = to_usdev(ibdev); + union ib_gid gid; + struct ethtool_drvinfo info; + struct ethtool_cmd cmd; + int qp_per_vf; + + usnic_dbg("\n"); + mutex_lock(&us_ibdev->usdev_lock); + us_ibdev->netdev->ethtool_ops->get_drvinfo(us_ibdev->netdev, &info); + us_ibdev->netdev->ethtool_ops->get_settings(us_ibdev->netdev, &cmd); + memset(props, 0, sizeof(*props)); + usnic_mac_ip_to_gid(us_ibdev->ufdev->mac, us_ibdev->ufdev->inaddr, + &gid.raw[0]); + memcpy(&props->sys_image_guid, &gid.global.interface_id, + sizeof(gid.global.interface_id)); + usnic_ib_fw_string_to_u64(&info.fw_version[0], &props->fw_ver); + props->max_mr_size = USNIC_UIOM_MAX_MR_SIZE; + props->page_size_cap = USNIC_UIOM_PAGE_SIZE; + props->vendor_id = PCI_VENDOR_ID_CISCO; + props->vendor_part_id = PCI_DEVICE_ID_CISCO_VIC_USPACE_NIC; + props->hw_ver = us_ibdev->pdev->subsystem_device; + qp_per_vf = max(us_ibdev->vf_res_cnt[USNIC_VNIC_RES_TYPE_WQ], + us_ibdev->vf_res_cnt[USNIC_VNIC_RES_TYPE_RQ]); + props->max_qp = qp_per_vf * + atomic_read(&us_ibdev->vf_cnt.refcount); + props->device_cap_flags = IB_DEVICE_PORT_ACTIVE_EVENT | + IB_DEVICE_SYS_IMAGE_GUID | IB_DEVICE_BLOCK_MULTICAST_LOOPBACK; + props->max_cq = us_ibdev->vf_res_cnt[USNIC_VNIC_RES_TYPE_CQ] * + atomic_read(&us_ibdev->vf_cnt.refcount); + props->max_pd = USNIC_UIOM_MAX_PD_CNT; + props->max_mr = USNIC_UIOM_MAX_MR_CNT; + props->local_ca_ack_delay = 0; + props->max_pkeys = 0; + props->atomic_cap = IB_ATOMIC_NONE; + props->masked_atomic_cap = props->atomic_cap; + props->max_qp_rd_atom = 0; + props->max_qp_init_rd_atom = 0; + props->max_res_rd_atom = 0; + props->max_srq = 0; + props->max_srq_wr = 0; + props->max_srq_sge = 0; + props->max_fast_reg_page_list_len = 0; + props->max_mcast_grp = 0; + props->max_mcast_qp_attach = 0; + props->max_total_mcast_qp_attach = 0; + props->max_map_per_fmr = 0; + /* Owned by Userspace + * max_qp_wr, max_sge, max_sge_rd, max_cqe */ + mutex_unlock(&us_ibdev->usdev_lock); + + return 0; +} + +int usnic_ib_query_port(struct ib_device *ibdev, u8 port, + struct ib_port_attr *props) +{ + struct usnic_ib_dev *us_ibdev = to_usdev(ibdev); + struct ethtool_cmd cmd; + + usnic_dbg("\n"); + + mutex_lock(&us_ibdev->usdev_lock); + us_ibdev->netdev->ethtool_ops->get_settings(us_ibdev->netdev, &cmd); + memset(props, 0, sizeof(*props)); + + props->lid = 0; + props->lmc = 1; + props->sm_lid = 0; + props->sm_sl = 0; + + if (!us_ibdev->ufdev->link_up) { + props->state = IB_PORT_DOWN; + props->phys_state = 3; + } else if (!us_ibdev->ufdev->inaddr) { + props->state = IB_PORT_INIT; + props->phys_state = 4; + } else { + props->state = IB_PORT_ACTIVE; + props->phys_state = 5; + } + + props->port_cap_flags = 0; + props->gid_tbl_len = 1; + props->pkey_tbl_len = 1; + props->bad_pkey_cntr = 0; + props->qkey_viol_cntr = 0; + eth_speed_to_ib_speed(cmd.speed, &props->active_speed, + &props->active_width); + props->max_mtu = IB_MTU_4096; + props->active_mtu = iboe_get_mtu(us_ibdev->ufdev->mtu); + /* Userspace will adjust for hdrs */ + props->max_msg_sz = us_ibdev->ufdev->mtu; + props->max_vl_num = 1; + mutex_unlock(&us_ibdev->usdev_lock); + + return 0; +} + +int usnic_ib_query_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr, + int qp_attr_mask, + struct ib_qp_init_attr *qp_init_attr) +{ + struct usnic_ib_qp_grp *qp_grp; + struct usnic_ib_vf *vf; + int err; + + usnic_dbg("\n"); + + memset(qp_attr, 0, sizeof(*qp_attr)); + memset(qp_init_attr, 0, sizeof(*qp_init_attr)); + + qp_grp = to_uqp_grp(qp); + vf = qp_grp->vf; + mutex_lock(&vf->pf->usdev_lock); + usnic_dbg("\n"); + qp_attr->qp_state = qp_grp->state; + qp_attr->cur_qp_state = qp_grp->state; + + switch (qp_grp->ibqp.qp_type) { + case IB_QPT_UD: + qp_attr->qkey = 0; + break; + default: + usnic_err("Unexpected qp_type %d\n", qp_grp->ibqp.qp_type); + err = -EINVAL; + goto err_out; + } + + mutex_unlock(&vf->pf->usdev_lock); + return 0; + +err_out: + mutex_unlock(&vf->pf->usdev_lock); + return err; +} + +int usnic_ib_query_gid(struct ib_device *ibdev, u8 port, int index, + union ib_gid *gid) +{ + + struct usnic_ib_dev *us_ibdev = to_usdev(ibdev); + usnic_dbg("\n"); + + if (index > 1) + return -EINVAL; + + mutex_lock(&us_ibdev->usdev_lock); + memset(&(gid->raw[0]), 0, sizeof(gid->raw)); + usnic_mac_ip_to_gid(us_ibdev->ufdev->mac, us_ibdev->ufdev->inaddr, + &gid->raw[0]); + mutex_unlock(&us_ibdev->usdev_lock); + + return 0; +} + +int usnic_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index, + u16 *pkey) +{ + if (index > 1) + return -EINVAL; + + *pkey = 0xffff; + return 0; +} + +struct ib_pd *usnic_ib_alloc_pd(struct ib_device *ibdev, + struct ib_ucontext *context, + struct ib_udata *udata) +{ + struct usnic_ib_pd *pd; + void *umem_pd; + + usnic_dbg("\n"); + + pd = kzalloc(sizeof(*pd), GFP_KERNEL); + if (!pd) + return ERR_PTR(-ENOMEM); + + umem_pd = pd->umem_pd = usnic_uiom_alloc_pd(); + if (IS_ERR_OR_NULL(umem_pd)) { + kfree(pd); + return ERR_PTR(umem_pd ? PTR_ERR(umem_pd) : -ENOMEM); + } + + usnic_info("domain 0x%p allocated for context 0x%p and device %s\n", + pd, context, ibdev->name); + return &pd->ibpd; +} + +int usnic_ib_dealloc_pd(struct ib_pd *pd) +{ + usnic_info("freeing domain 0x%p\n", pd); + + usnic_uiom_dealloc_pd((to_upd(pd))->umem_pd); + kfree(pd); + return 0; +} + +struct ib_qp *usnic_ib_create_qp(struct ib_pd *pd, + struct ib_qp_init_attr *init_attr, + struct ib_udata *udata) +{ + int err; + struct usnic_ib_dev *us_ibdev; + struct usnic_ib_qp_grp *qp_grp; + struct usnic_ib_ucontext *ucontext; + int cq_cnt; + struct usnic_vnic_res_spec res_spec; + struct usnic_ib_create_qp_cmd cmd; + struct usnic_transport_spec trans_spec; + + usnic_dbg("\n"); + + ucontext = to_uucontext(pd->uobject->context); + us_ibdev = to_usdev(pd->device); + + if (init_attr->create_flags) + return ERR_PTR(-EINVAL); + + err = ib_copy_from_udata(&cmd, udata, sizeof(cmd)); + if (err) { + usnic_err("%s: cannot copy udata for create_qp\n", + us_ibdev->ib_dev.name); + return ERR_PTR(-EINVAL); + } + + err = create_qp_validate_user_data(cmd); + if (err) { + usnic_err("%s: Failed to validate user data\n", + us_ibdev->ib_dev.name); + return ERR_PTR(-EINVAL); + } + + if (init_attr->qp_type != IB_QPT_UD) { + usnic_err("%s asked to make a non-UD QP: %d\n", + us_ibdev->ib_dev.name, init_attr->qp_type); + return ERR_PTR(-EINVAL); + } + + trans_spec = cmd.spec; + mutex_lock(&us_ibdev->usdev_lock); + cq_cnt = (init_attr->send_cq == init_attr->recv_cq) ? 1 : 2; + res_spec = min_transport_spec[trans_spec.trans_type]; + usnic_vnic_res_spec_update(&res_spec, USNIC_VNIC_RES_TYPE_CQ, cq_cnt); + qp_grp = find_free_vf_and_create_qp_grp(us_ibdev, to_upd(pd), + &trans_spec, + &res_spec); + if (IS_ERR_OR_NULL(qp_grp)) { + err = qp_grp ? PTR_ERR(qp_grp) : -ENOMEM; + goto out_release_mutex; + } + + err = usnic_ib_fill_create_qp_resp(qp_grp, udata); + if (err) { + err = -EBUSY; + goto out_release_qp_grp; + } + + qp_grp->ctx = ucontext; + list_add_tail(&qp_grp->link, &ucontext->qp_grp_list); + usnic_ib_log_vf(qp_grp->vf); + mutex_unlock(&us_ibdev->usdev_lock); + return &qp_grp->ibqp; + +out_release_qp_grp: + qp_grp_destroy(qp_grp); +out_release_mutex: + mutex_unlock(&us_ibdev->usdev_lock); + return ERR_PTR(err); +} + +int usnic_ib_destroy_qp(struct ib_qp *qp) +{ + struct usnic_ib_qp_grp *qp_grp; + struct usnic_ib_vf *vf; + + usnic_dbg("\n"); + + qp_grp = to_uqp_grp(qp); + vf = qp_grp->vf; + mutex_lock(&vf->pf->usdev_lock); + if (usnic_ib_qp_grp_modify(qp_grp, IB_QPS_RESET, NULL)) { + usnic_err("Failed to move qp grp %u to reset\n", + qp_grp->grp_id); + } + + list_del(&qp_grp->link); + qp_grp_destroy(qp_grp); + mutex_unlock(&vf->pf->usdev_lock); + + return 0; +} + +int usnic_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata) +{ + struct usnic_ib_qp_grp *qp_grp; + int status; + usnic_dbg("\n"); + + qp_grp = to_uqp_grp(ibqp); + + /* TODO: Future Support All States */ + mutex_lock(&qp_grp->vf->pf->usdev_lock); + if ((attr_mask & IB_QP_STATE) && attr->qp_state == IB_QPS_INIT) { + status = usnic_ib_qp_grp_modify(qp_grp, IB_QPS_INIT, NULL); + } else if ((attr_mask & IB_QP_STATE) && attr->qp_state == IB_QPS_RTR) { + status = usnic_ib_qp_grp_modify(qp_grp, IB_QPS_RTR, NULL); + } else if ((attr_mask & IB_QP_STATE) && attr->qp_state == IB_QPS_RTS) { + status = usnic_ib_qp_grp_modify(qp_grp, IB_QPS_RTS, NULL); + } else { + usnic_err("Unexpected combination mask: %u state: %u\n", + attr_mask & IB_QP_STATE, attr->qp_state); + status = -EINVAL; + } + + mutex_unlock(&qp_grp->vf->pf->usdev_lock); + return status; +} + +struct ib_cq *usnic_ib_create_cq(struct ib_device *ibdev, int entries, + int vector, struct ib_ucontext *context, + struct ib_udata *udata) +{ + struct ib_cq *cq; + + usnic_dbg("\n"); + cq = kzalloc(sizeof(*cq), GFP_KERNEL); + if (!cq) + return ERR_PTR(-EBUSY); + + return cq; +} + +int usnic_ib_destroy_cq(struct ib_cq *cq) +{ + usnic_dbg("\n"); + kfree(cq); + return 0; +} + +struct ib_mr *usnic_ib_reg_mr(struct ib_pd *pd, u64 start, u64 length, + u64 virt_addr, int access_flags, + struct ib_udata *udata) +{ + struct usnic_ib_mr *mr; + int err; + + usnic_dbg("start 0x%llx va 0x%llx length 0x%llx\n", start, + virt_addr, length); + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (IS_ERR_OR_NULL(mr)) + return ERR_PTR(mr ? PTR_ERR(mr) : -ENOMEM); + + mr->umem = usnic_uiom_reg_get(to_upd(pd)->umem_pd, start, length, + access_flags, 0); + if (IS_ERR_OR_NULL(mr->umem)) { + err = mr->umem ? PTR_ERR(mr->umem) : -EFAULT; + goto err_free; + } + + mr->ibmr.lkey = mr->ibmr.rkey = 0; + return &mr->ibmr; + +err_free: + kfree(mr); + return ERR_PTR(err); +} + +int usnic_ib_dereg_mr(struct ib_mr *ibmr) +{ + struct usnic_ib_mr *mr = to_umr(ibmr); + + usnic_dbg("va 0x%lx length 0x%zx\n", mr->umem->va, mr->umem->length); + + usnic_uiom_reg_release(mr->umem, ibmr->pd->uobject->context->closing); + kfree(mr); + return 0; +} + +struct ib_ucontext *usnic_ib_alloc_ucontext(struct ib_device *ibdev, + struct ib_udata *udata) +{ + struct usnic_ib_ucontext *context; + struct usnic_ib_dev *us_ibdev = to_usdev(ibdev); + usnic_dbg("\n"); + + context = kmalloc(sizeof(*context), GFP_KERNEL); + if (!context) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD(&context->qp_grp_list); + mutex_lock(&us_ibdev->usdev_lock); + list_add_tail(&context->link, &us_ibdev->ctx_list); + mutex_unlock(&us_ibdev->usdev_lock); + + return &context->ibucontext; +} + +int usnic_ib_dealloc_ucontext(struct ib_ucontext *ibcontext) +{ + struct usnic_ib_ucontext *context = to_uucontext(ibcontext); + struct usnic_ib_dev *us_ibdev = to_usdev(ibcontext->device); + usnic_dbg("\n"); + + mutex_lock(&us_ibdev->usdev_lock); + BUG_ON(!list_empty(&context->qp_grp_list)); + list_del(&context->link); + mutex_unlock(&us_ibdev->usdev_lock); + kfree(context); + return 0; +} + +int usnic_ib_mmap(struct ib_ucontext *context, + struct vm_area_struct *vma) +{ + struct usnic_ib_ucontext *uctx = to_ucontext(context); + struct usnic_ib_dev *us_ibdev; + struct usnic_ib_qp_grp *qp_grp; + struct usnic_ib_vf *vf; + struct vnic_dev_bar *bar; + dma_addr_t bus_addr; + unsigned int len; + unsigned int vfid; + + usnic_dbg("\n"); + + us_ibdev = to_usdev(context->device); + vma->vm_flags |= VM_IO; + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + vfid = vma->vm_pgoff; + usnic_dbg("Page Offset %lu PAGE_SHIFT %u VFID %u\n", + vma->vm_pgoff, PAGE_SHIFT, vfid); + + mutex_lock(&us_ibdev->usdev_lock); + list_for_each_entry(qp_grp, &uctx->qp_grp_list, link) { + vf = qp_grp->vf; + if (usnic_vnic_get_index(vf->vnic) == vfid) { + bar = usnic_vnic_get_bar(vf->vnic, 0); + if ((vma->vm_end - vma->vm_start) != bar->len) { + usnic_err("Bar0 Len %lu - Request map %lu\n", + bar->len, + vma->vm_end - vma->vm_start); + mutex_unlock(&us_ibdev->usdev_lock); + return -EINVAL; + } + bus_addr = bar->bus_addr; + len = bar->len; + usnic_dbg("bus: %pa vaddr: %p size: %ld\n", + &bus_addr, bar->vaddr, bar->len); + mutex_unlock(&us_ibdev->usdev_lock); + + return remap_pfn_range(vma, + vma->vm_start, + bus_addr >> PAGE_SHIFT, + len, vma->vm_page_prot); + } + } + + mutex_unlock(&us_ibdev->usdev_lock); + usnic_err("No VF %u found\n", vfid); + return -EINVAL; +} + +/* In ib callbacks section - Start of stub funcs */ +struct ib_ah *usnic_ib_create_ah(struct ib_pd *pd, + struct ib_ah_attr *ah_attr) +{ + usnic_dbg("\n"); + return ERR_PTR(-EPERM); +} + +int usnic_ib_destroy_ah(struct ib_ah *ah) +{ + usnic_dbg("\n"); + return -EINVAL; +} + +int usnic_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, + struct ib_send_wr **bad_wr) +{ + usnic_dbg("\n"); + return -EINVAL; +} + +int usnic_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr) +{ + usnic_dbg("\n"); + return -EINVAL; +} + +int usnic_ib_poll_cq(struct ib_cq *ibcq, int num_entries, + struct ib_wc *wc) +{ + usnic_dbg("\n"); + return -EINVAL; +} + +int usnic_ib_req_notify_cq(struct ib_cq *cq, + enum ib_cq_notify_flags flags) +{ + usnic_dbg("\n"); + return -EINVAL; +} + +struct ib_mr *usnic_ib_get_dma_mr(struct ib_pd *pd, int acc) +{ + usnic_dbg("\n"); + return ERR_PTR(-ENOMEM); +} + + +/* In ib callbacks section - End of stub funcs */ +/* End of ib callbacks section */ diff --git a/drivers/infiniband/hw/usnic/usnic_ib_verbs.h b/drivers/infiniband/hw/usnic/usnic_ib_verbs.h new file mode 100644 index 0000000..bb864f5 --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_ib_verbs.h @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef USNIC_IB_VERBS_H_ +#define USNIC_IB_VERBS_H_ + +#include "usnic_ib.h" + +enum rdma_link_layer usnic_ib_port_link_layer(struct ib_device *device, + u8 port_num); +int usnic_ib_query_device(struct ib_device *ibdev, + struct ib_device_attr *props); +int usnic_ib_query_port(struct ib_device *ibdev, u8 port, + struct ib_port_attr *props); +int usnic_ib_query_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr, + int qp_attr_mask, + struct ib_qp_init_attr *qp_init_attr); +int usnic_ib_query_gid(struct ib_device *ibdev, u8 port, int index, + union ib_gid *gid); +int usnic_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index, + u16 *pkey); +struct ib_pd *usnic_ib_alloc_pd(struct ib_device *ibdev, + struct ib_ucontext *context, + struct ib_udata *udata); +int usnic_ib_dealloc_pd(struct ib_pd *pd); +struct ib_qp *usnic_ib_create_qp(struct ib_pd *pd, + struct ib_qp_init_attr *init_attr, + struct ib_udata *udata); +int usnic_ib_destroy_qp(struct ib_qp *qp); +int usnic_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata); +struct ib_cq *usnic_ib_create_cq(struct ib_device *ibdev, int entries, + int vector, struct ib_ucontext *context, + struct ib_udata *udata); +int usnic_ib_destroy_cq(struct ib_cq *cq); +struct ib_mr *usnic_ib_reg_mr(struct ib_pd *pd, u64 start, u64 length, + u64 virt_addr, int access_flags, + struct ib_udata *udata); +int usnic_ib_dereg_mr(struct ib_mr *ibmr); +struct ib_ucontext *usnic_ib_alloc_ucontext(struct ib_device *ibdev, + struct ib_udata *udata); +int usnic_ib_dealloc_ucontext(struct ib_ucontext *ibcontext); +int usnic_ib_mmap(struct ib_ucontext *context, + struct vm_area_struct *vma); +struct ib_ah *usnic_ib_create_ah(struct ib_pd *pd, + struct ib_ah_attr *ah_attr); +int usnic_ib_destroy_ah(struct ib_ah *ah); +int usnic_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, + struct ib_send_wr **bad_wr); +int usnic_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr); +int usnic_ib_poll_cq(struct ib_cq *ibcq, int num_entries, + struct ib_wc *wc); +int usnic_ib_req_notify_cq(struct ib_cq *cq, + enum ib_cq_notify_flags flags); +struct ib_mr *usnic_ib_get_dma_mr(struct ib_pd *pd, int acc); +#endif /* !USNIC_IB_VERBS_H */ diff --git a/drivers/infiniband/hw/usnic/usnic_log.h b/drivers/infiniband/hw/usnic/usnic_log.h new file mode 100644 index 0000000..75777a6 --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_log.h @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef USNIC_LOG_H_ +#define USNIC_LOG_H_ + +#include "usnic.h" + +extern unsigned int usnic_log_lvl; + +#define USNIC_LOG_LVL_NONE (0) +#define USNIC_LOG_LVL_ERR (1) +#define USNIC_LOG_LVL_INFO (2) +#define USNIC_LOG_LVL_DBG (3) + +#define usnic_printk(lvl, args...) \ + do { \ + printk(lvl "%s:%s:%d: ", DRV_NAME, __func__, \ + __LINE__); \ + printk(args); \ + } while (0) + +#define usnic_dbg(args...) \ + do { \ + if (unlikely(usnic_log_lvl >= USNIC_LOG_LVL_DBG)) { \ + usnic_printk(KERN_INFO, args); \ + } \ +} while (0) + +#define usnic_info(args...) \ +do { \ + if (usnic_log_lvl >= USNIC_LOG_LVL_INFO) { \ + usnic_printk(KERN_INFO, args); \ + } \ +} while (0) + +#define usnic_err(args...) \ + do { \ + if (usnic_log_lvl >= USNIC_LOG_LVL_ERR) { \ + usnic_printk(KERN_ERR, args); \ + } \ + } while (0) +#endif /* !USNIC_LOG_H_ */ diff --git a/drivers/infiniband/hw/usnic/usnic_transport.c b/drivers/infiniband/hw/usnic/usnic_transport.c new file mode 100644 index 0000000..ddef6f7 --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_transport.c @@ -0,0 +1,202 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include +#include +#include +#include +#include + +#include "usnic_transport.h" +#include "usnic_log.h" + +/* ROCE */ +static unsigned long *roce_bitmap; +static u16 roce_next_port = 1; +#define ROCE_BITMAP_SZ ((1 << (8 /*CHAR_BIT*/ * sizeof(u16)))/8 /*CHAR BIT*/) +static DEFINE_SPINLOCK(roce_bitmap_lock); + +const char *usnic_transport_to_str(enum usnic_transport_type type) +{ + switch (type) { + case USNIC_TRANSPORT_UNKNOWN: + return "Unknown"; + case USNIC_TRANSPORT_ROCE_CUSTOM: + return "roce custom"; + case USNIC_TRANSPORT_IPV4_UDP: + return "IPv4 UDP"; + case USNIC_TRANSPORT_MAX: + return "Max?"; + default: + return "Not known"; + } +} + +int usnic_transport_sock_to_str(char *buf, int buf_sz, + struct socket *sock) +{ + int err; + uint32_t addr; + uint16_t port; + int proto; + + memset(buf, 0, buf_sz); + err = usnic_transport_sock_get_addr(sock, &proto, &addr, &port); + if (err) + return 0; + + return scnprintf(buf, buf_sz, "Proto:%u Addr:%pI4h Port:%hu", + proto, &addr, port); +} + +/* + * reserve a port number. if "0" specified, we will try to pick one + * starting at roce_next_port. roce_next_port will take on the values + * 1..4096 + */ +u16 usnic_transport_rsrv_port(enum usnic_transport_type type, u16 port_num) +{ + if (type == USNIC_TRANSPORT_ROCE_CUSTOM) { + spin_lock(&roce_bitmap_lock); + if (!port_num) { + port_num = bitmap_find_next_zero_area(roce_bitmap, + ROCE_BITMAP_SZ, + roce_next_port /* start */, + 1 /* nr */, + 0 /* align */); + roce_next_port = (port_num & 4095) + 1; + } else if (test_bit(port_num, roce_bitmap)) { + usnic_err("Failed to allocate port for %s\n", + usnic_transport_to_str(type)); + spin_unlock(&roce_bitmap_lock); + goto out_fail; + } + bitmap_set(roce_bitmap, port_num, 1); + spin_unlock(&roce_bitmap_lock); + } else { + usnic_err("Failed to allocate port - transport %s unsupported\n", + usnic_transport_to_str(type)); + goto out_fail; + } + + usnic_dbg("Allocating port %hu for %s\n", port_num, + usnic_transport_to_str(type)); + return port_num; + +out_fail: + return 0; +} + +void usnic_transport_unrsrv_port(enum usnic_transport_type type, u16 port_num) +{ + if (type == USNIC_TRANSPORT_ROCE_CUSTOM) { + spin_lock(&roce_bitmap_lock); + if (!port_num) { + usnic_err("Unreserved unvalid port num 0 for %s\n", + usnic_transport_to_str(type)); + goto out_roce_custom; + } + + if (!test_bit(port_num, roce_bitmap)) { + usnic_err("Unreserving invalid %hu for %s\n", + port_num, + usnic_transport_to_str(type)); + goto out_roce_custom; + } + bitmap_clear(roce_bitmap, port_num, 1); + usnic_dbg("Freeing port %hu for %s\n", port_num, + usnic_transport_to_str(type)); +out_roce_custom: + spin_unlock(&roce_bitmap_lock); + } else { + usnic_err("Freeing invalid port %hu for %d\n", port_num, type); + } +} + +struct socket *usnic_transport_get_socket(int sock_fd) +{ + struct socket *sock; + int err; + char buf[25]; + + /* sockfd_lookup will internally do a fget */ + sock = sockfd_lookup(sock_fd, &err); + if (!sock) { + usnic_err("Unable to lookup socket for fd %d with err %d\n", + sock_fd, err); + return ERR_PTR(-ENOENT); + } + + usnic_transport_sock_to_str(buf, sizeof(buf), sock); + usnic_dbg("Get sock %s\n", buf); + + return sock; +} + +void usnic_transport_put_socket(struct socket *sock) +{ + char buf[100]; + + usnic_transport_sock_to_str(buf, sizeof(buf), sock); + usnic_dbg("Put sock %s\n", buf); + sockfd_put(sock); +} + +int usnic_transport_sock_get_addr(struct socket *sock, int *proto, + uint32_t *addr, uint16_t *port) +{ + int len; + int err; + struct sockaddr_in sock_addr; + + err = sock->ops->getname(sock, + (struct sockaddr *)&sock_addr, + &len, 0); + if (err) + return err; + + if (sock_addr.sin_family != AF_INET) + return -EINVAL; + + if (proto) + *proto = sock->sk->sk_protocol; + if (port) + *port = ntohs(((struct sockaddr_in *)&sock_addr)->sin_port); + if (addr) + *addr = ntohl(((struct sockaddr_in *) + &sock_addr)->sin_addr.s_addr); + + return 0; +} + +int usnic_transport_init(void) +{ + roce_bitmap = kzalloc(ROCE_BITMAP_SZ, GFP_KERNEL); + if (!roce_bitmap) { + usnic_err("Failed to allocate bit map"); + return -ENOMEM; + } + + /* Do not ever allocate bit 0, hence set it here */ + bitmap_set(roce_bitmap, 0, 1); + return 0; +} + +void usnic_transport_fini(void) +{ + kfree(roce_bitmap); +} diff --git a/drivers/infiniband/hw/usnic/usnic_transport.h b/drivers/infiniband/hw/usnic/usnic_transport.h new file mode 100644 index 0000000..7e5dc6d --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_transport.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef USNIC_TRANSPORT_H_ +#define USNIC_TRANSPORT_H_ + +#include "usnic_abi.h" + +const char *usnic_transport_to_str(enum usnic_transport_type trans_type); +/* + * Returns number of bytes written, excluding null terminator. If + * nothing was written, the function returns 0. + */ +int usnic_transport_sock_to_str(char *buf, int buf_sz, + struct socket *sock); +/* + * Reserve a port. If "port_num" is set, then the function will try + * to reserve that particular port. + */ +u16 usnic_transport_rsrv_port(enum usnic_transport_type type, u16 port_num); +void usnic_transport_unrsrv_port(enum usnic_transport_type type, u16 port_num); +/* + * Do a fget on the socket refered to by sock_fd and returns the socket. + * Socket will not be destroyed before usnic_transport_put_socket has + * been called. + */ +struct socket *usnic_transport_get_socket(int sock_fd); +void usnic_transport_put_socket(struct socket *sock); +/* + * Call usnic_transport_get_socket before calling *_sock_get_addr + */ +int usnic_transport_sock_get_addr(struct socket *sock, int *proto, + uint32_t *addr, uint16_t *port); +int usnic_transport_init(void); +void usnic_transport_fini(void); +#endif /* !USNIC_TRANSPORT_H */ diff --git a/drivers/infiniband/hw/usnic/usnic_uiom.c b/drivers/infiniband/hw/usnic/usnic_uiom.c new file mode 100644 index 0000000..417de1f --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_uiom.c @@ -0,0 +1,604 @@ +/* + * Copyright (c) 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies. All rights reserved. + * Copyright (c) 2013 Cisco Systems. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "usnic_log.h" +#include "usnic_uiom.h" +#include "usnic_uiom_interval_tree.h" + +static struct workqueue_struct *usnic_uiom_wq; + +#define USNIC_UIOM_PAGE_CHUNK \ + ((PAGE_SIZE - offsetof(struct usnic_uiom_chunk, page_list)) /\ + ((void *) &((struct usnic_uiom_chunk *) 0)->page_list[1] - \ + (void *) &((struct usnic_uiom_chunk *) 0)->page_list[0])) + +static void usnic_uiom_reg_account(struct work_struct *work) +{ + struct usnic_uiom_reg *umem = container_of(work, + struct usnic_uiom_reg, work); + + down_write(&umem->mm->mmap_sem); + umem->mm->locked_vm -= umem->diff; + up_write(&umem->mm->mmap_sem); + mmput(umem->mm); + kfree(umem); +} + +static int usnic_uiom_dma_fault(struct iommu_domain *domain, + struct device *dev, + unsigned long iova, int flags, + void *token) +{ + usnic_err("Device %s iommu fault domain 0x%pK va 0x%lx flags 0x%x\n", + dev_name(dev), + domain, iova, flags); + return -ENOSYS; +} + +static void usnic_uiom_put_pages(struct list_head *chunk_list, int dirty) +{ + struct usnic_uiom_chunk *chunk, *tmp; + struct page *page; + struct scatterlist *sg; + int i; + dma_addr_t pa; + + list_for_each_entry_safe(chunk, tmp, chunk_list, list) { + for_each_sg(chunk->page_list, sg, chunk->nents, i) { + page = sg_page(sg); + pa = sg_phys(sg); + if (dirty) + set_page_dirty_lock(page); + put_page(page); + usnic_dbg("pa: %pa\n", &pa); + } + kfree(chunk); + } +} + +static int usnic_uiom_get_pages(unsigned long addr, size_t size, int writable, + int dmasync, struct list_head *chunk_list) +{ + struct page **page_list; + struct scatterlist *sg; + struct usnic_uiom_chunk *chunk; + unsigned long locked; + unsigned long lock_limit; + unsigned long cur_base; + unsigned long npages; + int ret; + int off; + int i; + int flags; + dma_addr_t pa; + DEFINE_DMA_ATTRS(attrs); + + if (dmasync) + dma_set_attr(DMA_ATTR_WRITE_BARRIER, &attrs); + + if (!can_do_mlock()) + return -EPERM; + + INIT_LIST_HEAD(chunk_list); + + page_list = (struct page **) __get_free_page(GFP_KERNEL); + if (!page_list) + return -ENOMEM; + + npages = PAGE_ALIGN(size + (addr & ~PAGE_MASK)) >> PAGE_SHIFT; + + down_write(¤t->mm->mmap_sem); + + locked = npages + current->mm->locked_vm; + lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + + if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) { + ret = -ENOMEM; + goto out; + } + + flags = IOMMU_READ | IOMMU_CACHE; + flags |= (writable) ? IOMMU_WRITE : 0; + cur_base = addr & PAGE_MASK; + ret = 0; + + while (npages) { + ret = get_user_pages(current, current->mm, cur_base, + min_t(unsigned long, npages, + PAGE_SIZE / sizeof(struct page *)), + 1, !writable, page_list, NULL); + + if (ret < 0) + goto out; + + npages -= ret; + off = 0; + + while (ret) { + chunk = kmalloc(sizeof(*chunk) + + sizeof(struct scatterlist) * + min_t(int, ret, USNIC_UIOM_PAGE_CHUNK), + GFP_KERNEL); + if (!chunk) { + ret = -ENOMEM; + goto out; + } + + chunk->nents = min_t(int, ret, USNIC_UIOM_PAGE_CHUNK); + sg_init_table(chunk->page_list, chunk->nents); + for_each_sg(chunk->page_list, sg, chunk->nents, i) { + sg_set_page(sg, page_list[i + off], + PAGE_SIZE, 0); + pa = sg_phys(sg); + usnic_dbg("va: 0x%lx pa: %pa\n", + cur_base + i*PAGE_SIZE, &pa); + } + cur_base += chunk->nents * PAGE_SIZE; + ret -= chunk->nents; + off += chunk->nents; + list_add_tail(&chunk->list, chunk_list); + } + + ret = 0; + } + +out: + if (ret < 0) + usnic_uiom_put_pages(chunk_list, 0); + else + current->mm->locked_vm = locked; + + up_write(¤t->mm->mmap_sem); + free_page((unsigned long) page_list); + return ret; +} + +static void usnic_uiom_unmap_sorted_intervals(struct list_head *intervals, + struct usnic_uiom_pd *pd) +{ + struct usnic_uiom_interval_node *interval, *tmp; + long unsigned va, size; + + list_for_each_entry_safe(interval, tmp, intervals, link) { + va = interval->start << PAGE_SHIFT; + size = ((interval->last - interval->start) + 1) << PAGE_SHIFT; + while (size > 0) { + /* Workaround for RH 970401 */ + usnic_dbg("va 0x%lx size 0x%lx", va, PAGE_SIZE); + iommu_unmap(pd->domain, va, PAGE_SIZE); + va += PAGE_SIZE; + size -= PAGE_SIZE; + } + } +} + +static void __usnic_uiom_reg_release(struct usnic_uiom_pd *pd, + struct usnic_uiom_reg *uiomr, + int dirty) +{ + int npages; + unsigned long vpn_start, vpn_last; + struct usnic_uiom_interval_node *interval, *tmp; + int writable = 0; + LIST_HEAD(rm_intervals); + + npages = PAGE_ALIGN(uiomr->length + uiomr->offset) >> PAGE_SHIFT; + vpn_start = (uiomr->va & PAGE_MASK) >> PAGE_SHIFT; + vpn_last = vpn_start + npages - 1; + + spin_lock(&pd->lock); + usnic_uiom_remove_interval(&pd->rb_root, vpn_start, + vpn_last, &rm_intervals); + usnic_uiom_unmap_sorted_intervals(&rm_intervals, pd); + + list_for_each_entry_safe(interval, tmp, &rm_intervals, link) { + if (interval->flags & IOMMU_WRITE) + writable = 1; + list_del(&interval->link); + kfree(interval); + } + + usnic_uiom_put_pages(&uiomr->chunk_list, dirty & writable); + spin_unlock(&pd->lock); +} + +static int usnic_uiom_map_sorted_intervals(struct list_head *intervals, + struct usnic_uiom_reg *uiomr) +{ + int i, err; + size_t size; + struct usnic_uiom_chunk *chunk; + struct usnic_uiom_interval_node *interval_node; + dma_addr_t pa; + dma_addr_t pa_start = 0; + dma_addr_t pa_end = 0; + long int va_start = -EINVAL; + struct usnic_uiom_pd *pd = uiomr->pd; + long int va = uiomr->va & PAGE_MASK; + int flags = IOMMU_READ | IOMMU_CACHE; + + flags |= (uiomr->writable) ? IOMMU_WRITE : 0; + chunk = list_first_entry(&uiomr->chunk_list, struct usnic_uiom_chunk, + list); + list_for_each_entry(interval_node, intervals, link) { +iter_chunk: + for (i = 0; i < chunk->nents; i++, va += PAGE_SIZE) { + pa = sg_phys(&chunk->page_list[i]); + if ((va >> PAGE_SHIFT) < interval_node->start) + continue; + + if ((va >> PAGE_SHIFT) == interval_node->start) { + /* First page of the interval */ + va_start = va; + pa_start = pa; + pa_end = pa; + } + + WARN_ON(va_start == -EINVAL); + + if ((pa_end + PAGE_SIZE != pa) && + (pa != pa_start)) { + /* PAs are not contiguous */ + size = pa_end - pa_start + PAGE_SIZE; + usnic_dbg("va 0x%lx pa %pa size 0x%zx flags 0x%x", + va_start, &pa_start, size, flags); + err = iommu_map(pd->domain, va_start, pa_start, + size, flags); + if (err) { + usnic_err("Failed to map va 0x%lx pa %pa size 0x%zx with err %d\n", + va_start, &pa_start, size, err); + goto err_out; + } + va_start = va; + pa_start = pa; + pa_end = pa; + } + + if ((va >> PAGE_SHIFT) == interval_node->last) { + /* Last page of the interval */ + size = pa - pa_start + PAGE_SIZE; + usnic_dbg("va 0x%lx pa %pa size 0x%zx flags 0x%x\n", + va_start, &pa_start, size, flags); + err = iommu_map(pd->domain, va_start, pa_start, + size, flags); + if (err) { + usnic_err("Failed to map va 0x%lx pa %pa size 0x%zx with err %d\n", + va_start, &pa_start, size, err); + goto err_out; + } + break; + } + + if (pa != pa_start) + pa_end += PAGE_SIZE; + } + + if (i == chunk->nents) { + /* + * Hit last entry of the chunk, + * hence advance to next chunk + */ + chunk = list_first_entry(&chunk->list, + struct usnic_uiom_chunk, + list); + goto iter_chunk; + } + } + + return 0; + +err_out: + usnic_uiom_unmap_sorted_intervals(intervals, pd); + return err; +} + +struct usnic_uiom_reg *usnic_uiom_reg_get(struct usnic_uiom_pd *pd, + unsigned long addr, size_t size, + int writable, int dmasync) +{ + struct usnic_uiom_reg *uiomr; + unsigned long va_base, vpn_start, vpn_last; + unsigned long npages; + int offset, err; + LIST_HEAD(sorted_diff_intervals); + + /* + * Intel IOMMU map throws an error if a translation entry is + * changed from read to write. This module may not unmap + * and then remap the entry after fixing the permission + * b/c this open up a small windows where hw DMA may page fault + * Hence, make all entries to be writable. + */ + writable = 1; + + va_base = addr & PAGE_MASK; + offset = addr & ~PAGE_MASK; + npages = PAGE_ALIGN(size + offset) >> PAGE_SHIFT; + vpn_start = (addr & PAGE_MASK) >> PAGE_SHIFT; + vpn_last = vpn_start + npages - 1; + + uiomr = kmalloc(sizeof(*uiomr), GFP_KERNEL); + if (!uiomr) + return ERR_PTR(-ENOMEM); + + uiomr->va = va_base; + uiomr->offset = offset; + uiomr->length = size; + uiomr->writable = writable; + uiomr->pd = pd; + + err = usnic_uiom_get_pages(addr, size, writable, dmasync, + &uiomr->chunk_list); + if (err) { + usnic_err("Failed get_pages vpn [0x%lx,0x%lx] err %d\n", + vpn_start, vpn_last, err); + goto out_free_uiomr; + } + + spin_lock(&pd->lock); + err = usnic_uiom_get_intervals_diff(vpn_start, vpn_last, + (writable) ? IOMMU_WRITE : 0, + IOMMU_WRITE, + &pd->rb_root, + &sorted_diff_intervals); + if (err) { + usnic_err("Failed disjoint interval vpn [0x%lx,0x%lx] err %d\n", + vpn_start, vpn_last, err); + goto out_put_pages; + } + + err = usnic_uiom_map_sorted_intervals(&sorted_diff_intervals, uiomr); + if (err) { + usnic_err("Failed map interval vpn [0x%lx,0x%lx] err %d\n", + vpn_start, vpn_last, err); + goto out_put_intervals; + + } + + err = usnic_uiom_insert_interval(&pd->rb_root, vpn_start, vpn_last, + (writable) ? IOMMU_WRITE : 0); + if (err) { + usnic_err("Failed insert interval vpn [0x%lx,0x%lx] err %d\n", + vpn_start, vpn_last, err); + goto out_unmap_intervals; + } + + usnic_uiom_put_interval_set(&sorted_diff_intervals); + spin_unlock(&pd->lock); + + return uiomr; + +out_unmap_intervals: + usnic_uiom_unmap_sorted_intervals(&sorted_diff_intervals, pd); +out_put_intervals: + usnic_uiom_put_interval_set(&sorted_diff_intervals); +out_put_pages: + usnic_uiom_put_pages(&uiomr->chunk_list, 0); + spin_unlock(&pd->lock); +out_free_uiomr: + kfree(uiomr); + return ERR_PTR(err); +} + +void usnic_uiom_reg_release(struct usnic_uiom_reg *uiomr, int closing) +{ + struct mm_struct *mm; + unsigned long diff; + + __usnic_uiom_reg_release(uiomr->pd, uiomr, 1); + + mm = get_task_mm(current); + if (!mm) { + kfree(uiomr); + return; + } + + diff = PAGE_ALIGN(uiomr->length + uiomr->offset) >> PAGE_SHIFT; + + /* + * We may be called with the mm's mmap_sem already held. This + * can happen when a userspace munmap() is the call that drops + * the last reference to our file and calls our release + * method. If there are memory regions to destroy, we'll end + * up here and not be able to take the mmap_sem. In that case + * we defer the vm_locked accounting to the system workqueue. + */ + if (closing) { + if (!down_write_trylock(&mm->mmap_sem)) { + INIT_WORK(&uiomr->work, usnic_uiom_reg_account); + uiomr->mm = mm; + uiomr->diff = diff; + + queue_work(usnic_uiom_wq, &uiomr->work); + return; + } + } else + down_write(&mm->mmap_sem); + + current->mm->locked_vm -= diff; + up_write(&mm->mmap_sem); + mmput(mm); + kfree(uiomr); +} + +struct usnic_uiom_pd *usnic_uiom_alloc_pd(void) +{ + struct usnic_uiom_pd *pd; + void *domain; + + pd = kzalloc(sizeof(*pd), GFP_KERNEL); + if (!pd) + return ERR_PTR(-ENOMEM); + + pd->domain = domain = iommu_domain_alloc(&pci_bus_type); + if (IS_ERR_OR_NULL(domain)) { + usnic_err("Failed to allocate IOMMU domain with err %ld\n", + PTR_ERR(pd->domain)); + kfree(pd); + return ERR_PTR(domain ? PTR_ERR(domain) : -ENOMEM); + } + + iommu_set_fault_handler(pd->domain, usnic_uiom_dma_fault, NULL); + + spin_lock_init(&pd->lock); + INIT_LIST_HEAD(&pd->devs); + + return pd; +} + +void usnic_uiom_dealloc_pd(struct usnic_uiom_pd *pd) +{ + iommu_domain_free(pd->domain); + kfree(pd); +} + +int usnic_uiom_attach_dev_to_pd(struct usnic_uiom_pd *pd, struct device *dev) +{ + struct usnic_uiom_dev *uiom_dev; + int err; + + uiom_dev = kzalloc(sizeof(*uiom_dev), GFP_ATOMIC); + if (!uiom_dev) + return -ENOMEM; + uiom_dev->dev = dev; + + err = iommu_attach_device(pd->domain, dev); + if (err) + goto out_free_dev; + + if (!iommu_capable(dev->bus, IOMMU_CAP_CACHE_COHERENCY)) { + usnic_err("IOMMU of %s does not support cache coherency\n", + dev_name(dev)); + err = -EINVAL; + goto out_detach_device; + } + + spin_lock(&pd->lock); + list_add_tail(&uiom_dev->link, &pd->devs); + pd->dev_cnt++; + spin_unlock(&pd->lock); + + return 0; + +out_detach_device: + iommu_detach_device(pd->domain, dev); +out_free_dev: + kfree(uiom_dev); + return err; +} + +void usnic_uiom_detach_dev_from_pd(struct usnic_uiom_pd *pd, struct device *dev) +{ + struct usnic_uiom_dev *uiom_dev; + int found = 0; + + spin_lock(&pd->lock); + list_for_each_entry(uiom_dev, &pd->devs, link) { + if (uiom_dev->dev == dev) { + found = 1; + break; + } + } + + if (!found) { + usnic_err("Unable to free dev %s - not found\n", + dev_name(dev)); + spin_unlock(&pd->lock); + return; + } + + list_del(&uiom_dev->link); + pd->dev_cnt--; + spin_unlock(&pd->lock); + + return iommu_detach_device(pd->domain, dev); +} + +struct device **usnic_uiom_get_dev_list(struct usnic_uiom_pd *pd) +{ + struct usnic_uiom_dev *uiom_dev; + struct device **devs; + int i = 0; + + spin_lock(&pd->lock); + devs = kcalloc(pd->dev_cnt + 1, sizeof(*devs), GFP_ATOMIC); + if (!devs) { + devs = ERR_PTR(-ENOMEM); + goto out; + } + + list_for_each_entry(uiom_dev, &pd->devs, link) { + devs[i++] = uiom_dev->dev; + } +out: + spin_unlock(&pd->lock); + return devs; +} + +void usnic_uiom_free_dev_list(struct device **devs) +{ + kfree(devs); +} + +int usnic_uiom_init(char *drv_name) +{ + if (!iommu_present(&pci_bus_type)) { + usnic_err("IOMMU required but not present or enabled. USNIC QPs will not function w/o enabling IOMMU\n"); + return -EPERM; + } + + usnic_uiom_wq = create_workqueue(drv_name); + if (!usnic_uiom_wq) { + usnic_err("Unable to alloc wq for drv %s\n", drv_name); + return -ENOMEM; + } + + return 0; +} + +void usnic_uiom_fini(void) +{ + flush_workqueue(usnic_uiom_wq); + destroy_workqueue(usnic_uiom_wq); +} diff --git a/drivers/infiniband/hw/usnic/usnic_uiom.h b/drivers/infiniband/hw/usnic/usnic_uiom.h new file mode 100644 index 0000000..7044099 --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_uiom.h @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef USNIC_UIOM_H_ +#define USNIC_UIOM_H_ + +#include +#include + +#include "usnic_uiom_interval_tree.h" + +#define USNIC_UIOM_READ (1) +#define USNIC_UIOM_WRITE (2) + +#define USNIC_UIOM_MAX_PD_CNT (1000) +#define USNIC_UIOM_MAX_MR_CNT (1000000) +#define USNIC_UIOM_MAX_MR_SIZE (~0UL) +#define USNIC_UIOM_PAGE_SIZE (PAGE_SIZE) + +struct usnic_uiom_dev { + struct device *dev; + struct list_head link; +}; + +struct usnic_uiom_pd { + struct iommu_domain *domain; + spinlock_t lock; + struct rb_root rb_root; + struct list_head devs; + int dev_cnt; +}; + +struct usnic_uiom_reg { + struct usnic_uiom_pd *pd; + unsigned long va; + size_t length; + int offset; + int page_size; + int writable; + struct list_head chunk_list; + struct work_struct work; + struct mm_struct *mm; + unsigned long diff; +}; + +struct usnic_uiom_chunk { + struct list_head list; + int nents; + struct scatterlist page_list[0]; +}; + +struct usnic_uiom_pd *usnic_uiom_alloc_pd(void); +void usnic_uiom_dealloc_pd(struct usnic_uiom_pd *pd); +int usnic_uiom_attach_dev_to_pd(struct usnic_uiom_pd *pd, struct device *dev); +void usnic_uiom_detach_dev_from_pd(struct usnic_uiom_pd *pd, + struct device *dev); +struct device **usnic_uiom_get_dev_list(struct usnic_uiom_pd *pd); +void usnic_uiom_free_dev_list(struct device **devs); +struct usnic_uiom_reg *usnic_uiom_reg_get(struct usnic_uiom_pd *pd, + unsigned long addr, size_t size, + int access, int dmasync); +void usnic_uiom_reg_release(struct usnic_uiom_reg *uiomr, int closing); +int usnic_uiom_init(char *drv_name); +void usnic_uiom_fini(void); +#endif /* USNIC_UIOM_H_ */ diff --git a/drivers/infiniband/hw/usnic/usnic_uiom_interval_tree.c b/drivers/infiniband/hw/usnic/usnic_uiom_interval_tree.c new file mode 100644 index 0000000..3a4288e --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_uiom_interval_tree.c @@ -0,0 +1,254 @@ +/* + * Copyright (c) 2014, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include +#include +#include +#include + +#include +#include "usnic_uiom_interval_tree.h" + +#define START(node) ((node)->start) +#define LAST(node) ((node)->last) + +#define MAKE_NODE(node, start, end, ref_cnt, flags, err, err_out) \ + do { \ + node = usnic_uiom_interval_node_alloc(start, \ + end, ref_cnt, flags); \ + if (!node) { \ + err = -ENOMEM; \ + goto err_out; \ + } \ + } while (0) + +#define MARK_FOR_ADD(node, list) (list_add_tail(&node->link, list)) + +#define MAKE_NODE_AND_APPEND(node, start, end, ref_cnt, flags, err, \ + err_out, list) \ + do { \ + MAKE_NODE(node, start, end, \ + ref_cnt, flags, err, \ + err_out); \ + MARK_FOR_ADD(node, list); \ + } while (0) + +#define FLAGS_EQUAL(flags1, flags2, mask) \ + (((flags1) & (mask)) == ((flags2) & (mask))) + +static struct usnic_uiom_interval_node* +usnic_uiom_interval_node_alloc(long int start, long int last, int ref_cnt, + int flags) +{ + struct usnic_uiom_interval_node *interval = kzalloc(sizeof(*interval), + GFP_ATOMIC); + if (!interval) + return NULL; + + interval->start = start; + interval->last = last; + interval->flags = flags; + interval->ref_cnt = ref_cnt; + + return interval; +} + +static int interval_cmp(void *priv, struct list_head *a, struct list_head *b) +{ + struct usnic_uiom_interval_node *node_a, *node_b; + + node_a = list_entry(a, struct usnic_uiom_interval_node, link); + node_b = list_entry(b, struct usnic_uiom_interval_node, link); + + /* long to int */ + if (node_a->start < node_b->start) + return -1; + else if (node_a->start > node_b->start) + return 1; + + return 0; +} + +static void +find_intervals_intersection_sorted(struct rb_root *root, unsigned long start, + unsigned long last, + struct list_head *list) +{ + struct usnic_uiom_interval_node *node; + + INIT_LIST_HEAD(list); + + for (node = usnic_uiom_interval_tree_iter_first(root, start, last); + node; + node = usnic_uiom_interval_tree_iter_next(node, start, last)) + list_add_tail(&node->link, list); + + list_sort(NULL, list, interval_cmp); +} + +int usnic_uiom_get_intervals_diff(unsigned long start, unsigned long last, + int flags, int flag_mask, + struct rb_root *root, + struct list_head *diff_set) +{ + struct usnic_uiom_interval_node *interval, *tmp; + int err = 0; + long int pivot = start; + LIST_HEAD(intersection_set); + + INIT_LIST_HEAD(diff_set); + + find_intervals_intersection_sorted(root, start, last, + &intersection_set); + + list_for_each_entry(interval, &intersection_set, link) { + if (pivot < interval->start) { + MAKE_NODE_AND_APPEND(tmp, pivot, interval->start - 1, + 1, flags, err, err_out, + diff_set); + pivot = interval->start; + } + + /* + * Invariant: Set [start, pivot] is either in diff_set or root, + * but not in both. + */ + + if (pivot > interval->last) { + continue; + } else if (pivot <= interval->last && + FLAGS_EQUAL(interval->flags, flags, + flag_mask)) { + pivot = interval->last + 1; + } + } + + if (pivot <= last) + MAKE_NODE_AND_APPEND(tmp, pivot, last, 1, flags, err, err_out, + diff_set); + + return 0; + +err_out: + list_for_each_entry_safe(interval, tmp, diff_set, link) { + list_del(&interval->link); + kfree(interval); + } + + return err; +} + +void usnic_uiom_put_interval_set(struct list_head *intervals) +{ + struct usnic_uiom_interval_node *interval, *tmp; + list_for_each_entry_safe(interval, tmp, intervals, link) + kfree(interval); +} + +int usnic_uiom_insert_interval(struct rb_root *root, unsigned long start, + unsigned long last, int flags) +{ + struct usnic_uiom_interval_node *interval, *tmp; + unsigned long istart, ilast; + int iref_cnt, iflags; + unsigned long lpivot = start; + int err = 0; + LIST_HEAD(to_add); + LIST_HEAD(intersection_set); + + find_intervals_intersection_sorted(root, start, last, + &intersection_set); + + list_for_each_entry(interval, &intersection_set, link) { + /* + * Invariant - lpivot is the left edge of next interval to be + * inserted + */ + istart = interval->start; + ilast = interval->last; + iref_cnt = interval->ref_cnt; + iflags = interval->flags; + + if (istart < lpivot) { + MAKE_NODE_AND_APPEND(tmp, istart, lpivot - 1, iref_cnt, + iflags, err, err_out, &to_add); + } else if (istart > lpivot) { + MAKE_NODE_AND_APPEND(tmp, lpivot, istart - 1, 1, flags, + err, err_out, &to_add); + lpivot = istart; + } else { + lpivot = istart; + } + + if (ilast > last) { + MAKE_NODE_AND_APPEND(tmp, lpivot, last, iref_cnt + 1, + iflags | flags, err, err_out, + &to_add); + MAKE_NODE_AND_APPEND(tmp, last + 1, ilast, iref_cnt, + iflags, err, err_out, &to_add); + } else { + MAKE_NODE_AND_APPEND(tmp, lpivot, ilast, iref_cnt + 1, + iflags | flags, err, err_out, + &to_add); + } + + lpivot = ilast + 1; + } + + if (lpivot <= last) + MAKE_NODE_AND_APPEND(tmp, lpivot, last, 1, flags, err, err_out, + &to_add); + + list_for_each_entry_safe(interval, tmp, &intersection_set, link) { + usnic_uiom_interval_tree_remove(interval, root); + kfree(interval); + } + + list_for_each_entry(interval, &to_add, link) + usnic_uiom_interval_tree_insert(interval, root); + + return 0; + +err_out: + list_for_each_entry_safe(interval, tmp, &to_add, link) + kfree(interval); + + return err; +} + +void usnic_uiom_remove_interval(struct rb_root *root, unsigned long start, + unsigned long last, struct list_head *removed) +{ + struct usnic_uiom_interval_node *interval; + + for (interval = usnic_uiom_interval_tree_iter_first(root, start, last); + interval; + interval = usnic_uiom_interval_tree_iter_next(interval, + start, + last)) { + if (--interval->ref_cnt == 0) + list_add_tail(&interval->link, removed); + } + + list_for_each_entry(interval, removed, link) + usnic_uiom_interval_tree_remove(interval, root); +} + +INTERVAL_TREE_DEFINE(struct usnic_uiom_interval_node, rb, + unsigned long, __subtree_last, + START, LAST, , usnic_uiom_interval_tree) diff --git a/drivers/infiniband/hw/usnic/usnic_uiom_interval_tree.h b/drivers/infiniband/hw/usnic/usnic_uiom_interval_tree.h new file mode 100644 index 0000000..d4f752e --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_uiom_interval_tree.h @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef USNIC_UIOM_INTERVAL_TREE_H_ +#define USNIC_UIOM_INTERVAL_TREE_H_ + +#include + +struct usnic_uiom_interval_node { + struct rb_node rb; + struct list_head link; + unsigned long start; + unsigned long last; + unsigned long __subtree_last; + unsigned int ref_cnt; + int flags; +}; + +extern void +usnic_uiom_interval_tree_insert(struct usnic_uiom_interval_node *node, + struct rb_root *root); +extern void +usnic_uiom_interval_tree_remove(struct usnic_uiom_interval_node *node, + struct rb_root *root); +extern struct usnic_uiom_interval_node * +usnic_uiom_interval_tree_iter_first(struct rb_root *root, + unsigned long start, + unsigned long last); +extern struct usnic_uiom_interval_node * +usnic_uiom_interval_tree_iter_next(struct usnic_uiom_interval_node *node, + unsigned long start, unsigned long last); +/* + * Inserts {start...last} into {root}. If there are overlaps, + * nodes will be broken up and merged + */ +int usnic_uiom_insert_interval(struct rb_root *root, + unsigned long start, unsigned long last, + int flags); +/* + * Removed {start...last} from {root}. The nodes removed are returned in + * 'removed.' The caller is responsibile for freeing memory of nodes in + * 'removed.' + */ +void usnic_uiom_remove_interval(struct rb_root *root, + unsigned long start, unsigned long last, + struct list_head *removed); +/* + * Returns {start...last} - {root} (relative complement of {start...last} in + * {root}) in diff_set sorted ascendingly + */ +int usnic_uiom_get_intervals_diff(unsigned long start, + unsigned long last, int flags, + int flag_mask, + struct rb_root *root, + struct list_head *diff_set); +/* Call this to free diff_set returned by usnic_uiom_get_intervals_diff */ +void usnic_uiom_put_interval_set(struct list_head *intervals); +#endif /* USNIC_UIOM_INTERVAL_TREE_H_ */ diff --git a/drivers/infiniband/hw/usnic/usnic_vnic.c b/drivers/infiniband/hw/usnic/usnic_vnic.c new file mode 100644 index 0000000..656b88c --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_vnic.c @@ -0,0 +1,467 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include +#include +#include + +#include "usnic_ib.h" +#include "vnic_resource.h" +#include "usnic_log.h" +#include "usnic_vnic.h" + +struct usnic_vnic { + struct vnic_dev *vdev; + struct vnic_dev_bar bar[PCI_NUM_RESOURCES]; + struct usnic_vnic_res_chunk chunks[USNIC_VNIC_RES_TYPE_MAX]; + spinlock_t res_lock; +}; + +static enum vnic_res_type _to_vnic_res_type(enum usnic_vnic_res_type res_type) +{ +#define DEFINE_USNIC_VNIC_RES_AT(usnic_vnic_res_t, vnic_res_type, desc, val) \ + vnic_res_type, +#define DEFINE_USNIC_VNIC_RES(usnic_vnic_res_t, vnic_res_type, desc) \ + vnic_res_type, + static enum vnic_res_type usnic_vnic_type_2_vnic_type[] = { + USNIC_VNIC_RES_TYPES}; +#undef DEFINE_USNIC_VNIC_RES +#undef DEFINE_USNIC_VNIC_RES_AT + + if (res_type >= USNIC_VNIC_RES_TYPE_MAX) + return RES_TYPE_MAX; + + return usnic_vnic_type_2_vnic_type[res_type]; +} + +const char *usnic_vnic_res_type_to_str(enum usnic_vnic_res_type res_type) +{ +#define DEFINE_USNIC_VNIC_RES_AT(usnic_vnic_res_t, vnic_res_type, desc, val) \ + desc, +#define DEFINE_USNIC_VNIC_RES(usnic_vnic_res_t, vnic_res_type, desc) \ + desc, + static const char * const usnic_vnic_res_type_desc[] = { + USNIC_VNIC_RES_TYPES}; +#undef DEFINE_USNIC_VNIC_RES +#undef DEFINE_USNIC_VNIC_RES_AT + + if (res_type >= USNIC_VNIC_RES_TYPE_MAX) + return "unknown"; + + return usnic_vnic_res_type_desc[res_type]; + +} + +const char *usnic_vnic_pci_name(struct usnic_vnic *vnic) +{ + return pci_name(usnic_vnic_get_pdev(vnic)); +} + +int usnic_vnic_dump(struct usnic_vnic *vnic, char *buf, + int buf_sz, + void *hdr_obj, + int (*printtitle)(void *, char*, int), + int (*printcols)(char *, int), + int (*printrow)(void *, char *, int)) +{ + struct usnic_vnic_res_chunk *chunk; + struct usnic_vnic_res *res; + struct vnic_dev_bar *bar0; + int i, j, offset; + + offset = 0; + bar0 = usnic_vnic_get_bar(vnic, 0); + offset += scnprintf(buf + offset, buf_sz - offset, + "VF:%hu BAR0 bus_addr=%pa vaddr=0x%p size=%ld ", + usnic_vnic_get_index(vnic), + &bar0->bus_addr, + bar0->vaddr, bar0->len); + if (printtitle) + offset += printtitle(hdr_obj, buf + offset, buf_sz - offset); + offset += scnprintf(buf + offset, buf_sz - offset, "\n"); + offset += scnprintf(buf + offset, buf_sz - offset, + "|RES\t|CTRL_PIN\t\t|IN_USE\t"); + if (printcols) + offset += printcols(buf + offset, buf_sz - offset); + offset += scnprintf(buf + offset, buf_sz - offset, "\n"); + + spin_lock(&vnic->res_lock); + for (i = 0; i < ARRAY_SIZE(vnic->chunks); i++) { + chunk = &vnic->chunks[i]; + for (j = 0; j < chunk->cnt; j++) { + res = chunk->res[j]; + offset += scnprintf(buf + offset, buf_sz - offset, + "|%s[%u]\t|0x%p\t|%u\t", + usnic_vnic_res_type_to_str(res->type), + res->vnic_idx, res->ctrl, !!res->owner); + if (printrow) { + offset += printrow(res->owner, buf + offset, + buf_sz - offset); + } + offset += scnprintf(buf + offset, buf_sz - offset, + "\n"); + } + } + spin_unlock(&vnic->res_lock); + return offset; +} + +void usnic_vnic_res_spec_update(struct usnic_vnic_res_spec *spec, + enum usnic_vnic_res_type trgt_type, + u16 cnt) +{ + int i; + + for (i = 0; i < USNIC_VNIC_RES_TYPE_MAX; i++) { + if (spec->resources[i].type == trgt_type) { + spec->resources[i].cnt = cnt; + return; + } + } + + WARN_ON(1); +} + +int usnic_vnic_res_spec_satisfied(const struct usnic_vnic_res_spec *min_spec, + struct usnic_vnic_res_spec *res_spec) +{ + int found, i, j; + + for (i = 0; i < USNIC_VNIC_RES_TYPE_MAX; i++) { + found = 0; + + for (j = 0; j < USNIC_VNIC_RES_TYPE_MAX; j++) { + if (res_spec->resources[i].type != + min_spec->resources[i].type) + continue; + found = 1; + if (min_spec->resources[i].cnt > + res_spec->resources[i].cnt) + return -EINVAL; + break; + } + + if (!found) + return -EINVAL; + } + return 0; +} + +int usnic_vnic_spec_dump(char *buf, int buf_sz, + struct usnic_vnic_res_spec *res_spec) +{ + enum usnic_vnic_res_type res_type; + int res_cnt; + int i; + int offset = 0; + + for (i = 0; i < USNIC_VNIC_RES_TYPE_MAX; i++) { + res_type = res_spec->resources[i].type; + res_cnt = res_spec->resources[i].cnt; + offset += scnprintf(buf + offset, buf_sz - offset, + "Res: %s Cnt: %d ", + usnic_vnic_res_type_to_str(res_type), + res_cnt); + } + + return offset; +} + +int usnic_vnic_check_room(struct usnic_vnic *vnic, + struct usnic_vnic_res_spec *res_spec) +{ + int i; + enum usnic_vnic_res_type res_type; + int res_cnt; + + for (i = 0; i < USNIC_VNIC_RES_TYPE_MAX; i++) { + res_type = res_spec->resources[i].type; + res_cnt = res_spec->resources[i].cnt; + + if (res_type == USNIC_VNIC_RES_TYPE_EOL) + break; + + if (res_cnt > usnic_vnic_res_free_cnt(vnic, res_type)) + return -EBUSY; + } + + return 0; +} + +int usnic_vnic_res_cnt(struct usnic_vnic *vnic, + enum usnic_vnic_res_type type) +{ + return vnic->chunks[type].cnt; +} + +int usnic_vnic_res_free_cnt(struct usnic_vnic *vnic, + enum usnic_vnic_res_type type) +{ + return vnic->chunks[type].free_cnt; +} + +struct usnic_vnic_res_chunk * +usnic_vnic_get_resources(struct usnic_vnic *vnic, enum usnic_vnic_res_type type, + int cnt, void *owner) +{ + struct usnic_vnic_res_chunk *src, *ret; + struct usnic_vnic_res *res; + int i; + + if (usnic_vnic_res_free_cnt(vnic, type) < cnt || cnt < 1 || !owner) + return ERR_PTR(-EINVAL); + + ret = kzalloc(sizeof(*ret), GFP_ATOMIC); + if (!ret) { + usnic_err("Failed to allocate chunk for %s - Out of memory\n", + usnic_vnic_pci_name(vnic)); + return ERR_PTR(-ENOMEM); + } + + ret->res = kzalloc(sizeof(*(ret->res))*cnt, GFP_ATOMIC); + if (!ret->res) { + usnic_err("Failed to allocate resources for %s. Out of memory\n", + usnic_vnic_pci_name(vnic)); + kfree(ret); + return ERR_PTR(-ENOMEM); + } + + spin_lock(&vnic->res_lock); + src = &vnic->chunks[type]; + for (i = 0; i < src->cnt && ret->cnt < cnt; i++) { + res = src->res[i]; + if (!res->owner) { + src->free_cnt--; + res->owner = owner; + ret->res[ret->cnt++] = res; + } + } + + spin_unlock(&vnic->res_lock); + ret->type = type; + ret->vnic = vnic; + WARN_ON(ret->cnt != cnt); + + return ret; +} + +void usnic_vnic_put_resources(struct usnic_vnic_res_chunk *chunk) +{ + + struct usnic_vnic_res *res; + int i; + struct usnic_vnic *vnic = chunk->vnic; + + spin_lock(&vnic->res_lock); + while ((i = --chunk->cnt) >= 0) { + res = chunk->res[i]; + chunk->res[i] = NULL; + res->owner = NULL; + vnic->chunks[res->type].free_cnt++; + } + spin_unlock(&vnic->res_lock); + + kfree(chunk->res); + kfree(chunk); +} + +u16 usnic_vnic_get_index(struct usnic_vnic *vnic) +{ + return usnic_vnic_get_pdev(vnic)->devfn - 1; +} + +static int usnic_vnic_alloc_res_chunk(struct usnic_vnic *vnic, + enum usnic_vnic_res_type type, + struct usnic_vnic_res_chunk *chunk) +{ + int cnt, err, i; + struct usnic_vnic_res *res; + + cnt = vnic_dev_get_res_count(vnic->vdev, _to_vnic_res_type(type)); + if (cnt < 1) + return -EINVAL; + + chunk->cnt = chunk->free_cnt = cnt; + chunk->res = kzalloc(sizeof(*(chunk->res))*cnt, GFP_KERNEL); + if (!chunk->res) + return -ENOMEM; + + for (i = 0; i < cnt; i++) { + res = kzalloc(sizeof(*res), GFP_KERNEL); + if (!res) { + err = -ENOMEM; + goto fail; + } + res->type = type; + res->vnic_idx = i; + res->vnic = vnic; + res->ctrl = vnic_dev_get_res(vnic->vdev, + _to_vnic_res_type(type), i); + chunk->res[i] = res; + } + + chunk->vnic = vnic; + return 0; +fail: + for (i--; i >= 0; i--) + kfree(chunk->res[i]); + kfree(chunk->res); + return err; +} + +static void usnic_vnic_free_res_chunk(struct usnic_vnic_res_chunk *chunk) +{ + int i; + for (i = 0; i < chunk->cnt; i++) + kfree(chunk->res[i]); + kfree(chunk->res); +} + +static int usnic_vnic_discover_resources(struct pci_dev *pdev, + struct usnic_vnic *vnic) +{ + enum usnic_vnic_res_type res_type; + int i; + int err = 0; + + for (i = 0; i < ARRAY_SIZE(vnic->bar); i++) { + if (!(pci_resource_flags(pdev, i) & IORESOURCE_MEM)) + continue; + vnic->bar[i].len = pci_resource_len(pdev, i); + vnic->bar[i].vaddr = pci_iomap(pdev, i, vnic->bar[i].len); + if (!vnic->bar[i].vaddr) { + usnic_err("Cannot memory-map BAR %d, aborting\n", + i); + err = -ENODEV; + goto out_clean_bar; + } + vnic->bar[i].bus_addr = pci_resource_start(pdev, i); + } + + vnic->vdev = vnic_dev_register(NULL, pdev, pdev, vnic->bar, + ARRAY_SIZE(vnic->bar)); + if (!vnic->vdev) { + usnic_err("Failed to register device %s\n", + pci_name(pdev)); + err = -EINVAL; + goto out_clean_bar; + } + + for (res_type = USNIC_VNIC_RES_TYPE_EOL + 1; + res_type < USNIC_VNIC_RES_TYPE_MAX; res_type++) { + err = usnic_vnic_alloc_res_chunk(vnic, res_type, + &vnic->chunks[res_type]); + if (err) { + usnic_err("Failed to alloc res %s with err %d\n", + usnic_vnic_res_type_to_str(res_type), + err); + goto out_clean_chunks; + } + } + + return 0; + +out_clean_chunks: + for (res_type--; res_type > USNIC_VNIC_RES_TYPE_EOL; res_type--) + usnic_vnic_free_res_chunk(&vnic->chunks[res_type]); + vnic_dev_unregister(vnic->vdev); +out_clean_bar: + for (i = 0; i < ARRAY_SIZE(vnic->bar); i++) { + if (!(pci_resource_flags(pdev, i) & IORESOURCE_MEM)) + continue; + if (!vnic->bar[i].vaddr) + break; + + iounmap(vnic->bar[i].vaddr); + } + + return err; +} + +struct pci_dev *usnic_vnic_get_pdev(struct usnic_vnic *vnic) +{ + return vnic_dev_get_pdev(vnic->vdev); +} + +struct vnic_dev_bar *usnic_vnic_get_bar(struct usnic_vnic *vnic, + int bar_num) +{ + return (bar_num < ARRAY_SIZE(vnic->bar)) ? &vnic->bar[bar_num] : NULL; +} + +static void usnic_vnic_release_resources(struct usnic_vnic *vnic) +{ + int i; + struct pci_dev *pdev; + enum usnic_vnic_res_type res_type; + + pdev = usnic_vnic_get_pdev(vnic); + + for (res_type = USNIC_VNIC_RES_TYPE_EOL + 1; + res_type < USNIC_VNIC_RES_TYPE_MAX; res_type++) + usnic_vnic_free_res_chunk(&vnic->chunks[res_type]); + + vnic_dev_unregister(vnic->vdev); + + for (i = 0; i < ARRAY_SIZE(vnic->bar); i++) { + if (!(pci_resource_flags(pdev, i) & IORESOURCE_MEM)) + continue; + iounmap(vnic->bar[i].vaddr); + } +} + +struct usnic_vnic *usnic_vnic_alloc(struct pci_dev *pdev) +{ + struct usnic_vnic *vnic; + int err = 0; + + if (!pci_is_enabled(pdev)) { + usnic_err("PCI dev %s is disabled\n", pci_name(pdev)); + return ERR_PTR(-EINVAL); + } + + vnic = kzalloc(sizeof(*vnic), GFP_KERNEL); + if (!vnic) { + usnic_err("Failed to alloc vnic for %s - out of memory\n", + pci_name(pdev)); + return ERR_PTR(-ENOMEM); + } + + spin_lock_init(&vnic->res_lock); + + err = usnic_vnic_discover_resources(pdev, vnic); + if (err) { + usnic_err("Failed to discover %s resources with err %d\n", + pci_name(pdev), err); + goto out_free_vnic; + } + + usnic_dbg("Allocated vnic for %s\n", usnic_vnic_pci_name(vnic)); + + return vnic; + +out_free_vnic: + kfree(vnic); + + return ERR_PTR(err); +} + +void usnic_vnic_free(struct usnic_vnic *vnic) +{ + usnic_vnic_release_resources(vnic); + kfree(vnic); +} diff --git a/drivers/infiniband/hw/usnic/usnic_vnic.h b/drivers/infiniband/hw/usnic/usnic_vnic.h new file mode 100644 index 0000000..14d931a --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_vnic.h @@ -0,0 +1,103 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef USNIC_VNIC_H_ +#define USNIC_VNIC_H_ + +#include + +#include "vnic_dev.h" + +/* =USNIC_VNIC_RES_TYPE= =VNIC_RES= =DESC= */ +#define USNIC_VNIC_RES_TYPES \ + DEFINE_USNIC_VNIC_RES_AT(EOL, RES_TYPE_EOL, "EOL", 0) \ + DEFINE_USNIC_VNIC_RES(WQ, RES_TYPE_WQ, "WQ") \ + DEFINE_USNIC_VNIC_RES(RQ, RES_TYPE_RQ, "RQ") \ + DEFINE_USNIC_VNIC_RES(CQ, RES_TYPE_CQ, "CQ") \ + DEFINE_USNIC_VNIC_RES(INTR, RES_TYPE_INTR_CTRL, "INT") \ + DEFINE_USNIC_VNIC_RES(MAX, RES_TYPE_MAX, "MAX")\ + +#define DEFINE_USNIC_VNIC_RES_AT(usnic_vnic_res_t, vnic_res_type, desc, val) \ + USNIC_VNIC_RES_TYPE_##usnic_vnic_res_t = val, +#define DEFINE_USNIC_VNIC_RES(usnic_vnic_res_t, vnic_res_type, desc) \ + USNIC_VNIC_RES_TYPE_##usnic_vnic_res_t, +enum usnic_vnic_res_type { + USNIC_VNIC_RES_TYPES +}; +#undef DEFINE_USNIC_VNIC_RES +#undef DEFINE_USNIC_VNIC_RES_AT + +struct usnic_vnic_res { + enum usnic_vnic_res_type type; + unsigned int vnic_idx; + struct usnic_vnic *vnic; + void __iomem *ctrl; + void *owner; +}; + +struct usnic_vnic_res_chunk { + enum usnic_vnic_res_type type; + int cnt; + int free_cnt; + struct usnic_vnic_res **res; + struct usnic_vnic *vnic; +}; + +struct usnic_vnic_res_desc { + enum usnic_vnic_res_type type; + uint16_t cnt; +}; + +struct usnic_vnic_res_spec { + struct usnic_vnic_res_desc resources[USNIC_VNIC_RES_TYPE_MAX]; +}; + +const char *usnic_vnic_res_type_to_str(enum usnic_vnic_res_type res_type); +const char *usnic_vnic_pci_name(struct usnic_vnic *vnic); +int usnic_vnic_dump(struct usnic_vnic *vnic, char *buf, int buf_sz, + void *hdr_obj, + int (*printtitle)(void *, char*, int), + int (*printcols)(char *, int), + int (*printrow)(void *, char *, int)); +void usnic_vnic_res_spec_update(struct usnic_vnic_res_spec *spec, + enum usnic_vnic_res_type trgt_type, + u16 cnt); +int usnic_vnic_res_spec_satisfied(const struct usnic_vnic_res_spec *min_spec, + struct usnic_vnic_res_spec *res_spec); +int usnic_vnic_spec_dump(char *buf, int buf_sz, + struct usnic_vnic_res_spec *res_spec); +int usnic_vnic_check_room(struct usnic_vnic *vnic, + struct usnic_vnic_res_spec *res_spec); +int usnic_vnic_res_cnt(struct usnic_vnic *vnic, + enum usnic_vnic_res_type type); +int usnic_vnic_res_free_cnt(struct usnic_vnic *vnic, + enum usnic_vnic_res_type type); +struct usnic_vnic_res_chunk * +usnic_vnic_get_resources(struct usnic_vnic *vnic, + enum usnic_vnic_res_type type, + int cnt, + void *owner); +void usnic_vnic_put_resources(struct usnic_vnic_res_chunk *chunk); +struct pci_dev *usnic_vnic_get_pdev(struct usnic_vnic *vnic); +struct vnic_dev_bar *usnic_vnic_get_bar(struct usnic_vnic *vnic, + int bar_num); +struct usnic_vnic *usnic_vnic_alloc(struct pci_dev *pdev); +void usnic_vnic_free(struct usnic_vnic *vnic); +u16 usnic_vnic_get_index(struct usnic_vnic *vnic); + +#endif /*!USNIC_VNIC_H_*/ diff --git a/drivers/infiniband/ulp/Makefile b/drivers/infiniband/ulp/Makefile new file mode 100644 index 0000000..f3c7dcf --- /dev/null +++ b/drivers/infiniband/ulp/Makefile @@ -0,0 +1,5 @@ +obj-$(CONFIG_INFINIBAND_IPOIB) += ipoib/ +obj-$(CONFIG_INFINIBAND_SRP) += srp/ +obj-$(CONFIG_INFINIBAND_SRPT) += srpt/ +obj-$(CONFIG_INFINIBAND_ISER) += iser/ +obj-$(CONFIG_INFINIBAND_ISERT) += isert/ diff --git a/drivers/infiniband/ulp/ipoib/Kconfig b/drivers/infiniband/ulp/ipoib/Kconfig new file mode 100644 index 0000000..cda8eac --- /dev/null +++ b/drivers/infiniband/ulp/ipoib/Kconfig @@ -0,0 +1,49 @@ +config INFINIBAND_IPOIB + tristate "IP-over-InfiniBand" + depends on NETDEVICES && INET && (IPV6 || IPV6=n) + ---help--- + Support for the IP-over-InfiniBand protocol (IPoIB). This + transports IP packets over InfiniBand so you can use your IB + device as a fancy NIC. + + See Documentation/infiniband/ipoib.txt for more information + +config INFINIBAND_IPOIB_CM + bool "IP-over-InfiniBand Connected Mode support" + depends on INFINIBAND_IPOIB + default n + ---help--- + This option enables support for IPoIB connected mode. After + enabling this option, you need to switch to connected mode + through /sys/class/net/ibXXX/mode to actually create + connections, and then increase the interface MTU with + e.g. ifconfig ib0 mtu 65520. + + WARNING: Enabling connected mode will trigger some packet + drops for multicast and UD mode traffic from this interface, + unless you limit mtu for these destinations to 2044. + +config INFINIBAND_IPOIB_DEBUG + bool "IP-over-InfiniBand debugging" if EXPERT + depends on INFINIBAND_IPOIB + default y + ---help--- + This option causes debugging code to be compiled into the + IPoIB driver. The output can be turned on via the + debug_level and mcast_debug_level module parameters (which + can also be set after the driver is loaded through sysfs). + + This option also creates a directory tree under ipoib/ in + debugfs, which contains files that expose debugging + information about IB multicast groups used by the IPoIB + driver. + +config INFINIBAND_IPOIB_DEBUG_DATA + bool "IP-over-InfiniBand data path debugging" + depends on INFINIBAND_IPOIB_DEBUG + ---help--- + This option compiles debugging code into the data path + of the IPoIB driver. The output can be turned on via the + data_debug_level module parameter; however, even with output + turned off, this debugging code will have some performance + impact. diff --git a/drivers/infiniband/ulp/ipoib/Makefile b/drivers/infiniband/ulp/ipoib/Makefile new file mode 100644 index 0000000..e5430dd --- /dev/null +++ b/drivers/infiniband/ulp/ipoib/Makefile @@ -0,0 +1,12 @@ +obj-$(CONFIG_INFINIBAND_IPOIB) += ib_ipoib.o + +ib_ipoib-y := ipoib_main.o \ + ipoib_ib.o \ + ipoib_multicast.o \ + ipoib_verbs.o \ + ipoib_vlan.o \ + ipoib_ethtool.o \ + ipoib_netlink.o +ib_ipoib-$(CONFIG_INFINIBAND_IPOIB_CM) += ipoib_cm.o +ib_ipoib-$(CONFIG_INFINIBAND_IPOIB_DEBUG) += ipoib_fs.o + diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h new file mode 100644 index 0000000..d7562be --- /dev/null +++ b/drivers/infiniband/ulp/ipoib/ipoib.h @@ -0,0 +1,773 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2004 Voltaire, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _IPOIB_H +#define _IPOIB_H + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +#include +#include +#include +#include + +/* constants */ + +enum ipoib_flush_level { + IPOIB_FLUSH_LIGHT, + IPOIB_FLUSH_NORMAL, + IPOIB_FLUSH_HEAVY +}; + +enum { + IPOIB_ENCAP_LEN = 4, + + IPOIB_UD_HEAD_SIZE = IB_GRH_BYTES + IPOIB_ENCAP_LEN, + IPOIB_UD_RX_SG = 2, /* max buffer needed for 4K mtu */ + + IPOIB_CM_MTU = 0x10000 - 0x10, /* padding to align header to 16 */ + IPOIB_CM_BUF_SIZE = IPOIB_CM_MTU + IPOIB_ENCAP_LEN, + IPOIB_CM_HEAD_SIZE = IPOIB_CM_BUF_SIZE % PAGE_SIZE, + IPOIB_CM_RX_SG = ALIGN(IPOIB_CM_BUF_SIZE, PAGE_SIZE) / PAGE_SIZE, + IPOIB_RX_RING_SIZE = 256, + IPOIB_TX_RING_SIZE = 128, + IPOIB_MAX_QUEUE_SIZE = 8192, + IPOIB_MIN_QUEUE_SIZE = 2, + IPOIB_CM_MAX_CONN_QP = 4096, + + IPOIB_NUM_WC = 4, + + IPOIB_MAX_PATH_REC_QUEUE = 3, + IPOIB_MAX_MCAST_QUEUE = 3, + + IPOIB_FLAG_OPER_UP = 0, + IPOIB_FLAG_INITIALIZED = 1, + IPOIB_FLAG_ADMIN_UP = 2, + IPOIB_PKEY_ASSIGNED = 3, + IPOIB_FLAG_SUBINTERFACE = 5, + IPOIB_MCAST_RUN = 6, + IPOIB_STOP_REAPER = 7, + IPOIB_FLAG_ADMIN_CM = 9, + IPOIB_FLAG_UMCAST = 10, + IPOIB_STOP_NEIGH_GC = 11, + IPOIB_NEIGH_TBL_FLUSH = 12, + + IPOIB_MAX_BACKOFF_SECONDS = 16, + + IPOIB_MCAST_FLAG_FOUND = 0, /* used in set_multicast_list */ + IPOIB_MCAST_FLAG_SENDONLY = 1, + IPOIB_MCAST_FLAG_BUSY = 2, /* joining or already joined */ + IPOIB_MCAST_FLAG_ATTACHED = 3, + IPOIB_MCAST_JOIN_STARTED = 4, + + MAX_SEND_CQE = 16, + IPOIB_CM_COPYBREAK = 256, + + IPOIB_NON_CHILD = 0, + IPOIB_LEGACY_CHILD = 1, + IPOIB_RTNL_CHILD = 2, +}; + +#define IPOIB_OP_RECV (1ul << 31) +#ifdef CONFIG_INFINIBAND_IPOIB_CM +#define IPOIB_OP_CM (1ul << 30) +#else +#define IPOIB_OP_CM (0) +#endif + +#define IPOIB_QPN_MASK ((__force u32) cpu_to_be32(0xFFFFFF)) + +/* structs */ + +struct ipoib_header { + __be16 proto; + u16 reserved; +}; + +struct ipoib_cb { + struct qdisc_skb_cb qdisc_cb; + u8 hwaddr[INFINIBAND_ALEN]; +}; + +static inline struct ipoib_cb *ipoib_skb_cb(const struct sk_buff *skb) +{ + BUILD_BUG_ON(sizeof(skb->cb) < sizeof(struct ipoib_cb)); + return (struct ipoib_cb *)skb->cb; +} + +/* Used for all multicast joins (broadcast, IPv4 mcast and IPv6 mcast) */ +struct ipoib_mcast { + struct ib_sa_mcmember_rec mcmember; + struct ib_sa_multicast *mc; + struct ipoib_ah *ah; + + struct rb_node rb_node; + struct list_head list; + + unsigned long created; + unsigned long backoff; + + unsigned long flags; + unsigned char logcount; + + struct list_head neigh_list; + + struct sk_buff_head pkt_queue; + + struct net_device *dev; + struct completion done; +}; + +struct ipoib_rx_buf { + struct sk_buff *skb; + u64 mapping[IPOIB_UD_RX_SG]; +}; + +struct ipoib_tx_buf { + struct sk_buff *skb; + u64 mapping[MAX_SKB_FRAGS + 1]; +}; + +struct ipoib_cm_tx_buf { + struct sk_buff *skb; + u64 mapping; +}; + +struct ib_cm_id; + +struct ipoib_cm_data { + __be32 qpn; /* High byte MUST be ignored on receive */ + __be32 mtu; +}; + +/* + * Quoting 10.3.1 Queue Pair and EE Context States: + * + * Note, for QPs that are associated with an SRQ, the Consumer should take the + * QP through the Error State before invoking a Destroy QP or a Modify QP to the + * Reset State. The Consumer may invoke the Destroy QP without first performing + * a Modify QP to the Error State and waiting for the Affiliated Asynchronous + * Last WQE Reached Event. However, if the Consumer does not wait for the + * Affiliated Asynchronous Last WQE Reached Event, then WQE and Data Segment + * leakage may occur. Therefore, it is good programming practice to tear down a + * QP that is associated with an SRQ by using the following process: + * + * - Put the QP in the Error State + * - Wait for the Affiliated Asynchronous Last WQE Reached Event; + * - either: + * drain the CQ by invoking the Poll CQ verb and either wait for CQ + * to be empty or the number of Poll CQ operations has exceeded + * CQ capacity size; + * - or + * post another WR that completes on the same CQ and wait for this + * WR to return as a WC; + * - and then invoke a Destroy QP or Reset QP. + * + * We use the second option and wait for a completion on the + * same CQ before destroying QPs attached to our SRQ. + */ + +enum ipoib_cm_state { + IPOIB_CM_RX_LIVE, + IPOIB_CM_RX_ERROR, /* Ignored by stale task */ + IPOIB_CM_RX_FLUSH /* Last WQE Reached event observed */ +}; + +struct ipoib_cm_rx { + struct ib_cm_id *id; + struct ib_qp *qp; + struct ipoib_cm_rx_buf *rx_ring; + struct list_head list; + struct net_device *dev; + unsigned long jiffies; + enum ipoib_cm_state state; + int recv_count; +}; + +struct ipoib_cm_tx { + struct ib_cm_id *id; + struct ib_qp *qp; + struct list_head list; + struct net_device *dev; + struct ipoib_neigh *neigh; + struct ipoib_path *path; + struct ipoib_cm_tx_buf *tx_ring; + unsigned tx_head; + unsigned tx_tail; + unsigned long flags; + u32 mtu; +}; + +struct ipoib_cm_rx_buf { + struct sk_buff *skb; + u64 mapping[IPOIB_CM_RX_SG]; +}; + +struct ipoib_cm_dev_priv { + struct ib_srq *srq; + struct ipoib_cm_rx_buf *srq_ring; + struct ib_cm_id *id; + struct list_head passive_ids; /* state: LIVE */ + struct list_head rx_error_list; /* state: ERROR */ + struct list_head rx_flush_list; /* state: FLUSH, drain not started */ + struct list_head rx_drain_list; /* state: FLUSH, drain started */ + struct list_head rx_reap_list; /* state: FLUSH, drain done */ + struct work_struct start_task; + struct work_struct reap_task; + struct work_struct skb_task; + struct work_struct rx_reap_task; + struct delayed_work stale_task; + struct sk_buff_head skb_queue; + struct list_head start_list; + struct list_head reap_list; + struct ib_wc ibwc[IPOIB_NUM_WC]; + struct ib_sge rx_sge[IPOIB_CM_RX_SG]; + struct ib_recv_wr rx_wr; + int nonsrq_conn_qp; + int max_cm_mtu; + int num_frags; +}; + +struct ipoib_ethtool_st { + u16 coalesce_usecs; + u16 max_coalesced_frames; +}; + +struct ipoib_neigh_table; + +struct ipoib_neigh_hash { + struct ipoib_neigh_table *ntbl; + struct ipoib_neigh __rcu **buckets; + struct rcu_head rcu; + u32 mask; + u32 size; +}; + +struct ipoib_neigh_table { + struct ipoib_neigh_hash __rcu *htbl; + atomic_t entries; + struct completion flushed; + struct completion deleted; +}; + +/* + * Device private locking: network stack tx_lock protects members used + * in TX fast path, lock protects everything else. lock nests inside + * of tx_lock (ie tx_lock must be acquired first if needed). + */ +struct ipoib_dev_priv { + spinlock_t lock; + + struct net_device *dev; + + struct napi_struct napi; + + unsigned long flags; + + struct rw_semaphore vlan_rwsem; + + struct rb_root path_tree; + struct list_head path_list; + + struct ipoib_neigh_table ntbl; + + struct ipoib_mcast *broadcast; + struct list_head multicast_list; + struct rb_root multicast_tree; + + struct delayed_work mcast_task; + struct work_struct carrier_on_task; + struct work_struct flush_light; + struct work_struct flush_normal; + struct work_struct flush_heavy; + struct work_struct restart_task; + struct delayed_work ah_reap_task; + struct delayed_work neigh_reap_task; + struct ib_device *ca; + u8 port; + u16 pkey; + u16 pkey_index; + struct ib_pd *pd; + struct ib_mr *mr; + struct ib_cq *recv_cq; + struct ib_cq *send_cq; + struct ib_qp *qp; + u32 qkey; + + union ib_gid local_gid; + u16 local_lid; + + unsigned int admin_mtu; + unsigned int mcast_mtu; + unsigned int max_ib_mtu; + + struct ipoib_rx_buf *rx_ring; + + struct ipoib_tx_buf *tx_ring; + unsigned tx_head; + unsigned tx_tail; + struct ib_sge tx_sge[MAX_SKB_FRAGS + 1]; + struct ib_send_wr tx_wr; + unsigned tx_outstanding; + struct ib_wc send_wc[MAX_SEND_CQE]; + + struct ib_recv_wr rx_wr; + struct ib_sge rx_sge[IPOIB_UD_RX_SG]; + + struct ib_wc ibwc[IPOIB_NUM_WC]; + + struct list_head dead_ahs; + + struct ib_event_handler event_handler; + + struct net_device *parent; + struct list_head child_intfs; + struct list_head list; + int child_type; + +#ifdef CONFIG_INFINIBAND_IPOIB_CM + struct ipoib_cm_dev_priv cm; +#endif + +#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG + struct list_head fs_list; + struct dentry *mcg_dentry; + struct dentry *path_dentry; +#endif + int hca_caps; + struct ipoib_ethtool_st ethtool; + struct timer_list poll_timer; +}; + +struct ipoib_ah { + struct net_device *dev; + struct ib_ah *ah; + struct list_head list; + struct kref ref; + unsigned last_send; +}; + +struct ipoib_path { + struct net_device *dev; + struct ib_sa_path_rec pathrec; + struct ipoib_ah *ah; + struct sk_buff_head queue; + + struct list_head neigh_list; + + int query_id; + struct ib_sa_query *query; + struct completion done; + + struct rb_node rb_node; + struct list_head list; + int valid; +}; + +struct ipoib_neigh { + struct ipoib_ah *ah; +#ifdef CONFIG_INFINIBAND_IPOIB_CM + struct ipoib_cm_tx *cm; +#endif + u8 daddr[INFINIBAND_ALEN]; + struct sk_buff_head queue; + + struct net_device *dev; + + struct list_head list; + struct ipoib_neigh __rcu *hnext; + struct rcu_head rcu; + atomic_t refcnt; + unsigned long alive; +}; + +#define IPOIB_UD_MTU(ib_mtu) (ib_mtu - IPOIB_ENCAP_LEN) +#define IPOIB_UD_BUF_SIZE(ib_mtu) (ib_mtu + IB_GRH_BYTES) + +static inline int ipoib_ud_need_sg(unsigned int ib_mtu) +{ + return IPOIB_UD_BUF_SIZE(ib_mtu) > PAGE_SIZE; +} + +void ipoib_neigh_dtor(struct ipoib_neigh *neigh); +static inline void ipoib_neigh_put(struct ipoib_neigh *neigh) +{ + if (atomic_dec_and_test(&neigh->refcnt)) + ipoib_neigh_dtor(neigh); +} +struct ipoib_neigh *ipoib_neigh_get(struct net_device *dev, u8 *daddr); +struct ipoib_neigh *ipoib_neigh_alloc(u8 *daddr, + struct net_device *dev); +void ipoib_neigh_free(struct ipoib_neigh *neigh); +void ipoib_del_neighs_by_gid(struct net_device *dev, u8 *gid); + +extern struct workqueue_struct *ipoib_workqueue; + +/* functions */ + +int ipoib_poll(struct napi_struct *napi, int budget); +void ipoib_ib_completion(struct ib_cq *cq, void *dev_ptr); +void ipoib_send_comp_handler(struct ib_cq *cq, void *dev_ptr); + +struct ipoib_ah *ipoib_create_ah(struct net_device *dev, + struct ib_pd *pd, struct ib_ah_attr *attr); +void ipoib_free_ah(struct kref *kref); +static inline void ipoib_put_ah(struct ipoib_ah *ah) +{ + kref_put(&ah->ref, ipoib_free_ah); +} +int ipoib_open(struct net_device *dev); +int ipoib_add_pkey_attr(struct net_device *dev); +int ipoib_add_umcast_attr(struct net_device *dev); + +void ipoib_send(struct net_device *dev, struct sk_buff *skb, + struct ipoib_ah *address, u32 qpn); +void ipoib_reap_ah(struct work_struct *work); + +void ipoib_mark_paths_invalid(struct net_device *dev); +void ipoib_flush_paths(struct net_device *dev); +struct ipoib_dev_priv *ipoib_intf_alloc(const char *format); + +int ipoib_ib_dev_init(struct net_device *dev, struct ib_device *ca, int port); +void ipoib_ib_dev_flush_light(struct work_struct *work); +void ipoib_ib_dev_flush_normal(struct work_struct *work); +void ipoib_ib_dev_flush_heavy(struct work_struct *work); +void ipoib_pkey_event(struct work_struct *work); +void ipoib_ib_dev_cleanup(struct net_device *dev); + +int ipoib_ib_dev_open(struct net_device *dev, int flush); +int ipoib_ib_dev_up(struct net_device *dev); +int ipoib_ib_dev_down(struct net_device *dev, int flush); +int ipoib_ib_dev_stop(struct net_device *dev, int flush); +void ipoib_pkey_dev_check_presence(struct net_device *dev); + +int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port); +void ipoib_dev_cleanup(struct net_device *dev); + +void ipoib_mcast_join_task(struct work_struct *work); +void ipoib_mcast_carrier_on_task(struct work_struct *work); +void ipoib_mcast_send(struct net_device *dev, u8 *daddr, struct sk_buff *skb); + +void ipoib_mcast_restart_task(struct work_struct *work); +int ipoib_mcast_start_thread(struct net_device *dev); +int ipoib_mcast_stop_thread(struct net_device *dev, int flush); + +void ipoib_mcast_dev_down(struct net_device *dev); +void ipoib_mcast_dev_flush(struct net_device *dev); + +#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG +struct ipoib_mcast_iter *ipoib_mcast_iter_init(struct net_device *dev); +int ipoib_mcast_iter_next(struct ipoib_mcast_iter *iter); +void ipoib_mcast_iter_read(struct ipoib_mcast_iter *iter, + union ib_gid *gid, + unsigned long *created, + unsigned int *queuelen, + unsigned int *complete, + unsigned int *send_only); + +struct ipoib_path_iter *ipoib_path_iter_init(struct net_device *dev); +int ipoib_path_iter_next(struct ipoib_path_iter *iter); +void ipoib_path_iter_read(struct ipoib_path_iter *iter, + struct ipoib_path *path); +#endif + +int ipoib_mcast_attach(struct net_device *dev, u16 mlid, + union ib_gid *mgid, int set_qkey); + +int ipoib_init_qp(struct net_device *dev); +int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca); +void ipoib_transport_dev_cleanup(struct net_device *dev); + +void ipoib_event(struct ib_event_handler *handler, + struct ib_event *record); + +int ipoib_vlan_add(struct net_device *pdev, unsigned short pkey); +int ipoib_vlan_delete(struct net_device *pdev, unsigned short pkey); + +int __ipoib_vlan_add(struct ipoib_dev_priv *ppriv, struct ipoib_dev_priv *priv, + u16 pkey, int child_type); + +int __init ipoib_netlink_init(void); +void __exit ipoib_netlink_fini(void); + +void ipoib_set_umcast(struct net_device *ndev, int umcast_val); +int ipoib_set_mode(struct net_device *dev, const char *buf); + +void ipoib_setup(struct net_device *dev); + +void ipoib_pkey_open(struct ipoib_dev_priv *priv); +void ipoib_drain_cq(struct net_device *dev); + +void ipoib_set_ethtool_ops(struct net_device *dev); +int ipoib_set_dev_features(struct ipoib_dev_priv *priv, struct ib_device *hca); + +#define IPOIB_FLAGS_RC 0x80 +#define IPOIB_FLAGS_UC 0x40 + +/* We don't support UC connections at the moment */ +#define IPOIB_CM_SUPPORTED(ha) (ha[0] & (IPOIB_FLAGS_RC)) + +#ifdef CONFIG_INFINIBAND_IPOIB_CM + +extern int ipoib_max_conn_qp; + +static inline int ipoib_cm_admin_enabled(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + return IPOIB_CM_SUPPORTED(dev->dev_addr) && + test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags); +} + +static inline int ipoib_cm_enabled(struct net_device *dev, u8 *hwaddr) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + return IPOIB_CM_SUPPORTED(hwaddr) && + test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags); +} + +static inline int ipoib_cm_up(struct ipoib_neigh *neigh) + +{ + return test_bit(IPOIB_FLAG_OPER_UP, &neigh->cm->flags); +} + +static inline struct ipoib_cm_tx *ipoib_cm_get(struct ipoib_neigh *neigh) +{ + return neigh->cm; +} + +static inline void ipoib_cm_set(struct ipoib_neigh *neigh, struct ipoib_cm_tx *tx) +{ + neigh->cm = tx; +} + +static inline int ipoib_cm_has_srq(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + return !!priv->cm.srq; +} + +static inline unsigned int ipoib_cm_max_mtu(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + return priv->cm.max_cm_mtu; +} + +void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_tx *tx); +int ipoib_cm_dev_open(struct net_device *dev); +void ipoib_cm_dev_stop(struct net_device *dev); +int ipoib_cm_dev_init(struct net_device *dev); +int ipoib_cm_add_mode_attr(struct net_device *dev); +void ipoib_cm_dev_cleanup(struct net_device *dev); +struct ipoib_cm_tx *ipoib_cm_create_tx(struct net_device *dev, struct ipoib_path *path, + struct ipoib_neigh *neigh); +void ipoib_cm_destroy_tx(struct ipoib_cm_tx *tx); +void ipoib_cm_skb_too_long(struct net_device *dev, struct sk_buff *skb, + unsigned int mtu); +void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc); +void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc); +#else + +struct ipoib_cm_tx; + +#define ipoib_max_conn_qp 0 + +static inline int ipoib_cm_admin_enabled(struct net_device *dev) +{ + return 0; +} +static inline int ipoib_cm_enabled(struct net_device *dev, u8 *hwaddr) + +{ + return 0; +} + +static inline int ipoib_cm_up(struct ipoib_neigh *neigh) + +{ + return 0; +} + +static inline struct ipoib_cm_tx *ipoib_cm_get(struct ipoib_neigh *neigh) +{ + return NULL; +} + +static inline void ipoib_cm_set(struct ipoib_neigh *neigh, struct ipoib_cm_tx *tx) +{ +} + +static inline int ipoib_cm_has_srq(struct net_device *dev) +{ + return 0; +} + +static inline unsigned int ipoib_cm_max_mtu(struct net_device *dev) +{ + return 0; +} + +static inline +void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_tx *tx) +{ + return; +} + +static inline +int ipoib_cm_dev_open(struct net_device *dev) +{ + return 0; +} + +static inline +void ipoib_cm_dev_stop(struct net_device *dev) +{ + return; +} + +static inline +int ipoib_cm_dev_init(struct net_device *dev) +{ + return -ENOSYS; +} + +static inline +void ipoib_cm_dev_cleanup(struct net_device *dev) +{ + return; +} + +static inline +struct ipoib_cm_tx *ipoib_cm_create_tx(struct net_device *dev, struct ipoib_path *path, + struct ipoib_neigh *neigh) +{ + return NULL; +} + +static inline +void ipoib_cm_destroy_tx(struct ipoib_cm_tx *tx) +{ + return; +} + +static inline +int ipoib_cm_add_mode_attr(struct net_device *dev) +{ + return 0; +} + +static inline void ipoib_cm_skb_too_long(struct net_device *dev, struct sk_buff *skb, + unsigned int mtu) +{ + dev_kfree_skb_any(skb); +} + +static inline void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc) +{ +} + +static inline void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc) +{ +} +#endif + +#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG +void ipoib_create_debug_files(struct net_device *dev); +void ipoib_delete_debug_files(struct net_device *dev); +int ipoib_register_debugfs(void); +void ipoib_unregister_debugfs(void); +#else +static inline void ipoib_create_debug_files(struct net_device *dev) { } +static inline void ipoib_delete_debug_files(struct net_device *dev) { } +static inline int ipoib_register_debugfs(void) { return 0; } +static inline void ipoib_unregister_debugfs(void) { } +#endif + +#define ipoib_printk(level, priv, format, arg...) \ + printk(level "%s: " format, ((struct ipoib_dev_priv *) priv)->dev->name , ## arg) +#define ipoib_warn(priv, format, arg...) \ + ipoib_printk(KERN_WARNING, priv, format , ## arg) + +extern int ipoib_sendq_size; +extern int ipoib_recvq_size; + +extern struct ib_sa_client ipoib_sa_client; + +#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG +extern int ipoib_debug_level; + +#define ipoib_dbg(priv, format, arg...) \ + do { \ + if (ipoib_debug_level > 0) \ + ipoib_printk(KERN_DEBUG, priv, format , ## arg); \ + } while (0) +#define ipoib_dbg_mcast(priv, format, arg...) \ + do { \ + if (mcast_debug_level > 0) \ + ipoib_printk(KERN_DEBUG, priv, format , ## arg); \ + } while (0) +#else /* CONFIG_INFINIBAND_IPOIB_DEBUG */ +#define ipoib_dbg(priv, format, arg...) \ + do { (void) (priv); } while (0) +#define ipoib_dbg_mcast(priv, format, arg...) \ + do { (void) (priv); } while (0) +#endif /* CONFIG_INFINIBAND_IPOIB_DEBUG */ + +#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG_DATA +#define ipoib_dbg_data(priv, format, arg...) \ + do { \ + if (data_debug_level > 0) \ + ipoib_printk(KERN_DEBUG, priv, format , ## arg); \ + } while (0) +#else /* CONFIG_INFINIBAND_IPOIB_DEBUG_DATA */ +#define ipoib_dbg_data(priv, format, arg...) \ + do { (void) (priv); } while (0) +#endif /* CONFIG_INFINIBAND_IPOIB_DEBUG_DATA */ + +#define IPOIB_QPN(ha) (be32_to_cpup((__be32 *) ha) & 0xffffff) + +extern const char ipoib_driver_version[]; + +#endif /* _IPOIB_H */ diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c new file mode 100644 index 0000000..933efce --- /dev/null +++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c @@ -0,0 +1,1615 @@ +/* + * Copyright (c) 2006 Mellanox Technologies. All rights reserved + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ipoib.h" + +int ipoib_max_conn_qp = 128; + +module_param_named(max_nonsrq_conn_qp, ipoib_max_conn_qp, int, 0444); +MODULE_PARM_DESC(max_nonsrq_conn_qp, + "Max number of connected-mode QPs per interface " + "(applied only if shared receive queue is not available)"); + +#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG_DATA +static int data_debug_level; + +module_param_named(cm_data_debug_level, data_debug_level, int, 0644); +MODULE_PARM_DESC(cm_data_debug_level, + "Enable data path debug tracing for connected mode if > 0"); +#endif + +#define IPOIB_CM_IETF_ID 0x1000000000000000ULL + +#define IPOIB_CM_RX_UPDATE_TIME (256 * HZ) +#define IPOIB_CM_RX_TIMEOUT (2 * 256 * HZ) +#define IPOIB_CM_RX_DELAY (3 * 256 * HZ) +#define IPOIB_CM_RX_UPDATE_MASK (0x3) + +static struct ib_qp_attr ipoib_cm_err_attr = { + .qp_state = IB_QPS_ERR +}; + +#define IPOIB_CM_RX_DRAIN_WRID 0xffffffff + +static struct ib_send_wr ipoib_cm_rx_drain_wr = { + .wr_id = IPOIB_CM_RX_DRAIN_WRID, + .opcode = IB_WR_SEND, +}; + +static int ipoib_cm_tx_handler(struct ib_cm_id *cm_id, + struct ib_cm_event *event); + +static void ipoib_cm_dma_unmap_rx(struct ipoib_dev_priv *priv, int frags, + u64 mapping[IPOIB_CM_RX_SG]) +{ + int i; + + ib_dma_unmap_single(priv->ca, mapping[0], IPOIB_CM_HEAD_SIZE, DMA_FROM_DEVICE); + + for (i = 0; i < frags; ++i) + ib_dma_unmap_page(priv->ca, mapping[i + 1], PAGE_SIZE, DMA_FROM_DEVICE); +} + +static int ipoib_cm_post_receive_srq(struct net_device *dev, int id) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ib_recv_wr *bad_wr; + int i, ret; + + priv->cm.rx_wr.wr_id = id | IPOIB_OP_CM | IPOIB_OP_RECV; + + for (i = 0; i < priv->cm.num_frags; ++i) + priv->cm.rx_sge[i].addr = priv->cm.srq_ring[id].mapping[i]; + + ret = ib_post_srq_recv(priv->cm.srq, &priv->cm.rx_wr, &bad_wr); + if (unlikely(ret)) { + ipoib_warn(priv, "post srq failed for buf %d (%d)\n", id, ret); + ipoib_cm_dma_unmap_rx(priv, priv->cm.num_frags - 1, + priv->cm.srq_ring[id].mapping); + dev_kfree_skb_any(priv->cm.srq_ring[id].skb); + priv->cm.srq_ring[id].skb = NULL; + } + + return ret; +} + +static int ipoib_cm_post_receive_nonsrq(struct net_device *dev, + struct ipoib_cm_rx *rx, + struct ib_recv_wr *wr, + struct ib_sge *sge, int id) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ib_recv_wr *bad_wr; + int i, ret; + + wr->wr_id = id | IPOIB_OP_CM | IPOIB_OP_RECV; + + for (i = 0; i < IPOIB_CM_RX_SG; ++i) + sge[i].addr = rx->rx_ring[id].mapping[i]; + + ret = ib_post_recv(rx->qp, wr, &bad_wr); + if (unlikely(ret)) { + ipoib_warn(priv, "post recv failed for buf %d (%d)\n", id, ret); + ipoib_cm_dma_unmap_rx(priv, IPOIB_CM_RX_SG - 1, + rx->rx_ring[id].mapping); + dev_kfree_skb_any(rx->rx_ring[id].skb); + rx->rx_ring[id].skb = NULL; + } + + return ret; +} + +static struct sk_buff *ipoib_cm_alloc_rx_skb(struct net_device *dev, + struct ipoib_cm_rx_buf *rx_ring, + int id, int frags, + u64 mapping[IPOIB_CM_RX_SG], + gfp_t gfp) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct sk_buff *skb; + int i; + + skb = dev_alloc_skb(IPOIB_CM_HEAD_SIZE + 12); + if (unlikely(!skb)) + return NULL; + + /* + * IPoIB adds a 4 byte header. So we need 12 more bytes to align the + * IP header to a multiple of 16. + */ + skb_reserve(skb, 12); + + mapping[0] = ib_dma_map_single(priv->ca, skb->data, IPOIB_CM_HEAD_SIZE, + DMA_FROM_DEVICE); + if (unlikely(ib_dma_mapping_error(priv->ca, mapping[0]))) { + dev_kfree_skb_any(skb); + return NULL; + } + + for (i = 0; i < frags; i++) { + struct page *page = alloc_page(gfp); + + if (!page) + goto partial_error; + skb_fill_page_desc(skb, i, page, 0, PAGE_SIZE); + + mapping[i + 1] = ib_dma_map_page(priv->ca, page, + 0, PAGE_SIZE, DMA_FROM_DEVICE); + if (unlikely(ib_dma_mapping_error(priv->ca, mapping[i + 1]))) + goto partial_error; + } + + rx_ring[id].skb = skb; + return skb; + +partial_error: + + ib_dma_unmap_single(priv->ca, mapping[0], IPOIB_CM_HEAD_SIZE, DMA_FROM_DEVICE); + + for (; i > 0; --i) + ib_dma_unmap_page(priv->ca, mapping[i], PAGE_SIZE, DMA_FROM_DEVICE); + + dev_kfree_skb_any(skb); + return NULL; +} + +static void ipoib_cm_free_rx_ring(struct net_device *dev, + struct ipoib_cm_rx_buf *rx_ring) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + int i; + + for (i = 0; i < ipoib_recvq_size; ++i) + if (rx_ring[i].skb) { + ipoib_cm_dma_unmap_rx(priv, IPOIB_CM_RX_SG - 1, + rx_ring[i].mapping); + dev_kfree_skb_any(rx_ring[i].skb); + } + + vfree(rx_ring); +} + +static void ipoib_cm_start_rx_drain(struct ipoib_dev_priv *priv) +{ + struct ib_send_wr *bad_wr; + struct ipoib_cm_rx *p; + + /* We only reserved 1 extra slot in CQ for drain WRs, so + * make sure we have at most 1 outstanding WR. */ + if (list_empty(&priv->cm.rx_flush_list) || + !list_empty(&priv->cm.rx_drain_list)) + return; + + /* + * QPs on flush list are error state. This way, a "flush + * error" WC will be immediately generated for each WR we post. + */ + p = list_entry(priv->cm.rx_flush_list.next, typeof(*p), list); + if (ib_post_send(p->qp, &ipoib_cm_rx_drain_wr, &bad_wr)) + ipoib_warn(priv, "failed to post drain wr\n"); + + list_splice_init(&priv->cm.rx_flush_list, &priv->cm.rx_drain_list); +} + +static void ipoib_cm_rx_event_handler(struct ib_event *event, void *ctx) +{ + struct ipoib_cm_rx *p = ctx; + struct ipoib_dev_priv *priv = netdev_priv(p->dev); + unsigned long flags; + + if (event->event != IB_EVENT_QP_LAST_WQE_REACHED) + return; + + spin_lock_irqsave(&priv->lock, flags); + list_move(&p->list, &priv->cm.rx_flush_list); + p->state = IPOIB_CM_RX_FLUSH; + ipoib_cm_start_rx_drain(priv); + spin_unlock_irqrestore(&priv->lock, flags); +} + +static struct ib_qp *ipoib_cm_create_rx_qp(struct net_device *dev, + struct ipoib_cm_rx *p) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ib_qp_init_attr attr = { + .event_handler = ipoib_cm_rx_event_handler, + .send_cq = priv->recv_cq, /* For drain WR */ + .recv_cq = priv->recv_cq, + .srq = priv->cm.srq, + .cap.max_send_wr = 1, /* For drain WR */ + .cap.max_send_sge = 1, /* FIXME: 0 Seems not to work */ + .sq_sig_type = IB_SIGNAL_ALL_WR, + .qp_type = IB_QPT_RC, + .qp_context = p, + }; + + if (!ipoib_cm_has_srq(dev)) { + attr.cap.max_recv_wr = ipoib_recvq_size; + attr.cap.max_recv_sge = IPOIB_CM_RX_SG; + } + + return ib_create_qp(priv->pd, &attr); +} + +static int ipoib_cm_modify_rx_qp(struct net_device *dev, + struct ib_cm_id *cm_id, struct ib_qp *qp, + unsigned psn) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ib_qp_attr qp_attr; + int qp_attr_mask, ret; + + qp_attr.qp_state = IB_QPS_INIT; + ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask); + if (ret) { + ipoib_warn(priv, "failed to init QP attr for INIT: %d\n", ret); + return ret; + } + ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask); + if (ret) { + ipoib_warn(priv, "failed to modify QP to INIT: %d\n", ret); + return ret; + } + qp_attr.qp_state = IB_QPS_RTR; + ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask); + if (ret) { + ipoib_warn(priv, "failed to init QP attr for RTR: %d\n", ret); + return ret; + } + qp_attr.rq_psn = psn; + ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask); + if (ret) { + ipoib_warn(priv, "failed to modify QP to RTR: %d\n", ret); + return ret; + } + + /* + * Current Mellanox HCA firmware won't generate completions + * with error for drain WRs unless the QP has been moved to + * RTS first. This work-around leaves a window where a QP has + * moved to error asynchronously, but this will eventually get + * fixed in firmware, so let's not error out if modify QP + * fails. + */ + qp_attr.qp_state = IB_QPS_RTS; + ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask); + if (ret) { + ipoib_warn(priv, "failed to init QP attr for RTS: %d\n", ret); + return 0; + } + ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask); + if (ret) { + ipoib_warn(priv, "failed to modify QP to RTS: %d\n", ret); + return 0; + } + + return 0; +} + +static void ipoib_cm_init_rx_wr(struct net_device *dev, + struct ib_recv_wr *wr, + struct ib_sge *sge) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + int i; + + for (i = 0; i < priv->cm.num_frags; ++i) + sge[i].lkey = priv->mr->lkey; + + sge[0].length = IPOIB_CM_HEAD_SIZE; + for (i = 1; i < priv->cm.num_frags; ++i) + sge[i].length = PAGE_SIZE; + + wr->next = NULL; + wr->sg_list = sge; + wr->num_sge = priv->cm.num_frags; +} + +static int ipoib_cm_nonsrq_init_rx(struct net_device *dev, struct ib_cm_id *cm_id, + struct ipoib_cm_rx *rx) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct { + struct ib_recv_wr wr; + struct ib_sge sge[IPOIB_CM_RX_SG]; + } *t; + int ret; + int i; + + rx->rx_ring = vzalloc(ipoib_recvq_size * sizeof *rx->rx_ring); + if (!rx->rx_ring) { + printk(KERN_WARNING "%s: failed to allocate CM non-SRQ ring (%d entries)\n", + priv->ca->name, ipoib_recvq_size); + return -ENOMEM; + } + + t = kmalloc(sizeof *t, GFP_KERNEL); + if (!t) { + ret = -ENOMEM; + goto err_free; + } + + ipoib_cm_init_rx_wr(dev, &t->wr, t->sge); + + spin_lock_irq(&priv->lock); + + if (priv->cm.nonsrq_conn_qp >= ipoib_max_conn_qp) { + spin_unlock_irq(&priv->lock); + ib_send_cm_rej(cm_id, IB_CM_REJ_NO_QP, NULL, 0, NULL, 0); + ret = -EINVAL; + goto err_free; + } else + ++priv->cm.nonsrq_conn_qp; + + spin_unlock_irq(&priv->lock); + + for (i = 0; i < ipoib_recvq_size; ++i) { + if (!ipoib_cm_alloc_rx_skb(dev, rx->rx_ring, i, IPOIB_CM_RX_SG - 1, + rx->rx_ring[i].mapping, + GFP_KERNEL)) { + ipoib_warn(priv, "failed to allocate receive buffer %d\n", i); + ret = -ENOMEM; + goto err_count; + } + ret = ipoib_cm_post_receive_nonsrq(dev, rx, &t->wr, t->sge, i); + if (ret) { + ipoib_warn(priv, "ipoib_cm_post_receive_nonsrq " + "failed for buf %d\n", i); + ret = -EIO; + goto err_count; + } + } + + rx->recv_count = ipoib_recvq_size; + + kfree(t); + + return 0; + +err_count: + spin_lock_irq(&priv->lock); + --priv->cm.nonsrq_conn_qp; + spin_unlock_irq(&priv->lock); + +err_free: + kfree(t); + ipoib_cm_free_rx_ring(dev, rx->rx_ring); + + return ret; +} + +static int ipoib_cm_send_rep(struct net_device *dev, struct ib_cm_id *cm_id, + struct ib_qp *qp, struct ib_cm_req_event_param *req, + unsigned psn) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_cm_data data = {}; + struct ib_cm_rep_param rep = {}; + + data.qpn = cpu_to_be32(priv->qp->qp_num); + data.mtu = cpu_to_be32(IPOIB_CM_BUF_SIZE); + + rep.private_data = &data; + rep.private_data_len = sizeof data; + rep.flow_control = 0; + rep.rnr_retry_count = req->rnr_retry_count; + rep.srq = ipoib_cm_has_srq(dev); + rep.qp_num = qp->qp_num; + rep.starting_psn = psn; + return ib_send_cm_rep(cm_id, &rep); +} + +static int ipoib_cm_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event) +{ + struct net_device *dev = cm_id->context; + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_cm_rx *p; + unsigned psn; + int ret; + + ipoib_dbg(priv, "REQ arrived\n"); + p = kzalloc(sizeof *p, GFP_KERNEL); + if (!p) + return -ENOMEM; + p->dev = dev; + p->id = cm_id; + cm_id->context = p; + p->state = IPOIB_CM_RX_LIVE; + p->jiffies = jiffies; + INIT_LIST_HEAD(&p->list); + + p->qp = ipoib_cm_create_rx_qp(dev, p); + if (IS_ERR(p->qp)) { + ret = PTR_ERR(p->qp); + goto err_qp; + } + + psn = prandom_u32() & 0xffffff; + ret = ipoib_cm_modify_rx_qp(dev, cm_id, p->qp, psn); + if (ret) + goto err_modify; + + if (!ipoib_cm_has_srq(dev)) { + ret = ipoib_cm_nonsrq_init_rx(dev, cm_id, p); + if (ret) + goto err_modify; + } + + spin_lock_irq(&priv->lock); + queue_delayed_work(ipoib_workqueue, + &priv->cm.stale_task, IPOIB_CM_RX_DELAY); + /* Add this entry to passive ids list head, but do not re-add it + * if IB_EVENT_QP_LAST_WQE_REACHED has moved it to flush list. */ + p->jiffies = jiffies; + if (p->state == IPOIB_CM_RX_LIVE) + list_move(&p->list, &priv->cm.passive_ids); + spin_unlock_irq(&priv->lock); + + ret = ipoib_cm_send_rep(dev, cm_id, p->qp, &event->param.req_rcvd, psn); + if (ret) { + ipoib_warn(priv, "failed to send REP: %d\n", ret); + if (ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE)) + ipoib_warn(priv, "unable to move qp to error state\n"); + } + return 0; + +err_modify: + ib_destroy_qp(p->qp); +err_qp: + kfree(p); + return ret; +} + +static int ipoib_cm_rx_handler(struct ib_cm_id *cm_id, + struct ib_cm_event *event) +{ + struct ipoib_cm_rx *p; + struct ipoib_dev_priv *priv; + + switch (event->event) { + case IB_CM_REQ_RECEIVED: + return ipoib_cm_req_handler(cm_id, event); + case IB_CM_DREQ_RECEIVED: + p = cm_id->context; + ib_send_cm_drep(cm_id, NULL, 0); + /* Fall through */ + case IB_CM_REJ_RECEIVED: + p = cm_id->context; + priv = netdev_priv(p->dev); + if (ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE)) + ipoib_warn(priv, "unable to move qp to error state\n"); + /* Fall through */ + default: + return 0; + } +} +/* Adjust length of skb with fragments to match received data */ +static void skb_put_frags(struct sk_buff *skb, unsigned int hdr_space, + unsigned int length, struct sk_buff *toskb) +{ + int i, num_frags; + unsigned int size; + + /* put header into skb */ + size = min(length, hdr_space); + skb->tail += size; + skb->len += size; + length -= size; + + num_frags = skb_shinfo(skb)->nr_frags; + for (i = 0; i < num_frags; i++) { + skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + + if (length == 0) { + /* don't need this page */ + skb_fill_page_desc(toskb, i, skb_frag_page(frag), + 0, PAGE_SIZE); + --skb_shinfo(skb)->nr_frags; + } else { + size = min(length, (unsigned) PAGE_SIZE); + + skb_frag_size_set(frag, size); + skb->data_len += size; + skb->truesize += size; + skb->len += size; + length -= size; + } + } +} + +void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_cm_rx_buf *rx_ring; + unsigned int wr_id = wc->wr_id & ~(IPOIB_OP_CM | IPOIB_OP_RECV); + struct sk_buff *skb, *newskb; + struct ipoib_cm_rx *p; + unsigned long flags; + u64 mapping[IPOIB_CM_RX_SG]; + int frags; + int has_srq; + struct sk_buff *small_skb; + + ipoib_dbg_data(priv, "cm recv completion: id %d, status: %d\n", + wr_id, wc->status); + + if (unlikely(wr_id >= ipoib_recvq_size)) { + if (wr_id == (IPOIB_CM_RX_DRAIN_WRID & ~(IPOIB_OP_CM | IPOIB_OP_RECV))) { + spin_lock_irqsave(&priv->lock, flags); + list_splice_init(&priv->cm.rx_drain_list, &priv->cm.rx_reap_list); + ipoib_cm_start_rx_drain(priv); + queue_work(ipoib_workqueue, &priv->cm.rx_reap_task); + spin_unlock_irqrestore(&priv->lock, flags); + } else + ipoib_warn(priv, "cm recv completion event with wrid %d (> %d)\n", + wr_id, ipoib_recvq_size); + return; + } + + p = wc->qp->qp_context; + + has_srq = ipoib_cm_has_srq(dev); + rx_ring = has_srq ? priv->cm.srq_ring : p->rx_ring; + + skb = rx_ring[wr_id].skb; + + if (unlikely(wc->status != IB_WC_SUCCESS)) { + ipoib_dbg(priv, "cm recv error " + "(status=%d, wrid=%d vend_err %x)\n", + wc->status, wr_id, wc->vendor_err); + ++dev->stats.rx_dropped; + if (has_srq) + goto repost; + else { + if (!--p->recv_count) { + spin_lock_irqsave(&priv->lock, flags); + list_move(&p->list, &priv->cm.rx_reap_list); + spin_unlock_irqrestore(&priv->lock, flags); + queue_work(ipoib_workqueue, &priv->cm.rx_reap_task); + } + return; + } + } + + if (unlikely(!(wr_id & IPOIB_CM_RX_UPDATE_MASK))) { + if (p && time_after_eq(jiffies, p->jiffies + IPOIB_CM_RX_UPDATE_TIME)) { + spin_lock_irqsave(&priv->lock, flags); + p->jiffies = jiffies; + /* Move this entry to list head, but do not re-add it + * if it has been moved out of list. */ + if (p->state == IPOIB_CM_RX_LIVE) + list_move(&p->list, &priv->cm.passive_ids); + spin_unlock_irqrestore(&priv->lock, flags); + } + } + + if (wc->byte_len < IPOIB_CM_COPYBREAK) { + int dlen = wc->byte_len; + + small_skb = dev_alloc_skb(dlen + 12); + if (small_skb) { + skb_reserve(small_skb, 12); + ib_dma_sync_single_for_cpu(priv->ca, rx_ring[wr_id].mapping[0], + dlen, DMA_FROM_DEVICE); + skb_copy_from_linear_data(skb, small_skb->data, dlen); + ib_dma_sync_single_for_device(priv->ca, rx_ring[wr_id].mapping[0], + dlen, DMA_FROM_DEVICE); + skb_put(small_skb, dlen); + skb = small_skb; + goto copied; + } + } + + frags = PAGE_ALIGN(wc->byte_len - min(wc->byte_len, + (unsigned)IPOIB_CM_HEAD_SIZE)) / PAGE_SIZE; + + newskb = ipoib_cm_alloc_rx_skb(dev, rx_ring, wr_id, frags, + mapping, GFP_ATOMIC); + if (unlikely(!newskb)) { + /* + * If we can't allocate a new RX buffer, dump + * this packet and reuse the old buffer. + */ + ipoib_dbg(priv, "failed to allocate receive buffer %d\n", wr_id); + ++dev->stats.rx_dropped; + goto repost; + } + + ipoib_cm_dma_unmap_rx(priv, frags, rx_ring[wr_id].mapping); + memcpy(rx_ring[wr_id].mapping, mapping, (frags + 1) * sizeof *mapping); + + ipoib_dbg_data(priv, "received %d bytes, SLID 0x%04x\n", + wc->byte_len, wc->slid); + + skb_put_frags(skb, IPOIB_CM_HEAD_SIZE, wc->byte_len, newskb); + +copied: + skb->protocol = ((struct ipoib_header *) skb->data)->proto; + skb_reset_mac_header(skb); + skb_pull(skb, IPOIB_ENCAP_LEN); + + ++dev->stats.rx_packets; + dev->stats.rx_bytes += skb->len; + + skb->dev = dev; + /* XXX get correct PACKET_ type here */ + skb->pkt_type = PACKET_HOST; + netif_receive_skb(skb); + +repost: + if (has_srq) { + if (unlikely(ipoib_cm_post_receive_srq(dev, wr_id))) + ipoib_warn(priv, "ipoib_cm_post_receive_srq failed " + "for buf %d\n", wr_id); + } else { + if (unlikely(ipoib_cm_post_receive_nonsrq(dev, p, + &priv->cm.rx_wr, + priv->cm.rx_sge, + wr_id))) { + --p->recv_count; + ipoib_warn(priv, "ipoib_cm_post_receive_nonsrq failed " + "for buf %d\n", wr_id); + } + } +} + +static inline int post_send(struct ipoib_dev_priv *priv, + struct ipoib_cm_tx *tx, + unsigned int wr_id, + u64 addr, int len) +{ + struct ib_send_wr *bad_wr; + + priv->tx_sge[0].addr = addr; + priv->tx_sge[0].length = len; + + priv->tx_wr.num_sge = 1; + priv->tx_wr.wr_id = wr_id | IPOIB_OP_CM; + + return ib_post_send(tx->qp, &priv->tx_wr, &bad_wr); +} + +void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_tx *tx) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_cm_tx_buf *tx_req; + u64 addr; + int rc; + + if (unlikely(skb->len > tx->mtu)) { + ipoib_warn(priv, "packet len %d (> %d) too long to send, dropping\n", + skb->len, tx->mtu); + ++dev->stats.tx_dropped; + ++dev->stats.tx_errors; + ipoib_cm_skb_too_long(dev, skb, tx->mtu - IPOIB_ENCAP_LEN); + return; + } + + ipoib_dbg_data(priv, "sending packet: head 0x%x length %d connection 0x%x\n", + tx->tx_head, skb->len, tx->qp->qp_num); + + /* + * We put the skb into the tx_ring _before_ we call post_send() + * because it's entirely possible that the completion handler will + * run before we execute anything after the post_send(). That + * means we have to make sure everything is properly recorded and + * our state is consistent before we call post_send(). + */ + tx_req = &tx->tx_ring[tx->tx_head & (ipoib_sendq_size - 1)]; + tx_req->skb = skb; + addr = ib_dma_map_single(priv->ca, skb->data, skb->len, DMA_TO_DEVICE); + if (unlikely(ib_dma_mapping_error(priv->ca, addr))) { + ++dev->stats.tx_errors; + dev_kfree_skb_any(skb); + return; + } + + tx_req->mapping = addr; + + skb_orphan(skb); + skb_dst_drop(skb); + + rc = post_send(priv, tx, tx->tx_head & (ipoib_sendq_size - 1), + addr, skb->len); + if (unlikely(rc)) { + ipoib_warn(priv, "post_send failed, error %d\n", rc); + ++dev->stats.tx_errors; + ib_dma_unmap_single(priv->ca, addr, skb->len, DMA_TO_DEVICE); + dev_kfree_skb_any(skb); + } else { + dev->trans_start = jiffies; + ++tx->tx_head; + + if (++priv->tx_outstanding == ipoib_sendq_size) { + ipoib_dbg(priv, "TX ring 0x%x full, stopping kernel net queue\n", + tx->qp->qp_num); + netif_stop_queue(dev); + rc = ib_req_notify_cq(priv->send_cq, + IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); + if (rc < 0) + ipoib_warn(priv, "request notify on send CQ failed\n"); + else if (rc) + ipoib_send_comp_handler(priv->send_cq, dev); + } + } +} + +void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_cm_tx *tx = wc->qp->qp_context; + unsigned int wr_id = wc->wr_id & ~IPOIB_OP_CM; + struct ipoib_cm_tx_buf *tx_req; + unsigned long flags; + + ipoib_dbg_data(priv, "cm send completion: id %d, status: %d\n", + wr_id, wc->status); + + if (unlikely(wr_id >= ipoib_sendq_size)) { + ipoib_warn(priv, "cm send completion event with wrid %d (> %d)\n", + wr_id, ipoib_sendq_size); + return; + } + + tx_req = &tx->tx_ring[wr_id]; + + ib_dma_unmap_single(priv->ca, tx_req->mapping, tx_req->skb->len, DMA_TO_DEVICE); + + /* FIXME: is this right? Shouldn't we only increment on success? */ + ++dev->stats.tx_packets; + dev->stats.tx_bytes += tx_req->skb->len; + + dev_kfree_skb_any(tx_req->skb); + + netif_tx_lock(dev); + + ++tx->tx_tail; + if (unlikely(--priv->tx_outstanding == ipoib_sendq_size >> 1) && + netif_queue_stopped(dev) && + test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) + netif_wake_queue(dev); + + if (wc->status != IB_WC_SUCCESS && + wc->status != IB_WC_WR_FLUSH_ERR) { + struct ipoib_neigh *neigh; + + ipoib_dbg(priv, "failed cm send event " + "(status=%d, wrid=%d vend_err %x)\n", + wc->status, wr_id, wc->vendor_err); + + spin_lock_irqsave(&priv->lock, flags); + neigh = tx->neigh; + + if (neigh) { + neigh->cm = NULL; + ipoib_neigh_free(neigh); + + tx->neigh = NULL; + } + + if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) { + list_move(&tx->list, &priv->cm.reap_list); + queue_work(ipoib_workqueue, &priv->cm.reap_task); + } + + clear_bit(IPOIB_FLAG_OPER_UP, &tx->flags); + + spin_unlock_irqrestore(&priv->lock, flags); + } + + netif_tx_unlock(dev); +} + +int ipoib_cm_dev_open(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + int ret; + + if (!IPOIB_CM_SUPPORTED(dev->dev_addr)) + return 0; + + priv->cm.id = ib_create_cm_id(priv->ca, ipoib_cm_rx_handler, dev); + if (IS_ERR(priv->cm.id)) { + printk(KERN_WARNING "%s: failed to create CM ID\n", priv->ca->name); + ret = PTR_ERR(priv->cm.id); + goto err_cm; + } + + ret = ib_cm_listen(priv->cm.id, cpu_to_be64(IPOIB_CM_IETF_ID | priv->qp->qp_num), + 0, NULL); + if (ret) { + printk(KERN_WARNING "%s: failed to listen on ID 0x%llx\n", priv->ca->name, + IPOIB_CM_IETF_ID | priv->qp->qp_num); + goto err_listen; + } + + return 0; + +err_listen: + ib_destroy_cm_id(priv->cm.id); +err_cm: + priv->cm.id = NULL; + return ret; +} + +static void ipoib_cm_free_rx_reap_list(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_cm_rx *rx, *n; + LIST_HEAD(list); + + spin_lock_irq(&priv->lock); + list_splice_init(&priv->cm.rx_reap_list, &list); + spin_unlock_irq(&priv->lock); + + list_for_each_entry_safe(rx, n, &list, list) { + ib_destroy_cm_id(rx->id); + ib_destroy_qp(rx->qp); + if (!ipoib_cm_has_srq(dev)) { + ipoib_cm_free_rx_ring(priv->dev, rx->rx_ring); + spin_lock_irq(&priv->lock); + --priv->cm.nonsrq_conn_qp; + spin_unlock_irq(&priv->lock); + } + kfree(rx); + } +} + +void ipoib_cm_dev_stop(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_cm_rx *p; + unsigned long begin; + int ret; + + if (!IPOIB_CM_SUPPORTED(dev->dev_addr) || !priv->cm.id) + return; + + ib_destroy_cm_id(priv->cm.id); + priv->cm.id = NULL; + + spin_lock_irq(&priv->lock); + while (!list_empty(&priv->cm.passive_ids)) { + p = list_entry(priv->cm.passive_ids.next, typeof(*p), list); + list_move(&p->list, &priv->cm.rx_error_list); + p->state = IPOIB_CM_RX_ERROR; + spin_unlock_irq(&priv->lock); + ret = ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE); + if (ret) + ipoib_warn(priv, "unable to move qp to error state: %d\n", ret); + spin_lock_irq(&priv->lock); + } + + /* Wait for all RX to be drained */ + begin = jiffies; + + while (!list_empty(&priv->cm.rx_error_list) || + !list_empty(&priv->cm.rx_flush_list) || + !list_empty(&priv->cm.rx_drain_list)) { + if (time_after(jiffies, begin + 5 * HZ)) { + ipoib_warn(priv, "RX drain timing out\n"); + + /* + * assume the HW is wedged and just free up everything. + */ + list_splice_init(&priv->cm.rx_flush_list, + &priv->cm.rx_reap_list); + list_splice_init(&priv->cm.rx_error_list, + &priv->cm.rx_reap_list); + list_splice_init(&priv->cm.rx_drain_list, + &priv->cm.rx_reap_list); + break; + } + spin_unlock_irq(&priv->lock); + msleep(1); + ipoib_drain_cq(dev); + spin_lock_irq(&priv->lock); + } + + spin_unlock_irq(&priv->lock); + + ipoib_cm_free_rx_reap_list(dev); + + cancel_delayed_work(&priv->cm.stale_task); +} + +static int ipoib_cm_rep_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event) +{ + struct ipoib_cm_tx *p = cm_id->context; + struct ipoib_dev_priv *priv = netdev_priv(p->dev); + struct ipoib_cm_data *data = event->private_data; + struct sk_buff_head skqueue; + struct ib_qp_attr qp_attr; + int qp_attr_mask, ret; + struct sk_buff *skb; + + p->mtu = be32_to_cpu(data->mtu); + + if (p->mtu <= IPOIB_ENCAP_LEN) { + ipoib_warn(priv, "Rejecting connection: mtu %d <= %d\n", + p->mtu, IPOIB_ENCAP_LEN); + return -EINVAL; + } + + qp_attr.qp_state = IB_QPS_RTR; + ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask); + if (ret) { + ipoib_warn(priv, "failed to init QP attr for RTR: %d\n", ret); + return ret; + } + + qp_attr.rq_psn = 0 /* FIXME */; + ret = ib_modify_qp(p->qp, &qp_attr, qp_attr_mask); + if (ret) { + ipoib_warn(priv, "failed to modify QP to RTR: %d\n", ret); + return ret; + } + + qp_attr.qp_state = IB_QPS_RTS; + ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask); + if (ret) { + ipoib_warn(priv, "failed to init QP attr for RTS: %d\n", ret); + return ret; + } + ret = ib_modify_qp(p->qp, &qp_attr, qp_attr_mask); + if (ret) { + ipoib_warn(priv, "failed to modify QP to RTS: %d\n", ret); + return ret; + } + + skb_queue_head_init(&skqueue); + + spin_lock_irq(&priv->lock); + set_bit(IPOIB_FLAG_OPER_UP, &p->flags); + if (p->neigh) + while ((skb = __skb_dequeue(&p->neigh->queue))) + __skb_queue_tail(&skqueue, skb); + spin_unlock_irq(&priv->lock); + + while ((skb = __skb_dequeue(&skqueue))) { + skb->dev = p->dev; + if (dev_queue_xmit(skb)) + ipoib_warn(priv, "dev_queue_xmit failed " + "to requeue packet\n"); + } + + ret = ib_send_cm_rtu(cm_id, NULL, 0); + if (ret) { + ipoib_warn(priv, "failed to send RTU: %d\n", ret); + return ret; + } + return 0; +} + +static struct ib_qp *ipoib_cm_create_tx_qp(struct net_device *dev, struct ipoib_cm_tx *tx) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ib_qp_init_attr attr = { + .send_cq = priv->recv_cq, + .recv_cq = priv->recv_cq, + .srq = priv->cm.srq, + .cap.max_send_wr = ipoib_sendq_size, + .cap.max_send_sge = 1, + .sq_sig_type = IB_SIGNAL_ALL_WR, + .qp_type = IB_QPT_RC, + .qp_context = tx, + .create_flags = IB_QP_CREATE_USE_GFP_NOIO + }; + + struct ib_qp *tx_qp; + + tx_qp = ib_create_qp(priv->pd, &attr); + if (PTR_ERR(tx_qp) == -EINVAL) { + ipoib_warn(priv, "can't use GFP_NOIO for QPs on device %s, using GFP_KERNEL\n", + priv->ca->name); + attr.create_flags &= ~IB_QP_CREATE_USE_GFP_NOIO; + tx_qp = ib_create_qp(priv->pd, &attr); + } + return tx_qp; +} + +static int ipoib_cm_send_req(struct net_device *dev, + struct ib_cm_id *id, struct ib_qp *qp, + u32 qpn, + struct ib_sa_path_rec *pathrec) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_cm_data data = {}; + struct ib_cm_req_param req = {}; + + data.qpn = cpu_to_be32(priv->qp->qp_num); + data.mtu = cpu_to_be32(IPOIB_CM_BUF_SIZE); + + req.primary_path = pathrec; + req.alternate_path = NULL; + req.service_id = cpu_to_be64(IPOIB_CM_IETF_ID | qpn); + req.qp_num = qp->qp_num; + req.qp_type = qp->qp_type; + req.private_data = &data; + req.private_data_len = sizeof data; + req.flow_control = 0; + + req.starting_psn = 0; /* FIXME */ + + /* + * Pick some arbitrary defaults here; we could make these + * module parameters if anyone cared about setting them. + */ + req.responder_resources = 4; + req.remote_cm_response_timeout = 20; + req.local_cm_response_timeout = 20; + req.retry_count = 0; /* RFC draft warns against retries */ + req.rnr_retry_count = 0; /* RFC draft warns against retries */ + req.max_cm_retries = 15; + req.srq = ipoib_cm_has_srq(dev); + return ib_send_cm_req(id, &req); +} + +static int ipoib_cm_modify_tx_init(struct net_device *dev, + struct ib_cm_id *cm_id, struct ib_qp *qp) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ib_qp_attr qp_attr; + int qp_attr_mask, ret; + ret = ib_find_pkey(priv->ca, priv->port, priv->pkey, &qp_attr.pkey_index); + if (ret) { + ipoib_warn(priv, "pkey 0x%x not found: %d\n", priv->pkey, ret); + return ret; + } + + qp_attr.qp_state = IB_QPS_INIT; + qp_attr.qp_access_flags = IB_ACCESS_LOCAL_WRITE; + qp_attr.port_num = priv->port; + qp_attr_mask = IB_QP_STATE | IB_QP_ACCESS_FLAGS | IB_QP_PKEY_INDEX | IB_QP_PORT; + + ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask); + if (ret) { + ipoib_warn(priv, "failed to modify tx QP to INIT: %d\n", ret); + return ret; + } + return 0; +} + +static int ipoib_cm_tx_init(struct ipoib_cm_tx *p, u32 qpn, + struct ib_sa_path_rec *pathrec) +{ + struct ipoib_dev_priv *priv = netdev_priv(p->dev); + int ret; + + p->tx_ring = __vmalloc(ipoib_sendq_size * sizeof *p->tx_ring, + GFP_NOIO, PAGE_KERNEL); + if (!p->tx_ring) { + ipoib_warn(priv, "failed to allocate tx ring\n"); + ret = -ENOMEM; + goto err_tx; + } + memset(p->tx_ring, 0, ipoib_sendq_size * sizeof *p->tx_ring); + + p->qp = ipoib_cm_create_tx_qp(p->dev, p); + if (IS_ERR(p->qp)) { + ret = PTR_ERR(p->qp); + ipoib_warn(priv, "failed to allocate tx qp: %d\n", ret); + goto err_qp; + } + + p->id = ib_create_cm_id(priv->ca, ipoib_cm_tx_handler, p); + if (IS_ERR(p->id)) { + ret = PTR_ERR(p->id); + ipoib_warn(priv, "failed to create tx cm id: %d\n", ret); + goto err_id; + } + + ret = ipoib_cm_modify_tx_init(p->dev, p->id, p->qp); + if (ret) { + ipoib_warn(priv, "failed to modify tx qp to rtr: %d\n", ret); + goto err_modify; + } + + ret = ipoib_cm_send_req(p->dev, p->id, p->qp, qpn, pathrec); + if (ret) { + ipoib_warn(priv, "failed to send cm req: %d\n", ret); + goto err_send_cm; + } + + ipoib_dbg(priv, "Request connection 0x%x for gid %pI6 qpn 0x%x\n", + p->qp->qp_num, pathrec->dgid.raw, qpn); + + return 0; + +err_send_cm: +err_modify: + ib_destroy_cm_id(p->id); +err_id: + p->id = NULL; + ib_destroy_qp(p->qp); +err_qp: + p->qp = NULL; + vfree(p->tx_ring); +err_tx: + return ret; +} + +static void ipoib_cm_tx_destroy(struct ipoib_cm_tx *p) +{ + struct ipoib_dev_priv *priv = netdev_priv(p->dev); + struct ipoib_cm_tx_buf *tx_req; + unsigned long begin; + + ipoib_dbg(priv, "Destroy active connection 0x%x head 0x%x tail 0x%x\n", + p->qp ? p->qp->qp_num : 0, p->tx_head, p->tx_tail); + + if (p->id) + ib_destroy_cm_id(p->id); + + if (p->tx_ring) { + /* Wait for all sends to complete */ + begin = jiffies; + while ((int) p->tx_tail - (int) p->tx_head < 0) { + if (time_after(jiffies, begin + 5 * HZ)) { + ipoib_warn(priv, "timing out; %d sends not completed\n", + p->tx_head - p->tx_tail); + goto timeout; + } + + msleep(1); + } + } + +timeout: + + while ((int) p->tx_tail - (int) p->tx_head < 0) { + tx_req = &p->tx_ring[p->tx_tail & (ipoib_sendq_size - 1)]; + ib_dma_unmap_single(priv->ca, tx_req->mapping, tx_req->skb->len, + DMA_TO_DEVICE); + dev_kfree_skb_any(tx_req->skb); + ++p->tx_tail; + netif_tx_lock_bh(p->dev); + if (unlikely(--priv->tx_outstanding == ipoib_sendq_size >> 1) && + netif_queue_stopped(p->dev) && + test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) + netif_wake_queue(p->dev); + netif_tx_unlock_bh(p->dev); + } + + if (p->qp) + ib_destroy_qp(p->qp); + + vfree(p->tx_ring); + kfree(p); +} + +static int ipoib_cm_tx_handler(struct ib_cm_id *cm_id, + struct ib_cm_event *event) +{ + struct ipoib_cm_tx *tx = cm_id->context; + struct ipoib_dev_priv *priv = netdev_priv(tx->dev); + struct net_device *dev = priv->dev; + struct ipoib_neigh *neigh; + unsigned long flags; + int ret; + + switch (event->event) { + case IB_CM_DREQ_RECEIVED: + ipoib_dbg(priv, "DREQ received.\n"); + ib_send_cm_drep(cm_id, NULL, 0); + break; + case IB_CM_REP_RECEIVED: + ipoib_dbg(priv, "REP received.\n"); + ret = ipoib_cm_rep_handler(cm_id, event); + if (ret) + ib_send_cm_rej(cm_id, IB_CM_REJ_CONSUMER_DEFINED, + NULL, 0, NULL, 0); + break; + case IB_CM_REQ_ERROR: + case IB_CM_REJ_RECEIVED: + case IB_CM_TIMEWAIT_EXIT: + ipoib_dbg(priv, "CM error %d.\n", event->event); + netif_tx_lock_bh(dev); + spin_lock_irqsave(&priv->lock, flags); + neigh = tx->neigh; + + if (neigh) { + neigh->cm = NULL; + ipoib_neigh_free(neigh); + + tx->neigh = NULL; + } + + if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) { + list_move(&tx->list, &priv->cm.reap_list); + queue_work(ipoib_workqueue, &priv->cm.reap_task); + } + + spin_unlock_irqrestore(&priv->lock, flags); + netif_tx_unlock_bh(dev); + break; + default: + break; + } + + return 0; +} + +struct ipoib_cm_tx *ipoib_cm_create_tx(struct net_device *dev, struct ipoib_path *path, + struct ipoib_neigh *neigh) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_cm_tx *tx; + + tx = kzalloc(sizeof *tx, GFP_ATOMIC); + if (!tx) + return NULL; + + neigh->cm = tx; + tx->neigh = neigh; + tx->path = path; + tx->dev = dev; + list_add(&tx->list, &priv->cm.start_list); + set_bit(IPOIB_FLAG_INITIALIZED, &tx->flags); + queue_work(ipoib_workqueue, &priv->cm.start_task); + return tx; +} + +void ipoib_cm_destroy_tx(struct ipoib_cm_tx *tx) +{ + struct ipoib_dev_priv *priv = netdev_priv(tx->dev); + unsigned long flags; + if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) { + spin_lock_irqsave(&priv->lock, flags); + list_move(&tx->list, &priv->cm.reap_list); + queue_work(ipoib_workqueue, &priv->cm.reap_task); + ipoib_dbg(priv, "Reap connection for gid %pI6\n", + tx->neigh->daddr + 4); + tx->neigh = NULL; + spin_unlock_irqrestore(&priv->lock, flags); + } +} + +static void ipoib_cm_tx_start(struct work_struct *work) +{ + struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv, + cm.start_task); + struct net_device *dev = priv->dev; + struct ipoib_neigh *neigh; + struct ipoib_cm_tx *p; + unsigned long flags; + int ret; + + struct ib_sa_path_rec pathrec; + u32 qpn; + + netif_tx_lock_bh(dev); + spin_lock_irqsave(&priv->lock, flags); + + while (!list_empty(&priv->cm.start_list)) { + p = list_entry(priv->cm.start_list.next, typeof(*p), list); + list_del_init(&p->list); + neigh = p->neigh; + qpn = IPOIB_QPN(neigh->daddr); + memcpy(&pathrec, &p->path->pathrec, sizeof pathrec); + + spin_unlock_irqrestore(&priv->lock, flags); + netif_tx_unlock_bh(dev); + + ret = ipoib_cm_tx_init(p, qpn, &pathrec); + + netif_tx_lock_bh(dev); + spin_lock_irqsave(&priv->lock, flags); + + if (ret) { + neigh = p->neigh; + if (neigh) { + neigh->cm = NULL; + ipoib_neigh_free(neigh); + } + list_del(&p->list); + kfree(p); + } + } + + spin_unlock_irqrestore(&priv->lock, flags); + netif_tx_unlock_bh(dev); +} + +static void ipoib_cm_tx_reap(struct work_struct *work) +{ + struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv, + cm.reap_task); + struct net_device *dev = priv->dev; + struct ipoib_cm_tx *p; + unsigned long flags; + + netif_tx_lock_bh(dev); + spin_lock_irqsave(&priv->lock, flags); + + while (!list_empty(&priv->cm.reap_list)) { + p = list_entry(priv->cm.reap_list.next, typeof(*p), list); + list_del(&p->list); + spin_unlock_irqrestore(&priv->lock, flags); + netif_tx_unlock_bh(dev); + ipoib_cm_tx_destroy(p); + netif_tx_lock_bh(dev); + spin_lock_irqsave(&priv->lock, flags); + } + + spin_unlock_irqrestore(&priv->lock, flags); + netif_tx_unlock_bh(dev); +} + +static void ipoib_cm_skb_reap(struct work_struct *work) +{ + struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv, + cm.skb_task); + struct net_device *dev = priv->dev; + struct sk_buff *skb; + unsigned long flags; + unsigned mtu = priv->mcast_mtu; + + netif_tx_lock_bh(dev); + spin_lock_irqsave(&priv->lock, flags); + + while ((skb = skb_dequeue(&priv->cm.skb_queue))) { + spin_unlock_irqrestore(&priv->lock, flags); + netif_tx_unlock_bh(dev); + + if (skb->protocol == htons(ETH_P_IP)) + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); +#if IS_ENABLED(CONFIG_IPV6) + else if (skb->protocol == htons(ETH_P_IPV6)) + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); +#endif + dev_kfree_skb_any(skb); + + netif_tx_lock_bh(dev); + spin_lock_irqsave(&priv->lock, flags); + } + + spin_unlock_irqrestore(&priv->lock, flags); + netif_tx_unlock_bh(dev); +} + +void ipoib_cm_skb_too_long(struct net_device *dev, struct sk_buff *skb, + unsigned int mtu) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + int e = skb_queue_empty(&priv->cm.skb_queue); + + if (skb_dst(skb)) + skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu); + + skb_queue_tail(&priv->cm.skb_queue, skb); + if (e) + queue_work(ipoib_workqueue, &priv->cm.skb_task); +} + +static void ipoib_cm_rx_reap(struct work_struct *work) +{ + ipoib_cm_free_rx_reap_list(container_of(work, struct ipoib_dev_priv, + cm.rx_reap_task)->dev); +} + +static void ipoib_cm_stale_task(struct work_struct *work) +{ + struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv, + cm.stale_task.work); + struct ipoib_cm_rx *p; + int ret; + + spin_lock_irq(&priv->lock); + while (!list_empty(&priv->cm.passive_ids)) { + /* List is sorted by LRU, start from tail, + * stop when we see a recently used entry */ + p = list_entry(priv->cm.passive_ids.prev, typeof(*p), list); + if (time_before_eq(jiffies, p->jiffies + IPOIB_CM_RX_TIMEOUT)) + break; + list_move(&p->list, &priv->cm.rx_error_list); + p->state = IPOIB_CM_RX_ERROR; + spin_unlock_irq(&priv->lock); + ret = ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE); + if (ret) + ipoib_warn(priv, "unable to move qp to error state: %d\n", ret); + spin_lock_irq(&priv->lock); + } + + if (!list_empty(&priv->cm.passive_ids)) + queue_delayed_work(ipoib_workqueue, + &priv->cm.stale_task, IPOIB_CM_RX_DELAY); + spin_unlock_irq(&priv->lock); +} + + +static ssize_t show_mode(struct device *d, struct device_attribute *attr, + char *buf) +{ + struct ipoib_dev_priv *priv = netdev_priv(to_net_dev(d)); + + if (test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags)) + return sprintf(buf, "connected\n"); + else + return sprintf(buf, "datagram\n"); +} + +static ssize_t set_mode(struct device *d, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct net_device *dev = to_net_dev(d); + int ret; + + if (!rtnl_trylock()) + return restart_syscall(); + + ret = ipoib_set_mode(dev, buf); + + rtnl_unlock(); + + if (!ret) + return count; + + return ret; +} + +static DEVICE_ATTR(mode, S_IWUSR | S_IRUGO, show_mode, set_mode); + +int ipoib_cm_add_mode_attr(struct net_device *dev) +{ + return device_create_file(&dev->dev, &dev_attr_mode); +} + +static void ipoib_cm_create_srq(struct net_device *dev, int max_sge) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ib_srq_init_attr srq_init_attr = { + .srq_type = IB_SRQT_BASIC, + .attr = { + .max_wr = ipoib_recvq_size, + .max_sge = max_sge + } + }; + + priv->cm.srq = ib_create_srq(priv->pd, &srq_init_attr); + if (IS_ERR(priv->cm.srq)) { + if (PTR_ERR(priv->cm.srq) != -ENOSYS) + printk(KERN_WARNING "%s: failed to allocate SRQ, error %ld\n", + priv->ca->name, PTR_ERR(priv->cm.srq)); + priv->cm.srq = NULL; + return; + } + + priv->cm.srq_ring = vzalloc(ipoib_recvq_size * sizeof *priv->cm.srq_ring); + if (!priv->cm.srq_ring) { + printk(KERN_WARNING "%s: failed to allocate CM SRQ ring (%d entries)\n", + priv->ca->name, ipoib_recvq_size); + ib_destroy_srq(priv->cm.srq); + priv->cm.srq = NULL; + return; + } + +} + +int ipoib_cm_dev_init(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + int i, ret; + struct ib_device_attr attr; + + INIT_LIST_HEAD(&priv->cm.passive_ids); + INIT_LIST_HEAD(&priv->cm.reap_list); + INIT_LIST_HEAD(&priv->cm.start_list); + INIT_LIST_HEAD(&priv->cm.rx_error_list); + INIT_LIST_HEAD(&priv->cm.rx_flush_list); + INIT_LIST_HEAD(&priv->cm.rx_drain_list); + INIT_LIST_HEAD(&priv->cm.rx_reap_list); + INIT_WORK(&priv->cm.start_task, ipoib_cm_tx_start); + INIT_WORK(&priv->cm.reap_task, ipoib_cm_tx_reap); + INIT_WORK(&priv->cm.skb_task, ipoib_cm_skb_reap); + INIT_WORK(&priv->cm.rx_reap_task, ipoib_cm_rx_reap); + INIT_DELAYED_WORK(&priv->cm.stale_task, ipoib_cm_stale_task); + + skb_queue_head_init(&priv->cm.skb_queue); + + ret = ib_query_device(priv->ca, &attr); + if (ret) { + printk(KERN_WARNING "ib_query_device() failed with %d\n", ret); + return ret; + } + + ipoib_dbg(priv, "max_srq_sge=%d\n", attr.max_srq_sge); + + attr.max_srq_sge = min_t(int, IPOIB_CM_RX_SG, attr.max_srq_sge); + ipoib_cm_create_srq(dev, attr.max_srq_sge); + if (ipoib_cm_has_srq(dev)) { + priv->cm.max_cm_mtu = attr.max_srq_sge * PAGE_SIZE - 0x10; + priv->cm.num_frags = attr.max_srq_sge; + ipoib_dbg(priv, "max_cm_mtu = 0x%x, num_frags=%d\n", + priv->cm.max_cm_mtu, priv->cm.num_frags); + } else { + priv->cm.max_cm_mtu = IPOIB_CM_MTU; + priv->cm.num_frags = IPOIB_CM_RX_SG; + } + + ipoib_cm_init_rx_wr(dev, &priv->cm.rx_wr, priv->cm.rx_sge); + + if (ipoib_cm_has_srq(dev)) { + for (i = 0; i < ipoib_recvq_size; ++i) { + if (!ipoib_cm_alloc_rx_skb(dev, priv->cm.srq_ring, i, + priv->cm.num_frags - 1, + priv->cm.srq_ring[i].mapping, + GFP_KERNEL)) { + ipoib_warn(priv, "failed to allocate " + "receive buffer %d\n", i); + ipoib_cm_dev_cleanup(dev); + return -ENOMEM; + } + + if (ipoib_cm_post_receive_srq(dev, i)) { + ipoib_warn(priv, "ipoib_cm_post_receive_srq " + "failed for buf %d\n", i); + ipoib_cm_dev_cleanup(dev); + return -EIO; + } + } + } + + priv->dev->dev_addr[0] = IPOIB_FLAGS_RC; + return 0; +} + +void ipoib_cm_dev_cleanup(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + int ret; + + if (!priv->cm.srq) + return; + + ipoib_dbg(priv, "Cleanup ipoib connected mode.\n"); + + ret = ib_destroy_srq(priv->cm.srq); + if (ret) + ipoib_warn(priv, "ib_destroy_srq failed: %d\n", ret); + + priv->cm.srq = NULL; + if (!priv->cm.srq_ring) + return; + + ipoib_cm_free_rx_ring(dev, priv->cm.srq_ring); + priv->cm.srq_ring = NULL; +} diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c b/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c new file mode 100644 index 0000000..078cadd --- /dev/null +++ b/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2007 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include + +#include "ipoib.h" + +static void ipoib_get_drvinfo(struct net_device *netdev, + struct ethtool_drvinfo *drvinfo) +{ + struct ipoib_dev_priv *priv = netdev_priv(netdev); + struct ib_device_attr *attr; + + attr = kmalloc(sizeof(*attr), GFP_KERNEL); + if (attr && !ib_query_device(priv->ca, attr)) + snprintf(drvinfo->fw_version, sizeof(drvinfo->fw_version), + "%d.%d.%d", (int)(attr->fw_ver >> 32), + (int)(attr->fw_ver >> 16) & 0xffff, + (int)attr->fw_ver & 0xffff); + kfree(attr); + + strlcpy(drvinfo->bus_info, dev_name(priv->ca->dma_device), + sizeof(drvinfo->bus_info)); + + strlcpy(drvinfo->version, ipoib_driver_version, + sizeof(drvinfo->version)); + + strlcpy(drvinfo->driver, "ib_ipoib", sizeof(drvinfo->driver)); +} + +static int ipoib_get_coalesce(struct net_device *dev, + struct ethtool_coalesce *coal) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + + coal->rx_coalesce_usecs = priv->ethtool.coalesce_usecs; + coal->rx_max_coalesced_frames = priv->ethtool.max_coalesced_frames; + + return 0; +} + +static int ipoib_set_coalesce(struct net_device *dev, + struct ethtool_coalesce *coal) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + int ret; + + /* + * These values are saved in the private data and returned + * when ipoib_get_coalesce() is called + */ + if (coal->rx_coalesce_usecs > 0xffff || + coal->rx_max_coalesced_frames > 0xffff) + return -EINVAL; + + ret = ib_modify_cq(priv->recv_cq, coal->rx_max_coalesced_frames, + coal->rx_coalesce_usecs); + if (ret && ret != -ENOSYS) { + ipoib_warn(priv, "failed modifying CQ (%d)\n", ret); + return ret; + } + + priv->ethtool.coalesce_usecs = coal->rx_coalesce_usecs; + priv->ethtool.max_coalesced_frames = coal->rx_max_coalesced_frames; + + return 0; +} + +static const struct ethtool_ops ipoib_ethtool_ops = { + .get_drvinfo = ipoib_get_drvinfo, + .get_coalesce = ipoib_get_coalesce, + .set_coalesce = ipoib_set_coalesce, +}; + +void ipoib_set_ethtool_ops(struct net_device *dev) +{ + dev->ethtool_ops = &ipoib_ethtool_ops; +} diff --git a/drivers/infiniband/ulp/ipoib/ipoib_fs.c b/drivers/infiniband/ulp/ipoib/ipoib_fs.c new file mode 100644 index 0000000..6bd5740 --- /dev/null +++ b/drivers/infiniband/ulp/ipoib/ipoib_fs.c @@ -0,0 +1,297 @@ +/* + * Copyright (c) 2004 Topspin Communications. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include + +struct file_operations; + +#include +#include + +#include "ipoib.h" + +static struct dentry *ipoib_root; + +static void format_gid(union ib_gid *gid, char *buf) +{ + int i, n; + + for (n = 0, i = 0; i < 8; ++i) { + n += sprintf(buf + n, "%x", + be16_to_cpu(((__be16 *) gid->raw)[i])); + if (i < 7) + buf[n++] = ':'; + } +} + +static void *ipoib_mcg_seq_start(struct seq_file *file, loff_t *pos) +{ + struct ipoib_mcast_iter *iter; + loff_t n = *pos; + + iter = ipoib_mcast_iter_init(file->private); + if (!iter) + return NULL; + + while (n--) { + if (ipoib_mcast_iter_next(iter)) { + kfree(iter); + return NULL; + } + } + + return iter; +} + +static void *ipoib_mcg_seq_next(struct seq_file *file, void *iter_ptr, + loff_t *pos) +{ + struct ipoib_mcast_iter *iter = iter_ptr; + + (*pos)++; + + if (ipoib_mcast_iter_next(iter)) { + kfree(iter); + return NULL; + } + + return iter; +} + +static void ipoib_mcg_seq_stop(struct seq_file *file, void *iter_ptr) +{ + /* nothing for now */ +} + +static int ipoib_mcg_seq_show(struct seq_file *file, void *iter_ptr) +{ + struct ipoib_mcast_iter *iter = iter_ptr; + char gid_buf[sizeof "ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff"]; + union ib_gid mgid; + unsigned long created; + unsigned int queuelen, complete, send_only; + + if (!iter) + return 0; + + ipoib_mcast_iter_read(iter, &mgid, &created, &queuelen, + &complete, &send_only); + + format_gid(&mgid, gid_buf); + + seq_printf(file, + "GID: %s\n" + " created: %10ld\n" + " queuelen: %9d\n" + " complete: %9s\n" + " send_only: %8s\n" + "\n", + gid_buf, created, queuelen, + complete ? "yes" : "no", + send_only ? "yes" : "no"); + + return 0; +} + +static const struct seq_operations ipoib_mcg_seq_ops = { + .start = ipoib_mcg_seq_start, + .next = ipoib_mcg_seq_next, + .stop = ipoib_mcg_seq_stop, + .show = ipoib_mcg_seq_show, +}; + +static int ipoib_mcg_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int ret; + + ret = seq_open(file, &ipoib_mcg_seq_ops); + if (ret) + return ret; + + seq = file->private_data; + seq->private = inode->i_private; + + return 0; +} + +static const struct file_operations ipoib_mcg_fops = { + .owner = THIS_MODULE, + .open = ipoib_mcg_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release +}; + +static void *ipoib_path_seq_start(struct seq_file *file, loff_t *pos) +{ + struct ipoib_path_iter *iter; + loff_t n = *pos; + + iter = ipoib_path_iter_init(file->private); + if (!iter) + return NULL; + + while (n--) { + if (ipoib_path_iter_next(iter)) { + kfree(iter); + return NULL; + } + } + + return iter; +} + +static void *ipoib_path_seq_next(struct seq_file *file, void *iter_ptr, + loff_t *pos) +{ + struct ipoib_path_iter *iter = iter_ptr; + + (*pos)++; + + if (ipoib_path_iter_next(iter)) { + kfree(iter); + return NULL; + } + + return iter; +} + +static void ipoib_path_seq_stop(struct seq_file *file, void *iter_ptr) +{ + /* nothing for now */ +} + +static int ipoib_path_seq_show(struct seq_file *file, void *iter_ptr) +{ + struct ipoib_path_iter *iter = iter_ptr; + char gid_buf[sizeof "ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff"]; + struct ipoib_path path; + int rate; + + if (!iter) + return 0; + + ipoib_path_iter_read(iter, &path); + + format_gid(&path.pathrec.dgid, gid_buf); + + seq_printf(file, + "GID: %s\n" + " complete: %6s\n", + gid_buf, path.pathrec.dlid ? "yes" : "no"); + + if (path.pathrec.dlid) { + rate = ib_rate_to_mbps(path.pathrec.rate); + + seq_printf(file, + " DLID: 0x%04x\n" + " SL: %12d\n" + " rate: %8d.%d Gb/sec\n", + be16_to_cpu(path.pathrec.dlid), + path.pathrec.sl, + rate / 1000, rate % 1000); + } + + seq_putc(file, '\n'); + + return 0; +} + +static const struct seq_operations ipoib_path_seq_ops = { + .start = ipoib_path_seq_start, + .next = ipoib_path_seq_next, + .stop = ipoib_path_seq_stop, + .show = ipoib_path_seq_show, +}; + +static int ipoib_path_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int ret; + + ret = seq_open(file, &ipoib_path_seq_ops); + if (ret) + return ret; + + seq = file->private_data; + seq->private = inode->i_private; + + return 0; +} + +static const struct file_operations ipoib_path_fops = { + .owner = THIS_MODULE, + .open = ipoib_path_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release +}; + +void ipoib_create_debug_files(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + char name[IFNAMSIZ + sizeof "_path"]; + + snprintf(name, sizeof name, "%s_mcg", dev->name); + priv->mcg_dentry = debugfs_create_file(name, S_IFREG | S_IRUGO, + ipoib_root, dev, &ipoib_mcg_fops); + if (!priv->mcg_dentry) + ipoib_warn(priv, "failed to create mcg debug file\n"); + + snprintf(name, sizeof name, "%s_path", dev->name); + priv->path_dentry = debugfs_create_file(name, S_IFREG | S_IRUGO, + ipoib_root, dev, &ipoib_path_fops); + if (!priv->path_dentry) + ipoib_warn(priv, "failed to create path debug file\n"); +} + +void ipoib_delete_debug_files(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + + debugfs_remove(priv->mcg_dentry); + debugfs_remove(priv->path_dentry); +} + +int ipoib_register_debugfs(void) +{ + ipoib_root = debugfs_create_dir("ipoib", NULL); + return ipoib_root ? 0 : -ENOMEM; +} + +void ipoib_unregister_debugfs(void) +{ + debugfs_remove(ipoib_root); +} diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c new file mode 100644 index 0000000..72626c3 --- /dev/null +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c @@ -0,0 +1,1106 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies. All rights reserved. + * Copyright (c) 2004, 2005 Voltaire, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include + +#include +#include + +#include "ipoib.h" + +#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG_DATA +static int data_debug_level; + +module_param(data_debug_level, int, 0644); +MODULE_PARM_DESC(data_debug_level, + "Enable data path debug tracing if > 0"); +#endif + +static DEFINE_MUTEX(pkey_mutex); + +struct ipoib_ah *ipoib_create_ah(struct net_device *dev, + struct ib_pd *pd, struct ib_ah_attr *attr) +{ + struct ipoib_ah *ah; + struct ib_ah *vah; + + ah = kmalloc(sizeof *ah, GFP_KERNEL); + if (!ah) + return ERR_PTR(-ENOMEM); + + ah->dev = dev; + ah->last_send = 0; + kref_init(&ah->ref); + + vah = ib_create_ah(pd, attr); + if (IS_ERR(vah)) { + kfree(ah); + ah = (struct ipoib_ah *)vah; + } else { + ah->ah = vah; + ipoib_dbg(netdev_priv(dev), "Created ah %p\n", ah->ah); + } + + return ah; +} + +void ipoib_free_ah(struct kref *kref) +{ + struct ipoib_ah *ah = container_of(kref, struct ipoib_ah, ref); + struct ipoib_dev_priv *priv = netdev_priv(ah->dev); + + unsigned long flags; + + spin_lock_irqsave(&priv->lock, flags); + list_add_tail(&ah->list, &priv->dead_ahs); + spin_unlock_irqrestore(&priv->lock, flags); +} + +static void ipoib_ud_dma_unmap_rx(struct ipoib_dev_priv *priv, + u64 mapping[IPOIB_UD_RX_SG]) +{ + if (ipoib_ud_need_sg(priv->max_ib_mtu)) { + ib_dma_unmap_single(priv->ca, mapping[0], IPOIB_UD_HEAD_SIZE, + DMA_FROM_DEVICE); + ib_dma_unmap_page(priv->ca, mapping[1], PAGE_SIZE, + DMA_FROM_DEVICE); + } else + ib_dma_unmap_single(priv->ca, mapping[0], + IPOIB_UD_BUF_SIZE(priv->max_ib_mtu), + DMA_FROM_DEVICE); +} + +static void ipoib_ud_skb_put_frags(struct ipoib_dev_priv *priv, + struct sk_buff *skb, + unsigned int length) +{ + if (ipoib_ud_need_sg(priv->max_ib_mtu)) { + skb_frag_t *frag = &skb_shinfo(skb)->frags[0]; + unsigned int size; + /* + * There is only two buffers needed for max_payload = 4K, + * first buf size is IPOIB_UD_HEAD_SIZE + */ + skb->tail += IPOIB_UD_HEAD_SIZE; + skb->len += length; + + size = length - IPOIB_UD_HEAD_SIZE; + + skb_frag_size_set(frag, size); + skb->data_len += size; + skb->truesize += PAGE_SIZE; + } else + skb_put(skb, length); + +} + +static int ipoib_ib_post_receive(struct net_device *dev, int id) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ib_recv_wr *bad_wr; + int ret; + + priv->rx_wr.wr_id = id | IPOIB_OP_RECV; + priv->rx_sge[0].addr = priv->rx_ring[id].mapping[0]; + priv->rx_sge[1].addr = priv->rx_ring[id].mapping[1]; + + + ret = ib_post_recv(priv->qp, &priv->rx_wr, &bad_wr); + if (unlikely(ret)) { + ipoib_warn(priv, "receive failed for buf %d (%d)\n", id, ret); + ipoib_ud_dma_unmap_rx(priv, priv->rx_ring[id].mapping); + dev_kfree_skb_any(priv->rx_ring[id].skb); + priv->rx_ring[id].skb = NULL; + } + + return ret; +} + +static struct sk_buff *ipoib_alloc_rx_skb(struct net_device *dev, int id) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct sk_buff *skb; + int buf_size; + int tailroom; + u64 *mapping; + + if (ipoib_ud_need_sg(priv->max_ib_mtu)) { + buf_size = IPOIB_UD_HEAD_SIZE; + tailroom = 128; /* reserve some tailroom for IP/TCP headers */ + } else { + buf_size = IPOIB_UD_BUF_SIZE(priv->max_ib_mtu); + tailroom = 0; + } + + skb = dev_alloc_skb(buf_size + tailroom + 4); + if (unlikely(!skb)) + return NULL; + + /* + * IB will leave a 40 byte gap for a GRH and IPoIB adds a 4 byte + * header. So we need 4 more bytes to get to 48 and align the + * IP header to a multiple of 16. + */ + skb_reserve(skb, 4); + + mapping = priv->rx_ring[id].mapping; + mapping[0] = ib_dma_map_single(priv->ca, skb->data, buf_size, + DMA_FROM_DEVICE); + if (unlikely(ib_dma_mapping_error(priv->ca, mapping[0]))) + goto error; + + if (ipoib_ud_need_sg(priv->max_ib_mtu)) { + struct page *page = alloc_page(GFP_ATOMIC); + if (!page) + goto partial_error; + skb_fill_page_desc(skb, 0, page, 0, PAGE_SIZE); + mapping[1] = + ib_dma_map_page(priv->ca, page, + 0, PAGE_SIZE, DMA_FROM_DEVICE); + if (unlikely(ib_dma_mapping_error(priv->ca, mapping[1]))) + goto partial_error; + } + + priv->rx_ring[id].skb = skb; + return skb; + +partial_error: + ib_dma_unmap_single(priv->ca, mapping[0], buf_size, DMA_FROM_DEVICE); +error: + dev_kfree_skb_any(skb); + return NULL; +} + +static int ipoib_ib_post_receives(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + int i; + + for (i = 0; i < ipoib_recvq_size; ++i) { + if (!ipoib_alloc_rx_skb(dev, i)) { + ipoib_warn(priv, "failed to allocate receive buffer %d\n", i); + return -ENOMEM; + } + if (ipoib_ib_post_receive(dev, i)) { + ipoib_warn(priv, "ipoib_ib_post_receive failed for buf %d\n", i); + return -EIO; + } + } + + return 0; +} + +static void ipoib_ib_handle_rx_wc(struct net_device *dev, struct ib_wc *wc) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + unsigned int wr_id = wc->wr_id & ~IPOIB_OP_RECV; + struct sk_buff *skb; + u64 mapping[IPOIB_UD_RX_SG]; + union ib_gid *dgid; + + ipoib_dbg_data(priv, "recv completion: id %d, status: %d\n", + wr_id, wc->status); + + if (unlikely(wr_id >= ipoib_recvq_size)) { + ipoib_warn(priv, "recv completion event with wrid %d (> %d)\n", + wr_id, ipoib_recvq_size); + return; + } + + skb = priv->rx_ring[wr_id].skb; + + if (unlikely(wc->status != IB_WC_SUCCESS)) { + if (wc->status != IB_WC_WR_FLUSH_ERR) + ipoib_warn(priv, "failed recv event " + "(status=%d, wrid=%d vend_err %x)\n", + wc->status, wr_id, wc->vendor_err); + ipoib_ud_dma_unmap_rx(priv, priv->rx_ring[wr_id].mapping); + dev_kfree_skb_any(skb); + priv->rx_ring[wr_id].skb = NULL; + return; + } + + /* + * Drop packets that this interface sent, ie multicast packets + * that the HCA has replicated. + */ + if (wc->slid == priv->local_lid && wc->src_qp == priv->qp->qp_num) + goto repost; + + memcpy(mapping, priv->rx_ring[wr_id].mapping, + IPOIB_UD_RX_SG * sizeof *mapping); + + /* + * If we can't allocate a new RX buffer, dump + * this packet and reuse the old buffer. + */ + if (unlikely(!ipoib_alloc_rx_skb(dev, wr_id))) { + ++dev->stats.rx_dropped; + goto repost; + } + + ipoib_dbg_data(priv, "received %d bytes, SLID 0x%04x\n", + wc->byte_len, wc->slid); + + ipoib_ud_dma_unmap_rx(priv, mapping); + ipoib_ud_skb_put_frags(priv, skb, wc->byte_len); + + /* First byte of dgid signals multicast when 0xff */ + dgid = &((struct ib_grh *)skb->data)->dgid; + + if (!(wc->wc_flags & IB_WC_GRH) || dgid->raw[0] != 0xff) + skb->pkt_type = PACKET_HOST; + else if (memcmp(dgid, dev->broadcast + 4, sizeof(union ib_gid)) == 0) + skb->pkt_type = PACKET_BROADCAST; + else + skb->pkt_type = PACKET_MULTICAST; + + skb_pull(skb, IB_GRH_BYTES); + + skb->protocol = ((struct ipoib_header *) skb->data)->proto; + skb_reset_mac_header(skb); + skb_pull(skb, IPOIB_ENCAP_LEN); + + ++dev->stats.rx_packets; + dev->stats.rx_bytes += skb->len; + + skb->dev = dev; + if ((dev->features & NETIF_F_RXCSUM) && + likely(wc->wc_flags & IB_WC_IP_CSUM_OK)) + skb->ip_summed = CHECKSUM_UNNECESSARY; + + napi_gro_receive(&priv->napi, skb); + +repost: + if (unlikely(ipoib_ib_post_receive(dev, wr_id))) + ipoib_warn(priv, "ipoib_ib_post_receive failed " + "for buf %d\n", wr_id); +} + +static int ipoib_dma_map_tx(struct ib_device *ca, + struct ipoib_tx_buf *tx_req) +{ + struct sk_buff *skb = tx_req->skb; + u64 *mapping = tx_req->mapping; + int i; + int off; + + if (skb_headlen(skb)) { + mapping[0] = ib_dma_map_single(ca, skb->data, skb_headlen(skb), + DMA_TO_DEVICE); + if (unlikely(ib_dma_mapping_error(ca, mapping[0]))) + return -EIO; + + off = 1; + } else + off = 0; + + for (i = 0; i < skb_shinfo(skb)->nr_frags; ++i) { + const skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + mapping[i + off] = ib_dma_map_page(ca, + skb_frag_page(frag), + frag->page_offset, skb_frag_size(frag), + DMA_TO_DEVICE); + if (unlikely(ib_dma_mapping_error(ca, mapping[i + off]))) + goto partial_error; + } + return 0; + +partial_error: + for (; i > 0; --i) { + const skb_frag_t *frag = &skb_shinfo(skb)->frags[i - 1]; + + ib_dma_unmap_page(ca, mapping[i - !off], skb_frag_size(frag), DMA_TO_DEVICE); + } + + if (off) + ib_dma_unmap_single(ca, mapping[0], skb_headlen(skb), DMA_TO_DEVICE); + + return -EIO; +} + +static void ipoib_dma_unmap_tx(struct ib_device *ca, + struct ipoib_tx_buf *tx_req) +{ + struct sk_buff *skb = tx_req->skb; + u64 *mapping = tx_req->mapping; + int i; + int off; + + if (skb_headlen(skb)) { + ib_dma_unmap_single(ca, mapping[0], skb_headlen(skb), DMA_TO_DEVICE); + off = 1; + } else + off = 0; + + for (i = 0; i < skb_shinfo(skb)->nr_frags; ++i) { + const skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + + ib_dma_unmap_page(ca, mapping[i + off], skb_frag_size(frag), + DMA_TO_DEVICE); + } +} + +static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + unsigned int wr_id = wc->wr_id; + struct ipoib_tx_buf *tx_req; + + ipoib_dbg_data(priv, "send completion: id %d, status: %d\n", + wr_id, wc->status); + + if (unlikely(wr_id >= ipoib_sendq_size)) { + ipoib_warn(priv, "send completion event with wrid %d (> %d)\n", + wr_id, ipoib_sendq_size); + return; + } + + tx_req = &priv->tx_ring[wr_id]; + + ipoib_dma_unmap_tx(priv->ca, tx_req); + + ++dev->stats.tx_packets; + dev->stats.tx_bytes += tx_req->skb->len; + + dev_kfree_skb_any(tx_req->skb); + + ++priv->tx_tail; + if (unlikely(--priv->tx_outstanding == ipoib_sendq_size >> 1) && + netif_queue_stopped(dev) && + test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) + netif_wake_queue(dev); + + if (wc->status != IB_WC_SUCCESS && + wc->status != IB_WC_WR_FLUSH_ERR) + ipoib_warn(priv, "failed send event " + "(status=%d, wrid=%d vend_err %x)\n", + wc->status, wr_id, wc->vendor_err); +} + +static int poll_tx(struct ipoib_dev_priv *priv) +{ + int n, i; + + n = ib_poll_cq(priv->send_cq, MAX_SEND_CQE, priv->send_wc); + for (i = 0; i < n; ++i) + ipoib_ib_handle_tx_wc(priv->dev, priv->send_wc + i); + + return n == MAX_SEND_CQE; +} + +int ipoib_poll(struct napi_struct *napi, int budget) +{ + struct ipoib_dev_priv *priv = container_of(napi, struct ipoib_dev_priv, napi); + struct net_device *dev = priv->dev; + int done; + int t; + int n, i; + + done = 0; + +poll_more: + while (done < budget) { + int max = (budget - done); + + t = min(IPOIB_NUM_WC, max); + n = ib_poll_cq(priv->recv_cq, t, priv->ibwc); + + for (i = 0; i < n; i++) { + struct ib_wc *wc = priv->ibwc + i; + + if (wc->wr_id & IPOIB_OP_RECV) { + ++done; + if (wc->wr_id & IPOIB_OP_CM) + ipoib_cm_handle_rx_wc(dev, wc); + else + ipoib_ib_handle_rx_wc(dev, wc); + } else + ipoib_cm_handle_tx_wc(priv->dev, wc); + } + + if (n != t) + break; + } + + if (done < budget) { + napi_complete(napi); + if (unlikely(ib_req_notify_cq(priv->recv_cq, + IB_CQ_NEXT_COMP | + IB_CQ_REPORT_MISSED_EVENTS)) && + napi_reschedule(napi)) + goto poll_more; + } + + return done; +} + +void ipoib_ib_completion(struct ib_cq *cq, void *dev_ptr) +{ + struct net_device *dev = dev_ptr; + struct ipoib_dev_priv *priv = netdev_priv(dev); + + napi_schedule(&priv->napi); +} + +static void drain_tx_cq(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + + netif_tx_lock(dev); + while (poll_tx(priv)) + ; /* nothing */ + + if (netif_queue_stopped(dev)) + mod_timer(&priv->poll_timer, jiffies + 1); + + netif_tx_unlock(dev); +} + +void ipoib_send_comp_handler(struct ib_cq *cq, void *dev_ptr) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev_ptr); + + mod_timer(&priv->poll_timer, jiffies); +} + +static inline int post_send(struct ipoib_dev_priv *priv, + unsigned int wr_id, + struct ib_ah *address, u32 qpn, + struct ipoib_tx_buf *tx_req, + void *head, int hlen) +{ + struct ib_send_wr *bad_wr; + int i, off; + struct sk_buff *skb = tx_req->skb; + skb_frag_t *frags = skb_shinfo(skb)->frags; + int nr_frags = skb_shinfo(skb)->nr_frags; + u64 *mapping = tx_req->mapping; + + if (skb_headlen(skb)) { + priv->tx_sge[0].addr = mapping[0]; + priv->tx_sge[0].length = skb_headlen(skb); + off = 1; + } else + off = 0; + + for (i = 0; i < nr_frags; ++i) { + priv->tx_sge[i + off].addr = mapping[i + off]; + priv->tx_sge[i + off].length = skb_frag_size(&frags[i]); + } + priv->tx_wr.num_sge = nr_frags + off; + priv->tx_wr.wr_id = wr_id; + priv->tx_wr.wr.ud.remote_qpn = qpn; + priv->tx_wr.wr.ud.ah = address; + + if (head) { + priv->tx_wr.wr.ud.mss = skb_shinfo(skb)->gso_size; + priv->tx_wr.wr.ud.header = head; + priv->tx_wr.wr.ud.hlen = hlen; + priv->tx_wr.opcode = IB_WR_LSO; + } else + priv->tx_wr.opcode = IB_WR_SEND; + + return ib_post_send(priv->qp, &priv->tx_wr, &bad_wr); +} + +void ipoib_send(struct net_device *dev, struct sk_buff *skb, + struct ipoib_ah *address, u32 qpn) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_tx_buf *tx_req; + int hlen, rc; + void *phead; + + if (skb_is_gso(skb)) { + hlen = skb_transport_offset(skb) + tcp_hdrlen(skb); + phead = skb->data; + if (unlikely(!skb_pull(skb, hlen))) { + ipoib_warn(priv, "linear data too small\n"); + ++dev->stats.tx_dropped; + ++dev->stats.tx_errors; + dev_kfree_skb_any(skb); + return; + } + } else { + if (unlikely(skb->len > priv->mcast_mtu + IPOIB_ENCAP_LEN)) { + ipoib_warn(priv, "packet len %d (> %d) too long to send, dropping\n", + skb->len, priv->mcast_mtu + IPOIB_ENCAP_LEN); + ++dev->stats.tx_dropped; + ++dev->stats.tx_errors; + ipoib_cm_skb_too_long(dev, skb, priv->mcast_mtu); + return; + } + phead = NULL; + hlen = 0; + } + + ipoib_dbg_data(priv, "sending packet, length=%d address=%p qpn=0x%06x\n", + skb->len, address, qpn); + + /* + * We put the skb into the tx_ring _before_ we call post_send() + * because it's entirely possible that the completion handler will + * run before we execute anything after the post_send(). That + * means we have to make sure everything is properly recorded and + * our state is consistent before we call post_send(). + */ + tx_req = &priv->tx_ring[priv->tx_head & (ipoib_sendq_size - 1)]; + tx_req->skb = skb; + if (unlikely(ipoib_dma_map_tx(priv->ca, tx_req))) { + ++dev->stats.tx_errors; + dev_kfree_skb_any(skb); + return; + } + + if (skb->ip_summed == CHECKSUM_PARTIAL) + priv->tx_wr.send_flags |= IB_SEND_IP_CSUM; + else + priv->tx_wr.send_flags &= ~IB_SEND_IP_CSUM; + + if (++priv->tx_outstanding == ipoib_sendq_size) { + ipoib_dbg(priv, "TX ring full, stopping kernel net queue\n"); + if (ib_req_notify_cq(priv->send_cq, IB_CQ_NEXT_COMP)) + ipoib_warn(priv, "request notify on send CQ failed\n"); + netif_stop_queue(dev); + } + + skb_orphan(skb); + skb_dst_drop(skb); + + rc = post_send(priv, priv->tx_head & (ipoib_sendq_size - 1), + address->ah, qpn, tx_req, phead, hlen); + if (unlikely(rc)) { + ipoib_warn(priv, "post_send failed, error %d\n", rc); + ++dev->stats.tx_errors; + --priv->tx_outstanding; + ipoib_dma_unmap_tx(priv->ca, tx_req); + dev_kfree_skb_any(skb); + if (netif_queue_stopped(dev)) + netif_wake_queue(dev); + } else { + dev->trans_start = jiffies; + + address->last_send = priv->tx_head; + ++priv->tx_head; + } + + if (unlikely(priv->tx_outstanding > MAX_SEND_CQE)) + while (poll_tx(priv)) + ; /* nothing */ +} + +static void __ipoib_reap_ah(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_ah *ah, *tah; + LIST_HEAD(remove_list); + unsigned long flags; + + netif_tx_lock_bh(dev); + spin_lock_irqsave(&priv->lock, flags); + + list_for_each_entry_safe(ah, tah, &priv->dead_ahs, list) + if ((int) priv->tx_tail - (int) ah->last_send >= 0) { + list_del(&ah->list); + ib_destroy_ah(ah->ah); + kfree(ah); + } + + spin_unlock_irqrestore(&priv->lock, flags); + netif_tx_unlock_bh(dev); +} + +void ipoib_reap_ah(struct work_struct *work) +{ + struct ipoib_dev_priv *priv = + container_of(work, struct ipoib_dev_priv, ah_reap_task.work); + struct net_device *dev = priv->dev; + + __ipoib_reap_ah(dev); + + if (!test_bit(IPOIB_STOP_REAPER, &priv->flags)) + queue_delayed_work(ipoib_workqueue, &priv->ah_reap_task, + round_jiffies_relative(HZ)); +} + +static void ipoib_ib_tx_timer_func(unsigned long ctx) +{ + drain_tx_cq((struct net_device *)ctx); +} + +int ipoib_ib_dev_open(struct net_device *dev, int flush) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + int ret; + + ipoib_pkey_dev_check_presence(dev); + + if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) { + ipoib_warn(priv, "P_Key 0x%04x is %s\n", priv->pkey, + (!(priv->pkey & 0x7fff) ? "Invalid" : "not found")); + return -1; + } + + ret = ipoib_init_qp(dev); + if (ret) { + ipoib_warn(priv, "ipoib_init_qp returned %d\n", ret); + return -1; + } + + ret = ipoib_ib_post_receives(dev); + if (ret) { + ipoib_warn(priv, "ipoib_ib_post_receives returned %d\n", ret); + goto dev_stop; + } + + ret = ipoib_cm_dev_open(dev); + if (ret) { + ipoib_warn(priv, "ipoib_cm_dev_open returned %d\n", ret); + goto dev_stop; + } + + clear_bit(IPOIB_STOP_REAPER, &priv->flags); + queue_delayed_work(ipoib_workqueue, &priv->ah_reap_task, + round_jiffies_relative(HZ)); + + if (!test_and_set_bit(IPOIB_FLAG_INITIALIZED, &priv->flags)) + napi_enable(&priv->napi); + + return 0; +dev_stop: + if (!test_and_set_bit(IPOIB_FLAG_INITIALIZED, &priv->flags)) + napi_enable(&priv->napi); + ipoib_ib_dev_stop(dev, flush); + return -1; +} + +void ipoib_pkey_dev_check_presence(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + + if (!(priv->pkey & 0x7fff) || + ib_find_pkey(priv->ca, priv->port, priv->pkey, + &priv->pkey_index)) + clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags); + else + set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags); +} + +int ipoib_ib_dev_up(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + + ipoib_pkey_dev_check_presence(dev); + + if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) { + ipoib_dbg(priv, "PKEY is not assigned.\n"); + return 0; + } + + set_bit(IPOIB_FLAG_OPER_UP, &priv->flags); + + return ipoib_mcast_start_thread(dev); +} + +int ipoib_ib_dev_down(struct net_device *dev, int flush) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + + ipoib_dbg(priv, "downing ib_dev\n"); + + clear_bit(IPOIB_FLAG_OPER_UP, &priv->flags); + netif_carrier_off(dev); + + ipoib_mcast_stop_thread(dev, flush); + ipoib_mcast_dev_flush(dev); + + ipoib_flush_paths(dev); + + return 0; +} + +static int recvs_pending(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + int pending = 0; + int i; + + for (i = 0; i < ipoib_recvq_size; ++i) + if (priv->rx_ring[i].skb) + ++pending; + + return pending; +} + +void ipoib_drain_cq(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + int i, n; + + /* + * We call completion handling routines that expect to be + * called from the BH-disabled NAPI poll context, so disable + * BHs here too. + */ + local_bh_disable(); + + do { + n = ib_poll_cq(priv->recv_cq, IPOIB_NUM_WC, priv->ibwc); + for (i = 0; i < n; ++i) { + /* + * Convert any successful completions to flush + * errors to avoid passing packets up the + * stack after bringing the device down. + */ + if (priv->ibwc[i].status == IB_WC_SUCCESS) + priv->ibwc[i].status = IB_WC_WR_FLUSH_ERR; + + if (priv->ibwc[i].wr_id & IPOIB_OP_RECV) { + if (priv->ibwc[i].wr_id & IPOIB_OP_CM) + ipoib_cm_handle_rx_wc(dev, priv->ibwc + i); + else + ipoib_ib_handle_rx_wc(dev, priv->ibwc + i); + } else + ipoib_cm_handle_tx_wc(dev, priv->ibwc + i); + } + } while (n == IPOIB_NUM_WC); + + while (poll_tx(priv)) + ; /* nothing */ + + local_bh_enable(); +} + +int ipoib_ib_dev_stop(struct net_device *dev, int flush) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ib_qp_attr qp_attr; + unsigned long begin; + struct ipoib_tx_buf *tx_req; + int i; + + if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &priv->flags)) + napi_disable(&priv->napi); + + ipoib_cm_dev_stop(dev); + + /* + * Move our QP to the error state and then reinitialize in + * when all work requests have completed or have been flushed. + */ + qp_attr.qp_state = IB_QPS_ERR; + if (ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE)) + ipoib_warn(priv, "Failed to modify QP to ERROR state\n"); + + /* Wait for all sends and receives to complete */ + begin = jiffies; + + while (priv->tx_head != priv->tx_tail || recvs_pending(dev)) { + if (time_after(jiffies, begin + 5 * HZ)) { + ipoib_warn(priv, "timing out; %d sends %d receives not completed\n", + priv->tx_head - priv->tx_tail, recvs_pending(dev)); + + /* + * assume the HW is wedged and just free up + * all our pending work requests. + */ + while ((int) priv->tx_tail - (int) priv->tx_head < 0) { + tx_req = &priv->tx_ring[priv->tx_tail & + (ipoib_sendq_size - 1)]; + ipoib_dma_unmap_tx(priv->ca, tx_req); + dev_kfree_skb_any(tx_req->skb); + ++priv->tx_tail; + --priv->tx_outstanding; + } + + for (i = 0; i < ipoib_recvq_size; ++i) { + struct ipoib_rx_buf *rx_req; + + rx_req = &priv->rx_ring[i]; + if (!rx_req->skb) + continue; + ipoib_ud_dma_unmap_rx(priv, + priv->rx_ring[i].mapping); + dev_kfree_skb_any(rx_req->skb); + rx_req->skb = NULL; + } + + goto timeout; + } + + ipoib_drain_cq(dev); + + msleep(1); + } + + ipoib_dbg(priv, "All sends and receives done.\n"); + +timeout: + del_timer_sync(&priv->poll_timer); + qp_attr.qp_state = IB_QPS_RESET; + if (ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE)) + ipoib_warn(priv, "Failed to modify QP to RESET state\n"); + + /* Wait for all AHs to be reaped */ + set_bit(IPOIB_STOP_REAPER, &priv->flags); + cancel_delayed_work(&priv->ah_reap_task); + if (flush) + flush_workqueue(ipoib_workqueue); + + begin = jiffies; + + while (!list_empty(&priv->dead_ahs)) { + __ipoib_reap_ah(dev); + + if (time_after(jiffies, begin + HZ)) { + ipoib_warn(priv, "timing out; will leak address handles\n"); + break; + } + + msleep(1); + } + + ib_req_notify_cq(priv->recv_cq, IB_CQ_NEXT_COMP); + + return 0; +} + +int ipoib_ib_dev_init(struct net_device *dev, struct ib_device *ca, int port) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + + priv->ca = ca; + priv->port = port; + priv->qp = NULL; + + if (ipoib_transport_dev_init(dev, ca)) { + printk(KERN_WARNING "%s: ipoib_transport_dev_init failed\n", ca->name); + return -ENODEV; + } + + setup_timer(&priv->poll_timer, ipoib_ib_tx_timer_func, + (unsigned long) dev); + + if (dev->flags & IFF_UP) { + if (ipoib_ib_dev_open(dev, 1)) { + ipoib_transport_dev_cleanup(dev); + return -ENODEV; + } + } + + return 0; +} + +/* + * Takes whatever value which is in pkey index 0 and updates priv->pkey + * returns 0 if the pkey value was changed. + */ +static inline int update_parent_pkey(struct ipoib_dev_priv *priv) +{ + int result; + u16 prev_pkey; + + prev_pkey = priv->pkey; + result = ib_query_pkey(priv->ca, priv->port, 0, &priv->pkey); + if (result) { + ipoib_warn(priv, "ib_query_pkey port %d failed (ret = %d)\n", + priv->port, result); + return result; + } + + priv->pkey |= 0x8000; + + if (prev_pkey != priv->pkey) { + ipoib_dbg(priv, "pkey changed from 0x%x to 0x%x\n", + prev_pkey, priv->pkey); + /* + * Update the pkey in the broadcast address, while making sure to set + * the full membership bit, so that we join the right broadcast group. + */ + priv->dev->broadcast[8] = priv->pkey >> 8; + priv->dev->broadcast[9] = priv->pkey & 0xff; + return 0; + } + + return 1; +} +/* + * returns 0 if pkey value was found in a different slot. + */ +static inline int update_child_pkey(struct ipoib_dev_priv *priv) +{ + u16 old_index = priv->pkey_index; + + priv->pkey_index = 0; + ipoib_pkey_dev_check_presence(priv->dev); + + if (test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags) && + (old_index == priv->pkey_index)) + return 1; + return 0; +} + +static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, + enum ipoib_flush_level level) +{ + struct ipoib_dev_priv *cpriv; + struct net_device *dev = priv->dev; + int result; + + down_read(&priv->vlan_rwsem); + + /* + * Flush any child interfaces too -- they might be up even if + * the parent is down. + */ + list_for_each_entry(cpriv, &priv->child_intfs, list) + __ipoib_ib_dev_flush(cpriv, level); + + up_read(&priv->vlan_rwsem); + + if (!test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags) && + level != IPOIB_FLUSH_HEAVY) { + ipoib_dbg(priv, "Not flushing - IPOIB_FLAG_INITIALIZED not set.\n"); + return; + } + + if (!test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) { + /* interface is down. update pkey and leave. */ + if (level == IPOIB_FLUSH_HEAVY) { + if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) + update_parent_pkey(priv); + else + update_child_pkey(priv); + } + ipoib_dbg(priv, "Not flushing - IPOIB_FLAG_ADMIN_UP not set.\n"); + return; + } + + if (level == IPOIB_FLUSH_HEAVY) { + /* child devices chase their origin pkey value, while non-child + * (parent) devices should always takes what present in pkey index 0 + */ + if (test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) { + result = update_child_pkey(priv); + if (result) { + /* restart QP only if P_Key index is changed */ + ipoib_dbg(priv, "Not flushing - P_Key index not changed.\n"); + return; + } + + } else { + result = update_parent_pkey(priv); + /* restart QP only if P_Key value changed */ + if (result) { + ipoib_dbg(priv, "Not flushing - P_Key value not changed.\n"); + return; + } + } + } + + if (level == IPOIB_FLUSH_LIGHT) { + ipoib_mark_paths_invalid(dev); + ipoib_mcast_dev_flush(dev); + } + + if (level >= IPOIB_FLUSH_NORMAL) + ipoib_ib_dev_down(dev, 0); + + if (level == IPOIB_FLUSH_HEAVY) { + if (test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags)) + ipoib_ib_dev_stop(dev, 0); + if (ipoib_ib_dev_open(dev, 0) != 0) + return; + if (netif_queue_stopped(dev)) + netif_start_queue(dev); + } + + /* + * The device could have been brought down between the start and when + * we get here, don't bring it back up if it's not configured up + */ + if (test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) { + if (level >= IPOIB_FLUSH_NORMAL) + ipoib_ib_dev_up(dev); + ipoib_mcast_restart_task(&priv->restart_task); + } +} + +void ipoib_ib_dev_flush_light(struct work_struct *work) +{ + struct ipoib_dev_priv *priv = + container_of(work, struct ipoib_dev_priv, flush_light); + + __ipoib_ib_dev_flush(priv, IPOIB_FLUSH_LIGHT); +} + +void ipoib_ib_dev_flush_normal(struct work_struct *work) +{ + struct ipoib_dev_priv *priv = + container_of(work, struct ipoib_dev_priv, flush_normal); + + __ipoib_ib_dev_flush(priv, IPOIB_FLUSH_NORMAL); +} + +void ipoib_ib_dev_flush_heavy(struct work_struct *work) +{ + struct ipoib_dev_priv *priv = + container_of(work, struct ipoib_dev_priv, flush_heavy); + + __ipoib_ib_dev_flush(priv, IPOIB_FLUSH_HEAVY); +} + +void ipoib_ib_dev_cleanup(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + + ipoib_dbg(priv, "cleaning up ib_dev\n"); + /* + * We must make sure there are no more (path) completions + * that may wish to touch priv fields that are no longer valid + */ + ipoib_flush_paths(dev); + + ipoib_mcast_stop_thread(dev, 1); + ipoib_mcast_dev_flush(dev); + + ipoib_transport_dev_cleanup(dev); +} + + diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c new file mode 100644 index 0000000..58b5aa3 --- /dev/null +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -0,0 +1,1793 @@ +/* + * Copyright (c) 2004 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2004 Voltaire, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "ipoib.h" + +#include + +#include +#include +#include +#include + +#include /* For ARPHRD_xxx */ + +#include +#include + +#include +#include + +#define DRV_VERSION "1.0.0" + +const char ipoib_driver_version[] = DRV_VERSION; + +MODULE_AUTHOR("Roland Dreier"); +MODULE_DESCRIPTION("IP-over-InfiniBand net driver"); +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_VERSION(DRV_VERSION); + +int ipoib_sendq_size __read_mostly = IPOIB_TX_RING_SIZE; +int ipoib_recvq_size __read_mostly = IPOIB_RX_RING_SIZE; + +module_param_named(send_queue_size, ipoib_sendq_size, int, 0444); +MODULE_PARM_DESC(send_queue_size, "Number of descriptors in send queue"); +module_param_named(recv_queue_size, ipoib_recvq_size, int, 0444); +MODULE_PARM_DESC(recv_queue_size, "Number of descriptors in receive queue"); + +#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG +int ipoib_debug_level; + +module_param_named(debug_level, ipoib_debug_level, int, 0644); +MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0"); +#endif + +struct ipoib_path_iter { + struct net_device *dev; + struct ipoib_path path; +}; + +static const u8 ipv4_bcast_addr[] = { + 0x00, 0xff, 0xff, 0xff, + 0xff, 0x12, 0x40, 0x1b, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff +}; + +struct workqueue_struct *ipoib_workqueue; + +struct ib_sa_client ipoib_sa_client; + +static void ipoib_add_one(struct ib_device *device); +static void ipoib_remove_one(struct ib_device *device); +static void ipoib_neigh_reclaim(struct rcu_head *rp); + +static struct ib_client ipoib_client = { + .name = "ipoib", + .add = ipoib_add_one, + .remove = ipoib_remove_one +}; + +int ipoib_open(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + + ipoib_dbg(priv, "bringing up interface\n"); + + netif_carrier_off(dev); + + set_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags); + + if (ipoib_ib_dev_open(dev, 1)) { + if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) + return 0; + goto err_disable; + } + + if (ipoib_ib_dev_up(dev)) + goto err_stop; + + if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) { + struct ipoib_dev_priv *cpriv; + + /* Bring up any child interfaces too */ + down_read(&priv->vlan_rwsem); + list_for_each_entry(cpriv, &priv->child_intfs, list) { + int flags; + + flags = cpriv->dev->flags; + if (flags & IFF_UP) + continue; + + dev_change_flags(cpriv->dev, flags | IFF_UP); + } + up_read(&priv->vlan_rwsem); + } + + netif_start_queue(dev); + + return 0; + +err_stop: + ipoib_ib_dev_stop(dev, 1); + +err_disable: + clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags); + + return -EINVAL; +} + +static int ipoib_stop(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + + ipoib_dbg(priv, "stopping interface\n"); + + clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags); + + netif_stop_queue(dev); + + ipoib_ib_dev_down(dev, 1); + ipoib_ib_dev_stop(dev, 0); + + if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) { + struct ipoib_dev_priv *cpriv; + + /* Bring down any child interfaces too */ + down_read(&priv->vlan_rwsem); + list_for_each_entry(cpriv, &priv->child_intfs, list) { + int flags; + + flags = cpriv->dev->flags; + if (!(flags & IFF_UP)) + continue; + + dev_change_flags(cpriv->dev, flags & ~IFF_UP); + } + up_read(&priv->vlan_rwsem); + } + + return 0; +} + +static void ipoib_uninit(struct net_device *dev) +{ + ipoib_dev_cleanup(dev); +} + +static netdev_features_t ipoib_fix_features(struct net_device *dev, netdev_features_t features) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + + if (test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags)) + features &= ~(NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_TSO); + + return features; +} + +static int ipoib_change_mtu(struct net_device *dev, int new_mtu) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + + /* dev->mtu > 2K ==> connected mode */ + if (ipoib_cm_admin_enabled(dev)) { + if (new_mtu > ipoib_cm_max_mtu(dev)) + return -EINVAL; + + if (new_mtu > priv->mcast_mtu) + ipoib_warn(priv, "mtu > %d will cause multicast packet drops.\n", + priv->mcast_mtu); + + dev->mtu = new_mtu; + return 0; + } + + if (new_mtu > IPOIB_UD_MTU(priv->max_ib_mtu)) + return -EINVAL; + + priv->admin_mtu = new_mtu; + + dev->mtu = min(priv->mcast_mtu, priv->admin_mtu); + + return 0; +} + +int ipoib_set_mode(struct net_device *dev, const char *buf) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + + /* flush paths if we switch modes so that connections are restarted */ + if (IPOIB_CM_SUPPORTED(dev->dev_addr) && !strcmp(buf, "connected\n")) { + set_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags); + ipoib_warn(priv, "enabling connected mode " + "will cause multicast packet drops\n"); + netdev_update_features(dev); + rtnl_unlock(); + priv->tx_wr.send_flags &= ~IB_SEND_IP_CSUM; + + ipoib_flush_paths(dev); + rtnl_lock(); + return 0; + } + + if (!strcmp(buf, "datagram\n")) { + clear_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags); + netdev_update_features(dev); + dev_set_mtu(dev, min(priv->mcast_mtu, dev->mtu)); + rtnl_unlock(); + ipoib_flush_paths(dev); + rtnl_lock(); + return 0; + } + + return -EINVAL; +} + +static struct ipoib_path *__path_find(struct net_device *dev, void *gid) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct rb_node *n = priv->path_tree.rb_node; + struct ipoib_path *path; + int ret; + + while (n) { + path = rb_entry(n, struct ipoib_path, rb_node); + + ret = memcmp(gid, path->pathrec.dgid.raw, + sizeof (union ib_gid)); + + if (ret < 0) + n = n->rb_left; + else if (ret > 0) + n = n->rb_right; + else + return path; + } + + return NULL; +} + +static int __path_add(struct net_device *dev, struct ipoib_path *path) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct rb_node **n = &priv->path_tree.rb_node; + struct rb_node *pn = NULL; + struct ipoib_path *tpath; + int ret; + + while (*n) { + pn = *n; + tpath = rb_entry(pn, struct ipoib_path, rb_node); + + ret = memcmp(path->pathrec.dgid.raw, tpath->pathrec.dgid.raw, + sizeof (union ib_gid)); + if (ret < 0) + n = &pn->rb_left; + else if (ret > 0) + n = &pn->rb_right; + else + return -EEXIST; + } + + rb_link_node(&path->rb_node, pn, n); + rb_insert_color(&path->rb_node, &priv->path_tree); + + list_add_tail(&path->list, &priv->path_list); + + return 0; +} + +static void path_free(struct net_device *dev, struct ipoib_path *path) +{ + struct sk_buff *skb; + + while ((skb = __skb_dequeue(&path->queue))) + dev_kfree_skb_irq(skb); + + ipoib_dbg(netdev_priv(dev), "path_free\n"); + + /* remove all neigh connected to this path */ + ipoib_del_neighs_by_gid(dev, path->pathrec.dgid.raw); + + if (path->ah) + ipoib_put_ah(path->ah); + + kfree(path); +} + +#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG + +struct ipoib_path_iter *ipoib_path_iter_init(struct net_device *dev) +{ + struct ipoib_path_iter *iter; + + iter = kmalloc(sizeof *iter, GFP_KERNEL); + if (!iter) + return NULL; + + iter->dev = dev; + memset(iter->path.pathrec.dgid.raw, 0, 16); + + if (ipoib_path_iter_next(iter)) { + kfree(iter); + return NULL; + } + + return iter; +} + +int ipoib_path_iter_next(struct ipoib_path_iter *iter) +{ + struct ipoib_dev_priv *priv = netdev_priv(iter->dev); + struct rb_node *n; + struct ipoib_path *path; + int ret = 1; + + spin_lock_irq(&priv->lock); + + n = rb_first(&priv->path_tree); + + while (n) { + path = rb_entry(n, struct ipoib_path, rb_node); + + if (memcmp(iter->path.pathrec.dgid.raw, path->pathrec.dgid.raw, + sizeof (union ib_gid)) < 0) { + iter->path = *path; + ret = 0; + break; + } + + n = rb_next(n); + } + + spin_unlock_irq(&priv->lock); + + return ret; +} + +void ipoib_path_iter_read(struct ipoib_path_iter *iter, + struct ipoib_path *path) +{ + *path = iter->path; +} + +#endif /* CONFIG_INFINIBAND_IPOIB_DEBUG */ + +void ipoib_mark_paths_invalid(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_path *path, *tp; + + spin_lock_irq(&priv->lock); + + list_for_each_entry_safe(path, tp, &priv->path_list, list) { + ipoib_dbg(priv, "mark path LID 0x%04x GID %pI6 invalid\n", + be16_to_cpu(path->pathrec.dlid), + path->pathrec.dgid.raw); + path->valid = 0; + } + + spin_unlock_irq(&priv->lock); +} + +void ipoib_flush_paths(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_path *path, *tp; + LIST_HEAD(remove_list); + unsigned long flags; + + netif_tx_lock_bh(dev); + spin_lock_irqsave(&priv->lock, flags); + + list_splice_init(&priv->path_list, &remove_list); + + list_for_each_entry(path, &remove_list, list) + rb_erase(&path->rb_node, &priv->path_tree); + + list_for_each_entry_safe(path, tp, &remove_list, list) { + if (path->query) + ib_sa_cancel_query(path->query_id, path->query); + spin_unlock_irqrestore(&priv->lock, flags); + netif_tx_unlock_bh(dev); + wait_for_completion(&path->done); + path_free(dev, path); + netif_tx_lock_bh(dev); + spin_lock_irqsave(&priv->lock, flags); + } + + spin_unlock_irqrestore(&priv->lock, flags); + netif_tx_unlock_bh(dev); +} + +static void path_rec_completion(int status, + struct ib_sa_path_rec *pathrec, + void *path_ptr) +{ + struct ipoib_path *path = path_ptr; + struct net_device *dev = path->dev; + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_ah *ah = NULL; + struct ipoib_ah *old_ah = NULL; + struct ipoib_neigh *neigh, *tn; + struct sk_buff_head skqueue; + struct sk_buff *skb; + unsigned long flags; + + if (!status) + ipoib_dbg(priv, "PathRec LID 0x%04x for GID %pI6\n", + be16_to_cpu(pathrec->dlid), pathrec->dgid.raw); + else + ipoib_dbg(priv, "PathRec status %d for GID %pI6\n", + status, path->pathrec.dgid.raw); + + skb_queue_head_init(&skqueue); + + if (!status) { + struct ib_ah_attr av; + + if (!ib_init_ah_from_path(priv->ca, priv->port, pathrec, &av)) + ah = ipoib_create_ah(dev, priv->pd, &av); + } + + spin_lock_irqsave(&priv->lock, flags); + + if (!IS_ERR_OR_NULL(ah)) { + path->pathrec = *pathrec; + + old_ah = path->ah; + path->ah = ah; + + ipoib_dbg(priv, "created address handle %p for LID 0x%04x, SL %d\n", + ah, be16_to_cpu(pathrec->dlid), pathrec->sl); + + while ((skb = __skb_dequeue(&path->queue))) + __skb_queue_tail(&skqueue, skb); + + list_for_each_entry_safe(neigh, tn, &path->neigh_list, list) { + if (neigh->ah) { + WARN_ON(neigh->ah != old_ah); + /* + * Dropping the ah reference inside + * priv->lock is safe here, because we + * will hold one more reference from + * the original value of path->ah (ie + * old_ah). + */ + ipoib_put_ah(neigh->ah); + } + kref_get(&path->ah->ref); + neigh->ah = path->ah; + + if (ipoib_cm_enabled(dev, neigh->daddr)) { + if (!ipoib_cm_get(neigh)) + ipoib_cm_set(neigh, ipoib_cm_create_tx(dev, + path, + neigh)); + if (!ipoib_cm_get(neigh)) { + ipoib_neigh_free(neigh); + continue; + } + } + + while ((skb = __skb_dequeue(&neigh->queue))) + __skb_queue_tail(&skqueue, skb); + } + path->valid = 1; + } + + path->query = NULL; + complete(&path->done); + + spin_unlock_irqrestore(&priv->lock, flags); + + if (IS_ERR_OR_NULL(ah)) + ipoib_del_neighs_by_gid(dev, path->pathrec.dgid.raw); + + if (old_ah) + ipoib_put_ah(old_ah); + + while ((skb = __skb_dequeue(&skqueue))) { + skb->dev = dev; + if (dev_queue_xmit(skb)) + ipoib_warn(priv, "dev_queue_xmit failed " + "to requeue packet\n"); + } +} + +static struct ipoib_path *path_rec_create(struct net_device *dev, void *gid) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_path *path; + + if (!priv->broadcast) + return NULL; + + path = kzalloc(sizeof *path, GFP_ATOMIC); + if (!path) + return NULL; + + path->dev = dev; + + skb_queue_head_init(&path->queue); + + INIT_LIST_HEAD(&path->neigh_list); + + memcpy(path->pathrec.dgid.raw, gid, sizeof (union ib_gid)); + path->pathrec.sgid = priv->local_gid; + path->pathrec.pkey = cpu_to_be16(priv->pkey); + path->pathrec.numb_path = 1; + path->pathrec.traffic_class = priv->broadcast->mcmember.traffic_class; + + return path; +} + +static int path_rec_start(struct net_device *dev, + struct ipoib_path *path) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + + ipoib_dbg(priv, "Start path record lookup for %pI6\n", + path->pathrec.dgid.raw); + + init_completion(&path->done); + + path->query_id = + ib_sa_path_rec_get(&ipoib_sa_client, priv->ca, priv->port, + &path->pathrec, + IB_SA_PATH_REC_DGID | + IB_SA_PATH_REC_SGID | + IB_SA_PATH_REC_NUMB_PATH | + IB_SA_PATH_REC_TRAFFIC_CLASS | + IB_SA_PATH_REC_PKEY, + 1000, GFP_ATOMIC, + path_rec_completion, + path, &path->query); + if (path->query_id < 0) { + ipoib_warn(priv, "ib_sa_path_rec_get failed: %d\n", path->query_id); + path->query = NULL; + complete(&path->done); + return path->query_id; + } + + return 0; +} + +static void neigh_add_path(struct sk_buff *skb, u8 *daddr, + struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_path *path; + struct ipoib_neigh *neigh; + unsigned long flags; + + spin_lock_irqsave(&priv->lock, flags); + neigh = ipoib_neigh_alloc(daddr, dev); + if (!neigh) { + spin_unlock_irqrestore(&priv->lock, flags); + ++dev->stats.tx_dropped; + dev_kfree_skb_any(skb); + return; + } + + path = __path_find(dev, daddr + 4); + if (!path) { + path = path_rec_create(dev, daddr + 4); + if (!path) + goto err_path; + + __path_add(dev, path); + } + + list_add_tail(&neigh->list, &path->neigh_list); + + if (path->ah) { + kref_get(&path->ah->ref); + neigh->ah = path->ah; + + if (ipoib_cm_enabled(dev, neigh->daddr)) { + if (!ipoib_cm_get(neigh)) + ipoib_cm_set(neigh, ipoib_cm_create_tx(dev, path, neigh)); + if (!ipoib_cm_get(neigh)) { + ipoib_neigh_free(neigh); + goto err_drop; + } + if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) + __skb_queue_tail(&neigh->queue, skb); + else { + ipoib_warn(priv, "queue length limit %d. Packet drop.\n", + skb_queue_len(&neigh->queue)); + goto err_drop; + } + } else { + spin_unlock_irqrestore(&priv->lock, flags); + ipoib_send(dev, skb, path->ah, IPOIB_QPN(daddr)); + ipoib_neigh_put(neigh); + return; + } + } else { + neigh->ah = NULL; + + if (!path->query && path_rec_start(dev, path)) + goto err_path; + + __skb_queue_tail(&neigh->queue, skb); + } + + spin_unlock_irqrestore(&priv->lock, flags); + ipoib_neigh_put(neigh); + return; + +err_path: + ipoib_neigh_free(neigh); +err_drop: + ++dev->stats.tx_dropped; + dev_kfree_skb_any(skb); + + spin_unlock_irqrestore(&priv->lock, flags); + ipoib_neigh_put(neigh); +} + +static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev, + struct ipoib_cb *cb) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_path *path; + unsigned long flags; + + spin_lock_irqsave(&priv->lock, flags); + + path = __path_find(dev, cb->hwaddr + 4); + if (!path || !path->valid) { + int new_path = 0; + + if (!path) { + path = path_rec_create(dev, cb->hwaddr + 4); + new_path = 1; + } + if (path) { + __skb_queue_tail(&path->queue, skb); + + if (!path->query && path_rec_start(dev, path)) { + spin_unlock_irqrestore(&priv->lock, flags); + if (new_path) + path_free(dev, path); + return; + } else + __path_add(dev, path); + } else { + ++dev->stats.tx_dropped; + dev_kfree_skb_any(skb); + } + + spin_unlock_irqrestore(&priv->lock, flags); + return; + } + + if (path->ah) { + ipoib_dbg(priv, "Send unicast ARP to %04x\n", + be16_to_cpu(path->pathrec.dlid)); + + spin_unlock_irqrestore(&priv->lock, flags); + ipoib_send(dev, skb, path->ah, IPOIB_QPN(cb->hwaddr)); + return; + } else if ((path->query || !path_rec_start(dev, path)) && + skb_queue_len(&path->queue) < IPOIB_MAX_PATH_REC_QUEUE) { + __skb_queue_tail(&path->queue, skb); + } else { + ++dev->stats.tx_dropped; + dev_kfree_skb_any(skb); + } + + spin_unlock_irqrestore(&priv->lock, flags); +} + +static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_neigh *neigh; + struct ipoib_cb *cb = ipoib_skb_cb(skb); + struct ipoib_header *header; + unsigned long flags; + + header = (struct ipoib_header *) skb->data; + + if (unlikely(cb->hwaddr[4] == 0xff)) { + /* multicast, arrange "if" according to probability */ + if ((header->proto != htons(ETH_P_IP)) && + (header->proto != htons(ETH_P_IPV6)) && + (header->proto != htons(ETH_P_ARP)) && + (header->proto != htons(ETH_P_RARP)) && + (header->proto != htons(ETH_P_TIPC))) { + /* ethertype not supported by IPoIB */ + ++dev->stats.tx_dropped; + dev_kfree_skb_any(skb); + return NETDEV_TX_OK; + } + /* Add in the P_Key for multicast*/ + cb->hwaddr[8] = (priv->pkey >> 8) & 0xff; + cb->hwaddr[9] = priv->pkey & 0xff; + + neigh = ipoib_neigh_get(dev, cb->hwaddr); + if (likely(neigh)) + goto send_using_neigh; + ipoib_mcast_send(dev, cb->hwaddr, skb); + return NETDEV_TX_OK; + } + + /* unicast, arrange "switch" according to probability */ + switch (header->proto) { + case htons(ETH_P_IP): + case htons(ETH_P_IPV6): + case htons(ETH_P_TIPC): + neigh = ipoib_neigh_get(dev, cb->hwaddr); + if (unlikely(!neigh)) { + neigh_add_path(skb, cb->hwaddr, dev); + return NETDEV_TX_OK; + } + break; + case htons(ETH_P_ARP): + case htons(ETH_P_RARP): + /* for unicast ARP and RARP should always perform path find */ + unicast_arp_send(skb, dev, cb); + return NETDEV_TX_OK; + default: + /* ethertype not supported by IPoIB */ + ++dev->stats.tx_dropped; + dev_kfree_skb_any(skb); + return NETDEV_TX_OK; + } + +send_using_neigh: + /* note we now hold a ref to neigh */ + if (ipoib_cm_get(neigh)) { + if (ipoib_cm_up(neigh)) { + ipoib_cm_send(dev, skb, ipoib_cm_get(neigh)); + goto unref; + } + } else if (neigh->ah) { + ipoib_send(dev, skb, neigh->ah, IPOIB_QPN(cb->hwaddr)); + goto unref; + } + + if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) { + spin_lock_irqsave(&priv->lock, flags); + __skb_queue_tail(&neigh->queue, skb); + spin_unlock_irqrestore(&priv->lock, flags); + } else { + ++dev->stats.tx_dropped; + dev_kfree_skb_any(skb); + } + +unref: + ipoib_neigh_put(neigh); + + return NETDEV_TX_OK; +} + +static void ipoib_timeout(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + + ipoib_warn(priv, "transmit timeout: latency %d msecs\n", + jiffies_to_msecs(jiffies - dev->trans_start)); + ipoib_warn(priv, "queue stopped %d, tx_head %u, tx_tail %u\n", + netif_queue_stopped(dev), + priv->tx_head, priv->tx_tail); + /* XXX reset QP, etc. */ +} + +static int ipoib_hard_header(struct sk_buff *skb, + struct net_device *dev, + unsigned short type, + const void *daddr, const void *saddr, unsigned len) +{ + struct ipoib_header *header; + struct ipoib_cb *cb = ipoib_skb_cb(skb); + + header = (struct ipoib_header *) skb_push(skb, sizeof *header); + + header->proto = htons(type); + header->reserved = 0; + + /* + * we don't rely on dst_entry structure, always stuff the + * destination address into skb->cb so we can figure out where + * to send the packet later. + */ + memcpy(cb->hwaddr, daddr, INFINIBAND_ALEN); + + return sizeof *header; +} + +static void ipoib_set_mcast_list(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + + if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) { + ipoib_dbg(priv, "IPOIB_FLAG_OPER_UP not set"); + return; + } + + queue_work(ipoib_workqueue, &priv->restart_task); +} + +static u32 ipoib_addr_hash(struct ipoib_neigh_hash *htbl, u8 *daddr) +{ + /* + * Use only the address parts that contributes to spreading + * The subnet prefix is not used as one can not connect to + * same remote port (GUID) using the same remote QPN via two + * different subnets. + */ + /* qpn octets[1:4) & port GUID octets[12:20) */ + u32 *d32 = (u32 *) daddr; + u32 hv; + + hv = jhash_3words(d32[3], d32[4], IPOIB_QPN_MASK & d32[0], 0); + return hv & htbl->mask; +} + +struct ipoib_neigh *ipoib_neigh_get(struct net_device *dev, u8 *daddr) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_neigh_table *ntbl = &priv->ntbl; + struct ipoib_neigh_hash *htbl; + struct ipoib_neigh *neigh = NULL; + u32 hash_val; + + rcu_read_lock_bh(); + + htbl = rcu_dereference_bh(ntbl->htbl); + + if (!htbl) + goto out_unlock; + + hash_val = ipoib_addr_hash(htbl, daddr); + for (neigh = rcu_dereference_bh(htbl->buckets[hash_val]); + neigh != NULL; + neigh = rcu_dereference_bh(neigh->hnext)) { + if (memcmp(daddr, neigh->daddr, INFINIBAND_ALEN) == 0) { + /* found, take one ref on behalf of the caller */ + if (!atomic_inc_not_zero(&neigh->refcnt)) { + /* deleted */ + neigh = NULL; + goto out_unlock; + } + neigh->alive = jiffies; + goto out_unlock; + } + } + +out_unlock: + rcu_read_unlock_bh(); + return neigh; +} + +static void __ipoib_reap_neigh(struct ipoib_dev_priv *priv) +{ + struct ipoib_neigh_table *ntbl = &priv->ntbl; + struct ipoib_neigh_hash *htbl; + unsigned long neigh_obsolete; + unsigned long dt; + unsigned long flags; + int i; + + if (test_bit(IPOIB_STOP_NEIGH_GC, &priv->flags)) + return; + + spin_lock_irqsave(&priv->lock, flags); + + htbl = rcu_dereference_protected(ntbl->htbl, + lockdep_is_held(&priv->lock)); + + if (!htbl) + goto out_unlock; + + /* neigh is obsolete if it was idle for two GC periods */ + dt = 2 * arp_tbl.gc_interval; + neigh_obsolete = jiffies - dt; + /* handle possible race condition */ + if (test_bit(IPOIB_STOP_NEIGH_GC, &priv->flags)) + goto out_unlock; + + for (i = 0; i < htbl->size; i++) { + struct ipoib_neigh *neigh; + struct ipoib_neigh __rcu **np = &htbl->buckets[i]; + + while ((neigh = rcu_dereference_protected(*np, + lockdep_is_held(&priv->lock))) != NULL) { + /* was the neigh idle for two GC periods */ + if (time_after(neigh_obsolete, neigh->alive)) { + rcu_assign_pointer(*np, + rcu_dereference_protected(neigh->hnext, + lockdep_is_held(&priv->lock))); + /* remove from path/mc list */ + list_del(&neigh->list); + call_rcu(&neigh->rcu, ipoib_neigh_reclaim); + } else { + np = &neigh->hnext; + } + + } + } + +out_unlock: + spin_unlock_irqrestore(&priv->lock, flags); +} + +static void ipoib_reap_neigh(struct work_struct *work) +{ + struct ipoib_dev_priv *priv = + container_of(work, struct ipoib_dev_priv, neigh_reap_task.work); + + __ipoib_reap_neigh(priv); + + if (!test_bit(IPOIB_STOP_NEIGH_GC, &priv->flags)) + queue_delayed_work(ipoib_workqueue, &priv->neigh_reap_task, + arp_tbl.gc_interval); +} + + +static struct ipoib_neigh *ipoib_neigh_ctor(u8 *daddr, + struct net_device *dev) +{ + struct ipoib_neigh *neigh; + + neigh = kzalloc(sizeof *neigh, GFP_ATOMIC); + if (!neigh) + return NULL; + + neigh->dev = dev; + memcpy(&neigh->daddr, daddr, sizeof(neigh->daddr)); + skb_queue_head_init(&neigh->queue); + INIT_LIST_HEAD(&neigh->list); + ipoib_cm_set(neigh, NULL); + /* one ref on behalf of the caller */ + atomic_set(&neigh->refcnt, 1); + + return neigh; +} + +struct ipoib_neigh *ipoib_neigh_alloc(u8 *daddr, + struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_neigh_table *ntbl = &priv->ntbl; + struct ipoib_neigh_hash *htbl; + struct ipoib_neigh *neigh; + u32 hash_val; + + htbl = rcu_dereference_protected(ntbl->htbl, + lockdep_is_held(&priv->lock)); + if (!htbl) { + neigh = NULL; + goto out_unlock; + } + + /* need to add a new neigh, but maybe some other thread succeeded? + * recalc hash, maybe hash resize took place so we do a search + */ + hash_val = ipoib_addr_hash(htbl, daddr); + for (neigh = rcu_dereference_protected(htbl->buckets[hash_val], + lockdep_is_held(&priv->lock)); + neigh != NULL; + neigh = rcu_dereference_protected(neigh->hnext, + lockdep_is_held(&priv->lock))) { + if (memcmp(daddr, neigh->daddr, INFINIBAND_ALEN) == 0) { + /* found, take one ref on behalf of the caller */ + if (!atomic_inc_not_zero(&neigh->refcnt)) { + /* deleted */ + neigh = NULL; + break; + } + neigh->alive = jiffies; + goto out_unlock; + } + } + + neigh = ipoib_neigh_ctor(daddr, dev); + if (!neigh) + goto out_unlock; + + /* one ref on behalf of the hash table */ + atomic_inc(&neigh->refcnt); + neigh->alive = jiffies; + /* put in hash */ + rcu_assign_pointer(neigh->hnext, + rcu_dereference_protected(htbl->buckets[hash_val], + lockdep_is_held(&priv->lock))); + rcu_assign_pointer(htbl->buckets[hash_val], neigh); + atomic_inc(&ntbl->entries); + +out_unlock: + + return neigh; +} + +void ipoib_neigh_dtor(struct ipoib_neigh *neigh) +{ + /* neigh reference count was dropprd to zero */ + struct net_device *dev = neigh->dev; + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct sk_buff *skb; + if (neigh->ah) + ipoib_put_ah(neigh->ah); + while ((skb = __skb_dequeue(&neigh->queue))) { + ++dev->stats.tx_dropped; + dev_kfree_skb_any(skb); + } + if (ipoib_cm_get(neigh)) + ipoib_cm_destroy_tx(ipoib_cm_get(neigh)); + ipoib_dbg(netdev_priv(dev), + "neigh free for %06x %pI6\n", + IPOIB_QPN(neigh->daddr), + neigh->daddr + 4); + kfree(neigh); + if (atomic_dec_and_test(&priv->ntbl.entries)) { + if (test_bit(IPOIB_NEIGH_TBL_FLUSH, &priv->flags)) + complete(&priv->ntbl.flushed); + } +} + +static void ipoib_neigh_reclaim(struct rcu_head *rp) +{ + /* Called as a result of removal from hash table */ + struct ipoib_neigh *neigh = container_of(rp, struct ipoib_neigh, rcu); + /* note TX context may hold another ref */ + ipoib_neigh_put(neigh); +} + +void ipoib_neigh_free(struct ipoib_neigh *neigh) +{ + struct net_device *dev = neigh->dev; + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_neigh_table *ntbl = &priv->ntbl; + struct ipoib_neigh_hash *htbl; + struct ipoib_neigh __rcu **np; + struct ipoib_neigh *n; + u32 hash_val; + + htbl = rcu_dereference_protected(ntbl->htbl, + lockdep_is_held(&priv->lock)); + if (!htbl) + return; + + hash_val = ipoib_addr_hash(htbl, neigh->daddr); + np = &htbl->buckets[hash_val]; + for (n = rcu_dereference_protected(*np, + lockdep_is_held(&priv->lock)); + n != NULL; + n = rcu_dereference_protected(*np, + lockdep_is_held(&priv->lock))) { + if (n == neigh) { + /* found */ + rcu_assign_pointer(*np, + rcu_dereference_protected(neigh->hnext, + lockdep_is_held(&priv->lock))); + /* remove from parent list */ + list_del(&neigh->list); + call_rcu(&neigh->rcu, ipoib_neigh_reclaim); + return; + } else { + np = &n->hnext; + } + } +} + +static int ipoib_neigh_hash_init(struct ipoib_dev_priv *priv) +{ + struct ipoib_neigh_table *ntbl = &priv->ntbl; + struct ipoib_neigh_hash *htbl; + struct ipoib_neigh **buckets; + u32 size; + + clear_bit(IPOIB_NEIGH_TBL_FLUSH, &priv->flags); + ntbl->htbl = NULL; + htbl = kzalloc(sizeof(*htbl), GFP_KERNEL); + if (!htbl) + return -ENOMEM; + set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags); + size = roundup_pow_of_two(arp_tbl.gc_thresh3); + buckets = kzalloc(size * sizeof(*buckets), GFP_KERNEL); + if (!buckets) { + kfree(htbl); + return -ENOMEM; + } + htbl->size = size; + htbl->mask = (size - 1); + htbl->buckets = buckets; + ntbl->htbl = htbl; + htbl->ntbl = ntbl; + atomic_set(&ntbl->entries, 0); + + /* start garbage collection */ + clear_bit(IPOIB_STOP_NEIGH_GC, &priv->flags); + queue_delayed_work(ipoib_workqueue, &priv->neigh_reap_task, + arp_tbl.gc_interval); + + return 0; +} + +static void neigh_hash_free_rcu(struct rcu_head *head) +{ + struct ipoib_neigh_hash *htbl = container_of(head, + struct ipoib_neigh_hash, + rcu); + struct ipoib_neigh __rcu **buckets = htbl->buckets; + struct ipoib_neigh_table *ntbl = htbl->ntbl; + + kfree(buckets); + kfree(htbl); + complete(&ntbl->deleted); +} + +void ipoib_del_neighs_by_gid(struct net_device *dev, u8 *gid) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_neigh_table *ntbl = &priv->ntbl; + struct ipoib_neigh_hash *htbl; + unsigned long flags; + int i; + + /* remove all neigh connected to a given path or mcast */ + spin_lock_irqsave(&priv->lock, flags); + + htbl = rcu_dereference_protected(ntbl->htbl, + lockdep_is_held(&priv->lock)); + + if (!htbl) + goto out_unlock; + + for (i = 0; i < htbl->size; i++) { + struct ipoib_neigh *neigh; + struct ipoib_neigh __rcu **np = &htbl->buckets[i]; + + while ((neigh = rcu_dereference_protected(*np, + lockdep_is_held(&priv->lock))) != NULL) { + /* delete neighs belong to this parent */ + if (!memcmp(gid, neigh->daddr + 4, sizeof (union ib_gid))) { + rcu_assign_pointer(*np, + rcu_dereference_protected(neigh->hnext, + lockdep_is_held(&priv->lock))); + /* remove from parent list */ + list_del(&neigh->list); + call_rcu(&neigh->rcu, ipoib_neigh_reclaim); + } else { + np = &neigh->hnext; + } + + } + } +out_unlock: + spin_unlock_irqrestore(&priv->lock, flags); +} + +static void ipoib_flush_neighs(struct ipoib_dev_priv *priv) +{ + struct ipoib_neigh_table *ntbl = &priv->ntbl; + struct ipoib_neigh_hash *htbl; + unsigned long flags; + int i, wait_flushed = 0; + + init_completion(&priv->ntbl.flushed); + + spin_lock_irqsave(&priv->lock, flags); + + htbl = rcu_dereference_protected(ntbl->htbl, + lockdep_is_held(&priv->lock)); + if (!htbl) + goto out_unlock; + + wait_flushed = atomic_read(&priv->ntbl.entries); + if (!wait_flushed) + goto free_htbl; + + for (i = 0; i < htbl->size; i++) { + struct ipoib_neigh *neigh; + struct ipoib_neigh __rcu **np = &htbl->buckets[i]; + + while ((neigh = rcu_dereference_protected(*np, + lockdep_is_held(&priv->lock))) != NULL) { + rcu_assign_pointer(*np, + rcu_dereference_protected(neigh->hnext, + lockdep_is_held(&priv->lock))); + /* remove from path/mc list */ + list_del(&neigh->list); + call_rcu(&neigh->rcu, ipoib_neigh_reclaim); + } + } + +free_htbl: + rcu_assign_pointer(ntbl->htbl, NULL); + call_rcu(&htbl->rcu, neigh_hash_free_rcu); + +out_unlock: + spin_unlock_irqrestore(&priv->lock, flags); + if (wait_flushed) + wait_for_completion(&priv->ntbl.flushed); +} + +static void ipoib_neigh_hash_uninit(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + int stopped; + + ipoib_dbg(priv, "ipoib_neigh_hash_uninit\n"); + init_completion(&priv->ntbl.deleted); + set_bit(IPOIB_NEIGH_TBL_FLUSH, &priv->flags); + + /* Stop GC if called at init fail need to cancel work */ + stopped = test_and_set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags); + if (!stopped) + cancel_delayed_work(&priv->neigh_reap_task); + + ipoib_flush_neighs(priv); + + wait_for_completion(&priv->ntbl.deleted); +} + + +int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + + if (ipoib_neigh_hash_init(priv) < 0) + goto out; + /* Allocate RX/TX "rings" to hold queued skbs */ + priv->rx_ring = kzalloc(ipoib_recvq_size * sizeof *priv->rx_ring, + GFP_KERNEL); + if (!priv->rx_ring) { + printk(KERN_WARNING "%s: failed to allocate RX ring (%d entries)\n", + ca->name, ipoib_recvq_size); + goto out_neigh_hash_cleanup; + } + + priv->tx_ring = vzalloc(ipoib_sendq_size * sizeof *priv->tx_ring); + if (!priv->tx_ring) { + printk(KERN_WARNING "%s: failed to allocate TX ring (%d entries)\n", + ca->name, ipoib_sendq_size); + goto out_rx_ring_cleanup; + } + + /* priv->tx_head, tx_tail & tx_outstanding are already 0 */ + + if (ipoib_ib_dev_init(dev, ca, port)) + goto out_tx_ring_cleanup; + + return 0; + +out_tx_ring_cleanup: + vfree(priv->tx_ring); + +out_rx_ring_cleanup: + kfree(priv->rx_ring); + +out_neigh_hash_cleanup: + ipoib_neigh_hash_uninit(dev); +out: + return -ENOMEM; +} + +void ipoib_dev_cleanup(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev), *cpriv, *tcpriv; + LIST_HEAD(head); + + ASSERT_RTNL(); + + ipoib_delete_debug_files(dev); + + /* Delete any child interfaces first */ + list_for_each_entry_safe(cpriv, tcpriv, &priv->child_intfs, list) { + /* Stop GC on child */ + set_bit(IPOIB_STOP_NEIGH_GC, &cpriv->flags); + cancel_delayed_work(&cpriv->neigh_reap_task); + unregister_netdevice_queue(cpriv->dev, &head); + } + unregister_netdevice_many(&head); + + ipoib_ib_dev_cleanup(dev); + + kfree(priv->rx_ring); + vfree(priv->tx_ring); + + priv->rx_ring = NULL; + priv->tx_ring = NULL; + + ipoib_neigh_hash_uninit(dev); +} + +static const struct header_ops ipoib_header_ops = { + .create = ipoib_hard_header, +}; + +static const struct net_device_ops ipoib_netdev_ops = { + .ndo_uninit = ipoib_uninit, + .ndo_open = ipoib_open, + .ndo_stop = ipoib_stop, + .ndo_change_mtu = ipoib_change_mtu, + .ndo_fix_features = ipoib_fix_features, + .ndo_start_xmit = ipoib_start_xmit, + .ndo_tx_timeout = ipoib_timeout, + .ndo_set_rx_mode = ipoib_set_mcast_list, +}; + +void ipoib_setup(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + + dev->netdev_ops = &ipoib_netdev_ops; + dev->header_ops = &ipoib_header_ops; + + ipoib_set_ethtool_ops(dev); + + netif_napi_add(dev, &priv->napi, ipoib_poll, NAPI_POLL_WEIGHT); + + dev->watchdog_timeo = HZ; + + dev->flags |= IFF_BROADCAST | IFF_MULTICAST; + + dev->hard_header_len = IPOIB_ENCAP_LEN; + dev->addr_len = INFINIBAND_ALEN; + dev->type = ARPHRD_INFINIBAND; + dev->tx_queue_len = ipoib_sendq_size * 2; + dev->features = (NETIF_F_VLAN_CHALLENGED | + NETIF_F_HIGHDMA); + netif_keep_dst(dev); + + memcpy(dev->broadcast, ipv4_bcast_addr, INFINIBAND_ALEN); + + priv->dev = dev; + + spin_lock_init(&priv->lock); + + init_rwsem(&priv->vlan_rwsem); + + INIT_LIST_HEAD(&priv->path_list); + INIT_LIST_HEAD(&priv->child_intfs); + INIT_LIST_HEAD(&priv->dead_ahs); + INIT_LIST_HEAD(&priv->multicast_list); + + INIT_DELAYED_WORK(&priv->mcast_task, ipoib_mcast_join_task); + INIT_WORK(&priv->carrier_on_task, ipoib_mcast_carrier_on_task); + INIT_WORK(&priv->flush_light, ipoib_ib_dev_flush_light); + INIT_WORK(&priv->flush_normal, ipoib_ib_dev_flush_normal); + INIT_WORK(&priv->flush_heavy, ipoib_ib_dev_flush_heavy); + INIT_WORK(&priv->restart_task, ipoib_mcast_restart_task); + INIT_DELAYED_WORK(&priv->ah_reap_task, ipoib_reap_ah); + INIT_DELAYED_WORK(&priv->neigh_reap_task, ipoib_reap_neigh); +} + +struct ipoib_dev_priv *ipoib_intf_alloc(const char *name) +{ + struct net_device *dev; + + dev = alloc_netdev((int)sizeof(struct ipoib_dev_priv), name, + NET_NAME_UNKNOWN, ipoib_setup); + if (!dev) + return NULL; + + return netdev_priv(dev); +} + +static ssize_t show_pkey(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct ipoib_dev_priv *priv = netdev_priv(to_net_dev(dev)); + + return sprintf(buf, "0x%04x\n", priv->pkey); +} +static DEVICE_ATTR(pkey, S_IRUGO, show_pkey, NULL); + +static ssize_t show_umcast(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct ipoib_dev_priv *priv = netdev_priv(to_net_dev(dev)); + + return sprintf(buf, "%d\n", test_bit(IPOIB_FLAG_UMCAST, &priv->flags)); +} + +void ipoib_set_umcast(struct net_device *ndev, int umcast_val) +{ + struct ipoib_dev_priv *priv = netdev_priv(ndev); + + if (umcast_val > 0) { + set_bit(IPOIB_FLAG_UMCAST, &priv->flags); + ipoib_warn(priv, "ignoring multicast groups joined directly " + "by userspace\n"); + } else + clear_bit(IPOIB_FLAG_UMCAST, &priv->flags); +} + +static ssize_t set_umcast(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + unsigned long umcast_val = simple_strtoul(buf, NULL, 0); + + ipoib_set_umcast(to_net_dev(dev), umcast_val); + + return count; +} +static DEVICE_ATTR(umcast, S_IWUSR | S_IRUGO, show_umcast, set_umcast); + +int ipoib_add_umcast_attr(struct net_device *dev) +{ + return device_create_file(&dev->dev, &dev_attr_umcast); +} + +static ssize_t create_child(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + int pkey; + int ret; + + if (sscanf(buf, "%i", &pkey) != 1) + return -EINVAL; + + if (pkey <= 0 || pkey > 0xffff || pkey == 0x8000) + return -EINVAL; + + /* + * Set the full membership bit, so that we join the right + * broadcast group, etc. + */ + pkey |= 0x8000; + + ret = ipoib_vlan_add(to_net_dev(dev), pkey); + + return ret ? ret : count; +} +static DEVICE_ATTR(create_child, S_IWUSR, NULL, create_child); + +static ssize_t delete_child(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + int pkey; + int ret; + + if (sscanf(buf, "%i", &pkey) != 1) + return -EINVAL; + + if (pkey < 0 || pkey > 0xffff) + return -EINVAL; + + ret = ipoib_vlan_delete(to_net_dev(dev), pkey); + + return ret ? ret : count; + +} +static DEVICE_ATTR(delete_child, S_IWUSR, NULL, delete_child); + +int ipoib_add_pkey_attr(struct net_device *dev) +{ + return device_create_file(&dev->dev, &dev_attr_pkey); +} + +int ipoib_set_dev_features(struct ipoib_dev_priv *priv, struct ib_device *hca) +{ + struct ib_device_attr *device_attr; + int result = -ENOMEM; + + device_attr = kmalloc(sizeof *device_attr, GFP_KERNEL); + if (!device_attr) { + printk(KERN_WARNING "%s: allocation of %zu bytes failed\n", + hca->name, sizeof *device_attr); + return result; + } + + result = ib_query_device(hca, device_attr); + if (result) { + printk(KERN_WARNING "%s: ib_query_device failed (ret = %d)\n", + hca->name, result); + kfree(device_attr); + return result; + } + priv->hca_caps = device_attr->device_cap_flags; + + kfree(device_attr); + + if (priv->hca_caps & IB_DEVICE_UD_IP_CSUM) { + priv->dev->hw_features = NETIF_F_SG | + NETIF_F_IP_CSUM | NETIF_F_RXCSUM; + + if (priv->hca_caps & IB_DEVICE_UD_TSO) + priv->dev->hw_features |= NETIF_F_TSO; + + priv->dev->features |= priv->dev->hw_features; + } + + return 0; +} + +static struct net_device *ipoib_add_port(const char *format, + struct ib_device *hca, u8 port) +{ + struct ipoib_dev_priv *priv; + struct ib_port_attr attr; + int result = -ENOMEM; + + priv = ipoib_intf_alloc(format); + if (!priv) + goto alloc_mem_failed; + + SET_NETDEV_DEV(priv->dev, hca->dma_device); + priv->dev->dev_id = port - 1; + + if (!ib_query_port(hca, port, &attr)) + priv->max_ib_mtu = ib_mtu_enum_to_int(attr.max_mtu); + else { + printk(KERN_WARNING "%s: ib_query_port %d failed\n", + hca->name, port); + goto device_init_failed; + } + + /* MTU will be reset when mcast join happens */ + priv->dev->mtu = IPOIB_UD_MTU(priv->max_ib_mtu); + priv->mcast_mtu = priv->admin_mtu = priv->dev->mtu; + + priv->dev->neigh_priv_len = sizeof(struct ipoib_neigh); + + result = ib_query_pkey(hca, port, 0, &priv->pkey); + if (result) { + printk(KERN_WARNING "%s: ib_query_pkey port %d failed (ret = %d)\n", + hca->name, port, result); + goto device_init_failed; + } + + if (ipoib_set_dev_features(priv, hca)) + goto device_init_failed; + + /* + * Set the full membership bit, so that we join the right + * broadcast group, etc. + */ + priv->pkey |= 0x8000; + + priv->dev->broadcast[8] = priv->pkey >> 8; + priv->dev->broadcast[9] = priv->pkey & 0xff; + + result = ib_query_gid(hca, port, 0, &priv->local_gid); + if (result) { + printk(KERN_WARNING "%s: ib_query_gid port %d failed (ret = %d)\n", + hca->name, port, result); + goto device_init_failed; + } else + memcpy(priv->dev->dev_addr + 4, priv->local_gid.raw, sizeof (union ib_gid)); + + result = ipoib_dev_init(priv->dev, hca, port); + if (result < 0) { + printk(KERN_WARNING "%s: failed to initialize port %d (ret = %d)\n", + hca->name, port, result); + goto device_init_failed; + } + + INIT_IB_EVENT_HANDLER(&priv->event_handler, + priv->ca, ipoib_event); + result = ib_register_event_handler(&priv->event_handler); + if (result < 0) { + printk(KERN_WARNING "%s: ib_register_event_handler failed for " + "port %d (ret = %d)\n", + hca->name, port, result); + goto event_failed; + } + + result = register_netdev(priv->dev); + if (result) { + printk(KERN_WARNING "%s: couldn't register ipoib port %d; error %d\n", + hca->name, port, result); + goto register_failed; + } + + ipoib_create_debug_files(priv->dev); + + if (ipoib_cm_add_mode_attr(priv->dev)) + goto sysfs_failed; + if (ipoib_add_pkey_attr(priv->dev)) + goto sysfs_failed; + if (ipoib_add_umcast_attr(priv->dev)) + goto sysfs_failed; + if (device_create_file(&priv->dev->dev, &dev_attr_create_child)) + goto sysfs_failed; + if (device_create_file(&priv->dev->dev, &dev_attr_delete_child)) + goto sysfs_failed; + + return priv->dev; + +sysfs_failed: + ipoib_delete_debug_files(priv->dev); + unregister_netdev(priv->dev); + +register_failed: + ib_unregister_event_handler(&priv->event_handler); + /* Stop GC if started before flush */ + set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags); + cancel_delayed_work(&priv->neigh_reap_task); + flush_workqueue(ipoib_workqueue); + +event_failed: + ipoib_dev_cleanup(priv->dev); + +device_init_failed: + free_netdev(priv->dev); + +alloc_mem_failed: + return ERR_PTR(result); +} + +static void ipoib_add_one(struct ib_device *device) +{ + struct list_head *dev_list; + struct net_device *dev; + struct ipoib_dev_priv *priv; + int s, e, p; + + if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB) + return; + + dev_list = kmalloc(sizeof *dev_list, GFP_KERNEL); + if (!dev_list) + return; + + INIT_LIST_HEAD(dev_list); + + if (device->node_type == RDMA_NODE_IB_SWITCH) { + s = 0; + e = 0; + } else { + s = 1; + e = device->phys_port_cnt; + } + + for (p = s; p <= e; ++p) { + if (rdma_port_get_link_layer(device, p) != IB_LINK_LAYER_INFINIBAND) + continue; + dev = ipoib_add_port("ib%d", device, p); + if (!IS_ERR(dev)) { + priv = netdev_priv(dev); + list_add_tail(&priv->list, dev_list); + } + } + + ib_set_client_data(device, &ipoib_client, dev_list); +} + +static void ipoib_remove_one(struct ib_device *device) +{ + struct ipoib_dev_priv *priv, *tmp; + struct list_head *dev_list; + + if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB) + return; + + dev_list = ib_get_client_data(device, &ipoib_client); + if (!dev_list) + return; + + list_for_each_entry_safe(priv, tmp, dev_list, list) { + ib_unregister_event_handler(&priv->event_handler); + + rtnl_lock(); + dev_change_flags(priv->dev, priv->dev->flags & ~IFF_UP); + rtnl_unlock(); + + /* Stop GC */ + set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags); + cancel_delayed_work(&priv->neigh_reap_task); + flush_workqueue(ipoib_workqueue); + + unregister_netdev(priv->dev); + free_netdev(priv->dev); + } + + kfree(dev_list); +} + +static int __init ipoib_init_module(void) +{ + int ret; + + ipoib_recvq_size = roundup_pow_of_two(ipoib_recvq_size); + ipoib_recvq_size = min(ipoib_recvq_size, IPOIB_MAX_QUEUE_SIZE); + ipoib_recvq_size = max(ipoib_recvq_size, IPOIB_MIN_QUEUE_SIZE); + + ipoib_sendq_size = roundup_pow_of_two(ipoib_sendq_size); + ipoib_sendq_size = min(ipoib_sendq_size, IPOIB_MAX_QUEUE_SIZE); + ipoib_sendq_size = max3(ipoib_sendq_size, 2 * MAX_SEND_CQE, IPOIB_MIN_QUEUE_SIZE); +#ifdef CONFIG_INFINIBAND_IPOIB_CM + ipoib_max_conn_qp = min(ipoib_max_conn_qp, IPOIB_CM_MAX_CONN_QP); +#endif + + /* + * When copying small received packets, we only copy from the + * linear data part of the SKB, so we rely on this condition. + */ + BUILD_BUG_ON(IPOIB_CM_COPYBREAK > IPOIB_CM_HEAD_SIZE); + + ret = ipoib_register_debugfs(); + if (ret) + return ret; + + /* + * We create our own workqueue mainly because we want to be + * able to flush it when devices are being removed. We can't + * use schedule_work()/flush_scheduled_work() because both + * unregister_netdev() and linkwatch_event take the rtnl lock, + * so flush_scheduled_work() can deadlock during device + * removal. + */ + ipoib_workqueue = create_singlethread_workqueue("ipoib"); + if (!ipoib_workqueue) { + ret = -ENOMEM; + goto err_fs; + } + + ib_sa_register_client(&ipoib_sa_client); + + ret = ib_register_client(&ipoib_client); + if (ret) + goto err_sa; + + ret = ipoib_netlink_init(); + if (ret) + goto err_client; + + return 0; + +err_client: + ib_unregister_client(&ipoib_client); + +err_sa: + ib_sa_unregister_client(&ipoib_sa_client); + destroy_workqueue(ipoib_workqueue); + +err_fs: + ipoib_unregister_debugfs(); + + return ret; +} + +static void __exit ipoib_cleanup_module(void) +{ + ipoib_netlink_fini(); + ib_unregister_client(&ipoib_client); + ib_sa_unregister_client(&ipoib_sa_client); + ipoib_unregister_debugfs(); + destroy_workqueue(ipoib_workqueue); +} + +module_init(ipoib_init_module); +module_exit(ipoib_cleanup_module); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c new file mode 100644 index 0000000..ffb83b5 --- /dev/null +++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c @@ -0,0 +1,963 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2004 Voltaire, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "ipoib.h" + +#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG +static int mcast_debug_level; + +module_param(mcast_debug_level, int, 0644); +MODULE_PARM_DESC(mcast_debug_level, + "Enable multicast debug tracing if > 0"); +#endif + +static DEFINE_MUTEX(mcast_mutex); + +struct ipoib_mcast_iter { + struct net_device *dev; + union ib_gid mgid; + unsigned long created; + unsigned int queuelen; + unsigned int complete; + unsigned int send_only; +}; + +static void ipoib_mcast_free(struct ipoib_mcast *mcast) +{ + struct net_device *dev = mcast->dev; + int tx_dropped = 0; + + ipoib_dbg_mcast(netdev_priv(dev), "deleting multicast group %pI6\n", + mcast->mcmember.mgid.raw); + + /* remove all neigh connected to this mcast */ + ipoib_del_neighs_by_gid(dev, mcast->mcmember.mgid.raw); + + if (mcast->ah) + ipoib_put_ah(mcast->ah); + + while (!skb_queue_empty(&mcast->pkt_queue)) { + ++tx_dropped; + dev_kfree_skb_any(skb_dequeue(&mcast->pkt_queue)); + } + + netif_tx_lock_bh(dev); + dev->stats.tx_dropped += tx_dropped; + netif_tx_unlock_bh(dev); + + kfree(mcast); +} + +static struct ipoib_mcast *ipoib_mcast_alloc(struct net_device *dev, + int can_sleep) +{ + struct ipoib_mcast *mcast; + + mcast = kzalloc(sizeof *mcast, can_sleep ? GFP_KERNEL : GFP_ATOMIC); + if (!mcast) + return NULL; + + mcast->dev = dev; + mcast->created = jiffies; + mcast->backoff = 1; + + INIT_LIST_HEAD(&mcast->list); + INIT_LIST_HEAD(&mcast->neigh_list); + skb_queue_head_init(&mcast->pkt_queue); + + return mcast; +} + +static struct ipoib_mcast *__ipoib_mcast_find(struct net_device *dev, void *mgid) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct rb_node *n = priv->multicast_tree.rb_node; + + while (n) { + struct ipoib_mcast *mcast; + int ret; + + mcast = rb_entry(n, struct ipoib_mcast, rb_node); + + ret = memcmp(mgid, mcast->mcmember.mgid.raw, + sizeof (union ib_gid)); + if (ret < 0) + n = n->rb_left; + else if (ret > 0) + n = n->rb_right; + else + return mcast; + } + + return NULL; +} + +static int __ipoib_mcast_add(struct net_device *dev, struct ipoib_mcast *mcast) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct rb_node **n = &priv->multicast_tree.rb_node, *pn = NULL; + + while (*n) { + struct ipoib_mcast *tmcast; + int ret; + + pn = *n; + tmcast = rb_entry(pn, struct ipoib_mcast, rb_node); + + ret = memcmp(mcast->mcmember.mgid.raw, tmcast->mcmember.mgid.raw, + sizeof (union ib_gid)); + if (ret < 0) + n = &pn->rb_left; + else if (ret > 0) + n = &pn->rb_right; + else + return -EEXIST; + } + + rb_link_node(&mcast->rb_node, pn, n); + rb_insert_color(&mcast->rb_node, &priv->multicast_tree); + + return 0; +} + +static int ipoib_mcast_join_finish(struct ipoib_mcast *mcast, + struct ib_sa_mcmember_rec *mcmember) +{ + struct net_device *dev = mcast->dev; + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_ah *ah; + int ret; + int set_qkey = 0; + + mcast->mcmember = *mcmember; + + /* Set the multicast MTU and cached Q_Key before we attach if it's + * the broadcast group. + */ + if (!memcmp(mcast->mcmember.mgid.raw, priv->dev->broadcast + 4, + sizeof (union ib_gid))) { + spin_lock_irq(&priv->lock); + if (!priv->broadcast) { + spin_unlock_irq(&priv->lock); + return -EAGAIN; + } + priv->mcast_mtu = IPOIB_UD_MTU(ib_mtu_enum_to_int(priv->broadcast->mcmember.mtu)); + priv->qkey = be32_to_cpu(priv->broadcast->mcmember.qkey); + spin_unlock_irq(&priv->lock); + priv->tx_wr.wr.ud.remote_qkey = priv->qkey; + set_qkey = 1; + + if (!ipoib_cm_admin_enabled(dev)) { + rtnl_lock(); + dev_set_mtu(dev, min(priv->mcast_mtu, priv->admin_mtu)); + rtnl_unlock(); + } + } + + if (!test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) { + if (test_and_set_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags)) { + ipoib_warn(priv, "multicast group %pI6 already attached\n", + mcast->mcmember.mgid.raw); + + return 0; + } + + ret = ipoib_mcast_attach(dev, be16_to_cpu(mcast->mcmember.mlid), + &mcast->mcmember.mgid, set_qkey); + if (ret < 0) { + ipoib_warn(priv, "couldn't attach QP to multicast group %pI6\n", + mcast->mcmember.mgid.raw); + + clear_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags); + return ret; + } + } + + { + struct ib_ah_attr av = { + .dlid = be16_to_cpu(mcast->mcmember.mlid), + .port_num = priv->port, + .sl = mcast->mcmember.sl, + .ah_flags = IB_AH_GRH, + .static_rate = mcast->mcmember.rate, + .grh = { + .flow_label = be32_to_cpu(mcast->mcmember.flow_label), + .hop_limit = mcast->mcmember.hop_limit, + .sgid_index = 0, + .traffic_class = mcast->mcmember.traffic_class + } + }; + av.grh.dgid = mcast->mcmember.mgid; + + ah = ipoib_create_ah(dev, priv->pd, &av); + if (IS_ERR(ah)) { + ipoib_warn(priv, "ib_address_create failed %ld\n", + -PTR_ERR(ah)); + /* use original error */ + return PTR_ERR(ah); + } else { + spin_lock_irq(&priv->lock); + mcast->ah = ah; + spin_unlock_irq(&priv->lock); + + ipoib_dbg_mcast(priv, "MGID %pI6 AV %p, LID 0x%04x, SL %d\n", + mcast->mcmember.mgid.raw, + mcast->ah->ah, + be16_to_cpu(mcast->mcmember.mlid), + mcast->mcmember.sl); + } + } + + /* actually send any queued packets */ + netif_tx_lock_bh(dev); + while (!skb_queue_empty(&mcast->pkt_queue)) { + struct sk_buff *skb = skb_dequeue(&mcast->pkt_queue); + + netif_tx_unlock_bh(dev); + + skb->dev = dev; + if (dev_queue_xmit(skb)) + ipoib_warn(priv, "dev_queue_xmit failed to requeue packet\n"); + + netif_tx_lock_bh(dev); + } + netif_tx_unlock_bh(dev); + + return 0; +} + +static int +ipoib_mcast_sendonly_join_complete(int status, + struct ib_sa_multicast *multicast) +{ + struct ipoib_mcast *mcast = multicast->context; + struct net_device *dev = mcast->dev; + + /* We trap for port events ourselves. */ + if (status == -ENETRESET) + return 0; + + if (!status) + status = ipoib_mcast_join_finish(mcast, &multicast->rec); + + if (status) { + if (mcast->logcount++ < 20) + ipoib_dbg_mcast(netdev_priv(dev), "multicast join failed for %pI6, status %d\n", + mcast->mcmember.mgid.raw, status); + + /* Flush out any queued packets */ + netif_tx_lock_bh(dev); + while (!skb_queue_empty(&mcast->pkt_queue)) { + ++dev->stats.tx_dropped; + dev_kfree_skb_any(skb_dequeue(&mcast->pkt_queue)); + } + netif_tx_unlock_bh(dev); + + /* Clear the busy flag so we try again */ + status = test_and_clear_bit(IPOIB_MCAST_FLAG_BUSY, + &mcast->flags); + } + return status; +} + +static int ipoib_mcast_sendonly_join(struct ipoib_mcast *mcast) +{ + struct net_device *dev = mcast->dev; + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ib_sa_mcmember_rec rec = { +#if 0 /* Some SMs don't support send-only yet */ + .join_state = 4 +#else + .join_state = 1 +#endif + }; + int ret = 0; + + if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) { + ipoib_dbg_mcast(priv, "device shutting down, no multicast joins\n"); + return -ENODEV; + } + + if (test_and_set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)) { + ipoib_dbg_mcast(priv, "multicast entry busy, skipping\n"); + return -EBUSY; + } + + rec.mgid = mcast->mcmember.mgid; + rec.port_gid = priv->local_gid; + rec.pkey = cpu_to_be16(priv->pkey); + + mcast->mc = ib_sa_join_multicast(&ipoib_sa_client, priv->ca, + priv->port, &rec, + IB_SA_MCMEMBER_REC_MGID | + IB_SA_MCMEMBER_REC_PORT_GID | + IB_SA_MCMEMBER_REC_PKEY | + IB_SA_MCMEMBER_REC_JOIN_STATE, + GFP_ATOMIC, + ipoib_mcast_sendonly_join_complete, + mcast); + if (IS_ERR(mcast->mc)) { + ret = PTR_ERR(mcast->mc); + clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags); + ipoib_warn(priv, "ib_sa_join_multicast failed (ret = %d)\n", + ret); + } else { + ipoib_dbg_mcast(priv, "no multicast record for %pI6, starting join\n", + mcast->mcmember.mgid.raw); + } + + return ret; +} + +void ipoib_mcast_carrier_on_task(struct work_struct *work) +{ + struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv, + carrier_on_task); + struct ib_port_attr attr; + + /* + * Take rtnl_lock to avoid racing with ipoib_stop() and + * turning the carrier back on while a device is being + * removed. + */ + if (ib_query_port(priv->ca, priv->port, &attr) || + attr.state != IB_PORT_ACTIVE) { + ipoib_dbg(priv, "Keeping carrier off until IB port is active\n"); + return; + } + + rtnl_lock(); + netif_carrier_on(priv->dev); + rtnl_unlock(); +} + +static int ipoib_mcast_join_complete(int status, + struct ib_sa_multicast *multicast) +{ + struct ipoib_mcast *mcast = multicast->context; + struct net_device *dev = mcast->dev; + struct ipoib_dev_priv *priv = netdev_priv(dev); + + ipoib_dbg_mcast(priv, "join completion for %pI6 (status %d)\n", + mcast->mcmember.mgid.raw, status); + + /* We trap for port events ourselves. */ + if (status == -ENETRESET) { + status = 0; + goto out; + } + + if (!status) + status = ipoib_mcast_join_finish(mcast, &multicast->rec); + + if (!status) { + mcast->backoff = 1; + mutex_lock(&mcast_mutex); + if (test_bit(IPOIB_MCAST_RUN, &priv->flags)) + queue_delayed_work(ipoib_workqueue, + &priv->mcast_task, 0); + mutex_unlock(&mcast_mutex); + + /* + * Defer carrier on work to ipoib_workqueue to avoid a + * deadlock on rtnl_lock here. + */ + if (mcast == priv->broadcast) + queue_work(ipoib_workqueue, &priv->carrier_on_task); + + status = 0; + goto out; + } + + if (mcast->logcount++ < 20) { + if (status == -ETIMEDOUT || status == -EAGAIN) { + ipoib_dbg_mcast(priv, "multicast join failed for %pI6, status %d\n", + mcast->mcmember.mgid.raw, status); + } else { + ipoib_warn(priv, "multicast join failed for %pI6, status %d\n", + mcast->mcmember.mgid.raw, status); + } + } + + mcast->backoff *= 2; + if (mcast->backoff > IPOIB_MAX_BACKOFF_SECONDS) + mcast->backoff = IPOIB_MAX_BACKOFF_SECONDS; + + /* Clear the busy flag so we try again */ + status = test_and_clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags); + + mutex_lock(&mcast_mutex); + spin_lock_irq(&priv->lock); + if (test_bit(IPOIB_MCAST_RUN, &priv->flags)) + queue_delayed_work(ipoib_workqueue, &priv->mcast_task, + mcast->backoff * HZ); + spin_unlock_irq(&priv->lock); + mutex_unlock(&mcast_mutex); +out: + complete(&mcast->done); + return status; +} + +static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast, + int create) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ib_sa_mcmember_rec rec = { + .join_state = 1 + }; + ib_sa_comp_mask comp_mask; + int ret = 0; + + ipoib_dbg_mcast(priv, "joining MGID %pI6\n", mcast->mcmember.mgid.raw); + + rec.mgid = mcast->mcmember.mgid; + rec.port_gid = priv->local_gid; + rec.pkey = cpu_to_be16(priv->pkey); + + comp_mask = + IB_SA_MCMEMBER_REC_MGID | + IB_SA_MCMEMBER_REC_PORT_GID | + IB_SA_MCMEMBER_REC_PKEY | + IB_SA_MCMEMBER_REC_JOIN_STATE; + + if (create) { + comp_mask |= + IB_SA_MCMEMBER_REC_QKEY | + IB_SA_MCMEMBER_REC_MTU_SELECTOR | + IB_SA_MCMEMBER_REC_MTU | + IB_SA_MCMEMBER_REC_TRAFFIC_CLASS | + IB_SA_MCMEMBER_REC_RATE_SELECTOR | + IB_SA_MCMEMBER_REC_RATE | + IB_SA_MCMEMBER_REC_SL | + IB_SA_MCMEMBER_REC_FLOW_LABEL | + IB_SA_MCMEMBER_REC_HOP_LIMIT; + + rec.qkey = priv->broadcast->mcmember.qkey; + rec.mtu_selector = IB_SA_EQ; + rec.mtu = priv->broadcast->mcmember.mtu; + rec.traffic_class = priv->broadcast->mcmember.traffic_class; + rec.rate_selector = IB_SA_EQ; + rec.rate = priv->broadcast->mcmember.rate; + rec.sl = priv->broadcast->mcmember.sl; + rec.flow_label = priv->broadcast->mcmember.flow_label; + rec.hop_limit = priv->broadcast->mcmember.hop_limit; + } + + set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags); + init_completion(&mcast->done); + set_bit(IPOIB_MCAST_JOIN_STARTED, &mcast->flags); + + mcast->mc = ib_sa_join_multicast(&ipoib_sa_client, priv->ca, priv->port, + &rec, comp_mask, GFP_KERNEL, + ipoib_mcast_join_complete, mcast); + if (IS_ERR(mcast->mc)) { + clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags); + complete(&mcast->done); + ret = PTR_ERR(mcast->mc); + ipoib_warn(priv, "ib_sa_join_multicast failed, status %d\n", ret); + + mcast->backoff *= 2; + if (mcast->backoff > IPOIB_MAX_BACKOFF_SECONDS) + mcast->backoff = IPOIB_MAX_BACKOFF_SECONDS; + + mutex_lock(&mcast_mutex); + if (test_bit(IPOIB_MCAST_RUN, &priv->flags)) + queue_delayed_work(ipoib_workqueue, + &priv->mcast_task, + mcast->backoff * HZ); + mutex_unlock(&mcast_mutex); + } +} + +void ipoib_mcast_join_task(struct work_struct *work) +{ + struct ipoib_dev_priv *priv = + container_of(work, struct ipoib_dev_priv, mcast_task.work); + struct net_device *dev = priv->dev; + struct ib_port_attr port_attr; + + if (!test_bit(IPOIB_MCAST_RUN, &priv->flags)) + return; + + if (ib_query_port(priv->ca, priv->port, &port_attr) || + port_attr.state != IB_PORT_ACTIVE) { + ipoib_dbg(priv, "port state is not ACTIVE (state = %d) suspending join task\n", + port_attr.state); + return; + } + priv->local_lid = port_attr.lid; + + if (ib_query_gid(priv->ca, priv->port, 0, &priv->local_gid)) + ipoib_warn(priv, "ib_query_gid() failed\n"); + else + memcpy(priv->dev->dev_addr + 4, priv->local_gid.raw, sizeof (union ib_gid)); + + if (!priv->broadcast) { + struct ipoib_mcast *broadcast; + + if (!test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) + return; + + broadcast = ipoib_mcast_alloc(dev, 1); + if (!broadcast) { + ipoib_warn(priv, "failed to allocate broadcast group\n"); + mutex_lock(&mcast_mutex); + if (test_bit(IPOIB_MCAST_RUN, &priv->flags)) + queue_delayed_work(ipoib_workqueue, + &priv->mcast_task, HZ); + mutex_unlock(&mcast_mutex); + return; + } + + spin_lock_irq(&priv->lock); + memcpy(broadcast->mcmember.mgid.raw, priv->dev->broadcast + 4, + sizeof (union ib_gid)); + priv->broadcast = broadcast; + + __ipoib_mcast_add(dev, priv->broadcast); + spin_unlock_irq(&priv->lock); + } + + if (!test_bit(IPOIB_MCAST_FLAG_ATTACHED, &priv->broadcast->flags)) { + if (!test_bit(IPOIB_MCAST_FLAG_BUSY, &priv->broadcast->flags)) + ipoib_mcast_join(dev, priv->broadcast, 0); + return; + } + + while (1) { + struct ipoib_mcast *mcast = NULL; + + spin_lock_irq(&priv->lock); + list_for_each_entry(mcast, &priv->multicast_list, list) { + if (!test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags) + && !test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags) + && !test_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags)) { + /* Found the next unjoined group */ + break; + } + } + spin_unlock_irq(&priv->lock); + + if (&mcast->list == &priv->multicast_list) { + /* All done */ + break; + } + + ipoib_mcast_join(dev, mcast, 1); + return; + } + + ipoib_dbg_mcast(priv, "successfully joined all multicast groups\n"); + + clear_bit(IPOIB_MCAST_RUN, &priv->flags); +} + +int ipoib_mcast_start_thread(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + + ipoib_dbg_mcast(priv, "starting multicast thread\n"); + + mutex_lock(&mcast_mutex); + if (!test_and_set_bit(IPOIB_MCAST_RUN, &priv->flags)) + queue_delayed_work(ipoib_workqueue, &priv->mcast_task, 0); + mutex_unlock(&mcast_mutex); + + return 0; +} + +int ipoib_mcast_stop_thread(struct net_device *dev, int flush) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + + ipoib_dbg_mcast(priv, "stopping multicast thread\n"); + + mutex_lock(&mcast_mutex); + clear_bit(IPOIB_MCAST_RUN, &priv->flags); + cancel_delayed_work(&priv->mcast_task); + mutex_unlock(&mcast_mutex); + + if (flush) + flush_workqueue(ipoib_workqueue); + + return 0; +} + +static int ipoib_mcast_leave(struct net_device *dev, struct ipoib_mcast *mcast) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + int ret = 0; + + if (test_and_clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)) + ib_sa_free_multicast(mcast->mc); + + if (test_and_clear_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags)) { + ipoib_dbg_mcast(priv, "leaving MGID %pI6\n", + mcast->mcmember.mgid.raw); + + /* Remove ourselves from the multicast group */ + ret = ib_detach_mcast(priv->qp, &mcast->mcmember.mgid, + be16_to_cpu(mcast->mcmember.mlid)); + if (ret) + ipoib_warn(priv, "ib_detach_mcast failed (result = %d)\n", ret); + } + + return 0; +} + +void ipoib_mcast_send(struct net_device *dev, u8 *daddr, struct sk_buff *skb) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_mcast *mcast; + unsigned long flags; + void *mgid = daddr + 4; + + spin_lock_irqsave(&priv->lock, flags); + + if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags) || + !priv->broadcast || + !test_bit(IPOIB_MCAST_FLAG_ATTACHED, &priv->broadcast->flags)) { + ++dev->stats.tx_dropped; + dev_kfree_skb_any(skb); + goto unlock; + } + + mcast = __ipoib_mcast_find(dev, mgid); + if (!mcast) { + /* Let's create a new send only group now */ + ipoib_dbg_mcast(priv, "setting up send only multicast group for %pI6\n", + mgid); + + mcast = ipoib_mcast_alloc(dev, 0); + if (!mcast) { + ipoib_warn(priv, "unable to allocate memory for " + "multicast structure\n"); + ++dev->stats.tx_dropped; + dev_kfree_skb_any(skb); + goto out; + } + + set_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags); + memcpy(mcast->mcmember.mgid.raw, mgid, sizeof (union ib_gid)); + __ipoib_mcast_add(dev, mcast); + list_add_tail(&mcast->list, &priv->multicast_list); + } + + if (!mcast->ah) { + if (skb_queue_len(&mcast->pkt_queue) < IPOIB_MAX_MCAST_QUEUE) + skb_queue_tail(&mcast->pkt_queue, skb); + else { + ++dev->stats.tx_dropped; + dev_kfree_skb_any(skb); + } + + if (test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)) + ipoib_dbg_mcast(priv, "no address vector, " + "but multicast join already started\n"); + else if (test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) + ipoib_mcast_sendonly_join(mcast); + + /* + * If lookup completes between here and out:, don't + * want to send packet twice. + */ + mcast = NULL; + } + +out: + if (mcast && mcast->ah) { + struct ipoib_neigh *neigh; + + spin_unlock_irqrestore(&priv->lock, flags); + neigh = ipoib_neigh_get(dev, daddr); + spin_lock_irqsave(&priv->lock, flags); + if (!neigh) { + neigh = ipoib_neigh_alloc(daddr, dev); + if (neigh) { + kref_get(&mcast->ah->ref); + neigh->ah = mcast->ah; + list_add_tail(&neigh->list, &mcast->neigh_list); + } + } + spin_unlock_irqrestore(&priv->lock, flags); + ipoib_send(dev, skb, mcast->ah, IB_MULTICAST_QPN); + if (neigh) + ipoib_neigh_put(neigh); + return; + } + +unlock: + spin_unlock_irqrestore(&priv->lock, flags); +} + +void ipoib_mcast_dev_flush(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + LIST_HEAD(remove_list); + struct ipoib_mcast *mcast, *tmcast; + unsigned long flags; + + ipoib_dbg_mcast(priv, "flushing multicast list\n"); + + spin_lock_irqsave(&priv->lock, flags); + + list_for_each_entry_safe(mcast, tmcast, &priv->multicast_list, list) { + list_del(&mcast->list); + rb_erase(&mcast->rb_node, &priv->multicast_tree); + list_add_tail(&mcast->list, &remove_list); + } + + if (priv->broadcast) { + rb_erase(&priv->broadcast->rb_node, &priv->multicast_tree); + list_add_tail(&priv->broadcast->list, &remove_list); + priv->broadcast = NULL; + } + + spin_unlock_irqrestore(&priv->lock, flags); + + /* seperate between the wait to the leave*/ + list_for_each_entry_safe(mcast, tmcast, &remove_list, list) + if (test_bit(IPOIB_MCAST_JOIN_STARTED, &mcast->flags)) + wait_for_completion(&mcast->done); + + list_for_each_entry_safe(mcast, tmcast, &remove_list, list) { + ipoib_mcast_leave(dev, mcast); + ipoib_mcast_free(mcast); + } +} + +static int ipoib_mcast_addr_is_valid(const u8 *addr, const u8 *broadcast) +{ + /* reserved QPN, prefix, scope */ + if (memcmp(addr, broadcast, 6)) + return 0; + /* signature lower, pkey */ + if (memcmp(addr + 7, broadcast + 7, 3)) + return 0; + return 1; +} + +void ipoib_mcast_restart_task(struct work_struct *work) +{ + struct ipoib_dev_priv *priv = + container_of(work, struct ipoib_dev_priv, restart_task); + struct net_device *dev = priv->dev; + struct netdev_hw_addr *ha; + struct ipoib_mcast *mcast, *tmcast; + LIST_HEAD(remove_list); + unsigned long flags; + struct ib_sa_mcmember_rec rec; + + ipoib_dbg_mcast(priv, "restarting multicast task\n"); + + ipoib_mcast_stop_thread(dev, 0); + + local_irq_save(flags); + netif_addr_lock(dev); + spin_lock(&priv->lock); + + /* + * Unfortunately, the networking core only gives us a list of all of + * the multicast hardware addresses. We need to figure out which ones + * are new and which ones have been removed + */ + + /* Clear out the found flag */ + list_for_each_entry(mcast, &priv->multicast_list, list) + clear_bit(IPOIB_MCAST_FLAG_FOUND, &mcast->flags); + + /* Mark all of the entries that are found or don't exist */ + netdev_for_each_mc_addr(ha, dev) { + union ib_gid mgid; + + if (!ipoib_mcast_addr_is_valid(ha->addr, dev->broadcast)) + continue; + + memcpy(mgid.raw, ha->addr + 4, sizeof mgid); + + mcast = __ipoib_mcast_find(dev, &mgid); + if (!mcast || test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) { + struct ipoib_mcast *nmcast; + + /* ignore group which is directly joined by userspace */ + if (test_bit(IPOIB_FLAG_UMCAST, &priv->flags) && + !ib_sa_get_mcmember_rec(priv->ca, priv->port, &mgid, &rec)) { + ipoib_dbg_mcast(priv, "ignoring multicast entry for mgid %pI6\n", + mgid.raw); + continue; + } + + /* Not found or send-only group, let's add a new entry */ + ipoib_dbg_mcast(priv, "adding multicast entry for mgid %pI6\n", + mgid.raw); + + nmcast = ipoib_mcast_alloc(dev, 0); + if (!nmcast) { + ipoib_warn(priv, "unable to allocate memory for multicast structure\n"); + continue; + } + + set_bit(IPOIB_MCAST_FLAG_FOUND, &nmcast->flags); + + nmcast->mcmember.mgid = mgid; + + if (mcast) { + /* Destroy the send only entry */ + list_move_tail(&mcast->list, &remove_list); + + rb_replace_node(&mcast->rb_node, + &nmcast->rb_node, + &priv->multicast_tree); + } else + __ipoib_mcast_add(dev, nmcast); + + list_add_tail(&nmcast->list, &priv->multicast_list); + } + + if (mcast) + set_bit(IPOIB_MCAST_FLAG_FOUND, &mcast->flags); + } + + /* Remove all of the entries don't exist anymore */ + list_for_each_entry_safe(mcast, tmcast, &priv->multicast_list, list) { + if (!test_bit(IPOIB_MCAST_FLAG_FOUND, &mcast->flags) && + !test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) { + ipoib_dbg_mcast(priv, "deleting multicast group %pI6\n", + mcast->mcmember.mgid.raw); + + rb_erase(&mcast->rb_node, &priv->multicast_tree); + + /* Move to the remove list */ + list_move_tail(&mcast->list, &remove_list); + } + } + + spin_unlock(&priv->lock); + netif_addr_unlock(dev); + local_irq_restore(flags); + + /* We have to cancel outside of the spinlock */ + list_for_each_entry_safe(mcast, tmcast, &remove_list, list) { + ipoib_mcast_leave(mcast->dev, mcast); + ipoib_mcast_free(mcast); + } + + if (test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) + ipoib_mcast_start_thread(dev); +} + +#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG + +struct ipoib_mcast_iter *ipoib_mcast_iter_init(struct net_device *dev) +{ + struct ipoib_mcast_iter *iter; + + iter = kmalloc(sizeof *iter, GFP_KERNEL); + if (!iter) + return NULL; + + iter->dev = dev; + memset(iter->mgid.raw, 0, 16); + + if (ipoib_mcast_iter_next(iter)) { + kfree(iter); + return NULL; + } + + return iter; +} + +int ipoib_mcast_iter_next(struct ipoib_mcast_iter *iter) +{ + struct ipoib_dev_priv *priv = netdev_priv(iter->dev); + struct rb_node *n; + struct ipoib_mcast *mcast; + int ret = 1; + + spin_lock_irq(&priv->lock); + + n = rb_first(&priv->multicast_tree); + + while (n) { + mcast = rb_entry(n, struct ipoib_mcast, rb_node); + + if (memcmp(iter->mgid.raw, mcast->mcmember.mgid.raw, + sizeof (union ib_gid)) < 0) { + iter->mgid = mcast->mcmember.mgid; + iter->created = mcast->created; + iter->queuelen = skb_queue_len(&mcast->pkt_queue); + iter->complete = !!mcast->ah; + iter->send_only = !!(mcast->flags & (1 << IPOIB_MCAST_FLAG_SENDONLY)); + + ret = 0; + + break; + } + + n = rb_next(n); + } + + spin_unlock_irq(&priv->lock); + + return ret; +} + +void ipoib_mcast_iter_read(struct ipoib_mcast_iter *iter, + union ib_gid *mgid, + unsigned long *created, + unsigned int *queuelen, + unsigned int *complete, + unsigned int *send_only) +{ + *mgid = iter->mgid; + *created = iter->created; + *queuelen = iter->queuelen; + *complete = iter->complete; + *send_only = iter->send_only; +} + +#endif /* CONFIG_INFINIBAND_IPOIB_DEBUG */ diff --git a/drivers/infiniband/ulp/ipoib/ipoib_netlink.c b/drivers/infiniband/ulp/ipoib/ipoib_netlink.c new file mode 100644 index 0000000..cdc7df4 --- /dev/null +++ b/drivers/infiniband/ulp/ipoib/ipoib_netlink.c @@ -0,0 +1,182 @@ +/* + * Copyright (c) 2012 Mellanox Technologies. - All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include /* For ARPHRD_xxx */ +#include +#include +#include "ipoib.h" + +static const struct nla_policy ipoib_policy[IFLA_IPOIB_MAX + 1] = { + [IFLA_IPOIB_PKEY] = { .type = NLA_U16 }, + [IFLA_IPOIB_MODE] = { .type = NLA_U16 }, + [IFLA_IPOIB_UMCAST] = { .type = NLA_U16 }, +}; + +static int ipoib_fill_info(struct sk_buff *skb, const struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + u16 val; + + if (nla_put_u16(skb, IFLA_IPOIB_PKEY, priv->pkey)) + goto nla_put_failure; + + val = test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags); + if (nla_put_u16(skb, IFLA_IPOIB_MODE, val)) + goto nla_put_failure; + + val = test_bit(IPOIB_FLAG_UMCAST, &priv->flags); + if (nla_put_u16(skb, IFLA_IPOIB_UMCAST, val)) + goto nla_put_failure; + + return 0; + +nla_put_failure: + return -EMSGSIZE; +} + +static int ipoib_changelink(struct net_device *dev, + struct nlattr *tb[], struct nlattr *data[]) +{ + u16 mode, umcast; + int ret = 0; + + if (data[IFLA_IPOIB_MODE]) { + mode = nla_get_u16(data[IFLA_IPOIB_MODE]); + if (mode == IPOIB_MODE_DATAGRAM) + ret = ipoib_set_mode(dev, "datagram\n"); + else if (mode == IPOIB_MODE_CONNECTED) + ret = ipoib_set_mode(dev, "connected\n"); + else + ret = -EINVAL; + + if (ret < 0) + goto out_err; + } + + if (data[IFLA_IPOIB_UMCAST]) { + umcast = nla_get_u16(data[IFLA_IPOIB_UMCAST]); + ipoib_set_umcast(dev, umcast); + } + +out_err: + return ret; +} + +static int ipoib_new_child_link(struct net *src_net, struct net_device *dev, + struct nlattr *tb[], struct nlattr *data[]) +{ + struct net_device *pdev; + struct ipoib_dev_priv *ppriv; + u16 child_pkey; + int err; + + if (!tb[IFLA_LINK]) + return -EINVAL; + + pdev = __dev_get_by_index(src_net, nla_get_u32(tb[IFLA_LINK])); + if (!pdev || pdev->type != ARPHRD_INFINIBAND) + return -ENODEV; + + ppriv = netdev_priv(pdev); + + if (test_bit(IPOIB_FLAG_SUBINTERFACE, &ppriv->flags)) { + ipoib_warn(ppriv, "child creation disallowed for child devices\n"); + return -EINVAL; + } + + if (!data || !data[IFLA_IPOIB_PKEY]) { + ipoib_dbg(ppriv, "no pkey specified, using parent pkey\n"); + child_pkey = ppriv->pkey; + } else + child_pkey = nla_get_u16(data[IFLA_IPOIB_PKEY]); + + if (child_pkey == 0 || child_pkey == 0x8000) + return -EINVAL; + + /* + * Set the full membership bit, so that we join the right + * broadcast group, etc. + */ + child_pkey |= 0x8000; + + err = __ipoib_vlan_add(ppriv, netdev_priv(dev), child_pkey, IPOIB_RTNL_CHILD); + + if (!err && data) + err = ipoib_changelink(dev, tb, data); + return err; +} + +static void ipoib_unregister_child_dev(struct net_device *dev, struct list_head *head) +{ + struct ipoib_dev_priv *priv, *ppriv; + + priv = netdev_priv(dev); + ppriv = netdev_priv(priv->parent); + + down_write(&ppriv->vlan_rwsem); + unregister_netdevice_queue(dev, head); + list_del(&priv->list); + up_write(&ppriv->vlan_rwsem); +} + +static size_t ipoib_get_size(const struct net_device *dev) +{ + return nla_total_size(2) + /* IFLA_IPOIB_PKEY */ + nla_total_size(2) + /* IFLA_IPOIB_MODE */ + nla_total_size(2); /* IFLA_IPOIB_UMCAST */ +} + +static struct rtnl_link_ops ipoib_link_ops __read_mostly = { + .kind = "ipoib", + .maxtype = IFLA_IPOIB_MAX, + .policy = ipoib_policy, + .priv_size = sizeof(struct ipoib_dev_priv), + .setup = ipoib_setup, + .newlink = ipoib_new_child_link, + .changelink = ipoib_changelink, + .dellink = ipoib_unregister_child_dev, + .get_size = ipoib_get_size, + .fill_info = ipoib_fill_info, +}; + +int __init ipoib_netlink_init(void) +{ + return rtnl_link_register(&ipoib_link_ops); +} + +void __exit ipoib_netlink_fini(void) +{ + rtnl_link_unregister(&ipoib_link_ops); +} + +MODULE_ALIAS_RTNL_LINK("ipoib"); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c new file mode 100644 index 0000000..c56d5d4 --- /dev/null +++ b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c @@ -0,0 +1,297 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include + +#include "ipoib.h" + +int ipoib_mcast_attach(struct net_device *dev, u16 mlid, union ib_gid *mgid, int set_qkey) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ib_qp_attr *qp_attr = NULL; + int ret; + u16 pkey_index; + + if (ib_find_pkey(priv->ca, priv->port, priv->pkey, &pkey_index)) { + clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags); + ret = -ENXIO; + goto out; + } + set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags); + + if (set_qkey) { + ret = -ENOMEM; + qp_attr = kmalloc(sizeof *qp_attr, GFP_KERNEL); + if (!qp_attr) + goto out; + + /* set correct QKey for QP */ + qp_attr->qkey = priv->qkey; + ret = ib_modify_qp(priv->qp, qp_attr, IB_QP_QKEY); + if (ret) { + ipoib_warn(priv, "failed to modify QP, ret = %d\n", ret); + goto out; + } + } + + /* attach QP to multicast group */ + ret = ib_attach_mcast(priv->qp, mgid, mlid); + if (ret) + ipoib_warn(priv, "failed to attach to multicast group, ret = %d\n", ret); + +out: + kfree(qp_attr); + return ret; +} + +int ipoib_init_qp(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + int ret; + struct ib_qp_attr qp_attr; + int attr_mask; + + if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) + return -1; + + qp_attr.qp_state = IB_QPS_INIT; + qp_attr.qkey = 0; + qp_attr.port_num = priv->port; + qp_attr.pkey_index = priv->pkey_index; + attr_mask = + IB_QP_QKEY | + IB_QP_PORT | + IB_QP_PKEY_INDEX | + IB_QP_STATE; + ret = ib_modify_qp(priv->qp, &qp_attr, attr_mask); + if (ret) { + ipoib_warn(priv, "failed to modify QP to init, ret = %d\n", ret); + goto out_fail; + } + + qp_attr.qp_state = IB_QPS_RTR; + /* Can't set this in a INIT->RTR transition */ + attr_mask &= ~IB_QP_PORT; + ret = ib_modify_qp(priv->qp, &qp_attr, attr_mask); + if (ret) { + ipoib_warn(priv, "failed to modify QP to RTR, ret = %d\n", ret); + goto out_fail; + } + + qp_attr.qp_state = IB_QPS_RTS; + qp_attr.sq_psn = 0; + attr_mask |= IB_QP_SQ_PSN; + attr_mask &= ~IB_QP_PKEY_INDEX; + ret = ib_modify_qp(priv->qp, &qp_attr, attr_mask); + if (ret) { + ipoib_warn(priv, "failed to modify QP to RTS, ret = %d\n", ret); + goto out_fail; + } + + return 0; + +out_fail: + qp_attr.qp_state = IB_QPS_RESET; + if (ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE)) + ipoib_warn(priv, "Failed to modify QP to RESET state\n"); + + return ret; +} + +int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ib_qp_init_attr init_attr = { + .cap = { + .max_send_wr = ipoib_sendq_size, + .max_recv_wr = ipoib_recvq_size, + .max_send_sge = 1, + .max_recv_sge = IPOIB_UD_RX_SG + }, + .sq_sig_type = IB_SIGNAL_ALL_WR, + .qp_type = IB_QPT_UD + }; + + int ret, size; + int i; + + priv->pd = ib_alloc_pd(priv->ca); + if (IS_ERR(priv->pd)) { + printk(KERN_WARNING "%s: failed to allocate PD\n", ca->name); + return -ENODEV; + } + + priv->mr = ib_get_dma_mr(priv->pd, IB_ACCESS_LOCAL_WRITE); + if (IS_ERR(priv->mr)) { + printk(KERN_WARNING "%s: ib_get_dma_mr failed\n", ca->name); + goto out_free_pd; + } + + size = ipoib_recvq_size + 1; + ret = ipoib_cm_dev_init(dev); + if (!ret) { + size += ipoib_sendq_size; + if (ipoib_cm_has_srq(dev)) + size += ipoib_recvq_size + 1; /* 1 extra for rx_drain_qp */ + else + size += ipoib_recvq_size * ipoib_max_conn_qp; + } + + priv->recv_cq = ib_create_cq(priv->ca, ipoib_ib_completion, NULL, dev, size, 0); + if (IS_ERR(priv->recv_cq)) { + printk(KERN_WARNING "%s: failed to create receive CQ\n", ca->name); + goto out_free_mr; + } + + priv->send_cq = ib_create_cq(priv->ca, ipoib_send_comp_handler, NULL, + dev, ipoib_sendq_size, 0); + if (IS_ERR(priv->send_cq)) { + printk(KERN_WARNING "%s: failed to create send CQ\n", ca->name); + goto out_free_recv_cq; + } + + if (ib_req_notify_cq(priv->recv_cq, IB_CQ_NEXT_COMP)) + goto out_free_send_cq; + + init_attr.send_cq = priv->send_cq; + init_attr.recv_cq = priv->recv_cq; + + if (priv->hca_caps & IB_DEVICE_UD_TSO) + init_attr.create_flags |= IB_QP_CREATE_IPOIB_UD_LSO; + + if (priv->hca_caps & IB_DEVICE_BLOCK_MULTICAST_LOOPBACK) + init_attr.create_flags |= IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK; + + if (priv->hca_caps & IB_DEVICE_MANAGED_FLOW_STEERING) + init_attr.create_flags |= IB_QP_CREATE_NETIF_QP; + + if (dev->features & NETIF_F_SG) + init_attr.cap.max_send_sge = MAX_SKB_FRAGS + 1; + + priv->qp = ib_create_qp(priv->pd, &init_attr); + if (IS_ERR(priv->qp)) { + printk(KERN_WARNING "%s: failed to create QP\n", ca->name); + goto out_free_send_cq; + } + + priv->dev->dev_addr[1] = (priv->qp->qp_num >> 16) & 0xff; + priv->dev->dev_addr[2] = (priv->qp->qp_num >> 8) & 0xff; + priv->dev->dev_addr[3] = (priv->qp->qp_num ) & 0xff; + + for (i = 0; i < MAX_SKB_FRAGS + 1; ++i) + priv->tx_sge[i].lkey = priv->mr->lkey; + + priv->tx_wr.opcode = IB_WR_SEND; + priv->tx_wr.sg_list = priv->tx_sge; + priv->tx_wr.send_flags = IB_SEND_SIGNALED; + + priv->rx_sge[0].lkey = priv->mr->lkey; + if (ipoib_ud_need_sg(priv->max_ib_mtu)) { + priv->rx_sge[0].length = IPOIB_UD_HEAD_SIZE; + priv->rx_sge[1].length = PAGE_SIZE; + priv->rx_sge[1].lkey = priv->mr->lkey; + priv->rx_wr.num_sge = IPOIB_UD_RX_SG; + } else { + priv->rx_sge[0].length = IPOIB_UD_BUF_SIZE(priv->max_ib_mtu); + priv->rx_wr.num_sge = 1; + } + priv->rx_wr.next = NULL; + priv->rx_wr.sg_list = priv->rx_sge; + + return 0; + +out_free_send_cq: + ib_destroy_cq(priv->send_cq); + +out_free_recv_cq: + ib_destroy_cq(priv->recv_cq); + +out_free_mr: + ib_dereg_mr(priv->mr); + ipoib_cm_dev_cleanup(dev); + +out_free_pd: + ib_dealloc_pd(priv->pd); + return -ENODEV; +} + +void ipoib_transport_dev_cleanup(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + + if (priv->qp) { + if (ib_destroy_qp(priv->qp)) + ipoib_warn(priv, "ib_qp_destroy failed\n"); + + priv->qp = NULL; + clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags); + } + + if (ib_destroy_cq(priv->send_cq)) + ipoib_warn(priv, "ib_cq_destroy (send) failed\n"); + + if (ib_destroy_cq(priv->recv_cq)) + ipoib_warn(priv, "ib_cq_destroy (recv) failed\n"); + + ipoib_cm_dev_cleanup(dev); + + if (ib_dereg_mr(priv->mr)) + ipoib_warn(priv, "ib_dereg_mr failed\n"); + + if (ib_dealloc_pd(priv->pd)) + ipoib_warn(priv, "ib_dealloc_pd failed\n"); +} + +void ipoib_event(struct ib_event_handler *handler, + struct ib_event *record) +{ + struct ipoib_dev_priv *priv = + container_of(handler, struct ipoib_dev_priv, event_handler); + + if (record->element.port_num != priv->port) + return; + + ipoib_dbg(priv, "Event %d on device %s port %d\n", record->event, + record->device->name, record->element.port_num); + + if (record->event == IB_EVENT_SM_CHANGE || + record->event == IB_EVENT_CLIENT_REREGISTER) { + queue_work(ipoib_workqueue, &priv->flush_light); + } else if (record->event == IB_EVENT_PORT_ERR || + record->event == IB_EVENT_PORT_ACTIVE || + record->event == IB_EVENT_LID_CHANGE) { + queue_work(ipoib_workqueue, &priv->flush_normal); + } else if (record->event == IB_EVENT_PKEY_CHANGE) { + queue_work(ipoib_workqueue, &priv->flush_heavy); + } +} diff --git a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c new file mode 100644 index 0000000..9fad7b5 --- /dev/null +++ b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c @@ -0,0 +1,209 @@ +/* + * Copyright (c) 2004 Topspin Communications. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include + +#include +#include + +#include + +#include "ipoib.h" + +static ssize_t show_parent(struct device *d, struct device_attribute *attr, + char *buf) +{ + struct net_device *dev = to_net_dev(d); + struct ipoib_dev_priv *priv = netdev_priv(dev); + + return sprintf(buf, "%s\n", priv->parent->name); +} +static DEVICE_ATTR(parent, S_IRUGO, show_parent, NULL); + +int __ipoib_vlan_add(struct ipoib_dev_priv *ppriv, struct ipoib_dev_priv *priv, + u16 pkey, int type) +{ + int result; + + priv->max_ib_mtu = ppriv->max_ib_mtu; + /* MTU will be reset when mcast join happens */ + priv->dev->mtu = IPOIB_UD_MTU(priv->max_ib_mtu); + priv->mcast_mtu = priv->admin_mtu = priv->dev->mtu; + set_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags); + + result = ipoib_set_dev_features(priv, ppriv->ca); + if (result) + goto err; + + priv->pkey = pkey; + + memcpy(priv->dev->dev_addr, ppriv->dev->dev_addr, INFINIBAND_ALEN); + priv->dev->broadcast[8] = pkey >> 8; + priv->dev->broadcast[9] = pkey & 0xff; + + result = ipoib_dev_init(priv->dev, ppriv->ca, ppriv->port); + if (result < 0) { + ipoib_warn(ppriv, "failed to initialize subinterface: " + "device %s, port %d", + ppriv->ca->name, ppriv->port); + goto err; + } + + result = register_netdevice(priv->dev); + if (result) { + ipoib_warn(priv, "failed to initialize; error %i", result); + goto register_failed; + } + + priv->parent = ppriv->dev; + + ipoib_create_debug_files(priv->dev); + + /* RTNL childs don't need proprietary sysfs entries */ + if (type == IPOIB_LEGACY_CHILD) { + if (ipoib_cm_add_mode_attr(priv->dev)) + goto sysfs_failed; + if (ipoib_add_pkey_attr(priv->dev)) + goto sysfs_failed; + if (ipoib_add_umcast_attr(priv->dev)) + goto sysfs_failed; + + if (device_create_file(&priv->dev->dev, &dev_attr_parent)) + goto sysfs_failed; + } + + priv->child_type = type; + priv->dev->iflink = ppriv->dev->ifindex; + list_add_tail(&priv->list, &ppriv->child_intfs); + + return 0; + +sysfs_failed: + result = -ENOMEM; + ipoib_delete_debug_files(priv->dev); + unregister_netdevice(priv->dev); + +register_failed: + ipoib_dev_cleanup(priv->dev); + +err: + return result; +} + +int ipoib_vlan_add(struct net_device *pdev, unsigned short pkey) +{ + struct ipoib_dev_priv *ppriv, *priv; + char intf_name[IFNAMSIZ]; + struct ipoib_dev_priv *tpriv; + int result; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + ppriv = netdev_priv(pdev); + + snprintf(intf_name, sizeof intf_name, "%s.%04x", + ppriv->dev->name, pkey); + priv = ipoib_intf_alloc(intf_name); + if (!priv) + return -ENOMEM; + + if (!rtnl_trylock()) + return restart_syscall(); + + down_write(&ppriv->vlan_rwsem); + + /* + * First ensure this isn't a duplicate. We check the parent device and + * then all of the legacy child interfaces to make sure the Pkey + * doesn't match. + */ + if (ppriv->pkey == pkey) { + result = -ENOTUNIQ; + goto out; + } + + list_for_each_entry(tpriv, &ppriv->child_intfs, list) { + if (tpriv->pkey == pkey && + tpriv->child_type == IPOIB_LEGACY_CHILD) { + result = -ENOTUNIQ; + goto out; + } + } + + result = __ipoib_vlan_add(ppriv, priv, pkey, IPOIB_LEGACY_CHILD); + +out: + up_write(&ppriv->vlan_rwsem); + + if (result) + free_netdev(priv->dev); + + rtnl_unlock(); + + return result; +} + +int ipoib_vlan_delete(struct net_device *pdev, unsigned short pkey) +{ + struct ipoib_dev_priv *ppriv, *priv, *tpriv; + struct net_device *dev = NULL; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + ppriv = netdev_priv(pdev); + + if (!rtnl_trylock()) + return restart_syscall(); + + down_write(&ppriv->vlan_rwsem); + list_for_each_entry_safe(priv, tpriv, &ppriv->child_intfs, list) { + if (priv->pkey == pkey && + priv->child_type == IPOIB_LEGACY_CHILD) { + unregister_netdevice(priv->dev); + list_del(&priv->list); + dev = priv->dev; + break; + } + } + up_write(&ppriv->vlan_rwsem); + + rtnl_unlock(); + + if (dev) { + free_netdev(dev); + return 0; + } + + return -ENODEV; +} diff --git a/drivers/infiniband/ulp/iser/Kconfig b/drivers/infiniband/ulp/iser/Kconfig new file mode 100644 index 0000000..d00af71 --- /dev/null +++ b/drivers/infiniband/ulp/iser/Kconfig @@ -0,0 +1,12 @@ +config INFINIBAND_ISER + tristate "iSCSI Extensions for RDMA (iSER)" + depends on SCSI && INET && INFINIBAND_ADDR_TRANS + select SCSI_ISCSI_ATTRS + ---help--- + Support for the iSCSI Extensions for RDMA (iSER) Protocol + over InfiniBand. This allows you to access storage devices + that speak iSCSI over iSER over InfiniBand. + + The iSER protocol is defined by IETF. + See + and diff --git a/drivers/infiniband/ulp/iser/Makefile b/drivers/infiniband/ulp/iser/Makefile new file mode 100644 index 0000000..fe6cd15 --- /dev/null +++ b/drivers/infiniband/ulp/iser/Makefile @@ -0,0 +1,4 @@ +obj-$(CONFIG_INFINIBAND_ISER) += ib_iser.o + +ib_iser-y := iser_verbs.o iser_initiator.o iser_memory.o \ + iscsi_iser.o diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.c b/drivers/infiniband/ulp/iser/iscsi_iser.c new file mode 100644 index 0000000..f42ab14 --- /dev/null +++ b/drivers/infiniband/ulp/iser/iscsi_iser.c @@ -0,0 +1,1038 @@ +/* + * iSCSI Initiator over iSER Data-Path + * + * Copyright (C) 2004 Dmitry Yusupov + * Copyright (C) 2004 Alex Aizman + * Copyright (C) 2005 Mike Christie + * Copyright (c) 2005, 2006 Voltaire, Inc. All rights reserved. + * Copyright (c) 2013-2014 Mellanox Technologies. All rights reserved. + * maintained by openib-general@openib.org + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Credits: + * Christoph Hellwig + * FUJITA Tomonori + * Arne Redlich + * Zhenyu Wang + * Modified by: + * Erez Zilber + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "iscsi_iser.h" + +static struct scsi_host_template iscsi_iser_sht; +static struct iscsi_transport iscsi_iser_transport; +static struct scsi_transport_template *iscsi_iser_scsi_transport; + +static unsigned int iscsi_max_lun = 512; +module_param_named(max_lun, iscsi_max_lun, uint, S_IRUGO); + +int iser_debug_level = 0; +bool iser_pi_enable = false; +int iser_pi_guard = 1; + +MODULE_DESCRIPTION("iSER (iSCSI Extensions for RDMA) Datamover"); +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_AUTHOR("Alex Nezhinsky, Dan Bar Dov, Or Gerlitz"); +MODULE_VERSION(DRV_VER); + +module_param_named(debug_level, iser_debug_level, int, 0644); +MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0 (default:disabled)"); + +module_param_named(pi_enable, iser_pi_enable, bool, 0644); +MODULE_PARM_DESC(pi_enable, "Enable T10-PI offload support (default:disabled)"); + +module_param_named(pi_guard, iser_pi_guard, int, 0644); +MODULE_PARM_DESC(pi_guard, "T10-PI guard_type, 0:CRC|1:IP_CSUM (default:IP_CSUM)"); + +static struct workqueue_struct *release_wq; +struct iser_global ig; + +/* + * iscsi_iser_recv() - Process a successfull recv completion + * @conn: iscsi connection + * @hdr: iscsi header + * @rx_data: buffer containing receive data payload + * @rx_data_len: length of rx_data + * + * Notes: In case of data length errors or iscsi PDU completion failures + * this routine will signal iscsi layer of connection failure. + */ +void +iscsi_iser_recv(struct iscsi_conn *conn, struct iscsi_hdr *hdr, + char *rx_data, int rx_data_len) +{ + int rc = 0; + int datalen; + int ahslen; + + /* verify PDU length */ + datalen = ntoh24(hdr->dlength); + if (datalen > rx_data_len || (datalen + 4) < rx_data_len) { + iser_err("wrong datalen %d (hdr), %d (IB)\n", + datalen, rx_data_len); + rc = ISCSI_ERR_DATALEN; + goto error; + } + + if (datalen != rx_data_len) + iser_dbg("aligned datalen (%d) hdr, %d (IB)\n", + datalen, rx_data_len); + + /* read AHS */ + ahslen = hdr->hlength * 4; + + rc = iscsi_complete_pdu(conn, hdr, rx_data, rx_data_len); + if (rc && rc != ISCSI_ERR_NO_SCSI_CMD) + goto error; + + return; +error: + iscsi_conn_failure(conn, rc); +} + +/** + * iscsi_iser_pdu_alloc() - allocate an iscsi-iser PDU + * @task: iscsi task + * @opcode: iscsi command opcode + * + * Netes: This routine can't fail, just assign iscsi task + * hdr and max hdr size. + */ +static int +iscsi_iser_pdu_alloc(struct iscsi_task *task, uint8_t opcode) +{ + struct iscsi_iser_task *iser_task = task->dd_data; + + task->hdr = (struct iscsi_hdr *)&iser_task->desc.iscsi_header; + task->hdr_max = sizeof(iser_task->desc.iscsi_header); + + return 0; +} + +int iser_initialize_task_headers(struct iscsi_task *task, + struct iser_tx_desc *tx_desc) +{ + struct iser_conn *iser_conn = task->conn->dd_data; + struct iser_device *device = iser_conn->ib_conn.device; + struct iscsi_iser_task *iser_task = task->dd_data; + u64 dma_addr; + + dma_addr = ib_dma_map_single(device->ib_device, (void *)tx_desc, + ISER_HEADERS_LEN, DMA_TO_DEVICE); + if (ib_dma_mapping_error(device->ib_device, dma_addr)) + return -ENOMEM; + + tx_desc->dma_addr = dma_addr; + tx_desc->tx_sg[0].addr = tx_desc->dma_addr; + tx_desc->tx_sg[0].length = ISER_HEADERS_LEN; + tx_desc->tx_sg[0].lkey = device->mr->lkey; + + iser_task->iser_conn = iser_conn; + return 0; +} + +/** + * iscsi_iser_task_init() - Initialize iscsi-iser task + * @task: iscsi task + * + * Initialize the task for the scsi command or mgmt command. + * + * Return: Returns zero on success or -ENOMEM when failing + * to init task headers (dma mapping error). + */ +static int +iscsi_iser_task_init(struct iscsi_task *task) +{ + struct iscsi_iser_task *iser_task = task->dd_data; + + if (iser_initialize_task_headers(task, &iser_task->desc)) + return -ENOMEM; + + /* mgmt task */ + if (!task->sc) + return 0; + + iser_task->command_sent = 0; + iser_task_rdma_init(iser_task); + iser_task->sc = task->sc; + + return 0; +} + +/** + * iscsi_iser_mtask_xmit() - xmit management (immediate) task + * @conn: iscsi connection + * @task: task management task + * + * Notes: + * The function can return -EAGAIN in which case caller must + * call it again later, or recover. '0' return code means successful + * xmit. + * + **/ +static int +iscsi_iser_mtask_xmit(struct iscsi_conn *conn, struct iscsi_task *task) +{ + int error = 0; + + iser_dbg("mtask xmit [cid %d itt 0x%x]\n", conn->id, task->itt); + + error = iser_send_control(conn, task); + + /* since iser xmits control with zero copy, tasks can not be recycled + * right after sending them. + * The recycling scheme is based on whether a response is expected + * - if yes, the task is recycled at iscsi_complete_pdu + * - if no, the task is recycled at iser_snd_completion + */ + return error; +} + +static int +iscsi_iser_task_xmit_unsol_data(struct iscsi_conn *conn, + struct iscsi_task *task) +{ + struct iscsi_r2t_info *r2t = &task->unsol_r2t; + struct iscsi_data hdr; + int error = 0; + + /* Send data-out PDUs while there's still unsolicited data to send */ + while (iscsi_task_has_unsol_data(task)) { + iscsi_prep_data_out_pdu(task, r2t, &hdr); + iser_dbg("Sending data-out: itt 0x%x, data count %d\n", + hdr.itt, r2t->data_count); + + /* the buffer description has been passed with the command */ + /* Send the command */ + error = iser_send_data_out(conn, task, &hdr); + if (error) { + r2t->datasn--; + goto iscsi_iser_task_xmit_unsol_data_exit; + } + r2t->sent += r2t->data_count; + iser_dbg("Need to send %d more as data-out PDUs\n", + r2t->data_length - r2t->sent); + } + +iscsi_iser_task_xmit_unsol_data_exit: + return error; +} + +/** + * iscsi_iser_task_xmit() - xmit iscsi-iser task + * @task: iscsi task + * + * Return: zero on success or escalates $error on failure. + */ +static int +iscsi_iser_task_xmit(struct iscsi_task *task) +{ + struct iscsi_conn *conn = task->conn; + struct iscsi_iser_task *iser_task = task->dd_data; + int error = 0; + + if (!task->sc) + return iscsi_iser_mtask_xmit(conn, task); + + if (task->sc->sc_data_direction == DMA_TO_DEVICE) { + BUG_ON(scsi_bufflen(task->sc) == 0); + + iser_dbg("cmd [itt %x total %d imm %d unsol_data %d\n", + task->itt, scsi_bufflen(task->sc), + task->imm_count, task->unsol_r2t.data_length); + } + + iser_dbg("ctask xmit [cid %d itt 0x%x]\n", + conn->id, task->itt); + + /* Send the cmd PDU */ + if (!iser_task->command_sent) { + error = iser_send_command(conn, task); + if (error) + goto iscsi_iser_task_xmit_exit; + iser_task->command_sent = 1; + } + + /* Send unsolicited data-out PDU(s) if necessary */ + if (iscsi_task_has_unsol_data(task)) + error = iscsi_iser_task_xmit_unsol_data(conn, task); + + iscsi_iser_task_xmit_exit: + return error; +} + +/** + * iscsi_iser_cleanup_task() - cleanup an iscsi-iser task + * @task: iscsi task + * + * Notes: In case the RDMA device is already NULL (might have + * been removed in DEVICE_REMOVAL CM event it will bail-out + * without doing dma unmapping. + */ +static void iscsi_iser_cleanup_task(struct iscsi_task *task) +{ + struct iscsi_iser_task *iser_task = task->dd_data; + struct iser_tx_desc *tx_desc = &iser_task->desc; + struct iser_conn *iser_conn = task->conn->dd_data; + struct iser_device *device = iser_conn->ib_conn.device; + + /* DEVICE_REMOVAL event might have already released the device */ + if (!device) + return; + + ib_dma_unmap_single(device->ib_device, + tx_desc->dma_addr, ISER_HEADERS_LEN, DMA_TO_DEVICE); + + /* mgmt tasks do not need special cleanup */ + if (!task->sc) + return; + + if (iser_task->status == ISER_TASK_STATUS_STARTED) { + iser_task->status = ISER_TASK_STATUS_COMPLETED; + iser_task_rdma_finalize(iser_task); + } +} + +/** + * iscsi_iser_check_protection() - check protection information status of task. + * @task: iscsi task + * @sector: error sector if exsists (output) + * + * Return: zero if no data-integrity errors have occured + * 0x1: data-integrity error occured in the guard-block + * 0x2: data-integrity error occured in the reference tag + * 0x3: data-integrity error occured in the application tag + * + * In addition the error sector is marked. + */ +static u8 +iscsi_iser_check_protection(struct iscsi_task *task, sector_t *sector) +{ + struct iscsi_iser_task *iser_task = task->dd_data; + + if (iser_task->dir[ISER_DIR_IN]) + return iser_check_task_pi_status(iser_task, ISER_DIR_IN, + sector); + else + return iser_check_task_pi_status(iser_task, ISER_DIR_OUT, + sector); +} + +/** + * iscsi_iser_conn_create() - create a new iscsi-iser connection + * @cls_session: iscsi class connection + * @conn_idx: connection index within the session (for MCS) + * + * Return: iscsi_cls_conn when iscsi_conn_setup succeeds or NULL + * otherwise. + */ +static struct iscsi_cls_conn * +iscsi_iser_conn_create(struct iscsi_cls_session *cls_session, + uint32_t conn_idx) +{ + struct iscsi_conn *conn; + struct iscsi_cls_conn *cls_conn; + + cls_conn = iscsi_conn_setup(cls_session, 0, conn_idx); + if (!cls_conn) + return NULL; + conn = cls_conn->dd_data; + + /* + * due to issues with the login code re iser sematics + * this not set in iscsi_conn_setup - FIXME + */ + conn->max_recv_dlength = ISER_RECV_DATA_SEG_LEN; + + return cls_conn; +} + +/** + * iscsi_iser_conn_bind() - bind iscsi and iser connection structures + * @cls_session: iscsi class session + * @cls_conn: iscsi class connection + * @transport_eph: transport end-point handle + * @is_leading: indicate if this is the session leading connection (MCS) + * + * Return: zero on success, $error if iscsi_conn_bind fails and + * -EINVAL in case end-point doesn't exsits anymore or iser connection + * state is not UP (teardown already started). + */ +static int +iscsi_iser_conn_bind(struct iscsi_cls_session *cls_session, + struct iscsi_cls_conn *cls_conn, + uint64_t transport_eph, + int is_leading) +{ + struct iscsi_conn *conn = cls_conn->dd_data; + struct iser_conn *iser_conn; + struct iscsi_endpoint *ep; + int error; + + error = iscsi_conn_bind(cls_session, cls_conn, is_leading); + if (error) + return error; + + /* the transport ep handle comes from user space so it must be + * verified against the global ib connections list */ + ep = iscsi_lookup_endpoint(transport_eph); + if (!ep) { + iser_err("can't bind eph %llx\n", + (unsigned long long)transport_eph); + return -EINVAL; + } + iser_conn = ep->dd_data; + + mutex_lock(&iser_conn->state_mutex); + if (iser_conn->state != ISER_CONN_UP) { + error = -EINVAL; + iser_err("iser_conn %p state is %d, teardown started\n", + iser_conn, iser_conn->state); + goto out; + } + + error = iser_alloc_rx_descriptors(iser_conn, conn->session); + if (error) + goto out; + + /* binds the iSER connection retrieved from the previously + * connected ep_handle to the iSCSI layer connection. exchanges + * connection pointers */ + iser_info("binding iscsi conn %p to iser_conn %p\n", conn, iser_conn); + + conn->dd_data = iser_conn; + iser_conn->iscsi_conn = conn; + +out: + mutex_unlock(&iser_conn->state_mutex); + return error; +} + +/** + * iscsi_iser_conn_start() - start iscsi-iser connection + * @cls_conn: iscsi class connection + * + * Notes: Here iser intialize (or re-initialize) stop_completion as + * from this point iscsi must call conn_stop in session/connection + * teardown so iser transport must wait for it. + */ +static int +iscsi_iser_conn_start(struct iscsi_cls_conn *cls_conn) +{ + struct iscsi_conn *iscsi_conn; + struct iser_conn *iser_conn; + + iscsi_conn = cls_conn->dd_data; + iser_conn = iscsi_conn->dd_data; + reinit_completion(&iser_conn->stop_completion); + + return iscsi_conn_start(cls_conn); +} + +/** + * iscsi_iser_conn_stop() - stop iscsi-iser connection + * @cls_conn: iscsi class connection + * @flag: indicate if recover or terminate (passed as is) + * + * Notes: Calling iscsi_conn_stop might theoretically race with + * DEVICE_REMOVAL event and dereference a previously freed RDMA device + * handle, so we call it under iser the state lock to protect against + * this kind of race. + */ +static void +iscsi_iser_conn_stop(struct iscsi_cls_conn *cls_conn, int flag) +{ + struct iscsi_conn *conn = cls_conn->dd_data; + struct iser_conn *iser_conn = conn->dd_data; + + iser_info("stopping iscsi_conn: %p, iser_conn: %p\n", conn, iser_conn); + + /* + * Userspace may have goofed up and not bound the connection or + * might have only partially setup the connection. + */ + if (iser_conn) { + mutex_lock(&iser_conn->state_mutex); + iscsi_conn_stop(cls_conn, flag); + iser_conn_terminate(iser_conn); + + /* unbind */ + iser_conn->iscsi_conn = NULL; + conn->dd_data = NULL; + + complete(&iser_conn->stop_completion); + mutex_unlock(&iser_conn->state_mutex); + } else { + iscsi_conn_stop(cls_conn, flag); + } +} + +/** + * iscsi_iser_session_destroy() - destroy iscsi-iser session + * @cls_session: iscsi class session + * + * Removes and free iscsi host. + */ +static void +iscsi_iser_session_destroy(struct iscsi_cls_session *cls_session) +{ + struct Scsi_Host *shost = iscsi_session_to_shost(cls_session); + + iscsi_session_teardown(cls_session); + iscsi_host_remove(shost); + iscsi_host_free(shost); +} + +static inline unsigned int +iser_dif_prot_caps(int prot_caps) +{ + return ((prot_caps & IB_PROT_T10DIF_TYPE_1) ? SHOST_DIF_TYPE1_PROTECTION | + SHOST_DIX_TYPE1_PROTECTION : 0) | + ((prot_caps & IB_PROT_T10DIF_TYPE_2) ? SHOST_DIF_TYPE2_PROTECTION | + SHOST_DIX_TYPE2_PROTECTION : 0) | + ((prot_caps & IB_PROT_T10DIF_TYPE_3) ? SHOST_DIF_TYPE3_PROTECTION | + SHOST_DIX_TYPE3_PROTECTION : 0); +} + +/** + * iscsi_iser_session_create() - create an iscsi-iser session + * @ep: iscsi end-point handle + * @cmds_max: maximum commands in this session + * @qdepth: session command queue depth + * @initial_cmdsn: initiator command sequnce number + * + * Allocates and adds a scsi host, expose DIF supprot if + * exists, and sets up an iscsi session. + */ +static struct iscsi_cls_session * +iscsi_iser_session_create(struct iscsi_endpoint *ep, + uint16_t cmds_max, uint16_t qdepth, + uint32_t initial_cmdsn) +{ + struct iscsi_cls_session *cls_session; + struct iscsi_session *session; + struct Scsi_Host *shost; + struct iser_conn *iser_conn = NULL; + struct ib_conn *ib_conn; + + shost = iscsi_host_alloc(&iscsi_iser_sht, 0, 0); + if (!shost) + return NULL; + shost->transportt = iscsi_iser_scsi_transport; + shost->cmd_per_lun = qdepth; + shost->max_lun = iscsi_max_lun; + shost->max_id = 0; + shost->max_channel = 0; + shost->max_cmd_len = 16; + + /* + * older userspace tools (before 2.0-870) did not pass us + * the leading conn's ep so this will be NULL; + */ + if (ep) { + iser_conn = ep->dd_data; + ib_conn = &iser_conn->ib_conn; + if (ib_conn->pi_support) { + u32 sig_caps = ib_conn->device->dev_attr.sig_prot_cap; + + scsi_host_set_prot(shost, iser_dif_prot_caps(sig_caps)); + if (iser_pi_guard) + scsi_host_set_guard(shost, SHOST_DIX_GUARD_IP); + else + scsi_host_set_guard(shost, SHOST_DIX_GUARD_CRC); + } + } + + if (iscsi_host_add(shost, ep ? + ib_conn->device->ib_device->dma_device : NULL)) + goto free_host; + + if (cmds_max > ISER_DEF_XMIT_CMDS_MAX) { + iser_info("cmds_max changed from %u to %u\n", + cmds_max, ISER_DEF_XMIT_CMDS_MAX); + cmds_max = ISER_DEF_XMIT_CMDS_MAX; + } + + cls_session = iscsi_session_setup(&iscsi_iser_transport, shost, + cmds_max, 0, + sizeof(struct iscsi_iser_task), + initial_cmdsn, 0); + if (!cls_session) + goto remove_host; + session = cls_session->dd_data; + + shost->can_queue = session->scsi_cmds_max; + return cls_session; + +remove_host: + iscsi_host_remove(shost); +free_host: + iscsi_host_free(shost); + return NULL; +} + +static int +iscsi_iser_set_param(struct iscsi_cls_conn *cls_conn, + enum iscsi_param param, char *buf, int buflen) +{ + int value; + + switch (param) { + case ISCSI_PARAM_MAX_RECV_DLENGTH: + /* TBD */ + break; + case ISCSI_PARAM_HDRDGST_EN: + sscanf(buf, "%d", &value); + if (value) { + iser_err("DataDigest wasn't negotiated to None\n"); + return -EPROTO; + } + break; + case ISCSI_PARAM_DATADGST_EN: + sscanf(buf, "%d", &value); + if (value) { + iser_err("DataDigest wasn't negotiated to None\n"); + return -EPROTO; + } + break; + case ISCSI_PARAM_IFMARKER_EN: + sscanf(buf, "%d", &value); + if (value) { + iser_err("IFMarker wasn't negotiated to No\n"); + return -EPROTO; + } + break; + case ISCSI_PARAM_OFMARKER_EN: + sscanf(buf, "%d", &value); + if (value) { + iser_err("OFMarker wasn't negotiated to No\n"); + return -EPROTO; + } + break; + default: + return iscsi_set_param(cls_conn, param, buf, buflen); + } + + return 0; +} + +/** + * iscsi_iser_set_param() - set class connection parameter + * @cls_conn: iscsi class connection + * @stats: iscsi stats to output + * + * Output connection statistics. + */ +static void +iscsi_iser_conn_get_stats(struct iscsi_cls_conn *cls_conn, struct iscsi_stats *stats) +{ + struct iscsi_conn *conn = cls_conn->dd_data; + + stats->txdata_octets = conn->txdata_octets; + stats->rxdata_octets = conn->rxdata_octets; + stats->scsicmd_pdus = conn->scsicmd_pdus_cnt; + stats->dataout_pdus = conn->dataout_pdus_cnt; + stats->scsirsp_pdus = conn->scsirsp_pdus_cnt; + stats->datain_pdus = conn->datain_pdus_cnt; /* always 0 */ + stats->r2t_pdus = conn->r2t_pdus_cnt; /* always 0 */ + stats->tmfcmd_pdus = conn->tmfcmd_pdus_cnt; + stats->tmfrsp_pdus = conn->tmfrsp_pdus_cnt; + stats->custom_length = 4; + strcpy(stats->custom[0].desc, "qp_tx_queue_full"); + stats->custom[0].value = 0; /* TB iser_conn->qp_tx_queue_full; */ + strcpy(stats->custom[1].desc, "fmr_map_not_avail"); + stats->custom[1].value = 0; /* TB iser_conn->fmr_map_not_avail */; + strcpy(stats->custom[2].desc, "eh_abort_cnt"); + stats->custom[2].value = conn->eh_abort_cnt; + strcpy(stats->custom[3].desc, "fmr_unalign_cnt"); + stats->custom[3].value = conn->fmr_unalign_cnt; +} + +static int iscsi_iser_get_ep_param(struct iscsi_endpoint *ep, + enum iscsi_param param, char *buf) +{ + struct iser_conn *iser_conn = ep->dd_data; + int len; + + switch (param) { + case ISCSI_PARAM_CONN_PORT: + case ISCSI_PARAM_CONN_ADDRESS: + if (!iser_conn || !iser_conn->ib_conn.cma_id) + return -ENOTCONN; + + return iscsi_conn_get_addr_param((struct sockaddr_storage *) + &iser_conn->ib_conn.cma_id->route.addr.dst_addr, + param, buf); + break; + default: + return -ENOSYS; + } + + return len; +} + +/** + * iscsi_iser_ep_connect() - Initiate iSER connection establishment + * @shost: scsi_host + * @dst_addr: destination address + * @non-blocking: indicate if routine can block + * + * Allocate an iscsi endpoint, an iser_conn structure and bind them. + * After that start RDMA connection establishment via rdma_cm. We + * don't allocate iser_conn embedded in iscsi_endpoint since in teardown + * the endpoint will be destroyed at ep_disconnect while iser_conn will + * cleanup its resources asynchronuously. + * + * Return: iscsi_endpoint created by iscsi layer or ERR_PTR(error) + * if fails. + */ +static struct iscsi_endpoint * +iscsi_iser_ep_connect(struct Scsi_Host *shost, struct sockaddr *dst_addr, + int non_blocking) +{ + int err; + struct iser_conn *iser_conn; + struct iscsi_endpoint *ep; + + ep = iscsi_create_endpoint(0); + if (!ep) + return ERR_PTR(-ENOMEM); + + iser_conn = kzalloc(sizeof(*iser_conn), GFP_KERNEL); + if (!iser_conn) { + err = -ENOMEM; + goto failure; + } + + ep->dd_data = iser_conn; + iser_conn->ep = ep; + iser_conn_init(iser_conn); + + err = iser_connect(iser_conn, NULL, dst_addr, non_blocking); + if (err) + goto failure; + + return ep; +failure: + iscsi_destroy_endpoint(ep); + return ERR_PTR(err); +} + +/** + * iscsi_iser_ep_poll() - poll for iser connection establishment to complete + * @ep: iscsi endpoint (created at ep_connect) + * @timeout_ms: polling timeout allowed in ms. + * + * This routine boils down to waiting for up_completion signaling + * that cma_id got CONNECTED event. + * + * Return: 1 if succeeded in connection establishment, 0 if timeout expired + * (libiscsi will retry will kick in) or -1 if interrupted by signal + * or more likely iser connection state transitioned to TEMINATING or + * DOWN during the wait period. + */ +static int +iscsi_iser_ep_poll(struct iscsi_endpoint *ep, int timeout_ms) +{ + struct iser_conn *iser_conn; + int rc; + + iser_conn = ep->dd_data; + rc = wait_for_completion_interruptible_timeout(&iser_conn->up_completion, + msecs_to_jiffies(timeout_ms)); + /* if conn establishment failed, return error code to iscsi */ + if (rc == 0) { + mutex_lock(&iser_conn->state_mutex); + if (iser_conn->state == ISER_CONN_TERMINATING || + iser_conn->state == ISER_CONN_DOWN) + rc = -1; + mutex_unlock(&iser_conn->state_mutex); + } + + iser_info("ib conn %p rc = %d\n", iser_conn, rc); + + if (rc > 0) + return 1; /* success, this is the equivalent of POLLOUT */ + else if (!rc) + return 0; /* timeout */ + else + return rc; /* signal */ +} + +/** + * iscsi_iser_ep_disconnect() - Initiate connection teardown process + * @ep: iscsi endpoint handle + * + * This routine is not blocked by iser and RDMA termination process + * completion as we queue a deffered work for iser/RDMA destruction + * and cleanup or actually call it immediately in case we didn't pass + * iscsi conn bind/start stage, thus it is safe. + */ +static void +iscsi_iser_ep_disconnect(struct iscsi_endpoint *ep) +{ + struct iser_conn *iser_conn; + + iser_conn = ep->dd_data; + iser_info("ep %p iser conn %p state %d\n", + ep, iser_conn, iser_conn->state); + + mutex_lock(&iser_conn->state_mutex); + iser_conn_terminate(iser_conn); + + /* + * if iser_conn and iscsi_conn are bound, we must wait for + * iscsi_conn_stop and flush errors completion before freeing + * the iser resources. Otherwise we are safe to free resources + * immediately. + */ + if (iser_conn->iscsi_conn) { + INIT_WORK(&iser_conn->release_work, iser_release_work); + queue_work(release_wq, &iser_conn->release_work); + mutex_unlock(&iser_conn->state_mutex); + } else { + iser_conn->state = ISER_CONN_DOWN; + mutex_unlock(&iser_conn->state_mutex); + iser_conn_release(iser_conn); + } + iscsi_destroy_endpoint(ep); +} + +static umode_t iser_attr_is_visible(int param_type, int param) +{ + switch (param_type) { + case ISCSI_HOST_PARAM: + switch (param) { + case ISCSI_HOST_PARAM_NETDEV_NAME: + case ISCSI_HOST_PARAM_HWADDRESS: + case ISCSI_HOST_PARAM_INITIATOR_NAME: + return S_IRUGO; + default: + return 0; + } + case ISCSI_PARAM: + switch (param) { + case ISCSI_PARAM_MAX_RECV_DLENGTH: + case ISCSI_PARAM_MAX_XMIT_DLENGTH: + case ISCSI_PARAM_HDRDGST_EN: + case ISCSI_PARAM_DATADGST_EN: + case ISCSI_PARAM_CONN_ADDRESS: + case ISCSI_PARAM_CONN_PORT: + case ISCSI_PARAM_EXP_STATSN: + case ISCSI_PARAM_PERSISTENT_ADDRESS: + case ISCSI_PARAM_PERSISTENT_PORT: + case ISCSI_PARAM_PING_TMO: + case ISCSI_PARAM_RECV_TMO: + case ISCSI_PARAM_INITIAL_R2T_EN: + case ISCSI_PARAM_MAX_R2T: + case ISCSI_PARAM_IMM_DATA_EN: + case ISCSI_PARAM_FIRST_BURST: + case ISCSI_PARAM_MAX_BURST: + case ISCSI_PARAM_PDU_INORDER_EN: + case ISCSI_PARAM_DATASEQ_INORDER_EN: + case ISCSI_PARAM_TARGET_NAME: + case ISCSI_PARAM_TPGT: + case ISCSI_PARAM_USERNAME: + case ISCSI_PARAM_PASSWORD: + case ISCSI_PARAM_USERNAME_IN: + case ISCSI_PARAM_PASSWORD_IN: + case ISCSI_PARAM_FAST_ABORT: + case ISCSI_PARAM_ABORT_TMO: + case ISCSI_PARAM_LU_RESET_TMO: + case ISCSI_PARAM_TGT_RESET_TMO: + case ISCSI_PARAM_IFACE_NAME: + case ISCSI_PARAM_INITIATOR_NAME: + case ISCSI_PARAM_DISCOVERY_SESS: + return S_IRUGO; + default: + return 0; + } + } + + return 0; +} + +static struct scsi_host_template iscsi_iser_sht = { + .module = THIS_MODULE, + .name = "iSCSI Initiator over iSER", + .queuecommand = iscsi_queuecommand, + .change_queue_depth = iscsi_change_queue_depth, + .sg_tablesize = ISCSI_ISER_SG_TABLESIZE, + .max_sectors = 1024, + .cmd_per_lun = ISER_DEF_CMD_PER_LUN, + .eh_abort_handler = iscsi_eh_abort, + .eh_device_reset_handler= iscsi_eh_device_reset, + .eh_target_reset_handler = iscsi_eh_recover_target, + .target_alloc = iscsi_target_alloc, + .use_clustering = DISABLE_CLUSTERING, + .proc_name = "iscsi_iser", + .this_id = -1, +}; + +static struct iscsi_transport iscsi_iser_transport = { + .owner = THIS_MODULE, + .name = "iser", + .caps = CAP_RECOVERY_L0 | CAP_MULTI_R2T | CAP_TEXT_NEGO, + /* session management */ + .create_session = iscsi_iser_session_create, + .destroy_session = iscsi_iser_session_destroy, + /* connection management */ + .create_conn = iscsi_iser_conn_create, + .bind_conn = iscsi_iser_conn_bind, + .destroy_conn = iscsi_conn_teardown, + .attr_is_visible = iser_attr_is_visible, + .set_param = iscsi_iser_set_param, + .get_conn_param = iscsi_conn_get_param, + .get_ep_param = iscsi_iser_get_ep_param, + .get_session_param = iscsi_session_get_param, + .start_conn = iscsi_iser_conn_start, + .stop_conn = iscsi_iser_conn_stop, + /* iscsi host params */ + .get_host_param = iscsi_host_get_param, + .set_host_param = iscsi_host_set_param, + /* IO */ + .send_pdu = iscsi_conn_send_pdu, + .get_stats = iscsi_iser_conn_get_stats, + .init_task = iscsi_iser_task_init, + .xmit_task = iscsi_iser_task_xmit, + .cleanup_task = iscsi_iser_cleanup_task, + .alloc_pdu = iscsi_iser_pdu_alloc, + .check_protection = iscsi_iser_check_protection, + /* recovery */ + .session_recovery_timedout = iscsi_session_recovery_timedout, + + .ep_connect = iscsi_iser_ep_connect, + .ep_poll = iscsi_iser_ep_poll, + .ep_disconnect = iscsi_iser_ep_disconnect +}; + +static int __init iser_init(void) +{ + int err; + + iser_dbg("Starting iSER datamover...\n"); + + if (iscsi_max_lun < 1) { + iser_err("Invalid max_lun value of %u\n", iscsi_max_lun); + return -EINVAL; + } + + memset(&ig, 0, sizeof(struct iser_global)); + + ig.desc_cache = kmem_cache_create("iser_descriptors", + sizeof(struct iser_tx_desc), + 0, SLAB_HWCACHE_ALIGN, + NULL); + if (ig.desc_cache == NULL) + return -ENOMEM; + + /* device init is called only after the first addr resolution */ + mutex_init(&ig.device_list_mutex); + INIT_LIST_HEAD(&ig.device_list); + mutex_init(&ig.connlist_mutex); + INIT_LIST_HEAD(&ig.connlist); + + release_wq = alloc_workqueue("release workqueue", 0, 0); + if (!release_wq) { + iser_err("failed to allocate release workqueue\n"); + return -ENOMEM; + } + + iscsi_iser_scsi_transport = iscsi_register_transport( + &iscsi_iser_transport); + if (!iscsi_iser_scsi_transport) { + iser_err("iscsi_register_transport failed\n"); + err = -EINVAL; + goto register_transport_failure; + } + + return 0; + +register_transport_failure: + kmem_cache_destroy(ig.desc_cache); + + return err; +} + +static void __exit iser_exit(void) +{ + struct iser_conn *iser_conn, *n; + int connlist_empty; + + iser_dbg("Removing iSER datamover...\n"); + destroy_workqueue(release_wq); + + mutex_lock(&ig.connlist_mutex); + connlist_empty = list_empty(&ig.connlist); + mutex_unlock(&ig.connlist_mutex); + + if (!connlist_empty) { + iser_err("Error cleanup stage completed but we still have iser " + "connections, destroying them anyway.\n"); + list_for_each_entry_safe(iser_conn, n, &ig.connlist, + conn_list) { + iser_conn_release(iser_conn); + } + } + + iscsi_unregister_transport(&iscsi_iser_transport); + kmem_cache_destroy(ig.desc_cache); +} + +module_init(iser_init); +module_exit(iser_exit); diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.h b/drivers/infiniband/ulp/iser/iscsi_iser.h new file mode 100644 index 0000000..cd4174c --- /dev/null +++ b/drivers/infiniband/ulp/iser/iscsi_iser.h @@ -0,0 +1,666 @@ +/* + * iSER transport for the Open iSCSI Initiator & iSER transport internals + * + * Copyright (C) 2004 Dmitry Yusupov + * Copyright (C) 2004 Alex Aizman + * Copyright (C) 2005 Mike Christie + * based on code maintained by open-iscsi@googlegroups.com + * + * Copyright (c) 2004, 2005, 2006 Voltaire, Inc. All rights reserved. + * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved. + * Copyright (c) 2013-2014 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ISCSI_ISER_H__ +#define __ISCSI_ISER_H__ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include + +#define DRV_NAME "iser" +#define PFX DRV_NAME ": " +#define DRV_VER "1.4.8" + +#define iser_dbg(fmt, arg...) \ + do { \ + if (iser_debug_level > 2) \ + printk(KERN_DEBUG PFX "%s: " fmt,\ + __func__ , ## arg); \ + } while (0) + +#define iser_warn(fmt, arg...) \ + do { \ + if (iser_debug_level > 0) \ + pr_warn(PFX "%s: " fmt, \ + __func__ , ## arg); \ + } while (0) + +#define iser_info(fmt, arg...) \ + do { \ + if (iser_debug_level > 1) \ + pr_info(PFX "%s: " fmt, \ + __func__ , ## arg); \ + } while (0) + +#define iser_err(fmt, arg...) \ + do { \ + printk(KERN_ERR PFX "%s: " fmt, \ + __func__ , ## arg); \ + } while (0) + +#define SHIFT_4K 12 +#define SIZE_4K (1ULL << SHIFT_4K) +#define MASK_4K (~(SIZE_4K-1)) + /* support up to 512KB in one RDMA */ +#define ISCSI_ISER_SG_TABLESIZE (0x80000 >> SHIFT_4K) +#define ISER_DEF_XMIT_CMDS_DEFAULT 512 +#if ISCSI_DEF_XMIT_CMDS_MAX > ISER_DEF_XMIT_CMDS_DEFAULT + #define ISER_DEF_XMIT_CMDS_MAX ISCSI_DEF_XMIT_CMDS_MAX +#else + #define ISER_DEF_XMIT_CMDS_MAX ISER_DEF_XMIT_CMDS_DEFAULT +#endif +#define ISER_DEF_CMD_PER_LUN ISER_DEF_XMIT_CMDS_MAX + +/* QP settings */ +/* Maximal bounds on received asynchronous PDUs */ +#define ISER_MAX_RX_MISC_PDUS 4 /* NOOP_IN(2) , ASYNC_EVENT(2) */ + +#define ISER_MAX_TX_MISC_PDUS 6 /* NOOP_OUT(2), TEXT(1), * + * SCSI_TMFUNC(2), LOGOUT(1) */ + +#define ISER_QP_MAX_RECV_DTOS (ISER_DEF_XMIT_CMDS_MAX) + +#define ISER_MIN_POSTED_RX (ISER_DEF_XMIT_CMDS_MAX >> 2) + +/* the max TX (send) WR supported by the iSER QP is defined by * + * max_send_wr = T * (1 + D) + C ; D is how many inflight dataouts we expect * + * to have at max for SCSI command. The tx posting & completion handling code * + * supports -EAGAIN scheme where tx is suspended till the QP has room for more * + * send WR. D=8 comes from 64K/8K */ + +#define ISER_INFLIGHT_DATAOUTS 8 + +#define ISER_QP_MAX_REQ_DTOS (ISER_DEF_XMIT_CMDS_MAX * \ + (1 + ISER_INFLIGHT_DATAOUTS) + \ + ISER_MAX_TX_MISC_PDUS + \ + ISER_MAX_RX_MISC_PDUS) + +/* Max registration work requests per command */ +#define ISER_MAX_REG_WR_PER_CMD 5 + +/* For Signature we don't support DATAOUTs so no need to make room for them */ +#define ISER_QP_SIG_MAX_REQ_DTOS (ISER_DEF_XMIT_CMDS_MAX * \ + (1 + ISER_MAX_REG_WR_PER_CMD) + \ + ISER_MAX_TX_MISC_PDUS + \ + ISER_MAX_RX_MISC_PDUS) + +#define ISER_WC_BATCH_COUNT 16 +#define ISER_SIGNAL_CMD_COUNT 32 + +#define ISER_VER 0x10 +#define ISER_WSV 0x08 +#define ISER_RSV 0x04 + +#define ISER_FASTREG_LI_WRID 0xffffffffffffffffULL +#define ISER_BEACON_WRID 0xfffffffffffffffeULL + +/** + * struct iser_hdr - iSER header + * + * @flags: flags support (zbva, remote_inv) + * @rsvd: reserved + * @write_stag: write rkey + * @write_va: write virtual address + * @reaf_stag: read rkey + * @read_va: read virtual address + */ +struct iser_hdr { + u8 flags; + u8 rsvd[3]; + __be32 write_stag; + __be64 write_va; + __be32 read_stag; + __be64 read_va; +} __attribute__((packed)); + + +#define ISER_ZBVA_NOT_SUPPORTED 0x80 +#define ISER_SEND_W_INV_NOT_SUPPORTED 0x40 + +struct iser_cm_hdr { + u8 flags; + u8 rsvd[3]; +} __packed; + +/* Constant PDU lengths calculations */ +#define ISER_HEADERS_LEN (sizeof(struct iser_hdr) + sizeof(struct iscsi_hdr)) + +#define ISER_RECV_DATA_SEG_LEN 128 +#define ISER_RX_PAYLOAD_SIZE (ISER_HEADERS_LEN + ISER_RECV_DATA_SEG_LEN) +#define ISER_RX_LOGIN_SIZE (ISER_HEADERS_LEN + ISCSI_DEF_MAX_RECV_SEG_LEN) + +/* Length of an object name string */ +#define ISER_OBJECT_NAME_SIZE 64 + +enum iser_conn_state { + ISER_CONN_INIT, /* descriptor allocd, no conn */ + ISER_CONN_PENDING, /* in the process of being established */ + ISER_CONN_UP, /* up and running */ + ISER_CONN_TERMINATING, /* in the process of being terminated */ + ISER_CONN_DOWN, /* shut down */ + ISER_CONN_STATES_NUM +}; + +enum iser_task_status { + ISER_TASK_STATUS_INIT = 0, + ISER_TASK_STATUS_STARTED, + ISER_TASK_STATUS_COMPLETED +}; + +enum iser_data_dir { + ISER_DIR_IN = 0, /* to initiator */ + ISER_DIR_OUT, /* from initiator */ + ISER_DIRS_NUM +}; + +/** + * struct iser_data_buf - iSER data buffer + * + * @buf: pointer to the sg list + * @size: num entries of this sg + * @data_len: total beffer byte len + * @dma_nents: returned by dma_map_sg + * @copy_buf: allocated copy buf for SGs unaligned + * for rdma which are copied + * @sg_single: SG-ified clone of a non SG SC or + * unaligned SG + */ +struct iser_data_buf { + void *buf; + unsigned int size; + unsigned long data_len; + unsigned int dma_nents; + char *copy_buf; + struct scatterlist sg_single; + }; + +/* fwd declarations */ +struct iser_device; +struct iscsi_iser_task; +struct iscsi_endpoint; + +/** + * struct iser_mem_reg - iSER memory registration info + * + * @lkey: MR local key + * @rkey: MR remote key + * @va: MR start address (buffer va) + * @len: MR length + * @mem_h: pointer to registration context (FMR/Fastreg) + * @is_mr: indicates weather we registered the buffer + */ +struct iser_mem_reg { + u32 lkey; + u32 rkey; + u64 va; + u64 len; + void *mem_h; + int is_mr; +}; + +/** + * struct iser_regd_buf - iSER buffer registration desc + * + * @reg: memory registration info + * @virt_addr: virtual address of buffer + * @device: reference to iser device + * @direction: dma direction (for dma_unmap) + * @data_size: data buffer size in bytes + */ +struct iser_regd_buf { + struct iser_mem_reg reg; + void *virt_addr; + struct iser_device *device; + enum dma_data_direction direction; + unsigned int data_size; +}; + +enum iser_desc_type { + ISCSI_TX_CONTROL , + ISCSI_TX_SCSI_COMMAND, + ISCSI_TX_DATAOUT +}; + +/** + * struct iser_tx_desc - iSER TX descriptor (for send wr_id) + * + * @iser_header: iser header + * @iscsi_header: iscsi header + * @type: command/control/dataout + * @dam_addr: header buffer dma_address + * @tx_sg: sg[0] points to iser/iscsi headers + * sg[1] optionally points to either of immediate data + * unsolicited data-out or control + * @num_sge: number sges used on this TX task + */ +struct iser_tx_desc { + struct iser_hdr iser_header; + struct iscsi_hdr iscsi_header; + enum iser_desc_type type; + u64 dma_addr; + struct ib_sge tx_sg[2]; + int num_sge; +}; + +#define ISER_RX_PAD_SIZE (256 - (ISER_RX_PAYLOAD_SIZE + \ + sizeof(u64) + sizeof(struct ib_sge))) +/** + * struct iser_rx_desc - iSER RX descriptor (for recv wr_id) + * + * @iser_header: iser header + * @iscsi_header: iscsi header + * @data: received data segment + * @dma_addr: receive buffer dma address + * @rx_sg: ib_sge of receive buffer + * @pad: for sense data TODO: Modify to maximum sense length supported + */ +struct iser_rx_desc { + struct iser_hdr iser_header; + struct iscsi_hdr iscsi_header; + char data[ISER_RECV_DATA_SEG_LEN]; + u64 dma_addr; + struct ib_sge rx_sg; + char pad[ISER_RX_PAD_SIZE]; +} __attribute__((packed)); + +#define ISER_MAX_CQ 4 + +struct iser_conn; +struct ib_conn; +struct iscsi_iser_task; + +/** + * struct iser_comp - iSER completion context + * + * @device: pointer to device handle + * @cq: completion queue + * @wcs: work completion array + * @tasklet: Tasklet handle + * @active_qps: Number of active QPs attached + * to completion context + */ +struct iser_comp { + struct iser_device *device; + struct ib_cq *cq; + struct ib_wc wcs[ISER_WC_BATCH_COUNT]; + struct tasklet_struct tasklet; + int active_qps; +}; + +/** + * struct iser_device - iSER device handle + * + * @ib_device: RDMA device + * @pd: Protection Domain for this device + * @dev_attr: Device attributes container + * @mr: Global DMA memory region + * @event_handler: IB events handle routine + * @ig_list: entry in devices list + * @refcount: Reference counter, dominated by open iser connections + * @comps_used: Number of completion contexts used, Min between online + * cpus and device max completion vectors + * @comps: Dinamically allocated array of completion handlers + * Memory registration pool Function pointers (FMR or Fastreg): + * @iser_alloc_rdma_reg_res: Allocation of memory regions pool + * @iser_free_rdma_reg_res: Free of memory regions pool + * @iser_reg_rdma_mem: Memory registration routine + * @iser_unreg_rdma_mem: Memory deregistration routine + */ +struct iser_device { + struct ib_device *ib_device; + struct ib_pd *pd; + struct ib_device_attr dev_attr; + struct ib_mr *mr; + struct ib_event_handler event_handler; + struct list_head ig_list; + int refcount; + int comps_used; + struct iser_comp comps[ISER_MAX_CQ]; + int (*iser_alloc_rdma_reg_res)(struct ib_conn *ib_conn, + unsigned cmds_max); + void (*iser_free_rdma_reg_res)(struct ib_conn *ib_conn); + int (*iser_reg_rdma_mem)(struct iscsi_iser_task *iser_task, + enum iser_data_dir cmd_dir); + void (*iser_unreg_rdma_mem)(struct iscsi_iser_task *iser_task, + enum iser_data_dir cmd_dir); +}; + +#define ISER_CHECK_GUARD 0xc0 +#define ISER_CHECK_REFTAG 0x0f +#define ISER_CHECK_APPTAG 0x30 + +enum iser_reg_indicator { + ISER_DATA_KEY_VALID = 1 << 0, + ISER_PROT_KEY_VALID = 1 << 1, + ISER_SIG_KEY_VALID = 1 << 2, + ISER_FASTREG_PROTECTED = 1 << 3, +}; + +/** + * struct iser_pi_context - Protection information context + * + * @prot_mr: protection memory region + * @prot_frpl: protection fastreg page list + * @sig_mr: signature feature enabled memory region + */ +struct iser_pi_context { + struct ib_mr *prot_mr; + struct ib_fast_reg_page_list *prot_frpl; + struct ib_mr *sig_mr; +}; + +/** + * struct fast_reg_descriptor - Fast registration descriptor + * + * @list: entry in connection fastreg pool + * @data_mr: data memory region + * @data_frpl: data fastreg page list + * @pi_ctx: protection information context + * @reg_indicators: fast registration indicators + */ +struct fast_reg_descriptor { + struct list_head list; + struct ib_mr *data_mr; + struct ib_fast_reg_page_list *data_frpl; + struct iser_pi_context *pi_ctx; + u8 reg_indicators; +}; + +/** + * struct ib_conn - Infiniband related objects + * + * @cma_id: rdma_cm connection maneger handle + * @qp: Connection Queue-pair + * @post_recv_buf_count: post receive counter + * @rx_wr: receive work request for batch posts + * @device: reference to iser device + * @comp: iser completion context + * @pi_support: Indicate device T10-PI support + * @beacon: beacon send wr to signal all flush errors were drained + * @flush_comp: completes when all connection completions consumed + * @lock: protects fmr/fastreg pool + * @union.fmr: + * @pool: FMR pool for fast registrations + * @page_vec: page vector to hold mapped commands pages + * used for registration + * @union.fastreg: + * @pool: Fast registration descriptors pool for fast + * registrations + * @pool_size: Size of pool + */ +struct ib_conn { + struct rdma_cm_id *cma_id; + struct ib_qp *qp; + int post_recv_buf_count; + struct ib_recv_wr rx_wr[ISER_MIN_POSTED_RX]; + struct iser_device *device; + struct iser_comp *comp; + bool pi_support; + struct ib_send_wr beacon; + struct completion flush_comp; + spinlock_t lock; + union { + struct { + struct ib_fmr_pool *pool; + struct iser_page_vec *page_vec; + } fmr; + struct { + struct list_head pool; + int pool_size; + } fastreg; + }; +}; + +/** + * struct iser_conn - iSER connection context + * + * @ib_conn: connection RDMA resources + * @iscsi_conn: link to matching iscsi connection + * @ep: transport handle + * @state: connection logical state + * @qp_max_recv_dtos: maximum number of data outs, corresponds + * to max number of post recvs + * @qp_max_recv_dtos_mask: (qp_max_recv_dtos - 1) + * @min_posted_rx: (qp_max_recv_dtos >> 2) + * @name: connection peer portal + * @release_work: deffered work for release job + * @state_mutex: protects iser onnection state + * @stop_completion: conn_stop completion + * @ib_completion: RDMA cleanup completion + * @up_completion: connection establishment completed + * (state is ISER_CONN_UP) + * @conn_list: entry in ig conn list + * @login_buf: login data buffer (stores login parameters) + * @login_req_buf: login request buffer + * @login_req_dma: login request buffer dma address + * @login_resp_buf: login response buffer + * @login_resp_dma: login response buffer dma address + * @rx_desc_head: head of rx_descs cyclic buffer + * @rx_descs: rx buffers array (cyclic buffer) + * @num_rx_descs: number of rx descriptors + */ +struct iser_conn { + struct ib_conn ib_conn; + struct iscsi_conn *iscsi_conn; + struct iscsi_endpoint *ep; + enum iser_conn_state state; + unsigned qp_max_recv_dtos; + unsigned qp_max_recv_dtos_mask; + unsigned min_posted_rx; + char name[ISER_OBJECT_NAME_SIZE]; + struct work_struct release_work; + struct mutex state_mutex; + struct completion stop_completion; + struct completion ib_completion; + struct completion up_completion; + struct list_head conn_list; + + char *login_buf; + char *login_req_buf, *login_resp_buf; + u64 login_req_dma, login_resp_dma; + unsigned int rx_desc_head; + struct iser_rx_desc *rx_descs; + u32 num_rx_descs; +}; + +/** + * struct iscsi_iser_task - iser task context + * + * @desc: TX descriptor + * @iser_conn: link to iser connection + * @status: current task status + * @sc: link to scsi command + * @command_sent: indicate if command was sent + * @dir: iser data direction + * @rdma_regd: task rdma registration desc + * @data: iser data buffer desc + * @data_copy: iser data copy buffer desc (bounce buffer) + * @prot: iser protection buffer desc + * @prot_copy: iser protection copy buffer desc (bounce buffer) + */ +struct iscsi_iser_task { + struct iser_tx_desc desc; + struct iser_conn *iser_conn; + enum iser_task_status status; + struct scsi_cmnd *sc; + int command_sent; + int dir[ISER_DIRS_NUM]; + struct iser_regd_buf rdma_regd[ISER_DIRS_NUM]; + struct iser_data_buf data[ISER_DIRS_NUM]; + struct iser_data_buf data_copy[ISER_DIRS_NUM]; + struct iser_data_buf prot[ISER_DIRS_NUM]; + struct iser_data_buf prot_copy[ISER_DIRS_NUM]; +}; + +struct iser_page_vec { + u64 *pages; + int length; + int offset; + int data_size; +}; + +/** + * struct iser_global: iSER global context + * + * @device_list_mutex: protects device_list + * @device_list: iser devices global list + * @connlist_mutex: protects connlist + * @connlist: iser connections global list + * @desc_cache: kmem cache for tx dataout + */ +struct iser_global { + struct mutex device_list_mutex; + struct list_head device_list; + struct mutex connlist_mutex; + struct list_head connlist; + struct kmem_cache *desc_cache; +}; + +extern struct iser_global ig; +extern int iser_debug_level; +extern bool iser_pi_enable; +extern int iser_pi_guard; + +int iser_send_control(struct iscsi_conn *conn, + struct iscsi_task *task); + +int iser_send_command(struct iscsi_conn *conn, + struct iscsi_task *task); + +int iser_send_data_out(struct iscsi_conn *conn, + struct iscsi_task *task, + struct iscsi_data *hdr); + +void iscsi_iser_recv(struct iscsi_conn *conn, + struct iscsi_hdr *hdr, + char *rx_data, + int rx_data_len); + +void iser_conn_init(struct iser_conn *iser_conn); + +void iser_conn_release(struct iser_conn *iser_conn); + +int iser_conn_terminate(struct iser_conn *iser_conn); + +void iser_release_work(struct work_struct *work); + +void iser_rcv_completion(struct iser_rx_desc *desc, + unsigned long dto_xfer_len, + struct ib_conn *ib_conn); + +void iser_snd_completion(struct iser_tx_desc *desc, + struct ib_conn *ib_conn); + +void iser_task_rdma_init(struct iscsi_iser_task *task); + +void iser_task_rdma_finalize(struct iscsi_iser_task *task); + +void iser_free_rx_descriptors(struct iser_conn *iser_conn); + +void iser_finalize_rdma_unaligned_sg(struct iscsi_iser_task *iser_task, + struct iser_data_buf *mem, + struct iser_data_buf *mem_copy, + enum iser_data_dir cmd_dir); + +int iser_reg_rdma_mem_fmr(struct iscsi_iser_task *task, + enum iser_data_dir cmd_dir); +int iser_reg_rdma_mem_fastreg(struct iscsi_iser_task *task, + enum iser_data_dir cmd_dir); + +int iser_connect(struct iser_conn *iser_conn, + struct sockaddr *src_addr, + struct sockaddr *dst_addr, + int non_blocking); + +int iser_reg_page_vec(struct ib_conn *ib_conn, + struct iser_page_vec *page_vec, + struct iser_mem_reg *mem_reg); + +void iser_unreg_mem_fmr(struct iscsi_iser_task *iser_task, + enum iser_data_dir cmd_dir); +void iser_unreg_mem_fastreg(struct iscsi_iser_task *iser_task, + enum iser_data_dir cmd_dir); + +int iser_post_recvl(struct iser_conn *iser_conn); +int iser_post_recvm(struct iser_conn *iser_conn, int count); +int iser_post_send(struct ib_conn *ib_conn, struct iser_tx_desc *tx_desc, + bool signal); + +int iser_dma_map_task_data(struct iscsi_iser_task *iser_task, + struct iser_data_buf *data, + enum iser_data_dir iser_dir, + enum dma_data_direction dma_dir); + +void iser_dma_unmap_task_data(struct iscsi_iser_task *iser_task, + struct iser_data_buf *data); +int iser_initialize_task_headers(struct iscsi_task *task, + struct iser_tx_desc *tx_desc); +int iser_alloc_rx_descriptors(struct iser_conn *iser_conn, + struct iscsi_session *session); +int iser_create_fmr_pool(struct ib_conn *ib_conn, unsigned cmds_max); +void iser_free_fmr_pool(struct ib_conn *ib_conn); +int iser_create_fastreg_pool(struct ib_conn *ib_conn, unsigned cmds_max); +void iser_free_fastreg_pool(struct ib_conn *ib_conn); +u8 iser_check_task_pi_status(struct iscsi_iser_task *iser_task, + enum iser_data_dir cmd_dir, sector_t *sector); +#endif diff --git a/drivers/infiniband/ulp/iser/iser_initiator.c b/drivers/infiniband/ulp/iser/iser_initiator.c new file mode 100644 index 0000000..5a489ea --- /dev/null +++ b/drivers/infiniband/ulp/iser/iser_initiator.c @@ -0,0 +1,732 @@ +/* + * Copyright (c) 2004, 2005, 2006 Voltaire, Inc. All rights reserved. + * Copyright (c) 2013-2014 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include +#include +#include +#include +#include +#include +#include + +#include "iscsi_iser.h" + +/* Register user buffer memory and initialize passive rdma + * dto descriptor. Data size is stored in + * task->data[ISER_DIR_IN].data_len, Protection size + * os stored in task->prot[ISER_DIR_IN].data_len + */ +static int iser_prepare_read_cmd(struct iscsi_task *task) + +{ + struct iscsi_iser_task *iser_task = task->dd_data; + struct iser_device *device = iser_task->iser_conn->ib_conn.device; + struct iser_regd_buf *regd_buf; + int err; + struct iser_hdr *hdr = &iser_task->desc.iser_header; + struct iser_data_buf *buf_in = &iser_task->data[ISER_DIR_IN]; + + err = iser_dma_map_task_data(iser_task, + buf_in, + ISER_DIR_IN, + DMA_FROM_DEVICE); + if (err) + return err; + + if (scsi_prot_sg_count(iser_task->sc)) { + struct iser_data_buf *pbuf_in = &iser_task->prot[ISER_DIR_IN]; + + err = iser_dma_map_task_data(iser_task, + pbuf_in, + ISER_DIR_IN, + DMA_FROM_DEVICE); + if (err) + return err; + } + + err = device->iser_reg_rdma_mem(iser_task, ISER_DIR_IN); + if (err) { + iser_err("Failed to set up Data-IN RDMA\n"); + return err; + } + regd_buf = &iser_task->rdma_regd[ISER_DIR_IN]; + + hdr->flags |= ISER_RSV; + hdr->read_stag = cpu_to_be32(regd_buf->reg.rkey); + hdr->read_va = cpu_to_be64(regd_buf->reg.va); + + iser_dbg("Cmd itt:%d READ tags RKEY:%#.4X VA:%#llX\n", + task->itt, regd_buf->reg.rkey, + (unsigned long long)regd_buf->reg.va); + + return 0; +} + +/* Register user buffer memory and initialize passive rdma + * dto descriptor. Data size is stored in + * task->data[ISER_DIR_OUT].data_len, Protection size + * is stored at task->prot[ISER_DIR_OUT].data_len + */ +static int +iser_prepare_write_cmd(struct iscsi_task *task, + unsigned int imm_sz, + unsigned int unsol_sz, + unsigned int edtl) +{ + struct iscsi_iser_task *iser_task = task->dd_data; + struct iser_device *device = iser_task->iser_conn->ib_conn.device; + struct iser_regd_buf *regd_buf; + int err; + struct iser_hdr *hdr = &iser_task->desc.iser_header; + struct iser_data_buf *buf_out = &iser_task->data[ISER_DIR_OUT]; + struct ib_sge *tx_dsg = &iser_task->desc.tx_sg[1]; + + err = iser_dma_map_task_data(iser_task, + buf_out, + ISER_DIR_OUT, + DMA_TO_DEVICE); + if (err) + return err; + + if (scsi_prot_sg_count(iser_task->sc)) { + struct iser_data_buf *pbuf_out = &iser_task->prot[ISER_DIR_OUT]; + + err = iser_dma_map_task_data(iser_task, + pbuf_out, + ISER_DIR_OUT, + DMA_TO_DEVICE); + if (err) + return err; + } + + err = device->iser_reg_rdma_mem(iser_task, ISER_DIR_OUT); + if (err != 0) { + iser_err("Failed to register write cmd RDMA mem\n"); + return err; + } + + regd_buf = &iser_task->rdma_regd[ISER_DIR_OUT]; + + if (unsol_sz < edtl) { + hdr->flags |= ISER_WSV; + hdr->write_stag = cpu_to_be32(regd_buf->reg.rkey); + hdr->write_va = cpu_to_be64(regd_buf->reg.va + unsol_sz); + + iser_dbg("Cmd itt:%d, WRITE tags, RKEY:%#.4X " + "VA:%#llX + unsol:%d\n", + task->itt, regd_buf->reg.rkey, + (unsigned long long)regd_buf->reg.va, unsol_sz); + } + + if (imm_sz > 0) { + iser_dbg("Cmd itt:%d, WRITE, adding imm.data sz: %d\n", + task->itt, imm_sz); + tx_dsg->addr = regd_buf->reg.va; + tx_dsg->length = imm_sz; + tx_dsg->lkey = regd_buf->reg.lkey; + iser_task->desc.num_sge = 2; + } + + return 0; +} + +/* creates a new tx descriptor and adds header regd buffer */ +static void iser_create_send_desc(struct iser_conn *iser_conn, + struct iser_tx_desc *tx_desc) +{ + struct iser_device *device = iser_conn->ib_conn.device; + + ib_dma_sync_single_for_cpu(device->ib_device, + tx_desc->dma_addr, ISER_HEADERS_LEN, DMA_TO_DEVICE); + + memset(&tx_desc->iser_header, 0, sizeof(struct iser_hdr)); + tx_desc->iser_header.flags = ISER_VER; + + tx_desc->num_sge = 1; + + if (tx_desc->tx_sg[0].lkey != device->mr->lkey) { + tx_desc->tx_sg[0].lkey = device->mr->lkey; + iser_dbg("sdesc %p lkey mismatch, fixing\n", tx_desc); + } +} + +static void iser_free_login_buf(struct iser_conn *iser_conn) +{ + struct iser_device *device = iser_conn->ib_conn.device; + + if (!iser_conn->login_buf) + return; + + if (iser_conn->login_req_dma) + ib_dma_unmap_single(device->ib_device, + iser_conn->login_req_dma, + ISCSI_DEF_MAX_RECV_SEG_LEN, DMA_TO_DEVICE); + + if (iser_conn->login_resp_dma) + ib_dma_unmap_single(device->ib_device, + iser_conn->login_resp_dma, + ISER_RX_LOGIN_SIZE, DMA_FROM_DEVICE); + + kfree(iser_conn->login_buf); + + /* make sure we never redo any unmapping */ + iser_conn->login_req_dma = 0; + iser_conn->login_resp_dma = 0; + iser_conn->login_buf = NULL; +} + +static int iser_alloc_login_buf(struct iser_conn *iser_conn) +{ + struct iser_device *device = iser_conn->ib_conn.device; + int req_err, resp_err; + + BUG_ON(device == NULL); + + iser_conn->login_buf = kmalloc(ISCSI_DEF_MAX_RECV_SEG_LEN + + ISER_RX_LOGIN_SIZE, GFP_KERNEL); + if (!iser_conn->login_buf) + goto out_err; + + iser_conn->login_req_buf = iser_conn->login_buf; + iser_conn->login_resp_buf = iser_conn->login_buf + + ISCSI_DEF_MAX_RECV_SEG_LEN; + + iser_conn->login_req_dma = ib_dma_map_single(device->ib_device, + iser_conn->login_req_buf, + ISCSI_DEF_MAX_RECV_SEG_LEN, + DMA_TO_DEVICE); + + iser_conn->login_resp_dma = ib_dma_map_single(device->ib_device, + iser_conn->login_resp_buf, + ISER_RX_LOGIN_SIZE, + DMA_FROM_DEVICE); + + req_err = ib_dma_mapping_error(device->ib_device, + iser_conn->login_req_dma); + resp_err = ib_dma_mapping_error(device->ib_device, + iser_conn->login_resp_dma); + + if (req_err || resp_err) { + if (req_err) + iser_conn->login_req_dma = 0; + if (resp_err) + iser_conn->login_resp_dma = 0; + goto free_login_buf; + } + return 0; + +free_login_buf: + iser_free_login_buf(iser_conn); + +out_err: + iser_err("unable to alloc or map login buf\n"); + return -ENOMEM; +} + +int iser_alloc_rx_descriptors(struct iser_conn *iser_conn, + struct iscsi_session *session) +{ + int i, j; + u64 dma_addr; + struct iser_rx_desc *rx_desc; + struct ib_sge *rx_sg; + struct ib_conn *ib_conn = &iser_conn->ib_conn; + struct iser_device *device = ib_conn->device; + + iser_conn->qp_max_recv_dtos = session->cmds_max; + iser_conn->qp_max_recv_dtos_mask = session->cmds_max - 1; /* cmds_max is 2^N */ + iser_conn->min_posted_rx = iser_conn->qp_max_recv_dtos >> 2; + + if (device->iser_alloc_rdma_reg_res(ib_conn, session->scsi_cmds_max)) + goto create_rdma_reg_res_failed; + + if (iser_alloc_login_buf(iser_conn)) + goto alloc_login_buf_fail; + + iser_conn->num_rx_descs = session->cmds_max; + iser_conn->rx_descs = kmalloc(iser_conn->num_rx_descs * + sizeof(struct iser_rx_desc), GFP_KERNEL); + if (!iser_conn->rx_descs) + goto rx_desc_alloc_fail; + + rx_desc = iser_conn->rx_descs; + + for (i = 0; i < iser_conn->qp_max_recv_dtos; i++, rx_desc++) { + dma_addr = ib_dma_map_single(device->ib_device, (void *)rx_desc, + ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE); + if (ib_dma_mapping_error(device->ib_device, dma_addr)) + goto rx_desc_dma_map_failed; + + rx_desc->dma_addr = dma_addr; + + rx_sg = &rx_desc->rx_sg; + rx_sg->addr = rx_desc->dma_addr; + rx_sg->length = ISER_RX_PAYLOAD_SIZE; + rx_sg->lkey = device->mr->lkey; + } + + iser_conn->rx_desc_head = 0; + return 0; + +rx_desc_dma_map_failed: + rx_desc = iser_conn->rx_descs; + for (j = 0; j < i; j++, rx_desc++) + ib_dma_unmap_single(device->ib_device, rx_desc->dma_addr, + ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE); + kfree(iser_conn->rx_descs); + iser_conn->rx_descs = NULL; +rx_desc_alloc_fail: + iser_free_login_buf(iser_conn); +alloc_login_buf_fail: + device->iser_free_rdma_reg_res(ib_conn); +create_rdma_reg_res_failed: + iser_err("failed allocating rx descriptors / data buffers\n"); + return -ENOMEM; +} + +void iser_free_rx_descriptors(struct iser_conn *iser_conn) +{ + int i; + struct iser_rx_desc *rx_desc; + struct ib_conn *ib_conn = &iser_conn->ib_conn; + struct iser_device *device = ib_conn->device; + + if (!iser_conn->rx_descs) + goto free_login_buf; + + if (device->iser_free_rdma_reg_res) + device->iser_free_rdma_reg_res(ib_conn); + + rx_desc = iser_conn->rx_descs; + for (i = 0; i < iser_conn->qp_max_recv_dtos; i++, rx_desc++) + ib_dma_unmap_single(device->ib_device, rx_desc->dma_addr, + ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE); + kfree(iser_conn->rx_descs); + /* make sure we never redo any unmapping */ + iser_conn->rx_descs = NULL; + +free_login_buf: + iser_free_login_buf(iser_conn); +} + +static int iser_post_rx_bufs(struct iscsi_conn *conn, struct iscsi_hdr *req) +{ + struct iser_conn *iser_conn = conn->dd_data; + struct ib_conn *ib_conn = &iser_conn->ib_conn; + struct iscsi_session *session = conn->session; + + iser_dbg("req op %x flags %x\n", req->opcode, req->flags); + /* check if this is the last login - going to full feature phase */ + if ((req->flags & ISCSI_FULL_FEATURE_PHASE) != ISCSI_FULL_FEATURE_PHASE) + return 0; + + /* + * Check that there is one posted recv buffer + * (for the last login response). + */ + WARN_ON(ib_conn->post_recv_buf_count != 1); + + if (session->discovery_sess) { + iser_info("Discovery session, re-using login RX buffer\n"); + return 0; + } else + iser_info("Normal session, posting batch of RX %d buffers\n", + iser_conn->min_posted_rx); + + /* Initial post receive buffers */ + if (iser_post_recvm(iser_conn, iser_conn->min_posted_rx)) + return -ENOMEM; + + return 0; +} + +static inline bool iser_signal_comp(int sig_count) +{ + return ((sig_count % ISER_SIGNAL_CMD_COUNT) == 0); +} + +/** + * iser_send_command - send command PDU + */ +int iser_send_command(struct iscsi_conn *conn, + struct iscsi_task *task) +{ + struct iser_conn *iser_conn = conn->dd_data; + struct iscsi_iser_task *iser_task = task->dd_data; + unsigned long edtl; + int err; + struct iser_data_buf *data_buf, *prot_buf; + struct iscsi_scsi_req *hdr = (struct iscsi_scsi_req *)task->hdr; + struct scsi_cmnd *sc = task->sc; + struct iser_tx_desc *tx_desc = &iser_task->desc; + static unsigned sig_count; + + edtl = ntohl(hdr->data_length); + + /* build the tx desc regd header and add it to the tx desc dto */ + tx_desc->type = ISCSI_TX_SCSI_COMMAND; + iser_create_send_desc(iser_conn, tx_desc); + + if (hdr->flags & ISCSI_FLAG_CMD_READ) { + data_buf = &iser_task->data[ISER_DIR_IN]; + prot_buf = &iser_task->prot[ISER_DIR_IN]; + } else { + data_buf = &iser_task->data[ISER_DIR_OUT]; + prot_buf = &iser_task->prot[ISER_DIR_OUT]; + } + + if (scsi_sg_count(sc)) { /* using a scatter list */ + data_buf->buf = scsi_sglist(sc); + data_buf->size = scsi_sg_count(sc); + } + data_buf->data_len = scsi_bufflen(sc); + + if (scsi_prot_sg_count(sc)) { + prot_buf->buf = scsi_prot_sglist(sc); + prot_buf->size = scsi_prot_sg_count(sc); + prot_buf->data_len = data_buf->data_len >> + ilog2(sc->device->sector_size) * 8; + } + + if (hdr->flags & ISCSI_FLAG_CMD_READ) { + err = iser_prepare_read_cmd(task); + if (err) + goto send_command_error; + } + if (hdr->flags & ISCSI_FLAG_CMD_WRITE) { + err = iser_prepare_write_cmd(task, + task->imm_count, + task->imm_count + + task->unsol_r2t.data_length, + edtl); + if (err) + goto send_command_error; + } + + iser_task->status = ISER_TASK_STATUS_STARTED; + + err = iser_post_send(&iser_conn->ib_conn, tx_desc, + iser_signal_comp(++sig_count)); + if (!err) + return 0; + +send_command_error: + iser_err("conn %p failed task->itt %d err %d\n",conn, task->itt, err); + return err; +} + +/** + * iser_send_data_out - send data out PDU + */ +int iser_send_data_out(struct iscsi_conn *conn, + struct iscsi_task *task, + struct iscsi_data *hdr) +{ + struct iser_conn *iser_conn = conn->dd_data; + struct iscsi_iser_task *iser_task = task->dd_data; + struct iser_tx_desc *tx_desc = NULL; + struct iser_regd_buf *regd_buf; + unsigned long buf_offset; + unsigned long data_seg_len; + uint32_t itt; + int err = 0; + struct ib_sge *tx_dsg; + + itt = (__force uint32_t)hdr->itt; + data_seg_len = ntoh24(hdr->dlength); + buf_offset = ntohl(hdr->offset); + + iser_dbg("%s itt %d dseg_len %d offset %d\n", + __func__,(int)itt,(int)data_seg_len,(int)buf_offset); + + tx_desc = kmem_cache_zalloc(ig.desc_cache, GFP_ATOMIC); + if (tx_desc == NULL) { + iser_err("Failed to alloc desc for post dataout\n"); + return -ENOMEM; + } + + tx_desc->type = ISCSI_TX_DATAOUT; + tx_desc->iser_header.flags = ISER_VER; + memcpy(&tx_desc->iscsi_header, hdr, sizeof(struct iscsi_hdr)); + + /* build the tx desc */ + iser_initialize_task_headers(task, tx_desc); + + regd_buf = &iser_task->rdma_regd[ISER_DIR_OUT]; + tx_dsg = &tx_desc->tx_sg[1]; + tx_dsg->addr = regd_buf->reg.va + buf_offset; + tx_dsg->length = data_seg_len; + tx_dsg->lkey = regd_buf->reg.lkey; + tx_desc->num_sge = 2; + + if (buf_offset + data_seg_len > iser_task->data[ISER_DIR_OUT].data_len) { + iser_err("Offset:%ld & DSL:%ld in Data-Out " + "inconsistent with total len:%ld, itt:%d\n", + buf_offset, data_seg_len, + iser_task->data[ISER_DIR_OUT].data_len, itt); + err = -EINVAL; + goto send_data_out_error; + } + iser_dbg("data-out itt: %d, offset: %ld, sz: %ld\n", + itt, buf_offset, data_seg_len); + + + err = iser_post_send(&iser_conn->ib_conn, tx_desc, true); + if (!err) + return 0; + +send_data_out_error: + kmem_cache_free(ig.desc_cache, tx_desc); + iser_err("conn %p failed err %d\n",conn, err); + return err; +} + +int iser_send_control(struct iscsi_conn *conn, + struct iscsi_task *task) +{ + struct iser_conn *iser_conn = conn->dd_data; + struct iscsi_iser_task *iser_task = task->dd_data; + struct iser_tx_desc *mdesc = &iser_task->desc; + unsigned long data_seg_len; + int err = 0; + struct iser_device *device; + + /* build the tx desc regd header and add it to the tx desc dto */ + mdesc->type = ISCSI_TX_CONTROL; + iser_create_send_desc(iser_conn, mdesc); + + device = iser_conn->ib_conn.device; + + data_seg_len = ntoh24(task->hdr->dlength); + + if (data_seg_len > 0) { + struct ib_sge *tx_dsg = &mdesc->tx_sg[1]; + if (task != conn->login_task) { + iser_err("data present on non login task!!!\n"); + goto send_control_error; + } + + ib_dma_sync_single_for_cpu(device->ib_device, + iser_conn->login_req_dma, task->data_count, + DMA_TO_DEVICE); + + memcpy(iser_conn->login_req_buf, task->data, task->data_count); + + ib_dma_sync_single_for_device(device->ib_device, + iser_conn->login_req_dma, task->data_count, + DMA_TO_DEVICE); + + tx_dsg->addr = iser_conn->login_req_dma; + tx_dsg->length = task->data_count; + tx_dsg->lkey = device->mr->lkey; + mdesc->num_sge = 2; + } + + if (task == conn->login_task) { + iser_dbg("op %x dsl %lx, posting login rx buffer\n", + task->hdr->opcode, data_seg_len); + err = iser_post_recvl(iser_conn); + if (err) + goto send_control_error; + err = iser_post_rx_bufs(conn, task->hdr); + if (err) + goto send_control_error; + } + + err = iser_post_send(&iser_conn->ib_conn, mdesc, true); + if (!err) + return 0; + +send_control_error: + iser_err("conn %p failed err %d\n",conn, err); + return err; +} + +/** + * iser_rcv_dto_completion - recv DTO completion + */ +void iser_rcv_completion(struct iser_rx_desc *rx_desc, + unsigned long rx_xfer_len, + struct ib_conn *ib_conn) +{ + struct iser_conn *iser_conn = container_of(ib_conn, struct iser_conn, + ib_conn); + struct iscsi_hdr *hdr; + u64 rx_dma; + int rx_buflen, outstanding, count, err; + + /* differentiate between login to all other PDUs */ + if ((char *)rx_desc == iser_conn->login_resp_buf) { + rx_dma = iser_conn->login_resp_dma; + rx_buflen = ISER_RX_LOGIN_SIZE; + } else { + rx_dma = rx_desc->dma_addr; + rx_buflen = ISER_RX_PAYLOAD_SIZE; + } + + ib_dma_sync_single_for_cpu(ib_conn->device->ib_device, rx_dma, + rx_buflen, DMA_FROM_DEVICE); + + hdr = &rx_desc->iscsi_header; + + iser_dbg("op 0x%x itt 0x%x dlen %d\n", hdr->opcode, + hdr->itt, (int)(rx_xfer_len - ISER_HEADERS_LEN)); + + iscsi_iser_recv(iser_conn->iscsi_conn, hdr, rx_desc->data, + rx_xfer_len - ISER_HEADERS_LEN); + + ib_dma_sync_single_for_device(ib_conn->device->ib_device, rx_dma, + rx_buflen, DMA_FROM_DEVICE); + + /* decrementing conn->post_recv_buf_count only --after-- freeing the * + * task eliminates the need to worry on tasks which are completed in * + * parallel to the execution of iser_conn_term. So the code that waits * + * for the posted rx bufs refcount to become zero handles everything */ + ib_conn->post_recv_buf_count--; + + if (rx_dma == iser_conn->login_resp_dma) + return; + + outstanding = ib_conn->post_recv_buf_count; + if (outstanding + iser_conn->min_posted_rx <= iser_conn->qp_max_recv_dtos) { + count = min(iser_conn->qp_max_recv_dtos - outstanding, + iser_conn->min_posted_rx); + err = iser_post_recvm(iser_conn, count); + if (err) + iser_err("posting %d rx bufs err %d\n", count, err); + } +} + +void iser_snd_completion(struct iser_tx_desc *tx_desc, + struct ib_conn *ib_conn) +{ + struct iscsi_task *task; + struct iser_device *device = ib_conn->device; + + if (tx_desc->type == ISCSI_TX_DATAOUT) { + ib_dma_unmap_single(device->ib_device, tx_desc->dma_addr, + ISER_HEADERS_LEN, DMA_TO_DEVICE); + kmem_cache_free(ig.desc_cache, tx_desc); + tx_desc = NULL; + } + + if (tx_desc && tx_desc->type == ISCSI_TX_CONTROL) { + /* this arithmetic is legal by libiscsi dd_data allocation */ + task = (void *) ((long)(void *)tx_desc - + sizeof(struct iscsi_task)); + if (task->hdr->itt == RESERVED_ITT) + iscsi_put_task(task); + } +} + +void iser_task_rdma_init(struct iscsi_iser_task *iser_task) + +{ + iser_task->status = ISER_TASK_STATUS_INIT; + + iser_task->dir[ISER_DIR_IN] = 0; + iser_task->dir[ISER_DIR_OUT] = 0; + + iser_task->data[ISER_DIR_IN].data_len = 0; + iser_task->data[ISER_DIR_OUT].data_len = 0; + + iser_task->prot[ISER_DIR_IN].data_len = 0; + iser_task->prot[ISER_DIR_OUT].data_len = 0; + + memset(&iser_task->rdma_regd[ISER_DIR_IN], 0, + sizeof(struct iser_regd_buf)); + memset(&iser_task->rdma_regd[ISER_DIR_OUT], 0, + sizeof(struct iser_regd_buf)); +} + +void iser_task_rdma_finalize(struct iscsi_iser_task *iser_task) +{ + struct iser_device *device = iser_task->iser_conn->ib_conn.device; + int is_rdma_data_aligned = 1; + int is_rdma_prot_aligned = 1; + int prot_count = scsi_prot_sg_count(iser_task->sc); + + /* if we were reading, copy back to unaligned sglist, + * anyway dma_unmap and free the copy + */ + if (iser_task->data_copy[ISER_DIR_IN].copy_buf != NULL) { + is_rdma_data_aligned = 0; + iser_finalize_rdma_unaligned_sg(iser_task, + &iser_task->data[ISER_DIR_IN], + &iser_task->data_copy[ISER_DIR_IN], + ISER_DIR_IN); + } + + if (iser_task->data_copy[ISER_DIR_OUT].copy_buf != NULL) { + is_rdma_data_aligned = 0; + iser_finalize_rdma_unaligned_sg(iser_task, + &iser_task->data[ISER_DIR_OUT], + &iser_task->data_copy[ISER_DIR_OUT], + ISER_DIR_OUT); + } + + if (iser_task->prot_copy[ISER_DIR_IN].copy_buf != NULL) { + is_rdma_prot_aligned = 0; + iser_finalize_rdma_unaligned_sg(iser_task, + &iser_task->prot[ISER_DIR_IN], + &iser_task->prot_copy[ISER_DIR_IN], + ISER_DIR_IN); + } + + if (iser_task->prot_copy[ISER_DIR_OUT].copy_buf != NULL) { + is_rdma_prot_aligned = 0; + iser_finalize_rdma_unaligned_sg(iser_task, + &iser_task->prot[ISER_DIR_OUT], + &iser_task->prot_copy[ISER_DIR_OUT], + ISER_DIR_OUT); + } + + if (iser_task->dir[ISER_DIR_IN]) { + device->iser_unreg_rdma_mem(iser_task, ISER_DIR_IN); + if (is_rdma_data_aligned) + iser_dma_unmap_task_data(iser_task, + &iser_task->data[ISER_DIR_IN]); + if (prot_count && is_rdma_prot_aligned) + iser_dma_unmap_task_data(iser_task, + &iser_task->prot[ISER_DIR_IN]); + } + + if (iser_task->dir[ISER_DIR_OUT]) { + device->iser_unreg_rdma_mem(iser_task, ISER_DIR_OUT); + if (is_rdma_data_aligned) + iser_dma_unmap_task_data(iser_task, + &iser_task->data[ISER_DIR_OUT]); + if (prot_count && is_rdma_prot_aligned) + iser_dma_unmap_task_data(iser_task, + &iser_task->prot[ISER_DIR_OUT]); + } +} diff --git a/drivers/infiniband/ulp/iser/iser_memory.c b/drivers/infiniband/ulp/iser/iser_memory.c new file mode 100644 index 0000000..6c5ce35 --- /dev/null +++ b/drivers/infiniband/ulp/iser/iser_memory.c @@ -0,0 +1,797 @@ +/* + * Copyright (c) 2004, 2005, 2006 Voltaire, Inc. All rights reserved. + * Copyright (c) 2013-2014 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include +#include +#include +#include +#include +#include + +#include "iscsi_iser.h" + +#define ISER_KMALLOC_THRESHOLD 0x20000 /* 128K - kmalloc limit */ + +/** + * iser_start_rdma_unaligned_sg + */ +static int iser_start_rdma_unaligned_sg(struct iscsi_iser_task *iser_task, + struct iser_data_buf *data, + struct iser_data_buf *data_copy, + enum iser_data_dir cmd_dir) +{ + struct ib_device *dev = iser_task->iser_conn->ib_conn.device->ib_device; + struct scatterlist *sgl = (struct scatterlist *)data->buf; + struct scatterlist *sg; + char *mem = NULL; + unsigned long cmd_data_len = 0; + int dma_nents, i; + + for_each_sg(sgl, sg, data->size, i) + cmd_data_len += ib_sg_dma_len(dev, sg); + + if (cmd_data_len > ISER_KMALLOC_THRESHOLD) + mem = (void *)__get_free_pages(GFP_ATOMIC, + ilog2(roundup_pow_of_two(cmd_data_len)) - PAGE_SHIFT); + else + mem = kmalloc(cmd_data_len, GFP_ATOMIC); + + if (mem == NULL) { + iser_err("Failed to allocate mem size %d %d for copying sglist\n", + data->size, (int)cmd_data_len); + return -ENOMEM; + } + + if (cmd_dir == ISER_DIR_OUT) { + /* copy the unaligned sg the buffer which is used for RDMA */ + int i; + char *p, *from; + + sgl = (struct scatterlist *)data->buf; + p = mem; + for_each_sg(sgl, sg, data->size, i) { + from = kmap_atomic(sg_page(sg)); + memcpy(p, + from + sg->offset, + sg->length); + kunmap_atomic(from); + p += sg->length; + } + } + + sg_init_one(&data_copy->sg_single, mem, cmd_data_len); + data_copy->buf = &data_copy->sg_single; + data_copy->size = 1; + data_copy->copy_buf = mem; + + dma_nents = ib_dma_map_sg(dev, &data_copy->sg_single, 1, + (cmd_dir == ISER_DIR_OUT) ? + DMA_TO_DEVICE : DMA_FROM_DEVICE); + BUG_ON(dma_nents == 0); + + data_copy->dma_nents = dma_nents; + data_copy->data_len = cmd_data_len; + + return 0; +} + +/** + * iser_finalize_rdma_unaligned_sg + */ + +void iser_finalize_rdma_unaligned_sg(struct iscsi_iser_task *iser_task, + struct iser_data_buf *data, + struct iser_data_buf *data_copy, + enum iser_data_dir cmd_dir) +{ + struct ib_device *dev; + unsigned long cmd_data_len; + + dev = iser_task->iser_conn->ib_conn.device->ib_device; + + ib_dma_unmap_sg(dev, &data_copy->sg_single, 1, + (cmd_dir == ISER_DIR_OUT) ? + DMA_TO_DEVICE : DMA_FROM_DEVICE); + + if (cmd_dir == ISER_DIR_IN) { + char *mem; + struct scatterlist *sgl, *sg; + unsigned char *p, *to; + unsigned int sg_size; + int i; + + /* copy back read RDMA to unaligned sg */ + mem = data_copy->copy_buf; + + sgl = (struct scatterlist *)data->buf; + sg_size = data->size; + + p = mem; + for_each_sg(sgl, sg, sg_size, i) { + to = kmap_atomic(sg_page(sg)); + memcpy(to + sg->offset, + p, + sg->length); + kunmap_atomic(to); + p += sg->length; + } + } + + cmd_data_len = data->data_len; + + if (cmd_data_len > ISER_KMALLOC_THRESHOLD) + free_pages((unsigned long)data_copy->copy_buf, + ilog2(roundup_pow_of_two(cmd_data_len)) - PAGE_SHIFT); + else + kfree(data_copy->copy_buf); + + data_copy->copy_buf = NULL; +} + +#define IS_4K_ALIGNED(addr) ((((unsigned long)addr) & ~MASK_4K) == 0) + +/** + * iser_sg_to_page_vec - Translates scatterlist entries to physical addresses + * and returns the length of resulting physical address array (may be less than + * the original due to possible compaction). + * + * we build a "page vec" under the assumption that the SG meets the RDMA + * alignment requirements. Other then the first and last SG elements, all + * the "internal" elements can be compacted into a list whose elements are + * dma addresses of physical pages. The code supports also the weird case + * where --few fragments of the same page-- are present in the SG as + * consecutive elements. Also, it handles one entry SG. + */ + +static int iser_sg_to_page_vec(struct iser_data_buf *data, + struct ib_device *ibdev, u64 *pages, + int *offset, int *data_size) +{ + struct scatterlist *sg, *sgl = (struct scatterlist *)data->buf; + u64 start_addr, end_addr, page, chunk_start = 0; + unsigned long total_sz = 0; + unsigned int dma_len; + int i, new_chunk, cur_page, last_ent = data->dma_nents - 1; + + /* compute the offset of first element */ + *offset = (u64) sgl[0].offset & ~MASK_4K; + + new_chunk = 1; + cur_page = 0; + for_each_sg(sgl, sg, data->dma_nents, i) { + start_addr = ib_sg_dma_address(ibdev, sg); + if (new_chunk) + chunk_start = start_addr; + dma_len = ib_sg_dma_len(ibdev, sg); + end_addr = start_addr + dma_len; + total_sz += dma_len; + + /* collect page fragments until aligned or end of SG list */ + if (!IS_4K_ALIGNED(end_addr) && i < last_ent) { + new_chunk = 0; + continue; + } + new_chunk = 1; + + /* address of the first page in the contiguous chunk; + masking relevant for the very first SG entry, + which might be unaligned */ + page = chunk_start & MASK_4K; + do { + pages[cur_page++] = page; + page += SIZE_4K; + } while (page < end_addr); + } + + *data_size = total_sz; + iser_dbg("page_vec->data_size:%d cur_page %d\n", + *data_size, cur_page); + return cur_page; +} + + +/** + * iser_data_buf_aligned_len - Tries to determine the maximal correctly aligned + * for RDMA sub-list of a scatter-gather list of memory buffers, and returns + * the number of entries which are aligned correctly. Supports the case where + * consecutive SG elements are actually fragments of the same physcial page. + */ +static int iser_data_buf_aligned_len(struct iser_data_buf *data, + struct ib_device *ibdev) +{ + struct scatterlist *sgl, *sg, *next_sg = NULL; + u64 start_addr, end_addr; + int i, ret_len, start_check = 0; + + if (data->dma_nents == 1) + return 1; + + sgl = (struct scatterlist *)data->buf; + start_addr = ib_sg_dma_address(ibdev, sgl); + + for_each_sg(sgl, sg, data->dma_nents, i) { + if (start_check && !IS_4K_ALIGNED(start_addr)) + break; + + next_sg = sg_next(sg); + if (!next_sg) + break; + + end_addr = start_addr + ib_sg_dma_len(ibdev, sg); + start_addr = ib_sg_dma_address(ibdev, next_sg); + + if (end_addr == start_addr) { + start_check = 0; + continue; + } else + start_check = 1; + + if (!IS_4K_ALIGNED(end_addr)) + break; + } + ret_len = (next_sg) ? i : i+1; + iser_dbg("Found %d aligned entries out of %d in sg:0x%p\n", + ret_len, data->dma_nents, data); + return ret_len; +} + +static void iser_data_buf_dump(struct iser_data_buf *data, + struct ib_device *ibdev) +{ + struct scatterlist *sgl = (struct scatterlist *)data->buf; + struct scatterlist *sg; + int i; + + for_each_sg(sgl, sg, data->dma_nents, i) + iser_dbg("sg[%d] dma_addr:0x%lX page:0x%p " + "off:0x%x sz:0x%x dma_len:0x%x\n", + i, (unsigned long)ib_sg_dma_address(ibdev, sg), + sg_page(sg), sg->offset, + sg->length, ib_sg_dma_len(ibdev, sg)); +} + +static void iser_dump_page_vec(struct iser_page_vec *page_vec) +{ + int i; + + iser_err("page vec length %d data size %d\n", + page_vec->length, page_vec->data_size); + for (i = 0; i < page_vec->length; i++) + iser_err("%d %lx\n",i,(unsigned long)page_vec->pages[i]); +} + +static void iser_page_vec_build(struct iser_data_buf *data, + struct iser_page_vec *page_vec, + struct ib_device *ibdev) +{ + int page_vec_len = 0; + + page_vec->length = 0; + page_vec->offset = 0; + + iser_dbg("Translating sg sz: %d\n", data->dma_nents); + page_vec_len = iser_sg_to_page_vec(data, ibdev, page_vec->pages, + &page_vec->offset, + &page_vec->data_size); + iser_dbg("sg len %d page_vec_len %d\n", data->dma_nents, page_vec_len); + + page_vec->length = page_vec_len; + + if (page_vec_len * SIZE_4K < page_vec->data_size) { + iser_err("page_vec too short to hold this SG\n"); + iser_data_buf_dump(data, ibdev); + iser_dump_page_vec(page_vec); + BUG(); + } +} + +int iser_dma_map_task_data(struct iscsi_iser_task *iser_task, + struct iser_data_buf *data, + enum iser_data_dir iser_dir, + enum dma_data_direction dma_dir) +{ + struct ib_device *dev; + + iser_task->dir[iser_dir] = 1; + dev = iser_task->iser_conn->ib_conn.device->ib_device; + + data->dma_nents = ib_dma_map_sg(dev, data->buf, data->size, dma_dir); + if (data->dma_nents == 0) { + iser_err("dma_map_sg failed!!!\n"); + return -EINVAL; + } + return 0; +} + +void iser_dma_unmap_task_data(struct iscsi_iser_task *iser_task, + struct iser_data_buf *data) +{ + struct ib_device *dev; + + dev = iser_task->iser_conn->ib_conn.device->ib_device; + ib_dma_unmap_sg(dev, data->buf, data->size, DMA_FROM_DEVICE); +} + +static int fall_to_bounce_buf(struct iscsi_iser_task *iser_task, + struct ib_device *ibdev, + struct iser_data_buf *mem, + struct iser_data_buf *mem_copy, + enum iser_data_dir cmd_dir, + int aligned_len) +{ + struct iscsi_conn *iscsi_conn = iser_task->iser_conn->iscsi_conn; + + iscsi_conn->fmr_unalign_cnt++; + iser_warn("rdma alignment violation (%d/%d aligned) or FMR not supported\n", + aligned_len, mem->size); + + if (iser_debug_level > 0) + iser_data_buf_dump(mem, ibdev); + + /* unmap the command data before accessing it */ + iser_dma_unmap_task_data(iser_task, mem); + + /* allocate copy buf, if we are writing, copy the */ + /* unaligned scatterlist, dma map the copy */ + if (iser_start_rdma_unaligned_sg(iser_task, mem, mem_copy, cmd_dir) != 0) + return -ENOMEM; + + return 0; +} + +/** + * iser_reg_rdma_mem_fmr - Registers memory intended for RDMA, + * using FMR (if possible) obtaining rkey and va + * + * returns 0 on success, errno code on failure + */ +int iser_reg_rdma_mem_fmr(struct iscsi_iser_task *iser_task, + enum iser_data_dir cmd_dir) +{ + struct ib_conn *ib_conn = &iser_task->iser_conn->ib_conn; + struct iser_device *device = ib_conn->device; + struct ib_device *ibdev = device->ib_device; + struct iser_data_buf *mem = &iser_task->data[cmd_dir]; + struct iser_regd_buf *regd_buf; + int aligned_len; + int err; + int i; + struct scatterlist *sg; + + regd_buf = &iser_task->rdma_regd[cmd_dir]; + + aligned_len = iser_data_buf_aligned_len(mem, ibdev); + if (aligned_len != mem->dma_nents) { + err = fall_to_bounce_buf(iser_task, ibdev, mem, + &iser_task->data_copy[cmd_dir], + cmd_dir, aligned_len); + if (err) { + iser_err("failed to allocate bounce buffer\n"); + return err; + } + mem = &iser_task->data_copy[cmd_dir]; + } + + /* if there a single dma entry, FMR is not needed */ + if (mem->dma_nents == 1) { + sg = (struct scatterlist *)mem->buf; + + regd_buf->reg.lkey = device->mr->lkey; + regd_buf->reg.rkey = device->mr->rkey; + regd_buf->reg.len = ib_sg_dma_len(ibdev, &sg[0]); + regd_buf->reg.va = ib_sg_dma_address(ibdev, &sg[0]); + regd_buf->reg.is_mr = 0; + + iser_dbg("PHYSICAL Mem.register: lkey: 0x%08X rkey: 0x%08X " + "va: 0x%08lX sz: %ld]\n", + (unsigned int)regd_buf->reg.lkey, + (unsigned int)regd_buf->reg.rkey, + (unsigned long)regd_buf->reg.va, + (unsigned long)regd_buf->reg.len); + } else { /* use FMR for multiple dma entries */ + iser_page_vec_build(mem, ib_conn->fmr.page_vec, ibdev); + err = iser_reg_page_vec(ib_conn, ib_conn->fmr.page_vec, + ®d_buf->reg); + if (err && err != -EAGAIN) { + iser_data_buf_dump(mem, ibdev); + iser_err("mem->dma_nents = %d (dlength = 0x%x)\n", + mem->dma_nents, + ntoh24(iser_task->desc.iscsi_header.dlength)); + iser_err("page_vec: data_size = 0x%x, length = %d, offset = 0x%x\n", + ib_conn->fmr.page_vec->data_size, + ib_conn->fmr.page_vec->length, + ib_conn->fmr.page_vec->offset); + for (i = 0; i < ib_conn->fmr.page_vec->length; i++) + iser_err("page_vec[%d] = 0x%llx\n", i, + (unsigned long long)ib_conn->fmr.page_vec->pages[i]); + } + if (err) + return err; + } + return 0; +} + +static inline void +iser_set_dif_domain(struct scsi_cmnd *sc, struct ib_sig_attrs *sig_attrs, + struct ib_sig_domain *domain) +{ + domain->sig_type = IB_SIG_TYPE_T10_DIF; + domain->sig.dif.pi_interval = sc->device->sector_size; + domain->sig.dif.ref_tag = scsi_get_lba(sc) & 0xffffffff; + /* + * At the moment we hard code those, but in the future + * we will take them from sc. + */ + domain->sig.dif.apptag_check_mask = 0xffff; + domain->sig.dif.app_escape = true; + domain->sig.dif.ref_escape = true; + if (scsi_get_prot_type(sc) == SCSI_PROT_DIF_TYPE1 || + scsi_get_prot_type(sc) == SCSI_PROT_DIF_TYPE2) + domain->sig.dif.ref_remap = true; +}; + +static int +iser_set_sig_attrs(struct scsi_cmnd *sc, struct ib_sig_attrs *sig_attrs) +{ + switch (scsi_get_prot_op(sc)) { + case SCSI_PROT_WRITE_INSERT: + case SCSI_PROT_READ_STRIP: + sig_attrs->mem.sig_type = IB_SIG_TYPE_NONE; + iser_set_dif_domain(sc, sig_attrs, &sig_attrs->wire); + sig_attrs->wire.sig.dif.bg_type = IB_T10DIF_CRC; + break; + case SCSI_PROT_READ_INSERT: + case SCSI_PROT_WRITE_STRIP: + sig_attrs->wire.sig_type = IB_SIG_TYPE_NONE; + iser_set_dif_domain(sc, sig_attrs, &sig_attrs->mem); + /* + * At the moment we use this modparam to tell what is + * the memory bg_type, in the future we will take it + * from sc. + */ + sig_attrs->mem.sig.dif.bg_type = iser_pi_guard ? IB_T10DIF_CSUM : + IB_T10DIF_CRC; + break; + case SCSI_PROT_READ_PASS: + case SCSI_PROT_WRITE_PASS: + iser_set_dif_domain(sc, sig_attrs, &sig_attrs->wire); + sig_attrs->wire.sig.dif.bg_type = IB_T10DIF_CRC; + iser_set_dif_domain(sc, sig_attrs, &sig_attrs->mem); + /* + * At the moment we use this modparam to tell what is + * the memory bg_type, in the future we will take it + * from sc. + */ + sig_attrs->mem.sig.dif.bg_type = iser_pi_guard ? IB_T10DIF_CSUM : + IB_T10DIF_CRC; + break; + default: + iser_err("Unsupported PI operation %d\n", + scsi_get_prot_op(sc)); + return -EINVAL; + } + + return 0; +} + +static int +iser_set_prot_checks(struct scsi_cmnd *sc, u8 *mask) +{ + switch (scsi_get_prot_type(sc)) { + case SCSI_PROT_DIF_TYPE0: + break; + case SCSI_PROT_DIF_TYPE1: + case SCSI_PROT_DIF_TYPE2: + *mask = ISER_CHECK_GUARD | ISER_CHECK_REFTAG; + break; + case SCSI_PROT_DIF_TYPE3: + *mask = ISER_CHECK_GUARD; + break; + default: + iser_err("Unsupported protection type %d\n", + scsi_get_prot_type(sc)); + return -EINVAL; + } + + return 0; +} + +static int +iser_reg_sig_mr(struct iscsi_iser_task *iser_task, + struct fast_reg_descriptor *desc, struct ib_sge *data_sge, + struct ib_sge *prot_sge, struct ib_sge *sig_sge) +{ + struct ib_conn *ib_conn = &iser_task->iser_conn->ib_conn; + struct iser_pi_context *pi_ctx = desc->pi_ctx; + struct ib_send_wr sig_wr, inv_wr; + struct ib_send_wr *bad_wr, *wr = NULL; + struct ib_sig_attrs sig_attrs; + int ret; + u32 key; + + memset(&sig_attrs, 0, sizeof(sig_attrs)); + ret = iser_set_sig_attrs(iser_task->sc, &sig_attrs); + if (ret) + goto err; + + ret = iser_set_prot_checks(iser_task->sc, &sig_attrs.check_mask); + if (ret) + goto err; + + if (!(desc->reg_indicators & ISER_SIG_KEY_VALID)) { + memset(&inv_wr, 0, sizeof(inv_wr)); + inv_wr.opcode = IB_WR_LOCAL_INV; + inv_wr.wr_id = ISER_FASTREG_LI_WRID; + inv_wr.ex.invalidate_rkey = pi_ctx->sig_mr->rkey; + wr = &inv_wr; + /* Bump the key */ + key = (u8)(pi_ctx->sig_mr->rkey & 0x000000FF); + ib_update_fast_reg_key(pi_ctx->sig_mr, ++key); + } + + memset(&sig_wr, 0, sizeof(sig_wr)); + sig_wr.opcode = IB_WR_REG_SIG_MR; + sig_wr.wr_id = ISER_FASTREG_LI_WRID; + sig_wr.sg_list = data_sge; + sig_wr.num_sge = 1; + sig_wr.wr.sig_handover.sig_attrs = &sig_attrs; + sig_wr.wr.sig_handover.sig_mr = pi_ctx->sig_mr; + if (scsi_prot_sg_count(iser_task->sc)) + sig_wr.wr.sig_handover.prot = prot_sge; + sig_wr.wr.sig_handover.access_flags = IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_READ | + IB_ACCESS_REMOTE_WRITE; + + if (!wr) + wr = &sig_wr; + else + wr->next = &sig_wr; + + ret = ib_post_send(ib_conn->qp, wr, &bad_wr); + if (ret) { + iser_err("reg_sig_mr failed, ret:%d\n", ret); + goto err; + } + desc->reg_indicators &= ~ISER_SIG_KEY_VALID; + + sig_sge->lkey = pi_ctx->sig_mr->lkey; + sig_sge->addr = 0; + sig_sge->length = data_sge->length + prot_sge->length; + if (scsi_get_prot_op(iser_task->sc) == SCSI_PROT_WRITE_INSERT || + scsi_get_prot_op(iser_task->sc) == SCSI_PROT_READ_STRIP) { + sig_sge->length += (data_sge->length / + iser_task->sc->device->sector_size) * 8; + } + + iser_dbg("sig_sge: addr: 0x%llx length: %u lkey: 0x%x\n", + sig_sge->addr, sig_sge->length, + sig_sge->lkey); +err: + return ret; +} + +static int iser_fast_reg_mr(struct iscsi_iser_task *iser_task, + struct iser_regd_buf *regd_buf, + struct iser_data_buf *mem, + enum iser_reg_indicator ind, + struct ib_sge *sge) +{ + struct fast_reg_descriptor *desc = regd_buf->reg.mem_h; + struct ib_conn *ib_conn = &iser_task->iser_conn->ib_conn; + struct iser_device *device = ib_conn->device; + struct ib_device *ibdev = device->ib_device; + struct ib_mr *mr; + struct ib_fast_reg_page_list *frpl; + struct ib_send_wr fastreg_wr, inv_wr; + struct ib_send_wr *bad_wr, *wr = NULL; + u8 key; + int ret, offset, size, plen; + + /* if there a single dma entry, dma mr suffices */ + if (mem->dma_nents == 1) { + struct scatterlist *sg = (struct scatterlist *)mem->buf; + + sge->lkey = device->mr->lkey; + sge->addr = ib_sg_dma_address(ibdev, &sg[0]); + sge->length = ib_sg_dma_len(ibdev, &sg[0]); + + iser_dbg("Single DMA entry: lkey=0x%x, addr=0x%llx, length=0x%x\n", + sge->lkey, sge->addr, sge->length); + return 0; + } + + if (ind == ISER_DATA_KEY_VALID) { + mr = desc->data_mr; + frpl = desc->data_frpl; + } else { + mr = desc->pi_ctx->prot_mr; + frpl = desc->pi_ctx->prot_frpl; + } + + plen = iser_sg_to_page_vec(mem, device->ib_device, frpl->page_list, + &offset, &size); + if (plen * SIZE_4K < size) { + iser_err("fast reg page_list too short to hold this SG\n"); + return -EINVAL; + } + + if (!(desc->reg_indicators & ind)) { + memset(&inv_wr, 0, sizeof(inv_wr)); + inv_wr.wr_id = ISER_FASTREG_LI_WRID; + inv_wr.opcode = IB_WR_LOCAL_INV; + inv_wr.ex.invalidate_rkey = mr->rkey; + wr = &inv_wr; + /* Bump the key */ + key = (u8)(mr->rkey & 0x000000FF); + ib_update_fast_reg_key(mr, ++key); + } + + /* Prepare FASTREG WR */ + memset(&fastreg_wr, 0, sizeof(fastreg_wr)); + fastreg_wr.wr_id = ISER_FASTREG_LI_WRID; + fastreg_wr.opcode = IB_WR_FAST_REG_MR; + fastreg_wr.wr.fast_reg.iova_start = frpl->page_list[0] + offset; + fastreg_wr.wr.fast_reg.page_list = frpl; + fastreg_wr.wr.fast_reg.page_list_len = plen; + fastreg_wr.wr.fast_reg.page_shift = SHIFT_4K; + fastreg_wr.wr.fast_reg.length = size; + fastreg_wr.wr.fast_reg.rkey = mr->rkey; + fastreg_wr.wr.fast_reg.access_flags = (IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_WRITE | + IB_ACCESS_REMOTE_READ); + + if (!wr) + wr = &fastreg_wr; + else + wr->next = &fastreg_wr; + + ret = ib_post_send(ib_conn->qp, wr, &bad_wr); + if (ret) { + iser_err("fast registration failed, ret:%d\n", ret); + return ret; + } + desc->reg_indicators &= ~ind; + + sge->lkey = mr->lkey; + sge->addr = frpl->page_list[0] + offset; + sge->length = size; + + return ret; +} + +/** + * iser_reg_rdma_mem_fastreg - Registers memory intended for RDMA, + * using Fast Registration WR (if possible) obtaining rkey and va + * + * returns 0 on success, errno code on failure + */ +int iser_reg_rdma_mem_fastreg(struct iscsi_iser_task *iser_task, + enum iser_data_dir cmd_dir) +{ + struct ib_conn *ib_conn = &iser_task->iser_conn->ib_conn; + struct iser_device *device = ib_conn->device; + struct ib_device *ibdev = device->ib_device; + struct iser_data_buf *mem = &iser_task->data[cmd_dir]; + struct iser_regd_buf *regd_buf = &iser_task->rdma_regd[cmd_dir]; + struct fast_reg_descriptor *desc = NULL; + struct ib_sge data_sge; + int err, aligned_len; + unsigned long flags; + + aligned_len = iser_data_buf_aligned_len(mem, ibdev); + if (aligned_len != mem->dma_nents) { + err = fall_to_bounce_buf(iser_task, ibdev, mem, + &iser_task->data_copy[cmd_dir], + cmd_dir, aligned_len); + if (err) { + iser_err("failed to allocate bounce buffer\n"); + return err; + } + mem = &iser_task->data_copy[cmd_dir]; + } + + if (mem->dma_nents != 1 || + scsi_get_prot_op(iser_task->sc) != SCSI_PROT_NORMAL) { + spin_lock_irqsave(&ib_conn->lock, flags); + desc = list_first_entry(&ib_conn->fastreg.pool, + struct fast_reg_descriptor, list); + list_del(&desc->list); + spin_unlock_irqrestore(&ib_conn->lock, flags); + regd_buf->reg.mem_h = desc; + } + + err = iser_fast_reg_mr(iser_task, regd_buf, mem, + ISER_DATA_KEY_VALID, &data_sge); + if (err) + goto err_reg; + + if (scsi_get_prot_op(iser_task->sc) != SCSI_PROT_NORMAL) { + struct ib_sge prot_sge, sig_sge; + + memset(&prot_sge, 0, sizeof(prot_sge)); + if (scsi_prot_sg_count(iser_task->sc)) { + mem = &iser_task->prot[cmd_dir]; + aligned_len = iser_data_buf_aligned_len(mem, ibdev); + if (aligned_len != mem->dma_nents) { + err = fall_to_bounce_buf(iser_task, ibdev, mem, + &iser_task->prot_copy[cmd_dir], + cmd_dir, aligned_len); + if (err) { + iser_err("failed to allocate bounce buffer\n"); + return err; + } + mem = &iser_task->prot_copy[cmd_dir]; + } + + err = iser_fast_reg_mr(iser_task, regd_buf, mem, + ISER_PROT_KEY_VALID, &prot_sge); + if (err) + goto err_reg; + } + + err = iser_reg_sig_mr(iser_task, desc, &data_sge, + &prot_sge, &sig_sge); + if (err) { + iser_err("Failed to register signature mr\n"); + return err; + } + desc->reg_indicators |= ISER_FASTREG_PROTECTED; + + regd_buf->reg.lkey = sig_sge.lkey; + regd_buf->reg.rkey = desc->pi_ctx->sig_mr->rkey; + regd_buf->reg.va = sig_sge.addr; + regd_buf->reg.len = sig_sge.length; + regd_buf->reg.is_mr = 1; + } else { + if (desc) { + regd_buf->reg.rkey = desc->data_mr->rkey; + regd_buf->reg.is_mr = 1; + } else { + regd_buf->reg.rkey = device->mr->rkey; + regd_buf->reg.is_mr = 0; + } + + regd_buf->reg.lkey = data_sge.lkey; + regd_buf->reg.va = data_sge.addr; + regd_buf->reg.len = data_sge.length; + } + + return 0; +err_reg: + if (desc) { + spin_lock_irqsave(&ib_conn->lock, flags); + list_add_tail(&desc->list, &ib_conn->fastreg.pool); + spin_unlock_irqrestore(&ib_conn->lock, flags); + } + + return err; +} diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c new file mode 100644 index 0000000..67225bb --- /dev/null +++ b/drivers/infiniband/ulp/iser/iser_verbs.c @@ -0,0 +1,1310 @@ +/* + * Copyright (c) 2004, 2005, 2006 Voltaire, Inc. All rights reserved. + * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved. + * Copyright (c) 2013-2014 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include +#include +#include +#include + +#include "iscsi_iser.h" + +#define ISCSI_ISER_MAX_CONN 8 +#define ISER_MAX_RX_LEN (ISER_QP_MAX_RECV_DTOS * ISCSI_ISER_MAX_CONN) +#define ISER_MAX_TX_LEN (ISER_QP_MAX_REQ_DTOS * ISCSI_ISER_MAX_CONN) +#define ISER_MAX_CQ_LEN (ISER_MAX_RX_LEN + ISER_MAX_TX_LEN + \ + ISCSI_ISER_MAX_CONN) + +static int iser_cq_poll_limit = 512; + +static void iser_cq_tasklet_fn(unsigned long data); +static void iser_cq_callback(struct ib_cq *cq, void *cq_context); + +static void iser_cq_event_callback(struct ib_event *cause, void *context) +{ + iser_err("got cq event %d \n", cause->event); +} + +static void iser_qp_event_callback(struct ib_event *cause, void *context) +{ + iser_err("got qp event %d\n",cause->event); +} + +static void iser_event_handler(struct ib_event_handler *handler, + struct ib_event *event) +{ + iser_err("async event %d on device %s port %d\n", event->event, + event->device->name, event->element.port_num); +} + +/** + * iser_create_device_ib_res - creates Protection Domain (PD), Completion + * Queue (CQ), DMA Memory Region (DMA MR) with the device associated with + * the adapator. + * + * returns 0 on success, -1 on failure + */ +static int iser_create_device_ib_res(struct iser_device *device) +{ + struct ib_device_attr *dev_attr = &device->dev_attr; + int ret, i; + + ret = ib_query_device(device->ib_device, dev_attr); + if (ret) { + pr_warn("Query device failed for %s\n", device->ib_device->name); + return ret; + } + + /* Assign function handles - based on FMR support */ + if (device->ib_device->alloc_fmr && device->ib_device->dealloc_fmr && + device->ib_device->map_phys_fmr && device->ib_device->unmap_fmr) { + iser_info("FMR supported, using FMR for registration\n"); + device->iser_alloc_rdma_reg_res = iser_create_fmr_pool; + device->iser_free_rdma_reg_res = iser_free_fmr_pool; + device->iser_reg_rdma_mem = iser_reg_rdma_mem_fmr; + device->iser_unreg_rdma_mem = iser_unreg_mem_fmr; + } else + if (dev_attr->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) { + iser_info("FastReg supported, using FastReg for registration\n"); + device->iser_alloc_rdma_reg_res = iser_create_fastreg_pool; + device->iser_free_rdma_reg_res = iser_free_fastreg_pool; + device->iser_reg_rdma_mem = iser_reg_rdma_mem_fastreg; + device->iser_unreg_rdma_mem = iser_unreg_mem_fastreg; + } else { + iser_err("IB device does not support FMRs nor FastRegs, can't register memory\n"); + return -1; + } + + device->comps_used = min(ISER_MAX_CQ, + device->ib_device->num_comp_vectors); + iser_info("using %d CQs, device %s supports %d vectors\n", + device->comps_used, device->ib_device->name, + device->ib_device->num_comp_vectors); + + device->pd = ib_alloc_pd(device->ib_device); + if (IS_ERR(device->pd)) + goto pd_err; + + for (i = 0; i < device->comps_used; i++) { + struct iser_comp *comp = &device->comps[i]; + + comp->device = device; + comp->cq = ib_create_cq(device->ib_device, + iser_cq_callback, + iser_cq_event_callback, + (void *)comp, + ISER_MAX_CQ_LEN, i); + if (IS_ERR(comp->cq)) { + comp->cq = NULL; + goto cq_err; + } + + if (ib_req_notify_cq(comp->cq, IB_CQ_NEXT_COMP)) + goto cq_err; + + tasklet_init(&comp->tasklet, iser_cq_tasklet_fn, + (unsigned long)comp); + } + + device->mr = ib_get_dma_mr(device->pd, IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_WRITE | + IB_ACCESS_REMOTE_READ); + if (IS_ERR(device->mr)) + goto dma_mr_err; + + INIT_IB_EVENT_HANDLER(&device->event_handler, device->ib_device, + iser_event_handler); + if (ib_register_event_handler(&device->event_handler)) + goto handler_err; + + return 0; + +handler_err: + ib_dereg_mr(device->mr); +dma_mr_err: + for (i = 0; i < device->comps_used; i++) + tasklet_kill(&device->comps[i].tasklet); +cq_err: + for (i = 0; i < device->comps_used; i++) { + struct iser_comp *comp = &device->comps[i]; + + if (comp->cq) + ib_destroy_cq(comp->cq); + } + ib_dealloc_pd(device->pd); +pd_err: + iser_err("failed to allocate an IB resource\n"); + return -1; +} + +/** + * iser_free_device_ib_res - destroy/dealloc/dereg the DMA MR, + * CQ and PD created with the device associated with the adapator. + */ +static void iser_free_device_ib_res(struct iser_device *device) +{ + int i; + BUG_ON(device->mr == NULL); + + for (i = 0; i < device->comps_used; i++) { + struct iser_comp *comp = &device->comps[i]; + + tasklet_kill(&comp->tasklet); + ib_destroy_cq(comp->cq); + comp->cq = NULL; + } + + (void)ib_unregister_event_handler(&device->event_handler); + (void)ib_dereg_mr(device->mr); + (void)ib_dealloc_pd(device->pd); + + device->mr = NULL; + device->pd = NULL; +} + +/** + * iser_create_fmr_pool - Creates FMR pool and page_vector + * + * returns 0 on success, or errno code on failure + */ +int iser_create_fmr_pool(struct ib_conn *ib_conn, unsigned cmds_max) +{ + struct iser_device *device = ib_conn->device; + struct ib_fmr_pool_param params; + int ret = -ENOMEM; + + ib_conn->fmr.page_vec = kmalloc(sizeof(*ib_conn->fmr.page_vec) + + (sizeof(u64)*(ISCSI_ISER_SG_TABLESIZE + 1)), + GFP_KERNEL); + if (!ib_conn->fmr.page_vec) + return ret; + + ib_conn->fmr.page_vec->pages = (u64 *)(ib_conn->fmr.page_vec + 1); + + params.page_shift = SHIFT_4K; + /* when the first/last SG element are not start/end * + * page aligned, the map whould be of N+1 pages */ + params.max_pages_per_fmr = ISCSI_ISER_SG_TABLESIZE + 1; + /* make the pool size twice the max number of SCSI commands * + * the ML is expected to queue, watermark for unmap at 50% */ + params.pool_size = cmds_max * 2; + params.dirty_watermark = cmds_max; + params.cache = 0; + params.flush_function = NULL; + params.access = (IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_WRITE | + IB_ACCESS_REMOTE_READ); + + ib_conn->fmr.pool = ib_create_fmr_pool(device->pd, ¶ms); + if (!IS_ERR(ib_conn->fmr.pool)) + return 0; + + /* no FMR => no need for page_vec */ + kfree(ib_conn->fmr.page_vec); + ib_conn->fmr.page_vec = NULL; + + ret = PTR_ERR(ib_conn->fmr.pool); + ib_conn->fmr.pool = NULL; + if (ret != -ENOSYS) { + iser_err("FMR allocation failed, err %d\n", ret); + return ret; + } else { + iser_warn("FMRs are not supported, using unaligned mode\n"); + return 0; + } +} + +/** + * iser_free_fmr_pool - releases the FMR pool and page vec + */ +void iser_free_fmr_pool(struct ib_conn *ib_conn) +{ + iser_info("freeing conn %p fmr pool %p\n", + ib_conn, ib_conn->fmr.pool); + + if (ib_conn->fmr.pool != NULL) + ib_destroy_fmr_pool(ib_conn->fmr.pool); + + ib_conn->fmr.pool = NULL; + + kfree(ib_conn->fmr.page_vec); + ib_conn->fmr.page_vec = NULL; +} + +static int +iser_create_fastreg_desc(struct ib_device *ib_device, struct ib_pd *pd, + bool pi_enable, struct fast_reg_descriptor *desc) +{ + int ret; + + desc->data_frpl = ib_alloc_fast_reg_page_list(ib_device, + ISCSI_ISER_SG_TABLESIZE + 1); + if (IS_ERR(desc->data_frpl)) { + ret = PTR_ERR(desc->data_frpl); + iser_err("Failed to allocate ib_fast_reg_page_list err=%d\n", + ret); + return PTR_ERR(desc->data_frpl); + } + + desc->data_mr = ib_alloc_fast_reg_mr(pd, ISCSI_ISER_SG_TABLESIZE + 1); + if (IS_ERR(desc->data_mr)) { + ret = PTR_ERR(desc->data_mr); + iser_err("Failed to allocate ib_fast_reg_mr err=%d\n", ret); + goto fast_reg_mr_failure; + } + desc->reg_indicators |= ISER_DATA_KEY_VALID; + + if (pi_enable) { + struct ib_mr_init_attr mr_init_attr = {0}; + struct iser_pi_context *pi_ctx = NULL; + + desc->pi_ctx = kzalloc(sizeof(*desc->pi_ctx), GFP_KERNEL); + if (!desc->pi_ctx) { + iser_err("Failed to allocate pi context\n"); + ret = -ENOMEM; + goto pi_ctx_alloc_failure; + } + pi_ctx = desc->pi_ctx; + + pi_ctx->prot_frpl = ib_alloc_fast_reg_page_list(ib_device, + ISCSI_ISER_SG_TABLESIZE); + if (IS_ERR(pi_ctx->prot_frpl)) { + ret = PTR_ERR(pi_ctx->prot_frpl); + iser_err("Failed to allocate prot frpl ret=%d\n", + ret); + goto prot_frpl_failure; + } + + pi_ctx->prot_mr = ib_alloc_fast_reg_mr(pd, + ISCSI_ISER_SG_TABLESIZE + 1); + if (IS_ERR(pi_ctx->prot_mr)) { + ret = PTR_ERR(pi_ctx->prot_mr); + iser_err("Failed to allocate prot frmr ret=%d\n", + ret); + goto prot_mr_failure; + } + desc->reg_indicators |= ISER_PROT_KEY_VALID; + + mr_init_attr.max_reg_descriptors = 2; + mr_init_attr.flags |= IB_MR_SIGNATURE_EN; + pi_ctx->sig_mr = ib_create_mr(pd, &mr_init_attr); + if (IS_ERR(pi_ctx->sig_mr)) { + ret = PTR_ERR(pi_ctx->sig_mr); + iser_err("Failed to allocate signature enabled mr err=%d\n", + ret); + goto sig_mr_failure; + } + desc->reg_indicators |= ISER_SIG_KEY_VALID; + } + desc->reg_indicators &= ~ISER_FASTREG_PROTECTED; + + iser_dbg("Create fr_desc %p page_list %p\n", + desc, desc->data_frpl->page_list); + + return 0; +sig_mr_failure: + ib_dereg_mr(desc->pi_ctx->prot_mr); +prot_mr_failure: + ib_free_fast_reg_page_list(desc->pi_ctx->prot_frpl); +prot_frpl_failure: + kfree(desc->pi_ctx); +pi_ctx_alloc_failure: + ib_dereg_mr(desc->data_mr); +fast_reg_mr_failure: + ib_free_fast_reg_page_list(desc->data_frpl); + + return ret; +} + +/** + * iser_create_fastreg_pool - Creates pool of fast_reg descriptors + * for fast registration work requests. + * returns 0 on success, or errno code on failure + */ +int iser_create_fastreg_pool(struct ib_conn *ib_conn, unsigned cmds_max) +{ + struct iser_device *device = ib_conn->device; + struct fast_reg_descriptor *desc; + int i, ret; + + INIT_LIST_HEAD(&ib_conn->fastreg.pool); + ib_conn->fastreg.pool_size = 0; + for (i = 0; i < cmds_max; i++) { + desc = kzalloc(sizeof(*desc), GFP_KERNEL); + if (!desc) { + iser_err("Failed to allocate a new fast_reg descriptor\n"); + ret = -ENOMEM; + goto err; + } + + ret = iser_create_fastreg_desc(device->ib_device, device->pd, + ib_conn->pi_support, desc); + if (ret) { + iser_err("Failed to create fastreg descriptor err=%d\n", + ret); + kfree(desc); + goto err; + } + + list_add_tail(&desc->list, &ib_conn->fastreg.pool); + ib_conn->fastreg.pool_size++; + } + + return 0; + +err: + iser_free_fastreg_pool(ib_conn); + return ret; +} + +/** + * iser_free_fastreg_pool - releases the pool of fast_reg descriptors + */ +void iser_free_fastreg_pool(struct ib_conn *ib_conn) +{ + struct fast_reg_descriptor *desc, *tmp; + int i = 0; + + if (list_empty(&ib_conn->fastreg.pool)) + return; + + iser_info("freeing conn %p fr pool\n", ib_conn); + + list_for_each_entry_safe(desc, tmp, &ib_conn->fastreg.pool, list) { + list_del(&desc->list); + ib_free_fast_reg_page_list(desc->data_frpl); + ib_dereg_mr(desc->data_mr); + if (desc->pi_ctx) { + ib_free_fast_reg_page_list(desc->pi_ctx->prot_frpl); + ib_dereg_mr(desc->pi_ctx->prot_mr); + ib_destroy_mr(desc->pi_ctx->sig_mr); + kfree(desc->pi_ctx); + } + kfree(desc); + ++i; + } + + if (i < ib_conn->fastreg.pool_size) + iser_warn("pool still has %d regions registered\n", + ib_conn->fastreg.pool_size - i); +} + +/** + * iser_create_ib_conn_res - Queue-Pair (QP) + * + * returns 0 on success, -1 on failure + */ +static int iser_create_ib_conn_res(struct ib_conn *ib_conn) +{ + struct iser_device *device; + struct ib_qp_init_attr init_attr; + int ret = -ENOMEM; + int index, min_index = 0; + + BUG_ON(ib_conn->device == NULL); + + device = ib_conn->device; + + memset(&init_attr, 0, sizeof init_attr); + + mutex_lock(&ig.connlist_mutex); + /* select the CQ with the minimal number of usages */ + for (index = 0; index < device->comps_used; index++) { + if (device->comps[index].active_qps < + device->comps[min_index].active_qps) + min_index = index; + } + ib_conn->comp = &device->comps[min_index]; + ib_conn->comp->active_qps++; + mutex_unlock(&ig.connlist_mutex); + iser_info("cq index %d used for ib_conn %p\n", min_index, ib_conn); + + init_attr.event_handler = iser_qp_event_callback; + init_attr.qp_context = (void *)ib_conn; + init_attr.send_cq = ib_conn->comp->cq; + init_attr.recv_cq = ib_conn->comp->cq; + init_attr.cap.max_recv_wr = ISER_QP_MAX_RECV_DTOS; + init_attr.cap.max_send_sge = 2; + init_attr.cap.max_recv_sge = 1; + init_attr.sq_sig_type = IB_SIGNAL_REQ_WR; + init_attr.qp_type = IB_QPT_RC; + if (ib_conn->pi_support) { + init_attr.cap.max_send_wr = ISER_QP_SIG_MAX_REQ_DTOS + 1; + init_attr.create_flags |= IB_QP_CREATE_SIGNATURE_EN; + } else { + init_attr.cap.max_send_wr = ISER_QP_MAX_REQ_DTOS + 1; + } + + ret = rdma_create_qp(ib_conn->cma_id, device->pd, &init_attr); + if (ret) + goto out_err; + + ib_conn->qp = ib_conn->cma_id->qp; + iser_info("setting conn %p cma_id %p qp %p\n", + ib_conn, ib_conn->cma_id, + ib_conn->cma_id->qp); + return ret; + +out_err: + iser_err("unable to alloc mem or create resource, err %d\n", ret); + return ret; +} + +/** + * based on the resolved device node GUID see if there already allocated + * device for this device. If there's no such, create one. + */ +static +struct iser_device *iser_device_find_by_ib_device(struct rdma_cm_id *cma_id) +{ + struct iser_device *device; + + mutex_lock(&ig.device_list_mutex); + + list_for_each_entry(device, &ig.device_list, ig_list) + /* find if there's a match using the node GUID */ + if (device->ib_device->node_guid == cma_id->device->node_guid) + goto inc_refcnt; + + device = kzalloc(sizeof *device, GFP_KERNEL); + if (device == NULL) + goto out; + + /* assign this device to the device */ + device->ib_device = cma_id->device; + /* init the device and link it into ig device list */ + if (iser_create_device_ib_res(device)) { + kfree(device); + device = NULL; + goto out; + } + list_add(&device->ig_list, &ig.device_list); + +inc_refcnt: + device->refcount++; +out: + mutex_unlock(&ig.device_list_mutex); + return device; +} + +/* if there's no demand for this device, release it */ +static void iser_device_try_release(struct iser_device *device) +{ + mutex_lock(&ig.device_list_mutex); + device->refcount--; + iser_info("device %p refcount %d\n", device, device->refcount); + if (!device->refcount) { + iser_free_device_ib_res(device); + list_del(&device->ig_list); + kfree(device); + } + mutex_unlock(&ig.device_list_mutex); +} + +/** + * Called with state mutex held + **/ +static int iser_conn_state_comp_exch(struct iser_conn *iser_conn, + enum iser_conn_state comp, + enum iser_conn_state exch) +{ + int ret; + + ret = (iser_conn->state == comp); + if (ret) + iser_conn->state = exch; + + return ret; +} + +void iser_release_work(struct work_struct *work) +{ + struct iser_conn *iser_conn; + + iser_conn = container_of(work, struct iser_conn, release_work); + + /* Wait for conn_stop to complete */ + wait_for_completion(&iser_conn->stop_completion); + /* Wait for IB resouces cleanup to complete */ + wait_for_completion(&iser_conn->ib_completion); + + mutex_lock(&iser_conn->state_mutex); + iser_conn->state = ISER_CONN_DOWN; + mutex_unlock(&iser_conn->state_mutex); + + iser_conn_release(iser_conn); +} + +/** + * iser_free_ib_conn_res - release IB related resources + * @iser_conn: iser connection struct + * @destroy_device: indicator if we need to try to release + * the iser device (only iscsi shutdown and DEVICE_REMOVAL + * will use this. + * + * This routine is called with the iser state mutex held + * so the cm_id removal is out of here. It is Safe to + * be invoked multiple times. + */ +static void iser_free_ib_conn_res(struct iser_conn *iser_conn, + bool destroy_device) +{ + struct ib_conn *ib_conn = &iser_conn->ib_conn; + struct iser_device *device = ib_conn->device; + + iser_info("freeing conn %p cma_id %p qp %p\n", + iser_conn, ib_conn->cma_id, ib_conn->qp); + + iser_free_rx_descriptors(iser_conn); + + if (ib_conn->qp != NULL) { + ib_conn->comp->active_qps--; + rdma_destroy_qp(ib_conn->cma_id); + ib_conn->qp = NULL; + } + + if (destroy_device && device != NULL) { + iser_device_try_release(device); + ib_conn->device = NULL; + } +} + +/** + * Frees all conn objects and deallocs conn descriptor + */ +void iser_conn_release(struct iser_conn *iser_conn) +{ + struct ib_conn *ib_conn = &iser_conn->ib_conn; + + mutex_lock(&ig.connlist_mutex); + list_del(&iser_conn->conn_list); + mutex_unlock(&ig.connlist_mutex); + + mutex_lock(&iser_conn->state_mutex); + if (iser_conn->state != ISER_CONN_DOWN) + iser_warn("iser conn %p state %d, expected state down.\n", + iser_conn, iser_conn->state); + /* + * In case we never got to bind stage, we still need to + * release IB resources (which is safe to call more than once). + */ + iser_free_ib_conn_res(iser_conn, true); + mutex_unlock(&iser_conn->state_mutex); + + if (ib_conn->cma_id != NULL) { + rdma_destroy_id(ib_conn->cma_id); + ib_conn->cma_id = NULL; + } + + kfree(iser_conn); +} + +/** + * triggers start of the disconnect procedures and wait for them to be done + * Called with state mutex held + */ +int iser_conn_terminate(struct iser_conn *iser_conn) +{ + struct ib_conn *ib_conn = &iser_conn->ib_conn; + struct ib_send_wr *bad_wr; + int err = 0; + + /* terminate the iser conn only if the conn state is UP */ + if (!iser_conn_state_comp_exch(iser_conn, ISER_CONN_UP, + ISER_CONN_TERMINATING)) + return 0; + + iser_info("iser_conn %p state %d\n", iser_conn, iser_conn->state); + + /* suspend queuing of new iscsi commands */ + if (iser_conn->iscsi_conn) + iscsi_suspend_queue(iser_conn->iscsi_conn); + + /* + * In case we didn't already clean up the cma_id (peer initiated + * a disconnection), we need to Cause the CMA to change the QP + * state to ERROR. + */ + if (ib_conn->cma_id) { + err = rdma_disconnect(ib_conn->cma_id); + if (err) + iser_err("Failed to disconnect, conn: 0x%p err %d\n", + iser_conn, err); + + /* post an indication that all flush errors were consumed */ + err = ib_post_send(ib_conn->qp, &ib_conn->beacon, &bad_wr); + if (err) + iser_err("conn %p failed to post beacon", ib_conn); + + wait_for_completion(&ib_conn->flush_comp); + } + + return 1; +} + +/** + * Called with state mutex held + **/ +static void iser_connect_error(struct rdma_cm_id *cma_id) +{ + struct iser_conn *iser_conn; + + iser_conn = (struct iser_conn *)cma_id->context; + iser_conn->state = ISER_CONN_DOWN; +} + +/** + * Called with state mutex held + **/ +static void iser_addr_handler(struct rdma_cm_id *cma_id) +{ + struct iser_device *device; + struct iser_conn *iser_conn; + struct ib_conn *ib_conn; + int ret; + + iser_conn = (struct iser_conn *)cma_id->context; + if (iser_conn->state != ISER_CONN_PENDING) + /* bailout */ + return; + + ib_conn = &iser_conn->ib_conn; + device = iser_device_find_by_ib_device(cma_id); + if (!device) { + iser_err("device lookup/creation failed\n"); + iser_connect_error(cma_id); + return; + } + + ib_conn->device = device; + + /* connection T10-PI support */ + if (iser_pi_enable) { + if (!(device->dev_attr.device_cap_flags & + IB_DEVICE_SIGNATURE_HANDOVER)) { + iser_warn("T10-PI requested but not supported on %s, " + "continue without T10-PI\n", + ib_conn->device->ib_device->name); + ib_conn->pi_support = false; + } else { + ib_conn->pi_support = true; + } + } + + ret = rdma_resolve_route(cma_id, 1000); + if (ret) { + iser_err("resolve route failed: %d\n", ret); + iser_connect_error(cma_id); + return; + } +} + +/** + * Called with state mutex held + **/ +static void iser_route_handler(struct rdma_cm_id *cma_id) +{ + struct rdma_conn_param conn_param; + int ret; + struct iser_cm_hdr req_hdr; + struct iser_conn *iser_conn = (struct iser_conn *)cma_id->context; + struct ib_conn *ib_conn = &iser_conn->ib_conn; + struct iser_device *device = ib_conn->device; + + if (iser_conn->state != ISER_CONN_PENDING) + /* bailout */ + return; + + ret = iser_create_ib_conn_res(ib_conn); + if (ret) + goto failure; + + memset(&conn_param, 0, sizeof conn_param); + conn_param.responder_resources = device->dev_attr.max_qp_rd_atom; + conn_param.initiator_depth = 1; + conn_param.retry_count = 7; + conn_param.rnr_retry_count = 6; + + memset(&req_hdr, 0, sizeof(req_hdr)); + req_hdr.flags = (ISER_ZBVA_NOT_SUPPORTED | + ISER_SEND_W_INV_NOT_SUPPORTED); + conn_param.private_data = (void *)&req_hdr; + conn_param.private_data_len = sizeof(struct iser_cm_hdr); + + ret = rdma_connect(cma_id, &conn_param); + if (ret) { + iser_err("failure connecting: %d\n", ret); + goto failure; + } + + return; +failure: + iser_connect_error(cma_id); +} + +static void iser_connected_handler(struct rdma_cm_id *cma_id) +{ + struct iser_conn *iser_conn; + struct ib_qp_attr attr; + struct ib_qp_init_attr init_attr; + + iser_conn = (struct iser_conn *)cma_id->context; + if (iser_conn->state != ISER_CONN_PENDING) + /* bailout */ + return; + + (void)ib_query_qp(cma_id->qp, &attr, ~0, &init_attr); + iser_info("remote qpn:%x my qpn:%x\n", attr.dest_qp_num, cma_id->qp->qp_num); + + iser_conn->state = ISER_CONN_UP; + complete(&iser_conn->up_completion); +} + +static void iser_disconnected_handler(struct rdma_cm_id *cma_id) +{ + struct iser_conn *iser_conn = (struct iser_conn *)cma_id->context; + + if (iser_conn_terminate(iser_conn)) { + if (iser_conn->iscsi_conn) + iscsi_conn_failure(iser_conn->iscsi_conn, + ISCSI_ERR_CONN_FAILED); + else + iser_err("iscsi_iser connection isn't bound\n"); + } +} + +static void iser_cleanup_handler(struct rdma_cm_id *cma_id, + bool destroy_device) +{ + struct iser_conn *iser_conn = (struct iser_conn *)cma_id->context; + + /* + * We are not guaranteed that we visited disconnected_handler + * by now, call it here to be safe that we handle CM drep + * and flush errors. + */ + iser_disconnected_handler(cma_id); + iser_free_ib_conn_res(iser_conn, destroy_device); + complete(&iser_conn->ib_completion); +}; + +static int iser_cma_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *event) +{ + struct iser_conn *iser_conn; + int ret = 0; + + iser_conn = (struct iser_conn *)cma_id->context; + iser_info("event %d status %d conn %p id %p\n", + event->event, event->status, cma_id->context, cma_id); + + mutex_lock(&iser_conn->state_mutex); + switch (event->event) { + case RDMA_CM_EVENT_ADDR_RESOLVED: + iser_addr_handler(cma_id); + break; + case RDMA_CM_EVENT_ROUTE_RESOLVED: + iser_route_handler(cma_id); + break; + case RDMA_CM_EVENT_ESTABLISHED: + iser_connected_handler(cma_id); + break; + case RDMA_CM_EVENT_ADDR_ERROR: + case RDMA_CM_EVENT_ROUTE_ERROR: + case RDMA_CM_EVENT_CONNECT_ERROR: + case RDMA_CM_EVENT_UNREACHABLE: + case RDMA_CM_EVENT_REJECTED: + iser_connect_error(cma_id); + break; + case RDMA_CM_EVENT_DISCONNECTED: + case RDMA_CM_EVENT_ADDR_CHANGE: + iser_disconnected_handler(cma_id); + break; + case RDMA_CM_EVENT_DEVICE_REMOVAL: + /* + * we *must* destroy the device as we cannot rely + * on iscsid to be around to initiate error handling. + * also implicitly destroy the cma_id. + */ + iser_cleanup_handler(cma_id, true); + iser_conn->ib_conn.cma_id = NULL; + ret = 1; + break; + case RDMA_CM_EVENT_TIMEWAIT_EXIT: + iser_cleanup_handler(cma_id, false); + break; + default: + iser_err("Unexpected RDMA CM event (%d)\n", event->event); + break; + } + mutex_unlock(&iser_conn->state_mutex); + + return ret; +} + +void iser_conn_init(struct iser_conn *iser_conn) +{ + iser_conn->state = ISER_CONN_INIT; + iser_conn->ib_conn.post_recv_buf_count = 0; + init_completion(&iser_conn->ib_conn.flush_comp); + init_completion(&iser_conn->stop_completion); + init_completion(&iser_conn->ib_completion); + init_completion(&iser_conn->up_completion); + INIT_LIST_HEAD(&iser_conn->conn_list); + spin_lock_init(&iser_conn->ib_conn.lock); + mutex_init(&iser_conn->state_mutex); +} + + /** + * starts the process of connecting to the target + * sleeps until the connection is established or rejected + */ +int iser_connect(struct iser_conn *iser_conn, + struct sockaddr *src_addr, + struct sockaddr *dst_addr, + int non_blocking) +{ + struct ib_conn *ib_conn = &iser_conn->ib_conn; + int err = 0; + + mutex_lock(&iser_conn->state_mutex); + + sprintf(iser_conn->name, "%pISp", dst_addr); + + iser_info("connecting to: %s\n", iser_conn->name); + + /* the device is known only --after-- address resolution */ + ib_conn->device = NULL; + + iser_conn->state = ISER_CONN_PENDING; + + ib_conn->beacon.wr_id = ISER_BEACON_WRID; + ib_conn->beacon.opcode = IB_WR_SEND; + + ib_conn->cma_id = rdma_create_id(iser_cma_handler, + (void *)iser_conn, + RDMA_PS_TCP, IB_QPT_RC); + if (IS_ERR(ib_conn->cma_id)) { + err = PTR_ERR(ib_conn->cma_id); + iser_err("rdma_create_id failed: %d\n", err); + goto id_failure; + } + + err = rdma_resolve_addr(ib_conn->cma_id, src_addr, dst_addr, 1000); + if (err) { + iser_err("rdma_resolve_addr failed: %d\n", err); + goto addr_failure; + } + + if (!non_blocking) { + wait_for_completion_interruptible(&iser_conn->up_completion); + + if (iser_conn->state != ISER_CONN_UP) { + err = -EIO; + goto connect_failure; + } + } + mutex_unlock(&iser_conn->state_mutex); + + mutex_lock(&ig.connlist_mutex); + list_add(&iser_conn->conn_list, &ig.connlist); + mutex_unlock(&ig.connlist_mutex); + return 0; + +id_failure: + ib_conn->cma_id = NULL; +addr_failure: + iser_conn->state = ISER_CONN_DOWN; +connect_failure: + mutex_unlock(&iser_conn->state_mutex); + iser_conn_release(iser_conn); + return err; +} + +/** + * iser_reg_page_vec - Register physical memory + * + * returns: 0 on success, errno code on failure + */ +int iser_reg_page_vec(struct ib_conn *ib_conn, + struct iser_page_vec *page_vec, + struct iser_mem_reg *mem_reg) +{ + struct ib_pool_fmr *mem; + u64 io_addr; + u64 *page_list; + int status; + + page_list = page_vec->pages; + io_addr = page_list[0]; + + mem = ib_fmr_pool_map_phys(ib_conn->fmr.pool, + page_list, + page_vec->length, + io_addr); + + if (IS_ERR(mem)) { + status = (int)PTR_ERR(mem); + iser_err("ib_fmr_pool_map_phys failed: %d\n", status); + return status; + } + + mem_reg->lkey = mem->fmr->lkey; + mem_reg->rkey = mem->fmr->rkey; + mem_reg->len = page_vec->length * SIZE_4K; + mem_reg->va = io_addr; + mem_reg->is_mr = 1; + mem_reg->mem_h = (void *)mem; + + mem_reg->va += page_vec->offset; + mem_reg->len = page_vec->data_size; + + iser_dbg("PHYSICAL Mem.register, [PHYS p_array: 0x%p, sz: %d, " + "entry[0]: (0x%08lx,%ld)] -> " + "[lkey: 0x%08X mem_h: 0x%p va: 0x%08lX sz: %ld]\n", + page_vec, page_vec->length, + (unsigned long)page_vec->pages[0], + (unsigned long)page_vec->data_size, + (unsigned int)mem_reg->lkey, mem_reg->mem_h, + (unsigned long)mem_reg->va, (unsigned long)mem_reg->len); + return 0; +} + +/** + * Unregister (previosuly registered using FMR) memory. + * If memory is non-FMR does nothing. + */ +void iser_unreg_mem_fmr(struct iscsi_iser_task *iser_task, + enum iser_data_dir cmd_dir) +{ + struct iser_mem_reg *reg = &iser_task->rdma_regd[cmd_dir].reg; + int ret; + + if (!reg->is_mr) + return; + + iser_dbg("PHYSICAL Mem.Unregister mem_h %p\n",reg->mem_h); + + ret = ib_fmr_pool_unmap((struct ib_pool_fmr *)reg->mem_h); + if (ret) + iser_err("ib_fmr_pool_unmap failed %d\n", ret); + + reg->mem_h = NULL; +} + +void iser_unreg_mem_fastreg(struct iscsi_iser_task *iser_task, + enum iser_data_dir cmd_dir) +{ + struct iser_mem_reg *reg = &iser_task->rdma_regd[cmd_dir].reg; + struct iser_conn *iser_conn = iser_task->iser_conn; + struct ib_conn *ib_conn = &iser_conn->ib_conn; + struct fast_reg_descriptor *desc = reg->mem_h; + + if (!reg->is_mr) + return; + + reg->mem_h = NULL; + reg->is_mr = 0; + spin_lock_bh(&ib_conn->lock); + list_add_tail(&desc->list, &ib_conn->fastreg.pool); + spin_unlock_bh(&ib_conn->lock); +} + +int iser_post_recvl(struct iser_conn *iser_conn) +{ + struct ib_recv_wr rx_wr, *rx_wr_failed; + struct ib_conn *ib_conn = &iser_conn->ib_conn; + struct ib_sge sge; + int ib_ret; + + sge.addr = iser_conn->login_resp_dma; + sge.length = ISER_RX_LOGIN_SIZE; + sge.lkey = ib_conn->device->mr->lkey; + + rx_wr.wr_id = (unsigned long)iser_conn->login_resp_buf; + rx_wr.sg_list = &sge; + rx_wr.num_sge = 1; + rx_wr.next = NULL; + + ib_conn->post_recv_buf_count++; + ib_ret = ib_post_recv(ib_conn->qp, &rx_wr, &rx_wr_failed); + if (ib_ret) { + iser_err("ib_post_recv failed ret=%d\n", ib_ret); + ib_conn->post_recv_buf_count--; + } + return ib_ret; +} + +int iser_post_recvm(struct iser_conn *iser_conn, int count) +{ + struct ib_recv_wr *rx_wr, *rx_wr_failed; + int i, ib_ret; + struct ib_conn *ib_conn = &iser_conn->ib_conn; + unsigned int my_rx_head = iser_conn->rx_desc_head; + struct iser_rx_desc *rx_desc; + + for (rx_wr = ib_conn->rx_wr, i = 0; i < count; i++, rx_wr++) { + rx_desc = &iser_conn->rx_descs[my_rx_head]; + rx_wr->wr_id = (unsigned long)rx_desc; + rx_wr->sg_list = &rx_desc->rx_sg; + rx_wr->num_sge = 1; + rx_wr->next = rx_wr + 1; + my_rx_head = (my_rx_head + 1) & iser_conn->qp_max_recv_dtos_mask; + } + + rx_wr--; + rx_wr->next = NULL; /* mark end of work requests list */ + + ib_conn->post_recv_buf_count += count; + ib_ret = ib_post_recv(ib_conn->qp, ib_conn->rx_wr, &rx_wr_failed); + if (ib_ret) { + iser_err("ib_post_recv failed ret=%d\n", ib_ret); + ib_conn->post_recv_buf_count -= count; + } else + iser_conn->rx_desc_head = my_rx_head; + return ib_ret; +} + + +/** + * iser_start_send - Initiate a Send DTO operation + * + * returns 0 on success, -1 on failure + */ +int iser_post_send(struct ib_conn *ib_conn, struct iser_tx_desc *tx_desc, + bool signal) +{ + int ib_ret; + struct ib_send_wr send_wr, *send_wr_failed; + + ib_dma_sync_single_for_device(ib_conn->device->ib_device, + tx_desc->dma_addr, ISER_HEADERS_LEN, + DMA_TO_DEVICE); + + send_wr.next = NULL; + send_wr.wr_id = (unsigned long)tx_desc; + send_wr.sg_list = tx_desc->tx_sg; + send_wr.num_sge = tx_desc->num_sge; + send_wr.opcode = IB_WR_SEND; + send_wr.send_flags = signal ? IB_SEND_SIGNALED : 0; + + ib_ret = ib_post_send(ib_conn->qp, &send_wr, &send_wr_failed); + if (ib_ret) + iser_err("ib_post_send failed, ret:%d\n", ib_ret); + + return ib_ret; +} + +/** + * is_iser_tx_desc - Indicate if the completion wr_id + * is a TX descriptor or not. + * @iser_conn: iser connection + * @wr_id: completion WR identifier + * + * Since we cannot rely on wc opcode in FLUSH errors + * we must work around it by checking if the wr_id address + * falls in the iser connection rx_descs buffer. If so + * it is an RX descriptor, otherwize it is a TX. + */ +static inline bool +is_iser_tx_desc(struct iser_conn *iser_conn, void *wr_id) +{ + void *start = iser_conn->rx_descs; + int len = iser_conn->num_rx_descs * sizeof(*iser_conn->rx_descs); + + if (wr_id >= start && wr_id < start + len) + return false; + + return true; +} + +/** + * iser_handle_comp_error() - Handle error completion + * @ib_conn: connection RDMA resources + * @wc: work completion + * + * Notes: We may handle a FLUSH error completion and in this case + * we only cleanup in case TX type was DATAOUT. For non-FLUSH + * error completion we should also notify iscsi layer that + * connection is failed (in case we passed bind stage). + */ +static void +iser_handle_comp_error(struct ib_conn *ib_conn, + struct ib_wc *wc) +{ + struct iser_conn *iser_conn = container_of(ib_conn, struct iser_conn, + ib_conn); + + if (wc->status != IB_WC_WR_FLUSH_ERR) + if (iser_conn->iscsi_conn) + iscsi_conn_failure(iser_conn->iscsi_conn, + ISCSI_ERR_CONN_FAILED); + + if (is_iser_tx_desc(iser_conn, (void *)wc->wr_id)) { + struct iser_tx_desc *desc = (struct iser_tx_desc *)wc->wr_id; + + if (desc->type == ISCSI_TX_DATAOUT) + kmem_cache_free(ig.desc_cache, desc); + } else { + ib_conn->post_recv_buf_count--; + } +} + +/** + * iser_handle_wc - handle a single work completion + * @wc: work completion + * + * Soft-IRQ context, work completion can be either + * SEND or RECV, and can turn out successful or + * with error (or flush error). + */ +static void iser_handle_wc(struct ib_wc *wc) +{ + struct ib_conn *ib_conn; + struct iser_tx_desc *tx_desc; + struct iser_rx_desc *rx_desc; + + ib_conn = wc->qp->qp_context; + if (wc->status == IB_WC_SUCCESS) { + if (wc->opcode == IB_WC_RECV) { + rx_desc = (struct iser_rx_desc *)wc->wr_id; + iser_rcv_completion(rx_desc, wc->byte_len, + ib_conn); + } else + if (wc->opcode == IB_WC_SEND) { + tx_desc = (struct iser_tx_desc *)wc->wr_id; + iser_snd_completion(tx_desc, ib_conn); + } else { + iser_err("Unknown wc opcode %d\n", wc->opcode); + } + } else { + if (wc->status != IB_WC_WR_FLUSH_ERR) + iser_err("wr id %llx status %d vend_err %x\n", + wc->wr_id, wc->status, wc->vendor_err); + else + iser_dbg("flush error: wr id %llx\n", wc->wr_id); + + if (wc->wr_id != ISER_FASTREG_LI_WRID && + wc->wr_id != ISER_BEACON_WRID) + iser_handle_comp_error(ib_conn, wc); + + /* complete in case all flush errors were consumed */ + if (wc->wr_id == ISER_BEACON_WRID) + complete(&ib_conn->flush_comp); + } +} + +/** + * iser_cq_tasklet_fn - iSER completion polling loop + * @data: iSER completion context + * + * Soft-IRQ context, polling connection CQ until + * either CQ was empty or we exausted polling budget + */ +static void iser_cq_tasklet_fn(unsigned long data) +{ + struct iser_comp *comp = (struct iser_comp *)data; + struct ib_cq *cq = comp->cq; + struct ib_wc *const wcs = comp->wcs; + int i, n, completed = 0; + + while ((n = ib_poll_cq(cq, ARRAY_SIZE(comp->wcs), wcs)) > 0) { + for (i = 0; i < n; i++) + iser_handle_wc(&wcs[i]); + + completed += n; + if (completed >= iser_cq_poll_limit) + break; + } + + /* + * It is assumed here that arming CQ only once its empty + * would not cause interrupts to be missed. + */ + ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); + + iser_dbg("got %d completions\n", completed); +} + +static void iser_cq_callback(struct ib_cq *cq, void *cq_context) +{ + struct iser_comp *comp = cq_context; + + tasklet_schedule(&comp->tasklet); +} + +u8 iser_check_task_pi_status(struct iscsi_iser_task *iser_task, + enum iser_data_dir cmd_dir, sector_t *sector) +{ + struct iser_mem_reg *reg = &iser_task->rdma_regd[cmd_dir].reg; + struct fast_reg_descriptor *desc = reg->mem_h; + unsigned long sector_size = iser_task->sc->device->sector_size; + struct ib_mr_status mr_status; + int ret; + + if (desc && desc->reg_indicators & ISER_FASTREG_PROTECTED) { + desc->reg_indicators &= ~ISER_FASTREG_PROTECTED; + ret = ib_check_mr_status(desc->pi_ctx->sig_mr, + IB_MR_CHECK_SIG_STATUS, &mr_status); + if (ret) { + pr_err("ib_check_mr_status failed, ret %d\n", ret); + goto err; + } + + if (mr_status.fail_status & IB_MR_CHECK_SIG_STATUS) { + sector_t sector_off = mr_status.sig_err.sig_err_offset; + + do_div(sector_off, sector_size + 8); + *sector = scsi_get_lba(iser_task->sc) + sector_off; + + pr_err("PI error found type %d at sector %llx " + "expected %x vs actual %x\n", + mr_status.sig_err.err_type, + (unsigned long long)*sector, + mr_status.sig_err.expected, + mr_status.sig_err.actual); + + switch (mr_status.sig_err.err_type) { + case IB_SIG_BAD_GUARD: + return 0x1; + case IB_SIG_BAD_REFTAG: + return 0x3; + case IB_SIG_BAD_APPTAG: + return 0x2; + } + } + } + + return 0; +err: + /* Not alot we can do here, return ambiguous guard error */ + return 0x1; +} diff --git a/drivers/infiniband/ulp/isert/Kconfig b/drivers/infiniband/ulp/isert/Kconfig new file mode 100644 index 0000000..02f9759 --- /dev/null +++ b/drivers/infiniband/ulp/isert/Kconfig @@ -0,0 +1,5 @@ +config INFINIBAND_ISERT + tristate "iSCSI Extensions for RDMA (iSER) target support" + depends on INET && INFINIBAND_ADDR_TRANS && TARGET_CORE && ISCSI_TARGET + ---help--- + Support for iSCSI Extensions for RDMA (iSER) Target on Infiniband fabrics. diff --git a/drivers/infiniband/ulp/isert/Makefile b/drivers/infiniband/ulp/isert/Makefile new file mode 100644 index 0000000..c8bf242 --- /dev/null +++ b/drivers/infiniband/ulp/isert/Makefile @@ -0,0 +1,2 @@ +ccflags-y := -Idrivers/target -Idrivers/target/iscsi +obj-$(CONFIG_INFINIBAND_ISERT) += ib_isert.o diff --git a/drivers/infiniband/ulp/isert/ib_isert.c b/drivers/infiniband/ulp/isert/ib_isert.c new file mode 100644 index 0000000..10641b7 --- /dev/null +++ b/drivers/infiniband/ulp/isert/ib_isert.c @@ -0,0 +1,3313 @@ +/******************************************************************************* + * This file contains iSCSI extentions for RDMA (iSER) Verbs + * + * (c) Copyright 2013 Datera, Inc. + * + * Nicholas A. Bellinger + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + ****************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "isert_proto.h" +#include "ib_isert.h" + +#define ISERT_MAX_CONN 8 +#define ISER_MAX_RX_CQ_LEN (ISERT_QP_MAX_RECV_DTOS * ISERT_MAX_CONN) +#define ISER_MAX_TX_CQ_LEN (ISERT_QP_MAX_REQ_DTOS * ISERT_MAX_CONN) + +static DEFINE_MUTEX(device_list_mutex); +static LIST_HEAD(device_list); +static struct workqueue_struct *isert_rx_wq; +static struct workqueue_struct *isert_comp_wq; + +static void +isert_unmap_cmd(struct isert_cmd *isert_cmd, struct isert_conn *isert_conn); +static int +isert_map_rdma(struct iscsi_conn *conn, struct iscsi_cmd *cmd, + struct isert_rdma_wr *wr); +static void +isert_unreg_rdma(struct isert_cmd *isert_cmd, struct isert_conn *isert_conn); +static int +isert_reg_rdma(struct iscsi_conn *conn, struct iscsi_cmd *cmd, + struct isert_rdma_wr *wr); +static int +isert_put_response(struct iscsi_conn *conn, struct iscsi_cmd *cmd); + +static void +isert_qp_event_callback(struct ib_event *e, void *context) +{ + struct isert_conn *isert_conn = (struct isert_conn *)context; + + pr_err("isert_qp_event_callback event: %d\n", e->event); + switch (e->event) { + case IB_EVENT_COMM_EST: + rdma_notify(isert_conn->conn_cm_id, IB_EVENT_COMM_EST); + break; + case IB_EVENT_QP_LAST_WQE_REACHED: + pr_warn("Reached TX IB_EVENT_QP_LAST_WQE_REACHED:\n"); + break; + default: + break; + } +} + +static int +isert_query_device(struct ib_device *ib_dev, struct ib_device_attr *devattr) +{ + int ret; + + ret = ib_query_device(ib_dev, devattr); + if (ret) { + pr_err("ib_query_device() failed: %d\n", ret); + return ret; + } + pr_debug("devattr->max_sge: %d\n", devattr->max_sge); + pr_debug("devattr->max_sge_rd: %d\n", devattr->max_sge_rd); + + return 0; +} + +static int +isert_conn_setup_qp(struct isert_conn *isert_conn, struct rdma_cm_id *cma_id, + u8 protection) +{ + struct isert_device *device = isert_conn->conn_device; + struct ib_qp_init_attr attr; + int ret, index, min_index = 0; + + mutex_lock(&device_list_mutex); + for (index = 0; index < device->cqs_used; index++) + if (device->cq_active_qps[index] < + device->cq_active_qps[min_index]) + min_index = index; + device->cq_active_qps[min_index]++; + pr_debug("isert_conn_setup_qp: Using min_index: %d\n", min_index); + mutex_unlock(&device_list_mutex); + + memset(&attr, 0, sizeof(struct ib_qp_init_attr)); + attr.event_handler = isert_qp_event_callback; + attr.qp_context = isert_conn; + attr.send_cq = device->dev_tx_cq[min_index]; + attr.recv_cq = device->dev_rx_cq[min_index]; + attr.cap.max_send_wr = ISERT_QP_MAX_REQ_DTOS; + attr.cap.max_recv_wr = ISERT_QP_MAX_RECV_DTOS; + /* + * FIXME: Use devattr.max_sge - 2 for max_send_sge as + * work-around for RDMA_READs with ConnectX-2. + * + * Also, still make sure to have at least two SGEs for + * outgoing control PDU responses. + */ + attr.cap.max_send_sge = max(2, device->dev_attr.max_sge - 2); + isert_conn->max_sge = attr.cap.max_send_sge; + + attr.cap.max_recv_sge = 1; + attr.sq_sig_type = IB_SIGNAL_REQ_WR; + attr.qp_type = IB_QPT_RC; + if (protection) + attr.create_flags |= IB_QP_CREATE_SIGNATURE_EN; + + pr_debug("isert_conn_setup_qp cma_id->device: %p\n", + cma_id->device); + pr_debug("isert_conn_setup_qp conn_pd->device: %p\n", + isert_conn->conn_pd->device); + + ret = rdma_create_qp(cma_id, isert_conn->conn_pd, &attr); + if (ret) { + pr_err("rdma_create_qp failed for cma_id %d\n", ret); + return ret; + } + isert_conn->conn_qp = cma_id->qp; + pr_debug("rdma_create_qp() returned success >>>>>>>>>>>>>>>>>>>>>>>>>.\n"); + + return 0; +} + +static void +isert_cq_event_callback(struct ib_event *e, void *context) +{ + pr_debug("isert_cq_event_callback event: %d\n", e->event); +} + +static int +isert_alloc_rx_descriptors(struct isert_conn *isert_conn) +{ + struct ib_device *ib_dev = isert_conn->conn_cm_id->device; + struct iser_rx_desc *rx_desc; + struct ib_sge *rx_sg; + u64 dma_addr; + int i, j; + + isert_conn->conn_rx_descs = kzalloc(ISERT_QP_MAX_RECV_DTOS * + sizeof(struct iser_rx_desc), GFP_KERNEL); + if (!isert_conn->conn_rx_descs) + goto fail; + + rx_desc = isert_conn->conn_rx_descs; + + for (i = 0; i < ISERT_QP_MAX_RECV_DTOS; i++, rx_desc++) { + dma_addr = ib_dma_map_single(ib_dev, (void *)rx_desc, + ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE); + if (ib_dma_mapping_error(ib_dev, dma_addr)) + goto dma_map_fail; + + rx_desc->dma_addr = dma_addr; + + rx_sg = &rx_desc->rx_sg; + rx_sg->addr = rx_desc->dma_addr; + rx_sg->length = ISER_RX_PAYLOAD_SIZE; + rx_sg->lkey = isert_conn->conn_mr->lkey; + } + + isert_conn->conn_rx_desc_head = 0; + return 0; + +dma_map_fail: + rx_desc = isert_conn->conn_rx_descs; + for (j = 0; j < i; j++, rx_desc++) { + ib_dma_unmap_single(ib_dev, rx_desc->dma_addr, + ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE); + } + kfree(isert_conn->conn_rx_descs); + isert_conn->conn_rx_descs = NULL; +fail: + return -ENOMEM; +} + +static void +isert_free_rx_descriptors(struct isert_conn *isert_conn) +{ + struct ib_device *ib_dev = isert_conn->conn_cm_id->device; + struct iser_rx_desc *rx_desc; + int i; + + if (!isert_conn->conn_rx_descs) + return; + + rx_desc = isert_conn->conn_rx_descs; + for (i = 0; i < ISERT_QP_MAX_RECV_DTOS; i++, rx_desc++) { + ib_dma_unmap_single(ib_dev, rx_desc->dma_addr, + ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE); + } + + kfree(isert_conn->conn_rx_descs); + isert_conn->conn_rx_descs = NULL; +} + +static void isert_cq_tx_work(struct work_struct *); +static void isert_cq_tx_callback(struct ib_cq *, void *); +static void isert_cq_rx_work(struct work_struct *); +static void isert_cq_rx_callback(struct ib_cq *, void *); + +static int +isert_create_device_ib_res(struct isert_device *device) +{ + struct ib_device *ib_dev = device->ib_device; + struct isert_cq_desc *cq_desc; + struct ib_device_attr *dev_attr; + int ret = 0, i, j; + int max_rx_cqe, max_tx_cqe; + + dev_attr = &device->dev_attr; + ret = isert_query_device(ib_dev, dev_attr); + if (ret) + return ret; + + max_rx_cqe = min(ISER_MAX_RX_CQ_LEN, dev_attr->max_cqe); + max_tx_cqe = min(ISER_MAX_TX_CQ_LEN, dev_attr->max_cqe); + + /* asign function handlers */ + if (dev_attr->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS && + dev_attr->device_cap_flags & IB_DEVICE_SIGNATURE_HANDOVER) { + device->use_fastreg = 1; + device->reg_rdma_mem = isert_reg_rdma; + device->unreg_rdma_mem = isert_unreg_rdma; + } else { + device->use_fastreg = 0; + device->reg_rdma_mem = isert_map_rdma; + device->unreg_rdma_mem = isert_unmap_cmd; + } + + /* Check signature cap */ + device->pi_capable = dev_attr->device_cap_flags & + IB_DEVICE_SIGNATURE_HANDOVER ? true : false; + + device->cqs_used = min_t(int, num_online_cpus(), + device->ib_device->num_comp_vectors); + device->cqs_used = min(ISERT_MAX_CQ, device->cqs_used); + pr_debug("Using %d CQs, device %s supports %d vectors support " + "Fast registration %d pi_capable %d\n", + device->cqs_used, device->ib_device->name, + device->ib_device->num_comp_vectors, device->use_fastreg, + device->pi_capable); + device->cq_desc = kzalloc(sizeof(struct isert_cq_desc) * + device->cqs_used, GFP_KERNEL); + if (!device->cq_desc) { + pr_err("Unable to allocate device->cq_desc\n"); + return -ENOMEM; + } + cq_desc = device->cq_desc; + + for (i = 0; i < device->cqs_used; i++) { + cq_desc[i].device = device; + cq_desc[i].cq_index = i; + + INIT_WORK(&cq_desc[i].cq_rx_work, isert_cq_rx_work); + device->dev_rx_cq[i] = ib_create_cq(device->ib_device, + isert_cq_rx_callback, + isert_cq_event_callback, + (void *)&cq_desc[i], + max_rx_cqe, i); + if (IS_ERR(device->dev_rx_cq[i])) { + ret = PTR_ERR(device->dev_rx_cq[i]); + device->dev_rx_cq[i] = NULL; + goto out_cq; + } + + INIT_WORK(&cq_desc[i].cq_tx_work, isert_cq_tx_work); + device->dev_tx_cq[i] = ib_create_cq(device->ib_device, + isert_cq_tx_callback, + isert_cq_event_callback, + (void *)&cq_desc[i], + max_tx_cqe, i); + if (IS_ERR(device->dev_tx_cq[i])) { + ret = PTR_ERR(device->dev_tx_cq[i]); + device->dev_tx_cq[i] = NULL; + goto out_cq; + } + + ret = ib_req_notify_cq(device->dev_rx_cq[i], IB_CQ_NEXT_COMP); + if (ret) + goto out_cq; + + ret = ib_req_notify_cq(device->dev_tx_cq[i], IB_CQ_NEXT_COMP); + if (ret) + goto out_cq; + } + + return 0; + +out_cq: + for (j = 0; j < i; j++) { + cq_desc = &device->cq_desc[j]; + + if (device->dev_rx_cq[j]) { + cancel_work_sync(&cq_desc->cq_rx_work); + ib_destroy_cq(device->dev_rx_cq[j]); + } + if (device->dev_tx_cq[j]) { + cancel_work_sync(&cq_desc->cq_tx_work); + ib_destroy_cq(device->dev_tx_cq[j]); + } + } + kfree(device->cq_desc); + + return ret; +} + +static void +isert_free_device_ib_res(struct isert_device *device) +{ + struct isert_cq_desc *cq_desc; + int i; + + for (i = 0; i < device->cqs_used; i++) { + cq_desc = &device->cq_desc[i]; + + cancel_work_sync(&cq_desc->cq_rx_work); + cancel_work_sync(&cq_desc->cq_tx_work); + ib_destroy_cq(device->dev_rx_cq[i]); + ib_destroy_cq(device->dev_tx_cq[i]); + device->dev_rx_cq[i] = NULL; + device->dev_tx_cq[i] = NULL; + } + + kfree(device->cq_desc); +} + +static void +isert_device_try_release(struct isert_device *device) +{ + mutex_lock(&device_list_mutex); + device->refcount--; + if (!device->refcount) { + isert_free_device_ib_res(device); + list_del(&device->dev_node); + kfree(device); + } + mutex_unlock(&device_list_mutex); +} + +static struct isert_device * +isert_device_find_by_ib_dev(struct rdma_cm_id *cma_id) +{ + struct isert_device *device; + int ret; + + mutex_lock(&device_list_mutex); + list_for_each_entry(device, &device_list, dev_node) { + if (device->ib_device->node_guid == cma_id->device->node_guid) { + device->refcount++; + mutex_unlock(&device_list_mutex); + return device; + } + } + + device = kzalloc(sizeof(struct isert_device), GFP_KERNEL); + if (!device) { + mutex_unlock(&device_list_mutex); + return ERR_PTR(-ENOMEM); + } + + INIT_LIST_HEAD(&device->dev_node); + + device->ib_device = cma_id->device; + ret = isert_create_device_ib_res(device); + if (ret) { + kfree(device); + mutex_unlock(&device_list_mutex); + return ERR_PTR(ret); + } + + device->refcount++; + list_add_tail(&device->dev_node, &device_list); + mutex_unlock(&device_list_mutex); + + return device; +} + +static void +isert_conn_free_fastreg_pool(struct isert_conn *isert_conn) +{ + struct fast_reg_descriptor *fr_desc, *tmp; + int i = 0; + + if (list_empty(&isert_conn->conn_fr_pool)) + return; + + pr_debug("Freeing conn %p fastreg pool", isert_conn); + + list_for_each_entry_safe(fr_desc, tmp, + &isert_conn->conn_fr_pool, list) { + list_del(&fr_desc->list); + ib_free_fast_reg_page_list(fr_desc->data_frpl); + ib_dereg_mr(fr_desc->data_mr); + if (fr_desc->pi_ctx) { + ib_free_fast_reg_page_list(fr_desc->pi_ctx->prot_frpl); + ib_dereg_mr(fr_desc->pi_ctx->prot_mr); + ib_destroy_mr(fr_desc->pi_ctx->sig_mr); + kfree(fr_desc->pi_ctx); + } + kfree(fr_desc); + ++i; + } + + if (i < isert_conn->conn_fr_pool_size) + pr_warn("Pool still has %d regions registered\n", + isert_conn->conn_fr_pool_size - i); +} + +static int +isert_create_fr_desc(struct ib_device *ib_device, struct ib_pd *pd, + struct fast_reg_descriptor *fr_desc, u8 protection) +{ + int ret; + + fr_desc->data_frpl = ib_alloc_fast_reg_page_list(ib_device, + ISCSI_ISER_SG_TABLESIZE); + if (IS_ERR(fr_desc->data_frpl)) { + pr_err("Failed to allocate data frpl err=%ld\n", + PTR_ERR(fr_desc->data_frpl)); + return PTR_ERR(fr_desc->data_frpl); + } + + fr_desc->data_mr = ib_alloc_fast_reg_mr(pd, ISCSI_ISER_SG_TABLESIZE); + if (IS_ERR(fr_desc->data_mr)) { + pr_err("Failed to allocate data frmr err=%ld\n", + PTR_ERR(fr_desc->data_mr)); + ret = PTR_ERR(fr_desc->data_mr); + goto err_data_frpl; + } + pr_debug("Create fr_desc %p page_list %p\n", + fr_desc, fr_desc->data_frpl->page_list); + fr_desc->ind |= ISERT_DATA_KEY_VALID; + + if (protection) { + struct ib_mr_init_attr mr_init_attr = {0}; + struct pi_context *pi_ctx; + + fr_desc->pi_ctx = kzalloc(sizeof(*fr_desc->pi_ctx), GFP_KERNEL); + if (!fr_desc->pi_ctx) { + pr_err("Failed to allocate pi context\n"); + ret = -ENOMEM; + goto err_data_mr; + } + pi_ctx = fr_desc->pi_ctx; + + pi_ctx->prot_frpl = ib_alloc_fast_reg_page_list(ib_device, + ISCSI_ISER_SG_TABLESIZE); + if (IS_ERR(pi_ctx->prot_frpl)) { + pr_err("Failed to allocate prot frpl err=%ld\n", + PTR_ERR(pi_ctx->prot_frpl)); + ret = PTR_ERR(pi_ctx->prot_frpl); + goto err_pi_ctx; + } + + pi_ctx->prot_mr = ib_alloc_fast_reg_mr(pd, ISCSI_ISER_SG_TABLESIZE); + if (IS_ERR(pi_ctx->prot_mr)) { + pr_err("Failed to allocate prot frmr err=%ld\n", + PTR_ERR(pi_ctx->prot_mr)); + ret = PTR_ERR(pi_ctx->prot_mr); + goto err_prot_frpl; + } + fr_desc->ind |= ISERT_PROT_KEY_VALID; + + mr_init_attr.max_reg_descriptors = 2; + mr_init_attr.flags |= IB_MR_SIGNATURE_EN; + pi_ctx->sig_mr = ib_create_mr(pd, &mr_init_attr); + if (IS_ERR(pi_ctx->sig_mr)) { + pr_err("Failed to allocate signature enabled mr err=%ld\n", + PTR_ERR(pi_ctx->sig_mr)); + ret = PTR_ERR(pi_ctx->sig_mr); + goto err_prot_mr; + } + fr_desc->ind |= ISERT_SIG_KEY_VALID; + } + fr_desc->ind &= ~ISERT_PROTECTED; + + return 0; +err_prot_mr: + ib_dereg_mr(fr_desc->pi_ctx->prot_mr); +err_prot_frpl: + ib_free_fast_reg_page_list(fr_desc->pi_ctx->prot_frpl); +err_pi_ctx: + kfree(fr_desc->pi_ctx); +err_data_mr: + ib_dereg_mr(fr_desc->data_mr); +err_data_frpl: + ib_free_fast_reg_page_list(fr_desc->data_frpl); + + return ret; +} + +static int +isert_conn_create_fastreg_pool(struct isert_conn *isert_conn, u8 pi_support) +{ + struct fast_reg_descriptor *fr_desc; + struct isert_device *device = isert_conn->conn_device; + struct se_session *se_sess = isert_conn->conn->sess->se_sess; + struct se_node_acl *se_nacl = se_sess->se_node_acl; + int i, ret, tag_num; + /* + * Setup the number of FRMRs based upon the number of tags + * available to session in iscsi_target_locate_portal(). + */ + tag_num = max_t(u32, ISCSIT_MIN_TAGS, se_nacl->queue_depth); + tag_num = (tag_num * 2) + ISCSIT_EXTRA_TAGS; + + isert_conn->conn_fr_pool_size = 0; + for (i = 0; i < tag_num; i++) { + fr_desc = kzalloc(sizeof(*fr_desc), GFP_KERNEL); + if (!fr_desc) { + pr_err("Failed to allocate fast_reg descriptor\n"); + ret = -ENOMEM; + goto err; + } + + ret = isert_create_fr_desc(device->ib_device, + isert_conn->conn_pd, fr_desc, + pi_support); + if (ret) { + pr_err("Failed to create fastreg descriptor err=%d\n", + ret); + kfree(fr_desc); + goto err; + } + + list_add_tail(&fr_desc->list, &isert_conn->conn_fr_pool); + isert_conn->conn_fr_pool_size++; + } + + pr_debug("Creating conn %p fastreg pool size=%d", + isert_conn, isert_conn->conn_fr_pool_size); + + return 0; + +err: + isert_conn_free_fastreg_pool(isert_conn); + return ret; +} + +static int +isert_connect_request(struct rdma_cm_id *cma_id, struct rdma_cm_event *event) +{ + struct iscsi_np *np = cma_id->context; + struct isert_np *isert_np = np->np_context; + struct isert_conn *isert_conn; + struct isert_device *device; + struct ib_device *ib_dev = cma_id->device; + int ret = 0; + u8 pi_support; + + spin_lock_bh(&np->np_thread_lock); + if (!np->enabled) { + spin_unlock_bh(&np->np_thread_lock); + pr_debug("iscsi_np is not enabled, reject connect request\n"); + return rdma_reject(cma_id, NULL, 0); + } + spin_unlock_bh(&np->np_thread_lock); + + pr_debug("Entering isert_connect_request cma_id: %p, context: %p\n", + cma_id, cma_id->context); + + isert_conn = kzalloc(sizeof(struct isert_conn), GFP_KERNEL); + if (!isert_conn) { + pr_err("Unable to allocate isert_conn\n"); + return -ENOMEM; + } + isert_conn->state = ISER_CONN_INIT; + INIT_LIST_HEAD(&isert_conn->conn_accept_node); + init_completion(&isert_conn->conn_login_comp); + init_completion(&isert_conn->conn_wait); + init_completion(&isert_conn->conn_wait_comp_err); + kref_init(&isert_conn->conn_kref); + mutex_init(&isert_conn->conn_mutex); + spin_lock_init(&isert_conn->conn_lock); + INIT_LIST_HEAD(&isert_conn->conn_fr_pool); + + cma_id->context = isert_conn; + isert_conn->conn_cm_id = cma_id; + + isert_conn->login_buf = kzalloc(ISCSI_DEF_MAX_RECV_SEG_LEN + + ISER_RX_LOGIN_SIZE, GFP_KERNEL); + if (!isert_conn->login_buf) { + pr_err("Unable to allocate isert_conn->login_buf\n"); + ret = -ENOMEM; + goto out; + } + + isert_conn->login_req_buf = isert_conn->login_buf; + isert_conn->login_rsp_buf = isert_conn->login_buf + + ISCSI_DEF_MAX_RECV_SEG_LEN; + pr_debug("Set login_buf: %p login_req_buf: %p login_rsp_buf: %p\n", + isert_conn->login_buf, isert_conn->login_req_buf, + isert_conn->login_rsp_buf); + + isert_conn->login_req_dma = ib_dma_map_single(ib_dev, + (void *)isert_conn->login_req_buf, + ISCSI_DEF_MAX_RECV_SEG_LEN, DMA_FROM_DEVICE); + + ret = ib_dma_mapping_error(ib_dev, isert_conn->login_req_dma); + if (ret) { + pr_err("ib_dma_mapping_error failed for login_req_dma: %d\n", + ret); + isert_conn->login_req_dma = 0; + goto out_login_buf; + } + + isert_conn->login_rsp_dma = ib_dma_map_single(ib_dev, + (void *)isert_conn->login_rsp_buf, + ISER_RX_LOGIN_SIZE, DMA_TO_DEVICE); + + ret = ib_dma_mapping_error(ib_dev, isert_conn->login_rsp_dma); + if (ret) { + pr_err("ib_dma_mapping_error failed for login_rsp_dma: %d\n", + ret); + isert_conn->login_rsp_dma = 0; + goto out_req_dma_map; + } + + device = isert_device_find_by_ib_dev(cma_id); + if (IS_ERR(device)) { + ret = PTR_ERR(device); + goto out_rsp_dma_map; + } + + /* Set max inflight RDMA READ requests */ + isert_conn->initiator_depth = min_t(u8, + event->param.conn.initiator_depth, + device->dev_attr.max_qp_init_rd_atom); + pr_debug("Using initiator_depth: %u\n", isert_conn->initiator_depth); + + isert_conn->conn_device = device; + isert_conn->conn_pd = ib_alloc_pd(isert_conn->conn_device->ib_device); + if (IS_ERR(isert_conn->conn_pd)) { + ret = PTR_ERR(isert_conn->conn_pd); + pr_err("ib_alloc_pd failed for conn %p: ret=%d\n", + isert_conn, ret); + goto out_pd; + } + + isert_conn->conn_mr = ib_get_dma_mr(isert_conn->conn_pd, + IB_ACCESS_LOCAL_WRITE); + if (IS_ERR(isert_conn->conn_mr)) { + ret = PTR_ERR(isert_conn->conn_mr); + pr_err("ib_get_dma_mr failed for conn %p: ret=%d\n", + isert_conn, ret); + goto out_mr; + } + + pi_support = np->tpg_np->tpg->tpg_attrib.t10_pi; + if (pi_support && !device->pi_capable) { + pr_err("Protection information requested but not supported, " + "rejecting connect request\n"); + ret = rdma_reject(cma_id, NULL, 0); + goto out_mr; + } + + ret = isert_conn_setup_qp(isert_conn, cma_id, pi_support); + if (ret) + goto out_conn_dev; + + mutex_lock(&isert_np->np_accept_mutex); + list_add_tail(&isert_conn->conn_accept_node, &isert_np->np_accept_list); + mutex_unlock(&isert_np->np_accept_mutex); + + pr_debug("isert_connect_request() up np_sem np: %p\n", np); + up(&isert_np->np_sem); + return 0; + +out_conn_dev: + ib_dereg_mr(isert_conn->conn_mr); +out_mr: + ib_dealloc_pd(isert_conn->conn_pd); +out_pd: + isert_device_try_release(device); +out_rsp_dma_map: + ib_dma_unmap_single(ib_dev, isert_conn->login_rsp_dma, + ISER_RX_LOGIN_SIZE, DMA_TO_DEVICE); +out_req_dma_map: + ib_dma_unmap_single(ib_dev, isert_conn->login_req_dma, + ISCSI_DEF_MAX_RECV_SEG_LEN, DMA_FROM_DEVICE); +out_login_buf: + kfree(isert_conn->login_buf); +out: + kfree(isert_conn); + return ret; +} + +static void +isert_connect_release(struct isert_conn *isert_conn) +{ + struct ib_device *ib_dev = isert_conn->conn_cm_id->device; + struct isert_device *device = isert_conn->conn_device; + int cq_index; + + pr_debug("Entering isert_connect_release(): >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\n"); + + if (device && device->use_fastreg) + isert_conn_free_fastreg_pool(isert_conn); + + if (isert_conn->conn_qp) { + cq_index = ((struct isert_cq_desc *) + isert_conn->conn_qp->recv_cq->cq_context)->cq_index; + pr_debug("isert_connect_release: cq_index: %d\n", cq_index); + isert_conn->conn_device->cq_active_qps[cq_index]--; + + rdma_destroy_qp(isert_conn->conn_cm_id); + } + + isert_free_rx_descriptors(isert_conn); + rdma_destroy_id(isert_conn->conn_cm_id); + + ib_dereg_mr(isert_conn->conn_mr); + ib_dealloc_pd(isert_conn->conn_pd); + + if (isert_conn->login_buf) { + ib_dma_unmap_single(ib_dev, isert_conn->login_rsp_dma, + ISER_RX_LOGIN_SIZE, DMA_TO_DEVICE); + ib_dma_unmap_single(ib_dev, isert_conn->login_req_dma, + ISCSI_DEF_MAX_RECV_SEG_LEN, + DMA_FROM_DEVICE); + kfree(isert_conn->login_buf); + } + kfree(isert_conn); + + if (device) + isert_device_try_release(device); + + pr_debug("Leaving isert_connect_release >>>>>>>>>>>>\n"); +} + +static void +isert_connected_handler(struct rdma_cm_id *cma_id) +{ + struct isert_conn *isert_conn = cma_id->context; + + kref_get(&isert_conn->conn_kref); +} + +static void +isert_release_conn_kref(struct kref *kref) +{ + struct isert_conn *isert_conn = container_of(kref, + struct isert_conn, conn_kref); + + pr_debug("Calling isert_connect_release for final kref %s/%d\n", + current->comm, current->pid); + + isert_connect_release(isert_conn); +} + +static void +isert_put_conn(struct isert_conn *isert_conn) +{ + kref_put(&isert_conn->conn_kref, isert_release_conn_kref); +} + +static void +isert_disconnect_work(struct work_struct *work) +{ + struct isert_conn *isert_conn = container_of(work, + struct isert_conn, conn_logout_work); + + pr_debug("isert_disconnect_work(): >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\n"); + mutex_lock(&isert_conn->conn_mutex); + if (isert_conn->state == ISER_CONN_UP) + isert_conn->state = ISER_CONN_TERMINATING; + + if (isert_conn->post_recv_buf_count == 0 && + atomic_read(&isert_conn->post_send_buf_count) == 0) { + mutex_unlock(&isert_conn->conn_mutex); + goto wake_up; + } + if (!isert_conn->conn_cm_id) { + mutex_unlock(&isert_conn->conn_mutex); + isert_put_conn(isert_conn); + return; + } + + if (isert_conn->disconnect) { + /* Send DREQ/DREP towards our initiator */ + rdma_disconnect(isert_conn->conn_cm_id); + } + + mutex_unlock(&isert_conn->conn_mutex); + +wake_up: + complete(&isert_conn->conn_wait); +} + +static int +isert_disconnected_handler(struct rdma_cm_id *cma_id, bool disconnect) +{ + struct isert_conn *isert_conn; + + if (!cma_id->qp) { + struct isert_np *isert_np = cma_id->context; + + isert_np->np_cm_id = NULL; + return -1; + } + + isert_conn = (struct isert_conn *)cma_id->context; + + isert_conn->disconnect = disconnect; + INIT_WORK(&isert_conn->conn_logout_work, isert_disconnect_work); + schedule_work(&isert_conn->conn_logout_work); + + return 0; +} + +static int +isert_cma_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *event) +{ + int ret = 0; + bool disconnect = false; + + pr_debug("isert_cma_handler: event %d status %d conn %p id %p\n", + event->event, event->status, cma_id->context, cma_id); + + switch (event->event) { + case RDMA_CM_EVENT_CONNECT_REQUEST: + ret = isert_connect_request(cma_id, event); + if (ret) + pr_err("isert_cma_handler failed RDMA_CM_EVENT: 0x%08x %d\n", + event->event, ret); + break; + case RDMA_CM_EVENT_ESTABLISHED: + isert_connected_handler(cma_id); + break; + case RDMA_CM_EVENT_ADDR_CHANGE: /* FALLTHRU */ + case RDMA_CM_EVENT_DISCONNECTED: /* FALLTHRU */ + case RDMA_CM_EVENT_DEVICE_REMOVAL: /* FALLTHRU */ + disconnect = true; + case RDMA_CM_EVENT_TIMEWAIT_EXIT: /* FALLTHRU */ + ret = isert_disconnected_handler(cma_id, disconnect); + break; + case RDMA_CM_EVENT_CONNECT_ERROR: + default: + pr_err("Unhandled RDMA CMA event: %d\n", event->event); + break; + } + + return ret; +} + +static int +isert_post_recv(struct isert_conn *isert_conn, u32 count) +{ + struct ib_recv_wr *rx_wr, *rx_wr_failed; + int i, ret; + unsigned int rx_head = isert_conn->conn_rx_desc_head; + struct iser_rx_desc *rx_desc; + + for (rx_wr = isert_conn->conn_rx_wr, i = 0; i < count; i++, rx_wr++) { + rx_desc = &isert_conn->conn_rx_descs[rx_head]; + rx_wr->wr_id = (unsigned long)rx_desc; + rx_wr->sg_list = &rx_desc->rx_sg; + rx_wr->num_sge = 1; + rx_wr->next = rx_wr + 1; + rx_head = (rx_head + 1) & (ISERT_QP_MAX_RECV_DTOS - 1); + } + + rx_wr--; + rx_wr->next = NULL; /* mark end of work requests list */ + + isert_conn->post_recv_buf_count += count; + ret = ib_post_recv(isert_conn->conn_qp, isert_conn->conn_rx_wr, + &rx_wr_failed); + if (ret) { + pr_err("ib_post_recv() failed with ret: %d\n", ret); + isert_conn->post_recv_buf_count -= count; + } else { + pr_debug("isert_post_recv(): Posted %d RX buffers\n", count); + isert_conn->conn_rx_desc_head = rx_head; + } + return ret; +} + +static int +isert_post_send(struct isert_conn *isert_conn, struct iser_tx_desc *tx_desc) +{ + struct ib_device *ib_dev = isert_conn->conn_cm_id->device; + struct ib_send_wr send_wr, *send_wr_failed; + int ret; + + ib_dma_sync_single_for_device(ib_dev, tx_desc->dma_addr, + ISER_HEADERS_LEN, DMA_TO_DEVICE); + + send_wr.next = NULL; + send_wr.wr_id = (unsigned long)tx_desc; + send_wr.sg_list = tx_desc->tx_sg; + send_wr.num_sge = tx_desc->num_sge; + send_wr.opcode = IB_WR_SEND; + send_wr.send_flags = IB_SEND_SIGNALED; + + atomic_inc(&isert_conn->post_send_buf_count); + + ret = ib_post_send(isert_conn->conn_qp, &send_wr, &send_wr_failed); + if (ret) { + pr_err("ib_post_send() failed, ret: %d\n", ret); + atomic_dec(&isert_conn->post_send_buf_count); + } + + return ret; +} + +static void +isert_create_send_desc(struct isert_conn *isert_conn, + struct isert_cmd *isert_cmd, + struct iser_tx_desc *tx_desc) +{ + struct ib_device *ib_dev = isert_conn->conn_cm_id->device; + + ib_dma_sync_single_for_cpu(ib_dev, tx_desc->dma_addr, + ISER_HEADERS_LEN, DMA_TO_DEVICE); + + memset(&tx_desc->iser_header, 0, sizeof(struct iser_hdr)); + tx_desc->iser_header.flags = ISER_VER; + + tx_desc->num_sge = 1; + tx_desc->isert_cmd = isert_cmd; + + if (tx_desc->tx_sg[0].lkey != isert_conn->conn_mr->lkey) { + tx_desc->tx_sg[0].lkey = isert_conn->conn_mr->lkey; + pr_debug("tx_desc %p lkey mismatch, fixing\n", tx_desc); + } +} + +static int +isert_init_tx_hdrs(struct isert_conn *isert_conn, + struct iser_tx_desc *tx_desc) +{ + struct ib_device *ib_dev = isert_conn->conn_cm_id->device; + u64 dma_addr; + + dma_addr = ib_dma_map_single(ib_dev, (void *)tx_desc, + ISER_HEADERS_LEN, DMA_TO_DEVICE); + if (ib_dma_mapping_error(ib_dev, dma_addr)) { + pr_err("ib_dma_mapping_error() failed\n"); + return -ENOMEM; + } + + tx_desc->dma_addr = dma_addr; + tx_desc->tx_sg[0].addr = tx_desc->dma_addr; + tx_desc->tx_sg[0].length = ISER_HEADERS_LEN; + tx_desc->tx_sg[0].lkey = isert_conn->conn_mr->lkey; + + pr_debug("isert_init_tx_hdrs: Setup tx_sg[0].addr: 0x%llx length: %u" + " lkey: 0x%08x\n", tx_desc->tx_sg[0].addr, + tx_desc->tx_sg[0].length, tx_desc->tx_sg[0].lkey); + + return 0; +} + +static void +isert_init_send_wr(struct isert_conn *isert_conn, struct isert_cmd *isert_cmd, + struct ib_send_wr *send_wr, bool coalesce) +{ + struct iser_tx_desc *tx_desc = &isert_cmd->tx_desc; + + isert_cmd->rdma_wr.iser_ib_op = ISER_IB_SEND; + send_wr->wr_id = (unsigned long)&isert_cmd->tx_desc; + send_wr->opcode = IB_WR_SEND; + send_wr->sg_list = &tx_desc->tx_sg[0]; + send_wr->num_sge = isert_cmd->tx_desc.num_sge; + /* + * Coalesce send completion interrupts by only setting IB_SEND_SIGNALED + * bit for every ISERT_COMP_BATCH_COUNT number of ib_post_send() calls. + */ + mutex_lock(&isert_conn->conn_mutex); + if (coalesce && isert_conn->state == ISER_CONN_UP && + ++isert_conn->conn_comp_batch < ISERT_COMP_BATCH_COUNT) { + tx_desc->llnode_active = true; + llist_add(&tx_desc->comp_llnode, &isert_conn->conn_comp_llist); + mutex_unlock(&isert_conn->conn_mutex); + return; + } + isert_conn->conn_comp_batch = 0; + tx_desc->comp_llnode_batch = llist_del_all(&isert_conn->conn_comp_llist); + mutex_unlock(&isert_conn->conn_mutex); + + send_wr->send_flags = IB_SEND_SIGNALED; +} + +static int +isert_rdma_post_recvl(struct isert_conn *isert_conn) +{ + struct ib_recv_wr rx_wr, *rx_wr_fail; + struct ib_sge sge; + int ret; + + memset(&sge, 0, sizeof(struct ib_sge)); + sge.addr = isert_conn->login_req_dma; + sge.length = ISER_RX_LOGIN_SIZE; + sge.lkey = isert_conn->conn_mr->lkey; + + pr_debug("Setup sge: addr: %llx length: %d 0x%08x\n", + sge.addr, sge.length, sge.lkey); + + memset(&rx_wr, 0, sizeof(struct ib_recv_wr)); + rx_wr.wr_id = (unsigned long)isert_conn->login_req_buf; + rx_wr.sg_list = &sge; + rx_wr.num_sge = 1; + + isert_conn->post_recv_buf_count++; + ret = ib_post_recv(isert_conn->conn_qp, &rx_wr, &rx_wr_fail); + if (ret) { + pr_err("ib_post_recv() failed: %d\n", ret); + isert_conn->post_recv_buf_count--; + } + + pr_debug("ib_post_recv(): returned success >>>>>>>>>>>>>>>>>>>>>>>>\n"); + return ret; +} + +static int +isert_put_login_tx(struct iscsi_conn *conn, struct iscsi_login *login, + u32 length) +{ + struct isert_conn *isert_conn = conn->context; + struct ib_device *ib_dev = isert_conn->conn_cm_id->device; + struct iser_tx_desc *tx_desc = &isert_conn->conn_login_tx_desc; + int ret; + + isert_create_send_desc(isert_conn, NULL, tx_desc); + + memcpy(&tx_desc->iscsi_header, &login->rsp[0], + sizeof(struct iscsi_hdr)); + + isert_init_tx_hdrs(isert_conn, tx_desc); + + if (length > 0) { + struct ib_sge *tx_dsg = &tx_desc->tx_sg[1]; + + ib_dma_sync_single_for_cpu(ib_dev, isert_conn->login_rsp_dma, + length, DMA_TO_DEVICE); + + memcpy(isert_conn->login_rsp_buf, login->rsp_buf, length); + + ib_dma_sync_single_for_device(ib_dev, isert_conn->login_rsp_dma, + length, DMA_TO_DEVICE); + + tx_dsg->addr = isert_conn->login_rsp_dma; + tx_dsg->length = length; + tx_dsg->lkey = isert_conn->conn_mr->lkey; + tx_desc->num_sge = 2; + } + if (!login->login_failed) { + if (login->login_complete) { + if (!conn->sess->sess_ops->SessionType && + isert_conn->conn_device->use_fastreg) { + /* Normal Session and fastreg is used */ + u8 pi_support = login->np->tpg_np->tpg->tpg_attrib.t10_pi; + + ret = isert_conn_create_fastreg_pool(isert_conn, + pi_support); + if (ret) { + pr_err("Conn: %p failed to create" + " fastreg pool\n", isert_conn); + return ret; + } + } + + ret = isert_alloc_rx_descriptors(isert_conn); + if (ret) + return ret; + + ret = isert_post_recv(isert_conn, ISERT_MIN_POSTED_RX); + if (ret) + return ret; + + isert_conn->state = ISER_CONN_UP; + goto post_send; + } + + ret = isert_rdma_post_recvl(isert_conn); + if (ret) + return ret; + } +post_send: + ret = isert_post_send(isert_conn, tx_desc); + if (ret) + return ret; + + return 0; +} + +static void +isert_rx_login_req(struct iser_rx_desc *rx_desc, int rx_buflen, + struct isert_conn *isert_conn) +{ + struct iscsi_conn *conn = isert_conn->conn; + struct iscsi_login *login = conn->conn_login; + int size; + + if (!login) { + pr_err("conn->conn_login is NULL\n"); + dump_stack(); + return; + } + + if (login->first_request) { + struct iscsi_login_req *login_req = + (struct iscsi_login_req *)&rx_desc->iscsi_header; + /* + * Setup the initial iscsi_login values from the leading + * login request PDU. + */ + login->leading_connection = (!login_req->tsih) ? 1 : 0; + login->current_stage = + (login_req->flags & ISCSI_FLAG_LOGIN_CURRENT_STAGE_MASK) + >> 2; + login->version_min = login_req->min_version; + login->version_max = login_req->max_version; + memcpy(login->isid, login_req->isid, 6); + login->cmd_sn = be32_to_cpu(login_req->cmdsn); + login->init_task_tag = login_req->itt; + login->initial_exp_statsn = be32_to_cpu(login_req->exp_statsn); + login->cid = be16_to_cpu(login_req->cid); + login->tsih = be16_to_cpu(login_req->tsih); + } + + memcpy(&login->req[0], (void *)&rx_desc->iscsi_header, ISCSI_HDR_LEN); + + size = min(rx_buflen, MAX_KEY_VALUE_PAIRS); + pr_debug("Using login payload size: %d, rx_buflen: %d MAX_KEY_VALUE_PAIRS: %d\n", + size, rx_buflen, MAX_KEY_VALUE_PAIRS); + memcpy(login->req_buf, &rx_desc->data[0], size); + + if (login->first_request) { + complete(&isert_conn->conn_login_comp); + return; + } + schedule_delayed_work(&conn->login_work, 0); +} + +static struct iscsi_cmd +*isert_allocate_cmd(struct iscsi_conn *conn) +{ + struct isert_conn *isert_conn = (struct isert_conn *)conn->context; + struct isert_cmd *isert_cmd; + struct iscsi_cmd *cmd; + + cmd = iscsit_allocate_cmd(conn, TASK_INTERRUPTIBLE); + if (!cmd) { + pr_err("Unable to allocate iscsi_cmd + isert_cmd\n"); + return NULL; + } + isert_cmd = iscsit_priv_cmd(cmd); + isert_cmd->conn = isert_conn; + isert_cmd->iscsi_cmd = cmd; + + return cmd; +} + +static int +isert_handle_scsi_cmd(struct isert_conn *isert_conn, + struct isert_cmd *isert_cmd, struct iscsi_cmd *cmd, + struct iser_rx_desc *rx_desc, unsigned char *buf) +{ + struct iscsi_conn *conn = isert_conn->conn; + struct iscsi_scsi_req *hdr = (struct iscsi_scsi_req *)buf; + struct scatterlist *sg; + int imm_data, imm_data_len, unsol_data, sg_nents, rc; + bool dump_payload = false; + + rc = iscsit_setup_scsi_cmd(conn, cmd, buf); + if (rc < 0) + return rc; + + imm_data = cmd->immediate_data; + imm_data_len = cmd->first_burst_len; + unsol_data = cmd->unsolicited_data; + + rc = iscsit_process_scsi_cmd(conn, cmd, hdr); + if (rc < 0) { + return 0; + } else if (rc > 0) { + dump_payload = true; + goto sequence_cmd; + } + + if (!imm_data) + return 0; + + sg = &cmd->se_cmd.t_data_sg[0]; + sg_nents = max(1UL, DIV_ROUND_UP(imm_data_len, PAGE_SIZE)); + + pr_debug("Copying Immediate SG: %p sg_nents: %u from %p imm_data_len: %d\n", + sg, sg_nents, &rx_desc->data[0], imm_data_len); + + sg_copy_from_buffer(sg, sg_nents, &rx_desc->data[0], imm_data_len); + + cmd->write_data_done += imm_data_len; + + if (cmd->write_data_done == cmd->se_cmd.data_length) { + spin_lock_bh(&cmd->istate_lock); + cmd->cmd_flags |= ICF_GOT_LAST_DATAOUT; + cmd->i_state = ISTATE_RECEIVED_LAST_DATAOUT; + spin_unlock_bh(&cmd->istate_lock); + } + +sequence_cmd: + rc = iscsit_sequence_cmd(conn, cmd, buf, hdr->cmdsn); + + if (!rc && dump_payload == false && unsol_data) + iscsit_set_unsoliticed_dataout(cmd); + else if (dump_payload && imm_data) + target_put_sess_cmd(conn->sess->se_sess, &cmd->se_cmd); + + return 0; +} + +static int +isert_handle_iscsi_dataout(struct isert_conn *isert_conn, + struct iser_rx_desc *rx_desc, unsigned char *buf) +{ + struct scatterlist *sg_start; + struct iscsi_conn *conn = isert_conn->conn; + struct iscsi_cmd *cmd = NULL; + struct iscsi_data *hdr = (struct iscsi_data *)buf; + u32 unsol_data_len = ntoh24(hdr->dlength); + int rc, sg_nents, sg_off, page_off; + + rc = iscsit_check_dataout_hdr(conn, buf, &cmd); + if (rc < 0) + return rc; + else if (!cmd) + return 0; + /* + * FIXME: Unexpected unsolicited_data out + */ + if (!cmd->unsolicited_data) { + pr_err("Received unexpected solicited data payload\n"); + dump_stack(); + return -1; + } + + pr_debug("Unsolicited DataOut unsol_data_len: %u, write_data_done: %u, data_length: %u\n", + unsol_data_len, cmd->write_data_done, cmd->se_cmd.data_length); + + sg_off = cmd->write_data_done / PAGE_SIZE; + sg_start = &cmd->se_cmd.t_data_sg[sg_off]; + sg_nents = max(1UL, DIV_ROUND_UP(unsol_data_len, PAGE_SIZE)); + page_off = cmd->write_data_done % PAGE_SIZE; + /* + * FIXME: Non page-aligned unsolicited_data out + */ + if (page_off) { + pr_err("Received unexpected non-page aligned data payload\n"); + dump_stack(); + return -1; + } + pr_debug("Copying DataOut: sg_start: %p, sg_off: %u sg_nents: %u from %p %u\n", + sg_start, sg_off, sg_nents, &rx_desc->data[0], unsol_data_len); + + sg_copy_from_buffer(sg_start, sg_nents, &rx_desc->data[0], + unsol_data_len); + + rc = iscsit_check_dataout_payload(cmd, hdr, false); + if (rc < 0) + return rc; + + return 0; +} + +static int +isert_handle_nop_out(struct isert_conn *isert_conn, struct isert_cmd *isert_cmd, + struct iscsi_cmd *cmd, struct iser_rx_desc *rx_desc, + unsigned char *buf) +{ + struct iscsi_conn *conn = isert_conn->conn; + struct iscsi_nopout *hdr = (struct iscsi_nopout *)buf; + int rc; + + rc = iscsit_setup_nop_out(conn, cmd, hdr); + if (rc < 0) + return rc; + /* + * FIXME: Add support for NOPOUT payload using unsolicited RDMA payload + */ + + return iscsit_process_nop_out(conn, cmd, hdr); +} + +static int +isert_handle_text_cmd(struct isert_conn *isert_conn, struct isert_cmd *isert_cmd, + struct iscsi_cmd *cmd, struct iser_rx_desc *rx_desc, + struct iscsi_text *hdr) +{ + struct iscsi_conn *conn = isert_conn->conn; + u32 payload_length = ntoh24(hdr->dlength); + int rc; + unsigned char *text_in; + + rc = iscsit_setup_text_cmd(conn, cmd, hdr); + if (rc < 0) + return rc; + + text_in = kzalloc(payload_length, GFP_KERNEL); + if (!text_in) { + pr_err("Unable to allocate text_in of payload_length: %u\n", + payload_length); + return -ENOMEM; + } + cmd->text_in_ptr = text_in; + + memcpy(cmd->text_in_ptr, &rx_desc->data[0], payload_length); + + return iscsit_process_text_cmd(conn, cmd, hdr); +} + +static int +isert_rx_opcode(struct isert_conn *isert_conn, struct iser_rx_desc *rx_desc, + uint32_t read_stag, uint64_t read_va, + uint32_t write_stag, uint64_t write_va) +{ + struct iscsi_hdr *hdr = &rx_desc->iscsi_header; + struct iscsi_conn *conn = isert_conn->conn; + struct iscsi_session *sess = conn->sess; + struct iscsi_cmd *cmd; + struct isert_cmd *isert_cmd; + int ret = -EINVAL; + u8 opcode = (hdr->opcode & ISCSI_OPCODE_MASK); + + if (sess->sess_ops->SessionType && + (!(opcode & ISCSI_OP_TEXT) || !(opcode & ISCSI_OP_LOGOUT))) { + pr_err("Got illegal opcode: 0x%02x in SessionType=Discovery," + " ignoring\n", opcode); + return 0; + } + + switch (opcode) { + case ISCSI_OP_SCSI_CMD: + cmd = isert_allocate_cmd(conn); + if (!cmd) + break; + + isert_cmd = iscsit_priv_cmd(cmd); + isert_cmd->read_stag = read_stag; + isert_cmd->read_va = read_va; + isert_cmd->write_stag = write_stag; + isert_cmd->write_va = write_va; + + ret = isert_handle_scsi_cmd(isert_conn, isert_cmd, cmd, + rx_desc, (unsigned char *)hdr); + break; + case ISCSI_OP_NOOP_OUT: + cmd = isert_allocate_cmd(conn); + if (!cmd) + break; + + isert_cmd = iscsit_priv_cmd(cmd); + ret = isert_handle_nop_out(isert_conn, isert_cmd, cmd, + rx_desc, (unsigned char *)hdr); + break; + case ISCSI_OP_SCSI_DATA_OUT: + ret = isert_handle_iscsi_dataout(isert_conn, rx_desc, + (unsigned char *)hdr); + break; + case ISCSI_OP_SCSI_TMFUNC: + cmd = isert_allocate_cmd(conn); + if (!cmd) + break; + + ret = iscsit_handle_task_mgt_cmd(conn, cmd, + (unsigned char *)hdr); + break; + case ISCSI_OP_LOGOUT: + cmd = isert_allocate_cmd(conn); + if (!cmd) + break; + + ret = iscsit_handle_logout_cmd(conn, cmd, (unsigned char *)hdr); + if (ret > 0) + wait_for_completion_timeout(&conn->conn_logout_comp, + SECONDS_FOR_LOGOUT_COMP * + HZ); + break; + case ISCSI_OP_TEXT: + cmd = isert_allocate_cmd(conn); + if (!cmd) + break; + + isert_cmd = iscsit_priv_cmd(cmd); + ret = isert_handle_text_cmd(isert_conn, isert_cmd, cmd, + rx_desc, (struct iscsi_text *)hdr); + break; + default: + pr_err("Got unknown iSCSI OpCode: 0x%02x\n", opcode); + dump_stack(); + break; + } + + return ret; +} + +static void +isert_rx_do_work(struct iser_rx_desc *rx_desc, struct isert_conn *isert_conn) +{ + struct iser_hdr *iser_hdr = &rx_desc->iser_header; + uint64_t read_va = 0, write_va = 0; + uint32_t read_stag = 0, write_stag = 0; + int rc; + + switch (iser_hdr->flags & 0xF0) { + case ISCSI_CTRL: + if (iser_hdr->flags & ISER_RSV) { + read_stag = be32_to_cpu(iser_hdr->read_stag); + read_va = be64_to_cpu(iser_hdr->read_va); + pr_debug("ISER_RSV: read_stag: 0x%08x read_va: 0x%16llx\n", + read_stag, (unsigned long long)read_va); + } + if (iser_hdr->flags & ISER_WSV) { + write_stag = be32_to_cpu(iser_hdr->write_stag); + write_va = be64_to_cpu(iser_hdr->write_va); + pr_debug("ISER_WSV: write__stag: 0x%08x write_va: 0x%16llx\n", + write_stag, (unsigned long long)write_va); + } + + pr_debug("ISER ISCSI_CTRL PDU\n"); + break; + case ISER_HELLO: + pr_err("iSER Hello message\n"); + break; + default: + pr_warn("Unknown iSER hdr flags: 0x%02x\n", iser_hdr->flags); + break; + } + + rc = isert_rx_opcode(isert_conn, rx_desc, + read_stag, read_va, write_stag, write_va); +} + +static void +isert_rx_completion(struct iser_rx_desc *desc, struct isert_conn *isert_conn, + unsigned long xfer_len) +{ + struct ib_device *ib_dev = isert_conn->conn_cm_id->device; + struct iscsi_hdr *hdr; + u64 rx_dma; + int rx_buflen, outstanding; + + if ((char *)desc == isert_conn->login_req_buf) { + rx_dma = isert_conn->login_req_dma; + rx_buflen = ISER_RX_LOGIN_SIZE; + pr_debug("ISER login_buf: Using rx_dma: 0x%llx, rx_buflen: %d\n", + rx_dma, rx_buflen); + } else { + rx_dma = desc->dma_addr; + rx_buflen = ISER_RX_PAYLOAD_SIZE; + pr_debug("ISER req_buf: Using rx_dma: 0x%llx, rx_buflen: %d\n", + rx_dma, rx_buflen); + } + + ib_dma_sync_single_for_cpu(ib_dev, rx_dma, rx_buflen, DMA_FROM_DEVICE); + + hdr = &desc->iscsi_header; + pr_debug("iSCSI opcode: 0x%02x, ITT: 0x%08x, flags: 0x%02x dlen: %d\n", + hdr->opcode, hdr->itt, hdr->flags, + (int)(xfer_len - ISER_HEADERS_LEN)); + + if ((char *)desc == isert_conn->login_req_buf) + isert_rx_login_req(desc, xfer_len - ISER_HEADERS_LEN, + isert_conn); + else + isert_rx_do_work(desc, isert_conn); + + ib_dma_sync_single_for_device(ib_dev, rx_dma, rx_buflen, + DMA_FROM_DEVICE); + + isert_conn->post_recv_buf_count--; + pr_debug("iSERT: Decremented post_recv_buf_count: %d\n", + isert_conn->post_recv_buf_count); + + if ((char *)desc == isert_conn->login_req_buf) + return; + + outstanding = isert_conn->post_recv_buf_count; + if (outstanding + ISERT_MIN_POSTED_RX <= ISERT_QP_MAX_RECV_DTOS) { + int err, count = min(ISERT_QP_MAX_RECV_DTOS - outstanding, + ISERT_MIN_POSTED_RX); + err = isert_post_recv(isert_conn, count); + if (err) { + pr_err("isert_post_recv() count: %d failed, %d\n", + count, err); + } + } +} + +static int +isert_map_data_buf(struct isert_conn *isert_conn, struct isert_cmd *isert_cmd, + struct scatterlist *sg, u32 nents, u32 length, u32 offset, + enum iser_ib_op_code op, struct isert_data_buf *data) +{ + struct ib_device *ib_dev = isert_conn->conn_cm_id->device; + + data->dma_dir = op == ISER_IB_RDMA_WRITE ? + DMA_TO_DEVICE : DMA_FROM_DEVICE; + + data->len = length - offset; + data->offset = offset; + data->sg_off = data->offset / PAGE_SIZE; + + data->sg = &sg[data->sg_off]; + data->nents = min_t(unsigned int, nents - data->sg_off, + ISCSI_ISER_SG_TABLESIZE); + data->len = min_t(unsigned int, data->len, ISCSI_ISER_SG_TABLESIZE * + PAGE_SIZE); + + data->dma_nents = ib_dma_map_sg(ib_dev, data->sg, data->nents, + data->dma_dir); + if (unlikely(!data->dma_nents)) { + pr_err("Cmd: unable to dma map SGs %p\n", sg); + return -EINVAL; + } + + pr_debug("Mapped cmd: %p count: %u sg: %p sg_nents: %u rdma_len %d\n", + isert_cmd, data->dma_nents, data->sg, data->nents, data->len); + + return 0; +} + +static void +isert_unmap_data_buf(struct isert_conn *isert_conn, struct isert_data_buf *data) +{ + struct ib_device *ib_dev = isert_conn->conn_cm_id->device; + + ib_dma_unmap_sg(ib_dev, data->sg, data->nents, data->dma_dir); + memset(data, 0, sizeof(*data)); +} + + + +static void +isert_unmap_cmd(struct isert_cmd *isert_cmd, struct isert_conn *isert_conn) +{ + struct isert_rdma_wr *wr = &isert_cmd->rdma_wr; + + pr_debug("isert_unmap_cmd: %p\n", isert_cmd); + + if (wr->data.sg) { + pr_debug("isert_unmap_cmd: %p unmap_sg op\n", isert_cmd); + isert_unmap_data_buf(isert_conn, &wr->data); + } + + if (wr->send_wr) { + pr_debug("isert_unmap_cmd: %p free send_wr\n", isert_cmd); + kfree(wr->send_wr); + wr->send_wr = NULL; + } + + if (wr->ib_sge) { + pr_debug("isert_unmap_cmd: %p free ib_sge\n", isert_cmd); + kfree(wr->ib_sge); + wr->ib_sge = NULL; + } +} + +static void +isert_unreg_rdma(struct isert_cmd *isert_cmd, struct isert_conn *isert_conn) +{ + struct isert_rdma_wr *wr = &isert_cmd->rdma_wr; + LIST_HEAD(unmap_list); + + pr_debug("unreg_fastreg_cmd: %p\n", isert_cmd); + + if (wr->fr_desc) { + pr_debug("unreg_fastreg_cmd: %p free fr_desc %p\n", + isert_cmd, wr->fr_desc); + if (wr->fr_desc->ind & ISERT_PROTECTED) { + isert_unmap_data_buf(isert_conn, &wr->prot); + wr->fr_desc->ind &= ~ISERT_PROTECTED; + } + spin_lock_bh(&isert_conn->conn_lock); + list_add_tail(&wr->fr_desc->list, &isert_conn->conn_fr_pool); + spin_unlock_bh(&isert_conn->conn_lock); + wr->fr_desc = NULL; + } + + if (wr->data.sg) { + pr_debug("unreg_fastreg_cmd: %p unmap_sg op\n", isert_cmd); + isert_unmap_data_buf(isert_conn, &wr->data); + } + + wr->ib_sge = NULL; + wr->send_wr = NULL; +} + +static void +isert_put_cmd(struct isert_cmd *isert_cmd, bool comp_err) +{ + struct iscsi_cmd *cmd = isert_cmd->iscsi_cmd; + struct isert_conn *isert_conn = isert_cmd->conn; + struct iscsi_conn *conn = isert_conn->conn; + struct isert_device *device = isert_conn->conn_device; + + pr_debug("Entering isert_put_cmd: %p\n", isert_cmd); + + switch (cmd->iscsi_opcode) { + case ISCSI_OP_SCSI_CMD: + spin_lock_bh(&conn->cmd_lock); + if (!list_empty(&cmd->i_conn_node)) + list_del_init(&cmd->i_conn_node); + spin_unlock_bh(&conn->cmd_lock); + + if (cmd->data_direction == DMA_TO_DEVICE) { + iscsit_stop_dataout_timer(cmd); + /* + * Check for special case during comp_err where + * WRITE_PENDING has been handed off from core, + * but requires an extra target_put_sess_cmd() + * before transport_generic_free_cmd() below. + */ + if (comp_err && + cmd->se_cmd.t_state == TRANSPORT_WRITE_PENDING) { + struct se_cmd *se_cmd = &cmd->se_cmd; + + target_put_sess_cmd(se_cmd->se_sess, se_cmd); + } + } + + device->unreg_rdma_mem(isert_cmd, isert_conn); + transport_generic_free_cmd(&cmd->se_cmd, 0); + break; + case ISCSI_OP_SCSI_TMFUNC: + spin_lock_bh(&conn->cmd_lock); + if (!list_empty(&cmd->i_conn_node)) + list_del_init(&cmd->i_conn_node); + spin_unlock_bh(&conn->cmd_lock); + + transport_generic_free_cmd(&cmd->se_cmd, 0); + break; + case ISCSI_OP_REJECT: + case ISCSI_OP_NOOP_OUT: + case ISCSI_OP_TEXT: + spin_lock_bh(&conn->cmd_lock); + if (!list_empty(&cmd->i_conn_node)) + list_del_init(&cmd->i_conn_node); + spin_unlock_bh(&conn->cmd_lock); + + /* + * Handle special case for REJECT when iscsi_add_reject*() has + * overwritten the original iscsi_opcode assignment, and the + * associated cmd->se_cmd needs to be released. + */ + if (cmd->se_cmd.se_tfo != NULL) { + pr_debug("Calling transport_generic_free_cmd from" + " isert_put_cmd for 0x%02x\n", + cmd->iscsi_opcode); + transport_generic_free_cmd(&cmd->se_cmd, 0); + break; + } + /* + * Fall-through + */ + default: + iscsit_release_cmd(cmd); + break; + } +} + +static void +isert_unmap_tx_desc(struct iser_tx_desc *tx_desc, struct ib_device *ib_dev) +{ + if (tx_desc->dma_addr != 0) { + pr_debug("Calling ib_dma_unmap_single for tx_desc->dma_addr\n"); + ib_dma_unmap_single(ib_dev, tx_desc->dma_addr, + ISER_HEADERS_LEN, DMA_TO_DEVICE); + tx_desc->dma_addr = 0; + } +} + +static void +isert_completion_put(struct iser_tx_desc *tx_desc, struct isert_cmd *isert_cmd, + struct ib_device *ib_dev, bool comp_err) +{ + if (isert_cmd->pdu_buf_dma != 0) { + pr_debug("Calling ib_dma_unmap_single for isert_cmd->pdu_buf_dma\n"); + ib_dma_unmap_single(ib_dev, isert_cmd->pdu_buf_dma, + isert_cmd->pdu_buf_len, DMA_TO_DEVICE); + isert_cmd->pdu_buf_dma = 0; + } + + isert_unmap_tx_desc(tx_desc, ib_dev); + isert_put_cmd(isert_cmd, comp_err); +} + +static int +isert_check_pi_status(struct se_cmd *se_cmd, struct ib_mr *sig_mr) +{ + struct ib_mr_status mr_status; + int ret; + + ret = ib_check_mr_status(sig_mr, IB_MR_CHECK_SIG_STATUS, &mr_status); + if (ret) { + pr_err("ib_check_mr_status failed, ret %d\n", ret); + goto fail_mr_status; + } + + if (mr_status.fail_status & IB_MR_CHECK_SIG_STATUS) { + u64 sec_offset_err; + u32 block_size = se_cmd->se_dev->dev_attrib.block_size + 8; + + switch (mr_status.sig_err.err_type) { + case IB_SIG_BAD_GUARD: + se_cmd->pi_err = TCM_LOGICAL_BLOCK_GUARD_CHECK_FAILED; + break; + case IB_SIG_BAD_REFTAG: + se_cmd->pi_err = TCM_LOGICAL_BLOCK_REF_TAG_CHECK_FAILED; + break; + case IB_SIG_BAD_APPTAG: + se_cmd->pi_err = TCM_LOGICAL_BLOCK_APP_TAG_CHECK_FAILED; + break; + } + sec_offset_err = mr_status.sig_err.sig_err_offset; + do_div(sec_offset_err, block_size); + se_cmd->bad_sector = sec_offset_err + se_cmd->t_task_lba; + + pr_err("isert: PI error found type %d at sector 0x%llx " + "expected 0x%x vs actual 0x%x\n", + mr_status.sig_err.err_type, + (unsigned long long)se_cmd->bad_sector, + mr_status.sig_err.expected, + mr_status.sig_err.actual); + ret = 1; + } + +fail_mr_status: + return ret; +} + +static void +isert_completion_rdma_write(struct iser_tx_desc *tx_desc, + struct isert_cmd *isert_cmd) +{ + struct isert_rdma_wr *wr = &isert_cmd->rdma_wr; + struct iscsi_cmd *cmd = isert_cmd->iscsi_cmd; + struct se_cmd *se_cmd = &cmd->se_cmd; + struct isert_conn *isert_conn = isert_cmd->conn; + struct isert_device *device = isert_conn->conn_device; + int ret = 0; + + if (wr->fr_desc && wr->fr_desc->ind & ISERT_PROTECTED) { + ret = isert_check_pi_status(se_cmd, + wr->fr_desc->pi_ctx->sig_mr); + wr->fr_desc->ind &= ~ISERT_PROTECTED; + } + + device->unreg_rdma_mem(isert_cmd, isert_conn); + wr->send_wr_num = 0; + if (ret) + transport_send_check_condition_and_sense(se_cmd, + se_cmd->pi_err, 0); + else + isert_put_response(isert_conn->conn, cmd); +} + +static void +isert_completion_rdma_read(struct iser_tx_desc *tx_desc, + struct isert_cmd *isert_cmd) +{ + struct isert_rdma_wr *wr = &isert_cmd->rdma_wr; + struct iscsi_cmd *cmd = isert_cmd->iscsi_cmd; + struct se_cmd *se_cmd = &cmd->se_cmd; + struct isert_conn *isert_conn = isert_cmd->conn; + struct isert_device *device = isert_conn->conn_device; + int ret = 0; + + if (wr->fr_desc && wr->fr_desc->ind & ISERT_PROTECTED) { + ret = isert_check_pi_status(se_cmd, + wr->fr_desc->pi_ctx->sig_mr); + wr->fr_desc->ind &= ~ISERT_PROTECTED; + } + + iscsit_stop_dataout_timer(cmd); + device->unreg_rdma_mem(isert_cmd, isert_conn); + cmd->write_data_done = wr->data.len; + wr->send_wr_num = 0; + + pr_debug("Cmd: %p RDMA_READ comp calling execute_cmd\n", isert_cmd); + spin_lock_bh(&cmd->istate_lock); + cmd->cmd_flags |= ICF_GOT_LAST_DATAOUT; + cmd->i_state = ISTATE_RECEIVED_LAST_DATAOUT; + spin_unlock_bh(&cmd->istate_lock); + + if (ret) + transport_send_check_condition_and_sense(se_cmd, + se_cmd->pi_err, 0); + else + target_execute_cmd(se_cmd); +} + +static void +isert_do_control_comp(struct work_struct *work) +{ + struct isert_cmd *isert_cmd = container_of(work, + struct isert_cmd, comp_work); + struct isert_conn *isert_conn = isert_cmd->conn; + struct ib_device *ib_dev = isert_conn->conn_cm_id->device; + struct iscsi_cmd *cmd = isert_cmd->iscsi_cmd; + + switch (cmd->i_state) { + case ISTATE_SEND_TASKMGTRSP: + pr_debug("Calling iscsit_tmr_post_handler >>>>>>>>>>>>>>>>>\n"); + + atomic_dec(&isert_conn->post_send_buf_count); + iscsit_tmr_post_handler(cmd, cmd->conn); + + cmd->i_state = ISTATE_SENT_STATUS; + isert_completion_put(&isert_cmd->tx_desc, isert_cmd, ib_dev, false); + break; + case ISTATE_SEND_REJECT: + pr_debug("Got isert_do_control_comp ISTATE_SEND_REJECT: >>>\n"); + atomic_dec(&isert_conn->post_send_buf_count); + + cmd->i_state = ISTATE_SENT_STATUS; + isert_completion_put(&isert_cmd->tx_desc, isert_cmd, ib_dev, false); + break; + case ISTATE_SEND_LOGOUTRSP: + pr_debug("Calling iscsit_logout_post_handler >>>>>>>>>>>>>>\n"); + + atomic_dec(&isert_conn->post_send_buf_count); + iscsit_logout_post_handler(cmd, cmd->conn); + break; + case ISTATE_SEND_TEXTRSP: + atomic_dec(&isert_conn->post_send_buf_count); + cmd->i_state = ISTATE_SENT_STATUS; + isert_completion_put(&isert_cmd->tx_desc, isert_cmd, ib_dev, false); + break; + default: + pr_err("Unknown do_control_comp i_state %d\n", cmd->i_state); + dump_stack(); + break; + } +} + +static void +isert_response_completion(struct iser_tx_desc *tx_desc, + struct isert_cmd *isert_cmd, + struct isert_conn *isert_conn, + struct ib_device *ib_dev) +{ + struct iscsi_cmd *cmd = isert_cmd->iscsi_cmd; + struct isert_rdma_wr *wr = &isert_cmd->rdma_wr; + + if (cmd->i_state == ISTATE_SEND_TASKMGTRSP || + cmd->i_state == ISTATE_SEND_LOGOUTRSP || + cmd->i_state == ISTATE_SEND_REJECT || + cmd->i_state == ISTATE_SEND_TEXTRSP) { + isert_unmap_tx_desc(tx_desc, ib_dev); + + INIT_WORK(&isert_cmd->comp_work, isert_do_control_comp); + queue_work(isert_comp_wq, &isert_cmd->comp_work); + return; + } + + /** + * If send_wr_num is 0 this means that we got + * RDMA completion and we cleared it and we should + * simply decrement the response post. else the + * response is incorporated in send_wr_num, just + * sub it. + **/ + if (wr->send_wr_num) + atomic_sub(wr->send_wr_num, &isert_conn->post_send_buf_count); + else + atomic_dec(&isert_conn->post_send_buf_count); + + cmd->i_state = ISTATE_SENT_STATUS; + isert_completion_put(tx_desc, isert_cmd, ib_dev, false); +} + +static void +__isert_send_completion(struct iser_tx_desc *tx_desc, + struct isert_conn *isert_conn) +{ + struct ib_device *ib_dev = isert_conn->conn_cm_id->device; + struct isert_cmd *isert_cmd = tx_desc->isert_cmd; + struct isert_rdma_wr *wr; + + if (!isert_cmd) { + atomic_dec(&isert_conn->post_send_buf_count); + isert_unmap_tx_desc(tx_desc, ib_dev); + return; + } + wr = &isert_cmd->rdma_wr; + + switch (wr->iser_ib_op) { + case ISER_IB_RECV: + pr_err("isert_send_completion: Got ISER_IB_RECV\n"); + dump_stack(); + break; + case ISER_IB_SEND: + pr_debug("isert_send_completion: Got ISER_IB_SEND\n"); + isert_response_completion(tx_desc, isert_cmd, + isert_conn, ib_dev); + break; + case ISER_IB_RDMA_WRITE: + pr_debug("isert_send_completion: Got ISER_IB_RDMA_WRITE\n"); + atomic_sub(wr->send_wr_num, &isert_conn->post_send_buf_count); + isert_completion_rdma_write(tx_desc, isert_cmd); + break; + case ISER_IB_RDMA_READ: + pr_debug("isert_send_completion: Got ISER_IB_RDMA_READ:\n"); + + atomic_sub(wr->send_wr_num, &isert_conn->post_send_buf_count); + isert_completion_rdma_read(tx_desc, isert_cmd); + break; + default: + pr_err("Unknown wr->iser_ib_op: 0x%02x\n", wr->iser_ib_op); + dump_stack(); + break; + } +} + +static void +isert_send_completion(struct iser_tx_desc *tx_desc, + struct isert_conn *isert_conn) +{ + struct llist_node *llnode = tx_desc->comp_llnode_batch; + struct iser_tx_desc *t; + /* + * Drain coalesced completion llist starting from comp_llnode_batch + * setup in isert_init_send_wr(), and then complete trailing tx_desc. + */ + while (llnode) { + t = llist_entry(llnode, struct iser_tx_desc, comp_llnode); + llnode = llist_next(llnode); + __isert_send_completion(t, isert_conn); + } + __isert_send_completion(tx_desc, isert_conn); +} + +static void +isert_cq_drain_comp_llist(struct isert_conn *isert_conn, struct ib_device *ib_dev) +{ + struct llist_node *llnode; + struct isert_rdma_wr *wr; + struct iser_tx_desc *t; + + mutex_lock(&isert_conn->conn_mutex); + llnode = llist_del_all(&isert_conn->conn_comp_llist); + isert_conn->conn_comp_batch = 0; + mutex_unlock(&isert_conn->conn_mutex); + + while (llnode) { + t = llist_entry(llnode, struct iser_tx_desc, comp_llnode); + llnode = llist_next(llnode); + wr = &t->isert_cmd->rdma_wr; + + /** + * If send_wr_num is 0 this means that we got + * RDMA completion and we cleared it and we should + * simply decrement the response post. else the + * response is incorporated in send_wr_num, just + * sub it. + **/ + if (wr->send_wr_num) + atomic_sub(wr->send_wr_num, + &isert_conn->post_send_buf_count); + else + atomic_dec(&isert_conn->post_send_buf_count); + + isert_completion_put(t, t->isert_cmd, ib_dev, true); + } +} + +static void +isert_cq_tx_comp_err(struct iser_tx_desc *tx_desc, struct isert_conn *isert_conn) +{ + struct ib_device *ib_dev = isert_conn->conn_cm_id->device; + struct isert_cmd *isert_cmd = tx_desc->isert_cmd; + struct llist_node *llnode = tx_desc->comp_llnode_batch; + struct isert_rdma_wr *wr; + struct iser_tx_desc *t; + + while (llnode) { + t = llist_entry(llnode, struct iser_tx_desc, comp_llnode); + llnode = llist_next(llnode); + wr = &t->isert_cmd->rdma_wr; + + /** + * If send_wr_num is 0 this means that we got + * RDMA completion and we cleared it and we should + * simply decrement the response post. else the + * response is incorporated in send_wr_num, just + * sub it. + **/ + if (wr->send_wr_num) + atomic_sub(wr->send_wr_num, + &isert_conn->post_send_buf_count); + else + atomic_dec(&isert_conn->post_send_buf_count); + + isert_completion_put(t, t->isert_cmd, ib_dev, true); + } + tx_desc->comp_llnode_batch = NULL; + + if (!isert_cmd) + isert_unmap_tx_desc(tx_desc, ib_dev); + else + isert_completion_put(tx_desc, isert_cmd, ib_dev, true); +} + +static void +isert_cq_rx_comp_err(struct isert_conn *isert_conn) +{ + struct ib_device *ib_dev = isert_conn->conn_cm_id->device; + struct iscsi_conn *conn = isert_conn->conn; + + if (isert_conn->post_recv_buf_count) + return; + + isert_cq_drain_comp_llist(isert_conn, ib_dev); + + if (conn->sess) { + target_sess_cmd_list_set_waiting(conn->sess->se_sess); + target_wait_for_sess_cmds(conn->sess->se_sess); + } + + while (atomic_read(&isert_conn->post_send_buf_count)) + msleep(3000); + + mutex_lock(&isert_conn->conn_mutex); + isert_conn->state = ISER_CONN_DOWN; + mutex_unlock(&isert_conn->conn_mutex); + + iscsit_cause_connection_reinstatement(isert_conn->conn, 0); + + complete(&isert_conn->conn_wait_comp_err); +} + +static void +isert_cq_tx_work(struct work_struct *work) +{ + struct isert_cq_desc *cq_desc = container_of(work, + struct isert_cq_desc, cq_tx_work); + struct isert_device *device = cq_desc->device; + int cq_index = cq_desc->cq_index; + struct ib_cq *tx_cq = device->dev_tx_cq[cq_index]; + struct isert_conn *isert_conn; + struct iser_tx_desc *tx_desc; + struct ib_wc wc; + + while (ib_poll_cq(tx_cq, 1, &wc) == 1) { + tx_desc = (struct iser_tx_desc *)(unsigned long)wc.wr_id; + isert_conn = wc.qp->qp_context; + + if (wc.status == IB_WC_SUCCESS) { + isert_send_completion(tx_desc, isert_conn); + } else { + pr_debug("TX wc.status != IB_WC_SUCCESS >>>>>>>>>>>>>>\n"); + pr_debug("TX wc.status: 0x%08x\n", wc.status); + pr_debug("TX wc.vendor_err: 0x%08x\n", wc.vendor_err); + + if (wc.wr_id != ISER_FASTREG_LI_WRID) { + if (tx_desc->llnode_active) + continue; + + atomic_dec(&isert_conn->post_send_buf_count); + isert_cq_tx_comp_err(tx_desc, isert_conn); + } + } + } + + ib_req_notify_cq(tx_cq, IB_CQ_NEXT_COMP); +} + +static void +isert_cq_tx_callback(struct ib_cq *cq, void *context) +{ + struct isert_cq_desc *cq_desc = (struct isert_cq_desc *)context; + + queue_work(isert_comp_wq, &cq_desc->cq_tx_work); +} + +static void +isert_cq_rx_work(struct work_struct *work) +{ + struct isert_cq_desc *cq_desc = container_of(work, + struct isert_cq_desc, cq_rx_work); + struct isert_device *device = cq_desc->device; + int cq_index = cq_desc->cq_index; + struct ib_cq *rx_cq = device->dev_rx_cq[cq_index]; + struct isert_conn *isert_conn; + struct iser_rx_desc *rx_desc; + struct ib_wc wc; + unsigned long xfer_len; + + while (ib_poll_cq(rx_cq, 1, &wc) == 1) { + rx_desc = (struct iser_rx_desc *)(unsigned long)wc.wr_id; + isert_conn = wc.qp->qp_context; + + if (wc.status == IB_WC_SUCCESS) { + xfer_len = (unsigned long)wc.byte_len; + isert_rx_completion(rx_desc, isert_conn, xfer_len); + } else { + pr_debug("RX wc.status != IB_WC_SUCCESS >>>>>>>>>>>>>>\n"); + if (wc.status != IB_WC_WR_FLUSH_ERR) { + pr_debug("RX wc.status: 0x%08x\n", wc.status); + pr_debug("RX wc.vendor_err: 0x%08x\n", + wc.vendor_err); + } + isert_conn->post_recv_buf_count--; + isert_cq_rx_comp_err(isert_conn); + } + } + + ib_req_notify_cq(rx_cq, IB_CQ_NEXT_COMP); +} + +static void +isert_cq_rx_callback(struct ib_cq *cq, void *context) +{ + struct isert_cq_desc *cq_desc = (struct isert_cq_desc *)context; + + queue_work(isert_rx_wq, &cq_desc->cq_rx_work); +} + +static int +isert_post_response(struct isert_conn *isert_conn, struct isert_cmd *isert_cmd) +{ + struct ib_send_wr *wr_failed; + int ret; + + atomic_inc(&isert_conn->post_send_buf_count); + + ret = ib_post_send(isert_conn->conn_qp, &isert_cmd->tx_desc.send_wr, + &wr_failed); + if (ret) { + pr_err("ib_post_send failed with %d\n", ret); + atomic_dec(&isert_conn->post_send_buf_count); + return ret; + } + return ret; +} + +static int +isert_put_response(struct iscsi_conn *conn, struct iscsi_cmd *cmd) +{ + struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd); + struct isert_conn *isert_conn = (struct isert_conn *)conn->context; + struct ib_send_wr *send_wr = &isert_cmd->tx_desc.send_wr; + struct iscsi_scsi_rsp *hdr = (struct iscsi_scsi_rsp *) + &isert_cmd->tx_desc.iscsi_header; + + isert_create_send_desc(isert_conn, isert_cmd, &isert_cmd->tx_desc); + iscsit_build_rsp_pdu(cmd, conn, true, hdr); + isert_init_tx_hdrs(isert_conn, &isert_cmd->tx_desc); + /* + * Attach SENSE DATA payload to iSCSI Response PDU + */ + if (cmd->se_cmd.sense_buffer && + ((cmd->se_cmd.se_cmd_flags & SCF_TRANSPORT_TASK_SENSE) || + (cmd->se_cmd.se_cmd_flags & SCF_EMULATED_TASK_SENSE))) { + struct ib_device *ib_dev = isert_conn->conn_cm_id->device; + struct ib_sge *tx_dsg = &isert_cmd->tx_desc.tx_sg[1]; + u32 padding, pdu_len; + + put_unaligned_be16(cmd->se_cmd.scsi_sense_length, + cmd->sense_buffer); + cmd->se_cmd.scsi_sense_length += sizeof(__be16); + + padding = -(cmd->se_cmd.scsi_sense_length) & 3; + hton24(hdr->dlength, (u32)cmd->se_cmd.scsi_sense_length); + pdu_len = cmd->se_cmd.scsi_sense_length + padding; + + isert_cmd->pdu_buf_dma = ib_dma_map_single(ib_dev, + (void *)cmd->sense_buffer, pdu_len, + DMA_TO_DEVICE); + + isert_cmd->pdu_buf_len = pdu_len; + tx_dsg->addr = isert_cmd->pdu_buf_dma; + tx_dsg->length = pdu_len; + tx_dsg->lkey = isert_conn->conn_mr->lkey; + isert_cmd->tx_desc.num_sge = 2; + } + + isert_init_send_wr(isert_conn, isert_cmd, send_wr, false); + + pr_debug("Posting SCSI Response IB_WR_SEND >>>>>>>>>>>>>>>>>>>>>>\n"); + + return isert_post_response(isert_conn, isert_cmd); +} + +static void +isert_aborted_task(struct iscsi_conn *conn, struct iscsi_cmd *cmd) +{ + struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd); + struct isert_conn *isert_conn = (struct isert_conn *)conn->context; + struct isert_device *device = isert_conn->conn_device; + + spin_lock_bh(&conn->cmd_lock); + if (!list_empty(&cmd->i_conn_node)) + list_del_init(&cmd->i_conn_node); + spin_unlock_bh(&conn->cmd_lock); + + if (cmd->data_direction == DMA_TO_DEVICE) + iscsit_stop_dataout_timer(cmd); + + device->unreg_rdma_mem(isert_cmd, isert_conn); +} + +static enum target_prot_op +isert_get_sup_prot_ops(struct iscsi_conn *conn) +{ + struct isert_conn *isert_conn = (struct isert_conn *)conn->context; + struct isert_device *device = isert_conn->conn_device; + + if (device->pi_capable) + return TARGET_PROT_ALL; + + return TARGET_PROT_NORMAL; +} + +static int +isert_put_nopin(struct iscsi_cmd *cmd, struct iscsi_conn *conn, + bool nopout_response) +{ + struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd); + struct isert_conn *isert_conn = (struct isert_conn *)conn->context; + struct ib_send_wr *send_wr = &isert_cmd->tx_desc.send_wr; + + isert_create_send_desc(isert_conn, isert_cmd, &isert_cmd->tx_desc); + iscsit_build_nopin_rsp(cmd, conn, (struct iscsi_nopin *) + &isert_cmd->tx_desc.iscsi_header, + nopout_response); + isert_init_tx_hdrs(isert_conn, &isert_cmd->tx_desc); + isert_init_send_wr(isert_conn, isert_cmd, send_wr, false); + + pr_debug("Posting NOPIN Response IB_WR_SEND >>>>>>>>>>>>>>>>>>>>>>\n"); + + return isert_post_response(isert_conn, isert_cmd); +} + +static int +isert_put_logout_rsp(struct iscsi_cmd *cmd, struct iscsi_conn *conn) +{ + struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd); + struct isert_conn *isert_conn = (struct isert_conn *)conn->context; + struct ib_send_wr *send_wr = &isert_cmd->tx_desc.send_wr; + + isert_create_send_desc(isert_conn, isert_cmd, &isert_cmd->tx_desc); + iscsit_build_logout_rsp(cmd, conn, (struct iscsi_logout_rsp *) + &isert_cmd->tx_desc.iscsi_header); + isert_init_tx_hdrs(isert_conn, &isert_cmd->tx_desc); + isert_init_send_wr(isert_conn, isert_cmd, send_wr, false); + + pr_debug("Posting Logout Response IB_WR_SEND >>>>>>>>>>>>>>>>>>>>>>\n"); + + return isert_post_response(isert_conn, isert_cmd); +} + +static int +isert_put_tm_rsp(struct iscsi_cmd *cmd, struct iscsi_conn *conn) +{ + struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd); + struct isert_conn *isert_conn = (struct isert_conn *)conn->context; + struct ib_send_wr *send_wr = &isert_cmd->tx_desc.send_wr; + + isert_create_send_desc(isert_conn, isert_cmd, &isert_cmd->tx_desc); + iscsit_build_task_mgt_rsp(cmd, conn, (struct iscsi_tm_rsp *) + &isert_cmd->tx_desc.iscsi_header); + isert_init_tx_hdrs(isert_conn, &isert_cmd->tx_desc); + isert_init_send_wr(isert_conn, isert_cmd, send_wr, false); + + pr_debug("Posting Task Management Response IB_WR_SEND >>>>>>>>>>>>>>>>>>>>>>\n"); + + return isert_post_response(isert_conn, isert_cmd); +} + +static int +isert_put_reject(struct iscsi_cmd *cmd, struct iscsi_conn *conn) +{ + struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd); + struct isert_conn *isert_conn = (struct isert_conn *)conn->context; + struct ib_send_wr *send_wr = &isert_cmd->tx_desc.send_wr; + struct ib_device *ib_dev = isert_conn->conn_cm_id->device; + struct ib_sge *tx_dsg = &isert_cmd->tx_desc.tx_sg[1]; + struct iscsi_reject *hdr = + (struct iscsi_reject *)&isert_cmd->tx_desc.iscsi_header; + + isert_create_send_desc(isert_conn, isert_cmd, &isert_cmd->tx_desc); + iscsit_build_reject(cmd, conn, hdr); + isert_init_tx_hdrs(isert_conn, &isert_cmd->tx_desc); + + hton24(hdr->dlength, ISCSI_HDR_LEN); + isert_cmd->pdu_buf_dma = ib_dma_map_single(ib_dev, + (void *)cmd->buf_ptr, ISCSI_HDR_LEN, + DMA_TO_DEVICE); + isert_cmd->pdu_buf_len = ISCSI_HDR_LEN; + tx_dsg->addr = isert_cmd->pdu_buf_dma; + tx_dsg->length = ISCSI_HDR_LEN; + tx_dsg->lkey = isert_conn->conn_mr->lkey; + isert_cmd->tx_desc.num_sge = 2; + + isert_init_send_wr(isert_conn, isert_cmd, send_wr, false); + + pr_debug("Posting Reject IB_WR_SEND >>>>>>>>>>>>>>>>>>>>>>\n"); + + return isert_post_response(isert_conn, isert_cmd); +} + +static int +isert_put_text_rsp(struct iscsi_cmd *cmd, struct iscsi_conn *conn) +{ + struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd); + struct isert_conn *isert_conn = (struct isert_conn *)conn->context; + struct ib_send_wr *send_wr = &isert_cmd->tx_desc.send_wr; + struct iscsi_text_rsp *hdr = + (struct iscsi_text_rsp *)&isert_cmd->tx_desc.iscsi_header; + u32 txt_rsp_len; + int rc; + + isert_create_send_desc(isert_conn, isert_cmd, &isert_cmd->tx_desc); + rc = iscsit_build_text_rsp(cmd, conn, hdr, ISCSI_INFINIBAND); + if (rc < 0) + return rc; + + txt_rsp_len = rc; + isert_init_tx_hdrs(isert_conn, &isert_cmd->tx_desc); + + if (txt_rsp_len) { + struct ib_device *ib_dev = isert_conn->conn_cm_id->device; + struct ib_sge *tx_dsg = &isert_cmd->tx_desc.tx_sg[1]; + void *txt_rsp_buf = cmd->buf_ptr; + + isert_cmd->pdu_buf_dma = ib_dma_map_single(ib_dev, + txt_rsp_buf, txt_rsp_len, DMA_TO_DEVICE); + + isert_cmd->pdu_buf_len = txt_rsp_len; + tx_dsg->addr = isert_cmd->pdu_buf_dma; + tx_dsg->length = txt_rsp_len; + tx_dsg->lkey = isert_conn->conn_mr->lkey; + isert_cmd->tx_desc.num_sge = 2; + } + isert_init_send_wr(isert_conn, isert_cmd, send_wr, false); + + pr_debug("Posting Text Response IB_WR_SEND >>>>>>>>>>>>>>>>>>>>>>\n"); + + return isert_post_response(isert_conn, isert_cmd); +} + +static int +isert_build_rdma_wr(struct isert_conn *isert_conn, struct isert_cmd *isert_cmd, + struct ib_sge *ib_sge, struct ib_send_wr *send_wr, + u32 data_left, u32 offset) +{ + struct iscsi_cmd *cmd = isert_cmd->iscsi_cmd; + struct scatterlist *sg_start, *tmp_sg; + struct ib_device *ib_dev = isert_conn->conn_cm_id->device; + u32 sg_off, page_off; + int i = 0, sg_nents; + + sg_off = offset / PAGE_SIZE; + sg_start = &cmd->se_cmd.t_data_sg[sg_off]; + sg_nents = min(cmd->se_cmd.t_data_nents - sg_off, isert_conn->max_sge); + page_off = offset % PAGE_SIZE; + + send_wr->sg_list = ib_sge; + send_wr->num_sge = sg_nents; + send_wr->wr_id = (unsigned long)&isert_cmd->tx_desc; + /* + * Perform mapping of TCM scatterlist memory ib_sge dma_addr. + */ + for_each_sg(sg_start, tmp_sg, sg_nents, i) { + pr_debug("ISER RDMA from SGL dma_addr: 0x%16llx dma_len: %u, page_off: %u\n", + (unsigned long long)tmp_sg->dma_address, + tmp_sg->length, page_off); + + ib_sge->addr = ib_sg_dma_address(ib_dev, tmp_sg) + page_off; + ib_sge->length = min_t(u32, data_left, + ib_sg_dma_len(ib_dev, tmp_sg) - page_off); + ib_sge->lkey = isert_conn->conn_mr->lkey; + + pr_debug("RDMA ib_sge: addr: 0x%16llx length: %u lkey: %08x\n", + ib_sge->addr, ib_sge->length, ib_sge->lkey); + page_off = 0; + data_left -= ib_sge->length; + ib_sge++; + pr_debug("Incrementing ib_sge pointer to %p\n", ib_sge); + } + + pr_debug("Set outgoing sg_list: %p num_sg: %u from TCM SGLs\n", + send_wr->sg_list, send_wr->num_sge); + + return sg_nents; +} + +static int +isert_map_rdma(struct iscsi_conn *conn, struct iscsi_cmd *cmd, + struct isert_rdma_wr *wr) +{ + struct se_cmd *se_cmd = &cmd->se_cmd; + struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd); + struct isert_conn *isert_conn = (struct isert_conn *)conn->context; + struct isert_data_buf *data = &wr->data; + struct ib_send_wr *send_wr; + struct ib_sge *ib_sge; + u32 offset, data_len, data_left, rdma_write_max, va_offset = 0; + int ret = 0, i, ib_sge_cnt; + + isert_cmd->tx_desc.isert_cmd = isert_cmd; + + offset = wr->iser_ib_op == ISER_IB_RDMA_READ ? cmd->write_data_done : 0; + ret = isert_map_data_buf(isert_conn, isert_cmd, se_cmd->t_data_sg, + se_cmd->t_data_nents, se_cmd->data_length, + offset, wr->iser_ib_op, &wr->data); + if (ret) + return ret; + + data_left = data->len; + offset = data->offset; + + ib_sge = kzalloc(sizeof(struct ib_sge) * data->nents, GFP_KERNEL); + if (!ib_sge) { + pr_warn("Unable to allocate ib_sge\n"); + ret = -ENOMEM; + goto unmap_cmd; + } + wr->ib_sge = ib_sge; + + wr->send_wr_num = DIV_ROUND_UP(data->nents, isert_conn->max_sge); + wr->send_wr = kzalloc(sizeof(struct ib_send_wr) * wr->send_wr_num, + GFP_KERNEL); + if (!wr->send_wr) { + pr_debug("Unable to allocate wr->send_wr\n"); + ret = -ENOMEM; + goto unmap_cmd; + } + + wr->isert_cmd = isert_cmd; + rdma_write_max = isert_conn->max_sge * PAGE_SIZE; + + for (i = 0; i < wr->send_wr_num; i++) { + send_wr = &isert_cmd->rdma_wr.send_wr[i]; + data_len = min(data_left, rdma_write_max); + + send_wr->send_flags = 0; + if (wr->iser_ib_op == ISER_IB_RDMA_WRITE) { + send_wr->opcode = IB_WR_RDMA_WRITE; + send_wr->wr.rdma.remote_addr = isert_cmd->read_va + offset; + send_wr->wr.rdma.rkey = isert_cmd->read_stag; + if (i + 1 == wr->send_wr_num) + send_wr->next = &isert_cmd->tx_desc.send_wr; + else + send_wr->next = &wr->send_wr[i + 1]; + } else { + send_wr->opcode = IB_WR_RDMA_READ; + send_wr->wr.rdma.remote_addr = isert_cmd->write_va + va_offset; + send_wr->wr.rdma.rkey = isert_cmd->write_stag; + if (i + 1 == wr->send_wr_num) + send_wr->send_flags = IB_SEND_SIGNALED; + else + send_wr->next = &wr->send_wr[i + 1]; + } + + ib_sge_cnt = isert_build_rdma_wr(isert_conn, isert_cmd, ib_sge, + send_wr, data_len, offset); + ib_sge += ib_sge_cnt; + + offset += data_len; + va_offset += data_len; + data_left -= data_len; + } + + return 0; +unmap_cmd: + isert_unmap_data_buf(isert_conn, data); + + return ret; +} + +static int +isert_map_fr_pagelist(struct ib_device *ib_dev, + struct scatterlist *sg_start, int sg_nents, u64 *fr_pl) +{ + u64 start_addr, end_addr, page, chunk_start = 0; + struct scatterlist *tmp_sg; + int i = 0, new_chunk, last_ent, n_pages; + + n_pages = 0; + new_chunk = 1; + last_ent = sg_nents - 1; + for_each_sg(sg_start, tmp_sg, sg_nents, i) { + start_addr = ib_sg_dma_address(ib_dev, tmp_sg); + if (new_chunk) + chunk_start = start_addr; + end_addr = start_addr + ib_sg_dma_len(ib_dev, tmp_sg); + + pr_debug("SGL[%d] dma_addr: 0x%16llx len: %u\n", + i, (unsigned long long)tmp_sg->dma_address, + tmp_sg->length); + + if ((end_addr & ~PAGE_MASK) && i < last_ent) { + new_chunk = 0; + continue; + } + new_chunk = 1; + + page = chunk_start & PAGE_MASK; + do { + fr_pl[n_pages++] = page; + pr_debug("Mapped page_list[%d] page_addr: 0x%16llx\n", + n_pages - 1, page); + page += PAGE_SIZE; + } while (page < end_addr); + } + + return n_pages; +} + +static int +isert_fast_reg_mr(struct isert_conn *isert_conn, + struct fast_reg_descriptor *fr_desc, + struct isert_data_buf *mem, + enum isert_indicator ind, + struct ib_sge *sge) +{ + struct ib_device *ib_dev = isert_conn->conn_cm_id->device; + struct ib_mr *mr; + struct ib_fast_reg_page_list *frpl; + struct ib_send_wr fr_wr, inv_wr; + struct ib_send_wr *bad_wr, *wr = NULL; + int ret, pagelist_len; + u32 page_off; + u8 key; + + if (mem->dma_nents == 1) { + sge->lkey = isert_conn->conn_mr->lkey; + sge->addr = ib_sg_dma_address(ib_dev, &mem->sg[0]); + sge->length = ib_sg_dma_len(ib_dev, &mem->sg[0]); + pr_debug("%s:%d sge: addr: 0x%llx length: %u lkey: %x\n", + __func__, __LINE__, sge->addr, sge->length, + sge->lkey); + return 0; + } + + if (ind == ISERT_DATA_KEY_VALID) { + /* Registering data buffer */ + mr = fr_desc->data_mr; + frpl = fr_desc->data_frpl; + } else { + /* Registering protection buffer */ + mr = fr_desc->pi_ctx->prot_mr; + frpl = fr_desc->pi_ctx->prot_frpl; + } + + page_off = mem->offset % PAGE_SIZE; + + pr_debug("Use fr_desc %p sg_nents %d offset %u\n", + fr_desc, mem->nents, mem->offset); + + pagelist_len = isert_map_fr_pagelist(ib_dev, mem->sg, mem->nents, + &frpl->page_list[0]); + + if (!(fr_desc->ind & ISERT_DATA_KEY_VALID)) { + memset(&inv_wr, 0, sizeof(inv_wr)); + inv_wr.wr_id = ISER_FASTREG_LI_WRID; + inv_wr.opcode = IB_WR_LOCAL_INV; + inv_wr.ex.invalidate_rkey = mr->rkey; + wr = &inv_wr; + /* Bump the key */ + key = (u8)(mr->rkey & 0x000000FF); + ib_update_fast_reg_key(mr, ++key); + } + + /* Prepare FASTREG WR */ + memset(&fr_wr, 0, sizeof(fr_wr)); + fr_wr.wr_id = ISER_FASTREG_LI_WRID; + fr_wr.opcode = IB_WR_FAST_REG_MR; + fr_wr.wr.fast_reg.iova_start = frpl->page_list[0] + page_off; + fr_wr.wr.fast_reg.page_list = frpl; + fr_wr.wr.fast_reg.page_list_len = pagelist_len; + fr_wr.wr.fast_reg.page_shift = PAGE_SHIFT; + fr_wr.wr.fast_reg.length = mem->len; + fr_wr.wr.fast_reg.rkey = mr->rkey; + fr_wr.wr.fast_reg.access_flags = IB_ACCESS_LOCAL_WRITE; + + if (!wr) + wr = &fr_wr; + else + wr->next = &fr_wr; + + ret = ib_post_send(isert_conn->conn_qp, wr, &bad_wr); + if (ret) { + pr_err("fast registration failed, ret:%d\n", ret); + return ret; + } + fr_desc->ind &= ~ind; + + sge->lkey = mr->lkey; + sge->addr = frpl->page_list[0] + page_off; + sge->length = mem->len; + + pr_debug("%s:%d sge: addr: 0x%llx length: %u lkey: %x\n", + __func__, __LINE__, sge->addr, sge->length, + sge->lkey); + + return ret; +} + +static inline void +isert_set_dif_domain(struct se_cmd *se_cmd, struct ib_sig_attrs *sig_attrs, + struct ib_sig_domain *domain) +{ + domain->sig_type = IB_SIG_TYPE_T10_DIF; + domain->sig.dif.bg_type = IB_T10DIF_CRC; + domain->sig.dif.pi_interval = se_cmd->se_dev->dev_attrib.block_size; + domain->sig.dif.ref_tag = se_cmd->reftag_seed; + /* + * At the moment we hard code those, but if in the future + * the target core would like to use it, we will take it + * from se_cmd. + */ + domain->sig.dif.apptag_check_mask = 0xffff; + domain->sig.dif.app_escape = true; + domain->sig.dif.ref_escape = true; + if (se_cmd->prot_type == TARGET_DIF_TYPE1_PROT || + se_cmd->prot_type == TARGET_DIF_TYPE2_PROT) + domain->sig.dif.ref_remap = true; +}; + +static int +isert_set_sig_attrs(struct se_cmd *se_cmd, struct ib_sig_attrs *sig_attrs) +{ + switch (se_cmd->prot_op) { + case TARGET_PROT_DIN_INSERT: + case TARGET_PROT_DOUT_STRIP: + sig_attrs->mem.sig_type = IB_SIG_TYPE_NONE; + isert_set_dif_domain(se_cmd, sig_attrs, &sig_attrs->wire); + break; + case TARGET_PROT_DOUT_INSERT: + case TARGET_PROT_DIN_STRIP: + sig_attrs->wire.sig_type = IB_SIG_TYPE_NONE; + isert_set_dif_domain(se_cmd, sig_attrs, &sig_attrs->mem); + break; + case TARGET_PROT_DIN_PASS: + case TARGET_PROT_DOUT_PASS: + isert_set_dif_domain(se_cmd, sig_attrs, &sig_attrs->wire); + isert_set_dif_domain(se_cmd, sig_attrs, &sig_attrs->mem); + break; + default: + pr_err("Unsupported PI operation %d\n", se_cmd->prot_op); + return -EINVAL; + } + + return 0; +} + +static inline u8 +isert_set_prot_checks(u8 prot_checks) +{ + return (prot_checks & TARGET_DIF_CHECK_GUARD ? 0xc0 : 0) | + (prot_checks & TARGET_DIF_CHECK_REFTAG ? 0x30 : 0) | + (prot_checks & TARGET_DIF_CHECK_REFTAG ? 0x0f : 0); +} + +static int +isert_reg_sig_mr(struct isert_conn *isert_conn, struct se_cmd *se_cmd, + struct fast_reg_descriptor *fr_desc, + struct ib_sge *data_sge, struct ib_sge *prot_sge, + struct ib_sge *sig_sge) +{ + struct ib_send_wr sig_wr, inv_wr; + struct ib_send_wr *bad_wr, *wr = NULL; + struct pi_context *pi_ctx = fr_desc->pi_ctx; + struct ib_sig_attrs sig_attrs; + int ret; + u32 key; + + memset(&sig_attrs, 0, sizeof(sig_attrs)); + ret = isert_set_sig_attrs(se_cmd, &sig_attrs); + if (ret) + goto err; + + sig_attrs.check_mask = isert_set_prot_checks(se_cmd->prot_checks); + + if (!(fr_desc->ind & ISERT_SIG_KEY_VALID)) { + memset(&inv_wr, 0, sizeof(inv_wr)); + inv_wr.opcode = IB_WR_LOCAL_INV; + inv_wr.wr_id = ISER_FASTREG_LI_WRID; + inv_wr.ex.invalidate_rkey = pi_ctx->sig_mr->rkey; + wr = &inv_wr; + /* Bump the key */ + key = (u8)(pi_ctx->sig_mr->rkey & 0x000000FF); + ib_update_fast_reg_key(pi_ctx->sig_mr, ++key); + } + + memset(&sig_wr, 0, sizeof(sig_wr)); + sig_wr.opcode = IB_WR_REG_SIG_MR; + sig_wr.wr_id = ISER_FASTREG_LI_WRID; + sig_wr.sg_list = data_sge; + sig_wr.num_sge = 1; + sig_wr.wr.sig_handover.access_flags = IB_ACCESS_LOCAL_WRITE; + sig_wr.wr.sig_handover.sig_attrs = &sig_attrs; + sig_wr.wr.sig_handover.sig_mr = pi_ctx->sig_mr; + if (se_cmd->t_prot_sg) + sig_wr.wr.sig_handover.prot = prot_sge; + + if (!wr) + wr = &sig_wr; + else + wr->next = &sig_wr; + + ret = ib_post_send(isert_conn->conn_qp, wr, &bad_wr); + if (ret) { + pr_err("fast registration failed, ret:%d\n", ret); + goto err; + } + fr_desc->ind &= ~ISERT_SIG_KEY_VALID; + + sig_sge->lkey = pi_ctx->sig_mr->lkey; + sig_sge->addr = 0; + sig_sge->length = se_cmd->data_length; + if (se_cmd->prot_op != TARGET_PROT_DIN_STRIP && + se_cmd->prot_op != TARGET_PROT_DOUT_INSERT) + /* + * We have protection guards on the wire + * so we need to set a larget transfer + */ + sig_sge->length += se_cmd->prot_length; + + pr_debug("sig_sge: addr: 0x%llx length: %u lkey: %x\n", + sig_sge->addr, sig_sge->length, + sig_sge->lkey); +err: + return ret; +} + +static int +isert_reg_rdma(struct iscsi_conn *conn, struct iscsi_cmd *cmd, + struct isert_rdma_wr *wr) +{ + struct se_cmd *se_cmd = &cmd->se_cmd; + struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd); + struct isert_conn *isert_conn = conn->context; + struct ib_sge data_sge; + struct ib_send_wr *send_wr; + struct fast_reg_descriptor *fr_desc = NULL; + u32 offset; + int ret = 0; + unsigned long flags; + + isert_cmd->tx_desc.isert_cmd = isert_cmd; + + offset = wr->iser_ib_op == ISER_IB_RDMA_READ ? cmd->write_data_done : 0; + ret = isert_map_data_buf(isert_conn, isert_cmd, se_cmd->t_data_sg, + se_cmd->t_data_nents, se_cmd->data_length, + offset, wr->iser_ib_op, &wr->data); + if (ret) + return ret; + + if (wr->data.dma_nents != 1 || + se_cmd->prot_op != TARGET_PROT_NORMAL) { + spin_lock_irqsave(&isert_conn->conn_lock, flags); + fr_desc = list_first_entry(&isert_conn->conn_fr_pool, + struct fast_reg_descriptor, list); + list_del(&fr_desc->list); + spin_unlock_irqrestore(&isert_conn->conn_lock, flags); + wr->fr_desc = fr_desc; + } + + ret = isert_fast_reg_mr(isert_conn, fr_desc, &wr->data, + ISERT_DATA_KEY_VALID, &data_sge); + if (ret) + goto unmap_cmd; + + if (se_cmd->prot_op != TARGET_PROT_NORMAL) { + struct ib_sge prot_sge, sig_sge; + + if (se_cmd->t_prot_sg) { + ret = isert_map_data_buf(isert_conn, isert_cmd, + se_cmd->t_prot_sg, + se_cmd->t_prot_nents, + se_cmd->prot_length, + 0, wr->iser_ib_op, &wr->prot); + if (ret) + goto unmap_cmd; + + ret = isert_fast_reg_mr(isert_conn, fr_desc, &wr->prot, + ISERT_PROT_KEY_VALID, &prot_sge); + if (ret) + goto unmap_prot_cmd; + } + + ret = isert_reg_sig_mr(isert_conn, se_cmd, fr_desc, + &data_sge, &prot_sge, &sig_sge); + if (ret) + goto unmap_prot_cmd; + + fr_desc->ind |= ISERT_PROTECTED; + memcpy(&wr->s_ib_sge, &sig_sge, sizeof(sig_sge)); + } else + memcpy(&wr->s_ib_sge, &data_sge, sizeof(data_sge)); + + wr->ib_sge = &wr->s_ib_sge; + wr->send_wr_num = 1; + memset(&wr->s_send_wr, 0, sizeof(*send_wr)); + wr->send_wr = &wr->s_send_wr; + wr->isert_cmd = isert_cmd; + + send_wr = &isert_cmd->rdma_wr.s_send_wr; + send_wr->sg_list = &wr->s_ib_sge; + send_wr->num_sge = 1; + send_wr->wr_id = (unsigned long)&isert_cmd->tx_desc; + if (wr->iser_ib_op == ISER_IB_RDMA_WRITE) { + send_wr->opcode = IB_WR_RDMA_WRITE; + send_wr->wr.rdma.remote_addr = isert_cmd->read_va; + send_wr->wr.rdma.rkey = isert_cmd->read_stag; + send_wr->send_flags = se_cmd->prot_op == TARGET_PROT_NORMAL ? + 0 : IB_SEND_SIGNALED; + } else { + send_wr->opcode = IB_WR_RDMA_READ; + send_wr->wr.rdma.remote_addr = isert_cmd->write_va; + send_wr->wr.rdma.rkey = isert_cmd->write_stag; + send_wr->send_flags = IB_SEND_SIGNALED; + } + + return 0; +unmap_prot_cmd: + if (se_cmd->t_prot_sg) + isert_unmap_data_buf(isert_conn, &wr->prot); +unmap_cmd: + if (fr_desc) { + spin_lock_irqsave(&isert_conn->conn_lock, flags); + list_add_tail(&fr_desc->list, &isert_conn->conn_fr_pool); + spin_unlock_irqrestore(&isert_conn->conn_lock, flags); + } + isert_unmap_data_buf(isert_conn, &wr->data); + + return ret; +} + +static int +isert_put_datain(struct iscsi_conn *conn, struct iscsi_cmd *cmd) +{ + struct se_cmd *se_cmd = &cmd->se_cmd; + struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd); + struct isert_rdma_wr *wr = &isert_cmd->rdma_wr; + struct isert_conn *isert_conn = (struct isert_conn *)conn->context; + struct isert_device *device = isert_conn->conn_device; + struct ib_send_wr *wr_failed; + int rc; + + pr_debug("Cmd: %p RDMA_WRITE data_length: %u\n", + isert_cmd, se_cmd->data_length); + wr->iser_ib_op = ISER_IB_RDMA_WRITE; + rc = device->reg_rdma_mem(conn, cmd, wr); + if (rc) { + pr_err("Cmd: %p failed to prepare RDMA res\n", isert_cmd); + return rc; + } + + if (se_cmd->prot_op == TARGET_PROT_NORMAL) { + /* + * Build isert_conn->tx_desc for iSCSI response PDU and attach + */ + isert_create_send_desc(isert_conn, isert_cmd, + &isert_cmd->tx_desc); + iscsit_build_rsp_pdu(cmd, conn, true, (struct iscsi_scsi_rsp *) + &isert_cmd->tx_desc.iscsi_header); + isert_init_tx_hdrs(isert_conn, &isert_cmd->tx_desc); + isert_init_send_wr(isert_conn, isert_cmd, + &isert_cmd->tx_desc.send_wr, false); + isert_cmd->rdma_wr.s_send_wr.next = &isert_cmd->tx_desc.send_wr; + wr->send_wr_num += 1; + } + + atomic_add(wr->send_wr_num, &isert_conn->post_send_buf_count); + + rc = ib_post_send(isert_conn->conn_qp, wr->send_wr, &wr_failed); + if (rc) { + pr_warn("ib_post_send() failed for IB_WR_RDMA_WRITE\n"); + atomic_sub(wr->send_wr_num, &isert_conn->post_send_buf_count); + } + + if (se_cmd->prot_op == TARGET_PROT_NORMAL) + pr_debug("Cmd: %p posted RDMA_WRITE + Response for iSER Data " + "READ\n", isert_cmd); + else + pr_debug("Cmd: %p posted RDMA_WRITE for iSER Data READ\n", + isert_cmd); + + return 1; +} + +static int +isert_get_dataout(struct iscsi_conn *conn, struct iscsi_cmd *cmd, bool recovery) +{ + struct se_cmd *se_cmd = &cmd->se_cmd; + struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd); + struct isert_rdma_wr *wr = &isert_cmd->rdma_wr; + struct isert_conn *isert_conn = (struct isert_conn *)conn->context; + struct isert_device *device = isert_conn->conn_device; + struct ib_send_wr *wr_failed; + int rc; + + pr_debug("Cmd: %p RDMA_READ data_length: %u write_data_done: %u\n", + isert_cmd, se_cmd->data_length, cmd->write_data_done); + wr->iser_ib_op = ISER_IB_RDMA_READ; + rc = device->reg_rdma_mem(conn, cmd, wr); + if (rc) { + pr_err("Cmd: %p failed to prepare RDMA res\n", isert_cmd); + return rc; + } + + atomic_add(wr->send_wr_num, &isert_conn->post_send_buf_count); + + rc = ib_post_send(isert_conn->conn_qp, wr->send_wr, &wr_failed); + if (rc) { + pr_warn("ib_post_send() failed for IB_WR_RDMA_READ\n"); + atomic_sub(wr->send_wr_num, &isert_conn->post_send_buf_count); + } + pr_debug("Cmd: %p posted RDMA_READ memory for ISER Data WRITE\n", + isert_cmd); + + return 0; +} + +static int +isert_immediate_queue(struct iscsi_conn *conn, struct iscsi_cmd *cmd, int state) +{ + int ret; + + switch (state) { + case ISTATE_SEND_NOPIN_WANT_RESPONSE: + ret = isert_put_nopin(cmd, conn, false); + break; + default: + pr_err("Unknown immediate state: 0x%02x\n", state); + ret = -EINVAL; + break; + } + + return ret; +} + +static int +isert_response_queue(struct iscsi_conn *conn, struct iscsi_cmd *cmd, int state) +{ + int ret; + + switch (state) { + case ISTATE_SEND_LOGOUTRSP: + ret = isert_put_logout_rsp(cmd, conn); + if (!ret) { + pr_debug("Returning iSER Logout -EAGAIN\n"); + ret = -EAGAIN; + } + break; + case ISTATE_SEND_NOPIN: + ret = isert_put_nopin(cmd, conn, true); + break; + case ISTATE_SEND_TASKMGTRSP: + ret = isert_put_tm_rsp(cmd, conn); + break; + case ISTATE_SEND_REJECT: + ret = isert_put_reject(cmd, conn); + break; + case ISTATE_SEND_TEXTRSP: + ret = isert_put_text_rsp(cmd, conn); + break; + case ISTATE_SEND_STATUS: + /* + * Special case for sending non GOOD SCSI status from TX thread + * context during pre se_cmd excecution failure. + */ + ret = isert_put_response(conn, cmd); + break; + default: + pr_err("Unknown response state: 0x%02x\n", state); + ret = -EINVAL; + break; + } + + return ret; +} + +static int +isert_setup_np(struct iscsi_np *np, + struct __kernel_sockaddr_storage *ksockaddr) +{ + struct isert_np *isert_np; + struct rdma_cm_id *isert_lid; + struct sockaddr *sa; + int ret; + + isert_np = kzalloc(sizeof(struct isert_np), GFP_KERNEL); + if (!isert_np) { + pr_err("Unable to allocate struct isert_np\n"); + return -ENOMEM; + } + sema_init(&isert_np->np_sem, 0); + mutex_init(&isert_np->np_accept_mutex); + INIT_LIST_HEAD(&isert_np->np_accept_list); + init_completion(&isert_np->np_login_comp); + + sa = (struct sockaddr *)ksockaddr; + pr_debug("ksockaddr: %p, sa: %p\n", ksockaddr, sa); + /* + * Setup the np->np_sockaddr from the passed sockaddr setup + * in iscsi_target_configfs.c code.. + */ + memcpy(&np->np_sockaddr, ksockaddr, + sizeof(struct __kernel_sockaddr_storage)); + + isert_lid = rdma_create_id(isert_cma_handler, np, RDMA_PS_TCP, + IB_QPT_RC); + if (IS_ERR(isert_lid)) { + pr_err("rdma_create_id() for isert_listen_handler failed: %ld\n", + PTR_ERR(isert_lid)); + ret = PTR_ERR(isert_lid); + goto out; + } + + ret = rdma_bind_addr(isert_lid, sa); + if (ret) { + pr_err("rdma_bind_addr() for isert_lid failed: %d\n", ret); + goto out_lid; + } + + ret = rdma_listen(isert_lid, ISERT_RDMA_LISTEN_BACKLOG); + if (ret) { + pr_err("rdma_listen() for isert_lid failed: %d\n", ret); + goto out_lid; + } + + isert_np->np_cm_id = isert_lid; + np->np_context = isert_np; + pr_debug("Setup isert_lid->context: %p\n", isert_lid->context); + + return 0; + +out_lid: + rdma_destroy_id(isert_lid); +out: + kfree(isert_np); + return ret; +} + +static int +isert_rdma_accept(struct isert_conn *isert_conn) +{ + struct rdma_cm_id *cm_id = isert_conn->conn_cm_id; + struct rdma_conn_param cp; + int ret; + + memset(&cp, 0, sizeof(struct rdma_conn_param)); + cp.initiator_depth = isert_conn->initiator_depth; + cp.retry_count = 7; + cp.rnr_retry_count = 7; + + pr_debug("Before rdma_accept >>>>>>>>>>>>>>>>>>>>.\n"); + + ret = rdma_accept(cm_id, &cp); + if (ret) { + pr_err("rdma_accept() failed with: %d\n", ret); + return ret; + } + + pr_debug("After rdma_accept >>>>>>>>>>>>>>>>>>>>>.\n"); + + return 0; +} + +static int +isert_get_login_rx(struct iscsi_conn *conn, struct iscsi_login *login) +{ + struct isert_conn *isert_conn = (struct isert_conn *)conn->context; + int ret; + + pr_debug("isert_get_login_rx before conn_login_comp conn: %p\n", conn); + /* + * For login requests after the first PDU, isert_rx_login_req() will + * kick schedule_delayed_work(&conn->login_work) as the packet is + * received, which turns this callback from iscsi_target_do_login_rx() + * into a NOP. + */ + if (!login->first_request) + return 0; + + ret = wait_for_completion_interruptible(&isert_conn->conn_login_comp); + if (ret) + return ret; + + pr_debug("isert_get_login_rx processing login->req: %p\n", login->req); + return 0; +} + +static void +isert_set_conn_info(struct iscsi_np *np, struct iscsi_conn *conn, + struct isert_conn *isert_conn) +{ + struct rdma_cm_id *cm_id = isert_conn->conn_cm_id; + struct rdma_route *cm_route = &cm_id->route; + struct sockaddr_in *sock_in; + struct sockaddr_in6 *sock_in6; + + conn->login_family = np->np_sockaddr.ss_family; + + if (np->np_sockaddr.ss_family == AF_INET6) { + sock_in6 = (struct sockaddr_in6 *)&cm_route->addr.dst_addr; + snprintf(conn->login_ip, sizeof(conn->login_ip), "%pI6c", + &sock_in6->sin6_addr.in6_u); + conn->login_port = ntohs(sock_in6->sin6_port); + + sock_in6 = (struct sockaddr_in6 *)&cm_route->addr.src_addr; + snprintf(conn->local_ip, sizeof(conn->local_ip), "%pI6c", + &sock_in6->sin6_addr.in6_u); + conn->local_port = ntohs(sock_in6->sin6_port); + } else { + sock_in = (struct sockaddr_in *)&cm_route->addr.dst_addr; + sprintf(conn->login_ip, "%pI4", + &sock_in->sin_addr.s_addr); + conn->login_port = ntohs(sock_in->sin_port); + + sock_in = (struct sockaddr_in *)&cm_route->addr.src_addr; + sprintf(conn->local_ip, "%pI4", + &sock_in->sin_addr.s_addr); + conn->local_port = ntohs(sock_in->sin_port); + } +} + +static int +isert_accept_np(struct iscsi_np *np, struct iscsi_conn *conn) +{ + struct isert_np *isert_np = (struct isert_np *)np->np_context; + struct isert_conn *isert_conn; + int max_accept = 0, ret; + +accept_wait: + ret = down_interruptible(&isert_np->np_sem); + if (ret || max_accept > 5) + return -ENODEV; + + spin_lock_bh(&np->np_thread_lock); + if (np->np_thread_state >= ISCSI_NP_THREAD_RESET) { + spin_unlock_bh(&np->np_thread_lock); + pr_debug("np_thread_state %d for isert_accept_np\n", + np->np_thread_state); + /** + * No point in stalling here when np_thread + * is in state RESET/SHUTDOWN/EXIT - bail + **/ + return -ENODEV; + } + spin_unlock_bh(&np->np_thread_lock); + + mutex_lock(&isert_np->np_accept_mutex); + if (list_empty(&isert_np->np_accept_list)) { + mutex_unlock(&isert_np->np_accept_mutex); + max_accept++; + goto accept_wait; + } + isert_conn = list_first_entry(&isert_np->np_accept_list, + struct isert_conn, conn_accept_node); + list_del_init(&isert_conn->conn_accept_node); + mutex_unlock(&isert_np->np_accept_mutex); + + conn->context = isert_conn; + isert_conn->conn = conn; + max_accept = 0; + + ret = isert_rdma_post_recvl(isert_conn); + if (ret) + return ret; + + ret = isert_rdma_accept(isert_conn); + if (ret) + return ret; + + isert_set_conn_info(np, conn, isert_conn); + + pr_debug("Processing isert_accept_np: isert_conn: %p\n", isert_conn); + return 0; +} + +static void +isert_free_np(struct iscsi_np *np) +{ + struct isert_np *isert_np = (struct isert_np *)np->np_context; + + if (isert_np->np_cm_id) + rdma_destroy_id(isert_np->np_cm_id); + + np->np_context = NULL; + kfree(isert_np); +} + +static void isert_wait_conn(struct iscsi_conn *conn) +{ + struct isert_conn *isert_conn = conn->context; + + pr_debug("isert_wait_conn: Starting \n"); + + mutex_lock(&isert_conn->conn_mutex); + if (isert_conn->conn_cm_id && !isert_conn->disconnect) { + pr_debug("Calling rdma_disconnect from isert_wait_conn\n"); + rdma_disconnect(isert_conn->conn_cm_id); + } + /* + * Only wait for conn_wait_comp_err if the isert_conn made it + * into full feature phase.. + */ + if (isert_conn->state == ISER_CONN_INIT) { + mutex_unlock(&isert_conn->conn_mutex); + return; + } + if (isert_conn->state == ISER_CONN_UP) + isert_conn->state = ISER_CONN_TERMINATING; + mutex_unlock(&isert_conn->conn_mutex); + + wait_for_completion(&isert_conn->conn_wait_comp_err); + + wait_for_completion(&isert_conn->conn_wait); + isert_put_conn(isert_conn); +} + +static void isert_free_conn(struct iscsi_conn *conn) +{ + struct isert_conn *isert_conn = conn->context; + + isert_put_conn(isert_conn); +} + +static struct iscsit_transport iser_target_transport = { + .name = "IB/iSER", + .transport_type = ISCSI_INFINIBAND, + .priv_size = sizeof(struct isert_cmd), + .owner = THIS_MODULE, + .iscsit_setup_np = isert_setup_np, + .iscsit_accept_np = isert_accept_np, + .iscsit_free_np = isert_free_np, + .iscsit_wait_conn = isert_wait_conn, + .iscsit_free_conn = isert_free_conn, + .iscsit_get_login_rx = isert_get_login_rx, + .iscsit_put_login_tx = isert_put_login_tx, + .iscsit_immediate_queue = isert_immediate_queue, + .iscsit_response_queue = isert_response_queue, + .iscsit_get_dataout = isert_get_dataout, + .iscsit_queue_data_in = isert_put_datain, + .iscsit_queue_status = isert_put_response, + .iscsit_aborted_task = isert_aborted_task, + .iscsit_get_sup_prot_ops = isert_get_sup_prot_ops, +}; + +static int __init isert_init(void) +{ + int ret; + + isert_rx_wq = alloc_workqueue("isert_rx_wq", 0, 0); + if (!isert_rx_wq) { + pr_err("Unable to allocate isert_rx_wq\n"); + return -ENOMEM; + } + + isert_comp_wq = alloc_workqueue("isert_comp_wq", 0, 0); + if (!isert_comp_wq) { + pr_err("Unable to allocate isert_comp_wq\n"); + ret = -ENOMEM; + goto destroy_rx_wq; + } + + iscsit_register_transport(&iser_target_transport); + pr_debug("iSER_TARGET[0] - Loaded iser_target_transport\n"); + return 0; + +destroy_rx_wq: + destroy_workqueue(isert_rx_wq); + return ret; +} + +static void __exit isert_exit(void) +{ + flush_scheduled_work(); + destroy_workqueue(isert_comp_wq); + destroy_workqueue(isert_rx_wq); + iscsit_unregister_transport(&iser_target_transport); + pr_debug("iSER_TARGET[0] - Released iser_target_transport\n"); +} + +MODULE_DESCRIPTION("iSER-Target for mainline target infrastructure"); +MODULE_VERSION("0.1"); +MODULE_AUTHOR("nab@Linux-iSCSI.org"); +MODULE_LICENSE("GPL"); + +module_init(isert_init); +module_exit(isert_exit); diff --git a/drivers/infiniband/ulp/isert/ib_isert.h b/drivers/infiniband/ulp/isert/ib_isert.h new file mode 100644 index 0000000..04f51f7 --- /dev/null +++ b/drivers/infiniband/ulp/isert/ib_isert.h @@ -0,0 +1,190 @@ +#include +#include +#include +#include +#include + +#define ISERT_RDMA_LISTEN_BACKLOG 10 +#define ISCSI_ISER_SG_TABLESIZE 256 +#define ISER_FASTREG_LI_WRID 0xffffffffffffffffULL + +enum isert_desc_type { + ISCSI_TX_CONTROL, + ISCSI_TX_DATAIN +}; + +enum iser_ib_op_code { + ISER_IB_RECV, + ISER_IB_SEND, + ISER_IB_RDMA_WRITE, + ISER_IB_RDMA_READ, +}; + +enum iser_conn_state { + ISER_CONN_INIT, + ISER_CONN_UP, + ISER_CONN_TERMINATING, + ISER_CONN_DOWN, +}; + +struct iser_rx_desc { + struct iser_hdr iser_header; + struct iscsi_hdr iscsi_header; + char data[ISER_RECV_DATA_SEG_LEN]; + u64 dma_addr; + struct ib_sge rx_sg; + char pad[ISER_RX_PAD_SIZE]; +} __packed; + +struct iser_tx_desc { + struct iser_hdr iser_header; + struct iscsi_hdr iscsi_header; + enum isert_desc_type type; + u64 dma_addr; + struct ib_sge tx_sg[2]; + int num_sge; + struct isert_cmd *isert_cmd; + struct llist_node *comp_llnode_batch; + struct llist_node comp_llnode; + bool llnode_active; + struct ib_send_wr send_wr; +} __packed; + +enum isert_indicator { + ISERT_PROTECTED = 1 << 0, + ISERT_DATA_KEY_VALID = 1 << 1, + ISERT_PROT_KEY_VALID = 1 << 2, + ISERT_SIG_KEY_VALID = 1 << 3, +}; + +struct pi_context { + struct ib_mr *prot_mr; + struct ib_fast_reg_page_list *prot_frpl; + struct ib_mr *sig_mr; +}; + +struct fast_reg_descriptor { + struct list_head list; + struct ib_mr *data_mr; + struct ib_fast_reg_page_list *data_frpl; + u8 ind; + struct pi_context *pi_ctx; +}; + +struct isert_data_buf { + struct scatterlist *sg; + int nents; + u32 sg_off; + u32 len; /* cur_rdma_length */ + u32 offset; + unsigned int dma_nents; + enum dma_data_direction dma_dir; +}; + +struct isert_rdma_wr { + struct list_head wr_list; + struct isert_cmd *isert_cmd; + enum iser_ib_op_code iser_ib_op; + struct ib_sge *ib_sge; + struct ib_sge s_ib_sge; + int send_wr_num; + struct ib_send_wr *send_wr; + struct ib_send_wr s_send_wr; + struct isert_data_buf data; + struct isert_data_buf prot; + struct fast_reg_descriptor *fr_desc; +}; + +struct isert_cmd { + uint32_t read_stag; + uint32_t write_stag; + uint64_t read_va; + uint64_t write_va; + u64 pdu_buf_dma; + u32 pdu_buf_len; + u32 read_va_off; + u32 write_va_off; + u32 rdma_wr_num; + struct isert_conn *conn; + struct iscsi_cmd *iscsi_cmd; + struct iser_tx_desc tx_desc; + struct isert_rdma_wr rdma_wr; + struct work_struct comp_work; +}; + +struct isert_device; + +struct isert_conn { + enum iser_conn_state state; + int post_recv_buf_count; + atomic_t post_send_buf_count; + u32 responder_resources; + u32 initiator_depth; + u32 max_sge; + char *login_buf; + char *login_req_buf; + char *login_rsp_buf; + u64 login_req_dma; + u64 login_rsp_dma; + unsigned int conn_rx_desc_head; + struct iser_rx_desc *conn_rx_descs; + struct ib_recv_wr conn_rx_wr[ISERT_MIN_POSTED_RX]; + struct iscsi_conn *conn; + struct list_head conn_accept_node; + struct completion conn_login_comp; + struct iser_tx_desc conn_login_tx_desc; + struct rdma_cm_id *conn_cm_id; + struct ib_pd *conn_pd; + struct ib_mr *conn_mr; + struct ib_qp *conn_qp; + struct isert_device *conn_device; + struct work_struct conn_logout_work; + struct mutex conn_mutex; + struct completion conn_wait; + struct completion conn_wait_comp_err; + struct kref conn_kref; + struct list_head conn_fr_pool; + int conn_fr_pool_size; + /* lock to protect fastreg pool */ + spinlock_t conn_lock; +#define ISERT_COMP_BATCH_COUNT 8 + int conn_comp_batch; + struct llist_head conn_comp_llist; + bool disconnect; +}; + +#define ISERT_MAX_CQ 64 + +struct isert_cq_desc { + struct isert_device *device; + int cq_index; + struct work_struct cq_rx_work; + struct work_struct cq_tx_work; +}; + +struct isert_device { + int use_fastreg; + bool pi_capable; + int cqs_used; + int refcount; + int cq_active_qps[ISERT_MAX_CQ]; + struct ib_device *ib_device; + struct ib_cq *dev_rx_cq[ISERT_MAX_CQ]; + struct ib_cq *dev_tx_cq[ISERT_MAX_CQ]; + struct isert_cq_desc *cq_desc; + struct list_head dev_node; + struct ib_device_attr dev_attr; + int (*reg_rdma_mem)(struct iscsi_conn *conn, + struct iscsi_cmd *cmd, + struct isert_rdma_wr *wr); + void (*unreg_rdma_mem)(struct isert_cmd *isert_cmd, + struct isert_conn *isert_conn); +}; + +struct isert_np { + struct semaphore np_sem; + struct rdma_cm_id *np_cm_id; + struct mutex np_accept_mutex; + struct list_head np_accept_list; + struct completion np_login_comp; +}; diff --git a/drivers/infiniband/ulp/isert/isert_proto.h b/drivers/infiniband/ulp/isert/isert_proto.h new file mode 100644 index 0000000..4dccd31 --- /dev/null +++ b/drivers/infiniband/ulp/isert/isert_proto.h @@ -0,0 +1,47 @@ +/* From iscsi_iser.h */ + +struct iser_hdr { + u8 flags; + u8 rsvd[3]; + __be32 write_stag; /* write rkey */ + __be64 write_va; + __be32 read_stag; /* read rkey */ + __be64 read_va; +} __packed; + +/*Constant PDU lengths calculations */ +#define ISER_HEADERS_LEN (sizeof(struct iser_hdr) + sizeof(struct iscsi_hdr)) + +#define ISER_RECV_DATA_SEG_LEN 8192 +#define ISER_RX_PAYLOAD_SIZE (ISER_HEADERS_LEN + ISER_RECV_DATA_SEG_LEN) +#define ISER_RX_LOGIN_SIZE (ISER_HEADERS_LEN + ISCSI_DEF_MAX_RECV_SEG_LEN) + +/* QP settings */ +/* Maximal bounds on received asynchronous PDUs */ +#define ISERT_MAX_TX_MISC_PDUS 4 /* NOOP_IN(2) , ASYNC_EVENT(2) */ + +#define ISERT_MAX_RX_MISC_PDUS 6 /* NOOP_OUT(2), TEXT(1), * + * SCSI_TMFUNC(2), LOGOUT(1) */ + +#define ISCSI_DEF_XMIT_CMDS_MAX 128 /* from libiscsi.h, must be power of 2 */ + +#define ISERT_QP_MAX_RECV_DTOS (ISCSI_DEF_XMIT_CMDS_MAX) + +#define ISERT_MIN_POSTED_RX (ISCSI_DEF_XMIT_CMDS_MAX >> 2) + +#define ISERT_INFLIGHT_DATAOUTS 8 + +#define ISERT_QP_MAX_REQ_DTOS (ISCSI_DEF_XMIT_CMDS_MAX * \ + (1 + ISERT_INFLIGHT_DATAOUTS) + \ + ISERT_MAX_TX_MISC_PDUS + \ + ISERT_MAX_RX_MISC_PDUS) + +#define ISER_RX_PAD_SIZE (ISER_RECV_DATA_SEG_LEN + 4096 - \ + (ISER_RX_PAYLOAD_SIZE + sizeof(u64) + sizeof(struct ib_sge))) + +#define ISER_VER 0x10 +#define ISER_WSV 0x08 +#define ISER_RSV 0x04 +#define ISCSI_CTRL 0x10 +#define ISER_HELLO 0x20 +#define ISER_HELLORPLY 0x30 diff --git a/drivers/infiniband/ulp/srp/Kbuild b/drivers/infiniband/ulp/srp/Kbuild new file mode 100644 index 0000000..a16c73c --- /dev/null +++ b/drivers/infiniband/ulp/srp/Kbuild @@ -0,0 +1 @@ +obj-$(CONFIG_INFINIBAND_SRP) += ib_srp.o diff --git a/drivers/infiniband/ulp/srp/Kconfig b/drivers/infiniband/ulp/srp/Kconfig new file mode 100644 index 0000000..c74ee96 --- /dev/null +++ b/drivers/infiniband/ulp/srp/Kconfig @@ -0,0 +1,12 @@ +config INFINIBAND_SRP + tristate "InfiniBand SCSI RDMA Protocol" + depends on SCSI + select SCSI_SRP_ATTRS + ---help--- + Support for the SCSI RDMA Protocol over InfiniBand. This + allows you to access storage devices that speak SRP over + InfiniBand. + + The SRP protocol is defined by the INCITS T10 technical + committee. See . + diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c new file mode 100644 index 0000000..62d2a18 --- /dev/null +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -0,0 +1,3373 @@ +/* + * Copyright (c) 2005 Cisco Systems. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include + +#include "ib_srp.h" + +#define DRV_NAME "ib_srp" +#define PFX DRV_NAME ": " +#define DRV_VERSION "1.0" +#define DRV_RELDATE "July 1, 2013" + +MODULE_AUTHOR("Roland Dreier"); +MODULE_DESCRIPTION("InfiniBand SCSI RDMA Protocol initiator " + "v" DRV_VERSION " (" DRV_RELDATE ")"); +MODULE_LICENSE("Dual BSD/GPL"); + +static unsigned int srp_sg_tablesize; +static unsigned int cmd_sg_entries; +static unsigned int indirect_sg_entries; +static bool allow_ext_sg; +static bool prefer_fr; +static bool register_always; +static int topspin_workarounds = 1; + +module_param(srp_sg_tablesize, uint, 0444); +MODULE_PARM_DESC(srp_sg_tablesize, "Deprecated name for cmd_sg_entries"); + +module_param(cmd_sg_entries, uint, 0444); +MODULE_PARM_DESC(cmd_sg_entries, + "Default number of gather/scatter entries in the SRP command (default is 12, max 255)"); + +module_param(indirect_sg_entries, uint, 0444); +MODULE_PARM_DESC(indirect_sg_entries, + "Default max number of gather/scatter entries (default is 12, max is " __stringify(SCSI_MAX_SG_CHAIN_SEGMENTS) ")"); + +module_param(allow_ext_sg, bool, 0444); +MODULE_PARM_DESC(allow_ext_sg, + "Default behavior when there are more than cmd_sg_entries S/G entries after mapping; fails the request when false (default false)"); + +module_param(topspin_workarounds, int, 0444); +MODULE_PARM_DESC(topspin_workarounds, + "Enable workarounds for Topspin/Cisco SRP target bugs if != 0"); + +module_param(prefer_fr, bool, 0444); +MODULE_PARM_DESC(prefer_fr, +"Whether to use fast registration if both FMR and fast registration are supported"); + +module_param(register_always, bool, 0444); +MODULE_PARM_DESC(register_always, + "Use memory registration even for contiguous memory regions"); + +static struct kernel_param_ops srp_tmo_ops; + +static int srp_reconnect_delay = 10; +module_param_cb(reconnect_delay, &srp_tmo_ops, &srp_reconnect_delay, + S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(reconnect_delay, "Time between successive reconnect attempts"); + +static int srp_fast_io_fail_tmo = 15; +module_param_cb(fast_io_fail_tmo, &srp_tmo_ops, &srp_fast_io_fail_tmo, + S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(fast_io_fail_tmo, + "Number of seconds between the observation of a transport" + " layer error and failing all I/O. \"off\" means that this" + " functionality is disabled."); + +static int srp_dev_loss_tmo = 600; +module_param_cb(dev_loss_tmo, &srp_tmo_ops, &srp_dev_loss_tmo, + S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(dev_loss_tmo, + "Maximum number of seconds that the SRP transport should" + " insulate transport layer errors. After this time has been" + " exceeded the SCSI host is removed. Should be" + " between 1 and " __stringify(SCSI_DEVICE_BLOCK_MAX_TIMEOUT) + " if fast_io_fail_tmo has not been set. \"off\" means that" + " this functionality is disabled."); + +static void srp_add_one(struct ib_device *device); +static void srp_remove_one(struct ib_device *device); +static void srp_recv_completion(struct ib_cq *cq, void *target_ptr); +static void srp_send_completion(struct ib_cq *cq, void *target_ptr); +static int srp_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event); + +static struct scsi_transport_template *ib_srp_transport_template; +static struct workqueue_struct *srp_remove_wq; + +static struct ib_client srp_client = { + .name = "srp", + .add = srp_add_one, + .remove = srp_remove_one +}; + +static struct ib_sa_client srp_sa_client; + +static int srp_tmo_get(char *buffer, const struct kernel_param *kp) +{ + int tmo = *(int *)kp->arg; + + if (tmo >= 0) + return sprintf(buffer, "%d", tmo); + else + return sprintf(buffer, "off"); +} + +static int srp_tmo_set(const char *val, const struct kernel_param *kp) +{ + int tmo, res; + + if (strncmp(val, "off", 3) != 0) { + res = kstrtoint(val, 0, &tmo); + if (res) + goto out; + } else { + tmo = -1; + } + if (kp->arg == &srp_reconnect_delay) + res = srp_tmo_valid(tmo, srp_fast_io_fail_tmo, + srp_dev_loss_tmo); + else if (kp->arg == &srp_fast_io_fail_tmo) + res = srp_tmo_valid(srp_reconnect_delay, tmo, srp_dev_loss_tmo); + else + res = srp_tmo_valid(srp_reconnect_delay, srp_fast_io_fail_tmo, + tmo); + if (res) + goto out; + *(int *)kp->arg = tmo; + +out: + return res; +} + +static struct kernel_param_ops srp_tmo_ops = { + .get = srp_tmo_get, + .set = srp_tmo_set, +}; + +static inline struct srp_target_port *host_to_target(struct Scsi_Host *host) +{ + return (struct srp_target_port *) host->hostdata; +} + +static const char *srp_target_info(struct Scsi_Host *host) +{ + return host_to_target(host)->target_name; +} + +static int srp_target_is_topspin(struct srp_target_port *target) +{ + static const u8 topspin_oui[3] = { 0x00, 0x05, 0xad }; + static const u8 cisco_oui[3] = { 0x00, 0x1b, 0x0d }; + + return topspin_workarounds && + (!memcmp(&target->ioc_guid, topspin_oui, sizeof topspin_oui) || + !memcmp(&target->ioc_guid, cisco_oui, sizeof cisco_oui)); +} + +static struct srp_iu *srp_alloc_iu(struct srp_host *host, size_t size, + gfp_t gfp_mask, + enum dma_data_direction direction) +{ + struct srp_iu *iu; + + iu = kmalloc(sizeof *iu, gfp_mask); + if (!iu) + goto out; + + iu->buf = kzalloc(size, gfp_mask); + if (!iu->buf) + goto out_free_iu; + + iu->dma = ib_dma_map_single(host->srp_dev->dev, iu->buf, size, + direction); + if (ib_dma_mapping_error(host->srp_dev->dev, iu->dma)) + goto out_free_buf; + + iu->size = size; + iu->direction = direction; + + return iu; + +out_free_buf: + kfree(iu->buf); +out_free_iu: + kfree(iu); +out: + return NULL; +} + +static void srp_free_iu(struct srp_host *host, struct srp_iu *iu) +{ + if (!iu) + return; + + ib_dma_unmap_single(host->srp_dev->dev, iu->dma, iu->size, + iu->direction); + kfree(iu->buf); + kfree(iu); +} + +static void srp_qp_event(struct ib_event *event, void *context) +{ + pr_debug("QP event %d\n", event->event); +} + +static int srp_init_qp(struct srp_target_port *target, + struct ib_qp *qp) +{ + struct ib_qp_attr *attr; + int ret; + + attr = kmalloc(sizeof *attr, GFP_KERNEL); + if (!attr) + return -ENOMEM; + + ret = ib_find_pkey(target->srp_host->srp_dev->dev, + target->srp_host->port, + be16_to_cpu(target->path.pkey), + &attr->pkey_index); + if (ret) + goto out; + + attr->qp_state = IB_QPS_INIT; + attr->qp_access_flags = (IB_ACCESS_REMOTE_READ | + IB_ACCESS_REMOTE_WRITE); + attr->port_num = target->srp_host->port; + + ret = ib_modify_qp(qp, attr, + IB_QP_STATE | + IB_QP_PKEY_INDEX | + IB_QP_ACCESS_FLAGS | + IB_QP_PORT); + +out: + kfree(attr); + return ret; +} + +static int srp_new_cm_id(struct srp_target_port *target) +{ + struct ib_cm_id *new_cm_id; + + new_cm_id = ib_create_cm_id(target->srp_host->srp_dev->dev, + srp_cm_handler, target); + if (IS_ERR(new_cm_id)) + return PTR_ERR(new_cm_id); + + if (target->cm_id) + ib_destroy_cm_id(target->cm_id); + target->cm_id = new_cm_id; + + return 0; +} + +static struct ib_fmr_pool *srp_alloc_fmr_pool(struct srp_target_port *target) +{ + struct srp_device *dev = target->srp_host->srp_dev; + struct ib_fmr_pool_param fmr_param; + + memset(&fmr_param, 0, sizeof(fmr_param)); + fmr_param.pool_size = target->scsi_host->can_queue; + fmr_param.dirty_watermark = fmr_param.pool_size / 4; + fmr_param.cache = 1; + fmr_param.max_pages_per_fmr = dev->max_pages_per_mr; + fmr_param.page_shift = ilog2(dev->mr_page_size); + fmr_param.access = (IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_WRITE | + IB_ACCESS_REMOTE_READ); + + return ib_create_fmr_pool(dev->pd, &fmr_param); +} + +/** + * srp_destroy_fr_pool() - free the resources owned by a pool + * @pool: Fast registration pool to be destroyed. + */ +static void srp_destroy_fr_pool(struct srp_fr_pool *pool) +{ + int i; + struct srp_fr_desc *d; + + if (!pool) + return; + + for (i = 0, d = &pool->desc[0]; i < pool->size; i++, d++) { + if (d->frpl) + ib_free_fast_reg_page_list(d->frpl); + if (d->mr) + ib_dereg_mr(d->mr); + } + kfree(pool); +} + +/** + * srp_create_fr_pool() - allocate and initialize a pool for fast registration + * @device: IB device to allocate fast registration descriptors for. + * @pd: Protection domain associated with the FR descriptors. + * @pool_size: Number of descriptors to allocate. + * @max_page_list_len: Maximum fast registration work request page list length. + */ +static struct srp_fr_pool *srp_create_fr_pool(struct ib_device *device, + struct ib_pd *pd, int pool_size, + int max_page_list_len) +{ + struct srp_fr_pool *pool; + struct srp_fr_desc *d; + struct ib_mr *mr; + struct ib_fast_reg_page_list *frpl; + int i, ret = -EINVAL; + + if (pool_size <= 0) + goto err; + ret = -ENOMEM; + pool = kzalloc(sizeof(struct srp_fr_pool) + + pool_size * sizeof(struct srp_fr_desc), GFP_KERNEL); + if (!pool) + goto err; + pool->size = pool_size; + pool->max_page_list_len = max_page_list_len; + spin_lock_init(&pool->lock); + INIT_LIST_HEAD(&pool->free_list); + + for (i = 0, d = &pool->desc[0]; i < pool->size; i++, d++) { + mr = ib_alloc_fast_reg_mr(pd, max_page_list_len); + if (IS_ERR(mr)) { + ret = PTR_ERR(mr); + goto destroy_pool; + } + d->mr = mr; + frpl = ib_alloc_fast_reg_page_list(device, max_page_list_len); + if (IS_ERR(frpl)) { + ret = PTR_ERR(frpl); + goto destroy_pool; + } + d->frpl = frpl; + list_add_tail(&d->entry, &pool->free_list); + } + +out: + return pool; + +destroy_pool: + srp_destroy_fr_pool(pool); + +err: + pool = ERR_PTR(ret); + goto out; +} + +/** + * srp_fr_pool_get() - obtain a descriptor suitable for fast registration + * @pool: Pool to obtain descriptor from. + */ +static struct srp_fr_desc *srp_fr_pool_get(struct srp_fr_pool *pool) +{ + struct srp_fr_desc *d = NULL; + unsigned long flags; + + spin_lock_irqsave(&pool->lock, flags); + if (!list_empty(&pool->free_list)) { + d = list_first_entry(&pool->free_list, typeof(*d), entry); + list_del(&d->entry); + } + spin_unlock_irqrestore(&pool->lock, flags); + + return d; +} + +/** + * srp_fr_pool_put() - put an FR descriptor back in the free list + * @pool: Pool the descriptor was allocated from. + * @desc: Pointer to an array of fast registration descriptor pointers. + * @n: Number of descriptors to put back. + * + * Note: The caller must already have queued an invalidation request for + * desc->mr->rkey before calling this function. + */ +static void srp_fr_pool_put(struct srp_fr_pool *pool, struct srp_fr_desc **desc, + int n) +{ + unsigned long flags; + int i; + + spin_lock_irqsave(&pool->lock, flags); + for (i = 0; i < n; i++) + list_add(&desc[i]->entry, &pool->free_list); + spin_unlock_irqrestore(&pool->lock, flags); +} + +static struct srp_fr_pool *srp_alloc_fr_pool(struct srp_target_port *target) +{ + struct srp_device *dev = target->srp_host->srp_dev; + + return srp_create_fr_pool(dev->dev, dev->pd, + target->scsi_host->can_queue, + dev->max_pages_per_mr); +} + +static int srp_create_target_ib(struct srp_target_port *target) +{ + struct srp_device *dev = target->srp_host->srp_dev; + struct ib_qp_init_attr *init_attr; + struct ib_cq *recv_cq, *send_cq; + struct ib_qp *qp; + struct ib_fmr_pool *fmr_pool = NULL; + struct srp_fr_pool *fr_pool = NULL; + const int m = 1 + dev->use_fast_reg; + int ret; + + init_attr = kzalloc(sizeof *init_attr, GFP_KERNEL); + if (!init_attr) + return -ENOMEM; + + recv_cq = ib_create_cq(dev->dev, srp_recv_completion, NULL, target, + target->queue_size, target->comp_vector); + if (IS_ERR(recv_cq)) { + ret = PTR_ERR(recv_cq); + goto err; + } + + send_cq = ib_create_cq(dev->dev, srp_send_completion, NULL, target, + m * target->queue_size, target->comp_vector); + if (IS_ERR(send_cq)) { + ret = PTR_ERR(send_cq); + goto err_recv_cq; + } + + ib_req_notify_cq(recv_cq, IB_CQ_NEXT_COMP); + + init_attr->event_handler = srp_qp_event; + init_attr->cap.max_send_wr = m * target->queue_size; + init_attr->cap.max_recv_wr = target->queue_size; + init_attr->cap.max_recv_sge = 1; + init_attr->cap.max_send_sge = 1; + init_attr->sq_sig_type = IB_SIGNAL_REQ_WR; + init_attr->qp_type = IB_QPT_RC; + init_attr->send_cq = send_cq; + init_attr->recv_cq = recv_cq; + + qp = ib_create_qp(dev->pd, init_attr); + if (IS_ERR(qp)) { + ret = PTR_ERR(qp); + goto err_send_cq; + } + + ret = srp_init_qp(target, qp); + if (ret) + goto err_qp; + + if (dev->use_fast_reg && dev->has_fr) { + fr_pool = srp_alloc_fr_pool(target); + if (IS_ERR(fr_pool)) { + ret = PTR_ERR(fr_pool); + shost_printk(KERN_WARNING, target->scsi_host, PFX + "FR pool allocation failed (%d)\n", ret); + goto err_qp; + } + if (target->fr_pool) + srp_destroy_fr_pool(target->fr_pool); + target->fr_pool = fr_pool; + } else if (!dev->use_fast_reg && dev->has_fmr) { + fmr_pool = srp_alloc_fmr_pool(target); + if (IS_ERR(fmr_pool)) { + ret = PTR_ERR(fmr_pool); + shost_printk(KERN_WARNING, target->scsi_host, PFX + "FMR pool allocation failed (%d)\n", ret); + goto err_qp; + } + if (target->fmr_pool) + ib_destroy_fmr_pool(target->fmr_pool); + target->fmr_pool = fmr_pool; + } + + if (target->qp) + ib_destroy_qp(target->qp); + if (target->recv_cq) + ib_destroy_cq(target->recv_cq); + if (target->send_cq) + ib_destroy_cq(target->send_cq); + + target->qp = qp; + target->recv_cq = recv_cq; + target->send_cq = send_cq; + + kfree(init_attr); + return 0; + +err_qp: + ib_destroy_qp(qp); + +err_send_cq: + ib_destroy_cq(send_cq); + +err_recv_cq: + ib_destroy_cq(recv_cq); + +err: + kfree(init_attr); + return ret; +} + +/* + * Note: this function may be called without srp_alloc_iu_bufs() having been + * invoked. Hence the target->[rt]x_ring checks. + */ +static void srp_free_target_ib(struct srp_target_port *target) +{ + struct srp_device *dev = target->srp_host->srp_dev; + int i; + + if (dev->use_fast_reg) { + if (target->fr_pool) + srp_destroy_fr_pool(target->fr_pool); + } else { + if (target->fmr_pool) + ib_destroy_fmr_pool(target->fmr_pool); + } + ib_destroy_qp(target->qp); + ib_destroy_cq(target->send_cq); + ib_destroy_cq(target->recv_cq); + + target->qp = NULL; + target->send_cq = target->recv_cq = NULL; + + if (target->rx_ring) { + for (i = 0; i < target->queue_size; ++i) + srp_free_iu(target->srp_host, target->rx_ring[i]); + kfree(target->rx_ring); + target->rx_ring = NULL; + } + if (target->tx_ring) { + for (i = 0; i < target->queue_size; ++i) + srp_free_iu(target->srp_host, target->tx_ring[i]); + kfree(target->tx_ring); + target->tx_ring = NULL; + } +} + +static void srp_path_rec_completion(int status, + struct ib_sa_path_rec *pathrec, + void *target_ptr) +{ + struct srp_target_port *target = target_ptr; + + target->status = status; + if (status) + shost_printk(KERN_ERR, target->scsi_host, + PFX "Got failed path rec status %d\n", status); + else + target->path = *pathrec; + complete(&target->done); +} + +static int srp_lookup_path(struct srp_target_port *target) +{ + int ret; + + target->path.numb_path = 1; + + init_completion(&target->done); + + target->path_query_id = ib_sa_path_rec_get(&srp_sa_client, + target->srp_host->srp_dev->dev, + target->srp_host->port, + &target->path, + IB_SA_PATH_REC_SERVICE_ID | + IB_SA_PATH_REC_DGID | + IB_SA_PATH_REC_SGID | + IB_SA_PATH_REC_NUMB_PATH | + IB_SA_PATH_REC_PKEY, + SRP_PATH_REC_TIMEOUT_MS, + GFP_KERNEL, + srp_path_rec_completion, + target, &target->path_query); + if (target->path_query_id < 0) + return target->path_query_id; + + ret = wait_for_completion_interruptible(&target->done); + if (ret < 0) + return ret; + + if (target->status < 0) + shost_printk(KERN_WARNING, target->scsi_host, + PFX "Path record query failed\n"); + + return target->status; +} + +static int srp_send_req(struct srp_target_port *target) +{ + struct { + struct ib_cm_req_param param; + struct srp_login_req priv; + } *req = NULL; + int status; + + req = kzalloc(sizeof *req, GFP_KERNEL); + if (!req) + return -ENOMEM; + + req->param.primary_path = &target->path; + req->param.alternate_path = NULL; + req->param.service_id = target->service_id; + req->param.qp_num = target->qp->qp_num; + req->param.qp_type = target->qp->qp_type; + req->param.private_data = &req->priv; + req->param.private_data_len = sizeof req->priv; + req->param.flow_control = 1; + + get_random_bytes(&req->param.starting_psn, 4); + req->param.starting_psn &= 0xffffff; + + /* + * Pick some arbitrary defaults here; we could make these + * module parameters if anyone cared about setting them. + */ + req->param.responder_resources = 4; + req->param.remote_cm_response_timeout = 20; + req->param.local_cm_response_timeout = 20; + req->param.retry_count = target->tl_retry_count; + req->param.rnr_retry_count = 7; + req->param.max_cm_retries = 15; + + req->priv.opcode = SRP_LOGIN_REQ; + req->priv.tag = 0; + req->priv.req_it_iu_len = cpu_to_be32(target->max_iu_len); + req->priv.req_buf_fmt = cpu_to_be16(SRP_BUF_FORMAT_DIRECT | + SRP_BUF_FORMAT_INDIRECT); + /* + * In the published SRP specification (draft rev. 16a), the + * port identifier format is 8 bytes of ID extension followed + * by 8 bytes of GUID. Older drafts put the two halves in the + * opposite order, so that the GUID comes first. + * + * Targets conforming to these obsolete drafts can be + * recognized by the I/O Class they report. + */ + if (target->io_class == SRP_REV10_IB_IO_CLASS) { + memcpy(req->priv.initiator_port_id, + &target->path.sgid.global.interface_id, 8); + memcpy(req->priv.initiator_port_id + 8, + &target->initiator_ext, 8); + memcpy(req->priv.target_port_id, &target->ioc_guid, 8); + memcpy(req->priv.target_port_id + 8, &target->id_ext, 8); + } else { + memcpy(req->priv.initiator_port_id, + &target->initiator_ext, 8); + memcpy(req->priv.initiator_port_id + 8, + &target->path.sgid.global.interface_id, 8); + memcpy(req->priv.target_port_id, &target->id_ext, 8); + memcpy(req->priv.target_port_id + 8, &target->ioc_guid, 8); + } + + /* + * Topspin/Cisco SRP targets will reject our login unless we + * zero out the first 8 bytes of our initiator port ID and set + * the second 8 bytes to the local node GUID. + */ + if (srp_target_is_topspin(target)) { + shost_printk(KERN_DEBUG, target->scsi_host, + PFX "Topspin/Cisco initiator port ID workaround " + "activated for target GUID %016llx\n", + (unsigned long long) be64_to_cpu(target->ioc_guid)); + memset(req->priv.initiator_port_id, 0, 8); + memcpy(req->priv.initiator_port_id + 8, + &target->srp_host->srp_dev->dev->node_guid, 8); + } + + status = ib_send_cm_req(target->cm_id, &req->param); + + kfree(req); + + return status; +} + +static bool srp_queue_remove_work(struct srp_target_port *target) +{ + bool changed = false; + + spin_lock_irq(&target->lock); + if (target->state != SRP_TARGET_REMOVED) { + target->state = SRP_TARGET_REMOVED; + changed = true; + } + spin_unlock_irq(&target->lock); + + if (changed) + queue_work(srp_remove_wq, &target->remove_work); + + return changed; +} + +static bool srp_change_conn_state(struct srp_target_port *target, + bool connected) +{ + bool changed = false; + + spin_lock_irq(&target->lock); + if (target->connected != connected) { + target->connected = connected; + changed = true; + } + spin_unlock_irq(&target->lock); + + return changed; +} + +static void srp_disconnect_target(struct srp_target_port *target) +{ + if (srp_change_conn_state(target, false)) { + /* XXX should send SRP_I_LOGOUT request */ + + if (ib_send_cm_dreq(target->cm_id, NULL, 0)) { + shost_printk(KERN_DEBUG, target->scsi_host, + PFX "Sending CM DREQ failed\n"); + } + } +} + +static void srp_free_req_data(struct srp_target_port *target) +{ + struct srp_device *dev = target->srp_host->srp_dev; + struct ib_device *ibdev = dev->dev; + struct srp_request *req; + int i; + + if (!target->req_ring) + return; + + for (i = 0; i < target->req_ring_size; ++i) { + req = &target->req_ring[i]; + if (dev->use_fast_reg) + kfree(req->fr_list); + else + kfree(req->fmr_list); + kfree(req->map_page); + if (req->indirect_dma_addr) { + ib_dma_unmap_single(ibdev, req->indirect_dma_addr, + target->indirect_size, + DMA_TO_DEVICE); + } + kfree(req->indirect_desc); + } + + kfree(target->req_ring); + target->req_ring = NULL; +} + +static int srp_alloc_req_data(struct srp_target_port *target) +{ + struct srp_device *srp_dev = target->srp_host->srp_dev; + struct ib_device *ibdev = srp_dev->dev; + struct srp_request *req; + void *mr_list; + dma_addr_t dma_addr; + int i, ret = -ENOMEM; + + INIT_LIST_HEAD(&target->free_reqs); + + target->req_ring = kzalloc(target->req_ring_size * + sizeof(*target->req_ring), GFP_KERNEL); + if (!target->req_ring) + goto out; + + for (i = 0; i < target->req_ring_size; ++i) { + req = &target->req_ring[i]; + mr_list = kmalloc(target->cmd_sg_cnt * sizeof(void *), + GFP_KERNEL); + if (!mr_list) + goto out; + if (srp_dev->use_fast_reg) + req->fr_list = mr_list; + else + req->fmr_list = mr_list; + req->map_page = kmalloc(srp_dev->max_pages_per_mr * + sizeof(void *), GFP_KERNEL); + if (!req->map_page) + goto out; + req->indirect_desc = kmalloc(target->indirect_size, GFP_KERNEL); + if (!req->indirect_desc) + goto out; + + dma_addr = ib_dma_map_single(ibdev, req->indirect_desc, + target->indirect_size, + DMA_TO_DEVICE); + if (ib_dma_mapping_error(ibdev, dma_addr)) + goto out; + + req->indirect_dma_addr = dma_addr; + req->index = i; + list_add_tail(&req->list, &target->free_reqs); + } + ret = 0; + +out: + return ret; +} + +/** + * srp_del_scsi_host_attr() - Remove attributes defined in the host template. + * @shost: SCSI host whose attributes to remove from sysfs. + * + * Note: Any attributes defined in the host template and that did not exist + * before invocation of this function will be ignored. + */ +static void srp_del_scsi_host_attr(struct Scsi_Host *shost) +{ + struct device_attribute **attr; + + for (attr = shost->hostt->shost_attrs; attr && *attr; ++attr) + device_remove_file(&shost->shost_dev, *attr); +} + +static void srp_remove_target(struct srp_target_port *target) +{ + WARN_ON_ONCE(target->state != SRP_TARGET_REMOVED); + + srp_del_scsi_host_attr(target->scsi_host); + srp_rport_get(target->rport); + srp_remove_host(target->scsi_host); + scsi_remove_host(target->scsi_host); + srp_stop_rport_timers(target->rport); + srp_disconnect_target(target); + ib_destroy_cm_id(target->cm_id); + srp_free_target_ib(target); + cancel_work_sync(&target->tl_err_work); + srp_rport_put(target->rport); + srp_free_req_data(target); + + spin_lock(&target->srp_host->target_lock); + list_del(&target->list); + spin_unlock(&target->srp_host->target_lock); + + scsi_host_put(target->scsi_host); +} + +static void srp_remove_work(struct work_struct *work) +{ + struct srp_target_port *target = + container_of(work, struct srp_target_port, remove_work); + + WARN_ON_ONCE(target->state != SRP_TARGET_REMOVED); + + srp_remove_target(target); +} + +static void srp_rport_delete(struct srp_rport *rport) +{ + struct srp_target_port *target = rport->lld_data; + + srp_queue_remove_work(target); +} + +static int srp_connect_target(struct srp_target_port *target) +{ + int retries = 3; + int ret; + + WARN_ON_ONCE(target->connected); + + target->qp_in_error = false; + + ret = srp_lookup_path(target); + if (ret) + return ret; + + while (1) { + init_completion(&target->done); + ret = srp_send_req(target); + if (ret) + return ret; + ret = wait_for_completion_interruptible(&target->done); + if (ret < 0) + return ret; + + /* + * The CM event handling code will set status to + * SRP_PORT_REDIRECT if we get a port redirect REJ + * back, or SRP_DLID_REDIRECT if we get a lid/qp + * redirect REJ back. + */ + switch (target->status) { + case 0: + srp_change_conn_state(target, true); + return 0; + + case SRP_PORT_REDIRECT: + ret = srp_lookup_path(target); + if (ret) + return ret; + break; + + case SRP_DLID_REDIRECT: + break; + + case SRP_STALE_CONN: + /* Our current CM id was stale, and is now in timewait. + * Try to reconnect with a new one. + */ + if (!retries-- || srp_new_cm_id(target)) { + shost_printk(KERN_ERR, target->scsi_host, PFX + "giving up on stale connection\n"); + target->status = -ECONNRESET; + return target->status; + } + + shost_printk(KERN_ERR, target->scsi_host, PFX + "retrying stale connection\n"); + break; + + default: + return target->status; + } + } +} + +static int srp_inv_rkey(struct srp_target_port *target, u32 rkey) +{ + struct ib_send_wr *bad_wr; + struct ib_send_wr wr = { + .opcode = IB_WR_LOCAL_INV, + .wr_id = LOCAL_INV_WR_ID_MASK, + .next = NULL, + .num_sge = 0, + .send_flags = 0, + .ex.invalidate_rkey = rkey, + }; + + return ib_post_send(target->qp, &wr, &bad_wr); +} + +static void srp_unmap_data(struct scsi_cmnd *scmnd, + struct srp_target_port *target, + struct srp_request *req) +{ + struct srp_device *dev = target->srp_host->srp_dev; + struct ib_device *ibdev = dev->dev; + int i, res; + + if (!scsi_sglist(scmnd) || + (scmnd->sc_data_direction != DMA_TO_DEVICE && + scmnd->sc_data_direction != DMA_FROM_DEVICE)) + return; + + if (dev->use_fast_reg) { + struct srp_fr_desc **pfr; + + for (i = req->nmdesc, pfr = req->fr_list; i > 0; i--, pfr++) { + res = srp_inv_rkey(target, (*pfr)->mr->rkey); + if (res < 0) { + shost_printk(KERN_ERR, target->scsi_host, PFX + "Queueing INV WR for rkey %#x failed (%d)\n", + (*pfr)->mr->rkey, res); + queue_work(system_long_wq, + &target->tl_err_work); + } + } + if (req->nmdesc) + srp_fr_pool_put(target->fr_pool, req->fr_list, + req->nmdesc); + } else { + struct ib_pool_fmr **pfmr; + + for (i = req->nmdesc, pfmr = req->fmr_list; i > 0; i--, pfmr++) + ib_fmr_pool_unmap(*pfmr); + } + + ib_dma_unmap_sg(ibdev, scsi_sglist(scmnd), scsi_sg_count(scmnd), + scmnd->sc_data_direction); +} + +/** + * srp_claim_req - Take ownership of the scmnd associated with a request. + * @target: SRP target port. + * @req: SRP request. + * @sdev: If not NULL, only take ownership for this SCSI device. + * @scmnd: If NULL, take ownership of @req->scmnd. If not NULL, only take + * ownership of @req->scmnd if it equals @scmnd. + * + * Return value: + * Either NULL or a pointer to the SCSI command the caller became owner of. + */ +static struct scsi_cmnd *srp_claim_req(struct srp_target_port *target, + struct srp_request *req, + struct scsi_device *sdev, + struct scsi_cmnd *scmnd) +{ + unsigned long flags; + + spin_lock_irqsave(&target->lock, flags); + if (req->scmnd && + (!sdev || req->scmnd->device == sdev) && + (!scmnd || req->scmnd == scmnd)) { + scmnd = req->scmnd; + req->scmnd = NULL; + } else { + scmnd = NULL; + } + spin_unlock_irqrestore(&target->lock, flags); + + return scmnd; +} + +/** + * srp_free_req() - Unmap data and add request to the free request list. + * @target: SRP target port. + * @req: Request to be freed. + * @scmnd: SCSI command associated with @req. + * @req_lim_delta: Amount to be added to @target->req_lim. + */ +static void srp_free_req(struct srp_target_port *target, + struct srp_request *req, struct scsi_cmnd *scmnd, + s32 req_lim_delta) +{ + unsigned long flags; + + srp_unmap_data(scmnd, target, req); + + spin_lock_irqsave(&target->lock, flags); + target->req_lim += req_lim_delta; + list_add_tail(&req->list, &target->free_reqs); + spin_unlock_irqrestore(&target->lock, flags); +} + +static void srp_finish_req(struct srp_target_port *target, + struct srp_request *req, struct scsi_device *sdev, + int result) +{ + struct scsi_cmnd *scmnd = srp_claim_req(target, req, sdev, NULL); + + if (scmnd) { + srp_free_req(target, req, scmnd, 0); + scmnd->result = result; + scmnd->scsi_done(scmnd); + } +} + +static void srp_terminate_io(struct srp_rport *rport) +{ + struct srp_target_port *target = rport->lld_data; + struct Scsi_Host *shost = target->scsi_host; + struct scsi_device *sdev; + int i; + + /* + * Invoking srp_terminate_io() while srp_queuecommand() is running + * is not safe. Hence the warning statement below. + */ + shost_for_each_device(sdev, shost) + WARN_ON_ONCE(sdev->request_queue->request_fn_active); + + for (i = 0; i < target->req_ring_size; ++i) { + struct srp_request *req = &target->req_ring[i]; + srp_finish_req(target, req, NULL, DID_TRANSPORT_FAILFAST << 16); + } +} + +/* + * It is up to the caller to ensure that srp_rport_reconnect() calls are + * serialized and that no concurrent srp_queuecommand(), srp_abort(), + * srp_reset_device() or srp_reset_host() calls will occur while this function + * is in progress. One way to realize that is not to call this function + * directly but to call srp_reconnect_rport() instead since that last function + * serializes calls of this function via rport->mutex and also blocks + * srp_queuecommand() calls before invoking this function. + */ +static int srp_rport_reconnect(struct srp_rport *rport) +{ + struct srp_target_port *target = rport->lld_data; + int i, ret; + + srp_disconnect_target(target); + /* + * Now get a new local CM ID so that we avoid confusing the target in + * case things are really fouled up. Doing so also ensures that all CM + * callbacks will have finished before a new QP is allocated. + */ + ret = srp_new_cm_id(target); + + for (i = 0; i < target->req_ring_size; ++i) { + struct srp_request *req = &target->req_ring[i]; + srp_finish_req(target, req, NULL, DID_RESET << 16); + } + + /* + * Whether or not creating a new CM ID succeeded, create a new + * QP. This guarantees that all callback functions for the old QP have + * finished before any send requests are posted on the new QP. + */ + ret += srp_create_target_ib(target); + + INIT_LIST_HEAD(&target->free_tx); + for (i = 0; i < target->queue_size; ++i) + list_add(&target->tx_ring[i]->list, &target->free_tx); + + if (ret == 0) + ret = srp_connect_target(target); + + if (ret == 0) + shost_printk(KERN_INFO, target->scsi_host, + PFX "reconnect succeeded\n"); + + return ret; +} + +static void srp_map_desc(struct srp_map_state *state, dma_addr_t dma_addr, + unsigned int dma_len, u32 rkey) +{ + struct srp_direct_buf *desc = state->desc; + + desc->va = cpu_to_be64(dma_addr); + desc->key = cpu_to_be32(rkey); + desc->len = cpu_to_be32(dma_len); + + state->total_len += dma_len; + state->desc++; + state->ndesc++; +} + +static int srp_map_finish_fmr(struct srp_map_state *state, + struct srp_target_port *target) +{ + struct ib_pool_fmr *fmr; + u64 io_addr = 0; + + fmr = ib_fmr_pool_map_phys(target->fmr_pool, state->pages, + state->npages, io_addr); + if (IS_ERR(fmr)) + return PTR_ERR(fmr); + + *state->next_fmr++ = fmr; + state->nmdesc++; + + srp_map_desc(state, 0, state->dma_len, fmr->fmr->rkey); + + return 0; +} + +static int srp_map_finish_fr(struct srp_map_state *state, + struct srp_target_port *target) +{ + struct srp_device *dev = target->srp_host->srp_dev; + struct ib_send_wr *bad_wr; + struct ib_send_wr wr; + struct srp_fr_desc *desc; + u32 rkey; + + desc = srp_fr_pool_get(target->fr_pool); + if (!desc) + return -ENOMEM; + + rkey = ib_inc_rkey(desc->mr->rkey); + ib_update_fast_reg_key(desc->mr, rkey); + + memcpy(desc->frpl->page_list, state->pages, + sizeof(state->pages[0]) * state->npages); + + memset(&wr, 0, sizeof(wr)); + wr.opcode = IB_WR_FAST_REG_MR; + wr.wr_id = FAST_REG_WR_ID_MASK; + wr.wr.fast_reg.iova_start = state->base_dma_addr; + wr.wr.fast_reg.page_list = desc->frpl; + wr.wr.fast_reg.page_list_len = state->npages; + wr.wr.fast_reg.page_shift = ilog2(dev->mr_page_size); + wr.wr.fast_reg.length = state->dma_len; + wr.wr.fast_reg.access_flags = (IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_READ | + IB_ACCESS_REMOTE_WRITE); + wr.wr.fast_reg.rkey = desc->mr->lkey; + + *state->next_fr++ = desc; + state->nmdesc++; + + srp_map_desc(state, state->base_dma_addr, state->dma_len, + desc->mr->rkey); + + return ib_post_send(target->qp, &wr, &bad_wr); +} + +static int srp_finish_mapping(struct srp_map_state *state, + struct srp_target_port *target) +{ + int ret = 0; + + if (state->npages == 0) + return 0; + + if (state->npages == 1 && !register_always) + srp_map_desc(state, state->base_dma_addr, state->dma_len, + target->rkey); + else + ret = target->srp_host->srp_dev->use_fast_reg ? + srp_map_finish_fr(state, target) : + srp_map_finish_fmr(state, target); + + if (ret == 0) { + state->npages = 0; + state->dma_len = 0; + } + + return ret; +} + +static void srp_map_update_start(struct srp_map_state *state, + struct scatterlist *sg, int sg_index, + dma_addr_t dma_addr) +{ + state->unmapped_sg = sg; + state->unmapped_index = sg_index; + state->unmapped_addr = dma_addr; +} + +static int srp_map_sg_entry(struct srp_map_state *state, + struct srp_target_port *target, + struct scatterlist *sg, int sg_index, + bool use_mr) +{ + struct srp_device *dev = target->srp_host->srp_dev; + struct ib_device *ibdev = dev->dev; + dma_addr_t dma_addr = ib_sg_dma_address(ibdev, sg); + unsigned int dma_len = ib_sg_dma_len(ibdev, sg); + unsigned int len; + int ret; + + if (!dma_len) + return 0; + + if (!use_mr) { + /* + * Once we're in direct map mode for a request, we don't + * go back to FMR or FR mode, so no need to update anything + * other than the descriptor. + */ + srp_map_desc(state, dma_addr, dma_len, target->rkey); + return 0; + } + + /* + * Since not all RDMA HW drivers support non-zero page offsets for + * FMR, if we start at an offset into a page, don't merge into the + * current FMR mapping. Finish it out, and use the kernel's MR for + * this sg entry. + */ + if ((!dev->use_fast_reg && dma_addr & ~dev->mr_page_mask) || + dma_len > dev->mr_max_size) { + ret = srp_finish_mapping(state, target); + if (ret) + return ret; + + srp_map_desc(state, dma_addr, dma_len, target->rkey); + srp_map_update_start(state, NULL, 0, 0); + return 0; + } + + /* + * If this is the first sg that will be mapped via FMR or via FR, save + * our position. We need to know the first unmapped entry, its index, + * and the first unmapped address within that entry to be able to + * restart mapping after an error. + */ + if (!state->unmapped_sg) + srp_map_update_start(state, sg, sg_index, dma_addr); + + while (dma_len) { + unsigned offset = dma_addr & ~dev->mr_page_mask; + if (state->npages == dev->max_pages_per_mr || offset != 0) { + ret = srp_finish_mapping(state, target); + if (ret) + return ret; + + srp_map_update_start(state, sg, sg_index, dma_addr); + } + + len = min_t(unsigned int, dma_len, dev->mr_page_size - offset); + + if (!state->npages) + state->base_dma_addr = dma_addr; + state->pages[state->npages++] = dma_addr & dev->mr_page_mask; + state->dma_len += len; + dma_addr += len; + dma_len -= len; + } + + /* + * If the last entry of the MR wasn't a full page, then we need to + * close it out and start a new one -- we can only merge at page + * boundries. + */ + ret = 0; + if (len != dev->mr_page_size) { + ret = srp_finish_mapping(state, target); + if (!ret) + srp_map_update_start(state, NULL, 0, 0); + } + return ret; +} + +static int srp_map_sg(struct srp_map_state *state, + struct srp_target_port *target, struct srp_request *req, + struct scatterlist *scat, int count) +{ + struct srp_device *dev = target->srp_host->srp_dev; + struct ib_device *ibdev = dev->dev; + struct scatterlist *sg; + int i; + bool use_mr; + + state->desc = req->indirect_desc; + state->pages = req->map_page; + if (dev->use_fast_reg) { + state->next_fr = req->fr_list; + use_mr = !!target->fr_pool; + } else { + state->next_fmr = req->fmr_list; + use_mr = !!target->fmr_pool; + } + + for_each_sg(scat, sg, count, i) { + if (srp_map_sg_entry(state, target, sg, i, use_mr)) { + /* + * Memory registration failed, so backtrack to the + * first unmapped entry and continue on without using + * memory registration. + */ + dma_addr_t dma_addr; + unsigned int dma_len; + +backtrack: + sg = state->unmapped_sg; + i = state->unmapped_index; + + dma_addr = ib_sg_dma_address(ibdev, sg); + dma_len = ib_sg_dma_len(ibdev, sg); + dma_len -= (state->unmapped_addr - dma_addr); + dma_addr = state->unmapped_addr; + use_mr = false; + srp_map_desc(state, dma_addr, dma_len, target->rkey); + } + } + + if (use_mr && srp_finish_mapping(state, target)) + goto backtrack; + + req->nmdesc = state->nmdesc; + + return 0; +} + +static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_target_port *target, + struct srp_request *req) +{ + struct scatterlist *scat; + struct srp_cmd *cmd = req->cmd->buf; + int len, nents, count; + struct srp_device *dev; + struct ib_device *ibdev; + struct srp_map_state state; + struct srp_indirect_buf *indirect_hdr; + u32 table_len; + u8 fmt; + + if (!scsi_sglist(scmnd) || scmnd->sc_data_direction == DMA_NONE) + return sizeof (struct srp_cmd); + + if (scmnd->sc_data_direction != DMA_FROM_DEVICE && + scmnd->sc_data_direction != DMA_TO_DEVICE) { + shost_printk(KERN_WARNING, target->scsi_host, + PFX "Unhandled data direction %d\n", + scmnd->sc_data_direction); + return -EINVAL; + } + + nents = scsi_sg_count(scmnd); + scat = scsi_sglist(scmnd); + + dev = target->srp_host->srp_dev; + ibdev = dev->dev; + + count = ib_dma_map_sg(ibdev, scat, nents, scmnd->sc_data_direction); + if (unlikely(count == 0)) + return -EIO; + + fmt = SRP_DATA_DESC_DIRECT; + len = sizeof (struct srp_cmd) + sizeof (struct srp_direct_buf); + + if (count == 1 && !register_always) { + /* + * The midlayer only generated a single gather/scatter + * entry, or DMA mapping coalesced everything to a + * single entry. So a direct descriptor along with + * the DMA MR suffices. + */ + struct srp_direct_buf *buf = (void *) cmd->add_data; + + buf->va = cpu_to_be64(ib_sg_dma_address(ibdev, scat)); + buf->key = cpu_to_be32(target->rkey); + buf->len = cpu_to_be32(ib_sg_dma_len(ibdev, scat)); + + req->nmdesc = 0; + goto map_complete; + } + + /* + * We have more than one scatter/gather entry, so build our indirect + * descriptor table, trying to merge as many entries as we can. + */ + indirect_hdr = (void *) cmd->add_data; + + ib_dma_sync_single_for_cpu(ibdev, req->indirect_dma_addr, + target->indirect_size, DMA_TO_DEVICE); + + memset(&state, 0, sizeof(state)); + srp_map_sg(&state, target, req, scat, count); + + /* We've mapped the request, now pull as much of the indirect + * descriptor table as we can into the command buffer. If this + * target is not using an external indirect table, we are + * guaranteed to fit into the command, as the SCSI layer won't + * give us more S/G entries than we allow. + */ + if (state.ndesc == 1) { + /* + * Memory registration collapsed the sg-list into one entry, + * so use a direct descriptor. + */ + struct srp_direct_buf *buf = (void *) cmd->add_data; + + *buf = req->indirect_desc[0]; + goto map_complete; + } + + if (unlikely(target->cmd_sg_cnt < state.ndesc && + !target->allow_ext_sg)) { + shost_printk(KERN_ERR, target->scsi_host, + "Could not fit S/G list into SRP_CMD\n"); + return -EIO; + } + + count = min(state.ndesc, target->cmd_sg_cnt); + table_len = state.ndesc * sizeof (struct srp_direct_buf); + + fmt = SRP_DATA_DESC_INDIRECT; + len = sizeof(struct srp_cmd) + sizeof (struct srp_indirect_buf); + len += count * sizeof (struct srp_direct_buf); + + memcpy(indirect_hdr->desc_list, req->indirect_desc, + count * sizeof (struct srp_direct_buf)); + + indirect_hdr->table_desc.va = cpu_to_be64(req->indirect_dma_addr); + indirect_hdr->table_desc.key = cpu_to_be32(target->rkey); + indirect_hdr->table_desc.len = cpu_to_be32(table_len); + indirect_hdr->len = cpu_to_be32(state.total_len); + + if (scmnd->sc_data_direction == DMA_TO_DEVICE) + cmd->data_out_desc_cnt = count; + else + cmd->data_in_desc_cnt = count; + + ib_dma_sync_single_for_device(ibdev, req->indirect_dma_addr, table_len, + DMA_TO_DEVICE); + +map_complete: + if (scmnd->sc_data_direction == DMA_TO_DEVICE) + cmd->buf_fmt = fmt << 4; + else + cmd->buf_fmt = fmt; + + return len; +} + +/* + * Return an IU and possible credit to the free pool + */ +static void srp_put_tx_iu(struct srp_target_port *target, struct srp_iu *iu, + enum srp_iu_type iu_type) +{ + unsigned long flags; + + spin_lock_irqsave(&target->lock, flags); + list_add(&iu->list, &target->free_tx); + if (iu_type != SRP_IU_RSP) + ++target->req_lim; + spin_unlock_irqrestore(&target->lock, flags); +} + +/* + * Must be called with target->lock held to protect req_lim and free_tx. + * If IU is not sent, it must be returned using srp_put_tx_iu(). + * + * Note: + * An upper limit for the number of allocated information units for each + * request type is: + * - SRP_IU_CMD: SRP_CMD_SQ_SIZE, since the SCSI mid-layer never queues + * more than Scsi_Host.can_queue requests. + * - SRP_IU_TSK_MGMT: SRP_TSK_MGMT_SQ_SIZE. + * - SRP_IU_RSP: 1, since a conforming SRP target never sends more than + * one unanswered SRP request to an initiator. + */ +static struct srp_iu *__srp_get_tx_iu(struct srp_target_port *target, + enum srp_iu_type iu_type) +{ + s32 rsv = (iu_type == SRP_IU_TSK_MGMT) ? 0 : SRP_TSK_MGMT_SQ_SIZE; + struct srp_iu *iu; + + srp_send_completion(target->send_cq, target); + + if (list_empty(&target->free_tx)) + return NULL; + + /* Initiator responses to target requests do not consume credits */ + if (iu_type != SRP_IU_RSP) { + if (target->req_lim <= rsv) { + ++target->zero_req_lim; + return NULL; + } + + --target->req_lim; + } + + iu = list_first_entry(&target->free_tx, struct srp_iu, list); + list_del(&iu->list); + return iu; +} + +static int srp_post_send(struct srp_target_port *target, + struct srp_iu *iu, int len) +{ + struct ib_sge list; + struct ib_send_wr wr, *bad_wr; + + list.addr = iu->dma; + list.length = len; + list.lkey = target->lkey; + + wr.next = NULL; + wr.wr_id = (uintptr_t) iu; + wr.sg_list = &list; + wr.num_sge = 1; + wr.opcode = IB_WR_SEND; + wr.send_flags = IB_SEND_SIGNALED; + + return ib_post_send(target->qp, &wr, &bad_wr); +} + +static int srp_post_recv(struct srp_target_port *target, struct srp_iu *iu) +{ + struct ib_recv_wr wr, *bad_wr; + struct ib_sge list; + + list.addr = iu->dma; + list.length = iu->size; + list.lkey = target->lkey; + + wr.next = NULL; + wr.wr_id = (uintptr_t) iu; + wr.sg_list = &list; + wr.num_sge = 1; + + return ib_post_recv(target->qp, &wr, &bad_wr); +} + +static void srp_process_rsp(struct srp_target_port *target, struct srp_rsp *rsp) +{ + struct srp_request *req; + struct scsi_cmnd *scmnd; + unsigned long flags; + + if (unlikely(rsp->tag & SRP_TAG_TSK_MGMT)) { + spin_lock_irqsave(&target->lock, flags); + target->req_lim += be32_to_cpu(rsp->req_lim_delta); + spin_unlock_irqrestore(&target->lock, flags); + + target->tsk_mgmt_status = -1; + if (be32_to_cpu(rsp->resp_data_len) >= 4) + target->tsk_mgmt_status = rsp->data[3]; + complete(&target->tsk_mgmt_done); + } else { + req = &target->req_ring[rsp->tag]; + scmnd = srp_claim_req(target, req, NULL, NULL); + if (!scmnd) { + shost_printk(KERN_ERR, target->scsi_host, + "Null scmnd for RSP w/tag %016llx\n", + (unsigned long long) rsp->tag); + + spin_lock_irqsave(&target->lock, flags); + target->req_lim += be32_to_cpu(rsp->req_lim_delta); + spin_unlock_irqrestore(&target->lock, flags); + + return; + } + scmnd->result = rsp->status; + + if (rsp->flags & SRP_RSP_FLAG_SNSVALID) { + memcpy(scmnd->sense_buffer, rsp->data + + be32_to_cpu(rsp->resp_data_len), + min_t(int, be32_to_cpu(rsp->sense_data_len), + SCSI_SENSE_BUFFERSIZE)); + } + + if (unlikely(rsp->flags & SRP_RSP_FLAG_DIUNDER)) + scsi_set_resid(scmnd, be32_to_cpu(rsp->data_in_res_cnt)); + else if (unlikely(rsp->flags & SRP_RSP_FLAG_DIOVER)) + scsi_set_resid(scmnd, -be32_to_cpu(rsp->data_in_res_cnt)); + else if (unlikely(rsp->flags & SRP_RSP_FLAG_DOUNDER)) + scsi_set_resid(scmnd, be32_to_cpu(rsp->data_out_res_cnt)); + else if (unlikely(rsp->flags & SRP_RSP_FLAG_DOOVER)) + scsi_set_resid(scmnd, -be32_to_cpu(rsp->data_out_res_cnt)); + + srp_free_req(target, req, scmnd, + be32_to_cpu(rsp->req_lim_delta)); + + scmnd->host_scribble = NULL; + scmnd->scsi_done(scmnd); + } +} + +static int srp_response_common(struct srp_target_port *target, s32 req_delta, + void *rsp, int len) +{ + struct ib_device *dev = target->srp_host->srp_dev->dev; + unsigned long flags; + struct srp_iu *iu; + int err; + + spin_lock_irqsave(&target->lock, flags); + target->req_lim += req_delta; + iu = __srp_get_tx_iu(target, SRP_IU_RSP); + spin_unlock_irqrestore(&target->lock, flags); + + if (!iu) { + shost_printk(KERN_ERR, target->scsi_host, PFX + "no IU available to send response\n"); + return 1; + } + + ib_dma_sync_single_for_cpu(dev, iu->dma, len, DMA_TO_DEVICE); + memcpy(iu->buf, rsp, len); + ib_dma_sync_single_for_device(dev, iu->dma, len, DMA_TO_DEVICE); + + err = srp_post_send(target, iu, len); + if (err) { + shost_printk(KERN_ERR, target->scsi_host, PFX + "unable to post response: %d\n", err); + srp_put_tx_iu(target, iu, SRP_IU_RSP); + } + + return err; +} + +static void srp_process_cred_req(struct srp_target_port *target, + struct srp_cred_req *req) +{ + struct srp_cred_rsp rsp = { + .opcode = SRP_CRED_RSP, + .tag = req->tag, + }; + s32 delta = be32_to_cpu(req->req_lim_delta); + + if (srp_response_common(target, delta, &rsp, sizeof rsp)) + shost_printk(KERN_ERR, target->scsi_host, PFX + "problems processing SRP_CRED_REQ\n"); +} + +static void srp_process_aer_req(struct srp_target_port *target, + struct srp_aer_req *req) +{ + struct srp_aer_rsp rsp = { + .opcode = SRP_AER_RSP, + .tag = req->tag, + }; + s32 delta = be32_to_cpu(req->req_lim_delta); + + shost_printk(KERN_ERR, target->scsi_host, PFX + "ignoring AER for LUN %llu\n", be64_to_cpu(req->lun)); + + if (srp_response_common(target, delta, &rsp, sizeof rsp)) + shost_printk(KERN_ERR, target->scsi_host, PFX + "problems processing SRP_AER_REQ\n"); +} + +static void srp_handle_recv(struct srp_target_port *target, struct ib_wc *wc) +{ + struct ib_device *dev = target->srp_host->srp_dev->dev; + struct srp_iu *iu = (struct srp_iu *) (uintptr_t) wc->wr_id; + int res; + u8 opcode; + + ib_dma_sync_single_for_cpu(dev, iu->dma, target->max_ti_iu_len, + DMA_FROM_DEVICE); + + opcode = *(u8 *) iu->buf; + + if (0) { + shost_printk(KERN_ERR, target->scsi_host, + PFX "recv completion, opcode 0x%02x\n", opcode); + print_hex_dump(KERN_ERR, "", DUMP_PREFIX_OFFSET, 8, 1, + iu->buf, wc->byte_len, true); + } + + switch (opcode) { + case SRP_RSP: + srp_process_rsp(target, iu->buf); + break; + + case SRP_CRED_REQ: + srp_process_cred_req(target, iu->buf); + break; + + case SRP_AER_REQ: + srp_process_aer_req(target, iu->buf); + break; + + case SRP_T_LOGOUT: + /* XXX Handle target logout */ + shost_printk(KERN_WARNING, target->scsi_host, + PFX "Got target logout request\n"); + break; + + default: + shost_printk(KERN_WARNING, target->scsi_host, + PFX "Unhandled SRP opcode 0x%02x\n", opcode); + break; + } + + ib_dma_sync_single_for_device(dev, iu->dma, target->max_ti_iu_len, + DMA_FROM_DEVICE); + + res = srp_post_recv(target, iu); + if (res != 0) + shost_printk(KERN_ERR, target->scsi_host, + PFX "Recv failed with error code %d\n", res); +} + +/** + * srp_tl_err_work() - handle a transport layer error + * @work: Work structure embedded in an SRP target port. + * + * Note: This function may get invoked before the rport has been created, + * hence the target->rport test. + */ +static void srp_tl_err_work(struct work_struct *work) +{ + struct srp_target_port *target; + + target = container_of(work, struct srp_target_port, tl_err_work); + if (target->rport) + srp_start_tl_fail_timers(target->rport); +} + +static void srp_handle_qp_err(u64 wr_id, enum ib_wc_status wc_status, + bool send_err, struct srp_target_port *target) +{ + if (target->connected && !target->qp_in_error) { + if (wr_id & LOCAL_INV_WR_ID_MASK) { + shost_printk(KERN_ERR, target->scsi_host, PFX + "LOCAL_INV failed with status %d\n", + wc_status); + } else if (wr_id & FAST_REG_WR_ID_MASK) { + shost_printk(KERN_ERR, target->scsi_host, PFX + "FAST_REG_MR failed status %d\n", + wc_status); + } else { + shost_printk(KERN_ERR, target->scsi_host, + PFX "failed %s status %d for iu %p\n", + send_err ? "send" : "receive", + wc_status, (void *)(uintptr_t)wr_id); + } + queue_work(system_long_wq, &target->tl_err_work); + } + target->qp_in_error = true; +} + +static void srp_recv_completion(struct ib_cq *cq, void *target_ptr) +{ + struct srp_target_port *target = target_ptr; + struct ib_wc wc; + + ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); + while (ib_poll_cq(cq, 1, &wc) > 0) { + if (likely(wc.status == IB_WC_SUCCESS)) { + srp_handle_recv(target, &wc); + } else { + srp_handle_qp_err(wc.wr_id, wc.status, false, target); + } + } +} + +static void srp_send_completion(struct ib_cq *cq, void *target_ptr) +{ + struct srp_target_port *target = target_ptr; + struct ib_wc wc; + struct srp_iu *iu; + + while (ib_poll_cq(cq, 1, &wc) > 0) { + if (likely(wc.status == IB_WC_SUCCESS)) { + iu = (struct srp_iu *) (uintptr_t) wc.wr_id; + list_add(&iu->list, &target->free_tx); + } else { + srp_handle_qp_err(wc.wr_id, wc.status, true, target); + } + } +} + +static int srp_queuecommand(struct Scsi_Host *shost, struct scsi_cmnd *scmnd) +{ + struct srp_target_port *target = host_to_target(shost); + struct srp_rport *rport = target->rport; + struct srp_request *req; + struct srp_iu *iu; + struct srp_cmd *cmd; + struct ib_device *dev; + unsigned long flags; + int len, ret; + const bool in_scsi_eh = !in_interrupt() && current == shost->ehandler; + + /* + * The SCSI EH thread is the only context from which srp_queuecommand() + * can get invoked for blocked devices (SDEV_BLOCK / + * SDEV_CREATED_BLOCK). Avoid racing with srp_reconnect_rport() by + * locking the rport mutex if invoked from inside the SCSI EH. + */ + if (in_scsi_eh) + mutex_lock(&rport->mutex); + + scmnd->result = srp_chkready(target->rport); + if (unlikely(scmnd->result)) + goto err; + + spin_lock_irqsave(&target->lock, flags); + iu = __srp_get_tx_iu(target, SRP_IU_CMD); + if (!iu) + goto err_unlock; + + req = list_first_entry(&target->free_reqs, struct srp_request, list); + list_del(&req->list); + spin_unlock_irqrestore(&target->lock, flags); + + dev = target->srp_host->srp_dev->dev; + ib_dma_sync_single_for_cpu(dev, iu->dma, target->max_iu_len, + DMA_TO_DEVICE); + + scmnd->host_scribble = (void *) req; + + cmd = iu->buf; + memset(cmd, 0, sizeof *cmd); + + cmd->opcode = SRP_CMD; + cmd->lun = cpu_to_be64((u64) scmnd->device->lun << 48); + cmd->tag = req->index; + memcpy(cmd->cdb, scmnd->cmnd, scmnd->cmd_len); + + req->scmnd = scmnd; + req->cmd = iu; + + len = srp_map_data(scmnd, target, req); + if (len < 0) { + shost_printk(KERN_ERR, target->scsi_host, + PFX "Failed to map data (%d)\n", len); + /* + * If we ran out of memory descriptors (-ENOMEM) because an + * application is queuing many requests with more than + * max_pages_per_mr sg-list elements, tell the SCSI mid-layer + * to reduce queue depth temporarily. + */ + scmnd->result = len == -ENOMEM ? + DID_OK << 16 | QUEUE_FULL << 1 : DID_ERROR << 16; + goto err_iu; + } + + ib_dma_sync_single_for_device(dev, iu->dma, target->max_iu_len, + DMA_TO_DEVICE); + + if (srp_post_send(target, iu, len)) { + shost_printk(KERN_ERR, target->scsi_host, PFX "Send failed\n"); + goto err_unmap; + } + + ret = 0; + +unlock_rport: + if (in_scsi_eh) + mutex_unlock(&rport->mutex); + + return ret; + +err_unmap: + srp_unmap_data(scmnd, target, req); + +err_iu: + srp_put_tx_iu(target, iu, SRP_IU_CMD); + + /* + * Avoid that the loops that iterate over the request ring can + * encounter a dangling SCSI command pointer. + */ + req->scmnd = NULL; + + spin_lock_irqsave(&target->lock, flags); + list_add(&req->list, &target->free_reqs); + +err_unlock: + spin_unlock_irqrestore(&target->lock, flags); + +err: + if (scmnd->result) { + scmnd->scsi_done(scmnd); + ret = 0; + } else { + ret = SCSI_MLQUEUE_HOST_BUSY; + } + + goto unlock_rport; +} + +/* + * Note: the resources allocated in this function are freed in + * srp_free_target_ib(). + */ +static int srp_alloc_iu_bufs(struct srp_target_port *target) +{ + int i; + + target->rx_ring = kzalloc(target->queue_size * sizeof(*target->rx_ring), + GFP_KERNEL); + if (!target->rx_ring) + goto err_no_ring; + target->tx_ring = kzalloc(target->queue_size * sizeof(*target->tx_ring), + GFP_KERNEL); + if (!target->tx_ring) + goto err_no_ring; + + for (i = 0; i < target->queue_size; ++i) { + target->rx_ring[i] = srp_alloc_iu(target->srp_host, + target->max_ti_iu_len, + GFP_KERNEL, DMA_FROM_DEVICE); + if (!target->rx_ring[i]) + goto err; + } + + for (i = 0; i < target->queue_size; ++i) { + target->tx_ring[i] = srp_alloc_iu(target->srp_host, + target->max_iu_len, + GFP_KERNEL, DMA_TO_DEVICE); + if (!target->tx_ring[i]) + goto err; + + list_add(&target->tx_ring[i]->list, &target->free_tx); + } + + return 0; + +err: + for (i = 0; i < target->queue_size; ++i) { + srp_free_iu(target->srp_host, target->rx_ring[i]); + srp_free_iu(target->srp_host, target->tx_ring[i]); + } + + +err_no_ring: + kfree(target->tx_ring); + target->tx_ring = NULL; + kfree(target->rx_ring); + target->rx_ring = NULL; + + return -ENOMEM; +} + +static uint32_t srp_compute_rq_tmo(struct ib_qp_attr *qp_attr, int attr_mask) +{ + uint64_t T_tr_ns, max_compl_time_ms; + uint32_t rq_tmo_jiffies; + + /* + * According to section 11.2.4.2 in the IBTA spec (Modify Queue Pair, + * table 91), both the QP timeout and the retry count have to be set + * for RC QP's during the RTR to RTS transition. + */ + WARN_ON_ONCE((attr_mask & (IB_QP_TIMEOUT | IB_QP_RETRY_CNT)) != + (IB_QP_TIMEOUT | IB_QP_RETRY_CNT)); + + /* + * Set target->rq_tmo_jiffies to one second more than the largest time + * it can take before an error completion is generated. See also + * C9-140..142 in the IBTA spec for more information about how to + * convert the QP Local ACK Timeout value to nanoseconds. + */ + T_tr_ns = 4096 * (1ULL << qp_attr->timeout); + max_compl_time_ms = qp_attr->retry_cnt * 4 * T_tr_ns; + do_div(max_compl_time_ms, NSEC_PER_MSEC); + rq_tmo_jiffies = msecs_to_jiffies(max_compl_time_ms + 1000); + + return rq_tmo_jiffies; +} + +static void srp_cm_rep_handler(struct ib_cm_id *cm_id, + struct srp_login_rsp *lrsp, + struct srp_target_port *target) +{ + struct ib_qp_attr *qp_attr = NULL; + int attr_mask = 0; + int ret; + int i; + + if (lrsp->opcode == SRP_LOGIN_RSP) { + target->max_ti_iu_len = be32_to_cpu(lrsp->max_ti_iu_len); + target->req_lim = be32_to_cpu(lrsp->req_lim_delta); + + /* + * Reserve credits for task management so we don't + * bounce requests back to the SCSI mid-layer. + */ + target->scsi_host->can_queue + = min(target->req_lim - SRP_TSK_MGMT_SQ_SIZE, + target->scsi_host->can_queue); + target->scsi_host->cmd_per_lun + = min_t(int, target->scsi_host->can_queue, + target->scsi_host->cmd_per_lun); + } else { + shost_printk(KERN_WARNING, target->scsi_host, + PFX "Unhandled RSP opcode %#x\n", lrsp->opcode); + ret = -ECONNRESET; + goto error; + } + + if (!target->rx_ring) { + ret = srp_alloc_iu_bufs(target); + if (ret) + goto error; + } + + ret = -ENOMEM; + qp_attr = kmalloc(sizeof *qp_attr, GFP_KERNEL); + if (!qp_attr) + goto error; + + qp_attr->qp_state = IB_QPS_RTR; + ret = ib_cm_init_qp_attr(cm_id, qp_attr, &attr_mask); + if (ret) + goto error_free; + + ret = ib_modify_qp(target->qp, qp_attr, attr_mask); + if (ret) + goto error_free; + + for (i = 0; i < target->queue_size; i++) { + struct srp_iu *iu = target->rx_ring[i]; + ret = srp_post_recv(target, iu); + if (ret) + goto error_free; + } + + qp_attr->qp_state = IB_QPS_RTS; + ret = ib_cm_init_qp_attr(cm_id, qp_attr, &attr_mask); + if (ret) + goto error_free; + + target->rq_tmo_jiffies = srp_compute_rq_tmo(qp_attr, attr_mask); + + ret = ib_modify_qp(target->qp, qp_attr, attr_mask); + if (ret) + goto error_free; + + ret = ib_send_cm_rtu(cm_id, NULL, 0); + +error_free: + kfree(qp_attr); + +error: + target->status = ret; +} + +static void srp_cm_rej_handler(struct ib_cm_id *cm_id, + struct ib_cm_event *event, + struct srp_target_port *target) +{ + struct Scsi_Host *shost = target->scsi_host; + struct ib_class_port_info *cpi; + int opcode; + + switch (event->param.rej_rcvd.reason) { + case IB_CM_REJ_PORT_CM_REDIRECT: + cpi = event->param.rej_rcvd.ari; + target->path.dlid = cpi->redirect_lid; + target->path.pkey = cpi->redirect_pkey; + cm_id->remote_cm_qpn = be32_to_cpu(cpi->redirect_qp) & 0x00ffffff; + memcpy(target->path.dgid.raw, cpi->redirect_gid, 16); + + target->status = target->path.dlid ? + SRP_DLID_REDIRECT : SRP_PORT_REDIRECT; + break; + + case IB_CM_REJ_PORT_REDIRECT: + if (srp_target_is_topspin(target)) { + /* + * Topspin/Cisco SRP gateways incorrectly send + * reject reason code 25 when they mean 24 + * (port redirect). + */ + memcpy(target->path.dgid.raw, + event->param.rej_rcvd.ari, 16); + + shost_printk(KERN_DEBUG, shost, + PFX "Topspin/Cisco redirect to target port GID %016llx%016llx\n", + (unsigned long long) be64_to_cpu(target->path.dgid.global.subnet_prefix), + (unsigned long long) be64_to_cpu(target->path.dgid.global.interface_id)); + + target->status = SRP_PORT_REDIRECT; + } else { + shost_printk(KERN_WARNING, shost, + " REJ reason: IB_CM_REJ_PORT_REDIRECT\n"); + target->status = -ECONNRESET; + } + break; + + case IB_CM_REJ_DUPLICATE_LOCAL_COMM_ID: + shost_printk(KERN_WARNING, shost, + " REJ reason: IB_CM_REJ_DUPLICATE_LOCAL_COMM_ID\n"); + target->status = -ECONNRESET; + break; + + case IB_CM_REJ_CONSUMER_DEFINED: + opcode = *(u8 *) event->private_data; + if (opcode == SRP_LOGIN_REJ) { + struct srp_login_rej *rej = event->private_data; + u32 reason = be32_to_cpu(rej->reason); + + if (reason == SRP_LOGIN_REJ_REQ_IT_IU_LENGTH_TOO_LARGE) + shost_printk(KERN_WARNING, shost, + PFX "SRP_LOGIN_REJ: requested max_it_iu_len too large\n"); + else + shost_printk(KERN_WARNING, shost, PFX + "SRP LOGIN from %pI6 to %pI6 REJECTED, reason 0x%08x\n", + target->path.sgid.raw, + target->orig_dgid, reason); + } else + shost_printk(KERN_WARNING, shost, + " REJ reason: IB_CM_REJ_CONSUMER_DEFINED," + " opcode 0x%02x\n", opcode); + target->status = -ECONNRESET; + break; + + case IB_CM_REJ_STALE_CONN: + shost_printk(KERN_WARNING, shost, " REJ reason: stale connection\n"); + target->status = SRP_STALE_CONN; + break; + + default: + shost_printk(KERN_WARNING, shost, " REJ reason 0x%x\n", + event->param.rej_rcvd.reason); + target->status = -ECONNRESET; + } +} + +static int srp_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event) +{ + struct srp_target_port *target = cm_id->context; + int comp = 0; + + switch (event->event) { + case IB_CM_REQ_ERROR: + shost_printk(KERN_DEBUG, target->scsi_host, + PFX "Sending CM REQ failed\n"); + comp = 1; + target->status = -ECONNRESET; + break; + + case IB_CM_REP_RECEIVED: + comp = 1; + srp_cm_rep_handler(cm_id, event->private_data, target); + break; + + case IB_CM_REJ_RECEIVED: + shost_printk(KERN_DEBUG, target->scsi_host, PFX "REJ received\n"); + comp = 1; + + srp_cm_rej_handler(cm_id, event, target); + break; + + case IB_CM_DREQ_RECEIVED: + shost_printk(KERN_WARNING, target->scsi_host, + PFX "DREQ received - connection closed\n"); + srp_change_conn_state(target, false); + if (ib_send_cm_drep(cm_id, NULL, 0)) + shost_printk(KERN_ERR, target->scsi_host, + PFX "Sending CM DREP failed\n"); + queue_work(system_long_wq, &target->tl_err_work); + break; + + case IB_CM_TIMEWAIT_EXIT: + shost_printk(KERN_ERR, target->scsi_host, + PFX "connection closed\n"); + comp = 1; + + target->status = 0; + break; + + case IB_CM_MRA_RECEIVED: + case IB_CM_DREQ_ERROR: + case IB_CM_DREP_RECEIVED: + break; + + default: + shost_printk(KERN_WARNING, target->scsi_host, + PFX "Unhandled CM event %d\n", event->event); + break; + } + + if (comp) + complete(&target->done); + + return 0; +} + +/** + * srp_change_queue_type - changing device queue tag type + * @sdev: scsi device struct + * @tag_type: requested tag type + * + * Returns queue tag type. + */ +static int +srp_change_queue_type(struct scsi_device *sdev, int tag_type) +{ + if (sdev->tagged_supported) { + scsi_set_tag_type(sdev, tag_type); + if (tag_type) + scsi_activate_tcq(sdev, sdev->queue_depth); + else + scsi_deactivate_tcq(sdev, sdev->queue_depth); + } else + tag_type = 0; + + return tag_type; +} + +/** + * srp_change_queue_depth - setting device queue depth + * @sdev: scsi device struct + * @qdepth: requested queue depth + * @reason: SCSI_QDEPTH_DEFAULT/SCSI_QDEPTH_QFULL/SCSI_QDEPTH_RAMP_UP + * (see include/scsi/scsi_host.h for definition) + * + * Returns queue depth. + */ +static int +srp_change_queue_depth(struct scsi_device *sdev, int qdepth, int reason) +{ + struct Scsi_Host *shost = sdev->host; + int max_depth; + if (reason == SCSI_QDEPTH_DEFAULT || reason == SCSI_QDEPTH_RAMP_UP) { + max_depth = shost->can_queue; + if (!sdev->tagged_supported) + max_depth = 1; + if (qdepth > max_depth) + qdepth = max_depth; + scsi_adjust_queue_depth(sdev, scsi_get_tag_type(sdev), qdepth); + } else if (reason == SCSI_QDEPTH_QFULL) + scsi_track_queue_full(sdev, qdepth); + else + return -EOPNOTSUPP; + + return sdev->queue_depth; +} + +static int srp_send_tsk_mgmt(struct srp_target_port *target, + u64 req_tag, unsigned int lun, u8 func) +{ + struct srp_rport *rport = target->rport; + struct ib_device *dev = target->srp_host->srp_dev->dev; + struct srp_iu *iu; + struct srp_tsk_mgmt *tsk_mgmt; + + if (!target->connected || target->qp_in_error) + return -1; + + init_completion(&target->tsk_mgmt_done); + + /* + * Lock the rport mutex to avoid that srp_create_target_ib() is + * invoked while a task management function is being sent. + */ + mutex_lock(&rport->mutex); + spin_lock_irq(&target->lock); + iu = __srp_get_tx_iu(target, SRP_IU_TSK_MGMT); + spin_unlock_irq(&target->lock); + + if (!iu) { + mutex_unlock(&rport->mutex); + + return -1; + } + + ib_dma_sync_single_for_cpu(dev, iu->dma, sizeof *tsk_mgmt, + DMA_TO_DEVICE); + tsk_mgmt = iu->buf; + memset(tsk_mgmt, 0, sizeof *tsk_mgmt); + + tsk_mgmt->opcode = SRP_TSK_MGMT; + tsk_mgmt->lun = cpu_to_be64((u64) lun << 48); + tsk_mgmt->tag = req_tag | SRP_TAG_TSK_MGMT; + tsk_mgmt->tsk_mgmt_func = func; + tsk_mgmt->task_tag = req_tag; + + ib_dma_sync_single_for_device(dev, iu->dma, sizeof *tsk_mgmt, + DMA_TO_DEVICE); + if (srp_post_send(target, iu, sizeof *tsk_mgmt)) { + srp_put_tx_iu(target, iu, SRP_IU_TSK_MGMT); + mutex_unlock(&rport->mutex); + + return -1; + } + mutex_unlock(&rport->mutex); + + if (!wait_for_completion_timeout(&target->tsk_mgmt_done, + msecs_to_jiffies(SRP_ABORT_TIMEOUT_MS))) + return -1; + + return 0; +} + +static int srp_abort(struct scsi_cmnd *scmnd) +{ + struct srp_target_port *target = host_to_target(scmnd->device->host); + struct srp_request *req = (struct srp_request *) scmnd->host_scribble; + int ret; + + shost_printk(KERN_ERR, target->scsi_host, "SRP abort called\n"); + + if (!req || !srp_claim_req(target, req, NULL, scmnd)) + return SUCCESS; + if (srp_send_tsk_mgmt(target, req->index, scmnd->device->lun, + SRP_TSK_ABORT_TASK) == 0) + ret = SUCCESS; + else if (target->rport->state == SRP_RPORT_LOST) + ret = FAST_IO_FAIL; + else + ret = FAILED; + srp_free_req(target, req, scmnd, 0); + scmnd->result = DID_ABORT << 16; + scmnd->scsi_done(scmnd); + + return ret; +} + +static int srp_reset_device(struct scsi_cmnd *scmnd) +{ + struct srp_target_port *target = host_to_target(scmnd->device->host); + int i; + + shost_printk(KERN_ERR, target->scsi_host, "SRP reset_device called\n"); + + if (srp_send_tsk_mgmt(target, SRP_TAG_NO_REQ, scmnd->device->lun, + SRP_TSK_LUN_RESET)) + return FAILED; + if (target->tsk_mgmt_status) + return FAILED; + + for (i = 0; i < target->req_ring_size; ++i) { + struct srp_request *req = &target->req_ring[i]; + srp_finish_req(target, req, scmnd->device, DID_RESET << 16); + } + + return SUCCESS; +} + +static int srp_reset_host(struct scsi_cmnd *scmnd) +{ + struct srp_target_port *target = host_to_target(scmnd->device->host); + + shost_printk(KERN_ERR, target->scsi_host, PFX "SRP reset_host called\n"); + + return srp_reconnect_rport(target->rport) == 0 ? SUCCESS : FAILED; +} + +static int srp_slave_configure(struct scsi_device *sdev) +{ + struct Scsi_Host *shost = sdev->host; + struct srp_target_port *target = host_to_target(shost); + struct request_queue *q = sdev->request_queue; + unsigned long timeout; + + if (sdev->type == TYPE_DISK) { + timeout = max_t(unsigned, 30 * HZ, target->rq_tmo_jiffies); + blk_queue_rq_timeout(q, timeout); + } + + return 0; +} + +static ssize_t show_id_ext(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct srp_target_port *target = host_to_target(class_to_shost(dev)); + + return sprintf(buf, "0x%016llx\n", + (unsigned long long) be64_to_cpu(target->id_ext)); +} + +static ssize_t show_ioc_guid(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct srp_target_port *target = host_to_target(class_to_shost(dev)); + + return sprintf(buf, "0x%016llx\n", + (unsigned long long) be64_to_cpu(target->ioc_guid)); +} + +static ssize_t show_service_id(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct srp_target_port *target = host_to_target(class_to_shost(dev)); + + return sprintf(buf, "0x%016llx\n", + (unsigned long long) be64_to_cpu(target->service_id)); +} + +static ssize_t show_pkey(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct srp_target_port *target = host_to_target(class_to_shost(dev)); + + return sprintf(buf, "0x%04x\n", be16_to_cpu(target->path.pkey)); +} + +static ssize_t show_sgid(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct srp_target_port *target = host_to_target(class_to_shost(dev)); + + return sprintf(buf, "%pI6\n", target->path.sgid.raw); +} + +static ssize_t show_dgid(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct srp_target_port *target = host_to_target(class_to_shost(dev)); + + return sprintf(buf, "%pI6\n", target->path.dgid.raw); +} + +static ssize_t show_orig_dgid(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct srp_target_port *target = host_to_target(class_to_shost(dev)); + + return sprintf(buf, "%pI6\n", target->orig_dgid); +} + +static ssize_t show_req_lim(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct srp_target_port *target = host_to_target(class_to_shost(dev)); + + return sprintf(buf, "%d\n", target->req_lim); +} + +static ssize_t show_zero_req_lim(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct srp_target_port *target = host_to_target(class_to_shost(dev)); + + return sprintf(buf, "%d\n", target->zero_req_lim); +} + +static ssize_t show_local_ib_port(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct srp_target_port *target = host_to_target(class_to_shost(dev)); + + return sprintf(buf, "%d\n", target->srp_host->port); +} + +static ssize_t show_local_ib_device(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct srp_target_port *target = host_to_target(class_to_shost(dev)); + + return sprintf(buf, "%s\n", target->srp_host->srp_dev->dev->name); +} + +static ssize_t show_comp_vector(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct srp_target_port *target = host_to_target(class_to_shost(dev)); + + return sprintf(buf, "%d\n", target->comp_vector); +} + +static ssize_t show_tl_retry_count(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct srp_target_port *target = host_to_target(class_to_shost(dev)); + + return sprintf(buf, "%d\n", target->tl_retry_count); +} + +static ssize_t show_cmd_sg_entries(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct srp_target_port *target = host_to_target(class_to_shost(dev)); + + return sprintf(buf, "%u\n", target->cmd_sg_cnt); +} + +static ssize_t show_allow_ext_sg(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct srp_target_port *target = host_to_target(class_to_shost(dev)); + + return sprintf(buf, "%s\n", target->allow_ext_sg ? "true" : "false"); +} + +static DEVICE_ATTR(id_ext, S_IRUGO, show_id_ext, NULL); +static DEVICE_ATTR(ioc_guid, S_IRUGO, show_ioc_guid, NULL); +static DEVICE_ATTR(service_id, S_IRUGO, show_service_id, NULL); +static DEVICE_ATTR(pkey, S_IRUGO, show_pkey, NULL); +static DEVICE_ATTR(sgid, S_IRUGO, show_sgid, NULL); +static DEVICE_ATTR(dgid, S_IRUGO, show_dgid, NULL); +static DEVICE_ATTR(orig_dgid, S_IRUGO, show_orig_dgid, NULL); +static DEVICE_ATTR(req_lim, S_IRUGO, show_req_lim, NULL); +static DEVICE_ATTR(zero_req_lim, S_IRUGO, show_zero_req_lim, NULL); +static DEVICE_ATTR(local_ib_port, S_IRUGO, show_local_ib_port, NULL); +static DEVICE_ATTR(local_ib_device, S_IRUGO, show_local_ib_device, NULL); +static DEVICE_ATTR(comp_vector, S_IRUGO, show_comp_vector, NULL); +static DEVICE_ATTR(tl_retry_count, S_IRUGO, show_tl_retry_count, NULL); +static DEVICE_ATTR(cmd_sg_entries, S_IRUGO, show_cmd_sg_entries, NULL); +static DEVICE_ATTR(allow_ext_sg, S_IRUGO, show_allow_ext_sg, NULL); + +static struct device_attribute *srp_host_attrs[] = { + &dev_attr_id_ext, + &dev_attr_ioc_guid, + &dev_attr_service_id, + &dev_attr_pkey, + &dev_attr_sgid, + &dev_attr_dgid, + &dev_attr_orig_dgid, + &dev_attr_req_lim, + &dev_attr_zero_req_lim, + &dev_attr_local_ib_port, + &dev_attr_local_ib_device, + &dev_attr_comp_vector, + &dev_attr_tl_retry_count, + &dev_attr_cmd_sg_entries, + &dev_attr_allow_ext_sg, + NULL +}; + +static struct scsi_host_template srp_template = { + .module = THIS_MODULE, + .name = "InfiniBand SRP initiator", + .proc_name = DRV_NAME, + .slave_configure = srp_slave_configure, + .info = srp_target_info, + .queuecommand = srp_queuecommand, + .change_queue_depth = srp_change_queue_depth, + .change_queue_type = srp_change_queue_type, + .eh_abort_handler = srp_abort, + .eh_device_reset_handler = srp_reset_device, + .eh_host_reset_handler = srp_reset_host, + .skip_settle_delay = true, + .sg_tablesize = SRP_DEF_SG_TABLESIZE, + .can_queue = SRP_DEFAULT_CMD_SQ_SIZE, + .this_id = -1, + .cmd_per_lun = SRP_DEFAULT_CMD_SQ_SIZE, + .use_clustering = ENABLE_CLUSTERING, + .shost_attrs = srp_host_attrs +}; + +static int srp_add_target(struct srp_host *host, struct srp_target_port *target) +{ + struct srp_rport_identifiers ids; + struct srp_rport *rport; + + sprintf(target->target_name, "SRP.T10:%016llX", + (unsigned long long) be64_to_cpu(target->id_ext)); + + if (scsi_add_host(target->scsi_host, host->srp_dev->dev->dma_device)) + return -ENODEV; + + memcpy(ids.port_id, &target->id_ext, 8); + memcpy(ids.port_id + 8, &target->ioc_guid, 8); + ids.roles = SRP_RPORT_ROLE_TARGET; + rport = srp_rport_add(target->scsi_host, &ids); + if (IS_ERR(rport)) { + scsi_remove_host(target->scsi_host); + return PTR_ERR(rport); + } + + rport->lld_data = target; + target->rport = rport; + + spin_lock(&host->target_lock); + list_add_tail(&target->list, &host->target_list); + spin_unlock(&host->target_lock); + + target->state = SRP_TARGET_LIVE; + + scsi_scan_target(&target->scsi_host->shost_gendev, + 0, target->scsi_id, SCAN_WILD_CARD, 0); + + return 0; +} + +static void srp_release_dev(struct device *dev) +{ + struct srp_host *host = + container_of(dev, struct srp_host, dev); + + complete(&host->released); +} + +static struct class srp_class = { + .name = "infiniband_srp", + .dev_release = srp_release_dev +}; + +/** + * srp_conn_unique() - check whether the connection to a target is unique + * @host: SRP host. + * @target: SRP target port. + */ +static bool srp_conn_unique(struct srp_host *host, + struct srp_target_port *target) +{ + struct srp_target_port *t; + bool ret = false; + + if (target->state == SRP_TARGET_REMOVED) + goto out; + + ret = true; + + spin_lock(&host->target_lock); + list_for_each_entry(t, &host->target_list, list) { + if (t != target && + target->id_ext == t->id_ext && + target->ioc_guid == t->ioc_guid && + target->initiator_ext == t->initiator_ext) { + ret = false; + break; + } + } + spin_unlock(&host->target_lock); + +out: + return ret; +} + +/* + * Target ports are added by writing + * + * id_ext=,ioc_guid=,dgid=, + * pkey=,service_id= + * + * to the add_target sysfs attribute. + */ +enum { + SRP_OPT_ERR = 0, + SRP_OPT_ID_EXT = 1 << 0, + SRP_OPT_IOC_GUID = 1 << 1, + SRP_OPT_DGID = 1 << 2, + SRP_OPT_PKEY = 1 << 3, + SRP_OPT_SERVICE_ID = 1 << 4, + SRP_OPT_MAX_SECT = 1 << 5, + SRP_OPT_MAX_CMD_PER_LUN = 1 << 6, + SRP_OPT_IO_CLASS = 1 << 7, + SRP_OPT_INITIATOR_EXT = 1 << 8, + SRP_OPT_CMD_SG_ENTRIES = 1 << 9, + SRP_OPT_ALLOW_EXT_SG = 1 << 10, + SRP_OPT_SG_TABLESIZE = 1 << 11, + SRP_OPT_COMP_VECTOR = 1 << 12, + SRP_OPT_TL_RETRY_COUNT = 1 << 13, + SRP_OPT_QUEUE_SIZE = 1 << 14, + SRP_OPT_ALL = (SRP_OPT_ID_EXT | + SRP_OPT_IOC_GUID | + SRP_OPT_DGID | + SRP_OPT_PKEY | + SRP_OPT_SERVICE_ID), +}; + +static const match_table_t srp_opt_tokens = { + { SRP_OPT_ID_EXT, "id_ext=%s" }, + { SRP_OPT_IOC_GUID, "ioc_guid=%s" }, + { SRP_OPT_DGID, "dgid=%s" }, + { SRP_OPT_PKEY, "pkey=%x" }, + { SRP_OPT_SERVICE_ID, "service_id=%s" }, + { SRP_OPT_MAX_SECT, "max_sect=%d" }, + { SRP_OPT_MAX_CMD_PER_LUN, "max_cmd_per_lun=%d" }, + { SRP_OPT_IO_CLASS, "io_class=%x" }, + { SRP_OPT_INITIATOR_EXT, "initiator_ext=%s" }, + { SRP_OPT_CMD_SG_ENTRIES, "cmd_sg_entries=%u" }, + { SRP_OPT_ALLOW_EXT_SG, "allow_ext_sg=%u" }, + { SRP_OPT_SG_TABLESIZE, "sg_tablesize=%u" }, + { SRP_OPT_COMP_VECTOR, "comp_vector=%u" }, + { SRP_OPT_TL_RETRY_COUNT, "tl_retry_count=%u" }, + { SRP_OPT_QUEUE_SIZE, "queue_size=%d" }, + { SRP_OPT_ERR, NULL } +}; + +static int srp_parse_options(const char *buf, struct srp_target_port *target) +{ + char *options, *sep_opt; + char *p; + char dgid[3]; + substring_t args[MAX_OPT_ARGS]; + int opt_mask = 0; + int token; + int ret = -EINVAL; + int i; + + options = kstrdup(buf, GFP_KERNEL); + if (!options) + return -ENOMEM; + + sep_opt = options; + while ((p = strsep(&sep_opt, ",")) != NULL) { + if (!*p) + continue; + + token = match_token(p, srp_opt_tokens, args); + opt_mask |= token; + + switch (token) { + case SRP_OPT_ID_EXT: + p = match_strdup(args); + if (!p) { + ret = -ENOMEM; + goto out; + } + target->id_ext = cpu_to_be64(simple_strtoull(p, NULL, 16)); + kfree(p); + break; + + case SRP_OPT_IOC_GUID: + p = match_strdup(args); + if (!p) { + ret = -ENOMEM; + goto out; + } + target->ioc_guid = cpu_to_be64(simple_strtoull(p, NULL, 16)); + kfree(p); + break; + + case SRP_OPT_DGID: + p = match_strdup(args); + if (!p) { + ret = -ENOMEM; + goto out; + } + if (strlen(p) != 32) { + pr_warn("bad dest GID parameter '%s'\n", p); + kfree(p); + goto out; + } + + for (i = 0; i < 16; ++i) { + strlcpy(dgid, p + i * 2, 3); + target->path.dgid.raw[i] = simple_strtoul(dgid, NULL, 16); + } + kfree(p); + memcpy(target->orig_dgid, target->path.dgid.raw, 16); + break; + + case SRP_OPT_PKEY: + if (match_hex(args, &token)) { + pr_warn("bad P_Key parameter '%s'\n", p); + goto out; + } + target->path.pkey = cpu_to_be16(token); + break; + + case SRP_OPT_SERVICE_ID: + p = match_strdup(args); + if (!p) { + ret = -ENOMEM; + goto out; + } + target->service_id = cpu_to_be64(simple_strtoull(p, NULL, 16)); + target->path.service_id = target->service_id; + kfree(p); + break; + + case SRP_OPT_MAX_SECT: + if (match_int(args, &token)) { + pr_warn("bad max sect parameter '%s'\n", p); + goto out; + } + target->scsi_host->max_sectors = token; + break; + + case SRP_OPT_QUEUE_SIZE: + if (match_int(args, &token) || token < 1) { + pr_warn("bad queue_size parameter '%s'\n", p); + goto out; + } + target->scsi_host->can_queue = token; + target->queue_size = token + SRP_RSP_SQ_SIZE + + SRP_TSK_MGMT_SQ_SIZE; + if (!(opt_mask & SRP_OPT_MAX_CMD_PER_LUN)) + target->scsi_host->cmd_per_lun = token; + break; + + case SRP_OPT_MAX_CMD_PER_LUN: + if (match_int(args, &token) || token < 1) { + pr_warn("bad max cmd_per_lun parameter '%s'\n", + p); + goto out; + } + target->scsi_host->cmd_per_lun = token; + break; + + case SRP_OPT_IO_CLASS: + if (match_hex(args, &token)) { + pr_warn("bad IO class parameter '%s'\n", p); + goto out; + } + if (token != SRP_REV10_IB_IO_CLASS && + token != SRP_REV16A_IB_IO_CLASS) { + pr_warn("unknown IO class parameter value %x specified (use %x or %x).\n", + token, SRP_REV10_IB_IO_CLASS, + SRP_REV16A_IB_IO_CLASS); + goto out; + } + target->io_class = token; + break; + + case SRP_OPT_INITIATOR_EXT: + p = match_strdup(args); + if (!p) { + ret = -ENOMEM; + goto out; + } + target->initiator_ext = cpu_to_be64(simple_strtoull(p, NULL, 16)); + kfree(p); + break; + + case SRP_OPT_CMD_SG_ENTRIES: + if (match_int(args, &token) || token < 1 || token > 255) { + pr_warn("bad max cmd_sg_entries parameter '%s'\n", + p); + goto out; + } + target->cmd_sg_cnt = token; + break; + + case SRP_OPT_ALLOW_EXT_SG: + if (match_int(args, &token)) { + pr_warn("bad allow_ext_sg parameter '%s'\n", p); + goto out; + } + target->allow_ext_sg = !!token; + break; + + case SRP_OPT_SG_TABLESIZE: + if (match_int(args, &token) || token < 1 || + token > SCSI_MAX_SG_CHAIN_SEGMENTS) { + pr_warn("bad max sg_tablesize parameter '%s'\n", + p); + goto out; + } + target->sg_tablesize = token; + break; + + case SRP_OPT_COMP_VECTOR: + if (match_int(args, &token) || token < 0) { + pr_warn("bad comp_vector parameter '%s'\n", p); + goto out; + } + target->comp_vector = token; + break; + + case SRP_OPT_TL_RETRY_COUNT: + if (match_int(args, &token) || token < 2 || token > 7) { + pr_warn("bad tl_retry_count parameter '%s' (must be a number between 2 and 7)\n", + p); + goto out; + } + target->tl_retry_count = token; + break; + + default: + pr_warn("unknown parameter or missing value '%s' in target creation request\n", + p); + goto out; + } + } + + if ((opt_mask & SRP_OPT_ALL) == SRP_OPT_ALL) + ret = 0; + else + for (i = 0; i < ARRAY_SIZE(srp_opt_tokens); ++i) + if ((srp_opt_tokens[i].token & SRP_OPT_ALL) && + !(srp_opt_tokens[i].token & opt_mask)) + pr_warn("target creation request is missing parameter '%s'\n", + srp_opt_tokens[i].pattern); + + if (target->scsi_host->cmd_per_lun > target->scsi_host->can_queue + && (opt_mask & SRP_OPT_MAX_CMD_PER_LUN)) + pr_warn("cmd_per_lun = %d > queue_size = %d\n", + target->scsi_host->cmd_per_lun, + target->scsi_host->can_queue); + +out: + kfree(options); + return ret; +} + +static ssize_t srp_create_target(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct srp_host *host = + container_of(dev, struct srp_host, dev); + struct Scsi_Host *target_host; + struct srp_target_port *target; + struct srp_device *srp_dev = host->srp_dev; + struct ib_device *ibdev = srp_dev->dev; + int ret; + + target_host = scsi_host_alloc(&srp_template, + sizeof (struct srp_target_port)); + if (!target_host) + return -ENOMEM; + + target_host->transportt = ib_srp_transport_template; + target_host->max_channel = 0; + target_host->max_id = 1; + target_host->max_lun = SRP_MAX_LUN; + target_host->max_cmd_len = sizeof ((struct srp_cmd *) (void *) 0L)->cdb; + + target = host_to_target(target_host); + + target->io_class = SRP_REV16A_IB_IO_CLASS; + target->scsi_host = target_host; + target->srp_host = host; + target->lkey = host->srp_dev->mr->lkey; + target->rkey = host->srp_dev->mr->rkey; + target->cmd_sg_cnt = cmd_sg_entries; + target->sg_tablesize = indirect_sg_entries ? : cmd_sg_entries; + target->allow_ext_sg = allow_ext_sg; + target->tl_retry_count = 7; + target->queue_size = SRP_DEFAULT_QUEUE_SIZE; + + mutex_lock(&host->add_target_mutex); + + ret = srp_parse_options(buf, target); + if (ret) + goto err; + + target->req_ring_size = target->queue_size - SRP_TSK_MGMT_SQ_SIZE; + + if (!srp_conn_unique(target->srp_host, target)) { + shost_printk(KERN_INFO, target->scsi_host, + PFX "Already connected to target port with id_ext=%016llx;ioc_guid=%016llx;initiator_ext=%016llx\n", + be64_to_cpu(target->id_ext), + be64_to_cpu(target->ioc_guid), + be64_to_cpu(target->initiator_ext)); + ret = -EEXIST; + goto err; + } + + if (!srp_dev->has_fmr && !srp_dev->has_fr && !target->allow_ext_sg && + target->cmd_sg_cnt < target->sg_tablesize) { + pr_warn("No MR pool and no external indirect descriptors, limiting sg_tablesize to cmd_sg_cnt\n"); + target->sg_tablesize = target->cmd_sg_cnt; + } + + target_host->sg_tablesize = target->sg_tablesize; + target->indirect_size = target->sg_tablesize * + sizeof (struct srp_direct_buf); + target->max_iu_len = sizeof (struct srp_cmd) + + sizeof (struct srp_indirect_buf) + + target->cmd_sg_cnt * sizeof (struct srp_direct_buf); + + INIT_WORK(&target->tl_err_work, srp_tl_err_work); + INIT_WORK(&target->remove_work, srp_remove_work); + spin_lock_init(&target->lock); + INIT_LIST_HEAD(&target->free_tx); + ret = srp_alloc_req_data(target); + if (ret) + goto err_free_mem; + + ret = ib_query_gid(ibdev, host->port, 0, &target->path.sgid); + if (ret) + goto err_free_mem; + + ret = srp_create_target_ib(target); + if (ret) + goto err_free_mem; + + ret = srp_new_cm_id(target); + if (ret) + goto err_free_ib; + + ret = srp_connect_target(target); + if (ret) { + shost_printk(KERN_ERR, target->scsi_host, + PFX "Connection failed\n"); + goto err_cm_id; + } + + ret = srp_add_target(host, target); + if (ret) + goto err_disconnect; + + shost_printk(KERN_DEBUG, target->scsi_host, PFX + "new target: id_ext %016llx ioc_guid %016llx pkey %04x service_id %016llx sgid %pI6 dgid %pI6\n", + be64_to_cpu(target->id_ext), + be64_to_cpu(target->ioc_guid), + be16_to_cpu(target->path.pkey), + be64_to_cpu(target->service_id), + target->path.sgid.raw, target->path.dgid.raw); + + ret = count; + +out: + mutex_unlock(&host->add_target_mutex); + return ret; + +err_disconnect: + srp_disconnect_target(target); + +err_cm_id: + ib_destroy_cm_id(target->cm_id); + +err_free_ib: + srp_free_target_ib(target); + +err_free_mem: + srp_free_req_data(target); + +err: + scsi_host_put(target_host); + goto out; +} + +static DEVICE_ATTR(add_target, S_IWUSR, NULL, srp_create_target); + +static ssize_t show_ibdev(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct srp_host *host = container_of(dev, struct srp_host, dev); + + return sprintf(buf, "%s\n", host->srp_dev->dev->name); +} + +static DEVICE_ATTR(ibdev, S_IRUGO, show_ibdev, NULL); + +static ssize_t show_port(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct srp_host *host = container_of(dev, struct srp_host, dev); + + return sprintf(buf, "%d\n", host->port); +} + +static DEVICE_ATTR(port, S_IRUGO, show_port, NULL); + +static struct srp_host *srp_add_port(struct srp_device *device, u8 port) +{ + struct srp_host *host; + + host = kzalloc(sizeof *host, GFP_KERNEL); + if (!host) + return NULL; + + INIT_LIST_HEAD(&host->target_list); + spin_lock_init(&host->target_lock); + init_completion(&host->released); + mutex_init(&host->add_target_mutex); + host->srp_dev = device; + host->port = port; + + host->dev.class = &srp_class; + host->dev.parent = device->dev->dma_device; + dev_set_name(&host->dev, "srp-%s-%d", device->dev->name, port); + + if (device_register(&host->dev)) + goto free_host; + if (device_create_file(&host->dev, &dev_attr_add_target)) + goto err_class; + if (device_create_file(&host->dev, &dev_attr_ibdev)) + goto err_class; + if (device_create_file(&host->dev, &dev_attr_port)) + goto err_class; + + return host; + +err_class: + device_unregister(&host->dev); + +free_host: + kfree(host); + + return NULL; +} + +static void srp_add_one(struct ib_device *device) +{ + struct srp_device *srp_dev; + struct ib_device_attr *dev_attr; + struct srp_host *host; + int mr_page_shift, s, e, p; + u64 max_pages_per_mr; + + dev_attr = kmalloc(sizeof *dev_attr, GFP_KERNEL); + if (!dev_attr) + return; + + if (ib_query_device(device, dev_attr)) { + pr_warn("Query device failed for %s\n", device->name); + goto free_attr; + } + + srp_dev = kmalloc(sizeof *srp_dev, GFP_KERNEL); + if (!srp_dev) + goto free_attr; + + srp_dev->has_fmr = (device->alloc_fmr && device->dealloc_fmr && + device->map_phys_fmr && device->unmap_fmr); + srp_dev->has_fr = (dev_attr->device_cap_flags & + IB_DEVICE_MEM_MGT_EXTENSIONS); + if (!srp_dev->has_fmr && !srp_dev->has_fr) + dev_warn(&device->dev, "neither FMR nor FR is supported\n"); + + srp_dev->use_fast_reg = (srp_dev->has_fr && + (!srp_dev->has_fmr || prefer_fr)); + + /* + * Use the smallest page size supported by the HCA, down to a + * minimum of 4096 bytes. We're unlikely to build large sglists + * out of smaller entries. + */ + mr_page_shift = max(12, ffs(dev_attr->page_size_cap) - 1); + srp_dev->mr_page_size = 1 << mr_page_shift; + srp_dev->mr_page_mask = ~((u64) srp_dev->mr_page_size - 1); + max_pages_per_mr = dev_attr->max_mr_size; + do_div(max_pages_per_mr, srp_dev->mr_page_size); + srp_dev->max_pages_per_mr = min_t(u64, SRP_MAX_PAGES_PER_MR, + max_pages_per_mr); + if (srp_dev->use_fast_reg) { + srp_dev->max_pages_per_mr = + min_t(u32, srp_dev->max_pages_per_mr, + dev_attr->max_fast_reg_page_list_len); + } + srp_dev->mr_max_size = srp_dev->mr_page_size * + srp_dev->max_pages_per_mr; + pr_debug("%s: mr_page_shift = %d, dev_attr->max_mr_size = %#llx, dev_attr->max_fast_reg_page_list_len = %u, max_pages_per_mr = %d, mr_max_size = %#x\n", + device->name, mr_page_shift, dev_attr->max_mr_size, + dev_attr->max_fast_reg_page_list_len, + srp_dev->max_pages_per_mr, srp_dev->mr_max_size); + + INIT_LIST_HEAD(&srp_dev->dev_list); + + srp_dev->dev = device; + srp_dev->pd = ib_alloc_pd(device); + if (IS_ERR(srp_dev->pd)) + goto free_dev; + + srp_dev->mr = ib_get_dma_mr(srp_dev->pd, + IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_READ | + IB_ACCESS_REMOTE_WRITE); + if (IS_ERR(srp_dev->mr)) + goto err_pd; + + if (device->node_type == RDMA_NODE_IB_SWITCH) { + s = 0; + e = 0; + } else { + s = 1; + e = device->phys_port_cnt; + } + + for (p = s; p <= e; ++p) { + host = srp_add_port(srp_dev, p); + if (host) + list_add_tail(&host->list, &srp_dev->dev_list); + } + + ib_set_client_data(device, &srp_client, srp_dev); + + goto free_attr; + +err_pd: + ib_dealloc_pd(srp_dev->pd); + +free_dev: + kfree(srp_dev); + +free_attr: + kfree(dev_attr); +} + +static void srp_remove_one(struct ib_device *device) +{ + struct srp_device *srp_dev; + struct srp_host *host, *tmp_host; + struct srp_target_port *target; + + srp_dev = ib_get_client_data(device, &srp_client); + if (!srp_dev) + return; + + list_for_each_entry_safe(host, tmp_host, &srp_dev->dev_list, list) { + device_unregister(&host->dev); + /* + * Wait for the sysfs entry to go away, so that no new + * target ports can be created. + */ + wait_for_completion(&host->released); + + /* + * Remove all target ports. + */ + spin_lock(&host->target_lock); + list_for_each_entry(target, &host->target_list, list) + srp_queue_remove_work(target); + spin_unlock(&host->target_lock); + + /* + * Wait for tl_err and target port removal tasks. + */ + flush_workqueue(system_long_wq); + flush_workqueue(srp_remove_wq); + + kfree(host); + } + + ib_dereg_mr(srp_dev->mr); + ib_dealloc_pd(srp_dev->pd); + + kfree(srp_dev); +} + +static struct srp_function_template ib_srp_transport_functions = { + .has_rport_state = true, + .reset_timer_if_blocked = true, + .reconnect_delay = &srp_reconnect_delay, + .fast_io_fail_tmo = &srp_fast_io_fail_tmo, + .dev_loss_tmo = &srp_dev_loss_tmo, + .reconnect = srp_rport_reconnect, + .rport_delete = srp_rport_delete, + .terminate_rport_io = srp_terminate_io, +}; + +static int __init srp_init_module(void) +{ + int ret; + + BUILD_BUG_ON(FIELD_SIZEOF(struct ib_wc, wr_id) < sizeof(void *)); + + if (srp_sg_tablesize) { + pr_warn("srp_sg_tablesize is deprecated, please use cmd_sg_entries\n"); + if (!cmd_sg_entries) + cmd_sg_entries = srp_sg_tablesize; + } + + if (!cmd_sg_entries) + cmd_sg_entries = SRP_DEF_SG_TABLESIZE; + + if (cmd_sg_entries > 255) { + pr_warn("Clamping cmd_sg_entries to 255\n"); + cmd_sg_entries = 255; + } + + if (!indirect_sg_entries) + indirect_sg_entries = cmd_sg_entries; + else if (indirect_sg_entries < cmd_sg_entries) { + pr_warn("Bumping up indirect_sg_entries to match cmd_sg_entries (%u)\n", + cmd_sg_entries); + indirect_sg_entries = cmd_sg_entries; + } + + srp_remove_wq = create_workqueue("srp_remove"); + if (!srp_remove_wq) { + ret = -ENOMEM; + goto out; + } + + ret = -ENOMEM; + ib_srp_transport_template = + srp_attach_transport(&ib_srp_transport_functions); + if (!ib_srp_transport_template) + goto destroy_wq; + + ret = class_register(&srp_class); + if (ret) { + pr_err("couldn't register class infiniband_srp\n"); + goto release_tr; + } + + ib_sa_register_client(&srp_sa_client); + + ret = ib_register_client(&srp_client); + if (ret) { + pr_err("couldn't register IB client\n"); + goto unreg_sa; + } + +out: + return ret; + +unreg_sa: + ib_sa_unregister_client(&srp_sa_client); + class_unregister(&srp_class); + +release_tr: + srp_release_transport(ib_srp_transport_template); + +destroy_wq: + destroy_workqueue(srp_remove_wq); + goto out; +} + +static void __exit srp_cleanup_module(void) +{ + ib_unregister_client(&srp_client); + ib_sa_unregister_client(&srp_sa_client); + class_unregister(&srp_class); + srp_release_transport(ib_srp_transport_template); + destroy_workqueue(srp_remove_wq); +} + +module_init(srp_init_module); +module_exit(srp_cleanup_module); diff --git a/drivers/infiniband/ulp/srp/ib_srp.h b/drivers/infiniband/ulp/srp/ib_srp.h new file mode 100644 index 0000000..e46ecb1 --- /dev/null +++ b/drivers/infiniband/ulp/srp/ib_srp.h @@ -0,0 +1,279 @@ +/* + * Copyright (c) 2005 Cisco Systems. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef IB_SRP_H +#define IB_SRP_H + +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include + +enum { + SRP_PATH_REC_TIMEOUT_MS = 1000, + SRP_ABORT_TIMEOUT_MS = 5000, + + SRP_PORT_REDIRECT = 1, + SRP_DLID_REDIRECT = 2, + SRP_STALE_CONN = 3, + + SRP_MAX_LUN = 512, + SRP_DEF_SG_TABLESIZE = 12, + + SRP_DEFAULT_QUEUE_SIZE = 1 << 6, + SRP_RSP_SQ_SIZE = 1, + SRP_TSK_MGMT_SQ_SIZE = 1, + SRP_DEFAULT_CMD_SQ_SIZE = SRP_DEFAULT_QUEUE_SIZE - SRP_RSP_SQ_SIZE - + SRP_TSK_MGMT_SQ_SIZE, + + SRP_TAG_NO_REQ = ~0U, + SRP_TAG_TSK_MGMT = 1U << 31, + + SRP_MAX_PAGES_PER_MR = 512, + + LOCAL_INV_WR_ID_MASK = 1, + FAST_REG_WR_ID_MASK = 2, +}; + +enum srp_target_state { + SRP_TARGET_LIVE, + SRP_TARGET_REMOVED, +}; + +enum srp_iu_type { + SRP_IU_CMD, + SRP_IU_TSK_MGMT, + SRP_IU_RSP, +}; + +/* + * @mr_page_mask: HCA memory registration page mask. + * @mr_page_size: HCA memory registration page size. + * @mr_max_size: Maximum size in bytes of a single FMR / FR registration + * request. + */ +struct srp_device { + struct list_head dev_list; + struct ib_device *dev; + struct ib_pd *pd; + struct ib_mr *mr; + u64 mr_page_mask; + int mr_page_size; + int mr_max_size; + int max_pages_per_mr; + bool has_fmr; + bool has_fr; + bool use_fast_reg; +}; + +struct srp_host { + struct srp_device *srp_dev; + u8 port; + struct device dev; + struct list_head target_list; + spinlock_t target_lock; + struct completion released; + struct list_head list; + struct mutex add_target_mutex; +}; + +struct srp_request { + struct list_head list; + struct scsi_cmnd *scmnd; + struct srp_iu *cmd; + union { + struct ib_pool_fmr **fmr_list; + struct srp_fr_desc **fr_list; + }; + u64 *map_page; + struct srp_direct_buf *indirect_desc; + dma_addr_t indirect_dma_addr; + short nmdesc; + short index; +}; + +struct srp_target_port { + /* These are RW in the hot path, and commonly used together */ + struct list_head free_tx; + struct list_head free_reqs; + spinlock_t lock; + s32 req_lim; + + /* These are read-only in the hot path */ + struct ib_cq *send_cq ____cacheline_aligned_in_smp; + struct ib_cq *recv_cq; + struct ib_qp *qp; + union { + struct ib_fmr_pool *fmr_pool; + struct srp_fr_pool *fr_pool; + }; + u32 lkey; + u32 rkey; + enum srp_target_state state; + unsigned int max_iu_len; + unsigned int cmd_sg_cnt; + unsigned int indirect_size; + bool allow_ext_sg; + + /* Everything above this point is used in the hot path of + * command processing. Try to keep them packed into cachelines. + */ + + __be64 id_ext; + __be64 ioc_guid; + __be64 service_id; + __be64 initiator_ext; + u16 io_class; + struct srp_host *srp_host; + struct Scsi_Host *scsi_host; + struct srp_rport *rport; + char target_name[32]; + unsigned int scsi_id; + unsigned int sg_tablesize; + int queue_size; + int req_ring_size; + int comp_vector; + int tl_retry_count; + + struct ib_sa_path_rec path; + __be16 orig_dgid[8]; + struct ib_sa_query *path_query; + int path_query_id; + + u32 rq_tmo_jiffies; + bool connected; + + struct ib_cm_id *cm_id; + + int max_ti_iu_len; + + int zero_req_lim; + + struct srp_iu **tx_ring; + struct srp_iu **rx_ring; + struct srp_request *req_ring; + + struct work_struct tl_err_work; + struct work_struct remove_work; + + struct list_head list; + struct completion done; + int status; + bool qp_in_error; + + struct completion tsk_mgmt_done; + u8 tsk_mgmt_status; +}; + +struct srp_iu { + struct list_head list; + u64 dma; + void *buf; + size_t size; + enum dma_data_direction direction; +}; + +/** + * struct srp_fr_desc - fast registration work request arguments + * @entry: Entry in srp_fr_pool.free_list. + * @mr: Memory region. + * @frpl: Fast registration page list. + */ +struct srp_fr_desc { + struct list_head entry; + struct ib_mr *mr; + struct ib_fast_reg_page_list *frpl; +}; + +/** + * struct srp_fr_pool - pool of fast registration descriptors + * + * An entry is available for allocation if and only if it occurs in @free_list. + * + * @size: Number of descriptors in this pool. + * @max_page_list_len: Maximum fast registration work request page list length. + * @lock: Protects free_list. + * @free_list: List of free descriptors. + * @desc: Fast registration descriptor pool. + */ +struct srp_fr_pool { + int size; + int max_page_list_len; + spinlock_t lock; + struct list_head free_list; + struct srp_fr_desc desc[0]; +}; + +/** + * struct srp_map_state - per-request DMA memory mapping state + * @desc: Pointer to the element of the SRP buffer descriptor array + * that is being filled in. + * @pages: Array with DMA addresses of pages being considered for + * memory registration. + * @base_dma_addr: DMA address of the first page that has not yet been mapped. + * @dma_len: Number of bytes that will be registered with the next + * FMR or FR memory registration call. + * @total_len: Total number of bytes in the sg-list being mapped. + * @npages: Number of page addresses in the pages[] array. + * @nmdesc: Number of FMR or FR memory descriptors used for mapping. + * @ndesc: Number of SRP buffer descriptors that have been filled in. + * @unmapped_sg: First element of the sg-list that is mapped via FMR or FR. + * @unmapped_index: Index of the first element mapped via FMR or FR. + * @unmapped_addr: DMA address of the first element mapped via FMR or FR. + */ +struct srp_map_state { + union { + struct ib_pool_fmr **next_fmr; + struct srp_fr_desc **next_fr; + }; + struct srp_direct_buf *desc; + u64 *pages; + dma_addr_t base_dma_addr; + u32 dma_len; + u32 total_len; + unsigned int npages; + unsigned int nmdesc; + unsigned int ndesc; + struct scatterlist *unmapped_sg; + int unmapped_index; + dma_addr_t unmapped_addr; +}; + +#endif /* IB_SRP_H */ diff --git a/drivers/infiniband/ulp/srpt/Kconfig b/drivers/infiniband/ulp/srpt/Kconfig new file mode 100644 index 0000000..31ee83d --- /dev/null +++ b/drivers/infiniband/ulp/srpt/Kconfig @@ -0,0 +1,12 @@ +config INFINIBAND_SRPT + tristate "InfiniBand SCSI RDMA Protocol target support" + depends on INFINIBAND && TARGET_CORE + ---help--- + + Support for the SCSI RDMA Protocol (SRP) Target driver. The + SRP protocol is a protocol that allows an initiator to access + a block storage device on another host (target) over a network + that supports the RDMA protocol. Currently the RDMA protocol is + supported by InfiniBand and by iWarp network hardware. More + information about the SRP protocol can be found on the website + of the INCITS T10 technical committee (http://www.t10.org/). diff --git a/drivers/infiniband/ulp/srpt/Makefile b/drivers/infiniband/ulp/srpt/Makefile new file mode 100644 index 0000000..e3ee4bd --- /dev/null +++ b/drivers/infiniband/ulp/srpt/Makefile @@ -0,0 +1,2 @@ +ccflags-y := -Idrivers/target +obj-$(CONFIG_INFINIBAND_SRPT) += ib_srpt.o diff --git a/drivers/infiniband/ulp/srpt/ib_dm_mad.h b/drivers/infiniband/ulp/srpt/ib_dm_mad.h new file mode 100644 index 0000000..fb1de1f --- /dev/null +++ b/drivers/infiniband/ulp/srpt/ib_dm_mad.h @@ -0,0 +1,139 @@ +/* + * Copyright (c) 2006 - 2009 Mellanox Technology Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef IB_DM_MAD_H +#define IB_DM_MAD_H + +#include + +#include + +enum { + /* + * See also section 13.4.7 Status Field, table 115 MAD Common Status + * Field Bit Values and also section 16.3.1.1 Status Field in the + * InfiniBand Architecture Specification. + */ + DM_MAD_STATUS_UNSUP_METHOD = 0x0008, + DM_MAD_STATUS_UNSUP_METHOD_ATTR = 0x000c, + DM_MAD_STATUS_INVALID_FIELD = 0x001c, + DM_MAD_STATUS_NO_IOC = 0x0100, + + /* + * See also the Device Management chapter, section 16.3.3 Attributes, + * table 279 Device Management Attributes in the InfiniBand + * Architecture Specification. + */ + DM_ATTR_CLASS_PORT_INFO = 0x01, + DM_ATTR_IOU_INFO = 0x10, + DM_ATTR_IOC_PROFILE = 0x11, + DM_ATTR_SVC_ENTRIES = 0x12 +}; + +struct ib_dm_hdr { + u8 reserved[28]; +}; + +/* + * Structure of management datagram sent by the SRP target implementation. + * Contains a management datagram header, reliable multi-packet transaction + * protocol (RMPP) header and ib_dm_hdr. Notes: + * - The SRP target implementation does not use RMPP or ib_dm_hdr when sending + * management datagrams. + * - The header size must be exactly 64 bytes (IB_MGMT_DEVICE_HDR), since this + * is the header size that is passed to ib_create_send_mad() in ib_srpt.c. + * - The maximum supported size for a management datagram when not using RMPP + * is 256 bytes -- 64 bytes header and 192 (IB_MGMT_DEVICE_DATA) bytes data. + */ +struct ib_dm_mad { + struct ib_mad_hdr mad_hdr; + struct ib_rmpp_hdr rmpp_hdr; + struct ib_dm_hdr dm_hdr; + u8 data[IB_MGMT_DEVICE_DATA]; +}; + +/* + * IOUnitInfo as defined in section 16.3.3.3 IOUnitInfo of the InfiniBand + * Architecture Specification. + */ +struct ib_dm_iou_info { + __be16 change_id; + u8 max_controllers; + u8 op_rom; + u8 controller_list[128]; +}; + +/* + * IOControllerprofile as defined in section 16.3.3.4 IOControllerProfile of + * the InfiniBand Architecture Specification. + */ +struct ib_dm_ioc_profile { + __be64 guid; + __be32 vendor_id; + __be32 device_id; + __be16 device_version; + __be16 reserved1; + __be32 subsys_vendor_id; + __be32 subsys_device_id; + __be16 io_class; + __be16 io_subclass; + __be16 protocol; + __be16 protocol_version; + __be16 service_conn; + __be16 initiators_supported; + __be16 send_queue_depth; + u8 reserved2; + u8 rdma_read_depth; + __be32 send_size; + __be32 rdma_size; + u8 op_cap_mask; + u8 svc_cap_mask; + u8 num_svc_entries; + u8 reserved3[9]; + u8 id_string[64]; +}; + +struct ib_dm_svc_entry { + u8 name[40]; + __be64 id; +}; + +/* + * See also section 16.3.3.5 ServiceEntries in the InfiniBand Architecture + * Specification. See also section B.7, table B.8 in the T10 SRP r16a document. + */ +struct ib_dm_svc_entries { + struct ib_dm_svc_entry service_entries[4]; +}; + +#endif diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c new file mode 100644 index 0000000..dc82968 --- /dev/null +++ b/drivers/infiniband/ulp/srpt/ib_srpt.c @@ -0,0 +1,4051 @@ +/* + * Copyright (c) 2006 - 2009 Mellanox Technology Inc. All rights reserved. + * Copyright (C) 2008 - 2011 Bart Van Assche . + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ib_srpt.h" + +/* Name of this kernel module. */ +#define DRV_NAME "ib_srpt" +#define DRV_VERSION "2.0.0" +#define DRV_RELDATE "2011-02-14" + +#define SRPT_ID_STRING "Linux SRP target" + +#undef pr_fmt +#define pr_fmt(fmt) DRV_NAME " " fmt + +MODULE_AUTHOR("Vu Pham and Bart Van Assche"); +MODULE_DESCRIPTION("InfiniBand SCSI RDMA Protocol target " + "v" DRV_VERSION " (" DRV_RELDATE ")"); +MODULE_LICENSE("Dual BSD/GPL"); + +/* + * Global Variables + */ + +static u64 srpt_service_guid; +static DEFINE_SPINLOCK(srpt_dev_lock); /* Protects srpt_dev_list. */ +static LIST_HEAD(srpt_dev_list); /* List of srpt_device structures. */ + +static unsigned srp_max_req_size = DEFAULT_MAX_REQ_SIZE; +module_param(srp_max_req_size, int, 0444); +MODULE_PARM_DESC(srp_max_req_size, + "Maximum size of SRP request messages in bytes."); + +static int srpt_srq_size = DEFAULT_SRPT_SRQ_SIZE; +module_param(srpt_srq_size, int, 0444); +MODULE_PARM_DESC(srpt_srq_size, + "Shared receive queue (SRQ) size."); + +static int srpt_get_u64_x(char *buffer, struct kernel_param *kp) +{ + return sprintf(buffer, "0x%016llx", *(u64 *)kp->arg); +} +module_param_call(srpt_service_guid, NULL, srpt_get_u64_x, &srpt_service_guid, + 0444); +MODULE_PARM_DESC(srpt_service_guid, + "Using this value for ioc_guid, id_ext, and cm_listen_id" + " instead of using the node_guid of the first HCA."); + +static struct ib_client srpt_client; +static struct target_fabric_configfs *srpt_target; +static void srpt_release_channel(struct srpt_rdma_ch *ch); +static int srpt_queue_status(struct se_cmd *cmd); + +/** + * opposite_dma_dir() - Swap DMA_TO_DEVICE and DMA_FROM_DEVICE. + */ +static inline +enum dma_data_direction opposite_dma_dir(enum dma_data_direction dir) +{ + switch (dir) { + case DMA_TO_DEVICE: return DMA_FROM_DEVICE; + case DMA_FROM_DEVICE: return DMA_TO_DEVICE; + default: return dir; + } +} + +/** + * srpt_sdev_name() - Return the name associated with the HCA. + * + * Examples are ib0, ib1, ... + */ +static inline const char *srpt_sdev_name(struct srpt_device *sdev) +{ + return sdev->device->name; +} + +static enum rdma_ch_state srpt_get_ch_state(struct srpt_rdma_ch *ch) +{ + unsigned long flags; + enum rdma_ch_state state; + + spin_lock_irqsave(&ch->spinlock, flags); + state = ch->state; + spin_unlock_irqrestore(&ch->spinlock, flags); + return state; +} + +static enum rdma_ch_state +srpt_set_ch_state(struct srpt_rdma_ch *ch, enum rdma_ch_state new_state) +{ + unsigned long flags; + enum rdma_ch_state prev; + + spin_lock_irqsave(&ch->spinlock, flags); + prev = ch->state; + ch->state = new_state; + spin_unlock_irqrestore(&ch->spinlock, flags); + return prev; +} + +/** + * srpt_test_and_set_ch_state() - Test and set the channel state. + * + * Returns true if and only if the channel state has been set to the new state. + */ +static bool +srpt_test_and_set_ch_state(struct srpt_rdma_ch *ch, enum rdma_ch_state old, + enum rdma_ch_state new) +{ + unsigned long flags; + enum rdma_ch_state prev; + + spin_lock_irqsave(&ch->spinlock, flags); + prev = ch->state; + if (prev == old) + ch->state = new; + spin_unlock_irqrestore(&ch->spinlock, flags); + return prev == old; +} + +/** + * srpt_event_handler() - Asynchronous IB event callback function. + * + * Callback function called by the InfiniBand core when an asynchronous IB + * event occurs. This callback may occur in interrupt context. See also + * section 11.5.2, Set Asynchronous Event Handler in the InfiniBand + * Architecture Specification. + */ +static void srpt_event_handler(struct ib_event_handler *handler, + struct ib_event *event) +{ + struct srpt_device *sdev; + struct srpt_port *sport; + + sdev = ib_get_client_data(event->device, &srpt_client); + if (!sdev || sdev->device != event->device) + return; + + pr_debug("ASYNC event= %d on device= %s\n", event->event, + srpt_sdev_name(sdev)); + + switch (event->event) { + case IB_EVENT_PORT_ERR: + if (event->element.port_num <= sdev->device->phys_port_cnt) { + sport = &sdev->port[event->element.port_num - 1]; + sport->lid = 0; + sport->sm_lid = 0; + } + break; + case IB_EVENT_PORT_ACTIVE: + case IB_EVENT_LID_CHANGE: + case IB_EVENT_PKEY_CHANGE: + case IB_EVENT_SM_CHANGE: + case IB_EVENT_CLIENT_REREGISTER: + case IB_EVENT_GID_CHANGE: + /* Refresh port data asynchronously. */ + if (event->element.port_num <= sdev->device->phys_port_cnt) { + sport = &sdev->port[event->element.port_num - 1]; + if (!sport->lid && !sport->sm_lid) + schedule_work(&sport->work); + } + break; + default: + printk(KERN_ERR "received unrecognized IB event %d\n", + event->event); + break; + } +} + +/** + * srpt_srq_event() - SRQ event callback function. + */ +static void srpt_srq_event(struct ib_event *event, void *ctx) +{ + printk(KERN_INFO "SRQ event %d\n", event->event); +} + +/** + * srpt_qp_event() - QP event callback function. + */ +static void srpt_qp_event(struct ib_event *event, struct srpt_rdma_ch *ch) +{ + pr_debug("QP event %d on cm_id=%p sess_name=%s state=%d\n", + event->event, ch->cm_id, ch->sess_name, srpt_get_ch_state(ch)); + + switch (event->event) { + case IB_EVENT_COMM_EST: + ib_cm_notify(ch->cm_id, event->event); + break; + case IB_EVENT_QP_LAST_WQE_REACHED: + if (srpt_test_and_set_ch_state(ch, CH_DRAINING, + CH_RELEASING)) + srpt_release_channel(ch); + else + pr_debug("%s: state %d - ignored LAST_WQE.\n", + ch->sess_name, srpt_get_ch_state(ch)); + break; + default: + printk(KERN_ERR "received unrecognized IB QP event %d\n", + event->event); + break; + } +} + +/** + * srpt_set_ioc() - Helper function for initializing an IOUnitInfo structure. + * + * @slot: one-based slot number. + * @value: four-bit value. + * + * Copies the lowest four bits of value in element slot of the array of four + * bit elements called c_list (controller list). The index slot is one-based. + */ +static void srpt_set_ioc(u8 *c_list, u32 slot, u8 value) +{ + u16 id; + u8 tmp; + + id = (slot - 1) / 2; + if (slot & 0x1) { + tmp = c_list[id] & 0xf; + c_list[id] = (value << 4) | tmp; + } else { + tmp = c_list[id] & 0xf0; + c_list[id] = (value & 0xf) | tmp; + } +} + +/** + * srpt_get_class_port_info() - Copy ClassPortInfo to a management datagram. + * + * See also section 16.3.3.1 ClassPortInfo in the InfiniBand Architecture + * Specification. + */ +static void srpt_get_class_port_info(struct ib_dm_mad *mad) +{ + struct ib_class_port_info *cif; + + cif = (struct ib_class_port_info *)mad->data; + memset(cif, 0, sizeof *cif); + cif->base_version = 1; + cif->class_version = 1; + cif->resp_time_value = 20; + + mad->mad_hdr.status = 0; +} + +/** + * srpt_get_iou() - Write IOUnitInfo to a management datagram. + * + * See also section 16.3.3.3 IOUnitInfo in the InfiniBand Architecture + * Specification. See also section B.7, table B.6 in the SRP r16a document. + */ +static void srpt_get_iou(struct ib_dm_mad *mad) +{ + struct ib_dm_iou_info *ioui; + u8 slot; + int i; + + ioui = (struct ib_dm_iou_info *)mad->data; + ioui->change_id = __constant_cpu_to_be16(1); + ioui->max_controllers = 16; + + /* set present for slot 1 and empty for the rest */ + srpt_set_ioc(ioui->controller_list, 1, 1); + for (i = 1, slot = 2; i < 16; i++, slot++) + srpt_set_ioc(ioui->controller_list, slot, 0); + + mad->mad_hdr.status = 0; +} + +/** + * srpt_get_ioc() - Write IOControllerprofile to a management datagram. + * + * See also section 16.3.3.4 IOControllerProfile in the InfiniBand + * Architecture Specification. See also section B.7, table B.7 in the SRP + * r16a document. + */ +static void srpt_get_ioc(struct srpt_port *sport, u32 slot, + struct ib_dm_mad *mad) +{ + struct srpt_device *sdev = sport->sdev; + struct ib_dm_ioc_profile *iocp; + + iocp = (struct ib_dm_ioc_profile *)mad->data; + + if (!slot || slot > 16) { + mad->mad_hdr.status + = __constant_cpu_to_be16(DM_MAD_STATUS_INVALID_FIELD); + return; + } + + if (slot > 2) { + mad->mad_hdr.status + = __constant_cpu_to_be16(DM_MAD_STATUS_NO_IOC); + return; + } + + memset(iocp, 0, sizeof *iocp); + strcpy(iocp->id_string, SRPT_ID_STRING); + iocp->guid = cpu_to_be64(srpt_service_guid); + iocp->vendor_id = cpu_to_be32(sdev->dev_attr.vendor_id); + iocp->device_id = cpu_to_be32(sdev->dev_attr.vendor_part_id); + iocp->device_version = cpu_to_be16(sdev->dev_attr.hw_ver); + iocp->subsys_vendor_id = cpu_to_be32(sdev->dev_attr.vendor_id); + iocp->subsys_device_id = 0x0; + iocp->io_class = __constant_cpu_to_be16(SRP_REV16A_IB_IO_CLASS); + iocp->io_subclass = __constant_cpu_to_be16(SRP_IO_SUBCLASS); + iocp->protocol = __constant_cpu_to_be16(SRP_PROTOCOL); + iocp->protocol_version = __constant_cpu_to_be16(SRP_PROTOCOL_VERSION); + iocp->send_queue_depth = cpu_to_be16(sdev->srq_size); + iocp->rdma_read_depth = 4; + iocp->send_size = cpu_to_be32(srp_max_req_size); + iocp->rdma_size = cpu_to_be32(min(sport->port_attrib.srp_max_rdma_size, + 1U << 24)); + iocp->num_svc_entries = 1; + iocp->op_cap_mask = SRP_SEND_TO_IOC | SRP_SEND_FROM_IOC | + SRP_RDMA_READ_FROM_IOC | SRP_RDMA_WRITE_FROM_IOC; + + mad->mad_hdr.status = 0; +} + +/** + * srpt_get_svc_entries() - Write ServiceEntries to a management datagram. + * + * See also section 16.3.3.5 ServiceEntries in the InfiniBand Architecture + * Specification. See also section B.7, table B.8 in the SRP r16a document. + */ +static void srpt_get_svc_entries(u64 ioc_guid, + u16 slot, u8 hi, u8 lo, struct ib_dm_mad *mad) +{ + struct ib_dm_svc_entries *svc_entries; + + WARN_ON(!ioc_guid); + + if (!slot || slot > 16) { + mad->mad_hdr.status + = __constant_cpu_to_be16(DM_MAD_STATUS_INVALID_FIELD); + return; + } + + if (slot > 2 || lo > hi || hi > 1) { + mad->mad_hdr.status + = __constant_cpu_to_be16(DM_MAD_STATUS_NO_IOC); + return; + } + + svc_entries = (struct ib_dm_svc_entries *)mad->data; + memset(svc_entries, 0, sizeof *svc_entries); + svc_entries->service_entries[0].id = cpu_to_be64(ioc_guid); + snprintf(svc_entries->service_entries[0].name, + sizeof(svc_entries->service_entries[0].name), + "%s%016llx", + SRP_SERVICE_NAME_PREFIX, + ioc_guid); + + mad->mad_hdr.status = 0; +} + +/** + * srpt_mgmt_method_get() - Process a received management datagram. + * @sp: source port through which the MAD has been received. + * @rq_mad: received MAD. + * @rsp_mad: response MAD. + */ +static void srpt_mgmt_method_get(struct srpt_port *sp, struct ib_mad *rq_mad, + struct ib_dm_mad *rsp_mad) +{ + u16 attr_id; + u32 slot; + u8 hi, lo; + + attr_id = be16_to_cpu(rq_mad->mad_hdr.attr_id); + switch (attr_id) { + case DM_ATTR_CLASS_PORT_INFO: + srpt_get_class_port_info(rsp_mad); + break; + case DM_ATTR_IOU_INFO: + srpt_get_iou(rsp_mad); + break; + case DM_ATTR_IOC_PROFILE: + slot = be32_to_cpu(rq_mad->mad_hdr.attr_mod); + srpt_get_ioc(sp, slot, rsp_mad); + break; + case DM_ATTR_SVC_ENTRIES: + slot = be32_to_cpu(rq_mad->mad_hdr.attr_mod); + hi = (u8) ((slot >> 8) & 0xff); + lo = (u8) (slot & 0xff); + slot = (u16) ((slot >> 16) & 0xffff); + srpt_get_svc_entries(srpt_service_guid, + slot, hi, lo, rsp_mad); + break; + default: + rsp_mad->mad_hdr.status = + __constant_cpu_to_be16(DM_MAD_STATUS_UNSUP_METHOD_ATTR); + break; + } +} + +/** + * srpt_mad_send_handler() - Post MAD-send callback function. + */ +static void srpt_mad_send_handler(struct ib_mad_agent *mad_agent, + struct ib_mad_send_wc *mad_wc) +{ + ib_destroy_ah(mad_wc->send_buf->ah); + ib_free_send_mad(mad_wc->send_buf); +} + +/** + * srpt_mad_recv_handler() - MAD reception callback function. + */ +static void srpt_mad_recv_handler(struct ib_mad_agent *mad_agent, + struct ib_mad_recv_wc *mad_wc) +{ + struct srpt_port *sport = (struct srpt_port *)mad_agent->context; + struct ib_ah *ah; + struct ib_mad_send_buf *rsp; + struct ib_dm_mad *dm_mad; + + if (!mad_wc || !mad_wc->recv_buf.mad) + return; + + ah = ib_create_ah_from_wc(mad_agent->qp->pd, mad_wc->wc, + mad_wc->recv_buf.grh, mad_agent->port_num); + if (IS_ERR(ah)) + goto err; + + BUILD_BUG_ON(offsetof(struct ib_dm_mad, data) != IB_MGMT_DEVICE_HDR); + + rsp = ib_create_send_mad(mad_agent, mad_wc->wc->src_qp, + mad_wc->wc->pkey_index, 0, + IB_MGMT_DEVICE_HDR, IB_MGMT_DEVICE_DATA, + GFP_KERNEL); + if (IS_ERR(rsp)) + goto err_rsp; + + rsp->ah = ah; + + dm_mad = rsp->mad; + memcpy(dm_mad, mad_wc->recv_buf.mad, sizeof *dm_mad); + dm_mad->mad_hdr.method = IB_MGMT_METHOD_GET_RESP; + dm_mad->mad_hdr.status = 0; + + switch (mad_wc->recv_buf.mad->mad_hdr.method) { + case IB_MGMT_METHOD_GET: + srpt_mgmt_method_get(sport, mad_wc->recv_buf.mad, dm_mad); + break; + case IB_MGMT_METHOD_SET: + dm_mad->mad_hdr.status = + __constant_cpu_to_be16(DM_MAD_STATUS_UNSUP_METHOD_ATTR); + break; + default: + dm_mad->mad_hdr.status = + __constant_cpu_to_be16(DM_MAD_STATUS_UNSUP_METHOD); + break; + } + + if (!ib_post_send_mad(rsp, NULL)) { + ib_free_recv_mad(mad_wc); + /* will destroy_ah & free_send_mad in send completion */ + return; + } + + ib_free_send_mad(rsp); + +err_rsp: + ib_destroy_ah(ah); +err: + ib_free_recv_mad(mad_wc); +} + +/** + * srpt_refresh_port() - Configure a HCA port. + * + * Enable InfiniBand management datagram processing, update the cached sm_lid, + * lid and gid values, and register a callback function for processing MADs + * on the specified port. + * + * Note: It is safe to call this function more than once for the same port. + */ +static int srpt_refresh_port(struct srpt_port *sport) +{ + struct ib_mad_reg_req reg_req; + struct ib_port_modify port_modify; + struct ib_port_attr port_attr; + int ret; + + memset(&port_modify, 0, sizeof port_modify); + port_modify.set_port_cap_mask = IB_PORT_DEVICE_MGMT_SUP; + port_modify.clr_port_cap_mask = 0; + + ret = ib_modify_port(sport->sdev->device, sport->port, 0, &port_modify); + if (ret) + goto err_mod_port; + + ret = ib_query_port(sport->sdev->device, sport->port, &port_attr); + if (ret) + goto err_query_port; + + sport->sm_lid = port_attr.sm_lid; + sport->lid = port_attr.lid; + + ret = ib_query_gid(sport->sdev->device, sport->port, 0, &sport->gid); + if (ret) + goto err_query_port; + + if (!sport->mad_agent) { + memset(®_req, 0, sizeof reg_req); + reg_req.mgmt_class = IB_MGMT_CLASS_DEVICE_MGMT; + reg_req.mgmt_class_version = IB_MGMT_BASE_VERSION; + set_bit(IB_MGMT_METHOD_GET, reg_req.method_mask); + set_bit(IB_MGMT_METHOD_SET, reg_req.method_mask); + + sport->mad_agent = ib_register_mad_agent(sport->sdev->device, + sport->port, + IB_QPT_GSI, + ®_req, 0, + srpt_mad_send_handler, + srpt_mad_recv_handler, + sport, 0); + if (IS_ERR(sport->mad_agent)) { + ret = PTR_ERR(sport->mad_agent); + sport->mad_agent = NULL; + goto err_query_port; + } + } + + return 0; + +err_query_port: + + port_modify.set_port_cap_mask = 0; + port_modify.clr_port_cap_mask = IB_PORT_DEVICE_MGMT_SUP; + ib_modify_port(sport->sdev->device, sport->port, 0, &port_modify); + +err_mod_port: + + return ret; +} + +/** + * srpt_unregister_mad_agent() - Unregister MAD callback functions. + * + * Note: It is safe to call this function more than once for the same device. + */ +static void srpt_unregister_mad_agent(struct srpt_device *sdev) +{ + struct ib_port_modify port_modify = { + .clr_port_cap_mask = IB_PORT_DEVICE_MGMT_SUP, + }; + struct srpt_port *sport; + int i; + + for (i = 1; i <= sdev->device->phys_port_cnt; i++) { + sport = &sdev->port[i - 1]; + WARN_ON(sport->port != i); + if (ib_modify_port(sdev->device, i, 0, &port_modify) < 0) + printk(KERN_ERR "disabling MAD processing failed.\n"); + if (sport->mad_agent) { + ib_unregister_mad_agent(sport->mad_agent); + sport->mad_agent = NULL; + } + } +} + +/** + * srpt_alloc_ioctx() - Allocate an SRPT I/O context structure. + */ +static struct srpt_ioctx *srpt_alloc_ioctx(struct srpt_device *sdev, + int ioctx_size, int dma_size, + enum dma_data_direction dir) +{ + struct srpt_ioctx *ioctx; + + ioctx = kmalloc(ioctx_size, GFP_KERNEL); + if (!ioctx) + goto err; + + ioctx->buf = kmalloc(dma_size, GFP_KERNEL); + if (!ioctx->buf) + goto err_free_ioctx; + + ioctx->dma = ib_dma_map_single(sdev->device, ioctx->buf, dma_size, dir); + if (ib_dma_mapping_error(sdev->device, ioctx->dma)) + goto err_free_buf; + + return ioctx; + +err_free_buf: + kfree(ioctx->buf); +err_free_ioctx: + kfree(ioctx); +err: + return NULL; +} + +/** + * srpt_free_ioctx() - Free an SRPT I/O context structure. + */ +static void srpt_free_ioctx(struct srpt_device *sdev, struct srpt_ioctx *ioctx, + int dma_size, enum dma_data_direction dir) +{ + if (!ioctx) + return; + + ib_dma_unmap_single(sdev->device, ioctx->dma, dma_size, dir); + kfree(ioctx->buf); + kfree(ioctx); +} + +/** + * srpt_alloc_ioctx_ring() - Allocate a ring of SRPT I/O context structures. + * @sdev: Device to allocate the I/O context ring for. + * @ring_size: Number of elements in the I/O context ring. + * @ioctx_size: I/O context size. + * @dma_size: DMA buffer size. + * @dir: DMA data direction. + */ +static struct srpt_ioctx **srpt_alloc_ioctx_ring(struct srpt_device *sdev, + int ring_size, int ioctx_size, + int dma_size, enum dma_data_direction dir) +{ + struct srpt_ioctx **ring; + int i; + + WARN_ON(ioctx_size != sizeof(struct srpt_recv_ioctx) + && ioctx_size != sizeof(struct srpt_send_ioctx)); + + ring = kmalloc(ring_size * sizeof(ring[0]), GFP_KERNEL); + if (!ring) + goto out; + for (i = 0; i < ring_size; ++i) { + ring[i] = srpt_alloc_ioctx(sdev, ioctx_size, dma_size, dir); + if (!ring[i]) + goto err; + ring[i]->index = i; + } + goto out; + +err: + while (--i >= 0) + srpt_free_ioctx(sdev, ring[i], dma_size, dir); + kfree(ring); + ring = NULL; +out: + return ring; +} + +/** + * srpt_free_ioctx_ring() - Free the ring of SRPT I/O context structures. + */ +static void srpt_free_ioctx_ring(struct srpt_ioctx **ioctx_ring, + struct srpt_device *sdev, int ring_size, + int dma_size, enum dma_data_direction dir) +{ + int i; + + for (i = 0; i < ring_size; ++i) + srpt_free_ioctx(sdev, ioctx_ring[i], dma_size, dir); + kfree(ioctx_ring); +} + +/** + * srpt_get_cmd_state() - Get the state of a SCSI command. + */ +static enum srpt_command_state srpt_get_cmd_state(struct srpt_send_ioctx *ioctx) +{ + enum srpt_command_state state; + unsigned long flags; + + BUG_ON(!ioctx); + + spin_lock_irqsave(&ioctx->spinlock, flags); + state = ioctx->state; + spin_unlock_irqrestore(&ioctx->spinlock, flags); + return state; +} + +/** + * srpt_set_cmd_state() - Set the state of a SCSI command. + * + * Does not modify the state of aborted commands. Returns the previous command + * state. + */ +static enum srpt_command_state srpt_set_cmd_state(struct srpt_send_ioctx *ioctx, + enum srpt_command_state new) +{ + enum srpt_command_state previous; + unsigned long flags; + + BUG_ON(!ioctx); + + spin_lock_irqsave(&ioctx->spinlock, flags); + previous = ioctx->state; + if (previous != SRPT_STATE_DONE) + ioctx->state = new; + spin_unlock_irqrestore(&ioctx->spinlock, flags); + + return previous; +} + +/** + * srpt_test_and_set_cmd_state() - Test and set the state of a command. + * + * Returns true if and only if the previous command state was equal to 'old'. + */ +static bool srpt_test_and_set_cmd_state(struct srpt_send_ioctx *ioctx, + enum srpt_command_state old, + enum srpt_command_state new) +{ + enum srpt_command_state previous; + unsigned long flags; + + WARN_ON(!ioctx); + WARN_ON(old == SRPT_STATE_DONE); + WARN_ON(new == SRPT_STATE_NEW); + + spin_lock_irqsave(&ioctx->spinlock, flags); + previous = ioctx->state; + if (previous == old) + ioctx->state = new; + spin_unlock_irqrestore(&ioctx->spinlock, flags); + return previous == old; +} + +/** + * srpt_post_recv() - Post an IB receive request. + */ +static int srpt_post_recv(struct srpt_device *sdev, + struct srpt_recv_ioctx *ioctx) +{ + struct ib_sge list; + struct ib_recv_wr wr, *bad_wr; + + BUG_ON(!sdev); + wr.wr_id = encode_wr_id(SRPT_RECV, ioctx->ioctx.index); + + list.addr = ioctx->ioctx.dma; + list.length = srp_max_req_size; + list.lkey = sdev->mr->lkey; + + wr.next = NULL; + wr.sg_list = &list; + wr.num_sge = 1; + + return ib_post_srq_recv(sdev->srq, &wr, &bad_wr); +} + +/** + * srpt_post_send() - Post an IB send request. + * + * Returns zero upon success and a non-zero value upon failure. + */ +static int srpt_post_send(struct srpt_rdma_ch *ch, + struct srpt_send_ioctx *ioctx, int len) +{ + struct ib_sge list; + struct ib_send_wr wr, *bad_wr; + struct srpt_device *sdev = ch->sport->sdev; + int ret; + + atomic_inc(&ch->req_lim); + + ret = -ENOMEM; + if (unlikely(atomic_dec_return(&ch->sq_wr_avail) < 0)) { + printk(KERN_WARNING "IB send queue full (needed 1)\n"); + goto out; + } + + ib_dma_sync_single_for_device(sdev->device, ioctx->ioctx.dma, len, + DMA_TO_DEVICE); + + list.addr = ioctx->ioctx.dma; + list.length = len; + list.lkey = sdev->mr->lkey; + + wr.next = NULL; + wr.wr_id = encode_wr_id(SRPT_SEND, ioctx->ioctx.index); + wr.sg_list = &list; + wr.num_sge = 1; + wr.opcode = IB_WR_SEND; + wr.send_flags = IB_SEND_SIGNALED; + + ret = ib_post_send(ch->qp, &wr, &bad_wr); + +out: + if (ret < 0) { + atomic_inc(&ch->sq_wr_avail); + atomic_dec(&ch->req_lim); + } + return ret; +} + +/** + * srpt_get_desc_tbl() - Parse the data descriptors of an SRP_CMD request. + * @ioctx: Pointer to the I/O context associated with the request. + * @srp_cmd: Pointer to the SRP_CMD request data. + * @dir: Pointer to the variable to which the transfer direction will be + * written. + * @data_len: Pointer to the variable to which the total data length of all + * descriptors in the SRP_CMD request will be written. + * + * This function initializes ioctx->nrbuf and ioctx->r_bufs. + * + * Returns -EINVAL when the SRP_CMD request contains inconsistent descriptors; + * -ENOMEM when memory allocation fails and zero upon success. + */ +static int srpt_get_desc_tbl(struct srpt_send_ioctx *ioctx, + struct srp_cmd *srp_cmd, + enum dma_data_direction *dir, u64 *data_len) +{ + struct srp_indirect_buf *idb; + struct srp_direct_buf *db; + unsigned add_cdb_offset; + int ret; + + /* + * The pointer computations below will only be compiled correctly + * if srp_cmd::add_data is declared as s8*, u8*, s8[] or u8[], so check + * whether srp_cmd::add_data has been declared as a byte pointer. + */ + BUILD_BUG_ON(!__same_type(srp_cmd->add_data[0], (s8)0) + && !__same_type(srp_cmd->add_data[0], (u8)0)); + + BUG_ON(!dir); + BUG_ON(!data_len); + + ret = 0; + *data_len = 0; + + /* + * The lower four bits of the buffer format field contain the DATA-IN + * buffer descriptor format, and the highest four bits contain the + * DATA-OUT buffer descriptor format. + */ + *dir = DMA_NONE; + if (srp_cmd->buf_fmt & 0xf) + /* DATA-IN: transfer data from target to initiator (read). */ + *dir = DMA_FROM_DEVICE; + else if (srp_cmd->buf_fmt >> 4) + /* DATA-OUT: transfer data from initiator to target (write). */ + *dir = DMA_TO_DEVICE; + + /* + * According to the SRP spec, the lower two bits of the 'ADDITIONAL + * CDB LENGTH' field are reserved and the size in bytes of this field + * is four times the value specified in bits 3..7. Hence the "& ~3". + */ + add_cdb_offset = srp_cmd->add_cdb_len & ~3; + if (((srp_cmd->buf_fmt & 0xf) == SRP_DATA_DESC_DIRECT) || + ((srp_cmd->buf_fmt >> 4) == SRP_DATA_DESC_DIRECT)) { + ioctx->n_rbuf = 1; + ioctx->rbufs = &ioctx->single_rbuf; + + db = (struct srp_direct_buf *)(srp_cmd->add_data + + add_cdb_offset); + memcpy(ioctx->rbufs, db, sizeof *db); + *data_len = be32_to_cpu(db->len); + } else if (((srp_cmd->buf_fmt & 0xf) == SRP_DATA_DESC_INDIRECT) || + ((srp_cmd->buf_fmt >> 4) == SRP_DATA_DESC_INDIRECT)) { + idb = (struct srp_indirect_buf *)(srp_cmd->add_data + + add_cdb_offset); + + ioctx->n_rbuf = be32_to_cpu(idb->table_desc.len) / sizeof *db; + + if (ioctx->n_rbuf > + (srp_cmd->data_out_desc_cnt + srp_cmd->data_in_desc_cnt)) { + printk(KERN_ERR "received unsupported SRP_CMD request" + " type (%u out + %u in != %u / %zu)\n", + srp_cmd->data_out_desc_cnt, + srp_cmd->data_in_desc_cnt, + be32_to_cpu(idb->table_desc.len), + sizeof(*db)); + ioctx->n_rbuf = 0; + ret = -EINVAL; + goto out; + } + + if (ioctx->n_rbuf == 1) + ioctx->rbufs = &ioctx->single_rbuf; + else { + ioctx->rbufs = + kmalloc(ioctx->n_rbuf * sizeof *db, GFP_ATOMIC); + if (!ioctx->rbufs) { + ioctx->n_rbuf = 0; + ret = -ENOMEM; + goto out; + } + } + + db = idb->desc_list; + memcpy(ioctx->rbufs, db, ioctx->n_rbuf * sizeof *db); + *data_len = be32_to_cpu(idb->len); + } +out: + return ret; +} + +/** + * srpt_init_ch_qp() - Initialize queue pair attributes. + * + * Initialized the attributes of queue pair 'qp' by allowing local write, + * remote read and remote write. Also transitions 'qp' to state IB_QPS_INIT. + */ +static int srpt_init_ch_qp(struct srpt_rdma_ch *ch, struct ib_qp *qp) +{ + struct ib_qp_attr *attr; + int ret; + + attr = kzalloc(sizeof *attr, GFP_KERNEL); + if (!attr) + return -ENOMEM; + + attr->qp_state = IB_QPS_INIT; + attr->qp_access_flags = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ | + IB_ACCESS_REMOTE_WRITE; + attr->port_num = ch->sport->port; + attr->pkey_index = 0; + + ret = ib_modify_qp(qp, attr, + IB_QP_STATE | IB_QP_ACCESS_FLAGS | IB_QP_PORT | + IB_QP_PKEY_INDEX); + + kfree(attr); + return ret; +} + +/** + * srpt_ch_qp_rtr() - Change the state of a channel to 'ready to receive' (RTR). + * @ch: channel of the queue pair. + * @qp: queue pair to change the state of. + * + * Returns zero upon success and a negative value upon failure. + * + * Note: currently a struct ib_qp_attr takes 136 bytes on a 64-bit system. + * If this structure ever becomes larger, it might be necessary to allocate + * it dynamically instead of on the stack. + */ +static int srpt_ch_qp_rtr(struct srpt_rdma_ch *ch, struct ib_qp *qp) +{ + struct ib_qp_attr qp_attr; + int attr_mask; + int ret; + + qp_attr.qp_state = IB_QPS_RTR; + ret = ib_cm_init_qp_attr(ch->cm_id, &qp_attr, &attr_mask); + if (ret) + goto out; + + qp_attr.max_dest_rd_atomic = 4; + + ret = ib_modify_qp(qp, &qp_attr, attr_mask); + +out: + return ret; +} + +/** + * srpt_ch_qp_rts() - Change the state of a channel to 'ready to send' (RTS). + * @ch: channel of the queue pair. + * @qp: queue pair to change the state of. + * + * Returns zero upon success and a negative value upon failure. + * + * Note: currently a struct ib_qp_attr takes 136 bytes on a 64-bit system. + * If this structure ever becomes larger, it might be necessary to allocate + * it dynamically instead of on the stack. + */ +static int srpt_ch_qp_rts(struct srpt_rdma_ch *ch, struct ib_qp *qp) +{ + struct ib_qp_attr qp_attr; + int attr_mask; + int ret; + + qp_attr.qp_state = IB_QPS_RTS; + ret = ib_cm_init_qp_attr(ch->cm_id, &qp_attr, &attr_mask); + if (ret) + goto out; + + qp_attr.max_rd_atomic = 4; + + ret = ib_modify_qp(qp, &qp_attr, attr_mask); + +out: + return ret; +} + +/** + * srpt_ch_qp_err() - Set the channel queue pair state to 'error'. + */ +static int srpt_ch_qp_err(struct srpt_rdma_ch *ch) +{ + struct ib_qp_attr qp_attr; + + qp_attr.qp_state = IB_QPS_ERR; + return ib_modify_qp(ch->qp, &qp_attr, IB_QP_STATE); +} + +/** + * srpt_unmap_sg_to_ib_sge() - Unmap an IB SGE list. + */ +static void srpt_unmap_sg_to_ib_sge(struct srpt_rdma_ch *ch, + struct srpt_send_ioctx *ioctx) +{ + struct scatterlist *sg; + enum dma_data_direction dir; + + BUG_ON(!ch); + BUG_ON(!ioctx); + BUG_ON(ioctx->n_rdma && !ioctx->rdma_ius); + + while (ioctx->n_rdma) + kfree(ioctx->rdma_ius[--ioctx->n_rdma].sge); + + kfree(ioctx->rdma_ius); + ioctx->rdma_ius = NULL; + + if (ioctx->mapped_sg_count) { + sg = ioctx->sg; + WARN_ON(!sg); + dir = ioctx->cmd.data_direction; + BUG_ON(dir == DMA_NONE); + ib_dma_unmap_sg(ch->sport->sdev->device, sg, ioctx->sg_cnt, + opposite_dma_dir(dir)); + ioctx->mapped_sg_count = 0; + } +} + +/** + * srpt_map_sg_to_ib_sge() - Map an SG list to an IB SGE list. + */ +static int srpt_map_sg_to_ib_sge(struct srpt_rdma_ch *ch, + struct srpt_send_ioctx *ioctx) +{ + struct ib_device *dev = ch->sport->sdev->device; + struct se_cmd *cmd; + struct scatterlist *sg, *sg_orig; + int sg_cnt; + enum dma_data_direction dir; + struct rdma_iu *riu; + struct srp_direct_buf *db; + dma_addr_t dma_addr; + struct ib_sge *sge; + u64 raddr; + u32 rsize; + u32 tsize; + u32 dma_len; + int count, nrdma; + int i, j, k; + + BUG_ON(!ch); + BUG_ON(!ioctx); + cmd = &ioctx->cmd; + dir = cmd->data_direction; + BUG_ON(dir == DMA_NONE); + + ioctx->sg = sg = sg_orig = cmd->t_data_sg; + ioctx->sg_cnt = sg_cnt = cmd->t_data_nents; + + count = ib_dma_map_sg(ch->sport->sdev->device, sg, sg_cnt, + opposite_dma_dir(dir)); + if (unlikely(!count)) + return -EAGAIN; + + ioctx->mapped_sg_count = count; + + if (ioctx->rdma_ius && ioctx->n_rdma_ius) + nrdma = ioctx->n_rdma_ius; + else { + nrdma = (count + SRPT_DEF_SG_PER_WQE - 1) / SRPT_DEF_SG_PER_WQE + + ioctx->n_rbuf; + + ioctx->rdma_ius = kzalloc(nrdma * sizeof *riu, GFP_KERNEL); + if (!ioctx->rdma_ius) + goto free_mem; + + ioctx->n_rdma_ius = nrdma; + } + + db = ioctx->rbufs; + tsize = cmd->data_length; + dma_len = ib_sg_dma_len(dev, &sg[0]); + riu = ioctx->rdma_ius; + + /* + * For each remote desc - calculate the #ib_sge. + * If #ib_sge < SRPT_DEF_SG_PER_WQE per rdma operation then + * each remote desc rdma_iu is required a rdma wr; + * else + * we need to allocate extra rdma_iu to carry extra #ib_sge in + * another rdma wr + */ + for (i = 0, j = 0; + j < count && i < ioctx->n_rbuf && tsize > 0; ++i, ++riu, ++db) { + rsize = be32_to_cpu(db->len); + raddr = be64_to_cpu(db->va); + riu->raddr = raddr; + riu->rkey = be32_to_cpu(db->key); + riu->sge_cnt = 0; + + /* calculate how many sge required for this remote_buf */ + while (rsize > 0 && tsize > 0) { + + if (rsize >= dma_len) { + tsize -= dma_len; + rsize -= dma_len; + raddr += dma_len; + + if (tsize > 0) { + ++j; + if (j < count) { + sg = sg_next(sg); + dma_len = ib_sg_dma_len( + dev, sg); + } + } + } else { + tsize -= rsize; + dma_len -= rsize; + rsize = 0; + } + + ++riu->sge_cnt; + + if (rsize > 0 && riu->sge_cnt == SRPT_DEF_SG_PER_WQE) { + ++ioctx->n_rdma; + riu->sge = + kmalloc(riu->sge_cnt * sizeof *riu->sge, + GFP_KERNEL); + if (!riu->sge) + goto free_mem; + + ++riu; + riu->sge_cnt = 0; + riu->raddr = raddr; + riu->rkey = be32_to_cpu(db->key); + } + } + + ++ioctx->n_rdma; + riu->sge = kmalloc(riu->sge_cnt * sizeof *riu->sge, + GFP_KERNEL); + if (!riu->sge) + goto free_mem; + } + + db = ioctx->rbufs; + tsize = cmd->data_length; + riu = ioctx->rdma_ius; + sg = sg_orig; + dma_len = ib_sg_dma_len(dev, &sg[0]); + dma_addr = ib_sg_dma_address(dev, &sg[0]); + + /* this second loop is really mapped sg_addres to rdma_iu->ib_sge */ + for (i = 0, j = 0; + j < count && i < ioctx->n_rbuf && tsize > 0; ++i, ++riu, ++db) { + rsize = be32_to_cpu(db->len); + sge = riu->sge; + k = 0; + + while (rsize > 0 && tsize > 0) { + sge->addr = dma_addr; + sge->lkey = ch->sport->sdev->mr->lkey; + + if (rsize >= dma_len) { + sge->length = + (tsize < dma_len) ? tsize : dma_len; + tsize -= dma_len; + rsize -= dma_len; + + if (tsize > 0) { + ++j; + if (j < count) { + sg = sg_next(sg); + dma_len = ib_sg_dma_len( + dev, sg); + dma_addr = ib_sg_dma_address( + dev, sg); + } + } + } else { + sge->length = (tsize < rsize) ? tsize : rsize; + tsize -= rsize; + dma_len -= rsize; + dma_addr += rsize; + rsize = 0; + } + + ++k; + if (k == riu->sge_cnt && rsize > 0 && tsize > 0) { + ++riu; + sge = riu->sge; + k = 0; + } else if (rsize > 0 && tsize > 0) + ++sge; + } + } + + return 0; + +free_mem: + srpt_unmap_sg_to_ib_sge(ch, ioctx); + + return -ENOMEM; +} + +/** + * srpt_get_send_ioctx() - Obtain an I/O context for sending to the initiator. + */ +static struct srpt_send_ioctx *srpt_get_send_ioctx(struct srpt_rdma_ch *ch) +{ + struct srpt_send_ioctx *ioctx; + unsigned long flags; + + BUG_ON(!ch); + + ioctx = NULL; + spin_lock_irqsave(&ch->spinlock, flags); + if (!list_empty(&ch->free_list)) { + ioctx = list_first_entry(&ch->free_list, + struct srpt_send_ioctx, free_list); + list_del(&ioctx->free_list); + } + spin_unlock_irqrestore(&ch->spinlock, flags); + + if (!ioctx) + return ioctx; + + BUG_ON(ioctx->ch != ch); + spin_lock_init(&ioctx->spinlock); + ioctx->state = SRPT_STATE_NEW; + ioctx->n_rbuf = 0; + ioctx->rbufs = NULL; + ioctx->n_rdma = 0; + ioctx->n_rdma_ius = 0; + ioctx->rdma_ius = NULL; + ioctx->mapped_sg_count = 0; + init_completion(&ioctx->tx_done); + ioctx->queue_status_only = false; + /* + * transport_init_se_cmd() does not initialize all fields, so do it + * here. + */ + memset(&ioctx->cmd, 0, sizeof(ioctx->cmd)); + memset(&ioctx->sense_data, 0, sizeof(ioctx->sense_data)); + + return ioctx; +} + +/** + * srpt_abort_cmd() - Abort a SCSI command. + * @ioctx: I/O context associated with the SCSI command. + * @context: Preferred execution context. + */ +static int srpt_abort_cmd(struct srpt_send_ioctx *ioctx) +{ + enum srpt_command_state state; + unsigned long flags; + + BUG_ON(!ioctx); + + /* + * If the command is in a state where the target core is waiting for + * the ib_srpt driver, change the state to the next state. Changing + * the state of the command from SRPT_STATE_NEED_DATA to + * SRPT_STATE_DATA_IN ensures that srpt_xmit_response() will call this + * function a second time. + */ + + spin_lock_irqsave(&ioctx->spinlock, flags); + state = ioctx->state; + switch (state) { + case SRPT_STATE_NEED_DATA: + ioctx->state = SRPT_STATE_DATA_IN; + break; + case SRPT_STATE_DATA_IN: + case SRPT_STATE_CMD_RSP_SENT: + case SRPT_STATE_MGMT_RSP_SENT: + ioctx->state = SRPT_STATE_DONE; + break; + default: + break; + } + spin_unlock_irqrestore(&ioctx->spinlock, flags); + + if (state == SRPT_STATE_DONE) { + struct srpt_rdma_ch *ch = ioctx->ch; + + BUG_ON(ch->sess == NULL); + + target_put_sess_cmd(ch->sess, &ioctx->cmd); + goto out; + } + + pr_debug("Aborting cmd with state %d and tag %lld\n", state, + ioctx->tag); + + switch (state) { + case SRPT_STATE_NEW: + case SRPT_STATE_DATA_IN: + case SRPT_STATE_MGMT: + /* + * Do nothing - defer abort processing until + * srpt_queue_response() is invoked. + */ + WARN_ON(!transport_check_aborted_status(&ioctx->cmd, false)); + break; + case SRPT_STATE_NEED_DATA: + /* DMA_TO_DEVICE (write) - RDMA read error. */ + + /* XXX(hch): this is a horrible layering violation.. */ + spin_lock_irqsave(&ioctx->cmd.t_state_lock, flags); + ioctx->cmd.transport_state &= ~CMD_T_ACTIVE; + spin_unlock_irqrestore(&ioctx->cmd.t_state_lock, flags); + break; + case SRPT_STATE_CMD_RSP_SENT: + /* + * SRP_RSP sending failed or the SRP_RSP send completion has + * not been received in time. + */ + srpt_unmap_sg_to_ib_sge(ioctx->ch, ioctx); + target_put_sess_cmd(ioctx->ch->sess, &ioctx->cmd); + break; + case SRPT_STATE_MGMT_RSP_SENT: + srpt_set_cmd_state(ioctx, SRPT_STATE_DONE); + target_put_sess_cmd(ioctx->ch->sess, &ioctx->cmd); + break; + default: + WARN(1, "Unexpected command state (%d)", state); + break; + } + +out: + return state; +} + +/** + * srpt_handle_send_err_comp() - Process an IB_WC_SEND error completion. + */ +static void srpt_handle_send_err_comp(struct srpt_rdma_ch *ch, u64 wr_id) +{ + struct srpt_send_ioctx *ioctx; + enum srpt_command_state state; + struct se_cmd *cmd; + u32 index; + + atomic_inc(&ch->sq_wr_avail); + + index = idx_from_wr_id(wr_id); + ioctx = ch->ioctx_ring[index]; + state = srpt_get_cmd_state(ioctx); + cmd = &ioctx->cmd; + + WARN_ON(state != SRPT_STATE_CMD_RSP_SENT + && state != SRPT_STATE_MGMT_RSP_SENT + && state != SRPT_STATE_NEED_DATA + && state != SRPT_STATE_DONE); + + /* If SRP_RSP sending failed, undo the ch->req_lim change. */ + if (state == SRPT_STATE_CMD_RSP_SENT + || state == SRPT_STATE_MGMT_RSP_SENT) + atomic_dec(&ch->req_lim); + + srpt_abort_cmd(ioctx); +} + +/** + * srpt_handle_send_comp() - Process an IB send completion notification. + */ +static void srpt_handle_send_comp(struct srpt_rdma_ch *ch, + struct srpt_send_ioctx *ioctx) +{ + enum srpt_command_state state; + + atomic_inc(&ch->sq_wr_avail); + + state = srpt_set_cmd_state(ioctx, SRPT_STATE_DONE); + + if (WARN_ON(state != SRPT_STATE_CMD_RSP_SENT + && state != SRPT_STATE_MGMT_RSP_SENT + && state != SRPT_STATE_DONE)) + pr_debug("state = %d\n", state); + + if (state != SRPT_STATE_DONE) { + srpt_unmap_sg_to_ib_sge(ch, ioctx); + transport_generic_free_cmd(&ioctx->cmd, 0); + } else { + printk(KERN_ERR "IB completion has been received too late for" + " wr_id = %u.\n", ioctx->ioctx.index); + } +} + +/** + * srpt_handle_rdma_comp() - Process an IB RDMA completion notification. + * + * XXX: what is now target_execute_cmd used to be asynchronous, and unmapping + * the data that has been transferred via IB RDMA had to be postponed until the + * check_stop_free() callback. None of this is necessary anymore and needs to + * be cleaned up. + */ +static void srpt_handle_rdma_comp(struct srpt_rdma_ch *ch, + struct srpt_send_ioctx *ioctx, + enum srpt_opcode opcode) +{ + WARN_ON(ioctx->n_rdma <= 0); + atomic_add(ioctx->n_rdma, &ch->sq_wr_avail); + + if (opcode == SRPT_RDMA_READ_LAST) { + if (srpt_test_and_set_cmd_state(ioctx, SRPT_STATE_NEED_DATA, + SRPT_STATE_DATA_IN)) + target_execute_cmd(&ioctx->cmd); + else + printk(KERN_ERR "%s[%d]: wrong state = %d\n", __func__, + __LINE__, srpt_get_cmd_state(ioctx)); + } else if (opcode == SRPT_RDMA_ABORT) { + ioctx->rdma_aborted = true; + } else { + WARN(true, "unexpected opcode %d\n", opcode); + } +} + +/** + * srpt_handle_rdma_err_comp() - Process an IB RDMA error completion. + */ +static void srpt_handle_rdma_err_comp(struct srpt_rdma_ch *ch, + struct srpt_send_ioctx *ioctx, + enum srpt_opcode opcode) +{ + struct se_cmd *cmd; + enum srpt_command_state state; + + cmd = &ioctx->cmd; + state = srpt_get_cmd_state(ioctx); + switch (opcode) { + case SRPT_RDMA_READ_LAST: + if (ioctx->n_rdma <= 0) { + printk(KERN_ERR "Received invalid RDMA read" + " error completion with idx %d\n", + ioctx->ioctx.index); + break; + } + atomic_add(ioctx->n_rdma, &ch->sq_wr_avail); + if (state == SRPT_STATE_NEED_DATA) + srpt_abort_cmd(ioctx); + else + printk(KERN_ERR "%s[%d]: wrong state = %d\n", + __func__, __LINE__, state); + break; + case SRPT_RDMA_WRITE_LAST: + break; + default: + printk(KERN_ERR "%s[%d]: opcode = %u\n", __func__, + __LINE__, opcode); + break; + } +} + +/** + * srpt_build_cmd_rsp() - Build an SRP_RSP response. + * @ch: RDMA channel through which the request has been received. + * @ioctx: I/O context associated with the SRP_CMD request. The response will + * be built in the buffer ioctx->buf points at and hence this function will + * overwrite the request data. + * @tag: tag of the request for which this response is being generated. + * @status: value for the STATUS field of the SRP_RSP information unit. + * + * Returns the size in bytes of the SRP_RSP response. + * + * An SRP_RSP response contains a SCSI status or service response. See also + * section 6.9 in the SRP r16a document for the format of an SRP_RSP + * response. See also SPC-2 for more information about sense data. + */ +static int srpt_build_cmd_rsp(struct srpt_rdma_ch *ch, + struct srpt_send_ioctx *ioctx, u64 tag, + int status) +{ + struct srp_rsp *srp_rsp; + const u8 *sense_data; + int sense_data_len, max_sense_len; + + /* + * The lowest bit of all SAM-3 status codes is zero (see also + * paragraph 5.3 in SAM-3). + */ + WARN_ON(status & 1); + + srp_rsp = ioctx->ioctx.buf; + BUG_ON(!srp_rsp); + + sense_data = ioctx->sense_data; + sense_data_len = ioctx->cmd.scsi_sense_length; + WARN_ON(sense_data_len > sizeof(ioctx->sense_data)); + + memset(srp_rsp, 0, sizeof *srp_rsp); + srp_rsp->opcode = SRP_RSP; + srp_rsp->req_lim_delta = + __constant_cpu_to_be32(1 + atomic_xchg(&ch->req_lim_delta, 0)); + srp_rsp->tag = tag; + srp_rsp->status = status; + + if (sense_data_len) { + BUILD_BUG_ON(MIN_MAX_RSP_SIZE <= sizeof(*srp_rsp)); + max_sense_len = ch->max_ti_iu_len - sizeof(*srp_rsp); + if (sense_data_len > max_sense_len) { + printk(KERN_WARNING "truncated sense data from %d to %d" + " bytes\n", sense_data_len, max_sense_len); + sense_data_len = max_sense_len; + } + + srp_rsp->flags |= SRP_RSP_FLAG_SNSVALID; + srp_rsp->sense_data_len = cpu_to_be32(sense_data_len); + memcpy(srp_rsp + 1, sense_data, sense_data_len); + } + + return sizeof(*srp_rsp) + sense_data_len; +} + +/** + * srpt_build_tskmgmt_rsp() - Build a task management response. + * @ch: RDMA channel through which the request has been received. + * @ioctx: I/O context in which the SRP_RSP response will be built. + * @rsp_code: RSP_CODE that will be stored in the response. + * @tag: Tag of the request for which this response is being generated. + * + * Returns the size in bytes of the SRP_RSP response. + * + * An SRP_RSP response contains a SCSI status or service response. See also + * section 6.9 in the SRP r16a document for the format of an SRP_RSP + * response. + */ +static int srpt_build_tskmgmt_rsp(struct srpt_rdma_ch *ch, + struct srpt_send_ioctx *ioctx, + u8 rsp_code, u64 tag) +{ + struct srp_rsp *srp_rsp; + int resp_data_len; + int resp_len; + + resp_data_len = 4; + resp_len = sizeof(*srp_rsp) + resp_data_len; + + srp_rsp = ioctx->ioctx.buf; + BUG_ON(!srp_rsp); + memset(srp_rsp, 0, sizeof *srp_rsp); + + srp_rsp->opcode = SRP_RSP; + srp_rsp->req_lim_delta = __constant_cpu_to_be32(1 + + atomic_xchg(&ch->req_lim_delta, 0)); + srp_rsp->tag = tag; + + srp_rsp->flags |= SRP_RSP_FLAG_RSPVALID; + srp_rsp->resp_data_len = cpu_to_be32(resp_data_len); + srp_rsp->data[3] = rsp_code; + + return resp_len; +} + +#define NO_SUCH_LUN ((uint64_t)-1LL) + +/* + * SCSI LUN addressing method. See also SAM-2 and the section about + * eight byte LUNs. + */ +enum scsi_lun_addr_method { + SCSI_LUN_ADDR_METHOD_PERIPHERAL = 0, + SCSI_LUN_ADDR_METHOD_FLAT = 1, + SCSI_LUN_ADDR_METHOD_LUN = 2, + SCSI_LUN_ADDR_METHOD_EXTENDED_LUN = 3, +}; + +/* + * srpt_unpack_lun() - Convert from network LUN to linear LUN. + * + * Convert an 2-byte, 4-byte, 6-byte or 8-byte LUN structure in network byte + * order (big endian) to a linear LUN. Supports three LUN addressing methods: + * peripheral, flat and logical unit. See also SAM-2, section 4.9.4 (page 40). + */ +static uint64_t srpt_unpack_lun(const uint8_t *lun, int len) +{ + uint64_t res = NO_SUCH_LUN; + int addressing_method; + + if (unlikely(len < 2)) { + printk(KERN_ERR "Illegal LUN length %d, expected 2 bytes or " + "more", len); + goto out; + } + + switch (len) { + case 8: + if ((*((__be64 *)lun) & + __constant_cpu_to_be64(0x0000FFFFFFFFFFFFLL)) != 0) + goto out_err; + break; + case 4: + if (*((__be16 *)&lun[2]) != 0) + goto out_err; + break; + case 6: + if (*((__be32 *)&lun[2]) != 0) + goto out_err; + break; + case 2: + break; + default: + goto out_err; + } + + addressing_method = (*lun) >> 6; /* highest two bits of byte 0 */ + switch (addressing_method) { + case SCSI_LUN_ADDR_METHOD_PERIPHERAL: + case SCSI_LUN_ADDR_METHOD_FLAT: + case SCSI_LUN_ADDR_METHOD_LUN: + res = *(lun + 1) | (((*lun) & 0x3f) << 8); + break; + + case SCSI_LUN_ADDR_METHOD_EXTENDED_LUN: + default: + printk(KERN_ERR "Unimplemented LUN addressing method %u", + addressing_method); + break; + } + +out: + return res; + +out_err: + printk(KERN_ERR "Support for multi-level LUNs has not yet been" + " implemented"); + goto out; +} + +static int srpt_check_stop_free(struct se_cmd *cmd) +{ + struct srpt_send_ioctx *ioctx = container_of(cmd, + struct srpt_send_ioctx, cmd); + + return target_put_sess_cmd(ioctx->ch->sess, &ioctx->cmd); +} + +/** + * srpt_handle_cmd() - Process SRP_CMD. + */ +static int srpt_handle_cmd(struct srpt_rdma_ch *ch, + struct srpt_recv_ioctx *recv_ioctx, + struct srpt_send_ioctx *send_ioctx) +{ + struct se_cmd *cmd; + struct srp_cmd *srp_cmd; + uint64_t unpacked_lun; + u64 data_len; + enum dma_data_direction dir; + sense_reason_t ret; + int rc; + + BUG_ON(!send_ioctx); + + srp_cmd = recv_ioctx->ioctx.buf; + cmd = &send_ioctx->cmd; + send_ioctx->tag = srp_cmd->tag; + + switch (srp_cmd->task_attr) { + case SRP_CMD_SIMPLE_Q: + cmd->sam_task_attr = MSG_SIMPLE_TAG; + break; + case SRP_CMD_ORDERED_Q: + default: + cmd->sam_task_attr = MSG_ORDERED_TAG; + break; + case SRP_CMD_HEAD_OF_Q: + cmd->sam_task_attr = MSG_HEAD_TAG; + break; + case SRP_CMD_ACA: + cmd->sam_task_attr = MSG_ACA_TAG; + break; + } + + if (srpt_get_desc_tbl(send_ioctx, srp_cmd, &dir, &data_len)) { + printk(KERN_ERR "0x%llx: parsing SRP descriptor table failed.\n", + srp_cmd->tag); + ret = TCM_INVALID_CDB_FIELD; + goto send_sense; + } + + unpacked_lun = srpt_unpack_lun((uint8_t *)&srp_cmd->lun, + sizeof(srp_cmd->lun)); + rc = target_submit_cmd(cmd, ch->sess, srp_cmd->cdb, + &send_ioctx->sense_data[0], unpacked_lun, data_len, + MSG_SIMPLE_TAG, dir, TARGET_SCF_ACK_KREF); + if (rc != 0) { + ret = TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE; + goto send_sense; + } + return 0; + +send_sense: + transport_send_check_condition_and_sense(cmd, ret, 0); + return -1; +} + +/** + * srpt_rx_mgmt_fn_tag() - Process a task management function by tag. + * @ch: RDMA channel of the task management request. + * @fn: Task management function to perform. + * @req_tag: Tag of the SRP task management request. + * @mgmt_ioctx: I/O context of the task management request. + * + * Returns zero if the target core will process the task management + * request asynchronously. + * + * Note: It is assumed that the initiator serializes tag-based task management + * requests. + */ +static int srpt_rx_mgmt_fn_tag(struct srpt_send_ioctx *ioctx, u64 tag) +{ + struct srpt_device *sdev; + struct srpt_rdma_ch *ch; + struct srpt_send_ioctx *target; + int ret, i; + + ret = -EINVAL; + ch = ioctx->ch; + BUG_ON(!ch); + BUG_ON(!ch->sport); + sdev = ch->sport->sdev; + BUG_ON(!sdev); + spin_lock_irq(&sdev->spinlock); + for (i = 0; i < ch->rq_size; ++i) { + target = ch->ioctx_ring[i]; + if (target->cmd.se_lun == ioctx->cmd.se_lun && + target->tag == tag && + srpt_get_cmd_state(target) != SRPT_STATE_DONE) { + ret = 0; + /* now let the target core abort &target->cmd; */ + break; + } + } + spin_unlock_irq(&sdev->spinlock); + return ret; +} + +static int srp_tmr_to_tcm(int fn) +{ + switch (fn) { + case SRP_TSK_ABORT_TASK: + return TMR_ABORT_TASK; + case SRP_TSK_ABORT_TASK_SET: + return TMR_ABORT_TASK_SET; + case SRP_TSK_CLEAR_TASK_SET: + return TMR_CLEAR_TASK_SET; + case SRP_TSK_LUN_RESET: + return TMR_LUN_RESET; + case SRP_TSK_CLEAR_ACA: + return TMR_CLEAR_ACA; + default: + return -1; + } +} + +/** + * srpt_handle_tsk_mgmt() - Process an SRP_TSK_MGMT information unit. + * + * Returns 0 if and only if the request will be processed by the target core. + * + * For more information about SRP_TSK_MGMT information units, see also section + * 6.7 in the SRP r16a document. + */ +static void srpt_handle_tsk_mgmt(struct srpt_rdma_ch *ch, + struct srpt_recv_ioctx *recv_ioctx, + struct srpt_send_ioctx *send_ioctx) +{ + struct srp_tsk_mgmt *srp_tsk; + struct se_cmd *cmd; + struct se_session *sess = ch->sess; + uint64_t unpacked_lun; + uint32_t tag = 0; + int tcm_tmr; + int rc; + + BUG_ON(!send_ioctx); + + srp_tsk = recv_ioctx->ioctx.buf; + cmd = &send_ioctx->cmd; + + pr_debug("recv tsk_mgmt fn %d for task_tag %lld and cmd tag %lld" + " cm_id %p sess %p\n", srp_tsk->tsk_mgmt_func, + srp_tsk->task_tag, srp_tsk->tag, ch->cm_id, ch->sess); + + srpt_set_cmd_state(send_ioctx, SRPT_STATE_MGMT); + send_ioctx->tag = srp_tsk->tag; + tcm_tmr = srp_tmr_to_tcm(srp_tsk->tsk_mgmt_func); + if (tcm_tmr < 0) { + send_ioctx->cmd.se_tmr_req->response = + TMR_TASK_MGMT_FUNCTION_NOT_SUPPORTED; + goto fail; + } + unpacked_lun = srpt_unpack_lun((uint8_t *)&srp_tsk->lun, + sizeof(srp_tsk->lun)); + + if (srp_tsk->tsk_mgmt_func == SRP_TSK_ABORT_TASK) { + rc = srpt_rx_mgmt_fn_tag(send_ioctx, srp_tsk->task_tag); + if (rc < 0) { + send_ioctx->cmd.se_tmr_req->response = + TMR_TASK_DOES_NOT_EXIST; + goto fail; + } + tag = srp_tsk->task_tag; + } + rc = target_submit_tmr(&send_ioctx->cmd, sess, NULL, unpacked_lun, + srp_tsk, tcm_tmr, GFP_KERNEL, tag, + TARGET_SCF_ACK_KREF); + if (rc != 0) { + send_ioctx->cmd.se_tmr_req->response = TMR_FUNCTION_REJECTED; + goto fail; + } + return; +fail: + transport_send_check_condition_and_sense(cmd, 0, 0); // XXX: +} + +/** + * srpt_handle_new_iu() - Process a newly received information unit. + * @ch: RDMA channel through which the information unit has been received. + * @ioctx: SRPT I/O context associated with the information unit. + */ +static void srpt_handle_new_iu(struct srpt_rdma_ch *ch, + struct srpt_recv_ioctx *recv_ioctx, + struct srpt_send_ioctx *send_ioctx) +{ + struct srp_cmd *srp_cmd; + enum rdma_ch_state ch_state; + + BUG_ON(!ch); + BUG_ON(!recv_ioctx); + + ib_dma_sync_single_for_cpu(ch->sport->sdev->device, + recv_ioctx->ioctx.dma, srp_max_req_size, + DMA_FROM_DEVICE); + + ch_state = srpt_get_ch_state(ch); + if (unlikely(ch_state == CH_CONNECTING)) { + list_add_tail(&recv_ioctx->wait_list, &ch->cmd_wait_list); + goto out; + } + + if (unlikely(ch_state != CH_LIVE)) + goto out; + + srp_cmd = recv_ioctx->ioctx.buf; + if (srp_cmd->opcode == SRP_CMD || srp_cmd->opcode == SRP_TSK_MGMT) { + if (!send_ioctx) + send_ioctx = srpt_get_send_ioctx(ch); + if (unlikely(!send_ioctx)) { + list_add_tail(&recv_ioctx->wait_list, + &ch->cmd_wait_list); + goto out; + } + } + + switch (srp_cmd->opcode) { + case SRP_CMD: + srpt_handle_cmd(ch, recv_ioctx, send_ioctx); + break; + case SRP_TSK_MGMT: + srpt_handle_tsk_mgmt(ch, recv_ioctx, send_ioctx); + break; + case SRP_I_LOGOUT: + printk(KERN_ERR "Not yet implemented: SRP_I_LOGOUT\n"); + break; + case SRP_CRED_RSP: + pr_debug("received SRP_CRED_RSP\n"); + break; + case SRP_AER_RSP: + pr_debug("received SRP_AER_RSP\n"); + break; + case SRP_RSP: + printk(KERN_ERR "Received SRP_RSP\n"); + break; + default: + printk(KERN_ERR "received IU with unknown opcode 0x%x\n", + srp_cmd->opcode); + break; + } + + srpt_post_recv(ch->sport->sdev, recv_ioctx); +out: + return; +} + +static void srpt_process_rcv_completion(struct ib_cq *cq, + struct srpt_rdma_ch *ch, + struct ib_wc *wc) +{ + struct srpt_device *sdev = ch->sport->sdev; + struct srpt_recv_ioctx *ioctx; + u32 index; + + index = idx_from_wr_id(wc->wr_id); + if (wc->status == IB_WC_SUCCESS) { + int req_lim; + + req_lim = atomic_dec_return(&ch->req_lim); + if (unlikely(req_lim < 0)) + printk(KERN_ERR "req_lim = %d < 0\n", req_lim); + ioctx = sdev->ioctx_ring[index]; + srpt_handle_new_iu(ch, ioctx, NULL); + } else { + printk(KERN_INFO "receiving failed for idx %u with status %d\n", + index, wc->status); + } +} + +/** + * srpt_process_send_completion() - Process an IB send completion. + * + * Note: Although this has not yet been observed during tests, at least in + * theory it is possible that the srpt_get_send_ioctx() call invoked by + * srpt_handle_new_iu() fails. This is possible because the req_lim_delta + * value in each response is set to one, and it is possible that this response + * makes the initiator send a new request before the send completion for that + * response has been processed. This could e.g. happen if the call to + * srpt_put_send_iotcx() is delayed because of a higher priority interrupt or + * if IB retransmission causes generation of the send completion to be + * delayed. Incoming information units for which srpt_get_send_ioctx() fails + * are queued on cmd_wait_list. The code below processes these delayed + * requests one at a time. + */ +static void srpt_process_send_completion(struct ib_cq *cq, + struct srpt_rdma_ch *ch, + struct ib_wc *wc) +{ + struct srpt_send_ioctx *send_ioctx; + uint32_t index; + enum srpt_opcode opcode; + + index = idx_from_wr_id(wc->wr_id); + opcode = opcode_from_wr_id(wc->wr_id); + send_ioctx = ch->ioctx_ring[index]; + if (wc->status == IB_WC_SUCCESS) { + if (opcode == SRPT_SEND) + srpt_handle_send_comp(ch, send_ioctx); + else { + WARN_ON(opcode != SRPT_RDMA_ABORT && + wc->opcode != IB_WC_RDMA_READ); + srpt_handle_rdma_comp(ch, send_ioctx, opcode); + } + } else { + if (opcode == SRPT_SEND) { + printk(KERN_INFO "sending response for idx %u failed" + " with status %d\n", index, wc->status); + srpt_handle_send_err_comp(ch, wc->wr_id); + } else if (opcode != SRPT_RDMA_MID) { + printk(KERN_INFO "RDMA t %d for idx %u failed with" + " status %d", opcode, index, wc->status); + srpt_handle_rdma_err_comp(ch, send_ioctx, opcode); + } + } + + while (unlikely(opcode == SRPT_SEND + && !list_empty(&ch->cmd_wait_list) + && srpt_get_ch_state(ch) == CH_LIVE + && (send_ioctx = srpt_get_send_ioctx(ch)) != NULL)) { + struct srpt_recv_ioctx *recv_ioctx; + + recv_ioctx = list_first_entry(&ch->cmd_wait_list, + struct srpt_recv_ioctx, + wait_list); + list_del(&recv_ioctx->wait_list); + srpt_handle_new_iu(ch, recv_ioctx, send_ioctx); + } +} + +static void srpt_process_completion(struct ib_cq *cq, struct srpt_rdma_ch *ch) +{ + struct ib_wc *const wc = ch->wc; + int i, n; + + WARN_ON(cq != ch->cq); + + ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); + while ((n = ib_poll_cq(cq, ARRAY_SIZE(ch->wc), wc)) > 0) { + for (i = 0; i < n; i++) { + if (opcode_from_wr_id(wc[i].wr_id) == SRPT_RECV) + srpt_process_rcv_completion(cq, ch, &wc[i]); + else + srpt_process_send_completion(cq, ch, &wc[i]); + } + } +} + +/** + * srpt_completion() - IB completion queue callback function. + * + * Notes: + * - It is guaranteed that a completion handler will never be invoked + * concurrently on two different CPUs for the same completion queue. See also + * Documentation/infiniband/core_locking.txt and the implementation of + * handle_edge_irq() in kernel/irq/chip.c. + * - When threaded IRQs are enabled, completion handlers are invoked in thread + * context instead of interrupt context. + */ +static void srpt_completion(struct ib_cq *cq, void *ctx) +{ + struct srpt_rdma_ch *ch = ctx; + + wake_up_interruptible(&ch->wait_queue); +} + +static int srpt_compl_thread(void *arg) +{ + struct srpt_rdma_ch *ch; + + /* Hibernation / freezing of the SRPT kernel thread is not supported. */ + current->flags |= PF_NOFREEZE; + + ch = arg; + BUG_ON(!ch); + printk(KERN_INFO "Session %s: kernel thread %s (PID %d) started\n", + ch->sess_name, ch->thread->comm, current->pid); + while (!kthread_should_stop()) { + wait_event_interruptible(ch->wait_queue, + (srpt_process_completion(ch->cq, ch), + kthread_should_stop())); + } + printk(KERN_INFO "Session %s: kernel thread %s (PID %d) stopped\n", + ch->sess_name, ch->thread->comm, current->pid); + return 0; +} + +/** + * srpt_create_ch_ib() - Create receive and send completion queues. + */ +static int srpt_create_ch_ib(struct srpt_rdma_ch *ch) +{ + struct ib_qp_init_attr *qp_init; + struct srpt_port *sport = ch->sport; + struct srpt_device *sdev = sport->sdev; + u32 srp_sq_size = sport->port_attrib.srp_sq_size; + int ret; + + WARN_ON(ch->rq_size < 1); + + ret = -ENOMEM; + qp_init = kzalloc(sizeof *qp_init, GFP_KERNEL); + if (!qp_init) + goto out; + +retry: + ch->cq = ib_create_cq(sdev->device, srpt_completion, NULL, ch, + ch->rq_size + srp_sq_size, 0); + if (IS_ERR(ch->cq)) { + ret = PTR_ERR(ch->cq); + printk(KERN_ERR "failed to create CQ cqe= %d ret= %d\n", + ch->rq_size + srp_sq_size, ret); + goto out; + } + + qp_init->qp_context = (void *)ch; + qp_init->event_handler + = (void(*)(struct ib_event *, void*))srpt_qp_event; + qp_init->send_cq = ch->cq; + qp_init->recv_cq = ch->cq; + qp_init->srq = sdev->srq; + qp_init->sq_sig_type = IB_SIGNAL_REQ_WR; + qp_init->qp_type = IB_QPT_RC; + qp_init->cap.max_send_wr = srp_sq_size; + qp_init->cap.max_send_sge = SRPT_DEF_SG_PER_WQE; + + ch->qp = ib_create_qp(sdev->pd, qp_init); + if (IS_ERR(ch->qp)) { + ret = PTR_ERR(ch->qp); + if (ret == -ENOMEM) { + srp_sq_size /= 2; + if (srp_sq_size >= MIN_SRPT_SQ_SIZE) { + ib_destroy_cq(ch->cq); + goto retry; + } + } + printk(KERN_ERR "failed to create_qp ret= %d\n", ret); + goto err_destroy_cq; + } + + atomic_set(&ch->sq_wr_avail, qp_init->cap.max_send_wr); + + pr_debug("%s: max_cqe= %d max_sge= %d sq_size = %d cm_id= %p\n", + __func__, ch->cq->cqe, qp_init->cap.max_send_sge, + qp_init->cap.max_send_wr, ch->cm_id); + + ret = srpt_init_ch_qp(ch, ch->qp); + if (ret) + goto err_destroy_qp; + + init_waitqueue_head(&ch->wait_queue); + + pr_debug("creating thread for session %s\n", ch->sess_name); + + ch->thread = kthread_run(srpt_compl_thread, ch, "ib_srpt_compl"); + if (IS_ERR(ch->thread)) { + printk(KERN_ERR "failed to create kernel thread %ld\n", + PTR_ERR(ch->thread)); + ch->thread = NULL; + goto err_destroy_qp; + } + +out: + kfree(qp_init); + return ret; + +err_destroy_qp: + ib_destroy_qp(ch->qp); +err_destroy_cq: + ib_destroy_cq(ch->cq); + goto out; +} + +static void srpt_destroy_ch_ib(struct srpt_rdma_ch *ch) +{ + if (ch->thread) + kthread_stop(ch->thread); + + ib_destroy_qp(ch->qp); + ib_destroy_cq(ch->cq); +} + +/** + * __srpt_close_ch() - Close an RDMA channel by setting the QP error state. + * + * Reset the QP and make sure all resources associated with the channel will + * be deallocated at an appropriate time. + * + * Note: The caller must hold ch->sport->sdev->spinlock. + */ +static void __srpt_close_ch(struct srpt_rdma_ch *ch) +{ + struct srpt_device *sdev; + enum rdma_ch_state prev_state; + unsigned long flags; + + sdev = ch->sport->sdev; + + spin_lock_irqsave(&ch->spinlock, flags); + prev_state = ch->state; + switch (prev_state) { + case CH_CONNECTING: + case CH_LIVE: + ch->state = CH_DISCONNECTING; + break; + default: + break; + } + spin_unlock_irqrestore(&ch->spinlock, flags); + + switch (prev_state) { + case CH_CONNECTING: + ib_send_cm_rej(ch->cm_id, IB_CM_REJ_NO_RESOURCES, NULL, 0, + NULL, 0); + /* fall through */ + case CH_LIVE: + if (ib_send_cm_dreq(ch->cm_id, NULL, 0) < 0) + printk(KERN_ERR "sending CM DREQ failed.\n"); + break; + case CH_DISCONNECTING: + break; + case CH_DRAINING: + case CH_RELEASING: + break; + } +} + +/** + * srpt_close_ch() - Close an RDMA channel. + */ +static void srpt_close_ch(struct srpt_rdma_ch *ch) +{ + struct srpt_device *sdev; + + sdev = ch->sport->sdev; + spin_lock_irq(&sdev->spinlock); + __srpt_close_ch(ch); + spin_unlock_irq(&sdev->spinlock); +} + +/** + * srpt_shutdown_session() - Whether or not a session may be shut down. + */ +static int srpt_shutdown_session(struct se_session *se_sess) +{ + struct srpt_rdma_ch *ch = se_sess->fabric_sess_ptr; + unsigned long flags; + + spin_lock_irqsave(&ch->spinlock, flags); + if (ch->in_shutdown) { + spin_unlock_irqrestore(&ch->spinlock, flags); + return true; + } + + ch->in_shutdown = true; + target_sess_cmd_list_set_waiting(se_sess); + spin_unlock_irqrestore(&ch->spinlock, flags); + + return true; +} + +/** + * srpt_drain_channel() - Drain a channel by resetting the IB queue pair. + * @cm_id: Pointer to the CM ID of the channel to be drained. + * + * Note: Must be called from inside srpt_cm_handler to avoid a race between + * accessing sdev->spinlock and the call to kfree(sdev) in srpt_remove_one() + * (the caller of srpt_cm_handler holds the cm_id spinlock; srpt_remove_one() + * waits until all target sessions for the associated IB device have been + * unregistered and target session registration involves a call to + * ib_destroy_cm_id(), which locks the cm_id spinlock and hence waits until + * this function has finished). + */ +static void srpt_drain_channel(struct ib_cm_id *cm_id) +{ + struct srpt_device *sdev; + struct srpt_rdma_ch *ch; + int ret; + bool do_reset = false; + + WARN_ON_ONCE(irqs_disabled()); + + sdev = cm_id->context; + BUG_ON(!sdev); + spin_lock_irq(&sdev->spinlock); + list_for_each_entry(ch, &sdev->rch_list, list) { + if (ch->cm_id == cm_id) { + do_reset = srpt_test_and_set_ch_state(ch, + CH_CONNECTING, CH_DRAINING) || + srpt_test_and_set_ch_state(ch, + CH_LIVE, CH_DRAINING) || + srpt_test_and_set_ch_state(ch, + CH_DISCONNECTING, CH_DRAINING); + break; + } + } + spin_unlock_irq(&sdev->spinlock); + + if (do_reset) { + if (ch->sess) + srpt_shutdown_session(ch->sess); + + ret = srpt_ch_qp_err(ch); + if (ret < 0) + printk(KERN_ERR "Setting queue pair in error state" + " failed: %d\n", ret); + } +} + +/** + * srpt_find_channel() - Look up an RDMA channel. + * @cm_id: Pointer to the CM ID of the channel to be looked up. + * + * Return NULL if no matching RDMA channel has been found. + */ +static struct srpt_rdma_ch *srpt_find_channel(struct srpt_device *sdev, + struct ib_cm_id *cm_id) +{ + struct srpt_rdma_ch *ch; + bool found; + + WARN_ON_ONCE(irqs_disabled()); + BUG_ON(!sdev); + + found = false; + spin_lock_irq(&sdev->spinlock); + list_for_each_entry(ch, &sdev->rch_list, list) { + if (ch->cm_id == cm_id) { + found = true; + break; + } + } + spin_unlock_irq(&sdev->spinlock); + + return found ? ch : NULL; +} + +/** + * srpt_release_channel() - Release channel resources. + * + * Schedules the actual release because: + * - Calling the ib_destroy_cm_id() call from inside an IB CM callback would + * trigger a deadlock. + * - It is not safe to call TCM transport_* functions from interrupt context. + */ +static void srpt_release_channel(struct srpt_rdma_ch *ch) +{ + schedule_work(&ch->release_work); +} + +static void srpt_release_channel_work(struct work_struct *w) +{ + struct srpt_rdma_ch *ch; + struct srpt_device *sdev; + struct se_session *se_sess; + + ch = container_of(w, struct srpt_rdma_ch, release_work); + pr_debug("ch = %p; ch->sess = %p; release_done = %p\n", ch, ch->sess, + ch->release_done); + + sdev = ch->sport->sdev; + BUG_ON(!sdev); + + se_sess = ch->sess; + BUG_ON(!se_sess); + + target_wait_for_sess_cmds(se_sess); + + transport_deregister_session_configfs(se_sess); + transport_deregister_session(se_sess); + ch->sess = NULL; + + ib_destroy_cm_id(ch->cm_id); + + srpt_destroy_ch_ib(ch); + + srpt_free_ioctx_ring((struct srpt_ioctx **)ch->ioctx_ring, + ch->sport->sdev, ch->rq_size, + ch->rsp_size, DMA_TO_DEVICE); + + spin_lock_irq(&sdev->spinlock); + list_del(&ch->list); + spin_unlock_irq(&sdev->spinlock); + + if (ch->release_done) + complete(ch->release_done); + + wake_up(&sdev->ch_releaseQ); + + kfree(ch); +} + +static struct srpt_node_acl *__srpt_lookup_acl(struct srpt_port *sport, + u8 i_port_id[16]) +{ + struct srpt_node_acl *nacl; + + list_for_each_entry(nacl, &sport->port_acl_list, list) + if (memcmp(nacl->i_port_id, i_port_id, + sizeof(nacl->i_port_id)) == 0) + return nacl; + + return NULL; +} + +static struct srpt_node_acl *srpt_lookup_acl(struct srpt_port *sport, + u8 i_port_id[16]) +{ + struct srpt_node_acl *nacl; + + spin_lock_irq(&sport->port_acl_lock); + nacl = __srpt_lookup_acl(sport, i_port_id); + spin_unlock_irq(&sport->port_acl_lock); + + return nacl; +} + +/** + * srpt_cm_req_recv() - Process the event IB_CM_REQ_RECEIVED. + * + * Ownership of the cm_id is transferred to the target session if this + * functions returns zero. Otherwise the caller remains the owner of cm_id. + */ +static int srpt_cm_req_recv(struct ib_cm_id *cm_id, + struct ib_cm_req_event_param *param, + void *private_data) +{ + struct srpt_device *sdev = cm_id->context; + struct srpt_port *sport = &sdev->port[param->port - 1]; + struct srp_login_req *req; + struct srp_login_rsp *rsp; + struct srp_login_rej *rej; + struct ib_cm_rep_param *rep_param; + struct srpt_rdma_ch *ch, *tmp_ch; + struct srpt_node_acl *nacl; + u32 it_iu_len; + int i; + int ret = 0; + + WARN_ON_ONCE(irqs_disabled()); + + if (WARN_ON(!sdev || !private_data)) + return -EINVAL; + + req = (struct srp_login_req *)private_data; + + it_iu_len = be32_to_cpu(req->req_it_iu_len); + + printk(KERN_INFO "Received SRP_LOGIN_REQ with i_port_id 0x%llx:0x%llx," + " t_port_id 0x%llx:0x%llx and it_iu_len %d on port %d" + " (guid=0x%llx:0x%llx)\n", + be64_to_cpu(*(__be64 *)&req->initiator_port_id[0]), + be64_to_cpu(*(__be64 *)&req->initiator_port_id[8]), + be64_to_cpu(*(__be64 *)&req->target_port_id[0]), + be64_to_cpu(*(__be64 *)&req->target_port_id[8]), + it_iu_len, + param->port, + be64_to_cpu(*(__be64 *)&sdev->port[param->port - 1].gid.raw[0]), + be64_to_cpu(*(__be64 *)&sdev->port[param->port - 1].gid.raw[8])); + + rsp = kzalloc(sizeof *rsp, GFP_KERNEL); + rej = kzalloc(sizeof *rej, GFP_KERNEL); + rep_param = kzalloc(sizeof *rep_param, GFP_KERNEL); + + if (!rsp || !rej || !rep_param) { + ret = -ENOMEM; + goto out; + } + + if (it_iu_len > srp_max_req_size || it_iu_len < 64) { + rej->reason = __constant_cpu_to_be32( + SRP_LOGIN_REJ_REQ_IT_IU_LENGTH_TOO_LARGE); + ret = -EINVAL; + printk(KERN_ERR "rejected SRP_LOGIN_REQ because its" + " length (%d bytes) is out of range (%d .. %d)\n", + it_iu_len, 64, srp_max_req_size); + goto reject; + } + + if (!sport->enabled) { + rej->reason = __constant_cpu_to_be32( + SRP_LOGIN_REJ_INSUFFICIENT_RESOURCES); + ret = -EINVAL; + printk(KERN_ERR "rejected SRP_LOGIN_REQ because the target port" + " has not yet been enabled\n"); + goto reject; + } + + if ((req->req_flags & SRP_MTCH_ACTION) == SRP_MULTICHAN_SINGLE) { + rsp->rsp_flags = SRP_LOGIN_RSP_MULTICHAN_NO_CHAN; + + spin_lock_irq(&sdev->spinlock); + + list_for_each_entry_safe(ch, tmp_ch, &sdev->rch_list, list) { + if (!memcmp(ch->i_port_id, req->initiator_port_id, 16) + && !memcmp(ch->t_port_id, req->target_port_id, 16) + && param->port == ch->sport->port + && param->listen_id == ch->sport->sdev->cm_id + && ch->cm_id) { + enum rdma_ch_state ch_state; + + ch_state = srpt_get_ch_state(ch); + if (ch_state != CH_CONNECTING + && ch_state != CH_LIVE) + continue; + + /* found an existing channel */ + pr_debug("Found existing channel %s" + " cm_id= %p state= %d\n", + ch->sess_name, ch->cm_id, ch_state); + + __srpt_close_ch(ch); + + rsp->rsp_flags = + SRP_LOGIN_RSP_MULTICHAN_TERMINATED; + } + } + + spin_unlock_irq(&sdev->spinlock); + + } else + rsp->rsp_flags = SRP_LOGIN_RSP_MULTICHAN_MAINTAINED; + + if (*(__be64 *)req->target_port_id != cpu_to_be64(srpt_service_guid) + || *(__be64 *)(req->target_port_id + 8) != + cpu_to_be64(srpt_service_guid)) { + rej->reason = __constant_cpu_to_be32( + SRP_LOGIN_REJ_UNABLE_ASSOCIATE_CHANNEL); + ret = -ENOMEM; + printk(KERN_ERR "rejected SRP_LOGIN_REQ because it" + " has an invalid target port identifier.\n"); + goto reject; + } + + ch = kzalloc(sizeof *ch, GFP_KERNEL); + if (!ch) { + rej->reason = __constant_cpu_to_be32( + SRP_LOGIN_REJ_INSUFFICIENT_RESOURCES); + printk(KERN_ERR "rejected SRP_LOGIN_REQ because no memory.\n"); + ret = -ENOMEM; + goto reject; + } + + INIT_WORK(&ch->release_work, srpt_release_channel_work); + memcpy(ch->i_port_id, req->initiator_port_id, 16); + memcpy(ch->t_port_id, req->target_port_id, 16); + ch->sport = &sdev->port[param->port - 1]; + ch->cm_id = cm_id; + /* + * Avoid QUEUE_FULL conditions by limiting the number of buffers used + * for the SRP protocol to the command queue size. + */ + ch->rq_size = SRPT_RQ_SIZE; + spin_lock_init(&ch->spinlock); + ch->state = CH_CONNECTING; + INIT_LIST_HEAD(&ch->cmd_wait_list); + ch->rsp_size = ch->sport->port_attrib.srp_max_rsp_size; + + ch->ioctx_ring = (struct srpt_send_ioctx **) + srpt_alloc_ioctx_ring(ch->sport->sdev, ch->rq_size, + sizeof(*ch->ioctx_ring[0]), + ch->rsp_size, DMA_TO_DEVICE); + if (!ch->ioctx_ring) + goto free_ch; + + INIT_LIST_HEAD(&ch->free_list); + for (i = 0; i < ch->rq_size; i++) { + ch->ioctx_ring[i]->ch = ch; + list_add_tail(&ch->ioctx_ring[i]->free_list, &ch->free_list); + } + + ret = srpt_create_ch_ib(ch); + if (ret) { + rej->reason = __constant_cpu_to_be32( + SRP_LOGIN_REJ_INSUFFICIENT_RESOURCES); + printk(KERN_ERR "rejected SRP_LOGIN_REQ because creating" + " a new RDMA channel failed.\n"); + goto free_ring; + } + + ret = srpt_ch_qp_rtr(ch, ch->qp); + if (ret) { + rej->reason = __constant_cpu_to_be32( + SRP_LOGIN_REJ_INSUFFICIENT_RESOURCES); + printk(KERN_ERR "rejected SRP_LOGIN_REQ because enabling" + " RTR failed (error code = %d)\n", ret); + goto destroy_ib; + } + /* + * Use the initator port identifier as the session name. + */ + snprintf(ch->sess_name, sizeof(ch->sess_name), "0x%016llx%016llx", + be64_to_cpu(*(__be64 *)ch->i_port_id), + be64_to_cpu(*(__be64 *)(ch->i_port_id + 8))); + + pr_debug("registering session %s\n", ch->sess_name); + + nacl = srpt_lookup_acl(sport, ch->i_port_id); + if (!nacl) { + printk(KERN_INFO "Rejected login because no ACL has been" + " configured yet for initiator %s.\n", ch->sess_name); + rej->reason = __constant_cpu_to_be32( + SRP_LOGIN_REJ_CHANNEL_LIMIT_REACHED); + goto destroy_ib; + } + + ch->sess = transport_init_session(TARGET_PROT_NORMAL); + if (IS_ERR(ch->sess)) { + rej->reason = __constant_cpu_to_be32( + SRP_LOGIN_REJ_INSUFFICIENT_RESOURCES); + pr_debug("Failed to create session\n"); + goto deregister_session; + } + ch->sess->se_node_acl = &nacl->nacl; + transport_register_session(&sport->port_tpg_1, &nacl->nacl, ch->sess, ch); + + pr_debug("Establish connection sess=%p name=%s cm_id=%p\n", ch->sess, + ch->sess_name, ch->cm_id); + + /* create srp_login_response */ + rsp->opcode = SRP_LOGIN_RSP; + rsp->tag = req->tag; + rsp->max_it_iu_len = req->req_it_iu_len; + rsp->max_ti_iu_len = req->req_it_iu_len; + ch->max_ti_iu_len = it_iu_len; + rsp->buf_fmt = __constant_cpu_to_be16(SRP_BUF_FORMAT_DIRECT + | SRP_BUF_FORMAT_INDIRECT); + rsp->req_lim_delta = cpu_to_be32(ch->rq_size); + atomic_set(&ch->req_lim, ch->rq_size); + atomic_set(&ch->req_lim_delta, 0); + + /* create cm reply */ + rep_param->qp_num = ch->qp->qp_num; + rep_param->private_data = (void *)rsp; + rep_param->private_data_len = sizeof *rsp; + rep_param->rnr_retry_count = 7; + rep_param->flow_control = 1; + rep_param->failover_accepted = 0; + rep_param->srq = 1; + rep_param->responder_resources = 4; + rep_param->initiator_depth = 4; + + ret = ib_send_cm_rep(cm_id, rep_param); + if (ret) { + printk(KERN_ERR "sending SRP_LOGIN_REQ response failed" + " (error code = %d)\n", ret); + goto release_channel; + } + + spin_lock_irq(&sdev->spinlock); + list_add_tail(&ch->list, &sdev->rch_list); + spin_unlock_irq(&sdev->spinlock); + + goto out; + +release_channel: + srpt_set_ch_state(ch, CH_RELEASING); + transport_deregister_session_configfs(ch->sess); + +deregister_session: + transport_deregister_session(ch->sess); + ch->sess = NULL; + +destroy_ib: + srpt_destroy_ch_ib(ch); + +free_ring: + srpt_free_ioctx_ring((struct srpt_ioctx **)ch->ioctx_ring, + ch->sport->sdev, ch->rq_size, + ch->rsp_size, DMA_TO_DEVICE); +free_ch: + kfree(ch); + +reject: + rej->opcode = SRP_LOGIN_REJ; + rej->tag = req->tag; + rej->buf_fmt = __constant_cpu_to_be16(SRP_BUF_FORMAT_DIRECT + | SRP_BUF_FORMAT_INDIRECT); + + ib_send_cm_rej(cm_id, IB_CM_REJ_CONSUMER_DEFINED, NULL, 0, + (void *)rej, sizeof *rej); + +out: + kfree(rep_param); + kfree(rsp); + kfree(rej); + + return ret; +} + +static void srpt_cm_rej_recv(struct ib_cm_id *cm_id) +{ + printk(KERN_INFO "Received IB REJ for cm_id %p.\n", cm_id); + srpt_drain_channel(cm_id); +} + +/** + * srpt_cm_rtu_recv() - Process an IB_CM_RTU_RECEIVED or USER_ESTABLISHED event. + * + * An IB_CM_RTU_RECEIVED message indicates that the connection is established + * and that the recipient may begin transmitting (RTU = ready to use). + */ +static void srpt_cm_rtu_recv(struct ib_cm_id *cm_id) +{ + struct srpt_rdma_ch *ch; + int ret; + + ch = srpt_find_channel(cm_id->context, cm_id); + BUG_ON(!ch); + + if (srpt_test_and_set_ch_state(ch, CH_CONNECTING, CH_LIVE)) { + struct srpt_recv_ioctx *ioctx, *ioctx_tmp; + + ret = srpt_ch_qp_rts(ch, ch->qp); + + list_for_each_entry_safe(ioctx, ioctx_tmp, &ch->cmd_wait_list, + wait_list) { + list_del(&ioctx->wait_list); + srpt_handle_new_iu(ch, ioctx, NULL); + } + if (ret) + srpt_close_ch(ch); + } +} + +static void srpt_cm_timewait_exit(struct ib_cm_id *cm_id) +{ + printk(KERN_INFO "Received IB TimeWait exit for cm_id %p.\n", cm_id); + srpt_drain_channel(cm_id); +} + +static void srpt_cm_rep_error(struct ib_cm_id *cm_id) +{ + printk(KERN_INFO "Received IB REP error for cm_id %p.\n", cm_id); + srpt_drain_channel(cm_id); +} + +/** + * srpt_cm_dreq_recv() - Process reception of a DREQ message. + */ +static void srpt_cm_dreq_recv(struct ib_cm_id *cm_id) +{ + struct srpt_rdma_ch *ch; + unsigned long flags; + bool send_drep = false; + + ch = srpt_find_channel(cm_id->context, cm_id); + BUG_ON(!ch); + + pr_debug("cm_id= %p ch->state= %d\n", cm_id, srpt_get_ch_state(ch)); + + spin_lock_irqsave(&ch->spinlock, flags); + switch (ch->state) { + case CH_CONNECTING: + case CH_LIVE: + send_drep = true; + ch->state = CH_DISCONNECTING; + break; + case CH_DISCONNECTING: + case CH_DRAINING: + case CH_RELEASING: + WARN(true, "unexpected channel state %d\n", ch->state); + break; + } + spin_unlock_irqrestore(&ch->spinlock, flags); + + if (send_drep) { + if (ib_send_cm_drep(ch->cm_id, NULL, 0) < 0) + printk(KERN_ERR "Sending IB DREP failed.\n"); + printk(KERN_INFO "Received DREQ and sent DREP for session %s.\n", + ch->sess_name); + } +} + +/** + * srpt_cm_drep_recv() - Process reception of a DREP message. + */ +static void srpt_cm_drep_recv(struct ib_cm_id *cm_id) +{ + printk(KERN_INFO "Received InfiniBand DREP message for cm_id %p.\n", + cm_id); + srpt_drain_channel(cm_id); +} + +/** + * srpt_cm_handler() - IB connection manager callback function. + * + * A non-zero return value will cause the caller destroy the CM ID. + * + * Note: srpt_cm_handler() must only return a non-zero value when transferring + * ownership of the cm_id to a channel by srpt_cm_req_recv() failed. Returning + * a non-zero value in any other case will trigger a race with the + * ib_destroy_cm_id() call in srpt_release_channel(). + */ +static int srpt_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event) +{ + int ret; + + ret = 0; + switch (event->event) { + case IB_CM_REQ_RECEIVED: + ret = srpt_cm_req_recv(cm_id, &event->param.req_rcvd, + event->private_data); + break; + case IB_CM_REJ_RECEIVED: + srpt_cm_rej_recv(cm_id); + break; + case IB_CM_RTU_RECEIVED: + case IB_CM_USER_ESTABLISHED: + srpt_cm_rtu_recv(cm_id); + break; + case IB_CM_DREQ_RECEIVED: + srpt_cm_dreq_recv(cm_id); + break; + case IB_CM_DREP_RECEIVED: + srpt_cm_drep_recv(cm_id); + break; + case IB_CM_TIMEWAIT_EXIT: + srpt_cm_timewait_exit(cm_id); + break; + case IB_CM_REP_ERROR: + srpt_cm_rep_error(cm_id); + break; + case IB_CM_DREQ_ERROR: + printk(KERN_INFO "Received IB DREQ ERROR event.\n"); + break; + case IB_CM_MRA_RECEIVED: + printk(KERN_INFO "Received IB MRA event\n"); + break; + default: + printk(KERN_ERR "received unrecognized IB CM event %d\n", + event->event); + break; + } + + return ret; +} + +/** + * srpt_perform_rdmas() - Perform IB RDMA. + * + * Returns zero upon success or a negative number upon failure. + */ +static int srpt_perform_rdmas(struct srpt_rdma_ch *ch, + struct srpt_send_ioctx *ioctx) +{ + struct ib_send_wr wr; + struct ib_send_wr *bad_wr; + struct rdma_iu *riu; + int i; + int ret; + int sq_wr_avail; + enum dma_data_direction dir; + const int n_rdma = ioctx->n_rdma; + + dir = ioctx->cmd.data_direction; + if (dir == DMA_TO_DEVICE) { + /* write */ + ret = -ENOMEM; + sq_wr_avail = atomic_sub_return(n_rdma, &ch->sq_wr_avail); + if (sq_wr_avail < 0) { + printk(KERN_WARNING "IB send queue full (needed %d)\n", + n_rdma); + goto out; + } + } + + ioctx->rdma_aborted = false; + ret = 0; + riu = ioctx->rdma_ius; + memset(&wr, 0, sizeof wr); + + for (i = 0; i < n_rdma; ++i, ++riu) { + if (dir == DMA_FROM_DEVICE) { + wr.opcode = IB_WR_RDMA_WRITE; + wr.wr_id = encode_wr_id(i == n_rdma - 1 ? + SRPT_RDMA_WRITE_LAST : + SRPT_RDMA_MID, + ioctx->ioctx.index); + } else { + wr.opcode = IB_WR_RDMA_READ; + wr.wr_id = encode_wr_id(i == n_rdma - 1 ? + SRPT_RDMA_READ_LAST : + SRPT_RDMA_MID, + ioctx->ioctx.index); + } + wr.next = NULL; + wr.wr.rdma.remote_addr = riu->raddr; + wr.wr.rdma.rkey = riu->rkey; + wr.num_sge = riu->sge_cnt; + wr.sg_list = riu->sge; + + /* only get completion event for the last rdma write */ + if (i == (n_rdma - 1) && dir == DMA_TO_DEVICE) + wr.send_flags = IB_SEND_SIGNALED; + + ret = ib_post_send(ch->qp, &wr, &bad_wr); + if (ret) + break; + } + + if (ret) + printk(KERN_ERR "%s[%d]: ib_post_send() returned %d for %d/%d", + __func__, __LINE__, ret, i, n_rdma); + if (ret && i > 0) { + wr.num_sge = 0; + wr.wr_id = encode_wr_id(SRPT_RDMA_ABORT, ioctx->ioctx.index); + wr.send_flags = IB_SEND_SIGNALED; + while (ch->state == CH_LIVE && + ib_post_send(ch->qp, &wr, &bad_wr) != 0) { + printk(KERN_INFO "Trying to abort failed RDMA transfer [%d]", + ioctx->ioctx.index); + msleep(1000); + } + while (ch->state != CH_RELEASING && !ioctx->rdma_aborted) { + printk(KERN_INFO "Waiting until RDMA abort finished [%d]", + ioctx->ioctx.index); + msleep(1000); + } + } +out: + if (unlikely(dir == DMA_TO_DEVICE && ret < 0)) + atomic_add(n_rdma, &ch->sq_wr_avail); + return ret; +} + +/** + * srpt_xfer_data() - Start data transfer from initiator to target. + */ +static int srpt_xfer_data(struct srpt_rdma_ch *ch, + struct srpt_send_ioctx *ioctx) +{ + int ret; + + ret = srpt_map_sg_to_ib_sge(ch, ioctx); + if (ret) { + printk(KERN_ERR "%s[%d] ret=%d\n", __func__, __LINE__, ret); + goto out; + } + + ret = srpt_perform_rdmas(ch, ioctx); + if (ret) { + if (ret == -EAGAIN || ret == -ENOMEM) + printk(KERN_INFO "%s[%d] queue full -- ret=%d\n", + __func__, __LINE__, ret); + else + printk(KERN_ERR "%s[%d] fatal error -- ret=%d\n", + __func__, __LINE__, ret); + goto out_unmap; + } + +out: + return ret; +out_unmap: + srpt_unmap_sg_to_ib_sge(ch, ioctx); + goto out; +} + +static int srpt_write_pending_status(struct se_cmd *se_cmd) +{ + struct srpt_send_ioctx *ioctx; + + ioctx = container_of(se_cmd, struct srpt_send_ioctx, cmd); + return srpt_get_cmd_state(ioctx) == SRPT_STATE_NEED_DATA; +} + +/* + * srpt_write_pending() - Start data transfer from initiator to target (write). + */ +static int srpt_write_pending(struct se_cmd *se_cmd) +{ + struct srpt_rdma_ch *ch; + struct srpt_send_ioctx *ioctx; + enum srpt_command_state new_state; + enum rdma_ch_state ch_state; + int ret; + + ioctx = container_of(se_cmd, struct srpt_send_ioctx, cmd); + + new_state = srpt_set_cmd_state(ioctx, SRPT_STATE_NEED_DATA); + WARN_ON(new_state == SRPT_STATE_DONE); + + ch = ioctx->ch; + BUG_ON(!ch); + + ch_state = srpt_get_ch_state(ch); + switch (ch_state) { + case CH_CONNECTING: + WARN(true, "unexpected channel state %d\n", ch_state); + ret = -EINVAL; + goto out; + case CH_LIVE: + break; + case CH_DISCONNECTING: + case CH_DRAINING: + case CH_RELEASING: + pr_debug("cmd with tag %lld: channel disconnecting\n", + ioctx->tag); + srpt_set_cmd_state(ioctx, SRPT_STATE_DATA_IN); + ret = -EINVAL; + goto out; + } + ret = srpt_xfer_data(ch, ioctx); + +out: + return ret; +} + +static u8 tcm_to_srp_tsk_mgmt_status(const int tcm_mgmt_status) +{ + switch (tcm_mgmt_status) { + case TMR_FUNCTION_COMPLETE: + return SRP_TSK_MGMT_SUCCESS; + case TMR_FUNCTION_REJECTED: + return SRP_TSK_MGMT_FUNC_NOT_SUPP; + } + return SRP_TSK_MGMT_FAILED; +} + +/** + * srpt_queue_response() - Transmits the response to a SCSI command. + * + * Callback function called by the TCM core. Must not block since it can be + * invoked on the context of the IB completion handler. + */ +static void srpt_queue_response(struct se_cmd *cmd) +{ + struct srpt_rdma_ch *ch; + struct srpt_send_ioctx *ioctx; + enum srpt_command_state state; + unsigned long flags; + int ret; + enum dma_data_direction dir; + int resp_len; + u8 srp_tm_status; + + ioctx = container_of(cmd, struct srpt_send_ioctx, cmd); + ch = ioctx->ch; + BUG_ON(!ch); + + spin_lock_irqsave(&ioctx->spinlock, flags); + state = ioctx->state; + switch (state) { + case SRPT_STATE_NEW: + case SRPT_STATE_DATA_IN: + ioctx->state = SRPT_STATE_CMD_RSP_SENT; + break; + case SRPT_STATE_MGMT: + ioctx->state = SRPT_STATE_MGMT_RSP_SENT; + break; + default: + WARN(true, "ch %p; cmd %d: unexpected command state %d\n", + ch, ioctx->ioctx.index, ioctx->state); + break; + } + spin_unlock_irqrestore(&ioctx->spinlock, flags); + + if (unlikely(transport_check_aborted_status(&ioctx->cmd, false) + || WARN_ON_ONCE(state == SRPT_STATE_CMD_RSP_SENT))) { + atomic_inc(&ch->req_lim_delta); + srpt_abort_cmd(ioctx); + return; + } + + dir = ioctx->cmd.data_direction; + + /* For read commands, transfer the data to the initiator. */ + if (dir == DMA_FROM_DEVICE && ioctx->cmd.data_length && + !ioctx->queue_status_only) { + ret = srpt_xfer_data(ch, ioctx); + if (ret) { + printk(KERN_ERR "xfer_data failed for tag %llu\n", + ioctx->tag); + return; + } + } + + if (state != SRPT_STATE_MGMT) + resp_len = srpt_build_cmd_rsp(ch, ioctx, ioctx->tag, + cmd->scsi_status); + else { + srp_tm_status + = tcm_to_srp_tsk_mgmt_status(cmd->se_tmr_req->response); + resp_len = srpt_build_tskmgmt_rsp(ch, ioctx, srp_tm_status, + ioctx->tag); + } + ret = srpt_post_send(ch, ioctx, resp_len); + if (ret) { + printk(KERN_ERR "sending cmd response failed for tag %llu\n", + ioctx->tag); + srpt_unmap_sg_to_ib_sge(ch, ioctx); + srpt_set_cmd_state(ioctx, SRPT_STATE_DONE); + target_put_sess_cmd(ioctx->ch->sess, &ioctx->cmd); + } +} + +static int srpt_queue_data_in(struct se_cmd *cmd) +{ + srpt_queue_response(cmd); + return 0; +} + +static void srpt_queue_tm_rsp(struct se_cmd *cmd) +{ + srpt_queue_response(cmd); +} + +static void srpt_aborted_task(struct se_cmd *cmd) +{ + struct srpt_send_ioctx *ioctx = container_of(cmd, + struct srpt_send_ioctx, cmd); + + srpt_unmap_sg_to_ib_sge(ioctx->ch, ioctx); +} + +static int srpt_queue_status(struct se_cmd *cmd) +{ + struct srpt_send_ioctx *ioctx; + + ioctx = container_of(cmd, struct srpt_send_ioctx, cmd); + BUG_ON(ioctx->sense_data != cmd->sense_buffer); + if (cmd->se_cmd_flags & + (SCF_TRANSPORT_TASK_SENSE | SCF_EMULATED_TASK_SENSE)) + WARN_ON(cmd->scsi_status != SAM_STAT_CHECK_CONDITION); + ioctx->queue_status_only = true; + srpt_queue_response(cmd); + return 0; +} + +static void srpt_refresh_port_work(struct work_struct *work) +{ + struct srpt_port *sport = container_of(work, struct srpt_port, work); + + srpt_refresh_port(sport); +} + +static int srpt_ch_list_empty(struct srpt_device *sdev) +{ + int res; + + spin_lock_irq(&sdev->spinlock); + res = list_empty(&sdev->rch_list); + spin_unlock_irq(&sdev->spinlock); + + return res; +} + +/** + * srpt_release_sdev() - Free the channel resources associated with a target. + */ +static int srpt_release_sdev(struct srpt_device *sdev) +{ + struct srpt_rdma_ch *ch, *tmp_ch; + int res; + + WARN_ON_ONCE(irqs_disabled()); + + BUG_ON(!sdev); + + spin_lock_irq(&sdev->spinlock); + list_for_each_entry_safe(ch, tmp_ch, &sdev->rch_list, list) + __srpt_close_ch(ch); + spin_unlock_irq(&sdev->spinlock); + + res = wait_event_interruptible(sdev->ch_releaseQ, + srpt_ch_list_empty(sdev)); + if (res) + printk(KERN_ERR "%s: interrupted.\n", __func__); + + return 0; +} + +static struct srpt_port *__srpt_lookup_port(const char *name) +{ + struct ib_device *dev; + struct srpt_device *sdev; + struct srpt_port *sport; + int i; + + list_for_each_entry(sdev, &srpt_dev_list, list) { + dev = sdev->device; + if (!dev) + continue; + + for (i = 0; i < dev->phys_port_cnt; i++) { + sport = &sdev->port[i]; + + if (!strcmp(sport->port_guid, name)) + return sport; + } + } + + return NULL; +} + +static struct srpt_port *srpt_lookup_port(const char *name) +{ + struct srpt_port *sport; + + spin_lock(&srpt_dev_lock); + sport = __srpt_lookup_port(name); + spin_unlock(&srpt_dev_lock); + + return sport; +} + +/** + * srpt_add_one() - Infiniband device addition callback function. + */ +static void srpt_add_one(struct ib_device *device) +{ + struct srpt_device *sdev; + struct srpt_port *sport; + struct ib_srq_init_attr srq_attr; + int i; + + pr_debug("device = %p, device->dma_ops = %p\n", device, + device->dma_ops); + + sdev = kzalloc(sizeof *sdev, GFP_KERNEL); + if (!sdev) + goto err; + + sdev->device = device; + INIT_LIST_HEAD(&sdev->rch_list); + init_waitqueue_head(&sdev->ch_releaseQ); + spin_lock_init(&sdev->spinlock); + + if (ib_query_device(device, &sdev->dev_attr)) + goto free_dev; + + sdev->pd = ib_alloc_pd(device); + if (IS_ERR(sdev->pd)) + goto free_dev; + + sdev->mr = ib_get_dma_mr(sdev->pd, IB_ACCESS_LOCAL_WRITE); + if (IS_ERR(sdev->mr)) + goto err_pd; + + sdev->srq_size = min(srpt_srq_size, sdev->dev_attr.max_srq_wr); + + srq_attr.event_handler = srpt_srq_event; + srq_attr.srq_context = (void *)sdev; + srq_attr.attr.max_wr = sdev->srq_size; + srq_attr.attr.max_sge = 1; + srq_attr.attr.srq_limit = 0; + srq_attr.srq_type = IB_SRQT_BASIC; + + sdev->srq = ib_create_srq(sdev->pd, &srq_attr); + if (IS_ERR(sdev->srq)) + goto err_mr; + + pr_debug("%s: create SRQ #wr= %d max_allow=%d dev= %s\n", + __func__, sdev->srq_size, sdev->dev_attr.max_srq_wr, + device->name); + + if (!srpt_service_guid) + srpt_service_guid = be64_to_cpu(device->node_guid); + + sdev->cm_id = ib_create_cm_id(device, srpt_cm_handler, sdev); + if (IS_ERR(sdev->cm_id)) + goto err_srq; + + /* print out target login information */ + pr_debug("Target login info: id_ext=%016llx,ioc_guid=%016llx," + "pkey=ffff,service_id=%016llx\n", srpt_service_guid, + srpt_service_guid, srpt_service_guid); + + /* + * We do not have a consistent service_id (ie. also id_ext of target_id) + * to identify this target. We currently use the guid of the first HCA + * in the system as service_id; therefore, the target_id will change + * if this HCA is gone bad and replaced by different HCA + */ + if (ib_cm_listen(sdev->cm_id, cpu_to_be64(srpt_service_guid), 0, NULL)) + goto err_cm; + + INIT_IB_EVENT_HANDLER(&sdev->event_handler, sdev->device, + srpt_event_handler); + if (ib_register_event_handler(&sdev->event_handler)) + goto err_cm; + + sdev->ioctx_ring = (struct srpt_recv_ioctx **) + srpt_alloc_ioctx_ring(sdev, sdev->srq_size, + sizeof(*sdev->ioctx_ring[0]), + srp_max_req_size, DMA_FROM_DEVICE); + if (!sdev->ioctx_ring) + goto err_event; + + for (i = 0; i < sdev->srq_size; ++i) + srpt_post_recv(sdev, sdev->ioctx_ring[i]); + + WARN_ON(sdev->device->phys_port_cnt > ARRAY_SIZE(sdev->port)); + + for (i = 1; i <= sdev->device->phys_port_cnt; i++) { + sport = &sdev->port[i - 1]; + sport->sdev = sdev; + sport->port = i; + sport->port_attrib.srp_max_rdma_size = DEFAULT_MAX_RDMA_SIZE; + sport->port_attrib.srp_max_rsp_size = DEFAULT_MAX_RSP_SIZE; + sport->port_attrib.srp_sq_size = DEF_SRPT_SQ_SIZE; + INIT_WORK(&sport->work, srpt_refresh_port_work); + INIT_LIST_HEAD(&sport->port_acl_list); + spin_lock_init(&sport->port_acl_lock); + + if (srpt_refresh_port(sport)) { + printk(KERN_ERR "MAD registration failed for %s-%d.\n", + srpt_sdev_name(sdev), i); + goto err_ring; + } + snprintf(sport->port_guid, sizeof(sport->port_guid), + "0x%016llx%016llx", + be64_to_cpu(sport->gid.global.subnet_prefix), + be64_to_cpu(sport->gid.global.interface_id)); + } + + spin_lock(&srpt_dev_lock); + list_add_tail(&sdev->list, &srpt_dev_list); + spin_unlock(&srpt_dev_lock); + +out: + ib_set_client_data(device, &srpt_client, sdev); + pr_debug("added %s.\n", device->name); + return; + +err_ring: + srpt_free_ioctx_ring((struct srpt_ioctx **)sdev->ioctx_ring, sdev, + sdev->srq_size, srp_max_req_size, + DMA_FROM_DEVICE); +err_event: + ib_unregister_event_handler(&sdev->event_handler); +err_cm: + ib_destroy_cm_id(sdev->cm_id); +err_srq: + ib_destroy_srq(sdev->srq); +err_mr: + ib_dereg_mr(sdev->mr); +err_pd: + ib_dealloc_pd(sdev->pd); +free_dev: + kfree(sdev); +err: + sdev = NULL; + printk(KERN_INFO "%s(%s) failed.\n", __func__, device->name); + goto out; +} + +/** + * srpt_remove_one() - InfiniBand device removal callback function. + */ +static void srpt_remove_one(struct ib_device *device) +{ + struct srpt_device *sdev; + int i; + + sdev = ib_get_client_data(device, &srpt_client); + if (!sdev) { + printk(KERN_INFO "%s(%s): nothing to do.\n", __func__, + device->name); + return; + } + + srpt_unregister_mad_agent(sdev); + + ib_unregister_event_handler(&sdev->event_handler); + + /* Cancel any work queued by the just unregistered IB event handler. */ + for (i = 0; i < sdev->device->phys_port_cnt; i++) + cancel_work_sync(&sdev->port[i].work); + + ib_destroy_cm_id(sdev->cm_id); + + /* + * Unregistering a target must happen after destroying sdev->cm_id + * such that no new SRP_LOGIN_REQ information units can arrive while + * destroying the target. + */ + spin_lock(&srpt_dev_lock); + list_del(&sdev->list); + spin_unlock(&srpt_dev_lock); + srpt_release_sdev(sdev); + + ib_destroy_srq(sdev->srq); + ib_dereg_mr(sdev->mr); + ib_dealloc_pd(sdev->pd); + + srpt_free_ioctx_ring((struct srpt_ioctx **)sdev->ioctx_ring, sdev, + sdev->srq_size, srp_max_req_size, DMA_FROM_DEVICE); + sdev->ioctx_ring = NULL; + kfree(sdev); +} + +static struct ib_client srpt_client = { + .name = DRV_NAME, + .add = srpt_add_one, + .remove = srpt_remove_one +}; + +static int srpt_check_true(struct se_portal_group *se_tpg) +{ + return 1; +} + +static int srpt_check_false(struct se_portal_group *se_tpg) +{ + return 0; +} + +static char *srpt_get_fabric_name(void) +{ + return "srpt"; +} + +static u8 srpt_get_fabric_proto_ident(struct se_portal_group *se_tpg) +{ + return SCSI_TRANSPORTID_PROTOCOLID_SRP; +} + +static char *srpt_get_fabric_wwn(struct se_portal_group *tpg) +{ + struct srpt_port *sport = container_of(tpg, struct srpt_port, port_tpg_1); + + return sport->port_guid; +} + +static u16 srpt_get_tag(struct se_portal_group *tpg) +{ + return 1; +} + +static u32 srpt_get_default_depth(struct se_portal_group *se_tpg) +{ + return 1; +} + +static u32 srpt_get_pr_transport_id(struct se_portal_group *se_tpg, + struct se_node_acl *se_nacl, + struct t10_pr_registration *pr_reg, + int *format_code, unsigned char *buf) +{ + struct srpt_node_acl *nacl; + struct spc_rdma_transport_id *tr_id; + + nacl = container_of(se_nacl, struct srpt_node_acl, nacl); + tr_id = (void *)buf; + tr_id->protocol_identifier = SCSI_TRANSPORTID_PROTOCOLID_SRP; + memcpy(tr_id->i_port_id, nacl->i_port_id, sizeof(tr_id->i_port_id)); + return sizeof(*tr_id); +} + +static u32 srpt_get_pr_transport_id_len(struct se_portal_group *se_tpg, + struct se_node_acl *se_nacl, + struct t10_pr_registration *pr_reg, + int *format_code) +{ + *format_code = 0; + return sizeof(struct spc_rdma_transport_id); +} + +static char *srpt_parse_pr_out_transport_id(struct se_portal_group *se_tpg, + const char *buf, u32 *out_tid_len, + char **port_nexus_ptr) +{ + struct spc_rdma_transport_id *tr_id; + + *port_nexus_ptr = NULL; + *out_tid_len = sizeof(struct spc_rdma_transport_id); + tr_id = (void *)buf; + return (char *)tr_id->i_port_id; +} + +static struct se_node_acl *srpt_alloc_fabric_acl(struct se_portal_group *se_tpg) +{ + struct srpt_node_acl *nacl; + + nacl = kzalloc(sizeof(struct srpt_node_acl), GFP_KERNEL); + if (!nacl) { + printk(KERN_ERR "Unable to allocate struct srpt_node_acl\n"); + return NULL; + } + + return &nacl->nacl; +} + +static void srpt_release_fabric_acl(struct se_portal_group *se_tpg, + struct se_node_acl *se_nacl) +{ + struct srpt_node_acl *nacl; + + nacl = container_of(se_nacl, struct srpt_node_acl, nacl); + kfree(nacl); +} + +static u32 srpt_tpg_get_inst_index(struct se_portal_group *se_tpg) +{ + return 1; +} + +static void srpt_release_cmd(struct se_cmd *se_cmd) +{ + struct srpt_send_ioctx *ioctx = container_of(se_cmd, + struct srpt_send_ioctx, cmd); + struct srpt_rdma_ch *ch = ioctx->ch; + unsigned long flags; + + WARN_ON(ioctx->state != SRPT_STATE_DONE); + WARN_ON(ioctx->mapped_sg_count != 0); + + if (ioctx->n_rbuf > 1) { + kfree(ioctx->rbufs); + ioctx->rbufs = NULL; + ioctx->n_rbuf = 0; + } + + spin_lock_irqsave(&ch->spinlock, flags); + list_add(&ioctx->free_list, &ch->free_list); + spin_unlock_irqrestore(&ch->spinlock, flags); +} + +/** + * srpt_close_session() - Forcibly close a session. + * + * Callback function invoked by the TCM core to clean up sessions associated + * with a node ACL when the user invokes + * rmdir /sys/kernel/config/target/$driver/$port/$tpg/acls/$i_port_id + */ +static void srpt_close_session(struct se_session *se_sess) +{ + DECLARE_COMPLETION_ONSTACK(release_done); + struct srpt_rdma_ch *ch; + struct srpt_device *sdev; + int res; + + ch = se_sess->fabric_sess_ptr; + WARN_ON(ch->sess != se_sess); + + pr_debug("ch %p state %d\n", ch, srpt_get_ch_state(ch)); + + sdev = ch->sport->sdev; + spin_lock_irq(&sdev->spinlock); + BUG_ON(ch->release_done); + ch->release_done = &release_done; + __srpt_close_ch(ch); + spin_unlock_irq(&sdev->spinlock); + + res = wait_for_completion_timeout(&release_done, 60 * HZ); + WARN_ON(res <= 0); +} + +/** + * srpt_sess_get_index() - Return the value of scsiAttIntrPortIndex (SCSI-MIB). + * + * A quote from RFC 4455 (SCSI-MIB) about this MIB object: + * This object represents an arbitrary integer used to uniquely identify a + * particular attached remote initiator port to a particular SCSI target port + * within a particular SCSI target device within a particular SCSI instance. + */ +static u32 srpt_sess_get_index(struct se_session *se_sess) +{ + return 0; +} + +static void srpt_set_default_node_attrs(struct se_node_acl *nacl) +{ +} + +static u32 srpt_get_task_tag(struct se_cmd *se_cmd) +{ + struct srpt_send_ioctx *ioctx; + + ioctx = container_of(se_cmd, struct srpt_send_ioctx, cmd); + return ioctx->tag; +} + +/* Note: only used from inside debug printk's by the TCM core. */ +static int srpt_get_tcm_cmd_state(struct se_cmd *se_cmd) +{ + struct srpt_send_ioctx *ioctx; + + ioctx = container_of(se_cmd, struct srpt_send_ioctx, cmd); + return srpt_get_cmd_state(ioctx); +} + +/** + * srpt_parse_i_port_id() - Parse an initiator port ID. + * @name: ASCII representation of a 128-bit initiator port ID. + * @i_port_id: Binary 128-bit port ID. + */ +static int srpt_parse_i_port_id(u8 i_port_id[16], const char *name) +{ + const char *p; + unsigned len, count, leading_zero_bytes; + int ret, rc; + + p = name; + if (strncasecmp(p, "0x", 2) == 0) + p += 2; + ret = -EINVAL; + len = strlen(p); + if (len % 2) + goto out; + count = min(len / 2, 16U); + leading_zero_bytes = 16 - count; + memset(i_port_id, 0, leading_zero_bytes); + rc = hex2bin(i_port_id + leading_zero_bytes, p, count); + if (rc < 0) + pr_debug("hex2bin failed for srpt_parse_i_port_id: %d\n", rc); + ret = 0; +out: + return ret; +} + +/* + * configfs callback function invoked for + * mkdir /sys/kernel/config/target/$driver/$port/$tpg/acls/$i_port_id + */ +static struct se_node_acl *srpt_make_nodeacl(struct se_portal_group *tpg, + struct config_group *group, + const char *name) +{ + struct srpt_port *sport = container_of(tpg, struct srpt_port, port_tpg_1); + struct se_node_acl *se_nacl, *se_nacl_new; + struct srpt_node_acl *nacl; + int ret = 0; + u32 nexus_depth = 1; + u8 i_port_id[16]; + + if (srpt_parse_i_port_id(i_port_id, name) < 0) { + printk(KERN_ERR "invalid initiator port ID %s\n", name); + ret = -EINVAL; + goto err; + } + + se_nacl_new = srpt_alloc_fabric_acl(tpg); + if (!se_nacl_new) { + ret = -ENOMEM; + goto err; + } + /* + * nacl_new may be released by core_tpg_add_initiator_node_acl() + * when converting a node ACL from demo mode to explict + */ + se_nacl = core_tpg_add_initiator_node_acl(tpg, se_nacl_new, name, + nexus_depth); + if (IS_ERR(se_nacl)) { + ret = PTR_ERR(se_nacl); + goto err; + } + /* Locate our struct srpt_node_acl and set sdev and i_port_id. */ + nacl = container_of(se_nacl, struct srpt_node_acl, nacl); + memcpy(&nacl->i_port_id[0], &i_port_id[0], 16); + nacl->sport = sport; + + spin_lock_irq(&sport->port_acl_lock); + list_add_tail(&nacl->list, &sport->port_acl_list); + spin_unlock_irq(&sport->port_acl_lock); + + return se_nacl; +err: + return ERR_PTR(ret); +} + +/* + * configfs callback function invoked for + * rmdir /sys/kernel/config/target/$driver/$port/$tpg/acls/$i_port_id + */ +static void srpt_drop_nodeacl(struct se_node_acl *se_nacl) +{ + struct srpt_node_acl *nacl; + struct srpt_device *sdev; + struct srpt_port *sport; + + nacl = container_of(se_nacl, struct srpt_node_acl, nacl); + sport = nacl->sport; + sdev = sport->sdev; + spin_lock_irq(&sport->port_acl_lock); + list_del(&nacl->list); + spin_unlock_irq(&sport->port_acl_lock); + core_tpg_del_initiator_node_acl(&sport->port_tpg_1, se_nacl, 1); + srpt_release_fabric_acl(NULL, se_nacl); +} + +static ssize_t srpt_tpg_attrib_show_srp_max_rdma_size( + struct se_portal_group *se_tpg, + char *page) +{ + struct srpt_port *sport = container_of(se_tpg, struct srpt_port, port_tpg_1); + + return sprintf(page, "%u\n", sport->port_attrib.srp_max_rdma_size); +} + +static ssize_t srpt_tpg_attrib_store_srp_max_rdma_size( + struct se_portal_group *se_tpg, + const char *page, + size_t count) +{ + struct srpt_port *sport = container_of(se_tpg, struct srpt_port, port_tpg_1); + unsigned long val; + int ret; + + ret = kstrtoul(page, 0, &val); + if (ret < 0) { + pr_err("kstrtoul() failed with ret: %d\n", ret); + return -EINVAL; + } + if (val > MAX_SRPT_RDMA_SIZE) { + pr_err("val: %lu exceeds MAX_SRPT_RDMA_SIZE: %d\n", val, + MAX_SRPT_RDMA_SIZE); + return -EINVAL; + } + if (val < DEFAULT_MAX_RDMA_SIZE) { + pr_err("val: %lu smaller than DEFAULT_MAX_RDMA_SIZE: %d\n", + val, DEFAULT_MAX_RDMA_SIZE); + return -EINVAL; + } + sport->port_attrib.srp_max_rdma_size = val; + + return count; +} + +TF_TPG_ATTRIB_ATTR(srpt, srp_max_rdma_size, S_IRUGO | S_IWUSR); + +static ssize_t srpt_tpg_attrib_show_srp_max_rsp_size( + struct se_portal_group *se_tpg, + char *page) +{ + struct srpt_port *sport = container_of(se_tpg, struct srpt_port, port_tpg_1); + + return sprintf(page, "%u\n", sport->port_attrib.srp_max_rsp_size); +} + +static ssize_t srpt_tpg_attrib_store_srp_max_rsp_size( + struct se_portal_group *se_tpg, + const char *page, + size_t count) +{ + struct srpt_port *sport = container_of(se_tpg, struct srpt_port, port_tpg_1); + unsigned long val; + int ret; + + ret = kstrtoul(page, 0, &val); + if (ret < 0) { + pr_err("kstrtoul() failed with ret: %d\n", ret); + return -EINVAL; + } + if (val > MAX_SRPT_RSP_SIZE) { + pr_err("val: %lu exceeds MAX_SRPT_RSP_SIZE: %d\n", val, + MAX_SRPT_RSP_SIZE); + return -EINVAL; + } + if (val < MIN_MAX_RSP_SIZE) { + pr_err("val: %lu smaller than MIN_MAX_RSP_SIZE: %d\n", val, + MIN_MAX_RSP_SIZE); + return -EINVAL; + } + sport->port_attrib.srp_max_rsp_size = val; + + return count; +} + +TF_TPG_ATTRIB_ATTR(srpt, srp_max_rsp_size, S_IRUGO | S_IWUSR); + +static ssize_t srpt_tpg_attrib_show_srp_sq_size( + struct se_portal_group *se_tpg, + char *page) +{ + struct srpt_port *sport = container_of(se_tpg, struct srpt_port, port_tpg_1); + + return sprintf(page, "%u\n", sport->port_attrib.srp_sq_size); +} + +static ssize_t srpt_tpg_attrib_store_srp_sq_size( + struct se_portal_group *se_tpg, + const char *page, + size_t count) +{ + struct srpt_port *sport = container_of(se_tpg, struct srpt_port, port_tpg_1); + unsigned long val; + int ret; + + ret = kstrtoul(page, 0, &val); + if (ret < 0) { + pr_err("kstrtoul() failed with ret: %d\n", ret); + return -EINVAL; + } + if (val > MAX_SRPT_SRQ_SIZE) { + pr_err("val: %lu exceeds MAX_SRPT_SRQ_SIZE: %d\n", val, + MAX_SRPT_SRQ_SIZE); + return -EINVAL; + } + if (val < MIN_SRPT_SRQ_SIZE) { + pr_err("val: %lu smaller than MIN_SRPT_SRQ_SIZE: %d\n", val, + MIN_SRPT_SRQ_SIZE); + return -EINVAL; + } + sport->port_attrib.srp_sq_size = val; + + return count; +} + +TF_TPG_ATTRIB_ATTR(srpt, srp_sq_size, S_IRUGO | S_IWUSR); + +static struct configfs_attribute *srpt_tpg_attrib_attrs[] = { + &srpt_tpg_attrib_srp_max_rdma_size.attr, + &srpt_tpg_attrib_srp_max_rsp_size.attr, + &srpt_tpg_attrib_srp_sq_size.attr, + NULL, +}; + +static ssize_t srpt_tpg_show_enable( + struct se_portal_group *se_tpg, + char *page) +{ + struct srpt_port *sport = container_of(se_tpg, struct srpt_port, port_tpg_1); + + return snprintf(page, PAGE_SIZE, "%d\n", (sport->enabled) ? 1: 0); +} + +static ssize_t srpt_tpg_store_enable( + struct se_portal_group *se_tpg, + const char *page, + size_t count) +{ + struct srpt_port *sport = container_of(se_tpg, struct srpt_port, port_tpg_1); + unsigned long tmp; + int ret; + + ret = kstrtoul(page, 0, &tmp); + if (ret < 0) { + printk(KERN_ERR "Unable to extract srpt_tpg_store_enable\n"); + return -EINVAL; + } + + if ((tmp != 0) && (tmp != 1)) { + printk(KERN_ERR "Illegal value for srpt_tpg_store_enable: %lu\n", tmp); + return -EINVAL; + } + if (tmp == 1) + sport->enabled = true; + else + sport->enabled = false; + + return count; +} + +TF_TPG_BASE_ATTR(srpt, enable, S_IRUGO | S_IWUSR); + +static struct configfs_attribute *srpt_tpg_attrs[] = { + &srpt_tpg_enable.attr, + NULL, +}; + +/** + * configfs callback invoked for + * mkdir /sys/kernel/config/target/$driver/$port/$tpg + */ +static struct se_portal_group *srpt_make_tpg(struct se_wwn *wwn, + struct config_group *group, + const char *name) +{ + struct srpt_port *sport = container_of(wwn, struct srpt_port, port_wwn); + int res; + + /* Initialize sport->port_wwn and sport->port_tpg_1 */ + res = core_tpg_register(&srpt_target->tf_ops, &sport->port_wwn, + &sport->port_tpg_1, sport, TRANSPORT_TPG_TYPE_NORMAL); + if (res) + return ERR_PTR(res); + + return &sport->port_tpg_1; +} + +/** + * configfs callback invoked for + * rmdir /sys/kernel/config/target/$driver/$port/$tpg + */ +static void srpt_drop_tpg(struct se_portal_group *tpg) +{ + struct srpt_port *sport = container_of(tpg, + struct srpt_port, port_tpg_1); + + sport->enabled = false; + core_tpg_deregister(&sport->port_tpg_1); +} + +/** + * configfs callback invoked for + * mkdir /sys/kernel/config/target/$driver/$port + */ +static struct se_wwn *srpt_make_tport(struct target_fabric_configfs *tf, + struct config_group *group, + const char *name) +{ + struct srpt_port *sport; + int ret; + + sport = srpt_lookup_port(name); + pr_debug("make_tport(%s)\n", name); + ret = -EINVAL; + if (!sport) + goto err; + + return &sport->port_wwn; + +err: + return ERR_PTR(ret); +} + +/** + * configfs callback invoked for + * rmdir /sys/kernel/config/target/$driver/$port + */ +static void srpt_drop_tport(struct se_wwn *wwn) +{ + struct srpt_port *sport = container_of(wwn, struct srpt_port, port_wwn); + + pr_debug("drop_tport(%s\n", config_item_name(&sport->port_wwn.wwn_group.cg_item)); +} + +static ssize_t srpt_wwn_show_attr_version(struct target_fabric_configfs *tf, + char *buf) +{ + return scnprintf(buf, PAGE_SIZE, "%s\n", DRV_VERSION); +} + +TF_WWN_ATTR_RO(srpt, version); + +static struct configfs_attribute *srpt_wwn_attrs[] = { + &srpt_wwn_version.attr, + NULL, +}; + +static struct target_core_fabric_ops srpt_template = { + .get_fabric_name = srpt_get_fabric_name, + .get_fabric_proto_ident = srpt_get_fabric_proto_ident, + .tpg_get_wwn = srpt_get_fabric_wwn, + .tpg_get_tag = srpt_get_tag, + .tpg_get_default_depth = srpt_get_default_depth, + .tpg_get_pr_transport_id = srpt_get_pr_transport_id, + .tpg_get_pr_transport_id_len = srpt_get_pr_transport_id_len, + .tpg_parse_pr_out_transport_id = srpt_parse_pr_out_transport_id, + .tpg_check_demo_mode = srpt_check_false, + .tpg_check_demo_mode_cache = srpt_check_true, + .tpg_check_demo_mode_write_protect = srpt_check_true, + .tpg_check_prod_mode_write_protect = srpt_check_false, + .tpg_alloc_fabric_acl = srpt_alloc_fabric_acl, + .tpg_release_fabric_acl = srpt_release_fabric_acl, + .tpg_get_inst_index = srpt_tpg_get_inst_index, + .release_cmd = srpt_release_cmd, + .check_stop_free = srpt_check_stop_free, + .shutdown_session = srpt_shutdown_session, + .close_session = srpt_close_session, + .sess_get_index = srpt_sess_get_index, + .sess_get_initiator_sid = NULL, + .write_pending = srpt_write_pending, + .write_pending_status = srpt_write_pending_status, + .set_default_node_attributes = srpt_set_default_node_attrs, + .get_task_tag = srpt_get_task_tag, + .get_cmd_state = srpt_get_tcm_cmd_state, + .queue_data_in = srpt_queue_data_in, + .queue_status = srpt_queue_status, + .queue_tm_rsp = srpt_queue_tm_rsp, + .aborted_task = srpt_aborted_task, + /* + * Setup function pointers for generic logic in + * target_core_fabric_configfs.c + */ + .fabric_make_wwn = srpt_make_tport, + .fabric_drop_wwn = srpt_drop_tport, + .fabric_make_tpg = srpt_make_tpg, + .fabric_drop_tpg = srpt_drop_tpg, + .fabric_post_link = NULL, + .fabric_pre_unlink = NULL, + .fabric_make_np = NULL, + .fabric_drop_np = NULL, + .fabric_make_nodeacl = srpt_make_nodeacl, + .fabric_drop_nodeacl = srpt_drop_nodeacl, +}; + +/** + * srpt_init_module() - Kernel module initialization. + * + * Note: Since ib_register_client() registers callback functions, and since at + * least one of these callback functions (srpt_add_one()) calls target core + * functions, this driver must be registered with the target core before + * ib_register_client() is called. + */ +static int __init srpt_init_module(void) +{ + int ret; + + ret = -EINVAL; + if (srp_max_req_size < MIN_MAX_REQ_SIZE) { + printk(KERN_ERR "invalid value %d for kernel module parameter" + " srp_max_req_size -- must be at least %d.\n", + srp_max_req_size, MIN_MAX_REQ_SIZE); + goto out; + } + + if (srpt_srq_size < MIN_SRPT_SRQ_SIZE + || srpt_srq_size > MAX_SRPT_SRQ_SIZE) { + printk(KERN_ERR "invalid value %d for kernel module parameter" + " srpt_srq_size -- must be in the range [%d..%d].\n", + srpt_srq_size, MIN_SRPT_SRQ_SIZE, MAX_SRPT_SRQ_SIZE); + goto out; + } + + srpt_target = target_fabric_configfs_init(THIS_MODULE, "srpt"); + if (IS_ERR(srpt_target)) { + printk(KERN_ERR "couldn't register\n"); + ret = PTR_ERR(srpt_target); + goto out; + } + + srpt_target->tf_ops = srpt_template; + + /* + * Set up default attribute lists. + */ + srpt_target->tf_cit_tmpl.tfc_wwn_cit.ct_attrs = srpt_wwn_attrs; + srpt_target->tf_cit_tmpl.tfc_tpg_base_cit.ct_attrs = srpt_tpg_attrs; + srpt_target->tf_cit_tmpl.tfc_tpg_attrib_cit.ct_attrs = srpt_tpg_attrib_attrs; + srpt_target->tf_cit_tmpl.tfc_tpg_param_cit.ct_attrs = NULL; + srpt_target->tf_cit_tmpl.tfc_tpg_np_base_cit.ct_attrs = NULL; + srpt_target->tf_cit_tmpl.tfc_tpg_nacl_base_cit.ct_attrs = NULL; + srpt_target->tf_cit_tmpl.tfc_tpg_nacl_attrib_cit.ct_attrs = NULL; + srpt_target->tf_cit_tmpl.tfc_tpg_nacl_auth_cit.ct_attrs = NULL; + srpt_target->tf_cit_tmpl.tfc_tpg_nacl_param_cit.ct_attrs = NULL; + + ret = target_fabric_configfs_register(srpt_target); + if (ret < 0) { + printk(KERN_ERR "couldn't register\n"); + goto out_free_target; + } + + ret = ib_register_client(&srpt_client); + if (ret) { + printk(KERN_ERR "couldn't register IB client\n"); + goto out_unregister_target; + } + + return 0; + +out_unregister_target: + target_fabric_configfs_deregister(srpt_target); + srpt_target = NULL; +out_free_target: + if (srpt_target) + target_fabric_configfs_free(srpt_target); +out: + return ret; +} + +static void __exit srpt_cleanup_module(void) +{ + ib_unregister_client(&srpt_client); + target_fabric_configfs_deregister(srpt_target); + srpt_target = NULL; +} + +module_init(srpt_init_module); +module_exit(srpt_cleanup_module); diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.h b/drivers/infiniband/ulp/srpt/ib_srpt.h new file mode 100644 index 0000000..3dae156 --- /dev/null +++ b/drivers/infiniband/ulp/srpt/ib_srpt.h @@ -0,0 +1,443 @@ +/* + * Copyright (c) 2006 - 2009 Mellanox Technology Inc. All rights reserved. + * Copyright (C) 2009 - 2010 Bart Van Assche . + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef IB_SRPT_H +#define IB_SRPT_H + +#include +#include +#include + +#include +#include +#include + +#include + +#include "ib_dm_mad.h" + +/* + * The prefix the ServiceName field must start with in the device management + * ServiceEntries attribute pair. See also the SRP specification. + */ +#define SRP_SERVICE_NAME_PREFIX "SRP.T10:" + +enum { + /* + * SRP IOControllerProfile attributes for SRP target ports that have + * not been defined in . Source: section B.7, table B.7 + * in the SRP specification. + */ + SRP_PROTOCOL = 0x0108, + SRP_PROTOCOL_VERSION = 0x0001, + SRP_IO_SUBCLASS = 0x609e, + SRP_SEND_TO_IOC = 0x01, + SRP_SEND_FROM_IOC = 0x02, + SRP_RDMA_READ_FROM_IOC = 0x08, + SRP_RDMA_WRITE_FROM_IOC = 0x20, + + /* + * srp_login_cmd.req_flags bitmasks. See also table 9 in the SRP + * specification. + */ + SRP_MTCH_ACTION = 0x03, /* MULTI-CHANNEL ACTION */ + SRP_LOSOLNT = 0x10, /* logout solicited notification */ + SRP_CRSOLNT = 0x20, /* credit request solicited notification */ + SRP_AESOLNT = 0x40, /* asynchronous event solicited notification */ + + /* + * srp_cmd.sol_nt / srp_tsk_mgmt.sol_not bitmasks. See also tables + * 18 and 20 in the SRP specification. + */ + SRP_SCSOLNT = 0x02, /* SCSOLNT = successful solicited notification */ + SRP_UCSOLNT = 0x04, /* UCSOLNT = unsuccessful solicited notification */ + + /* + * srp_rsp.sol_not / srp_t_logout.sol_not bitmasks. See also tables + * 16 and 22 in the SRP specification. + */ + SRP_SOLNT = 0x01, /* SOLNT = solicited notification */ + + /* See also table 24 in the SRP specification. */ + SRP_TSK_MGMT_SUCCESS = 0x00, + SRP_TSK_MGMT_FUNC_NOT_SUPP = 0x04, + SRP_TSK_MGMT_FAILED = 0x05, + + /* See also table 21 in the SRP specification. */ + SRP_CMD_SIMPLE_Q = 0x0, + SRP_CMD_HEAD_OF_Q = 0x1, + SRP_CMD_ORDERED_Q = 0x2, + SRP_CMD_ACA = 0x4, + + SRP_LOGIN_RSP_MULTICHAN_NO_CHAN = 0x0, + SRP_LOGIN_RSP_MULTICHAN_TERMINATED = 0x1, + SRP_LOGIN_RSP_MULTICHAN_MAINTAINED = 0x2, + + SRPT_DEF_SG_TABLESIZE = 128, + SRPT_DEF_SG_PER_WQE = 16, + + MIN_SRPT_SQ_SIZE = 16, + DEF_SRPT_SQ_SIZE = 4096, + SRPT_RQ_SIZE = 128, + MIN_SRPT_SRQ_SIZE = 4, + DEFAULT_SRPT_SRQ_SIZE = 4095, + MAX_SRPT_SRQ_SIZE = 65535, + MAX_SRPT_RDMA_SIZE = 1U << 24, + MAX_SRPT_RSP_SIZE = 1024, + + MIN_MAX_REQ_SIZE = 996, + DEFAULT_MAX_REQ_SIZE + = sizeof(struct srp_cmd)/*48*/ + + sizeof(struct srp_indirect_buf)/*20*/ + + 128 * sizeof(struct srp_direct_buf)/*16*/, + + MIN_MAX_RSP_SIZE = sizeof(struct srp_rsp)/*36*/ + 4, + DEFAULT_MAX_RSP_SIZE = 256, /* leaves 220 bytes for sense data */ + + DEFAULT_MAX_RDMA_SIZE = 65536, +}; + +enum srpt_opcode { + SRPT_RECV, + SRPT_SEND, + SRPT_RDMA_MID, + SRPT_RDMA_ABORT, + SRPT_RDMA_READ_LAST, + SRPT_RDMA_WRITE_LAST, +}; + +static inline u64 encode_wr_id(u8 opcode, u32 idx) +{ + return ((u64)opcode << 32) | idx; +} +static inline enum srpt_opcode opcode_from_wr_id(u64 wr_id) +{ + return wr_id >> 32; +} +static inline u32 idx_from_wr_id(u64 wr_id) +{ + return (u32)wr_id; +} + +struct rdma_iu { + u64 raddr; + u32 rkey; + struct ib_sge *sge; + u32 sge_cnt; + int mem_id; +}; + +/** + * enum srpt_command_state - SCSI command state managed by SRPT. + * @SRPT_STATE_NEW: New command arrived and is being processed. + * @SRPT_STATE_NEED_DATA: Processing a write or bidir command and waiting + * for data arrival. + * @SRPT_STATE_DATA_IN: Data for the write or bidir command arrived and is + * being processed. + * @SRPT_STATE_CMD_RSP_SENT: SRP_RSP for SRP_CMD has been sent. + * @SRPT_STATE_MGMT: Processing a SCSI task management command. + * @SRPT_STATE_MGMT_RSP_SENT: SRP_RSP for SRP_TSK_MGMT has been sent. + * @SRPT_STATE_DONE: Command processing finished successfully, command + * processing has been aborted or command processing + * failed. + */ +enum srpt_command_state { + SRPT_STATE_NEW = 0, + SRPT_STATE_NEED_DATA = 1, + SRPT_STATE_DATA_IN = 2, + SRPT_STATE_CMD_RSP_SENT = 3, + SRPT_STATE_MGMT = 4, + SRPT_STATE_MGMT_RSP_SENT = 5, + SRPT_STATE_DONE = 6, +}; + +/** + * struct srpt_ioctx - Shared SRPT I/O context information. + * @buf: Pointer to the buffer. + * @dma: DMA address of the buffer. + * @index: Index of the I/O context in its ioctx_ring array. + */ +struct srpt_ioctx { + void *buf; + dma_addr_t dma; + uint32_t index; +}; + +/** + * struct srpt_recv_ioctx - SRPT receive I/O context. + * @ioctx: See above. + * @wait_list: Node for insertion in srpt_rdma_ch.cmd_wait_list. + */ +struct srpt_recv_ioctx { + struct srpt_ioctx ioctx; + struct list_head wait_list; +}; + +/** + * struct srpt_send_ioctx - SRPT send I/O context. + * @ioctx: See above. + * @ch: Channel pointer. + * @free_list: Node in srpt_rdma_ch.free_list. + * @n_rbuf: Number of data buffers in the received SRP command. + * @rbufs: Pointer to SRP data buffer array. + * @single_rbuf: SRP data buffer if the command has only a single buffer. + * @sg: Pointer to sg-list associated with this I/O context. + * @sg_cnt: SG-list size. + * @mapped_sg_count: ib_dma_map_sg() return value. + * @n_rdma_ius: Number of elements in the rdma_ius array. + * @rdma_ius: Array with information about the RDMA mapping. + * @tag: Tag of the received SRP information unit. + * @spinlock: Protects 'state'. + * @state: I/O context state. + * @rdma_aborted: If initiating a multipart RDMA transfer failed, whether + * the already initiated transfers have finished. + * @cmd: Target core command data structure. + * @sense_data: SCSI sense data. + */ +struct srpt_send_ioctx { + struct srpt_ioctx ioctx; + struct srpt_rdma_ch *ch; + struct rdma_iu *rdma_ius; + struct srp_direct_buf *rbufs; + struct srp_direct_buf single_rbuf; + struct scatterlist *sg; + struct list_head free_list; + spinlock_t spinlock; + enum srpt_command_state state; + bool rdma_aborted; + struct se_cmd cmd; + struct completion tx_done; + u64 tag; + int sg_cnt; + int mapped_sg_count; + u16 n_rdma_ius; + u8 n_rdma; + u8 n_rbuf; + bool queue_status_only; + u8 sense_data[SCSI_SENSE_BUFFERSIZE]; +}; + +/** + * enum rdma_ch_state - SRP channel state. + * @CH_CONNECTING: QP is in RTR state; waiting for RTU. + * @CH_LIVE: QP is in RTS state. + * @CH_DISCONNECTING: DREQ has been received; waiting for DREP + * or DREQ has been send and waiting for DREP + * or . + * @CH_DRAINING: QP is in ERR state; waiting for last WQE event. + * @CH_RELEASING: Last WQE event has been received; releasing resources. + */ +enum rdma_ch_state { + CH_CONNECTING, + CH_LIVE, + CH_DISCONNECTING, + CH_DRAINING, + CH_RELEASING +}; + +/** + * struct srpt_rdma_ch - RDMA channel. + * @wait_queue: Allows the kernel thread to wait for more work. + * @thread: Kernel thread that processes the IB queues associated with + * the channel. + * @cm_id: IB CM ID associated with the channel. + * @qp: IB queue pair used for communicating over this channel. + * @cq: IB completion queue for this channel. + * @rq_size: IB receive queue size. + * @rsp_size IB response message size in bytes. + * @sq_wr_avail: number of work requests available in the send queue. + * @sport: pointer to the information of the HCA port used by this + * channel. + * @i_port_id: 128-bit initiator port identifier copied from SRP_LOGIN_REQ. + * @t_port_id: 128-bit target port identifier copied from SRP_LOGIN_REQ. + * @max_ti_iu_len: maximum target-to-initiator information unit length. + * @req_lim: request limit: maximum number of requests that may be sent + * by the initiator without having received a response. + * @req_lim_delta: Number of credits not yet sent back to the initiator. + * @spinlock: Protects free_list and state. + * @free_list: Head of list with free send I/O contexts. + * @state: channel state. See also enum rdma_ch_state. + * @ioctx_ring: Send ring. + * @wc: IB work completion array for srpt_process_completion(). + * @list: Node for insertion in the srpt_device.rch_list list. + * @cmd_wait_list: List of SCSI commands that arrived before the RTU event. This + * list contains struct srpt_ioctx elements and is protected + * against concurrent modification by the cm_id spinlock. + * @sess: Session information associated with this SRP channel. + * @sess_name: Session name. + * @release_work: Allows scheduling of srpt_release_channel(). + * @release_done: Enables waiting for srpt_release_channel() completion. + */ +struct srpt_rdma_ch { + wait_queue_head_t wait_queue; + struct task_struct *thread; + struct ib_cm_id *cm_id; + struct ib_qp *qp; + struct ib_cq *cq; + int rq_size; + u32 rsp_size; + atomic_t sq_wr_avail; + struct srpt_port *sport; + u8 i_port_id[16]; + u8 t_port_id[16]; + int max_ti_iu_len; + atomic_t req_lim; + atomic_t req_lim_delta; + spinlock_t spinlock; + struct list_head free_list; + enum rdma_ch_state state; + struct srpt_send_ioctx **ioctx_ring; + struct ib_wc wc[16]; + struct list_head list; + struct list_head cmd_wait_list; + struct se_session *sess; + u8 sess_name[36]; + struct work_struct release_work; + struct completion *release_done; + bool in_shutdown; +}; + +/** + * struct srpt_port_attib - Attributes for SRPT port + * @srp_max_rdma_size: Maximum size of SRP RDMA transfers for new connections. + * @srp_max_rsp_size: Maximum size of SRP response messages in bytes. + * @srp_sq_size: Shared receive queue (SRQ) size. + */ +struct srpt_port_attrib { + u32 srp_max_rdma_size; + u32 srp_max_rsp_size; + u32 srp_sq_size; +}; + +/** + * struct srpt_port - Information associated by SRPT with a single IB port. + * @sdev: backpointer to the HCA information. + * @mad_agent: per-port management datagram processing information. + * @enabled: Whether or not this target port is enabled. + * @port_guid: ASCII representation of Port GUID + * @port: one-based port number. + * @sm_lid: cached value of the port's sm_lid. + * @lid: cached value of the port's lid. + * @gid: cached value of the port's gid. + * @port_acl_lock spinlock for port_acl_list: + * @work: work structure for refreshing the aforementioned cached values. + * @port_tpg_1 Target portal group = 1 data. + * @port_wwn: Target core WWN data. + * @port_acl_list: Head of the list with all node ACLs for this port. + */ +struct srpt_port { + struct srpt_device *sdev; + struct ib_mad_agent *mad_agent; + bool enabled; + u8 port_guid[64]; + u8 port; + u16 sm_lid; + u16 lid; + union ib_gid gid; + spinlock_t port_acl_lock; + struct work_struct work; + struct se_portal_group port_tpg_1; + struct se_wwn port_wwn; + struct list_head port_acl_list; + struct srpt_port_attrib port_attrib; +}; + +/** + * struct srpt_device - Information associated by SRPT with a single HCA. + * @device: Backpointer to the struct ib_device managed by the IB core. + * @pd: IB protection domain. + * @mr: L_Key (local key) with write access to all local memory. + * @srq: Per-HCA SRQ (shared receive queue). + * @cm_id: Connection identifier. + * @dev_attr: Attributes of the InfiniBand device as obtained during the + * ib_client.add() callback. + * @srq_size: SRQ size. + * @ioctx_ring: Per-HCA SRQ. + * @rch_list: Per-device channel list -- see also srpt_rdma_ch.list. + * @ch_releaseQ: Enables waiting for removal from rch_list. + * @spinlock: Protects rch_list and tpg. + * @port: Information about the ports owned by this HCA. + * @event_handler: Per-HCA asynchronous IB event handler. + * @list: Node in srpt_dev_list. + */ +struct srpt_device { + struct ib_device *device; + struct ib_pd *pd; + struct ib_mr *mr; + struct ib_srq *srq; + struct ib_cm_id *cm_id; + struct ib_device_attr dev_attr; + int srq_size; + struct srpt_recv_ioctx **ioctx_ring; + struct list_head rch_list; + wait_queue_head_t ch_releaseQ; + spinlock_t spinlock; + struct srpt_port port[2]; + struct ib_event_handler event_handler; + struct list_head list; +}; + +/** + * struct srpt_node_acl - Per-initiator ACL data (managed via configfs). + * @i_port_id: 128-bit SRP initiator port ID. + * @sport: port information. + * @nacl: Target core node ACL information. + * @list: Element of the per-HCA ACL list. + */ +struct srpt_node_acl { + u8 i_port_id[16]; + struct srpt_port *sport; + struct se_node_acl nacl; + struct list_head list; +}; + +/* + * SRP-releated SCSI persistent reservation definitions. + * + * See also SPC4r28, section 7.6.1 (Protocol specific parameters introduction). + * See also SPC4r28, section 7.6.4.5 (TransportID for initiator ports using + * SCSI over an RDMA interface). + */ + +enum { + SCSI_TRANSPORTID_PROTOCOLID_SRP = 4, +}; + +struct spc_rdma_transport_id { + uint8_t protocol_identifier; + uint8_t reserved[7]; + uint8_t i_port_id[16]; +}; + +#endif /* IB_SRPT_H */ diff --git a/drivers/net/ethernet/chelsio/cxgb3/Makefile b/drivers/net/ethernet/chelsio/cxgb3/Makefile new file mode 100644 index 0000000..29aff78 --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb3/Makefile @@ -0,0 +1,8 @@ +# +# Chelsio T3 driver +# + +obj-$(CONFIG_CHELSIO_T3) += cxgb3.o + +cxgb3-objs := cxgb3_main.o ael1002.o vsc8211.o t3_hw.o mc5.o \ + xgmac.o sge.o l2t.o cxgb3_offload.o aq100x.o diff --git a/drivers/net/ethernet/chelsio/cxgb3/adapter.h b/drivers/net/ethernet/chelsio/cxgb3/adapter.h new file mode 100644 index 0000000..8b395b5 --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb3/adapter.h @@ -0,0 +1,334 @@ +/* + * Copyright (c) 2003-2008 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/* This file should not be included directly. Include common.h instead. */ + +#ifndef __T3_ADAPTER_H__ +#define __T3_ADAPTER_H__ + +#include +#include +#include +#include +#include +#include +#include +#include "t3cdev.h" +#include + +struct adapter; +struct sge_qset; +struct port_info; + +enum mac_idx_types { + LAN_MAC_IDX = 0, + SAN_MAC_IDX, + + MAX_MAC_IDX +}; + +struct iscsi_config { + __u8 mac_addr[ETH_ALEN]; + __u32 flags; + int (*send)(struct port_info *pi, struct sk_buff **skb); + int (*recv)(struct port_info *pi, struct sk_buff *skb); +}; + +struct port_info { + struct adapter *adapter; + struct sge_qset *qs; + u8 port_id; + u8 nqsets; + u8 first_qset; + struct cphy phy; + struct cmac mac; + struct link_config link_config; + struct net_device_stats netstats; + int activity; + __be32 iscsi_ipv4addr; + struct iscsi_config iscsic; + + int link_fault; /* link fault was detected */ +}; + +enum { /* adapter flags */ + FULL_INIT_DONE = (1 << 0), + USING_MSI = (1 << 1), + USING_MSIX = (1 << 2), + QUEUES_BOUND = (1 << 3), + TP_PARITY_INIT = (1 << 4), + NAPI_INIT = (1 << 5), +}; + +struct fl_pg_chunk { + struct page *page; + void *va; + unsigned int offset; + unsigned long *p_cnt; + dma_addr_t mapping; +}; + +struct rx_desc; +struct rx_sw_desc; + +struct sge_fl { /* SGE per free-buffer list state */ + unsigned int buf_size; /* size of each Rx buffer */ + unsigned int credits; /* # of available Rx buffers */ + unsigned int pend_cred; /* new buffers since last FL DB ring */ + unsigned int size; /* capacity of free list */ + unsigned int cidx; /* consumer index */ + unsigned int pidx; /* producer index */ + unsigned int gen; /* free list generation */ + struct fl_pg_chunk pg_chunk;/* page chunk cache */ + unsigned int use_pages; /* whether FL uses pages or sk_buffs */ + unsigned int order; /* order of page allocations */ + unsigned int alloc_size; /* size of allocated buffer */ + struct rx_desc *desc; /* address of HW Rx descriptor ring */ + struct rx_sw_desc *sdesc; /* address of SW Rx descriptor ring */ + dma_addr_t phys_addr; /* physical address of HW ring start */ + unsigned int cntxt_id; /* SGE context id for the free list */ + unsigned long empty; /* # of times queue ran out of buffers */ + unsigned long alloc_failed; /* # of times buffer allocation failed */ +}; + +/* + * Bundle size for grouping offload RX packets for delivery to the stack. + * Don't make this too big as we do prefetch on each packet in a bundle. + */ +# define RX_BUNDLE_SIZE 8 + +struct rsp_desc; + +struct sge_rspq { /* state for an SGE response queue */ + unsigned int credits; /* # of pending response credits */ + unsigned int size; /* capacity of response queue */ + unsigned int cidx; /* consumer index */ + unsigned int gen; /* current generation bit */ + unsigned int polling; /* is the queue serviced through NAPI? */ + unsigned int holdoff_tmr; /* interrupt holdoff timer in 100ns */ + unsigned int next_holdoff; /* holdoff time for next interrupt */ + unsigned int rx_recycle_buf; /* whether recycling occurred + within current sop-eop */ + struct rsp_desc *desc; /* address of HW response ring */ + dma_addr_t phys_addr; /* physical address of the ring */ + unsigned int cntxt_id; /* SGE context id for the response q */ + spinlock_t lock; /* guards response processing */ + struct sk_buff_head rx_queue; /* offload packet receive queue */ + struct sk_buff *pg_skb; /* used to build frag list in napi handler */ + + unsigned long offload_pkts; + unsigned long offload_bundles; + unsigned long eth_pkts; /* # of ethernet packets */ + unsigned long pure_rsps; /* # of pure (non-data) responses */ + unsigned long imm_data; /* responses with immediate data */ + unsigned long rx_drops; /* # of packets dropped due to no mem */ + unsigned long async_notif; /* # of asynchronous notification events */ + unsigned long empty; /* # of times queue ran out of credits */ + unsigned long nomem; /* # of responses deferred due to no mem */ + unsigned long unhandled_irqs; /* # of spurious intrs */ + unsigned long starved; + unsigned long restarted; +}; + +struct tx_desc; +struct tx_sw_desc; + +struct sge_txq { /* state for an SGE Tx queue */ + unsigned long flags; /* HW DMA fetch status */ + unsigned int in_use; /* # of in-use Tx descriptors */ + unsigned int size; /* # of descriptors */ + unsigned int processed; /* total # of descs HW has processed */ + unsigned int cleaned; /* total # of descs SW has reclaimed */ + unsigned int stop_thres; /* SW TX queue suspend threshold */ + unsigned int cidx; /* consumer index */ + unsigned int pidx; /* producer index */ + unsigned int gen; /* current value of generation bit */ + unsigned int unacked; /* Tx descriptors used since last COMPL */ + struct tx_desc *desc; /* address of HW Tx descriptor ring */ + struct tx_sw_desc *sdesc; /* address of SW Tx descriptor ring */ + spinlock_t lock; /* guards enqueueing of new packets */ + unsigned int token; /* WR token */ + dma_addr_t phys_addr; /* physical address of the ring */ + struct sk_buff_head sendq; /* List of backpressured offload packets */ + struct tasklet_struct qresume_tsk; /* restarts the queue */ + unsigned int cntxt_id; /* SGE context id for the Tx q */ + unsigned long stops; /* # of times q has been stopped */ + unsigned long restarts; /* # of queue restarts */ +}; + +enum { /* per port SGE statistics */ + SGE_PSTAT_TSO, /* # of TSO requests */ + SGE_PSTAT_RX_CSUM_GOOD, /* # of successful RX csum offloads */ + SGE_PSTAT_TX_CSUM, /* # of TX checksum offloads */ + SGE_PSTAT_VLANEX, /* # of VLAN tag extractions */ + SGE_PSTAT_VLANINS, /* # of VLAN tag insertions */ + + SGE_PSTAT_MAX /* must be last */ +}; + +struct napi_gro_fraginfo; + +struct sge_qset { /* an SGE queue set */ + struct adapter *adap; + struct napi_struct napi; + struct sge_rspq rspq; + struct sge_fl fl[SGE_RXQ_PER_SET]; + struct sge_txq txq[SGE_TXQ_PER_SET]; + int nomem; + void *lro_va; + struct net_device *netdev; + struct netdev_queue *tx_q; /* associated netdev TX queue */ + unsigned long txq_stopped; /* which Tx queues are stopped */ + struct timer_list tx_reclaim_timer; /* reclaims TX buffers */ + struct timer_list rx_reclaim_timer; /* reclaims RX buffers */ + unsigned long port_stats[SGE_PSTAT_MAX]; +} ____cacheline_aligned; + +struct sge { + struct sge_qset qs[SGE_QSETS]; + spinlock_t reg_lock; /* guards non-atomic SGE registers (eg context) */ +}; + +struct adapter { + struct t3cdev tdev; + struct list_head adapter_list; + void __iomem *regs; + struct pci_dev *pdev; + unsigned long registered_device_map; + unsigned long open_device_map; + unsigned long flags; + + const char *name; + int msg_enable; + unsigned int mmio_len; + + struct adapter_params params; + unsigned int slow_intr_mask; + unsigned long irq_stats[IRQ_NUM_STATS]; + + int msix_nvectors; + struct { + unsigned short vec; + char desc[22]; + } msix_info[SGE_QSETS + 1]; + + /* T3 modules */ + struct sge sge; + struct mc7 pmrx; + struct mc7 pmtx; + struct mc7 cm; + struct mc5 mc5; + + struct net_device *port[MAX_NPORTS]; + unsigned int check_task_cnt; + struct delayed_work adap_check_task; + struct work_struct ext_intr_handler_task; + struct work_struct fatal_error_handler_task; + struct work_struct link_fault_handler_task; + + struct work_struct db_full_task; + struct work_struct db_empty_task; + struct work_struct db_drop_task; + + struct dentry *debugfs_root; + + struct mutex mdio_lock; + spinlock_t stats_lock; + spinlock_t work_lock; + + struct sk_buff *nofail_skb; +}; + +static inline u32 t3_read_reg(struct adapter *adapter, u32 reg_addr) +{ + u32 val = readl(adapter->regs + reg_addr); + + CH_DBG(adapter, MMIO, "read register 0x%x value 0x%x\n", reg_addr, val); + return val; +} + +static inline void t3_write_reg(struct adapter *adapter, u32 reg_addr, u32 val) +{ + CH_DBG(adapter, MMIO, "setting register 0x%x to 0x%x\n", reg_addr, val); + writel(val, adapter->regs + reg_addr); +} + +static inline struct port_info *adap2pinfo(struct adapter *adap, int idx) +{ + return netdev_priv(adap->port[idx]); +} + +static inline int phy2portid(struct cphy *phy) +{ + struct adapter *adap = phy->adapter; + struct port_info *port0 = adap2pinfo(adap, 0); + + return &port0->phy == phy ? 0 : 1; +} + +#define OFFLOAD_DEVMAP_BIT 15 + +#define tdev2adap(d) container_of(d, struct adapter, tdev) + +static inline int offload_running(struct adapter *adapter) +{ + return test_bit(OFFLOAD_DEVMAP_BIT, &adapter->open_device_map); +} + +int t3_offload_tx(struct t3cdev *tdev, struct sk_buff *skb); + +void t3_os_ext_intr_handler(struct adapter *adapter); +void t3_os_link_changed(struct adapter *adapter, int port_id, int link_status, + int speed, int duplex, int fc); +void t3_os_phymod_changed(struct adapter *adap, int port_id); +void t3_os_link_fault(struct adapter *adapter, int port_id, int state); +void t3_os_link_fault_handler(struct adapter *adapter, int port_id); + +void t3_sge_start(struct adapter *adap); +void t3_sge_stop(struct adapter *adap); +void t3_start_sge_timers(struct adapter *adap); +void t3_stop_sge_timers(struct adapter *adap); +void t3_free_sge_resources(struct adapter *adap); +void t3_sge_err_intr_handler(struct adapter *adapter); +irq_handler_t t3_intr_handler(struct adapter *adap, int polling); +netdev_tx_t t3_eth_xmit(struct sk_buff *skb, struct net_device *dev); +int t3_mgmt_tx(struct adapter *adap, struct sk_buff *skb); +void t3_update_qset_coalesce(struct sge_qset *qs, const struct qset_params *p); +int t3_sge_alloc_qset(struct adapter *adapter, unsigned int id, int nports, + int irq_vec_idx, const struct qset_params *p, + int ntxq, struct net_device *dev, + struct netdev_queue *netdevq); +extern struct workqueue_struct *cxgb3_wq; + +int t3_get_edc_fw(struct cphy *phy, int edc_idx, int size); + +#endif /* __T3_ADAPTER_H__ */ diff --git a/drivers/net/ethernet/chelsio/cxgb3/ael1002.c b/drivers/net/ethernet/chelsio/cxgb3/ael1002.c new file mode 100644 index 0000000..2028da9 --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb3/ael1002.c @@ -0,0 +1,941 @@ +/* + * Copyright (c) 2005-2008 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "common.h" +#include "regs.h" + +enum { + AEL100X_TX_CONFIG1 = 0xc002, + AEL1002_PWR_DOWN_HI = 0xc011, + AEL1002_PWR_DOWN_LO = 0xc012, + AEL1002_XFI_EQL = 0xc015, + AEL1002_LB_EN = 0xc017, + AEL_OPT_SETTINGS = 0xc017, + AEL_I2C_CTRL = 0xc30a, + AEL_I2C_DATA = 0xc30b, + AEL_I2C_STAT = 0xc30c, + AEL2005_GPIO_CTRL = 0xc214, + AEL2005_GPIO_STAT = 0xc215, + + AEL2020_GPIO_INTR = 0xc103, /* Latch High (LH) */ + AEL2020_GPIO_CTRL = 0xc108, /* Store Clear (SC) */ + AEL2020_GPIO_STAT = 0xc10c, /* Read Only (RO) */ + AEL2020_GPIO_CFG = 0xc110, /* Read Write (RW) */ + + AEL2020_GPIO_SDA = 0, /* IN: i2c serial data */ + AEL2020_GPIO_MODDET = 1, /* IN: Module Detect */ + AEL2020_GPIO_0 = 3, /* IN: unassigned */ + AEL2020_GPIO_1 = 2, /* OUT: unassigned */ + AEL2020_GPIO_LSTAT = AEL2020_GPIO_1, /* wired to link status LED */ +}; + +enum { edc_none, edc_sr, edc_twinax }; + +/* PHY module I2C device address */ +enum { + MODULE_DEV_ADDR = 0xa0, + SFF_DEV_ADDR = 0xa2, +}; + +/* PHY transceiver type */ +enum { + phy_transtype_unknown = 0, + phy_transtype_sfp = 3, + phy_transtype_xfp = 6, +}; + +#define AEL2005_MODDET_IRQ 4 + +struct reg_val { + unsigned short mmd_addr; + unsigned short reg_addr; + unsigned short clear_bits; + unsigned short set_bits; +}; + +static int set_phy_regs(struct cphy *phy, const struct reg_val *rv) +{ + int err; + + for (err = 0; rv->mmd_addr && !err; rv++) { + if (rv->clear_bits == 0xffff) + err = t3_mdio_write(phy, rv->mmd_addr, rv->reg_addr, + rv->set_bits); + else + err = t3_mdio_change_bits(phy, rv->mmd_addr, + rv->reg_addr, rv->clear_bits, + rv->set_bits); + } + return err; +} + +static void ael100x_txon(struct cphy *phy) +{ + int tx_on_gpio = + phy->mdio.prtad == 0 ? F_GPIO7_OUT_VAL : F_GPIO2_OUT_VAL; + + msleep(100); + t3_set_reg_field(phy->adapter, A_T3DBG_GPIO_EN, 0, tx_on_gpio); + msleep(30); +} + +/* + * Read an 8-bit word from a device attached to the PHY's i2c bus. + */ +static int ael_i2c_rd(struct cphy *phy, int dev_addr, int word_addr) +{ + int i, err; + unsigned int stat, data; + + err = t3_mdio_write(phy, MDIO_MMD_PMAPMD, AEL_I2C_CTRL, + (dev_addr << 8) | (1 << 8) | word_addr); + if (err) + return err; + + for (i = 0; i < 200; i++) { + msleep(1); + err = t3_mdio_read(phy, MDIO_MMD_PMAPMD, AEL_I2C_STAT, &stat); + if (err) + return err; + if ((stat & 3) == 1) { + err = t3_mdio_read(phy, MDIO_MMD_PMAPMD, AEL_I2C_DATA, + &data); + if (err) + return err; + return data >> 8; + } + } + CH_WARN(phy->adapter, "PHY %u i2c read of dev.addr %#x.%#x timed out\n", + phy->mdio.prtad, dev_addr, word_addr); + return -ETIMEDOUT; +} + +static int ael1002_power_down(struct cphy *phy, int enable) +{ + int err; + + err = t3_mdio_write(phy, MDIO_MMD_PMAPMD, MDIO_PMA_TXDIS, !!enable); + if (!err) + err = mdio_set_flag(&phy->mdio, phy->mdio.prtad, + MDIO_MMD_PMAPMD, MDIO_CTRL1, + MDIO_CTRL1_LPOWER, enable); + return err; +} + +static int ael1002_reset(struct cphy *phy, int wait) +{ + int err; + + if ((err = ael1002_power_down(phy, 0)) || + (err = t3_mdio_write(phy, MDIO_MMD_PMAPMD, AEL100X_TX_CONFIG1, 1)) || + (err = t3_mdio_write(phy, MDIO_MMD_PMAPMD, AEL1002_PWR_DOWN_HI, 0)) || + (err = t3_mdio_write(phy, MDIO_MMD_PMAPMD, AEL1002_PWR_DOWN_LO, 0)) || + (err = t3_mdio_write(phy, MDIO_MMD_PMAPMD, AEL1002_XFI_EQL, 0x18)) || + (err = t3_mdio_change_bits(phy, MDIO_MMD_PMAPMD, AEL1002_LB_EN, + 0, 1 << 5))) + return err; + return 0; +} + +static int ael1002_intr_noop(struct cphy *phy) +{ + return 0; +} + +/* + * Get link status for a 10GBASE-R device. + */ +static int get_link_status_r(struct cphy *phy, int *link_ok, int *speed, + int *duplex, int *fc) +{ + if (link_ok) { + unsigned int stat0, stat1, stat2; + int err = t3_mdio_read(phy, MDIO_MMD_PMAPMD, + MDIO_PMA_RXDET, &stat0); + + if (!err) + err = t3_mdio_read(phy, MDIO_MMD_PCS, + MDIO_PCS_10GBRT_STAT1, &stat1); + if (!err) + err = t3_mdio_read(phy, MDIO_MMD_PHYXS, + MDIO_PHYXS_LNSTAT, &stat2); + if (err) + return err; + *link_ok = (stat0 & stat1 & (stat2 >> 12)) & 1; + } + if (speed) + *speed = SPEED_10000; + if (duplex) + *duplex = DUPLEX_FULL; + return 0; +} + +static struct cphy_ops ael1002_ops = { + .reset = ael1002_reset, + .intr_enable = ael1002_intr_noop, + .intr_disable = ael1002_intr_noop, + .intr_clear = ael1002_intr_noop, + .intr_handler = ael1002_intr_noop, + .get_link_status = get_link_status_r, + .power_down = ael1002_power_down, + .mmds = MDIO_DEVS_PMAPMD | MDIO_DEVS_PCS | MDIO_DEVS_PHYXS, +}; + +int t3_ael1002_phy_prep(struct cphy *phy, struct adapter *adapter, + int phy_addr, const struct mdio_ops *mdio_ops) +{ + cphy_init(phy, adapter, phy_addr, &ael1002_ops, mdio_ops, + SUPPORTED_10000baseT_Full | SUPPORTED_AUI | SUPPORTED_FIBRE, + "10GBASE-R"); + ael100x_txon(phy); + return 0; +} + +static int ael1006_reset(struct cphy *phy, int wait) +{ + return t3_phy_reset(phy, MDIO_MMD_PMAPMD, wait); +} + +static struct cphy_ops ael1006_ops = { + .reset = ael1006_reset, + .intr_enable = t3_phy_lasi_intr_enable, + .intr_disable = t3_phy_lasi_intr_disable, + .intr_clear = t3_phy_lasi_intr_clear, + .intr_handler = t3_phy_lasi_intr_handler, + .get_link_status = get_link_status_r, + .power_down = ael1002_power_down, + .mmds = MDIO_DEVS_PMAPMD | MDIO_DEVS_PCS | MDIO_DEVS_PHYXS, +}; + +int t3_ael1006_phy_prep(struct cphy *phy, struct adapter *adapter, + int phy_addr, const struct mdio_ops *mdio_ops) +{ + cphy_init(phy, adapter, phy_addr, &ael1006_ops, mdio_ops, + SUPPORTED_10000baseT_Full | SUPPORTED_AUI | SUPPORTED_FIBRE, + "10GBASE-SR"); + ael100x_txon(phy); + return 0; +} + +/* + * Decode our module type. + */ +static int ael2xxx_get_module_type(struct cphy *phy, int delay_ms) +{ + int v; + + if (delay_ms) + msleep(delay_ms); + + /* see SFF-8472 for below */ + v = ael_i2c_rd(phy, MODULE_DEV_ADDR, 3); + if (v < 0) + return v; + + if (v == 0x10) + return phy_modtype_sr; + if (v == 0x20) + return phy_modtype_lr; + if (v == 0x40) + return phy_modtype_lrm; + + v = ael_i2c_rd(phy, MODULE_DEV_ADDR, 6); + if (v < 0) + return v; + if (v != 4) + goto unknown; + + v = ael_i2c_rd(phy, MODULE_DEV_ADDR, 10); + if (v < 0) + return v; + + if (v & 0x80) { + v = ael_i2c_rd(phy, MODULE_DEV_ADDR, 0x12); + if (v < 0) + return v; + return v > 10 ? phy_modtype_twinax_long : phy_modtype_twinax; + } +unknown: + return phy_modtype_unknown; +} + +/* + * Code to support the Aeluros/NetLogic 2005 10Gb PHY. + */ +static int ael2005_setup_sr_edc(struct cphy *phy) +{ + static const struct reg_val regs[] = { + { MDIO_MMD_PMAPMD, 0xc003, 0xffff, 0x181 }, + { MDIO_MMD_PMAPMD, 0xc010, 0xffff, 0x448a }, + { MDIO_MMD_PMAPMD, 0xc04a, 0xffff, 0x5200 }, + { 0, 0, 0, 0 } + }; + + int i, err; + + err = set_phy_regs(phy, regs); + if (err) + return err; + + msleep(50); + + if (phy->priv != edc_sr) + err = t3_get_edc_fw(phy, EDC_OPT_AEL2005, + EDC_OPT_AEL2005_SIZE); + if (err) + return err; + + for (i = 0; i < EDC_OPT_AEL2005_SIZE / sizeof(u16) && !err; i += 2) + err = t3_mdio_write(phy, MDIO_MMD_PMAPMD, + phy->phy_cache[i], + phy->phy_cache[i + 1]); + if (!err) + phy->priv = edc_sr; + return err; +} + +static int ael2005_setup_twinax_edc(struct cphy *phy, int modtype) +{ + static const struct reg_val regs[] = { + { MDIO_MMD_PMAPMD, 0xc04a, 0xffff, 0x5a00 }, + { 0, 0, 0, 0 } + }; + static const struct reg_val preemphasis[] = { + { MDIO_MMD_PMAPMD, 0xc014, 0xffff, 0xfe16 }, + { MDIO_MMD_PMAPMD, 0xc015, 0xffff, 0xa000 }, + { 0, 0, 0, 0 } + }; + int i, err; + + err = set_phy_regs(phy, regs); + if (!err && modtype == phy_modtype_twinax_long) + err = set_phy_regs(phy, preemphasis); + if (err) + return err; + + msleep(50); + + if (phy->priv != edc_twinax) + err = t3_get_edc_fw(phy, EDC_TWX_AEL2005, + EDC_TWX_AEL2005_SIZE); + if (err) + return err; + + for (i = 0; i < EDC_TWX_AEL2005_SIZE / sizeof(u16) && !err; i += 2) + err = t3_mdio_write(phy, MDIO_MMD_PMAPMD, + phy->phy_cache[i], + phy->phy_cache[i + 1]); + if (!err) + phy->priv = edc_twinax; + return err; +} + +static int ael2005_get_module_type(struct cphy *phy, int delay_ms) +{ + int v; + unsigned int stat; + + v = t3_mdio_read(phy, MDIO_MMD_PMAPMD, AEL2005_GPIO_CTRL, &stat); + if (v) + return v; + + if (stat & (1 << 8)) /* module absent */ + return phy_modtype_none; + + return ael2xxx_get_module_type(phy, delay_ms); +} + +static int ael2005_intr_enable(struct cphy *phy) +{ + int err = t3_mdio_write(phy, MDIO_MMD_PMAPMD, AEL2005_GPIO_CTRL, 0x200); + return err ? err : t3_phy_lasi_intr_enable(phy); +} + +static int ael2005_intr_disable(struct cphy *phy) +{ + int err = t3_mdio_write(phy, MDIO_MMD_PMAPMD, AEL2005_GPIO_CTRL, 0x100); + return err ? err : t3_phy_lasi_intr_disable(phy); +} + +static int ael2005_intr_clear(struct cphy *phy) +{ + int err = t3_mdio_write(phy, MDIO_MMD_PMAPMD, AEL2005_GPIO_CTRL, 0xd00); + return err ? err : t3_phy_lasi_intr_clear(phy); +} + +static int ael2005_reset(struct cphy *phy, int wait) +{ + static const struct reg_val regs0[] = { + { MDIO_MMD_PMAPMD, 0xc001, 0, 1 << 5 }, + { MDIO_MMD_PMAPMD, 0xc017, 0, 1 << 5 }, + { MDIO_MMD_PMAPMD, 0xc013, 0xffff, 0xf341 }, + { MDIO_MMD_PMAPMD, 0xc210, 0xffff, 0x8000 }, + { MDIO_MMD_PMAPMD, 0xc210, 0xffff, 0x8100 }, + { MDIO_MMD_PMAPMD, 0xc210, 0xffff, 0x8000 }, + { MDIO_MMD_PMAPMD, 0xc210, 0xffff, 0 }, + { 0, 0, 0, 0 } + }; + static const struct reg_val regs1[] = { + { MDIO_MMD_PMAPMD, 0xca00, 0xffff, 0x0080 }, + { MDIO_MMD_PMAPMD, 0xca12, 0xffff, 0 }, + { 0, 0, 0, 0 } + }; + + int err; + unsigned int lasi_ctrl; + + err = t3_mdio_read(phy, MDIO_MMD_PMAPMD, MDIO_PMA_LASI_CTRL, + &lasi_ctrl); + if (err) + return err; + + err = t3_phy_reset(phy, MDIO_MMD_PMAPMD, 0); + if (err) + return err; + + msleep(125); + phy->priv = edc_none; + err = set_phy_regs(phy, regs0); + if (err) + return err; + + msleep(50); + + err = ael2005_get_module_type(phy, 0); + if (err < 0) + return err; + phy->modtype = err; + + if (err == phy_modtype_twinax || err == phy_modtype_twinax_long) + err = ael2005_setup_twinax_edc(phy, err); + else + err = ael2005_setup_sr_edc(phy); + if (err) + return err; + + err = set_phy_regs(phy, regs1); + if (err) + return err; + + /* reset wipes out interrupts, reenable them if they were on */ + if (lasi_ctrl & 1) + err = ael2005_intr_enable(phy); + return err; +} + +static int ael2005_intr_handler(struct cphy *phy) +{ + unsigned int stat; + int ret, edc_needed, cause = 0; + + ret = t3_mdio_read(phy, MDIO_MMD_PMAPMD, AEL2005_GPIO_STAT, &stat); + if (ret) + return ret; + + if (stat & AEL2005_MODDET_IRQ) { + ret = t3_mdio_write(phy, MDIO_MMD_PMAPMD, AEL2005_GPIO_CTRL, + 0xd00); + if (ret) + return ret; + + /* modules have max 300 ms init time after hot plug */ + ret = ael2005_get_module_type(phy, 300); + if (ret < 0) + return ret; + + phy->modtype = ret; + if (ret == phy_modtype_none) + edc_needed = phy->priv; /* on unplug retain EDC */ + else if (ret == phy_modtype_twinax || + ret == phy_modtype_twinax_long) + edc_needed = edc_twinax; + else + edc_needed = edc_sr; + + if (edc_needed != phy->priv) { + ret = ael2005_reset(phy, 0); + return ret ? ret : cphy_cause_module_change; + } + cause = cphy_cause_module_change; + } + + ret = t3_phy_lasi_intr_handler(phy); + if (ret < 0) + return ret; + + ret |= cause; + return ret ? ret : cphy_cause_link_change; +} + +static struct cphy_ops ael2005_ops = { + .reset = ael2005_reset, + .intr_enable = ael2005_intr_enable, + .intr_disable = ael2005_intr_disable, + .intr_clear = ael2005_intr_clear, + .intr_handler = ael2005_intr_handler, + .get_link_status = get_link_status_r, + .power_down = ael1002_power_down, + .mmds = MDIO_DEVS_PMAPMD | MDIO_DEVS_PCS | MDIO_DEVS_PHYXS, +}; + +int t3_ael2005_phy_prep(struct cphy *phy, struct adapter *adapter, + int phy_addr, const struct mdio_ops *mdio_ops) +{ + cphy_init(phy, adapter, phy_addr, &ael2005_ops, mdio_ops, + SUPPORTED_10000baseT_Full | SUPPORTED_AUI | SUPPORTED_FIBRE | + SUPPORTED_IRQ, "10GBASE-R"); + msleep(125); + return t3_mdio_change_bits(phy, MDIO_MMD_PMAPMD, AEL_OPT_SETTINGS, 0, + 1 << 5); +} + +/* + * Setup EDC and other parameters for operation with an optical module. + */ +static int ael2020_setup_sr_edc(struct cphy *phy) +{ + static const struct reg_val regs[] = { + /* set CDR offset to 10 */ + { MDIO_MMD_PMAPMD, 0xcc01, 0xffff, 0x488a }, + + /* adjust 10G RX bias current */ + { MDIO_MMD_PMAPMD, 0xcb1b, 0xffff, 0x0200 }, + { MDIO_MMD_PMAPMD, 0xcb1c, 0xffff, 0x00f0 }, + { MDIO_MMD_PMAPMD, 0xcc06, 0xffff, 0x00e0 }, + + /* end */ + { 0, 0, 0, 0 } + }; + int err; + + err = set_phy_regs(phy, regs); + msleep(50); + if (err) + return err; + + phy->priv = edc_sr; + return 0; +} + +/* + * Setup EDC and other parameters for operation with an TWINAX module. + */ +static int ael2020_setup_twinax_edc(struct cphy *phy, int modtype) +{ + /* set uC to 40MHz */ + static const struct reg_val uCclock40MHz[] = { + { MDIO_MMD_PMAPMD, 0xff28, 0xffff, 0x4001 }, + { MDIO_MMD_PMAPMD, 0xff2a, 0xffff, 0x0002 }, + { 0, 0, 0, 0 } + }; + + /* activate uC clock */ + static const struct reg_val uCclockActivate[] = { + { MDIO_MMD_PMAPMD, 0xd000, 0xffff, 0x5200 }, + { 0, 0, 0, 0 } + }; + + /* set PC to start of SRAM and activate uC */ + static const struct reg_val uCactivate[] = { + { MDIO_MMD_PMAPMD, 0xd080, 0xffff, 0x0100 }, + { MDIO_MMD_PMAPMD, 0xd092, 0xffff, 0x0000 }, + { 0, 0, 0, 0 } + }; + int i, err; + + /* set uC clock and activate it */ + err = set_phy_regs(phy, uCclock40MHz); + msleep(500); + if (err) + return err; + err = set_phy_regs(phy, uCclockActivate); + msleep(500); + if (err) + return err; + + if (phy->priv != edc_twinax) + err = t3_get_edc_fw(phy, EDC_TWX_AEL2020, + EDC_TWX_AEL2020_SIZE); + if (err) + return err; + + for (i = 0; i < EDC_TWX_AEL2020_SIZE / sizeof(u16) && !err; i += 2) + err = t3_mdio_write(phy, MDIO_MMD_PMAPMD, + phy->phy_cache[i], + phy->phy_cache[i + 1]); + /* activate uC */ + err = set_phy_regs(phy, uCactivate); + if (!err) + phy->priv = edc_twinax; + return err; +} + +/* + * Return Module Type. + */ +static int ael2020_get_module_type(struct cphy *phy, int delay_ms) +{ + int v; + unsigned int stat; + + v = t3_mdio_read(phy, MDIO_MMD_PMAPMD, AEL2020_GPIO_STAT, &stat); + if (v) + return v; + + if (stat & (0x1 << (AEL2020_GPIO_MODDET*4))) { + /* module absent */ + return phy_modtype_none; + } + + return ael2xxx_get_module_type(phy, delay_ms); +} + +/* + * Enable PHY interrupts. We enable "Module Detection" interrupts (on any + * state transition) and then generic Link Alarm Status Interrupt (LASI). + */ +static int ael2020_intr_enable(struct cphy *phy) +{ + static const struct reg_val regs[] = { + /* output Module's Loss Of Signal (LOS) to LED */ + { MDIO_MMD_PMAPMD, AEL2020_GPIO_CFG+AEL2020_GPIO_LSTAT, + 0xffff, 0x4 }, + { MDIO_MMD_PMAPMD, AEL2020_GPIO_CTRL, + 0xffff, 0x8 << (AEL2020_GPIO_LSTAT*4) }, + + /* enable module detect status change interrupts */ + { MDIO_MMD_PMAPMD, AEL2020_GPIO_CTRL, + 0xffff, 0x2 << (AEL2020_GPIO_MODDET*4) }, + + /* end */ + { 0, 0, 0, 0 } + }; + int err, link_ok = 0; + + /* set up "link status" LED and enable module change interrupts */ + err = set_phy_regs(phy, regs); + if (err) + return err; + + err = get_link_status_r(phy, &link_ok, NULL, NULL, NULL); + if (err) + return err; + if (link_ok) + t3_link_changed(phy->adapter, + phy2portid(phy)); + + err = t3_phy_lasi_intr_enable(phy); + if (err) + return err; + + return 0; +} + +/* + * Disable PHY interrupts. The mirror of the above ... + */ +static int ael2020_intr_disable(struct cphy *phy) +{ + static const struct reg_val regs[] = { + /* reset "link status" LED to "off" */ + { MDIO_MMD_PMAPMD, AEL2020_GPIO_CTRL, + 0xffff, 0xb << (AEL2020_GPIO_LSTAT*4) }, + + /* disable module detect status change interrupts */ + { MDIO_MMD_PMAPMD, AEL2020_GPIO_CTRL, + 0xffff, 0x1 << (AEL2020_GPIO_MODDET*4) }, + + /* end */ + { 0, 0, 0, 0 } + }; + int err; + + /* turn off "link status" LED and disable module change interrupts */ + err = set_phy_regs(phy, regs); + if (err) + return err; + + return t3_phy_lasi_intr_disable(phy); +} + +/* + * Clear PHY interrupt state. + */ +static int ael2020_intr_clear(struct cphy *phy) +{ + /* + * The GPIO Interrupt register on the AEL2020 is a "Latching High" + * (LH) register which is cleared to the current state when it's read. + * Thus, we simply read the register and discard the result. + */ + unsigned int stat; + int err = t3_mdio_read(phy, MDIO_MMD_PMAPMD, AEL2020_GPIO_INTR, &stat); + return err ? err : t3_phy_lasi_intr_clear(phy); +} + +static const struct reg_val ael2020_reset_regs[] = { + /* Erratum #2: CDRLOL asserted, causing PMA link down status */ + { MDIO_MMD_PMAPMD, 0xc003, 0xffff, 0x3101 }, + + /* force XAUI to send LF when RX_LOS is asserted */ + { MDIO_MMD_PMAPMD, 0xcd40, 0xffff, 0x0001 }, + + /* allow writes to transceiver module EEPROM on i2c bus */ + { MDIO_MMD_PMAPMD, 0xff02, 0xffff, 0x0023 }, + { MDIO_MMD_PMAPMD, 0xff03, 0xffff, 0x0000 }, + { MDIO_MMD_PMAPMD, 0xff04, 0xffff, 0x0000 }, + + /* end */ + { 0, 0, 0, 0 } +}; +/* + * Reset the PHY and put it into a canonical operating state. + */ +static int ael2020_reset(struct cphy *phy, int wait) +{ + int err; + unsigned int lasi_ctrl; + + /* grab current interrupt state */ + err = t3_mdio_read(phy, MDIO_MMD_PMAPMD, MDIO_PMA_LASI_CTRL, + &lasi_ctrl); + if (err) + return err; + + err = t3_phy_reset(phy, MDIO_MMD_PMAPMD, 125); + if (err) + return err; + msleep(100); + + /* basic initialization for all module types */ + phy->priv = edc_none; + err = set_phy_regs(phy, ael2020_reset_regs); + if (err) + return err; + + /* determine module type and perform appropriate initialization */ + err = ael2020_get_module_type(phy, 0); + if (err < 0) + return err; + phy->modtype = (u8)err; + if (err == phy_modtype_twinax || err == phy_modtype_twinax_long) + err = ael2020_setup_twinax_edc(phy, err); + else + err = ael2020_setup_sr_edc(phy); + if (err) + return err; + + /* reset wipes out interrupts, reenable them if they were on */ + if (lasi_ctrl & 1) + err = ael2005_intr_enable(phy); + return err; +} + +/* + * Handle a PHY interrupt. + */ +static int ael2020_intr_handler(struct cphy *phy) +{ + unsigned int stat; + int ret, edc_needed, cause = 0; + + ret = t3_mdio_read(phy, MDIO_MMD_PMAPMD, AEL2020_GPIO_INTR, &stat); + if (ret) + return ret; + + if (stat & (0x1 << AEL2020_GPIO_MODDET)) { + /* modules have max 300 ms init time after hot plug */ + ret = ael2020_get_module_type(phy, 300); + if (ret < 0) + return ret; + + phy->modtype = (u8)ret; + if (ret == phy_modtype_none) + edc_needed = phy->priv; /* on unplug retain EDC */ + else if (ret == phy_modtype_twinax || + ret == phy_modtype_twinax_long) + edc_needed = edc_twinax; + else + edc_needed = edc_sr; + + if (edc_needed != phy->priv) { + ret = ael2020_reset(phy, 0); + return ret ? ret : cphy_cause_module_change; + } + cause = cphy_cause_module_change; + } + + ret = t3_phy_lasi_intr_handler(phy); + if (ret < 0) + return ret; + + ret |= cause; + return ret ? ret : cphy_cause_link_change; +} + +static struct cphy_ops ael2020_ops = { + .reset = ael2020_reset, + .intr_enable = ael2020_intr_enable, + .intr_disable = ael2020_intr_disable, + .intr_clear = ael2020_intr_clear, + .intr_handler = ael2020_intr_handler, + .get_link_status = get_link_status_r, + .power_down = ael1002_power_down, + .mmds = MDIO_DEVS_PMAPMD | MDIO_DEVS_PCS | MDIO_DEVS_PHYXS, +}; + +int t3_ael2020_phy_prep(struct cphy *phy, struct adapter *adapter, int phy_addr, + const struct mdio_ops *mdio_ops) +{ + int err; + + cphy_init(phy, adapter, phy_addr, &ael2020_ops, mdio_ops, + SUPPORTED_10000baseT_Full | SUPPORTED_AUI | SUPPORTED_FIBRE | + SUPPORTED_IRQ, "10GBASE-R"); + msleep(125); + + err = set_phy_regs(phy, ael2020_reset_regs); + if (err) + return err; + return 0; +} + +/* + * Get link status for a 10GBASE-X device. + */ +static int get_link_status_x(struct cphy *phy, int *link_ok, int *speed, + int *duplex, int *fc) +{ + if (link_ok) { + unsigned int stat0, stat1, stat2; + int err = t3_mdio_read(phy, MDIO_MMD_PMAPMD, + MDIO_PMA_RXDET, &stat0); + + if (!err) + err = t3_mdio_read(phy, MDIO_MMD_PCS, + MDIO_PCS_10GBX_STAT1, &stat1); + if (!err) + err = t3_mdio_read(phy, MDIO_MMD_PHYXS, + MDIO_PHYXS_LNSTAT, &stat2); + if (err) + return err; + *link_ok = (stat0 & (stat1 >> 12) & (stat2 >> 12)) & 1; + } + if (speed) + *speed = SPEED_10000; + if (duplex) + *duplex = DUPLEX_FULL; + return 0; +} + +static struct cphy_ops qt2045_ops = { + .reset = ael1006_reset, + .intr_enable = t3_phy_lasi_intr_enable, + .intr_disable = t3_phy_lasi_intr_disable, + .intr_clear = t3_phy_lasi_intr_clear, + .intr_handler = t3_phy_lasi_intr_handler, + .get_link_status = get_link_status_x, + .power_down = ael1002_power_down, + .mmds = MDIO_DEVS_PMAPMD | MDIO_DEVS_PCS | MDIO_DEVS_PHYXS, +}; + +int t3_qt2045_phy_prep(struct cphy *phy, struct adapter *adapter, + int phy_addr, const struct mdio_ops *mdio_ops) +{ + unsigned int stat; + + cphy_init(phy, adapter, phy_addr, &qt2045_ops, mdio_ops, + SUPPORTED_10000baseT_Full | SUPPORTED_AUI | SUPPORTED_TP, + "10GBASE-CX4"); + + /* + * Some cards where the PHY is supposed to be at address 0 actually + * have it at 1. + */ + if (!phy_addr && + !t3_mdio_read(phy, MDIO_MMD_PMAPMD, MDIO_STAT1, &stat) && + stat == 0xffff) + phy->mdio.prtad = 1; + return 0; +} + +static int xaui_direct_reset(struct cphy *phy, int wait) +{ + return 0; +} + +static int xaui_direct_get_link_status(struct cphy *phy, int *link_ok, + int *speed, int *duplex, int *fc) +{ + if (link_ok) { + unsigned int status; + int prtad = phy->mdio.prtad; + + status = t3_read_reg(phy->adapter, + XGM_REG(A_XGM_SERDES_STAT0, prtad)) | + t3_read_reg(phy->adapter, + XGM_REG(A_XGM_SERDES_STAT1, prtad)) | + t3_read_reg(phy->adapter, + XGM_REG(A_XGM_SERDES_STAT2, prtad)) | + t3_read_reg(phy->adapter, + XGM_REG(A_XGM_SERDES_STAT3, prtad)); + *link_ok = !(status & F_LOWSIG0); + } + if (speed) + *speed = SPEED_10000; + if (duplex) + *duplex = DUPLEX_FULL; + return 0; +} + +static int xaui_direct_power_down(struct cphy *phy, int enable) +{ + return 0; +} + +static struct cphy_ops xaui_direct_ops = { + .reset = xaui_direct_reset, + .intr_enable = ael1002_intr_noop, + .intr_disable = ael1002_intr_noop, + .intr_clear = ael1002_intr_noop, + .intr_handler = ael1002_intr_noop, + .get_link_status = xaui_direct_get_link_status, + .power_down = xaui_direct_power_down, +}; + +int t3_xaui_direct_phy_prep(struct cphy *phy, struct adapter *adapter, + int phy_addr, const struct mdio_ops *mdio_ops) +{ + cphy_init(phy, adapter, phy_addr, &xaui_direct_ops, mdio_ops, + SUPPORTED_10000baseT_Full | SUPPORTED_AUI | SUPPORTED_TP, + "10GBASE-CX4"); + return 0; +} diff --git a/drivers/net/ethernet/chelsio/cxgb3/aq100x.c b/drivers/net/ethernet/chelsio/cxgb3/aq100x.c new file mode 100644 index 0000000..341b7ef --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb3/aq100x.c @@ -0,0 +1,354 @@ +/* + * Copyright (c) 2005-2008 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "common.h" +#include "regs.h" + +enum { + /* MDIO_DEV_PMA_PMD registers */ + AQ_LINK_STAT = 0xe800, + AQ_IMASK_PMA = 0xf000, + + /* MDIO_DEV_XGXS registers */ + AQ_XAUI_RX_CFG = 0xc400, + AQ_XAUI_TX_CFG = 0xe400, + + /* MDIO_DEV_ANEG registers */ + AQ_1G_CTRL = 0xc400, + AQ_ANEG_STAT = 0xc800, + + /* MDIO_DEV_VEND1 registers */ + AQ_FW_VERSION = 0x0020, + AQ_IFLAG_GLOBAL = 0xfc00, + AQ_IMASK_GLOBAL = 0xff00, +}; + +enum { + IMASK_PMA = 1 << 2, + IMASK_GLOBAL = 1 << 15, + ADV_1G_FULL = 1 << 15, + ADV_1G_HALF = 1 << 14, + ADV_10G_FULL = 1 << 12, + AQ_RESET = (1 << 14) | (1 << 15), + AQ_LOWPOWER = 1 << 12, +}; + +static int aq100x_reset(struct cphy *phy, int wait) +{ + /* + * Ignore the caller specified wait time; always wait for the reset to + * complete. Can take up to 3s. + */ + int err = t3_phy_reset(phy, MDIO_MMD_VEND1, 3000); + + if (err) + CH_WARN(phy->adapter, "PHY%d: reset failed (0x%x).\n", + phy->mdio.prtad, err); + + return err; +} + +static int aq100x_intr_enable(struct cphy *phy) +{ + int err = t3_mdio_write(phy, MDIO_MMD_PMAPMD, AQ_IMASK_PMA, IMASK_PMA); + if (err) + return err; + + err = t3_mdio_write(phy, MDIO_MMD_VEND1, AQ_IMASK_GLOBAL, IMASK_GLOBAL); + return err; +} + +static int aq100x_intr_disable(struct cphy *phy) +{ + return t3_mdio_write(phy, MDIO_MMD_VEND1, AQ_IMASK_GLOBAL, 0); +} + +static int aq100x_intr_clear(struct cphy *phy) +{ + unsigned int v; + + t3_mdio_read(phy, MDIO_MMD_VEND1, AQ_IFLAG_GLOBAL, &v); + t3_mdio_read(phy, MDIO_MMD_PMAPMD, MDIO_STAT1, &v); + + return 0; +} + +static int aq100x_intr_handler(struct cphy *phy) +{ + int err; + unsigned int cause, v; + + err = t3_mdio_read(phy, MDIO_MMD_VEND1, AQ_IFLAG_GLOBAL, &cause); + if (err) + return err; + + /* Read (and reset) the latching version of the status */ + t3_mdio_read(phy, MDIO_MMD_PMAPMD, MDIO_STAT1, &v); + + return cphy_cause_link_change; +} + +static int aq100x_power_down(struct cphy *phy, int off) +{ + return mdio_set_flag(&phy->mdio, phy->mdio.prtad, + MDIO_MMD_PMAPMD, MDIO_CTRL1, + MDIO_CTRL1_LPOWER, off); +} + +static int aq100x_autoneg_enable(struct cphy *phy) +{ + int err; + + err = aq100x_power_down(phy, 0); + if (!err) + err = mdio_set_flag(&phy->mdio, phy->mdio.prtad, + MDIO_MMD_AN, MDIO_CTRL1, + BMCR_ANENABLE | BMCR_ANRESTART, 1); + + return err; +} + +static int aq100x_autoneg_restart(struct cphy *phy) +{ + int err; + + err = aq100x_power_down(phy, 0); + if (!err) + err = mdio_set_flag(&phy->mdio, phy->mdio.prtad, + MDIO_MMD_AN, MDIO_CTRL1, + BMCR_ANENABLE | BMCR_ANRESTART, 1); + + return err; +} + +static int aq100x_advertise(struct cphy *phy, unsigned int advertise_map) +{ + unsigned int adv; + int err; + + /* 10G advertisement */ + adv = 0; + if (advertise_map & ADVERTISED_10000baseT_Full) + adv |= ADV_10G_FULL; + err = t3_mdio_change_bits(phy, MDIO_MMD_AN, MDIO_AN_10GBT_CTRL, + ADV_10G_FULL, adv); + if (err) + return err; + + /* 1G advertisement */ + adv = 0; + if (advertise_map & ADVERTISED_1000baseT_Full) + adv |= ADV_1G_FULL; + if (advertise_map & ADVERTISED_1000baseT_Half) + adv |= ADV_1G_HALF; + err = t3_mdio_change_bits(phy, MDIO_MMD_AN, AQ_1G_CTRL, + ADV_1G_FULL | ADV_1G_HALF, adv); + if (err) + return err; + + /* 100M, pause advertisement */ + adv = 0; + if (advertise_map & ADVERTISED_100baseT_Half) + adv |= ADVERTISE_100HALF; + if (advertise_map & ADVERTISED_100baseT_Full) + adv |= ADVERTISE_100FULL; + if (advertise_map & ADVERTISED_Pause) + adv |= ADVERTISE_PAUSE_CAP; + if (advertise_map & ADVERTISED_Asym_Pause) + adv |= ADVERTISE_PAUSE_ASYM; + err = t3_mdio_change_bits(phy, MDIO_MMD_AN, MDIO_AN_ADVERTISE, + 0xfe0, adv); + + return err; +} + +static int aq100x_set_loopback(struct cphy *phy, int mmd, int dir, int enable) +{ + return mdio_set_flag(&phy->mdio, phy->mdio.prtad, + MDIO_MMD_PMAPMD, MDIO_CTRL1, + BMCR_LOOPBACK, enable); +} + +static int aq100x_set_speed_duplex(struct cphy *phy, int speed, int duplex) +{ + /* no can do */ + return -1; +} + +static int aq100x_get_link_status(struct cphy *phy, int *link_ok, + int *speed, int *duplex, int *fc) +{ + int err; + unsigned int v; + + if (link_ok) { + err = t3_mdio_read(phy, MDIO_MMD_PMAPMD, AQ_LINK_STAT, &v); + if (err) + return err; + + *link_ok = v & 1; + if (!*link_ok) + return 0; + } + + err = t3_mdio_read(phy, MDIO_MMD_AN, AQ_ANEG_STAT, &v); + if (err) + return err; + + if (speed) { + switch (v & 0x6) { + case 0x6: + *speed = SPEED_10000; + break; + case 0x4: + *speed = SPEED_1000; + break; + case 0x2: + *speed = SPEED_100; + break; + case 0x0: + *speed = SPEED_10; + break; + } + } + + if (duplex) + *duplex = v & 1 ? DUPLEX_FULL : DUPLEX_HALF; + + return 0; +} + +static struct cphy_ops aq100x_ops = { + .reset = aq100x_reset, + .intr_enable = aq100x_intr_enable, + .intr_disable = aq100x_intr_disable, + .intr_clear = aq100x_intr_clear, + .intr_handler = aq100x_intr_handler, + .autoneg_enable = aq100x_autoneg_enable, + .autoneg_restart = aq100x_autoneg_restart, + .advertise = aq100x_advertise, + .set_loopback = aq100x_set_loopback, + .set_speed_duplex = aq100x_set_speed_duplex, + .get_link_status = aq100x_get_link_status, + .power_down = aq100x_power_down, + .mmds = MDIO_DEVS_PMAPMD | MDIO_DEVS_PCS | MDIO_DEVS_PHYXS, +}; + +int t3_aq100x_phy_prep(struct cphy *phy, struct adapter *adapter, int phy_addr, + const struct mdio_ops *mdio_ops) +{ + unsigned int v, v2, gpio, wait; + int err; + + cphy_init(phy, adapter, phy_addr, &aq100x_ops, mdio_ops, + SUPPORTED_1000baseT_Full | SUPPORTED_10000baseT_Full | + SUPPORTED_TP | SUPPORTED_Autoneg | SUPPORTED_AUI, + "1000/10GBASE-T"); + + /* + * The PHY has been out of reset ever since the system powered up. So + * we do a hard reset over here. + */ + gpio = phy_addr ? F_GPIO10_OUT_VAL : F_GPIO6_OUT_VAL; + t3_set_reg_field(adapter, A_T3DBG_GPIO_EN, gpio, 0); + msleep(1); + t3_set_reg_field(adapter, A_T3DBG_GPIO_EN, gpio, gpio); + + /* + * Give it enough time to load the firmware and get ready for mdio. + */ + msleep(1000); + wait = 500; /* in 10ms increments */ + do { + err = t3_mdio_read(phy, MDIO_MMD_VEND1, MDIO_CTRL1, &v); + if (err || v == 0xffff) { + + /* Allow prep_adapter to succeed when ffff is read */ + + CH_WARN(adapter, "PHY%d: reset failed (0x%x, 0x%x).\n", + phy_addr, err, v); + goto done; + } + + v &= AQ_RESET; + if (v) + msleep(10); + } while (v && --wait); + if (v) { + CH_WARN(adapter, "PHY%d: reset timed out (0x%x).\n", + phy_addr, v); + + goto done; /* let prep_adapter succeed */ + } + + /* Datasheet says 3s max but this has been observed */ + wait = (500 - wait) * 10 + 1000; + if (wait > 3000) + CH_WARN(adapter, "PHY%d: reset took %ums\n", phy_addr, wait); + + /* Firmware version check. */ + t3_mdio_read(phy, MDIO_MMD_VEND1, AQ_FW_VERSION, &v); + if (v != 101) + CH_WARN(adapter, "PHY%d: unsupported firmware %d\n", + phy_addr, v); + + /* + * The PHY should start in really-low-power mode. Prepare it for normal + * operations. + */ + err = t3_mdio_read(phy, MDIO_MMD_VEND1, MDIO_CTRL1, &v); + if (err) + return err; + if (v & AQ_LOWPOWER) { + err = t3_mdio_change_bits(phy, MDIO_MMD_VEND1, MDIO_CTRL1, + AQ_LOWPOWER, 0); + if (err) + return err; + msleep(10); + } else + CH_WARN(adapter, "PHY%d does not start in low power mode.\n", + phy_addr); + + /* + * Verify XAUI settings, but let prep succeed no matter what. + */ + v = v2 = 0; + t3_mdio_read(phy, MDIO_MMD_PHYXS, AQ_XAUI_RX_CFG, &v); + t3_mdio_read(phy, MDIO_MMD_PHYXS, AQ_XAUI_TX_CFG, &v2); + if (v != 0x1b || v2 != 0x1b) + CH_WARN(adapter, + "PHY%d: incorrect XAUI settings (0x%x, 0x%x).\n", + phy_addr, v, v2); + +done: + return err; +} diff --git a/drivers/net/ethernet/chelsio/cxgb3/common.h b/drivers/net/ethernet/chelsio/cxgb3/common.h new file mode 100644 index 0000000..4424809 --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb3/common.h @@ -0,0 +1,773 @@ +/* + * Copyright (c) 2005-2008 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __CHELSIO_COMMON_H +#define __CHELSIO_COMMON_H + +#include +#include +#include +#include +#include +#include +#include +#include "version.h" + +#define CH_ERR(adap, fmt, ...) dev_err(&adap->pdev->dev, fmt, ##__VA_ARGS__) +#define CH_WARN(adap, fmt, ...) dev_warn(&adap->pdev->dev, fmt, ##__VA_ARGS__) +#define CH_ALERT(adap, fmt, ...) dev_alert(&adap->pdev->dev, fmt, ##__VA_ARGS__) + +/* + * More powerful macro that selectively prints messages based on msg_enable. + * For info and debugging messages. + */ +#define CH_MSG(adapter, level, category, fmt, ...) do { \ + if ((adapter)->msg_enable & NETIF_MSG_##category) \ + dev_printk(KERN_##level, &adapter->pdev->dev, fmt, \ + ## __VA_ARGS__); \ +} while (0) + +#ifdef DEBUG +# define CH_DBG(adapter, category, fmt, ...) \ + CH_MSG(adapter, DEBUG, category, fmt, ## __VA_ARGS__) +#else +# define CH_DBG(adapter, category, fmt, ...) +#endif + +/* Additional NETIF_MSG_* categories */ +#define NETIF_MSG_MMIO 0x8000000 + +enum { + MAX_NPORTS = 2, /* max # of ports */ + MAX_FRAME_SIZE = 10240, /* max MAC frame size, including header + FCS */ + EEPROMSIZE = 8192, /* Serial EEPROM size */ + SERNUM_LEN = 16, /* Serial # length */ + RSS_TABLE_SIZE = 64, /* size of RSS lookup and mapping tables */ + TCB_SIZE = 128, /* TCB size */ + NMTUS = 16, /* size of MTU table */ + NCCTRL_WIN = 32, /* # of congestion control windows */ + PROTO_SRAM_LINES = 128, /* size of TP sram */ +}; + +#define MAX_RX_COALESCING_LEN 12288U + +enum { + PAUSE_RX = 1 << 0, + PAUSE_TX = 1 << 1, + PAUSE_AUTONEG = 1 << 2 +}; + +enum { + SUPPORTED_IRQ = 1 << 24 +}; + +enum { /* adapter interrupt-maintained statistics */ + STAT_ULP_CH0_PBL_OOB, + STAT_ULP_CH1_PBL_OOB, + STAT_PCI_CORR_ECC, + + IRQ_NUM_STATS /* keep last */ +}; + +#define TP_VERSION_MAJOR 1 +#define TP_VERSION_MINOR 1 +#define TP_VERSION_MICRO 0 + +#define S_TP_VERSION_MAJOR 16 +#define M_TP_VERSION_MAJOR 0xFF +#define V_TP_VERSION_MAJOR(x) ((x) << S_TP_VERSION_MAJOR) +#define G_TP_VERSION_MAJOR(x) \ + (((x) >> S_TP_VERSION_MAJOR) & M_TP_VERSION_MAJOR) + +#define S_TP_VERSION_MINOR 8 +#define M_TP_VERSION_MINOR 0xFF +#define V_TP_VERSION_MINOR(x) ((x) << S_TP_VERSION_MINOR) +#define G_TP_VERSION_MINOR(x) \ + (((x) >> S_TP_VERSION_MINOR) & M_TP_VERSION_MINOR) + +#define S_TP_VERSION_MICRO 0 +#define M_TP_VERSION_MICRO 0xFF +#define V_TP_VERSION_MICRO(x) ((x) << S_TP_VERSION_MICRO) +#define G_TP_VERSION_MICRO(x) \ + (((x) >> S_TP_VERSION_MICRO) & M_TP_VERSION_MICRO) + +enum { + SGE_QSETS = 8, /* # of SGE Tx/Rx/RspQ sets */ + SGE_RXQ_PER_SET = 2, /* # of Rx queues per set */ + SGE_TXQ_PER_SET = 3 /* # of Tx queues per set */ +}; + +enum sge_context_type { /* SGE egress context types */ + SGE_CNTXT_RDMA = 0, + SGE_CNTXT_ETH = 2, + SGE_CNTXT_OFLD = 4, + SGE_CNTXT_CTRL = 5 +}; + +enum { + AN_PKT_SIZE = 32, /* async notification packet size */ + IMMED_PKT_SIZE = 48 /* packet size for immediate data */ +}; + +struct sg_ent { /* SGE scatter/gather entry */ + __be32 len[2]; + __be64 addr[2]; +}; + +#ifndef SGE_NUM_GENBITS +/* Must be 1 or 2 */ +# define SGE_NUM_GENBITS 2 +#endif + +#define TX_DESC_FLITS 16U +#define WR_FLITS (TX_DESC_FLITS + 1 - SGE_NUM_GENBITS) + +struct cphy; +struct adapter; + +struct mdio_ops { + int (*read)(struct net_device *dev, int phy_addr, int mmd_addr, + u16 reg_addr); + int (*write)(struct net_device *dev, int phy_addr, int mmd_addr, + u16 reg_addr, u16 val); + unsigned mode_support; +}; + +struct adapter_info { + unsigned char nports0; /* # of ports on channel 0 */ + unsigned char nports1; /* # of ports on channel 1 */ + unsigned char phy_base_addr; /* MDIO PHY base address */ + unsigned int gpio_out; /* GPIO output settings */ + unsigned char gpio_intr[MAX_NPORTS]; /* GPIO PHY IRQ pins */ + unsigned long caps; /* adapter capabilities */ + const struct mdio_ops *mdio_ops; /* MDIO operations */ + const char *desc; /* product description */ +}; + +struct mc5_stats { + unsigned long parity_err; + unsigned long active_rgn_full; + unsigned long nfa_srch_err; + unsigned long unknown_cmd; + unsigned long reqq_parity_err; + unsigned long dispq_parity_err; + unsigned long del_act_empty; +}; + +struct mc7_stats { + unsigned long corr_err; + unsigned long uncorr_err; + unsigned long parity_err; + unsigned long addr_err; +}; + +struct mac_stats { + u64 tx_octets; /* total # of octets in good frames */ + u64 tx_octets_bad; /* total # of octets in error frames */ + u64 tx_frames; /* all good frames */ + u64 tx_mcast_frames; /* good multicast frames */ + u64 tx_bcast_frames; /* good broadcast frames */ + u64 tx_pause; /* # of transmitted pause frames */ + u64 tx_deferred; /* frames with deferred transmissions */ + u64 tx_late_collisions; /* # of late collisions */ + u64 tx_total_collisions; /* # of total collisions */ + u64 tx_excess_collisions; /* frame errors from excessive collissions */ + u64 tx_underrun; /* # of Tx FIFO underruns */ + u64 tx_len_errs; /* # of Tx length errors */ + u64 tx_mac_internal_errs; /* # of internal MAC errors on Tx */ + u64 tx_excess_deferral; /* # of frames with excessive deferral */ + u64 tx_fcs_errs; /* # of frames with bad FCS */ + + u64 tx_frames_64; /* # of Tx frames in a particular range */ + u64 tx_frames_65_127; + u64 tx_frames_128_255; + u64 tx_frames_256_511; + u64 tx_frames_512_1023; + u64 tx_frames_1024_1518; + u64 tx_frames_1519_max; + + u64 rx_octets; /* total # of octets in good frames */ + u64 rx_octets_bad; /* total # of octets in error frames */ + u64 rx_frames; /* all good frames */ + u64 rx_mcast_frames; /* good multicast frames */ + u64 rx_bcast_frames; /* good broadcast frames */ + u64 rx_pause; /* # of received pause frames */ + u64 rx_fcs_errs; /* # of received frames with bad FCS */ + u64 rx_align_errs; /* alignment errors */ + u64 rx_symbol_errs; /* symbol errors */ + u64 rx_data_errs; /* data errors */ + u64 rx_sequence_errs; /* sequence errors */ + u64 rx_runt; /* # of runt frames */ + u64 rx_jabber; /* # of jabber frames */ + u64 rx_short; /* # of short frames */ + u64 rx_too_long; /* # of oversized frames */ + u64 rx_mac_internal_errs; /* # of internal MAC errors on Rx */ + + u64 rx_frames_64; /* # of Rx frames in a particular range */ + u64 rx_frames_65_127; + u64 rx_frames_128_255; + u64 rx_frames_256_511; + u64 rx_frames_512_1023; + u64 rx_frames_1024_1518; + u64 rx_frames_1519_max; + + u64 rx_cong_drops; /* # of Rx drops due to SGE congestion */ + + unsigned long tx_fifo_parity_err; + unsigned long rx_fifo_parity_err; + unsigned long tx_fifo_urun; + unsigned long rx_fifo_ovfl; + unsigned long serdes_signal_loss; + unsigned long xaui_pcs_ctc_err; + unsigned long xaui_pcs_align_change; + + unsigned long num_toggled; /* # times toggled TxEn due to stuck TX */ + unsigned long num_resets; /* # times reset due to stuck TX */ + + unsigned long link_faults; /* # detected link faults */ +}; + +struct tp_mib_stats { + u32 ipInReceive_hi; + u32 ipInReceive_lo; + u32 ipInHdrErrors_hi; + u32 ipInHdrErrors_lo; + u32 ipInAddrErrors_hi; + u32 ipInAddrErrors_lo; + u32 ipInUnknownProtos_hi; + u32 ipInUnknownProtos_lo; + u32 ipInDiscards_hi; + u32 ipInDiscards_lo; + u32 ipInDelivers_hi; + u32 ipInDelivers_lo; + u32 ipOutRequests_hi; + u32 ipOutRequests_lo; + u32 ipOutDiscards_hi; + u32 ipOutDiscards_lo; + u32 ipOutNoRoutes_hi; + u32 ipOutNoRoutes_lo; + u32 ipReasmTimeout; + u32 ipReasmReqds; + u32 ipReasmOKs; + u32 ipReasmFails; + + u32 reserved[8]; + + u32 tcpActiveOpens; + u32 tcpPassiveOpens; + u32 tcpAttemptFails; + u32 tcpEstabResets; + u32 tcpOutRsts; + u32 tcpCurrEstab; + u32 tcpInSegs_hi; + u32 tcpInSegs_lo; + u32 tcpOutSegs_hi; + u32 tcpOutSegs_lo; + u32 tcpRetransSeg_hi; + u32 tcpRetransSeg_lo; + u32 tcpInErrs_hi; + u32 tcpInErrs_lo; + u32 tcpRtoMin; + u32 tcpRtoMax; +}; + +struct tp_params { + unsigned int nchan; /* # of channels */ + unsigned int pmrx_size; /* total PMRX capacity */ + unsigned int pmtx_size; /* total PMTX capacity */ + unsigned int cm_size; /* total CM capacity */ + unsigned int chan_rx_size; /* per channel Rx size */ + unsigned int chan_tx_size; /* per channel Tx size */ + unsigned int rx_pg_size; /* Rx page size */ + unsigned int tx_pg_size; /* Tx page size */ + unsigned int rx_num_pgs; /* # of Rx pages */ + unsigned int tx_num_pgs; /* # of Tx pages */ + unsigned int ntimer_qs; /* # of timer queues */ +}; + +struct qset_params { /* SGE queue set parameters */ + unsigned int polling; /* polling/interrupt service for rspq */ + unsigned int coalesce_usecs; /* irq coalescing timer */ + unsigned int rspq_size; /* # of entries in response queue */ + unsigned int fl_size; /* # of entries in regular free list */ + unsigned int jumbo_size; /* # of entries in jumbo free list */ + unsigned int txq_size[SGE_TXQ_PER_SET]; /* Tx queue sizes */ + unsigned int cong_thres; /* FL congestion threshold */ + unsigned int vector; /* Interrupt (line or vector) number */ +}; + +struct sge_params { + unsigned int max_pkt_size; /* max offload pkt size */ + struct qset_params qset[SGE_QSETS]; +}; + +struct mc5_params { + unsigned int mode; /* selects MC5 width */ + unsigned int nservers; /* size of server region */ + unsigned int nfilters; /* size of filter region */ + unsigned int nroutes; /* size of routing region */ +}; + +/* Default MC5 region sizes */ +enum { + DEFAULT_NSERVERS = 512, + DEFAULT_NFILTERS = 128 +}; + +/* MC5 modes, these must be non-0 */ +enum { + MC5_MODE_144_BIT = 1, + MC5_MODE_72_BIT = 2 +}; + +/* MC5 min active region size */ +enum { MC5_MIN_TIDS = 16 }; + +struct vpd_params { + unsigned int cclk; + unsigned int mclk; + unsigned int uclk; + unsigned int mdc; + unsigned int mem_timing; + u8 sn[SERNUM_LEN + 1]; + u8 eth_base[6]; + u8 port_type[MAX_NPORTS]; + unsigned short xauicfg[2]; +}; + +struct pci_params { + unsigned int vpd_cap_addr; + unsigned short speed; + unsigned char width; + unsigned char variant; +}; + +enum { + PCI_VARIANT_PCI, + PCI_VARIANT_PCIX_MODE1_PARITY, + PCI_VARIANT_PCIX_MODE1_ECC, + PCI_VARIANT_PCIX_266_MODE2, + PCI_VARIANT_PCIE +}; + +struct adapter_params { + struct sge_params sge; + struct mc5_params mc5; + struct tp_params tp; + struct vpd_params vpd; + struct pci_params pci; + + const struct adapter_info *info; + + unsigned short mtus[NMTUS]; + unsigned short a_wnd[NCCTRL_WIN]; + unsigned short b_wnd[NCCTRL_WIN]; + + unsigned int nports; /* # of ethernet ports */ + unsigned int chan_map; /* bitmap of in-use Tx channels */ + unsigned int stats_update_period; /* MAC stats accumulation period */ + unsigned int linkpoll_period; /* link poll period in 0.1s */ + unsigned int rev; /* chip revision */ + unsigned int offload; +}; + +enum { /* chip revisions */ + T3_REV_A = 0, + T3_REV_B = 2, + T3_REV_B2 = 3, + T3_REV_C = 4, +}; + +struct trace_params { + u32 sip; + u32 sip_mask; + u32 dip; + u32 dip_mask; + u16 sport; + u16 sport_mask; + u16 dport; + u16 dport_mask; + u32 vlan:12; + u32 vlan_mask:12; + u32 intf:4; + u32 intf_mask:4; + u8 proto; + u8 proto_mask; +}; + +struct link_config { + unsigned int supported; /* link capabilities */ + unsigned int advertising; /* advertised capabilities */ + unsigned short requested_speed; /* speed user has requested */ + unsigned short speed; /* actual link speed */ + unsigned char requested_duplex; /* duplex user has requested */ + unsigned char duplex; /* actual link duplex */ + unsigned char requested_fc; /* flow control user has requested */ + unsigned char fc; /* actual link flow control */ + unsigned char autoneg; /* autonegotiating? */ + unsigned int link_ok; /* link up? */ +}; + +#define SPEED_INVALID 0xffff +#define DUPLEX_INVALID 0xff + +struct mc5 { + struct adapter *adapter; + unsigned int tcam_size; + unsigned char part_type; + unsigned char parity_enabled; + unsigned char mode; + struct mc5_stats stats; +}; + +static inline unsigned int t3_mc5_size(const struct mc5 *p) +{ + return p->tcam_size; +} + +struct mc7 { + struct adapter *adapter; /* backpointer to adapter */ + unsigned int size; /* memory size in bytes */ + unsigned int width; /* MC7 interface width */ + unsigned int offset; /* register address offset for MC7 instance */ + const char *name; /* name of MC7 instance */ + struct mc7_stats stats; /* MC7 statistics */ +}; + +static inline unsigned int t3_mc7_size(const struct mc7 *p) +{ + return p->size; +} + +struct cmac { + struct adapter *adapter; + unsigned int offset; + unsigned int nucast; /* # of address filters for unicast MACs */ + unsigned int tx_tcnt; + unsigned int tx_xcnt; + u64 tx_mcnt; + unsigned int rx_xcnt; + unsigned int rx_ocnt; + u64 rx_mcnt; + unsigned int toggle_cnt; + unsigned int txen; + u64 rx_pause; + struct mac_stats stats; +}; + +enum { + MAC_DIRECTION_RX = 1, + MAC_DIRECTION_TX = 2, + MAC_RXFIFO_SIZE = 32768 +}; + +/* PHY loopback direction */ +enum { + PHY_LOOPBACK_TX = 1, + PHY_LOOPBACK_RX = 2 +}; + +/* PHY interrupt types */ +enum { + cphy_cause_link_change = 1, + cphy_cause_fifo_error = 2, + cphy_cause_module_change = 4, +}; + +/* PHY module types */ +enum { + phy_modtype_none, + phy_modtype_sr, + phy_modtype_lr, + phy_modtype_lrm, + phy_modtype_twinax, + phy_modtype_twinax_long, + phy_modtype_unknown +}; + +/* PHY operations */ +struct cphy_ops { + int (*reset)(struct cphy *phy, int wait); + + int (*intr_enable)(struct cphy *phy); + int (*intr_disable)(struct cphy *phy); + int (*intr_clear)(struct cphy *phy); + int (*intr_handler)(struct cphy *phy); + + int (*autoneg_enable)(struct cphy *phy); + int (*autoneg_restart)(struct cphy *phy); + + int (*advertise)(struct cphy *phy, unsigned int advertise_map); + int (*set_loopback)(struct cphy *phy, int mmd, int dir, int enable); + int (*set_speed_duplex)(struct cphy *phy, int speed, int duplex); + int (*get_link_status)(struct cphy *phy, int *link_ok, int *speed, + int *duplex, int *fc); + int (*power_down)(struct cphy *phy, int enable); + + u32 mmds; +}; +enum { + EDC_OPT_AEL2005 = 0, + EDC_OPT_AEL2005_SIZE = 1084, + EDC_TWX_AEL2005 = 1, + EDC_TWX_AEL2005_SIZE = 1464, + EDC_TWX_AEL2020 = 2, + EDC_TWX_AEL2020_SIZE = 1628, + EDC_MAX_SIZE = EDC_TWX_AEL2020_SIZE, /* Max cache size */ +}; + +/* A PHY instance */ +struct cphy { + u8 modtype; /* PHY module type */ + short priv; /* scratch pad */ + unsigned int caps; /* PHY capabilities */ + struct adapter *adapter; /* associated adapter */ + const char *desc; /* PHY description */ + unsigned long fifo_errors; /* FIFO over/under-flows */ + const struct cphy_ops *ops; /* PHY operations */ + struct mdio_if_info mdio; + u16 phy_cache[EDC_MAX_SIZE]; /* EDC cache */ +}; + +/* Convenience MDIO read/write wrappers */ +static inline int t3_mdio_read(struct cphy *phy, int mmd, int reg, + unsigned int *valp) +{ + int rc = phy->mdio.mdio_read(phy->mdio.dev, phy->mdio.prtad, mmd, reg); + *valp = (rc >= 0) ? rc : -1; + return (rc >= 0) ? 0 : rc; +} + +static inline int t3_mdio_write(struct cphy *phy, int mmd, int reg, + unsigned int val) +{ + return phy->mdio.mdio_write(phy->mdio.dev, phy->mdio.prtad, mmd, + reg, val); +} + +/* Convenience initializer */ +static inline void cphy_init(struct cphy *phy, struct adapter *adapter, + int phy_addr, struct cphy_ops *phy_ops, + const struct mdio_ops *mdio_ops, + unsigned int caps, const char *desc) +{ + phy->caps = caps; + phy->adapter = adapter; + phy->desc = desc; + phy->ops = phy_ops; + if (mdio_ops) { + phy->mdio.prtad = phy_addr; + phy->mdio.mmds = phy_ops->mmds; + phy->mdio.mode_support = mdio_ops->mode_support; + phy->mdio.mdio_read = mdio_ops->read; + phy->mdio.mdio_write = mdio_ops->write; + } +} + +/* Accumulate MAC statistics every 180 seconds. For 1G we multiply by 10. */ +#define MAC_STATS_ACCUM_SECS 180 + +#define XGM_REG(reg_addr, idx) \ + ((reg_addr) + (idx) * (XGMAC0_1_BASE_ADDR - XGMAC0_0_BASE_ADDR)) + +struct addr_val_pair { + unsigned int reg_addr; + unsigned int val; +}; + +#include "adapter.h" + +#ifndef PCI_VENDOR_ID_CHELSIO +# define PCI_VENDOR_ID_CHELSIO 0x1425 +#endif + +#define for_each_port(adapter, iter) \ + for (iter = 0; iter < (adapter)->params.nports; ++iter) + +#define adapter_info(adap) ((adap)->params.info) + +static inline int uses_xaui(const struct adapter *adap) +{ + return adapter_info(adap)->caps & SUPPORTED_AUI; +} + +static inline int is_10G(const struct adapter *adap) +{ + return adapter_info(adap)->caps & SUPPORTED_10000baseT_Full; +} + +static inline int is_offload(const struct adapter *adap) +{ + return adap->params.offload; +} + +static inline unsigned int core_ticks_per_usec(const struct adapter *adap) +{ + return adap->params.vpd.cclk / 1000; +} + +static inline unsigned int is_pcie(const struct adapter *adap) +{ + return adap->params.pci.variant == PCI_VARIANT_PCIE; +} + +void t3_set_reg_field(struct adapter *adap, unsigned int addr, u32 mask, + u32 val); +void t3_write_regs(struct adapter *adapter, const struct addr_val_pair *p, + int n, unsigned int offset); +int t3_wait_op_done_val(struct adapter *adapter, int reg, u32 mask, + int polarity, int attempts, int delay, u32 *valp); +static inline int t3_wait_op_done(struct adapter *adapter, int reg, u32 mask, + int polarity, int attempts, int delay) +{ + return t3_wait_op_done_val(adapter, reg, mask, polarity, attempts, + delay, NULL); +} +int t3_mdio_change_bits(struct cphy *phy, int mmd, int reg, unsigned int clear, + unsigned int set); +int t3_phy_reset(struct cphy *phy, int mmd, int wait); +int t3_phy_advertise(struct cphy *phy, unsigned int advert); +int t3_phy_advertise_fiber(struct cphy *phy, unsigned int advert); +int t3_set_phy_speed_duplex(struct cphy *phy, int speed, int duplex); +int t3_phy_lasi_intr_enable(struct cphy *phy); +int t3_phy_lasi_intr_disable(struct cphy *phy); +int t3_phy_lasi_intr_clear(struct cphy *phy); +int t3_phy_lasi_intr_handler(struct cphy *phy); + +void t3_intr_enable(struct adapter *adapter); +void t3_intr_disable(struct adapter *adapter); +void t3_intr_clear(struct adapter *adapter); +void t3_xgm_intr_enable(struct adapter *adapter, int idx); +void t3_xgm_intr_disable(struct adapter *adapter, int idx); +void t3_port_intr_enable(struct adapter *adapter, int idx); +void t3_port_intr_disable(struct adapter *adapter, int idx); +int t3_slow_intr_handler(struct adapter *adapter); +int t3_phy_intr_handler(struct adapter *adapter); + +void t3_link_changed(struct adapter *adapter, int port_id); +void t3_link_fault(struct adapter *adapter, int port_id); +int t3_link_start(struct cphy *phy, struct cmac *mac, struct link_config *lc); +const struct adapter_info *t3_get_adapter_info(unsigned int board_id); +int t3_seeprom_read(struct adapter *adapter, u32 addr, __le32 *data); +int t3_seeprom_write(struct adapter *adapter, u32 addr, __le32 data); +int t3_seeprom_wp(struct adapter *adapter, int enable); +int t3_get_tp_version(struct adapter *adapter, u32 *vers); +int t3_check_tpsram_version(struct adapter *adapter); +int t3_check_tpsram(struct adapter *adapter, const u8 *tp_ram, + unsigned int size); +int t3_set_proto_sram(struct adapter *adap, const u8 *data); +int t3_load_fw(struct adapter *adapter, const u8 * fw_data, unsigned int size); +int t3_get_fw_version(struct adapter *adapter, u32 *vers); +int t3_check_fw_version(struct adapter *adapter); +int t3_init_hw(struct adapter *adapter, u32 fw_params); +int t3_reset_adapter(struct adapter *adapter); +int t3_prep_adapter(struct adapter *adapter, const struct adapter_info *ai, + int reset); +int t3_replay_prep_adapter(struct adapter *adapter); +void t3_led_ready(struct adapter *adapter); +void t3_fatal_err(struct adapter *adapter); +void t3_set_vlan_accel(struct adapter *adapter, unsigned int ports, int on); +void t3_config_rss(struct adapter *adapter, unsigned int rss_config, + const u8 * cpus, const u16 *rspq); +int t3_cim_ctl_blk_read(struct adapter *adap, unsigned int addr, + unsigned int n, unsigned int *valp); +int t3_mc7_bd_read(struct mc7 *mc7, unsigned int start, unsigned int n, + u64 *buf); + +int t3_mac_reset(struct cmac *mac); +void t3b_pcs_reset(struct cmac *mac); +void t3_mac_disable_exact_filters(struct cmac *mac); +void t3_mac_enable_exact_filters(struct cmac *mac); +int t3_mac_enable(struct cmac *mac, int which); +int t3_mac_disable(struct cmac *mac, int which); +int t3_mac_set_mtu(struct cmac *mac, unsigned int mtu); +int t3_mac_set_rx_mode(struct cmac *mac, struct net_device *dev); +int t3_mac_set_address(struct cmac *mac, unsigned int idx, u8 addr[6]); +int t3_mac_set_num_ucast(struct cmac *mac, int n); +const struct mac_stats *t3_mac_update_stats(struct cmac *mac); +int t3_mac_set_speed_duplex_fc(struct cmac *mac, int speed, int duplex, int fc); +int t3b2_mac_watchdog_task(struct cmac *mac); + +void t3_mc5_prep(struct adapter *adapter, struct mc5 *mc5, int mode); +int t3_mc5_init(struct mc5 *mc5, unsigned int nservers, unsigned int nfilters, + unsigned int nroutes); +void t3_mc5_intr_handler(struct mc5 *mc5); + +void t3_tp_set_offload_mode(struct adapter *adap, int enable); +void t3_tp_get_mib_stats(struct adapter *adap, struct tp_mib_stats *tps); +void t3_load_mtus(struct adapter *adap, unsigned short mtus[NMTUS], + unsigned short alpha[NCCTRL_WIN], + unsigned short beta[NCCTRL_WIN], unsigned short mtu_cap); +void t3_config_trace_filter(struct adapter *adapter, + const struct trace_params *tp, int filter_index, + int invert, int enable); +int t3_config_sched(struct adapter *adap, unsigned int kbps, int sched); + +void t3_sge_prep(struct adapter *adap, struct sge_params *p); +void t3_sge_init(struct adapter *adap, struct sge_params *p); +int t3_sge_init_ecntxt(struct adapter *adapter, unsigned int id, int gts_enable, + enum sge_context_type type, int respq, u64 base_addr, + unsigned int size, unsigned int token, int gen, + unsigned int cidx); +int t3_sge_init_flcntxt(struct adapter *adapter, unsigned int id, + int gts_enable, u64 base_addr, unsigned int size, + unsigned int esize, unsigned int cong_thres, int gen, + unsigned int cidx); +int t3_sge_init_rspcntxt(struct adapter *adapter, unsigned int id, + int irq_vec_idx, u64 base_addr, unsigned int size, + unsigned int fl_thres, int gen, unsigned int cidx); +int t3_sge_init_cqcntxt(struct adapter *adapter, unsigned int id, u64 base_addr, + unsigned int size, int rspq, int ovfl_mode, + unsigned int credits, unsigned int credit_thres); +int t3_sge_enable_ecntxt(struct adapter *adapter, unsigned int id, int enable); +int t3_sge_disable_fl(struct adapter *adapter, unsigned int id); +int t3_sge_disable_rspcntxt(struct adapter *adapter, unsigned int id); +int t3_sge_disable_cqcntxt(struct adapter *adapter, unsigned int id); +int t3_sge_cqcntxt_op(struct adapter *adapter, unsigned int id, unsigned int op, + unsigned int credits); + +int t3_vsc8211_phy_prep(struct cphy *phy, struct adapter *adapter, + int phy_addr, const struct mdio_ops *mdio_ops); +int t3_ael1002_phy_prep(struct cphy *phy, struct adapter *adapter, + int phy_addr, const struct mdio_ops *mdio_ops); +int t3_ael1006_phy_prep(struct cphy *phy, struct adapter *adapter, + int phy_addr, const struct mdio_ops *mdio_ops); +int t3_ael2005_phy_prep(struct cphy *phy, struct adapter *adapter, + int phy_addr, const struct mdio_ops *mdio_ops); +int t3_ael2020_phy_prep(struct cphy *phy, struct adapter *adapter, + int phy_addr, const struct mdio_ops *mdio_ops); +int t3_qt2045_phy_prep(struct cphy *phy, struct adapter *adapter, int phy_addr, + const struct mdio_ops *mdio_ops); +int t3_xaui_direct_phy_prep(struct cphy *phy, struct adapter *adapter, + int phy_addr, const struct mdio_ops *mdio_ops); +int t3_aq100x_phy_prep(struct cphy *phy, struct adapter *adapter, + int phy_addr, const struct mdio_ops *mdio_ops); +#endif /* __CHELSIO_COMMON_H */ diff --git a/drivers/net/ethernet/chelsio/cxgb3/cxgb3_ctl_defs.h b/drivers/net/ethernet/chelsio/cxgb3/cxgb3_ctl_defs.h new file mode 100644 index 0000000..369fe71 --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb3/cxgb3_ctl_defs.h @@ -0,0 +1,189 @@ +/* + * Copyright (c) 2003-2008 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef _CXGB3_OFFLOAD_CTL_DEFS_H +#define _CXGB3_OFFLOAD_CTL_DEFS_H + +enum { + GET_MAX_OUTSTANDING_WR = 0, + GET_TX_MAX_CHUNK = 1, + GET_TID_RANGE = 2, + GET_STID_RANGE = 3, + GET_RTBL_RANGE = 4, + GET_L2T_CAPACITY = 5, + GET_MTUS = 6, + GET_WR_LEN = 7, + GET_IFF_FROM_MAC = 8, + GET_DDP_PARAMS = 9, + GET_PORTS = 10, + + ULP_ISCSI_GET_PARAMS = 11, + ULP_ISCSI_SET_PARAMS = 12, + + RDMA_GET_PARAMS = 13, + RDMA_CQ_OP = 14, + RDMA_CQ_SETUP = 15, + RDMA_CQ_DISABLE = 16, + RDMA_CTRL_QP_SETUP = 17, + RDMA_GET_MEM = 18, + RDMA_GET_MIB = 19, + + GET_RX_PAGE_INFO = 50, + GET_ISCSI_IPV4ADDR = 51, + + GET_EMBEDDED_INFO = 70, +}; + +/* + * Structure used to describe a TID range. Valid TIDs are [base, base+num). + */ +struct tid_range { + unsigned int base; /* first TID */ + unsigned int num; /* number of TIDs in range */ +}; + +/* + * Structure used to request the size and contents of the MTU table. + */ +struct mtutab { + unsigned int size; /* # of entries in the MTU table */ + const unsigned short *mtus; /* the MTU table values */ +}; + +struct net_device; + +/* + * Structure used to request the adapter net_device owning a given MAC address. + */ +struct iff_mac { + struct net_device *dev; /* the net_device */ + const unsigned char *mac_addr; /* MAC address to lookup */ + u16 vlan_tag; +}; + +/* Structure used to request a port's iSCSI IPv4 address */ +struct iscsi_ipv4addr { + struct net_device *dev; /* the net_device */ + __be32 ipv4addr; /* the return iSCSI IPv4 address */ +}; + +struct pci_dev; + +/* + * Structure used to request the TCP DDP parameters. + */ +struct ddp_params { + unsigned int llimit; /* TDDP region start address */ + unsigned int ulimit; /* TDDP region end address */ + unsigned int tag_mask; /* TDDP tag mask */ + struct pci_dev *pdev; +}; + +struct adap_ports { + unsigned int nports; /* number of ports on this adapter */ + struct net_device *lldevs[2]; +}; + +/* + * Structure used to return information to the iscsi layer. + */ +struct ulp_iscsi_info { + unsigned int offset; + unsigned int llimit; + unsigned int ulimit; + unsigned int tagmask; + u8 pgsz_factor[4]; + unsigned int max_rxsz; + unsigned int max_txsz; + struct pci_dev *pdev; +}; + +/* + * Structure used to return information to the RDMA layer. + */ +struct rdma_info { + unsigned int tpt_base; /* TPT base address */ + unsigned int tpt_top; /* TPT last entry address */ + unsigned int pbl_base; /* PBL base address */ + unsigned int pbl_top; /* PBL last entry address */ + unsigned int rqt_base; /* RQT base address */ + unsigned int rqt_top; /* RQT last entry address */ + unsigned int udbell_len; /* user doorbell region length */ + unsigned long udbell_physbase; /* user doorbell physical start addr */ + void __iomem *kdb_addr; /* kernel doorbell register address */ + struct pci_dev *pdev; /* associated PCI device */ +}; + +/* + * Structure used to request an operation on an RDMA completion queue. + */ +struct rdma_cq_op { + unsigned int id; + unsigned int op; + unsigned int credits; +}; + +/* + * Structure used to setup RDMA completion queues. + */ +struct rdma_cq_setup { + unsigned int id; + unsigned long long base_addr; + unsigned int size; + unsigned int credits; + unsigned int credit_thres; + unsigned int ovfl_mode; +}; + +/* + * Structure used to setup the RDMA control egress context. + */ +struct rdma_ctrlqp_setup { + unsigned long long base_addr; + unsigned int size; +}; + +/* + * Offload TX/RX page information. + */ +struct ofld_page_info { + unsigned int page_size; /* Page size, should be a power of 2 */ + unsigned int num; /* Number of pages */ +}; + +/* + * Structure used to get firmware and protocol engine versions. + */ +struct ch_embedded_info { + u32 fw_vers; + u32 tp_vers; +}; +#endif /* _CXGB3_OFFLOAD_CTL_DEFS_H */ diff --git a/drivers/net/ethernet/chelsio/cxgb3/cxgb3_defs.h b/drivers/net/ethernet/chelsio/cxgb3/cxgb3_defs.h new file mode 100644 index 0000000..920d918 --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb3/cxgb3_defs.h @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2006-2008 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef _CHELSIO_DEFS_H +#define _CHELSIO_DEFS_H + +#include +#include + +#include "t3cdev.h" + +#include "cxgb3_offload.h" + +#define VALIDATE_TID 1 + +void *cxgb_alloc_mem(unsigned long size); +void cxgb_free_mem(void *addr); + +/* + * Map an ATID or STID to their entries in the corresponding TID tables. + */ +static inline union active_open_entry *atid2entry(const struct tid_info *t, + unsigned int atid) +{ + return &t->atid_tab[atid - t->atid_base]; +} + +static inline union listen_entry *stid2entry(const struct tid_info *t, + unsigned int stid) +{ + return &t->stid_tab[stid - t->stid_base]; +} + +/* + * Find the connection corresponding to a TID. + */ +static inline struct t3c_tid_entry *lookup_tid(const struct tid_info *t, + unsigned int tid) +{ + struct t3c_tid_entry *t3c_tid = tid < t->ntids ? + &(t->tid_tab[tid]) : NULL; + + return (t3c_tid && t3c_tid->client) ? t3c_tid : NULL; +} + +/* + * Find the connection corresponding to a server TID. + */ +static inline struct t3c_tid_entry *lookup_stid(const struct tid_info *t, + unsigned int tid) +{ + union listen_entry *e; + + if (tid < t->stid_base || tid >= t->stid_base + t->nstids) + return NULL; + + e = stid2entry(t, tid); + if ((void *)e->next >= (void *)t->tid_tab && + (void *)e->next < (void *)&t->atid_tab[t->natids]) + return NULL; + + return &e->t3c_tid; +} + +/* + * Find the connection corresponding to an active-open TID. + */ +static inline struct t3c_tid_entry *lookup_atid(const struct tid_info *t, + unsigned int tid) +{ + union active_open_entry *e; + + if (tid < t->atid_base || tid >= t->atid_base + t->natids) + return NULL; + + e = atid2entry(t, tid); + if ((void *)e->next >= (void *)t->tid_tab && + (void *)e->next < (void *)&t->atid_tab[t->natids]) + return NULL; + + return &e->t3c_tid; +} + +int attach_t3cdev(struct t3cdev *dev); +void detach_t3cdev(struct t3cdev *dev); +#endif diff --git a/drivers/net/ethernet/chelsio/cxgb3/cxgb3_ioctl.h b/drivers/net/ethernet/chelsio/cxgb3/cxgb3_ioctl.h new file mode 100644 index 0000000..b19e437 --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb3/cxgb3_ioctl.h @@ -0,0 +1,177 @@ +/* + * Copyright (c) 2003-2008 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __CHIOCTL_H__ +#define __CHIOCTL_H__ + +/* + * Ioctl commands specific to this driver. + */ +enum { + CHELSIO_GETMTUTAB = 1029, + CHELSIO_SETMTUTAB = 1030, + CHELSIO_SET_PM = 1032, + CHELSIO_GET_PM = 1033, + CHELSIO_GET_MEM = 1038, + CHELSIO_LOAD_FW = 1041, + CHELSIO_SET_TRACE_FILTER = 1044, + CHELSIO_SET_QSET_PARAMS = 1045, + CHELSIO_GET_QSET_PARAMS = 1046, + CHELSIO_SET_QSET_NUM = 1047, + CHELSIO_GET_QSET_NUM = 1048, +}; + +struct ch_reg { + uint32_t cmd; + uint32_t addr; + uint32_t val; +}; + +struct ch_cntxt { + uint32_t cmd; + uint32_t cntxt_type; + uint32_t cntxt_id; + uint32_t data[4]; +}; + +/* context types */ +enum { CNTXT_TYPE_EGRESS, CNTXT_TYPE_FL, CNTXT_TYPE_RSP, CNTXT_TYPE_CQ }; + +struct ch_desc { + uint32_t cmd; + uint32_t queue_num; + uint32_t idx; + uint32_t size; + uint8_t data[128]; +}; + +struct ch_mem_range { + uint32_t cmd; + uint32_t mem_id; + uint32_t addr; + uint32_t len; + uint32_t version; + uint8_t buf[0]; +}; + +struct ch_qset_params { + uint32_t cmd; + uint32_t qset_idx; + int32_t txq_size[3]; + int32_t rspq_size; + int32_t fl_size[2]; + int32_t intr_lat; + int32_t polling; + int32_t lro; + int32_t cong_thres; + int32_t vector; + int32_t qnum; +}; + +struct ch_pktsched_params { + uint32_t cmd; + uint8_t sched; + uint8_t idx; + uint8_t min; + uint8_t max; + uint8_t binding; +}; + +#ifndef TCB_SIZE +# define TCB_SIZE 128 +#endif + +/* TCB size in 32-bit words */ +#define TCB_WORDS (TCB_SIZE / 4) + +enum { MEM_CM, MEM_PMRX, MEM_PMTX }; /* ch_mem_range.mem_id values */ + +struct ch_mtus { + uint32_t cmd; + uint32_t nmtus; + uint16_t mtus[NMTUS]; +}; + +struct ch_pm { + uint32_t cmd; + uint32_t tx_pg_sz; + uint32_t tx_num_pg; + uint32_t rx_pg_sz; + uint32_t rx_num_pg; + uint32_t pm_total; +}; + +struct ch_tcam { + uint32_t cmd; + uint32_t tcam_size; + uint32_t nservers; + uint32_t nroutes; + uint32_t nfilters; +}; + +struct ch_tcb { + uint32_t cmd; + uint32_t tcb_index; + uint32_t tcb_data[TCB_WORDS]; +}; + +struct ch_tcam_word { + uint32_t cmd; + uint32_t addr; + uint32_t buf[3]; +}; + +struct ch_trace { + uint32_t cmd; + uint32_t sip; + uint32_t sip_mask; + uint32_t dip; + uint32_t dip_mask; + uint16_t sport; + uint16_t sport_mask; + uint16_t dport; + uint16_t dport_mask; + uint32_t vlan:12; + uint32_t vlan_mask:12; + uint32_t intf:4; + uint32_t intf_mask:4; + uint8_t proto; + uint8_t proto_mask; + uint8_t invert_match:1; + uint8_t config_tx:1; + uint8_t config_rx:1; + uint8_t trace_tx:1; + uint8_t trace_rx:1; +}; + +#define SIOCCHIOCTL SIOCDEVPRIVATE + +#endif diff --git a/drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c b/drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c new file mode 100644 index 0000000..db76f70 --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c @@ -0,0 +1,3438 @@ +/* + * Copyright (c) 2003-2008 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "common.h" +#include "cxgb3_ioctl.h" +#include "regs.h" +#include "cxgb3_offload.h" +#include "version.h" + +#include "cxgb3_ctl_defs.h" +#include "t3_cpl.h" +#include "firmware_exports.h" + +enum { + MAX_TXQ_ENTRIES = 16384, + MAX_CTRL_TXQ_ENTRIES = 1024, + MAX_RSPQ_ENTRIES = 16384, + MAX_RX_BUFFERS = 16384, + MAX_RX_JUMBO_BUFFERS = 16384, + MIN_TXQ_ENTRIES = 4, + MIN_CTRL_TXQ_ENTRIES = 4, + MIN_RSPQ_ENTRIES = 32, + MIN_FL_ENTRIES = 32 +}; + +#define PORT_MASK ((1 << MAX_NPORTS) - 1) + +#define DFLT_MSG_ENABLE (NETIF_MSG_DRV | NETIF_MSG_PROBE | NETIF_MSG_LINK | \ + NETIF_MSG_TIMER | NETIF_MSG_IFDOWN | NETIF_MSG_IFUP |\ + NETIF_MSG_RX_ERR | NETIF_MSG_TX_ERR) + +#define EEPROM_MAGIC 0x38E2F10C + +#define CH_DEVICE(devid, idx) \ + { PCI_VENDOR_ID_CHELSIO, devid, PCI_ANY_ID, PCI_ANY_ID, 0, 0, idx } + +static const struct pci_device_id cxgb3_pci_tbl[] = { + CH_DEVICE(0x20, 0), /* PE9000 */ + CH_DEVICE(0x21, 1), /* T302E */ + CH_DEVICE(0x22, 2), /* T310E */ + CH_DEVICE(0x23, 3), /* T320X */ + CH_DEVICE(0x24, 1), /* T302X */ + CH_DEVICE(0x25, 3), /* T320E */ + CH_DEVICE(0x26, 2), /* T310X */ + CH_DEVICE(0x30, 2), /* T3B10 */ + CH_DEVICE(0x31, 3), /* T3B20 */ + CH_DEVICE(0x32, 1), /* T3B02 */ + CH_DEVICE(0x35, 6), /* T3C20-derived T3C10 */ + CH_DEVICE(0x36, 3), /* S320E-CR */ + CH_DEVICE(0x37, 7), /* N320E-G2 */ + {0,} +}; + +MODULE_DESCRIPTION(DRV_DESC); +MODULE_AUTHOR("Chelsio Communications"); +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_VERSION(DRV_VERSION); +MODULE_DEVICE_TABLE(pci, cxgb3_pci_tbl); + +static int dflt_msg_enable = DFLT_MSG_ENABLE; + +module_param(dflt_msg_enable, int, 0644); +MODULE_PARM_DESC(dflt_msg_enable, "Chelsio T3 default message enable bitmap"); + +/* + * The driver uses the best interrupt scheme available on a platform in the + * order MSI-X, MSI, legacy pin interrupts. This parameter determines which + * of these schemes the driver may consider as follows: + * + * msi = 2: choose from among all three options + * msi = 1: only consider MSI and pin interrupts + * msi = 0: force pin interrupts + */ +static int msi = 2; + +module_param(msi, int, 0644); +MODULE_PARM_DESC(msi, "whether to use MSI or MSI-X"); + +/* + * The driver enables offload as a default. + * To disable it, use ofld_disable = 1. + */ + +static int ofld_disable = 0; + +module_param(ofld_disable, int, 0644); +MODULE_PARM_DESC(ofld_disable, "whether to enable offload at init time or not"); + +/* + * We have work elements that we need to cancel when an interface is taken + * down. Normally the work elements would be executed by keventd but that + * can deadlock because of linkwatch. If our close method takes the rtnl + * lock and linkwatch is ahead of our work elements in keventd, linkwatch + * will block keventd as it needs the rtnl lock, and we'll deadlock waiting + * for our work to complete. Get our own work queue to solve this. + */ +struct workqueue_struct *cxgb3_wq; + +/** + * link_report - show link status and link speed/duplex + * @p: the port whose settings are to be reported + * + * Shows the link status, speed, and duplex of a port. + */ +static void link_report(struct net_device *dev) +{ + if (!netif_carrier_ok(dev)) + netdev_info(dev, "link down\n"); + else { + const char *s = "10Mbps"; + const struct port_info *p = netdev_priv(dev); + + switch (p->link_config.speed) { + case SPEED_10000: + s = "10Gbps"; + break; + case SPEED_1000: + s = "1000Mbps"; + break; + case SPEED_100: + s = "100Mbps"; + break; + } + + netdev_info(dev, "link up, %s, %s-duplex\n", + s, p->link_config.duplex == DUPLEX_FULL + ? "full" : "half"); + } +} + +static void enable_tx_fifo_drain(struct adapter *adapter, + struct port_info *pi) +{ + t3_set_reg_field(adapter, A_XGM_TXFIFO_CFG + pi->mac.offset, 0, + F_ENDROPPKT); + t3_write_reg(adapter, A_XGM_RX_CTRL + pi->mac.offset, 0); + t3_write_reg(adapter, A_XGM_TX_CTRL + pi->mac.offset, F_TXEN); + t3_write_reg(adapter, A_XGM_RX_CTRL + pi->mac.offset, F_RXEN); +} + +static void disable_tx_fifo_drain(struct adapter *adapter, + struct port_info *pi) +{ + t3_set_reg_field(adapter, A_XGM_TXFIFO_CFG + pi->mac.offset, + F_ENDROPPKT, 0); +} + +void t3_os_link_fault(struct adapter *adap, int port_id, int state) +{ + struct net_device *dev = adap->port[port_id]; + struct port_info *pi = netdev_priv(dev); + + if (state == netif_carrier_ok(dev)) + return; + + if (state) { + struct cmac *mac = &pi->mac; + + netif_carrier_on(dev); + + disable_tx_fifo_drain(adap, pi); + + /* Clear local faults */ + t3_xgm_intr_disable(adap, pi->port_id); + t3_read_reg(adap, A_XGM_INT_STATUS + + pi->mac.offset); + t3_write_reg(adap, + A_XGM_INT_CAUSE + pi->mac.offset, + F_XGM_INT); + + t3_set_reg_field(adap, + A_XGM_INT_ENABLE + + pi->mac.offset, + F_XGM_INT, F_XGM_INT); + t3_xgm_intr_enable(adap, pi->port_id); + + t3_mac_enable(mac, MAC_DIRECTION_TX); + } else { + netif_carrier_off(dev); + + /* Flush TX FIFO */ + enable_tx_fifo_drain(adap, pi); + } + link_report(dev); +} + +/** + * t3_os_link_changed - handle link status changes + * @adapter: the adapter associated with the link change + * @port_id: the port index whose limk status has changed + * @link_stat: the new status of the link + * @speed: the new speed setting + * @duplex: the new duplex setting + * @pause: the new flow-control setting + * + * This is the OS-dependent handler for link status changes. The OS + * neutral handler takes care of most of the processing for these events, + * then calls this handler for any OS-specific processing. + */ +void t3_os_link_changed(struct adapter *adapter, int port_id, int link_stat, + int speed, int duplex, int pause) +{ + struct net_device *dev = adapter->port[port_id]; + struct port_info *pi = netdev_priv(dev); + struct cmac *mac = &pi->mac; + + /* Skip changes from disabled ports. */ + if (!netif_running(dev)) + return; + + if (link_stat != netif_carrier_ok(dev)) { + if (link_stat) { + disable_tx_fifo_drain(adapter, pi); + + t3_mac_enable(mac, MAC_DIRECTION_RX); + + /* Clear local faults */ + t3_xgm_intr_disable(adapter, pi->port_id); + t3_read_reg(adapter, A_XGM_INT_STATUS + + pi->mac.offset); + t3_write_reg(adapter, + A_XGM_INT_CAUSE + pi->mac.offset, + F_XGM_INT); + + t3_set_reg_field(adapter, + A_XGM_INT_ENABLE + pi->mac.offset, + F_XGM_INT, F_XGM_INT); + t3_xgm_intr_enable(adapter, pi->port_id); + + netif_carrier_on(dev); + } else { + netif_carrier_off(dev); + + t3_xgm_intr_disable(adapter, pi->port_id); + t3_read_reg(adapter, A_XGM_INT_STATUS + pi->mac.offset); + t3_set_reg_field(adapter, + A_XGM_INT_ENABLE + pi->mac.offset, + F_XGM_INT, 0); + + if (is_10G(adapter)) + pi->phy.ops->power_down(&pi->phy, 1); + + t3_read_reg(adapter, A_XGM_INT_STATUS + pi->mac.offset); + t3_mac_disable(mac, MAC_DIRECTION_RX); + t3_link_start(&pi->phy, mac, &pi->link_config); + + /* Flush TX FIFO */ + enable_tx_fifo_drain(adapter, pi); + } + + link_report(dev); + } +} + +/** + * t3_os_phymod_changed - handle PHY module changes + * @phy: the PHY reporting the module change + * @mod_type: new module type + * + * This is the OS-dependent handler for PHY module changes. It is + * invoked when a PHY module is removed or inserted for any OS-specific + * processing. + */ +void t3_os_phymod_changed(struct adapter *adap, int port_id) +{ + static const char *mod_str[] = { + NULL, "SR", "LR", "LRM", "TWINAX", "TWINAX", "unknown" + }; + + const struct net_device *dev = adap->port[port_id]; + const struct port_info *pi = netdev_priv(dev); + + if (pi->phy.modtype == phy_modtype_none) + netdev_info(dev, "PHY module unplugged\n"); + else + netdev_info(dev, "%s PHY module inserted\n", + mod_str[pi->phy.modtype]); +} + +static void cxgb_set_rxmode(struct net_device *dev) +{ + struct port_info *pi = netdev_priv(dev); + + t3_mac_set_rx_mode(&pi->mac, dev); +} + +/** + * link_start - enable a port + * @dev: the device to enable + * + * Performs the MAC and PHY actions needed to enable a port. + */ +static void link_start(struct net_device *dev) +{ + struct port_info *pi = netdev_priv(dev); + struct cmac *mac = &pi->mac; + + t3_mac_reset(mac); + t3_mac_set_num_ucast(mac, MAX_MAC_IDX); + t3_mac_set_mtu(mac, dev->mtu); + t3_mac_set_address(mac, LAN_MAC_IDX, dev->dev_addr); + t3_mac_set_address(mac, SAN_MAC_IDX, pi->iscsic.mac_addr); + t3_mac_set_rx_mode(mac, dev); + t3_link_start(&pi->phy, mac, &pi->link_config); + t3_mac_enable(mac, MAC_DIRECTION_RX | MAC_DIRECTION_TX); +} + +static inline void cxgb_disable_msi(struct adapter *adapter) +{ + if (adapter->flags & USING_MSIX) { + pci_disable_msix(adapter->pdev); + adapter->flags &= ~USING_MSIX; + } else if (adapter->flags & USING_MSI) { + pci_disable_msi(adapter->pdev); + adapter->flags &= ~USING_MSI; + } +} + +/* + * Interrupt handler for asynchronous events used with MSI-X. + */ +static irqreturn_t t3_async_intr_handler(int irq, void *cookie) +{ + t3_slow_intr_handler(cookie); + return IRQ_HANDLED; +} + +/* + * Name the MSI-X interrupts. + */ +static void name_msix_vecs(struct adapter *adap) +{ + int i, j, msi_idx = 1, n = sizeof(adap->msix_info[0].desc) - 1; + + snprintf(adap->msix_info[0].desc, n, "%s", adap->name); + adap->msix_info[0].desc[n] = 0; + + for_each_port(adap, j) { + struct net_device *d = adap->port[j]; + const struct port_info *pi = netdev_priv(d); + + for (i = 0; i < pi->nqsets; i++, msi_idx++) { + snprintf(adap->msix_info[msi_idx].desc, n, + "%s-%d", d->name, pi->first_qset + i); + adap->msix_info[msi_idx].desc[n] = 0; + } + } +} + +static int request_msix_data_irqs(struct adapter *adap) +{ + int i, j, err, qidx = 0; + + for_each_port(adap, i) { + int nqsets = adap2pinfo(adap, i)->nqsets; + + for (j = 0; j < nqsets; ++j) { + err = request_irq(adap->msix_info[qidx + 1].vec, + t3_intr_handler(adap, + adap->sge.qs[qidx]. + rspq.polling), 0, + adap->msix_info[qidx + 1].desc, + &adap->sge.qs[qidx]); + if (err) { + while (--qidx >= 0) + free_irq(adap->msix_info[qidx + 1].vec, + &adap->sge.qs[qidx]); + return err; + } + qidx++; + } + } + return 0; +} + +static void free_irq_resources(struct adapter *adapter) +{ + if (adapter->flags & USING_MSIX) { + int i, n = 0; + + free_irq(adapter->msix_info[0].vec, adapter); + for_each_port(adapter, i) + n += adap2pinfo(adapter, i)->nqsets; + + for (i = 0; i < n; ++i) + free_irq(adapter->msix_info[i + 1].vec, + &adapter->sge.qs[i]); + } else + free_irq(adapter->pdev->irq, adapter); +} + +static int await_mgmt_replies(struct adapter *adap, unsigned long init_cnt, + unsigned long n) +{ + int attempts = 10; + + while (adap->sge.qs[0].rspq.offload_pkts < init_cnt + n) { + if (!--attempts) + return -ETIMEDOUT; + msleep(10); + } + return 0; +} + +static int init_tp_parity(struct adapter *adap) +{ + int i; + struct sk_buff *skb; + struct cpl_set_tcb_field *greq; + unsigned long cnt = adap->sge.qs[0].rspq.offload_pkts; + + t3_tp_set_offload_mode(adap, 1); + + for (i = 0; i < 16; i++) { + struct cpl_smt_write_req *req; + + skb = alloc_skb(sizeof(*req), GFP_KERNEL); + if (!skb) + skb = adap->nofail_skb; + if (!skb) + goto alloc_skb_fail; + + req = (struct cpl_smt_write_req *)__skb_put(skb, sizeof(*req)); + memset(req, 0, sizeof(*req)); + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SMT_WRITE_REQ, i)); + req->mtu_idx = NMTUS - 1; + req->iff = i; + t3_mgmt_tx(adap, skb); + if (skb == adap->nofail_skb) { + await_mgmt_replies(adap, cnt, i + 1); + adap->nofail_skb = alloc_skb(sizeof(*greq), GFP_KERNEL); + if (!adap->nofail_skb) + goto alloc_skb_fail; + } + } + + for (i = 0; i < 2048; i++) { + struct cpl_l2t_write_req *req; + + skb = alloc_skb(sizeof(*req), GFP_KERNEL); + if (!skb) + skb = adap->nofail_skb; + if (!skb) + goto alloc_skb_fail; + + req = (struct cpl_l2t_write_req *)__skb_put(skb, sizeof(*req)); + memset(req, 0, sizeof(*req)); + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_L2T_WRITE_REQ, i)); + req->params = htonl(V_L2T_W_IDX(i)); + t3_mgmt_tx(adap, skb); + if (skb == adap->nofail_skb) { + await_mgmt_replies(adap, cnt, 16 + i + 1); + adap->nofail_skb = alloc_skb(sizeof(*greq), GFP_KERNEL); + if (!adap->nofail_skb) + goto alloc_skb_fail; + } + } + + for (i = 0; i < 2048; i++) { + struct cpl_rte_write_req *req; + + skb = alloc_skb(sizeof(*req), GFP_KERNEL); + if (!skb) + skb = adap->nofail_skb; + if (!skb) + goto alloc_skb_fail; + + req = (struct cpl_rte_write_req *)__skb_put(skb, sizeof(*req)); + memset(req, 0, sizeof(*req)); + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RTE_WRITE_REQ, i)); + req->l2t_idx = htonl(V_L2T_W_IDX(i)); + t3_mgmt_tx(adap, skb); + if (skb == adap->nofail_skb) { + await_mgmt_replies(adap, cnt, 16 + 2048 + i + 1); + adap->nofail_skb = alloc_skb(sizeof(*greq), GFP_KERNEL); + if (!adap->nofail_skb) + goto alloc_skb_fail; + } + } + + skb = alloc_skb(sizeof(*greq), GFP_KERNEL); + if (!skb) + skb = adap->nofail_skb; + if (!skb) + goto alloc_skb_fail; + + greq = (struct cpl_set_tcb_field *)__skb_put(skb, sizeof(*greq)); + memset(greq, 0, sizeof(*greq)); + greq->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + OPCODE_TID(greq) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, 0)); + greq->mask = cpu_to_be64(1); + t3_mgmt_tx(adap, skb); + + i = await_mgmt_replies(adap, cnt, 16 + 2048 + 2048 + 1); + if (skb == adap->nofail_skb) { + i = await_mgmt_replies(adap, cnt, 16 + 2048 + 2048 + 1); + adap->nofail_skb = alloc_skb(sizeof(*greq), GFP_KERNEL); + } + + t3_tp_set_offload_mode(adap, 0); + return i; + +alloc_skb_fail: + t3_tp_set_offload_mode(adap, 0); + return -ENOMEM; +} + +/** + * setup_rss - configure RSS + * @adap: the adapter + * + * Sets up RSS to distribute packets to multiple receive queues. We + * configure the RSS CPU lookup table to distribute to the number of HW + * receive queues, and the response queue lookup table to narrow that + * down to the response queues actually configured for each port. + * We always configure the RSS mapping for two ports since the mapping + * table has plenty of entries. + */ +static void setup_rss(struct adapter *adap) +{ + int i; + unsigned int nq0 = adap2pinfo(adap, 0)->nqsets; + unsigned int nq1 = adap->port[1] ? adap2pinfo(adap, 1)->nqsets : 1; + u8 cpus[SGE_QSETS + 1]; + u16 rspq_map[RSS_TABLE_SIZE]; + + for (i = 0; i < SGE_QSETS; ++i) + cpus[i] = i; + cpus[SGE_QSETS] = 0xff; /* terminator */ + + for (i = 0; i < RSS_TABLE_SIZE / 2; ++i) { + rspq_map[i] = i % nq0; + rspq_map[i + RSS_TABLE_SIZE / 2] = (i % nq1) + nq0; + } + + t3_config_rss(adap, F_RQFEEDBACKENABLE | F_TNLLKPEN | F_TNLMAPEN | + F_TNLPRTEN | F_TNL2TUPEN | F_TNL4TUPEN | + V_RRCPLCPUSIZE(6) | F_HASHTOEPLITZ, cpus, rspq_map); +} + +static void ring_dbs(struct adapter *adap) +{ + int i, j; + + for (i = 0; i < SGE_QSETS; i++) { + struct sge_qset *qs = &adap->sge.qs[i]; + + if (qs->adap) + for (j = 0; j < SGE_TXQ_PER_SET; j++) + t3_write_reg(adap, A_SG_KDOORBELL, F_SELEGRCNTX | V_EGRCNTX(qs->txq[j].cntxt_id)); + } +} + +static void init_napi(struct adapter *adap) +{ + int i; + + for (i = 0; i < SGE_QSETS; i++) { + struct sge_qset *qs = &adap->sge.qs[i]; + + if (qs->adap) + netif_napi_add(qs->netdev, &qs->napi, qs->napi.poll, + 64); + } + + /* + * netif_napi_add() can be called only once per napi_struct because it + * adds each new napi_struct to a list. Be careful not to call it a + * second time, e.g., during EEH recovery, by making a note of it. + */ + adap->flags |= NAPI_INIT; +} + +/* + * Wait until all NAPI handlers are descheduled. This includes the handlers of + * both netdevices representing interfaces and the dummy ones for the extra + * queues. + */ +static void quiesce_rx(struct adapter *adap) +{ + int i; + + for (i = 0; i < SGE_QSETS; i++) + if (adap->sge.qs[i].adap) + napi_disable(&adap->sge.qs[i].napi); +} + +static void enable_all_napi(struct adapter *adap) +{ + int i; + for (i = 0; i < SGE_QSETS; i++) + if (adap->sge.qs[i].adap) + napi_enable(&adap->sge.qs[i].napi); +} + +/** + * setup_sge_qsets - configure SGE Tx/Rx/response queues + * @adap: the adapter + * + * Determines how many sets of SGE queues to use and initializes them. + * We support multiple queue sets per port if we have MSI-X, otherwise + * just one queue set per port. + */ +static int setup_sge_qsets(struct adapter *adap) +{ + int i, j, err, irq_idx = 0, qset_idx = 0; + unsigned int ntxq = SGE_TXQ_PER_SET; + + if (adap->params.rev > 0 && !(adap->flags & USING_MSI)) + irq_idx = -1; + + for_each_port(adap, i) { + struct net_device *dev = adap->port[i]; + struct port_info *pi = netdev_priv(dev); + + pi->qs = &adap->sge.qs[pi->first_qset]; + for (j = 0; j < pi->nqsets; ++j, ++qset_idx) { + err = t3_sge_alloc_qset(adap, qset_idx, 1, + (adap->flags & USING_MSIX) ? qset_idx + 1 : + irq_idx, + &adap->params.sge.qset[qset_idx], ntxq, dev, + netdev_get_tx_queue(dev, j)); + if (err) { + t3_free_sge_resources(adap); + return err; + } + } + } + + return 0; +} + +static ssize_t attr_show(struct device *d, char *buf, + ssize_t(*format) (struct net_device *, char *)) +{ + ssize_t len; + + /* Synchronize with ioctls that may shut down the device */ + rtnl_lock(); + len = (*format) (to_net_dev(d), buf); + rtnl_unlock(); + return len; +} + +static ssize_t attr_store(struct device *d, + const char *buf, size_t len, + ssize_t(*set) (struct net_device *, unsigned int), + unsigned int min_val, unsigned int max_val) +{ + char *endp; + ssize_t ret; + unsigned int val; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + val = simple_strtoul(buf, &endp, 0); + if (endp == buf || val < min_val || val > max_val) + return -EINVAL; + + rtnl_lock(); + ret = (*set) (to_net_dev(d), val); + if (!ret) + ret = len; + rtnl_unlock(); + return ret; +} + +#define CXGB3_SHOW(name, val_expr) \ +static ssize_t format_##name(struct net_device *dev, char *buf) \ +{ \ + struct port_info *pi = netdev_priv(dev); \ + struct adapter *adap = pi->adapter; \ + return sprintf(buf, "%u\n", val_expr); \ +} \ +static ssize_t show_##name(struct device *d, struct device_attribute *attr, \ + char *buf) \ +{ \ + return attr_show(d, buf, format_##name); \ +} + +static ssize_t set_nfilters(struct net_device *dev, unsigned int val) +{ + struct port_info *pi = netdev_priv(dev); + struct adapter *adap = pi->adapter; + int min_tids = is_offload(adap) ? MC5_MIN_TIDS : 0; + + if (adap->flags & FULL_INIT_DONE) + return -EBUSY; + if (val && adap->params.rev == 0) + return -EINVAL; + if (val > t3_mc5_size(&adap->mc5) - adap->params.mc5.nservers - + min_tids) + return -EINVAL; + adap->params.mc5.nfilters = val; + return 0; +} + +static ssize_t store_nfilters(struct device *d, struct device_attribute *attr, + const char *buf, size_t len) +{ + return attr_store(d, buf, len, set_nfilters, 0, ~0); +} + +static ssize_t set_nservers(struct net_device *dev, unsigned int val) +{ + struct port_info *pi = netdev_priv(dev); + struct adapter *adap = pi->adapter; + + if (adap->flags & FULL_INIT_DONE) + return -EBUSY; + if (val > t3_mc5_size(&adap->mc5) - adap->params.mc5.nfilters - + MC5_MIN_TIDS) + return -EINVAL; + adap->params.mc5.nservers = val; + return 0; +} + +static ssize_t store_nservers(struct device *d, struct device_attribute *attr, + const char *buf, size_t len) +{ + return attr_store(d, buf, len, set_nservers, 0, ~0); +} + +#define CXGB3_ATTR_R(name, val_expr) \ +CXGB3_SHOW(name, val_expr) \ +static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL) + +#define CXGB3_ATTR_RW(name, val_expr, store_method) \ +CXGB3_SHOW(name, val_expr) \ +static DEVICE_ATTR(name, S_IRUGO | S_IWUSR, show_##name, store_method) + +CXGB3_ATTR_R(cam_size, t3_mc5_size(&adap->mc5)); +CXGB3_ATTR_RW(nfilters, adap->params.mc5.nfilters, store_nfilters); +CXGB3_ATTR_RW(nservers, adap->params.mc5.nservers, store_nservers); + +static struct attribute *cxgb3_attrs[] = { + &dev_attr_cam_size.attr, + &dev_attr_nfilters.attr, + &dev_attr_nservers.attr, + NULL +}; + +static struct attribute_group cxgb3_attr_group = {.attrs = cxgb3_attrs }; + +static ssize_t tm_attr_show(struct device *d, + char *buf, int sched) +{ + struct port_info *pi = netdev_priv(to_net_dev(d)); + struct adapter *adap = pi->adapter; + unsigned int v, addr, bpt, cpt; + ssize_t len; + + addr = A_TP_TX_MOD_Q1_Q0_RATE_LIMIT - sched / 2; + rtnl_lock(); + t3_write_reg(adap, A_TP_TM_PIO_ADDR, addr); + v = t3_read_reg(adap, A_TP_TM_PIO_DATA); + if (sched & 1) + v >>= 16; + bpt = (v >> 8) & 0xff; + cpt = v & 0xff; + if (!cpt) + len = sprintf(buf, "disabled\n"); + else { + v = (adap->params.vpd.cclk * 1000) / cpt; + len = sprintf(buf, "%u Kbps\n", (v * bpt) / 125); + } + rtnl_unlock(); + return len; +} + +static ssize_t tm_attr_store(struct device *d, + const char *buf, size_t len, int sched) +{ + struct port_info *pi = netdev_priv(to_net_dev(d)); + struct adapter *adap = pi->adapter; + unsigned int val; + char *endp; + ssize_t ret; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + val = simple_strtoul(buf, &endp, 0); + if (endp == buf || val > 10000000) + return -EINVAL; + + rtnl_lock(); + ret = t3_config_sched(adap, val, sched); + if (!ret) + ret = len; + rtnl_unlock(); + return ret; +} + +#define TM_ATTR(name, sched) \ +static ssize_t show_##name(struct device *d, struct device_attribute *attr, \ + char *buf) \ +{ \ + return tm_attr_show(d, buf, sched); \ +} \ +static ssize_t store_##name(struct device *d, struct device_attribute *attr, \ + const char *buf, size_t len) \ +{ \ + return tm_attr_store(d, buf, len, sched); \ +} \ +static DEVICE_ATTR(name, S_IRUGO | S_IWUSR, show_##name, store_##name) + +TM_ATTR(sched0, 0); +TM_ATTR(sched1, 1); +TM_ATTR(sched2, 2); +TM_ATTR(sched3, 3); +TM_ATTR(sched4, 4); +TM_ATTR(sched5, 5); +TM_ATTR(sched6, 6); +TM_ATTR(sched7, 7); + +static struct attribute *offload_attrs[] = { + &dev_attr_sched0.attr, + &dev_attr_sched1.attr, + &dev_attr_sched2.attr, + &dev_attr_sched3.attr, + &dev_attr_sched4.attr, + &dev_attr_sched5.attr, + &dev_attr_sched6.attr, + &dev_attr_sched7.attr, + NULL +}; + +static struct attribute_group offload_attr_group = {.attrs = offload_attrs }; + +/* + * Sends an sk_buff to an offload queue driver + * after dealing with any active network taps. + */ +static inline int offload_tx(struct t3cdev *tdev, struct sk_buff *skb) +{ + int ret; + + local_bh_disable(); + ret = t3_offload_tx(tdev, skb); + local_bh_enable(); + return ret; +} + +static int write_smt_entry(struct adapter *adapter, int idx) +{ + struct cpl_smt_write_req *req; + struct port_info *pi = netdev_priv(adapter->port[idx]); + struct sk_buff *skb = alloc_skb(sizeof(*req), GFP_KERNEL); + + if (!skb) + return -ENOMEM; + + req = (struct cpl_smt_write_req *)__skb_put(skb, sizeof(*req)); + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SMT_WRITE_REQ, idx)); + req->mtu_idx = NMTUS - 1; /* should be 0 but there's a T3 bug */ + req->iff = idx; + memcpy(req->src_mac0, adapter->port[idx]->dev_addr, ETH_ALEN); + memcpy(req->src_mac1, pi->iscsic.mac_addr, ETH_ALEN); + skb->priority = 1; + offload_tx(&adapter->tdev, skb); + return 0; +} + +static int init_smt(struct adapter *adapter) +{ + int i; + + for_each_port(adapter, i) + write_smt_entry(adapter, i); + return 0; +} + +static void init_port_mtus(struct adapter *adapter) +{ + unsigned int mtus = adapter->port[0]->mtu; + + if (adapter->port[1]) + mtus |= adapter->port[1]->mtu << 16; + t3_write_reg(adapter, A_TP_MTU_PORT_TABLE, mtus); +} + +static int send_pktsched_cmd(struct adapter *adap, int sched, int qidx, int lo, + int hi, int port) +{ + struct sk_buff *skb; + struct mngt_pktsched_wr *req; + int ret; + + skb = alloc_skb(sizeof(*req), GFP_KERNEL); + if (!skb) + skb = adap->nofail_skb; + if (!skb) + return -ENOMEM; + + req = (struct mngt_pktsched_wr *)skb_put(skb, sizeof(*req)); + req->wr_hi = htonl(V_WR_OP(FW_WROPCODE_MNGT)); + req->mngt_opcode = FW_MNGTOPCODE_PKTSCHED_SET; + req->sched = sched; + req->idx = qidx; + req->min = lo; + req->max = hi; + req->binding = port; + ret = t3_mgmt_tx(adap, skb); + if (skb == adap->nofail_skb) { + adap->nofail_skb = alloc_skb(sizeof(struct cpl_set_tcb_field), + GFP_KERNEL); + if (!adap->nofail_skb) + ret = -ENOMEM; + } + + return ret; +} + +static int bind_qsets(struct adapter *adap) +{ + int i, j, err = 0; + + for_each_port(adap, i) { + const struct port_info *pi = adap2pinfo(adap, i); + + for (j = 0; j < pi->nqsets; ++j) { + int ret = send_pktsched_cmd(adap, 1, + pi->first_qset + j, -1, + -1, i); + if (ret) + err = ret; + } + } + + return err; +} + +#define FW_VERSION __stringify(FW_VERSION_MAJOR) "." \ + __stringify(FW_VERSION_MINOR) "." __stringify(FW_VERSION_MICRO) +#define FW_FNAME "cxgb3/t3fw-" FW_VERSION ".bin" +#define TPSRAM_VERSION __stringify(TP_VERSION_MAJOR) "." \ + __stringify(TP_VERSION_MINOR) "." __stringify(TP_VERSION_MICRO) +#define TPSRAM_NAME "cxgb3/t3%c_psram-" TPSRAM_VERSION ".bin" +#define AEL2005_OPT_EDC_NAME "cxgb3/ael2005_opt_edc.bin" +#define AEL2005_TWX_EDC_NAME "cxgb3/ael2005_twx_edc.bin" +#define AEL2020_TWX_EDC_NAME "cxgb3/ael2020_twx_edc.bin" +MODULE_FIRMWARE(FW_FNAME); +MODULE_FIRMWARE("cxgb3/t3b_psram-" TPSRAM_VERSION ".bin"); +MODULE_FIRMWARE("cxgb3/t3c_psram-" TPSRAM_VERSION ".bin"); +MODULE_FIRMWARE(AEL2005_OPT_EDC_NAME); +MODULE_FIRMWARE(AEL2005_TWX_EDC_NAME); +MODULE_FIRMWARE(AEL2020_TWX_EDC_NAME); + +static inline const char *get_edc_fw_name(int edc_idx) +{ + const char *fw_name = NULL; + + switch (edc_idx) { + case EDC_OPT_AEL2005: + fw_name = AEL2005_OPT_EDC_NAME; + break; + case EDC_TWX_AEL2005: + fw_name = AEL2005_TWX_EDC_NAME; + break; + case EDC_TWX_AEL2020: + fw_name = AEL2020_TWX_EDC_NAME; + break; + } + return fw_name; +} + +int t3_get_edc_fw(struct cphy *phy, int edc_idx, int size) +{ + struct adapter *adapter = phy->adapter; + const struct firmware *fw; + char buf[64]; + u32 csum; + const __be32 *p; + u16 *cache = phy->phy_cache; + int i, ret; + + snprintf(buf, sizeof(buf), get_edc_fw_name(edc_idx)); + + ret = request_firmware(&fw, buf, &adapter->pdev->dev); + if (ret < 0) { + dev_err(&adapter->pdev->dev, + "could not upgrade firmware: unable to load %s\n", + buf); + return ret; + } + + /* check size, take checksum in account */ + if (fw->size > size + 4) { + CH_ERR(adapter, "firmware image too large %u, expected %d\n", + (unsigned int)fw->size, size + 4); + ret = -EINVAL; + } + + /* compute checksum */ + p = (const __be32 *)fw->data; + for (csum = 0, i = 0; i < fw->size / sizeof(csum); i++) + csum += ntohl(p[i]); + + if (csum != 0xffffffff) { + CH_ERR(adapter, "corrupted firmware image, checksum %u\n", + csum); + ret = -EINVAL; + } + + for (i = 0; i < size / 4 ; i++) { + *cache++ = (be32_to_cpu(p[i]) & 0xffff0000) >> 16; + *cache++ = be32_to_cpu(p[i]) & 0xffff; + } + + release_firmware(fw); + + return ret; +} + +static int upgrade_fw(struct adapter *adap) +{ + int ret; + const struct firmware *fw; + struct device *dev = &adap->pdev->dev; + + ret = request_firmware(&fw, FW_FNAME, dev); + if (ret < 0) { + dev_err(dev, "could not upgrade firmware: unable to load %s\n", + FW_FNAME); + return ret; + } + ret = t3_load_fw(adap, fw->data, fw->size); + release_firmware(fw); + + if (ret == 0) + dev_info(dev, "successful upgrade to firmware %d.%d.%d\n", + FW_VERSION_MAJOR, FW_VERSION_MINOR, FW_VERSION_MICRO); + else + dev_err(dev, "failed to upgrade to firmware %d.%d.%d\n", + FW_VERSION_MAJOR, FW_VERSION_MINOR, FW_VERSION_MICRO); + + return ret; +} + +static inline char t3rev2char(struct adapter *adapter) +{ + char rev = 0; + + switch(adapter->params.rev) { + case T3_REV_B: + case T3_REV_B2: + rev = 'b'; + break; + case T3_REV_C: + rev = 'c'; + break; + } + return rev; +} + +static int update_tpsram(struct adapter *adap) +{ + const struct firmware *tpsram; + char buf[64]; + struct device *dev = &adap->pdev->dev; + int ret; + char rev; + + rev = t3rev2char(adap); + if (!rev) + return 0; + + snprintf(buf, sizeof(buf), TPSRAM_NAME, rev); + + ret = request_firmware(&tpsram, buf, dev); + if (ret < 0) { + dev_err(dev, "could not load TP SRAM: unable to load %s\n", + buf); + return ret; + } + + ret = t3_check_tpsram(adap, tpsram->data, tpsram->size); + if (ret) + goto release_tpsram; + + ret = t3_set_proto_sram(adap, tpsram->data); + if (ret == 0) + dev_info(dev, + "successful update of protocol engine " + "to %d.%d.%d\n", + TP_VERSION_MAJOR, TP_VERSION_MINOR, TP_VERSION_MICRO); + else + dev_err(dev, "failed to update of protocol engine %d.%d.%d\n", + TP_VERSION_MAJOR, TP_VERSION_MINOR, TP_VERSION_MICRO); + if (ret) + dev_err(dev, "loading protocol SRAM failed\n"); + +release_tpsram: + release_firmware(tpsram); + + return ret; +} + +/** + * t3_synchronize_rx - wait for current Rx processing on a port to complete + * @adap: the adapter + * @p: the port + * + * Ensures that current Rx processing on any of the queues associated with + * the given port completes before returning. We do this by acquiring and + * releasing the locks of the response queues associated with the port. + */ +static void t3_synchronize_rx(struct adapter *adap, const struct port_info *p) +{ + int i; + + for (i = p->first_qset; i < p->first_qset + p->nqsets; i++) { + struct sge_rspq *q = &adap->sge.qs[i].rspq; + + spin_lock_irq(&q->lock); + spin_unlock_irq(&q->lock); + } +} + +static void cxgb_vlan_mode(struct net_device *dev, netdev_features_t features) +{ + struct port_info *pi = netdev_priv(dev); + struct adapter *adapter = pi->adapter; + + if (adapter->params.rev > 0) { + t3_set_vlan_accel(adapter, 1 << pi->port_id, + features & NETIF_F_HW_VLAN_CTAG_RX); + } else { + /* single control for all ports */ + unsigned int i, have_vlans = features & NETIF_F_HW_VLAN_CTAG_RX; + + for_each_port(adapter, i) + have_vlans |= + adapter->port[i]->features & + NETIF_F_HW_VLAN_CTAG_RX; + + t3_set_vlan_accel(adapter, 1, have_vlans); + } + t3_synchronize_rx(adapter, pi); +} + +/** + * cxgb_up - enable the adapter + * @adapter: adapter being enabled + * + * Called when the first port is enabled, this function performs the + * actions necessary to make an adapter operational, such as completing + * the initialization of HW modules, and enabling interrupts. + * + * Must be called with the rtnl lock held. + */ +static int cxgb_up(struct adapter *adap) +{ + int i, err; + + if (!(adap->flags & FULL_INIT_DONE)) { + err = t3_check_fw_version(adap); + if (err == -EINVAL) { + err = upgrade_fw(adap); + CH_WARN(adap, "FW upgrade to %d.%d.%d %s\n", + FW_VERSION_MAJOR, FW_VERSION_MINOR, + FW_VERSION_MICRO, err ? "failed" : "succeeded"); + } + + err = t3_check_tpsram_version(adap); + if (err == -EINVAL) { + err = update_tpsram(adap); + CH_WARN(adap, "TP upgrade to %d.%d.%d %s\n", + TP_VERSION_MAJOR, TP_VERSION_MINOR, + TP_VERSION_MICRO, err ? "failed" : "succeeded"); + } + + /* + * Clear interrupts now to catch errors if t3_init_hw fails. + * We clear them again later as initialization may trigger + * conditions that can interrupt. + */ + t3_intr_clear(adap); + + err = t3_init_hw(adap, 0); + if (err) + goto out; + + t3_set_reg_field(adap, A_TP_PARA_REG5, 0, F_RXDDPOFFINIT); + t3_write_reg(adap, A_ULPRX_TDDP_PSZ, V_HPZ0(PAGE_SHIFT - 12)); + + err = setup_sge_qsets(adap); + if (err) + goto out; + + for_each_port(adap, i) + cxgb_vlan_mode(adap->port[i], adap->port[i]->features); + + setup_rss(adap); + if (!(adap->flags & NAPI_INIT)) + init_napi(adap); + + t3_start_sge_timers(adap); + adap->flags |= FULL_INIT_DONE; + } + + t3_intr_clear(adap); + + if (adap->flags & USING_MSIX) { + name_msix_vecs(adap); + err = request_irq(adap->msix_info[0].vec, + t3_async_intr_handler, 0, + adap->msix_info[0].desc, adap); + if (err) + goto irq_err; + + err = request_msix_data_irqs(adap); + if (err) { + free_irq(adap->msix_info[0].vec, adap); + goto irq_err; + } + } else if ((err = request_irq(adap->pdev->irq, + t3_intr_handler(adap, + adap->sge.qs[0].rspq. + polling), + (adap->flags & USING_MSI) ? + 0 : IRQF_SHARED, + adap->name, adap))) + goto irq_err; + + enable_all_napi(adap); + t3_sge_start(adap); + t3_intr_enable(adap); + + if (adap->params.rev >= T3_REV_C && !(adap->flags & TP_PARITY_INIT) && + is_offload(adap) && init_tp_parity(adap) == 0) + adap->flags |= TP_PARITY_INIT; + + if (adap->flags & TP_PARITY_INIT) { + t3_write_reg(adap, A_TP_INT_CAUSE, + F_CMCACHEPERR | F_ARPLUTPERR); + t3_write_reg(adap, A_TP_INT_ENABLE, 0x7fbfffff); + } + + if (!(adap->flags & QUEUES_BOUND)) { + int ret = bind_qsets(adap); + + if (ret < 0) { + CH_ERR(adap, "failed to bind qsets, err %d\n", ret); + t3_intr_disable(adap); + free_irq_resources(adap); + err = ret; + goto out; + } + adap->flags |= QUEUES_BOUND; + } + +out: + return err; +irq_err: + CH_ERR(adap, "request_irq failed, err %d\n", err); + goto out; +} + +/* + * Release resources when all the ports and offloading have been stopped. + */ +static void cxgb_down(struct adapter *adapter, int on_wq) +{ + t3_sge_stop(adapter); + spin_lock_irq(&adapter->work_lock); /* sync with PHY intr task */ + t3_intr_disable(adapter); + spin_unlock_irq(&adapter->work_lock); + + free_irq_resources(adapter); + quiesce_rx(adapter); + t3_sge_stop(adapter); + if (!on_wq) + flush_workqueue(cxgb3_wq);/* wait for external IRQ handler */ +} + +static void schedule_chk_task(struct adapter *adap) +{ + unsigned int timeo; + + timeo = adap->params.linkpoll_period ? + (HZ * adap->params.linkpoll_period) / 10 : + adap->params.stats_update_period * HZ; + if (timeo) + queue_delayed_work(cxgb3_wq, &adap->adap_check_task, timeo); +} + +static int offload_open(struct net_device *dev) +{ + struct port_info *pi = netdev_priv(dev); + struct adapter *adapter = pi->adapter; + struct t3cdev *tdev = dev2t3cdev(dev); + int adap_up = adapter->open_device_map & PORT_MASK; + int err; + + if (test_and_set_bit(OFFLOAD_DEVMAP_BIT, &adapter->open_device_map)) + return 0; + + if (!adap_up && (err = cxgb_up(adapter)) < 0) + goto out; + + t3_tp_set_offload_mode(adapter, 1); + tdev->lldev = adapter->port[0]; + err = cxgb3_offload_activate(adapter); + if (err) + goto out; + + init_port_mtus(adapter); + t3_load_mtus(adapter, adapter->params.mtus, adapter->params.a_wnd, + adapter->params.b_wnd, + adapter->params.rev == 0 ? + adapter->port[0]->mtu : 0xffff); + init_smt(adapter); + + if (sysfs_create_group(&tdev->lldev->dev.kobj, &offload_attr_group)) + dev_dbg(&dev->dev, "cannot create sysfs group\n"); + + /* Call back all registered clients */ + cxgb3_add_clients(tdev); + +out: + /* restore them in case the offload module has changed them */ + if (err) { + t3_tp_set_offload_mode(adapter, 0); + clear_bit(OFFLOAD_DEVMAP_BIT, &adapter->open_device_map); + cxgb3_set_dummy_ops(tdev); + } + return err; +} + +static int offload_close(struct t3cdev *tdev) +{ + struct adapter *adapter = tdev2adap(tdev); + struct t3c_data *td = T3C_DATA(tdev); + + if (!test_bit(OFFLOAD_DEVMAP_BIT, &adapter->open_device_map)) + return 0; + + /* Call back all registered clients */ + cxgb3_remove_clients(tdev); + + sysfs_remove_group(&tdev->lldev->dev.kobj, &offload_attr_group); + + /* Flush work scheduled while releasing TIDs */ + flush_work(&td->tid_release_task); + + tdev->lldev = NULL; + cxgb3_set_dummy_ops(tdev); + t3_tp_set_offload_mode(adapter, 0); + clear_bit(OFFLOAD_DEVMAP_BIT, &adapter->open_device_map); + + if (!adapter->open_device_map) + cxgb_down(adapter, 0); + + cxgb3_offload_deactivate(adapter); + return 0; +} + +static int cxgb_open(struct net_device *dev) +{ + struct port_info *pi = netdev_priv(dev); + struct adapter *adapter = pi->adapter; + int other_ports = adapter->open_device_map & PORT_MASK; + int err; + + if (!adapter->open_device_map && (err = cxgb_up(adapter)) < 0) + return err; + + set_bit(pi->port_id, &adapter->open_device_map); + if (is_offload(adapter) && !ofld_disable) { + err = offload_open(dev); + if (err) + pr_warn("Could not initialize offload capabilities\n"); + } + + netif_set_real_num_tx_queues(dev, pi->nqsets); + err = netif_set_real_num_rx_queues(dev, pi->nqsets); + if (err) + return err; + link_start(dev); + t3_port_intr_enable(adapter, pi->port_id); + netif_tx_start_all_queues(dev); + if (!other_ports) + schedule_chk_task(adapter); + + cxgb3_event_notify(&adapter->tdev, OFFLOAD_PORT_UP, pi->port_id); + return 0; +} + +static int __cxgb_close(struct net_device *dev, int on_wq) +{ + struct port_info *pi = netdev_priv(dev); + struct adapter *adapter = pi->adapter; + + + if (!adapter->open_device_map) + return 0; + + /* Stop link fault interrupts */ + t3_xgm_intr_disable(adapter, pi->port_id); + t3_read_reg(adapter, A_XGM_INT_STATUS + pi->mac.offset); + + t3_port_intr_disable(adapter, pi->port_id); + netif_tx_stop_all_queues(dev); + pi->phy.ops->power_down(&pi->phy, 1); + netif_carrier_off(dev); + t3_mac_disable(&pi->mac, MAC_DIRECTION_TX | MAC_DIRECTION_RX); + + spin_lock_irq(&adapter->work_lock); /* sync with update task */ + clear_bit(pi->port_id, &adapter->open_device_map); + spin_unlock_irq(&adapter->work_lock); + + if (!(adapter->open_device_map & PORT_MASK)) + cancel_delayed_work_sync(&adapter->adap_check_task); + + if (!adapter->open_device_map) + cxgb_down(adapter, on_wq); + + cxgb3_event_notify(&adapter->tdev, OFFLOAD_PORT_DOWN, pi->port_id); + return 0; +} + +static int cxgb_close(struct net_device *dev) +{ + return __cxgb_close(dev, 0); +} + +static struct net_device_stats *cxgb_get_stats(struct net_device *dev) +{ + struct port_info *pi = netdev_priv(dev); + struct adapter *adapter = pi->adapter; + struct net_device_stats *ns = &pi->netstats; + const struct mac_stats *pstats; + + spin_lock(&adapter->stats_lock); + pstats = t3_mac_update_stats(&pi->mac); + spin_unlock(&adapter->stats_lock); + + ns->tx_bytes = pstats->tx_octets; + ns->tx_packets = pstats->tx_frames; + ns->rx_bytes = pstats->rx_octets; + ns->rx_packets = pstats->rx_frames; + ns->multicast = pstats->rx_mcast_frames; + + ns->tx_errors = pstats->tx_underrun; + ns->rx_errors = pstats->rx_symbol_errs + pstats->rx_fcs_errs + + pstats->rx_too_long + pstats->rx_jabber + pstats->rx_short + + pstats->rx_fifo_ovfl; + + /* detailed rx_errors */ + ns->rx_length_errors = pstats->rx_jabber + pstats->rx_too_long; + ns->rx_over_errors = 0; + ns->rx_crc_errors = pstats->rx_fcs_errs; + ns->rx_frame_errors = pstats->rx_symbol_errs; + ns->rx_fifo_errors = pstats->rx_fifo_ovfl; + ns->rx_missed_errors = pstats->rx_cong_drops; + + /* detailed tx_errors */ + ns->tx_aborted_errors = 0; + ns->tx_carrier_errors = 0; + ns->tx_fifo_errors = pstats->tx_underrun; + ns->tx_heartbeat_errors = 0; + ns->tx_window_errors = 0; + return ns; +} + +static u32 get_msglevel(struct net_device *dev) +{ + struct port_info *pi = netdev_priv(dev); + struct adapter *adapter = pi->adapter; + + return adapter->msg_enable; +} + +static void set_msglevel(struct net_device *dev, u32 val) +{ + struct port_info *pi = netdev_priv(dev); + struct adapter *adapter = pi->adapter; + + adapter->msg_enable = val; +} + +static char stats_strings[][ETH_GSTRING_LEN] = { + "TxOctetsOK ", + "TxFramesOK ", + "TxMulticastFramesOK", + "TxBroadcastFramesOK", + "TxPauseFrames ", + "TxUnderrun ", + "TxExtUnderrun ", + + "TxFrames64 ", + "TxFrames65To127 ", + "TxFrames128To255 ", + "TxFrames256To511 ", + "TxFrames512To1023 ", + "TxFrames1024To1518 ", + "TxFrames1519ToMax ", + + "RxOctetsOK ", + "RxFramesOK ", + "RxMulticastFramesOK", + "RxBroadcastFramesOK", + "RxPauseFrames ", + "RxFCSErrors ", + "RxSymbolErrors ", + "RxShortErrors ", + "RxJabberErrors ", + "RxLengthErrors ", + "RxFIFOoverflow ", + + "RxFrames64 ", + "RxFrames65To127 ", + "RxFrames128To255 ", + "RxFrames256To511 ", + "RxFrames512To1023 ", + "RxFrames1024To1518 ", + "RxFrames1519ToMax ", + + "PhyFIFOErrors ", + "TSO ", + "VLANextractions ", + "VLANinsertions ", + "TxCsumOffload ", + "RxCsumGood ", + "LroAggregated ", + "LroFlushed ", + "LroNoDesc ", + "RxDrops ", + + "CheckTXEnToggled ", + "CheckResets ", + + "LinkFaults ", +}; + +static int get_sset_count(struct net_device *dev, int sset) +{ + switch (sset) { + case ETH_SS_STATS: + return ARRAY_SIZE(stats_strings); + default: + return -EOPNOTSUPP; + } +} + +#define T3_REGMAP_SIZE (3 * 1024) + +static int get_regs_len(struct net_device *dev) +{ + return T3_REGMAP_SIZE; +} + +static int get_eeprom_len(struct net_device *dev) +{ + return EEPROMSIZE; +} + +static void get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info) +{ + struct port_info *pi = netdev_priv(dev); + struct adapter *adapter = pi->adapter; + u32 fw_vers = 0; + u32 tp_vers = 0; + + spin_lock(&adapter->stats_lock); + t3_get_fw_version(adapter, &fw_vers); + t3_get_tp_version(adapter, &tp_vers); + spin_unlock(&adapter->stats_lock); + + strlcpy(info->driver, DRV_NAME, sizeof(info->driver)); + strlcpy(info->version, DRV_VERSION, sizeof(info->version)); + strlcpy(info->bus_info, pci_name(adapter->pdev), + sizeof(info->bus_info)); + if (fw_vers) + snprintf(info->fw_version, sizeof(info->fw_version), + "%s %u.%u.%u TP %u.%u.%u", + G_FW_VERSION_TYPE(fw_vers) ? "T" : "N", + G_FW_VERSION_MAJOR(fw_vers), + G_FW_VERSION_MINOR(fw_vers), + G_FW_VERSION_MICRO(fw_vers), + G_TP_VERSION_MAJOR(tp_vers), + G_TP_VERSION_MINOR(tp_vers), + G_TP_VERSION_MICRO(tp_vers)); +} + +static void get_strings(struct net_device *dev, u32 stringset, u8 * data) +{ + if (stringset == ETH_SS_STATS) + memcpy(data, stats_strings, sizeof(stats_strings)); +} + +static unsigned long collect_sge_port_stats(struct adapter *adapter, + struct port_info *p, int idx) +{ + int i; + unsigned long tot = 0; + + for (i = p->first_qset; i < p->first_qset + p->nqsets; ++i) + tot += adapter->sge.qs[i].port_stats[idx]; + return tot; +} + +static void get_stats(struct net_device *dev, struct ethtool_stats *stats, + u64 *data) +{ + struct port_info *pi = netdev_priv(dev); + struct adapter *adapter = pi->adapter; + const struct mac_stats *s; + + spin_lock(&adapter->stats_lock); + s = t3_mac_update_stats(&pi->mac); + spin_unlock(&adapter->stats_lock); + + *data++ = s->tx_octets; + *data++ = s->tx_frames; + *data++ = s->tx_mcast_frames; + *data++ = s->tx_bcast_frames; + *data++ = s->tx_pause; + *data++ = s->tx_underrun; + *data++ = s->tx_fifo_urun; + + *data++ = s->tx_frames_64; + *data++ = s->tx_frames_65_127; + *data++ = s->tx_frames_128_255; + *data++ = s->tx_frames_256_511; + *data++ = s->tx_frames_512_1023; + *data++ = s->tx_frames_1024_1518; + *data++ = s->tx_frames_1519_max; + + *data++ = s->rx_octets; + *data++ = s->rx_frames; + *data++ = s->rx_mcast_frames; + *data++ = s->rx_bcast_frames; + *data++ = s->rx_pause; + *data++ = s->rx_fcs_errs; + *data++ = s->rx_symbol_errs; + *data++ = s->rx_short; + *data++ = s->rx_jabber; + *data++ = s->rx_too_long; + *data++ = s->rx_fifo_ovfl; + + *data++ = s->rx_frames_64; + *data++ = s->rx_frames_65_127; + *data++ = s->rx_frames_128_255; + *data++ = s->rx_frames_256_511; + *data++ = s->rx_frames_512_1023; + *data++ = s->rx_frames_1024_1518; + *data++ = s->rx_frames_1519_max; + + *data++ = pi->phy.fifo_errors; + + *data++ = collect_sge_port_stats(adapter, pi, SGE_PSTAT_TSO); + *data++ = collect_sge_port_stats(adapter, pi, SGE_PSTAT_VLANEX); + *data++ = collect_sge_port_stats(adapter, pi, SGE_PSTAT_VLANINS); + *data++ = collect_sge_port_stats(adapter, pi, SGE_PSTAT_TX_CSUM); + *data++ = collect_sge_port_stats(adapter, pi, SGE_PSTAT_RX_CSUM_GOOD); + *data++ = 0; + *data++ = 0; + *data++ = 0; + *data++ = s->rx_cong_drops; + + *data++ = s->num_toggled; + *data++ = s->num_resets; + + *data++ = s->link_faults; +} + +static inline void reg_block_dump(struct adapter *ap, void *buf, + unsigned int start, unsigned int end) +{ + u32 *p = buf + start; + + for (; start <= end; start += sizeof(u32)) + *p++ = t3_read_reg(ap, start); +} + +static void get_regs(struct net_device *dev, struct ethtool_regs *regs, + void *buf) +{ + struct port_info *pi = netdev_priv(dev); + struct adapter *ap = pi->adapter; + + /* + * Version scheme: + * bits 0..9: chip version + * bits 10..15: chip revision + * bit 31: set for PCIe cards + */ + regs->version = 3 | (ap->params.rev << 10) | (is_pcie(ap) << 31); + + /* + * We skip the MAC statistics registers because they are clear-on-read. + * Also reading multi-register stats would need to synchronize with the + * periodic mac stats accumulation. Hard to justify the complexity. + */ + memset(buf, 0, T3_REGMAP_SIZE); + reg_block_dump(ap, buf, 0, A_SG_RSPQ_CREDIT_RETURN); + reg_block_dump(ap, buf, A_SG_HI_DRB_HI_THRSH, A_ULPRX_PBL_ULIMIT); + reg_block_dump(ap, buf, A_ULPTX_CONFIG, A_MPS_INT_CAUSE); + reg_block_dump(ap, buf, A_CPL_SWITCH_CNTRL, A_CPL_MAP_TBL_DATA); + reg_block_dump(ap, buf, A_SMB_GLOBAL_TIME_CFG, A_XGM_SERDES_STAT3); + reg_block_dump(ap, buf, A_XGM_SERDES_STATUS0, + XGM_REG(A_XGM_SERDES_STAT3, 1)); + reg_block_dump(ap, buf, XGM_REG(A_XGM_SERDES_STATUS0, 1), + XGM_REG(A_XGM_RX_SPI4_SOP_EOP_CNT, 1)); +} + +static int restart_autoneg(struct net_device *dev) +{ + struct port_info *p = netdev_priv(dev); + + if (!netif_running(dev)) + return -EAGAIN; + if (p->link_config.autoneg != AUTONEG_ENABLE) + return -EINVAL; + p->phy.ops->autoneg_restart(&p->phy); + return 0; +} + +static int set_phys_id(struct net_device *dev, + enum ethtool_phys_id_state state) +{ + struct port_info *pi = netdev_priv(dev); + struct adapter *adapter = pi->adapter; + + switch (state) { + case ETHTOOL_ID_ACTIVE: + return 1; /* cycle on/off once per second */ + + case ETHTOOL_ID_OFF: + t3_set_reg_field(adapter, A_T3DBG_GPIO_EN, F_GPIO0_OUT_VAL, 0); + break; + + case ETHTOOL_ID_ON: + case ETHTOOL_ID_INACTIVE: + t3_set_reg_field(adapter, A_T3DBG_GPIO_EN, F_GPIO0_OUT_VAL, + F_GPIO0_OUT_VAL); + } + + return 0; +} + +static int get_settings(struct net_device *dev, struct ethtool_cmd *cmd) +{ + struct port_info *p = netdev_priv(dev); + + cmd->supported = p->link_config.supported; + cmd->advertising = p->link_config.advertising; + + if (netif_carrier_ok(dev)) { + ethtool_cmd_speed_set(cmd, p->link_config.speed); + cmd->duplex = p->link_config.duplex; + } else { + ethtool_cmd_speed_set(cmd, SPEED_UNKNOWN); + cmd->duplex = DUPLEX_UNKNOWN; + } + + cmd->port = (cmd->supported & SUPPORTED_TP) ? PORT_TP : PORT_FIBRE; + cmd->phy_address = p->phy.mdio.prtad; + cmd->transceiver = XCVR_EXTERNAL; + cmd->autoneg = p->link_config.autoneg; + cmd->maxtxpkt = 0; + cmd->maxrxpkt = 0; + return 0; +} + +static int speed_duplex_to_caps(int speed, int duplex) +{ + int cap = 0; + + switch (speed) { + case SPEED_10: + if (duplex == DUPLEX_FULL) + cap = SUPPORTED_10baseT_Full; + else + cap = SUPPORTED_10baseT_Half; + break; + case SPEED_100: + if (duplex == DUPLEX_FULL) + cap = SUPPORTED_100baseT_Full; + else + cap = SUPPORTED_100baseT_Half; + break; + case SPEED_1000: + if (duplex == DUPLEX_FULL) + cap = SUPPORTED_1000baseT_Full; + else + cap = SUPPORTED_1000baseT_Half; + break; + case SPEED_10000: + if (duplex == DUPLEX_FULL) + cap = SUPPORTED_10000baseT_Full; + } + return cap; +} + +#define ADVERTISED_MASK (ADVERTISED_10baseT_Half | ADVERTISED_10baseT_Full | \ + ADVERTISED_100baseT_Half | ADVERTISED_100baseT_Full | \ + ADVERTISED_1000baseT_Half | ADVERTISED_1000baseT_Full | \ + ADVERTISED_10000baseT_Full) + +static int set_settings(struct net_device *dev, struct ethtool_cmd *cmd) +{ + struct port_info *p = netdev_priv(dev); + struct link_config *lc = &p->link_config; + + if (!(lc->supported & SUPPORTED_Autoneg)) { + /* + * PHY offers a single speed/duplex. See if that's what's + * being requested. + */ + if (cmd->autoneg == AUTONEG_DISABLE) { + u32 speed = ethtool_cmd_speed(cmd); + int cap = speed_duplex_to_caps(speed, cmd->duplex); + if (lc->supported & cap) + return 0; + } + return -EINVAL; + } + + if (cmd->autoneg == AUTONEG_DISABLE) { + u32 speed = ethtool_cmd_speed(cmd); + int cap = speed_duplex_to_caps(speed, cmd->duplex); + + if (!(lc->supported & cap) || (speed == SPEED_1000)) + return -EINVAL; + lc->requested_speed = speed; + lc->requested_duplex = cmd->duplex; + lc->advertising = 0; + } else { + cmd->advertising &= ADVERTISED_MASK; + cmd->advertising &= lc->supported; + if (!cmd->advertising) + return -EINVAL; + lc->requested_speed = SPEED_INVALID; + lc->requested_duplex = DUPLEX_INVALID; + lc->advertising = cmd->advertising | ADVERTISED_Autoneg; + } + lc->autoneg = cmd->autoneg; + if (netif_running(dev)) + t3_link_start(&p->phy, &p->mac, lc); + return 0; +} + +static void get_pauseparam(struct net_device *dev, + struct ethtool_pauseparam *epause) +{ + struct port_info *p = netdev_priv(dev); + + epause->autoneg = (p->link_config.requested_fc & PAUSE_AUTONEG) != 0; + epause->rx_pause = (p->link_config.fc & PAUSE_RX) != 0; + epause->tx_pause = (p->link_config.fc & PAUSE_TX) != 0; +} + +static int set_pauseparam(struct net_device *dev, + struct ethtool_pauseparam *epause) +{ + struct port_info *p = netdev_priv(dev); + struct link_config *lc = &p->link_config; + + if (epause->autoneg == AUTONEG_DISABLE) + lc->requested_fc = 0; + else if (lc->supported & SUPPORTED_Autoneg) + lc->requested_fc = PAUSE_AUTONEG; + else + return -EINVAL; + + if (epause->rx_pause) + lc->requested_fc |= PAUSE_RX; + if (epause->tx_pause) + lc->requested_fc |= PAUSE_TX; + if (lc->autoneg == AUTONEG_ENABLE) { + if (netif_running(dev)) + t3_link_start(&p->phy, &p->mac, lc); + } else { + lc->fc = lc->requested_fc & (PAUSE_RX | PAUSE_TX); + if (netif_running(dev)) + t3_mac_set_speed_duplex_fc(&p->mac, -1, -1, lc->fc); + } + return 0; +} + +static void get_sge_param(struct net_device *dev, struct ethtool_ringparam *e) +{ + struct port_info *pi = netdev_priv(dev); + struct adapter *adapter = pi->adapter; + const struct qset_params *q = &adapter->params.sge.qset[pi->first_qset]; + + e->rx_max_pending = MAX_RX_BUFFERS; + e->rx_jumbo_max_pending = MAX_RX_JUMBO_BUFFERS; + e->tx_max_pending = MAX_TXQ_ENTRIES; + + e->rx_pending = q->fl_size; + e->rx_mini_pending = q->rspq_size; + e->rx_jumbo_pending = q->jumbo_size; + e->tx_pending = q->txq_size[0]; +} + +static int set_sge_param(struct net_device *dev, struct ethtool_ringparam *e) +{ + struct port_info *pi = netdev_priv(dev); + struct adapter *adapter = pi->adapter; + struct qset_params *q; + int i; + + if (e->rx_pending > MAX_RX_BUFFERS || + e->rx_jumbo_pending > MAX_RX_JUMBO_BUFFERS || + e->tx_pending > MAX_TXQ_ENTRIES || + e->rx_mini_pending > MAX_RSPQ_ENTRIES || + e->rx_mini_pending < MIN_RSPQ_ENTRIES || + e->rx_pending < MIN_FL_ENTRIES || + e->rx_jumbo_pending < MIN_FL_ENTRIES || + e->tx_pending < adapter->params.nports * MIN_TXQ_ENTRIES) + return -EINVAL; + + if (adapter->flags & FULL_INIT_DONE) + return -EBUSY; + + q = &adapter->params.sge.qset[pi->first_qset]; + for (i = 0; i < pi->nqsets; ++i, ++q) { + q->rspq_size = e->rx_mini_pending; + q->fl_size = e->rx_pending; + q->jumbo_size = e->rx_jumbo_pending; + q->txq_size[0] = e->tx_pending; + q->txq_size[1] = e->tx_pending; + q->txq_size[2] = e->tx_pending; + } + return 0; +} + +static int set_coalesce(struct net_device *dev, struct ethtool_coalesce *c) +{ + struct port_info *pi = netdev_priv(dev); + struct adapter *adapter = pi->adapter; + struct qset_params *qsp; + struct sge_qset *qs; + int i; + + if (c->rx_coalesce_usecs * 10 > M_NEWTIMER) + return -EINVAL; + + for (i = 0; i < pi->nqsets; i++) { + qsp = &adapter->params.sge.qset[i]; + qs = &adapter->sge.qs[i]; + qsp->coalesce_usecs = c->rx_coalesce_usecs; + t3_update_qset_coalesce(qs, qsp); + } + + return 0; +} + +static int get_coalesce(struct net_device *dev, struct ethtool_coalesce *c) +{ + struct port_info *pi = netdev_priv(dev); + struct adapter *adapter = pi->adapter; + struct qset_params *q = adapter->params.sge.qset; + + c->rx_coalesce_usecs = q->coalesce_usecs; + return 0; +} + +static int get_eeprom(struct net_device *dev, struct ethtool_eeprom *e, + u8 * data) +{ + struct port_info *pi = netdev_priv(dev); + struct adapter *adapter = pi->adapter; + int i, err = 0; + + u8 *buf = kmalloc(EEPROMSIZE, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + e->magic = EEPROM_MAGIC; + for (i = e->offset & ~3; !err && i < e->offset + e->len; i += 4) + err = t3_seeprom_read(adapter, i, (__le32 *) & buf[i]); + + if (!err) + memcpy(data, buf + e->offset, e->len); + kfree(buf); + return err; +} + +static int set_eeprom(struct net_device *dev, struct ethtool_eeprom *eeprom, + u8 * data) +{ + struct port_info *pi = netdev_priv(dev); + struct adapter *adapter = pi->adapter; + u32 aligned_offset, aligned_len; + __le32 *p; + u8 *buf; + int err; + + if (eeprom->magic != EEPROM_MAGIC) + return -EINVAL; + + aligned_offset = eeprom->offset & ~3; + aligned_len = (eeprom->len + (eeprom->offset & 3) + 3) & ~3; + + if (aligned_offset != eeprom->offset || aligned_len != eeprom->len) { + buf = kmalloc(aligned_len, GFP_KERNEL); + if (!buf) + return -ENOMEM; + err = t3_seeprom_read(adapter, aligned_offset, (__le32 *) buf); + if (!err && aligned_len > 4) + err = t3_seeprom_read(adapter, + aligned_offset + aligned_len - 4, + (__le32 *) & buf[aligned_len - 4]); + if (err) + goto out; + memcpy(buf + (eeprom->offset & 3), data, eeprom->len); + } else + buf = data; + + err = t3_seeprom_wp(adapter, 0); + if (err) + goto out; + + for (p = (__le32 *) buf; !err && aligned_len; aligned_len -= 4, p++) { + err = t3_seeprom_write(adapter, aligned_offset, *p); + aligned_offset += 4; + } + + if (!err) + err = t3_seeprom_wp(adapter, 1); +out: + if (buf != data) + kfree(buf); + return err; +} + +static void get_wol(struct net_device *dev, struct ethtool_wolinfo *wol) +{ + wol->supported = 0; + wol->wolopts = 0; + memset(&wol->sopass, 0, sizeof(wol->sopass)); +} + +static const struct ethtool_ops cxgb_ethtool_ops = { + .get_settings = get_settings, + .set_settings = set_settings, + .get_drvinfo = get_drvinfo, + .get_msglevel = get_msglevel, + .set_msglevel = set_msglevel, + .get_ringparam = get_sge_param, + .set_ringparam = set_sge_param, + .get_coalesce = get_coalesce, + .set_coalesce = set_coalesce, + .get_eeprom_len = get_eeprom_len, + .get_eeprom = get_eeprom, + .set_eeprom = set_eeprom, + .get_pauseparam = get_pauseparam, + .set_pauseparam = set_pauseparam, + .get_link = ethtool_op_get_link, + .get_strings = get_strings, + .set_phys_id = set_phys_id, + .nway_reset = restart_autoneg, + .get_sset_count = get_sset_count, + .get_ethtool_stats = get_stats, + .get_regs_len = get_regs_len, + .get_regs = get_regs, + .get_wol = get_wol, +}; + +static int in_range(int val, int lo, int hi) +{ + return val < 0 || (val <= hi && val >= lo); +} + +static int cxgb_extension_ioctl(struct net_device *dev, void __user *useraddr) +{ + struct port_info *pi = netdev_priv(dev); + struct adapter *adapter = pi->adapter; + u32 cmd; + int ret; + + if (copy_from_user(&cmd, useraddr, sizeof(cmd))) + return -EFAULT; + + switch (cmd) { + case CHELSIO_SET_QSET_PARAMS:{ + int i; + struct qset_params *q; + struct ch_qset_params t; + int q1 = pi->first_qset; + int nqsets = pi->nqsets; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + if (copy_from_user(&t, useraddr, sizeof(t))) + return -EFAULT; + if (t.qset_idx >= SGE_QSETS) + return -EINVAL; + if (!in_range(t.intr_lat, 0, M_NEWTIMER) || + !in_range(t.cong_thres, 0, 255) || + !in_range(t.txq_size[0], MIN_TXQ_ENTRIES, + MAX_TXQ_ENTRIES) || + !in_range(t.txq_size[1], MIN_TXQ_ENTRIES, + MAX_TXQ_ENTRIES) || + !in_range(t.txq_size[2], MIN_CTRL_TXQ_ENTRIES, + MAX_CTRL_TXQ_ENTRIES) || + !in_range(t.fl_size[0], MIN_FL_ENTRIES, + MAX_RX_BUFFERS) || + !in_range(t.fl_size[1], MIN_FL_ENTRIES, + MAX_RX_JUMBO_BUFFERS) || + !in_range(t.rspq_size, MIN_RSPQ_ENTRIES, + MAX_RSPQ_ENTRIES)) + return -EINVAL; + + if ((adapter->flags & FULL_INIT_DONE) && + (t.rspq_size >= 0 || t.fl_size[0] >= 0 || + t.fl_size[1] >= 0 || t.txq_size[0] >= 0 || + t.txq_size[1] >= 0 || t.txq_size[2] >= 0 || + t.polling >= 0 || t.cong_thres >= 0)) + return -EBUSY; + + /* Allow setting of any available qset when offload enabled */ + if (test_bit(OFFLOAD_DEVMAP_BIT, &adapter->open_device_map)) { + q1 = 0; + for_each_port(adapter, i) { + pi = adap2pinfo(adapter, i); + nqsets += pi->first_qset + pi->nqsets; + } + } + + if (t.qset_idx < q1) + return -EINVAL; + if (t.qset_idx > q1 + nqsets - 1) + return -EINVAL; + + q = &adapter->params.sge.qset[t.qset_idx]; + + if (t.rspq_size >= 0) + q->rspq_size = t.rspq_size; + if (t.fl_size[0] >= 0) + q->fl_size = t.fl_size[0]; + if (t.fl_size[1] >= 0) + q->jumbo_size = t.fl_size[1]; + if (t.txq_size[0] >= 0) + q->txq_size[0] = t.txq_size[0]; + if (t.txq_size[1] >= 0) + q->txq_size[1] = t.txq_size[1]; + if (t.txq_size[2] >= 0) + q->txq_size[2] = t.txq_size[2]; + if (t.cong_thres >= 0) + q->cong_thres = t.cong_thres; + if (t.intr_lat >= 0) { + struct sge_qset *qs = + &adapter->sge.qs[t.qset_idx]; + + q->coalesce_usecs = t.intr_lat; + t3_update_qset_coalesce(qs, q); + } + if (t.polling >= 0) { + if (adapter->flags & USING_MSIX) + q->polling = t.polling; + else { + /* No polling with INTx for T3A */ + if (adapter->params.rev == 0 && + !(adapter->flags & USING_MSI)) + t.polling = 0; + + for (i = 0; i < SGE_QSETS; i++) { + q = &adapter->params.sge. + qset[i]; + q->polling = t.polling; + } + } + } + + if (t.lro >= 0) { + if (t.lro) + dev->wanted_features |= NETIF_F_GRO; + else + dev->wanted_features &= ~NETIF_F_GRO; + netdev_update_features(dev); + } + + break; + } + case CHELSIO_GET_QSET_PARAMS:{ + struct qset_params *q; + struct ch_qset_params t; + int q1 = pi->first_qset; + int nqsets = pi->nqsets; + int i; + + if (copy_from_user(&t, useraddr, sizeof(t))) + return -EFAULT; + + /* Display qsets for all ports when offload enabled */ + if (test_bit(OFFLOAD_DEVMAP_BIT, &adapter->open_device_map)) { + q1 = 0; + for_each_port(adapter, i) { + pi = adap2pinfo(adapter, i); + nqsets = pi->first_qset + pi->nqsets; + } + } + + if (t.qset_idx >= nqsets) + return -EINVAL; + + q = &adapter->params.sge.qset[q1 + t.qset_idx]; + t.rspq_size = q->rspq_size; + t.txq_size[0] = q->txq_size[0]; + t.txq_size[1] = q->txq_size[1]; + t.txq_size[2] = q->txq_size[2]; + t.fl_size[0] = q->fl_size; + t.fl_size[1] = q->jumbo_size; + t.polling = q->polling; + t.lro = !!(dev->features & NETIF_F_GRO); + t.intr_lat = q->coalesce_usecs; + t.cong_thres = q->cong_thres; + t.qnum = q1; + + if (adapter->flags & USING_MSIX) + t.vector = adapter->msix_info[q1 + t.qset_idx + 1].vec; + else + t.vector = adapter->pdev->irq; + + if (copy_to_user(useraddr, &t, sizeof(t))) + return -EFAULT; + break; + } + case CHELSIO_SET_QSET_NUM:{ + struct ch_reg edata; + unsigned int i, first_qset = 0, other_qsets = 0; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + if (adapter->flags & FULL_INIT_DONE) + return -EBUSY; + if (copy_from_user(&edata, useraddr, sizeof(edata))) + return -EFAULT; + if (edata.val < 1 || + (edata.val > 1 && !(adapter->flags & USING_MSIX))) + return -EINVAL; + + for_each_port(adapter, i) + if (adapter->port[i] && adapter->port[i] != dev) + other_qsets += adap2pinfo(adapter, i)->nqsets; + + if (edata.val + other_qsets > SGE_QSETS) + return -EINVAL; + + pi->nqsets = edata.val; + + for_each_port(adapter, i) + if (adapter->port[i]) { + pi = adap2pinfo(adapter, i); + pi->first_qset = first_qset; + first_qset += pi->nqsets; + } + break; + } + case CHELSIO_GET_QSET_NUM:{ + struct ch_reg edata; + + memset(&edata, 0, sizeof(struct ch_reg)); + + edata.cmd = CHELSIO_GET_QSET_NUM; + edata.val = pi->nqsets; + if (copy_to_user(useraddr, &edata, sizeof(edata))) + return -EFAULT; + break; + } + case CHELSIO_LOAD_FW:{ + u8 *fw_data; + struct ch_mem_range t; + + if (!capable(CAP_SYS_RAWIO)) + return -EPERM; + if (copy_from_user(&t, useraddr, sizeof(t))) + return -EFAULT; + /* Check t.len sanity ? */ + fw_data = memdup_user(useraddr + sizeof(t), t.len); + if (IS_ERR(fw_data)) + return PTR_ERR(fw_data); + + ret = t3_load_fw(adapter, fw_data, t.len); + kfree(fw_data); + if (ret) + return ret; + break; + } + case CHELSIO_SETMTUTAB:{ + struct ch_mtus m; + int i; + + if (!is_offload(adapter)) + return -EOPNOTSUPP; + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + if (offload_running(adapter)) + return -EBUSY; + if (copy_from_user(&m, useraddr, sizeof(m))) + return -EFAULT; + if (m.nmtus != NMTUS) + return -EINVAL; + if (m.mtus[0] < 81) /* accommodate SACK */ + return -EINVAL; + + /* MTUs must be in ascending order */ + for (i = 1; i < NMTUS; ++i) + if (m.mtus[i] < m.mtus[i - 1]) + return -EINVAL; + + memcpy(adapter->params.mtus, m.mtus, + sizeof(adapter->params.mtus)); + break; + } + case CHELSIO_GET_PM:{ + struct tp_params *p = &adapter->params.tp; + struct ch_pm m = {.cmd = CHELSIO_GET_PM }; + + if (!is_offload(adapter)) + return -EOPNOTSUPP; + m.tx_pg_sz = p->tx_pg_size; + m.tx_num_pg = p->tx_num_pgs; + m.rx_pg_sz = p->rx_pg_size; + m.rx_num_pg = p->rx_num_pgs; + m.pm_total = p->pmtx_size + p->chan_rx_size * p->nchan; + if (copy_to_user(useraddr, &m, sizeof(m))) + return -EFAULT; + break; + } + case CHELSIO_SET_PM:{ + struct ch_pm m; + struct tp_params *p = &adapter->params.tp; + + if (!is_offload(adapter)) + return -EOPNOTSUPP; + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + if (adapter->flags & FULL_INIT_DONE) + return -EBUSY; + if (copy_from_user(&m, useraddr, sizeof(m))) + return -EFAULT; + if (!is_power_of_2(m.rx_pg_sz) || + !is_power_of_2(m.tx_pg_sz)) + return -EINVAL; /* not power of 2 */ + if (!(m.rx_pg_sz & 0x14000)) + return -EINVAL; /* not 16KB or 64KB */ + if (!(m.tx_pg_sz & 0x1554000)) + return -EINVAL; + if (m.tx_num_pg == -1) + m.tx_num_pg = p->tx_num_pgs; + if (m.rx_num_pg == -1) + m.rx_num_pg = p->rx_num_pgs; + if (m.tx_num_pg % 24 || m.rx_num_pg % 24) + return -EINVAL; + if (m.rx_num_pg * m.rx_pg_sz > p->chan_rx_size || + m.tx_num_pg * m.tx_pg_sz > p->chan_tx_size) + return -EINVAL; + p->rx_pg_size = m.rx_pg_sz; + p->tx_pg_size = m.tx_pg_sz; + p->rx_num_pgs = m.rx_num_pg; + p->tx_num_pgs = m.tx_num_pg; + break; + } + case CHELSIO_GET_MEM:{ + struct ch_mem_range t; + struct mc7 *mem; + u64 buf[32]; + + if (!is_offload(adapter)) + return -EOPNOTSUPP; + if (!(adapter->flags & FULL_INIT_DONE)) + return -EIO; /* need the memory controllers */ + if (copy_from_user(&t, useraddr, sizeof(t))) + return -EFAULT; + if ((t.addr & 7) || (t.len & 7)) + return -EINVAL; + if (t.mem_id == MEM_CM) + mem = &adapter->cm; + else if (t.mem_id == MEM_PMRX) + mem = &adapter->pmrx; + else if (t.mem_id == MEM_PMTX) + mem = &adapter->pmtx; + else + return -EINVAL; + + /* + * Version scheme: + * bits 0..9: chip version + * bits 10..15: chip revision + */ + t.version = 3 | (adapter->params.rev << 10); + if (copy_to_user(useraddr, &t, sizeof(t))) + return -EFAULT; + + /* + * Read 256 bytes at a time as len can be large and we don't + * want to use huge intermediate buffers. + */ + useraddr += sizeof(t); /* advance to start of buffer */ + while (t.len) { + unsigned int chunk = + min_t(unsigned int, t.len, sizeof(buf)); + + ret = + t3_mc7_bd_read(mem, t.addr / 8, chunk / 8, + buf); + if (ret) + return ret; + if (copy_to_user(useraddr, buf, chunk)) + return -EFAULT; + useraddr += chunk; + t.addr += chunk; + t.len -= chunk; + } + break; + } + case CHELSIO_SET_TRACE_FILTER:{ + struct ch_trace t; + const struct trace_params *tp; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + if (!offload_running(adapter)) + return -EAGAIN; + if (copy_from_user(&t, useraddr, sizeof(t))) + return -EFAULT; + + tp = (const struct trace_params *)&t.sip; + if (t.config_tx) + t3_config_trace_filter(adapter, tp, 0, + t.invert_match, + t.trace_tx); + if (t.config_rx) + t3_config_trace_filter(adapter, tp, 1, + t.invert_match, + t.trace_rx); + break; + } + default: + return -EOPNOTSUPP; + } + return 0; +} + +static int cxgb_ioctl(struct net_device *dev, struct ifreq *req, int cmd) +{ + struct mii_ioctl_data *data = if_mii(req); + struct port_info *pi = netdev_priv(dev); + struct adapter *adapter = pi->adapter; + + switch (cmd) { + case SIOCGMIIREG: + case SIOCSMIIREG: + /* Convert phy_id from older PRTAD/DEVAD format */ + if (is_10G(adapter) && + !mdio_phy_id_is_c45(data->phy_id) && + (data->phy_id & 0x1f00) && + !(data->phy_id & 0xe0e0)) + data->phy_id = mdio_phy_id_c45(data->phy_id >> 8, + data->phy_id & 0x1f); + /* FALLTHRU */ + case SIOCGMIIPHY: + return mdio_mii_ioctl(&pi->phy.mdio, data, cmd); + case SIOCCHIOCTL: + return cxgb_extension_ioctl(dev, req->ifr_data); + default: + return -EOPNOTSUPP; + } +} + +static int cxgb_change_mtu(struct net_device *dev, int new_mtu) +{ + struct port_info *pi = netdev_priv(dev); + struct adapter *adapter = pi->adapter; + int ret; + + if (new_mtu < 81) /* accommodate SACK */ + return -EINVAL; + if ((ret = t3_mac_set_mtu(&pi->mac, new_mtu))) + return ret; + dev->mtu = new_mtu; + init_port_mtus(adapter); + if (adapter->params.rev == 0 && offload_running(adapter)) + t3_load_mtus(adapter, adapter->params.mtus, + adapter->params.a_wnd, adapter->params.b_wnd, + adapter->port[0]->mtu); + return 0; +} + +static int cxgb_set_mac_addr(struct net_device *dev, void *p) +{ + struct port_info *pi = netdev_priv(dev); + struct adapter *adapter = pi->adapter; + struct sockaddr *addr = p; + + if (!is_valid_ether_addr(addr->sa_data)) + return -EADDRNOTAVAIL; + + memcpy(dev->dev_addr, addr->sa_data, dev->addr_len); + t3_mac_set_address(&pi->mac, LAN_MAC_IDX, dev->dev_addr); + if (offload_running(adapter)) + write_smt_entry(adapter, pi->port_id); + return 0; +} + +static netdev_features_t cxgb_fix_features(struct net_device *dev, + netdev_features_t features) +{ + /* + * Since there is no support for separate rx/tx vlan accel + * enable/disable make sure tx flag is always in same state as rx. + */ + if (features & NETIF_F_HW_VLAN_CTAG_RX) + features |= NETIF_F_HW_VLAN_CTAG_TX; + else + features &= ~NETIF_F_HW_VLAN_CTAG_TX; + + return features; +} + +static int cxgb_set_features(struct net_device *dev, netdev_features_t features) +{ + netdev_features_t changed = dev->features ^ features; + + if (changed & NETIF_F_HW_VLAN_CTAG_RX) + cxgb_vlan_mode(dev, features); + + return 0; +} + +#ifdef CONFIG_NET_POLL_CONTROLLER +static void cxgb_netpoll(struct net_device *dev) +{ + struct port_info *pi = netdev_priv(dev); + struct adapter *adapter = pi->adapter; + int qidx; + + for (qidx = pi->first_qset; qidx < pi->first_qset + pi->nqsets; qidx++) { + struct sge_qset *qs = &adapter->sge.qs[qidx]; + void *source; + + if (adapter->flags & USING_MSIX) + source = qs; + else + source = adapter; + + t3_intr_handler(adapter, qs->rspq.polling) (0, source); + } +} +#endif + +/* + * Periodic accumulation of MAC statistics. + */ +static void mac_stats_update(struct adapter *adapter) +{ + int i; + + for_each_port(adapter, i) { + struct net_device *dev = adapter->port[i]; + struct port_info *p = netdev_priv(dev); + + if (netif_running(dev)) { + spin_lock(&adapter->stats_lock); + t3_mac_update_stats(&p->mac); + spin_unlock(&adapter->stats_lock); + } + } +} + +static void check_link_status(struct adapter *adapter) +{ + int i; + + for_each_port(adapter, i) { + struct net_device *dev = adapter->port[i]; + struct port_info *p = netdev_priv(dev); + int link_fault; + + spin_lock_irq(&adapter->work_lock); + link_fault = p->link_fault; + spin_unlock_irq(&adapter->work_lock); + + if (link_fault) { + t3_link_fault(adapter, i); + continue; + } + + if (!(p->phy.caps & SUPPORTED_IRQ) && netif_running(dev)) { + t3_xgm_intr_disable(adapter, i); + t3_read_reg(adapter, A_XGM_INT_STATUS + p->mac.offset); + + t3_link_changed(adapter, i); + t3_xgm_intr_enable(adapter, i); + } + } +} + +static void check_t3b2_mac(struct adapter *adapter) +{ + int i; + + if (!rtnl_trylock()) /* synchronize with ifdown */ + return; + + for_each_port(adapter, i) { + struct net_device *dev = adapter->port[i]; + struct port_info *p = netdev_priv(dev); + int status; + + if (!netif_running(dev)) + continue; + + status = 0; + if (netif_running(dev) && netif_carrier_ok(dev)) + status = t3b2_mac_watchdog_task(&p->mac); + if (status == 1) + p->mac.stats.num_toggled++; + else if (status == 2) { + struct cmac *mac = &p->mac; + + t3_mac_set_mtu(mac, dev->mtu); + t3_mac_set_address(mac, LAN_MAC_IDX, dev->dev_addr); + cxgb_set_rxmode(dev); + t3_link_start(&p->phy, mac, &p->link_config); + t3_mac_enable(mac, MAC_DIRECTION_RX | MAC_DIRECTION_TX); + t3_port_intr_enable(adapter, p->port_id); + p->mac.stats.num_resets++; + } + } + rtnl_unlock(); +} + + +static void t3_adap_check_task(struct work_struct *work) +{ + struct adapter *adapter = container_of(work, struct adapter, + adap_check_task.work); + const struct adapter_params *p = &adapter->params; + int port; + unsigned int v, status, reset; + + adapter->check_task_cnt++; + + check_link_status(adapter); + + /* Accumulate MAC stats if needed */ + if (!p->linkpoll_period || + (adapter->check_task_cnt * p->linkpoll_period) / 10 >= + p->stats_update_period) { + mac_stats_update(adapter); + adapter->check_task_cnt = 0; + } + + if (p->rev == T3_REV_B2) + check_t3b2_mac(adapter); + + /* + * Scan the XGMAC's to check for various conditions which we want to + * monitor in a periodic polling manner rather than via an interrupt + * condition. This is used for conditions which would otherwise flood + * the system with interrupts and we only really need to know that the + * conditions are "happening" ... For each condition we count the + * detection of the condition and reset it for the next polling loop. + */ + for_each_port(adapter, port) { + struct cmac *mac = &adap2pinfo(adapter, port)->mac; + u32 cause; + + cause = t3_read_reg(adapter, A_XGM_INT_CAUSE + mac->offset); + reset = 0; + if (cause & F_RXFIFO_OVERFLOW) { + mac->stats.rx_fifo_ovfl++; + reset |= F_RXFIFO_OVERFLOW; + } + + t3_write_reg(adapter, A_XGM_INT_CAUSE + mac->offset, reset); + } + + /* + * We do the same as above for FL_EMPTY interrupts. + */ + status = t3_read_reg(adapter, A_SG_INT_CAUSE); + reset = 0; + + if (status & F_FLEMPTY) { + struct sge_qset *qs = &adapter->sge.qs[0]; + int i = 0; + + reset |= F_FLEMPTY; + + v = (t3_read_reg(adapter, A_SG_RSPQ_FL_STATUS) >> S_FL0EMPTY) & + 0xffff; + + while (v) { + qs->fl[i].empty += (v & 1); + if (i) + qs++; + i ^= 1; + v >>= 1; + } + } + + t3_write_reg(adapter, A_SG_INT_CAUSE, reset); + + /* Schedule the next check update if any port is active. */ + spin_lock_irq(&adapter->work_lock); + if (adapter->open_device_map & PORT_MASK) + schedule_chk_task(adapter); + spin_unlock_irq(&adapter->work_lock); +} + +static void db_full_task(struct work_struct *work) +{ + struct adapter *adapter = container_of(work, struct adapter, + db_full_task); + + cxgb3_event_notify(&adapter->tdev, OFFLOAD_DB_FULL, 0); +} + +static void db_empty_task(struct work_struct *work) +{ + struct adapter *adapter = container_of(work, struct adapter, + db_empty_task); + + cxgb3_event_notify(&adapter->tdev, OFFLOAD_DB_EMPTY, 0); +} + +static void db_drop_task(struct work_struct *work) +{ + struct adapter *adapter = container_of(work, struct adapter, + db_drop_task); + unsigned long delay = 1000; + unsigned short r; + + cxgb3_event_notify(&adapter->tdev, OFFLOAD_DB_DROP, 0); + + /* + * Sleep a while before ringing the driver qset dbs. + * The delay is between 1000-2023 usecs. + */ + get_random_bytes(&r, 2); + delay += r & 1023; + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(usecs_to_jiffies(delay)); + ring_dbs(adapter); +} + +/* + * Processes external (PHY) interrupts in process context. + */ +static void ext_intr_task(struct work_struct *work) +{ + struct adapter *adapter = container_of(work, struct adapter, + ext_intr_handler_task); + int i; + + /* Disable link fault interrupts */ + for_each_port(adapter, i) { + struct net_device *dev = adapter->port[i]; + struct port_info *p = netdev_priv(dev); + + t3_xgm_intr_disable(adapter, i); + t3_read_reg(adapter, A_XGM_INT_STATUS + p->mac.offset); + } + + /* Re-enable link fault interrupts */ + t3_phy_intr_handler(adapter); + + for_each_port(adapter, i) + t3_xgm_intr_enable(adapter, i); + + /* Now reenable external interrupts */ + spin_lock_irq(&adapter->work_lock); + if (adapter->slow_intr_mask) { + adapter->slow_intr_mask |= F_T3DBG; + t3_write_reg(adapter, A_PL_INT_CAUSE0, F_T3DBG); + t3_write_reg(adapter, A_PL_INT_ENABLE0, + adapter->slow_intr_mask); + } + spin_unlock_irq(&adapter->work_lock); +} + +/* + * Interrupt-context handler for external (PHY) interrupts. + */ +void t3_os_ext_intr_handler(struct adapter *adapter) +{ + /* + * Schedule a task to handle external interrupts as they may be slow + * and we use a mutex to protect MDIO registers. We disable PHY + * interrupts in the meantime and let the task reenable them when + * it's done. + */ + spin_lock(&adapter->work_lock); + if (adapter->slow_intr_mask) { + adapter->slow_intr_mask &= ~F_T3DBG; + t3_write_reg(adapter, A_PL_INT_ENABLE0, + adapter->slow_intr_mask); + queue_work(cxgb3_wq, &adapter->ext_intr_handler_task); + } + spin_unlock(&adapter->work_lock); +} + +void t3_os_link_fault_handler(struct adapter *adapter, int port_id) +{ + struct net_device *netdev = adapter->port[port_id]; + struct port_info *pi = netdev_priv(netdev); + + spin_lock(&adapter->work_lock); + pi->link_fault = 1; + spin_unlock(&adapter->work_lock); +} + +static int t3_adapter_error(struct adapter *adapter, int reset, int on_wq) +{ + int i, ret = 0; + + if (is_offload(adapter) && + test_bit(OFFLOAD_DEVMAP_BIT, &adapter->open_device_map)) { + cxgb3_event_notify(&adapter->tdev, OFFLOAD_STATUS_DOWN, 0); + offload_close(&adapter->tdev); + } + + /* Stop all ports */ + for_each_port(adapter, i) { + struct net_device *netdev = adapter->port[i]; + + if (netif_running(netdev)) + __cxgb_close(netdev, on_wq); + } + + /* Stop SGE timers */ + t3_stop_sge_timers(adapter); + + adapter->flags &= ~FULL_INIT_DONE; + + if (reset) + ret = t3_reset_adapter(adapter); + + pci_disable_device(adapter->pdev); + + return ret; +} + +static int t3_reenable_adapter(struct adapter *adapter) +{ + if (pci_enable_device(adapter->pdev)) { + dev_err(&adapter->pdev->dev, + "Cannot re-enable PCI device after reset.\n"); + goto err; + } + pci_set_master(adapter->pdev); + pci_restore_state(adapter->pdev); + pci_save_state(adapter->pdev); + + /* Free sge resources */ + t3_free_sge_resources(adapter); + + if (t3_replay_prep_adapter(adapter)) + goto err; + + return 0; +err: + return -1; +} + +static void t3_resume_ports(struct adapter *adapter) +{ + int i; + + /* Restart the ports */ + for_each_port(adapter, i) { + struct net_device *netdev = adapter->port[i]; + + if (netif_running(netdev)) { + if (cxgb_open(netdev)) { + dev_err(&adapter->pdev->dev, + "can't bring device back up" + " after reset\n"); + continue; + } + } + } + + if (is_offload(adapter) && !ofld_disable) + cxgb3_event_notify(&adapter->tdev, OFFLOAD_STATUS_UP, 0); +} + +/* + * processes a fatal error. + * Bring the ports down, reset the chip, bring the ports back up. + */ +static void fatal_error_task(struct work_struct *work) +{ + struct adapter *adapter = container_of(work, struct adapter, + fatal_error_handler_task); + int err = 0; + + rtnl_lock(); + err = t3_adapter_error(adapter, 1, 1); + if (!err) + err = t3_reenable_adapter(adapter); + if (!err) + t3_resume_ports(adapter); + + CH_ALERT(adapter, "adapter reset %s\n", err ? "failed" : "succeeded"); + rtnl_unlock(); +} + +void t3_fatal_err(struct adapter *adapter) +{ + unsigned int fw_status[4]; + + if (adapter->flags & FULL_INIT_DONE) { + t3_sge_stop(adapter); + t3_write_reg(adapter, A_XGM_TX_CTRL, 0); + t3_write_reg(adapter, A_XGM_RX_CTRL, 0); + t3_write_reg(adapter, XGM_REG(A_XGM_TX_CTRL, 1), 0); + t3_write_reg(adapter, XGM_REG(A_XGM_RX_CTRL, 1), 0); + + spin_lock(&adapter->work_lock); + t3_intr_disable(adapter); + queue_work(cxgb3_wq, &adapter->fatal_error_handler_task); + spin_unlock(&adapter->work_lock); + } + CH_ALERT(adapter, "encountered fatal error, operation suspended\n"); + if (!t3_cim_ctl_blk_read(adapter, 0xa0, 4, fw_status)) + CH_ALERT(adapter, "FW status: 0x%x, 0x%x, 0x%x, 0x%x\n", + fw_status[0], fw_status[1], + fw_status[2], fw_status[3]); +} + +/** + * t3_io_error_detected - called when PCI error is detected + * @pdev: Pointer to PCI device + * @state: The current pci connection state + * + * This function is called after a PCI bus error affecting + * this device has been detected. + */ +static pci_ers_result_t t3_io_error_detected(struct pci_dev *pdev, + pci_channel_state_t state) +{ + struct adapter *adapter = pci_get_drvdata(pdev); + + if (state == pci_channel_io_perm_failure) + return PCI_ERS_RESULT_DISCONNECT; + + t3_adapter_error(adapter, 0, 0); + + /* Request a slot reset. */ + return PCI_ERS_RESULT_NEED_RESET; +} + +/** + * t3_io_slot_reset - called after the pci bus has been reset. + * @pdev: Pointer to PCI device + * + * Restart the card from scratch, as if from a cold-boot. + */ +static pci_ers_result_t t3_io_slot_reset(struct pci_dev *pdev) +{ + struct adapter *adapter = pci_get_drvdata(pdev); + + if (!t3_reenable_adapter(adapter)) + return PCI_ERS_RESULT_RECOVERED; + + return PCI_ERS_RESULT_DISCONNECT; +} + +/** + * t3_io_resume - called when traffic can start flowing again. + * @pdev: Pointer to PCI device + * + * This callback is called when the error recovery driver tells us that + * its OK to resume normal operation. + */ +static void t3_io_resume(struct pci_dev *pdev) +{ + struct adapter *adapter = pci_get_drvdata(pdev); + + CH_ALERT(adapter, "adapter recovering, PEX ERR 0x%x\n", + t3_read_reg(adapter, A_PCIE_PEX_ERR)); + + rtnl_lock(); + t3_resume_ports(adapter); + rtnl_unlock(); +} + +static const struct pci_error_handlers t3_err_handler = { + .error_detected = t3_io_error_detected, + .slot_reset = t3_io_slot_reset, + .resume = t3_io_resume, +}; + +/* + * Set the number of qsets based on the number of CPUs and the number of ports, + * not to exceed the number of available qsets, assuming there are enough qsets + * per port in HW. + */ +static void set_nqsets(struct adapter *adap) +{ + int i, j = 0; + int num_cpus = netif_get_num_default_rss_queues(); + int hwports = adap->params.nports; + int nqsets = adap->msix_nvectors - 1; + + if (adap->params.rev > 0 && adap->flags & USING_MSIX) { + if (hwports == 2 && + (hwports * nqsets > SGE_QSETS || + num_cpus >= nqsets / hwports)) + nqsets /= hwports; + if (nqsets > num_cpus) + nqsets = num_cpus; + if (nqsets < 1 || hwports == 4) + nqsets = 1; + } else + nqsets = 1; + + for_each_port(adap, i) { + struct port_info *pi = adap2pinfo(adap, i); + + pi->first_qset = j; + pi->nqsets = nqsets; + j = pi->first_qset + nqsets; + + dev_info(&adap->pdev->dev, + "Port %d using %d queue sets.\n", i, nqsets); + } +} + +static int cxgb_enable_msix(struct adapter *adap) +{ + struct msix_entry entries[SGE_QSETS + 1]; + int vectors; + int i; + + vectors = ARRAY_SIZE(entries); + for (i = 0; i < vectors; ++i) + entries[i].entry = i; + + vectors = pci_enable_msix_range(adap->pdev, entries, + adap->params.nports + 1, vectors); + if (vectors < 0) + return vectors; + + for (i = 0; i < vectors; ++i) + adap->msix_info[i].vec = entries[i].vector; + adap->msix_nvectors = vectors; + + return 0; +} + +static void print_port_info(struct adapter *adap, const struct adapter_info *ai) +{ + static const char *pci_variant[] = { + "PCI", "PCI-X", "PCI-X ECC", "PCI-X 266", "PCI Express" + }; + + int i; + char buf[80]; + + if (is_pcie(adap)) + snprintf(buf, sizeof(buf), "%s x%d", + pci_variant[adap->params.pci.variant], + adap->params.pci.width); + else + snprintf(buf, sizeof(buf), "%s %dMHz/%d-bit", + pci_variant[adap->params.pci.variant], + adap->params.pci.speed, adap->params.pci.width); + + for_each_port(adap, i) { + struct net_device *dev = adap->port[i]; + const struct port_info *pi = netdev_priv(dev); + + if (!test_bit(i, &adap->registered_device_map)) + continue; + netdev_info(dev, "%s %s %sNIC (rev %d) %s%s\n", + ai->desc, pi->phy.desc, + is_offload(adap) ? "R" : "", adap->params.rev, buf, + (adap->flags & USING_MSIX) ? " MSI-X" : + (adap->flags & USING_MSI) ? " MSI" : ""); + if (adap->name == dev->name && adap->params.vpd.mclk) + pr_info("%s: %uMB CM, %uMB PMTX, %uMB PMRX, S/N: %s\n", + adap->name, t3_mc7_size(&adap->cm) >> 20, + t3_mc7_size(&adap->pmtx) >> 20, + t3_mc7_size(&adap->pmrx) >> 20, + adap->params.vpd.sn); + } +} + +static const struct net_device_ops cxgb_netdev_ops = { + .ndo_open = cxgb_open, + .ndo_stop = cxgb_close, + .ndo_start_xmit = t3_eth_xmit, + .ndo_get_stats = cxgb_get_stats, + .ndo_validate_addr = eth_validate_addr, + .ndo_set_rx_mode = cxgb_set_rxmode, + .ndo_do_ioctl = cxgb_ioctl, + .ndo_change_mtu = cxgb_change_mtu, + .ndo_set_mac_address = cxgb_set_mac_addr, + .ndo_fix_features = cxgb_fix_features, + .ndo_set_features = cxgb_set_features, +#ifdef CONFIG_NET_POLL_CONTROLLER + .ndo_poll_controller = cxgb_netpoll, +#endif +}; + +static void cxgb3_init_iscsi_mac(struct net_device *dev) +{ + struct port_info *pi = netdev_priv(dev); + + memcpy(pi->iscsic.mac_addr, dev->dev_addr, ETH_ALEN); + pi->iscsic.mac_addr[3] |= 0x80; +} + +#define TSO_FLAGS (NETIF_F_TSO | NETIF_F_TSO6 | NETIF_F_TSO_ECN) +#define VLAN_FEAT (NETIF_F_SG | NETIF_F_IP_CSUM | TSO_FLAGS | \ + NETIF_F_IPV6_CSUM | NETIF_F_HIGHDMA) +static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent) +{ + int i, err, pci_using_dac = 0; + resource_size_t mmio_start, mmio_len; + const struct adapter_info *ai; + struct adapter *adapter = NULL; + struct port_info *pi; + + pr_info_once("%s - version %s\n", DRV_DESC, DRV_VERSION); + + if (!cxgb3_wq) { + cxgb3_wq = create_singlethread_workqueue(DRV_NAME); + if (!cxgb3_wq) { + pr_err("cannot initialize work queue\n"); + return -ENOMEM; + } + } + + err = pci_enable_device(pdev); + if (err) { + dev_err(&pdev->dev, "cannot enable PCI device\n"); + goto out; + } + + err = pci_request_regions(pdev, DRV_NAME); + if (err) { + /* Just info, some other driver may have claimed the device. */ + dev_info(&pdev->dev, "cannot obtain PCI resources\n"); + goto out_disable_device; + } + + if (!pci_set_dma_mask(pdev, DMA_BIT_MASK(64))) { + pci_using_dac = 1; + err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64)); + if (err) { + dev_err(&pdev->dev, "unable to obtain 64-bit DMA for " + "coherent allocations\n"); + goto out_release_regions; + } + } else if ((err = pci_set_dma_mask(pdev, DMA_BIT_MASK(32))) != 0) { + dev_err(&pdev->dev, "no usable DMA configuration\n"); + goto out_release_regions; + } + + pci_set_master(pdev); + pci_save_state(pdev); + + mmio_start = pci_resource_start(pdev, 0); + mmio_len = pci_resource_len(pdev, 0); + ai = t3_get_adapter_info(ent->driver_data); + + adapter = kzalloc(sizeof(*adapter), GFP_KERNEL); + if (!adapter) { + err = -ENOMEM; + goto out_release_regions; + } + + adapter->nofail_skb = + alloc_skb(sizeof(struct cpl_set_tcb_field), GFP_KERNEL); + if (!adapter->nofail_skb) { + dev_err(&pdev->dev, "cannot allocate nofail buffer\n"); + err = -ENOMEM; + goto out_free_adapter; + } + + adapter->regs = ioremap_nocache(mmio_start, mmio_len); + if (!adapter->regs) { + dev_err(&pdev->dev, "cannot map device registers\n"); + err = -ENOMEM; + goto out_free_adapter; + } + + adapter->pdev = pdev; + adapter->name = pci_name(pdev); + adapter->msg_enable = dflt_msg_enable; + adapter->mmio_len = mmio_len; + + mutex_init(&adapter->mdio_lock); + spin_lock_init(&adapter->work_lock); + spin_lock_init(&adapter->stats_lock); + + INIT_LIST_HEAD(&adapter->adapter_list); + INIT_WORK(&adapter->ext_intr_handler_task, ext_intr_task); + INIT_WORK(&adapter->fatal_error_handler_task, fatal_error_task); + + INIT_WORK(&adapter->db_full_task, db_full_task); + INIT_WORK(&adapter->db_empty_task, db_empty_task); + INIT_WORK(&adapter->db_drop_task, db_drop_task); + + INIT_DELAYED_WORK(&adapter->adap_check_task, t3_adap_check_task); + + for (i = 0; i < ai->nports0 + ai->nports1; ++i) { + struct net_device *netdev; + + netdev = alloc_etherdev_mq(sizeof(struct port_info), SGE_QSETS); + if (!netdev) { + err = -ENOMEM; + goto out_free_dev; + } + + SET_NETDEV_DEV(netdev, &pdev->dev); + + adapter->port[i] = netdev; + pi = netdev_priv(netdev); + pi->adapter = adapter; + pi->port_id = i; + netif_carrier_off(netdev); + netdev->irq = pdev->irq; + netdev->mem_start = mmio_start; + netdev->mem_end = mmio_start + mmio_len - 1; + netdev->hw_features = NETIF_F_SG | NETIF_F_IP_CSUM | + NETIF_F_TSO | NETIF_F_RXCSUM | NETIF_F_HW_VLAN_CTAG_RX; + netdev->features |= netdev->hw_features | + NETIF_F_HW_VLAN_CTAG_TX; + netdev->vlan_features |= netdev->features & VLAN_FEAT; + if (pci_using_dac) + netdev->features |= NETIF_F_HIGHDMA; + + netdev->netdev_ops = &cxgb_netdev_ops; + netdev->ethtool_ops = &cxgb_ethtool_ops; + } + + pci_set_drvdata(pdev, adapter); + if (t3_prep_adapter(adapter, ai, 1) < 0) { + err = -ENODEV; + goto out_free_dev; + } + + /* + * The card is now ready to go. If any errors occur during device + * registration we do not fail the whole card but rather proceed only + * with the ports we manage to register successfully. However we must + * register at least one net device. + */ + for_each_port(adapter, i) { + err = register_netdev(adapter->port[i]); + if (err) + dev_warn(&pdev->dev, + "cannot register net device %s, skipping\n", + adapter->port[i]->name); + else { + /* + * Change the name we use for messages to the name of + * the first successfully registered interface. + */ + if (!adapter->registered_device_map) + adapter->name = adapter->port[i]->name; + + __set_bit(i, &adapter->registered_device_map); + } + } + if (!adapter->registered_device_map) { + dev_err(&pdev->dev, "could not register any net devices\n"); + goto out_free_dev; + } + + for_each_port(adapter, i) + cxgb3_init_iscsi_mac(adapter->port[i]); + + /* Driver's ready. Reflect it on LEDs */ + t3_led_ready(adapter); + + if (is_offload(adapter)) { + __set_bit(OFFLOAD_DEVMAP_BIT, &adapter->registered_device_map); + cxgb3_adapter_ofld(adapter); + } + + /* See what interrupts we'll be using */ + if (msi > 1 && cxgb_enable_msix(adapter) == 0) + adapter->flags |= USING_MSIX; + else if (msi > 0 && pci_enable_msi(pdev) == 0) + adapter->flags |= USING_MSI; + + set_nqsets(adapter); + + err = sysfs_create_group(&adapter->port[0]->dev.kobj, + &cxgb3_attr_group); + + print_port_info(adapter, ai); + return 0; + +out_free_dev: + iounmap(adapter->regs); + for (i = ai->nports0 + ai->nports1 - 1; i >= 0; --i) + if (adapter->port[i]) + free_netdev(adapter->port[i]); + +out_free_adapter: + kfree(adapter); + +out_release_regions: + pci_release_regions(pdev); +out_disable_device: + pci_disable_device(pdev); +out: + return err; +} + +static void remove_one(struct pci_dev *pdev) +{ + struct adapter *adapter = pci_get_drvdata(pdev); + + if (adapter) { + int i; + + t3_sge_stop(adapter); + sysfs_remove_group(&adapter->port[0]->dev.kobj, + &cxgb3_attr_group); + + if (is_offload(adapter)) { + cxgb3_adapter_unofld(adapter); + if (test_bit(OFFLOAD_DEVMAP_BIT, + &adapter->open_device_map)) + offload_close(&adapter->tdev); + } + + for_each_port(adapter, i) + if (test_bit(i, &adapter->registered_device_map)) + unregister_netdev(adapter->port[i]); + + t3_stop_sge_timers(adapter); + t3_free_sge_resources(adapter); + cxgb_disable_msi(adapter); + + for_each_port(adapter, i) + if (adapter->port[i]) + free_netdev(adapter->port[i]); + + iounmap(adapter->regs); + if (adapter->nofail_skb) + kfree_skb(adapter->nofail_skb); + kfree(adapter); + pci_release_regions(pdev); + pci_disable_device(pdev); + } +} + +static struct pci_driver driver = { + .name = DRV_NAME, + .id_table = cxgb3_pci_tbl, + .probe = init_one, + .remove = remove_one, + .err_handler = &t3_err_handler, +}; + +static int __init cxgb3_init_module(void) +{ + int ret; + + cxgb3_offload_init(); + + ret = pci_register_driver(&driver); + return ret; +} + +static void __exit cxgb3_cleanup_module(void) +{ + pci_unregister_driver(&driver); + if (cxgb3_wq) + destroy_workqueue(cxgb3_wq); +} + +module_init(cxgb3_init_module); +module_exit(cxgb3_cleanup_module); diff --git a/drivers/net/ethernet/chelsio/cxgb3/cxgb3_offload.c b/drivers/net/ethernet/chelsio/cxgb3/cxgb3_offload.c new file mode 100644 index 0000000..b0cbb2b --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb3/cxgb3_offload.c @@ -0,0 +1,1427 @@ +/* + * Copyright (c) 2006-2008 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "common.h" +#include "regs.h" +#include "cxgb3_ioctl.h" +#include "cxgb3_ctl_defs.h" +#include "cxgb3_defs.h" +#include "l2t.h" +#include "firmware_exports.h" +#include "cxgb3_offload.h" + +static LIST_HEAD(client_list); +static LIST_HEAD(ofld_dev_list); +static DEFINE_MUTEX(cxgb3_db_lock); + +static DEFINE_RWLOCK(adapter_list_lock); +static LIST_HEAD(adapter_list); + +static const unsigned int MAX_ATIDS = 64 * 1024; +static const unsigned int ATID_BASE = 0x10000; + +static void cxgb_neigh_update(struct neighbour *neigh); +static void cxgb_redirect(struct dst_entry *old, struct dst_entry *new, + struct neighbour *neigh, const void *daddr); + +static inline int offload_activated(struct t3cdev *tdev) +{ + const struct adapter *adapter = tdev2adap(tdev); + + return test_bit(OFFLOAD_DEVMAP_BIT, &adapter->open_device_map); +} + +/** + * cxgb3_register_client - register an offload client + * @client: the client + * + * Add the client to the client list, + * and call backs the client for each activated offload device + */ +void cxgb3_register_client(struct cxgb3_client *client) +{ + struct t3cdev *tdev; + + mutex_lock(&cxgb3_db_lock); + list_add_tail(&client->client_list, &client_list); + + if (client->add) { + list_for_each_entry(tdev, &ofld_dev_list, ofld_dev_list) { + if (offload_activated(tdev)) + client->add(tdev); + } + } + mutex_unlock(&cxgb3_db_lock); +} + +EXPORT_SYMBOL(cxgb3_register_client); + +/** + * cxgb3_unregister_client - unregister an offload client + * @client: the client + * + * Remove the client to the client list, + * and call backs the client for each activated offload device. + */ +void cxgb3_unregister_client(struct cxgb3_client *client) +{ + struct t3cdev *tdev; + + mutex_lock(&cxgb3_db_lock); + list_del(&client->client_list); + + if (client->remove) { + list_for_each_entry(tdev, &ofld_dev_list, ofld_dev_list) { + if (offload_activated(tdev)) + client->remove(tdev); + } + } + mutex_unlock(&cxgb3_db_lock); +} + +EXPORT_SYMBOL(cxgb3_unregister_client); + +/** + * cxgb3_add_clients - activate registered clients for an offload device + * @tdev: the offload device + * + * Call backs all registered clients once a offload device is activated + */ +void cxgb3_add_clients(struct t3cdev *tdev) +{ + struct cxgb3_client *client; + + mutex_lock(&cxgb3_db_lock); + list_for_each_entry(client, &client_list, client_list) { + if (client->add) + client->add(tdev); + } + mutex_unlock(&cxgb3_db_lock); +} + +/** + * cxgb3_remove_clients - deactivates registered clients + * for an offload device + * @tdev: the offload device + * + * Call backs all registered clients once a offload device is deactivated + */ +void cxgb3_remove_clients(struct t3cdev *tdev) +{ + struct cxgb3_client *client; + + mutex_lock(&cxgb3_db_lock); + list_for_each_entry(client, &client_list, client_list) { + if (client->remove) + client->remove(tdev); + } + mutex_unlock(&cxgb3_db_lock); +} + +void cxgb3_event_notify(struct t3cdev *tdev, u32 event, u32 port) +{ + struct cxgb3_client *client; + + mutex_lock(&cxgb3_db_lock); + list_for_each_entry(client, &client_list, client_list) { + if (client->event_handler) + client->event_handler(tdev, event, port); + } + mutex_unlock(&cxgb3_db_lock); +} + +static struct net_device *get_iff_from_mac(struct adapter *adapter, + const unsigned char *mac, + unsigned int vlan) +{ + int i; + + for_each_port(adapter, i) { + struct net_device *dev = adapter->port[i]; + + if (ether_addr_equal(dev->dev_addr, mac)) { + rcu_read_lock(); + if (vlan && vlan != VLAN_VID_MASK) { + dev = __vlan_find_dev_deep_rcu(dev, htons(ETH_P_8021Q), vlan); + } else if (netif_is_bond_slave(dev)) { + struct net_device *upper_dev; + + while ((upper_dev = + netdev_master_upper_dev_get_rcu(dev))) + dev = upper_dev; + } + rcu_read_unlock(); + return dev; + } + } + return NULL; +} + +static int cxgb_ulp_iscsi_ctl(struct adapter *adapter, unsigned int req, + void *data) +{ + int i; + int ret = 0; + unsigned int val = 0; + struct ulp_iscsi_info *uiip = data; + + switch (req) { + case ULP_ISCSI_GET_PARAMS: + uiip->pdev = adapter->pdev; + uiip->llimit = t3_read_reg(adapter, A_ULPRX_ISCSI_LLIMIT); + uiip->ulimit = t3_read_reg(adapter, A_ULPRX_ISCSI_ULIMIT); + uiip->tagmask = t3_read_reg(adapter, A_ULPRX_ISCSI_TAGMASK); + + val = t3_read_reg(adapter, A_ULPRX_ISCSI_PSZ); + for (i = 0; i < 4; i++, val >>= 8) + uiip->pgsz_factor[i] = val & 0xFF; + + val = t3_read_reg(adapter, A_TP_PARA_REG7); + uiip->max_txsz = + uiip->max_rxsz = min((val >> S_PMMAXXFERLEN0)&M_PMMAXXFERLEN0, + (val >> S_PMMAXXFERLEN1)&M_PMMAXXFERLEN1); + /* + * On tx, the iscsi pdu has to be <= tx page size and has to + * fit into the Tx PM FIFO. + */ + val = min(adapter->params.tp.tx_pg_size, + t3_read_reg(adapter, A_PM1_TX_CFG) >> 17); + uiip->max_txsz = min(val, uiip->max_txsz); + + /* set MaxRxData to 16224 */ + val = t3_read_reg(adapter, A_TP_PARA_REG2); + if ((val >> S_MAXRXDATA) != 0x3f60) { + val &= (M_RXCOALESCESIZE << S_RXCOALESCESIZE); + val |= V_MAXRXDATA(0x3f60); + pr_info("%s, iscsi set MaxRxData to 16224 (0x%x)\n", + adapter->name, val); + t3_write_reg(adapter, A_TP_PARA_REG2, val); + } + + /* + * on rx, the iscsi pdu has to be < rx page size and the + * the max rx data length programmed in TP + */ + val = min(adapter->params.tp.rx_pg_size, + ((t3_read_reg(adapter, A_TP_PARA_REG2)) >> + S_MAXRXDATA) & M_MAXRXDATA); + uiip->max_rxsz = min(val, uiip->max_rxsz); + break; + case ULP_ISCSI_SET_PARAMS: + t3_write_reg(adapter, A_ULPRX_ISCSI_TAGMASK, uiip->tagmask); + /* program the ddp page sizes */ + for (i = 0; i < 4; i++) + val |= (uiip->pgsz_factor[i] & 0xF) << (8 * i); + if (val && (val != t3_read_reg(adapter, A_ULPRX_ISCSI_PSZ))) { + pr_info("%s, setting iscsi pgsz 0x%x, %u,%u,%u,%u\n", + adapter->name, val, uiip->pgsz_factor[0], + uiip->pgsz_factor[1], uiip->pgsz_factor[2], + uiip->pgsz_factor[3]); + t3_write_reg(adapter, A_ULPRX_ISCSI_PSZ, val); + } + break; + default: + ret = -EOPNOTSUPP; + } + return ret; +} + +/* Response queue used for RDMA events. */ +#define ASYNC_NOTIF_RSPQ 0 + +static int cxgb_rdma_ctl(struct adapter *adapter, unsigned int req, void *data) +{ + int ret = 0; + + switch (req) { + case RDMA_GET_PARAMS: { + struct rdma_info *rdma = data; + struct pci_dev *pdev = adapter->pdev; + + rdma->udbell_physbase = pci_resource_start(pdev, 2); + rdma->udbell_len = pci_resource_len(pdev, 2); + rdma->tpt_base = + t3_read_reg(adapter, A_ULPTX_TPT_LLIMIT); + rdma->tpt_top = t3_read_reg(adapter, A_ULPTX_TPT_ULIMIT); + rdma->pbl_base = + t3_read_reg(adapter, A_ULPTX_PBL_LLIMIT); + rdma->pbl_top = t3_read_reg(adapter, A_ULPTX_PBL_ULIMIT); + rdma->rqt_base = t3_read_reg(adapter, A_ULPRX_RQ_LLIMIT); + rdma->rqt_top = t3_read_reg(adapter, A_ULPRX_RQ_ULIMIT); + rdma->kdb_addr = adapter->regs + A_SG_KDOORBELL; + rdma->pdev = pdev; + break; + } + case RDMA_CQ_OP:{ + unsigned long flags; + struct rdma_cq_op *rdma = data; + + /* may be called in any context */ + spin_lock_irqsave(&adapter->sge.reg_lock, flags); + ret = t3_sge_cqcntxt_op(adapter, rdma->id, rdma->op, + rdma->credits); + spin_unlock_irqrestore(&adapter->sge.reg_lock, flags); + break; + } + case RDMA_GET_MEM:{ + struct ch_mem_range *t = data; + struct mc7 *mem; + + if ((t->addr & 7) || (t->len & 7)) + return -EINVAL; + if (t->mem_id == MEM_CM) + mem = &adapter->cm; + else if (t->mem_id == MEM_PMRX) + mem = &adapter->pmrx; + else if (t->mem_id == MEM_PMTX) + mem = &adapter->pmtx; + else + return -EINVAL; + + ret = + t3_mc7_bd_read(mem, t->addr / 8, t->len / 8, + (u64 *) t->buf); + if (ret) + return ret; + break; + } + case RDMA_CQ_SETUP:{ + struct rdma_cq_setup *rdma = data; + + spin_lock_irq(&adapter->sge.reg_lock); + ret = + t3_sge_init_cqcntxt(adapter, rdma->id, + rdma->base_addr, rdma->size, + ASYNC_NOTIF_RSPQ, + rdma->ovfl_mode, rdma->credits, + rdma->credit_thres); + spin_unlock_irq(&adapter->sge.reg_lock); + break; + } + case RDMA_CQ_DISABLE: + spin_lock_irq(&adapter->sge.reg_lock); + ret = t3_sge_disable_cqcntxt(adapter, *(unsigned int *)data); + spin_unlock_irq(&adapter->sge.reg_lock); + break; + case RDMA_CTRL_QP_SETUP:{ + struct rdma_ctrlqp_setup *rdma = data; + + spin_lock_irq(&adapter->sge.reg_lock); + ret = t3_sge_init_ecntxt(adapter, FW_RI_SGEEC_START, 0, + SGE_CNTXT_RDMA, + ASYNC_NOTIF_RSPQ, + rdma->base_addr, rdma->size, + FW_RI_TID_START, 1, 0); + spin_unlock_irq(&adapter->sge.reg_lock); + break; + } + case RDMA_GET_MIB: { + spin_lock(&adapter->stats_lock); + t3_tp_get_mib_stats(adapter, (struct tp_mib_stats *)data); + spin_unlock(&adapter->stats_lock); + break; + } + default: + ret = -EOPNOTSUPP; + } + return ret; +} + +static int cxgb_offload_ctl(struct t3cdev *tdev, unsigned int req, void *data) +{ + struct adapter *adapter = tdev2adap(tdev); + struct tid_range *tid; + struct mtutab *mtup; + struct iff_mac *iffmacp; + struct ddp_params *ddpp; + struct adap_ports *ports; + struct ofld_page_info *rx_page_info; + struct tp_params *tp = &adapter->params.tp; + int i; + + switch (req) { + case GET_MAX_OUTSTANDING_WR: + *(unsigned int *)data = FW_WR_NUM; + break; + case GET_WR_LEN: + *(unsigned int *)data = WR_FLITS; + break; + case GET_TX_MAX_CHUNK: + *(unsigned int *)data = 1 << 20; /* 1MB */ + break; + case GET_TID_RANGE: + tid = data; + tid->num = t3_mc5_size(&adapter->mc5) - + adapter->params.mc5.nroutes - + adapter->params.mc5.nfilters - adapter->params.mc5.nservers; + tid->base = 0; + break; + case GET_STID_RANGE: + tid = data; + tid->num = adapter->params.mc5.nservers; + tid->base = t3_mc5_size(&adapter->mc5) - tid->num - + adapter->params.mc5.nfilters - adapter->params.mc5.nroutes; + break; + case GET_L2T_CAPACITY: + *(unsigned int *)data = 2048; + break; + case GET_MTUS: + mtup = data; + mtup->size = NMTUS; + mtup->mtus = adapter->params.mtus; + break; + case GET_IFF_FROM_MAC: + iffmacp = data; + iffmacp->dev = get_iff_from_mac(adapter, iffmacp->mac_addr, + iffmacp->vlan_tag & + VLAN_VID_MASK); + break; + case GET_DDP_PARAMS: + ddpp = data; + ddpp->llimit = t3_read_reg(adapter, A_ULPRX_TDDP_LLIMIT); + ddpp->ulimit = t3_read_reg(adapter, A_ULPRX_TDDP_ULIMIT); + ddpp->tag_mask = t3_read_reg(adapter, A_ULPRX_TDDP_TAGMASK); + break; + case GET_PORTS: + ports = data; + ports->nports = adapter->params.nports; + for_each_port(adapter, i) + ports->lldevs[i] = adapter->port[i]; + break; + case ULP_ISCSI_GET_PARAMS: + case ULP_ISCSI_SET_PARAMS: + if (!offload_running(adapter)) + return -EAGAIN; + return cxgb_ulp_iscsi_ctl(adapter, req, data); + case RDMA_GET_PARAMS: + case RDMA_CQ_OP: + case RDMA_CQ_SETUP: + case RDMA_CQ_DISABLE: + case RDMA_CTRL_QP_SETUP: + case RDMA_GET_MEM: + case RDMA_GET_MIB: + if (!offload_running(adapter)) + return -EAGAIN; + return cxgb_rdma_ctl(adapter, req, data); + case GET_RX_PAGE_INFO: + rx_page_info = data; + rx_page_info->page_size = tp->rx_pg_size; + rx_page_info->num = tp->rx_num_pgs; + break; + case GET_ISCSI_IPV4ADDR: { + struct iscsi_ipv4addr *p = data; + struct port_info *pi = netdev_priv(p->dev); + p->ipv4addr = pi->iscsi_ipv4addr; + break; + } + case GET_EMBEDDED_INFO: { + struct ch_embedded_info *e = data; + + spin_lock(&adapter->stats_lock); + t3_get_fw_version(adapter, &e->fw_vers); + t3_get_tp_version(adapter, &e->tp_vers); + spin_unlock(&adapter->stats_lock); + break; + } + default: + return -EOPNOTSUPP; + } + return 0; +} + +/* + * Dummy handler for Rx offload packets in case we get an offload packet before + * proper processing is setup. This complains and drops the packet as it isn't + * normal to get offload packets at this stage. + */ +static int rx_offload_blackhole(struct t3cdev *dev, struct sk_buff **skbs, + int n) +{ + while (n--) + dev_kfree_skb_any(skbs[n]); + return 0; +} + +static void dummy_neigh_update(struct t3cdev *dev, struct neighbour *neigh) +{ +} + +void cxgb3_set_dummy_ops(struct t3cdev *dev) +{ + dev->recv = rx_offload_blackhole; + dev->neigh_update = dummy_neigh_update; +} + +/* + * Free an active-open TID. + */ +void *cxgb3_free_atid(struct t3cdev *tdev, int atid) +{ + struct tid_info *t = &(T3C_DATA(tdev))->tid_maps; + union active_open_entry *p = atid2entry(t, atid); + void *ctx = p->t3c_tid.ctx; + + spin_lock_bh(&t->atid_lock); + p->next = t->afree; + t->afree = p; + t->atids_in_use--; + spin_unlock_bh(&t->atid_lock); + + return ctx; +} + +EXPORT_SYMBOL(cxgb3_free_atid); + +/* + * Free a server TID and return it to the free pool. + */ +void cxgb3_free_stid(struct t3cdev *tdev, int stid) +{ + struct tid_info *t = &(T3C_DATA(tdev))->tid_maps; + union listen_entry *p = stid2entry(t, stid); + + spin_lock_bh(&t->stid_lock); + p->next = t->sfree; + t->sfree = p; + t->stids_in_use--; + spin_unlock_bh(&t->stid_lock); +} + +EXPORT_SYMBOL(cxgb3_free_stid); + +void cxgb3_insert_tid(struct t3cdev *tdev, struct cxgb3_client *client, + void *ctx, unsigned int tid) +{ + struct tid_info *t = &(T3C_DATA(tdev))->tid_maps; + + t->tid_tab[tid].client = client; + t->tid_tab[tid].ctx = ctx; + atomic_inc(&t->tids_in_use); +} + +EXPORT_SYMBOL(cxgb3_insert_tid); + +/* + * Populate a TID_RELEASE WR. The skb must be already propely sized. + */ +static inline void mk_tid_release(struct sk_buff *skb, unsigned int tid) +{ + struct cpl_tid_release *req; + + skb->priority = CPL_PRIORITY_SETUP; + req = (struct cpl_tid_release *)__skb_put(skb, sizeof(*req)); + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_TID_RELEASE, tid)); +} + +static void t3_process_tid_release_list(struct work_struct *work) +{ + struct t3c_data *td = container_of(work, struct t3c_data, + tid_release_task); + struct sk_buff *skb; + struct t3cdev *tdev = td->dev; + + + spin_lock_bh(&td->tid_release_lock); + while (td->tid_release_list) { + struct t3c_tid_entry *p = td->tid_release_list; + + td->tid_release_list = p->ctx; + spin_unlock_bh(&td->tid_release_lock); + + skb = alloc_skb(sizeof(struct cpl_tid_release), + GFP_KERNEL); + if (!skb) + skb = td->nofail_skb; + if (!skb) { + spin_lock_bh(&td->tid_release_lock); + p->ctx = (void *)td->tid_release_list; + td->tid_release_list = p; + break; + } + mk_tid_release(skb, p - td->tid_maps.tid_tab); + cxgb3_ofld_send(tdev, skb); + p->ctx = NULL; + if (skb == td->nofail_skb) + td->nofail_skb = + alloc_skb(sizeof(struct cpl_tid_release), + GFP_KERNEL); + spin_lock_bh(&td->tid_release_lock); + } + td->release_list_incomplete = (td->tid_release_list == NULL) ? 0 : 1; + spin_unlock_bh(&td->tid_release_lock); + + if (!td->nofail_skb) + td->nofail_skb = + alloc_skb(sizeof(struct cpl_tid_release), + GFP_KERNEL); +} + +/* use ctx as a next pointer in the tid release list */ +void cxgb3_queue_tid_release(struct t3cdev *tdev, unsigned int tid) +{ + struct t3c_data *td = T3C_DATA(tdev); + struct t3c_tid_entry *p = &td->tid_maps.tid_tab[tid]; + + spin_lock_bh(&td->tid_release_lock); + p->ctx = (void *)td->tid_release_list; + p->client = NULL; + td->tid_release_list = p; + if (!p->ctx || td->release_list_incomplete) + schedule_work(&td->tid_release_task); + spin_unlock_bh(&td->tid_release_lock); +} + +EXPORT_SYMBOL(cxgb3_queue_tid_release); + +/* + * Remove a tid from the TID table. A client may defer processing its last + * CPL message if it is locked at the time it arrives, and while the message + * sits in the client's backlog the TID may be reused for another connection. + * To handle this we atomically switch the TID association if it still points + * to the original client context. + */ +void cxgb3_remove_tid(struct t3cdev *tdev, void *ctx, unsigned int tid) +{ + struct tid_info *t = &(T3C_DATA(tdev))->tid_maps; + + BUG_ON(tid >= t->ntids); + if (tdev->type == T3A) + (void)cmpxchg(&t->tid_tab[tid].ctx, ctx, NULL); + else { + struct sk_buff *skb; + + skb = alloc_skb(sizeof(struct cpl_tid_release), GFP_ATOMIC); + if (likely(skb)) { + mk_tid_release(skb, tid); + cxgb3_ofld_send(tdev, skb); + t->tid_tab[tid].ctx = NULL; + } else + cxgb3_queue_tid_release(tdev, tid); + } + atomic_dec(&t->tids_in_use); +} + +EXPORT_SYMBOL(cxgb3_remove_tid); + +int cxgb3_alloc_atid(struct t3cdev *tdev, struct cxgb3_client *client, + void *ctx) +{ + int atid = -1; + struct tid_info *t = &(T3C_DATA(tdev))->tid_maps; + + spin_lock_bh(&t->atid_lock); + if (t->afree && + t->atids_in_use + atomic_read(&t->tids_in_use) + MC5_MIN_TIDS <= + t->ntids) { + union active_open_entry *p = t->afree; + + atid = (p - t->atid_tab) + t->atid_base; + t->afree = p->next; + p->t3c_tid.ctx = ctx; + p->t3c_tid.client = client; + t->atids_in_use++; + } + spin_unlock_bh(&t->atid_lock); + return atid; +} + +EXPORT_SYMBOL(cxgb3_alloc_atid); + +int cxgb3_alloc_stid(struct t3cdev *tdev, struct cxgb3_client *client, + void *ctx) +{ + int stid = -1; + struct tid_info *t = &(T3C_DATA(tdev))->tid_maps; + + spin_lock_bh(&t->stid_lock); + if (t->sfree) { + union listen_entry *p = t->sfree; + + stid = (p - t->stid_tab) + t->stid_base; + t->sfree = p->next; + p->t3c_tid.ctx = ctx; + p->t3c_tid.client = client; + t->stids_in_use++; + } + spin_unlock_bh(&t->stid_lock); + return stid; +} + +EXPORT_SYMBOL(cxgb3_alloc_stid); + +/* Get the t3cdev associated with a net_device */ +struct t3cdev *dev2t3cdev(struct net_device *dev) +{ + const struct port_info *pi = netdev_priv(dev); + + return (struct t3cdev *)pi->adapter; +} + +EXPORT_SYMBOL(dev2t3cdev); + +static int do_smt_write_rpl(struct t3cdev *dev, struct sk_buff *skb) +{ + struct cpl_smt_write_rpl *rpl = cplhdr(skb); + + if (rpl->status != CPL_ERR_NONE) + pr_err("Unexpected SMT_WRITE_RPL status %u for entry %u\n", + rpl->status, GET_TID(rpl)); + + return CPL_RET_BUF_DONE; +} + +static int do_l2t_write_rpl(struct t3cdev *dev, struct sk_buff *skb) +{ + struct cpl_l2t_write_rpl *rpl = cplhdr(skb); + + if (rpl->status != CPL_ERR_NONE) + pr_err("Unexpected L2T_WRITE_RPL status %u for entry %u\n", + rpl->status, GET_TID(rpl)); + + return CPL_RET_BUF_DONE; +} + +static int do_rte_write_rpl(struct t3cdev *dev, struct sk_buff *skb) +{ + struct cpl_rte_write_rpl *rpl = cplhdr(skb); + + if (rpl->status != CPL_ERR_NONE) + pr_err("Unexpected RTE_WRITE_RPL status %u for entry %u\n", + rpl->status, GET_TID(rpl)); + + return CPL_RET_BUF_DONE; +} + +static int do_act_open_rpl(struct t3cdev *dev, struct sk_buff *skb) +{ + struct cpl_act_open_rpl *rpl = cplhdr(skb); + unsigned int atid = G_TID(ntohl(rpl->atid)); + struct t3c_tid_entry *t3c_tid; + + t3c_tid = lookup_atid(&(T3C_DATA(dev))->tid_maps, atid); + if (t3c_tid && t3c_tid->ctx && t3c_tid->client && + t3c_tid->client->handlers && + t3c_tid->client->handlers[CPL_ACT_OPEN_RPL]) { + return t3c_tid->client->handlers[CPL_ACT_OPEN_RPL] (dev, skb, + t3c_tid-> + ctx); + } else { + pr_err("%s: received clientless CPL command 0x%x\n", + dev->name, CPL_ACT_OPEN_RPL); + return CPL_RET_BUF_DONE | CPL_RET_BAD_MSG; + } +} + +static int do_stid_rpl(struct t3cdev *dev, struct sk_buff *skb) +{ + union opcode_tid *p = cplhdr(skb); + unsigned int stid = G_TID(ntohl(p->opcode_tid)); + struct t3c_tid_entry *t3c_tid; + + t3c_tid = lookup_stid(&(T3C_DATA(dev))->tid_maps, stid); + if (t3c_tid && t3c_tid->ctx && t3c_tid->client->handlers && + t3c_tid->client->handlers[p->opcode]) { + return t3c_tid->client->handlers[p->opcode] (dev, skb, + t3c_tid->ctx); + } else { + pr_err("%s: received clientless CPL command 0x%x\n", + dev->name, p->opcode); + return CPL_RET_BUF_DONE | CPL_RET_BAD_MSG; + } +} + +static int do_hwtid_rpl(struct t3cdev *dev, struct sk_buff *skb) +{ + union opcode_tid *p = cplhdr(skb); + unsigned int hwtid = G_TID(ntohl(p->opcode_tid)); + struct t3c_tid_entry *t3c_tid; + + t3c_tid = lookup_tid(&(T3C_DATA(dev))->tid_maps, hwtid); + if (t3c_tid && t3c_tid->ctx && t3c_tid->client->handlers && + t3c_tid->client->handlers[p->opcode]) { + return t3c_tid->client->handlers[p->opcode] + (dev, skb, t3c_tid->ctx); + } else { + pr_err("%s: received clientless CPL command 0x%x\n", + dev->name, p->opcode); + return CPL_RET_BUF_DONE | CPL_RET_BAD_MSG; + } +} + +static int do_cr(struct t3cdev *dev, struct sk_buff *skb) +{ + struct cpl_pass_accept_req *req = cplhdr(skb); + unsigned int stid = G_PASS_OPEN_TID(ntohl(req->tos_tid)); + struct tid_info *t = &(T3C_DATA(dev))->tid_maps; + struct t3c_tid_entry *t3c_tid; + unsigned int tid = GET_TID(req); + + if (unlikely(tid >= t->ntids)) { + printk("%s: passive open TID %u too large\n", + dev->name, tid); + t3_fatal_err(tdev2adap(dev)); + return CPL_RET_BUF_DONE; + } + + t3c_tid = lookup_stid(t, stid); + if (t3c_tid && t3c_tid->ctx && t3c_tid->client->handlers && + t3c_tid->client->handlers[CPL_PASS_ACCEPT_REQ]) { + return t3c_tid->client->handlers[CPL_PASS_ACCEPT_REQ] + (dev, skb, t3c_tid->ctx); + } else { + pr_err("%s: received clientless CPL command 0x%x\n", + dev->name, CPL_PASS_ACCEPT_REQ); + return CPL_RET_BUF_DONE | CPL_RET_BAD_MSG; + } +} + +/* + * Returns an sk_buff for a reply CPL message of size len. If the input + * sk_buff has no other users it is trimmed and reused, otherwise a new buffer + * is allocated. The input skb must be of size at least len. Note that this + * operation does not destroy the original skb data even if it decides to reuse + * the buffer. + */ +static struct sk_buff *cxgb3_get_cpl_reply_skb(struct sk_buff *skb, size_t len, + gfp_t gfp) +{ + if (likely(!skb_cloned(skb))) { + BUG_ON(skb->len < len); + __skb_trim(skb, len); + skb_get(skb); + } else { + skb = alloc_skb(len, gfp); + if (skb) + __skb_put(skb, len); + } + return skb; +} + +static int do_abort_req_rss(struct t3cdev *dev, struct sk_buff *skb) +{ + union opcode_tid *p = cplhdr(skb); + unsigned int hwtid = G_TID(ntohl(p->opcode_tid)); + struct t3c_tid_entry *t3c_tid; + + t3c_tid = lookup_tid(&(T3C_DATA(dev))->tid_maps, hwtid); + if (t3c_tid && t3c_tid->ctx && t3c_tid->client->handlers && + t3c_tid->client->handlers[p->opcode]) { + return t3c_tid->client->handlers[p->opcode] + (dev, skb, t3c_tid->ctx); + } else { + struct cpl_abort_req_rss *req = cplhdr(skb); + struct cpl_abort_rpl *rpl; + struct sk_buff *reply_skb; + unsigned int tid = GET_TID(req); + u8 cmd = req->status; + + if (req->status == CPL_ERR_RTX_NEG_ADVICE || + req->status == CPL_ERR_PERSIST_NEG_ADVICE) + goto out; + + reply_skb = cxgb3_get_cpl_reply_skb(skb, + sizeof(struct + cpl_abort_rpl), + GFP_ATOMIC); + + if (!reply_skb) { + printk("do_abort_req_rss: couldn't get skb!\n"); + goto out; + } + reply_skb->priority = CPL_PRIORITY_DATA; + __skb_put(reply_skb, sizeof(struct cpl_abort_rpl)); + rpl = cplhdr(reply_skb); + rpl->wr.wr_hi = + htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_RPL)); + rpl->wr.wr_lo = htonl(V_WR_TID(tid)); + OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_ABORT_RPL, tid)); + rpl->cmd = cmd; + cxgb3_ofld_send(dev, reply_skb); +out: + return CPL_RET_BUF_DONE; + } +} + +static int do_act_establish(struct t3cdev *dev, struct sk_buff *skb) +{ + struct cpl_act_establish *req = cplhdr(skb); + unsigned int atid = G_PASS_OPEN_TID(ntohl(req->tos_tid)); + struct tid_info *t = &(T3C_DATA(dev))->tid_maps; + struct t3c_tid_entry *t3c_tid; + unsigned int tid = GET_TID(req); + + if (unlikely(tid >= t->ntids)) { + printk("%s: active establish TID %u too large\n", + dev->name, tid); + t3_fatal_err(tdev2adap(dev)); + return CPL_RET_BUF_DONE; + } + + t3c_tid = lookup_atid(t, atid); + if (t3c_tid && t3c_tid->ctx && t3c_tid->client->handlers && + t3c_tid->client->handlers[CPL_ACT_ESTABLISH]) { + return t3c_tid->client->handlers[CPL_ACT_ESTABLISH] + (dev, skb, t3c_tid->ctx); + } else { + pr_err("%s: received clientless CPL command 0x%x\n", + dev->name, CPL_ACT_ESTABLISH); + return CPL_RET_BUF_DONE | CPL_RET_BAD_MSG; + } +} + +static int do_trace(struct t3cdev *dev, struct sk_buff *skb) +{ + struct cpl_trace_pkt *p = cplhdr(skb); + + skb->protocol = htons(0xffff); + skb->dev = dev->lldev; + skb_pull(skb, sizeof(*p)); + skb_reset_mac_header(skb); + netif_receive_skb(skb); + return 0; +} + +/* + * That skb would better have come from process_responses() where we abuse + * ->priority and ->csum to carry our data. NB: if we get to per-arch + * ->csum, the things might get really interesting here. + */ + +static inline u32 get_hwtid(struct sk_buff *skb) +{ + return ntohl((__force __be32)skb->priority) >> 8 & 0xfffff; +} + +static inline u32 get_opcode(struct sk_buff *skb) +{ + return G_OPCODE(ntohl((__force __be32)skb->csum)); +} + +static int do_term(struct t3cdev *dev, struct sk_buff *skb) +{ + unsigned int hwtid = get_hwtid(skb); + unsigned int opcode = get_opcode(skb); + struct t3c_tid_entry *t3c_tid; + + t3c_tid = lookup_tid(&(T3C_DATA(dev))->tid_maps, hwtid); + if (t3c_tid && t3c_tid->ctx && t3c_tid->client->handlers && + t3c_tid->client->handlers[opcode]) { + return t3c_tid->client->handlers[opcode] (dev, skb, + t3c_tid->ctx); + } else { + pr_err("%s: received clientless CPL command 0x%x\n", + dev->name, opcode); + return CPL_RET_BUF_DONE | CPL_RET_BAD_MSG; + } +} + +static int nb_callback(struct notifier_block *self, unsigned long event, + void *ctx) +{ + switch (event) { + case (NETEVENT_NEIGH_UPDATE):{ + cxgb_neigh_update((struct neighbour *)ctx); + break; + } + case (NETEVENT_REDIRECT):{ + struct netevent_redirect *nr = ctx; + cxgb_redirect(nr->old, nr->new, nr->neigh, + nr->daddr); + cxgb_neigh_update(nr->neigh); + break; + } + default: + break; + } + return 0; +} + +static struct notifier_block nb = { + .notifier_call = nb_callback +}; + +/* + * Process a received packet with an unknown/unexpected CPL opcode. + */ +static int do_bad_cpl(struct t3cdev *dev, struct sk_buff *skb) +{ + pr_err("%s: received bad CPL command 0x%x\n", dev->name, *skb->data); + return CPL_RET_BUF_DONE | CPL_RET_BAD_MSG; +} + +/* + * Handlers for each CPL opcode + */ +static cpl_handler_func cpl_handlers[NUM_CPL_CMDS]; + +/* + * Add a new handler to the CPL dispatch table. A NULL handler may be supplied + * to unregister an existing handler. + */ +void t3_register_cpl_handler(unsigned int opcode, cpl_handler_func h) +{ + if (opcode < NUM_CPL_CMDS) + cpl_handlers[opcode] = h ? h : do_bad_cpl; + else + pr_err("T3C: handler registration for opcode %x failed\n", + opcode); +} + +EXPORT_SYMBOL(t3_register_cpl_handler); + +/* + * T3CDEV's receive method. + */ +static int process_rx(struct t3cdev *dev, struct sk_buff **skbs, int n) +{ + while (n--) { + struct sk_buff *skb = *skbs++; + unsigned int opcode = get_opcode(skb); + int ret = cpl_handlers[opcode] (dev, skb); + +#if VALIDATE_TID + if (ret & CPL_RET_UNKNOWN_TID) { + union opcode_tid *p = cplhdr(skb); + + pr_err("%s: CPL message (opcode %u) had unknown TID %u\n", + dev->name, opcode, G_TID(ntohl(p->opcode_tid))); + } +#endif + if (ret & CPL_RET_BUF_DONE) + kfree_skb(skb); + } + return 0; +} + +/* + * Sends an sk_buff to a T3C driver after dealing with any active network taps. + */ +int cxgb3_ofld_send(struct t3cdev *dev, struct sk_buff *skb) +{ + int r; + + local_bh_disable(); + r = dev->send(dev, skb); + local_bh_enable(); + return r; +} + +EXPORT_SYMBOL(cxgb3_ofld_send); + +static int is_offloading(struct net_device *dev) +{ + struct adapter *adapter; + int i; + + read_lock_bh(&adapter_list_lock); + list_for_each_entry(adapter, &adapter_list, adapter_list) { + for_each_port(adapter, i) { + if (dev == adapter->port[i]) { + read_unlock_bh(&adapter_list_lock); + return 1; + } + } + } + read_unlock_bh(&adapter_list_lock); + return 0; +} + +static void cxgb_neigh_update(struct neighbour *neigh) +{ + struct net_device *dev; + + if (!neigh) + return; + dev = neigh->dev; + if (dev && (is_offloading(dev))) { + struct t3cdev *tdev = dev2t3cdev(dev); + + BUG_ON(!tdev); + t3_l2t_update(tdev, neigh); + } +} + +static void set_l2t_ix(struct t3cdev *tdev, u32 tid, struct l2t_entry *e) +{ + struct sk_buff *skb; + struct cpl_set_tcb_field *req; + + skb = alloc_skb(sizeof(*req), GFP_ATOMIC); + if (!skb) { + pr_err("%s: cannot allocate skb!\n", __func__); + return; + } + skb->priority = CPL_PRIORITY_CONTROL; + req = (struct cpl_set_tcb_field *)skb_put(skb, sizeof(*req)); + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, tid)); + req->reply = 0; + req->cpu_idx = 0; + req->word = htons(W_TCB_L2T_IX); + req->mask = cpu_to_be64(V_TCB_L2T_IX(M_TCB_L2T_IX)); + req->val = cpu_to_be64(V_TCB_L2T_IX(e->idx)); + tdev->send(tdev, skb); +} + +static void cxgb_redirect(struct dst_entry *old, struct dst_entry *new, + struct neighbour *neigh, + const void *daddr) +{ + struct net_device *dev; + struct tid_info *ti; + struct t3cdev *tdev; + u32 tid; + int update_tcb; + struct l2t_entry *e; + struct t3c_tid_entry *te; + + dev = neigh->dev; + + if (!is_offloading(dev)) + return; + tdev = dev2t3cdev(dev); + BUG_ON(!tdev); + + /* Add new L2T entry */ + e = t3_l2t_get(tdev, new, dev, daddr); + if (!e) { + pr_err("%s: couldn't allocate new l2t entry!\n", __func__); + return; + } + + /* Walk tid table and notify clients of dst change. */ + ti = &(T3C_DATA(tdev))->tid_maps; + for (tid = 0; tid < ti->ntids; tid++) { + te = lookup_tid(ti, tid); + BUG_ON(!te); + if (te && te->ctx && te->client && te->client->redirect) { + update_tcb = te->client->redirect(te->ctx, old, new, e); + if (update_tcb) { + rcu_read_lock(); + l2t_hold(L2DATA(tdev), e); + rcu_read_unlock(); + set_l2t_ix(tdev, tid, e); + } + } + } + l2t_release(tdev, e); +} + +/* + * Allocate a chunk of memory using kmalloc or, if that fails, vmalloc. + * The allocated memory is cleared. + */ +void *cxgb_alloc_mem(unsigned long size) +{ + void *p = kzalloc(size, GFP_KERNEL | __GFP_NOWARN); + + if (!p) + p = vzalloc(size); + return p; +} + +/* + * Free memory allocated through t3_alloc_mem(). + */ +void cxgb_free_mem(void *addr) +{ + if (is_vmalloc_addr(addr)) + vfree(addr); + else + kfree(addr); +} + +/* + * Allocate and initialize the TID tables. Returns 0 on success. + */ +static int init_tid_tabs(struct tid_info *t, unsigned int ntids, + unsigned int natids, unsigned int nstids, + unsigned int atid_base, unsigned int stid_base) +{ + unsigned long size = ntids * sizeof(*t->tid_tab) + + natids * sizeof(*t->atid_tab) + nstids * sizeof(*t->stid_tab); + + t->tid_tab = cxgb_alloc_mem(size); + if (!t->tid_tab) + return -ENOMEM; + + t->stid_tab = (union listen_entry *)&t->tid_tab[ntids]; + t->atid_tab = (union active_open_entry *)&t->stid_tab[nstids]; + t->ntids = ntids; + t->nstids = nstids; + t->stid_base = stid_base; + t->sfree = NULL; + t->natids = natids; + t->atid_base = atid_base; + t->afree = NULL; + t->stids_in_use = t->atids_in_use = 0; + atomic_set(&t->tids_in_use, 0); + spin_lock_init(&t->stid_lock); + spin_lock_init(&t->atid_lock); + + /* + * Setup the free lists for stid_tab and atid_tab. + */ + if (nstids) { + while (--nstids) + t->stid_tab[nstids - 1].next = &t->stid_tab[nstids]; + t->sfree = t->stid_tab; + } + if (natids) { + while (--natids) + t->atid_tab[natids - 1].next = &t->atid_tab[natids]; + t->afree = t->atid_tab; + } + return 0; +} + +static void free_tid_maps(struct tid_info *t) +{ + cxgb_free_mem(t->tid_tab); +} + +static inline void add_adapter(struct adapter *adap) +{ + write_lock_bh(&adapter_list_lock); + list_add_tail(&adap->adapter_list, &adapter_list); + write_unlock_bh(&adapter_list_lock); +} + +static inline void remove_adapter(struct adapter *adap) +{ + write_lock_bh(&adapter_list_lock); + list_del(&adap->adapter_list); + write_unlock_bh(&adapter_list_lock); +} + +int cxgb3_offload_activate(struct adapter *adapter) +{ + struct t3cdev *dev = &adapter->tdev; + int natids, err; + struct t3c_data *t; + struct tid_range stid_range, tid_range; + struct mtutab mtutab; + unsigned int l2t_capacity; + struct l2t_data *l2td; + + t = kzalloc(sizeof(*t), GFP_KERNEL); + if (!t) + return -ENOMEM; + + err = -EOPNOTSUPP; + if (dev->ctl(dev, GET_TX_MAX_CHUNK, &t->tx_max_chunk) < 0 || + dev->ctl(dev, GET_MAX_OUTSTANDING_WR, &t->max_wrs) < 0 || + dev->ctl(dev, GET_L2T_CAPACITY, &l2t_capacity) < 0 || + dev->ctl(dev, GET_MTUS, &mtutab) < 0 || + dev->ctl(dev, GET_TID_RANGE, &tid_range) < 0 || + dev->ctl(dev, GET_STID_RANGE, &stid_range) < 0) + goto out_free; + + err = -ENOMEM; + l2td = t3_init_l2t(l2t_capacity); + if (!l2td) + goto out_free; + + natids = min(tid_range.num / 2, MAX_ATIDS); + err = init_tid_tabs(&t->tid_maps, tid_range.num, natids, + stid_range.num, ATID_BASE, stid_range.base); + if (err) + goto out_free_l2t; + + t->mtus = mtutab.mtus; + t->nmtus = mtutab.size; + + INIT_WORK(&t->tid_release_task, t3_process_tid_release_list); + spin_lock_init(&t->tid_release_lock); + INIT_LIST_HEAD(&t->list_node); + t->dev = dev; + + RCU_INIT_POINTER(dev->l2opt, l2td); + T3C_DATA(dev) = t; + dev->recv = process_rx; + dev->neigh_update = t3_l2t_update; + + /* Register netevent handler once */ + if (list_empty(&adapter_list)) + register_netevent_notifier(&nb); + + t->nofail_skb = alloc_skb(sizeof(struct cpl_tid_release), GFP_KERNEL); + t->release_list_incomplete = 0; + + add_adapter(adapter); + return 0; + +out_free_l2t: + t3_free_l2t(l2td); +out_free: + kfree(t); + return err; +} + +static void clean_l2_data(struct rcu_head *head) +{ + struct l2t_data *d = container_of(head, struct l2t_data, rcu_head); + t3_free_l2t(d); +} + + +void cxgb3_offload_deactivate(struct adapter *adapter) +{ + struct t3cdev *tdev = &adapter->tdev; + struct t3c_data *t = T3C_DATA(tdev); + struct l2t_data *d; + + remove_adapter(adapter); + if (list_empty(&adapter_list)) + unregister_netevent_notifier(&nb); + + free_tid_maps(&t->tid_maps); + T3C_DATA(tdev) = NULL; + rcu_read_lock(); + d = L2DATA(tdev); + rcu_read_unlock(); + RCU_INIT_POINTER(tdev->l2opt, NULL); + call_rcu(&d->rcu_head, clean_l2_data); + if (t->nofail_skb) + kfree_skb(t->nofail_skb); + kfree(t); +} + +static inline void register_tdev(struct t3cdev *tdev) +{ + static int unit; + + mutex_lock(&cxgb3_db_lock); + snprintf(tdev->name, sizeof(tdev->name), "ofld_dev%d", unit++); + list_add_tail(&tdev->ofld_dev_list, &ofld_dev_list); + mutex_unlock(&cxgb3_db_lock); +} + +static inline void unregister_tdev(struct t3cdev *tdev) +{ + mutex_lock(&cxgb3_db_lock); + list_del(&tdev->ofld_dev_list); + mutex_unlock(&cxgb3_db_lock); +} + +static inline int adap2type(struct adapter *adapter) +{ + int type = 0; + + switch (adapter->params.rev) { + case T3_REV_A: + type = T3A; + break; + case T3_REV_B: + case T3_REV_B2: + type = T3B; + break; + case T3_REV_C: + type = T3C; + break; + } + return type; +} + +void cxgb3_adapter_ofld(struct adapter *adapter) +{ + struct t3cdev *tdev = &adapter->tdev; + + INIT_LIST_HEAD(&tdev->ofld_dev_list); + + cxgb3_set_dummy_ops(tdev); + tdev->send = t3_offload_tx; + tdev->ctl = cxgb_offload_ctl; + tdev->type = adap2type(adapter); + + register_tdev(tdev); +} + +void cxgb3_adapter_unofld(struct adapter *adapter) +{ + struct t3cdev *tdev = &adapter->tdev; + + tdev->recv = NULL; + tdev->neigh_update = NULL; + + unregister_tdev(tdev); +} + +void __init cxgb3_offload_init(void) +{ + int i; + + for (i = 0; i < NUM_CPL_CMDS; ++i) + cpl_handlers[i] = do_bad_cpl; + + t3_register_cpl_handler(CPL_SMT_WRITE_RPL, do_smt_write_rpl); + t3_register_cpl_handler(CPL_L2T_WRITE_RPL, do_l2t_write_rpl); + t3_register_cpl_handler(CPL_RTE_WRITE_RPL, do_rte_write_rpl); + t3_register_cpl_handler(CPL_PASS_OPEN_RPL, do_stid_rpl); + t3_register_cpl_handler(CPL_CLOSE_LISTSRV_RPL, do_stid_rpl); + t3_register_cpl_handler(CPL_PASS_ACCEPT_REQ, do_cr); + t3_register_cpl_handler(CPL_PASS_ESTABLISH, do_hwtid_rpl); + t3_register_cpl_handler(CPL_ABORT_RPL_RSS, do_hwtid_rpl); + t3_register_cpl_handler(CPL_ABORT_RPL, do_hwtid_rpl); + t3_register_cpl_handler(CPL_RX_URG_NOTIFY, do_hwtid_rpl); + t3_register_cpl_handler(CPL_RX_DATA, do_hwtid_rpl); + t3_register_cpl_handler(CPL_TX_DATA_ACK, do_hwtid_rpl); + t3_register_cpl_handler(CPL_TX_DMA_ACK, do_hwtid_rpl); + t3_register_cpl_handler(CPL_ACT_OPEN_RPL, do_act_open_rpl); + t3_register_cpl_handler(CPL_PEER_CLOSE, do_hwtid_rpl); + t3_register_cpl_handler(CPL_CLOSE_CON_RPL, do_hwtid_rpl); + t3_register_cpl_handler(CPL_ABORT_REQ_RSS, do_abort_req_rss); + t3_register_cpl_handler(CPL_ACT_ESTABLISH, do_act_establish); + t3_register_cpl_handler(CPL_SET_TCB_RPL, do_hwtid_rpl); + t3_register_cpl_handler(CPL_GET_TCB_RPL, do_hwtid_rpl); + t3_register_cpl_handler(CPL_RDMA_TERMINATE, do_term); + t3_register_cpl_handler(CPL_RDMA_EC_STATUS, do_hwtid_rpl); + t3_register_cpl_handler(CPL_TRACE_PKT, do_trace); + t3_register_cpl_handler(CPL_RX_DATA_DDP, do_hwtid_rpl); + t3_register_cpl_handler(CPL_RX_DDP_COMPLETE, do_hwtid_rpl); + t3_register_cpl_handler(CPL_ISCSI_HDR, do_hwtid_rpl); +} diff --git a/drivers/net/ethernet/chelsio/cxgb3/cxgb3_offload.h b/drivers/net/ethernet/chelsio/cxgb3/cxgb3_offload.h new file mode 100644 index 0000000..929c298 --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb3/cxgb3_offload.h @@ -0,0 +1,209 @@ +/* + * Copyright (c) 2006-2008 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef _CXGB3_OFFLOAD_H +#define _CXGB3_OFFLOAD_H + +#include +#include + +#include "l2t.h" + +#include "t3cdev.h" +#include "t3_cpl.h" + +struct adapter; + +void cxgb3_offload_init(void); + +void cxgb3_adapter_ofld(struct adapter *adapter); +void cxgb3_adapter_unofld(struct adapter *adapter); +int cxgb3_offload_activate(struct adapter *adapter); +void cxgb3_offload_deactivate(struct adapter *adapter); + +void cxgb3_set_dummy_ops(struct t3cdev *dev); + +struct t3cdev *dev2t3cdev(struct net_device *dev); + +/* + * Client registration. Users of T3 driver must register themselves. + * The T3 driver will call the add function of every client for each T3 + * adapter activated, passing up the t3cdev ptr. Each client fills out an + * array of callback functions to process CPL messages. + */ + +void cxgb3_register_client(struct cxgb3_client *client); +void cxgb3_unregister_client(struct cxgb3_client *client); +void cxgb3_add_clients(struct t3cdev *tdev); +void cxgb3_remove_clients(struct t3cdev *tdev); +void cxgb3_event_notify(struct t3cdev *tdev, u32 event, u32 port); + +typedef int (*cxgb3_cpl_handler_func)(struct t3cdev *dev, + struct sk_buff *skb, void *ctx); + +enum { + OFFLOAD_STATUS_UP, + OFFLOAD_STATUS_DOWN, + OFFLOAD_PORT_DOWN, + OFFLOAD_PORT_UP, + OFFLOAD_DB_FULL, + OFFLOAD_DB_EMPTY, + OFFLOAD_DB_DROP +}; + +struct cxgb3_client { + char *name; + void (*add) (struct t3cdev *); + void (*remove) (struct t3cdev *); + cxgb3_cpl_handler_func *handlers; + int (*redirect)(void *ctx, struct dst_entry *old, + struct dst_entry *new, struct l2t_entry *l2t); + struct list_head client_list; + void (*event_handler)(struct t3cdev *tdev, u32 event, u32 port); +}; + +/* + * TID allocation services. + */ +int cxgb3_alloc_atid(struct t3cdev *dev, struct cxgb3_client *client, + void *ctx); +int cxgb3_alloc_stid(struct t3cdev *dev, struct cxgb3_client *client, + void *ctx); +void *cxgb3_free_atid(struct t3cdev *dev, int atid); +void cxgb3_free_stid(struct t3cdev *dev, int stid); +void cxgb3_insert_tid(struct t3cdev *dev, struct cxgb3_client *client, + void *ctx, unsigned int tid); +void cxgb3_queue_tid_release(struct t3cdev *dev, unsigned int tid); +void cxgb3_remove_tid(struct t3cdev *dev, void *ctx, unsigned int tid); + +struct t3c_tid_entry { + struct cxgb3_client *client; + void *ctx; +}; + +/* CPL message priority levels */ +enum { + CPL_PRIORITY_DATA = 0, /* data messages */ + CPL_PRIORITY_SETUP = 1, /* connection setup messages */ + CPL_PRIORITY_TEARDOWN = 0, /* connection teardown messages */ + CPL_PRIORITY_LISTEN = 1, /* listen start/stop messages */ + CPL_PRIORITY_ACK = 1, /* RX ACK messages */ + CPL_PRIORITY_CONTROL = 1 /* offload control messages */ +}; + +/* Flags for return value of CPL message handlers */ +enum { + CPL_RET_BUF_DONE = 1, /* buffer processing done, buffer may be freed */ + CPL_RET_BAD_MSG = 2, /* bad CPL message (e.g., unknown opcode) */ + CPL_RET_UNKNOWN_TID = 4 /* unexpected unknown TID */ +}; + +typedef int (*cpl_handler_func)(struct t3cdev *dev, struct sk_buff *skb); + +/* + * Returns a pointer to the first byte of the CPL header in an sk_buff that + * contains a CPL message. + */ +static inline void *cplhdr(struct sk_buff *skb) +{ + return skb->data; +} + +void t3_register_cpl_handler(unsigned int opcode, cpl_handler_func h); + +union listen_entry { + struct t3c_tid_entry t3c_tid; + union listen_entry *next; +}; + +union active_open_entry { + struct t3c_tid_entry t3c_tid; + union active_open_entry *next; +}; + +/* + * Holds the size, base address, free list start, etc of the TID, server TID, + * and active-open TID tables for a offload device. + * The tables themselves are allocated dynamically. + */ +struct tid_info { + struct t3c_tid_entry *tid_tab; + unsigned int ntids; + atomic_t tids_in_use; + + union listen_entry *stid_tab; + unsigned int nstids; + unsigned int stid_base; + + union active_open_entry *atid_tab; + unsigned int natids; + unsigned int atid_base; + + /* + * The following members are accessed R/W so we put them in their own + * cache lines. + * + * XXX We could combine the atid fields above with the lock here since + * atids are use once (unlike other tids). OTOH the above fields are + * usually in cache due to tid_tab. + */ + spinlock_t atid_lock ____cacheline_aligned_in_smp; + union active_open_entry *afree; + unsigned int atids_in_use; + + spinlock_t stid_lock ____cacheline_aligned; + union listen_entry *sfree; + unsigned int stids_in_use; +}; + +struct t3c_data { + struct list_head list_node; + struct t3cdev *dev; + unsigned int tx_max_chunk; /* max payload for TX_DATA */ + unsigned int max_wrs; /* max in-flight WRs per connection */ + unsigned int nmtus; + const unsigned short *mtus; + struct tid_info tid_maps; + + struct t3c_tid_entry *tid_release_list; + spinlock_t tid_release_lock; + struct work_struct tid_release_task; + + struct sk_buff *nofail_skb; + unsigned int release_list_incomplete; +}; + +/* + * t3cdev -> t3c_data accessor + */ +#define T3C_DATA(dev) (*(struct t3c_data **)&(dev)->l4opt) + +#endif diff --git a/drivers/net/ethernet/chelsio/cxgb3/firmware_exports.h b/drivers/net/ethernet/chelsio/cxgb3/firmware_exports.h new file mode 100644 index 0000000..0d9b0e6 --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb3/firmware_exports.h @@ -0,0 +1,177 @@ +/* + * Copyright (c) 2004-2008 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef _FIRMWARE_EXPORTS_H_ +#define _FIRMWARE_EXPORTS_H_ + +/* WR OPCODES supported by the firmware. + */ +#define FW_WROPCODE_FORWARD 0x01 +#define FW_WROPCODE_BYPASS 0x05 + +#define FW_WROPCODE_TUNNEL_TX_PKT 0x03 + +#define FW_WROPOCDE_ULPTX_DATA_SGL 0x00 +#define FW_WROPCODE_ULPTX_MEM_READ 0x02 +#define FW_WROPCODE_ULPTX_PKT 0x04 +#define FW_WROPCODE_ULPTX_INVALIDATE 0x06 + +#define FW_WROPCODE_TUNNEL_RX_PKT 0x07 + +#define FW_WROPCODE_OFLD_GETTCB_RPL 0x08 +#define FW_WROPCODE_OFLD_CLOSE_CON 0x09 +#define FW_WROPCODE_OFLD_TP_ABORT_CON_REQ 0x0A +#define FW_WROPCODE_OFLD_HOST_ABORT_CON_RPL 0x0F +#define FW_WROPCODE_OFLD_HOST_ABORT_CON_REQ 0x0B +#define FW_WROPCODE_OFLD_TP_ABORT_CON_RPL 0x0C +#define FW_WROPCODE_OFLD_TX_DATA 0x0D +#define FW_WROPCODE_OFLD_TX_DATA_ACK 0x0E + +#define FW_WROPCODE_RI_RDMA_INIT 0x10 +#define FW_WROPCODE_RI_RDMA_WRITE 0x11 +#define FW_WROPCODE_RI_RDMA_READ_REQ 0x12 +#define FW_WROPCODE_RI_RDMA_READ_RESP 0x13 +#define FW_WROPCODE_RI_SEND 0x14 +#define FW_WROPCODE_RI_TERMINATE 0x15 +#define FW_WROPCODE_RI_RDMA_READ 0x16 +#define FW_WROPCODE_RI_RECEIVE 0x17 +#define FW_WROPCODE_RI_BIND_MW 0x18 +#define FW_WROPCODE_RI_FASTREGISTER_MR 0x19 +#define FW_WROPCODE_RI_LOCAL_INV 0x1A +#define FW_WROPCODE_RI_MODIFY_QP 0x1B +#define FW_WROPCODE_RI_BYPASS 0x1C + +#define FW_WROPOCDE_RSVD 0x1E + +#define FW_WROPCODE_SGE_EGRESSCONTEXT_RR 0x1F + +#define FW_WROPCODE_MNGT 0x1D +#define FW_MNGTOPCODE_PKTSCHED_SET 0x00 + +/* Maximum size of a WR sent from the host, limited by the SGE. + * + * Note: WR coming from ULP or TP are only limited by CIM. + */ +#define FW_WR_SIZE 128 + +/* Maximum number of outstanding WRs sent from the host. Value must be + * programmed in the CTRL/TUNNEL/QP SGE Egress Context and used by + * offload modules to limit the number of WRs per connection. + */ +#define FW_T3_WR_NUM 16 +#define FW_N3_WR_NUM 7 + +#ifndef N3 +# define FW_WR_NUM FW_T3_WR_NUM +#else +# define FW_WR_NUM FW_N3_WR_NUM +#endif + +/* FW_TUNNEL_NUM corresponds to the number of supported TUNNEL Queues. These + * queues must start at SGE Egress Context FW_TUNNEL_SGEEC_START and must + * start at 'TID' (or 'uP Token') FW_TUNNEL_TID_START. + * + * Ingress Traffic (e.g. DMA completion credit) for TUNNEL Queue[i] is sent + * to RESP Queue[i]. + */ +#define FW_TUNNEL_NUM 8 +#define FW_TUNNEL_SGEEC_START 8 +#define FW_TUNNEL_TID_START 65544 + +/* FW_CTRL_NUM corresponds to the number of supported CTRL Queues. These queues + * must start at SGE Egress Context FW_CTRL_SGEEC_START and must start at 'TID' + * (or 'uP Token') FW_CTRL_TID_START. + * + * Ingress Traffic for CTRL Queue[i] is sent to RESP Queue[i]. + */ +#define FW_CTRL_NUM 8 +#define FW_CTRL_SGEEC_START 65528 +#define FW_CTRL_TID_START 65536 + +/* FW_OFLD_NUM corresponds to the number of supported OFFLOAD Queues. These + * queues must start at SGE Egress Context FW_OFLD_SGEEC_START. + * + * Note: the 'uP Token' in the SGE Egress Context fields is irrelevant for + * OFFLOAD Queues, as the host is responsible for providing the correct TID in + * every WR. + * + * Ingress Trafffic for OFFLOAD Queue[i] is sent to RESP Queue[i]. + */ +#define FW_OFLD_NUM 8 +#define FW_OFLD_SGEEC_START 0 + +/* + * + */ +#define FW_RI_NUM 1 +#define FW_RI_SGEEC_START 65527 +#define FW_RI_TID_START 65552 + +/* + * The RX_PKT_TID + */ +#define FW_RX_PKT_NUM 1 +#define FW_RX_PKT_TID_START 65553 + +/* FW_WRC_NUM corresponds to the number of Work Request Context that supported + * by the firmware. + */ +#define FW_WRC_NUM \ + (65536 + FW_TUNNEL_NUM + FW_CTRL_NUM + FW_RI_NUM + FW_RX_PKT_NUM) + +/* + * FW type and version. + */ +#define S_FW_VERSION_TYPE 28 +#define M_FW_VERSION_TYPE 0xF +#define V_FW_VERSION_TYPE(x) ((x) << S_FW_VERSION_TYPE) +#define G_FW_VERSION_TYPE(x) \ + (((x) >> S_FW_VERSION_TYPE) & M_FW_VERSION_TYPE) + +#define S_FW_VERSION_MAJOR 16 +#define M_FW_VERSION_MAJOR 0xFFF +#define V_FW_VERSION_MAJOR(x) ((x) << S_FW_VERSION_MAJOR) +#define G_FW_VERSION_MAJOR(x) \ + (((x) >> S_FW_VERSION_MAJOR) & M_FW_VERSION_MAJOR) + +#define S_FW_VERSION_MINOR 8 +#define M_FW_VERSION_MINOR 0xFF +#define V_FW_VERSION_MINOR(x) ((x) << S_FW_VERSION_MINOR) +#define G_FW_VERSION_MINOR(x) \ + (((x) >> S_FW_VERSION_MINOR) & M_FW_VERSION_MINOR) + +#define S_FW_VERSION_MICRO 0 +#define M_FW_VERSION_MICRO 0xFF +#define V_FW_VERSION_MICRO(x) ((x) << S_FW_VERSION_MICRO) +#define G_FW_VERSION_MICRO(x) \ + (((x) >> S_FW_VERSION_MICRO) & M_FW_VERSION_MICRO) + +#endif /* _FIRMWARE_EXPORTS_H_ */ diff --git a/drivers/net/ethernet/chelsio/cxgb3/l2t.c b/drivers/net/ethernet/chelsio/cxgb3/l2t.c new file mode 100644 index 0000000..5f226ed --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb3/l2t.c @@ -0,0 +1,470 @@ +/* + * Copyright (c) 2003-2008 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include "common.h" +#include "t3cdev.h" +#include "cxgb3_defs.h" +#include "l2t.h" +#include "t3_cpl.h" +#include "firmware_exports.h" + +#define VLAN_NONE 0xfff + +/* + * Module locking notes: There is a RW lock protecting the L2 table as a + * whole plus a spinlock per L2T entry. Entry lookups and allocations happen + * under the protection of the table lock, individual entry changes happen + * while holding that entry's spinlock. The table lock nests outside the + * entry locks. Allocations of new entries take the table lock as writers so + * no other lookups can happen while allocating new entries. Entry updates + * take the table lock as readers so multiple entries can be updated in + * parallel. An L2T entry can be dropped by decrementing its reference count + * and therefore can happen in parallel with entry allocation but no entry + * can change state or increment its ref count during allocation as both of + * these perform lookups. + */ + +static inline unsigned int vlan_prio(const struct l2t_entry *e) +{ + return e->vlan >> 13; +} + +static inline unsigned int arp_hash(u32 key, int ifindex, + const struct l2t_data *d) +{ + return jhash_2words(key, ifindex, 0) & (d->nentries - 1); +} + +static inline void neigh_replace(struct l2t_entry *e, struct neighbour *n) +{ + neigh_hold(n); + if (e->neigh) + neigh_release(e->neigh); + e->neigh = n; +} + +/* + * Set up an L2T entry and send any packets waiting in the arp queue. The + * supplied skb is used for the CPL_L2T_WRITE_REQ. Must be called with the + * entry locked. + */ +static int setup_l2e_send_pending(struct t3cdev *dev, struct sk_buff *skb, + struct l2t_entry *e) +{ + struct cpl_l2t_write_req *req; + struct sk_buff *tmp; + + if (!skb) { + skb = alloc_skb(sizeof(*req), GFP_ATOMIC); + if (!skb) + return -ENOMEM; + } + + req = (struct cpl_l2t_write_req *)__skb_put(skb, sizeof(*req)); + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_L2T_WRITE_REQ, e->idx)); + req->params = htonl(V_L2T_W_IDX(e->idx) | V_L2T_W_IFF(e->smt_idx) | + V_L2T_W_VLAN(e->vlan & VLAN_VID_MASK) | + V_L2T_W_PRIO(vlan_prio(e))); + memcpy(e->dmac, e->neigh->ha, sizeof(e->dmac)); + memcpy(req->dst_mac, e->dmac, sizeof(req->dst_mac)); + skb->priority = CPL_PRIORITY_CONTROL; + cxgb3_ofld_send(dev, skb); + + skb_queue_walk_safe(&e->arpq, skb, tmp) { + __skb_unlink(skb, &e->arpq); + cxgb3_ofld_send(dev, skb); + } + e->state = L2T_STATE_VALID; + + return 0; +} + +/* + * Add a packet to the an L2T entry's queue of packets awaiting resolution. + * Must be called with the entry's lock held. + */ +static inline void arpq_enqueue(struct l2t_entry *e, struct sk_buff *skb) +{ + __skb_queue_tail(&e->arpq, skb); +} + +int t3_l2t_send_slow(struct t3cdev *dev, struct sk_buff *skb, + struct l2t_entry *e) +{ +again: + switch (e->state) { + case L2T_STATE_STALE: /* entry is stale, kick off revalidation */ + neigh_event_send(e->neigh, NULL); + spin_lock_bh(&e->lock); + if (e->state == L2T_STATE_STALE) + e->state = L2T_STATE_VALID; + spin_unlock_bh(&e->lock); + case L2T_STATE_VALID: /* fast-path, send the packet on */ + return cxgb3_ofld_send(dev, skb); + case L2T_STATE_RESOLVING: + spin_lock_bh(&e->lock); + if (e->state != L2T_STATE_RESOLVING) { + /* ARP already completed */ + spin_unlock_bh(&e->lock); + goto again; + } + arpq_enqueue(e, skb); + spin_unlock_bh(&e->lock); + + /* + * Only the first packet added to the arpq should kick off + * resolution. However, because the alloc_skb below can fail, + * we allow each packet added to the arpq to retry resolution + * as a way of recovering from transient memory exhaustion. + * A better way would be to use a work request to retry L2T + * entries when there's no memory. + */ + if (!neigh_event_send(e->neigh, NULL)) { + skb = alloc_skb(sizeof(struct cpl_l2t_write_req), + GFP_ATOMIC); + if (!skb) + break; + + spin_lock_bh(&e->lock); + if (!skb_queue_empty(&e->arpq)) + setup_l2e_send_pending(dev, skb, e); + else /* we lost the race */ + __kfree_skb(skb); + spin_unlock_bh(&e->lock); + } + } + return 0; +} + +EXPORT_SYMBOL(t3_l2t_send_slow); + +void t3_l2t_send_event(struct t3cdev *dev, struct l2t_entry *e) +{ +again: + switch (e->state) { + case L2T_STATE_STALE: /* entry is stale, kick off revalidation */ + neigh_event_send(e->neigh, NULL); + spin_lock_bh(&e->lock); + if (e->state == L2T_STATE_STALE) { + e->state = L2T_STATE_VALID; + } + spin_unlock_bh(&e->lock); + return; + case L2T_STATE_VALID: /* fast-path, send the packet on */ + return; + case L2T_STATE_RESOLVING: + spin_lock_bh(&e->lock); + if (e->state != L2T_STATE_RESOLVING) { + /* ARP already completed */ + spin_unlock_bh(&e->lock); + goto again; + } + spin_unlock_bh(&e->lock); + + /* + * Only the first packet added to the arpq should kick off + * resolution. However, because the alloc_skb below can fail, + * we allow each packet added to the arpq to retry resolution + * as a way of recovering from transient memory exhaustion. + * A better way would be to use a work request to retry L2T + * entries when there's no memory. + */ + neigh_event_send(e->neigh, NULL); + } +} + +EXPORT_SYMBOL(t3_l2t_send_event); + +/* + * Allocate a free L2T entry. Must be called with l2t_data.lock held. + */ +static struct l2t_entry *alloc_l2e(struct l2t_data *d) +{ + struct l2t_entry *end, *e, **p; + + if (!atomic_read(&d->nfree)) + return NULL; + + /* there's definitely a free entry */ + for (e = d->rover, end = &d->l2tab[d->nentries]; e != end; ++e) + if (atomic_read(&e->refcnt) == 0) + goto found; + + for (e = &d->l2tab[1]; atomic_read(&e->refcnt); ++e) ; +found: + d->rover = e + 1; + atomic_dec(&d->nfree); + + /* + * The entry we found may be an inactive entry that is + * presently in the hash table. We need to remove it. + */ + if (e->state != L2T_STATE_UNUSED) { + int hash = arp_hash(e->addr, e->ifindex, d); + + for (p = &d->l2tab[hash].first; *p; p = &(*p)->next) + if (*p == e) { + *p = e->next; + break; + } + e->state = L2T_STATE_UNUSED; + } + return e; +} + +/* + * Called when an L2T entry has no more users. The entry is left in the hash + * table since it is likely to be reused but we also bump nfree to indicate + * that the entry can be reallocated for a different neighbor. We also drop + * the existing neighbor reference in case the neighbor is going away and is + * waiting on our reference. + * + * Because entries can be reallocated to other neighbors once their ref count + * drops to 0 we need to take the entry's lock to avoid races with a new + * incarnation. + */ +void t3_l2e_free(struct l2t_data *d, struct l2t_entry *e) +{ + spin_lock_bh(&e->lock); + if (atomic_read(&e->refcnt) == 0) { /* hasn't been recycled */ + if (e->neigh) { + neigh_release(e->neigh); + e->neigh = NULL; + } + } + spin_unlock_bh(&e->lock); + atomic_inc(&d->nfree); +} + +EXPORT_SYMBOL(t3_l2e_free); + +/* + * Update an L2T entry that was previously used for the same next hop as neigh. + * Must be called with softirqs disabled. + */ +static inline void reuse_entry(struct l2t_entry *e, struct neighbour *neigh) +{ + unsigned int nud_state; + + spin_lock(&e->lock); /* avoid race with t3_l2t_free */ + + if (neigh != e->neigh) + neigh_replace(e, neigh); + nud_state = neigh->nud_state; + if (memcmp(e->dmac, neigh->ha, sizeof(e->dmac)) || + !(nud_state & NUD_VALID)) + e->state = L2T_STATE_RESOLVING; + else if (nud_state & NUD_CONNECTED) + e->state = L2T_STATE_VALID; + else + e->state = L2T_STATE_STALE; + spin_unlock(&e->lock); +} + +struct l2t_entry *t3_l2t_get(struct t3cdev *cdev, struct dst_entry *dst, + struct net_device *dev, const void *daddr) +{ + struct l2t_entry *e = NULL; + struct neighbour *neigh; + struct port_info *p; + struct l2t_data *d; + int hash; + u32 addr; + int ifidx; + int smt_idx; + + rcu_read_lock(); + neigh = dst_neigh_lookup(dst, daddr); + if (!neigh) + goto done_rcu; + + addr = *(u32 *) neigh->primary_key; + ifidx = neigh->dev->ifindex; + + if (!dev) + dev = neigh->dev; + p = netdev_priv(dev); + smt_idx = p->port_id; + + d = L2DATA(cdev); + if (!d) + goto done_rcu; + + hash = arp_hash(addr, ifidx, d); + + write_lock_bh(&d->lock); + for (e = d->l2tab[hash].first; e; e = e->next) + if (e->addr == addr && e->ifindex == ifidx && + e->smt_idx == smt_idx) { + l2t_hold(d, e); + if (atomic_read(&e->refcnt) == 1) + reuse_entry(e, neigh); + goto done_unlock; + } + + /* Need to allocate a new entry */ + e = alloc_l2e(d); + if (e) { + spin_lock(&e->lock); /* avoid race with t3_l2t_free */ + e->next = d->l2tab[hash].first; + d->l2tab[hash].first = e; + e->state = L2T_STATE_RESOLVING; + e->addr = addr; + e->ifindex = ifidx; + e->smt_idx = smt_idx; + atomic_set(&e->refcnt, 1); + neigh_replace(e, neigh); + if (neigh->dev->priv_flags & IFF_802_1Q_VLAN) + e->vlan = vlan_dev_vlan_id(neigh->dev); + else + e->vlan = VLAN_NONE; + spin_unlock(&e->lock); + } +done_unlock: + write_unlock_bh(&d->lock); +done_rcu: + if (neigh) + neigh_release(neigh); + rcu_read_unlock(); + return e; +} + +EXPORT_SYMBOL(t3_l2t_get); + +/* + * Called when address resolution fails for an L2T entry to handle packets + * on the arpq head. If a packet specifies a failure handler it is invoked, + * otherwise the packets is sent to the offload device. + * + * XXX: maybe we should abandon the latter behavior and just require a failure + * handler. + */ +static void handle_failed_resolution(struct t3cdev *dev, struct sk_buff_head *arpq) +{ + struct sk_buff *skb, *tmp; + + skb_queue_walk_safe(arpq, skb, tmp) { + struct l2t_skb_cb *cb = L2T_SKB_CB(skb); + + __skb_unlink(skb, arpq); + if (cb->arp_failure_handler) + cb->arp_failure_handler(dev, skb); + else + cxgb3_ofld_send(dev, skb); + } +} + +/* + * Called when the host's ARP layer makes a change to some entry that is + * loaded into the HW L2 table. + */ +void t3_l2t_update(struct t3cdev *dev, struct neighbour *neigh) +{ + struct sk_buff_head arpq; + struct l2t_entry *e; + struct l2t_data *d = L2DATA(dev); + u32 addr = *(u32 *) neigh->primary_key; + int ifidx = neigh->dev->ifindex; + int hash = arp_hash(addr, ifidx, d); + + read_lock_bh(&d->lock); + for (e = d->l2tab[hash].first; e; e = e->next) + if (e->addr == addr && e->ifindex == ifidx) { + spin_lock(&e->lock); + goto found; + } + read_unlock_bh(&d->lock); + return; + +found: + __skb_queue_head_init(&arpq); + + read_unlock(&d->lock); + if (atomic_read(&e->refcnt)) { + if (neigh != e->neigh) + neigh_replace(e, neigh); + + if (e->state == L2T_STATE_RESOLVING) { + if (neigh->nud_state & NUD_FAILED) { + skb_queue_splice_init(&e->arpq, &arpq); + } else if (neigh->nud_state & (NUD_CONNECTED|NUD_STALE)) + setup_l2e_send_pending(dev, NULL, e); + } else { + e->state = neigh->nud_state & NUD_CONNECTED ? + L2T_STATE_VALID : L2T_STATE_STALE; + if (!ether_addr_equal(e->dmac, neigh->ha)) + setup_l2e_send_pending(dev, NULL, e); + } + } + spin_unlock_bh(&e->lock); + + if (!skb_queue_empty(&arpq)) + handle_failed_resolution(dev, &arpq); +} + +struct l2t_data *t3_init_l2t(unsigned int l2t_capacity) +{ + struct l2t_data *d; + int i, size = sizeof(*d) + l2t_capacity * sizeof(struct l2t_entry); + + d = cxgb_alloc_mem(size); + if (!d) + return NULL; + + d->nentries = l2t_capacity; + d->rover = &d->l2tab[1]; /* entry 0 is not used */ + atomic_set(&d->nfree, l2t_capacity - 1); + rwlock_init(&d->lock); + + for (i = 0; i < l2t_capacity; ++i) { + d->l2tab[i].idx = i; + d->l2tab[i].state = L2T_STATE_UNUSED; + __skb_queue_head_init(&d->l2tab[i].arpq); + spin_lock_init(&d->l2tab[i].lock); + atomic_set(&d->l2tab[i].refcnt, 0); + } + return d; +} + +void t3_free_l2t(struct l2t_data *d) +{ + cxgb_free_mem(d); +} + diff --git a/drivers/net/ethernet/chelsio/cxgb3/l2t.h b/drivers/net/ethernet/chelsio/cxgb3/l2t.h new file mode 100644 index 0000000..8cffcdf --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb3/l2t.h @@ -0,0 +1,149 @@ +/* + * Copyright (c) 2003-2008 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef _CHELSIO_L2T_H +#define _CHELSIO_L2T_H + +#include +#include "t3cdev.h" +#include + +enum { + L2T_STATE_VALID, /* entry is up to date */ + L2T_STATE_STALE, /* entry may be used but needs revalidation */ + L2T_STATE_RESOLVING, /* entry needs address resolution */ + L2T_STATE_UNUSED /* entry not in use */ +}; + +struct neighbour; +struct sk_buff; + +/* + * Each L2T entry plays multiple roles. First of all, it keeps state for the + * corresponding entry of the HW L2 table and maintains a queue of offload + * packets awaiting address resolution. Second, it is a node of a hash table + * chain, where the nodes of the chain are linked together through their next + * pointer. Finally, each node is a bucket of a hash table, pointing to the + * first element in its chain through its first pointer. + */ +struct l2t_entry { + u16 state; /* entry state */ + u16 idx; /* entry index */ + u32 addr; /* dest IP address */ + int ifindex; /* neighbor's net_device's ifindex */ + u16 smt_idx; /* SMT index */ + u16 vlan; /* VLAN TCI (id: bits 0-11, prio: 13-15 */ + struct neighbour *neigh; /* associated neighbour */ + struct l2t_entry *first; /* start of hash chain */ + struct l2t_entry *next; /* next l2t_entry on chain */ + struct sk_buff_head arpq; /* queue of packets awaiting resolution */ + spinlock_t lock; + atomic_t refcnt; /* entry reference count */ + u8 dmac[6]; /* neighbour's MAC address */ +}; + +struct l2t_data { + unsigned int nentries; /* number of entries */ + struct l2t_entry *rover; /* starting point for next allocation */ + atomic_t nfree; /* number of free entries */ + rwlock_t lock; + struct l2t_entry l2tab[0]; + struct rcu_head rcu_head; /* to handle rcu cleanup */ +}; + +typedef void (*arp_failure_handler_func)(struct t3cdev * dev, + struct sk_buff * skb); + +/* + * Callback stored in an skb to handle address resolution failure. + */ +struct l2t_skb_cb { + arp_failure_handler_func arp_failure_handler; +}; + +#define L2T_SKB_CB(skb) ((struct l2t_skb_cb *)(skb)->cb) + +static inline void set_arp_failure_handler(struct sk_buff *skb, + arp_failure_handler_func hnd) +{ + L2T_SKB_CB(skb)->arp_failure_handler = hnd; +} + +/* + * Getting to the L2 data from an offload device. + */ +#define L2DATA(cdev) (rcu_dereference((cdev)->l2opt)) + +#define W_TCB_L2T_IX 0 +#define S_TCB_L2T_IX 7 +#define M_TCB_L2T_IX 0x7ffULL +#define V_TCB_L2T_IX(x) ((x) << S_TCB_L2T_IX) + +void t3_l2e_free(struct l2t_data *d, struct l2t_entry *e); +void t3_l2t_update(struct t3cdev *dev, struct neighbour *neigh); +struct l2t_entry *t3_l2t_get(struct t3cdev *cdev, struct dst_entry *dst, + struct net_device *dev, const void *daddr); +int t3_l2t_send_slow(struct t3cdev *dev, struct sk_buff *skb, + struct l2t_entry *e); +void t3_l2t_send_event(struct t3cdev *dev, struct l2t_entry *e); +struct l2t_data *t3_init_l2t(unsigned int l2t_capacity); +void t3_free_l2t(struct l2t_data *d); + +int cxgb3_ofld_send(struct t3cdev *dev, struct sk_buff *skb); + +static inline int l2t_send(struct t3cdev *dev, struct sk_buff *skb, + struct l2t_entry *e) +{ + if (likely(e->state == L2T_STATE_VALID)) + return cxgb3_ofld_send(dev, skb); + return t3_l2t_send_slow(dev, skb, e); +} + +static inline void l2t_release(struct t3cdev *t, struct l2t_entry *e) +{ + struct l2t_data *d; + + rcu_read_lock(); + d = L2DATA(t); + + if (atomic_dec_and_test(&e->refcnt) && d) + t3_l2e_free(d, e); + + rcu_read_unlock(); +} + +static inline void l2t_hold(struct l2t_data *d, struct l2t_entry *e) +{ + if (d && atomic_add_return(1, &e->refcnt) == 1) /* 0 -> 1 transition */ + atomic_dec(&d->nfree); +} + +#endif diff --git a/drivers/net/ethernet/chelsio/cxgb3/mc5.c b/drivers/net/ethernet/chelsio/cxgb3/mc5.c new file mode 100644 index 0000000..e13b7fe --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb3/mc5.c @@ -0,0 +1,438 @@ +/* + * Copyright (c) 2003-2008 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "common.h" +#include "regs.h" + +enum { + IDT75P52100 = 4, + IDT75N43102 = 5 +}; + +/* DBGI command mode */ +enum { + DBGI_MODE_MBUS = 0, + DBGI_MODE_IDT52100 = 5 +}; + +/* IDT 75P52100 commands */ +#define IDT_CMD_READ 0 +#define IDT_CMD_WRITE 1 +#define IDT_CMD_SEARCH 2 +#define IDT_CMD_LEARN 3 + +/* IDT LAR register address and value for 144-bit mode (low 32 bits) */ +#define IDT_LAR_ADR0 0x180006 +#define IDT_LAR_MODE144 0xffff0000 + +/* IDT SCR and SSR addresses (low 32 bits) */ +#define IDT_SCR_ADR0 0x180000 +#define IDT_SSR0_ADR0 0x180002 +#define IDT_SSR1_ADR0 0x180004 + +/* IDT GMR base address (low 32 bits) */ +#define IDT_GMR_BASE_ADR0 0x180020 + +/* IDT data and mask array base addresses (low 32 bits) */ +#define IDT_DATARY_BASE_ADR0 0 +#define IDT_MSKARY_BASE_ADR0 0x80000 + +/* IDT 75N43102 commands */ +#define IDT4_CMD_SEARCH144 3 +#define IDT4_CMD_WRITE 4 +#define IDT4_CMD_READ 5 + +/* IDT 75N43102 SCR address (low 32 bits) */ +#define IDT4_SCR_ADR0 0x3 + +/* IDT 75N43102 GMR base addresses (low 32 bits) */ +#define IDT4_GMR_BASE0 0x10 +#define IDT4_GMR_BASE1 0x20 +#define IDT4_GMR_BASE2 0x30 + +/* IDT 75N43102 data and mask array base addresses (low 32 bits) */ +#define IDT4_DATARY_BASE_ADR0 0x1000000 +#define IDT4_MSKARY_BASE_ADR0 0x2000000 + +#define MAX_WRITE_ATTEMPTS 5 + +#define MAX_ROUTES 2048 + +/* + * Issue a command to the TCAM and wait for its completion. The address and + * any data required by the command must have been setup by the caller. + */ +static int mc5_cmd_write(struct adapter *adapter, u32 cmd) +{ + t3_write_reg(adapter, A_MC5_DB_DBGI_REQ_CMD, cmd); + return t3_wait_op_done(adapter, A_MC5_DB_DBGI_RSP_STATUS, + F_DBGIRSPVALID, 1, MAX_WRITE_ATTEMPTS, 1); +} + +static inline void dbgi_wr_addr3(struct adapter *adapter, u32 v1, u32 v2, + u32 v3) +{ + t3_write_reg(adapter, A_MC5_DB_DBGI_REQ_ADDR0, v1); + t3_write_reg(adapter, A_MC5_DB_DBGI_REQ_ADDR1, v2); + t3_write_reg(adapter, A_MC5_DB_DBGI_REQ_ADDR2, v3); +} + +static inline void dbgi_wr_data3(struct adapter *adapter, u32 v1, u32 v2, + u32 v3) +{ + t3_write_reg(adapter, A_MC5_DB_DBGI_REQ_DATA0, v1); + t3_write_reg(adapter, A_MC5_DB_DBGI_REQ_DATA1, v2); + t3_write_reg(adapter, A_MC5_DB_DBGI_REQ_DATA2, v3); +} + +static inline void dbgi_rd_rsp3(struct adapter *adapter, u32 *v1, u32 *v2, + u32 *v3) +{ + *v1 = t3_read_reg(adapter, A_MC5_DB_DBGI_RSP_DATA0); + *v2 = t3_read_reg(adapter, A_MC5_DB_DBGI_RSP_DATA1); + *v3 = t3_read_reg(adapter, A_MC5_DB_DBGI_RSP_DATA2); +} + +/* + * Write data to the TCAM register at address (0, 0, addr_lo) using the TCAM + * command cmd. The data to be written must have been set up by the caller. + * Returns -1 on failure, 0 on success. + */ +static int mc5_write(struct adapter *adapter, u32 addr_lo, u32 cmd) +{ + t3_write_reg(adapter, A_MC5_DB_DBGI_REQ_ADDR0, addr_lo); + if (mc5_cmd_write(adapter, cmd) == 0) + return 0; + CH_ERR(adapter, "MC5 timeout writing to TCAM address 0x%x\n", + addr_lo); + return -1; +} + +static int init_mask_data_array(struct mc5 *mc5, u32 mask_array_base, + u32 data_array_base, u32 write_cmd, + int addr_shift) +{ + unsigned int i; + struct adapter *adap = mc5->adapter; + + /* + * We need the size of the TCAM data and mask arrays in terms of + * 72-bit entries. + */ + unsigned int size72 = mc5->tcam_size; + unsigned int server_base = t3_read_reg(adap, A_MC5_DB_SERVER_INDEX); + + if (mc5->mode == MC5_MODE_144_BIT) { + size72 *= 2; /* 1 144-bit entry is 2 72-bit entries */ + server_base *= 2; + } + + /* Clear the data array */ + dbgi_wr_data3(adap, 0, 0, 0); + for (i = 0; i < size72; i++) + if (mc5_write(adap, data_array_base + (i << addr_shift), + write_cmd)) + return -1; + + /* Initialize the mask array. */ + dbgi_wr_data3(adap, 0xffffffff, 0xffffffff, 0xff); + for (i = 0; i < size72; i++) { + if (i == server_base) /* entering server or routing region */ + t3_write_reg(adap, A_MC5_DB_DBGI_REQ_DATA0, + mc5->mode == MC5_MODE_144_BIT ? + 0xfffffff9 : 0xfffffffd); + if (mc5_write(adap, mask_array_base + (i << addr_shift), + write_cmd)) + return -1; + } + return 0; +} + +static int init_idt52100(struct mc5 *mc5) +{ + int i; + struct adapter *adap = mc5->adapter; + + t3_write_reg(adap, A_MC5_DB_RSP_LATENCY, + V_RDLAT(0x15) | V_LRNLAT(0x15) | V_SRCHLAT(0x15)); + t3_write_reg(adap, A_MC5_DB_PART_ID_INDEX, 2); + + /* + * Use GMRs 14-15 for ELOOKUP, GMRs 12-13 for SYN lookups, and + * GMRs 8-9 for ACK- and AOPEN searches. + */ + t3_write_reg(adap, A_MC5_DB_POPEN_DATA_WR_CMD, IDT_CMD_WRITE); + t3_write_reg(adap, A_MC5_DB_POPEN_MASK_WR_CMD, IDT_CMD_WRITE); + t3_write_reg(adap, A_MC5_DB_AOPEN_SRCH_CMD, IDT_CMD_SEARCH); + t3_write_reg(adap, A_MC5_DB_AOPEN_LRN_CMD, IDT_CMD_LEARN); + t3_write_reg(adap, A_MC5_DB_SYN_SRCH_CMD, IDT_CMD_SEARCH | 0x6000); + t3_write_reg(adap, A_MC5_DB_SYN_LRN_CMD, IDT_CMD_LEARN); + t3_write_reg(adap, A_MC5_DB_ACK_SRCH_CMD, IDT_CMD_SEARCH); + t3_write_reg(adap, A_MC5_DB_ACK_LRN_CMD, IDT_CMD_LEARN); + t3_write_reg(adap, A_MC5_DB_ILOOKUP_CMD, IDT_CMD_SEARCH); + t3_write_reg(adap, A_MC5_DB_ELOOKUP_CMD, IDT_CMD_SEARCH | 0x7000); + t3_write_reg(adap, A_MC5_DB_DATA_WRITE_CMD, IDT_CMD_WRITE); + t3_write_reg(adap, A_MC5_DB_DATA_READ_CMD, IDT_CMD_READ); + + /* Set DBGI command mode for IDT TCAM. */ + t3_write_reg(adap, A_MC5_DB_DBGI_CONFIG, DBGI_MODE_IDT52100); + + /* Set up LAR */ + dbgi_wr_data3(adap, IDT_LAR_MODE144, 0, 0); + if (mc5_write(adap, IDT_LAR_ADR0, IDT_CMD_WRITE)) + goto err; + + /* Set up SSRs */ + dbgi_wr_data3(adap, 0xffffffff, 0xffffffff, 0); + if (mc5_write(adap, IDT_SSR0_ADR0, IDT_CMD_WRITE) || + mc5_write(adap, IDT_SSR1_ADR0, IDT_CMD_WRITE)) + goto err; + + /* Set up GMRs */ + for (i = 0; i < 32; ++i) { + if (i >= 12 && i < 15) + dbgi_wr_data3(adap, 0xfffffff9, 0xffffffff, 0xff); + else if (i == 15) + dbgi_wr_data3(adap, 0xfffffff9, 0xffff8007, 0xff); + else + dbgi_wr_data3(adap, 0xffffffff, 0xffffffff, 0xff); + + if (mc5_write(adap, IDT_GMR_BASE_ADR0 + i, IDT_CMD_WRITE)) + goto err; + } + + /* Set up SCR */ + dbgi_wr_data3(adap, 1, 0, 0); + if (mc5_write(adap, IDT_SCR_ADR0, IDT_CMD_WRITE)) + goto err; + + return init_mask_data_array(mc5, IDT_MSKARY_BASE_ADR0, + IDT_DATARY_BASE_ADR0, IDT_CMD_WRITE, 0); +err: + return -EIO; +} + +static int init_idt43102(struct mc5 *mc5) +{ + int i; + struct adapter *adap = mc5->adapter; + + t3_write_reg(adap, A_MC5_DB_RSP_LATENCY, + adap->params.rev == 0 ? V_RDLAT(0xd) | V_SRCHLAT(0x11) : + V_RDLAT(0xd) | V_SRCHLAT(0x12)); + + /* + * Use GMRs 24-25 for ELOOKUP, GMRs 20-21 for SYN lookups, and no mask + * for ACK- and AOPEN searches. + */ + t3_write_reg(adap, A_MC5_DB_POPEN_DATA_WR_CMD, IDT4_CMD_WRITE); + t3_write_reg(adap, A_MC5_DB_POPEN_MASK_WR_CMD, IDT4_CMD_WRITE); + t3_write_reg(adap, A_MC5_DB_AOPEN_SRCH_CMD, + IDT4_CMD_SEARCH144 | 0x3800); + t3_write_reg(adap, A_MC5_DB_SYN_SRCH_CMD, IDT4_CMD_SEARCH144); + t3_write_reg(adap, A_MC5_DB_ACK_SRCH_CMD, IDT4_CMD_SEARCH144 | 0x3800); + t3_write_reg(adap, A_MC5_DB_ILOOKUP_CMD, IDT4_CMD_SEARCH144 | 0x3800); + t3_write_reg(adap, A_MC5_DB_ELOOKUP_CMD, IDT4_CMD_SEARCH144 | 0x800); + t3_write_reg(adap, A_MC5_DB_DATA_WRITE_CMD, IDT4_CMD_WRITE); + t3_write_reg(adap, A_MC5_DB_DATA_READ_CMD, IDT4_CMD_READ); + + t3_write_reg(adap, A_MC5_DB_PART_ID_INDEX, 3); + + /* Set DBGI command mode for IDT TCAM. */ + t3_write_reg(adap, A_MC5_DB_DBGI_CONFIG, DBGI_MODE_IDT52100); + + /* Set up GMRs */ + dbgi_wr_data3(adap, 0xffffffff, 0xffffffff, 0xff); + for (i = 0; i < 7; ++i) + if (mc5_write(adap, IDT4_GMR_BASE0 + i, IDT4_CMD_WRITE)) + goto err; + + for (i = 0; i < 4; ++i) + if (mc5_write(adap, IDT4_GMR_BASE2 + i, IDT4_CMD_WRITE)) + goto err; + + dbgi_wr_data3(adap, 0xfffffff9, 0xffffffff, 0xff); + if (mc5_write(adap, IDT4_GMR_BASE1, IDT4_CMD_WRITE) || + mc5_write(adap, IDT4_GMR_BASE1 + 1, IDT4_CMD_WRITE) || + mc5_write(adap, IDT4_GMR_BASE1 + 4, IDT4_CMD_WRITE)) + goto err; + + dbgi_wr_data3(adap, 0xfffffff9, 0xffff8007, 0xff); + if (mc5_write(adap, IDT4_GMR_BASE1 + 5, IDT4_CMD_WRITE)) + goto err; + + /* Set up SCR */ + dbgi_wr_data3(adap, 0xf0000000, 0, 0); + if (mc5_write(adap, IDT4_SCR_ADR0, IDT4_CMD_WRITE)) + goto err; + + return init_mask_data_array(mc5, IDT4_MSKARY_BASE_ADR0, + IDT4_DATARY_BASE_ADR0, IDT4_CMD_WRITE, 1); +err: + return -EIO; +} + +/* Put MC5 in DBGI mode. */ +static inline void mc5_dbgi_mode_enable(const struct mc5 *mc5) +{ + t3_write_reg(mc5->adapter, A_MC5_DB_CONFIG, + V_TMMODE(mc5->mode == MC5_MODE_72_BIT) | F_DBGIEN); +} + +/* Put MC5 in M-Bus mode. */ +static void mc5_dbgi_mode_disable(const struct mc5 *mc5) +{ + t3_write_reg(mc5->adapter, A_MC5_DB_CONFIG, + V_TMMODE(mc5->mode == MC5_MODE_72_BIT) | + V_COMPEN(mc5->mode == MC5_MODE_72_BIT) | + V_PRTYEN(mc5->parity_enabled) | F_MBUSEN); +} + +/* + * Initialization that requires the OS and protocol layers to already + * be initialized goes here. + */ +int t3_mc5_init(struct mc5 *mc5, unsigned int nservers, unsigned int nfilters, + unsigned int nroutes) +{ + u32 cfg; + int err; + unsigned int tcam_size = mc5->tcam_size; + struct adapter *adap = mc5->adapter; + + if (!tcam_size) + return 0; + + if (nroutes > MAX_ROUTES || nroutes + nservers + nfilters > tcam_size) + return -EINVAL; + + /* Reset the TCAM */ + cfg = t3_read_reg(adap, A_MC5_DB_CONFIG) & ~F_TMMODE; + cfg |= V_TMMODE(mc5->mode == MC5_MODE_72_BIT) | F_TMRST; + t3_write_reg(adap, A_MC5_DB_CONFIG, cfg); + if (t3_wait_op_done(adap, A_MC5_DB_CONFIG, F_TMRDY, 1, 500, 0)) { + CH_ERR(adap, "TCAM reset timed out\n"); + return -1; + } + + t3_write_reg(adap, A_MC5_DB_ROUTING_TABLE_INDEX, tcam_size - nroutes); + t3_write_reg(adap, A_MC5_DB_FILTER_TABLE, + tcam_size - nroutes - nfilters); + t3_write_reg(adap, A_MC5_DB_SERVER_INDEX, + tcam_size - nroutes - nfilters - nservers); + + mc5->parity_enabled = 1; + + /* All the TCAM addresses we access have only the low 32 bits non 0 */ + t3_write_reg(adap, A_MC5_DB_DBGI_REQ_ADDR1, 0); + t3_write_reg(adap, A_MC5_DB_DBGI_REQ_ADDR2, 0); + + mc5_dbgi_mode_enable(mc5); + + switch (mc5->part_type) { + case IDT75P52100: + err = init_idt52100(mc5); + break; + case IDT75N43102: + err = init_idt43102(mc5); + break; + default: + CH_ERR(adap, "Unsupported TCAM type %d\n", mc5->part_type); + err = -EINVAL; + break; + } + + mc5_dbgi_mode_disable(mc5); + return err; +} + + +#define MC5_INT_FATAL (F_PARITYERR | F_REQQPARERR | F_DISPQPARERR) + +/* + * MC5 interrupt handler + */ +void t3_mc5_intr_handler(struct mc5 *mc5) +{ + struct adapter *adap = mc5->adapter; + u32 cause = t3_read_reg(adap, A_MC5_DB_INT_CAUSE); + + if ((cause & F_PARITYERR) && mc5->parity_enabled) { + CH_ALERT(adap, "MC5 parity error\n"); + mc5->stats.parity_err++; + } + + if (cause & F_REQQPARERR) { + CH_ALERT(adap, "MC5 request queue parity error\n"); + mc5->stats.reqq_parity_err++; + } + + if (cause & F_DISPQPARERR) { + CH_ALERT(adap, "MC5 dispatch queue parity error\n"); + mc5->stats.dispq_parity_err++; + } + + if (cause & F_ACTRGNFULL) + mc5->stats.active_rgn_full++; + if (cause & F_NFASRCHFAIL) + mc5->stats.nfa_srch_err++; + if (cause & F_UNKNOWNCMD) + mc5->stats.unknown_cmd++; + if (cause & F_DELACTEMPTY) + mc5->stats.del_act_empty++; + if (cause & MC5_INT_FATAL) + t3_fatal_err(adap); + + t3_write_reg(adap, A_MC5_DB_INT_CAUSE, cause); +} + +void t3_mc5_prep(struct adapter *adapter, struct mc5 *mc5, int mode) +{ +#define K * 1024 + + static unsigned int tcam_part_size[] = { /* in K 72-bit entries */ + 64 K, 128 K, 256 K, 32 K + }; + +#undef K + + u32 cfg = t3_read_reg(adapter, A_MC5_DB_CONFIG); + + mc5->adapter = adapter; + mc5->mode = (unsigned char)mode; + mc5->part_type = (unsigned char)G_TMTYPE(cfg); + if (cfg & F_TMTYPEHI) + mc5->part_type |= 4; + + mc5->tcam_size = tcam_part_size[G_TMPARTSIZE(cfg)]; + if (mode == MC5_MODE_144_BIT) + mc5->tcam_size /= 2; +} diff --git a/drivers/net/ethernet/chelsio/cxgb3/regs.h b/drivers/net/ethernet/chelsio/cxgb3/regs.h new file mode 100644 index 0000000..81029b8 --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb3/regs.h @@ -0,0 +1,2563 @@ +#define A_SG_CONTROL 0x0 + +#define S_CONGMODE 29 +#define V_CONGMODE(x) ((x) << S_CONGMODE) +#define F_CONGMODE V_CONGMODE(1U) + +#define S_TNLFLMODE 28 +#define V_TNLFLMODE(x) ((x) << S_TNLFLMODE) +#define F_TNLFLMODE V_TNLFLMODE(1U) + +#define S_FATLPERREN 27 +#define V_FATLPERREN(x) ((x) << S_FATLPERREN) +#define F_FATLPERREN V_FATLPERREN(1U) + +#define S_DROPPKT 20 +#define V_DROPPKT(x) ((x) << S_DROPPKT) +#define F_DROPPKT V_DROPPKT(1U) + +#define S_EGRGENCTRL 19 +#define V_EGRGENCTRL(x) ((x) << S_EGRGENCTRL) +#define F_EGRGENCTRL V_EGRGENCTRL(1U) + +#define S_USERSPACESIZE 14 +#define M_USERSPACESIZE 0x1f +#define V_USERSPACESIZE(x) ((x) << S_USERSPACESIZE) + +#define S_HOSTPAGESIZE 11 +#define M_HOSTPAGESIZE 0x7 +#define V_HOSTPAGESIZE(x) ((x) << S_HOSTPAGESIZE) + +#define S_FLMODE 9 +#define V_FLMODE(x) ((x) << S_FLMODE) +#define F_FLMODE V_FLMODE(1U) + +#define S_PKTSHIFT 6 +#define M_PKTSHIFT 0x7 +#define V_PKTSHIFT(x) ((x) << S_PKTSHIFT) + +#define S_ONEINTMULTQ 5 +#define V_ONEINTMULTQ(x) ((x) << S_ONEINTMULTQ) +#define F_ONEINTMULTQ V_ONEINTMULTQ(1U) + +#define S_BIGENDIANINGRESS 2 +#define V_BIGENDIANINGRESS(x) ((x) << S_BIGENDIANINGRESS) +#define F_BIGENDIANINGRESS V_BIGENDIANINGRESS(1U) + +#define S_ISCSICOALESCING 1 +#define V_ISCSICOALESCING(x) ((x) << S_ISCSICOALESCING) +#define F_ISCSICOALESCING V_ISCSICOALESCING(1U) + +#define S_GLOBALENABLE 0 +#define V_GLOBALENABLE(x) ((x) << S_GLOBALENABLE) +#define F_GLOBALENABLE V_GLOBALENABLE(1U) + +#define S_AVOIDCQOVFL 24 +#define V_AVOIDCQOVFL(x) ((x) << S_AVOIDCQOVFL) +#define F_AVOIDCQOVFL V_AVOIDCQOVFL(1U) + +#define S_OPTONEINTMULTQ 23 +#define V_OPTONEINTMULTQ(x) ((x) << S_OPTONEINTMULTQ) +#define F_OPTONEINTMULTQ V_OPTONEINTMULTQ(1U) + +#define S_CQCRDTCTRL 22 +#define V_CQCRDTCTRL(x) ((x) << S_CQCRDTCTRL) +#define F_CQCRDTCTRL V_CQCRDTCTRL(1U) + +#define A_SG_KDOORBELL 0x4 + +#define S_SELEGRCNTX 31 +#define V_SELEGRCNTX(x) ((x) << S_SELEGRCNTX) +#define F_SELEGRCNTX V_SELEGRCNTX(1U) + +#define S_EGRCNTX 0 +#define M_EGRCNTX 0xffff +#define V_EGRCNTX(x) ((x) << S_EGRCNTX) + +#define A_SG_GTS 0x8 + +#define S_RSPQ 29 +#define M_RSPQ 0x7 +#define V_RSPQ(x) ((x) << S_RSPQ) +#define G_RSPQ(x) (((x) >> S_RSPQ) & M_RSPQ) + +#define S_NEWTIMER 16 +#define M_NEWTIMER 0x1fff +#define V_NEWTIMER(x) ((x) << S_NEWTIMER) + +#define S_NEWINDEX 0 +#define M_NEWINDEX 0xffff +#define V_NEWINDEX(x) ((x) << S_NEWINDEX) + +#define A_SG_CONTEXT_CMD 0xc + +#define S_CONTEXT_CMD_OPCODE 28 +#define M_CONTEXT_CMD_OPCODE 0xf +#define V_CONTEXT_CMD_OPCODE(x) ((x) << S_CONTEXT_CMD_OPCODE) + +#define S_CONTEXT_CMD_BUSY 27 +#define V_CONTEXT_CMD_BUSY(x) ((x) << S_CONTEXT_CMD_BUSY) +#define F_CONTEXT_CMD_BUSY V_CONTEXT_CMD_BUSY(1U) + +#define S_CQ_CREDIT 20 + +#define M_CQ_CREDIT 0x7f + +#define V_CQ_CREDIT(x) ((x) << S_CQ_CREDIT) + +#define G_CQ_CREDIT(x) (((x) >> S_CQ_CREDIT) & M_CQ_CREDIT) + +#define S_CQ 19 + +#define V_CQ(x) ((x) << S_CQ) +#define F_CQ V_CQ(1U) + +#define S_RESPONSEQ 18 +#define V_RESPONSEQ(x) ((x) << S_RESPONSEQ) +#define F_RESPONSEQ V_RESPONSEQ(1U) + +#define S_EGRESS 17 +#define V_EGRESS(x) ((x) << S_EGRESS) +#define F_EGRESS V_EGRESS(1U) + +#define S_FREELIST 16 +#define V_FREELIST(x) ((x) << S_FREELIST) +#define F_FREELIST V_FREELIST(1U) + +#define S_CONTEXT 0 +#define M_CONTEXT 0xffff +#define V_CONTEXT(x) ((x) << S_CONTEXT) + +#define G_CONTEXT(x) (((x) >> S_CONTEXT) & M_CONTEXT) + +#define A_SG_CONTEXT_DATA0 0x10 + +#define A_SG_CONTEXT_DATA1 0x14 + +#define A_SG_CONTEXT_DATA2 0x18 + +#define A_SG_CONTEXT_DATA3 0x1c + +#define A_SG_CONTEXT_MASK0 0x20 + +#define A_SG_CONTEXT_MASK1 0x24 + +#define A_SG_CONTEXT_MASK2 0x28 + +#define A_SG_CONTEXT_MASK3 0x2c + +#define A_SG_RSPQ_CREDIT_RETURN 0x30 + +#define S_CREDITS 0 +#define M_CREDITS 0xffff +#define V_CREDITS(x) ((x) << S_CREDITS) + +#define A_SG_DATA_INTR 0x34 + +#define S_ERRINTR 31 +#define V_ERRINTR(x) ((x) << S_ERRINTR) +#define F_ERRINTR V_ERRINTR(1U) + +#define A_SG_HI_DRB_HI_THRSH 0x38 + +#define A_SG_HI_DRB_LO_THRSH 0x3c + +#define A_SG_LO_DRB_HI_THRSH 0x40 + +#define A_SG_LO_DRB_LO_THRSH 0x44 + +#define A_SG_RSPQ_FL_STATUS 0x4c + +#define S_RSPQ0DISABLED 8 + +#define S_FL0EMPTY 16 +#define V_FL0EMPTY(x) ((x) << S_FL0EMPTY) +#define F_FL0EMPTY V_FL0EMPTY(1U) + +#define A_SG_EGR_RCQ_DRB_THRSH 0x54 + +#define S_HIRCQDRBTHRSH 16 +#define M_HIRCQDRBTHRSH 0x7ff +#define V_HIRCQDRBTHRSH(x) ((x) << S_HIRCQDRBTHRSH) + +#define S_LORCQDRBTHRSH 0 +#define M_LORCQDRBTHRSH 0x7ff +#define V_LORCQDRBTHRSH(x) ((x) << S_LORCQDRBTHRSH) + +#define A_SG_EGR_CNTX_BADDR 0x58 + +#define A_SG_INT_CAUSE 0x5c + +#define S_HIRCQPARITYERROR 31 +#define V_HIRCQPARITYERROR(x) ((x) << S_HIRCQPARITYERROR) +#define F_HIRCQPARITYERROR V_HIRCQPARITYERROR(1U) + +#define S_LORCQPARITYERROR 30 +#define V_LORCQPARITYERROR(x) ((x) << S_LORCQPARITYERROR) +#define F_LORCQPARITYERROR V_LORCQPARITYERROR(1U) + +#define S_HIDRBPARITYERROR 29 +#define V_HIDRBPARITYERROR(x) ((x) << S_HIDRBPARITYERROR) +#define F_HIDRBPARITYERROR V_HIDRBPARITYERROR(1U) + +#define S_LODRBPARITYERROR 28 +#define V_LODRBPARITYERROR(x) ((x) << S_LODRBPARITYERROR) +#define F_LODRBPARITYERROR V_LODRBPARITYERROR(1U) + +#define S_FLPARITYERROR 22 +#define M_FLPARITYERROR 0x3f +#define V_FLPARITYERROR(x) ((x) << S_FLPARITYERROR) +#define G_FLPARITYERROR(x) (((x) >> S_FLPARITYERROR) & M_FLPARITYERROR) + +#define S_ITPARITYERROR 20 +#define M_ITPARITYERROR 0x3 +#define V_ITPARITYERROR(x) ((x) << S_ITPARITYERROR) +#define G_ITPARITYERROR(x) (((x) >> S_ITPARITYERROR) & M_ITPARITYERROR) + +#define S_IRPARITYERROR 19 +#define V_IRPARITYERROR(x) ((x) << S_IRPARITYERROR) +#define F_IRPARITYERROR V_IRPARITYERROR(1U) + +#define S_RCPARITYERROR 18 +#define V_RCPARITYERROR(x) ((x) << S_RCPARITYERROR) +#define F_RCPARITYERROR V_RCPARITYERROR(1U) + +#define S_OCPARITYERROR 17 +#define V_OCPARITYERROR(x) ((x) << S_OCPARITYERROR) +#define F_OCPARITYERROR V_OCPARITYERROR(1U) + +#define S_CPPARITYERROR 16 +#define V_CPPARITYERROR(x) ((x) << S_CPPARITYERROR) +#define F_CPPARITYERROR V_CPPARITYERROR(1U) + +#define S_R_REQ_FRAMINGERROR 15 +#define V_R_REQ_FRAMINGERROR(x) ((x) << S_R_REQ_FRAMINGERROR) +#define F_R_REQ_FRAMINGERROR V_R_REQ_FRAMINGERROR(1U) + +#define S_UC_REQ_FRAMINGERROR 14 +#define V_UC_REQ_FRAMINGERROR(x) ((x) << S_UC_REQ_FRAMINGERROR) +#define F_UC_REQ_FRAMINGERROR V_UC_REQ_FRAMINGERROR(1U) + +#define S_HICTLDRBDROPERR 13 +#define V_HICTLDRBDROPERR(x) ((x) << S_HICTLDRBDROPERR) +#define F_HICTLDRBDROPERR V_HICTLDRBDROPERR(1U) + +#define S_LOCTLDRBDROPERR 12 +#define V_LOCTLDRBDROPERR(x) ((x) << S_LOCTLDRBDROPERR) +#define F_LOCTLDRBDROPERR V_LOCTLDRBDROPERR(1U) + +#define S_HIPIODRBDROPERR 11 +#define V_HIPIODRBDROPERR(x) ((x) << S_HIPIODRBDROPERR) +#define F_HIPIODRBDROPERR V_HIPIODRBDROPERR(1U) + +#define S_LOPIODRBDROPERR 10 +#define V_LOPIODRBDROPERR(x) ((x) << S_LOPIODRBDROPERR) +#define F_LOPIODRBDROPERR V_LOPIODRBDROPERR(1U) + +#define S_HIPRIORITYDBFULL 7 +#define V_HIPRIORITYDBFULL(x) ((x) << S_HIPRIORITYDBFULL) +#define F_HIPRIORITYDBFULL V_HIPRIORITYDBFULL(1U) + +#define S_HIPRIORITYDBEMPTY 6 +#define V_HIPRIORITYDBEMPTY(x) ((x) << S_HIPRIORITYDBEMPTY) +#define F_HIPRIORITYDBEMPTY V_HIPRIORITYDBEMPTY(1U) + +#define S_LOPRIORITYDBFULL 5 +#define V_LOPRIORITYDBFULL(x) ((x) << S_LOPRIORITYDBFULL) +#define F_LOPRIORITYDBFULL V_LOPRIORITYDBFULL(1U) + +#define S_LOPRIORITYDBEMPTY 4 +#define V_LOPRIORITYDBEMPTY(x) ((x) << S_LOPRIORITYDBEMPTY) +#define F_LOPRIORITYDBEMPTY V_LOPRIORITYDBEMPTY(1U) + +#define S_RSPQDISABLED 3 +#define V_RSPQDISABLED(x) ((x) << S_RSPQDISABLED) +#define F_RSPQDISABLED V_RSPQDISABLED(1U) + +#define S_RSPQCREDITOVERFOW 2 +#define V_RSPQCREDITOVERFOW(x) ((x) << S_RSPQCREDITOVERFOW) +#define F_RSPQCREDITOVERFOW V_RSPQCREDITOVERFOW(1U) + +#define S_FLEMPTY 1 +#define V_FLEMPTY(x) ((x) << S_FLEMPTY) +#define F_FLEMPTY V_FLEMPTY(1U) + +#define A_SG_INT_ENABLE 0x60 + +#define A_SG_CMDQ_CREDIT_TH 0x64 + +#define S_TIMEOUT 8 +#define M_TIMEOUT 0xffffff +#define V_TIMEOUT(x) ((x) << S_TIMEOUT) + +#define S_THRESHOLD 0 +#define M_THRESHOLD 0xff +#define V_THRESHOLD(x) ((x) << S_THRESHOLD) + +#define A_SG_TIMER_TICK 0x68 + +#define A_SG_CQ_CONTEXT_BADDR 0x6c + +#define A_SG_OCO_BASE 0x70 + +#define S_BASE1 16 +#define M_BASE1 0xffff +#define V_BASE1(x) ((x) << S_BASE1) + +#define A_SG_DRB_PRI_THRESH 0x74 + +#define A_PCIX_INT_ENABLE 0x80 + +#define S_MSIXPARERR 22 +#define M_MSIXPARERR 0x7 + +#define V_MSIXPARERR(x) ((x) << S_MSIXPARERR) + +#define S_CFPARERR 18 +#define M_CFPARERR 0xf + +#define V_CFPARERR(x) ((x) << S_CFPARERR) + +#define S_RFPARERR 14 +#define M_RFPARERR 0xf + +#define V_RFPARERR(x) ((x) << S_RFPARERR) + +#define S_WFPARERR 12 +#define M_WFPARERR 0x3 + +#define V_WFPARERR(x) ((x) << S_WFPARERR) + +#define S_PIOPARERR 11 +#define V_PIOPARERR(x) ((x) << S_PIOPARERR) +#define F_PIOPARERR V_PIOPARERR(1U) + +#define S_DETUNCECCERR 10 +#define V_DETUNCECCERR(x) ((x) << S_DETUNCECCERR) +#define F_DETUNCECCERR V_DETUNCECCERR(1U) + +#define S_DETCORECCERR 9 +#define V_DETCORECCERR(x) ((x) << S_DETCORECCERR) +#define F_DETCORECCERR V_DETCORECCERR(1U) + +#define S_RCVSPLCMPERR 8 +#define V_RCVSPLCMPERR(x) ((x) << S_RCVSPLCMPERR) +#define F_RCVSPLCMPERR V_RCVSPLCMPERR(1U) + +#define S_UNXSPLCMP 7 +#define V_UNXSPLCMP(x) ((x) << S_UNXSPLCMP) +#define F_UNXSPLCMP V_UNXSPLCMP(1U) + +#define S_SPLCMPDIS 6 +#define V_SPLCMPDIS(x) ((x) << S_SPLCMPDIS) +#define F_SPLCMPDIS V_SPLCMPDIS(1U) + +#define S_DETPARERR 5 +#define V_DETPARERR(x) ((x) << S_DETPARERR) +#define F_DETPARERR V_DETPARERR(1U) + +#define S_SIGSYSERR 4 +#define V_SIGSYSERR(x) ((x) << S_SIGSYSERR) +#define F_SIGSYSERR V_SIGSYSERR(1U) + +#define S_RCVMSTABT 3 +#define V_RCVMSTABT(x) ((x) << S_RCVMSTABT) +#define F_RCVMSTABT V_RCVMSTABT(1U) + +#define S_RCVTARABT 2 +#define V_RCVTARABT(x) ((x) << S_RCVTARABT) +#define F_RCVTARABT V_RCVTARABT(1U) + +#define S_SIGTARABT 1 +#define V_SIGTARABT(x) ((x) << S_SIGTARABT) +#define F_SIGTARABT V_SIGTARABT(1U) + +#define S_MSTDETPARERR 0 +#define V_MSTDETPARERR(x) ((x) << S_MSTDETPARERR) +#define F_MSTDETPARERR V_MSTDETPARERR(1U) + +#define A_PCIX_INT_CAUSE 0x84 + +#define A_PCIX_CFG 0x88 + +#define S_DMASTOPEN 19 +#define V_DMASTOPEN(x) ((x) << S_DMASTOPEN) +#define F_DMASTOPEN V_DMASTOPEN(1U) + +#define S_CLIDECEN 18 +#define V_CLIDECEN(x) ((x) << S_CLIDECEN) +#define F_CLIDECEN V_CLIDECEN(1U) + +#define A_PCIX_MODE 0x8c + +#define S_PCLKRANGE 6 +#define M_PCLKRANGE 0x3 +#define V_PCLKRANGE(x) ((x) << S_PCLKRANGE) +#define G_PCLKRANGE(x) (((x) >> S_PCLKRANGE) & M_PCLKRANGE) + +#define S_PCIXINITPAT 2 +#define M_PCIXINITPAT 0xf +#define V_PCIXINITPAT(x) ((x) << S_PCIXINITPAT) +#define G_PCIXINITPAT(x) (((x) >> S_PCIXINITPAT) & M_PCIXINITPAT) + +#define S_64BIT 0 +#define V_64BIT(x) ((x) << S_64BIT) +#define F_64BIT V_64BIT(1U) + +#define A_PCIE_INT_ENABLE 0x80 + +#define S_BISTERR 15 +#define M_BISTERR 0xff + +#define V_BISTERR(x) ((x) << S_BISTERR) + +#define S_TXPARERR 18 +#define V_TXPARERR(x) ((x) << S_TXPARERR) +#define F_TXPARERR V_TXPARERR(1U) + +#define S_RXPARERR 17 +#define V_RXPARERR(x) ((x) << S_RXPARERR) +#define F_RXPARERR V_RXPARERR(1U) + +#define S_RETRYLUTPARERR 16 +#define V_RETRYLUTPARERR(x) ((x) << S_RETRYLUTPARERR) +#define F_RETRYLUTPARERR V_RETRYLUTPARERR(1U) + +#define S_RETRYBUFPARERR 15 +#define V_RETRYBUFPARERR(x) ((x) << S_RETRYBUFPARERR) +#define F_RETRYBUFPARERR V_RETRYBUFPARERR(1U) + +#define S_PCIE_MSIXPARERR 12 +#define M_PCIE_MSIXPARERR 0x7 + +#define V_PCIE_MSIXPARERR(x) ((x) << S_PCIE_MSIXPARERR) + +#define S_PCIE_CFPARERR 11 +#define V_PCIE_CFPARERR(x) ((x) << S_PCIE_CFPARERR) +#define F_PCIE_CFPARERR V_PCIE_CFPARERR(1U) + +#define S_PCIE_RFPARERR 10 +#define V_PCIE_RFPARERR(x) ((x) << S_PCIE_RFPARERR) +#define F_PCIE_RFPARERR V_PCIE_RFPARERR(1U) + +#define S_PCIE_WFPARERR 9 +#define V_PCIE_WFPARERR(x) ((x) << S_PCIE_WFPARERR) +#define F_PCIE_WFPARERR V_PCIE_WFPARERR(1U) + +#define S_PCIE_PIOPARERR 8 +#define V_PCIE_PIOPARERR(x) ((x) << S_PCIE_PIOPARERR) +#define F_PCIE_PIOPARERR V_PCIE_PIOPARERR(1U) + +#define S_UNXSPLCPLERRC 7 +#define V_UNXSPLCPLERRC(x) ((x) << S_UNXSPLCPLERRC) +#define F_UNXSPLCPLERRC V_UNXSPLCPLERRC(1U) + +#define S_UNXSPLCPLERRR 6 +#define V_UNXSPLCPLERRR(x) ((x) << S_UNXSPLCPLERRR) +#define F_UNXSPLCPLERRR V_UNXSPLCPLERRR(1U) + +#define S_PEXERR 0 +#define V_PEXERR(x) ((x) << S_PEXERR) +#define F_PEXERR V_PEXERR(1U) + +#define A_PCIE_INT_CAUSE 0x84 + +#define S_PCIE_DMASTOPEN 24 +#define V_PCIE_DMASTOPEN(x) ((x) << S_PCIE_DMASTOPEN) +#define F_PCIE_DMASTOPEN V_PCIE_DMASTOPEN(1U) + +#define A_PCIE_CFG 0x88 + +#define S_ENABLELINKDWNDRST 21 +#define V_ENABLELINKDWNDRST(x) ((x) << S_ENABLELINKDWNDRST) +#define F_ENABLELINKDWNDRST V_ENABLELINKDWNDRST(1U) + +#define S_ENABLELINKDOWNRST 20 +#define V_ENABLELINKDOWNRST(x) ((x) << S_ENABLELINKDOWNRST) +#define F_ENABLELINKDOWNRST V_ENABLELINKDOWNRST(1U) + +#define S_PCIE_CLIDECEN 16 +#define V_PCIE_CLIDECEN(x) ((x) << S_PCIE_CLIDECEN) +#define F_PCIE_CLIDECEN V_PCIE_CLIDECEN(1U) + +#define S_CRSTWRMMODE 0 +#define V_CRSTWRMMODE(x) ((x) << S_CRSTWRMMODE) +#define F_CRSTWRMMODE V_CRSTWRMMODE(1U) + +#define A_PCIE_MODE 0x8c + +#define S_NUMFSTTRNSEQRX 10 +#define M_NUMFSTTRNSEQRX 0xff +#define V_NUMFSTTRNSEQRX(x) ((x) << S_NUMFSTTRNSEQRX) +#define G_NUMFSTTRNSEQRX(x) (((x) >> S_NUMFSTTRNSEQRX) & M_NUMFSTTRNSEQRX) + +#define A_PCIE_PEX_CTRL0 0x98 + +#define S_NUMFSTTRNSEQ 22 +#define M_NUMFSTTRNSEQ 0xff +#define V_NUMFSTTRNSEQ(x) ((x) << S_NUMFSTTRNSEQ) +#define G_NUMFSTTRNSEQ(x) (((x) >> S_NUMFSTTRNSEQ) & M_NUMFSTTRNSEQ) + +#define S_REPLAYLMT 2 +#define M_REPLAYLMT 0xfffff + +#define V_REPLAYLMT(x) ((x) << S_REPLAYLMT) + +#define A_PCIE_PEX_CTRL1 0x9c + +#define S_T3A_ACKLAT 0 +#define M_T3A_ACKLAT 0x7ff + +#define V_T3A_ACKLAT(x) ((x) << S_T3A_ACKLAT) + +#define S_ACKLAT 0 +#define M_ACKLAT 0x1fff + +#define V_ACKLAT(x) ((x) << S_ACKLAT) + +#define A_PCIE_PEX_ERR 0xa4 + +#define A_T3DBG_GPIO_EN 0xd0 + +#define S_GPIO11_OEN 27 +#define V_GPIO11_OEN(x) ((x) << S_GPIO11_OEN) +#define F_GPIO11_OEN V_GPIO11_OEN(1U) + +#define S_GPIO10_OEN 26 +#define V_GPIO10_OEN(x) ((x) << S_GPIO10_OEN) +#define F_GPIO10_OEN V_GPIO10_OEN(1U) + +#define S_GPIO7_OEN 23 +#define V_GPIO7_OEN(x) ((x) << S_GPIO7_OEN) +#define F_GPIO7_OEN V_GPIO7_OEN(1U) + +#define S_GPIO6_OEN 22 +#define V_GPIO6_OEN(x) ((x) << S_GPIO6_OEN) +#define F_GPIO6_OEN V_GPIO6_OEN(1U) + +#define S_GPIO5_OEN 21 +#define V_GPIO5_OEN(x) ((x) << S_GPIO5_OEN) +#define F_GPIO5_OEN V_GPIO5_OEN(1U) + +#define S_GPIO4_OEN 20 +#define V_GPIO4_OEN(x) ((x) << S_GPIO4_OEN) +#define F_GPIO4_OEN V_GPIO4_OEN(1U) + +#define S_GPIO2_OEN 18 +#define V_GPIO2_OEN(x) ((x) << S_GPIO2_OEN) +#define F_GPIO2_OEN V_GPIO2_OEN(1U) + +#define S_GPIO1_OEN 17 +#define V_GPIO1_OEN(x) ((x) << S_GPIO1_OEN) +#define F_GPIO1_OEN V_GPIO1_OEN(1U) + +#define S_GPIO0_OEN 16 +#define V_GPIO0_OEN(x) ((x) << S_GPIO0_OEN) +#define F_GPIO0_OEN V_GPIO0_OEN(1U) + +#define S_GPIO10_OUT_VAL 10 +#define V_GPIO10_OUT_VAL(x) ((x) << S_GPIO10_OUT_VAL) +#define F_GPIO10_OUT_VAL V_GPIO10_OUT_VAL(1U) + +#define S_GPIO7_OUT_VAL 7 +#define V_GPIO7_OUT_VAL(x) ((x) << S_GPIO7_OUT_VAL) +#define F_GPIO7_OUT_VAL V_GPIO7_OUT_VAL(1U) + +#define S_GPIO6_OUT_VAL 6 +#define V_GPIO6_OUT_VAL(x) ((x) << S_GPIO6_OUT_VAL) +#define F_GPIO6_OUT_VAL V_GPIO6_OUT_VAL(1U) + +#define S_GPIO5_OUT_VAL 5 +#define V_GPIO5_OUT_VAL(x) ((x) << S_GPIO5_OUT_VAL) +#define F_GPIO5_OUT_VAL V_GPIO5_OUT_VAL(1U) + +#define S_GPIO4_OUT_VAL 4 +#define V_GPIO4_OUT_VAL(x) ((x) << S_GPIO4_OUT_VAL) +#define F_GPIO4_OUT_VAL V_GPIO4_OUT_VAL(1U) + +#define S_GPIO2_OUT_VAL 2 +#define V_GPIO2_OUT_VAL(x) ((x) << S_GPIO2_OUT_VAL) +#define F_GPIO2_OUT_VAL V_GPIO2_OUT_VAL(1U) + +#define S_GPIO1_OUT_VAL 1 +#define V_GPIO1_OUT_VAL(x) ((x) << S_GPIO1_OUT_VAL) +#define F_GPIO1_OUT_VAL V_GPIO1_OUT_VAL(1U) + +#define S_GPIO0_OUT_VAL 0 +#define V_GPIO0_OUT_VAL(x) ((x) << S_GPIO0_OUT_VAL) +#define F_GPIO0_OUT_VAL V_GPIO0_OUT_VAL(1U) + +#define A_T3DBG_INT_ENABLE 0xd8 + +#define S_GPIO11 11 +#define V_GPIO11(x) ((x) << S_GPIO11) +#define F_GPIO11 V_GPIO11(1U) + +#define S_GPIO10 10 +#define V_GPIO10(x) ((x) << S_GPIO10) +#define F_GPIO10 V_GPIO10(1U) + +#define S_GPIO9 9 +#define V_GPIO9(x) ((x) << S_GPIO9) +#define F_GPIO9 V_GPIO9(1U) + +#define S_GPIO7 7 +#define V_GPIO7(x) ((x) << S_GPIO7) +#define F_GPIO7 V_GPIO7(1U) + +#define S_GPIO6 6 +#define V_GPIO6(x) ((x) << S_GPIO6) +#define F_GPIO6 V_GPIO6(1U) + +#define S_GPIO5 5 +#define V_GPIO5(x) ((x) << S_GPIO5) +#define F_GPIO5 V_GPIO5(1U) + +#define S_GPIO4 4 +#define V_GPIO4(x) ((x) << S_GPIO4) +#define F_GPIO4 V_GPIO4(1U) + +#define S_GPIO3 3 +#define V_GPIO3(x) ((x) << S_GPIO3) +#define F_GPIO3 V_GPIO3(1U) + +#define S_GPIO2 2 +#define V_GPIO2(x) ((x) << S_GPIO2) +#define F_GPIO2 V_GPIO2(1U) + +#define S_GPIO1 1 +#define V_GPIO1(x) ((x) << S_GPIO1) +#define F_GPIO1 V_GPIO1(1U) + +#define S_GPIO0 0 +#define V_GPIO0(x) ((x) << S_GPIO0) +#define F_GPIO0 V_GPIO0(1U) + +#define A_T3DBG_INT_CAUSE 0xdc + +#define A_T3DBG_GPIO_ACT_LOW 0xf0 + +#define MC7_PMRX_BASE_ADDR 0x100 + +#define A_MC7_CFG 0x100 + +#define S_IFEN 13 +#define V_IFEN(x) ((x) << S_IFEN) +#define F_IFEN V_IFEN(1U) + +#define S_TERM150 11 +#define V_TERM150(x) ((x) << S_TERM150) +#define F_TERM150 V_TERM150(1U) + +#define S_SLOW 10 +#define V_SLOW(x) ((x) << S_SLOW) +#define F_SLOW V_SLOW(1U) + +#define S_WIDTH 8 +#define M_WIDTH 0x3 +#define V_WIDTH(x) ((x) << S_WIDTH) +#define G_WIDTH(x) (((x) >> S_WIDTH) & M_WIDTH) + +#define S_BKS 6 +#define V_BKS(x) ((x) << S_BKS) +#define F_BKS V_BKS(1U) + +#define S_ORG 5 +#define V_ORG(x) ((x) << S_ORG) +#define F_ORG V_ORG(1U) + +#define S_DEN 2 +#define M_DEN 0x7 +#define V_DEN(x) ((x) << S_DEN) +#define G_DEN(x) (((x) >> S_DEN) & M_DEN) + +#define S_RDY 1 +#define V_RDY(x) ((x) << S_RDY) +#define F_RDY V_RDY(1U) + +#define S_CLKEN 0 +#define V_CLKEN(x) ((x) << S_CLKEN) +#define F_CLKEN V_CLKEN(1U) + +#define A_MC7_MODE 0x104 + +#define S_BUSY 31 +#define V_BUSY(x) ((x) << S_BUSY) +#define F_BUSY V_BUSY(1U) + +#define A_MC7_EXT_MODE1 0x108 + +#define A_MC7_EXT_MODE2 0x10c + +#define A_MC7_EXT_MODE3 0x110 + +#define A_MC7_PRE 0x114 + +#define A_MC7_REF 0x118 + +#define S_PREREFDIV 1 +#define M_PREREFDIV 0x3fff +#define V_PREREFDIV(x) ((x) << S_PREREFDIV) + +#define S_PERREFEN 0 +#define V_PERREFEN(x) ((x) << S_PERREFEN) +#define F_PERREFEN V_PERREFEN(1U) + +#define A_MC7_DLL 0x11c + +#define S_DLLENB 1 +#define V_DLLENB(x) ((x) << S_DLLENB) +#define F_DLLENB V_DLLENB(1U) + +#define S_DLLRST 0 +#define V_DLLRST(x) ((x) << S_DLLRST) +#define F_DLLRST V_DLLRST(1U) + +#define A_MC7_PARM 0x120 + +#define S_ACTTOPREDLY 26 +#define M_ACTTOPREDLY 0xf +#define V_ACTTOPREDLY(x) ((x) << S_ACTTOPREDLY) + +#define S_ACTTORDWRDLY 23 +#define M_ACTTORDWRDLY 0x7 +#define V_ACTTORDWRDLY(x) ((x) << S_ACTTORDWRDLY) + +#define S_PRECYC 20 +#define M_PRECYC 0x7 +#define V_PRECYC(x) ((x) << S_PRECYC) + +#define S_REFCYC 13 +#define M_REFCYC 0x7f +#define V_REFCYC(x) ((x) << S_REFCYC) + +#define S_BKCYC 8 +#define M_BKCYC 0x1f +#define V_BKCYC(x) ((x) << S_BKCYC) + +#define S_WRTORDDLY 4 +#define M_WRTORDDLY 0xf +#define V_WRTORDDLY(x) ((x) << S_WRTORDDLY) + +#define S_RDTOWRDLY 0 +#define M_RDTOWRDLY 0xf +#define V_RDTOWRDLY(x) ((x) << S_RDTOWRDLY) + +#define A_MC7_CAL 0x128 + +#define S_CAL_FAULT 30 +#define V_CAL_FAULT(x) ((x) << S_CAL_FAULT) +#define F_CAL_FAULT V_CAL_FAULT(1U) + +#define S_SGL_CAL_EN 20 +#define V_SGL_CAL_EN(x) ((x) << S_SGL_CAL_EN) +#define F_SGL_CAL_EN V_SGL_CAL_EN(1U) + +#define A_MC7_ERR_ADDR 0x12c + +#define A_MC7_ECC 0x130 + +#define S_ECCCHKEN 1 +#define V_ECCCHKEN(x) ((x) << S_ECCCHKEN) +#define F_ECCCHKEN V_ECCCHKEN(1U) + +#define S_ECCGENEN 0 +#define V_ECCGENEN(x) ((x) << S_ECCGENEN) +#define F_ECCGENEN V_ECCGENEN(1U) + +#define A_MC7_CE_ADDR 0x134 + +#define A_MC7_CE_DATA0 0x138 + +#define A_MC7_CE_DATA1 0x13c + +#define A_MC7_CE_DATA2 0x140 + +#define S_DATA 0 +#define M_DATA 0xff + +#define G_DATA(x) (((x) >> S_DATA) & M_DATA) + +#define A_MC7_UE_ADDR 0x144 + +#define A_MC7_UE_DATA0 0x148 + +#define A_MC7_UE_DATA1 0x14c + +#define A_MC7_UE_DATA2 0x150 + +#define A_MC7_BD_ADDR 0x154 + +#define S_ADDR 3 + +#define M_ADDR 0x1fffffff + +#define A_MC7_BD_DATA0 0x158 + +#define A_MC7_BD_DATA1 0x15c + +#define A_MC7_BD_OP 0x164 + +#define S_OP 0 + +#define V_OP(x) ((x) << S_OP) +#define F_OP V_OP(1U) + +#define A_MC7_BIST_ADDR_BEG 0x168 + +#define A_MC7_BIST_ADDR_END 0x16c + +#define A_MC7_BIST_DATA 0x170 + +#define A_MC7_BIST_OP 0x174 + +#define S_CONT 3 +#define V_CONT(x) ((x) << S_CONT) +#define F_CONT V_CONT(1U) + +#define A_MC7_INT_ENABLE 0x178 + +#define S_AE 17 +#define V_AE(x) ((x) << S_AE) +#define F_AE V_AE(1U) + +#define S_PE 2 +#define M_PE 0x7fff + +#define V_PE(x) ((x) << S_PE) + +#define G_PE(x) (((x) >> S_PE) & M_PE) + +#define S_UE 1 +#define V_UE(x) ((x) << S_UE) +#define F_UE V_UE(1U) + +#define S_CE 0 +#define V_CE(x) ((x) << S_CE) +#define F_CE V_CE(1U) + +#define A_MC7_INT_CAUSE 0x17c + +#define MC7_PMTX_BASE_ADDR 0x180 + +#define MC7_CM_BASE_ADDR 0x200 + +#define A_CIM_BOOT_CFG 0x280 + +#define S_BOOTADDR 2 +#define M_BOOTADDR 0x3fffffff +#define V_BOOTADDR(x) ((x) << S_BOOTADDR) + +#define A_CIM_SDRAM_BASE_ADDR 0x28c + +#define A_CIM_SDRAM_ADDR_SIZE 0x290 + +#define A_CIM_HOST_INT_ENABLE 0x298 + +#define S_DTAGPARERR 28 +#define V_DTAGPARERR(x) ((x) << S_DTAGPARERR) +#define F_DTAGPARERR V_DTAGPARERR(1U) + +#define S_ITAGPARERR 27 +#define V_ITAGPARERR(x) ((x) << S_ITAGPARERR) +#define F_ITAGPARERR V_ITAGPARERR(1U) + +#define S_IBQTPPARERR 26 +#define V_IBQTPPARERR(x) ((x) << S_IBQTPPARERR) +#define F_IBQTPPARERR V_IBQTPPARERR(1U) + +#define S_IBQULPPARERR 25 +#define V_IBQULPPARERR(x) ((x) << S_IBQULPPARERR) +#define F_IBQULPPARERR V_IBQULPPARERR(1U) + +#define S_IBQSGEHIPARERR 24 +#define V_IBQSGEHIPARERR(x) ((x) << S_IBQSGEHIPARERR) +#define F_IBQSGEHIPARERR V_IBQSGEHIPARERR(1U) + +#define S_IBQSGELOPARERR 23 +#define V_IBQSGELOPARERR(x) ((x) << S_IBQSGELOPARERR) +#define F_IBQSGELOPARERR V_IBQSGELOPARERR(1U) + +#define S_OBQULPLOPARERR 22 +#define V_OBQULPLOPARERR(x) ((x) << S_OBQULPLOPARERR) +#define F_OBQULPLOPARERR V_OBQULPLOPARERR(1U) + +#define S_OBQULPHIPARERR 21 +#define V_OBQULPHIPARERR(x) ((x) << S_OBQULPHIPARERR) +#define F_OBQULPHIPARERR V_OBQULPHIPARERR(1U) + +#define S_OBQSGEPARERR 20 +#define V_OBQSGEPARERR(x) ((x) << S_OBQSGEPARERR) +#define F_OBQSGEPARERR V_OBQSGEPARERR(1U) + +#define S_DCACHEPARERR 19 +#define V_DCACHEPARERR(x) ((x) << S_DCACHEPARERR) +#define F_DCACHEPARERR V_DCACHEPARERR(1U) + +#define S_ICACHEPARERR 18 +#define V_ICACHEPARERR(x) ((x) << S_ICACHEPARERR) +#define F_ICACHEPARERR V_ICACHEPARERR(1U) + +#define S_DRAMPARERR 17 +#define V_DRAMPARERR(x) ((x) << S_DRAMPARERR) +#define F_DRAMPARERR V_DRAMPARERR(1U) + +#define A_CIM_HOST_INT_CAUSE 0x29c + +#define S_BLKWRPLINT 12 +#define V_BLKWRPLINT(x) ((x) << S_BLKWRPLINT) +#define F_BLKWRPLINT V_BLKWRPLINT(1U) + +#define S_BLKRDPLINT 11 +#define V_BLKRDPLINT(x) ((x) << S_BLKRDPLINT) +#define F_BLKRDPLINT V_BLKRDPLINT(1U) + +#define S_BLKWRCTLINT 10 +#define V_BLKWRCTLINT(x) ((x) << S_BLKWRCTLINT) +#define F_BLKWRCTLINT V_BLKWRCTLINT(1U) + +#define S_BLKRDCTLINT 9 +#define V_BLKRDCTLINT(x) ((x) << S_BLKRDCTLINT) +#define F_BLKRDCTLINT V_BLKRDCTLINT(1U) + +#define S_BLKWRFLASHINT 8 +#define V_BLKWRFLASHINT(x) ((x) << S_BLKWRFLASHINT) +#define F_BLKWRFLASHINT V_BLKWRFLASHINT(1U) + +#define S_BLKRDFLASHINT 7 +#define V_BLKRDFLASHINT(x) ((x) << S_BLKRDFLASHINT) +#define F_BLKRDFLASHINT V_BLKRDFLASHINT(1U) + +#define S_SGLWRFLASHINT 6 +#define V_SGLWRFLASHINT(x) ((x) << S_SGLWRFLASHINT) +#define F_SGLWRFLASHINT V_SGLWRFLASHINT(1U) + +#define S_WRBLKFLASHINT 5 +#define V_WRBLKFLASHINT(x) ((x) << S_WRBLKFLASHINT) +#define F_WRBLKFLASHINT V_WRBLKFLASHINT(1U) + +#define S_BLKWRBOOTINT 4 +#define V_BLKWRBOOTINT(x) ((x) << S_BLKWRBOOTINT) +#define F_BLKWRBOOTINT V_BLKWRBOOTINT(1U) + +#define S_FLASHRANGEINT 2 +#define V_FLASHRANGEINT(x) ((x) << S_FLASHRANGEINT) +#define F_FLASHRANGEINT V_FLASHRANGEINT(1U) + +#define S_SDRAMRANGEINT 1 +#define V_SDRAMRANGEINT(x) ((x) << S_SDRAMRANGEINT) +#define F_SDRAMRANGEINT V_SDRAMRANGEINT(1U) + +#define S_RSVDSPACEINT 0 +#define V_RSVDSPACEINT(x) ((x) << S_RSVDSPACEINT) +#define F_RSVDSPACEINT V_RSVDSPACEINT(1U) + +#define A_CIM_HOST_ACC_CTRL 0x2b0 + +#define S_HOSTBUSY 17 +#define V_HOSTBUSY(x) ((x) << S_HOSTBUSY) +#define F_HOSTBUSY V_HOSTBUSY(1U) + +#define A_CIM_HOST_ACC_DATA 0x2b4 + +#define A_CIM_IBQ_DBG_CFG 0x2c0 + +#define S_IBQDBGADDR 16 +#define M_IBQDBGADDR 0x1ff +#define V_IBQDBGADDR(x) ((x) << S_IBQDBGADDR) +#define G_IBQDBGADDR(x) (((x) >> S_IBQDBGADDR) & M_IBQDBGADDR) + +#define S_IBQDBGQID 3 +#define M_IBQDBGQID 0x3 +#define V_IBQDBGQID(x) ((x) << S_IBQDBGQID) +#define G_IBQDBGQID(x) (((x) >> S_IBQDBGQID) & M_IBQDBGQID) + +#define S_IBQDBGWR 2 +#define V_IBQDBGWR(x) ((x) << S_IBQDBGWR) +#define F_IBQDBGWR V_IBQDBGWR(1U) + +#define S_IBQDBGBUSY 1 +#define V_IBQDBGBUSY(x) ((x) << S_IBQDBGBUSY) +#define F_IBQDBGBUSY V_IBQDBGBUSY(1U) + +#define S_IBQDBGEN 0 +#define V_IBQDBGEN(x) ((x) << S_IBQDBGEN) +#define F_IBQDBGEN V_IBQDBGEN(1U) + +#define A_CIM_IBQ_DBG_DATA 0x2c8 + +#define A_TP_IN_CONFIG 0x300 + +#define S_RXFBARBPRIO 25 +#define V_RXFBARBPRIO(x) ((x) << S_RXFBARBPRIO) +#define F_RXFBARBPRIO V_RXFBARBPRIO(1U) + +#define S_TXFBARBPRIO 24 +#define V_TXFBARBPRIO(x) ((x) << S_TXFBARBPRIO) +#define F_TXFBARBPRIO V_TXFBARBPRIO(1U) + +#define S_NICMODE 14 +#define V_NICMODE(x) ((x) << S_NICMODE) +#define F_NICMODE V_NICMODE(1U) + +#define S_IPV6ENABLE 15 +#define V_IPV6ENABLE(x) ((x) << S_IPV6ENABLE) +#define F_IPV6ENABLE V_IPV6ENABLE(1U) + +#define A_TP_OUT_CONFIG 0x304 + +#define S_VLANEXTRACTIONENABLE 12 + +#define A_TP_GLOBAL_CONFIG 0x308 + +#define S_TXPACINGENABLE 24 +#define V_TXPACINGENABLE(x) ((x) << S_TXPACINGENABLE) +#define F_TXPACINGENABLE V_TXPACINGENABLE(1U) + +#define S_PATHMTU 15 +#define V_PATHMTU(x) ((x) << S_PATHMTU) +#define F_PATHMTU V_PATHMTU(1U) + +#define S_IPCHECKSUMOFFLOAD 13 +#define V_IPCHECKSUMOFFLOAD(x) ((x) << S_IPCHECKSUMOFFLOAD) +#define F_IPCHECKSUMOFFLOAD V_IPCHECKSUMOFFLOAD(1U) + +#define S_UDPCHECKSUMOFFLOAD 12 +#define V_UDPCHECKSUMOFFLOAD(x) ((x) << S_UDPCHECKSUMOFFLOAD) +#define F_UDPCHECKSUMOFFLOAD V_UDPCHECKSUMOFFLOAD(1U) + +#define S_TCPCHECKSUMOFFLOAD 11 +#define V_TCPCHECKSUMOFFLOAD(x) ((x) << S_TCPCHECKSUMOFFLOAD) +#define F_TCPCHECKSUMOFFLOAD V_TCPCHECKSUMOFFLOAD(1U) + +#define S_IPTTL 0 +#define M_IPTTL 0xff +#define V_IPTTL(x) ((x) << S_IPTTL) + +#define A_TP_CMM_MM_BASE 0x314 + +#define A_TP_CMM_TIMER_BASE 0x318 + +#define S_CMTIMERMAXNUM 28 +#define M_CMTIMERMAXNUM 0x3 +#define V_CMTIMERMAXNUM(x) ((x) << S_CMTIMERMAXNUM) + +#define A_TP_PMM_SIZE 0x31c + +#define A_TP_PMM_TX_BASE 0x320 + +#define A_TP_PMM_RX_BASE 0x328 + +#define A_TP_PMM_RX_PAGE_SIZE 0x32c + +#define A_TP_PMM_RX_MAX_PAGE 0x330 + +#define A_TP_PMM_TX_PAGE_SIZE 0x334 + +#define A_TP_PMM_TX_MAX_PAGE 0x338 + +#define A_TP_TCP_OPTIONS 0x340 + +#define S_MTUDEFAULT 16 +#define M_MTUDEFAULT 0xffff +#define V_MTUDEFAULT(x) ((x) << S_MTUDEFAULT) + +#define S_MTUENABLE 10 +#define V_MTUENABLE(x) ((x) << S_MTUENABLE) +#define F_MTUENABLE V_MTUENABLE(1U) + +#define S_SACKRX 8 +#define V_SACKRX(x) ((x) << S_SACKRX) +#define F_SACKRX V_SACKRX(1U) + +#define S_SACKMODE 4 + +#define M_SACKMODE 0x3 + +#define V_SACKMODE(x) ((x) << S_SACKMODE) + +#define S_WINDOWSCALEMODE 2 +#define M_WINDOWSCALEMODE 0x3 +#define V_WINDOWSCALEMODE(x) ((x) << S_WINDOWSCALEMODE) + +#define S_TIMESTAMPSMODE 0 + +#define M_TIMESTAMPSMODE 0x3 + +#define V_TIMESTAMPSMODE(x) ((x) << S_TIMESTAMPSMODE) + +#define A_TP_DACK_CONFIG 0x344 + +#define S_AUTOSTATE3 30 +#define M_AUTOSTATE3 0x3 +#define V_AUTOSTATE3(x) ((x) << S_AUTOSTATE3) + +#define S_AUTOSTATE2 28 +#define M_AUTOSTATE2 0x3 +#define V_AUTOSTATE2(x) ((x) << S_AUTOSTATE2) + +#define S_AUTOSTATE1 26 +#define M_AUTOSTATE1 0x3 +#define V_AUTOSTATE1(x) ((x) << S_AUTOSTATE1) + +#define S_BYTETHRESHOLD 5 +#define M_BYTETHRESHOLD 0xfffff +#define V_BYTETHRESHOLD(x) ((x) << S_BYTETHRESHOLD) + +#define S_MSSTHRESHOLD 3 +#define M_MSSTHRESHOLD 0x3 +#define V_MSSTHRESHOLD(x) ((x) << S_MSSTHRESHOLD) + +#define S_AUTOCAREFUL 2 +#define V_AUTOCAREFUL(x) ((x) << S_AUTOCAREFUL) +#define F_AUTOCAREFUL V_AUTOCAREFUL(1U) + +#define S_AUTOENABLE 1 +#define V_AUTOENABLE(x) ((x) << S_AUTOENABLE) +#define F_AUTOENABLE V_AUTOENABLE(1U) + +#define S_DACK_MODE 0 +#define V_DACK_MODE(x) ((x) << S_DACK_MODE) +#define F_DACK_MODE V_DACK_MODE(1U) + +#define A_TP_PC_CONFIG 0x348 + +#define S_TXTOSQUEUEMAPMODE 26 +#define V_TXTOSQUEUEMAPMODE(x) ((x) << S_TXTOSQUEUEMAPMODE) +#define F_TXTOSQUEUEMAPMODE V_TXTOSQUEUEMAPMODE(1U) + +#define S_ENABLEEPCMDAFULL 23 +#define V_ENABLEEPCMDAFULL(x) ((x) << S_ENABLEEPCMDAFULL) +#define F_ENABLEEPCMDAFULL V_ENABLEEPCMDAFULL(1U) + +#define S_MODULATEUNIONMODE 22 +#define V_MODULATEUNIONMODE(x) ((x) << S_MODULATEUNIONMODE) +#define F_MODULATEUNIONMODE V_MODULATEUNIONMODE(1U) + +#define S_TXDEFERENABLE 20 +#define V_TXDEFERENABLE(x) ((x) << S_TXDEFERENABLE) +#define F_TXDEFERENABLE V_TXDEFERENABLE(1U) + +#define S_RXCONGESTIONMODE 19 +#define V_RXCONGESTIONMODE(x) ((x) << S_RXCONGESTIONMODE) +#define F_RXCONGESTIONMODE V_RXCONGESTIONMODE(1U) + +#define S_HEARBEATDACK 16 +#define V_HEARBEATDACK(x) ((x) << S_HEARBEATDACK) +#define F_HEARBEATDACK V_HEARBEATDACK(1U) + +#define S_TXCONGESTIONMODE 15 +#define V_TXCONGESTIONMODE(x) ((x) << S_TXCONGESTIONMODE) +#define F_TXCONGESTIONMODE V_TXCONGESTIONMODE(1U) + +#define S_ENABLEOCSPIFULL 30 +#define V_ENABLEOCSPIFULL(x) ((x) << S_ENABLEOCSPIFULL) +#define F_ENABLEOCSPIFULL V_ENABLEOCSPIFULL(1U) + +#define S_LOCKTID 28 +#define V_LOCKTID(x) ((x) << S_LOCKTID) +#define F_LOCKTID V_LOCKTID(1U) + +#define S_TABLELATENCYDELTA 0 +#define M_TABLELATENCYDELTA 0xf +#define V_TABLELATENCYDELTA(x) ((x) << S_TABLELATENCYDELTA) +#define G_TABLELATENCYDELTA(x) \ + (((x) >> S_TABLELATENCYDELTA) & M_TABLELATENCYDELTA) + +#define A_TP_PC_CONFIG2 0x34c + +#define S_DISBLEDAPARBIT0 15 +#define V_DISBLEDAPARBIT0(x) ((x) << S_DISBLEDAPARBIT0) +#define F_DISBLEDAPARBIT0 V_DISBLEDAPARBIT0(1U) + +#define S_ENABLEARPMISS 13 +#define V_ENABLEARPMISS(x) ((x) << S_ENABLEARPMISS) +#define F_ENABLEARPMISS V_ENABLEARPMISS(1U) + +#define S_ENABLENONOFDTNLSYN 12 +#define V_ENABLENONOFDTNLSYN(x) ((x) << S_ENABLENONOFDTNLSYN) +#define F_ENABLENONOFDTNLSYN V_ENABLENONOFDTNLSYN(1U) + +#define S_ENABLEIPV6RSS 11 +#define V_ENABLEIPV6RSS(x) ((x) << S_ENABLEIPV6RSS) +#define F_ENABLEIPV6RSS V_ENABLEIPV6RSS(1U) + +#define S_CHDRAFULL 4 +#define V_CHDRAFULL(x) ((x) << S_CHDRAFULL) +#define F_CHDRAFULL V_CHDRAFULL(1U) + +#define A_TP_TCP_BACKOFF_REG0 0x350 + +#define A_TP_TCP_BACKOFF_REG1 0x354 + +#define A_TP_TCP_BACKOFF_REG2 0x358 + +#define A_TP_TCP_BACKOFF_REG3 0x35c + +#define A_TP_PARA_REG2 0x368 + +#define S_MAXRXDATA 16 +#define M_MAXRXDATA 0xffff +#define V_MAXRXDATA(x) ((x) << S_MAXRXDATA) + +#define S_RXCOALESCESIZE 0 +#define M_RXCOALESCESIZE 0xffff +#define V_RXCOALESCESIZE(x) ((x) << S_RXCOALESCESIZE) + +#define A_TP_PARA_REG3 0x36c + +#define S_TXDATAACKIDX 16 +#define M_TXDATAACKIDX 0xf + +#define V_TXDATAACKIDX(x) ((x) << S_TXDATAACKIDX) + +#define S_TXPACEAUTOSTRICT 10 +#define V_TXPACEAUTOSTRICT(x) ((x) << S_TXPACEAUTOSTRICT) +#define F_TXPACEAUTOSTRICT V_TXPACEAUTOSTRICT(1U) + +#define S_TXPACEFIXED 9 +#define V_TXPACEFIXED(x) ((x) << S_TXPACEFIXED) +#define F_TXPACEFIXED V_TXPACEFIXED(1U) + +#define S_TXPACEAUTO 8 +#define V_TXPACEAUTO(x) ((x) << S_TXPACEAUTO) +#define F_TXPACEAUTO V_TXPACEAUTO(1U) + +#define S_RXCOALESCEENABLE 1 +#define V_RXCOALESCEENABLE(x) ((x) << S_RXCOALESCEENABLE) +#define F_RXCOALESCEENABLE V_RXCOALESCEENABLE(1U) + +#define S_RXCOALESCEPSHEN 0 +#define V_RXCOALESCEPSHEN(x) ((x) << S_RXCOALESCEPSHEN) +#define F_RXCOALESCEPSHEN V_RXCOALESCEPSHEN(1U) + +#define A_TP_PARA_REG4 0x370 + +#define A_TP_PARA_REG5 0x374 + +#define S_RXDDPOFFINIT 3 +#define V_RXDDPOFFINIT(x) ((x) << S_RXDDPOFFINIT) +#define F_RXDDPOFFINIT V_RXDDPOFFINIT(1U) + +#define A_TP_PARA_REG6 0x378 + +#define S_T3A_ENABLEESND 13 +#define V_T3A_ENABLEESND(x) ((x) << S_T3A_ENABLEESND) +#define F_T3A_ENABLEESND V_T3A_ENABLEESND(1U) + +#define S_ENABLEESND 11 +#define V_ENABLEESND(x) ((x) << S_ENABLEESND) +#define F_ENABLEESND V_ENABLEESND(1U) + +#define A_TP_PARA_REG7 0x37c + +#define S_PMMAXXFERLEN1 16 +#define M_PMMAXXFERLEN1 0xffff +#define V_PMMAXXFERLEN1(x) ((x) << S_PMMAXXFERLEN1) + +#define S_PMMAXXFERLEN0 0 +#define M_PMMAXXFERLEN0 0xffff +#define V_PMMAXXFERLEN0(x) ((x) << S_PMMAXXFERLEN0) + +#define A_TP_TIMER_RESOLUTION 0x390 + +#define S_TIMERRESOLUTION 16 +#define M_TIMERRESOLUTION 0xff +#define V_TIMERRESOLUTION(x) ((x) << S_TIMERRESOLUTION) + +#define S_TIMESTAMPRESOLUTION 8 +#define M_TIMESTAMPRESOLUTION 0xff +#define V_TIMESTAMPRESOLUTION(x) ((x) << S_TIMESTAMPRESOLUTION) + +#define S_DELAYEDACKRESOLUTION 0 +#define M_DELAYEDACKRESOLUTION 0xff +#define V_DELAYEDACKRESOLUTION(x) ((x) << S_DELAYEDACKRESOLUTION) + +#define A_TP_MSL 0x394 + +#define A_TP_RXT_MIN 0x398 + +#define A_TP_RXT_MAX 0x39c + +#define A_TP_PERS_MIN 0x3a0 + +#define A_TP_PERS_MAX 0x3a4 + +#define A_TP_KEEP_IDLE 0x3a8 + +#define A_TP_KEEP_INTVL 0x3ac + +#define A_TP_INIT_SRTT 0x3b0 + +#define A_TP_DACK_TIMER 0x3b4 + +#define A_TP_FINWAIT2_TIMER 0x3b8 + +#define A_TP_SHIFT_CNT 0x3c0 + +#define S_SYNSHIFTMAX 24 + +#define M_SYNSHIFTMAX 0xff + +#define V_SYNSHIFTMAX(x) ((x) << S_SYNSHIFTMAX) + +#define S_RXTSHIFTMAXR1 20 + +#define M_RXTSHIFTMAXR1 0xf + +#define V_RXTSHIFTMAXR1(x) ((x) << S_RXTSHIFTMAXR1) + +#define S_RXTSHIFTMAXR2 16 + +#define M_RXTSHIFTMAXR2 0xf + +#define V_RXTSHIFTMAXR2(x) ((x) << S_RXTSHIFTMAXR2) + +#define S_PERSHIFTBACKOFFMAX 12 +#define M_PERSHIFTBACKOFFMAX 0xf +#define V_PERSHIFTBACKOFFMAX(x) ((x) << S_PERSHIFTBACKOFFMAX) + +#define S_PERSHIFTMAX 8 +#define M_PERSHIFTMAX 0xf +#define V_PERSHIFTMAX(x) ((x) << S_PERSHIFTMAX) + +#define S_KEEPALIVEMAX 0 + +#define M_KEEPALIVEMAX 0xff + +#define V_KEEPALIVEMAX(x) ((x) << S_KEEPALIVEMAX) + +#define A_TP_MTU_PORT_TABLE 0x3d0 + +#define A_TP_CCTRL_TABLE 0x3dc + +#define A_TP_MTU_TABLE 0x3e4 + +#define A_TP_RSS_MAP_TABLE 0x3e8 + +#define A_TP_RSS_LKP_TABLE 0x3ec + +#define A_TP_RSS_CONFIG 0x3f0 + +#define S_TNL4TUPEN 29 +#define V_TNL4TUPEN(x) ((x) << S_TNL4TUPEN) +#define F_TNL4TUPEN V_TNL4TUPEN(1U) + +#define S_TNL2TUPEN 28 +#define V_TNL2TUPEN(x) ((x) << S_TNL2TUPEN) +#define F_TNL2TUPEN V_TNL2TUPEN(1U) + +#define S_TNLPRTEN 26 +#define V_TNLPRTEN(x) ((x) << S_TNLPRTEN) +#define F_TNLPRTEN V_TNLPRTEN(1U) + +#define S_TNLMAPEN 25 +#define V_TNLMAPEN(x) ((x) << S_TNLMAPEN) +#define F_TNLMAPEN V_TNLMAPEN(1U) + +#define S_TNLLKPEN 24 +#define V_TNLLKPEN(x) ((x) << S_TNLLKPEN) +#define F_TNLLKPEN V_TNLLKPEN(1U) + +#define S_RRCPLMAPEN 7 +#define V_RRCPLMAPEN(x) ((x) << S_RRCPLMAPEN) +#define F_RRCPLMAPEN V_RRCPLMAPEN(1U) + +#define S_RRCPLCPUSIZE 4 +#define M_RRCPLCPUSIZE 0x7 +#define V_RRCPLCPUSIZE(x) ((x) << S_RRCPLCPUSIZE) + +#define S_RQFEEDBACKENABLE 3 +#define V_RQFEEDBACKENABLE(x) ((x) << S_RQFEEDBACKENABLE) +#define F_RQFEEDBACKENABLE V_RQFEEDBACKENABLE(1U) + +#define S_HASHTOEPLITZ 2 +#define V_HASHTOEPLITZ(x) ((x) << S_HASHTOEPLITZ) +#define F_HASHTOEPLITZ V_HASHTOEPLITZ(1U) + +#define S_DISABLE 0 + +#define A_TP_TM_PIO_ADDR 0x418 + +#define A_TP_TM_PIO_DATA 0x41c + +#define A_TP_TX_MOD_QUE_TABLE 0x420 + +#define A_TP_TX_RESOURCE_LIMIT 0x424 + +#define A_TP_TX_MOD_QUEUE_REQ_MAP 0x428 + +#define S_TX_MOD_QUEUE_REQ_MAP 0 +#define M_TX_MOD_QUEUE_REQ_MAP 0xff +#define V_TX_MOD_QUEUE_REQ_MAP(x) ((x) << S_TX_MOD_QUEUE_REQ_MAP) + +#define A_TP_TX_MOD_QUEUE_WEIGHT1 0x42c + +#define A_TP_TX_MOD_QUEUE_WEIGHT0 0x430 + +#define A_TP_MOD_CHANNEL_WEIGHT 0x434 + +#define A_TP_MOD_RATE_LIMIT 0x438 + +#define A_TP_PIO_ADDR 0x440 + +#define A_TP_PIO_DATA 0x444 + +#define A_TP_RESET 0x44c + +#define S_FLSTINITENABLE 1 +#define V_FLSTINITENABLE(x) ((x) << S_FLSTINITENABLE) +#define F_FLSTINITENABLE V_FLSTINITENABLE(1U) + +#define S_TPRESET 0 +#define V_TPRESET(x) ((x) << S_TPRESET) +#define F_TPRESET V_TPRESET(1U) + +#define A_TP_CMM_MM_RX_FLST_BASE 0x460 + +#define A_TP_CMM_MM_TX_FLST_BASE 0x464 + +#define A_TP_CMM_MM_PS_FLST_BASE 0x468 + +#define A_TP_MIB_INDEX 0x450 + +#define A_TP_MIB_RDATA 0x454 + +#define A_TP_CMM_MM_MAX_PSTRUCT 0x46c + +#define A_TP_INT_ENABLE 0x470 + +#define S_FLMTXFLSTEMPTY 30 +#define V_FLMTXFLSTEMPTY(x) ((x) << S_FLMTXFLSTEMPTY) +#define F_FLMTXFLSTEMPTY V_FLMTXFLSTEMPTY(1U) + +#define S_FLMRXFLSTEMPTY 29 +#define V_FLMRXFLSTEMPTY(x) ((x) << S_FLMRXFLSTEMPTY) +#define F_FLMRXFLSTEMPTY V_FLMRXFLSTEMPTY(1U) + +#define S_ARPLUTPERR 26 +#define V_ARPLUTPERR(x) ((x) << S_ARPLUTPERR) +#define F_ARPLUTPERR V_ARPLUTPERR(1U) + +#define S_CMCACHEPERR 24 +#define V_CMCACHEPERR(x) ((x) << S_CMCACHEPERR) +#define F_CMCACHEPERR V_CMCACHEPERR(1U) + +#define A_TP_INT_CAUSE 0x474 + +#define A_TP_TX_MOD_Q1_Q0_RATE_LIMIT 0x8 + +#define A_TP_TX_DROP_CFG_CH0 0x12b + +#define A_TP_TX_DROP_MODE 0x12f + +#define A_TP_EGRESS_CONFIG 0x145 + +#define S_REWRITEFORCETOSIZE 0 +#define V_REWRITEFORCETOSIZE(x) ((x) << S_REWRITEFORCETOSIZE) +#define F_REWRITEFORCETOSIZE V_REWRITEFORCETOSIZE(1U) + +#define A_TP_TX_TRC_KEY0 0x20 + +#define A_TP_RX_TRC_KEY0 0x120 + +#define A_TP_TX_DROP_CNT_CH0 0x12d + +#define S_TXDROPCNTCH0RCVD 0 +#define M_TXDROPCNTCH0RCVD 0xffff +#define V_TXDROPCNTCH0RCVD(x) ((x) << S_TXDROPCNTCH0RCVD) +#define G_TXDROPCNTCH0RCVD(x) (((x) >> S_TXDROPCNTCH0RCVD) & \ + M_TXDROPCNTCH0RCVD) + +#define A_TP_PROXY_FLOW_CNTL 0x4b0 + +#define A_TP_EMBED_OP_FIELD0 0x4e8 +#define A_TP_EMBED_OP_FIELD1 0x4ec +#define A_TP_EMBED_OP_FIELD2 0x4f0 +#define A_TP_EMBED_OP_FIELD3 0x4f4 +#define A_TP_EMBED_OP_FIELD4 0x4f8 +#define A_TP_EMBED_OP_FIELD5 0x4fc + +#define A_ULPRX_CTL 0x500 + +#define S_ROUND_ROBIN 4 +#define V_ROUND_ROBIN(x) ((x) << S_ROUND_ROBIN) +#define F_ROUND_ROBIN V_ROUND_ROBIN(1U) + +#define A_ULPRX_INT_ENABLE 0x504 + +#define S_DATASELFRAMEERR0 7 +#define V_DATASELFRAMEERR0(x) ((x) << S_DATASELFRAMEERR0) +#define F_DATASELFRAMEERR0 V_DATASELFRAMEERR0(1U) + +#define S_DATASELFRAMEERR1 6 +#define V_DATASELFRAMEERR1(x) ((x) << S_DATASELFRAMEERR1) +#define F_DATASELFRAMEERR1 V_DATASELFRAMEERR1(1U) + +#define S_PCMDMUXPERR 5 +#define V_PCMDMUXPERR(x) ((x) << S_PCMDMUXPERR) +#define F_PCMDMUXPERR V_PCMDMUXPERR(1U) + +#define S_ARBFPERR 4 +#define V_ARBFPERR(x) ((x) << S_ARBFPERR) +#define F_ARBFPERR V_ARBFPERR(1U) + +#define S_ARBPF0PERR 3 +#define V_ARBPF0PERR(x) ((x) << S_ARBPF0PERR) +#define F_ARBPF0PERR V_ARBPF0PERR(1U) + +#define S_ARBPF1PERR 2 +#define V_ARBPF1PERR(x) ((x) << S_ARBPF1PERR) +#define F_ARBPF1PERR V_ARBPF1PERR(1U) + +#define S_PARERRPCMD 1 +#define V_PARERRPCMD(x) ((x) << S_PARERRPCMD) +#define F_PARERRPCMD V_PARERRPCMD(1U) + +#define S_PARERRDATA 0 +#define V_PARERRDATA(x) ((x) << S_PARERRDATA) +#define F_PARERRDATA V_PARERRDATA(1U) + +#define A_ULPRX_INT_CAUSE 0x508 + +#define A_ULPRX_ISCSI_LLIMIT 0x50c + +#define A_ULPRX_ISCSI_ULIMIT 0x510 + +#define A_ULPRX_ISCSI_TAGMASK 0x514 + +#define A_ULPRX_ISCSI_PSZ 0x518 + +#define A_ULPRX_TDDP_LLIMIT 0x51c + +#define A_ULPRX_TDDP_ULIMIT 0x520 +#define A_ULPRX_TDDP_PSZ 0x528 + +#define S_HPZ0 0 +#define M_HPZ0 0xf +#define V_HPZ0(x) ((x) << S_HPZ0) +#define G_HPZ0(x) (((x) >> S_HPZ0) & M_HPZ0) + +#define A_ULPRX_STAG_LLIMIT 0x52c + +#define A_ULPRX_STAG_ULIMIT 0x530 + +#define A_ULPRX_RQ_LLIMIT 0x534 + +#define A_ULPRX_RQ_ULIMIT 0x538 + +#define A_ULPRX_PBL_LLIMIT 0x53c + +#define A_ULPRX_PBL_ULIMIT 0x540 + +#define A_ULPRX_TDDP_TAGMASK 0x524 + +#define A_ULPTX_CONFIG 0x580 + +#define S_CFG_CQE_SOP_MASK 1 +#define V_CFG_CQE_SOP_MASK(x) ((x) << S_CFG_CQE_SOP_MASK) +#define F_CFG_CQE_SOP_MASK V_CFG_CQE_SOP_MASK(1U) + +#define S_CFG_RR_ARB 0 +#define V_CFG_RR_ARB(x) ((x) << S_CFG_RR_ARB) +#define F_CFG_RR_ARB V_CFG_RR_ARB(1U) + +#define A_ULPTX_INT_ENABLE 0x584 + +#define S_PBL_BOUND_ERR_CH1 1 +#define V_PBL_BOUND_ERR_CH1(x) ((x) << S_PBL_BOUND_ERR_CH1) +#define F_PBL_BOUND_ERR_CH1 V_PBL_BOUND_ERR_CH1(1U) + +#define S_PBL_BOUND_ERR_CH0 0 +#define V_PBL_BOUND_ERR_CH0(x) ((x) << S_PBL_BOUND_ERR_CH0) +#define F_PBL_BOUND_ERR_CH0 V_PBL_BOUND_ERR_CH0(1U) + +#define A_ULPTX_INT_CAUSE 0x588 + +#define A_ULPTX_TPT_LLIMIT 0x58c + +#define A_ULPTX_TPT_ULIMIT 0x590 + +#define A_ULPTX_PBL_LLIMIT 0x594 + +#define A_ULPTX_PBL_ULIMIT 0x598 + +#define A_ULPTX_DMA_WEIGHT 0x5ac + +#define S_D1_WEIGHT 16 +#define M_D1_WEIGHT 0xffff +#define V_D1_WEIGHT(x) ((x) << S_D1_WEIGHT) + +#define S_D0_WEIGHT 0 +#define M_D0_WEIGHT 0xffff +#define V_D0_WEIGHT(x) ((x) << S_D0_WEIGHT) + +#define A_PM1_RX_CFG 0x5c0 +#define A_PM1_RX_MODE 0x5c4 + +#define A_PM1_RX_INT_ENABLE 0x5d8 + +#define S_ZERO_E_CMD_ERROR 18 +#define V_ZERO_E_CMD_ERROR(x) ((x) << S_ZERO_E_CMD_ERROR) +#define F_ZERO_E_CMD_ERROR V_ZERO_E_CMD_ERROR(1U) + +#define S_IESPI0_FIFO2X_RX_FRAMING_ERROR 17 +#define V_IESPI0_FIFO2X_RX_FRAMING_ERROR(x) ((x) << S_IESPI0_FIFO2X_RX_FRAMING_ERROR) +#define F_IESPI0_FIFO2X_RX_FRAMING_ERROR V_IESPI0_FIFO2X_RX_FRAMING_ERROR(1U) + +#define S_IESPI1_FIFO2X_RX_FRAMING_ERROR 16 +#define V_IESPI1_FIFO2X_RX_FRAMING_ERROR(x) ((x) << S_IESPI1_FIFO2X_RX_FRAMING_ERROR) +#define F_IESPI1_FIFO2X_RX_FRAMING_ERROR V_IESPI1_FIFO2X_RX_FRAMING_ERROR(1U) + +#define S_IESPI0_RX_FRAMING_ERROR 15 +#define V_IESPI0_RX_FRAMING_ERROR(x) ((x) << S_IESPI0_RX_FRAMING_ERROR) +#define F_IESPI0_RX_FRAMING_ERROR V_IESPI0_RX_FRAMING_ERROR(1U) + +#define S_IESPI1_RX_FRAMING_ERROR 14 +#define V_IESPI1_RX_FRAMING_ERROR(x) ((x) << S_IESPI1_RX_FRAMING_ERROR) +#define F_IESPI1_RX_FRAMING_ERROR V_IESPI1_RX_FRAMING_ERROR(1U) + +#define S_IESPI0_TX_FRAMING_ERROR 13 +#define V_IESPI0_TX_FRAMING_ERROR(x) ((x) << S_IESPI0_TX_FRAMING_ERROR) +#define F_IESPI0_TX_FRAMING_ERROR V_IESPI0_TX_FRAMING_ERROR(1U) + +#define S_IESPI1_TX_FRAMING_ERROR 12 +#define V_IESPI1_TX_FRAMING_ERROR(x) ((x) << S_IESPI1_TX_FRAMING_ERROR) +#define F_IESPI1_TX_FRAMING_ERROR V_IESPI1_TX_FRAMING_ERROR(1U) + +#define S_OCSPI0_RX_FRAMING_ERROR 11 +#define V_OCSPI0_RX_FRAMING_ERROR(x) ((x) << S_OCSPI0_RX_FRAMING_ERROR) +#define F_OCSPI0_RX_FRAMING_ERROR V_OCSPI0_RX_FRAMING_ERROR(1U) + +#define S_OCSPI1_RX_FRAMING_ERROR 10 +#define V_OCSPI1_RX_FRAMING_ERROR(x) ((x) << S_OCSPI1_RX_FRAMING_ERROR) +#define F_OCSPI1_RX_FRAMING_ERROR V_OCSPI1_RX_FRAMING_ERROR(1U) + +#define S_OCSPI0_TX_FRAMING_ERROR 9 +#define V_OCSPI0_TX_FRAMING_ERROR(x) ((x) << S_OCSPI0_TX_FRAMING_ERROR) +#define F_OCSPI0_TX_FRAMING_ERROR V_OCSPI0_TX_FRAMING_ERROR(1U) + +#define S_OCSPI1_TX_FRAMING_ERROR 8 +#define V_OCSPI1_TX_FRAMING_ERROR(x) ((x) << S_OCSPI1_TX_FRAMING_ERROR) +#define F_OCSPI1_TX_FRAMING_ERROR V_OCSPI1_TX_FRAMING_ERROR(1U) + +#define S_OCSPI0_OFIFO2X_TX_FRAMING_ERROR 7 +#define V_OCSPI0_OFIFO2X_TX_FRAMING_ERROR(x) ((x) << S_OCSPI0_OFIFO2X_TX_FRAMING_ERROR) +#define F_OCSPI0_OFIFO2X_TX_FRAMING_ERROR V_OCSPI0_OFIFO2X_TX_FRAMING_ERROR(1U) + +#define S_OCSPI1_OFIFO2X_TX_FRAMING_ERROR 6 +#define V_OCSPI1_OFIFO2X_TX_FRAMING_ERROR(x) ((x) << S_OCSPI1_OFIFO2X_TX_FRAMING_ERROR) +#define F_OCSPI1_OFIFO2X_TX_FRAMING_ERROR V_OCSPI1_OFIFO2X_TX_FRAMING_ERROR(1U) + +#define S_IESPI_PAR_ERROR 3 +#define M_IESPI_PAR_ERROR 0x7 + +#define V_IESPI_PAR_ERROR(x) ((x) << S_IESPI_PAR_ERROR) + +#define S_OCSPI_PAR_ERROR 0 +#define M_OCSPI_PAR_ERROR 0x7 + +#define V_OCSPI_PAR_ERROR(x) ((x) << S_OCSPI_PAR_ERROR) + +#define A_PM1_RX_INT_CAUSE 0x5dc + +#define A_PM1_TX_CFG 0x5e0 +#define A_PM1_TX_MODE 0x5e4 + +#define A_PM1_TX_INT_ENABLE 0x5f8 + +#define S_ZERO_C_CMD_ERROR 18 +#define V_ZERO_C_CMD_ERROR(x) ((x) << S_ZERO_C_CMD_ERROR) +#define F_ZERO_C_CMD_ERROR V_ZERO_C_CMD_ERROR(1U) + +#define S_ICSPI0_FIFO2X_RX_FRAMING_ERROR 17 +#define V_ICSPI0_FIFO2X_RX_FRAMING_ERROR(x) ((x) << S_ICSPI0_FIFO2X_RX_FRAMING_ERROR) +#define F_ICSPI0_FIFO2X_RX_FRAMING_ERROR V_ICSPI0_FIFO2X_RX_FRAMING_ERROR(1U) + +#define S_ICSPI1_FIFO2X_RX_FRAMING_ERROR 16 +#define V_ICSPI1_FIFO2X_RX_FRAMING_ERROR(x) ((x) << S_ICSPI1_FIFO2X_RX_FRAMING_ERROR) +#define F_ICSPI1_FIFO2X_RX_FRAMING_ERROR V_ICSPI1_FIFO2X_RX_FRAMING_ERROR(1U) + +#define S_ICSPI0_RX_FRAMING_ERROR 15 +#define V_ICSPI0_RX_FRAMING_ERROR(x) ((x) << S_ICSPI0_RX_FRAMING_ERROR) +#define F_ICSPI0_RX_FRAMING_ERROR V_ICSPI0_RX_FRAMING_ERROR(1U) + +#define S_ICSPI1_RX_FRAMING_ERROR 14 +#define V_ICSPI1_RX_FRAMING_ERROR(x) ((x) << S_ICSPI1_RX_FRAMING_ERROR) +#define F_ICSPI1_RX_FRAMING_ERROR V_ICSPI1_RX_FRAMING_ERROR(1U) + +#define S_ICSPI0_TX_FRAMING_ERROR 13 +#define V_ICSPI0_TX_FRAMING_ERROR(x) ((x) << S_ICSPI0_TX_FRAMING_ERROR) +#define F_ICSPI0_TX_FRAMING_ERROR V_ICSPI0_TX_FRAMING_ERROR(1U) + +#define S_ICSPI1_TX_FRAMING_ERROR 12 +#define V_ICSPI1_TX_FRAMING_ERROR(x) ((x) << S_ICSPI1_TX_FRAMING_ERROR) +#define F_ICSPI1_TX_FRAMING_ERROR V_ICSPI1_TX_FRAMING_ERROR(1U) + +#define S_OESPI0_RX_FRAMING_ERROR 11 +#define V_OESPI0_RX_FRAMING_ERROR(x) ((x) << S_OESPI0_RX_FRAMING_ERROR) +#define F_OESPI0_RX_FRAMING_ERROR V_OESPI0_RX_FRAMING_ERROR(1U) + +#define S_OESPI1_RX_FRAMING_ERROR 10 +#define V_OESPI1_RX_FRAMING_ERROR(x) ((x) << S_OESPI1_RX_FRAMING_ERROR) +#define F_OESPI1_RX_FRAMING_ERROR V_OESPI1_RX_FRAMING_ERROR(1U) + +#define S_OESPI0_TX_FRAMING_ERROR 9 +#define V_OESPI0_TX_FRAMING_ERROR(x) ((x) << S_OESPI0_TX_FRAMING_ERROR) +#define F_OESPI0_TX_FRAMING_ERROR V_OESPI0_TX_FRAMING_ERROR(1U) + +#define S_OESPI1_TX_FRAMING_ERROR 8 +#define V_OESPI1_TX_FRAMING_ERROR(x) ((x) << S_OESPI1_TX_FRAMING_ERROR) +#define F_OESPI1_TX_FRAMING_ERROR V_OESPI1_TX_FRAMING_ERROR(1U) + +#define S_OESPI0_OFIFO2X_TX_FRAMING_ERROR 7 +#define V_OESPI0_OFIFO2X_TX_FRAMING_ERROR(x) ((x) << S_OESPI0_OFIFO2X_TX_FRAMING_ERROR) +#define F_OESPI0_OFIFO2X_TX_FRAMING_ERROR V_OESPI0_OFIFO2X_TX_FRAMING_ERROR(1U) + +#define S_OESPI1_OFIFO2X_TX_FRAMING_ERROR 6 +#define V_OESPI1_OFIFO2X_TX_FRAMING_ERROR(x) ((x) << S_OESPI1_OFIFO2X_TX_FRAMING_ERROR) +#define F_OESPI1_OFIFO2X_TX_FRAMING_ERROR V_OESPI1_OFIFO2X_TX_FRAMING_ERROR(1U) + +#define S_ICSPI_PAR_ERROR 3 +#define M_ICSPI_PAR_ERROR 0x7 + +#define V_ICSPI_PAR_ERROR(x) ((x) << S_ICSPI_PAR_ERROR) + +#define S_OESPI_PAR_ERROR 0 +#define M_OESPI_PAR_ERROR 0x7 + +#define V_OESPI_PAR_ERROR(x) ((x) << S_OESPI_PAR_ERROR) + +#define A_PM1_TX_INT_CAUSE 0x5fc + +#define A_MPS_CFG 0x600 + +#define S_TPRXPORTEN 4 +#define V_TPRXPORTEN(x) ((x) << S_TPRXPORTEN) +#define F_TPRXPORTEN V_TPRXPORTEN(1U) + +#define S_TPTXPORT1EN 3 +#define V_TPTXPORT1EN(x) ((x) << S_TPTXPORT1EN) +#define F_TPTXPORT1EN V_TPTXPORT1EN(1U) + +#define S_TPTXPORT0EN 2 +#define V_TPTXPORT0EN(x) ((x) << S_TPTXPORT0EN) +#define F_TPTXPORT0EN V_TPTXPORT0EN(1U) + +#define S_PORT1ACTIVE 1 +#define V_PORT1ACTIVE(x) ((x) << S_PORT1ACTIVE) +#define F_PORT1ACTIVE V_PORT1ACTIVE(1U) + +#define S_PORT0ACTIVE 0 +#define V_PORT0ACTIVE(x) ((x) << S_PORT0ACTIVE) +#define F_PORT0ACTIVE V_PORT0ACTIVE(1U) + +#define S_ENFORCEPKT 11 +#define V_ENFORCEPKT(x) ((x) << S_ENFORCEPKT) +#define F_ENFORCEPKT V_ENFORCEPKT(1U) + +#define A_MPS_INT_ENABLE 0x61c + +#define S_MCAPARERRENB 6 +#define M_MCAPARERRENB 0x7 + +#define V_MCAPARERRENB(x) ((x) << S_MCAPARERRENB) + +#define S_RXTPPARERRENB 4 +#define M_RXTPPARERRENB 0x3 + +#define V_RXTPPARERRENB(x) ((x) << S_RXTPPARERRENB) + +#define S_TX1TPPARERRENB 2 +#define M_TX1TPPARERRENB 0x3 + +#define V_TX1TPPARERRENB(x) ((x) << S_TX1TPPARERRENB) + +#define S_TX0TPPARERRENB 0 +#define M_TX0TPPARERRENB 0x3 + +#define V_TX0TPPARERRENB(x) ((x) << S_TX0TPPARERRENB) + +#define A_MPS_INT_CAUSE 0x620 + +#define S_MCAPARERR 6 +#define M_MCAPARERR 0x7 + +#define V_MCAPARERR(x) ((x) << S_MCAPARERR) + +#define S_RXTPPARERR 4 +#define M_RXTPPARERR 0x3 + +#define V_RXTPPARERR(x) ((x) << S_RXTPPARERR) + +#define S_TX1TPPARERR 2 +#define M_TX1TPPARERR 0x3 + +#define V_TX1TPPARERR(x) ((x) << S_TX1TPPARERR) + +#define S_TX0TPPARERR 0 +#define M_TX0TPPARERR 0x3 + +#define V_TX0TPPARERR(x) ((x) << S_TX0TPPARERR) + +#define A_CPL_SWITCH_CNTRL 0x640 + +#define A_CPL_INTR_ENABLE 0x650 + +#define S_CIM_OP_MAP_PERR 5 +#define V_CIM_OP_MAP_PERR(x) ((x) << S_CIM_OP_MAP_PERR) +#define F_CIM_OP_MAP_PERR V_CIM_OP_MAP_PERR(1U) + +#define S_CIM_OVFL_ERROR 4 +#define V_CIM_OVFL_ERROR(x) ((x) << S_CIM_OVFL_ERROR) +#define F_CIM_OVFL_ERROR V_CIM_OVFL_ERROR(1U) + +#define S_TP_FRAMING_ERROR 3 +#define V_TP_FRAMING_ERROR(x) ((x) << S_TP_FRAMING_ERROR) +#define F_TP_FRAMING_ERROR V_TP_FRAMING_ERROR(1U) + +#define S_SGE_FRAMING_ERROR 2 +#define V_SGE_FRAMING_ERROR(x) ((x) << S_SGE_FRAMING_ERROR) +#define F_SGE_FRAMING_ERROR V_SGE_FRAMING_ERROR(1U) + +#define S_CIM_FRAMING_ERROR 1 +#define V_CIM_FRAMING_ERROR(x) ((x) << S_CIM_FRAMING_ERROR) +#define F_CIM_FRAMING_ERROR V_CIM_FRAMING_ERROR(1U) + +#define S_ZERO_SWITCH_ERROR 0 +#define V_ZERO_SWITCH_ERROR(x) ((x) << S_ZERO_SWITCH_ERROR) +#define F_ZERO_SWITCH_ERROR V_ZERO_SWITCH_ERROR(1U) + +#define A_CPL_INTR_CAUSE 0x654 + +#define A_CPL_MAP_TBL_DATA 0x65c + +#define A_SMB_GLOBAL_TIME_CFG 0x660 + +#define A_I2C_CFG 0x6a0 + +#define S_I2C_CLKDIV 0 +#define M_I2C_CLKDIV 0xfff +#define V_I2C_CLKDIV(x) ((x) << S_I2C_CLKDIV) + +#define A_MI1_CFG 0x6b0 + +#define S_CLKDIV 5 +#define M_CLKDIV 0xff +#define V_CLKDIV(x) ((x) << S_CLKDIV) + +#define S_ST 3 + +#define M_ST 0x3 + +#define V_ST(x) ((x) << S_ST) + +#define G_ST(x) (((x) >> S_ST) & M_ST) + +#define S_PREEN 2 +#define V_PREEN(x) ((x) << S_PREEN) +#define F_PREEN V_PREEN(1U) + +#define S_MDIINV 1 +#define V_MDIINV(x) ((x) << S_MDIINV) +#define F_MDIINV V_MDIINV(1U) + +#define S_MDIEN 0 +#define V_MDIEN(x) ((x) << S_MDIEN) +#define F_MDIEN V_MDIEN(1U) + +#define A_MI1_ADDR 0x6b4 + +#define S_PHYADDR 5 +#define M_PHYADDR 0x1f +#define V_PHYADDR(x) ((x) << S_PHYADDR) + +#define S_REGADDR 0 +#define M_REGADDR 0x1f +#define V_REGADDR(x) ((x) << S_REGADDR) + +#define A_MI1_DATA 0x6b8 + +#define A_MI1_OP 0x6bc + +#define S_MDI_OP 0 +#define M_MDI_OP 0x3 +#define V_MDI_OP(x) ((x) << S_MDI_OP) + +#define A_SF_DATA 0x6d8 + +#define A_SF_OP 0x6dc + +#define S_BYTECNT 1 +#define M_BYTECNT 0x3 +#define V_BYTECNT(x) ((x) << S_BYTECNT) + +#define A_PL_INT_ENABLE0 0x6e0 + +#define S_T3DBG 23 +#define V_T3DBG(x) ((x) << S_T3DBG) +#define F_T3DBG V_T3DBG(1U) + +#define S_XGMAC0_1 20 +#define V_XGMAC0_1(x) ((x) << S_XGMAC0_1) +#define F_XGMAC0_1 V_XGMAC0_1(1U) + +#define S_XGMAC0_0 19 +#define V_XGMAC0_0(x) ((x) << S_XGMAC0_0) +#define F_XGMAC0_0 V_XGMAC0_0(1U) + +#define S_MC5A 18 +#define V_MC5A(x) ((x) << S_MC5A) +#define F_MC5A V_MC5A(1U) + +#define S_CPL_SWITCH 12 +#define V_CPL_SWITCH(x) ((x) << S_CPL_SWITCH) +#define F_CPL_SWITCH V_CPL_SWITCH(1U) + +#define S_MPS0 11 +#define V_MPS0(x) ((x) << S_MPS0) +#define F_MPS0 V_MPS0(1U) + +#define S_PM1_TX 10 +#define V_PM1_TX(x) ((x) << S_PM1_TX) +#define F_PM1_TX V_PM1_TX(1U) + +#define S_PM1_RX 9 +#define V_PM1_RX(x) ((x) << S_PM1_RX) +#define F_PM1_RX V_PM1_RX(1U) + +#define S_ULP2_TX 8 +#define V_ULP2_TX(x) ((x) << S_ULP2_TX) +#define F_ULP2_TX V_ULP2_TX(1U) + +#define S_ULP2_RX 7 +#define V_ULP2_RX(x) ((x) << S_ULP2_RX) +#define F_ULP2_RX V_ULP2_RX(1U) + +#define S_TP1 6 +#define V_TP1(x) ((x) << S_TP1) +#define F_TP1 V_TP1(1U) + +#define S_CIM 5 +#define V_CIM(x) ((x) << S_CIM) +#define F_CIM V_CIM(1U) + +#define S_MC7_CM 4 +#define V_MC7_CM(x) ((x) << S_MC7_CM) +#define F_MC7_CM V_MC7_CM(1U) + +#define S_MC7_PMTX 3 +#define V_MC7_PMTX(x) ((x) << S_MC7_PMTX) +#define F_MC7_PMTX V_MC7_PMTX(1U) + +#define S_MC7_PMRX 2 +#define V_MC7_PMRX(x) ((x) << S_MC7_PMRX) +#define F_MC7_PMRX V_MC7_PMRX(1U) + +#define S_PCIM0 1 +#define V_PCIM0(x) ((x) << S_PCIM0) +#define F_PCIM0 V_PCIM0(1U) + +#define S_SGE3 0 +#define V_SGE3(x) ((x) << S_SGE3) +#define F_SGE3 V_SGE3(1U) + +#define A_PL_INT_CAUSE0 0x6e4 + +#define A_PL_RST 0x6f0 + +#define S_FATALPERREN 4 +#define V_FATALPERREN(x) ((x) << S_FATALPERREN) +#define F_FATALPERREN V_FATALPERREN(1U) + +#define S_CRSTWRM 1 +#define V_CRSTWRM(x) ((x) << S_CRSTWRM) +#define F_CRSTWRM V_CRSTWRM(1U) + +#define A_PL_REV 0x6f4 + +#define A_PL_CLI 0x6f8 + +#define A_MC5_DB_CONFIG 0x704 + +#define S_TMTYPEHI 30 +#define V_TMTYPEHI(x) ((x) << S_TMTYPEHI) +#define F_TMTYPEHI V_TMTYPEHI(1U) + +#define S_TMPARTSIZE 28 +#define M_TMPARTSIZE 0x3 +#define V_TMPARTSIZE(x) ((x) << S_TMPARTSIZE) +#define G_TMPARTSIZE(x) (((x) >> S_TMPARTSIZE) & M_TMPARTSIZE) + +#define S_TMTYPE 26 +#define M_TMTYPE 0x3 +#define V_TMTYPE(x) ((x) << S_TMTYPE) +#define G_TMTYPE(x) (((x) >> S_TMTYPE) & M_TMTYPE) + +#define S_COMPEN 17 +#define V_COMPEN(x) ((x) << S_COMPEN) +#define F_COMPEN V_COMPEN(1U) + +#define S_PRTYEN 6 +#define V_PRTYEN(x) ((x) << S_PRTYEN) +#define F_PRTYEN V_PRTYEN(1U) + +#define S_MBUSEN 5 +#define V_MBUSEN(x) ((x) << S_MBUSEN) +#define F_MBUSEN V_MBUSEN(1U) + +#define S_DBGIEN 4 +#define V_DBGIEN(x) ((x) << S_DBGIEN) +#define F_DBGIEN V_DBGIEN(1U) + +#define S_TMRDY 2 +#define V_TMRDY(x) ((x) << S_TMRDY) +#define F_TMRDY V_TMRDY(1U) + +#define S_TMRST 1 +#define V_TMRST(x) ((x) << S_TMRST) +#define F_TMRST V_TMRST(1U) + +#define S_TMMODE 0 +#define V_TMMODE(x) ((x) << S_TMMODE) +#define F_TMMODE V_TMMODE(1U) + +#define A_MC5_DB_ROUTING_TABLE_INDEX 0x70c + +#define A_MC5_DB_FILTER_TABLE 0x710 + +#define A_MC5_DB_SERVER_INDEX 0x714 + +#define A_MC5_DB_RSP_LATENCY 0x720 + +#define S_RDLAT 16 +#define M_RDLAT 0x1f +#define V_RDLAT(x) ((x) << S_RDLAT) + +#define S_LRNLAT 8 +#define M_LRNLAT 0x1f +#define V_LRNLAT(x) ((x) << S_LRNLAT) + +#define S_SRCHLAT 0 +#define M_SRCHLAT 0x1f +#define V_SRCHLAT(x) ((x) << S_SRCHLAT) + +#define A_MC5_DB_PART_ID_INDEX 0x72c + +#define A_MC5_DB_INT_ENABLE 0x740 + +#define S_DELACTEMPTY 18 +#define V_DELACTEMPTY(x) ((x) << S_DELACTEMPTY) +#define F_DELACTEMPTY V_DELACTEMPTY(1U) + +#define S_DISPQPARERR 17 +#define V_DISPQPARERR(x) ((x) << S_DISPQPARERR) +#define F_DISPQPARERR V_DISPQPARERR(1U) + +#define S_REQQPARERR 16 +#define V_REQQPARERR(x) ((x) << S_REQQPARERR) +#define F_REQQPARERR V_REQQPARERR(1U) + +#define S_UNKNOWNCMD 15 +#define V_UNKNOWNCMD(x) ((x) << S_UNKNOWNCMD) +#define F_UNKNOWNCMD V_UNKNOWNCMD(1U) + +#define S_NFASRCHFAIL 8 +#define V_NFASRCHFAIL(x) ((x) << S_NFASRCHFAIL) +#define F_NFASRCHFAIL V_NFASRCHFAIL(1U) + +#define S_ACTRGNFULL 7 +#define V_ACTRGNFULL(x) ((x) << S_ACTRGNFULL) +#define F_ACTRGNFULL V_ACTRGNFULL(1U) + +#define S_PARITYERR 6 +#define V_PARITYERR(x) ((x) << S_PARITYERR) +#define F_PARITYERR V_PARITYERR(1U) + +#define A_MC5_DB_INT_CAUSE 0x744 + +#define A_MC5_DB_DBGI_CONFIG 0x774 + +#define A_MC5_DB_DBGI_REQ_CMD 0x778 + +#define A_MC5_DB_DBGI_REQ_ADDR0 0x77c + +#define A_MC5_DB_DBGI_REQ_ADDR1 0x780 + +#define A_MC5_DB_DBGI_REQ_ADDR2 0x784 + +#define A_MC5_DB_DBGI_REQ_DATA0 0x788 + +#define A_MC5_DB_DBGI_REQ_DATA1 0x78c + +#define A_MC5_DB_DBGI_REQ_DATA2 0x790 + +#define A_MC5_DB_DBGI_RSP_STATUS 0x7b0 + +#define S_DBGIRSPVALID 0 +#define V_DBGIRSPVALID(x) ((x) << S_DBGIRSPVALID) +#define F_DBGIRSPVALID V_DBGIRSPVALID(1U) + +#define A_MC5_DB_DBGI_RSP_DATA0 0x7b4 + +#define A_MC5_DB_DBGI_RSP_DATA1 0x7b8 + +#define A_MC5_DB_DBGI_RSP_DATA2 0x7bc + +#define A_MC5_DB_POPEN_DATA_WR_CMD 0x7cc + +#define A_MC5_DB_POPEN_MASK_WR_CMD 0x7d0 + +#define A_MC5_DB_AOPEN_SRCH_CMD 0x7d4 + +#define A_MC5_DB_AOPEN_LRN_CMD 0x7d8 + +#define A_MC5_DB_SYN_SRCH_CMD 0x7dc + +#define A_MC5_DB_SYN_LRN_CMD 0x7e0 + +#define A_MC5_DB_ACK_SRCH_CMD 0x7e4 + +#define A_MC5_DB_ACK_LRN_CMD 0x7e8 + +#define A_MC5_DB_ILOOKUP_CMD 0x7ec + +#define A_MC5_DB_ELOOKUP_CMD 0x7f0 + +#define A_MC5_DB_DATA_WRITE_CMD 0x7f4 + +#define A_MC5_DB_DATA_READ_CMD 0x7f8 + +#define XGMAC0_0_BASE_ADDR 0x800 + +#define A_XGM_TX_CTRL 0x800 + +#define S_TXEN 0 +#define V_TXEN(x) ((x) << S_TXEN) +#define F_TXEN V_TXEN(1U) + +#define A_XGM_TX_CFG 0x804 + +#define S_TXPAUSEEN 0 +#define V_TXPAUSEEN(x) ((x) << S_TXPAUSEEN) +#define F_TXPAUSEEN V_TXPAUSEEN(1U) + +#define A_XGM_TX_PAUSE_QUANTA 0x808 + +#define A_XGM_RX_CTRL 0x80c + +#define S_RXEN 0 +#define V_RXEN(x) ((x) << S_RXEN) +#define F_RXEN V_RXEN(1U) + +#define A_XGM_RX_CFG 0x810 + +#define S_DISPAUSEFRAMES 9 +#define V_DISPAUSEFRAMES(x) ((x) << S_DISPAUSEFRAMES) +#define F_DISPAUSEFRAMES V_DISPAUSEFRAMES(1U) + +#define S_EN1536BFRAMES 8 +#define V_EN1536BFRAMES(x) ((x) << S_EN1536BFRAMES) +#define F_EN1536BFRAMES V_EN1536BFRAMES(1U) + +#define S_ENJUMBO 7 +#define V_ENJUMBO(x) ((x) << S_ENJUMBO) +#define F_ENJUMBO V_ENJUMBO(1U) + +#define S_RMFCS 6 +#define V_RMFCS(x) ((x) << S_RMFCS) +#define F_RMFCS V_RMFCS(1U) + +#define S_ENHASHMCAST 2 +#define V_ENHASHMCAST(x) ((x) << S_ENHASHMCAST) +#define F_ENHASHMCAST V_ENHASHMCAST(1U) + +#define S_COPYALLFRAMES 0 +#define V_COPYALLFRAMES(x) ((x) << S_COPYALLFRAMES) +#define F_COPYALLFRAMES V_COPYALLFRAMES(1U) + +#define S_DISBCAST 1 +#define V_DISBCAST(x) ((x) << S_DISBCAST) +#define F_DISBCAST V_DISBCAST(1U) + +#define A_XGM_RX_HASH_LOW 0x814 + +#define A_XGM_RX_HASH_HIGH 0x818 + +#define A_XGM_RX_EXACT_MATCH_LOW_1 0x81c + +#define A_XGM_RX_EXACT_MATCH_HIGH_1 0x820 + +#define A_XGM_RX_EXACT_MATCH_LOW_2 0x824 + +#define A_XGM_RX_EXACT_MATCH_LOW_3 0x82c + +#define A_XGM_RX_EXACT_MATCH_LOW_4 0x834 + +#define A_XGM_RX_EXACT_MATCH_LOW_5 0x83c + +#define A_XGM_RX_EXACT_MATCH_LOW_6 0x844 + +#define A_XGM_RX_EXACT_MATCH_LOW_7 0x84c + +#define A_XGM_RX_EXACT_MATCH_LOW_8 0x854 + +#define A_XGM_INT_STATUS 0x86c + +#define S_LINKFAULTCHANGE 9 +#define V_LINKFAULTCHANGE(x) ((x) << S_LINKFAULTCHANGE) +#define F_LINKFAULTCHANGE V_LINKFAULTCHANGE(1U) + +#define A_XGM_XGM_INT_ENABLE 0x874 +#define A_XGM_XGM_INT_DISABLE 0x878 + +#define A_XGM_STAT_CTRL 0x880 + +#define S_CLRSTATS 2 +#define V_CLRSTATS(x) ((x) << S_CLRSTATS) +#define F_CLRSTATS V_CLRSTATS(1U) + +#define A_XGM_RXFIFO_CFG 0x884 + +#define S_RXFIFO_EMPTY 31 +#define V_RXFIFO_EMPTY(x) ((x) << S_RXFIFO_EMPTY) +#define F_RXFIFO_EMPTY V_RXFIFO_EMPTY(1U) + +#define S_RXFIFOPAUSEHWM 17 +#define M_RXFIFOPAUSEHWM 0xfff + +#define V_RXFIFOPAUSEHWM(x) ((x) << S_RXFIFOPAUSEHWM) + +#define G_RXFIFOPAUSEHWM(x) (((x) >> S_RXFIFOPAUSEHWM) & M_RXFIFOPAUSEHWM) + +#define S_RXFIFOPAUSELWM 5 +#define M_RXFIFOPAUSELWM 0xfff + +#define V_RXFIFOPAUSELWM(x) ((x) << S_RXFIFOPAUSELWM) + +#define G_RXFIFOPAUSELWM(x) (((x) >> S_RXFIFOPAUSELWM) & M_RXFIFOPAUSELWM) + +#define S_RXSTRFRWRD 1 +#define V_RXSTRFRWRD(x) ((x) << S_RXSTRFRWRD) +#define F_RXSTRFRWRD V_RXSTRFRWRD(1U) + +#define S_DISERRFRAMES 0 +#define V_DISERRFRAMES(x) ((x) << S_DISERRFRAMES) +#define F_DISERRFRAMES V_DISERRFRAMES(1U) + +#define A_XGM_TXFIFO_CFG 0x888 + +#define S_UNDERUNFIX 22 +#define V_UNDERUNFIX(x) ((x) << S_UNDERUNFIX) +#define F_UNDERUNFIX V_UNDERUNFIX(1U) + +#define S_TXIPG 13 +#define M_TXIPG 0xff +#define V_TXIPG(x) ((x) << S_TXIPG) +#define G_TXIPG(x) (((x) >> S_TXIPG) & M_TXIPG) + +#define S_TXFIFOTHRESH 4 +#define M_TXFIFOTHRESH 0x1ff + +#define V_TXFIFOTHRESH(x) ((x) << S_TXFIFOTHRESH) + +#define S_ENDROPPKT 21 +#define V_ENDROPPKT(x) ((x) << S_ENDROPPKT) +#define F_ENDROPPKT V_ENDROPPKT(1U) + +#define A_XGM_SERDES_CTRL 0x890 +#define A_XGM_SERDES_CTRL0 0x8e0 + +#define S_SERDESRESET_ 24 +#define V_SERDESRESET_(x) ((x) << S_SERDESRESET_) +#define F_SERDESRESET_ V_SERDESRESET_(1U) + +#define S_RXENABLE 4 +#define V_RXENABLE(x) ((x) << S_RXENABLE) +#define F_RXENABLE V_RXENABLE(1U) + +#define S_TXENABLE 3 +#define V_TXENABLE(x) ((x) << S_TXENABLE) +#define F_TXENABLE V_TXENABLE(1U) + +#define A_XGM_PAUSE_TIMER 0x890 + +#define A_XGM_RGMII_IMP 0x89c + +#define S_XGM_IMPSETUPDATE 6 +#define V_XGM_IMPSETUPDATE(x) ((x) << S_XGM_IMPSETUPDATE) +#define F_XGM_IMPSETUPDATE V_XGM_IMPSETUPDATE(1U) + +#define S_RGMIIIMPPD 3 +#define M_RGMIIIMPPD 0x7 +#define V_RGMIIIMPPD(x) ((x) << S_RGMIIIMPPD) + +#define S_RGMIIIMPPU 0 +#define M_RGMIIIMPPU 0x7 +#define V_RGMIIIMPPU(x) ((x) << S_RGMIIIMPPU) + +#define S_CALRESET 8 +#define V_CALRESET(x) ((x) << S_CALRESET) +#define F_CALRESET V_CALRESET(1U) + +#define S_CALUPDATE 7 +#define V_CALUPDATE(x) ((x) << S_CALUPDATE) +#define F_CALUPDATE V_CALUPDATE(1U) + +#define A_XGM_XAUI_IMP 0x8a0 + +#define S_CALBUSY 31 +#define V_CALBUSY(x) ((x) << S_CALBUSY) +#define F_CALBUSY V_CALBUSY(1U) + +#define S_XGM_CALFAULT 29 +#define V_XGM_CALFAULT(x) ((x) << S_XGM_CALFAULT) +#define F_XGM_CALFAULT V_XGM_CALFAULT(1U) + +#define S_CALIMP 24 +#define M_CALIMP 0x1f +#define V_CALIMP(x) ((x) << S_CALIMP) +#define G_CALIMP(x) (((x) >> S_CALIMP) & M_CALIMP) + +#define S_XAUIIMP 0 +#define M_XAUIIMP 0x7 +#define V_XAUIIMP(x) ((x) << S_XAUIIMP) + +#define A_XGM_RX_MAX_PKT_SIZE 0x8a8 + +#define S_RXMAXFRAMERSIZE 17 +#define M_RXMAXFRAMERSIZE 0x3fff +#define V_RXMAXFRAMERSIZE(x) ((x) << S_RXMAXFRAMERSIZE) +#define G_RXMAXFRAMERSIZE(x) (((x) >> S_RXMAXFRAMERSIZE) & M_RXMAXFRAMERSIZE) + +#define S_RXENFRAMER 14 +#define V_RXENFRAMER(x) ((x) << S_RXENFRAMER) +#define F_RXENFRAMER V_RXENFRAMER(1U) + +#define S_RXMAXPKTSIZE 0 +#define M_RXMAXPKTSIZE 0x3fff +#define V_RXMAXPKTSIZE(x) ((x) << S_RXMAXPKTSIZE) +#define G_RXMAXPKTSIZE(x) (((x) >> S_RXMAXPKTSIZE) & M_RXMAXPKTSIZE) + +#define A_XGM_RESET_CTRL 0x8ac + +#define S_XGMAC_STOP_EN 4 +#define V_XGMAC_STOP_EN(x) ((x) << S_XGMAC_STOP_EN) +#define F_XGMAC_STOP_EN V_XGMAC_STOP_EN(1U) + +#define S_XG2G_RESET_ 3 +#define V_XG2G_RESET_(x) ((x) << S_XG2G_RESET_) +#define F_XG2G_RESET_ V_XG2G_RESET_(1U) + +#define S_RGMII_RESET_ 2 +#define V_RGMII_RESET_(x) ((x) << S_RGMII_RESET_) +#define F_RGMII_RESET_ V_RGMII_RESET_(1U) + +#define S_PCS_RESET_ 1 +#define V_PCS_RESET_(x) ((x) << S_PCS_RESET_) +#define F_PCS_RESET_ V_PCS_RESET_(1U) + +#define S_MAC_RESET_ 0 +#define V_MAC_RESET_(x) ((x) << S_MAC_RESET_) +#define F_MAC_RESET_ V_MAC_RESET_(1U) + +#define A_XGM_PORT_CFG 0x8b8 + +#define S_CLKDIVRESET_ 3 +#define V_CLKDIVRESET_(x) ((x) << S_CLKDIVRESET_) +#define F_CLKDIVRESET_ V_CLKDIVRESET_(1U) + +#define S_PORTSPEED 1 +#define M_PORTSPEED 0x3 + +#define V_PORTSPEED(x) ((x) << S_PORTSPEED) + +#define S_ENRGMII 0 +#define V_ENRGMII(x) ((x) << S_ENRGMII) +#define F_ENRGMII V_ENRGMII(1U) + +#define A_XGM_INT_ENABLE 0x8d4 + +#define S_TXFIFO_PRTY_ERR 17 +#define M_TXFIFO_PRTY_ERR 0x7 + +#define V_TXFIFO_PRTY_ERR(x) ((x) << S_TXFIFO_PRTY_ERR) + +#define S_RXFIFO_PRTY_ERR 14 +#define M_RXFIFO_PRTY_ERR 0x7 + +#define V_RXFIFO_PRTY_ERR(x) ((x) << S_RXFIFO_PRTY_ERR) + +#define S_TXFIFO_UNDERRUN 13 +#define V_TXFIFO_UNDERRUN(x) ((x) << S_TXFIFO_UNDERRUN) +#define F_TXFIFO_UNDERRUN V_TXFIFO_UNDERRUN(1U) + +#define S_RXFIFO_OVERFLOW 12 +#define V_RXFIFO_OVERFLOW(x) ((x) << S_RXFIFO_OVERFLOW) +#define F_RXFIFO_OVERFLOW V_RXFIFO_OVERFLOW(1U) + +#define S_SERDES_LOS 4 +#define M_SERDES_LOS 0xf + +#define V_SERDES_LOS(x) ((x) << S_SERDES_LOS) + +#define S_XAUIPCSCTCERR 3 +#define V_XAUIPCSCTCERR(x) ((x) << S_XAUIPCSCTCERR) +#define F_XAUIPCSCTCERR V_XAUIPCSCTCERR(1U) + +#define S_XAUIPCSALIGNCHANGE 2 +#define V_XAUIPCSALIGNCHANGE(x) ((x) << S_XAUIPCSALIGNCHANGE) +#define F_XAUIPCSALIGNCHANGE V_XAUIPCSALIGNCHANGE(1U) + +#define S_XGM_INT 0 +#define V_XGM_INT(x) ((x) << S_XGM_INT) +#define F_XGM_INT V_XGM_INT(1U) + +#define A_XGM_INT_CAUSE 0x8d8 + +#define A_XGM_XAUI_ACT_CTRL 0x8dc + +#define S_TXACTENABLE 1 +#define V_TXACTENABLE(x) ((x) << S_TXACTENABLE) +#define F_TXACTENABLE V_TXACTENABLE(1U) + +#define S_RESET3 23 +#define V_RESET3(x) ((x) << S_RESET3) +#define F_RESET3 V_RESET3(1U) + +#define S_RESET2 22 +#define V_RESET2(x) ((x) << S_RESET2) +#define F_RESET2 V_RESET2(1U) + +#define S_RESET1 21 +#define V_RESET1(x) ((x) << S_RESET1) +#define F_RESET1 V_RESET1(1U) + +#define S_RESET0 20 +#define V_RESET0(x) ((x) << S_RESET0) +#define F_RESET0 V_RESET0(1U) + +#define S_PWRDN3 19 +#define V_PWRDN3(x) ((x) << S_PWRDN3) +#define F_PWRDN3 V_PWRDN3(1U) + +#define S_PWRDN2 18 +#define V_PWRDN2(x) ((x) << S_PWRDN2) +#define F_PWRDN2 V_PWRDN2(1U) + +#define S_PWRDN1 17 +#define V_PWRDN1(x) ((x) << S_PWRDN1) +#define F_PWRDN1 V_PWRDN1(1U) + +#define S_PWRDN0 16 +#define V_PWRDN0(x) ((x) << S_PWRDN0) +#define F_PWRDN0 V_PWRDN0(1U) + +#define S_RESETPLL23 15 +#define V_RESETPLL23(x) ((x) << S_RESETPLL23) +#define F_RESETPLL23 V_RESETPLL23(1U) + +#define S_RESETPLL01 14 +#define V_RESETPLL01(x) ((x) << S_RESETPLL01) +#define F_RESETPLL01 V_RESETPLL01(1U) + +#define A_XGM_SERDES_STAT0 0x8f0 +#define A_XGM_SERDES_STAT1 0x8f4 +#define A_XGM_SERDES_STAT2 0x8f8 + +#define S_LOWSIG0 0 +#define V_LOWSIG0(x) ((x) << S_LOWSIG0) +#define F_LOWSIG0 V_LOWSIG0(1U) + +#define A_XGM_SERDES_STAT3 0x8fc + +#define A_XGM_STAT_TX_BYTE_LOW 0x900 + +#define A_XGM_STAT_TX_BYTE_HIGH 0x904 + +#define A_XGM_STAT_TX_FRAME_LOW 0x908 + +#define A_XGM_STAT_TX_FRAME_HIGH 0x90c + +#define A_XGM_STAT_TX_BCAST 0x910 + +#define A_XGM_STAT_TX_MCAST 0x914 + +#define A_XGM_STAT_TX_PAUSE 0x918 + +#define A_XGM_STAT_TX_64B_FRAMES 0x91c + +#define A_XGM_STAT_TX_65_127B_FRAMES 0x920 + +#define A_XGM_STAT_TX_128_255B_FRAMES 0x924 + +#define A_XGM_STAT_TX_256_511B_FRAMES 0x928 + +#define A_XGM_STAT_TX_512_1023B_FRAMES 0x92c + +#define A_XGM_STAT_TX_1024_1518B_FRAMES 0x930 + +#define A_XGM_STAT_TX_1519_MAXB_FRAMES 0x934 + +#define A_XGM_STAT_TX_ERR_FRAMES 0x938 + +#define A_XGM_STAT_RX_BYTES_LOW 0x93c + +#define A_XGM_STAT_RX_BYTES_HIGH 0x940 + +#define A_XGM_STAT_RX_FRAMES_LOW 0x944 + +#define A_XGM_STAT_RX_FRAMES_HIGH 0x948 + +#define A_XGM_STAT_RX_BCAST_FRAMES 0x94c + +#define A_XGM_STAT_RX_MCAST_FRAMES 0x950 + +#define A_XGM_STAT_RX_PAUSE_FRAMES 0x954 + +#define A_XGM_STAT_RX_64B_FRAMES 0x958 + +#define A_XGM_STAT_RX_65_127B_FRAMES 0x95c + +#define A_XGM_STAT_RX_128_255B_FRAMES 0x960 + +#define A_XGM_STAT_RX_256_511B_FRAMES 0x964 + +#define A_XGM_STAT_RX_512_1023B_FRAMES 0x968 + +#define A_XGM_STAT_RX_1024_1518B_FRAMES 0x96c + +#define A_XGM_STAT_RX_1519_MAXB_FRAMES 0x970 + +#define A_XGM_STAT_RX_SHORT_FRAMES 0x974 + +#define A_XGM_STAT_RX_OVERSIZE_FRAMES 0x978 + +#define A_XGM_STAT_RX_JABBER_FRAMES 0x97c + +#define A_XGM_STAT_RX_CRC_ERR_FRAMES 0x980 + +#define A_XGM_STAT_RX_LENGTH_ERR_FRAMES 0x984 + +#define A_XGM_STAT_RX_SYM_CODE_ERR_FRAMES 0x988 + +#define A_XGM_SERDES_STATUS0 0x98c + +#define A_XGM_SERDES_STATUS1 0x990 + +#define S_CMULOCK 31 +#define V_CMULOCK(x) ((x) << S_CMULOCK) +#define F_CMULOCK V_CMULOCK(1U) + +#define A_XGM_RX_MAX_PKT_SIZE_ERR_CNT 0x9a4 + +#define A_XGM_TX_SPI4_SOP_EOP_CNT 0x9a8 + +#define S_TXSPI4SOPCNT 16 +#define M_TXSPI4SOPCNT 0xffff +#define V_TXSPI4SOPCNT(x) ((x) << S_TXSPI4SOPCNT) +#define G_TXSPI4SOPCNT(x) (((x) >> S_TXSPI4SOPCNT) & M_TXSPI4SOPCNT) + +#define A_XGM_RX_SPI4_SOP_EOP_CNT 0x9ac + +#define XGMAC0_1_BASE_ADDR 0xa00 diff --git a/drivers/net/ethernet/chelsio/cxgb3/sge.c b/drivers/net/ethernet/chelsio/cxgb3/sge.c new file mode 100644 index 0000000..3dfcf60 --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb3/sge.c @@ -0,0 +1,3305 @@ +/* + * Copyright (c) 2005-2008 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "common.h" +#include "regs.h" +#include "sge_defs.h" +#include "t3_cpl.h" +#include "firmware_exports.h" +#include "cxgb3_offload.h" + +#define USE_GTS 0 + +#define SGE_RX_SM_BUF_SIZE 1536 + +#define SGE_RX_COPY_THRES 256 +#define SGE_RX_PULL_LEN 128 + +#define SGE_PG_RSVD SMP_CACHE_BYTES +/* + * Page chunk size for FL0 buffers if FL0 is to be populated with page chunks. + * It must be a divisor of PAGE_SIZE. If set to 0 FL0 will use sk_buffs + * directly. + */ +#define FL0_PG_CHUNK_SIZE 2048 +#define FL0_PG_ORDER 0 +#define FL0_PG_ALLOC_SIZE (PAGE_SIZE << FL0_PG_ORDER) +#define FL1_PG_CHUNK_SIZE (PAGE_SIZE > 8192 ? 16384 : 8192) +#define FL1_PG_ORDER (PAGE_SIZE > 8192 ? 0 : 1) +#define FL1_PG_ALLOC_SIZE (PAGE_SIZE << FL1_PG_ORDER) + +#define SGE_RX_DROP_THRES 16 +#define RX_RECLAIM_PERIOD (HZ/4) + +/* + * Max number of Rx buffers we replenish at a time. + */ +#define MAX_RX_REFILL 16U +/* + * Period of the Tx buffer reclaim timer. This timer does not need to run + * frequently as Tx buffers are usually reclaimed by new Tx packets. + */ +#define TX_RECLAIM_PERIOD (HZ / 4) +#define TX_RECLAIM_TIMER_CHUNK 64U +#define TX_RECLAIM_CHUNK 16U + +/* WR size in bytes */ +#define WR_LEN (WR_FLITS * 8) + +/* + * Types of Tx queues in each queue set. Order here matters, do not change. + */ +enum { TXQ_ETH, TXQ_OFLD, TXQ_CTRL }; + +/* Values for sge_txq.flags */ +enum { + TXQ_RUNNING = 1 << 0, /* fetch engine is running */ + TXQ_LAST_PKT_DB = 1 << 1, /* last packet rang the doorbell */ +}; + +struct tx_desc { + __be64 flit[TX_DESC_FLITS]; +}; + +struct rx_desc { + __be32 addr_lo; + __be32 len_gen; + __be32 gen2; + __be32 addr_hi; +}; + +struct tx_sw_desc { /* SW state per Tx descriptor */ + struct sk_buff *skb; + u8 eop; /* set if last descriptor for packet */ + u8 addr_idx; /* buffer index of first SGL entry in descriptor */ + u8 fragidx; /* first page fragment associated with descriptor */ + s8 sflit; /* start flit of first SGL entry in descriptor */ +}; + +struct rx_sw_desc { /* SW state per Rx descriptor */ + union { + struct sk_buff *skb; + struct fl_pg_chunk pg_chunk; + }; + DEFINE_DMA_UNMAP_ADDR(dma_addr); +}; + +struct rsp_desc { /* response queue descriptor */ + struct rss_header rss_hdr; + __be32 flags; + __be32 len_cq; + u8 imm_data[47]; + u8 intr_gen; +}; + +/* + * Holds unmapping information for Tx packets that need deferred unmapping. + * This structure lives at skb->head and must be allocated by callers. + */ +struct deferred_unmap_info { + struct pci_dev *pdev; + dma_addr_t addr[MAX_SKB_FRAGS + 1]; +}; + +/* + * Maps a number of flits to the number of Tx descriptors that can hold them. + * The formula is + * + * desc = 1 + (flits - 2) / (WR_FLITS - 1). + * + * HW allows up to 4 descriptors to be combined into a WR. + */ +static u8 flit_desc_map[] = { + 0, +#if SGE_NUM_GENBITS == 1 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 +#elif SGE_NUM_GENBITS == 2 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, +#else +# error "SGE_NUM_GENBITS must be 1 or 2" +#endif +}; + +static inline struct sge_qset *fl_to_qset(const struct sge_fl *q, int qidx) +{ + return container_of(q, struct sge_qset, fl[qidx]); +} + +static inline struct sge_qset *rspq_to_qset(const struct sge_rspq *q) +{ + return container_of(q, struct sge_qset, rspq); +} + +static inline struct sge_qset *txq_to_qset(const struct sge_txq *q, int qidx) +{ + return container_of(q, struct sge_qset, txq[qidx]); +} + +/** + * refill_rspq - replenish an SGE response queue + * @adapter: the adapter + * @q: the response queue to replenish + * @credits: how many new responses to make available + * + * Replenishes a response queue by making the supplied number of responses + * available to HW. + */ +static inline void refill_rspq(struct adapter *adapter, + const struct sge_rspq *q, unsigned int credits) +{ + rmb(); + t3_write_reg(adapter, A_SG_RSPQ_CREDIT_RETURN, + V_RSPQ(q->cntxt_id) | V_CREDITS(credits)); +} + +/** + * need_skb_unmap - does the platform need unmapping of sk_buffs? + * + * Returns true if the platform needs sk_buff unmapping. The compiler + * optimizes away unnecessary code if this returns true. + */ +static inline int need_skb_unmap(void) +{ +#ifdef CONFIG_NEED_DMA_MAP_STATE + return 1; +#else + return 0; +#endif +} + +/** + * unmap_skb - unmap a packet main body and its page fragments + * @skb: the packet + * @q: the Tx queue containing Tx descriptors for the packet + * @cidx: index of Tx descriptor + * @pdev: the PCI device + * + * Unmap the main body of an sk_buff and its page fragments, if any. + * Because of the fairly complicated structure of our SGLs and the desire + * to conserve space for metadata, the information necessary to unmap an + * sk_buff is spread across the sk_buff itself (buffer lengths), the HW Tx + * descriptors (the physical addresses of the various data buffers), and + * the SW descriptor state (assorted indices). The send functions + * initialize the indices for the first packet descriptor so we can unmap + * the buffers held in the first Tx descriptor here, and we have enough + * information at this point to set the state for the next Tx descriptor. + * + * Note that it is possible to clean up the first descriptor of a packet + * before the send routines have written the next descriptors, but this + * race does not cause any problem. We just end up writing the unmapping + * info for the descriptor first. + */ +static inline void unmap_skb(struct sk_buff *skb, struct sge_txq *q, + unsigned int cidx, struct pci_dev *pdev) +{ + const struct sg_ent *sgp; + struct tx_sw_desc *d = &q->sdesc[cidx]; + int nfrags, frag_idx, curflit, j = d->addr_idx; + + sgp = (struct sg_ent *)&q->desc[cidx].flit[d->sflit]; + frag_idx = d->fragidx; + + if (frag_idx == 0 && skb_headlen(skb)) { + pci_unmap_single(pdev, be64_to_cpu(sgp->addr[0]), + skb_headlen(skb), PCI_DMA_TODEVICE); + j = 1; + } + + curflit = d->sflit + 1 + j; + nfrags = skb_shinfo(skb)->nr_frags; + + while (frag_idx < nfrags && curflit < WR_FLITS) { + pci_unmap_page(pdev, be64_to_cpu(sgp->addr[j]), + skb_frag_size(&skb_shinfo(skb)->frags[frag_idx]), + PCI_DMA_TODEVICE); + j ^= 1; + if (j == 0) { + sgp++; + curflit++; + } + curflit++; + frag_idx++; + } + + if (frag_idx < nfrags) { /* SGL continues into next Tx descriptor */ + d = cidx + 1 == q->size ? q->sdesc : d + 1; + d->fragidx = frag_idx; + d->addr_idx = j; + d->sflit = curflit - WR_FLITS - j; /* sflit can be -1 */ + } +} + +/** + * free_tx_desc - reclaims Tx descriptors and their buffers + * @adapter: the adapter + * @q: the Tx queue to reclaim descriptors from + * @n: the number of descriptors to reclaim + * + * Reclaims Tx descriptors from an SGE Tx queue and frees the associated + * Tx buffers. Called with the Tx queue lock held. + */ +static void free_tx_desc(struct adapter *adapter, struct sge_txq *q, + unsigned int n) +{ + struct tx_sw_desc *d; + struct pci_dev *pdev = adapter->pdev; + unsigned int cidx = q->cidx; + + const int need_unmap = need_skb_unmap() && + q->cntxt_id >= FW_TUNNEL_SGEEC_START; + + d = &q->sdesc[cidx]; + while (n--) { + if (d->skb) { /* an SGL is present */ + if (need_unmap) + unmap_skb(d->skb, q, cidx, pdev); + if (d->eop) { + dev_consume_skb_any(d->skb); + d->skb = NULL; + } + } + ++d; + if (++cidx == q->size) { + cidx = 0; + d = q->sdesc; + } + } + q->cidx = cidx; +} + +/** + * reclaim_completed_tx - reclaims completed Tx descriptors + * @adapter: the adapter + * @q: the Tx queue to reclaim completed descriptors from + * @chunk: maximum number of descriptors to reclaim + * + * Reclaims Tx descriptors that the SGE has indicated it has processed, + * and frees the associated buffers if possible. Called with the Tx + * queue's lock held. + */ +static inline unsigned int reclaim_completed_tx(struct adapter *adapter, + struct sge_txq *q, + unsigned int chunk) +{ + unsigned int reclaim = q->processed - q->cleaned; + + reclaim = min(chunk, reclaim); + if (reclaim) { + free_tx_desc(adapter, q, reclaim); + q->cleaned += reclaim; + q->in_use -= reclaim; + } + return q->processed - q->cleaned; +} + +/** + * should_restart_tx - are there enough resources to restart a Tx queue? + * @q: the Tx queue + * + * Checks if there are enough descriptors to restart a suspended Tx queue. + */ +static inline int should_restart_tx(const struct sge_txq *q) +{ + unsigned int r = q->processed - q->cleaned; + + return q->in_use - r < (q->size >> 1); +} + +static void clear_rx_desc(struct pci_dev *pdev, const struct sge_fl *q, + struct rx_sw_desc *d) +{ + if (q->use_pages && d->pg_chunk.page) { + (*d->pg_chunk.p_cnt)--; + if (!*d->pg_chunk.p_cnt) + pci_unmap_page(pdev, + d->pg_chunk.mapping, + q->alloc_size, PCI_DMA_FROMDEVICE); + + put_page(d->pg_chunk.page); + d->pg_chunk.page = NULL; + } else { + pci_unmap_single(pdev, dma_unmap_addr(d, dma_addr), + q->buf_size, PCI_DMA_FROMDEVICE); + kfree_skb(d->skb); + d->skb = NULL; + } +} + +/** + * free_rx_bufs - free the Rx buffers on an SGE free list + * @pdev: the PCI device associated with the adapter + * @rxq: the SGE free list to clean up + * + * Release the buffers on an SGE free-buffer Rx queue. HW fetching from + * this queue should be stopped before calling this function. + */ +static void free_rx_bufs(struct pci_dev *pdev, struct sge_fl *q) +{ + unsigned int cidx = q->cidx; + + while (q->credits--) { + struct rx_sw_desc *d = &q->sdesc[cidx]; + + + clear_rx_desc(pdev, q, d); + if (++cidx == q->size) + cidx = 0; + } + + if (q->pg_chunk.page) { + __free_pages(q->pg_chunk.page, q->order); + q->pg_chunk.page = NULL; + } +} + +/** + * add_one_rx_buf - add a packet buffer to a free-buffer list + * @va: buffer start VA + * @len: the buffer length + * @d: the HW Rx descriptor to write + * @sd: the SW Rx descriptor to write + * @gen: the generation bit value + * @pdev: the PCI device associated with the adapter + * + * Add a buffer of the given length to the supplied HW and SW Rx + * descriptors. + */ +static inline int add_one_rx_buf(void *va, unsigned int len, + struct rx_desc *d, struct rx_sw_desc *sd, + unsigned int gen, struct pci_dev *pdev) +{ + dma_addr_t mapping; + + mapping = pci_map_single(pdev, va, len, PCI_DMA_FROMDEVICE); + if (unlikely(pci_dma_mapping_error(pdev, mapping))) + return -ENOMEM; + + dma_unmap_addr_set(sd, dma_addr, mapping); + + d->addr_lo = cpu_to_be32(mapping); + d->addr_hi = cpu_to_be32((u64) mapping >> 32); + wmb(); + d->len_gen = cpu_to_be32(V_FLD_GEN1(gen)); + d->gen2 = cpu_to_be32(V_FLD_GEN2(gen)); + return 0; +} + +static inline int add_one_rx_chunk(dma_addr_t mapping, struct rx_desc *d, + unsigned int gen) +{ + d->addr_lo = cpu_to_be32(mapping); + d->addr_hi = cpu_to_be32((u64) mapping >> 32); + wmb(); + d->len_gen = cpu_to_be32(V_FLD_GEN1(gen)); + d->gen2 = cpu_to_be32(V_FLD_GEN2(gen)); + return 0; +} + +static int alloc_pg_chunk(struct adapter *adapter, struct sge_fl *q, + struct rx_sw_desc *sd, gfp_t gfp, + unsigned int order) +{ + if (!q->pg_chunk.page) { + dma_addr_t mapping; + + q->pg_chunk.page = alloc_pages(gfp, order); + if (unlikely(!q->pg_chunk.page)) + return -ENOMEM; + q->pg_chunk.va = page_address(q->pg_chunk.page); + q->pg_chunk.p_cnt = q->pg_chunk.va + (PAGE_SIZE << order) - + SGE_PG_RSVD; + q->pg_chunk.offset = 0; + mapping = pci_map_page(adapter->pdev, q->pg_chunk.page, + 0, q->alloc_size, PCI_DMA_FROMDEVICE); + q->pg_chunk.mapping = mapping; + } + sd->pg_chunk = q->pg_chunk; + + prefetch(sd->pg_chunk.p_cnt); + + q->pg_chunk.offset += q->buf_size; + if (q->pg_chunk.offset == (PAGE_SIZE << order)) + q->pg_chunk.page = NULL; + else { + q->pg_chunk.va += q->buf_size; + get_page(q->pg_chunk.page); + } + + if (sd->pg_chunk.offset == 0) + *sd->pg_chunk.p_cnt = 1; + else + *sd->pg_chunk.p_cnt += 1; + + return 0; +} + +static inline void ring_fl_db(struct adapter *adap, struct sge_fl *q) +{ + if (q->pend_cred >= q->credits / 4) { + q->pend_cred = 0; + wmb(); + t3_write_reg(adap, A_SG_KDOORBELL, V_EGRCNTX(q->cntxt_id)); + } +} + +/** + * refill_fl - refill an SGE free-buffer list + * @adapter: the adapter + * @q: the free-list to refill + * @n: the number of new buffers to allocate + * @gfp: the gfp flags for allocating new buffers + * + * (Re)populate an SGE free-buffer list with up to @n new packet buffers, + * allocated with the supplied gfp flags. The caller must assure that + * @n does not exceed the queue's capacity. + */ +static int refill_fl(struct adapter *adap, struct sge_fl *q, int n, gfp_t gfp) +{ + struct rx_sw_desc *sd = &q->sdesc[q->pidx]; + struct rx_desc *d = &q->desc[q->pidx]; + unsigned int count = 0; + + while (n--) { + dma_addr_t mapping; + int err; + + if (q->use_pages) { + if (unlikely(alloc_pg_chunk(adap, q, sd, gfp, + q->order))) { +nomem: q->alloc_failed++; + break; + } + mapping = sd->pg_chunk.mapping + sd->pg_chunk.offset; + dma_unmap_addr_set(sd, dma_addr, mapping); + + add_one_rx_chunk(mapping, d, q->gen); + pci_dma_sync_single_for_device(adap->pdev, mapping, + q->buf_size - SGE_PG_RSVD, + PCI_DMA_FROMDEVICE); + } else { + void *buf_start; + + struct sk_buff *skb = alloc_skb(q->buf_size, gfp); + if (!skb) + goto nomem; + + sd->skb = skb; + buf_start = skb->data; + err = add_one_rx_buf(buf_start, q->buf_size, d, sd, + q->gen, adap->pdev); + if (unlikely(err)) { + clear_rx_desc(adap->pdev, q, sd); + break; + } + } + + d++; + sd++; + if (++q->pidx == q->size) { + q->pidx = 0; + q->gen ^= 1; + sd = q->sdesc; + d = q->desc; + } + count++; + } + + q->credits += count; + q->pend_cred += count; + ring_fl_db(adap, q); + + return count; +} + +static inline void __refill_fl(struct adapter *adap, struct sge_fl *fl) +{ + refill_fl(adap, fl, min(MAX_RX_REFILL, fl->size - fl->credits), + GFP_ATOMIC | __GFP_COMP); +} + +/** + * recycle_rx_buf - recycle a receive buffer + * @adapter: the adapter + * @q: the SGE free list + * @idx: index of buffer to recycle + * + * Recycles the specified buffer on the given free list by adding it at + * the next available slot on the list. + */ +static void recycle_rx_buf(struct adapter *adap, struct sge_fl *q, + unsigned int idx) +{ + struct rx_desc *from = &q->desc[idx]; + struct rx_desc *to = &q->desc[q->pidx]; + + q->sdesc[q->pidx] = q->sdesc[idx]; + to->addr_lo = from->addr_lo; /* already big endian */ + to->addr_hi = from->addr_hi; /* likewise */ + wmb(); + to->len_gen = cpu_to_be32(V_FLD_GEN1(q->gen)); + to->gen2 = cpu_to_be32(V_FLD_GEN2(q->gen)); + + if (++q->pidx == q->size) { + q->pidx = 0; + q->gen ^= 1; + } + + q->credits++; + q->pend_cred++; + ring_fl_db(adap, q); +} + +/** + * alloc_ring - allocate resources for an SGE descriptor ring + * @pdev: the PCI device + * @nelem: the number of descriptors + * @elem_size: the size of each descriptor + * @sw_size: the size of the SW state associated with each ring element + * @phys: the physical address of the allocated ring + * @metadata: address of the array holding the SW state for the ring + * + * Allocates resources for an SGE descriptor ring, such as Tx queues, + * free buffer lists, or response queues. Each SGE ring requires + * space for its HW descriptors plus, optionally, space for the SW state + * associated with each HW entry (the metadata). The function returns + * three values: the virtual address for the HW ring (the return value + * of the function), the physical address of the HW ring, and the address + * of the SW ring. + */ +static void *alloc_ring(struct pci_dev *pdev, size_t nelem, size_t elem_size, + size_t sw_size, dma_addr_t * phys, void *metadata) +{ + size_t len = nelem * elem_size; + void *s = NULL; + void *p = dma_alloc_coherent(&pdev->dev, len, phys, GFP_KERNEL); + + if (!p) + return NULL; + if (sw_size && metadata) { + s = kcalloc(nelem, sw_size, GFP_KERNEL); + + if (!s) { + dma_free_coherent(&pdev->dev, len, p, *phys); + return NULL; + } + *(void **)metadata = s; + } + memset(p, 0, len); + return p; +} + +/** + * t3_reset_qset - reset a sge qset + * @q: the queue set + * + * Reset the qset structure. + * the NAPI structure is preserved in the event of + * the qset's reincarnation, for example during EEH recovery. + */ +static void t3_reset_qset(struct sge_qset *q) +{ + if (q->adap && + !(q->adap->flags & NAPI_INIT)) { + memset(q, 0, sizeof(*q)); + return; + } + + q->adap = NULL; + memset(&q->rspq, 0, sizeof(q->rspq)); + memset(q->fl, 0, sizeof(struct sge_fl) * SGE_RXQ_PER_SET); + memset(q->txq, 0, sizeof(struct sge_txq) * SGE_TXQ_PER_SET); + q->txq_stopped = 0; + q->tx_reclaim_timer.function = NULL; /* for t3_stop_sge_timers() */ + q->rx_reclaim_timer.function = NULL; + q->nomem = 0; + napi_free_frags(&q->napi); +} + + +/** + * free_qset - free the resources of an SGE queue set + * @adapter: the adapter owning the queue set + * @q: the queue set + * + * Release the HW and SW resources associated with an SGE queue set, such + * as HW contexts, packet buffers, and descriptor rings. Traffic to the + * queue set must be quiesced prior to calling this. + */ +static void t3_free_qset(struct adapter *adapter, struct sge_qset *q) +{ + int i; + struct pci_dev *pdev = adapter->pdev; + + for (i = 0; i < SGE_RXQ_PER_SET; ++i) + if (q->fl[i].desc) { + spin_lock_irq(&adapter->sge.reg_lock); + t3_sge_disable_fl(adapter, q->fl[i].cntxt_id); + spin_unlock_irq(&adapter->sge.reg_lock); + free_rx_bufs(pdev, &q->fl[i]); + kfree(q->fl[i].sdesc); + dma_free_coherent(&pdev->dev, + q->fl[i].size * + sizeof(struct rx_desc), q->fl[i].desc, + q->fl[i].phys_addr); + } + + for (i = 0; i < SGE_TXQ_PER_SET; ++i) + if (q->txq[i].desc) { + spin_lock_irq(&adapter->sge.reg_lock); + t3_sge_enable_ecntxt(adapter, q->txq[i].cntxt_id, 0); + spin_unlock_irq(&adapter->sge.reg_lock); + if (q->txq[i].sdesc) { + free_tx_desc(adapter, &q->txq[i], + q->txq[i].in_use); + kfree(q->txq[i].sdesc); + } + dma_free_coherent(&pdev->dev, + q->txq[i].size * + sizeof(struct tx_desc), + q->txq[i].desc, q->txq[i].phys_addr); + __skb_queue_purge(&q->txq[i].sendq); + } + + if (q->rspq.desc) { + spin_lock_irq(&adapter->sge.reg_lock); + t3_sge_disable_rspcntxt(adapter, q->rspq.cntxt_id); + spin_unlock_irq(&adapter->sge.reg_lock); + dma_free_coherent(&pdev->dev, + q->rspq.size * sizeof(struct rsp_desc), + q->rspq.desc, q->rspq.phys_addr); + } + + t3_reset_qset(q); +} + +/** + * init_qset_cntxt - initialize an SGE queue set context info + * @qs: the queue set + * @id: the queue set id + * + * Initializes the TIDs and context ids for the queues of a queue set. + */ +static void init_qset_cntxt(struct sge_qset *qs, unsigned int id) +{ + qs->rspq.cntxt_id = id; + qs->fl[0].cntxt_id = 2 * id; + qs->fl[1].cntxt_id = 2 * id + 1; + qs->txq[TXQ_ETH].cntxt_id = FW_TUNNEL_SGEEC_START + id; + qs->txq[TXQ_ETH].token = FW_TUNNEL_TID_START + id; + qs->txq[TXQ_OFLD].cntxt_id = FW_OFLD_SGEEC_START + id; + qs->txq[TXQ_CTRL].cntxt_id = FW_CTRL_SGEEC_START + id; + qs->txq[TXQ_CTRL].token = FW_CTRL_TID_START + id; +} + +/** + * sgl_len - calculates the size of an SGL of the given capacity + * @n: the number of SGL entries + * + * Calculates the number of flits needed for a scatter/gather list that + * can hold the given number of entries. + */ +static inline unsigned int sgl_len(unsigned int n) +{ + /* alternatively: 3 * (n / 2) + 2 * (n & 1) */ + return (3 * n) / 2 + (n & 1); +} + +/** + * flits_to_desc - returns the num of Tx descriptors for the given flits + * @n: the number of flits + * + * Calculates the number of Tx descriptors needed for the supplied number + * of flits. + */ +static inline unsigned int flits_to_desc(unsigned int n) +{ + BUG_ON(n >= ARRAY_SIZE(flit_desc_map)); + return flit_desc_map[n]; +} + +/** + * get_packet - return the next ingress packet buffer from a free list + * @adap: the adapter that received the packet + * @fl: the SGE free list holding the packet + * @len: the packet length including any SGE padding + * @drop_thres: # of remaining buffers before we start dropping packets + * + * Get the next packet from a free list and complete setup of the + * sk_buff. If the packet is small we make a copy and recycle the + * original buffer, otherwise we use the original buffer itself. If a + * positive drop threshold is supplied packets are dropped and their + * buffers recycled if (a) the number of remaining buffers is under the + * threshold and the packet is too big to copy, or (b) the packet should + * be copied but there is no memory for the copy. + */ +static struct sk_buff *get_packet(struct adapter *adap, struct sge_fl *fl, + unsigned int len, unsigned int drop_thres) +{ + struct sk_buff *skb = NULL; + struct rx_sw_desc *sd = &fl->sdesc[fl->cidx]; + + prefetch(sd->skb->data); + fl->credits--; + + if (len <= SGE_RX_COPY_THRES) { + skb = alloc_skb(len, GFP_ATOMIC); + if (likely(skb != NULL)) { + __skb_put(skb, len); + pci_dma_sync_single_for_cpu(adap->pdev, + dma_unmap_addr(sd, dma_addr), len, + PCI_DMA_FROMDEVICE); + memcpy(skb->data, sd->skb->data, len); + pci_dma_sync_single_for_device(adap->pdev, + dma_unmap_addr(sd, dma_addr), len, + PCI_DMA_FROMDEVICE); + } else if (!drop_thres) + goto use_orig_buf; +recycle: + recycle_rx_buf(adap, fl, fl->cidx); + return skb; + } + + if (unlikely(fl->credits < drop_thres) && + refill_fl(adap, fl, min(MAX_RX_REFILL, fl->size - fl->credits - 1), + GFP_ATOMIC | __GFP_COMP) == 0) + goto recycle; + +use_orig_buf: + pci_unmap_single(adap->pdev, dma_unmap_addr(sd, dma_addr), + fl->buf_size, PCI_DMA_FROMDEVICE); + skb = sd->skb; + skb_put(skb, len); + __refill_fl(adap, fl); + return skb; +} + +/** + * get_packet_pg - return the next ingress packet buffer from a free list + * @adap: the adapter that received the packet + * @fl: the SGE free list holding the packet + * @len: the packet length including any SGE padding + * @drop_thres: # of remaining buffers before we start dropping packets + * + * Get the next packet from a free list populated with page chunks. + * If the packet is small we make a copy and recycle the original buffer, + * otherwise we attach the original buffer as a page fragment to a fresh + * sk_buff. If a positive drop threshold is supplied packets are dropped + * and their buffers recycled if (a) the number of remaining buffers is + * under the threshold and the packet is too big to copy, or (b) there's + * no system memory. + * + * Note: this function is similar to @get_packet but deals with Rx buffers + * that are page chunks rather than sk_buffs. + */ +static struct sk_buff *get_packet_pg(struct adapter *adap, struct sge_fl *fl, + struct sge_rspq *q, unsigned int len, + unsigned int drop_thres) +{ + struct sk_buff *newskb, *skb; + struct rx_sw_desc *sd = &fl->sdesc[fl->cidx]; + + dma_addr_t dma_addr = dma_unmap_addr(sd, dma_addr); + + newskb = skb = q->pg_skb; + if (!skb && (len <= SGE_RX_COPY_THRES)) { + newskb = alloc_skb(len, GFP_ATOMIC); + if (likely(newskb != NULL)) { + __skb_put(newskb, len); + pci_dma_sync_single_for_cpu(adap->pdev, dma_addr, len, + PCI_DMA_FROMDEVICE); + memcpy(newskb->data, sd->pg_chunk.va, len); + pci_dma_sync_single_for_device(adap->pdev, dma_addr, + len, + PCI_DMA_FROMDEVICE); + } else if (!drop_thres) + return NULL; +recycle: + fl->credits--; + recycle_rx_buf(adap, fl, fl->cidx); + q->rx_recycle_buf++; + return newskb; + } + + if (unlikely(q->rx_recycle_buf || (!skb && fl->credits <= drop_thres))) + goto recycle; + + prefetch(sd->pg_chunk.p_cnt); + + if (!skb) + newskb = alloc_skb(SGE_RX_PULL_LEN, GFP_ATOMIC); + + if (unlikely(!newskb)) { + if (!drop_thres) + return NULL; + goto recycle; + } + + pci_dma_sync_single_for_cpu(adap->pdev, dma_addr, len, + PCI_DMA_FROMDEVICE); + (*sd->pg_chunk.p_cnt)--; + if (!*sd->pg_chunk.p_cnt && sd->pg_chunk.page != fl->pg_chunk.page) + pci_unmap_page(adap->pdev, + sd->pg_chunk.mapping, + fl->alloc_size, + PCI_DMA_FROMDEVICE); + if (!skb) { + __skb_put(newskb, SGE_RX_PULL_LEN); + memcpy(newskb->data, sd->pg_chunk.va, SGE_RX_PULL_LEN); + skb_fill_page_desc(newskb, 0, sd->pg_chunk.page, + sd->pg_chunk.offset + SGE_RX_PULL_LEN, + len - SGE_RX_PULL_LEN); + newskb->len = len; + newskb->data_len = len - SGE_RX_PULL_LEN; + newskb->truesize += newskb->data_len; + } else { + skb_fill_page_desc(newskb, skb_shinfo(newskb)->nr_frags, + sd->pg_chunk.page, + sd->pg_chunk.offset, len); + newskb->len += len; + newskb->data_len += len; + newskb->truesize += len; + } + + fl->credits--; + /* + * We do not refill FLs here, we let the caller do it to overlap a + * prefetch. + */ + return newskb; +} + +/** + * get_imm_packet - return the next ingress packet buffer from a response + * @resp: the response descriptor containing the packet data + * + * Return a packet containing the immediate data of the given response. + */ +static inline struct sk_buff *get_imm_packet(const struct rsp_desc *resp) +{ + struct sk_buff *skb = alloc_skb(IMMED_PKT_SIZE, GFP_ATOMIC); + + if (skb) { + __skb_put(skb, IMMED_PKT_SIZE); + skb_copy_to_linear_data(skb, resp->imm_data, IMMED_PKT_SIZE); + } + return skb; +} + +/** + * calc_tx_descs - calculate the number of Tx descriptors for a packet + * @skb: the packet + * + * Returns the number of Tx descriptors needed for the given Ethernet + * packet. Ethernet packets require addition of WR and CPL headers. + */ +static inline unsigned int calc_tx_descs(const struct sk_buff *skb) +{ + unsigned int flits; + + if (skb->len <= WR_LEN - sizeof(struct cpl_tx_pkt)) + return 1; + + flits = sgl_len(skb_shinfo(skb)->nr_frags + 1) + 2; + if (skb_shinfo(skb)->gso_size) + flits++; + return flits_to_desc(flits); +} + +/** + * make_sgl - populate a scatter/gather list for a packet + * @skb: the packet + * @sgp: the SGL to populate + * @start: start address of skb main body data to include in the SGL + * @len: length of skb main body data to include in the SGL + * @pdev: the PCI device + * + * Generates a scatter/gather list for the buffers that make up a packet + * and returns the SGL size in 8-byte words. The caller must size the SGL + * appropriately. + */ +static inline unsigned int make_sgl(const struct sk_buff *skb, + struct sg_ent *sgp, unsigned char *start, + unsigned int len, struct pci_dev *pdev) +{ + dma_addr_t mapping; + unsigned int i, j = 0, nfrags; + + if (len) { + mapping = pci_map_single(pdev, start, len, PCI_DMA_TODEVICE); + sgp->len[0] = cpu_to_be32(len); + sgp->addr[0] = cpu_to_be64(mapping); + j = 1; + } + + nfrags = skb_shinfo(skb)->nr_frags; + for (i = 0; i < nfrags; i++) { + const skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + + mapping = skb_frag_dma_map(&pdev->dev, frag, 0, skb_frag_size(frag), + DMA_TO_DEVICE); + sgp->len[j] = cpu_to_be32(skb_frag_size(frag)); + sgp->addr[j] = cpu_to_be64(mapping); + j ^= 1; + if (j == 0) + ++sgp; + } + if (j) + sgp->len[j] = 0; + return ((nfrags + (len != 0)) * 3) / 2 + j; +} + +/** + * check_ring_tx_db - check and potentially ring a Tx queue's doorbell + * @adap: the adapter + * @q: the Tx queue + * + * Ring the doorbel if a Tx queue is asleep. There is a natural race, + * where the HW is going to sleep just after we checked, however, + * then the interrupt handler will detect the outstanding TX packet + * and ring the doorbell for us. + * + * When GTS is disabled we unconditionally ring the doorbell. + */ +static inline void check_ring_tx_db(struct adapter *adap, struct sge_txq *q) +{ +#if USE_GTS + clear_bit(TXQ_LAST_PKT_DB, &q->flags); + if (test_and_set_bit(TXQ_RUNNING, &q->flags) == 0) { + set_bit(TXQ_LAST_PKT_DB, &q->flags); + t3_write_reg(adap, A_SG_KDOORBELL, + F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id)); + } +#else + wmb(); /* write descriptors before telling HW */ + t3_write_reg(adap, A_SG_KDOORBELL, + F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id)); +#endif +} + +static inline void wr_gen2(struct tx_desc *d, unsigned int gen) +{ +#if SGE_NUM_GENBITS == 2 + d->flit[TX_DESC_FLITS - 1] = cpu_to_be64(gen); +#endif +} + +/** + * write_wr_hdr_sgl - write a WR header and, optionally, SGL + * @ndesc: number of Tx descriptors spanned by the SGL + * @skb: the packet corresponding to the WR + * @d: first Tx descriptor to be written + * @pidx: index of above descriptors + * @q: the SGE Tx queue + * @sgl: the SGL + * @flits: number of flits to the start of the SGL in the first descriptor + * @sgl_flits: the SGL size in flits + * @gen: the Tx descriptor generation + * @wr_hi: top 32 bits of WR header based on WR type (big endian) + * @wr_lo: low 32 bits of WR header based on WR type (big endian) + * + * Write a work request header and an associated SGL. If the SGL is + * small enough to fit into one Tx descriptor it has already been written + * and we just need to write the WR header. Otherwise we distribute the + * SGL across the number of descriptors it spans. + */ +static void write_wr_hdr_sgl(unsigned int ndesc, struct sk_buff *skb, + struct tx_desc *d, unsigned int pidx, + const struct sge_txq *q, + const struct sg_ent *sgl, + unsigned int flits, unsigned int sgl_flits, + unsigned int gen, __be32 wr_hi, + __be32 wr_lo) +{ + struct work_request_hdr *wrp = (struct work_request_hdr *)d; + struct tx_sw_desc *sd = &q->sdesc[pidx]; + + sd->skb = skb; + if (need_skb_unmap()) { + sd->fragidx = 0; + sd->addr_idx = 0; + sd->sflit = flits; + } + + if (likely(ndesc == 1)) { + sd->eop = 1; + wrp->wr_hi = htonl(F_WR_SOP | F_WR_EOP | V_WR_DATATYPE(1) | + V_WR_SGLSFLT(flits)) | wr_hi; + wmb(); + wrp->wr_lo = htonl(V_WR_LEN(flits + sgl_flits) | + V_WR_GEN(gen)) | wr_lo; + wr_gen2(d, gen); + } else { + unsigned int ogen = gen; + const u64 *fp = (const u64 *)sgl; + struct work_request_hdr *wp = wrp; + + wrp->wr_hi = htonl(F_WR_SOP | V_WR_DATATYPE(1) | + V_WR_SGLSFLT(flits)) | wr_hi; + + while (sgl_flits) { + unsigned int avail = WR_FLITS - flits; + + if (avail > sgl_flits) + avail = sgl_flits; + memcpy(&d->flit[flits], fp, avail * sizeof(*fp)); + sgl_flits -= avail; + ndesc--; + if (!sgl_flits) + break; + + fp += avail; + d++; + sd->eop = 0; + sd++; + if (++pidx == q->size) { + pidx = 0; + gen ^= 1; + d = q->desc; + sd = q->sdesc; + } + + sd->skb = skb; + wrp = (struct work_request_hdr *)d; + wrp->wr_hi = htonl(V_WR_DATATYPE(1) | + V_WR_SGLSFLT(1)) | wr_hi; + wrp->wr_lo = htonl(V_WR_LEN(min(WR_FLITS, + sgl_flits + 1)) | + V_WR_GEN(gen)) | wr_lo; + wr_gen2(d, gen); + flits = 1; + } + sd->eop = 1; + wrp->wr_hi |= htonl(F_WR_EOP); + wmb(); + wp->wr_lo = htonl(V_WR_LEN(WR_FLITS) | V_WR_GEN(ogen)) | wr_lo; + wr_gen2((struct tx_desc *)wp, ogen); + WARN_ON(ndesc != 0); + } +} + +/** + * write_tx_pkt_wr - write a TX_PKT work request + * @adap: the adapter + * @skb: the packet to send + * @pi: the egress interface + * @pidx: index of the first Tx descriptor to write + * @gen: the generation value to use + * @q: the Tx queue + * @ndesc: number of descriptors the packet will occupy + * @compl: the value of the COMPL bit to use + * + * Generate a TX_PKT work request to send the supplied packet. + */ +static void write_tx_pkt_wr(struct adapter *adap, struct sk_buff *skb, + const struct port_info *pi, + unsigned int pidx, unsigned int gen, + struct sge_txq *q, unsigned int ndesc, + unsigned int compl) +{ + unsigned int flits, sgl_flits, cntrl, tso_info; + struct sg_ent *sgp, sgl[MAX_SKB_FRAGS / 2 + 1]; + struct tx_desc *d = &q->desc[pidx]; + struct cpl_tx_pkt *cpl = (struct cpl_tx_pkt *)d; + + cpl->len = htonl(skb->len); + cntrl = V_TXPKT_INTF(pi->port_id); + + if (vlan_tx_tag_present(skb)) + cntrl |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN(vlan_tx_tag_get(skb)); + + tso_info = V_LSO_MSS(skb_shinfo(skb)->gso_size); + if (tso_info) { + int eth_type; + struct cpl_tx_pkt_lso *hdr = (struct cpl_tx_pkt_lso *)cpl; + + d->flit[2] = 0; + cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT_LSO); + hdr->cntrl = htonl(cntrl); + eth_type = skb_network_offset(skb) == ETH_HLEN ? + CPL_ETH_II : CPL_ETH_II_VLAN; + tso_info |= V_LSO_ETH_TYPE(eth_type) | + V_LSO_IPHDR_WORDS(ip_hdr(skb)->ihl) | + V_LSO_TCPHDR_WORDS(tcp_hdr(skb)->doff); + hdr->lso_info = htonl(tso_info); + flits = 3; + } else { + cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT); + cntrl |= F_TXPKT_IPCSUM_DIS; /* SW calculates IP csum */ + cntrl |= V_TXPKT_L4CSUM_DIS(skb->ip_summed != CHECKSUM_PARTIAL); + cpl->cntrl = htonl(cntrl); + + if (skb->len <= WR_LEN - sizeof(*cpl)) { + q->sdesc[pidx].skb = NULL; + if (!skb->data_len) + skb_copy_from_linear_data(skb, &d->flit[2], + skb->len); + else + skb_copy_bits(skb, 0, &d->flit[2], skb->len); + + flits = (skb->len + 7) / 8 + 2; + cpl->wr.wr_hi = htonl(V_WR_BCNTLFLT(skb->len & 7) | + V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) + | F_WR_SOP | F_WR_EOP | compl); + wmb(); + cpl->wr.wr_lo = htonl(V_WR_LEN(flits) | V_WR_GEN(gen) | + V_WR_TID(q->token)); + wr_gen2(d, gen); + dev_consume_skb_any(skb); + return; + } + + flits = 2; + } + + sgp = ndesc == 1 ? (struct sg_ent *)&d->flit[flits] : sgl; + sgl_flits = make_sgl(skb, sgp, skb->data, skb_headlen(skb), adap->pdev); + + write_wr_hdr_sgl(ndesc, skb, d, pidx, q, sgl, flits, sgl_flits, gen, + htonl(V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) | compl), + htonl(V_WR_TID(q->token))); +} + +static inline void t3_stop_tx_queue(struct netdev_queue *txq, + struct sge_qset *qs, struct sge_txq *q) +{ + netif_tx_stop_queue(txq); + set_bit(TXQ_ETH, &qs->txq_stopped); + q->stops++; +} + +/** + * eth_xmit - add a packet to the Ethernet Tx queue + * @skb: the packet + * @dev: the egress net device + * + * Add a packet to an SGE Tx queue. Runs with softirqs disabled. + */ +netdev_tx_t t3_eth_xmit(struct sk_buff *skb, struct net_device *dev) +{ + int qidx; + unsigned int ndesc, pidx, credits, gen, compl; + const struct port_info *pi = netdev_priv(dev); + struct adapter *adap = pi->adapter; + struct netdev_queue *txq; + struct sge_qset *qs; + struct sge_txq *q; + + /* + * The chip min packet length is 9 octets but play safe and reject + * anything shorter than an Ethernet header. + */ + if (unlikely(skb->len < ETH_HLEN)) { + dev_kfree_skb_any(skb); + return NETDEV_TX_OK; + } + + qidx = skb_get_queue_mapping(skb); + qs = &pi->qs[qidx]; + q = &qs->txq[TXQ_ETH]; + txq = netdev_get_tx_queue(dev, qidx); + + reclaim_completed_tx(adap, q, TX_RECLAIM_CHUNK); + + credits = q->size - q->in_use; + ndesc = calc_tx_descs(skb); + + if (unlikely(credits < ndesc)) { + t3_stop_tx_queue(txq, qs, q); + dev_err(&adap->pdev->dev, + "%s: Tx ring %u full while queue awake!\n", + dev->name, q->cntxt_id & 7); + return NETDEV_TX_BUSY; + } + + q->in_use += ndesc; + if (unlikely(credits - ndesc < q->stop_thres)) { + t3_stop_tx_queue(txq, qs, q); + + if (should_restart_tx(q) && + test_and_clear_bit(TXQ_ETH, &qs->txq_stopped)) { + q->restarts++; + netif_tx_start_queue(txq); + } + } + + gen = q->gen; + q->unacked += ndesc; + compl = (q->unacked & 8) << (S_WR_COMPL - 3); + q->unacked &= 7; + pidx = q->pidx; + q->pidx += ndesc; + if (q->pidx >= q->size) { + q->pidx -= q->size; + q->gen ^= 1; + } + + /* update port statistics */ + if (skb->ip_summed == CHECKSUM_PARTIAL) + qs->port_stats[SGE_PSTAT_TX_CSUM]++; + if (skb_shinfo(skb)->gso_size) + qs->port_stats[SGE_PSTAT_TSO]++; + if (vlan_tx_tag_present(skb)) + qs->port_stats[SGE_PSTAT_VLANINS]++; + + /* + * We do not use Tx completion interrupts to free DMAd Tx packets. + * This is good for performance but means that we rely on new Tx + * packets arriving to run the destructors of completed packets, + * which open up space in their sockets' send queues. Sometimes + * we do not get such new packets causing Tx to stall. A single + * UDP transmitter is a good example of this situation. We have + * a clean up timer that periodically reclaims completed packets + * but it doesn't run often enough (nor do we want it to) to prevent + * lengthy stalls. A solution to this problem is to run the + * destructor early, after the packet is queued but before it's DMAd. + * A cons is that we lie to socket memory accounting, but the amount + * of extra memory is reasonable (limited by the number of Tx + * descriptors), the packets do actually get freed quickly by new + * packets almost always, and for protocols like TCP that wait for + * acks to really free up the data the extra memory is even less. + * On the positive side we run the destructors on the sending CPU + * rather than on a potentially different completing CPU, usually a + * good thing. We also run them without holding our Tx queue lock, + * unlike what reclaim_completed_tx() would otherwise do. + * + * Run the destructor before telling the DMA engine about the packet + * to make sure it doesn't complete and get freed prematurely. + */ + if (likely(!skb_shared(skb))) + skb_orphan(skb); + + write_tx_pkt_wr(adap, skb, pi, pidx, gen, q, ndesc, compl); + check_ring_tx_db(adap, q); + return NETDEV_TX_OK; +} + +/** + * write_imm - write a packet into a Tx descriptor as immediate data + * @d: the Tx descriptor to write + * @skb: the packet + * @len: the length of packet data to write as immediate data + * @gen: the generation bit value to write + * + * Writes a packet as immediate data into a Tx descriptor. The packet + * contains a work request at its beginning. We must write the packet + * carefully so the SGE doesn't read it accidentally before it's written + * in its entirety. + */ +static inline void write_imm(struct tx_desc *d, struct sk_buff *skb, + unsigned int len, unsigned int gen) +{ + struct work_request_hdr *from = (struct work_request_hdr *)skb->data; + struct work_request_hdr *to = (struct work_request_hdr *)d; + + if (likely(!skb->data_len)) + memcpy(&to[1], &from[1], len - sizeof(*from)); + else + skb_copy_bits(skb, sizeof(*from), &to[1], len - sizeof(*from)); + + to->wr_hi = from->wr_hi | htonl(F_WR_SOP | F_WR_EOP | + V_WR_BCNTLFLT(len & 7)); + wmb(); + to->wr_lo = from->wr_lo | htonl(V_WR_GEN(gen) | + V_WR_LEN((len + 7) / 8)); + wr_gen2(d, gen); + kfree_skb(skb); +} + +/** + * check_desc_avail - check descriptor availability on a send queue + * @adap: the adapter + * @q: the send queue + * @skb: the packet needing the descriptors + * @ndesc: the number of Tx descriptors needed + * @qid: the Tx queue number in its queue set (TXQ_OFLD or TXQ_CTRL) + * + * Checks if the requested number of Tx descriptors is available on an + * SGE send queue. If the queue is already suspended or not enough + * descriptors are available the packet is queued for later transmission. + * Must be called with the Tx queue locked. + * + * Returns 0 if enough descriptors are available, 1 if there aren't + * enough descriptors and the packet has been queued, and 2 if the caller + * needs to retry because there weren't enough descriptors at the + * beginning of the call but some freed up in the mean time. + */ +static inline int check_desc_avail(struct adapter *adap, struct sge_txq *q, + struct sk_buff *skb, unsigned int ndesc, + unsigned int qid) +{ + if (unlikely(!skb_queue_empty(&q->sendq))) { + addq_exit:__skb_queue_tail(&q->sendq, skb); + return 1; + } + if (unlikely(q->size - q->in_use < ndesc)) { + struct sge_qset *qs = txq_to_qset(q, qid); + + set_bit(qid, &qs->txq_stopped); + smp_mb__after_atomic(); + + if (should_restart_tx(q) && + test_and_clear_bit(qid, &qs->txq_stopped)) + return 2; + + q->stops++; + goto addq_exit; + } + return 0; +} + +/** + * reclaim_completed_tx_imm - reclaim completed control-queue Tx descs + * @q: the SGE control Tx queue + * + * This is a variant of reclaim_completed_tx() that is used for Tx queues + * that send only immediate data (presently just the control queues) and + * thus do not have any sk_buffs to release. + */ +static inline void reclaim_completed_tx_imm(struct sge_txq *q) +{ + unsigned int reclaim = q->processed - q->cleaned; + + q->in_use -= reclaim; + q->cleaned += reclaim; +} + +static inline int immediate(const struct sk_buff *skb) +{ + return skb->len <= WR_LEN; +} + +/** + * ctrl_xmit - send a packet through an SGE control Tx queue + * @adap: the adapter + * @q: the control queue + * @skb: the packet + * + * Send a packet through an SGE control Tx queue. Packets sent through + * a control queue must fit entirely as immediate data in a single Tx + * descriptor and have no page fragments. + */ +static int ctrl_xmit(struct adapter *adap, struct sge_txq *q, + struct sk_buff *skb) +{ + int ret; + struct work_request_hdr *wrp = (struct work_request_hdr *)skb->data; + + if (unlikely(!immediate(skb))) { + WARN_ON(1); + dev_kfree_skb(skb); + return NET_XMIT_SUCCESS; + } + + wrp->wr_hi |= htonl(F_WR_SOP | F_WR_EOP); + wrp->wr_lo = htonl(V_WR_TID(q->token)); + + spin_lock(&q->lock); + again:reclaim_completed_tx_imm(q); + + ret = check_desc_avail(adap, q, skb, 1, TXQ_CTRL); + if (unlikely(ret)) { + if (ret == 1) { + spin_unlock(&q->lock); + return NET_XMIT_CN; + } + goto again; + } + + write_imm(&q->desc[q->pidx], skb, skb->len, q->gen); + + q->in_use++; + if (++q->pidx >= q->size) { + q->pidx = 0; + q->gen ^= 1; + } + spin_unlock(&q->lock); + wmb(); + t3_write_reg(adap, A_SG_KDOORBELL, + F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id)); + return NET_XMIT_SUCCESS; +} + +/** + * restart_ctrlq - restart a suspended control queue + * @qs: the queue set cotaining the control queue + * + * Resumes transmission on a suspended Tx control queue. + */ +static void restart_ctrlq(unsigned long data) +{ + struct sk_buff *skb; + struct sge_qset *qs = (struct sge_qset *)data; + struct sge_txq *q = &qs->txq[TXQ_CTRL]; + + spin_lock(&q->lock); + again:reclaim_completed_tx_imm(q); + + while (q->in_use < q->size && + (skb = __skb_dequeue(&q->sendq)) != NULL) { + + write_imm(&q->desc[q->pidx], skb, skb->len, q->gen); + + if (++q->pidx >= q->size) { + q->pidx = 0; + q->gen ^= 1; + } + q->in_use++; + } + + if (!skb_queue_empty(&q->sendq)) { + set_bit(TXQ_CTRL, &qs->txq_stopped); + smp_mb__after_atomic(); + + if (should_restart_tx(q) && + test_and_clear_bit(TXQ_CTRL, &qs->txq_stopped)) + goto again; + q->stops++; + } + + spin_unlock(&q->lock); + wmb(); + t3_write_reg(qs->adap, A_SG_KDOORBELL, + F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id)); +} + +/* + * Send a management message through control queue 0 + */ +int t3_mgmt_tx(struct adapter *adap, struct sk_buff *skb) +{ + int ret; + local_bh_disable(); + ret = ctrl_xmit(adap, &adap->sge.qs[0].txq[TXQ_CTRL], skb); + local_bh_enable(); + + return ret; +} + +/** + * deferred_unmap_destructor - unmap a packet when it is freed + * @skb: the packet + * + * This is the packet destructor used for Tx packets that need to remain + * mapped until they are freed rather than until their Tx descriptors are + * freed. + */ +static void deferred_unmap_destructor(struct sk_buff *skb) +{ + int i; + const dma_addr_t *p; + const struct skb_shared_info *si; + const struct deferred_unmap_info *dui; + + dui = (struct deferred_unmap_info *)skb->head; + p = dui->addr; + + if (skb_tail_pointer(skb) - skb_transport_header(skb)) + pci_unmap_single(dui->pdev, *p++, skb_tail_pointer(skb) - + skb_transport_header(skb), PCI_DMA_TODEVICE); + + si = skb_shinfo(skb); + for (i = 0; i < si->nr_frags; i++) + pci_unmap_page(dui->pdev, *p++, skb_frag_size(&si->frags[i]), + PCI_DMA_TODEVICE); +} + +static void setup_deferred_unmapping(struct sk_buff *skb, struct pci_dev *pdev, + const struct sg_ent *sgl, int sgl_flits) +{ + dma_addr_t *p; + struct deferred_unmap_info *dui; + + dui = (struct deferred_unmap_info *)skb->head; + dui->pdev = pdev; + for (p = dui->addr; sgl_flits >= 3; sgl++, sgl_flits -= 3) { + *p++ = be64_to_cpu(sgl->addr[0]); + *p++ = be64_to_cpu(sgl->addr[1]); + } + if (sgl_flits) + *p = be64_to_cpu(sgl->addr[0]); +} + +/** + * write_ofld_wr - write an offload work request + * @adap: the adapter + * @skb: the packet to send + * @q: the Tx queue + * @pidx: index of the first Tx descriptor to write + * @gen: the generation value to use + * @ndesc: number of descriptors the packet will occupy + * + * Write an offload work request to send the supplied packet. The packet + * data already carry the work request with most fields populated. + */ +static void write_ofld_wr(struct adapter *adap, struct sk_buff *skb, + struct sge_txq *q, unsigned int pidx, + unsigned int gen, unsigned int ndesc) +{ + unsigned int sgl_flits, flits; + struct work_request_hdr *from; + struct sg_ent *sgp, sgl[MAX_SKB_FRAGS / 2 + 1]; + struct tx_desc *d = &q->desc[pidx]; + + if (immediate(skb)) { + q->sdesc[pidx].skb = NULL; + write_imm(d, skb, skb->len, gen); + return; + } + + /* Only TX_DATA builds SGLs */ + + from = (struct work_request_hdr *)skb->data; + memcpy(&d->flit[1], &from[1], + skb_transport_offset(skb) - sizeof(*from)); + + flits = skb_transport_offset(skb) / 8; + sgp = ndesc == 1 ? (struct sg_ent *)&d->flit[flits] : sgl; + sgl_flits = make_sgl(skb, sgp, skb_transport_header(skb), + skb_tail_pointer(skb) - + skb_transport_header(skb), + adap->pdev); + if (need_skb_unmap()) { + setup_deferred_unmapping(skb, adap->pdev, sgp, sgl_flits); + skb->destructor = deferred_unmap_destructor; + } + + write_wr_hdr_sgl(ndesc, skb, d, pidx, q, sgl, flits, sgl_flits, + gen, from->wr_hi, from->wr_lo); +} + +/** + * calc_tx_descs_ofld - calculate # of Tx descriptors for an offload packet + * @skb: the packet + * + * Returns the number of Tx descriptors needed for the given offload + * packet. These packets are already fully constructed. + */ +static inline unsigned int calc_tx_descs_ofld(const struct sk_buff *skb) +{ + unsigned int flits, cnt; + + if (skb->len <= WR_LEN) + return 1; /* packet fits as immediate data */ + + flits = skb_transport_offset(skb) / 8; /* headers */ + cnt = skb_shinfo(skb)->nr_frags; + if (skb_tail_pointer(skb) != skb_transport_header(skb)) + cnt++; + return flits_to_desc(flits + sgl_len(cnt)); +} + +/** + * ofld_xmit - send a packet through an offload queue + * @adap: the adapter + * @q: the Tx offload queue + * @skb: the packet + * + * Send an offload packet through an SGE offload queue. + */ +static int ofld_xmit(struct adapter *adap, struct sge_txq *q, + struct sk_buff *skb) +{ + int ret; + unsigned int ndesc = calc_tx_descs_ofld(skb), pidx, gen; + + spin_lock(&q->lock); +again: reclaim_completed_tx(adap, q, TX_RECLAIM_CHUNK); + + ret = check_desc_avail(adap, q, skb, ndesc, TXQ_OFLD); + if (unlikely(ret)) { + if (ret == 1) { + skb->priority = ndesc; /* save for restart */ + spin_unlock(&q->lock); + return NET_XMIT_CN; + } + goto again; + } + + gen = q->gen; + q->in_use += ndesc; + pidx = q->pidx; + q->pidx += ndesc; + if (q->pidx >= q->size) { + q->pidx -= q->size; + q->gen ^= 1; + } + spin_unlock(&q->lock); + + write_ofld_wr(adap, skb, q, pidx, gen, ndesc); + check_ring_tx_db(adap, q); + return NET_XMIT_SUCCESS; +} + +/** + * restart_offloadq - restart a suspended offload queue + * @qs: the queue set cotaining the offload queue + * + * Resumes transmission on a suspended Tx offload queue. + */ +static void restart_offloadq(unsigned long data) +{ + struct sk_buff *skb; + struct sge_qset *qs = (struct sge_qset *)data; + struct sge_txq *q = &qs->txq[TXQ_OFLD]; + const struct port_info *pi = netdev_priv(qs->netdev); + struct adapter *adap = pi->adapter; + + spin_lock(&q->lock); +again: reclaim_completed_tx(adap, q, TX_RECLAIM_CHUNK); + + while ((skb = skb_peek(&q->sendq)) != NULL) { + unsigned int gen, pidx; + unsigned int ndesc = skb->priority; + + if (unlikely(q->size - q->in_use < ndesc)) { + set_bit(TXQ_OFLD, &qs->txq_stopped); + smp_mb__after_atomic(); + + if (should_restart_tx(q) && + test_and_clear_bit(TXQ_OFLD, &qs->txq_stopped)) + goto again; + q->stops++; + break; + } + + gen = q->gen; + q->in_use += ndesc; + pidx = q->pidx; + q->pidx += ndesc; + if (q->pidx >= q->size) { + q->pidx -= q->size; + q->gen ^= 1; + } + __skb_unlink(skb, &q->sendq); + spin_unlock(&q->lock); + + write_ofld_wr(adap, skb, q, pidx, gen, ndesc); + spin_lock(&q->lock); + } + spin_unlock(&q->lock); + +#if USE_GTS + set_bit(TXQ_RUNNING, &q->flags); + set_bit(TXQ_LAST_PKT_DB, &q->flags); +#endif + wmb(); + t3_write_reg(adap, A_SG_KDOORBELL, + F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id)); +} + +/** + * queue_set - return the queue set a packet should use + * @skb: the packet + * + * Maps a packet to the SGE queue set it should use. The desired queue + * set is carried in bits 1-3 in the packet's priority. + */ +static inline int queue_set(const struct sk_buff *skb) +{ + return skb->priority >> 1; +} + +/** + * is_ctrl_pkt - return whether an offload packet is a control packet + * @skb: the packet + * + * Determines whether an offload packet should use an OFLD or a CTRL + * Tx queue. This is indicated by bit 0 in the packet's priority. + */ +static inline int is_ctrl_pkt(const struct sk_buff *skb) +{ + return skb->priority & 1; +} + +/** + * t3_offload_tx - send an offload packet + * @tdev: the offload device to send to + * @skb: the packet + * + * Sends an offload packet. We use the packet priority to select the + * appropriate Tx queue as follows: bit 0 indicates whether the packet + * should be sent as regular or control, bits 1-3 select the queue set. + */ +int t3_offload_tx(struct t3cdev *tdev, struct sk_buff *skb) +{ + struct adapter *adap = tdev2adap(tdev); + struct sge_qset *qs = &adap->sge.qs[queue_set(skb)]; + + if (unlikely(is_ctrl_pkt(skb))) + return ctrl_xmit(adap, &qs->txq[TXQ_CTRL], skb); + + return ofld_xmit(adap, &qs->txq[TXQ_OFLD], skb); +} + +/** + * offload_enqueue - add an offload packet to an SGE offload receive queue + * @q: the SGE response queue + * @skb: the packet + * + * Add a new offload packet to an SGE response queue's offload packet + * queue. If the packet is the first on the queue it schedules the RX + * softirq to process the queue. + */ +static inline void offload_enqueue(struct sge_rspq *q, struct sk_buff *skb) +{ + int was_empty = skb_queue_empty(&q->rx_queue); + + __skb_queue_tail(&q->rx_queue, skb); + + if (was_empty) { + struct sge_qset *qs = rspq_to_qset(q); + + napi_schedule(&qs->napi); + } +} + +/** + * deliver_partial_bundle - deliver a (partial) bundle of Rx offload pkts + * @tdev: the offload device that will be receiving the packets + * @q: the SGE response queue that assembled the bundle + * @skbs: the partial bundle + * @n: the number of packets in the bundle + * + * Delivers a (partial) bundle of Rx offload packets to an offload device. + */ +static inline void deliver_partial_bundle(struct t3cdev *tdev, + struct sge_rspq *q, + struct sk_buff *skbs[], int n) +{ + if (n) { + q->offload_bundles++; + tdev->recv(tdev, skbs, n); + } +} + +/** + * ofld_poll - NAPI handler for offload packets in interrupt mode + * @dev: the network device doing the polling + * @budget: polling budget + * + * The NAPI handler for offload packets when a response queue is serviced + * by the hard interrupt handler, i.e., when it's operating in non-polling + * mode. Creates small packet batches and sends them through the offload + * receive handler. Batches need to be of modest size as we do prefetches + * on the packets in each. + */ +static int ofld_poll(struct napi_struct *napi, int budget) +{ + struct sge_qset *qs = container_of(napi, struct sge_qset, napi); + struct sge_rspq *q = &qs->rspq; + struct adapter *adapter = qs->adap; + int work_done = 0; + + while (work_done < budget) { + struct sk_buff *skb, *tmp, *skbs[RX_BUNDLE_SIZE]; + struct sk_buff_head queue; + int ngathered; + + spin_lock_irq(&q->lock); + __skb_queue_head_init(&queue); + skb_queue_splice_init(&q->rx_queue, &queue); + if (skb_queue_empty(&queue)) { + napi_complete(napi); + spin_unlock_irq(&q->lock); + return work_done; + } + spin_unlock_irq(&q->lock); + + ngathered = 0; + skb_queue_walk_safe(&queue, skb, tmp) { + if (work_done >= budget) + break; + work_done++; + + __skb_unlink(skb, &queue); + prefetch(skb->data); + skbs[ngathered] = skb; + if (++ngathered == RX_BUNDLE_SIZE) { + q->offload_bundles++; + adapter->tdev.recv(&adapter->tdev, skbs, + ngathered); + ngathered = 0; + } + } + if (!skb_queue_empty(&queue)) { + /* splice remaining packets back onto Rx queue */ + spin_lock_irq(&q->lock); + skb_queue_splice(&queue, &q->rx_queue); + spin_unlock_irq(&q->lock); + } + deliver_partial_bundle(&adapter->tdev, q, skbs, ngathered); + } + + return work_done; +} + +/** + * rx_offload - process a received offload packet + * @tdev: the offload device receiving the packet + * @rq: the response queue that received the packet + * @skb: the packet + * @rx_gather: a gather list of packets if we are building a bundle + * @gather_idx: index of the next available slot in the bundle + * + * Process an ingress offload pakcet and add it to the offload ingress + * queue. Returns the index of the next available slot in the bundle. + */ +static inline int rx_offload(struct t3cdev *tdev, struct sge_rspq *rq, + struct sk_buff *skb, struct sk_buff *rx_gather[], + unsigned int gather_idx) +{ + skb_reset_mac_header(skb); + skb_reset_network_header(skb); + skb_reset_transport_header(skb); + + if (rq->polling) { + rx_gather[gather_idx++] = skb; + if (gather_idx == RX_BUNDLE_SIZE) { + tdev->recv(tdev, rx_gather, RX_BUNDLE_SIZE); + gather_idx = 0; + rq->offload_bundles++; + } + } else + offload_enqueue(rq, skb); + + return gather_idx; +} + +/** + * restart_tx - check whether to restart suspended Tx queues + * @qs: the queue set to resume + * + * Restarts suspended Tx queues of an SGE queue set if they have enough + * free resources to resume operation. + */ +static void restart_tx(struct sge_qset *qs) +{ + if (test_bit(TXQ_ETH, &qs->txq_stopped) && + should_restart_tx(&qs->txq[TXQ_ETH]) && + test_and_clear_bit(TXQ_ETH, &qs->txq_stopped)) { + qs->txq[TXQ_ETH].restarts++; + if (netif_running(qs->netdev)) + netif_tx_wake_queue(qs->tx_q); + } + + if (test_bit(TXQ_OFLD, &qs->txq_stopped) && + should_restart_tx(&qs->txq[TXQ_OFLD]) && + test_and_clear_bit(TXQ_OFLD, &qs->txq_stopped)) { + qs->txq[TXQ_OFLD].restarts++; + tasklet_schedule(&qs->txq[TXQ_OFLD].qresume_tsk); + } + if (test_bit(TXQ_CTRL, &qs->txq_stopped) && + should_restart_tx(&qs->txq[TXQ_CTRL]) && + test_and_clear_bit(TXQ_CTRL, &qs->txq_stopped)) { + qs->txq[TXQ_CTRL].restarts++; + tasklet_schedule(&qs->txq[TXQ_CTRL].qresume_tsk); + } +} + +/** + * cxgb3_arp_process - process an ARP request probing a private IP address + * @adapter: the adapter + * @skb: the skbuff containing the ARP request + * + * Check if the ARP request is probing the private IP address + * dedicated to iSCSI, generate an ARP reply if so. + */ +static void cxgb3_arp_process(struct port_info *pi, struct sk_buff *skb) +{ + struct net_device *dev = skb->dev; + struct arphdr *arp; + unsigned char *arp_ptr; + unsigned char *sha; + __be32 sip, tip; + + if (!dev) + return; + + skb_reset_network_header(skb); + arp = arp_hdr(skb); + + if (arp->ar_op != htons(ARPOP_REQUEST)) + return; + + arp_ptr = (unsigned char *)(arp + 1); + sha = arp_ptr; + arp_ptr += dev->addr_len; + memcpy(&sip, arp_ptr, sizeof(sip)); + arp_ptr += sizeof(sip); + arp_ptr += dev->addr_len; + memcpy(&tip, arp_ptr, sizeof(tip)); + + if (tip != pi->iscsi_ipv4addr) + return; + + arp_send(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip, sha, + pi->iscsic.mac_addr, sha); + +} + +static inline int is_arp(struct sk_buff *skb) +{ + return skb->protocol == htons(ETH_P_ARP); +} + +static void cxgb3_process_iscsi_prov_pack(struct port_info *pi, + struct sk_buff *skb) +{ + if (is_arp(skb)) { + cxgb3_arp_process(pi, skb); + return; + } + + if (pi->iscsic.recv) + pi->iscsic.recv(pi, skb); + +} + +/** + * rx_eth - process an ingress ethernet packet + * @adap: the adapter + * @rq: the response queue that received the packet + * @skb: the packet + * @pad: amount of padding at the start of the buffer + * + * Process an ingress ethernet pakcet and deliver it to the stack. + * The padding is 2 if the packet was delivered in an Rx buffer and 0 + * if it was immediate data in a response. + */ +static void rx_eth(struct adapter *adap, struct sge_rspq *rq, + struct sk_buff *skb, int pad, int lro) +{ + struct cpl_rx_pkt *p = (struct cpl_rx_pkt *)(skb->data + pad); + struct sge_qset *qs = rspq_to_qset(rq); + struct port_info *pi; + + skb_pull(skb, sizeof(*p) + pad); + skb->protocol = eth_type_trans(skb, adap->port[p->iff]); + pi = netdev_priv(skb->dev); + if ((skb->dev->features & NETIF_F_RXCSUM) && p->csum_valid && + p->csum == htons(0xffff) && !p->fragment) { + qs->port_stats[SGE_PSTAT_RX_CSUM_GOOD]++; + skb->ip_summed = CHECKSUM_UNNECESSARY; + } else + skb_checksum_none_assert(skb); + skb_record_rx_queue(skb, qs - &adap->sge.qs[pi->first_qset]); + + if (p->vlan_valid) { + qs->port_stats[SGE_PSTAT_VLANEX]++; + __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), ntohs(p->vlan)); + } + if (rq->polling) { + if (lro) + napi_gro_receive(&qs->napi, skb); + else { + if (unlikely(pi->iscsic.flags)) + cxgb3_process_iscsi_prov_pack(pi, skb); + netif_receive_skb(skb); + } + } else + netif_rx(skb); +} + +static inline int is_eth_tcp(u32 rss) +{ + return G_HASHTYPE(ntohl(rss)) == RSS_HASH_4_TUPLE; +} + +/** + * lro_add_page - add a page chunk to an LRO session + * @adap: the adapter + * @qs: the associated queue set + * @fl: the free list containing the page chunk to add + * @len: packet length + * @complete: Indicates the last fragment of a frame + * + * Add a received packet contained in a page chunk to an existing LRO + * session. + */ +static void lro_add_page(struct adapter *adap, struct sge_qset *qs, + struct sge_fl *fl, int len, int complete) +{ + struct rx_sw_desc *sd = &fl->sdesc[fl->cidx]; + struct port_info *pi = netdev_priv(qs->netdev); + struct sk_buff *skb = NULL; + struct cpl_rx_pkt *cpl; + struct skb_frag_struct *rx_frag; + int nr_frags; + int offset = 0; + + if (!qs->nomem) { + skb = napi_get_frags(&qs->napi); + qs->nomem = !skb; + } + + fl->credits--; + + pci_dma_sync_single_for_cpu(adap->pdev, + dma_unmap_addr(sd, dma_addr), + fl->buf_size - SGE_PG_RSVD, + PCI_DMA_FROMDEVICE); + + (*sd->pg_chunk.p_cnt)--; + if (!*sd->pg_chunk.p_cnt && sd->pg_chunk.page != fl->pg_chunk.page) + pci_unmap_page(adap->pdev, + sd->pg_chunk.mapping, + fl->alloc_size, + PCI_DMA_FROMDEVICE); + + if (!skb) { + put_page(sd->pg_chunk.page); + if (complete) + qs->nomem = 0; + return; + } + + rx_frag = skb_shinfo(skb)->frags; + nr_frags = skb_shinfo(skb)->nr_frags; + + if (!nr_frags) { + offset = 2 + sizeof(struct cpl_rx_pkt); + cpl = qs->lro_va = sd->pg_chunk.va + 2; + + if ((qs->netdev->features & NETIF_F_RXCSUM) && + cpl->csum_valid && cpl->csum == htons(0xffff)) { + skb->ip_summed = CHECKSUM_UNNECESSARY; + qs->port_stats[SGE_PSTAT_RX_CSUM_GOOD]++; + } else + skb->ip_summed = CHECKSUM_NONE; + } else + cpl = qs->lro_va; + + len -= offset; + + rx_frag += nr_frags; + __skb_frag_set_page(rx_frag, sd->pg_chunk.page); + rx_frag->page_offset = sd->pg_chunk.offset + offset; + skb_frag_size_set(rx_frag, len); + + skb->len += len; + skb->data_len += len; + skb->truesize += len; + skb_shinfo(skb)->nr_frags++; + + if (!complete) + return; + + skb_record_rx_queue(skb, qs - &adap->sge.qs[pi->first_qset]); + + if (cpl->vlan_valid) { + qs->port_stats[SGE_PSTAT_VLANEX]++; + __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), ntohs(cpl->vlan)); + } + napi_gro_frags(&qs->napi); +} + +/** + * handle_rsp_cntrl_info - handles control information in a response + * @qs: the queue set corresponding to the response + * @flags: the response control flags + * + * Handles the control information of an SGE response, such as GTS + * indications and completion credits for the queue set's Tx queues. + * HW coalesces credits, we don't do any extra SW coalescing. + */ +static inline void handle_rsp_cntrl_info(struct sge_qset *qs, u32 flags) +{ + unsigned int credits; + +#if USE_GTS + if (flags & F_RSPD_TXQ0_GTS) + clear_bit(TXQ_RUNNING, &qs->txq[TXQ_ETH].flags); +#endif + + credits = G_RSPD_TXQ0_CR(flags); + if (credits) + qs->txq[TXQ_ETH].processed += credits; + + credits = G_RSPD_TXQ2_CR(flags); + if (credits) + qs->txq[TXQ_CTRL].processed += credits; + +# if USE_GTS + if (flags & F_RSPD_TXQ1_GTS) + clear_bit(TXQ_RUNNING, &qs->txq[TXQ_OFLD].flags); +# endif + credits = G_RSPD_TXQ1_CR(flags); + if (credits) + qs->txq[TXQ_OFLD].processed += credits; +} + +/** + * check_ring_db - check if we need to ring any doorbells + * @adapter: the adapter + * @qs: the queue set whose Tx queues are to be examined + * @sleeping: indicates which Tx queue sent GTS + * + * Checks if some of a queue set's Tx queues need to ring their doorbells + * to resume transmission after idling while they still have unprocessed + * descriptors. + */ +static void check_ring_db(struct adapter *adap, struct sge_qset *qs, + unsigned int sleeping) +{ + if (sleeping & F_RSPD_TXQ0_GTS) { + struct sge_txq *txq = &qs->txq[TXQ_ETH]; + + if (txq->cleaned + txq->in_use != txq->processed && + !test_and_set_bit(TXQ_LAST_PKT_DB, &txq->flags)) { + set_bit(TXQ_RUNNING, &txq->flags); + t3_write_reg(adap, A_SG_KDOORBELL, F_SELEGRCNTX | + V_EGRCNTX(txq->cntxt_id)); + } + } + + if (sleeping & F_RSPD_TXQ1_GTS) { + struct sge_txq *txq = &qs->txq[TXQ_OFLD]; + + if (txq->cleaned + txq->in_use != txq->processed && + !test_and_set_bit(TXQ_LAST_PKT_DB, &txq->flags)) { + set_bit(TXQ_RUNNING, &txq->flags); + t3_write_reg(adap, A_SG_KDOORBELL, F_SELEGRCNTX | + V_EGRCNTX(txq->cntxt_id)); + } + } +} + +/** + * is_new_response - check if a response is newly written + * @r: the response descriptor + * @q: the response queue + * + * Returns true if a response descriptor contains a yet unprocessed + * response. + */ +static inline int is_new_response(const struct rsp_desc *r, + const struct sge_rspq *q) +{ + return (r->intr_gen & F_RSPD_GEN2) == q->gen; +} + +static inline void clear_rspq_bufstate(struct sge_rspq * const q) +{ + q->pg_skb = NULL; + q->rx_recycle_buf = 0; +} + +#define RSPD_GTS_MASK (F_RSPD_TXQ0_GTS | F_RSPD_TXQ1_GTS) +#define RSPD_CTRL_MASK (RSPD_GTS_MASK | \ + V_RSPD_TXQ0_CR(M_RSPD_TXQ0_CR) | \ + V_RSPD_TXQ1_CR(M_RSPD_TXQ1_CR) | \ + V_RSPD_TXQ2_CR(M_RSPD_TXQ2_CR)) + +/* How long to delay the next interrupt in case of memory shortage, in 0.1us. */ +#define NOMEM_INTR_DELAY 2500 + +/** + * process_responses - process responses from an SGE response queue + * @adap: the adapter + * @qs: the queue set to which the response queue belongs + * @budget: how many responses can be processed in this round + * + * Process responses from an SGE response queue up to the supplied budget. + * Responses include received packets as well as credits and other events + * for the queues that belong to the response queue's queue set. + * A negative budget is effectively unlimited. + * + * Additionally choose the interrupt holdoff time for the next interrupt + * on this queue. If the system is under memory shortage use a fairly + * long delay to help recovery. + */ +static int process_responses(struct adapter *adap, struct sge_qset *qs, + int budget) +{ + struct sge_rspq *q = &qs->rspq; + struct rsp_desc *r = &q->desc[q->cidx]; + int budget_left = budget; + unsigned int sleeping = 0; + struct sk_buff *offload_skbs[RX_BUNDLE_SIZE]; + int ngathered = 0; + + q->next_holdoff = q->holdoff_tmr; + + while (likely(budget_left && is_new_response(r, q))) { + int packet_complete, eth, ethpad = 2; + int lro = !!(qs->netdev->features & NETIF_F_GRO); + struct sk_buff *skb = NULL; + u32 len, flags; + __be32 rss_hi, rss_lo; + + rmb(); + eth = r->rss_hdr.opcode == CPL_RX_PKT; + rss_hi = *(const __be32 *)r; + rss_lo = r->rss_hdr.rss_hash_val; + flags = ntohl(r->flags); + + if (unlikely(flags & F_RSPD_ASYNC_NOTIF)) { + skb = alloc_skb(AN_PKT_SIZE, GFP_ATOMIC); + if (!skb) + goto no_mem; + + memcpy(__skb_put(skb, AN_PKT_SIZE), r, AN_PKT_SIZE); + skb->data[0] = CPL_ASYNC_NOTIF; + rss_hi = htonl(CPL_ASYNC_NOTIF << 24); + q->async_notif++; + } else if (flags & F_RSPD_IMM_DATA_VALID) { + skb = get_imm_packet(r); + if (unlikely(!skb)) { +no_mem: + q->next_holdoff = NOMEM_INTR_DELAY; + q->nomem++; + /* consume one credit since we tried */ + budget_left--; + break; + } + q->imm_data++; + ethpad = 0; + } else if ((len = ntohl(r->len_cq)) != 0) { + struct sge_fl *fl; + + lro &= eth && is_eth_tcp(rss_hi); + + fl = (len & F_RSPD_FLQ) ? &qs->fl[1] : &qs->fl[0]; + if (fl->use_pages) { + void *addr = fl->sdesc[fl->cidx].pg_chunk.va; + + prefetch(addr); +#if L1_CACHE_BYTES < 128 + prefetch(addr + L1_CACHE_BYTES); +#endif + __refill_fl(adap, fl); + if (lro > 0) { + lro_add_page(adap, qs, fl, + G_RSPD_LEN(len), + flags & F_RSPD_EOP); + goto next_fl; + } + + skb = get_packet_pg(adap, fl, q, + G_RSPD_LEN(len), + eth ? + SGE_RX_DROP_THRES : 0); + q->pg_skb = skb; + } else + skb = get_packet(adap, fl, G_RSPD_LEN(len), + eth ? SGE_RX_DROP_THRES : 0); + if (unlikely(!skb)) { + if (!eth) + goto no_mem; + q->rx_drops++; + } else if (unlikely(r->rss_hdr.opcode == CPL_TRACE_PKT)) + __skb_pull(skb, 2); +next_fl: + if (++fl->cidx == fl->size) + fl->cidx = 0; + } else + q->pure_rsps++; + + if (flags & RSPD_CTRL_MASK) { + sleeping |= flags & RSPD_GTS_MASK; + handle_rsp_cntrl_info(qs, flags); + } + + r++; + if (unlikely(++q->cidx == q->size)) { + q->cidx = 0; + q->gen ^= 1; + r = q->desc; + } + prefetch(r); + + if (++q->credits >= (q->size / 4)) { + refill_rspq(adap, q, q->credits); + q->credits = 0; + } + + packet_complete = flags & + (F_RSPD_EOP | F_RSPD_IMM_DATA_VALID | + F_RSPD_ASYNC_NOTIF); + + if (skb != NULL && packet_complete) { + if (eth) + rx_eth(adap, q, skb, ethpad, lro); + else { + q->offload_pkts++; + /* Preserve the RSS info in csum & priority */ + skb->csum = rss_hi; + skb->priority = rss_lo; + ngathered = rx_offload(&adap->tdev, q, skb, + offload_skbs, + ngathered); + } + + if (flags & F_RSPD_EOP) + clear_rspq_bufstate(q); + } + --budget_left; + } + + deliver_partial_bundle(&adap->tdev, q, offload_skbs, ngathered); + + if (sleeping) + check_ring_db(adap, qs, sleeping); + + smp_mb(); /* commit Tx queue .processed updates */ + if (unlikely(qs->txq_stopped != 0)) + restart_tx(qs); + + budget -= budget_left; + return budget; +} + +static inline int is_pure_response(const struct rsp_desc *r) +{ + __be32 n = r->flags & htonl(F_RSPD_ASYNC_NOTIF | F_RSPD_IMM_DATA_VALID); + + return (n | r->len_cq) == 0; +} + +/** + * napi_rx_handler - the NAPI handler for Rx processing + * @napi: the napi instance + * @budget: how many packets we can process in this round + * + * Handler for new data events when using NAPI. + */ +static int napi_rx_handler(struct napi_struct *napi, int budget) +{ + struct sge_qset *qs = container_of(napi, struct sge_qset, napi); + struct adapter *adap = qs->adap; + int work_done = process_responses(adap, qs, budget); + + if (likely(work_done < budget)) { + napi_complete(napi); + + /* + * Because we don't atomically flush the following + * write it is possible that in very rare cases it can + * reach the device in a way that races with a new + * response being written plus an error interrupt + * causing the NAPI interrupt handler below to return + * unhandled status to the OS. To protect against + * this would require flushing the write and doing + * both the write and the flush with interrupts off. + * Way too expensive and unjustifiable given the + * rarity of the race. + * + * The race cannot happen at all with MSI-X. + */ + t3_write_reg(adap, A_SG_GTS, V_RSPQ(qs->rspq.cntxt_id) | + V_NEWTIMER(qs->rspq.next_holdoff) | + V_NEWINDEX(qs->rspq.cidx)); + } + return work_done; +} + +/* + * Returns true if the device is already scheduled for polling. + */ +static inline int napi_is_scheduled(struct napi_struct *napi) +{ + return test_bit(NAPI_STATE_SCHED, &napi->state); +} + +/** + * process_pure_responses - process pure responses from a response queue + * @adap: the adapter + * @qs: the queue set owning the response queue + * @r: the first pure response to process + * + * A simpler version of process_responses() that handles only pure (i.e., + * non data-carrying) responses. Such respones are too light-weight to + * justify calling a softirq under NAPI, so we handle them specially in + * the interrupt handler. The function is called with a pointer to a + * response, which the caller must ensure is a valid pure response. + * + * Returns 1 if it encounters a valid data-carrying response, 0 otherwise. + */ +static int process_pure_responses(struct adapter *adap, struct sge_qset *qs, + struct rsp_desc *r) +{ + struct sge_rspq *q = &qs->rspq; + unsigned int sleeping = 0; + + do { + u32 flags = ntohl(r->flags); + + r++; + if (unlikely(++q->cidx == q->size)) { + q->cidx = 0; + q->gen ^= 1; + r = q->desc; + } + prefetch(r); + + if (flags & RSPD_CTRL_MASK) { + sleeping |= flags & RSPD_GTS_MASK; + handle_rsp_cntrl_info(qs, flags); + } + + q->pure_rsps++; + if (++q->credits >= (q->size / 4)) { + refill_rspq(adap, q, q->credits); + q->credits = 0; + } + if (!is_new_response(r, q)) + break; + rmb(); + } while (is_pure_response(r)); + + if (sleeping) + check_ring_db(adap, qs, sleeping); + + smp_mb(); /* commit Tx queue .processed updates */ + if (unlikely(qs->txq_stopped != 0)) + restart_tx(qs); + + return is_new_response(r, q); +} + +/** + * handle_responses - decide what to do with new responses in NAPI mode + * @adap: the adapter + * @q: the response queue + * + * This is used by the NAPI interrupt handlers to decide what to do with + * new SGE responses. If there are no new responses it returns -1. If + * there are new responses and they are pure (i.e., non-data carrying) + * it handles them straight in hard interrupt context as they are very + * cheap and don't deliver any packets. Finally, if there are any data + * signaling responses it schedules the NAPI handler. Returns 1 if it + * schedules NAPI, 0 if all new responses were pure. + * + * The caller must ascertain NAPI is not already running. + */ +static inline int handle_responses(struct adapter *adap, struct sge_rspq *q) +{ + struct sge_qset *qs = rspq_to_qset(q); + struct rsp_desc *r = &q->desc[q->cidx]; + + if (!is_new_response(r, q)) + return -1; + rmb(); + if (is_pure_response(r) && process_pure_responses(adap, qs, r) == 0) { + t3_write_reg(adap, A_SG_GTS, V_RSPQ(q->cntxt_id) | + V_NEWTIMER(q->holdoff_tmr) | V_NEWINDEX(q->cidx)); + return 0; + } + napi_schedule(&qs->napi); + return 1; +} + +/* + * The MSI-X interrupt handler for an SGE response queue for the non-NAPI case + * (i.e., response queue serviced in hard interrupt). + */ +static irqreturn_t t3_sge_intr_msix(int irq, void *cookie) +{ + struct sge_qset *qs = cookie; + struct adapter *adap = qs->adap; + struct sge_rspq *q = &qs->rspq; + + spin_lock(&q->lock); + if (process_responses(adap, qs, -1) == 0) + q->unhandled_irqs++; + t3_write_reg(adap, A_SG_GTS, V_RSPQ(q->cntxt_id) | + V_NEWTIMER(q->next_holdoff) | V_NEWINDEX(q->cidx)); + spin_unlock(&q->lock); + return IRQ_HANDLED; +} + +/* + * The MSI-X interrupt handler for an SGE response queue for the NAPI case + * (i.e., response queue serviced by NAPI polling). + */ +static irqreturn_t t3_sge_intr_msix_napi(int irq, void *cookie) +{ + struct sge_qset *qs = cookie; + struct sge_rspq *q = &qs->rspq; + + spin_lock(&q->lock); + + if (handle_responses(qs->adap, q) < 0) + q->unhandled_irqs++; + spin_unlock(&q->lock); + return IRQ_HANDLED; +} + +/* + * The non-NAPI MSI interrupt handler. This needs to handle data events from + * SGE response queues as well as error and other async events as they all use + * the same MSI vector. We use one SGE response queue per port in this mode + * and protect all response queues with queue 0's lock. + */ +static irqreturn_t t3_intr_msi(int irq, void *cookie) +{ + int new_packets = 0; + struct adapter *adap = cookie; + struct sge_rspq *q = &adap->sge.qs[0].rspq; + + spin_lock(&q->lock); + + if (process_responses(adap, &adap->sge.qs[0], -1)) { + t3_write_reg(adap, A_SG_GTS, V_RSPQ(q->cntxt_id) | + V_NEWTIMER(q->next_holdoff) | V_NEWINDEX(q->cidx)); + new_packets = 1; + } + + if (adap->params.nports == 2 && + process_responses(adap, &adap->sge.qs[1], -1)) { + struct sge_rspq *q1 = &adap->sge.qs[1].rspq; + + t3_write_reg(adap, A_SG_GTS, V_RSPQ(q1->cntxt_id) | + V_NEWTIMER(q1->next_holdoff) | + V_NEWINDEX(q1->cidx)); + new_packets = 1; + } + + if (!new_packets && t3_slow_intr_handler(adap) == 0) + q->unhandled_irqs++; + + spin_unlock(&q->lock); + return IRQ_HANDLED; +} + +static int rspq_check_napi(struct sge_qset *qs) +{ + struct sge_rspq *q = &qs->rspq; + + if (!napi_is_scheduled(&qs->napi) && + is_new_response(&q->desc[q->cidx], q)) { + napi_schedule(&qs->napi); + return 1; + } + return 0; +} + +/* + * The MSI interrupt handler for the NAPI case (i.e., response queues serviced + * by NAPI polling). Handles data events from SGE response queues as well as + * error and other async events as they all use the same MSI vector. We use + * one SGE response queue per port in this mode and protect all response + * queues with queue 0's lock. + */ +static irqreturn_t t3_intr_msi_napi(int irq, void *cookie) +{ + int new_packets; + struct adapter *adap = cookie; + struct sge_rspq *q = &adap->sge.qs[0].rspq; + + spin_lock(&q->lock); + + new_packets = rspq_check_napi(&adap->sge.qs[0]); + if (adap->params.nports == 2) + new_packets += rspq_check_napi(&adap->sge.qs[1]); + if (!new_packets && t3_slow_intr_handler(adap) == 0) + q->unhandled_irqs++; + + spin_unlock(&q->lock); + return IRQ_HANDLED; +} + +/* + * A helper function that processes responses and issues GTS. + */ +static inline int process_responses_gts(struct adapter *adap, + struct sge_rspq *rq) +{ + int work; + + work = process_responses(adap, rspq_to_qset(rq), -1); + t3_write_reg(adap, A_SG_GTS, V_RSPQ(rq->cntxt_id) | + V_NEWTIMER(rq->next_holdoff) | V_NEWINDEX(rq->cidx)); + return work; +} + +/* + * The legacy INTx interrupt handler. This needs to handle data events from + * SGE response queues as well as error and other async events as they all use + * the same interrupt pin. We use one SGE response queue per port in this mode + * and protect all response queues with queue 0's lock. + */ +static irqreturn_t t3_intr(int irq, void *cookie) +{ + int work_done, w0, w1; + struct adapter *adap = cookie; + struct sge_rspq *q0 = &adap->sge.qs[0].rspq; + struct sge_rspq *q1 = &adap->sge.qs[1].rspq; + + spin_lock(&q0->lock); + + w0 = is_new_response(&q0->desc[q0->cidx], q0); + w1 = adap->params.nports == 2 && + is_new_response(&q1->desc[q1->cidx], q1); + + if (likely(w0 | w1)) { + t3_write_reg(adap, A_PL_CLI, 0); + t3_read_reg(adap, A_PL_CLI); /* flush */ + + if (likely(w0)) + process_responses_gts(adap, q0); + + if (w1) + process_responses_gts(adap, q1); + + work_done = w0 | w1; + } else + work_done = t3_slow_intr_handler(adap); + + spin_unlock(&q0->lock); + return IRQ_RETVAL(work_done != 0); +} + +/* + * Interrupt handler for legacy INTx interrupts for T3B-based cards. + * Handles data events from SGE response queues as well as error and other + * async events as they all use the same interrupt pin. We use one SGE + * response queue per port in this mode and protect all response queues with + * queue 0's lock. + */ +static irqreturn_t t3b_intr(int irq, void *cookie) +{ + u32 map; + struct adapter *adap = cookie; + struct sge_rspq *q0 = &adap->sge.qs[0].rspq; + + t3_write_reg(adap, A_PL_CLI, 0); + map = t3_read_reg(adap, A_SG_DATA_INTR); + + if (unlikely(!map)) /* shared interrupt, most likely */ + return IRQ_NONE; + + spin_lock(&q0->lock); + + if (unlikely(map & F_ERRINTR)) + t3_slow_intr_handler(adap); + + if (likely(map & 1)) + process_responses_gts(adap, q0); + + if (map & 2) + process_responses_gts(adap, &adap->sge.qs[1].rspq); + + spin_unlock(&q0->lock); + return IRQ_HANDLED; +} + +/* + * NAPI interrupt handler for legacy INTx interrupts for T3B-based cards. + * Handles data events from SGE response queues as well as error and other + * async events as they all use the same interrupt pin. We use one SGE + * response queue per port in this mode and protect all response queues with + * queue 0's lock. + */ +static irqreturn_t t3b_intr_napi(int irq, void *cookie) +{ + u32 map; + struct adapter *adap = cookie; + struct sge_qset *qs0 = &adap->sge.qs[0]; + struct sge_rspq *q0 = &qs0->rspq; + + t3_write_reg(adap, A_PL_CLI, 0); + map = t3_read_reg(adap, A_SG_DATA_INTR); + + if (unlikely(!map)) /* shared interrupt, most likely */ + return IRQ_NONE; + + spin_lock(&q0->lock); + + if (unlikely(map & F_ERRINTR)) + t3_slow_intr_handler(adap); + + if (likely(map & 1)) + napi_schedule(&qs0->napi); + + if (map & 2) + napi_schedule(&adap->sge.qs[1].napi); + + spin_unlock(&q0->lock); + return IRQ_HANDLED; +} + +/** + * t3_intr_handler - select the top-level interrupt handler + * @adap: the adapter + * @polling: whether using NAPI to service response queues + * + * Selects the top-level interrupt handler based on the type of interrupts + * (MSI-X, MSI, or legacy) and whether NAPI will be used to service the + * response queues. + */ +irq_handler_t t3_intr_handler(struct adapter *adap, int polling) +{ + if (adap->flags & USING_MSIX) + return polling ? t3_sge_intr_msix_napi : t3_sge_intr_msix; + if (adap->flags & USING_MSI) + return polling ? t3_intr_msi_napi : t3_intr_msi; + if (adap->params.rev > 0) + return polling ? t3b_intr_napi : t3b_intr; + return t3_intr; +} + +#define SGE_PARERR (F_CPPARITYERROR | F_OCPARITYERROR | F_RCPARITYERROR | \ + F_IRPARITYERROR | V_ITPARITYERROR(M_ITPARITYERROR) | \ + V_FLPARITYERROR(M_FLPARITYERROR) | F_LODRBPARITYERROR | \ + F_HIDRBPARITYERROR | F_LORCQPARITYERROR | \ + F_HIRCQPARITYERROR) +#define SGE_FRAMINGERR (F_UC_REQ_FRAMINGERROR | F_R_REQ_FRAMINGERROR) +#define SGE_FATALERR (SGE_PARERR | SGE_FRAMINGERR | F_RSPQCREDITOVERFOW | \ + F_RSPQDISABLED) + +/** + * t3_sge_err_intr_handler - SGE async event interrupt handler + * @adapter: the adapter + * + * Interrupt handler for SGE asynchronous (non-data) events. + */ +void t3_sge_err_intr_handler(struct adapter *adapter) +{ + unsigned int v, status = t3_read_reg(adapter, A_SG_INT_CAUSE) & + ~F_FLEMPTY; + + if (status & SGE_PARERR) + CH_ALERT(adapter, "SGE parity error (0x%x)\n", + status & SGE_PARERR); + if (status & SGE_FRAMINGERR) + CH_ALERT(adapter, "SGE framing error (0x%x)\n", + status & SGE_FRAMINGERR); + + if (status & F_RSPQCREDITOVERFOW) + CH_ALERT(adapter, "SGE response queue credit overflow\n"); + + if (status & F_RSPQDISABLED) { + v = t3_read_reg(adapter, A_SG_RSPQ_FL_STATUS); + + CH_ALERT(adapter, + "packet delivered to disabled response queue " + "(0x%x)\n", (v >> S_RSPQ0DISABLED) & 0xff); + } + + if (status & (F_HIPIODRBDROPERR | F_LOPIODRBDROPERR)) + queue_work(cxgb3_wq, &adapter->db_drop_task); + + if (status & (F_HIPRIORITYDBFULL | F_LOPRIORITYDBFULL)) + queue_work(cxgb3_wq, &adapter->db_full_task); + + if (status & (F_HIPRIORITYDBEMPTY | F_LOPRIORITYDBEMPTY)) + queue_work(cxgb3_wq, &adapter->db_empty_task); + + t3_write_reg(adapter, A_SG_INT_CAUSE, status); + if (status & SGE_FATALERR) + t3_fatal_err(adapter); +} + +/** + * sge_timer_tx - perform periodic maintenance of an SGE qset + * @data: the SGE queue set to maintain + * + * Runs periodically from a timer to perform maintenance of an SGE queue + * set. It performs two tasks: + * + * Cleans up any completed Tx descriptors that may still be pending. + * Normal descriptor cleanup happens when new packets are added to a Tx + * queue so this timer is relatively infrequent and does any cleanup only + * if the Tx queue has not seen any new packets in a while. We make a + * best effort attempt to reclaim descriptors, in that we don't wait + * around if we cannot get a queue's lock (which most likely is because + * someone else is queueing new packets and so will also handle the clean + * up). Since control queues use immediate data exclusively we don't + * bother cleaning them up here. + * + */ +static void sge_timer_tx(unsigned long data) +{ + struct sge_qset *qs = (struct sge_qset *)data; + struct port_info *pi = netdev_priv(qs->netdev); + struct adapter *adap = pi->adapter; + unsigned int tbd[SGE_TXQ_PER_SET] = {0, 0}; + unsigned long next_period; + + if (__netif_tx_trylock(qs->tx_q)) { + tbd[TXQ_ETH] = reclaim_completed_tx(adap, &qs->txq[TXQ_ETH], + TX_RECLAIM_TIMER_CHUNK); + __netif_tx_unlock(qs->tx_q); + } + + if (spin_trylock(&qs->txq[TXQ_OFLD].lock)) { + tbd[TXQ_OFLD] = reclaim_completed_tx(adap, &qs->txq[TXQ_OFLD], + TX_RECLAIM_TIMER_CHUNK); + spin_unlock(&qs->txq[TXQ_OFLD].lock); + } + + next_period = TX_RECLAIM_PERIOD >> + (max(tbd[TXQ_ETH], tbd[TXQ_OFLD]) / + TX_RECLAIM_TIMER_CHUNK); + mod_timer(&qs->tx_reclaim_timer, jiffies + next_period); +} + +/** + * sge_timer_rx - perform periodic maintenance of an SGE qset + * @data: the SGE queue set to maintain + * + * a) Replenishes Rx queues that have run out due to memory shortage. + * Normally new Rx buffers are added when existing ones are consumed but + * when out of memory a queue can become empty. We try to add only a few + * buffers here, the queue will be replenished fully as these new buffers + * are used up if memory shortage has subsided. + * + * b) Return coalesced response queue credits in case a response queue is + * starved. + * + */ +static void sge_timer_rx(unsigned long data) +{ + spinlock_t *lock; + struct sge_qset *qs = (struct sge_qset *)data; + struct port_info *pi = netdev_priv(qs->netdev); + struct adapter *adap = pi->adapter; + u32 status; + + lock = adap->params.rev > 0 ? + &qs->rspq.lock : &adap->sge.qs[0].rspq.lock; + + if (!spin_trylock_irq(lock)) + goto out; + + if (napi_is_scheduled(&qs->napi)) + goto unlock; + + if (adap->params.rev < 4) { + status = t3_read_reg(adap, A_SG_RSPQ_FL_STATUS); + + if (status & (1 << qs->rspq.cntxt_id)) { + qs->rspq.starved++; + if (qs->rspq.credits) { + qs->rspq.credits--; + refill_rspq(adap, &qs->rspq, 1); + qs->rspq.restarted++; + t3_write_reg(adap, A_SG_RSPQ_FL_STATUS, + 1 << qs->rspq.cntxt_id); + } + } + } + + if (qs->fl[0].credits < qs->fl[0].size) + __refill_fl(adap, &qs->fl[0]); + if (qs->fl[1].credits < qs->fl[1].size) + __refill_fl(adap, &qs->fl[1]); + +unlock: + spin_unlock_irq(lock); +out: + mod_timer(&qs->rx_reclaim_timer, jiffies + RX_RECLAIM_PERIOD); +} + +/** + * t3_update_qset_coalesce - update coalescing settings for a queue set + * @qs: the SGE queue set + * @p: new queue set parameters + * + * Update the coalescing settings for an SGE queue set. Nothing is done + * if the queue set is not initialized yet. + */ +void t3_update_qset_coalesce(struct sge_qset *qs, const struct qset_params *p) +{ + qs->rspq.holdoff_tmr = max(p->coalesce_usecs * 10, 1U);/* can't be 0 */ + qs->rspq.polling = p->polling; + qs->napi.poll = p->polling ? napi_rx_handler : ofld_poll; +} + +/** + * t3_sge_alloc_qset - initialize an SGE queue set + * @adapter: the adapter + * @id: the queue set id + * @nports: how many Ethernet ports will be using this queue set + * @irq_vec_idx: the IRQ vector index for response queue interrupts + * @p: configuration parameters for this queue set + * @ntxq: number of Tx queues for the queue set + * @netdev: net device associated with this queue set + * @netdevq: net device TX queue associated with this queue set + * + * Allocate resources and initialize an SGE queue set. A queue set + * comprises a response queue, two Rx free-buffer queues, and up to 3 + * Tx queues. The Tx queues are assigned roles in the order Ethernet + * queue, offload queue, and control queue. + */ +int t3_sge_alloc_qset(struct adapter *adapter, unsigned int id, int nports, + int irq_vec_idx, const struct qset_params *p, + int ntxq, struct net_device *dev, + struct netdev_queue *netdevq) +{ + int i, avail, ret = -ENOMEM; + struct sge_qset *q = &adapter->sge.qs[id]; + + init_qset_cntxt(q, id); + setup_timer(&q->tx_reclaim_timer, sge_timer_tx, (unsigned long)q); + setup_timer(&q->rx_reclaim_timer, sge_timer_rx, (unsigned long)q); + + q->fl[0].desc = alloc_ring(adapter->pdev, p->fl_size, + sizeof(struct rx_desc), + sizeof(struct rx_sw_desc), + &q->fl[0].phys_addr, &q->fl[0].sdesc); + if (!q->fl[0].desc) + goto err; + + q->fl[1].desc = alloc_ring(adapter->pdev, p->jumbo_size, + sizeof(struct rx_desc), + sizeof(struct rx_sw_desc), + &q->fl[1].phys_addr, &q->fl[1].sdesc); + if (!q->fl[1].desc) + goto err; + + q->rspq.desc = alloc_ring(adapter->pdev, p->rspq_size, + sizeof(struct rsp_desc), 0, + &q->rspq.phys_addr, NULL); + if (!q->rspq.desc) + goto err; + + for (i = 0; i < ntxq; ++i) { + /* + * The control queue always uses immediate data so does not + * need to keep track of any sk_buffs. + */ + size_t sz = i == TXQ_CTRL ? 0 : sizeof(struct tx_sw_desc); + + q->txq[i].desc = alloc_ring(adapter->pdev, p->txq_size[i], + sizeof(struct tx_desc), sz, + &q->txq[i].phys_addr, + &q->txq[i].sdesc); + if (!q->txq[i].desc) + goto err; + + q->txq[i].gen = 1; + q->txq[i].size = p->txq_size[i]; + spin_lock_init(&q->txq[i].lock); + skb_queue_head_init(&q->txq[i].sendq); + } + + tasklet_init(&q->txq[TXQ_OFLD].qresume_tsk, restart_offloadq, + (unsigned long)q); + tasklet_init(&q->txq[TXQ_CTRL].qresume_tsk, restart_ctrlq, + (unsigned long)q); + + q->fl[0].gen = q->fl[1].gen = 1; + q->fl[0].size = p->fl_size; + q->fl[1].size = p->jumbo_size; + + q->rspq.gen = 1; + q->rspq.size = p->rspq_size; + spin_lock_init(&q->rspq.lock); + skb_queue_head_init(&q->rspq.rx_queue); + + q->txq[TXQ_ETH].stop_thres = nports * + flits_to_desc(sgl_len(MAX_SKB_FRAGS + 1) + 3); + +#if FL0_PG_CHUNK_SIZE > 0 + q->fl[0].buf_size = FL0_PG_CHUNK_SIZE; +#else + q->fl[0].buf_size = SGE_RX_SM_BUF_SIZE + sizeof(struct cpl_rx_data); +#endif +#if FL1_PG_CHUNK_SIZE > 0 + q->fl[1].buf_size = FL1_PG_CHUNK_SIZE; +#else + q->fl[1].buf_size = is_offload(adapter) ? + (16 * 1024) - SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) : + MAX_FRAME_SIZE + 2 + sizeof(struct cpl_rx_pkt); +#endif + + q->fl[0].use_pages = FL0_PG_CHUNK_SIZE > 0; + q->fl[1].use_pages = FL1_PG_CHUNK_SIZE > 0; + q->fl[0].order = FL0_PG_ORDER; + q->fl[1].order = FL1_PG_ORDER; + q->fl[0].alloc_size = FL0_PG_ALLOC_SIZE; + q->fl[1].alloc_size = FL1_PG_ALLOC_SIZE; + + spin_lock_irq(&adapter->sge.reg_lock); + + /* FL threshold comparison uses < */ + ret = t3_sge_init_rspcntxt(adapter, q->rspq.cntxt_id, irq_vec_idx, + q->rspq.phys_addr, q->rspq.size, + q->fl[0].buf_size - SGE_PG_RSVD, 1, 0); + if (ret) + goto err_unlock; + + for (i = 0; i < SGE_RXQ_PER_SET; ++i) { + ret = t3_sge_init_flcntxt(adapter, q->fl[i].cntxt_id, 0, + q->fl[i].phys_addr, q->fl[i].size, + q->fl[i].buf_size - SGE_PG_RSVD, + p->cong_thres, 1, 0); + if (ret) + goto err_unlock; + } + + ret = t3_sge_init_ecntxt(adapter, q->txq[TXQ_ETH].cntxt_id, USE_GTS, + SGE_CNTXT_ETH, id, q->txq[TXQ_ETH].phys_addr, + q->txq[TXQ_ETH].size, q->txq[TXQ_ETH].token, + 1, 0); + if (ret) + goto err_unlock; + + if (ntxq > 1) { + ret = t3_sge_init_ecntxt(adapter, q->txq[TXQ_OFLD].cntxt_id, + USE_GTS, SGE_CNTXT_OFLD, id, + q->txq[TXQ_OFLD].phys_addr, + q->txq[TXQ_OFLD].size, 0, 1, 0); + if (ret) + goto err_unlock; + } + + if (ntxq > 2) { + ret = t3_sge_init_ecntxt(adapter, q->txq[TXQ_CTRL].cntxt_id, 0, + SGE_CNTXT_CTRL, id, + q->txq[TXQ_CTRL].phys_addr, + q->txq[TXQ_CTRL].size, + q->txq[TXQ_CTRL].token, 1, 0); + if (ret) + goto err_unlock; + } + + spin_unlock_irq(&adapter->sge.reg_lock); + + q->adap = adapter; + q->netdev = dev; + q->tx_q = netdevq; + t3_update_qset_coalesce(q, p); + + avail = refill_fl(adapter, &q->fl[0], q->fl[0].size, + GFP_KERNEL | __GFP_COMP); + if (!avail) { + CH_ALERT(adapter, "free list queue 0 initialization failed\n"); + goto err; + } + if (avail < q->fl[0].size) + CH_WARN(adapter, "free list queue 0 enabled with %d credits\n", + avail); + + avail = refill_fl(adapter, &q->fl[1], q->fl[1].size, + GFP_KERNEL | __GFP_COMP); + if (avail < q->fl[1].size) + CH_WARN(adapter, "free list queue 1 enabled with %d credits\n", + avail); + refill_rspq(adapter, &q->rspq, q->rspq.size - 1); + + t3_write_reg(adapter, A_SG_GTS, V_RSPQ(q->rspq.cntxt_id) | + V_NEWTIMER(q->rspq.holdoff_tmr)); + + return 0; + +err_unlock: + spin_unlock_irq(&adapter->sge.reg_lock); +err: + t3_free_qset(adapter, q); + return ret; +} + +/** + * t3_start_sge_timers - start SGE timer call backs + * @adap: the adapter + * + * Starts each SGE queue set's timer call back + */ +void t3_start_sge_timers(struct adapter *adap) +{ + int i; + + for (i = 0; i < SGE_QSETS; ++i) { + struct sge_qset *q = &adap->sge.qs[i]; + + if (q->tx_reclaim_timer.function) + mod_timer(&q->tx_reclaim_timer, jiffies + TX_RECLAIM_PERIOD); + + if (q->rx_reclaim_timer.function) + mod_timer(&q->rx_reclaim_timer, jiffies + RX_RECLAIM_PERIOD); + } +} + +/** + * t3_stop_sge_timers - stop SGE timer call backs + * @adap: the adapter + * + * Stops each SGE queue set's timer call back + */ +void t3_stop_sge_timers(struct adapter *adap) +{ + int i; + + for (i = 0; i < SGE_QSETS; ++i) { + struct sge_qset *q = &adap->sge.qs[i]; + + if (q->tx_reclaim_timer.function) + del_timer_sync(&q->tx_reclaim_timer); + if (q->rx_reclaim_timer.function) + del_timer_sync(&q->rx_reclaim_timer); + } +} + +/** + * t3_free_sge_resources - free SGE resources + * @adap: the adapter + * + * Frees resources used by the SGE queue sets. + */ +void t3_free_sge_resources(struct adapter *adap) +{ + int i; + + for (i = 0; i < SGE_QSETS; ++i) + t3_free_qset(adap, &adap->sge.qs[i]); +} + +/** + * t3_sge_start - enable SGE + * @adap: the adapter + * + * Enables the SGE for DMAs. This is the last step in starting packet + * transfers. + */ +void t3_sge_start(struct adapter *adap) +{ + t3_set_reg_field(adap, A_SG_CONTROL, F_GLOBALENABLE, F_GLOBALENABLE); +} + +/** + * t3_sge_stop - disable SGE operation + * @adap: the adapter + * + * Disables the DMA engine. This can be called in emeregencies (e.g., + * from error interrupts) or from normal process context. In the latter + * case it also disables any pending queue restart tasklets. Note that + * if it is called in interrupt context it cannot disable the restart + * tasklets as it cannot wait, however the tasklets will have no effect + * since the doorbells are disabled and the driver will call this again + * later from process context, at which time the tasklets will be stopped + * if they are still running. + */ +void t3_sge_stop(struct adapter *adap) +{ + t3_set_reg_field(adap, A_SG_CONTROL, F_GLOBALENABLE, 0); + if (!in_interrupt()) { + int i; + + for (i = 0; i < SGE_QSETS; ++i) { + struct sge_qset *qs = &adap->sge.qs[i]; + + tasklet_kill(&qs->txq[TXQ_OFLD].qresume_tsk); + tasklet_kill(&qs->txq[TXQ_CTRL].qresume_tsk); + } + } +} + +/** + * t3_sge_init - initialize SGE + * @adap: the adapter + * @p: the SGE parameters + * + * Performs SGE initialization needed every time after a chip reset. + * We do not initialize any of the queue sets here, instead the driver + * top-level must request those individually. We also do not enable DMA + * here, that should be done after the queues have been set up. + */ +void t3_sge_init(struct adapter *adap, struct sge_params *p) +{ + unsigned int ctrl, ups = ffs(pci_resource_len(adap->pdev, 2) >> 12); + + ctrl = F_DROPPKT | V_PKTSHIFT(2) | F_FLMODE | F_AVOIDCQOVFL | + F_CQCRDTCTRL | F_CONGMODE | F_TNLFLMODE | F_FATLPERREN | + V_HOSTPAGESIZE(PAGE_SHIFT - 11) | F_BIGENDIANINGRESS | + V_USERSPACESIZE(ups ? ups - 1 : 0) | F_ISCSICOALESCING; +#if SGE_NUM_GENBITS == 1 + ctrl |= F_EGRGENCTRL; +#endif + if (adap->params.rev > 0) { + if (!(adap->flags & (USING_MSIX | USING_MSI))) + ctrl |= F_ONEINTMULTQ | F_OPTONEINTMULTQ; + } + t3_write_reg(adap, A_SG_CONTROL, ctrl); + t3_write_reg(adap, A_SG_EGR_RCQ_DRB_THRSH, V_HIRCQDRBTHRSH(512) | + V_LORCQDRBTHRSH(512)); + t3_write_reg(adap, A_SG_TIMER_TICK, core_ticks_per_usec(adap) / 10); + t3_write_reg(adap, A_SG_CMDQ_CREDIT_TH, V_THRESHOLD(32) | + V_TIMEOUT(200 * core_ticks_per_usec(adap))); + t3_write_reg(adap, A_SG_HI_DRB_HI_THRSH, + adap->params.rev < T3_REV_C ? 1000 : 500); + t3_write_reg(adap, A_SG_HI_DRB_LO_THRSH, 256); + t3_write_reg(adap, A_SG_LO_DRB_HI_THRSH, 1000); + t3_write_reg(adap, A_SG_LO_DRB_LO_THRSH, 256); + t3_write_reg(adap, A_SG_OCO_BASE, V_BASE1(0xfff)); + t3_write_reg(adap, A_SG_DRB_PRI_THRESH, 63 * 1024); +} + +/** + * t3_sge_prep - one-time SGE initialization + * @adap: the associated adapter + * @p: SGE parameters + * + * Performs one-time initialization of SGE SW state. Includes determining + * defaults for the assorted SGE parameters, which admins can change until + * they are used to initialize the SGE. + */ +void t3_sge_prep(struct adapter *adap, struct sge_params *p) +{ + int i; + + p->max_pkt_size = (16 * 1024) - sizeof(struct cpl_rx_data) - + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + + for (i = 0; i < SGE_QSETS; ++i) { + struct qset_params *q = p->qset + i; + + q->polling = adap->params.rev > 0; + q->coalesce_usecs = 5; + q->rspq_size = 1024; + q->fl_size = 1024; + q->jumbo_size = 512; + q->txq_size[TXQ_ETH] = 1024; + q->txq_size[TXQ_OFLD] = 1024; + q->txq_size[TXQ_CTRL] = 256; + q->cong_thres = 0; + } + + spin_lock_init(&adap->sge.reg_lock); +} diff --git a/drivers/net/ethernet/chelsio/cxgb3/sge_defs.h b/drivers/net/ethernet/chelsio/cxgb3/sge_defs.h new file mode 100644 index 0000000..29b6c80 --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb3/sge_defs.h @@ -0,0 +1,255 @@ +/* + * This file is automatically generated --- any changes will be lost. + */ + +#ifndef _SGE_DEFS_H +#define _SGE_DEFS_H + +#define S_EC_CREDITS 0 +#define M_EC_CREDITS 0x7FFF +#define V_EC_CREDITS(x) ((x) << S_EC_CREDITS) +#define G_EC_CREDITS(x) (((x) >> S_EC_CREDITS) & M_EC_CREDITS) + +#define S_EC_GTS 15 +#define V_EC_GTS(x) ((x) << S_EC_GTS) +#define F_EC_GTS V_EC_GTS(1U) + +#define S_EC_INDEX 16 +#define M_EC_INDEX 0xFFFF +#define V_EC_INDEX(x) ((x) << S_EC_INDEX) +#define G_EC_INDEX(x) (((x) >> S_EC_INDEX) & M_EC_INDEX) + +#define S_EC_SIZE 0 +#define M_EC_SIZE 0xFFFF +#define V_EC_SIZE(x) ((x) << S_EC_SIZE) +#define G_EC_SIZE(x) (((x) >> S_EC_SIZE) & M_EC_SIZE) + +#define S_EC_BASE_LO 16 +#define M_EC_BASE_LO 0xFFFF +#define V_EC_BASE_LO(x) ((x) << S_EC_BASE_LO) +#define G_EC_BASE_LO(x) (((x) >> S_EC_BASE_LO) & M_EC_BASE_LO) + +#define S_EC_BASE_HI 0 +#define M_EC_BASE_HI 0xF +#define V_EC_BASE_HI(x) ((x) << S_EC_BASE_HI) +#define G_EC_BASE_HI(x) (((x) >> S_EC_BASE_HI) & M_EC_BASE_HI) + +#define S_EC_RESPQ 4 +#define M_EC_RESPQ 0x7 +#define V_EC_RESPQ(x) ((x) << S_EC_RESPQ) +#define G_EC_RESPQ(x) (((x) >> S_EC_RESPQ) & M_EC_RESPQ) + +#define S_EC_TYPE 7 +#define M_EC_TYPE 0x7 +#define V_EC_TYPE(x) ((x) << S_EC_TYPE) +#define G_EC_TYPE(x) (((x) >> S_EC_TYPE) & M_EC_TYPE) + +#define S_EC_GEN 10 +#define V_EC_GEN(x) ((x) << S_EC_GEN) +#define F_EC_GEN V_EC_GEN(1U) + +#define S_EC_UP_TOKEN 11 +#define M_EC_UP_TOKEN 0xFFFFF +#define V_EC_UP_TOKEN(x) ((x) << S_EC_UP_TOKEN) +#define G_EC_UP_TOKEN(x) (((x) >> S_EC_UP_TOKEN) & M_EC_UP_TOKEN) + +#define S_EC_VALID 31 +#define V_EC_VALID(x) ((x) << S_EC_VALID) +#define F_EC_VALID V_EC_VALID(1U) + +#define S_RQ_MSI_VEC 20 +#define M_RQ_MSI_VEC 0x3F +#define V_RQ_MSI_VEC(x) ((x) << S_RQ_MSI_VEC) +#define G_RQ_MSI_VEC(x) (((x) >> S_RQ_MSI_VEC) & M_RQ_MSI_VEC) + +#define S_RQ_INTR_EN 26 +#define V_RQ_INTR_EN(x) ((x) << S_RQ_INTR_EN) +#define F_RQ_INTR_EN V_RQ_INTR_EN(1U) + +#define S_RQ_GEN 28 +#define V_RQ_GEN(x) ((x) << S_RQ_GEN) +#define F_RQ_GEN V_RQ_GEN(1U) + +#define S_CQ_INDEX 0 +#define M_CQ_INDEX 0xFFFF +#define V_CQ_INDEX(x) ((x) << S_CQ_INDEX) +#define G_CQ_INDEX(x) (((x) >> S_CQ_INDEX) & M_CQ_INDEX) + +#define S_CQ_SIZE 16 +#define M_CQ_SIZE 0xFFFF +#define V_CQ_SIZE(x) ((x) << S_CQ_SIZE) +#define G_CQ_SIZE(x) (((x) >> S_CQ_SIZE) & M_CQ_SIZE) + +#define S_CQ_BASE_HI 0 +#define M_CQ_BASE_HI 0xFFFFF +#define V_CQ_BASE_HI(x) ((x) << S_CQ_BASE_HI) +#define G_CQ_BASE_HI(x) (((x) >> S_CQ_BASE_HI) & M_CQ_BASE_HI) + +#define S_CQ_RSPQ 20 +#define M_CQ_RSPQ 0x3F +#define V_CQ_RSPQ(x) ((x) << S_CQ_RSPQ) +#define G_CQ_RSPQ(x) (((x) >> S_CQ_RSPQ) & M_CQ_RSPQ) + +#define S_CQ_ASYNC_NOTIF 26 +#define V_CQ_ASYNC_NOTIF(x) ((x) << S_CQ_ASYNC_NOTIF) +#define F_CQ_ASYNC_NOTIF V_CQ_ASYNC_NOTIF(1U) + +#define S_CQ_ARMED 27 +#define V_CQ_ARMED(x) ((x) << S_CQ_ARMED) +#define F_CQ_ARMED V_CQ_ARMED(1U) + +#define S_CQ_ASYNC_NOTIF_SOL 28 +#define V_CQ_ASYNC_NOTIF_SOL(x) ((x) << S_CQ_ASYNC_NOTIF_SOL) +#define F_CQ_ASYNC_NOTIF_SOL V_CQ_ASYNC_NOTIF_SOL(1U) + +#define S_CQ_GEN 29 +#define V_CQ_GEN(x) ((x) << S_CQ_GEN) +#define F_CQ_GEN V_CQ_GEN(1U) + +#define S_CQ_ERR 30 +#define V_CQ_ERR(x) ((x) << S_CQ_ERR) +#define F_CQ_ERR V_CQ_ERR(1U) + +#define S_CQ_OVERFLOW_MODE 31 +#define V_CQ_OVERFLOW_MODE(x) ((x) << S_CQ_OVERFLOW_MODE) +#define F_CQ_OVERFLOW_MODE V_CQ_OVERFLOW_MODE(1U) + +#define S_CQ_CREDITS 0 +#define M_CQ_CREDITS 0xFFFF +#define V_CQ_CREDITS(x) ((x) << S_CQ_CREDITS) +#define G_CQ_CREDITS(x) (((x) >> S_CQ_CREDITS) & M_CQ_CREDITS) + +#define S_CQ_CREDIT_THRES 16 +#define M_CQ_CREDIT_THRES 0x1FFF +#define V_CQ_CREDIT_THRES(x) ((x) << S_CQ_CREDIT_THRES) +#define G_CQ_CREDIT_THRES(x) (((x) >> S_CQ_CREDIT_THRES) & M_CQ_CREDIT_THRES) + +#define S_FL_BASE_HI 0 +#define M_FL_BASE_HI 0xFFFFF +#define V_FL_BASE_HI(x) ((x) << S_FL_BASE_HI) +#define G_FL_BASE_HI(x) (((x) >> S_FL_BASE_HI) & M_FL_BASE_HI) + +#define S_FL_INDEX_LO 20 +#define M_FL_INDEX_LO 0xFFF +#define V_FL_INDEX_LO(x) ((x) << S_FL_INDEX_LO) +#define G_FL_INDEX_LO(x) (((x) >> S_FL_INDEX_LO) & M_FL_INDEX_LO) + +#define S_FL_INDEX_HI 0 +#define M_FL_INDEX_HI 0xF +#define V_FL_INDEX_HI(x) ((x) << S_FL_INDEX_HI) +#define G_FL_INDEX_HI(x) (((x) >> S_FL_INDEX_HI) & M_FL_INDEX_HI) + +#define S_FL_SIZE 4 +#define M_FL_SIZE 0xFFFF +#define V_FL_SIZE(x) ((x) << S_FL_SIZE) +#define G_FL_SIZE(x) (((x) >> S_FL_SIZE) & M_FL_SIZE) + +#define S_FL_GEN 20 +#define V_FL_GEN(x) ((x) << S_FL_GEN) +#define F_FL_GEN V_FL_GEN(1U) + +#define S_FL_ENTRY_SIZE_LO 21 +#define M_FL_ENTRY_SIZE_LO 0x7FF +#define V_FL_ENTRY_SIZE_LO(x) ((x) << S_FL_ENTRY_SIZE_LO) +#define G_FL_ENTRY_SIZE_LO(x) (((x) >> S_FL_ENTRY_SIZE_LO) & M_FL_ENTRY_SIZE_LO) + +#define S_FL_ENTRY_SIZE_HI 0 +#define M_FL_ENTRY_SIZE_HI 0x1FFFFF +#define V_FL_ENTRY_SIZE_HI(x) ((x) << S_FL_ENTRY_SIZE_HI) +#define G_FL_ENTRY_SIZE_HI(x) (((x) >> S_FL_ENTRY_SIZE_HI) & M_FL_ENTRY_SIZE_HI) + +#define S_FL_CONG_THRES 21 +#define M_FL_CONG_THRES 0x3FF +#define V_FL_CONG_THRES(x) ((x) << S_FL_CONG_THRES) +#define G_FL_CONG_THRES(x) (((x) >> S_FL_CONG_THRES) & M_FL_CONG_THRES) + +#define S_FL_GTS 31 +#define V_FL_GTS(x) ((x) << S_FL_GTS) +#define F_FL_GTS V_FL_GTS(1U) + +#define S_FLD_GEN1 31 +#define V_FLD_GEN1(x) ((x) << S_FLD_GEN1) +#define F_FLD_GEN1 V_FLD_GEN1(1U) + +#define S_FLD_GEN2 0 +#define V_FLD_GEN2(x) ((x) << S_FLD_GEN2) +#define F_FLD_GEN2 V_FLD_GEN2(1U) + +#define S_RSPD_TXQ1_CR 0 +#define M_RSPD_TXQ1_CR 0x7F +#define V_RSPD_TXQ1_CR(x) ((x) << S_RSPD_TXQ1_CR) +#define G_RSPD_TXQ1_CR(x) (((x) >> S_RSPD_TXQ1_CR) & M_RSPD_TXQ1_CR) + +#define S_RSPD_TXQ1_GTS 7 +#define V_RSPD_TXQ1_GTS(x) ((x) << S_RSPD_TXQ1_GTS) +#define F_RSPD_TXQ1_GTS V_RSPD_TXQ1_GTS(1U) + +#define S_RSPD_TXQ2_CR 8 +#define M_RSPD_TXQ2_CR 0x7F +#define V_RSPD_TXQ2_CR(x) ((x) << S_RSPD_TXQ2_CR) +#define G_RSPD_TXQ2_CR(x) (((x) >> S_RSPD_TXQ2_CR) & M_RSPD_TXQ2_CR) + +#define S_RSPD_TXQ2_GTS 15 +#define V_RSPD_TXQ2_GTS(x) ((x) << S_RSPD_TXQ2_GTS) +#define F_RSPD_TXQ2_GTS V_RSPD_TXQ2_GTS(1U) + +#define S_RSPD_TXQ0_CR 16 +#define M_RSPD_TXQ0_CR 0x7F +#define V_RSPD_TXQ0_CR(x) ((x) << S_RSPD_TXQ0_CR) +#define G_RSPD_TXQ0_CR(x) (((x) >> S_RSPD_TXQ0_CR) & M_RSPD_TXQ0_CR) + +#define S_RSPD_TXQ0_GTS 23 +#define V_RSPD_TXQ0_GTS(x) ((x) << S_RSPD_TXQ0_GTS) +#define F_RSPD_TXQ0_GTS V_RSPD_TXQ0_GTS(1U) + +#define S_RSPD_EOP 24 +#define V_RSPD_EOP(x) ((x) << S_RSPD_EOP) +#define F_RSPD_EOP V_RSPD_EOP(1U) + +#define S_RSPD_SOP 25 +#define V_RSPD_SOP(x) ((x) << S_RSPD_SOP) +#define F_RSPD_SOP V_RSPD_SOP(1U) + +#define S_RSPD_ASYNC_NOTIF 26 +#define V_RSPD_ASYNC_NOTIF(x) ((x) << S_RSPD_ASYNC_NOTIF) +#define F_RSPD_ASYNC_NOTIF V_RSPD_ASYNC_NOTIF(1U) + +#define S_RSPD_FL0_GTS 27 +#define V_RSPD_FL0_GTS(x) ((x) << S_RSPD_FL0_GTS) +#define F_RSPD_FL0_GTS V_RSPD_FL0_GTS(1U) + +#define S_RSPD_FL1_GTS 28 +#define V_RSPD_FL1_GTS(x) ((x) << S_RSPD_FL1_GTS) +#define F_RSPD_FL1_GTS V_RSPD_FL1_GTS(1U) + +#define S_RSPD_IMM_DATA_VALID 29 +#define V_RSPD_IMM_DATA_VALID(x) ((x) << S_RSPD_IMM_DATA_VALID) +#define F_RSPD_IMM_DATA_VALID V_RSPD_IMM_DATA_VALID(1U) + +#define S_RSPD_OFFLOAD 30 +#define V_RSPD_OFFLOAD(x) ((x) << S_RSPD_OFFLOAD) +#define F_RSPD_OFFLOAD V_RSPD_OFFLOAD(1U) + +#define S_RSPD_GEN1 31 +#define V_RSPD_GEN1(x) ((x) << S_RSPD_GEN1) +#define F_RSPD_GEN1 V_RSPD_GEN1(1U) + +#define S_RSPD_LEN 0 +#define M_RSPD_LEN 0x7FFFFFFF +#define V_RSPD_LEN(x) ((x) << S_RSPD_LEN) +#define G_RSPD_LEN(x) (((x) >> S_RSPD_LEN) & M_RSPD_LEN) + +#define S_RSPD_FLQ 31 +#define V_RSPD_FLQ(x) ((x) << S_RSPD_FLQ) +#define F_RSPD_FLQ V_RSPD_FLQ(1U) + +#define S_RSPD_GEN2 0 +#define V_RSPD_GEN2(x) ((x) << S_RSPD_GEN2) +#define F_RSPD_GEN2 V_RSPD_GEN2(1U) + +#define S_RSPD_INR_VEC 1 +#define M_RSPD_INR_VEC 0x7F +#define V_RSPD_INR_VEC(x) ((x) << S_RSPD_INR_VEC) +#define G_RSPD_INR_VEC(x) (((x) >> S_RSPD_INR_VEC) & M_RSPD_INR_VEC) + +#endif /* _SGE_DEFS_H */ diff --git a/drivers/net/ethernet/chelsio/cxgb3/t3_cpl.h b/drivers/net/ethernet/chelsio/cxgb3/t3_cpl.h new file mode 100644 index 0000000..852c399 --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb3/t3_cpl.h @@ -0,0 +1,1495 @@ +/* + * Copyright (c) 2004-2008 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef T3_CPL_H +#define T3_CPL_H + +#if !defined(__LITTLE_ENDIAN_BITFIELD) && !defined(__BIG_ENDIAN_BITFIELD) +# include +#endif + +enum CPL_opcode { + CPL_PASS_OPEN_REQ = 0x1, + CPL_PASS_ACCEPT_RPL = 0x2, + CPL_ACT_OPEN_REQ = 0x3, + CPL_SET_TCB = 0x4, + CPL_SET_TCB_FIELD = 0x5, + CPL_GET_TCB = 0x6, + CPL_PCMD = 0x7, + CPL_CLOSE_CON_REQ = 0x8, + CPL_CLOSE_LISTSRV_REQ = 0x9, + CPL_ABORT_REQ = 0xA, + CPL_ABORT_RPL = 0xB, + CPL_TX_DATA = 0xC, + CPL_RX_DATA_ACK = 0xD, + CPL_TX_PKT = 0xE, + CPL_RTE_DELETE_REQ = 0xF, + CPL_RTE_WRITE_REQ = 0x10, + CPL_RTE_READ_REQ = 0x11, + CPL_L2T_WRITE_REQ = 0x12, + CPL_L2T_READ_REQ = 0x13, + CPL_SMT_WRITE_REQ = 0x14, + CPL_SMT_READ_REQ = 0x15, + CPL_TX_PKT_LSO = 0x16, + CPL_PCMD_READ = 0x17, + CPL_BARRIER = 0x18, + CPL_TID_RELEASE = 0x1A, + + CPL_CLOSE_LISTSRV_RPL = 0x20, + CPL_ERROR = 0x21, + CPL_GET_TCB_RPL = 0x22, + CPL_L2T_WRITE_RPL = 0x23, + CPL_PCMD_READ_RPL = 0x24, + CPL_PCMD_RPL = 0x25, + CPL_PEER_CLOSE = 0x26, + CPL_RTE_DELETE_RPL = 0x27, + CPL_RTE_WRITE_RPL = 0x28, + CPL_RX_DDP_COMPLETE = 0x29, + CPL_RX_PHYS_ADDR = 0x2A, + CPL_RX_PKT = 0x2B, + CPL_RX_URG_NOTIFY = 0x2C, + CPL_SET_TCB_RPL = 0x2D, + CPL_SMT_WRITE_RPL = 0x2E, + CPL_TX_DATA_ACK = 0x2F, + + CPL_ABORT_REQ_RSS = 0x30, + CPL_ABORT_RPL_RSS = 0x31, + CPL_CLOSE_CON_RPL = 0x32, + CPL_ISCSI_HDR = 0x33, + CPL_L2T_READ_RPL = 0x34, + CPL_RDMA_CQE = 0x35, + CPL_RDMA_CQE_READ_RSP = 0x36, + CPL_RDMA_CQE_ERR = 0x37, + CPL_RTE_READ_RPL = 0x38, + CPL_RX_DATA = 0x39, + + CPL_ACT_OPEN_RPL = 0x40, + CPL_PASS_OPEN_RPL = 0x41, + CPL_RX_DATA_DDP = 0x42, + CPL_SMT_READ_RPL = 0x43, + + CPL_ACT_ESTABLISH = 0x50, + CPL_PASS_ESTABLISH = 0x51, + + CPL_PASS_ACCEPT_REQ = 0x70, + + CPL_ASYNC_NOTIF = 0x80, /* fake opcode for async notifications */ + + CPL_TX_DMA_ACK = 0xA0, + CPL_RDMA_READ_REQ = 0xA1, + CPL_RDMA_TERMINATE = 0xA2, + CPL_TRACE_PKT = 0xA3, + CPL_RDMA_EC_STATUS = 0xA5, + + NUM_CPL_CMDS /* must be last and previous entries must be sorted */ +}; + +enum CPL_error { + CPL_ERR_NONE = 0, + CPL_ERR_TCAM_PARITY = 1, + CPL_ERR_TCAM_FULL = 3, + CPL_ERR_CONN_RESET = 20, + CPL_ERR_CONN_EXIST = 22, + CPL_ERR_ARP_MISS = 23, + CPL_ERR_BAD_SYN = 24, + CPL_ERR_CONN_TIMEDOUT = 30, + CPL_ERR_XMIT_TIMEDOUT = 31, + CPL_ERR_PERSIST_TIMEDOUT = 32, + CPL_ERR_FINWAIT2_TIMEDOUT = 33, + CPL_ERR_KEEPALIVE_TIMEDOUT = 34, + CPL_ERR_RTX_NEG_ADVICE = 35, + CPL_ERR_PERSIST_NEG_ADVICE = 36, + CPL_ERR_ABORT_FAILED = 42, + CPL_ERR_GENERAL = 99 +}; + +enum { + CPL_CONN_POLICY_AUTO = 0, + CPL_CONN_POLICY_ASK = 1, + CPL_CONN_POLICY_DENY = 3 +}; + +enum { + ULP_MODE_NONE = 0, + ULP_MODE_ISCSI = 2, + ULP_MODE_RDMA = 4, + ULP_MODE_TCPDDP = 5 +}; + +enum { + ULP_CRC_HEADER = 1 << 0, + ULP_CRC_DATA = 1 << 1 +}; + +enum { + CPL_PASS_OPEN_ACCEPT, + CPL_PASS_OPEN_REJECT +}; + +enum { + CPL_ABORT_SEND_RST = 0, + CPL_ABORT_NO_RST, + CPL_ABORT_POST_CLOSE_REQ = 2 +}; + +enum { /* TX_PKT_LSO ethernet types */ + CPL_ETH_II, + CPL_ETH_II_VLAN, + CPL_ETH_802_3, + CPL_ETH_802_3_VLAN +}; + +enum { /* TCP congestion control algorithms */ + CONG_ALG_RENO, + CONG_ALG_TAHOE, + CONG_ALG_NEWRENO, + CONG_ALG_HIGHSPEED +}; + +enum { /* RSS hash type */ + RSS_HASH_NONE = 0, + RSS_HASH_2_TUPLE = 1, + RSS_HASH_4_TUPLE = 2, + RSS_HASH_TCPV6 = 3 +}; + +union opcode_tid { + __be32 opcode_tid; + __u8 opcode; +}; + +#define S_OPCODE 24 +#define V_OPCODE(x) ((x) << S_OPCODE) +#define G_OPCODE(x) (((x) >> S_OPCODE) & 0xFF) +#define G_TID(x) ((x) & 0xFFFFFF) + +#define S_QNUM 0 +#define G_QNUM(x) (((x) >> S_QNUM) & 0xFFFF) + +#define S_HASHTYPE 22 +#define M_HASHTYPE 0x3 +#define G_HASHTYPE(x) (((x) >> S_HASHTYPE) & M_HASHTYPE) + +/* tid is assumed to be 24-bits */ +#define MK_OPCODE_TID(opcode, tid) (V_OPCODE(opcode) | (tid)) + +#define OPCODE_TID(cmd) ((cmd)->ot.opcode_tid) + +/* extract the TID from a CPL command */ +#define GET_TID(cmd) (G_TID(ntohl(OPCODE_TID(cmd)))) + +struct tcp_options { + __be16 mss; + __u8 wsf; +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u8:5; + __u8 ecn:1; + __u8 sack:1; + __u8 tstamp:1; +#else + __u8 tstamp:1; + __u8 sack:1; + __u8 ecn:1; + __u8:5; +#endif +}; + +struct rss_header { + __u8 opcode; +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u8 cpu_idx:6; + __u8 hash_type:2; +#else + __u8 hash_type:2; + __u8 cpu_idx:6; +#endif + __be16 cq_idx; + __be32 rss_hash_val; +}; + +#ifndef CHELSIO_FW +struct work_request_hdr { + __be32 wr_hi; + __be32 wr_lo; +}; + +/* wr_hi fields */ +#define S_WR_SGE_CREDITS 0 +#define M_WR_SGE_CREDITS 0xFF +#define V_WR_SGE_CREDITS(x) ((x) << S_WR_SGE_CREDITS) +#define G_WR_SGE_CREDITS(x) (((x) >> S_WR_SGE_CREDITS) & M_WR_SGE_CREDITS) + +#define S_WR_SGLSFLT 8 +#define M_WR_SGLSFLT 0xFF +#define V_WR_SGLSFLT(x) ((x) << S_WR_SGLSFLT) +#define G_WR_SGLSFLT(x) (((x) >> S_WR_SGLSFLT) & M_WR_SGLSFLT) + +#define S_WR_BCNTLFLT 16 +#define M_WR_BCNTLFLT 0xF +#define V_WR_BCNTLFLT(x) ((x) << S_WR_BCNTLFLT) +#define G_WR_BCNTLFLT(x) (((x) >> S_WR_BCNTLFLT) & M_WR_BCNTLFLT) + +#define S_WR_DATATYPE 20 +#define V_WR_DATATYPE(x) ((x) << S_WR_DATATYPE) +#define F_WR_DATATYPE V_WR_DATATYPE(1U) + +#define S_WR_COMPL 21 +#define V_WR_COMPL(x) ((x) << S_WR_COMPL) +#define F_WR_COMPL V_WR_COMPL(1U) + +#define S_WR_EOP 22 +#define V_WR_EOP(x) ((x) << S_WR_EOP) +#define F_WR_EOP V_WR_EOP(1U) + +#define S_WR_SOP 23 +#define V_WR_SOP(x) ((x) << S_WR_SOP) +#define F_WR_SOP V_WR_SOP(1U) + +#define S_WR_OP 24 +#define M_WR_OP 0xFF +#define V_WR_OP(x) ((x) << S_WR_OP) +#define G_WR_OP(x) (((x) >> S_WR_OP) & M_WR_OP) + +/* wr_lo fields */ +#define S_WR_LEN 0 +#define M_WR_LEN 0xFF +#define V_WR_LEN(x) ((x) << S_WR_LEN) +#define G_WR_LEN(x) (((x) >> S_WR_LEN) & M_WR_LEN) + +#define S_WR_TID 8 +#define M_WR_TID 0xFFFFF +#define V_WR_TID(x) ((x) << S_WR_TID) +#define G_WR_TID(x) (((x) >> S_WR_TID) & M_WR_TID) + +#define S_WR_CR_FLUSH 30 +#define V_WR_CR_FLUSH(x) ((x) << S_WR_CR_FLUSH) +#define F_WR_CR_FLUSH V_WR_CR_FLUSH(1U) + +#define S_WR_GEN 31 +#define V_WR_GEN(x) ((x) << S_WR_GEN) +#define F_WR_GEN V_WR_GEN(1U) + +# define WR_HDR struct work_request_hdr wr +# define RSS_HDR +#else +# define WR_HDR +# define RSS_HDR struct rss_header rss_hdr; +#endif + +/* option 0 lower-half fields */ +#define S_CPL_STATUS 0 +#define M_CPL_STATUS 0xFF +#define V_CPL_STATUS(x) ((x) << S_CPL_STATUS) +#define G_CPL_STATUS(x) (((x) >> S_CPL_STATUS) & M_CPL_STATUS) + +#define S_INJECT_TIMER 6 +#define V_INJECT_TIMER(x) ((x) << S_INJECT_TIMER) +#define F_INJECT_TIMER V_INJECT_TIMER(1U) + +#define S_NO_OFFLOAD 7 +#define V_NO_OFFLOAD(x) ((x) << S_NO_OFFLOAD) +#define F_NO_OFFLOAD V_NO_OFFLOAD(1U) + +#define S_ULP_MODE 8 +#define M_ULP_MODE 0xF +#define V_ULP_MODE(x) ((x) << S_ULP_MODE) +#define G_ULP_MODE(x) (((x) >> S_ULP_MODE) & M_ULP_MODE) + +#define S_RCV_BUFSIZ 12 +#define M_RCV_BUFSIZ 0x3FFF +#define V_RCV_BUFSIZ(x) ((x) << S_RCV_BUFSIZ) +#define G_RCV_BUFSIZ(x) (((x) >> S_RCV_BUFSIZ) & M_RCV_BUFSIZ) + +#define S_TOS 26 +#define M_TOS 0x3F +#define V_TOS(x) ((x) << S_TOS) +#define G_TOS(x) (((x) >> S_TOS) & M_TOS) + +/* option 0 upper-half fields */ +#define S_DELACK 0 +#define V_DELACK(x) ((x) << S_DELACK) +#define F_DELACK V_DELACK(1U) + +#define S_NO_CONG 1 +#define V_NO_CONG(x) ((x) << S_NO_CONG) +#define F_NO_CONG V_NO_CONG(1U) + +#define S_SRC_MAC_SEL 2 +#define M_SRC_MAC_SEL 0x3 +#define V_SRC_MAC_SEL(x) ((x) << S_SRC_MAC_SEL) +#define G_SRC_MAC_SEL(x) (((x) >> S_SRC_MAC_SEL) & M_SRC_MAC_SEL) + +#define S_L2T_IDX 4 +#define M_L2T_IDX 0x7FF +#define V_L2T_IDX(x) ((x) << S_L2T_IDX) +#define G_L2T_IDX(x) (((x) >> S_L2T_IDX) & M_L2T_IDX) + +#define S_TX_CHANNEL 15 +#define V_TX_CHANNEL(x) ((x) << S_TX_CHANNEL) +#define F_TX_CHANNEL V_TX_CHANNEL(1U) + +#define S_TCAM_BYPASS 16 +#define V_TCAM_BYPASS(x) ((x) << S_TCAM_BYPASS) +#define F_TCAM_BYPASS V_TCAM_BYPASS(1U) + +#define S_NAGLE 17 +#define V_NAGLE(x) ((x) << S_NAGLE) +#define F_NAGLE V_NAGLE(1U) + +#define S_WND_SCALE 18 +#define M_WND_SCALE 0xF +#define V_WND_SCALE(x) ((x) << S_WND_SCALE) +#define G_WND_SCALE(x) (((x) >> S_WND_SCALE) & M_WND_SCALE) + +#define S_KEEP_ALIVE 22 +#define V_KEEP_ALIVE(x) ((x) << S_KEEP_ALIVE) +#define F_KEEP_ALIVE V_KEEP_ALIVE(1U) + +#define S_MAX_RETRANS 23 +#define M_MAX_RETRANS 0xF +#define V_MAX_RETRANS(x) ((x) << S_MAX_RETRANS) +#define G_MAX_RETRANS(x) (((x) >> S_MAX_RETRANS) & M_MAX_RETRANS) + +#define S_MAX_RETRANS_OVERRIDE 27 +#define V_MAX_RETRANS_OVERRIDE(x) ((x) << S_MAX_RETRANS_OVERRIDE) +#define F_MAX_RETRANS_OVERRIDE V_MAX_RETRANS_OVERRIDE(1U) + +#define S_MSS_IDX 28 +#define M_MSS_IDX 0xF +#define V_MSS_IDX(x) ((x) << S_MSS_IDX) +#define G_MSS_IDX(x) (((x) >> S_MSS_IDX) & M_MSS_IDX) + +/* option 1 fields */ +#define S_RSS_ENABLE 0 +#define V_RSS_ENABLE(x) ((x) << S_RSS_ENABLE) +#define F_RSS_ENABLE V_RSS_ENABLE(1U) + +#define S_RSS_MASK_LEN 1 +#define M_RSS_MASK_LEN 0x7 +#define V_RSS_MASK_LEN(x) ((x) << S_RSS_MASK_LEN) +#define G_RSS_MASK_LEN(x) (((x) >> S_RSS_MASK_LEN) & M_RSS_MASK_LEN) + +#define S_CPU_IDX 4 +#define M_CPU_IDX 0x3F +#define V_CPU_IDX(x) ((x) << S_CPU_IDX) +#define G_CPU_IDX(x) (((x) >> S_CPU_IDX) & M_CPU_IDX) + +#define S_MAC_MATCH_VALID 18 +#define V_MAC_MATCH_VALID(x) ((x) << S_MAC_MATCH_VALID) +#define F_MAC_MATCH_VALID V_MAC_MATCH_VALID(1U) + +#define S_CONN_POLICY 19 +#define M_CONN_POLICY 0x3 +#define V_CONN_POLICY(x) ((x) << S_CONN_POLICY) +#define G_CONN_POLICY(x) (((x) >> S_CONN_POLICY) & M_CONN_POLICY) + +#define S_SYN_DEFENSE 21 +#define V_SYN_DEFENSE(x) ((x) << S_SYN_DEFENSE) +#define F_SYN_DEFENSE V_SYN_DEFENSE(1U) + +#define S_VLAN_PRI 22 +#define M_VLAN_PRI 0x3 +#define V_VLAN_PRI(x) ((x) << S_VLAN_PRI) +#define G_VLAN_PRI(x) (((x) >> S_VLAN_PRI) & M_VLAN_PRI) + +#define S_VLAN_PRI_VALID 24 +#define V_VLAN_PRI_VALID(x) ((x) << S_VLAN_PRI_VALID) +#define F_VLAN_PRI_VALID V_VLAN_PRI_VALID(1U) + +#define S_PKT_TYPE 25 +#define M_PKT_TYPE 0x3 +#define V_PKT_TYPE(x) ((x) << S_PKT_TYPE) +#define G_PKT_TYPE(x) (((x) >> S_PKT_TYPE) & M_PKT_TYPE) + +#define S_MAC_MATCH 27 +#define M_MAC_MATCH 0x1F +#define V_MAC_MATCH(x) ((x) << S_MAC_MATCH) +#define G_MAC_MATCH(x) (((x) >> S_MAC_MATCH) & M_MAC_MATCH) + +/* option 2 fields */ +#define S_CPU_INDEX 0 +#define M_CPU_INDEX 0x7F +#define V_CPU_INDEX(x) ((x) << S_CPU_INDEX) +#define G_CPU_INDEX(x) (((x) >> S_CPU_INDEX) & M_CPU_INDEX) + +#define S_CPU_INDEX_VALID 7 +#define V_CPU_INDEX_VALID(x) ((x) << S_CPU_INDEX_VALID) +#define F_CPU_INDEX_VALID V_CPU_INDEX_VALID(1U) + +#define S_RX_COALESCE 8 +#define M_RX_COALESCE 0x3 +#define V_RX_COALESCE(x) ((x) << S_RX_COALESCE) +#define G_RX_COALESCE(x) (((x) >> S_RX_COALESCE) & M_RX_COALESCE) + +#define S_RX_COALESCE_VALID 10 +#define V_RX_COALESCE_VALID(x) ((x) << S_RX_COALESCE_VALID) +#define F_RX_COALESCE_VALID V_RX_COALESCE_VALID(1U) + +#define S_CONG_CONTROL_FLAVOR 11 +#define M_CONG_CONTROL_FLAVOR 0x3 +#define V_CONG_CONTROL_FLAVOR(x) ((x) << S_CONG_CONTROL_FLAVOR) +#define G_CONG_CONTROL_FLAVOR(x) (((x) >> S_CONG_CONTROL_FLAVOR) & M_CONG_CONTROL_FLAVOR) + +#define S_PACING_FLAVOR 13 +#define M_PACING_FLAVOR 0x3 +#define V_PACING_FLAVOR(x) ((x) << S_PACING_FLAVOR) +#define G_PACING_FLAVOR(x) (((x) >> S_PACING_FLAVOR) & M_PACING_FLAVOR) + +#define S_FLAVORS_VALID 15 +#define V_FLAVORS_VALID(x) ((x) << S_FLAVORS_VALID) +#define F_FLAVORS_VALID V_FLAVORS_VALID(1U) + +#define S_RX_FC_DISABLE 16 +#define V_RX_FC_DISABLE(x) ((x) << S_RX_FC_DISABLE) +#define F_RX_FC_DISABLE V_RX_FC_DISABLE(1U) + +#define S_RX_FC_VALID 17 +#define V_RX_FC_VALID(x) ((x) << S_RX_FC_VALID) +#define F_RX_FC_VALID V_RX_FC_VALID(1U) + +struct cpl_pass_open_req { + WR_HDR; + union opcode_tid ot; + __be16 local_port; + __be16 peer_port; + __be32 local_ip; + __be32 peer_ip; + __be32 opt0h; + __be32 opt0l; + __be32 peer_netmask; + __be32 opt1; +}; + +struct cpl_pass_open_rpl { + RSS_HDR union opcode_tid ot; + __be16 local_port; + __be16 peer_port; + __be32 local_ip; + __be32 peer_ip; + __u8 resvd[7]; + __u8 status; +}; + +struct cpl_pass_establish { + RSS_HDR union opcode_tid ot; + __be16 local_port; + __be16 peer_port; + __be32 local_ip; + __be32 peer_ip; + __be32 tos_tid; + __be16 l2t_idx; + __be16 tcp_opt; + __be32 snd_isn; + __be32 rcv_isn; +}; + +/* cpl_pass_establish.tos_tid fields */ +#define S_PASS_OPEN_TID 0 +#define M_PASS_OPEN_TID 0xFFFFFF +#define V_PASS_OPEN_TID(x) ((x) << S_PASS_OPEN_TID) +#define G_PASS_OPEN_TID(x) (((x) >> S_PASS_OPEN_TID) & M_PASS_OPEN_TID) + +#define S_PASS_OPEN_TOS 24 +#define M_PASS_OPEN_TOS 0xFF +#define V_PASS_OPEN_TOS(x) ((x) << S_PASS_OPEN_TOS) +#define G_PASS_OPEN_TOS(x) (((x) >> S_PASS_OPEN_TOS) & M_PASS_OPEN_TOS) + +/* cpl_pass_establish.l2t_idx fields */ +#define S_L2T_IDX16 5 +#define M_L2T_IDX16 0x7FF +#define V_L2T_IDX16(x) ((x) << S_L2T_IDX16) +#define G_L2T_IDX16(x) (((x) >> S_L2T_IDX16) & M_L2T_IDX16) + +/* cpl_pass_establish.tcp_opt fields (also applies act_open_establish) */ +#define G_TCPOPT_WSCALE_OK(x) (((x) >> 5) & 1) +#define G_TCPOPT_SACK(x) (((x) >> 6) & 1) +#define G_TCPOPT_TSTAMP(x) (((x) >> 7) & 1) +#define G_TCPOPT_SND_WSCALE(x) (((x) >> 8) & 0xf) +#define G_TCPOPT_MSS(x) (((x) >> 12) & 0xf) + +struct cpl_pass_accept_req { + RSS_HDR union opcode_tid ot; + __be16 local_port; + __be16 peer_port; + __be32 local_ip; + __be32 peer_ip; + __be32 tos_tid; + struct tcp_options tcp_options; + __u8 dst_mac[6]; + __be16 vlan_tag; + __u8 src_mac[6]; +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u8:3; + __u8 addr_idx:3; + __u8 port_idx:1; + __u8 exact_match:1; +#else + __u8 exact_match:1; + __u8 port_idx:1; + __u8 addr_idx:3; + __u8:3; +#endif + __u8 rsvd; + __be32 rcv_isn; + __be32 rsvd2; +}; + +struct cpl_pass_accept_rpl { + WR_HDR; + union opcode_tid ot; + __be32 opt2; + __be32 rsvd; + __be32 peer_ip; + __be32 opt0h; + __be32 opt0l_status; +}; + +struct cpl_act_open_req { + WR_HDR; + union opcode_tid ot; + __be16 local_port; + __be16 peer_port; + __be32 local_ip; + __be32 peer_ip; + __be32 opt0h; + __be32 opt0l; + __be32 params; + __be32 opt2; +}; + +/* cpl_act_open_req.params fields */ +#define S_AOPEN_VLAN_PRI 9 +#define M_AOPEN_VLAN_PRI 0x3 +#define V_AOPEN_VLAN_PRI(x) ((x) << S_AOPEN_VLAN_PRI) +#define G_AOPEN_VLAN_PRI(x) (((x) >> S_AOPEN_VLAN_PRI) & M_AOPEN_VLAN_PRI) + +#define S_AOPEN_VLAN_PRI_VALID 11 +#define V_AOPEN_VLAN_PRI_VALID(x) ((x) << S_AOPEN_VLAN_PRI_VALID) +#define F_AOPEN_VLAN_PRI_VALID V_AOPEN_VLAN_PRI_VALID(1U) + +#define S_AOPEN_PKT_TYPE 12 +#define M_AOPEN_PKT_TYPE 0x3 +#define V_AOPEN_PKT_TYPE(x) ((x) << S_AOPEN_PKT_TYPE) +#define G_AOPEN_PKT_TYPE(x) (((x) >> S_AOPEN_PKT_TYPE) & M_AOPEN_PKT_TYPE) + +#define S_AOPEN_MAC_MATCH 14 +#define M_AOPEN_MAC_MATCH 0x1F +#define V_AOPEN_MAC_MATCH(x) ((x) << S_AOPEN_MAC_MATCH) +#define G_AOPEN_MAC_MATCH(x) (((x) >> S_AOPEN_MAC_MATCH) & M_AOPEN_MAC_MATCH) + +#define S_AOPEN_MAC_MATCH_VALID 19 +#define V_AOPEN_MAC_MATCH_VALID(x) ((x) << S_AOPEN_MAC_MATCH_VALID) +#define F_AOPEN_MAC_MATCH_VALID V_AOPEN_MAC_MATCH_VALID(1U) + +#define S_AOPEN_IFF_VLAN 20 +#define M_AOPEN_IFF_VLAN 0xFFF +#define V_AOPEN_IFF_VLAN(x) ((x) << S_AOPEN_IFF_VLAN) +#define G_AOPEN_IFF_VLAN(x) (((x) >> S_AOPEN_IFF_VLAN) & M_AOPEN_IFF_VLAN) + +struct cpl_act_open_rpl { + RSS_HDR union opcode_tid ot; + __be16 local_port; + __be16 peer_port; + __be32 local_ip; + __be32 peer_ip; + __be32 atid; + __u8 rsvd[3]; + __u8 status; +}; + +struct cpl_act_establish { + RSS_HDR union opcode_tid ot; + __be16 local_port; + __be16 peer_port; + __be32 local_ip; + __be32 peer_ip; + __be32 tos_tid; + __be16 l2t_idx; + __be16 tcp_opt; + __be32 snd_isn; + __be32 rcv_isn; +}; + +struct cpl_get_tcb { + WR_HDR; + union opcode_tid ot; + __be16 cpuno; + __be16 rsvd; +}; + +struct cpl_get_tcb_rpl { + RSS_HDR union opcode_tid ot; + __u8 rsvd; + __u8 status; + __be16 len; +}; + +struct cpl_set_tcb { + WR_HDR; + union opcode_tid ot; + __u8 reply; + __u8 cpu_idx; + __be16 len; +}; + +/* cpl_set_tcb.reply fields */ +#define S_NO_REPLY 7 +#define V_NO_REPLY(x) ((x) << S_NO_REPLY) +#define F_NO_REPLY V_NO_REPLY(1U) + +struct cpl_set_tcb_field { + WR_HDR; + union opcode_tid ot; + __u8 reply; + __u8 cpu_idx; + __be16 word; + __be64 mask; + __be64 val; +}; + +struct cpl_set_tcb_rpl { + RSS_HDR union opcode_tid ot; + __u8 rsvd[3]; + __u8 status; +}; + +struct cpl_pcmd { + WR_HDR; + union opcode_tid ot; + __u8 rsvd[3]; +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u8 src:1; + __u8 bundle:1; + __u8 channel:1; + __u8:5; +#else + __u8:5; + __u8 channel:1; + __u8 bundle:1; + __u8 src:1; +#endif + __be32 pcmd_parm[2]; +}; + +struct cpl_pcmd_reply { + RSS_HDR union opcode_tid ot; + __u8 status; + __u8 rsvd; + __be16 len; +}; + +struct cpl_close_con_req { + WR_HDR; + union opcode_tid ot; + __be32 rsvd; +}; + +struct cpl_close_con_rpl { + RSS_HDR union opcode_tid ot; + __u8 rsvd[3]; + __u8 status; + __be32 snd_nxt; + __be32 rcv_nxt; +}; + +struct cpl_close_listserv_req { + WR_HDR; + union opcode_tid ot; + __u8 rsvd0; + __u8 cpu_idx; + __be16 rsvd1; +}; + +struct cpl_close_listserv_rpl { + RSS_HDR union opcode_tid ot; + __u8 rsvd[3]; + __u8 status; +}; + +struct cpl_abort_req_rss { + RSS_HDR union opcode_tid ot; + __be32 rsvd0; + __u8 rsvd1; + __u8 status; + __u8 rsvd2[6]; +}; + +struct cpl_abort_req { + WR_HDR; + union opcode_tid ot; + __be32 rsvd0; + __u8 rsvd1; + __u8 cmd; + __u8 rsvd2[6]; +}; + +struct cpl_abort_rpl_rss { + RSS_HDR union opcode_tid ot; + __be32 rsvd0; + __u8 rsvd1; + __u8 status; + __u8 rsvd2[6]; +}; + +struct cpl_abort_rpl { + WR_HDR; + union opcode_tid ot; + __be32 rsvd0; + __u8 rsvd1; + __u8 cmd; + __u8 rsvd2[6]; +}; + +struct cpl_peer_close { + RSS_HDR union opcode_tid ot; + __be32 rcv_nxt; +}; + +struct tx_data_wr { + __be32 wr_hi; + __be32 wr_lo; + __be32 len; + __be32 flags; + __be32 sndseq; + __be32 param; +}; + +/* tx_data_wr.flags fields */ +#define S_TX_ACK_PAGES 21 +#define M_TX_ACK_PAGES 0x7 +#define V_TX_ACK_PAGES(x) ((x) << S_TX_ACK_PAGES) +#define G_TX_ACK_PAGES(x) (((x) >> S_TX_ACK_PAGES) & M_TX_ACK_PAGES) + +/* tx_data_wr.param fields */ +#define S_TX_PORT 0 +#define M_TX_PORT 0x7 +#define V_TX_PORT(x) ((x) << S_TX_PORT) +#define G_TX_PORT(x) (((x) >> S_TX_PORT) & M_TX_PORT) + +#define S_TX_MSS 4 +#define M_TX_MSS 0xF +#define V_TX_MSS(x) ((x) << S_TX_MSS) +#define G_TX_MSS(x) (((x) >> S_TX_MSS) & M_TX_MSS) + +#define S_TX_QOS 8 +#define M_TX_QOS 0xFF +#define V_TX_QOS(x) ((x) << S_TX_QOS) +#define G_TX_QOS(x) (((x) >> S_TX_QOS) & M_TX_QOS) + +#define S_TX_SNDBUF 16 +#define M_TX_SNDBUF 0xFFFF +#define V_TX_SNDBUF(x) ((x) << S_TX_SNDBUF) +#define G_TX_SNDBUF(x) (((x) >> S_TX_SNDBUF) & M_TX_SNDBUF) + +struct cpl_tx_data { + union opcode_tid ot; + __be32 len; + __be32 rsvd; + __be16 urg; + __be16 flags; +}; + +/* cpl_tx_data.flags fields */ +#define S_TX_ULP_SUBMODE 6 +#define M_TX_ULP_SUBMODE 0xF +#define V_TX_ULP_SUBMODE(x) ((x) << S_TX_ULP_SUBMODE) +#define G_TX_ULP_SUBMODE(x) (((x) >> S_TX_ULP_SUBMODE) & M_TX_ULP_SUBMODE) + +#define S_TX_ULP_MODE 10 +#define M_TX_ULP_MODE 0xF +#define V_TX_ULP_MODE(x) ((x) << S_TX_ULP_MODE) +#define G_TX_ULP_MODE(x) (((x) >> S_TX_ULP_MODE) & M_TX_ULP_MODE) + +#define S_TX_SHOVE 14 +#define V_TX_SHOVE(x) ((x) << S_TX_SHOVE) +#define F_TX_SHOVE V_TX_SHOVE(1U) + +#define S_TX_MORE 15 +#define V_TX_MORE(x) ((x) << S_TX_MORE) +#define F_TX_MORE V_TX_MORE(1U) + +/* additional tx_data_wr.flags fields */ +#define S_TX_CPU_IDX 0 +#define M_TX_CPU_IDX 0x3F +#define V_TX_CPU_IDX(x) ((x) << S_TX_CPU_IDX) +#define G_TX_CPU_IDX(x) (((x) >> S_TX_CPU_IDX) & M_TX_CPU_IDX) + +#define S_TX_URG 16 +#define V_TX_URG(x) ((x) << S_TX_URG) +#define F_TX_URG V_TX_URG(1U) + +#define S_TX_CLOSE 17 +#define V_TX_CLOSE(x) ((x) << S_TX_CLOSE) +#define F_TX_CLOSE V_TX_CLOSE(1U) + +#define S_TX_INIT 18 +#define V_TX_INIT(x) ((x) << S_TX_INIT) +#define F_TX_INIT V_TX_INIT(1U) + +#define S_TX_IMM_ACK 19 +#define V_TX_IMM_ACK(x) ((x) << S_TX_IMM_ACK) +#define F_TX_IMM_ACK V_TX_IMM_ACK(1U) + +#define S_TX_IMM_DMA 20 +#define V_TX_IMM_DMA(x) ((x) << S_TX_IMM_DMA) +#define F_TX_IMM_DMA V_TX_IMM_DMA(1U) + +struct cpl_tx_data_ack { + RSS_HDR union opcode_tid ot; + __be32 ack_seq; +}; + +struct cpl_wr_ack { + RSS_HDR union opcode_tid ot; + __be16 credits; + __be16 rsvd; + __be32 snd_nxt; + __be32 snd_una; +}; + +struct cpl_rdma_ec_status { + RSS_HDR union opcode_tid ot; + __u8 rsvd[3]; + __u8 status; +}; + +struct mngt_pktsched_wr { + __be32 wr_hi; + __be32 wr_lo; + __u8 mngt_opcode; + __u8 rsvd[7]; + __u8 sched; + __u8 idx; + __u8 min; + __u8 max; + __u8 binding; + __u8 rsvd1[3]; +}; + +struct cpl_iscsi_hdr { + RSS_HDR union opcode_tid ot; + __be16 pdu_len_ddp; + __be16 len; + __be32 seq; + __be16 urg; + __u8 rsvd; + __u8 status; +}; + +/* cpl_iscsi_hdr.pdu_len_ddp fields */ +#define S_ISCSI_PDU_LEN 0 +#define M_ISCSI_PDU_LEN 0x7FFF +#define V_ISCSI_PDU_LEN(x) ((x) << S_ISCSI_PDU_LEN) +#define G_ISCSI_PDU_LEN(x) (((x) >> S_ISCSI_PDU_LEN) & M_ISCSI_PDU_LEN) + +#define S_ISCSI_DDP 15 +#define V_ISCSI_DDP(x) ((x) << S_ISCSI_DDP) +#define F_ISCSI_DDP V_ISCSI_DDP(1U) + +struct cpl_rx_data { + RSS_HDR union opcode_tid ot; + __be16 rsvd; + __be16 len; + __be32 seq; + __be16 urg; +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u8 dack_mode:2; + __u8 psh:1; + __u8 heartbeat:1; + __u8:4; +#else + __u8:4; + __u8 heartbeat:1; + __u8 psh:1; + __u8 dack_mode:2; +#endif + __u8 status; +}; + +struct cpl_rx_data_ack { + WR_HDR; + union opcode_tid ot; + __be32 credit_dack; +}; + +/* cpl_rx_data_ack.ack_seq fields */ +#define S_RX_CREDITS 0 +#define M_RX_CREDITS 0x7FFFFFF +#define V_RX_CREDITS(x) ((x) << S_RX_CREDITS) +#define G_RX_CREDITS(x) (((x) >> S_RX_CREDITS) & M_RX_CREDITS) + +#define S_RX_MODULATE 27 +#define V_RX_MODULATE(x) ((x) << S_RX_MODULATE) +#define F_RX_MODULATE V_RX_MODULATE(1U) + +#define S_RX_FORCE_ACK 28 +#define V_RX_FORCE_ACK(x) ((x) << S_RX_FORCE_ACK) +#define F_RX_FORCE_ACK V_RX_FORCE_ACK(1U) + +#define S_RX_DACK_MODE 29 +#define M_RX_DACK_MODE 0x3 +#define V_RX_DACK_MODE(x) ((x) << S_RX_DACK_MODE) +#define G_RX_DACK_MODE(x) (((x) >> S_RX_DACK_MODE) & M_RX_DACK_MODE) + +#define S_RX_DACK_CHANGE 31 +#define V_RX_DACK_CHANGE(x) ((x) << S_RX_DACK_CHANGE) +#define F_RX_DACK_CHANGE V_RX_DACK_CHANGE(1U) + +struct cpl_rx_urg_notify { + RSS_HDR union opcode_tid ot; + __be32 seq; +}; + +struct cpl_rx_ddp_complete { + RSS_HDR union opcode_tid ot; + __be32 ddp_report; +}; + +struct cpl_rx_data_ddp { + RSS_HDR union opcode_tid ot; + __be16 urg; + __be16 len; + __be32 seq; + union { + __be32 nxt_seq; + __be32 ddp_report; + }; + __be32 ulp_crc; + __be32 ddpvld_status; +}; + +/* cpl_rx_data_ddp.ddpvld_status fields */ +#define S_DDP_STATUS 0 +#define M_DDP_STATUS 0xFF +#define V_DDP_STATUS(x) ((x) << S_DDP_STATUS) +#define G_DDP_STATUS(x) (((x) >> S_DDP_STATUS) & M_DDP_STATUS) + +#define S_DDP_VALID 15 +#define M_DDP_VALID 0x1FFFF +#define V_DDP_VALID(x) ((x) << S_DDP_VALID) +#define G_DDP_VALID(x) (((x) >> S_DDP_VALID) & M_DDP_VALID) + +#define S_DDP_PPOD_MISMATCH 15 +#define V_DDP_PPOD_MISMATCH(x) ((x) << S_DDP_PPOD_MISMATCH) +#define F_DDP_PPOD_MISMATCH V_DDP_PPOD_MISMATCH(1U) + +#define S_DDP_PDU 16 +#define V_DDP_PDU(x) ((x) << S_DDP_PDU) +#define F_DDP_PDU V_DDP_PDU(1U) + +#define S_DDP_LLIMIT_ERR 17 +#define V_DDP_LLIMIT_ERR(x) ((x) << S_DDP_LLIMIT_ERR) +#define F_DDP_LLIMIT_ERR V_DDP_LLIMIT_ERR(1U) + +#define S_DDP_PPOD_PARITY_ERR 18 +#define V_DDP_PPOD_PARITY_ERR(x) ((x) << S_DDP_PPOD_PARITY_ERR) +#define F_DDP_PPOD_PARITY_ERR V_DDP_PPOD_PARITY_ERR(1U) + +#define S_DDP_PADDING_ERR 19 +#define V_DDP_PADDING_ERR(x) ((x) << S_DDP_PADDING_ERR) +#define F_DDP_PADDING_ERR V_DDP_PADDING_ERR(1U) + +#define S_DDP_HDRCRC_ERR 20 +#define V_DDP_HDRCRC_ERR(x) ((x) << S_DDP_HDRCRC_ERR) +#define F_DDP_HDRCRC_ERR V_DDP_HDRCRC_ERR(1U) + +#define S_DDP_DATACRC_ERR 21 +#define V_DDP_DATACRC_ERR(x) ((x) << S_DDP_DATACRC_ERR) +#define F_DDP_DATACRC_ERR V_DDP_DATACRC_ERR(1U) + +#define S_DDP_INVALID_TAG 22 +#define V_DDP_INVALID_TAG(x) ((x) << S_DDP_INVALID_TAG) +#define F_DDP_INVALID_TAG V_DDP_INVALID_TAG(1U) + +#define S_DDP_ULIMIT_ERR 23 +#define V_DDP_ULIMIT_ERR(x) ((x) << S_DDP_ULIMIT_ERR) +#define F_DDP_ULIMIT_ERR V_DDP_ULIMIT_ERR(1U) + +#define S_DDP_OFFSET_ERR 24 +#define V_DDP_OFFSET_ERR(x) ((x) << S_DDP_OFFSET_ERR) +#define F_DDP_OFFSET_ERR V_DDP_OFFSET_ERR(1U) + +#define S_DDP_COLOR_ERR 25 +#define V_DDP_COLOR_ERR(x) ((x) << S_DDP_COLOR_ERR) +#define F_DDP_COLOR_ERR V_DDP_COLOR_ERR(1U) + +#define S_DDP_TID_MISMATCH 26 +#define V_DDP_TID_MISMATCH(x) ((x) << S_DDP_TID_MISMATCH) +#define F_DDP_TID_MISMATCH V_DDP_TID_MISMATCH(1U) + +#define S_DDP_INVALID_PPOD 27 +#define V_DDP_INVALID_PPOD(x) ((x) << S_DDP_INVALID_PPOD) +#define F_DDP_INVALID_PPOD V_DDP_INVALID_PPOD(1U) + +#define S_DDP_ULP_MODE 28 +#define M_DDP_ULP_MODE 0xF +#define V_DDP_ULP_MODE(x) ((x) << S_DDP_ULP_MODE) +#define G_DDP_ULP_MODE(x) (((x) >> S_DDP_ULP_MODE) & M_DDP_ULP_MODE) + +/* cpl_rx_data_ddp.ddp_report fields */ +#define S_DDP_OFFSET 0 +#define M_DDP_OFFSET 0x3FFFFF +#define V_DDP_OFFSET(x) ((x) << S_DDP_OFFSET) +#define G_DDP_OFFSET(x) (((x) >> S_DDP_OFFSET) & M_DDP_OFFSET) + +#define S_DDP_URG 24 +#define V_DDP_URG(x) ((x) << S_DDP_URG) +#define F_DDP_URG V_DDP_URG(1U) + +#define S_DDP_PSH 25 +#define V_DDP_PSH(x) ((x) << S_DDP_PSH) +#define F_DDP_PSH V_DDP_PSH(1U) + +#define S_DDP_BUF_COMPLETE 26 +#define V_DDP_BUF_COMPLETE(x) ((x) << S_DDP_BUF_COMPLETE) +#define F_DDP_BUF_COMPLETE V_DDP_BUF_COMPLETE(1U) + +#define S_DDP_BUF_TIMED_OUT 27 +#define V_DDP_BUF_TIMED_OUT(x) ((x) << S_DDP_BUF_TIMED_OUT) +#define F_DDP_BUF_TIMED_OUT V_DDP_BUF_TIMED_OUT(1U) + +#define S_DDP_BUF_IDX 28 +#define V_DDP_BUF_IDX(x) ((x) << S_DDP_BUF_IDX) +#define F_DDP_BUF_IDX V_DDP_BUF_IDX(1U) + +struct cpl_tx_pkt { + WR_HDR; + __be32 cntrl; + __be32 len; +}; + +struct cpl_tx_pkt_lso { + WR_HDR; + __be32 cntrl; + __be32 len; + + __be32 rsvd; + __be32 lso_info; +}; + +/* cpl_tx_pkt*.cntrl fields */ +#define S_TXPKT_VLAN 0 +#define M_TXPKT_VLAN 0xFFFF +#define V_TXPKT_VLAN(x) ((x) << S_TXPKT_VLAN) +#define G_TXPKT_VLAN(x) (((x) >> S_TXPKT_VLAN) & M_TXPKT_VLAN) + +#define S_TXPKT_INTF 16 +#define M_TXPKT_INTF 0xF +#define V_TXPKT_INTF(x) ((x) << S_TXPKT_INTF) +#define G_TXPKT_INTF(x) (((x) >> S_TXPKT_INTF) & M_TXPKT_INTF) + +#define S_TXPKT_IPCSUM_DIS 20 +#define V_TXPKT_IPCSUM_DIS(x) ((x) << S_TXPKT_IPCSUM_DIS) +#define F_TXPKT_IPCSUM_DIS V_TXPKT_IPCSUM_DIS(1U) + +#define S_TXPKT_L4CSUM_DIS 21 +#define V_TXPKT_L4CSUM_DIS(x) ((x) << S_TXPKT_L4CSUM_DIS) +#define F_TXPKT_L4CSUM_DIS V_TXPKT_L4CSUM_DIS(1U) + +#define S_TXPKT_VLAN_VLD 22 +#define V_TXPKT_VLAN_VLD(x) ((x) << S_TXPKT_VLAN_VLD) +#define F_TXPKT_VLAN_VLD V_TXPKT_VLAN_VLD(1U) + +#define S_TXPKT_LOOPBACK 23 +#define V_TXPKT_LOOPBACK(x) ((x) << S_TXPKT_LOOPBACK) +#define F_TXPKT_LOOPBACK V_TXPKT_LOOPBACK(1U) + +#define S_TXPKT_OPCODE 24 +#define M_TXPKT_OPCODE 0xFF +#define V_TXPKT_OPCODE(x) ((x) << S_TXPKT_OPCODE) +#define G_TXPKT_OPCODE(x) (((x) >> S_TXPKT_OPCODE) & M_TXPKT_OPCODE) + +/* cpl_tx_pkt_lso.lso_info fields */ +#define S_LSO_MSS 0 +#define M_LSO_MSS 0x3FFF +#define V_LSO_MSS(x) ((x) << S_LSO_MSS) +#define G_LSO_MSS(x) (((x) >> S_LSO_MSS) & M_LSO_MSS) + +#define S_LSO_ETH_TYPE 14 +#define M_LSO_ETH_TYPE 0x3 +#define V_LSO_ETH_TYPE(x) ((x) << S_LSO_ETH_TYPE) +#define G_LSO_ETH_TYPE(x) (((x) >> S_LSO_ETH_TYPE) & M_LSO_ETH_TYPE) + +#define S_LSO_TCPHDR_WORDS 16 +#define M_LSO_TCPHDR_WORDS 0xF +#define V_LSO_TCPHDR_WORDS(x) ((x) << S_LSO_TCPHDR_WORDS) +#define G_LSO_TCPHDR_WORDS(x) (((x) >> S_LSO_TCPHDR_WORDS) & M_LSO_TCPHDR_WORDS) + +#define S_LSO_IPHDR_WORDS 20 +#define M_LSO_IPHDR_WORDS 0xF +#define V_LSO_IPHDR_WORDS(x) ((x) << S_LSO_IPHDR_WORDS) +#define G_LSO_IPHDR_WORDS(x) (((x) >> S_LSO_IPHDR_WORDS) & M_LSO_IPHDR_WORDS) + +#define S_LSO_IPV6 24 +#define V_LSO_IPV6(x) ((x) << S_LSO_IPV6) +#define F_LSO_IPV6 V_LSO_IPV6(1U) + +struct cpl_trace_pkt { +#ifdef CHELSIO_FW + __u8 rss_opcode; +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u8 err:1; + __u8:7; +#else + __u8:7; + __u8 err:1; +#endif + __u8 rsvd0; +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u8 qid:4; + __u8:4; +#else + __u8:4; + __u8 qid:4; +#endif + __be32 tstamp; +#endif /* CHELSIO_FW */ + + __u8 opcode; +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u8 iff:4; + __u8:4; +#else + __u8:4; + __u8 iff:4; +#endif + __u8 rsvd[4]; + __be16 len; +}; + +struct cpl_rx_pkt { + RSS_HDR __u8 opcode; +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u8 iff:4; + __u8 csum_valid:1; + __u8 ipmi_pkt:1; + __u8 vlan_valid:1; + __u8 fragment:1; +#else + __u8 fragment:1; + __u8 vlan_valid:1; + __u8 ipmi_pkt:1; + __u8 csum_valid:1; + __u8 iff:4; +#endif + __be16 csum; + __be16 vlan; + __be16 len; +}; + +struct cpl_l2t_write_req { + WR_HDR; + union opcode_tid ot; + __be32 params; + __u8 rsvd[2]; + __u8 dst_mac[6]; +}; + +/* cpl_l2t_write_req.params fields */ +#define S_L2T_W_IDX 0 +#define M_L2T_W_IDX 0x7FF +#define V_L2T_W_IDX(x) ((x) << S_L2T_W_IDX) +#define G_L2T_W_IDX(x) (((x) >> S_L2T_W_IDX) & M_L2T_W_IDX) + +#define S_L2T_W_VLAN 11 +#define M_L2T_W_VLAN 0xFFF +#define V_L2T_W_VLAN(x) ((x) << S_L2T_W_VLAN) +#define G_L2T_W_VLAN(x) (((x) >> S_L2T_W_VLAN) & M_L2T_W_VLAN) + +#define S_L2T_W_IFF 23 +#define M_L2T_W_IFF 0xF +#define V_L2T_W_IFF(x) ((x) << S_L2T_W_IFF) +#define G_L2T_W_IFF(x) (((x) >> S_L2T_W_IFF) & M_L2T_W_IFF) + +#define S_L2T_W_PRIO 27 +#define M_L2T_W_PRIO 0x7 +#define V_L2T_W_PRIO(x) ((x) << S_L2T_W_PRIO) +#define G_L2T_W_PRIO(x) (((x) >> S_L2T_W_PRIO) & M_L2T_W_PRIO) + +struct cpl_l2t_write_rpl { + RSS_HDR union opcode_tid ot; + __u8 status; + __u8 rsvd[3]; +}; + +struct cpl_l2t_read_req { + WR_HDR; + union opcode_tid ot; + __be16 rsvd; + __be16 l2t_idx; +}; + +struct cpl_l2t_read_rpl { + RSS_HDR union opcode_tid ot; + __be32 params; + __u8 rsvd[2]; + __u8 dst_mac[6]; +}; + +/* cpl_l2t_read_rpl.params fields */ +#define S_L2T_R_PRIO 0 +#define M_L2T_R_PRIO 0x7 +#define V_L2T_R_PRIO(x) ((x) << S_L2T_R_PRIO) +#define G_L2T_R_PRIO(x) (((x) >> S_L2T_R_PRIO) & M_L2T_R_PRIO) + +#define S_L2T_R_VLAN 8 +#define M_L2T_R_VLAN 0xFFF +#define V_L2T_R_VLAN(x) ((x) << S_L2T_R_VLAN) +#define G_L2T_R_VLAN(x) (((x) >> S_L2T_R_VLAN) & M_L2T_R_VLAN) + +#define S_L2T_R_IFF 20 +#define M_L2T_R_IFF 0xF +#define V_L2T_R_IFF(x) ((x) << S_L2T_R_IFF) +#define G_L2T_R_IFF(x) (((x) >> S_L2T_R_IFF) & M_L2T_R_IFF) + +#define S_L2T_STATUS 24 +#define M_L2T_STATUS 0xFF +#define V_L2T_STATUS(x) ((x) << S_L2T_STATUS) +#define G_L2T_STATUS(x) (((x) >> S_L2T_STATUS) & M_L2T_STATUS) + +struct cpl_smt_write_req { + WR_HDR; + union opcode_tid ot; + __u8 rsvd0; +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u8 mtu_idx:4; + __u8 iff:4; +#else + __u8 iff:4; + __u8 mtu_idx:4; +#endif + __be16 rsvd2; + __be16 rsvd3; + __u8 src_mac1[6]; + __be16 rsvd4; + __u8 src_mac0[6]; +}; + +struct cpl_smt_write_rpl { + RSS_HDR union opcode_tid ot; + __u8 status; + __u8 rsvd[3]; +}; + +struct cpl_smt_read_req { + WR_HDR; + union opcode_tid ot; + __u8 rsvd0; +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u8:4; + __u8 iff:4; +#else + __u8 iff:4; + __u8:4; +#endif + __be16 rsvd2; +}; + +struct cpl_smt_read_rpl { + RSS_HDR union opcode_tid ot; + __u8 status; +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u8 mtu_idx:4; + __u8:4; +#else + __u8:4; + __u8 mtu_idx:4; +#endif + __be16 rsvd2; + __be16 rsvd3; + __u8 src_mac1[6]; + __be16 rsvd4; + __u8 src_mac0[6]; +}; + +struct cpl_rte_delete_req { + WR_HDR; + union opcode_tid ot; + __be32 params; +}; + +/* { cpl_rte_delete_req, cpl_rte_read_req }.params fields */ +#define S_RTE_REQ_LUT_IX 8 +#define M_RTE_REQ_LUT_IX 0x7FF +#define V_RTE_REQ_LUT_IX(x) ((x) << S_RTE_REQ_LUT_IX) +#define G_RTE_REQ_LUT_IX(x) (((x) >> S_RTE_REQ_LUT_IX) & M_RTE_REQ_LUT_IX) + +#define S_RTE_REQ_LUT_BASE 19 +#define M_RTE_REQ_LUT_BASE 0x7FF +#define V_RTE_REQ_LUT_BASE(x) ((x) << S_RTE_REQ_LUT_BASE) +#define G_RTE_REQ_LUT_BASE(x) (((x) >> S_RTE_REQ_LUT_BASE) & M_RTE_REQ_LUT_BASE) + +#define S_RTE_READ_REQ_SELECT 31 +#define V_RTE_READ_REQ_SELECT(x) ((x) << S_RTE_READ_REQ_SELECT) +#define F_RTE_READ_REQ_SELECT V_RTE_READ_REQ_SELECT(1U) + +struct cpl_rte_delete_rpl { + RSS_HDR union opcode_tid ot; + __u8 status; + __u8 rsvd[3]; +}; + +struct cpl_rte_write_req { + WR_HDR; + union opcode_tid ot; +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u8:6; + __u8 write_tcam:1; + __u8 write_l2t_lut:1; +#else + __u8 write_l2t_lut:1; + __u8 write_tcam:1; + __u8:6; +#endif + __u8 rsvd[3]; + __be32 lut_params; + __be16 rsvd2; + __be16 l2t_idx; + __be32 netmask; + __be32 faddr; +}; + +/* cpl_rte_write_req.lut_params fields */ +#define S_RTE_WRITE_REQ_LUT_IX 10 +#define M_RTE_WRITE_REQ_LUT_IX 0x7FF +#define V_RTE_WRITE_REQ_LUT_IX(x) ((x) << S_RTE_WRITE_REQ_LUT_IX) +#define G_RTE_WRITE_REQ_LUT_IX(x) (((x) >> S_RTE_WRITE_REQ_LUT_IX) & M_RTE_WRITE_REQ_LUT_IX) + +#define S_RTE_WRITE_REQ_LUT_BASE 21 +#define M_RTE_WRITE_REQ_LUT_BASE 0x7FF +#define V_RTE_WRITE_REQ_LUT_BASE(x) ((x) << S_RTE_WRITE_REQ_LUT_BASE) +#define G_RTE_WRITE_REQ_LUT_BASE(x) (((x) >> S_RTE_WRITE_REQ_LUT_BASE) & M_RTE_WRITE_REQ_LUT_BASE) + +struct cpl_rte_write_rpl { + RSS_HDR union opcode_tid ot; + __u8 status; + __u8 rsvd[3]; +}; + +struct cpl_rte_read_req { + WR_HDR; + union opcode_tid ot; + __be32 params; +}; + +struct cpl_rte_read_rpl { + RSS_HDR union opcode_tid ot; + __u8 status; + __u8 rsvd0; + __be16 l2t_idx; +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u8:7; + __u8 select:1; +#else + __u8 select:1; + __u8:7; +#endif + __u8 rsvd2[3]; + __be32 addr; +}; + +struct cpl_tid_release { + WR_HDR; + union opcode_tid ot; + __be32 rsvd; +}; + +struct cpl_barrier { + WR_HDR; + __u8 opcode; + __u8 rsvd[7]; +}; + +struct cpl_rdma_read_req { + __u8 opcode; + __u8 rsvd[15]; +}; + +struct cpl_rdma_terminate { +#ifdef CHELSIO_FW + __u8 opcode; + __u8 rsvd[2]; +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u8 rspq:3; + __u8:5; +#else + __u8:5; + __u8 rspq:3; +#endif + __be32 tid_len; +#endif + __be32 msn; + __be32 mo; + __u8 data[0]; +}; + +/* cpl_rdma_terminate.tid_len fields */ +#define S_FLIT_CNT 0 +#define M_FLIT_CNT 0xFF +#define V_FLIT_CNT(x) ((x) << S_FLIT_CNT) +#define G_FLIT_CNT(x) (((x) >> S_FLIT_CNT) & M_FLIT_CNT) + +#define S_TERM_TID 8 +#define M_TERM_TID 0xFFFFF +#define V_TERM_TID(x) ((x) << S_TERM_TID) +#define G_TERM_TID(x) (((x) >> S_TERM_TID) & M_TERM_TID) + +/* ULP_TX opcodes */ +enum { ULP_MEM_READ = 2, ULP_MEM_WRITE = 3, ULP_TXPKT = 4 }; + +#define S_ULPTX_CMD 28 +#define M_ULPTX_CMD 0xF +#define V_ULPTX_CMD(x) ((x) << S_ULPTX_CMD) + +#define S_ULPTX_NFLITS 0 +#define M_ULPTX_NFLITS 0xFF +#define V_ULPTX_NFLITS(x) ((x) << S_ULPTX_NFLITS) + +struct ulp_mem_io { + WR_HDR; + __be32 cmd_lock_addr; + __be32 len; +}; + +/* ulp_mem_io.cmd_lock_addr fields */ +#define S_ULP_MEMIO_ADDR 0 +#define M_ULP_MEMIO_ADDR 0x7FFFFFF +#define V_ULP_MEMIO_ADDR(x) ((x) << S_ULP_MEMIO_ADDR) +#define S_ULP_MEMIO_LOCK 27 +#define V_ULP_MEMIO_LOCK(x) ((x) << S_ULP_MEMIO_LOCK) +#define F_ULP_MEMIO_LOCK V_ULP_MEMIO_LOCK(1U) + +/* ulp_mem_io.len fields */ +#define S_ULP_MEMIO_DATA_LEN 28 +#define M_ULP_MEMIO_DATA_LEN 0xF +#define V_ULP_MEMIO_DATA_LEN(x) ((x) << S_ULP_MEMIO_DATA_LEN) + +#endif /* T3_CPL_H */ diff --git a/drivers/net/ethernet/chelsio/cxgb3/t3_hw.c b/drivers/net/ethernet/chelsio/cxgb3/t3_hw.c new file mode 100644 index 0000000..c74a898 --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb3/t3_hw.c @@ -0,0 +1,3777 @@ +/* + * Copyright (c) 2003-2008 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "common.h" +#include "regs.h" +#include "sge_defs.h" +#include "firmware_exports.h" + +static void t3_port_intr_clear(struct adapter *adapter, int idx); + +/** + * t3_wait_op_done_val - wait until an operation is completed + * @adapter: the adapter performing the operation + * @reg: the register to check for completion + * @mask: a single-bit field within @reg that indicates completion + * @polarity: the value of the field when the operation is completed + * @attempts: number of check iterations + * @delay: delay in usecs between iterations + * @valp: where to store the value of the register at completion time + * + * Wait until an operation is completed by checking a bit in a register + * up to @attempts times. If @valp is not NULL the value of the register + * at the time it indicated completion is stored there. Returns 0 if the + * operation completes and -EAGAIN otherwise. + */ + +int t3_wait_op_done_val(struct adapter *adapter, int reg, u32 mask, + int polarity, int attempts, int delay, u32 *valp) +{ + while (1) { + u32 val = t3_read_reg(adapter, reg); + + if (!!(val & mask) == polarity) { + if (valp) + *valp = val; + return 0; + } + if (--attempts == 0) + return -EAGAIN; + if (delay) + udelay(delay); + } +} + +/** + * t3_write_regs - write a bunch of registers + * @adapter: the adapter to program + * @p: an array of register address/register value pairs + * @n: the number of address/value pairs + * @offset: register address offset + * + * Takes an array of register address/register value pairs and writes each + * value to the corresponding register. Register addresses are adjusted + * by the supplied offset. + */ +void t3_write_regs(struct adapter *adapter, const struct addr_val_pair *p, + int n, unsigned int offset) +{ + while (n--) { + t3_write_reg(adapter, p->reg_addr + offset, p->val); + p++; + } +} + +/** + * t3_set_reg_field - set a register field to a value + * @adapter: the adapter to program + * @addr: the register address + * @mask: specifies the portion of the register to modify + * @val: the new value for the register field + * + * Sets a register field specified by the supplied mask to the + * given value. + */ +void t3_set_reg_field(struct adapter *adapter, unsigned int addr, u32 mask, + u32 val) +{ + u32 v = t3_read_reg(adapter, addr) & ~mask; + + t3_write_reg(adapter, addr, v | val); + t3_read_reg(adapter, addr); /* flush */ +} + +/** + * t3_read_indirect - read indirectly addressed registers + * @adap: the adapter + * @addr_reg: register holding the indirect address + * @data_reg: register holding the value of the indirect register + * @vals: where the read register values are stored + * @start_idx: index of first indirect register to read + * @nregs: how many indirect registers to read + * + * Reads registers that are accessed indirectly through an address/data + * register pair. + */ +static void t3_read_indirect(struct adapter *adap, unsigned int addr_reg, + unsigned int data_reg, u32 *vals, + unsigned int nregs, unsigned int start_idx) +{ + while (nregs--) { + t3_write_reg(adap, addr_reg, start_idx); + *vals++ = t3_read_reg(adap, data_reg); + start_idx++; + } +} + +/** + * t3_mc7_bd_read - read from MC7 through backdoor accesses + * @mc7: identifies MC7 to read from + * @start: index of first 64-bit word to read + * @n: number of 64-bit words to read + * @buf: where to store the read result + * + * Read n 64-bit words from MC7 starting at word start, using backdoor + * accesses. + */ +int t3_mc7_bd_read(struct mc7 *mc7, unsigned int start, unsigned int n, + u64 *buf) +{ + static const int shift[] = { 0, 0, 16, 24 }; + static const int step[] = { 0, 32, 16, 8 }; + + unsigned int size64 = mc7->size / 8; /* # of 64-bit words */ + struct adapter *adap = mc7->adapter; + + if (start >= size64 || start + n > size64) + return -EINVAL; + + start *= (8 << mc7->width); + while (n--) { + int i; + u64 val64 = 0; + + for (i = (1 << mc7->width) - 1; i >= 0; --i) { + int attempts = 10; + u32 val; + + t3_write_reg(adap, mc7->offset + A_MC7_BD_ADDR, start); + t3_write_reg(adap, mc7->offset + A_MC7_BD_OP, 0); + val = t3_read_reg(adap, mc7->offset + A_MC7_BD_OP); + while ((val & F_BUSY) && attempts--) + val = t3_read_reg(adap, + mc7->offset + A_MC7_BD_OP); + if (val & F_BUSY) + return -EIO; + + val = t3_read_reg(adap, mc7->offset + A_MC7_BD_DATA1); + if (mc7->width == 0) { + val64 = t3_read_reg(adap, + mc7->offset + + A_MC7_BD_DATA0); + val64 |= (u64) val << 32; + } else { + if (mc7->width > 1) + val >>= shift[mc7->width]; + val64 |= (u64) val << (step[mc7->width] * i); + } + start += 8; + } + *buf++ = val64; + } + return 0; +} + +/* + * Initialize MI1. + */ +static void mi1_init(struct adapter *adap, const struct adapter_info *ai) +{ + u32 clkdiv = adap->params.vpd.cclk / (2 * adap->params.vpd.mdc) - 1; + u32 val = F_PREEN | V_CLKDIV(clkdiv); + + t3_write_reg(adap, A_MI1_CFG, val); +} + +#define MDIO_ATTEMPTS 20 + +/* + * MI1 read/write operations for clause 22 PHYs. + */ +static int t3_mi1_read(struct net_device *dev, int phy_addr, int mmd_addr, + u16 reg_addr) +{ + struct port_info *pi = netdev_priv(dev); + struct adapter *adapter = pi->adapter; + int ret; + u32 addr = V_REGADDR(reg_addr) | V_PHYADDR(phy_addr); + + mutex_lock(&adapter->mdio_lock); + t3_set_reg_field(adapter, A_MI1_CFG, V_ST(M_ST), V_ST(1)); + t3_write_reg(adapter, A_MI1_ADDR, addr); + t3_write_reg(adapter, A_MI1_OP, V_MDI_OP(2)); + ret = t3_wait_op_done(adapter, A_MI1_OP, F_BUSY, 0, MDIO_ATTEMPTS, 10); + if (!ret) + ret = t3_read_reg(adapter, A_MI1_DATA); + mutex_unlock(&adapter->mdio_lock); + return ret; +} + +static int t3_mi1_write(struct net_device *dev, int phy_addr, int mmd_addr, + u16 reg_addr, u16 val) +{ + struct port_info *pi = netdev_priv(dev); + struct adapter *adapter = pi->adapter; + int ret; + u32 addr = V_REGADDR(reg_addr) | V_PHYADDR(phy_addr); + + mutex_lock(&adapter->mdio_lock); + t3_set_reg_field(adapter, A_MI1_CFG, V_ST(M_ST), V_ST(1)); + t3_write_reg(adapter, A_MI1_ADDR, addr); + t3_write_reg(adapter, A_MI1_DATA, val); + t3_write_reg(adapter, A_MI1_OP, V_MDI_OP(1)); + ret = t3_wait_op_done(adapter, A_MI1_OP, F_BUSY, 0, MDIO_ATTEMPTS, 10); + mutex_unlock(&adapter->mdio_lock); + return ret; +} + +static const struct mdio_ops mi1_mdio_ops = { + .read = t3_mi1_read, + .write = t3_mi1_write, + .mode_support = MDIO_SUPPORTS_C22 +}; + +/* + * Performs the address cycle for clause 45 PHYs. + * Must be called with the MDIO_LOCK held. + */ +static int mi1_wr_addr(struct adapter *adapter, int phy_addr, int mmd_addr, + int reg_addr) +{ + u32 addr = V_REGADDR(mmd_addr) | V_PHYADDR(phy_addr); + + t3_set_reg_field(adapter, A_MI1_CFG, V_ST(M_ST), 0); + t3_write_reg(adapter, A_MI1_ADDR, addr); + t3_write_reg(adapter, A_MI1_DATA, reg_addr); + t3_write_reg(adapter, A_MI1_OP, V_MDI_OP(0)); + return t3_wait_op_done(adapter, A_MI1_OP, F_BUSY, 0, + MDIO_ATTEMPTS, 10); +} + +/* + * MI1 read/write operations for indirect-addressed PHYs. + */ +static int mi1_ext_read(struct net_device *dev, int phy_addr, int mmd_addr, + u16 reg_addr) +{ + struct port_info *pi = netdev_priv(dev); + struct adapter *adapter = pi->adapter; + int ret; + + mutex_lock(&adapter->mdio_lock); + ret = mi1_wr_addr(adapter, phy_addr, mmd_addr, reg_addr); + if (!ret) { + t3_write_reg(adapter, A_MI1_OP, V_MDI_OP(3)); + ret = t3_wait_op_done(adapter, A_MI1_OP, F_BUSY, 0, + MDIO_ATTEMPTS, 10); + if (!ret) + ret = t3_read_reg(adapter, A_MI1_DATA); + } + mutex_unlock(&adapter->mdio_lock); + return ret; +} + +static int mi1_ext_write(struct net_device *dev, int phy_addr, int mmd_addr, + u16 reg_addr, u16 val) +{ + struct port_info *pi = netdev_priv(dev); + struct adapter *adapter = pi->adapter; + int ret; + + mutex_lock(&adapter->mdio_lock); + ret = mi1_wr_addr(adapter, phy_addr, mmd_addr, reg_addr); + if (!ret) { + t3_write_reg(adapter, A_MI1_DATA, val); + t3_write_reg(adapter, A_MI1_OP, V_MDI_OP(1)); + ret = t3_wait_op_done(adapter, A_MI1_OP, F_BUSY, 0, + MDIO_ATTEMPTS, 10); + } + mutex_unlock(&adapter->mdio_lock); + return ret; +} + +static const struct mdio_ops mi1_mdio_ext_ops = { + .read = mi1_ext_read, + .write = mi1_ext_write, + .mode_support = MDIO_SUPPORTS_C45 | MDIO_EMULATE_C22 +}; + +/** + * t3_mdio_change_bits - modify the value of a PHY register + * @phy: the PHY to operate on + * @mmd: the device address + * @reg: the register address + * @clear: what part of the register value to mask off + * @set: what part of the register value to set + * + * Changes the value of a PHY register by applying a mask to its current + * value and ORing the result with a new value. + */ +int t3_mdio_change_bits(struct cphy *phy, int mmd, int reg, unsigned int clear, + unsigned int set) +{ + int ret; + unsigned int val; + + ret = t3_mdio_read(phy, mmd, reg, &val); + if (!ret) { + val &= ~clear; + ret = t3_mdio_write(phy, mmd, reg, val | set); + } + return ret; +} + +/** + * t3_phy_reset - reset a PHY block + * @phy: the PHY to operate on + * @mmd: the device address of the PHY block to reset + * @wait: how long to wait for the reset to complete in 1ms increments + * + * Resets a PHY block and optionally waits for the reset to complete. + * @mmd should be 0 for 10/100/1000 PHYs and the device address to reset + * for 10G PHYs. + */ +int t3_phy_reset(struct cphy *phy, int mmd, int wait) +{ + int err; + unsigned int ctl; + + err = t3_mdio_change_bits(phy, mmd, MDIO_CTRL1, MDIO_CTRL1_LPOWER, + MDIO_CTRL1_RESET); + if (err || !wait) + return err; + + do { + err = t3_mdio_read(phy, mmd, MDIO_CTRL1, &ctl); + if (err) + return err; + ctl &= MDIO_CTRL1_RESET; + if (ctl) + msleep(1); + } while (ctl && --wait); + + return ctl ? -1 : 0; +} + +/** + * t3_phy_advertise - set the PHY advertisement registers for autoneg + * @phy: the PHY to operate on + * @advert: bitmap of capabilities the PHY should advertise + * + * Sets a 10/100/1000 PHY's advertisement registers to advertise the + * requested capabilities. + */ +int t3_phy_advertise(struct cphy *phy, unsigned int advert) +{ + int err; + unsigned int val = 0; + + err = t3_mdio_read(phy, MDIO_DEVAD_NONE, MII_CTRL1000, &val); + if (err) + return err; + + val &= ~(ADVERTISE_1000HALF | ADVERTISE_1000FULL); + if (advert & ADVERTISED_1000baseT_Half) + val |= ADVERTISE_1000HALF; + if (advert & ADVERTISED_1000baseT_Full) + val |= ADVERTISE_1000FULL; + + err = t3_mdio_write(phy, MDIO_DEVAD_NONE, MII_CTRL1000, val); + if (err) + return err; + + val = 1; + if (advert & ADVERTISED_10baseT_Half) + val |= ADVERTISE_10HALF; + if (advert & ADVERTISED_10baseT_Full) + val |= ADVERTISE_10FULL; + if (advert & ADVERTISED_100baseT_Half) + val |= ADVERTISE_100HALF; + if (advert & ADVERTISED_100baseT_Full) + val |= ADVERTISE_100FULL; + if (advert & ADVERTISED_Pause) + val |= ADVERTISE_PAUSE_CAP; + if (advert & ADVERTISED_Asym_Pause) + val |= ADVERTISE_PAUSE_ASYM; + return t3_mdio_write(phy, MDIO_DEVAD_NONE, MII_ADVERTISE, val); +} + +/** + * t3_phy_advertise_fiber - set fiber PHY advertisement register + * @phy: the PHY to operate on + * @advert: bitmap of capabilities the PHY should advertise + * + * Sets a fiber PHY's advertisement register to advertise the + * requested capabilities. + */ +int t3_phy_advertise_fiber(struct cphy *phy, unsigned int advert) +{ + unsigned int val = 0; + + if (advert & ADVERTISED_1000baseT_Half) + val |= ADVERTISE_1000XHALF; + if (advert & ADVERTISED_1000baseT_Full) + val |= ADVERTISE_1000XFULL; + if (advert & ADVERTISED_Pause) + val |= ADVERTISE_1000XPAUSE; + if (advert & ADVERTISED_Asym_Pause) + val |= ADVERTISE_1000XPSE_ASYM; + return t3_mdio_write(phy, MDIO_DEVAD_NONE, MII_ADVERTISE, val); +} + +/** + * t3_set_phy_speed_duplex - force PHY speed and duplex + * @phy: the PHY to operate on + * @speed: requested PHY speed + * @duplex: requested PHY duplex + * + * Force a 10/100/1000 PHY's speed and duplex. This also disables + * auto-negotiation except for GigE, where auto-negotiation is mandatory. + */ +int t3_set_phy_speed_duplex(struct cphy *phy, int speed, int duplex) +{ + int err; + unsigned int ctl; + + err = t3_mdio_read(phy, MDIO_DEVAD_NONE, MII_BMCR, &ctl); + if (err) + return err; + + if (speed >= 0) { + ctl &= ~(BMCR_SPEED100 | BMCR_SPEED1000 | BMCR_ANENABLE); + if (speed == SPEED_100) + ctl |= BMCR_SPEED100; + else if (speed == SPEED_1000) + ctl |= BMCR_SPEED1000; + } + if (duplex >= 0) { + ctl &= ~(BMCR_FULLDPLX | BMCR_ANENABLE); + if (duplex == DUPLEX_FULL) + ctl |= BMCR_FULLDPLX; + } + if (ctl & BMCR_SPEED1000) /* auto-negotiation required for GigE */ + ctl |= BMCR_ANENABLE; + return t3_mdio_write(phy, MDIO_DEVAD_NONE, MII_BMCR, ctl); +} + +int t3_phy_lasi_intr_enable(struct cphy *phy) +{ + return t3_mdio_write(phy, MDIO_MMD_PMAPMD, MDIO_PMA_LASI_CTRL, + MDIO_PMA_LASI_LSALARM); +} + +int t3_phy_lasi_intr_disable(struct cphy *phy) +{ + return t3_mdio_write(phy, MDIO_MMD_PMAPMD, MDIO_PMA_LASI_CTRL, 0); +} + +int t3_phy_lasi_intr_clear(struct cphy *phy) +{ + u32 val; + + return t3_mdio_read(phy, MDIO_MMD_PMAPMD, MDIO_PMA_LASI_STAT, &val); +} + +int t3_phy_lasi_intr_handler(struct cphy *phy) +{ + unsigned int status; + int err = t3_mdio_read(phy, MDIO_MMD_PMAPMD, MDIO_PMA_LASI_STAT, + &status); + + if (err) + return err; + return (status & MDIO_PMA_LASI_LSALARM) ? cphy_cause_link_change : 0; +} + +static const struct adapter_info t3_adap_info[] = { + {1, 1, 0, + F_GPIO2_OEN | F_GPIO4_OEN | + F_GPIO2_OUT_VAL | F_GPIO4_OUT_VAL, { S_GPIO3, S_GPIO5 }, 0, + &mi1_mdio_ops, "Chelsio PE9000"}, + {1, 1, 0, + F_GPIO2_OEN | F_GPIO4_OEN | + F_GPIO2_OUT_VAL | F_GPIO4_OUT_VAL, { S_GPIO3, S_GPIO5 }, 0, + &mi1_mdio_ops, "Chelsio T302"}, + {1, 0, 0, + F_GPIO1_OEN | F_GPIO6_OEN | F_GPIO7_OEN | F_GPIO10_OEN | + F_GPIO11_OEN | F_GPIO1_OUT_VAL | F_GPIO6_OUT_VAL | F_GPIO10_OUT_VAL, + { 0 }, SUPPORTED_10000baseT_Full | SUPPORTED_AUI, + &mi1_mdio_ext_ops, "Chelsio T310"}, + {1, 1, 0, + F_GPIO1_OEN | F_GPIO2_OEN | F_GPIO4_OEN | F_GPIO5_OEN | F_GPIO6_OEN | + F_GPIO7_OEN | F_GPIO10_OEN | F_GPIO11_OEN | F_GPIO1_OUT_VAL | + F_GPIO5_OUT_VAL | F_GPIO6_OUT_VAL | F_GPIO10_OUT_VAL, + { S_GPIO9, S_GPIO3 }, SUPPORTED_10000baseT_Full | SUPPORTED_AUI, + &mi1_mdio_ext_ops, "Chelsio T320"}, + {}, + {}, + {1, 0, 0, + F_GPIO1_OEN | F_GPIO2_OEN | F_GPIO4_OEN | F_GPIO6_OEN | F_GPIO7_OEN | + F_GPIO10_OEN | F_GPIO1_OUT_VAL | F_GPIO6_OUT_VAL | F_GPIO10_OUT_VAL, + { S_GPIO9 }, SUPPORTED_10000baseT_Full | SUPPORTED_AUI, + &mi1_mdio_ext_ops, "Chelsio T310" }, + {1, 0, 0, + F_GPIO1_OEN | F_GPIO6_OEN | F_GPIO7_OEN | + F_GPIO1_OUT_VAL | F_GPIO6_OUT_VAL, + { S_GPIO9 }, SUPPORTED_10000baseT_Full | SUPPORTED_AUI, + &mi1_mdio_ext_ops, "Chelsio N320E-G2" }, +}; + +/* + * Return the adapter_info structure with a given index. Out-of-range indices + * return NULL. + */ +const struct adapter_info *t3_get_adapter_info(unsigned int id) +{ + return id < ARRAY_SIZE(t3_adap_info) ? &t3_adap_info[id] : NULL; +} + +struct port_type_info { + int (*phy_prep)(struct cphy *phy, struct adapter *adapter, + int phy_addr, const struct mdio_ops *ops); +}; + +static const struct port_type_info port_types[] = { + { NULL }, + { t3_ael1002_phy_prep }, + { t3_vsc8211_phy_prep }, + { NULL}, + { t3_xaui_direct_phy_prep }, + { t3_ael2005_phy_prep }, + { t3_qt2045_phy_prep }, + { t3_ael1006_phy_prep }, + { NULL }, + { t3_aq100x_phy_prep }, + { t3_ael2020_phy_prep }, +}; + +#define VPD_ENTRY(name, len) \ + u8 name##_kword[2]; u8 name##_len; u8 name##_data[len] + +/* + * Partial EEPROM Vital Product Data structure. Includes only the ID and + * VPD-R sections. + */ +struct t3_vpd { + u8 id_tag; + u8 id_len[2]; + u8 id_data[16]; + u8 vpdr_tag; + u8 vpdr_len[2]; + VPD_ENTRY(pn, 16); /* part number */ + VPD_ENTRY(ec, 16); /* EC level */ + VPD_ENTRY(sn, SERNUM_LEN); /* serial number */ + VPD_ENTRY(na, 12); /* MAC address base */ + VPD_ENTRY(cclk, 6); /* core clock */ + VPD_ENTRY(mclk, 6); /* mem clock */ + VPD_ENTRY(uclk, 6); /* uP clk */ + VPD_ENTRY(mdc, 6); /* MDIO clk */ + VPD_ENTRY(mt, 2); /* mem timing */ + VPD_ENTRY(xaui0cfg, 6); /* XAUI0 config */ + VPD_ENTRY(xaui1cfg, 6); /* XAUI1 config */ + VPD_ENTRY(port0, 2); /* PHY0 complex */ + VPD_ENTRY(port1, 2); /* PHY1 complex */ + VPD_ENTRY(port2, 2); /* PHY2 complex */ + VPD_ENTRY(port3, 2); /* PHY3 complex */ + VPD_ENTRY(rv, 1); /* csum */ + u32 pad; /* for multiple-of-4 sizing and alignment */ +}; + +#define EEPROM_MAX_POLL 40 +#define EEPROM_STAT_ADDR 0x4000 +#define VPD_BASE 0xc00 + +/** + * t3_seeprom_read - read a VPD EEPROM location + * @adapter: adapter to read + * @addr: EEPROM address + * @data: where to store the read data + * + * Read a 32-bit word from a location in VPD EEPROM using the card's PCI + * VPD ROM capability. A zero is written to the flag bit when the + * address is written to the control register. The hardware device will + * set the flag to 1 when 4 bytes have been read into the data register. + */ +int t3_seeprom_read(struct adapter *adapter, u32 addr, __le32 *data) +{ + u16 val; + int attempts = EEPROM_MAX_POLL; + u32 v; + unsigned int base = adapter->params.pci.vpd_cap_addr; + + if ((addr >= EEPROMSIZE && addr != EEPROM_STAT_ADDR) || (addr & 3)) + return -EINVAL; + + pci_write_config_word(adapter->pdev, base + PCI_VPD_ADDR, addr); + do { + udelay(10); + pci_read_config_word(adapter->pdev, base + PCI_VPD_ADDR, &val); + } while (!(val & PCI_VPD_ADDR_F) && --attempts); + + if (!(val & PCI_VPD_ADDR_F)) { + CH_ERR(adapter, "reading EEPROM address 0x%x failed\n", addr); + return -EIO; + } + pci_read_config_dword(adapter->pdev, base + PCI_VPD_DATA, &v); + *data = cpu_to_le32(v); + return 0; +} + +/** + * t3_seeprom_write - write a VPD EEPROM location + * @adapter: adapter to write + * @addr: EEPROM address + * @data: value to write + * + * Write a 32-bit word to a location in VPD EEPROM using the card's PCI + * VPD ROM capability. + */ +int t3_seeprom_write(struct adapter *adapter, u32 addr, __le32 data) +{ + u16 val; + int attempts = EEPROM_MAX_POLL; + unsigned int base = adapter->params.pci.vpd_cap_addr; + + if ((addr >= EEPROMSIZE && addr != EEPROM_STAT_ADDR) || (addr & 3)) + return -EINVAL; + + pci_write_config_dword(adapter->pdev, base + PCI_VPD_DATA, + le32_to_cpu(data)); + pci_write_config_word(adapter->pdev,base + PCI_VPD_ADDR, + addr | PCI_VPD_ADDR_F); + do { + msleep(1); + pci_read_config_word(adapter->pdev, base + PCI_VPD_ADDR, &val); + } while ((val & PCI_VPD_ADDR_F) && --attempts); + + if (val & PCI_VPD_ADDR_F) { + CH_ERR(adapter, "write to EEPROM address 0x%x failed\n", addr); + return -EIO; + } + return 0; +} + +/** + * t3_seeprom_wp - enable/disable EEPROM write protection + * @adapter: the adapter + * @enable: 1 to enable write protection, 0 to disable it + * + * Enables or disables write protection on the serial EEPROM. + */ +int t3_seeprom_wp(struct adapter *adapter, int enable) +{ + return t3_seeprom_write(adapter, EEPROM_STAT_ADDR, enable ? 0xc : 0); +} + +/** + * get_vpd_params - read VPD parameters from VPD EEPROM + * @adapter: adapter to read + * @p: where to store the parameters + * + * Reads card parameters stored in VPD EEPROM. + */ +static int get_vpd_params(struct adapter *adapter, struct vpd_params *p) +{ + int i, addr, ret; + struct t3_vpd vpd; + + /* + * Card information is normally at VPD_BASE but some early cards had + * it at 0. + */ + ret = t3_seeprom_read(adapter, VPD_BASE, (__le32 *)&vpd); + if (ret) + return ret; + addr = vpd.id_tag == 0x82 ? VPD_BASE : 0; + + for (i = 0; i < sizeof(vpd); i += 4) { + ret = t3_seeprom_read(adapter, addr + i, + (__le32 *)((u8 *)&vpd + i)); + if (ret) + return ret; + } + + p->cclk = simple_strtoul(vpd.cclk_data, NULL, 10); + p->mclk = simple_strtoul(vpd.mclk_data, NULL, 10); + p->uclk = simple_strtoul(vpd.uclk_data, NULL, 10); + p->mdc = simple_strtoul(vpd.mdc_data, NULL, 10); + p->mem_timing = simple_strtoul(vpd.mt_data, NULL, 10); + memcpy(p->sn, vpd.sn_data, SERNUM_LEN); + + /* Old eeproms didn't have port information */ + if (adapter->params.rev == 0 && !vpd.port0_data[0]) { + p->port_type[0] = uses_xaui(adapter) ? 1 : 2; + p->port_type[1] = uses_xaui(adapter) ? 6 : 2; + } else { + p->port_type[0] = hex_to_bin(vpd.port0_data[0]); + p->port_type[1] = hex_to_bin(vpd.port1_data[0]); + p->xauicfg[0] = simple_strtoul(vpd.xaui0cfg_data, NULL, 16); + p->xauicfg[1] = simple_strtoul(vpd.xaui1cfg_data, NULL, 16); + } + + for (i = 0; i < 6; i++) + p->eth_base[i] = hex_to_bin(vpd.na_data[2 * i]) * 16 + + hex_to_bin(vpd.na_data[2 * i + 1]); + return 0; +} + +/* serial flash and firmware constants */ +enum { + SF_ATTEMPTS = 5, /* max retries for SF1 operations */ + SF_SEC_SIZE = 64 * 1024, /* serial flash sector size */ + SF_SIZE = SF_SEC_SIZE * 8, /* serial flash size */ + + /* flash command opcodes */ + SF_PROG_PAGE = 2, /* program page */ + SF_WR_DISABLE = 4, /* disable writes */ + SF_RD_STATUS = 5, /* read status register */ + SF_WR_ENABLE = 6, /* enable writes */ + SF_RD_DATA_FAST = 0xb, /* read flash */ + SF_ERASE_SECTOR = 0xd8, /* erase sector */ + + FW_FLASH_BOOT_ADDR = 0x70000, /* start address of FW in flash */ + FW_VERS_ADDR = 0x7fffc, /* flash address holding FW version */ + FW_MIN_SIZE = 8 /* at least version and csum */ +}; + +/** + * sf1_read - read data from the serial flash + * @adapter: the adapter + * @byte_cnt: number of bytes to read + * @cont: whether another operation will be chained + * @valp: where to store the read data + * + * Reads up to 4 bytes of data from the serial flash. The location of + * the read needs to be specified prior to calling this by issuing the + * appropriate commands to the serial flash. + */ +static int sf1_read(struct adapter *adapter, unsigned int byte_cnt, int cont, + u32 *valp) +{ + int ret; + + if (!byte_cnt || byte_cnt > 4) + return -EINVAL; + if (t3_read_reg(adapter, A_SF_OP) & F_BUSY) + return -EBUSY; + t3_write_reg(adapter, A_SF_OP, V_CONT(cont) | V_BYTECNT(byte_cnt - 1)); + ret = t3_wait_op_done(adapter, A_SF_OP, F_BUSY, 0, SF_ATTEMPTS, 10); + if (!ret) + *valp = t3_read_reg(adapter, A_SF_DATA); + return ret; +} + +/** + * sf1_write - write data to the serial flash + * @adapter: the adapter + * @byte_cnt: number of bytes to write + * @cont: whether another operation will be chained + * @val: value to write + * + * Writes up to 4 bytes of data to the serial flash. The location of + * the write needs to be specified prior to calling this by issuing the + * appropriate commands to the serial flash. + */ +static int sf1_write(struct adapter *adapter, unsigned int byte_cnt, int cont, + u32 val) +{ + if (!byte_cnt || byte_cnt > 4) + return -EINVAL; + if (t3_read_reg(adapter, A_SF_OP) & F_BUSY) + return -EBUSY; + t3_write_reg(adapter, A_SF_DATA, val); + t3_write_reg(adapter, A_SF_OP, + V_CONT(cont) | V_BYTECNT(byte_cnt - 1) | V_OP(1)); + return t3_wait_op_done(adapter, A_SF_OP, F_BUSY, 0, SF_ATTEMPTS, 10); +} + +/** + * flash_wait_op - wait for a flash operation to complete + * @adapter: the adapter + * @attempts: max number of polls of the status register + * @delay: delay between polls in ms + * + * Wait for a flash operation to complete by polling the status register. + */ +static int flash_wait_op(struct adapter *adapter, int attempts, int delay) +{ + int ret; + u32 status; + + while (1) { + if ((ret = sf1_write(adapter, 1, 1, SF_RD_STATUS)) != 0 || + (ret = sf1_read(adapter, 1, 0, &status)) != 0) + return ret; + if (!(status & 1)) + return 0; + if (--attempts == 0) + return -EAGAIN; + if (delay) + msleep(delay); + } +} + +/** + * t3_read_flash - read words from serial flash + * @adapter: the adapter + * @addr: the start address for the read + * @nwords: how many 32-bit words to read + * @data: where to store the read data + * @byte_oriented: whether to store data as bytes or as words + * + * Read the specified number of 32-bit words from the serial flash. + * If @byte_oriented is set the read data is stored as a byte array + * (i.e., big-endian), otherwise as 32-bit words in the platform's + * natural endianess. + */ +static int t3_read_flash(struct adapter *adapter, unsigned int addr, + unsigned int nwords, u32 *data, int byte_oriented) +{ + int ret; + + if (addr + nwords * sizeof(u32) > SF_SIZE || (addr & 3)) + return -EINVAL; + + addr = swab32(addr) | SF_RD_DATA_FAST; + + if ((ret = sf1_write(adapter, 4, 1, addr)) != 0 || + (ret = sf1_read(adapter, 1, 1, data)) != 0) + return ret; + + for (; nwords; nwords--, data++) { + ret = sf1_read(adapter, 4, nwords > 1, data); + if (ret) + return ret; + if (byte_oriented) + *data = htonl(*data); + } + return 0; +} + +/** + * t3_write_flash - write up to a page of data to the serial flash + * @adapter: the adapter + * @addr: the start address to write + * @n: length of data to write + * @data: the data to write + * + * Writes up to a page of data (256 bytes) to the serial flash starting + * at the given address. + */ +static int t3_write_flash(struct adapter *adapter, unsigned int addr, + unsigned int n, const u8 *data) +{ + int ret; + u32 buf[64]; + unsigned int i, c, left, val, offset = addr & 0xff; + + if (addr + n > SF_SIZE || offset + n > 256) + return -EINVAL; + + val = swab32(addr) | SF_PROG_PAGE; + + if ((ret = sf1_write(adapter, 1, 0, SF_WR_ENABLE)) != 0 || + (ret = sf1_write(adapter, 4, 1, val)) != 0) + return ret; + + for (left = n; left; left -= c) { + c = min(left, 4U); + for (val = 0, i = 0; i < c; ++i) + val = (val << 8) + *data++; + + ret = sf1_write(adapter, c, c != left, val); + if (ret) + return ret; + } + if ((ret = flash_wait_op(adapter, 5, 1)) != 0) + return ret; + + /* Read the page to verify the write succeeded */ + ret = t3_read_flash(adapter, addr & ~0xff, ARRAY_SIZE(buf), buf, 1); + if (ret) + return ret; + + if (memcmp(data - n, (u8 *) buf + offset, n)) + return -EIO; + return 0; +} + +/** + * t3_get_tp_version - read the tp sram version + * @adapter: the adapter + * @vers: where to place the version + * + * Reads the protocol sram version from sram. + */ +int t3_get_tp_version(struct adapter *adapter, u32 *vers) +{ + int ret; + + /* Get version loaded in SRAM */ + t3_write_reg(adapter, A_TP_EMBED_OP_FIELD0, 0); + ret = t3_wait_op_done(adapter, A_TP_EMBED_OP_FIELD0, + 1, 1, 5, 1); + if (ret) + return ret; + + *vers = t3_read_reg(adapter, A_TP_EMBED_OP_FIELD1); + + return 0; +} + +/** + * t3_check_tpsram_version - read the tp sram version + * @adapter: the adapter + * + * Reads the protocol sram version from flash. + */ +int t3_check_tpsram_version(struct adapter *adapter) +{ + int ret; + u32 vers; + unsigned int major, minor; + + if (adapter->params.rev == T3_REV_A) + return 0; + + + ret = t3_get_tp_version(adapter, &vers); + if (ret) + return ret; + + major = G_TP_VERSION_MAJOR(vers); + minor = G_TP_VERSION_MINOR(vers); + + if (major == TP_VERSION_MAJOR && minor == TP_VERSION_MINOR) + return 0; + else { + CH_ERR(adapter, "found wrong TP version (%u.%u), " + "driver compiled for version %d.%d\n", major, minor, + TP_VERSION_MAJOR, TP_VERSION_MINOR); + } + return -EINVAL; +} + +/** + * t3_check_tpsram - check if provided protocol SRAM + * is compatible with this driver + * @adapter: the adapter + * @tp_sram: the firmware image to write + * @size: image size + * + * Checks if an adapter's tp sram is compatible with the driver. + * Returns 0 if the versions are compatible, a negative error otherwise. + */ +int t3_check_tpsram(struct adapter *adapter, const u8 *tp_sram, + unsigned int size) +{ + u32 csum; + unsigned int i; + const __be32 *p = (const __be32 *)tp_sram; + + /* Verify checksum */ + for (csum = 0, i = 0; i < size / sizeof(csum); i++) + csum += ntohl(p[i]); + if (csum != 0xffffffff) { + CH_ERR(adapter, "corrupted protocol SRAM image, checksum %u\n", + csum); + return -EINVAL; + } + + return 0; +} + +enum fw_version_type { + FW_VERSION_N3, + FW_VERSION_T3 +}; + +/** + * t3_get_fw_version - read the firmware version + * @adapter: the adapter + * @vers: where to place the version + * + * Reads the FW version from flash. + */ +int t3_get_fw_version(struct adapter *adapter, u32 *vers) +{ + return t3_read_flash(adapter, FW_VERS_ADDR, 1, vers, 0); +} + +/** + * t3_check_fw_version - check if the FW is compatible with this driver + * @adapter: the adapter + * + * Checks if an adapter's FW is compatible with the driver. Returns 0 + * if the versions are compatible, a negative error otherwise. + */ +int t3_check_fw_version(struct adapter *adapter) +{ + int ret; + u32 vers; + unsigned int type, major, minor; + + ret = t3_get_fw_version(adapter, &vers); + if (ret) + return ret; + + type = G_FW_VERSION_TYPE(vers); + major = G_FW_VERSION_MAJOR(vers); + minor = G_FW_VERSION_MINOR(vers); + + if (type == FW_VERSION_T3 && major == FW_VERSION_MAJOR && + minor == FW_VERSION_MINOR) + return 0; + else if (major != FW_VERSION_MAJOR || minor < FW_VERSION_MINOR) + CH_WARN(adapter, "found old FW minor version(%u.%u), " + "driver compiled for version %u.%u\n", major, minor, + FW_VERSION_MAJOR, FW_VERSION_MINOR); + else { + CH_WARN(adapter, "found newer FW version(%u.%u), " + "driver compiled for version %u.%u\n", major, minor, + FW_VERSION_MAJOR, FW_VERSION_MINOR); + return 0; + } + return -EINVAL; +} + +/** + * t3_flash_erase_sectors - erase a range of flash sectors + * @adapter: the adapter + * @start: the first sector to erase + * @end: the last sector to erase + * + * Erases the sectors in the given range. + */ +static int t3_flash_erase_sectors(struct adapter *adapter, int start, int end) +{ + while (start <= end) { + int ret; + + if ((ret = sf1_write(adapter, 1, 0, SF_WR_ENABLE)) != 0 || + (ret = sf1_write(adapter, 4, 0, + SF_ERASE_SECTOR | (start << 8))) != 0 || + (ret = flash_wait_op(adapter, 5, 500)) != 0) + return ret; + start++; + } + return 0; +} + +/** + * t3_load_fw - download firmware + * @adapter: the adapter + * @fw_data: the firmware image to write + * @size: image size + * + * Write the supplied firmware image to the card's serial flash. + * The FW image has the following sections: @size - 8 bytes of code and + * data, followed by 4 bytes of FW version, followed by the 32-bit + * 1's complement checksum of the whole image. + */ +int t3_load_fw(struct adapter *adapter, const u8 *fw_data, unsigned int size) +{ + u32 csum; + unsigned int i; + const __be32 *p = (const __be32 *)fw_data; + int ret, addr, fw_sector = FW_FLASH_BOOT_ADDR >> 16; + + if ((size & 3) || size < FW_MIN_SIZE) + return -EINVAL; + if (size > FW_VERS_ADDR + 8 - FW_FLASH_BOOT_ADDR) + return -EFBIG; + + for (csum = 0, i = 0; i < size / sizeof(csum); i++) + csum += ntohl(p[i]); + if (csum != 0xffffffff) { + CH_ERR(adapter, "corrupted firmware image, checksum %u\n", + csum); + return -EINVAL; + } + + ret = t3_flash_erase_sectors(adapter, fw_sector, fw_sector); + if (ret) + goto out; + + size -= 8; /* trim off version and checksum */ + for (addr = FW_FLASH_BOOT_ADDR; size;) { + unsigned int chunk_size = min(size, 256U); + + ret = t3_write_flash(adapter, addr, chunk_size, fw_data); + if (ret) + goto out; + + addr += chunk_size; + fw_data += chunk_size; + size -= chunk_size; + } + + ret = t3_write_flash(adapter, FW_VERS_ADDR, 4, fw_data); +out: + if (ret) + CH_ERR(adapter, "firmware download failed, error %d\n", ret); + return ret; +} + +#define CIM_CTL_BASE 0x2000 + +/** + * t3_cim_ctl_blk_read - read a block from CIM control region + * + * @adap: the adapter + * @addr: the start address within the CIM control region + * @n: number of words to read + * @valp: where to store the result + * + * Reads a block of 4-byte words from the CIM control region. + */ +int t3_cim_ctl_blk_read(struct adapter *adap, unsigned int addr, + unsigned int n, unsigned int *valp) +{ + int ret = 0; + + if (t3_read_reg(adap, A_CIM_HOST_ACC_CTRL) & F_HOSTBUSY) + return -EBUSY; + + for ( ; !ret && n--; addr += 4) { + t3_write_reg(adap, A_CIM_HOST_ACC_CTRL, CIM_CTL_BASE + addr); + ret = t3_wait_op_done(adap, A_CIM_HOST_ACC_CTRL, F_HOSTBUSY, + 0, 5, 2); + if (!ret) + *valp++ = t3_read_reg(adap, A_CIM_HOST_ACC_DATA); + } + return ret; +} + +static void t3_gate_rx_traffic(struct cmac *mac, u32 *rx_cfg, + u32 *rx_hash_high, u32 *rx_hash_low) +{ + /* stop Rx unicast traffic */ + t3_mac_disable_exact_filters(mac); + + /* stop broadcast, multicast, promiscuous mode traffic */ + *rx_cfg = t3_read_reg(mac->adapter, A_XGM_RX_CFG); + t3_set_reg_field(mac->adapter, A_XGM_RX_CFG, + F_ENHASHMCAST | F_DISBCAST | F_COPYALLFRAMES, + F_DISBCAST); + + *rx_hash_high = t3_read_reg(mac->adapter, A_XGM_RX_HASH_HIGH); + t3_write_reg(mac->adapter, A_XGM_RX_HASH_HIGH, 0); + + *rx_hash_low = t3_read_reg(mac->adapter, A_XGM_RX_HASH_LOW); + t3_write_reg(mac->adapter, A_XGM_RX_HASH_LOW, 0); + + /* Leave time to drain max RX fifo */ + msleep(1); +} + +static void t3_open_rx_traffic(struct cmac *mac, u32 rx_cfg, + u32 rx_hash_high, u32 rx_hash_low) +{ + t3_mac_enable_exact_filters(mac); + t3_set_reg_field(mac->adapter, A_XGM_RX_CFG, + F_ENHASHMCAST | F_DISBCAST | F_COPYALLFRAMES, + rx_cfg); + t3_write_reg(mac->adapter, A_XGM_RX_HASH_HIGH, rx_hash_high); + t3_write_reg(mac->adapter, A_XGM_RX_HASH_LOW, rx_hash_low); +} + +/** + * t3_link_changed - handle interface link changes + * @adapter: the adapter + * @port_id: the port index that changed link state + * + * Called when a port's link settings change to propagate the new values + * to the associated PHY and MAC. After performing the common tasks it + * invokes an OS-specific handler. + */ +void t3_link_changed(struct adapter *adapter, int port_id) +{ + int link_ok, speed, duplex, fc; + struct port_info *pi = adap2pinfo(adapter, port_id); + struct cphy *phy = &pi->phy; + struct cmac *mac = &pi->mac; + struct link_config *lc = &pi->link_config; + + phy->ops->get_link_status(phy, &link_ok, &speed, &duplex, &fc); + + if (!lc->link_ok && link_ok) { + u32 rx_cfg, rx_hash_high, rx_hash_low; + u32 status; + + t3_xgm_intr_enable(adapter, port_id); + t3_gate_rx_traffic(mac, &rx_cfg, &rx_hash_high, &rx_hash_low); + t3_write_reg(adapter, A_XGM_RX_CTRL + mac->offset, 0); + t3_mac_enable(mac, MAC_DIRECTION_RX); + + status = t3_read_reg(adapter, A_XGM_INT_STATUS + mac->offset); + if (status & F_LINKFAULTCHANGE) { + mac->stats.link_faults++; + pi->link_fault = 1; + } + t3_open_rx_traffic(mac, rx_cfg, rx_hash_high, rx_hash_low); + } + + if (lc->requested_fc & PAUSE_AUTONEG) + fc &= lc->requested_fc; + else + fc = lc->requested_fc & (PAUSE_RX | PAUSE_TX); + + if (link_ok == lc->link_ok && speed == lc->speed && + duplex == lc->duplex && fc == lc->fc) + return; /* nothing changed */ + + if (link_ok != lc->link_ok && adapter->params.rev > 0 && + uses_xaui(adapter)) { + if (link_ok) + t3b_pcs_reset(mac); + t3_write_reg(adapter, A_XGM_XAUI_ACT_CTRL + mac->offset, + link_ok ? F_TXACTENABLE | F_RXEN : 0); + } + lc->link_ok = link_ok; + lc->speed = speed < 0 ? SPEED_INVALID : speed; + lc->duplex = duplex < 0 ? DUPLEX_INVALID : duplex; + + if (link_ok && speed >= 0 && lc->autoneg == AUTONEG_ENABLE) { + /* Set MAC speed, duplex, and flow control to match PHY. */ + t3_mac_set_speed_duplex_fc(mac, speed, duplex, fc); + lc->fc = fc; + } + + t3_os_link_changed(adapter, port_id, link_ok && !pi->link_fault, + speed, duplex, fc); +} + +void t3_link_fault(struct adapter *adapter, int port_id) +{ + struct port_info *pi = adap2pinfo(adapter, port_id); + struct cmac *mac = &pi->mac; + struct cphy *phy = &pi->phy; + struct link_config *lc = &pi->link_config; + int link_ok, speed, duplex, fc, link_fault; + u32 rx_cfg, rx_hash_high, rx_hash_low; + + t3_gate_rx_traffic(mac, &rx_cfg, &rx_hash_high, &rx_hash_low); + + if (adapter->params.rev > 0 && uses_xaui(adapter)) + t3_write_reg(adapter, A_XGM_XAUI_ACT_CTRL + mac->offset, 0); + + t3_write_reg(adapter, A_XGM_RX_CTRL + mac->offset, 0); + t3_mac_enable(mac, MAC_DIRECTION_RX); + + t3_open_rx_traffic(mac, rx_cfg, rx_hash_high, rx_hash_low); + + link_fault = t3_read_reg(adapter, + A_XGM_INT_STATUS + mac->offset); + link_fault &= F_LINKFAULTCHANGE; + + link_ok = lc->link_ok; + speed = lc->speed; + duplex = lc->duplex; + fc = lc->fc; + + phy->ops->get_link_status(phy, &link_ok, &speed, &duplex, &fc); + + if (link_fault) { + lc->link_ok = 0; + lc->speed = SPEED_INVALID; + lc->duplex = DUPLEX_INVALID; + + t3_os_link_fault(adapter, port_id, 0); + + /* Account link faults only when the phy reports a link up */ + if (link_ok) + mac->stats.link_faults++; + } else { + if (link_ok) + t3_write_reg(adapter, A_XGM_XAUI_ACT_CTRL + mac->offset, + F_TXACTENABLE | F_RXEN); + + pi->link_fault = 0; + lc->link_ok = (unsigned char)link_ok; + lc->speed = speed < 0 ? SPEED_INVALID : speed; + lc->duplex = duplex < 0 ? DUPLEX_INVALID : duplex; + t3_os_link_fault(adapter, port_id, link_ok); + } +} + +/** + * t3_link_start - apply link configuration to MAC/PHY + * @phy: the PHY to setup + * @mac: the MAC to setup + * @lc: the requested link configuration + * + * Set up a port's MAC and PHY according to a desired link configuration. + * - If the PHY can auto-negotiate first decide what to advertise, then + * enable/disable auto-negotiation as desired, and reset. + * - If the PHY does not auto-negotiate just reset it. + * - If auto-negotiation is off set the MAC to the proper speed/duplex/FC, + * otherwise do it later based on the outcome of auto-negotiation. + */ +int t3_link_start(struct cphy *phy, struct cmac *mac, struct link_config *lc) +{ + unsigned int fc = lc->requested_fc & (PAUSE_RX | PAUSE_TX); + + lc->link_ok = 0; + if (lc->supported & SUPPORTED_Autoneg) { + lc->advertising &= ~(ADVERTISED_Asym_Pause | ADVERTISED_Pause); + if (fc) { + lc->advertising |= ADVERTISED_Asym_Pause; + if (fc & PAUSE_RX) + lc->advertising |= ADVERTISED_Pause; + } + phy->ops->advertise(phy, lc->advertising); + + if (lc->autoneg == AUTONEG_DISABLE) { + lc->speed = lc->requested_speed; + lc->duplex = lc->requested_duplex; + lc->fc = (unsigned char)fc; + t3_mac_set_speed_duplex_fc(mac, lc->speed, lc->duplex, + fc); + /* Also disables autoneg */ + phy->ops->set_speed_duplex(phy, lc->speed, lc->duplex); + } else + phy->ops->autoneg_enable(phy); + } else { + t3_mac_set_speed_duplex_fc(mac, -1, -1, fc); + lc->fc = (unsigned char)fc; + phy->ops->reset(phy, 0); + } + return 0; +} + +/** + * t3_set_vlan_accel - control HW VLAN extraction + * @adapter: the adapter + * @ports: bitmap of adapter ports to operate on + * @on: enable (1) or disable (0) HW VLAN extraction + * + * Enables or disables HW extraction of VLAN tags for the given port. + */ +void t3_set_vlan_accel(struct adapter *adapter, unsigned int ports, int on) +{ + t3_set_reg_field(adapter, A_TP_OUT_CONFIG, + ports << S_VLANEXTRACTIONENABLE, + on ? (ports << S_VLANEXTRACTIONENABLE) : 0); +} + +struct intr_info { + unsigned int mask; /* bits to check in interrupt status */ + const char *msg; /* message to print or NULL */ + short stat_idx; /* stat counter to increment or -1 */ + unsigned short fatal; /* whether the condition reported is fatal */ +}; + +/** + * t3_handle_intr_status - table driven interrupt handler + * @adapter: the adapter that generated the interrupt + * @reg: the interrupt status register to process + * @mask: a mask to apply to the interrupt status + * @acts: table of interrupt actions + * @stats: statistics counters tracking interrupt occurrences + * + * A table driven interrupt handler that applies a set of masks to an + * interrupt status word and performs the corresponding actions if the + * interrupts described by the mask have occurred. The actions include + * optionally printing a warning or alert message, and optionally + * incrementing a stat counter. The table is terminated by an entry + * specifying mask 0. Returns the number of fatal interrupt conditions. + */ +static int t3_handle_intr_status(struct adapter *adapter, unsigned int reg, + unsigned int mask, + const struct intr_info *acts, + unsigned long *stats) +{ + int fatal = 0; + unsigned int status = t3_read_reg(adapter, reg) & mask; + + for (; acts->mask; ++acts) { + if (!(status & acts->mask)) + continue; + if (acts->fatal) { + fatal++; + CH_ALERT(adapter, "%s (0x%x)\n", + acts->msg, status & acts->mask); + status &= ~acts->mask; + } else if (acts->msg) + CH_WARN(adapter, "%s (0x%x)\n", + acts->msg, status & acts->mask); + if (acts->stat_idx >= 0) + stats[acts->stat_idx]++; + } + if (status) /* clear processed interrupts */ + t3_write_reg(adapter, reg, status); + return fatal; +} + +#define SGE_INTR_MASK (F_RSPQDISABLED | \ + F_UC_REQ_FRAMINGERROR | F_R_REQ_FRAMINGERROR | \ + F_CPPARITYERROR | F_OCPARITYERROR | F_RCPARITYERROR | \ + F_IRPARITYERROR | V_ITPARITYERROR(M_ITPARITYERROR) | \ + V_FLPARITYERROR(M_FLPARITYERROR) | F_LODRBPARITYERROR | \ + F_HIDRBPARITYERROR | F_LORCQPARITYERROR | \ + F_HIRCQPARITYERROR | F_LOPRIORITYDBFULL | \ + F_HIPRIORITYDBFULL | F_LOPRIORITYDBEMPTY | \ + F_HIPRIORITYDBEMPTY | F_HIPIODRBDROPERR | \ + F_LOPIODRBDROPERR) +#define MC5_INTR_MASK (F_PARITYERR | F_ACTRGNFULL | F_UNKNOWNCMD | \ + F_REQQPARERR | F_DISPQPARERR | F_DELACTEMPTY | \ + F_NFASRCHFAIL) +#define MC7_INTR_MASK (F_AE | F_UE | F_CE | V_PE(M_PE)) +#define XGM_INTR_MASK (V_TXFIFO_PRTY_ERR(M_TXFIFO_PRTY_ERR) | \ + V_RXFIFO_PRTY_ERR(M_RXFIFO_PRTY_ERR) | \ + F_TXFIFO_UNDERRUN) +#define PCIX_INTR_MASK (F_MSTDETPARERR | F_SIGTARABT | F_RCVTARABT | \ + F_RCVMSTABT | F_SIGSYSERR | F_DETPARERR | \ + F_SPLCMPDIS | F_UNXSPLCMP | F_RCVSPLCMPERR | \ + F_DETCORECCERR | F_DETUNCECCERR | F_PIOPARERR | \ + V_WFPARERR(M_WFPARERR) | V_RFPARERR(M_RFPARERR) | \ + V_CFPARERR(M_CFPARERR) /* | V_MSIXPARERR(M_MSIXPARERR) */) +#define PCIE_INTR_MASK (F_UNXSPLCPLERRR | F_UNXSPLCPLERRC | F_PCIE_PIOPARERR |\ + F_PCIE_WFPARERR | F_PCIE_RFPARERR | F_PCIE_CFPARERR | \ + /* V_PCIE_MSIXPARERR(M_PCIE_MSIXPARERR) | */ \ + F_RETRYBUFPARERR | F_RETRYLUTPARERR | F_RXPARERR | \ + F_TXPARERR | V_BISTERR(M_BISTERR)) +#define ULPRX_INTR_MASK (F_PARERRDATA | F_PARERRPCMD | F_ARBPF1PERR | \ + F_ARBPF0PERR | F_ARBFPERR | F_PCMDMUXPERR | \ + F_DATASELFRAMEERR1 | F_DATASELFRAMEERR0) +#define ULPTX_INTR_MASK 0xfc +#define CPLSW_INTR_MASK (F_CIM_OP_MAP_PERR | F_TP_FRAMING_ERROR | \ + F_SGE_FRAMING_ERROR | F_CIM_FRAMING_ERROR | \ + F_ZERO_SWITCH_ERROR) +#define CIM_INTR_MASK (F_BLKWRPLINT | F_BLKRDPLINT | F_BLKWRCTLINT | \ + F_BLKRDCTLINT | F_BLKWRFLASHINT | F_BLKRDFLASHINT | \ + F_SGLWRFLASHINT | F_WRBLKFLASHINT | F_BLKWRBOOTINT | \ + F_FLASHRANGEINT | F_SDRAMRANGEINT | F_RSVDSPACEINT | \ + F_DRAMPARERR | F_ICACHEPARERR | F_DCACHEPARERR | \ + F_OBQSGEPARERR | F_OBQULPHIPARERR | F_OBQULPLOPARERR | \ + F_IBQSGELOPARERR | F_IBQSGEHIPARERR | F_IBQULPPARERR | \ + F_IBQTPPARERR | F_ITAGPARERR | F_DTAGPARERR) +#define PMTX_INTR_MASK (F_ZERO_C_CMD_ERROR | ICSPI_FRM_ERR | OESPI_FRM_ERR | \ + V_ICSPI_PAR_ERROR(M_ICSPI_PAR_ERROR) | \ + V_OESPI_PAR_ERROR(M_OESPI_PAR_ERROR)) +#define PMRX_INTR_MASK (F_ZERO_E_CMD_ERROR | IESPI_FRM_ERR | OCSPI_FRM_ERR | \ + V_IESPI_PAR_ERROR(M_IESPI_PAR_ERROR) | \ + V_OCSPI_PAR_ERROR(M_OCSPI_PAR_ERROR)) +#define MPS_INTR_MASK (V_TX0TPPARERRENB(M_TX0TPPARERRENB) | \ + V_TX1TPPARERRENB(M_TX1TPPARERRENB) | \ + V_RXTPPARERRENB(M_RXTPPARERRENB) | \ + V_MCAPARERRENB(M_MCAPARERRENB)) +#define XGM_EXTRA_INTR_MASK (F_LINKFAULTCHANGE) +#define PL_INTR_MASK (F_T3DBG | F_XGMAC0_0 | F_XGMAC0_1 | F_MC5A | F_PM1_TX | \ + F_PM1_RX | F_ULP2_TX | F_ULP2_RX | F_TP1 | F_CIM | \ + F_MC7_CM | F_MC7_PMTX | F_MC7_PMRX | F_SGE3 | F_PCIM0 | \ + F_MPS0 | F_CPL_SWITCH) +/* + * Interrupt handler for the PCIX1 module. + */ +static void pci_intr_handler(struct adapter *adapter) +{ + static const struct intr_info pcix1_intr_info[] = { + {F_MSTDETPARERR, "PCI master detected parity error", -1, 1}, + {F_SIGTARABT, "PCI signaled target abort", -1, 1}, + {F_RCVTARABT, "PCI received target abort", -1, 1}, + {F_RCVMSTABT, "PCI received master abort", -1, 1}, + {F_SIGSYSERR, "PCI signaled system error", -1, 1}, + {F_DETPARERR, "PCI detected parity error", -1, 1}, + {F_SPLCMPDIS, "PCI split completion discarded", -1, 1}, + {F_UNXSPLCMP, "PCI unexpected split completion error", -1, 1}, + {F_RCVSPLCMPERR, "PCI received split completion error", -1, + 1}, + {F_DETCORECCERR, "PCI correctable ECC error", + STAT_PCI_CORR_ECC, 0}, + {F_DETUNCECCERR, "PCI uncorrectable ECC error", -1, 1}, + {F_PIOPARERR, "PCI PIO FIFO parity error", -1, 1}, + {V_WFPARERR(M_WFPARERR), "PCI write FIFO parity error", -1, + 1}, + {V_RFPARERR(M_RFPARERR), "PCI read FIFO parity error", -1, + 1}, + {V_CFPARERR(M_CFPARERR), "PCI command FIFO parity error", -1, + 1}, + {V_MSIXPARERR(M_MSIXPARERR), "PCI MSI-X table/PBA parity " + "error", -1, 1}, + {0} + }; + + if (t3_handle_intr_status(adapter, A_PCIX_INT_CAUSE, PCIX_INTR_MASK, + pcix1_intr_info, adapter->irq_stats)) + t3_fatal_err(adapter); +} + +/* + * Interrupt handler for the PCIE module. + */ +static void pcie_intr_handler(struct adapter *adapter) +{ + static const struct intr_info pcie_intr_info[] = { + {F_PEXERR, "PCI PEX error", -1, 1}, + {F_UNXSPLCPLERRR, + "PCI unexpected split completion DMA read error", -1, 1}, + {F_UNXSPLCPLERRC, + "PCI unexpected split completion DMA command error", -1, 1}, + {F_PCIE_PIOPARERR, "PCI PIO FIFO parity error", -1, 1}, + {F_PCIE_WFPARERR, "PCI write FIFO parity error", -1, 1}, + {F_PCIE_RFPARERR, "PCI read FIFO parity error", -1, 1}, + {F_PCIE_CFPARERR, "PCI command FIFO parity error", -1, 1}, + {V_PCIE_MSIXPARERR(M_PCIE_MSIXPARERR), + "PCI MSI-X table/PBA parity error", -1, 1}, + {F_RETRYBUFPARERR, "PCI retry buffer parity error", -1, 1}, + {F_RETRYLUTPARERR, "PCI retry LUT parity error", -1, 1}, + {F_RXPARERR, "PCI Rx parity error", -1, 1}, + {F_TXPARERR, "PCI Tx parity error", -1, 1}, + {V_BISTERR(M_BISTERR), "PCI BIST error", -1, 1}, + {0} + }; + + if (t3_read_reg(adapter, A_PCIE_INT_CAUSE) & F_PEXERR) + CH_ALERT(adapter, "PEX error code 0x%x\n", + t3_read_reg(adapter, A_PCIE_PEX_ERR)); + + if (t3_handle_intr_status(adapter, A_PCIE_INT_CAUSE, PCIE_INTR_MASK, + pcie_intr_info, adapter->irq_stats)) + t3_fatal_err(adapter); +} + +/* + * TP interrupt handler. + */ +static void tp_intr_handler(struct adapter *adapter) +{ + static const struct intr_info tp_intr_info[] = { + {0xffffff, "TP parity error", -1, 1}, + {0x1000000, "TP out of Rx pages", -1, 1}, + {0x2000000, "TP out of Tx pages", -1, 1}, + {0} + }; + + static const struct intr_info tp_intr_info_t3c[] = { + {0x1fffffff, "TP parity error", -1, 1}, + {F_FLMRXFLSTEMPTY, "TP out of Rx pages", -1, 1}, + {F_FLMTXFLSTEMPTY, "TP out of Tx pages", -1, 1}, + {0} + }; + + if (t3_handle_intr_status(adapter, A_TP_INT_CAUSE, 0xffffffff, + adapter->params.rev < T3_REV_C ? + tp_intr_info : tp_intr_info_t3c, NULL)) + t3_fatal_err(adapter); +} + +/* + * CIM interrupt handler. + */ +static void cim_intr_handler(struct adapter *adapter) +{ + static const struct intr_info cim_intr_info[] = { + {F_RSVDSPACEINT, "CIM reserved space write", -1, 1}, + {F_SDRAMRANGEINT, "CIM SDRAM address out of range", -1, 1}, + {F_FLASHRANGEINT, "CIM flash address out of range", -1, 1}, + {F_BLKWRBOOTINT, "CIM block write to boot space", -1, 1}, + {F_WRBLKFLASHINT, "CIM write to cached flash space", -1, 1}, + {F_SGLWRFLASHINT, "CIM single write to flash space", -1, 1}, + {F_BLKRDFLASHINT, "CIM block read from flash space", -1, 1}, + {F_BLKWRFLASHINT, "CIM block write to flash space", -1, 1}, + {F_BLKRDCTLINT, "CIM block read from CTL space", -1, 1}, + {F_BLKWRCTLINT, "CIM block write to CTL space", -1, 1}, + {F_BLKRDPLINT, "CIM block read from PL space", -1, 1}, + {F_BLKWRPLINT, "CIM block write to PL space", -1, 1}, + {F_DRAMPARERR, "CIM DRAM parity error", -1, 1}, + {F_ICACHEPARERR, "CIM icache parity error", -1, 1}, + {F_DCACHEPARERR, "CIM dcache parity error", -1, 1}, + {F_OBQSGEPARERR, "CIM OBQ SGE parity error", -1, 1}, + {F_OBQULPHIPARERR, "CIM OBQ ULPHI parity error", -1, 1}, + {F_OBQULPLOPARERR, "CIM OBQ ULPLO parity error", -1, 1}, + {F_IBQSGELOPARERR, "CIM IBQ SGELO parity error", -1, 1}, + {F_IBQSGEHIPARERR, "CIM IBQ SGEHI parity error", -1, 1}, + {F_IBQULPPARERR, "CIM IBQ ULP parity error", -1, 1}, + {F_IBQTPPARERR, "CIM IBQ TP parity error", -1, 1}, + {F_ITAGPARERR, "CIM itag parity error", -1, 1}, + {F_DTAGPARERR, "CIM dtag parity error", -1, 1}, + {0} + }; + + if (t3_handle_intr_status(adapter, A_CIM_HOST_INT_CAUSE, 0xffffffff, + cim_intr_info, NULL)) + t3_fatal_err(adapter); +} + +/* + * ULP RX interrupt handler. + */ +static void ulprx_intr_handler(struct adapter *adapter) +{ + static const struct intr_info ulprx_intr_info[] = { + {F_PARERRDATA, "ULP RX data parity error", -1, 1}, + {F_PARERRPCMD, "ULP RX command parity error", -1, 1}, + {F_ARBPF1PERR, "ULP RX ArbPF1 parity error", -1, 1}, + {F_ARBPF0PERR, "ULP RX ArbPF0 parity error", -1, 1}, + {F_ARBFPERR, "ULP RX ArbF parity error", -1, 1}, + {F_PCMDMUXPERR, "ULP RX PCMDMUX parity error", -1, 1}, + {F_DATASELFRAMEERR1, "ULP RX frame error", -1, 1}, + {F_DATASELFRAMEERR0, "ULP RX frame error", -1, 1}, + {0} + }; + + if (t3_handle_intr_status(adapter, A_ULPRX_INT_CAUSE, 0xffffffff, + ulprx_intr_info, NULL)) + t3_fatal_err(adapter); +} + +/* + * ULP TX interrupt handler. + */ +static void ulptx_intr_handler(struct adapter *adapter) +{ + static const struct intr_info ulptx_intr_info[] = { + {F_PBL_BOUND_ERR_CH0, "ULP TX channel 0 PBL out of bounds", + STAT_ULP_CH0_PBL_OOB, 0}, + {F_PBL_BOUND_ERR_CH1, "ULP TX channel 1 PBL out of bounds", + STAT_ULP_CH1_PBL_OOB, 0}, + {0xfc, "ULP TX parity error", -1, 1}, + {0} + }; + + if (t3_handle_intr_status(adapter, A_ULPTX_INT_CAUSE, 0xffffffff, + ulptx_intr_info, adapter->irq_stats)) + t3_fatal_err(adapter); +} + +#define ICSPI_FRM_ERR (F_ICSPI0_FIFO2X_RX_FRAMING_ERROR | \ + F_ICSPI1_FIFO2X_RX_FRAMING_ERROR | F_ICSPI0_RX_FRAMING_ERROR | \ + F_ICSPI1_RX_FRAMING_ERROR | F_ICSPI0_TX_FRAMING_ERROR | \ + F_ICSPI1_TX_FRAMING_ERROR) +#define OESPI_FRM_ERR (F_OESPI0_RX_FRAMING_ERROR | \ + F_OESPI1_RX_FRAMING_ERROR | F_OESPI0_TX_FRAMING_ERROR | \ + F_OESPI1_TX_FRAMING_ERROR | F_OESPI0_OFIFO2X_TX_FRAMING_ERROR | \ + F_OESPI1_OFIFO2X_TX_FRAMING_ERROR) + +/* + * PM TX interrupt handler. + */ +static void pmtx_intr_handler(struct adapter *adapter) +{ + static const struct intr_info pmtx_intr_info[] = { + {F_ZERO_C_CMD_ERROR, "PMTX 0-length pcmd", -1, 1}, + {ICSPI_FRM_ERR, "PMTX ispi framing error", -1, 1}, + {OESPI_FRM_ERR, "PMTX ospi framing error", -1, 1}, + {V_ICSPI_PAR_ERROR(M_ICSPI_PAR_ERROR), + "PMTX ispi parity error", -1, 1}, + {V_OESPI_PAR_ERROR(M_OESPI_PAR_ERROR), + "PMTX ospi parity error", -1, 1}, + {0} + }; + + if (t3_handle_intr_status(adapter, A_PM1_TX_INT_CAUSE, 0xffffffff, + pmtx_intr_info, NULL)) + t3_fatal_err(adapter); +} + +#define IESPI_FRM_ERR (F_IESPI0_FIFO2X_RX_FRAMING_ERROR | \ + F_IESPI1_FIFO2X_RX_FRAMING_ERROR | F_IESPI0_RX_FRAMING_ERROR | \ + F_IESPI1_RX_FRAMING_ERROR | F_IESPI0_TX_FRAMING_ERROR | \ + F_IESPI1_TX_FRAMING_ERROR) +#define OCSPI_FRM_ERR (F_OCSPI0_RX_FRAMING_ERROR | \ + F_OCSPI1_RX_FRAMING_ERROR | F_OCSPI0_TX_FRAMING_ERROR | \ + F_OCSPI1_TX_FRAMING_ERROR | F_OCSPI0_OFIFO2X_TX_FRAMING_ERROR | \ + F_OCSPI1_OFIFO2X_TX_FRAMING_ERROR) + +/* + * PM RX interrupt handler. + */ +static void pmrx_intr_handler(struct adapter *adapter) +{ + static const struct intr_info pmrx_intr_info[] = { + {F_ZERO_E_CMD_ERROR, "PMRX 0-length pcmd", -1, 1}, + {IESPI_FRM_ERR, "PMRX ispi framing error", -1, 1}, + {OCSPI_FRM_ERR, "PMRX ospi framing error", -1, 1}, + {V_IESPI_PAR_ERROR(M_IESPI_PAR_ERROR), + "PMRX ispi parity error", -1, 1}, + {V_OCSPI_PAR_ERROR(M_OCSPI_PAR_ERROR), + "PMRX ospi parity error", -1, 1}, + {0} + }; + + if (t3_handle_intr_status(adapter, A_PM1_RX_INT_CAUSE, 0xffffffff, + pmrx_intr_info, NULL)) + t3_fatal_err(adapter); +} + +/* + * CPL switch interrupt handler. + */ +static void cplsw_intr_handler(struct adapter *adapter) +{ + static const struct intr_info cplsw_intr_info[] = { + {F_CIM_OP_MAP_PERR, "CPL switch CIM parity error", -1, 1}, + {F_CIM_OVFL_ERROR, "CPL switch CIM overflow", -1, 1}, + {F_TP_FRAMING_ERROR, "CPL switch TP framing error", -1, 1}, + {F_SGE_FRAMING_ERROR, "CPL switch SGE framing error", -1, 1}, + {F_CIM_FRAMING_ERROR, "CPL switch CIM framing error", -1, 1}, + {F_ZERO_SWITCH_ERROR, "CPL switch no-switch error", -1, 1}, + {0} + }; + + if (t3_handle_intr_status(adapter, A_CPL_INTR_CAUSE, 0xffffffff, + cplsw_intr_info, NULL)) + t3_fatal_err(adapter); +} + +/* + * MPS interrupt handler. + */ +static void mps_intr_handler(struct adapter *adapter) +{ + static const struct intr_info mps_intr_info[] = { + {0x1ff, "MPS parity error", -1, 1}, + {0} + }; + + if (t3_handle_intr_status(adapter, A_MPS_INT_CAUSE, 0xffffffff, + mps_intr_info, NULL)) + t3_fatal_err(adapter); +} + +#define MC7_INTR_FATAL (F_UE | V_PE(M_PE) | F_AE) + +/* + * MC7 interrupt handler. + */ +static void mc7_intr_handler(struct mc7 *mc7) +{ + struct adapter *adapter = mc7->adapter; + u32 cause = t3_read_reg(adapter, mc7->offset + A_MC7_INT_CAUSE); + + if (cause & F_CE) { + mc7->stats.corr_err++; + CH_WARN(adapter, "%s MC7 correctable error at addr 0x%x, " + "data 0x%x 0x%x 0x%x\n", mc7->name, + t3_read_reg(adapter, mc7->offset + A_MC7_CE_ADDR), + t3_read_reg(adapter, mc7->offset + A_MC7_CE_DATA0), + t3_read_reg(adapter, mc7->offset + A_MC7_CE_DATA1), + t3_read_reg(adapter, mc7->offset + A_MC7_CE_DATA2)); + } + + if (cause & F_UE) { + mc7->stats.uncorr_err++; + CH_ALERT(adapter, "%s MC7 uncorrectable error at addr 0x%x, " + "data 0x%x 0x%x 0x%x\n", mc7->name, + t3_read_reg(adapter, mc7->offset + A_MC7_UE_ADDR), + t3_read_reg(adapter, mc7->offset + A_MC7_UE_DATA0), + t3_read_reg(adapter, mc7->offset + A_MC7_UE_DATA1), + t3_read_reg(adapter, mc7->offset + A_MC7_UE_DATA2)); + } + + if (G_PE(cause)) { + mc7->stats.parity_err++; + CH_ALERT(adapter, "%s MC7 parity error 0x%x\n", + mc7->name, G_PE(cause)); + } + + if (cause & F_AE) { + u32 addr = 0; + + if (adapter->params.rev > 0) + addr = t3_read_reg(adapter, + mc7->offset + A_MC7_ERR_ADDR); + mc7->stats.addr_err++; + CH_ALERT(adapter, "%s MC7 address error: 0x%x\n", + mc7->name, addr); + } + + if (cause & MC7_INTR_FATAL) + t3_fatal_err(adapter); + + t3_write_reg(adapter, mc7->offset + A_MC7_INT_CAUSE, cause); +} + +#define XGM_INTR_FATAL (V_TXFIFO_PRTY_ERR(M_TXFIFO_PRTY_ERR) | \ + V_RXFIFO_PRTY_ERR(M_RXFIFO_PRTY_ERR)) +/* + * XGMAC interrupt handler. + */ +static int mac_intr_handler(struct adapter *adap, unsigned int idx) +{ + struct cmac *mac = &adap2pinfo(adap, idx)->mac; + /* + * We mask out interrupt causes for which we're not taking interrupts. + * This allows us to use polling logic to monitor some of the other + * conditions when taking interrupts would impose too much load on the + * system. + */ + u32 cause = t3_read_reg(adap, A_XGM_INT_CAUSE + mac->offset) & + ~F_RXFIFO_OVERFLOW; + + if (cause & V_TXFIFO_PRTY_ERR(M_TXFIFO_PRTY_ERR)) { + mac->stats.tx_fifo_parity_err++; + CH_ALERT(adap, "port%d: MAC TX FIFO parity error\n", idx); + } + if (cause & V_RXFIFO_PRTY_ERR(M_RXFIFO_PRTY_ERR)) { + mac->stats.rx_fifo_parity_err++; + CH_ALERT(adap, "port%d: MAC RX FIFO parity error\n", idx); + } + if (cause & F_TXFIFO_UNDERRUN) + mac->stats.tx_fifo_urun++; + if (cause & F_RXFIFO_OVERFLOW) + mac->stats.rx_fifo_ovfl++; + if (cause & V_SERDES_LOS(M_SERDES_LOS)) + mac->stats.serdes_signal_loss++; + if (cause & F_XAUIPCSCTCERR) + mac->stats.xaui_pcs_ctc_err++; + if (cause & F_XAUIPCSALIGNCHANGE) + mac->stats.xaui_pcs_align_change++; + if (cause & F_XGM_INT) { + t3_set_reg_field(adap, + A_XGM_INT_ENABLE + mac->offset, + F_XGM_INT, 0); + mac->stats.link_faults++; + + t3_os_link_fault_handler(adap, idx); + } + + if (cause & XGM_INTR_FATAL) + t3_fatal_err(adap); + + t3_write_reg(adap, A_XGM_INT_CAUSE + mac->offset, cause); + return cause != 0; +} + +/* + * Interrupt handler for PHY events. + */ +int t3_phy_intr_handler(struct adapter *adapter) +{ + u32 i, cause = t3_read_reg(adapter, A_T3DBG_INT_CAUSE); + + for_each_port(adapter, i) { + struct port_info *p = adap2pinfo(adapter, i); + + if (!(p->phy.caps & SUPPORTED_IRQ)) + continue; + + if (cause & (1 << adapter_info(adapter)->gpio_intr[i])) { + int phy_cause = p->phy.ops->intr_handler(&p->phy); + + if (phy_cause & cphy_cause_link_change) + t3_link_changed(adapter, i); + if (phy_cause & cphy_cause_fifo_error) + p->phy.fifo_errors++; + if (phy_cause & cphy_cause_module_change) + t3_os_phymod_changed(adapter, i); + } + } + + t3_write_reg(adapter, A_T3DBG_INT_CAUSE, cause); + return 0; +} + +/* + * T3 slow path (non-data) interrupt handler. + */ +int t3_slow_intr_handler(struct adapter *adapter) +{ + u32 cause = t3_read_reg(adapter, A_PL_INT_CAUSE0); + + cause &= adapter->slow_intr_mask; + if (!cause) + return 0; + if (cause & F_PCIM0) { + if (is_pcie(adapter)) + pcie_intr_handler(adapter); + else + pci_intr_handler(adapter); + } + if (cause & F_SGE3) + t3_sge_err_intr_handler(adapter); + if (cause & F_MC7_PMRX) + mc7_intr_handler(&adapter->pmrx); + if (cause & F_MC7_PMTX) + mc7_intr_handler(&adapter->pmtx); + if (cause & F_MC7_CM) + mc7_intr_handler(&adapter->cm); + if (cause & F_CIM) + cim_intr_handler(adapter); + if (cause & F_TP1) + tp_intr_handler(adapter); + if (cause & F_ULP2_RX) + ulprx_intr_handler(adapter); + if (cause & F_ULP2_TX) + ulptx_intr_handler(adapter); + if (cause & F_PM1_RX) + pmrx_intr_handler(adapter); + if (cause & F_PM1_TX) + pmtx_intr_handler(adapter); + if (cause & F_CPL_SWITCH) + cplsw_intr_handler(adapter); + if (cause & F_MPS0) + mps_intr_handler(adapter); + if (cause & F_MC5A) + t3_mc5_intr_handler(&adapter->mc5); + if (cause & F_XGMAC0_0) + mac_intr_handler(adapter, 0); + if (cause & F_XGMAC0_1) + mac_intr_handler(adapter, 1); + if (cause & F_T3DBG) + t3_os_ext_intr_handler(adapter); + + /* Clear the interrupts just processed. */ + t3_write_reg(adapter, A_PL_INT_CAUSE0, cause); + t3_read_reg(adapter, A_PL_INT_CAUSE0); /* flush */ + return 1; +} + +static unsigned int calc_gpio_intr(struct adapter *adap) +{ + unsigned int i, gpi_intr = 0; + + for_each_port(adap, i) + if ((adap2pinfo(adap, i)->phy.caps & SUPPORTED_IRQ) && + adapter_info(adap)->gpio_intr[i]) + gpi_intr |= 1 << adapter_info(adap)->gpio_intr[i]; + return gpi_intr; +} + +/** + * t3_intr_enable - enable interrupts + * @adapter: the adapter whose interrupts should be enabled + * + * Enable interrupts by setting the interrupt enable registers of the + * various HW modules and then enabling the top-level interrupt + * concentrator. + */ +void t3_intr_enable(struct adapter *adapter) +{ + static const struct addr_val_pair intr_en_avp[] = { + {A_SG_INT_ENABLE, SGE_INTR_MASK}, + {A_MC7_INT_ENABLE, MC7_INTR_MASK}, + {A_MC7_INT_ENABLE - MC7_PMRX_BASE_ADDR + MC7_PMTX_BASE_ADDR, + MC7_INTR_MASK}, + {A_MC7_INT_ENABLE - MC7_PMRX_BASE_ADDR + MC7_CM_BASE_ADDR, + MC7_INTR_MASK}, + {A_MC5_DB_INT_ENABLE, MC5_INTR_MASK}, + {A_ULPRX_INT_ENABLE, ULPRX_INTR_MASK}, + {A_PM1_TX_INT_ENABLE, PMTX_INTR_MASK}, + {A_PM1_RX_INT_ENABLE, PMRX_INTR_MASK}, + {A_CIM_HOST_INT_ENABLE, CIM_INTR_MASK}, + {A_MPS_INT_ENABLE, MPS_INTR_MASK}, + }; + + adapter->slow_intr_mask = PL_INTR_MASK; + + t3_write_regs(adapter, intr_en_avp, ARRAY_SIZE(intr_en_avp), 0); + t3_write_reg(adapter, A_TP_INT_ENABLE, + adapter->params.rev >= T3_REV_C ? 0x2bfffff : 0x3bfffff); + + if (adapter->params.rev > 0) { + t3_write_reg(adapter, A_CPL_INTR_ENABLE, + CPLSW_INTR_MASK | F_CIM_OVFL_ERROR); + t3_write_reg(adapter, A_ULPTX_INT_ENABLE, + ULPTX_INTR_MASK | F_PBL_BOUND_ERR_CH0 | + F_PBL_BOUND_ERR_CH1); + } else { + t3_write_reg(adapter, A_CPL_INTR_ENABLE, CPLSW_INTR_MASK); + t3_write_reg(adapter, A_ULPTX_INT_ENABLE, ULPTX_INTR_MASK); + } + + t3_write_reg(adapter, A_T3DBG_INT_ENABLE, calc_gpio_intr(adapter)); + + if (is_pcie(adapter)) + t3_write_reg(adapter, A_PCIE_INT_ENABLE, PCIE_INTR_MASK); + else + t3_write_reg(adapter, A_PCIX_INT_ENABLE, PCIX_INTR_MASK); + t3_write_reg(adapter, A_PL_INT_ENABLE0, adapter->slow_intr_mask); + t3_read_reg(adapter, A_PL_INT_ENABLE0); /* flush */ +} + +/** + * t3_intr_disable - disable a card's interrupts + * @adapter: the adapter whose interrupts should be disabled + * + * Disable interrupts. We only disable the top-level interrupt + * concentrator and the SGE data interrupts. + */ +void t3_intr_disable(struct adapter *adapter) +{ + t3_write_reg(adapter, A_PL_INT_ENABLE0, 0); + t3_read_reg(adapter, A_PL_INT_ENABLE0); /* flush */ + adapter->slow_intr_mask = 0; +} + +/** + * t3_intr_clear - clear all interrupts + * @adapter: the adapter whose interrupts should be cleared + * + * Clears all interrupts. + */ +void t3_intr_clear(struct adapter *adapter) +{ + static const unsigned int cause_reg_addr[] = { + A_SG_INT_CAUSE, + A_SG_RSPQ_FL_STATUS, + A_PCIX_INT_CAUSE, + A_MC7_INT_CAUSE, + A_MC7_INT_CAUSE - MC7_PMRX_BASE_ADDR + MC7_PMTX_BASE_ADDR, + A_MC7_INT_CAUSE - MC7_PMRX_BASE_ADDR + MC7_CM_BASE_ADDR, + A_CIM_HOST_INT_CAUSE, + A_TP_INT_CAUSE, + A_MC5_DB_INT_CAUSE, + A_ULPRX_INT_CAUSE, + A_ULPTX_INT_CAUSE, + A_CPL_INTR_CAUSE, + A_PM1_TX_INT_CAUSE, + A_PM1_RX_INT_CAUSE, + A_MPS_INT_CAUSE, + A_T3DBG_INT_CAUSE, + }; + unsigned int i; + + /* Clear PHY and MAC interrupts for each port. */ + for_each_port(adapter, i) + t3_port_intr_clear(adapter, i); + + for (i = 0; i < ARRAY_SIZE(cause_reg_addr); ++i) + t3_write_reg(adapter, cause_reg_addr[i], 0xffffffff); + + if (is_pcie(adapter)) + t3_write_reg(adapter, A_PCIE_PEX_ERR, 0xffffffff); + t3_write_reg(adapter, A_PL_INT_CAUSE0, 0xffffffff); + t3_read_reg(adapter, A_PL_INT_CAUSE0); /* flush */ +} + +void t3_xgm_intr_enable(struct adapter *adapter, int idx) +{ + struct port_info *pi = adap2pinfo(adapter, idx); + + t3_write_reg(adapter, A_XGM_XGM_INT_ENABLE + pi->mac.offset, + XGM_EXTRA_INTR_MASK); +} + +void t3_xgm_intr_disable(struct adapter *adapter, int idx) +{ + struct port_info *pi = adap2pinfo(adapter, idx); + + t3_write_reg(adapter, A_XGM_XGM_INT_DISABLE + pi->mac.offset, + 0x7ff); +} + +/** + * t3_port_intr_enable - enable port-specific interrupts + * @adapter: associated adapter + * @idx: index of port whose interrupts should be enabled + * + * Enable port-specific (i.e., MAC and PHY) interrupts for the given + * adapter port. + */ +void t3_port_intr_enable(struct adapter *adapter, int idx) +{ + struct cphy *phy = &adap2pinfo(adapter, idx)->phy; + + t3_write_reg(adapter, XGM_REG(A_XGM_INT_ENABLE, idx), XGM_INTR_MASK); + t3_read_reg(adapter, XGM_REG(A_XGM_INT_ENABLE, idx)); /* flush */ + phy->ops->intr_enable(phy); +} + +/** + * t3_port_intr_disable - disable port-specific interrupts + * @adapter: associated adapter + * @idx: index of port whose interrupts should be disabled + * + * Disable port-specific (i.e., MAC and PHY) interrupts for the given + * adapter port. + */ +void t3_port_intr_disable(struct adapter *adapter, int idx) +{ + struct cphy *phy = &adap2pinfo(adapter, idx)->phy; + + t3_write_reg(adapter, XGM_REG(A_XGM_INT_ENABLE, idx), 0); + t3_read_reg(adapter, XGM_REG(A_XGM_INT_ENABLE, idx)); /* flush */ + phy->ops->intr_disable(phy); +} + +/** + * t3_port_intr_clear - clear port-specific interrupts + * @adapter: associated adapter + * @idx: index of port whose interrupts to clear + * + * Clear port-specific (i.e., MAC and PHY) interrupts for the given + * adapter port. + */ +static void t3_port_intr_clear(struct adapter *adapter, int idx) +{ + struct cphy *phy = &adap2pinfo(adapter, idx)->phy; + + t3_write_reg(adapter, XGM_REG(A_XGM_INT_CAUSE, idx), 0xffffffff); + t3_read_reg(adapter, XGM_REG(A_XGM_INT_CAUSE, idx)); /* flush */ + phy->ops->intr_clear(phy); +} + +#define SG_CONTEXT_CMD_ATTEMPTS 100 + +/** + * t3_sge_write_context - write an SGE context + * @adapter: the adapter + * @id: the context id + * @type: the context type + * + * Program an SGE context with the values already loaded in the + * CONTEXT_DATA? registers. + */ +static int t3_sge_write_context(struct adapter *adapter, unsigned int id, + unsigned int type) +{ + if (type == F_RESPONSEQ) { + /* + * Can't write the Response Queue Context bits for + * Interrupt Armed or the Reserve bits after the chip + * has been initialized out of reset. Writing to these + * bits can confuse the hardware. + */ + t3_write_reg(adapter, A_SG_CONTEXT_MASK0, 0xffffffff); + t3_write_reg(adapter, A_SG_CONTEXT_MASK1, 0xffffffff); + t3_write_reg(adapter, A_SG_CONTEXT_MASK2, 0x17ffffff); + t3_write_reg(adapter, A_SG_CONTEXT_MASK3, 0xffffffff); + } else { + t3_write_reg(adapter, A_SG_CONTEXT_MASK0, 0xffffffff); + t3_write_reg(adapter, A_SG_CONTEXT_MASK1, 0xffffffff); + t3_write_reg(adapter, A_SG_CONTEXT_MASK2, 0xffffffff); + t3_write_reg(adapter, A_SG_CONTEXT_MASK3, 0xffffffff); + } + t3_write_reg(adapter, A_SG_CONTEXT_CMD, + V_CONTEXT_CMD_OPCODE(1) | type | V_CONTEXT(id)); + return t3_wait_op_done(adapter, A_SG_CONTEXT_CMD, F_CONTEXT_CMD_BUSY, + 0, SG_CONTEXT_CMD_ATTEMPTS, 1); +} + +/** + * clear_sge_ctxt - completely clear an SGE context + * @adapter: the adapter + * @id: the context id + * @type: the context type + * + * Completely clear an SGE context. Used predominantly at post-reset + * initialization. Note in particular that we don't skip writing to any + * "sensitive bits" in the contexts the way that t3_sge_write_context() + * does ... + */ +static int clear_sge_ctxt(struct adapter *adap, unsigned int id, + unsigned int type) +{ + t3_write_reg(adap, A_SG_CONTEXT_DATA0, 0); + t3_write_reg(adap, A_SG_CONTEXT_DATA1, 0); + t3_write_reg(adap, A_SG_CONTEXT_DATA2, 0); + t3_write_reg(adap, A_SG_CONTEXT_DATA3, 0); + t3_write_reg(adap, A_SG_CONTEXT_MASK0, 0xffffffff); + t3_write_reg(adap, A_SG_CONTEXT_MASK1, 0xffffffff); + t3_write_reg(adap, A_SG_CONTEXT_MASK2, 0xffffffff); + t3_write_reg(adap, A_SG_CONTEXT_MASK3, 0xffffffff); + t3_write_reg(adap, A_SG_CONTEXT_CMD, + V_CONTEXT_CMD_OPCODE(1) | type | V_CONTEXT(id)); + return t3_wait_op_done(adap, A_SG_CONTEXT_CMD, F_CONTEXT_CMD_BUSY, + 0, SG_CONTEXT_CMD_ATTEMPTS, 1); +} + +/** + * t3_sge_init_ecntxt - initialize an SGE egress context + * @adapter: the adapter to configure + * @id: the context id + * @gts_enable: whether to enable GTS for the context + * @type: the egress context type + * @respq: associated response queue + * @base_addr: base address of queue + * @size: number of queue entries + * @token: uP token + * @gen: initial generation value for the context + * @cidx: consumer pointer + * + * Initialize an SGE egress context and make it ready for use. If the + * platform allows concurrent context operations, the caller is + * responsible for appropriate locking. + */ +int t3_sge_init_ecntxt(struct adapter *adapter, unsigned int id, int gts_enable, + enum sge_context_type type, int respq, u64 base_addr, + unsigned int size, unsigned int token, int gen, + unsigned int cidx) +{ + unsigned int credits = type == SGE_CNTXT_OFLD ? 0 : FW_WR_NUM; + + if (base_addr & 0xfff) /* must be 4K aligned */ + return -EINVAL; + if (t3_read_reg(adapter, A_SG_CONTEXT_CMD) & F_CONTEXT_CMD_BUSY) + return -EBUSY; + + base_addr >>= 12; + t3_write_reg(adapter, A_SG_CONTEXT_DATA0, V_EC_INDEX(cidx) | + V_EC_CREDITS(credits) | V_EC_GTS(gts_enable)); + t3_write_reg(adapter, A_SG_CONTEXT_DATA1, V_EC_SIZE(size) | + V_EC_BASE_LO(base_addr & 0xffff)); + base_addr >>= 16; + t3_write_reg(adapter, A_SG_CONTEXT_DATA2, base_addr); + base_addr >>= 32; + t3_write_reg(adapter, A_SG_CONTEXT_DATA3, + V_EC_BASE_HI(base_addr & 0xf) | V_EC_RESPQ(respq) | + V_EC_TYPE(type) | V_EC_GEN(gen) | V_EC_UP_TOKEN(token) | + F_EC_VALID); + return t3_sge_write_context(adapter, id, F_EGRESS); +} + +/** + * t3_sge_init_flcntxt - initialize an SGE free-buffer list context + * @adapter: the adapter to configure + * @id: the context id + * @gts_enable: whether to enable GTS for the context + * @base_addr: base address of queue + * @size: number of queue entries + * @bsize: size of each buffer for this queue + * @cong_thres: threshold to signal congestion to upstream producers + * @gen: initial generation value for the context + * @cidx: consumer pointer + * + * Initialize an SGE free list context and make it ready for use. The + * caller is responsible for ensuring only one context operation occurs + * at a time. + */ +int t3_sge_init_flcntxt(struct adapter *adapter, unsigned int id, + int gts_enable, u64 base_addr, unsigned int size, + unsigned int bsize, unsigned int cong_thres, int gen, + unsigned int cidx) +{ + if (base_addr & 0xfff) /* must be 4K aligned */ + return -EINVAL; + if (t3_read_reg(adapter, A_SG_CONTEXT_CMD) & F_CONTEXT_CMD_BUSY) + return -EBUSY; + + base_addr >>= 12; + t3_write_reg(adapter, A_SG_CONTEXT_DATA0, base_addr); + base_addr >>= 32; + t3_write_reg(adapter, A_SG_CONTEXT_DATA1, + V_FL_BASE_HI((u32) base_addr) | + V_FL_INDEX_LO(cidx & M_FL_INDEX_LO)); + t3_write_reg(adapter, A_SG_CONTEXT_DATA2, V_FL_SIZE(size) | + V_FL_GEN(gen) | V_FL_INDEX_HI(cidx >> 12) | + V_FL_ENTRY_SIZE_LO(bsize & M_FL_ENTRY_SIZE_LO)); + t3_write_reg(adapter, A_SG_CONTEXT_DATA3, + V_FL_ENTRY_SIZE_HI(bsize >> (32 - S_FL_ENTRY_SIZE_LO)) | + V_FL_CONG_THRES(cong_thres) | V_FL_GTS(gts_enable)); + return t3_sge_write_context(adapter, id, F_FREELIST); +} + +/** + * t3_sge_init_rspcntxt - initialize an SGE response queue context + * @adapter: the adapter to configure + * @id: the context id + * @irq_vec_idx: MSI-X interrupt vector index, 0 if no MSI-X, -1 if no IRQ + * @base_addr: base address of queue + * @size: number of queue entries + * @fl_thres: threshold for selecting the normal or jumbo free list + * @gen: initial generation value for the context + * @cidx: consumer pointer + * + * Initialize an SGE response queue context and make it ready for use. + * The caller is responsible for ensuring only one context operation + * occurs at a time. + */ +int t3_sge_init_rspcntxt(struct adapter *adapter, unsigned int id, + int irq_vec_idx, u64 base_addr, unsigned int size, + unsigned int fl_thres, int gen, unsigned int cidx) +{ + unsigned int intr = 0; + + if (base_addr & 0xfff) /* must be 4K aligned */ + return -EINVAL; + if (t3_read_reg(adapter, A_SG_CONTEXT_CMD) & F_CONTEXT_CMD_BUSY) + return -EBUSY; + + base_addr >>= 12; + t3_write_reg(adapter, A_SG_CONTEXT_DATA0, V_CQ_SIZE(size) | + V_CQ_INDEX(cidx)); + t3_write_reg(adapter, A_SG_CONTEXT_DATA1, base_addr); + base_addr >>= 32; + if (irq_vec_idx >= 0) + intr = V_RQ_MSI_VEC(irq_vec_idx) | F_RQ_INTR_EN; + t3_write_reg(adapter, A_SG_CONTEXT_DATA2, + V_CQ_BASE_HI((u32) base_addr) | intr | V_RQ_GEN(gen)); + t3_write_reg(adapter, A_SG_CONTEXT_DATA3, fl_thres); + return t3_sge_write_context(adapter, id, F_RESPONSEQ); +} + +/** + * t3_sge_init_cqcntxt - initialize an SGE completion queue context + * @adapter: the adapter to configure + * @id: the context id + * @base_addr: base address of queue + * @size: number of queue entries + * @rspq: response queue for async notifications + * @ovfl_mode: CQ overflow mode + * @credits: completion queue credits + * @credit_thres: the credit threshold + * + * Initialize an SGE completion queue context and make it ready for use. + * The caller is responsible for ensuring only one context operation + * occurs at a time. + */ +int t3_sge_init_cqcntxt(struct adapter *adapter, unsigned int id, u64 base_addr, + unsigned int size, int rspq, int ovfl_mode, + unsigned int credits, unsigned int credit_thres) +{ + if (base_addr & 0xfff) /* must be 4K aligned */ + return -EINVAL; + if (t3_read_reg(adapter, A_SG_CONTEXT_CMD) & F_CONTEXT_CMD_BUSY) + return -EBUSY; + + base_addr >>= 12; + t3_write_reg(adapter, A_SG_CONTEXT_DATA0, V_CQ_SIZE(size)); + t3_write_reg(adapter, A_SG_CONTEXT_DATA1, base_addr); + base_addr >>= 32; + t3_write_reg(adapter, A_SG_CONTEXT_DATA2, + V_CQ_BASE_HI((u32) base_addr) | V_CQ_RSPQ(rspq) | + V_CQ_GEN(1) | V_CQ_OVERFLOW_MODE(ovfl_mode) | + V_CQ_ERR(ovfl_mode)); + t3_write_reg(adapter, A_SG_CONTEXT_DATA3, V_CQ_CREDITS(credits) | + V_CQ_CREDIT_THRES(credit_thres)); + return t3_sge_write_context(adapter, id, F_CQ); +} + +/** + * t3_sge_enable_ecntxt - enable/disable an SGE egress context + * @adapter: the adapter + * @id: the egress context id + * @enable: enable (1) or disable (0) the context + * + * Enable or disable an SGE egress context. The caller is responsible for + * ensuring only one context operation occurs at a time. + */ +int t3_sge_enable_ecntxt(struct adapter *adapter, unsigned int id, int enable) +{ + if (t3_read_reg(adapter, A_SG_CONTEXT_CMD) & F_CONTEXT_CMD_BUSY) + return -EBUSY; + + t3_write_reg(adapter, A_SG_CONTEXT_MASK0, 0); + t3_write_reg(adapter, A_SG_CONTEXT_MASK1, 0); + t3_write_reg(adapter, A_SG_CONTEXT_MASK2, 0); + t3_write_reg(adapter, A_SG_CONTEXT_MASK3, F_EC_VALID); + t3_write_reg(adapter, A_SG_CONTEXT_DATA3, V_EC_VALID(enable)); + t3_write_reg(adapter, A_SG_CONTEXT_CMD, + V_CONTEXT_CMD_OPCODE(1) | F_EGRESS | V_CONTEXT(id)); + return t3_wait_op_done(adapter, A_SG_CONTEXT_CMD, F_CONTEXT_CMD_BUSY, + 0, SG_CONTEXT_CMD_ATTEMPTS, 1); +} + +/** + * t3_sge_disable_fl - disable an SGE free-buffer list + * @adapter: the adapter + * @id: the free list context id + * + * Disable an SGE free-buffer list. The caller is responsible for + * ensuring only one context operation occurs at a time. + */ +int t3_sge_disable_fl(struct adapter *adapter, unsigned int id) +{ + if (t3_read_reg(adapter, A_SG_CONTEXT_CMD) & F_CONTEXT_CMD_BUSY) + return -EBUSY; + + t3_write_reg(adapter, A_SG_CONTEXT_MASK0, 0); + t3_write_reg(adapter, A_SG_CONTEXT_MASK1, 0); + t3_write_reg(adapter, A_SG_CONTEXT_MASK2, V_FL_SIZE(M_FL_SIZE)); + t3_write_reg(adapter, A_SG_CONTEXT_MASK3, 0); + t3_write_reg(adapter, A_SG_CONTEXT_DATA2, 0); + t3_write_reg(adapter, A_SG_CONTEXT_CMD, + V_CONTEXT_CMD_OPCODE(1) | F_FREELIST | V_CONTEXT(id)); + return t3_wait_op_done(adapter, A_SG_CONTEXT_CMD, F_CONTEXT_CMD_BUSY, + 0, SG_CONTEXT_CMD_ATTEMPTS, 1); +} + +/** + * t3_sge_disable_rspcntxt - disable an SGE response queue + * @adapter: the adapter + * @id: the response queue context id + * + * Disable an SGE response queue. The caller is responsible for + * ensuring only one context operation occurs at a time. + */ +int t3_sge_disable_rspcntxt(struct adapter *adapter, unsigned int id) +{ + if (t3_read_reg(adapter, A_SG_CONTEXT_CMD) & F_CONTEXT_CMD_BUSY) + return -EBUSY; + + t3_write_reg(adapter, A_SG_CONTEXT_MASK0, V_CQ_SIZE(M_CQ_SIZE)); + t3_write_reg(adapter, A_SG_CONTEXT_MASK1, 0); + t3_write_reg(adapter, A_SG_CONTEXT_MASK2, 0); + t3_write_reg(adapter, A_SG_CONTEXT_MASK3, 0); + t3_write_reg(adapter, A_SG_CONTEXT_DATA0, 0); + t3_write_reg(adapter, A_SG_CONTEXT_CMD, + V_CONTEXT_CMD_OPCODE(1) | F_RESPONSEQ | V_CONTEXT(id)); + return t3_wait_op_done(adapter, A_SG_CONTEXT_CMD, F_CONTEXT_CMD_BUSY, + 0, SG_CONTEXT_CMD_ATTEMPTS, 1); +} + +/** + * t3_sge_disable_cqcntxt - disable an SGE completion queue + * @adapter: the adapter + * @id: the completion queue context id + * + * Disable an SGE completion queue. The caller is responsible for + * ensuring only one context operation occurs at a time. + */ +int t3_sge_disable_cqcntxt(struct adapter *adapter, unsigned int id) +{ + if (t3_read_reg(adapter, A_SG_CONTEXT_CMD) & F_CONTEXT_CMD_BUSY) + return -EBUSY; + + t3_write_reg(adapter, A_SG_CONTEXT_MASK0, V_CQ_SIZE(M_CQ_SIZE)); + t3_write_reg(adapter, A_SG_CONTEXT_MASK1, 0); + t3_write_reg(adapter, A_SG_CONTEXT_MASK2, 0); + t3_write_reg(adapter, A_SG_CONTEXT_MASK3, 0); + t3_write_reg(adapter, A_SG_CONTEXT_DATA0, 0); + t3_write_reg(adapter, A_SG_CONTEXT_CMD, + V_CONTEXT_CMD_OPCODE(1) | F_CQ | V_CONTEXT(id)); + return t3_wait_op_done(adapter, A_SG_CONTEXT_CMD, F_CONTEXT_CMD_BUSY, + 0, SG_CONTEXT_CMD_ATTEMPTS, 1); +} + +/** + * t3_sge_cqcntxt_op - perform an operation on a completion queue context + * @adapter: the adapter + * @id: the context id + * @op: the operation to perform + * + * Perform the selected operation on an SGE completion queue context. + * The caller is responsible for ensuring only one context operation + * occurs at a time. + */ +int t3_sge_cqcntxt_op(struct adapter *adapter, unsigned int id, unsigned int op, + unsigned int credits) +{ + u32 val; + + if (t3_read_reg(adapter, A_SG_CONTEXT_CMD) & F_CONTEXT_CMD_BUSY) + return -EBUSY; + + t3_write_reg(adapter, A_SG_CONTEXT_DATA0, credits << 16); + t3_write_reg(adapter, A_SG_CONTEXT_CMD, V_CONTEXT_CMD_OPCODE(op) | + V_CONTEXT(id) | F_CQ); + if (t3_wait_op_done_val(adapter, A_SG_CONTEXT_CMD, F_CONTEXT_CMD_BUSY, + 0, SG_CONTEXT_CMD_ATTEMPTS, 1, &val)) + return -EIO; + + if (op >= 2 && op < 7) { + if (adapter->params.rev > 0) + return G_CQ_INDEX(val); + + t3_write_reg(adapter, A_SG_CONTEXT_CMD, + V_CONTEXT_CMD_OPCODE(0) | F_CQ | V_CONTEXT(id)); + if (t3_wait_op_done(adapter, A_SG_CONTEXT_CMD, + F_CONTEXT_CMD_BUSY, 0, + SG_CONTEXT_CMD_ATTEMPTS, 1)) + return -EIO; + return G_CQ_INDEX(t3_read_reg(adapter, A_SG_CONTEXT_DATA0)); + } + return 0; +} + +/** + * t3_config_rss - configure Rx packet steering + * @adapter: the adapter + * @rss_config: RSS settings (written to TP_RSS_CONFIG) + * @cpus: values for the CPU lookup table (0xff terminated) + * @rspq: values for the response queue lookup table (0xffff terminated) + * + * Programs the receive packet steering logic. @cpus and @rspq provide + * the values for the CPU and response queue lookup tables. If they + * provide fewer values than the size of the tables the supplied values + * are used repeatedly until the tables are fully populated. + */ +void t3_config_rss(struct adapter *adapter, unsigned int rss_config, + const u8 * cpus, const u16 *rspq) +{ + int i, j, cpu_idx = 0, q_idx = 0; + + if (cpus) + for (i = 0; i < RSS_TABLE_SIZE; ++i) { + u32 val = i << 16; + + for (j = 0; j < 2; ++j) { + val |= (cpus[cpu_idx++] & 0x3f) << (8 * j); + if (cpus[cpu_idx] == 0xff) + cpu_idx = 0; + } + t3_write_reg(adapter, A_TP_RSS_LKP_TABLE, val); + } + + if (rspq) + for (i = 0; i < RSS_TABLE_SIZE; ++i) { + t3_write_reg(adapter, A_TP_RSS_MAP_TABLE, + (i << 16) | rspq[q_idx++]); + if (rspq[q_idx] == 0xffff) + q_idx = 0; + } + + t3_write_reg(adapter, A_TP_RSS_CONFIG, rss_config); +} + +/** + * t3_tp_set_offload_mode - put TP in NIC/offload mode + * @adap: the adapter + * @enable: 1 to select offload mode, 0 for regular NIC + * + * Switches TP to NIC/offload mode. + */ +void t3_tp_set_offload_mode(struct adapter *adap, int enable) +{ + if (is_offload(adap) || !enable) + t3_set_reg_field(adap, A_TP_IN_CONFIG, F_NICMODE, + V_NICMODE(!enable)); +} + +/** + * pm_num_pages - calculate the number of pages of the payload memory + * @mem_size: the size of the payload memory + * @pg_size: the size of each payload memory page + * + * Calculate the number of pages, each of the given size, that fit in a + * memory of the specified size, respecting the HW requirement that the + * number of pages must be a multiple of 24. + */ +static inline unsigned int pm_num_pages(unsigned int mem_size, + unsigned int pg_size) +{ + unsigned int n = mem_size / pg_size; + + return n - n % 24; +} + +#define mem_region(adap, start, size, reg) \ + t3_write_reg((adap), A_ ## reg, (start)); \ + start += size + +/** + * partition_mem - partition memory and configure TP memory settings + * @adap: the adapter + * @p: the TP parameters + * + * Partitions context and payload memory and configures TP's memory + * registers. + */ +static void partition_mem(struct adapter *adap, const struct tp_params *p) +{ + unsigned int m, pstructs, tids = t3_mc5_size(&adap->mc5); + unsigned int timers = 0, timers_shift = 22; + + if (adap->params.rev > 0) { + if (tids <= 16 * 1024) { + timers = 1; + timers_shift = 16; + } else if (tids <= 64 * 1024) { + timers = 2; + timers_shift = 18; + } else if (tids <= 256 * 1024) { + timers = 3; + timers_shift = 20; + } + } + + t3_write_reg(adap, A_TP_PMM_SIZE, + p->chan_rx_size | (p->chan_tx_size >> 16)); + + t3_write_reg(adap, A_TP_PMM_TX_BASE, 0); + t3_write_reg(adap, A_TP_PMM_TX_PAGE_SIZE, p->tx_pg_size); + t3_write_reg(adap, A_TP_PMM_TX_MAX_PAGE, p->tx_num_pgs); + t3_set_reg_field(adap, A_TP_PARA_REG3, V_TXDATAACKIDX(M_TXDATAACKIDX), + V_TXDATAACKIDX(fls(p->tx_pg_size) - 12)); + + t3_write_reg(adap, A_TP_PMM_RX_BASE, 0); + t3_write_reg(adap, A_TP_PMM_RX_PAGE_SIZE, p->rx_pg_size); + t3_write_reg(adap, A_TP_PMM_RX_MAX_PAGE, p->rx_num_pgs); + + pstructs = p->rx_num_pgs + p->tx_num_pgs; + /* Add a bit of headroom and make multiple of 24 */ + pstructs += 48; + pstructs -= pstructs % 24; + t3_write_reg(adap, A_TP_CMM_MM_MAX_PSTRUCT, pstructs); + + m = tids * TCB_SIZE; + mem_region(adap, m, (64 << 10) * 64, SG_EGR_CNTX_BADDR); + mem_region(adap, m, (64 << 10) * 64, SG_CQ_CONTEXT_BADDR); + t3_write_reg(adap, A_TP_CMM_TIMER_BASE, V_CMTIMERMAXNUM(timers) | m); + m += ((p->ntimer_qs - 1) << timers_shift) + (1 << 22); + mem_region(adap, m, pstructs * 64, TP_CMM_MM_BASE); + mem_region(adap, m, 64 * (pstructs / 24), TP_CMM_MM_PS_FLST_BASE); + mem_region(adap, m, 64 * (p->rx_num_pgs / 24), TP_CMM_MM_RX_FLST_BASE); + mem_region(adap, m, 64 * (p->tx_num_pgs / 24), TP_CMM_MM_TX_FLST_BASE); + + m = (m + 4095) & ~0xfff; + t3_write_reg(adap, A_CIM_SDRAM_BASE_ADDR, m); + t3_write_reg(adap, A_CIM_SDRAM_ADDR_SIZE, p->cm_size - m); + + tids = (p->cm_size - m - (3 << 20)) / 3072 - 32; + m = t3_mc5_size(&adap->mc5) - adap->params.mc5.nservers - + adap->params.mc5.nfilters - adap->params.mc5.nroutes; + if (tids < m) + adap->params.mc5.nservers += m - tids; +} + +static inline void tp_wr_indirect(struct adapter *adap, unsigned int addr, + u32 val) +{ + t3_write_reg(adap, A_TP_PIO_ADDR, addr); + t3_write_reg(adap, A_TP_PIO_DATA, val); +} + +static void tp_config(struct adapter *adap, const struct tp_params *p) +{ + t3_write_reg(adap, A_TP_GLOBAL_CONFIG, F_TXPACINGENABLE | F_PATHMTU | + F_IPCHECKSUMOFFLOAD | F_UDPCHECKSUMOFFLOAD | + F_TCPCHECKSUMOFFLOAD | V_IPTTL(64)); + t3_write_reg(adap, A_TP_TCP_OPTIONS, V_MTUDEFAULT(576) | + F_MTUENABLE | V_WINDOWSCALEMODE(1) | + V_TIMESTAMPSMODE(1) | V_SACKMODE(1) | V_SACKRX(1)); + t3_write_reg(adap, A_TP_DACK_CONFIG, V_AUTOSTATE3(1) | + V_AUTOSTATE2(1) | V_AUTOSTATE1(0) | + V_BYTETHRESHOLD(26880) | V_MSSTHRESHOLD(2) | + F_AUTOCAREFUL | F_AUTOENABLE | V_DACK_MODE(1)); + t3_set_reg_field(adap, A_TP_IN_CONFIG, F_RXFBARBPRIO | F_TXFBARBPRIO, + F_IPV6ENABLE | F_NICMODE); + t3_write_reg(adap, A_TP_TX_RESOURCE_LIMIT, 0x18141814); + t3_write_reg(adap, A_TP_PARA_REG4, 0x5050105); + t3_set_reg_field(adap, A_TP_PARA_REG6, 0, + adap->params.rev > 0 ? F_ENABLEESND : + F_T3A_ENABLEESND); + + t3_set_reg_field(adap, A_TP_PC_CONFIG, + F_ENABLEEPCMDAFULL, + F_ENABLEOCSPIFULL |F_TXDEFERENABLE | F_HEARBEATDACK | + F_TXCONGESTIONMODE | F_RXCONGESTIONMODE); + t3_set_reg_field(adap, A_TP_PC_CONFIG2, F_CHDRAFULL, + F_ENABLEIPV6RSS | F_ENABLENONOFDTNLSYN | + F_ENABLEARPMISS | F_DISBLEDAPARBIT0); + t3_write_reg(adap, A_TP_PROXY_FLOW_CNTL, 1080); + t3_write_reg(adap, A_TP_PROXY_FLOW_CNTL, 1000); + + if (adap->params.rev > 0) { + tp_wr_indirect(adap, A_TP_EGRESS_CONFIG, F_REWRITEFORCETOSIZE); + t3_set_reg_field(adap, A_TP_PARA_REG3, F_TXPACEAUTO, + F_TXPACEAUTO); + t3_set_reg_field(adap, A_TP_PC_CONFIG, F_LOCKTID, F_LOCKTID); + t3_set_reg_field(adap, A_TP_PARA_REG3, 0, F_TXPACEAUTOSTRICT); + } else + t3_set_reg_field(adap, A_TP_PARA_REG3, 0, F_TXPACEFIXED); + + if (adap->params.rev == T3_REV_C) + t3_set_reg_field(adap, A_TP_PC_CONFIG, + V_TABLELATENCYDELTA(M_TABLELATENCYDELTA), + V_TABLELATENCYDELTA(4)); + + t3_write_reg(adap, A_TP_TX_MOD_QUEUE_WEIGHT1, 0); + t3_write_reg(adap, A_TP_TX_MOD_QUEUE_WEIGHT0, 0); + t3_write_reg(adap, A_TP_MOD_CHANNEL_WEIGHT, 0); + t3_write_reg(adap, A_TP_MOD_RATE_LIMIT, 0xf2200000); +} + +/* Desired TP timer resolution in usec */ +#define TP_TMR_RES 50 + +/* TCP timer values in ms */ +#define TP_DACK_TIMER 50 +#define TP_RTO_MIN 250 + +/** + * tp_set_timers - set TP timing parameters + * @adap: the adapter to set + * @core_clk: the core clock frequency in Hz + * + * Set TP's timing parameters, such as the various timer resolutions and + * the TCP timer values. + */ +static void tp_set_timers(struct adapter *adap, unsigned int core_clk) +{ + unsigned int tre = fls(core_clk / (1000000 / TP_TMR_RES)) - 1; + unsigned int dack_re = fls(core_clk / 5000) - 1; /* 200us */ + unsigned int tstamp_re = fls(core_clk / 1000); /* 1ms, at least */ + unsigned int tps = core_clk >> tre; + + t3_write_reg(adap, A_TP_TIMER_RESOLUTION, V_TIMERRESOLUTION(tre) | + V_DELAYEDACKRESOLUTION(dack_re) | + V_TIMESTAMPRESOLUTION(tstamp_re)); + t3_write_reg(adap, A_TP_DACK_TIMER, + (core_clk >> dack_re) / (1000 / TP_DACK_TIMER)); + t3_write_reg(adap, A_TP_TCP_BACKOFF_REG0, 0x3020100); + t3_write_reg(adap, A_TP_TCP_BACKOFF_REG1, 0x7060504); + t3_write_reg(adap, A_TP_TCP_BACKOFF_REG2, 0xb0a0908); + t3_write_reg(adap, A_TP_TCP_BACKOFF_REG3, 0xf0e0d0c); + t3_write_reg(adap, A_TP_SHIFT_CNT, V_SYNSHIFTMAX(6) | + V_RXTSHIFTMAXR1(4) | V_RXTSHIFTMAXR2(15) | + V_PERSHIFTBACKOFFMAX(8) | V_PERSHIFTMAX(8) | + V_KEEPALIVEMAX(9)); + +#define SECONDS * tps + + t3_write_reg(adap, A_TP_MSL, adap->params.rev > 0 ? 0 : 2 SECONDS); + t3_write_reg(adap, A_TP_RXT_MIN, tps / (1000 / TP_RTO_MIN)); + t3_write_reg(adap, A_TP_RXT_MAX, 64 SECONDS); + t3_write_reg(adap, A_TP_PERS_MIN, 5 SECONDS); + t3_write_reg(adap, A_TP_PERS_MAX, 64 SECONDS); + t3_write_reg(adap, A_TP_KEEP_IDLE, 7200 SECONDS); + t3_write_reg(adap, A_TP_KEEP_INTVL, 75 SECONDS); + t3_write_reg(adap, A_TP_INIT_SRTT, 3 SECONDS); + t3_write_reg(adap, A_TP_FINWAIT2_TIMER, 600 SECONDS); + +#undef SECONDS +} + +/** + * t3_tp_set_coalescing_size - set receive coalescing size + * @adap: the adapter + * @size: the receive coalescing size + * @psh: whether a set PSH bit should deliver coalesced data + * + * Set the receive coalescing size and PSH bit handling. + */ +static int t3_tp_set_coalescing_size(struct adapter *adap, + unsigned int size, int psh) +{ + u32 val; + + if (size > MAX_RX_COALESCING_LEN) + return -EINVAL; + + val = t3_read_reg(adap, A_TP_PARA_REG3); + val &= ~(F_RXCOALESCEENABLE | F_RXCOALESCEPSHEN); + + if (size) { + val |= F_RXCOALESCEENABLE; + if (psh) + val |= F_RXCOALESCEPSHEN; + size = min(MAX_RX_COALESCING_LEN, size); + t3_write_reg(adap, A_TP_PARA_REG2, V_RXCOALESCESIZE(size) | + V_MAXRXDATA(MAX_RX_COALESCING_LEN)); + } + t3_write_reg(adap, A_TP_PARA_REG3, val); + return 0; +} + +/** + * t3_tp_set_max_rxsize - set the max receive size + * @adap: the adapter + * @size: the max receive size + * + * Set TP's max receive size. This is the limit that applies when + * receive coalescing is disabled. + */ +static void t3_tp_set_max_rxsize(struct adapter *adap, unsigned int size) +{ + t3_write_reg(adap, A_TP_PARA_REG7, + V_PMMAXXFERLEN0(size) | V_PMMAXXFERLEN1(size)); +} + +static void init_mtus(unsigned short mtus[]) +{ + /* + * See draft-mathis-plpmtud-00.txt for the values. The min is 88 so + * it can accommodate max size TCP/IP headers when SACK and timestamps + * are enabled and still have at least 8 bytes of payload. + */ + mtus[0] = 88; + mtus[1] = 88; + mtus[2] = 256; + mtus[3] = 512; + mtus[4] = 576; + mtus[5] = 1024; + mtus[6] = 1280; + mtus[7] = 1492; + mtus[8] = 1500; + mtus[9] = 2002; + mtus[10] = 2048; + mtus[11] = 4096; + mtus[12] = 4352; + mtus[13] = 8192; + mtus[14] = 9000; + mtus[15] = 9600; +} + +/* + * Initial congestion control parameters. + */ +static void init_cong_ctrl(unsigned short *a, unsigned short *b) +{ + a[0] = a[1] = a[2] = a[3] = a[4] = a[5] = a[6] = a[7] = a[8] = 1; + a[9] = 2; + a[10] = 3; + a[11] = 4; + a[12] = 5; + a[13] = 6; + a[14] = 7; + a[15] = 8; + a[16] = 9; + a[17] = 10; + a[18] = 14; + a[19] = 17; + a[20] = 21; + a[21] = 25; + a[22] = 30; + a[23] = 35; + a[24] = 45; + a[25] = 60; + a[26] = 80; + a[27] = 100; + a[28] = 200; + a[29] = 300; + a[30] = 400; + a[31] = 500; + + b[0] = b[1] = b[2] = b[3] = b[4] = b[5] = b[6] = b[7] = b[8] = 0; + b[9] = b[10] = 1; + b[11] = b[12] = 2; + b[13] = b[14] = b[15] = b[16] = 3; + b[17] = b[18] = b[19] = b[20] = b[21] = 4; + b[22] = b[23] = b[24] = b[25] = b[26] = b[27] = 5; + b[28] = b[29] = 6; + b[30] = b[31] = 7; +} + +/* The minimum additive increment value for the congestion control table */ +#define CC_MIN_INCR 2U + +/** + * t3_load_mtus - write the MTU and congestion control HW tables + * @adap: the adapter + * @mtus: the unrestricted values for the MTU table + * @alphs: the values for the congestion control alpha parameter + * @beta: the values for the congestion control beta parameter + * @mtu_cap: the maximum permitted effective MTU + * + * Write the MTU table with the supplied MTUs capping each at &mtu_cap. + * Update the high-speed congestion control table with the supplied alpha, + * beta, and MTUs. + */ +void t3_load_mtus(struct adapter *adap, unsigned short mtus[NMTUS], + unsigned short alpha[NCCTRL_WIN], + unsigned short beta[NCCTRL_WIN], unsigned short mtu_cap) +{ + static const unsigned int avg_pkts[NCCTRL_WIN] = { + 2, 6, 10, 14, 20, 28, 40, 56, 80, 112, 160, 224, 320, 448, 640, + 896, 1281, 1792, 2560, 3584, 5120, 7168, 10240, 14336, 20480, + 28672, 40960, 57344, 81920, 114688, 163840, 229376 + }; + + unsigned int i, w; + + for (i = 0; i < NMTUS; ++i) { + unsigned int mtu = min(mtus[i], mtu_cap); + unsigned int log2 = fls(mtu); + + if (!(mtu & ((1 << log2) >> 2))) /* round */ + log2--; + t3_write_reg(adap, A_TP_MTU_TABLE, + (i << 24) | (log2 << 16) | mtu); + + for (w = 0; w < NCCTRL_WIN; ++w) { + unsigned int inc; + + inc = max(((mtu - 40) * alpha[w]) / avg_pkts[w], + CC_MIN_INCR); + + t3_write_reg(adap, A_TP_CCTRL_TABLE, (i << 21) | + (w << 16) | (beta[w] << 13) | inc); + } + } +} + +/** + * t3_tp_get_mib_stats - read TP's MIB counters + * @adap: the adapter + * @tps: holds the returned counter values + * + * Returns the values of TP's MIB counters. + */ +void t3_tp_get_mib_stats(struct adapter *adap, struct tp_mib_stats *tps) +{ + t3_read_indirect(adap, A_TP_MIB_INDEX, A_TP_MIB_RDATA, (u32 *) tps, + sizeof(*tps) / sizeof(u32), 0); +} + +#define ulp_region(adap, name, start, len) \ + t3_write_reg((adap), A_ULPRX_ ## name ## _LLIMIT, (start)); \ + t3_write_reg((adap), A_ULPRX_ ## name ## _ULIMIT, \ + (start) + (len) - 1); \ + start += len + +#define ulptx_region(adap, name, start, len) \ + t3_write_reg((adap), A_ULPTX_ ## name ## _LLIMIT, (start)); \ + t3_write_reg((adap), A_ULPTX_ ## name ## _ULIMIT, \ + (start) + (len) - 1) + +static void ulp_config(struct adapter *adap, const struct tp_params *p) +{ + unsigned int m = p->chan_rx_size; + + ulp_region(adap, ISCSI, m, p->chan_rx_size / 8); + ulp_region(adap, TDDP, m, p->chan_rx_size / 8); + ulptx_region(adap, TPT, m, p->chan_rx_size / 4); + ulp_region(adap, STAG, m, p->chan_rx_size / 4); + ulp_region(adap, RQ, m, p->chan_rx_size / 4); + ulptx_region(adap, PBL, m, p->chan_rx_size / 4); + ulp_region(adap, PBL, m, p->chan_rx_size / 4); + t3_write_reg(adap, A_ULPRX_TDDP_TAGMASK, 0xffffffff); +} + +/** + * t3_set_proto_sram - set the contents of the protocol sram + * @adapter: the adapter + * @data: the protocol image + * + * Write the contents of the protocol SRAM. + */ +int t3_set_proto_sram(struct adapter *adap, const u8 *data) +{ + int i; + const __be32 *buf = (const __be32 *)data; + + for (i = 0; i < PROTO_SRAM_LINES; i++) { + t3_write_reg(adap, A_TP_EMBED_OP_FIELD5, be32_to_cpu(*buf++)); + t3_write_reg(adap, A_TP_EMBED_OP_FIELD4, be32_to_cpu(*buf++)); + t3_write_reg(adap, A_TP_EMBED_OP_FIELD3, be32_to_cpu(*buf++)); + t3_write_reg(adap, A_TP_EMBED_OP_FIELD2, be32_to_cpu(*buf++)); + t3_write_reg(adap, A_TP_EMBED_OP_FIELD1, be32_to_cpu(*buf++)); + + t3_write_reg(adap, A_TP_EMBED_OP_FIELD0, i << 1 | 1 << 31); + if (t3_wait_op_done(adap, A_TP_EMBED_OP_FIELD0, 1, 1, 5, 1)) + return -EIO; + } + t3_write_reg(adap, A_TP_EMBED_OP_FIELD0, 0); + + return 0; +} + +void t3_config_trace_filter(struct adapter *adapter, + const struct trace_params *tp, int filter_index, + int invert, int enable) +{ + u32 addr, key[4], mask[4]; + + key[0] = tp->sport | (tp->sip << 16); + key[1] = (tp->sip >> 16) | (tp->dport << 16); + key[2] = tp->dip; + key[3] = tp->proto | (tp->vlan << 8) | (tp->intf << 20); + + mask[0] = tp->sport_mask | (tp->sip_mask << 16); + mask[1] = (tp->sip_mask >> 16) | (tp->dport_mask << 16); + mask[2] = tp->dip_mask; + mask[3] = tp->proto_mask | (tp->vlan_mask << 8) | (tp->intf_mask << 20); + + if (invert) + key[3] |= (1 << 29); + if (enable) + key[3] |= (1 << 28); + + addr = filter_index ? A_TP_RX_TRC_KEY0 : A_TP_TX_TRC_KEY0; + tp_wr_indirect(adapter, addr++, key[0]); + tp_wr_indirect(adapter, addr++, mask[0]); + tp_wr_indirect(adapter, addr++, key[1]); + tp_wr_indirect(adapter, addr++, mask[1]); + tp_wr_indirect(adapter, addr++, key[2]); + tp_wr_indirect(adapter, addr++, mask[2]); + tp_wr_indirect(adapter, addr++, key[3]); + tp_wr_indirect(adapter, addr, mask[3]); + t3_read_reg(adapter, A_TP_PIO_DATA); +} + +/** + * t3_config_sched - configure a HW traffic scheduler + * @adap: the adapter + * @kbps: target rate in Kbps + * @sched: the scheduler index + * + * Configure a HW scheduler for the target rate + */ +int t3_config_sched(struct adapter *adap, unsigned int kbps, int sched) +{ + unsigned int v, tps, cpt, bpt, delta, mindelta = ~0; + unsigned int clk = adap->params.vpd.cclk * 1000; + unsigned int selected_cpt = 0, selected_bpt = 0; + + if (kbps > 0) { + kbps *= 125; /* -> bytes */ + for (cpt = 1; cpt <= 255; cpt++) { + tps = clk / cpt; + bpt = (kbps + tps / 2) / tps; + if (bpt > 0 && bpt <= 255) { + v = bpt * tps; + delta = v >= kbps ? v - kbps : kbps - v; + if (delta <= mindelta) { + mindelta = delta; + selected_cpt = cpt; + selected_bpt = bpt; + } + } else if (selected_cpt) + break; + } + if (!selected_cpt) + return -EINVAL; + } + t3_write_reg(adap, A_TP_TM_PIO_ADDR, + A_TP_TX_MOD_Q1_Q0_RATE_LIMIT - sched / 2); + v = t3_read_reg(adap, A_TP_TM_PIO_DATA); + if (sched & 1) + v = (v & 0xffff) | (selected_cpt << 16) | (selected_bpt << 24); + else + v = (v & 0xffff0000) | selected_cpt | (selected_bpt << 8); + t3_write_reg(adap, A_TP_TM_PIO_DATA, v); + return 0; +} + +static int tp_init(struct adapter *adap, const struct tp_params *p) +{ + int busy = 0; + + tp_config(adap, p); + t3_set_vlan_accel(adap, 3, 0); + + if (is_offload(adap)) { + tp_set_timers(adap, adap->params.vpd.cclk * 1000); + t3_write_reg(adap, A_TP_RESET, F_FLSTINITENABLE); + busy = t3_wait_op_done(adap, A_TP_RESET, F_FLSTINITENABLE, + 0, 1000, 5); + if (busy) + CH_ERR(adap, "TP initialization timed out\n"); + } + + if (!busy) + t3_write_reg(adap, A_TP_RESET, F_TPRESET); + return busy; +} + +/* + * Perform the bits of HW initialization that are dependent on the Tx + * channels being used. + */ +static void chan_init_hw(struct adapter *adap, unsigned int chan_map) +{ + int i; + + if (chan_map != 3) { /* one channel */ + t3_set_reg_field(adap, A_ULPRX_CTL, F_ROUND_ROBIN, 0); + t3_set_reg_field(adap, A_ULPTX_CONFIG, F_CFG_RR_ARB, 0); + t3_write_reg(adap, A_MPS_CFG, F_TPRXPORTEN | F_ENFORCEPKT | + (chan_map == 1 ? F_TPTXPORT0EN | F_PORT0ACTIVE : + F_TPTXPORT1EN | F_PORT1ACTIVE)); + t3_write_reg(adap, A_PM1_TX_CFG, + chan_map == 1 ? 0xffffffff : 0); + } else { /* two channels */ + t3_set_reg_field(adap, A_ULPRX_CTL, 0, F_ROUND_ROBIN); + t3_set_reg_field(adap, A_ULPTX_CONFIG, 0, F_CFG_RR_ARB); + t3_write_reg(adap, A_ULPTX_DMA_WEIGHT, + V_D1_WEIGHT(16) | V_D0_WEIGHT(16)); + t3_write_reg(adap, A_MPS_CFG, F_TPTXPORT0EN | F_TPTXPORT1EN | + F_TPRXPORTEN | F_PORT0ACTIVE | F_PORT1ACTIVE | + F_ENFORCEPKT); + t3_write_reg(adap, A_PM1_TX_CFG, 0x80008000); + t3_set_reg_field(adap, A_TP_PC_CONFIG, 0, F_TXTOSQUEUEMAPMODE); + t3_write_reg(adap, A_TP_TX_MOD_QUEUE_REQ_MAP, + V_TX_MOD_QUEUE_REQ_MAP(0xaa)); + for (i = 0; i < 16; i++) + t3_write_reg(adap, A_TP_TX_MOD_QUE_TABLE, + (i << 16) | 0x1010); + } +} + +static int calibrate_xgm(struct adapter *adapter) +{ + if (uses_xaui(adapter)) { + unsigned int v, i; + + for (i = 0; i < 5; ++i) { + t3_write_reg(adapter, A_XGM_XAUI_IMP, 0); + t3_read_reg(adapter, A_XGM_XAUI_IMP); + msleep(1); + v = t3_read_reg(adapter, A_XGM_XAUI_IMP); + if (!(v & (F_XGM_CALFAULT | F_CALBUSY))) { + t3_write_reg(adapter, A_XGM_XAUI_IMP, + V_XAUIIMP(G_CALIMP(v) >> 2)); + return 0; + } + } + CH_ERR(adapter, "MAC calibration failed\n"); + return -1; + } else { + t3_write_reg(adapter, A_XGM_RGMII_IMP, + V_RGMIIIMPPD(2) | V_RGMIIIMPPU(3)); + t3_set_reg_field(adapter, A_XGM_RGMII_IMP, F_XGM_IMPSETUPDATE, + F_XGM_IMPSETUPDATE); + } + return 0; +} + +static void calibrate_xgm_t3b(struct adapter *adapter) +{ + if (!uses_xaui(adapter)) { + t3_write_reg(adapter, A_XGM_RGMII_IMP, F_CALRESET | + F_CALUPDATE | V_RGMIIIMPPD(2) | V_RGMIIIMPPU(3)); + t3_set_reg_field(adapter, A_XGM_RGMII_IMP, F_CALRESET, 0); + t3_set_reg_field(adapter, A_XGM_RGMII_IMP, 0, + F_XGM_IMPSETUPDATE); + t3_set_reg_field(adapter, A_XGM_RGMII_IMP, F_XGM_IMPSETUPDATE, + 0); + t3_set_reg_field(adapter, A_XGM_RGMII_IMP, F_CALUPDATE, 0); + t3_set_reg_field(adapter, A_XGM_RGMII_IMP, 0, F_CALUPDATE); + } +} + +struct mc7_timing_params { + unsigned char ActToPreDly; + unsigned char ActToRdWrDly; + unsigned char PreCyc; + unsigned char RefCyc[5]; + unsigned char BkCyc; + unsigned char WrToRdDly; + unsigned char RdToWrDly; +}; + +/* + * Write a value to a register and check that the write completed. These + * writes normally complete in a cycle or two, so one read should suffice. + * The very first read exists to flush the posted write to the device. + */ +static int wrreg_wait(struct adapter *adapter, unsigned int addr, u32 val) +{ + t3_write_reg(adapter, addr, val); + t3_read_reg(adapter, addr); /* flush */ + if (!(t3_read_reg(adapter, addr) & F_BUSY)) + return 0; + CH_ERR(adapter, "write to MC7 register 0x%x timed out\n", addr); + return -EIO; +} + +static int mc7_init(struct mc7 *mc7, unsigned int mc7_clock, int mem_type) +{ + static const unsigned int mc7_mode[] = { + 0x632, 0x642, 0x652, 0x432, 0x442 + }; + static const struct mc7_timing_params mc7_timings[] = { + {12, 3, 4, {20, 28, 34, 52, 0}, 15, 6, 4}, + {12, 4, 5, {20, 28, 34, 52, 0}, 16, 7, 4}, + {12, 5, 6, {20, 28, 34, 52, 0}, 17, 8, 4}, + {9, 3, 4, {15, 21, 26, 39, 0}, 12, 6, 4}, + {9, 4, 5, {15, 21, 26, 39, 0}, 13, 7, 4} + }; + + u32 val; + unsigned int width, density, slow, attempts; + struct adapter *adapter = mc7->adapter; + const struct mc7_timing_params *p = &mc7_timings[mem_type]; + + if (!mc7->size) + return 0; + + val = t3_read_reg(adapter, mc7->offset + A_MC7_CFG); + slow = val & F_SLOW; + width = G_WIDTH(val); + density = G_DEN(val); + + t3_write_reg(adapter, mc7->offset + A_MC7_CFG, val | F_IFEN); + val = t3_read_reg(adapter, mc7->offset + A_MC7_CFG); /* flush */ + msleep(1); + + if (!slow) { + t3_write_reg(adapter, mc7->offset + A_MC7_CAL, F_SGL_CAL_EN); + t3_read_reg(adapter, mc7->offset + A_MC7_CAL); + msleep(1); + if (t3_read_reg(adapter, mc7->offset + A_MC7_CAL) & + (F_BUSY | F_SGL_CAL_EN | F_CAL_FAULT)) { + CH_ERR(adapter, "%s MC7 calibration timed out\n", + mc7->name); + goto out_fail; + } + } + + t3_write_reg(adapter, mc7->offset + A_MC7_PARM, + V_ACTTOPREDLY(p->ActToPreDly) | + V_ACTTORDWRDLY(p->ActToRdWrDly) | V_PRECYC(p->PreCyc) | + V_REFCYC(p->RefCyc[density]) | V_BKCYC(p->BkCyc) | + V_WRTORDDLY(p->WrToRdDly) | V_RDTOWRDLY(p->RdToWrDly)); + + t3_write_reg(adapter, mc7->offset + A_MC7_CFG, + val | F_CLKEN | F_TERM150); + t3_read_reg(adapter, mc7->offset + A_MC7_CFG); /* flush */ + + if (!slow) + t3_set_reg_field(adapter, mc7->offset + A_MC7_DLL, F_DLLENB, + F_DLLENB); + udelay(1); + + val = slow ? 3 : 6; + if (wrreg_wait(adapter, mc7->offset + A_MC7_PRE, 0) || + wrreg_wait(adapter, mc7->offset + A_MC7_EXT_MODE2, 0) || + wrreg_wait(adapter, mc7->offset + A_MC7_EXT_MODE3, 0) || + wrreg_wait(adapter, mc7->offset + A_MC7_EXT_MODE1, val)) + goto out_fail; + + if (!slow) { + t3_write_reg(adapter, mc7->offset + A_MC7_MODE, 0x100); + t3_set_reg_field(adapter, mc7->offset + A_MC7_DLL, F_DLLRST, 0); + udelay(5); + } + + if (wrreg_wait(adapter, mc7->offset + A_MC7_PRE, 0) || + wrreg_wait(adapter, mc7->offset + A_MC7_REF, 0) || + wrreg_wait(adapter, mc7->offset + A_MC7_REF, 0) || + wrreg_wait(adapter, mc7->offset + A_MC7_MODE, + mc7_mode[mem_type]) || + wrreg_wait(adapter, mc7->offset + A_MC7_EXT_MODE1, val | 0x380) || + wrreg_wait(adapter, mc7->offset + A_MC7_EXT_MODE1, val)) + goto out_fail; + + /* clock value is in KHz */ + mc7_clock = mc7_clock * 7812 + mc7_clock / 2; /* ns */ + mc7_clock /= 1000000; /* KHz->MHz, ns->us */ + + t3_write_reg(adapter, mc7->offset + A_MC7_REF, + F_PERREFEN | V_PREREFDIV(mc7_clock)); + t3_read_reg(adapter, mc7->offset + A_MC7_REF); /* flush */ + + t3_write_reg(adapter, mc7->offset + A_MC7_ECC, F_ECCGENEN | F_ECCCHKEN); + t3_write_reg(adapter, mc7->offset + A_MC7_BIST_DATA, 0); + t3_write_reg(adapter, mc7->offset + A_MC7_BIST_ADDR_BEG, 0); + t3_write_reg(adapter, mc7->offset + A_MC7_BIST_ADDR_END, + (mc7->size << width) - 1); + t3_write_reg(adapter, mc7->offset + A_MC7_BIST_OP, V_OP(1)); + t3_read_reg(adapter, mc7->offset + A_MC7_BIST_OP); /* flush */ + + attempts = 50; + do { + msleep(250); + val = t3_read_reg(adapter, mc7->offset + A_MC7_BIST_OP); + } while ((val & F_BUSY) && --attempts); + if (val & F_BUSY) { + CH_ERR(adapter, "%s MC7 BIST timed out\n", mc7->name); + goto out_fail; + } + + /* Enable normal memory accesses. */ + t3_set_reg_field(adapter, mc7->offset + A_MC7_CFG, 0, F_RDY); + return 0; + +out_fail: + return -1; +} + +static void config_pcie(struct adapter *adap) +{ + static const u16 ack_lat[4][6] = { + {237, 416, 559, 1071, 2095, 4143}, + {128, 217, 289, 545, 1057, 2081}, + {73, 118, 154, 282, 538, 1050}, + {67, 107, 86, 150, 278, 534} + }; + static const u16 rpl_tmr[4][6] = { + {711, 1248, 1677, 3213, 6285, 12429}, + {384, 651, 867, 1635, 3171, 6243}, + {219, 354, 462, 846, 1614, 3150}, + {201, 321, 258, 450, 834, 1602} + }; + + u16 val, devid; + unsigned int log2_width, pldsize; + unsigned int fst_trn_rx, fst_trn_tx, acklat, rpllmt; + + pcie_capability_read_word(adap->pdev, PCI_EXP_DEVCTL, &val); + pldsize = (val & PCI_EXP_DEVCTL_PAYLOAD) >> 5; + + pci_read_config_word(adap->pdev, 0x2, &devid); + if (devid == 0x37) { + pcie_capability_write_word(adap->pdev, PCI_EXP_DEVCTL, + val & ~PCI_EXP_DEVCTL_READRQ & + ~PCI_EXP_DEVCTL_PAYLOAD); + pldsize = 0; + } + + pcie_capability_read_word(adap->pdev, PCI_EXP_LNKCTL, &val); + + fst_trn_tx = G_NUMFSTTRNSEQ(t3_read_reg(adap, A_PCIE_PEX_CTRL0)); + fst_trn_rx = adap->params.rev == 0 ? fst_trn_tx : + G_NUMFSTTRNSEQRX(t3_read_reg(adap, A_PCIE_MODE)); + log2_width = fls(adap->params.pci.width) - 1; + acklat = ack_lat[log2_width][pldsize]; + if (val & PCI_EXP_LNKCTL_ASPM_L0S) /* check LOsEnable */ + acklat += fst_trn_tx * 4; + rpllmt = rpl_tmr[log2_width][pldsize] + fst_trn_rx * 4; + + if (adap->params.rev == 0) + t3_set_reg_field(adap, A_PCIE_PEX_CTRL1, + V_T3A_ACKLAT(M_T3A_ACKLAT), + V_T3A_ACKLAT(acklat)); + else + t3_set_reg_field(adap, A_PCIE_PEX_CTRL1, V_ACKLAT(M_ACKLAT), + V_ACKLAT(acklat)); + + t3_set_reg_field(adap, A_PCIE_PEX_CTRL0, V_REPLAYLMT(M_REPLAYLMT), + V_REPLAYLMT(rpllmt)); + + t3_write_reg(adap, A_PCIE_PEX_ERR, 0xffffffff); + t3_set_reg_field(adap, A_PCIE_CFG, 0, + F_ENABLELINKDWNDRST | F_ENABLELINKDOWNRST | + F_PCIE_DMASTOPEN | F_PCIE_CLIDECEN); +} + +/* + * Initialize and configure T3 HW modules. This performs the + * initialization steps that need to be done once after a card is reset. + * MAC and PHY initialization is handled separarely whenever a port is enabled. + * + * fw_params are passed to FW and their value is platform dependent. Only the + * top 8 bits are available for use, the rest must be 0. + */ +int t3_init_hw(struct adapter *adapter, u32 fw_params) +{ + int err = -EIO, attempts, i; + const struct vpd_params *vpd = &adapter->params.vpd; + + if (adapter->params.rev > 0) + calibrate_xgm_t3b(adapter); + else if (calibrate_xgm(adapter)) + goto out_err; + + if (vpd->mclk) { + partition_mem(adapter, &adapter->params.tp); + + if (mc7_init(&adapter->pmrx, vpd->mclk, vpd->mem_timing) || + mc7_init(&adapter->pmtx, vpd->mclk, vpd->mem_timing) || + mc7_init(&adapter->cm, vpd->mclk, vpd->mem_timing) || + t3_mc5_init(&adapter->mc5, adapter->params.mc5.nservers, + adapter->params.mc5.nfilters, + adapter->params.mc5.nroutes)) + goto out_err; + + for (i = 0; i < 32; i++) + if (clear_sge_ctxt(adapter, i, F_CQ)) + goto out_err; + } + + if (tp_init(adapter, &adapter->params.tp)) + goto out_err; + + t3_tp_set_coalescing_size(adapter, + min(adapter->params.sge.max_pkt_size, + MAX_RX_COALESCING_LEN), 1); + t3_tp_set_max_rxsize(adapter, + min(adapter->params.sge.max_pkt_size, 16384U)); + ulp_config(adapter, &adapter->params.tp); + + if (is_pcie(adapter)) + config_pcie(adapter); + else + t3_set_reg_field(adapter, A_PCIX_CFG, 0, + F_DMASTOPEN | F_CLIDECEN); + + if (adapter->params.rev == T3_REV_C) + t3_set_reg_field(adapter, A_ULPTX_CONFIG, 0, + F_CFG_CQE_SOP_MASK); + + t3_write_reg(adapter, A_PM1_RX_CFG, 0xffffffff); + t3_write_reg(adapter, A_PM1_RX_MODE, 0); + t3_write_reg(adapter, A_PM1_TX_MODE, 0); + chan_init_hw(adapter, adapter->params.chan_map); + t3_sge_init(adapter, &adapter->params.sge); + t3_set_reg_field(adapter, A_PL_RST, 0, F_FATALPERREN); + + t3_write_reg(adapter, A_T3DBG_GPIO_ACT_LOW, calc_gpio_intr(adapter)); + + t3_write_reg(adapter, A_CIM_HOST_ACC_DATA, vpd->uclk | fw_params); + t3_write_reg(adapter, A_CIM_BOOT_CFG, + V_BOOTADDR(FW_FLASH_BOOT_ADDR >> 2)); + t3_read_reg(adapter, A_CIM_BOOT_CFG); /* flush */ + + attempts = 100; + do { /* wait for uP to initialize */ + msleep(20); + } while (t3_read_reg(adapter, A_CIM_HOST_ACC_DATA) && --attempts); + if (!attempts) { + CH_ERR(adapter, "uP initialization timed out\n"); + goto out_err; + } + + err = 0; +out_err: + return err; +} + +/** + * get_pci_mode - determine a card's PCI mode + * @adapter: the adapter + * @p: where to store the PCI settings + * + * Determines a card's PCI mode and associated parameters, such as speed + * and width. + */ +static void get_pci_mode(struct adapter *adapter, struct pci_params *p) +{ + static unsigned short speed_map[] = { 33, 66, 100, 133 }; + u32 pci_mode; + + if (pci_is_pcie(adapter->pdev)) { + u16 val; + + p->variant = PCI_VARIANT_PCIE; + pcie_capability_read_word(adapter->pdev, PCI_EXP_LNKSTA, &val); + p->width = (val >> 4) & 0x3f; + return; + } + + pci_mode = t3_read_reg(adapter, A_PCIX_MODE); + p->speed = speed_map[G_PCLKRANGE(pci_mode)]; + p->width = (pci_mode & F_64BIT) ? 64 : 32; + pci_mode = G_PCIXINITPAT(pci_mode); + if (pci_mode == 0) + p->variant = PCI_VARIANT_PCI; + else if (pci_mode < 4) + p->variant = PCI_VARIANT_PCIX_MODE1_PARITY; + else if (pci_mode < 8) + p->variant = PCI_VARIANT_PCIX_MODE1_ECC; + else + p->variant = PCI_VARIANT_PCIX_266_MODE2; +} + +/** + * init_link_config - initialize a link's SW state + * @lc: structure holding the link state + * @ai: information about the current card + * + * Initializes the SW state maintained for each link, including the link's + * capabilities and default speed/duplex/flow-control/autonegotiation + * settings. + */ +static void init_link_config(struct link_config *lc, unsigned int caps) +{ + lc->supported = caps; + lc->requested_speed = lc->speed = SPEED_INVALID; + lc->requested_duplex = lc->duplex = DUPLEX_INVALID; + lc->requested_fc = lc->fc = PAUSE_RX | PAUSE_TX; + if (lc->supported & SUPPORTED_Autoneg) { + lc->advertising = lc->supported; + lc->autoneg = AUTONEG_ENABLE; + lc->requested_fc |= PAUSE_AUTONEG; + } else { + lc->advertising = 0; + lc->autoneg = AUTONEG_DISABLE; + } +} + +/** + * mc7_calc_size - calculate MC7 memory size + * @cfg: the MC7 configuration + * + * Calculates the size of an MC7 memory in bytes from the value of its + * configuration register. + */ +static unsigned int mc7_calc_size(u32 cfg) +{ + unsigned int width = G_WIDTH(cfg); + unsigned int banks = !!(cfg & F_BKS) + 1; + unsigned int org = !!(cfg & F_ORG) + 1; + unsigned int density = G_DEN(cfg); + unsigned int MBs = ((256 << density) * banks) / (org << width); + + return MBs << 20; +} + +static void mc7_prep(struct adapter *adapter, struct mc7 *mc7, + unsigned int base_addr, const char *name) +{ + u32 cfg; + + mc7->adapter = adapter; + mc7->name = name; + mc7->offset = base_addr - MC7_PMRX_BASE_ADDR; + cfg = t3_read_reg(adapter, mc7->offset + A_MC7_CFG); + mc7->size = G_DEN(cfg) == M_DEN ? 0 : mc7_calc_size(cfg); + mc7->width = G_WIDTH(cfg); +} + +static void mac_prep(struct cmac *mac, struct adapter *adapter, int index) +{ + u16 devid; + + mac->adapter = adapter; + pci_read_config_word(adapter->pdev, 0x2, &devid); + + if (devid == 0x37 && !adapter->params.vpd.xauicfg[1]) + index = 0; + mac->offset = (XGMAC0_1_BASE_ADDR - XGMAC0_0_BASE_ADDR) * index; + mac->nucast = 1; + + if (adapter->params.rev == 0 && uses_xaui(adapter)) { + t3_write_reg(adapter, A_XGM_SERDES_CTRL + mac->offset, + is_10G(adapter) ? 0x2901c04 : 0x2301c04); + t3_set_reg_field(adapter, A_XGM_PORT_CFG + mac->offset, + F_ENRGMII, 0); + } +} + +static void early_hw_init(struct adapter *adapter, + const struct adapter_info *ai) +{ + u32 val = V_PORTSPEED(is_10G(adapter) ? 3 : 2); + + mi1_init(adapter, ai); + t3_write_reg(adapter, A_I2C_CFG, /* set for 80KHz */ + V_I2C_CLKDIV(adapter->params.vpd.cclk / 80 - 1)); + t3_write_reg(adapter, A_T3DBG_GPIO_EN, + ai->gpio_out | F_GPIO0_OEN | F_GPIO0_OUT_VAL); + t3_write_reg(adapter, A_MC5_DB_SERVER_INDEX, 0); + t3_write_reg(adapter, A_SG_OCO_BASE, V_BASE1(0xfff)); + + if (adapter->params.rev == 0 || !uses_xaui(adapter)) + val |= F_ENRGMII; + + /* Enable MAC clocks so we can access the registers */ + t3_write_reg(adapter, A_XGM_PORT_CFG, val); + t3_read_reg(adapter, A_XGM_PORT_CFG); + + val |= F_CLKDIVRESET_; + t3_write_reg(adapter, A_XGM_PORT_CFG, val); + t3_read_reg(adapter, A_XGM_PORT_CFG); + t3_write_reg(adapter, XGM_REG(A_XGM_PORT_CFG, 1), val); + t3_read_reg(adapter, A_XGM_PORT_CFG); +} + +/* + * Reset the adapter. + * Older PCIe cards lose their config space during reset, PCI-X + * ones don't. + */ +int t3_reset_adapter(struct adapter *adapter) +{ + int i, save_and_restore_pcie = + adapter->params.rev < T3_REV_B2 && is_pcie(adapter); + uint16_t devid = 0; + + if (save_and_restore_pcie) + pci_save_state(adapter->pdev); + t3_write_reg(adapter, A_PL_RST, F_CRSTWRM | F_CRSTWRMMODE); + + /* + * Delay. Give Some time to device to reset fully. + * XXX The delay time should be modified. + */ + for (i = 0; i < 10; i++) { + msleep(50); + pci_read_config_word(adapter->pdev, 0x00, &devid); + if (devid == 0x1425) + break; + } + + if (devid != 0x1425) + return -1; + + if (save_and_restore_pcie) + pci_restore_state(adapter->pdev); + return 0; +} + +static int init_parity(struct adapter *adap) +{ + int i, err, addr; + + if (t3_read_reg(adap, A_SG_CONTEXT_CMD) & F_CONTEXT_CMD_BUSY) + return -EBUSY; + + for (err = i = 0; !err && i < 16; i++) + err = clear_sge_ctxt(adap, i, F_EGRESS); + for (i = 0xfff0; !err && i <= 0xffff; i++) + err = clear_sge_ctxt(adap, i, F_EGRESS); + for (i = 0; !err && i < SGE_QSETS; i++) + err = clear_sge_ctxt(adap, i, F_RESPONSEQ); + if (err) + return err; + + t3_write_reg(adap, A_CIM_IBQ_DBG_DATA, 0); + for (i = 0; i < 4; i++) + for (addr = 0; addr <= M_IBQDBGADDR; addr++) { + t3_write_reg(adap, A_CIM_IBQ_DBG_CFG, F_IBQDBGEN | + F_IBQDBGWR | V_IBQDBGQID(i) | + V_IBQDBGADDR(addr)); + err = t3_wait_op_done(adap, A_CIM_IBQ_DBG_CFG, + F_IBQDBGBUSY, 0, 2, 1); + if (err) + return err; + } + return 0; +} + +/* + * Initialize adapter SW state for the various HW modules, set initial values + * for some adapter tunables, take PHYs out of reset, and initialize the MDIO + * interface. + */ +int t3_prep_adapter(struct adapter *adapter, const struct adapter_info *ai, + int reset) +{ + int ret; + unsigned int i, j = -1; + + get_pci_mode(adapter, &adapter->params.pci); + + adapter->params.info = ai; + adapter->params.nports = ai->nports0 + ai->nports1; + adapter->params.chan_map = (!!ai->nports0) | (!!ai->nports1 << 1); + adapter->params.rev = t3_read_reg(adapter, A_PL_REV); + /* + * We used to only run the "adapter check task" once a second if + * we had PHYs which didn't support interrupts (we would check + * their link status once a second). Now we check other conditions + * in that routine which could potentially impose a very high + * interrupt load on the system. As such, we now always scan the + * adapter state once a second ... + */ + adapter->params.linkpoll_period = 10; + adapter->params.stats_update_period = is_10G(adapter) ? + MAC_STATS_ACCUM_SECS : (MAC_STATS_ACCUM_SECS * 10); + adapter->params.pci.vpd_cap_addr = + pci_find_capability(adapter->pdev, PCI_CAP_ID_VPD); + ret = get_vpd_params(adapter, &adapter->params.vpd); + if (ret < 0) + return ret; + + if (reset && t3_reset_adapter(adapter)) + return -1; + + t3_sge_prep(adapter, &adapter->params.sge); + + if (adapter->params.vpd.mclk) { + struct tp_params *p = &adapter->params.tp; + + mc7_prep(adapter, &adapter->pmrx, MC7_PMRX_BASE_ADDR, "PMRX"); + mc7_prep(adapter, &adapter->pmtx, MC7_PMTX_BASE_ADDR, "PMTX"); + mc7_prep(adapter, &adapter->cm, MC7_CM_BASE_ADDR, "CM"); + + p->nchan = adapter->params.chan_map == 3 ? 2 : 1; + p->pmrx_size = t3_mc7_size(&adapter->pmrx); + p->pmtx_size = t3_mc7_size(&adapter->pmtx); + p->cm_size = t3_mc7_size(&adapter->cm); + p->chan_rx_size = p->pmrx_size / 2; /* only 1 Rx channel */ + p->chan_tx_size = p->pmtx_size / p->nchan; + p->rx_pg_size = 64 * 1024; + p->tx_pg_size = is_10G(adapter) ? 64 * 1024 : 16 * 1024; + p->rx_num_pgs = pm_num_pages(p->chan_rx_size, p->rx_pg_size); + p->tx_num_pgs = pm_num_pages(p->chan_tx_size, p->tx_pg_size); + p->ntimer_qs = p->cm_size >= (128 << 20) || + adapter->params.rev > 0 ? 12 : 6; + } + + adapter->params.offload = t3_mc7_size(&adapter->pmrx) && + t3_mc7_size(&adapter->pmtx) && + t3_mc7_size(&adapter->cm); + + if (is_offload(adapter)) { + adapter->params.mc5.nservers = DEFAULT_NSERVERS; + adapter->params.mc5.nfilters = adapter->params.rev > 0 ? + DEFAULT_NFILTERS : 0; + adapter->params.mc5.nroutes = 0; + t3_mc5_prep(adapter, &adapter->mc5, MC5_MODE_144_BIT); + + init_mtus(adapter->params.mtus); + init_cong_ctrl(adapter->params.a_wnd, adapter->params.b_wnd); + } + + early_hw_init(adapter, ai); + ret = init_parity(adapter); + if (ret) + return ret; + + for_each_port(adapter, i) { + u8 hw_addr[6]; + const struct port_type_info *pti; + struct port_info *p = adap2pinfo(adapter, i); + + while (!adapter->params.vpd.port_type[++j]) + ; + + pti = &port_types[adapter->params.vpd.port_type[j]]; + if (!pti->phy_prep) { + CH_ALERT(adapter, "Invalid port type index %d\n", + adapter->params.vpd.port_type[j]); + return -EINVAL; + } + + p->phy.mdio.dev = adapter->port[i]; + ret = pti->phy_prep(&p->phy, adapter, ai->phy_base_addr + j, + ai->mdio_ops); + if (ret) + return ret; + mac_prep(&p->mac, adapter, j); + + /* + * The VPD EEPROM stores the base Ethernet address for the + * card. A port's address is derived from the base by adding + * the port's index to the base's low octet. + */ + memcpy(hw_addr, adapter->params.vpd.eth_base, 5); + hw_addr[5] = adapter->params.vpd.eth_base[5] + i; + + memcpy(adapter->port[i]->dev_addr, hw_addr, + ETH_ALEN); + init_link_config(&p->link_config, p->phy.caps); + p->phy.ops->power_down(&p->phy, 1); + + /* + * If the PHY doesn't support interrupts for link status + * changes, schedule a scan of the adapter links at least + * once a second. + */ + if (!(p->phy.caps & SUPPORTED_IRQ) && + adapter->params.linkpoll_period > 10) + adapter->params.linkpoll_period = 10; + } + + return 0; +} + +void t3_led_ready(struct adapter *adapter) +{ + t3_set_reg_field(adapter, A_T3DBG_GPIO_EN, F_GPIO0_OUT_VAL, + F_GPIO0_OUT_VAL); +} + +int t3_replay_prep_adapter(struct adapter *adapter) +{ + const struct adapter_info *ai = adapter->params.info; + unsigned int i, j = -1; + int ret; + + early_hw_init(adapter, ai); + ret = init_parity(adapter); + if (ret) + return ret; + + for_each_port(adapter, i) { + const struct port_type_info *pti; + struct port_info *p = adap2pinfo(adapter, i); + + while (!adapter->params.vpd.port_type[++j]) + ; + + pti = &port_types[adapter->params.vpd.port_type[j]]; + ret = pti->phy_prep(&p->phy, adapter, p->phy.mdio.prtad, NULL); + if (ret) + return ret; + p->phy.ops->power_down(&p->phy, 1); + } + +return 0; +} + diff --git a/drivers/net/ethernet/chelsio/cxgb3/t3cdev.h b/drivers/net/ethernet/chelsio/cxgb3/t3cdev.h new file mode 100644 index 0000000..705713b --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb3/t3cdev.h @@ -0,0 +1,70 @@ +/* + * Copyright (C) 2006-2008 Chelsio Communications. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef _T3CDEV_H_ +#define _T3CDEV_H_ + +#include +#include +#include +#include +#include +#include + +#define T3CNAMSIZ 16 + +struct cxgb3_client; + +enum t3ctype { + T3A = 0, + T3B, + T3C, +}; + +struct t3cdev { + char name[T3CNAMSIZ]; /* T3C device name */ + enum t3ctype type; + struct list_head ofld_dev_list; /* for list linking */ + struct net_device *lldev; /* LL dev associated with T3C messages */ + struct proc_dir_entry *proc_dir; /* root of proc dir for this T3C */ + int (*send)(struct t3cdev *dev, struct sk_buff *skb); + int (*recv)(struct t3cdev *dev, struct sk_buff **skb, int n); + int (*ctl)(struct t3cdev *dev, unsigned int req, void *data); + void (*neigh_update)(struct t3cdev *dev, struct neighbour *neigh); + void *priv; /* driver private data */ + void *l2opt; /* optional layer 2 data */ + void *l3opt; /* optional layer 3 data */ + void *l4opt; /* optional layer 4 data */ + void *ulp; /* ulp stuff */ + void *ulp_iscsi; /* ulp iscsi */ +}; + +#endif /* _T3CDEV_H_ */ diff --git a/drivers/net/ethernet/chelsio/cxgb3/version.h b/drivers/net/ethernet/chelsio/cxgb3/version.h new file mode 100644 index 0000000..165bfb9 --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb3/version.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2003-2008 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +/* $Date: 2006/10/31 18:57:51 $ $RCSfile: version.h,v $ $Revision: 1.3 $ */ +#ifndef __CHELSIO_VERSION_H +#define __CHELSIO_VERSION_H +#define DRV_DESC "Chelsio T3 Network Driver" +#define DRV_NAME "cxgb3" +/* Driver version */ +#define DRV_VERSION "1.1.5-ko" + +/* Firmware version */ +#define FW_VERSION_MAJOR 7 +#define FW_VERSION_MINOR 12 +#define FW_VERSION_MICRO 0 +#endif /* __CHELSIO_VERSION_H */ diff --git a/drivers/net/ethernet/chelsio/cxgb3/vsc8211.c b/drivers/net/ethernet/chelsio/cxgb3/vsc8211.c new file mode 100644 index 0000000..4f9a1c2 --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb3/vsc8211.c @@ -0,0 +1,416 @@ +/* + * Copyright (c) 2005-2008 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "common.h" + +/* VSC8211 PHY specific registers. */ +enum { + VSC8211_SIGDET_CTRL = 19, + VSC8211_EXT_CTRL = 23, + VSC8211_INTR_ENABLE = 25, + VSC8211_INTR_STATUS = 26, + VSC8211_LED_CTRL = 27, + VSC8211_AUX_CTRL_STAT = 28, + VSC8211_EXT_PAGE_AXS = 31, +}; + +enum { + VSC_INTR_RX_ERR = 1 << 0, + VSC_INTR_MS_ERR = 1 << 1, /* master/slave resolution error */ + VSC_INTR_CABLE = 1 << 2, /* cable impairment */ + VSC_INTR_FALSE_CARR = 1 << 3, /* false carrier */ + VSC_INTR_MEDIA_CHG = 1 << 4, /* AMS media change */ + VSC_INTR_RX_FIFO = 1 << 5, /* Rx FIFO over/underflow */ + VSC_INTR_TX_FIFO = 1 << 6, /* Tx FIFO over/underflow */ + VSC_INTR_DESCRAMBL = 1 << 7, /* descrambler lock-lost */ + VSC_INTR_SYMBOL_ERR = 1 << 8, /* symbol error */ + VSC_INTR_NEG_DONE = 1 << 10, /* autoneg done */ + VSC_INTR_NEG_ERR = 1 << 11, /* autoneg error */ + VSC_INTR_DPLX_CHG = 1 << 12, /* duplex change */ + VSC_INTR_LINK_CHG = 1 << 13, /* link change */ + VSC_INTR_SPD_CHG = 1 << 14, /* speed change */ + VSC_INTR_ENABLE = 1 << 15, /* interrupt enable */ +}; + +enum { + VSC_CTRL_CLAUSE37_VIEW = 1 << 4, /* Switch to Clause 37 view */ + VSC_CTRL_MEDIA_MODE_HI = 0xf000 /* High part of media mode select */ +}; + +#define CFG_CHG_INTR_MASK (VSC_INTR_LINK_CHG | VSC_INTR_NEG_ERR | \ + VSC_INTR_DPLX_CHG | VSC_INTR_SPD_CHG | \ + VSC_INTR_NEG_DONE) +#define INTR_MASK (CFG_CHG_INTR_MASK | VSC_INTR_TX_FIFO | VSC_INTR_RX_FIFO | \ + VSC_INTR_ENABLE) + +/* PHY specific auxiliary control & status register fields */ +#define S_ACSR_ACTIPHY_TMR 0 +#define M_ACSR_ACTIPHY_TMR 0x3 +#define V_ACSR_ACTIPHY_TMR(x) ((x) << S_ACSR_ACTIPHY_TMR) + +#define S_ACSR_SPEED 3 +#define M_ACSR_SPEED 0x3 +#define G_ACSR_SPEED(x) (((x) >> S_ACSR_SPEED) & M_ACSR_SPEED) + +#define S_ACSR_DUPLEX 5 +#define F_ACSR_DUPLEX (1 << S_ACSR_DUPLEX) + +#define S_ACSR_ACTIPHY 6 +#define F_ACSR_ACTIPHY (1 << S_ACSR_ACTIPHY) + +/* + * Reset the PHY. This PHY completes reset immediately so we never wait. + */ +static int vsc8211_reset(struct cphy *cphy, int wait) +{ + return t3_phy_reset(cphy, MDIO_DEVAD_NONE, 0); +} + +static int vsc8211_intr_enable(struct cphy *cphy) +{ + return t3_mdio_write(cphy, MDIO_DEVAD_NONE, VSC8211_INTR_ENABLE, + INTR_MASK); +} + +static int vsc8211_intr_disable(struct cphy *cphy) +{ + return t3_mdio_write(cphy, MDIO_DEVAD_NONE, VSC8211_INTR_ENABLE, 0); +} + +static int vsc8211_intr_clear(struct cphy *cphy) +{ + u32 val; + + /* Clear PHY interrupts by reading the register. */ + return t3_mdio_read(cphy, MDIO_DEVAD_NONE, VSC8211_INTR_STATUS, &val); +} + +static int vsc8211_autoneg_enable(struct cphy *cphy) +{ + return t3_mdio_change_bits(cphy, MDIO_DEVAD_NONE, MII_BMCR, + BMCR_PDOWN | BMCR_ISOLATE, + BMCR_ANENABLE | BMCR_ANRESTART); +} + +static int vsc8211_autoneg_restart(struct cphy *cphy) +{ + return t3_mdio_change_bits(cphy, MDIO_DEVAD_NONE, MII_BMCR, + BMCR_PDOWN | BMCR_ISOLATE, + BMCR_ANRESTART); +} + +static int vsc8211_get_link_status(struct cphy *cphy, int *link_ok, + int *speed, int *duplex, int *fc) +{ + unsigned int bmcr, status, lpa, adv; + int err, sp = -1, dplx = -1, pause = 0; + + err = t3_mdio_read(cphy, MDIO_DEVAD_NONE, MII_BMCR, &bmcr); + if (!err) + err = t3_mdio_read(cphy, MDIO_DEVAD_NONE, MII_BMSR, &status); + if (err) + return err; + + if (link_ok) { + /* + * BMSR_LSTATUS is latch-low, so if it is 0 we need to read it + * once more to get the current link state. + */ + if (!(status & BMSR_LSTATUS)) + err = t3_mdio_read(cphy, MDIO_DEVAD_NONE, MII_BMSR, + &status); + if (err) + return err; + *link_ok = (status & BMSR_LSTATUS) != 0; + } + if (!(bmcr & BMCR_ANENABLE)) { + dplx = (bmcr & BMCR_FULLDPLX) ? DUPLEX_FULL : DUPLEX_HALF; + if (bmcr & BMCR_SPEED1000) + sp = SPEED_1000; + else if (bmcr & BMCR_SPEED100) + sp = SPEED_100; + else + sp = SPEED_10; + } else if (status & BMSR_ANEGCOMPLETE) { + err = t3_mdio_read(cphy, MDIO_DEVAD_NONE, VSC8211_AUX_CTRL_STAT, + &status); + if (err) + return err; + + dplx = (status & F_ACSR_DUPLEX) ? DUPLEX_FULL : DUPLEX_HALF; + sp = G_ACSR_SPEED(status); + if (sp == 0) + sp = SPEED_10; + else if (sp == 1) + sp = SPEED_100; + else + sp = SPEED_1000; + + if (fc && dplx == DUPLEX_FULL) { + err = t3_mdio_read(cphy, MDIO_DEVAD_NONE, MII_LPA, + &lpa); + if (!err) + err = t3_mdio_read(cphy, MDIO_DEVAD_NONE, + MII_ADVERTISE, &adv); + if (err) + return err; + + if (lpa & adv & ADVERTISE_PAUSE_CAP) + pause = PAUSE_RX | PAUSE_TX; + else if ((lpa & ADVERTISE_PAUSE_CAP) && + (lpa & ADVERTISE_PAUSE_ASYM) && + (adv & ADVERTISE_PAUSE_ASYM)) + pause = PAUSE_TX; + else if ((lpa & ADVERTISE_PAUSE_ASYM) && + (adv & ADVERTISE_PAUSE_CAP)) + pause = PAUSE_RX; + } + } + if (speed) + *speed = sp; + if (duplex) + *duplex = dplx; + if (fc) + *fc = pause; + return 0; +} + +static int vsc8211_get_link_status_fiber(struct cphy *cphy, int *link_ok, + int *speed, int *duplex, int *fc) +{ + unsigned int bmcr, status, lpa, adv; + int err, sp = -1, dplx = -1, pause = 0; + + err = t3_mdio_read(cphy, MDIO_DEVAD_NONE, MII_BMCR, &bmcr); + if (!err) + err = t3_mdio_read(cphy, MDIO_DEVAD_NONE, MII_BMSR, &status); + if (err) + return err; + + if (link_ok) { + /* + * BMSR_LSTATUS is latch-low, so if it is 0 we need to read it + * once more to get the current link state. + */ + if (!(status & BMSR_LSTATUS)) + err = t3_mdio_read(cphy, MDIO_DEVAD_NONE, MII_BMSR, + &status); + if (err) + return err; + *link_ok = (status & BMSR_LSTATUS) != 0; + } + if (!(bmcr & BMCR_ANENABLE)) { + dplx = (bmcr & BMCR_FULLDPLX) ? DUPLEX_FULL : DUPLEX_HALF; + if (bmcr & BMCR_SPEED1000) + sp = SPEED_1000; + else if (bmcr & BMCR_SPEED100) + sp = SPEED_100; + else + sp = SPEED_10; + } else if (status & BMSR_ANEGCOMPLETE) { + err = t3_mdio_read(cphy, MDIO_DEVAD_NONE, MII_LPA, &lpa); + if (!err) + err = t3_mdio_read(cphy, MDIO_DEVAD_NONE, MII_ADVERTISE, + &adv); + if (err) + return err; + + if (adv & lpa & ADVERTISE_1000XFULL) { + dplx = DUPLEX_FULL; + sp = SPEED_1000; + } else if (adv & lpa & ADVERTISE_1000XHALF) { + dplx = DUPLEX_HALF; + sp = SPEED_1000; + } + + if (fc && dplx == DUPLEX_FULL) { + if (lpa & adv & ADVERTISE_1000XPAUSE) + pause = PAUSE_RX | PAUSE_TX; + else if ((lpa & ADVERTISE_1000XPAUSE) && + (adv & lpa & ADVERTISE_1000XPSE_ASYM)) + pause = PAUSE_TX; + else if ((lpa & ADVERTISE_1000XPSE_ASYM) && + (adv & ADVERTISE_1000XPAUSE)) + pause = PAUSE_RX; + } + } + if (speed) + *speed = sp; + if (duplex) + *duplex = dplx; + if (fc) + *fc = pause; + return 0; +} + +#ifdef UNUSED +/* + * Enable/disable auto MDI/MDI-X in forced link speed mode. + */ +static int vsc8211_set_automdi(struct cphy *phy, int enable) +{ + int err; + + err = t3_mdio_write(phy, MDIO_DEVAD_NONE, VSC8211_EXT_PAGE_AXS, 0x52b5); + if (err) + return err; + + err = t3_mdio_write(phy, MDIO_DEVAD_NONE, 18, 0x12); + if (err) + return err; + + err = t3_mdio_write(phy, MDIO_DEVAD_NONE, 17, enable ? 0x2803 : 0x3003); + if (err) + return err; + + err = t3_mdio_write(phy, MDIO_DEVAD_NONE, 16, 0x87fa); + if (err) + return err; + + err = t3_mdio_write(phy, MDIO_DEVAD_NONE, VSC8211_EXT_PAGE_AXS, 0); + if (err) + return err; + + return 0; +} + +int vsc8211_set_speed_duplex(struct cphy *phy, int speed, int duplex) +{ + int err; + + err = t3_set_phy_speed_duplex(phy, speed, duplex); + if (!err) + err = vsc8211_set_automdi(phy, 1); + return err; +} +#endif /* UNUSED */ + +static int vsc8211_power_down(struct cphy *cphy, int enable) +{ + return t3_mdio_change_bits(cphy, 0, MII_BMCR, BMCR_PDOWN, + enable ? BMCR_PDOWN : 0); +} + +static int vsc8211_intr_handler(struct cphy *cphy) +{ + unsigned int cause; + int err, cphy_cause = 0; + + err = t3_mdio_read(cphy, MDIO_DEVAD_NONE, VSC8211_INTR_STATUS, &cause); + if (err) + return err; + + cause &= INTR_MASK; + if (cause & CFG_CHG_INTR_MASK) + cphy_cause |= cphy_cause_link_change; + if (cause & (VSC_INTR_RX_FIFO | VSC_INTR_TX_FIFO)) + cphy_cause |= cphy_cause_fifo_error; + return cphy_cause; +} + +static struct cphy_ops vsc8211_ops = { + .reset = vsc8211_reset, + .intr_enable = vsc8211_intr_enable, + .intr_disable = vsc8211_intr_disable, + .intr_clear = vsc8211_intr_clear, + .intr_handler = vsc8211_intr_handler, + .autoneg_enable = vsc8211_autoneg_enable, + .autoneg_restart = vsc8211_autoneg_restart, + .advertise = t3_phy_advertise, + .set_speed_duplex = t3_set_phy_speed_duplex, + .get_link_status = vsc8211_get_link_status, + .power_down = vsc8211_power_down, +}; + +static struct cphy_ops vsc8211_fiber_ops = { + .reset = vsc8211_reset, + .intr_enable = vsc8211_intr_enable, + .intr_disable = vsc8211_intr_disable, + .intr_clear = vsc8211_intr_clear, + .intr_handler = vsc8211_intr_handler, + .autoneg_enable = vsc8211_autoneg_enable, + .autoneg_restart = vsc8211_autoneg_restart, + .advertise = t3_phy_advertise_fiber, + .set_speed_duplex = t3_set_phy_speed_duplex, + .get_link_status = vsc8211_get_link_status_fiber, + .power_down = vsc8211_power_down, +}; + +int t3_vsc8211_phy_prep(struct cphy *phy, struct adapter *adapter, + int phy_addr, const struct mdio_ops *mdio_ops) +{ + int err; + unsigned int val; + + cphy_init(phy, adapter, phy_addr, &vsc8211_ops, mdio_ops, + SUPPORTED_10baseT_Full | SUPPORTED_100baseT_Full | + SUPPORTED_1000baseT_Full | SUPPORTED_Autoneg | SUPPORTED_MII | + SUPPORTED_TP | SUPPORTED_IRQ, "10/100/1000BASE-T"); + msleep(20); /* PHY needs ~10ms to start responding to MDIO */ + + err = t3_mdio_read(phy, MDIO_DEVAD_NONE, VSC8211_EXT_CTRL, &val); + if (err) + return err; + if (val & VSC_CTRL_MEDIA_MODE_HI) { + /* copper interface, just need to configure the LEDs */ + return t3_mdio_write(phy, MDIO_DEVAD_NONE, VSC8211_LED_CTRL, + 0x100); + } + + phy->caps = SUPPORTED_1000baseT_Full | SUPPORTED_Autoneg | + SUPPORTED_MII | SUPPORTED_FIBRE | SUPPORTED_IRQ; + phy->desc = "1000BASE-X"; + phy->ops = &vsc8211_fiber_ops; + + err = t3_mdio_write(phy, MDIO_DEVAD_NONE, VSC8211_EXT_PAGE_AXS, 1); + if (err) + return err; + + err = t3_mdio_write(phy, MDIO_DEVAD_NONE, VSC8211_SIGDET_CTRL, 1); + if (err) + return err; + + err = t3_mdio_write(phy, MDIO_DEVAD_NONE, VSC8211_EXT_PAGE_AXS, 0); + if (err) + return err; + + err = t3_mdio_write(phy, MDIO_DEVAD_NONE, VSC8211_EXT_CTRL, + val | VSC_CTRL_CLAUSE37_VIEW); + if (err) + return err; + + err = vsc8211_reset(phy, 0); + if (err) + return err; + + udelay(5); /* delay after reset before next SMI */ + return 0; +} diff --git a/drivers/net/ethernet/chelsio/cxgb3/xgmac.c b/drivers/net/ethernet/chelsio/cxgb3/xgmac.c new file mode 100644 index 0000000..3af19a5 --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb3/xgmac.c @@ -0,0 +1,657 @@ +/* + * Copyright (c) 2005-2008 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "common.h" +#include "regs.h" + +/* + * # of exact address filters. The first one is used for the station address, + * the rest are available for multicast addresses. + */ +#define EXACT_ADDR_FILTERS 8 + +static inline int macidx(const struct cmac *mac) +{ + return mac->offset / (XGMAC0_1_BASE_ADDR - XGMAC0_0_BASE_ADDR); +} + +static void xaui_serdes_reset(struct cmac *mac) +{ + static const unsigned int clear[] = { + F_PWRDN0 | F_PWRDN1, F_RESETPLL01, F_RESET0 | F_RESET1, + F_PWRDN2 | F_PWRDN3, F_RESETPLL23, F_RESET2 | F_RESET3 + }; + + int i; + struct adapter *adap = mac->adapter; + u32 ctrl = A_XGM_SERDES_CTRL0 + mac->offset; + + t3_write_reg(adap, ctrl, adap->params.vpd.xauicfg[macidx(mac)] | + F_RESET3 | F_RESET2 | F_RESET1 | F_RESET0 | + F_PWRDN3 | F_PWRDN2 | F_PWRDN1 | F_PWRDN0 | + F_RESETPLL23 | F_RESETPLL01); + t3_read_reg(adap, ctrl); + udelay(15); + + for (i = 0; i < ARRAY_SIZE(clear); i++) { + t3_set_reg_field(adap, ctrl, clear[i], 0); + udelay(15); + } +} + +void t3b_pcs_reset(struct cmac *mac) +{ + t3_set_reg_field(mac->adapter, A_XGM_RESET_CTRL + mac->offset, + F_PCS_RESET_, 0); + udelay(20); + t3_set_reg_field(mac->adapter, A_XGM_RESET_CTRL + mac->offset, 0, + F_PCS_RESET_); +} + +int t3_mac_reset(struct cmac *mac) +{ + static const struct addr_val_pair mac_reset_avp[] = { + {A_XGM_TX_CTRL, 0}, + {A_XGM_RX_CTRL, 0}, + {A_XGM_RX_CFG, F_DISPAUSEFRAMES | F_EN1536BFRAMES | + F_RMFCS | F_ENJUMBO | F_ENHASHMCAST}, + {A_XGM_RX_HASH_LOW, 0}, + {A_XGM_RX_HASH_HIGH, 0}, + {A_XGM_RX_EXACT_MATCH_LOW_1, 0}, + {A_XGM_RX_EXACT_MATCH_LOW_2, 0}, + {A_XGM_RX_EXACT_MATCH_LOW_3, 0}, + {A_XGM_RX_EXACT_MATCH_LOW_4, 0}, + {A_XGM_RX_EXACT_MATCH_LOW_5, 0}, + {A_XGM_RX_EXACT_MATCH_LOW_6, 0}, + {A_XGM_RX_EXACT_MATCH_LOW_7, 0}, + {A_XGM_RX_EXACT_MATCH_LOW_8, 0}, + {A_XGM_STAT_CTRL, F_CLRSTATS} + }; + u32 val; + struct adapter *adap = mac->adapter; + unsigned int oft = mac->offset; + + t3_write_reg(adap, A_XGM_RESET_CTRL + oft, F_MAC_RESET_); + t3_read_reg(adap, A_XGM_RESET_CTRL + oft); /* flush */ + + t3_write_regs(adap, mac_reset_avp, ARRAY_SIZE(mac_reset_avp), oft); + t3_set_reg_field(adap, A_XGM_RXFIFO_CFG + oft, + F_RXSTRFRWRD | F_DISERRFRAMES, + uses_xaui(adap) ? 0 : F_RXSTRFRWRD); + t3_set_reg_field(adap, A_XGM_TXFIFO_CFG + oft, 0, F_UNDERUNFIX); + + if (uses_xaui(adap)) { + if (adap->params.rev == 0) { + t3_set_reg_field(adap, A_XGM_SERDES_CTRL + oft, 0, + F_RXENABLE | F_TXENABLE); + if (t3_wait_op_done(adap, A_XGM_SERDES_STATUS1 + oft, + F_CMULOCK, 1, 5, 2)) { + CH_ERR(adap, + "MAC %d XAUI SERDES CMU lock failed\n", + macidx(mac)); + return -1; + } + t3_set_reg_field(adap, A_XGM_SERDES_CTRL + oft, 0, + F_SERDESRESET_); + } else + xaui_serdes_reset(mac); + } + + t3_set_reg_field(adap, A_XGM_RX_MAX_PKT_SIZE + oft, + V_RXMAXFRAMERSIZE(M_RXMAXFRAMERSIZE), + V_RXMAXFRAMERSIZE(MAX_FRAME_SIZE) | F_RXENFRAMER); + val = F_MAC_RESET_ | F_XGMAC_STOP_EN; + + if (is_10G(adap)) + val |= F_PCS_RESET_; + else if (uses_xaui(adap)) + val |= F_PCS_RESET_ | F_XG2G_RESET_; + else + val |= F_RGMII_RESET_ | F_XG2G_RESET_; + t3_write_reg(adap, A_XGM_RESET_CTRL + oft, val); + t3_read_reg(adap, A_XGM_RESET_CTRL + oft); /* flush */ + if ((val & F_PCS_RESET_) && adap->params.rev) { + msleep(1); + t3b_pcs_reset(mac); + } + + memset(&mac->stats, 0, sizeof(mac->stats)); + return 0; +} + +static int t3b2_mac_reset(struct cmac *mac) +{ + struct adapter *adap = mac->adapter; + unsigned int oft = mac->offset, store; + int idx = macidx(mac); + u32 val; + + if (!macidx(mac)) + t3_set_reg_field(adap, A_MPS_CFG, F_PORT0ACTIVE, 0); + else + t3_set_reg_field(adap, A_MPS_CFG, F_PORT1ACTIVE, 0); + + /* Stop NIC traffic to reduce the number of TXTOGGLES */ + t3_set_reg_field(adap, A_MPS_CFG, F_ENFORCEPKT, 0); + /* Ensure TX drains */ + t3_set_reg_field(adap, A_XGM_TX_CFG + oft, F_TXPAUSEEN, 0); + + t3_write_reg(adap, A_XGM_RESET_CTRL + oft, F_MAC_RESET_); + t3_read_reg(adap, A_XGM_RESET_CTRL + oft); /* flush */ + + /* Store A_TP_TX_DROP_CFG_CH0 */ + t3_write_reg(adap, A_TP_PIO_ADDR, A_TP_TX_DROP_CFG_CH0 + idx); + store = t3_read_reg(adap, A_TP_TX_DROP_CFG_CH0 + idx); + + msleep(10); + + /* Change DROP_CFG to 0xc0000011 */ + t3_write_reg(adap, A_TP_PIO_ADDR, A_TP_TX_DROP_CFG_CH0 + idx); + t3_write_reg(adap, A_TP_PIO_DATA, 0xc0000011); + + /* Check for xgm Rx fifo empty */ + /* Increased loop count to 1000 from 5 cover 1G and 100Mbps case */ + if (t3_wait_op_done(adap, A_XGM_RX_MAX_PKT_SIZE_ERR_CNT + oft, + 0x80000000, 1, 1000, 2)) { + CH_ERR(adap, "MAC %d Rx fifo drain failed\n", + macidx(mac)); + return -1; + } + + t3_write_reg(adap, A_XGM_RESET_CTRL + oft, 0); + t3_read_reg(adap, A_XGM_RESET_CTRL + oft); /* flush */ + + val = F_MAC_RESET_; + if (is_10G(adap)) + val |= F_PCS_RESET_; + else if (uses_xaui(adap)) + val |= F_PCS_RESET_ | F_XG2G_RESET_; + else + val |= F_RGMII_RESET_ | F_XG2G_RESET_; + t3_write_reg(adap, A_XGM_RESET_CTRL + oft, val); + t3_read_reg(adap, A_XGM_RESET_CTRL + oft); /* flush */ + if ((val & F_PCS_RESET_) && adap->params.rev) { + msleep(1); + t3b_pcs_reset(mac); + } + t3_write_reg(adap, A_XGM_RX_CFG + oft, + F_DISPAUSEFRAMES | F_EN1536BFRAMES | + F_RMFCS | F_ENJUMBO | F_ENHASHMCAST); + + /* Restore the DROP_CFG */ + t3_write_reg(adap, A_TP_PIO_ADDR, A_TP_TX_DROP_CFG_CH0 + idx); + t3_write_reg(adap, A_TP_PIO_DATA, store); + + if (!idx) + t3_set_reg_field(adap, A_MPS_CFG, 0, F_PORT0ACTIVE); + else + t3_set_reg_field(adap, A_MPS_CFG, 0, F_PORT1ACTIVE); + + /* re-enable nic traffic */ + t3_set_reg_field(adap, A_MPS_CFG, F_ENFORCEPKT, 1); + + /* Set: re-enable NIC traffic */ + t3_set_reg_field(adap, A_MPS_CFG, F_ENFORCEPKT, 1); + + return 0; +} + +/* + * Set the exact match register 'idx' to recognize the given Ethernet address. + */ +static void set_addr_filter(struct cmac *mac, int idx, const u8 * addr) +{ + u32 addr_lo, addr_hi; + unsigned int oft = mac->offset + idx * 8; + + addr_lo = (addr[3] << 24) | (addr[2] << 16) | (addr[1] << 8) | addr[0]; + addr_hi = (addr[5] << 8) | addr[4]; + + t3_write_reg(mac->adapter, A_XGM_RX_EXACT_MATCH_LOW_1 + oft, addr_lo); + t3_write_reg(mac->adapter, A_XGM_RX_EXACT_MATCH_HIGH_1 + oft, addr_hi); +} + +/* Set one of the station's unicast MAC addresses. */ +int t3_mac_set_address(struct cmac *mac, unsigned int idx, u8 addr[6]) +{ + if (idx >= mac->nucast) + return -EINVAL; + set_addr_filter(mac, idx, addr); + return 0; +} + +/* + * Specify the number of exact address filters that should be reserved for + * unicast addresses. Caller should reload the unicast and multicast addresses + * after calling this. + */ +int t3_mac_set_num_ucast(struct cmac *mac, int n) +{ + if (n > EXACT_ADDR_FILTERS) + return -EINVAL; + mac->nucast = n; + return 0; +} + +void t3_mac_disable_exact_filters(struct cmac *mac) +{ + unsigned int i, reg = mac->offset + A_XGM_RX_EXACT_MATCH_LOW_1; + + for (i = 0; i < EXACT_ADDR_FILTERS; i++, reg += 8) { + u32 v = t3_read_reg(mac->adapter, reg); + t3_write_reg(mac->adapter, reg, v); + } + t3_read_reg(mac->adapter, A_XGM_RX_EXACT_MATCH_LOW_1); /* flush */ +} + +void t3_mac_enable_exact_filters(struct cmac *mac) +{ + unsigned int i, reg = mac->offset + A_XGM_RX_EXACT_MATCH_HIGH_1; + + for (i = 0; i < EXACT_ADDR_FILTERS; i++, reg += 8) { + u32 v = t3_read_reg(mac->adapter, reg); + t3_write_reg(mac->adapter, reg, v); + } + t3_read_reg(mac->adapter, A_XGM_RX_EXACT_MATCH_LOW_1); /* flush */ +} + +/* Calculate the RX hash filter index of an Ethernet address */ +static int hash_hw_addr(const u8 * addr) +{ + int hash = 0, octet, bit, i = 0, c; + + for (octet = 0; octet < 6; ++octet) + for (c = addr[octet], bit = 0; bit < 8; c >>= 1, ++bit) { + hash ^= (c & 1) << i; + if (++i == 6) + i = 0; + } + return hash; +} + +int t3_mac_set_rx_mode(struct cmac *mac, struct net_device *dev) +{ + u32 val, hash_lo, hash_hi; + struct adapter *adap = mac->adapter; + unsigned int oft = mac->offset; + + val = t3_read_reg(adap, A_XGM_RX_CFG + oft) & ~F_COPYALLFRAMES; + if (dev->flags & IFF_PROMISC) + val |= F_COPYALLFRAMES; + t3_write_reg(adap, A_XGM_RX_CFG + oft, val); + + if (dev->flags & IFF_ALLMULTI) + hash_lo = hash_hi = 0xffffffff; + else { + struct netdev_hw_addr *ha; + int exact_addr_idx = mac->nucast; + + hash_lo = hash_hi = 0; + netdev_for_each_mc_addr(ha, dev) + if (exact_addr_idx < EXACT_ADDR_FILTERS) + set_addr_filter(mac, exact_addr_idx++, + ha->addr); + else { + int hash = hash_hw_addr(ha->addr); + + if (hash < 32) + hash_lo |= (1 << hash); + else + hash_hi |= (1 << (hash - 32)); + } + } + + t3_write_reg(adap, A_XGM_RX_HASH_LOW + oft, hash_lo); + t3_write_reg(adap, A_XGM_RX_HASH_HIGH + oft, hash_hi); + return 0; +} + +static int rx_fifo_hwm(int mtu) +{ + int hwm; + + hwm = max(MAC_RXFIFO_SIZE - 3 * mtu, (MAC_RXFIFO_SIZE * 38) / 100); + return min(hwm, MAC_RXFIFO_SIZE - 8192); +} + +int t3_mac_set_mtu(struct cmac *mac, unsigned int mtu) +{ + int hwm, lwm, divisor; + int ipg; + unsigned int thres, v, reg; + struct adapter *adap = mac->adapter; + + /* + * MAX_FRAME_SIZE inludes header + FCS, mtu doesn't. The HW max + * packet size register includes header, but not FCS. + */ + mtu += 14; + if (mtu > 1536) + mtu += 4; + + if (mtu > MAX_FRAME_SIZE - 4) + return -EINVAL; + t3_write_reg(adap, A_XGM_RX_MAX_PKT_SIZE + mac->offset, mtu); + + if (adap->params.rev >= T3_REV_B2 && + (t3_read_reg(adap, A_XGM_RX_CTRL + mac->offset) & F_RXEN)) { + t3_mac_disable_exact_filters(mac); + v = t3_read_reg(adap, A_XGM_RX_CFG + mac->offset); + t3_set_reg_field(adap, A_XGM_RX_CFG + mac->offset, + F_ENHASHMCAST | F_COPYALLFRAMES, F_DISBCAST); + + reg = adap->params.rev == T3_REV_B2 ? + A_XGM_RX_MAX_PKT_SIZE_ERR_CNT : A_XGM_RXFIFO_CFG; + + /* drain RX FIFO */ + if (t3_wait_op_done(adap, reg + mac->offset, + F_RXFIFO_EMPTY, 1, 20, 5)) { + t3_write_reg(adap, A_XGM_RX_CFG + mac->offset, v); + t3_mac_enable_exact_filters(mac); + return -EIO; + } + t3_set_reg_field(adap, A_XGM_RX_MAX_PKT_SIZE + mac->offset, + V_RXMAXPKTSIZE(M_RXMAXPKTSIZE), + V_RXMAXPKTSIZE(mtu)); + t3_write_reg(adap, A_XGM_RX_CFG + mac->offset, v); + t3_mac_enable_exact_filters(mac); + } else + t3_set_reg_field(adap, A_XGM_RX_MAX_PKT_SIZE + mac->offset, + V_RXMAXPKTSIZE(M_RXMAXPKTSIZE), + V_RXMAXPKTSIZE(mtu)); + + /* + * Adjust the PAUSE frame watermarks. We always set the LWM, and the + * HWM only if flow-control is enabled. + */ + hwm = rx_fifo_hwm(mtu); + lwm = min(3 * (int)mtu, MAC_RXFIFO_SIZE / 4); + v = t3_read_reg(adap, A_XGM_RXFIFO_CFG + mac->offset); + v &= ~V_RXFIFOPAUSELWM(M_RXFIFOPAUSELWM); + v |= V_RXFIFOPAUSELWM(lwm / 8); + if (G_RXFIFOPAUSEHWM(v)) + v = (v & ~V_RXFIFOPAUSEHWM(M_RXFIFOPAUSEHWM)) | + V_RXFIFOPAUSEHWM(hwm / 8); + + t3_write_reg(adap, A_XGM_RXFIFO_CFG + mac->offset, v); + + /* Adjust the TX FIFO threshold based on the MTU */ + thres = (adap->params.vpd.cclk * 1000) / 15625; + thres = (thres * mtu) / 1000; + if (is_10G(adap)) + thres /= 10; + thres = mtu > thres ? (mtu - thres + 7) / 8 : 0; + thres = max(thres, 8U); /* need at least 8 */ + ipg = (adap->params.rev == T3_REV_C) ? 0 : 1; + t3_set_reg_field(adap, A_XGM_TXFIFO_CFG + mac->offset, + V_TXFIFOTHRESH(M_TXFIFOTHRESH) | V_TXIPG(M_TXIPG), + V_TXFIFOTHRESH(thres) | V_TXIPG(ipg)); + + if (adap->params.rev > 0) { + divisor = (adap->params.rev == T3_REV_C) ? 64 : 8; + t3_write_reg(adap, A_XGM_PAUSE_TIMER + mac->offset, + (hwm - lwm) * 4 / divisor); + } + t3_write_reg(adap, A_XGM_TX_PAUSE_QUANTA + mac->offset, + MAC_RXFIFO_SIZE * 4 * 8 / 512); + return 0; +} + +int t3_mac_set_speed_duplex_fc(struct cmac *mac, int speed, int duplex, int fc) +{ + u32 val; + struct adapter *adap = mac->adapter; + unsigned int oft = mac->offset; + + if (duplex >= 0 && duplex != DUPLEX_FULL) + return -EINVAL; + if (speed >= 0) { + if (speed == SPEED_10) + val = V_PORTSPEED(0); + else if (speed == SPEED_100) + val = V_PORTSPEED(1); + else if (speed == SPEED_1000) + val = V_PORTSPEED(2); + else if (speed == SPEED_10000) + val = V_PORTSPEED(3); + else + return -EINVAL; + + t3_set_reg_field(adap, A_XGM_PORT_CFG + oft, + V_PORTSPEED(M_PORTSPEED), val); + } + + val = t3_read_reg(adap, A_XGM_RXFIFO_CFG + oft); + val &= ~V_RXFIFOPAUSEHWM(M_RXFIFOPAUSEHWM); + if (fc & PAUSE_TX) { + u32 rx_max_pkt_size = + G_RXMAXPKTSIZE(t3_read_reg(adap, + A_XGM_RX_MAX_PKT_SIZE + oft)); + val |= V_RXFIFOPAUSEHWM(rx_fifo_hwm(rx_max_pkt_size) / 8); + } + t3_write_reg(adap, A_XGM_RXFIFO_CFG + oft, val); + + t3_set_reg_field(adap, A_XGM_TX_CFG + oft, F_TXPAUSEEN, + (fc & PAUSE_RX) ? F_TXPAUSEEN : 0); + return 0; +} + +int t3_mac_enable(struct cmac *mac, int which) +{ + int idx = macidx(mac); + struct adapter *adap = mac->adapter; + unsigned int oft = mac->offset; + struct mac_stats *s = &mac->stats; + + if (which & MAC_DIRECTION_TX) { + t3_write_reg(adap, A_TP_PIO_ADDR, A_TP_TX_DROP_CFG_CH0 + idx); + t3_write_reg(adap, A_TP_PIO_DATA, + adap->params.rev == T3_REV_C ? + 0xc4ffff01 : 0xc0ede401); + t3_write_reg(adap, A_TP_PIO_ADDR, A_TP_TX_DROP_MODE); + t3_set_reg_field(adap, A_TP_PIO_DATA, 1 << idx, + adap->params.rev == T3_REV_C ? 0 : 1 << idx); + + t3_write_reg(adap, A_XGM_TX_CTRL + oft, F_TXEN); + + t3_write_reg(adap, A_TP_PIO_ADDR, A_TP_TX_DROP_CNT_CH0 + idx); + mac->tx_mcnt = s->tx_frames; + mac->tx_tcnt = (G_TXDROPCNTCH0RCVD(t3_read_reg(adap, + A_TP_PIO_DATA))); + mac->tx_xcnt = (G_TXSPI4SOPCNT(t3_read_reg(adap, + A_XGM_TX_SPI4_SOP_EOP_CNT + + oft))); + mac->rx_mcnt = s->rx_frames; + mac->rx_pause = s->rx_pause; + mac->rx_xcnt = (G_TXSPI4SOPCNT(t3_read_reg(adap, + A_XGM_RX_SPI4_SOP_EOP_CNT + + oft))); + mac->rx_ocnt = s->rx_fifo_ovfl; + mac->txen = F_TXEN; + mac->toggle_cnt = 0; + } + if (which & MAC_DIRECTION_RX) + t3_write_reg(adap, A_XGM_RX_CTRL + oft, F_RXEN); + return 0; +} + +int t3_mac_disable(struct cmac *mac, int which) +{ + struct adapter *adap = mac->adapter; + + if (which & MAC_DIRECTION_TX) { + t3_write_reg(adap, A_XGM_TX_CTRL + mac->offset, 0); + mac->txen = 0; + } + if (which & MAC_DIRECTION_RX) { + int val = F_MAC_RESET_; + + t3_set_reg_field(mac->adapter, A_XGM_RESET_CTRL + mac->offset, + F_PCS_RESET_, 0); + msleep(100); + t3_write_reg(adap, A_XGM_RX_CTRL + mac->offset, 0); + if (is_10G(adap)) + val |= F_PCS_RESET_; + else if (uses_xaui(adap)) + val |= F_PCS_RESET_ | F_XG2G_RESET_; + else + val |= F_RGMII_RESET_ | F_XG2G_RESET_; + t3_write_reg(mac->adapter, A_XGM_RESET_CTRL + mac->offset, val); + } + return 0; +} + +int t3b2_mac_watchdog_task(struct cmac *mac) +{ + struct adapter *adap = mac->adapter; + struct mac_stats *s = &mac->stats; + unsigned int tx_tcnt, tx_xcnt; + u64 tx_mcnt = s->tx_frames; + int status; + + status = 0; + tx_xcnt = 1; /* By default tx_xcnt is making progress */ + tx_tcnt = mac->tx_tcnt; /* If tx_mcnt is progressing ignore tx_tcnt */ + if (tx_mcnt == mac->tx_mcnt && mac->rx_pause == s->rx_pause) { + tx_xcnt = (G_TXSPI4SOPCNT(t3_read_reg(adap, + A_XGM_TX_SPI4_SOP_EOP_CNT + + mac->offset))); + if (tx_xcnt == 0) { + t3_write_reg(adap, A_TP_PIO_ADDR, + A_TP_TX_DROP_CNT_CH0 + macidx(mac)); + tx_tcnt = (G_TXDROPCNTCH0RCVD(t3_read_reg(adap, + A_TP_PIO_DATA))); + } else { + goto out; + } + } else { + mac->toggle_cnt = 0; + goto out; + } + + if ((tx_tcnt != mac->tx_tcnt) && (mac->tx_xcnt == 0)) { + if (mac->toggle_cnt > 4) { + status = 2; + goto out; + } else { + status = 1; + goto out; + } + } else { + mac->toggle_cnt = 0; + goto out; + } + +out: + mac->tx_tcnt = tx_tcnt; + mac->tx_xcnt = tx_xcnt; + mac->tx_mcnt = s->tx_frames; + mac->rx_pause = s->rx_pause; + if (status == 1) { + t3_write_reg(adap, A_XGM_TX_CTRL + mac->offset, 0); + t3_read_reg(adap, A_XGM_TX_CTRL + mac->offset); /* flush */ + t3_write_reg(adap, A_XGM_TX_CTRL + mac->offset, mac->txen); + t3_read_reg(adap, A_XGM_TX_CTRL + mac->offset); /* flush */ + mac->toggle_cnt++; + } else if (status == 2) { + t3b2_mac_reset(mac); + mac->toggle_cnt = 0; + } + return status; +} + +/* + * This function is called periodically to accumulate the current values of the + * RMON counters into the port statistics. Since the packet counters are only + * 32 bits they can overflow in ~286 secs at 10G, so the function should be + * called more frequently than that. The byte counters are 45-bit wide, they + * would overflow in ~7.8 hours. + */ +const struct mac_stats *t3_mac_update_stats(struct cmac *mac) +{ +#define RMON_READ(mac, addr) t3_read_reg(mac->adapter, addr + mac->offset) +#define RMON_UPDATE(mac, name, reg) \ + (mac)->stats.name += (u64)RMON_READ(mac, A_XGM_STAT_##reg) +#define RMON_UPDATE64(mac, name, reg_lo, reg_hi) \ + (mac)->stats.name += RMON_READ(mac, A_XGM_STAT_##reg_lo) + \ + ((u64)RMON_READ(mac, A_XGM_STAT_##reg_hi) << 32) + + u32 v, lo; + + RMON_UPDATE64(mac, rx_octets, RX_BYTES_LOW, RX_BYTES_HIGH); + RMON_UPDATE64(mac, rx_frames, RX_FRAMES_LOW, RX_FRAMES_HIGH); + RMON_UPDATE(mac, rx_mcast_frames, RX_MCAST_FRAMES); + RMON_UPDATE(mac, rx_bcast_frames, RX_BCAST_FRAMES); + RMON_UPDATE(mac, rx_fcs_errs, RX_CRC_ERR_FRAMES); + RMON_UPDATE(mac, rx_pause, RX_PAUSE_FRAMES); + RMON_UPDATE(mac, rx_jabber, RX_JABBER_FRAMES); + RMON_UPDATE(mac, rx_short, RX_SHORT_FRAMES); + RMON_UPDATE(mac, rx_symbol_errs, RX_SYM_CODE_ERR_FRAMES); + + RMON_UPDATE(mac, rx_too_long, RX_OVERSIZE_FRAMES); + + v = RMON_READ(mac, A_XGM_RX_MAX_PKT_SIZE_ERR_CNT); + if (mac->adapter->params.rev == T3_REV_B2) + v &= 0x7fffffff; + mac->stats.rx_too_long += v; + + RMON_UPDATE(mac, rx_frames_64, RX_64B_FRAMES); + RMON_UPDATE(mac, rx_frames_65_127, RX_65_127B_FRAMES); + RMON_UPDATE(mac, rx_frames_128_255, RX_128_255B_FRAMES); + RMON_UPDATE(mac, rx_frames_256_511, RX_256_511B_FRAMES); + RMON_UPDATE(mac, rx_frames_512_1023, RX_512_1023B_FRAMES); + RMON_UPDATE(mac, rx_frames_1024_1518, RX_1024_1518B_FRAMES); + RMON_UPDATE(mac, rx_frames_1519_max, RX_1519_MAXB_FRAMES); + + RMON_UPDATE64(mac, tx_octets, TX_BYTE_LOW, TX_BYTE_HIGH); + RMON_UPDATE64(mac, tx_frames, TX_FRAME_LOW, TX_FRAME_HIGH); + RMON_UPDATE(mac, tx_mcast_frames, TX_MCAST); + RMON_UPDATE(mac, tx_bcast_frames, TX_BCAST); + RMON_UPDATE(mac, tx_pause, TX_PAUSE); + /* This counts error frames in general (bad FCS, underrun, etc). */ + RMON_UPDATE(mac, tx_underrun, TX_ERR_FRAMES); + + RMON_UPDATE(mac, tx_frames_64, TX_64B_FRAMES); + RMON_UPDATE(mac, tx_frames_65_127, TX_65_127B_FRAMES); + RMON_UPDATE(mac, tx_frames_128_255, TX_128_255B_FRAMES); + RMON_UPDATE(mac, tx_frames_256_511, TX_256_511B_FRAMES); + RMON_UPDATE(mac, tx_frames_512_1023, TX_512_1023B_FRAMES); + RMON_UPDATE(mac, tx_frames_1024_1518, TX_1024_1518B_FRAMES); + RMON_UPDATE(mac, tx_frames_1519_max, TX_1519_MAXB_FRAMES); + + /* The next stat isn't clear-on-read. */ + t3_write_reg(mac->adapter, A_TP_MIB_INDEX, mac->offset ? 51 : 50); + v = t3_read_reg(mac->adapter, A_TP_MIB_RDATA); + lo = (u32) mac->stats.rx_cong_drops; + mac->stats.rx_cong_drops += (u64) (v - lo); + + return &mac->stats; +} diff --git a/drivers/net/ethernet/chelsio/cxgb4/Makefile b/drivers/net/ethernet/chelsio/cxgb4/Makefile new file mode 100644 index 0000000..1df65c9 --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb4/Makefile @@ -0,0 +1,8 @@ +# +# Chelsio T4 driver +# + +obj-$(CONFIG_CHELSIO_T4) += cxgb4.o + +cxgb4-objs := cxgb4_main.o l2t.o t4_hw.o sge.o +cxgb4-$(CONFIG_CHELSIO_T4_DCB) += cxgb4_dcb.o diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h new file mode 100644 index 0000000..3c481b2 --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h @@ -0,0 +1,1088 @@ +/* + * This file is part of the Chelsio T4 Ethernet driver for Linux. + * + * Copyright (c) 2003-2014 Chelsio Communications, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __CXGB4_H__ +#define __CXGB4_H__ + +#include "t4_hw.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "cxgb4_uld.h" + +#define T4FW_VERSION_MAJOR 0x01 +#define T4FW_VERSION_MINOR 0x0B +#define T4FW_VERSION_MICRO 0x1B +#define T4FW_VERSION_BUILD 0x00 + +#define T5FW_VERSION_MAJOR 0x01 +#define T5FW_VERSION_MINOR 0x0B +#define T5FW_VERSION_MICRO 0x1B +#define T5FW_VERSION_BUILD 0x00 + +#define CH_WARN(adap, fmt, ...) dev_warn(adap->pdev_dev, fmt, ## __VA_ARGS__) + +enum { + MAX_NPORTS = 4, /* max # of ports */ + SERNUM_LEN = 24, /* Serial # length */ + EC_LEN = 16, /* E/C length */ + ID_LEN = 16, /* ID length */ + PN_LEN = 16, /* Part Number length */ +}; + +enum { + MEM_EDC0, + MEM_EDC1, + MEM_MC, + MEM_MC0 = MEM_MC, + MEM_MC1 +}; + +enum { + MEMWIN0_APERTURE = 2048, + MEMWIN0_BASE = 0x1b800, + MEMWIN1_APERTURE = 32768, + MEMWIN1_BASE = 0x28000, + MEMWIN1_BASE_T5 = 0x52000, + MEMWIN2_APERTURE = 65536, + MEMWIN2_BASE = 0x30000, + MEMWIN2_APERTURE_T5 = 131072, + MEMWIN2_BASE_T5 = 0x60000, +}; + +enum dev_master { + MASTER_CANT, + MASTER_MAY, + MASTER_MUST +}; + +enum dev_state { + DEV_STATE_UNINIT, + DEV_STATE_INIT, + DEV_STATE_ERR +}; + +enum { + PAUSE_RX = 1 << 0, + PAUSE_TX = 1 << 1, + PAUSE_AUTONEG = 1 << 2 +}; + +struct port_stats { + u64 tx_octets; /* total # of octets in good frames */ + u64 tx_frames; /* all good frames */ + u64 tx_bcast_frames; /* all broadcast frames */ + u64 tx_mcast_frames; /* all multicast frames */ + u64 tx_ucast_frames; /* all unicast frames */ + u64 tx_error_frames; /* all error frames */ + + u64 tx_frames_64; /* # of Tx frames in a particular range */ + u64 tx_frames_65_127; + u64 tx_frames_128_255; + u64 tx_frames_256_511; + u64 tx_frames_512_1023; + u64 tx_frames_1024_1518; + u64 tx_frames_1519_max; + + u64 tx_drop; /* # of dropped Tx frames */ + u64 tx_pause; /* # of transmitted pause frames */ + u64 tx_ppp0; /* # of transmitted PPP prio 0 frames */ + u64 tx_ppp1; /* # of transmitted PPP prio 1 frames */ + u64 tx_ppp2; /* # of transmitted PPP prio 2 frames */ + u64 tx_ppp3; /* # of transmitted PPP prio 3 frames */ + u64 tx_ppp4; /* # of transmitted PPP prio 4 frames */ + u64 tx_ppp5; /* # of transmitted PPP prio 5 frames */ + u64 tx_ppp6; /* # of transmitted PPP prio 6 frames */ + u64 tx_ppp7; /* # of transmitted PPP prio 7 frames */ + + u64 rx_octets; /* total # of octets in good frames */ + u64 rx_frames; /* all good frames */ + u64 rx_bcast_frames; /* all broadcast frames */ + u64 rx_mcast_frames; /* all multicast frames */ + u64 rx_ucast_frames; /* all unicast frames */ + u64 rx_too_long; /* # of frames exceeding MTU */ + u64 rx_jabber; /* # of jabber frames */ + u64 rx_fcs_err; /* # of received frames with bad FCS */ + u64 rx_len_err; /* # of received frames with length error */ + u64 rx_symbol_err; /* symbol errors */ + u64 rx_runt; /* # of short frames */ + + u64 rx_frames_64; /* # of Rx frames in a particular range */ + u64 rx_frames_65_127; + u64 rx_frames_128_255; + u64 rx_frames_256_511; + u64 rx_frames_512_1023; + u64 rx_frames_1024_1518; + u64 rx_frames_1519_max; + + u64 rx_pause; /* # of received pause frames */ + u64 rx_ppp0; /* # of received PPP prio 0 frames */ + u64 rx_ppp1; /* # of received PPP prio 1 frames */ + u64 rx_ppp2; /* # of received PPP prio 2 frames */ + u64 rx_ppp3; /* # of received PPP prio 3 frames */ + u64 rx_ppp4; /* # of received PPP prio 4 frames */ + u64 rx_ppp5; /* # of received PPP prio 5 frames */ + u64 rx_ppp6; /* # of received PPP prio 6 frames */ + u64 rx_ppp7; /* # of received PPP prio 7 frames */ + + u64 rx_ovflow0; /* drops due to buffer-group 0 overflows */ + u64 rx_ovflow1; /* drops due to buffer-group 1 overflows */ + u64 rx_ovflow2; /* drops due to buffer-group 2 overflows */ + u64 rx_ovflow3; /* drops due to buffer-group 3 overflows */ + u64 rx_trunc0; /* buffer-group 0 truncated packets */ + u64 rx_trunc1; /* buffer-group 1 truncated packets */ + u64 rx_trunc2; /* buffer-group 2 truncated packets */ + u64 rx_trunc3; /* buffer-group 3 truncated packets */ +}; + +struct lb_port_stats { + u64 octets; + u64 frames; + u64 bcast_frames; + u64 mcast_frames; + u64 ucast_frames; + u64 error_frames; + + u64 frames_64; + u64 frames_65_127; + u64 frames_128_255; + u64 frames_256_511; + u64 frames_512_1023; + u64 frames_1024_1518; + u64 frames_1519_max; + + u64 drop; + + u64 ovflow0; + u64 ovflow1; + u64 ovflow2; + u64 ovflow3; + u64 trunc0; + u64 trunc1; + u64 trunc2; + u64 trunc3; +}; + +struct tp_tcp_stats { + u32 tcpOutRsts; + u64 tcpInSegs; + u64 tcpOutSegs; + u64 tcpRetransSegs; +}; + +struct tp_err_stats { + u32 macInErrs[4]; + u32 hdrInErrs[4]; + u32 tcpInErrs[4]; + u32 tnlCongDrops[4]; + u32 ofldChanDrops[4]; + u32 tnlTxDrops[4]; + u32 ofldVlanDrops[4]; + u32 tcp6InErrs[4]; + u32 ofldNoNeigh; + u32 ofldCongDefer; +}; + +struct tp_params { + unsigned int ntxchan; /* # of Tx channels */ + unsigned int tre; /* log2 of core clocks per TP tick */ + unsigned short tx_modq_map; /* TX modulation scheduler queue to */ + /* channel map */ + + uint32_t dack_re; /* DACK timer resolution */ + unsigned short tx_modq[NCHAN]; /* channel to modulation queue map */ + + u32 vlan_pri_map; /* cached TP_VLAN_PRI_MAP */ + u32 ingress_config; /* cached TP_INGRESS_CONFIG */ + + /* TP_VLAN_PRI_MAP Compressed Filter Tuple field offsets. This is a + * subset of the set of fields which may be present in the Compressed + * Filter Tuple portion of filters and TCP TCB connections. The + * fields which are present are controlled by the TP_VLAN_PRI_MAP. + * Since a variable number of fields may or may not be present, their + * shifted field positions within the Compressed Filter Tuple may + * vary, or not even be present if the field isn't selected in + * TP_VLAN_PRI_MAP. Since some of these fields are needed in various + * places we store their offsets here, or a -1 if the field isn't + * present. + */ + int vlan_shift; + int vnic_shift; + int port_shift; + int protocol_shift; +}; + +struct vpd_params { + unsigned int cclk; + u8 ec[EC_LEN + 1]; + u8 sn[SERNUM_LEN + 1]; + u8 id[ID_LEN + 1]; + u8 pn[PN_LEN + 1]; +}; + +struct pci_params { + unsigned char speed; + unsigned char width; +}; + +#define CHELSIO_CHIP_CODE(version, revision) (((version) << 4) | (revision)) +#define CHELSIO_CHIP_FPGA 0x100 +#define CHELSIO_CHIP_VERSION(code) (((code) >> 4) & 0xf) +#define CHELSIO_CHIP_RELEASE(code) ((code) & 0xf) + +#define CHELSIO_T4 0x4 +#define CHELSIO_T5 0x5 + +enum chip_type { + T4_A1 = CHELSIO_CHIP_CODE(CHELSIO_T4, 1), + T4_A2 = CHELSIO_CHIP_CODE(CHELSIO_T4, 2), + T4_FIRST_REV = T4_A1, + T4_LAST_REV = T4_A2, + + T5_A0 = CHELSIO_CHIP_CODE(CHELSIO_T5, 0), + T5_A1 = CHELSIO_CHIP_CODE(CHELSIO_T5, 1), + T5_FIRST_REV = T5_A0, + T5_LAST_REV = T5_A1, +}; + +struct adapter_params { + struct tp_params tp; + struct vpd_params vpd; + struct pci_params pci; + + unsigned int sf_size; /* serial flash size in bytes */ + unsigned int sf_nsec; /* # of flash sectors */ + unsigned int sf_fw_start; /* start of FW image in flash */ + + unsigned int fw_vers; + unsigned int tp_vers; + u8 api_vers[7]; + + unsigned short mtus[NMTUS]; + unsigned short a_wnd[NCCTRL_WIN]; + unsigned short b_wnd[NCCTRL_WIN]; + + unsigned char nports; /* # of ethernet ports */ + unsigned char portvec; + enum chip_type chip; /* chip code */ + unsigned char offload; + + unsigned char bypass; + + unsigned int ofldq_wr_cred; + bool ulptx_memwrite_dsgl; /* use of T5 DSGL allowed */ + + unsigned int max_ordird_qp; /* Max read depth per RDMA QP */ + unsigned int max_ird_adapter; /* Max read depth per adapter */ +}; + +#include "t4fw_api.h" + +#define FW_VERSION(chip) ( \ + FW_HDR_FW_VER_MAJOR_GET(chip##FW_VERSION_MAJOR) | \ + FW_HDR_FW_VER_MINOR_GET(chip##FW_VERSION_MINOR) | \ + FW_HDR_FW_VER_MICRO_GET(chip##FW_VERSION_MICRO) | \ + FW_HDR_FW_VER_BUILD_GET(chip##FW_VERSION_BUILD)) +#define FW_INTFVER(chip, intf) (FW_HDR_INTFVER_##intf) + +struct fw_info { + u8 chip; + char *fs_name; + char *fw_mod_name; + struct fw_hdr fw_hdr; +}; + + +struct trace_params { + u32 data[TRACE_LEN / 4]; + u32 mask[TRACE_LEN / 4]; + unsigned short snap_len; + unsigned short min_len; + unsigned char skip_ofst; + unsigned char skip_len; + unsigned char invert; + unsigned char port; +}; + +struct link_config { + unsigned short supported; /* link capabilities */ + unsigned short advertising; /* advertised capabilities */ + unsigned short requested_speed; /* speed user has requested */ + unsigned short speed; /* actual link speed */ + unsigned char requested_fc; /* flow control user has requested */ + unsigned char fc; /* actual link flow control */ + unsigned char autoneg; /* autonegotiating? */ + unsigned char link_ok; /* link up? */ +}; + +#define FW_LEN16(fw_struct) FW_CMD_LEN16(sizeof(fw_struct) / 16) + +enum { + MAX_ETH_QSETS = 32, /* # of Ethernet Tx/Rx queue sets */ + MAX_OFLD_QSETS = 16, /* # of offload Tx/Rx queue sets */ + MAX_CTRL_QUEUES = NCHAN, /* # of control Tx queues */ + MAX_RDMA_QUEUES = NCHAN, /* # of streaming RDMA Rx queues */ + MAX_RDMA_CIQS = NCHAN, /* # of RDMA concentrator IQs */ + MAX_ISCSI_QUEUES = NCHAN, /* # of streaming iSCSI Rx queues */ +}; + +enum { + INGQ_EXTRAS = 2, /* firmware event queue and */ + /* forwarded interrupts */ + MAX_EGRQ = MAX_ETH_QSETS*2 + MAX_OFLD_QSETS*2 + + MAX_CTRL_QUEUES + MAX_RDMA_QUEUES + MAX_ISCSI_QUEUES, + MAX_INGQ = MAX_ETH_QSETS + MAX_OFLD_QSETS + MAX_RDMA_QUEUES + + MAX_RDMA_CIQS + MAX_ISCSI_QUEUES + INGQ_EXTRAS, +}; + +struct adapter; +struct sge_rspq; + +#include "cxgb4_dcb.h" + +struct port_info { + struct adapter *adapter; + u16 viid; + s16 xact_addr_filt; /* index of exact MAC address filter */ + u16 rss_size; /* size of VI's RSS table slice */ + s8 mdio_addr; + u8 port_type; + u8 mod_type; + u8 port_id; + u8 tx_chan; + u8 lport; /* associated offload logical port */ + u8 nqsets; /* # of qsets */ + u8 first_qset; /* index of first qset */ + u8 rss_mode; + struct link_config link_cfg; + u16 *rss; +#ifdef CONFIG_CHELSIO_T4_DCB + struct port_dcb_info dcb; /* Data Center Bridging support */ +#endif +}; + +struct dentry; +struct work_struct; + +enum { /* adapter flags */ + FULL_INIT_DONE = (1 << 0), + DEV_ENABLED = (1 << 1), + USING_MSI = (1 << 2), + USING_MSIX = (1 << 3), + FW_OK = (1 << 4), + RSS_TNLALLLOOKUP = (1 << 5), + USING_SOFT_PARAMS = (1 << 6), + MASTER_PF = (1 << 7), + FW_OFLD_CONN = (1 << 9), +}; + +struct rx_sw_desc; + +struct sge_fl { /* SGE free-buffer queue state */ + unsigned int avail; /* # of available Rx buffers */ + unsigned int pend_cred; /* new buffers since last FL DB ring */ + unsigned int cidx; /* consumer index */ + unsigned int pidx; /* producer index */ + unsigned long alloc_failed; /* # of times buffer allocation failed */ + unsigned long large_alloc_failed; + unsigned long starving; + /* RO fields */ + unsigned int cntxt_id; /* SGE context id for the free list */ + unsigned int size; /* capacity of free list */ + struct rx_sw_desc *sdesc; /* address of SW Rx descriptor ring */ + __be64 *desc; /* address of HW Rx descriptor ring */ + dma_addr_t addr; /* bus address of HW ring start */ + u64 udb; /* BAR2 offset of User Doorbell area */ +}; + +/* A packet gather list */ +struct pkt_gl { + struct page_frag frags[MAX_SKB_FRAGS]; + void *va; /* virtual address of first byte */ + unsigned int nfrags; /* # of fragments */ + unsigned int tot_len; /* total length of fragments */ +}; + +typedef int (*rspq_handler_t)(struct sge_rspq *q, const __be64 *rsp, + const struct pkt_gl *gl); + +struct sge_rspq { /* state for an SGE response queue */ + struct napi_struct napi; + const __be64 *cur_desc; /* current descriptor in queue */ + unsigned int cidx; /* consumer index */ + u8 gen; /* current generation bit */ + u8 intr_params; /* interrupt holdoff parameters */ + u8 next_intr_params; /* holdoff params for next interrupt */ + u8 adaptive_rx; + u8 pktcnt_idx; /* interrupt packet threshold */ + u8 uld; /* ULD handling this queue */ + u8 idx; /* queue index within its group */ + int offset; /* offset into current Rx buffer */ + u16 cntxt_id; /* SGE context id for the response q */ + u16 abs_id; /* absolute SGE id for the response q */ + __be64 *desc; /* address of HW response ring */ + dma_addr_t phys_addr; /* physical address of the ring */ + u64 udb; /* BAR2 offset of User Doorbell area */ + unsigned int iqe_len; /* entry size */ + unsigned int size; /* capacity of response queue */ + struct adapter *adap; + struct net_device *netdev; /* associated net device */ + rspq_handler_t handler; +}; + +struct sge_eth_stats { /* Ethernet queue statistics */ + unsigned long pkts; /* # of ethernet packets */ + unsigned long lro_pkts; /* # of LRO super packets */ + unsigned long lro_merged; /* # of wire packets merged by LRO */ + unsigned long rx_cso; /* # of Rx checksum offloads */ + unsigned long vlan_ex; /* # of Rx VLAN extractions */ + unsigned long rx_drops; /* # of packets dropped due to no mem */ +}; + +struct sge_eth_rxq { /* SW Ethernet Rx queue */ + struct sge_rspq rspq; + struct sge_fl fl; + struct sge_eth_stats stats; +} ____cacheline_aligned_in_smp; + +struct sge_ofld_stats { /* offload queue statistics */ + unsigned long pkts; /* # of packets */ + unsigned long imm; /* # of immediate-data packets */ + unsigned long an; /* # of asynchronous notifications */ + unsigned long nomem; /* # of responses deferred due to no mem */ +}; + +struct sge_ofld_rxq { /* SW offload Rx queue */ + struct sge_rspq rspq; + struct sge_fl fl; + struct sge_ofld_stats stats; +} ____cacheline_aligned_in_smp; + +struct tx_desc { + __be64 flit[8]; +}; + +struct tx_sw_desc; + +struct sge_txq { + unsigned int in_use; /* # of in-use Tx descriptors */ + unsigned int size; /* # of descriptors */ + unsigned int cidx; /* SW consumer index */ + unsigned int pidx; /* producer index */ + unsigned long stops; /* # of times q has been stopped */ + unsigned long restarts; /* # of queue restarts */ + unsigned int cntxt_id; /* SGE context id for the Tx q */ + struct tx_desc *desc; /* address of HW Tx descriptor ring */ + struct tx_sw_desc *sdesc; /* address of SW Tx descriptor ring */ + struct sge_qstat *stat; /* queue status entry */ + dma_addr_t phys_addr; /* physical address of the ring */ + spinlock_t db_lock; + int db_disabled; + unsigned short db_pidx; + unsigned short db_pidx_inc; + u64 udb; /* BAR2 offset of User Doorbell area */ +}; + +struct sge_eth_txq { /* state for an SGE Ethernet Tx queue */ + struct sge_txq q; + struct netdev_queue *txq; /* associated netdev TX queue */ +#ifdef CONFIG_CHELSIO_T4_DCB + u8 dcb_prio; /* DCB Priority bound to queue */ +#endif + unsigned long tso; /* # of TSO requests */ + unsigned long tx_cso; /* # of Tx checksum offloads */ + unsigned long vlan_ins; /* # of Tx VLAN insertions */ + unsigned long mapping_err; /* # of I/O MMU packet mapping errors */ +} ____cacheline_aligned_in_smp; + +struct sge_ofld_txq { /* state for an SGE offload Tx queue */ + struct sge_txq q; + struct adapter *adap; + struct sk_buff_head sendq; /* list of backpressured packets */ + struct tasklet_struct qresume_tsk; /* restarts the queue */ + u8 full; /* the Tx ring is full */ + unsigned long mapping_err; /* # of I/O MMU packet mapping errors */ +} ____cacheline_aligned_in_smp; + +struct sge_ctrl_txq { /* state for an SGE control Tx queue */ + struct sge_txq q; + struct adapter *adap; + struct sk_buff_head sendq; /* list of backpressured packets */ + struct tasklet_struct qresume_tsk; /* restarts the queue */ + u8 full; /* the Tx ring is full */ +} ____cacheline_aligned_in_smp; + +struct sge { + struct sge_eth_txq ethtxq[MAX_ETH_QSETS]; + struct sge_ofld_txq ofldtxq[MAX_OFLD_QSETS]; + struct sge_ctrl_txq ctrlq[MAX_CTRL_QUEUES]; + + struct sge_eth_rxq ethrxq[MAX_ETH_QSETS]; + struct sge_ofld_rxq ofldrxq[MAX_OFLD_QSETS]; + struct sge_ofld_rxq rdmarxq[MAX_RDMA_QUEUES]; + struct sge_ofld_rxq rdmaciq[MAX_RDMA_CIQS]; + struct sge_rspq fw_evtq ____cacheline_aligned_in_smp; + + struct sge_rspq intrq ____cacheline_aligned_in_smp; + spinlock_t intrq_lock; + + u16 max_ethqsets; /* # of available Ethernet queue sets */ + u16 ethqsets; /* # of active Ethernet queue sets */ + u16 ethtxq_rover; /* Tx queue to clean up next */ + u16 ofldqsets; /* # of active offload queue sets */ + u16 rdmaqs; /* # of available RDMA Rx queues */ + u16 rdmaciqs; /* # of available RDMA concentrator IQs */ + u16 ofld_rxq[MAX_OFLD_QSETS]; + u16 rdma_rxq[NCHAN]; + u16 rdma_ciq[NCHAN]; + u16 timer_val[SGE_NTIMERS]; + u8 counter_val[SGE_NCOUNTERS]; + u32 fl_pg_order; /* large page allocation size */ + u32 stat_len; /* length of status page at ring end */ + u32 pktshift; /* padding between CPL & packet data */ + u32 fl_align; /* response queue message alignment */ + u32 fl_starve_thres; /* Free List starvation threshold */ + + /* State variables for detecting an SGE Ingress DMA hang */ + unsigned int idma_1s_thresh;/* SGE same State Counter 1s threshold */ + unsigned int idma_stalled[2];/* SGE synthesized stalled timers in HZ */ + unsigned int idma_state[2]; /* SGE IDMA Hang detect state */ + unsigned int idma_qid[2]; /* SGE IDMA Hung Ingress Queue ID */ + + unsigned int egr_start; + unsigned int ingr_start; + void *egr_map[MAX_EGRQ]; /* qid->queue egress queue map */ + struct sge_rspq *ingr_map[MAX_INGQ]; /* qid->queue ingress queue map */ + DECLARE_BITMAP(starving_fl, MAX_EGRQ); + DECLARE_BITMAP(txq_maperr, MAX_EGRQ); + struct timer_list rx_timer; /* refills starving FLs */ + struct timer_list tx_timer; /* checks Tx queues */ +}; + +#define for_each_ethrxq(sge, i) for (i = 0; i < (sge)->ethqsets; i++) +#define for_each_ofldrxq(sge, i) for (i = 0; i < (sge)->ofldqsets; i++) +#define for_each_rdmarxq(sge, i) for (i = 0; i < (sge)->rdmaqs; i++) +#define for_each_rdmaciq(sge, i) for (i = 0; i < (sge)->rdmaciqs; i++) + +struct l2t_data; + +#ifdef CONFIG_PCI_IOV + +/* T4 supports SRIOV on PF0-3 and T5 on PF0-7. However, the Serial + * Configuration initialization for T5 only has SR-IOV functionality enabled + * on PF0-3 in order to simplify everything. + */ +#define NUM_OF_PF_WITH_SRIOV 4 + +#endif + +struct adapter { + void __iomem *regs; + void __iomem *bar2; + u32 t4_bar0; + struct pci_dev *pdev; + struct device *pdev_dev; + unsigned int mbox; + unsigned int fn; + unsigned int flags; + enum chip_type chip; + + int msg_enable; + + struct adapter_params params; + struct cxgb4_virt_res vres; + unsigned int swintr; + + unsigned int wol; + + struct { + unsigned short vec; + char desc[IFNAMSIZ + 10]; + } msix_info[MAX_INGQ + 1]; + + struct sge sge; + + struct net_device *port[MAX_NPORTS]; + u8 chan_map[NCHAN]; /* channel -> port map */ + + u32 filter_mode; + unsigned int l2t_start; + unsigned int l2t_end; + struct l2t_data *l2t; + void *uld_handle[CXGB4_ULD_MAX]; + struct list_head list_node; + struct list_head rcu_node; + + struct tid_info tids; + void **tid_release_head; + spinlock_t tid_release_lock; + struct workqueue_struct *workq; + struct work_struct tid_release_task; + struct work_struct db_full_task; + struct work_struct db_drop_task; + bool tid_release_task_busy; + + struct dentry *debugfs_root; + + spinlock_t stats_lock; + spinlock_t win0_lock ____cacheline_aligned_in_smp; +}; + +/* Defined bit width of user definable filter tuples + */ +#define ETHTYPE_BITWIDTH 16 +#define FRAG_BITWIDTH 1 +#define MACIDX_BITWIDTH 9 +#define FCOE_BITWIDTH 1 +#define IPORT_BITWIDTH 3 +#define MATCHTYPE_BITWIDTH 3 +#define PROTO_BITWIDTH 8 +#define TOS_BITWIDTH 8 +#define PF_BITWIDTH 8 +#define VF_BITWIDTH 8 +#define IVLAN_BITWIDTH 16 +#define OVLAN_BITWIDTH 16 + +/* Filter matching rules. These consist of a set of ingress packet field + * (value, mask) tuples. The associated ingress packet field matches the + * tuple when ((field & mask) == value). (Thus a wildcard "don't care" field + * rule can be constructed by specifying a tuple of (0, 0).) A filter rule + * matches an ingress packet when all of the individual individual field + * matching rules are true. + * + * Partial field masks are always valid, however, while it may be easy to + * understand their meanings for some fields (e.g. IP address to match a + * subnet), for others making sensible partial masks is less intuitive (e.g. + * MPS match type) ... + * + * Most of the following data structures are modeled on T4 capabilities. + * Drivers for earlier chips use the subsets which make sense for those chips. + * We really need to come up with a hardware-independent mechanism to + * represent hardware filter capabilities ... + */ +struct ch_filter_tuple { + /* Compressed header matching field rules. The TP_VLAN_PRI_MAP + * register selects which of these fields will participate in the + * filter match rules -- up to a maximum of 36 bits. Because + * TP_VLAN_PRI_MAP is a global register, all filters must use the same + * set of fields. + */ + uint32_t ethtype:ETHTYPE_BITWIDTH; /* Ethernet type */ + uint32_t frag:FRAG_BITWIDTH; /* IP fragmentation header */ + uint32_t ivlan_vld:1; /* inner VLAN valid */ + uint32_t ovlan_vld:1; /* outer VLAN valid */ + uint32_t pfvf_vld:1; /* PF/VF valid */ + uint32_t macidx:MACIDX_BITWIDTH; /* exact match MAC index */ + uint32_t fcoe:FCOE_BITWIDTH; /* FCoE packet */ + uint32_t iport:IPORT_BITWIDTH; /* ingress port */ + uint32_t matchtype:MATCHTYPE_BITWIDTH; /* MPS match type */ + uint32_t proto:PROTO_BITWIDTH; /* protocol type */ + uint32_t tos:TOS_BITWIDTH; /* TOS/Traffic Type */ + uint32_t pf:PF_BITWIDTH; /* PCI-E PF ID */ + uint32_t vf:VF_BITWIDTH; /* PCI-E VF ID */ + uint32_t ivlan:IVLAN_BITWIDTH; /* inner VLAN */ + uint32_t ovlan:OVLAN_BITWIDTH; /* outer VLAN */ + + /* Uncompressed header matching field rules. These are always + * available for field rules. + */ + uint8_t lip[16]; /* local IP address (IPv4 in [3:0]) */ + uint8_t fip[16]; /* foreign IP address (IPv4 in [3:0]) */ + uint16_t lport; /* local port */ + uint16_t fport; /* foreign port */ +}; + +/* A filter ioctl command. + */ +struct ch_filter_specification { + /* Administrative fields for filter. + */ + uint32_t hitcnts:1; /* count filter hits in TCB */ + uint32_t prio:1; /* filter has priority over active/server */ + + /* Fundamental filter typing. This is the one element of filter + * matching that doesn't exist as a (value, mask) tuple. + */ + uint32_t type:1; /* 0 => IPv4, 1 => IPv6 */ + + /* Packet dispatch information. Ingress packets which match the + * filter rules will be dropped, passed to the host or switched back + * out as egress packets. + */ + uint32_t action:2; /* drop, pass, switch */ + + uint32_t rpttid:1; /* report TID in RSS hash field */ + + uint32_t dirsteer:1; /* 0 => RSS, 1 => steer to iq */ + uint32_t iq:10; /* ingress queue */ + + uint32_t maskhash:1; /* dirsteer=0: store RSS hash in TCB */ + uint32_t dirsteerhash:1;/* dirsteer=1: 0 => TCB contains RSS hash */ + /* 1 => TCB contains IQ ID */ + + /* Switch proxy/rewrite fields. An ingress packet which matches a + * filter with "switch" set will be looped back out as an egress + * packet -- potentially with some Ethernet header rewriting. + */ + uint32_t eport:2; /* egress port to switch packet out */ + uint32_t newdmac:1; /* rewrite destination MAC address */ + uint32_t newsmac:1; /* rewrite source MAC address */ + uint32_t newvlan:2; /* rewrite VLAN Tag */ + uint8_t dmac[ETH_ALEN]; /* new destination MAC address */ + uint8_t smac[ETH_ALEN]; /* new source MAC address */ + uint16_t vlan; /* VLAN Tag to insert */ + + /* Filter rule value/mask pairs. + */ + struct ch_filter_tuple val; + struct ch_filter_tuple mask; +}; + +enum { + FILTER_PASS = 0, /* default */ + FILTER_DROP, + FILTER_SWITCH +}; + +enum { + VLAN_NOCHANGE = 0, /* default */ + VLAN_REMOVE, + VLAN_INSERT, + VLAN_REWRITE +}; + +static inline int is_t5(enum chip_type chip) +{ + return CHELSIO_CHIP_VERSION(chip) == CHELSIO_T5; +} + +static inline int is_t4(enum chip_type chip) +{ + return CHELSIO_CHIP_VERSION(chip) == CHELSIO_T4; +} + +static inline u32 t4_read_reg(struct adapter *adap, u32 reg_addr) +{ + return readl(adap->regs + reg_addr); +} + +static inline void t4_write_reg(struct adapter *adap, u32 reg_addr, u32 val) +{ + writel(val, adap->regs + reg_addr); +} + +#ifndef readq +static inline u64 readq(const volatile void __iomem *addr) +{ + return readl(addr) + ((u64)readl(addr + 4) << 32); +} + +static inline void writeq(u64 val, volatile void __iomem *addr) +{ + writel(val, addr); + writel(val >> 32, addr + 4); +} +#endif + +static inline u64 t4_read_reg64(struct adapter *adap, u32 reg_addr) +{ + return readq(adap->regs + reg_addr); +} + +static inline void t4_write_reg64(struct adapter *adap, u32 reg_addr, u64 val) +{ + writeq(val, adap->regs + reg_addr); +} + +/** + * netdev2pinfo - return the port_info structure associated with a net_device + * @dev: the netdev + * + * Return the struct port_info associated with a net_device + */ +static inline struct port_info *netdev2pinfo(const struct net_device *dev) +{ + return netdev_priv(dev); +} + +/** + * adap2pinfo - return the port_info of a port + * @adap: the adapter + * @idx: the port index + * + * Return the port_info structure for the port of the given index. + */ +static inline struct port_info *adap2pinfo(struct adapter *adap, int idx) +{ + return netdev_priv(adap->port[idx]); +} + +/** + * netdev2adap - return the adapter structure associated with a net_device + * @dev: the netdev + * + * Return the struct adapter associated with a net_device + */ +static inline struct adapter *netdev2adap(const struct net_device *dev) +{ + return netdev2pinfo(dev)->adapter; +} + +void t4_os_portmod_changed(const struct adapter *adap, int port_id); +void t4_os_link_changed(struct adapter *adap, int port_id, int link_stat); + +void *t4_alloc_mem(size_t size); + +void t4_free_sge_resources(struct adapter *adap); +void t4_free_ofld_rxqs(struct adapter *adap, int n, struct sge_ofld_rxq *q); +irq_handler_t t4_intr_handler(struct adapter *adap); +netdev_tx_t t4_eth_xmit(struct sk_buff *skb, struct net_device *dev); +int t4_ethrx_handler(struct sge_rspq *q, const __be64 *rsp, + const struct pkt_gl *gl); +int t4_mgmt_tx(struct adapter *adap, struct sk_buff *skb); +int t4_ofld_send(struct adapter *adap, struct sk_buff *skb); +int t4_sge_alloc_rxq(struct adapter *adap, struct sge_rspq *iq, bool fwevtq, + struct net_device *dev, int intr_idx, + struct sge_fl *fl, rspq_handler_t hnd); +int t4_sge_alloc_eth_txq(struct adapter *adap, struct sge_eth_txq *txq, + struct net_device *dev, struct netdev_queue *netdevq, + unsigned int iqid); +int t4_sge_alloc_ctrl_txq(struct adapter *adap, struct sge_ctrl_txq *txq, + struct net_device *dev, unsigned int iqid, + unsigned int cmplqid); +int t4_sge_alloc_ofld_txq(struct adapter *adap, struct sge_ofld_txq *txq, + struct net_device *dev, unsigned int iqid); +irqreturn_t t4_sge_intr_msix(int irq, void *cookie); +int t4_sge_init(struct adapter *adap); +void t4_sge_start(struct adapter *adap); +void t4_sge_stop(struct adapter *adap); +extern int dbfifo_int_thresh; + +#define for_each_port(adapter, iter) \ + for (iter = 0; iter < (adapter)->params.nports; ++iter) + +static inline int is_bypass(struct adapter *adap) +{ + return adap->params.bypass; +} + +static inline int is_bypass_device(int device) +{ + /* this should be set based upon device capabilities */ + switch (device) { + case 0x440b: + case 0x440c: + return 1; + default: + return 0; + } +} + +static inline unsigned int core_ticks_per_usec(const struct adapter *adap) +{ + return adap->params.vpd.cclk / 1000; +} + +static inline unsigned int us_to_core_ticks(const struct adapter *adap, + unsigned int us) +{ + return (us * adap->params.vpd.cclk) / 1000; +} + +static inline unsigned int core_ticks_to_us(const struct adapter *adapter, + unsigned int ticks) +{ + /* add Core Clock / 2 to round ticks to nearest uS */ + return ((ticks * 1000 + adapter->params.vpd.cclk/2) / + adapter->params.vpd.cclk); +} + +void t4_set_reg_field(struct adapter *adap, unsigned int addr, u32 mask, + u32 val); + +int t4_wr_mbox_meat(struct adapter *adap, int mbox, const void *cmd, int size, + void *rpl, bool sleep_ok); + +static inline int t4_wr_mbox(struct adapter *adap, int mbox, const void *cmd, + int size, void *rpl) +{ + return t4_wr_mbox_meat(adap, mbox, cmd, size, rpl, true); +} + +static inline int t4_wr_mbox_ns(struct adapter *adap, int mbox, const void *cmd, + int size, void *rpl) +{ + return t4_wr_mbox_meat(adap, mbox, cmd, size, rpl, false); +} + +void t4_write_indirect(struct adapter *adap, unsigned int addr_reg, + unsigned int data_reg, const u32 *vals, + unsigned int nregs, unsigned int start_idx); +void t4_read_indirect(struct adapter *adap, unsigned int addr_reg, + unsigned int data_reg, u32 *vals, unsigned int nregs, + unsigned int start_idx); +void t4_hw_pci_read_cfg4(struct adapter *adapter, int reg, u32 *val); + +struct fw_filter_wr; + +void t4_intr_enable(struct adapter *adapter); +void t4_intr_disable(struct adapter *adapter); +int t4_slow_intr_handler(struct adapter *adapter); + +int t4_wait_dev_ready(void __iomem *regs); +int t4_link_start(struct adapter *adap, unsigned int mbox, unsigned int port, + struct link_config *lc); +int t4_restart_aneg(struct adapter *adap, unsigned int mbox, unsigned int port); + +#define T4_MEMORY_WRITE 0 +#define T4_MEMORY_READ 1 +int t4_memory_rw(struct adapter *adap, int win, int mtype, u32 addr, u32 len, + __be32 *buf, int dir); +static inline int t4_memory_write(struct adapter *adap, int mtype, u32 addr, + u32 len, __be32 *buf) +{ + return t4_memory_rw(adap, 0, mtype, addr, len, buf, 0); +} + +int t4_seeprom_wp(struct adapter *adapter, bool enable); +int get_vpd_params(struct adapter *adapter, struct vpd_params *p); +int t4_load_fw(struct adapter *adapter, const u8 *fw_data, unsigned int size); +int t4_fw_upgrade(struct adapter *adap, unsigned int mbox, + const u8 *fw_data, unsigned int size, int force); +unsigned int t4_flash_cfg_addr(struct adapter *adapter); +int t4_get_fw_version(struct adapter *adapter, u32 *vers); +int t4_get_tp_version(struct adapter *adapter, u32 *vers); +int t4_prep_fw(struct adapter *adap, struct fw_info *fw_info, + const u8 *fw_data, unsigned int fw_size, + struct fw_hdr *card_fw, enum dev_state state, int *reset); +int t4_prep_adapter(struct adapter *adapter); +int t4_init_tp_params(struct adapter *adap); +int t4_filter_field_shift(const struct adapter *adap, int filter_sel); +int t4_port_init(struct adapter *adap, int mbox, int pf, int vf); +void t4_fatal_err(struct adapter *adapter); +int t4_config_rss_range(struct adapter *adapter, int mbox, unsigned int viid, + int start, int n, const u16 *rspq, unsigned int nrspq); +int t4_config_glbl_rss(struct adapter *adapter, int mbox, unsigned int mode, + unsigned int flags); +int t4_mc_read(struct adapter *adap, int idx, u32 addr, __be32 *data, + u64 *parity); +int t4_edc_read(struct adapter *adap, int idx, u32 addr, __be32 *data, + u64 *parity); +const char *t4_get_port_type_description(enum fw_port_type port_type); +void t4_get_port_stats(struct adapter *adap, int idx, struct port_stats *p); +void t4_read_mtu_tbl(struct adapter *adap, u16 *mtus, u8 *mtu_log); +void t4_tp_wr_bits_indirect(struct adapter *adap, unsigned int addr, + unsigned int mask, unsigned int val); +void t4_tp_get_tcp_stats(struct adapter *adap, struct tp_tcp_stats *v4, + struct tp_tcp_stats *v6); +void t4_load_mtus(struct adapter *adap, const unsigned short *mtus, + const unsigned short *alpha, const unsigned short *beta); + +void t4_mk_filtdelwr(unsigned int ftid, struct fw_filter_wr *wr, int qid); + +void t4_wol_magic_enable(struct adapter *adap, unsigned int port, + const u8 *addr); +int t4_wol_pat_enable(struct adapter *adap, unsigned int port, unsigned int map, + u64 mask0, u64 mask1, unsigned int crc, bool enable); + +int t4_fw_hello(struct adapter *adap, unsigned int mbox, unsigned int evt_mbox, + enum dev_master master, enum dev_state *state); +int t4_fw_bye(struct adapter *adap, unsigned int mbox); +int t4_early_init(struct adapter *adap, unsigned int mbox); +int t4_fw_reset(struct adapter *adap, unsigned int mbox, int reset); +int t4_fixup_host_params(struct adapter *adap, unsigned int page_size, + unsigned int cache_line_size); +int t4_fw_initialize(struct adapter *adap, unsigned int mbox); +int t4_query_params(struct adapter *adap, unsigned int mbox, unsigned int pf, + unsigned int vf, unsigned int nparams, const u32 *params, + u32 *val); +int t4_set_params(struct adapter *adap, unsigned int mbox, unsigned int pf, + unsigned int vf, unsigned int nparams, const u32 *params, + const u32 *val); +int t4_set_params_nosleep(struct adapter *adap, unsigned int mbox, + unsigned int pf, unsigned int vf, + unsigned int nparams, const u32 *params, + const u32 *val); +int t4_cfg_pfvf(struct adapter *adap, unsigned int mbox, unsigned int pf, + unsigned int vf, unsigned int txq, unsigned int txq_eth_ctrl, + unsigned int rxqi, unsigned int rxq, unsigned int tc, + unsigned int vi, unsigned int cmask, unsigned int pmask, + unsigned int nexact, unsigned int rcaps, unsigned int wxcaps); +int t4_alloc_vi(struct adapter *adap, unsigned int mbox, unsigned int port, + unsigned int pf, unsigned int vf, unsigned int nmac, u8 *mac, + unsigned int *rss_size); +int t4_set_rxmode(struct adapter *adap, unsigned int mbox, unsigned int viid, + int mtu, int promisc, int all_multi, int bcast, int vlanex, + bool sleep_ok); +int t4_alloc_mac_filt(struct adapter *adap, unsigned int mbox, + unsigned int viid, bool free, unsigned int naddr, + const u8 **addr, u16 *idx, u64 *hash, bool sleep_ok); +int t4_change_mac(struct adapter *adap, unsigned int mbox, unsigned int viid, + int idx, const u8 *addr, bool persist, bool add_smt); +int t4_set_addr_hash(struct adapter *adap, unsigned int mbox, unsigned int viid, + bool ucast, u64 vec, bool sleep_ok); +int t4_enable_vi_params(struct adapter *adap, unsigned int mbox, + unsigned int viid, bool rx_en, bool tx_en, bool dcb_en); +int t4_enable_vi(struct adapter *adap, unsigned int mbox, unsigned int viid, + bool rx_en, bool tx_en); +int t4_identify_port(struct adapter *adap, unsigned int mbox, unsigned int viid, + unsigned int nblinks); +int t4_mdio_rd(struct adapter *adap, unsigned int mbox, unsigned int phy_addr, + unsigned int mmd, unsigned int reg, u16 *valp); +int t4_mdio_wr(struct adapter *adap, unsigned int mbox, unsigned int phy_addr, + unsigned int mmd, unsigned int reg, u16 val); +int t4_iq_free(struct adapter *adap, unsigned int mbox, unsigned int pf, + unsigned int vf, unsigned int iqtype, unsigned int iqid, + unsigned int fl0id, unsigned int fl1id); +int t4_eth_eq_free(struct adapter *adap, unsigned int mbox, unsigned int pf, + unsigned int vf, unsigned int eqid); +int t4_ctrl_eq_free(struct adapter *adap, unsigned int mbox, unsigned int pf, + unsigned int vf, unsigned int eqid); +int t4_ofld_eq_free(struct adapter *adap, unsigned int mbox, unsigned int pf, + unsigned int vf, unsigned int eqid); +int t4_handle_fw_rpl(struct adapter *adap, const __be64 *rpl); +void t4_db_full(struct adapter *adapter); +void t4_db_dropped(struct adapter *adapter); +int t4_fwaddrspace_write(struct adapter *adap, unsigned int mbox, + u32 addr, u32 val); +void t4_sge_decode_idma_state(struct adapter *adapter, int state); +#endif /* __CXGB4_H__ */ diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_dcb.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_dcb.c new file mode 100644 index 0000000..4fe3360 --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_dcb.c @@ -0,0 +1,1149 @@ +/* + * Copyright (C) 2013-2014 Chelsio Communications. All rights reserved. + * + * Written by Anish Bhatt (anish@chelsio.com) + * Casey Leedom (leedom@chelsio.com) + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * The full GNU General Public License is included in this distribution in + * the file called "COPYING". + * + */ + +#include "cxgb4.h" + +/* DCBx version control + */ +char *dcb_ver_array[] = { + "Unknown", + "DCBx-CIN", + "DCBx-CEE 1.01", + "DCBx-IEEE", + "", "", "", + "Auto Negotiated" +}; + +/* Initialize a port's Data Center Bridging state. Typically used after a + * Link Down event. + */ +void cxgb4_dcb_state_init(struct net_device *dev) +{ + struct port_info *pi = netdev2pinfo(dev); + struct port_dcb_info *dcb = &pi->dcb; + int version_temp = dcb->dcb_version; + + memset(dcb, 0, sizeof(struct port_dcb_info)); + dcb->state = CXGB4_DCB_STATE_START; + if (version_temp) + dcb->dcb_version = version_temp; + + netdev_dbg(dev, "%s: Initializing DCB state for port[%d]\n", + __func__, pi->port_id); +} + +void cxgb4_dcb_version_init(struct net_device *dev) +{ + struct port_info *pi = netdev2pinfo(dev); + struct port_dcb_info *dcb = &pi->dcb; + + /* Any writes here are only done on kernels that exlicitly need + * a specific version, say < 2.6.38 which only support CEE + */ + dcb->dcb_version = FW_PORT_DCB_VER_AUTO; +} + +static void cxgb4_dcb_cleanup_apps(struct net_device *dev) +{ + struct port_info *pi = netdev2pinfo(dev); + struct adapter *adap = pi->adapter; + struct port_dcb_info *dcb = &pi->dcb; + struct dcb_app app; + int i, err; + + /* zero priority implies remove */ + app.priority = 0; + + for (i = 0; i < CXGB4_MAX_DCBX_APP_SUPPORTED; i++) { + /* Check if app list is exhausted */ + if (!dcb->app_priority[i].protocolid) + break; + + app.protocol = dcb->app_priority[i].protocolid; + + if (dcb->dcb_version == FW_PORT_DCB_VER_IEEE) { + app.priority = dcb->app_priority[i].user_prio_map; + app.selector = dcb->app_priority[i].sel_field + 1; + err = dcb_ieee_delapp(dev, &app); + } else { + app.selector = !!(dcb->app_priority[i].sel_field); + err = dcb_setapp(dev, &app); + } + + if (err) { + dev_err(adap->pdev_dev, + "Failed DCB Clear %s Application Priority: sel=%d, prot=%d, , err=%d\n", + dcb_ver_array[dcb->dcb_version], app.selector, + app.protocol, -err); + break; + } + } +} + +/* Finite State machine for Data Center Bridging. + */ +void cxgb4_dcb_state_fsm(struct net_device *dev, + enum cxgb4_dcb_state_input transition_to) +{ + struct port_info *pi = netdev2pinfo(dev); + struct port_dcb_info *dcb = &pi->dcb; + struct adapter *adap = pi->adapter; + enum cxgb4_dcb_state current_state = dcb->state; + + netdev_dbg(dev, "%s: State change from %d to %d for %s\n", + __func__, dcb->state, transition_to, dev->name); + + switch (current_state) { + case CXGB4_DCB_STATE_START: { + switch (transition_to) { + case CXGB4_DCB_INPUT_FW_DISABLED: { + /* we're going to use Host DCB */ + dcb->state = CXGB4_DCB_STATE_HOST; + dcb->supported = CXGB4_DCBX_HOST_SUPPORT; + break; + } + + case CXGB4_DCB_INPUT_FW_ENABLED: { + /* we're going to use Firmware DCB */ + dcb->state = CXGB4_DCB_STATE_FW_INCOMPLETE; + dcb->supported = DCB_CAP_DCBX_LLD_MANAGED; + if (dcb->dcb_version == FW_PORT_DCB_VER_IEEE) + dcb->supported |= DCB_CAP_DCBX_VER_IEEE; + else + dcb->supported |= DCB_CAP_DCBX_VER_CEE; + break; + } + + case CXGB4_DCB_INPUT_FW_INCOMPLETE: { + /* expected transition */ + break; + } + + case CXGB4_DCB_INPUT_FW_ALLSYNCED: { + dcb->state = CXGB4_DCB_STATE_FW_ALLSYNCED; + break; + } + + default: + goto bad_state_input; + } + break; + } + + case CXGB4_DCB_STATE_FW_INCOMPLETE: { + switch (transition_to) { + case CXGB4_DCB_INPUT_FW_ENABLED: { + /* we're alreaady in firmware DCB mode */ + break; + } + + case CXGB4_DCB_INPUT_FW_INCOMPLETE: { + /* we're already incomplete */ + break; + } + + case CXGB4_DCB_INPUT_FW_ALLSYNCED: { + dcb->state = CXGB4_DCB_STATE_FW_ALLSYNCED; + dcb->enabled = 1; + linkwatch_fire_event(dev); + break; + } + + default: + goto bad_state_input; + } + break; + } + + case CXGB4_DCB_STATE_FW_ALLSYNCED: { + switch (transition_to) { + case CXGB4_DCB_INPUT_FW_ENABLED: { + /* we're alreaady in firmware DCB mode */ + break; + } + + case CXGB4_DCB_INPUT_FW_INCOMPLETE: { + /* We were successfully running with firmware DCB but + * now it's telling us that it's in an "incomplete + * state. We need to reset back to a ground state + * of incomplete. + */ + cxgb4_dcb_cleanup_apps(dev); + cxgb4_dcb_state_init(dev); + dcb->state = CXGB4_DCB_STATE_FW_INCOMPLETE; + dcb->supported = CXGB4_DCBX_FW_SUPPORT; + linkwatch_fire_event(dev); + break; + } + + case CXGB4_DCB_INPUT_FW_ALLSYNCED: { + /* we're already all sync'ed + * this is only applicable for IEEE or + * when another VI already completed negotiaton + */ + dcb->enabled = 1; + linkwatch_fire_event(dev); + break; + } + + default: + goto bad_state_input; + } + break; + } + + case CXGB4_DCB_STATE_HOST: { + switch (transition_to) { + case CXGB4_DCB_INPUT_FW_DISABLED: { + /* we're alreaady in Host DCB mode */ + break; + } + + default: + goto bad_state_input; + } + break; + } + + default: + goto bad_state_transition; + } + return; + +bad_state_input: + dev_err(adap->pdev_dev, "cxgb4_dcb_state_fsm: illegal input symbol %d\n", + transition_to); + return; + +bad_state_transition: + dev_err(adap->pdev_dev, "cxgb4_dcb_state_fsm: bad state transition, state = %d, input = %d\n", + current_state, transition_to); +} + +/* Handle a DCB/DCBX update message from the firmware. + */ +void cxgb4_dcb_handle_fw_update(struct adapter *adap, + const struct fw_port_cmd *pcmd) +{ + const union fw_port_dcb *fwdcb = &pcmd->u.dcb; + int port = FW_PORT_CMD_PORTID_GET(be32_to_cpu(pcmd->op_to_portid)); + struct net_device *dev = adap->port[port]; + struct port_info *pi = netdev_priv(dev); + struct port_dcb_info *dcb = &pi->dcb; + int dcb_type = pcmd->u.dcb.pgid.type; + int dcb_running_version; + + /* Handle Firmware DCB Control messages separately since they drive + * our state machine. + */ + if (dcb_type == FW_PORT_DCB_TYPE_CONTROL) { + enum cxgb4_dcb_state_input input = + ((pcmd->u.dcb.control.all_syncd_pkd & + FW_PORT_CMD_ALL_SYNCD) + ? CXGB4_DCB_STATE_FW_ALLSYNCED + : CXGB4_DCB_STATE_FW_INCOMPLETE); + + if (dcb->dcb_version != FW_PORT_DCB_VER_UNKNOWN) { + dcb_running_version = FW_PORT_CMD_DCB_VERSION_GET( + be16_to_cpu( + pcmd->u.dcb.control.dcb_version_to_app_state)); + if (dcb_running_version == FW_PORT_DCB_VER_CEE1D01 || + dcb_running_version == FW_PORT_DCB_VER_IEEE) { + dcb->dcb_version = dcb_running_version; + dev_warn(adap->pdev_dev, "Interface %s is running %s\n", + dev->name, + dcb_ver_array[dcb->dcb_version]); + } else { + dev_warn(adap->pdev_dev, + "Something screwed up, requested firmware for %s, but firmware returned %s instead\n", + dcb_ver_array[dcb->dcb_version], + dcb_ver_array[dcb_running_version]); + dcb->dcb_version = FW_PORT_DCB_VER_UNKNOWN; + } + } + + cxgb4_dcb_state_fsm(dev, input); + return; + } + + /* It's weird, and almost certainly an error, to get Firmware DCB + * messages when we either haven't been told whether we're going to be + * doing Host or Firmware DCB; and even worse when we've been told + * that we're doing Host DCB! + */ + if (dcb->state == CXGB4_DCB_STATE_START || + dcb->state == CXGB4_DCB_STATE_HOST) { + dev_err(adap->pdev_dev, "Receiving Firmware DCB messages in State %d\n", + dcb->state); + return; + } + + /* Now handle the general Firmware DCB update messages ... + */ + switch (dcb_type) { + case FW_PORT_DCB_TYPE_PGID: + dcb->pgid = be32_to_cpu(fwdcb->pgid.pgid); + dcb->msgs |= CXGB4_DCB_FW_PGID; + break; + + case FW_PORT_DCB_TYPE_PGRATE: + dcb->pg_num_tcs_supported = fwdcb->pgrate.num_tcs_supported; + memcpy(dcb->pgrate, &fwdcb->pgrate.pgrate, + sizeof(dcb->pgrate)); + memcpy(dcb->tsa, &fwdcb->pgrate.tsa, + sizeof(dcb->tsa)); + dcb->msgs |= CXGB4_DCB_FW_PGRATE; + if (dcb->msgs & CXGB4_DCB_FW_PGID) + IEEE_FAUX_SYNC(dev, dcb); + break; + + case FW_PORT_DCB_TYPE_PRIORATE: + memcpy(dcb->priorate, &fwdcb->priorate.strict_priorate, + sizeof(dcb->priorate)); + dcb->msgs |= CXGB4_DCB_FW_PRIORATE; + break; + + case FW_PORT_DCB_TYPE_PFC: + dcb->pfcen = fwdcb->pfc.pfcen; + dcb->pfc_num_tcs_supported = fwdcb->pfc.max_pfc_tcs; + dcb->msgs |= CXGB4_DCB_FW_PFC; + IEEE_FAUX_SYNC(dev, dcb); + break; + + case FW_PORT_DCB_TYPE_APP_ID: { + const struct fw_port_app_priority *fwap = &fwdcb->app_priority; + int idx = fwap->idx; + struct app_priority *ap = &dcb->app_priority[idx]; + + struct dcb_app app = { + .protocol = be16_to_cpu(fwap->protocolid), + }; + int err; + + /* Convert from firmware format to relevant format + * when using app selector + */ + if (dcb->dcb_version == FW_PORT_DCB_VER_IEEE) { + app.selector = (fwap->sel_field + 1); + app.priority = ffs(fwap->user_prio_map) - 1; + err = dcb_ieee_setapp(dev, &app); + IEEE_FAUX_SYNC(dev, dcb); + } else { + /* Default is CEE */ + app.selector = !!(fwap->sel_field); + app.priority = fwap->user_prio_map; + err = dcb_setapp(dev, &app); + } + + if (err) + dev_err(adap->pdev_dev, + "Failed DCB Set Application Priority: sel=%d, prot=%d, prio=%d, err=%d\n", + app.selector, app.protocol, app.priority, -err); + + ap->user_prio_map = fwap->user_prio_map; + ap->sel_field = fwap->sel_field; + ap->protocolid = be16_to_cpu(fwap->protocolid); + dcb->msgs |= CXGB4_DCB_FW_APP_ID; + break; + } + + default: + dev_err(adap->pdev_dev, "Unknown DCB update type received %x\n", + dcb_type); + break; + } +} + +/* Data Center Bridging netlink operations. + */ + + +/* Get current DCB enabled/disabled state. + */ +static u8 cxgb4_getstate(struct net_device *dev) +{ + struct port_info *pi = netdev2pinfo(dev); + + return pi->dcb.enabled; +} + +/* Set DCB enabled/disabled. + */ +static u8 cxgb4_setstate(struct net_device *dev, u8 enabled) +{ + struct port_info *pi = netdev2pinfo(dev); + + /* If DCBx is host-managed, dcb is enabled by outside lldp agents */ + if (pi->dcb.state == CXGB4_DCB_STATE_HOST) { + pi->dcb.enabled = enabled; + return 0; + } + + /* Firmware doesn't provide any mechanism to control the DCB state. + */ + if (enabled != (pi->dcb.state == CXGB4_DCB_STATE_FW_ALLSYNCED)) + return 1; + + return 0; +} + +static void cxgb4_getpgtccfg(struct net_device *dev, int tc, + u8 *prio_type, u8 *pgid, u8 *bw_per, + u8 *up_tc_map, int local) +{ + struct fw_port_cmd pcmd; + struct port_info *pi = netdev2pinfo(dev); + struct adapter *adap = pi->adapter; + int err; + + *prio_type = *pgid = *bw_per = *up_tc_map = 0; + + if (local) + INIT_PORT_DCB_READ_LOCAL_CMD(pcmd, pi->port_id); + else + INIT_PORT_DCB_READ_PEER_CMD(pcmd, pi->port_id); + + pcmd.u.dcb.pgid.type = FW_PORT_DCB_TYPE_PGID; + err = t4_wr_mbox(adap, adap->mbox, &pcmd, sizeof(pcmd), &pcmd); + if (err != FW_PORT_DCB_CFG_SUCCESS) { + dev_err(adap->pdev_dev, "DCB read PGID failed with %d\n", -err); + return; + } + *pgid = (be32_to_cpu(pcmd.u.dcb.pgid.pgid) >> (tc * 4)) & 0xf; + + INIT_PORT_DCB_READ_PEER_CMD(pcmd, pi->port_id); + pcmd.u.dcb.pgrate.type = FW_PORT_DCB_TYPE_PGRATE; + err = t4_wr_mbox(adap, adap->mbox, &pcmd, sizeof(pcmd), &pcmd); + if (err != FW_PORT_DCB_CFG_SUCCESS) { + dev_err(adap->pdev_dev, "DCB read PGRATE failed with %d\n", + -err); + return; + } + + *bw_per = pcmd.u.dcb.pgrate.pgrate[*pgid]; + *up_tc_map = (1 << tc); + + /* prio_type is link strict */ + if (*pgid != 0xF) + *prio_type = 0x2; +} + +static void cxgb4_getpgtccfg_tx(struct net_device *dev, int tc, + u8 *prio_type, u8 *pgid, u8 *bw_per, + u8 *up_tc_map) +{ + /* tc 0 is written at MSB position */ + return cxgb4_getpgtccfg(dev, (7 - tc), prio_type, pgid, bw_per, + up_tc_map, 1); +} + + +static void cxgb4_getpgtccfg_rx(struct net_device *dev, int tc, + u8 *prio_type, u8 *pgid, u8 *bw_per, + u8 *up_tc_map) +{ + /* tc 0 is written at MSB position */ + return cxgb4_getpgtccfg(dev, (7 - tc), prio_type, pgid, bw_per, + up_tc_map, 0); +} + +static void cxgb4_setpgtccfg_tx(struct net_device *dev, int tc, + u8 prio_type, u8 pgid, u8 bw_per, + u8 up_tc_map) +{ + struct fw_port_cmd pcmd; + struct port_info *pi = netdev2pinfo(dev); + struct adapter *adap = pi->adapter; + int fw_tc = 7 - tc; + u32 _pgid; + int err; + + if (pgid == DCB_ATTR_VALUE_UNDEFINED) + return; + if (bw_per == DCB_ATTR_VALUE_UNDEFINED) + return; + + INIT_PORT_DCB_READ_LOCAL_CMD(pcmd, pi->port_id); + pcmd.u.dcb.pgid.type = FW_PORT_DCB_TYPE_PGID; + + err = t4_wr_mbox(adap, adap->mbox, &pcmd, sizeof(pcmd), &pcmd); + if (err != FW_PORT_DCB_CFG_SUCCESS) { + dev_err(adap->pdev_dev, "DCB read PGID failed with %d\n", -err); + return; + } + + _pgid = be32_to_cpu(pcmd.u.dcb.pgid.pgid); + _pgid &= ~(0xF << (fw_tc * 4)); + _pgid |= pgid << (fw_tc * 4); + pcmd.u.dcb.pgid.pgid = cpu_to_be32(_pgid); + + INIT_PORT_DCB_WRITE_CMD(pcmd, pi->port_id); + + err = t4_wr_mbox(adap, adap->mbox, &pcmd, sizeof(pcmd), &pcmd); + if (err != FW_PORT_DCB_CFG_SUCCESS) { + dev_err(adap->pdev_dev, "DCB write PGID failed with %d\n", + -err); + return; + } + + memset(&pcmd, 0, sizeof(struct fw_port_cmd)); + + INIT_PORT_DCB_READ_LOCAL_CMD(pcmd, pi->port_id); + pcmd.u.dcb.pgrate.type = FW_PORT_DCB_TYPE_PGRATE; + + err = t4_wr_mbox(adap, adap->mbox, &pcmd, sizeof(pcmd), &pcmd); + if (err != FW_PORT_DCB_CFG_SUCCESS) { + dev_err(adap->pdev_dev, "DCB read PGRATE failed with %d\n", + -err); + return; + } + + pcmd.u.dcb.pgrate.pgrate[pgid] = bw_per; + + INIT_PORT_DCB_WRITE_CMD(pcmd, pi->port_id); + if (pi->dcb.state == CXGB4_DCB_STATE_HOST) + pcmd.op_to_portid |= cpu_to_be32(FW_PORT_CMD_APPLY); + + err = t4_wr_mbox(adap, adap->mbox, &pcmd, sizeof(pcmd), &pcmd); + if (err != FW_PORT_DCB_CFG_SUCCESS) + dev_err(adap->pdev_dev, "DCB write PGRATE failed with %d\n", + -err); +} + +static void cxgb4_getpgbwgcfg(struct net_device *dev, int pgid, u8 *bw_per, + int local) +{ + struct fw_port_cmd pcmd; + struct port_info *pi = netdev2pinfo(dev); + struct adapter *adap = pi->adapter; + int err; + + if (local) + INIT_PORT_DCB_READ_LOCAL_CMD(pcmd, pi->port_id); + else + INIT_PORT_DCB_READ_PEER_CMD(pcmd, pi->port_id); + + pcmd.u.dcb.pgrate.type = FW_PORT_DCB_TYPE_PGRATE; + err = t4_wr_mbox(adap, adap->mbox, &pcmd, sizeof(pcmd), &pcmd); + if (err != FW_PORT_DCB_CFG_SUCCESS) { + dev_err(adap->pdev_dev, "DCB read PGRATE failed with %d\n", + -err); + return; + } + + *bw_per = pcmd.u.dcb.pgrate.pgrate[pgid]; +} + +static void cxgb4_getpgbwgcfg_tx(struct net_device *dev, int pgid, u8 *bw_per) +{ + return cxgb4_getpgbwgcfg(dev, pgid, bw_per, 1); +} + +static void cxgb4_getpgbwgcfg_rx(struct net_device *dev, int pgid, u8 *bw_per) +{ + return cxgb4_getpgbwgcfg(dev, pgid, bw_per, 0); +} + +static void cxgb4_setpgbwgcfg_tx(struct net_device *dev, int pgid, + u8 bw_per) +{ + struct fw_port_cmd pcmd; + struct port_info *pi = netdev2pinfo(dev); + struct adapter *adap = pi->adapter; + int err; + + INIT_PORT_DCB_READ_LOCAL_CMD(pcmd, pi->port_id); + pcmd.u.dcb.pgrate.type = FW_PORT_DCB_TYPE_PGRATE; + + err = t4_wr_mbox(adap, adap->mbox, &pcmd, sizeof(pcmd), &pcmd); + if (err != FW_PORT_DCB_CFG_SUCCESS) { + dev_err(adap->pdev_dev, "DCB read PGRATE failed with %d\n", + -err); + return; + } + + pcmd.u.dcb.pgrate.pgrate[pgid] = bw_per; + + INIT_PORT_DCB_WRITE_CMD(pcmd, pi->port_id); + if (pi->dcb.state == CXGB4_DCB_STATE_HOST) + pcmd.op_to_portid |= cpu_to_be32(FW_PORT_CMD_APPLY); + + err = t4_wr_mbox(adap, adap->mbox, &pcmd, sizeof(pcmd), &pcmd); + + if (err != FW_PORT_DCB_CFG_SUCCESS) + dev_err(adap->pdev_dev, "DCB write PGRATE failed with %d\n", + -err); +} + +/* Return whether the specified Traffic Class Priority has Priority Pause + * Frames enabled. + */ +static void cxgb4_getpfccfg(struct net_device *dev, int priority, u8 *pfccfg) +{ + struct port_info *pi = netdev2pinfo(dev); + struct port_dcb_info *dcb = &pi->dcb; + + if (dcb->state != CXGB4_DCB_STATE_FW_ALLSYNCED || + priority >= CXGB4_MAX_PRIORITY) + *pfccfg = 0; + else + *pfccfg = (pi->dcb.pfcen >> (7 - priority)) & 1; +} + +/* Enable/disable Priority Pause Frames for the specified Traffic Class + * Priority. + */ +static void cxgb4_setpfccfg(struct net_device *dev, int priority, u8 pfccfg) +{ + struct fw_port_cmd pcmd; + struct port_info *pi = netdev2pinfo(dev); + struct adapter *adap = pi->adapter; + int err; + + if (pi->dcb.state != CXGB4_DCB_STATE_FW_ALLSYNCED || + priority >= CXGB4_MAX_PRIORITY) + return; + + INIT_PORT_DCB_WRITE_CMD(pcmd, pi->port_id); + if (pi->dcb.state == CXGB4_DCB_STATE_HOST) + pcmd.op_to_portid |= cpu_to_be32(FW_PORT_CMD_APPLY); + + pcmd.u.dcb.pfc.type = FW_PORT_DCB_TYPE_PFC; + pcmd.u.dcb.pfc.pfcen = pi->dcb.pfcen; + + if (pfccfg) + pcmd.u.dcb.pfc.pfcen |= (1 << (7 - priority)); + else + pcmd.u.dcb.pfc.pfcen &= (~(1 << (7 - priority))); + + err = t4_wr_mbox(adap, adap->mbox, &pcmd, sizeof(pcmd), &pcmd); + if (err != FW_PORT_DCB_CFG_SUCCESS) { + dev_err(adap->pdev_dev, "DCB PFC write failed with %d\n", -err); + return; + } + + pi->dcb.pfcen = pcmd.u.dcb.pfc.pfcen; +} + +static u8 cxgb4_setall(struct net_device *dev) +{ + return 0; +} + +/* Return DCB capabilities. + */ +static u8 cxgb4_getcap(struct net_device *dev, int cap_id, u8 *caps) +{ + struct port_info *pi = netdev2pinfo(dev); + + switch (cap_id) { + case DCB_CAP_ATTR_PG: + case DCB_CAP_ATTR_PFC: + *caps = true; + break; + + case DCB_CAP_ATTR_PG_TCS: + /* 8 priorities for PG represented by bitmap */ + *caps = 0x80; + break; + + case DCB_CAP_ATTR_PFC_TCS: + /* 8 priorities for PFC represented by bitmap */ + *caps = 0x80; + break; + + case DCB_CAP_ATTR_GSP: + *caps = true; + break; + + case DCB_CAP_ATTR_UP2TC: + case DCB_CAP_ATTR_BCN: + *caps = false; + break; + + case DCB_CAP_ATTR_DCBX: + *caps = pi->dcb.supported; + break; + + default: + *caps = false; + } + + return 0; +} + +/* Return the number of Traffic Classes for the indicated Traffic Class ID. + */ +static int cxgb4_getnumtcs(struct net_device *dev, int tcs_id, u8 *num) +{ + struct port_info *pi = netdev2pinfo(dev); + + switch (tcs_id) { + case DCB_NUMTCS_ATTR_PG: + if (pi->dcb.msgs & CXGB4_DCB_FW_PGRATE) + *num = pi->dcb.pg_num_tcs_supported; + else + *num = 0x8; + break; + + case DCB_NUMTCS_ATTR_PFC: + *num = 0x8; + break; + + default: + return -EINVAL; + } + + return 0; +} + +/* Set the number of Traffic Classes supported for the indicated Traffic Class + * ID. + */ +static int cxgb4_setnumtcs(struct net_device *dev, int tcs_id, u8 num) +{ + /* Setting the number of Traffic Classes isn't supported. + */ + return -ENOSYS; +} + +/* Return whether Priority Flow Control is enabled. */ +static u8 cxgb4_getpfcstate(struct net_device *dev) +{ + struct port_info *pi = netdev2pinfo(dev); + + if (pi->dcb.state != CXGB4_DCB_STATE_FW_ALLSYNCED) + return false; + + return pi->dcb.pfcen != 0; +} + +/* Enable/disable Priority Flow Control. */ +static void cxgb4_setpfcstate(struct net_device *dev, u8 state) +{ + /* We can't enable/disable Priority Flow Control but we also can't + * return an error ... + */ +} + +/* Return the Application User Priority Map associated with the specified + * Application ID. + */ +static int __cxgb4_getapp(struct net_device *dev, u8 app_idtype, u16 app_id, + int peer) +{ + struct port_info *pi = netdev2pinfo(dev); + struct adapter *adap = pi->adapter; + int i; + + if (pi->dcb.state != CXGB4_DCB_STATE_FW_ALLSYNCED) + return 0; + + for (i = 0; i < CXGB4_MAX_DCBX_APP_SUPPORTED; i++) { + struct fw_port_cmd pcmd; + int err; + + if (peer) + INIT_PORT_DCB_READ_PEER_CMD(pcmd, pi->port_id); + else + INIT_PORT_DCB_READ_LOCAL_CMD(pcmd, pi->port_id); + + pcmd.u.dcb.app_priority.type = FW_PORT_DCB_TYPE_APP_ID; + pcmd.u.dcb.app_priority.idx = i; + + err = t4_wr_mbox(adap, adap->mbox, &pcmd, sizeof(pcmd), &pcmd); + if (err != FW_PORT_DCB_CFG_SUCCESS) { + dev_err(adap->pdev_dev, "DCB APP read failed with %d\n", + -err); + return err; + } + if (be16_to_cpu(pcmd.u.dcb.app_priority.protocolid) == app_id) + if (pcmd.u.dcb.app_priority.sel_field == app_idtype) + return pcmd.u.dcb.app_priority.user_prio_map; + + /* exhausted app list */ + if (!pcmd.u.dcb.app_priority.protocolid) + break; + } + + return -EEXIST; +} + +/* Return the Application User Priority Map associated with the specified + * Application ID. + */ +static int cxgb4_getapp(struct net_device *dev, u8 app_idtype, u16 app_id) +{ + return __cxgb4_getapp(dev, app_idtype, app_id, 0); +} + +/* Write a new Application User Priority Map for the specified Application ID + */ +static int __cxgb4_setapp(struct net_device *dev, u8 app_idtype, u16 app_id, + u8 app_prio) +{ + struct fw_port_cmd pcmd; + struct port_info *pi = netdev2pinfo(dev); + struct adapter *adap = pi->adapter; + int i, err; + + + if (pi->dcb.state != CXGB4_DCB_STATE_FW_ALLSYNCED) + return -EINVAL; + + /* DCB info gets thrown away on link up */ + if (!netif_carrier_ok(dev)) + return -ENOLINK; + + for (i = 0; i < CXGB4_MAX_DCBX_APP_SUPPORTED; i++) { + INIT_PORT_DCB_READ_LOCAL_CMD(pcmd, pi->port_id); + pcmd.u.dcb.app_priority.type = FW_PORT_DCB_TYPE_APP_ID; + pcmd.u.dcb.app_priority.idx = i; + err = t4_wr_mbox(adap, adap->mbox, &pcmd, sizeof(pcmd), &pcmd); + + if (err != FW_PORT_DCB_CFG_SUCCESS) { + dev_err(adap->pdev_dev, "DCB app table read failed with %d\n", + -err); + return err; + } + if (be16_to_cpu(pcmd.u.dcb.app_priority.protocolid) == app_id) { + /* overwrite existing app table */ + pcmd.u.dcb.app_priority.protocolid = 0; + break; + } + /* find first empty slot */ + if (!pcmd.u.dcb.app_priority.protocolid) + break; + } + + if (i == CXGB4_MAX_DCBX_APP_SUPPORTED) { + /* no empty slots available */ + dev_err(adap->pdev_dev, "DCB app table full\n"); + return -EBUSY; + } + + /* write out new app table entry */ + INIT_PORT_DCB_WRITE_CMD(pcmd, pi->port_id); + if (pi->dcb.state == CXGB4_DCB_STATE_HOST) + pcmd.op_to_portid |= cpu_to_be32(FW_PORT_CMD_APPLY); + + pcmd.u.dcb.app_priority.type = FW_PORT_DCB_TYPE_APP_ID; + pcmd.u.dcb.app_priority.protocolid = cpu_to_be16(app_id); + pcmd.u.dcb.app_priority.sel_field = app_idtype; + pcmd.u.dcb.app_priority.user_prio_map = app_prio; + pcmd.u.dcb.app_priority.idx = i; + + err = t4_wr_mbox(adap, adap->mbox, &pcmd, sizeof(pcmd), &pcmd); + if (err != FW_PORT_DCB_CFG_SUCCESS) { + dev_err(adap->pdev_dev, "DCB app table write failed with %d\n", + -err); + return err; + } + + return 0; +} + +/* Priority for CEE inside dcb_app is bitmask, with 0 being an invalid value */ +static int cxgb4_setapp(struct net_device *dev, u8 app_idtype, u16 app_id, + u8 app_prio) +{ + int ret; + struct dcb_app app = { + .selector = app_idtype, + .protocol = app_id, + .priority = app_prio, + }; + + if (app_idtype != DCB_APP_IDTYPE_ETHTYPE && + app_idtype != DCB_APP_IDTYPE_PORTNUM) + return -EINVAL; + + /* Convert app_idtype to a format that firmware understands */ + ret = __cxgb4_setapp(dev, app_idtype == DCB_APP_IDTYPE_ETHTYPE ? + app_idtype : 3, app_id, app_prio); + if (ret) + return ret; + + return dcb_setapp(dev, &app); +} + +/* Return whether IEEE Data Center Bridging has been negotiated. + */ +static inline int +cxgb4_ieee_negotiation_complete(struct net_device *dev, + enum cxgb4_dcb_fw_msgs dcb_subtype) +{ + struct port_info *pi = netdev2pinfo(dev); + struct port_dcb_info *dcb = &pi->dcb; + + if (dcb_subtype && !(dcb->msgs & dcb_subtype)) + return 0; + + return (dcb->state == CXGB4_DCB_STATE_FW_ALLSYNCED && + (dcb->supported & DCB_CAP_DCBX_VER_IEEE)); +} + +/* Fill in the Application User Priority Map associated with the + * specified Application. + * Priority for IEEE dcb_app is an integer, with 0 being a valid value + */ +static int cxgb4_ieee_getapp(struct net_device *dev, struct dcb_app *app) +{ + int prio; + + if (!cxgb4_ieee_negotiation_complete(dev, CXGB4_DCB_FW_APP_ID)) + return -EINVAL; + if (!(app->selector && app->protocol)) + return -EINVAL; + + /* Try querying firmware first, use firmware format */ + prio = __cxgb4_getapp(dev, app->selector - 1, app->protocol, 0); + + if (prio < 0) + prio = dcb_ieee_getapp_mask(dev, app); + + app->priority = ffs(prio) - 1; + return 0; +} + +/* Write a new Application User Priority Map for the specified Application ID. + * Priority for IEEE dcb_app is an integer, with 0 being a valid value + */ +static int cxgb4_ieee_setapp(struct net_device *dev, struct dcb_app *app) +{ + int ret; + + if (!cxgb4_ieee_negotiation_complete(dev, CXGB4_DCB_FW_APP_ID)) + return -EINVAL; + if (!(app->selector && app->protocol)) + return -EINVAL; + + if (!(app->selector > IEEE_8021QAZ_APP_SEL_ETHERTYPE && + app->selector < IEEE_8021QAZ_APP_SEL_ANY)) + return -EINVAL; + + /* change selector to a format that firmware understands */ + ret = __cxgb4_setapp(dev, app->selector - 1, app->protocol, + (1 << app->priority)); + if (ret) + return ret; + + return dcb_ieee_setapp(dev, app); +} + +/* Return our DCBX parameters. + */ +static u8 cxgb4_getdcbx(struct net_device *dev) +{ + struct port_info *pi = netdev2pinfo(dev); + + /* This is already set by cxgb4_set_dcb_caps, so just return it */ + return pi->dcb.supported; +} + +/* Set our DCBX parameters. + */ +static u8 cxgb4_setdcbx(struct net_device *dev, u8 dcb_request) +{ + struct port_info *pi = netdev2pinfo(dev); + + /* Filter out requests which exceed our capabilities. + */ + if ((dcb_request & (CXGB4_DCBX_FW_SUPPORT | CXGB4_DCBX_HOST_SUPPORT)) + != dcb_request) + return 1; + + /* Can't enable DCB if we haven't successfully negotiated it. + */ + if (pi->dcb.state != CXGB4_DCB_STATE_FW_ALLSYNCED) + return 1; + + /* There's currently no mechanism to allow for the firmware DCBX + * negotiation to be changed from the Host Driver. If the caller + * requests exactly the same parameters that we already have then + * we'll allow them to be successfully "set" ... + */ + if (dcb_request != pi->dcb.supported) + return 1; + + pi->dcb.supported = dcb_request; + return 0; +} + +static int cxgb4_getpeer_app(struct net_device *dev, + struct dcb_peer_app_info *info, u16 *app_count) +{ + struct fw_port_cmd pcmd; + struct port_info *pi = netdev2pinfo(dev); + struct adapter *adap = pi->adapter; + int i, err = 0; + + if (pi->dcb.state != CXGB4_DCB_STATE_FW_ALLSYNCED) + return 1; + + info->willing = 0; + info->error = 0; + + *app_count = 0; + for (i = 0; i < CXGB4_MAX_DCBX_APP_SUPPORTED; i++) { + INIT_PORT_DCB_READ_PEER_CMD(pcmd, pi->port_id); + pcmd.u.dcb.app_priority.type = FW_PORT_DCB_TYPE_APP_ID; + pcmd.u.dcb.app_priority.idx = *app_count; + err = t4_wr_mbox(adap, adap->mbox, &pcmd, sizeof(pcmd), &pcmd); + + if (err != FW_PORT_DCB_CFG_SUCCESS) { + dev_err(adap->pdev_dev, "DCB app table read failed with %d\n", + -err); + return err; + } + + /* find first empty slot */ + if (!pcmd.u.dcb.app_priority.protocolid) + break; + } + *app_count = i; + return err; +} + +static int cxgb4_getpeerapp_tbl(struct net_device *dev, struct dcb_app *table) +{ + struct fw_port_cmd pcmd; + struct port_info *pi = netdev2pinfo(dev); + struct adapter *adap = pi->adapter; + int i, err = 0; + + if (pi->dcb.state != CXGB4_DCB_STATE_FW_ALLSYNCED) + return 1; + + for (i = 0; i < CXGB4_MAX_DCBX_APP_SUPPORTED; i++) { + INIT_PORT_DCB_READ_PEER_CMD(pcmd, pi->port_id); + pcmd.u.dcb.app_priority.type = FW_PORT_DCB_TYPE_APP_ID; + pcmd.u.dcb.app_priority.idx = i; + err = t4_wr_mbox(adap, adap->mbox, &pcmd, sizeof(pcmd), &pcmd); + + if (err != FW_PORT_DCB_CFG_SUCCESS) { + dev_err(adap->pdev_dev, "DCB app table read failed with %d\n", + -err); + return err; + } + + /* find first empty slot */ + if (!pcmd.u.dcb.app_priority.protocolid) + break; + + table[i].selector = pcmd.u.dcb.app_priority.sel_field; + table[i].protocol = + be16_to_cpu(pcmd.u.dcb.app_priority.protocolid); + table[i].priority = + ffs(pcmd.u.dcb.app_priority.user_prio_map) - 1; + } + return err; +} + +/* Return Priority Group information. + */ +static int cxgb4_cee_peer_getpg(struct net_device *dev, struct cee_pg *pg) +{ + struct fw_port_cmd pcmd; + struct port_info *pi = netdev2pinfo(dev); + struct adapter *adap = pi->adapter; + u32 pgid; + int i, err; + + /* We're always "willing" -- the Switch Fabric always dictates the + * DCBX parameters to us. + */ + pg->willing = true; + + INIT_PORT_DCB_READ_PEER_CMD(pcmd, pi->port_id); + pcmd.u.dcb.pgid.type = FW_PORT_DCB_TYPE_PGID; + err = t4_wr_mbox(adap, adap->mbox, &pcmd, sizeof(pcmd), &pcmd); + if (err != FW_PORT_DCB_CFG_SUCCESS) { + dev_err(adap->pdev_dev, "DCB read PGID failed with %d\n", -err); + return err; + } + pgid = be32_to_cpu(pcmd.u.dcb.pgid.pgid); + + for (i = 0; i < CXGB4_MAX_PRIORITY; i++) + pg->prio_pg[7 - i] = (pgid >> (i * 4)) & 0xF; + + INIT_PORT_DCB_READ_PEER_CMD(pcmd, pi->port_id); + pcmd.u.dcb.pgrate.type = FW_PORT_DCB_TYPE_PGRATE; + err = t4_wr_mbox(adap, adap->mbox, &pcmd, sizeof(pcmd), &pcmd); + if (err != FW_PORT_DCB_CFG_SUCCESS) { + dev_err(adap->pdev_dev, "DCB read PGRATE failed with %d\n", + -err); + return err; + } + + for (i = 0; i < CXGB4_MAX_PRIORITY; i++) + pg->pg_bw[i] = pcmd.u.dcb.pgrate.pgrate[i]; + + return 0; +} + +/* Return Priority Flow Control information. + */ +static int cxgb4_cee_peer_getpfc(struct net_device *dev, struct cee_pfc *pfc) +{ + struct port_info *pi = netdev2pinfo(dev); + + cxgb4_getnumtcs(dev, DCB_NUMTCS_ATTR_PFC, &(pfc->tcs_supported)); + pfc->pfc_en = pi->dcb.pfcen; + + return 0; +} + +const struct dcbnl_rtnl_ops cxgb4_dcb_ops = { + .ieee_getapp = cxgb4_ieee_getapp, + .ieee_setapp = cxgb4_ieee_setapp, + + /* CEE std */ + .getstate = cxgb4_getstate, + .setstate = cxgb4_setstate, + .getpgtccfgtx = cxgb4_getpgtccfg_tx, + .getpgbwgcfgtx = cxgb4_getpgbwgcfg_tx, + .getpgtccfgrx = cxgb4_getpgtccfg_rx, + .getpgbwgcfgrx = cxgb4_getpgbwgcfg_rx, + .setpgtccfgtx = cxgb4_setpgtccfg_tx, + .setpgbwgcfgtx = cxgb4_setpgbwgcfg_tx, + .setpfccfg = cxgb4_setpfccfg, + .getpfccfg = cxgb4_getpfccfg, + .setall = cxgb4_setall, + .getcap = cxgb4_getcap, + .getnumtcs = cxgb4_getnumtcs, + .setnumtcs = cxgb4_setnumtcs, + .getpfcstate = cxgb4_getpfcstate, + .setpfcstate = cxgb4_setpfcstate, + .getapp = cxgb4_getapp, + .setapp = cxgb4_setapp, + + /* DCBX configuration */ + .getdcbx = cxgb4_getdcbx, + .setdcbx = cxgb4_setdcbx, + + /* peer apps */ + .peer_getappinfo = cxgb4_getpeer_app, + .peer_getapptable = cxgb4_getpeerapp_tbl, + + /* CEE peer */ + .cee_peer_getpg = cxgb4_cee_peer_getpg, + .cee_peer_getpfc = cxgb4_cee_peer_getpfc, +}; diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_dcb.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_dcb.h new file mode 100644 index 0000000..2a6aa88 --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_dcb.h @@ -0,0 +1,151 @@ +/* + * Copyright (C) 2013-2014 Chelsio Communications. All rights reserved. + * + * Written by Anish Bhatt (anish@chelsio.com) + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * The full GNU General Public License is included in this distribution in + * the file called "COPYING". + * + */ + +#ifndef __CXGB4_DCB_H +#define __CXGB4_DCB_H + +#include +#include +#include + +#ifdef CONFIG_CHELSIO_T4_DCB + +#define CXGB4_DCBX_FW_SUPPORT \ + (DCB_CAP_DCBX_VER_CEE | \ + DCB_CAP_DCBX_VER_IEEE | \ + DCB_CAP_DCBX_LLD_MANAGED) +#define CXGB4_DCBX_HOST_SUPPORT \ + (DCB_CAP_DCBX_VER_CEE | \ + DCB_CAP_DCBX_VER_IEEE | \ + DCB_CAP_DCBX_HOST) + +#define CXGB4_MAX_PRIORITY CXGB4_MAX_DCBX_APP_SUPPORTED +#define CXGB4_MAX_TCS CXGB4_MAX_DCBX_APP_SUPPORTED + +#define INIT_PORT_DCB_CMD(__pcmd, __port, __op, __action) \ + do { \ + memset(&(__pcmd), 0, sizeof(__pcmd)); \ + (__pcmd).op_to_portid = \ + cpu_to_be32(FW_CMD_OP(FW_PORT_CMD) | \ + FW_CMD_REQUEST | \ + FW_CMD_##__op | \ + FW_PORT_CMD_PORTID(__port)); \ + (__pcmd).action_to_len16 = \ + cpu_to_be32(FW_PORT_CMD_ACTION(__action) | \ + FW_LEN16(pcmd)); \ + } while (0) + +#define INIT_PORT_DCB_READ_PEER_CMD(__pcmd, __port) \ + INIT_PORT_DCB_CMD(__pcmd, __port, READ, FW_PORT_ACTION_DCB_READ_RECV) + +#define INIT_PORT_DCB_READ_LOCAL_CMD(__pcmd, __port) \ + INIT_PORT_DCB_CMD(__pcmd, __port, READ, FW_PORT_ACTION_DCB_READ_TRANS) + +#define INIT_PORT_DCB_READ_SYNC_CMD(__pcmd, __port) \ + INIT_PORT_DCB_CMD(__pcmd, __port, READ, FW_PORT_ACTION_DCB_READ_DET) + +#define INIT_PORT_DCB_WRITE_CMD(__pcmd, __port) \ + INIT_PORT_DCB_CMD(__pcmd, __port, EXEC, FW_PORT_ACTION_L2_DCB_CFG) + +#define IEEE_FAUX_SYNC(__dev, __dcb) \ + do { \ + if ((__dcb)->dcb_version == FW_PORT_DCB_VER_IEEE) \ + cxgb4_dcb_state_fsm((__dev), \ + CXGB4_DCB_STATE_FW_ALLSYNCED); \ + } while (0) + +/* States we can be in for a port's Data Center Bridging. + */ +enum cxgb4_dcb_state { + CXGB4_DCB_STATE_START, /* initial unknown state */ + CXGB4_DCB_STATE_HOST, /* we're using Host DCB (if at all) */ + CXGB4_DCB_STATE_FW_INCOMPLETE, /* using firmware DCB, incomplete */ + CXGB4_DCB_STATE_FW_ALLSYNCED, /* using firmware DCB, all sync'ed */ +}; + +/* Data Center Bridging state input for the Finite State Machine. + */ +enum cxgb4_dcb_state_input { + /* Input from the firmware. + */ + CXGB4_DCB_INPUT_FW_DISABLED, /* firmware DCB disabled */ + CXGB4_DCB_INPUT_FW_ENABLED, /* firmware DCB enabled */ + CXGB4_DCB_INPUT_FW_INCOMPLETE, /* firmware reports incomplete DCB */ + CXGB4_DCB_INPUT_FW_ALLSYNCED, /* firmware reports all sync'ed */ + +}; + +/* Firmware DCB messages that we've received so far ... + */ +enum cxgb4_dcb_fw_msgs { + CXGB4_DCB_FW_PGID = 0x01, + CXGB4_DCB_FW_PGRATE = 0x02, + CXGB4_DCB_FW_PRIORATE = 0x04, + CXGB4_DCB_FW_PFC = 0x08, + CXGB4_DCB_FW_APP_ID = 0x10, +}; + +#define CXGB4_MAX_DCBX_APP_SUPPORTED 8 + +/* Data Center Bridging support; + */ +struct port_dcb_info { + enum cxgb4_dcb_state state; /* DCB State Machine */ + enum cxgb4_dcb_fw_msgs msgs; /* DCB Firmware messages received */ + unsigned int supported; /* OS DCB capabilities supported */ + bool enabled; /* OS Enabled state */ + + /* Cached copies of DCB information sent by the firmware (in Host + * Native Endian format). + */ + u32 pgid; /* Priority Group[0..7] */ + u8 dcb_version; /* Running DCBx version */ + u8 pfcen; /* Priority Flow Control[0..7] */ + u8 pg_num_tcs_supported; /* max PG Traffic Classes */ + u8 pfc_num_tcs_supported; /* max PFC Traffic Classes */ + u8 pgrate[8]; /* Priority Group Rate[0..7] */ + u8 priorate[8]; /* Priority Rate[0..7] */ + u8 tsa[8]; /* TSA Algorithm[0..7] */ + struct app_priority { /* Application Information */ + u8 user_prio_map; /* Priority Map bitfield */ + u8 sel_field; /* Protocol ID interpretation */ + u16 protocolid; /* Protocol ID */ + } app_priority[CXGB4_MAX_DCBX_APP_SUPPORTED]; +}; + +void cxgb4_dcb_state_init(struct net_device *); +void cxgb4_dcb_version_init(struct net_device *); +void cxgb4_dcb_state_fsm(struct net_device *, enum cxgb4_dcb_state_input); +void cxgb4_dcb_handle_fw_update(struct adapter *, const struct fw_port_cmd *); +void cxgb4_dcb_set_caps(struct adapter *, const struct fw_port_cmd *); +extern const struct dcbnl_rtnl_ops cxgb4_dcb_ops; + +#define CXGB4_DCB_ENABLED true + +#else /* !CONFIG_CHELSIO_T4_DCB */ + +static inline void cxgb4_dcb_state_init(struct net_device *dev) +{ +} + +#define CXGB4_DCB_ENABLED false + +#endif /* !CONFIG_CHELSIO_T4_DCB */ + +#endif /* __CXGB4_DCB_H */ diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c new file mode 100644 index 0000000..279873c --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c @@ -0,0 +1,6902 @@ +/* + * This file is part of the Chelsio T4 Ethernet driver for Linux. + * + * Copyright (c) 2003-2014 Chelsio Communications, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cxgb4.h" +#include "t4_regs.h" +#include "t4_msg.h" +#include "t4fw_api.h" +#include "cxgb4_dcb.h" +#include "l2t.h" + +#include <../drivers/net/bonding/bonding.h> + +#ifdef DRV_VERSION +#undef DRV_VERSION +#endif +#define DRV_VERSION "2.0.0-ko" +#define DRV_DESC "Chelsio T4/T5 Network Driver" + +/* + * Max interrupt hold-off timer value in us. Queues fall back to this value + * under extreme memory pressure so it's largish to give the system time to + * recover. + */ +#define MAX_SGE_TIMERVAL 200U + +enum { + /* + * Physical Function provisioning constants. + */ + PFRES_NVI = 4, /* # of Virtual Interfaces */ + PFRES_NETHCTRL = 128, /* # of EQs used for ETH or CTRL Qs */ + PFRES_NIQFLINT = 128, /* # of ingress Qs/w Free List(s)/intr + */ + PFRES_NEQ = 256, /* # of egress queues */ + PFRES_NIQ = 0, /* # of ingress queues */ + PFRES_TC = 0, /* PCI-E traffic class */ + PFRES_NEXACTF = 128, /* # of exact MPS filters */ + + PFRES_R_CAPS = FW_CMD_CAP_PF, + PFRES_WX_CAPS = FW_CMD_CAP_PF, + +#ifdef CONFIG_PCI_IOV + /* + * Virtual Function provisioning constants. We need two extra Ingress + * Queues with Interrupt capability to serve as the VF's Firmware + * Event Queue and Forwarded Interrupt Queue (when using MSI mode) -- + * neither will have Free Lists associated with them). For each + * Ethernet/Control Egress Queue and for each Free List, we need an + * Egress Context. + */ + VFRES_NPORTS = 1, /* # of "ports" per VF */ + VFRES_NQSETS = 2, /* # of "Queue Sets" per VF */ + + VFRES_NVI = VFRES_NPORTS, /* # of Virtual Interfaces */ + VFRES_NETHCTRL = VFRES_NQSETS, /* # of EQs used for ETH or CTRL Qs */ + VFRES_NIQFLINT = VFRES_NQSETS+2,/* # of ingress Qs/w Free List(s)/intr */ + VFRES_NEQ = VFRES_NQSETS*2, /* # of egress queues */ + VFRES_NIQ = 0, /* # of non-fl/int ingress queues */ + VFRES_TC = 0, /* PCI-E traffic class */ + VFRES_NEXACTF = 16, /* # of exact MPS filters */ + + VFRES_R_CAPS = FW_CMD_CAP_DMAQ|FW_CMD_CAP_VF|FW_CMD_CAP_PORT, + VFRES_WX_CAPS = FW_CMD_CAP_DMAQ|FW_CMD_CAP_VF, +#endif +}; + +/* + * Provide a Port Access Rights Mask for the specified PF/VF. This is very + * static and likely not to be useful in the long run. We really need to + * implement some form of persistent configuration which the firmware + * controls. + */ +static unsigned int pfvfres_pmask(struct adapter *adapter, + unsigned int pf, unsigned int vf) +{ + unsigned int portn, portvec; + + /* + * Give PF's access to all of the ports. + */ + if (vf == 0) + return FW_PFVF_CMD_PMASK_MASK; + + /* + * For VFs, we'll assign them access to the ports based purely on the + * PF. We assign active ports in order, wrapping around if there are + * fewer active ports than PFs: e.g. active port[pf % nports]. + * Unfortunately the adapter's port_info structs haven't been + * initialized yet so we have to compute this. + */ + if (adapter->params.nports == 0) + return 0; + + portn = pf % adapter->params.nports; + portvec = adapter->params.portvec; + for (;;) { + /* + * Isolate the lowest set bit in the port vector. If we're at + * the port number that we want, return that as the pmask. + * otherwise mask that bit out of the port vector and + * decrement our port number ... + */ + unsigned int pmask = portvec ^ (portvec & (portvec-1)); + if (portn == 0) + return pmask; + portn--; + portvec &= ~pmask; + } + /*NOTREACHED*/ +} + +enum { + MAX_TXQ_ENTRIES = 16384, + MAX_CTRL_TXQ_ENTRIES = 1024, + MAX_RSPQ_ENTRIES = 16384, + MAX_RX_BUFFERS = 16384, + MIN_TXQ_ENTRIES = 32, + MIN_CTRL_TXQ_ENTRIES = 32, + MIN_RSPQ_ENTRIES = 128, + MIN_FL_ENTRIES = 16 +}; + +/* Host shadow copy of ingress filter entry. This is in host native format + * and doesn't match the ordering or bit order, etc. of the hardware of the + * firmware command. The use of bit-field structure elements is purely to + * remind ourselves of the field size limitations and save memory in the case + * where the filter table is large. + */ +struct filter_entry { + /* Administrative fields for filter. + */ + u32 valid:1; /* filter allocated and valid */ + u32 locked:1; /* filter is administratively locked */ + + u32 pending:1; /* filter action is pending firmware reply */ + u32 smtidx:8; /* Source MAC Table index for smac */ + struct l2t_entry *l2t; /* Layer Two Table entry for dmac */ + + /* The filter itself. Most of this is a straight copy of information + * provided by the extended ioctl(). Some fields are translated to + * internal forms -- for instance the Ingress Queue ID passed in from + * the ioctl() is translated into the Absolute Ingress Queue ID. + */ + struct ch_filter_specification fs; +}; + +#define DFLT_MSG_ENABLE (NETIF_MSG_DRV | NETIF_MSG_PROBE | NETIF_MSG_LINK | \ + NETIF_MSG_TIMER | NETIF_MSG_IFDOWN | NETIF_MSG_IFUP |\ + NETIF_MSG_RX_ERR | NETIF_MSG_TX_ERR) + +#define CH_DEVICE(devid, data) { PCI_VDEVICE(CHELSIO, devid), (data) } + +static const struct pci_device_id cxgb4_pci_tbl[] = { + CH_DEVICE(0xa000, 0), /* PE10K */ + CH_DEVICE(0x4001, -1), + CH_DEVICE(0x4002, -1), + CH_DEVICE(0x4003, -1), + CH_DEVICE(0x4004, -1), + CH_DEVICE(0x4005, -1), + CH_DEVICE(0x4006, -1), + CH_DEVICE(0x4007, -1), + CH_DEVICE(0x4008, -1), + CH_DEVICE(0x4009, -1), + CH_DEVICE(0x400a, -1), + CH_DEVICE(0x400d, -1), + CH_DEVICE(0x400e, -1), + CH_DEVICE(0x4080, -1), + CH_DEVICE(0x4081, -1), + CH_DEVICE(0x4082, -1), + CH_DEVICE(0x4083, -1), + CH_DEVICE(0x4084, -1), + CH_DEVICE(0x4085, -1), + CH_DEVICE(0x4086, -1), + CH_DEVICE(0x4087, -1), + CH_DEVICE(0x4088, -1), + CH_DEVICE(0x4401, 4), + CH_DEVICE(0x4402, 4), + CH_DEVICE(0x4403, 4), + CH_DEVICE(0x4404, 4), + CH_DEVICE(0x4405, 4), + CH_DEVICE(0x4406, 4), + CH_DEVICE(0x4407, 4), + CH_DEVICE(0x4408, 4), + CH_DEVICE(0x4409, 4), + CH_DEVICE(0x440a, 4), + CH_DEVICE(0x440d, 4), + CH_DEVICE(0x440e, 4), + CH_DEVICE(0x4480, 4), + CH_DEVICE(0x4481, 4), + CH_DEVICE(0x4482, 4), + CH_DEVICE(0x4483, 4), + CH_DEVICE(0x4484, 4), + CH_DEVICE(0x4485, 4), + CH_DEVICE(0x4486, 4), + CH_DEVICE(0x4487, 4), + CH_DEVICE(0x4488, 4), + CH_DEVICE(0x5001, 4), + CH_DEVICE(0x5002, 4), + CH_DEVICE(0x5003, 4), + CH_DEVICE(0x5004, 4), + CH_DEVICE(0x5005, 4), + CH_DEVICE(0x5006, 4), + CH_DEVICE(0x5007, 4), + CH_DEVICE(0x5008, 4), + CH_DEVICE(0x5009, 4), + CH_DEVICE(0x500A, 4), + CH_DEVICE(0x500B, 4), + CH_DEVICE(0x500C, 4), + CH_DEVICE(0x500D, 4), + CH_DEVICE(0x500E, 4), + CH_DEVICE(0x500F, 4), + CH_DEVICE(0x5010, 4), + CH_DEVICE(0x5011, 4), + CH_DEVICE(0x5012, 4), + CH_DEVICE(0x5013, 4), + CH_DEVICE(0x5014, 4), + CH_DEVICE(0x5015, 4), + CH_DEVICE(0x5080, 4), + CH_DEVICE(0x5081, 4), + CH_DEVICE(0x5082, 4), + CH_DEVICE(0x5083, 4), + CH_DEVICE(0x5084, 4), + CH_DEVICE(0x5085, 4), + CH_DEVICE(0x5086, 4), + CH_DEVICE(0x5087, 4), + CH_DEVICE(0x5088, 4), + CH_DEVICE(0x5401, 4), + CH_DEVICE(0x5402, 4), + CH_DEVICE(0x5403, 4), + CH_DEVICE(0x5404, 4), + CH_DEVICE(0x5405, 4), + CH_DEVICE(0x5406, 4), + CH_DEVICE(0x5407, 4), + CH_DEVICE(0x5408, 4), + CH_DEVICE(0x5409, 4), + CH_DEVICE(0x540A, 4), + CH_DEVICE(0x540B, 4), + CH_DEVICE(0x540C, 4), + CH_DEVICE(0x540D, 4), + CH_DEVICE(0x540E, 4), + CH_DEVICE(0x540F, 4), + CH_DEVICE(0x5410, 4), + CH_DEVICE(0x5411, 4), + CH_DEVICE(0x5412, 4), + CH_DEVICE(0x5413, 4), + CH_DEVICE(0x5414, 4), + CH_DEVICE(0x5415, 4), + CH_DEVICE(0x5480, 4), + CH_DEVICE(0x5481, 4), + CH_DEVICE(0x5482, 4), + CH_DEVICE(0x5483, 4), + CH_DEVICE(0x5484, 4), + CH_DEVICE(0x5485, 4), + CH_DEVICE(0x5486, 4), + CH_DEVICE(0x5487, 4), + CH_DEVICE(0x5488, 4), + { 0, } +}; + +#define FW4_FNAME "cxgb4/t4fw.bin" +#define FW5_FNAME "cxgb4/t5fw.bin" +#define FW4_CFNAME "cxgb4/t4-config.txt" +#define FW5_CFNAME "cxgb4/t5-config.txt" + +MODULE_DESCRIPTION(DRV_DESC); +MODULE_AUTHOR("Chelsio Communications"); +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_VERSION(DRV_VERSION); +MODULE_DEVICE_TABLE(pci, cxgb4_pci_tbl); +MODULE_FIRMWARE(FW4_FNAME); +MODULE_FIRMWARE(FW5_FNAME); + +/* + * Normally we're willing to become the firmware's Master PF but will be happy + * if another PF has already become the Master and initialized the adapter. + * Setting "force_init" will cause this driver to forcibly establish itself as + * the Master PF and initialize the adapter. + */ +static uint force_init; + +module_param(force_init, uint, 0644); +MODULE_PARM_DESC(force_init, "Forcibly become Master PF and initialize adapter"); + +/* + * Normally if the firmware we connect to has Configuration File support, we + * use that and only fall back to the old Driver-based initialization if the + * Configuration File fails for some reason. If force_old_init is set, then + * we'll always use the old Driver-based initialization sequence. + */ +static uint force_old_init; + +module_param(force_old_init, uint, 0644); +MODULE_PARM_DESC(force_old_init, "Force old initialization sequence"); + +static int dflt_msg_enable = DFLT_MSG_ENABLE; + +module_param(dflt_msg_enable, int, 0644); +MODULE_PARM_DESC(dflt_msg_enable, "Chelsio T4 default message enable bitmap"); + +/* + * The driver uses the best interrupt scheme available on a platform in the + * order MSI-X, MSI, legacy INTx interrupts. This parameter determines which + * of these schemes the driver may consider as follows: + * + * msi = 2: choose from among all three options + * msi = 1: only consider MSI and INTx interrupts + * msi = 0: force INTx interrupts + */ +static int msi = 2; + +module_param(msi, int, 0644); +MODULE_PARM_DESC(msi, "whether to use INTx (0), MSI (1) or MSI-X (2)"); + +/* + * Queue interrupt hold-off timer values. Queues default to the first of these + * upon creation. + */ +static unsigned int intr_holdoff[SGE_NTIMERS - 1] = { 5, 10, 20, 50, 100 }; + +module_param_array(intr_holdoff, uint, NULL, 0644); +MODULE_PARM_DESC(intr_holdoff, "values for queue interrupt hold-off timers " + "0..4 in microseconds"); + +static unsigned int intr_cnt[SGE_NCOUNTERS - 1] = { 4, 8, 16 }; + +module_param_array(intr_cnt, uint, NULL, 0644); +MODULE_PARM_DESC(intr_cnt, + "thresholds 1..3 for queue interrupt packet counters"); + +/* + * Normally we tell the chip to deliver Ingress Packets into our DMA buffers + * offset by 2 bytes in order to have the IP headers line up on 4-byte + * boundaries. This is a requirement for many architectures which will throw + * a machine check fault if an attempt is made to access one of the 4-byte IP + * header fields on a non-4-byte boundary. And it's a major performance issue + * even on some architectures which allow it like some implementations of the + * x86 ISA. However, some architectures don't mind this and for some very + * edge-case performance sensitive applications (like forwarding large volumes + * of small packets), setting this DMA offset to 0 will decrease the number of + * PCI-E Bus transfers enough to measurably affect performance. + */ +static int rx_dma_offset = 2; + +static bool vf_acls; + +#ifdef CONFIG_PCI_IOV +module_param(vf_acls, bool, 0644); +MODULE_PARM_DESC(vf_acls, "if set enable virtualization L2 ACL enforcement"); + +/* Configure the number of PCI-E Virtual Function which are to be instantiated + * on SR-IOV Capable Physical Functions. + */ +static unsigned int num_vf[NUM_OF_PF_WITH_SRIOV]; + +module_param_array(num_vf, uint, NULL, 0644); +MODULE_PARM_DESC(num_vf, "number of VFs for each of PFs 0-3"); +#endif + +/* TX Queue select used to determine what algorithm to use for selecting TX + * queue. Select between the kernel provided function (select_queue=0) or user + * cxgb_select_queue function (select_queue=1) + * + * Default: select_queue=0 + */ +static int select_queue; +module_param(select_queue, int, 0644); +MODULE_PARM_DESC(select_queue, + "Select between kernel provided method of selecting or driver method of selecting TX queue. Default is kernel method."); + +/* + * The filter TCAM has a fixed portion and a variable portion. The fixed + * portion can match on source/destination IP IPv4/IPv6 addresses and TCP/UDP + * ports. The variable portion is 36 bits which can include things like Exact + * Match MAC Index (9 bits), Ether Type (16 bits), IP Protocol (8 bits), + * [Inner] VLAN Tag (17 bits), etc. which, if all were somehow selected, would + * far exceed the 36-bit budget for this "compressed" header portion of the + * filter. Thus, we have a scarce resource which must be carefully managed. + * + * By default we set this up to mostly match the set of filter matching + * capabilities of T3 but with accommodations for some of T4's more + * interesting features: + * + * { IP Fragment (1), MPS Match Type (3), IP Protocol (8), + * [Inner] VLAN (17), Port (3), FCoE (1) } + */ +enum { + TP_VLAN_PRI_MAP_DEFAULT = HW_TPL_FR_MT_PR_IV_P_FC, + TP_VLAN_PRI_MAP_FIRST = FCOE_SHIFT, + TP_VLAN_PRI_MAP_LAST = FRAGMENTATION_SHIFT, +}; + +static unsigned int tp_vlan_pri_map = TP_VLAN_PRI_MAP_DEFAULT; + +module_param(tp_vlan_pri_map, uint, 0644); +MODULE_PARM_DESC(tp_vlan_pri_map, "global compressed filter configuration"); + +static struct dentry *cxgb4_debugfs_root; + +static LIST_HEAD(adapter_list); +static DEFINE_MUTEX(uld_mutex); +/* Adapter list to be accessed from atomic context */ +static LIST_HEAD(adap_rcu_list); +static DEFINE_SPINLOCK(adap_rcu_lock); +static struct cxgb4_uld_info ulds[CXGB4_ULD_MAX]; +static const char *uld_str[] = { "RDMA", "iSCSI" }; + +static void link_report(struct net_device *dev) +{ + if (!netif_carrier_ok(dev)) + netdev_info(dev, "link down\n"); + else { + static const char *fc[] = { "no", "Rx", "Tx", "Tx/Rx" }; + + const char *s = "10Mbps"; + const struct port_info *p = netdev_priv(dev); + + switch (p->link_cfg.speed) { + case 10000: + s = "10Gbps"; + break; + case 1000: + s = "1000Mbps"; + break; + case 100: + s = "100Mbps"; + break; + case 40000: + s = "40Gbps"; + break; + } + + netdev_info(dev, "link up, %s, full-duplex, %s PAUSE\n", s, + fc[p->link_cfg.fc]); + } +} + +#ifdef CONFIG_CHELSIO_T4_DCB +/* Set up/tear down Data Center Bridging Priority mapping for a net device. */ +static void dcb_tx_queue_prio_enable(struct net_device *dev, int enable) +{ + struct port_info *pi = netdev_priv(dev); + struct adapter *adap = pi->adapter; + struct sge_eth_txq *txq = &adap->sge.ethtxq[pi->first_qset]; + int i; + + /* We use a simple mapping of Port TX Queue Index to DCB + * Priority when we're enabling DCB. + */ + for (i = 0; i < pi->nqsets; i++, txq++) { + u32 name, value; + int err; + + name = (FW_PARAMS_MNEM(FW_PARAMS_MNEM_DMAQ) | + FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DMAQ_EQ_DCBPRIO_ETH) | + FW_PARAMS_PARAM_YZ(txq->q.cntxt_id)); + value = enable ? i : 0xffffffff; + + /* Since we can be called while atomic (from "interrupt + * level") we need to issue the Set Parameters Commannd + * without sleeping (timeout < 0). + */ + err = t4_set_params_nosleep(adap, adap->mbox, adap->fn, 0, 1, + &name, &value); + + if (err) + dev_err(adap->pdev_dev, + "Can't %s DCB Priority on port %d, TX Queue %d: err=%d\n", + enable ? "set" : "unset", pi->port_id, i, -err); + else + txq->dcb_prio = value; + } +} +#endif /* CONFIG_CHELSIO_T4_DCB */ + +void t4_os_link_changed(struct adapter *adapter, int port_id, int link_stat) +{ + struct net_device *dev = adapter->port[port_id]; + + /* Skip changes from disabled ports. */ + if (netif_running(dev) && link_stat != netif_carrier_ok(dev)) { + if (link_stat) + netif_carrier_on(dev); + else { +#ifdef CONFIG_CHELSIO_T4_DCB + cxgb4_dcb_state_init(dev); + dcb_tx_queue_prio_enable(dev, false); +#endif /* CONFIG_CHELSIO_T4_DCB */ + netif_carrier_off(dev); + } + + link_report(dev); + } +} + +void t4_os_portmod_changed(const struct adapter *adap, int port_id) +{ + static const char *mod_str[] = { + NULL, "LR", "SR", "ER", "passive DA", "active DA", "LRM" + }; + + const struct net_device *dev = adap->port[port_id]; + const struct port_info *pi = netdev_priv(dev); + + if (pi->mod_type == FW_PORT_MOD_TYPE_NONE) + netdev_info(dev, "port module unplugged\n"); + else if (pi->mod_type < ARRAY_SIZE(mod_str)) + netdev_info(dev, "%s module inserted\n", mod_str[pi->mod_type]); +} + +/* + * Configure the exact and hash address filters to handle a port's multicast + * and secondary unicast MAC addresses. + */ +static int set_addr_filters(const struct net_device *dev, bool sleep) +{ + u64 mhash = 0; + u64 uhash = 0; + bool free = true; + u16 filt_idx[7]; + const u8 *addr[7]; + int ret, naddr = 0; + const struct netdev_hw_addr *ha; + int uc_cnt = netdev_uc_count(dev); + int mc_cnt = netdev_mc_count(dev); + const struct port_info *pi = netdev_priv(dev); + unsigned int mb = pi->adapter->fn; + + /* first do the secondary unicast addresses */ + netdev_for_each_uc_addr(ha, dev) { + addr[naddr++] = ha->addr; + if (--uc_cnt == 0 || naddr >= ARRAY_SIZE(addr)) { + ret = t4_alloc_mac_filt(pi->adapter, mb, pi->viid, free, + naddr, addr, filt_idx, &uhash, sleep); + if (ret < 0) + return ret; + + free = false; + naddr = 0; + } + } + + /* next set up the multicast addresses */ + netdev_for_each_mc_addr(ha, dev) { + addr[naddr++] = ha->addr; + if (--mc_cnt == 0 || naddr >= ARRAY_SIZE(addr)) { + ret = t4_alloc_mac_filt(pi->adapter, mb, pi->viid, free, + naddr, addr, filt_idx, &mhash, sleep); + if (ret < 0) + return ret; + + free = false; + naddr = 0; + } + } + + return t4_set_addr_hash(pi->adapter, mb, pi->viid, uhash != 0, + uhash | mhash, sleep); +} + +int dbfifo_int_thresh = 10; /* 10 == 640 entry threshold */ +module_param(dbfifo_int_thresh, int, 0644); +MODULE_PARM_DESC(dbfifo_int_thresh, "doorbell fifo interrupt threshold"); + +/* + * usecs to sleep while draining the dbfifo + */ +static int dbfifo_drain_delay = 1000; +module_param(dbfifo_drain_delay, int, 0644); +MODULE_PARM_DESC(dbfifo_drain_delay, + "usecs to sleep while draining the dbfifo"); + +/* + * Set Rx properties of a port, such as promiscruity, address filters, and MTU. + * If @mtu is -1 it is left unchanged. + */ +static int set_rxmode(struct net_device *dev, int mtu, bool sleep_ok) +{ + int ret; + struct port_info *pi = netdev_priv(dev); + + ret = set_addr_filters(dev, sleep_ok); + if (ret == 0) + ret = t4_set_rxmode(pi->adapter, pi->adapter->fn, pi->viid, mtu, + (dev->flags & IFF_PROMISC) ? 1 : 0, + (dev->flags & IFF_ALLMULTI) ? 1 : 0, 1, -1, + sleep_ok); + return ret; +} + +/** + * link_start - enable a port + * @dev: the port to enable + * + * Performs the MAC and PHY actions needed to enable a port. + */ +static int link_start(struct net_device *dev) +{ + int ret; + struct port_info *pi = netdev_priv(dev); + unsigned int mb = pi->adapter->fn; + + /* + * We do not set address filters and promiscuity here, the stack does + * that step explicitly. + */ + ret = t4_set_rxmode(pi->adapter, mb, pi->viid, dev->mtu, -1, -1, -1, + !!(dev->features & NETIF_F_HW_VLAN_CTAG_RX), true); + if (ret == 0) { + ret = t4_change_mac(pi->adapter, mb, pi->viid, + pi->xact_addr_filt, dev->dev_addr, true, + true); + if (ret >= 0) { + pi->xact_addr_filt = ret; + ret = 0; + } + } + if (ret == 0) + ret = t4_link_start(pi->adapter, mb, pi->tx_chan, + &pi->link_cfg); + if (ret == 0) { + local_bh_disable(); + ret = t4_enable_vi_params(pi->adapter, mb, pi->viid, true, + true, CXGB4_DCB_ENABLED); + local_bh_enable(); + } + + return ret; +} + +int cxgb4_dcb_enabled(const struct net_device *dev) +{ +#ifdef CONFIG_CHELSIO_T4_DCB + struct port_info *pi = netdev_priv(dev); + + if (!pi->dcb.enabled) + return 0; + + return ((pi->dcb.state == CXGB4_DCB_STATE_FW_ALLSYNCED) || + (pi->dcb.state == CXGB4_DCB_STATE_HOST)); +#else + return 0; +#endif +} +EXPORT_SYMBOL(cxgb4_dcb_enabled); + +#ifdef CONFIG_CHELSIO_T4_DCB +/* Handle a Data Center Bridging update message from the firmware. */ +static void dcb_rpl(struct adapter *adap, const struct fw_port_cmd *pcmd) +{ + int port = FW_PORT_CMD_PORTID_GET(ntohl(pcmd->op_to_portid)); + struct net_device *dev = adap->port[port]; + int old_dcb_enabled = cxgb4_dcb_enabled(dev); + int new_dcb_enabled; + + cxgb4_dcb_handle_fw_update(adap, pcmd); + new_dcb_enabled = cxgb4_dcb_enabled(dev); + + /* If the DCB has become enabled or disabled on the port then we're + * going to need to set up/tear down DCB Priority parameters for the + * TX Queues associated with the port. + */ + if (new_dcb_enabled != old_dcb_enabled) + dcb_tx_queue_prio_enable(dev, new_dcb_enabled); +} +#endif /* CONFIG_CHELSIO_T4_DCB */ + +/* Clear a filter and release any of its resources that we own. This also + * clears the filter's "pending" status. + */ +static void clear_filter(struct adapter *adap, struct filter_entry *f) +{ + /* If the new or old filter have loopback rewriteing rules then we'll + * need to free any existing Layer Two Table (L2T) entries of the old + * filter rule. The firmware will handle freeing up any Source MAC + * Table (SMT) entries used for rewriting Source MAC Addresses in + * loopback rules. + */ + if (f->l2t) + cxgb4_l2t_release(f->l2t); + + /* The zeroing of the filter rule below clears the filter valid, + * pending, locked flags, l2t pointer, etc. so it's all we need for + * this operation. + */ + memset(f, 0, sizeof(*f)); +} + +/* Handle a filter write/deletion reply. + */ +static void filter_rpl(struct adapter *adap, const struct cpl_set_tcb_rpl *rpl) +{ + unsigned int idx = GET_TID(rpl); + unsigned int nidx = idx - adap->tids.ftid_base; + unsigned int ret; + struct filter_entry *f; + + if (idx >= adap->tids.ftid_base && nidx < + (adap->tids.nftids + adap->tids.nsftids)) { + idx = nidx; + ret = GET_TCB_COOKIE(rpl->cookie); + f = &adap->tids.ftid_tab[idx]; + + if (ret == FW_FILTER_WR_FLT_DELETED) { + /* Clear the filter when we get confirmation from the + * hardware that the filter has been deleted. + */ + clear_filter(adap, f); + } else if (ret == FW_FILTER_WR_SMT_TBL_FULL) { + dev_err(adap->pdev_dev, "filter %u setup failed due to full SMT\n", + idx); + clear_filter(adap, f); + } else if (ret == FW_FILTER_WR_FLT_ADDED) { + f->smtidx = (be64_to_cpu(rpl->oldval) >> 24) & 0xff; + f->pending = 0; /* asynchronous setup completed */ + f->valid = 1; + } else { + /* Something went wrong. Issue a warning about the + * problem and clear everything out. + */ + dev_err(adap->pdev_dev, "filter %u setup failed with error %u\n", + idx, ret); + clear_filter(adap, f); + } + } +} + +/* Response queue handler for the FW event queue. + */ +static int fwevtq_handler(struct sge_rspq *q, const __be64 *rsp, + const struct pkt_gl *gl) +{ + u8 opcode = ((const struct rss_header *)rsp)->opcode; + + rsp++; /* skip RSS header */ + + /* FW can send EGR_UPDATEs encapsulated in a CPL_FW4_MSG. + */ + if (unlikely(opcode == CPL_FW4_MSG && + ((const struct cpl_fw4_msg *)rsp)->type == FW_TYPE_RSSCPL)) { + rsp++; + opcode = ((const struct rss_header *)rsp)->opcode; + rsp++; + if (opcode != CPL_SGE_EGR_UPDATE) { + dev_err(q->adap->pdev_dev, "unexpected FW4/CPL %#x on FW event queue\n" + , opcode); + goto out; + } + } + + if (likely(opcode == CPL_SGE_EGR_UPDATE)) { + const struct cpl_sge_egr_update *p = (void *)rsp; + unsigned int qid = EGR_QID(ntohl(p->opcode_qid)); + struct sge_txq *txq; + + txq = q->adap->sge.egr_map[qid - q->adap->sge.egr_start]; + txq->restarts++; + if ((u8 *)txq < (u8 *)q->adap->sge.ofldtxq) { + struct sge_eth_txq *eq; + + eq = container_of(txq, struct sge_eth_txq, q); + netif_tx_wake_queue(eq->txq); + } else { + struct sge_ofld_txq *oq; + + oq = container_of(txq, struct sge_ofld_txq, q); + tasklet_schedule(&oq->qresume_tsk); + } + } else if (opcode == CPL_FW6_MSG || opcode == CPL_FW4_MSG) { + const struct cpl_fw6_msg *p = (void *)rsp; + +#ifdef CONFIG_CHELSIO_T4_DCB + const struct fw_port_cmd *pcmd = (const void *)p->data; + unsigned int cmd = FW_CMD_OP_GET(ntohl(pcmd->op_to_portid)); + unsigned int action = + FW_PORT_CMD_ACTION_GET(ntohl(pcmd->action_to_len16)); + + if (cmd == FW_PORT_CMD && + action == FW_PORT_ACTION_GET_PORT_INFO) { + int port = FW_PORT_CMD_PORTID_GET( + be32_to_cpu(pcmd->op_to_portid)); + struct net_device *dev = q->adap->port[port]; + int state_input = ((pcmd->u.info.dcbxdis_pkd & + FW_PORT_CMD_DCBXDIS) + ? CXGB4_DCB_INPUT_FW_DISABLED + : CXGB4_DCB_INPUT_FW_ENABLED); + + cxgb4_dcb_state_fsm(dev, state_input); + } + + if (cmd == FW_PORT_CMD && + action == FW_PORT_ACTION_L2_DCB_CFG) + dcb_rpl(q->adap, pcmd); + else +#endif + if (p->type == 0) + t4_handle_fw_rpl(q->adap, p->data); + } else if (opcode == CPL_L2T_WRITE_RPL) { + const struct cpl_l2t_write_rpl *p = (void *)rsp; + + do_l2t_write_rpl(q->adap, p); + } else if (opcode == CPL_SET_TCB_RPL) { + const struct cpl_set_tcb_rpl *p = (void *)rsp; + + filter_rpl(q->adap, p); + } else + dev_err(q->adap->pdev_dev, + "unexpected CPL %#x on FW event queue\n", opcode); +out: + return 0; +} + +/** + * uldrx_handler - response queue handler for ULD queues + * @q: the response queue that received the packet + * @rsp: the response queue descriptor holding the offload message + * @gl: the gather list of packet fragments + * + * Deliver an ingress offload packet to a ULD. All processing is done by + * the ULD, we just maintain statistics. + */ +static int uldrx_handler(struct sge_rspq *q, const __be64 *rsp, + const struct pkt_gl *gl) +{ + struct sge_ofld_rxq *rxq = container_of(q, struct sge_ofld_rxq, rspq); + + /* FW can send CPLs encapsulated in a CPL_FW4_MSG. + */ + if (((const struct rss_header *)rsp)->opcode == CPL_FW4_MSG && + ((const struct cpl_fw4_msg *)(rsp + 1))->type == FW_TYPE_RSSCPL) + rsp += 2; + + if (ulds[q->uld].rx_handler(q->adap->uld_handle[q->uld], rsp, gl)) { + rxq->stats.nomem++; + return -1; + } + if (gl == NULL) + rxq->stats.imm++; + else if (gl == CXGB4_MSG_AN) + rxq->stats.an++; + else + rxq->stats.pkts++; + return 0; +} + +static void disable_msi(struct adapter *adapter) +{ + if (adapter->flags & USING_MSIX) { + pci_disable_msix(adapter->pdev); + adapter->flags &= ~USING_MSIX; + } else if (adapter->flags & USING_MSI) { + pci_disable_msi(adapter->pdev); + adapter->flags &= ~USING_MSI; + } +} + +/* + * Interrupt handler for non-data events used with MSI-X. + */ +static irqreturn_t t4_nondata_intr(int irq, void *cookie) +{ + struct adapter *adap = cookie; + + u32 v = t4_read_reg(adap, MYPF_REG(PL_PF_INT_CAUSE)); + if (v & PFSW) { + adap->swintr = 1; + t4_write_reg(adap, MYPF_REG(PL_PF_INT_CAUSE), v); + } + t4_slow_intr_handler(adap); + return IRQ_HANDLED; +} + +/* + * Name the MSI-X interrupts. + */ +static void name_msix_vecs(struct adapter *adap) +{ + int i, j, msi_idx = 2, n = sizeof(adap->msix_info[0].desc); + + /* non-data interrupts */ + snprintf(adap->msix_info[0].desc, n, "%s", adap->port[0]->name); + + /* FW events */ + snprintf(adap->msix_info[1].desc, n, "%s-FWeventq", + adap->port[0]->name); + + /* Ethernet queues */ + for_each_port(adap, j) { + struct net_device *d = adap->port[j]; + const struct port_info *pi = netdev_priv(d); + + for (i = 0; i < pi->nqsets; i++, msi_idx++) + snprintf(adap->msix_info[msi_idx].desc, n, "%s-Rx%d", + d->name, i); + } + + /* offload queues */ + for_each_ofldrxq(&adap->sge, i) + snprintf(adap->msix_info[msi_idx++].desc, n, "%s-ofld%d", + adap->port[0]->name, i); + + for_each_rdmarxq(&adap->sge, i) + snprintf(adap->msix_info[msi_idx++].desc, n, "%s-rdma%d", + adap->port[0]->name, i); + + for_each_rdmaciq(&adap->sge, i) + snprintf(adap->msix_info[msi_idx++].desc, n, "%s-rdma-ciq%d", + adap->port[0]->name, i); +} + +static int request_msix_queue_irqs(struct adapter *adap) +{ + struct sge *s = &adap->sge; + int err, ethqidx, ofldqidx = 0, rdmaqidx = 0, rdmaciqqidx = 0; + int msi_index = 2; + + err = request_irq(adap->msix_info[1].vec, t4_sge_intr_msix, 0, + adap->msix_info[1].desc, &s->fw_evtq); + if (err) + return err; + + for_each_ethrxq(s, ethqidx) { + err = request_irq(adap->msix_info[msi_index].vec, + t4_sge_intr_msix, 0, + adap->msix_info[msi_index].desc, + &s->ethrxq[ethqidx].rspq); + if (err) + goto unwind; + msi_index++; + } + for_each_ofldrxq(s, ofldqidx) { + err = request_irq(adap->msix_info[msi_index].vec, + t4_sge_intr_msix, 0, + adap->msix_info[msi_index].desc, + &s->ofldrxq[ofldqidx].rspq); + if (err) + goto unwind; + msi_index++; + } + for_each_rdmarxq(s, rdmaqidx) { + err = request_irq(adap->msix_info[msi_index].vec, + t4_sge_intr_msix, 0, + adap->msix_info[msi_index].desc, + &s->rdmarxq[rdmaqidx].rspq); + if (err) + goto unwind; + msi_index++; + } + for_each_rdmaciq(s, rdmaciqqidx) { + err = request_irq(adap->msix_info[msi_index].vec, + t4_sge_intr_msix, 0, + adap->msix_info[msi_index].desc, + &s->rdmaciq[rdmaciqqidx].rspq); + if (err) + goto unwind; + msi_index++; + } + return 0; + +unwind: + while (--rdmaciqqidx >= 0) + free_irq(adap->msix_info[--msi_index].vec, + &s->rdmaciq[rdmaciqqidx].rspq); + while (--rdmaqidx >= 0) + free_irq(adap->msix_info[--msi_index].vec, + &s->rdmarxq[rdmaqidx].rspq); + while (--ofldqidx >= 0) + free_irq(adap->msix_info[--msi_index].vec, + &s->ofldrxq[ofldqidx].rspq); + while (--ethqidx >= 0) + free_irq(adap->msix_info[--msi_index].vec, + &s->ethrxq[ethqidx].rspq); + free_irq(adap->msix_info[1].vec, &s->fw_evtq); + return err; +} + +static void free_msix_queue_irqs(struct adapter *adap) +{ + int i, msi_index = 2; + struct sge *s = &adap->sge; + + free_irq(adap->msix_info[1].vec, &s->fw_evtq); + for_each_ethrxq(s, i) + free_irq(adap->msix_info[msi_index++].vec, &s->ethrxq[i].rspq); + for_each_ofldrxq(s, i) + free_irq(adap->msix_info[msi_index++].vec, &s->ofldrxq[i].rspq); + for_each_rdmarxq(s, i) + free_irq(adap->msix_info[msi_index++].vec, &s->rdmarxq[i].rspq); + for_each_rdmaciq(s, i) + free_irq(adap->msix_info[msi_index++].vec, &s->rdmaciq[i].rspq); +} + +/** + * write_rss - write the RSS table for a given port + * @pi: the port + * @queues: array of queue indices for RSS + * + * Sets up the portion of the HW RSS table for the port's VI to distribute + * packets to the Rx queues in @queues. + */ +static int write_rss(const struct port_info *pi, const u16 *queues) +{ + u16 *rss; + int i, err; + const struct sge_eth_rxq *q = &pi->adapter->sge.ethrxq[pi->first_qset]; + + rss = kmalloc(pi->rss_size * sizeof(u16), GFP_KERNEL); + if (!rss) + return -ENOMEM; + + /* map the queue indices to queue ids */ + for (i = 0; i < pi->rss_size; i++, queues++) + rss[i] = q[*queues].rspq.abs_id; + + err = t4_config_rss_range(pi->adapter, pi->adapter->fn, pi->viid, 0, + pi->rss_size, rss, pi->rss_size); + kfree(rss); + return err; +} + +/** + * setup_rss - configure RSS + * @adap: the adapter + * + * Sets up RSS for each port. + */ +static int setup_rss(struct adapter *adap) +{ + int i, err; + + for_each_port(adap, i) { + const struct port_info *pi = adap2pinfo(adap, i); + + err = write_rss(pi, pi->rss); + if (err) + return err; + } + return 0; +} + +/* + * Return the channel of the ingress queue with the given qid. + */ +static unsigned int rxq_to_chan(const struct sge *p, unsigned int qid) +{ + qid -= p->ingr_start; + return netdev2pinfo(p->ingr_map[qid]->netdev)->tx_chan; +} + +/* + * Wait until all NAPI handlers are descheduled. + */ +static void quiesce_rx(struct adapter *adap) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(adap->sge.ingr_map); i++) { + struct sge_rspq *q = adap->sge.ingr_map[i]; + + if (q && q->handler) + napi_disable(&q->napi); + } +} + +/* + * Enable NAPI scheduling and interrupt generation for all Rx queues. + */ +static void enable_rx(struct adapter *adap) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(adap->sge.ingr_map); i++) { + struct sge_rspq *q = adap->sge.ingr_map[i]; + + if (!q) + continue; + if (q->handler) + napi_enable(&q->napi); + /* 0-increment GTS to start the timer and enable interrupts */ + t4_write_reg(adap, MYPF_REG(SGE_PF_GTS), + SEINTARM(q->intr_params) | + INGRESSQID(q->cntxt_id)); + } +} + +/** + * setup_sge_queues - configure SGE Tx/Rx/response queues + * @adap: the adapter + * + * Determines how many sets of SGE queues to use and initializes them. + * We support multiple queue sets per port if we have MSI-X, otherwise + * just one queue set per port. + */ +static int setup_sge_queues(struct adapter *adap) +{ + int err, msi_idx, i, j; + struct sge *s = &adap->sge; + + bitmap_zero(s->starving_fl, MAX_EGRQ); + bitmap_zero(s->txq_maperr, MAX_EGRQ); + + if (adap->flags & USING_MSIX) + msi_idx = 1; /* vector 0 is for non-queue interrupts */ + else { + err = t4_sge_alloc_rxq(adap, &s->intrq, false, adap->port[0], 0, + NULL, NULL); + if (err) + return err; + msi_idx = -((int)s->intrq.abs_id + 1); + } + + err = t4_sge_alloc_rxq(adap, &s->fw_evtq, true, adap->port[0], + msi_idx, NULL, fwevtq_handler); + if (err) { +freeout: t4_free_sge_resources(adap); + return err; + } + + for_each_port(adap, i) { + struct net_device *dev = adap->port[i]; + struct port_info *pi = netdev_priv(dev); + struct sge_eth_rxq *q = &s->ethrxq[pi->first_qset]; + struct sge_eth_txq *t = &s->ethtxq[pi->first_qset]; + + for (j = 0; j < pi->nqsets; j++, q++) { + if (msi_idx > 0) + msi_idx++; + err = t4_sge_alloc_rxq(adap, &q->rspq, false, dev, + msi_idx, &q->fl, + t4_ethrx_handler); + if (err) + goto freeout; + q->rspq.idx = j; + memset(&q->stats, 0, sizeof(q->stats)); + } + for (j = 0; j < pi->nqsets; j++, t++) { + err = t4_sge_alloc_eth_txq(adap, t, dev, + netdev_get_tx_queue(dev, j), + s->fw_evtq.cntxt_id); + if (err) + goto freeout; + } + } + + j = s->ofldqsets / adap->params.nports; /* ofld queues per channel */ + for_each_ofldrxq(s, i) { + struct sge_ofld_rxq *q = &s->ofldrxq[i]; + struct net_device *dev = adap->port[i / j]; + + if (msi_idx > 0) + msi_idx++; + err = t4_sge_alloc_rxq(adap, &q->rspq, false, dev, msi_idx, + q->fl.size ? &q->fl : NULL, + uldrx_handler); + if (err) + goto freeout; + memset(&q->stats, 0, sizeof(q->stats)); + s->ofld_rxq[i] = q->rspq.abs_id; + err = t4_sge_alloc_ofld_txq(adap, &s->ofldtxq[i], dev, + s->fw_evtq.cntxt_id); + if (err) + goto freeout; + } + + for_each_rdmarxq(s, i) { + struct sge_ofld_rxq *q = &s->rdmarxq[i]; + + if (msi_idx > 0) + msi_idx++; + err = t4_sge_alloc_rxq(adap, &q->rspq, false, adap->port[i], + msi_idx, q->fl.size ? &q->fl : NULL, + uldrx_handler); + if (err) + goto freeout; + memset(&q->stats, 0, sizeof(q->stats)); + s->rdma_rxq[i] = q->rspq.abs_id; + } + + for_each_rdmaciq(s, i) { + struct sge_ofld_rxq *q = &s->rdmaciq[i]; + + if (msi_idx > 0) + msi_idx++; + err = t4_sge_alloc_rxq(adap, &q->rspq, false, adap->port[i], + msi_idx, q->fl.size ? &q->fl : NULL, + uldrx_handler); + if (err) + goto freeout; + memset(&q->stats, 0, sizeof(q->stats)); + s->rdma_ciq[i] = q->rspq.abs_id; + } + + for_each_port(adap, i) { + /* + * Note that ->rdmarxq[i].rspq.cntxt_id below is 0 if we don't + * have RDMA queues, and that's the right value. + */ + err = t4_sge_alloc_ctrl_txq(adap, &s->ctrlq[i], adap->port[i], + s->fw_evtq.cntxt_id, + s->rdmarxq[i].rspq.cntxt_id); + if (err) + goto freeout; + } + + t4_write_reg(adap, is_t4(adap->params.chip) ? + MPS_TRC_RSS_CONTROL : + MPS_T5_TRC_RSS_CONTROL, + RSSCONTROL(netdev2pinfo(adap->port[0])->tx_chan) | + QUEUENUMBER(s->ethrxq[0].rspq.abs_id)); + return 0; +} + +/* + * Allocate a chunk of memory using kmalloc or, if that fails, vmalloc. + * The allocated memory is cleared. + */ +void *t4_alloc_mem(size_t size) +{ + void *p = kzalloc(size, GFP_KERNEL | __GFP_NOWARN); + + if (!p) + p = vzalloc(size); + return p; +} + +/* + * Free memory allocated through alloc_mem(). + */ +static void t4_free_mem(void *addr) +{ + if (is_vmalloc_addr(addr)) + vfree(addr); + else + kfree(addr); +} + +/* Send a Work Request to write the filter at a specified index. We construct + * a Firmware Filter Work Request to have the work done and put the indicated + * filter into "pending" mode which will prevent any further actions against + * it till we get a reply from the firmware on the completion status of the + * request. + */ +static int set_filter_wr(struct adapter *adapter, int fidx) +{ + struct filter_entry *f = &adapter->tids.ftid_tab[fidx]; + struct sk_buff *skb; + struct fw_filter_wr *fwr; + unsigned int ftid; + + /* If the new filter requires loopback Destination MAC and/or VLAN + * rewriting then we need to allocate a Layer 2 Table (L2T) entry for + * the filter. + */ + if (f->fs.newdmac || f->fs.newvlan) { + /* allocate L2T entry for new filter */ + f->l2t = t4_l2t_alloc_switching(adapter->l2t); + if (f->l2t == NULL) + return -EAGAIN; + if (t4_l2t_set_switching(adapter, f->l2t, f->fs.vlan, + f->fs.eport, f->fs.dmac)) { + cxgb4_l2t_release(f->l2t); + f->l2t = NULL; + return -ENOMEM; + } + } + + ftid = adapter->tids.ftid_base + fidx; + + skb = alloc_skb(sizeof(*fwr), GFP_KERNEL | __GFP_NOFAIL); + fwr = (struct fw_filter_wr *)__skb_put(skb, sizeof(*fwr)); + memset(fwr, 0, sizeof(*fwr)); + + /* It would be nice to put most of the following in t4_hw.c but most + * of the work is translating the cxgbtool ch_filter_specification + * into the Work Request and the definition of that structure is + * currently in cxgbtool.h which isn't appropriate to pull into the + * common code. We may eventually try to come up with a more neutral + * filter specification structure but for now it's easiest to simply + * put this fairly direct code in line ... + */ + fwr->op_pkd = htonl(FW_WR_OP(FW_FILTER_WR)); + fwr->len16_pkd = htonl(FW_WR_LEN16(sizeof(*fwr)/16)); + fwr->tid_to_iq = + htonl(V_FW_FILTER_WR_TID(ftid) | + V_FW_FILTER_WR_RQTYPE(f->fs.type) | + V_FW_FILTER_WR_NOREPLY(0) | + V_FW_FILTER_WR_IQ(f->fs.iq)); + fwr->del_filter_to_l2tix = + htonl(V_FW_FILTER_WR_RPTTID(f->fs.rpttid) | + V_FW_FILTER_WR_DROP(f->fs.action == FILTER_DROP) | + V_FW_FILTER_WR_DIRSTEER(f->fs.dirsteer) | + V_FW_FILTER_WR_MASKHASH(f->fs.maskhash) | + V_FW_FILTER_WR_DIRSTEERHASH(f->fs.dirsteerhash) | + V_FW_FILTER_WR_LPBK(f->fs.action == FILTER_SWITCH) | + V_FW_FILTER_WR_DMAC(f->fs.newdmac) | + V_FW_FILTER_WR_SMAC(f->fs.newsmac) | + V_FW_FILTER_WR_INSVLAN(f->fs.newvlan == VLAN_INSERT || + f->fs.newvlan == VLAN_REWRITE) | + V_FW_FILTER_WR_RMVLAN(f->fs.newvlan == VLAN_REMOVE || + f->fs.newvlan == VLAN_REWRITE) | + V_FW_FILTER_WR_HITCNTS(f->fs.hitcnts) | + V_FW_FILTER_WR_TXCHAN(f->fs.eport) | + V_FW_FILTER_WR_PRIO(f->fs.prio) | + V_FW_FILTER_WR_L2TIX(f->l2t ? f->l2t->idx : 0)); + fwr->ethtype = htons(f->fs.val.ethtype); + fwr->ethtypem = htons(f->fs.mask.ethtype); + fwr->frag_to_ovlan_vldm = + (V_FW_FILTER_WR_FRAG(f->fs.val.frag) | + V_FW_FILTER_WR_FRAGM(f->fs.mask.frag) | + V_FW_FILTER_WR_IVLAN_VLD(f->fs.val.ivlan_vld) | + V_FW_FILTER_WR_OVLAN_VLD(f->fs.val.ovlan_vld) | + V_FW_FILTER_WR_IVLAN_VLDM(f->fs.mask.ivlan_vld) | + V_FW_FILTER_WR_OVLAN_VLDM(f->fs.mask.ovlan_vld)); + fwr->smac_sel = 0; + fwr->rx_chan_rx_rpl_iq = + htons(V_FW_FILTER_WR_RX_CHAN(0) | + V_FW_FILTER_WR_RX_RPL_IQ(adapter->sge.fw_evtq.abs_id)); + fwr->maci_to_matchtypem = + htonl(V_FW_FILTER_WR_MACI(f->fs.val.macidx) | + V_FW_FILTER_WR_MACIM(f->fs.mask.macidx) | + V_FW_FILTER_WR_FCOE(f->fs.val.fcoe) | + V_FW_FILTER_WR_FCOEM(f->fs.mask.fcoe) | + V_FW_FILTER_WR_PORT(f->fs.val.iport) | + V_FW_FILTER_WR_PORTM(f->fs.mask.iport) | + V_FW_FILTER_WR_MATCHTYPE(f->fs.val.matchtype) | + V_FW_FILTER_WR_MATCHTYPEM(f->fs.mask.matchtype)); + fwr->ptcl = f->fs.val.proto; + fwr->ptclm = f->fs.mask.proto; + fwr->ttyp = f->fs.val.tos; + fwr->ttypm = f->fs.mask.tos; + fwr->ivlan = htons(f->fs.val.ivlan); + fwr->ivlanm = htons(f->fs.mask.ivlan); + fwr->ovlan = htons(f->fs.val.ovlan); + fwr->ovlanm = htons(f->fs.mask.ovlan); + memcpy(fwr->lip, f->fs.val.lip, sizeof(fwr->lip)); + memcpy(fwr->lipm, f->fs.mask.lip, sizeof(fwr->lipm)); + memcpy(fwr->fip, f->fs.val.fip, sizeof(fwr->fip)); + memcpy(fwr->fipm, f->fs.mask.fip, sizeof(fwr->fipm)); + fwr->lp = htons(f->fs.val.lport); + fwr->lpm = htons(f->fs.mask.lport); + fwr->fp = htons(f->fs.val.fport); + fwr->fpm = htons(f->fs.mask.fport); + if (f->fs.newsmac) + memcpy(fwr->sma, f->fs.smac, sizeof(fwr->sma)); + + /* Mark the filter as "pending" and ship off the Filter Work Request. + * When we get the Work Request Reply we'll clear the pending status. + */ + f->pending = 1; + set_wr_txq(skb, CPL_PRIORITY_CONTROL, f->fs.val.iport & 0x3); + t4_ofld_send(adapter, skb); + return 0; +} + +/* Delete the filter at a specified index. + */ +static int del_filter_wr(struct adapter *adapter, int fidx) +{ + struct filter_entry *f = &adapter->tids.ftid_tab[fidx]; + struct sk_buff *skb; + struct fw_filter_wr *fwr; + unsigned int len, ftid; + + len = sizeof(*fwr); + ftid = adapter->tids.ftid_base + fidx; + + skb = alloc_skb(len, GFP_KERNEL | __GFP_NOFAIL); + fwr = (struct fw_filter_wr *)__skb_put(skb, len); + t4_mk_filtdelwr(ftid, fwr, adapter->sge.fw_evtq.abs_id); + + /* Mark the filter as "pending" and ship off the Filter Work Request. + * When we get the Work Request Reply we'll clear the pending status. + */ + f->pending = 1; + t4_mgmt_tx(adapter, skb); + return 0; +} + +static u16 cxgb_select_queue(struct net_device *dev, struct sk_buff *skb, + void *accel_priv, select_queue_fallback_t fallback) +{ + int txq; + +#ifdef CONFIG_CHELSIO_T4_DCB + /* If a Data Center Bridging has been successfully negotiated on this + * link then we'll use the skb's priority to map it to a TX Queue. + * The skb's priority is determined via the VLAN Tag Priority Code + * Point field. + */ + if (cxgb4_dcb_enabled(dev)) { + u16 vlan_tci; + int err; + + err = vlan_get_tag(skb, &vlan_tci); + if (unlikely(err)) { + if (net_ratelimit()) + netdev_warn(dev, + "TX Packet without VLAN Tag on DCB Link\n"); + txq = 0; + } else { + txq = (vlan_tci & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT; + } + return txq; + } +#endif /* CONFIG_CHELSIO_T4_DCB */ + + if (select_queue) { + txq = (skb_rx_queue_recorded(skb) + ? skb_get_rx_queue(skb) + : smp_processor_id()); + + while (unlikely(txq >= dev->real_num_tx_queues)) + txq -= dev->real_num_tx_queues; + + return txq; + } + + return fallback(dev, skb) % dev->real_num_tx_queues; +} + +static inline int is_offload(const struct adapter *adap) +{ + return adap->params.offload; +} + +/* + * Implementation of ethtool operations. + */ + +static u32 get_msglevel(struct net_device *dev) +{ + return netdev2adap(dev)->msg_enable; +} + +static void set_msglevel(struct net_device *dev, u32 val) +{ + netdev2adap(dev)->msg_enable = val; +} + +static char stats_strings[][ETH_GSTRING_LEN] = { + "TxOctetsOK ", + "TxFramesOK ", + "TxBroadcastFrames ", + "TxMulticastFrames ", + "TxUnicastFrames ", + "TxErrorFrames ", + + "TxFrames64 ", + "TxFrames65To127 ", + "TxFrames128To255 ", + "TxFrames256To511 ", + "TxFrames512To1023 ", + "TxFrames1024To1518 ", + "TxFrames1519ToMax ", + + "TxFramesDropped ", + "TxPauseFrames ", + "TxPPP0Frames ", + "TxPPP1Frames ", + "TxPPP2Frames ", + "TxPPP3Frames ", + "TxPPP4Frames ", + "TxPPP5Frames ", + "TxPPP6Frames ", + "TxPPP7Frames ", + + "RxOctetsOK ", + "RxFramesOK ", + "RxBroadcastFrames ", + "RxMulticastFrames ", + "RxUnicastFrames ", + + "RxFramesTooLong ", + "RxJabberErrors ", + "RxFCSErrors ", + "RxLengthErrors ", + "RxSymbolErrors ", + "RxRuntFrames ", + + "RxFrames64 ", + "RxFrames65To127 ", + "RxFrames128To255 ", + "RxFrames256To511 ", + "RxFrames512To1023 ", + "RxFrames1024To1518 ", + "RxFrames1519ToMax ", + + "RxPauseFrames ", + "RxPPP0Frames ", + "RxPPP1Frames ", + "RxPPP2Frames ", + "RxPPP3Frames ", + "RxPPP4Frames ", + "RxPPP5Frames ", + "RxPPP6Frames ", + "RxPPP7Frames ", + + "RxBG0FramesDropped ", + "RxBG1FramesDropped ", + "RxBG2FramesDropped ", + "RxBG3FramesDropped ", + "RxBG0FramesTrunc ", + "RxBG1FramesTrunc ", + "RxBG2FramesTrunc ", + "RxBG3FramesTrunc ", + + "TSO ", + "TxCsumOffload ", + "RxCsumGood ", + "VLANextractions ", + "VLANinsertions ", + "GROpackets ", + "GROmerged ", + "WriteCoalSuccess ", + "WriteCoalFail ", +}; + +static int get_sset_count(struct net_device *dev, int sset) +{ + switch (sset) { + case ETH_SS_STATS: + return ARRAY_SIZE(stats_strings); + default: + return -EOPNOTSUPP; + } +} + +#define T4_REGMAP_SIZE (160 * 1024) +#define T5_REGMAP_SIZE (332 * 1024) + +static int get_regs_len(struct net_device *dev) +{ + struct adapter *adap = netdev2adap(dev); + if (is_t4(adap->params.chip)) + return T4_REGMAP_SIZE; + else + return T5_REGMAP_SIZE; +} + +static int get_eeprom_len(struct net_device *dev) +{ + return EEPROMSIZE; +} + +static void get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info) +{ + struct adapter *adapter = netdev2adap(dev); + + strlcpy(info->driver, KBUILD_MODNAME, sizeof(info->driver)); + strlcpy(info->version, DRV_VERSION, sizeof(info->version)); + strlcpy(info->bus_info, pci_name(adapter->pdev), + sizeof(info->bus_info)); + + if (adapter->params.fw_vers) + snprintf(info->fw_version, sizeof(info->fw_version), + "%u.%u.%u.%u, TP %u.%u.%u.%u", + FW_HDR_FW_VER_MAJOR_GET(adapter->params.fw_vers), + FW_HDR_FW_VER_MINOR_GET(adapter->params.fw_vers), + FW_HDR_FW_VER_MICRO_GET(adapter->params.fw_vers), + FW_HDR_FW_VER_BUILD_GET(adapter->params.fw_vers), + FW_HDR_FW_VER_MAJOR_GET(adapter->params.tp_vers), + FW_HDR_FW_VER_MINOR_GET(adapter->params.tp_vers), + FW_HDR_FW_VER_MICRO_GET(adapter->params.tp_vers), + FW_HDR_FW_VER_BUILD_GET(adapter->params.tp_vers)); +} + +static void get_strings(struct net_device *dev, u32 stringset, u8 *data) +{ + if (stringset == ETH_SS_STATS) + memcpy(data, stats_strings, sizeof(stats_strings)); +} + +/* + * port stats maintained per queue of the port. They should be in the same + * order as in stats_strings above. + */ +struct queue_port_stats { + u64 tso; + u64 tx_csum; + u64 rx_csum; + u64 vlan_ex; + u64 vlan_ins; + u64 gro_pkts; + u64 gro_merged; +}; + +static void collect_sge_port_stats(const struct adapter *adap, + const struct port_info *p, struct queue_port_stats *s) +{ + int i; + const struct sge_eth_txq *tx = &adap->sge.ethtxq[p->first_qset]; + const struct sge_eth_rxq *rx = &adap->sge.ethrxq[p->first_qset]; + + memset(s, 0, sizeof(*s)); + for (i = 0; i < p->nqsets; i++, rx++, tx++) { + s->tso += tx->tso; + s->tx_csum += tx->tx_cso; + s->rx_csum += rx->stats.rx_cso; + s->vlan_ex += rx->stats.vlan_ex; + s->vlan_ins += tx->vlan_ins; + s->gro_pkts += rx->stats.lro_pkts; + s->gro_merged += rx->stats.lro_merged; + } +} + +static void get_stats(struct net_device *dev, struct ethtool_stats *stats, + u64 *data) +{ + struct port_info *pi = netdev_priv(dev); + struct adapter *adapter = pi->adapter; + u32 val1, val2; + + t4_get_port_stats(adapter, pi->tx_chan, (struct port_stats *)data); + + data += sizeof(struct port_stats) / sizeof(u64); + collect_sge_port_stats(adapter, pi, (struct queue_port_stats *)data); + data += sizeof(struct queue_port_stats) / sizeof(u64); + if (!is_t4(adapter->params.chip)) { + t4_write_reg(adapter, SGE_STAT_CFG, STATSOURCE_T5(7)); + val1 = t4_read_reg(adapter, SGE_STAT_TOTAL); + val2 = t4_read_reg(adapter, SGE_STAT_MATCH); + *data = val1 - val2; + data++; + *data = val2; + data++; + } else { + memset(data, 0, 2 * sizeof(u64)); + *data += 2; + } +} + +/* + * Return a version number to identify the type of adapter. The scheme is: + * - bits 0..9: chip version + * - bits 10..15: chip revision + * - bits 16..23: register dump version + */ +static inline unsigned int mk_adap_vers(const struct adapter *ap) +{ + return CHELSIO_CHIP_VERSION(ap->params.chip) | + (CHELSIO_CHIP_RELEASE(ap->params.chip) << 10) | (1 << 16); +} + +static void reg_block_dump(struct adapter *ap, void *buf, unsigned int start, + unsigned int end) +{ + u32 *p = buf + start; + + for ( ; start <= end; start += sizeof(u32)) + *p++ = t4_read_reg(ap, start); +} + +static void get_regs(struct net_device *dev, struct ethtool_regs *regs, + void *buf) +{ + static const unsigned int t4_reg_ranges[] = { + 0x1008, 0x1108, + 0x1180, 0x11b4, + 0x11fc, 0x123c, + 0x1300, 0x173c, + 0x1800, 0x18fc, + 0x3000, 0x30d8, + 0x30e0, 0x5924, + 0x5960, 0x59d4, + 0x5a00, 0x5af8, + 0x6000, 0x6098, + 0x6100, 0x6150, + 0x6200, 0x6208, + 0x6240, 0x6248, + 0x6280, 0x6338, + 0x6370, 0x638c, + 0x6400, 0x643c, + 0x6500, 0x6524, + 0x6a00, 0x6a38, + 0x6a60, 0x6a78, + 0x6b00, 0x6b84, + 0x6bf0, 0x6c84, + 0x6cf0, 0x6d84, + 0x6df0, 0x6e84, + 0x6ef0, 0x6f84, + 0x6ff0, 0x7084, + 0x70f0, 0x7184, + 0x71f0, 0x7284, + 0x72f0, 0x7384, + 0x73f0, 0x7450, + 0x7500, 0x7530, + 0x7600, 0x761c, + 0x7680, 0x76cc, + 0x7700, 0x7798, + 0x77c0, 0x77fc, + 0x7900, 0x79fc, + 0x7b00, 0x7c38, + 0x7d00, 0x7efc, + 0x8dc0, 0x8e1c, + 0x8e30, 0x8e78, + 0x8ea0, 0x8f6c, + 0x8fc0, 0x9074, + 0x90fc, 0x90fc, + 0x9400, 0x9458, + 0x9600, 0x96bc, + 0x9800, 0x9808, + 0x9820, 0x983c, + 0x9850, 0x9864, + 0x9c00, 0x9c6c, + 0x9c80, 0x9cec, + 0x9d00, 0x9d6c, + 0x9d80, 0x9dec, + 0x9e00, 0x9e6c, + 0x9e80, 0x9eec, + 0x9f00, 0x9f6c, + 0x9f80, 0x9fec, + 0xd004, 0xd03c, + 0xdfc0, 0xdfe0, + 0xe000, 0xea7c, + 0xf000, 0x11110, + 0x11118, 0x11190, + 0x19040, 0x1906c, + 0x19078, 0x19080, + 0x1908c, 0x19124, + 0x19150, 0x191b0, + 0x191d0, 0x191e8, + 0x19238, 0x1924c, + 0x193f8, 0x19474, + 0x19490, 0x194f8, + 0x19800, 0x19f30, + 0x1a000, 0x1a06c, + 0x1a0b0, 0x1a120, + 0x1a128, 0x1a138, + 0x1a190, 0x1a1c4, + 0x1a1fc, 0x1a1fc, + 0x1e040, 0x1e04c, + 0x1e284, 0x1e28c, + 0x1e2c0, 0x1e2c0, + 0x1e2e0, 0x1e2e0, + 0x1e300, 0x1e384, + 0x1e3c0, 0x1e3c8, + 0x1e440, 0x1e44c, + 0x1e684, 0x1e68c, + 0x1e6c0, 0x1e6c0, + 0x1e6e0, 0x1e6e0, + 0x1e700, 0x1e784, + 0x1e7c0, 0x1e7c8, + 0x1e840, 0x1e84c, + 0x1ea84, 0x1ea8c, + 0x1eac0, 0x1eac0, + 0x1eae0, 0x1eae0, + 0x1eb00, 0x1eb84, + 0x1ebc0, 0x1ebc8, + 0x1ec40, 0x1ec4c, + 0x1ee84, 0x1ee8c, + 0x1eec0, 0x1eec0, + 0x1eee0, 0x1eee0, + 0x1ef00, 0x1ef84, + 0x1efc0, 0x1efc8, + 0x1f040, 0x1f04c, + 0x1f284, 0x1f28c, + 0x1f2c0, 0x1f2c0, + 0x1f2e0, 0x1f2e0, + 0x1f300, 0x1f384, + 0x1f3c0, 0x1f3c8, + 0x1f440, 0x1f44c, + 0x1f684, 0x1f68c, + 0x1f6c0, 0x1f6c0, + 0x1f6e0, 0x1f6e0, + 0x1f700, 0x1f784, + 0x1f7c0, 0x1f7c8, + 0x1f840, 0x1f84c, + 0x1fa84, 0x1fa8c, + 0x1fac0, 0x1fac0, + 0x1fae0, 0x1fae0, + 0x1fb00, 0x1fb84, + 0x1fbc0, 0x1fbc8, + 0x1fc40, 0x1fc4c, + 0x1fe84, 0x1fe8c, + 0x1fec0, 0x1fec0, + 0x1fee0, 0x1fee0, + 0x1ff00, 0x1ff84, + 0x1ffc0, 0x1ffc8, + 0x20000, 0x2002c, + 0x20100, 0x2013c, + 0x20190, 0x201c8, + 0x20200, 0x20318, + 0x20400, 0x20528, + 0x20540, 0x20614, + 0x21000, 0x21040, + 0x2104c, 0x21060, + 0x210c0, 0x210ec, + 0x21200, 0x21268, + 0x21270, 0x21284, + 0x212fc, 0x21388, + 0x21400, 0x21404, + 0x21500, 0x21518, + 0x2152c, 0x2153c, + 0x21550, 0x21554, + 0x21600, 0x21600, + 0x21608, 0x21628, + 0x21630, 0x2163c, + 0x21700, 0x2171c, + 0x21780, 0x2178c, + 0x21800, 0x21c38, + 0x21c80, 0x21d7c, + 0x21e00, 0x21e04, + 0x22000, 0x2202c, + 0x22100, 0x2213c, + 0x22190, 0x221c8, + 0x22200, 0x22318, + 0x22400, 0x22528, + 0x22540, 0x22614, + 0x23000, 0x23040, + 0x2304c, 0x23060, + 0x230c0, 0x230ec, + 0x23200, 0x23268, + 0x23270, 0x23284, + 0x232fc, 0x23388, + 0x23400, 0x23404, + 0x23500, 0x23518, + 0x2352c, 0x2353c, + 0x23550, 0x23554, + 0x23600, 0x23600, + 0x23608, 0x23628, + 0x23630, 0x2363c, + 0x23700, 0x2371c, + 0x23780, 0x2378c, + 0x23800, 0x23c38, + 0x23c80, 0x23d7c, + 0x23e00, 0x23e04, + 0x24000, 0x2402c, + 0x24100, 0x2413c, + 0x24190, 0x241c8, + 0x24200, 0x24318, + 0x24400, 0x24528, + 0x24540, 0x24614, + 0x25000, 0x25040, + 0x2504c, 0x25060, + 0x250c0, 0x250ec, + 0x25200, 0x25268, + 0x25270, 0x25284, + 0x252fc, 0x25388, + 0x25400, 0x25404, + 0x25500, 0x25518, + 0x2552c, 0x2553c, + 0x25550, 0x25554, + 0x25600, 0x25600, + 0x25608, 0x25628, + 0x25630, 0x2563c, + 0x25700, 0x2571c, + 0x25780, 0x2578c, + 0x25800, 0x25c38, + 0x25c80, 0x25d7c, + 0x25e00, 0x25e04, + 0x26000, 0x2602c, + 0x26100, 0x2613c, + 0x26190, 0x261c8, + 0x26200, 0x26318, + 0x26400, 0x26528, + 0x26540, 0x26614, + 0x27000, 0x27040, + 0x2704c, 0x27060, + 0x270c0, 0x270ec, + 0x27200, 0x27268, + 0x27270, 0x27284, + 0x272fc, 0x27388, + 0x27400, 0x27404, + 0x27500, 0x27518, + 0x2752c, 0x2753c, + 0x27550, 0x27554, + 0x27600, 0x27600, + 0x27608, 0x27628, + 0x27630, 0x2763c, + 0x27700, 0x2771c, + 0x27780, 0x2778c, + 0x27800, 0x27c38, + 0x27c80, 0x27d7c, + 0x27e00, 0x27e04 + }; + + static const unsigned int t5_reg_ranges[] = { + 0x1008, 0x1148, + 0x1180, 0x11b4, + 0x11fc, 0x123c, + 0x1280, 0x173c, + 0x1800, 0x18fc, + 0x3000, 0x3028, + 0x3060, 0x30d8, + 0x30e0, 0x30fc, + 0x3140, 0x357c, + 0x35a8, 0x35cc, + 0x35ec, 0x35ec, + 0x3600, 0x5624, + 0x56cc, 0x575c, + 0x580c, 0x5814, + 0x5890, 0x58bc, + 0x5940, 0x59dc, + 0x59fc, 0x5a18, + 0x5a60, 0x5a9c, + 0x5b9c, 0x5bfc, + 0x6000, 0x6040, + 0x6058, 0x614c, + 0x7700, 0x7798, + 0x77c0, 0x78fc, + 0x7b00, 0x7c54, + 0x7d00, 0x7efc, + 0x8dc0, 0x8de0, + 0x8df8, 0x8e84, + 0x8ea0, 0x8f84, + 0x8fc0, 0x90f8, + 0x9400, 0x9470, + 0x9600, 0x96f4, + 0x9800, 0x9808, + 0x9820, 0x983c, + 0x9850, 0x9864, + 0x9c00, 0x9c6c, + 0x9c80, 0x9cec, + 0x9d00, 0x9d6c, + 0x9d80, 0x9dec, + 0x9e00, 0x9e6c, + 0x9e80, 0x9eec, + 0x9f00, 0x9f6c, + 0x9f80, 0xa020, + 0xd004, 0xd03c, + 0xdfc0, 0xdfe0, + 0xe000, 0x11088, + 0x1109c, 0x11110, + 0x11118, 0x1117c, + 0x11190, 0x11204, + 0x19040, 0x1906c, + 0x19078, 0x19080, + 0x1908c, 0x19124, + 0x19150, 0x191b0, + 0x191d0, 0x191e8, + 0x19238, 0x19290, + 0x193f8, 0x19474, + 0x19490, 0x194cc, + 0x194f0, 0x194f8, + 0x19c00, 0x19c60, + 0x19c94, 0x19e10, + 0x19e50, 0x19f34, + 0x19f40, 0x19f50, + 0x19f90, 0x19fe4, + 0x1a000, 0x1a06c, + 0x1a0b0, 0x1a120, + 0x1a128, 0x1a138, + 0x1a190, 0x1a1c4, + 0x1a1fc, 0x1a1fc, + 0x1e008, 0x1e00c, + 0x1e040, 0x1e04c, + 0x1e284, 0x1e290, + 0x1e2c0, 0x1e2c0, + 0x1e2e0, 0x1e2e0, + 0x1e300, 0x1e384, + 0x1e3c0, 0x1e3c8, + 0x1e408, 0x1e40c, + 0x1e440, 0x1e44c, + 0x1e684, 0x1e690, + 0x1e6c0, 0x1e6c0, + 0x1e6e0, 0x1e6e0, + 0x1e700, 0x1e784, + 0x1e7c0, 0x1e7c8, + 0x1e808, 0x1e80c, + 0x1e840, 0x1e84c, + 0x1ea84, 0x1ea90, + 0x1eac0, 0x1eac0, + 0x1eae0, 0x1eae0, + 0x1eb00, 0x1eb84, + 0x1ebc0, 0x1ebc8, + 0x1ec08, 0x1ec0c, + 0x1ec40, 0x1ec4c, + 0x1ee84, 0x1ee90, + 0x1eec0, 0x1eec0, + 0x1eee0, 0x1eee0, + 0x1ef00, 0x1ef84, + 0x1efc0, 0x1efc8, + 0x1f008, 0x1f00c, + 0x1f040, 0x1f04c, + 0x1f284, 0x1f290, + 0x1f2c0, 0x1f2c0, + 0x1f2e0, 0x1f2e0, + 0x1f300, 0x1f384, + 0x1f3c0, 0x1f3c8, + 0x1f408, 0x1f40c, + 0x1f440, 0x1f44c, + 0x1f684, 0x1f690, + 0x1f6c0, 0x1f6c0, + 0x1f6e0, 0x1f6e0, + 0x1f700, 0x1f784, + 0x1f7c0, 0x1f7c8, + 0x1f808, 0x1f80c, + 0x1f840, 0x1f84c, + 0x1fa84, 0x1fa90, + 0x1fac0, 0x1fac0, + 0x1fae0, 0x1fae0, + 0x1fb00, 0x1fb84, + 0x1fbc0, 0x1fbc8, + 0x1fc08, 0x1fc0c, + 0x1fc40, 0x1fc4c, + 0x1fe84, 0x1fe90, + 0x1fec0, 0x1fec0, + 0x1fee0, 0x1fee0, + 0x1ff00, 0x1ff84, + 0x1ffc0, 0x1ffc8, + 0x30000, 0x30030, + 0x30100, 0x30144, + 0x30190, 0x301d0, + 0x30200, 0x30318, + 0x30400, 0x3052c, + 0x30540, 0x3061c, + 0x30800, 0x30834, + 0x308c0, 0x30908, + 0x30910, 0x309ac, + 0x30a00, 0x30a04, + 0x30a0c, 0x30a2c, + 0x30a44, 0x30a50, + 0x30a74, 0x30c24, + 0x30d08, 0x30d14, + 0x30d1c, 0x30d20, + 0x30d3c, 0x30d50, + 0x31200, 0x3120c, + 0x31220, 0x31220, + 0x31240, 0x31240, + 0x31600, 0x31600, + 0x31608, 0x3160c, + 0x31a00, 0x31a1c, + 0x31e04, 0x31e20, + 0x31e38, 0x31e3c, + 0x31e80, 0x31e80, + 0x31e88, 0x31ea8, + 0x31eb0, 0x31eb4, + 0x31ec8, 0x31ed4, + 0x31fb8, 0x32004, + 0x32208, 0x3223c, + 0x32600, 0x32630, + 0x32a00, 0x32abc, + 0x32b00, 0x32b70, + 0x33000, 0x33048, + 0x33060, 0x3309c, + 0x330f0, 0x33148, + 0x33160, 0x3319c, + 0x331f0, 0x332e4, + 0x332f8, 0x333e4, + 0x333f8, 0x33448, + 0x33460, 0x3349c, + 0x334f0, 0x33548, + 0x33560, 0x3359c, + 0x335f0, 0x336e4, + 0x336f8, 0x337e4, + 0x337f8, 0x337fc, + 0x33814, 0x33814, + 0x3382c, 0x3382c, + 0x33880, 0x3388c, + 0x338e8, 0x338ec, + 0x33900, 0x33948, + 0x33960, 0x3399c, + 0x339f0, 0x33ae4, + 0x33af8, 0x33b10, + 0x33b28, 0x33b28, + 0x33b3c, 0x33b50, + 0x33bf0, 0x33c10, + 0x33c28, 0x33c28, + 0x33c3c, 0x33c50, + 0x33cf0, 0x33cfc, + 0x34000, 0x34030, + 0x34100, 0x34144, + 0x34190, 0x341d0, + 0x34200, 0x34318, + 0x34400, 0x3452c, + 0x34540, 0x3461c, + 0x34800, 0x34834, + 0x348c0, 0x34908, + 0x34910, 0x349ac, + 0x34a00, 0x34a04, + 0x34a0c, 0x34a2c, + 0x34a44, 0x34a50, + 0x34a74, 0x34c24, + 0x34d08, 0x34d14, + 0x34d1c, 0x34d20, + 0x34d3c, 0x34d50, + 0x35200, 0x3520c, + 0x35220, 0x35220, + 0x35240, 0x35240, + 0x35600, 0x35600, + 0x35608, 0x3560c, + 0x35a00, 0x35a1c, + 0x35e04, 0x35e20, + 0x35e38, 0x35e3c, + 0x35e80, 0x35e80, + 0x35e88, 0x35ea8, + 0x35eb0, 0x35eb4, + 0x35ec8, 0x35ed4, + 0x35fb8, 0x36004, + 0x36208, 0x3623c, + 0x36600, 0x36630, + 0x36a00, 0x36abc, + 0x36b00, 0x36b70, + 0x37000, 0x37048, + 0x37060, 0x3709c, + 0x370f0, 0x37148, + 0x37160, 0x3719c, + 0x371f0, 0x372e4, + 0x372f8, 0x373e4, + 0x373f8, 0x37448, + 0x37460, 0x3749c, + 0x374f0, 0x37548, + 0x37560, 0x3759c, + 0x375f0, 0x376e4, + 0x376f8, 0x377e4, + 0x377f8, 0x377fc, + 0x37814, 0x37814, + 0x3782c, 0x3782c, + 0x37880, 0x3788c, + 0x378e8, 0x378ec, + 0x37900, 0x37948, + 0x37960, 0x3799c, + 0x379f0, 0x37ae4, + 0x37af8, 0x37b10, + 0x37b28, 0x37b28, + 0x37b3c, 0x37b50, + 0x37bf0, 0x37c10, + 0x37c28, 0x37c28, + 0x37c3c, 0x37c50, + 0x37cf0, 0x37cfc, + 0x38000, 0x38030, + 0x38100, 0x38144, + 0x38190, 0x381d0, + 0x38200, 0x38318, + 0x38400, 0x3852c, + 0x38540, 0x3861c, + 0x38800, 0x38834, + 0x388c0, 0x38908, + 0x38910, 0x389ac, + 0x38a00, 0x38a04, + 0x38a0c, 0x38a2c, + 0x38a44, 0x38a50, + 0x38a74, 0x38c24, + 0x38d08, 0x38d14, + 0x38d1c, 0x38d20, + 0x38d3c, 0x38d50, + 0x39200, 0x3920c, + 0x39220, 0x39220, + 0x39240, 0x39240, + 0x39600, 0x39600, + 0x39608, 0x3960c, + 0x39a00, 0x39a1c, + 0x39e04, 0x39e20, + 0x39e38, 0x39e3c, + 0x39e80, 0x39e80, + 0x39e88, 0x39ea8, + 0x39eb0, 0x39eb4, + 0x39ec8, 0x39ed4, + 0x39fb8, 0x3a004, + 0x3a208, 0x3a23c, + 0x3a600, 0x3a630, + 0x3aa00, 0x3aabc, + 0x3ab00, 0x3ab70, + 0x3b000, 0x3b048, + 0x3b060, 0x3b09c, + 0x3b0f0, 0x3b148, + 0x3b160, 0x3b19c, + 0x3b1f0, 0x3b2e4, + 0x3b2f8, 0x3b3e4, + 0x3b3f8, 0x3b448, + 0x3b460, 0x3b49c, + 0x3b4f0, 0x3b548, + 0x3b560, 0x3b59c, + 0x3b5f0, 0x3b6e4, + 0x3b6f8, 0x3b7e4, + 0x3b7f8, 0x3b7fc, + 0x3b814, 0x3b814, + 0x3b82c, 0x3b82c, + 0x3b880, 0x3b88c, + 0x3b8e8, 0x3b8ec, + 0x3b900, 0x3b948, + 0x3b960, 0x3b99c, + 0x3b9f0, 0x3bae4, + 0x3baf8, 0x3bb10, + 0x3bb28, 0x3bb28, + 0x3bb3c, 0x3bb50, + 0x3bbf0, 0x3bc10, + 0x3bc28, 0x3bc28, + 0x3bc3c, 0x3bc50, + 0x3bcf0, 0x3bcfc, + 0x3c000, 0x3c030, + 0x3c100, 0x3c144, + 0x3c190, 0x3c1d0, + 0x3c200, 0x3c318, + 0x3c400, 0x3c52c, + 0x3c540, 0x3c61c, + 0x3c800, 0x3c834, + 0x3c8c0, 0x3c908, + 0x3c910, 0x3c9ac, + 0x3ca00, 0x3ca04, + 0x3ca0c, 0x3ca2c, + 0x3ca44, 0x3ca50, + 0x3ca74, 0x3cc24, + 0x3cd08, 0x3cd14, + 0x3cd1c, 0x3cd20, + 0x3cd3c, 0x3cd50, + 0x3d200, 0x3d20c, + 0x3d220, 0x3d220, + 0x3d240, 0x3d240, + 0x3d600, 0x3d600, + 0x3d608, 0x3d60c, + 0x3da00, 0x3da1c, + 0x3de04, 0x3de20, + 0x3de38, 0x3de3c, + 0x3de80, 0x3de80, + 0x3de88, 0x3dea8, + 0x3deb0, 0x3deb4, + 0x3dec8, 0x3ded4, + 0x3dfb8, 0x3e004, + 0x3e208, 0x3e23c, + 0x3e600, 0x3e630, + 0x3ea00, 0x3eabc, + 0x3eb00, 0x3eb70, + 0x3f000, 0x3f048, + 0x3f060, 0x3f09c, + 0x3f0f0, 0x3f148, + 0x3f160, 0x3f19c, + 0x3f1f0, 0x3f2e4, + 0x3f2f8, 0x3f3e4, + 0x3f3f8, 0x3f448, + 0x3f460, 0x3f49c, + 0x3f4f0, 0x3f548, + 0x3f560, 0x3f59c, + 0x3f5f0, 0x3f6e4, + 0x3f6f8, 0x3f7e4, + 0x3f7f8, 0x3f7fc, + 0x3f814, 0x3f814, + 0x3f82c, 0x3f82c, + 0x3f880, 0x3f88c, + 0x3f8e8, 0x3f8ec, + 0x3f900, 0x3f948, + 0x3f960, 0x3f99c, + 0x3f9f0, 0x3fae4, + 0x3faf8, 0x3fb10, + 0x3fb28, 0x3fb28, + 0x3fb3c, 0x3fb50, + 0x3fbf0, 0x3fc10, + 0x3fc28, 0x3fc28, + 0x3fc3c, 0x3fc50, + 0x3fcf0, 0x3fcfc, + 0x40000, 0x4000c, + 0x40040, 0x40068, + 0x40080, 0x40144, + 0x40180, 0x4018c, + 0x40200, 0x40298, + 0x402ac, 0x4033c, + 0x403f8, 0x403fc, + 0x41304, 0x413c4, + 0x41400, 0x4141c, + 0x41480, 0x414d0, + 0x44000, 0x44078, + 0x440c0, 0x44278, + 0x442c0, 0x44478, + 0x444c0, 0x44678, + 0x446c0, 0x44878, + 0x448c0, 0x449fc, + 0x45000, 0x45068, + 0x45080, 0x45084, + 0x450a0, 0x450b0, + 0x45200, 0x45268, + 0x45280, 0x45284, + 0x452a0, 0x452b0, + 0x460c0, 0x460e4, + 0x47000, 0x4708c, + 0x47200, 0x47250, + 0x47400, 0x47420, + 0x47600, 0x47618, + 0x47800, 0x47814, + 0x48000, 0x4800c, + 0x48040, 0x48068, + 0x48080, 0x48144, + 0x48180, 0x4818c, + 0x48200, 0x48298, + 0x482ac, 0x4833c, + 0x483f8, 0x483fc, + 0x49304, 0x493c4, + 0x49400, 0x4941c, + 0x49480, 0x494d0, + 0x4c000, 0x4c078, + 0x4c0c0, 0x4c278, + 0x4c2c0, 0x4c478, + 0x4c4c0, 0x4c678, + 0x4c6c0, 0x4c878, + 0x4c8c0, 0x4c9fc, + 0x4d000, 0x4d068, + 0x4d080, 0x4d084, + 0x4d0a0, 0x4d0b0, + 0x4d200, 0x4d268, + 0x4d280, 0x4d284, + 0x4d2a0, 0x4d2b0, + 0x4e0c0, 0x4e0e4, + 0x4f000, 0x4f08c, + 0x4f200, 0x4f250, + 0x4f400, 0x4f420, + 0x4f600, 0x4f618, + 0x4f800, 0x4f814, + 0x50000, 0x500cc, + 0x50400, 0x50400, + 0x50800, 0x508cc, + 0x50c00, 0x50c00, + 0x51000, 0x5101c, + 0x51300, 0x51308, + }; + + int i; + struct adapter *ap = netdev2adap(dev); + static const unsigned int *reg_ranges; + int arr_size = 0, buf_size = 0; + + if (is_t4(ap->params.chip)) { + reg_ranges = &t4_reg_ranges[0]; + arr_size = ARRAY_SIZE(t4_reg_ranges); + buf_size = T4_REGMAP_SIZE; + } else { + reg_ranges = &t5_reg_ranges[0]; + arr_size = ARRAY_SIZE(t5_reg_ranges); + buf_size = T5_REGMAP_SIZE; + } + + regs->version = mk_adap_vers(ap); + + memset(buf, 0, buf_size); + for (i = 0; i < arr_size; i += 2) + reg_block_dump(ap, buf, reg_ranges[i], reg_ranges[i + 1]); +} + +static int restart_autoneg(struct net_device *dev) +{ + struct port_info *p = netdev_priv(dev); + + if (!netif_running(dev)) + return -EAGAIN; + if (p->link_cfg.autoneg != AUTONEG_ENABLE) + return -EINVAL; + t4_restart_aneg(p->adapter, p->adapter->fn, p->tx_chan); + return 0; +} + +static int identify_port(struct net_device *dev, + enum ethtool_phys_id_state state) +{ + unsigned int val; + struct adapter *adap = netdev2adap(dev); + + if (state == ETHTOOL_ID_ACTIVE) + val = 0xffff; + else if (state == ETHTOOL_ID_INACTIVE) + val = 0; + else + return -EINVAL; + + return t4_identify_port(adap, adap->fn, netdev2pinfo(dev)->viid, val); +} + +static unsigned int from_fw_linkcaps(unsigned int type, unsigned int caps) +{ + unsigned int v = 0; + + if (type == FW_PORT_TYPE_BT_SGMII || type == FW_PORT_TYPE_BT_XFI || + type == FW_PORT_TYPE_BT_XAUI) { + v |= SUPPORTED_TP; + if (caps & FW_PORT_CAP_SPEED_100M) + v |= SUPPORTED_100baseT_Full; + if (caps & FW_PORT_CAP_SPEED_1G) + v |= SUPPORTED_1000baseT_Full; + if (caps & FW_PORT_CAP_SPEED_10G) + v |= SUPPORTED_10000baseT_Full; + } else if (type == FW_PORT_TYPE_KX4 || type == FW_PORT_TYPE_KX) { + v |= SUPPORTED_Backplane; + if (caps & FW_PORT_CAP_SPEED_1G) + v |= SUPPORTED_1000baseKX_Full; + if (caps & FW_PORT_CAP_SPEED_10G) + v |= SUPPORTED_10000baseKX4_Full; + } else if (type == FW_PORT_TYPE_KR) + v |= SUPPORTED_Backplane | SUPPORTED_10000baseKR_Full; + else if (type == FW_PORT_TYPE_BP_AP) + v |= SUPPORTED_Backplane | SUPPORTED_10000baseR_FEC | + SUPPORTED_10000baseKR_Full | SUPPORTED_1000baseKX_Full; + else if (type == FW_PORT_TYPE_BP4_AP) + v |= SUPPORTED_Backplane | SUPPORTED_10000baseR_FEC | + SUPPORTED_10000baseKR_Full | SUPPORTED_1000baseKX_Full | + SUPPORTED_10000baseKX4_Full; + else if (type == FW_PORT_TYPE_FIBER_XFI || + type == FW_PORT_TYPE_FIBER_XAUI || type == FW_PORT_TYPE_SFP) { + v |= SUPPORTED_FIBRE; + if (caps & FW_PORT_CAP_SPEED_1G) + v |= SUPPORTED_1000baseT_Full; + if (caps & FW_PORT_CAP_SPEED_10G) + v |= SUPPORTED_10000baseT_Full; + } else if (type == FW_PORT_TYPE_BP40_BA) + v |= SUPPORTED_40000baseSR4_Full; + + if (caps & FW_PORT_CAP_ANEG) + v |= SUPPORTED_Autoneg; + return v; +} + +static unsigned int to_fw_linkcaps(unsigned int caps) +{ + unsigned int v = 0; + + if (caps & ADVERTISED_100baseT_Full) + v |= FW_PORT_CAP_SPEED_100M; + if (caps & ADVERTISED_1000baseT_Full) + v |= FW_PORT_CAP_SPEED_1G; + if (caps & ADVERTISED_10000baseT_Full) + v |= FW_PORT_CAP_SPEED_10G; + if (caps & ADVERTISED_40000baseSR4_Full) + v |= FW_PORT_CAP_SPEED_40G; + return v; +} + +static int get_settings(struct net_device *dev, struct ethtool_cmd *cmd) +{ + const struct port_info *p = netdev_priv(dev); + + if (p->port_type == FW_PORT_TYPE_BT_SGMII || + p->port_type == FW_PORT_TYPE_BT_XFI || + p->port_type == FW_PORT_TYPE_BT_XAUI) + cmd->port = PORT_TP; + else if (p->port_type == FW_PORT_TYPE_FIBER_XFI || + p->port_type == FW_PORT_TYPE_FIBER_XAUI) + cmd->port = PORT_FIBRE; + else if (p->port_type == FW_PORT_TYPE_SFP || + p->port_type == FW_PORT_TYPE_QSFP_10G || + p->port_type == FW_PORT_TYPE_QSFP) { + if (p->mod_type == FW_PORT_MOD_TYPE_LR || + p->mod_type == FW_PORT_MOD_TYPE_SR || + p->mod_type == FW_PORT_MOD_TYPE_ER || + p->mod_type == FW_PORT_MOD_TYPE_LRM) + cmd->port = PORT_FIBRE; + else if (p->mod_type == FW_PORT_MOD_TYPE_TWINAX_PASSIVE || + p->mod_type == FW_PORT_MOD_TYPE_TWINAX_ACTIVE) + cmd->port = PORT_DA; + else + cmd->port = PORT_OTHER; + } else + cmd->port = PORT_OTHER; + + if (p->mdio_addr >= 0) { + cmd->phy_address = p->mdio_addr; + cmd->transceiver = XCVR_EXTERNAL; + cmd->mdio_support = p->port_type == FW_PORT_TYPE_BT_SGMII ? + MDIO_SUPPORTS_C22 : MDIO_SUPPORTS_C45; + } else { + cmd->phy_address = 0; /* not really, but no better option */ + cmd->transceiver = XCVR_INTERNAL; + cmd->mdio_support = 0; + } + + cmd->supported = from_fw_linkcaps(p->port_type, p->link_cfg.supported); + cmd->advertising = from_fw_linkcaps(p->port_type, + p->link_cfg.advertising); + ethtool_cmd_speed_set(cmd, + netif_carrier_ok(dev) ? p->link_cfg.speed : 0); + cmd->duplex = DUPLEX_FULL; + cmd->autoneg = p->link_cfg.autoneg; + cmd->maxtxpkt = 0; + cmd->maxrxpkt = 0; + return 0; +} + +static unsigned int speed_to_caps(int speed) +{ + if (speed == 100) + return FW_PORT_CAP_SPEED_100M; + if (speed == 1000) + return FW_PORT_CAP_SPEED_1G; + if (speed == 10000) + return FW_PORT_CAP_SPEED_10G; + if (speed == 40000) + return FW_PORT_CAP_SPEED_40G; + return 0; +} + +static int set_settings(struct net_device *dev, struct ethtool_cmd *cmd) +{ + unsigned int cap; + struct port_info *p = netdev_priv(dev); + struct link_config *lc = &p->link_cfg; + u32 speed = ethtool_cmd_speed(cmd); + + if (cmd->duplex != DUPLEX_FULL) /* only full-duplex supported */ + return -EINVAL; + + if (!(lc->supported & FW_PORT_CAP_ANEG)) { + /* + * PHY offers a single speed. See if that's what's + * being requested. + */ + if (cmd->autoneg == AUTONEG_DISABLE && + (lc->supported & speed_to_caps(speed))) + return 0; + return -EINVAL; + } + + if (cmd->autoneg == AUTONEG_DISABLE) { + cap = speed_to_caps(speed); + + if (!(lc->supported & cap) || + (speed == 1000) || + (speed == 10000) || + (speed == 40000)) + return -EINVAL; + lc->requested_speed = cap; + lc->advertising = 0; + } else { + cap = to_fw_linkcaps(cmd->advertising); + if (!(lc->supported & cap)) + return -EINVAL; + lc->requested_speed = 0; + lc->advertising = cap | FW_PORT_CAP_ANEG; + } + lc->autoneg = cmd->autoneg; + + if (netif_running(dev)) + return t4_link_start(p->adapter, p->adapter->fn, p->tx_chan, + lc); + return 0; +} + +static void get_pauseparam(struct net_device *dev, + struct ethtool_pauseparam *epause) +{ + struct port_info *p = netdev_priv(dev); + + epause->autoneg = (p->link_cfg.requested_fc & PAUSE_AUTONEG) != 0; + epause->rx_pause = (p->link_cfg.fc & PAUSE_RX) != 0; + epause->tx_pause = (p->link_cfg.fc & PAUSE_TX) != 0; +} + +static int set_pauseparam(struct net_device *dev, + struct ethtool_pauseparam *epause) +{ + struct port_info *p = netdev_priv(dev); + struct link_config *lc = &p->link_cfg; + + if (epause->autoneg == AUTONEG_DISABLE) + lc->requested_fc = 0; + else if (lc->supported & FW_PORT_CAP_ANEG) + lc->requested_fc = PAUSE_AUTONEG; + else + return -EINVAL; + + if (epause->rx_pause) + lc->requested_fc |= PAUSE_RX; + if (epause->tx_pause) + lc->requested_fc |= PAUSE_TX; + if (netif_running(dev)) + return t4_link_start(p->adapter, p->adapter->fn, p->tx_chan, + lc); + return 0; +} + +static void get_sge_param(struct net_device *dev, struct ethtool_ringparam *e) +{ + const struct port_info *pi = netdev_priv(dev); + const struct sge *s = &pi->adapter->sge; + + e->rx_max_pending = MAX_RX_BUFFERS; + e->rx_mini_max_pending = MAX_RSPQ_ENTRIES; + e->rx_jumbo_max_pending = 0; + e->tx_max_pending = MAX_TXQ_ENTRIES; + + e->rx_pending = s->ethrxq[pi->first_qset].fl.size - 8; + e->rx_mini_pending = s->ethrxq[pi->first_qset].rspq.size; + e->rx_jumbo_pending = 0; + e->tx_pending = s->ethtxq[pi->first_qset].q.size; +} + +static int set_sge_param(struct net_device *dev, struct ethtool_ringparam *e) +{ + int i; + const struct port_info *pi = netdev_priv(dev); + struct adapter *adapter = pi->adapter; + struct sge *s = &adapter->sge; + + if (e->rx_pending > MAX_RX_BUFFERS || e->rx_jumbo_pending || + e->tx_pending > MAX_TXQ_ENTRIES || + e->rx_mini_pending > MAX_RSPQ_ENTRIES || + e->rx_mini_pending < MIN_RSPQ_ENTRIES || + e->rx_pending < MIN_FL_ENTRIES || e->tx_pending < MIN_TXQ_ENTRIES) + return -EINVAL; + + if (adapter->flags & FULL_INIT_DONE) + return -EBUSY; + + for (i = 0; i < pi->nqsets; ++i) { + s->ethtxq[pi->first_qset + i].q.size = e->tx_pending; + s->ethrxq[pi->first_qset + i].fl.size = e->rx_pending + 8; + s->ethrxq[pi->first_qset + i].rspq.size = e->rx_mini_pending; + } + return 0; +} + +static int closest_timer(const struct sge *s, int time) +{ + int i, delta, match = 0, min_delta = INT_MAX; + + for (i = 0; i < ARRAY_SIZE(s->timer_val); i++) { + delta = time - s->timer_val[i]; + if (delta < 0) + delta = -delta; + if (delta < min_delta) { + min_delta = delta; + match = i; + } + } + return match; +} + +static int closest_thres(const struct sge *s, int thres) +{ + int i, delta, match = 0, min_delta = INT_MAX; + + for (i = 0; i < ARRAY_SIZE(s->counter_val); i++) { + delta = thres - s->counter_val[i]; + if (delta < 0) + delta = -delta; + if (delta < min_delta) { + min_delta = delta; + match = i; + } + } + return match; +} + +/* + * Return a queue's interrupt hold-off time in us. 0 means no timer. + */ +static unsigned int qtimer_val(const struct adapter *adap, + const struct sge_rspq *q) +{ + unsigned int idx = q->intr_params >> 1; + + return idx < SGE_NTIMERS ? adap->sge.timer_val[idx] : 0; +} + +/** + * set_rspq_intr_params - set a queue's interrupt holdoff parameters + * @q: the Rx queue + * @us: the hold-off time in us, or 0 to disable timer + * @cnt: the hold-off packet count, or 0 to disable counter + * + * Sets an Rx queue's interrupt hold-off time and packet count. At least + * one of the two needs to be enabled for the queue to generate interrupts. + */ +static int set_rspq_intr_params(struct sge_rspq *q, + unsigned int us, unsigned int cnt) +{ + struct adapter *adap = q->adap; + + if ((us | cnt) == 0) + cnt = 1; + + if (cnt) { + int err; + u32 v, new_idx; + + new_idx = closest_thres(&adap->sge, cnt); + if (q->desc && q->pktcnt_idx != new_idx) { + /* the queue has already been created, update it */ + v = FW_PARAMS_MNEM(FW_PARAMS_MNEM_DMAQ) | + FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DMAQ_IQ_INTCNTTHRESH) | + FW_PARAMS_PARAM_YZ(q->cntxt_id); + err = t4_set_params(adap, adap->fn, adap->fn, 0, 1, &v, + &new_idx); + if (err) + return err; + } + q->pktcnt_idx = new_idx; + } + + us = us == 0 ? 6 : closest_timer(&adap->sge, us); + q->intr_params = QINTR_TIMER_IDX(us) | (cnt > 0 ? QINTR_CNT_EN : 0); + return 0; +} + +/** + * set_rx_intr_params - set a net devices's RX interrupt holdoff paramete! + * @dev: the network device + * @us: the hold-off time in us, or 0 to disable timer + * @cnt: the hold-off packet count, or 0 to disable counter + * + * Set the RX interrupt hold-off parameters for a network device. + */ +static int set_rx_intr_params(struct net_device *dev, + unsigned int us, unsigned int cnt) +{ + int i, err; + struct port_info *pi = netdev_priv(dev); + struct adapter *adap = pi->adapter; + struct sge_eth_rxq *q = &adap->sge.ethrxq[pi->first_qset]; + + for (i = 0; i < pi->nqsets; i++, q++) { + err = set_rspq_intr_params(&q->rspq, us, cnt); + if (err) + return err; + } + return 0; +} + +static int set_adaptive_rx_setting(struct net_device *dev, int adaptive_rx) +{ + int i; + struct port_info *pi = netdev_priv(dev); + struct adapter *adap = pi->adapter; + struct sge_eth_rxq *q = &adap->sge.ethrxq[pi->first_qset]; + + for (i = 0; i < pi->nqsets; i++, q++) + q->rspq.adaptive_rx = adaptive_rx; + + return 0; +} + +static int get_adaptive_rx_setting(struct net_device *dev) +{ + struct port_info *pi = netdev_priv(dev); + struct adapter *adap = pi->adapter; + struct sge_eth_rxq *q = &adap->sge.ethrxq[pi->first_qset]; + + return q->rspq.adaptive_rx; +} + +static int set_coalesce(struct net_device *dev, struct ethtool_coalesce *c) +{ + set_adaptive_rx_setting(dev, c->use_adaptive_rx_coalesce); + return set_rx_intr_params(dev, c->rx_coalesce_usecs, + c->rx_max_coalesced_frames); +} + +static int get_coalesce(struct net_device *dev, struct ethtool_coalesce *c) +{ + const struct port_info *pi = netdev_priv(dev); + const struct adapter *adap = pi->adapter; + const struct sge_rspq *rq = &adap->sge.ethrxq[pi->first_qset].rspq; + + c->rx_coalesce_usecs = qtimer_val(adap, rq); + c->rx_max_coalesced_frames = (rq->intr_params & QINTR_CNT_EN) ? + adap->sge.counter_val[rq->pktcnt_idx] : 0; + c->use_adaptive_rx_coalesce = get_adaptive_rx_setting(dev); + return 0; +} + +/** + * eeprom_ptov - translate a physical EEPROM address to virtual + * @phys_addr: the physical EEPROM address + * @fn: the PCI function number + * @sz: size of function-specific area + * + * Translate a physical EEPROM address to virtual. The first 1K is + * accessed through virtual addresses starting at 31K, the rest is + * accessed through virtual addresses starting at 0. + * + * The mapping is as follows: + * [0..1K) -> [31K..32K) + * [1K..1K+A) -> [31K-A..31K) + * [1K+A..ES) -> [0..ES-A-1K) + * + * where A = @fn * @sz, and ES = EEPROM size. + */ +static int eeprom_ptov(unsigned int phys_addr, unsigned int fn, unsigned int sz) +{ + fn *= sz; + if (phys_addr < 1024) + return phys_addr + (31 << 10); + if (phys_addr < 1024 + fn) + return 31744 - fn + phys_addr - 1024; + if (phys_addr < EEPROMSIZE) + return phys_addr - 1024 - fn; + return -EINVAL; +} + +/* + * The next two routines implement eeprom read/write from physical addresses. + */ +static int eeprom_rd_phys(struct adapter *adap, unsigned int phys_addr, u32 *v) +{ + int vaddr = eeprom_ptov(phys_addr, adap->fn, EEPROMPFSIZE); + + if (vaddr >= 0) + vaddr = pci_read_vpd(adap->pdev, vaddr, sizeof(u32), v); + return vaddr < 0 ? vaddr : 0; +} + +static int eeprom_wr_phys(struct adapter *adap, unsigned int phys_addr, u32 v) +{ + int vaddr = eeprom_ptov(phys_addr, adap->fn, EEPROMPFSIZE); + + if (vaddr >= 0) + vaddr = pci_write_vpd(adap->pdev, vaddr, sizeof(u32), &v); + return vaddr < 0 ? vaddr : 0; +} + +#define EEPROM_MAGIC 0x38E2F10C + +static int get_eeprom(struct net_device *dev, struct ethtool_eeprom *e, + u8 *data) +{ + int i, err = 0; + struct adapter *adapter = netdev2adap(dev); + + u8 *buf = kmalloc(EEPROMSIZE, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + e->magic = EEPROM_MAGIC; + for (i = e->offset & ~3; !err && i < e->offset + e->len; i += 4) + err = eeprom_rd_phys(adapter, i, (u32 *)&buf[i]); + + if (!err) + memcpy(data, buf + e->offset, e->len); + kfree(buf); + return err; +} + +static int set_eeprom(struct net_device *dev, struct ethtool_eeprom *eeprom, + u8 *data) +{ + u8 *buf; + int err = 0; + u32 aligned_offset, aligned_len, *p; + struct adapter *adapter = netdev2adap(dev); + + if (eeprom->magic != EEPROM_MAGIC) + return -EINVAL; + + aligned_offset = eeprom->offset & ~3; + aligned_len = (eeprom->len + (eeprom->offset & 3) + 3) & ~3; + + if (adapter->fn > 0) { + u32 start = 1024 + adapter->fn * EEPROMPFSIZE; + + if (aligned_offset < start || + aligned_offset + aligned_len > start + EEPROMPFSIZE) + return -EPERM; + } + + if (aligned_offset != eeprom->offset || aligned_len != eeprom->len) { + /* + * RMW possibly needed for first or last words. + */ + buf = kmalloc(aligned_len, GFP_KERNEL); + if (!buf) + return -ENOMEM; + err = eeprom_rd_phys(adapter, aligned_offset, (u32 *)buf); + if (!err && aligned_len > 4) + err = eeprom_rd_phys(adapter, + aligned_offset + aligned_len - 4, + (u32 *)&buf[aligned_len - 4]); + if (err) + goto out; + memcpy(buf + (eeprom->offset & 3), data, eeprom->len); + } else + buf = data; + + err = t4_seeprom_wp(adapter, false); + if (err) + goto out; + + for (p = (u32 *)buf; !err && aligned_len; aligned_len -= 4, p++) { + err = eeprom_wr_phys(adapter, aligned_offset, *p); + aligned_offset += 4; + } + + if (!err) + err = t4_seeprom_wp(adapter, true); +out: + if (buf != data) + kfree(buf); + return err; +} + +static int set_flash(struct net_device *netdev, struct ethtool_flash *ef) +{ + int ret; + const struct firmware *fw; + struct adapter *adap = netdev2adap(netdev); + unsigned int mbox = FW_PCIE_FW_MASTER_MASK + 1; + + ef->data[sizeof(ef->data) - 1] = '\0'; + ret = request_firmware(&fw, ef->data, adap->pdev_dev); + if (ret < 0) + return ret; + + /* If the adapter has been fully initialized then we'll go ahead and + * try to get the firmware's cooperation in upgrading to the new + * firmware image otherwise we'll try to do the entire job from the + * host ... and we always "force" the operation in this path. + */ + if (adap->flags & FULL_INIT_DONE) + mbox = adap->mbox; + + ret = t4_fw_upgrade(adap, mbox, fw->data, fw->size, 1); + release_firmware(fw); + if (!ret) + dev_info(adap->pdev_dev, "loaded firmware %s," + " reload cxgb4 driver\n", ef->data); + return ret; +} + +#define WOL_SUPPORTED (WAKE_BCAST | WAKE_MAGIC) +#define BCAST_CRC 0xa0ccc1a6 + +static void get_wol(struct net_device *dev, struct ethtool_wolinfo *wol) +{ + wol->supported = WAKE_BCAST | WAKE_MAGIC; + wol->wolopts = netdev2adap(dev)->wol; + memset(&wol->sopass, 0, sizeof(wol->sopass)); +} + +static int set_wol(struct net_device *dev, struct ethtool_wolinfo *wol) +{ + int err = 0; + struct port_info *pi = netdev_priv(dev); + + if (wol->wolopts & ~WOL_SUPPORTED) + return -EINVAL; + t4_wol_magic_enable(pi->adapter, pi->tx_chan, + (wol->wolopts & WAKE_MAGIC) ? dev->dev_addr : NULL); + if (wol->wolopts & WAKE_BCAST) { + err = t4_wol_pat_enable(pi->adapter, pi->tx_chan, 0xfe, ~0ULL, + ~0ULL, 0, false); + if (!err) + err = t4_wol_pat_enable(pi->adapter, pi->tx_chan, 1, + ~6ULL, ~0ULL, BCAST_CRC, true); + } else + t4_wol_pat_enable(pi->adapter, pi->tx_chan, 0, 0, 0, 0, false); + return err; +} + +static int cxgb_set_features(struct net_device *dev, netdev_features_t features) +{ + const struct port_info *pi = netdev_priv(dev); + netdev_features_t changed = dev->features ^ features; + int err; + + if (!(changed & NETIF_F_HW_VLAN_CTAG_RX)) + return 0; + + err = t4_set_rxmode(pi->adapter, pi->adapter->fn, pi->viid, -1, + -1, -1, -1, + !!(features & NETIF_F_HW_VLAN_CTAG_RX), true); + if (unlikely(err)) + dev->features = features ^ NETIF_F_HW_VLAN_CTAG_RX; + return err; +} + +static u32 get_rss_table_size(struct net_device *dev) +{ + const struct port_info *pi = netdev_priv(dev); + + return pi->rss_size; +} + +static int get_rss_table(struct net_device *dev, u32 *p, u8 *key) +{ + const struct port_info *pi = netdev_priv(dev); + unsigned int n = pi->rss_size; + + while (n--) + p[n] = pi->rss[n]; + return 0; +} + +static int set_rss_table(struct net_device *dev, const u32 *p, const u8 *key) +{ + unsigned int i; + struct port_info *pi = netdev_priv(dev); + + for (i = 0; i < pi->rss_size; i++) + pi->rss[i] = p[i]; + if (pi->adapter->flags & FULL_INIT_DONE) + return write_rss(pi, pi->rss); + return 0; +} + +static int get_rxnfc(struct net_device *dev, struct ethtool_rxnfc *info, + u32 *rules) +{ + const struct port_info *pi = netdev_priv(dev); + + switch (info->cmd) { + case ETHTOOL_GRXFH: { + unsigned int v = pi->rss_mode; + + info->data = 0; + switch (info->flow_type) { + case TCP_V4_FLOW: + if (v & FW_RSS_VI_CONFIG_CMD_IP4FOURTUPEN) + info->data = RXH_IP_SRC | RXH_IP_DST | + RXH_L4_B_0_1 | RXH_L4_B_2_3; + else if (v & FW_RSS_VI_CONFIG_CMD_IP4TWOTUPEN) + info->data = RXH_IP_SRC | RXH_IP_DST; + break; + case UDP_V4_FLOW: + if ((v & FW_RSS_VI_CONFIG_CMD_IP4FOURTUPEN) && + (v & FW_RSS_VI_CONFIG_CMD_UDPEN)) + info->data = RXH_IP_SRC | RXH_IP_DST | + RXH_L4_B_0_1 | RXH_L4_B_2_3; + else if (v & FW_RSS_VI_CONFIG_CMD_IP4TWOTUPEN) + info->data = RXH_IP_SRC | RXH_IP_DST; + break; + case SCTP_V4_FLOW: + case AH_ESP_V4_FLOW: + case IPV4_FLOW: + if (v & FW_RSS_VI_CONFIG_CMD_IP4TWOTUPEN) + info->data = RXH_IP_SRC | RXH_IP_DST; + break; + case TCP_V6_FLOW: + if (v & FW_RSS_VI_CONFIG_CMD_IP6FOURTUPEN) + info->data = RXH_IP_SRC | RXH_IP_DST | + RXH_L4_B_0_1 | RXH_L4_B_2_3; + else if (v & FW_RSS_VI_CONFIG_CMD_IP6TWOTUPEN) + info->data = RXH_IP_SRC | RXH_IP_DST; + break; + case UDP_V6_FLOW: + if ((v & FW_RSS_VI_CONFIG_CMD_IP6FOURTUPEN) && + (v & FW_RSS_VI_CONFIG_CMD_UDPEN)) + info->data = RXH_IP_SRC | RXH_IP_DST | + RXH_L4_B_0_1 | RXH_L4_B_2_3; + else if (v & FW_RSS_VI_CONFIG_CMD_IP6TWOTUPEN) + info->data = RXH_IP_SRC | RXH_IP_DST; + break; + case SCTP_V6_FLOW: + case AH_ESP_V6_FLOW: + case IPV6_FLOW: + if (v & FW_RSS_VI_CONFIG_CMD_IP6TWOTUPEN) + info->data = RXH_IP_SRC | RXH_IP_DST; + break; + } + return 0; + } + case ETHTOOL_GRXRINGS: + info->data = pi->nqsets; + return 0; + } + return -EOPNOTSUPP; +} + +static const struct ethtool_ops cxgb_ethtool_ops = { + .get_settings = get_settings, + .set_settings = set_settings, + .get_drvinfo = get_drvinfo, + .get_msglevel = get_msglevel, + .set_msglevel = set_msglevel, + .get_ringparam = get_sge_param, + .set_ringparam = set_sge_param, + .get_coalesce = get_coalesce, + .set_coalesce = set_coalesce, + .get_eeprom_len = get_eeprom_len, + .get_eeprom = get_eeprom, + .set_eeprom = set_eeprom, + .get_pauseparam = get_pauseparam, + .set_pauseparam = set_pauseparam, + .get_link = ethtool_op_get_link, + .get_strings = get_strings, + .set_phys_id = identify_port, + .nway_reset = restart_autoneg, + .get_sset_count = get_sset_count, + .get_ethtool_stats = get_stats, + .get_regs_len = get_regs_len, + .get_regs = get_regs, + .get_wol = get_wol, + .set_wol = set_wol, + .get_rxnfc = get_rxnfc, + .get_rxfh_indir_size = get_rss_table_size, + .get_rxfh = get_rss_table, + .set_rxfh = set_rss_table, + .flash_device = set_flash, +}; + +/* + * debugfs support + */ +static ssize_t mem_read(struct file *file, char __user *buf, size_t count, + loff_t *ppos) +{ + loff_t pos = *ppos; + loff_t avail = file_inode(file)->i_size; + unsigned int mem = (uintptr_t)file->private_data & 3; + struct adapter *adap = file->private_data - mem; + __be32 *data; + int ret; + + if (pos < 0) + return -EINVAL; + if (pos >= avail) + return 0; + if (count > avail - pos) + count = avail - pos; + + data = t4_alloc_mem(count); + if (!data) + return -ENOMEM; + + spin_lock(&adap->win0_lock); + ret = t4_memory_rw(adap, 0, mem, pos, count, data, T4_MEMORY_READ); + spin_unlock(&adap->win0_lock); + if (ret) { + t4_free_mem(data); + return ret; + } + ret = copy_to_user(buf, data, count); + + t4_free_mem(data); + if (ret) + return -EFAULT; + + *ppos = pos + count; + return count; +} + +static const struct file_operations mem_debugfs_fops = { + .owner = THIS_MODULE, + .open = simple_open, + .read = mem_read, + .llseek = default_llseek, +}; + +static void add_debugfs_mem(struct adapter *adap, const char *name, + unsigned int idx, unsigned int size_mb) +{ + struct dentry *de; + + de = debugfs_create_file(name, S_IRUSR, adap->debugfs_root, + (void *)adap + idx, &mem_debugfs_fops); + if (de && de->d_inode) + de->d_inode->i_size = size_mb << 20; +} + +static int setup_debugfs(struct adapter *adap) +{ + int i; + u32 size; + + if (IS_ERR_OR_NULL(adap->debugfs_root)) + return -1; + + i = t4_read_reg(adap, MA_TARGET_MEM_ENABLE); + if (i & EDRAM0_ENABLE) { + size = t4_read_reg(adap, MA_EDRAM0_BAR); + add_debugfs_mem(adap, "edc0", MEM_EDC0, EDRAM_SIZE_GET(size)); + } + if (i & EDRAM1_ENABLE) { + size = t4_read_reg(adap, MA_EDRAM1_BAR); + add_debugfs_mem(adap, "edc1", MEM_EDC1, EDRAM_SIZE_GET(size)); + } + if (is_t4(adap->params.chip)) { + size = t4_read_reg(adap, MA_EXT_MEMORY_BAR); + if (i & EXT_MEM_ENABLE) + add_debugfs_mem(adap, "mc", MEM_MC, + EXT_MEM_SIZE_GET(size)); + } else { + if (i & EXT_MEM_ENABLE) { + size = t4_read_reg(adap, MA_EXT_MEMORY_BAR); + add_debugfs_mem(adap, "mc0", MEM_MC0, + EXT_MEM_SIZE_GET(size)); + } + if (i & EXT_MEM1_ENABLE) { + size = t4_read_reg(adap, MA_EXT_MEMORY1_BAR); + add_debugfs_mem(adap, "mc1", MEM_MC1, + EXT_MEM_SIZE_GET(size)); + } + } + if (adap->l2t) + debugfs_create_file("l2t", S_IRUSR, adap->debugfs_root, adap, + &t4_l2t_fops); + return 0; +} + +/* + * upper-layer driver support + */ + +/* + * Allocate an active-open TID and set it to the supplied value. + */ +int cxgb4_alloc_atid(struct tid_info *t, void *data) +{ + int atid = -1; + + spin_lock_bh(&t->atid_lock); + if (t->afree) { + union aopen_entry *p = t->afree; + + atid = (p - t->atid_tab) + t->atid_base; + t->afree = p->next; + p->data = data; + t->atids_in_use++; + } + spin_unlock_bh(&t->atid_lock); + return atid; +} +EXPORT_SYMBOL(cxgb4_alloc_atid); + +/* + * Release an active-open TID. + */ +void cxgb4_free_atid(struct tid_info *t, unsigned int atid) +{ + union aopen_entry *p = &t->atid_tab[atid - t->atid_base]; + + spin_lock_bh(&t->atid_lock); + p->next = t->afree; + t->afree = p; + t->atids_in_use--; + spin_unlock_bh(&t->atid_lock); +} +EXPORT_SYMBOL(cxgb4_free_atid); + +/* + * Allocate a server TID and set it to the supplied value. + */ +int cxgb4_alloc_stid(struct tid_info *t, int family, void *data) +{ + int stid; + + spin_lock_bh(&t->stid_lock); + if (family == PF_INET) { + stid = find_first_zero_bit(t->stid_bmap, t->nstids); + if (stid < t->nstids) + __set_bit(stid, t->stid_bmap); + else + stid = -1; + } else { + stid = bitmap_find_free_region(t->stid_bmap, t->nstids, 2); + if (stid < 0) + stid = -1; + } + if (stid >= 0) { + t->stid_tab[stid].data = data; + stid += t->stid_base; + /* IPv6 requires max of 520 bits or 16 cells in TCAM + * This is equivalent to 4 TIDs. With CLIP enabled it + * needs 2 TIDs. + */ + if (family == PF_INET) + t->stids_in_use++; + else + t->stids_in_use += 4; + } + spin_unlock_bh(&t->stid_lock); + return stid; +} +EXPORT_SYMBOL(cxgb4_alloc_stid); + +/* Allocate a server filter TID and set it to the supplied value. + */ +int cxgb4_alloc_sftid(struct tid_info *t, int family, void *data) +{ + int stid; + + spin_lock_bh(&t->stid_lock); + if (family == PF_INET) { + stid = find_next_zero_bit(t->stid_bmap, + t->nstids + t->nsftids, t->nstids); + if (stid < (t->nstids + t->nsftids)) + __set_bit(stid, t->stid_bmap); + else + stid = -1; + } else { + stid = -1; + } + if (stid >= 0) { + t->stid_tab[stid].data = data; + stid -= t->nstids; + stid += t->sftid_base; + t->stids_in_use++; + } + spin_unlock_bh(&t->stid_lock); + return stid; +} +EXPORT_SYMBOL(cxgb4_alloc_sftid); + +/* Release a server TID. + */ +void cxgb4_free_stid(struct tid_info *t, unsigned int stid, int family) +{ + /* Is it a server filter TID? */ + if (t->nsftids && (stid >= t->sftid_base)) { + stid -= t->sftid_base; + stid += t->nstids; + } else { + stid -= t->stid_base; + } + + spin_lock_bh(&t->stid_lock); + if (family == PF_INET) + __clear_bit(stid, t->stid_bmap); + else + bitmap_release_region(t->stid_bmap, stid, 2); + t->stid_tab[stid].data = NULL; + if (family == PF_INET) + t->stids_in_use--; + else + t->stids_in_use -= 4; + spin_unlock_bh(&t->stid_lock); +} +EXPORT_SYMBOL(cxgb4_free_stid); + +/* + * Populate a TID_RELEASE WR. Caller must properly size the skb. + */ +static void mk_tid_release(struct sk_buff *skb, unsigned int chan, + unsigned int tid) +{ + struct cpl_tid_release *req; + + set_wr_txq(skb, CPL_PRIORITY_SETUP, chan); + req = (struct cpl_tid_release *)__skb_put(skb, sizeof(*req)); + INIT_TP_WR(req, tid); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_TID_RELEASE, tid)); +} + +/* + * Queue a TID release request and if necessary schedule a work queue to + * process it. + */ +static void cxgb4_queue_tid_release(struct tid_info *t, unsigned int chan, + unsigned int tid) +{ + void **p = &t->tid_tab[tid]; + struct adapter *adap = container_of(t, struct adapter, tids); + + spin_lock_bh(&adap->tid_release_lock); + *p = adap->tid_release_head; + /* Low 2 bits encode the Tx channel number */ + adap->tid_release_head = (void **)((uintptr_t)p | chan); + if (!adap->tid_release_task_busy) { + adap->tid_release_task_busy = true; + queue_work(adap->workq, &adap->tid_release_task); + } + spin_unlock_bh(&adap->tid_release_lock); +} + +/* + * Process the list of pending TID release requests. + */ +static void process_tid_release_list(struct work_struct *work) +{ + struct sk_buff *skb; + struct adapter *adap; + + adap = container_of(work, struct adapter, tid_release_task); + + spin_lock_bh(&adap->tid_release_lock); + while (adap->tid_release_head) { + void **p = adap->tid_release_head; + unsigned int chan = (uintptr_t)p & 3; + p = (void *)p - chan; + + adap->tid_release_head = *p; + *p = NULL; + spin_unlock_bh(&adap->tid_release_lock); + + while (!(skb = alloc_skb(sizeof(struct cpl_tid_release), + GFP_KERNEL))) + schedule_timeout_uninterruptible(1); + + mk_tid_release(skb, chan, p - adap->tids.tid_tab); + t4_ofld_send(adap, skb); + spin_lock_bh(&adap->tid_release_lock); + } + adap->tid_release_task_busy = false; + spin_unlock_bh(&adap->tid_release_lock); +} + +/* + * Release a TID and inform HW. If we are unable to allocate the release + * message we defer to a work queue. + */ +void cxgb4_remove_tid(struct tid_info *t, unsigned int chan, unsigned int tid) +{ + void *old; + struct sk_buff *skb; + struct adapter *adap = container_of(t, struct adapter, tids); + + old = t->tid_tab[tid]; + skb = alloc_skb(sizeof(struct cpl_tid_release), GFP_ATOMIC); + if (likely(skb)) { + t->tid_tab[tid] = NULL; + mk_tid_release(skb, chan, tid); + t4_ofld_send(adap, skb); + } else + cxgb4_queue_tid_release(t, chan, tid); + if (old) + atomic_dec(&t->tids_in_use); +} +EXPORT_SYMBOL(cxgb4_remove_tid); + +/* + * Allocate and initialize the TID tables. Returns 0 on success. + */ +static int tid_init(struct tid_info *t) +{ + size_t size; + unsigned int stid_bmap_size; + unsigned int natids = t->natids; + struct adapter *adap = container_of(t, struct adapter, tids); + + stid_bmap_size = BITS_TO_LONGS(t->nstids + t->nsftids); + size = t->ntids * sizeof(*t->tid_tab) + + natids * sizeof(*t->atid_tab) + + t->nstids * sizeof(*t->stid_tab) + + t->nsftids * sizeof(*t->stid_tab) + + stid_bmap_size * sizeof(long) + + t->nftids * sizeof(*t->ftid_tab) + + t->nsftids * sizeof(*t->ftid_tab); + + t->tid_tab = t4_alloc_mem(size); + if (!t->tid_tab) + return -ENOMEM; + + t->atid_tab = (union aopen_entry *)&t->tid_tab[t->ntids]; + t->stid_tab = (struct serv_entry *)&t->atid_tab[natids]; + t->stid_bmap = (unsigned long *)&t->stid_tab[t->nstids + t->nsftids]; + t->ftid_tab = (struct filter_entry *)&t->stid_bmap[stid_bmap_size]; + spin_lock_init(&t->stid_lock); + spin_lock_init(&t->atid_lock); + + t->stids_in_use = 0; + t->afree = NULL; + t->atids_in_use = 0; + atomic_set(&t->tids_in_use, 0); + + /* Setup the free list for atid_tab and clear the stid bitmap. */ + if (natids) { + while (--natids) + t->atid_tab[natids - 1].next = &t->atid_tab[natids]; + t->afree = t->atid_tab; + } + bitmap_zero(t->stid_bmap, t->nstids + t->nsftids); + /* Reserve stid 0 for T4/T5 adapters */ + if (!t->stid_base && + (is_t4(adap->params.chip) || is_t5(adap->params.chip))) + __set_bit(0, t->stid_bmap); + + return 0; +} + +int cxgb4_clip_get(const struct net_device *dev, + const struct in6_addr *lip) +{ + struct adapter *adap; + struct fw_clip_cmd c; + + adap = netdev2adap(dev); + memset(&c, 0, sizeof(c)); + c.op_to_write = htonl(FW_CMD_OP(FW_CLIP_CMD) | + FW_CMD_REQUEST | FW_CMD_WRITE); + c.alloc_to_len16 = htonl(F_FW_CLIP_CMD_ALLOC | FW_LEN16(c)); + c.ip_hi = *(__be64 *)(lip->s6_addr); + c.ip_lo = *(__be64 *)(lip->s6_addr + 8); + return t4_wr_mbox_meat(adap, adap->mbox, &c, sizeof(c), &c, false); +} +EXPORT_SYMBOL(cxgb4_clip_get); + +int cxgb4_clip_release(const struct net_device *dev, + const struct in6_addr *lip) +{ + struct adapter *adap; + struct fw_clip_cmd c; + + adap = netdev2adap(dev); + memset(&c, 0, sizeof(c)); + c.op_to_write = htonl(FW_CMD_OP(FW_CLIP_CMD) | + FW_CMD_REQUEST | FW_CMD_READ); + c.alloc_to_len16 = htonl(F_FW_CLIP_CMD_FREE | FW_LEN16(c)); + c.ip_hi = *(__be64 *)(lip->s6_addr); + c.ip_lo = *(__be64 *)(lip->s6_addr + 8); + return t4_wr_mbox_meat(adap, adap->mbox, &c, sizeof(c), &c, false); +} +EXPORT_SYMBOL(cxgb4_clip_release); + +/** + * cxgb4_create_server - create an IP server + * @dev: the device + * @stid: the server TID + * @sip: local IP address to bind server to + * @sport: the server's TCP port + * @queue: queue to direct messages from this server to + * + * Create an IP server for the given port and address. + * Returns <0 on error and one of the %NET_XMIT_* values on success. + */ +int cxgb4_create_server(const struct net_device *dev, unsigned int stid, + __be32 sip, __be16 sport, __be16 vlan, + unsigned int queue) +{ + unsigned int chan; + struct sk_buff *skb; + struct adapter *adap; + struct cpl_pass_open_req *req; + int ret; + + skb = alloc_skb(sizeof(*req), GFP_KERNEL); + if (!skb) + return -ENOMEM; + + adap = netdev2adap(dev); + req = (struct cpl_pass_open_req *)__skb_put(skb, sizeof(*req)); + INIT_TP_WR(req, 0); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_PASS_OPEN_REQ, stid)); + req->local_port = sport; + req->peer_port = htons(0); + req->local_ip = sip; + req->peer_ip = htonl(0); + chan = rxq_to_chan(&adap->sge, queue); + req->opt0 = cpu_to_be64(TX_CHAN(chan)); + req->opt1 = cpu_to_be64(CONN_POLICY_ASK | + SYN_RSS_ENABLE | SYN_RSS_QUEUE(queue)); + ret = t4_mgmt_tx(adap, skb); + return net_xmit_eval(ret); +} +EXPORT_SYMBOL(cxgb4_create_server); + +/* cxgb4_create_server6 - create an IPv6 server + * @dev: the device + * @stid: the server TID + * @sip: local IPv6 address to bind server to + * @sport: the server's TCP port + * @queue: queue to direct messages from this server to + * + * Create an IPv6 server for the given port and address. + * Returns <0 on error and one of the %NET_XMIT_* values on success. + */ +int cxgb4_create_server6(const struct net_device *dev, unsigned int stid, + const struct in6_addr *sip, __be16 sport, + unsigned int queue) +{ + unsigned int chan; + struct sk_buff *skb; + struct adapter *adap; + struct cpl_pass_open_req6 *req; + int ret; + + skb = alloc_skb(sizeof(*req), GFP_KERNEL); + if (!skb) + return -ENOMEM; + + adap = netdev2adap(dev); + req = (struct cpl_pass_open_req6 *)__skb_put(skb, sizeof(*req)); + INIT_TP_WR(req, 0); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_PASS_OPEN_REQ6, stid)); + req->local_port = sport; + req->peer_port = htons(0); + req->local_ip_hi = *(__be64 *)(sip->s6_addr); + req->local_ip_lo = *(__be64 *)(sip->s6_addr + 8); + req->peer_ip_hi = cpu_to_be64(0); + req->peer_ip_lo = cpu_to_be64(0); + chan = rxq_to_chan(&adap->sge, queue); + req->opt0 = cpu_to_be64(TX_CHAN(chan)); + req->opt1 = cpu_to_be64(CONN_POLICY_ASK | + SYN_RSS_ENABLE | SYN_RSS_QUEUE(queue)); + ret = t4_mgmt_tx(adap, skb); + return net_xmit_eval(ret); +} +EXPORT_SYMBOL(cxgb4_create_server6); + +int cxgb4_remove_server(const struct net_device *dev, unsigned int stid, + unsigned int queue, bool ipv6) +{ + struct sk_buff *skb; + struct adapter *adap; + struct cpl_close_listsvr_req *req; + int ret; + + adap = netdev2adap(dev); + + skb = alloc_skb(sizeof(*req), GFP_KERNEL); + if (!skb) + return -ENOMEM; + + req = (struct cpl_close_listsvr_req *)__skb_put(skb, sizeof(*req)); + INIT_TP_WR(req, 0); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_LISTSRV_REQ, stid)); + req->reply_ctrl = htons(NO_REPLY(0) | (ipv6 ? LISTSVR_IPV6(1) : + LISTSVR_IPV6(0)) | QUEUENO(queue)); + ret = t4_mgmt_tx(adap, skb); + return net_xmit_eval(ret); +} +EXPORT_SYMBOL(cxgb4_remove_server); + +/** + * cxgb4_best_mtu - find the entry in the MTU table closest to an MTU + * @mtus: the HW MTU table + * @mtu: the target MTU + * @idx: index of selected entry in the MTU table + * + * Returns the index and the value in the HW MTU table that is closest to + * but does not exceed @mtu, unless @mtu is smaller than any value in the + * table, in which case that smallest available value is selected. + */ +unsigned int cxgb4_best_mtu(const unsigned short *mtus, unsigned short mtu, + unsigned int *idx) +{ + unsigned int i = 0; + + while (i < NMTUS - 1 && mtus[i + 1] <= mtu) + ++i; + if (idx) + *idx = i; + return mtus[i]; +} +EXPORT_SYMBOL(cxgb4_best_mtu); + +/** + * cxgb4_best_aligned_mtu - find best MTU, [hopefully] data size aligned + * @mtus: the HW MTU table + * @header_size: Header Size + * @data_size_max: maximum Data Segment Size + * @data_size_align: desired Data Segment Size Alignment (2^N) + * @mtu_idxp: HW MTU Table Index return value pointer (possibly NULL) + * + * Similar to cxgb4_best_mtu() but instead of searching the Hardware + * MTU Table based solely on a Maximum MTU parameter, we break that + * parameter up into a Header Size and Maximum Data Segment Size, and + * provide a desired Data Segment Size Alignment. If we find an MTU in + * the Hardware MTU Table which will result in a Data Segment Size with + * the requested alignment _and_ that MTU isn't "too far" from the + * closest MTU, then we'll return that rather than the closest MTU. + */ +unsigned int cxgb4_best_aligned_mtu(const unsigned short *mtus, + unsigned short header_size, + unsigned short data_size_max, + unsigned short data_size_align, + unsigned int *mtu_idxp) +{ + unsigned short max_mtu = header_size + data_size_max; + unsigned short data_size_align_mask = data_size_align - 1; + int mtu_idx, aligned_mtu_idx; + + /* Scan the MTU Table till we find an MTU which is larger than our + * Maximum MTU or we reach the end of the table. Along the way, + * record the last MTU found, if any, which will result in a Data + * Segment Length matching the requested alignment. + */ + for (mtu_idx = 0, aligned_mtu_idx = -1; mtu_idx < NMTUS; mtu_idx++) { + unsigned short data_size = mtus[mtu_idx] - header_size; + + /* If this MTU minus the Header Size would result in a + * Data Segment Size of the desired alignment, remember it. + */ + if ((data_size & data_size_align_mask) == 0) + aligned_mtu_idx = mtu_idx; + + /* If we're not at the end of the Hardware MTU Table and the + * next element is larger than our Maximum MTU, drop out of + * the loop. + */ + if (mtu_idx+1 < NMTUS && mtus[mtu_idx+1] > max_mtu) + break; + } + + /* If we fell out of the loop because we ran to the end of the table, + * then we just have to use the last [largest] entry. + */ + if (mtu_idx == NMTUS) + mtu_idx--; + + /* If we found an MTU which resulted in the requested Data Segment + * Length alignment and that's "not far" from the largest MTU which is + * less than or equal to the maximum MTU, then use that. + */ + if (aligned_mtu_idx >= 0 && + mtu_idx - aligned_mtu_idx <= 1) + mtu_idx = aligned_mtu_idx; + + /* If the caller has passed in an MTU Index pointer, pass the + * MTU Index back. Return the MTU value. + */ + if (mtu_idxp) + *mtu_idxp = mtu_idx; + return mtus[mtu_idx]; +} +EXPORT_SYMBOL(cxgb4_best_aligned_mtu); + +/** + * cxgb4_port_chan - get the HW channel of a port + * @dev: the net device for the port + * + * Return the HW Tx channel of the given port. + */ +unsigned int cxgb4_port_chan(const struct net_device *dev) +{ + return netdev2pinfo(dev)->tx_chan; +} +EXPORT_SYMBOL(cxgb4_port_chan); + +unsigned int cxgb4_dbfifo_count(const struct net_device *dev, int lpfifo) +{ + struct adapter *adap = netdev2adap(dev); + u32 v1, v2, lp_count, hp_count; + + v1 = t4_read_reg(adap, A_SGE_DBFIFO_STATUS); + v2 = t4_read_reg(adap, SGE_DBFIFO_STATUS2); + if (is_t4(adap->params.chip)) { + lp_count = G_LP_COUNT(v1); + hp_count = G_HP_COUNT(v1); + } else { + lp_count = G_LP_COUNT_T5(v1); + hp_count = G_HP_COUNT_T5(v2); + } + return lpfifo ? lp_count : hp_count; +} +EXPORT_SYMBOL(cxgb4_dbfifo_count); + +/** + * cxgb4_port_viid - get the VI id of a port + * @dev: the net device for the port + * + * Return the VI id of the given port. + */ +unsigned int cxgb4_port_viid(const struct net_device *dev) +{ + return netdev2pinfo(dev)->viid; +} +EXPORT_SYMBOL(cxgb4_port_viid); + +/** + * cxgb4_port_idx - get the index of a port + * @dev: the net device for the port + * + * Return the index of the given port. + */ +unsigned int cxgb4_port_idx(const struct net_device *dev) +{ + return netdev2pinfo(dev)->port_id; +} +EXPORT_SYMBOL(cxgb4_port_idx); + +void cxgb4_get_tcp_stats(struct pci_dev *pdev, struct tp_tcp_stats *v4, + struct tp_tcp_stats *v6) +{ + struct adapter *adap = pci_get_drvdata(pdev); + + spin_lock(&adap->stats_lock); + t4_tp_get_tcp_stats(adap, v4, v6); + spin_unlock(&adap->stats_lock); +} +EXPORT_SYMBOL(cxgb4_get_tcp_stats); + +void cxgb4_iscsi_init(struct net_device *dev, unsigned int tag_mask, + const unsigned int *pgsz_order) +{ + struct adapter *adap = netdev2adap(dev); + + t4_write_reg(adap, ULP_RX_ISCSI_TAGMASK, tag_mask); + t4_write_reg(adap, ULP_RX_ISCSI_PSZ, HPZ0(pgsz_order[0]) | + HPZ1(pgsz_order[1]) | HPZ2(pgsz_order[2]) | + HPZ3(pgsz_order[3])); +} +EXPORT_SYMBOL(cxgb4_iscsi_init); + +int cxgb4_flush_eq_cache(struct net_device *dev) +{ + struct adapter *adap = netdev2adap(dev); + int ret; + + ret = t4_fwaddrspace_write(adap, adap->mbox, + 0xe1000000 + A_SGE_CTXT_CMD, 0x20000000); + return ret; +} +EXPORT_SYMBOL(cxgb4_flush_eq_cache); + +static int read_eq_indices(struct adapter *adap, u16 qid, u16 *pidx, u16 *cidx) +{ + u32 addr = t4_read_reg(adap, A_SGE_DBQ_CTXT_BADDR) + 24 * qid + 8; + __be64 indices; + int ret; + + spin_lock(&adap->win0_lock); + ret = t4_memory_rw(adap, 0, MEM_EDC0, addr, + sizeof(indices), (__be32 *)&indices, + T4_MEMORY_READ); + spin_unlock(&adap->win0_lock); + if (!ret) { + *cidx = (be64_to_cpu(indices) >> 25) & 0xffff; + *pidx = (be64_to_cpu(indices) >> 9) & 0xffff; + } + return ret; +} + +int cxgb4_sync_txq_pidx(struct net_device *dev, u16 qid, u16 pidx, + u16 size) +{ + struct adapter *adap = netdev2adap(dev); + u16 hw_pidx, hw_cidx; + int ret; + + ret = read_eq_indices(adap, qid, &hw_pidx, &hw_cidx); + if (ret) + goto out; + + if (pidx != hw_pidx) { + u16 delta; + + if (pidx >= hw_pidx) + delta = pidx - hw_pidx; + else + delta = size - hw_pidx + pidx; + wmb(); + t4_write_reg(adap, MYPF_REG(SGE_PF_KDOORBELL), + QID(qid) | PIDX(delta)); + } +out: + return ret; +} +EXPORT_SYMBOL(cxgb4_sync_txq_pidx); + +void cxgb4_disable_db_coalescing(struct net_device *dev) +{ + struct adapter *adap; + + adap = netdev2adap(dev); + t4_set_reg_field(adap, A_SGE_DOORBELL_CONTROL, F_NOCOALESCE, + F_NOCOALESCE); +} +EXPORT_SYMBOL(cxgb4_disable_db_coalescing); + +void cxgb4_enable_db_coalescing(struct net_device *dev) +{ + struct adapter *adap; + + adap = netdev2adap(dev); + t4_set_reg_field(adap, A_SGE_DOORBELL_CONTROL, F_NOCOALESCE, 0); +} +EXPORT_SYMBOL(cxgb4_enable_db_coalescing); + +int cxgb4_read_tpte(struct net_device *dev, u32 stag, __be32 *tpte) +{ + struct adapter *adap; + u32 offset, memtype, memaddr; + u32 edc0_size, edc1_size, mc0_size, mc1_size; + u32 edc0_end, edc1_end, mc0_end, mc1_end; + int ret; + + adap = netdev2adap(dev); + + offset = ((stag >> 8) * 32) + adap->vres.stag.start; + + /* Figure out where the offset lands in the Memory Type/Address scheme. + * This code assumes that the memory is laid out starting at offset 0 + * with no breaks as: EDC0, EDC1, MC0, MC1. All cards have both EDC0 + * and EDC1. Some cards will have neither MC0 nor MC1, most cards have + * MC0, and some have both MC0 and MC1. + */ + edc0_size = EDRAM_SIZE_GET(t4_read_reg(adap, MA_EDRAM0_BAR)) << 20; + edc1_size = EDRAM_SIZE_GET(t4_read_reg(adap, MA_EDRAM1_BAR)) << 20; + mc0_size = EXT_MEM_SIZE_GET(t4_read_reg(adap, MA_EXT_MEMORY_BAR)) << 20; + + edc0_end = edc0_size; + edc1_end = edc0_end + edc1_size; + mc0_end = edc1_end + mc0_size; + + if (offset < edc0_end) { + memtype = MEM_EDC0; + memaddr = offset; + } else if (offset < edc1_end) { + memtype = MEM_EDC1; + memaddr = offset - edc0_end; + } else { + if (offset < mc0_end) { + memtype = MEM_MC0; + memaddr = offset - edc1_end; + } else if (is_t4(adap->params.chip)) { + /* T4 only has a single memory channel */ + goto err; + } else { + mc1_size = EXT_MEM_SIZE_GET( + t4_read_reg(adap, + MA_EXT_MEMORY1_BAR)) << 20; + mc1_end = mc0_end + mc1_size; + if (offset < mc1_end) { + memtype = MEM_MC1; + memaddr = offset - mc0_end; + } else { + /* offset beyond the end of any memory */ + goto err; + } + } + } + + spin_lock(&adap->win0_lock); + ret = t4_memory_rw(adap, 0, memtype, memaddr, 32, tpte, T4_MEMORY_READ); + spin_unlock(&adap->win0_lock); + return ret; + +err: + dev_err(adap->pdev_dev, "stag %#x, offset %#x out of range\n", + stag, offset); + return -EINVAL; +} +EXPORT_SYMBOL(cxgb4_read_tpte); + +u64 cxgb4_read_sge_timestamp(struct net_device *dev) +{ + u32 hi, lo; + struct adapter *adap; + + adap = netdev2adap(dev); + lo = t4_read_reg(adap, SGE_TIMESTAMP_LO); + hi = GET_TSVAL(t4_read_reg(adap, SGE_TIMESTAMP_HI)); + + return ((u64)hi << 32) | (u64)lo; +} +EXPORT_SYMBOL(cxgb4_read_sge_timestamp); + +static struct pci_driver cxgb4_driver; + +static void check_neigh_update(struct neighbour *neigh) +{ + const struct device *parent; + const struct net_device *netdev = neigh->dev; + + if (netdev->priv_flags & IFF_802_1Q_VLAN) + netdev = vlan_dev_real_dev(netdev); + parent = netdev->dev.parent; + if (parent && parent->driver == &cxgb4_driver.driver) + t4_l2t_update(dev_get_drvdata(parent), neigh); +} + +static int netevent_cb(struct notifier_block *nb, unsigned long event, + void *data) +{ + switch (event) { + case NETEVENT_NEIGH_UPDATE: + check_neigh_update(data); + break; + case NETEVENT_REDIRECT: + default: + break; + } + return 0; +} + +static bool netevent_registered; +static struct notifier_block cxgb4_netevent_nb = { + .notifier_call = netevent_cb +}; + +static void drain_db_fifo(struct adapter *adap, int usecs) +{ + u32 v1, v2, lp_count, hp_count; + + do { + v1 = t4_read_reg(adap, A_SGE_DBFIFO_STATUS); + v2 = t4_read_reg(adap, SGE_DBFIFO_STATUS2); + if (is_t4(adap->params.chip)) { + lp_count = G_LP_COUNT(v1); + hp_count = G_HP_COUNT(v1); + } else { + lp_count = G_LP_COUNT_T5(v1); + hp_count = G_HP_COUNT_T5(v2); + } + + if (lp_count == 0 && hp_count == 0) + break; + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(usecs_to_jiffies(usecs)); + } while (1); +} + +static void disable_txq_db(struct sge_txq *q) +{ + unsigned long flags; + + spin_lock_irqsave(&q->db_lock, flags); + q->db_disabled = 1; + spin_unlock_irqrestore(&q->db_lock, flags); +} + +static void enable_txq_db(struct adapter *adap, struct sge_txq *q) +{ + spin_lock_irq(&q->db_lock); + if (q->db_pidx_inc) { + /* Make sure that all writes to the TX descriptors + * are committed before we tell HW about them. + */ + wmb(); + t4_write_reg(adap, MYPF_REG(SGE_PF_KDOORBELL), + QID(q->cntxt_id) | PIDX(q->db_pidx_inc)); + q->db_pidx_inc = 0; + } + q->db_disabled = 0; + spin_unlock_irq(&q->db_lock); +} + +static void disable_dbs(struct adapter *adap) +{ + int i; + + for_each_ethrxq(&adap->sge, i) + disable_txq_db(&adap->sge.ethtxq[i].q); + for_each_ofldrxq(&adap->sge, i) + disable_txq_db(&adap->sge.ofldtxq[i].q); + for_each_port(adap, i) + disable_txq_db(&adap->sge.ctrlq[i].q); +} + +static void enable_dbs(struct adapter *adap) +{ + int i; + + for_each_ethrxq(&adap->sge, i) + enable_txq_db(adap, &adap->sge.ethtxq[i].q); + for_each_ofldrxq(&adap->sge, i) + enable_txq_db(adap, &adap->sge.ofldtxq[i].q); + for_each_port(adap, i) + enable_txq_db(adap, &adap->sge.ctrlq[i].q); +} + +static void notify_rdma_uld(struct adapter *adap, enum cxgb4_control cmd) +{ + if (adap->uld_handle[CXGB4_ULD_RDMA]) + ulds[CXGB4_ULD_RDMA].control(adap->uld_handle[CXGB4_ULD_RDMA], + cmd); +} + +static void process_db_full(struct work_struct *work) +{ + struct adapter *adap; + + adap = container_of(work, struct adapter, db_full_task); + + drain_db_fifo(adap, dbfifo_drain_delay); + enable_dbs(adap); + notify_rdma_uld(adap, CXGB4_CONTROL_DB_EMPTY); + t4_set_reg_field(adap, SGE_INT_ENABLE3, + DBFIFO_HP_INT | DBFIFO_LP_INT, + DBFIFO_HP_INT | DBFIFO_LP_INT); +} + +static void sync_txq_pidx(struct adapter *adap, struct sge_txq *q) +{ + u16 hw_pidx, hw_cidx; + int ret; + + spin_lock_irq(&q->db_lock); + ret = read_eq_indices(adap, (u16)q->cntxt_id, &hw_pidx, &hw_cidx); + if (ret) + goto out; + if (q->db_pidx != hw_pidx) { + u16 delta; + + if (q->db_pidx >= hw_pidx) + delta = q->db_pidx - hw_pidx; + else + delta = q->size - hw_pidx + q->db_pidx; + wmb(); + t4_write_reg(adap, MYPF_REG(SGE_PF_KDOORBELL), + QID(q->cntxt_id) | PIDX(delta)); + } +out: + q->db_disabled = 0; + q->db_pidx_inc = 0; + spin_unlock_irq(&q->db_lock); + if (ret) + CH_WARN(adap, "DB drop recovery failed.\n"); +} +static void recover_all_queues(struct adapter *adap) +{ + int i; + + for_each_ethrxq(&adap->sge, i) + sync_txq_pidx(adap, &adap->sge.ethtxq[i].q); + for_each_ofldrxq(&adap->sge, i) + sync_txq_pidx(adap, &adap->sge.ofldtxq[i].q); + for_each_port(adap, i) + sync_txq_pidx(adap, &adap->sge.ctrlq[i].q); +} + +static void process_db_drop(struct work_struct *work) +{ + struct adapter *adap; + + adap = container_of(work, struct adapter, db_drop_task); + + if (is_t4(adap->params.chip)) { + drain_db_fifo(adap, dbfifo_drain_delay); + notify_rdma_uld(adap, CXGB4_CONTROL_DB_DROP); + drain_db_fifo(adap, dbfifo_drain_delay); + recover_all_queues(adap); + drain_db_fifo(adap, dbfifo_drain_delay); + enable_dbs(adap); + notify_rdma_uld(adap, CXGB4_CONTROL_DB_EMPTY); + } else { + u32 dropped_db = t4_read_reg(adap, 0x010ac); + u16 qid = (dropped_db >> 15) & 0x1ffff; + u16 pidx_inc = dropped_db & 0x1fff; + unsigned int s_qpp; + unsigned short udb_density; + unsigned long qpshift; + int page; + u32 udb; + + dev_warn(adap->pdev_dev, + "Dropped DB 0x%x qid %d bar2 %d coalesce %d pidx %d\n", + dropped_db, qid, + (dropped_db >> 14) & 1, + (dropped_db >> 13) & 1, + pidx_inc); + + drain_db_fifo(adap, 1); + + s_qpp = QUEUESPERPAGEPF1 * adap->fn; + udb_density = 1 << QUEUESPERPAGEPF0_GET(t4_read_reg(adap, + SGE_EGRESS_QUEUES_PER_PAGE_PF) >> s_qpp); + qpshift = PAGE_SHIFT - ilog2(udb_density); + udb = qid << qpshift; + udb &= PAGE_MASK; + page = udb / PAGE_SIZE; + udb += (qid - (page * udb_density)) * 128; + + writel(PIDX(pidx_inc), adap->bar2 + udb + 8); + + /* Re-enable BAR2 WC */ + t4_set_reg_field(adap, 0x10b0, 1<<15, 1<<15); + } + + t4_set_reg_field(adap, A_SGE_DOORBELL_CONTROL, F_DROPPED_DB, 0); +} + +void t4_db_full(struct adapter *adap) +{ + if (is_t4(adap->params.chip)) { + disable_dbs(adap); + notify_rdma_uld(adap, CXGB4_CONTROL_DB_FULL); + t4_set_reg_field(adap, SGE_INT_ENABLE3, + DBFIFO_HP_INT | DBFIFO_LP_INT, 0); + queue_work(adap->workq, &adap->db_full_task); + } +} + +void t4_db_dropped(struct adapter *adap) +{ + if (is_t4(adap->params.chip)) { + disable_dbs(adap); + notify_rdma_uld(adap, CXGB4_CONTROL_DB_FULL); + } + queue_work(adap->workq, &adap->db_drop_task); +} + +static void uld_attach(struct adapter *adap, unsigned int uld) +{ + void *handle; + struct cxgb4_lld_info lli; + unsigned short i; + + lli.pdev = adap->pdev; + lli.pf = adap->fn; + lli.l2t = adap->l2t; + lli.tids = &adap->tids; + lli.ports = adap->port; + lli.vr = &adap->vres; + lli.mtus = adap->params.mtus; + if (uld == CXGB4_ULD_RDMA) { + lli.rxq_ids = adap->sge.rdma_rxq; + lli.ciq_ids = adap->sge.rdma_ciq; + lli.nrxq = adap->sge.rdmaqs; + lli.nciq = adap->sge.rdmaciqs; + } else if (uld == CXGB4_ULD_ISCSI) { + lli.rxq_ids = adap->sge.ofld_rxq; + lli.nrxq = adap->sge.ofldqsets; + } + lli.ntxq = adap->sge.ofldqsets; + lli.nchan = adap->params.nports; + lli.nports = adap->params.nports; + lli.wr_cred = adap->params.ofldq_wr_cred; + lli.adapter_type = adap->params.chip; + lli.iscsi_iolen = MAXRXDATA_GET(t4_read_reg(adap, TP_PARA_REG2)); + lli.cclk_ps = 1000000000 / adap->params.vpd.cclk; + lli.udb_density = 1 << QUEUESPERPAGEPF0_GET( + t4_read_reg(adap, SGE_EGRESS_QUEUES_PER_PAGE_PF) >> + (adap->fn * 4)); + lli.ucq_density = 1 << QUEUESPERPAGEPF0_GET( + t4_read_reg(adap, SGE_INGRESS_QUEUES_PER_PAGE_PF) >> + (adap->fn * 4)); + lli.filt_mode = adap->params.tp.vlan_pri_map; + /* MODQ_REQ_MAP sets queues 0-3 to chan 0-3 */ + for (i = 0; i < NCHAN; i++) + lli.tx_modq[i] = i; + lli.gts_reg = adap->regs + MYPF_REG(SGE_PF_GTS); + lli.db_reg = adap->regs + MYPF_REG(SGE_PF_KDOORBELL); + lli.fw_vers = adap->params.fw_vers; + lli.dbfifo_int_thresh = dbfifo_int_thresh; + lli.sge_ingpadboundary = adap->sge.fl_align; + lli.sge_egrstatuspagesize = adap->sge.stat_len; + lli.sge_pktshift = adap->sge.pktshift; + lli.enable_fw_ofld_conn = adap->flags & FW_OFLD_CONN; + lli.max_ordird_qp = adap->params.max_ordird_qp; + lli.max_ird_adapter = adap->params.max_ird_adapter; + lli.ulptx_memwrite_dsgl = adap->params.ulptx_memwrite_dsgl; + + handle = ulds[uld].add(&lli); + if (IS_ERR(handle)) { + dev_warn(adap->pdev_dev, + "could not attach to the %s driver, error %ld\n", + uld_str[uld], PTR_ERR(handle)); + return; + } + + adap->uld_handle[uld] = handle; + + if (!netevent_registered) { + register_netevent_notifier(&cxgb4_netevent_nb); + netevent_registered = true; + } + + if (adap->flags & FULL_INIT_DONE) + ulds[uld].state_change(handle, CXGB4_STATE_UP); +} + +static void attach_ulds(struct adapter *adap) +{ + unsigned int i; + + spin_lock(&adap_rcu_lock); + list_add_tail_rcu(&adap->rcu_node, &adap_rcu_list); + spin_unlock(&adap_rcu_lock); + + mutex_lock(&uld_mutex); + list_add_tail(&adap->list_node, &adapter_list); + for (i = 0; i < CXGB4_ULD_MAX; i++) + if (ulds[i].add) + uld_attach(adap, i); + mutex_unlock(&uld_mutex); +} + +static void detach_ulds(struct adapter *adap) +{ + unsigned int i; + + mutex_lock(&uld_mutex); + list_del(&adap->list_node); + for (i = 0; i < CXGB4_ULD_MAX; i++) + if (adap->uld_handle[i]) { + ulds[i].state_change(adap->uld_handle[i], + CXGB4_STATE_DETACH); + adap->uld_handle[i] = NULL; + } + if (netevent_registered && list_empty(&adapter_list)) { + unregister_netevent_notifier(&cxgb4_netevent_nb); + netevent_registered = false; + } + mutex_unlock(&uld_mutex); + + spin_lock(&adap_rcu_lock); + list_del_rcu(&adap->rcu_node); + spin_unlock(&adap_rcu_lock); +} + +static void notify_ulds(struct adapter *adap, enum cxgb4_state new_state) +{ + unsigned int i; + + mutex_lock(&uld_mutex); + for (i = 0; i < CXGB4_ULD_MAX; i++) + if (adap->uld_handle[i]) + ulds[i].state_change(adap->uld_handle[i], new_state); + mutex_unlock(&uld_mutex); +} + +/** + * cxgb4_register_uld - register an upper-layer driver + * @type: the ULD type + * @p: the ULD methods + * + * Registers an upper-layer driver with this driver and notifies the ULD + * about any presently available devices that support its type. Returns + * %-EBUSY if a ULD of the same type is already registered. + */ +int cxgb4_register_uld(enum cxgb4_uld type, const struct cxgb4_uld_info *p) +{ + int ret = 0; + struct adapter *adap; + + if (type >= CXGB4_ULD_MAX) + return -EINVAL; + mutex_lock(&uld_mutex); + if (ulds[type].add) { + ret = -EBUSY; + goto out; + } + ulds[type] = *p; + list_for_each_entry(adap, &adapter_list, list_node) + uld_attach(adap, type); +out: mutex_unlock(&uld_mutex); + return ret; +} +EXPORT_SYMBOL(cxgb4_register_uld); + +/** + * cxgb4_unregister_uld - unregister an upper-layer driver + * @type: the ULD type + * + * Unregisters an existing upper-layer driver. + */ +int cxgb4_unregister_uld(enum cxgb4_uld type) +{ + struct adapter *adap; + + if (type >= CXGB4_ULD_MAX) + return -EINVAL; + mutex_lock(&uld_mutex); + list_for_each_entry(adap, &adapter_list, list_node) + adap->uld_handle[type] = NULL; + ulds[type].add = NULL; + mutex_unlock(&uld_mutex); + return 0; +} +EXPORT_SYMBOL(cxgb4_unregister_uld); + +/* Check if netdev on which event is occured belongs to us or not. Return + * success (true) if it belongs otherwise failure (false). + * Called with rcu_read_lock() held. + */ +#if IS_ENABLED(CONFIG_IPV6) +static bool cxgb4_netdev(const struct net_device *netdev) +{ + struct adapter *adap; + int i; + + list_for_each_entry_rcu(adap, &adap_rcu_list, rcu_node) + for (i = 0; i < MAX_NPORTS; i++) + if (adap->port[i] == netdev) + return true; + return false; +} + +static int clip_add(struct net_device *event_dev, struct inet6_ifaddr *ifa, + unsigned long event) +{ + int ret = NOTIFY_DONE; + + rcu_read_lock(); + if (cxgb4_netdev(event_dev)) { + switch (event) { + case NETDEV_UP: + ret = cxgb4_clip_get(event_dev, + (const struct in6_addr *)ifa->addr.s6_addr); + if (ret < 0) { + rcu_read_unlock(); + return ret; + } + ret = NOTIFY_OK; + break; + case NETDEV_DOWN: + cxgb4_clip_release(event_dev, + (const struct in6_addr *)ifa->addr.s6_addr); + ret = NOTIFY_OK; + break; + default: + break; + } + } + rcu_read_unlock(); + return ret; +} + +static int cxgb4_inet6addr_handler(struct notifier_block *this, + unsigned long event, void *data) +{ + struct inet6_ifaddr *ifa = data; + struct net_device *event_dev; + int ret = NOTIFY_DONE; + struct bonding *bond = netdev_priv(ifa->idev->dev); + struct list_head *iter; + struct slave *slave; + struct pci_dev *first_pdev = NULL; + + if (ifa->idev->dev->priv_flags & IFF_802_1Q_VLAN) { + event_dev = vlan_dev_real_dev(ifa->idev->dev); + ret = clip_add(event_dev, ifa, event); + } else if (ifa->idev->dev->flags & IFF_MASTER) { + /* It is possible that two different adapters are bonded in one + * bond. We need to find such different adapters and add clip + * in all of them only once. + */ + bond_for_each_slave(bond, slave, iter) { + if (!first_pdev) { + ret = clip_add(slave->dev, ifa, event); + /* If clip_add is success then only initialize + * first_pdev since it means it is our device + */ + if (ret == NOTIFY_OK) + first_pdev = to_pci_dev( + slave->dev->dev.parent); + } else if (first_pdev != + to_pci_dev(slave->dev->dev.parent)) + ret = clip_add(slave->dev, ifa, event); + } + } else + ret = clip_add(ifa->idev->dev, ifa, event); + + return ret; +} + +static struct notifier_block cxgb4_inet6addr_notifier = { + .notifier_call = cxgb4_inet6addr_handler +}; + +/* Retrieves IPv6 addresses from a root device (bond, vlan) associated with + * a physical device. + * The physical device reference is needed to send the actul CLIP command. + */ +static int update_dev_clip(struct net_device *root_dev, struct net_device *dev) +{ + struct inet6_dev *idev = NULL; + struct inet6_ifaddr *ifa; + int ret = 0; + + idev = __in6_dev_get(root_dev); + if (!idev) + return ret; + + read_lock_bh(&idev->lock); + list_for_each_entry(ifa, &idev->addr_list, if_list) { + ret = cxgb4_clip_get(dev, + (const struct in6_addr *)ifa->addr.s6_addr); + if (ret < 0) + break; + } + read_unlock_bh(&idev->lock); + + return ret; +} + +static int update_root_dev_clip(struct net_device *dev) +{ + struct net_device *root_dev = NULL; + int i, ret = 0; + + /* First populate the real net device's IPv6 addresses */ + ret = update_dev_clip(dev, dev); + if (ret) + return ret; + + /* Parse all bond and vlan devices layered on top of the physical dev */ + root_dev = netdev_master_upper_dev_get_rcu(dev); + if (root_dev) { + ret = update_dev_clip(root_dev, dev); + if (ret) + return ret; + } + + for (i = 0; i < VLAN_N_VID; i++) { + root_dev = __vlan_find_dev_deep_rcu(dev, htons(ETH_P_8021Q), i); + if (!root_dev) + continue; + + ret = update_dev_clip(root_dev, dev); + if (ret) + break; + } + return ret; +} + +static void update_clip(const struct adapter *adap) +{ + int i; + struct net_device *dev; + int ret; + + rcu_read_lock(); + + for (i = 0; i < MAX_NPORTS; i++) { + dev = adap->port[i]; + ret = 0; + + if (dev) + ret = update_root_dev_clip(dev); + + if (ret < 0) + break; + } + rcu_read_unlock(); +} +#endif /* IS_ENABLED(CONFIG_IPV6) */ + +/** + * cxgb_up - enable the adapter + * @adap: adapter being enabled + * + * Called when the first port is enabled, this function performs the + * actions necessary to make an adapter operational, such as completing + * the initialization of HW modules, and enabling interrupts. + * + * Must be called with the rtnl lock held. + */ +static int cxgb_up(struct adapter *adap) +{ + int err; + + err = setup_sge_queues(adap); + if (err) + goto out; + err = setup_rss(adap); + if (err) + goto freeq; + + if (adap->flags & USING_MSIX) { + name_msix_vecs(adap); + err = request_irq(adap->msix_info[0].vec, t4_nondata_intr, 0, + adap->msix_info[0].desc, adap); + if (err) + goto irq_err; + + err = request_msix_queue_irqs(adap); + if (err) { + free_irq(adap->msix_info[0].vec, adap); + goto irq_err; + } + } else { + err = request_irq(adap->pdev->irq, t4_intr_handler(adap), + (adap->flags & USING_MSI) ? 0 : IRQF_SHARED, + adap->port[0]->name, adap); + if (err) + goto irq_err; + } + enable_rx(adap); + t4_sge_start(adap); + t4_intr_enable(adap); + adap->flags |= FULL_INIT_DONE; + notify_ulds(adap, CXGB4_STATE_UP); +#if IS_ENABLED(CONFIG_IPV6) + update_clip(adap); +#endif + out: + return err; + irq_err: + dev_err(adap->pdev_dev, "request_irq failed, err %d\n", err); + freeq: + t4_free_sge_resources(adap); + goto out; +} + +static void cxgb_down(struct adapter *adapter) +{ + t4_intr_disable(adapter); + cancel_work_sync(&adapter->tid_release_task); + cancel_work_sync(&adapter->db_full_task); + cancel_work_sync(&adapter->db_drop_task); + adapter->tid_release_task_busy = false; + adapter->tid_release_head = NULL; + + if (adapter->flags & USING_MSIX) { + free_msix_queue_irqs(adapter); + free_irq(adapter->msix_info[0].vec, adapter); + } else + free_irq(adapter->pdev->irq, adapter); + quiesce_rx(adapter); + t4_sge_stop(adapter); + t4_free_sge_resources(adapter); + adapter->flags &= ~FULL_INIT_DONE; +} + +/* + * net_device operations + */ +static int cxgb_open(struct net_device *dev) +{ + int err; + struct port_info *pi = netdev_priv(dev); + struct adapter *adapter = pi->adapter; + + netif_carrier_off(dev); + + if (!(adapter->flags & FULL_INIT_DONE)) { + err = cxgb_up(adapter); + if (err < 0) + return err; + } + + err = link_start(dev); + if (!err) + netif_tx_start_all_queues(dev); + return err; +} + +static int cxgb_close(struct net_device *dev) +{ + struct port_info *pi = netdev_priv(dev); + struct adapter *adapter = pi->adapter; + + netif_tx_stop_all_queues(dev); + netif_carrier_off(dev); + return t4_enable_vi(adapter, adapter->fn, pi->viid, false, false); +} + +/* Return an error number if the indicated filter isn't writable ... + */ +static int writable_filter(struct filter_entry *f) +{ + if (f->locked) + return -EPERM; + if (f->pending) + return -EBUSY; + + return 0; +} + +/* Delete the filter at the specified index (if valid). The checks for all + * the common problems with doing this like the filter being locked, currently + * pending in another operation, etc. + */ +static int delete_filter(struct adapter *adapter, unsigned int fidx) +{ + struct filter_entry *f; + int ret; + + if (fidx >= adapter->tids.nftids + adapter->tids.nsftids) + return -EINVAL; + + f = &adapter->tids.ftid_tab[fidx]; + ret = writable_filter(f); + if (ret) + return ret; + if (f->valid) + return del_filter_wr(adapter, fidx); + + return 0; +} + +int cxgb4_create_server_filter(const struct net_device *dev, unsigned int stid, + __be32 sip, __be16 sport, __be16 vlan, + unsigned int queue, unsigned char port, unsigned char mask) +{ + int ret; + struct filter_entry *f; + struct adapter *adap; + int i; + u8 *val; + + adap = netdev2adap(dev); + + /* Adjust stid to correct filter index */ + stid -= adap->tids.sftid_base; + stid += adap->tids.nftids; + + /* Check to make sure the filter requested is writable ... + */ + f = &adap->tids.ftid_tab[stid]; + ret = writable_filter(f); + if (ret) + return ret; + + /* Clear out any old resources being used by the filter before + * we start constructing the new filter. + */ + if (f->valid) + clear_filter(adap, f); + + /* Clear out filter specifications */ + memset(&f->fs, 0, sizeof(struct ch_filter_specification)); + f->fs.val.lport = cpu_to_be16(sport); + f->fs.mask.lport = ~0; + val = (u8 *)&sip; + if ((val[0] | val[1] | val[2] | val[3]) != 0) { + for (i = 0; i < 4; i++) { + f->fs.val.lip[i] = val[i]; + f->fs.mask.lip[i] = ~0; + } + if (adap->params.tp.vlan_pri_map & F_PORT) { + f->fs.val.iport = port; + f->fs.mask.iport = mask; + } + } + + if (adap->params.tp.vlan_pri_map & F_PROTOCOL) { + f->fs.val.proto = IPPROTO_TCP; + f->fs.mask.proto = ~0; + } + + f->fs.dirsteer = 1; + f->fs.iq = queue; + /* Mark filter as locked */ + f->locked = 1; + f->fs.rpttid = 1; + + ret = set_filter_wr(adap, stid); + if (ret) { + clear_filter(adap, f); + return ret; + } + + return 0; +} +EXPORT_SYMBOL(cxgb4_create_server_filter); + +int cxgb4_remove_server_filter(const struct net_device *dev, unsigned int stid, + unsigned int queue, bool ipv6) +{ + int ret; + struct filter_entry *f; + struct adapter *adap; + + adap = netdev2adap(dev); + + /* Adjust stid to correct filter index */ + stid -= adap->tids.sftid_base; + stid += adap->tids.nftids; + + f = &adap->tids.ftid_tab[stid]; + /* Unlock the filter */ + f->locked = 0; + + ret = delete_filter(adap, stid); + if (ret) + return ret; + + return 0; +} +EXPORT_SYMBOL(cxgb4_remove_server_filter); + +static struct rtnl_link_stats64 *cxgb_get_stats(struct net_device *dev, + struct rtnl_link_stats64 *ns) +{ + struct port_stats stats; + struct port_info *p = netdev_priv(dev); + struct adapter *adapter = p->adapter; + + /* Block retrieving statistics during EEH error + * recovery. Otherwise, the recovery might fail + * and the PCI device will be removed permanently + */ + spin_lock(&adapter->stats_lock); + if (!netif_device_present(dev)) { + spin_unlock(&adapter->stats_lock); + return ns; + } + t4_get_port_stats(adapter, p->tx_chan, &stats); + spin_unlock(&adapter->stats_lock); + + ns->tx_bytes = stats.tx_octets; + ns->tx_packets = stats.tx_frames; + ns->rx_bytes = stats.rx_octets; + ns->rx_packets = stats.rx_frames; + ns->multicast = stats.rx_mcast_frames; + + /* detailed rx_errors */ + ns->rx_length_errors = stats.rx_jabber + stats.rx_too_long + + stats.rx_runt; + ns->rx_over_errors = 0; + ns->rx_crc_errors = stats.rx_fcs_err; + ns->rx_frame_errors = stats.rx_symbol_err; + ns->rx_fifo_errors = stats.rx_ovflow0 + stats.rx_ovflow1 + + stats.rx_ovflow2 + stats.rx_ovflow3 + + stats.rx_trunc0 + stats.rx_trunc1 + + stats.rx_trunc2 + stats.rx_trunc3; + ns->rx_missed_errors = 0; + + /* detailed tx_errors */ + ns->tx_aborted_errors = 0; + ns->tx_carrier_errors = 0; + ns->tx_fifo_errors = 0; + ns->tx_heartbeat_errors = 0; + ns->tx_window_errors = 0; + + ns->tx_errors = stats.tx_error_frames; + ns->rx_errors = stats.rx_symbol_err + stats.rx_fcs_err + + ns->rx_length_errors + stats.rx_len_err + ns->rx_fifo_errors; + return ns; +} + +static int cxgb_ioctl(struct net_device *dev, struct ifreq *req, int cmd) +{ + unsigned int mbox; + int ret = 0, prtad, devad; + struct port_info *pi = netdev_priv(dev); + struct mii_ioctl_data *data = (struct mii_ioctl_data *)&req->ifr_data; + + switch (cmd) { + case SIOCGMIIPHY: + if (pi->mdio_addr < 0) + return -EOPNOTSUPP; + data->phy_id = pi->mdio_addr; + break; + case SIOCGMIIREG: + case SIOCSMIIREG: + if (mdio_phy_id_is_c45(data->phy_id)) { + prtad = mdio_phy_id_prtad(data->phy_id); + devad = mdio_phy_id_devad(data->phy_id); + } else if (data->phy_id < 32) { + prtad = data->phy_id; + devad = 0; + data->reg_num &= 0x1f; + } else + return -EINVAL; + + mbox = pi->adapter->fn; + if (cmd == SIOCGMIIREG) + ret = t4_mdio_rd(pi->adapter, mbox, prtad, devad, + data->reg_num, &data->val_out); + else + ret = t4_mdio_wr(pi->adapter, mbox, prtad, devad, + data->reg_num, data->val_in); + break; + default: + return -EOPNOTSUPP; + } + return ret; +} + +static void cxgb_set_rxmode(struct net_device *dev) +{ + /* unfortunately we can't return errors to the stack */ + set_rxmode(dev, -1, false); +} + +static int cxgb_change_mtu(struct net_device *dev, int new_mtu) +{ + int ret; + struct port_info *pi = netdev_priv(dev); + + if (new_mtu < 81 || new_mtu > MAX_MTU) /* accommodate SACK */ + return -EINVAL; + ret = t4_set_rxmode(pi->adapter, pi->adapter->fn, pi->viid, new_mtu, -1, + -1, -1, -1, true); + if (!ret) + dev->mtu = new_mtu; + return ret; +} + +static int cxgb_set_mac_addr(struct net_device *dev, void *p) +{ + int ret; + struct sockaddr *addr = p; + struct port_info *pi = netdev_priv(dev); + + if (!is_valid_ether_addr(addr->sa_data)) + return -EADDRNOTAVAIL; + + ret = t4_change_mac(pi->adapter, pi->adapter->fn, pi->viid, + pi->xact_addr_filt, addr->sa_data, true, true); + if (ret < 0) + return ret; + + memcpy(dev->dev_addr, addr->sa_data, dev->addr_len); + pi->xact_addr_filt = ret; + return 0; +} + +#ifdef CONFIG_NET_POLL_CONTROLLER +static void cxgb_netpoll(struct net_device *dev) +{ + struct port_info *pi = netdev_priv(dev); + struct adapter *adap = pi->adapter; + + if (adap->flags & USING_MSIX) { + int i; + struct sge_eth_rxq *rx = &adap->sge.ethrxq[pi->first_qset]; + + for (i = pi->nqsets; i; i--, rx++) + t4_sge_intr_msix(0, &rx->rspq); + } else + t4_intr_handler(adap)(0, adap); +} +#endif + +static const struct net_device_ops cxgb4_netdev_ops = { + .ndo_open = cxgb_open, + .ndo_stop = cxgb_close, + .ndo_start_xmit = t4_eth_xmit, + .ndo_select_queue = cxgb_select_queue, + .ndo_get_stats64 = cxgb_get_stats, + .ndo_set_rx_mode = cxgb_set_rxmode, + .ndo_set_mac_address = cxgb_set_mac_addr, + .ndo_set_features = cxgb_set_features, + .ndo_validate_addr = eth_validate_addr, + .ndo_do_ioctl = cxgb_ioctl, + .ndo_change_mtu = cxgb_change_mtu, +#ifdef CONFIG_NET_POLL_CONTROLLER + .ndo_poll_controller = cxgb_netpoll, +#endif +}; + +void t4_fatal_err(struct adapter *adap) +{ + t4_set_reg_field(adap, SGE_CONTROL, GLOBALENABLE, 0); + t4_intr_disable(adap); + dev_alert(adap->pdev_dev, "encountered fatal error, adapter stopped\n"); +} + +/* Return the specified PCI-E Configuration Space register from our Physical + * Function. We try first via a Firmware LDST Command since we prefer to let + * the firmware own all of these registers, but if that fails we go for it + * directly ourselves. + */ +static u32 t4_read_pcie_cfg4(struct adapter *adap, int reg) +{ + struct fw_ldst_cmd ldst_cmd; + u32 val; + int ret; + + /* Construct and send the Firmware LDST Command to retrieve the + * specified PCI-E Configuration Space register. + */ + memset(&ldst_cmd, 0, sizeof(ldst_cmd)); + ldst_cmd.op_to_addrspace = + htonl(FW_CMD_OP(FW_LDST_CMD) | + FW_CMD_REQUEST | + FW_CMD_READ | + FW_LDST_CMD_ADDRSPACE(FW_LDST_ADDRSPC_FUNC_PCIE)); + ldst_cmd.cycles_to_len16 = htonl(FW_LEN16(ldst_cmd)); + ldst_cmd.u.pcie.select_naccess = FW_LDST_CMD_NACCESS(1); + ldst_cmd.u.pcie.ctrl_to_fn = + (FW_LDST_CMD_LC | FW_LDST_CMD_FN(adap->fn)); + ldst_cmd.u.pcie.r = reg; + ret = t4_wr_mbox(adap, adap->mbox, &ldst_cmd, sizeof(ldst_cmd), + &ldst_cmd); + + /* If the LDST Command suucceeded, exctract the returned register + * value. Otherwise read it directly ourself. + */ + if (ret == 0) + val = ntohl(ldst_cmd.u.pcie.data[0]); + else + t4_hw_pci_read_cfg4(adap, reg, &val); + + return val; +} + +static void setup_memwin(struct adapter *adap) +{ + u32 mem_win0_base, mem_win1_base, mem_win2_base, mem_win2_aperture; + + if (is_t4(adap->params.chip)) { + u32 bar0; + + /* Truncation intentional: we only read the bottom 32-bits of + * the 64-bit BAR0/BAR1 ... We use the hardware backdoor + * mechanism to read BAR0 instead of using + * pci_resource_start() because we could be operating from + * within a Virtual Machine which is trapping our accesses to + * our Configuration Space and we need to set up the PCI-E + * Memory Window decoders with the actual addresses which will + * be coming across the PCI-E link. + */ + bar0 = t4_read_pcie_cfg4(adap, PCI_BASE_ADDRESS_0); + bar0 &= PCI_BASE_ADDRESS_MEM_MASK; + adap->t4_bar0 = bar0; + + mem_win0_base = bar0 + MEMWIN0_BASE; + mem_win1_base = bar0 + MEMWIN1_BASE; + mem_win2_base = bar0 + MEMWIN2_BASE; + mem_win2_aperture = MEMWIN2_APERTURE; + } else { + /* For T5, only relative offset inside the PCIe BAR is passed */ + mem_win0_base = MEMWIN0_BASE; + mem_win1_base = MEMWIN1_BASE; + mem_win2_base = MEMWIN2_BASE_T5; + mem_win2_aperture = MEMWIN2_APERTURE_T5; + } + t4_write_reg(adap, PCIE_MEM_ACCESS_REG(PCIE_MEM_ACCESS_BASE_WIN, 0), + mem_win0_base | BIR(0) | + WINDOW(ilog2(MEMWIN0_APERTURE) - 10)); + t4_write_reg(adap, PCIE_MEM_ACCESS_REG(PCIE_MEM_ACCESS_BASE_WIN, 1), + mem_win1_base | BIR(0) | + WINDOW(ilog2(MEMWIN1_APERTURE) - 10)); + t4_write_reg(adap, PCIE_MEM_ACCESS_REG(PCIE_MEM_ACCESS_BASE_WIN, 2), + mem_win2_base | BIR(0) | + WINDOW(ilog2(mem_win2_aperture) - 10)); + t4_read_reg(adap, PCIE_MEM_ACCESS_REG(PCIE_MEM_ACCESS_BASE_WIN, 2)); +} + +static void setup_memwin_rdma(struct adapter *adap) +{ + if (adap->vres.ocq.size) { + u32 start; + unsigned int sz_kb; + + start = t4_read_pcie_cfg4(adap, PCI_BASE_ADDRESS_2); + start &= PCI_BASE_ADDRESS_MEM_MASK; + start += OCQ_WIN_OFFSET(adap->pdev, &adap->vres); + sz_kb = roundup_pow_of_two(adap->vres.ocq.size) >> 10; + t4_write_reg(adap, + PCIE_MEM_ACCESS_REG(PCIE_MEM_ACCESS_BASE_WIN, 3), + start | BIR(1) | WINDOW(ilog2(sz_kb))); + t4_write_reg(adap, + PCIE_MEM_ACCESS_REG(PCIE_MEM_ACCESS_OFFSET, 3), + adap->vres.ocq.start); + t4_read_reg(adap, + PCIE_MEM_ACCESS_REG(PCIE_MEM_ACCESS_OFFSET, 3)); + } +} + +static int adap_init1(struct adapter *adap, struct fw_caps_config_cmd *c) +{ + u32 v; + int ret; + + /* get device capabilities */ + memset(c, 0, sizeof(*c)); + c->op_to_write = htonl(FW_CMD_OP(FW_CAPS_CONFIG_CMD) | + FW_CMD_REQUEST | FW_CMD_READ); + c->cfvalid_to_len16 = htonl(FW_LEN16(*c)); + ret = t4_wr_mbox(adap, adap->fn, c, sizeof(*c), c); + if (ret < 0) + return ret; + + /* select capabilities we'll be using */ + if (c->niccaps & htons(FW_CAPS_CONFIG_NIC_VM)) { + if (!vf_acls) + c->niccaps ^= htons(FW_CAPS_CONFIG_NIC_VM); + else + c->niccaps = htons(FW_CAPS_CONFIG_NIC_VM); + } else if (vf_acls) { + dev_err(adap->pdev_dev, "virtualization ACLs not supported"); + return ret; + } + c->op_to_write = htonl(FW_CMD_OP(FW_CAPS_CONFIG_CMD) | + FW_CMD_REQUEST | FW_CMD_WRITE); + ret = t4_wr_mbox(adap, adap->fn, c, sizeof(*c), NULL); + if (ret < 0) + return ret; + + ret = t4_config_glbl_rss(adap, adap->fn, + FW_RSS_GLB_CONFIG_CMD_MODE_BASICVIRTUAL, + FW_RSS_GLB_CONFIG_CMD_TNLMAPEN | + FW_RSS_GLB_CONFIG_CMD_TNLALLLKP); + if (ret < 0) + return ret; + + ret = t4_cfg_pfvf(adap, adap->fn, adap->fn, 0, MAX_EGRQ, 64, MAX_INGQ, + 0, 0, 4, 0xf, 0xf, 16, FW_CMD_CAP_PF, FW_CMD_CAP_PF); + if (ret < 0) + return ret; + + t4_sge_init(adap); + + /* tweak some settings */ + t4_write_reg(adap, TP_SHIFT_CNT, 0x64f8849); + t4_write_reg(adap, ULP_RX_TDDP_PSZ, HPZ0(PAGE_SHIFT - 12)); + t4_write_reg(adap, TP_PIO_ADDR, TP_INGRESS_CONFIG); + v = t4_read_reg(adap, TP_PIO_DATA); + t4_write_reg(adap, TP_PIO_DATA, v & ~CSUM_HAS_PSEUDO_HDR); + + /* first 4 Tx modulation queues point to consecutive Tx channels */ + adap->params.tp.tx_modq_map = 0xE4; + t4_write_reg(adap, A_TP_TX_MOD_QUEUE_REQ_MAP, + V_TX_MOD_QUEUE_REQ_MAP(adap->params.tp.tx_modq_map)); + + /* associate each Tx modulation queue with consecutive Tx channels */ + v = 0x84218421; + t4_write_indirect(adap, TP_PIO_ADDR, TP_PIO_DATA, + &v, 1, A_TP_TX_SCHED_HDR); + t4_write_indirect(adap, TP_PIO_ADDR, TP_PIO_DATA, + &v, 1, A_TP_TX_SCHED_FIFO); + t4_write_indirect(adap, TP_PIO_ADDR, TP_PIO_DATA, + &v, 1, A_TP_TX_SCHED_PCMD); + +#define T4_TX_MODQ_10G_WEIGHT_DEFAULT 16 /* in KB units */ + if (is_offload(adap)) { + t4_write_reg(adap, A_TP_TX_MOD_QUEUE_WEIGHT0, + V_TX_MODQ_WEIGHT0(T4_TX_MODQ_10G_WEIGHT_DEFAULT) | + V_TX_MODQ_WEIGHT1(T4_TX_MODQ_10G_WEIGHT_DEFAULT) | + V_TX_MODQ_WEIGHT2(T4_TX_MODQ_10G_WEIGHT_DEFAULT) | + V_TX_MODQ_WEIGHT3(T4_TX_MODQ_10G_WEIGHT_DEFAULT)); + t4_write_reg(adap, A_TP_TX_MOD_CHANNEL_WEIGHT, + V_TX_MODQ_WEIGHT0(T4_TX_MODQ_10G_WEIGHT_DEFAULT) | + V_TX_MODQ_WEIGHT1(T4_TX_MODQ_10G_WEIGHT_DEFAULT) | + V_TX_MODQ_WEIGHT2(T4_TX_MODQ_10G_WEIGHT_DEFAULT) | + V_TX_MODQ_WEIGHT3(T4_TX_MODQ_10G_WEIGHT_DEFAULT)); + } + + /* get basic stuff going */ + return t4_early_init(adap, adap->fn); +} + +/* + * Max # of ATIDs. The absolute HW max is 16K but we keep it lower. + */ +#define MAX_ATIDS 8192U + +/* + * Phase 0 of initialization: contact FW, obtain config, perform basic init. + * + * If the firmware we're dealing with has Configuration File support, then + * we use that to perform all configuration + */ + +/* + * Tweak configuration based on module parameters, etc. Most of these have + * defaults assigned to them by Firmware Configuration Files (if we're using + * them) but need to be explicitly set if we're using hard-coded + * initialization. But even in the case of using Firmware Configuration + * Files, we'd like to expose the ability to change these via module + * parameters so these are essentially common tweaks/settings for + * Configuration Files and hard-coded initialization ... + */ +static int adap_init0_tweaks(struct adapter *adapter) +{ + /* + * Fix up various Host-Dependent Parameters like Page Size, Cache + * Line Size, etc. The firmware default is for a 4KB Page Size and + * 64B Cache Line Size ... + */ + t4_fixup_host_params(adapter, PAGE_SIZE, L1_CACHE_BYTES); + + /* + * Process module parameters which affect early initialization. + */ + if (rx_dma_offset != 2 && rx_dma_offset != 0) { + dev_err(&adapter->pdev->dev, + "Ignoring illegal rx_dma_offset=%d, using 2\n", + rx_dma_offset); + rx_dma_offset = 2; + } + t4_set_reg_field(adapter, SGE_CONTROL, + PKTSHIFT_MASK, + PKTSHIFT(rx_dma_offset)); + + /* + * Don't include the "IP Pseudo Header" in CPL_RX_PKT checksums: Linux + * adds the pseudo header itself. + */ + t4_tp_wr_bits_indirect(adapter, TP_INGRESS_CONFIG, + CSUM_HAS_PSEUDO_HDR, 0); + + return 0; +} + +/* + * Attempt to initialize the adapter via a Firmware Configuration File. + */ +static int adap_init0_config(struct adapter *adapter, int reset) +{ + struct fw_caps_config_cmd caps_cmd; + const struct firmware *cf; + unsigned long mtype = 0, maddr = 0; + u32 finiver, finicsum, cfcsum; + int ret; + int config_issued = 0; + char *fw_config_file, fw_config_file_path[256]; + char *config_name = NULL; + + /* + * Reset device if necessary. + */ + if (reset) { + ret = t4_fw_reset(adapter, adapter->mbox, + PIORSTMODE | PIORST); + if (ret < 0) + goto bye; + } + + /* + * If we have a T4 configuration file under /lib/firmware/cxgb4/, + * then use that. Otherwise, use the configuration file stored + * in the adapter flash ... + */ + switch (CHELSIO_CHIP_VERSION(adapter->params.chip)) { + case CHELSIO_T4: + fw_config_file = FW4_CFNAME; + break; + case CHELSIO_T5: + fw_config_file = FW5_CFNAME; + break; + default: + dev_err(adapter->pdev_dev, "Device %d is not supported\n", + adapter->pdev->device); + ret = -EINVAL; + goto bye; + } + + ret = request_firmware(&cf, fw_config_file, adapter->pdev_dev); + if (ret < 0) { + config_name = "On FLASH"; + mtype = FW_MEMTYPE_CF_FLASH; + maddr = t4_flash_cfg_addr(adapter); + } else { + u32 params[7], val[7]; + + sprintf(fw_config_file_path, + "/lib/firmware/%s", fw_config_file); + config_name = fw_config_file_path; + + if (cf->size >= FLASH_CFG_MAX_SIZE) + ret = -ENOMEM; + else { + params[0] = (FW_PARAMS_MNEM(FW_PARAMS_MNEM_DEV) | + FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DEV_CF)); + ret = t4_query_params(adapter, adapter->mbox, + adapter->fn, 0, 1, params, val); + if (ret == 0) { + /* + * For t4_memory_rw() below addresses and + * sizes have to be in terms of multiples of 4 + * bytes. So, if the Configuration File isn't + * a multiple of 4 bytes in length we'll have + * to write that out separately since we can't + * guarantee that the bytes following the + * residual byte in the buffer returned by + * request_firmware() are zeroed out ... + */ + size_t resid = cf->size & 0x3; + size_t size = cf->size & ~0x3; + __be32 *data = (__be32 *)cf->data; + + mtype = FW_PARAMS_PARAM_Y_GET(val[0]); + maddr = FW_PARAMS_PARAM_Z_GET(val[0]) << 16; + + spin_lock(&adapter->win0_lock); + ret = t4_memory_rw(adapter, 0, mtype, maddr, + size, data, T4_MEMORY_WRITE); + if (ret == 0 && resid != 0) { + union { + __be32 word; + char buf[4]; + } last; + int i; + + last.word = data[size >> 2]; + for (i = resid; i < 4; i++) + last.buf[i] = 0; + ret = t4_memory_rw(adapter, 0, mtype, + maddr + size, + 4, &last.word, + T4_MEMORY_WRITE); + } + spin_unlock(&adapter->win0_lock); + } + } + + release_firmware(cf); + if (ret) + goto bye; + } + + /* + * Issue a Capability Configuration command to the firmware to get it + * to parse the Configuration File. We don't use t4_fw_config_file() + * because we want the ability to modify various features after we've + * processed the configuration file ... + */ + memset(&caps_cmd, 0, sizeof(caps_cmd)); + caps_cmd.op_to_write = + htonl(FW_CMD_OP(FW_CAPS_CONFIG_CMD) | + FW_CMD_REQUEST | + FW_CMD_READ); + caps_cmd.cfvalid_to_len16 = + htonl(FW_CAPS_CONFIG_CMD_CFVALID | + FW_CAPS_CONFIG_CMD_MEMTYPE_CF(mtype) | + FW_CAPS_CONFIG_CMD_MEMADDR64K_CF(maddr >> 16) | + FW_LEN16(caps_cmd)); + ret = t4_wr_mbox(adapter, adapter->mbox, &caps_cmd, sizeof(caps_cmd), + &caps_cmd); + + /* If the CAPS_CONFIG failed with an ENOENT (for a Firmware + * Configuration File in FLASH), our last gasp effort is to use the + * Firmware Configuration File which is embedded in the firmware. A + * very few early versions of the firmware didn't have one embedded + * but we can ignore those. + */ + if (ret == -ENOENT) { + memset(&caps_cmd, 0, sizeof(caps_cmd)); + caps_cmd.op_to_write = + htonl(FW_CMD_OP(FW_CAPS_CONFIG_CMD) | + FW_CMD_REQUEST | + FW_CMD_READ); + caps_cmd.cfvalid_to_len16 = htonl(FW_LEN16(caps_cmd)); + ret = t4_wr_mbox(adapter, adapter->mbox, &caps_cmd, + sizeof(caps_cmd), &caps_cmd); + config_name = "Firmware Default"; + } + + config_issued = 1; + if (ret < 0) + goto bye; + + finiver = ntohl(caps_cmd.finiver); + finicsum = ntohl(caps_cmd.finicsum); + cfcsum = ntohl(caps_cmd.cfcsum); + if (finicsum != cfcsum) + dev_warn(adapter->pdev_dev, "Configuration File checksum "\ + "mismatch: [fini] csum=%#x, computed csum=%#x\n", + finicsum, cfcsum); + + /* + * And now tell the firmware to use the configuration we just loaded. + */ + caps_cmd.op_to_write = + htonl(FW_CMD_OP(FW_CAPS_CONFIG_CMD) | + FW_CMD_REQUEST | + FW_CMD_WRITE); + caps_cmd.cfvalid_to_len16 = htonl(FW_LEN16(caps_cmd)); + ret = t4_wr_mbox(adapter, adapter->mbox, &caps_cmd, sizeof(caps_cmd), + NULL); + if (ret < 0) + goto bye; + + /* + * Tweak configuration based on system architecture, module + * parameters, etc. + */ + ret = adap_init0_tweaks(adapter); + if (ret < 0) + goto bye; + + /* + * And finally tell the firmware to initialize itself using the + * parameters from the Configuration File. + */ + ret = t4_fw_initialize(adapter, adapter->mbox); + if (ret < 0) + goto bye; + + /* + * Return successfully and note that we're operating with parameters + * not supplied by the driver, rather than from hard-wired + * initialization constants burried in the driver. + */ + adapter->flags |= USING_SOFT_PARAMS; + dev_info(adapter->pdev_dev, "Successfully configured using Firmware "\ + "Configuration File \"%s\", version %#x, computed checksum %#x\n", + config_name, finiver, cfcsum); + return 0; + + /* + * Something bad happened. Return the error ... (If the "error" + * is that there's no Configuration File on the adapter we don't + * want to issue a warning since this is fairly common.) + */ +bye: + if (config_issued && ret != -ENOENT) + dev_warn(adapter->pdev_dev, "\"%s\" configuration file error %d\n", + config_name, -ret); + return ret; +} + +/* + * Attempt to initialize the adapter via hard-coded, driver supplied + * parameters ... + */ +static int adap_init0_no_config(struct adapter *adapter, int reset) +{ + struct sge *s = &adapter->sge; + struct fw_caps_config_cmd caps_cmd; + u32 v; + int i, ret; + + /* + * Reset device if necessary + */ + if (reset) { + ret = t4_fw_reset(adapter, adapter->mbox, + PIORSTMODE | PIORST); + if (ret < 0) + goto bye; + } + + /* + * Get device capabilities and select which we'll be using. + */ + memset(&caps_cmd, 0, sizeof(caps_cmd)); + caps_cmd.op_to_write = htonl(FW_CMD_OP(FW_CAPS_CONFIG_CMD) | + FW_CMD_REQUEST | FW_CMD_READ); + caps_cmd.cfvalid_to_len16 = htonl(FW_LEN16(caps_cmd)); + ret = t4_wr_mbox(adapter, adapter->mbox, &caps_cmd, sizeof(caps_cmd), + &caps_cmd); + if (ret < 0) + goto bye; + + if (caps_cmd.niccaps & htons(FW_CAPS_CONFIG_NIC_VM)) { + if (!vf_acls) + caps_cmd.niccaps ^= htons(FW_CAPS_CONFIG_NIC_VM); + else + caps_cmd.niccaps = htons(FW_CAPS_CONFIG_NIC_VM); + } else if (vf_acls) { + dev_err(adapter->pdev_dev, "virtualization ACLs not supported"); + goto bye; + } + caps_cmd.op_to_write = htonl(FW_CMD_OP(FW_CAPS_CONFIG_CMD) | + FW_CMD_REQUEST | FW_CMD_WRITE); + ret = t4_wr_mbox(adapter, adapter->mbox, &caps_cmd, sizeof(caps_cmd), + NULL); + if (ret < 0) + goto bye; + + /* + * Tweak configuration based on system architecture, module + * parameters, etc. + */ + ret = adap_init0_tweaks(adapter); + if (ret < 0) + goto bye; + + /* + * Select RSS Global Mode we want to use. We use "Basic Virtual" + * mode which maps each Virtual Interface to its own section of + * the RSS Table and we turn on all map and hash enables ... + */ + adapter->flags |= RSS_TNLALLLOOKUP; + ret = t4_config_glbl_rss(adapter, adapter->mbox, + FW_RSS_GLB_CONFIG_CMD_MODE_BASICVIRTUAL, + FW_RSS_GLB_CONFIG_CMD_TNLMAPEN | + FW_RSS_GLB_CONFIG_CMD_HASHTOEPLITZ | + ((adapter->flags & RSS_TNLALLLOOKUP) ? + FW_RSS_GLB_CONFIG_CMD_TNLALLLKP : 0)); + if (ret < 0) + goto bye; + + /* + * Set up our own fundamental resource provisioning ... + */ + ret = t4_cfg_pfvf(adapter, adapter->mbox, adapter->fn, 0, + PFRES_NEQ, PFRES_NETHCTRL, + PFRES_NIQFLINT, PFRES_NIQ, + PFRES_TC, PFRES_NVI, + FW_PFVF_CMD_CMASK_MASK, + pfvfres_pmask(adapter, adapter->fn, 0), + PFRES_NEXACTF, + PFRES_R_CAPS, PFRES_WX_CAPS); + if (ret < 0) + goto bye; + + /* + * Perform low level SGE initialization. We need to do this before we + * send the firmware the INITIALIZE command because that will cause + * any other PF Drivers which are waiting for the Master + * Initialization to proceed forward. + */ + for (i = 0; i < SGE_NTIMERS - 1; i++) + s->timer_val[i] = min(intr_holdoff[i], MAX_SGE_TIMERVAL); + s->timer_val[SGE_NTIMERS - 1] = MAX_SGE_TIMERVAL; + s->counter_val[0] = 1; + for (i = 1; i < SGE_NCOUNTERS; i++) + s->counter_val[i] = min(intr_cnt[i - 1], + THRESHOLD_0_GET(THRESHOLD_0_MASK)); + t4_sge_init(adapter); + +#ifdef CONFIG_PCI_IOV + /* + * Provision resource limits for Virtual Functions. We currently + * grant them all the same static resource limits except for the Port + * Access Rights Mask which we're assigning based on the PF. All of + * the static provisioning stuff for both the PF and VF really needs + * to be managed in a persistent manner for each device which the + * firmware controls. + */ + { + int pf, vf; + + for (pf = 0; pf < ARRAY_SIZE(num_vf); pf++) { + if (num_vf[pf] <= 0) + continue; + + /* VF numbering starts at 1! */ + for (vf = 1; vf <= num_vf[pf]; vf++) { + ret = t4_cfg_pfvf(adapter, adapter->mbox, + pf, vf, + VFRES_NEQ, VFRES_NETHCTRL, + VFRES_NIQFLINT, VFRES_NIQ, + VFRES_TC, VFRES_NVI, + FW_PFVF_CMD_CMASK_MASK, + pfvfres_pmask( + adapter, pf, vf), + VFRES_NEXACTF, + VFRES_R_CAPS, VFRES_WX_CAPS); + if (ret < 0) + dev_warn(adapter->pdev_dev, + "failed to "\ + "provision pf/vf=%d/%d; " + "err=%d\n", pf, vf, ret); + } + } + } +#endif + + /* + * Set up the default filter mode. Later we'll want to implement this + * via a firmware command, etc. ... This needs to be done before the + * firmare initialization command ... If the selected set of fields + * isn't equal to the default value, we'll need to make sure that the + * field selections will fit in the 36-bit budget. + */ + if (tp_vlan_pri_map != TP_VLAN_PRI_MAP_DEFAULT) { + int j, bits = 0; + + for (j = TP_VLAN_PRI_MAP_FIRST; j <= TP_VLAN_PRI_MAP_LAST; j++) + switch (tp_vlan_pri_map & (1 << j)) { + case 0: + /* compressed filter field not enabled */ + break; + case FCOE_MASK: + bits += 1; + break; + case PORT_MASK: + bits += 3; + break; + case VNIC_ID_MASK: + bits += 17; + break; + case VLAN_MASK: + bits += 17; + break; + case TOS_MASK: + bits += 8; + break; + case PROTOCOL_MASK: + bits += 8; + break; + case ETHERTYPE_MASK: + bits += 16; + break; + case MACMATCH_MASK: + bits += 9; + break; + case MPSHITTYPE_MASK: + bits += 3; + break; + case FRAGMENTATION_MASK: + bits += 1; + break; + } + + if (bits > 36) { + dev_err(adapter->pdev_dev, + "tp_vlan_pri_map=%#x needs %d bits > 36;"\ + " using %#x\n", tp_vlan_pri_map, bits, + TP_VLAN_PRI_MAP_DEFAULT); + tp_vlan_pri_map = TP_VLAN_PRI_MAP_DEFAULT; + } + } + v = tp_vlan_pri_map; + t4_write_indirect(adapter, TP_PIO_ADDR, TP_PIO_DATA, + &v, 1, TP_VLAN_PRI_MAP); + + /* + * We need Five Tuple Lookup mode to be set in TP_GLOBAL_CONFIG order + * to support any of the compressed filter fields above. Newer + * versions of the firmware do this automatically but it doesn't hurt + * to set it here. Meanwhile, we do _not_ need to set Lookup Every + * Packet in TP_INGRESS_CONFIG to support matching non-TCP packets + * since the firmware automatically turns this on and off when we have + * a non-zero number of filters active (since it does have a + * performance impact). + */ + if (tp_vlan_pri_map) + t4_set_reg_field(adapter, TP_GLOBAL_CONFIG, + FIVETUPLELOOKUP_MASK, + FIVETUPLELOOKUP_MASK); + + /* + * Tweak some settings. + */ + t4_write_reg(adapter, TP_SHIFT_CNT, SYNSHIFTMAX(6) | + RXTSHIFTMAXR1(4) | RXTSHIFTMAXR2(15) | + PERSHIFTBACKOFFMAX(8) | PERSHIFTMAX(8) | + KEEPALIVEMAXR1(4) | KEEPALIVEMAXR2(9)); + + /* + * Get basic stuff going by issuing the Firmware Initialize command. + * Note that this _must_ be after all PFVF commands ... + */ + ret = t4_fw_initialize(adapter, adapter->mbox); + if (ret < 0) + goto bye; + + /* + * Return successfully! + */ + dev_info(adapter->pdev_dev, "Successfully configured using built-in "\ + "driver parameters\n"); + return 0; + + /* + * Something bad happened. Return the error ... + */ +bye: + return ret; +} + +static struct fw_info fw_info_array[] = { + { + .chip = CHELSIO_T4, + .fs_name = FW4_CFNAME, + .fw_mod_name = FW4_FNAME, + .fw_hdr = { + .chip = FW_HDR_CHIP_T4, + .fw_ver = __cpu_to_be32(FW_VERSION(T4)), + .intfver_nic = FW_INTFVER(T4, NIC), + .intfver_vnic = FW_INTFVER(T4, VNIC), + .intfver_ri = FW_INTFVER(T4, RI), + .intfver_iscsi = FW_INTFVER(T4, ISCSI), + .intfver_fcoe = FW_INTFVER(T4, FCOE), + }, + }, { + .chip = CHELSIO_T5, + .fs_name = FW5_CFNAME, + .fw_mod_name = FW5_FNAME, + .fw_hdr = { + .chip = FW_HDR_CHIP_T5, + .fw_ver = __cpu_to_be32(FW_VERSION(T5)), + .intfver_nic = FW_INTFVER(T5, NIC), + .intfver_vnic = FW_INTFVER(T5, VNIC), + .intfver_ri = FW_INTFVER(T5, RI), + .intfver_iscsi = FW_INTFVER(T5, ISCSI), + .intfver_fcoe = FW_INTFVER(T5, FCOE), + }, + } +}; + +static struct fw_info *find_fw_info(int chip) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(fw_info_array); i++) { + if (fw_info_array[i].chip == chip) + return &fw_info_array[i]; + } + return NULL; +} + +/* + * Phase 0 of initialization: contact FW, obtain config, perform basic init. + */ +static int adap_init0(struct adapter *adap) +{ + int ret; + u32 v, port_vec; + enum dev_state state; + u32 params[7], val[7]; + struct fw_caps_config_cmd caps_cmd; + int reset = 1; + + /* + * Contact FW, advertising Master capability (and potentially forcing + * ourselves as the Master PF if our module parameter force_init is + * set). + */ + ret = t4_fw_hello(adap, adap->mbox, adap->fn, + force_init ? MASTER_MUST : MASTER_MAY, + &state); + if (ret < 0) { + dev_err(adap->pdev_dev, "could not connect to FW, error %d\n", + ret); + return ret; + } + if (ret == adap->mbox) + adap->flags |= MASTER_PF; + if (force_init && state == DEV_STATE_INIT) + state = DEV_STATE_UNINIT; + + /* + * If we're the Master PF Driver and the device is uninitialized, + * then let's consider upgrading the firmware ... (We always want + * to check the firmware version number in order to A. get it for + * later reporting and B. to warn if the currently loaded firmware + * is excessively mismatched relative to the driver.) + */ + t4_get_fw_version(adap, &adap->params.fw_vers); + t4_get_tp_version(adap, &adap->params.tp_vers); + if ((adap->flags & MASTER_PF) && state != DEV_STATE_INIT) { + struct fw_info *fw_info; + struct fw_hdr *card_fw; + const struct firmware *fw; + const u8 *fw_data = NULL; + unsigned int fw_size = 0; + + /* This is the firmware whose headers the driver was compiled + * against + */ + fw_info = find_fw_info(CHELSIO_CHIP_VERSION(adap->params.chip)); + if (fw_info == NULL) { + dev_err(adap->pdev_dev, + "unable to get firmware info for chip %d.\n", + CHELSIO_CHIP_VERSION(adap->params.chip)); + return -EINVAL; + } + + /* allocate memory to read the header of the firmware on the + * card + */ + card_fw = t4_alloc_mem(sizeof(*card_fw)); + + /* Get FW from from /lib/firmware/ */ + ret = request_firmware(&fw, fw_info->fw_mod_name, + adap->pdev_dev); + if (ret < 0) { + dev_err(adap->pdev_dev, + "unable to load firmware image %s, error %d\n", + fw_info->fw_mod_name, ret); + } else { + fw_data = fw->data; + fw_size = fw->size; + } + + /* upgrade FW logic */ + ret = t4_prep_fw(adap, fw_info, fw_data, fw_size, card_fw, + state, &reset); + + /* Cleaning up */ + if (fw != NULL) + release_firmware(fw); + t4_free_mem(card_fw); + + if (ret < 0) + goto bye; + } + + /* + * Grab VPD parameters. This should be done after we establish a + * connection to the firmware since some of the VPD parameters + * (notably the Core Clock frequency) are retrieved via requests to + * the firmware. On the other hand, we need these fairly early on + * so we do this right after getting ahold of the firmware. + */ + ret = get_vpd_params(adap, &adap->params.vpd); + if (ret < 0) + goto bye; + + /* + * Find out what ports are available to us. Note that we need to do + * this before calling adap_init0_no_config() since it needs nports + * and portvec ... + */ + v = + FW_PARAMS_MNEM(FW_PARAMS_MNEM_DEV) | + FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DEV_PORTVEC); + ret = t4_query_params(adap, adap->mbox, adap->fn, 0, 1, &v, &port_vec); + if (ret < 0) + goto bye; + + adap->params.nports = hweight32(port_vec); + adap->params.portvec = port_vec; + + /* + * If the firmware is initialized already (and we're not forcing a + * master initialization), note that we're living with existing + * adapter parameters. Otherwise, it's time to try initializing the + * adapter ... + */ + if (state == DEV_STATE_INIT) { + dev_info(adap->pdev_dev, "Coming up as %s: "\ + "Adapter already initialized\n", + adap->flags & MASTER_PF ? "MASTER" : "SLAVE"); + adap->flags |= USING_SOFT_PARAMS; + } else { + dev_info(adap->pdev_dev, "Coming up as MASTER: "\ + "Initializing adapter\n"); + + /* + * If the firmware doesn't support Configuration + * Files warn user and exit, + */ + if (ret < 0) + dev_warn(adap->pdev_dev, "Firmware doesn't support " + "configuration file.\n"); + if (force_old_init) + ret = adap_init0_no_config(adap, reset); + else { + /* + * Find out whether we're dealing with a version of + * the firmware which has configuration file support. + */ + params[0] = (FW_PARAMS_MNEM(FW_PARAMS_MNEM_DEV) | + FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DEV_CF)); + ret = t4_query_params(adap, adap->mbox, adap->fn, 0, 1, + params, val); + + /* + * If the firmware doesn't support Configuration + * Files, use the old Driver-based, hard-wired + * initialization. Otherwise, try using the + * Configuration File support and fall back to the + * Driver-based initialization if there's no + * Configuration File found. + */ + if (ret < 0) + ret = adap_init0_no_config(adap, reset); + else { + /* + * The firmware provides us with a memory + * buffer where we can load a Configuration + * File from the host if we want to override + * the Configuration File in flash. + */ + + ret = adap_init0_config(adap, reset); + if (ret == -ENOENT) { + dev_info(adap->pdev_dev, + "No Configuration File present " + "on adapter. Using hard-wired " + "configuration parameters.\n"); + ret = adap_init0_no_config(adap, reset); + } + } + } + if (ret < 0) { + dev_err(adap->pdev_dev, + "could not initialize adapter, error %d\n", + -ret); + goto bye; + } + } + + /* + * If we're living with non-hard-coded parameters (either from a + * Firmware Configuration File or values programmed by a different PF + * Driver), give the SGE code a chance to pull in anything that it + * needs ... Note that this must be called after we retrieve our VPD + * parameters in order to know how to convert core ticks to seconds. + */ + if (adap->flags & USING_SOFT_PARAMS) { + ret = t4_sge_init(adap); + if (ret < 0) + goto bye; + } + + if (is_bypass_device(adap->pdev->device)) + adap->params.bypass = 1; + + /* + * Grab some of our basic fundamental operating parameters. + */ +#define FW_PARAM_DEV(param) \ + (FW_PARAMS_MNEM(FW_PARAMS_MNEM_DEV) | \ + FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DEV_##param)) + +#define FW_PARAM_PFVF(param) \ + FW_PARAMS_MNEM(FW_PARAMS_MNEM_PFVF) | \ + FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_PFVF_##param)| \ + FW_PARAMS_PARAM_Y(0) | \ + FW_PARAMS_PARAM_Z(0) + + params[0] = FW_PARAM_PFVF(EQ_START); + params[1] = FW_PARAM_PFVF(L2T_START); + params[2] = FW_PARAM_PFVF(L2T_END); + params[3] = FW_PARAM_PFVF(FILTER_START); + params[4] = FW_PARAM_PFVF(FILTER_END); + params[5] = FW_PARAM_PFVF(IQFLINT_START); + ret = t4_query_params(adap, adap->mbox, adap->fn, 0, 6, params, val); + if (ret < 0) + goto bye; + adap->sge.egr_start = val[0]; + adap->l2t_start = val[1]; + adap->l2t_end = val[2]; + adap->tids.ftid_base = val[3]; + adap->tids.nftids = val[4] - val[3] + 1; + adap->sge.ingr_start = val[5]; + + /* query params related to active filter region */ + params[0] = FW_PARAM_PFVF(ACTIVE_FILTER_START); + params[1] = FW_PARAM_PFVF(ACTIVE_FILTER_END); + ret = t4_query_params(adap, adap->mbox, adap->fn, 0, 2, params, val); + /* If Active filter size is set we enable establishing + * offload connection through firmware work request + */ + if ((val[0] != val[1]) && (ret >= 0)) { + adap->flags |= FW_OFLD_CONN; + adap->tids.aftid_base = val[0]; + adap->tids.aftid_end = val[1]; + } + + /* If we're running on newer firmware, let it know that we're + * prepared to deal with encapsulated CPL messages. Older + * firmware won't understand this and we'll just get + * unencapsulated messages ... + */ + params[0] = FW_PARAM_PFVF(CPLFW4MSG_ENCAP); + val[0] = 1; + (void) t4_set_params(adap, adap->mbox, adap->fn, 0, 1, params, val); + + /* + * Find out whether we're allowed to use the T5+ ULPTX MEMWRITE DSGL + * capability. Earlier versions of the firmware didn't have the + * ULPTX_MEMWRITE_DSGL so we'll interpret a query failure as no + * permission to use ULPTX MEMWRITE DSGL. + */ + if (is_t4(adap->params.chip)) { + adap->params.ulptx_memwrite_dsgl = false; + } else { + params[0] = FW_PARAM_DEV(ULPTX_MEMWRITE_DSGL); + ret = t4_query_params(adap, adap->mbox, adap->fn, 0, + 1, params, val); + adap->params.ulptx_memwrite_dsgl = (ret == 0 && val[0] != 0); + } + + /* + * Get device capabilities so we can determine what resources we need + * to manage. + */ + memset(&caps_cmd, 0, sizeof(caps_cmd)); + caps_cmd.op_to_write = htonl(FW_CMD_OP(FW_CAPS_CONFIG_CMD) | + FW_CMD_REQUEST | FW_CMD_READ); + caps_cmd.cfvalid_to_len16 = htonl(FW_LEN16(caps_cmd)); + ret = t4_wr_mbox(adap, adap->mbox, &caps_cmd, sizeof(caps_cmd), + &caps_cmd); + if (ret < 0) + goto bye; + + if (caps_cmd.ofldcaps) { + /* query offload-related parameters */ + params[0] = FW_PARAM_DEV(NTID); + params[1] = FW_PARAM_PFVF(SERVER_START); + params[2] = FW_PARAM_PFVF(SERVER_END); + params[3] = FW_PARAM_PFVF(TDDP_START); + params[4] = FW_PARAM_PFVF(TDDP_END); + params[5] = FW_PARAM_DEV(FLOWC_BUFFIFO_SZ); + ret = t4_query_params(adap, adap->mbox, adap->fn, 0, 6, + params, val); + if (ret < 0) + goto bye; + adap->tids.ntids = val[0]; + adap->tids.natids = min(adap->tids.ntids / 2, MAX_ATIDS); + adap->tids.stid_base = val[1]; + adap->tids.nstids = val[2] - val[1] + 1; + /* + * Setup server filter region. Divide the availble filter + * region into two parts. Regular filters get 1/3rd and server + * filters get 2/3rd part. This is only enabled if workarond + * path is enabled. + * 1. For regular filters. + * 2. Server filter: This are special filters which are used + * to redirect SYN packets to offload queue. + */ + if (adap->flags & FW_OFLD_CONN && !is_bypass(adap)) { + adap->tids.sftid_base = adap->tids.ftid_base + + DIV_ROUND_UP(adap->tids.nftids, 3); + adap->tids.nsftids = adap->tids.nftids - + DIV_ROUND_UP(adap->tids.nftids, 3); + adap->tids.nftids = adap->tids.sftid_base - + adap->tids.ftid_base; + } + adap->vres.ddp.start = val[3]; + adap->vres.ddp.size = val[4] - val[3] + 1; + adap->params.ofldq_wr_cred = val[5]; + + adap->params.offload = 1; + } + if (caps_cmd.rdmacaps) { + params[0] = FW_PARAM_PFVF(STAG_START); + params[1] = FW_PARAM_PFVF(STAG_END); + params[2] = FW_PARAM_PFVF(RQ_START); + params[3] = FW_PARAM_PFVF(RQ_END); + params[4] = FW_PARAM_PFVF(PBL_START); + params[5] = FW_PARAM_PFVF(PBL_END); + ret = t4_query_params(adap, adap->mbox, adap->fn, 0, 6, + params, val); + if (ret < 0) + goto bye; + adap->vres.stag.start = val[0]; + adap->vres.stag.size = val[1] - val[0] + 1; + adap->vres.rq.start = val[2]; + adap->vres.rq.size = val[3] - val[2] + 1; + adap->vres.pbl.start = val[4]; + adap->vres.pbl.size = val[5] - val[4] + 1; + + params[0] = FW_PARAM_PFVF(SQRQ_START); + params[1] = FW_PARAM_PFVF(SQRQ_END); + params[2] = FW_PARAM_PFVF(CQ_START); + params[3] = FW_PARAM_PFVF(CQ_END); + params[4] = FW_PARAM_PFVF(OCQ_START); + params[5] = FW_PARAM_PFVF(OCQ_END); + ret = t4_query_params(adap, adap->mbox, adap->fn, 0, 6, params, + val); + if (ret < 0) + goto bye; + adap->vres.qp.start = val[0]; + adap->vres.qp.size = val[1] - val[0] + 1; + adap->vres.cq.start = val[2]; + adap->vres.cq.size = val[3] - val[2] + 1; + adap->vres.ocq.start = val[4]; + adap->vres.ocq.size = val[5] - val[4] + 1; + + params[0] = FW_PARAM_DEV(MAXORDIRD_QP); + params[1] = FW_PARAM_DEV(MAXIRD_ADAPTER); + ret = t4_query_params(adap, adap->mbox, adap->fn, 0, 2, params, + val); + if (ret < 0) { + adap->params.max_ordird_qp = 8; + adap->params.max_ird_adapter = 32 * adap->tids.ntids; + ret = 0; + } else { + adap->params.max_ordird_qp = val[0]; + adap->params.max_ird_adapter = val[1]; + } + dev_info(adap->pdev_dev, + "max_ordird_qp %d max_ird_adapter %d\n", + adap->params.max_ordird_qp, + adap->params.max_ird_adapter); + } + if (caps_cmd.iscsicaps) { + params[0] = FW_PARAM_PFVF(ISCSI_START); + params[1] = FW_PARAM_PFVF(ISCSI_END); + ret = t4_query_params(adap, adap->mbox, adap->fn, 0, 2, + params, val); + if (ret < 0) + goto bye; + adap->vres.iscsi.start = val[0]; + adap->vres.iscsi.size = val[1] - val[0] + 1; + } +#undef FW_PARAM_PFVF +#undef FW_PARAM_DEV + + /* The MTU/MSS Table is initialized by now, so load their values. If + * we're initializing the adapter, then we'll make any modifications + * we want to the MTU/MSS Table and also initialize the congestion + * parameters. + */ + t4_read_mtu_tbl(adap, adap->params.mtus, NULL); + if (state != DEV_STATE_INIT) { + int i; + + /* The default MTU Table contains values 1492 and 1500. + * However, for TCP, it's better to have two values which are + * a multiple of 8 +/- 4 bytes apart near this popular MTU. + * This allows us to have a TCP Data Payload which is a + * multiple of 8 regardless of what combination of TCP Options + * are in use (always a multiple of 4 bytes) which is + * important for performance reasons. For instance, if no + * options are in use, then we have a 20-byte IP header and a + * 20-byte TCP header. In this case, a 1500-byte MSS would + * result in a TCP Data Payload of 1500 - 40 == 1460 bytes + * which is not a multiple of 8. So using an MSS of 1488 in + * this case results in a TCP Data Payload of 1448 bytes which + * is a multiple of 8. On the other hand, if 12-byte TCP Time + * Stamps have been negotiated, then an MTU of 1500 bytes + * results in a TCP Data Payload of 1448 bytes which, as + * above, is a multiple of 8 bytes ... + */ + for (i = 0; i < NMTUS; i++) + if (adap->params.mtus[i] == 1492) { + adap->params.mtus[i] = 1488; + break; + } + + t4_load_mtus(adap, adap->params.mtus, adap->params.a_wnd, + adap->params.b_wnd); + } + t4_init_tp_params(adap); + adap->flags |= FW_OK; + return 0; + + /* + * Something bad happened. If a command timed out or failed with EIO + * FW does not operate within its spec or something catastrophic + * happened to HW/FW, stop issuing commands. + */ +bye: + if (ret != -ETIMEDOUT && ret != -EIO) + t4_fw_bye(adap, adap->mbox); + return ret; +} + +/* EEH callbacks */ + +static pci_ers_result_t eeh_err_detected(struct pci_dev *pdev, + pci_channel_state_t state) +{ + int i; + struct adapter *adap = pci_get_drvdata(pdev); + + if (!adap) + goto out; + + rtnl_lock(); + adap->flags &= ~FW_OK; + notify_ulds(adap, CXGB4_STATE_START_RECOVERY); + spin_lock(&adap->stats_lock); + for_each_port(adap, i) { + struct net_device *dev = adap->port[i]; + + netif_device_detach(dev); + netif_carrier_off(dev); + } + spin_unlock(&adap->stats_lock); + if (adap->flags & FULL_INIT_DONE) + cxgb_down(adap); + rtnl_unlock(); + if ((adap->flags & DEV_ENABLED)) { + pci_disable_device(pdev); + adap->flags &= ~DEV_ENABLED; + } +out: return state == pci_channel_io_perm_failure ? + PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_NEED_RESET; +} + +static pci_ers_result_t eeh_slot_reset(struct pci_dev *pdev) +{ + int i, ret; + struct fw_caps_config_cmd c; + struct adapter *adap = pci_get_drvdata(pdev); + + if (!adap) { + pci_restore_state(pdev); + pci_save_state(pdev); + return PCI_ERS_RESULT_RECOVERED; + } + + if (!(adap->flags & DEV_ENABLED)) { + if (pci_enable_device(pdev)) { + dev_err(&pdev->dev, "Cannot reenable PCI " + "device after reset\n"); + return PCI_ERS_RESULT_DISCONNECT; + } + adap->flags |= DEV_ENABLED; + } + + pci_set_master(pdev); + pci_restore_state(pdev); + pci_save_state(pdev); + pci_cleanup_aer_uncorrect_error_status(pdev); + + if (t4_wait_dev_ready(adap->regs) < 0) + return PCI_ERS_RESULT_DISCONNECT; + if (t4_fw_hello(adap, adap->fn, adap->fn, MASTER_MUST, NULL) < 0) + return PCI_ERS_RESULT_DISCONNECT; + adap->flags |= FW_OK; + if (adap_init1(adap, &c)) + return PCI_ERS_RESULT_DISCONNECT; + + for_each_port(adap, i) { + struct port_info *p = adap2pinfo(adap, i); + + ret = t4_alloc_vi(adap, adap->fn, p->tx_chan, adap->fn, 0, 1, + NULL, NULL); + if (ret < 0) + return PCI_ERS_RESULT_DISCONNECT; + p->viid = ret; + p->xact_addr_filt = -1; + } + + t4_load_mtus(adap, adap->params.mtus, adap->params.a_wnd, + adap->params.b_wnd); + setup_memwin(adap); + if (cxgb_up(adap)) + return PCI_ERS_RESULT_DISCONNECT; + return PCI_ERS_RESULT_RECOVERED; +} + +static void eeh_resume(struct pci_dev *pdev) +{ + int i; + struct adapter *adap = pci_get_drvdata(pdev); + + if (!adap) + return; + + rtnl_lock(); + for_each_port(adap, i) { + struct net_device *dev = adap->port[i]; + + if (netif_running(dev)) { + link_start(dev); + cxgb_set_rxmode(dev); + } + netif_device_attach(dev); + } + rtnl_unlock(); +} + +static const struct pci_error_handlers cxgb4_eeh = { + .error_detected = eeh_err_detected, + .slot_reset = eeh_slot_reset, + .resume = eeh_resume, +}; + +static inline bool is_x_10g_port(const struct link_config *lc) +{ + return (lc->supported & FW_PORT_CAP_SPEED_10G) != 0 || + (lc->supported & FW_PORT_CAP_SPEED_40G) != 0; +} + +static inline void init_rspq(struct adapter *adap, struct sge_rspq *q, + unsigned int us, unsigned int cnt, + unsigned int size, unsigned int iqe_size) +{ + q->adap = adap; + set_rspq_intr_params(q, us, cnt); + q->iqe_len = iqe_size; + q->size = size; +} + +/* + * Perform default configuration of DMA queues depending on the number and type + * of ports we found and the number of available CPUs. Most settings can be + * modified by the admin prior to actual use. + */ +static void cfg_queues(struct adapter *adap) +{ + struct sge *s = &adap->sge; + int i, n10g = 0, qidx = 0; +#ifndef CONFIG_CHELSIO_T4_DCB + int q10g = 0; +#endif + int ciq_size; + + for_each_port(adap, i) + n10g += is_x_10g_port(&adap2pinfo(adap, i)->link_cfg); +#ifdef CONFIG_CHELSIO_T4_DCB + /* For Data Center Bridging support we need to be able to support up + * to 8 Traffic Priorities; each of which will be assigned to its + * own TX Queue in order to prevent Head-Of-Line Blocking. + */ + if (adap->params.nports * 8 > MAX_ETH_QSETS) { + dev_err(adap->pdev_dev, "MAX_ETH_QSETS=%d < %d!\n", + MAX_ETH_QSETS, adap->params.nports * 8); + BUG_ON(1); + } + + for_each_port(adap, i) { + struct port_info *pi = adap2pinfo(adap, i); + + pi->first_qset = qidx; + pi->nqsets = 8; + qidx += pi->nqsets; + } +#else /* !CONFIG_CHELSIO_T4_DCB */ + /* + * We default to 1 queue per non-10G port and up to # of cores queues + * per 10G port. + */ + if (n10g) + q10g = (MAX_ETH_QSETS - (adap->params.nports - n10g)) / n10g; + if (q10g > netif_get_num_default_rss_queues()) + q10g = netif_get_num_default_rss_queues(); + + for_each_port(adap, i) { + struct port_info *pi = adap2pinfo(adap, i); + + pi->first_qset = qidx; + pi->nqsets = is_x_10g_port(&pi->link_cfg) ? q10g : 1; + qidx += pi->nqsets; + } +#endif /* !CONFIG_CHELSIO_T4_DCB */ + + s->ethqsets = qidx; + s->max_ethqsets = qidx; /* MSI-X may lower it later */ + + if (is_offload(adap)) { + /* + * For offload we use 1 queue/channel if all ports are up to 1G, + * otherwise we divide all available queues amongst the channels + * capped by the number of available cores. + */ + if (n10g) { + i = min_t(int, ARRAY_SIZE(s->ofldrxq), + num_online_cpus()); + s->ofldqsets = roundup(i, adap->params.nports); + } else + s->ofldqsets = adap->params.nports; + /* For RDMA one Rx queue per channel suffices */ + s->rdmaqs = adap->params.nports; + s->rdmaciqs = adap->params.nports; + } + + for (i = 0; i < ARRAY_SIZE(s->ethrxq); i++) { + struct sge_eth_rxq *r = &s->ethrxq[i]; + + init_rspq(adap, &r->rspq, 5, 10, 1024, 64); + r->fl.size = 72; + } + + for (i = 0; i < ARRAY_SIZE(s->ethtxq); i++) + s->ethtxq[i].q.size = 1024; + + for (i = 0; i < ARRAY_SIZE(s->ctrlq); i++) + s->ctrlq[i].q.size = 512; + + for (i = 0; i < ARRAY_SIZE(s->ofldtxq); i++) + s->ofldtxq[i].q.size = 1024; + + for (i = 0; i < ARRAY_SIZE(s->ofldrxq); i++) { + struct sge_ofld_rxq *r = &s->ofldrxq[i]; + + init_rspq(adap, &r->rspq, 5, 1, 1024, 64); + r->rspq.uld = CXGB4_ULD_ISCSI; + r->fl.size = 72; + } + + for (i = 0; i < ARRAY_SIZE(s->rdmarxq); i++) { + struct sge_ofld_rxq *r = &s->rdmarxq[i]; + + init_rspq(adap, &r->rspq, 5, 1, 511, 64); + r->rspq.uld = CXGB4_ULD_RDMA; + r->fl.size = 72; + } + + ciq_size = 64 + adap->vres.cq.size + adap->tids.nftids; + if (ciq_size > SGE_MAX_IQ_SIZE) { + CH_WARN(adap, "CIQ size too small for available IQs\n"); + ciq_size = SGE_MAX_IQ_SIZE; + } + + for (i = 0; i < ARRAY_SIZE(s->rdmaciq); i++) { + struct sge_ofld_rxq *r = &s->rdmaciq[i]; + + init_rspq(adap, &r->rspq, 5, 1, ciq_size, 64); + r->rspq.uld = CXGB4_ULD_RDMA; + } + + init_rspq(adap, &s->fw_evtq, 0, 1, 1024, 64); + init_rspq(adap, &s->intrq, 0, 1, 2 * MAX_INGQ, 64); +} + +/* + * Reduce the number of Ethernet queues across all ports to at most n. + * n provides at least one queue per port. + */ +static void reduce_ethqs(struct adapter *adap, int n) +{ + int i; + struct port_info *pi; + + while (n < adap->sge.ethqsets) + for_each_port(adap, i) { + pi = adap2pinfo(adap, i); + if (pi->nqsets > 1) { + pi->nqsets--; + adap->sge.ethqsets--; + if (adap->sge.ethqsets <= n) + break; + } + } + + n = 0; + for_each_port(adap, i) { + pi = adap2pinfo(adap, i); + pi->first_qset = n; + n += pi->nqsets; + } +} + +/* 2 MSI-X vectors needed for the FW queue and non-data interrupts */ +#define EXTRA_VECS 2 + +static int enable_msix(struct adapter *adap) +{ + int ofld_need = 0; + int i, want, need; + struct sge *s = &adap->sge; + unsigned int nchan = adap->params.nports; + struct msix_entry entries[MAX_INGQ + 1]; + + for (i = 0; i < ARRAY_SIZE(entries); ++i) + entries[i].entry = i; + + want = s->max_ethqsets + EXTRA_VECS; + if (is_offload(adap)) { + want += s->rdmaqs + s->rdmaciqs + s->ofldqsets; + /* need nchan for each possible ULD */ + ofld_need = 3 * nchan; + } +#ifdef CONFIG_CHELSIO_T4_DCB + /* For Data Center Bridging we need 8 Ethernet TX Priority Queues for + * each port. + */ + need = 8 * adap->params.nports + EXTRA_VECS + ofld_need; +#else + need = adap->params.nports + EXTRA_VECS + ofld_need; +#endif + want = pci_enable_msix_range(adap->pdev, entries, need, want); + if (want < 0) + return want; + + /* + * Distribute available vectors to the various queue groups. + * Every group gets its minimum requirement and NIC gets top + * priority for leftovers. + */ + i = want - EXTRA_VECS - ofld_need; + if (i < s->max_ethqsets) { + s->max_ethqsets = i; + if (i < s->ethqsets) + reduce_ethqs(adap, i); + } + if (is_offload(adap)) { + i = want - EXTRA_VECS - s->max_ethqsets; + i -= ofld_need - nchan; + s->ofldqsets = (i / nchan) * nchan; /* round down */ + } + for (i = 0; i < want; ++i) + adap->msix_info[i].vec = entries[i].vector; + + return 0; +} + +#undef EXTRA_VECS + +static int init_rss(struct adapter *adap) +{ + unsigned int i, j; + + for_each_port(adap, i) { + struct port_info *pi = adap2pinfo(adap, i); + + pi->rss = kcalloc(pi->rss_size, sizeof(u16), GFP_KERNEL); + if (!pi->rss) + return -ENOMEM; + for (j = 0; j < pi->rss_size; j++) + pi->rss[j] = ethtool_rxfh_indir_default(j, pi->nqsets); + } + return 0; +} + +static void print_port_info(const struct net_device *dev) +{ + char buf[80]; + char *bufp = buf; + const char *spd = ""; + const struct port_info *pi = netdev_priv(dev); + const struct adapter *adap = pi->adapter; + + if (adap->params.pci.speed == PCI_EXP_LNKSTA_CLS_2_5GB) + spd = " 2.5 GT/s"; + else if (adap->params.pci.speed == PCI_EXP_LNKSTA_CLS_5_0GB) + spd = " 5 GT/s"; + else if (adap->params.pci.speed == PCI_EXP_LNKSTA_CLS_8_0GB) + spd = " 8 GT/s"; + + if (pi->link_cfg.supported & FW_PORT_CAP_SPEED_100M) + bufp += sprintf(bufp, "100/"); + if (pi->link_cfg.supported & FW_PORT_CAP_SPEED_1G) + bufp += sprintf(bufp, "1000/"); + if (pi->link_cfg.supported & FW_PORT_CAP_SPEED_10G) + bufp += sprintf(bufp, "10G/"); + if (pi->link_cfg.supported & FW_PORT_CAP_SPEED_40G) + bufp += sprintf(bufp, "40G/"); + if (bufp != buf) + --bufp; + sprintf(bufp, "BASE-%s", t4_get_port_type_description(pi->port_type)); + + netdev_info(dev, "Chelsio %s rev %d %s %sNIC PCIe x%d%s%s\n", + adap->params.vpd.id, + CHELSIO_CHIP_RELEASE(adap->params.chip), buf, + is_offload(adap) ? "R" : "", adap->params.pci.width, spd, + (adap->flags & USING_MSIX) ? " MSI-X" : + (adap->flags & USING_MSI) ? " MSI" : ""); + netdev_info(dev, "S/N: %s, P/N: %s\n", + adap->params.vpd.sn, adap->params.vpd.pn); +} + +static void enable_pcie_relaxed_ordering(struct pci_dev *dev) +{ + pcie_capability_set_word(dev, PCI_EXP_DEVCTL, PCI_EXP_DEVCTL_RELAX_EN); +} + +/* + * Free the following resources: + * - memory used for tables + * - MSI/MSI-X + * - net devices + * - resources FW is holding for us + */ +static void free_some_resources(struct adapter *adapter) +{ + unsigned int i; + + t4_free_mem(adapter->l2t); + t4_free_mem(adapter->tids.tid_tab); + disable_msi(adapter); + + for_each_port(adapter, i) + if (adapter->port[i]) { + kfree(adap2pinfo(adapter, i)->rss); + free_netdev(adapter->port[i]); + } + if (adapter->flags & FW_OK) + t4_fw_bye(adapter, adapter->fn); +} + +#define TSO_FLAGS (NETIF_F_TSO | NETIF_F_TSO6 | NETIF_F_TSO_ECN) +#define VLAN_FEAT (NETIF_F_SG | NETIF_F_IP_CSUM | TSO_FLAGS | \ + NETIF_F_IPV6_CSUM | NETIF_F_HIGHDMA) +#define SEGMENT_SIZE 128 + +static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent) +{ + int func, i, err, s_qpp, qpp, num_seg; + struct port_info *pi; + bool highdma = false; + struct adapter *adapter = NULL; + void __iomem *regs; + + printk_once(KERN_INFO "%s - version %s\n", DRV_DESC, DRV_VERSION); + + err = pci_request_regions(pdev, KBUILD_MODNAME); + if (err) { + /* Just info, some other driver may have claimed the device. */ + dev_info(&pdev->dev, "cannot obtain PCI resources\n"); + return err; + } + + err = pci_enable_device(pdev); + if (err) { + dev_err(&pdev->dev, "cannot enable PCI device\n"); + goto out_release_regions; + } + + regs = pci_ioremap_bar(pdev, 0); + if (!regs) { + dev_err(&pdev->dev, "cannot map device registers\n"); + err = -ENOMEM; + goto out_disable_device; + } + + err = t4_wait_dev_ready(regs); + if (err < 0) + goto out_unmap_bar0; + + /* We control everything through one PF */ + func = SOURCEPF_GET(readl(regs + PL_WHOAMI)); + if (func != ent->driver_data) { + iounmap(regs); + pci_disable_device(pdev); + pci_save_state(pdev); /* to restore SR-IOV later */ + goto sriov; + } + + if (!pci_set_dma_mask(pdev, DMA_BIT_MASK(64))) { + highdma = true; + err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64)); + if (err) { + dev_err(&pdev->dev, "unable to obtain 64-bit DMA for " + "coherent allocations\n"); + goto out_unmap_bar0; + } + } else { + err = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); + if (err) { + dev_err(&pdev->dev, "no usable DMA configuration\n"); + goto out_unmap_bar0; + } + } + + pci_enable_pcie_error_reporting(pdev); + enable_pcie_relaxed_ordering(pdev); + pci_set_master(pdev); + pci_save_state(pdev); + + adapter = kzalloc(sizeof(*adapter), GFP_KERNEL); + if (!adapter) { + err = -ENOMEM; + goto out_unmap_bar0; + } + + adapter->workq = create_singlethread_workqueue("cxgb4"); + if (!adapter->workq) { + err = -ENOMEM; + goto out_free_adapter; + } + + /* PCI device has been enabled */ + adapter->flags |= DEV_ENABLED; + + adapter->regs = regs; + adapter->pdev = pdev; + adapter->pdev_dev = &pdev->dev; + adapter->mbox = func; + adapter->fn = func; + adapter->msg_enable = dflt_msg_enable; + memset(adapter->chan_map, 0xff, sizeof(adapter->chan_map)); + + spin_lock_init(&adapter->stats_lock); + spin_lock_init(&adapter->tid_release_lock); + spin_lock_init(&adapter->win0_lock); + + INIT_WORK(&adapter->tid_release_task, process_tid_release_list); + INIT_WORK(&adapter->db_full_task, process_db_full); + INIT_WORK(&adapter->db_drop_task, process_db_drop); + + err = t4_prep_adapter(adapter); + if (err) + goto out_free_adapter; + + + if (!is_t4(adapter->params.chip)) { + s_qpp = QUEUESPERPAGEPF1 * adapter->fn; + qpp = 1 << QUEUESPERPAGEPF0_GET(t4_read_reg(adapter, + SGE_EGRESS_QUEUES_PER_PAGE_PF) >> s_qpp); + num_seg = PAGE_SIZE / SEGMENT_SIZE; + + /* Each segment size is 128B. Write coalescing is enabled only + * when SGE_EGRESS_QUEUES_PER_PAGE_PF reg value for the + * queue is less no of segments that can be accommodated in + * a page size. + */ + if (qpp > num_seg) { + dev_err(&pdev->dev, + "Incorrect number of egress queues per page\n"); + err = -EINVAL; + goto out_free_adapter; + } + adapter->bar2 = ioremap_wc(pci_resource_start(pdev, 2), + pci_resource_len(pdev, 2)); + if (!adapter->bar2) { + dev_err(&pdev->dev, "cannot map device bar2 region\n"); + err = -ENOMEM; + goto out_free_adapter; + } + } + + setup_memwin(adapter); + err = adap_init0(adapter); + setup_memwin_rdma(adapter); + if (err) + goto out_unmap_bar; + + for_each_port(adapter, i) { + struct net_device *netdev; + + netdev = alloc_etherdev_mq(sizeof(struct port_info), + MAX_ETH_QSETS); + if (!netdev) { + err = -ENOMEM; + goto out_free_dev; + } + + SET_NETDEV_DEV(netdev, &pdev->dev); + + adapter->port[i] = netdev; + pi = netdev_priv(netdev); + pi->adapter = adapter; + pi->xact_addr_filt = -1; + pi->port_id = i; + netdev->irq = pdev->irq; + + netdev->hw_features = NETIF_F_SG | TSO_FLAGS | + NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM | + NETIF_F_RXCSUM | NETIF_F_RXHASH | + NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX; + if (highdma) + netdev->hw_features |= NETIF_F_HIGHDMA; + netdev->features |= netdev->hw_features; + netdev->vlan_features = netdev->features & VLAN_FEAT; + + netdev->priv_flags |= IFF_UNICAST_FLT; + + netdev->netdev_ops = &cxgb4_netdev_ops; +#ifdef CONFIG_CHELSIO_T4_DCB + netdev->dcbnl_ops = &cxgb4_dcb_ops; + cxgb4_dcb_state_init(netdev); +#endif + netdev->ethtool_ops = &cxgb_ethtool_ops; + } + + pci_set_drvdata(pdev, adapter); + + if (adapter->flags & FW_OK) { + err = t4_port_init(adapter, func, func, 0); + if (err) + goto out_free_dev; + } + + /* + * Configure queues and allocate tables now, they can be needed as + * soon as the first register_netdev completes. + */ + cfg_queues(adapter); + + adapter->l2t = t4_init_l2t(); + if (!adapter->l2t) { + /* We tolerate a lack of L2T, giving up some functionality */ + dev_warn(&pdev->dev, "could not allocate L2T, continuing\n"); + adapter->params.offload = 0; + } + + if (is_offload(adapter) && tid_init(&adapter->tids) < 0) { + dev_warn(&pdev->dev, "could not allocate TID table, " + "continuing\n"); + adapter->params.offload = 0; + } + + /* See what interrupts we'll be using */ + if (msi > 1 && enable_msix(adapter) == 0) + adapter->flags |= USING_MSIX; + else if (msi > 0 && pci_enable_msi(pdev) == 0) + adapter->flags |= USING_MSI; + + err = init_rss(adapter); + if (err) + goto out_free_dev; + + /* + * The card is now ready to go. If any errors occur during device + * registration we do not fail the whole card but rather proceed only + * with the ports we manage to register successfully. However we must + * register at least one net device. + */ + for_each_port(adapter, i) { + pi = adap2pinfo(adapter, i); + netif_set_real_num_tx_queues(adapter->port[i], pi->nqsets); + netif_set_real_num_rx_queues(adapter->port[i], pi->nqsets); + + err = register_netdev(adapter->port[i]); + if (err) + break; + adapter->chan_map[pi->tx_chan] = i; + print_port_info(adapter->port[i]); + } + if (i == 0) { + dev_err(&pdev->dev, "could not register any net devices\n"); + goto out_free_dev; + } + if (err) { + dev_warn(&pdev->dev, "only %d net devices registered\n", i); + err = 0; + } + + if (cxgb4_debugfs_root) { + adapter->debugfs_root = debugfs_create_dir(pci_name(pdev), + cxgb4_debugfs_root); + setup_debugfs(adapter); + } + + /* PCIe EEH recovery on powerpc platforms needs fundamental reset */ + pdev->needs_freset = 1; + + if (is_offload(adapter)) + attach_ulds(adapter); + +sriov: +#ifdef CONFIG_PCI_IOV + if (func < ARRAY_SIZE(num_vf) && num_vf[func] > 0) + if (pci_enable_sriov(pdev, num_vf[func]) == 0) + dev_info(&pdev->dev, + "instantiated %u virtual functions\n", + num_vf[func]); +#endif + return 0; + + out_free_dev: + free_some_resources(adapter); + out_unmap_bar: + if (!is_t4(adapter->params.chip)) + iounmap(adapter->bar2); + out_free_adapter: + if (adapter->workq) + destroy_workqueue(adapter->workq); + + kfree(adapter); + out_unmap_bar0: + iounmap(regs); + out_disable_device: + pci_disable_pcie_error_reporting(pdev); + pci_disable_device(pdev); + out_release_regions: + pci_release_regions(pdev); + return err; +} + +static void remove_one(struct pci_dev *pdev) +{ + struct adapter *adapter = pci_get_drvdata(pdev); + +#ifdef CONFIG_PCI_IOV + pci_disable_sriov(pdev); + +#endif + + if (adapter) { + int i; + + /* Tear down per-adapter Work Queue first since it can contain + * references to our adapter data structure. + */ + destroy_workqueue(adapter->workq); + + if (is_offload(adapter)) + detach_ulds(adapter); + + for_each_port(adapter, i) + if (adapter->port[i]->reg_state == NETREG_REGISTERED) + unregister_netdev(adapter->port[i]); + + debugfs_remove_recursive(adapter->debugfs_root); + + /* If we allocated filters, free up state associated with any + * valid filters ... + */ + if (adapter->tids.ftid_tab) { + struct filter_entry *f = &adapter->tids.ftid_tab[0]; + for (i = 0; i < (adapter->tids.nftids + + adapter->tids.nsftids); i++, f++) + if (f->valid) + clear_filter(adapter, f); + } + + if (adapter->flags & FULL_INIT_DONE) + cxgb_down(adapter); + + free_some_resources(adapter); + iounmap(adapter->regs); + if (!is_t4(adapter->params.chip)) + iounmap(adapter->bar2); + pci_disable_pcie_error_reporting(pdev); + if ((adapter->flags & DEV_ENABLED)) { + pci_disable_device(pdev); + adapter->flags &= ~DEV_ENABLED; + } + pci_release_regions(pdev); + synchronize_rcu(); + kfree(adapter); + } else + pci_release_regions(pdev); +} + +static struct pci_driver cxgb4_driver = { + .name = KBUILD_MODNAME, + .id_table = cxgb4_pci_tbl, + .probe = init_one, + .remove = remove_one, + .shutdown = remove_one, + .err_handler = &cxgb4_eeh, +}; + +static int __init cxgb4_init_module(void) +{ + int ret; + + /* Debugfs support is optional, just warn if this fails */ + cxgb4_debugfs_root = debugfs_create_dir(KBUILD_MODNAME, NULL); + if (!cxgb4_debugfs_root) + pr_warn("could not create debugfs entry, continuing\n"); + + ret = pci_register_driver(&cxgb4_driver); + if (ret < 0) + debugfs_remove(cxgb4_debugfs_root); + +#if IS_ENABLED(CONFIG_IPV6) + register_inet6addr_notifier(&cxgb4_inet6addr_notifier); +#endif + + return ret; +} + +static void __exit cxgb4_cleanup_module(void) +{ +#if IS_ENABLED(CONFIG_IPV6) + unregister_inet6addr_notifier(&cxgb4_inet6addr_notifier); +#endif + pci_unregister_driver(&cxgb4_driver); + debugfs_remove(cxgb4_debugfs_root); /* NULL ok */ +} + +module_init(cxgb4_init_module); +module_exit(cxgb4_cleanup_module); diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h new file mode 100644 index 0000000..1366ba6 --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h @@ -0,0 +1,307 @@ +/* + * This file is part of the Chelsio T4 Ethernet driver for Linux. + * + * Copyright (c) 2003-2014 Chelsio Communications, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __CXGB4_OFLD_H +#define __CXGB4_OFLD_H + +#include +#include +#include +#include +#include + +/* CPL message priority levels */ +enum { + CPL_PRIORITY_DATA = 0, /* data messages */ + CPL_PRIORITY_SETUP = 1, /* connection setup messages */ + CPL_PRIORITY_TEARDOWN = 0, /* connection teardown messages */ + CPL_PRIORITY_LISTEN = 1, /* listen start/stop messages */ + CPL_PRIORITY_ACK = 1, /* RX ACK messages */ + CPL_PRIORITY_CONTROL = 1 /* control messages */ +}; + +#define INIT_TP_WR(w, tid) do { \ + (w)->wr.wr_hi = htonl(FW_WR_OP(FW_TP_WR) | \ + FW_WR_IMMDLEN(sizeof(*w) - sizeof(w->wr))); \ + (w)->wr.wr_mid = htonl(FW_WR_LEN16(DIV_ROUND_UP(sizeof(*w), 16)) | \ + FW_WR_FLOWID(tid)); \ + (w)->wr.wr_lo = cpu_to_be64(0); \ +} while (0) + +#define INIT_TP_WR_CPL(w, cpl, tid) do { \ + INIT_TP_WR(w, tid); \ + OPCODE_TID(w) = htonl(MK_OPCODE_TID(cpl, tid)); \ +} while (0) + +#define INIT_ULPTX_WR(w, wrlen, atomic, tid) do { \ + (w)->wr.wr_hi = htonl(FW_WR_OP(FW_ULPTX_WR) | FW_WR_ATOMIC(atomic)); \ + (w)->wr.wr_mid = htonl(FW_WR_LEN16(DIV_ROUND_UP(wrlen, 16)) | \ + FW_WR_FLOWID(tid)); \ + (w)->wr.wr_lo = cpu_to_be64(0); \ +} while (0) + +/* Special asynchronous notification message */ +#define CXGB4_MSG_AN ((void *)1) + +struct serv_entry { + void *data; +}; + +union aopen_entry { + void *data; + union aopen_entry *next; +}; + +/* + * Holds the size, base address, free list start, etc of the TID, server TID, + * and active-open TID tables. The tables themselves are allocated dynamically. + */ +struct tid_info { + void **tid_tab; + unsigned int ntids; + + struct serv_entry *stid_tab; + unsigned long *stid_bmap; + unsigned int nstids; + unsigned int stid_base; + + union aopen_entry *atid_tab; + unsigned int natids; + unsigned int atid_base; + + struct filter_entry *ftid_tab; + unsigned int nftids; + unsigned int ftid_base; + unsigned int aftid_base; + unsigned int aftid_end; + /* Server filter region */ + unsigned int sftid_base; + unsigned int nsftids; + + spinlock_t atid_lock ____cacheline_aligned_in_smp; + union aopen_entry *afree; + unsigned int atids_in_use; + + spinlock_t stid_lock; + unsigned int stids_in_use; + + atomic_t tids_in_use; +}; + +static inline void *lookup_tid(const struct tid_info *t, unsigned int tid) +{ + return tid < t->ntids ? t->tid_tab[tid] : NULL; +} + +static inline void *lookup_atid(const struct tid_info *t, unsigned int atid) +{ + return atid < t->natids ? t->atid_tab[atid].data : NULL; +} + +static inline void *lookup_stid(const struct tid_info *t, unsigned int stid) +{ + /* Is it a server filter TID? */ + if (t->nsftids && (stid >= t->sftid_base)) { + stid -= t->sftid_base; + stid += t->nstids; + } else { + stid -= t->stid_base; + } + + return stid < (t->nstids + t->nsftids) ? t->stid_tab[stid].data : NULL; +} + +static inline void cxgb4_insert_tid(struct tid_info *t, void *data, + unsigned int tid) +{ + t->tid_tab[tid] = data; + atomic_inc(&t->tids_in_use); +} + +int cxgb4_alloc_atid(struct tid_info *t, void *data); +int cxgb4_alloc_stid(struct tid_info *t, int family, void *data); +int cxgb4_alloc_sftid(struct tid_info *t, int family, void *data); +void cxgb4_free_atid(struct tid_info *t, unsigned int atid); +void cxgb4_free_stid(struct tid_info *t, unsigned int stid, int family); +void cxgb4_remove_tid(struct tid_info *t, unsigned int qid, unsigned int tid); + +struct in6_addr; + +int cxgb4_create_server(const struct net_device *dev, unsigned int stid, + __be32 sip, __be16 sport, __be16 vlan, + unsigned int queue); +int cxgb4_create_server6(const struct net_device *dev, unsigned int stid, + const struct in6_addr *sip, __be16 sport, + unsigned int queue); +int cxgb4_remove_server(const struct net_device *dev, unsigned int stid, + unsigned int queue, bool ipv6); +int cxgb4_create_server_filter(const struct net_device *dev, unsigned int stid, + __be32 sip, __be16 sport, __be16 vlan, + unsigned int queue, + unsigned char port, unsigned char mask); +int cxgb4_remove_server_filter(const struct net_device *dev, unsigned int stid, + unsigned int queue, bool ipv6); +int cxgb4_clip_get(const struct net_device *dev, const struct in6_addr *lip); +int cxgb4_clip_release(const struct net_device *dev, + const struct in6_addr *lip); + +static inline void set_wr_txq(struct sk_buff *skb, int prio, int queue) +{ + skb_set_queue_mapping(skb, (queue << 1) | prio); +} + +enum cxgb4_uld { + CXGB4_ULD_RDMA, + CXGB4_ULD_ISCSI, + CXGB4_ULD_MAX +}; + +enum cxgb4_state { + CXGB4_STATE_UP, + CXGB4_STATE_START_RECOVERY, + CXGB4_STATE_DOWN, + CXGB4_STATE_DETACH +}; + +enum cxgb4_control { + CXGB4_CONTROL_DB_FULL, + CXGB4_CONTROL_DB_EMPTY, + CXGB4_CONTROL_DB_DROP, +}; + +struct pci_dev; +struct l2t_data; +struct net_device; +struct pkt_gl; +struct tp_tcp_stats; + +struct cxgb4_range { + unsigned int start; + unsigned int size; +}; + +struct cxgb4_virt_res { /* virtualized HW resources */ + struct cxgb4_range ddp; + struct cxgb4_range iscsi; + struct cxgb4_range stag; + struct cxgb4_range rq; + struct cxgb4_range pbl; + struct cxgb4_range qp; + struct cxgb4_range cq; + struct cxgb4_range ocq; +}; + +#define OCQ_WIN_OFFSET(pdev, vres) \ + (pci_resource_len((pdev), 2) - roundup_pow_of_two((vres)->ocq.size)) + +/* + * Block of information the LLD provides to ULDs attaching to a device. + */ +struct cxgb4_lld_info { + struct pci_dev *pdev; /* associated PCI device */ + struct l2t_data *l2t; /* L2 table */ + struct tid_info *tids; /* TID table */ + struct net_device **ports; /* device ports */ + const struct cxgb4_virt_res *vr; /* assorted HW resources */ + const unsigned short *mtus; /* MTU table */ + const unsigned short *rxq_ids; /* the ULD's Rx queue ids */ + const unsigned short *ciq_ids; /* the ULD's concentrator IQ ids */ + unsigned short nrxq; /* # of Rx queues */ + unsigned short ntxq; /* # of Tx queues */ + unsigned short nciq; /* # of concentrator IQ */ + unsigned char nchan:4; /* # of channels */ + unsigned char nports:4; /* # of ports */ + unsigned char wr_cred; /* WR 16-byte credits */ + unsigned char adapter_type; /* type of adapter */ + unsigned char fw_api_ver; /* FW API version */ + unsigned int fw_vers; /* FW version */ + unsigned int iscsi_iolen; /* iSCSI max I/O length */ + unsigned int cclk_ps; /* Core clock period in psec */ + unsigned short udb_density; /* # of user DB/page */ + unsigned short ucq_density; /* # of user CQs/page */ + unsigned short filt_mode; /* filter optional components */ + unsigned short tx_modq[NCHAN]; /* maps each tx channel to a */ + /* scheduler queue */ + void __iomem *gts_reg; /* address of GTS register */ + void __iomem *db_reg; /* address of kernel doorbell */ + int dbfifo_int_thresh; /* doorbell fifo int threshold */ + unsigned int sge_ingpadboundary; /* SGE ingress padding boundary */ + unsigned int sge_egrstatuspagesize; /* SGE egress status page size */ + unsigned int sge_pktshift; /* Padding between CPL and */ + /* packet data */ + unsigned int pf; /* Physical Function we're using */ + bool enable_fw_ofld_conn; /* Enable connection through fw */ + /* WR */ + unsigned int max_ordird_qp; /* Max ORD/IRD depth per RDMA QP */ + unsigned int max_ird_adapter; /* Max IRD memory per adapter */ + bool ulptx_memwrite_dsgl; /* use of T5 DSGL allowed */ +}; + +struct cxgb4_uld_info { + const char *name; + void *(*add)(const struct cxgb4_lld_info *p); + int (*rx_handler)(void *handle, const __be64 *rsp, + const struct pkt_gl *gl); + int (*state_change)(void *handle, enum cxgb4_state new_state); + int (*control)(void *handle, enum cxgb4_control control, ...); +}; + +int cxgb4_register_uld(enum cxgb4_uld type, const struct cxgb4_uld_info *p); +int cxgb4_unregister_uld(enum cxgb4_uld type); +int cxgb4_ofld_send(struct net_device *dev, struct sk_buff *skb); +unsigned int cxgb4_dbfifo_count(const struct net_device *dev, int lpfifo); +unsigned int cxgb4_port_chan(const struct net_device *dev); +unsigned int cxgb4_port_viid(const struct net_device *dev); +unsigned int cxgb4_port_idx(const struct net_device *dev); +unsigned int cxgb4_best_mtu(const unsigned short *mtus, unsigned short mtu, + unsigned int *idx); +unsigned int cxgb4_best_aligned_mtu(const unsigned short *mtus, + unsigned short header_size, + unsigned short data_size_max, + unsigned short data_size_align, + unsigned int *mtu_idxp); +void cxgb4_get_tcp_stats(struct pci_dev *pdev, struct tp_tcp_stats *v4, + struct tp_tcp_stats *v6); +void cxgb4_iscsi_init(struct net_device *dev, unsigned int tag_mask, + const unsigned int *pgsz_order); +struct sk_buff *cxgb4_pktgl_to_skb(const struct pkt_gl *gl, + unsigned int skb_len, unsigned int pull_len); +int cxgb4_sync_txq_pidx(struct net_device *dev, u16 qid, u16 pidx, u16 size); +int cxgb4_flush_eq_cache(struct net_device *dev); +void cxgb4_disable_db_coalescing(struct net_device *dev); +void cxgb4_enable_db_coalescing(struct net_device *dev); +int cxgb4_read_tpte(struct net_device *dev, u32 stag, __be32 *tpte); +u64 cxgb4_read_sge_timestamp(struct net_device *dev); + +#endif /* !__CXGB4_OFLD_H */ diff --git a/drivers/net/ethernet/chelsio/cxgb4/l2t.c b/drivers/net/ethernet/chelsio/cxgb4/l2t.c new file mode 100644 index 0000000..9604139 --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb4/l2t.c @@ -0,0 +1,665 @@ +/* + * This file is part of the Chelsio T4 Ethernet driver for Linux. + * + * Copyright (c) 2003-2014 Chelsio Communications, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "cxgb4.h" +#include "l2t.h" +#include "t4_msg.h" +#include "t4fw_api.h" +#include "t4_regs.h" + +#define VLAN_NONE 0xfff + +/* identifies sync vs async L2T_WRITE_REQs */ +#define F_SYNC_WR (1 << 12) + +enum { + L2T_STATE_VALID, /* entry is up to date */ + L2T_STATE_STALE, /* entry may be used but needs revalidation */ + L2T_STATE_RESOLVING, /* entry needs address resolution */ + L2T_STATE_SYNC_WRITE, /* synchronous write of entry underway */ + + /* when state is one of the below the entry is not hashed */ + L2T_STATE_SWITCHING, /* entry is being used by a switching filter */ + L2T_STATE_UNUSED /* entry not in use */ +}; + +struct l2t_data { + rwlock_t lock; + atomic_t nfree; /* number of free entries */ + struct l2t_entry *rover; /* starting point for next allocation */ + struct l2t_entry l2tab[L2T_SIZE]; +}; + +static inline unsigned int vlan_prio(const struct l2t_entry *e) +{ + return e->vlan >> 13; +} + +static inline void l2t_hold(struct l2t_data *d, struct l2t_entry *e) +{ + if (atomic_add_return(1, &e->refcnt) == 1) /* 0 -> 1 transition */ + atomic_dec(&d->nfree); +} + +/* + * To avoid having to check address families we do not allow v4 and v6 + * neighbors to be on the same hash chain. We keep v4 entries in the first + * half of available hash buckets and v6 in the second. + */ +enum { + L2T_SZ_HALF = L2T_SIZE / 2, + L2T_HASH_MASK = L2T_SZ_HALF - 1 +}; + +static inline unsigned int arp_hash(const u32 *key, int ifindex) +{ + return jhash_2words(*key, ifindex, 0) & L2T_HASH_MASK; +} + +static inline unsigned int ipv6_hash(const u32 *key, int ifindex) +{ + u32 xor = key[0] ^ key[1] ^ key[2] ^ key[3]; + + return L2T_SZ_HALF + (jhash_2words(xor, ifindex, 0) & L2T_HASH_MASK); +} + +static unsigned int addr_hash(const u32 *addr, int addr_len, int ifindex) +{ + return addr_len == 4 ? arp_hash(addr, ifindex) : + ipv6_hash(addr, ifindex); +} + +/* + * Checks if an L2T entry is for the given IP/IPv6 address. It does not check + * whether the L2T entry and the address are of the same address family. + * Callers ensure an address is only checked against L2T entries of the same + * family, something made trivial by the separation of IP and IPv6 hash chains + * mentioned above. Returns 0 if there's a match, + */ +static int addreq(const struct l2t_entry *e, const u32 *addr) +{ + if (e->v6) + return (e->addr[0] ^ addr[0]) | (e->addr[1] ^ addr[1]) | + (e->addr[2] ^ addr[2]) | (e->addr[3] ^ addr[3]); + return e->addr[0] ^ addr[0]; +} + +static void neigh_replace(struct l2t_entry *e, struct neighbour *n) +{ + neigh_hold(n); + if (e->neigh) + neigh_release(e->neigh); + e->neigh = n; +} + +/* + * Write an L2T entry. Must be called with the entry locked. + * The write may be synchronous or asynchronous. + */ +static int write_l2e(struct adapter *adap, struct l2t_entry *e, int sync) +{ + struct sk_buff *skb; + struct cpl_l2t_write_req *req; + + skb = alloc_skb(sizeof(*req), GFP_ATOMIC); + if (!skb) + return -ENOMEM; + + req = (struct cpl_l2t_write_req *)__skb_put(skb, sizeof(*req)); + INIT_TP_WR(req, 0); + + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_L2T_WRITE_REQ, + e->idx | (sync ? F_SYNC_WR : 0) | + TID_QID(adap->sge.fw_evtq.abs_id))); + req->params = htons(L2T_W_PORT(e->lport) | L2T_W_NOREPLY(!sync)); + req->l2t_idx = htons(e->idx); + req->vlan = htons(e->vlan); + if (e->neigh && !(e->neigh->dev->flags & IFF_LOOPBACK)) + memcpy(e->dmac, e->neigh->ha, sizeof(e->dmac)); + memcpy(req->dst_mac, e->dmac, sizeof(req->dst_mac)); + + set_wr_txq(skb, CPL_PRIORITY_CONTROL, 0); + t4_ofld_send(adap, skb); + + if (sync && e->state != L2T_STATE_SWITCHING) + e->state = L2T_STATE_SYNC_WRITE; + return 0; +} + +/* + * Send packets waiting in an L2T entry's ARP queue. Must be called with the + * entry locked. + */ +static void send_pending(struct adapter *adap, struct l2t_entry *e) +{ + while (e->arpq_head) { + struct sk_buff *skb = e->arpq_head; + + e->arpq_head = skb->next; + skb->next = NULL; + t4_ofld_send(adap, skb); + } + e->arpq_tail = NULL; +} + +/* + * Process a CPL_L2T_WRITE_RPL. Wake up the ARP queue if it completes a + * synchronous L2T_WRITE. Note that the TID in the reply is really the L2T + * index it refers to. + */ +void do_l2t_write_rpl(struct adapter *adap, const struct cpl_l2t_write_rpl *rpl) +{ + unsigned int tid = GET_TID(rpl); + unsigned int idx = tid & (L2T_SIZE - 1); + + if (unlikely(rpl->status != CPL_ERR_NONE)) { + dev_err(adap->pdev_dev, + "Unexpected L2T_WRITE_RPL status %u for entry %u\n", + rpl->status, idx); + return; + } + + if (tid & F_SYNC_WR) { + struct l2t_entry *e = &adap->l2t->l2tab[idx]; + + spin_lock(&e->lock); + if (e->state != L2T_STATE_SWITCHING) { + send_pending(adap, e); + e->state = (e->neigh->nud_state & NUD_STALE) ? + L2T_STATE_STALE : L2T_STATE_VALID; + } + spin_unlock(&e->lock); + } +} + +/* + * Add a packet to an L2T entry's queue of packets awaiting resolution. + * Must be called with the entry's lock held. + */ +static inline void arpq_enqueue(struct l2t_entry *e, struct sk_buff *skb) +{ + skb->next = NULL; + if (e->arpq_head) + e->arpq_tail->next = skb; + else + e->arpq_head = skb; + e->arpq_tail = skb; +} + +int cxgb4_l2t_send(struct net_device *dev, struct sk_buff *skb, + struct l2t_entry *e) +{ + struct adapter *adap = netdev2adap(dev); + +again: + switch (e->state) { + case L2T_STATE_STALE: /* entry is stale, kick off revalidation */ + neigh_event_send(e->neigh, NULL); + spin_lock_bh(&e->lock); + if (e->state == L2T_STATE_STALE) + e->state = L2T_STATE_VALID; + spin_unlock_bh(&e->lock); + case L2T_STATE_VALID: /* fast-path, send the packet on */ + return t4_ofld_send(adap, skb); + case L2T_STATE_RESOLVING: + case L2T_STATE_SYNC_WRITE: + spin_lock_bh(&e->lock); + if (e->state != L2T_STATE_SYNC_WRITE && + e->state != L2T_STATE_RESOLVING) { + spin_unlock_bh(&e->lock); + goto again; + } + arpq_enqueue(e, skb); + spin_unlock_bh(&e->lock); + + if (e->state == L2T_STATE_RESOLVING && + !neigh_event_send(e->neigh, NULL)) { + spin_lock_bh(&e->lock); + if (e->state == L2T_STATE_RESOLVING && e->arpq_head) + write_l2e(adap, e, 1); + spin_unlock_bh(&e->lock); + } + } + return 0; +} +EXPORT_SYMBOL(cxgb4_l2t_send); + +/* + * Allocate a free L2T entry. Must be called with l2t_data.lock held. + */ +static struct l2t_entry *alloc_l2e(struct l2t_data *d) +{ + struct l2t_entry *end, *e, **p; + + if (!atomic_read(&d->nfree)) + return NULL; + + /* there's definitely a free entry */ + for (e = d->rover, end = &d->l2tab[L2T_SIZE]; e != end; ++e) + if (atomic_read(&e->refcnt) == 0) + goto found; + + for (e = d->l2tab; atomic_read(&e->refcnt); ++e) + ; +found: + d->rover = e + 1; + atomic_dec(&d->nfree); + + /* + * The entry we found may be an inactive entry that is + * presently in the hash table. We need to remove it. + */ + if (e->state < L2T_STATE_SWITCHING) + for (p = &d->l2tab[e->hash].first; *p; p = &(*p)->next) + if (*p == e) { + *p = e->next; + e->next = NULL; + break; + } + + e->state = L2T_STATE_UNUSED; + return e; +} + +/* + * Called when an L2T entry has no more users. + */ +static void t4_l2e_free(struct l2t_entry *e) +{ + struct l2t_data *d; + + spin_lock_bh(&e->lock); + if (atomic_read(&e->refcnt) == 0) { /* hasn't been recycled */ + if (e->neigh) { + neigh_release(e->neigh); + e->neigh = NULL; + } + while (e->arpq_head) { + struct sk_buff *skb = e->arpq_head; + + e->arpq_head = skb->next; + kfree_skb(skb); + } + e->arpq_tail = NULL; + } + spin_unlock_bh(&e->lock); + + d = container_of(e, struct l2t_data, l2tab[e->idx]); + atomic_inc(&d->nfree); +} + +void cxgb4_l2t_release(struct l2t_entry *e) +{ + if (atomic_dec_and_test(&e->refcnt)) + t4_l2e_free(e); +} +EXPORT_SYMBOL(cxgb4_l2t_release); + +/* + * Update an L2T entry that was previously used for the same next hop as neigh. + * Must be called with softirqs disabled. + */ +static void reuse_entry(struct l2t_entry *e, struct neighbour *neigh) +{ + unsigned int nud_state; + + spin_lock(&e->lock); /* avoid race with t4_l2t_free */ + if (neigh != e->neigh) + neigh_replace(e, neigh); + nud_state = neigh->nud_state; + if (memcmp(e->dmac, neigh->ha, sizeof(e->dmac)) || + !(nud_state & NUD_VALID)) + e->state = L2T_STATE_RESOLVING; + else if (nud_state & NUD_CONNECTED) + e->state = L2T_STATE_VALID; + else + e->state = L2T_STATE_STALE; + spin_unlock(&e->lock); +} + +struct l2t_entry *cxgb4_l2t_get(struct l2t_data *d, struct neighbour *neigh, + const struct net_device *physdev, + unsigned int priority) +{ + u8 lport; + u16 vlan; + struct l2t_entry *e; + int addr_len = neigh->tbl->key_len; + u32 *addr = (u32 *)neigh->primary_key; + int ifidx = neigh->dev->ifindex; + int hash = addr_hash(addr, addr_len, ifidx); + + if (neigh->dev->flags & IFF_LOOPBACK) + lport = netdev2pinfo(physdev)->tx_chan + 4; + else + lport = netdev2pinfo(physdev)->lport; + + if (neigh->dev->priv_flags & IFF_802_1Q_VLAN) + vlan = vlan_dev_vlan_id(neigh->dev); + else + vlan = VLAN_NONE; + + write_lock_bh(&d->lock); + for (e = d->l2tab[hash].first; e; e = e->next) + if (!addreq(e, addr) && e->ifindex == ifidx && + e->vlan == vlan && e->lport == lport) { + l2t_hold(d, e); + if (atomic_read(&e->refcnt) == 1) + reuse_entry(e, neigh); + goto done; + } + + /* Need to allocate a new entry */ + e = alloc_l2e(d); + if (e) { + spin_lock(&e->lock); /* avoid race with t4_l2t_free */ + e->state = L2T_STATE_RESOLVING; + if (neigh->dev->flags & IFF_LOOPBACK) + memcpy(e->dmac, physdev->dev_addr, sizeof(e->dmac)); + memcpy(e->addr, addr, addr_len); + e->ifindex = ifidx; + e->hash = hash; + e->lport = lport; + e->v6 = addr_len == 16; + atomic_set(&e->refcnt, 1); + neigh_replace(e, neigh); + e->vlan = vlan; + e->next = d->l2tab[hash].first; + d->l2tab[hash].first = e; + spin_unlock(&e->lock); + } +done: + write_unlock_bh(&d->lock); + return e; +} +EXPORT_SYMBOL(cxgb4_l2t_get); + +u64 cxgb4_select_ntuple(struct net_device *dev, + const struct l2t_entry *l2t) +{ + struct adapter *adap = netdev2adap(dev); + struct tp_params *tp = &adap->params.tp; + u64 ntuple = 0; + + /* Initialize each of the fields which we care about which are present + * in the Compressed Filter Tuple. + */ + if (tp->vlan_shift >= 0 && l2t->vlan != VLAN_NONE) + ntuple |= (u64)(F_FT_VLAN_VLD | l2t->vlan) << tp->vlan_shift; + + if (tp->port_shift >= 0) + ntuple |= (u64)l2t->lport << tp->port_shift; + + if (tp->protocol_shift >= 0) + ntuple |= (u64)IPPROTO_TCP << tp->protocol_shift; + + if (tp->vnic_shift >= 0) { + u32 viid = cxgb4_port_viid(dev); + u32 vf = FW_VIID_VIN_GET(viid); + u32 pf = FW_VIID_PFN_GET(viid); + u32 vld = FW_VIID_VIVLD_GET(viid); + + ntuple |= (u64)(V_FT_VNID_ID_VF(vf) | + V_FT_VNID_ID_PF(pf) | + V_FT_VNID_ID_VLD(vld)) << tp->vnic_shift; + } + + return ntuple; +} +EXPORT_SYMBOL(cxgb4_select_ntuple); + +/* + * Called when address resolution fails for an L2T entry to handle packets + * on the arpq head. If a packet specifies a failure handler it is invoked, + * otherwise the packet is sent to the device. + */ +static void handle_failed_resolution(struct adapter *adap, struct sk_buff *arpq) +{ + while (arpq) { + struct sk_buff *skb = arpq; + const struct l2t_skb_cb *cb = L2T_SKB_CB(skb); + + arpq = skb->next; + skb->next = NULL; + if (cb->arp_err_handler) + cb->arp_err_handler(cb->handle, skb); + else + t4_ofld_send(adap, skb); + } +} + +/* + * Called when the host's neighbor layer makes a change to some entry that is + * loaded into the HW L2 table. + */ +void t4_l2t_update(struct adapter *adap, struct neighbour *neigh) +{ + struct l2t_entry *e; + struct sk_buff *arpq = NULL; + struct l2t_data *d = adap->l2t; + int addr_len = neigh->tbl->key_len; + u32 *addr = (u32 *) neigh->primary_key; + int ifidx = neigh->dev->ifindex; + int hash = addr_hash(addr, addr_len, ifidx); + + read_lock_bh(&d->lock); + for (e = d->l2tab[hash].first; e; e = e->next) + if (!addreq(e, addr) && e->ifindex == ifidx) { + spin_lock(&e->lock); + if (atomic_read(&e->refcnt)) + goto found; + spin_unlock(&e->lock); + break; + } + read_unlock_bh(&d->lock); + return; + + found: + read_unlock(&d->lock); + + if (neigh != e->neigh) + neigh_replace(e, neigh); + + if (e->state == L2T_STATE_RESOLVING) { + if (neigh->nud_state & NUD_FAILED) { + arpq = e->arpq_head; + e->arpq_head = e->arpq_tail = NULL; + } else if ((neigh->nud_state & (NUD_CONNECTED | NUD_STALE)) && + e->arpq_head) { + write_l2e(adap, e, 1); + } + } else { + e->state = neigh->nud_state & NUD_CONNECTED ? + L2T_STATE_VALID : L2T_STATE_STALE; + if (memcmp(e->dmac, neigh->ha, sizeof(e->dmac))) + write_l2e(adap, e, 0); + } + + spin_unlock_bh(&e->lock); + + if (arpq) + handle_failed_resolution(adap, arpq); +} + +/* Allocate an L2T entry for use by a switching rule. Such need to be + * explicitly freed and while busy they are not on any hash chain, so normal + * address resolution updates do not see them. + */ +struct l2t_entry *t4_l2t_alloc_switching(struct l2t_data *d) +{ + struct l2t_entry *e; + + write_lock_bh(&d->lock); + e = alloc_l2e(d); + if (e) { + spin_lock(&e->lock); /* avoid race with t4_l2t_free */ + e->state = L2T_STATE_SWITCHING; + atomic_set(&e->refcnt, 1); + spin_unlock(&e->lock); + } + write_unlock_bh(&d->lock); + return e; +} + +/* Sets/updates the contents of a switching L2T entry that has been allocated + * with an earlier call to @t4_l2t_alloc_switching. + */ +int t4_l2t_set_switching(struct adapter *adap, struct l2t_entry *e, u16 vlan, + u8 port, u8 *eth_addr) +{ + e->vlan = vlan; + e->lport = port; + memcpy(e->dmac, eth_addr, ETH_ALEN); + return write_l2e(adap, e, 0); +} + +struct l2t_data *t4_init_l2t(void) +{ + int i; + struct l2t_data *d; + + d = t4_alloc_mem(sizeof(*d)); + if (!d) + return NULL; + + d->rover = d->l2tab; + atomic_set(&d->nfree, L2T_SIZE); + rwlock_init(&d->lock); + + for (i = 0; i < L2T_SIZE; ++i) { + d->l2tab[i].idx = i; + d->l2tab[i].state = L2T_STATE_UNUSED; + spin_lock_init(&d->l2tab[i].lock); + atomic_set(&d->l2tab[i].refcnt, 0); + } + return d; +} + +static inline void *l2t_get_idx(struct seq_file *seq, loff_t pos) +{ + struct l2t_entry *l2tab = seq->private; + + return pos >= L2T_SIZE ? NULL : &l2tab[pos]; +} + +static void *l2t_seq_start(struct seq_file *seq, loff_t *pos) +{ + return *pos ? l2t_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; +} + +static void *l2t_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + v = l2t_get_idx(seq, *pos); + if (v) + ++*pos; + return v; +} + +static void l2t_seq_stop(struct seq_file *seq, void *v) +{ +} + +static char l2e_state(const struct l2t_entry *e) +{ + switch (e->state) { + case L2T_STATE_VALID: return 'V'; + case L2T_STATE_STALE: return 'S'; + case L2T_STATE_SYNC_WRITE: return 'W'; + case L2T_STATE_RESOLVING: return e->arpq_head ? 'A' : 'R'; + case L2T_STATE_SWITCHING: return 'X'; + default: + return 'U'; + } +} + +static int l2t_seq_show(struct seq_file *seq, void *v) +{ + if (v == SEQ_START_TOKEN) + seq_puts(seq, " Idx IP address " + "Ethernet address VLAN/P LP State Users Port\n"); + else { + char ip[60]; + struct l2t_entry *e = v; + + spin_lock_bh(&e->lock); + if (e->state == L2T_STATE_SWITCHING) + ip[0] = '\0'; + else + sprintf(ip, e->v6 ? "%pI6c" : "%pI4", e->addr); + seq_printf(seq, "%4u %-25s %17pM %4d %u %2u %c %5u %s\n", + e->idx, ip, e->dmac, + e->vlan & VLAN_VID_MASK, vlan_prio(e), e->lport, + l2e_state(e), atomic_read(&e->refcnt), + e->neigh ? e->neigh->dev->name : ""); + spin_unlock_bh(&e->lock); + } + return 0; +} + +static const struct seq_operations l2t_seq_ops = { + .start = l2t_seq_start, + .next = l2t_seq_next, + .stop = l2t_seq_stop, + .show = l2t_seq_show +}; + +static int l2t_seq_open(struct inode *inode, struct file *file) +{ + int rc = seq_open(file, &l2t_seq_ops); + + if (!rc) { + struct adapter *adap = inode->i_private; + struct seq_file *seq = file->private_data; + + seq->private = adap->l2t->l2tab; + } + return rc; +} + +const struct file_operations t4_l2t_fops = { + .owner = THIS_MODULE, + .open = l2t_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; diff --git a/drivers/net/ethernet/chelsio/cxgb4/l2t.h b/drivers/net/ethernet/chelsio/cxgb4/l2t.h new file mode 100644 index 0000000..a30126c --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb4/l2t.h @@ -0,0 +1,111 @@ +/* + * This file is part of the Chelsio T4 Ethernet driver for Linux. + * + * Copyright (c) 2003-2014 Chelsio Communications, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __CXGB4_L2T_H +#define __CXGB4_L2T_H + +#include +#include +#include + +struct adapter; +struct l2t_data; +struct neighbour; +struct net_device; +struct file_operations; +struct cpl_l2t_write_rpl; + +/* + * Each L2T entry plays multiple roles. First of all, it keeps state for the + * corresponding entry of the HW L2 table and maintains a queue of offload + * packets awaiting address resolution. Second, it is a node of a hash table + * chain, where the nodes of the chain are linked together through their next + * pointer. Finally, each node is a bucket of a hash table, pointing to the + * first element in its chain through its first pointer. + */ +struct l2t_entry { + u16 state; /* entry state */ + u16 idx; /* entry index */ + u32 addr[4]; /* next hop IP or IPv6 address */ + int ifindex; /* neighbor's net_device's ifindex */ + struct neighbour *neigh; /* associated neighbour */ + struct l2t_entry *first; /* start of hash chain */ + struct l2t_entry *next; /* next l2t_entry on chain */ + struct sk_buff *arpq_head; /* queue of packets awaiting resolution */ + struct sk_buff *arpq_tail; + spinlock_t lock; + atomic_t refcnt; /* entry reference count */ + u16 hash; /* hash bucket the entry is on */ + u16 vlan; /* VLAN TCI (id: bits 0-11, prio: 13-15 */ + u8 v6; /* whether entry is for IPv6 */ + u8 lport; /* associated offload logical interface */ + u8 dmac[ETH_ALEN]; /* neighbour's MAC address */ +}; + +typedef void (*arp_err_handler_t)(void *handle, struct sk_buff *skb); + +/* + * Callback stored in an skb to handle address resolution failure. + */ +struct l2t_skb_cb { + void *handle; + arp_err_handler_t arp_err_handler; +}; + +#define L2T_SKB_CB(skb) ((struct l2t_skb_cb *)(skb)->cb) + +static inline void t4_set_arp_err_handler(struct sk_buff *skb, void *handle, + arp_err_handler_t handler) +{ + L2T_SKB_CB(skb)->handle = handle; + L2T_SKB_CB(skb)->arp_err_handler = handler; +} + +void cxgb4_l2t_release(struct l2t_entry *e); +int cxgb4_l2t_send(struct net_device *dev, struct sk_buff *skb, + struct l2t_entry *e); +struct l2t_entry *cxgb4_l2t_get(struct l2t_data *d, struct neighbour *neigh, + const struct net_device *physdev, + unsigned int priority); +u64 cxgb4_select_ntuple(struct net_device *dev, + const struct l2t_entry *l2t); +void t4_l2t_update(struct adapter *adap, struct neighbour *neigh); +struct l2t_entry *t4_l2t_alloc_switching(struct l2t_data *d); +int t4_l2t_set_switching(struct adapter *adap, struct l2t_entry *e, u16 vlan, + u8 port, u8 *eth_addr); +struct l2t_data *t4_init_l2t(void); +void do_l2t_write_rpl(struct adapter *p, const struct cpl_l2t_write_rpl *rpl); + +extern const struct file_operations t4_l2t_fops; +#endif /* __CXGB4_L2T_H */ diff --git a/drivers/net/ethernet/chelsio/cxgb4/sge.c b/drivers/net/ethernet/chelsio/cxgb4/sge.c new file mode 100644 index 0000000..39f2b13 --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb4/sge.c @@ -0,0 +1,2988 @@ +/* + * This file is part of the Chelsio T4 Ethernet driver for Linux. + * + * Copyright (c) 2003-2014 Chelsio Communications, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "cxgb4.h" +#include "t4_regs.h" +#include "t4_msg.h" +#include "t4fw_api.h" + +/* + * Rx buffer size. We use largish buffers if possible but settle for single + * pages under memory shortage. + */ +#if PAGE_SHIFT >= 16 +# define FL_PG_ORDER 0 +#else +# define FL_PG_ORDER (16 - PAGE_SHIFT) +#endif + +/* RX_PULL_LEN should be <= RX_COPY_THRES */ +#define RX_COPY_THRES 256 +#define RX_PULL_LEN 128 + +/* + * Main body length for sk_buffs used for Rx Ethernet packets with fragments. + * Should be >= RX_PULL_LEN but possibly bigger to give pskb_may_pull some room. + */ +#define RX_PKT_SKB_LEN 512 + +/* + * Max number of Tx descriptors we clean up at a time. Should be modest as + * freeing skbs isn't cheap and it happens while holding locks. We just need + * to free packets faster than they arrive, we eventually catch up and keep + * the amortized cost reasonable. Must be >= 2 * TXQ_STOP_THRES. + */ +#define MAX_TX_RECLAIM 16 + +/* + * Max number of Rx buffers we replenish at a time. Again keep this modest, + * allocating buffers isn't cheap either. + */ +#define MAX_RX_REFILL 16U + +/* + * Period of the Rx queue check timer. This timer is infrequent as it has + * something to do only when the system experiences severe memory shortage. + */ +#define RX_QCHECK_PERIOD (HZ / 2) + +/* + * Period of the Tx queue check timer. + */ +#define TX_QCHECK_PERIOD (HZ / 2) + +/* SGE Hung Ingress DMA Threshold Warning time (in Hz) and Warning Repeat Rate + * (in RX_QCHECK_PERIOD multiples). If we find one of the SGE Ingress DMA + * State Machines in the same state for this amount of time (in HZ) then we'll + * issue a warning about a potential hang. We'll repeat the warning as the + * SGE Ingress DMA Channel appears to be hung every N RX_QCHECK_PERIODs till + * the situation clears. If the situation clears, we'll note that as well. + */ +#define SGE_IDMA_WARN_THRESH (1 * HZ) +#define SGE_IDMA_WARN_REPEAT (20 * RX_QCHECK_PERIOD) + +/* + * Max number of Tx descriptors to be reclaimed by the Tx timer. + */ +#define MAX_TIMER_TX_RECLAIM 100 + +/* + * Timer index used when backing off due to memory shortage. + */ +#define NOMEM_TMR_IDX (SGE_NTIMERS - 1) + +/* + * An FL with <= FL_STARVE_THRES buffers is starving and a periodic timer will + * attempt to refill it. + */ +#define FL_STARVE_THRES 4 + +/* + * Suspend an Ethernet Tx queue with fewer available descriptors than this. + * This is the same as calc_tx_descs() for a TSO packet with + * nr_frags == MAX_SKB_FRAGS. + */ +#define ETHTXQ_STOP_THRES \ + (1 + DIV_ROUND_UP((3 * MAX_SKB_FRAGS) / 2 + (MAX_SKB_FRAGS & 1), 8)) + +/* + * Suspension threshold for non-Ethernet Tx queues. We require enough room + * for a full sized WR. + */ +#define TXQ_STOP_THRES (SGE_MAX_WR_LEN / sizeof(struct tx_desc)) + +/* + * Max Tx descriptor space we allow for an Ethernet packet to be inlined + * into a WR. + */ +#define MAX_IMM_TX_PKT_LEN 128 + +/* + * Max size of a WR sent through a control Tx queue. + */ +#define MAX_CTRL_WR_LEN SGE_MAX_WR_LEN + +struct tx_sw_desc { /* SW state per Tx descriptor */ + struct sk_buff *skb; + struct ulptx_sgl *sgl; +}; + +struct rx_sw_desc { /* SW state per Rx descriptor */ + struct page *page; + dma_addr_t dma_addr; +}; + +/* + * Rx buffer sizes for "useskbs" Free List buffers (one ingress packet pe skb + * buffer). We currently only support two sizes for 1500- and 9000-byte MTUs. + * We could easily support more but there doesn't seem to be much need for + * that ... + */ +#define FL_MTU_SMALL 1500 +#define FL_MTU_LARGE 9000 + +static inline unsigned int fl_mtu_bufsize(struct adapter *adapter, + unsigned int mtu) +{ + struct sge *s = &adapter->sge; + + return ALIGN(s->pktshift + ETH_HLEN + VLAN_HLEN + mtu, s->fl_align); +} + +#define FL_MTU_SMALL_BUFSIZE(adapter) fl_mtu_bufsize(adapter, FL_MTU_SMALL) +#define FL_MTU_LARGE_BUFSIZE(adapter) fl_mtu_bufsize(adapter, FL_MTU_LARGE) + +/* + * Bits 0..3 of rx_sw_desc.dma_addr have special meaning. The hardware uses + * these to specify the buffer size as an index into the SGE Free List Buffer + * Size register array. We also use bit 4, when the buffer has been unmapped + * for DMA, but this is of course never sent to the hardware and is only used + * to prevent double unmappings. All of the above requires that the Free List + * Buffers which we allocate have the bottom 5 bits free (0) -- i.e. are + * 32-byte or or a power of 2 greater in alignment. Since the SGE's minimal + * Free List Buffer alignment is 32 bytes, this works out for us ... + */ +enum { + RX_BUF_FLAGS = 0x1f, /* bottom five bits are special */ + RX_BUF_SIZE = 0x0f, /* bottom three bits are for buf sizes */ + RX_UNMAPPED_BUF = 0x10, /* buffer is not mapped */ + + /* + * XXX We shouldn't depend on being able to use these indices. + * XXX Especially when some other Master PF has initialized the + * XXX adapter or we use the Firmware Configuration File. We + * XXX should really search through the Host Buffer Size register + * XXX array for the appropriately sized buffer indices. + */ + RX_SMALL_PG_BUF = 0x0, /* small (PAGE_SIZE) page buffer */ + RX_LARGE_PG_BUF = 0x1, /* buffer large (FL_PG_ORDER) page buffer */ + + RX_SMALL_MTU_BUF = 0x2, /* small MTU buffer */ + RX_LARGE_MTU_BUF = 0x3, /* large MTU buffer */ +}; + +static int timer_pkt_quota[] = {1, 1, 2, 3, 4, 5}; +#define MIN_NAPI_WORK 1 + +static inline dma_addr_t get_buf_addr(const struct rx_sw_desc *d) +{ + return d->dma_addr & ~(dma_addr_t)RX_BUF_FLAGS; +} + +static inline bool is_buf_mapped(const struct rx_sw_desc *d) +{ + return !(d->dma_addr & RX_UNMAPPED_BUF); +} + +/** + * txq_avail - return the number of available slots in a Tx queue + * @q: the Tx queue + * + * Returns the number of descriptors in a Tx queue available to write new + * packets. + */ +static inline unsigned int txq_avail(const struct sge_txq *q) +{ + return q->size - 1 - q->in_use; +} + +/** + * fl_cap - return the capacity of a free-buffer list + * @fl: the FL + * + * Returns the capacity of a free-buffer list. The capacity is less than + * the size because one descriptor needs to be left unpopulated, otherwise + * HW will think the FL is empty. + */ +static inline unsigned int fl_cap(const struct sge_fl *fl) +{ + return fl->size - 8; /* 1 descriptor = 8 buffers */ +} + +static inline bool fl_starving(const struct sge_fl *fl) +{ + return fl->avail - fl->pend_cred <= FL_STARVE_THRES; +} + +static int map_skb(struct device *dev, const struct sk_buff *skb, + dma_addr_t *addr) +{ + const skb_frag_t *fp, *end; + const struct skb_shared_info *si; + + *addr = dma_map_single(dev, skb->data, skb_headlen(skb), DMA_TO_DEVICE); + if (dma_mapping_error(dev, *addr)) + goto out_err; + + si = skb_shinfo(skb); + end = &si->frags[si->nr_frags]; + + for (fp = si->frags; fp < end; fp++) { + *++addr = skb_frag_dma_map(dev, fp, 0, skb_frag_size(fp), + DMA_TO_DEVICE); + if (dma_mapping_error(dev, *addr)) + goto unwind; + } + return 0; + +unwind: + while (fp-- > si->frags) + dma_unmap_page(dev, *--addr, skb_frag_size(fp), DMA_TO_DEVICE); + + dma_unmap_single(dev, addr[-1], skb_headlen(skb), DMA_TO_DEVICE); +out_err: + return -ENOMEM; +} + +#ifdef CONFIG_NEED_DMA_MAP_STATE +static void unmap_skb(struct device *dev, const struct sk_buff *skb, + const dma_addr_t *addr) +{ + const skb_frag_t *fp, *end; + const struct skb_shared_info *si; + + dma_unmap_single(dev, *addr++, skb_headlen(skb), DMA_TO_DEVICE); + + si = skb_shinfo(skb); + end = &si->frags[si->nr_frags]; + for (fp = si->frags; fp < end; fp++) + dma_unmap_page(dev, *addr++, skb_frag_size(fp), DMA_TO_DEVICE); +} + +/** + * deferred_unmap_destructor - unmap a packet when it is freed + * @skb: the packet + * + * This is the packet destructor used for Tx packets that need to remain + * mapped until they are freed rather than until their Tx descriptors are + * freed. + */ +static void deferred_unmap_destructor(struct sk_buff *skb) +{ + unmap_skb(skb->dev->dev.parent, skb, (dma_addr_t *)skb->head); +} +#endif + +static void unmap_sgl(struct device *dev, const struct sk_buff *skb, + const struct ulptx_sgl *sgl, const struct sge_txq *q) +{ + const struct ulptx_sge_pair *p; + unsigned int nfrags = skb_shinfo(skb)->nr_frags; + + if (likely(skb_headlen(skb))) + dma_unmap_single(dev, be64_to_cpu(sgl->addr0), ntohl(sgl->len0), + DMA_TO_DEVICE); + else { + dma_unmap_page(dev, be64_to_cpu(sgl->addr0), ntohl(sgl->len0), + DMA_TO_DEVICE); + nfrags--; + } + + /* + * the complexity below is because of the possibility of a wrap-around + * in the middle of an SGL + */ + for (p = sgl->sge; nfrags >= 2; nfrags -= 2) { + if (likely((u8 *)(p + 1) <= (u8 *)q->stat)) { +unmap: dma_unmap_page(dev, be64_to_cpu(p->addr[0]), + ntohl(p->len[0]), DMA_TO_DEVICE); + dma_unmap_page(dev, be64_to_cpu(p->addr[1]), + ntohl(p->len[1]), DMA_TO_DEVICE); + p++; + } else if ((u8 *)p == (u8 *)q->stat) { + p = (const struct ulptx_sge_pair *)q->desc; + goto unmap; + } else if ((u8 *)p + 8 == (u8 *)q->stat) { + const __be64 *addr = (const __be64 *)q->desc; + + dma_unmap_page(dev, be64_to_cpu(addr[0]), + ntohl(p->len[0]), DMA_TO_DEVICE); + dma_unmap_page(dev, be64_to_cpu(addr[1]), + ntohl(p->len[1]), DMA_TO_DEVICE); + p = (const struct ulptx_sge_pair *)&addr[2]; + } else { + const __be64 *addr = (const __be64 *)q->desc; + + dma_unmap_page(dev, be64_to_cpu(p->addr[0]), + ntohl(p->len[0]), DMA_TO_DEVICE); + dma_unmap_page(dev, be64_to_cpu(addr[0]), + ntohl(p->len[1]), DMA_TO_DEVICE); + p = (const struct ulptx_sge_pair *)&addr[1]; + } + } + if (nfrags) { + __be64 addr; + + if ((u8 *)p == (u8 *)q->stat) + p = (const struct ulptx_sge_pair *)q->desc; + addr = (u8 *)p + 16 <= (u8 *)q->stat ? p->addr[0] : + *(const __be64 *)q->desc; + dma_unmap_page(dev, be64_to_cpu(addr), ntohl(p->len[0]), + DMA_TO_DEVICE); + } +} + +/** + * free_tx_desc - reclaims Tx descriptors and their buffers + * @adapter: the adapter + * @q: the Tx queue to reclaim descriptors from + * @n: the number of descriptors to reclaim + * @unmap: whether the buffers should be unmapped for DMA + * + * Reclaims Tx descriptors from an SGE Tx queue and frees the associated + * Tx buffers. Called with the Tx queue lock held. + */ +static void free_tx_desc(struct adapter *adap, struct sge_txq *q, + unsigned int n, bool unmap) +{ + struct tx_sw_desc *d; + unsigned int cidx = q->cidx; + struct device *dev = adap->pdev_dev; + + d = &q->sdesc[cidx]; + while (n--) { + if (d->skb) { /* an SGL is present */ + if (unmap) + unmap_sgl(dev, d->skb, d->sgl, q); + dev_consume_skb_any(d->skb); + d->skb = NULL; + } + ++d; + if (++cidx == q->size) { + cidx = 0; + d = q->sdesc; + } + } + q->cidx = cidx; +} + +/* + * Return the number of reclaimable descriptors in a Tx queue. + */ +static inline int reclaimable(const struct sge_txq *q) +{ + int hw_cidx = ntohs(q->stat->cidx); + hw_cidx -= q->cidx; + return hw_cidx < 0 ? hw_cidx + q->size : hw_cidx; +} + +/** + * reclaim_completed_tx - reclaims completed Tx descriptors + * @adap: the adapter + * @q: the Tx queue to reclaim completed descriptors from + * @unmap: whether the buffers should be unmapped for DMA + * + * Reclaims Tx descriptors that the SGE has indicated it has processed, + * and frees the associated buffers if possible. Called with the Tx + * queue locked. + */ +static inline void reclaim_completed_tx(struct adapter *adap, struct sge_txq *q, + bool unmap) +{ + int avail = reclaimable(q); + + if (avail) { + /* + * Limit the amount of clean up work we do at a time to keep + * the Tx lock hold time O(1). + */ + if (avail > MAX_TX_RECLAIM) + avail = MAX_TX_RECLAIM; + + free_tx_desc(adap, q, avail, unmap); + q->in_use -= avail; + } +} + +static inline int get_buf_size(struct adapter *adapter, + const struct rx_sw_desc *d) +{ + struct sge *s = &adapter->sge; + unsigned int rx_buf_size_idx = d->dma_addr & RX_BUF_SIZE; + int buf_size; + + switch (rx_buf_size_idx) { + case RX_SMALL_PG_BUF: + buf_size = PAGE_SIZE; + break; + + case RX_LARGE_PG_BUF: + buf_size = PAGE_SIZE << s->fl_pg_order; + break; + + case RX_SMALL_MTU_BUF: + buf_size = FL_MTU_SMALL_BUFSIZE(adapter); + break; + + case RX_LARGE_MTU_BUF: + buf_size = FL_MTU_LARGE_BUFSIZE(adapter); + break; + + default: + BUG_ON(1); + } + + return buf_size; +} + +/** + * free_rx_bufs - free the Rx buffers on an SGE free list + * @adap: the adapter + * @q: the SGE free list to free buffers from + * @n: how many buffers to free + * + * Release the next @n buffers on an SGE free-buffer Rx queue. The + * buffers must be made inaccessible to HW before calling this function. + */ +static void free_rx_bufs(struct adapter *adap, struct sge_fl *q, int n) +{ + while (n--) { + struct rx_sw_desc *d = &q->sdesc[q->cidx]; + + if (is_buf_mapped(d)) + dma_unmap_page(adap->pdev_dev, get_buf_addr(d), + get_buf_size(adap, d), + PCI_DMA_FROMDEVICE); + put_page(d->page); + d->page = NULL; + if (++q->cidx == q->size) + q->cidx = 0; + q->avail--; + } +} + +/** + * unmap_rx_buf - unmap the current Rx buffer on an SGE free list + * @adap: the adapter + * @q: the SGE free list + * + * Unmap the current buffer on an SGE free-buffer Rx queue. The + * buffer must be made inaccessible to HW before calling this function. + * + * This is similar to @free_rx_bufs above but does not free the buffer. + * Do note that the FL still loses any further access to the buffer. + */ +static void unmap_rx_buf(struct adapter *adap, struct sge_fl *q) +{ + struct rx_sw_desc *d = &q->sdesc[q->cidx]; + + if (is_buf_mapped(d)) + dma_unmap_page(adap->pdev_dev, get_buf_addr(d), + get_buf_size(adap, d), PCI_DMA_FROMDEVICE); + d->page = NULL; + if (++q->cidx == q->size) + q->cidx = 0; + q->avail--; +} + +static inline void ring_fl_db(struct adapter *adap, struct sge_fl *q) +{ + u32 val; + if (q->pend_cred >= 8) { + val = PIDX(q->pend_cred / 8); + if (!is_t4(adap->params.chip)) + val |= DBTYPE(1); + val |= DBPRIO(1); + wmb(); + + /* If we're on T4, use the old doorbell mechanism; otherwise + * use the new BAR2 mechanism. + */ + if (is_t4(adap->params.chip)) { + t4_write_reg(adap, MYPF_REG(SGE_PF_KDOORBELL), + val | QID(q->cntxt_id)); + } else { + writel(val, adap->bar2 + q->udb + SGE_UDB_KDOORBELL); + + /* This Write memory Barrier will force the write to + * the User Doorbell area to be flushed. + */ + wmb(); + } + q->pend_cred &= 7; + } +} + +static inline void set_rx_sw_desc(struct rx_sw_desc *sd, struct page *pg, + dma_addr_t mapping) +{ + sd->page = pg; + sd->dma_addr = mapping; /* includes size low bits */ +} + +/** + * refill_fl - refill an SGE Rx buffer ring + * @adap: the adapter + * @q: the ring to refill + * @n: the number of new buffers to allocate + * @gfp: the gfp flags for the allocations + * + * (Re)populate an SGE free-buffer queue with up to @n new packet buffers, + * allocated with the supplied gfp flags. The caller must assure that + * @n does not exceed the queue's capacity. If afterwards the queue is + * found critically low mark it as starving in the bitmap of starving FLs. + * + * Returns the number of buffers allocated. + */ +static unsigned int refill_fl(struct adapter *adap, struct sge_fl *q, int n, + gfp_t gfp) +{ + struct sge *s = &adap->sge; + struct page *pg; + dma_addr_t mapping; + unsigned int cred = q->avail; + __be64 *d = &q->desc[q->pidx]; + struct rx_sw_desc *sd = &q->sdesc[q->pidx]; + + gfp |= __GFP_NOWARN | __GFP_COLD; + + if (s->fl_pg_order == 0) + goto alloc_small_pages; + + /* + * Prefer large buffers + */ + while (n) { + pg = alloc_pages(gfp | __GFP_COMP, s->fl_pg_order); + if (unlikely(!pg)) { + q->large_alloc_failed++; + break; /* fall back to single pages */ + } + + mapping = dma_map_page(adap->pdev_dev, pg, 0, + PAGE_SIZE << s->fl_pg_order, + PCI_DMA_FROMDEVICE); + if (unlikely(dma_mapping_error(adap->pdev_dev, mapping))) { + __free_pages(pg, s->fl_pg_order); + goto out; /* do not try small pages for this error */ + } + mapping |= RX_LARGE_PG_BUF; + *d++ = cpu_to_be64(mapping); + + set_rx_sw_desc(sd, pg, mapping); + sd++; + + q->avail++; + if (++q->pidx == q->size) { + q->pidx = 0; + sd = q->sdesc; + d = q->desc; + } + n--; + } + +alloc_small_pages: + while (n--) { + pg = __skb_alloc_page(gfp, NULL); + if (unlikely(!pg)) { + q->alloc_failed++; + break; + } + + mapping = dma_map_page(adap->pdev_dev, pg, 0, PAGE_SIZE, + PCI_DMA_FROMDEVICE); + if (unlikely(dma_mapping_error(adap->pdev_dev, mapping))) { + put_page(pg); + goto out; + } + *d++ = cpu_to_be64(mapping); + + set_rx_sw_desc(sd, pg, mapping); + sd++; + + q->avail++; + if (++q->pidx == q->size) { + q->pidx = 0; + sd = q->sdesc; + d = q->desc; + } + } + +out: cred = q->avail - cred; + q->pend_cred += cred; + ring_fl_db(adap, q); + + if (unlikely(fl_starving(q))) { + smp_wmb(); + set_bit(q->cntxt_id - adap->sge.egr_start, + adap->sge.starving_fl); + } + + return cred; +} + +static inline void __refill_fl(struct adapter *adap, struct sge_fl *fl) +{ + refill_fl(adap, fl, min(MAX_RX_REFILL, fl_cap(fl) - fl->avail), + GFP_ATOMIC); +} + +/** + * alloc_ring - allocate resources for an SGE descriptor ring + * @dev: the PCI device's core device + * @nelem: the number of descriptors + * @elem_size: the size of each descriptor + * @sw_size: the size of the SW state associated with each ring element + * @phys: the physical address of the allocated ring + * @metadata: address of the array holding the SW state for the ring + * @stat_size: extra space in HW ring for status information + * @node: preferred node for memory allocations + * + * Allocates resources for an SGE descriptor ring, such as Tx queues, + * free buffer lists, or response queues. Each SGE ring requires + * space for its HW descriptors plus, optionally, space for the SW state + * associated with each HW entry (the metadata). The function returns + * three values: the virtual address for the HW ring (the return value + * of the function), the bus address of the HW ring, and the address + * of the SW ring. + */ +static void *alloc_ring(struct device *dev, size_t nelem, size_t elem_size, + size_t sw_size, dma_addr_t *phys, void *metadata, + size_t stat_size, int node) +{ + size_t len = nelem * elem_size + stat_size; + void *s = NULL; + void *p = dma_alloc_coherent(dev, len, phys, GFP_KERNEL); + + if (!p) + return NULL; + if (sw_size) { + s = kzalloc_node(nelem * sw_size, GFP_KERNEL, node); + + if (!s) { + dma_free_coherent(dev, len, p, *phys); + return NULL; + } + } + if (metadata) + *(void **)metadata = s; + memset(p, 0, len); + return p; +} + +/** + * sgl_len - calculates the size of an SGL of the given capacity + * @n: the number of SGL entries + * + * Calculates the number of flits needed for a scatter/gather list that + * can hold the given number of entries. + */ +static inline unsigned int sgl_len(unsigned int n) +{ + n--; + return (3 * n) / 2 + (n & 1) + 2; +} + +/** + * flits_to_desc - returns the num of Tx descriptors for the given flits + * @n: the number of flits + * + * Returns the number of Tx descriptors needed for the supplied number + * of flits. + */ +static inline unsigned int flits_to_desc(unsigned int n) +{ + BUG_ON(n > SGE_MAX_WR_LEN / 8); + return DIV_ROUND_UP(n, 8); +} + +/** + * is_eth_imm - can an Ethernet packet be sent as immediate data? + * @skb: the packet + * + * Returns whether an Ethernet packet is small enough to fit as + * immediate data. Return value corresponds to headroom required. + */ +static inline int is_eth_imm(const struct sk_buff *skb) +{ + int hdrlen = skb_shinfo(skb)->gso_size ? + sizeof(struct cpl_tx_pkt_lso_core) : 0; + + hdrlen += sizeof(struct cpl_tx_pkt); + if (skb->len <= MAX_IMM_TX_PKT_LEN - hdrlen) + return hdrlen; + return 0; +} + +/** + * calc_tx_flits - calculate the number of flits for a packet Tx WR + * @skb: the packet + * + * Returns the number of flits needed for a Tx WR for the given Ethernet + * packet, including the needed WR and CPL headers. + */ +static inline unsigned int calc_tx_flits(const struct sk_buff *skb) +{ + unsigned int flits; + int hdrlen = is_eth_imm(skb); + + if (hdrlen) + return DIV_ROUND_UP(skb->len + hdrlen, sizeof(__be64)); + + flits = sgl_len(skb_shinfo(skb)->nr_frags + 1) + 4; + if (skb_shinfo(skb)->gso_size) + flits += 2; + return flits; +} + +/** + * calc_tx_descs - calculate the number of Tx descriptors for a packet + * @skb: the packet + * + * Returns the number of Tx descriptors needed for the given Ethernet + * packet, including the needed WR and CPL headers. + */ +static inline unsigned int calc_tx_descs(const struct sk_buff *skb) +{ + return flits_to_desc(calc_tx_flits(skb)); +} + +/** + * write_sgl - populate a scatter/gather list for a packet + * @skb: the packet + * @q: the Tx queue we are writing into + * @sgl: starting location for writing the SGL + * @end: points right after the end of the SGL + * @start: start offset into skb main-body data to include in the SGL + * @addr: the list of bus addresses for the SGL elements + * + * Generates a gather list for the buffers that make up a packet. + * The caller must provide adequate space for the SGL that will be written. + * The SGL includes all of the packet's page fragments and the data in its + * main body except for the first @start bytes. @sgl must be 16-byte + * aligned and within a Tx descriptor with available space. @end points + * right after the end of the SGL but does not account for any potential + * wrap around, i.e., @end > @sgl. + */ +static void write_sgl(const struct sk_buff *skb, struct sge_txq *q, + struct ulptx_sgl *sgl, u64 *end, unsigned int start, + const dma_addr_t *addr) +{ + unsigned int i, len; + struct ulptx_sge_pair *to; + const struct skb_shared_info *si = skb_shinfo(skb); + unsigned int nfrags = si->nr_frags; + struct ulptx_sge_pair buf[MAX_SKB_FRAGS / 2 + 1]; + + len = skb_headlen(skb) - start; + if (likely(len)) { + sgl->len0 = htonl(len); + sgl->addr0 = cpu_to_be64(addr[0] + start); + nfrags++; + } else { + sgl->len0 = htonl(skb_frag_size(&si->frags[0])); + sgl->addr0 = cpu_to_be64(addr[1]); + } + + sgl->cmd_nsge = htonl(ULPTX_CMD(ULP_TX_SC_DSGL) | ULPTX_NSGE(nfrags)); + if (likely(--nfrags == 0)) + return; + /* + * Most of the complexity below deals with the possibility we hit the + * end of the queue in the middle of writing the SGL. For this case + * only we create the SGL in a temporary buffer and then copy it. + */ + to = (u8 *)end > (u8 *)q->stat ? buf : sgl->sge; + + for (i = (nfrags != si->nr_frags); nfrags >= 2; nfrags -= 2, to++) { + to->len[0] = cpu_to_be32(skb_frag_size(&si->frags[i])); + to->len[1] = cpu_to_be32(skb_frag_size(&si->frags[++i])); + to->addr[0] = cpu_to_be64(addr[i]); + to->addr[1] = cpu_to_be64(addr[++i]); + } + if (nfrags) { + to->len[0] = cpu_to_be32(skb_frag_size(&si->frags[i])); + to->len[1] = cpu_to_be32(0); + to->addr[0] = cpu_to_be64(addr[i + 1]); + } + if (unlikely((u8 *)end > (u8 *)q->stat)) { + unsigned int part0 = (u8 *)q->stat - (u8 *)sgl->sge, part1; + + if (likely(part0)) + memcpy(sgl->sge, buf, part0); + part1 = (u8 *)end - (u8 *)q->stat; + memcpy(q->desc, (u8 *)buf + part0, part1); + end = (void *)q->desc + part1; + } + if ((uintptr_t)end & 8) /* 0-pad to multiple of 16 */ + *end = 0; +} + +/* This function copies a tx_desc struct to memory mapped BAR2 space(user space + * writes). For coalesced WR SGE, fetches data from the FIFO instead of from + * Host. + */ +static void cxgb_pio_copy(u64 __iomem *dst, struct tx_desc *desc) +{ + int count = sizeof(*desc) / sizeof(u64); + u64 *src = (u64 *)desc; + + while (count) { + writeq(*src, dst); + src++; + dst++; + count--; + } +} + +/** + * ring_tx_db - check and potentially ring a Tx queue's doorbell + * @adap: the adapter + * @q: the Tx queue + * @n: number of new descriptors to give to HW + * + * Ring the doorbel for a Tx queue. + */ +static inline void ring_tx_db(struct adapter *adap, struct sge_txq *q, int n) +{ + wmb(); /* write descriptors before telling HW */ + + if (is_t4(adap->params.chip)) { + u32 val = PIDX(n); + unsigned long flags; + + /* For T4 we need to participate in the Doorbell Recovery + * mechanism. + */ + spin_lock_irqsave(&q->db_lock, flags); + if (!q->db_disabled) + t4_write_reg(adap, MYPF_REG(SGE_PF_KDOORBELL), + QID(q->cntxt_id) | val); + else + q->db_pidx_inc += n; + q->db_pidx = q->pidx; + spin_unlock_irqrestore(&q->db_lock, flags); + } else { + u32 val = PIDX_T5(n); + + /* T4 and later chips share the same PIDX field offset within + * the doorbell, but T5 and later shrank the field in order to + * gain a bit for Doorbell Priority. The field was absurdly + * large in the first place (14 bits) so we just use the T5 + * and later limits and warn if a Queue ID is too large. + */ + WARN_ON(val & DBPRIO(1)); + + /* For T5 and later we use the Write-Combine mapped BAR2 User + * Doorbell mechanism. If we're only writing a single TX + * Descriptor and TX Write Combining hasn't been disabled, we + * can use the Write Combining Gather Buffer; otherwise we use + * the simple doorbell. + */ + if (n == 1) { + int index = (q->pidx + ? (q->pidx - 1) + : (q->size - 1)); + + cxgb_pio_copy(adap->bar2 + q->udb + SGE_UDB_WCDOORBELL, + q->desc + index); + } else { + writel(val, adap->bar2 + q->udb + SGE_UDB_KDOORBELL); + } + + /* This Write Memory Barrier will force the write to the User + * Doorbell area to be flushed. This is needed to prevent + * writes on different CPUs for the same queue from hitting + * the adapter out of order. This is required when some Work + * Requests take the Write Combine Gather Buffer path (user + * doorbell area offset [SGE_UDB_WCDOORBELL..+63]) and some + * take the traditional path where we simply increment the + * PIDX (User Doorbell area SGE_UDB_KDOORBELL) and have the + * hardware DMA read the actual Work Request. + */ + wmb(); + } +} + +/** + * inline_tx_skb - inline a packet's data into Tx descriptors + * @skb: the packet + * @q: the Tx queue where the packet will be inlined + * @pos: starting position in the Tx queue where to inline the packet + * + * Inline a packet's contents directly into Tx descriptors, starting at + * the given position within the Tx DMA ring. + * Most of the complexity of this operation is dealing with wrap arounds + * in the middle of the packet we want to inline. + */ +static void inline_tx_skb(const struct sk_buff *skb, const struct sge_txq *q, + void *pos) +{ + u64 *p; + int left = (void *)q->stat - pos; + + if (likely(skb->len <= left)) { + if (likely(!skb->data_len)) + skb_copy_from_linear_data(skb, pos, skb->len); + else + skb_copy_bits(skb, 0, pos, skb->len); + pos += skb->len; + } else { + skb_copy_bits(skb, 0, pos, left); + skb_copy_bits(skb, left, q->desc, skb->len - left); + pos = (void *)q->desc + (skb->len - left); + } + + /* 0-pad to multiple of 16 */ + p = PTR_ALIGN(pos, 8); + if ((uintptr_t)p & 8) + *p = 0; +} + +/* + * Figure out what HW csum a packet wants and return the appropriate control + * bits. + */ +static u64 hwcsum(const struct sk_buff *skb) +{ + int csum_type; + const struct iphdr *iph = ip_hdr(skb); + + if (iph->version == 4) { + if (iph->protocol == IPPROTO_TCP) + csum_type = TX_CSUM_TCPIP; + else if (iph->protocol == IPPROTO_UDP) + csum_type = TX_CSUM_UDPIP; + else { +nocsum: /* + * unknown protocol, disable HW csum + * and hope a bad packet is detected + */ + return TXPKT_L4CSUM_DIS; + } + } else { + /* + * this doesn't work with extension headers + */ + const struct ipv6hdr *ip6h = (const struct ipv6hdr *)iph; + + if (ip6h->nexthdr == IPPROTO_TCP) + csum_type = TX_CSUM_TCPIP6; + else if (ip6h->nexthdr == IPPROTO_UDP) + csum_type = TX_CSUM_UDPIP6; + else + goto nocsum; + } + + if (likely(csum_type >= TX_CSUM_TCPIP)) + return TXPKT_CSUM_TYPE(csum_type) | + TXPKT_IPHDR_LEN(skb_network_header_len(skb)) | + TXPKT_ETHHDR_LEN(skb_network_offset(skb) - ETH_HLEN); + else { + int start = skb_transport_offset(skb); + + return TXPKT_CSUM_TYPE(csum_type) | TXPKT_CSUM_START(start) | + TXPKT_CSUM_LOC(start + skb->csum_offset); + } +} + +static void eth_txq_stop(struct sge_eth_txq *q) +{ + netif_tx_stop_queue(q->txq); + q->q.stops++; +} + +static inline void txq_advance(struct sge_txq *q, unsigned int n) +{ + q->in_use += n; + q->pidx += n; + if (q->pidx >= q->size) + q->pidx -= q->size; +} + +/** + * t4_eth_xmit - add a packet to an Ethernet Tx queue + * @skb: the packet + * @dev: the egress net device + * + * Add a packet to an SGE Ethernet Tx queue. Runs with softirqs disabled. + */ +netdev_tx_t t4_eth_xmit(struct sk_buff *skb, struct net_device *dev) +{ + int len; + u32 wr_mid; + u64 cntrl, *end; + int qidx, credits; + unsigned int flits, ndesc; + struct adapter *adap; + struct sge_eth_txq *q; + const struct port_info *pi; + struct fw_eth_tx_pkt_wr *wr; + struct cpl_tx_pkt_core *cpl; + const struct skb_shared_info *ssi; + dma_addr_t addr[MAX_SKB_FRAGS + 1]; + bool immediate = false; + + /* + * The chip min packet length is 10 octets but play safe and reject + * anything shorter than an Ethernet header. + */ + if (unlikely(skb->len < ETH_HLEN)) { +out_free: dev_kfree_skb_any(skb); + return NETDEV_TX_OK; + } + + pi = netdev_priv(dev); + adap = pi->adapter; + qidx = skb_get_queue_mapping(skb); + q = &adap->sge.ethtxq[qidx + pi->first_qset]; + + reclaim_completed_tx(adap, &q->q, true); + + flits = calc_tx_flits(skb); + ndesc = flits_to_desc(flits); + credits = txq_avail(&q->q) - ndesc; + + if (unlikely(credits < 0)) { + eth_txq_stop(q); + dev_err(adap->pdev_dev, + "%s: Tx ring %u full while queue awake!\n", + dev->name, qidx); + return NETDEV_TX_BUSY; + } + + if (is_eth_imm(skb)) + immediate = true; + + if (!immediate && + unlikely(map_skb(adap->pdev_dev, skb, addr) < 0)) { + q->mapping_err++; + goto out_free; + } + + wr_mid = FW_WR_LEN16(DIV_ROUND_UP(flits, 2)); + if (unlikely(credits < ETHTXQ_STOP_THRES)) { + eth_txq_stop(q); + wr_mid |= FW_WR_EQUEQ | FW_WR_EQUIQ; + } + + wr = (void *)&q->q.desc[q->q.pidx]; + wr->equiq_to_len16 = htonl(wr_mid); + wr->r3 = cpu_to_be64(0); + end = (u64 *)wr + flits; + + len = immediate ? skb->len : 0; + ssi = skb_shinfo(skb); + if (ssi->gso_size) { + struct cpl_tx_pkt_lso *lso = (void *)wr; + bool v6 = (ssi->gso_type & SKB_GSO_TCPV6) != 0; + int l3hdr_len = skb_network_header_len(skb); + int eth_xtra_len = skb_network_offset(skb) - ETH_HLEN; + + len += sizeof(*lso); + wr->op_immdlen = htonl(FW_WR_OP(FW_ETH_TX_PKT_WR) | + FW_WR_IMMDLEN(len)); + lso->c.lso_ctrl = htonl(LSO_OPCODE(CPL_TX_PKT_LSO) | + LSO_FIRST_SLICE | LSO_LAST_SLICE | + LSO_IPV6(v6) | + LSO_ETHHDR_LEN(eth_xtra_len / 4) | + LSO_IPHDR_LEN(l3hdr_len / 4) | + LSO_TCPHDR_LEN(tcp_hdr(skb)->doff)); + lso->c.ipid_ofst = htons(0); + lso->c.mss = htons(ssi->gso_size); + lso->c.seqno_offset = htonl(0); + if (is_t4(adap->params.chip)) + lso->c.len = htonl(skb->len); + else + lso->c.len = htonl(LSO_T5_XFER_SIZE(skb->len)); + cpl = (void *)(lso + 1); + cntrl = TXPKT_CSUM_TYPE(v6 ? TX_CSUM_TCPIP6 : TX_CSUM_TCPIP) | + TXPKT_IPHDR_LEN(l3hdr_len) | + TXPKT_ETHHDR_LEN(eth_xtra_len); + q->tso++; + q->tx_cso += ssi->gso_segs; + } else { + len += sizeof(*cpl); + wr->op_immdlen = htonl(FW_WR_OP(FW_ETH_TX_PKT_WR) | + FW_WR_IMMDLEN(len)); + cpl = (void *)(wr + 1); + if (skb->ip_summed == CHECKSUM_PARTIAL) { + cntrl = hwcsum(skb) | TXPKT_IPCSUM_DIS; + q->tx_cso++; + } else + cntrl = TXPKT_L4CSUM_DIS | TXPKT_IPCSUM_DIS; + } + + if (vlan_tx_tag_present(skb)) { + q->vlan_ins++; + cntrl |= TXPKT_VLAN_VLD | TXPKT_VLAN(vlan_tx_tag_get(skb)); + } + + cpl->ctrl0 = htonl(TXPKT_OPCODE(CPL_TX_PKT_XT) | + TXPKT_INTF(pi->tx_chan) | TXPKT_PF(adap->fn)); + cpl->pack = htons(0); + cpl->len = htons(skb->len); + cpl->ctrl1 = cpu_to_be64(cntrl); + + if (immediate) { + inline_tx_skb(skb, &q->q, cpl + 1); + dev_consume_skb_any(skb); + } else { + int last_desc; + + write_sgl(skb, &q->q, (struct ulptx_sgl *)(cpl + 1), end, 0, + addr); + skb_orphan(skb); + + last_desc = q->q.pidx + ndesc - 1; + if (last_desc >= q->q.size) + last_desc -= q->q.size; + q->q.sdesc[last_desc].skb = skb; + q->q.sdesc[last_desc].sgl = (struct ulptx_sgl *)(cpl + 1); + } + + txq_advance(&q->q, ndesc); + + ring_tx_db(adap, &q->q, ndesc); + return NETDEV_TX_OK; +} + +/** + * reclaim_completed_tx_imm - reclaim completed control-queue Tx descs + * @q: the SGE control Tx queue + * + * This is a variant of reclaim_completed_tx() that is used for Tx queues + * that send only immediate data (presently just the control queues) and + * thus do not have any sk_buffs to release. + */ +static inline void reclaim_completed_tx_imm(struct sge_txq *q) +{ + int hw_cidx = ntohs(q->stat->cidx); + int reclaim = hw_cidx - q->cidx; + + if (reclaim < 0) + reclaim += q->size; + + q->in_use -= reclaim; + q->cidx = hw_cidx; +} + +/** + * is_imm - check whether a packet can be sent as immediate data + * @skb: the packet + * + * Returns true if a packet can be sent as a WR with immediate data. + */ +static inline int is_imm(const struct sk_buff *skb) +{ + return skb->len <= MAX_CTRL_WR_LEN; +} + +/** + * ctrlq_check_stop - check if a control queue is full and should stop + * @q: the queue + * @wr: most recent WR written to the queue + * + * Check if a control queue has become full and should be stopped. + * We clean up control queue descriptors very lazily, only when we are out. + * If the queue is still full after reclaiming any completed descriptors + * we suspend it and have the last WR wake it up. + */ +static void ctrlq_check_stop(struct sge_ctrl_txq *q, struct fw_wr_hdr *wr) +{ + reclaim_completed_tx_imm(&q->q); + if (unlikely(txq_avail(&q->q) < TXQ_STOP_THRES)) { + wr->lo |= htonl(FW_WR_EQUEQ | FW_WR_EQUIQ); + q->q.stops++; + q->full = 1; + } +} + +/** + * ctrl_xmit - send a packet through an SGE control Tx queue + * @q: the control queue + * @skb: the packet + * + * Send a packet through an SGE control Tx queue. Packets sent through + * a control queue must fit entirely as immediate data. + */ +static int ctrl_xmit(struct sge_ctrl_txq *q, struct sk_buff *skb) +{ + unsigned int ndesc; + struct fw_wr_hdr *wr; + + if (unlikely(!is_imm(skb))) { + WARN_ON(1); + dev_kfree_skb(skb); + return NET_XMIT_DROP; + } + + ndesc = DIV_ROUND_UP(skb->len, sizeof(struct tx_desc)); + spin_lock(&q->sendq.lock); + + if (unlikely(q->full)) { + skb->priority = ndesc; /* save for restart */ + __skb_queue_tail(&q->sendq, skb); + spin_unlock(&q->sendq.lock); + return NET_XMIT_CN; + } + + wr = (struct fw_wr_hdr *)&q->q.desc[q->q.pidx]; + inline_tx_skb(skb, &q->q, wr); + + txq_advance(&q->q, ndesc); + if (unlikely(txq_avail(&q->q) < TXQ_STOP_THRES)) + ctrlq_check_stop(q, wr); + + ring_tx_db(q->adap, &q->q, ndesc); + spin_unlock(&q->sendq.lock); + + kfree_skb(skb); + return NET_XMIT_SUCCESS; +} + +/** + * restart_ctrlq - restart a suspended control queue + * @data: the control queue to restart + * + * Resumes transmission on a suspended Tx control queue. + */ +static void restart_ctrlq(unsigned long data) +{ + struct sk_buff *skb; + unsigned int written = 0; + struct sge_ctrl_txq *q = (struct sge_ctrl_txq *)data; + + spin_lock(&q->sendq.lock); + reclaim_completed_tx_imm(&q->q); + BUG_ON(txq_avail(&q->q) < TXQ_STOP_THRES); /* q should be empty */ + + while ((skb = __skb_dequeue(&q->sendq)) != NULL) { + struct fw_wr_hdr *wr; + unsigned int ndesc = skb->priority; /* previously saved */ + + /* + * Write descriptors and free skbs outside the lock to limit + * wait times. q->full is still set so new skbs will be queued. + */ + spin_unlock(&q->sendq.lock); + + wr = (struct fw_wr_hdr *)&q->q.desc[q->q.pidx]; + inline_tx_skb(skb, &q->q, wr); + kfree_skb(skb); + + written += ndesc; + txq_advance(&q->q, ndesc); + if (unlikely(txq_avail(&q->q) < TXQ_STOP_THRES)) { + unsigned long old = q->q.stops; + + ctrlq_check_stop(q, wr); + if (q->q.stops != old) { /* suspended anew */ + spin_lock(&q->sendq.lock); + goto ringdb; + } + } + if (written > 16) { + ring_tx_db(q->adap, &q->q, written); + written = 0; + } + spin_lock(&q->sendq.lock); + } + q->full = 0; +ringdb: if (written) + ring_tx_db(q->adap, &q->q, written); + spin_unlock(&q->sendq.lock); +} + +/** + * t4_mgmt_tx - send a management message + * @adap: the adapter + * @skb: the packet containing the management message + * + * Send a management message through control queue 0. + */ +int t4_mgmt_tx(struct adapter *adap, struct sk_buff *skb) +{ + int ret; + + local_bh_disable(); + ret = ctrl_xmit(&adap->sge.ctrlq[0], skb); + local_bh_enable(); + return ret; +} + +/** + * is_ofld_imm - check whether a packet can be sent as immediate data + * @skb: the packet + * + * Returns true if a packet can be sent as an offload WR with immediate + * data. We currently use the same limit as for Ethernet packets. + */ +static inline int is_ofld_imm(const struct sk_buff *skb) +{ + return skb->len <= MAX_IMM_TX_PKT_LEN; +} + +/** + * calc_tx_flits_ofld - calculate # of flits for an offload packet + * @skb: the packet + * + * Returns the number of flits needed for the given offload packet. + * These packets are already fully constructed and no additional headers + * will be added. + */ +static inline unsigned int calc_tx_flits_ofld(const struct sk_buff *skb) +{ + unsigned int flits, cnt; + + if (is_ofld_imm(skb)) + return DIV_ROUND_UP(skb->len, 8); + + flits = skb_transport_offset(skb) / 8U; /* headers */ + cnt = skb_shinfo(skb)->nr_frags; + if (skb_tail_pointer(skb) != skb_transport_header(skb)) + cnt++; + return flits + sgl_len(cnt); +} + +/** + * txq_stop_maperr - stop a Tx queue due to I/O MMU exhaustion + * @adap: the adapter + * @q: the queue to stop + * + * Mark a Tx queue stopped due to I/O MMU exhaustion and resulting + * inability to map packets. A periodic timer attempts to restart + * queues so marked. + */ +static void txq_stop_maperr(struct sge_ofld_txq *q) +{ + q->mapping_err++; + q->q.stops++; + set_bit(q->q.cntxt_id - q->adap->sge.egr_start, + q->adap->sge.txq_maperr); +} + +/** + * ofldtxq_stop - stop an offload Tx queue that has become full + * @q: the queue to stop + * @skb: the packet causing the queue to become full + * + * Stops an offload Tx queue that has become full and modifies the packet + * being written to request a wakeup. + */ +static void ofldtxq_stop(struct sge_ofld_txq *q, struct sk_buff *skb) +{ + struct fw_wr_hdr *wr = (struct fw_wr_hdr *)skb->data; + + wr->lo |= htonl(FW_WR_EQUEQ | FW_WR_EQUIQ); + q->q.stops++; + q->full = 1; +} + +/** + * service_ofldq - restart a suspended offload queue + * @q: the offload queue + * + * Services an offload Tx queue by moving packets from its packet queue + * to the HW Tx ring. The function starts and ends with the queue locked. + */ +static void service_ofldq(struct sge_ofld_txq *q) +{ + u64 *pos; + int credits; + struct sk_buff *skb; + unsigned int written = 0; + unsigned int flits, ndesc; + + while ((skb = skb_peek(&q->sendq)) != NULL && !q->full) { + /* + * We drop the lock but leave skb on sendq, thus retaining + * exclusive access to the state of the queue. + */ + spin_unlock(&q->sendq.lock); + + reclaim_completed_tx(q->adap, &q->q, false); + + flits = skb->priority; /* previously saved */ + ndesc = flits_to_desc(flits); + credits = txq_avail(&q->q) - ndesc; + BUG_ON(credits < 0); + if (unlikely(credits < TXQ_STOP_THRES)) + ofldtxq_stop(q, skb); + + pos = (u64 *)&q->q.desc[q->q.pidx]; + if (is_ofld_imm(skb)) + inline_tx_skb(skb, &q->q, pos); + else if (map_skb(q->adap->pdev_dev, skb, + (dma_addr_t *)skb->head)) { + txq_stop_maperr(q); + spin_lock(&q->sendq.lock); + break; + } else { + int last_desc, hdr_len = skb_transport_offset(skb); + + memcpy(pos, skb->data, hdr_len); + write_sgl(skb, &q->q, (void *)pos + hdr_len, + pos + flits, hdr_len, + (dma_addr_t *)skb->head); +#ifdef CONFIG_NEED_DMA_MAP_STATE + skb->dev = q->adap->port[0]; + skb->destructor = deferred_unmap_destructor; +#endif + last_desc = q->q.pidx + ndesc - 1; + if (last_desc >= q->q.size) + last_desc -= q->q.size; + q->q.sdesc[last_desc].skb = skb; + } + + txq_advance(&q->q, ndesc); + written += ndesc; + if (unlikely(written > 32)) { + ring_tx_db(q->adap, &q->q, written); + written = 0; + } + + spin_lock(&q->sendq.lock); + __skb_unlink(skb, &q->sendq); + if (is_ofld_imm(skb)) + kfree_skb(skb); + } + if (likely(written)) + ring_tx_db(q->adap, &q->q, written); +} + +/** + * ofld_xmit - send a packet through an offload queue + * @q: the Tx offload queue + * @skb: the packet + * + * Send an offload packet through an SGE offload queue. + */ +static int ofld_xmit(struct sge_ofld_txq *q, struct sk_buff *skb) +{ + skb->priority = calc_tx_flits_ofld(skb); /* save for restart */ + spin_lock(&q->sendq.lock); + __skb_queue_tail(&q->sendq, skb); + if (q->sendq.qlen == 1) + service_ofldq(q); + spin_unlock(&q->sendq.lock); + return NET_XMIT_SUCCESS; +} + +/** + * restart_ofldq - restart a suspended offload queue + * @data: the offload queue to restart + * + * Resumes transmission on a suspended Tx offload queue. + */ +static void restart_ofldq(unsigned long data) +{ + struct sge_ofld_txq *q = (struct sge_ofld_txq *)data; + + spin_lock(&q->sendq.lock); + q->full = 0; /* the queue actually is completely empty now */ + service_ofldq(q); + spin_unlock(&q->sendq.lock); +} + +/** + * skb_txq - return the Tx queue an offload packet should use + * @skb: the packet + * + * Returns the Tx queue an offload packet should use as indicated by bits + * 1-15 in the packet's queue_mapping. + */ +static inline unsigned int skb_txq(const struct sk_buff *skb) +{ + return skb->queue_mapping >> 1; +} + +/** + * is_ctrl_pkt - return whether an offload packet is a control packet + * @skb: the packet + * + * Returns whether an offload packet should use an OFLD or a CTRL + * Tx queue as indicated by bit 0 in the packet's queue_mapping. + */ +static inline unsigned int is_ctrl_pkt(const struct sk_buff *skb) +{ + return skb->queue_mapping & 1; +} + +static inline int ofld_send(struct adapter *adap, struct sk_buff *skb) +{ + unsigned int idx = skb_txq(skb); + + if (unlikely(is_ctrl_pkt(skb))) { + /* Single ctrl queue is a requirement for LE workaround path */ + if (adap->tids.nsftids) + idx = 0; + return ctrl_xmit(&adap->sge.ctrlq[idx], skb); + } + return ofld_xmit(&adap->sge.ofldtxq[idx], skb); +} + +/** + * t4_ofld_send - send an offload packet + * @adap: the adapter + * @skb: the packet + * + * Sends an offload packet. We use the packet queue_mapping to select the + * appropriate Tx queue as follows: bit 0 indicates whether the packet + * should be sent as regular or control, bits 1-15 select the queue. + */ +int t4_ofld_send(struct adapter *adap, struct sk_buff *skb) +{ + int ret; + + local_bh_disable(); + ret = ofld_send(adap, skb); + local_bh_enable(); + return ret; +} + +/** + * cxgb4_ofld_send - send an offload packet + * @dev: the net device + * @skb: the packet + * + * Sends an offload packet. This is an exported version of @t4_ofld_send, + * intended for ULDs. + */ +int cxgb4_ofld_send(struct net_device *dev, struct sk_buff *skb) +{ + return t4_ofld_send(netdev2adap(dev), skb); +} +EXPORT_SYMBOL(cxgb4_ofld_send); + +static inline void copy_frags(struct sk_buff *skb, + const struct pkt_gl *gl, unsigned int offset) +{ + int i; + + /* usually there's just one frag */ + __skb_fill_page_desc(skb, 0, gl->frags[0].page, + gl->frags[0].offset + offset, + gl->frags[0].size - offset); + skb_shinfo(skb)->nr_frags = gl->nfrags; + for (i = 1; i < gl->nfrags; i++) + __skb_fill_page_desc(skb, i, gl->frags[i].page, + gl->frags[i].offset, + gl->frags[i].size); + + /* get a reference to the last page, we don't own it */ + get_page(gl->frags[gl->nfrags - 1].page); +} + +/** + * cxgb4_pktgl_to_skb - build an sk_buff from a packet gather list + * @gl: the gather list + * @skb_len: size of sk_buff main body if it carries fragments + * @pull_len: amount of data to move to the sk_buff's main body + * + * Builds an sk_buff from the given packet gather list. Returns the + * sk_buff or %NULL if sk_buff allocation failed. + */ +struct sk_buff *cxgb4_pktgl_to_skb(const struct pkt_gl *gl, + unsigned int skb_len, unsigned int pull_len) +{ + struct sk_buff *skb; + + /* + * Below we rely on RX_COPY_THRES being less than the smallest Rx buffer + * size, which is expected since buffers are at least PAGE_SIZEd. + * In this case packets up to RX_COPY_THRES have only one fragment. + */ + if (gl->tot_len <= RX_COPY_THRES) { + skb = dev_alloc_skb(gl->tot_len); + if (unlikely(!skb)) + goto out; + __skb_put(skb, gl->tot_len); + skb_copy_to_linear_data(skb, gl->va, gl->tot_len); + } else { + skb = dev_alloc_skb(skb_len); + if (unlikely(!skb)) + goto out; + __skb_put(skb, pull_len); + skb_copy_to_linear_data(skb, gl->va, pull_len); + + copy_frags(skb, gl, pull_len); + skb->len = gl->tot_len; + skb->data_len = skb->len - pull_len; + skb->truesize += skb->data_len; + } +out: return skb; +} +EXPORT_SYMBOL(cxgb4_pktgl_to_skb); + +/** + * t4_pktgl_free - free a packet gather list + * @gl: the gather list + * + * Releases the pages of a packet gather list. We do not own the last + * page on the list and do not free it. + */ +static void t4_pktgl_free(const struct pkt_gl *gl) +{ + int n; + const struct page_frag *p; + + for (p = gl->frags, n = gl->nfrags - 1; n--; p++) + put_page(p->page); +} + +/* + * Process an MPS trace packet. Give it an unused protocol number so it won't + * be delivered to anyone and send it to the stack for capture. + */ +static noinline int handle_trace_pkt(struct adapter *adap, + const struct pkt_gl *gl) +{ + struct sk_buff *skb; + + skb = cxgb4_pktgl_to_skb(gl, RX_PULL_LEN, RX_PULL_LEN); + if (unlikely(!skb)) { + t4_pktgl_free(gl); + return 0; + } + + if (is_t4(adap->params.chip)) + __skb_pull(skb, sizeof(struct cpl_trace_pkt)); + else + __skb_pull(skb, sizeof(struct cpl_t5_trace_pkt)); + + skb_reset_mac_header(skb); + skb->protocol = htons(0xffff); + skb->dev = adap->port[0]; + netif_receive_skb(skb); + return 0; +} + +static void do_gro(struct sge_eth_rxq *rxq, const struct pkt_gl *gl, + const struct cpl_rx_pkt *pkt) +{ + struct adapter *adapter = rxq->rspq.adap; + struct sge *s = &adapter->sge; + int ret; + struct sk_buff *skb; + + skb = napi_get_frags(&rxq->rspq.napi); + if (unlikely(!skb)) { + t4_pktgl_free(gl); + rxq->stats.rx_drops++; + return; + } + + copy_frags(skb, gl, s->pktshift); + skb->len = gl->tot_len - s->pktshift; + skb->data_len = skb->len; + skb->truesize += skb->data_len; + skb->ip_summed = CHECKSUM_UNNECESSARY; + skb_record_rx_queue(skb, rxq->rspq.idx); + if (rxq->rspq.netdev->features & NETIF_F_RXHASH) + skb_set_hash(skb, (__force u32)pkt->rsshdr.hash_val, + PKT_HASH_TYPE_L3); + + if (unlikely(pkt->vlan_ex)) { + __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), ntohs(pkt->vlan)); + rxq->stats.vlan_ex++; + } + ret = napi_gro_frags(&rxq->rspq.napi); + if (ret == GRO_HELD) + rxq->stats.lro_pkts++; + else if (ret == GRO_MERGED || ret == GRO_MERGED_FREE) + rxq->stats.lro_merged++; + rxq->stats.pkts++; + rxq->stats.rx_cso++; +} + +/** + * t4_ethrx_handler - process an ingress ethernet packet + * @q: the response queue that received the packet + * @rsp: the response queue descriptor holding the RX_PKT message + * @si: the gather list of packet fragments + * + * Process an ingress ethernet packet and deliver it to the stack. + */ +int t4_ethrx_handler(struct sge_rspq *q, const __be64 *rsp, + const struct pkt_gl *si) +{ + bool csum_ok; + struct sk_buff *skb; + const struct cpl_rx_pkt *pkt; + struct sge_eth_rxq *rxq = container_of(q, struct sge_eth_rxq, rspq); + struct sge *s = &q->adap->sge; + int cpl_trace_pkt = is_t4(q->adap->params.chip) ? + CPL_TRACE_PKT : CPL_TRACE_PKT_T5; + + if (unlikely(*(u8 *)rsp == cpl_trace_pkt)) + return handle_trace_pkt(q->adap, si); + + pkt = (const struct cpl_rx_pkt *)rsp; + csum_ok = pkt->csum_calc && !pkt->err_vec && + (q->netdev->features & NETIF_F_RXCSUM); + if ((pkt->l2info & htonl(RXF_TCP)) && + (q->netdev->features & NETIF_F_GRO) && csum_ok && !pkt->ip_frag) { + do_gro(rxq, si, pkt); + return 0; + } + + skb = cxgb4_pktgl_to_skb(si, RX_PKT_SKB_LEN, RX_PULL_LEN); + if (unlikely(!skb)) { + t4_pktgl_free(si); + rxq->stats.rx_drops++; + return 0; + } + + __skb_pull(skb, s->pktshift); /* remove ethernet header padding */ + skb->protocol = eth_type_trans(skb, q->netdev); + skb_record_rx_queue(skb, q->idx); + if (skb->dev->features & NETIF_F_RXHASH) + skb_set_hash(skb, (__force u32)pkt->rsshdr.hash_val, + PKT_HASH_TYPE_L3); + + rxq->stats.pkts++; + + if (csum_ok && (pkt->l2info & htonl(RXF_UDP | RXF_TCP))) { + if (!pkt->ip_frag) { + skb->ip_summed = CHECKSUM_UNNECESSARY; + rxq->stats.rx_cso++; + } else if (pkt->l2info & htonl(RXF_IP)) { + __sum16 c = (__force __sum16)pkt->csum; + skb->csum = csum_unfold(c); + skb->ip_summed = CHECKSUM_COMPLETE; + rxq->stats.rx_cso++; + } + } else + skb_checksum_none_assert(skb); + + if (unlikely(pkt->vlan_ex)) { + __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), ntohs(pkt->vlan)); + rxq->stats.vlan_ex++; + } + netif_receive_skb(skb); + return 0; +} + +/** + * restore_rx_bufs - put back a packet's Rx buffers + * @si: the packet gather list + * @q: the SGE free list + * @frags: number of FL buffers to restore + * + * Puts back on an FL the Rx buffers associated with @si. The buffers + * have already been unmapped and are left unmapped, we mark them so to + * prevent further unmapping attempts. + * + * This function undoes a series of @unmap_rx_buf calls when we find out + * that the current packet can't be processed right away afterall and we + * need to come back to it later. This is a very rare event and there's + * no effort to make this particularly efficient. + */ +static void restore_rx_bufs(const struct pkt_gl *si, struct sge_fl *q, + int frags) +{ + struct rx_sw_desc *d; + + while (frags--) { + if (q->cidx == 0) + q->cidx = q->size - 1; + else + q->cidx--; + d = &q->sdesc[q->cidx]; + d->page = si->frags[frags].page; + d->dma_addr |= RX_UNMAPPED_BUF; + q->avail++; + } +} + +/** + * is_new_response - check if a response is newly written + * @r: the response descriptor + * @q: the response queue + * + * Returns true if a response descriptor contains a yet unprocessed + * response. + */ +static inline bool is_new_response(const struct rsp_ctrl *r, + const struct sge_rspq *q) +{ + return RSPD_GEN(r->type_gen) == q->gen; +} + +/** + * rspq_next - advance to the next entry in a response queue + * @q: the queue + * + * Updates the state of a response queue to advance it to the next entry. + */ +static inline void rspq_next(struct sge_rspq *q) +{ + q->cur_desc = (void *)q->cur_desc + q->iqe_len; + if (unlikely(++q->cidx == q->size)) { + q->cidx = 0; + q->gen ^= 1; + q->cur_desc = q->desc; + } +} + +/** + * process_responses - process responses from an SGE response queue + * @q: the ingress queue to process + * @budget: how many responses can be processed in this round + * + * Process responses from an SGE response queue up to the supplied budget. + * Responses include received packets as well as control messages from FW + * or HW. + * + * Additionally choose the interrupt holdoff time for the next interrupt + * on this queue. If the system is under memory shortage use a fairly + * long delay to help recovery. + */ +static int process_responses(struct sge_rspq *q, int budget) +{ + int ret, rsp_type; + int budget_left = budget; + const struct rsp_ctrl *rc; + struct sge_eth_rxq *rxq = container_of(q, struct sge_eth_rxq, rspq); + struct adapter *adapter = q->adap; + struct sge *s = &adapter->sge; + + while (likely(budget_left)) { + rc = (void *)q->cur_desc + (q->iqe_len - sizeof(*rc)); + if (!is_new_response(rc, q)) + break; + + rmb(); + rsp_type = RSPD_TYPE(rc->type_gen); + if (likely(rsp_type == RSP_TYPE_FLBUF)) { + struct page_frag *fp; + struct pkt_gl si; + const struct rx_sw_desc *rsd; + u32 len = ntohl(rc->pldbuflen_qid), bufsz, frags; + + if (len & RSPD_NEWBUF) { + if (likely(q->offset > 0)) { + free_rx_bufs(q->adap, &rxq->fl, 1); + q->offset = 0; + } + len = RSPD_LEN(len); + } + si.tot_len = len; + + /* gather packet fragments */ + for (frags = 0, fp = si.frags; ; frags++, fp++) { + rsd = &rxq->fl.sdesc[rxq->fl.cidx]; + bufsz = get_buf_size(adapter, rsd); + fp->page = rsd->page; + fp->offset = q->offset; + fp->size = min(bufsz, len); + len -= fp->size; + if (!len) + break; + unmap_rx_buf(q->adap, &rxq->fl); + } + + /* + * Last buffer remains mapped so explicitly make it + * coherent for CPU access. + */ + dma_sync_single_for_cpu(q->adap->pdev_dev, + get_buf_addr(rsd), + fp->size, DMA_FROM_DEVICE); + + si.va = page_address(si.frags[0].page) + + si.frags[0].offset; + prefetch(si.va); + + si.nfrags = frags + 1; + ret = q->handler(q, q->cur_desc, &si); + if (likely(ret == 0)) + q->offset += ALIGN(fp->size, s->fl_align); + else + restore_rx_bufs(&si, &rxq->fl, frags); + } else if (likely(rsp_type == RSP_TYPE_CPL)) { + ret = q->handler(q, q->cur_desc, NULL); + } else { + ret = q->handler(q, (const __be64 *)rc, CXGB4_MSG_AN); + } + + if (unlikely(ret)) { + /* couldn't process descriptor, back off for recovery */ + q->next_intr_params = QINTR_TIMER_IDX(NOMEM_TMR_IDX); + break; + } + + rspq_next(q); + budget_left--; + } + + if (q->offset >= 0 && rxq->fl.size - rxq->fl.avail >= 16) + __refill_fl(q->adap, &rxq->fl); + return budget - budget_left; +} + +/** + * napi_rx_handler - the NAPI handler for Rx processing + * @napi: the napi instance + * @budget: how many packets we can process in this round + * + * Handler for new data events when using NAPI. This does not need any + * locking or protection from interrupts as data interrupts are off at + * this point and other adapter interrupts do not interfere (the latter + * in not a concern at all with MSI-X as non-data interrupts then have + * a separate handler). + */ +static int napi_rx_handler(struct napi_struct *napi, int budget) +{ + unsigned int params; + struct sge_rspq *q = container_of(napi, struct sge_rspq, napi); + int work_done = process_responses(q, budget); + u32 val; + + if (likely(work_done < budget)) { + int timer_index; + + napi_complete(napi); + timer_index = QINTR_TIMER_IDX_GET(q->next_intr_params); + + if (q->adaptive_rx) { + if (work_done > max(timer_pkt_quota[timer_index], + MIN_NAPI_WORK)) + timer_index = (timer_index + 1); + else + timer_index = timer_index - 1; + + timer_index = clamp(timer_index, 0, SGE_TIMERREGS - 1); + q->next_intr_params = QINTR_TIMER_IDX(timer_index) | + V_QINTR_CNT_EN; + params = q->next_intr_params; + } else { + params = q->next_intr_params; + q->next_intr_params = q->intr_params; + } + } else + params = QINTR_TIMER_IDX(7); + + val = CIDXINC(work_done) | SEINTARM(params); + if (is_t4(q->adap->params.chip)) { + t4_write_reg(q->adap, MYPF_REG(SGE_PF_GTS), + val | INGRESSQID((u32)q->cntxt_id)); + } else { + writel(val, q->adap->bar2 + q->udb + SGE_UDB_GTS); + wmb(); + } + return work_done; +} + +/* + * The MSI-X interrupt handler for an SGE response queue. + */ +irqreturn_t t4_sge_intr_msix(int irq, void *cookie) +{ + struct sge_rspq *q = cookie; + + napi_schedule(&q->napi); + return IRQ_HANDLED; +} + +/* + * Process the indirect interrupt entries in the interrupt queue and kick off + * NAPI for each queue that has generated an entry. + */ +static unsigned int process_intrq(struct adapter *adap) +{ + unsigned int credits; + const struct rsp_ctrl *rc; + struct sge_rspq *q = &adap->sge.intrq; + u32 val; + + spin_lock(&adap->sge.intrq_lock); + for (credits = 0; ; credits++) { + rc = (void *)q->cur_desc + (q->iqe_len - sizeof(*rc)); + if (!is_new_response(rc, q)) + break; + + rmb(); + if (RSPD_TYPE(rc->type_gen) == RSP_TYPE_INTR) { + unsigned int qid = ntohl(rc->pldbuflen_qid); + + qid -= adap->sge.ingr_start; + napi_schedule(&adap->sge.ingr_map[qid]->napi); + } + + rspq_next(q); + } + + val = CIDXINC(credits) | SEINTARM(q->intr_params); + if (is_t4(adap->params.chip)) { + t4_write_reg(adap, MYPF_REG(SGE_PF_GTS), + val | INGRESSQID(q->cntxt_id)); + } else { + writel(val, adap->bar2 + q->udb + SGE_UDB_GTS); + wmb(); + } + spin_unlock(&adap->sge.intrq_lock); + return credits; +} + +/* + * The MSI interrupt handler, which handles data events from SGE response queues + * as well as error and other async events as they all use the same MSI vector. + */ +static irqreturn_t t4_intr_msi(int irq, void *cookie) +{ + struct adapter *adap = cookie; + + t4_slow_intr_handler(adap); + process_intrq(adap); + return IRQ_HANDLED; +} + +/* + * Interrupt handler for legacy INTx interrupts. + * Handles data events from SGE response queues as well as error and other + * async events as they all use the same interrupt line. + */ +static irqreturn_t t4_intr_intx(int irq, void *cookie) +{ + struct adapter *adap = cookie; + + t4_write_reg(adap, MYPF_REG(PCIE_PF_CLI), 0); + if (t4_slow_intr_handler(adap) | process_intrq(adap)) + return IRQ_HANDLED; + return IRQ_NONE; /* probably shared interrupt */ +} + +/** + * t4_intr_handler - select the top-level interrupt handler + * @adap: the adapter + * + * Selects the top-level interrupt handler based on the type of interrupts + * (MSI-X, MSI, or INTx). + */ +irq_handler_t t4_intr_handler(struct adapter *adap) +{ + if (adap->flags & USING_MSIX) + return t4_sge_intr_msix; + if (adap->flags & USING_MSI) + return t4_intr_msi; + return t4_intr_intx; +} + +static void sge_rx_timer_cb(unsigned long data) +{ + unsigned long m; + unsigned int i, idma_same_state_cnt[2]; + struct adapter *adap = (struct adapter *)data; + struct sge *s = &adap->sge; + + for (i = 0; i < ARRAY_SIZE(s->starving_fl); i++) + for (m = s->starving_fl[i]; m; m &= m - 1) { + struct sge_eth_rxq *rxq; + unsigned int id = __ffs(m) + i * BITS_PER_LONG; + struct sge_fl *fl = s->egr_map[id]; + + clear_bit(id, s->starving_fl); + smp_mb__after_atomic(); + + if (fl_starving(fl)) { + rxq = container_of(fl, struct sge_eth_rxq, fl); + if (napi_reschedule(&rxq->rspq.napi)) + fl->starving++; + else + set_bit(id, s->starving_fl); + } + } + + t4_write_reg(adap, SGE_DEBUG_INDEX, 13); + idma_same_state_cnt[0] = t4_read_reg(adap, SGE_DEBUG_DATA_HIGH); + idma_same_state_cnt[1] = t4_read_reg(adap, SGE_DEBUG_DATA_LOW); + + for (i = 0; i < 2; i++) { + u32 debug0, debug11; + + /* If the Ingress DMA Same State Counter ("timer") is less + * than 1s, then we can reset our synthesized Stall Timer and + * continue. If we have previously emitted warnings about a + * potential stalled Ingress Queue, issue a note indicating + * that the Ingress Queue has resumed forward progress. + */ + if (idma_same_state_cnt[i] < s->idma_1s_thresh) { + if (s->idma_stalled[i] >= SGE_IDMA_WARN_THRESH) + CH_WARN(adap, "SGE idma%d, queue%u,resumed after %d sec\n", + i, s->idma_qid[i], + s->idma_stalled[i]/HZ); + s->idma_stalled[i] = 0; + continue; + } + + /* Synthesize an SGE Ingress DMA Same State Timer in the Hz + * domain. The first time we get here it'll be because we + * passed the 1s Threshold; each additional time it'll be + * because the RX Timer Callback is being fired on its regular + * schedule. + * + * If the stall is below our Potential Hung Ingress Queue + * Warning Threshold, continue. + */ + if (s->idma_stalled[i] == 0) + s->idma_stalled[i] = HZ; + else + s->idma_stalled[i] += RX_QCHECK_PERIOD; + + if (s->idma_stalled[i] < SGE_IDMA_WARN_THRESH) + continue; + + /* We'll issue a warning every SGE_IDMA_WARN_REPEAT Hz */ + if (((s->idma_stalled[i] - HZ) % SGE_IDMA_WARN_REPEAT) != 0) + continue; + + /* Read and save the SGE IDMA State and Queue ID information. + * We do this every time in case it changes across time ... + */ + t4_write_reg(adap, SGE_DEBUG_INDEX, 0); + debug0 = t4_read_reg(adap, SGE_DEBUG_DATA_LOW); + s->idma_state[i] = (debug0 >> (i * 9)) & 0x3f; + + t4_write_reg(adap, SGE_DEBUG_INDEX, 11); + debug11 = t4_read_reg(adap, SGE_DEBUG_DATA_LOW); + s->idma_qid[i] = (debug11 >> (i * 16)) & 0xffff; + + CH_WARN(adap, "SGE idma%u, queue%u, maybe stuck state%u %dsecs (debug0=%#x, debug11=%#x)\n", + i, s->idma_qid[i], s->idma_state[i], + s->idma_stalled[i]/HZ, debug0, debug11); + t4_sge_decode_idma_state(adap, s->idma_state[i]); + } + + mod_timer(&s->rx_timer, jiffies + RX_QCHECK_PERIOD); +} + +static void sge_tx_timer_cb(unsigned long data) +{ + unsigned long m; + unsigned int i, budget; + struct adapter *adap = (struct adapter *)data; + struct sge *s = &adap->sge; + + for (i = 0; i < ARRAY_SIZE(s->txq_maperr); i++) + for (m = s->txq_maperr[i]; m; m &= m - 1) { + unsigned long id = __ffs(m) + i * BITS_PER_LONG; + struct sge_ofld_txq *txq = s->egr_map[id]; + + clear_bit(id, s->txq_maperr); + tasklet_schedule(&txq->qresume_tsk); + } + + budget = MAX_TIMER_TX_RECLAIM; + i = s->ethtxq_rover; + do { + struct sge_eth_txq *q = &s->ethtxq[i]; + + if (q->q.in_use && + time_after_eq(jiffies, q->txq->trans_start + HZ / 100) && + __netif_tx_trylock(q->txq)) { + int avail = reclaimable(&q->q); + + if (avail) { + if (avail > budget) + avail = budget; + + free_tx_desc(adap, &q->q, avail, true); + q->q.in_use -= avail; + budget -= avail; + } + __netif_tx_unlock(q->txq); + } + + if (++i >= s->ethqsets) + i = 0; + } while (budget && i != s->ethtxq_rover); + s->ethtxq_rover = i; + mod_timer(&s->tx_timer, jiffies + (budget ? TX_QCHECK_PERIOD : 2)); +} + +/** + * udb_address - return the BAR2 User Doorbell address for a Queue + * @adap: the adapter + * @cntxt_id: the Queue Context ID + * @qpp: Queues Per Page (for all PFs) + * + * Returns the BAR2 address of the user Doorbell associated with the + * indicated Queue Context ID. Note that this is only applicable + * for T5 and later. + */ +static u64 udb_address(struct adapter *adap, unsigned int cntxt_id, + unsigned int qpp) +{ + u64 udb; + unsigned int s_qpp; + unsigned short udb_density; + unsigned long qpshift; + int page; + + BUG_ON(is_t4(adap->params.chip)); + + s_qpp = (QUEUESPERPAGEPF0 + + (QUEUESPERPAGEPF1 - QUEUESPERPAGEPF0) * adap->fn); + udb_density = 1 << ((qpp >> s_qpp) & QUEUESPERPAGEPF0_MASK); + qpshift = PAGE_SHIFT - ilog2(udb_density); + udb = (u64)cntxt_id << qpshift; + udb &= PAGE_MASK; + page = udb / PAGE_SIZE; + udb += (cntxt_id - (page * udb_density)) * SGE_UDB_SIZE; + + return udb; +} + +static u64 udb_address_eq(struct adapter *adap, unsigned int cntxt_id) +{ + return udb_address(adap, cntxt_id, + t4_read_reg(adap, SGE_EGRESS_QUEUES_PER_PAGE_PF)); +} + +static u64 udb_address_iq(struct adapter *adap, unsigned int cntxt_id) +{ + return udb_address(adap, cntxt_id, + t4_read_reg(adap, SGE_INGRESS_QUEUES_PER_PAGE_PF)); +} + +int t4_sge_alloc_rxq(struct adapter *adap, struct sge_rspq *iq, bool fwevtq, + struct net_device *dev, int intr_idx, + struct sge_fl *fl, rspq_handler_t hnd) +{ + int ret, flsz = 0; + struct fw_iq_cmd c; + struct sge *s = &adap->sge; + struct port_info *pi = netdev_priv(dev); + + /* Size needs to be multiple of 16, including status entry. */ + iq->size = roundup(iq->size, 16); + + iq->desc = alloc_ring(adap->pdev_dev, iq->size, iq->iqe_len, 0, + &iq->phys_addr, NULL, 0, NUMA_NO_NODE); + if (!iq->desc) + return -ENOMEM; + + memset(&c, 0, sizeof(c)); + c.op_to_vfn = htonl(FW_CMD_OP(FW_IQ_CMD) | FW_CMD_REQUEST | + FW_CMD_WRITE | FW_CMD_EXEC | + FW_IQ_CMD_PFN(adap->fn) | FW_IQ_CMD_VFN(0)); + c.alloc_to_len16 = htonl(FW_IQ_CMD_ALLOC | FW_IQ_CMD_IQSTART(1) | + FW_LEN16(c)); + c.type_to_iqandstindex = htonl(FW_IQ_CMD_TYPE(FW_IQ_TYPE_FL_INT_CAP) | + FW_IQ_CMD_IQASYNCH(fwevtq) | FW_IQ_CMD_VIID(pi->viid) | + FW_IQ_CMD_IQANDST(intr_idx < 0) | FW_IQ_CMD_IQANUD(1) | + FW_IQ_CMD_IQANDSTINDEX(intr_idx >= 0 ? intr_idx : + -intr_idx - 1)); + c.iqdroprss_to_iqesize = htons(FW_IQ_CMD_IQPCIECH(pi->tx_chan) | + FW_IQ_CMD_IQGTSMODE | + FW_IQ_CMD_IQINTCNTTHRESH(iq->pktcnt_idx) | + FW_IQ_CMD_IQESIZE(ilog2(iq->iqe_len) - 4)); + c.iqsize = htons(iq->size); + c.iqaddr = cpu_to_be64(iq->phys_addr); + + if (fl) { + fl->size = roundup(fl->size, 8); + fl->desc = alloc_ring(adap->pdev_dev, fl->size, sizeof(__be64), + sizeof(struct rx_sw_desc), &fl->addr, + &fl->sdesc, s->stat_len, NUMA_NO_NODE); + if (!fl->desc) + goto fl_nomem; + + flsz = fl->size / 8 + s->stat_len / sizeof(struct tx_desc); + c.iqns_to_fl0congen = htonl(FW_IQ_CMD_FL0PACKEN(1) | + FW_IQ_CMD_FL0FETCHRO(1) | + FW_IQ_CMD_FL0DATARO(1) | + FW_IQ_CMD_FL0PADEN(1)); + c.fl0dcaen_to_fl0cidxfthresh = htons(FW_IQ_CMD_FL0FBMIN(2) | + FW_IQ_CMD_FL0FBMAX(3)); + c.fl0size = htons(flsz); + c.fl0addr = cpu_to_be64(fl->addr); + } + + ret = t4_wr_mbox(adap, adap->fn, &c, sizeof(c), &c); + if (ret) + goto err; + + netif_napi_add(dev, &iq->napi, napi_rx_handler, 64); + iq->cur_desc = iq->desc; + iq->cidx = 0; + iq->gen = 1; + iq->next_intr_params = iq->intr_params; + iq->cntxt_id = ntohs(c.iqid); + iq->abs_id = ntohs(c.physiqid); + if (!is_t4(adap->params.chip)) + iq->udb = udb_address_iq(adap, iq->cntxt_id); + iq->size--; /* subtract status entry */ + iq->netdev = dev; + iq->handler = hnd; + + /* set offset to -1 to distinguish ingress queues without FL */ + iq->offset = fl ? 0 : -1; + + adap->sge.ingr_map[iq->cntxt_id - adap->sge.ingr_start] = iq; + + if (fl) { + fl->cntxt_id = ntohs(c.fl0id); + fl->avail = fl->pend_cred = 0; + fl->pidx = fl->cidx = 0; + fl->alloc_failed = fl->large_alloc_failed = fl->starving = 0; + adap->sge.egr_map[fl->cntxt_id - adap->sge.egr_start] = fl; + + /* Note, we must initialize the Free List User Doorbell + * address before refilling the Free List! + */ + if (!is_t4(adap->params.chip)) + fl->udb = udb_address_eq(adap, fl->cntxt_id); + refill_fl(adap, fl, fl_cap(fl), GFP_KERNEL); + } + return 0; + +fl_nomem: + ret = -ENOMEM; +err: + if (iq->desc) { + dma_free_coherent(adap->pdev_dev, iq->size * iq->iqe_len, + iq->desc, iq->phys_addr); + iq->desc = NULL; + } + if (fl && fl->desc) { + kfree(fl->sdesc); + fl->sdesc = NULL; + dma_free_coherent(adap->pdev_dev, flsz * sizeof(struct tx_desc), + fl->desc, fl->addr); + fl->desc = NULL; + } + return ret; +} + +static void init_txq(struct adapter *adap, struct sge_txq *q, unsigned int id) +{ + q->cntxt_id = id; + if (!is_t4(adap->params.chip)) + q->udb = udb_address_eq(adap, q->cntxt_id); + + q->in_use = 0; + q->cidx = q->pidx = 0; + q->stops = q->restarts = 0; + q->stat = (void *)&q->desc[q->size]; + spin_lock_init(&q->db_lock); + adap->sge.egr_map[id - adap->sge.egr_start] = q; +} + +int t4_sge_alloc_eth_txq(struct adapter *adap, struct sge_eth_txq *txq, + struct net_device *dev, struct netdev_queue *netdevq, + unsigned int iqid) +{ + int ret, nentries; + struct fw_eq_eth_cmd c; + struct sge *s = &adap->sge; + struct port_info *pi = netdev_priv(dev); + + /* Add status entries */ + nentries = txq->q.size + s->stat_len / sizeof(struct tx_desc); + + txq->q.desc = alloc_ring(adap->pdev_dev, txq->q.size, + sizeof(struct tx_desc), sizeof(struct tx_sw_desc), + &txq->q.phys_addr, &txq->q.sdesc, s->stat_len, + netdev_queue_numa_node_read(netdevq)); + if (!txq->q.desc) + return -ENOMEM; + + memset(&c, 0, sizeof(c)); + c.op_to_vfn = htonl(FW_CMD_OP(FW_EQ_ETH_CMD) | FW_CMD_REQUEST | + FW_CMD_WRITE | FW_CMD_EXEC | + FW_EQ_ETH_CMD_PFN(adap->fn) | FW_EQ_ETH_CMD_VFN(0)); + c.alloc_to_len16 = htonl(FW_EQ_ETH_CMD_ALLOC | + FW_EQ_ETH_CMD_EQSTART | FW_LEN16(c)); + c.viid_pkd = htonl(FW_EQ_ETH_CMD_AUTOEQUEQE | + FW_EQ_ETH_CMD_VIID(pi->viid)); + c.fetchszm_to_iqid = htonl(FW_EQ_ETH_CMD_HOSTFCMODE(2) | + FW_EQ_ETH_CMD_PCIECHN(pi->tx_chan) | + FW_EQ_ETH_CMD_FETCHRO(1) | + FW_EQ_ETH_CMD_IQID(iqid)); + c.dcaen_to_eqsize = htonl(FW_EQ_ETH_CMD_FBMIN(2) | + FW_EQ_ETH_CMD_FBMAX(3) | + FW_EQ_ETH_CMD_CIDXFTHRESH(5) | + FW_EQ_ETH_CMD_EQSIZE(nentries)); + c.eqaddr = cpu_to_be64(txq->q.phys_addr); + + ret = t4_wr_mbox(adap, adap->fn, &c, sizeof(c), &c); + if (ret) { + kfree(txq->q.sdesc); + txq->q.sdesc = NULL; + dma_free_coherent(adap->pdev_dev, + nentries * sizeof(struct tx_desc), + txq->q.desc, txq->q.phys_addr); + txq->q.desc = NULL; + return ret; + } + + init_txq(adap, &txq->q, FW_EQ_ETH_CMD_EQID_GET(ntohl(c.eqid_pkd))); + txq->txq = netdevq; + txq->tso = txq->tx_cso = txq->vlan_ins = 0; + txq->mapping_err = 0; + return 0; +} + +int t4_sge_alloc_ctrl_txq(struct adapter *adap, struct sge_ctrl_txq *txq, + struct net_device *dev, unsigned int iqid, + unsigned int cmplqid) +{ + int ret, nentries; + struct fw_eq_ctrl_cmd c; + struct sge *s = &adap->sge; + struct port_info *pi = netdev_priv(dev); + + /* Add status entries */ + nentries = txq->q.size + s->stat_len / sizeof(struct tx_desc); + + txq->q.desc = alloc_ring(adap->pdev_dev, nentries, + sizeof(struct tx_desc), 0, &txq->q.phys_addr, + NULL, 0, NUMA_NO_NODE); + if (!txq->q.desc) + return -ENOMEM; + + c.op_to_vfn = htonl(FW_CMD_OP(FW_EQ_CTRL_CMD) | FW_CMD_REQUEST | + FW_CMD_WRITE | FW_CMD_EXEC | + FW_EQ_CTRL_CMD_PFN(adap->fn) | + FW_EQ_CTRL_CMD_VFN(0)); + c.alloc_to_len16 = htonl(FW_EQ_CTRL_CMD_ALLOC | + FW_EQ_CTRL_CMD_EQSTART | FW_LEN16(c)); + c.cmpliqid_eqid = htonl(FW_EQ_CTRL_CMD_CMPLIQID(cmplqid)); + c.physeqid_pkd = htonl(0); + c.fetchszm_to_iqid = htonl(FW_EQ_CTRL_CMD_HOSTFCMODE(2) | + FW_EQ_CTRL_CMD_PCIECHN(pi->tx_chan) | + FW_EQ_CTRL_CMD_FETCHRO | + FW_EQ_CTRL_CMD_IQID(iqid)); + c.dcaen_to_eqsize = htonl(FW_EQ_CTRL_CMD_FBMIN(2) | + FW_EQ_CTRL_CMD_FBMAX(3) | + FW_EQ_CTRL_CMD_CIDXFTHRESH(5) | + FW_EQ_CTRL_CMD_EQSIZE(nentries)); + c.eqaddr = cpu_to_be64(txq->q.phys_addr); + + ret = t4_wr_mbox(adap, adap->fn, &c, sizeof(c), &c); + if (ret) { + dma_free_coherent(adap->pdev_dev, + nentries * sizeof(struct tx_desc), + txq->q.desc, txq->q.phys_addr); + txq->q.desc = NULL; + return ret; + } + + init_txq(adap, &txq->q, FW_EQ_CTRL_CMD_EQID_GET(ntohl(c.cmpliqid_eqid))); + txq->adap = adap; + skb_queue_head_init(&txq->sendq); + tasklet_init(&txq->qresume_tsk, restart_ctrlq, (unsigned long)txq); + txq->full = 0; + return 0; +} + +int t4_sge_alloc_ofld_txq(struct adapter *adap, struct sge_ofld_txq *txq, + struct net_device *dev, unsigned int iqid) +{ + int ret, nentries; + struct fw_eq_ofld_cmd c; + struct sge *s = &adap->sge; + struct port_info *pi = netdev_priv(dev); + + /* Add status entries */ + nentries = txq->q.size + s->stat_len / sizeof(struct tx_desc); + + txq->q.desc = alloc_ring(adap->pdev_dev, txq->q.size, + sizeof(struct tx_desc), sizeof(struct tx_sw_desc), + &txq->q.phys_addr, &txq->q.sdesc, s->stat_len, + NUMA_NO_NODE); + if (!txq->q.desc) + return -ENOMEM; + + memset(&c, 0, sizeof(c)); + c.op_to_vfn = htonl(FW_CMD_OP(FW_EQ_OFLD_CMD) | FW_CMD_REQUEST | + FW_CMD_WRITE | FW_CMD_EXEC | + FW_EQ_OFLD_CMD_PFN(adap->fn) | + FW_EQ_OFLD_CMD_VFN(0)); + c.alloc_to_len16 = htonl(FW_EQ_OFLD_CMD_ALLOC | + FW_EQ_OFLD_CMD_EQSTART | FW_LEN16(c)); + c.fetchszm_to_iqid = htonl(FW_EQ_OFLD_CMD_HOSTFCMODE(2) | + FW_EQ_OFLD_CMD_PCIECHN(pi->tx_chan) | + FW_EQ_OFLD_CMD_FETCHRO(1) | + FW_EQ_OFLD_CMD_IQID(iqid)); + c.dcaen_to_eqsize = htonl(FW_EQ_OFLD_CMD_FBMIN(2) | + FW_EQ_OFLD_CMD_FBMAX(3) | + FW_EQ_OFLD_CMD_CIDXFTHRESH(5) | + FW_EQ_OFLD_CMD_EQSIZE(nentries)); + c.eqaddr = cpu_to_be64(txq->q.phys_addr); + + ret = t4_wr_mbox(adap, adap->fn, &c, sizeof(c), &c); + if (ret) { + kfree(txq->q.sdesc); + txq->q.sdesc = NULL; + dma_free_coherent(adap->pdev_dev, + nentries * sizeof(struct tx_desc), + txq->q.desc, txq->q.phys_addr); + txq->q.desc = NULL; + return ret; + } + + init_txq(adap, &txq->q, FW_EQ_OFLD_CMD_EQID_GET(ntohl(c.eqid_pkd))); + txq->adap = adap; + skb_queue_head_init(&txq->sendq); + tasklet_init(&txq->qresume_tsk, restart_ofldq, (unsigned long)txq); + txq->full = 0; + txq->mapping_err = 0; + return 0; +} + +static void free_txq(struct adapter *adap, struct sge_txq *q) +{ + struct sge *s = &adap->sge; + + dma_free_coherent(adap->pdev_dev, + q->size * sizeof(struct tx_desc) + s->stat_len, + q->desc, q->phys_addr); + q->cntxt_id = 0; + q->sdesc = NULL; + q->desc = NULL; +} + +static void free_rspq_fl(struct adapter *adap, struct sge_rspq *rq, + struct sge_fl *fl) +{ + struct sge *s = &adap->sge; + unsigned int fl_id = fl ? fl->cntxt_id : 0xffff; + + adap->sge.ingr_map[rq->cntxt_id - adap->sge.ingr_start] = NULL; + t4_iq_free(adap, adap->fn, adap->fn, 0, FW_IQ_TYPE_FL_INT_CAP, + rq->cntxt_id, fl_id, 0xffff); + dma_free_coherent(adap->pdev_dev, (rq->size + 1) * rq->iqe_len, + rq->desc, rq->phys_addr); + netif_napi_del(&rq->napi); + rq->netdev = NULL; + rq->cntxt_id = rq->abs_id = 0; + rq->desc = NULL; + + if (fl) { + free_rx_bufs(adap, fl, fl->avail); + dma_free_coherent(adap->pdev_dev, fl->size * 8 + s->stat_len, + fl->desc, fl->addr); + kfree(fl->sdesc); + fl->sdesc = NULL; + fl->cntxt_id = 0; + fl->desc = NULL; + } +} + +/** + * t4_free_ofld_rxqs - free a block of consecutive Rx queues + * @adap: the adapter + * @n: number of queues + * @q: pointer to first queue + * + * Release the resources of a consecutive block of offload Rx queues. + */ +void t4_free_ofld_rxqs(struct adapter *adap, int n, struct sge_ofld_rxq *q) +{ + for ( ; n; n--, q++) + if (q->rspq.desc) + free_rspq_fl(adap, &q->rspq, + q->fl.size ? &q->fl : NULL); +} + +/** + * t4_free_sge_resources - free SGE resources + * @adap: the adapter + * + * Frees resources used by the SGE queue sets. + */ +void t4_free_sge_resources(struct adapter *adap) +{ + int i; + struct sge_eth_rxq *eq = adap->sge.ethrxq; + struct sge_eth_txq *etq = adap->sge.ethtxq; + + /* clean up Ethernet Tx/Rx queues */ + for (i = 0; i < adap->sge.ethqsets; i++, eq++, etq++) { + if (eq->rspq.desc) + free_rspq_fl(adap, &eq->rspq, + eq->fl.size ? &eq->fl : NULL); + if (etq->q.desc) { + t4_eth_eq_free(adap, adap->fn, adap->fn, 0, + etq->q.cntxt_id); + free_tx_desc(adap, &etq->q, etq->q.in_use, true); + kfree(etq->q.sdesc); + free_txq(adap, &etq->q); + } + } + + /* clean up RDMA and iSCSI Rx queues */ + t4_free_ofld_rxqs(adap, adap->sge.ofldqsets, adap->sge.ofldrxq); + t4_free_ofld_rxqs(adap, adap->sge.rdmaqs, adap->sge.rdmarxq); + t4_free_ofld_rxqs(adap, adap->sge.rdmaciqs, adap->sge.rdmaciq); + + /* clean up offload Tx queues */ + for (i = 0; i < ARRAY_SIZE(adap->sge.ofldtxq); i++) { + struct sge_ofld_txq *q = &adap->sge.ofldtxq[i]; + + if (q->q.desc) { + tasklet_kill(&q->qresume_tsk); + t4_ofld_eq_free(adap, adap->fn, adap->fn, 0, + q->q.cntxt_id); + free_tx_desc(adap, &q->q, q->q.in_use, false); + kfree(q->q.sdesc); + __skb_queue_purge(&q->sendq); + free_txq(adap, &q->q); + } + } + + /* clean up control Tx queues */ + for (i = 0; i < ARRAY_SIZE(adap->sge.ctrlq); i++) { + struct sge_ctrl_txq *cq = &adap->sge.ctrlq[i]; + + if (cq->q.desc) { + tasklet_kill(&cq->qresume_tsk); + t4_ctrl_eq_free(adap, adap->fn, adap->fn, 0, + cq->q.cntxt_id); + __skb_queue_purge(&cq->sendq); + free_txq(adap, &cq->q); + } + } + + if (adap->sge.fw_evtq.desc) + free_rspq_fl(adap, &adap->sge.fw_evtq, NULL); + + if (adap->sge.intrq.desc) + free_rspq_fl(adap, &adap->sge.intrq, NULL); + + /* clear the reverse egress queue map */ + memset(adap->sge.egr_map, 0, sizeof(adap->sge.egr_map)); +} + +void t4_sge_start(struct adapter *adap) +{ + adap->sge.ethtxq_rover = 0; + mod_timer(&adap->sge.rx_timer, jiffies + RX_QCHECK_PERIOD); + mod_timer(&adap->sge.tx_timer, jiffies + TX_QCHECK_PERIOD); +} + +/** + * t4_sge_stop - disable SGE operation + * @adap: the adapter + * + * Stop tasklets and timers associated with the DMA engine. Note that + * this is effective only if measures have been taken to disable any HW + * events that may restart them. + */ +void t4_sge_stop(struct adapter *adap) +{ + int i; + struct sge *s = &adap->sge; + + if (in_interrupt()) /* actions below require waiting */ + return; + + if (s->rx_timer.function) + del_timer_sync(&s->rx_timer); + if (s->tx_timer.function) + del_timer_sync(&s->tx_timer); + + for (i = 0; i < ARRAY_SIZE(s->ofldtxq); i++) { + struct sge_ofld_txq *q = &s->ofldtxq[i]; + + if (q->q.desc) + tasklet_kill(&q->qresume_tsk); + } + for (i = 0; i < ARRAY_SIZE(s->ctrlq); i++) { + struct sge_ctrl_txq *cq = &s->ctrlq[i]; + + if (cq->q.desc) + tasklet_kill(&cq->qresume_tsk); + } +} + +/** + * t4_sge_init - initialize SGE + * @adap: the adapter + * + * Performs SGE initialization needed every time after a chip reset. + * We do not initialize any of the queues here, instead the driver + * top-level must request them individually. + * + * Called in two different modes: + * + * 1. Perform actual hardware initialization and record hard-coded + * parameters which were used. This gets used when we're the + * Master PF and the Firmware Configuration File support didn't + * work for some reason. + * + * 2. We're not the Master PF or initialization was performed with + * a Firmware Configuration File. In this case we need to grab + * any of the SGE operating parameters that we need to have in + * order to do our job and make sure we can live with them ... + */ + +static int t4_sge_init_soft(struct adapter *adap) +{ + struct sge *s = &adap->sge; + u32 fl_small_pg, fl_large_pg, fl_small_mtu, fl_large_mtu; + u32 timer_value_0_and_1, timer_value_2_and_3, timer_value_4_and_5; + u32 ingress_rx_threshold; + + /* + * Verify that CPL messages are going to the Ingress Queue for + * process_responses() and that only packet data is going to the + * Free Lists. + */ + if ((t4_read_reg(adap, SGE_CONTROL) & RXPKTCPLMODE_MASK) != + RXPKTCPLMODE(X_RXPKTCPLMODE_SPLIT)) { + dev_err(adap->pdev_dev, "bad SGE CPL MODE\n"); + return -EINVAL; + } + + /* + * Validate the Host Buffer Register Array indices that we want to + * use ... + * + * XXX Note that we should really read through the Host Buffer Size + * XXX register array and find the indices of the Buffer Sizes which + * XXX meet our needs! + */ + #define READ_FL_BUF(x) \ + t4_read_reg(adap, SGE_FL_BUFFER_SIZE0+(x)*sizeof(u32)) + + fl_small_pg = READ_FL_BUF(RX_SMALL_PG_BUF); + fl_large_pg = READ_FL_BUF(RX_LARGE_PG_BUF); + fl_small_mtu = READ_FL_BUF(RX_SMALL_MTU_BUF); + fl_large_mtu = READ_FL_BUF(RX_LARGE_MTU_BUF); + + /* We only bother using the Large Page logic if the Large Page Buffer + * is larger than our Page Size Buffer. + */ + if (fl_large_pg <= fl_small_pg) + fl_large_pg = 0; + + #undef READ_FL_BUF + + /* The Page Size Buffer must be exactly equal to our Page Size and the + * Large Page Size Buffer should be 0 (per above) or a power of 2. + */ + if (fl_small_pg != PAGE_SIZE || + (fl_large_pg & (fl_large_pg-1)) != 0) { + dev_err(adap->pdev_dev, "bad SGE FL page buffer sizes [%d, %d]\n", + fl_small_pg, fl_large_pg); + return -EINVAL; + } + if (fl_large_pg) + s->fl_pg_order = ilog2(fl_large_pg) - PAGE_SHIFT; + + if (fl_small_mtu < FL_MTU_SMALL_BUFSIZE(adap) || + fl_large_mtu < FL_MTU_LARGE_BUFSIZE(adap)) { + dev_err(adap->pdev_dev, "bad SGE FL MTU sizes [%d, %d]\n", + fl_small_mtu, fl_large_mtu); + return -EINVAL; + } + + /* + * Retrieve our RX interrupt holdoff timer values and counter + * threshold values from the SGE parameters. + */ + timer_value_0_and_1 = t4_read_reg(adap, SGE_TIMER_VALUE_0_AND_1); + timer_value_2_and_3 = t4_read_reg(adap, SGE_TIMER_VALUE_2_AND_3); + timer_value_4_and_5 = t4_read_reg(adap, SGE_TIMER_VALUE_4_AND_5); + s->timer_val[0] = core_ticks_to_us(adap, + TIMERVALUE0_GET(timer_value_0_and_1)); + s->timer_val[1] = core_ticks_to_us(adap, + TIMERVALUE1_GET(timer_value_0_and_1)); + s->timer_val[2] = core_ticks_to_us(adap, + TIMERVALUE2_GET(timer_value_2_and_3)); + s->timer_val[3] = core_ticks_to_us(adap, + TIMERVALUE3_GET(timer_value_2_and_3)); + s->timer_val[4] = core_ticks_to_us(adap, + TIMERVALUE4_GET(timer_value_4_and_5)); + s->timer_val[5] = core_ticks_to_us(adap, + TIMERVALUE5_GET(timer_value_4_and_5)); + + ingress_rx_threshold = t4_read_reg(adap, SGE_INGRESS_RX_THRESHOLD); + s->counter_val[0] = THRESHOLD_0_GET(ingress_rx_threshold); + s->counter_val[1] = THRESHOLD_1_GET(ingress_rx_threshold); + s->counter_val[2] = THRESHOLD_2_GET(ingress_rx_threshold); + s->counter_val[3] = THRESHOLD_3_GET(ingress_rx_threshold); + + return 0; +} + +static int t4_sge_init_hard(struct adapter *adap) +{ + struct sge *s = &adap->sge; + + /* + * Set up our basic SGE mode to deliver CPL messages to our Ingress + * Queue and Packet Date to the Free List. + */ + t4_set_reg_field(adap, SGE_CONTROL, RXPKTCPLMODE_MASK, + RXPKTCPLMODE_MASK); + + /* + * Set up to drop DOORBELL writes when the DOORBELL FIFO overflows + * and generate an interrupt when this occurs so we can recover. + */ + if (is_t4(adap->params.chip)) { + t4_set_reg_field(adap, A_SGE_DBFIFO_STATUS, + V_HP_INT_THRESH(M_HP_INT_THRESH) | + V_LP_INT_THRESH(M_LP_INT_THRESH), + V_HP_INT_THRESH(dbfifo_int_thresh) | + V_LP_INT_THRESH(dbfifo_int_thresh)); + } else { + t4_set_reg_field(adap, A_SGE_DBFIFO_STATUS, + V_LP_INT_THRESH_T5(M_LP_INT_THRESH_T5), + V_LP_INT_THRESH_T5(dbfifo_int_thresh)); + t4_set_reg_field(adap, SGE_DBFIFO_STATUS2, + V_HP_INT_THRESH_T5(M_HP_INT_THRESH_T5), + V_HP_INT_THRESH_T5(dbfifo_int_thresh)); + } + t4_set_reg_field(adap, A_SGE_DOORBELL_CONTROL, F_ENABLE_DROP, + F_ENABLE_DROP); + + /* + * SGE_FL_BUFFER_SIZE0 (RX_SMALL_PG_BUF) is set up by + * t4_fixup_host_params(). + */ + s->fl_pg_order = FL_PG_ORDER; + if (s->fl_pg_order) + t4_write_reg(adap, + SGE_FL_BUFFER_SIZE0+RX_LARGE_PG_BUF*sizeof(u32), + PAGE_SIZE << FL_PG_ORDER); + t4_write_reg(adap, SGE_FL_BUFFER_SIZE0+RX_SMALL_MTU_BUF*sizeof(u32), + FL_MTU_SMALL_BUFSIZE(adap)); + t4_write_reg(adap, SGE_FL_BUFFER_SIZE0+RX_LARGE_MTU_BUF*sizeof(u32), + FL_MTU_LARGE_BUFSIZE(adap)); + + /* + * Note that the SGE Ingress Packet Count Interrupt Threshold and + * Timer Holdoff values must be supplied by our caller. + */ + t4_write_reg(adap, SGE_INGRESS_RX_THRESHOLD, + THRESHOLD_0(s->counter_val[0]) | + THRESHOLD_1(s->counter_val[1]) | + THRESHOLD_2(s->counter_val[2]) | + THRESHOLD_3(s->counter_val[3])); + t4_write_reg(adap, SGE_TIMER_VALUE_0_AND_1, + TIMERVALUE0(us_to_core_ticks(adap, s->timer_val[0])) | + TIMERVALUE1(us_to_core_ticks(adap, s->timer_val[1]))); + t4_write_reg(adap, SGE_TIMER_VALUE_2_AND_3, + TIMERVALUE2(us_to_core_ticks(adap, s->timer_val[2])) | + TIMERVALUE3(us_to_core_ticks(adap, s->timer_val[3]))); + t4_write_reg(adap, SGE_TIMER_VALUE_4_AND_5, + TIMERVALUE4(us_to_core_ticks(adap, s->timer_val[4])) | + TIMERVALUE5(us_to_core_ticks(adap, s->timer_val[5]))); + + return 0; +} + +int t4_sge_init(struct adapter *adap) +{ + struct sge *s = &adap->sge; + u32 sge_control, sge_control2, sge_conm_ctrl; + unsigned int ingpadboundary, ingpackboundary; + int ret, egress_threshold; + + /* + * Ingress Padding Boundary and Egress Status Page Size are set up by + * t4_fixup_host_params(). + */ + sge_control = t4_read_reg(adap, SGE_CONTROL); + s->pktshift = PKTSHIFT_GET(sge_control); + s->stat_len = (sge_control & EGRSTATUSPAGESIZE_MASK) ? 128 : 64; + + /* T4 uses a single control field to specify both the PCIe Padding and + * Packing Boundary. T5 introduced the ability to specify these + * separately. The actual Ingress Packet Data alignment boundary + * within Packed Buffer Mode is the maximum of these two + * specifications. + */ + ingpadboundary = 1 << (INGPADBOUNDARY_GET(sge_control) + + X_INGPADBOUNDARY_SHIFT); + if (is_t4(adap->params.chip)) { + s->fl_align = ingpadboundary; + } else { + /* T5 has a different interpretation of one of the PCIe Packing + * Boundary values. + */ + sge_control2 = t4_read_reg(adap, SGE_CONTROL2_A); + ingpackboundary = INGPACKBOUNDARY_G(sge_control2); + if (ingpackboundary == INGPACKBOUNDARY_16B_X) + ingpackboundary = 16; + else + ingpackboundary = 1 << (ingpackboundary + + INGPACKBOUNDARY_SHIFT_X); + + s->fl_align = max(ingpadboundary, ingpackboundary); + } + + if (adap->flags & USING_SOFT_PARAMS) + ret = t4_sge_init_soft(adap); + else + ret = t4_sge_init_hard(adap); + if (ret < 0) + return ret; + + /* + * A FL with <= fl_starve_thres buffers is starving and a periodic + * timer will attempt to refill it. This needs to be larger than the + * SGE's Egress Congestion Threshold. If it isn't, then we can get + * stuck waiting for new packets while the SGE is waiting for us to + * give it more Free List entries. (Note that the SGE's Egress + * Congestion Threshold is in units of 2 Free List pointers.) For T4, + * there was only a single field to control this. For T5 there's the + * original field which now only applies to Unpacked Mode Free List + * buffers and a new field which only applies to Packed Mode Free List + * buffers. + */ + sge_conm_ctrl = t4_read_reg(adap, SGE_CONM_CTRL); + if (is_t4(adap->params.chip)) + egress_threshold = EGRTHRESHOLD_GET(sge_conm_ctrl); + else + egress_threshold = EGRTHRESHOLDPACKING_GET(sge_conm_ctrl); + s->fl_starve_thres = 2*egress_threshold + 1; + + setup_timer(&s->rx_timer, sge_rx_timer_cb, (unsigned long)adap); + setup_timer(&s->tx_timer, sge_tx_timer_cb, (unsigned long)adap); + s->idma_1s_thresh = core_ticks_per_usec(adap) * 1000000; /* 1 s */ + s->idma_stalled[0] = 0; + s->idma_stalled[1] = 0; + spin_lock_init(&s->intrq_lock); + + return 0; +} diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c new file mode 100644 index 0000000..163a2a1 --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c @@ -0,0 +1,4164 @@ +/* + * This file is part of the Chelsio T4 Ethernet driver for Linux. + * + * Copyright (c) 2003-2014 Chelsio Communications, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include "cxgb4.h" +#include "t4_regs.h" +#include "t4fw_api.h" + +/** + * t4_wait_op_done_val - wait until an operation is completed + * @adapter: the adapter performing the operation + * @reg: the register to check for completion + * @mask: a single-bit field within @reg that indicates completion + * @polarity: the value of the field when the operation is completed + * @attempts: number of check iterations + * @delay: delay in usecs between iterations + * @valp: where to store the value of the register at completion time + * + * Wait until an operation is completed by checking a bit in a register + * up to @attempts times. If @valp is not NULL the value of the register + * at the time it indicated completion is stored there. Returns 0 if the + * operation completes and -EAGAIN otherwise. + */ +static int t4_wait_op_done_val(struct adapter *adapter, int reg, u32 mask, + int polarity, int attempts, int delay, u32 *valp) +{ + while (1) { + u32 val = t4_read_reg(adapter, reg); + + if (!!(val & mask) == polarity) { + if (valp) + *valp = val; + return 0; + } + if (--attempts == 0) + return -EAGAIN; + if (delay) + udelay(delay); + } +} + +static inline int t4_wait_op_done(struct adapter *adapter, int reg, u32 mask, + int polarity, int attempts, int delay) +{ + return t4_wait_op_done_val(adapter, reg, mask, polarity, attempts, + delay, NULL); +} + +/** + * t4_set_reg_field - set a register field to a value + * @adapter: the adapter to program + * @addr: the register address + * @mask: specifies the portion of the register to modify + * @val: the new value for the register field + * + * Sets a register field specified by the supplied mask to the + * given value. + */ +void t4_set_reg_field(struct adapter *adapter, unsigned int addr, u32 mask, + u32 val) +{ + u32 v = t4_read_reg(adapter, addr) & ~mask; + + t4_write_reg(adapter, addr, v | val); + (void) t4_read_reg(adapter, addr); /* flush */ +} + +/** + * t4_read_indirect - read indirectly addressed registers + * @adap: the adapter + * @addr_reg: register holding the indirect address + * @data_reg: register holding the value of the indirect register + * @vals: where the read register values are stored + * @nregs: how many indirect registers to read + * @start_idx: index of first indirect register to read + * + * Reads registers that are accessed indirectly through an address/data + * register pair. + */ +void t4_read_indirect(struct adapter *adap, unsigned int addr_reg, + unsigned int data_reg, u32 *vals, + unsigned int nregs, unsigned int start_idx) +{ + while (nregs--) { + t4_write_reg(adap, addr_reg, start_idx); + *vals++ = t4_read_reg(adap, data_reg); + start_idx++; + } +} + +/** + * t4_write_indirect - write indirectly addressed registers + * @adap: the adapter + * @addr_reg: register holding the indirect addresses + * @data_reg: register holding the value for the indirect registers + * @vals: values to write + * @nregs: how many indirect registers to write + * @start_idx: address of first indirect register to write + * + * Writes a sequential block of registers that are accessed indirectly + * through an address/data register pair. + */ +void t4_write_indirect(struct adapter *adap, unsigned int addr_reg, + unsigned int data_reg, const u32 *vals, + unsigned int nregs, unsigned int start_idx) +{ + while (nregs--) { + t4_write_reg(adap, addr_reg, start_idx++); + t4_write_reg(adap, data_reg, *vals++); + } +} + +/* + * Read a 32-bit PCI Configuration Space register via the PCI-E backdoor + * mechanism. This guarantees that we get the real value even if we're + * operating within a Virtual Machine and the Hypervisor is trapping our + * Configuration Space accesses. + */ +void t4_hw_pci_read_cfg4(struct adapter *adap, int reg, u32 *val) +{ + u32 req = ENABLE | FUNCTION(adap->fn) | reg; + + if (is_t4(adap->params.chip)) + req |= F_LOCALCFG; + + t4_write_reg(adap, PCIE_CFG_SPACE_REQ, req); + *val = t4_read_reg(adap, PCIE_CFG_SPACE_DATA); + + /* Reset ENABLE to 0 so reads of PCIE_CFG_SPACE_DATA won't cause a + * Configuration Space read. (None of the other fields matter when + * ENABLE is 0 so a simple register write is easier than a + * read-modify-write via t4_set_reg_field().) + */ + t4_write_reg(adap, PCIE_CFG_SPACE_REQ, 0); +} + +/* + * t4_report_fw_error - report firmware error + * @adap: the adapter + * + * The adapter firmware can indicate error conditions to the host. + * If the firmware has indicated an error, print out the reason for + * the firmware error. + */ +static void t4_report_fw_error(struct adapter *adap) +{ + static const char *const reason[] = { + "Crash", /* PCIE_FW_EVAL_CRASH */ + "During Device Preparation", /* PCIE_FW_EVAL_PREP */ + "During Device Configuration", /* PCIE_FW_EVAL_CONF */ + "During Device Initialization", /* PCIE_FW_EVAL_INIT */ + "Unexpected Event", /* PCIE_FW_EVAL_UNEXPECTEDEVENT */ + "Insufficient Airflow", /* PCIE_FW_EVAL_OVERHEAT */ + "Device Shutdown", /* PCIE_FW_EVAL_DEVICESHUTDOWN */ + "Reserved", /* reserved */ + }; + u32 pcie_fw; + + pcie_fw = t4_read_reg(adap, MA_PCIE_FW); + if (pcie_fw & FW_PCIE_FW_ERR) + dev_err(adap->pdev_dev, "Firmware reports adapter error: %s\n", + reason[FW_PCIE_FW_EVAL_GET(pcie_fw)]); +} + +/* + * Get the reply to a mailbox command and store it in @rpl in big-endian order. + */ +static void get_mbox_rpl(struct adapter *adap, __be64 *rpl, int nflit, + u32 mbox_addr) +{ + for ( ; nflit; nflit--, mbox_addr += 8) + *rpl++ = cpu_to_be64(t4_read_reg64(adap, mbox_addr)); +} + +/* + * Handle a FW assertion reported in a mailbox. + */ +static void fw_asrt(struct adapter *adap, u32 mbox_addr) +{ + struct fw_debug_cmd asrt; + + get_mbox_rpl(adap, (__be64 *)&asrt, sizeof(asrt) / 8, mbox_addr); + dev_alert(adap->pdev_dev, + "FW assertion at %.16s:%u, val0 %#x, val1 %#x\n", + asrt.u.assert.filename_0_7, ntohl(asrt.u.assert.line), + ntohl(asrt.u.assert.x), ntohl(asrt.u.assert.y)); +} + +static void dump_mbox(struct adapter *adap, int mbox, u32 data_reg) +{ + dev_err(adap->pdev_dev, + "mbox %d: %llx %llx %llx %llx %llx %llx %llx %llx\n", mbox, + (unsigned long long)t4_read_reg64(adap, data_reg), + (unsigned long long)t4_read_reg64(adap, data_reg + 8), + (unsigned long long)t4_read_reg64(adap, data_reg + 16), + (unsigned long long)t4_read_reg64(adap, data_reg + 24), + (unsigned long long)t4_read_reg64(adap, data_reg + 32), + (unsigned long long)t4_read_reg64(adap, data_reg + 40), + (unsigned long long)t4_read_reg64(adap, data_reg + 48), + (unsigned long long)t4_read_reg64(adap, data_reg + 56)); +} + +/** + * t4_wr_mbox_meat - send a command to FW through the given mailbox + * @adap: the adapter + * @mbox: index of the mailbox to use + * @cmd: the command to write + * @size: command length in bytes + * @rpl: where to optionally store the reply + * @sleep_ok: if true we may sleep while awaiting command completion + * + * Sends the given command to FW through the selected mailbox and waits + * for the FW to execute the command. If @rpl is not %NULL it is used to + * store the FW's reply to the command. The command and its optional + * reply are of the same length. FW can take up to %FW_CMD_MAX_TIMEOUT ms + * to respond. @sleep_ok determines whether we may sleep while awaiting + * the response. If sleeping is allowed we use progressive backoff + * otherwise we spin. + * + * The return value is 0 on success or a negative errno on failure. A + * failure can happen either because we are not able to execute the + * command or FW executes it but signals an error. In the latter case + * the return value is the error code indicated by FW (negated). + */ +int t4_wr_mbox_meat(struct adapter *adap, int mbox, const void *cmd, int size, + void *rpl, bool sleep_ok) +{ + static const int delay[] = { + 1, 1, 3, 5, 10, 10, 20, 50, 100, 200 + }; + + u32 v; + u64 res; + int i, ms, delay_idx; + const __be64 *p = cmd; + u32 data_reg = PF_REG(mbox, CIM_PF_MAILBOX_DATA); + u32 ctl_reg = PF_REG(mbox, CIM_PF_MAILBOX_CTRL); + + if ((size & 15) || size > MBOX_LEN) + return -EINVAL; + + /* + * If the device is off-line, as in EEH, commands will time out. + * Fail them early so we don't waste time waiting. + */ + if (adap->pdev->error_state != pci_channel_io_normal) + return -EIO; + + v = MBOWNER_GET(t4_read_reg(adap, ctl_reg)); + for (i = 0; v == MBOX_OWNER_NONE && i < 3; i++) + v = MBOWNER_GET(t4_read_reg(adap, ctl_reg)); + + if (v != MBOX_OWNER_DRV) + return v ? -EBUSY : -ETIMEDOUT; + + for (i = 0; i < size; i += 8) + t4_write_reg64(adap, data_reg + i, be64_to_cpu(*p++)); + + t4_write_reg(adap, ctl_reg, MBMSGVALID | MBOWNER(MBOX_OWNER_FW)); + t4_read_reg(adap, ctl_reg); /* flush write */ + + delay_idx = 0; + ms = delay[0]; + + for (i = 0; i < FW_CMD_MAX_TIMEOUT; i += ms) { + if (sleep_ok) { + ms = delay[delay_idx]; /* last element may repeat */ + if (delay_idx < ARRAY_SIZE(delay) - 1) + delay_idx++; + msleep(ms); + } else + mdelay(ms); + + v = t4_read_reg(adap, ctl_reg); + if (MBOWNER_GET(v) == MBOX_OWNER_DRV) { + if (!(v & MBMSGVALID)) { + t4_write_reg(adap, ctl_reg, 0); + continue; + } + + res = t4_read_reg64(adap, data_reg); + if (FW_CMD_OP_GET(res >> 32) == FW_DEBUG_CMD) { + fw_asrt(adap, data_reg); + res = FW_CMD_RETVAL(EIO); + } else if (rpl) + get_mbox_rpl(adap, rpl, size / 8, data_reg); + + if (FW_CMD_RETVAL_GET((int)res)) + dump_mbox(adap, mbox, data_reg); + t4_write_reg(adap, ctl_reg, 0); + return -FW_CMD_RETVAL_GET((int)res); + } + } + + dump_mbox(adap, mbox, data_reg); + dev_err(adap->pdev_dev, "command %#x in mailbox %d timed out\n", + *(const u8 *)cmd, mbox); + t4_report_fw_error(adap); + return -ETIMEDOUT; +} + +/** + * t4_mc_read - read from MC through backdoor accesses + * @adap: the adapter + * @addr: address of first byte requested + * @idx: which MC to access + * @data: 64 bytes of data containing the requested address + * @ecc: where to store the corresponding 64-bit ECC word + * + * Read 64 bytes of data from MC starting at a 64-byte-aligned address + * that covers the requested address @addr. If @parity is not %NULL it + * is assigned the 64-bit ECC word for the read data. + */ +int t4_mc_read(struct adapter *adap, int idx, u32 addr, __be32 *data, u64 *ecc) +{ + int i; + u32 mc_bist_cmd, mc_bist_cmd_addr, mc_bist_cmd_len; + u32 mc_bist_status_rdata, mc_bist_data_pattern; + + if (is_t4(adap->params.chip)) { + mc_bist_cmd = MC_BIST_CMD; + mc_bist_cmd_addr = MC_BIST_CMD_ADDR; + mc_bist_cmd_len = MC_BIST_CMD_LEN; + mc_bist_status_rdata = MC_BIST_STATUS_RDATA; + mc_bist_data_pattern = MC_BIST_DATA_PATTERN; + } else { + mc_bist_cmd = MC_REG(MC_P_BIST_CMD, idx); + mc_bist_cmd_addr = MC_REG(MC_P_BIST_CMD_ADDR, idx); + mc_bist_cmd_len = MC_REG(MC_P_BIST_CMD_LEN, idx); + mc_bist_status_rdata = MC_REG(MC_P_BIST_STATUS_RDATA, idx); + mc_bist_data_pattern = MC_REG(MC_P_BIST_DATA_PATTERN, idx); + } + + if (t4_read_reg(adap, mc_bist_cmd) & START_BIST) + return -EBUSY; + t4_write_reg(adap, mc_bist_cmd_addr, addr & ~0x3fU); + t4_write_reg(adap, mc_bist_cmd_len, 64); + t4_write_reg(adap, mc_bist_data_pattern, 0xc); + t4_write_reg(adap, mc_bist_cmd, BIST_OPCODE(1) | START_BIST | + BIST_CMD_GAP(1)); + i = t4_wait_op_done(adap, mc_bist_cmd, START_BIST, 0, 10, 1); + if (i) + return i; + +#define MC_DATA(i) MC_BIST_STATUS_REG(mc_bist_status_rdata, i) + + for (i = 15; i >= 0; i--) + *data++ = htonl(t4_read_reg(adap, MC_DATA(i))); + if (ecc) + *ecc = t4_read_reg64(adap, MC_DATA(16)); +#undef MC_DATA + return 0; +} + +/** + * t4_edc_read - read from EDC through backdoor accesses + * @adap: the adapter + * @idx: which EDC to access + * @addr: address of first byte requested + * @data: 64 bytes of data containing the requested address + * @ecc: where to store the corresponding 64-bit ECC word + * + * Read 64 bytes of data from EDC starting at a 64-byte-aligned address + * that covers the requested address @addr. If @parity is not %NULL it + * is assigned the 64-bit ECC word for the read data. + */ +int t4_edc_read(struct adapter *adap, int idx, u32 addr, __be32 *data, u64 *ecc) +{ + int i; + u32 edc_bist_cmd, edc_bist_cmd_addr, edc_bist_cmd_len; + u32 edc_bist_cmd_data_pattern, edc_bist_status_rdata; + + if (is_t4(adap->params.chip)) { + edc_bist_cmd = EDC_REG(EDC_BIST_CMD, idx); + edc_bist_cmd_addr = EDC_REG(EDC_BIST_CMD_ADDR, idx); + edc_bist_cmd_len = EDC_REG(EDC_BIST_CMD_LEN, idx); + edc_bist_cmd_data_pattern = EDC_REG(EDC_BIST_DATA_PATTERN, + idx); + edc_bist_status_rdata = EDC_REG(EDC_BIST_STATUS_RDATA, + idx); + } else { + edc_bist_cmd = EDC_REG_T5(EDC_H_BIST_CMD, idx); + edc_bist_cmd_addr = EDC_REG_T5(EDC_H_BIST_CMD_ADDR, idx); + edc_bist_cmd_len = EDC_REG_T5(EDC_H_BIST_CMD_LEN, idx); + edc_bist_cmd_data_pattern = + EDC_REG_T5(EDC_H_BIST_DATA_PATTERN, idx); + edc_bist_status_rdata = + EDC_REG_T5(EDC_H_BIST_STATUS_RDATA, idx); + } + + if (t4_read_reg(adap, edc_bist_cmd) & START_BIST) + return -EBUSY; + t4_write_reg(adap, edc_bist_cmd_addr, addr & ~0x3fU); + t4_write_reg(adap, edc_bist_cmd_len, 64); + t4_write_reg(adap, edc_bist_cmd_data_pattern, 0xc); + t4_write_reg(adap, edc_bist_cmd, + BIST_OPCODE(1) | BIST_CMD_GAP(1) | START_BIST); + i = t4_wait_op_done(adap, edc_bist_cmd, START_BIST, 0, 10, 1); + if (i) + return i; + +#define EDC_DATA(i) (EDC_BIST_STATUS_REG(edc_bist_status_rdata, i)) + + for (i = 15; i >= 0; i--) + *data++ = htonl(t4_read_reg(adap, EDC_DATA(i))); + if (ecc) + *ecc = t4_read_reg64(adap, EDC_DATA(16)); +#undef EDC_DATA + return 0; +} + +/** + * t4_memory_rw - read/write EDC 0, EDC 1 or MC via PCIE memory window + * @adap: the adapter + * @win: PCI-E Memory Window to use + * @mtype: memory type: MEM_EDC0, MEM_EDC1 or MEM_MC + * @addr: address within indicated memory type + * @len: amount of memory to transfer + * @buf: host memory buffer + * @dir: direction of transfer T4_MEMORY_READ (1) or T4_MEMORY_WRITE (0) + * + * Reads/writes an [almost] arbitrary memory region in the firmware: the + * firmware memory address and host buffer must be aligned on 32-bit + * boudaries; the length may be arbitrary. The memory is transferred as + * a raw byte sequence from/to the firmware's memory. If this memory + * contains data structures which contain multi-byte integers, it's the + * caller's responsibility to perform appropriate byte order conversions. + */ +int t4_memory_rw(struct adapter *adap, int win, int mtype, u32 addr, + u32 len, __be32 *buf, int dir) +{ + u32 pos, offset, resid, memoffset; + u32 edc_size, mc_size, win_pf, mem_reg, mem_aperture, mem_base; + + /* Argument sanity checks ... + */ + if (addr & 0x3) + return -EINVAL; + + /* It's convenient to be able to handle lengths which aren't a + * multiple of 32-bits because we often end up transferring files to + * the firmware. So we'll handle that by normalizing the length here + * and then handling any residual transfer at the end. + */ + resid = len & 0x3; + len -= resid; + + /* Offset into the region of memory which is being accessed + * MEM_EDC0 = 0 + * MEM_EDC1 = 1 + * MEM_MC = 2 -- T4 + * MEM_MC0 = 2 -- For T5 + * MEM_MC1 = 3 -- For T5 + */ + edc_size = EDRAM_SIZE_GET(t4_read_reg(adap, MA_EDRAM0_BAR)); + if (mtype != MEM_MC1) + memoffset = (mtype * (edc_size * 1024 * 1024)); + else { + mc_size = EXT_MEM_SIZE_GET(t4_read_reg(adap, + MA_EXT_MEMORY_BAR)); + memoffset = (MEM_MC0 * edc_size + mc_size) * 1024 * 1024; + } + + /* Determine the PCIE_MEM_ACCESS_OFFSET */ + addr = addr + memoffset; + + /* Each PCI-E Memory Window is programmed with a window size -- or + * "aperture" -- which controls the granularity of its mapping onto + * adapter memory. We need to grab that aperture in order to know + * how to use the specified window. The window is also programmed + * with the base address of the Memory Window in BAR0's address + * space. For T4 this is an absolute PCI-E Bus Address. For T5 + * the address is relative to BAR0. + */ + mem_reg = t4_read_reg(adap, + PCIE_MEM_ACCESS_REG(PCIE_MEM_ACCESS_BASE_WIN, + win)); + mem_aperture = 1 << (GET_WINDOW(mem_reg) + 10); + mem_base = GET_PCIEOFST(mem_reg) << 10; + if (is_t4(adap->params.chip)) + mem_base -= adap->t4_bar0; + win_pf = is_t4(adap->params.chip) ? 0 : V_PFNUM(adap->fn); + + /* Calculate our initial PCI-E Memory Window Position and Offset into + * that Window. + */ + pos = addr & ~(mem_aperture-1); + offset = addr - pos; + + /* Set up initial PCI-E Memory Window to cover the start of our + * transfer. (Read it back to ensure that changes propagate before we + * attempt to use the new value.) + */ + t4_write_reg(adap, + PCIE_MEM_ACCESS_REG(PCIE_MEM_ACCESS_OFFSET, win), + pos | win_pf); + t4_read_reg(adap, + PCIE_MEM_ACCESS_REG(PCIE_MEM_ACCESS_OFFSET, win)); + + /* Transfer data to/from the adapter as long as there's an integral + * number of 32-bit transfers to complete. + */ + while (len > 0) { + if (dir == T4_MEMORY_READ) + *buf++ = (__force __be32) t4_read_reg(adap, + mem_base + offset); + else + t4_write_reg(adap, mem_base + offset, + (__force u32) *buf++); + offset += sizeof(__be32); + len -= sizeof(__be32); + + /* If we've reached the end of our current window aperture, + * move the PCI-E Memory Window on to the next. Note that + * doing this here after "len" may be 0 allows us to set up + * the PCI-E Memory Window for a possible final residual + * transfer below ... + */ + if (offset == mem_aperture) { + pos += mem_aperture; + offset = 0; + t4_write_reg(adap, + PCIE_MEM_ACCESS_REG(PCIE_MEM_ACCESS_OFFSET, + win), pos | win_pf); + t4_read_reg(adap, + PCIE_MEM_ACCESS_REG(PCIE_MEM_ACCESS_OFFSET, + win)); + } + } + + /* If the original transfer had a length which wasn't a multiple of + * 32-bits, now's where we need to finish off the transfer of the + * residual amount. The PCI-E Memory Window has already been moved + * above (if necessary) to cover this final transfer. + */ + if (resid) { + union { + __be32 word; + char byte[4]; + } last; + unsigned char *bp; + int i; + + if (dir == T4_MEMORY_READ) { + last.word = (__force __be32) t4_read_reg(adap, + mem_base + offset); + for (bp = (unsigned char *)buf, i = resid; i < 4; i++) + bp[i] = last.byte[i]; + } else { + last.word = *buf; + for (i = resid; i < 4; i++) + last.byte[i] = 0; + t4_write_reg(adap, mem_base + offset, + (__force u32) last.word); + } + } + + return 0; +} + +#define EEPROM_STAT_ADDR 0x7bfc +#define VPD_BASE 0x400 +#define VPD_BASE_OLD 0 +#define VPD_LEN 1024 +#define CHELSIO_VPD_UNIQUE_ID 0x82 + +/** + * t4_seeprom_wp - enable/disable EEPROM write protection + * @adapter: the adapter + * @enable: whether to enable or disable write protection + * + * Enables or disables write protection on the serial EEPROM. + */ +int t4_seeprom_wp(struct adapter *adapter, bool enable) +{ + unsigned int v = enable ? 0xc : 0; + int ret = pci_write_vpd(adapter->pdev, EEPROM_STAT_ADDR, 4, &v); + return ret < 0 ? ret : 0; +} + +/** + * get_vpd_params - read VPD parameters from VPD EEPROM + * @adapter: adapter to read + * @p: where to store the parameters + * + * Reads card parameters stored in VPD EEPROM. + */ +int get_vpd_params(struct adapter *adapter, struct vpd_params *p) +{ + u32 cclk_param, cclk_val; + int i, ret, addr; + int ec, sn, pn; + u8 *vpd, csum; + unsigned int vpdr_len, kw_offset, id_len; + + vpd = vmalloc(VPD_LEN); + if (!vpd) + return -ENOMEM; + + ret = pci_read_vpd(adapter->pdev, VPD_BASE, sizeof(u32), vpd); + if (ret < 0) + goto out; + + /* The VPD shall have a unique identifier specified by the PCI SIG. + * For chelsio adapters, the identifier is 0x82. The first byte of a VPD + * shall be CHELSIO_VPD_UNIQUE_ID (0x82). The VPD programming software + * is expected to automatically put this entry at the + * beginning of the VPD. + */ + addr = *vpd == CHELSIO_VPD_UNIQUE_ID ? VPD_BASE : VPD_BASE_OLD; + + ret = pci_read_vpd(adapter->pdev, addr, VPD_LEN, vpd); + if (ret < 0) + goto out; + + if (vpd[0] != PCI_VPD_LRDT_ID_STRING) { + dev_err(adapter->pdev_dev, "missing VPD ID string\n"); + ret = -EINVAL; + goto out; + } + + id_len = pci_vpd_lrdt_size(vpd); + if (id_len > ID_LEN) + id_len = ID_LEN; + + i = pci_vpd_find_tag(vpd, 0, VPD_LEN, PCI_VPD_LRDT_RO_DATA); + if (i < 0) { + dev_err(adapter->pdev_dev, "missing VPD-R section\n"); + ret = -EINVAL; + goto out; + } + + vpdr_len = pci_vpd_lrdt_size(&vpd[i]); + kw_offset = i + PCI_VPD_LRDT_TAG_SIZE; + if (vpdr_len + kw_offset > VPD_LEN) { + dev_err(adapter->pdev_dev, "bad VPD-R length %u\n", vpdr_len); + ret = -EINVAL; + goto out; + } + +#define FIND_VPD_KW(var, name) do { \ + var = pci_vpd_find_info_keyword(vpd, kw_offset, vpdr_len, name); \ + if (var < 0) { \ + dev_err(adapter->pdev_dev, "missing VPD keyword " name "\n"); \ + ret = -EINVAL; \ + goto out; \ + } \ + var += PCI_VPD_INFO_FLD_HDR_SIZE; \ +} while (0) + + FIND_VPD_KW(i, "RV"); + for (csum = 0; i >= 0; i--) + csum += vpd[i]; + + if (csum) { + dev_err(adapter->pdev_dev, + "corrupted VPD EEPROM, actual csum %u\n", csum); + ret = -EINVAL; + goto out; + } + + FIND_VPD_KW(ec, "EC"); + FIND_VPD_KW(sn, "SN"); + FIND_VPD_KW(pn, "PN"); +#undef FIND_VPD_KW + + memcpy(p->id, vpd + PCI_VPD_LRDT_TAG_SIZE, id_len); + strim(p->id); + memcpy(p->ec, vpd + ec, EC_LEN); + strim(p->ec); + i = pci_vpd_info_field_size(vpd + sn - PCI_VPD_INFO_FLD_HDR_SIZE); + memcpy(p->sn, vpd + sn, min(i, SERNUM_LEN)); + strim(p->sn); + i = pci_vpd_info_field_size(vpd + pn - PCI_VPD_INFO_FLD_HDR_SIZE); + memcpy(p->pn, vpd + pn, min(i, PN_LEN)); + strim(p->pn); + + /* + * Ask firmware for the Core Clock since it knows how to translate the + * Reference Clock ('V2') VPD field into a Core Clock value ... + */ + cclk_param = (FW_PARAMS_MNEM(FW_PARAMS_MNEM_DEV) | + FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DEV_CCLK)); + ret = t4_query_params(adapter, adapter->mbox, 0, 0, + 1, &cclk_param, &cclk_val); + +out: + vfree(vpd); + if (ret) + return ret; + p->cclk = cclk_val; + + return 0; +} + +/* serial flash and firmware constants */ +enum { + SF_ATTEMPTS = 10, /* max retries for SF operations */ + + /* flash command opcodes */ + SF_PROG_PAGE = 2, /* program page */ + SF_WR_DISABLE = 4, /* disable writes */ + SF_RD_STATUS = 5, /* read status register */ + SF_WR_ENABLE = 6, /* enable writes */ + SF_RD_DATA_FAST = 0xb, /* read flash */ + SF_RD_ID = 0x9f, /* read ID */ + SF_ERASE_SECTOR = 0xd8, /* erase sector */ + + FW_MAX_SIZE = 16 * SF_SEC_SIZE, +}; + +/** + * sf1_read - read data from the serial flash + * @adapter: the adapter + * @byte_cnt: number of bytes to read + * @cont: whether another operation will be chained + * @lock: whether to lock SF for PL access only + * @valp: where to store the read data + * + * Reads up to 4 bytes of data from the serial flash. The location of + * the read needs to be specified prior to calling this by issuing the + * appropriate commands to the serial flash. + */ +static int sf1_read(struct adapter *adapter, unsigned int byte_cnt, int cont, + int lock, u32 *valp) +{ + int ret; + + if (!byte_cnt || byte_cnt > 4) + return -EINVAL; + if (t4_read_reg(adapter, SF_OP) & SF_BUSY) + return -EBUSY; + cont = cont ? SF_CONT : 0; + lock = lock ? SF_LOCK : 0; + t4_write_reg(adapter, SF_OP, lock | cont | BYTECNT(byte_cnt - 1)); + ret = t4_wait_op_done(adapter, SF_OP, SF_BUSY, 0, SF_ATTEMPTS, 5); + if (!ret) + *valp = t4_read_reg(adapter, SF_DATA); + return ret; +} + +/** + * sf1_write - write data to the serial flash + * @adapter: the adapter + * @byte_cnt: number of bytes to write + * @cont: whether another operation will be chained + * @lock: whether to lock SF for PL access only + * @val: value to write + * + * Writes up to 4 bytes of data to the serial flash. The location of + * the write needs to be specified prior to calling this by issuing the + * appropriate commands to the serial flash. + */ +static int sf1_write(struct adapter *adapter, unsigned int byte_cnt, int cont, + int lock, u32 val) +{ + if (!byte_cnt || byte_cnt > 4) + return -EINVAL; + if (t4_read_reg(adapter, SF_OP) & SF_BUSY) + return -EBUSY; + cont = cont ? SF_CONT : 0; + lock = lock ? SF_LOCK : 0; + t4_write_reg(adapter, SF_DATA, val); + t4_write_reg(adapter, SF_OP, lock | + cont | BYTECNT(byte_cnt - 1) | OP_WR); + return t4_wait_op_done(adapter, SF_OP, SF_BUSY, 0, SF_ATTEMPTS, 5); +} + +/** + * flash_wait_op - wait for a flash operation to complete + * @adapter: the adapter + * @attempts: max number of polls of the status register + * @delay: delay between polls in ms + * + * Wait for a flash operation to complete by polling the status register. + */ +static int flash_wait_op(struct adapter *adapter, int attempts, int delay) +{ + int ret; + u32 status; + + while (1) { + if ((ret = sf1_write(adapter, 1, 1, 1, SF_RD_STATUS)) != 0 || + (ret = sf1_read(adapter, 1, 0, 1, &status)) != 0) + return ret; + if (!(status & 1)) + return 0; + if (--attempts == 0) + return -EAGAIN; + if (delay) + msleep(delay); + } +} + +/** + * t4_read_flash - read words from serial flash + * @adapter: the adapter + * @addr: the start address for the read + * @nwords: how many 32-bit words to read + * @data: where to store the read data + * @byte_oriented: whether to store data as bytes or as words + * + * Read the specified number of 32-bit words from the serial flash. + * If @byte_oriented is set the read data is stored as a byte array + * (i.e., big-endian), otherwise as 32-bit words in the platform's + * natural endianess. + */ +static int t4_read_flash(struct adapter *adapter, unsigned int addr, + unsigned int nwords, u32 *data, int byte_oriented) +{ + int ret; + + if (addr + nwords * sizeof(u32) > adapter->params.sf_size || (addr & 3)) + return -EINVAL; + + addr = swab32(addr) | SF_RD_DATA_FAST; + + if ((ret = sf1_write(adapter, 4, 1, 0, addr)) != 0 || + (ret = sf1_read(adapter, 1, 1, 0, data)) != 0) + return ret; + + for ( ; nwords; nwords--, data++) { + ret = sf1_read(adapter, 4, nwords > 1, nwords == 1, data); + if (nwords == 1) + t4_write_reg(adapter, SF_OP, 0); /* unlock SF */ + if (ret) + return ret; + if (byte_oriented) + *data = (__force __u32) (htonl(*data)); + } + return 0; +} + +/** + * t4_write_flash - write up to a page of data to the serial flash + * @adapter: the adapter + * @addr: the start address to write + * @n: length of data to write in bytes + * @data: the data to write + * + * Writes up to a page of data (256 bytes) to the serial flash starting + * at the given address. All the data must be written to the same page. + */ +static int t4_write_flash(struct adapter *adapter, unsigned int addr, + unsigned int n, const u8 *data) +{ + int ret; + u32 buf[64]; + unsigned int i, c, left, val, offset = addr & 0xff; + + if (addr >= adapter->params.sf_size || offset + n > SF_PAGE_SIZE) + return -EINVAL; + + val = swab32(addr) | SF_PROG_PAGE; + + if ((ret = sf1_write(adapter, 1, 0, 1, SF_WR_ENABLE)) != 0 || + (ret = sf1_write(adapter, 4, 1, 1, val)) != 0) + goto unlock; + + for (left = n; left; left -= c) { + c = min(left, 4U); + for (val = 0, i = 0; i < c; ++i) + val = (val << 8) + *data++; + + ret = sf1_write(adapter, c, c != left, 1, val); + if (ret) + goto unlock; + } + ret = flash_wait_op(adapter, 8, 1); + if (ret) + goto unlock; + + t4_write_reg(adapter, SF_OP, 0); /* unlock SF */ + + /* Read the page to verify the write succeeded */ + ret = t4_read_flash(adapter, addr & ~0xff, ARRAY_SIZE(buf), buf, 1); + if (ret) + return ret; + + if (memcmp(data - n, (u8 *)buf + offset, n)) { + dev_err(adapter->pdev_dev, + "failed to correctly write the flash page at %#x\n", + addr); + return -EIO; + } + return 0; + +unlock: + t4_write_reg(adapter, SF_OP, 0); /* unlock SF */ + return ret; +} + +/** + * t4_get_fw_version - read the firmware version + * @adapter: the adapter + * @vers: where to place the version + * + * Reads the FW version from flash. + */ +int t4_get_fw_version(struct adapter *adapter, u32 *vers) +{ + return t4_read_flash(adapter, FLASH_FW_START + + offsetof(struct fw_hdr, fw_ver), 1, + vers, 0); +} + +/** + * t4_get_tp_version - read the TP microcode version + * @adapter: the adapter + * @vers: where to place the version + * + * Reads the TP microcode version from flash. + */ +int t4_get_tp_version(struct adapter *adapter, u32 *vers) +{ + return t4_read_flash(adapter, FLASH_FW_START + + offsetof(struct fw_hdr, tp_microcode_ver), + 1, vers, 0); +} + +/* Is the given firmware API compatible with the one the driver was compiled + * with? + */ +static int fw_compatible(const struct fw_hdr *hdr1, const struct fw_hdr *hdr2) +{ + + /* short circuit if it's the exact same firmware version */ + if (hdr1->chip == hdr2->chip && hdr1->fw_ver == hdr2->fw_ver) + return 1; + +#define SAME_INTF(x) (hdr1->intfver_##x == hdr2->intfver_##x) + if (hdr1->chip == hdr2->chip && SAME_INTF(nic) && SAME_INTF(vnic) && + SAME_INTF(ri) && SAME_INTF(iscsi) && SAME_INTF(fcoe)) + return 1; +#undef SAME_INTF + + return 0; +} + +/* The firmware in the filesystem is usable, but should it be installed? + * This routine explains itself in detail if it indicates the filesystem + * firmware should be installed. + */ +static int should_install_fs_fw(struct adapter *adap, int card_fw_usable, + int k, int c) +{ + const char *reason; + + if (!card_fw_usable) { + reason = "incompatible or unusable"; + goto install; + } + + if (k > c) { + reason = "older than the version supported with this driver"; + goto install; + } + + return 0; + +install: + dev_err(adap->pdev_dev, "firmware on card (%u.%u.%u.%u) is %s, " + "installing firmware %u.%u.%u.%u on card.\n", + FW_HDR_FW_VER_MAJOR_GET(c), FW_HDR_FW_VER_MINOR_GET(c), + FW_HDR_FW_VER_MICRO_GET(c), FW_HDR_FW_VER_BUILD_GET(c), reason, + FW_HDR_FW_VER_MAJOR_GET(k), FW_HDR_FW_VER_MINOR_GET(k), + FW_HDR_FW_VER_MICRO_GET(k), FW_HDR_FW_VER_BUILD_GET(k)); + + return 1; +} + +int t4_prep_fw(struct adapter *adap, struct fw_info *fw_info, + const u8 *fw_data, unsigned int fw_size, + struct fw_hdr *card_fw, enum dev_state state, + int *reset) +{ + int ret, card_fw_usable, fs_fw_usable; + const struct fw_hdr *fs_fw; + const struct fw_hdr *drv_fw; + + drv_fw = &fw_info->fw_hdr; + + /* Read the header of the firmware on the card */ + ret = -t4_read_flash(adap, FLASH_FW_START, + sizeof(*card_fw) / sizeof(uint32_t), + (uint32_t *)card_fw, 1); + if (ret == 0) { + card_fw_usable = fw_compatible(drv_fw, (const void *)card_fw); + } else { + dev_err(adap->pdev_dev, + "Unable to read card's firmware header: %d\n", ret); + card_fw_usable = 0; + } + + if (fw_data != NULL) { + fs_fw = (const void *)fw_data; + fs_fw_usable = fw_compatible(drv_fw, fs_fw); + } else { + fs_fw = NULL; + fs_fw_usable = 0; + } + + if (card_fw_usable && card_fw->fw_ver == drv_fw->fw_ver && + (!fs_fw_usable || fs_fw->fw_ver == drv_fw->fw_ver)) { + /* Common case: the firmware on the card is an exact match and + * the filesystem one is an exact match too, or the filesystem + * one is absent/incompatible. + */ + } else if (fs_fw_usable && state == DEV_STATE_UNINIT && + should_install_fs_fw(adap, card_fw_usable, + be32_to_cpu(fs_fw->fw_ver), + be32_to_cpu(card_fw->fw_ver))) { + ret = -t4_fw_upgrade(adap, adap->mbox, fw_data, + fw_size, 0); + if (ret != 0) { + dev_err(adap->pdev_dev, + "failed to install firmware: %d\n", ret); + goto bye; + } + + /* Installed successfully, update the cached header too. */ + memcpy(card_fw, fs_fw, sizeof(*card_fw)); + card_fw_usable = 1; + *reset = 0; /* already reset as part of load_fw */ + } + + if (!card_fw_usable) { + uint32_t d, c, k; + + d = be32_to_cpu(drv_fw->fw_ver); + c = be32_to_cpu(card_fw->fw_ver); + k = fs_fw ? be32_to_cpu(fs_fw->fw_ver) : 0; + + dev_err(adap->pdev_dev, "Cannot find a usable firmware: " + "chip state %d, " + "driver compiled with %d.%d.%d.%d, " + "card has %d.%d.%d.%d, filesystem has %d.%d.%d.%d\n", + state, + FW_HDR_FW_VER_MAJOR_GET(d), FW_HDR_FW_VER_MINOR_GET(d), + FW_HDR_FW_VER_MICRO_GET(d), FW_HDR_FW_VER_BUILD_GET(d), + FW_HDR_FW_VER_MAJOR_GET(c), FW_HDR_FW_VER_MINOR_GET(c), + FW_HDR_FW_VER_MICRO_GET(c), FW_HDR_FW_VER_BUILD_GET(c), + FW_HDR_FW_VER_MAJOR_GET(k), FW_HDR_FW_VER_MINOR_GET(k), + FW_HDR_FW_VER_MICRO_GET(k), FW_HDR_FW_VER_BUILD_GET(k)); + ret = EINVAL; + goto bye; + } + + /* We're using whatever's on the card and it's known to be good. */ + adap->params.fw_vers = be32_to_cpu(card_fw->fw_ver); + adap->params.tp_vers = be32_to_cpu(card_fw->tp_microcode_ver); + +bye: + return ret; +} + +/** + * t4_flash_erase_sectors - erase a range of flash sectors + * @adapter: the adapter + * @start: the first sector to erase + * @end: the last sector to erase + * + * Erases the sectors in the given inclusive range. + */ +static int t4_flash_erase_sectors(struct adapter *adapter, int start, int end) +{ + int ret = 0; + + if (end >= adapter->params.sf_nsec) + return -EINVAL; + + while (start <= end) { + if ((ret = sf1_write(adapter, 1, 0, 1, SF_WR_ENABLE)) != 0 || + (ret = sf1_write(adapter, 4, 0, 1, + SF_ERASE_SECTOR | (start << 8))) != 0 || + (ret = flash_wait_op(adapter, 14, 500)) != 0) { + dev_err(adapter->pdev_dev, + "erase of flash sector %d failed, error %d\n", + start, ret); + break; + } + start++; + } + t4_write_reg(adapter, SF_OP, 0); /* unlock SF */ + return ret; +} + +/** + * t4_flash_cfg_addr - return the address of the flash configuration file + * @adapter: the adapter + * + * Return the address within the flash where the Firmware Configuration + * File is stored. + */ +unsigned int t4_flash_cfg_addr(struct adapter *adapter) +{ + if (adapter->params.sf_size == 0x100000) + return FLASH_FPGA_CFG_START; + else + return FLASH_CFG_START; +} + +/** + * t4_load_fw - download firmware + * @adap: the adapter + * @fw_data: the firmware image to write + * @size: image size + * + * Write the supplied firmware image to the card's serial flash. + */ +int t4_load_fw(struct adapter *adap, const u8 *fw_data, unsigned int size) +{ + u32 csum; + int ret, addr; + unsigned int i; + u8 first_page[SF_PAGE_SIZE]; + const __be32 *p = (const __be32 *)fw_data; + const struct fw_hdr *hdr = (const struct fw_hdr *)fw_data; + unsigned int sf_sec_size = adap->params.sf_size / adap->params.sf_nsec; + unsigned int fw_img_start = adap->params.sf_fw_start; + unsigned int fw_start_sec = fw_img_start / sf_sec_size; + + if (!size) { + dev_err(adap->pdev_dev, "FW image has no data\n"); + return -EINVAL; + } + if (size & 511) { + dev_err(adap->pdev_dev, + "FW image size not multiple of 512 bytes\n"); + return -EINVAL; + } + if (ntohs(hdr->len512) * 512 != size) { + dev_err(adap->pdev_dev, + "FW image size differs from size in FW header\n"); + return -EINVAL; + } + if (size > FW_MAX_SIZE) { + dev_err(adap->pdev_dev, "FW image too large, max is %u bytes\n", + FW_MAX_SIZE); + return -EFBIG; + } + + for (csum = 0, i = 0; i < size / sizeof(csum); i++) + csum += ntohl(p[i]); + + if (csum != 0xffffffff) { + dev_err(adap->pdev_dev, + "corrupted firmware image, checksum %#x\n", csum); + return -EINVAL; + } + + i = DIV_ROUND_UP(size, sf_sec_size); /* # of sectors spanned */ + ret = t4_flash_erase_sectors(adap, fw_start_sec, fw_start_sec + i - 1); + if (ret) + goto out; + + /* + * We write the correct version at the end so the driver can see a bad + * version if the FW write fails. Start by writing a copy of the + * first page with a bad version. + */ + memcpy(first_page, fw_data, SF_PAGE_SIZE); + ((struct fw_hdr *)first_page)->fw_ver = htonl(0xffffffff); + ret = t4_write_flash(adap, fw_img_start, SF_PAGE_SIZE, first_page); + if (ret) + goto out; + + addr = fw_img_start; + for (size -= SF_PAGE_SIZE; size; size -= SF_PAGE_SIZE) { + addr += SF_PAGE_SIZE; + fw_data += SF_PAGE_SIZE; + ret = t4_write_flash(adap, addr, SF_PAGE_SIZE, fw_data); + if (ret) + goto out; + } + + ret = t4_write_flash(adap, + fw_img_start + offsetof(struct fw_hdr, fw_ver), + sizeof(hdr->fw_ver), (const u8 *)&hdr->fw_ver); +out: + if (ret) + dev_err(adap->pdev_dev, "firmware download failed, error %d\n", + ret); + return ret; +} + +#define ADVERT_MASK (FW_PORT_CAP_SPEED_100M | FW_PORT_CAP_SPEED_1G |\ + FW_PORT_CAP_SPEED_10G | FW_PORT_CAP_SPEED_40G | \ + FW_PORT_CAP_ANEG) + +/** + * t4_link_start - apply link configuration to MAC/PHY + * @phy: the PHY to setup + * @mac: the MAC to setup + * @lc: the requested link configuration + * + * Set up a port's MAC and PHY according to a desired link configuration. + * - If the PHY can auto-negotiate first decide what to advertise, then + * enable/disable auto-negotiation as desired, and reset. + * - If the PHY does not auto-negotiate just reset it. + * - If auto-negotiation is off set the MAC to the proper speed/duplex/FC, + * otherwise do it later based on the outcome of auto-negotiation. + */ +int t4_link_start(struct adapter *adap, unsigned int mbox, unsigned int port, + struct link_config *lc) +{ + struct fw_port_cmd c; + unsigned int fc = 0, mdi = FW_PORT_MDI(FW_PORT_MDI_AUTO); + + lc->link_ok = 0; + if (lc->requested_fc & PAUSE_RX) + fc |= FW_PORT_CAP_FC_RX; + if (lc->requested_fc & PAUSE_TX) + fc |= FW_PORT_CAP_FC_TX; + + memset(&c, 0, sizeof(c)); + c.op_to_portid = htonl(FW_CMD_OP(FW_PORT_CMD) | FW_CMD_REQUEST | + FW_CMD_EXEC | FW_PORT_CMD_PORTID(port)); + c.action_to_len16 = htonl(FW_PORT_CMD_ACTION(FW_PORT_ACTION_L1_CFG) | + FW_LEN16(c)); + + if (!(lc->supported & FW_PORT_CAP_ANEG)) { + c.u.l1cfg.rcap = htonl((lc->supported & ADVERT_MASK) | fc); + lc->fc = lc->requested_fc & (PAUSE_RX | PAUSE_TX); + } else if (lc->autoneg == AUTONEG_DISABLE) { + c.u.l1cfg.rcap = htonl(lc->requested_speed | fc | mdi); + lc->fc = lc->requested_fc & (PAUSE_RX | PAUSE_TX); + } else + c.u.l1cfg.rcap = htonl(lc->advertising | fc | mdi); + + return t4_wr_mbox(adap, mbox, &c, sizeof(c), NULL); +} + +/** + * t4_restart_aneg - restart autonegotiation + * @adap: the adapter + * @mbox: mbox to use for the FW command + * @port: the port id + * + * Restarts autonegotiation for the selected port. + */ +int t4_restart_aneg(struct adapter *adap, unsigned int mbox, unsigned int port) +{ + struct fw_port_cmd c; + + memset(&c, 0, sizeof(c)); + c.op_to_portid = htonl(FW_CMD_OP(FW_PORT_CMD) | FW_CMD_REQUEST | + FW_CMD_EXEC | FW_PORT_CMD_PORTID(port)); + c.action_to_len16 = htonl(FW_PORT_CMD_ACTION(FW_PORT_ACTION_L1_CFG) | + FW_LEN16(c)); + c.u.l1cfg.rcap = htonl(FW_PORT_CAP_ANEG); + return t4_wr_mbox(adap, mbox, &c, sizeof(c), NULL); +} + +typedef void (*int_handler_t)(struct adapter *adap); + +struct intr_info { + unsigned int mask; /* bits to check in interrupt status */ + const char *msg; /* message to print or NULL */ + short stat_idx; /* stat counter to increment or -1 */ + unsigned short fatal; /* whether the condition reported is fatal */ + int_handler_t int_handler; /* platform-specific int handler */ +}; + +/** + * t4_handle_intr_status - table driven interrupt handler + * @adapter: the adapter that generated the interrupt + * @reg: the interrupt status register to process + * @acts: table of interrupt actions + * + * A table driven interrupt handler that applies a set of masks to an + * interrupt status word and performs the corresponding actions if the + * interrupts described by the mask have occurred. The actions include + * optionally emitting a warning or alert message. The table is terminated + * by an entry specifying mask 0. Returns the number of fatal interrupt + * conditions. + */ +static int t4_handle_intr_status(struct adapter *adapter, unsigned int reg, + const struct intr_info *acts) +{ + int fatal = 0; + unsigned int mask = 0; + unsigned int status = t4_read_reg(adapter, reg); + + for ( ; acts->mask; ++acts) { + if (!(status & acts->mask)) + continue; + if (acts->fatal) { + fatal++; + dev_alert(adapter->pdev_dev, "%s (0x%x)\n", acts->msg, + status & acts->mask); + } else if (acts->msg && printk_ratelimit()) + dev_warn(adapter->pdev_dev, "%s (0x%x)\n", acts->msg, + status & acts->mask); + if (acts->int_handler) + acts->int_handler(adapter); + mask |= acts->mask; + } + status &= mask; + if (status) /* clear processed interrupts */ + t4_write_reg(adapter, reg, status); + return fatal; +} + +/* + * Interrupt handler for the PCIE module. + */ +static void pcie_intr_handler(struct adapter *adapter) +{ + static const struct intr_info sysbus_intr_info[] = { + { RNPP, "RXNP array parity error", -1, 1 }, + { RPCP, "RXPC array parity error", -1, 1 }, + { RCIP, "RXCIF array parity error", -1, 1 }, + { RCCP, "Rx completions control array parity error", -1, 1 }, + { RFTP, "RXFT array parity error", -1, 1 }, + { 0 } + }; + static const struct intr_info pcie_port_intr_info[] = { + { TPCP, "TXPC array parity error", -1, 1 }, + { TNPP, "TXNP array parity error", -1, 1 }, + { TFTP, "TXFT array parity error", -1, 1 }, + { TCAP, "TXCA array parity error", -1, 1 }, + { TCIP, "TXCIF array parity error", -1, 1 }, + { RCAP, "RXCA array parity error", -1, 1 }, + { OTDD, "outbound request TLP discarded", -1, 1 }, + { RDPE, "Rx data parity error", -1, 1 }, + { TDUE, "Tx uncorrectable data error", -1, 1 }, + { 0 } + }; + static const struct intr_info pcie_intr_info[] = { + { MSIADDRLPERR, "MSI AddrL parity error", -1, 1 }, + { MSIADDRHPERR, "MSI AddrH parity error", -1, 1 }, + { MSIDATAPERR, "MSI data parity error", -1, 1 }, + { MSIXADDRLPERR, "MSI-X AddrL parity error", -1, 1 }, + { MSIXADDRHPERR, "MSI-X AddrH parity error", -1, 1 }, + { MSIXDATAPERR, "MSI-X data parity error", -1, 1 }, + { MSIXDIPERR, "MSI-X DI parity error", -1, 1 }, + { PIOCPLPERR, "PCI PIO completion FIFO parity error", -1, 1 }, + { PIOREQPERR, "PCI PIO request FIFO parity error", -1, 1 }, + { TARTAGPERR, "PCI PCI target tag FIFO parity error", -1, 1 }, + { CCNTPERR, "PCI CMD channel count parity error", -1, 1 }, + { CREQPERR, "PCI CMD channel request parity error", -1, 1 }, + { CRSPPERR, "PCI CMD channel response parity error", -1, 1 }, + { DCNTPERR, "PCI DMA channel count parity error", -1, 1 }, + { DREQPERR, "PCI DMA channel request parity error", -1, 1 }, + { DRSPPERR, "PCI DMA channel response parity error", -1, 1 }, + { HCNTPERR, "PCI HMA channel count parity error", -1, 1 }, + { HREQPERR, "PCI HMA channel request parity error", -1, 1 }, + { HRSPPERR, "PCI HMA channel response parity error", -1, 1 }, + { CFGSNPPERR, "PCI config snoop FIFO parity error", -1, 1 }, + { FIDPERR, "PCI FID parity error", -1, 1 }, + { INTXCLRPERR, "PCI INTx clear parity error", -1, 1 }, + { MATAGPERR, "PCI MA tag parity error", -1, 1 }, + { PIOTAGPERR, "PCI PIO tag parity error", -1, 1 }, + { RXCPLPERR, "PCI Rx completion parity error", -1, 1 }, + { RXWRPERR, "PCI Rx write parity error", -1, 1 }, + { RPLPERR, "PCI replay buffer parity error", -1, 1 }, + { PCIESINT, "PCI core secondary fault", -1, 1 }, + { PCIEPINT, "PCI core primary fault", -1, 1 }, + { UNXSPLCPLERR, "PCI unexpected split completion error", -1, 0 }, + { 0 } + }; + + static struct intr_info t5_pcie_intr_info[] = { + { MSTGRPPERR, "Master Response Read Queue parity error", + -1, 1 }, + { MSTTIMEOUTPERR, "Master Timeout FIFO parity error", -1, 1 }, + { MSIXSTIPERR, "MSI-X STI SRAM parity error", -1, 1 }, + { MSIXADDRLPERR, "MSI-X AddrL parity error", -1, 1 }, + { MSIXADDRHPERR, "MSI-X AddrH parity error", -1, 1 }, + { MSIXDATAPERR, "MSI-X data parity error", -1, 1 }, + { MSIXDIPERR, "MSI-X DI parity error", -1, 1 }, + { PIOCPLGRPPERR, "PCI PIO completion Group FIFO parity error", + -1, 1 }, + { PIOREQGRPPERR, "PCI PIO request Group FIFO parity error", + -1, 1 }, + { TARTAGPERR, "PCI PCI target tag FIFO parity error", -1, 1 }, + { MSTTAGQPERR, "PCI master tag queue parity error", -1, 1 }, + { CREQPERR, "PCI CMD channel request parity error", -1, 1 }, + { CRSPPERR, "PCI CMD channel response parity error", -1, 1 }, + { DREQWRPERR, "PCI DMA channel write request parity error", + -1, 1 }, + { DREQPERR, "PCI DMA channel request parity error", -1, 1 }, + { DRSPPERR, "PCI DMA channel response parity error", -1, 1 }, + { HREQWRPERR, "PCI HMA channel count parity error", -1, 1 }, + { HREQPERR, "PCI HMA channel request parity error", -1, 1 }, + { HRSPPERR, "PCI HMA channel response parity error", -1, 1 }, + { CFGSNPPERR, "PCI config snoop FIFO parity error", -1, 1 }, + { FIDPERR, "PCI FID parity error", -1, 1 }, + { VFIDPERR, "PCI INTx clear parity error", -1, 1 }, + { MAGRPPERR, "PCI MA group FIFO parity error", -1, 1 }, + { PIOTAGPERR, "PCI PIO tag parity error", -1, 1 }, + { IPRXHDRGRPPERR, "PCI IP Rx header group parity error", + -1, 1 }, + { IPRXDATAGRPPERR, "PCI IP Rx data group parity error", -1, 1 }, + { RPLPERR, "PCI IP replay buffer parity error", -1, 1 }, + { IPSOTPERR, "PCI IP SOT buffer parity error", -1, 1 }, + { TRGT1GRPPERR, "PCI TRGT1 group FIFOs parity error", -1, 1 }, + { READRSPERR, "Outbound read error", -1, 0 }, + { 0 } + }; + + int fat; + + if (is_t4(adapter->params.chip)) + fat = t4_handle_intr_status(adapter, + PCIE_CORE_UTL_SYSTEM_BUS_AGENT_STATUS, + sysbus_intr_info) + + t4_handle_intr_status(adapter, + PCIE_CORE_UTL_PCI_EXPRESS_PORT_STATUS, + pcie_port_intr_info) + + t4_handle_intr_status(adapter, PCIE_INT_CAUSE, + pcie_intr_info); + else + fat = t4_handle_intr_status(adapter, PCIE_INT_CAUSE, + t5_pcie_intr_info); + + if (fat) + t4_fatal_err(adapter); +} + +/* + * TP interrupt handler. + */ +static void tp_intr_handler(struct adapter *adapter) +{ + static const struct intr_info tp_intr_info[] = { + { 0x3fffffff, "TP parity error", -1, 1 }, + { FLMTXFLSTEMPTY, "TP out of Tx pages", -1, 1 }, + { 0 } + }; + + if (t4_handle_intr_status(adapter, TP_INT_CAUSE, tp_intr_info)) + t4_fatal_err(adapter); +} + +/* + * SGE interrupt handler. + */ +static void sge_intr_handler(struct adapter *adapter) +{ + u64 v; + + static const struct intr_info sge_intr_info[] = { + { ERR_CPL_EXCEED_IQE_SIZE, + "SGE received CPL exceeding IQE size", -1, 1 }, + { ERR_INVALID_CIDX_INC, + "SGE GTS CIDX increment too large", -1, 0 }, + { ERR_CPL_OPCODE_0, "SGE received 0-length CPL", -1, 0 }, + { DBFIFO_LP_INT, NULL, -1, 0, t4_db_full }, + { DBFIFO_HP_INT, NULL, -1, 0, t4_db_full }, + { ERR_DROPPED_DB, NULL, -1, 0, t4_db_dropped }, + { ERR_DATA_CPL_ON_HIGH_QID1 | ERR_DATA_CPL_ON_HIGH_QID0, + "SGE IQID > 1023 received CPL for FL", -1, 0 }, + { ERR_BAD_DB_PIDX3, "SGE DBP 3 pidx increment too large", -1, + 0 }, + { ERR_BAD_DB_PIDX2, "SGE DBP 2 pidx increment too large", -1, + 0 }, + { ERR_BAD_DB_PIDX1, "SGE DBP 1 pidx increment too large", -1, + 0 }, + { ERR_BAD_DB_PIDX0, "SGE DBP 0 pidx increment too large", -1, + 0 }, + { ERR_ING_CTXT_PRIO, + "SGE too many priority ingress contexts", -1, 0 }, + { ERR_EGR_CTXT_PRIO, + "SGE too many priority egress contexts", -1, 0 }, + { INGRESS_SIZE_ERR, "SGE illegal ingress QID", -1, 0 }, + { EGRESS_SIZE_ERR, "SGE illegal egress QID", -1, 0 }, + { 0 } + }; + + v = (u64)t4_read_reg(adapter, SGE_INT_CAUSE1) | + ((u64)t4_read_reg(adapter, SGE_INT_CAUSE2) << 32); + if (v) { + dev_alert(adapter->pdev_dev, "SGE parity error (%#llx)\n", + (unsigned long long)v); + t4_write_reg(adapter, SGE_INT_CAUSE1, v); + t4_write_reg(adapter, SGE_INT_CAUSE2, v >> 32); + } + + if (t4_handle_intr_status(adapter, SGE_INT_CAUSE3, sge_intr_info) || + v != 0) + t4_fatal_err(adapter); +} + +/* + * CIM interrupt handler. + */ +static void cim_intr_handler(struct adapter *adapter) +{ + static const struct intr_info cim_intr_info[] = { + { PREFDROPINT, "CIM control register prefetch drop", -1, 1 }, + { OBQPARERR, "CIM OBQ parity error", -1, 1 }, + { IBQPARERR, "CIM IBQ parity error", -1, 1 }, + { MBUPPARERR, "CIM mailbox uP parity error", -1, 1 }, + { MBHOSTPARERR, "CIM mailbox host parity error", -1, 1 }, + { TIEQINPARERRINT, "CIM TIEQ outgoing parity error", -1, 1 }, + { TIEQOUTPARERRINT, "CIM TIEQ incoming parity error", -1, 1 }, + { 0 } + }; + static const struct intr_info cim_upintr_info[] = { + { RSVDSPACEINT, "CIM reserved space access", -1, 1 }, + { ILLTRANSINT, "CIM illegal transaction", -1, 1 }, + { ILLWRINT, "CIM illegal write", -1, 1 }, + { ILLRDINT, "CIM illegal read", -1, 1 }, + { ILLRDBEINT, "CIM illegal read BE", -1, 1 }, + { ILLWRBEINT, "CIM illegal write BE", -1, 1 }, + { SGLRDBOOTINT, "CIM single read from boot space", -1, 1 }, + { SGLWRBOOTINT, "CIM single write to boot space", -1, 1 }, + { BLKWRBOOTINT, "CIM block write to boot space", -1, 1 }, + { SGLRDFLASHINT, "CIM single read from flash space", -1, 1 }, + { SGLWRFLASHINT, "CIM single write to flash space", -1, 1 }, + { BLKWRFLASHINT, "CIM block write to flash space", -1, 1 }, + { SGLRDEEPROMINT, "CIM single EEPROM read", -1, 1 }, + { SGLWREEPROMINT, "CIM single EEPROM write", -1, 1 }, + { BLKRDEEPROMINT, "CIM block EEPROM read", -1, 1 }, + { BLKWREEPROMINT, "CIM block EEPROM write", -1, 1 }, + { SGLRDCTLINT , "CIM single read from CTL space", -1, 1 }, + { SGLWRCTLINT , "CIM single write to CTL space", -1, 1 }, + { BLKRDCTLINT , "CIM block read from CTL space", -1, 1 }, + { BLKWRCTLINT , "CIM block write to CTL space", -1, 1 }, + { SGLRDPLINT , "CIM single read from PL space", -1, 1 }, + { SGLWRPLINT , "CIM single write to PL space", -1, 1 }, + { BLKRDPLINT , "CIM block read from PL space", -1, 1 }, + { BLKWRPLINT , "CIM block write to PL space", -1, 1 }, + { REQOVRLOOKUPINT , "CIM request FIFO overwrite", -1, 1 }, + { RSPOVRLOOKUPINT , "CIM response FIFO overwrite", -1, 1 }, + { TIMEOUTINT , "CIM PIF timeout", -1, 1 }, + { TIMEOUTMAINT , "CIM PIF MA timeout", -1, 1 }, + { 0 } + }; + + int fat; + + if (t4_read_reg(adapter, MA_PCIE_FW) & FW_PCIE_FW_ERR) + t4_report_fw_error(adapter); + + fat = t4_handle_intr_status(adapter, CIM_HOST_INT_CAUSE, + cim_intr_info) + + t4_handle_intr_status(adapter, CIM_HOST_UPACC_INT_CAUSE, + cim_upintr_info); + if (fat) + t4_fatal_err(adapter); +} + +/* + * ULP RX interrupt handler. + */ +static void ulprx_intr_handler(struct adapter *adapter) +{ + static const struct intr_info ulprx_intr_info[] = { + { 0x1800000, "ULPRX context error", -1, 1 }, + { 0x7fffff, "ULPRX parity error", -1, 1 }, + { 0 } + }; + + if (t4_handle_intr_status(adapter, ULP_RX_INT_CAUSE, ulprx_intr_info)) + t4_fatal_err(adapter); +} + +/* + * ULP TX interrupt handler. + */ +static void ulptx_intr_handler(struct adapter *adapter) +{ + static const struct intr_info ulptx_intr_info[] = { + { PBL_BOUND_ERR_CH3, "ULPTX channel 3 PBL out of bounds", -1, + 0 }, + { PBL_BOUND_ERR_CH2, "ULPTX channel 2 PBL out of bounds", -1, + 0 }, + { PBL_BOUND_ERR_CH1, "ULPTX channel 1 PBL out of bounds", -1, + 0 }, + { PBL_BOUND_ERR_CH0, "ULPTX channel 0 PBL out of bounds", -1, + 0 }, + { 0xfffffff, "ULPTX parity error", -1, 1 }, + { 0 } + }; + + if (t4_handle_intr_status(adapter, ULP_TX_INT_CAUSE, ulptx_intr_info)) + t4_fatal_err(adapter); +} + +/* + * PM TX interrupt handler. + */ +static void pmtx_intr_handler(struct adapter *adapter) +{ + static const struct intr_info pmtx_intr_info[] = { + { PCMD_LEN_OVFL0, "PMTX channel 0 pcmd too large", -1, 1 }, + { PCMD_LEN_OVFL1, "PMTX channel 1 pcmd too large", -1, 1 }, + { PCMD_LEN_OVFL2, "PMTX channel 2 pcmd too large", -1, 1 }, + { ZERO_C_CMD_ERROR, "PMTX 0-length pcmd", -1, 1 }, + { PMTX_FRAMING_ERROR, "PMTX framing error", -1, 1 }, + { OESPI_PAR_ERROR, "PMTX oespi parity error", -1, 1 }, + { DB_OPTIONS_PAR_ERROR, "PMTX db_options parity error", -1, 1 }, + { ICSPI_PAR_ERROR, "PMTX icspi parity error", -1, 1 }, + { C_PCMD_PAR_ERROR, "PMTX c_pcmd parity error", -1, 1}, + { 0 } + }; + + if (t4_handle_intr_status(adapter, PM_TX_INT_CAUSE, pmtx_intr_info)) + t4_fatal_err(adapter); +} + +/* + * PM RX interrupt handler. + */ +static void pmrx_intr_handler(struct adapter *adapter) +{ + static const struct intr_info pmrx_intr_info[] = { + { ZERO_E_CMD_ERROR, "PMRX 0-length pcmd", -1, 1 }, + { PMRX_FRAMING_ERROR, "PMRX framing error", -1, 1 }, + { OCSPI_PAR_ERROR, "PMRX ocspi parity error", -1, 1 }, + { DB_OPTIONS_PAR_ERROR, "PMRX db_options parity error", -1, 1 }, + { IESPI_PAR_ERROR, "PMRX iespi parity error", -1, 1 }, + { E_PCMD_PAR_ERROR, "PMRX e_pcmd parity error", -1, 1}, + { 0 } + }; + + if (t4_handle_intr_status(adapter, PM_RX_INT_CAUSE, pmrx_intr_info)) + t4_fatal_err(adapter); +} + +/* + * CPL switch interrupt handler. + */ +static void cplsw_intr_handler(struct adapter *adapter) +{ + static const struct intr_info cplsw_intr_info[] = { + { CIM_OP_MAP_PERR, "CPLSW CIM op_map parity error", -1, 1 }, + { CIM_OVFL_ERROR, "CPLSW CIM overflow", -1, 1 }, + { TP_FRAMING_ERROR, "CPLSW TP framing error", -1, 1 }, + { SGE_FRAMING_ERROR, "CPLSW SGE framing error", -1, 1 }, + { CIM_FRAMING_ERROR, "CPLSW CIM framing error", -1, 1 }, + { ZERO_SWITCH_ERROR, "CPLSW no-switch error", -1, 1 }, + { 0 } + }; + + if (t4_handle_intr_status(adapter, CPL_INTR_CAUSE, cplsw_intr_info)) + t4_fatal_err(adapter); +} + +/* + * LE interrupt handler. + */ +static void le_intr_handler(struct adapter *adap) +{ + static const struct intr_info le_intr_info[] = { + { LIPMISS, "LE LIP miss", -1, 0 }, + { LIP0, "LE 0 LIP error", -1, 0 }, + { PARITYERR, "LE parity error", -1, 1 }, + { UNKNOWNCMD, "LE unknown command", -1, 1 }, + { REQQPARERR, "LE request queue parity error", -1, 1 }, + { 0 } + }; + + if (t4_handle_intr_status(adap, LE_DB_INT_CAUSE, le_intr_info)) + t4_fatal_err(adap); +} + +/* + * MPS interrupt handler. + */ +static void mps_intr_handler(struct adapter *adapter) +{ + static const struct intr_info mps_rx_intr_info[] = { + { 0xffffff, "MPS Rx parity error", -1, 1 }, + { 0 } + }; + static const struct intr_info mps_tx_intr_info[] = { + { TPFIFO, "MPS Tx TP FIFO parity error", -1, 1 }, + { NCSIFIFO, "MPS Tx NC-SI FIFO parity error", -1, 1 }, + { TXDATAFIFO, "MPS Tx data FIFO parity error", -1, 1 }, + { TXDESCFIFO, "MPS Tx desc FIFO parity error", -1, 1 }, + { BUBBLE, "MPS Tx underflow", -1, 1 }, + { SECNTERR, "MPS Tx SOP/EOP error", -1, 1 }, + { FRMERR, "MPS Tx framing error", -1, 1 }, + { 0 } + }; + static const struct intr_info mps_trc_intr_info[] = { + { FILTMEM, "MPS TRC filter parity error", -1, 1 }, + { PKTFIFO, "MPS TRC packet FIFO parity error", -1, 1 }, + { MISCPERR, "MPS TRC misc parity error", -1, 1 }, + { 0 } + }; + static const struct intr_info mps_stat_sram_intr_info[] = { + { 0x1fffff, "MPS statistics SRAM parity error", -1, 1 }, + { 0 } + }; + static const struct intr_info mps_stat_tx_intr_info[] = { + { 0xfffff, "MPS statistics Tx FIFO parity error", -1, 1 }, + { 0 } + }; + static const struct intr_info mps_stat_rx_intr_info[] = { + { 0xffffff, "MPS statistics Rx FIFO parity error", -1, 1 }, + { 0 } + }; + static const struct intr_info mps_cls_intr_info[] = { + { MATCHSRAM, "MPS match SRAM parity error", -1, 1 }, + { MATCHTCAM, "MPS match TCAM parity error", -1, 1 }, + { HASHSRAM, "MPS hash SRAM parity error", -1, 1 }, + { 0 } + }; + + int fat; + + fat = t4_handle_intr_status(adapter, MPS_RX_PERR_INT_CAUSE, + mps_rx_intr_info) + + t4_handle_intr_status(adapter, MPS_TX_INT_CAUSE, + mps_tx_intr_info) + + t4_handle_intr_status(adapter, MPS_TRC_INT_CAUSE, + mps_trc_intr_info) + + t4_handle_intr_status(adapter, MPS_STAT_PERR_INT_CAUSE_SRAM, + mps_stat_sram_intr_info) + + t4_handle_intr_status(adapter, MPS_STAT_PERR_INT_CAUSE_TX_FIFO, + mps_stat_tx_intr_info) + + t4_handle_intr_status(adapter, MPS_STAT_PERR_INT_CAUSE_RX_FIFO, + mps_stat_rx_intr_info) + + t4_handle_intr_status(adapter, MPS_CLS_INT_CAUSE, + mps_cls_intr_info); + + t4_write_reg(adapter, MPS_INT_CAUSE, CLSINT | TRCINT | + RXINT | TXINT | STATINT); + t4_read_reg(adapter, MPS_INT_CAUSE); /* flush */ + if (fat) + t4_fatal_err(adapter); +} + +#define MEM_INT_MASK (PERR_INT_CAUSE | ECC_CE_INT_CAUSE | ECC_UE_INT_CAUSE) + +/* + * EDC/MC interrupt handler. + */ +static void mem_intr_handler(struct adapter *adapter, int idx) +{ + static const char name[4][7] = { "EDC0", "EDC1", "MC/MC0", "MC1" }; + + unsigned int addr, cnt_addr, v; + + if (idx <= MEM_EDC1) { + addr = EDC_REG(EDC_INT_CAUSE, idx); + cnt_addr = EDC_REG(EDC_ECC_STATUS, idx); + } else if (idx == MEM_MC) { + if (is_t4(adapter->params.chip)) { + addr = MC_INT_CAUSE; + cnt_addr = MC_ECC_STATUS; + } else { + addr = MC_P_INT_CAUSE; + cnt_addr = MC_P_ECC_STATUS; + } + } else { + addr = MC_REG(MC_P_INT_CAUSE, 1); + cnt_addr = MC_REG(MC_P_ECC_STATUS, 1); + } + + v = t4_read_reg(adapter, addr) & MEM_INT_MASK; + if (v & PERR_INT_CAUSE) + dev_alert(adapter->pdev_dev, "%s FIFO parity error\n", + name[idx]); + if (v & ECC_CE_INT_CAUSE) { + u32 cnt = ECC_CECNT_GET(t4_read_reg(adapter, cnt_addr)); + + t4_write_reg(adapter, cnt_addr, ECC_CECNT_MASK); + if (printk_ratelimit()) + dev_warn(adapter->pdev_dev, + "%u %s correctable ECC data error%s\n", + cnt, name[idx], cnt > 1 ? "s" : ""); + } + if (v & ECC_UE_INT_CAUSE) + dev_alert(adapter->pdev_dev, + "%s uncorrectable ECC data error\n", name[idx]); + + t4_write_reg(adapter, addr, v); + if (v & (PERR_INT_CAUSE | ECC_UE_INT_CAUSE)) + t4_fatal_err(adapter); +} + +/* + * MA interrupt handler. + */ +static void ma_intr_handler(struct adapter *adap) +{ + u32 v, status = t4_read_reg(adap, MA_INT_CAUSE); + + if (status & MEM_PERR_INT_CAUSE) { + dev_alert(adap->pdev_dev, + "MA parity error, parity status %#x\n", + t4_read_reg(adap, MA_PARITY_ERROR_STATUS)); + if (is_t5(adap->params.chip)) + dev_alert(adap->pdev_dev, + "MA parity error, parity status %#x\n", + t4_read_reg(adap, + MA_PARITY_ERROR_STATUS2)); + } + if (status & MEM_WRAP_INT_CAUSE) { + v = t4_read_reg(adap, MA_INT_WRAP_STATUS); + dev_alert(adap->pdev_dev, "MA address wrap-around error by " + "client %u to address %#x\n", + MEM_WRAP_CLIENT_NUM_GET(v), + MEM_WRAP_ADDRESS_GET(v) << 4); + } + t4_write_reg(adap, MA_INT_CAUSE, status); + t4_fatal_err(adap); +} + +/* + * SMB interrupt handler. + */ +static void smb_intr_handler(struct adapter *adap) +{ + static const struct intr_info smb_intr_info[] = { + { MSTTXFIFOPARINT, "SMB master Tx FIFO parity error", -1, 1 }, + { MSTRXFIFOPARINT, "SMB master Rx FIFO parity error", -1, 1 }, + { SLVFIFOPARINT, "SMB slave FIFO parity error", -1, 1 }, + { 0 } + }; + + if (t4_handle_intr_status(adap, SMB_INT_CAUSE, smb_intr_info)) + t4_fatal_err(adap); +} + +/* + * NC-SI interrupt handler. + */ +static void ncsi_intr_handler(struct adapter *adap) +{ + static const struct intr_info ncsi_intr_info[] = { + { CIM_DM_PRTY_ERR, "NC-SI CIM parity error", -1, 1 }, + { MPS_DM_PRTY_ERR, "NC-SI MPS parity error", -1, 1 }, + { TXFIFO_PRTY_ERR, "NC-SI Tx FIFO parity error", -1, 1 }, + { RXFIFO_PRTY_ERR, "NC-SI Rx FIFO parity error", -1, 1 }, + { 0 } + }; + + if (t4_handle_intr_status(adap, NCSI_INT_CAUSE, ncsi_intr_info)) + t4_fatal_err(adap); +} + +/* + * XGMAC interrupt handler. + */ +static void xgmac_intr_handler(struct adapter *adap, int port) +{ + u32 v, int_cause_reg; + + if (is_t4(adap->params.chip)) + int_cause_reg = PORT_REG(port, XGMAC_PORT_INT_CAUSE); + else + int_cause_reg = T5_PORT_REG(port, MAC_PORT_INT_CAUSE); + + v = t4_read_reg(adap, int_cause_reg); + + v &= TXFIFO_PRTY_ERR | RXFIFO_PRTY_ERR; + if (!v) + return; + + if (v & TXFIFO_PRTY_ERR) + dev_alert(adap->pdev_dev, "XGMAC %d Tx FIFO parity error\n", + port); + if (v & RXFIFO_PRTY_ERR) + dev_alert(adap->pdev_dev, "XGMAC %d Rx FIFO parity error\n", + port); + t4_write_reg(adap, PORT_REG(port, XGMAC_PORT_INT_CAUSE), v); + t4_fatal_err(adap); +} + +/* + * PL interrupt handler. + */ +static void pl_intr_handler(struct adapter *adap) +{ + static const struct intr_info pl_intr_info[] = { + { FATALPERR, "T4 fatal parity error", -1, 1 }, + { PERRVFID, "PL VFID_MAP parity error", -1, 1 }, + { 0 } + }; + + if (t4_handle_intr_status(adap, PL_PL_INT_CAUSE, pl_intr_info)) + t4_fatal_err(adap); +} + +#define PF_INTR_MASK (PFSW) +#define GLBL_INTR_MASK (CIM | MPS | PL | PCIE | MC | EDC0 | \ + EDC1 | LE | TP | MA | PM_TX | PM_RX | ULP_RX | \ + CPL_SWITCH | SGE | ULP_TX) + +/** + * t4_slow_intr_handler - control path interrupt handler + * @adapter: the adapter + * + * T4 interrupt handler for non-data global interrupt events, e.g., errors. + * The designation 'slow' is because it involves register reads, while + * data interrupts typically don't involve any MMIOs. + */ +int t4_slow_intr_handler(struct adapter *adapter) +{ + u32 cause = t4_read_reg(adapter, PL_INT_CAUSE); + + if (!(cause & GLBL_INTR_MASK)) + return 0; + if (cause & CIM) + cim_intr_handler(adapter); + if (cause & MPS) + mps_intr_handler(adapter); + if (cause & NCSI) + ncsi_intr_handler(adapter); + if (cause & PL) + pl_intr_handler(adapter); + if (cause & SMB) + smb_intr_handler(adapter); + if (cause & XGMAC0) + xgmac_intr_handler(adapter, 0); + if (cause & XGMAC1) + xgmac_intr_handler(adapter, 1); + if (cause & XGMAC_KR0) + xgmac_intr_handler(adapter, 2); + if (cause & XGMAC_KR1) + xgmac_intr_handler(adapter, 3); + if (cause & PCIE) + pcie_intr_handler(adapter); + if (cause & MC) + mem_intr_handler(adapter, MEM_MC); + if (!is_t4(adapter->params.chip) && (cause & MC1)) + mem_intr_handler(adapter, MEM_MC1); + if (cause & EDC0) + mem_intr_handler(adapter, MEM_EDC0); + if (cause & EDC1) + mem_intr_handler(adapter, MEM_EDC1); + if (cause & LE) + le_intr_handler(adapter); + if (cause & TP) + tp_intr_handler(adapter); + if (cause & MA) + ma_intr_handler(adapter); + if (cause & PM_TX) + pmtx_intr_handler(adapter); + if (cause & PM_RX) + pmrx_intr_handler(adapter); + if (cause & ULP_RX) + ulprx_intr_handler(adapter); + if (cause & CPL_SWITCH) + cplsw_intr_handler(adapter); + if (cause & SGE) + sge_intr_handler(adapter); + if (cause & ULP_TX) + ulptx_intr_handler(adapter); + + /* Clear the interrupts just processed for which we are the master. */ + t4_write_reg(adapter, PL_INT_CAUSE, cause & GLBL_INTR_MASK); + (void) t4_read_reg(adapter, PL_INT_CAUSE); /* flush */ + return 1; +} + +/** + * t4_intr_enable - enable interrupts + * @adapter: the adapter whose interrupts should be enabled + * + * Enable PF-specific interrupts for the calling function and the top-level + * interrupt concentrator for global interrupts. Interrupts are already + * enabled at each module, here we just enable the roots of the interrupt + * hierarchies. + * + * Note: this function should be called only when the driver manages + * non PF-specific interrupts from the various HW modules. Only one PCI + * function at a time should be doing this. + */ +void t4_intr_enable(struct adapter *adapter) +{ + u32 pf = SOURCEPF_GET(t4_read_reg(adapter, PL_WHOAMI)); + + t4_write_reg(adapter, SGE_INT_ENABLE3, ERR_CPL_EXCEED_IQE_SIZE | + ERR_INVALID_CIDX_INC | ERR_CPL_OPCODE_0 | + ERR_DROPPED_DB | ERR_DATA_CPL_ON_HIGH_QID1 | + ERR_DATA_CPL_ON_HIGH_QID0 | ERR_BAD_DB_PIDX3 | + ERR_BAD_DB_PIDX2 | ERR_BAD_DB_PIDX1 | + ERR_BAD_DB_PIDX0 | ERR_ING_CTXT_PRIO | + ERR_EGR_CTXT_PRIO | INGRESS_SIZE_ERR | + DBFIFO_HP_INT | DBFIFO_LP_INT | + EGRESS_SIZE_ERR); + t4_write_reg(adapter, MYPF_REG(PL_PF_INT_ENABLE), PF_INTR_MASK); + t4_set_reg_field(adapter, PL_INT_MAP0, 0, 1 << pf); +} + +/** + * t4_intr_disable - disable interrupts + * @adapter: the adapter whose interrupts should be disabled + * + * Disable interrupts. We only disable the top-level interrupt + * concentrators. The caller must be a PCI function managing global + * interrupts. + */ +void t4_intr_disable(struct adapter *adapter) +{ + u32 pf = SOURCEPF_GET(t4_read_reg(adapter, PL_WHOAMI)); + + t4_write_reg(adapter, MYPF_REG(PL_PF_INT_ENABLE), 0); + t4_set_reg_field(adapter, PL_INT_MAP0, 1 << pf, 0); +} + +/** + * hash_mac_addr - return the hash value of a MAC address + * @addr: the 48-bit Ethernet MAC address + * + * Hashes a MAC address according to the hash function used by HW inexact + * (hash) address matching. + */ +static int hash_mac_addr(const u8 *addr) +{ + u32 a = ((u32)addr[0] << 16) | ((u32)addr[1] << 8) | addr[2]; + u32 b = ((u32)addr[3] << 16) | ((u32)addr[4] << 8) | addr[5]; + a ^= b; + a ^= (a >> 12); + a ^= (a >> 6); + return a & 0x3f; +} + +/** + * t4_config_rss_range - configure a portion of the RSS mapping table + * @adapter: the adapter + * @mbox: mbox to use for the FW command + * @viid: virtual interface whose RSS subtable is to be written + * @start: start entry in the table to write + * @n: how many table entries to write + * @rspq: values for the response queue lookup table + * @nrspq: number of values in @rspq + * + * Programs the selected part of the VI's RSS mapping table with the + * provided values. If @nrspq < @n the supplied values are used repeatedly + * until the full table range is populated. + * + * The caller must ensure the values in @rspq are in the range allowed for + * @viid. + */ +int t4_config_rss_range(struct adapter *adapter, int mbox, unsigned int viid, + int start, int n, const u16 *rspq, unsigned int nrspq) +{ + int ret; + const u16 *rsp = rspq; + const u16 *rsp_end = rspq + nrspq; + struct fw_rss_ind_tbl_cmd cmd; + + memset(&cmd, 0, sizeof(cmd)); + cmd.op_to_viid = htonl(FW_CMD_OP(FW_RSS_IND_TBL_CMD) | + FW_CMD_REQUEST | FW_CMD_WRITE | + FW_RSS_IND_TBL_CMD_VIID(viid)); + cmd.retval_len16 = htonl(FW_LEN16(cmd)); + + /* each fw_rss_ind_tbl_cmd takes up to 32 entries */ + while (n > 0) { + int nq = min(n, 32); + __be32 *qp = &cmd.iq0_to_iq2; + + cmd.niqid = htons(nq); + cmd.startidx = htons(start); + + start += nq; + n -= nq; + + while (nq > 0) { + unsigned int v; + + v = FW_RSS_IND_TBL_CMD_IQ0(*rsp); + if (++rsp >= rsp_end) + rsp = rspq; + v |= FW_RSS_IND_TBL_CMD_IQ1(*rsp); + if (++rsp >= rsp_end) + rsp = rspq; + v |= FW_RSS_IND_TBL_CMD_IQ2(*rsp); + if (++rsp >= rsp_end) + rsp = rspq; + + *qp++ = htonl(v); + nq -= 3; + } + + ret = t4_wr_mbox(adapter, mbox, &cmd, sizeof(cmd), NULL); + if (ret) + return ret; + } + return 0; +} + +/** + * t4_config_glbl_rss - configure the global RSS mode + * @adapter: the adapter + * @mbox: mbox to use for the FW command + * @mode: global RSS mode + * @flags: mode-specific flags + * + * Sets the global RSS mode. + */ +int t4_config_glbl_rss(struct adapter *adapter, int mbox, unsigned int mode, + unsigned int flags) +{ + struct fw_rss_glb_config_cmd c; + + memset(&c, 0, sizeof(c)); + c.op_to_write = htonl(FW_CMD_OP(FW_RSS_GLB_CONFIG_CMD) | + FW_CMD_REQUEST | FW_CMD_WRITE); + c.retval_len16 = htonl(FW_LEN16(c)); + if (mode == FW_RSS_GLB_CONFIG_CMD_MODE_MANUAL) { + c.u.manual.mode_pkd = htonl(FW_RSS_GLB_CONFIG_CMD_MODE(mode)); + } else if (mode == FW_RSS_GLB_CONFIG_CMD_MODE_BASICVIRTUAL) { + c.u.basicvirtual.mode_pkd = + htonl(FW_RSS_GLB_CONFIG_CMD_MODE(mode)); + c.u.basicvirtual.synmapen_to_hashtoeplitz = htonl(flags); + } else + return -EINVAL; + return t4_wr_mbox(adapter, mbox, &c, sizeof(c), NULL); +} + +/** + * t4_tp_get_tcp_stats - read TP's TCP MIB counters + * @adap: the adapter + * @v4: holds the TCP/IP counter values + * @v6: holds the TCP/IPv6 counter values + * + * Returns the values of TP's TCP/IP and TCP/IPv6 MIB counters. + * Either @v4 or @v6 may be %NULL to skip the corresponding stats. + */ +void t4_tp_get_tcp_stats(struct adapter *adap, struct tp_tcp_stats *v4, + struct tp_tcp_stats *v6) +{ + u32 val[TP_MIB_TCP_RXT_SEG_LO - TP_MIB_TCP_OUT_RST + 1]; + +#define STAT_IDX(x) ((TP_MIB_TCP_##x) - TP_MIB_TCP_OUT_RST) +#define STAT(x) val[STAT_IDX(x)] +#define STAT64(x) (((u64)STAT(x##_HI) << 32) | STAT(x##_LO)) + + if (v4) { + t4_read_indirect(adap, TP_MIB_INDEX, TP_MIB_DATA, val, + ARRAY_SIZE(val), TP_MIB_TCP_OUT_RST); + v4->tcpOutRsts = STAT(OUT_RST); + v4->tcpInSegs = STAT64(IN_SEG); + v4->tcpOutSegs = STAT64(OUT_SEG); + v4->tcpRetransSegs = STAT64(RXT_SEG); + } + if (v6) { + t4_read_indirect(adap, TP_MIB_INDEX, TP_MIB_DATA, val, + ARRAY_SIZE(val), TP_MIB_TCP_V6OUT_RST); + v6->tcpOutRsts = STAT(OUT_RST); + v6->tcpInSegs = STAT64(IN_SEG); + v6->tcpOutSegs = STAT64(OUT_SEG); + v6->tcpRetransSegs = STAT64(RXT_SEG); + } +#undef STAT64 +#undef STAT +#undef STAT_IDX +} + +/** + * t4_read_mtu_tbl - returns the values in the HW path MTU table + * @adap: the adapter + * @mtus: where to store the MTU values + * @mtu_log: where to store the MTU base-2 log (may be %NULL) + * + * Reads the HW path MTU table. + */ +void t4_read_mtu_tbl(struct adapter *adap, u16 *mtus, u8 *mtu_log) +{ + u32 v; + int i; + + for (i = 0; i < NMTUS; ++i) { + t4_write_reg(adap, TP_MTU_TABLE, + MTUINDEX(0xff) | MTUVALUE(i)); + v = t4_read_reg(adap, TP_MTU_TABLE); + mtus[i] = MTUVALUE_GET(v); + if (mtu_log) + mtu_log[i] = MTUWIDTH_GET(v); + } +} + +/** + * t4_tp_wr_bits_indirect - set/clear bits in an indirect TP register + * @adap: the adapter + * @addr: the indirect TP register address + * @mask: specifies the field within the register to modify + * @val: new value for the field + * + * Sets a field of an indirect TP register to the given value. + */ +void t4_tp_wr_bits_indirect(struct adapter *adap, unsigned int addr, + unsigned int mask, unsigned int val) +{ + t4_write_reg(adap, TP_PIO_ADDR, addr); + val |= t4_read_reg(adap, TP_PIO_DATA) & ~mask; + t4_write_reg(adap, TP_PIO_DATA, val); +} + +/** + * init_cong_ctrl - initialize congestion control parameters + * @a: the alpha values for congestion control + * @b: the beta values for congestion control + * + * Initialize the congestion control parameters. + */ +static void init_cong_ctrl(unsigned short *a, unsigned short *b) +{ + a[0] = a[1] = a[2] = a[3] = a[4] = a[5] = a[6] = a[7] = a[8] = 1; + a[9] = 2; + a[10] = 3; + a[11] = 4; + a[12] = 5; + a[13] = 6; + a[14] = 7; + a[15] = 8; + a[16] = 9; + a[17] = 10; + a[18] = 14; + a[19] = 17; + a[20] = 21; + a[21] = 25; + a[22] = 30; + a[23] = 35; + a[24] = 45; + a[25] = 60; + a[26] = 80; + a[27] = 100; + a[28] = 200; + a[29] = 300; + a[30] = 400; + a[31] = 500; + + b[0] = b[1] = b[2] = b[3] = b[4] = b[5] = b[6] = b[7] = b[8] = 0; + b[9] = b[10] = 1; + b[11] = b[12] = 2; + b[13] = b[14] = b[15] = b[16] = 3; + b[17] = b[18] = b[19] = b[20] = b[21] = 4; + b[22] = b[23] = b[24] = b[25] = b[26] = b[27] = 5; + b[28] = b[29] = 6; + b[30] = b[31] = 7; +} + +/* The minimum additive increment value for the congestion control table */ +#define CC_MIN_INCR 2U + +/** + * t4_load_mtus - write the MTU and congestion control HW tables + * @adap: the adapter + * @mtus: the values for the MTU table + * @alpha: the values for the congestion control alpha parameter + * @beta: the values for the congestion control beta parameter + * + * Write the HW MTU table with the supplied MTUs and the high-speed + * congestion control table with the supplied alpha, beta, and MTUs. + * We write the two tables together because the additive increments + * depend on the MTUs. + */ +void t4_load_mtus(struct adapter *adap, const unsigned short *mtus, + const unsigned short *alpha, const unsigned short *beta) +{ + static const unsigned int avg_pkts[NCCTRL_WIN] = { + 2, 6, 10, 14, 20, 28, 40, 56, 80, 112, 160, 224, 320, 448, 640, + 896, 1281, 1792, 2560, 3584, 5120, 7168, 10240, 14336, 20480, + 28672, 40960, 57344, 81920, 114688, 163840, 229376 + }; + + unsigned int i, w; + + for (i = 0; i < NMTUS; ++i) { + unsigned int mtu = mtus[i]; + unsigned int log2 = fls(mtu); + + if (!(mtu & ((1 << log2) >> 2))) /* round */ + log2--; + t4_write_reg(adap, TP_MTU_TABLE, MTUINDEX(i) | + MTUWIDTH(log2) | MTUVALUE(mtu)); + + for (w = 0; w < NCCTRL_WIN; ++w) { + unsigned int inc; + + inc = max(((mtu - 40) * alpha[w]) / avg_pkts[w], + CC_MIN_INCR); + + t4_write_reg(adap, TP_CCTRL_TABLE, (i << 21) | + (w << 16) | (beta[w] << 13) | inc); + } + } +} + +/** + * get_mps_bg_map - return the buffer groups associated with a port + * @adap: the adapter + * @idx: the port index + * + * Returns a bitmap indicating which MPS buffer groups are associated + * with the given port. Bit i is set if buffer group i is used by the + * port. + */ +static unsigned int get_mps_bg_map(struct adapter *adap, int idx) +{ + u32 n = NUMPORTS_GET(t4_read_reg(adap, MPS_CMN_CTL)); + + if (n == 0) + return idx == 0 ? 0xf : 0; + if (n == 1) + return idx < 2 ? (3 << (2 * idx)) : 0; + return 1 << idx; +} + +/** + * t4_get_port_type_description - return Port Type string description + * @port_type: firmware Port Type enumeration + */ +const char *t4_get_port_type_description(enum fw_port_type port_type) +{ + static const char *const port_type_description[] = { + "R XFI", + "R XAUI", + "T SGMII", + "T XFI", + "T XAUI", + "KX4", + "CX4", + "KX", + "KR", + "R SFP+", + "KR/KX", + "KR/KX/KX4", + "R QSFP_10G", + "", + "R QSFP", + "R BP40_BA", + }; + + if (port_type < ARRAY_SIZE(port_type_description)) + return port_type_description[port_type]; + return "UNKNOWN"; +} + +/** + * t4_get_port_stats - collect port statistics + * @adap: the adapter + * @idx: the port index + * @p: the stats structure to fill + * + * Collect statistics related to the given port from HW. + */ +void t4_get_port_stats(struct adapter *adap, int idx, struct port_stats *p) +{ + u32 bgmap = get_mps_bg_map(adap, idx); + +#define GET_STAT(name) \ + t4_read_reg64(adap, \ + (is_t4(adap->params.chip) ? PORT_REG(idx, MPS_PORT_STAT_##name##_L) : \ + T5_PORT_REG(idx, MPS_PORT_STAT_##name##_L))) +#define GET_STAT_COM(name) t4_read_reg64(adap, MPS_STAT_##name##_L) + + p->tx_octets = GET_STAT(TX_PORT_BYTES); + p->tx_frames = GET_STAT(TX_PORT_FRAMES); + p->tx_bcast_frames = GET_STAT(TX_PORT_BCAST); + p->tx_mcast_frames = GET_STAT(TX_PORT_MCAST); + p->tx_ucast_frames = GET_STAT(TX_PORT_UCAST); + p->tx_error_frames = GET_STAT(TX_PORT_ERROR); + p->tx_frames_64 = GET_STAT(TX_PORT_64B); + p->tx_frames_65_127 = GET_STAT(TX_PORT_65B_127B); + p->tx_frames_128_255 = GET_STAT(TX_PORT_128B_255B); + p->tx_frames_256_511 = GET_STAT(TX_PORT_256B_511B); + p->tx_frames_512_1023 = GET_STAT(TX_PORT_512B_1023B); + p->tx_frames_1024_1518 = GET_STAT(TX_PORT_1024B_1518B); + p->tx_frames_1519_max = GET_STAT(TX_PORT_1519B_MAX); + p->tx_drop = GET_STAT(TX_PORT_DROP); + p->tx_pause = GET_STAT(TX_PORT_PAUSE); + p->tx_ppp0 = GET_STAT(TX_PORT_PPP0); + p->tx_ppp1 = GET_STAT(TX_PORT_PPP1); + p->tx_ppp2 = GET_STAT(TX_PORT_PPP2); + p->tx_ppp3 = GET_STAT(TX_PORT_PPP3); + p->tx_ppp4 = GET_STAT(TX_PORT_PPP4); + p->tx_ppp5 = GET_STAT(TX_PORT_PPP5); + p->tx_ppp6 = GET_STAT(TX_PORT_PPP6); + p->tx_ppp7 = GET_STAT(TX_PORT_PPP7); + + p->rx_octets = GET_STAT(RX_PORT_BYTES); + p->rx_frames = GET_STAT(RX_PORT_FRAMES); + p->rx_bcast_frames = GET_STAT(RX_PORT_BCAST); + p->rx_mcast_frames = GET_STAT(RX_PORT_MCAST); + p->rx_ucast_frames = GET_STAT(RX_PORT_UCAST); + p->rx_too_long = GET_STAT(RX_PORT_MTU_ERROR); + p->rx_jabber = GET_STAT(RX_PORT_MTU_CRC_ERROR); + p->rx_fcs_err = GET_STAT(RX_PORT_CRC_ERROR); + p->rx_len_err = GET_STAT(RX_PORT_LEN_ERROR); + p->rx_symbol_err = GET_STAT(RX_PORT_SYM_ERROR); + p->rx_runt = GET_STAT(RX_PORT_LESS_64B); + p->rx_frames_64 = GET_STAT(RX_PORT_64B); + p->rx_frames_65_127 = GET_STAT(RX_PORT_65B_127B); + p->rx_frames_128_255 = GET_STAT(RX_PORT_128B_255B); + p->rx_frames_256_511 = GET_STAT(RX_PORT_256B_511B); + p->rx_frames_512_1023 = GET_STAT(RX_PORT_512B_1023B); + p->rx_frames_1024_1518 = GET_STAT(RX_PORT_1024B_1518B); + p->rx_frames_1519_max = GET_STAT(RX_PORT_1519B_MAX); + p->rx_pause = GET_STAT(RX_PORT_PAUSE); + p->rx_ppp0 = GET_STAT(RX_PORT_PPP0); + p->rx_ppp1 = GET_STAT(RX_PORT_PPP1); + p->rx_ppp2 = GET_STAT(RX_PORT_PPP2); + p->rx_ppp3 = GET_STAT(RX_PORT_PPP3); + p->rx_ppp4 = GET_STAT(RX_PORT_PPP4); + p->rx_ppp5 = GET_STAT(RX_PORT_PPP5); + p->rx_ppp6 = GET_STAT(RX_PORT_PPP6); + p->rx_ppp7 = GET_STAT(RX_PORT_PPP7); + + p->rx_ovflow0 = (bgmap & 1) ? GET_STAT_COM(RX_BG_0_MAC_DROP_FRAME) : 0; + p->rx_ovflow1 = (bgmap & 2) ? GET_STAT_COM(RX_BG_1_MAC_DROP_FRAME) : 0; + p->rx_ovflow2 = (bgmap & 4) ? GET_STAT_COM(RX_BG_2_MAC_DROP_FRAME) : 0; + p->rx_ovflow3 = (bgmap & 8) ? GET_STAT_COM(RX_BG_3_MAC_DROP_FRAME) : 0; + p->rx_trunc0 = (bgmap & 1) ? GET_STAT_COM(RX_BG_0_MAC_TRUNC_FRAME) : 0; + p->rx_trunc1 = (bgmap & 2) ? GET_STAT_COM(RX_BG_1_MAC_TRUNC_FRAME) : 0; + p->rx_trunc2 = (bgmap & 4) ? GET_STAT_COM(RX_BG_2_MAC_TRUNC_FRAME) : 0; + p->rx_trunc3 = (bgmap & 8) ? GET_STAT_COM(RX_BG_3_MAC_TRUNC_FRAME) : 0; + +#undef GET_STAT +#undef GET_STAT_COM +} + +/** + * t4_wol_magic_enable - enable/disable magic packet WoL + * @adap: the adapter + * @port: the physical port index + * @addr: MAC address expected in magic packets, %NULL to disable + * + * Enables/disables magic packet wake-on-LAN for the selected port. + */ +void t4_wol_magic_enable(struct adapter *adap, unsigned int port, + const u8 *addr) +{ + u32 mag_id_reg_l, mag_id_reg_h, port_cfg_reg; + + if (is_t4(adap->params.chip)) { + mag_id_reg_l = PORT_REG(port, XGMAC_PORT_MAGIC_MACID_LO); + mag_id_reg_h = PORT_REG(port, XGMAC_PORT_MAGIC_MACID_HI); + port_cfg_reg = PORT_REG(port, XGMAC_PORT_CFG2); + } else { + mag_id_reg_l = T5_PORT_REG(port, MAC_PORT_MAGIC_MACID_LO); + mag_id_reg_h = T5_PORT_REG(port, MAC_PORT_MAGIC_MACID_HI); + port_cfg_reg = T5_PORT_REG(port, MAC_PORT_CFG2); + } + + if (addr) { + t4_write_reg(adap, mag_id_reg_l, + (addr[2] << 24) | (addr[3] << 16) | + (addr[4] << 8) | addr[5]); + t4_write_reg(adap, mag_id_reg_h, + (addr[0] << 8) | addr[1]); + } + t4_set_reg_field(adap, port_cfg_reg, MAGICEN, + addr ? MAGICEN : 0); +} + +/** + * t4_wol_pat_enable - enable/disable pattern-based WoL + * @adap: the adapter + * @port: the physical port index + * @map: bitmap of which HW pattern filters to set + * @mask0: byte mask for bytes 0-63 of a packet + * @mask1: byte mask for bytes 64-127 of a packet + * @crc: Ethernet CRC for selected bytes + * @enable: enable/disable switch + * + * Sets the pattern filters indicated in @map to mask out the bytes + * specified in @mask0/@mask1 in received packets and compare the CRC of + * the resulting packet against @crc. If @enable is %true pattern-based + * WoL is enabled, otherwise disabled. + */ +int t4_wol_pat_enable(struct adapter *adap, unsigned int port, unsigned int map, + u64 mask0, u64 mask1, unsigned int crc, bool enable) +{ + int i; + u32 port_cfg_reg; + + if (is_t4(adap->params.chip)) + port_cfg_reg = PORT_REG(port, XGMAC_PORT_CFG2); + else + port_cfg_reg = T5_PORT_REG(port, MAC_PORT_CFG2); + + if (!enable) { + t4_set_reg_field(adap, port_cfg_reg, PATEN, 0); + return 0; + } + if (map > 0xff) + return -EINVAL; + +#define EPIO_REG(name) \ + (is_t4(adap->params.chip) ? PORT_REG(port, XGMAC_PORT_EPIO_##name) : \ + T5_PORT_REG(port, MAC_PORT_EPIO_##name)) + + t4_write_reg(adap, EPIO_REG(DATA1), mask0 >> 32); + t4_write_reg(adap, EPIO_REG(DATA2), mask1); + t4_write_reg(adap, EPIO_REG(DATA3), mask1 >> 32); + + for (i = 0; i < NWOL_PAT; i++, map >>= 1) { + if (!(map & 1)) + continue; + + /* write byte masks */ + t4_write_reg(adap, EPIO_REG(DATA0), mask0); + t4_write_reg(adap, EPIO_REG(OP), ADDRESS(i) | EPIOWR); + t4_read_reg(adap, EPIO_REG(OP)); /* flush */ + if (t4_read_reg(adap, EPIO_REG(OP)) & SF_BUSY) + return -ETIMEDOUT; + + /* write CRC */ + t4_write_reg(adap, EPIO_REG(DATA0), crc); + t4_write_reg(adap, EPIO_REG(OP), ADDRESS(i + 32) | EPIOWR); + t4_read_reg(adap, EPIO_REG(OP)); /* flush */ + if (t4_read_reg(adap, EPIO_REG(OP)) & SF_BUSY) + return -ETIMEDOUT; + } +#undef EPIO_REG + + t4_set_reg_field(adap, PORT_REG(port, XGMAC_PORT_CFG2), 0, PATEN); + return 0; +} + +/* t4_mk_filtdelwr - create a delete filter WR + * @ftid: the filter ID + * @wr: the filter work request to populate + * @qid: ingress queue to receive the delete notification + * + * Creates a filter work request to delete the supplied filter. If @qid is + * negative the delete notification is suppressed. + */ +void t4_mk_filtdelwr(unsigned int ftid, struct fw_filter_wr *wr, int qid) +{ + memset(wr, 0, sizeof(*wr)); + wr->op_pkd = htonl(FW_WR_OP(FW_FILTER_WR)); + wr->len16_pkd = htonl(FW_WR_LEN16(sizeof(*wr) / 16)); + wr->tid_to_iq = htonl(V_FW_FILTER_WR_TID(ftid) | + V_FW_FILTER_WR_NOREPLY(qid < 0)); + wr->del_filter_to_l2tix = htonl(F_FW_FILTER_WR_DEL_FILTER); + if (qid >= 0) + wr->rx_chan_rx_rpl_iq = htons(V_FW_FILTER_WR_RX_RPL_IQ(qid)); +} + +#define INIT_CMD(var, cmd, rd_wr) do { \ + (var).op_to_write = htonl(FW_CMD_OP(FW_##cmd##_CMD) | \ + FW_CMD_REQUEST | FW_CMD_##rd_wr); \ + (var).retval_len16 = htonl(FW_LEN16(var)); \ +} while (0) + +int t4_fwaddrspace_write(struct adapter *adap, unsigned int mbox, + u32 addr, u32 val) +{ + struct fw_ldst_cmd c; + + memset(&c, 0, sizeof(c)); + c.op_to_addrspace = htonl(FW_CMD_OP(FW_LDST_CMD) | FW_CMD_REQUEST | + FW_CMD_WRITE | + FW_LDST_CMD_ADDRSPACE(FW_LDST_ADDRSPC_FIRMWARE)); + c.cycles_to_len16 = htonl(FW_LEN16(c)); + c.u.addrval.addr = htonl(addr); + c.u.addrval.val = htonl(val); + + return t4_wr_mbox(adap, mbox, &c, sizeof(c), NULL); +} + +/** + * t4_mdio_rd - read a PHY register through MDIO + * @adap: the adapter + * @mbox: mailbox to use for the FW command + * @phy_addr: the PHY address + * @mmd: the PHY MMD to access (0 for clause 22 PHYs) + * @reg: the register to read + * @valp: where to store the value + * + * Issues a FW command through the given mailbox to read a PHY register. + */ +int t4_mdio_rd(struct adapter *adap, unsigned int mbox, unsigned int phy_addr, + unsigned int mmd, unsigned int reg, u16 *valp) +{ + int ret; + struct fw_ldst_cmd c; + + memset(&c, 0, sizeof(c)); + c.op_to_addrspace = htonl(FW_CMD_OP(FW_LDST_CMD) | FW_CMD_REQUEST | + FW_CMD_READ | FW_LDST_CMD_ADDRSPACE(FW_LDST_ADDRSPC_MDIO)); + c.cycles_to_len16 = htonl(FW_LEN16(c)); + c.u.mdio.paddr_mmd = htons(FW_LDST_CMD_PADDR(phy_addr) | + FW_LDST_CMD_MMD(mmd)); + c.u.mdio.raddr = htons(reg); + + ret = t4_wr_mbox(adap, mbox, &c, sizeof(c), &c); + if (ret == 0) + *valp = ntohs(c.u.mdio.rval); + return ret; +} + +/** + * t4_mdio_wr - write a PHY register through MDIO + * @adap: the adapter + * @mbox: mailbox to use for the FW command + * @phy_addr: the PHY address + * @mmd: the PHY MMD to access (0 for clause 22 PHYs) + * @reg: the register to write + * @valp: value to write + * + * Issues a FW command through the given mailbox to write a PHY register. + */ +int t4_mdio_wr(struct adapter *adap, unsigned int mbox, unsigned int phy_addr, + unsigned int mmd, unsigned int reg, u16 val) +{ + struct fw_ldst_cmd c; + + memset(&c, 0, sizeof(c)); + c.op_to_addrspace = htonl(FW_CMD_OP(FW_LDST_CMD) | FW_CMD_REQUEST | + FW_CMD_WRITE | FW_LDST_CMD_ADDRSPACE(FW_LDST_ADDRSPC_MDIO)); + c.cycles_to_len16 = htonl(FW_LEN16(c)); + c.u.mdio.paddr_mmd = htons(FW_LDST_CMD_PADDR(phy_addr) | + FW_LDST_CMD_MMD(mmd)); + c.u.mdio.raddr = htons(reg); + c.u.mdio.rval = htons(val); + + return t4_wr_mbox(adap, mbox, &c, sizeof(c), NULL); +} + +/** + * t4_sge_decode_idma_state - decode the idma state + * @adap: the adapter + * @state: the state idma is stuck in + */ +void t4_sge_decode_idma_state(struct adapter *adapter, int state) +{ + static const char * const t4_decode[] = { + "IDMA_IDLE", + "IDMA_PUSH_MORE_CPL_FIFO", + "IDMA_PUSH_CPL_MSG_HEADER_TO_FIFO", + "Not used", + "IDMA_PHYSADDR_SEND_PCIEHDR", + "IDMA_PHYSADDR_SEND_PAYLOAD_FIRST", + "IDMA_PHYSADDR_SEND_PAYLOAD", + "IDMA_SEND_FIFO_TO_IMSG", + "IDMA_FL_REQ_DATA_FL_PREP", + "IDMA_FL_REQ_DATA_FL", + "IDMA_FL_DROP", + "IDMA_FL_H_REQ_HEADER_FL", + "IDMA_FL_H_SEND_PCIEHDR", + "IDMA_FL_H_PUSH_CPL_FIFO", + "IDMA_FL_H_SEND_CPL", + "IDMA_FL_H_SEND_IP_HDR_FIRST", + "IDMA_FL_H_SEND_IP_HDR", + "IDMA_FL_H_REQ_NEXT_HEADER_FL", + "IDMA_FL_H_SEND_NEXT_PCIEHDR", + "IDMA_FL_H_SEND_IP_HDR_PADDING", + "IDMA_FL_D_SEND_PCIEHDR", + "IDMA_FL_D_SEND_CPL_AND_IP_HDR", + "IDMA_FL_D_REQ_NEXT_DATA_FL", + "IDMA_FL_SEND_PCIEHDR", + "IDMA_FL_PUSH_CPL_FIFO", + "IDMA_FL_SEND_CPL", + "IDMA_FL_SEND_PAYLOAD_FIRST", + "IDMA_FL_SEND_PAYLOAD", + "IDMA_FL_REQ_NEXT_DATA_FL", + "IDMA_FL_SEND_NEXT_PCIEHDR", + "IDMA_FL_SEND_PADDING", + "IDMA_FL_SEND_COMPLETION_TO_IMSG", + "IDMA_FL_SEND_FIFO_TO_IMSG", + "IDMA_FL_REQ_DATAFL_DONE", + "IDMA_FL_REQ_HEADERFL_DONE", + }; + static const char * const t5_decode[] = { + "IDMA_IDLE", + "IDMA_ALMOST_IDLE", + "IDMA_PUSH_MORE_CPL_FIFO", + "IDMA_PUSH_CPL_MSG_HEADER_TO_FIFO", + "IDMA_SGEFLRFLUSH_SEND_PCIEHDR", + "IDMA_PHYSADDR_SEND_PCIEHDR", + "IDMA_PHYSADDR_SEND_PAYLOAD_FIRST", + "IDMA_PHYSADDR_SEND_PAYLOAD", + "IDMA_SEND_FIFO_TO_IMSG", + "IDMA_FL_REQ_DATA_FL", + "IDMA_FL_DROP", + "IDMA_FL_DROP_SEND_INC", + "IDMA_FL_H_REQ_HEADER_FL", + "IDMA_FL_H_SEND_PCIEHDR", + "IDMA_FL_H_PUSH_CPL_FIFO", + "IDMA_FL_H_SEND_CPL", + "IDMA_FL_H_SEND_IP_HDR_FIRST", + "IDMA_FL_H_SEND_IP_HDR", + "IDMA_FL_H_REQ_NEXT_HEADER_FL", + "IDMA_FL_H_SEND_NEXT_PCIEHDR", + "IDMA_FL_H_SEND_IP_HDR_PADDING", + "IDMA_FL_D_SEND_PCIEHDR", + "IDMA_FL_D_SEND_CPL_AND_IP_HDR", + "IDMA_FL_D_REQ_NEXT_DATA_FL", + "IDMA_FL_SEND_PCIEHDR", + "IDMA_FL_PUSH_CPL_FIFO", + "IDMA_FL_SEND_CPL", + "IDMA_FL_SEND_PAYLOAD_FIRST", + "IDMA_FL_SEND_PAYLOAD", + "IDMA_FL_REQ_NEXT_DATA_FL", + "IDMA_FL_SEND_NEXT_PCIEHDR", + "IDMA_FL_SEND_PADDING", + "IDMA_FL_SEND_COMPLETION_TO_IMSG", + }; + static const u32 sge_regs[] = { + SGE_DEBUG_DATA_LOW_INDEX_2, + SGE_DEBUG_DATA_LOW_INDEX_3, + SGE_DEBUG_DATA_HIGH_INDEX_10, + }; + const char **sge_idma_decode; + int sge_idma_decode_nstates; + int i; + + if (is_t4(adapter->params.chip)) { + sge_idma_decode = (const char **)t4_decode; + sge_idma_decode_nstates = ARRAY_SIZE(t4_decode); + } else { + sge_idma_decode = (const char **)t5_decode; + sge_idma_decode_nstates = ARRAY_SIZE(t5_decode); + } + + if (state < sge_idma_decode_nstates) + CH_WARN(adapter, "idma state %s\n", sge_idma_decode[state]); + else + CH_WARN(adapter, "idma state %d unknown\n", state); + + for (i = 0; i < ARRAY_SIZE(sge_regs); i++) + CH_WARN(adapter, "SGE register %#x value %#x\n", + sge_regs[i], t4_read_reg(adapter, sge_regs[i])); +} + +/** + * t4_fw_hello - establish communication with FW + * @adap: the adapter + * @mbox: mailbox to use for the FW command + * @evt_mbox: mailbox to receive async FW events + * @master: specifies the caller's willingness to be the device master + * @state: returns the current device state (if non-NULL) + * + * Issues a command to establish communication with FW. Returns either + * an error (negative integer) or the mailbox of the Master PF. + */ +int t4_fw_hello(struct adapter *adap, unsigned int mbox, unsigned int evt_mbox, + enum dev_master master, enum dev_state *state) +{ + int ret; + struct fw_hello_cmd c; + u32 v; + unsigned int master_mbox; + int retries = FW_CMD_HELLO_RETRIES; + +retry: + memset(&c, 0, sizeof(c)); + INIT_CMD(c, HELLO, WRITE); + c.err_to_clearinit = htonl( + FW_HELLO_CMD_MASTERDIS(master == MASTER_CANT) | + FW_HELLO_CMD_MASTERFORCE(master == MASTER_MUST) | + FW_HELLO_CMD_MBMASTER(master == MASTER_MUST ? mbox : + FW_HELLO_CMD_MBMASTER_MASK) | + FW_HELLO_CMD_MBASYNCNOT(evt_mbox) | + FW_HELLO_CMD_STAGE(fw_hello_cmd_stage_os) | + FW_HELLO_CMD_CLEARINIT); + + /* + * Issue the HELLO command to the firmware. If it's not successful + * but indicates that we got a "busy" or "timeout" condition, retry + * the HELLO until we exhaust our retry limit. If we do exceed our + * retry limit, check to see if the firmware left us any error + * information and report that if so. + */ + ret = t4_wr_mbox(adap, mbox, &c, sizeof(c), &c); + if (ret < 0) { + if ((ret == -EBUSY || ret == -ETIMEDOUT) && retries-- > 0) + goto retry; + if (t4_read_reg(adap, MA_PCIE_FW) & FW_PCIE_FW_ERR) + t4_report_fw_error(adap); + return ret; + } + + v = ntohl(c.err_to_clearinit); + master_mbox = FW_HELLO_CMD_MBMASTER_GET(v); + if (state) { + if (v & FW_HELLO_CMD_ERR) + *state = DEV_STATE_ERR; + else if (v & FW_HELLO_CMD_INIT) + *state = DEV_STATE_INIT; + else + *state = DEV_STATE_UNINIT; + } + + /* + * If we're not the Master PF then we need to wait around for the + * Master PF Driver to finish setting up the adapter. + * + * Note that we also do this wait if we're a non-Master-capable PF and + * there is no current Master PF; a Master PF may show up momentarily + * and we wouldn't want to fail pointlessly. (This can happen when an + * OS loads lots of different drivers rapidly at the same time). In + * this case, the Master PF returned by the firmware will be + * FW_PCIE_FW_MASTER_MASK so the test below will work ... + */ + if ((v & (FW_HELLO_CMD_ERR|FW_HELLO_CMD_INIT)) == 0 && + master_mbox != mbox) { + int waiting = FW_CMD_HELLO_TIMEOUT; + + /* + * Wait for the firmware to either indicate an error or + * initialized state. If we see either of these we bail out + * and report the issue to the caller. If we exhaust the + * "hello timeout" and we haven't exhausted our retries, try + * again. Otherwise bail with a timeout error. + */ + for (;;) { + u32 pcie_fw; + + msleep(50); + waiting -= 50; + + /* + * If neither Error nor Initialialized are indicated + * by the firmware keep waiting till we exaust our + * timeout ... and then retry if we haven't exhausted + * our retries ... + */ + pcie_fw = t4_read_reg(adap, MA_PCIE_FW); + if (!(pcie_fw & (FW_PCIE_FW_ERR|FW_PCIE_FW_INIT))) { + if (waiting <= 0) { + if (retries-- > 0) + goto retry; + + return -ETIMEDOUT; + } + continue; + } + + /* + * We either have an Error or Initialized condition + * report errors preferentially. + */ + if (state) { + if (pcie_fw & FW_PCIE_FW_ERR) + *state = DEV_STATE_ERR; + else if (pcie_fw & FW_PCIE_FW_INIT) + *state = DEV_STATE_INIT; + } + + /* + * If we arrived before a Master PF was selected and + * there's not a valid Master PF, grab its identity + * for our caller. + */ + if (master_mbox == FW_PCIE_FW_MASTER_MASK && + (pcie_fw & FW_PCIE_FW_MASTER_VLD)) + master_mbox = FW_PCIE_FW_MASTER_GET(pcie_fw); + break; + } + } + + return master_mbox; +} + +/** + * t4_fw_bye - end communication with FW + * @adap: the adapter + * @mbox: mailbox to use for the FW command + * + * Issues a command to terminate communication with FW. + */ +int t4_fw_bye(struct adapter *adap, unsigned int mbox) +{ + struct fw_bye_cmd c; + + memset(&c, 0, sizeof(c)); + INIT_CMD(c, BYE, WRITE); + return t4_wr_mbox(adap, mbox, &c, sizeof(c), NULL); +} + +/** + * t4_init_cmd - ask FW to initialize the device + * @adap: the adapter + * @mbox: mailbox to use for the FW command + * + * Issues a command to FW to partially initialize the device. This + * performs initialization that generally doesn't depend on user input. + */ +int t4_early_init(struct adapter *adap, unsigned int mbox) +{ + struct fw_initialize_cmd c; + + memset(&c, 0, sizeof(c)); + INIT_CMD(c, INITIALIZE, WRITE); + return t4_wr_mbox(adap, mbox, &c, sizeof(c), NULL); +} + +/** + * t4_fw_reset - issue a reset to FW + * @adap: the adapter + * @mbox: mailbox to use for the FW command + * @reset: specifies the type of reset to perform + * + * Issues a reset command of the specified type to FW. + */ +int t4_fw_reset(struct adapter *adap, unsigned int mbox, int reset) +{ + struct fw_reset_cmd c; + + memset(&c, 0, sizeof(c)); + INIT_CMD(c, RESET, WRITE); + c.val = htonl(reset); + return t4_wr_mbox(adap, mbox, &c, sizeof(c), NULL); +} + +/** + * t4_fw_halt - issue a reset/halt to FW and put uP into RESET + * @adap: the adapter + * @mbox: mailbox to use for the FW RESET command (if desired) + * @force: force uP into RESET even if FW RESET command fails + * + * Issues a RESET command to firmware (if desired) with a HALT indication + * and then puts the microprocessor into RESET state. The RESET command + * will only be issued if a legitimate mailbox is provided (mbox <= + * FW_PCIE_FW_MASTER_MASK). + * + * This is generally used in order for the host to safely manipulate the + * adapter without fear of conflicting with whatever the firmware might + * be doing. The only way out of this state is to RESTART the firmware + * ... + */ +static int t4_fw_halt(struct adapter *adap, unsigned int mbox, int force) +{ + int ret = 0; + + /* + * If a legitimate mailbox is provided, issue a RESET command + * with a HALT indication. + */ + if (mbox <= FW_PCIE_FW_MASTER_MASK) { + struct fw_reset_cmd c; + + memset(&c, 0, sizeof(c)); + INIT_CMD(c, RESET, WRITE); + c.val = htonl(PIORST | PIORSTMODE); + c.halt_pkd = htonl(FW_RESET_CMD_HALT(1U)); + ret = t4_wr_mbox(adap, mbox, &c, sizeof(c), NULL); + } + + /* + * Normally we won't complete the operation if the firmware RESET + * command fails but if our caller insists we'll go ahead and put the + * uP into RESET. This can be useful if the firmware is hung or even + * missing ... We'll have to take the risk of putting the uP into + * RESET without the cooperation of firmware in that case. + * + * We also force the firmware's HALT flag to be on in case we bypassed + * the firmware RESET command above or we're dealing with old firmware + * which doesn't have the HALT capability. This will serve as a flag + * for the incoming firmware to know that it's coming out of a HALT + * rather than a RESET ... if it's new enough to understand that ... + */ + if (ret == 0 || force) { + t4_set_reg_field(adap, CIM_BOOT_CFG, UPCRST, UPCRST); + t4_set_reg_field(adap, PCIE_FW, FW_PCIE_FW_HALT, + FW_PCIE_FW_HALT); + } + + /* + * And we always return the result of the firmware RESET command + * even when we force the uP into RESET ... + */ + return ret; +} + +/** + * t4_fw_restart - restart the firmware by taking the uP out of RESET + * @adap: the adapter + * @reset: if we want to do a RESET to restart things + * + * Restart firmware previously halted by t4_fw_halt(). On successful + * return the previous PF Master remains as the new PF Master and there + * is no need to issue a new HELLO command, etc. + * + * We do this in two ways: + * + * 1. If we're dealing with newer firmware we'll simply want to take + * the chip's microprocessor out of RESET. This will cause the + * firmware to start up from its start vector. And then we'll loop + * until the firmware indicates it's started again (PCIE_FW.HALT + * reset to 0) or we timeout. + * + * 2. If we're dealing with older firmware then we'll need to RESET + * the chip since older firmware won't recognize the PCIE_FW.HALT + * flag and automatically RESET itself on startup. + */ +static int t4_fw_restart(struct adapter *adap, unsigned int mbox, int reset) +{ + if (reset) { + /* + * Since we're directing the RESET instead of the firmware + * doing it automatically, we need to clear the PCIE_FW.HALT + * bit. + */ + t4_set_reg_field(adap, PCIE_FW, FW_PCIE_FW_HALT, 0); + + /* + * If we've been given a valid mailbox, first try to get the + * firmware to do the RESET. If that works, great and we can + * return success. Otherwise, if we haven't been given a + * valid mailbox or the RESET command failed, fall back to + * hitting the chip with a hammer. + */ + if (mbox <= FW_PCIE_FW_MASTER_MASK) { + t4_set_reg_field(adap, CIM_BOOT_CFG, UPCRST, 0); + msleep(100); + if (t4_fw_reset(adap, mbox, + PIORST | PIORSTMODE) == 0) + return 0; + } + + t4_write_reg(adap, PL_RST, PIORST | PIORSTMODE); + msleep(2000); + } else { + int ms; + + t4_set_reg_field(adap, CIM_BOOT_CFG, UPCRST, 0); + for (ms = 0; ms < FW_CMD_MAX_TIMEOUT; ) { + if (!(t4_read_reg(adap, PCIE_FW) & FW_PCIE_FW_HALT)) + return 0; + msleep(100); + ms += 100; + } + return -ETIMEDOUT; + } + return 0; +} + +/** + * t4_fw_upgrade - perform all of the steps necessary to upgrade FW + * @adap: the adapter + * @mbox: mailbox to use for the FW RESET command (if desired) + * @fw_data: the firmware image to write + * @size: image size + * @force: force upgrade even if firmware doesn't cooperate + * + * Perform all of the steps necessary for upgrading an adapter's + * firmware image. Normally this requires the cooperation of the + * existing firmware in order to halt all existing activities + * but if an invalid mailbox token is passed in we skip that step + * (though we'll still put the adapter microprocessor into RESET in + * that case). + * + * On successful return the new firmware will have been loaded and + * the adapter will have been fully RESET losing all previous setup + * state. On unsuccessful return the adapter may be completely hosed ... + * positive errno indicates that the adapter is ~probably~ intact, a + * negative errno indicates that things are looking bad ... + */ +int t4_fw_upgrade(struct adapter *adap, unsigned int mbox, + const u8 *fw_data, unsigned int size, int force) +{ + const struct fw_hdr *fw_hdr = (const struct fw_hdr *)fw_data; + int reset, ret; + + ret = t4_fw_halt(adap, mbox, force); + if (ret < 0 && !force) + return ret; + + ret = t4_load_fw(adap, fw_data, size); + if (ret < 0) + return ret; + + /* + * Older versions of the firmware don't understand the new + * PCIE_FW.HALT flag and so won't know to perform a RESET when they + * restart. So for newly loaded older firmware we'll have to do the + * RESET for it so it starts up on a clean slate. We can tell if + * the newly loaded firmware will handle this right by checking + * its header flags to see if it advertises the capability. + */ + reset = ((ntohl(fw_hdr->flags) & FW_HDR_FLAGS_RESET_HALT) == 0); + return t4_fw_restart(adap, mbox, reset); +} + +/** + * t4_fixup_host_params - fix up host-dependent parameters + * @adap: the adapter + * @page_size: the host's Base Page Size + * @cache_line_size: the host's Cache Line Size + * + * Various registers in T4 contain values which are dependent on the + * host's Base Page and Cache Line Sizes. This function will fix all of + * those registers with the appropriate values as passed in ... + */ +int t4_fixup_host_params(struct adapter *adap, unsigned int page_size, + unsigned int cache_line_size) +{ + unsigned int page_shift = fls(page_size) - 1; + unsigned int sge_hps = page_shift - 10; + unsigned int stat_len = cache_line_size > 64 ? 128 : 64; + unsigned int fl_align = cache_line_size < 32 ? 32 : cache_line_size; + unsigned int fl_align_log = fls(fl_align) - 1; + + t4_write_reg(adap, SGE_HOST_PAGE_SIZE, + HOSTPAGESIZEPF0(sge_hps) | + HOSTPAGESIZEPF1(sge_hps) | + HOSTPAGESIZEPF2(sge_hps) | + HOSTPAGESIZEPF3(sge_hps) | + HOSTPAGESIZEPF4(sge_hps) | + HOSTPAGESIZEPF5(sge_hps) | + HOSTPAGESIZEPF6(sge_hps) | + HOSTPAGESIZEPF7(sge_hps)); + + if (is_t4(adap->params.chip)) { + t4_set_reg_field(adap, SGE_CONTROL, + INGPADBOUNDARY_MASK | + EGRSTATUSPAGESIZE_MASK, + INGPADBOUNDARY(fl_align_log - 5) | + EGRSTATUSPAGESIZE(stat_len != 64)); + } else { + /* T5 introduced the separation of the Free List Padding and + * Packing Boundaries. Thus, we can select a smaller Padding + * Boundary to avoid uselessly chewing up PCIe Link and Memory + * Bandwidth, and use a Packing Boundary which is large enough + * to avoid false sharing between CPUs, etc. + * + * For the PCI Link, the smaller the Padding Boundary the + * better. For the Memory Controller, a smaller Padding + * Boundary is better until we cross under the Memory Line + * Size (the minimum unit of transfer to/from Memory). If we + * have a Padding Boundary which is smaller than the Memory + * Line Size, that'll involve a Read-Modify-Write cycle on the + * Memory Controller which is never good. For T5 the smallest + * Padding Boundary which we can select is 32 bytes which is + * larger than any known Memory Controller Line Size so we'll + * use that. + * + * T5 has a different interpretation of the "0" value for the + * Packing Boundary. This corresponds to 16 bytes instead of + * the expected 32 bytes. We never have a Packing Boundary + * less than 32 bytes so we can't use that special value but + * on the other hand, if we wanted 32 bytes, the best we can + * really do is 64 bytes. + */ + if (fl_align <= 32) { + fl_align = 64; + fl_align_log = 6; + } + t4_set_reg_field(adap, SGE_CONTROL, + INGPADBOUNDARY_MASK | + EGRSTATUSPAGESIZE_MASK, + INGPADBOUNDARY(INGPCIEBOUNDARY_32B_X) | + EGRSTATUSPAGESIZE(stat_len != 64)); + t4_set_reg_field(adap, SGE_CONTROL2_A, + INGPACKBOUNDARY_V(INGPACKBOUNDARY_M), + INGPACKBOUNDARY_V(fl_align_log - + INGPACKBOUNDARY_SHIFT_X)); + } + /* + * Adjust various SGE Free List Host Buffer Sizes. + * + * This is something of a crock since we're using fixed indices into + * the array which are also known by the sge.c code and the T4 + * Firmware Configuration File. We need to come up with a much better + * approach to managing this array. For now, the first four entries + * are: + * + * 0: Host Page Size + * 1: 64KB + * 2: Buffer size corresponding to 1500 byte MTU (unpacked mode) + * 3: Buffer size corresponding to 9000 byte MTU (unpacked mode) + * + * For the single-MTU buffers in unpacked mode we need to include + * space for the SGE Control Packet Shift, 14 byte Ethernet header, + * possible 4 byte VLAN tag, all rounded up to the next Ingress Packet + * Padding boundry. All of these are accommodated in the Factory + * Default Firmware Configuration File but we need to adjust it for + * this host's cache line size. + */ + t4_write_reg(adap, SGE_FL_BUFFER_SIZE0, page_size); + t4_write_reg(adap, SGE_FL_BUFFER_SIZE2, + (t4_read_reg(adap, SGE_FL_BUFFER_SIZE2) + fl_align-1) + & ~(fl_align-1)); + t4_write_reg(adap, SGE_FL_BUFFER_SIZE3, + (t4_read_reg(adap, SGE_FL_BUFFER_SIZE3) + fl_align-1) + & ~(fl_align-1)); + + t4_write_reg(adap, ULP_RX_TDDP_PSZ, HPZ0(page_shift - 12)); + + return 0; +} + +/** + * t4_fw_initialize - ask FW to initialize the device + * @adap: the adapter + * @mbox: mailbox to use for the FW command + * + * Issues a command to FW to partially initialize the device. This + * performs initialization that generally doesn't depend on user input. + */ +int t4_fw_initialize(struct adapter *adap, unsigned int mbox) +{ + struct fw_initialize_cmd c; + + memset(&c, 0, sizeof(c)); + INIT_CMD(c, INITIALIZE, WRITE); + return t4_wr_mbox(adap, mbox, &c, sizeof(c), NULL); +} + +/** + * t4_query_params - query FW or device parameters + * @adap: the adapter + * @mbox: mailbox to use for the FW command + * @pf: the PF + * @vf: the VF + * @nparams: the number of parameters + * @params: the parameter names + * @val: the parameter values + * + * Reads the value of FW or device parameters. Up to 7 parameters can be + * queried at once. + */ +int t4_query_params(struct adapter *adap, unsigned int mbox, unsigned int pf, + unsigned int vf, unsigned int nparams, const u32 *params, + u32 *val) +{ + int i, ret; + struct fw_params_cmd c; + __be32 *p = &c.param[0].mnem; + + if (nparams > 7) + return -EINVAL; + + memset(&c, 0, sizeof(c)); + c.op_to_vfn = htonl(FW_CMD_OP(FW_PARAMS_CMD) | FW_CMD_REQUEST | + FW_CMD_READ | FW_PARAMS_CMD_PFN(pf) | + FW_PARAMS_CMD_VFN(vf)); + c.retval_len16 = htonl(FW_LEN16(c)); + for (i = 0; i < nparams; i++, p += 2) + *p = htonl(*params++); + + ret = t4_wr_mbox(adap, mbox, &c, sizeof(c), &c); + if (ret == 0) + for (i = 0, p = &c.param[0].val; i < nparams; i++, p += 2) + *val++ = ntohl(*p); + return ret; +} + +/** + * t4_set_params_nosleep - sets FW or device parameters + * @adap: the adapter + * @mbox: mailbox to use for the FW command + * @pf: the PF + * @vf: the VF + * @nparams: the number of parameters + * @params: the parameter names + * @val: the parameter values + * + * Does not ever sleep + * Sets the value of FW or device parameters. Up to 7 parameters can be + * specified at once. + */ +int t4_set_params_nosleep(struct adapter *adap, unsigned int mbox, + unsigned int pf, unsigned int vf, + unsigned int nparams, const u32 *params, + const u32 *val) +{ + struct fw_params_cmd c; + __be32 *p = &c.param[0].mnem; + + if (nparams > 7) + return -EINVAL; + + memset(&c, 0, sizeof(c)); + c.op_to_vfn = cpu_to_be32(FW_CMD_OP(FW_PARAMS_CMD) | + FW_CMD_REQUEST | FW_CMD_WRITE | + FW_PARAMS_CMD_PFN(pf) | + FW_PARAMS_CMD_VFN(vf)); + c.retval_len16 = cpu_to_be32(FW_LEN16(c)); + + while (nparams--) { + *p++ = cpu_to_be32(*params++); + *p++ = cpu_to_be32(*val++); + } + + return t4_wr_mbox_ns(adap, mbox, &c, sizeof(c), NULL); +} + +/** + * t4_set_params - sets FW or device parameters + * @adap: the adapter + * @mbox: mailbox to use for the FW command + * @pf: the PF + * @vf: the VF + * @nparams: the number of parameters + * @params: the parameter names + * @val: the parameter values + * + * Sets the value of FW or device parameters. Up to 7 parameters can be + * specified at once. + */ +int t4_set_params(struct adapter *adap, unsigned int mbox, unsigned int pf, + unsigned int vf, unsigned int nparams, const u32 *params, + const u32 *val) +{ + struct fw_params_cmd c; + __be32 *p = &c.param[0].mnem; + + if (nparams > 7) + return -EINVAL; + + memset(&c, 0, sizeof(c)); + c.op_to_vfn = htonl(FW_CMD_OP(FW_PARAMS_CMD) | FW_CMD_REQUEST | + FW_CMD_WRITE | FW_PARAMS_CMD_PFN(pf) | + FW_PARAMS_CMD_VFN(vf)); + c.retval_len16 = htonl(FW_LEN16(c)); + while (nparams--) { + *p++ = htonl(*params++); + *p++ = htonl(*val++); + } + + return t4_wr_mbox(adap, mbox, &c, sizeof(c), NULL); +} + +/** + * t4_cfg_pfvf - configure PF/VF resource limits + * @adap: the adapter + * @mbox: mailbox to use for the FW command + * @pf: the PF being configured + * @vf: the VF being configured + * @txq: the max number of egress queues + * @txq_eth_ctrl: the max number of egress Ethernet or control queues + * @rxqi: the max number of interrupt-capable ingress queues + * @rxq: the max number of interruptless ingress queues + * @tc: the PCI traffic class + * @vi: the max number of virtual interfaces + * @cmask: the channel access rights mask for the PF/VF + * @pmask: the port access rights mask for the PF/VF + * @nexact: the maximum number of exact MPS filters + * @rcaps: read capabilities + * @wxcaps: write/execute capabilities + * + * Configures resource limits and capabilities for a physical or virtual + * function. + */ +int t4_cfg_pfvf(struct adapter *adap, unsigned int mbox, unsigned int pf, + unsigned int vf, unsigned int txq, unsigned int txq_eth_ctrl, + unsigned int rxqi, unsigned int rxq, unsigned int tc, + unsigned int vi, unsigned int cmask, unsigned int pmask, + unsigned int nexact, unsigned int rcaps, unsigned int wxcaps) +{ + struct fw_pfvf_cmd c; + + memset(&c, 0, sizeof(c)); + c.op_to_vfn = htonl(FW_CMD_OP(FW_PFVF_CMD) | FW_CMD_REQUEST | + FW_CMD_WRITE | FW_PFVF_CMD_PFN(pf) | + FW_PFVF_CMD_VFN(vf)); + c.retval_len16 = htonl(FW_LEN16(c)); + c.niqflint_niq = htonl(FW_PFVF_CMD_NIQFLINT(rxqi) | + FW_PFVF_CMD_NIQ(rxq)); + c.type_to_neq = htonl(FW_PFVF_CMD_CMASK(cmask) | + FW_PFVF_CMD_PMASK(pmask) | + FW_PFVF_CMD_NEQ(txq)); + c.tc_to_nexactf = htonl(FW_PFVF_CMD_TC(tc) | FW_PFVF_CMD_NVI(vi) | + FW_PFVF_CMD_NEXACTF(nexact)); + c.r_caps_to_nethctrl = htonl(FW_PFVF_CMD_R_CAPS(rcaps) | + FW_PFVF_CMD_WX_CAPS(wxcaps) | + FW_PFVF_CMD_NETHCTRL(txq_eth_ctrl)); + return t4_wr_mbox(adap, mbox, &c, sizeof(c), NULL); +} + +/** + * t4_alloc_vi - allocate a virtual interface + * @adap: the adapter + * @mbox: mailbox to use for the FW command + * @port: physical port associated with the VI + * @pf: the PF owning the VI + * @vf: the VF owning the VI + * @nmac: number of MAC addresses needed (1 to 5) + * @mac: the MAC addresses of the VI + * @rss_size: size of RSS table slice associated with this VI + * + * Allocates a virtual interface for the given physical port. If @mac is + * not %NULL it contains the MAC addresses of the VI as assigned by FW. + * @mac should be large enough to hold @nmac Ethernet addresses, they are + * stored consecutively so the space needed is @nmac * 6 bytes. + * Returns a negative error number or the non-negative VI id. + */ +int t4_alloc_vi(struct adapter *adap, unsigned int mbox, unsigned int port, + unsigned int pf, unsigned int vf, unsigned int nmac, u8 *mac, + unsigned int *rss_size) +{ + int ret; + struct fw_vi_cmd c; + + memset(&c, 0, sizeof(c)); + c.op_to_vfn = htonl(FW_CMD_OP(FW_VI_CMD) | FW_CMD_REQUEST | + FW_CMD_WRITE | FW_CMD_EXEC | + FW_VI_CMD_PFN(pf) | FW_VI_CMD_VFN(vf)); + c.alloc_to_len16 = htonl(FW_VI_CMD_ALLOC | FW_LEN16(c)); + c.portid_pkd = FW_VI_CMD_PORTID(port); + c.nmac = nmac - 1; + + ret = t4_wr_mbox(adap, mbox, &c, sizeof(c), &c); + if (ret) + return ret; + + if (mac) { + memcpy(mac, c.mac, sizeof(c.mac)); + switch (nmac) { + case 5: + memcpy(mac + 24, c.nmac3, sizeof(c.nmac3)); + case 4: + memcpy(mac + 18, c.nmac2, sizeof(c.nmac2)); + case 3: + memcpy(mac + 12, c.nmac1, sizeof(c.nmac1)); + case 2: + memcpy(mac + 6, c.nmac0, sizeof(c.nmac0)); + } + } + if (rss_size) + *rss_size = FW_VI_CMD_RSSSIZE_GET(ntohs(c.rsssize_pkd)); + return FW_VI_CMD_VIID_GET(ntohs(c.type_viid)); +} + +/** + * t4_set_rxmode - set Rx properties of a virtual interface + * @adap: the adapter + * @mbox: mailbox to use for the FW command + * @viid: the VI id + * @mtu: the new MTU or -1 + * @promisc: 1 to enable promiscuous mode, 0 to disable it, -1 no change + * @all_multi: 1 to enable all-multi mode, 0 to disable it, -1 no change + * @bcast: 1 to enable broadcast Rx, 0 to disable it, -1 no change + * @vlanex: 1 to enable HW VLAN extraction, 0 to disable it, -1 no change + * @sleep_ok: if true we may sleep while awaiting command completion + * + * Sets Rx properties of a virtual interface. + */ +int t4_set_rxmode(struct adapter *adap, unsigned int mbox, unsigned int viid, + int mtu, int promisc, int all_multi, int bcast, int vlanex, + bool sleep_ok) +{ + struct fw_vi_rxmode_cmd c; + + /* convert to FW values */ + if (mtu < 0) + mtu = FW_RXMODE_MTU_NO_CHG; + if (promisc < 0) + promisc = FW_VI_RXMODE_CMD_PROMISCEN_MASK; + if (all_multi < 0) + all_multi = FW_VI_RXMODE_CMD_ALLMULTIEN_MASK; + if (bcast < 0) + bcast = FW_VI_RXMODE_CMD_BROADCASTEN_MASK; + if (vlanex < 0) + vlanex = FW_VI_RXMODE_CMD_VLANEXEN_MASK; + + memset(&c, 0, sizeof(c)); + c.op_to_viid = htonl(FW_CMD_OP(FW_VI_RXMODE_CMD) | FW_CMD_REQUEST | + FW_CMD_WRITE | FW_VI_RXMODE_CMD_VIID(viid)); + c.retval_len16 = htonl(FW_LEN16(c)); + c.mtu_to_vlanexen = htonl(FW_VI_RXMODE_CMD_MTU(mtu) | + FW_VI_RXMODE_CMD_PROMISCEN(promisc) | + FW_VI_RXMODE_CMD_ALLMULTIEN(all_multi) | + FW_VI_RXMODE_CMD_BROADCASTEN(bcast) | + FW_VI_RXMODE_CMD_VLANEXEN(vlanex)); + return t4_wr_mbox_meat(adap, mbox, &c, sizeof(c), NULL, sleep_ok); +} + +/** + * t4_alloc_mac_filt - allocates exact-match filters for MAC addresses + * @adap: the adapter + * @mbox: mailbox to use for the FW command + * @viid: the VI id + * @free: if true any existing filters for this VI id are first removed + * @naddr: the number of MAC addresses to allocate filters for (up to 7) + * @addr: the MAC address(es) + * @idx: where to store the index of each allocated filter + * @hash: pointer to hash address filter bitmap + * @sleep_ok: call is allowed to sleep + * + * Allocates an exact-match filter for each of the supplied addresses and + * sets it to the corresponding address. If @idx is not %NULL it should + * have at least @naddr entries, each of which will be set to the index of + * the filter allocated for the corresponding MAC address. If a filter + * could not be allocated for an address its index is set to 0xffff. + * If @hash is not %NULL addresses that fail to allocate an exact filter + * are hashed and update the hash filter bitmap pointed at by @hash. + * + * Returns a negative error number or the number of filters allocated. + */ +int t4_alloc_mac_filt(struct adapter *adap, unsigned int mbox, + unsigned int viid, bool free, unsigned int naddr, + const u8 **addr, u16 *idx, u64 *hash, bool sleep_ok) +{ + int i, ret; + struct fw_vi_mac_cmd c; + struct fw_vi_mac_exact *p; + unsigned int max_naddr = is_t4(adap->params.chip) ? + NUM_MPS_CLS_SRAM_L_INSTANCES : + NUM_MPS_T5_CLS_SRAM_L_INSTANCES; + + if (naddr > 7) + return -EINVAL; + + memset(&c, 0, sizeof(c)); + c.op_to_viid = htonl(FW_CMD_OP(FW_VI_MAC_CMD) | FW_CMD_REQUEST | + FW_CMD_WRITE | (free ? FW_CMD_EXEC : 0) | + FW_VI_MAC_CMD_VIID(viid)); + c.freemacs_to_len16 = htonl(FW_VI_MAC_CMD_FREEMACS(free) | + FW_CMD_LEN16((naddr + 2) / 2)); + + for (i = 0, p = c.u.exact; i < naddr; i++, p++) { + p->valid_to_idx = htons(FW_VI_MAC_CMD_VALID | + FW_VI_MAC_CMD_IDX(FW_VI_MAC_ADD_MAC)); + memcpy(p->macaddr, addr[i], sizeof(p->macaddr)); + } + + ret = t4_wr_mbox_meat(adap, mbox, &c, sizeof(c), &c, sleep_ok); + if (ret) + return ret; + + for (i = 0, p = c.u.exact; i < naddr; i++, p++) { + u16 index = FW_VI_MAC_CMD_IDX_GET(ntohs(p->valid_to_idx)); + + if (idx) + idx[i] = index >= max_naddr ? 0xffff : index; + if (index < max_naddr) + ret++; + else if (hash) + *hash |= (1ULL << hash_mac_addr(addr[i])); + } + return ret; +} + +/** + * t4_change_mac - modifies the exact-match filter for a MAC address + * @adap: the adapter + * @mbox: mailbox to use for the FW command + * @viid: the VI id + * @idx: index of existing filter for old value of MAC address, or -1 + * @addr: the new MAC address value + * @persist: whether a new MAC allocation should be persistent + * @add_smt: if true also add the address to the HW SMT + * + * Modifies an exact-match filter and sets it to the new MAC address. + * Note that in general it is not possible to modify the value of a given + * filter so the generic way to modify an address filter is to free the one + * being used by the old address value and allocate a new filter for the + * new address value. @idx can be -1 if the address is a new addition. + * + * Returns a negative error number or the index of the filter with the new + * MAC value. + */ +int t4_change_mac(struct adapter *adap, unsigned int mbox, unsigned int viid, + int idx, const u8 *addr, bool persist, bool add_smt) +{ + int ret, mode; + struct fw_vi_mac_cmd c; + struct fw_vi_mac_exact *p = c.u.exact; + unsigned int max_mac_addr = is_t4(adap->params.chip) ? + NUM_MPS_CLS_SRAM_L_INSTANCES : + NUM_MPS_T5_CLS_SRAM_L_INSTANCES; + + if (idx < 0) /* new allocation */ + idx = persist ? FW_VI_MAC_ADD_PERSIST_MAC : FW_VI_MAC_ADD_MAC; + mode = add_smt ? FW_VI_MAC_SMT_AND_MPSTCAM : FW_VI_MAC_MPS_TCAM_ENTRY; + + memset(&c, 0, sizeof(c)); + c.op_to_viid = htonl(FW_CMD_OP(FW_VI_MAC_CMD) | FW_CMD_REQUEST | + FW_CMD_WRITE | FW_VI_MAC_CMD_VIID(viid)); + c.freemacs_to_len16 = htonl(FW_CMD_LEN16(1)); + p->valid_to_idx = htons(FW_VI_MAC_CMD_VALID | + FW_VI_MAC_CMD_SMAC_RESULT(mode) | + FW_VI_MAC_CMD_IDX(idx)); + memcpy(p->macaddr, addr, sizeof(p->macaddr)); + + ret = t4_wr_mbox(adap, mbox, &c, sizeof(c), &c); + if (ret == 0) { + ret = FW_VI_MAC_CMD_IDX_GET(ntohs(p->valid_to_idx)); + if (ret >= max_mac_addr) + ret = -ENOMEM; + } + return ret; +} + +/** + * t4_set_addr_hash - program the MAC inexact-match hash filter + * @adap: the adapter + * @mbox: mailbox to use for the FW command + * @viid: the VI id + * @ucast: whether the hash filter should also match unicast addresses + * @vec: the value to be written to the hash filter + * @sleep_ok: call is allowed to sleep + * + * Sets the 64-bit inexact-match hash filter for a virtual interface. + */ +int t4_set_addr_hash(struct adapter *adap, unsigned int mbox, unsigned int viid, + bool ucast, u64 vec, bool sleep_ok) +{ + struct fw_vi_mac_cmd c; + + memset(&c, 0, sizeof(c)); + c.op_to_viid = htonl(FW_CMD_OP(FW_VI_MAC_CMD) | FW_CMD_REQUEST | + FW_CMD_WRITE | FW_VI_ENABLE_CMD_VIID(viid)); + c.freemacs_to_len16 = htonl(FW_VI_MAC_CMD_HASHVECEN | + FW_VI_MAC_CMD_HASHUNIEN(ucast) | + FW_CMD_LEN16(1)); + c.u.hash.hashvec = cpu_to_be64(vec); + return t4_wr_mbox_meat(adap, mbox, &c, sizeof(c), NULL, sleep_ok); +} + +/** + * t4_enable_vi_params - enable/disable a virtual interface + * @adap: the adapter + * @mbox: mailbox to use for the FW command + * @viid: the VI id + * @rx_en: 1=enable Rx, 0=disable Rx + * @tx_en: 1=enable Tx, 0=disable Tx + * @dcb_en: 1=enable delivery of Data Center Bridging messages. + * + * Enables/disables a virtual interface. Note that setting DCB Enable + * only makes sense when enabling a Virtual Interface ... + */ +int t4_enable_vi_params(struct adapter *adap, unsigned int mbox, + unsigned int viid, bool rx_en, bool tx_en, bool dcb_en) +{ + struct fw_vi_enable_cmd c; + + memset(&c, 0, sizeof(c)); + c.op_to_viid = htonl(FW_CMD_OP(FW_VI_ENABLE_CMD) | FW_CMD_REQUEST | + FW_CMD_EXEC | FW_VI_ENABLE_CMD_VIID(viid)); + + c.ien_to_len16 = htonl(FW_VI_ENABLE_CMD_IEN(rx_en) | + FW_VI_ENABLE_CMD_EEN(tx_en) | FW_LEN16(c) | + FW_VI_ENABLE_CMD_DCB_INFO(dcb_en)); + return t4_wr_mbox_ns(adap, mbox, &c, sizeof(c), NULL); +} + +/** + * t4_enable_vi - enable/disable a virtual interface + * @adap: the adapter + * @mbox: mailbox to use for the FW command + * @viid: the VI id + * @rx_en: 1=enable Rx, 0=disable Rx + * @tx_en: 1=enable Tx, 0=disable Tx + * + * Enables/disables a virtual interface. + */ +int t4_enable_vi(struct adapter *adap, unsigned int mbox, unsigned int viid, + bool rx_en, bool tx_en) +{ + return t4_enable_vi_params(adap, mbox, viid, rx_en, tx_en, 0); +} + +/** + * t4_identify_port - identify a VI's port by blinking its LED + * @adap: the adapter + * @mbox: mailbox to use for the FW command + * @viid: the VI id + * @nblinks: how many times to blink LED at 2.5 Hz + * + * Identifies a VI's port by blinking its LED. + */ +int t4_identify_port(struct adapter *adap, unsigned int mbox, unsigned int viid, + unsigned int nblinks) +{ + struct fw_vi_enable_cmd c; + + memset(&c, 0, sizeof(c)); + c.op_to_viid = htonl(FW_CMD_OP(FW_VI_ENABLE_CMD) | FW_CMD_REQUEST | + FW_CMD_EXEC | FW_VI_ENABLE_CMD_VIID(viid)); + c.ien_to_len16 = htonl(FW_VI_ENABLE_CMD_LED | FW_LEN16(c)); + c.blinkdur = htons(nblinks); + return t4_wr_mbox(adap, mbox, &c, sizeof(c), NULL); +} + +/** + * t4_iq_free - free an ingress queue and its FLs + * @adap: the adapter + * @mbox: mailbox to use for the FW command + * @pf: the PF owning the queues + * @vf: the VF owning the queues + * @iqtype: the ingress queue type + * @iqid: ingress queue id + * @fl0id: FL0 queue id or 0xffff if no attached FL0 + * @fl1id: FL1 queue id or 0xffff if no attached FL1 + * + * Frees an ingress queue and its associated FLs, if any. + */ +int t4_iq_free(struct adapter *adap, unsigned int mbox, unsigned int pf, + unsigned int vf, unsigned int iqtype, unsigned int iqid, + unsigned int fl0id, unsigned int fl1id) +{ + struct fw_iq_cmd c; + + memset(&c, 0, sizeof(c)); + c.op_to_vfn = htonl(FW_CMD_OP(FW_IQ_CMD) | FW_CMD_REQUEST | + FW_CMD_EXEC | FW_IQ_CMD_PFN(pf) | + FW_IQ_CMD_VFN(vf)); + c.alloc_to_len16 = htonl(FW_IQ_CMD_FREE | FW_LEN16(c)); + c.type_to_iqandstindex = htonl(FW_IQ_CMD_TYPE(iqtype)); + c.iqid = htons(iqid); + c.fl0id = htons(fl0id); + c.fl1id = htons(fl1id); + return t4_wr_mbox(adap, mbox, &c, sizeof(c), NULL); +} + +/** + * t4_eth_eq_free - free an Ethernet egress queue + * @adap: the adapter + * @mbox: mailbox to use for the FW command + * @pf: the PF owning the queue + * @vf: the VF owning the queue + * @eqid: egress queue id + * + * Frees an Ethernet egress queue. + */ +int t4_eth_eq_free(struct adapter *adap, unsigned int mbox, unsigned int pf, + unsigned int vf, unsigned int eqid) +{ + struct fw_eq_eth_cmd c; + + memset(&c, 0, sizeof(c)); + c.op_to_vfn = htonl(FW_CMD_OP(FW_EQ_ETH_CMD) | FW_CMD_REQUEST | + FW_CMD_EXEC | FW_EQ_ETH_CMD_PFN(pf) | + FW_EQ_ETH_CMD_VFN(vf)); + c.alloc_to_len16 = htonl(FW_EQ_ETH_CMD_FREE | FW_LEN16(c)); + c.eqid_pkd = htonl(FW_EQ_ETH_CMD_EQID(eqid)); + return t4_wr_mbox(adap, mbox, &c, sizeof(c), NULL); +} + +/** + * t4_ctrl_eq_free - free a control egress queue + * @adap: the adapter + * @mbox: mailbox to use for the FW command + * @pf: the PF owning the queue + * @vf: the VF owning the queue + * @eqid: egress queue id + * + * Frees a control egress queue. + */ +int t4_ctrl_eq_free(struct adapter *adap, unsigned int mbox, unsigned int pf, + unsigned int vf, unsigned int eqid) +{ + struct fw_eq_ctrl_cmd c; + + memset(&c, 0, sizeof(c)); + c.op_to_vfn = htonl(FW_CMD_OP(FW_EQ_CTRL_CMD) | FW_CMD_REQUEST | + FW_CMD_EXEC | FW_EQ_CTRL_CMD_PFN(pf) | + FW_EQ_CTRL_CMD_VFN(vf)); + c.alloc_to_len16 = htonl(FW_EQ_CTRL_CMD_FREE | FW_LEN16(c)); + c.cmpliqid_eqid = htonl(FW_EQ_CTRL_CMD_EQID(eqid)); + return t4_wr_mbox(adap, mbox, &c, sizeof(c), NULL); +} + +/** + * t4_ofld_eq_free - free an offload egress queue + * @adap: the adapter + * @mbox: mailbox to use for the FW command + * @pf: the PF owning the queue + * @vf: the VF owning the queue + * @eqid: egress queue id + * + * Frees a control egress queue. + */ +int t4_ofld_eq_free(struct adapter *adap, unsigned int mbox, unsigned int pf, + unsigned int vf, unsigned int eqid) +{ + struct fw_eq_ofld_cmd c; + + memset(&c, 0, sizeof(c)); + c.op_to_vfn = htonl(FW_CMD_OP(FW_EQ_OFLD_CMD) | FW_CMD_REQUEST | + FW_CMD_EXEC | FW_EQ_OFLD_CMD_PFN(pf) | + FW_EQ_OFLD_CMD_VFN(vf)); + c.alloc_to_len16 = htonl(FW_EQ_OFLD_CMD_FREE | FW_LEN16(c)); + c.eqid_pkd = htonl(FW_EQ_OFLD_CMD_EQID(eqid)); + return t4_wr_mbox(adap, mbox, &c, sizeof(c), NULL); +} + +/** + * t4_handle_fw_rpl - process a FW reply message + * @adap: the adapter + * @rpl: start of the FW message + * + * Processes a FW message, such as link state change messages. + */ +int t4_handle_fw_rpl(struct adapter *adap, const __be64 *rpl) +{ + u8 opcode = *(const u8 *)rpl; + + if (opcode == FW_PORT_CMD) { /* link/module state change message */ + int speed = 0, fc = 0; + const struct fw_port_cmd *p = (void *)rpl; + int chan = FW_PORT_CMD_PORTID_GET(ntohl(p->op_to_portid)); + int port = adap->chan_map[chan]; + struct port_info *pi = adap2pinfo(adap, port); + struct link_config *lc = &pi->link_cfg; + u32 stat = ntohl(p->u.info.lstatus_to_modtype); + int link_ok = (stat & FW_PORT_CMD_LSTATUS) != 0; + u32 mod = FW_PORT_CMD_MODTYPE_GET(stat); + + if (stat & FW_PORT_CMD_RXPAUSE) + fc |= PAUSE_RX; + if (stat & FW_PORT_CMD_TXPAUSE) + fc |= PAUSE_TX; + if (stat & FW_PORT_CMD_LSPEED(FW_PORT_CAP_SPEED_100M)) + speed = 100; + else if (stat & FW_PORT_CMD_LSPEED(FW_PORT_CAP_SPEED_1G)) + speed = 1000; + else if (stat & FW_PORT_CMD_LSPEED(FW_PORT_CAP_SPEED_10G)) + speed = 10000; + else if (stat & FW_PORT_CMD_LSPEED(FW_PORT_CAP_SPEED_40G)) + speed = 40000; + + if (link_ok != lc->link_ok || speed != lc->speed || + fc != lc->fc) { /* something changed */ + lc->link_ok = link_ok; + lc->speed = speed; + lc->fc = fc; + lc->supported = be16_to_cpu(p->u.info.pcap); + t4_os_link_changed(adap, port, link_ok); + } + if (mod != pi->mod_type) { + pi->mod_type = mod; + t4_os_portmod_changed(adap, port); + } + } + return 0; +} + +static void get_pci_mode(struct adapter *adapter, struct pci_params *p) +{ + u16 val; + + if (pci_is_pcie(adapter->pdev)) { + pcie_capability_read_word(adapter->pdev, PCI_EXP_LNKSTA, &val); + p->speed = val & PCI_EXP_LNKSTA_CLS; + p->width = (val & PCI_EXP_LNKSTA_NLW) >> 4; + } +} + +/** + * init_link_config - initialize a link's SW state + * @lc: structure holding the link state + * @caps: link capabilities + * + * Initializes the SW state maintained for each link, including the link's + * capabilities and default speed/flow-control/autonegotiation settings. + */ +static void init_link_config(struct link_config *lc, unsigned int caps) +{ + lc->supported = caps; + lc->requested_speed = 0; + lc->speed = 0; + lc->requested_fc = lc->fc = PAUSE_RX | PAUSE_TX; + if (lc->supported & FW_PORT_CAP_ANEG) { + lc->advertising = lc->supported & ADVERT_MASK; + lc->autoneg = AUTONEG_ENABLE; + lc->requested_fc |= PAUSE_AUTONEG; + } else { + lc->advertising = 0; + lc->autoneg = AUTONEG_DISABLE; + } +} + +#define CIM_PF_NOACCESS 0xeeeeeeee + +int t4_wait_dev_ready(void __iomem *regs) +{ + u32 whoami; + + whoami = readl(regs + PL_WHOAMI); + if (whoami != 0xffffffff && whoami != CIM_PF_NOACCESS) + return 0; + + msleep(500); + whoami = readl(regs + PL_WHOAMI); + return (whoami != 0xffffffff && whoami != CIM_PF_NOACCESS ? 0 : -EIO); +} + +struct flash_desc { + u32 vendor_and_model_id; + u32 size_mb; +}; + +static int get_flash_params(struct adapter *adap) +{ + /* Table for non-Numonix supported flash parts. Numonix parts are left + * to the preexisting code. All flash parts have 64KB sectors. + */ + static struct flash_desc supported_flash[] = { + { 0x150201, 4 << 20 }, /* Spansion 4MB S25FL032P */ + }; + + int ret; + u32 info; + + ret = sf1_write(adap, 1, 1, 0, SF_RD_ID); + if (!ret) + ret = sf1_read(adap, 3, 0, 1, &info); + t4_write_reg(adap, SF_OP, 0); /* unlock SF */ + if (ret) + return ret; + + for (ret = 0; ret < ARRAY_SIZE(supported_flash); ++ret) + if (supported_flash[ret].vendor_and_model_id == info) { + adap->params.sf_size = supported_flash[ret].size_mb; + adap->params.sf_nsec = + adap->params.sf_size / SF_SEC_SIZE; + return 0; + } + + if ((info & 0xff) != 0x20) /* not a Numonix flash */ + return -EINVAL; + info >>= 16; /* log2 of size */ + if (info >= 0x14 && info < 0x18) + adap->params.sf_nsec = 1 << (info - 16); + else if (info == 0x18) + adap->params.sf_nsec = 64; + else + return -EINVAL; + adap->params.sf_size = 1 << info; + adap->params.sf_fw_start = + t4_read_reg(adap, CIM_BOOT_CFG) & BOOTADDR_MASK; + + if (adap->params.sf_size < FLASH_MIN_SIZE) + dev_warn(adap->pdev_dev, "WARNING!!! FLASH size %#x < %#x!!!\n", + adap->params.sf_size, FLASH_MIN_SIZE); + return 0; +} + +/** + * t4_prep_adapter - prepare SW and HW for operation + * @adapter: the adapter + * @reset: if true perform a HW reset + * + * Initialize adapter SW state for the various HW modules, set initial + * values for some adapter tunables, take PHYs out of reset, and + * initialize the MDIO interface. + */ +int t4_prep_adapter(struct adapter *adapter) +{ + int ret, ver; + uint16_t device_id; + u32 pl_rev; + + get_pci_mode(adapter, &adapter->params.pci); + pl_rev = G_REV(t4_read_reg(adapter, PL_REV)); + + ret = get_flash_params(adapter); + if (ret < 0) { + dev_err(adapter->pdev_dev, "error %d identifying flash\n", ret); + return ret; + } + + /* Retrieve adapter's device ID + */ + pci_read_config_word(adapter->pdev, PCI_DEVICE_ID, &device_id); + ver = device_id >> 12; + adapter->params.chip = 0; + switch (ver) { + case CHELSIO_T4: + adapter->params.chip |= CHELSIO_CHIP_CODE(CHELSIO_T4, pl_rev); + break; + case CHELSIO_T5: + adapter->params.chip |= CHELSIO_CHIP_CODE(CHELSIO_T5, pl_rev); + break; + default: + dev_err(adapter->pdev_dev, "Device %d is not supported\n", + device_id); + return -EINVAL; + } + + init_cong_ctrl(adapter->params.a_wnd, adapter->params.b_wnd); + + /* + * Default port for debugging in case we can't reach FW. + */ + adapter->params.nports = 1; + adapter->params.portvec = 1; + adapter->params.vpd.cclk = 50000; + return 0; +} + +/** + * t4_init_tp_params - initialize adap->params.tp + * @adap: the adapter + * + * Initialize various fields of the adapter's TP Parameters structure. + */ +int t4_init_tp_params(struct adapter *adap) +{ + int chan; + u32 v; + + v = t4_read_reg(adap, TP_TIMER_RESOLUTION); + adap->params.tp.tre = TIMERRESOLUTION_GET(v); + adap->params.tp.dack_re = DELAYEDACKRESOLUTION_GET(v); + + /* MODQ_REQ_MAP defaults to setting queues 0-3 to chan 0-3 */ + for (chan = 0; chan < NCHAN; chan++) + adap->params.tp.tx_modq[chan] = chan; + + /* Cache the adapter's Compressed Filter Mode and global Incress + * Configuration. + */ + t4_read_indirect(adap, TP_PIO_ADDR, TP_PIO_DATA, + &adap->params.tp.vlan_pri_map, 1, + TP_VLAN_PRI_MAP); + t4_read_indirect(adap, TP_PIO_ADDR, TP_PIO_DATA, + &adap->params.tp.ingress_config, 1, + TP_INGRESS_CONFIG); + + /* Now that we have TP_VLAN_PRI_MAP cached, we can calculate the field + * shift positions of several elements of the Compressed Filter Tuple + * for this adapter which we need frequently ... + */ + adap->params.tp.vlan_shift = t4_filter_field_shift(adap, F_VLAN); + adap->params.tp.vnic_shift = t4_filter_field_shift(adap, F_VNIC_ID); + adap->params.tp.port_shift = t4_filter_field_shift(adap, F_PORT); + adap->params.tp.protocol_shift = t4_filter_field_shift(adap, + F_PROTOCOL); + + /* If TP_INGRESS_CONFIG.VNID == 0, then TP_VLAN_PRI_MAP.VNIC_ID + * represents the presense of an Outer VLAN instead of a VNIC ID. + */ + if ((adap->params.tp.ingress_config & F_VNIC) == 0) + adap->params.tp.vnic_shift = -1; + + return 0; +} + +/** + * t4_filter_field_shift - calculate filter field shift + * @adap: the adapter + * @filter_sel: the desired field (from TP_VLAN_PRI_MAP bits) + * + * Return the shift position of a filter field within the Compressed + * Filter Tuple. The filter field is specified via its selection bit + * within TP_VLAN_PRI_MAL (filter mode). E.g. F_VLAN. + */ +int t4_filter_field_shift(const struct adapter *adap, int filter_sel) +{ + unsigned int filter_mode = adap->params.tp.vlan_pri_map; + unsigned int sel; + int field_shift; + + if ((filter_mode & filter_sel) == 0) + return -1; + + for (sel = 1, field_shift = 0; sel < filter_sel; sel <<= 1) { + switch (filter_mode & sel) { + case F_FCOE: + field_shift += W_FT_FCOE; + break; + case F_PORT: + field_shift += W_FT_PORT; + break; + case F_VNIC_ID: + field_shift += W_FT_VNIC_ID; + break; + case F_VLAN: + field_shift += W_FT_VLAN; + break; + case F_TOS: + field_shift += W_FT_TOS; + break; + case F_PROTOCOL: + field_shift += W_FT_PROTOCOL; + break; + case F_ETHERTYPE: + field_shift += W_FT_ETHERTYPE; + break; + case F_MACMATCH: + field_shift += W_FT_MACMATCH; + break; + case F_MPSHITTYPE: + field_shift += W_FT_MPSHITTYPE; + break; + case F_FRAGMENTATION: + field_shift += W_FT_FRAGMENTATION; + break; + } + } + return field_shift; +} + +int t4_port_init(struct adapter *adap, int mbox, int pf, int vf) +{ + u8 addr[6]; + int ret, i, j = 0; + struct fw_port_cmd c; + struct fw_rss_vi_config_cmd rvc; + + memset(&c, 0, sizeof(c)); + memset(&rvc, 0, sizeof(rvc)); + + for_each_port(adap, i) { + unsigned int rss_size; + struct port_info *p = adap2pinfo(adap, i); + + while ((adap->params.portvec & (1 << j)) == 0) + j++; + + c.op_to_portid = htonl(FW_CMD_OP(FW_PORT_CMD) | + FW_CMD_REQUEST | FW_CMD_READ | + FW_PORT_CMD_PORTID(j)); + c.action_to_len16 = htonl( + FW_PORT_CMD_ACTION(FW_PORT_ACTION_GET_PORT_INFO) | + FW_LEN16(c)); + ret = t4_wr_mbox(adap, mbox, &c, sizeof(c), &c); + if (ret) + return ret; + + ret = t4_alloc_vi(adap, mbox, j, pf, vf, 1, addr, &rss_size); + if (ret < 0) + return ret; + + p->viid = ret; + p->tx_chan = j; + p->lport = j; + p->rss_size = rss_size; + memcpy(adap->port[i]->dev_addr, addr, ETH_ALEN); + adap->port[i]->dev_port = j; + + ret = ntohl(c.u.info.lstatus_to_modtype); + p->mdio_addr = (ret & FW_PORT_CMD_MDIOCAP) ? + FW_PORT_CMD_MDIOADDR_GET(ret) : -1; + p->port_type = FW_PORT_CMD_PTYPE_GET(ret); + p->mod_type = FW_PORT_MOD_TYPE_NA; + + rvc.op_to_viid = htonl(FW_CMD_OP(FW_RSS_VI_CONFIG_CMD) | + FW_CMD_REQUEST | FW_CMD_READ | + FW_RSS_VI_CONFIG_CMD_VIID(p->viid)); + rvc.retval_len16 = htonl(FW_LEN16(rvc)); + ret = t4_wr_mbox(adap, mbox, &rvc, sizeof(rvc), &rvc); + if (ret) + return ret; + p->rss_mode = ntohl(rvc.u.basicvirtual.defaultq_to_udpen); + + init_link_config(&p->link_cfg, ntohs(c.u.info.pcap)); + j++; + } + return 0; +} diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.h b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.h new file mode 100644 index 0000000..c19a90e --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.h @@ -0,0 +1,227 @@ +/* + * This file is part of the Chelsio T4 Ethernet driver for Linux. + * + * Copyright (c) 2003-2014 Chelsio Communications, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __T4_HW_H +#define __T4_HW_H + +#include + +enum { + NCHAN = 4, /* # of HW channels */ + MAX_MTU = 9600, /* max MAC MTU, excluding header + FCS */ + EEPROMSIZE = 17408, /* Serial EEPROM physical size */ + EEPROMVSIZE = 32768, /* Serial EEPROM virtual address space size */ + EEPROMPFSIZE = 1024, /* EEPROM writable area size for PFn, n>0 */ + RSS_NENTRIES = 2048, /* # of entries in RSS mapping table */ + TCB_SIZE = 128, /* TCB size */ + NMTUS = 16, /* size of MTU table */ + NCCTRL_WIN = 32, /* # of congestion control windows */ + L2T_SIZE = 4096, /* # of L2T entries */ + MBOX_LEN = 64, /* mailbox size in bytes */ + TRACE_LEN = 112, /* length of trace data and mask */ + FILTER_OPT_LEN = 36, /* filter tuple width for optional components */ + NWOL_PAT = 8, /* # of WoL patterns */ + WOL_PAT_LEN = 128, /* length of WoL patterns */ +}; + +enum { + SF_PAGE_SIZE = 256, /* serial flash page size */ + SF_SEC_SIZE = 64 * 1024, /* serial flash sector size */ +}; + +enum { RSP_TYPE_FLBUF, RSP_TYPE_CPL, RSP_TYPE_INTR }; /* response entry types */ + +enum { MBOX_OWNER_NONE, MBOX_OWNER_FW, MBOX_OWNER_DRV }; /* mailbox owners */ + +enum { + SGE_MAX_WR_LEN = 512, /* max WR size in bytes */ + SGE_NTIMERS = 6, /* # of interrupt holdoff timer values */ + SGE_NCOUNTERS = 4, /* # of interrupt packet counter values */ + SGE_MAX_IQ_SIZE = 65520, + + SGE_TIMER_RSTRT_CNTR = 6, /* restart RX packet threshold counter */ + SGE_TIMER_UPD_CIDX = 7, /* update cidx only */ + + SGE_EQ_IDXSIZE = 64, /* egress queue pidx/cidx unit size */ + + SGE_INTRDST_PCI = 0, /* interrupt destination is PCI-E */ + SGE_INTRDST_IQ = 1, /* destination is an ingress queue */ + + SGE_UPDATEDEL_NONE = 0, /* ingress queue pidx update delivery */ + SGE_UPDATEDEL_INTR = 1, /* interrupt */ + SGE_UPDATEDEL_STPG = 2, /* status page */ + SGE_UPDATEDEL_BOTH = 3, /* interrupt and status page */ + + SGE_HOSTFCMODE_NONE = 0, /* egress queue cidx updates */ + SGE_HOSTFCMODE_IQ = 1, /* sent to ingress queue */ + SGE_HOSTFCMODE_STPG = 2, /* sent to status page */ + SGE_HOSTFCMODE_BOTH = 3, /* ingress queue and status page */ + + SGE_FETCHBURSTMIN_16B = 0,/* egress queue descriptor fetch minimum */ + SGE_FETCHBURSTMIN_32B = 1, + SGE_FETCHBURSTMIN_64B = 2, + SGE_FETCHBURSTMIN_128B = 3, + + SGE_FETCHBURSTMAX_64B = 0,/* egress queue descriptor fetch maximum */ + SGE_FETCHBURSTMAX_128B = 1, + SGE_FETCHBURSTMAX_256B = 2, + SGE_FETCHBURSTMAX_512B = 3, + + SGE_CIDXFLUSHTHRESH_1 = 0,/* egress queue cidx flush threshold */ + SGE_CIDXFLUSHTHRESH_2 = 1, + SGE_CIDXFLUSHTHRESH_4 = 2, + SGE_CIDXFLUSHTHRESH_8 = 3, + SGE_CIDXFLUSHTHRESH_16 = 4, + SGE_CIDXFLUSHTHRESH_32 = 5, + SGE_CIDXFLUSHTHRESH_64 = 6, + SGE_CIDXFLUSHTHRESH_128 = 7, + + SGE_INGPADBOUNDARY_SHIFT = 5,/* ingress queue pad boundary */ +}; + +struct sge_qstat { /* data written to SGE queue status entries */ + __be32 qid; + __be16 cidx; + __be16 pidx; +}; + +/* + * Structure for last 128 bits of response descriptors + */ +struct rsp_ctrl { + __be32 hdrbuflen_pidx; + __be32 pldbuflen_qid; + union { + u8 type_gen; + __be64 last_flit; + }; +}; + +#define RSPD_NEWBUF 0x80000000U +#define RSPD_LEN(x) (((x) >> 0) & 0x7fffffffU) +#define RSPD_QID(x) RSPD_LEN(x) + +#define RSPD_GEN(x) ((x) >> 7) +#define RSPD_TYPE(x) (((x) >> 4) & 3) + +#define V_QINTR_CNT_EN 0x0 +#define QINTR_CNT_EN 0x1 +#define QINTR_TIMER_IDX(x) ((x) << 1) +#define QINTR_TIMER_IDX_GET(x) (((x) >> 1) & 0x7) + +/* + * Flash layout. + */ +#define FLASH_START(start) ((start) * SF_SEC_SIZE) +#define FLASH_MAX_SIZE(nsecs) ((nsecs) * SF_SEC_SIZE) + +enum { + /* + * Various Expansion-ROM boot images, etc. + */ + FLASH_EXP_ROM_START_SEC = 0, + FLASH_EXP_ROM_NSECS = 6, + FLASH_EXP_ROM_START = FLASH_START(FLASH_EXP_ROM_START_SEC), + FLASH_EXP_ROM_MAX_SIZE = FLASH_MAX_SIZE(FLASH_EXP_ROM_NSECS), + + /* + * iSCSI Boot Firmware Table (iBFT) and other driver-related + * parameters ... + */ + FLASH_IBFT_START_SEC = 6, + FLASH_IBFT_NSECS = 1, + FLASH_IBFT_START = FLASH_START(FLASH_IBFT_START_SEC), + FLASH_IBFT_MAX_SIZE = FLASH_MAX_SIZE(FLASH_IBFT_NSECS), + + /* + * Boot configuration data. + */ + FLASH_BOOTCFG_START_SEC = 7, + FLASH_BOOTCFG_NSECS = 1, + FLASH_BOOTCFG_START = FLASH_START(FLASH_BOOTCFG_START_SEC), + FLASH_BOOTCFG_MAX_SIZE = FLASH_MAX_SIZE(FLASH_BOOTCFG_NSECS), + + /* + * Location of firmware image in FLASH. + */ + FLASH_FW_START_SEC = 8, + FLASH_FW_NSECS = 16, + FLASH_FW_START = FLASH_START(FLASH_FW_START_SEC), + FLASH_FW_MAX_SIZE = FLASH_MAX_SIZE(FLASH_FW_NSECS), + + /* + * iSCSI persistent/crash information. + */ + FLASH_ISCSI_CRASH_START_SEC = 29, + FLASH_ISCSI_CRASH_NSECS = 1, + FLASH_ISCSI_CRASH_START = FLASH_START(FLASH_ISCSI_CRASH_START_SEC), + FLASH_ISCSI_CRASH_MAX_SIZE = FLASH_MAX_SIZE(FLASH_ISCSI_CRASH_NSECS), + + /* + * FCoE persistent/crash information. + */ + FLASH_FCOE_CRASH_START_SEC = 30, + FLASH_FCOE_CRASH_NSECS = 1, + FLASH_FCOE_CRASH_START = FLASH_START(FLASH_FCOE_CRASH_START_SEC), + FLASH_FCOE_CRASH_MAX_SIZE = FLASH_MAX_SIZE(FLASH_FCOE_CRASH_NSECS), + + /* + * Location of Firmware Configuration File in FLASH. Since the FPGA + * "FLASH" is smaller we need to store the Configuration File in a + * different location -- which will overlap the end of the firmware + * image if firmware ever gets that large ... + */ + FLASH_CFG_START_SEC = 31, + FLASH_CFG_NSECS = 1, + FLASH_CFG_START = FLASH_START(FLASH_CFG_START_SEC), + FLASH_CFG_MAX_SIZE = FLASH_MAX_SIZE(FLASH_CFG_NSECS), + + /* We don't support FLASH devices which can't support the full + * standard set of sections which we need for normal + * operations. + */ + FLASH_MIN_SIZE = FLASH_CFG_START + FLASH_CFG_MAX_SIZE, + + FLASH_FPGA_CFG_START_SEC = 15, + FLASH_FPGA_CFG_START = FLASH_START(FLASH_FPGA_CFG_START_SEC), + + /* + * Sectors 32-63 are reserved for FLASH failover. + */ +}; + +#undef FLASH_START +#undef FLASH_MAX_SIZE + +#endif /* __T4_HW_H */ diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_msg.h b/drivers/net/ethernet/chelsio/cxgb4/t4_msg.h new file mode 100644 index 0000000..5f4db23 --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb4/t4_msg.h @@ -0,0 +1,841 @@ +/* + * This file is part of the Chelsio T4 Ethernet driver for Linux. + * + * Copyright (c) 2003-2014 Chelsio Communications, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __T4_MSG_H +#define __T4_MSG_H + +#include + +enum { + CPL_PASS_OPEN_REQ = 0x1, + CPL_PASS_ACCEPT_RPL = 0x2, + CPL_ACT_OPEN_REQ = 0x3, + CPL_SET_TCB_FIELD = 0x5, + CPL_GET_TCB = 0x6, + CPL_CLOSE_CON_REQ = 0x8, + CPL_CLOSE_LISTSRV_REQ = 0x9, + CPL_ABORT_REQ = 0xA, + CPL_ABORT_RPL = 0xB, + CPL_RX_DATA_ACK = 0xD, + CPL_TX_PKT = 0xE, + CPL_L2T_WRITE_REQ = 0x12, + CPL_TID_RELEASE = 0x1A, + + CPL_CLOSE_LISTSRV_RPL = 0x20, + CPL_L2T_WRITE_RPL = 0x23, + CPL_PASS_OPEN_RPL = 0x24, + CPL_ACT_OPEN_RPL = 0x25, + CPL_PEER_CLOSE = 0x26, + CPL_ABORT_REQ_RSS = 0x2B, + CPL_ABORT_RPL_RSS = 0x2D, + + CPL_CLOSE_CON_RPL = 0x32, + CPL_ISCSI_HDR = 0x33, + CPL_RDMA_CQE = 0x35, + CPL_RDMA_CQE_READ_RSP = 0x36, + CPL_RDMA_CQE_ERR = 0x37, + CPL_RX_DATA = 0x39, + CPL_SET_TCB_RPL = 0x3A, + CPL_RX_PKT = 0x3B, + CPL_RX_DDP_COMPLETE = 0x3F, + + CPL_ACT_ESTABLISH = 0x40, + CPL_PASS_ESTABLISH = 0x41, + CPL_RX_DATA_DDP = 0x42, + CPL_PASS_ACCEPT_REQ = 0x44, + CPL_TRACE_PKT_T5 = 0x48, + CPL_RX_ISCSI_DDP = 0x49, + + CPL_RDMA_READ_REQ = 0x60, + + CPL_PASS_OPEN_REQ6 = 0x81, + CPL_ACT_OPEN_REQ6 = 0x83, + + CPL_RDMA_TERMINATE = 0xA2, + CPL_RDMA_WRITE = 0xA4, + CPL_SGE_EGR_UPDATE = 0xA5, + + CPL_TRACE_PKT = 0xB0, + CPL_ISCSI_DATA = 0xB2, + + CPL_FW4_MSG = 0xC0, + CPL_FW4_PLD = 0xC1, + CPL_FW4_ACK = 0xC3, + + CPL_FW6_MSG = 0xE0, + CPL_FW6_PLD = 0xE1, + CPL_TX_PKT_LSO = 0xED, + CPL_TX_PKT_XT = 0xEE, + + NUM_CPL_CMDS +}; + +enum CPL_error { + CPL_ERR_NONE = 0, + CPL_ERR_TCAM_FULL = 3, + CPL_ERR_BAD_LENGTH = 15, + CPL_ERR_BAD_ROUTE = 18, + CPL_ERR_CONN_RESET = 20, + CPL_ERR_CONN_EXIST_SYNRECV = 21, + CPL_ERR_CONN_EXIST = 22, + CPL_ERR_ARP_MISS = 23, + CPL_ERR_BAD_SYN = 24, + CPL_ERR_CONN_TIMEDOUT = 30, + CPL_ERR_XMIT_TIMEDOUT = 31, + CPL_ERR_PERSIST_TIMEDOUT = 32, + CPL_ERR_FINWAIT2_TIMEDOUT = 33, + CPL_ERR_KEEPALIVE_TIMEDOUT = 34, + CPL_ERR_RTX_NEG_ADVICE = 35, + CPL_ERR_PERSIST_NEG_ADVICE = 36, + CPL_ERR_KEEPALV_NEG_ADVICE = 37, + CPL_ERR_ABORT_FAILED = 42, + CPL_ERR_IWARP_FLM = 50, +}; + +enum { + ULP_MODE_NONE = 0, + ULP_MODE_ISCSI = 2, + ULP_MODE_RDMA = 4, + ULP_MODE_TCPDDP = 5, + ULP_MODE_FCOE = 6, +}; + +enum { + ULP_CRC_HEADER = 1 << 0, + ULP_CRC_DATA = 1 << 1 +}; + +enum { + CPL_ABORT_SEND_RST = 0, + CPL_ABORT_NO_RST, +}; + +enum { /* TX_PKT_XT checksum types */ + TX_CSUM_TCP = 0, + TX_CSUM_UDP = 1, + TX_CSUM_CRC16 = 4, + TX_CSUM_CRC32 = 5, + TX_CSUM_CRC32C = 6, + TX_CSUM_FCOE = 7, + TX_CSUM_TCPIP = 8, + TX_CSUM_UDPIP = 9, + TX_CSUM_TCPIP6 = 10, + TX_CSUM_UDPIP6 = 11, + TX_CSUM_IP = 12, +}; + +union opcode_tid { + __be32 opcode_tid; + u8 opcode; +}; + +#define CPL_OPCODE(x) ((x) << 24) +#define G_CPL_OPCODE(x) (((x) >> 24) & 0xFF) +#define MK_OPCODE_TID(opcode, tid) (CPL_OPCODE(opcode) | (tid)) +#define OPCODE_TID(cmd) ((cmd)->ot.opcode_tid) +#define GET_TID(cmd) (ntohl(OPCODE_TID(cmd)) & 0xFFFFFF) + +/* partitioning of TID fields that also carry a queue id */ +#define GET_TID_TID(x) ((x) & 0x3fff) +#define GET_TID_QID(x) (((x) >> 14) & 0x3ff) +#define TID_QID(x) ((x) << 14) + +struct rss_header { + u8 opcode; +#if defined(__LITTLE_ENDIAN_BITFIELD) + u8 channel:2; + u8 filter_hit:1; + u8 filter_tid:1; + u8 hash_type:2; + u8 ipv6:1; + u8 send2fw:1; +#else + u8 send2fw:1; + u8 ipv6:1; + u8 hash_type:2; + u8 filter_tid:1; + u8 filter_hit:1; + u8 channel:2; +#endif + __be16 qid; + __be32 hash_val; +}; + +struct work_request_hdr { + __be32 wr_hi; + __be32 wr_mid; + __be64 wr_lo; +}; + +/* wr_hi fields */ +#define S_WR_OP 24 +#define V_WR_OP(x) ((__u64)(x) << S_WR_OP) + +#define WR_HDR struct work_request_hdr wr + +/* option 0 fields */ +#define S_MSS_IDX 60 +#define M_MSS_IDX 0xF +#define V_MSS_IDX(x) ((__u64)(x) << S_MSS_IDX) +#define G_MSS_IDX(x) (((x) >> S_MSS_IDX) & M_MSS_IDX) + +/* option 2 fields */ +#define S_RSS_QUEUE 0 +#define M_RSS_QUEUE 0x3FF +#define V_RSS_QUEUE(x) ((x) << S_RSS_QUEUE) +#define G_RSS_QUEUE(x) (((x) >> S_RSS_QUEUE) & M_RSS_QUEUE) + +struct cpl_pass_open_req { + WR_HDR; + union opcode_tid ot; + __be16 local_port; + __be16 peer_port; + __be32 local_ip; + __be32 peer_ip; + __be64 opt0; +#define TX_CHAN(x) ((x) << 2) +#define NO_CONG(x) ((x) << 4) +#define DELACK(x) ((x) << 5) +#define ULP_MODE(x) ((x) << 8) +#define RCV_BUFSIZ(x) ((x) << 12) +#define RCV_BUFSIZ_MASK 0x3FFU +#define DSCP(x) ((x) << 22) +#define SMAC_SEL(x) ((u64)(x) << 28) +#define L2T_IDX(x) ((u64)(x) << 36) +#define TCAM_BYPASS(x) ((u64)(x) << 48) +#define NAGLE(x) ((u64)(x) << 49) +#define WND_SCALE(x) ((u64)(x) << 50) +#define KEEP_ALIVE(x) ((u64)(x) << 54) +#define MSS_IDX(x) ((u64)(x) << 60) + __be64 opt1; +#define SYN_RSS_ENABLE (1 << 0) +#define SYN_RSS_QUEUE(x) ((x) << 2) +#define CONN_POLICY_ASK (1 << 22) +}; + +struct cpl_pass_open_req6 { + WR_HDR; + union opcode_tid ot; + __be16 local_port; + __be16 peer_port; + __be64 local_ip_hi; + __be64 local_ip_lo; + __be64 peer_ip_hi; + __be64 peer_ip_lo; + __be64 opt0; + __be64 opt1; +}; + +struct cpl_pass_open_rpl { + union opcode_tid ot; + u8 rsvd[3]; + u8 status; +}; + +struct cpl_pass_accept_rpl { + WR_HDR; + union opcode_tid ot; + __be32 opt2; +#define RSS_QUEUE(x) ((x) << 0) +#define RSS_QUEUE_VALID (1 << 10) +#define RX_COALESCE_VALID(x) ((x) << 11) +#define RX_COALESCE(x) ((x) << 12) +#define PACE(x) ((x) << 16) +#define RX_FC_VALID ((1U) << 19) +#define RX_FC_DISABLE ((1U) << 20) +#define TX_QUEUE(x) ((x) << 23) +#define RX_CHANNEL(x) ((x) << 26) +#define CCTRL_ECN(x) ((x) << 27) +#define WND_SCALE_EN(x) ((x) << 28) +#define TSTAMPS_EN(x) ((x) << 29) +#define SACK_EN(x) ((x) << 30) +#define T5_OPT_2_VALID ((1U) << 31) + __be64 opt0; +}; + +struct cpl_t5_pass_accept_rpl { + WR_HDR; + union opcode_tid ot; + __be32 opt2; + __be64 opt0; + __be32 iss; + __be32 rsvd; +}; + +struct cpl_act_open_req { + WR_HDR; + union opcode_tid ot; + __be16 local_port; + __be16 peer_port; + __be32 local_ip; + __be32 peer_ip; + __be64 opt0; + __be32 params; + __be32 opt2; +}; + +#define S_FILTER_TUPLE 24 +#define M_FILTER_TUPLE 0xFFFFFFFFFF +#define V_FILTER_TUPLE(x) ((x) << S_FILTER_TUPLE) +#define G_FILTER_TUPLE(x) (((x) >> S_FILTER_TUPLE) & M_FILTER_TUPLE) +struct cpl_t5_act_open_req { + WR_HDR; + union opcode_tid ot; + __be16 local_port; + __be16 peer_port; + __be32 local_ip; + __be32 peer_ip; + __be64 opt0; + __be32 rsvd; + __be32 opt2; + __be64 params; +}; + +struct cpl_act_open_req6 { + WR_HDR; + union opcode_tid ot; + __be16 local_port; + __be16 peer_port; + __be64 local_ip_hi; + __be64 local_ip_lo; + __be64 peer_ip_hi; + __be64 peer_ip_lo; + __be64 opt0; + __be32 params; + __be32 opt2; +}; + +struct cpl_t5_act_open_req6 { + WR_HDR; + union opcode_tid ot; + __be16 local_port; + __be16 peer_port; + __be64 local_ip_hi; + __be64 local_ip_lo; + __be64 peer_ip_hi; + __be64 peer_ip_lo; + __be64 opt0; + __be32 rsvd; + __be32 opt2; + __be64 params; +}; + +struct cpl_act_open_rpl { + union opcode_tid ot; + __be32 atid_status; +#define GET_AOPEN_STATUS(x) ((x) & 0xff) +#define GET_AOPEN_ATID(x) (((x) >> 8) & 0xffffff) +}; + +struct cpl_pass_establish { + union opcode_tid ot; + __be32 rsvd; + __be32 tos_stid; +#define PASS_OPEN_TID(x) ((x) << 0) +#define PASS_OPEN_TOS(x) ((x) << 24) +#define GET_PASS_OPEN_TID(x) (((x) >> 0) & 0xFFFFFF) +#define GET_POPEN_TID(x) ((x) & 0xffffff) +#define GET_POPEN_TOS(x) (((x) >> 24) & 0xff) + __be16 mac_idx; + __be16 tcp_opt; +#define GET_TCPOPT_WSCALE_OK(x) (((x) >> 5) & 1) +#define GET_TCPOPT_SACK(x) (((x) >> 6) & 1) +#define GET_TCPOPT_TSTAMP(x) (((x) >> 7) & 1) +#define GET_TCPOPT_SND_WSCALE(x) (((x) >> 8) & 0xf) +#define GET_TCPOPT_MSS(x) (((x) >> 12) & 0xf) + __be32 snd_isn; + __be32 rcv_isn; +}; + +struct cpl_act_establish { + union opcode_tid ot; + __be32 rsvd; + __be32 tos_atid; + __be16 mac_idx; + __be16 tcp_opt; + __be32 snd_isn; + __be32 rcv_isn; +}; + +struct cpl_get_tcb { + WR_HDR; + union opcode_tid ot; + __be16 reply_ctrl; +#define QUEUENO(x) ((x) << 0) +#define REPLY_CHAN(x) ((x) << 14) +#define NO_REPLY(x) ((x) << 15) + __be16 cookie; +}; + +struct cpl_set_tcb_field { + WR_HDR; + union opcode_tid ot; + __be16 reply_ctrl; + __be16 word_cookie; +#define TCB_WORD(x) ((x) << 0) +#define TCB_COOKIE(x) ((x) << 5) +#define GET_TCB_COOKIE(x) (((x) >> 5) & 7) + __be64 mask; + __be64 val; +}; + +struct cpl_set_tcb_rpl { + union opcode_tid ot; + __be16 rsvd; + u8 cookie; + u8 status; + __be64 oldval; +}; + +struct cpl_close_con_req { + WR_HDR; + union opcode_tid ot; + __be32 rsvd; +}; + +struct cpl_close_con_rpl { + union opcode_tid ot; + u8 rsvd[3]; + u8 status; + __be32 snd_nxt; + __be32 rcv_nxt; +}; + +struct cpl_close_listsvr_req { + WR_HDR; + union opcode_tid ot; + __be16 reply_ctrl; +#define LISTSVR_IPV6(x) ((x) << 14) + __be16 rsvd; +}; + +struct cpl_close_listsvr_rpl { + union opcode_tid ot; + u8 rsvd[3]; + u8 status; +}; + +struct cpl_abort_req_rss { + union opcode_tid ot; + u8 rsvd[3]; + u8 status; +}; + +struct cpl_abort_req { + WR_HDR; + union opcode_tid ot; + __be32 rsvd0; + u8 rsvd1; + u8 cmd; + u8 rsvd2[6]; +}; + +struct cpl_abort_rpl_rss { + union opcode_tid ot; + u8 rsvd[3]; + u8 status; +}; + +struct cpl_abort_rpl { + WR_HDR; + union opcode_tid ot; + __be32 rsvd0; + u8 rsvd1; + u8 cmd; + u8 rsvd2[6]; +}; + +struct cpl_peer_close { + union opcode_tid ot; + __be32 rcv_nxt; +}; + +struct cpl_tid_release { + WR_HDR; + union opcode_tid ot; + __be32 rsvd; +}; + +struct cpl_tx_pkt_core { + __be32 ctrl0; +#define TXPKT_VF(x) ((x) << 0) +#define TXPKT_PF(x) ((x) << 8) +#define TXPKT_VF_VLD (1 << 11) +#define TXPKT_OVLAN_IDX(x) ((x) << 12) +#define TXPKT_INTF(x) ((x) << 16) +#define TXPKT_INS_OVLAN (1 << 21) +#define TXPKT_OPCODE(x) ((x) << 24) + __be16 pack; + __be16 len; + __be64 ctrl1; +#define TXPKT_CSUM_END(x) ((x) << 12) +#define TXPKT_CSUM_START(x) ((x) << 20) +#define TXPKT_IPHDR_LEN(x) ((u64)(x) << 20) +#define TXPKT_CSUM_LOC(x) ((u64)(x) << 30) +#define TXPKT_ETHHDR_LEN(x) ((u64)(x) << 34) +#define TXPKT_CSUM_TYPE(x) ((u64)(x) << 40) +#define TXPKT_VLAN(x) ((u64)(x) << 44) +#define TXPKT_VLAN_VLD (1ULL << 60) +#define TXPKT_IPCSUM_DIS (1ULL << 62) +#define TXPKT_L4CSUM_DIS (1ULL << 63) +}; + +struct cpl_tx_pkt { + WR_HDR; + struct cpl_tx_pkt_core c; +}; + +#define cpl_tx_pkt_xt cpl_tx_pkt + +struct cpl_tx_pkt_lso_core { + __be32 lso_ctrl; +#define LSO_TCPHDR_LEN(x) ((x) << 0) +#define LSO_IPHDR_LEN(x) ((x) << 4) +#define LSO_ETHHDR_LEN(x) ((x) << 16) +#define LSO_IPV6(x) ((x) << 20) +#define LSO_LAST_SLICE (1 << 22) +#define LSO_FIRST_SLICE (1 << 23) +#define LSO_OPCODE(x) ((x) << 24) +#define LSO_T5_XFER_SIZE(x) ((x) << 0) + __be16 ipid_ofst; + __be16 mss; + __be32 seqno_offset; + __be32 len; + /* encapsulated CPL (TX_PKT, TX_PKT_XT or TX_DATA) follows here */ +}; + +struct cpl_tx_pkt_lso { + WR_HDR; + struct cpl_tx_pkt_lso_core c; + /* encapsulated CPL (TX_PKT, TX_PKT_XT or TX_DATA) follows here */ +}; + +struct cpl_iscsi_hdr { + union opcode_tid ot; + __be16 pdu_len_ddp; +#define ISCSI_PDU_LEN(x) ((x) & 0x7FFF) +#define ISCSI_DDP (1 << 15) + __be16 len; + __be32 seq; + __be16 urg; + u8 rsvd; + u8 status; +}; + +struct cpl_rx_data { + union opcode_tid ot; + __be16 rsvd; + __be16 len; + __be32 seq; + __be16 urg; +#if defined(__LITTLE_ENDIAN_BITFIELD) + u8 dack_mode:2; + u8 psh:1; + u8 heartbeat:1; + u8 ddp_off:1; + u8 :3; +#else + u8 :3; + u8 ddp_off:1; + u8 heartbeat:1; + u8 psh:1; + u8 dack_mode:2; +#endif + u8 status; +}; + +struct cpl_rx_data_ack { + WR_HDR; + union opcode_tid ot; + __be32 credit_dack; +#define RX_CREDITS(x) ((x) << 0) +#define RX_FORCE_ACK(x) ((x) << 28) +}; + +struct cpl_rx_pkt { + struct rss_header rsshdr; + u8 opcode; +#if defined(__LITTLE_ENDIAN_BITFIELD) + u8 iff:4; + u8 csum_calc:1; + u8 ipmi_pkt:1; + u8 vlan_ex:1; + u8 ip_frag:1; +#else + u8 ip_frag:1; + u8 vlan_ex:1; + u8 ipmi_pkt:1; + u8 csum_calc:1; + u8 iff:4; +#endif + __be16 csum; + __be16 vlan; + __be16 len; + __be32 l2info; +#define RXF_UDP (1 << 22) +#define RXF_TCP (1 << 23) +#define RXF_IP (1 << 24) +#define RXF_IP6 (1 << 25) + __be16 hdr_len; + __be16 err_vec; +}; + +/* rx_pkt.l2info fields */ +#define S_RX_ETHHDR_LEN 0 +#define M_RX_ETHHDR_LEN 0x1F +#define V_RX_ETHHDR_LEN(x) ((x) << S_RX_ETHHDR_LEN) +#define G_RX_ETHHDR_LEN(x) (((x) >> S_RX_ETHHDR_LEN) & M_RX_ETHHDR_LEN) + +#define S_RX_T5_ETHHDR_LEN 0 +#define M_RX_T5_ETHHDR_LEN 0x3F +#define V_RX_T5_ETHHDR_LEN(x) ((x) << S_RX_T5_ETHHDR_LEN) +#define G_RX_T5_ETHHDR_LEN(x) (((x) >> S_RX_T5_ETHHDR_LEN) & M_RX_T5_ETHHDR_LEN) + +#define S_RX_MACIDX 8 +#define M_RX_MACIDX 0x1FF +#define V_RX_MACIDX(x) ((x) << S_RX_MACIDX) +#define G_RX_MACIDX(x) (((x) >> S_RX_MACIDX) & M_RX_MACIDX) + +#define S_RXF_SYN 21 +#define V_RXF_SYN(x) ((x) << S_RXF_SYN) +#define F_RXF_SYN V_RXF_SYN(1U) + +#define S_RX_CHAN 28 +#define M_RX_CHAN 0xF +#define V_RX_CHAN(x) ((x) << S_RX_CHAN) +#define G_RX_CHAN(x) (((x) >> S_RX_CHAN) & M_RX_CHAN) + +/* rx_pkt.hdr_len fields */ +#define S_RX_TCPHDR_LEN 0 +#define M_RX_TCPHDR_LEN 0x3F +#define V_RX_TCPHDR_LEN(x) ((x) << S_RX_TCPHDR_LEN) +#define G_RX_TCPHDR_LEN(x) (((x) >> S_RX_TCPHDR_LEN) & M_RX_TCPHDR_LEN) + +#define S_RX_IPHDR_LEN 6 +#define M_RX_IPHDR_LEN 0x3FF +#define V_RX_IPHDR_LEN(x) ((x) << S_RX_IPHDR_LEN) +#define G_RX_IPHDR_LEN(x) (((x) >> S_RX_IPHDR_LEN) & M_RX_IPHDR_LEN) + +struct cpl_trace_pkt { + u8 opcode; + u8 intf; +#if defined(__LITTLE_ENDIAN_BITFIELD) + u8 runt:4; + u8 filter_hit:4; + u8 :6; + u8 err:1; + u8 trunc:1; +#else + u8 filter_hit:4; + u8 runt:4; + u8 trunc:1; + u8 err:1; + u8 :6; +#endif + __be16 rsvd; + __be16 len; + __be64 tstamp; +}; + +struct cpl_t5_trace_pkt { + __u8 opcode; + __u8 intf; +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u8 runt:4; + __u8 filter_hit:4; + __u8:6; + __u8 err:1; + __u8 trunc:1; +#else + __u8 filter_hit:4; + __u8 runt:4; + __u8 trunc:1; + __u8 err:1; + __u8:6; +#endif + __be16 rsvd; + __be16 len; + __be64 tstamp; + __be64 rsvd1; +}; + +struct cpl_l2t_write_req { + WR_HDR; + union opcode_tid ot; + __be16 params; +#define L2T_W_INFO(x) ((x) << 2) +#define L2T_W_PORT(x) ((x) << 8) +#define L2T_W_NOREPLY(x) ((x) << 15) + __be16 l2t_idx; + __be16 vlan; + u8 dst_mac[6]; +}; + +struct cpl_l2t_write_rpl { + union opcode_tid ot; + u8 status; + u8 rsvd[3]; +}; + +struct cpl_rdma_terminate { + union opcode_tid ot; + __be16 rsvd; + __be16 len; +}; + +struct cpl_sge_egr_update { + __be32 opcode_qid; +#define EGR_QID(x) ((x) & 0x1FFFF) + __be16 cidx; + __be16 pidx; +}; + +/* cpl_fw*.type values */ +enum { + FW_TYPE_CMD_RPL = 0, + FW_TYPE_WR_RPL = 1, + FW_TYPE_CQE = 2, + FW_TYPE_OFLD_CONNECTION_WR_RPL = 3, + FW_TYPE_RSSCPL = 4, +}; + +struct cpl_fw4_pld { + u8 opcode; + u8 rsvd0[3]; + u8 type; + u8 rsvd1; + __be16 len; + __be64 data; + __be64 rsvd2; +}; + +struct cpl_fw6_pld { + u8 opcode; + u8 rsvd[5]; + __be16 len; + __be64 data[4]; +}; + +struct cpl_fw4_msg { + u8 opcode; + u8 type; + __be16 rsvd0; + __be32 rsvd1; + __be64 data[2]; +}; + +struct cpl_fw4_ack { + union opcode_tid ot; + u8 credits; + u8 rsvd0[2]; + u8 seq_vld; + __be32 snd_nxt; + __be32 snd_una; + __be64 rsvd1; +}; + +struct cpl_fw6_msg { + u8 opcode; + u8 type; + __be16 rsvd0; + __be32 rsvd1; + __be64 data[4]; +}; + +/* cpl_fw6_msg.type values */ +enum { + FW6_TYPE_CMD_RPL = 0, + FW6_TYPE_WR_RPL = 1, + FW6_TYPE_CQE = 2, + FW6_TYPE_OFLD_CONNECTION_WR_RPL = 3, + FW6_TYPE_RSSCPL = FW_TYPE_RSSCPL, +}; + +struct cpl_fw6_msg_ofld_connection_wr_rpl { + __u64 cookie; + __be32 tid; /* or atid in case of active failure */ + __u8 t_state; + __u8 retval; + __u8 rsvd[2]; +}; + +enum { + ULP_TX_MEM_READ = 2, + ULP_TX_MEM_WRITE = 3, + ULP_TX_PKT = 4 +}; + +enum { + ULP_TX_SC_NOOP = 0x80, + ULP_TX_SC_IMM = 0x81, + ULP_TX_SC_DSGL = 0x82, + ULP_TX_SC_ISGL = 0x83 +}; + +struct ulptx_sge_pair { + __be32 len[2]; + __be64 addr[2]; +}; + +struct ulptx_sgl { + __be32 cmd_nsge; +#define ULPTX_CMD(x) ((x) << 24) +#define ULPTX_NSGE(x) ((x) << 0) +#define ULPTX_MORE (1U << 23) + __be32 len0; + __be64 addr0; + struct ulptx_sge_pair sge[0]; +}; + +struct ulp_mem_io { + WR_HDR; + __be32 cmd; +#define ULP_MEMIO_ORDER(x) ((x) << 23) + __be32 len16; /* command length */ + __be32 dlen; /* data length in 32-byte units */ +#define ULP_MEMIO_DATA_LEN(x) ((x) << 0) + __be32 lock_addr; +#define ULP_MEMIO_ADDR(x) ((x) << 0) +#define ULP_MEMIO_LOCK(x) ((x) << 31) +}; + +#define S_T5_ULP_MEMIO_IMM 23 +#define V_T5_ULP_MEMIO_IMM(x) ((x) << S_T5_ULP_MEMIO_IMM) +#define F_T5_ULP_MEMIO_IMM V_T5_ULP_MEMIO_IMM(1U) + +#define S_T5_ULP_MEMIO_ORDER 22 +#define V_T5_ULP_MEMIO_ORDER(x) ((x) << S_T5_ULP_MEMIO_ORDER) +#define F_T5_ULP_MEMIO_ORDER V_T5_ULP_MEMIO_ORDER(1U) + +#endif /* __T4_MSG_H */ diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_regs.h b/drivers/net/ethernet/chelsio/cxgb4/t4_regs.h new file mode 100644 index 0000000..8d2de10 --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb4/t4_regs.h @@ -0,0 +1,1344 @@ +/* + * This file is part of the Chelsio T4 Ethernet driver for Linux. + * + * Copyright (c) 2003-2014 Chelsio Communications, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __T4_REGS_H +#define __T4_REGS_H + +#define MYPF_BASE 0x1b000 +#define MYPF_REG(reg_addr) (MYPF_BASE + (reg_addr)) + +#define PF0_BASE 0x1e000 +#define PF0_REG(reg_addr) (PF0_BASE + (reg_addr)) + +#define PF_STRIDE 0x400 +#define PF_BASE(idx) (PF0_BASE + (idx) * PF_STRIDE) +#define PF_REG(idx, reg) (PF_BASE(idx) + (reg)) + +#define MYPORT_BASE 0x1c000 +#define MYPORT_REG(reg_addr) (MYPORT_BASE + (reg_addr)) + +#define PORT0_BASE 0x20000 +#define PORT0_REG(reg_addr) (PORT0_BASE + (reg_addr)) + +#define PORT_STRIDE 0x2000 +#define PORT_BASE(idx) (PORT0_BASE + (idx) * PORT_STRIDE) +#define PORT_REG(idx, reg) (PORT_BASE(idx) + (reg)) + +#define EDC_STRIDE (EDC_1_BASE_ADDR - EDC_0_BASE_ADDR) +#define EDC_REG(reg, idx) (reg + EDC_STRIDE * idx) + +#define PCIE_MEM_ACCESS_REG(reg_addr, idx) ((reg_addr) + (idx) * 8) +#define PCIE_MAILBOX_REG(reg_addr, idx) ((reg_addr) + (idx) * 8) +#define MC_BIST_STATUS_REG(reg_addr, idx) ((reg_addr) + (idx) * 4) +#define EDC_BIST_STATUS_REG(reg_addr, idx) ((reg_addr) + (idx) * 4) + +#define SGE_PF_KDOORBELL 0x0 +#define QID_MASK 0xffff8000U +#define QID_SHIFT 15 +#define QID(x) ((x) << QID_SHIFT) +#define DBPRIO(x) ((x) << 14) +#define DBTYPE(x) ((x) << 13) +#define PIDX_MASK 0x00003fffU +#define PIDX_SHIFT 0 +#define PIDX(x) ((x) << PIDX_SHIFT) +#define PIDX_SHIFT_T5 0 +#define PIDX_T5(x) ((x) << PIDX_SHIFT_T5) + + +#define SGE_TIMERREGS 6 +#define SGE_PF_GTS 0x4 +#define INGRESSQID_MASK 0xffff0000U +#define INGRESSQID_SHIFT 16 +#define INGRESSQID(x) ((x) << INGRESSQID_SHIFT) +#define TIMERREG_MASK 0x0000e000U +#define TIMERREG_SHIFT 13 +#define TIMERREG(x) ((x) << TIMERREG_SHIFT) +#define SEINTARM_MASK 0x00001000U +#define SEINTARM_SHIFT 12 +#define SEINTARM(x) ((x) << SEINTARM_SHIFT) +#define CIDXINC_MASK 0x00000fffU +#define CIDXINC_SHIFT 0 +#define CIDXINC(x) ((x) << CIDXINC_SHIFT) + +#define X_RXPKTCPLMODE_SPLIT 1 +#define X_INGPADBOUNDARY_SHIFT 5 + +#define SGE_CONTROL 0x1008 +#define SGE_CONTROL2_A 0x1124 +#define DCASYSTYPE 0x00080000U +#define RXPKTCPLMODE_MASK 0x00040000U +#define RXPKTCPLMODE_SHIFT 18 +#define RXPKTCPLMODE(x) ((x) << RXPKTCPLMODE_SHIFT) +#define EGRSTATUSPAGESIZE_MASK 0x00020000U +#define EGRSTATUSPAGESIZE_SHIFT 17 +#define EGRSTATUSPAGESIZE(x) ((x) << EGRSTATUSPAGESIZE_SHIFT) +#define PKTSHIFT_MASK 0x00001c00U +#define PKTSHIFT_SHIFT 10 +#define PKTSHIFT(x) ((x) << PKTSHIFT_SHIFT) +#define PKTSHIFT_GET(x) (((x) & PKTSHIFT_MASK) >> PKTSHIFT_SHIFT) +#define INGPCIEBOUNDARY_32B_X 0 +#define INGPCIEBOUNDARY_MASK 0x00000380U +#define INGPCIEBOUNDARY_SHIFT 7 +#define INGPCIEBOUNDARY(x) ((x) << INGPCIEBOUNDARY_SHIFT) +#define INGPADBOUNDARY_MASK 0x00000070U +#define INGPADBOUNDARY_SHIFT 4 +#define INGPADBOUNDARY(x) ((x) << INGPADBOUNDARY_SHIFT) +#define INGPADBOUNDARY_GET(x) (((x) & INGPADBOUNDARY_MASK) \ + >> INGPADBOUNDARY_SHIFT) +#define INGPACKBOUNDARY_16B_X 0 +#define INGPACKBOUNDARY_SHIFT_X 5 + +#define INGPACKBOUNDARY_S 16 +#define INGPACKBOUNDARY_M 0x7U +#define INGPACKBOUNDARY_V(x) ((x) << INGPACKBOUNDARY_S) +#define INGPACKBOUNDARY_G(x) (((x) >> INGPACKBOUNDARY_S) \ + & INGPACKBOUNDARY_M) +#define EGRPCIEBOUNDARY_MASK 0x0000000eU +#define EGRPCIEBOUNDARY_SHIFT 1 +#define EGRPCIEBOUNDARY(x) ((x) << EGRPCIEBOUNDARY_SHIFT) +#define GLOBALENABLE 0x00000001U + +#define SGE_HOST_PAGE_SIZE 0x100c + +#define HOSTPAGESIZEPF7_MASK 0x0000000fU +#define HOSTPAGESIZEPF7_SHIFT 28 +#define HOSTPAGESIZEPF7(x) ((x) << HOSTPAGESIZEPF7_SHIFT) + +#define HOSTPAGESIZEPF6_MASK 0x0000000fU +#define HOSTPAGESIZEPF6_SHIFT 24 +#define HOSTPAGESIZEPF6(x) ((x) << HOSTPAGESIZEPF6_SHIFT) + +#define HOSTPAGESIZEPF5_MASK 0x0000000fU +#define HOSTPAGESIZEPF5_SHIFT 20 +#define HOSTPAGESIZEPF5(x) ((x) << HOSTPAGESIZEPF5_SHIFT) + +#define HOSTPAGESIZEPF4_MASK 0x0000000fU +#define HOSTPAGESIZEPF4_SHIFT 16 +#define HOSTPAGESIZEPF4(x) ((x) << HOSTPAGESIZEPF4_SHIFT) + +#define HOSTPAGESIZEPF3_MASK 0x0000000fU +#define HOSTPAGESIZEPF3_SHIFT 12 +#define HOSTPAGESIZEPF3(x) ((x) << HOSTPAGESIZEPF3_SHIFT) + +#define HOSTPAGESIZEPF2_MASK 0x0000000fU +#define HOSTPAGESIZEPF2_SHIFT 8 +#define HOSTPAGESIZEPF2(x) ((x) << HOSTPAGESIZEPF2_SHIFT) + +#define HOSTPAGESIZEPF1_MASK 0x0000000fU +#define HOSTPAGESIZEPF1_SHIFT 4 +#define HOSTPAGESIZEPF1(x) ((x) << HOSTPAGESIZEPF1_SHIFT) + +#define HOSTPAGESIZEPF0_MASK 0x0000000fU +#define HOSTPAGESIZEPF0_SHIFT 0 +#define HOSTPAGESIZEPF0(x) ((x) << HOSTPAGESIZEPF0_SHIFT) + +#define SGE_EGRESS_QUEUES_PER_PAGE_PF 0x1010 +#define QUEUESPERPAGEPF0_MASK 0x0000000fU +#define QUEUESPERPAGEPF0_GET(x) ((x) & QUEUESPERPAGEPF0_MASK) + +#define QUEUESPERPAGEPF0 0 +#define QUEUESPERPAGEPF1 4 + +/* T5 and later support a new BAR2-based doorbell mechanism for Egress Queues. + * The User Doorbells are each 128 bytes in length with a Simple Doorbell at + * offsets 8x and a Write Combining single 64-byte Egress Queue Unit + * (X_IDXSIZE_UNIT) Gather Buffer interface at offset 64. For Ingress Queues, + * we have a Going To Sleep register at offsets 8x+4. + * + * As noted above, we have many instances of the Simple Doorbell and Going To + * Sleep registers at offsets 8x and 8x+4, respectively. We want to use a + * non-64-byte aligned offset for the Simple Doorbell in order to attempt to + * avoid buffering of the writes to the Simple Doorbell and we want to use a + * non-contiguous offset for the Going To Sleep writes in order to avoid + * possible combining between them. + */ +#define SGE_UDB_SIZE 128 +#define SGE_UDB_KDOORBELL 8 +#define SGE_UDB_GTS 20 +#define SGE_UDB_WCDOORBELL 64 + +#define SGE_INT_CAUSE1 0x1024 +#define SGE_INT_CAUSE2 0x1030 +#define SGE_INT_CAUSE3 0x103c +#define ERR_FLM_DBP 0x80000000U +#define ERR_FLM_IDMA1 0x40000000U +#define ERR_FLM_IDMA0 0x20000000U +#define ERR_FLM_HINT 0x10000000U +#define ERR_PCIE_ERROR3 0x08000000U +#define ERR_PCIE_ERROR2 0x04000000U +#define ERR_PCIE_ERROR1 0x02000000U +#define ERR_PCIE_ERROR0 0x01000000U +#define ERR_TIMER_ABOVE_MAX_QID 0x00800000U +#define ERR_CPL_EXCEED_IQE_SIZE 0x00400000U +#define ERR_INVALID_CIDX_INC 0x00200000U +#define ERR_ITP_TIME_PAUSED 0x00100000U +#define ERR_CPL_OPCODE_0 0x00080000U +#define ERR_DROPPED_DB 0x00040000U +#define ERR_DATA_CPL_ON_HIGH_QID1 0x00020000U +#define ERR_DATA_CPL_ON_HIGH_QID0 0x00010000U +#define ERR_BAD_DB_PIDX3 0x00008000U +#define ERR_BAD_DB_PIDX2 0x00004000U +#define ERR_BAD_DB_PIDX1 0x00002000U +#define ERR_BAD_DB_PIDX0 0x00001000U +#define ERR_ING_PCIE_CHAN 0x00000800U +#define ERR_ING_CTXT_PRIO 0x00000400U +#define ERR_EGR_CTXT_PRIO 0x00000200U +#define DBFIFO_HP_INT 0x00000100U +#define DBFIFO_LP_INT 0x00000080U +#define REG_ADDRESS_ERR 0x00000040U +#define INGRESS_SIZE_ERR 0x00000020U +#define EGRESS_SIZE_ERR 0x00000010U +#define ERR_INV_CTXT3 0x00000008U +#define ERR_INV_CTXT2 0x00000004U +#define ERR_INV_CTXT1 0x00000002U +#define ERR_INV_CTXT0 0x00000001U + +#define SGE_INT_ENABLE3 0x1040 +#define SGE_FL_BUFFER_SIZE0 0x1044 +#define SGE_FL_BUFFER_SIZE1 0x1048 +#define SGE_FL_BUFFER_SIZE2 0x104c +#define SGE_FL_BUFFER_SIZE3 0x1050 +#define SGE_FL_BUFFER_SIZE4 0x1054 +#define SGE_FL_BUFFER_SIZE5 0x1058 +#define SGE_FL_BUFFER_SIZE6 0x105c +#define SGE_FL_BUFFER_SIZE7 0x1060 +#define SGE_FL_BUFFER_SIZE8 0x1064 + +#define SGE_INGRESS_RX_THRESHOLD 0x10a0 +#define THRESHOLD_0_MASK 0x3f000000U +#define THRESHOLD_0_SHIFT 24 +#define THRESHOLD_0(x) ((x) << THRESHOLD_0_SHIFT) +#define THRESHOLD_0_GET(x) (((x) & THRESHOLD_0_MASK) >> THRESHOLD_0_SHIFT) +#define THRESHOLD_1_MASK 0x003f0000U +#define THRESHOLD_1_SHIFT 16 +#define THRESHOLD_1(x) ((x) << THRESHOLD_1_SHIFT) +#define THRESHOLD_1_GET(x) (((x) & THRESHOLD_1_MASK) >> THRESHOLD_1_SHIFT) +#define THRESHOLD_2_MASK 0x00003f00U +#define THRESHOLD_2_SHIFT 8 +#define THRESHOLD_2(x) ((x) << THRESHOLD_2_SHIFT) +#define THRESHOLD_2_GET(x) (((x) & THRESHOLD_2_MASK) >> THRESHOLD_2_SHIFT) +#define THRESHOLD_3_MASK 0x0000003fU +#define THRESHOLD_3_SHIFT 0 +#define THRESHOLD_3(x) ((x) << THRESHOLD_3_SHIFT) +#define THRESHOLD_3_GET(x) (((x) & THRESHOLD_3_MASK) >> THRESHOLD_3_SHIFT) + +#define SGE_CONM_CTRL 0x1094 +#define EGRTHRESHOLD_MASK 0x00003f00U +#define EGRTHRESHOLDshift 8 +#define EGRTHRESHOLD(x) ((x) << EGRTHRESHOLDshift) +#define EGRTHRESHOLD_GET(x) (((x) & EGRTHRESHOLD_MASK) >> EGRTHRESHOLDshift) + +#define EGRTHRESHOLDPACKING_MASK 0x3fU +#define EGRTHRESHOLDPACKING_SHIFT 14 +#define EGRTHRESHOLDPACKING(x) ((x) << EGRTHRESHOLDPACKING_SHIFT) +#define EGRTHRESHOLDPACKING_GET(x) (((x) >> EGRTHRESHOLDPACKING_SHIFT) & \ + EGRTHRESHOLDPACKING_MASK) + +#define SGE_DBFIFO_STATUS 0x10a4 +#define HP_INT_THRESH_SHIFT 28 +#define HP_INT_THRESH_MASK 0xfU +#define HP_INT_THRESH(x) ((x) << HP_INT_THRESH_SHIFT) +#define LP_INT_THRESH_SHIFT 12 +#define LP_INT_THRESH_MASK 0xfU +#define LP_INT_THRESH(x) ((x) << LP_INT_THRESH_SHIFT) + +#define SGE_DOORBELL_CONTROL 0x10a8 +#define ENABLE_DROP (1 << 13) + +#define S_NOCOALESCE 26 +#define V_NOCOALESCE(x) ((x) << S_NOCOALESCE) +#define F_NOCOALESCE V_NOCOALESCE(1U) + +#define SGE_TIMESTAMP_LO 0x1098 +#define SGE_TIMESTAMP_HI 0x109c +#define S_TSVAL 0 +#define M_TSVAL 0xfffffffU +#define GET_TSVAL(x) (((x) >> S_TSVAL) & M_TSVAL) + +#define SGE_TIMER_VALUE_0_AND_1 0x10b8 +#define TIMERVALUE0_MASK 0xffff0000U +#define TIMERVALUE0_SHIFT 16 +#define TIMERVALUE0(x) ((x) << TIMERVALUE0_SHIFT) +#define TIMERVALUE0_GET(x) (((x) & TIMERVALUE0_MASK) >> TIMERVALUE0_SHIFT) +#define TIMERVALUE1_MASK 0x0000ffffU +#define TIMERVALUE1_SHIFT 0 +#define TIMERVALUE1(x) ((x) << TIMERVALUE1_SHIFT) +#define TIMERVALUE1_GET(x) (((x) & TIMERVALUE1_MASK) >> TIMERVALUE1_SHIFT) + +#define SGE_TIMER_VALUE_2_AND_3 0x10bc +#define TIMERVALUE2_MASK 0xffff0000U +#define TIMERVALUE2_SHIFT 16 +#define TIMERVALUE2(x) ((x) << TIMERVALUE2_SHIFT) +#define TIMERVALUE2_GET(x) (((x) & TIMERVALUE2_MASK) >> TIMERVALUE2_SHIFT) +#define TIMERVALUE3_MASK 0x0000ffffU +#define TIMERVALUE3_SHIFT 0 +#define TIMERVALUE3(x) ((x) << TIMERVALUE3_SHIFT) +#define TIMERVALUE3_GET(x) (((x) & TIMERVALUE3_MASK) >> TIMERVALUE3_SHIFT) + +#define SGE_TIMER_VALUE_4_AND_5 0x10c0 +#define TIMERVALUE4_MASK 0xffff0000U +#define TIMERVALUE4_SHIFT 16 +#define TIMERVALUE4(x) ((x) << TIMERVALUE4_SHIFT) +#define TIMERVALUE4_GET(x) (((x) & TIMERVALUE4_MASK) >> TIMERVALUE4_SHIFT) +#define TIMERVALUE5_MASK 0x0000ffffU +#define TIMERVALUE5_SHIFT 0 +#define TIMERVALUE5(x) ((x) << TIMERVALUE5_SHIFT) +#define TIMERVALUE5_GET(x) (((x) & TIMERVALUE5_MASK) >> TIMERVALUE5_SHIFT) + +#define SGE_DEBUG_INDEX 0x10cc +#define SGE_DEBUG_DATA_HIGH 0x10d0 +#define SGE_DEBUG_DATA_LOW 0x10d4 +#define SGE_DEBUG_DATA_LOW_INDEX_2 0x12c8 +#define SGE_DEBUG_DATA_LOW_INDEX_3 0x12cc +#define SGE_DEBUG_DATA_HIGH_INDEX_10 0x12a8 +#define SGE_INGRESS_QUEUES_PER_PAGE_PF 0x10f4 + +#define S_HP_INT_THRESH 28 +#define M_HP_INT_THRESH 0xfU +#define V_HP_INT_THRESH(x) ((x) << S_HP_INT_THRESH) +#define S_LP_INT_THRESH_T5 18 +#define V_LP_INT_THRESH_T5(x) ((x) << S_LP_INT_THRESH_T5) +#define M_LP_COUNT_T5 0x3ffffU +#define G_LP_COUNT_T5(x) (((x) >> S_LP_COUNT) & M_LP_COUNT_T5) +#define M_HP_COUNT 0x7ffU +#define S_HP_COUNT 16 +#define G_HP_COUNT(x) (((x) >> S_HP_COUNT) & M_HP_COUNT) +#define S_LP_INT_THRESH 12 +#define M_LP_INT_THRESH 0xfU +#define M_LP_INT_THRESH_T5 0xfffU +#define V_LP_INT_THRESH(x) ((x) << S_LP_INT_THRESH) +#define M_LP_COUNT 0x7ffU +#define S_LP_COUNT 0 +#define G_LP_COUNT(x) (((x) >> S_LP_COUNT) & M_LP_COUNT) +#define A_SGE_DBFIFO_STATUS 0x10a4 + +#define SGE_STAT_TOTAL 0x10e4 +#define SGE_STAT_MATCH 0x10e8 + +#define SGE_STAT_CFG 0x10ec +#define S_STATSOURCE_T5 9 +#define STATSOURCE_T5(x) ((x) << S_STATSOURCE_T5) + +#define SGE_DBFIFO_STATUS2 0x1118 +#define M_HP_COUNT_T5 0x3ffU +#define G_HP_COUNT_T5(x) ((x) & M_HP_COUNT_T5) +#define S_HP_INT_THRESH_T5 10 +#define M_HP_INT_THRESH_T5 0xfU +#define V_HP_INT_THRESH_T5(x) ((x) << S_HP_INT_THRESH_T5) + +#define S_ENABLE_DROP 13 +#define V_ENABLE_DROP(x) ((x) << S_ENABLE_DROP) +#define F_ENABLE_DROP V_ENABLE_DROP(1U) +#define S_DROPPED_DB 0 +#define V_DROPPED_DB(x) ((x) << S_DROPPED_DB) +#define F_DROPPED_DB V_DROPPED_DB(1U) +#define A_SGE_DOORBELL_CONTROL 0x10a8 + +#define A_SGE_CTXT_CMD 0x11fc +#define A_SGE_DBQ_CTXT_BADDR 0x1084 + +#define PCIE_PF_CFG 0x40 +#define AIVEC(x) ((x) << 4) +#define AIVEC_MASK 0x3ffU + +#define PCIE_PF_CLI 0x44 +#define PCIE_INT_CAUSE 0x3004 +#define UNXSPLCPLERR 0x20000000U +#define PCIEPINT 0x10000000U +#define PCIESINT 0x08000000U +#define RPLPERR 0x04000000U +#define RXWRPERR 0x02000000U +#define RXCPLPERR 0x01000000U +#define PIOTAGPERR 0x00800000U +#define MATAGPERR 0x00400000U +#define INTXCLRPERR 0x00200000U +#define FIDPERR 0x00100000U +#define CFGSNPPERR 0x00080000U +#define HRSPPERR 0x00040000U +#define HREQPERR 0x00020000U +#define HCNTPERR 0x00010000U +#define DRSPPERR 0x00008000U +#define DREQPERR 0x00004000U +#define DCNTPERR 0x00002000U +#define CRSPPERR 0x00001000U +#define CREQPERR 0x00000800U +#define CCNTPERR 0x00000400U +#define TARTAGPERR 0x00000200U +#define PIOREQPERR 0x00000100U +#define PIOCPLPERR 0x00000080U +#define MSIXDIPERR 0x00000040U +#define MSIXDATAPERR 0x00000020U +#define MSIXADDRHPERR 0x00000010U +#define MSIXADDRLPERR 0x00000008U +#define MSIDATAPERR 0x00000004U +#define MSIADDRHPERR 0x00000002U +#define MSIADDRLPERR 0x00000001U + +#define READRSPERR 0x20000000U +#define TRGT1GRPPERR 0x10000000U +#define IPSOTPERR 0x08000000U +#define IPRXDATAGRPPERR 0x02000000U +#define IPRXHDRGRPPERR 0x01000000U +#define MAGRPPERR 0x00400000U +#define VFIDPERR 0x00200000U +#define HREQWRPERR 0x00010000U +#define DREQWRPERR 0x00002000U +#define MSTTAGQPERR 0x00000400U +#define PIOREQGRPPERR 0x00000100U +#define PIOCPLGRPPERR 0x00000080U +#define MSIXSTIPERR 0x00000004U +#define MSTTIMEOUTPERR 0x00000002U +#define MSTGRPPERR 0x00000001U + +#define PCIE_NONFAT_ERR 0x3010 +#define PCIE_CFG_SPACE_REQ 0x3060 +#define PCIE_CFG_SPACE_DATA 0x3064 +#define PCIE_MEM_ACCESS_BASE_WIN 0x3068 +#define S_PCIEOFST 10 +#define M_PCIEOFST 0x3fffffU +#define GET_PCIEOFST(x) (((x) >> S_PCIEOFST) & M_PCIEOFST) +#define PCIEOFST_MASK 0xfffffc00U +#define BIR_MASK 0x00000300U +#define BIR_SHIFT 8 +#define BIR(x) ((x) << BIR_SHIFT) +#define WINDOW_MASK 0x000000ffU +#define WINDOW_SHIFT 0 +#define WINDOW(x) ((x) << WINDOW_SHIFT) +#define GET_WINDOW(x) (((x) >> WINDOW_SHIFT) & WINDOW_MASK) +#define PCIE_MEM_ACCESS_OFFSET 0x306c +#define ENABLE (1U << 30) +#define FUNCTION(x) ((x) << 12) +#define F_LOCALCFG (1U << 28) + +#define S_PFNUM 0 +#define V_PFNUM(x) ((x) << S_PFNUM) + +#define PCIE_FW 0x30b8 +#define PCIE_FW_ERR 0x80000000U +#define PCIE_FW_INIT 0x40000000U +#define PCIE_FW_HALT 0x20000000U +#define PCIE_FW_MASTER_VLD 0x00008000U +#define PCIE_FW_MASTER(x) ((x) << 12) +#define PCIE_FW_MASTER_MASK 0x7 +#define PCIE_FW_MASTER_GET(x) (((x) >> 12) & PCIE_FW_MASTER_MASK) + +#define PCIE_CORE_UTL_SYSTEM_BUS_AGENT_STATUS 0x5908 +#define RNPP 0x80000000U +#define RPCP 0x20000000U +#define RCIP 0x08000000U +#define RCCP 0x04000000U +#define RFTP 0x00800000U +#define PTRP 0x00100000U + +#define PCIE_CORE_UTL_PCI_EXPRESS_PORT_STATUS 0x59a4 +#define TPCP 0x40000000U +#define TNPP 0x20000000U +#define TFTP 0x10000000U +#define TCAP 0x08000000U +#define TCIP 0x04000000U +#define RCAP 0x02000000U +#define PLUP 0x00800000U +#define PLDN 0x00400000U +#define OTDD 0x00200000U +#define GTRP 0x00100000U +#define RDPE 0x00040000U +#define TDCE 0x00020000U +#define TDUE 0x00010000U + +#define MC_INT_CAUSE 0x7518 +#define MC_P_INT_CAUSE 0x41318 +#define ECC_UE_INT_CAUSE 0x00000004U +#define ECC_CE_INT_CAUSE 0x00000002U +#define PERR_INT_CAUSE 0x00000001U + +#define MC_ECC_STATUS 0x751c +#define MC_P_ECC_STATUS 0x4131c +#define ECC_CECNT_MASK 0xffff0000U +#define ECC_CECNT_SHIFT 16 +#define ECC_CECNT(x) ((x) << ECC_CECNT_SHIFT) +#define ECC_CECNT_GET(x) (((x) & ECC_CECNT_MASK) >> ECC_CECNT_SHIFT) +#define ECC_UECNT_MASK 0x0000ffffU +#define ECC_UECNT_SHIFT 0 +#define ECC_UECNT(x) ((x) << ECC_UECNT_SHIFT) +#define ECC_UECNT_GET(x) (((x) & ECC_UECNT_MASK) >> ECC_UECNT_SHIFT) + +#define MC_BIST_CMD 0x7600 +#define START_BIST 0x80000000U +#define BIST_CMD_GAP_MASK 0x0000ff00U +#define BIST_CMD_GAP_SHIFT 8 +#define BIST_CMD_GAP(x) ((x) << BIST_CMD_GAP_SHIFT) +#define BIST_OPCODE_MASK 0x00000003U +#define BIST_OPCODE_SHIFT 0 +#define BIST_OPCODE(x) ((x) << BIST_OPCODE_SHIFT) + +#define MC_BIST_CMD_ADDR 0x7604 +#define MC_BIST_CMD_LEN 0x7608 +#define MC_BIST_DATA_PATTERN 0x760c +#define BIST_DATA_TYPE_MASK 0x0000000fU +#define BIST_DATA_TYPE_SHIFT 0 +#define BIST_DATA_TYPE(x) ((x) << BIST_DATA_TYPE_SHIFT) + +#define MC_BIST_STATUS_RDATA 0x7688 + +#define MA_EDRAM0_BAR 0x77c0 +#define MA_EDRAM1_BAR 0x77c4 +#define EDRAM_SIZE_MASK 0xfffU +#define EDRAM_SIZE_GET(x) ((x) & EDRAM_SIZE_MASK) + +#define MA_EXT_MEMORY_BAR 0x77c8 +#define EXT_MEM_SIZE_MASK 0x00000fffU +#define EXT_MEM_SIZE_SHIFT 0 +#define EXT_MEM_SIZE_GET(x) (((x) & EXT_MEM_SIZE_MASK) >> EXT_MEM_SIZE_SHIFT) + +#define MA_TARGET_MEM_ENABLE 0x77d8 +#define EXT_MEM1_ENABLE 0x00000010U +#define EXT_MEM_ENABLE 0x00000004U +#define EDRAM1_ENABLE 0x00000002U +#define EDRAM0_ENABLE 0x00000001U + +#define MA_INT_CAUSE 0x77e0 +#define MEM_PERR_INT_CAUSE 0x00000002U +#define MEM_WRAP_INT_CAUSE 0x00000001U + +#define MA_INT_WRAP_STATUS 0x77e4 +#define MEM_WRAP_ADDRESS_MASK 0xfffffff0U +#define MEM_WRAP_ADDRESS_SHIFT 4 +#define MEM_WRAP_ADDRESS_GET(x) (((x) & MEM_WRAP_ADDRESS_MASK) >> MEM_WRAP_ADDRESS_SHIFT) +#define MEM_WRAP_CLIENT_NUM_MASK 0x0000000fU +#define MEM_WRAP_CLIENT_NUM_SHIFT 0 +#define MEM_WRAP_CLIENT_NUM_GET(x) (((x) & MEM_WRAP_CLIENT_NUM_MASK) >> MEM_WRAP_CLIENT_NUM_SHIFT) +#define MA_PCIE_FW 0x30b8 +#define MA_PARITY_ERROR_STATUS 0x77f4 +#define MA_PARITY_ERROR_STATUS2 0x7804 + +#define MA_EXT_MEMORY1_BAR 0x7808 +#define EDC_0_BASE_ADDR 0x7900 + +#define EDC_BIST_CMD 0x7904 +#define EDC_BIST_CMD_ADDR 0x7908 +#define EDC_BIST_CMD_LEN 0x790c +#define EDC_BIST_DATA_PATTERN 0x7910 +#define EDC_BIST_STATUS_RDATA 0x7928 +#define EDC_INT_CAUSE 0x7978 +#define ECC_UE_PAR 0x00000020U +#define ECC_CE_PAR 0x00000010U +#define PERR_PAR_CAUSE 0x00000008U + +#define EDC_ECC_STATUS 0x797c + +#define EDC_1_BASE_ADDR 0x7980 + +#define CIM_BOOT_CFG 0x7b00 +#define BOOTADDR_MASK 0xffffff00U +#define UPCRST 0x1U + +#define CIM_PF_MAILBOX_DATA 0x240 +#define CIM_PF_MAILBOX_CTRL 0x280 +#define MBMSGVALID 0x00000008U +#define MBINTREQ 0x00000004U +#define MBOWNER_MASK 0x00000003U +#define MBOWNER_SHIFT 0 +#define MBOWNER(x) ((x) << MBOWNER_SHIFT) +#define MBOWNER_GET(x) (((x) & MBOWNER_MASK) >> MBOWNER_SHIFT) + +#define CIM_PF_HOST_INT_ENABLE 0x288 +#define MBMSGRDYINTEN(x) ((x) << 19) + +#define CIM_PF_HOST_INT_CAUSE 0x28c +#define MBMSGRDYINT 0x00080000U + +#define CIM_HOST_INT_CAUSE 0x7b2c +#define TIEQOUTPARERRINT 0x00100000U +#define TIEQINPARERRINT 0x00080000U +#define MBHOSTPARERR 0x00040000U +#define MBUPPARERR 0x00020000U +#define IBQPARERR 0x0001f800U +#define IBQTP0PARERR 0x00010000U +#define IBQTP1PARERR 0x00008000U +#define IBQULPPARERR 0x00004000U +#define IBQSGELOPARERR 0x00002000U +#define IBQSGEHIPARERR 0x00001000U +#define IBQNCSIPARERR 0x00000800U +#define OBQPARERR 0x000007e0U +#define OBQULP0PARERR 0x00000400U +#define OBQULP1PARERR 0x00000200U +#define OBQULP2PARERR 0x00000100U +#define OBQULP3PARERR 0x00000080U +#define OBQSGEPARERR 0x00000040U +#define OBQNCSIPARERR 0x00000020U +#define PREFDROPINT 0x00000002U +#define UPACCNONZERO 0x00000001U + +#define CIM_HOST_UPACC_INT_CAUSE 0x7b34 +#define EEPROMWRINT 0x40000000U +#define TIMEOUTMAINT 0x20000000U +#define TIMEOUTINT 0x10000000U +#define RSPOVRLOOKUPINT 0x08000000U +#define REQOVRLOOKUPINT 0x04000000U +#define BLKWRPLINT 0x02000000U +#define BLKRDPLINT 0x01000000U +#define SGLWRPLINT 0x00800000U +#define SGLRDPLINT 0x00400000U +#define BLKWRCTLINT 0x00200000U +#define BLKRDCTLINT 0x00100000U +#define SGLWRCTLINT 0x00080000U +#define SGLRDCTLINT 0x00040000U +#define BLKWREEPROMINT 0x00020000U +#define BLKRDEEPROMINT 0x00010000U +#define SGLWREEPROMINT 0x00008000U +#define SGLRDEEPROMINT 0x00004000U +#define BLKWRFLASHINT 0x00002000U +#define BLKRDFLASHINT 0x00001000U +#define SGLWRFLASHINT 0x00000800U +#define SGLRDFLASHINT 0x00000400U +#define BLKWRBOOTINT 0x00000200U +#define BLKRDBOOTINT 0x00000100U +#define SGLWRBOOTINT 0x00000080U +#define SGLRDBOOTINT 0x00000040U +#define ILLWRBEINT 0x00000020U +#define ILLRDBEINT 0x00000010U +#define ILLRDINT 0x00000008U +#define ILLWRINT 0x00000004U +#define ILLTRANSINT 0x00000002U +#define RSVDSPACEINT 0x00000001U + +#define TP_OUT_CONFIG 0x7d04 +#define VLANEXTENABLE_MASK 0x0000f000U +#define VLANEXTENABLE_SHIFT 12 + +#define TP_GLOBAL_CONFIG 0x7d08 +#define FIVETUPLELOOKUP_SHIFT 17 +#define FIVETUPLELOOKUP_MASK 0x00060000U +#define FIVETUPLELOOKUP(x) ((x) << FIVETUPLELOOKUP_SHIFT) +#define FIVETUPLELOOKUP_GET(x) (((x) & FIVETUPLELOOKUP_MASK) >> \ + FIVETUPLELOOKUP_SHIFT) + +#define TP_PARA_REG2 0x7d68 +#define MAXRXDATA_MASK 0xffff0000U +#define MAXRXDATA_SHIFT 16 +#define MAXRXDATA_GET(x) (((x) & MAXRXDATA_MASK) >> MAXRXDATA_SHIFT) + +#define TP_TIMER_RESOLUTION 0x7d90 +#define TIMERRESOLUTION_MASK 0x00ff0000U +#define TIMERRESOLUTION_SHIFT 16 +#define TIMERRESOLUTION_GET(x) (((x) & TIMERRESOLUTION_MASK) >> TIMERRESOLUTION_SHIFT) +#define DELAYEDACKRESOLUTION_MASK 0x000000ffU +#define DELAYEDACKRESOLUTION_SHIFT 0 +#define DELAYEDACKRESOLUTION_GET(x) \ + (((x) & DELAYEDACKRESOLUTION_MASK) >> DELAYEDACKRESOLUTION_SHIFT) + +#define TP_SHIFT_CNT 0x7dc0 +#define SYNSHIFTMAX_SHIFT 24 +#define SYNSHIFTMAX_MASK 0xff000000U +#define SYNSHIFTMAX(x) ((x) << SYNSHIFTMAX_SHIFT) +#define SYNSHIFTMAX_GET(x) (((x) & SYNSHIFTMAX_MASK) >> \ + SYNSHIFTMAX_SHIFT) +#define RXTSHIFTMAXR1_SHIFT 20 +#define RXTSHIFTMAXR1_MASK 0x00f00000U +#define RXTSHIFTMAXR1(x) ((x) << RXTSHIFTMAXR1_SHIFT) +#define RXTSHIFTMAXR1_GET(x) (((x) & RXTSHIFTMAXR1_MASK) >> \ + RXTSHIFTMAXR1_SHIFT) +#define RXTSHIFTMAXR2_SHIFT 16 +#define RXTSHIFTMAXR2_MASK 0x000f0000U +#define RXTSHIFTMAXR2(x) ((x) << RXTSHIFTMAXR2_SHIFT) +#define RXTSHIFTMAXR2_GET(x) (((x) & RXTSHIFTMAXR2_MASK) >> \ + RXTSHIFTMAXR2_SHIFT) +#define PERSHIFTBACKOFFMAX_SHIFT 12 +#define PERSHIFTBACKOFFMAX_MASK 0x0000f000U +#define PERSHIFTBACKOFFMAX(x) ((x) << PERSHIFTBACKOFFMAX_SHIFT) +#define PERSHIFTBACKOFFMAX_GET(x) (((x) & PERSHIFTBACKOFFMAX_MASK) >> \ + PERSHIFTBACKOFFMAX_SHIFT) +#define PERSHIFTMAX_SHIFT 8 +#define PERSHIFTMAX_MASK 0x00000f00U +#define PERSHIFTMAX(x) ((x) << PERSHIFTMAX_SHIFT) +#define PERSHIFTMAX_GET(x) (((x) & PERSHIFTMAX_MASK) >> \ + PERSHIFTMAX_SHIFT) +#define KEEPALIVEMAXR1_SHIFT 4 +#define KEEPALIVEMAXR1_MASK 0x000000f0U +#define KEEPALIVEMAXR1(x) ((x) << KEEPALIVEMAXR1_SHIFT) +#define KEEPALIVEMAXR1_GET(x) (((x) & KEEPALIVEMAXR1_MASK) >> \ + KEEPALIVEMAXR1_SHIFT) +#define KEEPALIVEMAXR2_SHIFT 0 +#define KEEPALIVEMAXR2_MASK 0x0000000fU +#define KEEPALIVEMAXR2(x) ((x) << KEEPALIVEMAXR2_SHIFT) +#define KEEPALIVEMAXR2_GET(x) (((x) & KEEPALIVEMAXR2_MASK) >> \ + KEEPALIVEMAXR2_SHIFT) + +#define TP_CCTRL_TABLE 0x7ddc +#define TP_MTU_TABLE 0x7de4 +#define MTUINDEX_MASK 0xff000000U +#define MTUINDEX_SHIFT 24 +#define MTUINDEX(x) ((x) << MTUINDEX_SHIFT) +#define MTUWIDTH_MASK 0x000f0000U +#define MTUWIDTH_SHIFT 16 +#define MTUWIDTH(x) ((x) << MTUWIDTH_SHIFT) +#define MTUWIDTH_GET(x) (((x) & MTUWIDTH_MASK) >> MTUWIDTH_SHIFT) +#define MTUVALUE_MASK 0x00003fffU +#define MTUVALUE_SHIFT 0 +#define MTUVALUE(x) ((x) << MTUVALUE_SHIFT) +#define MTUVALUE_GET(x) (((x) & MTUVALUE_MASK) >> MTUVALUE_SHIFT) + +#define TP_RSS_LKP_TABLE 0x7dec +#define LKPTBLROWVLD 0x80000000U +#define LKPTBLQUEUE1_MASK 0x000ffc00U +#define LKPTBLQUEUE1_SHIFT 10 +#define LKPTBLQUEUE1(x) ((x) << LKPTBLQUEUE1_SHIFT) +#define LKPTBLQUEUE1_GET(x) (((x) & LKPTBLQUEUE1_MASK) >> LKPTBLQUEUE1_SHIFT) +#define LKPTBLQUEUE0_MASK 0x000003ffU +#define LKPTBLQUEUE0_SHIFT 0 +#define LKPTBLQUEUE0(x) ((x) << LKPTBLQUEUE0_SHIFT) +#define LKPTBLQUEUE0_GET(x) (((x) & LKPTBLQUEUE0_MASK) >> LKPTBLQUEUE0_SHIFT) + +#define TP_PIO_ADDR 0x7e40 +#define TP_PIO_DATA 0x7e44 +#define TP_MIB_INDEX 0x7e50 +#define TP_MIB_DATA 0x7e54 +#define TP_INT_CAUSE 0x7e74 +#define FLMTXFLSTEMPTY 0x40000000U + +#define TP_VLAN_PRI_MAP 0x140 +#define FRAGMENTATION_SHIFT 9 +#define FRAGMENTATION_MASK 0x00000200U +#define MPSHITTYPE_MASK 0x00000100U +#define MACMATCH_MASK 0x00000080U +#define ETHERTYPE_MASK 0x00000040U +#define PROTOCOL_MASK 0x00000020U +#define TOS_MASK 0x00000010U +#define VLAN_MASK 0x00000008U +#define VNIC_ID_MASK 0x00000004U +#define PORT_MASK 0x00000002U +#define FCOE_SHIFT 0 +#define FCOE_MASK 0x00000001U + +#define TP_INGRESS_CONFIG 0x141 +#define VNIC 0x00000800U +#define CSUM_HAS_PSEUDO_HDR 0x00000400U +#define RM_OVLAN 0x00000200U +#define LOOKUPEVERYPKT 0x00000100U + +#define TP_MIB_MAC_IN_ERR_0 0x0 +#define TP_MIB_TCP_OUT_RST 0xc +#define TP_MIB_TCP_IN_SEG_HI 0x10 +#define TP_MIB_TCP_IN_SEG_LO 0x11 +#define TP_MIB_TCP_OUT_SEG_HI 0x12 +#define TP_MIB_TCP_OUT_SEG_LO 0x13 +#define TP_MIB_TCP_RXT_SEG_HI 0x14 +#define TP_MIB_TCP_RXT_SEG_LO 0x15 +#define TP_MIB_TNL_CNG_DROP_0 0x18 +#define TP_MIB_TCP_V6IN_ERR_0 0x28 +#define TP_MIB_TCP_V6OUT_RST 0x2c +#define TP_MIB_OFD_ARP_DROP 0x36 +#define TP_MIB_TNL_DROP_0 0x44 +#define TP_MIB_OFD_VLN_DROP_0 0x58 + +#define ULP_TX_INT_CAUSE 0x8dcc +#define PBL_BOUND_ERR_CH3 0x80000000U +#define PBL_BOUND_ERR_CH2 0x40000000U +#define PBL_BOUND_ERR_CH1 0x20000000U +#define PBL_BOUND_ERR_CH0 0x10000000U + +#define PM_RX_INT_CAUSE 0x8fdc +#define ZERO_E_CMD_ERROR 0x00400000U +#define PMRX_FRAMING_ERROR 0x003ffff0U +#define OCSPI_PAR_ERROR 0x00000008U +#define DB_OPTIONS_PAR_ERROR 0x00000004U +#define IESPI_PAR_ERROR 0x00000002U +#define E_PCMD_PAR_ERROR 0x00000001U + +#define PM_TX_INT_CAUSE 0x8ffc +#define PCMD_LEN_OVFL0 0x80000000U +#define PCMD_LEN_OVFL1 0x40000000U +#define PCMD_LEN_OVFL2 0x20000000U +#define ZERO_C_CMD_ERROR 0x10000000U +#define PMTX_FRAMING_ERROR 0x0ffffff0U +#define OESPI_PAR_ERROR 0x00000008U +#define ICSPI_PAR_ERROR 0x00000002U +#define C_PCMD_PAR_ERROR 0x00000001U + +#define MPS_PORT_STAT_TX_PORT_BYTES_L 0x400 +#define MPS_PORT_STAT_TX_PORT_BYTES_H 0x404 +#define MPS_PORT_STAT_TX_PORT_FRAMES_L 0x408 +#define MPS_PORT_STAT_TX_PORT_FRAMES_H 0x40c +#define MPS_PORT_STAT_TX_PORT_BCAST_L 0x410 +#define MPS_PORT_STAT_TX_PORT_BCAST_H 0x414 +#define MPS_PORT_STAT_TX_PORT_MCAST_L 0x418 +#define MPS_PORT_STAT_TX_PORT_MCAST_H 0x41c +#define MPS_PORT_STAT_TX_PORT_UCAST_L 0x420 +#define MPS_PORT_STAT_TX_PORT_UCAST_H 0x424 +#define MPS_PORT_STAT_TX_PORT_ERROR_L 0x428 +#define MPS_PORT_STAT_TX_PORT_ERROR_H 0x42c +#define MPS_PORT_STAT_TX_PORT_64B_L 0x430 +#define MPS_PORT_STAT_TX_PORT_64B_H 0x434 +#define MPS_PORT_STAT_TX_PORT_65B_127B_L 0x438 +#define MPS_PORT_STAT_TX_PORT_65B_127B_H 0x43c +#define MPS_PORT_STAT_TX_PORT_128B_255B_L 0x440 +#define MPS_PORT_STAT_TX_PORT_128B_255B_H 0x444 +#define MPS_PORT_STAT_TX_PORT_256B_511B_L 0x448 +#define MPS_PORT_STAT_TX_PORT_256B_511B_H 0x44c +#define MPS_PORT_STAT_TX_PORT_512B_1023B_L 0x450 +#define MPS_PORT_STAT_TX_PORT_512B_1023B_H 0x454 +#define MPS_PORT_STAT_TX_PORT_1024B_1518B_L 0x458 +#define MPS_PORT_STAT_TX_PORT_1024B_1518B_H 0x45c +#define MPS_PORT_STAT_TX_PORT_1519B_MAX_L 0x460 +#define MPS_PORT_STAT_TX_PORT_1519B_MAX_H 0x464 +#define MPS_PORT_STAT_TX_PORT_DROP_L 0x468 +#define MPS_PORT_STAT_TX_PORT_DROP_H 0x46c +#define MPS_PORT_STAT_TX_PORT_PAUSE_L 0x470 +#define MPS_PORT_STAT_TX_PORT_PAUSE_H 0x474 +#define MPS_PORT_STAT_TX_PORT_PPP0_L 0x478 +#define MPS_PORT_STAT_TX_PORT_PPP0_H 0x47c +#define MPS_PORT_STAT_TX_PORT_PPP1_L 0x480 +#define MPS_PORT_STAT_TX_PORT_PPP1_H 0x484 +#define MPS_PORT_STAT_TX_PORT_PPP2_L 0x488 +#define MPS_PORT_STAT_TX_PORT_PPP2_H 0x48c +#define MPS_PORT_STAT_TX_PORT_PPP3_L 0x490 +#define MPS_PORT_STAT_TX_PORT_PPP3_H 0x494 +#define MPS_PORT_STAT_TX_PORT_PPP4_L 0x498 +#define MPS_PORT_STAT_TX_PORT_PPP4_H 0x49c +#define MPS_PORT_STAT_TX_PORT_PPP5_L 0x4a0 +#define MPS_PORT_STAT_TX_PORT_PPP5_H 0x4a4 +#define MPS_PORT_STAT_TX_PORT_PPP6_L 0x4a8 +#define MPS_PORT_STAT_TX_PORT_PPP6_H 0x4ac +#define MPS_PORT_STAT_TX_PORT_PPP7_L 0x4b0 +#define MPS_PORT_STAT_TX_PORT_PPP7_H 0x4b4 +#define MPS_PORT_STAT_LB_PORT_BYTES_L 0x4c0 +#define MPS_PORT_STAT_LB_PORT_BYTES_H 0x4c4 +#define MPS_PORT_STAT_LB_PORT_FRAMES_L 0x4c8 +#define MPS_PORT_STAT_LB_PORT_FRAMES_H 0x4cc +#define MPS_PORT_STAT_LB_PORT_BCAST_L 0x4d0 +#define MPS_PORT_STAT_LB_PORT_BCAST_H 0x4d4 +#define MPS_PORT_STAT_LB_PORT_MCAST_L 0x4d8 +#define MPS_PORT_STAT_LB_PORT_MCAST_H 0x4dc +#define MPS_PORT_STAT_LB_PORT_UCAST_L 0x4e0 +#define MPS_PORT_STAT_LB_PORT_UCAST_H 0x4e4 +#define MPS_PORT_STAT_LB_PORT_ERROR_L 0x4e8 +#define MPS_PORT_STAT_LB_PORT_ERROR_H 0x4ec +#define MPS_PORT_STAT_LB_PORT_64B_L 0x4f0 +#define MPS_PORT_STAT_LB_PORT_64B_H 0x4f4 +#define MPS_PORT_STAT_LB_PORT_65B_127B_L 0x4f8 +#define MPS_PORT_STAT_LB_PORT_65B_127B_H 0x4fc +#define MPS_PORT_STAT_LB_PORT_128B_255B_L 0x500 +#define MPS_PORT_STAT_LB_PORT_128B_255B_H 0x504 +#define MPS_PORT_STAT_LB_PORT_256B_511B_L 0x508 +#define MPS_PORT_STAT_LB_PORT_256B_511B_H 0x50c +#define MPS_PORT_STAT_LB_PORT_512B_1023B_L 0x510 +#define MPS_PORT_STAT_LB_PORT_512B_1023B_H 0x514 +#define MPS_PORT_STAT_LB_PORT_1024B_1518B_L 0x518 +#define MPS_PORT_STAT_LB_PORT_1024B_1518B_H 0x51c +#define MPS_PORT_STAT_LB_PORT_1519B_MAX_L 0x520 +#define MPS_PORT_STAT_LB_PORT_1519B_MAX_H 0x524 +#define MPS_PORT_STAT_LB_PORT_DROP_FRAMES 0x528 +#define MPS_PORT_STAT_RX_PORT_BYTES_L 0x540 +#define MPS_PORT_STAT_RX_PORT_BYTES_H 0x544 +#define MPS_PORT_STAT_RX_PORT_FRAMES_L 0x548 +#define MPS_PORT_STAT_RX_PORT_FRAMES_H 0x54c +#define MPS_PORT_STAT_RX_PORT_BCAST_L 0x550 +#define MPS_PORT_STAT_RX_PORT_BCAST_H 0x554 +#define MPS_PORT_STAT_RX_PORT_MCAST_L 0x558 +#define MPS_PORT_STAT_RX_PORT_MCAST_H 0x55c +#define MPS_PORT_STAT_RX_PORT_UCAST_L 0x560 +#define MPS_PORT_STAT_RX_PORT_UCAST_H 0x564 +#define MPS_PORT_STAT_RX_PORT_MTU_ERROR_L 0x568 +#define MPS_PORT_STAT_RX_PORT_MTU_ERROR_H 0x56c +#define MPS_PORT_STAT_RX_PORT_MTU_CRC_ERROR_L 0x570 +#define MPS_PORT_STAT_RX_PORT_MTU_CRC_ERROR_H 0x574 +#define MPS_PORT_STAT_RX_PORT_CRC_ERROR_L 0x578 +#define MPS_PORT_STAT_RX_PORT_CRC_ERROR_H 0x57c +#define MPS_PORT_STAT_RX_PORT_LEN_ERROR_L 0x580 +#define MPS_PORT_STAT_RX_PORT_LEN_ERROR_H 0x584 +#define MPS_PORT_STAT_RX_PORT_SYM_ERROR_L 0x588 +#define MPS_PORT_STAT_RX_PORT_SYM_ERROR_H 0x58c +#define MPS_PORT_STAT_RX_PORT_64B_L 0x590 +#define MPS_PORT_STAT_RX_PORT_64B_H 0x594 +#define MPS_PORT_STAT_RX_PORT_65B_127B_L 0x598 +#define MPS_PORT_STAT_RX_PORT_65B_127B_H 0x59c +#define MPS_PORT_STAT_RX_PORT_128B_255B_L 0x5a0 +#define MPS_PORT_STAT_RX_PORT_128B_255B_H 0x5a4 +#define MPS_PORT_STAT_RX_PORT_256B_511B_L 0x5a8 +#define MPS_PORT_STAT_RX_PORT_256B_511B_H 0x5ac +#define MPS_PORT_STAT_RX_PORT_512B_1023B_L 0x5b0 +#define MPS_PORT_STAT_RX_PORT_512B_1023B_H 0x5b4 +#define MPS_PORT_STAT_RX_PORT_1024B_1518B_L 0x5b8 +#define MPS_PORT_STAT_RX_PORT_1024B_1518B_H 0x5bc +#define MPS_PORT_STAT_RX_PORT_1519B_MAX_L 0x5c0 +#define MPS_PORT_STAT_RX_PORT_1519B_MAX_H 0x5c4 +#define MPS_PORT_STAT_RX_PORT_PAUSE_L 0x5c8 +#define MPS_PORT_STAT_RX_PORT_PAUSE_H 0x5cc +#define MPS_PORT_STAT_RX_PORT_PPP0_L 0x5d0 +#define MPS_PORT_STAT_RX_PORT_PPP0_H 0x5d4 +#define MPS_PORT_STAT_RX_PORT_PPP1_L 0x5d8 +#define MPS_PORT_STAT_RX_PORT_PPP1_H 0x5dc +#define MPS_PORT_STAT_RX_PORT_PPP2_L 0x5e0 +#define MPS_PORT_STAT_RX_PORT_PPP2_H 0x5e4 +#define MPS_PORT_STAT_RX_PORT_PPP3_L 0x5e8 +#define MPS_PORT_STAT_RX_PORT_PPP3_H 0x5ec +#define MPS_PORT_STAT_RX_PORT_PPP4_L 0x5f0 +#define MPS_PORT_STAT_RX_PORT_PPP4_H 0x5f4 +#define MPS_PORT_STAT_RX_PORT_PPP5_L 0x5f8 +#define MPS_PORT_STAT_RX_PORT_PPP5_H 0x5fc +#define MPS_PORT_STAT_RX_PORT_PPP6_L 0x600 +#define MPS_PORT_STAT_RX_PORT_PPP6_H 0x604 +#define MPS_PORT_STAT_RX_PORT_PPP7_L 0x608 +#define MPS_PORT_STAT_RX_PORT_PPP7_H 0x60c +#define MPS_PORT_STAT_RX_PORT_LESS_64B_L 0x610 +#define MPS_PORT_STAT_RX_PORT_LESS_64B_H 0x614 +#define MAC_PORT_CFG2 0x818 +#define MAC_PORT_MAGIC_MACID_LO 0x824 +#define MAC_PORT_MAGIC_MACID_HI 0x828 +#define MAC_PORT_EPIO_DATA0 0x8c0 +#define MAC_PORT_EPIO_DATA1 0x8c4 +#define MAC_PORT_EPIO_DATA2 0x8c8 +#define MAC_PORT_EPIO_DATA3 0x8cc +#define MAC_PORT_EPIO_OP 0x8d0 + +#define MPS_CMN_CTL 0x9000 +#define NUMPORTS_MASK 0x00000003U +#define NUMPORTS_SHIFT 0 +#define NUMPORTS_GET(x) (((x) & NUMPORTS_MASK) >> NUMPORTS_SHIFT) + +#define MPS_INT_CAUSE 0x9008 +#define STATINT 0x00000020U +#define TXINT 0x00000010U +#define RXINT 0x00000008U +#define TRCINT 0x00000004U +#define CLSINT 0x00000002U +#define PLINT 0x00000001U + +#define MPS_TX_INT_CAUSE 0x9408 +#define PORTERR 0x00010000U +#define FRMERR 0x00008000U +#define SECNTERR 0x00004000U +#define BUBBLE 0x00002000U +#define TXDESCFIFO 0x00001e00U +#define TXDATAFIFO 0x000001e0U +#define NCSIFIFO 0x00000010U +#define TPFIFO 0x0000000fU + +#define MPS_STAT_PERR_INT_CAUSE_SRAM 0x9614 +#define MPS_STAT_PERR_INT_CAUSE_TX_FIFO 0x9620 +#define MPS_STAT_PERR_INT_CAUSE_RX_FIFO 0x962c + +#define MPS_STAT_RX_BG_0_MAC_DROP_FRAME_L 0x9640 +#define MPS_STAT_RX_BG_0_MAC_DROP_FRAME_H 0x9644 +#define MPS_STAT_RX_BG_1_MAC_DROP_FRAME_L 0x9648 +#define MPS_STAT_RX_BG_1_MAC_DROP_FRAME_H 0x964c +#define MPS_STAT_RX_BG_2_MAC_DROP_FRAME_L 0x9650 +#define MPS_STAT_RX_BG_2_MAC_DROP_FRAME_H 0x9654 +#define MPS_STAT_RX_BG_3_MAC_DROP_FRAME_L 0x9658 +#define MPS_STAT_RX_BG_3_MAC_DROP_FRAME_H 0x965c +#define MPS_STAT_RX_BG_0_LB_DROP_FRAME_L 0x9660 +#define MPS_STAT_RX_BG_0_LB_DROP_FRAME_H 0x9664 +#define MPS_STAT_RX_BG_1_LB_DROP_FRAME_L 0x9668 +#define MPS_STAT_RX_BG_1_LB_DROP_FRAME_H 0x966c +#define MPS_STAT_RX_BG_2_LB_DROP_FRAME_L 0x9670 +#define MPS_STAT_RX_BG_2_LB_DROP_FRAME_H 0x9674 +#define MPS_STAT_RX_BG_3_LB_DROP_FRAME_L 0x9678 +#define MPS_STAT_RX_BG_3_LB_DROP_FRAME_H 0x967c +#define MPS_STAT_RX_BG_0_MAC_TRUNC_FRAME_L 0x9680 +#define MPS_STAT_RX_BG_0_MAC_TRUNC_FRAME_H 0x9684 +#define MPS_STAT_RX_BG_1_MAC_TRUNC_FRAME_L 0x9688 +#define MPS_STAT_RX_BG_1_MAC_TRUNC_FRAME_H 0x968c +#define MPS_STAT_RX_BG_2_MAC_TRUNC_FRAME_L 0x9690 +#define MPS_STAT_RX_BG_2_MAC_TRUNC_FRAME_H 0x9694 +#define MPS_STAT_RX_BG_3_MAC_TRUNC_FRAME_L 0x9698 +#define MPS_STAT_RX_BG_3_MAC_TRUNC_FRAME_H 0x969c +#define MPS_STAT_RX_BG_0_LB_TRUNC_FRAME_L 0x96a0 +#define MPS_STAT_RX_BG_0_LB_TRUNC_FRAME_H 0x96a4 +#define MPS_STAT_RX_BG_1_LB_TRUNC_FRAME_L 0x96a8 +#define MPS_STAT_RX_BG_1_LB_TRUNC_FRAME_H 0x96ac +#define MPS_STAT_RX_BG_2_LB_TRUNC_FRAME_L 0x96b0 +#define MPS_STAT_RX_BG_2_LB_TRUNC_FRAME_H 0x96b4 +#define MPS_STAT_RX_BG_3_LB_TRUNC_FRAME_L 0x96b8 +#define MPS_STAT_RX_BG_3_LB_TRUNC_FRAME_H 0x96bc +#define MPS_TRC_CFG 0x9800 +#define TRCFIFOEMPTY 0x00000010U +#define TRCIGNOREDROPINPUT 0x00000008U +#define TRCKEEPDUPLICATES 0x00000004U +#define TRCEN 0x00000002U +#define TRCMULTIFILTER 0x00000001U + +#define MPS_TRC_RSS_CONTROL 0x9808 +#define MPS_T5_TRC_RSS_CONTROL 0xa00c +#define RSSCONTROL_MASK 0x00ff0000U +#define RSSCONTROL_SHIFT 16 +#define RSSCONTROL(x) ((x) << RSSCONTROL_SHIFT) +#define QUEUENUMBER_MASK 0x0000ffffU +#define QUEUENUMBER_SHIFT 0 +#define QUEUENUMBER(x) ((x) << QUEUENUMBER_SHIFT) + +#define MPS_TRC_FILTER_MATCH_CTL_A 0x9810 +#define TFINVERTMATCH 0x01000000U +#define TFPKTTOOLARGE 0x00800000U +#define TFEN 0x00400000U +#define TFPORT_MASK 0x003c0000U +#define TFPORT_SHIFT 18 +#define TFPORT(x) ((x) << TFPORT_SHIFT) +#define TFPORT_GET(x) (((x) & TFPORT_MASK) >> TFPORT_SHIFT) +#define TFDROP 0x00020000U +#define TFSOPEOPERR 0x00010000U +#define TFLENGTH_MASK 0x00001f00U +#define TFLENGTH_SHIFT 8 +#define TFLENGTH(x) ((x) << TFLENGTH_SHIFT) +#define TFLENGTH_GET(x) (((x) & TFLENGTH_MASK) >> TFLENGTH_SHIFT) +#define TFOFFSET_MASK 0x0000001fU +#define TFOFFSET_SHIFT 0 +#define TFOFFSET(x) ((x) << TFOFFSET_SHIFT) +#define TFOFFSET_GET(x) (((x) & TFOFFSET_MASK) >> TFOFFSET_SHIFT) + +#define MPS_TRC_FILTER_MATCH_CTL_B 0x9820 +#define TFMINPKTSIZE_MASK 0x01ff0000U +#define TFMINPKTSIZE_SHIFT 16 +#define TFMINPKTSIZE(x) ((x) << TFMINPKTSIZE_SHIFT) +#define TFMINPKTSIZE_GET(x) (((x) & TFMINPKTSIZE_MASK) >> TFMINPKTSIZE_SHIFT) +#define TFCAPTUREMAX_MASK 0x00003fffU +#define TFCAPTUREMAX_SHIFT 0 +#define TFCAPTUREMAX(x) ((x) << TFCAPTUREMAX_SHIFT) +#define TFCAPTUREMAX_GET(x) (((x) & TFCAPTUREMAX_MASK) >> TFCAPTUREMAX_SHIFT) + +#define MPS_TRC_INT_CAUSE 0x985c +#define MISCPERR 0x00000100U +#define PKTFIFO 0x000000f0U +#define FILTMEM 0x0000000fU + +#define MPS_TRC_FILTER0_MATCH 0x9c00 +#define MPS_TRC_FILTER0_DONT_CARE 0x9c80 +#define MPS_TRC_FILTER1_MATCH 0x9d00 +#define MPS_CLS_INT_CAUSE 0xd028 +#define PLERRENB 0x00000008U +#define HASHSRAM 0x00000004U +#define MATCHTCAM 0x00000002U +#define MATCHSRAM 0x00000001U + +#define MPS_RX_PERR_INT_CAUSE 0x11074 + +#define CPL_INTR_CAUSE 0x19054 +#define CIM_OP_MAP_PERR 0x00000020U +#define CIM_OVFL_ERROR 0x00000010U +#define TP_FRAMING_ERROR 0x00000008U +#define SGE_FRAMING_ERROR 0x00000004U +#define CIM_FRAMING_ERROR 0x00000002U +#define ZERO_SWITCH_ERROR 0x00000001U + +#define SMB_INT_CAUSE 0x19090 +#define MSTTXFIFOPARINT 0x00200000U +#define MSTRXFIFOPARINT 0x00100000U +#define SLVFIFOPARINT 0x00080000U + +#define ULP_RX_INT_CAUSE 0x19158 +#define ULP_RX_ISCSI_TAGMASK 0x19164 +#define ULP_RX_ISCSI_PSZ 0x19168 +#define HPZ3_MASK 0x0f000000U +#define HPZ3_SHIFT 24 +#define HPZ3(x) ((x) << HPZ3_SHIFT) +#define HPZ2_MASK 0x000f0000U +#define HPZ2_SHIFT 16 +#define HPZ2(x) ((x) << HPZ2_SHIFT) +#define HPZ1_MASK 0x00000f00U +#define HPZ1_SHIFT 8 +#define HPZ1(x) ((x) << HPZ1_SHIFT) +#define HPZ0_MASK 0x0000000fU +#define HPZ0_SHIFT 0 +#define HPZ0(x) ((x) << HPZ0_SHIFT) + +#define ULP_RX_TDDP_PSZ 0x19178 + +#define SF_DATA 0x193f8 +#define SF_OP 0x193fc +#define SF_BUSY 0x80000000U +#define SF_LOCK 0x00000010U +#define SF_CONT 0x00000008U +#define BYTECNT_MASK 0x00000006U +#define BYTECNT_SHIFT 1 +#define BYTECNT(x) ((x) << BYTECNT_SHIFT) +#define OP_WR 0x00000001U + +#define PL_PF_INT_CAUSE 0x3c0 +#define PFSW 0x00000008U +#define PFSGE 0x00000004U +#define PFCIM 0x00000002U +#define PFMPS 0x00000001U + +#define PL_PF_INT_ENABLE 0x3c4 +#define PL_PF_CTL 0x3c8 +#define SWINT 0x00000001U + +#define PL_WHOAMI 0x19400 +#define SOURCEPF_MASK 0x00000700U +#define SOURCEPF_SHIFT 8 +#define SOURCEPF(x) ((x) << SOURCEPF_SHIFT) +#define SOURCEPF_GET(x) (((x) & SOURCEPF_MASK) >> SOURCEPF_SHIFT) +#define ISVF 0x00000080U +#define VFID_MASK 0x0000007fU +#define VFID_SHIFT 0 +#define VFID(x) ((x) << VFID_SHIFT) +#define VFID_GET(x) (((x) & VFID_MASK) >> VFID_SHIFT) + +#define PL_INT_CAUSE 0x1940c +#define ULP_TX 0x08000000U +#define SGE 0x04000000U +#define HMA 0x02000000U +#define CPL_SWITCH 0x01000000U +#define ULP_RX 0x00800000U +#define PM_RX 0x00400000U +#define PM_TX 0x00200000U +#define MA 0x00100000U +#define TP 0x00080000U +#define LE 0x00040000U +#define EDC1 0x00020000U +#define EDC0 0x00010000U +#define MC 0x00008000U +#define PCIE 0x00004000U +#define PMU 0x00002000U +#define XGMAC_KR1 0x00001000U +#define XGMAC_KR0 0x00000800U +#define XGMAC1 0x00000400U +#define XGMAC0 0x00000200U +#define SMB 0x00000100U +#define SF 0x00000080U +#define PL 0x00000040U +#define NCSI 0x00000020U +#define MPS 0x00000010U +#define MI 0x00000008U +#define DBG 0x00000004U +#define I2CM 0x00000002U +#define CIM 0x00000001U + +#define MC1 0x31 +#define PL_INT_ENABLE 0x19410 +#define PL_INT_MAP0 0x19414 +#define PL_RST 0x19428 +#define PIORST 0x00000002U +#define PIORSTMODE 0x00000001U + +#define PL_PL_INT_CAUSE 0x19430 +#define FATALPERR 0x00000010U +#define PERRVFID 0x00000001U + +#define PL_REV 0x1943c + +#define S_REV 0 +#define M_REV 0xfU +#define V_REV(x) ((x) << S_REV) +#define G_REV(x) (((x) >> S_REV) & M_REV) + +#define LE_DB_CONFIG 0x19c04 +#define HASHEN 0x00100000U + +#define LE_DB_SERVER_INDEX 0x19c18 +#define LE_DB_ACT_CNT_IPV4 0x19c20 +#define LE_DB_ACT_CNT_IPV6 0x19c24 + +#define LE_DB_INT_CAUSE 0x19c3c +#define REQQPARERR 0x00010000U +#define UNKNOWNCMD 0x00008000U +#define PARITYERR 0x00000040U +#define LIPMISS 0x00000020U +#define LIP0 0x00000010U + +#define LE_DB_TID_HASHBASE 0x19df8 + +#define NCSI_INT_CAUSE 0x1a0d8 +#define CIM_DM_PRTY_ERR 0x00000100U +#define MPS_DM_PRTY_ERR 0x00000080U +#define TXFIFO_PRTY_ERR 0x00000002U +#define RXFIFO_PRTY_ERR 0x00000001U + +#define XGMAC_PORT_CFG2 0x1018 +#define PATEN 0x00040000U +#define MAGICEN 0x00020000U + +#define XGMAC_PORT_MAGIC_MACID_LO 0x1024 +#define XGMAC_PORT_MAGIC_MACID_HI 0x1028 + +#define XGMAC_PORT_EPIO_DATA0 0x10c0 +#define XGMAC_PORT_EPIO_DATA1 0x10c4 +#define XGMAC_PORT_EPIO_DATA2 0x10c8 +#define XGMAC_PORT_EPIO_DATA3 0x10cc +#define XGMAC_PORT_EPIO_OP 0x10d0 +#define EPIOWR 0x00000100U +#define ADDRESS_MASK 0x000000ffU +#define ADDRESS_SHIFT 0 +#define ADDRESS(x) ((x) << ADDRESS_SHIFT) + +#define MAC_PORT_INT_CAUSE 0x8dc +#define XGMAC_PORT_INT_CAUSE 0x10dc + +#define A_TP_TX_MOD_QUEUE_REQ_MAP 0x7e28 + +#define A_TP_TX_MOD_CHANNEL_WEIGHT 0x7e34 + +#define S_TX_MOD_QUEUE_REQ_MAP 0 +#define M_TX_MOD_QUEUE_REQ_MAP 0xffffU +#define V_TX_MOD_QUEUE_REQ_MAP(x) ((x) << S_TX_MOD_QUEUE_REQ_MAP) + +#define A_TP_TX_MOD_QUEUE_WEIGHT0 0x7e30 + +#define S_TX_MODQ_WEIGHT3 24 +#define M_TX_MODQ_WEIGHT3 0xffU +#define V_TX_MODQ_WEIGHT3(x) ((x) << S_TX_MODQ_WEIGHT3) + +#define S_TX_MODQ_WEIGHT2 16 +#define M_TX_MODQ_WEIGHT2 0xffU +#define V_TX_MODQ_WEIGHT2(x) ((x) << S_TX_MODQ_WEIGHT2) + +#define S_TX_MODQ_WEIGHT1 8 +#define M_TX_MODQ_WEIGHT1 0xffU +#define V_TX_MODQ_WEIGHT1(x) ((x) << S_TX_MODQ_WEIGHT1) + +#define S_TX_MODQ_WEIGHT0 0 +#define M_TX_MODQ_WEIGHT0 0xffU +#define V_TX_MODQ_WEIGHT0(x) ((x) << S_TX_MODQ_WEIGHT0) + +#define A_TP_TX_SCHED_HDR 0x23 + +#define A_TP_TX_SCHED_FIFO 0x24 + +#define A_TP_TX_SCHED_PCMD 0x25 + +#define S_VNIC 11 +#define V_VNIC(x) ((x) << S_VNIC) +#define F_VNIC V_VNIC(1U) + +#define S_FRAGMENTATION 9 +#define V_FRAGMENTATION(x) ((x) << S_FRAGMENTATION) +#define F_FRAGMENTATION V_FRAGMENTATION(1U) + +#define S_MPSHITTYPE 8 +#define V_MPSHITTYPE(x) ((x) << S_MPSHITTYPE) +#define F_MPSHITTYPE V_MPSHITTYPE(1U) + +#define S_MACMATCH 7 +#define V_MACMATCH(x) ((x) << S_MACMATCH) +#define F_MACMATCH V_MACMATCH(1U) + +#define S_ETHERTYPE 6 +#define V_ETHERTYPE(x) ((x) << S_ETHERTYPE) +#define F_ETHERTYPE V_ETHERTYPE(1U) + +#define S_PROTOCOL 5 +#define V_PROTOCOL(x) ((x) << S_PROTOCOL) +#define F_PROTOCOL V_PROTOCOL(1U) + +#define S_TOS 4 +#define V_TOS(x) ((x) << S_TOS) +#define F_TOS V_TOS(1U) + +#define S_VLAN 3 +#define V_VLAN(x) ((x) << S_VLAN) +#define F_VLAN V_VLAN(1U) + +#define S_VNIC_ID 2 +#define V_VNIC_ID(x) ((x) << S_VNIC_ID) +#define F_VNIC_ID V_VNIC_ID(1U) + +#define S_PORT 1 +#define V_PORT(x) ((x) << S_PORT) +#define F_PORT V_PORT(1U) + +#define S_FCOE 0 +#define V_FCOE(x) ((x) << S_FCOE) +#define F_FCOE V_FCOE(1U) + +#define NUM_MPS_CLS_SRAM_L_INSTANCES 336 +#define NUM_MPS_T5_CLS_SRAM_L_INSTANCES 512 + +#define T5_PORT0_BASE 0x30000 +#define T5_PORT_STRIDE 0x4000 +#define T5_PORT_BASE(idx) (T5_PORT0_BASE + (idx) * T5_PORT_STRIDE) +#define T5_PORT_REG(idx, reg) (T5_PORT_BASE(idx) + (reg)) + +#define MC_0_BASE_ADDR 0x40000 +#define MC_1_BASE_ADDR 0x48000 +#define MC_STRIDE (MC_1_BASE_ADDR - MC_0_BASE_ADDR) +#define MC_REG(reg, idx) (reg + MC_STRIDE * idx) + +#define MC_P_BIST_CMD 0x41400 +#define MC_P_BIST_CMD_ADDR 0x41404 +#define MC_P_BIST_CMD_LEN 0x41408 +#define MC_P_BIST_DATA_PATTERN 0x4140c +#define MC_P_BIST_STATUS_RDATA 0x41488 +#define EDC_T50_BASE_ADDR 0x50000 +#define EDC_H_BIST_CMD 0x50004 +#define EDC_H_BIST_CMD_ADDR 0x50008 +#define EDC_H_BIST_CMD_LEN 0x5000c +#define EDC_H_BIST_DATA_PATTERN 0x50010 +#define EDC_H_BIST_STATUS_RDATA 0x50028 + +#define EDC_T51_BASE_ADDR 0x50800 +#define EDC_STRIDE_T5 (EDC_T51_BASE_ADDR - EDC_T50_BASE_ADDR) +#define EDC_REG_T5(reg, idx) (reg + EDC_STRIDE_T5 * idx) + +#define A_PL_VF_REV 0x4 +#define A_PL_VF_WHOAMI 0x0 +#define A_PL_VF_REVISION 0x8 + +#define S_CHIPID 4 +#define M_CHIPID 0xfU +#define V_CHIPID(x) ((x) << S_CHIPID) +#define G_CHIPID(x) (((x) >> S_CHIPID) & M_CHIPID) + +/* TP_VLAN_PRI_MAP controls which subset of fields will be present in the + * Compressed Filter Tuple for LE filters. Each bit set in TP_VLAN_PRI_MAP + * selects for a particular field being present. These fields, when present + * in the Compressed Filter Tuple, have the following widths in bits. + */ +#define W_FT_FCOE 1 +#define W_FT_PORT 3 +#define W_FT_VNIC_ID 17 +#define W_FT_VLAN 17 +#define W_FT_TOS 8 +#define W_FT_PROTOCOL 8 +#define W_FT_ETHERTYPE 16 +#define W_FT_MACMATCH 9 +#define W_FT_MPSHITTYPE 3 +#define W_FT_FRAGMENTATION 1 + +/* Some of the Compressed Filter Tuple fields have internal structure. These + * bit shifts/masks describe those structures. All shifts are relative to the + * base position of the fields within the Compressed Filter Tuple + */ +#define S_FT_VLAN_VLD 16 +#define V_FT_VLAN_VLD(x) ((x) << S_FT_VLAN_VLD) +#define F_FT_VLAN_VLD V_FT_VLAN_VLD(1U) + +#define S_FT_VNID_ID_VF 0 +#define V_FT_VNID_ID_VF(x) ((x) << S_FT_VNID_ID_VF) + +#define S_FT_VNID_ID_PF 7 +#define V_FT_VNID_ID_PF(x) ((x) << S_FT_VNID_ID_PF) + +#define S_FT_VNID_ID_VLD 16 +#define V_FT_VNID_ID_VLD(x) ((x) << S_FT_VNID_ID_VLD) + +#endif /* __T4_REGS_H */ diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h b/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h new file mode 100644 index 0000000..3409756 --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h @@ -0,0 +1,2282 @@ +/* + * This file is part of the Chelsio T4 Ethernet driver for Linux. + * + * Copyright (c) 2009-2014 Chelsio Communications, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _T4FW_INTERFACE_H_ +#define _T4FW_INTERFACE_H_ + +enum fw_retval { + FW_SUCCESS = 0, /* completed sucessfully */ + FW_EPERM = 1, /* operation not permitted */ + FW_ENOENT = 2, /* no such file or directory */ + FW_EIO = 5, /* input/output error; hw bad */ + FW_ENOEXEC = 8, /* exec format error; inv microcode */ + FW_EAGAIN = 11, /* try again */ + FW_ENOMEM = 12, /* out of memory */ + FW_EFAULT = 14, /* bad address; fw bad */ + FW_EBUSY = 16, /* resource busy */ + FW_EEXIST = 17, /* file exists */ + FW_ENODEV = 19, /* no such device */ + FW_EINVAL = 22, /* invalid argument */ + FW_ENOSPC = 28, /* no space left on device */ + FW_ENOSYS = 38, /* functionality not implemented */ + FW_ENODATA = 61, /* no data available */ + FW_EPROTO = 71, /* protocol error */ + FW_EADDRINUSE = 98, /* address already in use */ + FW_EADDRNOTAVAIL = 99, /* cannot assigned requested address */ + FW_ENETDOWN = 100, /* network is down */ + FW_ENETUNREACH = 101, /* network is unreachable */ + FW_ENOBUFS = 105, /* no buffer space available */ + FW_ETIMEDOUT = 110, /* timeout */ + FW_EINPROGRESS = 115, /* fw internal */ + FW_SCSI_ABORT_REQUESTED = 128, /* */ + FW_SCSI_ABORT_TIMEDOUT = 129, /* */ + FW_SCSI_ABORTED = 130, /* */ + FW_SCSI_CLOSE_REQUESTED = 131, /* */ + FW_ERR_LINK_DOWN = 132, /* */ + FW_RDEV_NOT_READY = 133, /* */ + FW_ERR_RDEV_LOST = 134, /* */ + FW_ERR_RDEV_LOGO = 135, /* */ + FW_FCOE_NO_XCHG = 136, /* */ + FW_SCSI_RSP_ERR = 137, /* */ + FW_ERR_RDEV_IMPL_LOGO = 138, /* */ + FW_SCSI_UNDER_FLOW_ERR = 139, /* */ + FW_SCSI_OVER_FLOW_ERR = 140, /* */ + FW_SCSI_DDP_ERR = 141, /* DDP error*/ + FW_SCSI_TASK_ERR = 142, /* No SCSI tasks available */ +}; + +#define FW_T4VF_SGE_BASE_ADDR 0x0000 +#define FW_T4VF_MPS_BASE_ADDR 0x0100 +#define FW_T4VF_PL_BASE_ADDR 0x0200 +#define FW_T4VF_MBDATA_BASE_ADDR 0x0240 +#define FW_T4VF_CIM_BASE_ADDR 0x0300 + +enum fw_wr_opcodes { + FW_FILTER_WR = 0x02, + FW_ULPTX_WR = 0x04, + FW_TP_WR = 0x05, + FW_ETH_TX_PKT_WR = 0x08, + FW_OFLD_CONNECTION_WR = 0x2f, + FW_FLOWC_WR = 0x0a, + FW_OFLD_TX_DATA_WR = 0x0b, + FW_CMD_WR = 0x10, + FW_ETH_TX_PKT_VM_WR = 0x11, + FW_RI_RES_WR = 0x0c, + FW_RI_INIT_WR = 0x0d, + FW_RI_RDMA_WRITE_WR = 0x14, + FW_RI_SEND_WR = 0x15, + FW_RI_RDMA_READ_WR = 0x16, + FW_RI_RECV_WR = 0x17, + FW_RI_BIND_MW_WR = 0x18, + FW_RI_FR_NSMR_WR = 0x19, + FW_RI_INV_LSTAG_WR = 0x1a, + FW_LASTC2E_WR = 0x40 +}; + +struct fw_wr_hdr { + __be32 hi; + __be32 lo; +}; + +#define FW_WR_OP(x) ((x) << 24) +#define FW_WR_OP_GET(x) (((x) >> 24) & 0xff) +#define FW_WR_ATOMIC(x) ((x) << 23) +#define FW_WR_FLUSH(x) ((x) << 22) +#define FW_WR_COMPL(x) ((x) << 21) +#define FW_WR_IMMDLEN_MASK 0xff +#define FW_WR_IMMDLEN(x) ((x) << 0) + +#define FW_WR_EQUIQ (1U << 31) +#define FW_WR_EQUEQ (1U << 30) +#define FW_WR_FLOWID(x) ((x) << 8) +#define FW_WR_LEN16(x) ((x) << 0) + +#define HW_TPL_FR_MT_PR_IV_P_FC 0X32B +#define HW_TPL_FR_MT_PR_OV_P_FC 0X327 + +/* filter wr reply code in cookie in CPL_SET_TCB_RPL */ +enum fw_filter_wr_cookie { + FW_FILTER_WR_SUCCESS, + FW_FILTER_WR_FLT_ADDED, + FW_FILTER_WR_FLT_DELETED, + FW_FILTER_WR_SMT_TBL_FULL, + FW_FILTER_WR_EINVAL, +}; + +struct fw_filter_wr { + __be32 op_pkd; + __be32 len16_pkd; + __be64 r3; + __be32 tid_to_iq; + __be32 del_filter_to_l2tix; + __be16 ethtype; + __be16 ethtypem; + __u8 frag_to_ovlan_vldm; + __u8 smac_sel; + __be16 rx_chan_rx_rpl_iq; + __be32 maci_to_matchtypem; + __u8 ptcl; + __u8 ptclm; + __u8 ttyp; + __u8 ttypm; + __be16 ivlan; + __be16 ivlanm; + __be16 ovlan; + __be16 ovlanm; + __u8 lip[16]; + __u8 lipm[16]; + __u8 fip[16]; + __u8 fipm[16]; + __be16 lp; + __be16 lpm; + __be16 fp; + __be16 fpm; + __be16 r7; + __u8 sma[6]; +}; + +#define S_FW_FILTER_WR_TID 12 +#define M_FW_FILTER_WR_TID 0xfffff +#define V_FW_FILTER_WR_TID(x) ((x) << S_FW_FILTER_WR_TID) +#define G_FW_FILTER_WR_TID(x) \ + (((x) >> S_FW_FILTER_WR_TID) & M_FW_FILTER_WR_TID) + +#define S_FW_FILTER_WR_RQTYPE 11 +#define M_FW_FILTER_WR_RQTYPE 0x1 +#define V_FW_FILTER_WR_RQTYPE(x) ((x) << S_FW_FILTER_WR_RQTYPE) +#define G_FW_FILTER_WR_RQTYPE(x) \ + (((x) >> S_FW_FILTER_WR_RQTYPE) & M_FW_FILTER_WR_RQTYPE) +#define F_FW_FILTER_WR_RQTYPE V_FW_FILTER_WR_RQTYPE(1U) + +#define S_FW_FILTER_WR_NOREPLY 10 +#define M_FW_FILTER_WR_NOREPLY 0x1 +#define V_FW_FILTER_WR_NOREPLY(x) ((x) << S_FW_FILTER_WR_NOREPLY) +#define G_FW_FILTER_WR_NOREPLY(x) \ + (((x) >> S_FW_FILTER_WR_NOREPLY) & M_FW_FILTER_WR_NOREPLY) +#define F_FW_FILTER_WR_NOREPLY V_FW_FILTER_WR_NOREPLY(1U) + +#define S_FW_FILTER_WR_IQ 0 +#define M_FW_FILTER_WR_IQ 0x3ff +#define V_FW_FILTER_WR_IQ(x) ((x) << S_FW_FILTER_WR_IQ) +#define G_FW_FILTER_WR_IQ(x) \ + (((x) >> S_FW_FILTER_WR_IQ) & M_FW_FILTER_WR_IQ) + +#define S_FW_FILTER_WR_DEL_FILTER 31 +#define M_FW_FILTER_WR_DEL_FILTER 0x1 +#define V_FW_FILTER_WR_DEL_FILTER(x) ((x) << S_FW_FILTER_WR_DEL_FILTER) +#define G_FW_FILTER_WR_DEL_FILTER(x) \ + (((x) >> S_FW_FILTER_WR_DEL_FILTER) & M_FW_FILTER_WR_DEL_FILTER) +#define F_FW_FILTER_WR_DEL_FILTER V_FW_FILTER_WR_DEL_FILTER(1U) + +#define S_FW_FILTER_WR_RPTTID 25 +#define M_FW_FILTER_WR_RPTTID 0x1 +#define V_FW_FILTER_WR_RPTTID(x) ((x) << S_FW_FILTER_WR_RPTTID) +#define G_FW_FILTER_WR_RPTTID(x) \ + (((x) >> S_FW_FILTER_WR_RPTTID) & M_FW_FILTER_WR_RPTTID) +#define F_FW_FILTER_WR_RPTTID V_FW_FILTER_WR_RPTTID(1U) + +#define S_FW_FILTER_WR_DROP 24 +#define M_FW_FILTER_WR_DROP 0x1 +#define V_FW_FILTER_WR_DROP(x) ((x) << S_FW_FILTER_WR_DROP) +#define G_FW_FILTER_WR_DROP(x) \ + (((x) >> S_FW_FILTER_WR_DROP) & M_FW_FILTER_WR_DROP) +#define F_FW_FILTER_WR_DROP V_FW_FILTER_WR_DROP(1U) + +#define S_FW_FILTER_WR_DIRSTEER 23 +#define M_FW_FILTER_WR_DIRSTEER 0x1 +#define V_FW_FILTER_WR_DIRSTEER(x) ((x) << S_FW_FILTER_WR_DIRSTEER) +#define G_FW_FILTER_WR_DIRSTEER(x) \ + (((x) >> S_FW_FILTER_WR_DIRSTEER) & M_FW_FILTER_WR_DIRSTEER) +#define F_FW_FILTER_WR_DIRSTEER V_FW_FILTER_WR_DIRSTEER(1U) + +#define S_FW_FILTER_WR_MASKHASH 22 +#define M_FW_FILTER_WR_MASKHASH 0x1 +#define V_FW_FILTER_WR_MASKHASH(x) ((x) << S_FW_FILTER_WR_MASKHASH) +#define G_FW_FILTER_WR_MASKHASH(x) \ + (((x) >> S_FW_FILTER_WR_MASKHASH) & M_FW_FILTER_WR_MASKHASH) +#define F_FW_FILTER_WR_MASKHASH V_FW_FILTER_WR_MASKHASH(1U) + +#define S_FW_FILTER_WR_DIRSTEERHASH 21 +#define M_FW_FILTER_WR_DIRSTEERHASH 0x1 +#define V_FW_FILTER_WR_DIRSTEERHASH(x) ((x) << S_FW_FILTER_WR_DIRSTEERHASH) +#define G_FW_FILTER_WR_DIRSTEERHASH(x) \ + (((x) >> S_FW_FILTER_WR_DIRSTEERHASH) & M_FW_FILTER_WR_DIRSTEERHASH) +#define F_FW_FILTER_WR_DIRSTEERHASH V_FW_FILTER_WR_DIRSTEERHASH(1U) + +#define S_FW_FILTER_WR_LPBK 20 +#define M_FW_FILTER_WR_LPBK 0x1 +#define V_FW_FILTER_WR_LPBK(x) ((x) << S_FW_FILTER_WR_LPBK) +#define G_FW_FILTER_WR_LPBK(x) \ + (((x) >> S_FW_FILTER_WR_LPBK) & M_FW_FILTER_WR_LPBK) +#define F_FW_FILTER_WR_LPBK V_FW_FILTER_WR_LPBK(1U) + +#define S_FW_FILTER_WR_DMAC 19 +#define M_FW_FILTER_WR_DMAC 0x1 +#define V_FW_FILTER_WR_DMAC(x) ((x) << S_FW_FILTER_WR_DMAC) +#define G_FW_FILTER_WR_DMAC(x) \ + (((x) >> S_FW_FILTER_WR_DMAC) & M_FW_FILTER_WR_DMAC) +#define F_FW_FILTER_WR_DMAC V_FW_FILTER_WR_DMAC(1U) + +#define S_FW_FILTER_WR_SMAC 18 +#define M_FW_FILTER_WR_SMAC 0x1 +#define V_FW_FILTER_WR_SMAC(x) ((x) << S_FW_FILTER_WR_SMAC) +#define G_FW_FILTER_WR_SMAC(x) \ + (((x) >> S_FW_FILTER_WR_SMAC) & M_FW_FILTER_WR_SMAC) +#define F_FW_FILTER_WR_SMAC V_FW_FILTER_WR_SMAC(1U) + +#define S_FW_FILTER_WR_INSVLAN 17 +#define M_FW_FILTER_WR_INSVLAN 0x1 +#define V_FW_FILTER_WR_INSVLAN(x) ((x) << S_FW_FILTER_WR_INSVLAN) +#define G_FW_FILTER_WR_INSVLAN(x) \ + (((x) >> S_FW_FILTER_WR_INSVLAN) & M_FW_FILTER_WR_INSVLAN) +#define F_FW_FILTER_WR_INSVLAN V_FW_FILTER_WR_INSVLAN(1U) + +#define S_FW_FILTER_WR_RMVLAN 16 +#define M_FW_FILTER_WR_RMVLAN 0x1 +#define V_FW_FILTER_WR_RMVLAN(x) ((x) << S_FW_FILTER_WR_RMVLAN) +#define G_FW_FILTER_WR_RMVLAN(x) \ + (((x) >> S_FW_FILTER_WR_RMVLAN) & M_FW_FILTER_WR_RMVLAN) +#define F_FW_FILTER_WR_RMVLAN V_FW_FILTER_WR_RMVLAN(1U) + +#define S_FW_FILTER_WR_HITCNTS 15 +#define M_FW_FILTER_WR_HITCNTS 0x1 +#define V_FW_FILTER_WR_HITCNTS(x) ((x) << S_FW_FILTER_WR_HITCNTS) +#define G_FW_FILTER_WR_HITCNTS(x) \ + (((x) >> S_FW_FILTER_WR_HITCNTS) & M_FW_FILTER_WR_HITCNTS) +#define F_FW_FILTER_WR_HITCNTS V_FW_FILTER_WR_HITCNTS(1U) + +#define S_FW_FILTER_WR_TXCHAN 13 +#define M_FW_FILTER_WR_TXCHAN 0x3 +#define V_FW_FILTER_WR_TXCHAN(x) ((x) << S_FW_FILTER_WR_TXCHAN) +#define G_FW_FILTER_WR_TXCHAN(x) \ + (((x) >> S_FW_FILTER_WR_TXCHAN) & M_FW_FILTER_WR_TXCHAN) + +#define S_FW_FILTER_WR_PRIO 12 +#define M_FW_FILTER_WR_PRIO 0x1 +#define V_FW_FILTER_WR_PRIO(x) ((x) << S_FW_FILTER_WR_PRIO) +#define G_FW_FILTER_WR_PRIO(x) \ + (((x) >> S_FW_FILTER_WR_PRIO) & M_FW_FILTER_WR_PRIO) +#define F_FW_FILTER_WR_PRIO V_FW_FILTER_WR_PRIO(1U) + +#define S_FW_FILTER_WR_L2TIX 0 +#define M_FW_FILTER_WR_L2TIX 0xfff +#define V_FW_FILTER_WR_L2TIX(x) ((x) << S_FW_FILTER_WR_L2TIX) +#define G_FW_FILTER_WR_L2TIX(x) \ + (((x) >> S_FW_FILTER_WR_L2TIX) & M_FW_FILTER_WR_L2TIX) + +#define S_FW_FILTER_WR_FRAG 7 +#define M_FW_FILTER_WR_FRAG 0x1 +#define V_FW_FILTER_WR_FRAG(x) ((x) << S_FW_FILTER_WR_FRAG) +#define G_FW_FILTER_WR_FRAG(x) \ + (((x) >> S_FW_FILTER_WR_FRAG) & M_FW_FILTER_WR_FRAG) +#define F_FW_FILTER_WR_FRAG V_FW_FILTER_WR_FRAG(1U) + +#define S_FW_FILTER_WR_FRAGM 6 +#define M_FW_FILTER_WR_FRAGM 0x1 +#define V_FW_FILTER_WR_FRAGM(x) ((x) << S_FW_FILTER_WR_FRAGM) +#define G_FW_FILTER_WR_FRAGM(x) \ + (((x) >> S_FW_FILTER_WR_FRAGM) & M_FW_FILTER_WR_FRAGM) +#define F_FW_FILTER_WR_FRAGM V_FW_FILTER_WR_FRAGM(1U) + +#define S_FW_FILTER_WR_IVLAN_VLD 5 +#define M_FW_FILTER_WR_IVLAN_VLD 0x1 +#define V_FW_FILTER_WR_IVLAN_VLD(x) ((x) << S_FW_FILTER_WR_IVLAN_VLD) +#define G_FW_FILTER_WR_IVLAN_VLD(x) \ + (((x) >> S_FW_FILTER_WR_IVLAN_VLD) & M_FW_FILTER_WR_IVLAN_VLD) +#define F_FW_FILTER_WR_IVLAN_VLD V_FW_FILTER_WR_IVLAN_VLD(1U) + +#define S_FW_FILTER_WR_OVLAN_VLD 4 +#define M_FW_FILTER_WR_OVLAN_VLD 0x1 +#define V_FW_FILTER_WR_OVLAN_VLD(x) ((x) << S_FW_FILTER_WR_OVLAN_VLD) +#define G_FW_FILTER_WR_OVLAN_VLD(x) \ + (((x) >> S_FW_FILTER_WR_OVLAN_VLD) & M_FW_FILTER_WR_OVLAN_VLD) +#define F_FW_FILTER_WR_OVLAN_VLD V_FW_FILTER_WR_OVLAN_VLD(1U) + +#define S_FW_FILTER_WR_IVLAN_VLDM 3 +#define M_FW_FILTER_WR_IVLAN_VLDM 0x1 +#define V_FW_FILTER_WR_IVLAN_VLDM(x) ((x) << S_FW_FILTER_WR_IVLAN_VLDM) +#define G_FW_FILTER_WR_IVLAN_VLDM(x) \ + (((x) >> S_FW_FILTER_WR_IVLAN_VLDM) & M_FW_FILTER_WR_IVLAN_VLDM) +#define F_FW_FILTER_WR_IVLAN_VLDM V_FW_FILTER_WR_IVLAN_VLDM(1U) + +#define S_FW_FILTER_WR_OVLAN_VLDM 2 +#define M_FW_FILTER_WR_OVLAN_VLDM 0x1 +#define V_FW_FILTER_WR_OVLAN_VLDM(x) ((x) << S_FW_FILTER_WR_OVLAN_VLDM) +#define G_FW_FILTER_WR_OVLAN_VLDM(x) \ + (((x) >> S_FW_FILTER_WR_OVLAN_VLDM) & M_FW_FILTER_WR_OVLAN_VLDM) +#define F_FW_FILTER_WR_OVLAN_VLDM V_FW_FILTER_WR_OVLAN_VLDM(1U) + +#define S_FW_FILTER_WR_RX_CHAN 15 +#define M_FW_FILTER_WR_RX_CHAN 0x1 +#define V_FW_FILTER_WR_RX_CHAN(x) ((x) << S_FW_FILTER_WR_RX_CHAN) +#define G_FW_FILTER_WR_RX_CHAN(x) \ + (((x) >> S_FW_FILTER_WR_RX_CHAN) & M_FW_FILTER_WR_RX_CHAN) +#define F_FW_FILTER_WR_RX_CHAN V_FW_FILTER_WR_RX_CHAN(1U) + +#define S_FW_FILTER_WR_RX_RPL_IQ 0 +#define M_FW_FILTER_WR_RX_RPL_IQ 0x3ff +#define V_FW_FILTER_WR_RX_RPL_IQ(x) ((x) << S_FW_FILTER_WR_RX_RPL_IQ) +#define G_FW_FILTER_WR_RX_RPL_IQ(x) \ + (((x) >> S_FW_FILTER_WR_RX_RPL_IQ) & M_FW_FILTER_WR_RX_RPL_IQ) + +#define S_FW_FILTER_WR_MACI 23 +#define M_FW_FILTER_WR_MACI 0x1ff +#define V_FW_FILTER_WR_MACI(x) ((x) << S_FW_FILTER_WR_MACI) +#define G_FW_FILTER_WR_MACI(x) \ + (((x) >> S_FW_FILTER_WR_MACI) & M_FW_FILTER_WR_MACI) + +#define S_FW_FILTER_WR_MACIM 14 +#define M_FW_FILTER_WR_MACIM 0x1ff +#define V_FW_FILTER_WR_MACIM(x) ((x) << S_FW_FILTER_WR_MACIM) +#define G_FW_FILTER_WR_MACIM(x) \ + (((x) >> S_FW_FILTER_WR_MACIM) & M_FW_FILTER_WR_MACIM) + +#define S_FW_FILTER_WR_FCOE 13 +#define M_FW_FILTER_WR_FCOE 0x1 +#define V_FW_FILTER_WR_FCOE(x) ((x) << S_FW_FILTER_WR_FCOE) +#define G_FW_FILTER_WR_FCOE(x) \ + (((x) >> S_FW_FILTER_WR_FCOE) & M_FW_FILTER_WR_FCOE) +#define F_FW_FILTER_WR_FCOE V_FW_FILTER_WR_FCOE(1U) + +#define S_FW_FILTER_WR_FCOEM 12 +#define M_FW_FILTER_WR_FCOEM 0x1 +#define V_FW_FILTER_WR_FCOEM(x) ((x) << S_FW_FILTER_WR_FCOEM) +#define G_FW_FILTER_WR_FCOEM(x) \ + (((x) >> S_FW_FILTER_WR_FCOEM) & M_FW_FILTER_WR_FCOEM) +#define F_FW_FILTER_WR_FCOEM V_FW_FILTER_WR_FCOEM(1U) + +#define S_FW_FILTER_WR_PORT 9 +#define M_FW_FILTER_WR_PORT 0x7 +#define V_FW_FILTER_WR_PORT(x) ((x) << S_FW_FILTER_WR_PORT) +#define G_FW_FILTER_WR_PORT(x) \ + (((x) >> S_FW_FILTER_WR_PORT) & M_FW_FILTER_WR_PORT) + +#define S_FW_FILTER_WR_PORTM 6 +#define M_FW_FILTER_WR_PORTM 0x7 +#define V_FW_FILTER_WR_PORTM(x) ((x) << S_FW_FILTER_WR_PORTM) +#define G_FW_FILTER_WR_PORTM(x) \ + (((x) >> S_FW_FILTER_WR_PORTM) & M_FW_FILTER_WR_PORTM) + +#define S_FW_FILTER_WR_MATCHTYPE 3 +#define M_FW_FILTER_WR_MATCHTYPE 0x7 +#define V_FW_FILTER_WR_MATCHTYPE(x) ((x) << S_FW_FILTER_WR_MATCHTYPE) +#define G_FW_FILTER_WR_MATCHTYPE(x) \ + (((x) >> S_FW_FILTER_WR_MATCHTYPE) & M_FW_FILTER_WR_MATCHTYPE) + +#define S_FW_FILTER_WR_MATCHTYPEM 0 +#define M_FW_FILTER_WR_MATCHTYPEM 0x7 +#define V_FW_FILTER_WR_MATCHTYPEM(x) ((x) << S_FW_FILTER_WR_MATCHTYPEM) +#define G_FW_FILTER_WR_MATCHTYPEM(x) \ + (((x) >> S_FW_FILTER_WR_MATCHTYPEM) & M_FW_FILTER_WR_MATCHTYPEM) + +struct fw_ulptx_wr { + __be32 op_to_compl; + __be32 flowid_len16; + u64 cookie; +}; + +struct fw_tp_wr { + __be32 op_to_immdlen; + __be32 flowid_len16; + u64 cookie; +}; + +struct fw_eth_tx_pkt_wr { + __be32 op_immdlen; + __be32 equiq_to_len16; + __be64 r3; +}; + +struct fw_ofld_connection_wr { + __be32 op_compl; + __be32 len16_pkd; + __u64 cookie; + __be64 r2; + __be64 r3; + struct fw_ofld_connection_le { + __be32 version_cpl; + __be32 filter; + __be32 r1; + __be16 lport; + __be16 pport; + union fw_ofld_connection_leip { + struct fw_ofld_connection_le_ipv4 { + __be32 pip; + __be32 lip; + __be64 r0; + __be64 r1; + __be64 r2; + } ipv4; + struct fw_ofld_connection_le_ipv6 { + __be64 pip_hi; + __be64 pip_lo; + __be64 lip_hi; + __be64 lip_lo; + } ipv6; + } u; + } le; + struct fw_ofld_connection_tcb { + __be32 t_state_to_astid; + __be16 cplrxdataack_cplpassacceptrpl; + __be16 rcv_adv; + __be32 rcv_nxt; + __be32 tx_max; + __be64 opt0; + __be32 opt2; + __be32 r1; + __be64 r2; + __be64 r3; + } tcb; +}; + +#define S_FW_OFLD_CONNECTION_WR_VERSION 31 +#define M_FW_OFLD_CONNECTION_WR_VERSION 0x1 +#define V_FW_OFLD_CONNECTION_WR_VERSION(x) \ + ((x) << S_FW_OFLD_CONNECTION_WR_VERSION) +#define G_FW_OFLD_CONNECTION_WR_VERSION(x) \ + (((x) >> S_FW_OFLD_CONNECTION_WR_VERSION) & \ + M_FW_OFLD_CONNECTION_WR_VERSION) +#define F_FW_OFLD_CONNECTION_WR_VERSION \ + V_FW_OFLD_CONNECTION_WR_VERSION(1U) + +#define S_FW_OFLD_CONNECTION_WR_CPL 30 +#define M_FW_OFLD_CONNECTION_WR_CPL 0x1 +#define V_FW_OFLD_CONNECTION_WR_CPL(x) ((x) << S_FW_OFLD_CONNECTION_WR_CPL) +#define G_FW_OFLD_CONNECTION_WR_CPL(x) \ + (((x) >> S_FW_OFLD_CONNECTION_WR_CPL) & M_FW_OFLD_CONNECTION_WR_CPL) +#define F_FW_OFLD_CONNECTION_WR_CPL V_FW_OFLD_CONNECTION_WR_CPL(1U) + +#define S_FW_OFLD_CONNECTION_WR_T_STATE 28 +#define M_FW_OFLD_CONNECTION_WR_T_STATE 0xf +#define V_FW_OFLD_CONNECTION_WR_T_STATE(x) \ + ((x) << S_FW_OFLD_CONNECTION_WR_T_STATE) +#define G_FW_OFLD_CONNECTION_WR_T_STATE(x) \ + (((x) >> S_FW_OFLD_CONNECTION_WR_T_STATE) & \ + M_FW_OFLD_CONNECTION_WR_T_STATE) + +#define S_FW_OFLD_CONNECTION_WR_RCV_SCALE 24 +#define M_FW_OFLD_CONNECTION_WR_RCV_SCALE 0xf +#define V_FW_OFLD_CONNECTION_WR_RCV_SCALE(x) \ + ((x) << S_FW_OFLD_CONNECTION_WR_RCV_SCALE) +#define G_FW_OFLD_CONNECTION_WR_RCV_SCALE(x) \ + (((x) >> S_FW_OFLD_CONNECTION_WR_RCV_SCALE) & \ + M_FW_OFLD_CONNECTION_WR_RCV_SCALE) + +#define S_FW_OFLD_CONNECTION_WR_ASTID 0 +#define M_FW_OFLD_CONNECTION_WR_ASTID 0xffffff +#define V_FW_OFLD_CONNECTION_WR_ASTID(x) \ + ((x) << S_FW_OFLD_CONNECTION_WR_ASTID) +#define G_FW_OFLD_CONNECTION_WR_ASTID(x) \ + (((x) >> S_FW_OFLD_CONNECTION_WR_ASTID) & M_FW_OFLD_CONNECTION_WR_ASTID) + +#define S_FW_OFLD_CONNECTION_WR_CPLRXDATAACK 15 +#define M_FW_OFLD_CONNECTION_WR_CPLRXDATAACK 0x1 +#define V_FW_OFLD_CONNECTION_WR_CPLRXDATAACK(x) \ + ((x) << S_FW_OFLD_CONNECTION_WR_CPLRXDATAACK) +#define G_FW_OFLD_CONNECTION_WR_CPLRXDATAACK(x) \ + (((x) >> S_FW_OFLD_CONNECTION_WR_CPLRXDATAACK) & \ + M_FW_OFLD_CONNECTION_WR_CPLRXDATAACK) +#define F_FW_OFLD_CONNECTION_WR_CPLRXDATAACK \ + V_FW_OFLD_CONNECTION_WR_CPLRXDATAACK(1U) + +#define S_FW_OFLD_CONNECTION_WR_CPLPASSACCEPTRPL 14 +#define M_FW_OFLD_CONNECTION_WR_CPLPASSACCEPTRPL 0x1 +#define V_FW_OFLD_CONNECTION_WR_CPLPASSACCEPTRPL(x) \ + ((x) << S_FW_OFLD_CONNECTION_WR_CPLPASSACCEPTRPL) +#define G_FW_OFLD_CONNECTION_WR_CPLPASSACCEPTRPL(x) \ + (((x) >> S_FW_OFLD_CONNECTION_WR_CPLPASSACCEPTRPL) & \ + M_FW_OFLD_CONNECTION_WR_CPLPASSACCEPTRPL) +#define F_FW_OFLD_CONNECTION_WR_CPLPASSACCEPTRPL \ + V_FW_OFLD_CONNECTION_WR_CPLPASSACCEPTRPL(1U) + +enum fw_flowc_mnem { + FW_FLOWC_MNEM_PFNVFN, /* PFN [15:8] VFN [7:0] */ + FW_FLOWC_MNEM_CH, + FW_FLOWC_MNEM_PORT, + FW_FLOWC_MNEM_IQID, + FW_FLOWC_MNEM_SNDNXT, + FW_FLOWC_MNEM_RCVNXT, + FW_FLOWC_MNEM_SNDBUF, + FW_FLOWC_MNEM_MSS, +}; + +struct fw_flowc_mnemval { + u8 mnemonic; + u8 r4[3]; + __be32 val; +}; + +struct fw_flowc_wr { + __be32 op_to_nparams; +#define FW_FLOWC_WR_NPARAMS(x) ((x) << 0) + __be32 flowid_len16; + struct fw_flowc_mnemval mnemval[0]; +}; + +struct fw_ofld_tx_data_wr { + __be32 op_to_immdlen; + __be32 flowid_len16; + __be32 plen; + __be32 tunnel_to_proxy; +#define FW_OFLD_TX_DATA_WR_TUNNEL(x) ((x) << 19) +#define FW_OFLD_TX_DATA_WR_SAVE(x) ((x) << 18) +#define FW_OFLD_TX_DATA_WR_FLUSH(x) ((x) << 17) +#define FW_OFLD_TX_DATA_WR_URGENT(x) ((x) << 16) +#define FW_OFLD_TX_DATA_WR_MORE(x) ((x) << 15) +#define FW_OFLD_TX_DATA_WR_SHOVE(x) ((x) << 14) +#define FW_OFLD_TX_DATA_WR_ULPMODE(x) ((x) << 10) +#define FW_OFLD_TX_DATA_WR_ULPSUBMODE(x) ((x) << 6) +}; + +struct fw_cmd_wr { + __be32 op_dma; +#define FW_CMD_WR_DMA (1U << 17) + __be32 len16_pkd; + __be64 cookie_daddr; +}; + +struct fw_eth_tx_pkt_vm_wr { + __be32 op_immdlen; + __be32 equiq_to_len16; + __be32 r3[2]; + u8 ethmacdst[6]; + u8 ethmacsrc[6]; + __be16 ethtype; + __be16 vlantci; +}; + +#define FW_CMD_MAX_TIMEOUT 10000 + +/* + * If a host driver does a HELLO and discovers that there's already a MASTER + * selected, we may have to wait for that MASTER to finish issuing RESET, + * configuration and INITIALIZE commands. Also, there's a possibility that + * our own HELLO may get lost if it happens right as the MASTER is issuign a + * RESET command, so we need to be willing to make a few retries of our HELLO. + */ +#define FW_CMD_HELLO_TIMEOUT (3 * FW_CMD_MAX_TIMEOUT) +#define FW_CMD_HELLO_RETRIES 3 + + +enum fw_cmd_opcodes { + FW_LDST_CMD = 0x01, + FW_RESET_CMD = 0x03, + FW_HELLO_CMD = 0x04, + FW_BYE_CMD = 0x05, + FW_INITIALIZE_CMD = 0x06, + FW_CAPS_CONFIG_CMD = 0x07, + FW_PARAMS_CMD = 0x08, + FW_PFVF_CMD = 0x09, + FW_IQ_CMD = 0x10, + FW_EQ_MNGT_CMD = 0x11, + FW_EQ_ETH_CMD = 0x12, + FW_EQ_CTRL_CMD = 0x13, + FW_EQ_OFLD_CMD = 0x21, + FW_VI_CMD = 0x14, + FW_VI_MAC_CMD = 0x15, + FW_VI_RXMODE_CMD = 0x16, + FW_VI_ENABLE_CMD = 0x17, + FW_ACL_MAC_CMD = 0x18, + FW_ACL_VLAN_CMD = 0x19, + FW_VI_STATS_CMD = 0x1a, + FW_PORT_CMD = 0x1b, + FW_PORT_STATS_CMD = 0x1c, + FW_PORT_LB_STATS_CMD = 0x1d, + FW_PORT_TRACE_CMD = 0x1e, + FW_PORT_TRACE_MMAP_CMD = 0x1f, + FW_RSS_IND_TBL_CMD = 0x20, + FW_RSS_GLB_CONFIG_CMD = 0x22, + FW_RSS_VI_CONFIG_CMD = 0x23, + FW_CLIP_CMD = 0x28, + FW_LASTC2E_CMD = 0x40, + FW_ERROR_CMD = 0x80, + FW_DEBUG_CMD = 0x81, +}; + +enum fw_cmd_cap { + FW_CMD_CAP_PF = 0x01, + FW_CMD_CAP_DMAQ = 0x02, + FW_CMD_CAP_PORT = 0x04, + FW_CMD_CAP_PORTPROMISC = 0x08, + FW_CMD_CAP_PORTSTATS = 0x10, + FW_CMD_CAP_VF = 0x80, +}; + +/* + * Generic command header flit0 + */ +struct fw_cmd_hdr { + __be32 hi; + __be32 lo; +}; + +#define FW_CMD_OP(x) ((x) << 24) +#define FW_CMD_OP_GET(x) (((x) >> 24) & 0xff) +#define FW_CMD_REQUEST (1U << 23) +#define FW_CMD_REQUEST_GET(x) (((x) >> 23) & 0x1) +#define FW_CMD_READ (1U << 22) +#define FW_CMD_WRITE (1U << 21) +#define FW_CMD_EXEC (1U << 20) +#define FW_CMD_RAMASK(x) ((x) << 20) +#define FW_CMD_RETVAL(x) ((x) << 8) +#define FW_CMD_RETVAL_GET(x) (((x) >> 8) & 0xff) +#define FW_CMD_LEN16(x) ((x) << 0) +#define FW_LEN16(fw_struct) FW_CMD_LEN16(sizeof(fw_struct) / 16) + +enum fw_ldst_addrspc { + FW_LDST_ADDRSPC_FIRMWARE = 0x0001, + FW_LDST_ADDRSPC_SGE_EGRC = 0x0008, + FW_LDST_ADDRSPC_SGE_INGC = 0x0009, + FW_LDST_ADDRSPC_SGE_FLMC = 0x000a, + FW_LDST_ADDRSPC_SGE_CONMC = 0x000b, + FW_LDST_ADDRSPC_TP_PIO = 0x0010, + FW_LDST_ADDRSPC_TP_TM_PIO = 0x0011, + FW_LDST_ADDRSPC_TP_MIB = 0x0012, + FW_LDST_ADDRSPC_MDIO = 0x0018, + FW_LDST_ADDRSPC_MPS = 0x0020, + FW_LDST_ADDRSPC_FUNC = 0x0028, + FW_LDST_ADDRSPC_FUNC_PCIE = 0x0029, +}; + +enum fw_ldst_mps_fid { + FW_LDST_MPS_ATRB, + FW_LDST_MPS_RPLC +}; + +enum fw_ldst_func_access_ctl { + FW_LDST_FUNC_ACC_CTL_VIID, + FW_LDST_FUNC_ACC_CTL_FID +}; + +enum fw_ldst_func_mod_index { + FW_LDST_FUNC_MPS +}; + +struct fw_ldst_cmd { + __be32 op_to_addrspace; +#define FW_LDST_CMD_ADDRSPACE(x) ((x) << 0) + __be32 cycles_to_len16; + union fw_ldst { + struct fw_ldst_addrval { + __be32 addr; + __be32 val; + } addrval; + struct fw_ldst_idctxt { + __be32 physid; + __be32 msg_pkd; + __be32 ctxt_data7; + __be32 ctxt_data6; + __be32 ctxt_data5; + __be32 ctxt_data4; + __be32 ctxt_data3; + __be32 ctxt_data2; + __be32 ctxt_data1; + __be32 ctxt_data0; + } idctxt; + struct fw_ldst_mdio { + __be16 paddr_mmd; + __be16 raddr; + __be16 vctl; + __be16 rval; + } mdio; + struct fw_ldst_mps { + __be16 fid_ctl; + __be16 rplcpf_pkd; + __be32 rplc127_96; + __be32 rplc95_64; + __be32 rplc63_32; + __be32 rplc31_0; + __be32 atrb; + __be16 vlan[16]; + } mps; + struct fw_ldst_func { + u8 access_ctl; + u8 mod_index; + __be16 ctl_id; + __be32 offset; + __be64 data0; + __be64 data1; + } func; + struct fw_ldst_pcie { + u8 ctrl_to_fn; + u8 bnum; + u8 r; + u8 ext_r; + u8 select_naccess; + u8 pcie_fn; + __be16 nset_pkd; + __be32 data[12]; + } pcie; + } u; +}; + +#define FW_LDST_CMD_MSG(x) ((x) << 31) +#define FW_LDST_CMD_PADDR(x) ((x) << 8) +#define FW_LDST_CMD_MMD(x) ((x) << 0) +#define FW_LDST_CMD_FID(x) ((x) << 15) +#define FW_LDST_CMD_CTL(x) ((x) << 0) +#define FW_LDST_CMD_RPLCPF(x) ((x) << 0) +#define FW_LDST_CMD_LC (1U << 4) +#define FW_LDST_CMD_NACCESS(x) ((x) << 0) +#define FW_LDST_CMD_FN(x) ((x) << 0) + +struct fw_reset_cmd { + __be32 op_to_write; + __be32 retval_len16; + __be32 val; + __be32 halt_pkd; +}; + +#define FW_RESET_CMD_HALT_SHIFT 31 +#define FW_RESET_CMD_HALT_MASK 0x1 +#define FW_RESET_CMD_HALT(x) ((x) << FW_RESET_CMD_HALT_SHIFT) +#define FW_RESET_CMD_HALT_GET(x) \ + (((x) >> FW_RESET_CMD_HALT_SHIFT) & FW_RESET_CMD_HALT_MASK) + +enum fw_hellow_cmd { + fw_hello_cmd_stage_os = 0x0 +}; + +struct fw_hello_cmd { + __be32 op_to_write; + __be32 retval_len16; + __be32 err_to_clearinit; +#define FW_HELLO_CMD_ERR (1U << 31) +#define FW_HELLO_CMD_INIT (1U << 30) +#define FW_HELLO_CMD_MASTERDIS(x) ((x) << 29) +#define FW_HELLO_CMD_MASTERFORCE(x) ((x) << 28) +#define FW_HELLO_CMD_MBMASTER_MASK 0xfU +#define FW_HELLO_CMD_MBMASTER_SHIFT 24 +#define FW_HELLO_CMD_MBMASTER(x) ((x) << FW_HELLO_CMD_MBMASTER_SHIFT) +#define FW_HELLO_CMD_MBMASTER_GET(x) \ + (((x) >> FW_HELLO_CMD_MBMASTER_SHIFT) & FW_HELLO_CMD_MBMASTER_MASK) +#define FW_HELLO_CMD_MBASYNCNOTINT(x) ((x) << 23) +#define FW_HELLO_CMD_MBASYNCNOT(x) ((x) << 20) +#define FW_HELLO_CMD_STAGE(x) ((x) << 17) +#define FW_HELLO_CMD_CLEARINIT (1U << 16) + __be32 fwrev; +}; + +struct fw_bye_cmd { + __be32 op_to_write; + __be32 retval_len16; + __be64 r3; +}; + +struct fw_initialize_cmd { + __be32 op_to_write; + __be32 retval_len16; + __be64 r3; +}; + +enum fw_caps_config_hm { + FW_CAPS_CONFIG_HM_PCIE = 0x00000001, + FW_CAPS_CONFIG_HM_PL = 0x00000002, + FW_CAPS_CONFIG_HM_SGE = 0x00000004, + FW_CAPS_CONFIG_HM_CIM = 0x00000008, + FW_CAPS_CONFIG_HM_ULPTX = 0x00000010, + FW_CAPS_CONFIG_HM_TP = 0x00000020, + FW_CAPS_CONFIG_HM_ULPRX = 0x00000040, + FW_CAPS_CONFIG_HM_PMRX = 0x00000080, + FW_CAPS_CONFIG_HM_PMTX = 0x00000100, + FW_CAPS_CONFIG_HM_MC = 0x00000200, + FW_CAPS_CONFIG_HM_LE = 0x00000400, + FW_CAPS_CONFIG_HM_MPS = 0x00000800, + FW_CAPS_CONFIG_HM_XGMAC = 0x00001000, + FW_CAPS_CONFIG_HM_CPLSWITCH = 0x00002000, + FW_CAPS_CONFIG_HM_T4DBG = 0x00004000, + FW_CAPS_CONFIG_HM_MI = 0x00008000, + FW_CAPS_CONFIG_HM_I2CM = 0x00010000, + FW_CAPS_CONFIG_HM_NCSI = 0x00020000, + FW_CAPS_CONFIG_HM_SMB = 0x00040000, + FW_CAPS_CONFIG_HM_MA = 0x00080000, + FW_CAPS_CONFIG_HM_EDRAM = 0x00100000, + FW_CAPS_CONFIG_HM_PMU = 0x00200000, + FW_CAPS_CONFIG_HM_UART = 0x00400000, + FW_CAPS_CONFIG_HM_SF = 0x00800000, +}; + +enum fw_caps_config_nbm { + FW_CAPS_CONFIG_NBM_IPMI = 0x00000001, + FW_CAPS_CONFIG_NBM_NCSI = 0x00000002, +}; + +enum fw_caps_config_link { + FW_CAPS_CONFIG_LINK_PPP = 0x00000001, + FW_CAPS_CONFIG_LINK_QFC = 0x00000002, + FW_CAPS_CONFIG_LINK_DCBX = 0x00000004, +}; + +enum fw_caps_config_switch { + FW_CAPS_CONFIG_SWITCH_INGRESS = 0x00000001, + FW_CAPS_CONFIG_SWITCH_EGRESS = 0x00000002, +}; + +enum fw_caps_config_nic { + FW_CAPS_CONFIG_NIC = 0x00000001, + FW_CAPS_CONFIG_NIC_VM = 0x00000002, +}; + +enum fw_caps_config_ofld { + FW_CAPS_CONFIG_OFLD = 0x00000001, +}; + +enum fw_caps_config_rdma { + FW_CAPS_CONFIG_RDMA_RDDP = 0x00000001, + FW_CAPS_CONFIG_RDMA_RDMAC = 0x00000002, +}; + +enum fw_caps_config_iscsi { + FW_CAPS_CONFIG_ISCSI_INITIATOR_PDU = 0x00000001, + FW_CAPS_CONFIG_ISCSI_TARGET_PDU = 0x00000002, + FW_CAPS_CONFIG_ISCSI_INITIATOR_CNXOFLD = 0x00000004, + FW_CAPS_CONFIG_ISCSI_TARGET_CNXOFLD = 0x00000008, +}; + +enum fw_caps_config_fcoe { + FW_CAPS_CONFIG_FCOE_INITIATOR = 0x00000001, + FW_CAPS_CONFIG_FCOE_TARGET = 0x00000002, + FW_CAPS_CONFIG_FCOE_CTRL_OFLD = 0x00000004, +}; + +enum fw_memtype_cf { + FW_MEMTYPE_CF_EDC0 = 0x0, + FW_MEMTYPE_CF_EDC1 = 0x1, + FW_MEMTYPE_CF_EXTMEM = 0x2, + FW_MEMTYPE_CF_FLASH = 0x4, + FW_MEMTYPE_CF_INTERNAL = 0x5, +}; + +struct fw_caps_config_cmd { + __be32 op_to_write; + __be32 cfvalid_to_len16; + __be32 r2; + __be32 hwmbitmap; + __be16 nbmcaps; + __be16 linkcaps; + __be16 switchcaps; + __be16 r3; + __be16 niccaps; + __be16 ofldcaps; + __be16 rdmacaps; + __be16 r4; + __be16 iscsicaps; + __be16 fcoecaps; + __be32 cfcsum; + __be32 finiver; + __be32 finicsum; +}; + +#define FW_CAPS_CONFIG_CMD_CFVALID (1U << 27) +#define FW_CAPS_CONFIG_CMD_MEMTYPE_CF(x) ((x) << 24) +#define FW_CAPS_CONFIG_CMD_MEMADDR64K_CF(x) ((x) << 16) + +/* + * params command mnemonics + */ +enum fw_params_mnem { + FW_PARAMS_MNEM_DEV = 1, /* device params */ + FW_PARAMS_MNEM_PFVF = 2, /* function params */ + FW_PARAMS_MNEM_REG = 3, /* limited register access */ + FW_PARAMS_MNEM_DMAQ = 4, /* dma queue params */ + FW_PARAMS_MNEM_LAST +}; + +/* + * device parameters + */ +enum fw_params_param_dev { + FW_PARAMS_PARAM_DEV_CCLK = 0x00, /* chip core clock in khz */ + FW_PARAMS_PARAM_DEV_PORTVEC = 0x01, /* the port vector */ + FW_PARAMS_PARAM_DEV_NTID = 0x02, /* reads the number of TIDs + * allocated by the device's + * Lookup Engine + */ + FW_PARAMS_PARAM_DEV_FLOWC_BUFFIFO_SZ = 0x03, + FW_PARAMS_PARAM_DEV_INTVER_NIC = 0x04, + FW_PARAMS_PARAM_DEV_INTVER_VNIC = 0x05, + FW_PARAMS_PARAM_DEV_INTVER_OFLD = 0x06, + FW_PARAMS_PARAM_DEV_INTVER_RI = 0x07, + FW_PARAMS_PARAM_DEV_INTVER_ISCSIPDU = 0x08, + FW_PARAMS_PARAM_DEV_INTVER_ISCSI = 0x09, + FW_PARAMS_PARAM_DEV_INTVER_FCOE = 0x0A, + FW_PARAMS_PARAM_DEV_FWREV = 0x0B, + FW_PARAMS_PARAM_DEV_TPREV = 0x0C, + FW_PARAMS_PARAM_DEV_CF = 0x0D, + FW_PARAMS_PARAM_DEV_MAXORDIRD_QP = 0x13, /* max supported QP IRD/ORD */ + FW_PARAMS_PARAM_DEV_MAXIRD_ADAPTER = 0x14, /* max supported adap IRD */ + FW_PARAMS_PARAM_DEV_ULPTX_MEMWRITE_DSGL = 0x17, +}; + +/* + * physical and virtual function parameters + */ +enum fw_params_param_pfvf { + FW_PARAMS_PARAM_PFVF_RWXCAPS = 0x00, + FW_PARAMS_PARAM_PFVF_ROUTE_START = 0x01, + FW_PARAMS_PARAM_PFVF_ROUTE_END = 0x02, + FW_PARAMS_PARAM_PFVF_CLIP_START = 0x03, + FW_PARAMS_PARAM_PFVF_CLIP_END = 0x04, + FW_PARAMS_PARAM_PFVF_FILTER_START = 0x05, + FW_PARAMS_PARAM_PFVF_FILTER_END = 0x06, + FW_PARAMS_PARAM_PFVF_SERVER_START = 0x07, + FW_PARAMS_PARAM_PFVF_SERVER_END = 0x08, + FW_PARAMS_PARAM_PFVF_TDDP_START = 0x09, + FW_PARAMS_PARAM_PFVF_TDDP_END = 0x0A, + FW_PARAMS_PARAM_PFVF_ISCSI_START = 0x0B, + FW_PARAMS_PARAM_PFVF_ISCSI_END = 0x0C, + FW_PARAMS_PARAM_PFVF_STAG_START = 0x0D, + FW_PARAMS_PARAM_PFVF_STAG_END = 0x0E, + FW_PARAMS_PARAM_PFVF_RQ_START = 0x1F, + FW_PARAMS_PARAM_PFVF_RQ_END = 0x10, + FW_PARAMS_PARAM_PFVF_PBL_START = 0x11, + FW_PARAMS_PARAM_PFVF_PBL_END = 0x12, + FW_PARAMS_PARAM_PFVF_L2T_START = 0x13, + FW_PARAMS_PARAM_PFVF_L2T_END = 0x14, + FW_PARAMS_PARAM_PFVF_SQRQ_START = 0x15, + FW_PARAMS_PARAM_PFVF_SQRQ_END = 0x16, + FW_PARAMS_PARAM_PFVF_CQ_START = 0x17, + FW_PARAMS_PARAM_PFVF_CQ_END = 0x18, + FW_PARAMS_PARAM_PFVF_SCHEDCLASS_ETH = 0x20, + FW_PARAMS_PARAM_PFVF_VIID = 0x24, + FW_PARAMS_PARAM_PFVF_CPMASK = 0x25, + FW_PARAMS_PARAM_PFVF_OCQ_START = 0x26, + FW_PARAMS_PARAM_PFVF_OCQ_END = 0x27, + FW_PARAMS_PARAM_PFVF_CONM_MAP = 0x28, + FW_PARAMS_PARAM_PFVF_IQFLINT_START = 0x29, + FW_PARAMS_PARAM_PFVF_IQFLINT_END = 0x2A, + FW_PARAMS_PARAM_PFVF_EQ_START = 0x2B, + FW_PARAMS_PARAM_PFVF_EQ_END = 0x2C, + FW_PARAMS_PARAM_PFVF_ACTIVE_FILTER_START = 0x2D, + FW_PARAMS_PARAM_PFVF_ACTIVE_FILTER_END = 0x2E, + FW_PARAMS_PARAM_PFVF_ETHOFLD_END = 0x30, + FW_PARAMS_PARAM_PFVF_CPLFW4MSG_ENCAP = 0x31 +}; + +/* + * dma queue parameters + */ +enum fw_params_param_dmaq { + FW_PARAMS_PARAM_DMAQ_IQ_DCAEN_DCACPU = 0x00, + FW_PARAMS_PARAM_DMAQ_IQ_INTCNTTHRESH = 0x01, + FW_PARAMS_PARAM_DMAQ_EQ_CMPLIQID_MNGT = 0x10, + FW_PARAMS_PARAM_DMAQ_EQ_CMPLIQID_CTRL = 0x11, + FW_PARAMS_PARAM_DMAQ_EQ_SCHEDCLASS_ETH = 0x12, + FW_PARAMS_PARAM_DMAQ_EQ_DCBPRIO_ETH = 0x13, +}; + +#define FW_PARAMS_MNEM(x) ((x) << 24) +#define FW_PARAMS_PARAM_X(x) ((x) << 16) +#define FW_PARAMS_PARAM_Y_SHIFT 8 +#define FW_PARAMS_PARAM_Y_MASK 0xffU +#define FW_PARAMS_PARAM_Y(x) ((x) << FW_PARAMS_PARAM_Y_SHIFT) +#define FW_PARAMS_PARAM_Y_GET(x) (((x) >> FW_PARAMS_PARAM_Y_SHIFT) &\ + FW_PARAMS_PARAM_Y_MASK) +#define FW_PARAMS_PARAM_Z_SHIFT 0 +#define FW_PARAMS_PARAM_Z_MASK 0xffu +#define FW_PARAMS_PARAM_Z(x) ((x) << FW_PARAMS_PARAM_Z_SHIFT) +#define FW_PARAMS_PARAM_Z_GET(x) (((x) >> FW_PARAMS_PARAM_Z_SHIFT) &\ + FW_PARAMS_PARAM_Z_MASK) +#define FW_PARAMS_PARAM_XYZ(x) ((x) << 0) +#define FW_PARAMS_PARAM_YZ(x) ((x) << 0) + +struct fw_params_cmd { + __be32 op_to_vfn; + __be32 retval_len16; + struct fw_params_param { + __be32 mnem; + __be32 val; + } param[7]; +}; + +#define FW_PARAMS_CMD_PFN(x) ((x) << 8) +#define FW_PARAMS_CMD_VFN(x) ((x) << 0) + +struct fw_pfvf_cmd { + __be32 op_to_vfn; + __be32 retval_len16; + __be32 niqflint_niq; + __be32 type_to_neq; + __be32 tc_to_nexactf; + __be32 r_caps_to_nethctrl; + __be16 nricq; + __be16 nriqp; + __be32 r4; +}; + +#define FW_PFVF_CMD_PFN(x) ((x) << 8) +#define FW_PFVF_CMD_VFN(x) ((x) << 0) + +#define FW_PFVF_CMD_NIQFLINT(x) ((x) << 20) +#define FW_PFVF_CMD_NIQFLINT_GET(x) (((x) >> 20) & 0xfff) + +#define FW_PFVF_CMD_NIQ(x) ((x) << 0) +#define FW_PFVF_CMD_NIQ_GET(x) (((x) >> 0) & 0xfffff) + +#define FW_PFVF_CMD_TYPE (1 << 31) +#define FW_PFVF_CMD_TYPE_GET(x) (((x) >> 31) & 0x1) + +#define FW_PFVF_CMD_CMASK(x) ((x) << 24) +#define FW_PFVF_CMD_CMASK_MASK 0xf +#define FW_PFVF_CMD_CMASK_GET(x) (((x) >> 24) & FW_PFVF_CMD_CMASK_MASK) + +#define FW_PFVF_CMD_PMASK(x) ((x) << 20) +#define FW_PFVF_CMD_PMASK_MASK 0xf +#define FW_PFVF_CMD_PMASK_GET(x) (((x) >> 20) & FW_PFVF_CMD_PMASK_MASK) + +#define FW_PFVF_CMD_NEQ(x) ((x) << 0) +#define FW_PFVF_CMD_NEQ_GET(x) (((x) >> 0) & 0xfffff) + +#define FW_PFVF_CMD_TC(x) ((x) << 24) +#define FW_PFVF_CMD_TC_GET(x) (((x) >> 24) & 0xff) + +#define FW_PFVF_CMD_NVI(x) ((x) << 16) +#define FW_PFVF_CMD_NVI_GET(x) (((x) >> 16) & 0xff) + +#define FW_PFVF_CMD_NEXACTF(x) ((x) << 0) +#define FW_PFVF_CMD_NEXACTF_GET(x) (((x) >> 0) & 0xffff) + +#define FW_PFVF_CMD_R_CAPS(x) ((x) << 24) +#define FW_PFVF_CMD_R_CAPS_GET(x) (((x) >> 24) & 0xff) + +#define FW_PFVF_CMD_WX_CAPS(x) ((x) << 16) +#define FW_PFVF_CMD_WX_CAPS_GET(x) (((x) >> 16) & 0xff) + +#define FW_PFVF_CMD_NETHCTRL(x) ((x) << 0) +#define FW_PFVF_CMD_NETHCTRL_GET(x) (((x) >> 0) & 0xffff) + +enum fw_iq_type { + FW_IQ_TYPE_FL_INT_CAP, + FW_IQ_TYPE_NO_FL_INT_CAP +}; + +struct fw_iq_cmd { + __be32 op_to_vfn; + __be32 alloc_to_len16; + __be16 physiqid; + __be16 iqid; + __be16 fl0id; + __be16 fl1id; + __be32 type_to_iqandstindex; + __be16 iqdroprss_to_iqesize; + __be16 iqsize; + __be64 iqaddr; + __be32 iqns_to_fl0congen; + __be16 fl0dcaen_to_fl0cidxfthresh; + __be16 fl0size; + __be64 fl0addr; + __be32 fl1cngchmap_to_fl1congen; + __be16 fl1dcaen_to_fl1cidxfthresh; + __be16 fl1size; + __be64 fl1addr; +}; + +#define FW_IQ_CMD_PFN(x) ((x) << 8) +#define FW_IQ_CMD_VFN(x) ((x) << 0) + +#define FW_IQ_CMD_ALLOC (1U << 31) +#define FW_IQ_CMD_FREE (1U << 30) +#define FW_IQ_CMD_MODIFY (1U << 29) +#define FW_IQ_CMD_IQSTART(x) ((x) << 28) +#define FW_IQ_CMD_IQSTOP(x) ((x) << 27) + +#define FW_IQ_CMD_TYPE(x) ((x) << 29) +#define FW_IQ_CMD_IQASYNCH(x) ((x) << 28) +#define FW_IQ_CMD_VIID(x) ((x) << 16) +#define FW_IQ_CMD_IQANDST(x) ((x) << 15) +#define FW_IQ_CMD_IQANUS(x) ((x) << 14) +#define FW_IQ_CMD_IQANUD(x) ((x) << 12) +#define FW_IQ_CMD_IQANDSTINDEX(x) ((x) << 0) + +#define FW_IQ_CMD_IQDROPRSS (1U << 15) +#define FW_IQ_CMD_IQGTSMODE (1U << 14) +#define FW_IQ_CMD_IQPCIECH(x) ((x) << 12) +#define FW_IQ_CMD_IQDCAEN(x) ((x) << 11) +#define FW_IQ_CMD_IQDCACPU(x) ((x) << 6) +#define FW_IQ_CMD_IQINTCNTTHRESH(x) ((x) << 4) +#define FW_IQ_CMD_IQO (1U << 3) +#define FW_IQ_CMD_IQCPRIO(x) ((x) << 2) +#define FW_IQ_CMD_IQESIZE(x) ((x) << 0) + +#define FW_IQ_CMD_IQNS(x) ((x) << 31) +#define FW_IQ_CMD_IQRO(x) ((x) << 30) +#define FW_IQ_CMD_IQFLINTIQHSEN(x) ((x) << 28) +#define FW_IQ_CMD_IQFLINTCONGEN(x) ((x) << 27) +#define FW_IQ_CMD_IQFLINTISCSIC(x) ((x) << 26) +#define FW_IQ_CMD_FL0CNGCHMAP(x) ((x) << 20) +#define FW_IQ_CMD_FL0CACHELOCK(x) ((x) << 15) +#define FW_IQ_CMD_FL0DBP(x) ((x) << 14) +#define FW_IQ_CMD_FL0DATANS(x) ((x) << 13) +#define FW_IQ_CMD_FL0DATARO(x) ((x) << 12) +#define FW_IQ_CMD_FL0CONGCIF(x) ((x) << 11) +#define FW_IQ_CMD_FL0ONCHIP(x) ((x) << 10) +#define FW_IQ_CMD_FL0STATUSPGNS(x) ((x) << 9) +#define FW_IQ_CMD_FL0STATUSPGRO(x) ((x) << 8) +#define FW_IQ_CMD_FL0FETCHNS(x) ((x) << 7) +#define FW_IQ_CMD_FL0FETCHRO(x) ((x) << 6) +#define FW_IQ_CMD_FL0HOSTFCMODE(x) ((x) << 4) +#define FW_IQ_CMD_FL0CPRIO(x) ((x) << 3) +#define FW_IQ_CMD_FL0PADEN(x) ((x) << 2) +#define FW_IQ_CMD_FL0PACKEN(x) ((x) << 1) +#define FW_IQ_CMD_FL0CONGEN (1U << 0) + +#define FW_IQ_CMD_FL0DCAEN(x) ((x) << 15) +#define FW_IQ_CMD_FL0DCACPU(x) ((x) << 10) +#define FW_IQ_CMD_FL0FBMIN(x) ((x) << 7) +#define FW_IQ_CMD_FL0FBMAX(x) ((x) << 4) +#define FW_IQ_CMD_FL0CIDXFTHRESHO (1U << 3) +#define FW_IQ_CMD_FL0CIDXFTHRESH(x) ((x) << 0) + +#define FW_IQ_CMD_FL1CNGCHMAP(x) ((x) << 20) +#define FW_IQ_CMD_FL1CACHELOCK(x) ((x) << 15) +#define FW_IQ_CMD_FL1DBP(x) ((x) << 14) +#define FW_IQ_CMD_FL1DATANS(x) ((x) << 13) +#define FW_IQ_CMD_FL1DATARO(x) ((x) << 12) +#define FW_IQ_CMD_FL1CONGCIF(x) ((x) << 11) +#define FW_IQ_CMD_FL1ONCHIP(x) ((x) << 10) +#define FW_IQ_CMD_FL1STATUSPGNS(x) ((x) << 9) +#define FW_IQ_CMD_FL1STATUSPGRO(x) ((x) << 8) +#define FW_IQ_CMD_FL1FETCHNS(x) ((x) << 7) +#define FW_IQ_CMD_FL1FETCHRO(x) ((x) << 6) +#define FW_IQ_CMD_FL1HOSTFCMODE(x) ((x) << 4) +#define FW_IQ_CMD_FL1CPRIO(x) ((x) << 3) +#define FW_IQ_CMD_FL1PADEN (1U << 2) +#define FW_IQ_CMD_FL1PACKEN (1U << 1) +#define FW_IQ_CMD_FL1CONGEN (1U << 0) + +#define FW_IQ_CMD_FL1DCAEN(x) ((x) << 15) +#define FW_IQ_CMD_FL1DCACPU(x) ((x) << 10) +#define FW_IQ_CMD_FL1FBMIN(x) ((x) << 7) +#define FW_IQ_CMD_FL1FBMAX(x) ((x) << 4) +#define FW_IQ_CMD_FL1CIDXFTHRESHO (1U << 3) +#define FW_IQ_CMD_FL1CIDXFTHRESH(x) ((x) << 0) + +struct fw_eq_eth_cmd { + __be32 op_to_vfn; + __be32 alloc_to_len16; + __be32 eqid_pkd; + __be32 physeqid_pkd; + __be32 fetchszm_to_iqid; + __be32 dcaen_to_eqsize; + __be64 eqaddr; + __be32 viid_pkd; + __be32 r8_lo; + __be64 r9; +}; + +#define FW_EQ_ETH_CMD_PFN(x) ((x) << 8) +#define FW_EQ_ETH_CMD_VFN(x) ((x) << 0) +#define FW_EQ_ETH_CMD_ALLOC (1U << 31) +#define FW_EQ_ETH_CMD_FREE (1U << 30) +#define FW_EQ_ETH_CMD_MODIFY (1U << 29) +#define FW_EQ_ETH_CMD_EQSTART (1U << 28) +#define FW_EQ_ETH_CMD_EQSTOP (1U << 27) + +#define FW_EQ_ETH_CMD_EQID(x) ((x) << 0) +#define FW_EQ_ETH_CMD_EQID_GET(x) (((x) >> 0) & 0xfffff) +#define FW_EQ_ETH_CMD_PHYSEQID(x) ((x) << 0) +#define FW_EQ_ETH_CMD_PHYSEQID_GET(x) (((x) >> 0) & 0xfffff) + +#define FW_EQ_ETH_CMD_FETCHSZM(x) ((x) << 26) +#define FW_EQ_ETH_CMD_STATUSPGNS(x) ((x) << 25) +#define FW_EQ_ETH_CMD_STATUSPGRO(x) ((x) << 24) +#define FW_EQ_ETH_CMD_FETCHNS(x) ((x) << 23) +#define FW_EQ_ETH_CMD_FETCHRO(x) ((x) << 22) +#define FW_EQ_ETH_CMD_HOSTFCMODE(x) ((x) << 20) +#define FW_EQ_ETH_CMD_CPRIO(x) ((x) << 19) +#define FW_EQ_ETH_CMD_ONCHIP(x) ((x) << 18) +#define FW_EQ_ETH_CMD_PCIECHN(x) ((x) << 16) +#define FW_EQ_ETH_CMD_IQID(x) ((x) << 0) + +#define FW_EQ_ETH_CMD_DCAEN(x) ((x) << 31) +#define FW_EQ_ETH_CMD_DCACPU(x) ((x) << 26) +#define FW_EQ_ETH_CMD_FBMIN(x) ((x) << 23) +#define FW_EQ_ETH_CMD_FBMAX(x) ((x) << 20) +#define FW_EQ_ETH_CMD_CIDXFTHRESHO(x) ((x) << 19) +#define FW_EQ_ETH_CMD_CIDXFTHRESH(x) ((x) << 16) +#define FW_EQ_ETH_CMD_EQSIZE(x) ((x) << 0) + +#define FW_EQ_ETH_CMD_AUTOEQUEQE (1U << 30) +#define FW_EQ_ETH_CMD_VIID(x) ((x) << 16) + +struct fw_eq_ctrl_cmd { + __be32 op_to_vfn; + __be32 alloc_to_len16; + __be32 cmpliqid_eqid; + __be32 physeqid_pkd; + __be32 fetchszm_to_iqid; + __be32 dcaen_to_eqsize; + __be64 eqaddr; +}; + +#define FW_EQ_CTRL_CMD_PFN(x) ((x) << 8) +#define FW_EQ_CTRL_CMD_VFN(x) ((x) << 0) + +#define FW_EQ_CTRL_CMD_ALLOC (1U << 31) +#define FW_EQ_CTRL_CMD_FREE (1U << 30) +#define FW_EQ_CTRL_CMD_MODIFY (1U << 29) +#define FW_EQ_CTRL_CMD_EQSTART (1U << 28) +#define FW_EQ_CTRL_CMD_EQSTOP (1U << 27) + +#define FW_EQ_CTRL_CMD_CMPLIQID(x) ((x) << 20) +#define FW_EQ_CTRL_CMD_EQID(x) ((x) << 0) +#define FW_EQ_CTRL_CMD_EQID_GET(x) (((x) >> 0) & 0xfffff) +#define FW_EQ_CTRL_CMD_PHYSEQID_GET(x) (((x) >> 0) & 0xfffff) + +#define FW_EQ_CTRL_CMD_FETCHSZM (1U << 26) +#define FW_EQ_CTRL_CMD_STATUSPGNS (1U << 25) +#define FW_EQ_CTRL_CMD_STATUSPGRO (1U << 24) +#define FW_EQ_CTRL_CMD_FETCHNS (1U << 23) +#define FW_EQ_CTRL_CMD_FETCHRO (1U << 22) +#define FW_EQ_CTRL_CMD_HOSTFCMODE(x) ((x) << 20) +#define FW_EQ_CTRL_CMD_CPRIO(x) ((x) << 19) +#define FW_EQ_CTRL_CMD_ONCHIP(x) ((x) << 18) +#define FW_EQ_CTRL_CMD_PCIECHN(x) ((x) << 16) +#define FW_EQ_CTRL_CMD_IQID(x) ((x) << 0) + +#define FW_EQ_CTRL_CMD_DCAEN(x) ((x) << 31) +#define FW_EQ_CTRL_CMD_DCACPU(x) ((x) << 26) +#define FW_EQ_CTRL_CMD_FBMIN(x) ((x) << 23) +#define FW_EQ_CTRL_CMD_FBMAX(x) ((x) << 20) +#define FW_EQ_CTRL_CMD_CIDXFTHRESHO(x) ((x) << 19) +#define FW_EQ_CTRL_CMD_CIDXFTHRESH(x) ((x) << 16) +#define FW_EQ_CTRL_CMD_EQSIZE(x) ((x) << 0) + +struct fw_eq_ofld_cmd { + __be32 op_to_vfn; + __be32 alloc_to_len16; + __be32 eqid_pkd; + __be32 physeqid_pkd; + __be32 fetchszm_to_iqid; + __be32 dcaen_to_eqsize; + __be64 eqaddr; +}; + +#define FW_EQ_OFLD_CMD_PFN(x) ((x) << 8) +#define FW_EQ_OFLD_CMD_VFN(x) ((x) << 0) + +#define FW_EQ_OFLD_CMD_ALLOC (1U << 31) +#define FW_EQ_OFLD_CMD_FREE (1U << 30) +#define FW_EQ_OFLD_CMD_MODIFY (1U << 29) +#define FW_EQ_OFLD_CMD_EQSTART (1U << 28) +#define FW_EQ_OFLD_CMD_EQSTOP (1U << 27) + +#define FW_EQ_OFLD_CMD_EQID(x) ((x) << 0) +#define FW_EQ_OFLD_CMD_EQID_GET(x) (((x) >> 0) & 0xfffff) +#define FW_EQ_OFLD_CMD_PHYSEQID_GET(x) (((x) >> 0) & 0xfffff) + +#define FW_EQ_OFLD_CMD_FETCHSZM(x) ((x) << 26) +#define FW_EQ_OFLD_CMD_STATUSPGNS(x) ((x) << 25) +#define FW_EQ_OFLD_CMD_STATUSPGRO(x) ((x) << 24) +#define FW_EQ_OFLD_CMD_FETCHNS(x) ((x) << 23) +#define FW_EQ_OFLD_CMD_FETCHRO(x) ((x) << 22) +#define FW_EQ_OFLD_CMD_HOSTFCMODE(x) ((x) << 20) +#define FW_EQ_OFLD_CMD_CPRIO(x) ((x) << 19) +#define FW_EQ_OFLD_CMD_ONCHIP(x) ((x) << 18) +#define FW_EQ_OFLD_CMD_PCIECHN(x) ((x) << 16) +#define FW_EQ_OFLD_CMD_IQID(x) ((x) << 0) + +#define FW_EQ_OFLD_CMD_DCAEN(x) ((x) << 31) +#define FW_EQ_OFLD_CMD_DCACPU(x) ((x) << 26) +#define FW_EQ_OFLD_CMD_FBMIN(x) ((x) << 23) +#define FW_EQ_OFLD_CMD_FBMAX(x) ((x) << 20) +#define FW_EQ_OFLD_CMD_CIDXFTHRESHO(x) ((x) << 19) +#define FW_EQ_OFLD_CMD_CIDXFTHRESH(x) ((x) << 16) +#define FW_EQ_OFLD_CMD_EQSIZE(x) ((x) << 0) + +/* + * Macros for VIID parsing: + * VIID - [10:8] PFN, [7] VI Valid, [6:0] VI number + */ +#define FW_VIID_PFN_GET(x) (((x) >> 8) & 0x7) +#define FW_VIID_VIVLD_GET(x) (((x) >> 7) & 0x1) +#define FW_VIID_VIN_GET(x) (((x) >> 0) & 0x7F) + +struct fw_vi_cmd { + __be32 op_to_vfn; + __be32 alloc_to_len16; + __be16 type_viid; + u8 mac[6]; + u8 portid_pkd; + u8 nmac; + u8 nmac0[6]; + __be16 rsssize_pkd; + u8 nmac1[6]; + __be16 idsiiq_pkd; + u8 nmac2[6]; + __be16 idseiq_pkd; + u8 nmac3[6]; + __be64 r9; + __be64 r10; +}; + +#define FW_VI_CMD_PFN(x) ((x) << 8) +#define FW_VI_CMD_VFN(x) ((x) << 0) +#define FW_VI_CMD_ALLOC (1U << 31) +#define FW_VI_CMD_FREE (1U << 30) +#define FW_VI_CMD_VIID(x) ((x) << 0) +#define FW_VI_CMD_VIID_GET(x) ((x) & 0xfff) +#define FW_VI_CMD_PORTID(x) ((x) << 4) +#define FW_VI_CMD_PORTID_GET(x) (((x) >> 4) & 0xf) +#define FW_VI_CMD_RSSSIZE_GET(x) (((x) >> 0) & 0x7ff) + +/* Special VI_MAC command index ids */ +#define FW_VI_MAC_ADD_MAC 0x3FF +#define FW_VI_MAC_ADD_PERSIST_MAC 0x3FE +#define FW_VI_MAC_MAC_BASED_FREE 0x3FD +#define FW_CLS_TCAM_NUM_ENTRIES 336 + +enum fw_vi_mac_smac { + FW_VI_MAC_MPS_TCAM_ENTRY, + FW_VI_MAC_MPS_TCAM_ONLY, + FW_VI_MAC_SMT_ONLY, + FW_VI_MAC_SMT_AND_MPSTCAM +}; + +enum fw_vi_mac_result { + FW_VI_MAC_R_SUCCESS, + FW_VI_MAC_R_F_NONEXISTENT_NOMEM, + FW_VI_MAC_R_SMAC_FAIL, + FW_VI_MAC_R_F_ACL_CHECK +}; + +struct fw_vi_mac_cmd { + __be32 op_to_viid; + __be32 freemacs_to_len16; + union fw_vi_mac { + struct fw_vi_mac_exact { + __be16 valid_to_idx; + u8 macaddr[6]; + } exact[7]; + struct fw_vi_mac_hash { + __be64 hashvec; + } hash; + } u; +}; + +#define FW_VI_MAC_CMD_VIID(x) ((x) << 0) +#define FW_VI_MAC_CMD_FREEMACS(x) ((x) << 31) +#define FW_VI_MAC_CMD_HASHVECEN (1U << 23) +#define FW_VI_MAC_CMD_HASHUNIEN(x) ((x) << 22) +#define FW_VI_MAC_CMD_VALID (1U << 15) +#define FW_VI_MAC_CMD_PRIO(x) ((x) << 12) +#define FW_VI_MAC_CMD_SMAC_RESULT(x) ((x) << 10) +#define FW_VI_MAC_CMD_SMAC_RESULT_GET(x) (((x) >> 10) & 0x3) +#define FW_VI_MAC_CMD_IDX(x) ((x) << 0) +#define FW_VI_MAC_CMD_IDX_GET(x) (((x) >> 0) & 0x3ff) + +#define FW_RXMODE_MTU_NO_CHG 65535 + +struct fw_vi_rxmode_cmd { + __be32 op_to_viid; + __be32 retval_len16; + __be32 mtu_to_vlanexen; + __be32 r4_lo; +}; + +#define FW_VI_RXMODE_CMD_VIID(x) ((x) << 0) +#define FW_VI_RXMODE_CMD_MTU_MASK 0xffff +#define FW_VI_RXMODE_CMD_MTU(x) ((x) << 16) +#define FW_VI_RXMODE_CMD_PROMISCEN_MASK 0x3 +#define FW_VI_RXMODE_CMD_PROMISCEN(x) ((x) << 14) +#define FW_VI_RXMODE_CMD_ALLMULTIEN_MASK 0x3 +#define FW_VI_RXMODE_CMD_ALLMULTIEN(x) ((x) << 12) +#define FW_VI_RXMODE_CMD_BROADCASTEN_MASK 0x3 +#define FW_VI_RXMODE_CMD_BROADCASTEN(x) ((x) << 10) +#define FW_VI_RXMODE_CMD_VLANEXEN_MASK 0x3 +#define FW_VI_RXMODE_CMD_VLANEXEN(x) ((x) << 8) + +struct fw_vi_enable_cmd { + __be32 op_to_viid; + __be32 ien_to_len16; + __be16 blinkdur; + __be16 r3; + __be32 r4; +}; + +#define FW_VI_ENABLE_CMD_VIID(x) ((x) << 0) +#define FW_VI_ENABLE_CMD_IEN(x) ((x) << 31) +#define FW_VI_ENABLE_CMD_EEN(x) ((x) << 30) +#define FW_VI_ENABLE_CMD_DCB_INFO(x) ((x) << 28) +#define FW_VI_ENABLE_CMD_LED (1U << 29) + +/* VI VF stats offset definitions */ +#define VI_VF_NUM_STATS 16 +enum fw_vi_stats_vf_index { + FW_VI_VF_STAT_TX_BCAST_BYTES_IX, + FW_VI_VF_STAT_TX_BCAST_FRAMES_IX, + FW_VI_VF_STAT_TX_MCAST_BYTES_IX, + FW_VI_VF_STAT_TX_MCAST_FRAMES_IX, + FW_VI_VF_STAT_TX_UCAST_BYTES_IX, + FW_VI_VF_STAT_TX_UCAST_FRAMES_IX, + FW_VI_VF_STAT_TX_DROP_FRAMES_IX, + FW_VI_VF_STAT_TX_OFLD_BYTES_IX, + FW_VI_VF_STAT_TX_OFLD_FRAMES_IX, + FW_VI_VF_STAT_RX_BCAST_BYTES_IX, + FW_VI_VF_STAT_RX_BCAST_FRAMES_IX, + FW_VI_VF_STAT_RX_MCAST_BYTES_IX, + FW_VI_VF_STAT_RX_MCAST_FRAMES_IX, + FW_VI_VF_STAT_RX_UCAST_BYTES_IX, + FW_VI_VF_STAT_RX_UCAST_FRAMES_IX, + FW_VI_VF_STAT_RX_ERR_FRAMES_IX +}; + +/* VI PF stats offset definitions */ +#define VI_PF_NUM_STATS 17 +enum fw_vi_stats_pf_index { + FW_VI_PF_STAT_TX_BCAST_BYTES_IX, + FW_VI_PF_STAT_TX_BCAST_FRAMES_IX, + FW_VI_PF_STAT_TX_MCAST_BYTES_IX, + FW_VI_PF_STAT_TX_MCAST_FRAMES_IX, + FW_VI_PF_STAT_TX_UCAST_BYTES_IX, + FW_VI_PF_STAT_TX_UCAST_FRAMES_IX, + FW_VI_PF_STAT_TX_OFLD_BYTES_IX, + FW_VI_PF_STAT_TX_OFLD_FRAMES_IX, + FW_VI_PF_STAT_RX_BYTES_IX, + FW_VI_PF_STAT_RX_FRAMES_IX, + FW_VI_PF_STAT_RX_BCAST_BYTES_IX, + FW_VI_PF_STAT_RX_BCAST_FRAMES_IX, + FW_VI_PF_STAT_RX_MCAST_BYTES_IX, + FW_VI_PF_STAT_RX_MCAST_FRAMES_IX, + FW_VI_PF_STAT_RX_UCAST_BYTES_IX, + FW_VI_PF_STAT_RX_UCAST_FRAMES_IX, + FW_VI_PF_STAT_RX_ERR_FRAMES_IX +}; + +struct fw_vi_stats_cmd { + __be32 op_to_viid; + __be32 retval_len16; + union fw_vi_stats { + struct fw_vi_stats_ctl { + __be16 nstats_ix; + __be16 r6; + __be32 r7; + __be64 stat0; + __be64 stat1; + __be64 stat2; + __be64 stat3; + __be64 stat4; + __be64 stat5; + } ctl; + struct fw_vi_stats_pf { + __be64 tx_bcast_bytes; + __be64 tx_bcast_frames; + __be64 tx_mcast_bytes; + __be64 tx_mcast_frames; + __be64 tx_ucast_bytes; + __be64 tx_ucast_frames; + __be64 tx_offload_bytes; + __be64 tx_offload_frames; + __be64 rx_pf_bytes; + __be64 rx_pf_frames; + __be64 rx_bcast_bytes; + __be64 rx_bcast_frames; + __be64 rx_mcast_bytes; + __be64 rx_mcast_frames; + __be64 rx_ucast_bytes; + __be64 rx_ucast_frames; + __be64 rx_err_frames; + } pf; + struct fw_vi_stats_vf { + __be64 tx_bcast_bytes; + __be64 tx_bcast_frames; + __be64 tx_mcast_bytes; + __be64 tx_mcast_frames; + __be64 tx_ucast_bytes; + __be64 tx_ucast_frames; + __be64 tx_drop_frames; + __be64 tx_offload_bytes; + __be64 tx_offload_frames; + __be64 rx_bcast_bytes; + __be64 rx_bcast_frames; + __be64 rx_mcast_bytes; + __be64 rx_mcast_frames; + __be64 rx_ucast_bytes; + __be64 rx_ucast_frames; + __be64 rx_err_frames; + } vf; + } u; +}; + +#define FW_VI_STATS_CMD_VIID(x) ((x) << 0) +#define FW_VI_STATS_CMD_NSTATS(x) ((x) << 12) +#define FW_VI_STATS_CMD_IX(x) ((x) << 0) + +struct fw_acl_mac_cmd { + __be32 op_to_vfn; + __be32 en_to_len16; + u8 nmac; + u8 r3[7]; + __be16 r4; + u8 macaddr0[6]; + __be16 r5; + u8 macaddr1[6]; + __be16 r6; + u8 macaddr2[6]; + __be16 r7; + u8 macaddr3[6]; +}; + +#define FW_ACL_MAC_CMD_PFN(x) ((x) << 8) +#define FW_ACL_MAC_CMD_VFN(x) ((x) << 0) +#define FW_ACL_MAC_CMD_EN(x) ((x) << 31) + +struct fw_acl_vlan_cmd { + __be32 op_to_vfn; + __be32 en_to_len16; + u8 nvlan; + u8 dropnovlan_fm; + u8 r3_lo[6]; + __be16 vlanid[16]; +}; + +#define FW_ACL_VLAN_CMD_PFN(x) ((x) << 8) +#define FW_ACL_VLAN_CMD_VFN(x) ((x) << 0) +#define FW_ACL_VLAN_CMD_EN(x) ((x) << 31) +#define FW_ACL_VLAN_CMD_DROPNOVLAN(x) ((x) << 7) +#define FW_ACL_VLAN_CMD_FM(x) ((x) << 6) + +enum fw_port_cap { + FW_PORT_CAP_SPEED_100M = 0x0001, + FW_PORT_CAP_SPEED_1G = 0x0002, + FW_PORT_CAP_SPEED_2_5G = 0x0004, + FW_PORT_CAP_SPEED_10G = 0x0008, + FW_PORT_CAP_SPEED_40G = 0x0010, + FW_PORT_CAP_SPEED_100G = 0x0020, + FW_PORT_CAP_FC_RX = 0x0040, + FW_PORT_CAP_FC_TX = 0x0080, + FW_PORT_CAP_ANEG = 0x0100, + FW_PORT_CAP_MDI_0 = 0x0200, + FW_PORT_CAP_MDI_1 = 0x0400, + FW_PORT_CAP_BEAN = 0x0800, + FW_PORT_CAP_PMA_LPBK = 0x1000, + FW_PORT_CAP_PCS_LPBK = 0x2000, + FW_PORT_CAP_PHYXS_LPBK = 0x4000, + FW_PORT_CAP_FAR_END_LPBK = 0x8000, +}; + +enum fw_port_mdi { + FW_PORT_MDI_UNCHANGED, + FW_PORT_MDI_AUTO, + FW_PORT_MDI_F_STRAIGHT, + FW_PORT_MDI_F_CROSSOVER +}; + +#define FW_PORT_MDI(x) ((x) << 9) + +enum fw_port_action { + FW_PORT_ACTION_L1_CFG = 0x0001, + FW_PORT_ACTION_L2_CFG = 0x0002, + FW_PORT_ACTION_GET_PORT_INFO = 0x0003, + FW_PORT_ACTION_L2_PPP_CFG = 0x0004, + FW_PORT_ACTION_L2_DCB_CFG = 0x0005, + FW_PORT_ACTION_DCB_READ_TRANS = 0x0006, + FW_PORT_ACTION_DCB_READ_RECV = 0x0007, + FW_PORT_ACTION_DCB_READ_DET = 0x0008, + FW_PORT_ACTION_LOW_PWR_TO_NORMAL = 0x0010, + FW_PORT_ACTION_L1_LOW_PWR_EN = 0x0011, + FW_PORT_ACTION_L2_WOL_MODE_EN = 0x0012, + FW_PORT_ACTION_LPBK_TO_NORMAL = 0x0020, + FW_PORT_ACTION_L1_LPBK = 0x0021, + FW_PORT_ACTION_L1_PMA_LPBK = 0x0022, + FW_PORT_ACTION_L1_PCS_LPBK = 0x0023, + FW_PORT_ACTION_L1_PHYXS_CSIDE_LPBK = 0x0024, + FW_PORT_ACTION_L1_PHYXS_ESIDE_LPBK = 0x0025, + FW_PORT_ACTION_PHY_RESET = 0x0040, + FW_PORT_ACTION_PMA_RESET = 0x0041, + FW_PORT_ACTION_PCS_RESET = 0x0042, + FW_PORT_ACTION_PHYXS_RESET = 0x0043, + FW_PORT_ACTION_DTEXS_REEST = 0x0044, + FW_PORT_ACTION_AN_RESET = 0x0045 +}; + +enum fw_port_l2cfg_ctlbf { + FW_PORT_L2_CTLBF_OVLAN0 = 0x01, + FW_PORT_L2_CTLBF_OVLAN1 = 0x02, + FW_PORT_L2_CTLBF_OVLAN2 = 0x04, + FW_PORT_L2_CTLBF_OVLAN3 = 0x08, + FW_PORT_L2_CTLBF_IVLAN = 0x10, + FW_PORT_L2_CTLBF_TXIPG = 0x20 +}; + +enum fw_port_dcb_versions { + FW_PORT_DCB_VER_UNKNOWN, + FW_PORT_DCB_VER_CEE1D0, + FW_PORT_DCB_VER_CEE1D01, + FW_PORT_DCB_VER_IEEE, + FW_PORT_DCB_VER_AUTO = 7 +}; + +enum fw_port_dcb_cfg { + FW_PORT_DCB_CFG_PG = 0x01, + FW_PORT_DCB_CFG_PFC = 0x02, + FW_PORT_DCB_CFG_APPL = 0x04 +}; + +enum fw_port_dcb_cfg_rc { + FW_PORT_DCB_CFG_SUCCESS = 0x0, + FW_PORT_DCB_CFG_ERROR = 0x1 +}; + +enum fw_port_dcb_type { + FW_PORT_DCB_TYPE_PGID = 0x00, + FW_PORT_DCB_TYPE_PGRATE = 0x01, + FW_PORT_DCB_TYPE_PRIORATE = 0x02, + FW_PORT_DCB_TYPE_PFC = 0x03, + FW_PORT_DCB_TYPE_APP_ID = 0x04, + FW_PORT_DCB_TYPE_CONTROL = 0x05, +}; + +enum fw_port_dcb_feature_state { + FW_PORT_DCB_FEATURE_STATE_PENDING = 0x0, + FW_PORT_DCB_FEATURE_STATE_SUCCESS = 0x1, + FW_PORT_DCB_FEATURE_STATE_ERROR = 0x2, + FW_PORT_DCB_FEATURE_STATE_TIMEOUT = 0x3, +}; + +struct fw_port_cmd { + __be32 op_to_portid; + __be32 action_to_len16; + union fw_port { + struct fw_port_l1cfg { + __be32 rcap; + __be32 r; + } l1cfg; + struct fw_port_l2cfg { + __u8 ctlbf; + __u8 ovlan3_to_ivlan0; + __be16 ivlantype; + __be16 txipg_force_pinfo; + __be16 mtu; + __be16 ovlan0mask; + __be16 ovlan0type; + __be16 ovlan1mask; + __be16 ovlan1type; + __be16 ovlan2mask; + __be16 ovlan2type; + __be16 ovlan3mask; + __be16 ovlan3type; + } l2cfg; + struct fw_port_info { + __be32 lstatus_to_modtype; + __be16 pcap; + __be16 acap; + __be16 mtu; + __u8 cbllen; + __u8 auxlinfo; + __u8 dcbxdis_pkd; + __u8 r8_lo[3]; + __be64 r9; + } info; + struct fw_port_diags { + __u8 diagop; + __u8 r[3]; + __be32 diagval; + } diags; + union fw_port_dcb { + struct fw_port_dcb_pgid { + __u8 type; + __u8 apply_pkd; + __u8 r10_lo[2]; + __be32 pgid; + __be64 r11; + } pgid; + struct fw_port_dcb_pgrate { + __u8 type; + __u8 apply_pkd; + __u8 r10_lo[5]; + __u8 num_tcs_supported; + __u8 pgrate[8]; + __u8 tsa[8]; + } pgrate; + struct fw_port_dcb_priorate { + __u8 type; + __u8 apply_pkd; + __u8 r10_lo[6]; + __u8 strict_priorate[8]; + } priorate; + struct fw_port_dcb_pfc { + __u8 type; + __u8 pfcen; + __u8 r10[5]; + __u8 max_pfc_tcs; + __be64 r11; + } pfc; + struct fw_port_app_priority { + __u8 type; + __u8 r10[2]; + __u8 idx; + __u8 user_prio_map; + __u8 sel_field; + __be16 protocolid; + __be64 r12; + } app_priority; + struct fw_port_dcb_control { + __u8 type; + __u8 all_syncd_pkd; + __be16 dcb_version_to_app_state; + __be32 r11; + __be64 r12; + } control; + } dcb; + } u; +}; + +#define FW_PORT_CMD_READ (1U << 22) + +#define FW_PORT_CMD_PORTID(x) ((x) << 0) +#define FW_PORT_CMD_PORTID_GET(x) (((x) >> 0) & 0xf) + +#define FW_PORT_CMD_ACTION(x) ((x) << 16) +#define FW_PORT_CMD_ACTION_GET(x) (((x) >> 16) & 0xffff) + +#define FW_PORT_CMD_CTLBF(x) ((x) << 10) +#define FW_PORT_CMD_OVLAN3(x) ((x) << 7) +#define FW_PORT_CMD_OVLAN2(x) ((x) << 6) +#define FW_PORT_CMD_OVLAN1(x) ((x) << 5) +#define FW_PORT_CMD_OVLAN0(x) ((x) << 4) +#define FW_PORT_CMD_IVLAN0(x) ((x) << 3) + +#define FW_PORT_CMD_TXIPG(x) ((x) << 19) + +#define FW_PORT_CMD_LSTATUS (1U << 31) +#define FW_PORT_CMD_LSTATUS_GET(x) (((x) >> 31) & 0x1) +#define FW_PORT_CMD_LSPEED(x) ((x) << 24) +#define FW_PORT_CMD_LSPEED_GET(x) (((x) >> 24) & 0x3f) +#define FW_PORT_CMD_TXPAUSE (1U << 23) +#define FW_PORT_CMD_RXPAUSE (1U << 22) +#define FW_PORT_CMD_MDIOCAP (1U << 21) +#define FW_PORT_CMD_MDIOADDR_GET(x) (((x) >> 16) & 0x1f) +#define FW_PORT_CMD_LPTXPAUSE (1U << 15) +#define FW_PORT_CMD_LPRXPAUSE (1U << 14) +#define FW_PORT_CMD_PTYPE_MASK 0x1f +#define FW_PORT_CMD_PTYPE_GET(x) (((x) >> 8) & FW_PORT_CMD_PTYPE_MASK) +#define FW_PORT_CMD_MODTYPE_MASK 0x1f +#define FW_PORT_CMD_MODTYPE_GET(x) (((x) >> 0) & FW_PORT_CMD_MODTYPE_MASK) + +#define FW_PORT_CMD_DCBXDIS (1U << 7) +#define FW_PORT_CMD_APPLY (1U << 7) +#define FW_PORT_CMD_ALL_SYNCD (1U << 7) +#define FW_PORT_CMD_DCB_VERSION_GET(x) (((x) >> 8) & 0xf) + +#define FW_PORT_CMD_PPPEN(x) ((x) << 31) +#define FW_PORT_CMD_TPSRC(x) ((x) << 28) +#define FW_PORT_CMD_NCSISRC(x) ((x) << 24) + +#define FW_PORT_CMD_CH0(x) ((x) << 20) +#define FW_PORT_CMD_CH1(x) ((x) << 16) +#define FW_PORT_CMD_CH2(x) ((x) << 12) +#define FW_PORT_CMD_CH3(x) ((x) << 8) +#define FW_PORT_CMD_NCSICH(x) ((x) << 4) + +enum fw_port_type { + FW_PORT_TYPE_FIBER_XFI, + FW_PORT_TYPE_FIBER_XAUI, + FW_PORT_TYPE_BT_SGMII, + FW_PORT_TYPE_BT_XFI, + FW_PORT_TYPE_BT_XAUI, + FW_PORT_TYPE_KX4, + FW_PORT_TYPE_CX4, + FW_PORT_TYPE_KX, + FW_PORT_TYPE_KR, + FW_PORT_TYPE_SFP, + FW_PORT_TYPE_BP_AP, + FW_PORT_TYPE_BP4_AP, + FW_PORT_TYPE_QSFP_10G, + FW_PORT_TYPE_QSFP, + FW_PORT_TYPE_BP40_BA, + + FW_PORT_TYPE_NONE = FW_PORT_CMD_PTYPE_MASK +}; + +enum fw_port_module_type { + FW_PORT_MOD_TYPE_NA, + FW_PORT_MOD_TYPE_LR, + FW_PORT_MOD_TYPE_SR, + FW_PORT_MOD_TYPE_ER, + FW_PORT_MOD_TYPE_TWINAX_PASSIVE, + FW_PORT_MOD_TYPE_TWINAX_ACTIVE, + FW_PORT_MOD_TYPE_LRM, + FW_PORT_MOD_TYPE_ERROR = FW_PORT_CMD_MODTYPE_MASK - 3, + FW_PORT_MOD_TYPE_UNKNOWN = FW_PORT_CMD_MODTYPE_MASK - 2, + FW_PORT_MOD_TYPE_NOTSUPPORTED = FW_PORT_CMD_MODTYPE_MASK - 1, + + FW_PORT_MOD_TYPE_NONE = FW_PORT_CMD_MODTYPE_MASK +}; + +enum fw_port_mod_sub_type { + FW_PORT_MOD_SUB_TYPE_NA, + FW_PORT_MOD_SUB_TYPE_MV88E114X = 0x1, + FW_PORT_MOD_SUB_TYPE_TN8022 = 0x2, + FW_PORT_MOD_SUB_TYPE_AQ1202 = 0x3, + FW_PORT_MOD_SUB_TYPE_88x3120 = 0x4, + FW_PORT_MOD_SUB_TYPE_BCM84834 = 0x5, + FW_PORT_MOD_SUB_TYPE_BT_VSC8634 = 0x8, + + /* The following will never been in the VPD. They are TWINAX cable + * lengths decoded from SFP+ module i2c PROMs. These should + * almost certainly go somewhere else ... + */ + FW_PORT_MOD_SUB_TYPE_TWINAX_1 = 0x9, + FW_PORT_MOD_SUB_TYPE_TWINAX_3 = 0xA, + FW_PORT_MOD_SUB_TYPE_TWINAX_5 = 0xB, + FW_PORT_MOD_SUB_TYPE_TWINAX_7 = 0xC, +}; + +/* port stats */ +#define FW_NUM_PORT_STATS 50 +#define FW_NUM_PORT_TX_STATS 23 +#define FW_NUM_PORT_RX_STATS 27 + +enum fw_port_stats_tx_index { + FW_STAT_TX_PORT_BYTES_IX, + FW_STAT_TX_PORT_FRAMES_IX, + FW_STAT_TX_PORT_BCAST_IX, + FW_STAT_TX_PORT_MCAST_IX, + FW_STAT_TX_PORT_UCAST_IX, + FW_STAT_TX_PORT_ERROR_IX, + FW_STAT_TX_PORT_64B_IX, + FW_STAT_TX_PORT_65B_127B_IX, + FW_STAT_TX_PORT_128B_255B_IX, + FW_STAT_TX_PORT_256B_511B_IX, + FW_STAT_TX_PORT_512B_1023B_IX, + FW_STAT_TX_PORT_1024B_1518B_IX, + FW_STAT_TX_PORT_1519B_MAX_IX, + FW_STAT_TX_PORT_DROP_IX, + FW_STAT_TX_PORT_PAUSE_IX, + FW_STAT_TX_PORT_PPP0_IX, + FW_STAT_TX_PORT_PPP1_IX, + FW_STAT_TX_PORT_PPP2_IX, + FW_STAT_TX_PORT_PPP3_IX, + FW_STAT_TX_PORT_PPP4_IX, + FW_STAT_TX_PORT_PPP5_IX, + FW_STAT_TX_PORT_PPP6_IX, + FW_STAT_TX_PORT_PPP7_IX +}; + +enum fw_port_stat_rx_index { + FW_STAT_RX_PORT_BYTES_IX, + FW_STAT_RX_PORT_FRAMES_IX, + FW_STAT_RX_PORT_BCAST_IX, + FW_STAT_RX_PORT_MCAST_IX, + FW_STAT_RX_PORT_UCAST_IX, + FW_STAT_RX_PORT_MTU_ERROR_IX, + FW_STAT_RX_PORT_MTU_CRC_ERROR_IX, + FW_STAT_RX_PORT_CRC_ERROR_IX, + FW_STAT_RX_PORT_LEN_ERROR_IX, + FW_STAT_RX_PORT_SYM_ERROR_IX, + FW_STAT_RX_PORT_64B_IX, + FW_STAT_RX_PORT_65B_127B_IX, + FW_STAT_RX_PORT_128B_255B_IX, + FW_STAT_RX_PORT_256B_511B_IX, + FW_STAT_RX_PORT_512B_1023B_IX, + FW_STAT_RX_PORT_1024B_1518B_IX, + FW_STAT_RX_PORT_1519B_MAX_IX, + FW_STAT_RX_PORT_PAUSE_IX, + FW_STAT_RX_PORT_PPP0_IX, + FW_STAT_RX_PORT_PPP1_IX, + FW_STAT_RX_PORT_PPP2_IX, + FW_STAT_RX_PORT_PPP3_IX, + FW_STAT_RX_PORT_PPP4_IX, + FW_STAT_RX_PORT_PPP5_IX, + FW_STAT_RX_PORT_PPP6_IX, + FW_STAT_RX_PORT_PPP7_IX, + FW_STAT_RX_PORT_LESS_64B_IX +}; + +struct fw_port_stats_cmd { + __be32 op_to_portid; + __be32 retval_len16; + union fw_port_stats { + struct fw_port_stats_ctl { + u8 nstats_bg_bm; + u8 tx_ix; + __be16 r6; + __be32 r7; + __be64 stat0; + __be64 stat1; + __be64 stat2; + __be64 stat3; + __be64 stat4; + __be64 stat5; + } ctl; + struct fw_port_stats_all { + __be64 tx_bytes; + __be64 tx_frames; + __be64 tx_bcast; + __be64 tx_mcast; + __be64 tx_ucast; + __be64 tx_error; + __be64 tx_64b; + __be64 tx_65b_127b; + __be64 tx_128b_255b; + __be64 tx_256b_511b; + __be64 tx_512b_1023b; + __be64 tx_1024b_1518b; + __be64 tx_1519b_max; + __be64 tx_drop; + __be64 tx_pause; + __be64 tx_ppp0; + __be64 tx_ppp1; + __be64 tx_ppp2; + __be64 tx_ppp3; + __be64 tx_ppp4; + __be64 tx_ppp5; + __be64 tx_ppp6; + __be64 tx_ppp7; + __be64 rx_bytes; + __be64 rx_frames; + __be64 rx_bcast; + __be64 rx_mcast; + __be64 rx_ucast; + __be64 rx_mtu_error; + __be64 rx_mtu_crc_error; + __be64 rx_crc_error; + __be64 rx_len_error; + __be64 rx_sym_error; + __be64 rx_64b; + __be64 rx_65b_127b; + __be64 rx_128b_255b; + __be64 rx_256b_511b; + __be64 rx_512b_1023b; + __be64 rx_1024b_1518b; + __be64 rx_1519b_max; + __be64 rx_pause; + __be64 rx_ppp0; + __be64 rx_ppp1; + __be64 rx_ppp2; + __be64 rx_ppp3; + __be64 rx_ppp4; + __be64 rx_ppp5; + __be64 rx_ppp6; + __be64 rx_ppp7; + __be64 rx_less_64b; + __be64 rx_bg_drop; + __be64 rx_bg_trunc; + } all; + } u; +}; + +#define FW_PORT_STATS_CMD_NSTATS(x) ((x) << 4) +#define FW_PORT_STATS_CMD_BG_BM(x) ((x) << 0) +#define FW_PORT_STATS_CMD_TX(x) ((x) << 7) +#define FW_PORT_STATS_CMD_IX(x) ((x) << 0) + +/* port loopback stats */ +#define FW_NUM_LB_STATS 16 +enum fw_port_lb_stats_index { + FW_STAT_LB_PORT_BYTES_IX, + FW_STAT_LB_PORT_FRAMES_IX, + FW_STAT_LB_PORT_BCAST_IX, + FW_STAT_LB_PORT_MCAST_IX, + FW_STAT_LB_PORT_UCAST_IX, + FW_STAT_LB_PORT_ERROR_IX, + FW_STAT_LB_PORT_64B_IX, + FW_STAT_LB_PORT_65B_127B_IX, + FW_STAT_LB_PORT_128B_255B_IX, + FW_STAT_LB_PORT_256B_511B_IX, + FW_STAT_LB_PORT_512B_1023B_IX, + FW_STAT_LB_PORT_1024B_1518B_IX, + FW_STAT_LB_PORT_1519B_MAX_IX, + FW_STAT_LB_PORT_DROP_FRAMES_IX +}; + +struct fw_port_lb_stats_cmd { + __be32 op_to_lbport; + __be32 retval_len16; + union fw_port_lb_stats { + struct fw_port_lb_stats_ctl { + u8 nstats_bg_bm; + u8 ix_pkd; + __be16 r6; + __be32 r7; + __be64 stat0; + __be64 stat1; + __be64 stat2; + __be64 stat3; + __be64 stat4; + __be64 stat5; + } ctl; + struct fw_port_lb_stats_all { + __be64 tx_bytes; + __be64 tx_frames; + __be64 tx_bcast; + __be64 tx_mcast; + __be64 tx_ucast; + __be64 tx_error; + __be64 tx_64b; + __be64 tx_65b_127b; + __be64 tx_128b_255b; + __be64 tx_256b_511b; + __be64 tx_512b_1023b; + __be64 tx_1024b_1518b; + __be64 tx_1519b_max; + __be64 rx_lb_drop; + __be64 rx_lb_trunc; + } all; + } u; +}; + +#define FW_PORT_LB_STATS_CMD_LBPORT(x) ((x) << 0) +#define FW_PORT_LB_STATS_CMD_NSTATS(x) ((x) << 4) +#define FW_PORT_LB_STATS_CMD_BG_BM(x) ((x) << 0) +#define FW_PORT_LB_STATS_CMD_IX(x) ((x) << 0) + +struct fw_rss_ind_tbl_cmd { + __be32 op_to_viid; +#define FW_RSS_IND_TBL_CMD_VIID(x) ((x) << 0) + __be32 retval_len16; + __be16 niqid; + __be16 startidx; + __be32 r3; + __be32 iq0_to_iq2; +#define FW_RSS_IND_TBL_CMD_IQ0(x) ((x) << 20) +#define FW_RSS_IND_TBL_CMD_IQ1(x) ((x) << 10) +#define FW_RSS_IND_TBL_CMD_IQ2(x) ((x) << 0) + __be32 iq3_to_iq5; + __be32 iq6_to_iq8; + __be32 iq9_to_iq11; + __be32 iq12_to_iq14; + __be32 iq15_to_iq17; + __be32 iq18_to_iq20; + __be32 iq21_to_iq23; + __be32 iq24_to_iq26; + __be32 iq27_to_iq29; + __be32 iq30_iq31; + __be32 r15_lo; +}; + +struct fw_rss_glb_config_cmd { + __be32 op_to_write; + __be32 retval_len16; + union fw_rss_glb_config { + struct fw_rss_glb_config_manual { + __be32 mode_pkd; + __be32 r3; + __be64 r4; + __be64 r5; + } manual; + struct fw_rss_glb_config_basicvirtual { + __be32 mode_pkd; + __be32 synmapen_to_hashtoeplitz; +#define FW_RSS_GLB_CONFIG_CMD_SYNMAPEN (1U << 8) +#define FW_RSS_GLB_CONFIG_CMD_SYN4TUPENIPV6 (1U << 7) +#define FW_RSS_GLB_CONFIG_CMD_SYN2TUPENIPV6 (1U << 6) +#define FW_RSS_GLB_CONFIG_CMD_SYN4TUPENIPV4 (1U << 5) +#define FW_RSS_GLB_CONFIG_CMD_SYN2TUPENIPV4 (1U << 4) +#define FW_RSS_GLB_CONFIG_CMD_OFDMAPEN (1U << 3) +#define FW_RSS_GLB_CONFIG_CMD_TNLMAPEN (1U << 2) +#define FW_RSS_GLB_CONFIG_CMD_TNLALLLKP (1U << 1) +#define FW_RSS_GLB_CONFIG_CMD_HASHTOEPLITZ (1U << 0) + __be64 r8; + __be64 r9; + } basicvirtual; + } u; +}; + +#define FW_RSS_GLB_CONFIG_CMD_MODE(x) ((x) << 28) +#define FW_RSS_GLB_CONFIG_CMD_MODE_GET(x) (((x) >> 28) & 0xf) + +#define FW_RSS_GLB_CONFIG_CMD_MODE_MANUAL 0 +#define FW_RSS_GLB_CONFIG_CMD_MODE_BASICVIRTUAL 1 + +struct fw_rss_vi_config_cmd { + __be32 op_to_viid; +#define FW_RSS_VI_CONFIG_CMD_VIID(x) ((x) << 0) + __be32 retval_len16; + union fw_rss_vi_config { + struct fw_rss_vi_config_manual { + __be64 r3; + __be64 r4; + __be64 r5; + } manual; + struct fw_rss_vi_config_basicvirtual { + __be32 r6; + __be32 defaultq_to_udpen; +#define FW_RSS_VI_CONFIG_CMD_DEFAULTQ(x) ((x) << 16) +#define FW_RSS_VI_CONFIG_CMD_DEFAULTQ_GET(x) (((x) >> 16) & 0x3ff) +#define FW_RSS_VI_CONFIG_CMD_IP6FOURTUPEN (1U << 4) +#define FW_RSS_VI_CONFIG_CMD_IP6TWOTUPEN (1U << 3) +#define FW_RSS_VI_CONFIG_CMD_IP4FOURTUPEN (1U << 2) +#define FW_RSS_VI_CONFIG_CMD_IP4TWOTUPEN (1U << 1) +#define FW_RSS_VI_CONFIG_CMD_UDPEN (1U << 0) + __be64 r9; + __be64 r10; + } basicvirtual; + } u; +}; + +struct fw_clip_cmd { + __be32 op_to_write; + __be32 alloc_to_len16; + __be64 ip_hi; + __be64 ip_lo; + __be32 r4[2]; +}; + +#define S_FW_CLIP_CMD_ALLOC 31 +#define M_FW_CLIP_CMD_ALLOC 0x1 +#define V_FW_CLIP_CMD_ALLOC(x) ((x) << S_FW_CLIP_CMD_ALLOC) +#define G_FW_CLIP_CMD_ALLOC(x) \ + (((x) >> S_FW_CLIP_CMD_ALLOC) & M_FW_CLIP_CMD_ALLOC) +#define F_FW_CLIP_CMD_ALLOC V_FW_CLIP_CMD_ALLOC(1U) + +#define S_FW_CLIP_CMD_FREE 30 +#define M_FW_CLIP_CMD_FREE 0x1 +#define V_FW_CLIP_CMD_FREE(x) ((x) << S_FW_CLIP_CMD_FREE) +#define G_FW_CLIP_CMD_FREE(x) \ + (((x) >> S_FW_CLIP_CMD_FREE) & M_FW_CLIP_CMD_FREE) +#define F_FW_CLIP_CMD_FREE V_FW_CLIP_CMD_FREE(1U) + +enum fw_error_type { + FW_ERROR_TYPE_EXCEPTION = 0x0, + FW_ERROR_TYPE_HWMODULE = 0x1, + FW_ERROR_TYPE_WR = 0x2, + FW_ERROR_TYPE_ACL = 0x3, +}; + +struct fw_error_cmd { + __be32 op_to_type; + __be32 len16_pkd; + union fw_error { + struct fw_error_exception { + __be32 info[6]; + } exception; + struct fw_error_hwmodule { + __be32 regaddr; + __be32 regval; + } hwmodule; + struct fw_error_wr { + __be16 cidx; + __be16 pfn_vfn; + __be32 eqid; + u8 wrhdr[16]; + } wr; + struct fw_error_acl { + __be16 cidx; + __be16 pfn_vfn; + __be32 eqid; + __be16 mv_pkd; + u8 val[6]; + __be64 r4; + } acl; + } u; +}; + +struct fw_debug_cmd { + __be32 op_type; +#define FW_DEBUG_CMD_TYPE_GET(x) ((x) & 0xff) + __be32 len16_pkd; + union fw_debug { + struct fw_debug_assert { + __be32 fcid; + __be32 line; + __be32 x; + __be32 y; + u8 filename_0_7[8]; + u8 filename_8_15[8]; + __be64 r3; + } assert; + struct fw_debug_prt { + __be16 dprtstridx; + __be16 r3[3]; + __be32 dprtstrparam0; + __be32 dprtstrparam1; + __be32 dprtstrparam2; + __be32 dprtstrparam3; + } prt; + } u; +}; + +#define FW_PCIE_FW_ERR (1U << 31) +#define FW_PCIE_FW_INIT (1U << 30) +#define FW_PCIE_FW_HALT (1U << 29) +#define FW_PCIE_FW_MASTER_VLD (1U << 15) +#define FW_PCIE_FW_MASTER_MASK 0x7 +#define FW_PCIE_FW_MASTER_SHIFT 12 +#define FW_PCIE_FW_MASTER(x) ((x) << FW_PCIE_FW_MASTER_SHIFT) +#define FW_PCIE_FW_MASTER_GET(x) (((x) >> FW_PCIE_FW_MASTER_SHIFT) & \ + FW_PCIE_FW_MASTER_MASK) +#define FW_PCIE_FW_EVAL_MASK 0x7 +#define FW_PCIE_FW_EVAL_SHIFT 24 +#define FW_PCIE_FW_EVAL_GET(x) (((x) >> FW_PCIE_FW_EVAL_SHIFT) & \ + FW_PCIE_FW_EVAL_MASK) + +struct fw_hdr { + u8 ver; + u8 chip; /* terminator chip type */ + __be16 len512; /* bin length in units of 512-bytes */ + __be32 fw_ver; /* firmware version */ + __be32 tp_microcode_ver; + u8 intfver_nic; + u8 intfver_vnic; + u8 intfver_ofld; + u8 intfver_ri; + u8 intfver_iscsipdu; + u8 intfver_iscsi; + u8 intfver_fcoepdu; + u8 intfver_fcoe; + __u32 reserved2; + __u32 reserved3; + __u32 reserved4; + __be32 flags; + __be32 reserved6[23]; +}; + +enum fw_hdr_chip { + FW_HDR_CHIP_T4, + FW_HDR_CHIP_T5 +}; + +#define FW_HDR_FW_VER_MAJOR_GET(x) (((x) >> 24) & 0xff) +#define FW_HDR_FW_VER_MINOR_GET(x) (((x) >> 16) & 0xff) +#define FW_HDR_FW_VER_MICRO_GET(x) (((x) >> 8) & 0xff) +#define FW_HDR_FW_VER_BUILD_GET(x) (((x) >> 0) & 0xff) + +enum fw_hdr_intfver { + FW_HDR_INTFVER_NIC = 0x00, + FW_HDR_INTFVER_VNIC = 0x00, + FW_HDR_INTFVER_OFLD = 0x00, + FW_HDR_INTFVER_RI = 0x00, + FW_HDR_INTFVER_ISCSIPDU = 0x00, + FW_HDR_INTFVER_ISCSI = 0x00, + FW_HDR_INTFVER_FCOEPDU = 0x00, + FW_HDR_INTFVER_FCOE = 0x00, +}; + +enum fw_hdr_flags { + FW_HDR_FLAGS_RESET_HALT = 0x00000001, +}; + +#endif /* _T4FW_INTERFACE_H_ */ diff --git a/drivers/net/ethernet/emulex/benet/Kconfig b/drivers/net/ethernet/emulex/benet/Kconfig new file mode 100644 index 0000000..ea94a8e --- /dev/null +++ b/drivers/net/ethernet/emulex/benet/Kconfig @@ -0,0 +1,14 @@ +config BE2NET + tristate "ServerEngines' 10Gbps NIC - BladeEngine" + depends on PCI + ---help--- + This driver implements the NIC functionality for ServerEngines' + 10Gbps network adapter - BladeEngine. + +config BE2NET_VXLAN + bool "VXLAN offload support on be2net driver" + default y + depends on BE2NET && VXLAN && !(BE2NET=y && VXLAN=m) + ---help--- + Say Y here if you want to enable VXLAN offload support on + be2net driver. diff --git a/drivers/net/ethernet/emulex/benet/Makefile b/drivers/net/ethernet/emulex/benet/Makefile new file mode 100644 index 0000000..1a91b27 --- /dev/null +++ b/drivers/net/ethernet/emulex/benet/Makefile @@ -0,0 +1,7 @@ +# +# Makefile to build the network driver for ServerEngine's BladeEngine. +# + +obj-$(CONFIG_BE2NET) += be2net.o + +be2net-y := be_main.o be_cmds.o be_ethtool.o be_roce.o diff --git a/drivers/net/ethernet/emulex/benet/be.h b/drivers/net/ethernet/emulex/benet/be.h new file mode 100644 index 0000000..9a2d752 --- /dev/null +++ b/drivers/net/ethernet/emulex/benet/be.h @@ -0,0 +1,926 @@ +/* + * Copyright (C) 2005 - 2014 Emulex + * All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. The full GNU General + * Public License is included in this distribution in the file called COPYING. + * + * Contact Information: + * linux-drivers@emulex.com + * + * Emulex + * 3333 Susan Street + * Costa Mesa, CA 92626 + */ + +#ifndef BE_H +#define BE_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "be_hw.h" +#include "be_roce.h" + +#define DRV_VER "10.4u" +#define DRV_NAME "be2net" +#define BE_NAME "Emulex BladeEngine2" +#define BE3_NAME "Emulex BladeEngine3" +#define OC_NAME "Emulex OneConnect" +#define OC_NAME_BE OC_NAME "(be3)" +#define OC_NAME_LANCER OC_NAME "(Lancer)" +#define OC_NAME_SH OC_NAME "(Skyhawk)" +#define DRV_DESC "Emulex OneConnect NIC Driver" + +#define BE_VENDOR_ID 0x19a2 +#define EMULEX_VENDOR_ID 0x10df +#define BE_DEVICE_ID1 0x211 +#define BE_DEVICE_ID2 0x221 +#define OC_DEVICE_ID1 0x700 /* Device Id for BE2 cards */ +#define OC_DEVICE_ID2 0x710 /* Device Id for BE3 cards */ +#define OC_DEVICE_ID3 0xe220 /* Device id for Lancer cards */ +#define OC_DEVICE_ID4 0xe228 /* Device id for VF in Lancer */ +#define OC_DEVICE_ID5 0x720 /* Device Id for Skyhawk cards */ +#define OC_DEVICE_ID6 0x728 /* Device id for VF in SkyHawk */ +#define OC_SUBSYS_DEVICE_ID1 0xE602 +#define OC_SUBSYS_DEVICE_ID2 0xE642 +#define OC_SUBSYS_DEVICE_ID3 0xE612 +#define OC_SUBSYS_DEVICE_ID4 0xE652 + +static inline char *nic_name(struct pci_dev *pdev) +{ + switch (pdev->device) { + case OC_DEVICE_ID1: + return OC_NAME; + case OC_DEVICE_ID2: + return OC_NAME_BE; + case OC_DEVICE_ID3: + case OC_DEVICE_ID4: + return OC_NAME_LANCER; + case BE_DEVICE_ID2: + return BE3_NAME; + case OC_DEVICE_ID5: + case OC_DEVICE_ID6: + return OC_NAME_SH; + default: + return BE_NAME; + } +} + +/* Number of bytes of an RX frame that are copied to skb->data */ +#define BE_HDR_LEN ((u16) 64) +/* allocate extra space to allow tunneling decapsulation without head reallocation */ +#define BE_RX_SKB_ALLOC_SIZE (BE_HDR_LEN + 64) + +#define BE_MAX_JUMBO_FRAME_SIZE 9018 +#define BE_MIN_MTU 256 +#define BE_MAX_MTU (BE_MAX_JUMBO_FRAME_SIZE - \ + (ETH_HLEN + ETH_FCS_LEN)) + +#define BE_NUM_VLANS_SUPPORTED 64 +#define BE_MAX_EQD 128u +#define BE_MAX_TX_FRAG_COUNT 30 + +#define EVNT_Q_LEN 1024 +#define TX_Q_LEN 2048 +#define TX_CQ_LEN 1024 +#define RX_Q_LEN 1024 /* Does not support any other value */ +#define RX_CQ_LEN 1024 +#define MCC_Q_LEN 128 /* total size not to exceed 8 pages */ +#define MCC_CQ_LEN 256 + +#define BE2_MAX_RSS_QS 4 +#define BE3_MAX_RSS_QS 16 +#define BE3_MAX_TX_QS 16 +#define BE3_MAX_EVT_QS 16 +#define BE3_SRIOV_MAX_EVT_QS 8 + +#define MAX_RX_QS 32 +#define MAX_EVT_QS 32 +#define MAX_TX_QS 32 + +#define MAX_ROCE_EQS 5 +#define MAX_MSIX_VECTORS 32 +#define MIN_MSIX_VECTORS 1 +#define BE_NAPI_WEIGHT 64 +#define MAX_RX_POST BE_NAPI_WEIGHT /* Frags posted at a time */ +#define RX_FRAGS_REFILL_WM (RX_Q_LEN - MAX_RX_POST) + +#define MAX_VFS 30 /* Max VFs supported by BE3 FW */ +#define FW_VER_LEN 32 + +#define RSS_INDIR_TABLE_LEN 128 +#define RSS_HASH_KEY_LEN 40 + +struct be_dma_mem { + void *va; + dma_addr_t dma; + u32 size; +}; + +struct be_queue_info { + struct be_dma_mem dma_mem; + u16 len; + u16 entry_size; /* Size of an element in the queue */ + u16 id; + u16 tail, head; + bool created; + atomic_t used; /* Number of valid elements in the queue */ +}; + +static inline u32 MODULO(u16 val, u16 limit) +{ + BUG_ON(limit & (limit - 1)); + return val & (limit - 1); +} + +static inline void index_adv(u16 *index, u16 val, u16 limit) +{ + *index = MODULO((*index + val), limit); +} + +static inline void index_inc(u16 *index, u16 limit) +{ + *index = MODULO((*index + 1), limit); +} + +static inline void *queue_head_node(struct be_queue_info *q) +{ + return q->dma_mem.va + q->head * q->entry_size; +} + +static inline void *queue_tail_node(struct be_queue_info *q) +{ + return q->dma_mem.va + q->tail * q->entry_size; +} + +static inline void *queue_index_node(struct be_queue_info *q, u16 index) +{ + return q->dma_mem.va + index * q->entry_size; +} + +static inline void queue_head_inc(struct be_queue_info *q) +{ + index_inc(&q->head, q->len); +} + +static inline void index_dec(u16 *index, u16 limit) +{ + *index = MODULO((*index - 1), limit); +} + +static inline void queue_tail_inc(struct be_queue_info *q) +{ + index_inc(&q->tail, q->len); +} + +struct be_eq_obj { + struct be_queue_info q; + char desc[32]; + + /* Adaptive interrupt coalescing (AIC) info */ + bool enable_aic; + u32 min_eqd; /* in usecs */ + u32 max_eqd; /* in usecs */ + u32 eqd; /* configured val when aic is off */ + u32 cur_eqd; /* in usecs */ + + u8 idx; /* array index */ + u8 msix_idx; + u16 spurious_intr; + struct napi_struct napi; + struct be_adapter *adapter; + +#ifdef CONFIG_NET_RX_BUSY_POLL +#define BE_EQ_IDLE 0 +#define BE_EQ_NAPI 1 /* napi owns this EQ */ +#define BE_EQ_POLL 2 /* poll owns this EQ */ +#define BE_EQ_LOCKED (BE_EQ_NAPI | BE_EQ_POLL) +#define BE_EQ_NAPI_YIELD 4 /* napi yielded this EQ */ +#define BE_EQ_POLL_YIELD 8 /* poll yielded this EQ */ +#define BE_EQ_YIELD (BE_EQ_NAPI_YIELD | BE_EQ_POLL_YIELD) +#define BE_EQ_USER_PEND (BE_EQ_POLL | BE_EQ_POLL_YIELD) + unsigned int state; + spinlock_t lock; /* lock to serialize napi and busy-poll */ +#endif /* CONFIG_NET_RX_BUSY_POLL */ +} ____cacheline_aligned_in_smp; + +struct be_aic_obj { /* Adaptive interrupt coalescing (AIC) info */ + bool enable; + u32 min_eqd; /* in usecs */ + u32 max_eqd; /* in usecs */ + u32 prev_eqd; /* in usecs */ + u32 et_eqd; /* configured val when aic is off */ + ulong jiffies; + u64 rx_pkts_prev; /* Used to calculate RX pps */ + u64 tx_reqs_prev; /* Used to calculate TX pps */ +}; + +enum { + NAPI_POLLING, + BUSY_POLLING +}; + +struct be_mcc_obj { + struct be_queue_info q; + struct be_queue_info cq; + bool rearm_cq; +}; + +struct be_tx_stats { + u64 tx_bytes; + u64 tx_pkts; + u64 tx_reqs; + u64 tx_wrbs; + u64 tx_compl; + ulong tx_jiffies; + u32 tx_stops; + u32 tx_drv_drops; /* pkts dropped by driver */ + /* the error counters are described in be_ethtool.c */ + u32 tx_hdr_parse_err; + u32 tx_dma_err; + u32 tx_tso_err; + u32 tx_spoof_check_err; + u32 tx_qinq_err; + u32 tx_internal_parity_err; + struct u64_stats_sync sync; + struct u64_stats_sync sync_compl; +}; + +struct be_tx_obj { + u32 db_offset; + struct be_queue_info q; + struct be_queue_info cq; + /* Remember the skbs that were transmitted */ + struct sk_buff *sent_skb_list[TX_Q_LEN]; + struct be_tx_stats stats; +} ____cacheline_aligned_in_smp; + +/* Struct to remember the pages posted for rx frags */ +struct be_rx_page_info { + struct page *page; + /* set to page-addr for last frag of the page & frag-addr otherwise */ + DEFINE_DMA_UNMAP_ADDR(bus); + u16 page_offset; + bool last_frag; /* last frag of the page */ +}; + +struct be_rx_stats { + u64 rx_bytes; + u64 rx_pkts; + u32 rx_drops_no_skbs; /* skb allocation errors */ + u32 rx_drops_no_frags; /* HW has no fetched frags */ + u32 rx_post_fail; /* page post alloc failures */ + u32 rx_compl; + u32 rx_mcast_pkts; + u32 rx_compl_err; /* completions with err set */ + struct u64_stats_sync sync; +}; + +struct be_rx_compl_info { + u32 rss_hash; + u16 vlan_tag; + u16 pkt_size; + u16 port; + u8 vlanf; + u8 num_rcvd; + u8 err; + u8 ipf; + u8 tcpf; + u8 udpf; + u8 ip_csum; + u8 l4_csum; + u8 ipv6; + u8 qnq; + u8 pkt_type; + u8 ip_frag; + u8 tunneled; +}; + +struct be_rx_obj { + struct be_adapter *adapter; + struct be_queue_info q; + struct be_queue_info cq; + struct be_rx_compl_info rxcp; + struct be_rx_page_info page_info_tbl[RX_Q_LEN]; + struct be_rx_stats stats; + u8 rss_id; + bool rx_post_starved; /* Zero rx frags have been posted to BE */ +} ____cacheline_aligned_in_smp; + +struct be_drv_stats { + u32 be_on_die_temperature; + u32 eth_red_drops; + u32 dma_map_errors; + u32 rx_drops_no_pbuf; + u32 rx_drops_no_txpb; + u32 rx_drops_no_erx_descr; + u32 rx_drops_no_tpre_descr; + u32 rx_drops_too_many_frags; + u32 forwarded_packets; + u32 rx_drops_mtu; + u32 rx_crc_errors; + u32 rx_alignment_symbol_errors; + u32 rx_pause_frames; + u32 rx_priority_pause_frames; + u32 rx_control_frames; + u32 rx_in_range_errors; + u32 rx_out_range_errors; + u32 rx_frame_too_long; + u32 rx_address_filtered; + u32 rx_dropped_too_small; + u32 rx_dropped_too_short; + u32 rx_dropped_header_too_small; + u32 rx_dropped_tcp_length; + u32 rx_dropped_runt; + u32 rx_ip_checksum_errs; + u32 rx_tcp_checksum_errs; + u32 rx_udp_checksum_errs; + u32 tx_pauseframes; + u32 tx_priority_pauseframes; + u32 tx_controlframes; + u32 rxpp_fifo_overflow_drop; + u32 rx_input_fifo_overflow_drop; + u32 pmem_fifo_overflow_drop; + u32 jabber_events; + u32 rx_roce_bytes_lsd; + u32 rx_roce_bytes_msd; + u32 rx_roce_frames; + u32 roce_drops_payload_len; + u32 roce_drops_crc; +}; + +/* A vlan-id of 0xFFFF must be used to clear transparent vlan-tagging */ +#define BE_RESET_VLAN_TAG_ID 0xFFFF + +struct be_vf_cfg { + unsigned char mac_addr[ETH_ALEN]; + int if_handle; + int pmac_id; + u16 vlan_tag; + u32 tx_rate; + u32 plink_tracking; +}; + +enum vf_state { + ENABLED = 0, + ASSIGNED = 1 +}; + +#define BE_FLAGS_LINK_STATUS_INIT 1 +#define BE_FLAGS_SRIOV_ENABLED (1 << 2) +#define BE_FLAGS_WORKER_SCHEDULED (1 << 3) +#define BE_FLAGS_VLAN_PROMISC (1 << 4) +#define BE_FLAGS_MCAST_PROMISC (1 << 5) +#define BE_FLAGS_NAPI_ENABLED (1 << 9) +#define BE_FLAGS_QNQ_ASYNC_EVT_RCVD (1 << 11) +#define BE_FLAGS_VXLAN_OFFLOADS (1 << 12) +#define BE_FLAGS_SETUP_DONE (1 << 13) + +#define BE_UC_PMAC_COUNT 30 +#define BE_VF_UC_PMAC_COUNT 2 + +/* Ethtool set_dump flags */ +#define LANCER_INITIATE_FW_DUMP 0x1 +#define LANCER_DELETE_FW_DUMP 0x2 + +struct phy_info { + u8 transceiver; + u8 autoneg; + u8 fc_autoneg; + u8 port_type; + u16 phy_type; + u16 interface_type; + u32 misc_params; + u16 auto_speeds_supported; + u16 fixed_speeds_supported; + int link_speed; + u32 advertising; + u32 supported; + u8 cable_type; +}; + +struct be_resources { + u16 max_vfs; /* Total VFs "really" supported by FW/HW */ + u16 max_mcast_mac; + u16 max_tx_qs; + u16 max_rss_qs; + u16 max_rx_qs; + u16 max_uc_mac; /* Max UC MACs programmable */ + u16 max_vlans; /* Number of vlans supported */ + u16 max_evt_qs; + u32 if_cap_flags; + u32 vf_if_cap_flags; /* VF if capability flags */ +}; + +struct rss_info { + u64 rss_flags; + u8 rsstable[RSS_INDIR_TABLE_LEN]; + u8 rss_queue[RSS_INDIR_TABLE_LEN]; + u8 rss_hkey[RSS_HASH_KEY_LEN]; +}; + +struct be_adapter { + struct pci_dev *pdev; + struct net_device *netdev; + + u8 __iomem *csr; /* CSR BAR used only for BE2/3 */ + u8 __iomem *db; /* Door Bell */ + + struct mutex mbox_lock; /* For serializing mbox cmds to BE card */ + struct be_dma_mem mbox_mem; + /* Mbox mem is adjusted to align to 16 bytes. The allocated addr + * is stored for freeing purpose */ + struct be_dma_mem mbox_mem_alloced; + + struct be_mcc_obj mcc_obj; + spinlock_t mcc_lock; /* For serializing mcc cmds to BE card */ + spinlock_t mcc_cq_lock; + + u16 cfg_num_qs; /* configured via set-channels */ + u16 num_evt_qs; + u16 num_msix_vec; + struct be_eq_obj eq_obj[MAX_EVT_QS]; + struct msix_entry msix_entries[MAX_MSIX_VECTORS]; + bool isr_registered; + + /* TX Rings */ + u16 num_tx_qs; + struct be_tx_obj tx_obj[MAX_TX_QS]; + + /* Rx rings */ + u16 num_rx_qs; + struct be_rx_obj rx_obj[MAX_RX_QS]; + u32 big_page_size; /* Compounded page size shared by rx wrbs */ + + struct be_drv_stats drv_stats; + struct be_aic_obj aic_obj[MAX_EVT_QS]; + u16 vlans_added; + unsigned long vids[BITS_TO_LONGS(VLAN_N_VID)]; + u8 vlan_prio_bmap; /* Available Priority BitMap */ + u16 recommended_prio; /* Recommended Priority */ + struct be_dma_mem rx_filter; /* Cmd DMA mem for rx-filter */ + + struct be_dma_mem stats_cmd; + /* Work queue used to perform periodic tasks like getting statistics */ + struct delayed_work work; + u16 work_counter; + + struct delayed_work func_recovery_work; + u32 flags; + u32 cmd_privileges; + /* Ethtool knobs and info */ + char fw_ver[FW_VER_LEN]; + char fw_on_flash[FW_VER_LEN]; + int if_handle; /* Used to configure filtering */ + u32 *pmac_id; /* MAC addr handle used by BE card */ + u32 beacon_state; /* for set_phys_id */ + + bool eeh_error; + bool fw_timeout; + bool hw_error; + + u32 port_num; + bool promiscuous; + u8 mc_type; + u32 function_mode; + u32 function_caps; + u32 rx_fc; /* Rx flow control */ + u32 tx_fc; /* Tx flow control */ + bool stats_cmd_sent; + struct { + u32 size; + u32 total_size; + u64 io_addr; + } roce_db; + u32 num_msix_roce_vec; + struct ocrdma_dev *ocrdma_dev; + struct list_head entry; + + u32 flash_status; + struct completion et_cmd_compl; + + struct be_resources pool_res; /* resources available for the port */ + struct be_resources res; /* resources available for the func */ + u16 num_vfs; /* Number of VFs provisioned by PF */ + u8 virtfn; + struct be_vf_cfg *vf_cfg; + bool be3_native; + u32 sli_family; + u8 hba_port_num; + u16 pvid; + __be16 vxlan_port; + struct phy_info phy; + u8 wol_cap; + bool wol_en; + u32 uc_macs; /* Count of secondary UC MAC programmed */ + u16 asic_rev; + u16 qnq_vid; + u32 msg_enable; + int be_get_temp_freq; + u8 pf_number; + struct rss_info rss_info; +}; + +#define be_physfn(adapter) (!adapter->virtfn) +#define be_virtfn(adapter) (adapter->virtfn) +#define sriov_enabled(adapter) (adapter->flags & \ + BE_FLAGS_SRIOV_ENABLED) + +#define for_all_vfs(adapter, vf_cfg, i) \ + for (i = 0, vf_cfg = &adapter->vf_cfg[i]; i < adapter->num_vfs; \ + i++, vf_cfg++) + +#define ON 1 +#define OFF 0 + +#define be_max_vlans(adapter) (adapter->res.max_vlans) +#define be_max_uc(adapter) (adapter->res.max_uc_mac) +#define be_max_mc(adapter) (adapter->res.max_mcast_mac) +#define be_max_vfs(adapter) (adapter->pool_res.max_vfs) +#define be_max_rss(adapter) (adapter->res.max_rss_qs) +#define be_max_txqs(adapter) (adapter->res.max_tx_qs) +#define be_max_prio_txqs(adapter) (adapter->res.max_prio_tx_qs) +#define be_max_rxqs(adapter) (adapter->res.max_rx_qs) +#define be_max_eqs(adapter) (adapter->res.max_evt_qs) +#define be_if_cap_flags(adapter) (adapter->res.if_cap_flags) + +static inline u16 be_max_qs(struct be_adapter *adapter) +{ + /* If no RSS, need atleast the one def RXQ */ + u16 num = max_t(u16, be_max_rss(adapter), 1); + + num = min(num, be_max_eqs(adapter)); + return min_t(u16, num, num_online_cpus()); +} + +/* Is BE in pvid_tagging mode */ +#define be_pvid_tagging_enabled(adapter) (adapter->pvid) + +/* Is BE in QNQ multi-channel mode */ +#define be_is_qnq_mode(adapter) (adapter->function_mode & QNQ_MODE) + +#define lancer_chip(adapter) (adapter->pdev->device == OC_DEVICE_ID3 || \ + adapter->pdev->device == OC_DEVICE_ID4) + +#define skyhawk_chip(adapter) (adapter->pdev->device == OC_DEVICE_ID5 || \ + adapter->pdev->device == OC_DEVICE_ID6) + +#define BE3_chip(adapter) (adapter->pdev->device == BE_DEVICE_ID2 || \ + adapter->pdev->device == OC_DEVICE_ID2) + +#define BE2_chip(adapter) (adapter->pdev->device == BE_DEVICE_ID1 || \ + adapter->pdev->device == OC_DEVICE_ID1) + +#define BEx_chip(adapter) (BE3_chip(adapter) || BE2_chip(adapter)) + +#define be_roce_supported(adapter) (skyhawk_chip(adapter) && \ + (adapter->function_mode & RDMA_ENABLED)) + +extern const struct ethtool_ops be_ethtool_ops; + +#define msix_enabled(adapter) (adapter->num_msix_vec > 0) +#define num_irqs(adapter) (msix_enabled(adapter) ? \ + adapter->num_msix_vec : 1) +#define tx_stats(txo) (&(txo)->stats) +#define rx_stats(rxo) (&(rxo)->stats) + +/* The default RXQ is the last RXQ */ +#define default_rxo(adpt) (&adpt->rx_obj[adpt->num_rx_qs - 1]) + +#define for_all_rx_queues(adapter, rxo, i) \ + for (i = 0, rxo = &adapter->rx_obj[i]; i < adapter->num_rx_qs; \ + i++, rxo++) + +/* Skip the default non-rss queue (last one)*/ +#define for_all_rss_queues(adapter, rxo, i) \ + for (i = 0, rxo = &adapter->rx_obj[i]; i < (adapter->num_rx_qs - 1);\ + i++, rxo++) + +#define for_all_tx_queues(adapter, txo, i) \ + for (i = 0, txo = &adapter->tx_obj[i]; i < adapter->num_tx_qs; \ + i++, txo++) + +#define for_all_evt_queues(adapter, eqo, i) \ + for (i = 0, eqo = &adapter->eq_obj[i]; i < adapter->num_evt_qs; \ + i++, eqo++) + +#define for_all_rx_queues_on_eq(adapter, eqo, rxo, i) \ + for (i = eqo->idx, rxo = &adapter->rx_obj[i]; i < adapter->num_rx_qs;\ + i += adapter->num_evt_qs, rxo += adapter->num_evt_qs) + +#define for_all_tx_queues_on_eq(adapter, eqo, txo, i) \ + for (i = eqo->idx, txo = &adapter->tx_obj[i]; i < adapter->num_tx_qs;\ + i += adapter->num_evt_qs, txo += adapter->num_evt_qs) + +#define is_mcc_eqo(eqo) (eqo->idx == 0) +#define mcc_eqo(adapter) (&adapter->eq_obj[0]) + +#define PAGE_SHIFT_4K 12 +#define PAGE_SIZE_4K (1 << PAGE_SHIFT_4K) + +/* Returns number of pages spanned by the data starting at the given addr */ +#define PAGES_4K_SPANNED(_address, size) \ + ((u32)((((size_t)(_address) & (PAGE_SIZE_4K - 1)) + \ + (size) + (PAGE_SIZE_4K - 1)) >> PAGE_SHIFT_4K)) + +/* Returns bit offset within a DWORD of a bitfield */ +#define AMAP_BIT_OFFSET(_struct, field) \ + (((size_t)&(((_struct *)0)->field))%32) + +/* Returns the bit mask of the field that is NOT shifted into location. */ +static inline u32 amap_mask(u32 bitsize) +{ + return (bitsize == 32 ? 0xFFFFFFFF : (1 << bitsize) - 1); +} + +static inline void +amap_set(void *ptr, u32 dw_offset, u32 mask, u32 offset, u32 value) +{ + u32 *dw = (u32 *) ptr + dw_offset; + *dw &= ~(mask << offset); + *dw |= (mask & value) << offset; +} + +#define AMAP_SET_BITS(_struct, field, ptr, val) \ + amap_set(ptr, \ + offsetof(_struct, field)/32, \ + amap_mask(sizeof(((_struct *)0)->field)), \ + AMAP_BIT_OFFSET(_struct, field), \ + val) + +static inline u32 amap_get(void *ptr, u32 dw_offset, u32 mask, u32 offset) +{ + u32 *dw = (u32 *) ptr; + return mask & (*(dw + dw_offset) >> offset); +} + +#define AMAP_GET_BITS(_struct, field, ptr) \ + amap_get(ptr, \ + offsetof(_struct, field)/32, \ + amap_mask(sizeof(((_struct *)0)->field)), \ + AMAP_BIT_OFFSET(_struct, field)) + +#define GET_RX_COMPL_V0_BITS(field, ptr) \ + AMAP_GET_BITS(struct amap_eth_rx_compl_v0, field, ptr) + +#define GET_RX_COMPL_V1_BITS(field, ptr) \ + AMAP_GET_BITS(struct amap_eth_rx_compl_v1, field, ptr) + +#define GET_TX_COMPL_BITS(field, ptr) \ + AMAP_GET_BITS(struct amap_eth_tx_compl, field, ptr) + +#define SET_TX_WRB_HDR_BITS(field, ptr, val) \ + AMAP_SET_BITS(struct amap_eth_hdr_wrb, field, ptr, val) + +#define be_dws_cpu_to_le(wrb, len) swap_dws(wrb, len) +#define be_dws_le_to_cpu(wrb, len) swap_dws(wrb, len) +static inline void swap_dws(void *wrb, int len) +{ +#ifdef __BIG_ENDIAN + u32 *dw = wrb; + BUG_ON(len % 4); + do { + *dw = cpu_to_le32(*dw); + dw++; + len -= 4; + } while (len); +#endif /* __BIG_ENDIAN */ +} + +#define be_cmd_status(status) (status > 0 ? -EIO : status) + +static inline u8 is_tcp_pkt(struct sk_buff *skb) +{ + u8 val = 0; + + if (ip_hdr(skb)->version == 4) + val = (ip_hdr(skb)->protocol == IPPROTO_TCP); + else if (ip_hdr(skb)->version == 6) + val = (ipv6_hdr(skb)->nexthdr == NEXTHDR_TCP); + + return val; +} + +static inline u8 is_udp_pkt(struct sk_buff *skb) +{ + u8 val = 0; + + if (ip_hdr(skb)->version == 4) + val = (ip_hdr(skb)->protocol == IPPROTO_UDP); + else if (ip_hdr(skb)->version == 6) + val = (ipv6_hdr(skb)->nexthdr == NEXTHDR_UDP); + + return val; +} + +static inline bool is_ipv4_pkt(struct sk_buff *skb) +{ + return skb->protocol == htons(ETH_P_IP) && ip_hdr(skb)->version == 4; +} + +static inline void be_vf_eth_addr_generate(struct be_adapter *adapter, u8 *mac) +{ + u32 addr; + + addr = jhash(adapter->netdev->dev_addr, ETH_ALEN, 0); + + mac[5] = (u8)(addr & 0xFF); + mac[4] = (u8)((addr >> 8) & 0xFF); + mac[3] = (u8)((addr >> 16) & 0xFF); + /* Use the OUI from the current MAC address */ + memcpy(mac, adapter->netdev->dev_addr, 3); +} + +static inline bool be_multi_rxq(const struct be_adapter *adapter) +{ + return adapter->num_rx_qs > 1; +} + +static inline bool be_error(struct be_adapter *adapter) +{ + return adapter->eeh_error || adapter->hw_error || adapter->fw_timeout; +} + +static inline bool be_hw_error(struct be_adapter *adapter) +{ + return adapter->eeh_error || adapter->hw_error; +} + +static inline void be_clear_all_error(struct be_adapter *adapter) +{ + adapter->eeh_error = false; + adapter->hw_error = false; + adapter->fw_timeout = false; +} + +static inline bool be_is_wol_excluded(struct be_adapter *adapter) +{ + struct pci_dev *pdev = adapter->pdev; + + if (!be_physfn(adapter)) + return true; + + switch (pdev->subsystem_device) { + case OC_SUBSYS_DEVICE_ID1: + case OC_SUBSYS_DEVICE_ID2: + case OC_SUBSYS_DEVICE_ID3: + case OC_SUBSYS_DEVICE_ID4: + return true; + default: + return false; + } +} + +static inline int qnq_async_evt_rcvd(struct be_adapter *adapter) +{ + return adapter->flags & BE_FLAGS_QNQ_ASYNC_EVT_RCVD; +} + +#ifdef CONFIG_NET_RX_BUSY_POLL +static inline bool be_lock_napi(struct be_eq_obj *eqo) +{ + bool status = true; + + spin_lock(&eqo->lock); /* BH is already disabled */ + if (eqo->state & BE_EQ_LOCKED) { + WARN_ON(eqo->state & BE_EQ_NAPI); + eqo->state |= BE_EQ_NAPI_YIELD; + status = false; + } else { + eqo->state = BE_EQ_NAPI; + } + spin_unlock(&eqo->lock); + return status; +} + +static inline void be_unlock_napi(struct be_eq_obj *eqo) +{ + spin_lock(&eqo->lock); /* BH is already disabled */ + + WARN_ON(eqo->state & (BE_EQ_POLL | BE_EQ_NAPI_YIELD)); + eqo->state = BE_EQ_IDLE; + + spin_unlock(&eqo->lock); +} + +static inline bool be_lock_busy_poll(struct be_eq_obj *eqo) +{ + bool status = true; + + spin_lock_bh(&eqo->lock); + if (eqo->state & BE_EQ_LOCKED) { + eqo->state |= BE_EQ_POLL_YIELD; + status = false; + } else { + eqo->state |= BE_EQ_POLL; + } + spin_unlock_bh(&eqo->lock); + return status; +} + +static inline void be_unlock_busy_poll(struct be_eq_obj *eqo) +{ + spin_lock_bh(&eqo->lock); + + WARN_ON(eqo->state & (BE_EQ_NAPI)); + eqo->state = BE_EQ_IDLE; + + spin_unlock_bh(&eqo->lock); +} + +static inline void be_enable_busy_poll(struct be_eq_obj *eqo) +{ + spin_lock_init(&eqo->lock); + eqo->state = BE_EQ_IDLE; +} + +static inline void be_disable_busy_poll(struct be_eq_obj *eqo) +{ + local_bh_disable(); + + /* It's enough to just acquire napi lock on the eqo to stop + * be_busy_poll() from processing any queueus. + */ + while (!be_lock_napi(eqo)) + mdelay(1); + + local_bh_enable(); +} + +#else /* CONFIG_NET_RX_BUSY_POLL */ + +static inline bool be_lock_napi(struct be_eq_obj *eqo) +{ + return true; +} + +static inline void be_unlock_napi(struct be_eq_obj *eqo) +{ +} + +static inline bool be_lock_busy_poll(struct be_eq_obj *eqo) +{ + return false; +} + +static inline void be_unlock_busy_poll(struct be_eq_obj *eqo) +{ +} + +static inline void be_enable_busy_poll(struct be_eq_obj *eqo) +{ +} + +static inline void be_disable_busy_poll(struct be_eq_obj *eqo) +{ +} +#endif /* CONFIG_NET_RX_BUSY_POLL */ + +void be_cq_notify(struct be_adapter *adapter, u16 qid, bool arm, + u16 num_popped); +void be_link_status_update(struct be_adapter *adapter, u8 link_status); +void be_parse_stats(struct be_adapter *adapter); +int be_load_fw(struct be_adapter *adapter, u8 *func); +bool be_is_wol_supported(struct be_adapter *adapter); +bool be_pause_supported(struct be_adapter *adapter); +u32 be_get_fw_log_level(struct be_adapter *adapter); + +static inline int fw_major_num(const char *fw_ver) +{ + int fw_major = 0; + + sscanf(fw_ver, "%d.", &fw_major); + + return fw_major; +} + +int be_update_queues(struct be_adapter *adapter); +int be_poll(struct napi_struct *napi, int budget); + +/* + * internal function to initialize-cleanup roce device. + */ +void be_roce_dev_add(struct be_adapter *); +void be_roce_dev_remove(struct be_adapter *); + +/* + * internal function to open-close roce device during ifup-ifdown. + */ +void be_roce_dev_open(struct be_adapter *); +void be_roce_dev_close(struct be_adapter *); +void be_roce_dev_shutdown(struct be_adapter *); + +#endif /* BE_H */ diff --git a/drivers/net/ethernet/emulex/benet/be_cmds.c b/drivers/net/ethernet/emulex/benet/be_cmds.c new file mode 100644 index 0000000..fead5c6 --- /dev/null +++ b/drivers/net/ethernet/emulex/benet/be_cmds.c @@ -0,0 +1,4162 @@ +/* + * Copyright (C) 2005 - 2014 Emulex + * All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. The full GNU General + * Public License is included in this distribution in the file called COPYING. + * + * Contact Information: + * linux-drivers@emulex.com + * + * Emulex + * 3333 Susan Street + * Costa Mesa, CA 92626 + */ + +#include +#include "be.h" +#include "be_cmds.h" + +static struct be_cmd_priv_map cmd_priv_map[] = { + { + OPCODE_ETH_ACPI_WOL_MAGIC_CONFIG, + CMD_SUBSYSTEM_ETH, + BE_PRIV_LNKMGMT | BE_PRIV_VHADM | + BE_PRIV_DEVCFG | BE_PRIV_DEVSEC + }, + { + OPCODE_COMMON_GET_FLOW_CONTROL, + CMD_SUBSYSTEM_COMMON, + BE_PRIV_LNKQUERY | BE_PRIV_VHADM | + BE_PRIV_DEVCFG | BE_PRIV_DEVSEC + }, + { + OPCODE_COMMON_SET_FLOW_CONTROL, + CMD_SUBSYSTEM_COMMON, + BE_PRIV_LNKMGMT | BE_PRIV_VHADM | + BE_PRIV_DEVCFG | BE_PRIV_DEVSEC + }, + { + OPCODE_ETH_GET_PPORT_STATS, + CMD_SUBSYSTEM_ETH, + BE_PRIV_LNKMGMT | BE_PRIV_VHADM | + BE_PRIV_DEVCFG | BE_PRIV_DEVSEC + }, + { + OPCODE_COMMON_GET_PHY_DETAILS, + CMD_SUBSYSTEM_COMMON, + BE_PRIV_LNKMGMT | BE_PRIV_VHADM | + BE_PRIV_DEVCFG | BE_PRIV_DEVSEC + } +}; + +static bool be_cmd_allowed(struct be_adapter *adapter, u8 opcode, u8 subsystem) +{ + int i; + int num_entries = sizeof(cmd_priv_map)/sizeof(struct be_cmd_priv_map); + u32 cmd_privileges = adapter->cmd_privileges; + + for (i = 0; i < num_entries; i++) + if (opcode == cmd_priv_map[i].opcode && + subsystem == cmd_priv_map[i].subsystem) + if (!(cmd_privileges & cmd_priv_map[i].priv_mask)) + return false; + + return true; +} + +static inline void *embedded_payload(struct be_mcc_wrb *wrb) +{ + return wrb->payload.embedded_payload; +} + +static void be_mcc_notify(struct be_adapter *adapter) +{ + struct be_queue_info *mccq = &adapter->mcc_obj.q; + u32 val = 0; + + if (be_error(adapter)) + return; + + val |= mccq->id & DB_MCCQ_RING_ID_MASK; + val |= 1 << DB_MCCQ_NUM_POSTED_SHIFT; + + wmb(); + iowrite32(val, adapter->db + DB_MCCQ_OFFSET); +} + +/* To check if valid bit is set, check the entire word as we don't know + * the endianness of the data (old entry is host endian while a new entry is + * little endian) */ +static inline bool be_mcc_compl_is_new(struct be_mcc_compl *compl) +{ + u32 flags; + + if (compl->flags != 0) { + flags = le32_to_cpu(compl->flags); + if (flags & CQE_FLAGS_VALID_MASK) { + compl->flags = flags; + return true; + } + } + return false; +} + +/* Need to reset the entire word that houses the valid bit */ +static inline void be_mcc_compl_use(struct be_mcc_compl *compl) +{ + compl->flags = 0; +} + +static struct be_cmd_resp_hdr *be_decode_resp_hdr(u32 tag0, u32 tag1) +{ + unsigned long addr; + + addr = tag1; + addr = ((addr << 16) << 16) | tag0; + return (void *)addr; +} + +static bool be_skip_err_log(u8 opcode, u16 base_status, u16 addl_status) +{ + if (base_status == MCC_STATUS_NOT_SUPPORTED || + base_status == MCC_STATUS_ILLEGAL_REQUEST || + addl_status == MCC_ADDL_STATUS_TOO_MANY_INTERFACES || + (opcode == OPCODE_COMMON_WRITE_FLASHROM && + (base_status == MCC_STATUS_ILLEGAL_FIELD || + addl_status == MCC_ADDL_STATUS_FLASH_IMAGE_CRC_MISMATCH))) + return true; + else + return false; +} + +/* Place holder for all the async MCC cmds wherein the caller is not in a busy + * loop (has not issued be_mcc_notify_wait()) + */ +static void be_async_cmd_process(struct be_adapter *adapter, + struct be_mcc_compl *compl, + struct be_cmd_resp_hdr *resp_hdr) +{ + enum mcc_base_status base_status = base_status(compl->status); + u8 opcode = 0, subsystem = 0; + + if (resp_hdr) { + opcode = resp_hdr->opcode; + subsystem = resp_hdr->subsystem; + } + + if (opcode == OPCODE_LOWLEVEL_LOOPBACK_TEST && + subsystem == CMD_SUBSYSTEM_LOWLEVEL) { + complete(&adapter->et_cmd_compl); + return; + } + + if ((opcode == OPCODE_COMMON_WRITE_FLASHROM || + opcode == OPCODE_COMMON_WRITE_OBJECT) && + subsystem == CMD_SUBSYSTEM_COMMON) { + adapter->flash_status = compl->status; + complete(&adapter->et_cmd_compl); + return; + } + + if ((opcode == OPCODE_ETH_GET_STATISTICS || + opcode == OPCODE_ETH_GET_PPORT_STATS) && + subsystem == CMD_SUBSYSTEM_ETH && + base_status == MCC_STATUS_SUCCESS) { + be_parse_stats(adapter); + adapter->stats_cmd_sent = false; + return; + } + + if (opcode == OPCODE_COMMON_GET_CNTL_ADDITIONAL_ATTRIBUTES && + subsystem == CMD_SUBSYSTEM_COMMON) { + if (base_status == MCC_STATUS_SUCCESS) { + struct be_cmd_resp_get_cntl_addnl_attribs *resp = + (void *)resp_hdr; + adapter->drv_stats.be_on_die_temperature = + resp->on_die_temperature; + } else { + adapter->be_get_temp_freq = 0; + } + return; + } +} + +static int be_mcc_compl_process(struct be_adapter *adapter, + struct be_mcc_compl *compl) +{ + enum mcc_base_status base_status; + enum mcc_addl_status addl_status; + struct be_cmd_resp_hdr *resp_hdr; + u8 opcode = 0, subsystem = 0; + + /* Just swap the status to host endian; mcc tag is opaquely copied + * from mcc_wrb */ + be_dws_le_to_cpu(compl, 4); + + base_status = base_status(compl->status); + addl_status = addl_status(compl->status); + + resp_hdr = be_decode_resp_hdr(compl->tag0, compl->tag1); + if (resp_hdr) { + opcode = resp_hdr->opcode; + subsystem = resp_hdr->subsystem; + } + + be_async_cmd_process(adapter, compl, resp_hdr); + + if (base_status != MCC_STATUS_SUCCESS && + !be_skip_err_log(opcode, base_status, addl_status)) { + if (base_status == MCC_STATUS_UNAUTHORIZED_REQUEST) { + dev_warn(&adapter->pdev->dev, + "VF is not privileged to issue opcode %d-%d\n", + opcode, subsystem); + } else { + dev_err(&adapter->pdev->dev, + "opcode %d-%d failed:status %d-%d\n", + opcode, subsystem, base_status, addl_status); + } + } + return compl->status; +} + +/* Link state evt is a string of bytes; no need for endian swapping */ +static void be_async_link_state_process(struct be_adapter *adapter, + struct be_mcc_compl *compl) +{ + struct be_async_event_link_state *evt = + (struct be_async_event_link_state *)compl; + + /* When link status changes, link speed must be re-queried from FW */ + adapter->phy.link_speed = -1; + + /* On BEx the FW does not send a separate link status + * notification for physical and logical link. + * On other chips just process the logical link + * status notification + */ + if (!BEx_chip(adapter) && + !(evt->port_link_status & LOGICAL_LINK_STATUS_MASK)) + return; + + /* For the initial link status do not rely on the ASYNC event as + * it may not be received in some cases. + */ + if (adapter->flags & BE_FLAGS_LINK_STATUS_INIT) + be_link_status_update(adapter, + evt->port_link_status & LINK_STATUS_MASK); +} + +/* Grp5 CoS Priority evt */ +static void be_async_grp5_cos_priority_process(struct be_adapter *adapter, + struct be_mcc_compl *compl) +{ + struct be_async_event_grp5_cos_priority *evt = + (struct be_async_event_grp5_cos_priority *)compl; + + if (evt->valid) { + adapter->vlan_prio_bmap = evt->available_priority_bmap; + adapter->recommended_prio &= ~VLAN_PRIO_MASK; + adapter->recommended_prio = + evt->reco_default_priority << VLAN_PRIO_SHIFT; + } +} + +/* Grp5 QOS Speed evt: qos_link_speed is in units of 10 Mbps */ +static void be_async_grp5_qos_speed_process(struct be_adapter *adapter, + struct be_mcc_compl *compl) +{ + struct be_async_event_grp5_qos_link_speed *evt = + (struct be_async_event_grp5_qos_link_speed *)compl; + + if (adapter->phy.link_speed >= 0 && + evt->physical_port == adapter->port_num) + adapter->phy.link_speed = le16_to_cpu(evt->qos_link_speed) * 10; +} + +/*Grp5 PVID evt*/ +static void be_async_grp5_pvid_state_process(struct be_adapter *adapter, + struct be_mcc_compl *compl) +{ + struct be_async_event_grp5_pvid_state *evt = + (struct be_async_event_grp5_pvid_state *)compl; + + if (evt->enabled) { + adapter->pvid = le16_to_cpu(evt->tag) & VLAN_VID_MASK; + dev_info(&adapter->pdev->dev, "LPVID: %d\n", adapter->pvid); + } else { + adapter->pvid = 0; + } +} + +static void be_async_grp5_evt_process(struct be_adapter *adapter, + struct be_mcc_compl *compl) +{ + u8 event_type = (compl->flags >> ASYNC_EVENT_TYPE_SHIFT) & + ASYNC_EVENT_TYPE_MASK; + + switch (event_type) { + case ASYNC_EVENT_COS_PRIORITY: + be_async_grp5_cos_priority_process(adapter, compl); + break; + case ASYNC_EVENT_QOS_SPEED: + be_async_grp5_qos_speed_process(adapter, compl); + break; + case ASYNC_EVENT_PVID_STATE: + be_async_grp5_pvid_state_process(adapter, compl); + break; + default: + break; + } +} + +static void be_async_dbg_evt_process(struct be_adapter *adapter, + struct be_mcc_compl *cmp) +{ + u8 event_type = 0; + struct be_async_event_qnq *evt = (struct be_async_event_qnq *)cmp; + + event_type = (cmp->flags >> ASYNC_EVENT_TYPE_SHIFT) & + ASYNC_EVENT_TYPE_MASK; + + switch (event_type) { + case ASYNC_DEBUG_EVENT_TYPE_QNQ: + if (evt->valid) + adapter->qnq_vid = le16_to_cpu(evt->vlan_tag); + adapter->flags |= BE_FLAGS_QNQ_ASYNC_EVT_RCVD; + break; + default: + dev_warn(&adapter->pdev->dev, "Unknown debug event 0x%x!\n", + event_type); + break; + } +} + +static inline bool is_link_state_evt(u32 flags) +{ + return ((flags >> ASYNC_EVENT_CODE_SHIFT) & ASYNC_EVENT_CODE_MASK) == + ASYNC_EVENT_CODE_LINK_STATE; +} + +static inline bool is_grp5_evt(u32 flags) +{ + return ((flags >> ASYNC_EVENT_CODE_SHIFT) & ASYNC_EVENT_CODE_MASK) == + ASYNC_EVENT_CODE_GRP_5; +} + +static inline bool is_dbg_evt(u32 flags) +{ + return ((flags >> ASYNC_EVENT_CODE_SHIFT) & ASYNC_EVENT_CODE_MASK) == + ASYNC_EVENT_CODE_QNQ; +} + +static void be_mcc_event_process(struct be_adapter *adapter, + struct be_mcc_compl *compl) +{ + if (is_link_state_evt(compl->flags)) + be_async_link_state_process(adapter, compl); + else if (is_grp5_evt(compl->flags)) + be_async_grp5_evt_process(adapter, compl); + else if (is_dbg_evt(compl->flags)) + be_async_dbg_evt_process(adapter, compl); +} + +static struct be_mcc_compl *be_mcc_compl_get(struct be_adapter *adapter) +{ + struct be_queue_info *mcc_cq = &adapter->mcc_obj.cq; + struct be_mcc_compl *compl = queue_tail_node(mcc_cq); + + if (be_mcc_compl_is_new(compl)) { + queue_tail_inc(mcc_cq); + return compl; + } + return NULL; +} + +void be_async_mcc_enable(struct be_adapter *adapter) +{ + spin_lock_bh(&adapter->mcc_cq_lock); + + be_cq_notify(adapter, adapter->mcc_obj.cq.id, true, 0); + adapter->mcc_obj.rearm_cq = true; + + spin_unlock_bh(&adapter->mcc_cq_lock); +} + +void be_async_mcc_disable(struct be_adapter *adapter) +{ + spin_lock_bh(&adapter->mcc_cq_lock); + + adapter->mcc_obj.rearm_cq = false; + be_cq_notify(adapter, adapter->mcc_obj.cq.id, false, 0); + + spin_unlock_bh(&adapter->mcc_cq_lock); +} + +int be_process_mcc(struct be_adapter *adapter) +{ + struct be_mcc_compl *compl; + int num = 0, status = 0; + struct be_mcc_obj *mcc_obj = &adapter->mcc_obj; + + spin_lock(&adapter->mcc_cq_lock); + + while ((compl = be_mcc_compl_get(adapter))) { + if (compl->flags & CQE_FLAGS_ASYNC_MASK) { + be_mcc_event_process(adapter, compl); + } else if (compl->flags & CQE_FLAGS_COMPLETED_MASK) { + status = be_mcc_compl_process(adapter, compl); + atomic_dec(&mcc_obj->q.used); + } + be_mcc_compl_use(compl); + num++; + } + + if (num) + be_cq_notify(adapter, mcc_obj->cq.id, mcc_obj->rearm_cq, num); + + spin_unlock(&adapter->mcc_cq_lock); + return status; +} + +/* Wait till no more pending mcc requests are present */ +static int be_mcc_wait_compl(struct be_adapter *adapter) +{ +#define mcc_timeout 120000 /* 12s timeout */ + int i, status = 0; + struct be_mcc_obj *mcc_obj = &adapter->mcc_obj; + + for (i = 0; i < mcc_timeout; i++) { + if (be_error(adapter)) + return -EIO; + + local_bh_disable(); + status = be_process_mcc(adapter); + local_bh_enable(); + + if (atomic_read(&mcc_obj->q.used) == 0) + break; + udelay(100); + } + if (i == mcc_timeout) { + dev_err(&adapter->pdev->dev, "FW not responding\n"); + adapter->fw_timeout = true; + return -EIO; + } + return status; +} + +/* Notify MCC requests and wait for completion */ +static int be_mcc_notify_wait(struct be_adapter *adapter) +{ + int status; + struct be_mcc_wrb *wrb; + struct be_mcc_obj *mcc_obj = &adapter->mcc_obj; + u16 index = mcc_obj->q.head; + struct be_cmd_resp_hdr *resp; + + index_dec(&index, mcc_obj->q.len); + wrb = queue_index_node(&mcc_obj->q, index); + + resp = be_decode_resp_hdr(wrb->tag0, wrb->tag1); + + be_mcc_notify(adapter); + + status = be_mcc_wait_compl(adapter); + if (status == -EIO) + goto out; + + status = (resp->base_status | + ((resp->addl_status & CQE_ADDL_STATUS_MASK) << + CQE_ADDL_STATUS_SHIFT)); +out: + return status; +} + +static int be_mbox_db_ready_wait(struct be_adapter *adapter, void __iomem *db) +{ + int msecs = 0; + u32 ready; + + do { + if (be_error(adapter)) + return -EIO; + + ready = ioread32(db); + if (ready == 0xffffffff) + return -1; + + ready &= MPU_MAILBOX_DB_RDY_MASK; + if (ready) + break; + + if (msecs > 4000) { + dev_err(&adapter->pdev->dev, "FW not responding\n"); + adapter->fw_timeout = true; + be_detect_error(adapter); + return -1; + } + + msleep(1); + msecs++; + } while (true); + + return 0; +} + +/* + * Insert the mailbox address into the doorbell in two steps + * Polls on the mbox doorbell till a command completion (or a timeout) occurs + */ +static int be_mbox_notify_wait(struct be_adapter *adapter) +{ + int status; + u32 val = 0; + void __iomem *db = adapter->db + MPU_MAILBOX_DB_OFFSET; + struct be_dma_mem *mbox_mem = &adapter->mbox_mem; + struct be_mcc_mailbox *mbox = mbox_mem->va; + struct be_mcc_compl *compl = &mbox->compl; + + /* wait for ready to be set */ + status = be_mbox_db_ready_wait(adapter, db); + if (status != 0) + return status; + + val |= MPU_MAILBOX_DB_HI_MASK; + /* at bits 2 - 31 place mbox dma addr msb bits 34 - 63 */ + val |= (upper_32_bits(mbox_mem->dma) >> 2) << 2; + iowrite32(val, db); + + /* wait for ready to be set */ + status = be_mbox_db_ready_wait(adapter, db); + if (status != 0) + return status; + + val = 0; + /* at bits 2 - 31 place mbox dma addr lsb bits 4 - 33 */ + val |= (u32)(mbox_mem->dma >> 4) << 2; + iowrite32(val, db); + + status = be_mbox_db_ready_wait(adapter, db); + if (status != 0) + return status; + + /* A cq entry has been made now */ + if (be_mcc_compl_is_new(compl)) { + status = be_mcc_compl_process(adapter, &mbox->compl); + be_mcc_compl_use(compl); + if (status) + return status; + } else { + dev_err(&adapter->pdev->dev, "invalid mailbox completion\n"); + return -1; + } + return 0; +} + +static u16 be_POST_stage_get(struct be_adapter *adapter) +{ + u32 sem; + + if (BEx_chip(adapter)) + sem = ioread32(adapter->csr + SLIPORT_SEMAPHORE_OFFSET_BEx); + else + pci_read_config_dword(adapter->pdev, + SLIPORT_SEMAPHORE_OFFSET_SH, &sem); + + return sem & POST_STAGE_MASK; +} + +static int lancer_wait_ready(struct be_adapter *adapter) +{ +#define SLIPORT_READY_TIMEOUT 30 + u32 sliport_status; + int status = 0, i; + + for (i = 0; i < SLIPORT_READY_TIMEOUT; i++) { + sliport_status = ioread32(adapter->db + SLIPORT_STATUS_OFFSET); + if (sliport_status & SLIPORT_STATUS_RDY_MASK) + break; + + msleep(1000); + } + + if (i == SLIPORT_READY_TIMEOUT) + status = -1; + + return status; +} + +static bool lancer_provisioning_error(struct be_adapter *adapter) +{ + u32 sliport_status = 0, sliport_err1 = 0, sliport_err2 = 0; + + sliport_status = ioread32(adapter->db + SLIPORT_STATUS_OFFSET); + if (sliport_status & SLIPORT_STATUS_ERR_MASK) { + sliport_err1 = ioread32(adapter->db + SLIPORT_ERROR1_OFFSET); + sliport_err2 = ioread32(adapter->db + SLIPORT_ERROR2_OFFSET); + + if (sliport_err1 == SLIPORT_ERROR_NO_RESOURCE1 && + sliport_err2 == SLIPORT_ERROR_NO_RESOURCE2) + return true; + } + return false; +} + +int lancer_test_and_set_rdy_state(struct be_adapter *adapter) +{ + int status; + u32 sliport_status, err, reset_needed; + bool resource_error; + + resource_error = lancer_provisioning_error(adapter); + if (resource_error) + return -EAGAIN; + + status = lancer_wait_ready(adapter); + if (!status) { + sliport_status = ioread32(adapter->db + SLIPORT_STATUS_OFFSET); + err = sliport_status & SLIPORT_STATUS_ERR_MASK; + reset_needed = sliport_status & SLIPORT_STATUS_RN_MASK; + if (err && reset_needed) { + iowrite32(SLI_PORT_CONTROL_IP_MASK, + adapter->db + SLIPORT_CONTROL_OFFSET); + + /* check adapter has corrected the error */ + status = lancer_wait_ready(adapter); + sliport_status = ioread32(adapter->db + + SLIPORT_STATUS_OFFSET); + sliport_status &= (SLIPORT_STATUS_ERR_MASK | + SLIPORT_STATUS_RN_MASK); + if (status || sliport_status) + status = -1; + } else if (err || reset_needed) { + status = -1; + } + } + /* Stop error recovery if error is not recoverable. + * No resource error is temporary errors and will go away + * when PF provisions resources. + */ + resource_error = lancer_provisioning_error(adapter); + if (resource_error) + status = -EAGAIN; + + return status; +} + +int be_fw_wait_ready(struct be_adapter *adapter) +{ + u16 stage; + int status, timeout = 0; + struct device *dev = &adapter->pdev->dev; + + if (lancer_chip(adapter)) { + status = lancer_wait_ready(adapter); + return status; + } + + do { + stage = be_POST_stage_get(adapter); + if (stage == POST_STAGE_ARMFW_RDY) + return 0; + + dev_info(dev, "Waiting for POST, %ds elapsed\n", timeout); + if (msleep_interruptible(2000)) { + dev_err(dev, "Waiting for POST aborted\n"); + return -EINTR; + } + timeout += 2; + } while (timeout < 60); + + dev_err(dev, "POST timeout; stage=0x%x\n", stage); + return -1; +} + +static inline struct be_sge *nonembedded_sgl(struct be_mcc_wrb *wrb) +{ + return &wrb->payload.sgl[0]; +} + +static inline void fill_wrb_tags(struct be_mcc_wrb *wrb, unsigned long addr) +{ + wrb->tag0 = addr & 0xFFFFFFFF; + wrb->tag1 = upper_32_bits(addr); +} + +/* Don't touch the hdr after it's prepared */ +/* mem will be NULL for embedded commands */ +static void be_wrb_cmd_hdr_prepare(struct be_cmd_req_hdr *req_hdr, + u8 subsystem, u8 opcode, int cmd_len, + struct be_mcc_wrb *wrb, + struct be_dma_mem *mem) +{ + struct be_sge *sge; + + req_hdr->opcode = opcode; + req_hdr->subsystem = subsystem; + req_hdr->request_length = cpu_to_le32(cmd_len - sizeof(*req_hdr)); + req_hdr->version = 0; + fill_wrb_tags(wrb, (ulong) req_hdr); + wrb->payload_length = cmd_len; + if (mem) { + wrb->embedded |= (1 & MCC_WRB_SGE_CNT_MASK) << + MCC_WRB_SGE_CNT_SHIFT; + sge = nonembedded_sgl(wrb); + sge->pa_hi = cpu_to_le32(upper_32_bits(mem->dma)); + sge->pa_lo = cpu_to_le32(mem->dma & 0xFFFFFFFF); + sge->len = cpu_to_le32(mem->size); + } else + wrb->embedded |= MCC_WRB_EMBEDDED_MASK; + be_dws_cpu_to_le(wrb, 8); +} + +static void be_cmd_page_addrs_prepare(struct phys_addr *pages, u32 max_pages, + struct be_dma_mem *mem) +{ + int i, buf_pages = min(PAGES_4K_SPANNED(mem->va, mem->size), max_pages); + u64 dma = (u64)mem->dma; + + for (i = 0; i < buf_pages; i++) { + pages[i].lo = cpu_to_le32(dma & 0xFFFFFFFF); + pages[i].hi = cpu_to_le32(upper_32_bits(dma)); + dma += PAGE_SIZE_4K; + } +} + +static inline struct be_mcc_wrb *wrb_from_mbox(struct be_adapter *adapter) +{ + struct be_dma_mem *mbox_mem = &adapter->mbox_mem; + struct be_mcc_wrb *wrb + = &((struct be_mcc_mailbox *)(mbox_mem->va))->wrb; + memset(wrb, 0, sizeof(*wrb)); + return wrb; +} + +static struct be_mcc_wrb *wrb_from_mccq(struct be_adapter *adapter) +{ + struct be_queue_info *mccq = &adapter->mcc_obj.q; + struct be_mcc_wrb *wrb; + + if (!mccq->created) + return NULL; + + if (atomic_read(&mccq->used) >= mccq->len) + return NULL; + + wrb = queue_head_node(mccq); + queue_head_inc(mccq); + atomic_inc(&mccq->used); + memset(wrb, 0, sizeof(*wrb)); + return wrb; +} + +static bool use_mcc(struct be_adapter *adapter) +{ + return adapter->mcc_obj.q.created; +} + +/* Must be used only in process context */ +static int be_cmd_lock(struct be_adapter *adapter) +{ + if (use_mcc(adapter)) { + spin_lock_bh(&adapter->mcc_lock); + return 0; + } else { + return mutex_lock_interruptible(&adapter->mbox_lock); + } +} + +/* Must be used only in process context */ +static void be_cmd_unlock(struct be_adapter *adapter) +{ + if (use_mcc(adapter)) + spin_unlock_bh(&adapter->mcc_lock); + else + return mutex_unlock(&adapter->mbox_lock); +} + +static struct be_mcc_wrb *be_cmd_copy(struct be_adapter *adapter, + struct be_mcc_wrb *wrb) +{ + struct be_mcc_wrb *dest_wrb; + + if (use_mcc(adapter)) { + dest_wrb = wrb_from_mccq(adapter); + if (!dest_wrb) + return NULL; + } else { + dest_wrb = wrb_from_mbox(adapter); + } + + memcpy(dest_wrb, wrb, sizeof(*wrb)); + if (wrb->embedded & cpu_to_le32(MCC_WRB_EMBEDDED_MASK)) + fill_wrb_tags(dest_wrb, (ulong) embedded_payload(wrb)); + + return dest_wrb; +} + +/* Must be used only in process context */ +static int be_cmd_notify_wait(struct be_adapter *adapter, + struct be_mcc_wrb *wrb) +{ + struct be_mcc_wrb *dest_wrb; + int status; + + status = be_cmd_lock(adapter); + if (status) + return status; + + dest_wrb = be_cmd_copy(adapter, wrb); + if (!dest_wrb) + return -EBUSY; + + if (use_mcc(adapter)) + status = be_mcc_notify_wait(adapter); + else + status = be_mbox_notify_wait(adapter); + + if (!status) + memcpy(wrb, dest_wrb, sizeof(*wrb)); + + be_cmd_unlock(adapter); + return status; +} + +/* Tell fw we're about to start firing cmds by writing a + * special pattern across the wrb hdr; uses mbox + */ +int be_cmd_fw_init(struct be_adapter *adapter) +{ + u8 *wrb; + int status; + + if (lancer_chip(adapter)) + return 0; + + if (mutex_lock_interruptible(&adapter->mbox_lock)) + return -1; + + wrb = (u8 *)wrb_from_mbox(adapter); + *wrb++ = 0xFF; + *wrb++ = 0x12; + *wrb++ = 0x34; + *wrb++ = 0xFF; + *wrb++ = 0xFF; + *wrb++ = 0x56; + *wrb++ = 0x78; + *wrb = 0xFF; + + status = be_mbox_notify_wait(adapter); + + mutex_unlock(&adapter->mbox_lock); + return status; +} + +/* Tell fw we're done with firing cmds by writing a + * special pattern across the wrb hdr; uses mbox + */ +int be_cmd_fw_clean(struct be_adapter *adapter) +{ + u8 *wrb; + int status; + + if (lancer_chip(adapter)) + return 0; + + if (mutex_lock_interruptible(&adapter->mbox_lock)) + return -1; + + wrb = (u8 *)wrb_from_mbox(adapter); + *wrb++ = 0xFF; + *wrb++ = 0xAA; + *wrb++ = 0xBB; + *wrb++ = 0xFF; + *wrb++ = 0xFF; + *wrb++ = 0xCC; + *wrb++ = 0xDD; + *wrb = 0xFF; + + status = be_mbox_notify_wait(adapter); + + mutex_unlock(&adapter->mbox_lock); + return status; +} + +int be_cmd_eq_create(struct be_adapter *adapter, struct be_eq_obj *eqo) +{ + struct be_mcc_wrb *wrb; + struct be_cmd_req_eq_create *req; + struct be_dma_mem *q_mem = &eqo->q.dma_mem; + int status, ver = 0; + + if (mutex_lock_interruptible(&adapter->mbox_lock)) + return -1; + + wrb = wrb_from_mbox(adapter); + req = embedded_payload(wrb); + + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON, + OPCODE_COMMON_EQ_CREATE, sizeof(*req), wrb, + NULL); + + /* Support for EQ_CREATEv2 available only SH-R onwards */ + if (!(BEx_chip(adapter) || lancer_chip(adapter))) + ver = 2; + + req->hdr.version = ver; + req->num_pages = cpu_to_le16(PAGES_4K_SPANNED(q_mem->va, q_mem->size)); + + AMAP_SET_BITS(struct amap_eq_context, valid, req->context, 1); + /* 4byte eqe*/ + AMAP_SET_BITS(struct amap_eq_context, size, req->context, 0); + AMAP_SET_BITS(struct amap_eq_context, count, req->context, + __ilog2_u32(eqo->q.len / 256)); + be_dws_cpu_to_le(req->context, sizeof(req->context)); + + be_cmd_page_addrs_prepare(req->pages, ARRAY_SIZE(req->pages), q_mem); + + status = be_mbox_notify_wait(adapter); + if (!status) { + struct be_cmd_resp_eq_create *resp = embedded_payload(wrb); + + eqo->q.id = le16_to_cpu(resp->eq_id); + eqo->msix_idx = + (ver == 2) ? le16_to_cpu(resp->msix_idx) : eqo->idx; + eqo->q.created = true; + } + + mutex_unlock(&adapter->mbox_lock); + return status; +} + +/* Use MCC */ +int be_cmd_mac_addr_query(struct be_adapter *adapter, u8 *mac_addr, + bool permanent, u32 if_handle, u32 pmac_id) +{ + struct be_mcc_wrb *wrb; + struct be_cmd_req_mac_query *req; + int status; + + spin_lock_bh(&adapter->mcc_lock); + + wrb = wrb_from_mccq(adapter); + if (!wrb) { + status = -EBUSY; + goto err; + } + req = embedded_payload(wrb); + + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON, + OPCODE_COMMON_NTWK_MAC_QUERY, sizeof(*req), wrb, + NULL); + req->type = MAC_ADDRESS_TYPE_NETWORK; + if (permanent) { + req->permanent = 1; + } else { + req->if_id = cpu_to_le16((u16)if_handle); + req->pmac_id = cpu_to_le32(pmac_id); + req->permanent = 0; + } + + status = be_mcc_notify_wait(adapter); + if (!status) { + struct be_cmd_resp_mac_query *resp = embedded_payload(wrb); + + memcpy(mac_addr, resp->mac.addr, ETH_ALEN); + } + +err: + spin_unlock_bh(&adapter->mcc_lock); + return status; +} + +/* Uses synchronous MCCQ */ +int be_cmd_pmac_add(struct be_adapter *adapter, u8 *mac_addr, + u32 if_id, u32 *pmac_id, u32 domain) +{ + struct be_mcc_wrb *wrb; + struct be_cmd_req_pmac_add *req; + int status; + + spin_lock_bh(&adapter->mcc_lock); + + wrb = wrb_from_mccq(adapter); + if (!wrb) { + status = -EBUSY; + goto err; + } + req = embedded_payload(wrb); + + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON, + OPCODE_COMMON_NTWK_PMAC_ADD, sizeof(*req), wrb, + NULL); + + req->hdr.domain = domain; + req->if_id = cpu_to_le32(if_id); + memcpy(req->mac_address, mac_addr, ETH_ALEN); + + status = be_mcc_notify_wait(adapter); + if (!status) { + struct be_cmd_resp_pmac_add *resp = embedded_payload(wrb); + + *pmac_id = le32_to_cpu(resp->pmac_id); + } + +err: + spin_unlock_bh(&adapter->mcc_lock); + + if (status == MCC_STATUS_UNAUTHORIZED_REQUEST) + status = -EPERM; + + return status; +} + +/* Uses synchronous MCCQ */ +int be_cmd_pmac_del(struct be_adapter *adapter, u32 if_id, int pmac_id, u32 dom) +{ + struct be_mcc_wrb *wrb; + struct be_cmd_req_pmac_del *req; + int status; + + if (pmac_id == -1) + return 0; + + spin_lock_bh(&adapter->mcc_lock); + + wrb = wrb_from_mccq(adapter); + if (!wrb) { + status = -EBUSY; + goto err; + } + req = embedded_payload(wrb); + + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON, + OPCODE_COMMON_NTWK_PMAC_DEL, sizeof(*req), + wrb, NULL); + + req->hdr.domain = dom; + req->if_id = cpu_to_le32(if_id); + req->pmac_id = cpu_to_le32(pmac_id); + + status = be_mcc_notify_wait(adapter); + +err: + spin_unlock_bh(&adapter->mcc_lock); + return status; +} + +/* Uses Mbox */ +int be_cmd_cq_create(struct be_adapter *adapter, struct be_queue_info *cq, + struct be_queue_info *eq, bool no_delay, int coalesce_wm) +{ + struct be_mcc_wrb *wrb; + struct be_cmd_req_cq_create *req; + struct be_dma_mem *q_mem = &cq->dma_mem; + void *ctxt; + int status; + + if (mutex_lock_interruptible(&adapter->mbox_lock)) + return -1; + + wrb = wrb_from_mbox(adapter); + req = embedded_payload(wrb); + ctxt = &req->context; + + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON, + OPCODE_COMMON_CQ_CREATE, sizeof(*req), wrb, + NULL); + + req->num_pages = cpu_to_le16(PAGES_4K_SPANNED(q_mem->va, q_mem->size)); + + if (BEx_chip(adapter)) { + AMAP_SET_BITS(struct amap_cq_context_be, coalescwm, ctxt, + coalesce_wm); + AMAP_SET_BITS(struct amap_cq_context_be, nodelay, + ctxt, no_delay); + AMAP_SET_BITS(struct amap_cq_context_be, count, ctxt, + __ilog2_u32(cq->len / 256)); + AMAP_SET_BITS(struct amap_cq_context_be, valid, ctxt, 1); + AMAP_SET_BITS(struct amap_cq_context_be, eventable, ctxt, 1); + AMAP_SET_BITS(struct amap_cq_context_be, eqid, ctxt, eq->id); + } else { + req->hdr.version = 2; + req->page_size = 1; /* 1 for 4K */ + + /* coalesce-wm field in this cmd is not relevant to Lancer. + * Lancer uses COMMON_MODIFY_CQ to set this field + */ + if (!lancer_chip(adapter)) + AMAP_SET_BITS(struct amap_cq_context_v2, coalescwm, + ctxt, coalesce_wm); + AMAP_SET_BITS(struct amap_cq_context_v2, nodelay, ctxt, + no_delay); + AMAP_SET_BITS(struct amap_cq_context_v2, count, ctxt, + __ilog2_u32(cq->len / 256)); + AMAP_SET_BITS(struct amap_cq_context_v2, valid, ctxt, 1); + AMAP_SET_BITS(struct amap_cq_context_v2, eventable, ctxt, 1); + AMAP_SET_BITS(struct amap_cq_context_v2, eqid, ctxt, eq->id); + } + + be_dws_cpu_to_le(ctxt, sizeof(req->context)); + + be_cmd_page_addrs_prepare(req->pages, ARRAY_SIZE(req->pages), q_mem); + + status = be_mbox_notify_wait(adapter); + if (!status) { + struct be_cmd_resp_cq_create *resp = embedded_payload(wrb); + + cq->id = le16_to_cpu(resp->cq_id); + cq->created = true; + } + + mutex_unlock(&adapter->mbox_lock); + + return status; +} + +static u32 be_encoded_q_len(int q_len) +{ + u32 len_encoded = fls(q_len); /* log2(len) + 1 */ + + if (len_encoded == 16) + len_encoded = 0; + return len_encoded; +} + +static int be_cmd_mccq_ext_create(struct be_adapter *adapter, + struct be_queue_info *mccq, + struct be_queue_info *cq) +{ + struct be_mcc_wrb *wrb; + struct be_cmd_req_mcc_ext_create *req; + struct be_dma_mem *q_mem = &mccq->dma_mem; + void *ctxt; + int status; + + if (mutex_lock_interruptible(&adapter->mbox_lock)) + return -1; + + wrb = wrb_from_mbox(adapter); + req = embedded_payload(wrb); + ctxt = &req->context; + + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON, + OPCODE_COMMON_MCC_CREATE_EXT, sizeof(*req), wrb, + NULL); + + req->num_pages = cpu_to_le16(PAGES_4K_SPANNED(q_mem->va, q_mem->size)); + if (BEx_chip(adapter)) { + AMAP_SET_BITS(struct amap_mcc_context_be, valid, ctxt, 1); + AMAP_SET_BITS(struct amap_mcc_context_be, ring_size, ctxt, + be_encoded_q_len(mccq->len)); + AMAP_SET_BITS(struct amap_mcc_context_be, cq_id, ctxt, cq->id); + } else { + req->hdr.version = 1; + req->cq_id = cpu_to_le16(cq->id); + + AMAP_SET_BITS(struct amap_mcc_context_v1, ring_size, ctxt, + be_encoded_q_len(mccq->len)); + AMAP_SET_BITS(struct amap_mcc_context_v1, valid, ctxt, 1); + AMAP_SET_BITS(struct amap_mcc_context_v1, async_cq_id, + ctxt, cq->id); + AMAP_SET_BITS(struct amap_mcc_context_v1, async_cq_valid, + ctxt, 1); + } + + /* Subscribe to Link State and Group 5 Events(bits 1 and 5 set) */ + req->async_event_bitmap[0] = cpu_to_le32(0x00000022); + req->async_event_bitmap[0] |= cpu_to_le32(1 << ASYNC_EVENT_CODE_QNQ); + be_dws_cpu_to_le(ctxt, sizeof(req->context)); + + be_cmd_page_addrs_prepare(req->pages, ARRAY_SIZE(req->pages), q_mem); + + status = be_mbox_notify_wait(adapter); + if (!status) { + struct be_cmd_resp_mcc_create *resp = embedded_payload(wrb); + + mccq->id = le16_to_cpu(resp->id); + mccq->created = true; + } + mutex_unlock(&adapter->mbox_lock); + + return status; +} + +static int be_cmd_mccq_org_create(struct be_adapter *adapter, + struct be_queue_info *mccq, + struct be_queue_info *cq) +{ + struct be_mcc_wrb *wrb; + struct be_cmd_req_mcc_create *req; + struct be_dma_mem *q_mem = &mccq->dma_mem; + void *ctxt; + int status; + + if (mutex_lock_interruptible(&adapter->mbox_lock)) + return -1; + + wrb = wrb_from_mbox(adapter); + req = embedded_payload(wrb); + ctxt = &req->context; + + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON, + OPCODE_COMMON_MCC_CREATE, sizeof(*req), wrb, + NULL); + + req->num_pages = cpu_to_le16(PAGES_4K_SPANNED(q_mem->va, q_mem->size)); + + AMAP_SET_BITS(struct amap_mcc_context_be, valid, ctxt, 1); + AMAP_SET_BITS(struct amap_mcc_context_be, ring_size, ctxt, + be_encoded_q_len(mccq->len)); + AMAP_SET_BITS(struct amap_mcc_context_be, cq_id, ctxt, cq->id); + + be_dws_cpu_to_le(ctxt, sizeof(req->context)); + + be_cmd_page_addrs_prepare(req->pages, ARRAY_SIZE(req->pages), q_mem); + + status = be_mbox_notify_wait(adapter); + if (!status) { + struct be_cmd_resp_mcc_create *resp = embedded_payload(wrb); + + mccq->id = le16_to_cpu(resp->id); + mccq->created = true; + } + + mutex_unlock(&adapter->mbox_lock); + return status; +} + +int be_cmd_mccq_create(struct be_adapter *adapter, + struct be_queue_info *mccq, struct be_queue_info *cq) +{ + int status; + + status = be_cmd_mccq_ext_create(adapter, mccq, cq); + if (status && BEx_chip(adapter)) { + dev_warn(&adapter->pdev->dev, "Upgrade to F/W ver 2.102.235.0 " + "or newer to avoid conflicting priorities between NIC " + "and FCoE traffic"); + status = be_cmd_mccq_org_create(adapter, mccq, cq); + } + return status; +} + +int be_cmd_txq_create(struct be_adapter *adapter, struct be_tx_obj *txo) +{ + struct be_mcc_wrb wrb = {0}; + struct be_cmd_req_eth_tx_create *req; + struct be_queue_info *txq = &txo->q; + struct be_queue_info *cq = &txo->cq; + struct be_dma_mem *q_mem = &txq->dma_mem; + int status, ver = 0; + + req = embedded_payload(&wrb); + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_ETH, + OPCODE_ETH_TX_CREATE, sizeof(*req), &wrb, NULL); + + if (lancer_chip(adapter)) { + req->hdr.version = 1; + } else if (BEx_chip(adapter)) { + if (adapter->function_caps & BE_FUNCTION_CAPS_SUPER_NIC) + req->hdr.version = 2; + } else { /* For SH */ + req->hdr.version = 2; + } + + if (req->hdr.version > 0) + req->if_id = cpu_to_le16(adapter->if_handle); + req->num_pages = PAGES_4K_SPANNED(q_mem->va, q_mem->size); + req->ulp_num = BE_ULP1_NUM; + req->type = BE_ETH_TX_RING_TYPE_STANDARD; + req->cq_id = cpu_to_le16(cq->id); + req->queue_size = be_encoded_q_len(txq->len); + be_cmd_page_addrs_prepare(req->pages, ARRAY_SIZE(req->pages), q_mem); + ver = req->hdr.version; + + status = be_cmd_notify_wait(adapter, &wrb); + if (!status) { + struct be_cmd_resp_eth_tx_create *resp = embedded_payload(&wrb); + + txq->id = le16_to_cpu(resp->cid); + if (ver == 2) + txo->db_offset = le32_to_cpu(resp->db_offset); + else + txo->db_offset = DB_TXULP1_OFFSET; + txq->created = true; + } + + return status; +} + +/* Uses MCC */ +int be_cmd_rxq_create(struct be_adapter *adapter, + struct be_queue_info *rxq, u16 cq_id, u16 frag_size, + u32 if_id, u32 rss, u8 *rss_id) +{ + struct be_mcc_wrb *wrb; + struct be_cmd_req_eth_rx_create *req; + struct be_dma_mem *q_mem = &rxq->dma_mem; + int status; + + spin_lock_bh(&adapter->mcc_lock); + + wrb = wrb_from_mccq(adapter); + if (!wrb) { + status = -EBUSY; + goto err; + } + req = embedded_payload(wrb); + + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_ETH, + OPCODE_ETH_RX_CREATE, sizeof(*req), wrb, NULL); + + req->cq_id = cpu_to_le16(cq_id); + req->frag_size = fls(frag_size) - 1; + req->num_pages = 2; + be_cmd_page_addrs_prepare(req->pages, ARRAY_SIZE(req->pages), q_mem); + req->interface_id = cpu_to_le32(if_id); + req->max_frame_size = cpu_to_le16(BE_MAX_JUMBO_FRAME_SIZE); + req->rss_queue = cpu_to_le32(rss); + + status = be_mcc_notify_wait(adapter); + if (!status) { + struct be_cmd_resp_eth_rx_create *resp = embedded_payload(wrb); + + rxq->id = le16_to_cpu(resp->id); + rxq->created = true; + *rss_id = resp->rss_id; + } + +err: + spin_unlock_bh(&adapter->mcc_lock); + return status; +} + +/* Generic destroyer function for all types of queues + * Uses Mbox + */ +int be_cmd_q_destroy(struct be_adapter *adapter, struct be_queue_info *q, + int queue_type) +{ + struct be_mcc_wrb *wrb; + struct be_cmd_req_q_destroy *req; + u8 subsys = 0, opcode = 0; + int status; + + if (mutex_lock_interruptible(&adapter->mbox_lock)) + return -1; + + wrb = wrb_from_mbox(adapter); + req = embedded_payload(wrb); + + switch (queue_type) { + case QTYPE_EQ: + subsys = CMD_SUBSYSTEM_COMMON; + opcode = OPCODE_COMMON_EQ_DESTROY; + break; + case QTYPE_CQ: + subsys = CMD_SUBSYSTEM_COMMON; + opcode = OPCODE_COMMON_CQ_DESTROY; + break; + case QTYPE_TXQ: + subsys = CMD_SUBSYSTEM_ETH; + opcode = OPCODE_ETH_TX_DESTROY; + break; + case QTYPE_RXQ: + subsys = CMD_SUBSYSTEM_ETH; + opcode = OPCODE_ETH_RX_DESTROY; + break; + case QTYPE_MCCQ: + subsys = CMD_SUBSYSTEM_COMMON; + opcode = OPCODE_COMMON_MCC_DESTROY; + break; + default: + BUG(); + } + + be_wrb_cmd_hdr_prepare(&req->hdr, subsys, opcode, sizeof(*req), wrb, + NULL); + req->id = cpu_to_le16(q->id); + + status = be_mbox_notify_wait(adapter); + q->created = false; + + mutex_unlock(&adapter->mbox_lock); + return status; +} + +/* Uses MCC */ +int be_cmd_rxq_destroy(struct be_adapter *adapter, struct be_queue_info *q) +{ + struct be_mcc_wrb *wrb; + struct be_cmd_req_q_destroy *req; + int status; + + spin_lock_bh(&adapter->mcc_lock); + + wrb = wrb_from_mccq(adapter); + if (!wrb) { + status = -EBUSY; + goto err; + } + req = embedded_payload(wrb); + + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_ETH, + OPCODE_ETH_RX_DESTROY, sizeof(*req), wrb, NULL); + req->id = cpu_to_le16(q->id); + + status = be_mcc_notify_wait(adapter); + q->created = false; + +err: + spin_unlock_bh(&adapter->mcc_lock); + return status; +} + +/* Create an rx filtering policy configuration on an i/f + * Will use MBOX only if MCCQ has not been created. + */ +int be_cmd_if_create(struct be_adapter *adapter, u32 cap_flags, u32 en_flags, + u32 *if_handle, u32 domain) +{ + struct be_mcc_wrb wrb = {0}; + struct be_cmd_req_if_create *req; + int status; + + req = embedded_payload(&wrb); + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON, + OPCODE_COMMON_NTWK_INTERFACE_CREATE, + sizeof(*req), &wrb, NULL); + req->hdr.domain = domain; + req->capability_flags = cpu_to_le32(cap_flags); + req->enable_flags = cpu_to_le32(en_flags); + req->pmac_invalid = true; + + status = be_cmd_notify_wait(adapter, &wrb); + if (!status) { + struct be_cmd_resp_if_create *resp = embedded_payload(&wrb); + + *if_handle = le32_to_cpu(resp->interface_id); + + /* Hack to retrieve VF's pmac-id on BE3 */ + if (BE3_chip(adapter) && !be_physfn(adapter)) + adapter->pmac_id[0] = le32_to_cpu(resp->pmac_id); + } + return status; +} + +/* Uses MCCQ */ +int be_cmd_if_destroy(struct be_adapter *adapter, int interface_id, u32 domain) +{ + struct be_mcc_wrb *wrb; + struct be_cmd_req_if_destroy *req; + int status; + + if (interface_id == -1) + return 0; + + spin_lock_bh(&adapter->mcc_lock); + + wrb = wrb_from_mccq(adapter); + if (!wrb) { + status = -EBUSY; + goto err; + } + req = embedded_payload(wrb); + + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON, + OPCODE_COMMON_NTWK_INTERFACE_DESTROY, + sizeof(*req), wrb, NULL); + req->hdr.domain = domain; + req->interface_id = cpu_to_le32(interface_id); + + status = be_mcc_notify_wait(adapter); +err: + spin_unlock_bh(&adapter->mcc_lock); + return status; +} + +/* Get stats is a non embedded command: the request is not embedded inside + * WRB but is a separate dma memory block + * Uses asynchronous MCC + */ +int be_cmd_get_stats(struct be_adapter *adapter, struct be_dma_mem *nonemb_cmd) +{ + struct be_mcc_wrb *wrb; + struct be_cmd_req_hdr *hdr; + int status = 0; + + spin_lock_bh(&adapter->mcc_lock); + + wrb = wrb_from_mccq(adapter); + if (!wrb) { + status = -EBUSY; + goto err; + } + hdr = nonemb_cmd->va; + + be_wrb_cmd_hdr_prepare(hdr, CMD_SUBSYSTEM_ETH, + OPCODE_ETH_GET_STATISTICS, nonemb_cmd->size, wrb, + nonemb_cmd); + + /* version 1 of the cmd is not supported only by BE2 */ + if (BE2_chip(adapter)) + hdr->version = 0; + if (BE3_chip(adapter) || lancer_chip(adapter)) + hdr->version = 1; + else + hdr->version = 2; + + be_mcc_notify(adapter); + adapter->stats_cmd_sent = true; + +err: + spin_unlock_bh(&adapter->mcc_lock); + return status; +} + +/* Lancer Stats */ +int lancer_cmd_get_pport_stats(struct be_adapter *adapter, + struct be_dma_mem *nonemb_cmd) +{ + struct be_mcc_wrb *wrb; + struct lancer_cmd_req_pport_stats *req; + int status = 0; + + if (!be_cmd_allowed(adapter, OPCODE_ETH_GET_PPORT_STATS, + CMD_SUBSYSTEM_ETH)) + return -EPERM; + + spin_lock_bh(&adapter->mcc_lock); + + wrb = wrb_from_mccq(adapter); + if (!wrb) { + status = -EBUSY; + goto err; + } + req = nonemb_cmd->va; + + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_ETH, + OPCODE_ETH_GET_PPORT_STATS, nonemb_cmd->size, + wrb, nonemb_cmd); + + req->cmd_params.params.pport_num = cpu_to_le16(adapter->hba_port_num); + req->cmd_params.params.reset_stats = 0; + + be_mcc_notify(adapter); + adapter->stats_cmd_sent = true; + +err: + spin_unlock_bh(&adapter->mcc_lock); + return status; +} + +static int be_mac_to_link_speed(int mac_speed) +{ + switch (mac_speed) { + case PHY_LINK_SPEED_ZERO: + return 0; + case PHY_LINK_SPEED_10MBPS: + return 10; + case PHY_LINK_SPEED_100MBPS: + return 100; + case PHY_LINK_SPEED_1GBPS: + return 1000; + case PHY_LINK_SPEED_10GBPS: + return 10000; + case PHY_LINK_SPEED_20GBPS: + return 20000; + case PHY_LINK_SPEED_25GBPS: + return 25000; + case PHY_LINK_SPEED_40GBPS: + return 40000; + } + return 0; +} + +/* Uses synchronous mcc + * Returns link_speed in Mbps + */ +int be_cmd_link_status_query(struct be_adapter *adapter, u16 *link_speed, + u8 *link_status, u32 dom) +{ + struct be_mcc_wrb *wrb; + struct be_cmd_req_link_status *req; + int status; + + spin_lock_bh(&adapter->mcc_lock); + + if (link_status) + *link_status = LINK_DOWN; + + wrb = wrb_from_mccq(adapter); + if (!wrb) { + status = -EBUSY; + goto err; + } + req = embedded_payload(wrb); + + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON, + OPCODE_COMMON_NTWK_LINK_STATUS_QUERY, + sizeof(*req), wrb, NULL); + + /* version 1 of the cmd is not supported only by BE2 */ + if (!BE2_chip(adapter)) + req->hdr.version = 1; + + req->hdr.domain = dom; + + status = be_mcc_notify_wait(adapter); + if (!status) { + struct be_cmd_resp_link_status *resp = embedded_payload(wrb); + + if (link_speed) { + *link_speed = resp->link_speed ? + le16_to_cpu(resp->link_speed) * 10 : + be_mac_to_link_speed(resp->mac_speed); + + if (!resp->logical_link_status) + *link_speed = 0; + } + if (link_status) + *link_status = resp->logical_link_status; + } + +err: + spin_unlock_bh(&adapter->mcc_lock); + return status; +} + +/* Uses synchronous mcc */ +int be_cmd_get_die_temperature(struct be_adapter *adapter) +{ + struct be_mcc_wrb *wrb; + struct be_cmd_req_get_cntl_addnl_attribs *req; + int status = 0; + + spin_lock_bh(&adapter->mcc_lock); + + wrb = wrb_from_mccq(adapter); + if (!wrb) { + status = -EBUSY; + goto err; + } + req = embedded_payload(wrb); + + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON, + OPCODE_COMMON_GET_CNTL_ADDITIONAL_ATTRIBUTES, + sizeof(*req), wrb, NULL); + + be_mcc_notify(adapter); + +err: + spin_unlock_bh(&adapter->mcc_lock); + return status; +} + +/* Uses synchronous mcc */ +int be_cmd_get_reg_len(struct be_adapter *adapter, u32 *log_size) +{ + struct be_mcc_wrb *wrb; + struct be_cmd_req_get_fat *req; + int status; + + spin_lock_bh(&adapter->mcc_lock); + + wrb = wrb_from_mccq(adapter); + if (!wrb) { + status = -EBUSY; + goto err; + } + req = embedded_payload(wrb); + + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON, + OPCODE_COMMON_MANAGE_FAT, sizeof(*req), wrb, + NULL); + req->fat_operation = cpu_to_le32(QUERY_FAT); + status = be_mcc_notify_wait(adapter); + if (!status) { + struct be_cmd_resp_get_fat *resp = embedded_payload(wrb); + + if (log_size && resp->log_size) + *log_size = le32_to_cpu(resp->log_size) - + sizeof(u32); + } +err: + spin_unlock_bh(&adapter->mcc_lock); + return status; +} + +int be_cmd_get_regs(struct be_adapter *adapter, u32 buf_len, void *buf) +{ + struct be_dma_mem get_fat_cmd; + struct be_mcc_wrb *wrb; + struct be_cmd_req_get_fat *req; + u32 offset = 0, total_size, buf_size, + log_offset = sizeof(u32), payload_len; + int status = 0; + + if (buf_len == 0) + return -EIO; + + total_size = buf_len; + + get_fat_cmd.size = sizeof(struct be_cmd_req_get_fat) + 60*1024; + get_fat_cmd.va = pci_alloc_consistent(adapter->pdev, + get_fat_cmd.size, + &get_fat_cmd.dma); + if (!get_fat_cmd.va) { + dev_err(&adapter->pdev->dev, + "Memory allocation failure while reading FAT data\n"); + return -ENOMEM; + } + + spin_lock_bh(&adapter->mcc_lock); + + while (total_size) { + buf_size = min(total_size, (u32)60*1024); + total_size -= buf_size; + + wrb = wrb_from_mccq(adapter); + if (!wrb) { + status = -EBUSY; + goto err; + } + req = get_fat_cmd.va; + + payload_len = sizeof(struct be_cmd_req_get_fat) + buf_size; + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON, + OPCODE_COMMON_MANAGE_FAT, payload_len, + wrb, &get_fat_cmd); + + req->fat_operation = cpu_to_le32(RETRIEVE_FAT); + req->read_log_offset = cpu_to_le32(log_offset); + req->read_log_length = cpu_to_le32(buf_size); + req->data_buffer_size = cpu_to_le32(buf_size); + + status = be_mcc_notify_wait(adapter); + if (!status) { + struct be_cmd_resp_get_fat *resp = get_fat_cmd.va; + + memcpy(buf + offset, + resp->data_buffer, + le32_to_cpu(resp->read_log_length)); + } else { + dev_err(&adapter->pdev->dev, "FAT Table Retrieve error\n"); + goto err; + } + offset += buf_size; + log_offset += buf_size; + } +err: + pci_free_consistent(adapter->pdev, get_fat_cmd.size, + get_fat_cmd.va, get_fat_cmd.dma); + spin_unlock_bh(&adapter->mcc_lock); + return status; +} + +/* Uses synchronous mcc */ +int be_cmd_get_fw_ver(struct be_adapter *adapter) +{ + struct be_mcc_wrb *wrb; + struct be_cmd_req_get_fw_version *req; + int status; + + spin_lock_bh(&adapter->mcc_lock); + + wrb = wrb_from_mccq(adapter); + if (!wrb) { + status = -EBUSY; + goto err; + } + + req = embedded_payload(wrb); + + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON, + OPCODE_COMMON_GET_FW_VERSION, sizeof(*req), wrb, + NULL); + status = be_mcc_notify_wait(adapter); + if (!status) { + struct be_cmd_resp_get_fw_version *resp = embedded_payload(wrb); + + strlcpy(adapter->fw_ver, resp->firmware_version_string, + sizeof(adapter->fw_ver)); + strlcpy(adapter->fw_on_flash, resp->fw_on_flash_version_string, + sizeof(adapter->fw_on_flash)); + } +err: + spin_unlock_bh(&adapter->mcc_lock); + return status; +} + +/* set the EQ delay interval of an EQ to specified value + * Uses async mcc + */ +static int __be_cmd_modify_eqd(struct be_adapter *adapter, + struct be_set_eqd *set_eqd, int num) +{ + struct be_mcc_wrb *wrb; + struct be_cmd_req_modify_eq_delay *req; + int status = 0, i; + + spin_lock_bh(&adapter->mcc_lock); + + wrb = wrb_from_mccq(adapter); + if (!wrb) { + status = -EBUSY; + goto err; + } + req = embedded_payload(wrb); + + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON, + OPCODE_COMMON_MODIFY_EQ_DELAY, sizeof(*req), wrb, + NULL); + + req->num_eq = cpu_to_le32(num); + for (i = 0; i < num; i++) { + req->set_eqd[i].eq_id = cpu_to_le32(set_eqd[i].eq_id); + req->set_eqd[i].phase = 0; + req->set_eqd[i].delay_multiplier = + cpu_to_le32(set_eqd[i].delay_multiplier); + } + + be_mcc_notify(adapter); +err: + spin_unlock_bh(&adapter->mcc_lock); + return status; +} + +int be_cmd_modify_eqd(struct be_adapter *adapter, struct be_set_eqd *set_eqd, + int num) +{ + int num_eqs, i = 0; + + if (lancer_chip(adapter) && num > 8) { + while (num) { + num_eqs = min(num, 8); + __be_cmd_modify_eqd(adapter, &set_eqd[i], num_eqs); + i += num_eqs; + num -= num_eqs; + } + } else { + __be_cmd_modify_eqd(adapter, set_eqd, num); + } + + return 0; +} + +/* Uses sycnhronous mcc */ +int be_cmd_vlan_config(struct be_adapter *adapter, u32 if_id, u16 *vtag_array, + u32 num) +{ + struct be_mcc_wrb *wrb; + struct be_cmd_req_vlan_config *req; + int status; + + spin_lock_bh(&adapter->mcc_lock); + + wrb = wrb_from_mccq(adapter); + if (!wrb) { + status = -EBUSY; + goto err; + } + req = embedded_payload(wrb); + + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON, + OPCODE_COMMON_NTWK_VLAN_CONFIG, sizeof(*req), + wrb, NULL); + + req->interface_id = if_id; + req->untagged = BE_IF_FLAGS_UNTAGGED & be_if_cap_flags(adapter) ? 1 : 0; + req->num_vlan = num; + memcpy(req->normal_vlan, vtag_array, + req->num_vlan * sizeof(vtag_array[0])); + + status = be_mcc_notify_wait(adapter); +err: + spin_unlock_bh(&adapter->mcc_lock); + return status; +} + +int be_cmd_rx_filter(struct be_adapter *adapter, u32 flags, u32 value) +{ + struct be_mcc_wrb *wrb; + struct be_dma_mem *mem = &adapter->rx_filter; + struct be_cmd_req_rx_filter *req = mem->va; + int status; + + spin_lock_bh(&adapter->mcc_lock); + + wrb = wrb_from_mccq(adapter); + if (!wrb) { + status = -EBUSY; + goto err; + } + memset(req, 0, sizeof(*req)); + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON, + OPCODE_COMMON_NTWK_RX_FILTER, sizeof(*req), + wrb, mem); + + req->if_id = cpu_to_le32(adapter->if_handle); + if (flags & IFF_PROMISC) { + req->if_flags_mask = cpu_to_le32(BE_IF_FLAGS_PROMISCUOUS | + BE_IF_FLAGS_VLAN_PROMISCUOUS | + BE_IF_FLAGS_MCAST_PROMISCUOUS); + if (value == ON) + req->if_flags = + cpu_to_le32(BE_IF_FLAGS_PROMISCUOUS | + BE_IF_FLAGS_VLAN_PROMISCUOUS | + BE_IF_FLAGS_MCAST_PROMISCUOUS); + } else if (flags & IFF_ALLMULTI) { + req->if_flags_mask = cpu_to_le32(BE_IF_FLAGS_MCAST_PROMISCUOUS); + req->if_flags = cpu_to_le32(BE_IF_FLAGS_MCAST_PROMISCUOUS); + } else if (flags & BE_FLAGS_VLAN_PROMISC) { + req->if_flags_mask = cpu_to_le32(BE_IF_FLAGS_VLAN_PROMISCUOUS); + + if (value == ON) + req->if_flags = + cpu_to_le32(BE_IF_FLAGS_VLAN_PROMISCUOUS); + } else { + struct netdev_hw_addr *ha; + int i = 0; + + req->if_flags_mask = cpu_to_le32(BE_IF_FLAGS_MULTICAST); + req->if_flags = cpu_to_le32(BE_IF_FLAGS_MULTICAST); + + /* Reset mcast promisc mode if already set by setting mask + * and not setting flags field + */ + req->if_flags_mask |= + cpu_to_le32(BE_IF_FLAGS_MCAST_PROMISCUOUS & + be_if_cap_flags(adapter)); + req->mcast_num = cpu_to_le32(netdev_mc_count(adapter->netdev)); + netdev_for_each_mc_addr(ha, adapter->netdev) + memcpy(req->mcast_mac[i++].byte, ha->addr, ETH_ALEN); + } + + if ((req->if_flags_mask & cpu_to_le32(be_if_cap_flags(adapter))) != + req->if_flags_mask) { + dev_warn(&adapter->pdev->dev, + "Cannot set rx filter flags 0x%x\n", + req->if_flags_mask); + dev_warn(&adapter->pdev->dev, + "Interface is capable of 0x%x flags only\n", + be_if_cap_flags(adapter)); + } + req->if_flags_mask &= cpu_to_le32(be_if_cap_flags(adapter)); + + status = be_mcc_notify_wait(adapter); + +err: + spin_unlock_bh(&adapter->mcc_lock); + return status; +} + +/* Uses synchrounous mcc */ +int be_cmd_set_flow_control(struct be_adapter *adapter, u32 tx_fc, u32 rx_fc) +{ + struct be_mcc_wrb *wrb; + struct be_cmd_req_set_flow_control *req; + int status; + + if (!be_cmd_allowed(adapter, OPCODE_COMMON_SET_FLOW_CONTROL, + CMD_SUBSYSTEM_COMMON)) + return -EPERM; + + spin_lock_bh(&adapter->mcc_lock); + + wrb = wrb_from_mccq(adapter); + if (!wrb) { + status = -EBUSY; + goto err; + } + req = embedded_payload(wrb); + + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON, + OPCODE_COMMON_SET_FLOW_CONTROL, sizeof(*req), + wrb, NULL); + + req->hdr.version = 1; + req->tx_flow_control = cpu_to_le16((u16)tx_fc); + req->rx_flow_control = cpu_to_le16((u16)rx_fc); + + status = be_mcc_notify_wait(adapter); + +err: + spin_unlock_bh(&adapter->mcc_lock); + + if (base_status(status) == MCC_STATUS_FEATURE_NOT_SUPPORTED) + return -EOPNOTSUPP; + + return status; +} + +/* Uses sycn mcc */ +int be_cmd_get_flow_control(struct be_adapter *adapter, u32 *tx_fc, u32 *rx_fc) +{ + struct be_mcc_wrb *wrb; + struct be_cmd_req_get_flow_control *req; + int status; + + if (!be_cmd_allowed(adapter, OPCODE_COMMON_GET_FLOW_CONTROL, + CMD_SUBSYSTEM_COMMON)) + return -EPERM; + + spin_lock_bh(&adapter->mcc_lock); + + wrb = wrb_from_mccq(adapter); + if (!wrb) { + status = -EBUSY; + goto err; + } + req = embedded_payload(wrb); + + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON, + OPCODE_COMMON_GET_FLOW_CONTROL, sizeof(*req), + wrb, NULL); + + status = be_mcc_notify_wait(adapter); + if (!status) { + struct be_cmd_resp_get_flow_control *resp = + embedded_payload(wrb); + + *tx_fc = le16_to_cpu(resp->tx_flow_control); + *rx_fc = le16_to_cpu(resp->rx_flow_control); + } + +err: + spin_unlock_bh(&adapter->mcc_lock); + return status; +} + +/* Uses mbox */ +int be_cmd_query_fw_cfg(struct be_adapter *adapter) +{ + struct be_mcc_wrb *wrb; + struct be_cmd_req_query_fw_cfg *req; + int status; + + if (mutex_lock_interruptible(&adapter->mbox_lock)) + return -1; + + wrb = wrb_from_mbox(adapter); + req = embedded_payload(wrb); + + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON, + OPCODE_COMMON_QUERY_FIRMWARE_CONFIG, + sizeof(*req), wrb, NULL); + + status = be_mbox_notify_wait(adapter); + if (!status) { + struct be_cmd_resp_query_fw_cfg *resp = embedded_payload(wrb); + + adapter->port_num = le32_to_cpu(resp->phys_port); + adapter->function_mode = le32_to_cpu(resp->function_mode); + adapter->function_caps = le32_to_cpu(resp->function_caps); + adapter->asic_rev = le32_to_cpu(resp->asic_revision) & 0xFF; + dev_info(&adapter->pdev->dev, + "FW config: function_mode=0x%x, function_caps=0x%x\n", + adapter->function_mode, adapter->function_caps); + } + + mutex_unlock(&adapter->mbox_lock); + return status; +} + +/* Uses mbox */ +int be_cmd_reset_function(struct be_adapter *adapter) +{ + struct be_mcc_wrb *wrb; + struct be_cmd_req_hdr *req; + int status; + + if (lancer_chip(adapter)) { + status = lancer_wait_ready(adapter); + if (!status) { + iowrite32(SLI_PORT_CONTROL_IP_MASK, + adapter->db + SLIPORT_CONTROL_OFFSET); + status = lancer_test_and_set_rdy_state(adapter); + } + if (status) { + dev_err(&adapter->pdev->dev, + "Adapter in non recoverable error\n"); + } + return status; + } + + if (mutex_lock_interruptible(&adapter->mbox_lock)) + return -1; + + wrb = wrb_from_mbox(adapter); + req = embedded_payload(wrb); + + be_wrb_cmd_hdr_prepare(req, CMD_SUBSYSTEM_COMMON, + OPCODE_COMMON_FUNCTION_RESET, sizeof(*req), wrb, + NULL); + + status = be_mbox_notify_wait(adapter); + + mutex_unlock(&adapter->mbox_lock); + return status; +} + +int be_cmd_rss_config(struct be_adapter *adapter, u8 *rsstable, + u32 rss_hash_opts, u16 table_size, const u8 *rss_hkey) +{ + struct be_mcc_wrb *wrb; + struct be_cmd_req_rss_config *req; + int status; + + if (!(be_if_cap_flags(adapter) & BE_IF_FLAGS_RSS)) + return 0; + + spin_lock_bh(&adapter->mcc_lock); + + wrb = wrb_from_mccq(adapter); + if (!wrb) { + status = -EBUSY; + goto err; + } + req = embedded_payload(wrb); + + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_ETH, + OPCODE_ETH_RSS_CONFIG, sizeof(*req), wrb, NULL); + + req->if_id = cpu_to_le32(adapter->if_handle); + req->enable_rss = cpu_to_le16(rss_hash_opts); + req->cpu_table_size_log2 = cpu_to_le16(fls(table_size) - 1); + + if (!BEx_chip(adapter)) + req->hdr.version = 1; + + memcpy(req->cpu_table, rsstable, table_size); + memcpy(req->hash, rss_hkey, RSS_HASH_KEY_LEN); + be_dws_cpu_to_le(req->hash, sizeof(req->hash)); + + status = be_mcc_notify_wait(adapter); +err: + spin_unlock_bh(&adapter->mcc_lock); + return status; +} + +/* Uses sync mcc */ +int be_cmd_set_beacon_state(struct be_adapter *adapter, u8 port_num, + u8 bcn, u8 sts, u8 state) +{ + struct be_mcc_wrb *wrb; + struct be_cmd_req_enable_disable_beacon *req; + int status; + + spin_lock_bh(&adapter->mcc_lock); + + wrb = wrb_from_mccq(adapter); + if (!wrb) { + status = -EBUSY; + goto err; + } + req = embedded_payload(wrb); + + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON, + OPCODE_COMMON_ENABLE_DISABLE_BEACON, + sizeof(*req), wrb, NULL); + + req->port_num = port_num; + req->beacon_state = state; + req->beacon_duration = bcn; + req->status_duration = sts; + + status = be_mcc_notify_wait(adapter); + +err: + spin_unlock_bh(&adapter->mcc_lock); + return status; +} + +/* Uses sync mcc */ +int be_cmd_get_beacon_state(struct be_adapter *adapter, u8 port_num, u32 *state) +{ + struct be_mcc_wrb *wrb; + struct be_cmd_req_get_beacon_state *req; + int status; + + spin_lock_bh(&adapter->mcc_lock); + + wrb = wrb_from_mccq(adapter); + if (!wrb) { + status = -EBUSY; + goto err; + } + req = embedded_payload(wrb); + + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON, + OPCODE_COMMON_GET_BEACON_STATE, sizeof(*req), + wrb, NULL); + + req->port_num = port_num; + + status = be_mcc_notify_wait(adapter); + if (!status) { + struct be_cmd_resp_get_beacon_state *resp = + embedded_payload(wrb); + + *state = resp->beacon_state; + } + +err: + spin_unlock_bh(&adapter->mcc_lock); + return status; +} + +/* Uses sync mcc */ +int be_cmd_read_port_transceiver_data(struct be_adapter *adapter, + u8 page_num, u8 *data) +{ + struct be_dma_mem cmd; + struct be_mcc_wrb *wrb; + struct be_cmd_req_port_type *req; + int status; + + if (page_num > TR_PAGE_A2) + return -EINVAL; + + cmd.size = sizeof(struct be_cmd_resp_port_type); + cmd.va = pci_alloc_consistent(adapter->pdev, cmd.size, &cmd.dma); + if (!cmd.va) { + dev_err(&adapter->pdev->dev, "Memory allocation failed\n"); + return -ENOMEM; + } + memset(cmd.va, 0, cmd.size); + + spin_lock_bh(&adapter->mcc_lock); + + wrb = wrb_from_mccq(adapter); + if (!wrb) { + status = -EBUSY; + goto err; + } + req = cmd.va; + + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON, + OPCODE_COMMON_READ_TRANSRECV_DATA, + cmd.size, wrb, &cmd); + + req->port = cpu_to_le32(adapter->hba_port_num); + req->page_num = cpu_to_le32(page_num); + status = be_mcc_notify_wait(adapter); + if (!status) { + struct be_cmd_resp_port_type *resp = cmd.va; + + memcpy(data, resp->page_data, PAGE_DATA_LEN); + } +err: + spin_unlock_bh(&adapter->mcc_lock); + pci_free_consistent(adapter->pdev, cmd.size, cmd.va, cmd.dma); + return status; +} + +int lancer_cmd_write_object(struct be_adapter *adapter, struct be_dma_mem *cmd, + u32 data_size, u32 data_offset, + const char *obj_name, u32 *data_written, + u8 *change_status, u8 *addn_status) +{ + struct be_mcc_wrb *wrb; + struct lancer_cmd_req_write_object *req; + struct lancer_cmd_resp_write_object *resp; + void *ctxt = NULL; + int status; + + spin_lock_bh(&adapter->mcc_lock); + adapter->flash_status = 0; + + wrb = wrb_from_mccq(adapter); + if (!wrb) { + status = -EBUSY; + goto err_unlock; + } + + req = embedded_payload(wrb); + + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON, + OPCODE_COMMON_WRITE_OBJECT, + sizeof(struct lancer_cmd_req_write_object), wrb, + NULL); + + ctxt = &req->context; + AMAP_SET_BITS(struct amap_lancer_write_obj_context, + write_length, ctxt, data_size); + + if (data_size == 0) + AMAP_SET_BITS(struct amap_lancer_write_obj_context, + eof, ctxt, 1); + else + AMAP_SET_BITS(struct amap_lancer_write_obj_context, + eof, ctxt, 0); + + be_dws_cpu_to_le(ctxt, sizeof(req->context)); + req->write_offset = cpu_to_le32(data_offset); + strlcpy(req->object_name, obj_name, sizeof(req->object_name)); + req->descriptor_count = cpu_to_le32(1); + req->buf_len = cpu_to_le32(data_size); + req->addr_low = cpu_to_le32((cmd->dma + + sizeof(struct lancer_cmd_req_write_object)) + & 0xFFFFFFFF); + req->addr_high = cpu_to_le32(upper_32_bits(cmd->dma + + sizeof(struct lancer_cmd_req_write_object))); + + be_mcc_notify(adapter); + spin_unlock_bh(&adapter->mcc_lock); + + if (!wait_for_completion_timeout(&adapter->et_cmd_compl, + msecs_to_jiffies(60000))) + status = -ETIMEDOUT; + else + status = adapter->flash_status; + + resp = embedded_payload(wrb); + if (!status) { + *data_written = le32_to_cpu(resp->actual_write_len); + *change_status = resp->change_status; + } else { + *addn_status = resp->additional_status; + } + + return status; + +err_unlock: + spin_unlock_bh(&adapter->mcc_lock); + return status; +} + +int be_cmd_query_cable_type(struct be_adapter *adapter) +{ + u8 page_data[PAGE_DATA_LEN]; + int status; + + status = be_cmd_read_port_transceiver_data(adapter, TR_PAGE_A0, + page_data); + if (!status) { + switch (adapter->phy.interface_type) { + case PHY_TYPE_QSFP: + adapter->phy.cable_type = + page_data[QSFP_PLUS_CABLE_TYPE_OFFSET]; + break; + case PHY_TYPE_SFP_PLUS_10GB: + adapter->phy.cable_type = + page_data[SFP_PLUS_CABLE_TYPE_OFFSET]; + break; + default: + adapter->phy.cable_type = 0; + break; + } + } + return status; +} + +int lancer_cmd_delete_object(struct be_adapter *adapter, const char *obj_name) +{ + struct lancer_cmd_req_delete_object *req; + struct be_mcc_wrb *wrb; + int status; + + spin_lock_bh(&adapter->mcc_lock); + + wrb = wrb_from_mccq(adapter); + if (!wrb) { + status = -EBUSY; + goto err; + } + + req = embedded_payload(wrb); + + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON, + OPCODE_COMMON_DELETE_OBJECT, + sizeof(*req), wrb, NULL); + + strlcpy(req->object_name, obj_name, sizeof(req->object_name)); + + status = be_mcc_notify_wait(adapter); +err: + spin_unlock_bh(&adapter->mcc_lock); + return status; +} + +int lancer_cmd_read_object(struct be_adapter *adapter, struct be_dma_mem *cmd, + u32 data_size, u32 data_offset, const char *obj_name, + u32 *data_read, u32 *eof, u8 *addn_status) +{ + struct be_mcc_wrb *wrb; + struct lancer_cmd_req_read_object *req; + struct lancer_cmd_resp_read_object *resp; + int status; + + spin_lock_bh(&adapter->mcc_lock); + + wrb = wrb_from_mccq(adapter); + if (!wrb) { + status = -EBUSY; + goto err_unlock; + } + + req = embedded_payload(wrb); + + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON, + OPCODE_COMMON_READ_OBJECT, + sizeof(struct lancer_cmd_req_read_object), wrb, + NULL); + + req->desired_read_len = cpu_to_le32(data_size); + req->read_offset = cpu_to_le32(data_offset); + strcpy(req->object_name, obj_name); + req->descriptor_count = cpu_to_le32(1); + req->buf_len = cpu_to_le32(data_size); + req->addr_low = cpu_to_le32((cmd->dma & 0xFFFFFFFF)); + req->addr_high = cpu_to_le32(upper_32_bits(cmd->dma)); + + status = be_mcc_notify_wait(adapter); + + resp = embedded_payload(wrb); + if (!status) { + *data_read = le32_to_cpu(resp->actual_read_len); + *eof = le32_to_cpu(resp->eof); + } else { + *addn_status = resp->additional_status; + } + +err_unlock: + spin_unlock_bh(&adapter->mcc_lock); + return status; +} + +int be_cmd_write_flashrom(struct be_adapter *adapter, struct be_dma_mem *cmd, + u32 flash_type, u32 flash_opcode, u32 buf_size) +{ + struct be_mcc_wrb *wrb; + struct be_cmd_write_flashrom *req; + int status; + + spin_lock_bh(&adapter->mcc_lock); + adapter->flash_status = 0; + + wrb = wrb_from_mccq(adapter); + if (!wrb) { + status = -EBUSY; + goto err_unlock; + } + req = cmd->va; + + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON, + OPCODE_COMMON_WRITE_FLASHROM, cmd->size, wrb, + cmd); + + req->params.op_type = cpu_to_le32(flash_type); + req->params.op_code = cpu_to_le32(flash_opcode); + req->params.data_buf_size = cpu_to_le32(buf_size); + + be_mcc_notify(adapter); + spin_unlock_bh(&adapter->mcc_lock); + + if (!wait_for_completion_timeout(&adapter->et_cmd_compl, + msecs_to_jiffies(40000))) + status = -ETIMEDOUT; + else + status = adapter->flash_status; + + return status; + +err_unlock: + spin_unlock_bh(&adapter->mcc_lock); + return status; +} + +int be_cmd_get_flash_crc(struct be_adapter *adapter, u8 *flashed_crc, + u16 optype, int offset) +{ + struct be_mcc_wrb *wrb; + struct be_cmd_read_flash_crc *req; + int status; + + spin_lock_bh(&adapter->mcc_lock); + + wrb = wrb_from_mccq(adapter); + if (!wrb) { + status = -EBUSY; + goto err; + } + req = embedded_payload(wrb); + + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON, + OPCODE_COMMON_READ_FLASHROM, sizeof(*req), + wrb, NULL); + + req->params.op_type = cpu_to_le32(optype); + req->params.op_code = cpu_to_le32(FLASHROM_OPER_REPORT); + req->params.offset = cpu_to_le32(offset); + req->params.data_buf_size = cpu_to_le32(0x4); + + status = be_mcc_notify_wait(adapter); + if (!status) + memcpy(flashed_crc, req->crc, 4); + +err: + spin_unlock_bh(&adapter->mcc_lock); + return status; +} + +int be_cmd_enable_magic_wol(struct be_adapter *adapter, u8 *mac, + struct be_dma_mem *nonemb_cmd) +{ + struct be_mcc_wrb *wrb; + struct be_cmd_req_acpi_wol_magic_config *req; + int status; + + spin_lock_bh(&adapter->mcc_lock); + + wrb = wrb_from_mccq(adapter); + if (!wrb) { + status = -EBUSY; + goto err; + } + req = nonemb_cmd->va; + + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_ETH, + OPCODE_ETH_ACPI_WOL_MAGIC_CONFIG, sizeof(*req), + wrb, nonemb_cmd); + memcpy(req->magic_mac, mac, ETH_ALEN); + + status = be_mcc_notify_wait(adapter); + +err: + spin_unlock_bh(&adapter->mcc_lock); + return status; +} + +int be_cmd_set_loopback(struct be_adapter *adapter, u8 port_num, + u8 loopback_type, u8 enable) +{ + struct be_mcc_wrb *wrb; + struct be_cmd_req_set_lmode *req; + int status; + + spin_lock_bh(&adapter->mcc_lock); + + wrb = wrb_from_mccq(adapter); + if (!wrb) { + status = -EBUSY; + goto err; + } + + req = embedded_payload(wrb); + + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_LOWLEVEL, + OPCODE_LOWLEVEL_SET_LOOPBACK_MODE, sizeof(*req), + wrb, NULL); + + req->src_port = port_num; + req->dest_port = port_num; + req->loopback_type = loopback_type; + req->loopback_state = enable; + + status = be_mcc_notify_wait(adapter); +err: + spin_unlock_bh(&adapter->mcc_lock); + return status; +} + +int be_cmd_loopback_test(struct be_adapter *adapter, u32 port_num, + u32 loopback_type, u32 pkt_size, u32 num_pkts, + u64 pattern) +{ + struct be_mcc_wrb *wrb; + struct be_cmd_req_loopback_test *req; + struct be_cmd_resp_loopback_test *resp; + int status; + + spin_lock_bh(&adapter->mcc_lock); + + wrb = wrb_from_mccq(adapter); + if (!wrb) { + status = -EBUSY; + goto err; + } + + req = embedded_payload(wrb); + + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_LOWLEVEL, + OPCODE_LOWLEVEL_LOOPBACK_TEST, sizeof(*req), wrb, + NULL); + + req->hdr.timeout = cpu_to_le32(15); + req->pattern = cpu_to_le64(pattern); + req->src_port = cpu_to_le32(port_num); + req->dest_port = cpu_to_le32(port_num); + req->pkt_size = cpu_to_le32(pkt_size); + req->num_pkts = cpu_to_le32(num_pkts); + req->loopback_type = cpu_to_le32(loopback_type); + + be_mcc_notify(adapter); + + spin_unlock_bh(&adapter->mcc_lock); + + wait_for_completion(&adapter->et_cmd_compl); + resp = embedded_payload(wrb); + status = le32_to_cpu(resp->status); + + return status; +err: + spin_unlock_bh(&adapter->mcc_lock); + return status; +} + +int be_cmd_ddr_dma_test(struct be_adapter *adapter, u64 pattern, + u32 byte_cnt, struct be_dma_mem *cmd) +{ + struct be_mcc_wrb *wrb; + struct be_cmd_req_ddrdma_test *req; + int status; + int i, j = 0; + + spin_lock_bh(&adapter->mcc_lock); + + wrb = wrb_from_mccq(adapter); + if (!wrb) { + status = -EBUSY; + goto err; + } + req = cmd->va; + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_LOWLEVEL, + OPCODE_LOWLEVEL_HOST_DDR_DMA, cmd->size, wrb, + cmd); + + req->pattern = cpu_to_le64(pattern); + req->byte_count = cpu_to_le32(byte_cnt); + for (i = 0; i < byte_cnt; i++) { + req->snd_buff[i] = (u8)(pattern >> (j*8)); + j++; + if (j > 7) + j = 0; + } + + status = be_mcc_notify_wait(adapter); + + if (!status) { + struct be_cmd_resp_ddrdma_test *resp; + + resp = cmd->va; + if ((memcmp(resp->rcv_buff, req->snd_buff, byte_cnt) != 0) || + resp->snd_err) { + status = -1; + } + } + +err: + spin_unlock_bh(&adapter->mcc_lock); + return status; +} + +int be_cmd_get_seeprom_data(struct be_adapter *adapter, + struct be_dma_mem *nonemb_cmd) +{ + struct be_mcc_wrb *wrb; + struct be_cmd_req_seeprom_read *req; + int status; + + spin_lock_bh(&adapter->mcc_lock); + + wrb = wrb_from_mccq(adapter); + if (!wrb) { + status = -EBUSY; + goto err; + } + req = nonemb_cmd->va; + + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON, + OPCODE_COMMON_SEEPROM_READ, sizeof(*req), wrb, + nonemb_cmd); + + status = be_mcc_notify_wait(adapter); + +err: + spin_unlock_bh(&adapter->mcc_lock); + return status; +} + +int be_cmd_get_phy_info(struct be_adapter *adapter) +{ + struct be_mcc_wrb *wrb; + struct be_cmd_req_get_phy_info *req; + struct be_dma_mem cmd; + int status; + + if (!be_cmd_allowed(adapter, OPCODE_COMMON_GET_PHY_DETAILS, + CMD_SUBSYSTEM_COMMON)) + return -EPERM; + + spin_lock_bh(&adapter->mcc_lock); + + wrb = wrb_from_mccq(adapter); + if (!wrb) { + status = -EBUSY; + goto err; + } + cmd.size = sizeof(struct be_cmd_req_get_phy_info); + cmd.va = pci_alloc_consistent(adapter->pdev, cmd.size, &cmd.dma); + if (!cmd.va) { + dev_err(&adapter->pdev->dev, "Memory alloc failure\n"); + status = -ENOMEM; + goto err; + } + + req = cmd.va; + + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON, + OPCODE_COMMON_GET_PHY_DETAILS, sizeof(*req), + wrb, &cmd); + + status = be_mcc_notify_wait(adapter); + if (!status) { + struct be_phy_info *resp_phy_info = + cmd.va + sizeof(struct be_cmd_req_hdr); + + adapter->phy.phy_type = le16_to_cpu(resp_phy_info->phy_type); + adapter->phy.interface_type = + le16_to_cpu(resp_phy_info->interface_type); + adapter->phy.auto_speeds_supported = + le16_to_cpu(resp_phy_info->auto_speeds_supported); + adapter->phy.fixed_speeds_supported = + le16_to_cpu(resp_phy_info->fixed_speeds_supported); + adapter->phy.misc_params = + le32_to_cpu(resp_phy_info->misc_params); + + if (BE2_chip(adapter)) { + adapter->phy.fixed_speeds_supported = + BE_SUPPORTED_SPEED_10GBPS | + BE_SUPPORTED_SPEED_1GBPS; + } + } + pci_free_consistent(adapter->pdev, cmd.size, cmd.va, cmd.dma); +err: + spin_unlock_bh(&adapter->mcc_lock); + return status; +} + +int be_cmd_set_qos(struct be_adapter *adapter, u32 bps, u32 domain) +{ + struct be_mcc_wrb *wrb; + struct be_cmd_req_set_qos *req; + int status; + + spin_lock_bh(&adapter->mcc_lock); + + wrb = wrb_from_mccq(adapter); + if (!wrb) { + status = -EBUSY; + goto err; + } + + req = embedded_payload(wrb); + + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON, + OPCODE_COMMON_SET_QOS, sizeof(*req), wrb, NULL); + + req->hdr.domain = domain; + req->valid_bits = cpu_to_le32(BE_QOS_BITS_NIC); + req->max_bps_nic = cpu_to_le32(bps); + + status = be_mcc_notify_wait(adapter); + +err: + spin_unlock_bh(&adapter->mcc_lock); + return status; +} + +int be_cmd_get_cntl_attributes(struct be_adapter *adapter) +{ + struct be_mcc_wrb *wrb; + struct be_cmd_req_cntl_attribs *req; + struct be_cmd_resp_cntl_attribs *resp; + int status; + int payload_len = max(sizeof(*req), sizeof(*resp)); + struct mgmt_controller_attrib *attribs; + struct be_dma_mem attribs_cmd; + + if (mutex_lock_interruptible(&adapter->mbox_lock)) + return -1; + + memset(&attribs_cmd, 0, sizeof(struct be_dma_mem)); + attribs_cmd.size = sizeof(struct be_cmd_resp_cntl_attribs); + attribs_cmd.va = pci_alloc_consistent(adapter->pdev, attribs_cmd.size, + &attribs_cmd.dma); + if (!attribs_cmd.va) { + dev_err(&adapter->pdev->dev, "Memory allocation failure\n"); + status = -ENOMEM; + goto err; + } + + wrb = wrb_from_mbox(adapter); + if (!wrb) { + status = -EBUSY; + goto err; + } + req = attribs_cmd.va; + + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON, + OPCODE_COMMON_GET_CNTL_ATTRIBUTES, payload_len, + wrb, &attribs_cmd); + + status = be_mbox_notify_wait(adapter); + if (!status) { + attribs = attribs_cmd.va + sizeof(struct be_cmd_resp_hdr); + adapter->hba_port_num = attribs->hba_attribs.phy_port; + } + +err: + mutex_unlock(&adapter->mbox_lock); + if (attribs_cmd.va) + pci_free_consistent(adapter->pdev, attribs_cmd.size, + attribs_cmd.va, attribs_cmd.dma); + return status; +} + +/* Uses mbox */ +int be_cmd_req_native_mode(struct be_adapter *adapter) +{ + struct be_mcc_wrb *wrb; + struct be_cmd_req_set_func_cap *req; + int status; + + if (mutex_lock_interruptible(&adapter->mbox_lock)) + return -1; + + wrb = wrb_from_mbox(adapter); + if (!wrb) { + status = -EBUSY; + goto err; + } + + req = embedded_payload(wrb); + + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON, + OPCODE_COMMON_SET_DRIVER_FUNCTION_CAP, + sizeof(*req), wrb, NULL); + + req->valid_cap_flags = cpu_to_le32(CAPABILITY_SW_TIMESTAMPS | + CAPABILITY_BE3_NATIVE_ERX_API); + req->cap_flags = cpu_to_le32(CAPABILITY_BE3_NATIVE_ERX_API); + + status = be_mbox_notify_wait(adapter); + if (!status) { + struct be_cmd_resp_set_func_cap *resp = embedded_payload(wrb); + + adapter->be3_native = le32_to_cpu(resp->cap_flags) & + CAPABILITY_BE3_NATIVE_ERX_API; + if (!adapter->be3_native) + dev_warn(&adapter->pdev->dev, + "adapter not in advanced mode\n"); + } +err: + mutex_unlock(&adapter->mbox_lock); + return status; +} + +/* Get privilege(s) for a function */ +int be_cmd_get_fn_privileges(struct be_adapter *adapter, u32 *privilege, + u32 domain) +{ + struct be_mcc_wrb *wrb; + struct be_cmd_req_get_fn_privileges *req; + int status; + + spin_lock_bh(&adapter->mcc_lock); + + wrb = wrb_from_mccq(adapter); + if (!wrb) { + status = -EBUSY; + goto err; + } + + req = embedded_payload(wrb); + + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON, + OPCODE_COMMON_GET_FN_PRIVILEGES, sizeof(*req), + wrb, NULL); + + req->hdr.domain = domain; + + status = be_mcc_notify_wait(adapter); + if (!status) { + struct be_cmd_resp_get_fn_privileges *resp = + embedded_payload(wrb); + + *privilege = le32_to_cpu(resp->privilege_mask); + + /* In UMC mode FW does not return right privileges. + * Override with correct privilege equivalent to PF. + */ + if (BEx_chip(adapter) && be_is_mc(adapter) && + be_physfn(adapter)) + *privilege = MAX_PRIVILEGES; + } + +err: + spin_unlock_bh(&adapter->mcc_lock); + return status; +} + +/* Set privilege(s) for a function */ +int be_cmd_set_fn_privileges(struct be_adapter *adapter, u32 privileges, + u32 domain) +{ + struct be_mcc_wrb *wrb; + struct be_cmd_req_set_fn_privileges *req; + int status; + + spin_lock_bh(&adapter->mcc_lock); + + wrb = wrb_from_mccq(adapter); + if (!wrb) { + status = -EBUSY; + goto err; + } + + req = embedded_payload(wrb); + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON, + OPCODE_COMMON_SET_FN_PRIVILEGES, sizeof(*req), + wrb, NULL); + req->hdr.domain = domain; + if (lancer_chip(adapter)) + req->privileges_lancer = cpu_to_le32(privileges); + else + req->privileges = cpu_to_le32(privileges); + + status = be_mcc_notify_wait(adapter); +err: + spin_unlock_bh(&adapter->mcc_lock); + return status; +} + +/* pmac_id_valid: true => pmac_id is supplied and MAC address is requested. + * pmac_id_valid: false => pmac_id or MAC address is requested. + * If pmac_id is returned, pmac_id_valid is returned as true + */ +int be_cmd_get_mac_from_list(struct be_adapter *adapter, u8 *mac, + bool *pmac_id_valid, u32 *pmac_id, u32 if_handle, + u8 domain) +{ + struct be_mcc_wrb *wrb; + struct be_cmd_req_get_mac_list *req; + int status; + int mac_count; + struct be_dma_mem get_mac_list_cmd; + int i; + + memset(&get_mac_list_cmd, 0, sizeof(struct be_dma_mem)); + get_mac_list_cmd.size = sizeof(struct be_cmd_resp_get_mac_list); + get_mac_list_cmd.va = pci_alloc_consistent(adapter->pdev, + get_mac_list_cmd.size, + &get_mac_list_cmd.dma); + + if (!get_mac_list_cmd.va) { + dev_err(&adapter->pdev->dev, + "Memory allocation failure during GET_MAC_LIST\n"); + return -ENOMEM; + } + + spin_lock_bh(&adapter->mcc_lock); + + wrb = wrb_from_mccq(adapter); + if (!wrb) { + status = -EBUSY; + goto out; + } + + req = get_mac_list_cmd.va; + + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON, + OPCODE_COMMON_GET_MAC_LIST, + get_mac_list_cmd.size, wrb, &get_mac_list_cmd); + req->hdr.domain = domain; + req->mac_type = MAC_ADDRESS_TYPE_NETWORK; + if (*pmac_id_valid) { + req->mac_id = cpu_to_le32(*pmac_id); + req->iface_id = cpu_to_le16(if_handle); + req->perm_override = 0; + } else { + req->perm_override = 1; + } + + status = be_mcc_notify_wait(adapter); + if (!status) { + struct be_cmd_resp_get_mac_list *resp = + get_mac_list_cmd.va; + + if (*pmac_id_valid) { + memcpy(mac, resp->macid_macaddr.mac_addr_id.macaddr, + ETH_ALEN); + goto out; + } + + mac_count = resp->true_mac_count + resp->pseudo_mac_count; + /* Mac list returned could contain one or more active mac_ids + * or one or more true or pseudo permanant mac addresses. + * If an active mac_id is present, return first active mac_id + * found. + */ + for (i = 0; i < mac_count; i++) { + struct get_list_macaddr *mac_entry; + u16 mac_addr_size; + u32 mac_id; + + mac_entry = &resp->macaddr_list[i]; + mac_addr_size = le16_to_cpu(mac_entry->mac_addr_size); + /* mac_id is a 32 bit value and mac_addr size + * is 6 bytes + */ + if (mac_addr_size == sizeof(u32)) { + *pmac_id_valid = true; + mac_id = mac_entry->mac_addr_id.s_mac_id.mac_id; + *pmac_id = le32_to_cpu(mac_id); + goto out; + } + } + /* If no active mac_id found, return first mac addr */ + *pmac_id_valid = false; + memcpy(mac, resp->macaddr_list[0].mac_addr_id.macaddr, + ETH_ALEN); + } + +out: + spin_unlock_bh(&adapter->mcc_lock); + pci_free_consistent(adapter->pdev, get_mac_list_cmd.size, + get_mac_list_cmd.va, get_mac_list_cmd.dma); + return status; +} + +int be_cmd_get_active_mac(struct be_adapter *adapter, u32 curr_pmac_id, + u8 *mac, u32 if_handle, bool active, u32 domain) +{ + if (!active) + be_cmd_get_mac_from_list(adapter, mac, &active, &curr_pmac_id, + if_handle, domain); + if (BEx_chip(adapter)) + return be_cmd_mac_addr_query(adapter, mac, false, + if_handle, curr_pmac_id); + else + /* Fetch the MAC address using pmac_id */ + return be_cmd_get_mac_from_list(adapter, mac, &active, + &curr_pmac_id, + if_handle, domain); +} + +int be_cmd_get_perm_mac(struct be_adapter *adapter, u8 *mac) +{ + int status; + bool pmac_valid = false; + + memset(mac, 0, ETH_ALEN); + + if (BEx_chip(adapter)) { + if (be_physfn(adapter)) + status = be_cmd_mac_addr_query(adapter, mac, true, 0, + 0); + else + status = be_cmd_mac_addr_query(adapter, mac, false, + adapter->if_handle, 0); + } else { + status = be_cmd_get_mac_from_list(adapter, mac, &pmac_valid, + NULL, adapter->if_handle, 0); + } + + return status; +} + +/* Uses synchronous MCCQ */ +int be_cmd_set_mac_list(struct be_adapter *adapter, u8 *mac_array, + u8 mac_count, u32 domain) +{ + struct be_mcc_wrb *wrb; + struct be_cmd_req_set_mac_list *req; + int status; + struct be_dma_mem cmd; + + memset(&cmd, 0, sizeof(struct be_dma_mem)); + cmd.size = sizeof(struct be_cmd_req_set_mac_list); + cmd.va = dma_alloc_coherent(&adapter->pdev->dev, cmd.size, + &cmd.dma, GFP_KERNEL); + if (!cmd.va) + return -ENOMEM; + + spin_lock_bh(&adapter->mcc_lock); + + wrb = wrb_from_mccq(adapter); + if (!wrb) { + status = -EBUSY; + goto err; + } + + req = cmd.va; + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON, + OPCODE_COMMON_SET_MAC_LIST, sizeof(*req), + wrb, &cmd); + + req->hdr.domain = domain; + req->mac_count = mac_count; + if (mac_count) + memcpy(req->mac, mac_array, ETH_ALEN*mac_count); + + status = be_mcc_notify_wait(adapter); + +err: + dma_free_coherent(&adapter->pdev->dev, cmd.size, cmd.va, cmd.dma); + spin_unlock_bh(&adapter->mcc_lock); + return status; +} + +/* Wrapper to delete any active MACs and provision the new mac. + * Changes to MAC_LIST are allowed iff none of the MAC addresses in the + * current list are active. + */ +int be_cmd_set_mac(struct be_adapter *adapter, u8 *mac, int if_id, u32 dom) +{ + bool active_mac = false; + u8 old_mac[ETH_ALEN]; + u32 pmac_id; + int status; + + status = be_cmd_get_mac_from_list(adapter, old_mac, &active_mac, + &pmac_id, if_id, dom); + + if (!status && active_mac) + be_cmd_pmac_del(adapter, if_id, pmac_id, dom); + + return be_cmd_set_mac_list(adapter, mac, mac ? 1 : 0, dom); +} + +int be_cmd_set_hsw_config(struct be_adapter *adapter, u16 pvid, + u32 domain, u16 intf_id, u16 hsw_mode) +{ + struct be_mcc_wrb *wrb; + struct be_cmd_req_set_hsw_config *req; + void *ctxt; + int status; + + spin_lock_bh(&adapter->mcc_lock); + + wrb = wrb_from_mccq(adapter); + if (!wrb) { + status = -EBUSY; + goto err; + } + + req = embedded_payload(wrb); + ctxt = &req->context; + + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON, + OPCODE_COMMON_SET_HSW_CONFIG, sizeof(*req), wrb, + NULL); + + req->hdr.domain = domain; + AMAP_SET_BITS(struct amap_set_hsw_context, interface_id, ctxt, intf_id); + if (pvid) { + AMAP_SET_BITS(struct amap_set_hsw_context, pvid_valid, ctxt, 1); + AMAP_SET_BITS(struct amap_set_hsw_context, pvid, ctxt, pvid); + } + if (!BEx_chip(adapter) && hsw_mode) { + AMAP_SET_BITS(struct amap_set_hsw_context, interface_id, + ctxt, adapter->hba_port_num); + AMAP_SET_BITS(struct amap_set_hsw_context, pport, ctxt, 1); + AMAP_SET_BITS(struct amap_set_hsw_context, port_fwd_type, + ctxt, hsw_mode); + } + + be_dws_cpu_to_le(req->context, sizeof(req->context)); + status = be_mcc_notify_wait(adapter); + +err: + spin_unlock_bh(&adapter->mcc_lock); + return status; +} + +/* Get Hyper switch config */ +int be_cmd_get_hsw_config(struct be_adapter *adapter, u16 *pvid, + u32 domain, u16 intf_id, u8 *mode) +{ + struct be_mcc_wrb *wrb; + struct be_cmd_req_get_hsw_config *req; + void *ctxt; + int status; + u16 vid; + + spin_lock_bh(&adapter->mcc_lock); + + wrb = wrb_from_mccq(adapter); + if (!wrb) { + status = -EBUSY; + goto err; + } + + req = embedded_payload(wrb); + ctxt = &req->context; + + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON, + OPCODE_COMMON_GET_HSW_CONFIG, sizeof(*req), wrb, + NULL); + + req->hdr.domain = domain; + AMAP_SET_BITS(struct amap_get_hsw_req_context, interface_id, + ctxt, intf_id); + AMAP_SET_BITS(struct amap_get_hsw_req_context, pvid_valid, ctxt, 1); + + if (!BEx_chip(adapter) && mode) { + AMAP_SET_BITS(struct amap_get_hsw_req_context, interface_id, + ctxt, adapter->hba_port_num); + AMAP_SET_BITS(struct amap_get_hsw_req_context, pport, ctxt, 1); + } + be_dws_cpu_to_le(req->context, sizeof(req->context)); + + status = be_mcc_notify_wait(adapter); + if (!status) { + struct be_cmd_resp_get_hsw_config *resp = + embedded_payload(wrb); + + be_dws_le_to_cpu(&resp->context, sizeof(resp->context)); + vid = AMAP_GET_BITS(struct amap_get_hsw_resp_context, + pvid, &resp->context); + if (pvid) + *pvid = le16_to_cpu(vid); + if (mode) + *mode = AMAP_GET_BITS(struct amap_get_hsw_resp_context, + port_fwd_type, &resp->context); + } + +err: + spin_unlock_bh(&adapter->mcc_lock); + return status; +} + +int be_cmd_get_acpi_wol_cap(struct be_adapter *adapter) +{ + struct be_mcc_wrb *wrb; + struct be_cmd_req_acpi_wol_magic_config_v1 *req; + int status = 0; + struct be_dma_mem cmd; + + if (!be_cmd_allowed(adapter, OPCODE_ETH_ACPI_WOL_MAGIC_CONFIG, + CMD_SUBSYSTEM_ETH)) + return -EPERM; + + if (be_is_wol_excluded(adapter)) + return status; + + if (mutex_lock_interruptible(&adapter->mbox_lock)) + return -1; + + memset(&cmd, 0, sizeof(struct be_dma_mem)); + cmd.size = sizeof(struct be_cmd_resp_acpi_wol_magic_config_v1); + cmd.va = pci_alloc_consistent(adapter->pdev, cmd.size, &cmd.dma); + if (!cmd.va) { + dev_err(&adapter->pdev->dev, "Memory allocation failure\n"); + status = -ENOMEM; + goto err; + } + + wrb = wrb_from_mbox(adapter); + if (!wrb) { + status = -EBUSY; + goto err; + } + + req = cmd.va; + + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_ETH, + OPCODE_ETH_ACPI_WOL_MAGIC_CONFIG, + sizeof(*req), wrb, &cmd); + + req->hdr.version = 1; + req->query_options = BE_GET_WOL_CAP; + + status = be_mbox_notify_wait(adapter); + if (!status) { + struct be_cmd_resp_acpi_wol_magic_config_v1 *resp; + + resp = (struct be_cmd_resp_acpi_wol_magic_config_v1 *)cmd.va; + + adapter->wol_cap = resp->wol_settings; + if (adapter->wol_cap & BE_WOL_CAP) + adapter->wol_en = true; + } +err: + mutex_unlock(&adapter->mbox_lock); + if (cmd.va) + pci_free_consistent(adapter->pdev, cmd.size, cmd.va, cmd.dma); + return status; + +} + +int be_cmd_set_fw_log_level(struct be_adapter *adapter, u32 level) +{ + struct be_dma_mem extfat_cmd; + struct be_fat_conf_params *cfgs; + int status; + int i, j; + + memset(&extfat_cmd, 0, sizeof(struct be_dma_mem)); + extfat_cmd.size = sizeof(struct be_cmd_resp_get_ext_fat_caps); + extfat_cmd.va = pci_alloc_consistent(adapter->pdev, extfat_cmd.size, + &extfat_cmd.dma); + if (!extfat_cmd.va) + return -ENOMEM; + + status = be_cmd_get_ext_fat_capabilites(adapter, &extfat_cmd); + if (status) + goto err; + + cfgs = (struct be_fat_conf_params *) + (extfat_cmd.va + sizeof(struct be_cmd_resp_hdr)); + for (i = 0; i < le32_to_cpu(cfgs->num_modules); i++) { + u32 num_modes = le32_to_cpu(cfgs->module[i].num_modes); + + for (j = 0; j < num_modes; j++) { + if (cfgs->module[i].trace_lvl[j].mode == MODE_UART) + cfgs->module[i].trace_lvl[j].dbg_lvl = + cpu_to_le32(level); + } + } + + status = be_cmd_set_ext_fat_capabilites(adapter, &extfat_cmd, cfgs); +err: + pci_free_consistent(adapter->pdev, extfat_cmd.size, extfat_cmd.va, + extfat_cmd.dma); + return status; +} + +int be_cmd_get_fw_log_level(struct be_adapter *adapter) +{ + struct be_dma_mem extfat_cmd; + struct be_fat_conf_params *cfgs; + int status, j; + int level = 0; + + memset(&extfat_cmd, 0, sizeof(struct be_dma_mem)); + extfat_cmd.size = sizeof(struct be_cmd_resp_get_ext_fat_caps); + extfat_cmd.va = pci_alloc_consistent(adapter->pdev, extfat_cmd.size, + &extfat_cmd.dma); + + if (!extfat_cmd.va) { + dev_err(&adapter->pdev->dev, "%s: Memory allocation failure\n", + __func__); + goto err; + } + + status = be_cmd_get_ext_fat_capabilites(adapter, &extfat_cmd); + if (!status) { + cfgs = (struct be_fat_conf_params *)(extfat_cmd.va + + sizeof(struct be_cmd_resp_hdr)); + + for (j = 0; j < le32_to_cpu(cfgs->module[0].num_modes); j++) { + if (cfgs->module[0].trace_lvl[j].mode == MODE_UART) + level = cfgs->module[0].trace_lvl[j].dbg_lvl; + } + } + pci_free_consistent(adapter->pdev, extfat_cmd.size, extfat_cmd.va, + extfat_cmd.dma); +err: + return level; +} + +int be_cmd_get_ext_fat_capabilites(struct be_adapter *adapter, + struct be_dma_mem *cmd) +{ + struct be_mcc_wrb *wrb; + struct be_cmd_req_get_ext_fat_caps *req; + int status; + + if (mutex_lock_interruptible(&adapter->mbox_lock)) + return -1; + + wrb = wrb_from_mbox(adapter); + if (!wrb) { + status = -EBUSY; + goto err; + } + + req = cmd->va; + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON, + OPCODE_COMMON_GET_EXT_FAT_CAPABILITES, + cmd->size, wrb, cmd); + req->parameter_type = cpu_to_le32(1); + + status = be_mbox_notify_wait(adapter); +err: + mutex_unlock(&adapter->mbox_lock); + return status; +} + +int be_cmd_set_ext_fat_capabilites(struct be_adapter *adapter, + struct be_dma_mem *cmd, + struct be_fat_conf_params *configs) +{ + struct be_mcc_wrb *wrb; + struct be_cmd_req_set_ext_fat_caps *req; + int status; + + spin_lock_bh(&adapter->mcc_lock); + + wrb = wrb_from_mccq(adapter); + if (!wrb) { + status = -EBUSY; + goto err; + } + + req = cmd->va; + memcpy(&req->set_params, configs, sizeof(struct be_fat_conf_params)); + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON, + OPCODE_COMMON_SET_EXT_FAT_CAPABILITES, + cmd->size, wrb, cmd); + + status = be_mcc_notify_wait(adapter); +err: + spin_unlock_bh(&adapter->mcc_lock); + return status; +} + +int be_cmd_query_port_name(struct be_adapter *adapter, u8 *port_name) +{ + struct be_mcc_wrb *wrb; + struct be_cmd_req_get_port_name *req; + int status; + + if (!lancer_chip(adapter)) { + *port_name = adapter->hba_port_num + '0'; + return 0; + } + + spin_lock_bh(&adapter->mcc_lock); + + wrb = wrb_from_mccq(adapter); + if (!wrb) { + status = -EBUSY; + goto err; + } + + req = embedded_payload(wrb); + + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON, + OPCODE_COMMON_GET_PORT_NAME, sizeof(*req), wrb, + NULL); + req->hdr.version = 1; + + status = be_mcc_notify_wait(adapter); + if (!status) { + struct be_cmd_resp_get_port_name *resp = embedded_payload(wrb); + + *port_name = resp->port_name[adapter->hba_port_num]; + } else { + *port_name = adapter->hba_port_num + '0'; + } +err: + spin_unlock_bh(&adapter->mcc_lock); + return status; +} + +/* Descriptor type */ +enum { + FUNC_DESC = 1, + VFT_DESC = 2 +}; + +static struct be_nic_res_desc *be_get_nic_desc(u8 *buf, u32 desc_count, + int desc_type) +{ + struct be_res_desc_hdr *hdr = (struct be_res_desc_hdr *)buf; + struct be_nic_res_desc *nic; + int i; + + for (i = 0; i < desc_count; i++) { + if (hdr->desc_type == NIC_RESOURCE_DESC_TYPE_V0 || + hdr->desc_type == NIC_RESOURCE_DESC_TYPE_V1) { + nic = (struct be_nic_res_desc *)hdr; + if (desc_type == FUNC_DESC || + (desc_type == VFT_DESC && + nic->flags & (1 << VFT_SHIFT))) + return nic; + } + + hdr->desc_len = hdr->desc_len ? : RESOURCE_DESC_SIZE_V0; + hdr = (void *)hdr + hdr->desc_len; + } + return NULL; +} + +static struct be_nic_res_desc *be_get_vft_desc(u8 *buf, u32 desc_count) +{ + return be_get_nic_desc(buf, desc_count, VFT_DESC); +} + +static struct be_nic_res_desc *be_get_func_nic_desc(u8 *buf, u32 desc_count) +{ + return be_get_nic_desc(buf, desc_count, FUNC_DESC); +} + +static struct be_pcie_res_desc *be_get_pcie_desc(u8 devfn, u8 *buf, + u32 desc_count) +{ + struct be_res_desc_hdr *hdr = (struct be_res_desc_hdr *)buf; + struct be_pcie_res_desc *pcie; + int i; + + for (i = 0; i < desc_count; i++) { + if ((hdr->desc_type == PCIE_RESOURCE_DESC_TYPE_V0 || + hdr->desc_type == PCIE_RESOURCE_DESC_TYPE_V1)) { + pcie = (struct be_pcie_res_desc *)hdr; + if (pcie->pf_num == devfn) + return pcie; + } + + hdr->desc_len = hdr->desc_len ? : RESOURCE_DESC_SIZE_V0; + hdr = (void *)hdr + hdr->desc_len; + } + return NULL; +} + +static struct be_port_res_desc *be_get_port_desc(u8 *buf, u32 desc_count) +{ + struct be_res_desc_hdr *hdr = (struct be_res_desc_hdr *)buf; + int i; + + for (i = 0; i < desc_count; i++) { + if (hdr->desc_type == PORT_RESOURCE_DESC_TYPE_V1) + return (struct be_port_res_desc *)hdr; + + hdr->desc_len = hdr->desc_len ? : RESOURCE_DESC_SIZE_V0; + hdr = (void *)hdr + hdr->desc_len; + } + return NULL; +} + +static void be_copy_nic_desc(struct be_resources *res, + struct be_nic_res_desc *desc) +{ + res->max_uc_mac = le16_to_cpu(desc->unicast_mac_count); + res->max_vlans = le16_to_cpu(desc->vlan_count); + res->max_mcast_mac = le16_to_cpu(desc->mcast_mac_count); + res->max_tx_qs = le16_to_cpu(desc->txq_count); + res->max_rss_qs = le16_to_cpu(desc->rssq_count); + res->max_rx_qs = le16_to_cpu(desc->rq_count); + res->max_evt_qs = le16_to_cpu(desc->eq_count); + /* Clear flags that driver is not interested in */ + res->if_cap_flags = le32_to_cpu(desc->cap_flags) & + BE_IF_CAP_FLAGS_WANT; + /* Need 1 RXQ as the default RXQ */ + if (res->max_rss_qs && res->max_rss_qs == res->max_rx_qs) + res->max_rss_qs -= 1; +} + +/* Uses Mbox */ +int be_cmd_get_func_config(struct be_adapter *adapter, struct be_resources *res) +{ + struct be_mcc_wrb *wrb; + struct be_cmd_req_get_func_config *req; + int status; + struct be_dma_mem cmd; + + if (mutex_lock_interruptible(&adapter->mbox_lock)) + return -1; + + memset(&cmd, 0, sizeof(struct be_dma_mem)); + cmd.size = sizeof(struct be_cmd_resp_get_func_config); + cmd.va = pci_alloc_consistent(adapter->pdev, cmd.size, &cmd.dma); + if (!cmd.va) { + dev_err(&adapter->pdev->dev, "Memory alloc failure\n"); + status = -ENOMEM; + goto err; + } + + wrb = wrb_from_mbox(adapter); + if (!wrb) { + status = -EBUSY; + goto err; + } + + req = cmd.va; + + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON, + OPCODE_COMMON_GET_FUNC_CONFIG, + cmd.size, wrb, &cmd); + + if (skyhawk_chip(adapter)) + req->hdr.version = 1; + + status = be_mbox_notify_wait(adapter); + if (!status) { + struct be_cmd_resp_get_func_config *resp = cmd.va; + u32 desc_count = le32_to_cpu(resp->desc_count); + struct be_nic_res_desc *desc; + + desc = be_get_func_nic_desc(resp->func_param, desc_count); + if (!desc) { + status = -EINVAL; + goto err; + } + + adapter->pf_number = desc->pf_num; + be_copy_nic_desc(res, desc); + } +err: + mutex_unlock(&adapter->mbox_lock); + if (cmd.va) + pci_free_consistent(adapter->pdev, cmd.size, cmd.va, cmd.dma); + return status; +} + +/* Will use MBOX only if MCCQ has not been created */ +int be_cmd_get_profile_config(struct be_adapter *adapter, + struct be_resources *res, u8 domain) +{ + struct be_cmd_resp_get_profile_config *resp; + struct be_cmd_req_get_profile_config *req; + struct be_nic_res_desc *vf_res; + struct be_pcie_res_desc *pcie; + struct be_port_res_desc *port; + struct be_nic_res_desc *nic; + struct be_mcc_wrb wrb = {0}; + struct be_dma_mem cmd; + u32 desc_count; + int status; + + memset(&cmd, 0, sizeof(struct be_dma_mem)); + cmd.size = sizeof(struct be_cmd_resp_get_profile_config); + cmd.va = pci_alloc_consistent(adapter->pdev, cmd.size, &cmd.dma); + if (!cmd.va) + return -ENOMEM; + + req = cmd.va; + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON, + OPCODE_COMMON_GET_PROFILE_CONFIG, + cmd.size, &wrb, &cmd); + + req->hdr.domain = domain; + if (!lancer_chip(adapter)) + req->hdr.version = 1; + req->type = ACTIVE_PROFILE_TYPE; + + status = be_cmd_notify_wait(adapter, &wrb); + if (status) + goto err; + + resp = cmd.va; + desc_count = le32_to_cpu(resp->desc_count); + + pcie = be_get_pcie_desc(adapter->pdev->devfn, resp->func_param, + desc_count); + if (pcie) + res->max_vfs = le16_to_cpu(pcie->num_vfs); + + port = be_get_port_desc(resp->func_param, desc_count); + if (port) + adapter->mc_type = port->mc_type; + + nic = be_get_func_nic_desc(resp->func_param, desc_count); + if (nic) + be_copy_nic_desc(res, nic); + + vf_res = be_get_vft_desc(resp->func_param, desc_count); + if (vf_res) + res->vf_if_cap_flags = vf_res->cap_flags; +err: + if (cmd.va) + pci_free_consistent(adapter->pdev, cmd.size, cmd.va, cmd.dma); + return status; +} + +/* Will use MBOX only if MCCQ has not been created */ +static int be_cmd_set_profile_config(struct be_adapter *adapter, void *desc, + int size, int count, u8 version, u8 domain) +{ + struct be_cmd_req_set_profile_config *req; + struct be_mcc_wrb wrb = {0}; + struct be_dma_mem cmd; + int status; + + memset(&cmd, 0, sizeof(struct be_dma_mem)); + cmd.size = sizeof(struct be_cmd_req_set_profile_config); + cmd.va = pci_alloc_consistent(adapter->pdev, cmd.size, &cmd.dma); + if (!cmd.va) + return -ENOMEM; + + req = cmd.va; + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON, + OPCODE_COMMON_SET_PROFILE_CONFIG, cmd.size, + &wrb, &cmd); + req->hdr.version = version; + req->hdr.domain = domain; + req->desc_count = cpu_to_le32(count); + memcpy(req->desc, desc, size); + + status = be_cmd_notify_wait(adapter, &wrb); + + if (cmd.va) + pci_free_consistent(adapter->pdev, cmd.size, cmd.va, cmd.dma); + return status; +} + +/* Mark all fields invalid */ +static void be_reset_nic_desc(struct be_nic_res_desc *nic) +{ + memset(nic, 0, sizeof(*nic)); + nic->unicast_mac_count = 0xFFFF; + nic->mcc_count = 0xFFFF; + nic->vlan_count = 0xFFFF; + nic->mcast_mac_count = 0xFFFF; + nic->txq_count = 0xFFFF; + nic->rq_count = 0xFFFF; + nic->rssq_count = 0xFFFF; + nic->lro_count = 0xFFFF; + nic->cq_count = 0xFFFF; + nic->toe_conn_count = 0xFFFF; + nic->eq_count = 0xFFFF; + nic->iface_count = 0xFFFF; + nic->link_param = 0xFF; + nic->channel_id_param = cpu_to_le16(0xF000); + nic->acpi_params = 0xFF; + nic->wol_param = 0x0F; + nic->tunnel_iface_count = 0xFFFF; + nic->direct_tenant_iface_count = 0xFFFF; + nic->bw_min = 0xFFFFFFFF; + nic->bw_max = 0xFFFFFFFF; +} + +/* Mark all fields invalid */ +static void be_reset_pcie_desc(struct be_pcie_res_desc *pcie) +{ + memset(pcie, 0, sizeof(*pcie)); + pcie->sriov_state = 0xFF; + pcie->pf_state = 0xFF; + pcie->pf_type = 0xFF; + pcie->num_vfs = 0xFFFF; +} + +int be_cmd_config_qos(struct be_adapter *adapter, u32 max_rate, u16 link_speed, + u8 domain) +{ + struct be_nic_res_desc nic_desc; + u32 bw_percent; + u16 version = 0; + + if (BE3_chip(adapter)) + return be_cmd_set_qos(adapter, max_rate / 10, domain); + + be_reset_nic_desc(&nic_desc); + nic_desc.pf_num = adapter->pf_number; + nic_desc.vf_num = domain; + if (lancer_chip(adapter)) { + nic_desc.hdr.desc_type = NIC_RESOURCE_DESC_TYPE_V0; + nic_desc.hdr.desc_len = RESOURCE_DESC_SIZE_V0; + nic_desc.flags = (1 << QUN_SHIFT) | (1 << IMM_SHIFT) | + (1 << NOSV_SHIFT); + nic_desc.bw_max = cpu_to_le32(max_rate / 10); + } else { + version = 1; + nic_desc.hdr.desc_type = NIC_RESOURCE_DESC_TYPE_V1; + nic_desc.hdr.desc_len = RESOURCE_DESC_SIZE_V1; + nic_desc.flags = (1 << IMM_SHIFT) | (1 << NOSV_SHIFT); + bw_percent = max_rate ? (max_rate * 100) / link_speed : 100; + nic_desc.bw_max = cpu_to_le32(bw_percent); + } + + return be_cmd_set_profile_config(adapter, &nic_desc, + nic_desc.hdr.desc_len, + 1, version, domain); +} + +int be_cmd_set_sriov_config(struct be_adapter *adapter, + struct be_resources res, u16 num_vfs) +{ + struct { + struct be_pcie_res_desc pcie; + struct be_nic_res_desc nic_vft; + } __packed desc; + u16 vf_q_count; + + if (BEx_chip(adapter) || lancer_chip(adapter)) + return 0; + + /* PF PCIE descriptor */ + be_reset_pcie_desc(&desc.pcie); + desc.pcie.hdr.desc_type = PCIE_RESOURCE_DESC_TYPE_V1; + desc.pcie.hdr.desc_len = RESOURCE_DESC_SIZE_V1; + desc.pcie.flags = (1 << IMM_SHIFT) | (1 << NOSV_SHIFT); + desc.pcie.pf_num = adapter->pdev->devfn; + desc.pcie.sriov_state = num_vfs ? 1 : 0; + desc.pcie.num_vfs = cpu_to_le16(num_vfs); + + /* VF NIC Template descriptor */ + be_reset_nic_desc(&desc.nic_vft); + desc.nic_vft.hdr.desc_type = NIC_RESOURCE_DESC_TYPE_V1; + desc.nic_vft.hdr.desc_len = RESOURCE_DESC_SIZE_V1; + desc.nic_vft.flags = (1 << VFT_SHIFT) | (1 << IMM_SHIFT) | + (1 << NOSV_SHIFT); + desc.nic_vft.pf_num = adapter->pdev->devfn; + desc.nic_vft.vf_num = 0; + + if (num_vfs && res.vf_if_cap_flags & BE_IF_FLAGS_RSS) { + /* If number of VFs requested is 8 less than max supported, + * assign 8 queue pairs to the PF and divide the remaining + * resources evenly among the VFs + */ + if (num_vfs < (be_max_vfs(adapter) - 8)) + vf_q_count = (res.max_rss_qs - 8) / num_vfs; + else + vf_q_count = res.max_rss_qs / num_vfs; + + desc.nic_vft.rq_count = cpu_to_le16(vf_q_count); + desc.nic_vft.txq_count = cpu_to_le16(vf_q_count); + desc.nic_vft.rssq_count = cpu_to_le16(vf_q_count - 1); + desc.nic_vft.cq_count = cpu_to_le16(3 * vf_q_count); + } else { + desc.nic_vft.txq_count = cpu_to_le16(1); + desc.nic_vft.rq_count = cpu_to_le16(1); + desc.nic_vft.rssq_count = cpu_to_le16(0); + /* One CQ for each TX, RX and MCCQ */ + desc.nic_vft.cq_count = cpu_to_le16(3); + } + + return be_cmd_set_profile_config(adapter, &desc, + 2 * RESOURCE_DESC_SIZE_V1, 2, 1, 0); +} + +int be_cmd_manage_iface(struct be_adapter *adapter, u32 iface, u8 op) +{ + struct be_mcc_wrb *wrb; + struct be_cmd_req_manage_iface_filters *req; + int status; + + if (iface == 0xFFFFFFFF) + return -1; + + spin_lock_bh(&adapter->mcc_lock); + + wrb = wrb_from_mccq(adapter); + if (!wrb) { + status = -EBUSY; + goto err; + } + req = embedded_payload(wrb); + + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON, + OPCODE_COMMON_MANAGE_IFACE_FILTERS, sizeof(*req), + wrb, NULL); + req->op = op; + req->target_iface_id = cpu_to_le32(iface); + + status = be_mcc_notify_wait(adapter); +err: + spin_unlock_bh(&adapter->mcc_lock); + return status; +} + +int be_cmd_set_vxlan_port(struct be_adapter *adapter, __be16 port) +{ + struct be_port_res_desc port_desc; + + memset(&port_desc, 0, sizeof(port_desc)); + port_desc.hdr.desc_type = PORT_RESOURCE_DESC_TYPE_V1; + port_desc.hdr.desc_len = RESOURCE_DESC_SIZE_V1; + port_desc.flags = (1 << IMM_SHIFT) | (1 << NOSV_SHIFT); + port_desc.link_num = adapter->hba_port_num; + if (port) { + port_desc.nv_flags = NV_TYPE_VXLAN | (1 << SOCVID_SHIFT) | + (1 << RCVID_SHIFT); + port_desc.nv_port = swab16(port); + } else { + port_desc.nv_flags = NV_TYPE_DISABLED; + port_desc.nv_port = 0; + } + + return be_cmd_set_profile_config(adapter, &port_desc, + RESOURCE_DESC_SIZE_V1, 1, 1, 0); +} + +int be_cmd_get_if_id(struct be_adapter *adapter, struct be_vf_cfg *vf_cfg, + int vf_num) +{ + struct be_mcc_wrb *wrb; + struct be_cmd_req_get_iface_list *req; + struct be_cmd_resp_get_iface_list *resp; + int status; + + spin_lock_bh(&adapter->mcc_lock); + + wrb = wrb_from_mccq(adapter); + if (!wrb) { + status = -EBUSY; + goto err; + } + req = embedded_payload(wrb); + + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON, + OPCODE_COMMON_GET_IFACE_LIST, sizeof(*resp), + wrb, NULL); + req->hdr.domain = vf_num + 1; + + status = be_mcc_notify_wait(adapter); + if (!status) { + resp = (struct be_cmd_resp_get_iface_list *)req; + vf_cfg->if_handle = le32_to_cpu(resp->if_desc.if_id); + } + +err: + spin_unlock_bh(&adapter->mcc_lock); + return status; +} + +static int lancer_wait_idle(struct be_adapter *adapter) +{ +#define SLIPORT_IDLE_TIMEOUT 30 + u32 reg_val; + int status = 0, i; + + for (i = 0; i < SLIPORT_IDLE_TIMEOUT; i++) { + reg_val = ioread32(adapter->db + PHYSDEV_CONTROL_OFFSET); + if ((reg_val & PHYSDEV_CONTROL_INP_MASK) == 0) + break; + + ssleep(1); + } + + if (i == SLIPORT_IDLE_TIMEOUT) + status = -1; + + return status; +} + +int lancer_physdev_ctrl(struct be_adapter *adapter, u32 mask) +{ + int status = 0; + + status = lancer_wait_idle(adapter); + if (status) + return status; + + iowrite32(mask, adapter->db + PHYSDEV_CONTROL_OFFSET); + + return status; +} + +/* Routine to check whether dump image is present or not */ +bool dump_present(struct be_adapter *adapter) +{ + u32 sliport_status = 0; + + sliport_status = ioread32(adapter->db + SLIPORT_STATUS_OFFSET); + return !!(sliport_status & SLIPORT_STATUS_DIP_MASK); +} + +int lancer_initiate_dump(struct be_adapter *adapter) +{ + struct device *dev = &adapter->pdev->dev; + int status; + + if (dump_present(adapter)) { + dev_info(dev, "Previous dump not cleared, not forcing dump\n"); + return -EEXIST; + } + + /* give firmware reset and diagnostic dump */ + status = lancer_physdev_ctrl(adapter, PHYSDEV_CONTROL_FW_RESET_MASK | + PHYSDEV_CONTROL_DD_MASK); + if (status < 0) { + dev_err(dev, "FW reset failed\n"); + return status; + } + + status = lancer_wait_idle(adapter); + if (status) + return status; + + if (!dump_present(adapter)) { + dev_err(dev, "FW dump not generated\n"); + return -EIO; + } + + return 0; +} + +int lancer_delete_dump(struct be_adapter *adapter) +{ + int status; + + status = lancer_cmd_delete_object(adapter, LANCER_FW_DUMP_FILE); + return be_cmd_status(status); +} + +/* Uses sync mcc */ +int be_cmd_enable_vf(struct be_adapter *adapter, u8 domain) +{ + struct be_mcc_wrb *wrb; + struct be_cmd_enable_disable_vf *req; + int status; + + if (BEx_chip(adapter)) + return 0; + + spin_lock_bh(&adapter->mcc_lock); + + wrb = wrb_from_mccq(adapter); + if (!wrb) { + status = -EBUSY; + goto err; + } + + req = embedded_payload(wrb); + + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON, + OPCODE_COMMON_ENABLE_DISABLE_VF, sizeof(*req), + wrb, NULL); + + req->hdr.domain = domain; + req->enable = 1; + status = be_mcc_notify_wait(adapter); +err: + spin_unlock_bh(&adapter->mcc_lock); + return status; +} + +int be_cmd_intr_set(struct be_adapter *adapter, bool intr_enable) +{ + struct be_mcc_wrb *wrb; + struct be_cmd_req_intr_set *req; + int status; + + if (mutex_lock_interruptible(&adapter->mbox_lock)) + return -1; + + wrb = wrb_from_mbox(adapter); + + req = embedded_payload(wrb); + + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON, + OPCODE_COMMON_SET_INTERRUPT_ENABLE, sizeof(*req), + wrb, NULL); + + req->intr_enabled = intr_enable; + + status = be_mbox_notify_wait(adapter); + + mutex_unlock(&adapter->mbox_lock); + return status; +} + +/* Uses MBOX */ +int be_cmd_get_active_profile(struct be_adapter *adapter, u16 *profile_id) +{ + struct be_cmd_req_get_active_profile *req; + struct be_mcc_wrb *wrb; + int status; + + if (mutex_lock_interruptible(&adapter->mbox_lock)) + return -1; + + wrb = wrb_from_mbox(adapter); + if (!wrb) { + status = -EBUSY; + goto err; + } + + req = embedded_payload(wrb); + + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON, + OPCODE_COMMON_GET_ACTIVE_PROFILE, sizeof(*req), + wrb, NULL); + + status = be_mbox_notify_wait(adapter); + if (!status) { + struct be_cmd_resp_get_active_profile *resp = + embedded_payload(wrb); + + *profile_id = le16_to_cpu(resp->active_profile_id); + } + +err: + mutex_unlock(&adapter->mbox_lock); + return status; +} + +int be_cmd_set_logical_link_config(struct be_adapter *adapter, + int link_state, u8 domain) +{ + struct be_mcc_wrb *wrb; + struct be_cmd_req_set_ll_link *req; + int status; + + if (BEx_chip(adapter) || lancer_chip(adapter)) + return 0; + + spin_lock_bh(&adapter->mcc_lock); + + wrb = wrb_from_mccq(adapter); + if (!wrb) { + status = -EBUSY; + goto err; + } + + req = embedded_payload(wrb); + + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON, + OPCODE_COMMON_SET_LOGICAL_LINK_CONFIG, + sizeof(*req), wrb, NULL); + + req->hdr.version = 1; + req->hdr.domain = domain; + + if (link_state == IFLA_VF_LINK_STATE_ENABLE) + req->link_config |= 1; + + if (link_state == IFLA_VF_LINK_STATE_AUTO) + req->link_config |= 1 << PLINK_TRACK_SHIFT; + + status = be_mcc_notify_wait(adapter); +err: + spin_unlock_bh(&adapter->mcc_lock); + return status; +} + +int be_roce_mcc_cmd(void *netdev_handle, void *wrb_payload, + int wrb_payload_size, u16 *cmd_status, u16 *ext_status) +{ + struct be_adapter *adapter = netdev_priv(netdev_handle); + struct be_mcc_wrb *wrb; + struct be_cmd_req_hdr *hdr = (struct be_cmd_req_hdr *)wrb_payload; + struct be_cmd_req_hdr *req; + struct be_cmd_resp_hdr *resp; + int status; + + spin_lock_bh(&adapter->mcc_lock); + + wrb = wrb_from_mccq(adapter); + if (!wrb) { + status = -EBUSY; + goto err; + } + req = embedded_payload(wrb); + resp = embedded_payload(wrb); + + be_wrb_cmd_hdr_prepare(req, hdr->subsystem, + hdr->opcode, wrb_payload_size, wrb, NULL); + memcpy(req, wrb_payload, wrb_payload_size); + be_dws_cpu_to_le(req, wrb_payload_size); + + status = be_mcc_notify_wait(adapter); + if (cmd_status) + *cmd_status = (status & 0xffff); + if (ext_status) + *ext_status = 0; + memcpy(wrb_payload, resp, sizeof(*resp) + resp->response_length); + be_dws_le_to_cpu(wrb_payload, sizeof(*resp) + resp->response_length); +err: + spin_unlock_bh(&adapter->mcc_lock); + return status; +} +EXPORT_SYMBOL(be_roce_mcc_cmd); diff --git a/drivers/net/ethernet/emulex/benet/be_cmds.h b/drivers/net/ethernet/emulex/benet/be_cmds.h new file mode 100644 index 0000000..eb5085d --- /dev/null +++ b/drivers/net/ethernet/emulex/benet/be_cmds.h @@ -0,0 +1,2154 @@ +/* + * Copyright (C) 2005 - 2014 Emulex + * All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. The full GNU General + * Public License is included in this distribution in the file called COPYING. + * + * Contact Information: + * linux-drivers@emulex.com + * + * Emulex + * 3333 Susan Street + * Costa Mesa, CA 92626 + */ + +/* + * The driver sends configuration and managements command requests to the + * firmware in the BE. These requests are communicated to the processor + * using Work Request Blocks (WRBs) submitted to the MCC-WRB ring or via one + * WRB inside a MAILBOX. + * The commands are serviced by the ARM processor in the BladeEngine's MPU. + */ + +struct be_sge { + u32 pa_lo; + u32 pa_hi; + u32 len; +}; + +#define MCC_WRB_EMBEDDED_MASK 1 /* bit 0 of dword 0*/ +#define MCC_WRB_SGE_CNT_SHIFT 3 /* bits 3 - 7 of dword 0 */ +#define MCC_WRB_SGE_CNT_MASK 0x1F /* bits 3 - 7 of dword 0 */ +struct be_mcc_wrb { + u32 embedded; /* dword 0 */ + u32 payload_length; /* dword 1 */ + u32 tag0; /* dword 2 */ + u32 tag1; /* dword 3 */ + u32 rsvd; /* dword 4 */ + union { + u8 embedded_payload[236]; /* used by embedded cmds */ + struct be_sge sgl[19]; /* used by non-embedded cmds */ + } payload; +}; + +#define CQE_FLAGS_VALID_MASK (1 << 31) +#define CQE_FLAGS_ASYNC_MASK (1 << 30) +#define CQE_FLAGS_COMPLETED_MASK (1 << 28) +#define CQE_FLAGS_CONSUMED_MASK (1 << 27) + +/* Completion Status */ +enum mcc_base_status { + MCC_STATUS_SUCCESS = 0, + MCC_STATUS_FAILED = 1, + MCC_STATUS_ILLEGAL_REQUEST = 2, + MCC_STATUS_ILLEGAL_FIELD = 3, + MCC_STATUS_INSUFFICIENT_BUFFER = 4, + MCC_STATUS_UNAUTHORIZED_REQUEST = 5, + MCC_STATUS_NOT_SUPPORTED = 66, + MCC_STATUS_FEATURE_NOT_SUPPORTED = 68 +}; + +/* Additional status */ +enum mcc_addl_status { + MCC_ADDL_STATUS_INSUFFICIENT_RESOURCES = 0x16, + MCC_ADDL_STATUS_FLASH_IMAGE_CRC_MISMATCH = 0x4d, + MCC_ADDL_STATUS_TOO_MANY_INTERFACES = 0x4a +}; + +#define CQE_BASE_STATUS_MASK 0xFFFF +#define CQE_BASE_STATUS_SHIFT 0 /* bits 0 - 15 */ +#define CQE_ADDL_STATUS_MASK 0xFF +#define CQE_ADDL_STATUS_SHIFT 16 /* bits 16 - 31 */ + +#define base_status(status) \ + ((enum mcc_base_status) \ + (status > 0 ? (status & CQE_BASE_STATUS_MASK) : 0)) +#define addl_status(status) \ + ((enum mcc_addl_status) \ + (status > 0 ? (status >> CQE_ADDL_STATUS_SHIFT) & \ + CQE_ADDL_STATUS_MASK : 0)) + +struct be_mcc_compl { + u32 status; /* dword 0 */ + u32 tag0; /* dword 1 */ + u32 tag1; /* dword 2 */ + u32 flags; /* dword 3 */ +}; + +/* When the async bit of mcc_compl flags is set, flags + * is interpreted as follows: + */ +#define ASYNC_EVENT_CODE_SHIFT 8 /* bits 8 - 15 */ +#define ASYNC_EVENT_CODE_MASK 0xFF +#define ASYNC_EVENT_TYPE_SHIFT 16 +#define ASYNC_EVENT_TYPE_MASK 0xFF +#define ASYNC_EVENT_CODE_LINK_STATE 0x1 +#define ASYNC_EVENT_CODE_GRP_5 0x5 +#define ASYNC_EVENT_QOS_SPEED 0x1 +#define ASYNC_EVENT_COS_PRIORITY 0x2 +#define ASYNC_EVENT_PVID_STATE 0x3 +#define ASYNC_EVENT_CODE_QNQ 0x6 +#define ASYNC_DEBUG_EVENT_TYPE_QNQ 1 + +enum { + LINK_DOWN = 0x0, + LINK_UP = 0x1 +}; +#define LINK_STATUS_MASK 0x1 +#define LOGICAL_LINK_STATUS_MASK 0x2 + +/* When the event code of compl->flags is link-state, the mcc_compl + * must be interpreted as follows + */ +struct be_async_event_link_state { + u8 physical_port; + u8 port_link_status; + u8 port_duplex; + u8 port_speed; + u8 port_fault; + u8 rsvd0[7]; + u32 flags; +} __packed; + +/* When the event code of compl->flags is GRP-5 and event_type is QOS_SPEED + * the mcc_compl must be interpreted as follows + */ +struct be_async_event_grp5_qos_link_speed { + u8 physical_port; + u8 rsvd[5]; + u16 qos_link_speed; + u32 event_tag; + u32 flags; +} __packed; + +/* When the event code of compl->flags is GRP5 and event type is + * CoS-Priority, the mcc_compl must be interpreted as follows + */ +struct be_async_event_grp5_cos_priority { + u8 physical_port; + u8 available_priority_bmap; + u8 reco_default_priority; + u8 valid; + u8 rsvd0; + u8 event_tag; + u32 flags; +} __packed; + +/* When the event code of compl->flags is GRP5 and event type is + * PVID state, the mcc_compl must be interpreted as follows + */ +struct be_async_event_grp5_pvid_state { + u8 enabled; + u8 rsvd0; + u16 tag; + u32 event_tag; + u32 rsvd1; + u32 flags; +} __packed; + +/* async event indicating outer VLAN tag in QnQ */ +struct be_async_event_qnq { + u8 valid; /* Indicates if outer VLAN is valid */ + u8 rsvd0; + u16 vlan_tag; + u32 event_tag; + u8 rsvd1[4]; + u32 flags; +} __packed; + +struct be_mcc_mailbox { + struct be_mcc_wrb wrb; + struct be_mcc_compl compl; +}; + +#define CMD_SUBSYSTEM_COMMON 0x1 +#define CMD_SUBSYSTEM_ETH 0x3 +#define CMD_SUBSYSTEM_LOWLEVEL 0xb + +#define OPCODE_COMMON_NTWK_MAC_QUERY 1 +#define OPCODE_COMMON_NTWK_MAC_SET 2 +#define OPCODE_COMMON_NTWK_MULTICAST_SET 3 +#define OPCODE_COMMON_NTWK_VLAN_CONFIG 4 +#define OPCODE_COMMON_NTWK_LINK_STATUS_QUERY 5 +#define OPCODE_COMMON_READ_FLASHROM 6 +#define OPCODE_COMMON_WRITE_FLASHROM 7 +#define OPCODE_COMMON_CQ_CREATE 12 +#define OPCODE_COMMON_EQ_CREATE 13 +#define OPCODE_COMMON_MCC_CREATE 21 +#define OPCODE_COMMON_SET_QOS 28 +#define OPCODE_COMMON_MCC_CREATE_EXT 90 +#define OPCODE_COMMON_SEEPROM_READ 30 +#define OPCODE_COMMON_GET_CNTL_ATTRIBUTES 32 +#define OPCODE_COMMON_NTWK_RX_FILTER 34 +#define OPCODE_COMMON_GET_FW_VERSION 35 +#define OPCODE_COMMON_SET_FLOW_CONTROL 36 +#define OPCODE_COMMON_GET_FLOW_CONTROL 37 +#define OPCODE_COMMON_SET_FRAME_SIZE 39 +#define OPCODE_COMMON_MODIFY_EQ_DELAY 41 +#define OPCODE_COMMON_FIRMWARE_CONFIG 42 +#define OPCODE_COMMON_NTWK_INTERFACE_CREATE 50 +#define OPCODE_COMMON_NTWK_INTERFACE_DESTROY 51 +#define OPCODE_COMMON_MCC_DESTROY 53 +#define OPCODE_COMMON_CQ_DESTROY 54 +#define OPCODE_COMMON_EQ_DESTROY 55 +#define OPCODE_COMMON_QUERY_FIRMWARE_CONFIG 58 +#define OPCODE_COMMON_NTWK_PMAC_ADD 59 +#define OPCODE_COMMON_NTWK_PMAC_DEL 60 +#define OPCODE_COMMON_FUNCTION_RESET 61 +#define OPCODE_COMMON_MANAGE_FAT 68 +#define OPCODE_COMMON_ENABLE_DISABLE_BEACON 69 +#define OPCODE_COMMON_GET_BEACON_STATE 70 +#define OPCODE_COMMON_READ_TRANSRECV_DATA 73 +#define OPCODE_COMMON_GET_PORT_NAME 77 +#define OPCODE_COMMON_SET_LOGICAL_LINK_CONFIG 80 +#define OPCODE_COMMON_SET_INTERRUPT_ENABLE 89 +#define OPCODE_COMMON_SET_FN_PRIVILEGES 100 +#define OPCODE_COMMON_GET_PHY_DETAILS 102 +#define OPCODE_COMMON_SET_DRIVER_FUNCTION_CAP 103 +#define OPCODE_COMMON_GET_CNTL_ADDITIONAL_ATTRIBUTES 121 +#define OPCODE_COMMON_GET_EXT_FAT_CAPABILITES 125 +#define OPCODE_COMMON_SET_EXT_FAT_CAPABILITES 126 +#define OPCODE_COMMON_GET_MAC_LIST 147 +#define OPCODE_COMMON_SET_MAC_LIST 148 +#define OPCODE_COMMON_GET_HSW_CONFIG 152 +#define OPCODE_COMMON_GET_FUNC_CONFIG 160 +#define OPCODE_COMMON_GET_PROFILE_CONFIG 164 +#define OPCODE_COMMON_SET_PROFILE_CONFIG 165 +#define OPCODE_COMMON_GET_ACTIVE_PROFILE 167 +#define OPCODE_COMMON_SET_HSW_CONFIG 153 +#define OPCODE_COMMON_GET_FN_PRIVILEGES 170 +#define OPCODE_COMMON_READ_OBJECT 171 +#define OPCODE_COMMON_WRITE_OBJECT 172 +#define OPCODE_COMMON_DELETE_OBJECT 174 +#define OPCODE_COMMON_MANAGE_IFACE_FILTERS 193 +#define OPCODE_COMMON_GET_IFACE_LIST 194 +#define OPCODE_COMMON_ENABLE_DISABLE_VF 196 + +#define OPCODE_ETH_RSS_CONFIG 1 +#define OPCODE_ETH_ACPI_CONFIG 2 +#define OPCODE_ETH_PROMISCUOUS 3 +#define OPCODE_ETH_GET_STATISTICS 4 +#define OPCODE_ETH_TX_CREATE 7 +#define OPCODE_ETH_RX_CREATE 8 +#define OPCODE_ETH_TX_DESTROY 9 +#define OPCODE_ETH_RX_DESTROY 10 +#define OPCODE_ETH_ACPI_WOL_MAGIC_CONFIG 12 +#define OPCODE_ETH_GET_PPORT_STATS 18 + +#define OPCODE_LOWLEVEL_HOST_DDR_DMA 17 +#define OPCODE_LOWLEVEL_LOOPBACK_TEST 18 +#define OPCODE_LOWLEVEL_SET_LOOPBACK_MODE 19 + +struct be_cmd_req_hdr { + u8 opcode; /* dword 0 */ + u8 subsystem; /* dword 0 */ + u8 port_number; /* dword 0 */ + u8 domain; /* dword 0 */ + u32 timeout; /* dword 1 */ + u32 request_length; /* dword 2 */ + u8 version; /* dword 3 */ + u8 rsvd[3]; /* dword 3 */ +}; + +#define RESP_HDR_INFO_OPCODE_SHIFT 0 /* bits 0 - 7 */ +#define RESP_HDR_INFO_SUBSYS_SHIFT 8 /* bits 8 - 15 */ +struct be_cmd_resp_hdr { + u8 opcode; /* dword 0 */ + u8 subsystem; /* dword 0 */ + u8 rsvd[2]; /* dword 0 */ + u8 base_status; /* dword 1 */ + u8 addl_status; /* dword 1 */ + u8 rsvd1[2]; /* dword 1 */ + u32 response_length; /* dword 2 */ + u32 actual_resp_len; /* dword 3 */ +}; + +struct phys_addr { + u32 lo; + u32 hi; +}; + +/************************** + * BE Command definitions * + **************************/ + +/* Pseudo amap definition in which each bit of the actual structure is defined + * as a byte: used to calculate offset/shift/mask of each field */ +struct amap_eq_context { + u8 cidx[13]; /* dword 0*/ + u8 rsvd0[3]; /* dword 0*/ + u8 epidx[13]; /* dword 0*/ + u8 valid; /* dword 0*/ + u8 rsvd1; /* dword 0*/ + u8 size; /* dword 0*/ + u8 pidx[13]; /* dword 1*/ + u8 rsvd2[3]; /* dword 1*/ + u8 pd[10]; /* dword 1*/ + u8 count[3]; /* dword 1*/ + u8 solevent; /* dword 1*/ + u8 stalled; /* dword 1*/ + u8 armed; /* dword 1*/ + u8 rsvd3[4]; /* dword 2*/ + u8 func[8]; /* dword 2*/ + u8 rsvd4; /* dword 2*/ + u8 delaymult[10]; /* dword 2*/ + u8 rsvd5[2]; /* dword 2*/ + u8 phase[2]; /* dword 2*/ + u8 nodelay; /* dword 2*/ + u8 rsvd6[4]; /* dword 2*/ + u8 rsvd7[32]; /* dword 3*/ +} __packed; + +struct be_cmd_req_eq_create { + struct be_cmd_req_hdr hdr; + u16 num_pages; /* sword */ + u16 rsvd0; /* sword */ + u8 context[sizeof(struct amap_eq_context) / 8]; + struct phys_addr pages[8]; +} __packed; + +struct be_cmd_resp_eq_create { + struct be_cmd_resp_hdr resp_hdr; + u16 eq_id; /* sword */ + u16 msix_idx; /* available only in v2 */ +} __packed; + +/******************** Mac query ***************************/ +enum { + MAC_ADDRESS_TYPE_STORAGE = 0x0, + MAC_ADDRESS_TYPE_NETWORK = 0x1, + MAC_ADDRESS_TYPE_PD = 0x2, + MAC_ADDRESS_TYPE_MANAGEMENT = 0x3 +}; + +struct mac_addr { + u16 size_of_struct; + u8 addr[ETH_ALEN]; +} __packed; + +struct be_cmd_req_mac_query { + struct be_cmd_req_hdr hdr; + u8 type; + u8 permanent; + u16 if_id; + u32 pmac_id; +} __packed; + +struct be_cmd_resp_mac_query { + struct be_cmd_resp_hdr hdr; + struct mac_addr mac; +}; + +/******************** PMac Add ***************************/ +struct be_cmd_req_pmac_add { + struct be_cmd_req_hdr hdr; + u32 if_id; + u8 mac_address[ETH_ALEN]; + u8 rsvd0[2]; +} __packed; + +struct be_cmd_resp_pmac_add { + struct be_cmd_resp_hdr hdr; + u32 pmac_id; +}; + +/******************** PMac Del ***************************/ +struct be_cmd_req_pmac_del { + struct be_cmd_req_hdr hdr; + u32 if_id; + u32 pmac_id; +}; + +/******************** Create CQ ***************************/ +/* Pseudo amap definition in which each bit of the actual structure is defined + * as a byte: used to calculate offset/shift/mask of each field */ +struct amap_cq_context_be { + u8 cidx[11]; /* dword 0*/ + u8 rsvd0; /* dword 0*/ + u8 coalescwm[2]; /* dword 0*/ + u8 nodelay; /* dword 0*/ + u8 epidx[11]; /* dword 0*/ + u8 rsvd1; /* dword 0*/ + u8 count[2]; /* dword 0*/ + u8 valid; /* dword 0*/ + u8 solevent; /* dword 0*/ + u8 eventable; /* dword 0*/ + u8 pidx[11]; /* dword 1*/ + u8 rsvd2; /* dword 1*/ + u8 pd[10]; /* dword 1*/ + u8 eqid[8]; /* dword 1*/ + u8 stalled; /* dword 1*/ + u8 armed; /* dword 1*/ + u8 rsvd3[4]; /* dword 2*/ + u8 func[8]; /* dword 2*/ + u8 rsvd4[20]; /* dword 2*/ + u8 rsvd5[32]; /* dword 3*/ +} __packed; + +struct amap_cq_context_v2 { + u8 rsvd0[12]; /* dword 0*/ + u8 coalescwm[2]; /* dword 0*/ + u8 nodelay; /* dword 0*/ + u8 rsvd1[12]; /* dword 0*/ + u8 count[2]; /* dword 0*/ + u8 valid; /* dword 0*/ + u8 rsvd2; /* dword 0*/ + u8 eventable; /* dword 0*/ + u8 eqid[16]; /* dword 1*/ + u8 rsvd3[15]; /* dword 1*/ + u8 armed; /* dword 1*/ + u8 rsvd4[32]; /* dword 2*/ + u8 rsvd5[32]; /* dword 3*/ +} __packed; + +struct be_cmd_req_cq_create { + struct be_cmd_req_hdr hdr; + u16 num_pages; + u8 page_size; + u8 rsvd0; + u8 context[sizeof(struct amap_cq_context_be) / 8]; + struct phys_addr pages[8]; +} __packed; + + +struct be_cmd_resp_cq_create { + struct be_cmd_resp_hdr hdr; + u16 cq_id; + u16 rsvd0; +} __packed; + +struct be_cmd_req_get_fat { + struct be_cmd_req_hdr hdr; + u32 fat_operation; + u32 read_log_offset; + u32 read_log_length; + u32 data_buffer_size; + u32 data_buffer[1]; +} __packed; + +struct be_cmd_resp_get_fat { + struct be_cmd_resp_hdr hdr; + u32 log_size; + u32 read_log_length; + u32 rsvd[2]; + u32 data_buffer[1]; +} __packed; + + +/******************** Create MCCQ ***************************/ +/* Pseudo amap definition in which each bit of the actual structure is defined + * as a byte: used to calculate offset/shift/mask of each field */ +struct amap_mcc_context_be { + u8 con_index[14]; + u8 rsvd0[2]; + u8 ring_size[4]; + u8 fetch_wrb; + u8 fetch_r2t; + u8 cq_id[10]; + u8 prod_index[14]; + u8 fid[8]; + u8 pdid[9]; + u8 valid; + u8 rsvd1[32]; + u8 rsvd2[32]; +} __packed; + +struct amap_mcc_context_v1 { + u8 async_cq_id[16]; + u8 ring_size[4]; + u8 rsvd0[12]; + u8 rsvd1[31]; + u8 valid; + u8 async_cq_valid[1]; + u8 rsvd2[31]; + u8 rsvd3[32]; +} __packed; + +struct be_cmd_req_mcc_create { + struct be_cmd_req_hdr hdr; + u16 num_pages; + u16 cq_id; + u8 context[sizeof(struct amap_mcc_context_be) / 8]; + struct phys_addr pages[8]; +} __packed; + +struct be_cmd_req_mcc_ext_create { + struct be_cmd_req_hdr hdr; + u16 num_pages; + u16 cq_id; + u32 async_event_bitmap[1]; + u8 context[sizeof(struct amap_mcc_context_v1) / 8]; + struct phys_addr pages[8]; +} __packed; + +struct be_cmd_resp_mcc_create { + struct be_cmd_resp_hdr hdr; + u16 id; + u16 rsvd0; +} __packed; + +/******************** Create TxQ ***************************/ +#define BE_ETH_TX_RING_TYPE_STANDARD 2 +#define BE_ULP1_NUM 1 + +struct be_cmd_req_eth_tx_create { + struct be_cmd_req_hdr hdr; + u8 num_pages; + u8 ulp_num; + u16 type; + u16 if_id; + u8 queue_size; + u8 rsvd0; + u32 rsvd1; + u16 cq_id; + u16 rsvd2; + u32 rsvd3[13]; + struct phys_addr pages[8]; +} __packed; + +struct be_cmd_resp_eth_tx_create { + struct be_cmd_resp_hdr hdr; + u16 cid; + u16 rid; + u32 db_offset; + u32 rsvd0[4]; +} __packed; + +/******************** Create RxQ ***************************/ +struct be_cmd_req_eth_rx_create { + struct be_cmd_req_hdr hdr; + u16 cq_id; + u8 frag_size; + u8 num_pages; + struct phys_addr pages[2]; + u32 interface_id; + u16 max_frame_size; + u16 rsvd0; + u32 rss_queue; +} __packed; + +struct be_cmd_resp_eth_rx_create { + struct be_cmd_resp_hdr hdr; + u16 id; + u8 rss_id; + u8 rsvd0; +} __packed; + +/******************** Q Destroy ***************************/ +/* Type of Queue to be destroyed */ +enum { + QTYPE_EQ = 1, + QTYPE_CQ, + QTYPE_TXQ, + QTYPE_RXQ, + QTYPE_MCCQ +}; + +struct be_cmd_req_q_destroy { + struct be_cmd_req_hdr hdr; + u16 id; + u16 bypass_flush; /* valid only for rx q destroy */ +} __packed; + +/************ I/f Create (it's actually I/f Config Create)**********/ + +/* Capability flags for the i/f */ +enum be_if_flags { + BE_IF_FLAGS_RSS = 0x4, + BE_IF_FLAGS_PROMISCUOUS = 0x8, + BE_IF_FLAGS_BROADCAST = 0x10, + BE_IF_FLAGS_UNTAGGED = 0x20, + BE_IF_FLAGS_ULP = 0x40, + BE_IF_FLAGS_VLAN_PROMISCUOUS = 0x80, + BE_IF_FLAGS_VLAN = 0x100, + BE_IF_FLAGS_MCAST_PROMISCUOUS = 0x200, + BE_IF_FLAGS_PASS_L2_ERRORS = 0x400, + BE_IF_FLAGS_PASS_L3L4_ERRORS = 0x800, + BE_IF_FLAGS_MULTICAST = 0x1000 +}; + +#define BE_IF_CAP_FLAGS_WANT (BE_IF_FLAGS_RSS | BE_IF_FLAGS_PROMISCUOUS |\ + BE_IF_FLAGS_BROADCAST | BE_IF_FLAGS_VLAN_PROMISCUOUS |\ + BE_IF_FLAGS_VLAN | BE_IF_FLAGS_MCAST_PROMISCUOUS |\ + BE_IF_FLAGS_PASS_L3L4_ERRORS | BE_IF_FLAGS_MULTICAST |\ + BE_IF_FLAGS_UNTAGGED) + +/* An RX interface is an object with one or more MAC addresses and + * filtering capabilities. */ +struct be_cmd_req_if_create { + struct be_cmd_req_hdr hdr; + u32 version; /* ignore currently */ + u32 capability_flags; + u32 enable_flags; + u8 mac_addr[ETH_ALEN]; + u8 rsvd0; + u8 pmac_invalid; /* if set, don't attach the mac addr to the i/f */ + u32 vlan_tag; /* not used currently */ +} __packed; + +struct be_cmd_resp_if_create { + struct be_cmd_resp_hdr hdr; + u32 interface_id; + u32 pmac_id; +}; + +/****** I/f Destroy(it's actually I/f Config Destroy )**********/ +struct be_cmd_req_if_destroy { + struct be_cmd_req_hdr hdr; + u32 interface_id; +}; + +/*************** HW Stats Get **********************************/ +struct be_port_rxf_stats_v0 { + u32 rx_bytes_lsd; /* dword 0*/ + u32 rx_bytes_msd; /* dword 1*/ + u32 rx_total_frames; /* dword 2*/ + u32 rx_unicast_frames; /* dword 3*/ + u32 rx_multicast_frames; /* dword 4*/ + u32 rx_broadcast_frames; /* dword 5*/ + u32 rx_crc_errors; /* dword 6*/ + u32 rx_alignment_symbol_errors; /* dword 7*/ + u32 rx_pause_frames; /* dword 8*/ + u32 rx_control_frames; /* dword 9*/ + u32 rx_in_range_errors; /* dword 10*/ + u32 rx_out_range_errors; /* dword 11*/ + u32 rx_frame_too_long; /* dword 12*/ + u32 rx_address_filtered; /* dword 13*/ + u32 rx_vlan_filtered; /* dword 14*/ + u32 rx_dropped_too_small; /* dword 15*/ + u32 rx_dropped_too_short; /* dword 16*/ + u32 rx_dropped_header_too_small; /* dword 17*/ + u32 rx_dropped_tcp_length; /* dword 18*/ + u32 rx_dropped_runt; /* dword 19*/ + u32 rx_64_byte_packets; /* dword 20*/ + u32 rx_65_127_byte_packets; /* dword 21*/ + u32 rx_128_256_byte_packets; /* dword 22*/ + u32 rx_256_511_byte_packets; /* dword 23*/ + u32 rx_512_1023_byte_packets; /* dword 24*/ + u32 rx_1024_1518_byte_packets; /* dword 25*/ + u32 rx_1519_2047_byte_packets; /* dword 26*/ + u32 rx_2048_4095_byte_packets; /* dword 27*/ + u32 rx_4096_8191_byte_packets; /* dword 28*/ + u32 rx_8192_9216_byte_packets; /* dword 29*/ + u32 rx_ip_checksum_errs; /* dword 30*/ + u32 rx_tcp_checksum_errs; /* dword 31*/ + u32 rx_udp_checksum_errs; /* dword 32*/ + u32 rx_non_rss_packets; /* dword 33*/ + u32 rx_ipv4_packets; /* dword 34*/ + u32 rx_ipv6_packets; /* dword 35*/ + u32 rx_ipv4_bytes_lsd; /* dword 36*/ + u32 rx_ipv4_bytes_msd; /* dword 37*/ + u32 rx_ipv6_bytes_lsd; /* dword 38*/ + u32 rx_ipv6_bytes_msd; /* dword 39*/ + u32 rx_chute1_packets; /* dword 40*/ + u32 rx_chute2_packets; /* dword 41*/ + u32 rx_chute3_packets; /* dword 42*/ + u32 rx_management_packets; /* dword 43*/ + u32 rx_switched_unicast_packets; /* dword 44*/ + u32 rx_switched_multicast_packets; /* dword 45*/ + u32 rx_switched_broadcast_packets; /* dword 46*/ + u32 tx_bytes_lsd; /* dword 47*/ + u32 tx_bytes_msd; /* dword 48*/ + u32 tx_unicastframes; /* dword 49*/ + u32 tx_multicastframes; /* dword 50*/ + u32 tx_broadcastframes; /* dword 51*/ + u32 tx_pauseframes; /* dword 52*/ + u32 tx_controlframes; /* dword 53*/ + u32 tx_64_byte_packets; /* dword 54*/ + u32 tx_65_127_byte_packets; /* dword 55*/ + u32 tx_128_256_byte_packets; /* dword 56*/ + u32 tx_256_511_byte_packets; /* dword 57*/ + u32 tx_512_1023_byte_packets; /* dword 58*/ + u32 tx_1024_1518_byte_packets; /* dword 59*/ + u32 tx_1519_2047_byte_packets; /* dword 60*/ + u32 tx_2048_4095_byte_packets; /* dword 61*/ + u32 tx_4096_8191_byte_packets; /* dword 62*/ + u32 tx_8192_9216_byte_packets; /* dword 63*/ + u32 rx_fifo_overflow; /* dword 64*/ + u32 rx_input_fifo_overflow; /* dword 65*/ +}; + +struct be_rxf_stats_v0 { + struct be_port_rxf_stats_v0 port[2]; + u32 rx_drops_no_pbuf; /* dword 132*/ + u32 rx_drops_no_txpb; /* dword 133*/ + u32 rx_drops_no_erx_descr; /* dword 134*/ + u32 rx_drops_no_tpre_descr; /* dword 135*/ + u32 management_rx_port_packets; /* dword 136*/ + u32 management_rx_port_bytes; /* dword 137*/ + u32 management_rx_port_pause_frames; /* dword 138*/ + u32 management_rx_port_errors; /* dword 139*/ + u32 management_tx_port_packets; /* dword 140*/ + u32 management_tx_port_bytes; /* dword 141*/ + u32 management_tx_port_pause; /* dword 142*/ + u32 management_rx_port_rxfifo_overflow; /* dword 143*/ + u32 rx_drops_too_many_frags; /* dword 144*/ + u32 rx_drops_invalid_ring; /* dword 145*/ + u32 forwarded_packets; /* dword 146*/ + u32 rx_drops_mtu; /* dword 147*/ + u32 rsvd0[7]; + u32 port0_jabber_events; + u32 port1_jabber_events; + u32 rsvd1[6]; +}; + +struct be_erx_stats_v0 { + u32 rx_drops_no_fragments[44]; /* dwordS 0 to 43*/ + u32 rsvd[4]; +}; + +struct be_pmem_stats { + u32 eth_red_drops; + u32 rsvd[5]; +}; + +struct be_hw_stats_v0 { + struct be_rxf_stats_v0 rxf; + u32 rsvd[48]; + struct be_erx_stats_v0 erx; + struct be_pmem_stats pmem; +}; + +struct be_cmd_req_get_stats_v0 { + struct be_cmd_req_hdr hdr; + u8 rsvd[sizeof(struct be_hw_stats_v0)]; +}; + +struct be_cmd_resp_get_stats_v0 { + struct be_cmd_resp_hdr hdr; + struct be_hw_stats_v0 hw_stats; +}; + +struct lancer_pport_stats { + u32 tx_packets_lo; + u32 tx_packets_hi; + u32 tx_unicast_packets_lo; + u32 tx_unicast_packets_hi; + u32 tx_multicast_packets_lo; + u32 tx_multicast_packets_hi; + u32 tx_broadcast_packets_lo; + u32 tx_broadcast_packets_hi; + u32 tx_bytes_lo; + u32 tx_bytes_hi; + u32 tx_unicast_bytes_lo; + u32 tx_unicast_bytes_hi; + u32 tx_multicast_bytes_lo; + u32 tx_multicast_bytes_hi; + u32 tx_broadcast_bytes_lo; + u32 tx_broadcast_bytes_hi; + u32 tx_discards_lo; + u32 tx_discards_hi; + u32 tx_errors_lo; + u32 tx_errors_hi; + u32 tx_pause_frames_lo; + u32 tx_pause_frames_hi; + u32 tx_pause_on_frames_lo; + u32 tx_pause_on_frames_hi; + u32 tx_pause_off_frames_lo; + u32 tx_pause_off_frames_hi; + u32 tx_internal_mac_errors_lo; + u32 tx_internal_mac_errors_hi; + u32 tx_control_frames_lo; + u32 tx_control_frames_hi; + u32 tx_packets_64_bytes_lo; + u32 tx_packets_64_bytes_hi; + u32 tx_packets_65_to_127_bytes_lo; + u32 tx_packets_65_to_127_bytes_hi; + u32 tx_packets_128_to_255_bytes_lo; + u32 tx_packets_128_to_255_bytes_hi; + u32 tx_packets_256_to_511_bytes_lo; + u32 tx_packets_256_to_511_bytes_hi; + u32 tx_packets_512_to_1023_bytes_lo; + u32 tx_packets_512_to_1023_bytes_hi; + u32 tx_packets_1024_to_1518_bytes_lo; + u32 tx_packets_1024_to_1518_bytes_hi; + u32 tx_packets_1519_to_2047_bytes_lo; + u32 tx_packets_1519_to_2047_bytes_hi; + u32 tx_packets_2048_to_4095_bytes_lo; + u32 tx_packets_2048_to_4095_bytes_hi; + u32 tx_packets_4096_to_8191_bytes_lo; + u32 tx_packets_4096_to_8191_bytes_hi; + u32 tx_packets_8192_to_9216_bytes_lo; + u32 tx_packets_8192_to_9216_bytes_hi; + u32 tx_lso_packets_lo; + u32 tx_lso_packets_hi; + u32 rx_packets_lo; + u32 rx_packets_hi; + u32 rx_unicast_packets_lo; + u32 rx_unicast_packets_hi; + u32 rx_multicast_packets_lo; + u32 rx_multicast_packets_hi; + u32 rx_broadcast_packets_lo; + u32 rx_broadcast_packets_hi; + u32 rx_bytes_lo; + u32 rx_bytes_hi; + u32 rx_unicast_bytes_lo; + u32 rx_unicast_bytes_hi; + u32 rx_multicast_bytes_lo; + u32 rx_multicast_bytes_hi; + u32 rx_broadcast_bytes_lo; + u32 rx_broadcast_bytes_hi; + u32 rx_unknown_protos; + u32 rsvd_69; /* Word 69 is reserved */ + u32 rx_discards_lo; + u32 rx_discards_hi; + u32 rx_errors_lo; + u32 rx_errors_hi; + u32 rx_crc_errors_lo; + u32 rx_crc_errors_hi; + u32 rx_alignment_errors_lo; + u32 rx_alignment_errors_hi; + u32 rx_symbol_errors_lo; + u32 rx_symbol_errors_hi; + u32 rx_pause_frames_lo; + u32 rx_pause_frames_hi; + u32 rx_pause_on_frames_lo; + u32 rx_pause_on_frames_hi; + u32 rx_pause_off_frames_lo; + u32 rx_pause_off_frames_hi; + u32 rx_frames_too_long_lo; + u32 rx_frames_too_long_hi; + u32 rx_internal_mac_errors_lo; + u32 rx_internal_mac_errors_hi; + u32 rx_undersize_packets; + u32 rx_oversize_packets; + u32 rx_fragment_packets; + u32 rx_jabbers; + u32 rx_control_frames_lo; + u32 rx_control_frames_hi; + u32 rx_control_frames_unknown_opcode_lo; + u32 rx_control_frames_unknown_opcode_hi; + u32 rx_in_range_errors; + u32 rx_out_of_range_errors; + u32 rx_address_filtered; + u32 rx_vlan_filtered; + u32 rx_dropped_too_small; + u32 rx_dropped_too_short; + u32 rx_dropped_header_too_small; + u32 rx_dropped_invalid_tcp_length; + u32 rx_dropped_runt; + u32 rx_ip_checksum_errors; + u32 rx_tcp_checksum_errors; + u32 rx_udp_checksum_errors; + u32 rx_non_rss_packets; + u32 rsvd_111; + u32 rx_ipv4_packets_lo; + u32 rx_ipv4_packets_hi; + u32 rx_ipv6_packets_lo; + u32 rx_ipv6_packets_hi; + u32 rx_ipv4_bytes_lo; + u32 rx_ipv4_bytes_hi; + u32 rx_ipv6_bytes_lo; + u32 rx_ipv6_bytes_hi; + u32 rx_nic_packets_lo; + u32 rx_nic_packets_hi; + u32 rx_tcp_packets_lo; + u32 rx_tcp_packets_hi; + u32 rx_iscsi_packets_lo; + u32 rx_iscsi_packets_hi; + u32 rx_management_packets_lo; + u32 rx_management_packets_hi; + u32 rx_switched_unicast_packets_lo; + u32 rx_switched_unicast_packets_hi; + u32 rx_switched_multicast_packets_lo; + u32 rx_switched_multicast_packets_hi; + u32 rx_switched_broadcast_packets_lo; + u32 rx_switched_broadcast_packets_hi; + u32 num_forwards_lo; + u32 num_forwards_hi; + u32 rx_fifo_overflow; + u32 rx_input_fifo_overflow; + u32 rx_drops_too_many_frags_lo; + u32 rx_drops_too_many_frags_hi; + u32 rx_drops_invalid_queue; + u32 rsvd_141; + u32 rx_drops_mtu_lo; + u32 rx_drops_mtu_hi; + u32 rx_packets_64_bytes_lo; + u32 rx_packets_64_bytes_hi; + u32 rx_packets_65_to_127_bytes_lo; + u32 rx_packets_65_to_127_bytes_hi; + u32 rx_packets_128_to_255_bytes_lo; + u32 rx_packets_128_to_255_bytes_hi; + u32 rx_packets_256_to_511_bytes_lo; + u32 rx_packets_256_to_511_bytes_hi; + u32 rx_packets_512_to_1023_bytes_lo; + u32 rx_packets_512_to_1023_bytes_hi; + u32 rx_packets_1024_to_1518_bytes_lo; + u32 rx_packets_1024_to_1518_bytes_hi; + u32 rx_packets_1519_to_2047_bytes_lo; + u32 rx_packets_1519_to_2047_bytes_hi; + u32 rx_packets_2048_to_4095_bytes_lo; + u32 rx_packets_2048_to_4095_bytes_hi; + u32 rx_packets_4096_to_8191_bytes_lo; + u32 rx_packets_4096_to_8191_bytes_hi; + u32 rx_packets_8192_to_9216_bytes_lo; + u32 rx_packets_8192_to_9216_bytes_hi; +}; + +struct pport_stats_params { + u16 pport_num; + u8 rsvd; + u8 reset_stats; +}; + +struct lancer_cmd_req_pport_stats { + struct be_cmd_req_hdr hdr; + union { + struct pport_stats_params params; + u8 rsvd[sizeof(struct lancer_pport_stats)]; + } cmd_params; +}; + +struct lancer_cmd_resp_pport_stats { + struct be_cmd_resp_hdr hdr; + struct lancer_pport_stats pport_stats; +}; + +static inline struct lancer_pport_stats* + pport_stats_from_cmd(struct be_adapter *adapter) +{ + struct lancer_cmd_resp_pport_stats *cmd = adapter->stats_cmd.va; + return &cmd->pport_stats; +} + +struct be_cmd_req_get_cntl_addnl_attribs { + struct be_cmd_req_hdr hdr; + u8 rsvd[8]; +}; + +struct be_cmd_resp_get_cntl_addnl_attribs { + struct be_cmd_resp_hdr hdr; + u16 ipl_file_number; + u8 ipl_file_version; + u8 rsvd0; + u8 on_die_temperature; /* in degrees centigrade*/ + u8 rsvd1[3]; +}; + +struct be_cmd_req_vlan_config { + struct be_cmd_req_hdr hdr; + u8 interface_id; + u8 promiscuous; + u8 untagged; + u8 num_vlan; + u16 normal_vlan[64]; +} __packed; + +/******************* RX FILTER ******************************/ +#define BE_MAX_MC 64 /* set mcast promisc if > 64 */ +struct macaddr { + u8 byte[ETH_ALEN]; +}; + +struct be_cmd_req_rx_filter { + struct be_cmd_req_hdr hdr; + u32 global_flags_mask; + u32 global_flags; + u32 if_flags_mask; + u32 if_flags; + u32 if_id; + u32 mcast_num; + struct macaddr mcast_mac[BE_MAX_MC]; +}; + +/******************** Link Status Query *******************/ +struct be_cmd_req_link_status { + struct be_cmd_req_hdr hdr; + u32 rsvd; +}; + +enum { + PHY_LINK_DUPLEX_NONE = 0x0, + PHY_LINK_DUPLEX_HALF = 0x1, + PHY_LINK_DUPLEX_FULL = 0x2 +}; + +enum { + PHY_LINK_SPEED_ZERO = 0x0, /* => No link */ + PHY_LINK_SPEED_10MBPS = 0x1, + PHY_LINK_SPEED_100MBPS = 0x2, + PHY_LINK_SPEED_1GBPS = 0x3, + PHY_LINK_SPEED_10GBPS = 0x4, + PHY_LINK_SPEED_20GBPS = 0x5, + PHY_LINK_SPEED_25GBPS = 0x6, + PHY_LINK_SPEED_40GBPS = 0x7 +}; + +struct be_cmd_resp_link_status { + struct be_cmd_resp_hdr hdr; + u8 physical_port; + u8 mac_duplex; + u8 mac_speed; + u8 mac_fault; + u8 mgmt_mac_duplex; + u8 mgmt_mac_speed; + u16 link_speed; + u8 logical_link_status; + u8 rsvd1[3]; +} __packed; + +/******************** Port Identification ***************************/ +/* Identifies the type of port attached to NIC */ +struct be_cmd_req_port_type { + struct be_cmd_req_hdr hdr; + __le32 page_num; + __le32 port; +}; + +enum { + TR_PAGE_A0 = 0xa0, + TR_PAGE_A2 = 0xa2 +}; + +/* From SFF-8436 QSFP+ spec */ +#define QSFP_PLUS_CABLE_TYPE_OFFSET 0x83 +#define QSFP_PLUS_CR4_CABLE 0x8 +#define QSFP_PLUS_SR4_CABLE 0x4 +#define QSFP_PLUS_LR4_CABLE 0x2 + +/* From SFF-8472 spec */ +#define SFP_PLUS_SFF_8472_COMP 0x5E +#define SFP_PLUS_CABLE_TYPE_OFFSET 0x8 +#define SFP_PLUS_COPPER_CABLE 0x4 + +#define PAGE_DATA_LEN 256 +struct be_cmd_resp_port_type { + struct be_cmd_resp_hdr hdr; + u32 page_num; + u32 port; + u8 page_data[PAGE_DATA_LEN]; +}; + +/******************** Get FW Version *******************/ +struct be_cmd_req_get_fw_version { + struct be_cmd_req_hdr hdr; + u8 rsvd0[FW_VER_LEN]; + u8 rsvd1[FW_VER_LEN]; +} __packed; + +struct be_cmd_resp_get_fw_version { + struct be_cmd_resp_hdr hdr; + u8 firmware_version_string[FW_VER_LEN]; + u8 fw_on_flash_version_string[FW_VER_LEN]; +} __packed; + +/******************** Set Flow Contrl *******************/ +struct be_cmd_req_set_flow_control { + struct be_cmd_req_hdr hdr; + u16 tx_flow_control; + u16 rx_flow_control; +} __packed; + +/******************** Get Flow Contrl *******************/ +struct be_cmd_req_get_flow_control { + struct be_cmd_req_hdr hdr; + u32 rsvd; +}; + +struct be_cmd_resp_get_flow_control { + struct be_cmd_resp_hdr hdr; + u16 tx_flow_control; + u16 rx_flow_control; +} __packed; + +/******************** Modify EQ Delay *******************/ +struct be_set_eqd { + u32 eq_id; + u32 phase; + u32 delay_multiplier; +}; + +struct be_cmd_req_modify_eq_delay { + struct be_cmd_req_hdr hdr; + u32 num_eq; + struct be_set_eqd set_eqd[MAX_EVT_QS]; +} __packed; + +/******************** Get FW Config *******************/ +/* The HW can come up in either of the following multi-channel modes + * based on the skew/IPL. + */ +#define RDMA_ENABLED 0x4 +#define QNQ_MODE 0x400 +#define VNIC_MODE 0x20000 +#define UMC_ENABLED 0x1000000 +struct be_cmd_req_query_fw_cfg { + struct be_cmd_req_hdr hdr; + u32 rsvd[31]; +}; + +struct be_cmd_resp_query_fw_cfg { + struct be_cmd_resp_hdr hdr; + u32 be_config_number; + u32 asic_revision; + u32 phys_port; + u32 function_mode; + u32 rsvd[26]; + u32 function_caps; +}; + +/******************** RSS Config ****************************************/ +/* RSS type Input parameters used to compute RX hash + * RSS_ENABLE_IPV4 SRC IPv4, DST IPv4 + * RSS_ENABLE_TCP_IPV4 SRC IPv4, DST IPv4, TCP SRC PORT, TCP DST PORT + * RSS_ENABLE_IPV6 SRC IPv6, DST IPv6 + * RSS_ENABLE_TCP_IPV6 SRC IPv6, DST IPv6, TCP SRC PORT, TCP DST PORT + * RSS_ENABLE_UDP_IPV4 SRC IPv4, DST IPv4, UDP SRC PORT, UDP DST PORT + * RSS_ENABLE_UDP_IPV6 SRC IPv6, DST IPv6, UDP SRC PORT, UDP DST PORT + * + * When multiple RSS types are enabled, HW picks the best hash policy + * based on the type of the received packet. + */ +#define RSS_ENABLE_NONE 0x0 +#define RSS_ENABLE_IPV4 0x1 +#define RSS_ENABLE_TCP_IPV4 0x2 +#define RSS_ENABLE_IPV6 0x4 +#define RSS_ENABLE_TCP_IPV6 0x8 +#define RSS_ENABLE_UDP_IPV4 0x10 +#define RSS_ENABLE_UDP_IPV6 0x20 + +#define L3_RSS_FLAGS (RXH_IP_DST | RXH_IP_SRC) +#define L4_RSS_FLAGS (RXH_L4_B_0_1 | RXH_L4_B_2_3) + +struct be_cmd_req_rss_config { + struct be_cmd_req_hdr hdr; + u32 if_id; + u16 enable_rss; + u16 cpu_table_size_log2; + u32 hash[10]; + u8 cpu_table[128]; + u8 flush; + u8 rsvd0[3]; +}; + +/******************** Port Beacon ***************************/ + +#define BEACON_STATE_ENABLED 0x1 +#define BEACON_STATE_DISABLED 0x0 + +struct be_cmd_req_enable_disable_beacon { + struct be_cmd_req_hdr hdr; + u8 port_num; + u8 beacon_state; + u8 beacon_duration; + u8 status_duration; +} __packed; + +struct be_cmd_req_get_beacon_state { + struct be_cmd_req_hdr hdr; + u8 port_num; + u8 rsvd0; + u16 rsvd1; +} __packed; + +struct be_cmd_resp_get_beacon_state { + struct be_cmd_resp_hdr resp_hdr; + u8 beacon_state; + u8 rsvd0[3]; +} __packed; + +/****************** Firmware Flash ******************/ +struct flashrom_params { + u32 op_code; + u32 op_type; + u32 data_buf_size; + u32 offset; +}; + +struct be_cmd_write_flashrom { + struct be_cmd_req_hdr hdr; + struct flashrom_params params; + u8 data_buf[32768]; + u8 rsvd[4]; +} __packed; + +/* cmd to read flash crc */ +struct be_cmd_read_flash_crc { + struct be_cmd_req_hdr hdr; + struct flashrom_params params; + u8 crc[4]; + u8 rsvd[4]; +} __packed; + +/**************** Lancer Firmware Flash ************/ +struct amap_lancer_write_obj_context { + u8 write_length[24]; + u8 reserved1[7]; + u8 eof; +} __packed; + +struct lancer_cmd_req_write_object { + struct be_cmd_req_hdr hdr; + u8 context[sizeof(struct amap_lancer_write_obj_context) / 8]; + u32 write_offset; + u8 object_name[104]; + u32 descriptor_count; + u32 buf_len; + u32 addr_low; + u32 addr_high; +}; + +#define LANCER_NO_RESET_NEEDED 0x00 +#define LANCER_FW_RESET_NEEDED 0x02 +struct lancer_cmd_resp_write_object { + u8 opcode; + u8 subsystem; + u8 rsvd1[2]; + u8 status; + u8 additional_status; + u8 rsvd2[2]; + u32 resp_len; + u32 actual_resp_len; + u32 actual_write_len; + u8 change_status; + u8 rsvd3[3]; +}; + +/************************ Lancer Read FW info **************/ +#define LANCER_READ_FILE_CHUNK (32*1024) +#define LANCER_READ_FILE_EOF_MASK 0x80000000 + +#define LANCER_FW_DUMP_FILE "/dbg/dump.bin" +#define LANCER_VPD_PF_FILE "/vpd/ntr_pf.vpd" +#define LANCER_VPD_VF_FILE "/vpd/ntr_vf.vpd" + +struct lancer_cmd_req_read_object { + struct be_cmd_req_hdr hdr; + u32 desired_read_len; + u32 read_offset; + u8 object_name[104]; + u32 descriptor_count; + u32 buf_len; + u32 addr_low; + u32 addr_high; +}; + +struct lancer_cmd_resp_read_object { + u8 opcode; + u8 subsystem; + u8 rsvd1[2]; + u8 status; + u8 additional_status; + u8 rsvd2[2]; + u32 resp_len; + u32 actual_resp_len; + u32 actual_read_len; + u32 eof; +}; + +struct lancer_cmd_req_delete_object { + struct be_cmd_req_hdr hdr; + u32 rsvd1; + u32 rsvd2; + u8 object_name[104]; +}; + +/************************ WOL *******************************/ +struct be_cmd_req_acpi_wol_magic_config{ + struct be_cmd_req_hdr hdr; + u32 rsvd0[145]; + u8 magic_mac[6]; + u8 rsvd2[2]; +} __packed; + +struct be_cmd_req_acpi_wol_magic_config_v1 { + struct be_cmd_req_hdr hdr; + u8 rsvd0[2]; + u8 query_options; + u8 rsvd1[5]; + u32 rsvd2[288]; + u8 magic_mac[6]; + u8 rsvd3[22]; +} __packed; + +struct be_cmd_resp_acpi_wol_magic_config_v1 { + struct be_cmd_resp_hdr hdr; + u8 rsvd0[2]; + u8 wol_settings; + u8 rsvd1[5]; + u32 rsvd2[295]; +} __packed; + +#define BE_GET_WOL_CAP 2 + +#define BE_WOL_CAP 0x1 +#define BE_PME_D0_CAP 0x8 +#define BE_PME_D1_CAP 0x10 +#define BE_PME_D2_CAP 0x20 +#define BE_PME_D3HOT_CAP 0x40 +#define BE_PME_D3COLD_CAP 0x80 + +/********************** LoopBack test *********************/ +struct be_cmd_req_loopback_test { + struct be_cmd_req_hdr hdr; + u32 loopback_type; + u32 num_pkts; + u64 pattern; + u32 src_port; + u32 dest_port; + u32 pkt_size; +}; + +struct be_cmd_resp_loopback_test { + struct be_cmd_resp_hdr resp_hdr; + u32 status; + u32 num_txfer; + u32 num_rx; + u32 miscomp_off; + u32 ticks_compl; +}; + +struct be_cmd_req_set_lmode { + struct be_cmd_req_hdr hdr; + u8 src_port; + u8 dest_port; + u8 loopback_type; + u8 loopback_state; +}; + +/********************** DDR DMA test *********************/ +struct be_cmd_req_ddrdma_test { + struct be_cmd_req_hdr hdr; + u64 pattern; + u32 byte_count; + u32 rsvd0; + u8 snd_buff[4096]; + u8 rsvd1[4096]; +}; + +struct be_cmd_resp_ddrdma_test { + struct be_cmd_resp_hdr hdr; + u64 pattern; + u32 byte_cnt; + u32 snd_err; + u8 rsvd0[4096]; + u8 rcv_buff[4096]; +}; + +/*********************** SEEPROM Read ***********************/ + +#define BE_READ_SEEPROM_LEN 1024 +struct be_cmd_req_seeprom_read { + struct be_cmd_req_hdr hdr; + u8 rsvd0[BE_READ_SEEPROM_LEN]; +}; + +struct be_cmd_resp_seeprom_read { + struct be_cmd_req_hdr hdr; + u8 seeprom_data[BE_READ_SEEPROM_LEN]; +}; + +enum { + PHY_TYPE_CX4_10GB = 0, + PHY_TYPE_XFP_10GB, + PHY_TYPE_SFP_1GB, + PHY_TYPE_SFP_PLUS_10GB, + PHY_TYPE_KR_10GB, + PHY_TYPE_KX4_10GB, + PHY_TYPE_BASET_10GB, + PHY_TYPE_BASET_1GB, + PHY_TYPE_BASEX_1GB, + PHY_TYPE_SGMII, + PHY_TYPE_QSFP, + PHY_TYPE_KR4_40GB, + PHY_TYPE_KR2_20GB, + PHY_TYPE_DISABLED = 255 +}; + +#define BE_SUPPORTED_SPEED_NONE 0 +#define BE_SUPPORTED_SPEED_10MBPS 1 +#define BE_SUPPORTED_SPEED_100MBPS 2 +#define BE_SUPPORTED_SPEED_1GBPS 4 +#define BE_SUPPORTED_SPEED_10GBPS 8 +#define BE_SUPPORTED_SPEED_20GBPS 0x10 +#define BE_SUPPORTED_SPEED_40GBPS 0x20 + +#define BE_AN_EN 0x2 +#define BE_PAUSE_SYM_EN 0x80 + +/* MAC speed valid values */ +#define SPEED_DEFAULT 0x0 +#define SPEED_FORCED_10GB 0x1 +#define SPEED_FORCED_1GB 0x2 +#define SPEED_AUTONEG_10GB 0x3 +#define SPEED_AUTONEG_1GB 0x4 +#define SPEED_AUTONEG_100MB 0x5 +#define SPEED_AUTONEG_10GB_1GB 0x6 +#define SPEED_AUTONEG_10GB_1GB_100MB 0x7 +#define SPEED_AUTONEG_1GB_100MB 0x8 +#define SPEED_AUTONEG_10MB 0x9 +#define SPEED_AUTONEG_1GB_100MB_10MB 0xa +#define SPEED_AUTONEG_100MB_10MB 0xb +#define SPEED_FORCED_100MB 0xc +#define SPEED_FORCED_10MB 0xd + +struct be_cmd_req_get_phy_info { + struct be_cmd_req_hdr hdr; + u8 rsvd0[24]; +}; + +struct be_phy_info { + u16 phy_type; + u16 interface_type; + u32 misc_params; + u16 ext_phy_details; + u16 rsvd; + u16 auto_speeds_supported; + u16 fixed_speeds_supported; + u32 future_use[2]; +}; + +struct be_cmd_resp_get_phy_info { + struct be_cmd_req_hdr hdr; + struct be_phy_info phy_info; +}; + +/*********************** Set QOS ***********************/ + +#define BE_QOS_BITS_NIC 1 + +struct be_cmd_req_set_qos { + struct be_cmd_req_hdr hdr; + u32 valid_bits; + u32 max_bps_nic; + u32 rsvd[7]; +}; + +/*********************** Controller Attributes ***********************/ +struct be_cmd_req_cntl_attribs { + struct be_cmd_req_hdr hdr; +}; + +struct be_cmd_resp_cntl_attribs { + struct be_cmd_resp_hdr hdr; + struct mgmt_controller_attrib attribs; +}; + +/*********************** Set driver function ***********************/ +#define CAPABILITY_SW_TIMESTAMPS 2 +#define CAPABILITY_BE3_NATIVE_ERX_API 4 + +struct be_cmd_req_set_func_cap { + struct be_cmd_req_hdr hdr; + u32 valid_cap_flags; + u32 cap_flags; + u8 rsvd[212]; +}; + +struct be_cmd_resp_set_func_cap { + struct be_cmd_resp_hdr hdr; + u32 valid_cap_flags; + u32 cap_flags; + u8 rsvd[212]; +}; + +/*********************** Function Privileges ***********************/ +enum { + BE_PRIV_DEFAULT = 0x1, + BE_PRIV_LNKQUERY = 0x2, + BE_PRIV_LNKSTATS = 0x4, + BE_PRIV_LNKMGMT = 0x8, + BE_PRIV_LNKDIAG = 0x10, + BE_PRIV_UTILQUERY = 0x20, + BE_PRIV_FILTMGMT = 0x40, + BE_PRIV_IFACEMGMT = 0x80, + BE_PRIV_VHADM = 0x100, + BE_PRIV_DEVCFG = 0x200, + BE_PRIV_DEVSEC = 0x400 +}; +#define MAX_PRIVILEGES (BE_PRIV_VHADM | BE_PRIV_DEVCFG | \ + BE_PRIV_DEVSEC) +#define MIN_PRIVILEGES BE_PRIV_DEFAULT + +struct be_cmd_priv_map { + u8 opcode; + u8 subsystem; + u32 priv_mask; +}; + +struct be_cmd_req_get_fn_privileges { + struct be_cmd_req_hdr hdr; + u32 rsvd; +}; + +struct be_cmd_resp_get_fn_privileges { + struct be_cmd_resp_hdr hdr; + u32 privilege_mask; +}; + +struct be_cmd_req_set_fn_privileges { + struct be_cmd_req_hdr hdr; + u32 privileges; /* Used by BE3, SH-R */ + u32 privileges_lancer; /* Used by Lancer */ +}; + +/******************** GET/SET_MACLIST **************************/ +#define BE_MAX_MAC 64 +struct be_cmd_req_get_mac_list { + struct be_cmd_req_hdr hdr; + u8 mac_type; + u8 perm_override; + u16 iface_id; + u32 mac_id; + u32 rsvd[3]; +} __packed; + +struct get_list_macaddr { + u16 mac_addr_size; + union { + u8 macaddr[6]; + struct { + u8 rsvd[2]; + u32 mac_id; + } __packed s_mac_id; + } __packed mac_addr_id; +} __packed; + +struct be_cmd_resp_get_mac_list { + struct be_cmd_resp_hdr hdr; + struct get_list_macaddr fd_macaddr; /* Factory default mac */ + struct get_list_macaddr macid_macaddr; /* soft mac */ + u8 true_mac_count; + u8 pseudo_mac_count; + u8 mac_list_size; + u8 rsvd; + /* perm override mac */ + struct get_list_macaddr macaddr_list[BE_MAX_MAC]; +} __packed; + +struct be_cmd_req_set_mac_list { + struct be_cmd_req_hdr hdr; + u8 mac_count; + u8 rsvd1; + u16 rsvd2; + struct macaddr mac[BE_MAX_MAC]; +} __packed; + +/*********************** HSW Config ***********************/ +#define PORT_FWD_TYPE_VEPA 0x3 +#define PORT_FWD_TYPE_VEB 0x2 + +struct amap_set_hsw_context { + u8 interface_id[16]; + u8 rsvd0[14]; + u8 pvid_valid; + u8 pport; + u8 rsvd1[6]; + u8 port_fwd_type[3]; + u8 rsvd2[7]; + u8 pvid[16]; + u8 rsvd3[32]; + u8 rsvd4[32]; + u8 rsvd5[32]; +} __packed; + +struct be_cmd_req_set_hsw_config { + struct be_cmd_req_hdr hdr; + u8 context[sizeof(struct amap_set_hsw_context) / 8]; +} __packed; + +struct amap_get_hsw_req_context { + u8 interface_id[16]; + u8 rsvd0[14]; + u8 pvid_valid; + u8 pport; +} __packed; + +struct amap_get_hsw_resp_context { + u8 rsvd0[6]; + u8 port_fwd_type[3]; + u8 rsvd1[7]; + u8 pvid[16]; + u8 rsvd2[32]; + u8 rsvd3[32]; + u8 rsvd4[32]; +} __packed; + +struct be_cmd_req_get_hsw_config { + struct be_cmd_req_hdr hdr; + u8 context[sizeof(struct amap_get_hsw_req_context) / 8]; +} __packed; + +struct be_cmd_resp_get_hsw_config { + struct be_cmd_resp_hdr hdr; + u8 context[sizeof(struct amap_get_hsw_resp_context) / 8]; + u32 rsvd; +}; + +/******************* get port names ***************/ +struct be_cmd_req_get_port_name { + struct be_cmd_req_hdr hdr; + u32 rsvd0; +}; + +struct be_cmd_resp_get_port_name { + struct be_cmd_req_hdr hdr; + u8 port_name[4]; +}; + +/*************** HW Stats Get v1 **********************************/ +#define BE_TXP_SW_SZ 48 +struct be_port_rxf_stats_v1 { + u32 rsvd0[12]; + u32 rx_crc_errors; + u32 rx_alignment_symbol_errors; + u32 rx_pause_frames; + u32 rx_priority_pause_frames; + u32 rx_control_frames; + u32 rx_in_range_errors; + u32 rx_out_range_errors; + u32 rx_frame_too_long; + u32 rx_address_filtered; + u32 rx_dropped_too_small; + u32 rx_dropped_too_short; + u32 rx_dropped_header_too_small; + u32 rx_dropped_tcp_length; + u32 rx_dropped_runt; + u32 rsvd1[10]; + u32 rx_ip_checksum_errs; + u32 rx_tcp_checksum_errs; + u32 rx_udp_checksum_errs; + u32 rsvd2[7]; + u32 rx_switched_unicast_packets; + u32 rx_switched_multicast_packets; + u32 rx_switched_broadcast_packets; + u32 rsvd3[3]; + u32 tx_pauseframes; + u32 tx_priority_pauseframes; + u32 tx_controlframes; + u32 rsvd4[10]; + u32 rxpp_fifo_overflow_drop; + u32 rx_input_fifo_overflow_drop; + u32 pmem_fifo_overflow_drop; + u32 jabber_events; + u32 rsvd5[3]; +}; + + +struct be_rxf_stats_v1 { + struct be_port_rxf_stats_v1 port[4]; + u32 rsvd0[2]; + u32 rx_drops_no_pbuf; + u32 rx_drops_no_txpb; + u32 rx_drops_no_erx_descr; + u32 rx_drops_no_tpre_descr; + u32 rsvd1[6]; + u32 rx_drops_too_many_frags; + u32 rx_drops_invalid_ring; + u32 forwarded_packets; + u32 rx_drops_mtu; + u32 rsvd2[14]; +}; + +struct be_erx_stats_v1 { + u32 rx_drops_no_fragments[68]; /* dwordS 0 to 67*/ + u32 rsvd[4]; +}; + +struct be_port_rxf_stats_v2 { + u32 rsvd0[10]; + u32 roce_bytes_received_lsd; + u32 roce_bytes_received_msd; + u32 rsvd1[5]; + u32 roce_frames_received; + u32 rx_crc_errors; + u32 rx_alignment_symbol_errors; + u32 rx_pause_frames; + u32 rx_priority_pause_frames; + u32 rx_control_frames; + u32 rx_in_range_errors; + u32 rx_out_range_errors; + u32 rx_frame_too_long; + u32 rx_address_filtered; + u32 rx_dropped_too_small; + u32 rx_dropped_too_short; + u32 rx_dropped_header_too_small; + u32 rx_dropped_tcp_length; + u32 rx_dropped_runt; + u32 rsvd2[10]; + u32 rx_ip_checksum_errs; + u32 rx_tcp_checksum_errs; + u32 rx_udp_checksum_errs; + u32 rsvd3[7]; + u32 rx_switched_unicast_packets; + u32 rx_switched_multicast_packets; + u32 rx_switched_broadcast_packets; + u32 rsvd4[3]; + u32 tx_pauseframes; + u32 tx_priority_pauseframes; + u32 tx_controlframes; + u32 rsvd5[10]; + u32 rxpp_fifo_overflow_drop; + u32 rx_input_fifo_overflow_drop; + u32 pmem_fifo_overflow_drop; + u32 jabber_events; + u32 rsvd6[3]; + u32 rx_drops_payload_size; + u32 rx_drops_clipped_header; + u32 rx_drops_crc; + u32 roce_drops_payload_len; + u32 roce_drops_crc; + u32 rsvd7[19]; +}; + +struct be_rxf_stats_v2 { + struct be_port_rxf_stats_v2 port[4]; + u32 rsvd0[2]; + u32 rx_drops_no_pbuf; + u32 rx_drops_no_txpb; + u32 rx_drops_no_erx_descr; + u32 rx_drops_no_tpre_descr; + u32 rsvd1[6]; + u32 rx_drops_too_many_frags; + u32 rx_drops_invalid_ring; + u32 forwarded_packets; + u32 rx_drops_mtu; + u32 rsvd2[35]; +}; + +struct be_hw_stats_v1 { + struct be_rxf_stats_v1 rxf; + u32 rsvd0[BE_TXP_SW_SZ]; + struct be_erx_stats_v1 erx; + struct be_pmem_stats pmem; + u32 rsvd1[18]; +}; + +struct be_cmd_req_get_stats_v1 { + struct be_cmd_req_hdr hdr; + u8 rsvd[sizeof(struct be_hw_stats_v1)]; +}; + +struct be_cmd_resp_get_stats_v1 { + struct be_cmd_resp_hdr hdr; + struct be_hw_stats_v1 hw_stats; +}; + +struct be_erx_stats_v2 { + u32 rx_drops_no_fragments[136]; /* dwordS 0 to 135*/ + u32 rsvd[3]; +}; + +struct be_hw_stats_v2 { + struct be_rxf_stats_v2 rxf; + u32 rsvd0[BE_TXP_SW_SZ]; + struct be_erx_stats_v2 erx; + struct be_pmem_stats pmem; + u32 rsvd1[18]; +}; + +struct be_cmd_req_get_stats_v2 { + struct be_cmd_req_hdr hdr; + u8 rsvd[sizeof(struct be_hw_stats_v2)]; +}; + +struct be_cmd_resp_get_stats_v2 { + struct be_cmd_resp_hdr hdr; + struct be_hw_stats_v2 hw_stats; +}; + +/************** get fat capabilites *******************/ +#define MAX_MODULES 27 +#define MAX_MODES 4 +#define MODE_UART 0 +#define FW_LOG_LEVEL_DEFAULT 48 +#define FW_LOG_LEVEL_FATAL 64 + +struct ext_fat_mode { + u8 mode; + u8 rsvd0; + u16 port_mask; + u32 dbg_lvl; + u64 fun_mask; +} __packed; + +struct ext_fat_modules { + u8 modules_str[32]; + u32 modules_id; + u32 num_modes; + struct ext_fat_mode trace_lvl[MAX_MODES]; +} __packed; + +struct be_fat_conf_params { + u32 max_log_entries; + u32 log_entry_size; + u8 log_type; + u8 max_log_funs; + u8 max_log_ports; + u8 rsvd0; + u32 supp_modes; + u32 num_modules; + struct ext_fat_modules module[MAX_MODULES]; +} __packed; + +struct be_cmd_req_get_ext_fat_caps { + struct be_cmd_req_hdr hdr; + u32 parameter_type; +}; + +struct be_cmd_resp_get_ext_fat_caps { + struct be_cmd_resp_hdr hdr; + struct be_fat_conf_params get_params; +}; + +struct be_cmd_req_set_ext_fat_caps { + struct be_cmd_req_hdr hdr; + struct be_fat_conf_params set_params; +}; + +#define RESOURCE_DESC_SIZE_V0 72 +#define RESOURCE_DESC_SIZE_V1 88 +#define PCIE_RESOURCE_DESC_TYPE_V0 0x40 +#define NIC_RESOURCE_DESC_TYPE_V0 0x41 +#define PCIE_RESOURCE_DESC_TYPE_V1 0x50 +#define NIC_RESOURCE_DESC_TYPE_V1 0x51 +#define PORT_RESOURCE_DESC_TYPE_V1 0x55 +#define MAX_RESOURCE_DESC 264 + +#define VFT_SHIFT 3 /* VF template */ +#define IMM_SHIFT 6 /* Immediate */ +#define NOSV_SHIFT 7 /* No save */ + +struct be_res_desc_hdr { + u8 desc_type; + u8 desc_len; +} __packed; + +struct be_port_res_desc { + struct be_res_desc_hdr hdr; + u8 rsvd0; + u8 flags; + u8 link_num; + u8 mc_type; + u16 rsvd1; + +#define NV_TYPE_MASK 0x3 /* bits 0-1 */ +#define NV_TYPE_DISABLED 1 +#define NV_TYPE_VXLAN 3 +#define SOCVID_SHIFT 2 /* Strip outer vlan */ +#define RCVID_SHIFT 4 /* Report vlan */ + u8 nv_flags; + u8 rsvd2; + __le16 nv_port; /* vxlan/gre port */ + u32 rsvd3[19]; +} __packed; + +struct be_pcie_res_desc { + struct be_res_desc_hdr hdr; + u8 rsvd0; + u8 flags; + u16 rsvd1; + u8 pf_num; + u8 rsvd2; + u32 rsvd3; + u8 sriov_state; + u8 pf_state; + u8 pf_type; + u8 rsvd4; + u16 num_vfs; + u16 rsvd5; + u32 rsvd6[17]; +} __packed; + +struct be_nic_res_desc { + struct be_res_desc_hdr hdr; + u8 rsvd1; + +#define QUN_SHIFT 4 /* QoS is in absolute units */ + u8 flags; + u8 vf_num; + u8 rsvd2; + u8 pf_num; + u8 rsvd3; + u16 unicast_mac_count; + u8 rsvd4[6]; + u16 mcc_count; + u16 vlan_count; + u16 mcast_mac_count; + u16 txq_count; + u16 rq_count; + u16 rssq_count; + u16 lro_count; + u16 cq_count; + u16 toe_conn_count; + u16 eq_count; + u16 vlan_id; + u16 iface_count; + u32 cap_flags; + u8 link_param; + u8 rsvd6; + u16 channel_id_param; + u32 bw_min; + u32 bw_max; + u8 acpi_params; + u8 wol_param; + u16 rsvd7; + u16 tunnel_iface_count; + u16 direct_tenant_iface_count; + u32 rsvd8[6]; +} __packed; + +/************ Multi-Channel type ***********/ +enum mc_type { + MC_NONE = 0x01, + UMC = 0x02, + FLEX10 = 0x03, + vNIC1 = 0x04, + nPAR = 0x05, + UFP = 0x06, + vNIC2 = 0x07 +}; + +/* Is BE in a multi-channel mode */ +static inline bool be_is_mc(struct be_adapter *adapter) +{ + return adapter->mc_type > MC_NONE; +} + +struct be_cmd_req_get_func_config { + struct be_cmd_req_hdr hdr; +}; + +struct be_cmd_resp_get_func_config { + struct be_cmd_resp_hdr hdr; + u32 desc_count; + u8 func_param[MAX_RESOURCE_DESC * RESOURCE_DESC_SIZE_V1]; +}; + +#define ACTIVE_PROFILE_TYPE 0x2 +struct be_cmd_req_get_profile_config { + struct be_cmd_req_hdr hdr; + u8 rsvd; + u8 type; + u16 rsvd1; +}; + +struct be_cmd_resp_get_profile_config { + struct be_cmd_resp_hdr hdr; + u32 desc_count; + u8 func_param[MAX_RESOURCE_DESC * RESOURCE_DESC_SIZE_V1]; +}; + +struct be_cmd_req_set_profile_config { + struct be_cmd_req_hdr hdr; + u32 rsvd; + u32 desc_count; + u8 desc[2 * RESOURCE_DESC_SIZE_V1]; +} __packed; + +struct be_cmd_req_get_active_profile { + struct be_cmd_req_hdr hdr; + u32 rsvd; +} __packed; + +struct be_cmd_resp_get_active_profile { + struct be_cmd_resp_hdr hdr; + u16 active_profile_id; + u16 next_profile_id; +} __packed; + +struct be_cmd_enable_disable_vf { + struct be_cmd_req_hdr hdr; + u8 enable; + u8 rsvd[3]; +}; + +struct be_cmd_req_intr_set { + struct be_cmd_req_hdr hdr; + u8 intr_enabled; + u8 rsvd[3]; +}; + +static inline bool check_privilege(struct be_adapter *adapter, u32 flags) +{ + return flags & adapter->cmd_privileges ? true : false; +} + +/************** Get IFACE LIST *******************/ +struct be_if_desc { + u32 if_id; + u32 cap_flags; + u32 en_flags; +}; + +struct be_cmd_req_get_iface_list { + struct be_cmd_req_hdr hdr; +}; + +struct be_cmd_resp_get_iface_list { + struct be_cmd_req_hdr hdr; + u32 if_cnt; + struct be_if_desc if_desc; +}; + +/*************** Set logical link ********************/ +#define PLINK_TRACK_SHIFT 8 +struct be_cmd_req_set_ll_link { + struct be_cmd_req_hdr hdr; + u32 link_config; /* Bit 0: UP_DOWN, Bit 9: PLINK */ +}; + +/************** Manage IFACE Filters *******************/ +#define OP_CONVERT_NORMAL_TO_TUNNEL 0 +#define OP_CONVERT_TUNNEL_TO_NORMAL 1 + +struct be_cmd_req_manage_iface_filters { + struct be_cmd_req_hdr hdr; + u8 op; + u8 rsvd0; + u8 flags; + u8 rsvd1; + u32 tunnel_iface_id; + u32 target_iface_id; + u8 mac[6]; + u16 vlan_tag; + u32 tenant_id; + u32 filter_id; + u32 cap_flags; + u32 cap_control_flags; +} __packed; + +int be_pci_fnum_get(struct be_adapter *adapter); +int be_fw_wait_ready(struct be_adapter *adapter); +int be_cmd_mac_addr_query(struct be_adapter *adapter, u8 *mac_addr, + bool permanent, u32 if_handle, u32 pmac_id); +int be_cmd_pmac_add(struct be_adapter *adapter, u8 *mac_addr, u32 if_id, + u32 *pmac_id, u32 domain); +int be_cmd_pmac_del(struct be_adapter *adapter, u32 if_id, int pmac_id, + u32 domain); +int be_cmd_if_create(struct be_adapter *adapter, u32 cap_flags, u32 en_flags, + u32 *if_handle, u32 domain); +int be_cmd_if_destroy(struct be_adapter *adapter, int if_handle, u32 domain); +int be_cmd_eq_create(struct be_adapter *adapter, struct be_eq_obj *eqo); +int be_cmd_cq_create(struct be_adapter *adapter, struct be_queue_info *cq, + struct be_queue_info *eq, bool no_delay, + int num_cqe_dma_coalesce); +int be_cmd_mccq_create(struct be_adapter *adapter, struct be_queue_info *mccq, + struct be_queue_info *cq); +int be_cmd_txq_create(struct be_adapter *adapter, struct be_tx_obj *txo); +int be_cmd_rxq_create(struct be_adapter *adapter, struct be_queue_info *rxq, + u16 cq_id, u16 frag_size, u32 if_id, u32 rss, u8 *rss_id); +int be_cmd_q_destroy(struct be_adapter *adapter, struct be_queue_info *q, + int type); +int be_cmd_rxq_destroy(struct be_adapter *adapter, struct be_queue_info *q); +int be_cmd_link_status_query(struct be_adapter *adapter, u16 *link_speed, + u8 *link_status, u32 dom); +int be_cmd_reset(struct be_adapter *adapter); +int be_cmd_get_stats(struct be_adapter *adapter, struct be_dma_mem *nonemb_cmd); +int lancer_cmd_get_pport_stats(struct be_adapter *adapter, + struct be_dma_mem *nonemb_cmd); +int be_cmd_get_fw_ver(struct be_adapter *adapter); +int be_cmd_modify_eqd(struct be_adapter *adapter, struct be_set_eqd *, int num); +int be_cmd_vlan_config(struct be_adapter *adapter, u32 if_id, u16 *vtag_array, + u32 num); +int be_cmd_rx_filter(struct be_adapter *adapter, u32 flags, u32 status); +int be_cmd_set_flow_control(struct be_adapter *adapter, u32 tx_fc, u32 rx_fc); +int be_cmd_get_flow_control(struct be_adapter *adapter, u32 *tx_fc, u32 *rx_fc); +int be_cmd_query_fw_cfg(struct be_adapter *adapter); +int be_cmd_reset_function(struct be_adapter *adapter); +int be_cmd_rss_config(struct be_adapter *adapter, u8 *rsstable, + u32 rss_hash_opts, u16 table_size, const u8 *rss_hkey); +int be_process_mcc(struct be_adapter *adapter); +int be_cmd_set_beacon_state(struct be_adapter *adapter, u8 port_num, u8 beacon, + u8 status, u8 state); +int be_cmd_get_beacon_state(struct be_adapter *adapter, u8 port_num, + u32 *state); +int be_cmd_read_port_transceiver_data(struct be_adapter *adapter, + u8 page_num, u8 *data); +int be_cmd_query_cable_type(struct be_adapter *adapter); +int be_cmd_write_flashrom(struct be_adapter *adapter, struct be_dma_mem *cmd, + u32 flash_oper, u32 flash_opcode, u32 buf_size); +int lancer_cmd_write_object(struct be_adapter *adapter, struct be_dma_mem *cmd, + u32 data_size, u32 data_offset, + const char *obj_name, u32 *data_written, + u8 *change_status, u8 *addn_status); +int lancer_cmd_read_object(struct be_adapter *adapter, struct be_dma_mem *cmd, + u32 data_size, u32 data_offset, const char *obj_name, + u32 *data_read, u32 *eof, u8 *addn_status); +int lancer_cmd_delete_object(struct be_adapter *adapter, const char *obj_name); +int be_cmd_get_flash_crc(struct be_adapter *adapter, u8 *flashed_crc, + u16 optype, int offset); +int be_cmd_enable_magic_wol(struct be_adapter *adapter, u8 *mac, + struct be_dma_mem *nonemb_cmd); +int be_cmd_fw_init(struct be_adapter *adapter); +int be_cmd_fw_clean(struct be_adapter *adapter); +void be_async_mcc_enable(struct be_adapter *adapter); +void be_async_mcc_disable(struct be_adapter *adapter); +int be_cmd_loopback_test(struct be_adapter *adapter, u32 port_num, + u32 loopback_type, u32 pkt_size, u32 num_pkts, + u64 pattern); +int be_cmd_ddr_dma_test(struct be_adapter *adapter, u64 pattern, u32 byte_cnt, + struct be_dma_mem *cmd); +int be_cmd_get_seeprom_data(struct be_adapter *adapter, + struct be_dma_mem *nonemb_cmd); +int be_cmd_set_loopback(struct be_adapter *adapter, u8 port_num, + u8 loopback_type, u8 enable); +int be_cmd_get_phy_info(struct be_adapter *adapter); +int be_cmd_config_qos(struct be_adapter *adapter, u32 max_rate, + u16 link_speed, u8 domain); +void be_detect_error(struct be_adapter *adapter); +int be_cmd_get_die_temperature(struct be_adapter *adapter); +int be_cmd_get_cntl_attributes(struct be_adapter *adapter); +int be_cmd_req_native_mode(struct be_adapter *adapter); +int be_cmd_get_reg_len(struct be_adapter *adapter, u32 *log_size); +int be_cmd_get_regs(struct be_adapter *adapter, u32 buf_len, void *buf); +int be_cmd_get_fn_privileges(struct be_adapter *adapter, u32 *privilege, + u32 domain); +int be_cmd_set_fn_privileges(struct be_adapter *adapter, u32 privileges, + u32 vf_num); +int be_cmd_get_mac_from_list(struct be_adapter *adapter, u8 *mac, + bool *pmac_id_active, u32 *pmac_id, + u32 if_handle, u8 domain); +int be_cmd_get_active_mac(struct be_adapter *adapter, u32 pmac_id, u8 *mac, + u32 if_handle, bool active, u32 domain); +int be_cmd_get_perm_mac(struct be_adapter *adapter, u8 *mac); +int be_cmd_set_mac_list(struct be_adapter *adapter, u8 *mac_array, u8 mac_count, + u32 domain); +int be_cmd_set_mac(struct be_adapter *adapter, u8 *mac, int if_id, u32 dom); +int be_cmd_set_hsw_config(struct be_adapter *adapter, u16 pvid, u32 domain, + u16 intf_id, u16 hsw_mode); +int be_cmd_get_hsw_config(struct be_adapter *adapter, u16 *pvid, u32 domain, + u16 intf_id, u8 *mode); +int be_cmd_get_acpi_wol_cap(struct be_adapter *adapter); +int be_cmd_set_fw_log_level(struct be_adapter *adapter, u32 level); +int be_cmd_get_fw_log_level(struct be_adapter *adapter); +int be_cmd_get_ext_fat_capabilites(struct be_adapter *adapter, + struct be_dma_mem *cmd); +int be_cmd_set_ext_fat_capabilites(struct be_adapter *adapter, + struct be_dma_mem *cmd, + struct be_fat_conf_params *cfgs); +int lancer_physdev_ctrl(struct be_adapter *adapter, u32 mask); +int lancer_initiate_dump(struct be_adapter *adapter); +int lancer_delete_dump(struct be_adapter *adapter); +bool dump_present(struct be_adapter *adapter); +int lancer_test_and_set_rdy_state(struct be_adapter *adapter); +int be_cmd_query_port_name(struct be_adapter *adapter, u8 *port_name); +int be_cmd_get_func_config(struct be_adapter *adapter, + struct be_resources *res); +int be_cmd_get_profile_config(struct be_adapter *adapter, + struct be_resources *res, u8 domain); +int be_cmd_get_active_profile(struct be_adapter *adapter, u16 *profile); +int be_cmd_get_if_id(struct be_adapter *adapter, struct be_vf_cfg *vf_cfg, + int vf_num); +int be_cmd_enable_vf(struct be_adapter *adapter, u8 domain); +int be_cmd_intr_set(struct be_adapter *adapter, bool intr_enable); +int be_cmd_set_logical_link_config(struct be_adapter *adapter, + int link_state, u8 domain); +int be_cmd_set_vxlan_port(struct be_adapter *adapter, __be16 port); +int be_cmd_manage_iface(struct be_adapter *adapter, u32 iface, u8 op); +int be_cmd_set_sriov_config(struct be_adapter *adapter, + struct be_resources res, u16 num_vfs); diff --git a/drivers/net/ethernet/emulex/benet/be_ethtool.c b/drivers/net/ethernet/emulex/benet/be_ethtool.c new file mode 100644 index 0000000..e42a791 --- /dev/null +++ b/drivers/net/ethernet/emulex/benet/be_ethtool.c @@ -0,0 +1,1314 @@ +/* + * Copyright (C) 2005 - 2014 Emulex + * All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. The full GNU General + * Public License is included in this distribution in the file called COPYING. + * + * Contact Information: + * linux-drivers@emulex.com + * + * Emulex + * 3333 Susan Street + * Costa Mesa, CA 92626 + */ + +#include "be.h" +#include "be_cmds.h" +#include + +struct be_ethtool_stat { + char desc[ETH_GSTRING_LEN]; + int type; + int size; + int offset; +}; + +enum {DRVSTAT_TX, DRVSTAT_RX, DRVSTAT}; +#define FIELDINFO(_struct, field) FIELD_SIZEOF(_struct, field), \ + offsetof(_struct, field) +#define DRVSTAT_TX_INFO(field) #field, DRVSTAT_TX,\ + FIELDINFO(struct be_tx_stats, field) +#define DRVSTAT_RX_INFO(field) #field, DRVSTAT_RX,\ + FIELDINFO(struct be_rx_stats, field) +#define DRVSTAT_INFO(field) #field, DRVSTAT,\ + FIELDINFO(struct be_drv_stats, field) + +static const struct be_ethtool_stat et_stats[] = { + {DRVSTAT_INFO(rx_crc_errors)}, + {DRVSTAT_INFO(rx_alignment_symbol_errors)}, + {DRVSTAT_INFO(rx_pause_frames)}, + {DRVSTAT_INFO(rx_control_frames)}, + /* Received packets dropped when the Ethernet length field + * is not equal to the actual Ethernet data length. + */ + {DRVSTAT_INFO(rx_in_range_errors)}, + /* Received packets dropped when their length field is >= 1501 bytes + * and <= 1535 bytes. + */ + {DRVSTAT_INFO(rx_out_range_errors)}, + /* Received packets dropped when they are longer than 9216 bytes */ + {DRVSTAT_INFO(rx_frame_too_long)}, + /* Received packets dropped when they don't pass the unicast or + * multicast address filtering. + */ + {DRVSTAT_INFO(rx_address_filtered)}, + /* Received packets dropped when IP packet length field is less than + * the IP header length field. + */ + {DRVSTAT_INFO(rx_dropped_too_small)}, + /* Received packets dropped when IP length field is greater than + * the actual packet length. + */ + {DRVSTAT_INFO(rx_dropped_too_short)}, + /* Received packets dropped when the IP header length field is less + * than 5. + */ + {DRVSTAT_INFO(rx_dropped_header_too_small)}, + /* Received packets dropped when the TCP header length field is less + * than 5 or the TCP header length + IP header length is more + * than IP packet length. + */ + {DRVSTAT_INFO(rx_dropped_tcp_length)}, + {DRVSTAT_INFO(rx_dropped_runt)}, + /* Number of received packets dropped when a fifo for descriptors going + * into the packet demux block overflows. In normal operation, this + * fifo must never overflow. + */ + {DRVSTAT_INFO(rxpp_fifo_overflow_drop)}, + /* Received packets dropped when the RX block runs out of space in + * one of its input FIFOs. This could happen due a long burst of + * minimum-sized (64b) frames in the receive path. + * This counter may also be erroneously incremented rarely. + */ + {DRVSTAT_INFO(rx_input_fifo_overflow_drop)}, + {DRVSTAT_INFO(rx_ip_checksum_errs)}, + {DRVSTAT_INFO(rx_tcp_checksum_errs)}, + {DRVSTAT_INFO(rx_udp_checksum_errs)}, + {DRVSTAT_INFO(tx_pauseframes)}, + {DRVSTAT_INFO(tx_controlframes)}, + {DRVSTAT_INFO(rx_priority_pause_frames)}, + {DRVSTAT_INFO(tx_priority_pauseframes)}, + /* Received packets dropped when an internal fifo going into + * main packet buffer tank (PMEM) overflows. + */ + {DRVSTAT_INFO(pmem_fifo_overflow_drop)}, + {DRVSTAT_INFO(jabber_events)}, + /* Received packets dropped due to lack of available HW packet buffers + * used to temporarily hold the received packets. + */ + {DRVSTAT_INFO(rx_drops_no_pbuf)}, + /* Received packets dropped due to input receive buffer + * descriptor fifo overflowing. + */ + {DRVSTAT_INFO(rx_drops_no_erx_descr)}, + /* Packets dropped because the internal FIFO to the offloaded TCP + * receive processing block is full. This could happen only for + * offloaded iSCSI or FCoE trarffic. + */ + {DRVSTAT_INFO(rx_drops_no_tpre_descr)}, + /* Received packets dropped when they need more than 8 + * receive buffers. This cannot happen as the driver configures + * 2048 byte receive buffers. + */ + {DRVSTAT_INFO(rx_drops_too_many_frags)}, + {DRVSTAT_INFO(forwarded_packets)}, + /* Received packets dropped when the frame length + * is more than 9018 bytes + */ + {DRVSTAT_INFO(rx_drops_mtu)}, + /* Number of dma mapping errors */ + {DRVSTAT_INFO(dma_map_errors)}, + /* Number of packets dropped due to random early drop function */ + {DRVSTAT_INFO(eth_red_drops)}, + {DRVSTAT_INFO(be_on_die_temperature)}, + {DRVSTAT_INFO(rx_roce_bytes_lsd)}, + {DRVSTAT_INFO(rx_roce_bytes_msd)}, + {DRVSTAT_INFO(rx_roce_frames)}, + {DRVSTAT_INFO(roce_drops_payload_len)}, + {DRVSTAT_INFO(roce_drops_crc)} +}; + +#define ETHTOOL_STATS_NUM ARRAY_SIZE(et_stats) + +/* Stats related to multi RX queues: get_stats routine assumes bytes, pkts + * are first and second members respectively. + */ +static const struct be_ethtool_stat et_rx_stats[] = { + {DRVSTAT_RX_INFO(rx_bytes)},/* If moving this member see above note */ + {DRVSTAT_RX_INFO(rx_pkts)}, /* If moving this member see above note */ + {DRVSTAT_RX_INFO(rx_compl)}, + {DRVSTAT_RX_INFO(rx_compl_err)}, + {DRVSTAT_RX_INFO(rx_mcast_pkts)}, + /* Number of page allocation failures while posting receive buffers + * to HW. + */ + {DRVSTAT_RX_INFO(rx_post_fail)}, + /* Recevied packets dropped due to skb allocation failure */ + {DRVSTAT_RX_INFO(rx_drops_no_skbs)}, + /* Received packets dropped due to lack of available fetched buffers + * posted by the driver. + */ + {DRVSTAT_RX_INFO(rx_drops_no_frags)} +}; + +#define ETHTOOL_RXSTATS_NUM (ARRAY_SIZE(et_rx_stats)) + +/* Stats related to multi TX queues: get_stats routine assumes compl is the + * first member + */ +static const struct be_ethtool_stat et_tx_stats[] = { + {DRVSTAT_TX_INFO(tx_compl)}, /* If moving this member see above note */ + /* This counter is incremented when the HW encounters an error while + * parsing the packet header of an outgoing TX request. This counter is + * applicable only for BE2, BE3 and Skyhawk based adapters. + */ + {DRVSTAT_TX_INFO(tx_hdr_parse_err)}, + /* This counter is incremented when an error occurs in the DMA + * operation associated with the TX request from the host to the device. + */ + {DRVSTAT_TX_INFO(tx_dma_err)}, + /* This counter is incremented when MAC or VLAN spoof checking is + * enabled on the interface and the TX request fails the spoof check + * in HW. + */ + {DRVSTAT_TX_INFO(tx_spoof_check_err)}, + /* This counter is incremented when the HW encounters an error while + * performing TSO offload. This counter is applicable only for Lancer + * adapters. + */ + {DRVSTAT_TX_INFO(tx_tso_err)}, + /* This counter is incremented when the HW detects Q-in-Q style VLAN + * tagging in a packet and such tagging is not expected on the outgoing + * interface. This counter is applicable only for Lancer adapters. + */ + {DRVSTAT_TX_INFO(tx_qinq_err)}, + /* This counter is incremented when the HW detects parity errors in the + * packet data. This counter is applicable only for Lancer adapters. + */ + {DRVSTAT_TX_INFO(tx_internal_parity_err)}, + {DRVSTAT_TX_INFO(tx_bytes)}, + {DRVSTAT_TX_INFO(tx_pkts)}, + /* Number of skbs queued for trasmission by the driver */ + {DRVSTAT_TX_INFO(tx_reqs)}, + /* Number of TX work request blocks DMAed to HW */ + {DRVSTAT_TX_INFO(tx_wrbs)}, + /* Number of times the TX queue was stopped due to lack + * of spaces in the TXQ. + */ + {DRVSTAT_TX_INFO(tx_stops)}, + /* Pkts dropped in the driver's transmit path */ + {DRVSTAT_TX_INFO(tx_drv_drops)} +}; + +#define ETHTOOL_TXSTATS_NUM (ARRAY_SIZE(et_tx_stats)) + +static const char et_self_tests[][ETH_GSTRING_LEN] = { + "MAC Loopback test", + "PHY Loopback test", + "External Loopback test", + "DDR DMA test", + "Link test" +}; + +#define ETHTOOL_TESTS_NUM ARRAY_SIZE(et_self_tests) +#define BE_MAC_LOOPBACK 0x0 +#define BE_PHY_LOOPBACK 0x1 +#define BE_ONE_PORT_EXT_LOOPBACK 0x2 +#define BE_NO_LOOPBACK 0xff + +static void be_get_drvinfo(struct net_device *netdev, + struct ethtool_drvinfo *drvinfo) +{ + struct be_adapter *adapter = netdev_priv(netdev); + + strlcpy(drvinfo->driver, DRV_NAME, sizeof(drvinfo->driver)); + strlcpy(drvinfo->version, DRV_VER, sizeof(drvinfo->version)); + if (!memcmp(adapter->fw_ver, adapter->fw_on_flash, FW_VER_LEN)) + strlcpy(drvinfo->fw_version, adapter->fw_ver, + sizeof(drvinfo->fw_version)); + else + snprintf(drvinfo->fw_version, sizeof(drvinfo->fw_version), + "%s [%s]", adapter->fw_ver, adapter->fw_on_flash); + + strlcpy(drvinfo->bus_info, pci_name(adapter->pdev), + sizeof(drvinfo->bus_info)); + drvinfo->testinfo_len = 0; + drvinfo->regdump_len = 0; + drvinfo->eedump_len = 0; +} + +static u32 lancer_cmd_get_file_len(struct be_adapter *adapter, u8 *file_name) +{ + u32 data_read = 0, eof; + u8 addn_status; + struct be_dma_mem data_len_cmd; + int status; + + memset(&data_len_cmd, 0, sizeof(data_len_cmd)); + /* data_offset and data_size should be 0 to get reg len */ + status = lancer_cmd_read_object(adapter, &data_len_cmd, 0, 0, + file_name, &data_read, &eof, + &addn_status); + + return data_read; +} + +static int lancer_cmd_read_file(struct be_adapter *adapter, u8 *file_name, + u32 buf_len, void *buf) +{ + struct be_dma_mem read_cmd; + u32 read_len = 0, total_read_len = 0, chunk_size; + u32 eof = 0; + u8 addn_status; + int status = 0; + + read_cmd.size = LANCER_READ_FILE_CHUNK; + read_cmd.va = pci_alloc_consistent(adapter->pdev, read_cmd.size, + &read_cmd.dma); + + if (!read_cmd.va) { + dev_err(&adapter->pdev->dev, + "Memory allocation failure while reading dump\n"); + return -ENOMEM; + } + + while ((total_read_len < buf_len) && !eof) { + chunk_size = min_t(u32, (buf_len - total_read_len), + LANCER_READ_FILE_CHUNK); + chunk_size = ALIGN(chunk_size, 4); + status = lancer_cmd_read_object(adapter, &read_cmd, chunk_size, + total_read_len, file_name, + &read_len, &eof, &addn_status); + if (!status) { + memcpy(buf + total_read_len, read_cmd.va, read_len); + total_read_len += read_len; + eof &= LANCER_READ_FILE_EOF_MASK; + } else { + status = -EIO; + break; + } + } + pci_free_consistent(adapter->pdev, read_cmd.size, read_cmd.va, + read_cmd.dma); + + return status; +} + +static int be_get_reg_len(struct net_device *netdev) +{ + struct be_adapter *adapter = netdev_priv(netdev); + u32 log_size = 0; + + if (!check_privilege(adapter, MAX_PRIVILEGES)) + return 0; + + if (be_physfn(adapter)) { + if (lancer_chip(adapter)) + log_size = lancer_cmd_get_file_len(adapter, + LANCER_FW_DUMP_FILE); + else + be_cmd_get_reg_len(adapter, &log_size); + } + return log_size; +} + +static void +be_get_regs(struct net_device *netdev, struct ethtool_regs *regs, void *buf) +{ + struct be_adapter *adapter = netdev_priv(netdev); + + if (be_physfn(adapter)) { + memset(buf, 0, regs->len); + if (lancer_chip(adapter)) + lancer_cmd_read_file(adapter, LANCER_FW_DUMP_FILE, + regs->len, buf); + else + be_cmd_get_regs(adapter, regs->len, buf); + } +} + +static int be_get_coalesce(struct net_device *netdev, + struct ethtool_coalesce *et) +{ + struct be_adapter *adapter = netdev_priv(netdev); + struct be_aic_obj *aic = &adapter->aic_obj[0]; + + et->rx_coalesce_usecs = aic->prev_eqd; + et->rx_coalesce_usecs_high = aic->max_eqd; + et->rx_coalesce_usecs_low = aic->min_eqd; + + et->tx_coalesce_usecs = aic->prev_eqd; + et->tx_coalesce_usecs_high = aic->max_eqd; + et->tx_coalesce_usecs_low = aic->min_eqd; + + et->use_adaptive_rx_coalesce = aic->enable; + et->use_adaptive_tx_coalesce = aic->enable; + + return 0; +} + +/* TX attributes are ignored. Only RX attributes are considered + * eqd cmd is issued in the worker thread. + */ +static int be_set_coalesce(struct net_device *netdev, + struct ethtool_coalesce *et) +{ + struct be_adapter *adapter = netdev_priv(netdev); + struct be_aic_obj *aic = &adapter->aic_obj[0]; + struct be_eq_obj *eqo; + int i; + + for_all_evt_queues(adapter, eqo, i) { + aic->enable = et->use_adaptive_rx_coalesce; + aic->max_eqd = min(et->rx_coalesce_usecs_high, BE_MAX_EQD); + aic->min_eqd = min(et->rx_coalesce_usecs_low, aic->max_eqd); + aic->et_eqd = min(et->rx_coalesce_usecs, aic->max_eqd); + aic->et_eqd = max(aic->et_eqd, aic->min_eqd); + aic++; + } + + return 0; +} + +static void be_get_ethtool_stats(struct net_device *netdev, + struct ethtool_stats *stats, uint64_t *data) +{ + struct be_adapter *adapter = netdev_priv(netdev); + struct be_rx_obj *rxo; + struct be_tx_obj *txo; + void *p; + unsigned int i, j, base = 0, start; + + for (i = 0; i < ETHTOOL_STATS_NUM; i++) { + p = (u8 *)&adapter->drv_stats + et_stats[i].offset; + data[i] = *(u32 *)p; + } + base += ETHTOOL_STATS_NUM; + + for_all_rx_queues(adapter, rxo, j) { + struct be_rx_stats *stats = rx_stats(rxo); + + do { + start = u64_stats_fetch_begin_irq(&stats->sync); + data[base] = stats->rx_bytes; + data[base + 1] = stats->rx_pkts; + } while (u64_stats_fetch_retry_irq(&stats->sync, start)); + + for (i = 2; i < ETHTOOL_RXSTATS_NUM; i++) { + p = (u8 *)stats + et_rx_stats[i].offset; + data[base + i] = *(u32 *)p; + } + base += ETHTOOL_RXSTATS_NUM; + } + + for_all_tx_queues(adapter, txo, j) { + struct be_tx_stats *stats = tx_stats(txo); + + do { + start = u64_stats_fetch_begin_irq(&stats->sync_compl); + data[base] = stats->tx_compl; + } while (u64_stats_fetch_retry_irq(&stats->sync_compl, start)); + + do { + start = u64_stats_fetch_begin_irq(&stats->sync); + for (i = 1; i < ETHTOOL_TXSTATS_NUM; i++) { + p = (u8 *)stats + et_tx_stats[i].offset; + data[base + i] = + (et_tx_stats[i].size == sizeof(u64)) ? + *(u64 *)p : *(u32 *)p; + } + } while (u64_stats_fetch_retry_irq(&stats->sync, start)); + base += ETHTOOL_TXSTATS_NUM; + } +} + +static void be_get_stat_strings(struct net_device *netdev, uint32_t stringset, + uint8_t *data) +{ + struct be_adapter *adapter = netdev_priv(netdev); + int i, j; + + switch (stringset) { + case ETH_SS_STATS: + for (i = 0; i < ETHTOOL_STATS_NUM; i++) { + memcpy(data, et_stats[i].desc, ETH_GSTRING_LEN); + data += ETH_GSTRING_LEN; + } + for (i = 0; i < adapter->num_rx_qs; i++) { + for (j = 0; j < ETHTOOL_RXSTATS_NUM; j++) { + sprintf(data, "rxq%d: %s", i, + et_rx_stats[j].desc); + data += ETH_GSTRING_LEN; + } + } + for (i = 0; i < adapter->num_tx_qs; i++) { + for (j = 0; j < ETHTOOL_TXSTATS_NUM; j++) { + sprintf(data, "txq%d: %s", i, + et_tx_stats[j].desc); + data += ETH_GSTRING_LEN; + } + } + break; + case ETH_SS_TEST: + for (i = 0; i < ETHTOOL_TESTS_NUM; i++) { + memcpy(data, et_self_tests[i], ETH_GSTRING_LEN); + data += ETH_GSTRING_LEN; + } + break; + } +} + +static int be_get_sset_count(struct net_device *netdev, int stringset) +{ + struct be_adapter *adapter = netdev_priv(netdev); + + switch (stringset) { + case ETH_SS_TEST: + return ETHTOOL_TESTS_NUM; + case ETH_SS_STATS: + return ETHTOOL_STATS_NUM + + adapter->num_rx_qs * ETHTOOL_RXSTATS_NUM + + adapter->num_tx_qs * ETHTOOL_TXSTATS_NUM; + default: + return -EINVAL; + } +} + +static u32 be_get_port_type(struct be_adapter *adapter) +{ + u32 port; + + switch (adapter->phy.interface_type) { + case PHY_TYPE_BASET_1GB: + case PHY_TYPE_BASEX_1GB: + case PHY_TYPE_SGMII: + port = PORT_TP; + break; + case PHY_TYPE_SFP_PLUS_10GB: + if (adapter->phy.cable_type & SFP_PLUS_COPPER_CABLE) + port = PORT_DA; + else + port = PORT_FIBRE; + break; + case PHY_TYPE_QSFP: + if (adapter->phy.cable_type & QSFP_PLUS_CR4_CABLE) + port = PORT_DA; + else + port = PORT_FIBRE; + break; + case PHY_TYPE_XFP_10GB: + case PHY_TYPE_SFP_1GB: + port = PORT_FIBRE; + break; + case PHY_TYPE_BASET_10GB: + port = PORT_TP; + break; + default: + port = PORT_OTHER; + } + + return port; +} + +static u32 convert_to_et_setting(struct be_adapter *adapter, u32 if_speeds) +{ + u32 val = 0; + + switch (adapter->phy.interface_type) { + case PHY_TYPE_BASET_1GB: + case PHY_TYPE_BASEX_1GB: + case PHY_TYPE_SGMII: + val |= SUPPORTED_TP; + if (if_speeds & BE_SUPPORTED_SPEED_1GBPS) + val |= SUPPORTED_1000baseT_Full; + if (if_speeds & BE_SUPPORTED_SPEED_100MBPS) + val |= SUPPORTED_100baseT_Full; + if (if_speeds & BE_SUPPORTED_SPEED_10MBPS) + val |= SUPPORTED_10baseT_Full; + break; + case PHY_TYPE_KX4_10GB: + val |= SUPPORTED_Backplane; + if (if_speeds & BE_SUPPORTED_SPEED_1GBPS) + val |= SUPPORTED_1000baseKX_Full; + if (if_speeds & BE_SUPPORTED_SPEED_10GBPS) + val |= SUPPORTED_10000baseKX4_Full; + break; + case PHY_TYPE_KR2_20GB: + val |= SUPPORTED_Backplane; + if (if_speeds & BE_SUPPORTED_SPEED_10GBPS) + val |= SUPPORTED_10000baseKR_Full; + if (if_speeds & BE_SUPPORTED_SPEED_20GBPS) + val |= SUPPORTED_20000baseKR2_Full; + break; + case PHY_TYPE_KR_10GB: + val |= SUPPORTED_Backplane | + SUPPORTED_10000baseKR_Full; + break; + case PHY_TYPE_KR4_40GB: + val |= SUPPORTED_Backplane; + if (if_speeds & BE_SUPPORTED_SPEED_10GBPS) + val |= SUPPORTED_10000baseKR_Full; + if (if_speeds & BE_SUPPORTED_SPEED_40GBPS) + val |= SUPPORTED_40000baseKR4_Full; + break; + case PHY_TYPE_QSFP: + if (if_speeds & BE_SUPPORTED_SPEED_40GBPS) { + switch (adapter->phy.cable_type) { + case QSFP_PLUS_CR4_CABLE: + val |= SUPPORTED_40000baseCR4_Full; + break; + case QSFP_PLUS_LR4_CABLE: + val |= SUPPORTED_40000baseLR4_Full; + break; + default: + val |= SUPPORTED_40000baseSR4_Full; + break; + } + } + case PHY_TYPE_SFP_PLUS_10GB: + case PHY_TYPE_XFP_10GB: + case PHY_TYPE_SFP_1GB: + val |= SUPPORTED_FIBRE; + if (if_speeds & BE_SUPPORTED_SPEED_10GBPS) + val |= SUPPORTED_10000baseT_Full; + if (if_speeds & BE_SUPPORTED_SPEED_1GBPS) + val |= SUPPORTED_1000baseT_Full; + break; + case PHY_TYPE_BASET_10GB: + val |= SUPPORTED_TP; + if (if_speeds & BE_SUPPORTED_SPEED_10GBPS) + val |= SUPPORTED_10000baseT_Full; + if (if_speeds & BE_SUPPORTED_SPEED_1GBPS) + val |= SUPPORTED_1000baseT_Full; + if (if_speeds & BE_SUPPORTED_SPEED_100MBPS) + val |= SUPPORTED_100baseT_Full; + break; + default: + val |= SUPPORTED_TP; + } + + return val; +} + +bool be_pause_supported(struct be_adapter *adapter) +{ + return (adapter->phy.interface_type == PHY_TYPE_SFP_PLUS_10GB || + adapter->phy.interface_type == PHY_TYPE_XFP_10GB) ? + false : true; +} + +static int be_get_settings(struct net_device *netdev, struct ethtool_cmd *ecmd) +{ + struct be_adapter *adapter = netdev_priv(netdev); + u8 link_status; + u16 link_speed = 0; + int status; + u32 auto_speeds; + u32 fixed_speeds; + + if (adapter->phy.link_speed < 0) { + status = be_cmd_link_status_query(adapter, &link_speed, + &link_status, 0); + if (!status) + be_link_status_update(adapter, link_status); + ethtool_cmd_speed_set(ecmd, link_speed); + + status = be_cmd_get_phy_info(adapter); + if (!status) { + auto_speeds = adapter->phy.auto_speeds_supported; + fixed_speeds = adapter->phy.fixed_speeds_supported; + + be_cmd_query_cable_type(adapter); + + ecmd->supported = + convert_to_et_setting(adapter, + auto_speeds | + fixed_speeds); + ecmd->advertising = + convert_to_et_setting(adapter, auto_speeds); + + ecmd->port = be_get_port_type(adapter); + + if (adapter->phy.auto_speeds_supported) { + ecmd->supported |= SUPPORTED_Autoneg; + ecmd->autoneg = AUTONEG_ENABLE; + ecmd->advertising |= ADVERTISED_Autoneg; + } + + ecmd->supported |= SUPPORTED_Pause; + if (be_pause_supported(adapter)) + ecmd->advertising |= ADVERTISED_Pause; + + switch (adapter->phy.interface_type) { + case PHY_TYPE_KR_10GB: + case PHY_TYPE_KX4_10GB: + ecmd->transceiver = XCVR_INTERNAL; + break; + default: + ecmd->transceiver = XCVR_EXTERNAL; + break; + } + } else { + ecmd->port = PORT_OTHER; + ecmd->autoneg = AUTONEG_DISABLE; + ecmd->transceiver = XCVR_DUMMY1; + } + + /* Save for future use */ + adapter->phy.link_speed = ethtool_cmd_speed(ecmd); + adapter->phy.port_type = ecmd->port; + adapter->phy.transceiver = ecmd->transceiver; + adapter->phy.autoneg = ecmd->autoneg; + adapter->phy.advertising = ecmd->advertising; + adapter->phy.supported = ecmd->supported; + } else { + ethtool_cmd_speed_set(ecmd, adapter->phy.link_speed); + ecmd->port = adapter->phy.port_type; + ecmd->transceiver = adapter->phy.transceiver; + ecmd->autoneg = adapter->phy.autoneg; + ecmd->advertising = adapter->phy.advertising; + ecmd->supported = adapter->phy.supported; + } + + ecmd->duplex = netif_carrier_ok(netdev) ? DUPLEX_FULL : DUPLEX_UNKNOWN; + ecmd->phy_address = adapter->port_num; + + return 0; +} + +static void be_get_ringparam(struct net_device *netdev, + struct ethtool_ringparam *ring) +{ + struct be_adapter *adapter = netdev_priv(netdev); + + ring->rx_max_pending = adapter->rx_obj[0].q.len; + ring->rx_pending = adapter->rx_obj[0].q.len; + ring->tx_max_pending = adapter->tx_obj[0].q.len; + ring->tx_pending = adapter->tx_obj[0].q.len; +} + +static void +be_get_pauseparam(struct net_device *netdev, struct ethtool_pauseparam *ecmd) +{ + struct be_adapter *adapter = netdev_priv(netdev); + + be_cmd_get_flow_control(adapter, &ecmd->tx_pause, &ecmd->rx_pause); + ecmd->autoneg = adapter->phy.fc_autoneg; +} + +static int +be_set_pauseparam(struct net_device *netdev, struct ethtool_pauseparam *ecmd) +{ + struct be_adapter *adapter = netdev_priv(netdev); + int status; + + if (ecmd->autoneg != adapter->phy.fc_autoneg) + return -EINVAL; + adapter->tx_fc = ecmd->tx_pause; + adapter->rx_fc = ecmd->rx_pause; + + status = be_cmd_set_flow_control(adapter, + adapter->tx_fc, adapter->rx_fc); + if (status) + dev_warn(&adapter->pdev->dev, "Pause param set failed\n"); + + return be_cmd_status(status); +} + +static int be_set_phys_id(struct net_device *netdev, + enum ethtool_phys_id_state state) +{ + struct be_adapter *adapter = netdev_priv(netdev); + + switch (state) { + case ETHTOOL_ID_ACTIVE: + be_cmd_get_beacon_state(adapter, adapter->hba_port_num, + &adapter->beacon_state); + return 1; /* cycle on/off once per second */ + + case ETHTOOL_ID_ON: + be_cmd_set_beacon_state(adapter, adapter->hba_port_num, 0, 0, + BEACON_STATE_ENABLED); + break; + + case ETHTOOL_ID_OFF: + be_cmd_set_beacon_state(adapter, adapter->hba_port_num, 0, 0, + BEACON_STATE_DISABLED); + break; + + case ETHTOOL_ID_INACTIVE: + be_cmd_set_beacon_state(adapter, adapter->hba_port_num, 0, 0, + adapter->beacon_state); + } + + return 0; +} + +static int be_set_dump(struct net_device *netdev, struct ethtool_dump *dump) +{ + struct be_adapter *adapter = netdev_priv(netdev); + struct device *dev = &adapter->pdev->dev; + int status; + + if (!lancer_chip(adapter) || + !check_privilege(adapter, MAX_PRIVILEGES)) + return -EOPNOTSUPP; + + switch (dump->flag) { + case LANCER_INITIATE_FW_DUMP: + status = lancer_initiate_dump(adapter); + if (!status) + dev_info(dev, "FW dump initiated successfully\n"); + break; + case LANCER_DELETE_FW_DUMP: + status = lancer_delete_dump(adapter); + if (!status) + dev_info(dev, "FW dump deleted successfully\n"); + break; + default: + dev_err(dev, "Invalid dump level: 0x%x\n", dump->flag); + return -EINVAL; + } + return status; +} + +static void be_get_wol(struct net_device *netdev, struct ethtool_wolinfo *wol) +{ + struct be_adapter *adapter = netdev_priv(netdev); + + if (adapter->wol_cap & BE_WOL_CAP) { + wol->supported |= WAKE_MAGIC; + if (adapter->wol_en) + wol->wolopts |= WAKE_MAGIC; + } else { + wol->wolopts = 0; + } + memset(&wol->sopass, 0, sizeof(wol->sopass)); +} + +static int be_set_wol(struct net_device *netdev, struct ethtool_wolinfo *wol) +{ + struct be_adapter *adapter = netdev_priv(netdev); + + if (wol->wolopts & ~WAKE_MAGIC) + return -EOPNOTSUPP; + + if (!(adapter->wol_cap & BE_WOL_CAP)) { + dev_warn(&adapter->pdev->dev, "WOL not supported\n"); + return -EOPNOTSUPP; + } + + if (wol->wolopts & WAKE_MAGIC) + adapter->wol_en = true; + else + adapter->wol_en = false; + + return 0; +} + +static int be_test_ddr_dma(struct be_adapter *adapter) +{ + int ret, i; + struct be_dma_mem ddrdma_cmd; + static const u64 pattern[2] = { + 0x5a5a5a5a5a5a5a5aULL, 0xa5a5a5a5a5a5a5a5ULL + }; + + ddrdma_cmd.size = sizeof(struct be_cmd_req_ddrdma_test); + ddrdma_cmd.va = dma_alloc_coherent(&adapter->pdev->dev, ddrdma_cmd.size, + &ddrdma_cmd.dma, GFP_KERNEL); + if (!ddrdma_cmd.va) + return -ENOMEM; + + for (i = 0; i < 2; i++) { + ret = be_cmd_ddr_dma_test(adapter, pattern[i], + 4096, &ddrdma_cmd); + if (ret != 0) + goto err; + } + +err: + dma_free_coherent(&adapter->pdev->dev, ddrdma_cmd.size, ddrdma_cmd.va, + ddrdma_cmd.dma); + return be_cmd_status(ret); +} + +static u64 be_loopback_test(struct be_adapter *adapter, u8 loopback_type, + u64 *status) +{ + be_cmd_set_loopback(adapter, adapter->hba_port_num, loopback_type, 1); + *status = be_cmd_loopback_test(adapter, adapter->hba_port_num, + loopback_type, 1500, 2, 0xabc); + be_cmd_set_loopback(adapter, adapter->hba_port_num, BE_NO_LOOPBACK, 1); + return *status; +} + +static void be_self_test(struct net_device *netdev, struct ethtool_test *test, + u64 *data) +{ + struct be_adapter *adapter = netdev_priv(netdev); + int status; + u8 link_status = 0; + + if (adapter->function_caps & BE_FUNCTION_CAPS_SUPER_NIC) { + dev_err(&adapter->pdev->dev, "Self test not supported\n"); + test->flags |= ETH_TEST_FL_FAILED; + return; + } + + memset(data, 0, sizeof(u64) * ETHTOOL_TESTS_NUM); + + if (test->flags & ETH_TEST_FL_OFFLINE) { + if (be_loopback_test(adapter, BE_MAC_LOOPBACK, &data[0]) != 0) + test->flags |= ETH_TEST_FL_FAILED; + + if (be_loopback_test(adapter, BE_PHY_LOOPBACK, &data[1]) != 0) + test->flags |= ETH_TEST_FL_FAILED; + + if (test->flags & ETH_TEST_FL_EXTERNAL_LB) { + if (be_loopback_test(adapter, BE_ONE_PORT_EXT_LOOPBACK, + &data[2]) != 0) + test->flags |= ETH_TEST_FL_FAILED; + test->flags |= ETH_TEST_FL_EXTERNAL_LB_DONE; + } + } + + if (!lancer_chip(adapter) && be_test_ddr_dma(adapter) != 0) { + data[3] = 1; + test->flags |= ETH_TEST_FL_FAILED; + } + + status = be_cmd_link_status_query(adapter, NULL, &link_status, 0); + if (status) { + test->flags |= ETH_TEST_FL_FAILED; + data[4] = -1; + } else if (!link_status) { + test->flags |= ETH_TEST_FL_FAILED; + data[4] = 1; + } +} + +static int be_do_flash(struct net_device *netdev, struct ethtool_flash *efl) +{ + struct be_adapter *adapter = netdev_priv(netdev); + + return be_load_fw(adapter, efl->data); +} + +static int be_get_eeprom_len(struct net_device *netdev) +{ + struct be_adapter *adapter = netdev_priv(netdev); + + if (!check_privilege(adapter, MAX_PRIVILEGES)) + return 0; + + if (lancer_chip(adapter)) { + if (be_physfn(adapter)) + return lancer_cmd_get_file_len(adapter, + LANCER_VPD_PF_FILE); + else + return lancer_cmd_get_file_len(adapter, + LANCER_VPD_VF_FILE); + } else { + return BE_READ_SEEPROM_LEN; + } +} + +static int be_read_eeprom(struct net_device *netdev, + struct ethtool_eeprom *eeprom, uint8_t *data) +{ + struct be_adapter *adapter = netdev_priv(netdev); + struct be_dma_mem eeprom_cmd; + struct be_cmd_resp_seeprom_read *resp; + int status; + + if (!eeprom->len) + return -EINVAL; + + if (lancer_chip(adapter)) { + if (be_physfn(adapter)) + return lancer_cmd_read_file(adapter, LANCER_VPD_PF_FILE, + eeprom->len, data); + else + return lancer_cmd_read_file(adapter, LANCER_VPD_VF_FILE, + eeprom->len, data); + } + + eeprom->magic = BE_VENDOR_ID | (adapter->pdev->device<<16); + + memset(&eeprom_cmd, 0, sizeof(struct be_dma_mem)); + eeprom_cmd.size = sizeof(struct be_cmd_req_seeprom_read); + eeprom_cmd.va = dma_alloc_coherent(&adapter->pdev->dev, eeprom_cmd.size, + &eeprom_cmd.dma, GFP_KERNEL); + + if (!eeprom_cmd.va) + return -ENOMEM; + + status = be_cmd_get_seeprom_data(adapter, &eeprom_cmd); + + if (!status) { + resp = eeprom_cmd.va; + memcpy(data, resp->seeprom_data + eeprom->offset, eeprom->len); + } + dma_free_coherent(&adapter->pdev->dev, eeprom_cmd.size, eeprom_cmd.va, + eeprom_cmd.dma); + + return be_cmd_status(status); +} + +static u32 be_get_msg_level(struct net_device *netdev) +{ + struct be_adapter *adapter = netdev_priv(netdev); + + return adapter->msg_enable; +} + +static void be_set_msg_level(struct net_device *netdev, u32 level) +{ + struct be_adapter *adapter = netdev_priv(netdev); + + if (adapter->msg_enable == level) + return; + + if ((level & NETIF_MSG_HW) != (adapter->msg_enable & NETIF_MSG_HW)) + if (BEx_chip(adapter)) + be_cmd_set_fw_log_level(adapter, level & NETIF_MSG_HW ? + FW_LOG_LEVEL_DEFAULT : + FW_LOG_LEVEL_FATAL); + adapter->msg_enable = level; +} + +static u64 be_get_rss_hash_opts(struct be_adapter *adapter, u64 flow_type) +{ + u64 data = 0; + + switch (flow_type) { + case TCP_V4_FLOW: + if (adapter->rss_info.rss_flags & RSS_ENABLE_IPV4) + data |= RXH_IP_DST | RXH_IP_SRC; + if (adapter->rss_info.rss_flags & RSS_ENABLE_TCP_IPV4) + data |= RXH_L4_B_0_1 | RXH_L4_B_2_3; + break; + case UDP_V4_FLOW: + if (adapter->rss_info.rss_flags & RSS_ENABLE_IPV4) + data |= RXH_IP_DST | RXH_IP_SRC; + if (adapter->rss_info.rss_flags & RSS_ENABLE_UDP_IPV4) + data |= RXH_L4_B_0_1 | RXH_L4_B_2_3; + break; + case TCP_V6_FLOW: + if (adapter->rss_info.rss_flags & RSS_ENABLE_IPV6) + data |= RXH_IP_DST | RXH_IP_SRC; + if (adapter->rss_info.rss_flags & RSS_ENABLE_TCP_IPV6) + data |= RXH_L4_B_0_1 | RXH_L4_B_2_3; + break; + case UDP_V6_FLOW: + if (adapter->rss_info.rss_flags & RSS_ENABLE_IPV6) + data |= RXH_IP_DST | RXH_IP_SRC; + if (adapter->rss_info.rss_flags & RSS_ENABLE_UDP_IPV6) + data |= RXH_L4_B_0_1 | RXH_L4_B_2_3; + break; + } + + return data; +} + +static int be_get_rxnfc(struct net_device *netdev, struct ethtool_rxnfc *cmd, + u32 *rule_locs) +{ + struct be_adapter *adapter = netdev_priv(netdev); + + if (!be_multi_rxq(adapter)) { + dev_info(&adapter->pdev->dev, + "ethtool::get_rxnfc: RX flow hashing is disabled\n"); + return -EINVAL; + } + + switch (cmd->cmd) { + case ETHTOOL_GRXFH: + cmd->data = be_get_rss_hash_opts(adapter, cmd->flow_type); + break; + case ETHTOOL_GRXRINGS: + cmd->data = adapter->num_rx_qs - 1; + break; + default: + return -EINVAL; + } + + return 0; +} + +static int be_set_rss_hash_opts(struct be_adapter *adapter, + struct ethtool_rxnfc *cmd) +{ + struct be_rx_obj *rxo; + int status = 0, i, j; + u8 rsstable[128]; + u32 rss_flags = adapter->rss_info.rss_flags; + + if (cmd->data != L3_RSS_FLAGS && + cmd->data != (L3_RSS_FLAGS | L4_RSS_FLAGS)) + return -EINVAL; + + switch (cmd->flow_type) { + case TCP_V4_FLOW: + if (cmd->data == L3_RSS_FLAGS) + rss_flags &= ~RSS_ENABLE_TCP_IPV4; + else if (cmd->data == (L3_RSS_FLAGS | L4_RSS_FLAGS)) + rss_flags |= RSS_ENABLE_IPV4 | + RSS_ENABLE_TCP_IPV4; + break; + case TCP_V6_FLOW: + if (cmd->data == L3_RSS_FLAGS) + rss_flags &= ~RSS_ENABLE_TCP_IPV6; + else if (cmd->data == (L3_RSS_FLAGS | L4_RSS_FLAGS)) + rss_flags |= RSS_ENABLE_IPV6 | + RSS_ENABLE_TCP_IPV6; + break; + case UDP_V4_FLOW: + if ((cmd->data == (L3_RSS_FLAGS | L4_RSS_FLAGS)) && + BEx_chip(adapter)) + return -EINVAL; + + if (cmd->data == L3_RSS_FLAGS) + rss_flags &= ~RSS_ENABLE_UDP_IPV4; + else if (cmd->data == (L3_RSS_FLAGS | L4_RSS_FLAGS)) + rss_flags |= RSS_ENABLE_IPV4 | + RSS_ENABLE_UDP_IPV4; + break; + case UDP_V6_FLOW: + if ((cmd->data == (L3_RSS_FLAGS | L4_RSS_FLAGS)) && + BEx_chip(adapter)) + return -EINVAL; + + if (cmd->data == L3_RSS_FLAGS) + rss_flags &= ~RSS_ENABLE_UDP_IPV6; + else if (cmd->data == (L3_RSS_FLAGS | L4_RSS_FLAGS)) + rss_flags |= RSS_ENABLE_IPV6 | + RSS_ENABLE_UDP_IPV6; + break; + default: + return -EINVAL; + } + + if (rss_flags == adapter->rss_info.rss_flags) + return status; + + if (be_multi_rxq(adapter)) { + for (j = 0; j < 128; j += adapter->num_rx_qs - 1) { + for_all_rss_queues(adapter, rxo, i) { + if ((j + i) >= 128) + break; + rsstable[j + i] = rxo->rss_id; + } + } + } + + status = be_cmd_rss_config(adapter, adapter->rss_info.rsstable, + rss_flags, 128, adapter->rss_info.rss_hkey); + if (!status) + adapter->rss_info.rss_flags = rss_flags; + + return be_cmd_status(status); +} + +static int be_set_rxnfc(struct net_device *netdev, struct ethtool_rxnfc *cmd) +{ + struct be_adapter *adapter = netdev_priv(netdev); + int status = 0; + + if (!be_multi_rxq(adapter)) { + dev_err(&adapter->pdev->dev, + "ethtool::set_rxnfc: RX flow hashing is disabled\n"); + return -EINVAL; + } + + switch (cmd->cmd) { + case ETHTOOL_SRXFH: + status = be_set_rss_hash_opts(adapter, cmd); + break; + default: + return -EINVAL; + } + + return status; +} + +static void be_get_channels(struct net_device *netdev, + struct ethtool_channels *ch) +{ + struct be_adapter *adapter = netdev_priv(netdev); + + ch->combined_count = adapter->num_evt_qs; + ch->max_combined = be_max_qs(adapter); +} + +static int be_set_channels(struct net_device *netdev, + struct ethtool_channels *ch) +{ + struct be_adapter *adapter = netdev_priv(netdev); + int status; + + if (ch->rx_count || ch->tx_count || ch->other_count || + !ch->combined_count || ch->combined_count > be_max_qs(adapter)) + return -EINVAL; + + adapter->cfg_num_qs = ch->combined_count; + + status = be_update_queues(adapter); + return be_cmd_status(status); +} + +static u32 be_get_rxfh_indir_size(struct net_device *netdev) +{ + return RSS_INDIR_TABLE_LEN; +} + +static u32 be_get_rxfh_key_size(struct net_device *netdev) +{ + return RSS_HASH_KEY_LEN; +} + +static int be_get_rxfh(struct net_device *netdev, u32 *indir, u8 *hkey) +{ + struct be_adapter *adapter = netdev_priv(netdev); + int i; + struct rss_info *rss = &adapter->rss_info; + + if (indir) { + for (i = 0; i < RSS_INDIR_TABLE_LEN; i++) + indir[i] = rss->rss_queue[i]; + } + + if (hkey) + memcpy(hkey, rss->rss_hkey, RSS_HASH_KEY_LEN); + + return 0; +} + +static int be_set_rxfh(struct net_device *netdev, const u32 *indir, + const u8 *hkey) +{ + int rc = 0, i, j; + struct be_adapter *adapter = netdev_priv(netdev); + u8 rsstable[RSS_INDIR_TABLE_LEN]; + + if (indir) { + struct be_rx_obj *rxo; + + for (i = 0; i < RSS_INDIR_TABLE_LEN; i++) { + j = indir[i]; + rxo = &adapter->rx_obj[j]; + rsstable[i] = rxo->rss_id; + adapter->rss_info.rss_queue[i] = j; + } + } else { + memcpy(rsstable, adapter->rss_info.rsstable, + RSS_INDIR_TABLE_LEN); + } + + if (!hkey) + hkey = adapter->rss_info.rss_hkey; + + rc = be_cmd_rss_config(adapter, rsstable, + adapter->rss_info.rss_flags, + RSS_INDIR_TABLE_LEN, hkey); + if (rc) { + adapter->rss_info.rss_flags = RSS_ENABLE_NONE; + return -EIO; + } + memcpy(adapter->rss_info.rss_hkey, hkey, RSS_HASH_KEY_LEN); + memcpy(adapter->rss_info.rsstable, rsstable, + RSS_INDIR_TABLE_LEN); + return 0; +} + +static int be_get_module_info(struct net_device *netdev, + struct ethtool_modinfo *modinfo) +{ + struct be_adapter *adapter = netdev_priv(netdev); + u8 page_data[PAGE_DATA_LEN]; + int status; + + if (!check_privilege(adapter, MAX_PRIVILEGES)) + return -EOPNOTSUPP; + + status = be_cmd_read_port_transceiver_data(adapter, TR_PAGE_A0, + page_data); + if (!status) { + if (!page_data[SFP_PLUS_SFF_8472_COMP]) { + modinfo->type = ETH_MODULE_SFF_8079; + modinfo->eeprom_len = PAGE_DATA_LEN; + } else { + modinfo->type = ETH_MODULE_SFF_8472; + modinfo->eeprom_len = 2 * PAGE_DATA_LEN; + } + } + return be_cmd_status(status); +} + +static int be_get_module_eeprom(struct net_device *netdev, + struct ethtool_eeprom *eeprom, u8 *data) +{ + struct be_adapter *adapter = netdev_priv(netdev); + int status; + + if (!check_privilege(adapter, MAX_PRIVILEGES)) + return -EOPNOTSUPP; + + status = be_cmd_read_port_transceiver_data(adapter, TR_PAGE_A0, + data); + if (status) + goto err; + + if (eeprom->offset + eeprom->len > PAGE_DATA_LEN) { + status = be_cmd_read_port_transceiver_data(adapter, + TR_PAGE_A2, + data + + PAGE_DATA_LEN); + if (status) + goto err; + } + if (eeprom->offset) + memcpy(data, data + eeprom->offset, eeprom->len); +err: + return be_cmd_status(status); +} + +const struct ethtool_ops be_ethtool_ops = { + .get_settings = be_get_settings, + .get_drvinfo = be_get_drvinfo, + .get_wol = be_get_wol, + .set_wol = be_set_wol, + .get_link = ethtool_op_get_link, + .get_eeprom_len = be_get_eeprom_len, + .get_eeprom = be_read_eeprom, + .get_coalesce = be_get_coalesce, + .set_coalesce = be_set_coalesce, + .get_ringparam = be_get_ringparam, + .get_pauseparam = be_get_pauseparam, + .set_pauseparam = be_set_pauseparam, + .get_strings = be_get_stat_strings, + .set_phys_id = be_set_phys_id, + .set_dump = be_set_dump, + .get_msglevel = be_get_msg_level, + .set_msglevel = be_set_msg_level, + .get_sset_count = be_get_sset_count, + .get_ethtool_stats = be_get_ethtool_stats, + .get_regs_len = be_get_reg_len, + .get_regs = be_get_regs, + .flash_device = be_do_flash, + .self_test = be_self_test, + .get_rxnfc = be_get_rxnfc, + .set_rxnfc = be_set_rxnfc, + .get_rxfh_indir_size = be_get_rxfh_indir_size, + .get_rxfh_key_size = be_get_rxfh_key_size, + .get_rxfh = be_get_rxfh, + .set_rxfh = be_set_rxfh, + .get_channels = be_get_channels, + .set_channels = be_set_channels, + .get_module_info = be_get_module_info, + .get_module_eeprom = be_get_module_eeprom +}; diff --git a/drivers/net/ethernet/emulex/benet/be_hw.h b/drivers/net/ethernet/emulex/benet/be_hw.h new file mode 100644 index 0000000..295ee08 --- /dev/null +++ b/drivers/net/ethernet/emulex/benet/be_hw.h @@ -0,0 +1,572 @@ +/* + * Copyright (C) 2005 - 2014 Emulex + * All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. The full GNU General + * Public License is included in this distribution in the file called COPYING. + * + * Contact Information: + * linux-drivers@emulex.com + * + * Emulex + * 3333 Susan Street + * Costa Mesa, CA 92626 + */ + +/********* Mailbox door bell *************/ +/* Used for driver communication with the FW. + * The software must write this register twice to post any command. First, + * it writes the register with hi=1 and the upper bits of the physical address + * for the MAILBOX structure. Software must poll the ready bit until this + * is acknowledged. Then, sotware writes the register with hi=0 with the lower + * bits in the address. It must poll the ready bit until the command is + * complete. Upon completion, the MAILBOX will contain a valid completion + * queue entry. + */ +#define MPU_MAILBOX_DB_OFFSET 0x160 +#define MPU_MAILBOX_DB_RDY_MASK 0x1 /* bit 0 */ +#define MPU_MAILBOX_DB_HI_MASK 0x2 /* bit 1 */ + +#define MPU_EP_CONTROL 0 + +/********** MPU semphore: used for SH & BE *************/ +#define SLIPORT_SEMAPHORE_OFFSET_BEx 0xac /* CSR BAR offset */ +#define SLIPORT_SEMAPHORE_OFFSET_SH 0x94 /* PCI-CFG offset */ +#define POST_STAGE_MASK 0x0000FFFF +#define POST_ERR_MASK 0x1 +#define POST_ERR_SHIFT 31 + +/* MPU semphore POST stage values */ +#define POST_STAGE_AWAITING_HOST_RDY 0x1 /* FW awaiting goahead from host */ +#define POST_STAGE_HOST_RDY 0x2 /* Host has given go-ahed to FW */ +#define POST_STAGE_BE_RESET 0x3 /* Host wants to reset chip */ +#define POST_STAGE_ARMFW_RDY 0xc000 /* FW is done with POST */ + + +/* Lancer SLIPORT registers */ +#define SLIPORT_STATUS_OFFSET 0x404 +#define SLIPORT_CONTROL_OFFSET 0x408 +#define SLIPORT_ERROR1_OFFSET 0x40C +#define SLIPORT_ERROR2_OFFSET 0x410 +#define PHYSDEV_CONTROL_OFFSET 0x414 + +#define SLIPORT_STATUS_ERR_MASK 0x80000000 +#define SLIPORT_STATUS_DIP_MASK 0x02000000 +#define SLIPORT_STATUS_RN_MASK 0x01000000 +#define SLIPORT_STATUS_RDY_MASK 0x00800000 +#define SLI_PORT_CONTROL_IP_MASK 0x08000000 +#define PHYSDEV_CONTROL_FW_RESET_MASK 0x00000002 +#define PHYSDEV_CONTROL_DD_MASK 0x00000004 +#define PHYSDEV_CONTROL_INP_MASK 0x40000000 + +#define SLIPORT_ERROR_NO_RESOURCE1 0x2 +#define SLIPORT_ERROR_NO_RESOURCE2 0x9 + +#define SLIPORT_ERROR_FW_RESET1 0x2 +#define SLIPORT_ERROR_FW_RESET2 0x0 + +/********* Memory BAR register ************/ +#define PCICFG_MEMBAR_CTRL_INT_CTRL_OFFSET 0xfc +/* Host Interrupt Enable, if set interrupts are enabled although "PCI Interrupt + * Disable" may still globally block interrupts in addition to individual + * interrupt masks; a mechanism for the device driver to block all interrupts + * atomically without having to arbitrate for the PCI Interrupt Disable bit + * with the OS. + */ +#define MEMBAR_CTRL_INT_CTRL_HOSTINTR_MASK (1 << 29) /* bit 29 */ + +/********* PCI Function Capability *********/ +#define BE_FUNCTION_CAPS_RSS 0x2 +#define BE_FUNCTION_CAPS_SUPER_NIC 0x40 + +/********* Power management (WOL) **********/ +#define PCICFG_PM_CONTROL_OFFSET 0x44 +#define PCICFG_PM_CONTROL_MASK 0x108 /* bits 3 & 8 */ + +/********* Online Control Registers *******/ +#define PCICFG_ONLINE0 0xB0 +#define PCICFG_ONLINE1 0xB4 + +/********* UE Status and Mask Registers ***/ +#define PCICFG_UE_STATUS_LOW 0xA0 +#define PCICFG_UE_STATUS_HIGH 0xA4 +#define PCICFG_UE_STATUS_LOW_MASK 0xA8 +#define PCICFG_UE_STATUS_HI_MASK 0xAC + +/******** SLI_INTF ***********************/ +#define SLI_INTF_REG_OFFSET 0x58 +#define SLI_INTF_VALID_MASK 0xE0000000 +#define SLI_INTF_VALID 0xC0000000 +#define SLI_INTF_HINT2_MASK 0x1F000000 +#define SLI_INTF_HINT2_SHIFT 24 +#define SLI_INTF_HINT1_MASK 0x00FF0000 +#define SLI_INTF_HINT1_SHIFT 16 +#define SLI_INTF_FAMILY_MASK 0x00000F00 +#define SLI_INTF_FAMILY_SHIFT 8 +#define SLI_INTF_IF_TYPE_MASK 0x0000F000 +#define SLI_INTF_IF_TYPE_SHIFT 12 +#define SLI_INTF_REV_MASK 0x000000F0 +#define SLI_INTF_REV_SHIFT 4 +#define SLI_INTF_FT_MASK 0x00000001 + +#define SLI_INTF_TYPE_2 2 +#define SLI_INTF_TYPE_3 3 + +/********* ISR0 Register offset **********/ +#define CEV_ISR0_OFFSET 0xC18 +#define CEV_ISR_SIZE 4 + +/********* Event Q door bell *************/ +#define DB_EQ_OFFSET DB_CQ_OFFSET +#define DB_EQ_RING_ID_MASK 0x1FF /* bits 0 - 8 */ +#define DB_EQ_RING_ID_EXT_MASK 0x3e00 /* bits 9-13 */ +#define DB_EQ_RING_ID_EXT_MASK_SHIFT (2) /* qid bits 9-13 placing at 11-15 */ + +/* Clear the interrupt for this eq */ +#define DB_EQ_CLR_SHIFT (9) /* bit 9 */ +/* Must be 1 */ +#define DB_EQ_EVNT_SHIFT (10) /* bit 10 */ +/* Number of event entries processed */ +#define DB_EQ_NUM_POPPED_SHIFT (16) /* bits 16 - 28 */ +/* Rearm bit */ +#define DB_EQ_REARM_SHIFT (29) /* bit 29 */ + +/********* Compl Q door bell *************/ +#define DB_CQ_OFFSET 0x120 +#define DB_CQ_RING_ID_MASK 0x3FF /* bits 0 - 9 */ +#define DB_CQ_RING_ID_EXT_MASK 0x7C00 /* bits 10-14 */ +#define DB_CQ_RING_ID_EXT_MASK_SHIFT (1) /* qid bits 10-14 + placing at 11-15 */ + +/* Number of event entries processed */ +#define DB_CQ_NUM_POPPED_SHIFT (16) /* bits 16 - 28 */ +/* Rearm bit */ +#define DB_CQ_REARM_SHIFT (29) /* bit 29 */ + +/********** TX ULP door bell *************/ +#define DB_TXULP1_OFFSET 0x60 +#define DB_TXULP_RING_ID_MASK 0x7FF /* bits 0 - 10 */ +/* Number of tx entries posted */ +#define DB_TXULP_NUM_POSTED_SHIFT (16) /* bits 16 - 29 */ +#define DB_TXULP_NUM_POSTED_MASK 0x3FFF /* bits 16 - 29 */ + +/********** RQ(erx) door bell ************/ +#define DB_RQ_OFFSET 0x100 +#define DB_RQ_RING_ID_MASK 0x3FF /* bits 0 - 9 */ +/* Number of rx frags posted */ +#define DB_RQ_NUM_POSTED_SHIFT (24) /* bits 24 - 31 */ + +/********** MCC door bell ************/ +#define DB_MCCQ_OFFSET 0x140 +#define DB_MCCQ_RING_ID_MASK 0x7FF /* bits 0 - 10 */ +/* Number of entries posted */ +#define DB_MCCQ_NUM_POSTED_SHIFT (16) /* bits 16 - 29 */ + +/********** SRIOV VF PCICFG OFFSET ********/ +#define SRIOV_VF_PCICFG_OFFSET (4096) + +/********** FAT TABLE ********/ +#define RETRIEVE_FAT 0 +#define QUERY_FAT 1 + +/* Flashrom related descriptors */ +#define MAX_FLASH_COMP 32 +#define IMAGE_TYPE_FIRMWARE 160 +#define IMAGE_TYPE_BOOTCODE 224 +#define IMAGE_TYPE_OPTIONROM 32 + +#define NUM_FLASHDIR_ENTRIES 32 + +#define OPTYPE_ISCSI_ACTIVE 0 +#define OPTYPE_REDBOOT 1 +#define OPTYPE_BIOS 2 +#define OPTYPE_PXE_BIOS 3 +#define OPTYPE_FCOE_BIOS 8 +#define OPTYPE_ISCSI_BACKUP 9 +#define OPTYPE_FCOE_FW_ACTIVE 10 +#define OPTYPE_FCOE_FW_BACKUP 11 +#define OPTYPE_NCSI_FW 13 +#define OPTYPE_REDBOOT_DIR 18 +#define OPTYPE_REDBOOT_CONFIG 19 +#define OPTYPE_SH_PHY_FW 21 +#define OPTYPE_FLASHISM_JUMPVECTOR 22 +#define OPTYPE_UFI_DIR 23 +#define OPTYPE_PHY_FW 99 +#define TN_8022 13 + +#define FLASHROM_OPER_PHY_FLASH 9 +#define FLASHROM_OPER_PHY_SAVE 10 +#define FLASHROM_OPER_FLASH 1 +#define FLASHROM_OPER_SAVE 2 +#define FLASHROM_OPER_REPORT 4 + +#define FLASH_IMAGE_MAX_SIZE_g2 (1310720) /* Max firmware image size */ +#define FLASH_BIOS_IMAGE_MAX_SIZE_g2 (262144) /* Max OPTION ROM image sz */ +#define FLASH_REDBOOT_IMAGE_MAX_SIZE_g2 (262144) /* Max Redboot image sz */ +#define FLASH_IMAGE_MAX_SIZE_g3 (2097152) /* Max firmware image size */ +#define FLASH_BIOS_IMAGE_MAX_SIZE_g3 (524288) /* Max OPTION ROM image sz */ +#define FLASH_REDBOOT_IMAGE_MAX_SIZE_g3 (1048576) /* Max Redboot image sz */ +#define FLASH_NCSI_IMAGE_MAX_SIZE_g3 (262144) +#define FLASH_PHY_FW_IMAGE_MAX_SIZE_g3 262144 + +#define FLASH_NCSI_MAGIC (0x16032009) +#define FLASH_NCSI_DISABLED (0) +#define FLASH_NCSI_ENABLED (1) + +#define FLASH_NCSI_BITFILE_HDR_OFFSET (0x600000) + +/* Offsets for components on Flash. */ +#define FLASH_iSCSI_PRIMARY_IMAGE_START_g2 (1048576) +#define FLASH_iSCSI_BACKUP_IMAGE_START_g2 (2359296) +#define FLASH_FCoE_PRIMARY_IMAGE_START_g2 (3670016) +#define FLASH_FCoE_BACKUP_IMAGE_START_g2 (4980736) +#define FLASH_iSCSI_BIOS_START_g2 (7340032) +#define FLASH_PXE_BIOS_START_g2 (7864320) +#define FLASH_FCoE_BIOS_START_g2 (524288) +#define FLASH_REDBOOT_START_g2 (0) + +#define FLASH_NCSI_START_g3 (15990784) +#define FLASH_iSCSI_PRIMARY_IMAGE_START_g3 (2097152) +#define FLASH_iSCSI_BACKUP_IMAGE_START_g3 (4194304) +#define FLASH_FCoE_PRIMARY_IMAGE_START_g3 (6291456) +#define FLASH_FCoE_BACKUP_IMAGE_START_g3 (8388608) +#define FLASH_iSCSI_BIOS_START_g3 (12582912) +#define FLASH_PXE_BIOS_START_g3 (13107200) +#define FLASH_FCoE_BIOS_START_g3 (13631488) +#define FLASH_REDBOOT_START_g3 (262144) +#define FLASH_PHY_FW_START_g3 1310720 + +#define IMAGE_NCSI 16 +#define IMAGE_OPTION_ROM_PXE 32 +#define IMAGE_OPTION_ROM_FCoE 33 +#define IMAGE_OPTION_ROM_ISCSI 34 +#define IMAGE_FLASHISM_JUMPVECTOR 48 +#define IMAGE_FLASH_ISM 49 +#define IMAGE_JUMP_VECTOR 50 +#define IMAGE_FIRMWARE_iSCSI 160 +#define IMAGE_FIRMWARE_COMP_iSCSI 161 +#define IMAGE_FIRMWARE_FCoE 162 +#define IMAGE_FIRMWARE_COMP_FCoE 163 +#define IMAGE_FIRMWARE_BACKUP_iSCSI 176 +#define IMAGE_FIRMWARE_BACKUP_COMP_iSCSI 177 +#define IMAGE_FIRMWARE_BACKUP_FCoE 178 +#define IMAGE_FIRMWARE_BACKUP_COMP_FCoE 179 +#define IMAGE_FIRMWARE_PHY 192 +#define IMAGE_REDBOOT_DIR 208 +#define IMAGE_REDBOOT_CONFIG 209 +#define IMAGE_UFI_DIR 210 +#define IMAGE_BOOT_CODE 224 + +/************* Rx Packet Type Encoding **************/ +#define BE_UNICAST_PACKET 0 +#define BE_MULTICAST_PACKET 1 +#define BE_BROADCAST_PACKET 2 +#define BE_RSVD_PACKET 3 + +/* + * BE descriptors: host memory data structures whose formats + * are hardwired in BE silicon. + */ +/* Event Queue Descriptor */ +#define EQ_ENTRY_VALID_MASK 0x1 /* bit 0 */ +#define EQ_ENTRY_RES_ID_MASK 0xFFFF /* bits 16 - 31 */ +#define EQ_ENTRY_RES_ID_SHIFT 16 + +struct be_eq_entry { + u32 evt; +}; + +/* TX Queue Descriptor */ +#define ETH_WRB_FRAG_LEN_MASK 0xFFFF +struct be_eth_wrb { + u32 frag_pa_hi; /* dword 0 */ + u32 frag_pa_lo; /* dword 1 */ + u32 rsvd0; /* dword 2 */ + u32 frag_len; /* dword 3: bits 0 - 15 */ +} __packed; + +/* Pseudo amap definition for eth_hdr_wrb in which each bit of the + * actual structure is defined as a byte : used to calculate + * offset/shift/mask of each field */ +struct amap_eth_hdr_wrb { + u8 rsvd0[32]; /* dword 0 */ + u8 rsvd1[32]; /* dword 1 */ + u8 complete; /* dword 2 */ + u8 event; + u8 crc; + u8 forward; + u8 lso6; + u8 mgmt; + u8 ipcs; + u8 udpcs; + u8 tcpcs; + u8 lso; + u8 vlan; + u8 gso[2]; + u8 num_wrb[5]; + u8 lso_mss[14]; + u8 len[16]; /* dword 3 */ + u8 vlan_tag[16]; +} __packed; + +struct be_eth_hdr_wrb { + u32 dw[4]; +}; + +/********* Tx Compl Status Encoding *********/ +#define BE_TX_COMP_HDR_PARSE_ERR 0x2 +#define BE_TX_COMP_NDMA_ERR 0x3 +#define BE_TX_COMP_ACL_ERR 0x5 + +#define LANCER_TX_COMP_LSO_ERR 0x1 +#define LANCER_TX_COMP_HSW_DROP_MAC_ERR 0x3 +#define LANCER_TX_COMP_HSW_DROP_VLAN_ERR 0x5 +#define LANCER_TX_COMP_QINQ_ERR 0x7 +#define LANCER_TX_COMP_PARITY_ERR 0xb +#define LANCER_TX_COMP_DMA_ERR 0xd + +/* TX Compl Queue Descriptor */ + +/* Pseudo amap definition for eth_tx_compl in which each bit of the + * actual structure is defined as a byte: used to calculate + * offset/shift/mask of each field */ +struct amap_eth_tx_compl { + u8 wrb_index[16]; /* dword 0 */ + u8 ct[2]; /* dword 0 */ + u8 port[2]; /* dword 0 */ + u8 rsvd0[8]; /* dword 0 */ + u8 status[4]; /* dword 0 */ + u8 user_bytes[16]; /* dword 1 */ + u8 nwh_bytes[8]; /* dword 1 */ + u8 lso; /* dword 1 */ + u8 cast_enc[2]; /* dword 1 */ + u8 rsvd1[5]; /* dword 1 */ + u8 rsvd2[32]; /* dword 2 */ + u8 pkts[16]; /* dword 3 */ + u8 ringid[11]; /* dword 3 */ + u8 hash_val[4]; /* dword 3 */ + u8 valid; /* dword 3 */ +} __packed; + +struct be_eth_tx_compl { + u32 dw[4]; +}; + +/* RX Queue Descriptor */ +struct be_eth_rx_d { + u32 fragpa_hi; + u32 fragpa_lo; +}; + +/* RX Compl Queue Descriptor */ + +/* Pseudo amap definition for BE2 and BE3 legacy mode eth_rx_compl in which + * each bit of the actual structure is defined as a byte: used to calculate + * offset/shift/mask of each field */ +struct amap_eth_rx_compl_v0 { + u8 vlan_tag[16]; /* dword 0 */ + u8 pktsize[14]; /* dword 0 */ + u8 port; /* dword 0 */ + u8 ip_opt; /* dword 0 */ + u8 err; /* dword 1 */ + u8 rsshp; /* dword 1 */ + u8 ipf; /* dword 1 */ + u8 tcpf; /* dword 1 */ + u8 udpf; /* dword 1 */ + u8 ipcksm; /* dword 1 */ + u8 l4_cksm; /* dword 1 */ + u8 ip_version; /* dword 1 */ + u8 macdst[6]; /* dword 1 */ + u8 vtp; /* dword 1 */ + u8 ip_frag; /* dword 1 */ + u8 fragndx[10]; /* dword 1 */ + u8 ct[2]; /* dword 1 */ + u8 sw; /* dword 1 */ + u8 numfrags[3]; /* dword 1 */ + u8 rss_flush; /* dword 2 */ + u8 cast_enc[2]; /* dword 2 */ + u8 qnq; /* dword 2 */ + u8 rss_bank; /* dword 2 */ + u8 rsvd1[23]; /* dword 2 */ + u8 lro_pkt; /* dword 2 */ + u8 rsvd2[2]; /* dword 2 */ + u8 valid; /* dword 2 */ + u8 rsshash[32]; /* dword 3 */ +} __packed; + +/* Pseudo amap definition for BE3 native mode eth_rx_compl in which + * each bit of the actual structure is defined as a byte: used to calculate + * offset/shift/mask of each field */ +struct amap_eth_rx_compl_v1 { + u8 vlan_tag[16]; /* dword 0 */ + u8 pktsize[14]; /* dword 0 */ + u8 vtp; /* dword 0 */ + u8 ip_opt; /* dword 0 */ + u8 err; /* dword 1 */ + u8 rsshp; /* dword 1 */ + u8 ipf; /* dword 1 */ + u8 tcpf; /* dword 1 */ + u8 udpf; /* dword 1 */ + u8 ipcksm; /* dword 1 */ + u8 l4_cksm; /* dword 1 */ + u8 ip_version; /* dword 1 */ + u8 macdst[7]; /* dword 1 */ + u8 rsvd0; /* dword 1 */ + u8 fragndx[10]; /* dword 1 */ + u8 ct[2]; /* dword 1 */ + u8 sw; /* dword 1 */ + u8 numfrags[3]; /* dword 1 */ + u8 rss_flush; /* dword 2 */ + u8 cast_enc[2]; /* dword 2 */ + u8 qnq; /* dword 2 */ + u8 rss_bank; /* dword 2 */ + u8 port[2]; /* dword 2 */ + u8 vntagp; /* dword 2 */ + u8 header_len[8]; /* dword 2 */ + u8 header_split[2]; /* dword 2 */ + u8 rsvd1[12]; /* dword 2 */ + u8 tunneled; + u8 valid; /* dword 2 */ + u8 rsshash[32]; /* dword 3 */ +} __packed; + +struct be_eth_rx_compl { + u32 dw[4]; +}; + +struct mgmt_hba_attribs { + u8 flashrom_version_string[32]; + u8 manufacturer_name[32]; + u32 supported_modes; + u32 rsvd0[3]; + u8 ncsi_ver_string[12]; + u32 default_extended_timeout; + u8 controller_model_number[32]; + u8 controller_description[64]; + u8 controller_serial_number[32]; + u8 ip_version_string[32]; + u8 firmware_version_string[32]; + u8 bios_version_string[32]; + u8 redboot_version_string[32]; + u8 driver_version_string[32]; + u8 fw_on_flash_version_string[32]; + u32 functionalities_supported; + u16 max_cdblength; + u8 asic_revision; + u8 generational_guid[16]; + u8 hba_port_count; + u16 default_link_down_timeout; + u8 iscsi_ver_min_max; + u8 multifunction_device; + u8 cache_valid; + u8 hba_status; + u8 max_domains_supported; + u8 phy_port; + u32 firmware_post_status; + u32 hba_mtu[8]; + u32 rsvd1[4]; +}; + +struct mgmt_controller_attrib { + struct mgmt_hba_attribs hba_attribs; + u16 pci_vendor_id; + u16 pci_device_id; + u16 pci_sub_vendor_id; + u16 pci_sub_system_id; + u8 pci_bus_number; + u8 pci_device_number; + u8 pci_function_number; + u8 interface_type; + u64 unique_identifier; + u32 rsvd0[5]; +}; + +struct controller_id { + u32 vendor; + u32 device; + u32 subvendor; + u32 subdevice; +}; + +struct flash_comp { + unsigned long offset; + int optype; + int size; + int img_type; +}; + +struct image_hdr { + u32 imageid; + u32 imageoffset; + u32 imagelength; + u32 image_checksum; + u8 image_version[32]; +}; +struct flash_file_hdr_g2 { + u8 sign[32]; + u32 cksum; + u32 antidote; + struct controller_id cont_id; + u32 file_len; + u32 chunk_num; + u32 total_chunks; + u32 num_imgs; + u8 build[24]; +}; + +struct flash_file_hdr_g3 { + u8 sign[52]; + u8 ufi_version[4]; + u32 file_len; + u32 cksum; + u32 antidote; + u32 num_imgs; + u8 build[24]; + u8 asic_type_rev; + u8 rsvd[31]; +}; + +struct flash_section_hdr { + u32 format_rev; + u32 cksum; + u32 antidote; + u32 num_images; + u8 id_string[128]; + u32 rsvd[4]; +} __packed; + +struct flash_section_hdr_g2 { + u32 format_rev; + u32 cksum; + u32 antidote; + u32 build_num; + u8 id_string[128]; + u32 rsvd[8]; +} __packed; + +struct flash_section_entry { + u32 type; + u32 offset; + u32 pad_size; + u32 image_size; + u32 cksum; + u32 entry_point; + u16 optype; + u16 rsvd0; + u32 rsvd1; + u8 ver_data[32]; +} __packed; + +struct flash_section_info { + u8 cookie[32]; + struct flash_section_hdr fsec_hdr; + struct flash_section_entry fsec_entry[32]; +} __packed; + +struct flash_section_info_g2 { + u8 cookie[32]; + struct flash_section_hdr_g2 fsec_hdr; + struct flash_section_entry fsec_entry[32]; +} __packed; diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c new file mode 100644 index 0000000..597c463 --- /dev/null +++ b/drivers/net/ethernet/emulex/benet/be_main.c @@ -0,0 +1,5211 @@ +/* + * Copyright (C) 2005 - 2014 Emulex + * All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. The full GNU General + * Public License is included in this distribution in the file called COPYING. + * + * Contact Information: + * linux-drivers@emulex.com + * + * Emulex + * 3333 Susan Street + * Costa Mesa, CA 92626 + */ + +#include +#include +#include "be.h" +#include "be_cmds.h" +#include +#include +#include +#include +#include + +MODULE_VERSION(DRV_VER); +MODULE_DEVICE_TABLE(pci, be_dev_ids); +MODULE_DESCRIPTION(DRV_DESC " " DRV_VER); +MODULE_AUTHOR("Emulex Corporation"); +MODULE_LICENSE("GPL"); + +static unsigned int num_vfs; +module_param(num_vfs, uint, S_IRUGO); +MODULE_PARM_DESC(num_vfs, "Number of PCI VFs to initialize"); + +static ushort rx_frag_size = 2048; +module_param(rx_frag_size, ushort, S_IRUGO); +MODULE_PARM_DESC(rx_frag_size, "Size of a fragment that holds rcvd data."); + +static const struct pci_device_id be_dev_ids[] = { + { PCI_DEVICE(BE_VENDOR_ID, BE_DEVICE_ID1) }, + { PCI_DEVICE(BE_VENDOR_ID, BE_DEVICE_ID2) }, + { PCI_DEVICE(BE_VENDOR_ID, OC_DEVICE_ID1) }, + { PCI_DEVICE(BE_VENDOR_ID, OC_DEVICE_ID2) }, + { PCI_DEVICE(EMULEX_VENDOR_ID, OC_DEVICE_ID3)}, + { PCI_DEVICE(EMULEX_VENDOR_ID, OC_DEVICE_ID4)}, + { PCI_DEVICE(EMULEX_VENDOR_ID, OC_DEVICE_ID5)}, + { PCI_DEVICE(EMULEX_VENDOR_ID, OC_DEVICE_ID6)}, + { 0 } +}; +MODULE_DEVICE_TABLE(pci, be_dev_ids); +/* UE Status Low CSR */ +static const char * const ue_status_low_desc[] = { + "CEV", + "CTX", + "DBUF", + "ERX", + "Host", + "MPU", + "NDMA", + "PTC ", + "RDMA ", + "RXF ", + "RXIPS ", + "RXULP0 ", + "RXULP1 ", + "RXULP2 ", + "TIM ", + "TPOST ", + "TPRE ", + "TXIPS ", + "TXULP0 ", + "TXULP1 ", + "UC ", + "WDMA ", + "TXULP2 ", + "HOST1 ", + "P0_OB_LINK ", + "P1_OB_LINK ", + "HOST_GPIO ", + "MBOX ", + "ERX2 ", + "SPARE ", + "JTAG ", + "MPU_INTPEND " +}; + +/* UE Status High CSR */ +static const char * const ue_status_hi_desc[] = { + "LPCMEMHOST", + "MGMT_MAC", + "PCS0ONLINE", + "MPU_IRAM", + "PCS1ONLINE", + "PCTL0", + "PCTL1", + "PMEM", + "RR", + "TXPB", + "RXPP", + "XAUI", + "TXP", + "ARM", + "IPC", + "HOST2", + "HOST3", + "HOST4", + "HOST5", + "HOST6", + "HOST7", + "ECRC", + "Poison TLP", + "NETC", + "PERIPH", + "LLTXULP", + "D2P", + "RCON", + "LDMA", + "LLTXP", + "LLTXPB", + "Unknown" +}; + +static void be_queue_free(struct be_adapter *adapter, struct be_queue_info *q) +{ + struct be_dma_mem *mem = &q->dma_mem; + + if (mem->va) { + dma_free_coherent(&adapter->pdev->dev, mem->size, mem->va, + mem->dma); + mem->va = NULL; + } +} + +static int be_queue_alloc(struct be_adapter *adapter, struct be_queue_info *q, + u16 len, u16 entry_size) +{ + struct be_dma_mem *mem = &q->dma_mem; + + memset(q, 0, sizeof(*q)); + q->len = len; + q->entry_size = entry_size; + mem->size = len * entry_size; + mem->va = dma_zalloc_coherent(&adapter->pdev->dev, mem->size, &mem->dma, + GFP_KERNEL); + if (!mem->va) + return -ENOMEM; + return 0; +} + +static void be_reg_intr_set(struct be_adapter *adapter, bool enable) +{ + u32 reg, enabled; + + pci_read_config_dword(adapter->pdev, PCICFG_MEMBAR_CTRL_INT_CTRL_OFFSET, + ®); + enabled = reg & MEMBAR_CTRL_INT_CTRL_HOSTINTR_MASK; + + if (!enabled && enable) + reg |= MEMBAR_CTRL_INT_CTRL_HOSTINTR_MASK; + else if (enabled && !enable) + reg &= ~MEMBAR_CTRL_INT_CTRL_HOSTINTR_MASK; + else + return; + + pci_write_config_dword(adapter->pdev, + PCICFG_MEMBAR_CTRL_INT_CTRL_OFFSET, reg); +} + +static void be_intr_set(struct be_adapter *adapter, bool enable) +{ + int status = 0; + + /* On lancer interrupts can't be controlled via this register */ + if (lancer_chip(adapter)) + return; + + if (adapter->eeh_error) + return; + + status = be_cmd_intr_set(adapter, enable); + if (status) + be_reg_intr_set(adapter, enable); +} + +static void be_rxq_notify(struct be_adapter *adapter, u16 qid, u16 posted) +{ + u32 val = 0; + + val |= qid & DB_RQ_RING_ID_MASK; + val |= posted << DB_RQ_NUM_POSTED_SHIFT; + + wmb(); + iowrite32(val, adapter->db + DB_RQ_OFFSET); +} + +static void be_txq_notify(struct be_adapter *adapter, struct be_tx_obj *txo, + u16 posted) +{ + u32 val = 0; + + val |= txo->q.id & DB_TXULP_RING_ID_MASK; + val |= (posted & DB_TXULP_NUM_POSTED_MASK) << DB_TXULP_NUM_POSTED_SHIFT; + + wmb(); + iowrite32(val, adapter->db + txo->db_offset); +} + +static void be_eq_notify(struct be_adapter *adapter, u16 qid, + bool arm, bool clear_int, u16 num_popped) +{ + u32 val = 0; + + val |= qid & DB_EQ_RING_ID_MASK; + val |= ((qid & DB_EQ_RING_ID_EXT_MASK) << DB_EQ_RING_ID_EXT_MASK_SHIFT); + + if (adapter->eeh_error) + return; + + if (arm) + val |= 1 << DB_EQ_REARM_SHIFT; + if (clear_int) + val |= 1 << DB_EQ_CLR_SHIFT; + val |= 1 << DB_EQ_EVNT_SHIFT; + val |= num_popped << DB_EQ_NUM_POPPED_SHIFT; + iowrite32(val, adapter->db + DB_EQ_OFFSET); +} + +void be_cq_notify(struct be_adapter *adapter, u16 qid, bool arm, u16 num_popped) +{ + u32 val = 0; + + val |= qid & DB_CQ_RING_ID_MASK; + val |= ((qid & DB_CQ_RING_ID_EXT_MASK) << + DB_CQ_RING_ID_EXT_MASK_SHIFT); + + if (adapter->eeh_error) + return; + + if (arm) + val |= 1 << DB_CQ_REARM_SHIFT; + val |= num_popped << DB_CQ_NUM_POPPED_SHIFT; + iowrite32(val, adapter->db + DB_CQ_OFFSET); +} + +static int be_mac_addr_set(struct net_device *netdev, void *p) +{ + struct be_adapter *adapter = netdev_priv(netdev); + struct device *dev = &adapter->pdev->dev; + struct sockaddr *addr = p; + int status; + u8 mac[ETH_ALEN]; + u32 old_pmac_id = adapter->pmac_id[0], curr_pmac_id = 0; + + if (!is_valid_ether_addr(addr->sa_data)) + return -EADDRNOTAVAIL; + + /* Proceed further only if, User provided MAC is different + * from active MAC + */ + if (ether_addr_equal(addr->sa_data, netdev->dev_addr)) + return 0; + + /* The PMAC_ADD cmd may fail if the VF doesn't have FILTMGMT + * privilege or if PF did not provision the new MAC address. + * On BE3, this cmd will always fail if the VF doesn't have the + * FILTMGMT privilege. This failure is OK, only if the PF programmed + * the MAC for the VF. + */ + status = be_cmd_pmac_add(adapter, (u8 *)addr->sa_data, + adapter->if_handle, &adapter->pmac_id[0], 0); + if (!status) { + curr_pmac_id = adapter->pmac_id[0]; + + /* Delete the old programmed MAC. This call may fail if the + * old MAC was already deleted by the PF driver. + */ + if (adapter->pmac_id[0] != old_pmac_id) + be_cmd_pmac_del(adapter, adapter->if_handle, + old_pmac_id, 0); + } + + /* Decide if the new MAC is successfully activated only after + * querying the FW + */ + status = be_cmd_get_active_mac(adapter, curr_pmac_id, mac, + adapter->if_handle, true, 0); + if (status) + goto err; + + /* The MAC change did not happen, either due to lack of privilege + * or PF didn't pre-provision. + */ + if (!ether_addr_equal(addr->sa_data, mac)) { + status = -EPERM; + goto err; + } + + memcpy(netdev->dev_addr, addr->sa_data, netdev->addr_len); + dev_info(dev, "MAC address changed to %pM\n", mac); + return 0; +err: + dev_warn(dev, "MAC address change to %pM failed\n", addr->sa_data); + return status; +} + +/* BE2 supports only v0 cmd */ +static void *hw_stats_from_cmd(struct be_adapter *adapter) +{ + if (BE2_chip(adapter)) { + struct be_cmd_resp_get_stats_v0 *cmd = adapter->stats_cmd.va; + + return &cmd->hw_stats; + } else if (BE3_chip(adapter)) { + struct be_cmd_resp_get_stats_v1 *cmd = adapter->stats_cmd.va; + + return &cmd->hw_stats; + } else { + struct be_cmd_resp_get_stats_v2 *cmd = adapter->stats_cmd.va; + + return &cmd->hw_stats; + } +} + +/* BE2 supports only v0 cmd */ +static void *be_erx_stats_from_cmd(struct be_adapter *adapter) +{ + if (BE2_chip(adapter)) { + struct be_hw_stats_v0 *hw_stats = hw_stats_from_cmd(adapter); + + return &hw_stats->erx; + } else if (BE3_chip(adapter)) { + struct be_hw_stats_v1 *hw_stats = hw_stats_from_cmd(adapter); + + return &hw_stats->erx; + } else { + struct be_hw_stats_v2 *hw_stats = hw_stats_from_cmd(adapter); + + return &hw_stats->erx; + } +} + +static void populate_be_v0_stats(struct be_adapter *adapter) +{ + struct be_hw_stats_v0 *hw_stats = hw_stats_from_cmd(adapter); + struct be_pmem_stats *pmem_sts = &hw_stats->pmem; + struct be_rxf_stats_v0 *rxf_stats = &hw_stats->rxf; + struct be_port_rxf_stats_v0 *port_stats = + &rxf_stats->port[adapter->port_num]; + struct be_drv_stats *drvs = &adapter->drv_stats; + + be_dws_le_to_cpu(hw_stats, sizeof(*hw_stats)); + drvs->rx_pause_frames = port_stats->rx_pause_frames; + drvs->rx_crc_errors = port_stats->rx_crc_errors; + drvs->rx_control_frames = port_stats->rx_control_frames; + drvs->rx_in_range_errors = port_stats->rx_in_range_errors; + drvs->rx_frame_too_long = port_stats->rx_frame_too_long; + drvs->rx_dropped_runt = port_stats->rx_dropped_runt; + drvs->rx_ip_checksum_errs = port_stats->rx_ip_checksum_errs; + drvs->rx_tcp_checksum_errs = port_stats->rx_tcp_checksum_errs; + drvs->rx_udp_checksum_errs = port_stats->rx_udp_checksum_errs; + drvs->rxpp_fifo_overflow_drop = port_stats->rx_fifo_overflow; + drvs->rx_dropped_tcp_length = port_stats->rx_dropped_tcp_length; + drvs->rx_dropped_too_small = port_stats->rx_dropped_too_small; + drvs->rx_dropped_too_short = port_stats->rx_dropped_too_short; + drvs->rx_out_range_errors = port_stats->rx_out_range_errors; + drvs->rx_input_fifo_overflow_drop = port_stats->rx_input_fifo_overflow; + drvs->rx_dropped_header_too_small = + port_stats->rx_dropped_header_too_small; + drvs->rx_address_filtered = + port_stats->rx_address_filtered + + port_stats->rx_vlan_filtered; + drvs->rx_alignment_symbol_errors = + port_stats->rx_alignment_symbol_errors; + + drvs->tx_pauseframes = port_stats->tx_pauseframes; + drvs->tx_controlframes = port_stats->tx_controlframes; + + if (adapter->port_num) + drvs->jabber_events = rxf_stats->port1_jabber_events; + else + drvs->jabber_events = rxf_stats->port0_jabber_events; + drvs->rx_drops_no_pbuf = rxf_stats->rx_drops_no_pbuf; + drvs->rx_drops_no_erx_descr = rxf_stats->rx_drops_no_erx_descr; + drvs->forwarded_packets = rxf_stats->forwarded_packets; + drvs->rx_drops_mtu = rxf_stats->rx_drops_mtu; + drvs->rx_drops_no_tpre_descr = rxf_stats->rx_drops_no_tpre_descr; + drvs->rx_drops_too_many_frags = rxf_stats->rx_drops_too_many_frags; + adapter->drv_stats.eth_red_drops = pmem_sts->eth_red_drops; +} + +static void populate_be_v1_stats(struct be_adapter *adapter) +{ + struct be_hw_stats_v1 *hw_stats = hw_stats_from_cmd(adapter); + struct be_pmem_stats *pmem_sts = &hw_stats->pmem; + struct be_rxf_stats_v1 *rxf_stats = &hw_stats->rxf; + struct be_port_rxf_stats_v1 *port_stats = + &rxf_stats->port[adapter->port_num]; + struct be_drv_stats *drvs = &adapter->drv_stats; + + be_dws_le_to_cpu(hw_stats, sizeof(*hw_stats)); + drvs->pmem_fifo_overflow_drop = port_stats->pmem_fifo_overflow_drop; + drvs->rx_priority_pause_frames = port_stats->rx_priority_pause_frames; + drvs->rx_pause_frames = port_stats->rx_pause_frames; + drvs->rx_crc_errors = port_stats->rx_crc_errors; + drvs->rx_control_frames = port_stats->rx_control_frames; + drvs->rx_in_range_errors = port_stats->rx_in_range_errors; + drvs->rx_frame_too_long = port_stats->rx_frame_too_long; + drvs->rx_dropped_runt = port_stats->rx_dropped_runt; + drvs->rx_ip_checksum_errs = port_stats->rx_ip_checksum_errs; + drvs->rx_tcp_checksum_errs = port_stats->rx_tcp_checksum_errs; + drvs->rx_udp_checksum_errs = port_stats->rx_udp_checksum_errs; + drvs->rx_dropped_tcp_length = port_stats->rx_dropped_tcp_length; + drvs->rx_dropped_too_small = port_stats->rx_dropped_too_small; + drvs->rx_dropped_too_short = port_stats->rx_dropped_too_short; + drvs->rx_out_range_errors = port_stats->rx_out_range_errors; + drvs->rx_dropped_header_too_small = + port_stats->rx_dropped_header_too_small; + drvs->rx_input_fifo_overflow_drop = + port_stats->rx_input_fifo_overflow_drop; + drvs->rx_address_filtered = port_stats->rx_address_filtered; + drvs->rx_alignment_symbol_errors = + port_stats->rx_alignment_symbol_errors; + drvs->rxpp_fifo_overflow_drop = port_stats->rxpp_fifo_overflow_drop; + drvs->tx_pauseframes = port_stats->tx_pauseframes; + drvs->tx_controlframes = port_stats->tx_controlframes; + drvs->tx_priority_pauseframes = port_stats->tx_priority_pauseframes; + drvs->jabber_events = port_stats->jabber_events; + drvs->rx_drops_no_pbuf = rxf_stats->rx_drops_no_pbuf; + drvs->rx_drops_no_erx_descr = rxf_stats->rx_drops_no_erx_descr; + drvs->forwarded_packets = rxf_stats->forwarded_packets; + drvs->rx_drops_mtu = rxf_stats->rx_drops_mtu; + drvs->rx_drops_no_tpre_descr = rxf_stats->rx_drops_no_tpre_descr; + drvs->rx_drops_too_many_frags = rxf_stats->rx_drops_too_many_frags; + adapter->drv_stats.eth_red_drops = pmem_sts->eth_red_drops; +} + +static void populate_be_v2_stats(struct be_adapter *adapter) +{ + struct be_hw_stats_v2 *hw_stats = hw_stats_from_cmd(adapter); + struct be_pmem_stats *pmem_sts = &hw_stats->pmem; + struct be_rxf_stats_v2 *rxf_stats = &hw_stats->rxf; + struct be_port_rxf_stats_v2 *port_stats = + &rxf_stats->port[adapter->port_num]; + struct be_drv_stats *drvs = &adapter->drv_stats; + + be_dws_le_to_cpu(hw_stats, sizeof(*hw_stats)); + drvs->pmem_fifo_overflow_drop = port_stats->pmem_fifo_overflow_drop; + drvs->rx_priority_pause_frames = port_stats->rx_priority_pause_frames; + drvs->rx_pause_frames = port_stats->rx_pause_frames; + drvs->rx_crc_errors = port_stats->rx_crc_errors; + drvs->rx_control_frames = port_stats->rx_control_frames; + drvs->rx_in_range_errors = port_stats->rx_in_range_errors; + drvs->rx_frame_too_long = port_stats->rx_frame_too_long; + drvs->rx_dropped_runt = port_stats->rx_dropped_runt; + drvs->rx_ip_checksum_errs = port_stats->rx_ip_checksum_errs; + drvs->rx_tcp_checksum_errs = port_stats->rx_tcp_checksum_errs; + drvs->rx_udp_checksum_errs = port_stats->rx_udp_checksum_errs; + drvs->rx_dropped_tcp_length = port_stats->rx_dropped_tcp_length; + drvs->rx_dropped_too_small = port_stats->rx_dropped_too_small; + drvs->rx_dropped_too_short = port_stats->rx_dropped_too_short; + drvs->rx_out_range_errors = port_stats->rx_out_range_errors; + drvs->rx_dropped_header_too_small = + port_stats->rx_dropped_header_too_small; + drvs->rx_input_fifo_overflow_drop = + port_stats->rx_input_fifo_overflow_drop; + drvs->rx_address_filtered = port_stats->rx_address_filtered; + drvs->rx_alignment_symbol_errors = + port_stats->rx_alignment_symbol_errors; + drvs->rxpp_fifo_overflow_drop = port_stats->rxpp_fifo_overflow_drop; + drvs->tx_pauseframes = port_stats->tx_pauseframes; + drvs->tx_controlframes = port_stats->tx_controlframes; + drvs->tx_priority_pauseframes = port_stats->tx_priority_pauseframes; + drvs->jabber_events = port_stats->jabber_events; + drvs->rx_drops_no_pbuf = rxf_stats->rx_drops_no_pbuf; + drvs->rx_drops_no_erx_descr = rxf_stats->rx_drops_no_erx_descr; + drvs->forwarded_packets = rxf_stats->forwarded_packets; + drvs->rx_drops_mtu = rxf_stats->rx_drops_mtu; + drvs->rx_drops_no_tpre_descr = rxf_stats->rx_drops_no_tpre_descr; + drvs->rx_drops_too_many_frags = rxf_stats->rx_drops_too_many_frags; + adapter->drv_stats.eth_red_drops = pmem_sts->eth_red_drops; + if (be_roce_supported(adapter)) { + drvs->rx_roce_bytes_lsd = port_stats->roce_bytes_received_lsd; + drvs->rx_roce_bytes_msd = port_stats->roce_bytes_received_msd; + drvs->rx_roce_frames = port_stats->roce_frames_received; + drvs->roce_drops_crc = port_stats->roce_drops_crc; + drvs->roce_drops_payload_len = + port_stats->roce_drops_payload_len; + } +} + +static void populate_lancer_stats(struct be_adapter *adapter) +{ + struct be_drv_stats *drvs = &adapter->drv_stats; + struct lancer_pport_stats *pport_stats = pport_stats_from_cmd(adapter); + + be_dws_le_to_cpu(pport_stats, sizeof(*pport_stats)); + drvs->rx_pause_frames = pport_stats->rx_pause_frames_lo; + drvs->rx_crc_errors = pport_stats->rx_crc_errors_lo; + drvs->rx_control_frames = pport_stats->rx_control_frames_lo; + drvs->rx_in_range_errors = pport_stats->rx_in_range_errors; + drvs->rx_frame_too_long = pport_stats->rx_frames_too_long_lo; + drvs->rx_dropped_runt = pport_stats->rx_dropped_runt; + drvs->rx_ip_checksum_errs = pport_stats->rx_ip_checksum_errors; + drvs->rx_tcp_checksum_errs = pport_stats->rx_tcp_checksum_errors; + drvs->rx_udp_checksum_errs = pport_stats->rx_udp_checksum_errors; + drvs->rx_dropped_tcp_length = + pport_stats->rx_dropped_invalid_tcp_length; + drvs->rx_dropped_too_small = pport_stats->rx_dropped_too_small; + drvs->rx_dropped_too_short = pport_stats->rx_dropped_too_short; + drvs->rx_out_range_errors = pport_stats->rx_out_of_range_errors; + drvs->rx_dropped_header_too_small = + pport_stats->rx_dropped_header_too_small; + drvs->rx_input_fifo_overflow_drop = pport_stats->rx_fifo_overflow; + drvs->rx_address_filtered = + pport_stats->rx_address_filtered + + pport_stats->rx_vlan_filtered; + drvs->rx_alignment_symbol_errors = pport_stats->rx_symbol_errors_lo; + drvs->rxpp_fifo_overflow_drop = pport_stats->rx_fifo_overflow; + drvs->tx_pauseframes = pport_stats->tx_pause_frames_lo; + drvs->tx_controlframes = pport_stats->tx_control_frames_lo; + drvs->jabber_events = pport_stats->rx_jabbers; + drvs->forwarded_packets = pport_stats->num_forwards_lo; + drvs->rx_drops_mtu = pport_stats->rx_drops_mtu_lo; + drvs->rx_drops_too_many_frags = + pport_stats->rx_drops_too_many_frags_lo; +} + +static void accumulate_16bit_val(u32 *acc, u16 val) +{ +#define lo(x) (x & 0xFFFF) +#define hi(x) (x & 0xFFFF0000) + bool wrapped = val < lo(*acc); + u32 newacc = hi(*acc) + val; + + if (wrapped) + newacc += 65536; + ACCESS_ONCE(*acc) = newacc; +} + +static void populate_erx_stats(struct be_adapter *adapter, + struct be_rx_obj *rxo, u32 erx_stat) +{ + if (!BEx_chip(adapter)) + rx_stats(rxo)->rx_drops_no_frags = erx_stat; + else + /* below erx HW counter can actually wrap around after + * 65535. Driver accumulates a 32-bit value + */ + accumulate_16bit_val(&rx_stats(rxo)->rx_drops_no_frags, + (u16)erx_stat); +} + +void be_parse_stats(struct be_adapter *adapter) +{ + struct be_erx_stats_v2 *erx = be_erx_stats_from_cmd(adapter); + struct be_rx_obj *rxo; + int i; + u32 erx_stat; + + if (lancer_chip(adapter)) { + populate_lancer_stats(adapter); + } else { + if (BE2_chip(adapter)) + populate_be_v0_stats(adapter); + else if (BE3_chip(adapter)) + /* for BE3 */ + populate_be_v1_stats(adapter); + else + populate_be_v2_stats(adapter); + + /* erx_v2 is longer than v0, v1. use v2 for v0, v1 access */ + for_all_rx_queues(adapter, rxo, i) { + erx_stat = erx->rx_drops_no_fragments[rxo->q.id]; + populate_erx_stats(adapter, rxo, erx_stat); + } + } +} + +static struct rtnl_link_stats64 *be_get_stats64(struct net_device *netdev, + struct rtnl_link_stats64 *stats) +{ + struct be_adapter *adapter = netdev_priv(netdev); + struct be_drv_stats *drvs = &adapter->drv_stats; + struct be_rx_obj *rxo; + struct be_tx_obj *txo; + u64 pkts, bytes; + unsigned int start; + int i; + + for_all_rx_queues(adapter, rxo, i) { + const struct be_rx_stats *rx_stats = rx_stats(rxo); + + do { + start = u64_stats_fetch_begin_irq(&rx_stats->sync); + pkts = rx_stats(rxo)->rx_pkts; + bytes = rx_stats(rxo)->rx_bytes; + } while (u64_stats_fetch_retry_irq(&rx_stats->sync, start)); + stats->rx_packets += pkts; + stats->rx_bytes += bytes; + stats->multicast += rx_stats(rxo)->rx_mcast_pkts; + stats->rx_dropped += rx_stats(rxo)->rx_drops_no_skbs + + rx_stats(rxo)->rx_drops_no_frags; + } + + for_all_tx_queues(adapter, txo, i) { + const struct be_tx_stats *tx_stats = tx_stats(txo); + + do { + start = u64_stats_fetch_begin_irq(&tx_stats->sync); + pkts = tx_stats(txo)->tx_pkts; + bytes = tx_stats(txo)->tx_bytes; + } while (u64_stats_fetch_retry_irq(&tx_stats->sync, start)); + stats->tx_packets += pkts; + stats->tx_bytes += bytes; + } + + /* bad pkts received */ + stats->rx_errors = drvs->rx_crc_errors + + drvs->rx_alignment_symbol_errors + + drvs->rx_in_range_errors + + drvs->rx_out_range_errors + + drvs->rx_frame_too_long + + drvs->rx_dropped_too_small + + drvs->rx_dropped_too_short + + drvs->rx_dropped_header_too_small + + drvs->rx_dropped_tcp_length + + drvs->rx_dropped_runt; + + /* detailed rx errors */ + stats->rx_length_errors = drvs->rx_in_range_errors + + drvs->rx_out_range_errors + + drvs->rx_frame_too_long; + + stats->rx_crc_errors = drvs->rx_crc_errors; + + /* frame alignment errors */ + stats->rx_frame_errors = drvs->rx_alignment_symbol_errors; + + /* receiver fifo overrun */ + /* drops_no_pbuf is no per i/f, it's per BE card */ + stats->rx_fifo_errors = drvs->rxpp_fifo_overflow_drop + + drvs->rx_input_fifo_overflow_drop + + drvs->rx_drops_no_pbuf; + return stats; +} + +void be_link_status_update(struct be_adapter *adapter, u8 link_status) +{ + struct net_device *netdev = adapter->netdev; + + if (!(adapter->flags & BE_FLAGS_LINK_STATUS_INIT)) { + netif_carrier_off(netdev); + adapter->flags |= BE_FLAGS_LINK_STATUS_INIT; + } + + if (link_status) + netif_carrier_on(netdev); + else + netif_carrier_off(netdev); +} + +static void be_tx_stats_update(struct be_tx_obj *txo, + u32 wrb_cnt, u32 copied, u32 gso_segs, + bool stopped) +{ + struct be_tx_stats *stats = tx_stats(txo); + + u64_stats_update_begin(&stats->sync); + stats->tx_reqs++; + stats->tx_wrbs += wrb_cnt; + stats->tx_bytes += copied; + stats->tx_pkts += (gso_segs ? gso_segs : 1); + if (stopped) + stats->tx_stops++; + u64_stats_update_end(&stats->sync); +} + +/* Determine number of WRB entries needed to xmit data in an skb */ +static u32 wrb_cnt_for_skb(struct be_adapter *adapter, struct sk_buff *skb, + bool *dummy) +{ + int cnt = (skb->len > skb->data_len); + + cnt += skb_shinfo(skb)->nr_frags; + + /* to account for hdr wrb */ + cnt++; + if (lancer_chip(adapter) || !(cnt & 1)) { + *dummy = false; + } else { + /* add a dummy to make it an even num */ + cnt++; + *dummy = true; + } + BUG_ON(cnt > BE_MAX_TX_FRAG_COUNT); + return cnt; +} + +static inline void wrb_fill(struct be_eth_wrb *wrb, u64 addr, int len) +{ + wrb->frag_pa_hi = upper_32_bits(addr); + wrb->frag_pa_lo = addr & 0xFFFFFFFF; + wrb->frag_len = len & ETH_WRB_FRAG_LEN_MASK; + wrb->rsvd0 = 0; +} + +static inline u16 be_get_tx_vlan_tag(struct be_adapter *adapter, + struct sk_buff *skb) +{ + u8 vlan_prio; + u16 vlan_tag; + + vlan_tag = vlan_tx_tag_get(skb); + vlan_prio = (vlan_tag & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT; + /* If vlan priority provided by OS is NOT in available bmap */ + if (!(adapter->vlan_prio_bmap & (1 << vlan_prio))) + vlan_tag = (vlan_tag & ~VLAN_PRIO_MASK) | + adapter->recommended_prio; + + return vlan_tag; +} + +/* Used only for IP tunnel packets */ +static u16 skb_inner_ip_proto(struct sk_buff *skb) +{ + return (inner_ip_hdr(skb)->version == 4) ? + inner_ip_hdr(skb)->protocol : inner_ipv6_hdr(skb)->nexthdr; +} + +static u16 skb_ip_proto(struct sk_buff *skb) +{ + return (ip_hdr(skb)->version == 4) ? + ip_hdr(skb)->protocol : ipv6_hdr(skb)->nexthdr; +} + +static void wrb_fill_hdr(struct be_adapter *adapter, struct be_eth_hdr_wrb *hdr, + struct sk_buff *skb, u32 wrb_cnt, u32 len, + bool skip_hw_vlan) +{ + u16 vlan_tag, proto; + + memset(hdr, 0, sizeof(*hdr)); + + SET_TX_WRB_HDR_BITS(crc, hdr, 1); + + if (skb_is_gso(skb)) { + SET_TX_WRB_HDR_BITS(lso, hdr, 1); + SET_TX_WRB_HDR_BITS(lso_mss, hdr, skb_shinfo(skb)->gso_size); + if (skb_is_gso_v6(skb) && !lancer_chip(adapter)) + SET_TX_WRB_HDR_BITS(lso6, hdr, 1); + } else if (skb->ip_summed == CHECKSUM_PARTIAL) { + if (skb->encapsulation) { + SET_TX_WRB_HDR_BITS(ipcs, hdr, 1); + proto = skb_inner_ip_proto(skb); + } else { + proto = skb_ip_proto(skb); + } + if (proto == IPPROTO_TCP) + SET_TX_WRB_HDR_BITS(tcpcs, hdr, 1); + else if (proto == IPPROTO_UDP) + SET_TX_WRB_HDR_BITS(udpcs, hdr, 1); + } + + if (vlan_tx_tag_present(skb)) { + SET_TX_WRB_HDR_BITS(vlan, hdr, 1); + vlan_tag = be_get_tx_vlan_tag(adapter, skb); + SET_TX_WRB_HDR_BITS(vlan_tag, hdr, vlan_tag); + } + + /* To skip HW VLAN tagging: evt = 1, compl = 0 */ + SET_TX_WRB_HDR_BITS(complete, hdr, !skip_hw_vlan); + SET_TX_WRB_HDR_BITS(event, hdr, 1); + SET_TX_WRB_HDR_BITS(num_wrb, hdr, wrb_cnt); + SET_TX_WRB_HDR_BITS(len, hdr, len); +} + +static void unmap_tx_frag(struct device *dev, struct be_eth_wrb *wrb, + bool unmap_single) +{ + dma_addr_t dma; + + be_dws_le_to_cpu(wrb, sizeof(*wrb)); + + dma = (u64)wrb->frag_pa_hi << 32 | (u64)wrb->frag_pa_lo; + if (wrb->frag_len) { + if (unmap_single) + dma_unmap_single(dev, dma, wrb->frag_len, + DMA_TO_DEVICE); + else + dma_unmap_page(dev, dma, wrb->frag_len, DMA_TO_DEVICE); + } +} + +static int make_tx_wrbs(struct be_adapter *adapter, struct be_queue_info *txq, + struct sk_buff *skb, u32 wrb_cnt, bool dummy_wrb, + bool skip_hw_vlan) +{ + dma_addr_t busaddr; + int i, copied = 0; + struct device *dev = &adapter->pdev->dev; + struct sk_buff *first_skb = skb; + struct be_eth_wrb *wrb; + struct be_eth_hdr_wrb *hdr; + bool map_single = false; + u16 map_head; + + hdr = queue_head_node(txq); + queue_head_inc(txq); + map_head = txq->head; + + if (skb->len > skb->data_len) { + int len = skb_headlen(skb); + + busaddr = dma_map_single(dev, skb->data, len, DMA_TO_DEVICE); + if (dma_mapping_error(dev, busaddr)) + goto dma_err; + map_single = true; + wrb = queue_head_node(txq); + wrb_fill(wrb, busaddr, len); + be_dws_cpu_to_le(wrb, sizeof(*wrb)); + queue_head_inc(txq); + copied += len; + } + + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + const struct skb_frag_struct *frag = &skb_shinfo(skb)->frags[i]; + + busaddr = skb_frag_dma_map(dev, frag, 0, + skb_frag_size(frag), DMA_TO_DEVICE); + if (dma_mapping_error(dev, busaddr)) + goto dma_err; + wrb = queue_head_node(txq); + wrb_fill(wrb, busaddr, skb_frag_size(frag)); + be_dws_cpu_to_le(wrb, sizeof(*wrb)); + queue_head_inc(txq); + copied += skb_frag_size(frag); + } + + if (dummy_wrb) { + wrb = queue_head_node(txq); + wrb_fill(wrb, 0, 0); + be_dws_cpu_to_le(wrb, sizeof(*wrb)); + queue_head_inc(txq); + } + + wrb_fill_hdr(adapter, hdr, first_skb, wrb_cnt, copied, skip_hw_vlan); + be_dws_cpu_to_le(hdr, sizeof(*hdr)); + + return copied; +dma_err: + txq->head = map_head; + while (copied) { + wrb = queue_head_node(txq); + unmap_tx_frag(dev, wrb, map_single); + map_single = false; + copied -= wrb->frag_len; + adapter->drv_stats.dma_map_errors++; + queue_head_inc(txq); + } + return 0; +} + +static struct sk_buff *be_insert_vlan_in_pkt(struct be_adapter *adapter, + struct sk_buff *skb, + bool *skip_hw_vlan) +{ + u16 vlan_tag = 0; + + skb = skb_share_check(skb, GFP_ATOMIC); + if (unlikely(!skb)) + return skb; + + if (vlan_tx_tag_present(skb)) + vlan_tag = be_get_tx_vlan_tag(adapter, skb); + + if (qnq_async_evt_rcvd(adapter) && adapter->pvid) { + if (!vlan_tag) + vlan_tag = adapter->pvid; + /* f/w workaround to set skip_hw_vlan = 1, informs the F/W to + * skip VLAN insertion + */ + if (skip_hw_vlan) + *skip_hw_vlan = true; + } + + if (vlan_tag) { + skb = __vlan_put_tag(skb, htons(ETH_P_8021Q), vlan_tag); + if (unlikely(!skb)) + return skb; + skb->vlan_tci = 0; + } + + /* Insert the outer VLAN, if any */ + if (adapter->qnq_vid) { + vlan_tag = adapter->qnq_vid; + skb = __vlan_put_tag(skb, htons(ETH_P_8021Q), vlan_tag); + if (unlikely(!skb)) + return skb; + if (skip_hw_vlan) + *skip_hw_vlan = true; + } + + return skb; +} + +static bool be_ipv6_exthdr_check(struct sk_buff *skb) +{ + struct ethhdr *eh = (struct ethhdr *)skb->data; + u16 offset = ETH_HLEN; + + if (eh->h_proto == htons(ETH_P_IPV6)) { + struct ipv6hdr *ip6h = (struct ipv6hdr *)(skb->data + offset); + + offset += sizeof(struct ipv6hdr); + if (ip6h->nexthdr != NEXTHDR_TCP && + ip6h->nexthdr != NEXTHDR_UDP) { + struct ipv6_opt_hdr *ehdr = + (struct ipv6_opt_hdr *)(skb->data + offset); + + /* offending pkt: 2nd byte following IPv6 hdr is 0xff */ + if (ehdr->hdrlen == 0xff) + return true; + } + } + return false; +} + +static int be_vlan_tag_tx_chk(struct be_adapter *adapter, struct sk_buff *skb) +{ + return vlan_tx_tag_present(skb) || adapter->pvid || adapter->qnq_vid; +} + +static int be_ipv6_tx_stall_chk(struct be_adapter *adapter, struct sk_buff *skb) +{ + return BE3_chip(adapter) && be_ipv6_exthdr_check(skb); +} + +static struct sk_buff *be_lancer_xmit_workarounds(struct be_adapter *adapter, + struct sk_buff *skb, + bool *skip_hw_vlan) +{ + struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data; + unsigned int eth_hdr_len; + struct iphdr *ip; + + /* For padded packets, BE HW modifies tot_len field in IP header + * incorrecly when VLAN tag is inserted by HW. + * For padded packets, Lancer computes incorrect checksum. + */ + eth_hdr_len = ntohs(skb->protocol) == ETH_P_8021Q ? + VLAN_ETH_HLEN : ETH_HLEN; + if (skb->len <= 60 && + (lancer_chip(adapter) || vlan_tx_tag_present(skb)) && + is_ipv4_pkt(skb)) { + ip = (struct iphdr *)ip_hdr(skb); + pskb_trim(skb, eth_hdr_len + ntohs(ip->tot_len)); + } + + /* If vlan tag is already inlined in the packet, skip HW VLAN + * tagging in pvid-tagging mode + */ + if (be_pvid_tagging_enabled(adapter) && + veh->h_vlan_proto == htons(ETH_P_8021Q)) + *skip_hw_vlan = true; + + /* HW has a bug wherein it will calculate CSUM for VLAN + * pkts even though it is disabled. + * Manually insert VLAN in pkt. + */ + if (skb->ip_summed != CHECKSUM_PARTIAL && + vlan_tx_tag_present(skb)) { + skb = be_insert_vlan_in_pkt(adapter, skb, skip_hw_vlan); + if (unlikely(!skb)) + goto err; + } + + /* HW may lockup when VLAN HW tagging is requested on + * certain ipv6 packets. Drop such pkts if the HW workaround to + * skip HW tagging is not enabled by FW. + */ + if (unlikely(be_ipv6_tx_stall_chk(adapter, skb) && + (adapter->pvid || adapter->qnq_vid) && + !qnq_async_evt_rcvd(adapter))) + goto tx_drop; + + /* Manual VLAN tag insertion to prevent: + * ASIC lockup when the ASIC inserts VLAN tag into + * certain ipv6 packets. Insert VLAN tags in driver, + * and set event, completion, vlan bits accordingly + * in the Tx WRB. + */ + if (be_ipv6_tx_stall_chk(adapter, skb) && + be_vlan_tag_tx_chk(adapter, skb)) { + skb = be_insert_vlan_in_pkt(adapter, skb, skip_hw_vlan); + if (unlikely(!skb)) + goto err; + } + + return skb; +tx_drop: + dev_kfree_skb_any(skb); +err: + return NULL; +} + +static struct sk_buff *be_xmit_workarounds(struct be_adapter *adapter, + struct sk_buff *skb, + bool *skip_hw_vlan) +{ + /* Lancer, SH-R ASICs have a bug wherein Packets that are 32 bytes or + * less may cause a transmit stall on that port. So the work-around is + * to pad short packets (<= 32 bytes) to a 36-byte length. + */ + if (unlikely(!BEx_chip(adapter) && skb->len <= 32)) { + if (skb_padto(skb, 36)) + return NULL; + skb->len = 36; + } + + if (BEx_chip(adapter) || lancer_chip(adapter)) { + skb = be_lancer_xmit_workarounds(adapter, skb, skip_hw_vlan); + if (!skb) + return NULL; + } + + return skb; +} + +static netdev_tx_t be_xmit(struct sk_buff *skb, struct net_device *netdev) +{ + struct be_adapter *adapter = netdev_priv(netdev); + struct be_tx_obj *txo = &adapter->tx_obj[skb_get_queue_mapping(skb)]; + struct be_queue_info *txq = &txo->q; + bool dummy_wrb, stopped = false; + u32 wrb_cnt = 0, copied = 0; + bool skip_hw_vlan = false; + u32 start = txq->head; + + skb = be_xmit_workarounds(adapter, skb, &skip_hw_vlan); + if (!skb) { + tx_stats(txo)->tx_drv_drops++; + return NETDEV_TX_OK; + } + + wrb_cnt = wrb_cnt_for_skb(adapter, skb, &dummy_wrb); + + copied = make_tx_wrbs(adapter, txq, skb, wrb_cnt, dummy_wrb, + skip_hw_vlan); + if (copied) { + int gso_segs = skb_shinfo(skb)->gso_segs; + + /* record the sent skb in the sent_skb table */ + BUG_ON(txo->sent_skb_list[start]); + txo->sent_skb_list[start] = skb; + + /* Ensure txq has space for the next skb; Else stop the queue + * *BEFORE* ringing the tx doorbell, so that we serialze the + * tx compls of the current transmit which'll wake up the queue + */ + atomic_add(wrb_cnt, &txq->used); + if ((BE_MAX_TX_FRAG_COUNT + atomic_read(&txq->used)) >= + txq->len) { + netif_stop_subqueue(netdev, skb_get_queue_mapping(skb)); + stopped = true; + } + + be_txq_notify(adapter, txo, wrb_cnt); + + be_tx_stats_update(txo, wrb_cnt, copied, gso_segs, stopped); + } else { + txq->head = start; + tx_stats(txo)->tx_drv_drops++; + dev_kfree_skb_any(skb); + } + return NETDEV_TX_OK; +} + +static int be_change_mtu(struct net_device *netdev, int new_mtu) +{ + struct be_adapter *adapter = netdev_priv(netdev); + struct device *dev = &adapter->pdev->dev; + + if (new_mtu < BE_MIN_MTU || new_mtu > BE_MAX_MTU) { + dev_info(dev, "MTU must be between %d and %d bytes\n", + BE_MIN_MTU, BE_MAX_MTU); + return -EINVAL; + } + + dev_info(dev, "MTU changed from %d to %d bytes\n", + netdev->mtu, new_mtu); + netdev->mtu = new_mtu; + return 0; +} + +/* + * A max of 64 (BE_NUM_VLANS_SUPPORTED) vlans can be configured in BE. + * If the user configures more, place BE in vlan promiscuous mode. + */ +static int be_vid_config(struct be_adapter *adapter) +{ + struct device *dev = &adapter->pdev->dev; + u16 vids[BE_NUM_VLANS_SUPPORTED]; + u16 num = 0, i = 0; + int status = 0; + + /* No need to further configure vids if in promiscuous mode */ + if (adapter->promiscuous) + return 0; + + if (adapter->vlans_added > be_max_vlans(adapter)) + goto set_vlan_promisc; + + /* Construct VLAN Table to give to HW */ + for_each_set_bit(i, adapter->vids, VLAN_N_VID) + vids[num++] = cpu_to_le16(i); + + status = be_cmd_vlan_config(adapter, adapter->if_handle, vids, num); + if (status) { + /* Set to VLAN promisc mode as setting VLAN filter failed */ + if (addl_status(status) == + MCC_ADDL_STATUS_INSUFFICIENT_RESOURCES) + goto set_vlan_promisc; + dev_err(dev, "Setting HW VLAN filtering failed\n"); + } else { + if (adapter->flags & BE_FLAGS_VLAN_PROMISC) { + /* hw VLAN filtering re-enabled. */ + status = be_cmd_rx_filter(adapter, + BE_FLAGS_VLAN_PROMISC, OFF); + if (!status) { + dev_info(dev, + "Disabling VLAN Promiscuous mode\n"); + adapter->flags &= ~BE_FLAGS_VLAN_PROMISC; + } + } + } + + return status; + +set_vlan_promisc: + if (adapter->flags & BE_FLAGS_VLAN_PROMISC) + return 0; + + status = be_cmd_rx_filter(adapter, BE_FLAGS_VLAN_PROMISC, ON); + if (!status) { + dev_info(dev, "Enable VLAN Promiscuous mode\n"); + adapter->flags |= BE_FLAGS_VLAN_PROMISC; + } else + dev_err(dev, "Failed to enable VLAN Promiscuous mode\n"); + return status; +} + +static int be_vlan_add_vid(struct net_device *netdev, __be16 proto, u16 vid) +{ + struct be_adapter *adapter = netdev_priv(netdev); + int status = 0; + + /* Packets with VID 0 are always received by Lancer by default */ + if (lancer_chip(adapter) && vid == 0) + return status; + + if (test_bit(vid, adapter->vids)) + return status; + + set_bit(vid, adapter->vids); + adapter->vlans_added++; + + status = be_vid_config(adapter); + if (status) { + adapter->vlans_added--; + clear_bit(vid, adapter->vids); + } + + return status; +} + +static int be_vlan_rem_vid(struct net_device *netdev, __be16 proto, u16 vid) +{ + struct be_adapter *adapter = netdev_priv(netdev); + + /* Packets with VID 0 are always received by Lancer by default */ + if (lancer_chip(adapter) && vid == 0) + return 0; + + clear_bit(vid, adapter->vids); + adapter->vlans_added--; + + return be_vid_config(adapter); +} + +static void be_clear_promisc(struct be_adapter *adapter) +{ + adapter->promiscuous = false; + adapter->flags &= ~(BE_FLAGS_VLAN_PROMISC | BE_FLAGS_MCAST_PROMISC); + + be_cmd_rx_filter(adapter, IFF_PROMISC, OFF); +} + +static void be_set_rx_mode(struct net_device *netdev) +{ + struct be_adapter *adapter = netdev_priv(netdev); + int status; + + if (netdev->flags & IFF_PROMISC) { + be_cmd_rx_filter(adapter, IFF_PROMISC, ON); + adapter->promiscuous = true; + goto done; + } + + /* BE was previously in promiscuous mode; disable it */ + if (adapter->promiscuous) { + be_clear_promisc(adapter); + if (adapter->vlans_added) + be_vid_config(adapter); + } + + /* Enable multicast promisc if num configured exceeds what we support */ + if (netdev->flags & IFF_ALLMULTI || + netdev_mc_count(netdev) > be_max_mc(adapter)) + goto set_mcast_promisc; + + if (netdev_uc_count(netdev) != adapter->uc_macs) { + struct netdev_hw_addr *ha; + int i = 1; /* First slot is claimed by the Primary MAC */ + + for (; adapter->uc_macs > 0; adapter->uc_macs--, i++) { + be_cmd_pmac_del(adapter, adapter->if_handle, + adapter->pmac_id[i], 0); + } + + if (netdev_uc_count(netdev) > be_max_uc(adapter)) { + be_cmd_rx_filter(adapter, IFF_PROMISC, ON); + adapter->promiscuous = true; + goto done; + } + + netdev_for_each_uc_addr(ha, adapter->netdev) { + adapter->uc_macs++; /* First slot is for Primary MAC */ + be_cmd_pmac_add(adapter, (u8 *)ha->addr, + adapter->if_handle, + &adapter->pmac_id[adapter->uc_macs], 0); + } + } + + status = be_cmd_rx_filter(adapter, IFF_MULTICAST, ON); + if (!status) { + if (adapter->flags & BE_FLAGS_MCAST_PROMISC) + adapter->flags &= ~BE_FLAGS_MCAST_PROMISC; + goto done; + } + +set_mcast_promisc: + if (adapter->flags & BE_FLAGS_MCAST_PROMISC) + return; + + /* Set to MCAST promisc mode if setting MULTICAST address fails + * or if num configured exceeds what we support + */ + status = be_cmd_rx_filter(adapter, IFF_ALLMULTI, ON); + if (!status) + adapter->flags |= BE_FLAGS_MCAST_PROMISC; +done: + return; +} + +static int be_set_vf_mac(struct net_device *netdev, int vf, u8 *mac) +{ + struct be_adapter *adapter = netdev_priv(netdev); + struct be_vf_cfg *vf_cfg = &adapter->vf_cfg[vf]; + int status; + + if (!sriov_enabled(adapter)) + return -EPERM; + + if (!is_valid_ether_addr(mac) || vf >= adapter->num_vfs) + return -EINVAL; + + /* Proceed further only if user provided MAC is different + * from active MAC + */ + if (ether_addr_equal(mac, vf_cfg->mac_addr)) + return 0; + + if (BEx_chip(adapter)) { + be_cmd_pmac_del(adapter, vf_cfg->if_handle, vf_cfg->pmac_id, + vf + 1); + + status = be_cmd_pmac_add(adapter, mac, vf_cfg->if_handle, + &vf_cfg->pmac_id, vf + 1); + } else { + status = be_cmd_set_mac(adapter, mac, vf_cfg->if_handle, + vf + 1); + } + + if (status) { + dev_err(&adapter->pdev->dev, "MAC %pM set on VF %d Failed: %#x", + mac, vf, status); + return be_cmd_status(status); + } + + ether_addr_copy(vf_cfg->mac_addr, mac); + + return 0; +} + +static int be_get_vf_config(struct net_device *netdev, int vf, + struct ifla_vf_info *vi) +{ + struct be_adapter *adapter = netdev_priv(netdev); + struct be_vf_cfg *vf_cfg = &adapter->vf_cfg[vf]; + + if (!sriov_enabled(adapter)) + return -EPERM; + + if (vf >= adapter->num_vfs) + return -EINVAL; + + vi->vf = vf; + vi->max_tx_rate = vf_cfg->tx_rate; + vi->min_tx_rate = 0; + vi->vlan = vf_cfg->vlan_tag & VLAN_VID_MASK; + vi->qos = vf_cfg->vlan_tag >> VLAN_PRIO_SHIFT; + memcpy(&vi->mac, vf_cfg->mac_addr, ETH_ALEN); + vi->linkstate = adapter->vf_cfg[vf].plink_tracking; + + return 0; +} + +static int be_set_vf_vlan(struct net_device *netdev, int vf, u16 vlan, u8 qos) +{ + struct be_adapter *adapter = netdev_priv(netdev); + struct be_vf_cfg *vf_cfg = &adapter->vf_cfg[vf]; + int status = 0; + + if (!sriov_enabled(adapter)) + return -EPERM; + + if (vf >= adapter->num_vfs || vlan > 4095 || qos > 7) + return -EINVAL; + + if (vlan || qos) { + vlan |= qos << VLAN_PRIO_SHIFT; + if (vf_cfg->vlan_tag != vlan) + status = be_cmd_set_hsw_config(adapter, vlan, vf + 1, + vf_cfg->if_handle, 0); + } else { + /* Reset Transparent Vlan Tagging. */ + status = be_cmd_set_hsw_config(adapter, BE_RESET_VLAN_TAG_ID, + vf + 1, vf_cfg->if_handle, 0); + } + + if (status) { + dev_err(&adapter->pdev->dev, + "VLAN %d config on VF %d failed : %#x\n", vlan, + vf, status); + return be_cmd_status(status); + } + + vf_cfg->vlan_tag = vlan; + + return 0; +} + +static int be_set_vf_tx_rate(struct net_device *netdev, int vf, + int min_tx_rate, int max_tx_rate) +{ + struct be_adapter *adapter = netdev_priv(netdev); + struct device *dev = &adapter->pdev->dev; + int percent_rate, status = 0; + u16 link_speed = 0; + u8 link_status; + + if (!sriov_enabled(adapter)) + return -EPERM; + + if (vf >= adapter->num_vfs) + return -EINVAL; + + if (min_tx_rate) + return -EINVAL; + + if (!max_tx_rate) + goto config_qos; + + status = be_cmd_link_status_query(adapter, &link_speed, + &link_status, 0); + if (status) + goto err; + + if (!link_status) { + dev_err(dev, "TX-rate setting not allowed when link is down\n"); + status = -ENETDOWN; + goto err; + } + + if (max_tx_rate < 100 || max_tx_rate > link_speed) { + dev_err(dev, "TX-rate must be between 100 and %d Mbps\n", + link_speed); + status = -EINVAL; + goto err; + } + + /* On Skyhawk the QOS setting must be done only as a % value */ + percent_rate = link_speed / 100; + if (skyhawk_chip(adapter) && (max_tx_rate % percent_rate)) { + dev_err(dev, "TX-rate must be a multiple of %d Mbps\n", + percent_rate); + status = -EINVAL; + goto err; + } + +config_qos: + status = be_cmd_config_qos(adapter, max_tx_rate, link_speed, vf + 1); + if (status) + goto err; + + adapter->vf_cfg[vf].tx_rate = max_tx_rate; + return 0; + +err: + dev_err(dev, "TX-rate setting of %dMbps on VF%d failed\n", + max_tx_rate, vf); + return be_cmd_status(status); +} + +static int be_set_vf_link_state(struct net_device *netdev, int vf, + int link_state) +{ + struct be_adapter *adapter = netdev_priv(netdev); + int status; + + if (!sriov_enabled(adapter)) + return -EPERM; + + if (vf >= adapter->num_vfs) + return -EINVAL; + + status = be_cmd_set_logical_link_config(adapter, link_state, vf+1); + if (status) { + dev_err(&adapter->pdev->dev, + "Link state change on VF %d failed: %#x\n", vf, status); + return be_cmd_status(status); + } + + adapter->vf_cfg[vf].plink_tracking = link_state; + + return 0; +} + +static void be_aic_update(struct be_aic_obj *aic, u64 rx_pkts, u64 tx_pkts, + ulong now) +{ + aic->rx_pkts_prev = rx_pkts; + aic->tx_reqs_prev = tx_pkts; + aic->jiffies = now; +} + +static void be_eqd_update(struct be_adapter *adapter) +{ + struct be_set_eqd set_eqd[MAX_EVT_QS]; + int eqd, i, num = 0, start; + struct be_aic_obj *aic; + struct be_eq_obj *eqo; + struct be_rx_obj *rxo; + struct be_tx_obj *txo; + u64 rx_pkts, tx_pkts; + ulong now; + u32 pps, delta; + + for_all_evt_queues(adapter, eqo, i) { + aic = &adapter->aic_obj[eqo->idx]; + if (!aic->enable) { + if (aic->jiffies) + aic->jiffies = 0; + eqd = aic->et_eqd; + goto modify_eqd; + } + + rxo = &adapter->rx_obj[eqo->idx]; + do { + start = u64_stats_fetch_begin_irq(&rxo->stats.sync); + rx_pkts = rxo->stats.rx_pkts; + } while (u64_stats_fetch_retry_irq(&rxo->stats.sync, start)); + + txo = &adapter->tx_obj[eqo->idx]; + do { + start = u64_stats_fetch_begin_irq(&txo->stats.sync); + tx_pkts = txo->stats.tx_reqs; + } while (u64_stats_fetch_retry_irq(&txo->stats.sync, start)); + + /* Skip, if wrapped around or first calculation */ + now = jiffies; + if (!aic->jiffies || time_before(now, aic->jiffies) || + rx_pkts < aic->rx_pkts_prev || + tx_pkts < aic->tx_reqs_prev) { + be_aic_update(aic, rx_pkts, tx_pkts, now); + continue; + } + + delta = jiffies_to_msecs(now - aic->jiffies); + pps = (((u32)(rx_pkts - aic->rx_pkts_prev) * 1000) / delta) + + (((u32)(tx_pkts - aic->tx_reqs_prev) * 1000) / delta); + eqd = (pps / 15000) << 2; + + if (eqd < 8) + eqd = 0; + eqd = min_t(u32, eqd, aic->max_eqd); + eqd = max_t(u32, eqd, aic->min_eqd); + + be_aic_update(aic, rx_pkts, tx_pkts, now); +modify_eqd: + if (eqd != aic->prev_eqd) { + set_eqd[num].delay_multiplier = (eqd * 65)/100; + set_eqd[num].eq_id = eqo->q.id; + aic->prev_eqd = eqd; + num++; + } + } + + if (num) + be_cmd_modify_eqd(adapter, set_eqd, num); +} + +static void be_rx_stats_update(struct be_rx_obj *rxo, + struct be_rx_compl_info *rxcp) +{ + struct be_rx_stats *stats = rx_stats(rxo); + + u64_stats_update_begin(&stats->sync); + stats->rx_compl++; + stats->rx_bytes += rxcp->pkt_size; + stats->rx_pkts++; + if (rxcp->pkt_type == BE_MULTICAST_PACKET) + stats->rx_mcast_pkts++; + if (rxcp->err) + stats->rx_compl_err++; + u64_stats_update_end(&stats->sync); +} + +static inline bool csum_passed(struct be_rx_compl_info *rxcp) +{ + /* L4 checksum is not reliable for non TCP/UDP packets. + * Also ignore ipcksm for ipv6 pkts + */ + return (rxcp->tcpf || rxcp->udpf) && rxcp->l4_csum && + (rxcp->ip_csum || rxcp->ipv6) && !rxcp->err; +} + +static struct be_rx_page_info *get_rx_page_info(struct be_rx_obj *rxo) +{ + struct be_adapter *adapter = rxo->adapter; + struct be_rx_page_info *rx_page_info; + struct be_queue_info *rxq = &rxo->q; + u16 frag_idx = rxq->tail; + + rx_page_info = &rxo->page_info_tbl[frag_idx]; + BUG_ON(!rx_page_info->page); + + if (rx_page_info->last_frag) { + dma_unmap_page(&adapter->pdev->dev, + dma_unmap_addr(rx_page_info, bus), + adapter->big_page_size, DMA_FROM_DEVICE); + rx_page_info->last_frag = false; + } else { + dma_sync_single_for_cpu(&adapter->pdev->dev, + dma_unmap_addr(rx_page_info, bus), + rx_frag_size, DMA_FROM_DEVICE); + } + + queue_tail_inc(rxq); + atomic_dec(&rxq->used); + return rx_page_info; +} + +/* Throwaway the data in the Rx completion */ +static void be_rx_compl_discard(struct be_rx_obj *rxo, + struct be_rx_compl_info *rxcp) +{ + struct be_rx_page_info *page_info; + u16 i, num_rcvd = rxcp->num_rcvd; + + for (i = 0; i < num_rcvd; i++) { + page_info = get_rx_page_info(rxo); + put_page(page_info->page); + memset(page_info, 0, sizeof(*page_info)); + } +} + +/* + * skb_fill_rx_data forms a complete skb for an ether frame + * indicated by rxcp. + */ +static void skb_fill_rx_data(struct be_rx_obj *rxo, struct sk_buff *skb, + struct be_rx_compl_info *rxcp) +{ + struct be_rx_page_info *page_info; + u16 i, j; + u16 hdr_len, curr_frag_len, remaining; + u8 *start; + + page_info = get_rx_page_info(rxo); + start = page_address(page_info->page) + page_info->page_offset; + prefetch(start); + + /* Copy data in the first descriptor of this completion */ + curr_frag_len = min(rxcp->pkt_size, rx_frag_size); + + skb->len = curr_frag_len; + if (curr_frag_len <= BE_HDR_LEN) { /* tiny packet */ + memcpy(skb->data, start, curr_frag_len); + /* Complete packet has now been moved to data */ + put_page(page_info->page); + skb->data_len = 0; + skb->tail += curr_frag_len; + } else { + hdr_len = ETH_HLEN; + memcpy(skb->data, start, hdr_len); + skb_shinfo(skb)->nr_frags = 1; + skb_frag_set_page(skb, 0, page_info->page); + skb_shinfo(skb)->frags[0].page_offset = + page_info->page_offset + hdr_len; + skb_frag_size_set(&skb_shinfo(skb)->frags[0], + curr_frag_len - hdr_len); + skb->data_len = curr_frag_len - hdr_len; + skb->truesize += rx_frag_size; + skb->tail += hdr_len; + } + page_info->page = NULL; + + if (rxcp->pkt_size <= rx_frag_size) { + BUG_ON(rxcp->num_rcvd != 1); + return; + } + + /* More frags present for this completion */ + remaining = rxcp->pkt_size - curr_frag_len; + for (i = 1, j = 0; i < rxcp->num_rcvd; i++) { + page_info = get_rx_page_info(rxo); + curr_frag_len = min(remaining, rx_frag_size); + + /* Coalesce all frags from the same physical page in one slot */ + if (page_info->page_offset == 0) { + /* Fresh page */ + j++; + skb_frag_set_page(skb, j, page_info->page); + skb_shinfo(skb)->frags[j].page_offset = + page_info->page_offset; + skb_frag_size_set(&skb_shinfo(skb)->frags[j], 0); + skb_shinfo(skb)->nr_frags++; + } else { + put_page(page_info->page); + } + + skb_frag_size_add(&skb_shinfo(skb)->frags[j], curr_frag_len); + skb->len += curr_frag_len; + skb->data_len += curr_frag_len; + skb->truesize += rx_frag_size; + remaining -= curr_frag_len; + page_info->page = NULL; + } + BUG_ON(j > MAX_SKB_FRAGS); +} + +/* Process the RX completion indicated by rxcp when GRO is disabled */ +static void be_rx_compl_process(struct be_rx_obj *rxo, struct napi_struct *napi, + struct be_rx_compl_info *rxcp) +{ + struct be_adapter *adapter = rxo->adapter; + struct net_device *netdev = adapter->netdev; + struct sk_buff *skb; + + skb = netdev_alloc_skb_ip_align(netdev, BE_RX_SKB_ALLOC_SIZE); + if (unlikely(!skb)) { + rx_stats(rxo)->rx_drops_no_skbs++; + be_rx_compl_discard(rxo, rxcp); + return; + } + + skb_fill_rx_data(rxo, skb, rxcp); + + if (likely((netdev->features & NETIF_F_RXCSUM) && csum_passed(rxcp))) + skb->ip_summed = CHECKSUM_UNNECESSARY; + else + skb_checksum_none_assert(skb); + + skb->protocol = eth_type_trans(skb, netdev); + skb_record_rx_queue(skb, rxo - &adapter->rx_obj[0]); + if (netdev->features & NETIF_F_RXHASH) + skb_set_hash(skb, rxcp->rss_hash, PKT_HASH_TYPE_L3); + + skb->csum_level = rxcp->tunneled; + skb_mark_napi_id(skb, napi); + + if (rxcp->vlanf) + __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), rxcp->vlan_tag); + + netif_receive_skb(skb); +} + +/* Process the RX completion indicated by rxcp when GRO is enabled */ +static void be_rx_compl_process_gro(struct be_rx_obj *rxo, + struct napi_struct *napi, + struct be_rx_compl_info *rxcp) +{ + struct be_adapter *adapter = rxo->adapter; + struct be_rx_page_info *page_info; + struct sk_buff *skb = NULL; + u16 remaining, curr_frag_len; + u16 i, j; + + skb = napi_get_frags(napi); + if (!skb) { + be_rx_compl_discard(rxo, rxcp); + return; + } + + remaining = rxcp->pkt_size; + for (i = 0, j = -1; i < rxcp->num_rcvd; i++) { + page_info = get_rx_page_info(rxo); + + curr_frag_len = min(remaining, rx_frag_size); + + /* Coalesce all frags from the same physical page in one slot */ + if (i == 0 || page_info->page_offset == 0) { + /* First frag or Fresh page */ + j++; + skb_frag_set_page(skb, j, page_info->page); + skb_shinfo(skb)->frags[j].page_offset = + page_info->page_offset; + skb_frag_size_set(&skb_shinfo(skb)->frags[j], 0); + } else { + put_page(page_info->page); + } + skb_frag_size_add(&skb_shinfo(skb)->frags[j], curr_frag_len); + skb->truesize += rx_frag_size; + remaining -= curr_frag_len; + memset(page_info, 0, sizeof(*page_info)); + } + BUG_ON(j > MAX_SKB_FRAGS); + + skb_shinfo(skb)->nr_frags = j + 1; + skb->len = rxcp->pkt_size; + skb->data_len = rxcp->pkt_size; + skb->ip_summed = CHECKSUM_UNNECESSARY; + skb_record_rx_queue(skb, rxo - &adapter->rx_obj[0]); + if (adapter->netdev->features & NETIF_F_RXHASH) + skb_set_hash(skb, rxcp->rss_hash, PKT_HASH_TYPE_L3); + + skb->csum_level = rxcp->tunneled; + skb_mark_napi_id(skb, napi); + + if (rxcp->vlanf) + __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), rxcp->vlan_tag); + + napi_gro_frags(napi); +} + +static void be_parse_rx_compl_v1(struct be_eth_rx_compl *compl, + struct be_rx_compl_info *rxcp) +{ + rxcp->pkt_size = GET_RX_COMPL_V1_BITS(pktsize, compl); + rxcp->vlanf = GET_RX_COMPL_V1_BITS(vtp, compl); + rxcp->err = GET_RX_COMPL_V1_BITS(err, compl); + rxcp->tcpf = GET_RX_COMPL_V1_BITS(tcpf, compl); + rxcp->udpf = GET_RX_COMPL_V1_BITS(udpf, compl); + rxcp->ip_csum = GET_RX_COMPL_V1_BITS(ipcksm, compl); + rxcp->l4_csum = GET_RX_COMPL_V1_BITS(l4_cksm, compl); + rxcp->ipv6 = GET_RX_COMPL_V1_BITS(ip_version, compl); + rxcp->num_rcvd = GET_RX_COMPL_V1_BITS(numfrags, compl); + rxcp->pkt_type = GET_RX_COMPL_V1_BITS(cast_enc, compl); + rxcp->rss_hash = GET_RX_COMPL_V1_BITS(rsshash, compl); + if (rxcp->vlanf) { + rxcp->qnq = GET_RX_COMPL_V1_BITS(qnq, compl); + rxcp->vlan_tag = GET_RX_COMPL_V1_BITS(vlan_tag, compl); + } + rxcp->port = GET_RX_COMPL_V1_BITS(port, compl); + rxcp->tunneled = + GET_RX_COMPL_V1_BITS(tunneled, compl); +} + +static void be_parse_rx_compl_v0(struct be_eth_rx_compl *compl, + struct be_rx_compl_info *rxcp) +{ + rxcp->pkt_size = GET_RX_COMPL_V0_BITS(pktsize, compl); + rxcp->vlanf = GET_RX_COMPL_V0_BITS(vtp, compl); + rxcp->err = GET_RX_COMPL_V0_BITS(err, compl); + rxcp->tcpf = GET_RX_COMPL_V0_BITS(tcpf, compl); + rxcp->udpf = GET_RX_COMPL_V0_BITS(udpf, compl); + rxcp->ip_csum = GET_RX_COMPL_V0_BITS(ipcksm, compl); + rxcp->l4_csum = GET_RX_COMPL_V0_BITS(l4_cksm, compl); + rxcp->ipv6 = GET_RX_COMPL_V0_BITS(ip_version, compl); + rxcp->num_rcvd = GET_RX_COMPL_V0_BITS(numfrags, compl); + rxcp->pkt_type = GET_RX_COMPL_V0_BITS(cast_enc, compl); + rxcp->rss_hash = GET_RX_COMPL_V0_BITS(rsshash, compl); + if (rxcp->vlanf) { + rxcp->qnq = GET_RX_COMPL_V0_BITS(qnq, compl); + rxcp->vlan_tag = GET_RX_COMPL_V0_BITS(vlan_tag, compl); + } + rxcp->port = GET_RX_COMPL_V0_BITS(port, compl); + rxcp->ip_frag = GET_RX_COMPL_V0_BITS(ip_frag, compl); +} + +static struct be_rx_compl_info *be_rx_compl_get(struct be_rx_obj *rxo) +{ + struct be_eth_rx_compl *compl = queue_tail_node(&rxo->cq); + struct be_rx_compl_info *rxcp = &rxo->rxcp; + struct be_adapter *adapter = rxo->adapter; + + /* For checking the valid bit it is Ok to use either definition as the + * valid bit is at the same position in both v0 and v1 Rx compl */ + if (compl->dw[offsetof(struct amap_eth_rx_compl_v1, valid) / 32] == 0) + return NULL; + + rmb(); + be_dws_le_to_cpu(compl, sizeof(*compl)); + + if (adapter->be3_native) + be_parse_rx_compl_v1(compl, rxcp); + else + be_parse_rx_compl_v0(compl, rxcp); + + if (rxcp->ip_frag) + rxcp->l4_csum = 0; + + if (rxcp->vlanf) { + /* In QNQ modes, if qnq bit is not set, then the packet was + * tagged only with the transparent outer vlan-tag and must + * not be treated as a vlan packet by host + */ + if (be_is_qnq_mode(adapter) && !rxcp->qnq) + rxcp->vlanf = 0; + + if (!lancer_chip(adapter)) + rxcp->vlan_tag = swab16(rxcp->vlan_tag); + + if (adapter->pvid == (rxcp->vlan_tag & VLAN_VID_MASK) && + !test_bit(rxcp->vlan_tag, adapter->vids)) + rxcp->vlanf = 0; + } + + /* As the compl has been parsed, reset it; we wont touch it again */ + compl->dw[offsetof(struct amap_eth_rx_compl_v1, valid) / 32] = 0; + + queue_tail_inc(&rxo->cq); + return rxcp; +} + +static inline struct page *be_alloc_pages(u32 size, gfp_t gfp) +{ + u32 order = get_order(size); + + if (order > 0) + gfp |= __GFP_COMP; + return alloc_pages(gfp, order); +} + +/* + * Allocate a page, split it to fragments of size rx_frag_size and post as + * receive buffers to BE + */ +static void be_post_rx_frags(struct be_rx_obj *rxo, gfp_t gfp, u32 frags_needed) +{ + struct be_adapter *adapter = rxo->adapter; + struct be_rx_page_info *page_info = NULL, *prev_page_info = NULL; + struct be_queue_info *rxq = &rxo->q; + struct page *pagep = NULL; + struct device *dev = &adapter->pdev->dev; + struct be_eth_rx_d *rxd; + u64 page_dmaaddr = 0, frag_dmaaddr; + u32 posted, page_offset = 0, notify = 0; + + page_info = &rxo->page_info_tbl[rxq->head]; + for (posted = 0; posted < frags_needed && !page_info->page; posted++) { + if (!pagep) { + pagep = be_alloc_pages(adapter->big_page_size, gfp); + if (unlikely(!pagep)) { + rx_stats(rxo)->rx_post_fail++; + break; + } + page_dmaaddr = dma_map_page(dev, pagep, 0, + adapter->big_page_size, + DMA_FROM_DEVICE); + if (dma_mapping_error(dev, page_dmaaddr)) { + put_page(pagep); + pagep = NULL; + adapter->drv_stats.dma_map_errors++; + break; + } + page_offset = 0; + } else { + get_page(pagep); + page_offset += rx_frag_size; + } + page_info->page_offset = page_offset; + page_info->page = pagep; + + rxd = queue_head_node(rxq); + frag_dmaaddr = page_dmaaddr + page_info->page_offset; + rxd->fragpa_lo = cpu_to_le32(frag_dmaaddr & 0xFFFFFFFF); + rxd->fragpa_hi = cpu_to_le32(upper_32_bits(frag_dmaaddr)); + + /* Any space left in the current big page for another frag? */ + if ((page_offset + rx_frag_size + rx_frag_size) > + adapter->big_page_size) { + pagep = NULL; + page_info->last_frag = true; + dma_unmap_addr_set(page_info, bus, page_dmaaddr); + } else { + dma_unmap_addr_set(page_info, bus, frag_dmaaddr); + } + + prev_page_info = page_info; + queue_head_inc(rxq); + page_info = &rxo->page_info_tbl[rxq->head]; + } + + /* Mark the last frag of a page when we break out of the above loop + * with no more slots available in the RXQ + */ + if (pagep) { + prev_page_info->last_frag = true; + dma_unmap_addr_set(prev_page_info, bus, page_dmaaddr); + } + + if (posted) { + atomic_add(posted, &rxq->used); + if (rxo->rx_post_starved) + rxo->rx_post_starved = false; + do { + notify = min(256u, posted); + be_rxq_notify(adapter, rxq->id, notify); + posted -= notify; + } while (posted); + } else if (atomic_read(&rxq->used) == 0) { + /* Let be_worker replenish when memory is available */ + rxo->rx_post_starved = true; + } +} + +static struct be_eth_tx_compl *be_tx_compl_get(struct be_queue_info *tx_cq) +{ + struct be_eth_tx_compl *txcp = queue_tail_node(tx_cq); + + if (txcp->dw[offsetof(struct amap_eth_tx_compl, valid) / 32] == 0) + return NULL; + + rmb(); + be_dws_le_to_cpu(txcp, sizeof(*txcp)); + + txcp->dw[offsetof(struct amap_eth_tx_compl, valid) / 32] = 0; + + queue_tail_inc(tx_cq); + return txcp; +} + +static u16 be_tx_compl_process(struct be_adapter *adapter, + struct be_tx_obj *txo, u16 last_index) +{ + struct be_queue_info *txq = &txo->q; + struct be_eth_wrb *wrb; + struct sk_buff **sent_skbs = txo->sent_skb_list; + struct sk_buff *sent_skb; + u16 cur_index, num_wrbs = 1; /* account for hdr wrb */ + bool unmap_skb_hdr = true; + + sent_skb = sent_skbs[txq->tail]; + BUG_ON(!sent_skb); + sent_skbs[txq->tail] = NULL; + + /* skip header wrb */ + queue_tail_inc(txq); + + do { + cur_index = txq->tail; + wrb = queue_tail_node(txq); + unmap_tx_frag(&adapter->pdev->dev, wrb, + (unmap_skb_hdr && skb_headlen(sent_skb))); + unmap_skb_hdr = false; + + num_wrbs++; + queue_tail_inc(txq); + } while (cur_index != last_index); + + dev_consume_skb_any(sent_skb); + return num_wrbs; +} + +/* Return the number of events in the event queue */ +static inline int events_get(struct be_eq_obj *eqo) +{ + struct be_eq_entry *eqe; + int num = 0; + + do { + eqe = queue_tail_node(&eqo->q); + if (eqe->evt == 0) + break; + + rmb(); + eqe->evt = 0; + num++; + queue_tail_inc(&eqo->q); + } while (true); + + return num; +} + +/* Leaves the EQ is disarmed state */ +static void be_eq_clean(struct be_eq_obj *eqo) +{ + int num = events_get(eqo); + + be_eq_notify(eqo->adapter, eqo->q.id, false, true, num); +} + +static void be_rx_cq_clean(struct be_rx_obj *rxo) +{ + struct be_rx_page_info *page_info; + struct be_queue_info *rxq = &rxo->q; + struct be_queue_info *rx_cq = &rxo->cq; + struct be_rx_compl_info *rxcp; + struct be_adapter *adapter = rxo->adapter; + int flush_wait = 0; + + /* Consume pending rx completions. + * Wait for the flush completion (identified by zero num_rcvd) + * to arrive. Notify CQ even when there are no more CQ entries + * for HW to flush partially coalesced CQ entries. + * In Lancer, there is no need to wait for flush compl. + */ + for (;;) { + rxcp = be_rx_compl_get(rxo); + if (!rxcp) { + if (lancer_chip(adapter)) + break; + + if (flush_wait++ > 10 || be_hw_error(adapter)) { + dev_warn(&adapter->pdev->dev, + "did not receive flush compl\n"); + break; + } + be_cq_notify(adapter, rx_cq->id, true, 0); + mdelay(1); + } else { + be_rx_compl_discard(rxo, rxcp); + be_cq_notify(adapter, rx_cq->id, false, 1); + if (rxcp->num_rcvd == 0) + break; + } + } + + /* After cleanup, leave the CQ in unarmed state */ + be_cq_notify(adapter, rx_cq->id, false, 0); + + /* Then free posted rx buffers that were not used */ + while (atomic_read(&rxq->used) > 0) { + page_info = get_rx_page_info(rxo); + put_page(page_info->page); + memset(page_info, 0, sizeof(*page_info)); + } + BUG_ON(atomic_read(&rxq->used)); + rxq->tail = 0; + rxq->head = 0; +} + +static void be_tx_compl_clean(struct be_adapter *adapter) +{ + struct be_tx_obj *txo; + struct be_queue_info *txq; + struct be_eth_tx_compl *txcp; + u16 end_idx, cmpl = 0, timeo = 0, num_wrbs = 0; + struct sk_buff *sent_skb; + bool dummy_wrb; + int i, pending_txqs; + + /* Stop polling for compls when HW has been silent for 10ms */ + do { + pending_txqs = adapter->num_tx_qs; + + for_all_tx_queues(adapter, txo, i) { + cmpl = 0; + num_wrbs = 0; + txq = &txo->q; + while ((txcp = be_tx_compl_get(&txo->cq))) { + end_idx = GET_TX_COMPL_BITS(wrb_index, txcp); + num_wrbs += be_tx_compl_process(adapter, txo, + end_idx); + cmpl++; + } + if (cmpl) { + be_cq_notify(adapter, txo->cq.id, false, cmpl); + atomic_sub(num_wrbs, &txq->used); + timeo = 0; + } + if (atomic_read(&txq->used) == 0) + pending_txqs--; + } + + if (pending_txqs == 0 || ++timeo > 10 || be_hw_error(adapter)) + break; + + mdelay(1); + } while (true); + + for_all_tx_queues(adapter, txo, i) { + txq = &txo->q; + if (atomic_read(&txq->used)) + dev_err(&adapter->pdev->dev, "%d pending tx-compls\n", + atomic_read(&txq->used)); + + /* free posted tx for which compls will never arrive */ + while (atomic_read(&txq->used)) { + sent_skb = txo->sent_skb_list[txq->tail]; + end_idx = txq->tail; + num_wrbs = wrb_cnt_for_skb(adapter, sent_skb, + &dummy_wrb); + index_adv(&end_idx, num_wrbs - 1, txq->len); + num_wrbs = be_tx_compl_process(adapter, txo, end_idx); + atomic_sub(num_wrbs, &txq->used); + } + } +} + +static void be_evt_queues_destroy(struct be_adapter *adapter) +{ + struct be_eq_obj *eqo; + int i; + + for_all_evt_queues(adapter, eqo, i) { + if (eqo->q.created) { + be_eq_clean(eqo); + be_cmd_q_destroy(adapter, &eqo->q, QTYPE_EQ); + napi_hash_del(&eqo->napi); + netif_napi_del(&eqo->napi); + } + be_queue_free(adapter, &eqo->q); + } +} + +static int be_evt_queues_create(struct be_adapter *adapter) +{ + struct be_queue_info *eq; + struct be_eq_obj *eqo; + struct be_aic_obj *aic; + int i, rc; + + adapter->num_evt_qs = min_t(u16, num_irqs(adapter), + adapter->cfg_num_qs); + + for_all_evt_queues(adapter, eqo, i) { + netif_napi_add(adapter->netdev, &eqo->napi, be_poll, + BE_NAPI_WEIGHT); + napi_hash_add(&eqo->napi); + aic = &adapter->aic_obj[i]; + eqo->adapter = adapter; + eqo->idx = i; + aic->max_eqd = BE_MAX_EQD; + aic->enable = true; + + eq = &eqo->q; + rc = be_queue_alloc(adapter, eq, EVNT_Q_LEN, + sizeof(struct be_eq_entry)); + if (rc) + return rc; + + rc = be_cmd_eq_create(adapter, eqo); + if (rc) + return rc; + } + return 0; +} + +static void be_mcc_queues_destroy(struct be_adapter *adapter) +{ + struct be_queue_info *q; + + q = &adapter->mcc_obj.q; + if (q->created) + be_cmd_q_destroy(adapter, q, QTYPE_MCCQ); + be_queue_free(adapter, q); + + q = &adapter->mcc_obj.cq; + if (q->created) + be_cmd_q_destroy(adapter, q, QTYPE_CQ); + be_queue_free(adapter, q); +} + +/* Must be called only after TX qs are created as MCC shares TX EQ */ +static int be_mcc_queues_create(struct be_adapter *adapter) +{ + struct be_queue_info *q, *cq; + + cq = &adapter->mcc_obj.cq; + if (be_queue_alloc(adapter, cq, MCC_CQ_LEN, + sizeof(struct be_mcc_compl))) + goto err; + + /* Use the default EQ for MCC completions */ + if (be_cmd_cq_create(adapter, cq, &mcc_eqo(adapter)->q, true, 0)) + goto mcc_cq_free; + + q = &adapter->mcc_obj.q; + if (be_queue_alloc(adapter, q, MCC_Q_LEN, sizeof(struct be_mcc_wrb))) + goto mcc_cq_destroy; + + if (be_cmd_mccq_create(adapter, q, cq)) + goto mcc_q_free; + + return 0; + +mcc_q_free: + be_queue_free(adapter, q); +mcc_cq_destroy: + be_cmd_q_destroy(adapter, cq, QTYPE_CQ); +mcc_cq_free: + be_queue_free(adapter, cq); +err: + return -1; +} + +static void be_tx_queues_destroy(struct be_adapter *adapter) +{ + struct be_queue_info *q; + struct be_tx_obj *txo; + u8 i; + + for_all_tx_queues(adapter, txo, i) { + q = &txo->q; + if (q->created) + be_cmd_q_destroy(adapter, q, QTYPE_TXQ); + be_queue_free(adapter, q); + + q = &txo->cq; + if (q->created) + be_cmd_q_destroy(adapter, q, QTYPE_CQ); + be_queue_free(adapter, q); + } +} + +static int be_tx_qs_create(struct be_adapter *adapter) +{ + struct be_queue_info *cq, *eq; + struct be_tx_obj *txo; + int status, i; + + adapter->num_tx_qs = min(adapter->num_evt_qs, be_max_txqs(adapter)); + + for_all_tx_queues(adapter, txo, i) { + cq = &txo->cq; + status = be_queue_alloc(adapter, cq, TX_CQ_LEN, + sizeof(struct be_eth_tx_compl)); + if (status) + return status; + + u64_stats_init(&txo->stats.sync); + u64_stats_init(&txo->stats.sync_compl); + + /* If num_evt_qs is less than num_tx_qs, then more than + * one txq share an eq + */ + eq = &adapter->eq_obj[i % adapter->num_evt_qs].q; + status = be_cmd_cq_create(adapter, cq, eq, false, 3); + if (status) + return status; + + status = be_queue_alloc(adapter, &txo->q, TX_Q_LEN, + sizeof(struct be_eth_wrb)); + if (status) + return status; + + status = be_cmd_txq_create(adapter, txo); + if (status) + return status; + } + + dev_info(&adapter->pdev->dev, "created %d TX queue(s)\n", + adapter->num_tx_qs); + return 0; +} + +static void be_rx_cqs_destroy(struct be_adapter *adapter) +{ + struct be_queue_info *q; + struct be_rx_obj *rxo; + int i; + + for_all_rx_queues(adapter, rxo, i) { + q = &rxo->cq; + if (q->created) + be_cmd_q_destroy(adapter, q, QTYPE_CQ); + be_queue_free(adapter, q); + } +} + +static int be_rx_cqs_create(struct be_adapter *adapter) +{ + struct be_queue_info *eq, *cq; + struct be_rx_obj *rxo; + int rc, i; + + /* We can create as many RSS rings as there are EQs. */ + adapter->num_rx_qs = adapter->num_evt_qs; + + /* We'll use RSS only if atleast 2 RSS rings are supported. + * When RSS is used, we'll need a default RXQ for non-IP traffic. + */ + if (adapter->num_rx_qs > 1) + adapter->num_rx_qs++; + + adapter->big_page_size = (1 << get_order(rx_frag_size)) * PAGE_SIZE; + for_all_rx_queues(adapter, rxo, i) { + rxo->adapter = adapter; + cq = &rxo->cq; + rc = be_queue_alloc(adapter, cq, RX_CQ_LEN, + sizeof(struct be_eth_rx_compl)); + if (rc) + return rc; + + u64_stats_init(&rxo->stats.sync); + eq = &adapter->eq_obj[i % adapter->num_evt_qs].q; + rc = be_cmd_cq_create(adapter, cq, eq, false, 3); + if (rc) + return rc; + } + + dev_info(&adapter->pdev->dev, + "created %d RSS queue(s) and 1 default RX queue\n", + adapter->num_rx_qs - 1); + return 0; +} + +static irqreturn_t be_intx(int irq, void *dev) +{ + struct be_eq_obj *eqo = dev; + struct be_adapter *adapter = eqo->adapter; + int num_evts = 0; + + /* IRQ is not expected when NAPI is scheduled as the EQ + * will not be armed. + * But, this can happen on Lancer INTx where it takes + * a while to de-assert INTx or in BE2 where occasionaly + * an interrupt may be raised even when EQ is unarmed. + * If NAPI is already scheduled, then counting & notifying + * events will orphan them. + */ + if (napi_schedule_prep(&eqo->napi)) { + num_evts = events_get(eqo); + __napi_schedule(&eqo->napi); + if (num_evts) + eqo->spurious_intr = 0; + } + be_eq_notify(adapter, eqo->q.id, false, true, num_evts); + + /* Return IRQ_HANDLED only for the the first spurious intr + * after a valid intr to stop the kernel from branding + * this irq as a bad one! + */ + if (num_evts || eqo->spurious_intr++ == 0) + return IRQ_HANDLED; + else + return IRQ_NONE; +} + +static irqreturn_t be_msix(int irq, void *dev) +{ + struct be_eq_obj *eqo = dev; + + be_eq_notify(eqo->adapter, eqo->q.id, false, true, 0); + napi_schedule(&eqo->napi); + return IRQ_HANDLED; +} + +static inline bool do_gro(struct be_rx_compl_info *rxcp) +{ + return (rxcp->tcpf && !rxcp->err && rxcp->l4_csum) ? true : false; +} + +static int be_process_rx(struct be_rx_obj *rxo, struct napi_struct *napi, + int budget, int polling) +{ + struct be_adapter *adapter = rxo->adapter; + struct be_queue_info *rx_cq = &rxo->cq; + struct be_rx_compl_info *rxcp; + u32 work_done; + u32 frags_consumed = 0; + + for (work_done = 0; work_done < budget; work_done++) { + rxcp = be_rx_compl_get(rxo); + if (!rxcp) + break; + + /* Is it a flush compl that has no data */ + if (unlikely(rxcp->num_rcvd == 0)) + goto loop_continue; + + /* Discard compl with partial DMA Lancer B0 */ + if (unlikely(!rxcp->pkt_size)) { + be_rx_compl_discard(rxo, rxcp); + goto loop_continue; + } + + /* On BE drop pkts that arrive due to imperfect filtering in + * promiscuous mode on some skews + */ + if (unlikely(rxcp->port != adapter->port_num && + !lancer_chip(adapter))) { + be_rx_compl_discard(rxo, rxcp); + goto loop_continue; + } + + /* Don't do gro when we're busy_polling */ + if (do_gro(rxcp) && polling != BUSY_POLLING) + be_rx_compl_process_gro(rxo, napi, rxcp); + else + be_rx_compl_process(rxo, napi, rxcp); + +loop_continue: + frags_consumed += rxcp->num_rcvd; + be_rx_stats_update(rxo, rxcp); + } + + if (work_done) { + be_cq_notify(adapter, rx_cq->id, true, work_done); + + /* When an rx-obj gets into post_starved state, just + * let be_worker do the posting. + */ + if (atomic_read(&rxo->q.used) < RX_FRAGS_REFILL_WM && + !rxo->rx_post_starved) + be_post_rx_frags(rxo, GFP_ATOMIC, + max_t(u32, MAX_RX_POST, + frags_consumed)); + } + + return work_done; +} + +static inline void be_update_tx_err(struct be_tx_obj *txo, u32 status) +{ + switch (status) { + case BE_TX_COMP_HDR_PARSE_ERR: + tx_stats(txo)->tx_hdr_parse_err++; + break; + case BE_TX_COMP_NDMA_ERR: + tx_stats(txo)->tx_dma_err++; + break; + case BE_TX_COMP_ACL_ERR: + tx_stats(txo)->tx_spoof_check_err++; + break; + } +} + +static inline void lancer_update_tx_err(struct be_tx_obj *txo, u32 status) +{ + switch (status) { + case LANCER_TX_COMP_LSO_ERR: + tx_stats(txo)->tx_tso_err++; + break; + case LANCER_TX_COMP_HSW_DROP_MAC_ERR: + case LANCER_TX_COMP_HSW_DROP_VLAN_ERR: + tx_stats(txo)->tx_spoof_check_err++; + break; + case LANCER_TX_COMP_QINQ_ERR: + tx_stats(txo)->tx_qinq_err++; + break; + case LANCER_TX_COMP_PARITY_ERR: + tx_stats(txo)->tx_internal_parity_err++; + break; + case LANCER_TX_COMP_DMA_ERR: + tx_stats(txo)->tx_dma_err++; + break; + } +} + +static void be_process_tx(struct be_adapter *adapter, struct be_tx_obj *txo, + int idx) +{ + struct be_eth_tx_compl *txcp; + int num_wrbs = 0, work_done = 0; + u32 compl_status; + u16 last_idx; + + while ((txcp = be_tx_compl_get(&txo->cq))) { + last_idx = GET_TX_COMPL_BITS(wrb_index, txcp); + num_wrbs += be_tx_compl_process(adapter, txo, last_idx); + work_done++; + + compl_status = GET_TX_COMPL_BITS(status, txcp); + if (compl_status) { + if (lancer_chip(adapter)) + lancer_update_tx_err(txo, compl_status); + else + be_update_tx_err(txo, compl_status); + } + } + + if (work_done) { + be_cq_notify(adapter, txo->cq.id, true, work_done); + atomic_sub(num_wrbs, &txo->q.used); + + /* As Tx wrbs have been freed up, wake up netdev queue + * if it was stopped due to lack of tx wrbs. */ + if (__netif_subqueue_stopped(adapter->netdev, idx) && + atomic_read(&txo->q.used) < txo->q.len / 2) { + netif_wake_subqueue(adapter->netdev, idx); + } + + u64_stats_update_begin(&tx_stats(txo)->sync_compl); + tx_stats(txo)->tx_compl += work_done; + u64_stats_update_end(&tx_stats(txo)->sync_compl); + } +} + +int be_poll(struct napi_struct *napi, int budget) +{ + struct be_eq_obj *eqo = container_of(napi, struct be_eq_obj, napi); + struct be_adapter *adapter = eqo->adapter; + int max_work = 0, work, i, num_evts; + struct be_rx_obj *rxo; + struct be_tx_obj *txo; + + num_evts = events_get(eqo); + + for_all_tx_queues_on_eq(adapter, eqo, txo, i) + be_process_tx(adapter, txo, i); + + if (be_lock_napi(eqo)) { + /* This loop will iterate twice for EQ0 in which + * completions of the last RXQ (default one) are also processed + * For other EQs the loop iterates only once + */ + for_all_rx_queues_on_eq(adapter, eqo, rxo, i) { + work = be_process_rx(rxo, napi, budget, NAPI_POLLING); + max_work = max(work, max_work); + } + be_unlock_napi(eqo); + } else { + max_work = budget; + } + + if (is_mcc_eqo(eqo)) + be_process_mcc(adapter); + + if (max_work < budget) { + napi_complete(napi); + be_eq_notify(adapter, eqo->q.id, true, false, num_evts); + } else { + /* As we'll continue in polling mode, count and clear events */ + be_eq_notify(adapter, eqo->q.id, false, false, num_evts); + } + return max_work; +} + +#ifdef CONFIG_NET_RX_BUSY_POLL +static int be_busy_poll(struct napi_struct *napi) +{ + struct be_eq_obj *eqo = container_of(napi, struct be_eq_obj, napi); + struct be_adapter *adapter = eqo->adapter; + struct be_rx_obj *rxo; + int i, work = 0; + + if (!be_lock_busy_poll(eqo)) + return LL_FLUSH_BUSY; + + for_all_rx_queues_on_eq(adapter, eqo, rxo, i) { + work = be_process_rx(rxo, napi, 4, BUSY_POLLING); + if (work) + break; + } + + be_unlock_busy_poll(eqo); + return work; +} +#endif + +void be_detect_error(struct be_adapter *adapter) +{ + u32 ue_lo = 0, ue_hi = 0, ue_lo_mask = 0, ue_hi_mask = 0; + u32 sliport_status = 0, sliport_err1 = 0, sliport_err2 = 0; + u32 i; + bool error_detected = false; + struct device *dev = &adapter->pdev->dev; + struct net_device *netdev = adapter->netdev; + + if (be_hw_error(adapter)) + return; + + if (lancer_chip(adapter)) { + sliport_status = ioread32(adapter->db + SLIPORT_STATUS_OFFSET); + if (sliport_status & SLIPORT_STATUS_ERR_MASK) { + sliport_err1 = ioread32(adapter->db + + SLIPORT_ERROR1_OFFSET); + sliport_err2 = ioread32(adapter->db + + SLIPORT_ERROR2_OFFSET); + adapter->hw_error = true; + /* Do not log error messages if its a FW reset */ + if (sliport_err1 == SLIPORT_ERROR_FW_RESET1 && + sliport_err2 == SLIPORT_ERROR_FW_RESET2) { + dev_info(dev, "Firmware update in progress\n"); + } else { + error_detected = true; + dev_err(dev, "Error detected in the card\n"); + dev_err(dev, "ERR: sliport status 0x%x\n", + sliport_status); + dev_err(dev, "ERR: sliport error1 0x%x\n", + sliport_err1); + dev_err(dev, "ERR: sliport error2 0x%x\n", + sliport_err2); + } + } + } else { + pci_read_config_dword(adapter->pdev, + PCICFG_UE_STATUS_LOW, &ue_lo); + pci_read_config_dword(adapter->pdev, + PCICFG_UE_STATUS_HIGH, &ue_hi); + pci_read_config_dword(adapter->pdev, + PCICFG_UE_STATUS_LOW_MASK, &ue_lo_mask); + pci_read_config_dword(adapter->pdev, + PCICFG_UE_STATUS_HI_MASK, &ue_hi_mask); + + ue_lo = (ue_lo & ~ue_lo_mask); + ue_hi = (ue_hi & ~ue_hi_mask); + + /* On certain platforms BE hardware can indicate spurious UEs. + * Allow HW to stop working completely in case of a real UE. + * Hence not setting the hw_error for UE detection. + */ + + if (ue_lo || ue_hi) { + error_detected = true; + dev_err(dev, + "Unrecoverable Error detected in the adapter"); + dev_err(dev, "Please reboot server to recover"); + if (skyhawk_chip(adapter)) + adapter->hw_error = true; + for (i = 0; ue_lo; ue_lo >>= 1, i++) { + if (ue_lo & 1) + dev_err(dev, "UE: %s bit set\n", + ue_status_low_desc[i]); + } + for (i = 0; ue_hi; ue_hi >>= 1, i++) { + if (ue_hi & 1) + dev_err(dev, "UE: %s bit set\n", + ue_status_hi_desc[i]); + } + } + } + if (error_detected) + netif_carrier_off(netdev); +} + +static void be_msix_disable(struct be_adapter *adapter) +{ + if (msix_enabled(adapter)) { + pci_disable_msix(adapter->pdev); + adapter->num_msix_vec = 0; + adapter->num_msix_roce_vec = 0; + } +} + +static int be_msix_enable(struct be_adapter *adapter) +{ + int i, num_vec; + struct device *dev = &adapter->pdev->dev; + + /* If RoCE is supported, program the max number of NIC vectors that + * may be configured via set-channels, along with vectors needed for + * RoCe. Else, just program the number we'll use initially. + */ + if (be_roce_supported(adapter)) + num_vec = min_t(int, 2 * be_max_eqs(adapter), + 2 * num_online_cpus()); + else + num_vec = adapter->cfg_num_qs; + + for (i = 0; i < num_vec; i++) + adapter->msix_entries[i].entry = i; + + num_vec = pci_enable_msix_range(adapter->pdev, adapter->msix_entries, + MIN_MSIX_VECTORS, num_vec); + if (num_vec < 0) + goto fail; + + if (be_roce_supported(adapter) && num_vec > MIN_MSIX_VECTORS) { + adapter->num_msix_roce_vec = num_vec / 2; + dev_info(dev, "enabled %d MSI-x vector(s) for RoCE\n", + adapter->num_msix_roce_vec); + } + + adapter->num_msix_vec = num_vec - adapter->num_msix_roce_vec; + + dev_info(dev, "enabled %d MSI-x vector(s) for NIC\n", + adapter->num_msix_vec); + return 0; + +fail: + dev_warn(dev, "MSIx enable failed\n"); + + /* INTx is not supported in VFs, so fail probe if enable_msix fails */ + if (!be_physfn(adapter)) + return num_vec; + return 0; +} + +static inline int be_msix_vec_get(struct be_adapter *adapter, + struct be_eq_obj *eqo) +{ + return adapter->msix_entries[eqo->msix_idx].vector; +} + +static int be_msix_register(struct be_adapter *adapter) +{ + struct net_device *netdev = adapter->netdev; + struct be_eq_obj *eqo; + int status, i, vec; + + for_all_evt_queues(adapter, eqo, i) { + sprintf(eqo->desc, "%s-q%d", netdev->name, i); + vec = be_msix_vec_get(adapter, eqo); + status = request_irq(vec, be_msix, 0, eqo->desc, eqo); + if (status) + goto err_msix; + } + + return 0; +err_msix: + for (i--, eqo = &adapter->eq_obj[i]; i >= 0; i--, eqo--) + free_irq(be_msix_vec_get(adapter, eqo), eqo); + dev_warn(&adapter->pdev->dev, "MSIX Request IRQ failed - err %d\n", + status); + be_msix_disable(adapter); + return status; +} + +static int be_irq_register(struct be_adapter *adapter) +{ + struct net_device *netdev = adapter->netdev; + int status; + + if (msix_enabled(adapter)) { + status = be_msix_register(adapter); + if (status == 0) + goto done; + /* INTx is not supported for VF */ + if (!be_physfn(adapter)) + return status; + } + + /* INTx: only the first EQ is used */ + netdev->irq = adapter->pdev->irq; + status = request_irq(netdev->irq, be_intx, IRQF_SHARED, netdev->name, + &adapter->eq_obj[0]); + if (status) { + dev_err(&adapter->pdev->dev, + "INTx request IRQ failed - err %d\n", status); + return status; + } +done: + adapter->isr_registered = true; + return 0; +} + +static void be_irq_unregister(struct be_adapter *adapter) +{ + struct net_device *netdev = adapter->netdev; + struct be_eq_obj *eqo; + int i; + + if (!adapter->isr_registered) + return; + + /* INTx */ + if (!msix_enabled(adapter)) { + free_irq(netdev->irq, &adapter->eq_obj[0]); + goto done; + } + + /* MSIx */ + for_all_evt_queues(adapter, eqo, i) + free_irq(be_msix_vec_get(adapter, eqo), eqo); + +done: + adapter->isr_registered = false; +} + +static void be_rx_qs_destroy(struct be_adapter *adapter) +{ + struct be_queue_info *q; + struct be_rx_obj *rxo; + int i; + + for_all_rx_queues(adapter, rxo, i) { + q = &rxo->q; + if (q->created) { + be_cmd_rxq_destroy(adapter, q); + be_rx_cq_clean(rxo); + } + be_queue_free(adapter, q); + } +} + +static int be_close(struct net_device *netdev) +{ + struct be_adapter *adapter = netdev_priv(netdev); + struct be_eq_obj *eqo; + int i; + + /* This protection is needed as be_close() may be called even when the + * adapter is in cleared state (after eeh perm failure) + */ + if (!(adapter->flags & BE_FLAGS_SETUP_DONE)) + return 0; + + be_roce_dev_close(adapter); + + if (adapter->flags & BE_FLAGS_NAPI_ENABLED) { + for_all_evt_queues(adapter, eqo, i) { + napi_disable(&eqo->napi); + be_disable_busy_poll(eqo); + } + adapter->flags &= ~BE_FLAGS_NAPI_ENABLED; + } + + be_async_mcc_disable(adapter); + + /* Wait for all pending tx completions to arrive so that + * all tx skbs are freed. + */ + netif_tx_disable(netdev); + be_tx_compl_clean(adapter); + + be_rx_qs_destroy(adapter); + + for (i = 1; i < (adapter->uc_macs + 1); i++) + be_cmd_pmac_del(adapter, adapter->if_handle, + adapter->pmac_id[i], 0); + adapter->uc_macs = 0; + + for_all_evt_queues(adapter, eqo, i) { + if (msix_enabled(adapter)) + synchronize_irq(be_msix_vec_get(adapter, eqo)); + else + synchronize_irq(netdev->irq); + be_eq_clean(eqo); + } + + be_irq_unregister(adapter); + + return 0; +} + +static int be_rx_qs_create(struct be_adapter *adapter) +{ + struct be_rx_obj *rxo; + int rc, i, j; + u8 rss_hkey[RSS_HASH_KEY_LEN]; + struct rss_info *rss = &adapter->rss_info; + + for_all_rx_queues(adapter, rxo, i) { + rc = be_queue_alloc(adapter, &rxo->q, RX_Q_LEN, + sizeof(struct be_eth_rx_d)); + if (rc) + return rc; + } + + /* The FW would like the default RXQ to be created first */ + rxo = default_rxo(adapter); + rc = be_cmd_rxq_create(adapter, &rxo->q, rxo->cq.id, rx_frag_size, + adapter->if_handle, false, &rxo->rss_id); + if (rc) + return rc; + + for_all_rss_queues(adapter, rxo, i) { + rc = be_cmd_rxq_create(adapter, &rxo->q, rxo->cq.id, + rx_frag_size, adapter->if_handle, + true, &rxo->rss_id); + if (rc) + return rc; + } + + if (be_multi_rxq(adapter)) { + for (j = 0; j < RSS_INDIR_TABLE_LEN; + j += adapter->num_rx_qs - 1) { + for_all_rss_queues(adapter, rxo, i) { + if ((j + i) >= RSS_INDIR_TABLE_LEN) + break; + rss->rsstable[j + i] = rxo->rss_id; + rss->rss_queue[j + i] = i; + } + } + rss->rss_flags = RSS_ENABLE_TCP_IPV4 | RSS_ENABLE_IPV4 | + RSS_ENABLE_TCP_IPV6 | RSS_ENABLE_IPV6; + + if (!BEx_chip(adapter)) + rss->rss_flags |= RSS_ENABLE_UDP_IPV4 | + RSS_ENABLE_UDP_IPV6; + } else { + /* Disable RSS, if only default RX Q is created */ + rss->rss_flags = RSS_ENABLE_NONE; + } + + get_random_bytes(rss_hkey, RSS_HASH_KEY_LEN); + rc = be_cmd_rss_config(adapter, rss->rsstable, rss->rss_flags, + 128, rss_hkey); + if (rc) { + rss->rss_flags = RSS_ENABLE_NONE; + return rc; + } + + memcpy(rss->rss_hkey, rss_hkey, RSS_HASH_KEY_LEN); + + /* First time posting */ + for_all_rx_queues(adapter, rxo, i) + be_post_rx_frags(rxo, GFP_KERNEL, MAX_RX_POST); + return 0; +} + +static int be_open(struct net_device *netdev) +{ + struct be_adapter *adapter = netdev_priv(netdev); + struct be_eq_obj *eqo; + struct be_rx_obj *rxo; + struct be_tx_obj *txo; + u8 link_status; + int status, i; + + status = be_rx_qs_create(adapter); + if (status) + goto err; + + status = be_irq_register(adapter); + if (status) + goto err; + + for_all_rx_queues(adapter, rxo, i) + be_cq_notify(adapter, rxo->cq.id, true, 0); + + for_all_tx_queues(adapter, txo, i) + be_cq_notify(adapter, txo->cq.id, true, 0); + + be_async_mcc_enable(adapter); + + for_all_evt_queues(adapter, eqo, i) { + napi_enable(&eqo->napi); + be_enable_busy_poll(eqo); + be_eq_notify(adapter, eqo->q.id, true, true, 0); + } + adapter->flags |= BE_FLAGS_NAPI_ENABLED; + + status = be_cmd_link_status_query(adapter, NULL, &link_status, 0); + if (!status) + be_link_status_update(adapter, link_status); + + netif_tx_start_all_queues(netdev); + be_roce_dev_open(adapter); + +#ifdef CONFIG_BE2NET_VXLAN + if (skyhawk_chip(adapter)) + vxlan_get_rx_port(netdev); +#endif + + return 0; +err: + be_close(adapter->netdev); + return -EIO; +} + +static int be_setup_wol(struct be_adapter *adapter, bool enable) +{ + struct be_dma_mem cmd; + int status = 0; + u8 mac[ETH_ALEN]; + + memset(mac, 0, ETH_ALEN); + + cmd.size = sizeof(struct be_cmd_req_acpi_wol_magic_config); + cmd.va = dma_zalloc_coherent(&adapter->pdev->dev, cmd.size, &cmd.dma, + GFP_KERNEL); + if (!cmd.va) + return -ENOMEM; + + if (enable) { + status = pci_write_config_dword(adapter->pdev, + PCICFG_PM_CONTROL_OFFSET, + PCICFG_PM_CONTROL_MASK); + if (status) { + dev_err(&adapter->pdev->dev, + "Could not enable Wake-on-lan\n"); + dma_free_coherent(&adapter->pdev->dev, cmd.size, cmd.va, + cmd.dma); + return status; + } + status = be_cmd_enable_magic_wol(adapter, + adapter->netdev->dev_addr, + &cmd); + pci_enable_wake(adapter->pdev, PCI_D3hot, 1); + pci_enable_wake(adapter->pdev, PCI_D3cold, 1); + } else { + status = be_cmd_enable_magic_wol(adapter, mac, &cmd); + pci_enable_wake(adapter->pdev, PCI_D3hot, 0); + pci_enable_wake(adapter->pdev, PCI_D3cold, 0); + } + + dma_free_coherent(&adapter->pdev->dev, cmd.size, cmd.va, cmd.dma); + return status; +} + +/* + * Generate a seed MAC address from the PF MAC Address using jhash. + * MAC Address for VFs are assigned incrementally starting from the seed. + * These addresses are programmed in the ASIC by the PF and the VF driver + * queries for the MAC address during its probe. + */ +static int be_vf_eth_addr_config(struct be_adapter *adapter) +{ + u32 vf; + int status = 0; + u8 mac[ETH_ALEN]; + struct be_vf_cfg *vf_cfg; + + be_vf_eth_addr_generate(adapter, mac); + + for_all_vfs(adapter, vf_cfg, vf) { + if (BEx_chip(adapter)) + status = be_cmd_pmac_add(adapter, mac, + vf_cfg->if_handle, + &vf_cfg->pmac_id, vf + 1); + else + status = be_cmd_set_mac(adapter, mac, vf_cfg->if_handle, + vf + 1); + + if (status) + dev_err(&adapter->pdev->dev, + "Mac address assignment failed for VF %d\n", + vf); + else + memcpy(vf_cfg->mac_addr, mac, ETH_ALEN); + + mac[5] += 1; + } + return status; +} + +static int be_vfs_mac_query(struct be_adapter *adapter) +{ + int status, vf; + u8 mac[ETH_ALEN]; + struct be_vf_cfg *vf_cfg; + + for_all_vfs(adapter, vf_cfg, vf) { + status = be_cmd_get_active_mac(adapter, vf_cfg->pmac_id, + mac, vf_cfg->if_handle, + false, vf+1); + if (status) + return status; + memcpy(vf_cfg->mac_addr, mac, ETH_ALEN); + } + return 0; +} + +static void be_vf_clear(struct be_adapter *adapter) +{ + struct be_vf_cfg *vf_cfg; + u32 vf; + + if (pci_vfs_assigned(adapter->pdev)) { + dev_warn(&adapter->pdev->dev, + "VFs are assigned to VMs: not disabling VFs\n"); + goto done; + } + + pci_disable_sriov(adapter->pdev); + + for_all_vfs(adapter, vf_cfg, vf) { + if (BEx_chip(adapter)) + be_cmd_pmac_del(adapter, vf_cfg->if_handle, + vf_cfg->pmac_id, vf + 1); + else + be_cmd_set_mac(adapter, NULL, vf_cfg->if_handle, + vf + 1); + + be_cmd_if_destroy(adapter, vf_cfg->if_handle, vf + 1); + } +done: + kfree(adapter->vf_cfg); + adapter->num_vfs = 0; + adapter->flags &= ~BE_FLAGS_SRIOV_ENABLED; +} + +static void be_clear_queues(struct be_adapter *adapter) +{ + be_mcc_queues_destroy(adapter); + be_rx_cqs_destroy(adapter); + be_tx_queues_destroy(adapter); + be_evt_queues_destroy(adapter); +} + +static void be_cancel_worker(struct be_adapter *adapter) +{ + if (adapter->flags & BE_FLAGS_WORKER_SCHEDULED) { + cancel_delayed_work_sync(&adapter->work); + adapter->flags &= ~BE_FLAGS_WORKER_SCHEDULED; + } +} + +static void be_mac_clear(struct be_adapter *adapter) +{ + int i; + + if (adapter->pmac_id) { + for (i = 0; i < (adapter->uc_macs + 1); i++) + be_cmd_pmac_del(adapter, adapter->if_handle, + adapter->pmac_id[i], 0); + adapter->uc_macs = 0; + + kfree(adapter->pmac_id); + adapter->pmac_id = NULL; + } +} + +#ifdef CONFIG_BE2NET_VXLAN +static void be_disable_vxlan_offloads(struct be_adapter *adapter) +{ + if (adapter->flags & BE_FLAGS_VXLAN_OFFLOADS) + be_cmd_manage_iface(adapter, adapter->if_handle, + OP_CONVERT_TUNNEL_TO_NORMAL); + + if (adapter->vxlan_port) + be_cmd_set_vxlan_port(adapter, 0); + + adapter->flags &= ~BE_FLAGS_VXLAN_OFFLOADS; + adapter->vxlan_port = 0; +} +#endif + +static int be_clear(struct be_adapter *adapter) +{ + be_cancel_worker(adapter); + + if (sriov_enabled(adapter)) + be_vf_clear(adapter); + + /* Re-configure FW to distribute resources evenly across max-supported + * number of VFs, only when VFs are not already enabled. + */ + if (be_physfn(adapter) && !pci_vfs_assigned(adapter->pdev)) + be_cmd_set_sriov_config(adapter, adapter->pool_res, + pci_sriov_get_totalvfs(adapter->pdev)); + +#ifdef CONFIG_BE2NET_VXLAN + be_disable_vxlan_offloads(adapter); +#endif + /* delete the primary mac along with the uc-mac list */ + be_mac_clear(adapter); + + be_cmd_if_destroy(adapter, adapter->if_handle, 0); + + be_clear_queues(adapter); + + be_msix_disable(adapter); + adapter->flags &= ~BE_FLAGS_SETUP_DONE; + return 0; +} + +static int be_vfs_if_create(struct be_adapter *adapter) +{ + struct be_resources res = {0}; + struct be_vf_cfg *vf_cfg; + u32 cap_flags, en_flags, vf; + int status = 0; + + cap_flags = BE_IF_FLAGS_UNTAGGED | BE_IF_FLAGS_BROADCAST | + BE_IF_FLAGS_MULTICAST; + + for_all_vfs(adapter, vf_cfg, vf) { + if (!BE3_chip(adapter)) { + status = be_cmd_get_profile_config(adapter, &res, + vf + 1); + if (!status) + cap_flags = res.if_cap_flags; + } + + /* If a FW profile exists, then cap_flags are updated */ + en_flags = cap_flags & (BE_IF_FLAGS_UNTAGGED | + BE_IF_FLAGS_BROADCAST | + BE_IF_FLAGS_MULTICAST); + status = + be_cmd_if_create(adapter, cap_flags, en_flags, + &vf_cfg->if_handle, vf + 1); + if (status) + goto err; + } +err: + return status; +} + +static int be_vf_setup_init(struct be_adapter *adapter) +{ + struct be_vf_cfg *vf_cfg; + int vf; + + adapter->vf_cfg = kcalloc(adapter->num_vfs, sizeof(*vf_cfg), + GFP_KERNEL); + if (!adapter->vf_cfg) + return -ENOMEM; + + for_all_vfs(adapter, vf_cfg, vf) { + vf_cfg->if_handle = -1; + vf_cfg->pmac_id = -1; + } + return 0; +} + +static int be_vf_setup(struct be_adapter *adapter) +{ + struct device *dev = &adapter->pdev->dev; + struct be_vf_cfg *vf_cfg; + int status, old_vfs, vf; + u32 privileges; + + old_vfs = pci_num_vf(adapter->pdev); + + status = be_vf_setup_init(adapter); + if (status) + goto err; + + if (old_vfs) { + for_all_vfs(adapter, vf_cfg, vf) { + status = be_cmd_get_if_id(adapter, vf_cfg, vf); + if (status) + goto err; + } + + status = be_vfs_mac_query(adapter); + if (status) + goto err; + } else { + status = be_vfs_if_create(adapter); + if (status) + goto err; + + status = be_vf_eth_addr_config(adapter); + if (status) + goto err; + } + + for_all_vfs(adapter, vf_cfg, vf) { + /* Allow VFs to programs MAC/VLAN filters */ + status = be_cmd_get_fn_privileges(adapter, &privileges, vf + 1); + if (!status && !(privileges & BE_PRIV_FILTMGMT)) { + status = be_cmd_set_fn_privileges(adapter, + privileges | + BE_PRIV_FILTMGMT, + vf + 1); + if (!status) + dev_info(dev, "VF%d has FILTMGMT privilege\n", + vf); + } + + /* Allow full available bandwidth */ + if (!old_vfs) + be_cmd_config_qos(adapter, 0, 0, vf + 1); + + if (!old_vfs) { + be_cmd_enable_vf(adapter, vf + 1); + be_cmd_set_logical_link_config(adapter, + IFLA_VF_LINK_STATE_AUTO, + vf+1); + } + } + + if (!old_vfs) { + status = pci_enable_sriov(adapter->pdev, adapter->num_vfs); + if (status) { + dev_err(dev, "SRIOV enable failed\n"); + adapter->num_vfs = 0; + goto err; + } + } + + adapter->flags |= BE_FLAGS_SRIOV_ENABLED; + return 0; +err: + dev_err(dev, "VF setup failed\n"); + be_vf_clear(adapter); + return status; +} + +/* Converting function_mode bits on BE3 to SH mc_type enums */ + +static u8 be_convert_mc_type(u32 function_mode) +{ + if (function_mode & VNIC_MODE && function_mode & QNQ_MODE) + return vNIC1; + else if (function_mode & QNQ_MODE) + return FLEX10; + else if (function_mode & VNIC_MODE) + return vNIC2; + else if (function_mode & UMC_ENABLED) + return UMC; + else + return MC_NONE; +} + +/* On BE2/BE3 FW does not suggest the supported limits */ +static void BEx_get_resources(struct be_adapter *adapter, + struct be_resources *res) +{ + bool use_sriov = adapter->num_vfs ? 1 : 0; + + if (be_physfn(adapter)) + res->max_uc_mac = BE_UC_PMAC_COUNT; + else + res->max_uc_mac = BE_VF_UC_PMAC_COUNT; + + adapter->mc_type = be_convert_mc_type(adapter->function_mode); + + if (be_is_mc(adapter)) { + /* Assuming that there are 4 channels per port, + * when multi-channel is enabled + */ + if (be_is_qnq_mode(adapter)) + res->max_vlans = BE_NUM_VLANS_SUPPORTED/8; + else + /* In a non-qnq multichannel mode, the pvid + * takes up one vlan entry + */ + res->max_vlans = (BE_NUM_VLANS_SUPPORTED / 4) - 1; + } else { + res->max_vlans = BE_NUM_VLANS_SUPPORTED; + } + + res->max_mcast_mac = BE_MAX_MC; + + /* 1) For BE3 1Gb ports, FW does not support multiple TXQs + * 2) Create multiple TX rings on a BE3-R multi-channel interface + * *only* if it is RSS-capable. + */ + if (BE2_chip(adapter) || use_sriov || (adapter->port_num > 1) || + !be_physfn(adapter) || (be_is_mc(adapter) && + !(adapter->function_caps & BE_FUNCTION_CAPS_RSS))) { + res->max_tx_qs = 1; + } else if (adapter->function_caps & BE_FUNCTION_CAPS_SUPER_NIC) { + struct be_resources super_nic_res = {0}; + + /* On a SuperNIC profile, the driver needs to use the + * GET_PROFILE_CONFIG cmd to query the per-function TXQ limits + */ + be_cmd_get_profile_config(adapter, &super_nic_res, 0); + /* Some old versions of BE3 FW don't report max_tx_qs value */ + res->max_tx_qs = super_nic_res.max_tx_qs ? : BE3_MAX_TX_QS; + } else { + res->max_tx_qs = BE3_MAX_TX_QS; + } + + if ((adapter->function_caps & BE_FUNCTION_CAPS_RSS) && + !use_sriov && be_physfn(adapter)) + res->max_rss_qs = (adapter->be3_native) ? + BE3_MAX_RSS_QS : BE2_MAX_RSS_QS; + res->max_rx_qs = res->max_rss_qs + 1; + + if (be_physfn(adapter)) + res->max_evt_qs = (be_max_vfs(adapter) > 0) ? + BE3_SRIOV_MAX_EVT_QS : BE3_MAX_EVT_QS; + else + res->max_evt_qs = 1; + + res->if_cap_flags = BE_IF_CAP_FLAGS_WANT; + if (!(adapter->function_caps & BE_FUNCTION_CAPS_RSS)) + res->if_cap_flags &= ~BE_IF_FLAGS_RSS; +} + +static void be_setup_init(struct be_adapter *adapter) +{ + adapter->vlan_prio_bmap = 0xff; + adapter->phy.link_speed = -1; + adapter->if_handle = -1; + adapter->be3_native = false; + adapter->promiscuous = false; + if (be_physfn(adapter)) + adapter->cmd_privileges = MAX_PRIVILEGES; + else + adapter->cmd_privileges = MIN_PRIVILEGES; +} + +static int be_get_sriov_config(struct be_adapter *adapter) +{ + struct device *dev = &adapter->pdev->dev; + struct be_resources res = {0}; + int max_vfs, old_vfs; + + /* Some old versions of BE3 FW don't report max_vfs value */ + be_cmd_get_profile_config(adapter, &res, 0); + + if (BE3_chip(adapter) && !res.max_vfs) { + max_vfs = pci_sriov_get_totalvfs(adapter->pdev); + res.max_vfs = max_vfs > 0 ? min(MAX_VFS, max_vfs) : 0; + } + + adapter->pool_res = res; + + if (!be_max_vfs(adapter)) { + if (num_vfs) + dev_warn(dev, "SRIOV is disabled. Ignoring num_vfs\n"); + adapter->num_vfs = 0; + return 0; + } + + pci_sriov_set_totalvfs(adapter->pdev, be_max_vfs(adapter)); + + /* validate num_vfs module param */ + old_vfs = pci_num_vf(adapter->pdev); + if (old_vfs) { + dev_info(dev, "%d VFs are already enabled\n", old_vfs); + if (old_vfs != num_vfs) + dev_warn(dev, "Ignoring num_vfs=%d setting\n", num_vfs); + adapter->num_vfs = old_vfs; + } else { + if (num_vfs > be_max_vfs(adapter)) { + dev_info(dev, "Resources unavailable to init %d VFs\n", + num_vfs); + dev_info(dev, "Limiting to %d VFs\n", + be_max_vfs(adapter)); + } + adapter->num_vfs = min_t(u16, num_vfs, be_max_vfs(adapter)); + } + + return 0; +} + +static int be_get_resources(struct be_adapter *adapter) +{ + struct device *dev = &adapter->pdev->dev; + struct be_resources res = {0}; + int status; + + if (BEx_chip(adapter)) { + BEx_get_resources(adapter, &res); + adapter->res = res; + } + + /* For Lancer, SH etc read per-function resource limits from FW. + * GET_FUNC_CONFIG returns per function guaranteed limits. + * GET_PROFILE_CONFIG returns PCI-E related limits PF-pool limits + */ + if (!BEx_chip(adapter)) { + status = be_cmd_get_func_config(adapter, &res); + if (status) + return status; + + /* If RoCE may be enabled stash away half the EQs for RoCE */ + if (be_roce_supported(adapter)) + res.max_evt_qs /= 2; + adapter->res = res; + } + + dev_info(dev, "Max: txqs %d, rxqs %d, rss %d, eqs %d, vfs %d\n", + be_max_txqs(adapter), be_max_rxqs(adapter), + be_max_rss(adapter), be_max_eqs(adapter), + be_max_vfs(adapter)); + dev_info(dev, "Max: uc-macs %d, mc-macs %d, vlans %d\n", + be_max_uc(adapter), be_max_mc(adapter), + be_max_vlans(adapter)); + + return 0; +} + +static void be_sriov_config(struct be_adapter *adapter) +{ + struct device *dev = &adapter->pdev->dev; + int status; + + status = be_get_sriov_config(adapter); + if (status) { + dev_err(dev, "Failed to query SR-IOV configuration\n"); + dev_err(dev, "SR-IOV cannot be enabled\n"); + return; + } + + /* When the HW is in SRIOV capable configuration, the PF-pool + * resources are equally distributed across the max-number of + * VFs. The user may request only a subset of the max-vfs to be + * enabled. Based on num_vfs, redistribute the resources across + * num_vfs so that each VF will have access to more number of + * resources. This facility is not available in BE3 FW. + * Also, this is done by FW in Lancer chip. + */ + if (be_max_vfs(adapter) && !pci_num_vf(adapter->pdev)) { + status = be_cmd_set_sriov_config(adapter, + adapter->pool_res, + adapter->num_vfs); + if (status) + dev_err(dev, "Failed to optimize SR-IOV resources\n"); + } +} + +static int be_get_config(struct be_adapter *adapter) +{ + u16 profile_id; + int status; + + status = be_cmd_query_fw_cfg(adapter); + if (status) + return status; + + if (be_physfn(adapter)) { + status = be_cmd_get_active_profile(adapter, &profile_id); + if (!status) + dev_info(&adapter->pdev->dev, + "Using profile 0x%x\n", profile_id); + } + + if (!BE2_chip(adapter) && be_physfn(adapter)) + be_sriov_config(adapter); + + status = be_get_resources(adapter); + if (status) + return status; + + adapter->pmac_id = kcalloc(be_max_uc(adapter), + sizeof(*adapter->pmac_id), GFP_KERNEL); + if (!adapter->pmac_id) + return -ENOMEM; + + /* Sanitize cfg_num_qs based on HW and platform limits */ + adapter->cfg_num_qs = min(adapter->cfg_num_qs, be_max_qs(adapter)); + + return 0; +} + +static int be_mac_setup(struct be_adapter *adapter) +{ + u8 mac[ETH_ALEN]; + int status; + + if (is_zero_ether_addr(adapter->netdev->dev_addr)) { + status = be_cmd_get_perm_mac(adapter, mac); + if (status) + return status; + + memcpy(adapter->netdev->dev_addr, mac, ETH_ALEN); + memcpy(adapter->netdev->perm_addr, mac, ETH_ALEN); + } else { + /* Maybe the HW was reset; dev_addr must be re-programmed */ + memcpy(mac, adapter->netdev->dev_addr, ETH_ALEN); + } + + /* For BE3-R VFs, the PF programs the initial MAC address */ + if (!(BEx_chip(adapter) && be_virtfn(adapter))) + be_cmd_pmac_add(adapter, mac, adapter->if_handle, + &adapter->pmac_id[0], 0); + return 0; +} + +static void be_schedule_worker(struct be_adapter *adapter) +{ + schedule_delayed_work(&adapter->work, msecs_to_jiffies(1000)); + adapter->flags |= BE_FLAGS_WORKER_SCHEDULED; +} + +static int be_setup_queues(struct be_adapter *adapter) +{ + struct net_device *netdev = adapter->netdev; + int status; + + status = be_evt_queues_create(adapter); + if (status) + goto err; + + status = be_tx_qs_create(adapter); + if (status) + goto err; + + status = be_rx_cqs_create(adapter); + if (status) + goto err; + + status = be_mcc_queues_create(adapter); + if (status) + goto err; + + status = netif_set_real_num_rx_queues(netdev, adapter->num_rx_qs); + if (status) + goto err; + + status = netif_set_real_num_tx_queues(netdev, adapter->num_tx_qs); + if (status) + goto err; + + return 0; +err: + dev_err(&adapter->pdev->dev, "queue_setup failed\n"); + return status; +} + +int be_update_queues(struct be_adapter *adapter) +{ + struct net_device *netdev = adapter->netdev; + int status; + + if (netif_running(netdev)) + be_close(netdev); + + be_cancel_worker(adapter); + + /* If any vectors have been shared with RoCE we cannot re-program + * the MSIx table. + */ + if (!adapter->num_msix_roce_vec) + be_msix_disable(adapter); + + be_clear_queues(adapter); + + if (!msix_enabled(adapter)) { + status = be_msix_enable(adapter); + if (status) + return status; + } + + status = be_setup_queues(adapter); + if (status) + return status; + + be_schedule_worker(adapter); + + if (netif_running(netdev)) + status = be_open(netdev); + + return status; +} + +static int be_setup(struct be_adapter *adapter) +{ + struct device *dev = &adapter->pdev->dev; + u32 tx_fc, rx_fc, en_flags; + int status; + + be_setup_init(adapter); + + if (!lancer_chip(adapter)) + be_cmd_req_native_mode(adapter); + + status = be_get_config(adapter); + if (status) + goto err; + + status = be_msix_enable(adapter); + if (status) + goto err; + + en_flags = BE_IF_FLAGS_UNTAGGED | BE_IF_FLAGS_BROADCAST | + BE_IF_FLAGS_MULTICAST | BE_IF_FLAGS_PASS_L3L4_ERRORS; + if (adapter->function_caps & BE_FUNCTION_CAPS_RSS) + en_flags |= BE_IF_FLAGS_RSS; + en_flags = en_flags & be_if_cap_flags(adapter); + status = be_cmd_if_create(adapter, be_if_cap_flags(adapter), en_flags, + &adapter->if_handle, 0); + if (status) + goto err; + + /* Updating real_num_tx/rx_queues() requires rtnl_lock() */ + rtnl_lock(); + status = be_setup_queues(adapter); + rtnl_unlock(); + if (status) + goto err; + + be_cmd_get_fn_privileges(adapter, &adapter->cmd_privileges, 0); + + status = be_mac_setup(adapter); + if (status) + goto err; + + be_cmd_get_fw_ver(adapter); + dev_info(dev, "FW version is %s\n", adapter->fw_ver); + + if (BE2_chip(adapter) && fw_major_num(adapter->fw_ver) < 4) { + dev_err(dev, "Firmware on card is old(%s), IRQs may not work", + adapter->fw_ver); + dev_err(dev, "Please upgrade firmware to version >= 4.0\n"); + } + + if (adapter->vlans_added) + be_vid_config(adapter); + + be_set_rx_mode(adapter->netdev); + + be_cmd_get_acpi_wol_cap(adapter); + + be_cmd_get_flow_control(adapter, &tx_fc, &rx_fc); + + if (rx_fc != adapter->rx_fc || tx_fc != adapter->tx_fc) + be_cmd_set_flow_control(adapter, adapter->tx_fc, + adapter->rx_fc); + + if (be_physfn(adapter)) + be_cmd_set_logical_link_config(adapter, + IFLA_VF_LINK_STATE_AUTO, 0); + + if (adapter->num_vfs) + be_vf_setup(adapter); + + status = be_cmd_get_phy_info(adapter); + if (!status && be_pause_supported(adapter)) + adapter->phy.fc_autoneg = 1; + + be_schedule_worker(adapter); + adapter->flags |= BE_FLAGS_SETUP_DONE; + return 0; +err: + be_clear(adapter); + return status; +} + +#ifdef CONFIG_NET_POLL_CONTROLLER +static void be_netpoll(struct net_device *netdev) +{ + struct be_adapter *adapter = netdev_priv(netdev); + struct be_eq_obj *eqo; + int i; + + for_all_evt_queues(adapter, eqo, i) { + be_eq_notify(eqo->adapter, eqo->q.id, false, true, 0); + napi_schedule(&eqo->napi); + } +} +#endif + +static char flash_cookie[2][16] = {"*** SE FLAS", "H DIRECTORY *** "}; + +static bool phy_flashing_required(struct be_adapter *adapter) +{ + return (adapter->phy.phy_type == TN_8022 && + adapter->phy.interface_type == PHY_TYPE_BASET_10GB); +} + +static bool is_comp_in_ufi(struct be_adapter *adapter, + struct flash_section_info *fsec, int type) +{ + int i = 0, img_type = 0; + struct flash_section_info_g2 *fsec_g2 = NULL; + + if (BE2_chip(adapter)) + fsec_g2 = (struct flash_section_info_g2 *)fsec; + + for (i = 0; i < MAX_FLASH_COMP; i++) { + if (fsec_g2) + img_type = le32_to_cpu(fsec_g2->fsec_entry[i].type); + else + img_type = le32_to_cpu(fsec->fsec_entry[i].type); + + if (img_type == type) + return true; + } + return false; + +} + +static struct flash_section_info *get_fsec_info(struct be_adapter *adapter, + int header_size, + const struct firmware *fw) +{ + struct flash_section_info *fsec = NULL; + const u8 *p = fw->data; + + p += header_size; + while (p < (fw->data + fw->size)) { + fsec = (struct flash_section_info *)p; + if (!memcmp(flash_cookie, fsec->cookie, sizeof(flash_cookie))) + return fsec; + p += 32; + } + return NULL; +} + +static int be_check_flash_crc(struct be_adapter *adapter, const u8 *p, + u32 img_offset, u32 img_size, int hdr_size, + u16 img_optype, bool *crc_match) +{ + u32 crc_offset; + int status; + u8 crc[4]; + + status = be_cmd_get_flash_crc(adapter, crc, img_optype, img_size - 4); + if (status) + return status; + + crc_offset = hdr_size + img_offset + img_size - 4; + + /* Skip flashing, if crc of flashed region matches */ + if (!memcmp(crc, p + crc_offset, 4)) + *crc_match = true; + else + *crc_match = false; + + return status; +} + +static int be_flash(struct be_adapter *adapter, const u8 *img, + struct be_dma_mem *flash_cmd, int optype, int img_size) +{ + struct be_cmd_write_flashrom *req = flash_cmd->va; + u32 total_bytes, flash_op, num_bytes; + int status; + + total_bytes = img_size; + while (total_bytes) { + num_bytes = min_t(u32, 32*1024, total_bytes); + + total_bytes -= num_bytes; + + if (!total_bytes) { + if (optype == OPTYPE_PHY_FW) + flash_op = FLASHROM_OPER_PHY_FLASH; + else + flash_op = FLASHROM_OPER_FLASH; + } else { + if (optype == OPTYPE_PHY_FW) + flash_op = FLASHROM_OPER_PHY_SAVE; + else + flash_op = FLASHROM_OPER_SAVE; + } + + memcpy(req->data_buf, img, num_bytes); + img += num_bytes; + status = be_cmd_write_flashrom(adapter, flash_cmd, optype, + flash_op, num_bytes); + if (base_status(status) == MCC_STATUS_ILLEGAL_REQUEST && + optype == OPTYPE_PHY_FW) + break; + else if (status) + return status; + } + return 0; +} + +/* For BE2, BE3 and BE3-R */ +static int be_flash_BEx(struct be_adapter *adapter, + const struct firmware *fw, + struct be_dma_mem *flash_cmd, int num_of_images) +{ + int img_hdrs_size = (num_of_images * sizeof(struct image_hdr)); + struct device *dev = &adapter->pdev->dev; + struct flash_section_info *fsec = NULL; + int status, i, filehdr_size, num_comp; + const struct flash_comp *pflashcomp; + bool crc_match; + const u8 *p; + + struct flash_comp gen3_flash_types[] = { + { FLASH_iSCSI_PRIMARY_IMAGE_START_g3, OPTYPE_ISCSI_ACTIVE, + FLASH_IMAGE_MAX_SIZE_g3, IMAGE_FIRMWARE_iSCSI}, + { FLASH_REDBOOT_START_g3, OPTYPE_REDBOOT, + FLASH_REDBOOT_IMAGE_MAX_SIZE_g3, IMAGE_BOOT_CODE}, + { FLASH_iSCSI_BIOS_START_g3, OPTYPE_BIOS, + FLASH_BIOS_IMAGE_MAX_SIZE_g3, IMAGE_OPTION_ROM_ISCSI}, + { FLASH_PXE_BIOS_START_g3, OPTYPE_PXE_BIOS, + FLASH_BIOS_IMAGE_MAX_SIZE_g3, IMAGE_OPTION_ROM_PXE}, + { FLASH_FCoE_BIOS_START_g3, OPTYPE_FCOE_BIOS, + FLASH_BIOS_IMAGE_MAX_SIZE_g3, IMAGE_OPTION_ROM_FCoE}, + { FLASH_iSCSI_BACKUP_IMAGE_START_g3, OPTYPE_ISCSI_BACKUP, + FLASH_IMAGE_MAX_SIZE_g3, IMAGE_FIRMWARE_BACKUP_iSCSI}, + { FLASH_FCoE_PRIMARY_IMAGE_START_g3, OPTYPE_FCOE_FW_ACTIVE, + FLASH_IMAGE_MAX_SIZE_g3, IMAGE_FIRMWARE_FCoE}, + { FLASH_FCoE_BACKUP_IMAGE_START_g3, OPTYPE_FCOE_FW_BACKUP, + FLASH_IMAGE_MAX_SIZE_g3, IMAGE_FIRMWARE_BACKUP_FCoE}, + { FLASH_NCSI_START_g3, OPTYPE_NCSI_FW, + FLASH_NCSI_IMAGE_MAX_SIZE_g3, IMAGE_NCSI}, + { FLASH_PHY_FW_START_g3, OPTYPE_PHY_FW, + FLASH_PHY_FW_IMAGE_MAX_SIZE_g3, IMAGE_FIRMWARE_PHY} + }; + + struct flash_comp gen2_flash_types[] = { + { FLASH_iSCSI_PRIMARY_IMAGE_START_g2, OPTYPE_ISCSI_ACTIVE, + FLASH_IMAGE_MAX_SIZE_g2, IMAGE_FIRMWARE_iSCSI}, + { FLASH_REDBOOT_START_g2, OPTYPE_REDBOOT, + FLASH_REDBOOT_IMAGE_MAX_SIZE_g2, IMAGE_BOOT_CODE}, + { FLASH_iSCSI_BIOS_START_g2, OPTYPE_BIOS, + FLASH_BIOS_IMAGE_MAX_SIZE_g2, IMAGE_OPTION_ROM_ISCSI}, + { FLASH_PXE_BIOS_START_g2, OPTYPE_PXE_BIOS, + FLASH_BIOS_IMAGE_MAX_SIZE_g2, IMAGE_OPTION_ROM_PXE}, + { FLASH_FCoE_BIOS_START_g2, OPTYPE_FCOE_BIOS, + FLASH_BIOS_IMAGE_MAX_SIZE_g2, IMAGE_OPTION_ROM_FCoE}, + { FLASH_iSCSI_BACKUP_IMAGE_START_g2, OPTYPE_ISCSI_BACKUP, + FLASH_IMAGE_MAX_SIZE_g2, IMAGE_FIRMWARE_BACKUP_iSCSI}, + { FLASH_FCoE_PRIMARY_IMAGE_START_g2, OPTYPE_FCOE_FW_ACTIVE, + FLASH_IMAGE_MAX_SIZE_g2, IMAGE_FIRMWARE_FCoE}, + { FLASH_FCoE_BACKUP_IMAGE_START_g2, OPTYPE_FCOE_FW_BACKUP, + FLASH_IMAGE_MAX_SIZE_g2, IMAGE_FIRMWARE_BACKUP_FCoE} + }; + + if (BE3_chip(adapter)) { + pflashcomp = gen3_flash_types; + filehdr_size = sizeof(struct flash_file_hdr_g3); + num_comp = ARRAY_SIZE(gen3_flash_types); + } else { + pflashcomp = gen2_flash_types; + filehdr_size = sizeof(struct flash_file_hdr_g2); + num_comp = ARRAY_SIZE(gen2_flash_types); + } + + /* Get flash section info*/ + fsec = get_fsec_info(adapter, filehdr_size + img_hdrs_size, fw); + if (!fsec) { + dev_err(dev, "Invalid Cookie. FW image may be corrupted\n"); + return -1; + } + for (i = 0; i < num_comp; i++) { + if (!is_comp_in_ufi(adapter, fsec, pflashcomp[i].img_type)) + continue; + + if ((pflashcomp[i].optype == OPTYPE_NCSI_FW) && + memcmp(adapter->fw_ver, "3.102.148.0", 11) < 0) + continue; + + if (pflashcomp[i].optype == OPTYPE_PHY_FW && + !phy_flashing_required(adapter)) + continue; + + if (pflashcomp[i].optype == OPTYPE_REDBOOT) { + status = be_check_flash_crc(adapter, fw->data, + pflashcomp[i].offset, + pflashcomp[i].size, + filehdr_size + + img_hdrs_size, + OPTYPE_REDBOOT, &crc_match); + if (status) { + dev_err(dev, + "Could not get CRC for 0x%x region\n", + pflashcomp[i].optype); + continue; + } + + if (crc_match) + continue; + } + + p = fw->data + filehdr_size + pflashcomp[i].offset + + img_hdrs_size; + if (p + pflashcomp[i].size > fw->data + fw->size) + return -1; + + status = be_flash(adapter, p, flash_cmd, pflashcomp[i].optype, + pflashcomp[i].size); + if (status) { + dev_err(dev, "Flashing section type 0x%x failed\n", + pflashcomp[i].img_type); + return status; + } + } + return 0; +} + +static u16 be_get_img_optype(struct flash_section_entry fsec_entry) +{ + u32 img_type = le32_to_cpu(fsec_entry.type); + u16 img_optype = le16_to_cpu(fsec_entry.optype); + + if (img_optype != 0xFFFF) + return img_optype; + + switch (img_type) { + case IMAGE_FIRMWARE_iSCSI: + img_optype = OPTYPE_ISCSI_ACTIVE; + break; + case IMAGE_BOOT_CODE: + img_optype = OPTYPE_REDBOOT; + break; + case IMAGE_OPTION_ROM_ISCSI: + img_optype = OPTYPE_BIOS; + break; + case IMAGE_OPTION_ROM_PXE: + img_optype = OPTYPE_PXE_BIOS; + break; + case IMAGE_OPTION_ROM_FCoE: + img_optype = OPTYPE_FCOE_BIOS; + break; + case IMAGE_FIRMWARE_BACKUP_iSCSI: + img_optype = OPTYPE_ISCSI_BACKUP; + break; + case IMAGE_NCSI: + img_optype = OPTYPE_NCSI_FW; + break; + case IMAGE_FLASHISM_JUMPVECTOR: + img_optype = OPTYPE_FLASHISM_JUMPVECTOR; + break; + case IMAGE_FIRMWARE_PHY: + img_optype = OPTYPE_SH_PHY_FW; + break; + case IMAGE_REDBOOT_DIR: + img_optype = OPTYPE_REDBOOT_DIR; + break; + case IMAGE_REDBOOT_CONFIG: + img_optype = OPTYPE_REDBOOT_CONFIG; + break; + case IMAGE_UFI_DIR: + img_optype = OPTYPE_UFI_DIR; + break; + default: + break; + } + + return img_optype; +} + +static int be_flash_skyhawk(struct be_adapter *adapter, + const struct firmware *fw, + struct be_dma_mem *flash_cmd, int num_of_images) +{ + int img_hdrs_size = num_of_images * sizeof(struct image_hdr); + struct device *dev = &adapter->pdev->dev; + struct flash_section_info *fsec = NULL; + u32 img_offset, img_size, img_type; + int status, i, filehdr_size; + bool crc_match, old_fw_img; + u16 img_optype; + const u8 *p; + + filehdr_size = sizeof(struct flash_file_hdr_g3); + fsec = get_fsec_info(adapter, filehdr_size + img_hdrs_size, fw); + if (!fsec) { + dev_err(dev, "Invalid Cookie. FW image may be corrupted\n"); + return -EINVAL; + } + + for (i = 0; i < le32_to_cpu(fsec->fsec_hdr.num_images); i++) { + img_offset = le32_to_cpu(fsec->fsec_entry[i].offset); + img_size = le32_to_cpu(fsec->fsec_entry[i].pad_size); + img_type = le32_to_cpu(fsec->fsec_entry[i].type); + img_optype = be_get_img_optype(fsec->fsec_entry[i]); + old_fw_img = fsec->fsec_entry[i].optype == 0xFFFF; + + if (img_optype == 0xFFFF) + continue; + /* Don't bother verifying CRC if an old FW image is being + * flashed + */ + if (old_fw_img) + goto flash; + + status = be_check_flash_crc(adapter, fw->data, img_offset, + img_size, filehdr_size + + img_hdrs_size, img_optype, + &crc_match); + /* The current FW image on the card does not recognize the new + * FLASH op_type. The FW download is partially complete. + * Reboot the server now to enable FW image to recognize the + * new FLASH op_type. To complete the remaining process, + * download the same FW again after the reboot. + */ + if (base_status(status) == MCC_STATUS_ILLEGAL_REQUEST || + base_status(status) == MCC_STATUS_ILLEGAL_FIELD) { + dev_err(dev, "Flash incomplete. Reset the server\n"); + dev_err(dev, "Download FW image again after reset\n"); + return -EAGAIN; + } else if (status) { + dev_err(dev, "Could not get CRC for 0x%x region\n", + img_optype); + return -EFAULT; + } + + if (crc_match) + continue; + +flash: + p = fw->data + filehdr_size + img_offset + img_hdrs_size; + if (p + img_size > fw->data + fw->size) + return -1; + + status = be_flash(adapter, p, flash_cmd, img_optype, img_size); + /* For old FW images ignore ILLEGAL_FIELD error or errors on + * UFI_DIR region + */ + if (old_fw_img && + (base_status(status) == MCC_STATUS_ILLEGAL_FIELD || + (img_optype == OPTYPE_UFI_DIR && + base_status(status) == MCC_STATUS_FAILED))) { + continue; + } else if (status) { + dev_err(dev, "Flashing section type 0x%x failed\n", + img_type); + return -EFAULT; + } + } + return 0; +} + +static int lancer_fw_download(struct be_adapter *adapter, + const struct firmware *fw) +{ +#define LANCER_FW_DOWNLOAD_CHUNK (32 * 1024) +#define LANCER_FW_DOWNLOAD_LOCATION "/prg" + struct device *dev = &adapter->pdev->dev; + struct be_dma_mem flash_cmd; + const u8 *data_ptr = NULL; + u8 *dest_image_ptr = NULL; + size_t image_size = 0; + u32 chunk_size = 0; + u32 data_written = 0; + u32 offset = 0; + int status = 0; + u8 add_status = 0; + u8 change_status; + + if (!IS_ALIGNED(fw->size, sizeof(u32))) { + dev_err(dev, "FW image size should be multiple of 4\n"); + return -EINVAL; + } + + flash_cmd.size = sizeof(struct lancer_cmd_req_write_object) + + LANCER_FW_DOWNLOAD_CHUNK; + flash_cmd.va = dma_alloc_coherent(dev, flash_cmd.size, + &flash_cmd.dma, GFP_KERNEL); + if (!flash_cmd.va) + return -ENOMEM; + + dest_image_ptr = flash_cmd.va + + sizeof(struct lancer_cmd_req_write_object); + image_size = fw->size; + data_ptr = fw->data; + + while (image_size) { + chunk_size = min_t(u32, image_size, LANCER_FW_DOWNLOAD_CHUNK); + + /* Copy the image chunk content. */ + memcpy(dest_image_ptr, data_ptr, chunk_size); + + status = lancer_cmd_write_object(adapter, &flash_cmd, + chunk_size, offset, + LANCER_FW_DOWNLOAD_LOCATION, + &data_written, &change_status, + &add_status); + if (status) + break; + + offset += data_written; + data_ptr += data_written; + image_size -= data_written; + } + + if (!status) { + /* Commit the FW written */ + status = lancer_cmd_write_object(adapter, &flash_cmd, + 0, offset, + LANCER_FW_DOWNLOAD_LOCATION, + &data_written, &change_status, + &add_status); + } + + dma_free_coherent(dev, flash_cmd.size, flash_cmd.va, flash_cmd.dma); + if (status) { + dev_err(dev, "Firmware load error\n"); + return be_cmd_status(status); + } + + dev_info(dev, "Firmware flashed successfully\n"); + + if (change_status == LANCER_FW_RESET_NEEDED) { + dev_info(dev, "Resetting adapter to activate new FW\n"); + status = lancer_physdev_ctrl(adapter, + PHYSDEV_CONTROL_FW_RESET_MASK); + if (status) { + dev_err(dev, "Adapter busy, could not reset FW\n"); + dev_err(dev, "Reboot server to activate new FW\n"); + } + } else if (change_status != LANCER_NO_RESET_NEEDED) { + dev_info(dev, "Reboot server to activate new FW\n"); + } + + return 0; +} + +#define UFI_TYPE2 2 +#define UFI_TYPE3 3 +#define UFI_TYPE3R 10 +#define UFI_TYPE4 4 +static int be_get_ufi_type(struct be_adapter *adapter, + struct flash_file_hdr_g3 *fhdr) +{ + if (!fhdr) + goto be_get_ufi_exit; + + if (skyhawk_chip(adapter) && fhdr->build[0] == '4') + return UFI_TYPE4; + else if (BE3_chip(adapter) && fhdr->build[0] == '3') { + if (fhdr->asic_type_rev == 0x10) + return UFI_TYPE3R; + else + return UFI_TYPE3; + } else if (BE2_chip(adapter) && fhdr->build[0] == '2') + return UFI_TYPE2; + +be_get_ufi_exit: + dev_err(&adapter->pdev->dev, + "UFI and Interface are not compatible for flashing\n"); + return -1; +} + +static int be_fw_download(struct be_adapter *adapter, const struct firmware* fw) +{ + struct flash_file_hdr_g3 *fhdr3; + struct image_hdr *img_hdr_ptr = NULL; + struct be_dma_mem flash_cmd; + const u8 *p; + int status = 0, i = 0, num_imgs = 0, ufi_type = 0; + + flash_cmd.size = sizeof(struct be_cmd_write_flashrom); + flash_cmd.va = dma_alloc_coherent(&adapter->pdev->dev, flash_cmd.size, + &flash_cmd.dma, GFP_KERNEL); + if (!flash_cmd.va) { + status = -ENOMEM; + goto be_fw_exit; + } + + p = fw->data; + fhdr3 = (struct flash_file_hdr_g3 *)p; + + ufi_type = be_get_ufi_type(adapter, fhdr3); + + num_imgs = le32_to_cpu(fhdr3->num_imgs); + for (i = 0; i < num_imgs; i++) { + img_hdr_ptr = (struct image_hdr *)(fw->data + + (sizeof(struct flash_file_hdr_g3) + + i * sizeof(struct image_hdr))); + if (le32_to_cpu(img_hdr_ptr->imageid) == 1) { + switch (ufi_type) { + case UFI_TYPE4: + status = be_flash_skyhawk(adapter, fw, + &flash_cmd, num_imgs); + break; + case UFI_TYPE3R: + status = be_flash_BEx(adapter, fw, &flash_cmd, + num_imgs); + break; + case UFI_TYPE3: + /* Do not flash this ufi on BE3-R cards */ + if (adapter->asic_rev < 0x10) + status = be_flash_BEx(adapter, fw, + &flash_cmd, + num_imgs); + else { + status = -EINVAL; + dev_err(&adapter->pdev->dev, + "Can't load BE3 UFI on BE3R\n"); + } + } + } + } + + if (ufi_type == UFI_TYPE2) + status = be_flash_BEx(adapter, fw, &flash_cmd, 0); + else if (ufi_type == -1) + status = -EINVAL; + + dma_free_coherent(&adapter->pdev->dev, flash_cmd.size, flash_cmd.va, + flash_cmd.dma); + if (status) { + dev_err(&adapter->pdev->dev, "Firmware load error\n"); + goto be_fw_exit; + } + + dev_info(&adapter->pdev->dev, "Firmware flashed successfully\n"); + +be_fw_exit: + return status; +} + +int be_load_fw(struct be_adapter *adapter, u8 *fw_file) +{ + const struct firmware *fw; + int status; + + if (!netif_running(adapter->netdev)) { + dev_err(&adapter->pdev->dev, + "Firmware load not allowed (interface is down)\n"); + return -ENETDOWN; + } + + status = request_firmware(&fw, fw_file, &adapter->pdev->dev); + if (status) + goto fw_exit; + + dev_info(&adapter->pdev->dev, "Flashing firmware file %s\n", fw_file); + + if (lancer_chip(adapter)) + status = lancer_fw_download(adapter, fw); + else + status = be_fw_download(adapter, fw); + + if (!status) + be_cmd_get_fw_ver(adapter); + +fw_exit: + release_firmware(fw); + return status; +} + +static int be_ndo_bridge_setlink(struct net_device *dev, struct nlmsghdr *nlh) +{ + struct be_adapter *adapter = netdev_priv(dev); + struct nlattr *attr, *br_spec; + int rem; + int status = 0; + u16 mode = 0; + + if (!sriov_enabled(adapter)) + return -EOPNOTSUPP; + + br_spec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_AF_SPEC); + if (!br_spec) + return -EINVAL; + + nla_for_each_nested(attr, br_spec, rem) { + if (nla_type(attr) != IFLA_BRIDGE_MODE) + continue; + + if (nla_len(attr) < sizeof(mode)) + return -EINVAL; + + mode = nla_get_u16(attr); + if (mode != BRIDGE_MODE_VEPA && mode != BRIDGE_MODE_VEB) + return -EINVAL; + + status = be_cmd_set_hsw_config(adapter, 0, 0, + adapter->if_handle, + mode == BRIDGE_MODE_VEPA ? + PORT_FWD_TYPE_VEPA : + PORT_FWD_TYPE_VEB); + if (status) + goto err; + + dev_info(&adapter->pdev->dev, "enabled switch mode: %s\n", + mode == BRIDGE_MODE_VEPA ? "VEPA" : "VEB"); + + return status; + } +err: + dev_err(&adapter->pdev->dev, "Failed to set switch mode %s\n", + mode == BRIDGE_MODE_VEPA ? "VEPA" : "VEB"); + + return status; +} + +static int be_ndo_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq, + struct net_device *dev, u32 filter_mask) +{ + struct be_adapter *adapter = netdev_priv(dev); + int status = 0; + u8 hsw_mode; + + if (!sriov_enabled(adapter)) + return 0; + + /* BE and Lancer chips support VEB mode only */ + if (BEx_chip(adapter) || lancer_chip(adapter)) { + hsw_mode = PORT_FWD_TYPE_VEB; + } else { + status = be_cmd_get_hsw_config(adapter, NULL, 0, + adapter->if_handle, &hsw_mode); + if (status) + return 0; + } + + return ndo_dflt_bridge_getlink(skb, pid, seq, dev, + hsw_mode == PORT_FWD_TYPE_VEPA ? + BRIDGE_MODE_VEPA : BRIDGE_MODE_VEB); +} + +#ifdef CONFIG_BE2NET_VXLAN +static void be_add_vxlan_port(struct net_device *netdev, sa_family_t sa_family, + __be16 port) +{ + struct be_adapter *adapter = netdev_priv(netdev); + struct device *dev = &adapter->pdev->dev; + int status; + + if (lancer_chip(adapter) || BEx_chip(adapter)) + return; + + if (adapter->flags & BE_FLAGS_VXLAN_OFFLOADS) { + dev_warn(dev, "Cannot add UDP port %d for VxLAN offloads\n", + be16_to_cpu(port)); + dev_info(dev, + "Only one UDP port supported for VxLAN offloads\n"); + return; + } + + status = be_cmd_manage_iface(adapter, adapter->if_handle, + OP_CONVERT_NORMAL_TO_TUNNEL); + if (status) { + dev_warn(dev, "Failed to convert normal interface to tunnel\n"); + goto err; + } + + status = be_cmd_set_vxlan_port(adapter, port); + if (status) { + dev_warn(dev, "Failed to add VxLAN port\n"); + goto err; + } + adapter->flags |= BE_FLAGS_VXLAN_OFFLOADS; + adapter->vxlan_port = port; + + dev_info(dev, "Enabled VxLAN offloads for UDP port %d\n", + be16_to_cpu(port)); + return; +err: + be_disable_vxlan_offloads(adapter); +} + +static void be_del_vxlan_port(struct net_device *netdev, sa_family_t sa_family, + __be16 port) +{ + struct be_adapter *adapter = netdev_priv(netdev); + + if (lancer_chip(adapter) || BEx_chip(adapter)) + return; + + if (adapter->vxlan_port != port) + return; + + be_disable_vxlan_offloads(adapter); + + dev_info(&adapter->pdev->dev, + "Disabled VxLAN offloads for UDP port %d\n", + be16_to_cpu(port)); +} + +static bool be_gso_check(struct sk_buff *skb, struct net_device *dev) +{ + return vxlan_gso_check(skb); +} +#endif + +static const struct net_device_ops be_netdev_ops = { + .ndo_open = be_open, + .ndo_stop = be_close, + .ndo_start_xmit = be_xmit, + .ndo_set_rx_mode = be_set_rx_mode, + .ndo_set_mac_address = be_mac_addr_set, + .ndo_change_mtu = be_change_mtu, + .ndo_get_stats64 = be_get_stats64, + .ndo_validate_addr = eth_validate_addr, + .ndo_vlan_rx_add_vid = be_vlan_add_vid, + .ndo_vlan_rx_kill_vid = be_vlan_rem_vid, + .ndo_set_vf_mac = be_set_vf_mac, + .ndo_set_vf_vlan = be_set_vf_vlan, + .ndo_set_vf_rate = be_set_vf_tx_rate, + .ndo_get_vf_config = be_get_vf_config, + .ndo_set_vf_link_state = be_set_vf_link_state, +#ifdef CONFIG_NET_POLL_CONTROLLER + .ndo_poll_controller = be_netpoll, +#endif + .ndo_bridge_setlink = be_ndo_bridge_setlink, + .ndo_bridge_getlink = be_ndo_bridge_getlink, +#ifdef CONFIG_NET_RX_BUSY_POLL + .ndo_busy_poll = be_busy_poll, +#endif +#ifdef CONFIG_BE2NET_VXLAN + .ndo_add_vxlan_port = be_add_vxlan_port, + .ndo_del_vxlan_port = be_del_vxlan_port, + .ndo_gso_check = be_gso_check, +#endif +}; + +static void be_netdev_init(struct net_device *netdev) +{ + struct be_adapter *adapter = netdev_priv(netdev); + + if (skyhawk_chip(adapter)) { + netdev->hw_enc_features |= NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM | + NETIF_F_TSO | NETIF_F_TSO6 | + NETIF_F_GSO_UDP_TUNNEL; + netdev->hw_features |= NETIF_F_GSO_UDP_TUNNEL; + } + netdev->hw_features |= NETIF_F_SG | NETIF_F_TSO | NETIF_F_TSO6 | + NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM | NETIF_F_RXCSUM | + NETIF_F_HW_VLAN_CTAG_TX; + if (be_multi_rxq(adapter)) + netdev->hw_features |= NETIF_F_RXHASH; + + netdev->features |= netdev->hw_features | + NETIF_F_HW_VLAN_CTAG_RX | NETIF_F_HW_VLAN_CTAG_FILTER; + + netdev->vlan_features |= NETIF_F_SG | NETIF_F_TSO | NETIF_F_TSO6 | + NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM; + + netdev->priv_flags |= IFF_UNICAST_FLT; + + netdev->flags |= IFF_MULTICAST; + + netif_set_gso_max_size(netdev, 65535 - ETH_HLEN); + + netdev->netdev_ops = &be_netdev_ops; + + netdev->ethtool_ops = &be_ethtool_ops; +} + +static void be_unmap_pci_bars(struct be_adapter *adapter) +{ + if (adapter->csr) + pci_iounmap(adapter->pdev, adapter->csr); + if (adapter->db) + pci_iounmap(adapter->pdev, adapter->db); +} + +static int db_bar(struct be_adapter *adapter) +{ + if (lancer_chip(adapter) || !be_physfn(adapter)) + return 0; + else + return 4; +} + +static int be_roce_map_pci_bars(struct be_adapter *adapter) +{ + if (skyhawk_chip(adapter)) { + adapter->roce_db.size = 4096; + adapter->roce_db.io_addr = pci_resource_start(adapter->pdev, + db_bar(adapter)); + adapter->roce_db.total_size = pci_resource_len(adapter->pdev, + db_bar(adapter)); + } + return 0; +} + +static int be_map_pci_bars(struct be_adapter *adapter) +{ + u8 __iomem *addr; + + if (BEx_chip(adapter) && be_physfn(adapter)) { + adapter->csr = pci_iomap(adapter->pdev, 2, 0); + if (!adapter->csr) + return -ENOMEM; + } + + addr = pci_iomap(adapter->pdev, db_bar(adapter), 0); + if (!addr) + goto pci_map_err; + adapter->db = addr; + + be_roce_map_pci_bars(adapter); + return 0; + +pci_map_err: + dev_err(&adapter->pdev->dev, "Error in mapping PCI BARs\n"); + be_unmap_pci_bars(adapter); + return -ENOMEM; +} + +static void be_ctrl_cleanup(struct be_adapter *adapter) +{ + struct be_dma_mem *mem = &adapter->mbox_mem_alloced; + + be_unmap_pci_bars(adapter); + + if (mem->va) + dma_free_coherent(&adapter->pdev->dev, mem->size, mem->va, + mem->dma); + + mem = &adapter->rx_filter; + if (mem->va) + dma_free_coherent(&adapter->pdev->dev, mem->size, mem->va, + mem->dma); +} + +static int be_ctrl_init(struct be_adapter *adapter) +{ + struct be_dma_mem *mbox_mem_alloc = &adapter->mbox_mem_alloced; + struct be_dma_mem *mbox_mem_align = &adapter->mbox_mem; + struct be_dma_mem *rx_filter = &adapter->rx_filter; + u32 sli_intf; + int status; + + pci_read_config_dword(adapter->pdev, SLI_INTF_REG_OFFSET, &sli_intf); + adapter->sli_family = (sli_intf & SLI_INTF_FAMILY_MASK) >> + SLI_INTF_FAMILY_SHIFT; + adapter->virtfn = (sli_intf & SLI_INTF_FT_MASK) ? 1 : 0; + + status = be_map_pci_bars(adapter); + if (status) + goto done; + + mbox_mem_alloc->size = sizeof(struct be_mcc_mailbox) + 16; + mbox_mem_alloc->va = dma_alloc_coherent(&adapter->pdev->dev, + mbox_mem_alloc->size, + &mbox_mem_alloc->dma, + GFP_KERNEL); + if (!mbox_mem_alloc->va) { + status = -ENOMEM; + goto unmap_pci_bars; + } + mbox_mem_align->size = sizeof(struct be_mcc_mailbox); + mbox_mem_align->va = PTR_ALIGN(mbox_mem_alloc->va, 16); + mbox_mem_align->dma = PTR_ALIGN(mbox_mem_alloc->dma, 16); + memset(mbox_mem_align->va, 0, sizeof(struct be_mcc_mailbox)); + + rx_filter->size = sizeof(struct be_cmd_req_rx_filter); + rx_filter->va = dma_zalloc_coherent(&adapter->pdev->dev, + rx_filter->size, &rx_filter->dma, + GFP_KERNEL); + if (!rx_filter->va) { + status = -ENOMEM; + goto free_mbox; + } + + mutex_init(&adapter->mbox_lock); + spin_lock_init(&adapter->mcc_lock); + spin_lock_init(&adapter->mcc_cq_lock); + + init_completion(&adapter->et_cmd_compl); + pci_save_state(adapter->pdev); + return 0; + +free_mbox: + dma_free_coherent(&adapter->pdev->dev, mbox_mem_alloc->size, + mbox_mem_alloc->va, mbox_mem_alloc->dma); + +unmap_pci_bars: + be_unmap_pci_bars(adapter); + +done: + return status; +} + +static void be_stats_cleanup(struct be_adapter *adapter) +{ + struct be_dma_mem *cmd = &adapter->stats_cmd; + + if (cmd->va) + dma_free_coherent(&adapter->pdev->dev, cmd->size, + cmd->va, cmd->dma); +} + +static int be_stats_init(struct be_adapter *adapter) +{ + struct be_dma_mem *cmd = &adapter->stats_cmd; + + if (lancer_chip(adapter)) + cmd->size = sizeof(struct lancer_cmd_req_pport_stats); + else if (BE2_chip(adapter)) + cmd->size = sizeof(struct be_cmd_req_get_stats_v0); + else if (BE3_chip(adapter)) + cmd->size = sizeof(struct be_cmd_req_get_stats_v1); + else + /* ALL non-BE ASICs */ + cmd->size = sizeof(struct be_cmd_req_get_stats_v2); + + cmd->va = dma_zalloc_coherent(&adapter->pdev->dev, cmd->size, &cmd->dma, + GFP_KERNEL); + if (!cmd->va) + return -ENOMEM; + return 0; +} + +static void be_remove(struct pci_dev *pdev) +{ + struct be_adapter *adapter = pci_get_drvdata(pdev); + + if (!adapter) + return; + + be_roce_dev_remove(adapter); + be_intr_set(adapter, false); + + cancel_delayed_work_sync(&adapter->func_recovery_work); + + unregister_netdev(adapter->netdev); + + be_clear(adapter); + + /* tell fw we're done with firing cmds */ + be_cmd_fw_clean(adapter); + + be_stats_cleanup(adapter); + + be_ctrl_cleanup(adapter); + + pci_disable_pcie_error_reporting(pdev); + + pci_release_regions(pdev); + pci_disable_device(pdev); + + free_netdev(adapter->netdev); +} + +static int be_get_initial_config(struct be_adapter *adapter) +{ + int status, level; + + status = be_cmd_get_cntl_attributes(adapter); + if (status) + return status; + + /* Must be a power of 2 or else MODULO will BUG_ON */ + adapter->be_get_temp_freq = 64; + + if (BEx_chip(adapter)) { + level = be_cmd_get_fw_log_level(adapter); + adapter->msg_enable = + level <= FW_LOG_LEVEL_DEFAULT ? NETIF_MSG_HW : 0; + } + + adapter->cfg_num_qs = netif_get_num_default_rss_queues(); + return 0; +} + +static int lancer_recover_func(struct be_adapter *adapter) +{ + struct device *dev = &adapter->pdev->dev; + int status; + + status = lancer_test_and_set_rdy_state(adapter); + if (status) + goto err; + + if (netif_running(adapter->netdev)) + be_close(adapter->netdev); + + be_clear(adapter); + + be_clear_all_error(adapter); + + status = be_setup(adapter); + if (status) + goto err; + + if (netif_running(adapter->netdev)) { + status = be_open(adapter->netdev); + if (status) + goto err; + } + + dev_err(dev, "Adapter recovery successful\n"); + return 0; +err: + if (status == -EAGAIN) + dev_err(dev, "Waiting for resource provisioning\n"); + else + dev_err(dev, "Adapter recovery failed\n"); + + return status; +} + +static void be_func_recovery_task(struct work_struct *work) +{ + struct be_adapter *adapter = + container_of(work, struct be_adapter, func_recovery_work.work); + int status = 0; + + be_detect_error(adapter); + + if (adapter->hw_error && lancer_chip(adapter)) { + rtnl_lock(); + netif_device_detach(adapter->netdev); + rtnl_unlock(); + + status = lancer_recover_func(adapter); + if (!status) + netif_device_attach(adapter->netdev); + } + + /* In Lancer, for all errors other than provisioning error (-EAGAIN), + * no need to attempt further recovery. + */ + if (!status || status == -EAGAIN) + schedule_delayed_work(&adapter->func_recovery_work, + msecs_to_jiffies(1000)); +} + +static void be_worker(struct work_struct *work) +{ + struct be_adapter *adapter = + container_of(work, struct be_adapter, work.work); + struct be_rx_obj *rxo; + int i; + + /* when interrupts are not yet enabled, just reap any pending + * mcc completions */ + if (!netif_running(adapter->netdev)) { + local_bh_disable(); + be_process_mcc(adapter); + local_bh_enable(); + goto reschedule; + } + + if (!adapter->stats_cmd_sent) { + if (lancer_chip(adapter)) + lancer_cmd_get_pport_stats(adapter, + &adapter->stats_cmd); + else + be_cmd_get_stats(adapter, &adapter->stats_cmd); + } + + if (be_physfn(adapter) && + MODULO(adapter->work_counter, adapter->be_get_temp_freq) == 0) + be_cmd_get_die_temperature(adapter); + + for_all_rx_queues(adapter, rxo, i) { + /* Replenish RX-queues starved due to memory + * allocation failures. + */ + if (rxo->rx_post_starved) + be_post_rx_frags(rxo, GFP_KERNEL, MAX_RX_POST); + } + + be_eqd_update(adapter); + +reschedule: + adapter->work_counter++; + schedule_delayed_work(&adapter->work, msecs_to_jiffies(1000)); +} + +/* If any VFs are already enabled don't FLR the PF */ +static bool be_reset_required(struct be_adapter *adapter) +{ + return pci_num_vf(adapter->pdev) ? false : true; +} + +static char *mc_name(struct be_adapter *adapter) +{ + char *str = ""; /* default */ + + switch (adapter->mc_type) { + case UMC: + str = "UMC"; + break; + case FLEX10: + str = "FLEX10"; + break; + case vNIC1: + str = "vNIC-1"; + break; + case nPAR: + str = "nPAR"; + break; + case UFP: + str = "UFP"; + break; + case vNIC2: + str = "vNIC-2"; + break; + default: + str = ""; + } + + return str; +} + +static inline char *func_name(struct be_adapter *adapter) +{ + return be_physfn(adapter) ? "PF" : "VF"; +} + +static int be_probe(struct pci_dev *pdev, const struct pci_device_id *pdev_id) +{ + int status = 0; + struct be_adapter *adapter; + struct net_device *netdev; + char port_name; + + dev_info(&pdev->dev, "%s version is %s\n", DRV_NAME, DRV_VER); + + status = pci_enable_device(pdev); + if (status) + goto do_none; + + status = pci_request_regions(pdev, DRV_NAME); + if (status) + goto disable_dev; + pci_set_master(pdev); + + netdev = alloc_etherdev_mqs(sizeof(*adapter), MAX_TX_QS, MAX_RX_QS); + if (!netdev) { + status = -ENOMEM; + goto rel_reg; + } + adapter = netdev_priv(netdev); + adapter->pdev = pdev; + pci_set_drvdata(pdev, adapter); + adapter->netdev = netdev; + SET_NETDEV_DEV(netdev, &pdev->dev); + + status = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64)); + if (!status) { + netdev->features |= NETIF_F_HIGHDMA; + } else { + status = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32)); + if (status) { + dev_err(&pdev->dev, "Could not set PCI DMA Mask\n"); + goto free_netdev; + } + } + + status = pci_enable_pcie_error_reporting(pdev); + if (!status) + dev_info(&pdev->dev, "PCIe error reporting enabled\n"); + + status = be_ctrl_init(adapter); + if (status) + goto free_netdev; + + /* sync up with fw's ready state */ + if (be_physfn(adapter)) { + status = be_fw_wait_ready(adapter); + if (status) + goto ctrl_clean; + } + + if (be_reset_required(adapter)) { + status = be_cmd_reset_function(adapter); + if (status) + goto ctrl_clean; + + /* Wait for interrupts to quiesce after an FLR */ + msleep(100); + } + + /* Allow interrupts for other ULPs running on NIC function */ + be_intr_set(adapter, true); + + /* tell fw we're ready to fire cmds */ + status = be_cmd_fw_init(adapter); + if (status) + goto ctrl_clean; + + status = be_stats_init(adapter); + if (status) + goto ctrl_clean; + + status = be_get_initial_config(adapter); + if (status) + goto stats_clean; + + INIT_DELAYED_WORK(&adapter->work, be_worker); + INIT_DELAYED_WORK(&adapter->func_recovery_work, be_func_recovery_task); + adapter->rx_fc = true; + adapter->tx_fc = true; + + status = be_setup(adapter); + if (status) + goto stats_clean; + + be_netdev_init(netdev); + status = register_netdev(netdev); + if (status != 0) + goto unsetup; + + be_roce_dev_add(adapter); + + schedule_delayed_work(&adapter->func_recovery_work, + msecs_to_jiffies(1000)); + + be_cmd_query_port_name(adapter, &port_name); + + dev_info(&pdev->dev, "%s: %s %s port %c\n", nic_name(pdev), + func_name(adapter), mc_name(adapter), port_name); + + return 0; + +unsetup: + be_clear(adapter); +stats_clean: + be_stats_cleanup(adapter); +ctrl_clean: + be_ctrl_cleanup(adapter); +free_netdev: + free_netdev(netdev); +rel_reg: + pci_release_regions(pdev); +disable_dev: + pci_disable_device(pdev); +do_none: + dev_err(&pdev->dev, "%s initialization failed\n", nic_name(pdev)); + return status; +} + +static int be_suspend(struct pci_dev *pdev, pm_message_t state) +{ + struct be_adapter *adapter = pci_get_drvdata(pdev); + struct net_device *netdev = adapter->netdev; + + if (adapter->wol_en) + be_setup_wol(adapter, true); + + be_intr_set(adapter, false); + cancel_delayed_work_sync(&adapter->func_recovery_work); + + netif_device_detach(netdev); + if (netif_running(netdev)) { + rtnl_lock(); + be_close(netdev); + rtnl_unlock(); + } + be_clear(adapter); + + pci_save_state(pdev); + pci_disable_device(pdev); + pci_set_power_state(pdev, pci_choose_state(pdev, state)); + return 0; +} + +static int be_resume(struct pci_dev *pdev) +{ + int status = 0; + struct be_adapter *adapter = pci_get_drvdata(pdev); + struct net_device *netdev = adapter->netdev; + + netif_device_detach(netdev); + + status = pci_enable_device(pdev); + if (status) + return status; + + pci_set_power_state(pdev, PCI_D0); + pci_restore_state(pdev); + + status = be_fw_wait_ready(adapter); + if (status) + return status; + + be_intr_set(adapter, true); + /* tell fw we're ready to fire cmds */ + status = be_cmd_fw_init(adapter); + if (status) + return status; + + be_setup(adapter); + if (netif_running(netdev)) { + rtnl_lock(); + be_open(netdev); + rtnl_unlock(); + } + + schedule_delayed_work(&adapter->func_recovery_work, + msecs_to_jiffies(1000)); + netif_device_attach(netdev); + + if (adapter->wol_en) + be_setup_wol(adapter, false); + + return 0; +} + +/* + * An FLR will stop BE from DMAing any data. + */ +static void be_shutdown(struct pci_dev *pdev) +{ + struct be_adapter *adapter = pci_get_drvdata(pdev); + + if (!adapter) + return; + + be_roce_dev_shutdown(adapter); + cancel_delayed_work_sync(&adapter->work); + cancel_delayed_work_sync(&adapter->func_recovery_work); + + netif_device_detach(adapter->netdev); + + be_cmd_reset_function(adapter); + + pci_disable_device(pdev); +} + +static pci_ers_result_t be_eeh_err_detected(struct pci_dev *pdev, + pci_channel_state_t state) +{ + struct be_adapter *adapter = pci_get_drvdata(pdev); + struct net_device *netdev = adapter->netdev; + + dev_err(&adapter->pdev->dev, "EEH error detected\n"); + + if (!adapter->eeh_error) { + adapter->eeh_error = true; + + cancel_delayed_work_sync(&adapter->func_recovery_work); + + rtnl_lock(); + netif_device_detach(netdev); + if (netif_running(netdev)) + be_close(netdev); + rtnl_unlock(); + + be_clear(adapter); + } + + if (state == pci_channel_io_perm_failure) + return PCI_ERS_RESULT_DISCONNECT; + + pci_disable_device(pdev); + + /* The error could cause the FW to trigger a flash debug dump. + * Resetting the card while flash dump is in progress + * can cause it not to recover; wait for it to finish. + * Wait only for first function as it is needed only once per + * adapter. + */ + if (pdev->devfn == 0) + ssleep(30); + + return PCI_ERS_RESULT_NEED_RESET; +} + +static pci_ers_result_t be_eeh_reset(struct pci_dev *pdev) +{ + struct be_adapter *adapter = pci_get_drvdata(pdev); + int status; + + dev_info(&adapter->pdev->dev, "EEH reset\n"); + + status = pci_enable_device(pdev); + if (status) + return PCI_ERS_RESULT_DISCONNECT; + + pci_set_master(pdev); + pci_set_power_state(pdev, PCI_D0); + pci_restore_state(pdev); + + /* Check if card is ok and fw is ready */ + dev_info(&adapter->pdev->dev, + "Waiting for FW to be ready after EEH reset\n"); + status = be_fw_wait_ready(adapter); + if (status) + return PCI_ERS_RESULT_DISCONNECT; + + pci_cleanup_aer_uncorrect_error_status(pdev); + be_clear_all_error(adapter); + return PCI_ERS_RESULT_RECOVERED; +} + +static void be_eeh_resume(struct pci_dev *pdev) +{ + int status = 0; + struct be_adapter *adapter = pci_get_drvdata(pdev); + struct net_device *netdev = adapter->netdev; + + dev_info(&adapter->pdev->dev, "EEH resume\n"); + + pci_save_state(pdev); + + status = be_cmd_reset_function(adapter); + if (status) + goto err; + + /* On some BE3 FW versions, after a HW reset, + * interrupts will remain disabled for each function. + * So, explicitly enable interrupts + */ + be_intr_set(adapter, true); + + /* tell fw we're ready to fire cmds */ + status = be_cmd_fw_init(adapter); + if (status) + goto err; + + status = be_setup(adapter); + if (status) + goto err; + + if (netif_running(netdev)) { + status = be_open(netdev); + if (status) + goto err; + } + + schedule_delayed_work(&adapter->func_recovery_work, + msecs_to_jiffies(1000)); + netif_device_attach(netdev); + return; +err: + dev_err(&adapter->pdev->dev, "EEH resume failed\n"); +} + +static const struct pci_error_handlers be_eeh_handlers = { + .error_detected = be_eeh_err_detected, + .slot_reset = be_eeh_reset, + .resume = be_eeh_resume, +}; + +static struct pci_driver be_driver = { + .name = DRV_NAME, + .id_table = be_dev_ids, + .probe = be_probe, + .remove = be_remove, + .suspend = be_suspend, + .resume = be_resume, + .shutdown = be_shutdown, + .err_handler = &be_eeh_handlers +}; + +static int __init be_init_module(void) +{ + if (rx_frag_size != 8192 && rx_frag_size != 4096 && + rx_frag_size != 2048) { + printk(KERN_WARNING DRV_NAME + " : Module param rx_frag_size must be 2048/4096/8192." + " Using 2048\n"); + rx_frag_size = 2048; + } + + return pci_register_driver(&be_driver); +} +module_init(be_init_module); + +static void __exit be_exit_module(void) +{ + pci_unregister_driver(&be_driver); +} +module_exit(be_exit_module); diff --git a/drivers/net/ethernet/emulex/benet/be_roce.c b/drivers/net/ethernet/emulex/benet/be_roce.c new file mode 100644 index 0000000..1328664 --- /dev/null +++ b/drivers/net/ethernet/emulex/benet/be_roce.c @@ -0,0 +1,200 @@ +/* + * Copyright (C) 2005 - 2014 Emulex + * All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. The full GNU General + * Public License is included in this distribution in the file called COPYING. + * + * Contact Information: + * linux-drivers@emulex.com + * + * Emulex + * 3333 Susan Street + * Costa Mesa, CA 92626 + */ + +#include +#include +#include +#include + +#include "be.h" +#include "be_cmds.h" + +static struct ocrdma_driver *ocrdma_drv; +static LIST_HEAD(be_adapter_list); +static DEFINE_MUTEX(be_adapter_list_lock); + +static void _be_roce_dev_add(struct be_adapter *adapter) +{ + struct be_dev_info dev_info; + int i, num_vec; + struct pci_dev *pdev = adapter->pdev; + + if (!ocrdma_drv) + return; + + if (ocrdma_drv->be_abi_version != BE_ROCE_ABI_VERSION) { + dev_warn(&pdev->dev, "Cannot initialize RoCE due to ocrdma ABI mismatch\n"); + return; + } + + if (pdev->device == OC_DEVICE_ID5) { + /* only msix is supported on these devices */ + if (!msix_enabled(adapter)) + return; + /* DPP region address and length */ + dev_info.dpp_unmapped_addr = pci_resource_start(pdev, 2); + dev_info.dpp_unmapped_len = pci_resource_len(pdev, 2); + } else { + dev_info.dpp_unmapped_addr = 0; + dev_info.dpp_unmapped_len = 0; + } + dev_info.pdev = adapter->pdev; + dev_info.db = adapter->db; + dev_info.unmapped_db = adapter->roce_db.io_addr; + dev_info.db_page_size = adapter->roce_db.size; + dev_info.db_total_size = adapter->roce_db.total_size; + dev_info.netdev = adapter->netdev; + memcpy(dev_info.mac_addr, adapter->netdev->dev_addr, ETH_ALEN); + dev_info.dev_family = adapter->sli_family; + if (msix_enabled(adapter)) { + /* provide all the vectors, so that EQ creation response + * can decide which one to use. + */ + num_vec = adapter->num_msix_vec + adapter->num_msix_roce_vec; + dev_info.intr_mode = BE_INTERRUPT_MODE_MSIX; + dev_info.msix.num_vectors = min(num_vec, MAX_MSIX_VECTORS); + /* provide start index of the vector, + * so in case of linear usage, + * it can use the base as starting point. + */ + dev_info.msix.start_vector = adapter->num_evt_qs; + for (i = 0; i < dev_info.msix.num_vectors; i++) { + dev_info.msix.vector_list[i] = + adapter->msix_entries[i].vector; + } + } else { + dev_info.msix.num_vectors = 0; + dev_info.intr_mode = BE_INTERRUPT_MODE_INTX; + } + adapter->ocrdma_dev = ocrdma_drv->add(&dev_info); +} + +void be_roce_dev_add(struct be_adapter *adapter) +{ + if (be_roce_supported(adapter)) { + INIT_LIST_HEAD(&adapter->entry); + mutex_lock(&be_adapter_list_lock); + list_add_tail(&adapter->entry, &be_adapter_list); + + /* invoke add() routine of roce driver only if + * valid driver registered with add method and add() is not yet + * invoked on a given adapter. + */ + _be_roce_dev_add(adapter); + mutex_unlock(&be_adapter_list_lock); + } +} + +static void _be_roce_dev_remove(struct be_adapter *adapter) +{ + if (ocrdma_drv && ocrdma_drv->remove && adapter->ocrdma_dev) + ocrdma_drv->remove(adapter->ocrdma_dev); + adapter->ocrdma_dev = NULL; +} + +void be_roce_dev_remove(struct be_adapter *adapter) +{ + if (be_roce_supported(adapter)) { + mutex_lock(&be_adapter_list_lock); + _be_roce_dev_remove(adapter); + list_del(&adapter->entry); + mutex_unlock(&be_adapter_list_lock); + } +} + +static void _be_roce_dev_open(struct be_adapter *adapter) +{ + if (ocrdma_drv && adapter->ocrdma_dev && + ocrdma_drv->state_change_handler) + ocrdma_drv->state_change_handler(adapter->ocrdma_dev, + BE_DEV_UP); +} + +void be_roce_dev_open(struct be_adapter *adapter) +{ + if (be_roce_supported(adapter)) { + mutex_lock(&be_adapter_list_lock); + _be_roce_dev_open(adapter); + mutex_unlock(&be_adapter_list_lock); + } +} + +static void _be_roce_dev_close(struct be_adapter *adapter) +{ + if (ocrdma_drv && adapter->ocrdma_dev && + ocrdma_drv->state_change_handler) + ocrdma_drv->state_change_handler(adapter->ocrdma_dev, + BE_DEV_DOWN); +} + +void be_roce_dev_close(struct be_adapter *adapter) +{ + if (be_roce_supported(adapter)) { + mutex_lock(&be_adapter_list_lock); + _be_roce_dev_close(adapter); + mutex_unlock(&be_adapter_list_lock); + } +} + +void be_roce_dev_shutdown(struct be_adapter *adapter) +{ + if (be_roce_supported(adapter)) { + mutex_lock(&be_adapter_list_lock); + if (ocrdma_drv && adapter->ocrdma_dev && + ocrdma_drv->state_change_handler) + ocrdma_drv->state_change_handler(adapter->ocrdma_dev, + BE_DEV_SHUTDOWN); + mutex_unlock(&be_adapter_list_lock); + } +} + +int be_roce_register_driver(struct ocrdma_driver *drv) +{ + struct be_adapter *dev; + + mutex_lock(&be_adapter_list_lock); + if (ocrdma_drv) { + mutex_unlock(&be_adapter_list_lock); + return -EINVAL; + } + ocrdma_drv = drv; + list_for_each_entry(dev, &be_adapter_list, entry) { + struct net_device *netdev; + + _be_roce_dev_add(dev); + netdev = dev->netdev; + if (netif_running(netdev) && netif_oper_up(netdev)) + _be_roce_dev_open(dev); + } + mutex_unlock(&be_adapter_list_lock); + return 0; +} +EXPORT_SYMBOL(be_roce_register_driver); + +void be_roce_unregister_driver(struct ocrdma_driver *drv) +{ + struct be_adapter *dev; + + mutex_lock(&be_adapter_list_lock); + list_for_each_entry(dev, &be_adapter_list, entry) { + if (dev->ocrdma_dev) + _be_roce_dev_remove(dev); + } + ocrdma_drv = NULL; + mutex_unlock(&be_adapter_list_lock); +} +EXPORT_SYMBOL(be_roce_unregister_driver); diff --git a/drivers/net/ethernet/emulex/benet/be_roce.h b/drivers/net/ethernet/emulex/benet/be_roce.h new file mode 100644 index 0000000..e6f7eb1 --- /dev/null +++ b/drivers/net/ethernet/emulex/benet/be_roce.h @@ -0,0 +1,79 @@ +/* + * Copyright (C) 2005 - 2014 Emulex + * All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. The full GNU General + * Public License is included in this distribution in the file called COPYING. + * + * Contact Information: + * linux-drivers@emulex.com + * + * Emulex + * 3333 Susan Street + * Costa Mesa, CA 92626 + */ + +#ifndef BE_ROCE_H +#define BE_ROCE_H + +#include +#include + +#define BE_ROCE_ABI_VERSION 1 + +struct ocrdma_dev; + +enum be_interrupt_mode { + BE_INTERRUPT_MODE_MSIX = 0, + BE_INTERRUPT_MODE_INTX = 1, + BE_INTERRUPT_MODE_MSI = 2, +}; + +#define MAX_MSIX_VECTORS 32 +struct be_dev_info { + u8 __iomem *db; + u64 unmapped_db; + u32 db_page_size; + u32 db_total_size; + u64 dpp_unmapped_addr; + u32 dpp_unmapped_len; + struct pci_dev *pdev; + struct net_device *netdev; + u8 mac_addr[ETH_ALEN]; + u32 dev_family; + enum be_interrupt_mode intr_mode; + struct { + int num_vectors; + int start_vector; + u32 vector_list[MAX_MSIX_VECTORS]; + } msix; +}; + +/* ocrdma driver register's the callback functions with nic driver. */ +struct ocrdma_driver { + unsigned char name[32]; + u32 be_abi_version; + struct ocrdma_dev *(*add) (struct be_dev_info *dev_info); + void (*remove) (struct ocrdma_dev *); + void (*state_change_handler) (struct ocrdma_dev *, u32 new_state); +}; + +enum { + BE_DEV_UP = 0, + BE_DEV_DOWN = 1, + BE_DEV_SHUTDOWN = 2 +}; + +/* APIs for RoCE driver to register callback handlers, + * which will be invoked when device is added, removed, ifup, ifdown + */ +int be_roce_register_driver(struct ocrdma_driver *drv); +void be_roce_unregister_driver(struct ocrdma_driver *drv); + +/* API for RoCE driver to issue mailbox commands */ +int be_roce_mcc_cmd(void *netdev_handle, void *wrb_payload, + int wrb_payload_size, u16 *cmd_status, u16 *ext_status); + +#endif /* BE_ROCE_H */ diff --git a/drivers/net/ethernet/mellanox/Kconfig b/drivers/net/ethernet/mellanox/Kconfig new file mode 100644 index 0000000..8cf7563 --- /dev/null +++ b/drivers/net/ethernet/mellanox/Kconfig @@ -0,0 +1,24 @@ +# +# Mellanox driver configuration +# + +config NET_VENDOR_MELLANOX + bool "Mellanox devices" + default y + depends on PCI + ---help--- + If you have a network (Ethernet) card belonging to this class, say Y + and read the Ethernet-HOWTO, available from + . + + Note that the answer to this question doesn't directly affect the + kernel: saying N will just cause the configurator to skip all + the questions about Mellanox cards. If you say Y, you will be asked + for your specific card in the following questions. + +if NET_VENDOR_MELLANOX + +source "drivers/net/ethernet/mellanox/mlx4/Kconfig" +source "drivers/net/ethernet/mellanox/mlx5/core/Kconfig" + +endif # NET_VENDOR_MELLANOX diff --git a/drivers/net/ethernet/mellanox/Makefile b/drivers/net/ethernet/mellanox/Makefile new file mode 100644 index 0000000..38fe32e --- /dev/null +++ b/drivers/net/ethernet/mellanox/Makefile @@ -0,0 +1,6 @@ +# +# Makefile for the Mellanox device drivers. +# + +obj-$(CONFIG_MLX4_CORE) += mlx4/ +obj-$(CONFIG_MLX5_CORE) += mlx5/core/ diff --git a/drivers/net/ethernet/mellanox/mlx4/Kconfig b/drivers/net/ethernet/mellanox/mlx4/Kconfig new file mode 100644 index 0000000..1486ce9 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4/Kconfig @@ -0,0 +1,46 @@ +# +# Mellanox driver configuration +# + +config MLX4_EN + tristate "Mellanox Technologies 1/10/40Gbit Ethernet support" + depends on PCI + select MLX4_CORE + select PTP_1588_CLOCK + ---help--- + This driver supports Mellanox Technologies ConnectX Ethernet + devices. + +config MLX4_EN_DCB + bool "Data Center Bridging (DCB) Support" + default y + depends on MLX4_EN && DCB + ---help--- + Say Y here if you want to use Data Center Bridging (DCB) in the + driver. + If set to N, will not be able to configure QoS and ratelimit attributes. + This flag is depended on the kernel's DCB support. + + If unsure, set to Y + +config MLX4_EN_VXLAN + bool "VXLAN offloads Support" + default y + depends on MLX4_EN && VXLAN && !(MLX4_EN=y && VXLAN=m) + ---help--- + Say Y here if you want to use VXLAN offloads in the driver. + +config MLX4_CORE + tristate + depends on PCI + default n + +config MLX4_DEBUG + bool "Verbose debugging output" if (MLX4_CORE && EXPERT) + depends on MLX4_CORE + default y + ---help--- + This option causes debugging code to be compiled into the + mlx4_core driver. The output can be turned on via the + debug_level module parameter (which can also be set after + the driver is loaded through sysfs). diff --git a/drivers/net/ethernet/mellanox/mlx4/Makefile b/drivers/net/ethernet/mellanox/mlx4/Makefile new file mode 100644 index 0000000..3e9c70f --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4/Makefile @@ -0,0 +1,10 @@ +obj-$(CONFIG_MLX4_CORE) += mlx4_core.o + +mlx4_core-y := alloc.o catas.o cmd.o cq.o eq.o fw.o icm.o intf.o main.o mcg.o \ + mr.o pd.o port.o profile.o qp.o reset.o sense.o srq.o resource_tracker.o + +obj-$(CONFIG_MLX4_EN) += mlx4_en.o + +mlx4_en-y := en_main.o en_tx.o en_rx.o en_ethtool.o en_port.o en_cq.o \ + en_resources.o en_netdev.o en_selftest.o en_clock.o +mlx4_en-$(CONFIG_MLX4_EN_DCB) += en_dcb_nl.o diff --git a/drivers/net/ethernet/mellanox/mlx4/alloc.c b/drivers/net/ethernet/mellanox/mlx4/alloc.c new file mode 100644 index 0000000..b0297da --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4/alloc.c @@ -0,0 +1,419 @@ +/* + * Copyright (c) 2006, 2007 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "mlx4.h" + +u32 mlx4_bitmap_alloc(struct mlx4_bitmap *bitmap) +{ + u32 obj; + + spin_lock(&bitmap->lock); + + obj = find_next_zero_bit(bitmap->table, bitmap->max, bitmap->last); + if (obj >= bitmap->max) { + bitmap->top = (bitmap->top + bitmap->max + bitmap->reserved_top) + & bitmap->mask; + obj = find_first_zero_bit(bitmap->table, bitmap->max); + } + + if (obj < bitmap->max) { + set_bit(obj, bitmap->table); + bitmap->last = (obj + 1); + if (bitmap->last == bitmap->max) + bitmap->last = 0; + obj |= bitmap->top; + } else + obj = -1; + + if (obj != -1) + --bitmap->avail; + + spin_unlock(&bitmap->lock); + + return obj; +} + +void mlx4_bitmap_free(struct mlx4_bitmap *bitmap, u32 obj, int use_rr) +{ + mlx4_bitmap_free_range(bitmap, obj, 1, use_rr); +} + +u32 mlx4_bitmap_alloc_range(struct mlx4_bitmap *bitmap, int cnt, int align) +{ + u32 obj; + + if (likely(cnt == 1 && align == 1)) + return mlx4_bitmap_alloc(bitmap); + + spin_lock(&bitmap->lock); + + obj = bitmap_find_next_zero_area(bitmap->table, bitmap->max, + bitmap->last, cnt, align - 1); + if (obj >= bitmap->max) { + bitmap->top = (bitmap->top + bitmap->max + bitmap->reserved_top) + & bitmap->mask; + obj = bitmap_find_next_zero_area(bitmap->table, bitmap->max, + 0, cnt, align - 1); + } + + if (obj < bitmap->max) { + bitmap_set(bitmap->table, obj, cnt); + if (obj == bitmap->last) { + bitmap->last = (obj + cnt); + if (bitmap->last >= bitmap->max) + bitmap->last = 0; + } + obj |= bitmap->top; + } else + obj = -1; + + if (obj != -1) + bitmap->avail -= cnt; + + spin_unlock(&bitmap->lock); + + return obj; +} + +u32 mlx4_bitmap_avail(struct mlx4_bitmap *bitmap) +{ + return bitmap->avail; +} + +void mlx4_bitmap_free_range(struct mlx4_bitmap *bitmap, u32 obj, int cnt, + int use_rr) +{ + obj &= bitmap->max + bitmap->reserved_top - 1; + + spin_lock(&bitmap->lock); + if (!use_rr) { + bitmap->last = min(bitmap->last, obj); + bitmap->top = (bitmap->top + bitmap->max + bitmap->reserved_top) + & bitmap->mask; + } + bitmap_clear(bitmap->table, obj, cnt); + bitmap->avail += cnt; + spin_unlock(&bitmap->lock); +} + +int mlx4_bitmap_init(struct mlx4_bitmap *bitmap, u32 num, u32 mask, + u32 reserved_bot, u32 reserved_top) +{ + /* num must be a power of 2 */ + if (num != roundup_pow_of_two(num)) + return -EINVAL; + + bitmap->last = 0; + bitmap->top = 0; + bitmap->max = num - reserved_top; + bitmap->mask = mask; + bitmap->reserved_top = reserved_top; + bitmap->avail = num - reserved_top - reserved_bot; + spin_lock_init(&bitmap->lock); + bitmap->table = kzalloc(BITS_TO_LONGS(bitmap->max) * + sizeof (long), GFP_KERNEL); + if (!bitmap->table) + return -ENOMEM; + + bitmap_set(bitmap->table, 0, reserved_bot); + + return 0; +} + +void mlx4_bitmap_cleanup(struct mlx4_bitmap *bitmap) +{ + kfree(bitmap->table); +} + +/* + * Handling for queue buffers -- we allocate a bunch of memory and + * register it in a memory region at HCA virtual address 0. If the + * requested size is > max_direct, we split the allocation into + * multiple pages, so we don't require too much contiguous memory. + */ + +int mlx4_buf_alloc(struct mlx4_dev *dev, int size, int max_direct, + struct mlx4_buf *buf, gfp_t gfp) +{ + dma_addr_t t; + + if (size <= max_direct) { + buf->nbufs = 1; + buf->npages = 1; + buf->page_shift = get_order(size) + PAGE_SHIFT; + buf->direct.buf = dma_alloc_coherent(&dev->pdev->dev, + size, &t, gfp); + if (!buf->direct.buf) + return -ENOMEM; + + buf->direct.map = t; + + while (t & ((1 << buf->page_shift) - 1)) { + --buf->page_shift; + buf->npages *= 2; + } + + memset(buf->direct.buf, 0, size); + } else { + int i; + + buf->direct.buf = NULL; + buf->nbufs = (size + PAGE_SIZE - 1) / PAGE_SIZE; + buf->npages = buf->nbufs; + buf->page_shift = PAGE_SHIFT; + buf->page_list = kcalloc(buf->nbufs, sizeof(*buf->page_list), + gfp); + if (!buf->page_list) + return -ENOMEM; + + for (i = 0; i < buf->nbufs; ++i) { + buf->page_list[i].buf = + dma_alloc_coherent(&dev->pdev->dev, PAGE_SIZE, + &t, gfp); + if (!buf->page_list[i].buf) + goto err_free; + + buf->page_list[i].map = t; + + memset(buf->page_list[i].buf, 0, PAGE_SIZE); + } + + if (BITS_PER_LONG == 64) { + struct page **pages; + pages = kmalloc(sizeof *pages * buf->nbufs, gfp); + if (!pages) + goto err_free; + for (i = 0; i < buf->nbufs; ++i) + pages[i] = virt_to_page(buf->page_list[i].buf); + buf->direct.buf = vmap(pages, buf->nbufs, VM_MAP, PAGE_KERNEL); + kfree(pages); + if (!buf->direct.buf) + goto err_free; + } + } + + return 0; + +err_free: + mlx4_buf_free(dev, size, buf); + + return -ENOMEM; +} +EXPORT_SYMBOL_GPL(mlx4_buf_alloc); + +void mlx4_buf_free(struct mlx4_dev *dev, int size, struct mlx4_buf *buf) +{ + int i; + + if (buf->nbufs == 1) + dma_free_coherent(&dev->pdev->dev, size, buf->direct.buf, + buf->direct.map); + else { + if (BITS_PER_LONG == 64 && buf->direct.buf) + vunmap(buf->direct.buf); + + for (i = 0; i < buf->nbufs; ++i) + if (buf->page_list[i].buf) + dma_free_coherent(&dev->pdev->dev, PAGE_SIZE, + buf->page_list[i].buf, + buf->page_list[i].map); + kfree(buf->page_list); + } +} +EXPORT_SYMBOL_GPL(mlx4_buf_free); + +static struct mlx4_db_pgdir *mlx4_alloc_db_pgdir(struct device *dma_device, + gfp_t gfp) +{ + struct mlx4_db_pgdir *pgdir; + + pgdir = kzalloc(sizeof *pgdir, gfp); + if (!pgdir) + return NULL; + + bitmap_fill(pgdir->order1, MLX4_DB_PER_PAGE / 2); + pgdir->bits[0] = pgdir->order0; + pgdir->bits[1] = pgdir->order1; + pgdir->db_page = dma_alloc_coherent(dma_device, PAGE_SIZE, + &pgdir->db_dma, gfp); + if (!pgdir->db_page) { + kfree(pgdir); + return NULL; + } + + return pgdir; +} + +static int mlx4_alloc_db_from_pgdir(struct mlx4_db_pgdir *pgdir, + struct mlx4_db *db, int order) +{ + int o; + int i; + + for (o = order; o <= 1; ++o) { + i = find_first_bit(pgdir->bits[o], MLX4_DB_PER_PAGE >> o); + if (i < MLX4_DB_PER_PAGE >> o) + goto found; + } + + return -ENOMEM; + +found: + clear_bit(i, pgdir->bits[o]); + + i <<= o; + + if (o > order) + set_bit(i ^ 1, pgdir->bits[order]); + + db->u.pgdir = pgdir; + db->index = i; + db->db = pgdir->db_page + db->index; + db->dma = pgdir->db_dma + db->index * 4; + db->order = order; + + return 0; +} + +int mlx4_db_alloc(struct mlx4_dev *dev, struct mlx4_db *db, int order, gfp_t gfp) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_db_pgdir *pgdir; + int ret = 0; + + mutex_lock(&priv->pgdir_mutex); + + list_for_each_entry(pgdir, &priv->pgdir_list, list) + if (!mlx4_alloc_db_from_pgdir(pgdir, db, order)) + goto out; + + pgdir = mlx4_alloc_db_pgdir(&(dev->pdev->dev), gfp); + if (!pgdir) { + ret = -ENOMEM; + goto out; + } + + list_add(&pgdir->list, &priv->pgdir_list); + + /* This should never fail -- we just allocated an empty page: */ + WARN_ON(mlx4_alloc_db_from_pgdir(pgdir, db, order)); + +out: + mutex_unlock(&priv->pgdir_mutex); + + return ret; +} +EXPORT_SYMBOL_GPL(mlx4_db_alloc); + +void mlx4_db_free(struct mlx4_dev *dev, struct mlx4_db *db) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + int o; + int i; + + mutex_lock(&priv->pgdir_mutex); + + o = db->order; + i = db->index; + + if (db->order == 0 && test_bit(i ^ 1, db->u.pgdir->order0)) { + clear_bit(i ^ 1, db->u.pgdir->order0); + ++o; + } + i >>= o; + set_bit(i, db->u.pgdir->bits[o]); + + if (bitmap_full(db->u.pgdir->order1, MLX4_DB_PER_PAGE / 2)) { + dma_free_coherent(&(dev->pdev->dev), PAGE_SIZE, + db->u.pgdir->db_page, db->u.pgdir->db_dma); + list_del(&db->u.pgdir->list); + kfree(db->u.pgdir); + } + + mutex_unlock(&priv->pgdir_mutex); +} +EXPORT_SYMBOL_GPL(mlx4_db_free); + +int mlx4_alloc_hwq_res(struct mlx4_dev *dev, struct mlx4_hwq_resources *wqres, + int size, int max_direct) +{ + int err; + + err = mlx4_db_alloc(dev, &wqres->db, 1, GFP_KERNEL); + if (err) + return err; + + *wqres->db.db = 0; + + err = mlx4_buf_alloc(dev, size, max_direct, &wqres->buf, GFP_KERNEL); + if (err) + goto err_db; + + err = mlx4_mtt_init(dev, wqres->buf.npages, wqres->buf.page_shift, + &wqres->mtt); + if (err) + goto err_buf; + + err = mlx4_buf_write_mtt(dev, &wqres->mtt, &wqres->buf, GFP_KERNEL); + if (err) + goto err_mtt; + + return 0; + +err_mtt: + mlx4_mtt_cleanup(dev, &wqres->mtt); +err_buf: + mlx4_buf_free(dev, size, &wqres->buf); +err_db: + mlx4_db_free(dev, &wqres->db); + + return err; +} +EXPORT_SYMBOL_GPL(mlx4_alloc_hwq_res); + +void mlx4_free_hwq_res(struct mlx4_dev *dev, struct mlx4_hwq_resources *wqres, + int size) +{ + mlx4_mtt_cleanup(dev, &wqres->mtt); + mlx4_buf_free(dev, size, &wqres->buf); + mlx4_db_free(dev, &wqres->db); +} +EXPORT_SYMBOL_GPL(mlx4_free_hwq_res); diff --git a/drivers/net/ethernet/mellanox/mlx4/catas.c b/drivers/net/ethernet/mellanox/mlx4/catas.c new file mode 100644 index 0000000..9c656fe --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4/catas.c @@ -0,0 +1,171 @@ +/* + * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include + +#include "mlx4.h" + +enum { + MLX4_CATAS_POLL_INTERVAL = 5 * HZ, +}; + +static DEFINE_SPINLOCK(catas_lock); + +static LIST_HEAD(catas_list); +static struct work_struct catas_work; + +static int internal_err_reset = 1; +module_param(internal_err_reset, int, 0644); +MODULE_PARM_DESC(internal_err_reset, + "Reset device on internal errors if non-zero" + " (default 1, in SRIOV mode default is 0)"); + +static void dump_err_buf(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + + int i; + + mlx4_err(dev, "Internal error detected:\n"); + for (i = 0; i < priv->fw.catas_size; ++i) + mlx4_err(dev, " buf[%02x]: %08x\n", + i, swab32(readl(priv->catas_err.map + i))); +} + +static void poll_catas(unsigned long dev_ptr) +{ + struct mlx4_dev *dev = (struct mlx4_dev *) dev_ptr; + struct mlx4_priv *priv = mlx4_priv(dev); + + if (readl(priv->catas_err.map)) { + /* If the device is off-line, we cannot try to recover it */ + if (pci_channel_offline(dev->pdev)) + mod_timer(&priv->catas_err.timer, + round_jiffies(jiffies + MLX4_CATAS_POLL_INTERVAL)); + else { + dump_err_buf(dev); + mlx4_dispatch_event(dev, MLX4_DEV_EVENT_CATASTROPHIC_ERROR, 0); + + if (internal_err_reset) { + spin_lock(&catas_lock); + list_add(&priv->catas_err.list, &catas_list); + spin_unlock(&catas_lock); + + queue_work(mlx4_wq, &catas_work); + } + } + } else + mod_timer(&priv->catas_err.timer, + round_jiffies(jiffies + MLX4_CATAS_POLL_INTERVAL)); +} + +static void catas_reset(struct work_struct *work) +{ + struct mlx4_priv *priv, *tmppriv; + struct mlx4_dev *dev; + + LIST_HEAD(tlist); + int ret; + + spin_lock_irq(&catas_lock); + list_splice_init(&catas_list, &tlist); + spin_unlock_irq(&catas_lock); + + list_for_each_entry_safe(priv, tmppriv, &tlist, catas_err.list) { + struct pci_dev *pdev = priv->dev.pdev; + + /* If the device is off-line, we cannot reset it */ + if (pci_channel_offline(pdev)) + continue; + + ret = mlx4_restart_one(priv->dev.pdev); + /* 'priv' now is not valid */ + if (ret) + pr_err("mlx4 %s: Reset failed (%d)\n", + pci_name(pdev), ret); + else { + dev = pci_get_drvdata(pdev); + mlx4_dbg(dev, "Reset succeeded\n"); + } + } +} + +void mlx4_start_catas_poll(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + phys_addr_t addr; + + /*If we are in SRIOV the default of the module param must be 0*/ + if (mlx4_is_mfunc(dev)) + internal_err_reset = 0; + + INIT_LIST_HEAD(&priv->catas_err.list); + init_timer(&priv->catas_err.timer); + priv->catas_err.map = NULL; + + addr = pci_resource_start(dev->pdev, priv->fw.catas_bar) + + priv->fw.catas_offset; + + priv->catas_err.map = ioremap(addr, priv->fw.catas_size * 4); + if (!priv->catas_err.map) { + mlx4_warn(dev, "Failed to map internal error buffer at 0x%llx\n", + (unsigned long long) addr); + return; + } + + priv->catas_err.timer.data = (unsigned long) dev; + priv->catas_err.timer.function = poll_catas; + priv->catas_err.timer.expires = + round_jiffies(jiffies + MLX4_CATAS_POLL_INTERVAL); + add_timer(&priv->catas_err.timer); +} + +void mlx4_stop_catas_poll(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + + del_timer_sync(&priv->catas_err.timer); + + if (priv->catas_err.map) + iounmap(priv->catas_err.map); + + spin_lock_irq(&catas_lock); + list_del(&priv->catas_err.list); + spin_unlock_irq(&catas_lock); +} + +void __init mlx4_catas_init(void) +{ + INIT_WORK(&catas_work, catas_reset); +} diff --git a/drivers/net/ethernet/mellanox/mlx4/cmd.c b/drivers/net/ethernet/mellanox/mlx4/cmd.c new file mode 100644 index 0000000..b16e1b9 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4/cmd.c @@ -0,0 +1,2650 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005, 2006, 2007, 2008 Mellanox Technologies. All rights reserved. + * Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +#include "mlx4.h" +#include "fw.h" + +#define CMD_POLL_TOKEN 0xffff +#define INBOX_MASK 0xffffffffffffff00ULL + +#define CMD_CHAN_VER 1 +#define CMD_CHAN_IF_REV 1 + +enum { + /* command completed successfully: */ + CMD_STAT_OK = 0x00, + /* Internal error (such as a bus error) occurred while processing command: */ + CMD_STAT_INTERNAL_ERR = 0x01, + /* Operation/command not supported or opcode modifier not supported: */ + CMD_STAT_BAD_OP = 0x02, + /* Parameter not supported or parameter out of range: */ + CMD_STAT_BAD_PARAM = 0x03, + /* System not enabled or bad system state: */ + CMD_STAT_BAD_SYS_STATE = 0x04, + /* Attempt to access reserved or unallocaterd resource: */ + CMD_STAT_BAD_RESOURCE = 0x05, + /* Requested resource is currently executing a command, or is otherwise busy: */ + CMD_STAT_RESOURCE_BUSY = 0x06, + /* Required capability exceeds device limits: */ + CMD_STAT_EXCEED_LIM = 0x08, + /* Resource is not in the appropriate state or ownership: */ + CMD_STAT_BAD_RES_STATE = 0x09, + /* Index out of range: */ + CMD_STAT_BAD_INDEX = 0x0a, + /* FW image corrupted: */ + CMD_STAT_BAD_NVMEM = 0x0b, + /* Error in ICM mapping (e.g. not enough auxiliary ICM pages to execute command): */ + CMD_STAT_ICM_ERROR = 0x0c, + /* Attempt to modify a QP/EE which is not in the presumed state: */ + CMD_STAT_BAD_QP_STATE = 0x10, + /* Bad segment parameters (Address/Size): */ + CMD_STAT_BAD_SEG_PARAM = 0x20, + /* Memory Region has Memory Windows bound to: */ + CMD_STAT_REG_BOUND = 0x21, + /* HCA local attached memory not present: */ + CMD_STAT_LAM_NOT_PRE = 0x22, + /* Bad management packet (silently discarded): */ + CMD_STAT_BAD_PKT = 0x30, + /* More outstanding CQEs in CQ than new CQ size: */ + CMD_STAT_BAD_SIZE = 0x40, + /* Multi Function device support required: */ + CMD_STAT_MULTI_FUNC_REQ = 0x50, +}; + +enum { + HCR_IN_PARAM_OFFSET = 0x00, + HCR_IN_MODIFIER_OFFSET = 0x08, + HCR_OUT_PARAM_OFFSET = 0x0c, + HCR_TOKEN_OFFSET = 0x14, + HCR_STATUS_OFFSET = 0x18, + + HCR_OPMOD_SHIFT = 12, + HCR_T_BIT = 21, + HCR_E_BIT = 22, + HCR_GO_BIT = 23 +}; + +enum { + GO_BIT_TIMEOUT_MSECS = 10000 +}; + +enum mlx4_vlan_transition { + MLX4_VLAN_TRANSITION_VST_VST = 0, + MLX4_VLAN_TRANSITION_VST_VGT = 1, + MLX4_VLAN_TRANSITION_VGT_VST = 2, + MLX4_VLAN_TRANSITION_VGT_VGT = 3, +}; + + +struct mlx4_cmd_context { + struct completion done; + int result; + int next; + u64 out_param; + u16 token; + u8 fw_status; +}; + +static int mlx4_master_process_vhcr(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr_cmd *in_vhcr); + +static int mlx4_status_to_errno(u8 status) +{ + static const int trans_table[] = { + [CMD_STAT_INTERNAL_ERR] = -EIO, + [CMD_STAT_BAD_OP] = -EPERM, + [CMD_STAT_BAD_PARAM] = -EINVAL, + [CMD_STAT_BAD_SYS_STATE] = -ENXIO, + [CMD_STAT_BAD_RESOURCE] = -EBADF, + [CMD_STAT_RESOURCE_BUSY] = -EBUSY, + [CMD_STAT_EXCEED_LIM] = -ENOMEM, + [CMD_STAT_BAD_RES_STATE] = -EBADF, + [CMD_STAT_BAD_INDEX] = -EBADF, + [CMD_STAT_BAD_NVMEM] = -EFAULT, + [CMD_STAT_ICM_ERROR] = -ENFILE, + [CMD_STAT_BAD_QP_STATE] = -EINVAL, + [CMD_STAT_BAD_SEG_PARAM] = -EFAULT, + [CMD_STAT_REG_BOUND] = -EBUSY, + [CMD_STAT_LAM_NOT_PRE] = -EAGAIN, + [CMD_STAT_BAD_PKT] = -EINVAL, + [CMD_STAT_BAD_SIZE] = -ENOMEM, + [CMD_STAT_MULTI_FUNC_REQ] = -EACCES, + }; + + if (status >= ARRAY_SIZE(trans_table) || + (status != CMD_STAT_OK && trans_table[status] == 0)) + return -EIO; + + return trans_table[status]; +} + +static u8 mlx4_errno_to_status(int errno) +{ + switch (errno) { + case -EPERM: + return CMD_STAT_BAD_OP; + case -EINVAL: + return CMD_STAT_BAD_PARAM; + case -ENXIO: + return CMD_STAT_BAD_SYS_STATE; + case -EBUSY: + return CMD_STAT_RESOURCE_BUSY; + case -ENOMEM: + return CMD_STAT_EXCEED_LIM; + case -ENFILE: + return CMD_STAT_ICM_ERROR; + default: + return CMD_STAT_INTERNAL_ERR; + } +} + +static int comm_pending(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + u32 status = readl(&priv->mfunc.comm->slave_read); + + return (swab32(status) >> 31) != priv->cmd.comm_toggle; +} + +static void mlx4_comm_cmd_post(struct mlx4_dev *dev, u8 cmd, u16 param) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + u32 val; + + priv->cmd.comm_toggle ^= 1; + val = param | (cmd << 16) | (priv->cmd.comm_toggle << 31); + __raw_writel((__force u32) cpu_to_be32(val), + &priv->mfunc.comm->slave_write); + mmiowb(); +} + +static int mlx4_comm_cmd_poll(struct mlx4_dev *dev, u8 cmd, u16 param, + unsigned long timeout) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + unsigned long end; + int err = 0; + int ret_from_pending = 0; + + /* First, verify that the master reports correct status */ + if (comm_pending(dev)) { + mlx4_warn(dev, "Communication channel is not idle - my toggle is %d (cmd:0x%x)\n", + priv->cmd.comm_toggle, cmd); + return -EAGAIN; + } + + /* Write command */ + down(&priv->cmd.poll_sem); + mlx4_comm_cmd_post(dev, cmd, param); + + end = msecs_to_jiffies(timeout) + jiffies; + while (comm_pending(dev) && time_before(jiffies, end)) + cond_resched(); + ret_from_pending = comm_pending(dev); + if (ret_from_pending) { + /* check if the slave is trying to boot in the middle of + * FLR process. The only non-zero result in the RESET command + * is MLX4_DELAY_RESET_SLAVE*/ + if ((MLX4_COMM_CMD_RESET == cmd)) { + err = MLX4_DELAY_RESET_SLAVE; + } else { + mlx4_warn(dev, "Communication channel timed out\n"); + err = -ETIMEDOUT; + } + } + + up(&priv->cmd.poll_sem); + return err; +} + +static int mlx4_comm_cmd_wait(struct mlx4_dev *dev, u8 op, + u16 param, unsigned long timeout) +{ + struct mlx4_cmd *cmd = &mlx4_priv(dev)->cmd; + struct mlx4_cmd_context *context; + unsigned long end; + int err = 0; + + down(&cmd->event_sem); + + spin_lock(&cmd->context_lock); + BUG_ON(cmd->free_head < 0); + context = &cmd->context[cmd->free_head]; + context->token += cmd->token_mask + 1; + cmd->free_head = context->next; + spin_unlock(&cmd->context_lock); + + init_completion(&context->done); + + mlx4_comm_cmd_post(dev, op, param); + + if (!wait_for_completion_timeout(&context->done, + msecs_to_jiffies(timeout))) { + mlx4_warn(dev, "communication channel command 0x%x timed out\n", + op); + err = -EBUSY; + goto out; + } + + err = context->result; + if (err && context->fw_status != CMD_STAT_MULTI_FUNC_REQ) { + mlx4_err(dev, "command 0x%x failed: fw status = 0x%x\n", + op, context->fw_status); + goto out; + } + +out: + /* wait for comm channel ready + * this is necessary for prevention the race + * when switching between event to polling mode + */ + end = msecs_to_jiffies(timeout) + jiffies; + while (comm_pending(dev) && time_before(jiffies, end)) + cond_resched(); + + spin_lock(&cmd->context_lock); + context->next = cmd->free_head; + cmd->free_head = context - cmd->context; + spin_unlock(&cmd->context_lock); + + up(&cmd->event_sem); + return err; +} + +int mlx4_comm_cmd(struct mlx4_dev *dev, u8 cmd, u16 param, + unsigned long timeout) +{ + if (mlx4_priv(dev)->cmd.use_events) + return mlx4_comm_cmd_wait(dev, cmd, param, timeout); + return mlx4_comm_cmd_poll(dev, cmd, param, timeout); +} + +static int cmd_pending(struct mlx4_dev *dev) +{ + u32 status; + + if (pci_channel_offline(dev->pdev)) + return -EIO; + + status = readl(mlx4_priv(dev)->cmd.hcr + HCR_STATUS_OFFSET); + + return (status & swab32(1 << HCR_GO_BIT)) || + (mlx4_priv(dev)->cmd.toggle == + !!(status & swab32(1 << HCR_T_BIT))); +} + +static int mlx4_cmd_post(struct mlx4_dev *dev, u64 in_param, u64 out_param, + u32 in_modifier, u8 op_modifier, u16 op, u16 token, + int event) +{ + struct mlx4_cmd *cmd = &mlx4_priv(dev)->cmd; + u32 __iomem *hcr = cmd->hcr; + int ret = -EAGAIN; + unsigned long end; + + mutex_lock(&cmd->hcr_mutex); + + if (pci_channel_offline(dev->pdev)) { + /* + * Device is going through error recovery + * and cannot accept commands. + */ + ret = -EIO; + goto out; + } + + end = jiffies; + if (event) + end += msecs_to_jiffies(GO_BIT_TIMEOUT_MSECS); + + while (cmd_pending(dev)) { + if (pci_channel_offline(dev->pdev)) { + /* + * Device is going through error recovery + * and cannot accept commands. + */ + ret = -EIO; + goto out; + } + + if (time_after_eq(jiffies, end)) { + mlx4_err(dev, "%s:cmd_pending failed\n", __func__); + goto out; + } + cond_resched(); + } + + /* + * We use writel (instead of something like memcpy_toio) + * because writes of less than 32 bits to the HCR don't work + * (and some architectures such as ia64 implement memcpy_toio + * in terms of writeb). + */ + __raw_writel((__force u32) cpu_to_be32(in_param >> 32), hcr + 0); + __raw_writel((__force u32) cpu_to_be32(in_param & 0xfffffffful), hcr + 1); + __raw_writel((__force u32) cpu_to_be32(in_modifier), hcr + 2); + __raw_writel((__force u32) cpu_to_be32(out_param >> 32), hcr + 3); + __raw_writel((__force u32) cpu_to_be32(out_param & 0xfffffffful), hcr + 4); + __raw_writel((__force u32) cpu_to_be32(token << 16), hcr + 5); + + /* __raw_writel may not order writes. */ + wmb(); + + __raw_writel((__force u32) cpu_to_be32((1 << HCR_GO_BIT) | + (cmd->toggle << HCR_T_BIT) | + (event ? (1 << HCR_E_BIT) : 0) | + (op_modifier << HCR_OPMOD_SHIFT) | + op), hcr + 6); + + /* + * Make sure that our HCR writes don't get mixed in with + * writes from another CPU starting a FW command. + */ + mmiowb(); + + cmd->toggle = cmd->toggle ^ 1; + + ret = 0; + +out: + mutex_unlock(&cmd->hcr_mutex); + return ret; +} + +static int mlx4_slave_cmd(struct mlx4_dev *dev, u64 in_param, u64 *out_param, + int out_is_imm, u32 in_modifier, u8 op_modifier, + u16 op, unsigned long timeout) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_vhcr_cmd *vhcr = priv->mfunc.vhcr; + int ret; + + mutex_lock(&priv->cmd.slave_cmd_mutex); + + vhcr->in_param = cpu_to_be64(in_param); + vhcr->out_param = out_param ? cpu_to_be64(*out_param) : 0; + vhcr->in_modifier = cpu_to_be32(in_modifier); + vhcr->opcode = cpu_to_be16((((u16) op_modifier) << 12) | (op & 0xfff)); + vhcr->token = cpu_to_be16(CMD_POLL_TOKEN); + vhcr->status = 0; + vhcr->flags = !!(priv->cmd.use_events) << 6; + + if (mlx4_is_master(dev)) { + ret = mlx4_master_process_vhcr(dev, dev->caps.function, vhcr); + if (!ret) { + if (out_is_imm) { + if (out_param) + *out_param = + be64_to_cpu(vhcr->out_param); + else { + mlx4_err(dev, "response expected while output mailbox is NULL for command 0x%x\n", + op); + vhcr->status = CMD_STAT_BAD_PARAM; + } + } + ret = mlx4_status_to_errno(vhcr->status); + } + } else { + ret = mlx4_comm_cmd(dev, MLX4_COMM_CMD_VHCR_POST, 0, + MLX4_COMM_TIME + timeout); + if (!ret) { + if (out_is_imm) { + if (out_param) + *out_param = + be64_to_cpu(vhcr->out_param); + else { + mlx4_err(dev, "response expected while output mailbox is NULL for command 0x%x\n", + op); + vhcr->status = CMD_STAT_BAD_PARAM; + } + } + ret = mlx4_status_to_errno(vhcr->status); + } else + mlx4_err(dev, "failed execution of VHCR_POST command opcode 0x%x\n", + op); + } + + mutex_unlock(&priv->cmd.slave_cmd_mutex); + return ret; +} + +static int mlx4_cmd_poll(struct mlx4_dev *dev, u64 in_param, u64 *out_param, + int out_is_imm, u32 in_modifier, u8 op_modifier, + u16 op, unsigned long timeout) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + void __iomem *hcr = priv->cmd.hcr; + int err = 0; + unsigned long end; + u32 stat; + + down(&priv->cmd.poll_sem); + + if (pci_channel_offline(dev->pdev)) { + /* + * Device is going through error recovery + * and cannot accept commands. + */ + err = -EIO; + goto out; + } + + if (out_is_imm && !out_param) { + mlx4_err(dev, "response expected while output mailbox is NULL for command 0x%x\n", + op); + err = -EINVAL; + goto out; + } + + err = mlx4_cmd_post(dev, in_param, out_param ? *out_param : 0, + in_modifier, op_modifier, op, CMD_POLL_TOKEN, 0); + if (err) + goto out; + + end = msecs_to_jiffies(timeout) + jiffies; + while (cmd_pending(dev) && time_before(jiffies, end)) { + if (pci_channel_offline(dev->pdev)) { + /* + * Device is going through error recovery + * and cannot accept commands. + */ + err = -EIO; + goto out; + } + + cond_resched(); + } + + if (cmd_pending(dev)) { + mlx4_warn(dev, "command 0x%x timed out (go bit not cleared)\n", + op); + err = -ETIMEDOUT; + goto out; + } + + if (out_is_imm) + *out_param = + (u64) be32_to_cpu((__force __be32) + __raw_readl(hcr + HCR_OUT_PARAM_OFFSET)) << 32 | + (u64) be32_to_cpu((__force __be32) + __raw_readl(hcr + HCR_OUT_PARAM_OFFSET + 4)); + stat = be32_to_cpu((__force __be32) + __raw_readl(hcr + HCR_STATUS_OFFSET)) >> 24; + err = mlx4_status_to_errno(stat); + if (err) + mlx4_err(dev, "command 0x%x failed: fw status = 0x%x\n", + op, stat); + +out: + up(&priv->cmd.poll_sem); + return err; +} + +void mlx4_cmd_event(struct mlx4_dev *dev, u16 token, u8 status, u64 out_param) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_cmd_context *context = + &priv->cmd.context[token & priv->cmd.token_mask]; + + /* previously timed out command completing at long last */ + if (token != context->token) + return; + + context->fw_status = status; + context->result = mlx4_status_to_errno(status); + context->out_param = out_param; + + complete(&context->done); +} + +static int mlx4_cmd_wait(struct mlx4_dev *dev, u64 in_param, u64 *out_param, + int out_is_imm, u32 in_modifier, u8 op_modifier, + u16 op, unsigned long timeout) +{ + struct mlx4_cmd *cmd = &mlx4_priv(dev)->cmd; + struct mlx4_cmd_context *context; + int err = 0; + + down(&cmd->event_sem); + + spin_lock(&cmd->context_lock); + BUG_ON(cmd->free_head < 0); + context = &cmd->context[cmd->free_head]; + context->token += cmd->token_mask + 1; + cmd->free_head = context->next; + spin_unlock(&cmd->context_lock); + + if (out_is_imm && !out_param) { + mlx4_err(dev, "response expected while output mailbox is NULL for command 0x%x\n", + op); + err = -EINVAL; + goto out; + } + + init_completion(&context->done); + + mlx4_cmd_post(dev, in_param, out_param ? *out_param : 0, + in_modifier, op_modifier, op, context->token, 1); + + if (!wait_for_completion_timeout(&context->done, + msecs_to_jiffies(timeout))) { + mlx4_warn(dev, "command 0x%x timed out (go bit not cleared)\n", + op); + err = -EBUSY; + goto out; + } + + err = context->result; + if (err) { + /* Since we do not want to have this error message always + * displayed at driver start when there are ConnectX2 HCAs + * on the host, we deprecate the error message for this + * specific command/input_mod/opcode_mod/fw-status to be debug. + */ + if (op == MLX4_CMD_SET_PORT && in_modifier == 1 && + op_modifier == 0 && context->fw_status == CMD_STAT_BAD_SIZE) + mlx4_dbg(dev, "command 0x%x failed: fw status = 0x%x\n", + op, context->fw_status); + else + mlx4_err(dev, "command 0x%x failed: fw status = 0x%x\n", + op, context->fw_status); + goto out; + } + + if (out_is_imm) + *out_param = context->out_param; + +out: + spin_lock(&cmd->context_lock); + context->next = cmd->free_head; + cmd->free_head = context - cmd->context; + spin_unlock(&cmd->context_lock); + + up(&cmd->event_sem); + return err; +} + +int __mlx4_cmd(struct mlx4_dev *dev, u64 in_param, u64 *out_param, + int out_is_imm, u32 in_modifier, u8 op_modifier, + u16 op, unsigned long timeout, int native) +{ + if (pci_channel_offline(dev->pdev)) + return -EIO; + + if (!mlx4_is_mfunc(dev) || (native && mlx4_is_master(dev))) { + if (mlx4_priv(dev)->cmd.use_events) + return mlx4_cmd_wait(dev, in_param, out_param, + out_is_imm, in_modifier, + op_modifier, op, timeout); + else + return mlx4_cmd_poll(dev, in_param, out_param, + out_is_imm, in_modifier, + op_modifier, op, timeout); + } + return mlx4_slave_cmd(dev, in_param, out_param, out_is_imm, + in_modifier, op_modifier, op, timeout); +} +EXPORT_SYMBOL_GPL(__mlx4_cmd); + + +static int mlx4_ARM_COMM_CHANNEL(struct mlx4_dev *dev) +{ + return mlx4_cmd(dev, 0, 0, 0, MLX4_CMD_ARM_COMM_CHANNEL, + MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE); +} + +static int mlx4_ACCESS_MEM(struct mlx4_dev *dev, u64 master_addr, + int slave, u64 slave_addr, + int size, int is_read) +{ + u64 in_param; + u64 out_param; + + if ((slave_addr & 0xfff) | (master_addr & 0xfff) | + (slave & ~0x7f) | (size & 0xff)) { + mlx4_err(dev, "Bad access mem params - slave_addr:0x%llx master_addr:0x%llx slave_id:%d size:%d\n", + slave_addr, master_addr, slave, size); + return -EINVAL; + } + + if (is_read) { + in_param = (u64) slave | slave_addr; + out_param = (u64) dev->caps.function | master_addr; + } else { + in_param = (u64) dev->caps.function | master_addr; + out_param = (u64) slave | slave_addr; + } + + return mlx4_cmd_imm(dev, in_param, &out_param, size, 0, + MLX4_CMD_ACCESS_MEM, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); +} + +static int query_pkey_block(struct mlx4_dev *dev, u8 port, u16 index, u16 *pkey, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox) +{ + struct ib_smp *in_mad = (struct ib_smp *)(inbox->buf); + struct ib_smp *out_mad = (struct ib_smp *)(outbox->buf); + int err; + int i; + + if (index & 0x1f) + return -EINVAL; + + in_mad->attr_mod = cpu_to_be32(index / 32); + + err = mlx4_cmd_box(dev, inbox->dma, outbox->dma, port, 3, + MLX4_CMD_MAD_IFC, MLX4_CMD_TIME_CLASS_C, + MLX4_CMD_NATIVE); + if (err) + return err; + + for (i = 0; i < 32; ++i) + pkey[i] = be16_to_cpu(((__be16 *) out_mad->data)[i]); + + return err; +} + +static int get_full_pkey_table(struct mlx4_dev *dev, u8 port, u16 *table, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox) +{ + int i; + int err; + + for (i = 0; i < dev->caps.pkey_table_len[port]; i += 32) { + err = query_pkey_block(dev, port, i, table + i, inbox, outbox); + if (err) + return err; + } + + return 0; +} +#define PORT_CAPABILITY_LOCATION_IN_SMP 20 +#define PORT_STATE_OFFSET 32 + +static enum ib_port_state vf_port_state(struct mlx4_dev *dev, int port, int vf) +{ + if (mlx4_get_slave_port_state(dev, vf, port) == SLAVE_PORT_UP) + return IB_PORT_ACTIVE; + else + return IB_PORT_DOWN; +} + +static int mlx4_MAD_IFC_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + struct ib_smp *smp = inbox->buf; + u32 index; + u8 port; + u8 opcode_modifier; + u16 *table; + int err; + int vidx, pidx; + int network_view; + struct mlx4_priv *priv = mlx4_priv(dev); + struct ib_smp *outsmp = outbox->buf; + __be16 *outtab = (__be16 *)(outsmp->data); + __be32 slave_cap_mask; + __be64 slave_node_guid; + + port = vhcr->in_modifier; + + /* network-view bit is for driver use only, and should not be passed to FW */ + opcode_modifier = vhcr->op_modifier & ~0x8; /* clear netw view bit */ + network_view = !!(vhcr->op_modifier & 0x8); + + if (smp->base_version == 1 && + smp->mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED && + smp->class_version == 1) { + /* host view is paravirtualized */ + if (!network_view && smp->method == IB_MGMT_METHOD_GET) { + if (smp->attr_id == IB_SMP_ATTR_PKEY_TABLE) { + index = be32_to_cpu(smp->attr_mod); + if (port < 1 || port > dev->caps.num_ports) + return -EINVAL; + table = kcalloc(dev->caps.pkey_table_len[port], sizeof *table, GFP_KERNEL); + if (!table) + return -ENOMEM; + /* need to get the full pkey table because the paravirtualized + * pkeys may be scattered among several pkey blocks. + */ + err = get_full_pkey_table(dev, port, table, inbox, outbox); + if (!err) { + for (vidx = index * 32; vidx < (index + 1) * 32; ++vidx) { + pidx = priv->virt2phys_pkey[slave][port - 1][vidx]; + outtab[vidx % 32] = cpu_to_be16(table[pidx]); + } + } + kfree(table); + return err; + } + if (smp->attr_id == IB_SMP_ATTR_PORT_INFO) { + /*get the slave specific caps:*/ + /*do the command */ + err = mlx4_cmd_box(dev, inbox->dma, outbox->dma, + vhcr->in_modifier, opcode_modifier, + vhcr->op, MLX4_CMD_TIME_CLASS_C, MLX4_CMD_NATIVE); + /* modify the response for slaves */ + if (!err && slave != mlx4_master_func_num(dev)) { + u8 *state = outsmp->data + PORT_STATE_OFFSET; + + *state = (*state & 0xf0) | vf_port_state(dev, port, slave); + slave_cap_mask = priv->mfunc.master.slave_state[slave].ib_cap_mask[port]; + memcpy(outsmp->data + PORT_CAPABILITY_LOCATION_IN_SMP, &slave_cap_mask, 4); + } + return err; + } + if (smp->attr_id == IB_SMP_ATTR_GUID_INFO) { + /* compute slave's gid block */ + smp->attr_mod = cpu_to_be32(slave / 8); + /* execute cmd */ + err = mlx4_cmd_box(dev, inbox->dma, outbox->dma, + vhcr->in_modifier, opcode_modifier, + vhcr->op, MLX4_CMD_TIME_CLASS_C, MLX4_CMD_NATIVE); + if (!err) { + /* if needed, move slave gid to index 0 */ + if (slave % 8) + memcpy(outsmp->data, + outsmp->data + (slave % 8) * 8, 8); + /* delete all other gids */ + memset(outsmp->data + 8, 0, 56); + } + return err; + } + if (smp->attr_id == IB_SMP_ATTR_NODE_INFO) { + err = mlx4_cmd_box(dev, inbox->dma, outbox->dma, + vhcr->in_modifier, opcode_modifier, + vhcr->op, MLX4_CMD_TIME_CLASS_C, MLX4_CMD_NATIVE); + if (!err) { + slave_node_guid = mlx4_get_slave_node_guid(dev, slave); + memcpy(outsmp->data + 12, &slave_node_guid, 8); + } + return err; + } + } + } + + /* Non-privileged VFs are only allowed "host" view LID-routed 'Get' MADs. + * These are the MADs used by ib verbs (such as ib_query_gids). + */ + if (slave != mlx4_master_func_num(dev) && + !mlx4_vf_smi_enabled(dev, slave, port)) { + if (!(smp->mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED && + smp->method == IB_MGMT_METHOD_GET) || network_view) { + mlx4_err(dev, "Unprivileged slave %d is trying to execute a Subnet MGMT MAD, class 0x%x, method 0x%x, view=%s for attr 0x%x. Rejecting\n", + slave, smp->method, smp->mgmt_class, + network_view ? "Network" : "Host", + be16_to_cpu(smp->attr_id)); + return -EPERM; + } + } + + return mlx4_cmd_box(dev, inbox->dma, outbox->dma, + vhcr->in_modifier, opcode_modifier, + vhcr->op, MLX4_CMD_TIME_CLASS_C, MLX4_CMD_NATIVE); +} + +static int mlx4_CMD_EPERM_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + return -EPERM; +} + +int mlx4_DMA_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + u64 in_param; + u64 out_param; + int err; + + in_param = cmd->has_inbox ? (u64) inbox->dma : vhcr->in_param; + out_param = cmd->has_outbox ? (u64) outbox->dma : vhcr->out_param; + if (cmd->encode_slave_id) { + in_param &= 0xffffffffffffff00ll; + in_param |= slave; + } + + err = __mlx4_cmd(dev, in_param, &out_param, cmd->out_is_imm, + vhcr->in_modifier, vhcr->op_modifier, vhcr->op, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); + + if (cmd->out_is_imm) + vhcr->out_param = out_param; + + return err; +} + +static struct mlx4_cmd_info cmd_info[] = { + { + .opcode = MLX4_CMD_QUERY_FW, + .has_inbox = false, + .has_outbox = true, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_QUERY_FW_wrapper + }, + { + .opcode = MLX4_CMD_QUERY_HCA, + .has_inbox = false, + .has_outbox = true, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = NULL + }, + { + .opcode = MLX4_CMD_QUERY_DEV_CAP, + .has_inbox = false, + .has_outbox = true, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_QUERY_DEV_CAP_wrapper + }, + { + .opcode = MLX4_CMD_QUERY_FUNC_CAP, + .has_inbox = false, + .has_outbox = true, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_QUERY_FUNC_CAP_wrapper + }, + { + .opcode = MLX4_CMD_QUERY_ADAPTER, + .has_inbox = false, + .has_outbox = true, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = NULL + }, + { + .opcode = MLX4_CMD_INIT_PORT, + .has_inbox = false, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_INIT_PORT_wrapper + }, + { + .opcode = MLX4_CMD_CLOSE_PORT, + .has_inbox = false, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_CLOSE_PORT_wrapper + }, + { + .opcode = MLX4_CMD_QUERY_PORT, + .has_inbox = false, + .has_outbox = true, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_QUERY_PORT_wrapper + }, + { + .opcode = MLX4_CMD_SET_PORT, + .has_inbox = true, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_SET_PORT_wrapper + }, + { + .opcode = MLX4_CMD_MAP_EQ, + .has_inbox = false, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_MAP_EQ_wrapper + }, + { + .opcode = MLX4_CMD_SW2HW_EQ, + .has_inbox = true, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = true, + .verify = NULL, + .wrapper = mlx4_SW2HW_EQ_wrapper + }, + { + .opcode = MLX4_CMD_HW_HEALTH_CHECK, + .has_inbox = false, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = NULL + }, + { + .opcode = MLX4_CMD_NOP, + .has_inbox = false, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = NULL + }, + { + .opcode = MLX4_CMD_CONFIG_DEV, + .has_inbox = false, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_CMD_EPERM_wrapper + }, + { + .opcode = MLX4_CMD_ALLOC_RES, + .has_inbox = false, + .has_outbox = false, + .out_is_imm = true, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_ALLOC_RES_wrapper + }, + { + .opcode = MLX4_CMD_FREE_RES, + .has_inbox = false, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_FREE_RES_wrapper + }, + { + .opcode = MLX4_CMD_SW2HW_MPT, + .has_inbox = true, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = true, + .verify = NULL, + .wrapper = mlx4_SW2HW_MPT_wrapper + }, + { + .opcode = MLX4_CMD_QUERY_MPT, + .has_inbox = false, + .has_outbox = true, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_QUERY_MPT_wrapper + }, + { + .opcode = MLX4_CMD_HW2SW_MPT, + .has_inbox = false, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_HW2SW_MPT_wrapper + }, + { + .opcode = MLX4_CMD_READ_MTT, + .has_inbox = false, + .has_outbox = true, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = NULL + }, + { + .opcode = MLX4_CMD_WRITE_MTT, + .has_inbox = true, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_WRITE_MTT_wrapper + }, + { + .opcode = MLX4_CMD_SYNC_TPT, + .has_inbox = true, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = NULL + }, + { + .opcode = MLX4_CMD_HW2SW_EQ, + .has_inbox = false, + .has_outbox = true, + .out_is_imm = false, + .encode_slave_id = true, + .verify = NULL, + .wrapper = mlx4_HW2SW_EQ_wrapper + }, + { + .opcode = MLX4_CMD_QUERY_EQ, + .has_inbox = false, + .has_outbox = true, + .out_is_imm = false, + .encode_slave_id = true, + .verify = NULL, + .wrapper = mlx4_QUERY_EQ_wrapper + }, + { + .opcode = MLX4_CMD_SW2HW_CQ, + .has_inbox = true, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = true, + .verify = NULL, + .wrapper = mlx4_SW2HW_CQ_wrapper + }, + { + .opcode = MLX4_CMD_HW2SW_CQ, + .has_inbox = false, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_HW2SW_CQ_wrapper + }, + { + .opcode = MLX4_CMD_QUERY_CQ, + .has_inbox = false, + .has_outbox = true, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_QUERY_CQ_wrapper + }, + { + .opcode = MLX4_CMD_MODIFY_CQ, + .has_inbox = true, + .has_outbox = false, + .out_is_imm = true, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_MODIFY_CQ_wrapper + }, + { + .opcode = MLX4_CMD_SW2HW_SRQ, + .has_inbox = true, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = true, + .verify = NULL, + .wrapper = mlx4_SW2HW_SRQ_wrapper + }, + { + .opcode = MLX4_CMD_HW2SW_SRQ, + .has_inbox = false, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_HW2SW_SRQ_wrapper + }, + { + .opcode = MLX4_CMD_QUERY_SRQ, + .has_inbox = false, + .has_outbox = true, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_QUERY_SRQ_wrapper + }, + { + .opcode = MLX4_CMD_ARM_SRQ, + .has_inbox = false, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_ARM_SRQ_wrapper + }, + { + .opcode = MLX4_CMD_RST2INIT_QP, + .has_inbox = true, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = true, + .verify = NULL, + .wrapper = mlx4_RST2INIT_QP_wrapper + }, + { + .opcode = MLX4_CMD_INIT2INIT_QP, + .has_inbox = true, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_INIT2INIT_QP_wrapper + }, + { + .opcode = MLX4_CMD_INIT2RTR_QP, + .has_inbox = true, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_INIT2RTR_QP_wrapper + }, + { + .opcode = MLX4_CMD_RTR2RTS_QP, + .has_inbox = true, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_RTR2RTS_QP_wrapper + }, + { + .opcode = MLX4_CMD_RTS2RTS_QP, + .has_inbox = true, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_RTS2RTS_QP_wrapper + }, + { + .opcode = MLX4_CMD_SQERR2RTS_QP, + .has_inbox = true, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_SQERR2RTS_QP_wrapper + }, + { + .opcode = MLX4_CMD_2ERR_QP, + .has_inbox = false, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_GEN_QP_wrapper + }, + { + .opcode = MLX4_CMD_RTS2SQD_QP, + .has_inbox = false, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_GEN_QP_wrapper + }, + { + .opcode = MLX4_CMD_SQD2SQD_QP, + .has_inbox = true, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_SQD2SQD_QP_wrapper + }, + { + .opcode = MLX4_CMD_SQD2RTS_QP, + .has_inbox = true, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_SQD2RTS_QP_wrapper + }, + { + .opcode = MLX4_CMD_2RST_QP, + .has_inbox = false, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_2RST_QP_wrapper + }, + { + .opcode = MLX4_CMD_QUERY_QP, + .has_inbox = false, + .has_outbox = true, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_GEN_QP_wrapper + }, + { + .opcode = MLX4_CMD_SUSPEND_QP, + .has_inbox = false, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_GEN_QP_wrapper + }, + { + .opcode = MLX4_CMD_UNSUSPEND_QP, + .has_inbox = false, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_GEN_QP_wrapper + }, + { + .opcode = MLX4_CMD_UPDATE_QP, + .has_inbox = true, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_UPDATE_QP_wrapper + }, + { + .opcode = MLX4_CMD_GET_OP_REQ, + .has_inbox = false, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_CMD_EPERM_wrapper, + }, + { + .opcode = MLX4_CMD_CONF_SPECIAL_QP, + .has_inbox = false, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, /* XXX verify: only demux can do this */ + .wrapper = NULL + }, + { + .opcode = MLX4_CMD_MAD_IFC, + .has_inbox = true, + .has_outbox = true, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_MAD_IFC_wrapper + }, + { + .opcode = MLX4_CMD_MAD_DEMUX, + .has_inbox = false, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_CMD_EPERM_wrapper + }, + { + .opcode = MLX4_CMD_QUERY_IF_STAT, + .has_inbox = false, + .has_outbox = true, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_QUERY_IF_STAT_wrapper + }, + /* Native multicast commands are not available for guests */ + { + .opcode = MLX4_CMD_QP_ATTACH, + .has_inbox = true, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_QP_ATTACH_wrapper + }, + { + .opcode = MLX4_CMD_PROMISC, + .has_inbox = false, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_PROMISC_wrapper + }, + /* Ethernet specific commands */ + { + .opcode = MLX4_CMD_SET_VLAN_FLTR, + .has_inbox = true, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_SET_VLAN_FLTR_wrapper + }, + { + .opcode = MLX4_CMD_SET_MCAST_FLTR, + .has_inbox = false, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_SET_MCAST_FLTR_wrapper + }, + { + .opcode = MLX4_CMD_DUMP_ETH_STATS, + .has_inbox = false, + .has_outbox = true, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_DUMP_ETH_STATS_wrapper + }, + { + .opcode = MLX4_CMD_INFORM_FLR_DONE, + .has_inbox = false, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = NULL + }, + /* flow steering commands */ + { + .opcode = MLX4_QP_FLOW_STEERING_ATTACH, + .has_inbox = true, + .has_outbox = false, + .out_is_imm = true, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_QP_FLOW_STEERING_ATTACH_wrapper + }, + { + .opcode = MLX4_QP_FLOW_STEERING_DETACH, + .has_inbox = false, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_QP_FLOW_STEERING_DETACH_wrapper + }, + { + .opcode = MLX4_FLOW_STEERING_IB_UC_QP_RANGE, + .has_inbox = false, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_CMD_EPERM_wrapper + }, +}; + +static int mlx4_master_process_vhcr(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr_cmd *in_vhcr) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_cmd_info *cmd = NULL; + struct mlx4_vhcr_cmd *vhcr_cmd = in_vhcr ? in_vhcr : priv->mfunc.vhcr; + struct mlx4_vhcr *vhcr; + struct mlx4_cmd_mailbox *inbox = NULL; + struct mlx4_cmd_mailbox *outbox = NULL; + u64 in_param; + u64 out_param; + int ret = 0; + int i; + int err = 0; + + /* Create sw representation of Virtual HCR */ + vhcr = kzalloc(sizeof(struct mlx4_vhcr), GFP_KERNEL); + if (!vhcr) + return -ENOMEM; + + /* DMA in the vHCR */ + if (!in_vhcr) { + ret = mlx4_ACCESS_MEM(dev, priv->mfunc.vhcr_dma, slave, + priv->mfunc.master.slave_state[slave].vhcr_dma, + ALIGN(sizeof(struct mlx4_vhcr_cmd), + MLX4_ACCESS_MEM_ALIGN), 1); + if (ret) { + mlx4_err(dev, "%s: Failed reading vhcr ret: 0x%x\n", + __func__, ret); + kfree(vhcr); + return ret; + } + } + + /* Fill SW VHCR fields */ + vhcr->in_param = be64_to_cpu(vhcr_cmd->in_param); + vhcr->out_param = be64_to_cpu(vhcr_cmd->out_param); + vhcr->in_modifier = be32_to_cpu(vhcr_cmd->in_modifier); + vhcr->token = be16_to_cpu(vhcr_cmd->token); + vhcr->op = be16_to_cpu(vhcr_cmd->opcode) & 0xfff; + vhcr->op_modifier = (u8) (be16_to_cpu(vhcr_cmd->opcode) >> 12); + vhcr->e_bit = vhcr_cmd->flags & (1 << 6); + + /* Lookup command */ + for (i = 0; i < ARRAY_SIZE(cmd_info); ++i) { + if (vhcr->op == cmd_info[i].opcode) { + cmd = &cmd_info[i]; + break; + } + } + if (!cmd) { + mlx4_err(dev, "Unknown command:0x%x accepted from slave:%d\n", + vhcr->op, slave); + vhcr_cmd->status = CMD_STAT_BAD_PARAM; + goto out_status; + } + + /* Read inbox */ + if (cmd->has_inbox) { + vhcr->in_param &= INBOX_MASK; + inbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(inbox)) { + vhcr_cmd->status = CMD_STAT_BAD_SIZE; + inbox = NULL; + goto out_status; + } + + if (mlx4_ACCESS_MEM(dev, inbox->dma, slave, + vhcr->in_param, + MLX4_MAILBOX_SIZE, 1)) { + mlx4_err(dev, "%s: Failed reading inbox (cmd:0x%x)\n", + __func__, cmd->opcode); + vhcr_cmd->status = CMD_STAT_INTERNAL_ERR; + goto out_status; + } + } + + /* Apply permission and bound checks if applicable */ + if (cmd->verify && cmd->verify(dev, slave, vhcr, inbox)) { + mlx4_warn(dev, "Command:0x%x from slave: %d failed protection checks for resource_id:%d\n", + vhcr->op, slave, vhcr->in_modifier); + vhcr_cmd->status = CMD_STAT_BAD_OP; + goto out_status; + } + + /* Allocate outbox */ + if (cmd->has_outbox) { + outbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(outbox)) { + vhcr_cmd->status = CMD_STAT_BAD_SIZE; + outbox = NULL; + goto out_status; + } + } + + /* Execute the command! */ + if (cmd->wrapper) { + err = cmd->wrapper(dev, slave, vhcr, inbox, outbox, + cmd); + if (cmd->out_is_imm) + vhcr_cmd->out_param = cpu_to_be64(vhcr->out_param); + } else { + in_param = cmd->has_inbox ? (u64) inbox->dma : + vhcr->in_param; + out_param = cmd->has_outbox ? (u64) outbox->dma : + vhcr->out_param; + err = __mlx4_cmd(dev, in_param, &out_param, + cmd->out_is_imm, vhcr->in_modifier, + vhcr->op_modifier, vhcr->op, + MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_NATIVE); + + if (cmd->out_is_imm) { + vhcr->out_param = out_param; + vhcr_cmd->out_param = cpu_to_be64(vhcr->out_param); + } + } + + if (err) { + mlx4_warn(dev, "vhcr command:0x%x slave:%d failed with error:%d, status %d\n", + vhcr->op, slave, vhcr->errno, err); + vhcr_cmd->status = mlx4_errno_to_status(err); + goto out_status; + } + + + /* Write outbox if command completed successfully */ + if (cmd->has_outbox && !vhcr_cmd->status) { + ret = mlx4_ACCESS_MEM(dev, outbox->dma, slave, + vhcr->out_param, + MLX4_MAILBOX_SIZE, MLX4_CMD_WRAPPED); + if (ret) { + /* If we failed to write back the outbox after the + *command was successfully executed, we must fail this + * slave, as it is now in undefined state */ + mlx4_err(dev, "%s:Failed writing outbox\n", __func__); + goto out; + } + } + +out_status: + /* DMA back vhcr result */ + if (!in_vhcr) { + ret = mlx4_ACCESS_MEM(dev, priv->mfunc.vhcr_dma, slave, + priv->mfunc.master.slave_state[slave].vhcr_dma, + ALIGN(sizeof(struct mlx4_vhcr), + MLX4_ACCESS_MEM_ALIGN), + MLX4_CMD_WRAPPED); + if (ret) + mlx4_err(dev, "%s:Failed writing vhcr result\n", + __func__); + else if (vhcr->e_bit && + mlx4_GEN_EQE(dev, slave, &priv->mfunc.master.cmd_eqe)) + mlx4_warn(dev, "Failed to generate command completion eqe for slave %d\n", + slave); + } + +out: + kfree(vhcr); + mlx4_free_cmd_mailbox(dev, inbox); + mlx4_free_cmd_mailbox(dev, outbox); + return ret; +} + +static int mlx4_master_immediate_activate_vlan_qos(struct mlx4_priv *priv, + int slave, int port) +{ + struct mlx4_vport_oper_state *vp_oper; + struct mlx4_vport_state *vp_admin; + struct mlx4_vf_immed_vlan_work *work; + struct mlx4_dev *dev = &(priv->dev); + int err; + int admin_vlan_ix = NO_INDX; + + vp_oper = &priv->mfunc.master.vf_oper[slave].vport[port]; + vp_admin = &priv->mfunc.master.vf_admin[slave].vport[port]; + + if (vp_oper->state.default_vlan == vp_admin->default_vlan && + vp_oper->state.default_qos == vp_admin->default_qos && + vp_oper->state.link_state == vp_admin->link_state) + return 0; + + if (!(priv->mfunc.master.slave_state[slave].active && + dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_UPDATE_QP)) { + /* even if the UPDATE_QP command isn't supported, we still want + * to set this VF link according to the admin directive + */ + vp_oper->state.link_state = vp_admin->link_state; + return -1; + } + + mlx4_dbg(dev, "updating immediately admin params slave %d port %d\n", + slave, port); + mlx4_dbg(dev, "vlan %d QoS %d link down %d\n", + vp_admin->default_vlan, vp_admin->default_qos, + vp_admin->link_state); + + work = kzalloc(sizeof(*work), GFP_KERNEL); + if (!work) + return -ENOMEM; + + if (vp_oper->state.default_vlan != vp_admin->default_vlan) { + if (MLX4_VGT != vp_admin->default_vlan) { + err = __mlx4_register_vlan(&priv->dev, port, + vp_admin->default_vlan, + &admin_vlan_ix); + if (err) { + kfree(work); + mlx4_warn(&priv->dev, + "No vlan resources slave %d, port %d\n", + slave, port); + return err; + } + } else { + admin_vlan_ix = NO_INDX; + } + work->flags |= MLX4_VF_IMMED_VLAN_FLAG_VLAN; + mlx4_dbg(&priv->dev, + "alloc vlan %d idx %d slave %d port %d\n", + (int)(vp_admin->default_vlan), + admin_vlan_ix, slave, port); + } + + /* save original vlan ix and vlan id */ + work->orig_vlan_id = vp_oper->state.default_vlan; + work->orig_vlan_ix = vp_oper->vlan_idx; + + /* handle new qos */ + if (vp_oper->state.default_qos != vp_admin->default_qos) + work->flags |= MLX4_VF_IMMED_VLAN_FLAG_QOS; + + if (work->flags & MLX4_VF_IMMED_VLAN_FLAG_VLAN) + vp_oper->vlan_idx = admin_vlan_ix; + + vp_oper->state.default_vlan = vp_admin->default_vlan; + vp_oper->state.default_qos = vp_admin->default_qos; + vp_oper->state.link_state = vp_admin->link_state; + + if (vp_admin->link_state == IFLA_VF_LINK_STATE_DISABLE) + work->flags |= MLX4_VF_IMMED_VLAN_FLAG_LINK_DISABLE; + + /* iterate over QPs owned by this slave, using UPDATE_QP */ + work->port = port; + work->slave = slave; + work->qos = vp_oper->state.default_qos; + work->vlan_id = vp_oper->state.default_vlan; + work->vlan_ix = vp_oper->vlan_idx; + work->priv = priv; + INIT_WORK(&work->work, mlx4_vf_immed_vlan_work_handler); + queue_work(priv->mfunc.master.comm_wq, &work->work); + + return 0; +} + + +static int mlx4_master_activate_admin_state(struct mlx4_priv *priv, int slave) +{ + int port, err; + struct mlx4_vport_state *vp_admin; + struct mlx4_vport_oper_state *vp_oper; + struct mlx4_active_ports actv_ports = mlx4_get_active_ports( + &priv->dev, slave); + int min_port = find_first_bit(actv_ports.ports, + priv->dev.caps.num_ports) + 1; + int max_port = min_port - 1 + + bitmap_weight(actv_ports.ports, priv->dev.caps.num_ports); + + for (port = min_port; port <= max_port; port++) { + if (!test_bit(port - 1, actv_ports.ports)) + continue; + priv->mfunc.master.vf_oper[slave].smi_enabled[port] = + priv->mfunc.master.vf_admin[slave].enable_smi[port]; + vp_oper = &priv->mfunc.master.vf_oper[slave].vport[port]; + vp_admin = &priv->mfunc.master.vf_admin[slave].vport[port]; + vp_oper->state = *vp_admin; + if (MLX4_VGT != vp_admin->default_vlan) { + err = __mlx4_register_vlan(&priv->dev, port, + vp_admin->default_vlan, &(vp_oper->vlan_idx)); + if (err) { + vp_oper->vlan_idx = NO_INDX; + mlx4_warn(&priv->dev, + "No vlan resources slave %d, port %d\n", + slave, port); + return err; + } + mlx4_dbg(&priv->dev, "alloc vlan %d idx %d slave %d port %d\n", + (int)(vp_oper->state.default_vlan), + vp_oper->vlan_idx, slave, port); + } + if (vp_admin->spoofchk) { + vp_oper->mac_idx = __mlx4_register_mac(&priv->dev, + port, + vp_admin->mac); + if (0 > vp_oper->mac_idx) { + err = vp_oper->mac_idx; + vp_oper->mac_idx = NO_INDX; + mlx4_warn(&priv->dev, + "No mac resources slave %d, port %d\n", + slave, port); + return err; + } + mlx4_dbg(&priv->dev, "alloc mac %llx idx %d slave %d port %d\n", + vp_oper->state.mac, vp_oper->mac_idx, slave, port); + } + } + return 0; +} + +static void mlx4_master_deactivate_admin_state(struct mlx4_priv *priv, int slave) +{ + int port; + struct mlx4_vport_oper_state *vp_oper; + struct mlx4_active_ports actv_ports = mlx4_get_active_ports( + &priv->dev, slave); + int min_port = find_first_bit(actv_ports.ports, + priv->dev.caps.num_ports) + 1; + int max_port = min_port - 1 + + bitmap_weight(actv_ports.ports, priv->dev.caps.num_ports); + + + for (port = min_port; port <= max_port; port++) { + if (!test_bit(port - 1, actv_ports.ports)) + continue; + priv->mfunc.master.vf_oper[slave].smi_enabled[port] = + MLX4_VF_SMI_DISABLED; + vp_oper = &priv->mfunc.master.vf_oper[slave].vport[port]; + if (NO_INDX != vp_oper->vlan_idx) { + __mlx4_unregister_vlan(&priv->dev, + port, vp_oper->state.default_vlan); + vp_oper->vlan_idx = NO_INDX; + } + if (NO_INDX != vp_oper->mac_idx) { + __mlx4_unregister_mac(&priv->dev, port, vp_oper->state.mac); + vp_oper->mac_idx = NO_INDX; + } + } + return; +} + +static void mlx4_master_do_cmd(struct mlx4_dev *dev, int slave, u8 cmd, + u16 param, u8 toggle) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_slave_state *slave_state = priv->mfunc.master.slave_state; + u32 reply; + u8 is_going_down = 0; + int i; + unsigned long flags; + + slave_state[slave].comm_toggle ^= 1; + reply = (u32) slave_state[slave].comm_toggle << 31; + if (toggle != slave_state[slave].comm_toggle) { + mlx4_warn(dev, "Incorrect toggle %d from slave %d. *** MASTER STATE COMPROMISED ***\n", + toggle, slave); + goto reset_slave; + } + if (cmd == MLX4_COMM_CMD_RESET) { + mlx4_warn(dev, "Received reset from slave:%d\n", slave); + slave_state[slave].active = false; + slave_state[slave].old_vlan_api = false; + mlx4_master_deactivate_admin_state(priv, slave); + for (i = 0; i < MLX4_EVENT_TYPES_NUM; ++i) { + slave_state[slave].event_eq[i].eqn = -1; + slave_state[slave].event_eq[i].token = 0; + } + /*check if we are in the middle of FLR process, + if so return "retry" status to the slave*/ + if (MLX4_COMM_CMD_FLR == slave_state[slave].last_cmd) + goto inform_slave_state; + + mlx4_dispatch_event(dev, MLX4_DEV_EVENT_SLAVE_SHUTDOWN, slave); + + /* write the version in the event field */ + reply |= mlx4_comm_get_version(); + + goto reset_slave; + } + /*command from slave in the middle of FLR*/ + if (cmd != MLX4_COMM_CMD_RESET && + MLX4_COMM_CMD_FLR == slave_state[slave].last_cmd) { + mlx4_warn(dev, "slave:%d is Trying to run cmd(0x%x) in the middle of FLR\n", + slave, cmd); + return; + } + + switch (cmd) { + case MLX4_COMM_CMD_VHCR0: + if (slave_state[slave].last_cmd != MLX4_COMM_CMD_RESET) + goto reset_slave; + slave_state[slave].vhcr_dma = ((u64) param) << 48; + priv->mfunc.master.slave_state[slave].cookie = 0; + mutex_init(&priv->mfunc.master.gen_eqe_mutex[slave]); + break; + case MLX4_COMM_CMD_VHCR1: + if (slave_state[slave].last_cmd != MLX4_COMM_CMD_VHCR0) + goto reset_slave; + slave_state[slave].vhcr_dma |= ((u64) param) << 32; + break; + case MLX4_COMM_CMD_VHCR2: + if (slave_state[slave].last_cmd != MLX4_COMM_CMD_VHCR1) + goto reset_slave; + slave_state[slave].vhcr_dma |= ((u64) param) << 16; + break; + case MLX4_COMM_CMD_VHCR_EN: + if (slave_state[slave].last_cmd != MLX4_COMM_CMD_VHCR2) + goto reset_slave; + slave_state[slave].vhcr_dma |= param; + if (mlx4_master_activate_admin_state(priv, slave)) + goto reset_slave; + slave_state[slave].active = true; + mlx4_dispatch_event(dev, MLX4_DEV_EVENT_SLAVE_INIT, slave); + break; + case MLX4_COMM_CMD_VHCR_POST: + if ((slave_state[slave].last_cmd != MLX4_COMM_CMD_VHCR_EN) && + (slave_state[slave].last_cmd != MLX4_COMM_CMD_VHCR_POST)) + goto reset_slave; + + mutex_lock(&priv->cmd.slave_cmd_mutex); + if (mlx4_master_process_vhcr(dev, slave, NULL)) { + mlx4_err(dev, "Failed processing vhcr for slave:%d, resetting slave\n", + slave); + mutex_unlock(&priv->cmd.slave_cmd_mutex); + goto reset_slave; + } + mutex_unlock(&priv->cmd.slave_cmd_mutex); + break; + default: + mlx4_warn(dev, "Bad comm cmd:%d from slave:%d\n", cmd, slave); + goto reset_slave; + } + spin_lock_irqsave(&priv->mfunc.master.slave_state_lock, flags); + if (!slave_state[slave].is_slave_going_down) + slave_state[slave].last_cmd = cmd; + else + is_going_down = 1; + spin_unlock_irqrestore(&priv->mfunc.master.slave_state_lock, flags); + if (is_going_down) { + mlx4_warn(dev, "Slave is going down aborting command(%d) executing from slave:%d\n", + cmd, slave); + return; + } + __raw_writel((__force u32) cpu_to_be32(reply), + &priv->mfunc.comm[slave].slave_read); + mmiowb(); + + return; + +reset_slave: + /* cleanup any slave resources */ + mlx4_delete_all_resources_for_slave(dev, slave); + spin_lock_irqsave(&priv->mfunc.master.slave_state_lock, flags); + if (!slave_state[slave].is_slave_going_down) + slave_state[slave].last_cmd = MLX4_COMM_CMD_RESET; + spin_unlock_irqrestore(&priv->mfunc.master.slave_state_lock, flags); + /*with slave in the middle of flr, no need to clean resources again.*/ +inform_slave_state: + memset(&slave_state[slave].event_eq, 0, + sizeof(struct mlx4_slave_event_eq_info)); + __raw_writel((__force u32) cpu_to_be32(reply), + &priv->mfunc.comm[slave].slave_read); + wmb(); +} + +/* master command processing */ +void mlx4_master_comm_channel(struct work_struct *work) +{ + struct mlx4_mfunc_master_ctx *master = + container_of(work, + struct mlx4_mfunc_master_ctx, + comm_work); + struct mlx4_mfunc *mfunc = + container_of(master, struct mlx4_mfunc, master); + struct mlx4_priv *priv = + container_of(mfunc, struct mlx4_priv, mfunc); + struct mlx4_dev *dev = &priv->dev; + __be32 *bit_vec; + u32 comm_cmd; + u32 vec; + int i, j, slave; + int toggle; + int served = 0; + int reported = 0; + u32 slt; + + bit_vec = master->comm_arm_bit_vector; + for (i = 0; i < COMM_CHANNEL_BIT_ARRAY_SIZE; i++) { + vec = be32_to_cpu(bit_vec[i]); + for (j = 0; j < 32; j++) { + if (!(vec & (1 << j))) + continue; + ++reported; + slave = (i * 32) + j; + comm_cmd = swab32(readl( + &mfunc->comm[slave].slave_write)); + slt = swab32(readl(&mfunc->comm[slave].slave_read)) + >> 31; + toggle = comm_cmd >> 31; + if (toggle != slt) { + if (master->slave_state[slave].comm_toggle + != slt) { + pr_info("slave %d out of sync. read toggle %d, state toggle %d. Resynching.\n", + slave, slt, + master->slave_state[slave].comm_toggle); + master->slave_state[slave].comm_toggle = + slt; + } + mlx4_master_do_cmd(dev, slave, + comm_cmd >> 16 & 0xff, + comm_cmd & 0xffff, toggle); + ++served; + } + } + } + + if (reported && reported != served) + mlx4_warn(dev, "Got command event with bitmask from %d slaves but %d were served\n", + reported, served); + + if (mlx4_ARM_COMM_CHANNEL(dev)) + mlx4_warn(dev, "Failed to arm comm channel events\n"); +} + +static int sync_toggles(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + int wr_toggle; + int rd_toggle; + unsigned long end; + + wr_toggle = swab32(readl(&priv->mfunc.comm->slave_write)) >> 31; + end = jiffies + msecs_to_jiffies(5000); + + while (time_before(jiffies, end)) { + rd_toggle = swab32(readl(&priv->mfunc.comm->slave_read)) >> 31; + if (rd_toggle == wr_toggle) { + priv->cmd.comm_toggle = rd_toggle; + return 0; + } + + cond_resched(); + } + + /* + * we could reach here if for example the previous VM using this + * function misbehaved and left the channel with unsynced state. We + * should fix this here and give this VM a chance to use a properly + * synced channel + */ + mlx4_warn(dev, "recovering from previously mis-behaved VM\n"); + __raw_writel((__force u32) 0, &priv->mfunc.comm->slave_read); + __raw_writel((__force u32) 0, &priv->mfunc.comm->slave_write); + priv->cmd.comm_toggle = 0; + + return 0; +} + +int mlx4_multi_func_init(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_slave_state *s_state; + int i, j, err, port; + + if (mlx4_is_master(dev)) + priv->mfunc.comm = + ioremap(pci_resource_start(dev->pdev, priv->fw.comm_bar) + + priv->fw.comm_base, MLX4_COMM_PAGESIZE); + else + priv->mfunc.comm = + ioremap(pci_resource_start(dev->pdev, 2) + + MLX4_SLAVE_COMM_BASE, MLX4_COMM_PAGESIZE); + if (!priv->mfunc.comm) { + mlx4_err(dev, "Couldn't map communication vector\n"); + goto err_vhcr; + } + + if (mlx4_is_master(dev)) { + priv->mfunc.master.slave_state = + kzalloc(dev->num_slaves * + sizeof(struct mlx4_slave_state), GFP_KERNEL); + if (!priv->mfunc.master.slave_state) + goto err_comm; + + priv->mfunc.master.vf_admin = + kzalloc(dev->num_slaves * + sizeof(struct mlx4_vf_admin_state), GFP_KERNEL); + if (!priv->mfunc.master.vf_admin) + goto err_comm_admin; + + priv->mfunc.master.vf_oper = + kzalloc(dev->num_slaves * + sizeof(struct mlx4_vf_oper_state), GFP_KERNEL); + if (!priv->mfunc.master.vf_oper) + goto err_comm_oper; + + for (i = 0; i < dev->num_slaves; ++i) { + s_state = &priv->mfunc.master.slave_state[i]; + s_state->last_cmd = MLX4_COMM_CMD_RESET; + for (j = 0; j < MLX4_EVENT_TYPES_NUM; ++j) + s_state->event_eq[j].eqn = -1; + __raw_writel((__force u32) 0, + &priv->mfunc.comm[i].slave_write); + __raw_writel((__force u32) 0, + &priv->mfunc.comm[i].slave_read); + mmiowb(); + for (port = 1; port <= MLX4_MAX_PORTS; port++) { + s_state->vlan_filter[port] = + kzalloc(sizeof(struct mlx4_vlan_fltr), + GFP_KERNEL); + if (!s_state->vlan_filter[port]) { + if (--port) + kfree(s_state->vlan_filter[port]); + goto err_slaves; + } + INIT_LIST_HEAD(&s_state->mcast_filters[port]); + priv->mfunc.master.vf_admin[i].vport[port].default_vlan = MLX4_VGT; + priv->mfunc.master.vf_oper[i].vport[port].state.default_vlan = MLX4_VGT; + priv->mfunc.master.vf_oper[i].vport[port].vlan_idx = NO_INDX; + priv->mfunc.master.vf_oper[i].vport[port].mac_idx = NO_INDX; + } + spin_lock_init(&s_state->lock); + } + + memset(&priv->mfunc.master.cmd_eqe, 0, dev->caps.eqe_size); + priv->mfunc.master.cmd_eqe.type = MLX4_EVENT_TYPE_CMD; + INIT_WORK(&priv->mfunc.master.comm_work, + mlx4_master_comm_channel); + INIT_WORK(&priv->mfunc.master.slave_event_work, + mlx4_gen_slave_eqe); + INIT_WORK(&priv->mfunc.master.slave_flr_event_work, + mlx4_master_handle_slave_flr); + spin_lock_init(&priv->mfunc.master.slave_state_lock); + spin_lock_init(&priv->mfunc.master.slave_eq.event_lock); + priv->mfunc.master.comm_wq = + create_singlethread_workqueue("mlx4_comm"); + if (!priv->mfunc.master.comm_wq) + goto err_slaves; + + if (mlx4_init_resource_tracker(dev)) + goto err_thread; + + err = mlx4_ARM_COMM_CHANNEL(dev); + if (err) { + mlx4_err(dev, " Failed to arm comm channel eq: %x\n", + err); + goto err_resource; + } + + } else { + err = sync_toggles(dev); + if (err) { + mlx4_err(dev, "Couldn't sync toggles\n"); + goto err_comm; + } + } + return 0; + +err_resource: + mlx4_free_resource_tracker(dev, RES_TR_FREE_ALL); +err_thread: + flush_workqueue(priv->mfunc.master.comm_wq); + destroy_workqueue(priv->mfunc.master.comm_wq); +err_slaves: + while (--i) { + for (port = 1; port <= MLX4_MAX_PORTS; port++) + kfree(priv->mfunc.master.slave_state[i].vlan_filter[port]); + } + kfree(priv->mfunc.master.vf_oper); +err_comm_oper: + kfree(priv->mfunc.master.vf_admin); +err_comm_admin: + kfree(priv->mfunc.master.slave_state); +err_comm: + iounmap(priv->mfunc.comm); +err_vhcr: + dma_free_coherent(&(dev->pdev->dev), PAGE_SIZE, + priv->mfunc.vhcr, + priv->mfunc.vhcr_dma); + priv->mfunc.vhcr = NULL; + return -ENOMEM; +} + +int mlx4_cmd_init(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + + mutex_init(&priv->cmd.hcr_mutex); + mutex_init(&priv->cmd.slave_cmd_mutex); + sema_init(&priv->cmd.poll_sem, 1); + priv->cmd.use_events = 0; + priv->cmd.toggle = 1; + + priv->cmd.hcr = NULL; + priv->mfunc.vhcr = NULL; + + if (!mlx4_is_slave(dev)) { + priv->cmd.hcr = ioremap(pci_resource_start(dev->pdev, 0) + + MLX4_HCR_BASE, MLX4_HCR_SIZE); + if (!priv->cmd.hcr) { + mlx4_err(dev, "Couldn't map command register\n"); + return -ENOMEM; + } + } + + if (mlx4_is_mfunc(dev)) { + priv->mfunc.vhcr = dma_alloc_coherent(&(dev->pdev->dev), PAGE_SIZE, + &priv->mfunc.vhcr_dma, + GFP_KERNEL); + if (!priv->mfunc.vhcr) + goto err_hcr; + } + + priv->cmd.pool = pci_pool_create("mlx4_cmd", dev->pdev, + MLX4_MAILBOX_SIZE, + MLX4_MAILBOX_SIZE, 0); + if (!priv->cmd.pool) + goto err_vhcr; + + return 0; + +err_vhcr: + if (mlx4_is_mfunc(dev)) + dma_free_coherent(&(dev->pdev->dev), PAGE_SIZE, + priv->mfunc.vhcr, priv->mfunc.vhcr_dma); + priv->mfunc.vhcr = NULL; + +err_hcr: + if (!mlx4_is_slave(dev)) + iounmap(priv->cmd.hcr); + return -ENOMEM; +} + +void mlx4_multi_func_cleanup(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + int i, port; + + if (mlx4_is_master(dev)) { + flush_workqueue(priv->mfunc.master.comm_wq); + destroy_workqueue(priv->mfunc.master.comm_wq); + for (i = 0; i < dev->num_slaves; i++) { + for (port = 1; port <= MLX4_MAX_PORTS; port++) + kfree(priv->mfunc.master.slave_state[i].vlan_filter[port]); + } + kfree(priv->mfunc.master.slave_state); + kfree(priv->mfunc.master.vf_admin); + kfree(priv->mfunc.master.vf_oper); + } + + iounmap(priv->mfunc.comm); +} + +void mlx4_cmd_cleanup(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + + pci_pool_destroy(priv->cmd.pool); + + if (!mlx4_is_slave(dev)) + iounmap(priv->cmd.hcr); + if (mlx4_is_mfunc(dev)) + dma_free_coherent(&(dev->pdev->dev), PAGE_SIZE, + priv->mfunc.vhcr, priv->mfunc.vhcr_dma); + priv->mfunc.vhcr = NULL; +} + +/* + * Switch to using events to issue FW commands (can only be called + * after event queue for command events has been initialized). + */ +int mlx4_cmd_use_events(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + int i; + int err = 0; + + priv->cmd.context = kmalloc(priv->cmd.max_cmds * + sizeof (struct mlx4_cmd_context), + GFP_KERNEL); + if (!priv->cmd.context) + return -ENOMEM; + + for (i = 0; i < priv->cmd.max_cmds; ++i) { + priv->cmd.context[i].token = i; + priv->cmd.context[i].next = i + 1; + } + + priv->cmd.context[priv->cmd.max_cmds - 1].next = -1; + priv->cmd.free_head = 0; + + sema_init(&priv->cmd.event_sem, priv->cmd.max_cmds); + spin_lock_init(&priv->cmd.context_lock); + + for (priv->cmd.token_mask = 1; + priv->cmd.token_mask < priv->cmd.max_cmds; + priv->cmd.token_mask <<= 1) + ; /* nothing */ + --priv->cmd.token_mask; + + down(&priv->cmd.poll_sem); + priv->cmd.use_events = 1; + + return err; +} + +/* + * Switch back to polling (used when shutting down the device) + */ +void mlx4_cmd_use_polling(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + int i; + + priv->cmd.use_events = 0; + + for (i = 0; i < priv->cmd.max_cmds; ++i) + down(&priv->cmd.event_sem); + + kfree(priv->cmd.context); + + up(&priv->cmd.poll_sem); +} + +struct mlx4_cmd_mailbox *mlx4_alloc_cmd_mailbox(struct mlx4_dev *dev) +{ + struct mlx4_cmd_mailbox *mailbox; + + mailbox = kmalloc(sizeof *mailbox, GFP_KERNEL); + if (!mailbox) + return ERR_PTR(-ENOMEM); + + mailbox->buf = pci_pool_alloc(mlx4_priv(dev)->cmd.pool, GFP_KERNEL, + &mailbox->dma); + if (!mailbox->buf) { + kfree(mailbox); + return ERR_PTR(-ENOMEM); + } + + memset(mailbox->buf, 0, MLX4_MAILBOX_SIZE); + + return mailbox; +} +EXPORT_SYMBOL_GPL(mlx4_alloc_cmd_mailbox); + +void mlx4_free_cmd_mailbox(struct mlx4_dev *dev, + struct mlx4_cmd_mailbox *mailbox) +{ + if (!mailbox) + return; + + pci_pool_free(mlx4_priv(dev)->cmd.pool, mailbox->buf, mailbox->dma); + kfree(mailbox); +} +EXPORT_SYMBOL_GPL(mlx4_free_cmd_mailbox); + +u32 mlx4_comm_get_version(void) +{ + return ((u32) CMD_CHAN_IF_REV << 8) | (u32) CMD_CHAN_VER; +} + +static int mlx4_get_slave_indx(struct mlx4_dev *dev, int vf) +{ + if ((vf < 0) || (vf >= dev->num_vfs)) { + mlx4_err(dev, "Bad vf number:%d (number of activated vf: %d)\n", vf, dev->num_vfs); + return -EINVAL; + } + + return vf+1; +} + +int mlx4_get_vf_indx(struct mlx4_dev *dev, int slave) +{ + if (slave < 1 || slave > dev->num_vfs) { + mlx4_err(dev, + "Bad slave number:%d (number of activated slaves: %lu)\n", + slave, dev->num_slaves); + return -EINVAL; + } + return slave - 1; +} + +struct mlx4_active_ports mlx4_get_active_ports(struct mlx4_dev *dev, int slave) +{ + struct mlx4_active_ports actv_ports; + int vf; + + bitmap_zero(actv_ports.ports, MLX4_MAX_PORTS); + + if (slave == 0) { + bitmap_fill(actv_ports.ports, dev->caps.num_ports); + return actv_ports; + } + + vf = mlx4_get_vf_indx(dev, slave); + if (vf < 0) + return actv_ports; + + bitmap_set(actv_ports.ports, dev->dev_vfs[vf].min_port - 1, + min((int)dev->dev_vfs[mlx4_get_vf_indx(dev, slave)].n_ports, + dev->caps.num_ports)); + + return actv_ports; +} +EXPORT_SYMBOL_GPL(mlx4_get_active_ports); + +int mlx4_slave_convert_port(struct mlx4_dev *dev, int slave, int port) +{ + unsigned n; + struct mlx4_active_ports actv_ports = mlx4_get_active_ports(dev, slave); + unsigned m = bitmap_weight(actv_ports.ports, dev->caps.num_ports); + + if (port <= 0 || port > m) + return -EINVAL; + + n = find_first_bit(actv_ports.ports, dev->caps.num_ports); + if (port <= n) + port = n + 1; + + return port; +} +EXPORT_SYMBOL_GPL(mlx4_slave_convert_port); + +int mlx4_phys_to_slave_port(struct mlx4_dev *dev, int slave, int port) +{ + struct mlx4_active_ports actv_ports = mlx4_get_active_ports(dev, slave); + if (test_bit(port - 1, actv_ports.ports)) + return port - + find_first_bit(actv_ports.ports, dev->caps.num_ports); + + return -1; +} +EXPORT_SYMBOL_GPL(mlx4_phys_to_slave_port); + +struct mlx4_slaves_pport mlx4_phys_to_slaves_pport(struct mlx4_dev *dev, + int port) +{ + unsigned i; + struct mlx4_slaves_pport slaves_pport; + + bitmap_zero(slaves_pport.slaves, MLX4_MFUNC_MAX); + + if (port <= 0 || port > dev->caps.num_ports) + return slaves_pport; + + for (i = 0; i < dev->num_vfs + 1; i++) { + struct mlx4_active_ports actv_ports = + mlx4_get_active_ports(dev, i); + if (test_bit(port - 1, actv_ports.ports)) + set_bit(i, slaves_pport.slaves); + } + + return slaves_pport; +} +EXPORT_SYMBOL_GPL(mlx4_phys_to_slaves_pport); + +struct mlx4_slaves_pport mlx4_phys_to_slaves_pport_actv( + struct mlx4_dev *dev, + const struct mlx4_active_ports *crit_ports) +{ + unsigned i; + struct mlx4_slaves_pport slaves_pport; + + bitmap_zero(slaves_pport.slaves, MLX4_MFUNC_MAX); + + for (i = 0; i < dev->num_vfs + 1; i++) { + struct mlx4_active_ports actv_ports = + mlx4_get_active_ports(dev, i); + if (bitmap_equal(crit_ports->ports, actv_ports.ports, + dev->caps.num_ports)) + set_bit(i, slaves_pport.slaves); + } + + return slaves_pport; +} +EXPORT_SYMBOL_GPL(mlx4_phys_to_slaves_pport_actv); + +static int mlx4_slaves_closest_port(struct mlx4_dev *dev, int slave, int port) +{ + struct mlx4_active_ports actv_ports = mlx4_get_active_ports(dev, slave); + int min_port = find_first_bit(actv_ports.ports, dev->caps.num_ports) + + 1; + int max_port = min_port + + bitmap_weight(actv_ports.ports, dev->caps.num_ports); + + if (port < min_port) + port = min_port; + else if (port >= max_port) + port = max_port - 1; + + return port; +} + +int mlx4_set_vf_mac(struct mlx4_dev *dev, int port, int vf, u64 mac) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_vport_state *s_info; + int slave; + + if (!mlx4_is_master(dev)) + return -EPROTONOSUPPORT; + + slave = mlx4_get_slave_indx(dev, vf); + if (slave < 0) + return -EINVAL; + + port = mlx4_slaves_closest_port(dev, slave, port); + s_info = &priv->mfunc.master.vf_admin[slave].vport[port]; + s_info->mac = mac; + mlx4_info(dev, "default mac on vf %d port %d to %llX will take afect only after vf restart\n", + vf, port, s_info->mac); + return 0; +} +EXPORT_SYMBOL_GPL(mlx4_set_vf_mac); + + +int mlx4_set_vf_vlan(struct mlx4_dev *dev, int port, int vf, u16 vlan, u8 qos) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_vport_state *vf_admin; + int slave; + + if ((!mlx4_is_master(dev)) || + !(dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_VLAN_CONTROL)) + return -EPROTONOSUPPORT; + + if ((vlan > 4095) || (qos > 7)) + return -EINVAL; + + slave = mlx4_get_slave_indx(dev, vf); + if (slave < 0) + return -EINVAL; + + port = mlx4_slaves_closest_port(dev, slave, port); + vf_admin = &priv->mfunc.master.vf_admin[slave].vport[port]; + + if ((0 == vlan) && (0 == qos)) + vf_admin->default_vlan = MLX4_VGT; + else + vf_admin->default_vlan = vlan; + vf_admin->default_qos = qos; + + if (mlx4_master_immediate_activate_vlan_qos(priv, slave, port)) + mlx4_info(dev, + "updating vf %d port %d config will take effect on next VF restart\n", + vf, port); + return 0; +} +EXPORT_SYMBOL_GPL(mlx4_set_vf_vlan); + + /* mlx4_get_slave_default_vlan - + * return true if VST ( default vlan) + * if VST, will return vlan & qos (if not NULL) + */ +bool mlx4_get_slave_default_vlan(struct mlx4_dev *dev, int port, int slave, + u16 *vlan, u8 *qos) +{ + struct mlx4_vport_oper_state *vp_oper; + struct mlx4_priv *priv; + + priv = mlx4_priv(dev); + port = mlx4_slaves_closest_port(dev, slave, port); + vp_oper = &priv->mfunc.master.vf_oper[slave].vport[port]; + + if (MLX4_VGT != vp_oper->state.default_vlan) { + if (vlan) + *vlan = vp_oper->state.default_vlan; + if (qos) + *qos = vp_oper->state.default_qos; + return true; + } + return false; +} +EXPORT_SYMBOL_GPL(mlx4_get_slave_default_vlan); + +int mlx4_set_vf_spoofchk(struct mlx4_dev *dev, int port, int vf, bool setting) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_vport_state *s_info; + int slave; + + if ((!mlx4_is_master(dev)) || + !(dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_FSM)) + return -EPROTONOSUPPORT; + + slave = mlx4_get_slave_indx(dev, vf); + if (slave < 0) + return -EINVAL; + + port = mlx4_slaves_closest_port(dev, slave, port); + s_info = &priv->mfunc.master.vf_admin[slave].vport[port]; + s_info->spoofchk = setting; + + return 0; +} +EXPORT_SYMBOL_GPL(mlx4_set_vf_spoofchk); + +int mlx4_get_vf_config(struct mlx4_dev *dev, int port, int vf, struct ifla_vf_info *ivf) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_vport_state *s_info; + int slave; + + if (!mlx4_is_master(dev)) + return -EPROTONOSUPPORT; + + slave = mlx4_get_slave_indx(dev, vf); + if (slave < 0) + return -EINVAL; + + s_info = &priv->mfunc.master.vf_admin[slave].vport[port]; + ivf->vf = vf; + + /* need to convert it to a func */ + ivf->mac[0] = ((s_info->mac >> (5*8)) & 0xff); + ivf->mac[1] = ((s_info->mac >> (4*8)) & 0xff); + ivf->mac[2] = ((s_info->mac >> (3*8)) & 0xff); + ivf->mac[3] = ((s_info->mac >> (2*8)) & 0xff); + ivf->mac[4] = ((s_info->mac >> (1*8)) & 0xff); + ivf->mac[5] = ((s_info->mac) & 0xff); + + ivf->vlan = s_info->default_vlan; + ivf->qos = s_info->default_qos; + ivf->max_tx_rate = s_info->tx_rate; + ivf->min_tx_rate = 0; + ivf->spoofchk = s_info->spoofchk; + ivf->linkstate = s_info->link_state; + + return 0; +} +EXPORT_SYMBOL_GPL(mlx4_get_vf_config); + +int mlx4_set_vf_link_state(struct mlx4_dev *dev, int port, int vf, int link_state) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_vport_state *s_info; + int slave; + u8 link_stat_event; + + slave = mlx4_get_slave_indx(dev, vf); + if (slave < 0) + return -EINVAL; + + port = mlx4_slaves_closest_port(dev, slave, port); + switch (link_state) { + case IFLA_VF_LINK_STATE_AUTO: + /* get current link state */ + if (!priv->sense.do_sense_port[port]) + link_stat_event = MLX4_PORT_CHANGE_SUBTYPE_ACTIVE; + else + link_stat_event = MLX4_PORT_CHANGE_SUBTYPE_DOWN; + break; + + case IFLA_VF_LINK_STATE_ENABLE: + link_stat_event = MLX4_PORT_CHANGE_SUBTYPE_ACTIVE; + break; + + case IFLA_VF_LINK_STATE_DISABLE: + link_stat_event = MLX4_PORT_CHANGE_SUBTYPE_DOWN; + break; + + default: + mlx4_warn(dev, "unknown value for link_state %02x on slave %d port %d\n", + link_state, slave, port); + return -EINVAL; + }; + s_info = &priv->mfunc.master.vf_admin[slave].vport[port]; + s_info->link_state = link_state; + + /* send event */ + mlx4_gen_port_state_change_eqe(dev, slave, port, link_stat_event); + + if (mlx4_master_immediate_activate_vlan_qos(priv, slave, port)) + mlx4_dbg(dev, + "updating vf %d port %d no link state HW enforcment\n", + vf, port); + return 0; +} +EXPORT_SYMBOL_GPL(mlx4_set_vf_link_state); + +int mlx4_vf_smi_enabled(struct mlx4_dev *dev, int slave, int port) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + + if (slave < 1 || slave >= dev->num_slaves || + port < 1 || port > MLX4_MAX_PORTS) + return 0; + + return priv->mfunc.master.vf_oper[slave].smi_enabled[port] == + MLX4_VF_SMI_ENABLED; +} +EXPORT_SYMBOL_GPL(mlx4_vf_smi_enabled); + +int mlx4_vf_get_enable_smi_admin(struct mlx4_dev *dev, int slave, int port) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + + if (slave == mlx4_master_func_num(dev)) + return 1; + + if (slave < 1 || slave >= dev->num_slaves || + port < 1 || port > MLX4_MAX_PORTS) + return 0; + + return priv->mfunc.master.vf_admin[slave].enable_smi[port] == + MLX4_VF_SMI_ENABLED; +} +EXPORT_SYMBOL_GPL(mlx4_vf_get_enable_smi_admin); + +int mlx4_vf_set_enable_smi_admin(struct mlx4_dev *dev, int slave, int port, + int enabled) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + + if (slave == mlx4_master_func_num(dev)) + return 0; + + if (slave < 1 || slave >= dev->num_slaves || + port < 1 || port > MLX4_MAX_PORTS || + enabled < 0 || enabled > 1) + return -EINVAL; + + priv->mfunc.master.vf_admin[slave].enable_smi[port] = enabled; + return 0; +} +EXPORT_SYMBOL_GPL(mlx4_vf_set_enable_smi_admin); diff --git a/drivers/net/ethernet/mellanox/mlx4/cq.c b/drivers/net/ethernet/mellanox/mlx4/cq.c new file mode 100644 index 0000000..56022d6 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4/cq.c @@ -0,0 +1,359 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2005, 2006, 2007, 2008 Mellanox Technologies. All rights reserved. + * Copyright (c) 2004 Voltaire, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include + +#include +#include + +#include "mlx4.h" +#include "icm.h" + +#define MLX4_CQ_STATUS_OK ( 0 << 28) +#define MLX4_CQ_STATUS_OVERFLOW ( 9 << 28) +#define MLX4_CQ_STATUS_WRITE_FAIL (10 << 28) +#define MLX4_CQ_FLAG_CC ( 1 << 18) +#define MLX4_CQ_FLAG_OI ( 1 << 17) +#define MLX4_CQ_STATE_ARMED ( 9 << 8) +#define MLX4_CQ_STATE_ARMED_SOL ( 6 << 8) +#define MLX4_EQ_STATE_FIRED (10 << 8) + +void mlx4_cq_completion(struct mlx4_dev *dev, u32 cqn) +{ + struct mlx4_cq *cq; + + cq = radix_tree_lookup(&mlx4_priv(dev)->cq_table.tree, + cqn & (dev->caps.num_cqs - 1)); + if (!cq) { + mlx4_dbg(dev, "Completion event for bogus CQ %08x\n", cqn); + return; + } + + ++cq->arm_sn; + + cq->comp(cq); +} + +void mlx4_cq_event(struct mlx4_dev *dev, u32 cqn, int event_type) +{ + struct mlx4_cq_table *cq_table = &mlx4_priv(dev)->cq_table; + struct mlx4_cq *cq; + + spin_lock(&cq_table->lock); + + cq = radix_tree_lookup(&cq_table->tree, cqn & (dev->caps.num_cqs - 1)); + if (cq) + atomic_inc(&cq->refcount); + + spin_unlock(&cq_table->lock); + + if (!cq) { + mlx4_warn(dev, "Async event for bogus CQ %08x\n", cqn); + return; + } + + cq->event(cq, event_type); + + if (atomic_dec_and_test(&cq->refcount)) + complete(&cq->free); +} + +static int mlx4_SW2HW_CQ(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox, + int cq_num) +{ + return mlx4_cmd(dev, mailbox->dma, cq_num, 0, + MLX4_CMD_SW2HW_CQ, MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_WRAPPED); +} + +static int mlx4_MODIFY_CQ(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox, + int cq_num, u32 opmod) +{ + return mlx4_cmd(dev, mailbox->dma, cq_num, opmod, MLX4_CMD_MODIFY_CQ, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED); +} + +static int mlx4_HW2SW_CQ(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox, + int cq_num) +{ + return mlx4_cmd_box(dev, 0, mailbox ? mailbox->dma : 0, + cq_num, mailbox ? 0 : 1, MLX4_CMD_HW2SW_CQ, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED); +} + +int mlx4_cq_modify(struct mlx4_dev *dev, struct mlx4_cq *cq, + u16 count, u16 period) +{ + struct mlx4_cmd_mailbox *mailbox; + struct mlx4_cq_context *cq_context; + int err; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + + cq_context = mailbox->buf; + cq_context->cq_max_count = cpu_to_be16(count); + cq_context->cq_period = cpu_to_be16(period); + + err = mlx4_MODIFY_CQ(dev, mailbox, cq->cqn, 1); + + mlx4_free_cmd_mailbox(dev, mailbox); + return err; +} +EXPORT_SYMBOL_GPL(mlx4_cq_modify); + +int mlx4_cq_resize(struct mlx4_dev *dev, struct mlx4_cq *cq, + int entries, struct mlx4_mtt *mtt) +{ + struct mlx4_cmd_mailbox *mailbox; + struct mlx4_cq_context *cq_context; + u64 mtt_addr; + int err; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + + cq_context = mailbox->buf; + cq_context->logsize_usrpage = cpu_to_be32(ilog2(entries) << 24); + cq_context->log_page_size = mtt->page_shift - 12; + mtt_addr = mlx4_mtt_addr(dev, mtt); + cq_context->mtt_base_addr_h = mtt_addr >> 32; + cq_context->mtt_base_addr_l = cpu_to_be32(mtt_addr & 0xffffffff); + + err = mlx4_MODIFY_CQ(dev, mailbox, cq->cqn, 0); + + mlx4_free_cmd_mailbox(dev, mailbox); + return err; +} +EXPORT_SYMBOL_GPL(mlx4_cq_resize); + +int __mlx4_cq_alloc_icm(struct mlx4_dev *dev, int *cqn) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_cq_table *cq_table = &priv->cq_table; + int err; + + *cqn = mlx4_bitmap_alloc(&cq_table->bitmap); + if (*cqn == -1) + return -ENOMEM; + + err = mlx4_table_get(dev, &cq_table->table, *cqn, GFP_KERNEL); + if (err) + goto err_out; + + err = mlx4_table_get(dev, &cq_table->cmpt_table, *cqn, GFP_KERNEL); + if (err) + goto err_put; + return 0; + +err_put: + mlx4_table_put(dev, &cq_table->table, *cqn); + +err_out: + mlx4_bitmap_free(&cq_table->bitmap, *cqn, MLX4_NO_RR); + return err; +} + +static int mlx4_cq_alloc_icm(struct mlx4_dev *dev, int *cqn) +{ + u64 out_param; + int err; + + if (mlx4_is_mfunc(dev)) { + err = mlx4_cmd_imm(dev, 0, &out_param, RES_CQ, + RES_OP_RESERVE_AND_MAP, MLX4_CMD_ALLOC_RES, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED); + if (err) + return err; + else { + *cqn = get_param_l(&out_param); + return 0; + } + } + return __mlx4_cq_alloc_icm(dev, cqn); +} + +void __mlx4_cq_free_icm(struct mlx4_dev *dev, int cqn) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_cq_table *cq_table = &priv->cq_table; + + mlx4_table_put(dev, &cq_table->cmpt_table, cqn); + mlx4_table_put(dev, &cq_table->table, cqn); + mlx4_bitmap_free(&cq_table->bitmap, cqn, MLX4_NO_RR); +} + +static void mlx4_cq_free_icm(struct mlx4_dev *dev, int cqn) +{ + u64 in_param = 0; + int err; + + if (mlx4_is_mfunc(dev)) { + set_param_l(&in_param, cqn); + err = mlx4_cmd(dev, in_param, RES_CQ, RES_OP_RESERVE_AND_MAP, + MLX4_CMD_FREE_RES, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED); + if (err) + mlx4_warn(dev, "Failed freeing cq:%d\n", cqn); + } else + __mlx4_cq_free_icm(dev, cqn); +} + +int mlx4_cq_alloc(struct mlx4_dev *dev, int nent, + struct mlx4_mtt *mtt, struct mlx4_uar *uar, u64 db_rec, + struct mlx4_cq *cq, unsigned vector, int collapsed, + int timestamp_en) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_cq_table *cq_table = &priv->cq_table; + struct mlx4_cmd_mailbox *mailbox; + struct mlx4_cq_context *cq_context; + u64 mtt_addr; + int err; + + if (vector > dev->caps.num_comp_vectors + dev->caps.comp_pool) + return -EINVAL; + + cq->vector = vector; + + err = mlx4_cq_alloc_icm(dev, &cq->cqn); + if (err) + return err; + + spin_lock_irq(&cq_table->lock); + err = radix_tree_insert(&cq_table->tree, cq->cqn, cq); + spin_unlock_irq(&cq_table->lock); + if (err) + goto err_icm; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) { + err = PTR_ERR(mailbox); + goto err_radix; + } + + cq_context = mailbox->buf; + cq_context->flags = cpu_to_be32(!!collapsed << 18); + if (timestamp_en) + cq_context->flags |= cpu_to_be32(1 << 19); + + cq_context->logsize_usrpage = cpu_to_be32((ilog2(nent) << 24) | uar->index); + cq_context->comp_eqn = priv->eq_table.eq[vector].eqn; + cq_context->log_page_size = mtt->page_shift - MLX4_ICM_PAGE_SHIFT; + + mtt_addr = mlx4_mtt_addr(dev, mtt); + cq_context->mtt_base_addr_h = mtt_addr >> 32; + cq_context->mtt_base_addr_l = cpu_to_be32(mtt_addr & 0xffffffff); + cq_context->db_rec_addr = cpu_to_be64(db_rec); + + err = mlx4_SW2HW_CQ(dev, mailbox, cq->cqn); + mlx4_free_cmd_mailbox(dev, mailbox); + if (err) + goto err_radix; + + cq->cons_index = 0; + cq->arm_sn = 1; + cq->uar = uar; + atomic_set(&cq->refcount, 1); + init_completion(&cq->free); + + cq->irq = priv->eq_table.eq[cq->vector].irq; + return 0; + +err_radix: + spin_lock_irq(&cq_table->lock); + radix_tree_delete(&cq_table->tree, cq->cqn); + spin_unlock_irq(&cq_table->lock); + +err_icm: + mlx4_cq_free_icm(dev, cq->cqn); + + return err; +} +EXPORT_SYMBOL_GPL(mlx4_cq_alloc); + +void mlx4_cq_free(struct mlx4_dev *dev, struct mlx4_cq *cq) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_cq_table *cq_table = &priv->cq_table; + int err; + + err = mlx4_HW2SW_CQ(dev, NULL, cq->cqn); + if (err) + mlx4_warn(dev, "HW2SW_CQ failed (%d) for CQN %06x\n", err, cq->cqn); + + synchronize_irq(priv->eq_table.eq[cq->vector].irq); + + spin_lock_irq(&cq_table->lock); + radix_tree_delete(&cq_table->tree, cq->cqn); + spin_unlock_irq(&cq_table->lock); + + if (atomic_dec_and_test(&cq->refcount)) + complete(&cq->free); + wait_for_completion(&cq->free); + + mlx4_cq_free_icm(dev, cq->cqn); +} +EXPORT_SYMBOL_GPL(mlx4_cq_free); + +int mlx4_init_cq_table(struct mlx4_dev *dev) +{ + struct mlx4_cq_table *cq_table = &mlx4_priv(dev)->cq_table; + int err; + + spin_lock_init(&cq_table->lock); + INIT_RADIX_TREE(&cq_table->tree, GFP_ATOMIC); + if (mlx4_is_slave(dev)) + return 0; + + err = mlx4_bitmap_init(&cq_table->bitmap, dev->caps.num_cqs, + dev->caps.num_cqs - 1, dev->caps.reserved_cqs, 0); + if (err) + return err; + + return 0; +} + +void mlx4_cleanup_cq_table(struct mlx4_dev *dev) +{ + if (mlx4_is_slave(dev)) + return; + /* Nothing to do to clean up radix_tree */ + mlx4_bitmap_cleanup(&mlx4_priv(dev)->cq_table.bitmap); +} diff --git a/drivers/net/ethernet/mellanox/mlx4/en_clock.c b/drivers/net/ethernet/mellanox/mlx4/en_clock.c new file mode 100644 index 0000000..57dda95 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4/en_clock.c @@ -0,0 +1,334 @@ +/* + * Copyright (c) 2012 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include + +#include "mlx4_en.h" + +int mlx4_en_timestamp_config(struct net_device *dev, int tx_type, int rx_filter) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + struct mlx4_en_dev *mdev = priv->mdev; + int port_up = 0; + int err = 0; + + if (priv->hwtstamp_config.tx_type == tx_type && + priv->hwtstamp_config.rx_filter == rx_filter) + return 0; + + mutex_lock(&mdev->state_lock); + if (priv->port_up) { + port_up = 1; + mlx4_en_stop_port(dev, 1); + } + + mlx4_en_free_resources(priv); + + en_warn(priv, "Changing Time Stamp configuration\n"); + + priv->hwtstamp_config.tx_type = tx_type; + priv->hwtstamp_config.rx_filter = rx_filter; + + if (rx_filter != HWTSTAMP_FILTER_NONE) + dev->features &= ~NETIF_F_HW_VLAN_CTAG_RX; + else + dev->features |= NETIF_F_HW_VLAN_CTAG_RX; + + err = mlx4_en_alloc_resources(priv); + if (err) { + en_err(priv, "Failed reallocating port resources\n"); + goto out; + } + if (port_up) { + err = mlx4_en_start_port(dev); + if (err) + en_err(priv, "Failed starting port\n"); + } + +out: + mutex_unlock(&mdev->state_lock); + netdev_features_change(dev); + return err; +} + +/* mlx4_en_read_clock - read raw cycle counter (to be used by time counter) + */ +static cycle_t mlx4_en_read_clock(const struct cyclecounter *tc) +{ + struct mlx4_en_dev *mdev = + container_of(tc, struct mlx4_en_dev, cycles); + struct mlx4_dev *dev = mdev->dev; + + return mlx4_read_clock(dev) & tc->mask; +} + +u64 mlx4_en_get_cqe_ts(struct mlx4_cqe *cqe) +{ + u64 hi, lo; + struct mlx4_ts_cqe *ts_cqe = (struct mlx4_ts_cqe *)cqe; + + lo = (u64)be16_to_cpu(ts_cqe->timestamp_lo); + hi = ((u64)be32_to_cpu(ts_cqe->timestamp_hi) + !lo) << 16; + + return hi | lo; +} + +void mlx4_en_fill_hwtstamps(struct mlx4_en_dev *mdev, + struct skb_shared_hwtstamps *hwts, + u64 timestamp) +{ + unsigned long flags; + u64 nsec; + + read_lock_irqsave(&mdev->clock_lock, flags); + nsec = timecounter_cyc2time(&mdev->clock, timestamp); + read_unlock_irqrestore(&mdev->clock_lock, flags); + + memset(hwts, 0, sizeof(struct skb_shared_hwtstamps)); + hwts->hwtstamp = ns_to_ktime(nsec); +} + +/** + * mlx4_en_remove_timestamp - disable PTP device + * @mdev: board private structure + * + * Stop the PTP support. + **/ +void mlx4_en_remove_timestamp(struct mlx4_en_dev *mdev) +{ + if (mdev->ptp_clock) { + ptp_clock_unregister(mdev->ptp_clock); + mdev->ptp_clock = NULL; + mlx4_info(mdev, "removed PHC\n"); + } +} + +void mlx4_en_ptp_overflow_check(struct mlx4_en_dev *mdev) +{ + bool timeout = time_is_before_jiffies(mdev->last_overflow_check + + mdev->overflow_period); + unsigned long flags; + + if (timeout) { + write_lock_irqsave(&mdev->clock_lock, flags); + timecounter_read(&mdev->clock); + write_unlock_irqrestore(&mdev->clock_lock, flags); + mdev->last_overflow_check = jiffies; + } +} + +/** + * mlx4_en_phc_adjfreq - adjust the frequency of the hardware clock + * @ptp: ptp clock structure + * @delta: Desired frequency change in parts per billion + * + * Adjust the frequency of the PHC cycle counter by the indicated delta from + * the base frequency. + **/ +static int mlx4_en_phc_adjfreq(struct ptp_clock_info *ptp, s32 delta) +{ + u64 adj; + u32 diff, mult; + int neg_adj = 0; + unsigned long flags; + struct mlx4_en_dev *mdev = container_of(ptp, struct mlx4_en_dev, + ptp_clock_info); + + if (delta < 0) { + neg_adj = 1; + delta = -delta; + } + mult = mdev->nominal_c_mult; + adj = mult; + adj *= delta; + diff = div_u64(adj, 1000000000ULL); + + write_lock_irqsave(&mdev->clock_lock, flags); + timecounter_read(&mdev->clock); + mdev->cycles.mult = neg_adj ? mult - diff : mult + diff; + write_unlock_irqrestore(&mdev->clock_lock, flags); + + return 0; +} + +/** + * mlx4_en_phc_adjtime - Shift the time of the hardware clock + * @ptp: ptp clock structure + * @delta: Desired change in nanoseconds + * + * Adjust the timer by resetting the timecounter structure. + **/ +static int mlx4_en_phc_adjtime(struct ptp_clock_info *ptp, s64 delta) +{ + struct mlx4_en_dev *mdev = container_of(ptp, struct mlx4_en_dev, + ptp_clock_info); + unsigned long flags; + s64 now; + + write_lock_irqsave(&mdev->clock_lock, flags); + now = timecounter_read(&mdev->clock); + now += delta; + timecounter_init(&mdev->clock, &mdev->cycles, now); + write_unlock_irqrestore(&mdev->clock_lock, flags); + + return 0; +} + +/** + * mlx4_en_phc_gettime - Reads the current time from the hardware clock + * @ptp: ptp clock structure + * @ts: timespec structure to hold the current time value + * + * Read the timecounter and return the correct value in ns after converting + * it into a struct timespec. + **/ +static int mlx4_en_phc_gettime(struct ptp_clock_info *ptp, struct timespec *ts) +{ + struct mlx4_en_dev *mdev = container_of(ptp, struct mlx4_en_dev, + ptp_clock_info); + unsigned long flags; + u32 remainder; + u64 ns; + + write_lock_irqsave(&mdev->clock_lock, flags); + ns = timecounter_read(&mdev->clock); + write_unlock_irqrestore(&mdev->clock_lock, flags); + + ts->tv_sec = div_u64_rem(ns, NSEC_PER_SEC, &remainder); + ts->tv_nsec = remainder; + + return 0; +} + +/** + * mlx4_en_phc_settime - Set the current time on the hardware clock + * @ptp: ptp clock structure + * @ts: timespec containing the new time for the cycle counter + * + * Reset the timecounter to use a new base value instead of the kernel + * wall timer value. + **/ +static int mlx4_en_phc_settime(struct ptp_clock_info *ptp, + const struct timespec *ts) +{ + struct mlx4_en_dev *mdev = container_of(ptp, struct mlx4_en_dev, + ptp_clock_info); + u64 ns = timespec_to_ns(ts); + unsigned long flags; + + /* reset the timecounter */ + write_lock_irqsave(&mdev->clock_lock, flags); + timecounter_init(&mdev->clock, &mdev->cycles, ns); + write_unlock_irqrestore(&mdev->clock_lock, flags); + + return 0; +} + +/** + * mlx4_en_phc_enable - enable or disable an ancillary feature + * @ptp: ptp clock structure + * @request: Desired resource to enable or disable + * @on: Caller passes one to enable or zero to disable + * + * Enable (or disable) ancillary features of the PHC subsystem. + * Currently, no ancillary features are supported. + **/ +static int mlx4_en_phc_enable(struct ptp_clock_info __always_unused *ptp, + struct ptp_clock_request __always_unused *request, + int __always_unused on) +{ + return -EOPNOTSUPP; +} + +static const struct ptp_clock_info mlx4_en_ptp_clock_info = { + .owner = THIS_MODULE, + .max_adj = 100000000, + .n_alarm = 0, + .n_ext_ts = 0, + .n_per_out = 0, + .n_pins = 0, + .pps = 0, + .adjfreq = mlx4_en_phc_adjfreq, + .adjtime = mlx4_en_phc_adjtime, + .gettime = mlx4_en_phc_gettime, + .settime = mlx4_en_phc_settime, + .enable = mlx4_en_phc_enable, +}; + +void mlx4_en_init_timestamp(struct mlx4_en_dev *mdev) +{ + struct mlx4_dev *dev = mdev->dev; + unsigned long flags; + u64 ns; + + rwlock_init(&mdev->clock_lock); + + memset(&mdev->cycles, 0, sizeof(mdev->cycles)); + mdev->cycles.read = mlx4_en_read_clock; + mdev->cycles.mask = CLOCKSOURCE_MASK(48); + /* Using shift to make calculation more accurate. Since current HW + * clock frequency is 427 MHz, and cycles are given using a 48 bits + * register, the biggest shift when calculating using u64, is 14 + * (max_cycles * multiplier < 2^64) + */ + mdev->cycles.shift = 14; + mdev->cycles.mult = + clocksource_khz2mult(1000 * dev->caps.hca_core_clock, mdev->cycles.shift); + mdev->nominal_c_mult = mdev->cycles.mult; + + write_lock_irqsave(&mdev->clock_lock, flags); + timecounter_init(&mdev->clock, &mdev->cycles, + ktime_to_ns(ktime_get_real())); + write_unlock_irqrestore(&mdev->clock_lock, flags); + + /* Calculate period in seconds to call the overflow watchdog - to make + * sure counter is checked at least once every wrap around. + */ + ns = cyclecounter_cyc2ns(&mdev->cycles, mdev->cycles.mask); + do_div(ns, NSEC_PER_SEC / 2 / HZ); + mdev->overflow_period = ns; + + /* Configure the PHC */ + mdev->ptp_clock_info = mlx4_en_ptp_clock_info; + snprintf(mdev->ptp_clock_info.name, 16, "mlx4 ptp"); + + mdev->ptp_clock = ptp_clock_register(&mdev->ptp_clock_info, + &mdev->pdev->dev); + if (IS_ERR(mdev->ptp_clock)) { + mdev->ptp_clock = NULL; + mlx4_err(mdev, "ptp_clock_register failed\n"); + } else { + mlx4_info(mdev, "registered PHC clock\n"); + } + +} diff --git a/drivers/net/ethernet/mellanox/mlx4/en_cq.c b/drivers/net/ethernet/mellanox/mlx4/en_cq.c new file mode 100644 index 0000000..82322b1 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4/en_cq.c @@ -0,0 +1,232 @@ +/* + * Copyright (c) 2007 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include +#include +#include + +#include "mlx4_en.h" + +static void mlx4_en_cq_event(struct mlx4_cq *cq, enum mlx4_event event) +{ + return; +} + + +int mlx4_en_create_cq(struct mlx4_en_priv *priv, + struct mlx4_en_cq **pcq, + int entries, int ring, enum cq_type mode, + int node) +{ + struct mlx4_en_dev *mdev = priv->mdev; + struct mlx4_en_cq *cq; + int err; + + cq = kzalloc_node(sizeof(*cq), GFP_KERNEL, node); + if (!cq) { + cq = kzalloc(sizeof(*cq), GFP_KERNEL); + if (!cq) { + en_err(priv, "Failed to allocate CQ structure\n"); + return -ENOMEM; + } + } + + cq->size = entries; + cq->buf_size = cq->size * mdev->dev->caps.cqe_size; + + cq->ring = ring; + cq->is_tx = mode; + + /* Allocate HW buffers on provided NUMA node. + * dev->numa_node is used in mtt range allocation flow. + */ + set_dev_node(&mdev->dev->pdev->dev, node); + err = mlx4_alloc_hwq_res(mdev->dev, &cq->wqres, + cq->buf_size, 2 * PAGE_SIZE); + set_dev_node(&mdev->dev->pdev->dev, mdev->dev->numa_node); + if (err) + goto err_cq; + + err = mlx4_en_map_buffer(&cq->wqres.buf); + if (err) + goto err_res; + + cq->buf = (struct mlx4_cqe *)cq->wqres.buf.direct.buf; + *pcq = cq; + + return 0; + +err_res: + mlx4_free_hwq_res(mdev->dev, &cq->wqres, cq->buf_size); +err_cq: + kfree(cq); + *pcq = NULL; + return err; +} + +int mlx4_en_activate_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq, + int cq_idx) +{ + struct mlx4_en_dev *mdev = priv->mdev; + int err = 0; + char name[25]; + int timestamp_en = 0; + struct cpu_rmap *rmap = +#ifdef CONFIG_RFS_ACCEL + priv->dev->rx_cpu_rmap; +#else + NULL; +#endif + + cq->dev = mdev->pndev[priv->port]; + cq->mcq.set_ci_db = cq->wqres.db.db; + cq->mcq.arm_db = cq->wqres.db.db + 1; + *cq->mcq.set_ci_db = 0; + *cq->mcq.arm_db = 0; + memset(cq->buf, 0, cq->buf_size); + + if (cq->is_tx == RX) { + if (mdev->dev->caps.comp_pool) { + if (!cq->vector) { + sprintf(name, "%s-%d", priv->dev->name, + cq->ring); + /* Set IRQ for specific name (per ring) */ + if (mlx4_assign_eq(mdev->dev, name, rmap, + &cq->vector)) { + cq->vector = (cq->ring + 1 + priv->port) + % mdev->dev->caps.num_comp_vectors; + mlx4_warn(mdev, "Failed assigning an EQ to %s, falling back to legacy EQ's\n", + name); + } + + } + } else { + cq->vector = (cq->ring + 1 + priv->port) % + mdev->dev->caps.num_comp_vectors; + } + + cq->irq_desc = + irq_to_desc(mlx4_eq_get_irq(mdev->dev, + cq->vector)); + } else { + /* For TX we use the same irq per + ring we assigned for the RX */ + struct mlx4_en_cq *rx_cq; + + cq_idx = cq_idx % priv->rx_ring_num; + rx_cq = priv->rx_cq[cq_idx]; + cq->vector = rx_cq->vector; + } + + if (!cq->is_tx) + cq->size = priv->rx_ring[cq->ring]->actual_size; + + if ((cq->is_tx && priv->hwtstamp_config.tx_type) || + (!cq->is_tx && priv->hwtstamp_config.rx_filter)) + timestamp_en = 1; + + err = mlx4_cq_alloc(mdev->dev, cq->size, &cq->wqres.mtt, + &mdev->priv_uar, cq->wqres.db.dma, &cq->mcq, + cq->vector, 0, timestamp_en); + if (err) + return err; + + cq->mcq.comp = cq->is_tx ? mlx4_en_tx_irq : mlx4_en_rx_irq; + cq->mcq.event = mlx4_en_cq_event; + + if (cq->is_tx) { + netif_napi_add(cq->dev, &cq->napi, mlx4_en_poll_tx_cq, + NAPI_POLL_WEIGHT); + } else { + struct mlx4_en_rx_ring *ring = priv->rx_ring[cq->ring]; + + err = irq_set_affinity_hint(cq->mcq.irq, + ring->affinity_mask); + if (err) + mlx4_warn(mdev, "Failed setting affinity hint\n"); + + netif_napi_add(cq->dev, &cq->napi, mlx4_en_poll_rx_cq, 64); + napi_hash_add(&cq->napi); + } + + napi_enable(&cq->napi); + + return 0; +} + +void mlx4_en_destroy_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq **pcq) +{ + struct mlx4_en_dev *mdev = priv->mdev; + struct mlx4_en_cq *cq = *pcq; + + mlx4_en_unmap_buffer(&cq->wqres.buf); + mlx4_free_hwq_res(mdev->dev, &cq->wqres, cq->buf_size); + if (priv->mdev->dev->caps.comp_pool && cq->vector) { + mlx4_release_eq(priv->mdev->dev, cq->vector); + } + cq->vector = 0; + cq->buf_size = 0; + cq->buf = NULL; + kfree(cq); + *pcq = NULL; +} + +void mlx4_en_deactivate_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq) +{ + napi_disable(&cq->napi); + if (!cq->is_tx) { + napi_hash_del(&cq->napi); + synchronize_rcu(); + irq_set_affinity_hint(cq->mcq.irq, NULL); + } + netif_napi_del(&cq->napi); + + mlx4_cq_free(priv->mdev->dev, &cq->mcq); +} + +/* Set rx cq moderation parameters */ +int mlx4_en_set_cq_moder(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq) +{ + return mlx4_cq_modify(priv->mdev->dev, &cq->mcq, + cq->moder_cnt, cq->moder_time); +} + +int mlx4_en_arm_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq) +{ + mlx4_cq_arm(&cq->mcq, MLX4_CQ_DB_REQ_NOT, priv->mdev->uar_map, + &priv->mdev->uar_lock); + + return 0; +} + + diff --git a/drivers/net/ethernet/mellanox/mlx4/en_dcb_nl.c b/drivers/net/ethernet/mellanox/mlx4/en_dcb_nl.c new file mode 100644 index 0000000..c95ca25 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4/en_dcb_nl.c @@ -0,0 +1,263 @@ +/* + * Copyright (c) 2011 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include +#include + +#include "mlx4_en.h" + +static int mlx4_en_dcbnl_ieee_getets(struct net_device *dev, + struct ieee_ets *ets) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + struct ieee_ets *my_ets = &priv->ets; + + /* No IEEE PFC settings available */ + if (!my_ets) + return -EINVAL; + + ets->ets_cap = IEEE_8021QAZ_MAX_TCS; + ets->cbs = my_ets->cbs; + memcpy(ets->tc_tx_bw, my_ets->tc_tx_bw, sizeof(ets->tc_tx_bw)); + memcpy(ets->tc_tsa, my_ets->tc_tsa, sizeof(ets->tc_tsa)); + memcpy(ets->prio_tc, my_ets->prio_tc, sizeof(ets->prio_tc)); + + return 0; +} + +static int mlx4_en_ets_validate(struct mlx4_en_priv *priv, struct ieee_ets *ets) +{ + int i; + int total_ets_bw = 0; + int has_ets_tc = 0; + + for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) { + if (ets->prio_tc[i] >= MLX4_EN_NUM_UP) { + en_err(priv, "Bad priority in UP <=> TC mapping. TC: %d, UP: %d\n", + i, ets->prio_tc[i]); + return -EINVAL; + } + + switch (ets->tc_tsa[i]) { + case IEEE_8021QAZ_TSA_STRICT: + break; + case IEEE_8021QAZ_TSA_ETS: + has_ets_tc = 1; + total_ets_bw += ets->tc_tx_bw[i]; + break; + default: + en_err(priv, "TC[%d]: Not supported TSA: %d\n", + i, ets->tc_tsa[i]); + return -ENOTSUPP; + } + } + + if (has_ets_tc && total_ets_bw != MLX4_EN_BW_MAX) { + en_err(priv, "Bad ETS BW sum: %d. Should be exactly 100%%\n", + total_ets_bw); + return -EINVAL; + } + + return 0; +} + +static int mlx4_en_config_port_scheduler(struct mlx4_en_priv *priv, + struct ieee_ets *ets, u16 *ratelimit) +{ + struct mlx4_en_dev *mdev = priv->mdev; + int num_strict = 0; + int i; + __u8 tc_tx_bw[IEEE_8021QAZ_MAX_TCS] = { 0 }; + __u8 pg[IEEE_8021QAZ_MAX_TCS] = { 0 }; + + ets = ets ?: &priv->ets; + ratelimit = ratelimit ?: priv->maxrate; + + /* higher TC means higher priority => lower pg */ + for (i = IEEE_8021QAZ_MAX_TCS - 1; i >= 0; i--) { + switch (ets->tc_tsa[i]) { + case IEEE_8021QAZ_TSA_STRICT: + pg[i] = num_strict++; + tc_tx_bw[i] = MLX4_EN_BW_MAX; + break; + case IEEE_8021QAZ_TSA_ETS: + pg[i] = MLX4_EN_TC_ETS; + tc_tx_bw[i] = ets->tc_tx_bw[i] ?: MLX4_EN_BW_MIN; + break; + } + } + + return mlx4_SET_PORT_SCHEDULER(mdev->dev, priv->port, tc_tx_bw, pg, + ratelimit); +} + +static int +mlx4_en_dcbnl_ieee_setets(struct net_device *dev, struct ieee_ets *ets) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + struct mlx4_en_dev *mdev = priv->mdev; + int err; + + err = mlx4_en_ets_validate(priv, ets); + if (err) + return err; + + err = mlx4_SET_PORT_PRIO2TC(mdev->dev, priv->port, ets->prio_tc); + if (err) + return err; + + err = mlx4_en_config_port_scheduler(priv, ets, NULL); + if (err) + return err; + + memcpy(&priv->ets, ets, sizeof(priv->ets)); + + return 0; +} + +static int mlx4_en_dcbnl_ieee_getpfc(struct net_device *dev, + struct ieee_pfc *pfc) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + + pfc->pfc_cap = IEEE_8021QAZ_MAX_TCS; + pfc->pfc_en = priv->prof->tx_ppp; + + return 0; +} + +static int mlx4_en_dcbnl_ieee_setpfc(struct net_device *dev, + struct ieee_pfc *pfc) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + struct mlx4_en_port_profile *prof = priv->prof; + struct mlx4_en_dev *mdev = priv->mdev; + int err; + + en_dbg(DRV, priv, "cap: 0x%x en: 0x%x mbc: 0x%x delay: %d\n", + pfc->pfc_cap, + pfc->pfc_en, + pfc->mbc, + pfc->delay); + + prof->rx_pause = !pfc->pfc_en; + prof->tx_pause = !pfc->pfc_en; + prof->rx_ppp = pfc->pfc_en; + prof->tx_ppp = pfc->pfc_en; + + err = mlx4_SET_PORT_general(mdev->dev, priv->port, + priv->rx_skb_size + ETH_FCS_LEN, + prof->tx_pause, + prof->tx_ppp, + prof->rx_pause, + prof->rx_ppp); + if (err) + en_err(priv, "Failed setting pause params\n"); + + return err; +} + +static u8 mlx4_en_dcbnl_getdcbx(struct net_device *dev) +{ + return DCB_CAP_DCBX_HOST | DCB_CAP_DCBX_VER_IEEE; +} + +static u8 mlx4_en_dcbnl_setdcbx(struct net_device *dev, u8 mode) +{ + if ((mode & DCB_CAP_DCBX_LLD_MANAGED) || + (mode & DCB_CAP_DCBX_VER_CEE) || + !(mode & DCB_CAP_DCBX_VER_IEEE) || + !(mode & DCB_CAP_DCBX_HOST)) + return 1; + + return 0; +} + +#define MLX4_RATELIMIT_UNITS_IN_KB 100000 /* rate-limit HW unit in Kbps */ +static int mlx4_en_dcbnl_ieee_getmaxrate(struct net_device *dev, + struct ieee_maxrate *maxrate) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + int i; + + for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) + maxrate->tc_maxrate[i] = + priv->maxrate[i] * MLX4_RATELIMIT_UNITS_IN_KB; + + return 0; +} + +static int mlx4_en_dcbnl_ieee_setmaxrate(struct net_device *dev, + struct ieee_maxrate *maxrate) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + u16 tmp[IEEE_8021QAZ_MAX_TCS]; + int i, err; + + for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) { + /* Convert from Kbps into HW units, rounding result up. + * Setting to 0, means unlimited BW. + */ + tmp[i] = div_u64(maxrate->tc_maxrate[i] + + MLX4_RATELIMIT_UNITS_IN_KB - 1, + MLX4_RATELIMIT_UNITS_IN_KB); + } + + err = mlx4_en_config_port_scheduler(priv, NULL, tmp); + if (err) + return err; + + memcpy(priv->maxrate, tmp, sizeof(priv->maxrate)); + + return 0; +} + +const struct dcbnl_rtnl_ops mlx4_en_dcbnl_ops = { + .ieee_getets = mlx4_en_dcbnl_ieee_getets, + .ieee_setets = mlx4_en_dcbnl_ieee_setets, + .ieee_getmaxrate = mlx4_en_dcbnl_ieee_getmaxrate, + .ieee_setmaxrate = mlx4_en_dcbnl_ieee_setmaxrate, + .ieee_getpfc = mlx4_en_dcbnl_ieee_getpfc, + .ieee_setpfc = mlx4_en_dcbnl_ieee_setpfc, + + .getdcbx = mlx4_en_dcbnl_getdcbx, + .setdcbx = mlx4_en_dcbnl_setdcbx, +}; + +const struct dcbnl_rtnl_ops mlx4_en_dcbnl_pfc_ops = { + .ieee_getpfc = mlx4_en_dcbnl_ieee_getpfc, + .ieee_setpfc = mlx4_en_dcbnl_ieee_setpfc, + + .getdcbx = mlx4_en_dcbnl_getdcbx, + .setdcbx = mlx4_en_dcbnl_setdcbx, +}; diff --git a/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c new file mode 100644 index 0000000..ae83da9 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c @@ -0,0 +1,1349 @@ +/* + * Copyright (c) 2007 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include +#include +#include +#include +#include +#include + +#include "mlx4_en.h" +#include "en_port.h" + +#define EN_ETHTOOL_QP_ATTACH (1ull << 63) +#define EN_ETHTOOL_SHORT_MASK cpu_to_be16(0xffff) +#define EN_ETHTOOL_WORD_MASK cpu_to_be32(0xffffffff) + +static int mlx4_en_moderation_update(struct mlx4_en_priv *priv) +{ + int i; + int err = 0; + + for (i = 0; i < priv->tx_ring_num; i++) { + priv->tx_cq[i]->moder_cnt = priv->tx_frames; + priv->tx_cq[i]->moder_time = priv->tx_usecs; + if (priv->port_up) { + err = mlx4_en_set_cq_moder(priv, priv->tx_cq[i]); + if (err) + return err; + } + } + + if (priv->adaptive_rx_coal) + return 0; + + for (i = 0; i < priv->rx_ring_num; i++) { + priv->rx_cq[i]->moder_cnt = priv->rx_frames; + priv->rx_cq[i]->moder_time = priv->rx_usecs; + priv->last_moder_time[i] = MLX4_EN_AUTO_CONF; + if (priv->port_up) { + err = mlx4_en_set_cq_moder(priv, priv->rx_cq[i]); + if (err) + return err; + } + } + + return err; +} + +static void +mlx4_en_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *drvinfo) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + struct mlx4_en_dev *mdev = priv->mdev; + + strlcpy(drvinfo->driver, DRV_NAME, sizeof(drvinfo->driver)); + strlcpy(drvinfo->version, DRV_VERSION " (" DRV_RELDATE ")", + sizeof(drvinfo->version)); + snprintf(drvinfo->fw_version, sizeof(drvinfo->fw_version), + "%d.%d.%d", + (u16) (mdev->dev->caps.fw_ver >> 32), + (u16) ((mdev->dev->caps.fw_ver >> 16) & 0xffff), + (u16) (mdev->dev->caps.fw_ver & 0xffff)); + strlcpy(drvinfo->bus_info, pci_name(mdev->dev->pdev), + sizeof(drvinfo->bus_info)); + drvinfo->n_stats = 0; + drvinfo->regdump_len = 0; + drvinfo->eedump_len = 0; +} + +static const char mlx4_en_priv_flags[][ETH_GSTRING_LEN] = { + "blueflame", +}; + +static const char main_strings[][ETH_GSTRING_LEN] = { + "rx_packets", "tx_packets", "rx_bytes", "tx_bytes", "rx_errors", + "tx_errors", "rx_dropped", "tx_dropped", "multicast", "collisions", + "rx_length_errors", "rx_over_errors", "rx_crc_errors", + "rx_frame_errors", "rx_fifo_errors", "rx_missed_errors", + "tx_aborted_errors", "tx_carrier_errors", "tx_fifo_errors", + "tx_heartbeat_errors", "tx_window_errors", + + /* port statistics */ + "tso_packets", + "xmit_more", + "queue_stopped", "wake_queue", "tx_timeout", "rx_alloc_failed", + "rx_csum_good", "rx_csum_none", "tx_chksum_offload", + + /* packet statistics */ + "broadcast", "rx_prio_0", "rx_prio_1", "rx_prio_2", "rx_prio_3", + "rx_prio_4", "rx_prio_5", "rx_prio_6", "rx_prio_7", "tx_prio_0", + "tx_prio_1", "tx_prio_2", "tx_prio_3", "tx_prio_4", "tx_prio_5", + "tx_prio_6", "tx_prio_7", +}; +#define NUM_MAIN_STATS 21 +#define NUM_ALL_STATS (NUM_MAIN_STATS + NUM_PORT_STATS + NUM_PKT_STATS + NUM_PERF_STATS) + +static const char mlx4_en_test_names[][ETH_GSTRING_LEN]= { + "Interrupt Test", + "Link Test", + "Speed Test", + "Register Test", + "Loopback Test", +}; + +static u32 mlx4_en_get_msglevel(struct net_device *dev) +{ + return ((struct mlx4_en_priv *) netdev_priv(dev))->msg_enable; +} + +static void mlx4_en_set_msglevel(struct net_device *dev, u32 val) +{ + ((struct mlx4_en_priv *) netdev_priv(dev))->msg_enable = val; +} + +static void mlx4_en_get_wol(struct net_device *netdev, + struct ethtool_wolinfo *wol) +{ + struct mlx4_en_priv *priv = netdev_priv(netdev); + int err = 0; + u64 config = 0; + u64 mask; + + if ((priv->port < 1) || (priv->port > 2)) { + en_err(priv, "Failed to get WoL information\n"); + return; + } + + mask = (priv->port == 1) ? MLX4_DEV_CAP_FLAG_WOL_PORT1 : + MLX4_DEV_CAP_FLAG_WOL_PORT2; + + if (!(priv->mdev->dev->caps.flags & mask)) { + wol->supported = 0; + wol->wolopts = 0; + return; + } + + err = mlx4_wol_read(priv->mdev->dev, &config, priv->port); + if (err) { + en_err(priv, "Failed to get WoL information\n"); + return; + } + + if (config & MLX4_EN_WOL_MAGIC) + wol->supported = WAKE_MAGIC; + else + wol->supported = 0; + + if (config & MLX4_EN_WOL_ENABLED) + wol->wolopts = WAKE_MAGIC; + else + wol->wolopts = 0; +} + +static int mlx4_en_set_wol(struct net_device *netdev, + struct ethtool_wolinfo *wol) +{ + struct mlx4_en_priv *priv = netdev_priv(netdev); + u64 config = 0; + int err = 0; + u64 mask; + + if ((priv->port < 1) || (priv->port > 2)) + return -EOPNOTSUPP; + + mask = (priv->port == 1) ? MLX4_DEV_CAP_FLAG_WOL_PORT1 : + MLX4_DEV_CAP_FLAG_WOL_PORT2; + + if (!(priv->mdev->dev->caps.flags & mask)) + return -EOPNOTSUPP; + + if (wol->supported & ~WAKE_MAGIC) + return -EINVAL; + + err = mlx4_wol_read(priv->mdev->dev, &config, priv->port); + if (err) { + en_err(priv, "Failed to get WoL info, unable to modify\n"); + return err; + } + + if (wol->wolopts & WAKE_MAGIC) { + config |= MLX4_EN_WOL_DO_MODIFY | MLX4_EN_WOL_ENABLED | + MLX4_EN_WOL_MAGIC; + } else { + config &= ~(MLX4_EN_WOL_ENABLED | MLX4_EN_WOL_MAGIC); + config |= MLX4_EN_WOL_DO_MODIFY; + } + + err = mlx4_wol_write(priv->mdev->dev, config, priv->port); + if (err) + en_err(priv, "Failed to set WoL information\n"); + + return err; +} + +static int mlx4_en_get_sset_count(struct net_device *dev, int sset) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + int bit_count = hweight64(priv->stats_bitmap); + + switch (sset) { + case ETH_SS_STATS: + return (priv->stats_bitmap ? bit_count : NUM_ALL_STATS) + + (priv->tx_ring_num * 2) + +#ifdef CONFIG_NET_RX_BUSY_POLL + (priv->rx_ring_num * 5); +#else + (priv->rx_ring_num * 2); +#endif + case ETH_SS_TEST: + return MLX4_EN_NUM_SELF_TEST - !(priv->mdev->dev->caps.flags + & MLX4_DEV_CAP_FLAG_UC_LOOPBACK) * 2; + case ETH_SS_PRIV_FLAGS: + return ARRAY_SIZE(mlx4_en_priv_flags); + default: + return -EOPNOTSUPP; + } +} + +static void mlx4_en_get_ethtool_stats(struct net_device *dev, + struct ethtool_stats *stats, uint64_t *data) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + int index = 0; + int i, j = 0; + + spin_lock_bh(&priv->stats_lock); + + if (!(priv->stats_bitmap)) { + for (i = 0; i < NUM_MAIN_STATS; i++) + data[index++] = + ((unsigned long *) &priv->stats)[i]; + for (i = 0; i < NUM_PORT_STATS; i++) + data[index++] = + ((unsigned long *) &priv->port_stats)[i]; + for (i = 0; i < NUM_PKT_STATS; i++) + data[index++] = + ((unsigned long *) &priv->pkstats)[i]; + } else { + for (i = 0; i < NUM_MAIN_STATS; i++) { + if ((priv->stats_bitmap >> j) & 1) + data[index++] = + ((unsigned long *) &priv->stats)[i]; + j++; + } + for (i = 0; i < NUM_PORT_STATS; i++) { + if ((priv->stats_bitmap >> j) & 1) + data[index++] = + ((unsigned long *) &priv->port_stats)[i]; + j++; + } + } + for (i = 0; i < priv->tx_ring_num; i++) { + data[index++] = priv->tx_ring[i]->packets; + data[index++] = priv->tx_ring[i]->bytes; + } + for (i = 0; i < priv->rx_ring_num; i++) { + data[index++] = priv->rx_ring[i]->packets; + data[index++] = priv->rx_ring[i]->bytes; +#ifdef CONFIG_NET_RX_BUSY_POLL + data[index++] = priv->rx_ring[i]->yields; + data[index++] = priv->rx_ring[i]->misses; + data[index++] = priv->rx_ring[i]->cleaned; +#endif + } + spin_unlock_bh(&priv->stats_lock); + +} + +static void mlx4_en_self_test(struct net_device *dev, + struct ethtool_test *etest, u64 *buf) +{ + mlx4_en_ex_selftest(dev, &etest->flags, buf); +} + +static void mlx4_en_get_strings(struct net_device *dev, + uint32_t stringset, uint8_t *data) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + int index = 0; + int i; + + switch (stringset) { + case ETH_SS_TEST: + for (i = 0; i < MLX4_EN_NUM_SELF_TEST - 2; i++) + strcpy(data + i * ETH_GSTRING_LEN, mlx4_en_test_names[i]); + if (priv->mdev->dev->caps.flags & MLX4_DEV_CAP_FLAG_UC_LOOPBACK) + for (; i < MLX4_EN_NUM_SELF_TEST; i++) + strcpy(data + i * ETH_GSTRING_LEN, mlx4_en_test_names[i]); + break; + + case ETH_SS_STATS: + /* Add main counters */ + if (!priv->stats_bitmap) { + for (i = 0; i < NUM_MAIN_STATS; i++) + strcpy(data + (index++) * ETH_GSTRING_LEN, + main_strings[i]); + for (i = 0; i < NUM_PORT_STATS; i++) + strcpy(data + (index++) * ETH_GSTRING_LEN, + main_strings[i + + NUM_MAIN_STATS]); + for (i = 0; i < NUM_PKT_STATS; i++) + strcpy(data + (index++) * ETH_GSTRING_LEN, + main_strings[i + + NUM_MAIN_STATS + + NUM_PORT_STATS]); + } else + for (i = 0; i < NUM_MAIN_STATS + NUM_PORT_STATS; i++) { + if ((priv->stats_bitmap >> i) & 1) { + strcpy(data + + (index++) * ETH_GSTRING_LEN, + main_strings[i]); + } + if (!(priv->stats_bitmap >> i)) + break; + } + for (i = 0; i < priv->tx_ring_num; i++) { + sprintf(data + (index++) * ETH_GSTRING_LEN, + "tx%d_packets", i); + sprintf(data + (index++) * ETH_GSTRING_LEN, + "tx%d_bytes", i); + } + for (i = 0; i < priv->rx_ring_num; i++) { + sprintf(data + (index++) * ETH_GSTRING_LEN, + "rx%d_packets", i); + sprintf(data + (index++) * ETH_GSTRING_LEN, + "rx%d_bytes", i); +#ifdef CONFIG_NET_RX_BUSY_POLL + sprintf(data + (index++) * ETH_GSTRING_LEN, + "rx%d_napi_yield", i); + sprintf(data + (index++) * ETH_GSTRING_LEN, + "rx%d_misses", i); + sprintf(data + (index++) * ETH_GSTRING_LEN, + "rx%d_cleaned", i); +#endif + } + break; + case ETH_SS_PRIV_FLAGS: + for (i = 0; i < ARRAY_SIZE(mlx4_en_priv_flags); i++) + strcpy(data + i * ETH_GSTRING_LEN, + mlx4_en_priv_flags[i]); + break; + + } +} + +static int mlx4_en_get_settings(struct net_device *dev, struct ethtool_cmd *cmd) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + int trans_type; + + cmd->autoneg = AUTONEG_DISABLE; + cmd->supported = SUPPORTED_10000baseT_Full; + cmd->advertising = ADVERTISED_10000baseT_Full; + + if (mlx4_en_QUERY_PORT(priv->mdev, priv->port)) + return -ENOMEM; + + trans_type = priv->port_state.transciver; + if (netif_carrier_ok(dev)) { + ethtool_cmd_speed_set(cmd, priv->port_state.link_speed); + cmd->duplex = DUPLEX_FULL; + } else { + ethtool_cmd_speed_set(cmd, SPEED_UNKNOWN); + cmd->duplex = DUPLEX_UNKNOWN; + } + + if (trans_type > 0 && trans_type <= 0xC) { + cmd->port = PORT_FIBRE; + cmd->transceiver = XCVR_EXTERNAL; + cmd->supported |= SUPPORTED_FIBRE; + cmd->advertising |= ADVERTISED_FIBRE; + } else if (trans_type == 0x80 || trans_type == 0) { + cmd->port = PORT_TP; + cmd->transceiver = XCVR_INTERNAL; + cmd->supported |= SUPPORTED_TP; + cmd->advertising |= ADVERTISED_TP; + } else { + cmd->port = -1; + cmd->transceiver = -1; + } + return 0; +} + +static int mlx4_en_set_settings(struct net_device *dev, struct ethtool_cmd *cmd) +{ + if ((cmd->autoneg == AUTONEG_ENABLE) || + (ethtool_cmd_speed(cmd) != SPEED_10000) || + (cmd->duplex != DUPLEX_FULL)) + return -EINVAL; + + /* Nothing to change */ + return 0; +} + +static int mlx4_en_get_coalesce(struct net_device *dev, + struct ethtool_coalesce *coal) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + + coal->tx_coalesce_usecs = priv->tx_usecs; + coal->tx_max_coalesced_frames = priv->tx_frames; + coal->tx_max_coalesced_frames_irq = priv->tx_work_limit; + + coal->rx_coalesce_usecs = priv->rx_usecs; + coal->rx_max_coalesced_frames = priv->rx_frames; + + coal->pkt_rate_low = priv->pkt_rate_low; + coal->rx_coalesce_usecs_low = priv->rx_usecs_low; + coal->pkt_rate_high = priv->pkt_rate_high; + coal->rx_coalesce_usecs_high = priv->rx_usecs_high; + coal->rate_sample_interval = priv->sample_interval; + coal->use_adaptive_rx_coalesce = priv->adaptive_rx_coal; + + return 0; +} + +static int mlx4_en_set_coalesce(struct net_device *dev, + struct ethtool_coalesce *coal) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + + if (!coal->tx_max_coalesced_frames_irq) + return -EINVAL; + + priv->rx_frames = (coal->rx_max_coalesced_frames == + MLX4_EN_AUTO_CONF) ? + MLX4_EN_RX_COAL_TARGET : + coal->rx_max_coalesced_frames; + priv->rx_usecs = (coal->rx_coalesce_usecs == + MLX4_EN_AUTO_CONF) ? + MLX4_EN_RX_COAL_TIME : + coal->rx_coalesce_usecs; + + /* Setting TX coalescing parameters */ + if (coal->tx_coalesce_usecs != priv->tx_usecs || + coal->tx_max_coalesced_frames != priv->tx_frames) { + priv->tx_usecs = coal->tx_coalesce_usecs; + priv->tx_frames = coal->tx_max_coalesced_frames; + } + + /* Set adaptive coalescing params */ + priv->pkt_rate_low = coal->pkt_rate_low; + priv->rx_usecs_low = coal->rx_coalesce_usecs_low; + priv->pkt_rate_high = coal->pkt_rate_high; + priv->rx_usecs_high = coal->rx_coalesce_usecs_high; + priv->sample_interval = coal->rate_sample_interval; + priv->adaptive_rx_coal = coal->use_adaptive_rx_coalesce; + priv->tx_work_limit = coal->tx_max_coalesced_frames_irq; + + return mlx4_en_moderation_update(priv); +} + +static int mlx4_en_set_pauseparam(struct net_device *dev, + struct ethtool_pauseparam *pause) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + struct mlx4_en_dev *mdev = priv->mdev; + int err; + + if (pause->autoneg) + return -EINVAL; + + priv->prof->tx_pause = pause->tx_pause != 0; + priv->prof->rx_pause = pause->rx_pause != 0; + err = mlx4_SET_PORT_general(mdev->dev, priv->port, + priv->rx_skb_size + ETH_FCS_LEN, + priv->prof->tx_pause, + priv->prof->tx_ppp, + priv->prof->rx_pause, + priv->prof->rx_ppp); + if (err) + en_err(priv, "Failed setting pause params\n"); + + return err; +} + +static void mlx4_en_get_pauseparam(struct net_device *dev, + struct ethtool_pauseparam *pause) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + + pause->tx_pause = priv->prof->tx_pause; + pause->rx_pause = priv->prof->rx_pause; +} + +static int mlx4_en_set_ringparam(struct net_device *dev, + struct ethtool_ringparam *param) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + struct mlx4_en_dev *mdev = priv->mdev; + u32 rx_size, tx_size; + int port_up = 0; + int err = 0; + + if (param->rx_jumbo_pending || param->rx_mini_pending) + return -EINVAL; + + rx_size = roundup_pow_of_two(param->rx_pending); + rx_size = max_t(u32, rx_size, MLX4_EN_MIN_RX_SIZE); + rx_size = min_t(u32, rx_size, MLX4_EN_MAX_RX_SIZE); + tx_size = roundup_pow_of_two(param->tx_pending); + tx_size = max_t(u32, tx_size, MLX4_EN_MIN_TX_SIZE); + tx_size = min_t(u32, tx_size, MLX4_EN_MAX_TX_SIZE); + + if (rx_size == (priv->port_up ? priv->rx_ring[0]->actual_size : + priv->rx_ring[0]->size) && + tx_size == priv->tx_ring[0]->size) + return 0; + + mutex_lock(&mdev->state_lock); + if (priv->port_up) { + port_up = 1; + mlx4_en_stop_port(dev, 1); + } + + mlx4_en_free_resources(priv); + + priv->prof->tx_ring_size = tx_size; + priv->prof->rx_ring_size = rx_size; + + err = mlx4_en_alloc_resources(priv); + if (err) { + en_err(priv, "Failed reallocating port resources\n"); + goto out; + } + if (port_up) { + err = mlx4_en_start_port(dev); + if (err) + en_err(priv, "Failed starting port\n"); + } + + err = mlx4_en_moderation_update(priv); + +out: + mutex_unlock(&mdev->state_lock); + return err; +} + +static void mlx4_en_get_ringparam(struct net_device *dev, + struct ethtool_ringparam *param) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + + memset(param, 0, sizeof(*param)); + param->rx_max_pending = MLX4_EN_MAX_RX_SIZE; + param->tx_max_pending = MLX4_EN_MAX_TX_SIZE; + param->rx_pending = priv->port_up ? + priv->rx_ring[0]->actual_size : priv->rx_ring[0]->size; + param->tx_pending = priv->tx_ring[0]->size; +} + +static u32 mlx4_en_get_rxfh_indir_size(struct net_device *dev) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + + return priv->rx_ring_num; +} + +static int mlx4_en_get_rxfh(struct net_device *dev, u32 *ring_index, u8 *key) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + struct mlx4_en_rss_map *rss_map = &priv->rss_map; + int rss_rings; + size_t n = priv->rx_ring_num; + int err = 0; + + rss_rings = priv->prof->rss_rings ?: priv->rx_ring_num; + + while (n--) { + ring_index[n] = rss_map->qps[n % rss_rings].qpn - + rss_map->base_qpn; + } + + return err; +} + +static int mlx4_en_set_rxfh(struct net_device *dev, const u32 *ring_index, + const u8 *key) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + struct mlx4_en_dev *mdev = priv->mdev; + int port_up = 0; + int err = 0; + int i; + int rss_rings = 0; + + /* Calculate RSS table size and make sure flows are spread evenly + * between rings + */ + for (i = 0; i < priv->rx_ring_num; i++) { + if (i > 0 && !ring_index[i] && !rss_rings) + rss_rings = i; + + if (ring_index[i] != (i % (rss_rings ?: priv->rx_ring_num))) + return -EINVAL; + } + + if (!rss_rings) + rss_rings = priv->rx_ring_num; + + /* RSS table size must be an order of 2 */ + if (!is_power_of_2(rss_rings)) + return -EINVAL; + + mutex_lock(&mdev->state_lock); + if (priv->port_up) { + port_up = 1; + mlx4_en_stop_port(dev, 1); + } + + priv->prof->rss_rings = rss_rings; + + if (port_up) { + err = mlx4_en_start_port(dev); + if (err) + en_err(priv, "Failed starting port\n"); + } + + mutex_unlock(&mdev->state_lock); + return err; +} + +#define all_zeros_or_all_ones(field) \ + ((field) == 0 || (field) == (__force typeof(field))-1) + +static int mlx4_en_validate_flow(struct net_device *dev, + struct ethtool_rxnfc *cmd) +{ + struct ethtool_usrip4_spec *l3_mask; + struct ethtool_tcpip4_spec *l4_mask; + struct ethhdr *eth_mask; + + if (cmd->fs.location >= MAX_NUM_OF_FS_RULES) + return -EINVAL; + + if (cmd->fs.flow_type & FLOW_MAC_EXT) { + /* dest mac mask must be ff:ff:ff:ff:ff:ff */ + if (!is_broadcast_ether_addr(cmd->fs.m_ext.h_dest)) + return -EINVAL; + } + + switch (cmd->fs.flow_type & ~(FLOW_EXT | FLOW_MAC_EXT)) { + case TCP_V4_FLOW: + case UDP_V4_FLOW: + if (cmd->fs.m_u.tcp_ip4_spec.tos) + return -EINVAL; + l4_mask = &cmd->fs.m_u.tcp_ip4_spec; + /* don't allow mask which isn't all 0 or 1 */ + if (!all_zeros_or_all_ones(l4_mask->ip4src) || + !all_zeros_or_all_ones(l4_mask->ip4dst) || + !all_zeros_or_all_ones(l4_mask->psrc) || + !all_zeros_or_all_ones(l4_mask->pdst)) + return -EINVAL; + break; + case IP_USER_FLOW: + l3_mask = &cmd->fs.m_u.usr_ip4_spec; + if (l3_mask->l4_4_bytes || l3_mask->tos || l3_mask->proto || + cmd->fs.h_u.usr_ip4_spec.ip_ver != ETH_RX_NFC_IP4 || + (!l3_mask->ip4src && !l3_mask->ip4dst) || + !all_zeros_or_all_ones(l3_mask->ip4src) || + !all_zeros_or_all_ones(l3_mask->ip4dst)) + return -EINVAL; + break; + case ETHER_FLOW: + eth_mask = &cmd->fs.m_u.ether_spec; + /* source mac mask must not be set */ + if (!is_zero_ether_addr(eth_mask->h_source)) + return -EINVAL; + + /* dest mac mask must be ff:ff:ff:ff:ff:ff */ + if (!is_broadcast_ether_addr(eth_mask->h_dest)) + return -EINVAL; + + if (!all_zeros_or_all_ones(eth_mask->h_proto)) + return -EINVAL; + break; + default: + return -EINVAL; + } + + if ((cmd->fs.flow_type & FLOW_EXT)) { + if (cmd->fs.m_ext.vlan_etype || + !((cmd->fs.m_ext.vlan_tci & cpu_to_be16(VLAN_VID_MASK)) == + 0 || + (cmd->fs.m_ext.vlan_tci & cpu_to_be16(VLAN_VID_MASK)) == + cpu_to_be16(VLAN_VID_MASK))) + return -EINVAL; + + if (cmd->fs.m_ext.vlan_tci) { + if (be16_to_cpu(cmd->fs.h_ext.vlan_tci) >= VLAN_N_VID) + return -EINVAL; + + } + } + + return 0; +} + +static int mlx4_en_ethtool_add_mac_rule(struct ethtool_rxnfc *cmd, + struct list_head *rule_list_h, + struct mlx4_spec_list *spec_l2, + unsigned char *mac) +{ + int err = 0; + __be64 mac_msk = cpu_to_be64(MLX4_MAC_MASK << 16); + + spec_l2->id = MLX4_NET_TRANS_RULE_ID_ETH; + memcpy(spec_l2->eth.dst_mac_msk, &mac_msk, ETH_ALEN); + memcpy(spec_l2->eth.dst_mac, mac, ETH_ALEN); + + if ((cmd->fs.flow_type & FLOW_EXT) && + (cmd->fs.m_ext.vlan_tci & cpu_to_be16(VLAN_VID_MASK))) { + spec_l2->eth.vlan_id = cmd->fs.h_ext.vlan_tci; + spec_l2->eth.vlan_id_msk = cpu_to_be16(VLAN_VID_MASK); + } + + list_add_tail(&spec_l2->list, rule_list_h); + + return err; +} + +static int mlx4_en_ethtool_add_mac_rule_by_ipv4(struct mlx4_en_priv *priv, + struct ethtool_rxnfc *cmd, + struct list_head *rule_list_h, + struct mlx4_spec_list *spec_l2, + __be32 ipv4_dst) +{ +#ifdef CONFIG_INET + unsigned char mac[ETH_ALEN]; + + if (!ipv4_is_multicast(ipv4_dst)) { + if (cmd->fs.flow_type & FLOW_MAC_EXT) + memcpy(&mac, cmd->fs.h_ext.h_dest, ETH_ALEN); + else + memcpy(&mac, priv->dev->dev_addr, ETH_ALEN); + } else { + ip_eth_mc_map(ipv4_dst, mac); + } + + return mlx4_en_ethtool_add_mac_rule(cmd, rule_list_h, spec_l2, &mac[0]); +#else + return -EINVAL; +#endif +} + +static int add_ip_rule(struct mlx4_en_priv *priv, + struct ethtool_rxnfc *cmd, + struct list_head *list_h) +{ + int err; + struct mlx4_spec_list *spec_l2 = NULL; + struct mlx4_spec_list *spec_l3 = NULL; + struct ethtool_usrip4_spec *l3_mask = &cmd->fs.m_u.usr_ip4_spec; + + spec_l3 = kzalloc(sizeof(*spec_l3), GFP_KERNEL); + spec_l2 = kzalloc(sizeof(*spec_l2), GFP_KERNEL); + if (!spec_l2 || !spec_l3) { + err = -ENOMEM; + goto free_spec; + } + + err = mlx4_en_ethtool_add_mac_rule_by_ipv4(priv, cmd, list_h, spec_l2, + cmd->fs.h_u. + usr_ip4_spec.ip4dst); + if (err) + goto free_spec; + spec_l3->id = MLX4_NET_TRANS_RULE_ID_IPV4; + spec_l3->ipv4.src_ip = cmd->fs.h_u.usr_ip4_spec.ip4src; + if (l3_mask->ip4src) + spec_l3->ipv4.src_ip_msk = EN_ETHTOOL_WORD_MASK; + spec_l3->ipv4.dst_ip = cmd->fs.h_u.usr_ip4_spec.ip4dst; + if (l3_mask->ip4dst) + spec_l3->ipv4.dst_ip_msk = EN_ETHTOOL_WORD_MASK; + list_add_tail(&spec_l3->list, list_h); + + return 0; + +free_spec: + kfree(spec_l2); + kfree(spec_l3); + return err; +} + +static int add_tcp_udp_rule(struct mlx4_en_priv *priv, + struct ethtool_rxnfc *cmd, + struct list_head *list_h, int proto) +{ + int err; + struct mlx4_spec_list *spec_l2 = NULL; + struct mlx4_spec_list *spec_l3 = NULL; + struct mlx4_spec_list *spec_l4 = NULL; + struct ethtool_tcpip4_spec *l4_mask = &cmd->fs.m_u.tcp_ip4_spec; + + spec_l2 = kzalloc(sizeof(*spec_l2), GFP_KERNEL); + spec_l3 = kzalloc(sizeof(*spec_l3), GFP_KERNEL); + spec_l4 = kzalloc(sizeof(*spec_l4), GFP_KERNEL); + if (!spec_l2 || !spec_l3 || !spec_l4) { + err = -ENOMEM; + goto free_spec; + } + + spec_l3->id = MLX4_NET_TRANS_RULE_ID_IPV4; + + if (proto == TCP_V4_FLOW) { + err = mlx4_en_ethtool_add_mac_rule_by_ipv4(priv, cmd, list_h, + spec_l2, + cmd->fs.h_u. + tcp_ip4_spec.ip4dst); + if (err) + goto free_spec; + spec_l4->id = MLX4_NET_TRANS_RULE_ID_TCP; + spec_l3->ipv4.src_ip = cmd->fs.h_u.tcp_ip4_spec.ip4src; + spec_l3->ipv4.dst_ip = cmd->fs.h_u.tcp_ip4_spec.ip4dst; + spec_l4->tcp_udp.src_port = cmd->fs.h_u.tcp_ip4_spec.psrc; + spec_l4->tcp_udp.dst_port = cmd->fs.h_u.tcp_ip4_spec.pdst; + } else { + err = mlx4_en_ethtool_add_mac_rule_by_ipv4(priv, cmd, list_h, + spec_l2, + cmd->fs.h_u. + udp_ip4_spec.ip4dst); + if (err) + goto free_spec; + spec_l4->id = MLX4_NET_TRANS_RULE_ID_UDP; + spec_l3->ipv4.src_ip = cmd->fs.h_u.udp_ip4_spec.ip4src; + spec_l3->ipv4.dst_ip = cmd->fs.h_u.udp_ip4_spec.ip4dst; + spec_l4->tcp_udp.src_port = cmd->fs.h_u.udp_ip4_spec.psrc; + spec_l4->tcp_udp.dst_port = cmd->fs.h_u.udp_ip4_spec.pdst; + } + + if (l4_mask->ip4src) + spec_l3->ipv4.src_ip_msk = EN_ETHTOOL_WORD_MASK; + if (l4_mask->ip4dst) + spec_l3->ipv4.dst_ip_msk = EN_ETHTOOL_WORD_MASK; + + if (l4_mask->psrc) + spec_l4->tcp_udp.src_port_msk = EN_ETHTOOL_SHORT_MASK; + if (l4_mask->pdst) + spec_l4->tcp_udp.dst_port_msk = EN_ETHTOOL_SHORT_MASK; + + list_add_tail(&spec_l3->list, list_h); + list_add_tail(&spec_l4->list, list_h); + + return 0; + +free_spec: + kfree(spec_l2); + kfree(spec_l3); + kfree(spec_l4); + return err; +} + +static int mlx4_en_ethtool_to_net_trans_rule(struct net_device *dev, + struct ethtool_rxnfc *cmd, + struct list_head *rule_list_h) +{ + int err; + struct ethhdr *eth_spec; + struct mlx4_spec_list *spec_l2; + struct mlx4_en_priv *priv = netdev_priv(dev); + + err = mlx4_en_validate_flow(dev, cmd); + if (err) + return err; + + switch (cmd->fs.flow_type & ~(FLOW_EXT | FLOW_MAC_EXT)) { + case ETHER_FLOW: + spec_l2 = kzalloc(sizeof(*spec_l2), GFP_KERNEL); + if (!spec_l2) + return -ENOMEM; + + eth_spec = &cmd->fs.h_u.ether_spec; + mlx4_en_ethtool_add_mac_rule(cmd, rule_list_h, spec_l2, + ð_spec->h_dest[0]); + spec_l2->eth.ether_type = eth_spec->h_proto; + if (eth_spec->h_proto) + spec_l2->eth.ether_type_enable = 1; + break; + case IP_USER_FLOW: + err = add_ip_rule(priv, cmd, rule_list_h); + break; + case TCP_V4_FLOW: + err = add_tcp_udp_rule(priv, cmd, rule_list_h, TCP_V4_FLOW); + break; + case UDP_V4_FLOW: + err = add_tcp_udp_rule(priv, cmd, rule_list_h, UDP_V4_FLOW); + break; + } + + return err; +} + +static int mlx4_en_flow_replace(struct net_device *dev, + struct ethtool_rxnfc *cmd) +{ + int err; + struct mlx4_en_priv *priv = netdev_priv(dev); + struct ethtool_flow_id *loc_rule; + struct mlx4_spec_list *spec, *tmp_spec; + u32 qpn; + u64 reg_id; + + struct mlx4_net_trans_rule rule = { + .queue_mode = MLX4_NET_TRANS_Q_FIFO, + .exclusive = 0, + .allow_loopback = 1, + .promisc_mode = MLX4_FS_REGULAR, + }; + + rule.port = priv->port; + rule.priority = MLX4_DOMAIN_ETHTOOL | cmd->fs.location; + INIT_LIST_HEAD(&rule.list); + + /* Allow direct QP attaches if the EN_ETHTOOL_QP_ATTACH flag is set */ + if (cmd->fs.ring_cookie == RX_CLS_FLOW_DISC) + qpn = priv->drop_qp.qpn; + else if (cmd->fs.ring_cookie & EN_ETHTOOL_QP_ATTACH) { + qpn = cmd->fs.ring_cookie & (EN_ETHTOOL_QP_ATTACH - 1); + } else { + if (cmd->fs.ring_cookie >= priv->rx_ring_num) { + en_warn(priv, "rxnfc: RX ring (%llu) doesn't exist\n", + cmd->fs.ring_cookie); + return -EINVAL; + } + qpn = priv->rss_map.qps[cmd->fs.ring_cookie].qpn; + if (!qpn) { + en_warn(priv, "rxnfc: RX ring (%llu) is inactive\n", + cmd->fs.ring_cookie); + return -EINVAL; + } + } + rule.qpn = qpn; + err = mlx4_en_ethtool_to_net_trans_rule(dev, cmd, &rule.list); + if (err) + goto out_free_list; + + loc_rule = &priv->ethtool_rules[cmd->fs.location]; + if (loc_rule->id) { + err = mlx4_flow_detach(priv->mdev->dev, loc_rule->id); + if (err) { + en_err(priv, "Fail to detach network rule at location %d. registration id = %llx\n", + cmd->fs.location, loc_rule->id); + goto out_free_list; + } + loc_rule->id = 0; + memset(&loc_rule->flow_spec, 0, + sizeof(struct ethtool_rx_flow_spec)); + list_del(&loc_rule->list); + } + err = mlx4_flow_attach(priv->mdev->dev, &rule, ®_id); + if (err) { + en_err(priv, "Fail to attach network rule at location %d\n", + cmd->fs.location); + goto out_free_list; + } + loc_rule->id = reg_id; + memcpy(&loc_rule->flow_spec, &cmd->fs, + sizeof(struct ethtool_rx_flow_spec)); + list_add_tail(&loc_rule->list, &priv->ethtool_list); + +out_free_list: + list_for_each_entry_safe(spec, tmp_spec, &rule.list, list) { + list_del(&spec->list); + kfree(spec); + } + return err; +} + +static int mlx4_en_flow_detach(struct net_device *dev, + struct ethtool_rxnfc *cmd) +{ + int err = 0; + struct ethtool_flow_id *rule; + struct mlx4_en_priv *priv = netdev_priv(dev); + + if (cmd->fs.location >= MAX_NUM_OF_FS_RULES) + return -EINVAL; + + rule = &priv->ethtool_rules[cmd->fs.location]; + if (!rule->id) { + err = -ENOENT; + goto out; + } + + err = mlx4_flow_detach(priv->mdev->dev, rule->id); + if (err) { + en_err(priv, "Fail to detach network rule at location %d. registration id = 0x%llx\n", + cmd->fs.location, rule->id); + goto out; + } + rule->id = 0; + memset(&rule->flow_spec, 0, sizeof(struct ethtool_rx_flow_spec)); + list_del(&rule->list); +out: + return err; + +} + +static int mlx4_en_get_flow(struct net_device *dev, struct ethtool_rxnfc *cmd, + int loc) +{ + int err = 0; + struct ethtool_flow_id *rule; + struct mlx4_en_priv *priv = netdev_priv(dev); + + if (loc < 0 || loc >= MAX_NUM_OF_FS_RULES) + return -EINVAL; + + rule = &priv->ethtool_rules[loc]; + if (rule->id) + memcpy(&cmd->fs, &rule->flow_spec, + sizeof(struct ethtool_rx_flow_spec)); + else + err = -ENOENT; + + return err; +} + +static int mlx4_en_get_num_flows(struct mlx4_en_priv *priv) +{ + + int i, res = 0; + for (i = 0; i < MAX_NUM_OF_FS_RULES; i++) { + if (priv->ethtool_rules[i].id) + res++; + } + return res; + +} + +static int mlx4_en_get_rxnfc(struct net_device *dev, struct ethtool_rxnfc *cmd, + u32 *rule_locs) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + struct mlx4_en_dev *mdev = priv->mdev; + int err = 0; + int i = 0, priority = 0; + + if ((cmd->cmd == ETHTOOL_GRXCLSRLCNT || + cmd->cmd == ETHTOOL_GRXCLSRULE || + cmd->cmd == ETHTOOL_GRXCLSRLALL) && + (mdev->dev->caps.steering_mode != + MLX4_STEERING_MODE_DEVICE_MANAGED || !priv->port_up)) + return -EINVAL; + + switch (cmd->cmd) { + case ETHTOOL_GRXRINGS: + cmd->data = priv->rx_ring_num; + break; + case ETHTOOL_GRXCLSRLCNT: + cmd->rule_cnt = mlx4_en_get_num_flows(priv); + break; + case ETHTOOL_GRXCLSRULE: + err = mlx4_en_get_flow(dev, cmd, cmd->fs.location); + break; + case ETHTOOL_GRXCLSRLALL: + while ((!err || err == -ENOENT) && priority < cmd->rule_cnt) { + err = mlx4_en_get_flow(dev, cmd, i); + if (!err) + rule_locs[priority++] = i; + i++; + } + err = 0; + break; + default: + err = -EOPNOTSUPP; + break; + } + + return err; +} + +static int mlx4_en_set_rxnfc(struct net_device *dev, struct ethtool_rxnfc *cmd) +{ + int err = 0; + struct mlx4_en_priv *priv = netdev_priv(dev); + struct mlx4_en_dev *mdev = priv->mdev; + + if (mdev->dev->caps.steering_mode != + MLX4_STEERING_MODE_DEVICE_MANAGED || !priv->port_up) + return -EINVAL; + + switch (cmd->cmd) { + case ETHTOOL_SRXCLSRLINS: + err = mlx4_en_flow_replace(dev, cmd); + break; + case ETHTOOL_SRXCLSRLDEL: + err = mlx4_en_flow_detach(dev, cmd); + break; + default: + en_warn(priv, "Unsupported ethtool command. (%d)\n", cmd->cmd); + return -EINVAL; + } + + return err; +} + +static void mlx4_en_get_channels(struct net_device *dev, + struct ethtool_channels *channel) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + + memset(channel, 0, sizeof(*channel)); + + channel->max_rx = MAX_RX_RINGS; + channel->max_tx = MLX4_EN_MAX_TX_RING_P_UP; + + channel->rx_count = priv->rx_ring_num; + channel->tx_count = priv->tx_ring_num / MLX4_EN_NUM_UP; +} + +static int mlx4_en_set_channels(struct net_device *dev, + struct ethtool_channels *channel) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + struct mlx4_en_dev *mdev = priv->mdev; + int port_up = 0; + int err = 0; + + if (channel->other_count || channel->combined_count || + channel->tx_count > MLX4_EN_MAX_TX_RING_P_UP || + channel->rx_count > MAX_RX_RINGS || + !channel->tx_count || !channel->rx_count) + return -EINVAL; + + mutex_lock(&mdev->state_lock); + if (priv->port_up) { + port_up = 1; + mlx4_en_stop_port(dev, 1); + } + + mlx4_en_free_resources(priv); + + priv->num_tx_rings_p_up = channel->tx_count; + priv->tx_ring_num = channel->tx_count * MLX4_EN_NUM_UP; + priv->rx_ring_num = channel->rx_count; + + err = mlx4_en_alloc_resources(priv); + if (err) { + en_err(priv, "Failed reallocating port resources\n"); + goto out; + } + + netif_set_real_num_tx_queues(dev, priv->tx_ring_num); + netif_set_real_num_rx_queues(dev, priv->rx_ring_num); + + if (dev->num_tc) + mlx4_en_setup_tc(dev, MLX4_EN_NUM_UP); + + en_warn(priv, "Using %d TX rings\n", priv->tx_ring_num); + en_warn(priv, "Using %d RX rings\n", priv->rx_ring_num); + + if (port_up) { + err = mlx4_en_start_port(dev); + if (err) + en_err(priv, "Failed starting port\n"); + } + + err = mlx4_en_moderation_update(priv); + +out: + mutex_unlock(&mdev->state_lock); + return err; +} + +static int mlx4_en_get_ts_info(struct net_device *dev, + struct ethtool_ts_info *info) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + struct mlx4_en_dev *mdev = priv->mdev; + int ret; + + ret = ethtool_op_get_ts_info(dev, info); + if (ret) + return ret; + + if (mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_TS) { + info->so_timestamping |= + SOF_TIMESTAMPING_TX_HARDWARE | + SOF_TIMESTAMPING_RX_HARDWARE | + SOF_TIMESTAMPING_RAW_HARDWARE; + + info->tx_types = + (1 << HWTSTAMP_TX_OFF) | + (1 << HWTSTAMP_TX_ON); + + info->rx_filters = + (1 << HWTSTAMP_FILTER_NONE) | + (1 << HWTSTAMP_FILTER_ALL); + + if (mdev->ptp_clock) + info->phc_index = ptp_clock_index(mdev->ptp_clock); + } + + return ret; +} + +static int mlx4_en_set_priv_flags(struct net_device *dev, u32 flags) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + bool bf_enabled_new = !!(flags & MLX4_EN_PRIV_FLAGS_BLUEFLAME); + bool bf_enabled_old = !!(priv->pflags & MLX4_EN_PRIV_FLAGS_BLUEFLAME); + int i; + + if (bf_enabled_new == bf_enabled_old) + return 0; /* Nothing to do */ + + if (bf_enabled_new) { + bool bf_supported = true; + + for (i = 0; i < priv->tx_ring_num; i++) + bf_supported &= priv->tx_ring[i]->bf_alloced; + + if (!bf_supported) { + en_err(priv, "BlueFlame is not supported\n"); + return -EINVAL; + } + + priv->pflags |= MLX4_EN_PRIV_FLAGS_BLUEFLAME; + } else { + priv->pflags &= ~MLX4_EN_PRIV_FLAGS_BLUEFLAME; + } + + for (i = 0; i < priv->tx_ring_num; i++) + priv->tx_ring[i]->bf_enabled = bf_enabled_new; + + en_info(priv, "BlueFlame %s\n", + bf_enabled_new ? "Enabled" : "Disabled"); + + return 0; +} + +static u32 mlx4_en_get_priv_flags(struct net_device *dev) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + + return priv->pflags; +} + +static int mlx4_en_get_tunable(struct net_device *dev, + const struct ethtool_tunable *tuna, + void *data) +{ + const struct mlx4_en_priv *priv = netdev_priv(dev); + int ret = 0; + + switch (tuna->id) { + case ETHTOOL_TX_COPYBREAK: + *(u32 *)data = priv->prof->inline_thold; + break; + default: + ret = -EINVAL; + break; + } + + return ret; +} + +static int mlx4_en_set_tunable(struct net_device *dev, + const struct ethtool_tunable *tuna, + const void *data) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + int val, ret = 0; + + switch (tuna->id) { + case ETHTOOL_TX_COPYBREAK: + val = *(u32 *)data; + if (val < MIN_PKT_LEN || val > MAX_INLINE) + ret = -EINVAL; + else + priv->prof->inline_thold = val; + break; + default: + ret = -EINVAL; + break; + } + + return ret; +} + + +const struct ethtool_ops mlx4_en_ethtool_ops = { + .get_drvinfo = mlx4_en_get_drvinfo, + .get_settings = mlx4_en_get_settings, + .set_settings = mlx4_en_set_settings, + .get_link = ethtool_op_get_link, + .get_strings = mlx4_en_get_strings, + .get_sset_count = mlx4_en_get_sset_count, + .get_ethtool_stats = mlx4_en_get_ethtool_stats, + .self_test = mlx4_en_self_test, + .get_wol = mlx4_en_get_wol, + .set_wol = mlx4_en_set_wol, + .get_msglevel = mlx4_en_get_msglevel, + .set_msglevel = mlx4_en_set_msglevel, + .get_coalesce = mlx4_en_get_coalesce, + .set_coalesce = mlx4_en_set_coalesce, + .get_pauseparam = mlx4_en_get_pauseparam, + .set_pauseparam = mlx4_en_set_pauseparam, + .get_ringparam = mlx4_en_get_ringparam, + .set_ringparam = mlx4_en_set_ringparam, + .get_rxnfc = mlx4_en_get_rxnfc, + .set_rxnfc = mlx4_en_set_rxnfc, + .get_rxfh_indir_size = mlx4_en_get_rxfh_indir_size, + .get_rxfh = mlx4_en_get_rxfh, + .set_rxfh = mlx4_en_set_rxfh, + .get_channels = mlx4_en_get_channels, + .set_channels = mlx4_en_set_channels, + .get_ts_info = mlx4_en_get_ts_info, + .set_priv_flags = mlx4_en_set_priv_flags, + .get_priv_flags = mlx4_en_get_priv_flags, + .get_tunable = mlx4_en_get_tunable, + .set_tunable = mlx4_en_set_tunable, +}; + + + + + diff --git a/drivers/net/ethernet/mellanox/mlx4/en_main.c b/drivers/net/ethernet/mellanox/mlx4/en_main.c new file mode 100644 index 0000000..2091ae8 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4/en_main.c @@ -0,0 +1,368 @@ +/* + * Copyright (c) 2007 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "mlx4_en.h" + +MODULE_AUTHOR("Liran Liss, Yevgeny Petrilin"); +MODULE_DESCRIPTION("Mellanox ConnectX HCA Ethernet driver"); +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_VERSION(DRV_VERSION " ("DRV_RELDATE")"); + +static const char mlx4_en_version[] = + DRV_NAME ": Mellanox ConnectX HCA Ethernet driver v" + DRV_VERSION " (" DRV_RELDATE ")\n"; + +#define MLX4_EN_PARM_INT(X, def_val, desc) \ + static unsigned int X = def_val;\ + module_param(X , uint, 0444); \ + MODULE_PARM_DESC(X, desc); + + +/* + * Device scope module parameters + */ + +/* Enable RSS UDP traffic */ +MLX4_EN_PARM_INT(udp_rss, 1, + "Enable RSS for incoming UDP traffic or disabled (0)"); + +/* Priority pausing */ +MLX4_EN_PARM_INT(pfctx, 0, "Priority based Flow Control policy on TX[7:0]." + " Per priority bit mask"); +MLX4_EN_PARM_INT(pfcrx, 0, "Priority based Flow Control policy on RX[7:0]." + " Per priority bit mask"); + +MLX4_EN_PARM_INT(inline_thold, MAX_INLINE, + "Threshold for using inline data (range: 17-104, default: 104)"); + +#define MAX_PFC_TX 0xff +#define MAX_PFC_RX 0xff + +void en_print(const char *level, const struct mlx4_en_priv *priv, + const char *format, ...) +{ + va_list args; + struct va_format vaf; + + va_start(args, format); + + vaf.fmt = format; + vaf.va = &args; + if (priv->registered) + printk("%s%s: %s: %pV", + level, DRV_NAME, priv->dev->name, &vaf); + else + printk("%s%s: %s: Port %d: %pV", + level, DRV_NAME, dev_name(&priv->mdev->pdev->dev), + priv->port, &vaf); + va_end(args); +} + +void mlx4_en_update_loopback_state(struct net_device *dev, + netdev_features_t features) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + + priv->flags &= ~(MLX4_EN_FLAG_RX_FILTER_NEEDED| + MLX4_EN_FLAG_ENABLE_HW_LOOPBACK); + + /* Drop the packet if SRIOV is not enabled + * and not performing the selftest or flb disabled + */ + if (mlx4_is_mfunc(priv->mdev->dev) && + !(features & NETIF_F_LOOPBACK) && !priv->validate_loopback) + priv->flags |= MLX4_EN_FLAG_RX_FILTER_NEEDED; + + /* Set dmac in Tx WQE if we are in SRIOV mode or if loopback selftest + * is requested + */ + if (mlx4_is_mfunc(priv->mdev->dev) || priv->validate_loopback) + priv->flags |= MLX4_EN_FLAG_ENABLE_HW_LOOPBACK; +} + +static int mlx4_en_get_profile(struct mlx4_en_dev *mdev) +{ + struct mlx4_en_profile *params = &mdev->profile; + int i; + + params->udp_rss = udp_rss; + params->num_tx_rings_p_up = mlx4_low_memory_profile() ? + MLX4_EN_MIN_TX_RING_P_UP : + min_t(int, num_online_cpus(), MLX4_EN_MAX_TX_RING_P_UP); + + if (params->udp_rss && !(mdev->dev->caps.flags + & MLX4_DEV_CAP_FLAG_UDP_RSS)) { + mlx4_warn(mdev, "UDP RSS is not supported on this device\n"); + params->udp_rss = 0; + } + for (i = 1; i <= MLX4_MAX_PORTS; i++) { + params->prof[i].rx_pause = 1; + params->prof[i].rx_ppp = pfcrx; + params->prof[i].tx_pause = 1; + params->prof[i].tx_ppp = pfctx; + params->prof[i].tx_ring_size = MLX4_EN_DEF_TX_RING_SIZE; + params->prof[i].rx_ring_size = MLX4_EN_DEF_RX_RING_SIZE; + params->prof[i].tx_ring_num = params->num_tx_rings_p_up * + MLX4_EN_NUM_UP; + params->prof[i].rss_rings = 0; + params->prof[i].inline_thold = inline_thold; + } + + return 0; +} + +static void *mlx4_en_get_netdev(struct mlx4_dev *dev, void *ctx, u8 port) +{ + struct mlx4_en_dev *endev = ctx; + + return endev->pndev[port]; +} + +static void mlx4_en_event(struct mlx4_dev *dev, void *endev_ptr, + enum mlx4_dev_event event, unsigned long port) +{ + struct mlx4_en_dev *mdev = (struct mlx4_en_dev *) endev_ptr; + struct mlx4_en_priv *priv; + + switch (event) { + case MLX4_DEV_EVENT_PORT_UP: + case MLX4_DEV_EVENT_PORT_DOWN: + if (!mdev->pndev[port]) + return; + priv = netdev_priv(mdev->pndev[port]); + /* To prevent races, we poll the link state in a separate + task rather than changing it here */ + priv->link_state = event; + queue_work(mdev->workqueue, &priv->linkstate_task); + break; + + case MLX4_DEV_EVENT_CATASTROPHIC_ERROR: + mlx4_err(mdev, "Internal error detected, restarting device\n"); + break; + + case MLX4_DEV_EVENT_SLAVE_INIT: + case MLX4_DEV_EVENT_SLAVE_SHUTDOWN: + break; + default: + if (port < 1 || port > dev->caps.num_ports || + !mdev->pndev[port]) + return; + mlx4_warn(mdev, "Unhandled event %d for port %d\n", event, + (int) port); + } +} + +static void mlx4_en_remove(struct mlx4_dev *dev, void *endev_ptr) +{ + struct mlx4_en_dev *mdev = endev_ptr; + int i; + + mutex_lock(&mdev->state_lock); + mdev->device_up = false; + mutex_unlock(&mdev->state_lock); + + mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_ETH) + if (mdev->pndev[i]) + mlx4_en_destroy_netdev(mdev->pndev[i]); + + if (mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_TS) + mlx4_en_remove_timestamp(mdev); + + flush_workqueue(mdev->workqueue); + destroy_workqueue(mdev->workqueue); + (void) mlx4_mr_free(dev, &mdev->mr); + iounmap(mdev->uar_map); + mlx4_uar_free(dev, &mdev->priv_uar); + mlx4_pd_free(dev, mdev->priv_pdn); + kfree(mdev); +} + +static void *mlx4_en_add(struct mlx4_dev *dev) +{ + struct mlx4_en_dev *mdev; + int i; + int err; + + printk_once(KERN_INFO "%s", mlx4_en_version); + + mdev = kzalloc(sizeof(*mdev), GFP_KERNEL); + if (!mdev) { + err = -ENOMEM; + goto err_free_res; + } + + if (mlx4_pd_alloc(dev, &mdev->priv_pdn)) + goto err_free_dev; + + if (mlx4_uar_alloc(dev, &mdev->priv_uar)) + goto err_pd; + + mdev->uar_map = ioremap((phys_addr_t) mdev->priv_uar.pfn << PAGE_SHIFT, + PAGE_SIZE); + if (!mdev->uar_map) + goto err_uar; + spin_lock_init(&mdev->uar_lock); + + mdev->dev = dev; + mdev->dma_device = &(dev->pdev->dev); + mdev->pdev = dev->pdev; + mdev->device_up = false; + + mdev->LSO_support = !!(dev->caps.flags & (1 << 15)); + if (!mdev->LSO_support) + mlx4_warn(mdev, "LSO not supported, please upgrade to later FW version to enable LSO\n"); + + if (mlx4_mr_alloc(mdev->dev, mdev->priv_pdn, 0, ~0ull, + MLX4_PERM_LOCAL_WRITE | MLX4_PERM_LOCAL_READ, + 0, 0, &mdev->mr)) { + mlx4_err(mdev, "Failed allocating memory region\n"); + goto err_map; + } + if (mlx4_mr_enable(mdev->dev, &mdev->mr)) { + mlx4_err(mdev, "Failed enabling memory region\n"); + goto err_mr; + } + + /* Build device profile according to supplied module parameters */ + err = mlx4_en_get_profile(mdev); + if (err) { + mlx4_err(mdev, "Bad module parameters, aborting\n"); + goto err_mr; + } + + /* Configure which ports to start according to module parameters */ + mdev->port_cnt = 0; + mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_ETH) + mdev->port_cnt++; + + /* Initialize time stamp mechanism */ + if (mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_TS) + mlx4_en_init_timestamp(mdev); + + /* Set default number of RX rings*/ + mlx4_en_set_num_rx_rings(mdev); + + /* Create our own workqueue for reset/multicast tasks + * Note: we cannot use the shared workqueue because of deadlocks caused + * by the rtnl lock */ + mdev->workqueue = create_singlethread_workqueue("mlx4_en"); + if (!mdev->workqueue) { + err = -ENOMEM; + goto err_mr; + } + + /* At this stage all non-port specific tasks are complete: + * mark the card state as up */ + mutex_init(&mdev->state_lock); + mdev->device_up = true; + + /* Setup ports */ + + /* Create a netdev for each port */ + mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_ETH) { + mlx4_info(mdev, "Activating port:%d\n", i); + if (mlx4_en_init_netdev(mdev, i, &mdev->profile.prof[i])) + mdev->pndev[i] = NULL; + } + + return mdev; + +err_mr: + (void) mlx4_mr_free(dev, &mdev->mr); +err_map: + if (mdev->uar_map) + iounmap(mdev->uar_map); +err_uar: + mlx4_uar_free(dev, &mdev->priv_uar); +err_pd: + mlx4_pd_free(dev, mdev->priv_pdn); +err_free_dev: + kfree(mdev); +err_free_res: + return NULL; +} + +static struct mlx4_interface mlx4_en_interface = { + .add = mlx4_en_add, + .remove = mlx4_en_remove, + .event = mlx4_en_event, + .get_dev = mlx4_en_get_netdev, + .protocol = MLX4_PROT_ETH, +}; + +static void mlx4_en_verify_params(void) +{ + if (pfctx > MAX_PFC_TX) { + pr_warn("mlx4_en: WARNING: illegal module parameter pfctx 0x%x - should be in range 0-0x%x, will be changed to default (0)\n", + pfctx, MAX_PFC_TX); + pfctx = 0; + } + + if (pfcrx > MAX_PFC_RX) { + pr_warn("mlx4_en: WARNING: illegal module parameter pfcrx 0x%x - should be in range 0-0x%x, will be changed to default (0)\n", + pfcrx, MAX_PFC_RX); + pfcrx = 0; + } + + if (inline_thold < MIN_PKT_LEN || inline_thold > MAX_INLINE) { + pr_warn("mlx4_en: WARNING: illegal module parameter inline_thold %d - should be in range %d-%d, will be changed to default (%d)\n", + inline_thold, MIN_PKT_LEN, MAX_INLINE, MAX_INLINE); + inline_thold = MAX_INLINE; + } +} + +static int __init mlx4_en_init(void) +{ + mlx4_en_verify_params(); + + return mlx4_register_interface(&mlx4_en_interface); +} + +static void __exit mlx4_en_cleanup(void) +{ + mlx4_unregister_interface(&mlx4_en_interface); +} + +module_init(mlx4_en_init); +module_exit(mlx4_en_cleanup); + diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c new file mode 100644 index 0000000..4d69e38 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c @@ -0,0 +1,2652 @@ +/* + * Copyright (c) 2007 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "mlx4_en.h" +#include "en_port.h" + +int mlx4_en_setup_tc(struct net_device *dev, u8 up) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + int i; + unsigned int offset = 0; + + if (up && up != MLX4_EN_NUM_UP) + return -EINVAL; + + netdev_set_num_tc(dev, up); + + /* Partition Tx queues evenly amongst UP's */ + for (i = 0; i < up; i++) { + netdev_set_tc_queue(dev, i, priv->num_tx_rings_p_up, offset); + offset += priv->num_tx_rings_p_up; + } + + return 0; +} + +#ifdef CONFIG_NET_RX_BUSY_POLL +/* must be called with local_bh_disable()d */ +static int mlx4_en_low_latency_recv(struct napi_struct *napi) +{ + struct mlx4_en_cq *cq = container_of(napi, struct mlx4_en_cq, napi); + struct net_device *dev = cq->dev; + struct mlx4_en_priv *priv = netdev_priv(dev); + struct mlx4_en_rx_ring *rx_ring = priv->rx_ring[cq->ring]; + int done; + + if (!priv->port_up) + return LL_FLUSH_FAILED; + + if (!mlx4_en_cq_lock_poll(cq)) + return LL_FLUSH_BUSY; + + done = mlx4_en_process_rx_cq(dev, cq, 4); + if (likely(done)) + rx_ring->cleaned += done; + else + rx_ring->misses++; + + mlx4_en_cq_unlock_poll(cq); + + return done; +} +#endif /* CONFIG_NET_RX_BUSY_POLL */ + +#ifdef CONFIG_RFS_ACCEL + +struct mlx4_en_filter { + struct list_head next; + struct work_struct work; + + u8 ip_proto; + __be32 src_ip; + __be32 dst_ip; + __be16 src_port; + __be16 dst_port; + + int rxq_index; + struct mlx4_en_priv *priv; + u32 flow_id; /* RFS infrastructure id */ + int id; /* mlx4_en driver id */ + u64 reg_id; /* Flow steering API id */ + u8 activated; /* Used to prevent expiry before filter + * is attached + */ + struct hlist_node filter_chain; +}; + +static void mlx4_en_filter_rfs_expire(struct mlx4_en_priv *priv); + +static enum mlx4_net_trans_rule_id mlx4_ip_proto_to_trans_rule_id(u8 ip_proto) +{ + switch (ip_proto) { + case IPPROTO_UDP: + return MLX4_NET_TRANS_RULE_ID_UDP; + case IPPROTO_TCP: + return MLX4_NET_TRANS_RULE_ID_TCP; + default: + return MLX4_NET_TRANS_RULE_NUM; + } +}; + +static void mlx4_en_filter_work(struct work_struct *work) +{ + struct mlx4_en_filter *filter = container_of(work, + struct mlx4_en_filter, + work); + struct mlx4_en_priv *priv = filter->priv; + struct mlx4_spec_list spec_tcp_udp = { + .id = mlx4_ip_proto_to_trans_rule_id(filter->ip_proto), + { + .tcp_udp = { + .dst_port = filter->dst_port, + .dst_port_msk = (__force __be16)-1, + .src_port = filter->src_port, + .src_port_msk = (__force __be16)-1, + }, + }, + }; + struct mlx4_spec_list spec_ip = { + .id = MLX4_NET_TRANS_RULE_ID_IPV4, + { + .ipv4 = { + .dst_ip = filter->dst_ip, + .dst_ip_msk = (__force __be32)-1, + .src_ip = filter->src_ip, + .src_ip_msk = (__force __be32)-1, + }, + }, + }; + struct mlx4_spec_list spec_eth = { + .id = MLX4_NET_TRANS_RULE_ID_ETH, + }; + struct mlx4_net_trans_rule rule = { + .list = LIST_HEAD_INIT(rule.list), + .queue_mode = MLX4_NET_TRANS_Q_LIFO, + .exclusive = 1, + .allow_loopback = 1, + .promisc_mode = MLX4_FS_REGULAR, + .port = priv->port, + .priority = MLX4_DOMAIN_RFS, + }; + int rc; + __be64 mac_mask = cpu_to_be64(MLX4_MAC_MASK << 16); + + if (spec_tcp_udp.id >= MLX4_NET_TRANS_RULE_NUM) { + en_warn(priv, "RFS: ignoring unsupported ip protocol (%d)\n", + filter->ip_proto); + goto ignore; + } + list_add_tail(&spec_eth.list, &rule.list); + list_add_tail(&spec_ip.list, &rule.list); + list_add_tail(&spec_tcp_udp.list, &rule.list); + + rule.qpn = priv->rss_map.qps[filter->rxq_index].qpn; + memcpy(spec_eth.eth.dst_mac, priv->dev->dev_addr, ETH_ALEN); + memcpy(spec_eth.eth.dst_mac_msk, &mac_mask, ETH_ALEN); + + filter->activated = 0; + + if (filter->reg_id) { + rc = mlx4_flow_detach(priv->mdev->dev, filter->reg_id); + if (rc && rc != -ENOENT) + en_err(priv, "Error detaching flow. rc = %d\n", rc); + } + + rc = mlx4_flow_attach(priv->mdev->dev, &rule, &filter->reg_id); + if (rc) + en_err(priv, "Error attaching flow. err = %d\n", rc); + +ignore: + mlx4_en_filter_rfs_expire(priv); + + filter->activated = 1; +} + +static inline struct hlist_head * +filter_hash_bucket(struct mlx4_en_priv *priv, __be32 src_ip, __be32 dst_ip, + __be16 src_port, __be16 dst_port) +{ + unsigned long l; + int bucket_idx; + + l = (__force unsigned long)src_port | + ((__force unsigned long)dst_port << 2); + l ^= (__force unsigned long)(src_ip ^ dst_ip); + + bucket_idx = hash_long(l, MLX4_EN_FILTER_HASH_SHIFT); + + return &priv->filter_hash[bucket_idx]; +} + +static struct mlx4_en_filter * +mlx4_en_filter_alloc(struct mlx4_en_priv *priv, int rxq_index, __be32 src_ip, + __be32 dst_ip, u8 ip_proto, __be16 src_port, + __be16 dst_port, u32 flow_id) +{ + struct mlx4_en_filter *filter = NULL; + + filter = kzalloc(sizeof(struct mlx4_en_filter), GFP_ATOMIC); + if (!filter) + return NULL; + + filter->priv = priv; + filter->rxq_index = rxq_index; + INIT_WORK(&filter->work, mlx4_en_filter_work); + + filter->src_ip = src_ip; + filter->dst_ip = dst_ip; + filter->ip_proto = ip_proto; + filter->src_port = src_port; + filter->dst_port = dst_port; + + filter->flow_id = flow_id; + + filter->id = priv->last_filter_id++ % RPS_NO_FILTER; + + list_add_tail(&filter->next, &priv->filters); + hlist_add_head(&filter->filter_chain, + filter_hash_bucket(priv, src_ip, dst_ip, src_port, + dst_port)); + + return filter; +} + +static void mlx4_en_filter_free(struct mlx4_en_filter *filter) +{ + struct mlx4_en_priv *priv = filter->priv; + int rc; + + list_del(&filter->next); + + rc = mlx4_flow_detach(priv->mdev->dev, filter->reg_id); + if (rc && rc != -ENOENT) + en_err(priv, "Error detaching flow. rc = %d\n", rc); + + kfree(filter); +} + +static inline struct mlx4_en_filter * +mlx4_en_filter_find(struct mlx4_en_priv *priv, __be32 src_ip, __be32 dst_ip, + u8 ip_proto, __be16 src_port, __be16 dst_port) +{ + struct mlx4_en_filter *filter; + struct mlx4_en_filter *ret = NULL; + + hlist_for_each_entry(filter, + filter_hash_bucket(priv, src_ip, dst_ip, + src_port, dst_port), + filter_chain) { + if (filter->src_ip == src_ip && + filter->dst_ip == dst_ip && + filter->ip_proto == ip_proto && + filter->src_port == src_port && + filter->dst_port == dst_port) { + ret = filter; + break; + } + } + + return ret; +} + +static int +mlx4_en_filter_rfs(struct net_device *net_dev, const struct sk_buff *skb, + u16 rxq_index, u32 flow_id) +{ + struct mlx4_en_priv *priv = netdev_priv(net_dev); + struct mlx4_en_filter *filter; + const struct iphdr *ip; + const __be16 *ports; + u8 ip_proto; + __be32 src_ip; + __be32 dst_ip; + __be16 src_port; + __be16 dst_port; + int nhoff = skb_network_offset(skb); + int ret = 0; + + if (skb->protocol != htons(ETH_P_IP)) + return -EPROTONOSUPPORT; + + ip = (const struct iphdr *)(skb->data + nhoff); + if (ip_is_fragment(ip)) + return -EPROTONOSUPPORT; + + if ((ip->protocol != IPPROTO_TCP) && (ip->protocol != IPPROTO_UDP)) + return -EPROTONOSUPPORT; + ports = (const __be16 *)(skb->data + nhoff + 4 * ip->ihl); + + ip_proto = ip->protocol; + src_ip = ip->saddr; + dst_ip = ip->daddr; + src_port = ports[0]; + dst_port = ports[1]; + + spin_lock_bh(&priv->filters_lock); + filter = mlx4_en_filter_find(priv, src_ip, dst_ip, ip_proto, + src_port, dst_port); + if (filter) { + if (filter->rxq_index == rxq_index) + goto out; + + filter->rxq_index = rxq_index; + } else { + filter = mlx4_en_filter_alloc(priv, rxq_index, + src_ip, dst_ip, ip_proto, + src_port, dst_port, flow_id); + if (!filter) { + ret = -ENOMEM; + goto err; + } + } + + queue_work(priv->mdev->workqueue, &filter->work); + +out: + ret = filter->id; +err: + spin_unlock_bh(&priv->filters_lock); + + return ret; +} + +void mlx4_en_cleanup_filters(struct mlx4_en_priv *priv) +{ + struct mlx4_en_filter *filter, *tmp; + LIST_HEAD(del_list); + + spin_lock_bh(&priv->filters_lock); + list_for_each_entry_safe(filter, tmp, &priv->filters, next) { + list_move(&filter->next, &del_list); + hlist_del(&filter->filter_chain); + } + spin_unlock_bh(&priv->filters_lock); + + list_for_each_entry_safe(filter, tmp, &del_list, next) { + cancel_work_sync(&filter->work); + mlx4_en_filter_free(filter); + } +} + +static void mlx4_en_filter_rfs_expire(struct mlx4_en_priv *priv) +{ + struct mlx4_en_filter *filter = NULL, *tmp, *last_filter = NULL; + LIST_HEAD(del_list); + int i = 0; + + spin_lock_bh(&priv->filters_lock); + list_for_each_entry_safe(filter, tmp, &priv->filters, next) { + if (i > MLX4_EN_FILTER_EXPIRY_QUOTA) + break; + + if (filter->activated && + !work_pending(&filter->work) && + rps_may_expire_flow(priv->dev, + filter->rxq_index, filter->flow_id, + filter->id)) { + list_move(&filter->next, &del_list); + hlist_del(&filter->filter_chain); + } else + last_filter = filter; + + i++; + } + + if (last_filter && (&last_filter->next != priv->filters.next)) + list_move(&priv->filters, &last_filter->next); + + spin_unlock_bh(&priv->filters_lock); + + list_for_each_entry_safe(filter, tmp, &del_list, next) + mlx4_en_filter_free(filter); +} +#endif + +static int mlx4_en_vlan_rx_add_vid(struct net_device *dev, + __be16 proto, u16 vid) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + struct mlx4_en_dev *mdev = priv->mdev; + int err; + int idx; + + en_dbg(HW, priv, "adding VLAN:%d\n", vid); + + set_bit(vid, priv->active_vlans); + + /* Add VID to port VLAN filter */ + mutex_lock(&mdev->state_lock); + if (mdev->device_up && priv->port_up) { + err = mlx4_SET_VLAN_FLTR(mdev->dev, priv); + if (err) + en_err(priv, "Failed configuring VLAN filter\n"); + } + if (mlx4_register_vlan(mdev->dev, priv->port, vid, &idx)) + en_dbg(HW, priv, "failed adding vlan %d\n", vid); + mutex_unlock(&mdev->state_lock); + + return 0; +} + +static int mlx4_en_vlan_rx_kill_vid(struct net_device *dev, + __be16 proto, u16 vid) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + struct mlx4_en_dev *mdev = priv->mdev; + int err; + + en_dbg(HW, priv, "Killing VID:%d\n", vid); + + clear_bit(vid, priv->active_vlans); + + /* Remove VID from port VLAN filter */ + mutex_lock(&mdev->state_lock); + mlx4_unregister_vlan(mdev->dev, priv->port, vid); + + if (mdev->device_up && priv->port_up) { + err = mlx4_SET_VLAN_FLTR(mdev->dev, priv); + if (err) + en_err(priv, "Failed configuring VLAN filter\n"); + } + mutex_unlock(&mdev->state_lock); + + return 0; +} + +static void mlx4_en_u64_to_mac(unsigned char dst_mac[ETH_ALEN + 2], u64 src_mac) +{ + int i; + for (i = ETH_ALEN - 1; i >= 0; --i) { + dst_mac[i] = src_mac & 0xff; + src_mac >>= 8; + } + memset(&dst_mac[ETH_ALEN], 0, 2); +} + + +static int mlx4_en_tunnel_steer_add(struct mlx4_en_priv *priv, unsigned char *addr, + int qpn, u64 *reg_id) +{ + int err; + + if (priv->mdev->dev->caps.tunnel_offload_mode != MLX4_TUNNEL_OFFLOAD_MODE_VXLAN) + return 0; /* do nothing */ + + err = mlx4_tunnel_steer_add(priv->mdev->dev, addr, priv->port, qpn, + MLX4_DOMAIN_NIC, reg_id); + if (err) { + en_err(priv, "failed to add vxlan steering rule, err %d\n", err); + return err; + } + en_dbg(DRV, priv, "added vxlan steering rule, mac %pM reg_id %llx\n", addr, *reg_id); + return 0; +} + + +static int mlx4_en_uc_steer_add(struct mlx4_en_priv *priv, + unsigned char *mac, int *qpn, u64 *reg_id) +{ + struct mlx4_en_dev *mdev = priv->mdev; + struct mlx4_dev *dev = mdev->dev; + int err; + + switch (dev->caps.steering_mode) { + case MLX4_STEERING_MODE_B0: { + struct mlx4_qp qp; + u8 gid[16] = {0}; + + qp.qpn = *qpn; + memcpy(&gid[10], mac, ETH_ALEN); + gid[5] = priv->port; + + err = mlx4_unicast_attach(dev, &qp, gid, 0, MLX4_PROT_ETH); + break; + } + case MLX4_STEERING_MODE_DEVICE_MANAGED: { + struct mlx4_spec_list spec_eth = { {NULL} }; + __be64 mac_mask = cpu_to_be64(MLX4_MAC_MASK << 16); + + struct mlx4_net_trans_rule rule = { + .queue_mode = MLX4_NET_TRANS_Q_FIFO, + .exclusive = 0, + .allow_loopback = 1, + .promisc_mode = MLX4_FS_REGULAR, + .priority = MLX4_DOMAIN_NIC, + }; + + rule.port = priv->port; + rule.qpn = *qpn; + INIT_LIST_HEAD(&rule.list); + + spec_eth.id = MLX4_NET_TRANS_RULE_ID_ETH; + memcpy(spec_eth.eth.dst_mac, mac, ETH_ALEN); + memcpy(spec_eth.eth.dst_mac_msk, &mac_mask, ETH_ALEN); + list_add_tail(&spec_eth.list, &rule.list); + + err = mlx4_flow_attach(dev, &rule, reg_id); + break; + } + default: + return -EINVAL; + } + if (err) + en_warn(priv, "Failed Attaching Unicast\n"); + + return err; +} + +static void mlx4_en_uc_steer_release(struct mlx4_en_priv *priv, + unsigned char *mac, int qpn, u64 reg_id) +{ + struct mlx4_en_dev *mdev = priv->mdev; + struct mlx4_dev *dev = mdev->dev; + + switch (dev->caps.steering_mode) { + case MLX4_STEERING_MODE_B0: { + struct mlx4_qp qp; + u8 gid[16] = {0}; + + qp.qpn = qpn; + memcpy(&gid[10], mac, ETH_ALEN); + gid[5] = priv->port; + + mlx4_unicast_detach(dev, &qp, gid, MLX4_PROT_ETH); + break; + } + case MLX4_STEERING_MODE_DEVICE_MANAGED: { + mlx4_flow_detach(dev, reg_id); + break; + } + default: + en_err(priv, "Invalid steering mode.\n"); + } +} + +static int mlx4_en_get_qp(struct mlx4_en_priv *priv) +{ + struct mlx4_en_dev *mdev = priv->mdev; + struct mlx4_dev *dev = mdev->dev; + struct mlx4_mac_entry *entry; + int index = 0; + int err = 0; + u64 reg_id; + int *qpn = &priv->base_qpn; + u64 mac = mlx4_mac_to_u64(priv->dev->dev_addr); + + en_dbg(DRV, priv, "Registering MAC: %pM for adding\n", + priv->dev->dev_addr); + index = mlx4_register_mac(dev, priv->port, mac); + if (index < 0) { + err = index; + en_err(priv, "Failed adding MAC: %pM\n", + priv->dev->dev_addr); + return err; + } + + if (dev->caps.steering_mode == MLX4_STEERING_MODE_A0) { + int base_qpn = mlx4_get_base_qpn(dev, priv->port); + *qpn = base_qpn + index; + return 0; + } + + err = mlx4_qp_reserve_range(dev, 1, 1, qpn); + en_dbg(DRV, priv, "Reserved qp %d\n", *qpn); + if (err) { + en_err(priv, "Failed to reserve qp for mac registration\n"); + goto qp_err; + } + + err = mlx4_en_uc_steer_add(priv, priv->dev->dev_addr, qpn, ®_id); + if (err) + goto steer_err; + + err = mlx4_en_tunnel_steer_add(priv, priv->dev->dev_addr, *qpn, + &priv->tunnel_reg_id); + if (err) + goto tunnel_err; + + entry = kmalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) { + err = -ENOMEM; + goto alloc_err; + } + memcpy(entry->mac, priv->dev->dev_addr, sizeof(entry->mac)); + memcpy(priv->current_mac, entry->mac, sizeof(priv->current_mac)); + entry->reg_id = reg_id; + + hlist_add_head_rcu(&entry->hlist, + &priv->mac_hash[entry->mac[MLX4_EN_MAC_HASH_IDX]]); + + return 0; + +alloc_err: + if (priv->tunnel_reg_id) + mlx4_flow_detach(priv->mdev->dev, priv->tunnel_reg_id); +tunnel_err: + mlx4_en_uc_steer_release(priv, priv->dev->dev_addr, *qpn, reg_id); + +steer_err: + mlx4_qp_release_range(dev, *qpn, 1); + +qp_err: + mlx4_unregister_mac(dev, priv->port, mac); + return err; +} + +static void mlx4_en_put_qp(struct mlx4_en_priv *priv) +{ + struct mlx4_en_dev *mdev = priv->mdev; + struct mlx4_dev *dev = mdev->dev; + int qpn = priv->base_qpn; + u64 mac; + + if (dev->caps.steering_mode == MLX4_STEERING_MODE_A0) { + mac = mlx4_mac_to_u64(priv->dev->dev_addr); + en_dbg(DRV, priv, "Registering MAC: %pM for deleting\n", + priv->dev->dev_addr); + mlx4_unregister_mac(dev, priv->port, mac); + } else { + struct mlx4_mac_entry *entry; + struct hlist_node *tmp; + struct hlist_head *bucket; + unsigned int i; + + for (i = 0; i < MLX4_EN_MAC_HASH_SIZE; ++i) { + bucket = &priv->mac_hash[i]; + hlist_for_each_entry_safe(entry, tmp, bucket, hlist) { + mac = mlx4_mac_to_u64(entry->mac); + en_dbg(DRV, priv, "Registering MAC: %pM for deleting\n", + entry->mac); + mlx4_en_uc_steer_release(priv, entry->mac, + qpn, entry->reg_id); + + mlx4_unregister_mac(dev, priv->port, mac); + hlist_del_rcu(&entry->hlist); + kfree_rcu(entry, rcu); + } + } + + if (priv->tunnel_reg_id) { + mlx4_flow_detach(priv->mdev->dev, priv->tunnel_reg_id); + priv->tunnel_reg_id = 0; + } + + en_dbg(DRV, priv, "Releasing qp: port %d, qpn %d\n", + priv->port, qpn); + mlx4_qp_release_range(dev, qpn, 1); + priv->flags &= ~MLX4_EN_FLAG_FORCE_PROMISC; + } +} + +static int mlx4_en_replace_mac(struct mlx4_en_priv *priv, int qpn, + unsigned char *new_mac, unsigned char *prev_mac) +{ + struct mlx4_en_dev *mdev = priv->mdev; + struct mlx4_dev *dev = mdev->dev; + int err = 0; + u64 new_mac_u64 = mlx4_mac_to_u64(new_mac); + + if (dev->caps.steering_mode != MLX4_STEERING_MODE_A0) { + struct hlist_head *bucket; + unsigned int mac_hash; + struct mlx4_mac_entry *entry; + struct hlist_node *tmp; + u64 prev_mac_u64 = mlx4_mac_to_u64(prev_mac); + + bucket = &priv->mac_hash[prev_mac[MLX4_EN_MAC_HASH_IDX]]; + hlist_for_each_entry_safe(entry, tmp, bucket, hlist) { + if (ether_addr_equal_64bits(entry->mac, prev_mac)) { + mlx4_en_uc_steer_release(priv, entry->mac, + qpn, entry->reg_id); + mlx4_unregister_mac(dev, priv->port, + prev_mac_u64); + hlist_del_rcu(&entry->hlist); + synchronize_rcu(); + memcpy(entry->mac, new_mac, ETH_ALEN); + entry->reg_id = 0; + mac_hash = new_mac[MLX4_EN_MAC_HASH_IDX]; + hlist_add_head_rcu(&entry->hlist, + &priv->mac_hash[mac_hash]); + mlx4_register_mac(dev, priv->port, new_mac_u64); + err = mlx4_en_uc_steer_add(priv, new_mac, + &qpn, + &entry->reg_id); + if (err) + return err; + if (priv->tunnel_reg_id) { + mlx4_flow_detach(priv->mdev->dev, priv->tunnel_reg_id); + priv->tunnel_reg_id = 0; + } + err = mlx4_en_tunnel_steer_add(priv, new_mac, qpn, + &priv->tunnel_reg_id); + return err; + } + } + return -EINVAL; + } + + return __mlx4_replace_mac(dev, priv->port, qpn, new_mac_u64); +} + +static int mlx4_en_do_set_mac(struct mlx4_en_priv *priv, + unsigned char new_mac[ETH_ALEN + 2]) +{ + int err = 0; + + if (priv->port_up) { + /* Remove old MAC and insert the new one */ + err = mlx4_en_replace_mac(priv, priv->base_qpn, + new_mac, priv->current_mac); + if (err) + en_err(priv, "Failed changing HW MAC address\n"); + } else + en_dbg(HW, priv, "Port is down while registering mac, exiting...\n"); + + if (!err) + memcpy(priv->current_mac, new_mac, sizeof(priv->current_mac)); + + return err; +} + +static int mlx4_en_set_mac(struct net_device *dev, void *addr) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + struct mlx4_en_dev *mdev = priv->mdev; + struct sockaddr *saddr = addr; + unsigned char new_mac[ETH_ALEN + 2]; + int err; + + if (!is_valid_ether_addr(saddr->sa_data)) + return -EADDRNOTAVAIL; + + mutex_lock(&mdev->state_lock); + memcpy(new_mac, saddr->sa_data, ETH_ALEN); + err = mlx4_en_do_set_mac(priv, new_mac); + if (!err) + memcpy(dev->dev_addr, saddr->sa_data, ETH_ALEN); + mutex_unlock(&mdev->state_lock); + + return err; +} + +static void mlx4_en_clear_list(struct net_device *dev) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + struct mlx4_en_mc_list *tmp, *mc_to_del; + + list_for_each_entry_safe(mc_to_del, tmp, &priv->mc_list, list) { + list_del(&mc_to_del->list); + kfree(mc_to_del); + } +} + +static void mlx4_en_cache_mclist(struct net_device *dev) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + struct netdev_hw_addr *ha; + struct mlx4_en_mc_list *tmp; + + mlx4_en_clear_list(dev); + netdev_for_each_mc_addr(ha, dev) { + tmp = kzalloc(sizeof(struct mlx4_en_mc_list), GFP_ATOMIC); + if (!tmp) { + mlx4_en_clear_list(dev); + return; + } + memcpy(tmp->addr, ha->addr, ETH_ALEN); + list_add_tail(&tmp->list, &priv->mc_list); + } +} + +static void update_mclist_flags(struct mlx4_en_priv *priv, + struct list_head *dst, + struct list_head *src) +{ + struct mlx4_en_mc_list *dst_tmp, *src_tmp, *new_mc; + bool found; + + /* Find all the entries that should be removed from dst, + * These are the entries that are not found in src + */ + list_for_each_entry(dst_tmp, dst, list) { + found = false; + list_for_each_entry(src_tmp, src, list) { + if (ether_addr_equal(dst_tmp->addr, src_tmp->addr)) { + found = true; + break; + } + } + if (!found) + dst_tmp->action = MCLIST_REM; + } + + /* Add entries that exist in src but not in dst + * mark them as need to add + */ + list_for_each_entry(src_tmp, src, list) { + found = false; + list_for_each_entry(dst_tmp, dst, list) { + if (ether_addr_equal(dst_tmp->addr, src_tmp->addr)) { + dst_tmp->action = MCLIST_NONE; + found = true; + break; + } + } + if (!found) { + new_mc = kmemdup(src_tmp, + sizeof(struct mlx4_en_mc_list), + GFP_KERNEL); + if (!new_mc) + return; + + new_mc->action = MCLIST_ADD; + list_add_tail(&new_mc->list, dst); + } + } +} + +static void mlx4_en_set_rx_mode(struct net_device *dev) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + + if (!priv->port_up) + return; + + queue_work(priv->mdev->workqueue, &priv->rx_mode_task); +} + +static void mlx4_en_set_promisc_mode(struct mlx4_en_priv *priv, + struct mlx4_en_dev *mdev) +{ + int err = 0; + + if (!(priv->flags & MLX4_EN_FLAG_PROMISC)) { + if (netif_msg_rx_status(priv)) + en_warn(priv, "Entering promiscuous mode\n"); + priv->flags |= MLX4_EN_FLAG_PROMISC; + + /* Enable promiscouos mode */ + switch (mdev->dev->caps.steering_mode) { + case MLX4_STEERING_MODE_DEVICE_MANAGED: + err = mlx4_flow_steer_promisc_add(mdev->dev, + priv->port, + priv->base_qpn, + MLX4_FS_ALL_DEFAULT); + if (err) + en_err(priv, "Failed enabling promiscuous mode\n"); + priv->flags |= MLX4_EN_FLAG_MC_PROMISC; + break; + + case MLX4_STEERING_MODE_B0: + err = mlx4_unicast_promisc_add(mdev->dev, + priv->base_qpn, + priv->port); + if (err) + en_err(priv, "Failed enabling unicast promiscuous mode\n"); + + /* Add the default qp number as multicast + * promisc + */ + if (!(priv->flags & MLX4_EN_FLAG_MC_PROMISC)) { + err = mlx4_multicast_promisc_add(mdev->dev, + priv->base_qpn, + priv->port); + if (err) + en_err(priv, "Failed enabling multicast promiscuous mode\n"); + priv->flags |= MLX4_EN_FLAG_MC_PROMISC; + } + break; + + case MLX4_STEERING_MODE_A0: + err = mlx4_SET_PORT_qpn_calc(mdev->dev, + priv->port, + priv->base_qpn, + 1); + if (err) + en_err(priv, "Failed enabling promiscuous mode\n"); + break; + } + + /* Disable port multicast filter (unconditionally) */ + err = mlx4_SET_MCAST_FLTR(mdev->dev, priv->port, 0, + 0, MLX4_MCAST_DISABLE); + if (err) + en_err(priv, "Failed disabling multicast filter\n"); + } +} + +static void mlx4_en_clear_promisc_mode(struct mlx4_en_priv *priv, + struct mlx4_en_dev *mdev) +{ + int err = 0; + + if (netif_msg_rx_status(priv)) + en_warn(priv, "Leaving promiscuous mode\n"); + priv->flags &= ~MLX4_EN_FLAG_PROMISC; + + /* Disable promiscouos mode */ + switch (mdev->dev->caps.steering_mode) { + case MLX4_STEERING_MODE_DEVICE_MANAGED: + err = mlx4_flow_steer_promisc_remove(mdev->dev, + priv->port, + MLX4_FS_ALL_DEFAULT); + if (err) + en_err(priv, "Failed disabling promiscuous mode\n"); + priv->flags &= ~MLX4_EN_FLAG_MC_PROMISC; + break; + + case MLX4_STEERING_MODE_B0: + err = mlx4_unicast_promisc_remove(mdev->dev, + priv->base_qpn, + priv->port); + if (err) + en_err(priv, "Failed disabling unicast promiscuous mode\n"); + /* Disable Multicast promisc */ + if (priv->flags & MLX4_EN_FLAG_MC_PROMISC) { + err = mlx4_multicast_promisc_remove(mdev->dev, + priv->base_qpn, + priv->port); + if (err) + en_err(priv, "Failed disabling multicast promiscuous mode\n"); + priv->flags &= ~MLX4_EN_FLAG_MC_PROMISC; + } + break; + + case MLX4_STEERING_MODE_A0: + err = mlx4_SET_PORT_qpn_calc(mdev->dev, + priv->port, + priv->base_qpn, 0); + if (err) + en_err(priv, "Failed disabling promiscuous mode\n"); + break; + } +} + +static void mlx4_en_do_multicast(struct mlx4_en_priv *priv, + struct net_device *dev, + struct mlx4_en_dev *mdev) +{ + struct mlx4_en_mc_list *mclist, *tmp; + u64 mcast_addr = 0; + u8 mc_list[16] = {0}; + int err = 0; + + /* Enable/disable the multicast filter according to IFF_ALLMULTI */ + if (dev->flags & IFF_ALLMULTI) { + err = mlx4_SET_MCAST_FLTR(mdev->dev, priv->port, 0, + 0, MLX4_MCAST_DISABLE); + if (err) + en_err(priv, "Failed disabling multicast filter\n"); + + /* Add the default qp number as multicast promisc */ + if (!(priv->flags & MLX4_EN_FLAG_MC_PROMISC)) { + switch (mdev->dev->caps.steering_mode) { + case MLX4_STEERING_MODE_DEVICE_MANAGED: + err = mlx4_flow_steer_promisc_add(mdev->dev, + priv->port, + priv->base_qpn, + MLX4_FS_MC_DEFAULT); + break; + + case MLX4_STEERING_MODE_B0: + err = mlx4_multicast_promisc_add(mdev->dev, + priv->base_qpn, + priv->port); + break; + + case MLX4_STEERING_MODE_A0: + break; + } + if (err) + en_err(priv, "Failed entering multicast promisc mode\n"); + priv->flags |= MLX4_EN_FLAG_MC_PROMISC; + } + } else { + /* Disable Multicast promisc */ + if (priv->flags & MLX4_EN_FLAG_MC_PROMISC) { + switch (mdev->dev->caps.steering_mode) { + case MLX4_STEERING_MODE_DEVICE_MANAGED: + err = mlx4_flow_steer_promisc_remove(mdev->dev, + priv->port, + MLX4_FS_MC_DEFAULT); + break; + + case MLX4_STEERING_MODE_B0: + err = mlx4_multicast_promisc_remove(mdev->dev, + priv->base_qpn, + priv->port); + break; + + case MLX4_STEERING_MODE_A0: + break; + } + if (err) + en_err(priv, "Failed disabling multicast promiscuous mode\n"); + priv->flags &= ~MLX4_EN_FLAG_MC_PROMISC; + } + + err = mlx4_SET_MCAST_FLTR(mdev->dev, priv->port, 0, + 0, MLX4_MCAST_DISABLE); + if (err) + en_err(priv, "Failed disabling multicast filter\n"); + + /* Flush mcast filter and init it with broadcast address */ + mlx4_SET_MCAST_FLTR(mdev->dev, priv->port, ETH_BCAST, + 1, MLX4_MCAST_CONFIG); + + /* Update multicast list - we cache all addresses so they won't + * change while HW is updated holding the command semaphor */ + netif_addr_lock_bh(dev); + mlx4_en_cache_mclist(dev); + netif_addr_unlock_bh(dev); + list_for_each_entry(mclist, &priv->mc_list, list) { + mcast_addr = mlx4_mac_to_u64(mclist->addr); + mlx4_SET_MCAST_FLTR(mdev->dev, priv->port, + mcast_addr, 0, MLX4_MCAST_CONFIG); + } + err = mlx4_SET_MCAST_FLTR(mdev->dev, priv->port, 0, + 0, MLX4_MCAST_ENABLE); + if (err) + en_err(priv, "Failed enabling multicast filter\n"); + + update_mclist_flags(priv, &priv->curr_list, &priv->mc_list); + list_for_each_entry_safe(mclist, tmp, &priv->curr_list, list) { + if (mclist->action == MCLIST_REM) { + /* detach this address and delete from list */ + memcpy(&mc_list[10], mclist->addr, ETH_ALEN); + mc_list[5] = priv->port; + err = mlx4_multicast_detach(mdev->dev, + &priv->rss_map.indir_qp, + mc_list, + MLX4_PROT_ETH, + mclist->reg_id); + if (err) + en_err(priv, "Fail to detach multicast address\n"); + + if (mclist->tunnel_reg_id) { + err = mlx4_flow_detach(priv->mdev->dev, mclist->tunnel_reg_id); + if (err) + en_err(priv, "Failed to detach multicast address\n"); + } + + /* remove from list */ + list_del(&mclist->list); + kfree(mclist); + } else if (mclist->action == MCLIST_ADD) { + /* attach the address */ + memcpy(&mc_list[10], mclist->addr, ETH_ALEN); + /* needed for B0 steering support */ + mc_list[5] = priv->port; + err = mlx4_multicast_attach(mdev->dev, + &priv->rss_map.indir_qp, + mc_list, + priv->port, 0, + MLX4_PROT_ETH, + &mclist->reg_id); + if (err) + en_err(priv, "Fail to attach multicast address\n"); + + err = mlx4_en_tunnel_steer_add(priv, &mc_list[10], priv->base_qpn, + &mclist->tunnel_reg_id); + if (err) + en_err(priv, "Failed to attach multicast address\n"); + } + } + } +} + +static void mlx4_en_do_uc_filter(struct mlx4_en_priv *priv, + struct net_device *dev, + struct mlx4_en_dev *mdev) +{ + struct netdev_hw_addr *ha; + struct mlx4_mac_entry *entry; + struct hlist_node *tmp; + bool found; + u64 mac; + int err = 0; + struct hlist_head *bucket; + unsigned int i; + int removed = 0; + u32 prev_flags; + + /* Note that we do not need to protect our mac_hash traversal with rcu, + * since all modification code is protected by mdev->state_lock + */ + + /* find what to remove */ + for (i = 0; i < MLX4_EN_MAC_HASH_SIZE; ++i) { + bucket = &priv->mac_hash[i]; + hlist_for_each_entry_safe(entry, tmp, bucket, hlist) { + found = false; + netdev_for_each_uc_addr(ha, dev) { + if (ether_addr_equal_64bits(entry->mac, + ha->addr)) { + found = true; + break; + } + } + + /* MAC address of the port is not in uc list */ + if (ether_addr_equal_64bits(entry->mac, + priv->current_mac)) + found = true; + + if (!found) { + mac = mlx4_mac_to_u64(entry->mac); + mlx4_en_uc_steer_release(priv, entry->mac, + priv->base_qpn, + entry->reg_id); + mlx4_unregister_mac(mdev->dev, priv->port, mac); + + hlist_del_rcu(&entry->hlist); + kfree_rcu(entry, rcu); + en_dbg(DRV, priv, "Removed MAC %pM on port:%d\n", + entry->mac, priv->port); + ++removed; + } + } + } + + /* if we didn't remove anything, there is no use in trying to add + * again once we are in a forced promisc mode state + */ + if ((priv->flags & MLX4_EN_FLAG_FORCE_PROMISC) && 0 == removed) + return; + + prev_flags = priv->flags; + priv->flags &= ~MLX4_EN_FLAG_FORCE_PROMISC; + + /* find what to add */ + netdev_for_each_uc_addr(ha, dev) { + found = false; + bucket = &priv->mac_hash[ha->addr[MLX4_EN_MAC_HASH_IDX]]; + hlist_for_each_entry(entry, bucket, hlist) { + if (ether_addr_equal_64bits(entry->mac, ha->addr)) { + found = true; + break; + } + } + + if (!found) { + entry = kmalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) { + en_err(priv, "Failed adding MAC %pM on port:%d (out of memory)\n", + ha->addr, priv->port); + priv->flags |= MLX4_EN_FLAG_FORCE_PROMISC; + break; + } + mac = mlx4_mac_to_u64(ha->addr); + memcpy(entry->mac, ha->addr, ETH_ALEN); + err = mlx4_register_mac(mdev->dev, priv->port, mac); + if (err < 0) { + en_err(priv, "Failed registering MAC %pM on port %d: %d\n", + ha->addr, priv->port, err); + kfree(entry); + priv->flags |= MLX4_EN_FLAG_FORCE_PROMISC; + break; + } + err = mlx4_en_uc_steer_add(priv, ha->addr, + &priv->base_qpn, + &entry->reg_id); + if (err) { + en_err(priv, "Failed adding MAC %pM on port %d: %d\n", + ha->addr, priv->port, err); + mlx4_unregister_mac(mdev->dev, priv->port, mac); + kfree(entry); + priv->flags |= MLX4_EN_FLAG_FORCE_PROMISC; + break; + } else { + unsigned int mac_hash; + en_dbg(DRV, priv, "Added MAC %pM on port:%d\n", + ha->addr, priv->port); + mac_hash = ha->addr[MLX4_EN_MAC_HASH_IDX]; + bucket = &priv->mac_hash[mac_hash]; + hlist_add_head_rcu(&entry->hlist, bucket); + } + } + } + + if (priv->flags & MLX4_EN_FLAG_FORCE_PROMISC) { + en_warn(priv, "Forcing promiscuous mode on port:%d\n", + priv->port); + } else if (prev_flags & MLX4_EN_FLAG_FORCE_PROMISC) { + en_warn(priv, "Stop forcing promiscuous mode on port:%d\n", + priv->port); + } +} + +static void mlx4_en_do_set_rx_mode(struct work_struct *work) +{ + struct mlx4_en_priv *priv = container_of(work, struct mlx4_en_priv, + rx_mode_task); + struct mlx4_en_dev *mdev = priv->mdev; + struct net_device *dev = priv->dev; + + mutex_lock(&mdev->state_lock); + if (!mdev->device_up) { + en_dbg(HW, priv, "Card is not up, ignoring rx mode change.\n"); + goto out; + } + if (!priv->port_up) { + en_dbg(HW, priv, "Port is down, ignoring rx mode change.\n"); + goto out; + } + + if (!netif_carrier_ok(dev)) { + if (!mlx4_en_QUERY_PORT(mdev, priv->port)) { + if (priv->port_state.link_state) { + priv->last_link_state = MLX4_DEV_EVENT_PORT_UP; + netif_carrier_on(dev); + en_dbg(LINK, priv, "Link Up\n"); + } + } + } + + if (dev->priv_flags & IFF_UNICAST_FLT) + mlx4_en_do_uc_filter(priv, dev, mdev); + + /* Promsicuous mode: disable all filters */ + if ((dev->flags & IFF_PROMISC) || + (priv->flags & MLX4_EN_FLAG_FORCE_PROMISC)) { + mlx4_en_set_promisc_mode(priv, mdev); + goto out; + } + + /* Not in promiscuous mode */ + if (priv->flags & MLX4_EN_FLAG_PROMISC) + mlx4_en_clear_promisc_mode(priv, mdev); + + mlx4_en_do_multicast(priv, dev, mdev); +out: + mutex_unlock(&mdev->state_lock); +} + +#ifdef CONFIG_NET_POLL_CONTROLLER +static void mlx4_en_netpoll(struct net_device *dev) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + struct mlx4_en_cq *cq; + int i; + + for (i = 0; i < priv->rx_ring_num; i++) { + cq = priv->rx_cq[i]; + napi_schedule(&cq->napi); + } +} +#endif + +static void mlx4_en_tx_timeout(struct net_device *dev) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + struct mlx4_en_dev *mdev = priv->mdev; + int i; + + if (netif_msg_timer(priv)) + en_warn(priv, "Tx timeout called on port:%d\n", priv->port); + + for (i = 0; i < priv->tx_ring_num; i++) { + if (!netif_tx_queue_stopped(netdev_get_tx_queue(dev, i))) + continue; + en_warn(priv, "TX timeout on queue: %d, QP: 0x%x, CQ: 0x%x, Cons: 0x%x, Prod: 0x%x\n", + i, priv->tx_ring[i]->qpn, priv->tx_ring[i]->cqn, + priv->tx_ring[i]->cons, priv->tx_ring[i]->prod); + } + + priv->port_stats.tx_timeout++; + en_dbg(DRV, priv, "Scheduling watchdog\n"); + queue_work(mdev->workqueue, &priv->watchdog_task); +} + + +static struct net_device_stats *mlx4_en_get_stats(struct net_device *dev) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + + spin_lock_bh(&priv->stats_lock); + memcpy(&priv->ret_stats, &priv->stats, sizeof(priv->stats)); + spin_unlock_bh(&priv->stats_lock); + + return &priv->ret_stats; +} + +static void mlx4_en_set_default_moderation(struct mlx4_en_priv *priv) +{ + struct mlx4_en_cq *cq; + int i; + + /* If we haven't received a specific coalescing setting + * (module param), we set the moderation parameters as follows: + * - moder_cnt is set to the number of mtu sized packets to + * satisfy our coalescing target. + * - moder_time is set to a fixed value. + */ + priv->rx_frames = MLX4_EN_RX_COAL_TARGET; + priv->rx_usecs = MLX4_EN_RX_COAL_TIME; + priv->tx_frames = MLX4_EN_TX_COAL_PKTS; + priv->tx_usecs = MLX4_EN_TX_COAL_TIME; + en_dbg(INTR, priv, "Default coalesing params for mtu:%d - rx_frames:%d rx_usecs:%d\n", + priv->dev->mtu, priv->rx_frames, priv->rx_usecs); + + /* Setup cq moderation params */ + for (i = 0; i < priv->rx_ring_num; i++) { + cq = priv->rx_cq[i]; + cq->moder_cnt = priv->rx_frames; + cq->moder_time = priv->rx_usecs; + priv->last_moder_time[i] = MLX4_EN_AUTO_CONF; + priv->last_moder_packets[i] = 0; + priv->last_moder_bytes[i] = 0; + } + + for (i = 0; i < priv->tx_ring_num; i++) { + cq = priv->tx_cq[i]; + cq->moder_cnt = priv->tx_frames; + cq->moder_time = priv->tx_usecs; + } + + /* Reset auto-moderation params */ + priv->pkt_rate_low = MLX4_EN_RX_RATE_LOW; + priv->rx_usecs_low = MLX4_EN_RX_COAL_TIME_LOW; + priv->pkt_rate_high = MLX4_EN_RX_RATE_HIGH; + priv->rx_usecs_high = MLX4_EN_RX_COAL_TIME_HIGH; + priv->sample_interval = MLX4_EN_SAMPLE_INTERVAL; + priv->adaptive_rx_coal = 1; + priv->last_moder_jiffies = 0; + priv->last_moder_tx_packets = 0; +} + +static void mlx4_en_auto_moderation(struct mlx4_en_priv *priv) +{ + unsigned long period = (unsigned long) (jiffies - priv->last_moder_jiffies); + struct mlx4_en_cq *cq; + unsigned long packets; + unsigned long rate; + unsigned long avg_pkt_size; + unsigned long rx_packets; + unsigned long rx_bytes; + unsigned long rx_pkt_diff; + int moder_time; + int ring, err; + + if (!priv->adaptive_rx_coal || period < priv->sample_interval * HZ) + return; + + for (ring = 0; ring < priv->rx_ring_num; ring++) { + spin_lock_bh(&priv->stats_lock); + rx_packets = priv->rx_ring[ring]->packets; + rx_bytes = priv->rx_ring[ring]->bytes; + spin_unlock_bh(&priv->stats_lock); + + rx_pkt_diff = ((unsigned long) (rx_packets - + priv->last_moder_packets[ring])); + packets = rx_pkt_diff; + rate = packets * HZ / period; + avg_pkt_size = packets ? ((unsigned long) (rx_bytes - + priv->last_moder_bytes[ring])) / packets : 0; + + /* Apply auto-moderation only when packet rate + * exceeds a rate that it matters */ + if (rate > (MLX4_EN_RX_RATE_THRESH / priv->rx_ring_num) && + avg_pkt_size > MLX4_EN_AVG_PKT_SMALL) { + if (rate < priv->pkt_rate_low) + moder_time = priv->rx_usecs_low; + else if (rate > priv->pkt_rate_high) + moder_time = priv->rx_usecs_high; + else + moder_time = (rate - priv->pkt_rate_low) * + (priv->rx_usecs_high - priv->rx_usecs_low) / + (priv->pkt_rate_high - priv->pkt_rate_low) + + priv->rx_usecs_low; + } else { + moder_time = priv->rx_usecs_low; + } + + if (moder_time != priv->last_moder_time[ring]) { + priv->last_moder_time[ring] = moder_time; + cq = priv->rx_cq[ring]; + cq->moder_time = moder_time; + cq->moder_cnt = priv->rx_frames; + err = mlx4_en_set_cq_moder(priv, cq); + if (err) + en_err(priv, "Failed modifying moderation for cq:%d\n", + ring); + } + priv->last_moder_packets[ring] = rx_packets; + priv->last_moder_bytes[ring] = rx_bytes; + } + + priv->last_moder_jiffies = jiffies; +} + +static void mlx4_en_do_get_stats(struct work_struct *work) +{ + struct delayed_work *delay = to_delayed_work(work); + struct mlx4_en_priv *priv = container_of(delay, struct mlx4_en_priv, + stats_task); + struct mlx4_en_dev *mdev = priv->mdev; + int err; + + mutex_lock(&mdev->state_lock); + if (mdev->device_up) { + if (priv->port_up) { + err = mlx4_en_DUMP_ETH_STATS(mdev, priv->port, 0); + if (err) + en_dbg(HW, priv, "Could not update stats\n"); + + mlx4_en_auto_moderation(priv); + } + + queue_delayed_work(mdev->workqueue, &priv->stats_task, STATS_DELAY); + } + if (mdev->mac_removed[MLX4_MAX_PORTS + 1 - priv->port]) { + mlx4_en_do_set_mac(priv, priv->current_mac); + mdev->mac_removed[MLX4_MAX_PORTS + 1 - priv->port] = 0; + } + mutex_unlock(&mdev->state_lock); +} + +/* mlx4_en_service_task - Run service task for tasks that needed to be done + * periodically + */ +static void mlx4_en_service_task(struct work_struct *work) +{ + struct delayed_work *delay = to_delayed_work(work); + struct mlx4_en_priv *priv = container_of(delay, struct mlx4_en_priv, + service_task); + struct mlx4_en_dev *mdev = priv->mdev; + + mutex_lock(&mdev->state_lock); + if (mdev->device_up) { + if (mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_TS) + mlx4_en_ptp_overflow_check(mdev); + + queue_delayed_work(mdev->workqueue, &priv->service_task, + SERVICE_TASK_DELAY); + } + mutex_unlock(&mdev->state_lock); +} + +static void mlx4_en_linkstate(struct work_struct *work) +{ + struct mlx4_en_priv *priv = container_of(work, struct mlx4_en_priv, + linkstate_task); + struct mlx4_en_dev *mdev = priv->mdev; + int linkstate = priv->link_state; + + mutex_lock(&mdev->state_lock); + /* If observable port state changed set carrier state and + * report to system log */ + if (priv->last_link_state != linkstate) { + if (linkstate == MLX4_DEV_EVENT_PORT_DOWN) { + en_info(priv, "Link Down\n"); + netif_carrier_off(priv->dev); + } else { + en_info(priv, "Link Up\n"); + netif_carrier_on(priv->dev); + } + } + priv->last_link_state = linkstate; + mutex_unlock(&mdev->state_lock); +} + +static int mlx4_en_init_affinity_hint(struct mlx4_en_priv *priv, int ring_idx) +{ + struct mlx4_en_rx_ring *ring = priv->rx_ring[ring_idx]; + int numa_node = priv->mdev->dev->numa_node; + int ret = 0; + + if (!zalloc_cpumask_var(&ring->affinity_mask, GFP_KERNEL)) + return -ENOMEM; + + ret = cpumask_set_cpu_local_first(ring_idx, numa_node, + ring->affinity_mask); + if (ret) + free_cpumask_var(ring->affinity_mask); + + return ret; +} + +static void mlx4_en_free_affinity_hint(struct mlx4_en_priv *priv, int ring_idx) +{ + free_cpumask_var(priv->rx_ring[ring_idx]->affinity_mask); +} + +int mlx4_en_start_port(struct net_device *dev) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + struct mlx4_en_dev *mdev = priv->mdev; + struct mlx4_en_cq *cq; + struct mlx4_en_tx_ring *tx_ring; + int rx_index = 0; + int tx_index = 0; + int err = 0; + int i; + int j; + u8 mc_list[16] = {0}; + + if (priv->port_up) { + en_dbg(DRV, priv, "start port called while port already up\n"); + return 0; + } + + INIT_LIST_HEAD(&priv->mc_list); + INIT_LIST_HEAD(&priv->curr_list); + INIT_LIST_HEAD(&priv->ethtool_list); + memset(&priv->ethtool_rules[0], 0, + sizeof(struct ethtool_flow_id) * MAX_NUM_OF_FS_RULES); + + /* Calculate Rx buf size */ + dev->mtu = min(dev->mtu, priv->max_mtu); + mlx4_en_calc_rx_buf(dev); + en_dbg(DRV, priv, "Rx buf size:%d\n", priv->rx_skb_size); + + /* Configure rx cq's and rings */ + err = mlx4_en_activate_rx_rings(priv); + if (err) { + en_err(priv, "Failed to activate RX rings\n"); + return err; + } + for (i = 0; i < priv->rx_ring_num; i++) { + cq = priv->rx_cq[i]; + + mlx4_en_cq_init_lock(cq); + + err = mlx4_en_init_affinity_hint(priv, i); + if (err) { + en_err(priv, "Failed preparing IRQ affinity hint\n"); + goto cq_err; + } + + err = mlx4_en_activate_cq(priv, cq, i); + if (err) { + en_err(priv, "Failed activating Rx CQ\n"); + mlx4_en_free_affinity_hint(priv, i); + goto cq_err; + } + for (j = 0; j < cq->size; j++) + cq->buf[j].owner_sr_opcode = MLX4_CQE_OWNER_MASK; + err = mlx4_en_set_cq_moder(priv, cq); + if (err) { + en_err(priv, "Failed setting cq moderation parameters\n"); + mlx4_en_deactivate_cq(priv, cq); + mlx4_en_free_affinity_hint(priv, i); + goto cq_err; + } + mlx4_en_arm_cq(priv, cq); + priv->rx_ring[i]->cqn = cq->mcq.cqn; + ++rx_index; + } + + /* Set qp number */ + en_dbg(DRV, priv, "Getting qp number for port %d\n", priv->port); + err = mlx4_en_get_qp(priv); + if (err) { + en_err(priv, "Failed getting eth qp\n"); + goto cq_err; + } + mdev->mac_removed[priv->port] = 0; + + err = mlx4_en_config_rss_steer(priv); + if (err) { + en_err(priv, "Failed configuring rss steering\n"); + goto mac_err; + } + + err = mlx4_en_create_drop_qp(priv); + if (err) + goto rss_err; + + /* Configure tx cq's and rings */ + for (i = 0; i < priv->tx_ring_num; i++) { + /* Configure cq */ + cq = priv->tx_cq[i]; + err = mlx4_en_activate_cq(priv, cq, i); + if (err) { + en_err(priv, "Failed allocating Tx CQ\n"); + goto tx_err; + } + err = mlx4_en_set_cq_moder(priv, cq); + if (err) { + en_err(priv, "Failed setting cq moderation parameters\n"); + mlx4_en_deactivate_cq(priv, cq); + goto tx_err; + } + en_dbg(DRV, priv, "Resetting index of collapsed CQ:%d to -1\n", i); + cq->buf->wqe_index = cpu_to_be16(0xffff); + + /* Configure ring */ + tx_ring = priv->tx_ring[i]; + err = mlx4_en_activate_tx_ring(priv, tx_ring, cq->mcq.cqn, + i / priv->num_tx_rings_p_up); + if (err) { + en_err(priv, "Failed allocating Tx ring\n"); + mlx4_en_deactivate_cq(priv, cq); + goto tx_err; + } + tx_ring->tx_queue = netdev_get_tx_queue(dev, i); + + /* Arm CQ for TX completions */ + mlx4_en_arm_cq(priv, cq); + + /* Set initial ownership of all Tx TXBBs to SW (1) */ + for (j = 0; j < tx_ring->buf_size; j += STAMP_STRIDE) + *((u32 *) (tx_ring->buf + j)) = 0xffffffff; + ++tx_index; + } + + /* Configure port */ + err = mlx4_SET_PORT_general(mdev->dev, priv->port, + priv->rx_skb_size + ETH_FCS_LEN, + priv->prof->tx_pause, + priv->prof->tx_ppp, + priv->prof->rx_pause, + priv->prof->rx_ppp); + if (err) { + en_err(priv, "Failed setting port general configurations for port %d, with error %d\n", + priv->port, err); + goto tx_err; + } + /* Set default qp number */ + err = mlx4_SET_PORT_qpn_calc(mdev->dev, priv->port, priv->base_qpn, 0); + if (err) { + en_err(priv, "Failed setting default qp numbers\n"); + goto tx_err; + } + + if (mdev->dev->caps.tunnel_offload_mode == MLX4_TUNNEL_OFFLOAD_MODE_VXLAN) { + err = mlx4_SET_PORT_VXLAN(mdev->dev, priv->port, VXLAN_STEER_BY_OUTER_MAC, 1); + if (err) { + en_err(priv, "Failed setting port L2 tunnel configuration, err %d\n", + err); + goto tx_err; + } + } + + /* Init port */ + en_dbg(HW, priv, "Initializing port\n"); + err = mlx4_INIT_PORT(mdev->dev, priv->port); + if (err) { + en_err(priv, "Failed Initializing port\n"); + goto tx_err; + } + + /* Attach rx QP to bradcast address */ + memset(&mc_list[10], 0xff, ETH_ALEN); + mc_list[5] = priv->port; /* needed for B0 steering support */ + if (mlx4_multicast_attach(mdev->dev, &priv->rss_map.indir_qp, mc_list, + priv->port, 0, MLX4_PROT_ETH, + &priv->broadcast_id)) + mlx4_warn(mdev, "Failed Attaching Broadcast\n"); + + /* Must redo promiscuous mode setup. */ + priv->flags &= ~(MLX4_EN_FLAG_PROMISC | MLX4_EN_FLAG_MC_PROMISC); + + /* Schedule multicast task to populate multicast list */ + queue_work(mdev->workqueue, &priv->rx_mode_task); + + mlx4_set_stats_bitmap(mdev->dev, &priv->stats_bitmap); + +#ifdef CONFIG_MLX4_EN_VXLAN + if (priv->mdev->dev->caps.tunnel_offload_mode == MLX4_TUNNEL_OFFLOAD_MODE_VXLAN) + vxlan_get_rx_port(dev); +#endif + priv->port_up = true; + netif_tx_start_all_queues(dev); + netif_device_attach(dev); + + return 0; + +tx_err: + while (tx_index--) { + mlx4_en_deactivate_tx_ring(priv, priv->tx_ring[tx_index]); + mlx4_en_deactivate_cq(priv, priv->tx_cq[tx_index]); + } + mlx4_en_destroy_drop_qp(priv); +rss_err: + mlx4_en_release_rss_steer(priv); +mac_err: + mlx4_en_put_qp(priv); +cq_err: + while (rx_index--) { + mlx4_en_deactivate_cq(priv, priv->rx_cq[rx_index]); + mlx4_en_free_affinity_hint(priv, i); + } + for (i = 0; i < priv->rx_ring_num; i++) + mlx4_en_deactivate_rx_ring(priv, priv->rx_ring[i]); + + return err; /* need to close devices */ +} + + +void mlx4_en_stop_port(struct net_device *dev, int detach) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + struct mlx4_en_dev *mdev = priv->mdev; + struct mlx4_en_mc_list *mclist, *tmp; + struct ethtool_flow_id *flow, *tmp_flow; + int i; + u8 mc_list[16] = {0}; + + if (!priv->port_up) { + en_dbg(DRV, priv, "stop port called while port already down\n"); + return; + } + + /* close port*/ + mlx4_CLOSE_PORT(mdev->dev, priv->port); + + /* Synchronize with tx routine */ + netif_tx_lock_bh(dev); + if (detach) + netif_device_detach(dev); + netif_tx_stop_all_queues(dev); + netif_tx_unlock_bh(dev); + + netif_tx_disable(dev); + + /* Set port as not active */ + priv->port_up = false; + + /* Promsicuous mode */ + if (mdev->dev->caps.steering_mode == + MLX4_STEERING_MODE_DEVICE_MANAGED) { + priv->flags &= ~(MLX4_EN_FLAG_PROMISC | + MLX4_EN_FLAG_MC_PROMISC); + mlx4_flow_steer_promisc_remove(mdev->dev, + priv->port, + MLX4_FS_ALL_DEFAULT); + mlx4_flow_steer_promisc_remove(mdev->dev, + priv->port, + MLX4_FS_MC_DEFAULT); + } else if (priv->flags & MLX4_EN_FLAG_PROMISC) { + priv->flags &= ~MLX4_EN_FLAG_PROMISC; + + /* Disable promiscouos mode */ + mlx4_unicast_promisc_remove(mdev->dev, priv->base_qpn, + priv->port); + + /* Disable Multicast promisc */ + if (priv->flags & MLX4_EN_FLAG_MC_PROMISC) { + mlx4_multicast_promisc_remove(mdev->dev, priv->base_qpn, + priv->port); + priv->flags &= ~MLX4_EN_FLAG_MC_PROMISC; + } + } + + /* Detach All multicasts */ + memset(&mc_list[10], 0xff, ETH_ALEN); + mc_list[5] = priv->port; /* needed for B0 steering support */ + mlx4_multicast_detach(mdev->dev, &priv->rss_map.indir_qp, mc_list, + MLX4_PROT_ETH, priv->broadcast_id); + list_for_each_entry(mclist, &priv->curr_list, list) { + memcpy(&mc_list[10], mclist->addr, ETH_ALEN); + mc_list[5] = priv->port; + mlx4_multicast_detach(mdev->dev, &priv->rss_map.indir_qp, + mc_list, MLX4_PROT_ETH, mclist->reg_id); + if (mclist->tunnel_reg_id) + mlx4_flow_detach(mdev->dev, mclist->tunnel_reg_id); + } + mlx4_en_clear_list(dev); + list_for_each_entry_safe(mclist, tmp, &priv->curr_list, list) { + list_del(&mclist->list); + kfree(mclist); + } + + /* Flush multicast filter */ + mlx4_SET_MCAST_FLTR(mdev->dev, priv->port, 0, 1, MLX4_MCAST_CONFIG); + + /* Remove flow steering rules for the port*/ + if (mdev->dev->caps.steering_mode == + MLX4_STEERING_MODE_DEVICE_MANAGED) { + ASSERT_RTNL(); + list_for_each_entry_safe(flow, tmp_flow, + &priv->ethtool_list, list) { + mlx4_flow_detach(mdev->dev, flow->id); + list_del(&flow->list); + } + } + + mlx4_en_destroy_drop_qp(priv); + + /* Free TX Rings */ + for (i = 0; i < priv->tx_ring_num; i++) { + mlx4_en_deactivate_tx_ring(priv, priv->tx_ring[i]); + mlx4_en_deactivate_cq(priv, priv->tx_cq[i]); + } + msleep(10); + + for (i = 0; i < priv->tx_ring_num; i++) + mlx4_en_free_tx_buf(dev, priv->tx_ring[i]); + + /* Free RSS qps */ + mlx4_en_release_rss_steer(priv); + + /* Unregister Mac address for the port */ + mlx4_en_put_qp(priv); + if (!(mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_REASSIGN_MAC_EN)) + mdev->mac_removed[priv->port] = 1; + + /* Free RX Rings */ + for (i = 0; i < priv->rx_ring_num; i++) { + struct mlx4_en_cq *cq = priv->rx_cq[i]; + + local_bh_disable(); + while (!mlx4_en_cq_lock_napi(cq)) { + pr_info("CQ %d locked\n", i); + mdelay(1); + } + local_bh_enable(); + + while (test_bit(NAPI_STATE_SCHED, &cq->napi.state)) + msleep(1); + mlx4_en_deactivate_rx_ring(priv, priv->rx_ring[i]); + mlx4_en_deactivate_cq(priv, cq); + + mlx4_en_free_affinity_hint(priv, i); + } +} + +static void mlx4_en_restart(struct work_struct *work) +{ + struct mlx4_en_priv *priv = container_of(work, struct mlx4_en_priv, + watchdog_task); + struct mlx4_en_dev *mdev = priv->mdev; + struct net_device *dev = priv->dev; + + en_dbg(DRV, priv, "Watchdog task called for port %d\n", priv->port); + + mutex_lock(&mdev->state_lock); + if (priv->port_up) { + mlx4_en_stop_port(dev, 1); + if (mlx4_en_start_port(dev)) + en_err(priv, "Failed restarting port %d\n", priv->port); + } + mutex_unlock(&mdev->state_lock); +} + +static void mlx4_en_clear_stats(struct net_device *dev) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + struct mlx4_en_dev *mdev = priv->mdev; + int i; + + if (mlx4_en_DUMP_ETH_STATS(mdev, priv->port, 1)) + en_dbg(HW, priv, "Failed dumping statistics\n"); + + memset(&priv->stats, 0, sizeof(priv->stats)); + memset(&priv->pstats, 0, sizeof(priv->pstats)); + memset(&priv->pkstats, 0, sizeof(priv->pkstats)); + memset(&priv->port_stats, 0, sizeof(priv->port_stats)); + + for (i = 0; i < priv->tx_ring_num; i++) { + priv->tx_ring[i]->bytes = 0; + priv->tx_ring[i]->packets = 0; + priv->tx_ring[i]->tx_csum = 0; + } + for (i = 0; i < priv->rx_ring_num; i++) { + priv->rx_ring[i]->bytes = 0; + priv->rx_ring[i]->packets = 0; + priv->rx_ring[i]->csum_ok = 0; + priv->rx_ring[i]->csum_none = 0; + } +} + +static int mlx4_en_open(struct net_device *dev) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + struct mlx4_en_dev *mdev = priv->mdev; + int err = 0; + + mutex_lock(&mdev->state_lock); + + if (!mdev->device_up) { + en_err(priv, "Cannot open - device down/disabled\n"); + err = -EBUSY; + goto out; + } + + /* Reset HW statistics and SW counters */ + mlx4_en_clear_stats(dev); + + err = mlx4_en_start_port(dev); + if (err) + en_err(priv, "Failed starting port:%d\n", priv->port); + +out: + mutex_unlock(&mdev->state_lock); + return err; +} + + +static int mlx4_en_close(struct net_device *dev) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + struct mlx4_en_dev *mdev = priv->mdev; + + en_dbg(IFDOWN, priv, "Close port called\n"); + + mutex_lock(&mdev->state_lock); + + mlx4_en_stop_port(dev, 0); + netif_carrier_off(dev); + + mutex_unlock(&mdev->state_lock); + return 0; +} + +void mlx4_en_free_resources(struct mlx4_en_priv *priv) +{ + int i; + +#ifdef CONFIG_RFS_ACCEL + free_irq_cpu_rmap(priv->dev->rx_cpu_rmap); + priv->dev->rx_cpu_rmap = NULL; +#endif + + for (i = 0; i < priv->tx_ring_num; i++) { + if (priv->tx_ring && priv->tx_ring[i]) + mlx4_en_destroy_tx_ring(priv, &priv->tx_ring[i]); + if (priv->tx_cq && priv->tx_cq[i]) + mlx4_en_destroy_cq(priv, &priv->tx_cq[i]); + } + + for (i = 0; i < priv->rx_ring_num; i++) { + if (priv->rx_ring[i]) + mlx4_en_destroy_rx_ring(priv, &priv->rx_ring[i], + priv->prof->rx_ring_size, priv->stride); + if (priv->rx_cq[i]) + mlx4_en_destroy_cq(priv, &priv->rx_cq[i]); + } + + if (priv->base_tx_qpn) { + mlx4_qp_release_range(priv->mdev->dev, priv->base_tx_qpn, priv->tx_ring_num); + priv->base_tx_qpn = 0; + } +} + +int mlx4_en_alloc_resources(struct mlx4_en_priv *priv) +{ + struct mlx4_en_port_profile *prof = priv->prof; + int i; + int err; + int node; + + err = mlx4_qp_reserve_range(priv->mdev->dev, priv->tx_ring_num, 256, &priv->base_tx_qpn); + if (err) { + en_err(priv, "failed reserving range for TX rings\n"); + return err; + } + + /* Create tx Rings */ + for (i = 0; i < priv->tx_ring_num; i++) { + node = cpu_to_node(i % num_online_cpus()); + if (mlx4_en_create_cq(priv, &priv->tx_cq[i], + prof->tx_ring_size, i, TX, node)) + goto err; + + if (mlx4_en_create_tx_ring(priv, &priv->tx_ring[i], + priv->base_tx_qpn + i, + prof->tx_ring_size, TXBB_SIZE, + node, i)) + goto err; + } + + /* Create rx Rings */ + for (i = 0; i < priv->rx_ring_num; i++) { + node = cpu_to_node(i % num_online_cpus()); + if (mlx4_en_create_cq(priv, &priv->rx_cq[i], + prof->rx_ring_size, i, RX, node)) + goto err; + + if (mlx4_en_create_rx_ring(priv, &priv->rx_ring[i], + prof->rx_ring_size, priv->stride, + node)) + goto err; + } + +#ifdef CONFIG_RFS_ACCEL + if (priv->mdev->dev->caps.comp_pool) { + priv->dev->rx_cpu_rmap = alloc_irq_cpu_rmap(priv->mdev->dev->caps.comp_pool); + if (!priv->dev->rx_cpu_rmap) + goto err; + } +#endif + + return 0; + +err: + en_err(priv, "Failed to allocate NIC resources\n"); + for (i = 0; i < priv->rx_ring_num; i++) { + if (priv->rx_ring[i]) + mlx4_en_destroy_rx_ring(priv, &priv->rx_ring[i], + prof->rx_ring_size, + priv->stride); + if (priv->rx_cq[i]) + mlx4_en_destroy_cq(priv, &priv->rx_cq[i]); + } + for (i = 0; i < priv->tx_ring_num; i++) { + if (priv->tx_ring[i]) + mlx4_en_destroy_tx_ring(priv, &priv->tx_ring[i]); + if (priv->tx_cq[i]) + mlx4_en_destroy_cq(priv, &priv->tx_cq[i]); + } + return -ENOMEM; +} + + +void mlx4_en_destroy_netdev(struct net_device *dev) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + struct mlx4_en_dev *mdev = priv->mdev; + + en_dbg(DRV, priv, "Destroying netdev on port:%d\n", priv->port); + + /* Unregister device - this will close the port if it was up */ + if (priv->registered) + unregister_netdev(dev); + + if (priv->allocated) + mlx4_free_hwq_res(mdev->dev, &priv->res, MLX4_EN_PAGE_SIZE); + + cancel_delayed_work(&priv->stats_task); + cancel_delayed_work(&priv->service_task); + /* flush any pending task for this netdev */ + flush_workqueue(mdev->workqueue); + + /* Detach the netdev so tasks would not attempt to access it */ + mutex_lock(&mdev->state_lock); + mdev->pndev[priv->port] = NULL; + mutex_unlock(&mdev->state_lock); + + mlx4_en_free_resources(priv); + + kfree(priv->tx_ring); + kfree(priv->tx_cq); + + free_netdev(dev); +} + +static int mlx4_en_change_mtu(struct net_device *dev, int new_mtu) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + struct mlx4_en_dev *mdev = priv->mdev; + int err = 0; + + en_dbg(DRV, priv, "Change MTU called - current:%d new:%d\n", + dev->mtu, new_mtu); + + if ((new_mtu < MLX4_EN_MIN_MTU) || (new_mtu > priv->max_mtu)) { + en_err(priv, "Bad MTU size:%d.\n", new_mtu); + return -EPERM; + } + dev->mtu = new_mtu; + + if (netif_running(dev)) { + mutex_lock(&mdev->state_lock); + if (!mdev->device_up) { + /* NIC is probably restarting - let watchdog task reset + * the port */ + en_dbg(DRV, priv, "Change MTU called with card down!?\n"); + } else { + mlx4_en_stop_port(dev, 1); + err = mlx4_en_start_port(dev); + if (err) { + en_err(priv, "Failed restarting port:%d\n", + priv->port); + queue_work(mdev->workqueue, &priv->watchdog_task); + } + } + mutex_unlock(&mdev->state_lock); + } + return 0; +} + +static int mlx4_en_hwtstamp_set(struct net_device *dev, struct ifreq *ifr) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + struct mlx4_en_dev *mdev = priv->mdev; + struct hwtstamp_config config; + + if (copy_from_user(&config, ifr->ifr_data, sizeof(config))) + return -EFAULT; + + /* reserved for future extensions */ + if (config.flags) + return -EINVAL; + + /* device doesn't support time stamping */ + if (!(mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_TS)) + return -EINVAL; + + /* TX HW timestamp */ + switch (config.tx_type) { + case HWTSTAMP_TX_OFF: + case HWTSTAMP_TX_ON: + break; + default: + return -ERANGE; + } + + /* RX HW timestamp */ + switch (config.rx_filter) { + case HWTSTAMP_FILTER_NONE: + break; + case HWTSTAMP_FILTER_ALL: + case HWTSTAMP_FILTER_SOME: + case HWTSTAMP_FILTER_PTP_V1_L4_EVENT: + case HWTSTAMP_FILTER_PTP_V1_L4_SYNC: + case HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ: + case HWTSTAMP_FILTER_PTP_V2_L4_EVENT: + case HWTSTAMP_FILTER_PTP_V2_L4_SYNC: + case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ: + case HWTSTAMP_FILTER_PTP_V2_L2_EVENT: + case HWTSTAMP_FILTER_PTP_V2_L2_SYNC: + case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ: + case HWTSTAMP_FILTER_PTP_V2_EVENT: + case HWTSTAMP_FILTER_PTP_V2_SYNC: + case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ: + config.rx_filter = HWTSTAMP_FILTER_ALL; + break; + default: + return -ERANGE; + } + + if (mlx4_en_timestamp_config(dev, config.tx_type, config.rx_filter)) { + config.tx_type = HWTSTAMP_TX_OFF; + config.rx_filter = HWTSTAMP_FILTER_NONE; + } + + return copy_to_user(ifr->ifr_data, &config, + sizeof(config)) ? -EFAULT : 0; +} + +static int mlx4_en_hwtstamp_get(struct net_device *dev, struct ifreq *ifr) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + + return copy_to_user(ifr->ifr_data, &priv->hwtstamp_config, + sizeof(priv->hwtstamp_config)) ? -EFAULT : 0; +} + +static int mlx4_en_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) +{ + switch (cmd) { + case SIOCSHWTSTAMP: + return mlx4_en_hwtstamp_set(dev, ifr); + case SIOCGHWTSTAMP: + return mlx4_en_hwtstamp_get(dev, ifr); + default: + return -EOPNOTSUPP; + } +} + +static int mlx4_en_set_features(struct net_device *netdev, + netdev_features_t features) +{ + struct mlx4_en_priv *priv = netdev_priv(netdev); + + if (features & NETIF_F_LOOPBACK) + priv->ctrl_flags |= cpu_to_be32(MLX4_WQE_CTRL_FORCE_LOOPBACK); + else + priv->ctrl_flags &= + cpu_to_be32(~MLX4_WQE_CTRL_FORCE_LOOPBACK); + + mlx4_en_update_loopback_state(netdev, features); + + return 0; + +} + +static int mlx4_en_set_vf_mac(struct net_device *dev, int queue, u8 *mac) +{ + struct mlx4_en_priv *en_priv = netdev_priv(dev); + struct mlx4_en_dev *mdev = en_priv->mdev; + u64 mac_u64 = mlx4_mac_to_u64(mac); + + if (!is_valid_ether_addr(mac)) + return -EINVAL; + + return mlx4_set_vf_mac(mdev->dev, en_priv->port, queue, mac_u64); +} + +static int mlx4_en_set_vf_vlan(struct net_device *dev, int vf, u16 vlan, u8 qos) +{ + struct mlx4_en_priv *en_priv = netdev_priv(dev); + struct mlx4_en_dev *mdev = en_priv->mdev; + + return mlx4_set_vf_vlan(mdev->dev, en_priv->port, vf, vlan, qos); +} + +static int mlx4_en_set_vf_spoofchk(struct net_device *dev, int vf, bool setting) +{ + struct mlx4_en_priv *en_priv = netdev_priv(dev); + struct mlx4_en_dev *mdev = en_priv->mdev; + + return mlx4_set_vf_spoofchk(mdev->dev, en_priv->port, vf, setting); +} + +static int mlx4_en_get_vf_config(struct net_device *dev, int vf, struct ifla_vf_info *ivf) +{ + struct mlx4_en_priv *en_priv = netdev_priv(dev); + struct mlx4_en_dev *mdev = en_priv->mdev; + + return mlx4_get_vf_config(mdev->dev, en_priv->port, vf, ivf); +} + +static int mlx4_en_set_vf_link_state(struct net_device *dev, int vf, int link_state) +{ + struct mlx4_en_priv *en_priv = netdev_priv(dev); + struct mlx4_en_dev *mdev = en_priv->mdev; + + return mlx4_set_vf_link_state(mdev->dev, en_priv->port, vf, link_state); +} + +#define PORT_ID_BYTE_LEN 8 +static int mlx4_en_get_phys_port_id(struct net_device *dev, + struct netdev_phys_port_id *ppid) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + struct mlx4_dev *mdev = priv->mdev->dev; + int i; + u64 phys_port_id = mdev->caps.phys_port_id[priv->port]; + + if (!phys_port_id) + return -EOPNOTSUPP; + + ppid->id_len = sizeof(phys_port_id); + for (i = PORT_ID_BYTE_LEN - 1; i >= 0; --i) { + ppid->id[i] = phys_port_id & 0xff; + phys_port_id >>= 8; + } + return 0; +} + +#ifdef CONFIG_MLX4_EN_VXLAN +static void mlx4_en_add_vxlan_offloads(struct work_struct *work) +{ + int ret; + struct mlx4_en_priv *priv = container_of(work, struct mlx4_en_priv, + vxlan_add_task); + + ret = mlx4_config_vxlan_port(priv->mdev->dev, priv->vxlan_port); + if (ret) + goto out; + + ret = mlx4_SET_PORT_VXLAN(priv->mdev->dev, priv->port, + VXLAN_STEER_BY_OUTER_MAC, 1); +out: + if (ret) { + en_err(priv, "failed setting L2 tunnel configuration ret %d\n", ret); + return; + } + + /* set offloads */ + priv->dev->hw_enc_features |= NETIF_F_IP_CSUM | NETIF_F_RXCSUM | + NETIF_F_TSO | NETIF_F_GSO_UDP_TUNNEL; + priv->dev->hw_features |= NETIF_F_GSO_UDP_TUNNEL; + priv->dev->features |= NETIF_F_GSO_UDP_TUNNEL; +} + +static void mlx4_en_del_vxlan_offloads(struct work_struct *work) +{ + int ret; + struct mlx4_en_priv *priv = container_of(work, struct mlx4_en_priv, + vxlan_del_task); + /* unset offloads */ + priv->dev->hw_enc_features &= ~(NETIF_F_IP_CSUM | NETIF_F_RXCSUM | + NETIF_F_TSO | NETIF_F_GSO_UDP_TUNNEL); + priv->dev->hw_features &= ~NETIF_F_GSO_UDP_TUNNEL; + priv->dev->features &= ~NETIF_F_GSO_UDP_TUNNEL; + + ret = mlx4_SET_PORT_VXLAN(priv->mdev->dev, priv->port, + VXLAN_STEER_BY_OUTER_MAC, 0); + if (ret) + en_err(priv, "failed setting L2 tunnel configuration ret %d\n", ret); + + priv->vxlan_port = 0; +} + +static void mlx4_en_add_vxlan_port(struct net_device *dev, + sa_family_t sa_family, __be16 port) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + __be16 current_port; + + if (priv->mdev->dev->caps.tunnel_offload_mode != MLX4_TUNNEL_OFFLOAD_MODE_VXLAN) + return; + + if (sa_family == AF_INET6) + return; + + current_port = priv->vxlan_port; + if (current_port && current_port != port) { + en_warn(priv, "vxlan port %d configured, can't add port %d\n", + ntohs(current_port), ntohs(port)); + return; + } + + priv->vxlan_port = port; + queue_work(priv->mdev->workqueue, &priv->vxlan_add_task); +} + +static void mlx4_en_del_vxlan_port(struct net_device *dev, + sa_family_t sa_family, __be16 port) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + __be16 current_port; + + if (priv->mdev->dev->caps.tunnel_offload_mode != MLX4_TUNNEL_OFFLOAD_MODE_VXLAN) + return; + + if (sa_family == AF_INET6) + return; + + current_port = priv->vxlan_port; + if (current_port != port) { + en_dbg(DRV, priv, "vxlan port %d isn't configured, ignoring\n", ntohs(port)); + return; + } + + queue_work(priv->mdev->workqueue, &priv->vxlan_del_task); +} + +static bool mlx4_en_gso_check(struct sk_buff *skb, struct net_device *dev) +{ + return vxlan_gso_check(skb); +} +#endif + +static const struct net_device_ops mlx4_netdev_ops = { + .ndo_open = mlx4_en_open, + .ndo_stop = mlx4_en_close, + .ndo_start_xmit = mlx4_en_xmit, + .ndo_select_queue = mlx4_en_select_queue, + .ndo_get_stats = mlx4_en_get_stats, + .ndo_set_rx_mode = mlx4_en_set_rx_mode, + .ndo_set_mac_address = mlx4_en_set_mac, + .ndo_validate_addr = eth_validate_addr, + .ndo_change_mtu = mlx4_en_change_mtu, + .ndo_do_ioctl = mlx4_en_ioctl, + .ndo_tx_timeout = mlx4_en_tx_timeout, + .ndo_vlan_rx_add_vid = mlx4_en_vlan_rx_add_vid, + .ndo_vlan_rx_kill_vid = mlx4_en_vlan_rx_kill_vid, +#ifdef CONFIG_NET_POLL_CONTROLLER + .ndo_poll_controller = mlx4_en_netpoll, +#endif + .ndo_set_features = mlx4_en_set_features, + .ndo_setup_tc = mlx4_en_setup_tc, +#ifdef CONFIG_RFS_ACCEL + .ndo_rx_flow_steer = mlx4_en_filter_rfs, +#endif +#ifdef CONFIG_NET_RX_BUSY_POLL + .ndo_busy_poll = mlx4_en_low_latency_recv, +#endif + .ndo_get_phys_port_id = mlx4_en_get_phys_port_id, +#ifdef CONFIG_MLX4_EN_VXLAN + .ndo_add_vxlan_port = mlx4_en_add_vxlan_port, + .ndo_del_vxlan_port = mlx4_en_del_vxlan_port, + .ndo_gso_check = mlx4_en_gso_check, +#endif +}; + +static const struct net_device_ops mlx4_netdev_ops_master = { + .ndo_open = mlx4_en_open, + .ndo_stop = mlx4_en_close, + .ndo_start_xmit = mlx4_en_xmit, + .ndo_select_queue = mlx4_en_select_queue, + .ndo_get_stats = mlx4_en_get_stats, + .ndo_set_rx_mode = mlx4_en_set_rx_mode, + .ndo_set_mac_address = mlx4_en_set_mac, + .ndo_validate_addr = eth_validate_addr, + .ndo_change_mtu = mlx4_en_change_mtu, + .ndo_tx_timeout = mlx4_en_tx_timeout, + .ndo_vlan_rx_add_vid = mlx4_en_vlan_rx_add_vid, + .ndo_vlan_rx_kill_vid = mlx4_en_vlan_rx_kill_vid, + .ndo_set_vf_mac = mlx4_en_set_vf_mac, + .ndo_set_vf_vlan = mlx4_en_set_vf_vlan, + .ndo_set_vf_spoofchk = mlx4_en_set_vf_spoofchk, + .ndo_set_vf_link_state = mlx4_en_set_vf_link_state, + .ndo_get_vf_config = mlx4_en_get_vf_config, +#ifdef CONFIG_NET_POLL_CONTROLLER + .ndo_poll_controller = mlx4_en_netpoll, +#endif + .ndo_set_features = mlx4_en_set_features, + .ndo_setup_tc = mlx4_en_setup_tc, +#ifdef CONFIG_RFS_ACCEL + .ndo_rx_flow_steer = mlx4_en_filter_rfs, +#endif + .ndo_get_phys_port_id = mlx4_en_get_phys_port_id, +#ifdef CONFIG_MLX4_EN_VXLAN + .ndo_add_vxlan_port = mlx4_en_add_vxlan_port, + .ndo_del_vxlan_port = mlx4_en_del_vxlan_port, + .ndo_gso_check = mlx4_en_gso_check, +#endif +}; + +int mlx4_en_init_netdev(struct mlx4_en_dev *mdev, int port, + struct mlx4_en_port_profile *prof) +{ + struct net_device *dev; + struct mlx4_en_priv *priv; + int i; + int err; + u64 mac_u64; + + dev = alloc_etherdev_mqs(sizeof(struct mlx4_en_priv), + MAX_TX_RINGS, MAX_RX_RINGS); + if (dev == NULL) + return -ENOMEM; + + netif_set_real_num_tx_queues(dev, prof->tx_ring_num); + netif_set_real_num_rx_queues(dev, prof->rx_ring_num); + + SET_NETDEV_DEV(dev, &mdev->dev->pdev->dev); + dev->dev_port = port - 1; + + /* + * Initialize driver private data + */ + + priv = netdev_priv(dev); + memset(priv, 0, sizeof(struct mlx4_en_priv)); + priv->dev = dev; + priv->mdev = mdev; + priv->ddev = &mdev->pdev->dev; + priv->prof = prof; + priv->port = port; + priv->port_up = false; + priv->flags = prof->flags; + priv->pflags = MLX4_EN_PRIV_FLAGS_BLUEFLAME; + priv->ctrl_flags = cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE | + MLX4_WQE_CTRL_SOLICITED); + priv->num_tx_rings_p_up = mdev->profile.num_tx_rings_p_up; + priv->tx_ring_num = prof->tx_ring_num; + priv->tx_work_limit = MLX4_EN_DEFAULT_TX_WORK; + + priv->tx_ring = kzalloc(sizeof(struct mlx4_en_tx_ring *) * MAX_TX_RINGS, + GFP_KERNEL); + if (!priv->tx_ring) { + err = -ENOMEM; + goto out; + } + priv->tx_cq = kzalloc(sizeof(struct mlx4_en_cq *) * MAX_TX_RINGS, + GFP_KERNEL); + if (!priv->tx_cq) { + err = -ENOMEM; + goto out; + } + priv->rx_ring_num = prof->rx_ring_num; + priv->cqe_factor = (mdev->dev->caps.cqe_size == 64) ? 1 : 0; + priv->cqe_size = mdev->dev->caps.cqe_size; + priv->mac_index = -1; + priv->msg_enable = MLX4_EN_MSG_LEVEL; + spin_lock_init(&priv->stats_lock); + INIT_WORK(&priv->rx_mode_task, mlx4_en_do_set_rx_mode); + INIT_WORK(&priv->watchdog_task, mlx4_en_restart); + INIT_WORK(&priv->linkstate_task, mlx4_en_linkstate); + INIT_DELAYED_WORK(&priv->stats_task, mlx4_en_do_get_stats); + INIT_DELAYED_WORK(&priv->service_task, mlx4_en_service_task); +#ifdef CONFIG_MLX4_EN_VXLAN + INIT_WORK(&priv->vxlan_add_task, mlx4_en_add_vxlan_offloads); + INIT_WORK(&priv->vxlan_del_task, mlx4_en_del_vxlan_offloads); +#endif +#ifdef CONFIG_MLX4_EN_DCB + if (!mlx4_is_slave(priv->mdev->dev)) { + if (mdev->dev->caps.flags & MLX4_DEV_CAP_FLAG_SET_ETH_SCHED) { + dev->dcbnl_ops = &mlx4_en_dcbnl_ops; + } else { + en_info(priv, "enabling only PFC DCB ops\n"); + dev->dcbnl_ops = &mlx4_en_dcbnl_pfc_ops; + } + } +#endif + + for (i = 0; i < MLX4_EN_MAC_HASH_SIZE; ++i) + INIT_HLIST_HEAD(&priv->mac_hash[i]); + + /* Query for default mac and max mtu */ + priv->max_mtu = mdev->dev->caps.eth_mtu_cap[priv->port]; + + /* Set default MAC */ + dev->addr_len = ETH_ALEN; + mlx4_en_u64_to_mac(dev->dev_addr, mdev->dev->caps.def_mac[priv->port]); + if (!is_valid_ether_addr(dev->dev_addr)) { + if (mlx4_is_slave(priv->mdev->dev)) { + eth_hw_addr_random(dev); + en_warn(priv, "Assigned random MAC address %pM\n", dev->dev_addr); + mac_u64 = mlx4_mac_to_u64(dev->dev_addr); + mdev->dev->caps.def_mac[priv->port] = mac_u64; + } else { + en_err(priv, "Port: %d, invalid mac burned: %pM, quiting\n", + priv->port, dev->dev_addr); + err = -EINVAL; + goto out; + } + } + + memcpy(priv->current_mac, dev->dev_addr, sizeof(priv->current_mac)); + + priv->stride = roundup_pow_of_two(sizeof(struct mlx4_en_rx_desc) + + DS_SIZE * MLX4_EN_MAX_RX_FRAGS); + err = mlx4_en_alloc_resources(priv); + if (err) + goto out; + +#ifdef CONFIG_RFS_ACCEL + INIT_LIST_HEAD(&priv->filters); + spin_lock_init(&priv->filters_lock); +#endif + + /* Initialize time stamping config */ + priv->hwtstamp_config.flags = 0; + priv->hwtstamp_config.tx_type = HWTSTAMP_TX_OFF; + priv->hwtstamp_config.rx_filter = HWTSTAMP_FILTER_NONE; + + /* Allocate page for receive rings */ + err = mlx4_alloc_hwq_res(mdev->dev, &priv->res, + MLX4_EN_PAGE_SIZE, MLX4_EN_PAGE_SIZE); + if (err) { + en_err(priv, "Failed to allocate page for rx qps\n"); + goto out; + } + priv->allocated = 1; + + /* + * Initialize netdev entry points + */ + if (mlx4_is_master(priv->mdev->dev)) + dev->netdev_ops = &mlx4_netdev_ops_master; + else + dev->netdev_ops = &mlx4_netdev_ops; + dev->watchdog_timeo = MLX4_EN_WATCHDOG_TIMEOUT; + netif_set_real_num_tx_queues(dev, priv->tx_ring_num); + netif_set_real_num_rx_queues(dev, priv->rx_ring_num); + + dev->ethtool_ops = &mlx4_en_ethtool_ops; + + /* + * Set driver features + */ + dev->hw_features = NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM; + if (mdev->LSO_support) + dev->hw_features |= NETIF_F_TSO | NETIF_F_TSO6; + + dev->vlan_features = dev->hw_features; + + dev->hw_features |= NETIF_F_RXCSUM | NETIF_F_RXHASH; + dev->features = dev->hw_features | NETIF_F_HIGHDMA | + NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX | + NETIF_F_HW_VLAN_CTAG_FILTER; + dev->hw_features |= NETIF_F_LOOPBACK; + + if (mdev->dev->caps.steering_mode == + MLX4_STEERING_MODE_DEVICE_MANAGED) + dev->hw_features |= NETIF_F_NTUPLE; + + if (mdev->dev->caps.steering_mode != MLX4_STEERING_MODE_A0) + dev->priv_flags |= IFF_UNICAST_FLT; + + mdev->pndev[port] = dev; + + netif_carrier_off(dev); + mlx4_en_set_default_moderation(priv); + + err = register_netdev(dev); + if (err) { + en_err(priv, "Netdev registration failed for port %d\n", port); + goto out; + } + priv->registered = 1; + + en_warn(priv, "Using %d TX rings\n", prof->tx_ring_num); + en_warn(priv, "Using %d RX rings\n", prof->rx_ring_num); + + mlx4_en_update_loopback_state(priv->dev, priv->dev->features); + + /* Configure port */ + mlx4_en_calc_rx_buf(dev); + err = mlx4_SET_PORT_general(mdev->dev, priv->port, + priv->rx_skb_size + ETH_FCS_LEN, + prof->tx_pause, prof->tx_ppp, + prof->rx_pause, prof->rx_ppp); + if (err) { + en_err(priv, "Failed setting port general configurations for port %d, with error %d\n", + priv->port, err); + goto out; + } + + if (mdev->dev->caps.tunnel_offload_mode == MLX4_TUNNEL_OFFLOAD_MODE_VXLAN) { + err = mlx4_SET_PORT_VXLAN(mdev->dev, priv->port, VXLAN_STEER_BY_OUTER_MAC, 1); + if (err) { + en_err(priv, "Failed setting port L2 tunnel configuration, err %d\n", + err); + goto out; + } + } + + /* Init port */ + en_warn(priv, "Initializing port\n"); + err = mlx4_INIT_PORT(mdev->dev, priv->port); + if (err) { + en_err(priv, "Failed Initializing port\n"); + goto out; + } + queue_delayed_work(mdev->workqueue, &priv->stats_task, STATS_DELAY); + + if (mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_TS) + queue_delayed_work(mdev->workqueue, &priv->service_task, + SERVICE_TASK_DELAY); + + return 0; + +out: + mlx4_en_destroy_netdev(dev); + return err; +} + diff --git a/drivers/net/ethernet/mellanox/mlx4/en_port.c b/drivers/net/ethernet/mellanox/mlx4/en_port.c new file mode 100644 index 0000000..0a0261d --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4/en_port.c @@ -0,0 +1,228 @@ +/* + * Copyright (c) 2007 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + + +#include + +#include +#include + +#include "en_port.h" +#include "mlx4_en.h" + + +int mlx4_SET_VLAN_FLTR(struct mlx4_dev *dev, struct mlx4_en_priv *priv) +{ + struct mlx4_cmd_mailbox *mailbox; + struct mlx4_set_vlan_fltr_mbox *filter; + int i; + int j; + int index = 0; + u32 entry; + int err = 0; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + + filter = mailbox->buf; + for (i = VLAN_FLTR_SIZE - 1; i >= 0; i--) { + entry = 0; + for (j = 0; j < 32; j++) + if (test_bit(index++, priv->active_vlans)) + entry |= 1 << j; + filter->entry[i] = cpu_to_be32(entry); + } + err = mlx4_cmd(dev, mailbox->dma, priv->port, 0, MLX4_CMD_SET_VLAN_FLTR, + MLX4_CMD_TIME_CLASS_B, MLX4_CMD_WRAPPED); + mlx4_free_cmd_mailbox(dev, mailbox); + return err; +} + +int mlx4_en_QUERY_PORT(struct mlx4_en_dev *mdev, u8 port) +{ + struct mlx4_en_query_port_context *qport_context; + struct mlx4_en_priv *priv = netdev_priv(mdev->pndev[port]); + struct mlx4_en_port_state *state = &priv->port_state; + struct mlx4_cmd_mailbox *mailbox; + int err; + + mailbox = mlx4_alloc_cmd_mailbox(mdev->dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + err = mlx4_cmd_box(mdev->dev, 0, mailbox->dma, port, 0, + MLX4_CMD_QUERY_PORT, MLX4_CMD_TIME_CLASS_B, + MLX4_CMD_WRAPPED); + if (err) + goto out; + qport_context = mailbox->buf; + + /* This command is always accessed from Ethtool context + * already synchronized, no need in locking */ + state->link_state = !!(qport_context->link_up & MLX4_EN_LINK_UP_MASK); + switch (qport_context->link_speed & MLX4_EN_SPEED_MASK) { + case MLX4_EN_1G_SPEED: + state->link_speed = 1000; + break; + case MLX4_EN_10G_SPEED_XAUI: + case MLX4_EN_10G_SPEED_XFI: + state->link_speed = 10000; + break; + case MLX4_EN_40G_SPEED: + state->link_speed = 40000; + break; + default: + state->link_speed = -1; + break; + } + state->transciver = qport_context->transceiver; + +out: + mlx4_free_cmd_mailbox(mdev->dev, mailbox); + return err; +} + +int mlx4_en_DUMP_ETH_STATS(struct mlx4_en_dev *mdev, u8 port, u8 reset) +{ + struct mlx4_en_stat_out_mbox *mlx4_en_stats; + struct mlx4_en_priv *priv = netdev_priv(mdev->pndev[port]); + struct net_device_stats *stats = &priv->stats; + struct mlx4_cmd_mailbox *mailbox; + u64 in_mod = reset << 8 | port; + int err; + int i; + + mailbox = mlx4_alloc_cmd_mailbox(mdev->dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + err = mlx4_cmd_box(mdev->dev, 0, mailbox->dma, in_mod, 0, + MLX4_CMD_DUMP_ETH_STATS, MLX4_CMD_TIME_CLASS_B, + MLX4_CMD_WRAPPED); + if (err) + goto out; + + mlx4_en_stats = mailbox->buf; + + spin_lock_bh(&priv->stats_lock); + + stats->rx_packets = 0; + stats->rx_bytes = 0; + priv->port_stats.rx_chksum_good = 0; + priv->port_stats.rx_chksum_none = 0; + for (i = 0; i < priv->rx_ring_num; i++) { + stats->rx_packets += priv->rx_ring[i]->packets; + stats->rx_bytes += priv->rx_ring[i]->bytes; + priv->port_stats.rx_chksum_good += priv->rx_ring[i]->csum_ok; + priv->port_stats.rx_chksum_none += priv->rx_ring[i]->csum_none; + } + stats->tx_packets = 0; + stats->tx_bytes = 0; + priv->port_stats.tx_chksum_offload = 0; + priv->port_stats.queue_stopped = 0; + priv->port_stats.wake_queue = 0; + priv->port_stats.tso_packets = 0; + priv->port_stats.xmit_more = 0; + + for (i = 0; i < priv->tx_ring_num; i++) { + const struct mlx4_en_tx_ring *ring = priv->tx_ring[i]; + + stats->tx_packets += ring->packets; + stats->tx_bytes += ring->bytes; + priv->port_stats.tx_chksum_offload += ring->tx_csum; + priv->port_stats.queue_stopped += ring->queue_stopped; + priv->port_stats.wake_queue += ring->wake_queue; + priv->port_stats.tso_packets += ring->tso_packets; + priv->port_stats.xmit_more += ring->xmit_more; + } + + stats->rx_errors = be64_to_cpu(mlx4_en_stats->PCS) + + be32_to_cpu(mlx4_en_stats->RdropLength) + + be32_to_cpu(mlx4_en_stats->RJBBR) + + be32_to_cpu(mlx4_en_stats->RCRC) + + be32_to_cpu(mlx4_en_stats->RRUNT); + stats->tx_errors = be32_to_cpu(mlx4_en_stats->TDROP); + stats->multicast = be64_to_cpu(mlx4_en_stats->MCAST_prio_0) + + be64_to_cpu(mlx4_en_stats->MCAST_prio_1) + + be64_to_cpu(mlx4_en_stats->MCAST_prio_2) + + be64_to_cpu(mlx4_en_stats->MCAST_prio_3) + + be64_to_cpu(mlx4_en_stats->MCAST_prio_4) + + be64_to_cpu(mlx4_en_stats->MCAST_prio_5) + + be64_to_cpu(mlx4_en_stats->MCAST_prio_6) + + be64_to_cpu(mlx4_en_stats->MCAST_prio_7) + + be64_to_cpu(mlx4_en_stats->MCAST_novlan); + stats->collisions = 0; + stats->rx_length_errors = be32_to_cpu(mlx4_en_stats->RdropLength); + stats->rx_over_errors = be32_to_cpu(mlx4_en_stats->RdropOvflw); + stats->rx_crc_errors = be32_to_cpu(mlx4_en_stats->RCRC); + stats->rx_frame_errors = 0; + stats->rx_fifo_errors = be32_to_cpu(mlx4_en_stats->RdropOvflw); + stats->rx_missed_errors = be32_to_cpu(mlx4_en_stats->RdropOvflw); + stats->tx_aborted_errors = 0; + stats->tx_carrier_errors = 0; + stats->tx_fifo_errors = 0; + stats->tx_heartbeat_errors = 0; + stats->tx_window_errors = 0; + + priv->pkstats.broadcast = + be64_to_cpu(mlx4_en_stats->RBCAST_prio_0) + + be64_to_cpu(mlx4_en_stats->RBCAST_prio_1) + + be64_to_cpu(mlx4_en_stats->RBCAST_prio_2) + + be64_to_cpu(mlx4_en_stats->RBCAST_prio_3) + + be64_to_cpu(mlx4_en_stats->RBCAST_prio_4) + + be64_to_cpu(mlx4_en_stats->RBCAST_prio_5) + + be64_to_cpu(mlx4_en_stats->RBCAST_prio_6) + + be64_to_cpu(mlx4_en_stats->RBCAST_prio_7) + + be64_to_cpu(mlx4_en_stats->RBCAST_novlan); + priv->pkstats.rx_prio[0] = be64_to_cpu(mlx4_en_stats->RTOT_prio_0); + priv->pkstats.rx_prio[1] = be64_to_cpu(mlx4_en_stats->RTOT_prio_1); + priv->pkstats.rx_prio[2] = be64_to_cpu(mlx4_en_stats->RTOT_prio_2); + priv->pkstats.rx_prio[3] = be64_to_cpu(mlx4_en_stats->RTOT_prio_3); + priv->pkstats.rx_prio[4] = be64_to_cpu(mlx4_en_stats->RTOT_prio_4); + priv->pkstats.rx_prio[5] = be64_to_cpu(mlx4_en_stats->RTOT_prio_5); + priv->pkstats.rx_prio[6] = be64_to_cpu(mlx4_en_stats->RTOT_prio_6); + priv->pkstats.rx_prio[7] = be64_to_cpu(mlx4_en_stats->RTOT_prio_7); + priv->pkstats.tx_prio[0] = be64_to_cpu(mlx4_en_stats->TTOT_prio_0); + priv->pkstats.tx_prio[1] = be64_to_cpu(mlx4_en_stats->TTOT_prio_1); + priv->pkstats.tx_prio[2] = be64_to_cpu(mlx4_en_stats->TTOT_prio_2); + priv->pkstats.tx_prio[3] = be64_to_cpu(mlx4_en_stats->TTOT_prio_3); + priv->pkstats.tx_prio[4] = be64_to_cpu(mlx4_en_stats->TTOT_prio_4); + priv->pkstats.tx_prio[5] = be64_to_cpu(mlx4_en_stats->TTOT_prio_5); + priv->pkstats.tx_prio[6] = be64_to_cpu(mlx4_en_stats->TTOT_prio_6); + priv->pkstats.tx_prio[7] = be64_to_cpu(mlx4_en_stats->TTOT_prio_7); + spin_unlock_bh(&priv->stats_lock); + +out: + mlx4_free_cmd_mailbox(mdev->dev, mailbox); + return err; +} + diff --git a/drivers/net/ethernet/mellanox/mlx4/en_port.h b/drivers/net/ethernet/mellanox/mlx4/en_port.h new file mode 100644 index 0000000..745090b --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4/en_port.h @@ -0,0 +1,560 @@ +/* + * Copyright (c) 2007 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef _MLX4_EN_PORT_H_ +#define _MLX4_EN_PORT_H_ + + +#define SET_PORT_GEN_ALL_VALID 0x7 +#define SET_PORT_PROMISC_SHIFT 31 +#define SET_PORT_MC_PROMISC_SHIFT 30 + +#define MLX4_EN_NUM_TC 8 + +#define VLAN_FLTR_SIZE 128 +struct mlx4_set_vlan_fltr_mbox { + __be32 entry[VLAN_FLTR_SIZE]; +}; + + +enum { + MLX4_MCAST_CONFIG = 0, + MLX4_MCAST_DISABLE = 1, + MLX4_MCAST_ENABLE = 2, +}; + +enum { + MLX4_EN_1G_SPEED = 0x02, + MLX4_EN_10G_SPEED_XFI = 0x01, + MLX4_EN_10G_SPEED_XAUI = 0x00, + MLX4_EN_40G_SPEED = 0x40, + MLX4_EN_OTHER_SPEED = 0x0f, +}; + +struct mlx4_en_query_port_context { + u8 link_up; +#define MLX4_EN_LINK_UP_MASK 0x80 + u8 reserved; + __be16 mtu; + u8 reserved2; + u8 link_speed; +#define MLX4_EN_SPEED_MASK 0x43 + u16 reserved3[5]; + __be64 mac; + u8 transceiver; +}; + + +struct mlx4_en_stat_out_mbox { + /* Received frames with a length of 64 octets */ + __be64 R64_prio_0; + __be64 R64_prio_1; + __be64 R64_prio_2; + __be64 R64_prio_3; + __be64 R64_prio_4; + __be64 R64_prio_5; + __be64 R64_prio_6; + __be64 R64_prio_7; + __be64 R64_novlan; + /* Received frames with a length of 127 octets */ + __be64 R127_prio_0; + __be64 R127_prio_1; + __be64 R127_prio_2; + __be64 R127_prio_3; + __be64 R127_prio_4; + __be64 R127_prio_5; + __be64 R127_prio_6; + __be64 R127_prio_7; + __be64 R127_novlan; + /* Received frames with a length of 255 octets */ + __be64 R255_prio_0; + __be64 R255_prio_1; + __be64 R255_prio_2; + __be64 R255_prio_3; + __be64 R255_prio_4; + __be64 R255_prio_5; + __be64 R255_prio_6; + __be64 R255_prio_7; + __be64 R255_novlan; + /* Received frames with a length of 511 octets */ + __be64 R511_prio_0; + __be64 R511_prio_1; + __be64 R511_prio_2; + __be64 R511_prio_3; + __be64 R511_prio_4; + __be64 R511_prio_5; + __be64 R511_prio_6; + __be64 R511_prio_7; + __be64 R511_novlan; + /* Received frames with a length of 1023 octets */ + __be64 R1023_prio_0; + __be64 R1023_prio_1; + __be64 R1023_prio_2; + __be64 R1023_prio_3; + __be64 R1023_prio_4; + __be64 R1023_prio_5; + __be64 R1023_prio_6; + __be64 R1023_prio_7; + __be64 R1023_novlan; + /* Received frames with a length of 1518 octets */ + __be64 R1518_prio_0; + __be64 R1518_prio_1; + __be64 R1518_prio_2; + __be64 R1518_prio_3; + __be64 R1518_prio_4; + __be64 R1518_prio_5; + __be64 R1518_prio_6; + __be64 R1518_prio_7; + __be64 R1518_novlan; + /* Received frames with a length of 1522 octets */ + __be64 R1522_prio_0; + __be64 R1522_prio_1; + __be64 R1522_prio_2; + __be64 R1522_prio_3; + __be64 R1522_prio_4; + __be64 R1522_prio_5; + __be64 R1522_prio_6; + __be64 R1522_prio_7; + __be64 R1522_novlan; + /* Received frames with a length of 1548 octets */ + __be64 R1548_prio_0; + __be64 R1548_prio_1; + __be64 R1548_prio_2; + __be64 R1548_prio_3; + __be64 R1548_prio_4; + __be64 R1548_prio_5; + __be64 R1548_prio_6; + __be64 R1548_prio_7; + __be64 R1548_novlan; + /* Received frames with a length of 1548 < octets < MTU */ + __be64 R2MTU_prio_0; + __be64 R2MTU_prio_1; + __be64 R2MTU_prio_2; + __be64 R2MTU_prio_3; + __be64 R2MTU_prio_4; + __be64 R2MTU_prio_5; + __be64 R2MTU_prio_6; + __be64 R2MTU_prio_7; + __be64 R2MTU_novlan; + /* Received frames with a length of MTU< octets and good CRC */ + __be64 RGIANT_prio_0; + __be64 RGIANT_prio_1; + __be64 RGIANT_prio_2; + __be64 RGIANT_prio_3; + __be64 RGIANT_prio_4; + __be64 RGIANT_prio_5; + __be64 RGIANT_prio_6; + __be64 RGIANT_prio_7; + __be64 RGIANT_novlan; + /* Received broadcast frames with good CRC */ + __be64 RBCAST_prio_0; + __be64 RBCAST_prio_1; + __be64 RBCAST_prio_2; + __be64 RBCAST_prio_3; + __be64 RBCAST_prio_4; + __be64 RBCAST_prio_5; + __be64 RBCAST_prio_6; + __be64 RBCAST_prio_7; + __be64 RBCAST_novlan; + /* Received multicast frames with good CRC */ + __be64 MCAST_prio_0; + __be64 MCAST_prio_1; + __be64 MCAST_prio_2; + __be64 MCAST_prio_3; + __be64 MCAST_prio_4; + __be64 MCAST_prio_5; + __be64 MCAST_prio_6; + __be64 MCAST_prio_7; + __be64 MCAST_novlan; + /* Received unicast not short or GIANT frames with good CRC */ + __be64 RTOTG_prio_0; + __be64 RTOTG_prio_1; + __be64 RTOTG_prio_2; + __be64 RTOTG_prio_3; + __be64 RTOTG_prio_4; + __be64 RTOTG_prio_5; + __be64 RTOTG_prio_6; + __be64 RTOTG_prio_7; + __be64 RTOTG_novlan; + + /* Count of total octets of received frames, includes framing characters */ + __be64 RTTLOCT_prio_0; + /* Count of total octets of received frames, not including framing + characters */ + __be64 RTTLOCT_NOFRM_prio_0; + /* Count of Total number of octets received + (only for frames without errors) */ + __be64 ROCT_prio_0; + + __be64 RTTLOCT_prio_1; + __be64 RTTLOCT_NOFRM_prio_1; + __be64 ROCT_prio_1; + + __be64 RTTLOCT_prio_2; + __be64 RTTLOCT_NOFRM_prio_2; + __be64 ROCT_prio_2; + + __be64 RTTLOCT_prio_3; + __be64 RTTLOCT_NOFRM_prio_3; + __be64 ROCT_prio_3; + + __be64 RTTLOCT_prio_4; + __be64 RTTLOCT_NOFRM_prio_4; + __be64 ROCT_prio_4; + + __be64 RTTLOCT_prio_5; + __be64 RTTLOCT_NOFRM_prio_5; + __be64 ROCT_prio_5; + + __be64 RTTLOCT_prio_6; + __be64 RTTLOCT_NOFRM_prio_6; + __be64 ROCT_prio_6; + + __be64 RTTLOCT_prio_7; + __be64 RTTLOCT_NOFRM_prio_7; + __be64 ROCT_prio_7; + + __be64 RTTLOCT_novlan; + __be64 RTTLOCT_NOFRM_novlan; + __be64 ROCT_novlan; + + /* Count of Total received frames including bad frames */ + __be64 RTOT_prio_0; + /* Count of Total number of received frames with 802.1Q encapsulation */ + __be64 R1Q_prio_0; + __be64 reserved1; + + __be64 RTOT_prio_1; + __be64 R1Q_prio_1; + __be64 reserved2; + + __be64 RTOT_prio_2; + __be64 R1Q_prio_2; + __be64 reserved3; + + __be64 RTOT_prio_3; + __be64 R1Q_prio_3; + __be64 reserved4; + + __be64 RTOT_prio_4; + __be64 R1Q_prio_4; + __be64 reserved5; + + __be64 RTOT_prio_5; + __be64 R1Q_prio_5; + __be64 reserved6; + + __be64 RTOT_prio_6; + __be64 R1Q_prio_6; + __be64 reserved7; + + __be64 RTOT_prio_7; + __be64 R1Q_prio_7; + __be64 reserved8; + + __be64 RTOT_novlan; + __be64 R1Q_novlan; + __be64 reserved9; + + /* Total number of Successfully Received Control Frames */ + __be64 RCNTL; + __be64 reserved10; + __be64 reserved11; + __be64 reserved12; + /* Count of received frames with a length/type field value between 46 + (42 for VLANtagged frames) and 1500 (also 1500 for VLAN-tagged frames), + inclusive */ + __be64 RInRangeLengthErr; + /* Count of received frames with length/type field between 1501 and 1535 + decimal, inclusive */ + __be64 ROutRangeLengthErr; + /* Count of received frames that are longer than max allowed size for + 802.3 frames (1518/1522) */ + __be64 RFrmTooLong; + /* Count frames received with PCS error */ + __be64 PCS; + + /* Transmit frames with a length of 64 octets */ + __be64 T64_prio_0; + __be64 T64_prio_1; + __be64 T64_prio_2; + __be64 T64_prio_3; + __be64 T64_prio_4; + __be64 T64_prio_5; + __be64 T64_prio_6; + __be64 T64_prio_7; + __be64 T64_novlan; + __be64 T64_loopbk; + /* Transmit frames with a length of 65 to 127 octets. */ + __be64 T127_prio_0; + __be64 T127_prio_1; + __be64 T127_prio_2; + __be64 T127_prio_3; + __be64 T127_prio_4; + __be64 T127_prio_5; + __be64 T127_prio_6; + __be64 T127_prio_7; + __be64 T127_novlan; + __be64 T127_loopbk; + /* Transmit frames with a length of 128 to 255 octets */ + __be64 T255_prio_0; + __be64 T255_prio_1; + __be64 T255_prio_2; + __be64 T255_prio_3; + __be64 T255_prio_4; + __be64 T255_prio_5; + __be64 T255_prio_6; + __be64 T255_prio_7; + __be64 T255_novlan; + __be64 T255_loopbk; + /* Transmit frames with a length of 256 to 511 octets */ + __be64 T511_prio_0; + __be64 T511_prio_1; + __be64 T511_prio_2; + __be64 T511_prio_3; + __be64 T511_prio_4; + __be64 T511_prio_5; + __be64 T511_prio_6; + __be64 T511_prio_7; + __be64 T511_novlan; + __be64 T511_loopbk; + /* Transmit frames with a length of 512 to 1023 octets */ + __be64 T1023_prio_0; + __be64 T1023_prio_1; + __be64 T1023_prio_2; + __be64 T1023_prio_3; + __be64 T1023_prio_4; + __be64 T1023_prio_5; + __be64 T1023_prio_6; + __be64 T1023_prio_7; + __be64 T1023_novlan; + __be64 T1023_loopbk; + /* Transmit frames with a length of 1024 to 1518 octets */ + __be64 T1518_prio_0; + __be64 T1518_prio_1; + __be64 T1518_prio_2; + __be64 T1518_prio_3; + __be64 T1518_prio_4; + __be64 T1518_prio_5; + __be64 T1518_prio_6; + __be64 T1518_prio_7; + __be64 T1518_novlan; + __be64 T1518_loopbk; + /* Counts transmit frames with a length of 1519 to 1522 bytes */ + __be64 T1522_prio_0; + __be64 T1522_prio_1; + __be64 T1522_prio_2; + __be64 T1522_prio_3; + __be64 T1522_prio_4; + __be64 T1522_prio_5; + __be64 T1522_prio_6; + __be64 T1522_prio_7; + __be64 T1522_novlan; + __be64 T1522_loopbk; + /* Transmit frames with a length of 1523 to 1548 octets */ + __be64 T1548_prio_0; + __be64 T1548_prio_1; + __be64 T1548_prio_2; + __be64 T1548_prio_3; + __be64 T1548_prio_4; + __be64 T1548_prio_5; + __be64 T1548_prio_6; + __be64 T1548_prio_7; + __be64 T1548_novlan; + __be64 T1548_loopbk; + /* Counts transmit frames with a length of 1549 to MTU bytes */ + __be64 T2MTU_prio_0; + __be64 T2MTU_prio_1; + __be64 T2MTU_prio_2; + __be64 T2MTU_prio_3; + __be64 T2MTU_prio_4; + __be64 T2MTU_prio_5; + __be64 T2MTU_prio_6; + __be64 T2MTU_prio_7; + __be64 T2MTU_novlan; + __be64 T2MTU_loopbk; + /* Transmit frames with a length greater than MTU octets and a good CRC. */ + __be64 TGIANT_prio_0; + __be64 TGIANT_prio_1; + __be64 TGIANT_prio_2; + __be64 TGIANT_prio_3; + __be64 TGIANT_prio_4; + __be64 TGIANT_prio_5; + __be64 TGIANT_prio_6; + __be64 TGIANT_prio_7; + __be64 TGIANT_novlan; + __be64 TGIANT_loopbk; + /* Transmit broadcast frames with a good CRC */ + __be64 TBCAST_prio_0; + __be64 TBCAST_prio_1; + __be64 TBCAST_prio_2; + __be64 TBCAST_prio_3; + __be64 TBCAST_prio_4; + __be64 TBCAST_prio_5; + __be64 TBCAST_prio_6; + __be64 TBCAST_prio_7; + __be64 TBCAST_novlan; + __be64 TBCAST_loopbk; + /* Transmit multicast frames with a good CRC */ + __be64 TMCAST_prio_0; + __be64 TMCAST_prio_1; + __be64 TMCAST_prio_2; + __be64 TMCAST_prio_3; + __be64 TMCAST_prio_4; + __be64 TMCAST_prio_5; + __be64 TMCAST_prio_6; + __be64 TMCAST_prio_7; + __be64 TMCAST_novlan; + __be64 TMCAST_loopbk; + /* Transmit good frames that are neither broadcast nor multicast */ + __be64 TTOTG_prio_0; + __be64 TTOTG_prio_1; + __be64 TTOTG_prio_2; + __be64 TTOTG_prio_3; + __be64 TTOTG_prio_4; + __be64 TTOTG_prio_5; + __be64 TTOTG_prio_6; + __be64 TTOTG_prio_7; + __be64 TTOTG_novlan; + __be64 TTOTG_loopbk; + + /* total octets of transmitted frames, including framing characters */ + __be64 TTTLOCT_prio_0; + /* total octets of transmitted frames, not including framing characters */ + __be64 TTTLOCT_NOFRM_prio_0; + /* ifOutOctets */ + __be64 TOCT_prio_0; + + __be64 TTTLOCT_prio_1; + __be64 TTTLOCT_NOFRM_prio_1; + __be64 TOCT_prio_1; + + __be64 TTTLOCT_prio_2; + __be64 TTTLOCT_NOFRM_prio_2; + __be64 TOCT_prio_2; + + __be64 TTTLOCT_prio_3; + __be64 TTTLOCT_NOFRM_prio_3; + __be64 TOCT_prio_3; + + __be64 TTTLOCT_prio_4; + __be64 TTTLOCT_NOFRM_prio_4; + __be64 TOCT_prio_4; + + __be64 TTTLOCT_prio_5; + __be64 TTTLOCT_NOFRM_prio_5; + __be64 TOCT_prio_5; + + __be64 TTTLOCT_prio_6; + __be64 TTTLOCT_NOFRM_prio_6; + __be64 TOCT_prio_6; + + __be64 TTTLOCT_prio_7; + __be64 TTTLOCT_NOFRM_prio_7; + __be64 TOCT_prio_7; + + __be64 TTTLOCT_novlan; + __be64 TTTLOCT_NOFRM_novlan; + __be64 TOCT_novlan; + + __be64 TTTLOCT_loopbk; + __be64 TTTLOCT_NOFRM_loopbk; + __be64 TOCT_loopbk; + + /* Total frames transmitted with a good CRC that are not aborted */ + __be64 TTOT_prio_0; + /* Total number of frames transmitted with 802.1Q encapsulation */ + __be64 T1Q_prio_0; + __be64 reserved13; + + __be64 TTOT_prio_1; + __be64 T1Q_prio_1; + __be64 reserved14; + + __be64 TTOT_prio_2; + __be64 T1Q_prio_2; + __be64 reserved15; + + __be64 TTOT_prio_3; + __be64 T1Q_prio_3; + __be64 reserved16; + + __be64 TTOT_prio_4; + __be64 T1Q_prio_4; + __be64 reserved17; + + __be64 TTOT_prio_5; + __be64 T1Q_prio_5; + __be64 reserved18; + + __be64 TTOT_prio_6; + __be64 T1Q_prio_6; + __be64 reserved19; + + __be64 TTOT_prio_7; + __be64 T1Q_prio_7; + __be64 reserved20; + + __be64 TTOT_novlan; + __be64 T1Q_novlan; + __be64 reserved21; + + __be64 TTOT_loopbk; + __be64 T1Q_loopbk; + __be64 reserved22; + + /* Received frames with a length greater than MTU octets and a bad CRC */ + __be32 RJBBR; + /* Received frames with a bad CRC that are not runts, jabbers, + or alignment errors */ + __be32 RCRC; + /* Received frames with SFD with a length of less than 64 octets and a + bad CRC */ + __be32 RRUNT; + /* Received frames with a length less than 64 octets and a good CRC */ + __be32 RSHORT; + /* Total Number of Received Packets Dropped */ + __be32 RDROP; + /* Drop due to overflow */ + __be32 RdropOvflw; + /* Drop due to overflow */ + __be32 RdropLength; + /* Total of good frames. Does not include frames received with + frame-too-long, FCS, or length errors */ + __be32 RTOTFRMS; + /* Total dropped Xmited packets */ + __be32 TDROP; +}; + + +#endif diff --git a/drivers/net/ethernet/mellanox/mlx4/en_resources.c b/drivers/net/ethernet/mellanox/mlx4/en_resources.c new file mode 100644 index 0000000..f1a5500 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4/en_resources.c @@ -0,0 +1,115 @@ +/* + * Copyright (c) 2007 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include +#include +#include + +#include "mlx4_en.h" + +void mlx4_en_fill_qp_context(struct mlx4_en_priv *priv, int size, int stride, + int is_tx, int rss, int qpn, int cqn, + int user_prio, struct mlx4_qp_context *context) +{ + struct mlx4_en_dev *mdev = priv->mdev; + struct net_device *dev = priv->dev; + + memset(context, 0, sizeof *context); + context->flags = cpu_to_be32(7 << 16 | rss << MLX4_RSS_QPC_FLAG_OFFSET); + context->pd = cpu_to_be32(mdev->priv_pdn); + context->mtu_msgmax = 0xff; + if (!is_tx && !rss) + context->rq_size_stride = ilog2(size) << 3 | (ilog2(stride) - 4); + if (is_tx) + context->sq_size_stride = ilog2(size) << 3 | (ilog2(stride) - 4); + else + context->sq_size_stride = ilog2(TXBB_SIZE) - 4; + context->usr_page = cpu_to_be32(mdev->priv_uar.index); + context->local_qpn = cpu_to_be32(qpn); + context->pri_path.ackto = 1 & 0x07; + context->pri_path.sched_queue = 0x83 | (priv->port - 1) << 6; + if (user_prio >= 0) { + context->pri_path.sched_queue |= user_prio << 3; + context->pri_path.feup = MLX4_FEUP_FORCE_ETH_UP; + } + context->pri_path.counter_index = 0xff; + context->cqn_send = cpu_to_be32(cqn); + context->cqn_recv = cpu_to_be32(cqn); + context->db_rec_addr = cpu_to_be64(priv->res.db.dma << 2); + if (!(dev->features & NETIF_F_HW_VLAN_CTAG_RX)) + context->param3 |= cpu_to_be32(1 << 30); + + if (!is_tx && !rss && + (mdev->dev->caps.tunnel_offload_mode == MLX4_TUNNEL_OFFLOAD_MODE_VXLAN)) { + en_dbg(HW, priv, "Setting RX qp %x tunnel mode to RX tunneled & non-tunneled\n", qpn); + context->srqn = cpu_to_be32(7 << 28); /* this fills bits 30:28 */ + } +} + + +int mlx4_en_map_buffer(struct mlx4_buf *buf) +{ + struct page **pages; + int i; + + if (BITS_PER_LONG == 64 || buf->nbufs == 1) + return 0; + + pages = kmalloc(sizeof *pages * buf->nbufs, GFP_KERNEL); + if (!pages) + return -ENOMEM; + + for (i = 0; i < buf->nbufs; ++i) + pages[i] = virt_to_page(buf->page_list[i].buf); + + buf->direct.buf = vmap(pages, buf->nbufs, VM_MAP, PAGE_KERNEL); + kfree(pages); + if (!buf->direct.buf) + return -ENOMEM; + + return 0; +} + +void mlx4_en_unmap_buffer(struct mlx4_buf *buf) +{ + if (BITS_PER_LONG == 64 || buf->nbufs == 1) + return; + + vunmap(buf->direct.buf); +} + +void mlx4_en_sqp_event(struct mlx4_qp *qp, enum mlx4_event event) +{ + return; +} + diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c new file mode 100644 index 0000000..01660c5 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c @@ -0,0 +1,1169 @@ +/* + * Copyright (c) 2007 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "mlx4_en.h" + +static int mlx4_alloc_pages(struct mlx4_en_priv *priv, + struct mlx4_en_rx_alloc *page_alloc, + const struct mlx4_en_frag_info *frag_info, + gfp_t _gfp) +{ + int order; + struct page *page; + dma_addr_t dma; + + for (order = MLX4_EN_ALLOC_PREFER_ORDER; ;) { + gfp_t gfp = _gfp; + + if (order) + gfp |= __GFP_COMP | __GFP_NOWARN; + page = alloc_pages(gfp, order); + if (likely(page)) + break; + if (--order < 0 || + ((PAGE_SIZE << order) < frag_info->frag_size)) + return -ENOMEM; + } + dma = dma_map_page(priv->ddev, page, 0, PAGE_SIZE << order, + PCI_DMA_FROMDEVICE); + if (dma_mapping_error(priv->ddev, dma)) { + put_page(page); + return -ENOMEM; + } + page_alloc->page_size = PAGE_SIZE << order; + page_alloc->page = page; + page_alloc->dma = dma; + page_alloc->page_offset = frag_info->frag_align; + /* Not doing get_page() for each frag is a big win + * on asymetric workloads. Note we can not use atomic_set(). + */ + atomic_add(page_alloc->page_size / frag_info->frag_stride - 1, + &page->_count); + return 0; +} + +static int mlx4_en_alloc_frags(struct mlx4_en_priv *priv, + struct mlx4_en_rx_desc *rx_desc, + struct mlx4_en_rx_alloc *frags, + struct mlx4_en_rx_alloc *ring_alloc, + gfp_t gfp) +{ + struct mlx4_en_rx_alloc page_alloc[MLX4_EN_MAX_RX_FRAGS]; + const struct mlx4_en_frag_info *frag_info; + struct page *page; + dma_addr_t dma; + int i; + + for (i = 0; i < priv->num_frags; i++) { + frag_info = &priv->frag_info[i]; + page_alloc[i] = ring_alloc[i]; + page_alloc[i].page_offset += frag_info->frag_stride; + + if (page_alloc[i].page_offset + frag_info->frag_stride <= + ring_alloc[i].page_size) + continue; + + if (mlx4_alloc_pages(priv, &page_alloc[i], frag_info, gfp)) + goto out; + } + + for (i = 0; i < priv->num_frags; i++) { + frags[i] = ring_alloc[i]; + dma = ring_alloc[i].dma + ring_alloc[i].page_offset; + ring_alloc[i] = page_alloc[i]; + rx_desc->data[i].addr = cpu_to_be64(dma); + } + + return 0; + +out: + while (i--) { + frag_info = &priv->frag_info[i]; + if (page_alloc[i].page != ring_alloc[i].page) { + dma_unmap_page(priv->ddev, page_alloc[i].dma, + page_alloc[i].page_size, PCI_DMA_FROMDEVICE); + page = page_alloc[i].page; + atomic_set(&page->_count, 1); + put_page(page); + } + } + return -ENOMEM; +} + +static void mlx4_en_free_frag(struct mlx4_en_priv *priv, + struct mlx4_en_rx_alloc *frags, + int i) +{ + const struct mlx4_en_frag_info *frag_info = &priv->frag_info[i]; + u32 next_frag_end = frags[i].page_offset + 2 * frag_info->frag_stride; + + + if (next_frag_end > frags[i].page_size) + dma_unmap_page(priv->ddev, frags[i].dma, frags[i].page_size, + PCI_DMA_FROMDEVICE); + + if (frags[i].page) + put_page(frags[i].page); +} + +static int mlx4_en_init_allocator(struct mlx4_en_priv *priv, + struct mlx4_en_rx_ring *ring) +{ + int i; + struct mlx4_en_rx_alloc *page_alloc; + + for (i = 0; i < priv->num_frags; i++) { + const struct mlx4_en_frag_info *frag_info = &priv->frag_info[i]; + + if (mlx4_alloc_pages(priv, &ring->page_alloc[i], + frag_info, GFP_KERNEL)) + goto out; + } + return 0; + +out: + while (i--) { + struct page *page; + + page_alloc = &ring->page_alloc[i]; + dma_unmap_page(priv->ddev, page_alloc->dma, + page_alloc->page_size, PCI_DMA_FROMDEVICE); + page = page_alloc->page; + atomic_set(&page->_count, 1); + put_page(page); + page_alloc->page = NULL; + } + return -ENOMEM; +} + +static void mlx4_en_destroy_allocator(struct mlx4_en_priv *priv, + struct mlx4_en_rx_ring *ring) +{ + struct mlx4_en_rx_alloc *page_alloc; + int i; + + for (i = 0; i < priv->num_frags; i++) { + const struct mlx4_en_frag_info *frag_info = &priv->frag_info[i]; + + page_alloc = &ring->page_alloc[i]; + en_dbg(DRV, priv, "Freeing allocator:%d count:%d\n", + i, page_count(page_alloc->page)); + + dma_unmap_page(priv->ddev, page_alloc->dma, + page_alloc->page_size, PCI_DMA_FROMDEVICE); + while (page_alloc->page_offset + frag_info->frag_stride < + page_alloc->page_size) { + put_page(page_alloc->page); + page_alloc->page_offset += frag_info->frag_stride; + } + page_alloc->page = NULL; + } +} + +static void mlx4_en_init_rx_desc(struct mlx4_en_priv *priv, + struct mlx4_en_rx_ring *ring, int index) +{ + struct mlx4_en_rx_desc *rx_desc = ring->buf + ring->stride * index; + int possible_frags; + int i; + + /* Set size and memtype fields */ + for (i = 0; i < priv->num_frags; i++) { + rx_desc->data[i].byte_count = + cpu_to_be32(priv->frag_info[i].frag_size); + rx_desc->data[i].lkey = cpu_to_be32(priv->mdev->mr.key); + } + + /* If the number of used fragments does not fill up the ring stride, + * remaining (unused) fragments must be padded with null address/size + * and a special memory key */ + possible_frags = (ring->stride - sizeof(struct mlx4_en_rx_desc)) / DS_SIZE; + for (i = priv->num_frags; i < possible_frags; i++) { + rx_desc->data[i].byte_count = 0; + rx_desc->data[i].lkey = cpu_to_be32(MLX4_EN_MEMTYPE_PAD); + rx_desc->data[i].addr = 0; + } +} + +static int mlx4_en_prepare_rx_desc(struct mlx4_en_priv *priv, + struct mlx4_en_rx_ring *ring, int index, + gfp_t gfp) +{ + struct mlx4_en_rx_desc *rx_desc = ring->buf + (index * ring->stride); + struct mlx4_en_rx_alloc *frags = ring->rx_info + + (index << priv->log_rx_info); + + return mlx4_en_alloc_frags(priv, rx_desc, frags, ring->page_alloc, gfp); +} + +static inline void mlx4_en_update_rx_prod_db(struct mlx4_en_rx_ring *ring) +{ + *ring->wqres.db.db = cpu_to_be32(ring->prod & 0xffff); +} + +static void mlx4_en_free_rx_desc(struct mlx4_en_priv *priv, + struct mlx4_en_rx_ring *ring, + int index) +{ + struct mlx4_en_rx_alloc *frags; + int nr; + + frags = ring->rx_info + (index << priv->log_rx_info); + for (nr = 0; nr < priv->num_frags; nr++) { + en_dbg(DRV, priv, "Freeing fragment:%d\n", nr); + mlx4_en_free_frag(priv, frags, nr); + } +} + +static int mlx4_en_fill_rx_buffers(struct mlx4_en_priv *priv) +{ + struct mlx4_en_rx_ring *ring; + int ring_ind; + int buf_ind; + int new_size; + + for (buf_ind = 0; buf_ind < priv->prof->rx_ring_size; buf_ind++) { + for (ring_ind = 0; ring_ind < priv->rx_ring_num; ring_ind++) { + ring = priv->rx_ring[ring_ind]; + + if (mlx4_en_prepare_rx_desc(priv, ring, + ring->actual_size, + GFP_KERNEL)) { + if (ring->actual_size < MLX4_EN_MIN_RX_SIZE) { + en_err(priv, "Failed to allocate enough rx buffers\n"); + return -ENOMEM; + } else { + new_size = rounddown_pow_of_two(ring->actual_size); + en_warn(priv, "Only %d buffers allocated reducing ring size to %d\n", + ring->actual_size, new_size); + goto reduce_rings; + } + } + ring->actual_size++; + ring->prod++; + } + } + return 0; + +reduce_rings: + for (ring_ind = 0; ring_ind < priv->rx_ring_num; ring_ind++) { + ring = priv->rx_ring[ring_ind]; + while (ring->actual_size > new_size) { + ring->actual_size--; + ring->prod--; + mlx4_en_free_rx_desc(priv, ring, ring->actual_size); + } + } + + return 0; +} + +static void mlx4_en_free_rx_buf(struct mlx4_en_priv *priv, + struct mlx4_en_rx_ring *ring) +{ + int index; + + en_dbg(DRV, priv, "Freeing Rx buf - cons:%d prod:%d\n", + ring->cons, ring->prod); + + /* Unmap and free Rx buffers */ + BUG_ON((u32) (ring->prod - ring->cons) > ring->actual_size); + while (ring->cons != ring->prod) { + index = ring->cons & ring->size_mask; + en_dbg(DRV, priv, "Processing descriptor:%d\n", index); + mlx4_en_free_rx_desc(priv, ring, index); + ++ring->cons; + } +} + +void mlx4_en_set_num_rx_rings(struct mlx4_en_dev *mdev) +{ + int i; + int num_of_eqs; + int num_rx_rings; + struct mlx4_dev *dev = mdev->dev; + + mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_ETH) { + if (!dev->caps.comp_pool) + num_of_eqs = max_t(int, MIN_RX_RINGS, + min_t(int, + dev->caps.num_comp_vectors, + DEF_RX_RINGS)); + else + num_of_eqs = min_t(int, MAX_MSIX_P_PORT, + dev->caps.comp_pool/ + dev->caps.num_ports) - 1; + + num_rx_rings = mlx4_low_memory_profile() ? MIN_RX_RINGS : + min_t(int, num_of_eqs, + netif_get_num_default_rss_queues()); + mdev->profile.prof[i].rx_ring_num = + rounddown_pow_of_two(num_rx_rings); + } +} + +int mlx4_en_create_rx_ring(struct mlx4_en_priv *priv, + struct mlx4_en_rx_ring **pring, + u32 size, u16 stride, int node) +{ + struct mlx4_en_dev *mdev = priv->mdev; + struct mlx4_en_rx_ring *ring; + int err = -ENOMEM; + int tmp; + + ring = kzalloc_node(sizeof(*ring), GFP_KERNEL, node); + if (!ring) { + ring = kzalloc(sizeof(*ring), GFP_KERNEL); + if (!ring) { + en_err(priv, "Failed to allocate RX ring structure\n"); + return -ENOMEM; + } + } + + ring->prod = 0; + ring->cons = 0; + ring->size = size; + ring->size_mask = size - 1; + ring->stride = stride; + ring->log_stride = ffs(ring->stride) - 1; + ring->buf_size = ring->size * ring->stride + TXBB_SIZE; + + tmp = size * roundup_pow_of_two(MLX4_EN_MAX_RX_FRAGS * + sizeof(struct mlx4_en_rx_alloc)); + ring->rx_info = vmalloc_node(tmp, node); + if (!ring->rx_info) { + ring->rx_info = vmalloc(tmp); + if (!ring->rx_info) { + err = -ENOMEM; + goto err_ring; + } + } + + en_dbg(DRV, priv, "Allocated rx_info ring at addr:%p size:%d\n", + ring->rx_info, tmp); + + /* Allocate HW buffers on provided NUMA node */ + set_dev_node(&mdev->dev->pdev->dev, node); + err = mlx4_alloc_hwq_res(mdev->dev, &ring->wqres, + ring->buf_size, 2 * PAGE_SIZE); + set_dev_node(&mdev->dev->pdev->dev, mdev->dev->numa_node); + if (err) + goto err_info; + + err = mlx4_en_map_buffer(&ring->wqres.buf); + if (err) { + en_err(priv, "Failed to map RX buffer\n"); + goto err_hwq; + } + ring->buf = ring->wqres.buf.direct.buf; + + ring->hwtstamp_rx_filter = priv->hwtstamp_config.rx_filter; + + *pring = ring; + return 0; + +err_hwq: + mlx4_free_hwq_res(mdev->dev, &ring->wqres, ring->buf_size); +err_info: + vfree(ring->rx_info); + ring->rx_info = NULL; +err_ring: + kfree(ring); + *pring = NULL; + + return err; +} + +int mlx4_en_activate_rx_rings(struct mlx4_en_priv *priv) +{ + struct mlx4_en_rx_ring *ring; + int i; + int ring_ind; + int err; + int stride = roundup_pow_of_two(sizeof(struct mlx4_en_rx_desc) + + DS_SIZE * priv->num_frags); + + for (ring_ind = 0; ring_ind < priv->rx_ring_num; ring_ind++) { + ring = priv->rx_ring[ring_ind]; + + ring->prod = 0; + ring->cons = 0; + ring->actual_size = 0; + ring->cqn = priv->rx_cq[ring_ind]->mcq.cqn; + + ring->stride = stride; + if (ring->stride <= TXBB_SIZE) + ring->buf += TXBB_SIZE; + + ring->log_stride = ffs(ring->stride) - 1; + ring->buf_size = ring->size * ring->stride; + + memset(ring->buf, 0, ring->buf_size); + mlx4_en_update_rx_prod_db(ring); + + /* Initialize all descriptors */ + for (i = 0; i < ring->size; i++) + mlx4_en_init_rx_desc(priv, ring, i); + + /* Initialize page allocators */ + err = mlx4_en_init_allocator(priv, ring); + if (err) { + en_err(priv, "Failed initializing ring allocator\n"); + if (ring->stride <= TXBB_SIZE) + ring->buf -= TXBB_SIZE; + ring_ind--; + goto err_allocator; + } + } + err = mlx4_en_fill_rx_buffers(priv); + if (err) + goto err_buffers; + + for (ring_ind = 0; ring_ind < priv->rx_ring_num; ring_ind++) { + ring = priv->rx_ring[ring_ind]; + + ring->size_mask = ring->actual_size - 1; + mlx4_en_update_rx_prod_db(ring); + } + + return 0; + +err_buffers: + for (ring_ind = 0; ring_ind < priv->rx_ring_num; ring_ind++) + mlx4_en_free_rx_buf(priv, priv->rx_ring[ring_ind]); + + ring_ind = priv->rx_ring_num - 1; +err_allocator: + while (ring_ind >= 0) { + if (priv->rx_ring[ring_ind]->stride <= TXBB_SIZE) + priv->rx_ring[ring_ind]->buf -= TXBB_SIZE; + mlx4_en_destroy_allocator(priv, priv->rx_ring[ring_ind]); + ring_ind--; + } + return err; +} + +void mlx4_en_destroy_rx_ring(struct mlx4_en_priv *priv, + struct mlx4_en_rx_ring **pring, + u32 size, u16 stride) +{ + struct mlx4_en_dev *mdev = priv->mdev; + struct mlx4_en_rx_ring *ring = *pring; + + mlx4_en_unmap_buffer(&ring->wqres.buf); + mlx4_free_hwq_res(mdev->dev, &ring->wqres, size * stride + TXBB_SIZE); + vfree(ring->rx_info); + ring->rx_info = NULL; + kfree(ring); + *pring = NULL; +#ifdef CONFIG_RFS_ACCEL + mlx4_en_cleanup_filters(priv); +#endif +} + +void mlx4_en_deactivate_rx_ring(struct mlx4_en_priv *priv, + struct mlx4_en_rx_ring *ring) +{ + mlx4_en_free_rx_buf(priv, ring); + if (ring->stride <= TXBB_SIZE) + ring->buf -= TXBB_SIZE; + mlx4_en_destroy_allocator(priv, ring); +} + + +static int mlx4_en_complete_rx_desc(struct mlx4_en_priv *priv, + struct mlx4_en_rx_desc *rx_desc, + struct mlx4_en_rx_alloc *frags, + struct sk_buff *skb, + int length) +{ + struct skb_frag_struct *skb_frags_rx = skb_shinfo(skb)->frags; + struct mlx4_en_frag_info *frag_info; + int nr; + dma_addr_t dma; + + /* Collect used fragments while replacing them in the HW descriptors */ + for (nr = 0; nr < priv->num_frags; nr++) { + frag_info = &priv->frag_info[nr]; + if (length <= frag_info->frag_prefix_size) + break; + if (!frags[nr].page) + goto fail; + + dma = be64_to_cpu(rx_desc->data[nr].addr); + dma_sync_single_for_cpu(priv->ddev, dma, frag_info->frag_size, + DMA_FROM_DEVICE); + + /* Save page reference in skb */ + __skb_frag_set_page(&skb_frags_rx[nr], frags[nr].page); + skb_frag_size_set(&skb_frags_rx[nr], frag_info->frag_size); + skb_frags_rx[nr].page_offset = frags[nr].page_offset; + skb->truesize += frag_info->frag_stride; + frags[nr].page = NULL; + } + /* Adjust size of last fragment to match actual length */ + if (nr > 0) + skb_frag_size_set(&skb_frags_rx[nr - 1], + length - priv->frag_info[nr - 1].frag_prefix_size); + return nr; + +fail: + while (nr > 0) { + nr--; + __skb_frag_unref(&skb_frags_rx[nr]); + } + return 0; +} + + +static struct sk_buff *mlx4_en_rx_skb(struct mlx4_en_priv *priv, + struct mlx4_en_rx_desc *rx_desc, + struct mlx4_en_rx_alloc *frags, + unsigned int length) +{ + struct sk_buff *skb; + void *va; + int used_frags; + dma_addr_t dma; + + skb = netdev_alloc_skb(priv->dev, SMALL_PACKET_SIZE + NET_IP_ALIGN); + if (!skb) { + en_dbg(RX_ERR, priv, "Failed allocating skb\n"); + return NULL; + } + skb_reserve(skb, NET_IP_ALIGN); + skb->len = length; + + /* Get pointer to first fragment so we could copy the headers into the + * (linear part of the) skb */ + va = page_address(frags[0].page) + frags[0].page_offset; + + if (length <= SMALL_PACKET_SIZE) { + /* We are copying all relevant data to the skb - temporarily + * sync buffers for the copy */ + dma = be64_to_cpu(rx_desc->data[0].addr); + dma_sync_single_for_cpu(priv->ddev, dma, length, + DMA_FROM_DEVICE); + skb_copy_to_linear_data(skb, va, length); + skb->tail += length; + } else { + unsigned int pull_len; + + /* Move relevant fragments to skb */ + used_frags = mlx4_en_complete_rx_desc(priv, rx_desc, frags, + skb, length); + if (unlikely(!used_frags)) { + kfree_skb(skb); + return NULL; + } + skb_shinfo(skb)->nr_frags = used_frags; + + pull_len = eth_get_headlen(va, SMALL_PACKET_SIZE); + /* Copy headers into the skb linear buffer */ + memcpy(skb->data, va, pull_len); + skb->tail += pull_len; + + /* Skip headers in first fragment */ + skb_shinfo(skb)->frags[0].page_offset += pull_len; + + /* Adjust size of first fragment */ + skb_frag_size_sub(&skb_shinfo(skb)->frags[0], pull_len); + skb->data_len = length - pull_len; + } + return skb; +} + +static void validate_loopback(struct mlx4_en_priv *priv, struct sk_buff *skb) +{ + int i; + int offset = ETH_HLEN; + + for (i = 0; i < MLX4_LOOPBACK_TEST_PAYLOAD; i++, offset++) { + if (*(skb->data + offset) != (unsigned char) (i & 0xff)) + goto out_loopback; + } + /* Loopback found */ + priv->loopback_ok = 1; + +out_loopback: + dev_kfree_skb_any(skb); +} + +static void mlx4_en_refill_rx_buffers(struct mlx4_en_priv *priv, + struct mlx4_en_rx_ring *ring) +{ + int index = ring->prod & ring->size_mask; + + while ((u32) (ring->prod - ring->cons) < ring->actual_size) { + if (mlx4_en_prepare_rx_desc(priv, ring, index, GFP_ATOMIC)) + break; + ring->prod++; + index = ring->prod & ring->size_mask; + } +} + +int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int budget) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + struct mlx4_en_dev *mdev = priv->mdev; + struct mlx4_cqe *cqe; + struct mlx4_en_rx_ring *ring = priv->rx_ring[cq->ring]; + struct mlx4_en_rx_alloc *frags; + struct mlx4_en_rx_desc *rx_desc; + struct sk_buff *skb; + int index; + int nr; + unsigned int length; + int polled = 0; + int ip_summed; + int factor = priv->cqe_factor; + u64 timestamp; + bool l2_tunnel; + + if (!priv->port_up) + return 0; + + if (budget <= 0) + return polled; + + /* We assume a 1:1 mapping between CQEs and Rx descriptors, so Rx + * descriptor offset can be deduced from the CQE index instead of + * reading 'cqe->index' */ + index = cq->mcq.cons_index & ring->size_mask; + cqe = mlx4_en_get_cqe(cq->buf, index, priv->cqe_size) + factor; + + /* Process all completed CQEs */ + while (XNOR(cqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK, + cq->mcq.cons_index & cq->size)) { + + frags = ring->rx_info + (index << priv->log_rx_info); + rx_desc = ring->buf + (index << ring->log_stride); + + /* + * make sure we read the CQE after we read the ownership bit + */ + rmb(); + + /* Drop packet on bad receive or bad checksum */ + if (unlikely((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) == + MLX4_CQE_OPCODE_ERROR)) { + en_err(priv, "CQE completed in error - vendor syndrom:%d syndrom:%d\n", + ((struct mlx4_err_cqe *)cqe)->vendor_err_syndrome, + ((struct mlx4_err_cqe *)cqe)->syndrome); + goto next; + } + if (unlikely(cqe->badfcs_enc & MLX4_CQE_BAD_FCS)) { + en_dbg(RX_ERR, priv, "Accepted frame with bad FCS\n"); + goto next; + } + + /* Check if we need to drop the packet if SRIOV is not enabled + * and not performing the selftest or flb disabled + */ + if (priv->flags & MLX4_EN_FLAG_RX_FILTER_NEEDED) { + struct ethhdr *ethh; + dma_addr_t dma; + /* Get pointer to first fragment since we haven't + * skb yet and cast it to ethhdr struct + */ + dma = be64_to_cpu(rx_desc->data[0].addr); + dma_sync_single_for_cpu(priv->ddev, dma, sizeof(*ethh), + DMA_FROM_DEVICE); + ethh = (struct ethhdr *)(page_address(frags[0].page) + + frags[0].page_offset); + + if (is_multicast_ether_addr(ethh->h_dest)) { + struct mlx4_mac_entry *entry; + struct hlist_head *bucket; + unsigned int mac_hash; + + /* Drop the packet, since HW loopback-ed it */ + mac_hash = ethh->h_source[MLX4_EN_MAC_HASH_IDX]; + bucket = &priv->mac_hash[mac_hash]; + rcu_read_lock(); + hlist_for_each_entry_rcu(entry, bucket, hlist) { + if (ether_addr_equal_64bits(entry->mac, + ethh->h_source)) { + rcu_read_unlock(); + goto next; + } + } + rcu_read_unlock(); + } + } + + /* + * Packet is OK - process it. + */ + length = be32_to_cpu(cqe->byte_cnt); + length -= ring->fcs_del; + ring->bytes += length; + ring->packets++; + l2_tunnel = (dev->hw_enc_features & NETIF_F_RXCSUM) && + (cqe->vlan_my_qpn & cpu_to_be32(MLX4_CQE_L2_TUNNEL)); + + if (likely(dev->features & NETIF_F_RXCSUM)) { + if ((cqe->status & cpu_to_be16(MLX4_CQE_STATUS_IPOK)) && + (cqe->checksum == cpu_to_be16(0xffff))) { + ring->csum_ok++; + /* This packet is eligible for GRO if it is: + * - DIX Ethernet (type interpretation) + * - TCP/IP (v4) + * - without IP options + * - not an IP fragment + * - no LLS polling in progress + */ + if (!mlx4_en_cq_busy_polling(cq) && + (dev->features & NETIF_F_GRO)) { + struct sk_buff *gro_skb = napi_get_frags(&cq->napi); + if (!gro_skb) + goto next; + + nr = mlx4_en_complete_rx_desc(priv, + rx_desc, frags, gro_skb, + length); + if (!nr) + goto next; + + skb_shinfo(gro_skb)->nr_frags = nr; + gro_skb->len = length; + gro_skb->data_len = length; + gro_skb->ip_summed = CHECKSUM_UNNECESSARY; + + if (l2_tunnel) + gro_skb->csum_level = 1; + if ((cqe->vlan_my_qpn & + cpu_to_be32(MLX4_CQE_VLAN_PRESENT_MASK)) && + (dev->features & NETIF_F_HW_VLAN_CTAG_RX)) { + u16 vid = be16_to_cpu(cqe->sl_vid); + + __vlan_hwaccel_put_tag(gro_skb, htons(ETH_P_8021Q), vid); + } + + if (dev->features & NETIF_F_RXHASH) + skb_set_hash(gro_skb, + be32_to_cpu(cqe->immed_rss_invalid), + PKT_HASH_TYPE_L3); + + skb_record_rx_queue(gro_skb, cq->ring); + skb_mark_napi_id(gro_skb, &cq->napi); + + if (ring->hwtstamp_rx_filter == HWTSTAMP_FILTER_ALL) { + timestamp = mlx4_en_get_cqe_ts(cqe); + mlx4_en_fill_hwtstamps(mdev, + skb_hwtstamps(gro_skb), + timestamp); + } + + napi_gro_frags(&cq->napi); + goto next; + } + + /* GRO not possible, complete processing here */ + ip_summed = CHECKSUM_UNNECESSARY; + } else { + ip_summed = CHECKSUM_NONE; + ring->csum_none++; + } + } else { + ip_summed = CHECKSUM_NONE; + ring->csum_none++; + } + + skb = mlx4_en_rx_skb(priv, rx_desc, frags, length); + if (!skb) { + priv->stats.rx_dropped++; + goto next; + } + + if (unlikely(priv->validate_loopback)) { + validate_loopback(priv, skb); + goto next; + } + + skb->ip_summed = ip_summed; + skb->protocol = eth_type_trans(skb, dev); + skb_record_rx_queue(skb, cq->ring); + + if (l2_tunnel && ip_summed == CHECKSUM_UNNECESSARY) + skb->csum_level = 1; + + if (dev->features & NETIF_F_RXHASH) + skb_set_hash(skb, + be32_to_cpu(cqe->immed_rss_invalid), + PKT_HASH_TYPE_L3); + + if ((be32_to_cpu(cqe->vlan_my_qpn) & + MLX4_CQE_VLAN_PRESENT_MASK) && + (dev->features & NETIF_F_HW_VLAN_CTAG_RX)) + __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), be16_to_cpu(cqe->sl_vid)); + + if (ring->hwtstamp_rx_filter == HWTSTAMP_FILTER_ALL) { + timestamp = mlx4_en_get_cqe_ts(cqe); + mlx4_en_fill_hwtstamps(mdev, skb_hwtstamps(skb), + timestamp); + } + + skb_mark_napi_id(skb, &cq->napi); + + if (!mlx4_en_cq_busy_polling(cq)) + napi_gro_receive(&cq->napi, skb); + else + netif_receive_skb(skb); + +next: + for (nr = 0; nr < priv->num_frags; nr++) + mlx4_en_free_frag(priv, frags, nr); + + ++cq->mcq.cons_index; + index = (cq->mcq.cons_index) & ring->size_mask; + cqe = mlx4_en_get_cqe(cq->buf, index, priv->cqe_size) + factor; + if (++polled == budget) + goto out; + } + +out: + AVG_PERF_COUNTER(priv->pstats.rx_coal_avg, polled); + mlx4_cq_set_ci(&cq->mcq); + wmb(); /* ensure HW sees CQ consumer before we post new buffers */ + ring->cons = cq->mcq.cons_index; + mlx4_en_refill_rx_buffers(priv, ring); + mlx4_en_update_rx_prod_db(ring); + return polled; +} + + +void mlx4_en_rx_irq(struct mlx4_cq *mcq) +{ + struct mlx4_en_cq *cq = container_of(mcq, struct mlx4_en_cq, mcq); + struct mlx4_en_priv *priv = netdev_priv(cq->dev); + + if (priv->port_up) + napi_schedule(&cq->napi); + else + mlx4_en_arm_cq(priv, cq); +} + +/* Rx CQ polling - called by NAPI */ +int mlx4_en_poll_rx_cq(struct napi_struct *napi, int budget) +{ + struct mlx4_en_cq *cq = container_of(napi, struct mlx4_en_cq, napi); + struct net_device *dev = cq->dev; + struct mlx4_en_priv *priv = netdev_priv(dev); + int done; + + if (!mlx4_en_cq_lock_napi(cq)) + return budget; + + done = mlx4_en_process_rx_cq(dev, cq, budget); + + mlx4_en_cq_unlock_napi(cq); + + /* If we used up all the quota - we're probably not done yet... */ + if (done == budget) { + int cpu_curr; + const struct cpumask *aff; + + INC_PERF_COUNTER(priv->pstats.napi_quota); + + cpu_curr = smp_processor_id(); + aff = irq_desc_get_irq_data(cq->irq_desc)->affinity; + + if (unlikely(!cpumask_test_cpu(cpu_curr, aff))) { + /* Current cpu is not according to smp_irq_affinity - + * probably affinity changed. need to stop this NAPI + * poll, and restart it on the right CPU + */ + napi_complete(napi); + mlx4_en_arm_cq(priv, cq); + return 0; + } + } else { + /* Done for now */ + napi_complete(napi); + mlx4_en_arm_cq(priv, cq); + } + return done; +} + +static const int frag_sizes[] = { + FRAG_SZ0, + FRAG_SZ1, + FRAG_SZ2, + FRAG_SZ3 +}; + +void mlx4_en_calc_rx_buf(struct net_device *dev) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + int eff_mtu = dev->mtu + ETH_HLEN + VLAN_HLEN; + int buf_size = 0; + int i = 0; + + while (buf_size < eff_mtu) { + priv->frag_info[i].frag_size = + (eff_mtu > buf_size + frag_sizes[i]) ? + frag_sizes[i] : eff_mtu - buf_size; + priv->frag_info[i].frag_prefix_size = buf_size; + if (!i) { + priv->frag_info[i].frag_align = NET_IP_ALIGN; + priv->frag_info[i].frag_stride = + ALIGN(frag_sizes[i] + NET_IP_ALIGN, SMP_CACHE_BYTES); + } else { + priv->frag_info[i].frag_align = 0; + priv->frag_info[i].frag_stride = + ALIGN(frag_sizes[i], SMP_CACHE_BYTES); + } + buf_size += priv->frag_info[i].frag_size; + i++; + } + + priv->num_frags = i; + priv->rx_skb_size = eff_mtu; + priv->log_rx_info = ROUNDUP_LOG2(i * sizeof(struct mlx4_en_rx_alloc)); + + en_dbg(DRV, priv, "Rx buffer scatter-list (effective-mtu:%d num_frags:%d):\n", + eff_mtu, priv->num_frags); + for (i = 0; i < priv->num_frags; i++) { + en_err(priv, + " frag:%d - size:%d prefix:%d align:%d stride:%d\n", + i, + priv->frag_info[i].frag_size, + priv->frag_info[i].frag_prefix_size, + priv->frag_info[i].frag_align, + priv->frag_info[i].frag_stride); + } +} + +/* RSS related functions */ + +static int mlx4_en_config_rss_qp(struct mlx4_en_priv *priv, int qpn, + struct mlx4_en_rx_ring *ring, + enum mlx4_qp_state *state, + struct mlx4_qp *qp) +{ + struct mlx4_en_dev *mdev = priv->mdev; + struct mlx4_qp_context *context; + int err = 0; + + context = kmalloc(sizeof(*context), GFP_KERNEL); + if (!context) + return -ENOMEM; + + err = mlx4_qp_alloc(mdev->dev, qpn, qp, GFP_KERNEL); + if (err) { + en_err(priv, "Failed to allocate qp #%x\n", qpn); + goto out; + } + qp->event = mlx4_en_sqp_event; + + memset(context, 0, sizeof *context); + mlx4_en_fill_qp_context(priv, ring->actual_size, ring->stride, 0, 0, + qpn, ring->cqn, -1, context); + context->db_rec_addr = cpu_to_be64(ring->wqres.db.dma); + + /* Cancel FCS removal if FW allows */ + if (mdev->dev->caps.flags & MLX4_DEV_CAP_FLAG_FCS_KEEP) { + context->param3 |= cpu_to_be32(1 << 29); + ring->fcs_del = ETH_FCS_LEN; + } else + ring->fcs_del = 0; + + err = mlx4_qp_to_ready(mdev->dev, &ring->wqres.mtt, context, qp, state); + if (err) { + mlx4_qp_remove(mdev->dev, qp); + mlx4_qp_free(mdev->dev, qp); + } + mlx4_en_update_rx_prod_db(ring); +out: + kfree(context); + return err; +} + +int mlx4_en_create_drop_qp(struct mlx4_en_priv *priv) +{ + int err; + u32 qpn; + + err = mlx4_qp_reserve_range(priv->mdev->dev, 1, 1, &qpn); + if (err) { + en_err(priv, "Failed reserving drop qpn\n"); + return err; + } + err = mlx4_qp_alloc(priv->mdev->dev, qpn, &priv->drop_qp, GFP_KERNEL); + if (err) { + en_err(priv, "Failed allocating drop qp\n"); + mlx4_qp_release_range(priv->mdev->dev, qpn, 1); + return err; + } + + return 0; +} + +void mlx4_en_destroy_drop_qp(struct mlx4_en_priv *priv) +{ + u32 qpn; + + qpn = priv->drop_qp.qpn; + mlx4_qp_remove(priv->mdev->dev, &priv->drop_qp); + mlx4_qp_free(priv->mdev->dev, &priv->drop_qp); + mlx4_qp_release_range(priv->mdev->dev, qpn, 1); +} + +/* Allocate rx qp's and configure them according to rss map */ +int mlx4_en_config_rss_steer(struct mlx4_en_priv *priv) +{ + struct mlx4_en_dev *mdev = priv->mdev; + struct mlx4_en_rss_map *rss_map = &priv->rss_map; + struct mlx4_qp_context context; + struct mlx4_rss_context *rss_context; + int rss_rings; + void *ptr; + u8 rss_mask = (MLX4_RSS_IPV4 | MLX4_RSS_TCP_IPV4 | MLX4_RSS_IPV6 | + MLX4_RSS_TCP_IPV6); + int i, qpn; + int err = 0; + int good_qps = 0; + static const u32 rsskey[10] = { 0xD181C62C, 0xF7F4DB5B, 0x1983A2FC, + 0x943E1ADB, 0xD9389E6B, 0xD1039C2C, 0xA74499AD, + 0x593D56D9, 0xF3253C06, 0x2ADC1FFC}; + + en_dbg(DRV, priv, "Configuring rss steering\n"); + err = mlx4_qp_reserve_range(mdev->dev, priv->rx_ring_num, + priv->rx_ring_num, + &rss_map->base_qpn); + if (err) { + en_err(priv, "Failed reserving %d qps\n", priv->rx_ring_num); + return err; + } + + for (i = 0; i < priv->rx_ring_num; i++) { + qpn = rss_map->base_qpn + i; + err = mlx4_en_config_rss_qp(priv, qpn, priv->rx_ring[i], + &rss_map->state[i], + &rss_map->qps[i]); + if (err) + goto rss_err; + + ++good_qps; + } + + /* Configure RSS indirection qp */ + err = mlx4_qp_alloc(mdev->dev, priv->base_qpn, &rss_map->indir_qp, GFP_KERNEL); + if (err) { + en_err(priv, "Failed to allocate RSS indirection QP\n"); + goto rss_err; + } + rss_map->indir_qp.event = mlx4_en_sqp_event; + mlx4_en_fill_qp_context(priv, 0, 0, 0, 1, priv->base_qpn, + priv->rx_ring[0]->cqn, -1, &context); + + if (!priv->prof->rss_rings || priv->prof->rss_rings > priv->rx_ring_num) + rss_rings = priv->rx_ring_num; + else + rss_rings = priv->prof->rss_rings; + + ptr = ((void *) &context) + offsetof(struct mlx4_qp_context, pri_path) + + MLX4_RSS_OFFSET_IN_QPC_PRI_PATH; + rss_context = ptr; + rss_context->base_qpn = cpu_to_be32(ilog2(rss_rings) << 24 | + (rss_map->base_qpn)); + rss_context->default_qpn = cpu_to_be32(rss_map->base_qpn); + if (priv->mdev->profile.udp_rss) { + rss_mask |= MLX4_RSS_UDP_IPV4 | MLX4_RSS_UDP_IPV6; + rss_context->base_qpn_udp = rss_context->default_qpn; + } + + if (mdev->dev->caps.tunnel_offload_mode == MLX4_TUNNEL_OFFLOAD_MODE_VXLAN) { + en_info(priv, "Setting RSS context tunnel type to RSS on inner headers\n"); + rss_mask |= MLX4_RSS_BY_INNER_HEADERS; + } + + rss_context->flags = rss_mask; + rss_context->hash_fn = MLX4_RSS_HASH_TOP; + for (i = 0; i < 10; i++) + rss_context->rss_key[i] = cpu_to_be32(rsskey[i]); + + err = mlx4_qp_to_ready(mdev->dev, &priv->res.mtt, &context, + &rss_map->indir_qp, &rss_map->indir_state); + if (err) + goto indir_err; + + return 0; + +indir_err: + mlx4_qp_modify(mdev->dev, NULL, rss_map->indir_state, + MLX4_QP_STATE_RST, NULL, 0, 0, &rss_map->indir_qp); + mlx4_qp_remove(mdev->dev, &rss_map->indir_qp); + mlx4_qp_free(mdev->dev, &rss_map->indir_qp); +rss_err: + for (i = 0; i < good_qps; i++) { + mlx4_qp_modify(mdev->dev, NULL, rss_map->state[i], + MLX4_QP_STATE_RST, NULL, 0, 0, &rss_map->qps[i]); + mlx4_qp_remove(mdev->dev, &rss_map->qps[i]); + mlx4_qp_free(mdev->dev, &rss_map->qps[i]); + } + mlx4_qp_release_range(mdev->dev, rss_map->base_qpn, priv->rx_ring_num); + return err; +} + +void mlx4_en_release_rss_steer(struct mlx4_en_priv *priv) +{ + struct mlx4_en_dev *mdev = priv->mdev; + struct mlx4_en_rss_map *rss_map = &priv->rss_map; + int i; + + mlx4_qp_modify(mdev->dev, NULL, rss_map->indir_state, + MLX4_QP_STATE_RST, NULL, 0, 0, &rss_map->indir_qp); + mlx4_qp_remove(mdev->dev, &rss_map->indir_qp); + mlx4_qp_free(mdev->dev, &rss_map->indir_qp); + + for (i = 0; i < priv->rx_ring_num; i++) { + mlx4_qp_modify(mdev->dev, NULL, rss_map->state[i], + MLX4_QP_STATE_RST, NULL, 0, 0, &rss_map->qps[i]); + mlx4_qp_remove(mdev->dev, &rss_map->qps[i]); + mlx4_qp_free(mdev->dev, &rss_map->qps[i]); + } + mlx4_qp_release_range(mdev->dev, rss_map->base_qpn, priv->rx_ring_num); +} diff --git a/drivers/net/ethernet/mellanox/mlx4/en_selftest.c b/drivers/net/ethernet/mellanox/mlx4/en_selftest.c new file mode 100644 index 0000000..49d5afc --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4/en_selftest.c @@ -0,0 +1,178 @@ +/* + * Copyright (c) 2007 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include +#include +#include +#include +#include + +#include "mlx4_en.h" + + +static int mlx4_en_test_registers(struct mlx4_en_priv *priv) +{ + return mlx4_cmd(priv->mdev->dev, 0, 0, 0, MLX4_CMD_HW_HEALTH_CHECK, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED); +} + +static int mlx4_en_test_loopback_xmit(struct mlx4_en_priv *priv) +{ + struct sk_buff *skb; + struct ethhdr *ethh; + unsigned char *packet; + unsigned int packet_size = MLX4_LOOPBACK_TEST_PAYLOAD; + unsigned int i; + int err; + + + /* build the pkt before xmit */ + skb = netdev_alloc_skb(priv->dev, MLX4_LOOPBACK_TEST_PAYLOAD + ETH_HLEN + NET_IP_ALIGN); + if (!skb) + return -ENOMEM; + + skb_reserve(skb, NET_IP_ALIGN); + + ethh = (struct ethhdr *)skb_put(skb, sizeof(struct ethhdr)); + packet = (unsigned char *)skb_put(skb, packet_size); + memcpy(ethh->h_dest, priv->dev->dev_addr, ETH_ALEN); + memset(ethh->h_source, 0, ETH_ALEN); + ethh->h_proto = htons(ETH_P_ARP); + skb_set_mac_header(skb, 0); + for (i = 0; i < packet_size; ++i) /* fill our packet */ + packet[i] = (unsigned char)(i & 0xff); + + /* xmit the pkt */ + err = mlx4_en_xmit(skb, priv->dev); + return err; +} + +static int mlx4_en_test_loopback(struct mlx4_en_priv *priv) +{ + u32 loopback_ok = 0; + int i; + + + priv->loopback_ok = 0; + priv->validate_loopback = 1; + + mlx4_en_update_loopback_state(priv->dev, priv->dev->features); + + /* xmit */ + if (mlx4_en_test_loopback_xmit(priv)) { + en_err(priv, "Transmitting loopback packet failed\n"); + goto mlx4_en_test_loopback_exit; + } + + /* polling for result */ + for (i = 0; i < MLX4_EN_LOOPBACK_RETRIES; ++i) { + msleep(MLX4_EN_LOOPBACK_TIMEOUT); + if (priv->loopback_ok) { + loopback_ok = 1; + break; + } + } + if (!loopback_ok) + en_err(priv, "Loopback packet didn't arrive\n"); + +mlx4_en_test_loopback_exit: + + priv->validate_loopback = 0; + mlx4_en_update_loopback_state(priv->dev, priv->dev->features); + return !loopback_ok; +} + + +static int mlx4_en_test_link(struct mlx4_en_priv *priv) +{ + if (mlx4_en_QUERY_PORT(priv->mdev, priv->port)) + return -ENOMEM; + if (priv->port_state.link_state == 1) + return 0; + else + return 1; +} + +static int mlx4_en_test_speed(struct mlx4_en_priv *priv) +{ + + if (mlx4_en_QUERY_PORT(priv->mdev, priv->port)) + return -ENOMEM; + + /* The device supports 1G, 10G and 40G speeds */ + if (priv->port_state.link_speed != 1000 && + priv->port_state.link_speed != 10000 && + priv->port_state.link_speed != 40000) + return priv->port_state.link_speed; + return 0; +} + + +void mlx4_en_ex_selftest(struct net_device *dev, u32 *flags, u64 *buf) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + struct mlx4_en_dev *mdev = priv->mdev; + int i, carrier_ok; + + memset(buf, 0, sizeof(u64) * MLX4_EN_NUM_SELF_TEST); + + if (*flags & ETH_TEST_FL_OFFLINE) { + /* disable the interface */ + carrier_ok = netif_carrier_ok(dev); + + netif_carrier_off(dev); + /* Wait until all tx queues are empty. + * there should not be any additional incoming traffic + * since we turned the carrier off */ + msleep(200); + + if (priv->mdev->dev->caps.flags & + MLX4_DEV_CAP_FLAG_UC_LOOPBACK) { + buf[3] = mlx4_en_test_registers(priv); + if (priv->port_up) + buf[4] = mlx4_en_test_loopback(priv); + } + + if (carrier_ok) + netif_carrier_on(dev); + + } + buf[0] = mlx4_test_interrupts(mdev->dev); + buf[1] = mlx4_en_test_link(priv); + buf[2] = mlx4_en_test_speed(priv); + + for (i = 0; i < MLX4_EN_NUM_SELF_TEST; i++) { + if (buf[i]) + *flags |= ETH_TEST_FL_FAILED; + } +} diff --git a/drivers/net/ethernet/mellanox/mlx4/en_tx.c b/drivers/net/ethernet/mellanox/mlx4/en_tx.c new file mode 100644 index 0000000..454d9fe --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4/en_tx.c @@ -0,0 +1,997 @@ +/* + * Copyright (c) 2007 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "mlx4_en.h" + +int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv, + struct mlx4_en_tx_ring **pring, int qpn, u32 size, + u16 stride, int node, int queue_index) +{ + struct mlx4_en_dev *mdev = priv->mdev; + struct mlx4_en_tx_ring *ring; + int tmp; + int err; + + ring = kzalloc_node(sizeof(*ring), GFP_KERNEL, node); + if (!ring) { + ring = kzalloc(sizeof(*ring), GFP_KERNEL); + if (!ring) { + en_err(priv, "Failed allocating TX ring\n"); + return -ENOMEM; + } + } + + ring->size = size; + ring->size_mask = size - 1; + ring->stride = stride; + + tmp = size * sizeof(struct mlx4_en_tx_info); + ring->tx_info = kmalloc_node(tmp, GFP_KERNEL | __GFP_NOWARN, node); + if (!ring->tx_info) { + ring->tx_info = vmalloc(tmp); + if (!ring->tx_info) { + err = -ENOMEM; + goto err_ring; + } + } + + en_dbg(DRV, priv, "Allocated tx_info ring at addr:%p size:%d\n", + ring->tx_info, tmp); + + ring->bounce_buf = kmalloc_node(MAX_DESC_SIZE, GFP_KERNEL, node); + if (!ring->bounce_buf) { + ring->bounce_buf = kmalloc(MAX_DESC_SIZE, GFP_KERNEL); + if (!ring->bounce_buf) { + err = -ENOMEM; + goto err_info; + } + } + ring->buf_size = ALIGN(size * ring->stride, MLX4_EN_PAGE_SIZE); + + /* Allocate HW buffers on provided NUMA node */ + set_dev_node(&mdev->dev->pdev->dev, node); + err = mlx4_alloc_hwq_res(mdev->dev, &ring->wqres, ring->buf_size, + 2 * PAGE_SIZE); + set_dev_node(&mdev->dev->pdev->dev, mdev->dev->numa_node); + if (err) { + en_err(priv, "Failed allocating hwq resources\n"); + goto err_bounce; + } + + err = mlx4_en_map_buffer(&ring->wqres.buf); + if (err) { + en_err(priv, "Failed to map TX buffer\n"); + goto err_hwq_res; + } + + ring->buf = ring->wqres.buf.direct.buf; + + en_dbg(DRV, priv, "Allocated TX ring (addr:%p) - buf:%p size:%d buf_size:%d dma:%llx\n", + ring, ring->buf, ring->size, ring->buf_size, + (unsigned long long) ring->wqres.buf.direct.map); + + ring->qpn = qpn; + err = mlx4_qp_alloc(mdev->dev, ring->qpn, &ring->qp, GFP_KERNEL); + if (err) { + en_err(priv, "Failed allocating qp %d\n", ring->qpn); + goto err_map; + } + ring->qp.event = mlx4_en_sqp_event; + + err = mlx4_bf_alloc(mdev->dev, &ring->bf, node); + if (err) { + en_dbg(DRV, priv, "working without blueflame (%d)\n", err); + ring->bf.uar = &mdev->priv_uar; + ring->bf.uar->map = mdev->uar_map; + ring->bf_enabled = false; + ring->bf_alloced = false; + priv->pflags &= ~MLX4_EN_PRIV_FLAGS_BLUEFLAME; + } else { + ring->bf_alloced = true; + ring->bf_enabled = !!(priv->pflags & + MLX4_EN_PRIV_FLAGS_BLUEFLAME); + } + + ring->hwtstamp_tx_type = priv->hwtstamp_config.tx_type; + ring->queue_index = queue_index; + + if (queue_index < priv->num_tx_rings_p_up && cpu_online(queue_index)) + cpumask_set_cpu(queue_index, &ring->affinity_mask); + + *pring = ring; + return 0; + +err_map: + mlx4_en_unmap_buffer(&ring->wqres.buf); +err_hwq_res: + mlx4_free_hwq_res(mdev->dev, &ring->wqres, ring->buf_size); +err_bounce: + kfree(ring->bounce_buf); + ring->bounce_buf = NULL; +err_info: + kvfree(ring->tx_info); + ring->tx_info = NULL; +err_ring: + kfree(ring); + *pring = NULL; + return err; +} + +void mlx4_en_destroy_tx_ring(struct mlx4_en_priv *priv, + struct mlx4_en_tx_ring **pring) +{ + struct mlx4_en_dev *mdev = priv->mdev; + struct mlx4_en_tx_ring *ring = *pring; + en_dbg(DRV, priv, "Destroying tx ring, qpn: %d\n", ring->qpn); + + if (ring->bf_alloced) + mlx4_bf_free(mdev->dev, &ring->bf); + mlx4_qp_remove(mdev->dev, &ring->qp); + mlx4_qp_free(mdev->dev, &ring->qp); + mlx4_en_unmap_buffer(&ring->wqres.buf); + mlx4_free_hwq_res(mdev->dev, &ring->wqres, ring->buf_size); + kfree(ring->bounce_buf); + ring->bounce_buf = NULL; + kvfree(ring->tx_info); + ring->tx_info = NULL; + kfree(ring); + *pring = NULL; +} + +int mlx4_en_activate_tx_ring(struct mlx4_en_priv *priv, + struct mlx4_en_tx_ring *ring, + int cq, int user_prio) +{ + struct mlx4_en_dev *mdev = priv->mdev; + int err; + + ring->cqn = cq; + ring->prod = 0; + ring->cons = 0xffffffff; + ring->last_nr_txbb = 1; + memset(ring->tx_info, 0, ring->size * sizeof(struct mlx4_en_tx_info)); + memset(ring->buf, 0, ring->buf_size); + + ring->qp_state = MLX4_QP_STATE_RST; + ring->doorbell_qpn = cpu_to_be32(ring->qp.qpn << 8); + ring->mr_key = cpu_to_be32(mdev->mr.key); + + mlx4_en_fill_qp_context(priv, ring->size, ring->stride, 1, 0, ring->qpn, + ring->cqn, user_prio, &ring->context); + if (ring->bf_alloced) + ring->context.usr_page = cpu_to_be32(ring->bf.uar->index); + + err = mlx4_qp_to_ready(mdev->dev, &ring->wqres.mtt, &ring->context, + &ring->qp, &ring->qp_state); + if (!user_prio && cpu_online(ring->queue_index)) + netif_set_xps_queue(priv->dev, &ring->affinity_mask, + ring->queue_index); + + return err; +} + +void mlx4_en_deactivate_tx_ring(struct mlx4_en_priv *priv, + struct mlx4_en_tx_ring *ring) +{ + struct mlx4_en_dev *mdev = priv->mdev; + + mlx4_qp_modify(mdev->dev, NULL, ring->qp_state, + MLX4_QP_STATE_RST, NULL, 0, 0, &ring->qp); +} + +static void mlx4_en_stamp_wqe(struct mlx4_en_priv *priv, + struct mlx4_en_tx_ring *ring, int index, + u8 owner) +{ + __be32 stamp = cpu_to_be32(STAMP_VAL | (!!owner << STAMP_SHIFT)); + struct mlx4_en_tx_desc *tx_desc = ring->buf + index * TXBB_SIZE; + struct mlx4_en_tx_info *tx_info = &ring->tx_info[index]; + void *end = ring->buf + ring->buf_size; + __be32 *ptr = (__be32 *)tx_desc; + int i; + + /* Optimize the common case when there are no wraparounds */ + if (likely((void *)tx_desc + tx_info->nr_txbb * TXBB_SIZE <= end)) { + /* Stamp the freed descriptor */ + for (i = 0; i < tx_info->nr_txbb * TXBB_SIZE; + i += STAMP_STRIDE) { + *ptr = stamp; + ptr += STAMP_DWORDS; + } + } else { + /* Stamp the freed descriptor */ + for (i = 0; i < tx_info->nr_txbb * TXBB_SIZE; + i += STAMP_STRIDE) { + *ptr = stamp; + ptr += STAMP_DWORDS; + if ((void *)ptr >= end) { + ptr = ring->buf; + stamp ^= cpu_to_be32(0x80000000); + } + } + } +} + + +static u32 mlx4_en_free_tx_desc(struct mlx4_en_priv *priv, + struct mlx4_en_tx_ring *ring, + int index, u8 owner, u64 timestamp) +{ + struct mlx4_en_tx_info *tx_info = &ring->tx_info[index]; + struct mlx4_en_tx_desc *tx_desc = ring->buf + index * TXBB_SIZE; + struct mlx4_wqe_data_seg *data = (void *) tx_desc + tx_info->data_offset; + void *end = ring->buf + ring->buf_size; + struct sk_buff *skb = tx_info->skb; + int nr_maps = tx_info->nr_maps; + int i; + + /* We do not touch skb here, so prefetch skb->users location + * to speedup consume_skb() + */ + prefetchw(&skb->users); + + if (unlikely(timestamp)) { + struct skb_shared_hwtstamps hwts; + + mlx4_en_fill_hwtstamps(priv->mdev, &hwts, timestamp); + skb_tstamp_tx(skb, &hwts); + } + + /* Optimize the common case when there are no wraparounds */ + if (likely((void *) tx_desc + tx_info->nr_txbb * TXBB_SIZE <= end)) { + if (!tx_info->inl) { + if (tx_info->linear) + dma_unmap_single(priv->ddev, + tx_info->map0_dma, + tx_info->map0_byte_count, + PCI_DMA_TODEVICE); + else + dma_unmap_page(priv->ddev, + tx_info->map0_dma, + tx_info->map0_byte_count, + PCI_DMA_TODEVICE); + for (i = 1; i < nr_maps; i++) { + data++; + dma_unmap_page(priv->ddev, + (dma_addr_t)be64_to_cpu(data->addr), + be32_to_cpu(data->byte_count), + PCI_DMA_TODEVICE); + } + } + } else { + if (!tx_info->inl) { + if ((void *) data >= end) { + data = ring->buf + ((void *)data - end); + } + + if (tx_info->linear) + dma_unmap_single(priv->ddev, + tx_info->map0_dma, + tx_info->map0_byte_count, + PCI_DMA_TODEVICE); + else + dma_unmap_page(priv->ddev, + tx_info->map0_dma, + tx_info->map0_byte_count, + PCI_DMA_TODEVICE); + for (i = 1; i < nr_maps; i++) { + data++; + /* Check for wraparound before unmapping */ + if ((void *) data >= end) + data = ring->buf; + dma_unmap_page(priv->ddev, + (dma_addr_t)be64_to_cpu(data->addr), + be32_to_cpu(data->byte_count), + PCI_DMA_TODEVICE); + } + } + } + dev_consume_skb_any(skb); + return tx_info->nr_txbb; +} + + +int mlx4_en_free_tx_buf(struct net_device *dev, struct mlx4_en_tx_ring *ring) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + int cnt = 0; + + /* Skip last polled descriptor */ + ring->cons += ring->last_nr_txbb; + en_dbg(DRV, priv, "Freeing Tx buf - cons:0x%x prod:0x%x\n", + ring->cons, ring->prod); + + if ((u32) (ring->prod - ring->cons) > ring->size) { + if (netif_msg_tx_err(priv)) + en_warn(priv, "Tx consumer passed producer!\n"); + return 0; + } + + while (ring->cons != ring->prod) { + ring->last_nr_txbb = mlx4_en_free_tx_desc(priv, ring, + ring->cons & ring->size_mask, + !!(ring->cons & ring->size), 0); + ring->cons += ring->last_nr_txbb; + cnt++; + } + + netdev_tx_reset_queue(ring->tx_queue); + + if (cnt) + en_dbg(DRV, priv, "Freed %d uncompleted tx descriptors\n", cnt); + + return cnt; +} + +static bool mlx4_en_process_tx_cq(struct net_device *dev, + struct mlx4_en_cq *cq) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + struct mlx4_cq *mcq = &cq->mcq; + struct mlx4_en_tx_ring *ring = priv->tx_ring[cq->ring]; + struct mlx4_cqe *cqe; + u16 index; + u16 new_index, ring_index, stamp_index; + u32 txbbs_skipped = 0; + u32 txbbs_stamp = 0; + u32 cons_index = mcq->cons_index; + int size = cq->size; + u32 size_mask = ring->size_mask; + struct mlx4_cqe *buf = cq->buf; + u32 packets = 0; + u32 bytes = 0; + int factor = priv->cqe_factor; + u64 timestamp = 0; + int done = 0; + int budget = priv->tx_work_limit; + u32 last_nr_txbb; + u32 ring_cons; + + if (!priv->port_up) + return true; + + netdev_txq_bql_complete_prefetchw(ring->tx_queue); + + index = cons_index & size_mask; + cqe = mlx4_en_get_cqe(buf, index, priv->cqe_size) + factor; + last_nr_txbb = ACCESS_ONCE(ring->last_nr_txbb); + ring_cons = ACCESS_ONCE(ring->cons); + ring_index = ring_cons & size_mask; + stamp_index = ring_index; + + /* Process all completed CQEs */ + while (XNOR(cqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK, + cons_index & size) && (done < budget)) { + /* + * make sure we read the CQE after we read the + * ownership bit + */ + rmb(); + + if (unlikely((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) == + MLX4_CQE_OPCODE_ERROR)) { + struct mlx4_err_cqe *cqe_err = (struct mlx4_err_cqe *)cqe; + + en_err(priv, "CQE error - vendor syndrome: 0x%x syndrome: 0x%x\n", + cqe_err->vendor_err_syndrome, + cqe_err->syndrome); + } + + /* Skip over last polled CQE */ + new_index = be16_to_cpu(cqe->wqe_index) & size_mask; + + do { + txbbs_skipped += last_nr_txbb; + ring_index = (ring_index + last_nr_txbb) & size_mask; + if (ring->tx_info[ring_index].ts_requested) + timestamp = mlx4_en_get_cqe_ts(cqe); + + /* free next descriptor */ + last_nr_txbb = mlx4_en_free_tx_desc( + priv, ring, ring_index, + !!((ring_cons + txbbs_skipped) & + ring->size), timestamp); + + mlx4_en_stamp_wqe(priv, ring, stamp_index, + !!((ring_cons + txbbs_stamp) & + ring->size)); + stamp_index = ring_index; + txbbs_stamp = txbbs_skipped; + packets++; + bytes += ring->tx_info[ring_index].nr_bytes; + } while ((++done < budget) && (ring_index != new_index)); + + ++cons_index; + index = cons_index & size_mask; + cqe = mlx4_en_get_cqe(buf, index, priv->cqe_size) + factor; + } + + + /* + * To prevent CQ overflow we first update CQ consumer and only then + * the ring consumer. + */ + mcq->cons_index = cons_index; + mlx4_cq_set_ci(mcq); + wmb(); + + /* we want to dirty this cache line once */ + ACCESS_ONCE(ring->last_nr_txbb) = last_nr_txbb; + ACCESS_ONCE(ring->cons) = ring_cons + txbbs_skipped; + + netdev_tx_completed_queue(ring->tx_queue, packets, bytes); + + /* + * Wakeup Tx queue if this stopped, and at least 1 packet + * was completed + */ + if (netif_tx_queue_stopped(ring->tx_queue) && txbbs_skipped > 0) { + netif_tx_wake_queue(ring->tx_queue); + ring->wake_queue++; + } + return done < budget; +} + +void mlx4_en_tx_irq(struct mlx4_cq *mcq) +{ + struct mlx4_en_cq *cq = container_of(mcq, struct mlx4_en_cq, mcq); + struct mlx4_en_priv *priv = netdev_priv(cq->dev); + + if (priv->port_up) + napi_schedule(&cq->napi); + else + mlx4_en_arm_cq(priv, cq); +} + +/* TX CQ polling - called by NAPI */ +int mlx4_en_poll_tx_cq(struct napi_struct *napi, int budget) +{ + struct mlx4_en_cq *cq = container_of(napi, struct mlx4_en_cq, napi); + struct net_device *dev = cq->dev; + struct mlx4_en_priv *priv = netdev_priv(dev); + int clean_complete; + + clean_complete = mlx4_en_process_tx_cq(dev, cq); + if (!clean_complete) + return budget; + + napi_complete(napi); + mlx4_en_arm_cq(priv, cq); + + return 0; +} + +static struct mlx4_en_tx_desc *mlx4_en_bounce_to_desc(struct mlx4_en_priv *priv, + struct mlx4_en_tx_ring *ring, + u32 index, + unsigned int desc_size) +{ + u32 copy = (ring->size - index) * TXBB_SIZE; + int i; + + for (i = desc_size - copy - 4; i >= 0; i -= 4) { + if ((i & (TXBB_SIZE - 1)) == 0) + wmb(); + + *((u32 *) (ring->buf + i)) = + *((u32 *) (ring->bounce_buf + copy + i)); + } + + for (i = copy - 4; i >= 4 ; i -= 4) { + if ((i & (TXBB_SIZE - 1)) == 0) + wmb(); + + *((u32 *) (ring->buf + index * TXBB_SIZE + i)) = + *((u32 *) (ring->bounce_buf + i)); + } + + /* Return real descriptor location */ + return ring->buf + index * TXBB_SIZE; +} + +/* Decide if skb can be inlined in tx descriptor to avoid dma mapping + * + * It seems strange we do not simply use skb_copy_bits(). + * This would allow to inline all skbs iff skb->len <= inline_thold + * + * Note that caller already checked skb was not a gso packet + */ +static bool is_inline(int inline_thold, const struct sk_buff *skb, + const struct skb_shared_info *shinfo, + void **pfrag) +{ + void *ptr; + + if (skb->len > inline_thold || !inline_thold) + return false; + + if (shinfo->nr_frags == 1) { + ptr = skb_frag_address_safe(&shinfo->frags[0]); + if (unlikely(!ptr)) + return false; + *pfrag = ptr; + return true; + } + if (shinfo->nr_frags) + return false; + return true; +} + +static int inline_size(const struct sk_buff *skb) +{ + if (skb->len + CTRL_SIZE + sizeof(struct mlx4_wqe_inline_seg) + <= MLX4_INLINE_ALIGN) + return ALIGN(skb->len + CTRL_SIZE + + sizeof(struct mlx4_wqe_inline_seg), 16); + else + return ALIGN(skb->len + CTRL_SIZE + 2 * + sizeof(struct mlx4_wqe_inline_seg), 16); +} + +static int get_real_size(const struct sk_buff *skb, + const struct skb_shared_info *shinfo, + struct net_device *dev, + int *lso_header_size, + bool *inline_ok, + void **pfrag) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + int real_size; + + if (shinfo->gso_size) { + *inline_ok = false; + if (skb->encapsulation) + *lso_header_size = (skb_inner_transport_header(skb) - skb->data) + inner_tcp_hdrlen(skb); + else + *lso_header_size = skb_transport_offset(skb) + tcp_hdrlen(skb); + real_size = CTRL_SIZE + shinfo->nr_frags * DS_SIZE + + ALIGN(*lso_header_size + 4, DS_SIZE); + if (unlikely(*lso_header_size != skb_headlen(skb))) { + /* We add a segment for the skb linear buffer only if + * it contains data */ + if (*lso_header_size < skb_headlen(skb)) + real_size += DS_SIZE; + else { + if (netif_msg_tx_err(priv)) + en_warn(priv, "Non-linear headers\n"); + return 0; + } + } + } else { + *lso_header_size = 0; + *inline_ok = is_inline(priv->prof->inline_thold, skb, + shinfo, pfrag); + + if (*inline_ok) + real_size = inline_size(skb); + else + real_size = CTRL_SIZE + + (shinfo->nr_frags + 1) * DS_SIZE; + } + + return real_size; +} + +static void build_inline_wqe(struct mlx4_en_tx_desc *tx_desc, + const struct sk_buff *skb, + const struct skb_shared_info *shinfo, + int real_size, u16 *vlan_tag, + int tx_ind, void *fragptr) +{ + struct mlx4_wqe_inline_seg *inl = &tx_desc->inl; + int spc = MLX4_INLINE_ALIGN - CTRL_SIZE - sizeof *inl; + unsigned int hlen = skb_headlen(skb); + + if (skb->len <= spc) { + if (likely(skb->len >= MIN_PKT_LEN)) { + inl->byte_count = cpu_to_be32(1 << 31 | skb->len); + } else { + inl->byte_count = cpu_to_be32(1 << 31 | MIN_PKT_LEN); + memset(((void *)(inl + 1)) + skb->len, 0, + MIN_PKT_LEN - skb->len); + } + skb_copy_from_linear_data(skb, inl + 1, hlen); + if (shinfo->nr_frags) + memcpy(((void *)(inl + 1)) + hlen, fragptr, + skb_frag_size(&shinfo->frags[0])); + + } else { + inl->byte_count = cpu_to_be32(1 << 31 | spc); + if (hlen <= spc) { + skb_copy_from_linear_data(skb, inl + 1, hlen); + if (hlen < spc) { + memcpy(((void *)(inl + 1)) + hlen, + fragptr, spc - hlen); + fragptr += spc - hlen; + } + inl = (void *) (inl + 1) + spc; + memcpy(((void *)(inl + 1)), fragptr, skb->len - spc); + } else { + skb_copy_from_linear_data(skb, inl + 1, spc); + inl = (void *) (inl + 1) + spc; + skb_copy_from_linear_data_offset(skb, spc, inl + 1, + hlen - spc); + if (shinfo->nr_frags) + memcpy(((void *)(inl + 1)) + hlen - spc, + fragptr, + skb_frag_size(&shinfo->frags[0])); + } + + wmb(); + inl->byte_count = cpu_to_be32(1 << 31 | (skb->len - spc)); + } +} + +u16 mlx4_en_select_queue(struct net_device *dev, struct sk_buff *skb, + void *accel_priv, select_queue_fallback_t fallback) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + u16 rings_p_up = priv->num_tx_rings_p_up; + u8 up = 0; + + if (dev->num_tc) + return skb_tx_hash(dev, skb); + + if (vlan_tx_tag_present(skb)) + up = vlan_tx_tag_get(skb) >> VLAN_PRIO_SHIFT; + + return fallback(dev, skb) % rings_p_up + up * rings_p_up; +} + +static void mlx4_bf_copy(void __iomem *dst, const void *src, + unsigned int bytecnt) +{ + __iowrite64_copy(dst, src, bytecnt / 8); +} + +netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct skb_shared_info *shinfo = skb_shinfo(skb); + struct mlx4_en_priv *priv = netdev_priv(dev); + struct device *ddev = priv->ddev; + struct mlx4_en_tx_ring *ring; + struct mlx4_en_tx_desc *tx_desc; + struct mlx4_wqe_data_seg *data; + struct mlx4_en_tx_info *tx_info; + int tx_ind = 0; + int nr_txbb; + int desc_size; + int real_size; + u32 index, bf_index; + __be32 op_own; + u16 vlan_tag = 0; + int i_frag; + int lso_header_size; + void *fragptr = NULL; + bool bounce = false; + bool send_doorbell; + bool stop_queue; + bool inline_ok; + u32 ring_cons; + + if (!priv->port_up) + goto tx_drop; + + tx_ind = skb_get_queue_mapping(skb); + ring = priv->tx_ring[tx_ind]; + + /* fetch ring->cons far ahead before needing it to avoid stall */ + ring_cons = ACCESS_ONCE(ring->cons); + + real_size = get_real_size(skb, shinfo, dev, &lso_header_size, + &inline_ok, &fragptr); + if (unlikely(!real_size)) + goto tx_drop; + + /* Align descriptor to TXBB size */ + desc_size = ALIGN(real_size, TXBB_SIZE); + nr_txbb = desc_size / TXBB_SIZE; + if (unlikely(nr_txbb > MAX_DESC_TXBBS)) { + if (netif_msg_tx_err(priv)) + en_warn(priv, "Oversized header or SG list\n"); + goto tx_drop; + } + + if (vlan_tx_tag_present(skb)) + vlan_tag = vlan_tx_tag_get(skb); + + + netdev_txq_bql_enqueue_prefetchw(ring->tx_queue); + + /* Track current inflight packets for performance analysis */ + AVG_PERF_COUNTER(priv->pstats.inflight_avg, + (u32)(ring->prod - ring_cons - 1)); + + /* Packet is good - grab an index and transmit it */ + index = ring->prod & ring->size_mask; + bf_index = ring->prod; + + /* See if we have enough space for whole descriptor TXBB for setting + * SW ownership on next descriptor; if not, use a bounce buffer. */ + if (likely(index + nr_txbb <= ring->size)) + tx_desc = ring->buf + index * TXBB_SIZE; + else { + tx_desc = (struct mlx4_en_tx_desc *) ring->bounce_buf; + bounce = true; + } + + /* Save skb in tx_info ring */ + tx_info = &ring->tx_info[index]; + tx_info->skb = skb; + tx_info->nr_txbb = nr_txbb; + + data = &tx_desc->data; + if (lso_header_size) + data = ((void *)&tx_desc->lso + ALIGN(lso_header_size + 4, + DS_SIZE)); + + /* valid only for none inline segments */ + tx_info->data_offset = (void *)data - (void *)tx_desc; + + tx_info->inl = inline_ok; + + tx_info->linear = (lso_header_size < skb_headlen(skb) && + !inline_ok) ? 1 : 0; + + tx_info->nr_maps = shinfo->nr_frags + tx_info->linear; + data += tx_info->nr_maps - 1; + + if (!tx_info->inl) { + dma_addr_t dma = 0; + u32 byte_count = 0; + + /* Map fragments if any */ + for (i_frag = shinfo->nr_frags - 1; i_frag >= 0; i_frag--) { + const struct skb_frag_struct *frag; + + frag = &shinfo->frags[i_frag]; + byte_count = skb_frag_size(frag); + dma = skb_frag_dma_map(ddev, frag, + 0, byte_count, + DMA_TO_DEVICE); + if (dma_mapping_error(ddev, dma)) + goto tx_drop_unmap; + + data->addr = cpu_to_be64(dma); + data->lkey = ring->mr_key; + wmb(); + data->byte_count = cpu_to_be32(byte_count); + --data; + } + + /* Map linear part if needed */ + if (tx_info->linear) { + byte_count = skb_headlen(skb) - lso_header_size; + + dma = dma_map_single(ddev, skb->data + + lso_header_size, byte_count, + PCI_DMA_TODEVICE); + if (dma_mapping_error(ddev, dma)) + goto tx_drop_unmap; + + data->addr = cpu_to_be64(dma); + data->lkey = ring->mr_key; + wmb(); + data->byte_count = cpu_to_be32(byte_count); + } + /* tx completion can avoid cache line miss for common cases */ + tx_info->map0_dma = dma; + tx_info->map0_byte_count = byte_count; + } + + /* + * For timestamping add flag to skb_shinfo and + * set flag for further reference + */ + tx_info->ts_requested = 0; + if (unlikely(ring->hwtstamp_tx_type == HWTSTAMP_TX_ON && + shinfo->tx_flags & SKBTX_HW_TSTAMP)) { + shinfo->tx_flags |= SKBTX_IN_PROGRESS; + tx_info->ts_requested = 1; + } + + /* Prepare ctrl segement apart opcode+ownership, which depends on + * whether LSO is used */ + tx_desc->ctrl.srcrb_flags = priv->ctrl_flags; + if (likely(skb->ip_summed == CHECKSUM_PARTIAL)) { + if (!skb->encapsulation) + tx_desc->ctrl.srcrb_flags |= cpu_to_be32(MLX4_WQE_CTRL_IP_CSUM | + MLX4_WQE_CTRL_TCP_UDP_CSUM); + else + tx_desc->ctrl.srcrb_flags |= cpu_to_be32(MLX4_WQE_CTRL_IP_CSUM); + ring->tx_csum++; + } + + if (priv->flags & MLX4_EN_FLAG_ENABLE_HW_LOOPBACK) { + struct ethhdr *ethh; + + /* Copy dst mac address to wqe. This allows loopback in eSwitch, + * so that VFs and PF can communicate with each other + */ + ethh = (struct ethhdr *)skb->data; + tx_desc->ctrl.srcrb_flags16[0] = get_unaligned((__be16 *)ethh->h_dest); + tx_desc->ctrl.imm = get_unaligned((__be32 *)(ethh->h_dest + 2)); + } + + /* Handle LSO (TSO) packets */ + if (lso_header_size) { + int i; + + /* Mark opcode as LSO */ + op_own = cpu_to_be32(MLX4_OPCODE_LSO | (1 << 6)) | + ((ring->prod & ring->size) ? + cpu_to_be32(MLX4_EN_BIT_DESC_OWN) : 0); + + /* Fill in the LSO prefix */ + tx_desc->lso.mss_hdr_size = cpu_to_be32( + shinfo->gso_size << 16 | lso_header_size); + + /* Copy headers; + * note that we already verified that it is linear */ + memcpy(tx_desc->lso.header, skb->data, lso_header_size); + + ring->tso_packets++; + + i = ((skb->len - lso_header_size) / shinfo->gso_size) + + !!((skb->len - lso_header_size) % shinfo->gso_size); + tx_info->nr_bytes = skb->len + (i - 1) * lso_header_size; + ring->packets += i; + } else { + /* Normal (Non LSO) packet */ + op_own = cpu_to_be32(MLX4_OPCODE_SEND) | + ((ring->prod & ring->size) ? + cpu_to_be32(MLX4_EN_BIT_DESC_OWN) : 0); + tx_info->nr_bytes = max_t(unsigned int, skb->len, ETH_ZLEN); + ring->packets++; + } + ring->bytes += tx_info->nr_bytes; + netdev_tx_sent_queue(ring->tx_queue, tx_info->nr_bytes); + AVG_PERF_COUNTER(priv->pstats.tx_pktsz_avg, skb->len); + + if (tx_info->inl) + build_inline_wqe(tx_desc, skb, shinfo, real_size, &vlan_tag, + tx_ind, fragptr); + + if (skb->encapsulation) { + struct iphdr *ipv4 = (struct iphdr *)skb_inner_network_header(skb); + if (ipv4->protocol == IPPROTO_TCP || ipv4->protocol == IPPROTO_UDP) + op_own |= cpu_to_be32(MLX4_WQE_CTRL_IIP | MLX4_WQE_CTRL_ILP); + else + op_own |= cpu_to_be32(MLX4_WQE_CTRL_IIP); + } + + ring->prod += nr_txbb; + + /* If we used a bounce buffer then copy descriptor back into place */ + if (unlikely(bounce)) + tx_desc = mlx4_en_bounce_to_desc(priv, ring, index, desc_size); + + skb_tx_timestamp(skb); + + /* Check available TXBBs And 2K spare for prefetch */ + stop_queue = (int)(ring->prod - ring_cons) > + ring->size - HEADROOM - MAX_DESC_TXBBS; + if (unlikely(stop_queue)) { + netif_tx_stop_queue(ring->tx_queue); + ring->queue_stopped++; + } + send_doorbell = !skb->xmit_more || netif_xmit_stopped(ring->tx_queue); + + real_size = (real_size / 16) & 0x3f; + + if (ring->bf_enabled && desc_size <= MAX_BF && !bounce && + !vlan_tx_tag_present(skb) && send_doorbell) { + tx_desc->ctrl.bf_qpn = ring->doorbell_qpn | + cpu_to_be32(real_size); + + op_own |= htonl((bf_index & 0xffff) << 8); + /* Ensure new descriptor hits memory + * before setting ownership of this descriptor to HW + */ + wmb(); + tx_desc->ctrl.owner_opcode = op_own; + + wmb(); + + mlx4_bf_copy(ring->bf.reg + ring->bf.offset, &tx_desc->ctrl, + desc_size); + + wmb(); + + ring->bf.offset ^= ring->bf.buf_size; + } else { + tx_desc->ctrl.vlan_tag = cpu_to_be16(vlan_tag); + tx_desc->ctrl.ins_vlan = MLX4_WQE_CTRL_INS_VLAN * + !!vlan_tx_tag_present(skb); + tx_desc->ctrl.fence_size = real_size; + + /* Ensure new descriptor hits memory + * before setting ownership of this descriptor to HW + */ + wmb(); + tx_desc->ctrl.owner_opcode = op_own; + if (send_doorbell) { + wmb(); + iowrite32(ring->doorbell_qpn, + ring->bf.uar->map + MLX4_SEND_DOORBELL); + } else { + ring->xmit_more++; + } + } + + if (unlikely(stop_queue)) { + /* If queue was emptied after the if (stop_queue) , and before + * the netif_tx_stop_queue() - need to wake the queue, + * or else it will remain stopped forever. + * Need a memory barrier to make sure ring->cons was not + * updated before queue was stopped. + */ + smp_rmb(); + + ring_cons = ACCESS_ONCE(ring->cons); + if (unlikely(((int)(ring->prod - ring_cons)) <= + ring->size - HEADROOM - MAX_DESC_TXBBS)) { + netif_tx_wake_queue(ring->tx_queue); + ring->wake_queue++; + } + } + return NETDEV_TX_OK; + +tx_drop_unmap: + en_err(priv, "DMA mapping error\n"); + + while (++i_frag < shinfo->nr_frags) { + ++data; + dma_unmap_page(ddev, (dma_addr_t) be64_to_cpu(data->addr), + be32_to_cpu(data->byte_count), + PCI_DMA_TODEVICE); + } + +tx_drop: + dev_kfree_skb_any(skb); + priv->stats.tx_dropped++; + return NETDEV_TX_OK; +} + diff --git a/drivers/net/ethernet/mellanox/mlx4/eq.c b/drivers/net/ethernet/mellanox/mlx4/eq.c new file mode 100644 index 0000000..49290a4 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4/eq.c @@ -0,0 +1,1409 @@ +/* + * Copyright (c) 2005, 2006, 2007, 2008 Mellanox Technologies. All rights reserved. + * Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include + +#include +#include + +#include "mlx4.h" +#include "fw.h" + +enum { + MLX4_IRQNAME_SIZE = 32 +}; + +enum { + MLX4_NUM_ASYNC_EQE = 0x100, + MLX4_NUM_SPARE_EQE = 0x80, + MLX4_EQ_ENTRY_SIZE = 0x20 +}; + +#define MLX4_EQ_STATUS_OK ( 0 << 28) +#define MLX4_EQ_STATUS_WRITE_FAIL (10 << 28) +#define MLX4_EQ_OWNER_SW ( 0 << 24) +#define MLX4_EQ_OWNER_HW ( 1 << 24) +#define MLX4_EQ_FLAG_EC ( 1 << 18) +#define MLX4_EQ_FLAG_OI ( 1 << 17) +#define MLX4_EQ_STATE_ARMED ( 9 << 8) +#define MLX4_EQ_STATE_FIRED (10 << 8) +#define MLX4_EQ_STATE_ALWAYS_ARMED (11 << 8) + +#define MLX4_ASYNC_EVENT_MASK ((1ull << MLX4_EVENT_TYPE_PATH_MIG) | \ + (1ull << MLX4_EVENT_TYPE_COMM_EST) | \ + (1ull << MLX4_EVENT_TYPE_SQ_DRAINED) | \ + (1ull << MLX4_EVENT_TYPE_CQ_ERROR) | \ + (1ull << MLX4_EVENT_TYPE_WQ_CATAS_ERROR) | \ + (1ull << MLX4_EVENT_TYPE_EEC_CATAS_ERROR) | \ + (1ull << MLX4_EVENT_TYPE_PATH_MIG_FAILED) | \ + (1ull << MLX4_EVENT_TYPE_WQ_INVAL_REQ_ERROR) | \ + (1ull << MLX4_EVENT_TYPE_WQ_ACCESS_ERROR) | \ + (1ull << MLX4_EVENT_TYPE_PORT_CHANGE) | \ + (1ull << MLX4_EVENT_TYPE_ECC_DETECT) | \ + (1ull << MLX4_EVENT_TYPE_SRQ_CATAS_ERROR) | \ + (1ull << MLX4_EVENT_TYPE_SRQ_QP_LAST_WQE) | \ + (1ull << MLX4_EVENT_TYPE_SRQ_LIMIT) | \ + (1ull << MLX4_EVENT_TYPE_CMD) | \ + (1ull << MLX4_EVENT_TYPE_OP_REQUIRED) | \ + (1ull << MLX4_EVENT_TYPE_COMM_CHANNEL) | \ + (1ull << MLX4_EVENT_TYPE_FLR_EVENT) | \ + (1ull << MLX4_EVENT_TYPE_FATAL_WARNING)) + +static u64 get_async_ev_mask(struct mlx4_dev *dev) +{ + u64 async_ev_mask = MLX4_ASYNC_EVENT_MASK; + if (dev->caps.flags & MLX4_DEV_CAP_FLAG_PORT_MNG_CHG_EV) + async_ev_mask |= (1ull << MLX4_EVENT_TYPE_PORT_MNG_CHG_EVENT); + + return async_ev_mask; +} + +static void eq_set_ci(struct mlx4_eq *eq, int req_not) +{ + __raw_writel((__force u32) cpu_to_be32((eq->cons_index & 0xffffff) | + req_not << 31), + eq->doorbell); + /* We still want ordering, just not swabbing, so add a barrier */ + mb(); +} + +static struct mlx4_eqe *get_eqe(struct mlx4_eq *eq, u32 entry, u8 eqe_factor, + u8 eqe_size) +{ + /* (entry & (eq->nent - 1)) gives us a cyclic array */ + unsigned long offset = (entry & (eq->nent - 1)) * eqe_size; + /* CX3 is capable of extending the EQE from 32 to 64 bytes with + * strides of 64B,128B and 256B. + * When 64B EQE is used, the first (in the lower addresses) + * 32 bytes in the 64 byte EQE are reserved and the next 32 bytes + * contain the legacy EQE information. + * In all other cases, the first 32B contains the legacy EQE info. + */ + return eq->page_list[offset / PAGE_SIZE].buf + (offset + (eqe_factor ? MLX4_EQ_ENTRY_SIZE : 0)) % PAGE_SIZE; +} + +static struct mlx4_eqe *next_eqe_sw(struct mlx4_eq *eq, u8 eqe_factor, u8 size) +{ + struct mlx4_eqe *eqe = get_eqe(eq, eq->cons_index, eqe_factor, size); + return !!(eqe->owner & 0x80) ^ !!(eq->cons_index & eq->nent) ? NULL : eqe; +} + +static struct mlx4_eqe *next_slave_event_eqe(struct mlx4_slave_event_eq *slave_eq) +{ + struct mlx4_eqe *eqe = + &slave_eq->event_eqe[slave_eq->cons & (SLAVE_EVENT_EQ_SIZE - 1)]; + return (!!(eqe->owner & 0x80) ^ + !!(slave_eq->cons & SLAVE_EVENT_EQ_SIZE)) ? + eqe : NULL; +} + +void mlx4_gen_slave_eqe(struct work_struct *work) +{ + struct mlx4_mfunc_master_ctx *master = + container_of(work, struct mlx4_mfunc_master_ctx, + slave_event_work); + struct mlx4_mfunc *mfunc = + container_of(master, struct mlx4_mfunc, master); + struct mlx4_priv *priv = container_of(mfunc, struct mlx4_priv, mfunc); + struct mlx4_dev *dev = &priv->dev; + struct mlx4_slave_event_eq *slave_eq = &mfunc->master.slave_eq; + struct mlx4_eqe *eqe; + u8 slave; + int i; + + for (eqe = next_slave_event_eqe(slave_eq); eqe; + eqe = next_slave_event_eqe(slave_eq)) { + slave = eqe->slave_id; + + /* All active slaves need to receive the event */ + if (slave == ALL_SLAVES) { + for (i = 0; i < dev->num_slaves; i++) { + if (i != dev->caps.function && + master->slave_state[i].active) + if (mlx4_GEN_EQE(dev, i, eqe)) + mlx4_warn(dev, "Failed to generate event for slave %d\n", + i); + } + } else { + if (mlx4_GEN_EQE(dev, slave, eqe)) + mlx4_warn(dev, "Failed to generate event for slave %d\n", + slave); + } + ++slave_eq->cons; + } +} + + +static void slave_event(struct mlx4_dev *dev, u8 slave, struct mlx4_eqe *eqe) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_slave_event_eq *slave_eq = &priv->mfunc.master.slave_eq; + struct mlx4_eqe *s_eqe; + unsigned long flags; + + spin_lock_irqsave(&slave_eq->event_lock, flags); + s_eqe = &slave_eq->event_eqe[slave_eq->prod & (SLAVE_EVENT_EQ_SIZE - 1)]; + if ((!!(s_eqe->owner & 0x80)) ^ + (!!(slave_eq->prod & SLAVE_EVENT_EQ_SIZE))) { + mlx4_warn(dev, "Master failed to generate an EQE for slave: %d. No free EQE on slave events queue\n", + slave); + spin_unlock_irqrestore(&slave_eq->event_lock, flags); + return; + } + + memcpy(s_eqe, eqe, dev->caps.eqe_size - 1); + s_eqe->slave_id = slave; + /* ensure all information is written before setting the ownersip bit */ + wmb(); + s_eqe->owner = !!(slave_eq->prod & SLAVE_EVENT_EQ_SIZE) ? 0x0 : 0x80; + ++slave_eq->prod; + + queue_work(priv->mfunc.master.comm_wq, + &priv->mfunc.master.slave_event_work); + spin_unlock_irqrestore(&slave_eq->event_lock, flags); +} + +static void mlx4_slave_event(struct mlx4_dev *dev, int slave, + struct mlx4_eqe *eqe) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_slave_state *s_slave = + &priv->mfunc.master.slave_state[slave]; + + if (!s_slave->active) { + /*mlx4_warn(dev, "Trying to pass event to inactive slave\n");*/ + return; + } + + slave_event(dev, slave, eqe); +} + +int mlx4_gen_pkey_eqe(struct mlx4_dev *dev, int slave, u8 port) +{ + struct mlx4_eqe eqe; + + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_slave_state *s_slave = &priv->mfunc.master.slave_state[slave]; + + if (!s_slave->active) + return 0; + + memset(&eqe, 0, sizeof eqe); + + eqe.type = MLX4_EVENT_TYPE_PORT_MNG_CHG_EVENT; + eqe.subtype = MLX4_DEV_PMC_SUBTYPE_PKEY_TABLE; + eqe.event.port_mgmt_change.port = port; + + return mlx4_GEN_EQE(dev, slave, &eqe); +} +EXPORT_SYMBOL(mlx4_gen_pkey_eqe); + +int mlx4_gen_guid_change_eqe(struct mlx4_dev *dev, int slave, u8 port) +{ + struct mlx4_eqe eqe; + + /*don't send if we don't have the that slave */ + if (dev->num_vfs < slave) + return 0; + memset(&eqe, 0, sizeof eqe); + + eqe.type = MLX4_EVENT_TYPE_PORT_MNG_CHG_EVENT; + eqe.subtype = MLX4_DEV_PMC_SUBTYPE_GUID_INFO; + eqe.event.port_mgmt_change.port = port; + + return mlx4_GEN_EQE(dev, slave, &eqe); +} +EXPORT_SYMBOL(mlx4_gen_guid_change_eqe); + +int mlx4_gen_port_state_change_eqe(struct mlx4_dev *dev, int slave, u8 port, + u8 port_subtype_change) +{ + struct mlx4_eqe eqe; + + /*don't send if we don't have the that slave */ + if (dev->num_vfs < slave) + return 0; + memset(&eqe, 0, sizeof eqe); + + eqe.type = MLX4_EVENT_TYPE_PORT_CHANGE; + eqe.subtype = port_subtype_change; + eqe.event.port_change.port = cpu_to_be32(port << 28); + + mlx4_dbg(dev, "%s: sending: %d to slave: %d on port: %d\n", __func__, + port_subtype_change, slave, port); + return mlx4_GEN_EQE(dev, slave, &eqe); +} +EXPORT_SYMBOL(mlx4_gen_port_state_change_eqe); + +enum slave_port_state mlx4_get_slave_port_state(struct mlx4_dev *dev, int slave, u8 port) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_slave_state *s_state = priv->mfunc.master.slave_state; + struct mlx4_active_ports actv_ports = mlx4_get_active_ports(dev, slave); + + if (slave >= dev->num_slaves || port > dev->caps.num_ports || + port <= 0 || !test_bit(port - 1, actv_ports.ports)) { + pr_err("%s: Error: asking for slave:%d, port:%d\n", + __func__, slave, port); + return SLAVE_PORT_DOWN; + } + return s_state[slave].port_state[port]; +} +EXPORT_SYMBOL(mlx4_get_slave_port_state); + +static int mlx4_set_slave_port_state(struct mlx4_dev *dev, int slave, u8 port, + enum slave_port_state state) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_slave_state *s_state = priv->mfunc.master.slave_state; + struct mlx4_active_ports actv_ports = mlx4_get_active_ports(dev, slave); + + if (slave >= dev->num_slaves || port > dev->caps.num_ports || + port <= 0 || !test_bit(port - 1, actv_ports.ports)) { + pr_err("%s: Error: asking for slave:%d, port:%d\n", + __func__, slave, port); + return -1; + } + s_state[slave].port_state[port] = state; + + return 0; +} + +static void set_all_slave_state(struct mlx4_dev *dev, u8 port, int event) +{ + int i; + enum slave_port_gen_event gen_event; + struct mlx4_slaves_pport slaves_pport = mlx4_phys_to_slaves_pport(dev, + port); + + for (i = 0; i < dev->num_vfs + 1; i++) + if (test_bit(i, slaves_pport.slaves)) + set_and_calc_slave_port_state(dev, i, port, + event, &gen_event); +} +/************************************************************************** + The function get as input the new event to that port, + and according to the prev state change the slave's port state. + The events are: + MLX4_PORT_STATE_DEV_EVENT_PORT_DOWN, + MLX4_PORT_STATE_DEV_EVENT_PORT_UP + MLX4_PORT_STATE_IB_EVENT_GID_VALID + MLX4_PORT_STATE_IB_EVENT_GID_INVALID +***************************************************************************/ +int set_and_calc_slave_port_state(struct mlx4_dev *dev, int slave, + u8 port, int event, + enum slave_port_gen_event *gen_event) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_slave_state *ctx = NULL; + unsigned long flags; + int ret = -1; + struct mlx4_active_ports actv_ports = mlx4_get_active_ports(dev, slave); + enum slave_port_state cur_state = + mlx4_get_slave_port_state(dev, slave, port); + + *gen_event = SLAVE_PORT_GEN_EVENT_NONE; + + if (slave >= dev->num_slaves || port > dev->caps.num_ports || + port <= 0 || !test_bit(port - 1, actv_ports.ports)) { + pr_err("%s: Error: asking for slave:%d, port:%d\n", + __func__, slave, port); + return ret; + } + + ctx = &priv->mfunc.master.slave_state[slave]; + spin_lock_irqsave(&ctx->lock, flags); + + switch (cur_state) { + case SLAVE_PORT_DOWN: + if (MLX4_PORT_STATE_DEV_EVENT_PORT_UP == event) + mlx4_set_slave_port_state(dev, slave, port, + SLAVE_PENDING_UP); + break; + case SLAVE_PENDING_UP: + if (MLX4_PORT_STATE_DEV_EVENT_PORT_DOWN == event) + mlx4_set_slave_port_state(dev, slave, port, + SLAVE_PORT_DOWN); + else if (MLX4_PORT_STATE_IB_PORT_STATE_EVENT_GID_VALID == event) { + mlx4_set_slave_port_state(dev, slave, port, + SLAVE_PORT_UP); + *gen_event = SLAVE_PORT_GEN_EVENT_UP; + } + break; + case SLAVE_PORT_UP: + if (MLX4_PORT_STATE_DEV_EVENT_PORT_DOWN == event) { + mlx4_set_slave_port_state(dev, slave, port, + SLAVE_PORT_DOWN); + *gen_event = SLAVE_PORT_GEN_EVENT_DOWN; + } else if (MLX4_PORT_STATE_IB_EVENT_GID_INVALID == + event) { + mlx4_set_slave_port_state(dev, slave, port, + SLAVE_PENDING_UP); + *gen_event = SLAVE_PORT_GEN_EVENT_DOWN; + } + break; + default: + pr_err("%s: BUG!!! UNKNOWN state: slave:%d, port:%d\n", + __func__, slave, port); + goto out; + } + ret = mlx4_get_slave_port_state(dev, slave, port); + +out: + spin_unlock_irqrestore(&ctx->lock, flags); + return ret; +} + +EXPORT_SYMBOL(set_and_calc_slave_port_state); + +int mlx4_gen_slaves_port_mgt_ev(struct mlx4_dev *dev, u8 port, int attr) +{ + struct mlx4_eqe eqe; + + memset(&eqe, 0, sizeof eqe); + + eqe.type = MLX4_EVENT_TYPE_PORT_MNG_CHG_EVENT; + eqe.subtype = MLX4_DEV_PMC_SUBTYPE_PORT_INFO; + eqe.event.port_mgmt_change.port = port; + eqe.event.port_mgmt_change.params.port_info.changed_attr = + cpu_to_be32((u32) attr); + + slave_event(dev, ALL_SLAVES, &eqe); + return 0; +} +EXPORT_SYMBOL(mlx4_gen_slaves_port_mgt_ev); + +void mlx4_master_handle_slave_flr(struct work_struct *work) +{ + struct mlx4_mfunc_master_ctx *master = + container_of(work, struct mlx4_mfunc_master_ctx, + slave_flr_event_work); + struct mlx4_mfunc *mfunc = + container_of(master, struct mlx4_mfunc, master); + struct mlx4_priv *priv = + container_of(mfunc, struct mlx4_priv, mfunc); + struct mlx4_dev *dev = &priv->dev; + struct mlx4_slave_state *slave_state = priv->mfunc.master.slave_state; + int i; + int err; + unsigned long flags; + + mlx4_dbg(dev, "mlx4_handle_slave_flr\n"); + + for (i = 0 ; i < dev->num_slaves; i++) { + + if (MLX4_COMM_CMD_FLR == slave_state[i].last_cmd) { + mlx4_dbg(dev, "mlx4_handle_slave_flr: clean slave: %d\n", + i); + + mlx4_delete_all_resources_for_slave(dev, i); + /*return the slave to running mode*/ + spin_lock_irqsave(&priv->mfunc.master.slave_state_lock, flags); + slave_state[i].last_cmd = MLX4_COMM_CMD_RESET; + slave_state[i].is_slave_going_down = 0; + spin_unlock_irqrestore(&priv->mfunc.master.slave_state_lock, flags); + /*notify the FW:*/ + err = mlx4_cmd(dev, 0, i, 0, MLX4_CMD_INFORM_FLR_DONE, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED); + if (err) + mlx4_warn(dev, "Failed to notify FW on FLR done (slave:%d)\n", + i); + } + } +} + +static int mlx4_eq_int(struct mlx4_dev *dev, struct mlx4_eq *eq) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_eqe *eqe; + int cqn; + int eqes_found = 0; + int set_ci = 0; + int port; + int slave = 0; + int ret; + u32 flr_slave; + u8 update_slave_state; + int i; + enum slave_port_gen_event gen_event; + unsigned long flags; + struct mlx4_vport_state *s_info; + int eqe_size = dev->caps.eqe_size; + + while ((eqe = next_eqe_sw(eq, dev->caps.eqe_factor, eqe_size))) { + /* + * Make sure we read EQ entry contents after we've + * checked the ownership bit. + */ + rmb(); + + switch (eqe->type) { + case MLX4_EVENT_TYPE_COMP: + cqn = be32_to_cpu(eqe->event.comp.cqn) & 0xffffff; + mlx4_cq_completion(dev, cqn); + break; + + case MLX4_EVENT_TYPE_PATH_MIG: + case MLX4_EVENT_TYPE_COMM_EST: + case MLX4_EVENT_TYPE_SQ_DRAINED: + case MLX4_EVENT_TYPE_SRQ_QP_LAST_WQE: + case MLX4_EVENT_TYPE_WQ_CATAS_ERROR: + case MLX4_EVENT_TYPE_PATH_MIG_FAILED: + case MLX4_EVENT_TYPE_WQ_INVAL_REQ_ERROR: + case MLX4_EVENT_TYPE_WQ_ACCESS_ERROR: + mlx4_dbg(dev, "event %d arrived\n", eqe->type); + if (mlx4_is_master(dev)) { + /* forward only to slave owning the QP */ + ret = mlx4_get_slave_from_resource_id(dev, + RES_QP, + be32_to_cpu(eqe->event.qp.qpn) + & 0xffffff, &slave); + if (ret && ret != -ENOENT) { + mlx4_dbg(dev, "QP event %02x(%02x) on EQ %d at index %u: could not get slave id (%d)\n", + eqe->type, eqe->subtype, + eq->eqn, eq->cons_index, ret); + break; + } + + if (!ret && slave != dev->caps.function) { + mlx4_slave_event(dev, slave, eqe); + break; + } + + } + mlx4_qp_event(dev, be32_to_cpu(eqe->event.qp.qpn) & + 0xffffff, eqe->type); + break; + + case MLX4_EVENT_TYPE_SRQ_LIMIT: + mlx4_dbg(dev, "%s: MLX4_EVENT_TYPE_SRQ_LIMIT\n", + __func__); + case MLX4_EVENT_TYPE_SRQ_CATAS_ERROR: + if (mlx4_is_master(dev)) { + /* forward only to slave owning the SRQ */ + ret = mlx4_get_slave_from_resource_id(dev, + RES_SRQ, + be32_to_cpu(eqe->event.srq.srqn) + & 0xffffff, + &slave); + if (ret && ret != -ENOENT) { + mlx4_warn(dev, "SRQ event %02x(%02x) on EQ %d at index %u: could not get slave id (%d)\n", + eqe->type, eqe->subtype, + eq->eqn, eq->cons_index, ret); + break; + } + mlx4_warn(dev, "%s: slave:%d, srq_no:0x%x, event: %02x(%02x)\n", + __func__, slave, + be32_to_cpu(eqe->event.srq.srqn), + eqe->type, eqe->subtype); + + if (!ret && slave != dev->caps.function) { + mlx4_warn(dev, "%s: sending event %02x(%02x) to slave:%d\n", + __func__, eqe->type, + eqe->subtype, slave); + mlx4_slave_event(dev, slave, eqe); + break; + } + } + mlx4_srq_event(dev, be32_to_cpu(eqe->event.srq.srqn) & + 0xffffff, eqe->type); + break; + + case MLX4_EVENT_TYPE_CMD: + mlx4_cmd_event(dev, + be16_to_cpu(eqe->event.cmd.token), + eqe->event.cmd.status, + be64_to_cpu(eqe->event.cmd.out_param)); + break; + + case MLX4_EVENT_TYPE_PORT_CHANGE: { + struct mlx4_slaves_pport slaves_port; + port = be32_to_cpu(eqe->event.port_change.port) >> 28; + slaves_port = mlx4_phys_to_slaves_pport(dev, port); + if (eqe->subtype == MLX4_PORT_CHANGE_SUBTYPE_DOWN) { + mlx4_dispatch_event(dev, MLX4_DEV_EVENT_PORT_DOWN, + port); + mlx4_priv(dev)->sense.do_sense_port[port] = 1; + if (!mlx4_is_master(dev)) + break; + for (i = 0; i < dev->num_vfs + 1; i++) { + if (!test_bit(i, slaves_port.slaves)) + continue; + if (dev->caps.port_type[port] == MLX4_PORT_TYPE_ETH) { + if (i == mlx4_master_func_num(dev)) + continue; + mlx4_dbg(dev, "%s: Sending MLX4_PORT_CHANGE_SUBTYPE_DOWN to slave: %d, port:%d\n", + __func__, i, port); + s_info = &priv->mfunc.master.vf_oper[slave].vport[port].state; + if (IFLA_VF_LINK_STATE_AUTO == s_info->link_state) { + eqe->event.port_change.port = + cpu_to_be32( + (be32_to_cpu(eqe->event.port_change.port) & 0xFFFFFFF) + | (mlx4_phys_to_slave_port(dev, i, port) << 28)); + mlx4_slave_event(dev, i, eqe); + } + } else { /* IB port */ + set_and_calc_slave_port_state(dev, i, port, + MLX4_PORT_STATE_DEV_EVENT_PORT_DOWN, + &gen_event); + /*we can be in pending state, then do not send port_down event*/ + if (SLAVE_PORT_GEN_EVENT_DOWN == gen_event) { + if (i == mlx4_master_func_num(dev)) + continue; + mlx4_slave_event(dev, i, eqe); + } + } + } + } else { + mlx4_dispatch_event(dev, MLX4_DEV_EVENT_PORT_UP, port); + + mlx4_priv(dev)->sense.do_sense_port[port] = 0; + + if (!mlx4_is_master(dev)) + break; + if (dev->caps.port_type[port] == MLX4_PORT_TYPE_ETH) + for (i = 0; i < dev->num_vfs + 1; i++) { + if (!test_bit(i, slaves_port.slaves)) + continue; + if (i == mlx4_master_func_num(dev)) + continue; + s_info = &priv->mfunc.master.vf_oper[slave].vport[port].state; + if (IFLA_VF_LINK_STATE_AUTO == s_info->link_state) { + eqe->event.port_change.port = + cpu_to_be32( + (be32_to_cpu(eqe->event.port_change.port) & 0xFFFFFFF) + | (mlx4_phys_to_slave_port(dev, i, port) << 28)); + mlx4_slave_event(dev, i, eqe); + } + } + else /* IB port */ + /* port-up event will be sent to a slave when the + * slave's alias-guid is set. This is done in alias_GUID.c + */ + set_all_slave_state(dev, port, MLX4_DEV_EVENT_PORT_UP); + } + break; + } + + case MLX4_EVENT_TYPE_CQ_ERROR: + mlx4_warn(dev, "CQ %s on CQN %06x\n", + eqe->event.cq_err.syndrome == 1 ? + "overrun" : "access violation", + be32_to_cpu(eqe->event.cq_err.cqn) & 0xffffff); + if (mlx4_is_master(dev)) { + ret = mlx4_get_slave_from_resource_id(dev, + RES_CQ, + be32_to_cpu(eqe->event.cq_err.cqn) + & 0xffffff, &slave); + if (ret && ret != -ENOENT) { + mlx4_dbg(dev, "CQ event %02x(%02x) on EQ %d at index %u: could not get slave id (%d)\n", + eqe->type, eqe->subtype, + eq->eqn, eq->cons_index, ret); + break; + } + + if (!ret && slave != dev->caps.function) { + mlx4_slave_event(dev, slave, eqe); + break; + } + } + mlx4_cq_event(dev, + be32_to_cpu(eqe->event.cq_err.cqn) + & 0xffffff, + eqe->type); + break; + + case MLX4_EVENT_TYPE_EQ_OVERFLOW: + mlx4_warn(dev, "EQ overrun on EQN %d\n", eq->eqn); + break; + + case MLX4_EVENT_TYPE_OP_REQUIRED: + atomic_inc(&priv->opreq_count); + /* FW commands can't be executed from interrupt context + * working in deferred task + */ + queue_work(mlx4_wq, &priv->opreq_task); + break; + + case MLX4_EVENT_TYPE_COMM_CHANNEL: + if (!mlx4_is_master(dev)) { + mlx4_warn(dev, "Received comm channel event for non master device\n"); + break; + } + memcpy(&priv->mfunc.master.comm_arm_bit_vector, + eqe->event.comm_channel_arm.bit_vec, + sizeof eqe->event.comm_channel_arm.bit_vec); + queue_work(priv->mfunc.master.comm_wq, + &priv->mfunc.master.comm_work); + break; + + case MLX4_EVENT_TYPE_FLR_EVENT: + flr_slave = be32_to_cpu(eqe->event.flr_event.slave_id); + if (!mlx4_is_master(dev)) { + mlx4_warn(dev, "Non-master function received FLR event\n"); + break; + } + + mlx4_dbg(dev, "FLR event for slave: %d\n", flr_slave); + + if (flr_slave >= dev->num_slaves) { + mlx4_warn(dev, + "Got FLR for unknown function: %d\n", + flr_slave); + update_slave_state = 0; + } else + update_slave_state = 1; + + spin_lock_irqsave(&priv->mfunc.master.slave_state_lock, flags); + if (update_slave_state) { + priv->mfunc.master.slave_state[flr_slave].active = false; + priv->mfunc.master.slave_state[flr_slave].last_cmd = MLX4_COMM_CMD_FLR; + priv->mfunc.master.slave_state[flr_slave].is_slave_going_down = 1; + } + spin_unlock_irqrestore(&priv->mfunc.master.slave_state_lock, flags); + queue_work(priv->mfunc.master.comm_wq, + &priv->mfunc.master.slave_flr_event_work); + break; + + case MLX4_EVENT_TYPE_FATAL_WARNING: + if (eqe->subtype == MLX4_FATAL_WARNING_SUBTYPE_WARMING) { + if (mlx4_is_master(dev)) + for (i = 0; i < dev->num_slaves; i++) { + mlx4_dbg(dev, "%s: Sending MLX4_FATAL_WARNING_SUBTYPE_WARMING to slave: %d\n", + __func__, i); + if (i == dev->caps.function) + continue; + mlx4_slave_event(dev, i, eqe); + } + mlx4_err(dev, "Temperature Threshold was reached! Threshold: %d celsius degrees; Current Temperature: %d\n", + be16_to_cpu(eqe->event.warming.warning_threshold), + be16_to_cpu(eqe->event.warming.current_temperature)); + } else + mlx4_warn(dev, "Unhandled event FATAL WARNING (%02x), subtype %02x on EQ %d at index %u. owner=%x, nent=0x%x, slave=%x, ownership=%s\n", + eqe->type, eqe->subtype, eq->eqn, + eq->cons_index, eqe->owner, eq->nent, + eqe->slave_id, + !!(eqe->owner & 0x80) ^ + !!(eq->cons_index & eq->nent) ? "HW" : "SW"); + + break; + + case MLX4_EVENT_TYPE_PORT_MNG_CHG_EVENT: + mlx4_dispatch_event(dev, MLX4_DEV_EVENT_PORT_MGMT_CHANGE, + (unsigned long) eqe); + break; + + case MLX4_EVENT_TYPE_EEC_CATAS_ERROR: + case MLX4_EVENT_TYPE_ECC_DETECT: + default: + mlx4_warn(dev, "Unhandled event %02x(%02x) on EQ %d at index %u. owner=%x, nent=0x%x, slave=%x, ownership=%s\n", + eqe->type, eqe->subtype, eq->eqn, + eq->cons_index, eqe->owner, eq->nent, + eqe->slave_id, + !!(eqe->owner & 0x80) ^ + !!(eq->cons_index & eq->nent) ? "HW" : "SW"); + break; + }; + + ++eq->cons_index; + eqes_found = 1; + ++set_ci; + + /* + * The HCA will think the queue has overflowed if we + * don't tell it we've been processing events. We + * create our EQs with MLX4_NUM_SPARE_EQE extra + * entries, so we must update our consumer index at + * least that often. + */ + if (unlikely(set_ci >= MLX4_NUM_SPARE_EQE)) { + eq_set_ci(eq, 0); + set_ci = 0; + } + } + + eq_set_ci(eq, 1); + + return eqes_found; +} + +static irqreturn_t mlx4_interrupt(int irq, void *dev_ptr) +{ + struct mlx4_dev *dev = dev_ptr; + struct mlx4_priv *priv = mlx4_priv(dev); + int work = 0; + int i; + + writel(priv->eq_table.clr_mask, priv->eq_table.clr_int); + + for (i = 0; i < dev->caps.num_comp_vectors + 1; ++i) + work |= mlx4_eq_int(dev, &priv->eq_table.eq[i]); + + return IRQ_RETVAL(work); +} + +static irqreturn_t mlx4_msi_x_interrupt(int irq, void *eq_ptr) +{ + struct mlx4_eq *eq = eq_ptr; + struct mlx4_dev *dev = eq->dev; + + mlx4_eq_int(dev, eq); + + /* MSI-X vectors always belong to us */ + return IRQ_HANDLED; +} + +int mlx4_MAP_EQ_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_slave_event_eq_info *event_eq = + priv->mfunc.master.slave_state[slave].event_eq; + u32 in_modifier = vhcr->in_modifier; + u32 eqn = in_modifier & 0x3FF; + u64 in_param = vhcr->in_param; + int err = 0; + int i; + + if (slave == dev->caps.function) + err = mlx4_cmd(dev, in_param, (in_modifier & 0x80000000) | eqn, + 0, MLX4_CMD_MAP_EQ, MLX4_CMD_TIME_CLASS_B, + MLX4_CMD_NATIVE); + if (!err) + for (i = 0; i < MLX4_EVENT_TYPES_NUM; ++i) + if (in_param & (1LL << i)) + event_eq[i].eqn = in_modifier >> 31 ? -1 : eqn; + + return err; +} + +static int mlx4_MAP_EQ(struct mlx4_dev *dev, u64 event_mask, int unmap, + int eq_num) +{ + return mlx4_cmd(dev, event_mask, (unmap << 31) | eq_num, + 0, MLX4_CMD_MAP_EQ, MLX4_CMD_TIME_CLASS_B, + MLX4_CMD_WRAPPED); +} + +static int mlx4_SW2HW_EQ(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox, + int eq_num) +{ + return mlx4_cmd(dev, mailbox->dma, eq_num, 0, + MLX4_CMD_SW2HW_EQ, MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_WRAPPED); +} + +static int mlx4_HW2SW_EQ(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox, + int eq_num) +{ + return mlx4_cmd_box(dev, 0, mailbox->dma, eq_num, + 0, MLX4_CMD_HW2SW_EQ, MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_WRAPPED); +} + +static int mlx4_num_eq_uar(struct mlx4_dev *dev) +{ + /* + * Each UAR holds 4 EQ doorbells. To figure out how many UARs + * we need to map, take the difference of highest index and + * the lowest index we'll use and add 1. + */ + return (dev->caps.num_comp_vectors + 1 + dev->caps.reserved_eqs + + dev->caps.comp_pool)/4 - dev->caps.reserved_eqs/4 + 1; +} + +static void __iomem *mlx4_get_eq_uar(struct mlx4_dev *dev, struct mlx4_eq *eq) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + int index; + + index = eq->eqn / 4 - dev->caps.reserved_eqs / 4; + + if (!priv->eq_table.uar_map[index]) { + priv->eq_table.uar_map[index] = + ioremap(pci_resource_start(dev->pdev, 2) + + ((eq->eqn / 4) << PAGE_SHIFT), + PAGE_SIZE); + if (!priv->eq_table.uar_map[index]) { + mlx4_err(dev, "Couldn't map EQ doorbell for EQN 0x%06x\n", + eq->eqn); + return NULL; + } + } + + return priv->eq_table.uar_map[index] + 0x800 + 8 * (eq->eqn % 4); +} + +static void mlx4_unmap_uar(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + int i; + + for (i = 0; i < mlx4_num_eq_uar(dev); ++i) + if (priv->eq_table.uar_map[i]) { + iounmap(priv->eq_table.uar_map[i]); + priv->eq_table.uar_map[i] = NULL; + } +} + +static int mlx4_create_eq(struct mlx4_dev *dev, int nent, + u8 intr, struct mlx4_eq *eq) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_cmd_mailbox *mailbox; + struct mlx4_eq_context *eq_context; + int npages; + u64 *dma_list = NULL; + dma_addr_t t; + u64 mtt_addr; + int err = -ENOMEM; + int i; + + eq->dev = dev; + eq->nent = roundup_pow_of_two(max(nent, 2)); + /* CX3 is capable of extending the CQE/EQE from 32 to 64 bytes, with + * strides of 64B,128B and 256B. + */ + npages = PAGE_ALIGN(eq->nent * dev->caps.eqe_size) / PAGE_SIZE; + + eq->page_list = kmalloc(npages * sizeof *eq->page_list, + GFP_KERNEL); + if (!eq->page_list) + goto err_out; + + for (i = 0; i < npages; ++i) + eq->page_list[i].buf = NULL; + + dma_list = kmalloc(npages * sizeof *dma_list, GFP_KERNEL); + if (!dma_list) + goto err_out_free; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + goto err_out_free; + eq_context = mailbox->buf; + + for (i = 0; i < npages; ++i) { + eq->page_list[i].buf = dma_alloc_coherent(&dev->pdev->dev, + PAGE_SIZE, &t, GFP_KERNEL); + if (!eq->page_list[i].buf) + goto err_out_free_pages; + + dma_list[i] = t; + eq->page_list[i].map = t; + + memset(eq->page_list[i].buf, 0, PAGE_SIZE); + } + + eq->eqn = mlx4_bitmap_alloc(&priv->eq_table.bitmap); + if (eq->eqn == -1) + goto err_out_free_pages; + + eq->doorbell = mlx4_get_eq_uar(dev, eq); + if (!eq->doorbell) { + err = -ENOMEM; + goto err_out_free_eq; + } + + err = mlx4_mtt_init(dev, npages, PAGE_SHIFT, &eq->mtt); + if (err) + goto err_out_free_eq; + + err = mlx4_write_mtt(dev, &eq->mtt, 0, npages, dma_list); + if (err) + goto err_out_free_mtt; + + eq_context->flags = cpu_to_be32(MLX4_EQ_STATUS_OK | + MLX4_EQ_STATE_ARMED); + eq_context->log_eq_size = ilog2(eq->nent); + eq_context->intr = intr; + eq_context->log_page_size = PAGE_SHIFT - MLX4_ICM_PAGE_SHIFT; + + mtt_addr = mlx4_mtt_addr(dev, &eq->mtt); + eq_context->mtt_base_addr_h = mtt_addr >> 32; + eq_context->mtt_base_addr_l = cpu_to_be32(mtt_addr & 0xffffffff); + + err = mlx4_SW2HW_EQ(dev, mailbox, eq->eqn); + if (err) { + mlx4_warn(dev, "SW2HW_EQ failed (%d)\n", err); + goto err_out_free_mtt; + } + + kfree(dma_list); + mlx4_free_cmd_mailbox(dev, mailbox); + + eq->cons_index = 0; + + return err; + +err_out_free_mtt: + mlx4_mtt_cleanup(dev, &eq->mtt); + +err_out_free_eq: + mlx4_bitmap_free(&priv->eq_table.bitmap, eq->eqn, MLX4_USE_RR); + +err_out_free_pages: + for (i = 0; i < npages; ++i) + if (eq->page_list[i].buf) + dma_free_coherent(&dev->pdev->dev, PAGE_SIZE, + eq->page_list[i].buf, + eq->page_list[i].map); + + mlx4_free_cmd_mailbox(dev, mailbox); + +err_out_free: + kfree(eq->page_list); + kfree(dma_list); + +err_out: + return err; +} + +static void mlx4_free_eq(struct mlx4_dev *dev, + struct mlx4_eq *eq) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_cmd_mailbox *mailbox; + int err; + int i; + /* CX3 is capable of extending the CQE/EQE from 32 to 64 bytes, with + * strides of 64B,128B and 256B + */ + int npages = PAGE_ALIGN(dev->caps.eqe_size * eq->nent) / PAGE_SIZE; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return; + + err = mlx4_HW2SW_EQ(dev, mailbox, eq->eqn); + if (err) + mlx4_warn(dev, "HW2SW_EQ failed (%d)\n", err); + + if (0) { + mlx4_dbg(dev, "Dumping EQ context %02x:\n", eq->eqn); + for (i = 0; i < sizeof (struct mlx4_eq_context) / 4; ++i) { + if (i % 4 == 0) + pr_cont("[%02x] ", i * 4); + pr_cont(" %08x", be32_to_cpup(mailbox->buf + i * 4)); + if ((i + 1) % 4 == 0) + pr_cont("\n"); + } + } + synchronize_irq(eq->irq); + + mlx4_mtt_cleanup(dev, &eq->mtt); + for (i = 0; i < npages; ++i) + dma_free_coherent(&dev->pdev->dev, PAGE_SIZE, + eq->page_list[i].buf, + eq->page_list[i].map); + + kfree(eq->page_list); + mlx4_bitmap_free(&priv->eq_table.bitmap, eq->eqn, MLX4_USE_RR); + mlx4_free_cmd_mailbox(dev, mailbox); +} + +static void mlx4_free_irqs(struct mlx4_dev *dev) +{ + struct mlx4_eq_table *eq_table = &mlx4_priv(dev)->eq_table; + struct mlx4_priv *priv = mlx4_priv(dev); + int i, vec; + + if (eq_table->have_irq) + free_irq(dev->pdev->irq, dev); + + for (i = 0; i < dev->caps.num_comp_vectors + 1; ++i) + if (eq_table->eq[i].have_irq) { + free_irq(eq_table->eq[i].irq, eq_table->eq + i); + eq_table->eq[i].have_irq = 0; + } + + for (i = 0; i < dev->caps.comp_pool; i++) { + /* + * Freeing the assigned irq's + * all bits should be 0, but we need to validate + */ + if (priv->msix_ctl.pool_bm & 1ULL << i) { + /* NO need protecting*/ + vec = dev->caps.num_comp_vectors + 1 + i; + free_irq(priv->eq_table.eq[vec].irq, + &priv->eq_table.eq[vec]); + } + } + + + kfree(eq_table->irq_names); +} + +static int mlx4_map_clr_int(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + + priv->clr_base = ioremap(pci_resource_start(dev->pdev, priv->fw.clr_int_bar) + + priv->fw.clr_int_base, MLX4_CLR_INT_SIZE); + if (!priv->clr_base) { + mlx4_err(dev, "Couldn't map interrupt clear register, aborting\n"); + return -ENOMEM; + } + + return 0; +} + +static void mlx4_unmap_clr_int(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + + iounmap(priv->clr_base); +} + +int mlx4_alloc_eq_table(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + + priv->eq_table.eq = kcalloc(dev->caps.num_eqs - dev->caps.reserved_eqs, + sizeof *priv->eq_table.eq, GFP_KERNEL); + if (!priv->eq_table.eq) + return -ENOMEM; + + return 0; +} + +void mlx4_free_eq_table(struct mlx4_dev *dev) +{ + kfree(mlx4_priv(dev)->eq_table.eq); +} + +int mlx4_init_eq_table(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + int err; + int i; + + priv->eq_table.uar_map = kcalloc(mlx4_num_eq_uar(dev), + sizeof *priv->eq_table.uar_map, + GFP_KERNEL); + if (!priv->eq_table.uar_map) { + err = -ENOMEM; + goto err_out_free; + } + + err = mlx4_bitmap_init(&priv->eq_table.bitmap, dev->caps.num_eqs, + dev->caps.num_eqs - 1, dev->caps.reserved_eqs, 0); + if (err) + goto err_out_free; + + for (i = 0; i < mlx4_num_eq_uar(dev); ++i) + priv->eq_table.uar_map[i] = NULL; + + if (!mlx4_is_slave(dev)) { + err = mlx4_map_clr_int(dev); + if (err) + goto err_out_bitmap; + + priv->eq_table.clr_mask = + swab32(1 << (priv->eq_table.inta_pin & 31)); + priv->eq_table.clr_int = priv->clr_base + + (priv->eq_table.inta_pin < 32 ? 4 : 0); + } + + priv->eq_table.irq_names = + kmalloc(MLX4_IRQNAME_SIZE * (dev->caps.num_comp_vectors + 1 + + dev->caps.comp_pool), + GFP_KERNEL); + if (!priv->eq_table.irq_names) { + err = -ENOMEM; + goto err_out_bitmap; + } + + for (i = 0; i < dev->caps.num_comp_vectors; ++i) { + err = mlx4_create_eq(dev, dev->caps.num_cqs - + dev->caps.reserved_cqs + + MLX4_NUM_SPARE_EQE, + (dev->flags & MLX4_FLAG_MSI_X) ? i : 0, + &priv->eq_table.eq[i]); + if (err) { + --i; + goto err_out_unmap; + } + } + + err = mlx4_create_eq(dev, MLX4_NUM_ASYNC_EQE + MLX4_NUM_SPARE_EQE, + (dev->flags & MLX4_FLAG_MSI_X) ? dev->caps.num_comp_vectors : 0, + &priv->eq_table.eq[dev->caps.num_comp_vectors]); + if (err) + goto err_out_comp; + + /*if additional completion vectors poolsize is 0 this loop will not run*/ + for (i = dev->caps.num_comp_vectors + 1; + i < dev->caps.num_comp_vectors + dev->caps.comp_pool + 1; ++i) { + + err = mlx4_create_eq(dev, dev->caps.num_cqs - + dev->caps.reserved_cqs + + MLX4_NUM_SPARE_EQE, + (dev->flags & MLX4_FLAG_MSI_X) ? i : 0, + &priv->eq_table.eq[i]); + if (err) { + --i; + goto err_out_unmap; + } + } + + + if (dev->flags & MLX4_FLAG_MSI_X) { + const char *eq_name; + + for (i = 0; i < dev->caps.num_comp_vectors + 1; ++i) { + if (i < dev->caps.num_comp_vectors) { + snprintf(priv->eq_table.irq_names + + i * MLX4_IRQNAME_SIZE, + MLX4_IRQNAME_SIZE, + "mlx4-comp-%d@pci:%s", i, + pci_name(dev->pdev)); + } else { + snprintf(priv->eq_table.irq_names + + i * MLX4_IRQNAME_SIZE, + MLX4_IRQNAME_SIZE, + "mlx4-async@pci:%s", + pci_name(dev->pdev)); + } + + eq_name = priv->eq_table.irq_names + + i * MLX4_IRQNAME_SIZE; + err = request_irq(priv->eq_table.eq[i].irq, + mlx4_msi_x_interrupt, 0, eq_name, + priv->eq_table.eq + i); + if (err) + goto err_out_async; + + priv->eq_table.eq[i].have_irq = 1; + } + } else { + snprintf(priv->eq_table.irq_names, + MLX4_IRQNAME_SIZE, + DRV_NAME "@pci:%s", + pci_name(dev->pdev)); + err = request_irq(dev->pdev->irq, mlx4_interrupt, + IRQF_SHARED, priv->eq_table.irq_names, dev); + if (err) + goto err_out_async; + + priv->eq_table.have_irq = 1; + } + + err = mlx4_MAP_EQ(dev, get_async_ev_mask(dev), 0, + priv->eq_table.eq[dev->caps.num_comp_vectors].eqn); + if (err) + mlx4_warn(dev, "MAP_EQ for async EQ %d failed (%d)\n", + priv->eq_table.eq[dev->caps.num_comp_vectors].eqn, err); + + for (i = 0; i < dev->caps.num_comp_vectors + 1; ++i) + eq_set_ci(&priv->eq_table.eq[i], 1); + + return 0; + +err_out_async: + mlx4_free_eq(dev, &priv->eq_table.eq[dev->caps.num_comp_vectors]); + +err_out_comp: + i = dev->caps.num_comp_vectors - 1; + +err_out_unmap: + while (i >= 0) { + mlx4_free_eq(dev, &priv->eq_table.eq[i]); + --i; + } + if (!mlx4_is_slave(dev)) + mlx4_unmap_clr_int(dev); + mlx4_free_irqs(dev); + +err_out_bitmap: + mlx4_unmap_uar(dev); + mlx4_bitmap_cleanup(&priv->eq_table.bitmap); + +err_out_free: + kfree(priv->eq_table.uar_map); + + return err; +} + +void mlx4_cleanup_eq_table(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + int i; + + mlx4_MAP_EQ(dev, get_async_ev_mask(dev), 1, + priv->eq_table.eq[dev->caps.num_comp_vectors].eqn); + + mlx4_free_irqs(dev); + + for (i = 0; i < dev->caps.num_comp_vectors + dev->caps.comp_pool + 1; ++i) + mlx4_free_eq(dev, &priv->eq_table.eq[i]); + + if (!mlx4_is_slave(dev)) + mlx4_unmap_clr_int(dev); + + mlx4_unmap_uar(dev); + mlx4_bitmap_cleanup(&priv->eq_table.bitmap); + + kfree(priv->eq_table.uar_map); +} + +/* A test that verifies that we can accept interrupts on all + * the irq vectors of the device. + * Interrupts are checked using the NOP command. + */ +int mlx4_test_interrupts(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + int i; + int err; + + err = mlx4_NOP(dev); + /* When not in MSI_X, there is only one irq to check */ + if (!(dev->flags & MLX4_FLAG_MSI_X) || mlx4_is_slave(dev)) + return err; + + /* A loop over all completion vectors, for each vector we will check + * whether it works by mapping command completions to that vector + * and performing a NOP command + */ + for(i = 0; !err && (i < dev->caps.num_comp_vectors); ++i) { + /* Temporary use polling for command completions */ + mlx4_cmd_use_polling(dev); + + /* Map the new eq to handle all asynchronous events */ + err = mlx4_MAP_EQ(dev, get_async_ev_mask(dev), 0, + priv->eq_table.eq[i].eqn); + if (err) { + mlx4_warn(dev, "Failed mapping eq for interrupt test\n"); + mlx4_cmd_use_events(dev); + break; + } + + /* Go back to using events */ + mlx4_cmd_use_events(dev); + err = mlx4_NOP(dev); + } + + /* Return to default */ + mlx4_MAP_EQ(dev, get_async_ev_mask(dev), 0, + priv->eq_table.eq[dev->caps.num_comp_vectors].eqn); + return err; +} +EXPORT_SYMBOL(mlx4_test_interrupts); + +int mlx4_assign_eq(struct mlx4_dev *dev, char *name, struct cpu_rmap *rmap, + int *vector) +{ + + struct mlx4_priv *priv = mlx4_priv(dev); + int vec = 0, err = 0, i; + + mutex_lock(&priv->msix_ctl.pool_lock); + for (i = 0; !vec && i < dev->caps.comp_pool; i++) { + if (~priv->msix_ctl.pool_bm & 1ULL << i) { + priv->msix_ctl.pool_bm |= 1ULL << i; + vec = dev->caps.num_comp_vectors + 1 + i; + snprintf(priv->eq_table.irq_names + + vec * MLX4_IRQNAME_SIZE, + MLX4_IRQNAME_SIZE, "%s", name); +#ifdef CONFIG_RFS_ACCEL + if (rmap) { + err = irq_cpu_rmap_add(rmap, + priv->eq_table.eq[vec].irq); + if (err) + mlx4_warn(dev, "Failed adding irq rmap\n"); + } +#endif + err = request_irq(priv->eq_table.eq[vec].irq, + mlx4_msi_x_interrupt, 0, + &priv->eq_table.irq_names[vec<<5], + priv->eq_table.eq + vec); + if (err) { + /*zero out bit by fliping it*/ + priv->msix_ctl.pool_bm ^= 1 << i; + vec = 0; + continue; + /*we dont want to break here*/ + } + + eq_set_ci(&priv->eq_table.eq[vec], 1); + } + } + mutex_unlock(&priv->msix_ctl.pool_lock); + + if (vec) { + *vector = vec; + } else { + *vector = 0; + err = (i == dev->caps.comp_pool) ? -ENOSPC : err; + } + return err; +} +EXPORT_SYMBOL(mlx4_assign_eq); + +int mlx4_eq_get_irq(struct mlx4_dev *dev, int vec) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + + return priv->eq_table.eq[vec].irq; +} +EXPORT_SYMBOL(mlx4_eq_get_irq); + +void mlx4_release_eq(struct mlx4_dev *dev, int vec) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + /*bm index*/ + int i = vec - dev->caps.num_comp_vectors - 1; + + if (likely(i >= 0)) { + /*sanity check , making sure were not trying to free irq's + Belonging to a legacy EQ*/ + mutex_lock(&priv->msix_ctl.pool_lock); + if (priv->msix_ctl.pool_bm & 1ULL << i) { + free_irq(priv->eq_table.eq[vec].irq, + &priv->eq_table.eq[vec]); + priv->msix_ctl.pool_bm &= ~(1ULL << i); + } + mutex_unlock(&priv->msix_ctl.pool_lock); + } + +} +EXPORT_SYMBOL(mlx4_release_eq); + diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.c b/drivers/net/ethernet/mellanox/mlx4/fw.c new file mode 100644 index 0000000..2e88a23 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4/fw.c @@ -0,0 +1,2146 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005, 2006, 2007, 2008 Mellanox Technologies. All rights reserved. + * Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include + +#include "fw.h" +#include "icm.h" + +enum { + MLX4_COMMAND_INTERFACE_MIN_REV = 2, + MLX4_COMMAND_INTERFACE_MAX_REV = 3, + MLX4_COMMAND_INTERFACE_NEW_PORT_CMDS = 3, +}; + +extern void __buggy_use_of_MLX4_GET(void); +extern void __buggy_use_of_MLX4_PUT(void); + +static bool enable_qos; +module_param(enable_qos, bool, 0444); +MODULE_PARM_DESC(enable_qos, "Enable Quality of Service support in the HCA (default: off)"); + +#define MLX4_GET(dest, source, offset) \ + do { \ + void *__p = (char *) (source) + (offset); \ + switch (sizeof (dest)) { \ + case 1: (dest) = *(u8 *) __p; break; \ + case 2: (dest) = be16_to_cpup(__p); break; \ + case 4: (dest) = be32_to_cpup(__p); break; \ + case 8: (dest) = be64_to_cpup(__p); break; \ + default: __buggy_use_of_MLX4_GET(); \ + } \ + } while (0) + +#define MLX4_PUT(dest, source, offset) \ + do { \ + void *__d = ((char *) (dest) + (offset)); \ + switch (sizeof(source)) { \ + case 1: *(u8 *) __d = (source); break; \ + case 2: *(__be16 *) __d = cpu_to_be16(source); break; \ + case 4: *(__be32 *) __d = cpu_to_be32(source); break; \ + case 8: *(__be64 *) __d = cpu_to_be64(source); break; \ + default: __buggy_use_of_MLX4_PUT(); \ + } \ + } while (0) + +static void dump_dev_cap_flags(struct mlx4_dev *dev, u64 flags) +{ + static const char *fname[] = { + [ 0] = "RC transport", + [ 1] = "UC transport", + [ 2] = "UD transport", + [ 3] = "XRC transport", + [ 4] = "reliable multicast", + [ 5] = "FCoIB support", + [ 6] = "SRQ support", + [ 7] = "IPoIB checksum offload", + [ 8] = "P_Key violation counter", + [ 9] = "Q_Key violation counter", + [10] = "VMM", + [12] = "Dual Port Different Protocol (DPDP) support", + [15] = "Big LSO headers", + [16] = "MW support", + [17] = "APM support", + [18] = "Atomic ops support", + [19] = "Raw multicast support", + [20] = "Address vector port checking support", + [21] = "UD multicast support", + [24] = "Demand paging support", + [25] = "Router support", + [30] = "IBoE support", + [32] = "Unicast loopback support", + [34] = "FCS header control", + [38] = "Wake On LAN support", + [40] = "UDP RSS support", + [41] = "Unicast VEP steering support", + [42] = "Multicast VEP steering support", + [48] = "Counters support", + [53] = "Port ETS Scheduler support", + [55] = "Port link type sensing support", + [59] = "Port management change event support", + [61] = "64 byte EQE support", + [62] = "64 byte CQE support", + }; + int i; + + mlx4_dbg(dev, "DEV_CAP flags:\n"); + for (i = 0; i < ARRAY_SIZE(fname); ++i) + if (fname[i] && (flags & (1LL << i))) + mlx4_dbg(dev, " %s\n", fname[i]); +} + +static void dump_dev_cap_flags2(struct mlx4_dev *dev, u64 flags) +{ + static const char * const fname[] = { + [0] = "RSS support", + [1] = "RSS Toeplitz Hash Function support", + [2] = "RSS XOR Hash Function support", + [3] = "Device managed flow steering support", + [4] = "Automatic MAC reassignment support", + [5] = "Time stamping support", + [6] = "VST (control vlan insertion/stripping) support", + [7] = "FSM (MAC anti-spoofing) support", + [8] = "Dynamic QP updates support", + [9] = "Device managed flow steering IPoIB support", + [10] = "TCP/IP offloads/flow-steering for VXLAN support", + [11] = "MAD DEMUX (Secure-Host) support", + [12] = "Large cache line (>64B) CQE stride support", + [13] = "Large cache line (>64B) EQE stride support" + }; + int i; + + for (i = 0; i < ARRAY_SIZE(fname); ++i) + if (fname[i] && (flags & (1LL << i))) + mlx4_dbg(dev, " %s\n", fname[i]); +} + +int mlx4_MOD_STAT_CFG(struct mlx4_dev *dev, struct mlx4_mod_stat_cfg *cfg) +{ + struct mlx4_cmd_mailbox *mailbox; + u32 *inbox; + int err = 0; + +#define MOD_STAT_CFG_IN_SIZE 0x100 + +#define MOD_STAT_CFG_PG_SZ_M_OFFSET 0x002 +#define MOD_STAT_CFG_PG_SZ_OFFSET 0x003 + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + inbox = mailbox->buf; + + MLX4_PUT(inbox, cfg->log_pg_sz, MOD_STAT_CFG_PG_SZ_OFFSET); + MLX4_PUT(inbox, cfg->log_pg_sz_m, MOD_STAT_CFG_PG_SZ_M_OFFSET); + + err = mlx4_cmd(dev, mailbox->dma, 0, 0, MLX4_CMD_MOD_STAT_CFG, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); + + mlx4_free_cmd_mailbox(dev, mailbox); + return err; +} + +int mlx4_QUERY_FUNC_CAP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + u8 field, port; + u32 size, proxy_qp, qkey; + int err = 0; + +#define QUERY_FUNC_CAP_FLAGS_OFFSET 0x0 +#define QUERY_FUNC_CAP_NUM_PORTS_OFFSET 0x1 +#define QUERY_FUNC_CAP_PF_BHVR_OFFSET 0x4 +#define QUERY_FUNC_CAP_FMR_OFFSET 0x8 +#define QUERY_FUNC_CAP_QP_QUOTA_OFFSET_DEP 0x10 +#define QUERY_FUNC_CAP_CQ_QUOTA_OFFSET_DEP 0x14 +#define QUERY_FUNC_CAP_SRQ_QUOTA_OFFSET_DEP 0x18 +#define QUERY_FUNC_CAP_MPT_QUOTA_OFFSET_DEP 0x20 +#define QUERY_FUNC_CAP_MTT_QUOTA_OFFSET_DEP 0x24 +#define QUERY_FUNC_CAP_MCG_QUOTA_OFFSET_DEP 0x28 +#define QUERY_FUNC_CAP_MAX_EQ_OFFSET 0x2c +#define QUERY_FUNC_CAP_RESERVED_EQ_OFFSET 0x30 + +#define QUERY_FUNC_CAP_QP_QUOTA_OFFSET 0x50 +#define QUERY_FUNC_CAP_CQ_QUOTA_OFFSET 0x54 +#define QUERY_FUNC_CAP_SRQ_QUOTA_OFFSET 0x58 +#define QUERY_FUNC_CAP_MPT_QUOTA_OFFSET 0x60 +#define QUERY_FUNC_CAP_MTT_QUOTA_OFFSET 0x64 +#define QUERY_FUNC_CAP_MCG_QUOTA_OFFSET 0x68 + +#define QUERY_FUNC_CAP_FMR_FLAG 0x80 +#define QUERY_FUNC_CAP_FLAG_RDMA 0x40 +#define QUERY_FUNC_CAP_FLAG_ETH 0x80 +#define QUERY_FUNC_CAP_FLAG_QUOTAS 0x10 + +/* when opcode modifier = 1 */ +#define QUERY_FUNC_CAP_PHYS_PORT_OFFSET 0x3 +#define QUERY_FUNC_CAP_PRIV_VF_QKEY_OFFSET 0x4 +#define QUERY_FUNC_CAP_FLAGS0_OFFSET 0x8 +#define QUERY_FUNC_CAP_FLAGS1_OFFSET 0xc + +#define QUERY_FUNC_CAP_QP0_TUNNEL 0x10 +#define QUERY_FUNC_CAP_QP0_PROXY 0x14 +#define QUERY_FUNC_CAP_QP1_TUNNEL 0x18 +#define QUERY_FUNC_CAP_QP1_PROXY 0x1c +#define QUERY_FUNC_CAP_PHYS_PORT_ID 0x28 + +#define QUERY_FUNC_CAP_FLAGS1_FORCE_MAC 0x40 +#define QUERY_FUNC_CAP_FLAGS1_FORCE_VLAN 0x80 +#define QUERY_FUNC_CAP_FLAGS1_NIC_INFO 0x10 +#define QUERY_FUNC_CAP_VF_ENABLE_QP0 0x08 + +#define QUERY_FUNC_CAP_FLAGS0_FORCE_PHY_WQE_GID 0x80 + + if (vhcr->op_modifier == 1) { + struct mlx4_active_ports actv_ports = + mlx4_get_active_ports(dev, slave); + int converted_port = mlx4_slave_convert_port( + dev, slave, vhcr->in_modifier); + + if (converted_port < 0) + return -EINVAL; + + vhcr->in_modifier = converted_port; + /* phys-port = logical-port */ + field = vhcr->in_modifier - + find_first_bit(actv_ports.ports, dev->caps.num_ports); + MLX4_PUT(outbox->buf, field, QUERY_FUNC_CAP_PHYS_PORT_OFFSET); + + port = vhcr->in_modifier; + proxy_qp = dev->phys_caps.base_proxy_sqpn + 8 * slave + port - 1; + + /* Set nic_info bit to mark new fields support */ + field = QUERY_FUNC_CAP_FLAGS1_NIC_INFO; + + if (mlx4_vf_smi_enabled(dev, slave, port) && + !mlx4_get_parav_qkey(dev, proxy_qp, &qkey)) { + field |= QUERY_FUNC_CAP_VF_ENABLE_QP0; + MLX4_PUT(outbox->buf, qkey, + QUERY_FUNC_CAP_PRIV_VF_QKEY_OFFSET); + } + MLX4_PUT(outbox->buf, field, QUERY_FUNC_CAP_FLAGS1_OFFSET); + + /* size is now the QP number */ + size = dev->phys_caps.base_tunnel_sqpn + 8 * slave + port - 1; + MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_QP0_TUNNEL); + + size += 2; + MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_QP1_TUNNEL); + + MLX4_PUT(outbox->buf, proxy_qp, QUERY_FUNC_CAP_QP0_PROXY); + proxy_qp += 2; + MLX4_PUT(outbox->buf, proxy_qp, QUERY_FUNC_CAP_QP1_PROXY); + + MLX4_PUT(outbox->buf, dev->caps.phys_port_id[vhcr->in_modifier], + QUERY_FUNC_CAP_PHYS_PORT_ID); + + } else if (vhcr->op_modifier == 0) { + struct mlx4_active_ports actv_ports = + mlx4_get_active_ports(dev, slave); + /* enable rdma and ethernet interfaces, and new quota locations */ + field = (QUERY_FUNC_CAP_FLAG_ETH | QUERY_FUNC_CAP_FLAG_RDMA | + QUERY_FUNC_CAP_FLAG_QUOTAS); + MLX4_PUT(outbox->buf, field, QUERY_FUNC_CAP_FLAGS_OFFSET); + + field = min( + bitmap_weight(actv_ports.ports, dev->caps.num_ports), + dev->caps.num_ports); + MLX4_PUT(outbox->buf, field, QUERY_FUNC_CAP_NUM_PORTS_OFFSET); + + size = dev->caps.function_caps; /* set PF behaviours */ + MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_PF_BHVR_OFFSET); + + field = 0; /* protected FMR support not available as yet */ + MLX4_PUT(outbox->buf, field, QUERY_FUNC_CAP_FMR_OFFSET); + + size = priv->mfunc.master.res_tracker.res_alloc[RES_QP].quota[slave]; + MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_QP_QUOTA_OFFSET); + size = dev->caps.num_qps; + MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_QP_QUOTA_OFFSET_DEP); + + size = priv->mfunc.master.res_tracker.res_alloc[RES_SRQ].quota[slave]; + MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_SRQ_QUOTA_OFFSET); + size = dev->caps.num_srqs; + MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_SRQ_QUOTA_OFFSET_DEP); + + size = priv->mfunc.master.res_tracker.res_alloc[RES_CQ].quota[slave]; + MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_CQ_QUOTA_OFFSET); + size = dev->caps.num_cqs; + MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_CQ_QUOTA_OFFSET_DEP); + + size = dev->caps.num_eqs; + MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_MAX_EQ_OFFSET); + + size = dev->caps.reserved_eqs; + MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_RESERVED_EQ_OFFSET); + + size = priv->mfunc.master.res_tracker.res_alloc[RES_MPT].quota[slave]; + MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_MPT_QUOTA_OFFSET); + size = dev->caps.num_mpts; + MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_MPT_QUOTA_OFFSET_DEP); + + size = priv->mfunc.master.res_tracker.res_alloc[RES_MTT].quota[slave]; + MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_MTT_QUOTA_OFFSET); + size = dev->caps.num_mtts; + MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_MTT_QUOTA_OFFSET_DEP); + + size = dev->caps.num_mgms + dev->caps.num_amgms; + MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_MCG_QUOTA_OFFSET); + MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_MCG_QUOTA_OFFSET_DEP); + + } else + err = -EINVAL; + + return err; +} + +int mlx4_QUERY_FUNC_CAP(struct mlx4_dev *dev, u32 gen_or_port, + struct mlx4_func_cap *func_cap) +{ + struct mlx4_cmd_mailbox *mailbox; + u32 *outbox; + u8 field, op_modifier; + u32 size, qkey; + int err = 0, quotas = 0; + + op_modifier = !!gen_or_port; /* 0 = general, 1 = logical port */ + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + + err = mlx4_cmd_box(dev, 0, mailbox->dma, gen_or_port, op_modifier, + MLX4_CMD_QUERY_FUNC_CAP, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED); + if (err) + goto out; + + outbox = mailbox->buf; + + if (!op_modifier) { + MLX4_GET(field, outbox, QUERY_FUNC_CAP_FLAGS_OFFSET); + if (!(field & (QUERY_FUNC_CAP_FLAG_ETH | QUERY_FUNC_CAP_FLAG_RDMA))) { + mlx4_err(dev, "The host supports neither eth nor rdma interfaces\n"); + err = -EPROTONOSUPPORT; + goto out; + } + func_cap->flags = field; + quotas = !!(func_cap->flags & QUERY_FUNC_CAP_FLAG_QUOTAS); + + MLX4_GET(field, outbox, QUERY_FUNC_CAP_NUM_PORTS_OFFSET); + func_cap->num_ports = field; + + MLX4_GET(size, outbox, QUERY_FUNC_CAP_PF_BHVR_OFFSET); + func_cap->pf_context_behaviour = size; + + if (quotas) { + MLX4_GET(size, outbox, QUERY_FUNC_CAP_QP_QUOTA_OFFSET); + func_cap->qp_quota = size & 0xFFFFFF; + + MLX4_GET(size, outbox, QUERY_FUNC_CAP_SRQ_QUOTA_OFFSET); + func_cap->srq_quota = size & 0xFFFFFF; + + MLX4_GET(size, outbox, QUERY_FUNC_CAP_CQ_QUOTA_OFFSET); + func_cap->cq_quota = size & 0xFFFFFF; + + MLX4_GET(size, outbox, QUERY_FUNC_CAP_MPT_QUOTA_OFFSET); + func_cap->mpt_quota = size & 0xFFFFFF; + + MLX4_GET(size, outbox, QUERY_FUNC_CAP_MTT_QUOTA_OFFSET); + func_cap->mtt_quota = size & 0xFFFFFF; + + MLX4_GET(size, outbox, QUERY_FUNC_CAP_MCG_QUOTA_OFFSET); + func_cap->mcg_quota = size & 0xFFFFFF; + + } else { + MLX4_GET(size, outbox, QUERY_FUNC_CAP_QP_QUOTA_OFFSET_DEP); + func_cap->qp_quota = size & 0xFFFFFF; + + MLX4_GET(size, outbox, QUERY_FUNC_CAP_SRQ_QUOTA_OFFSET_DEP); + func_cap->srq_quota = size & 0xFFFFFF; + + MLX4_GET(size, outbox, QUERY_FUNC_CAP_CQ_QUOTA_OFFSET_DEP); + func_cap->cq_quota = size & 0xFFFFFF; + + MLX4_GET(size, outbox, QUERY_FUNC_CAP_MPT_QUOTA_OFFSET_DEP); + func_cap->mpt_quota = size & 0xFFFFFF; + + MLX4_GET(size, outbox, QUERY_FUNC_CAP_MTT_QUOTA_OFFSET_DEP); + func_cap->mtt_quota = size & 0xFFFFFF; + + MLX4_GET(size, outbox, QUERY_FUNC_CAP_MCG_QUOTA_OFFSET_DEP); + func_cap->mcg_quota = size & 0xFFFFFF; + } + MLX4_GET(size, outbox, QUERY_FUNC_CAP_MAX_EQ_OFFSET); + func_cap->max_eq = size & 0xFFFFFF; + + MLX4_GET(size, outbox, QUERY_FUNC_CAP_RESERVED_EQ_OFFSET); + func_cap->reserved_eq = size & 0xFFFFFF; + + goto out; + } + + /* logical port query */ + if (gen_or_port > dev->caps.num_ports) { + err = -EINVAL; + goto out; + } + + MLX4_GET(func_cap->flags1, outbox, QUERY_FUNC_CAP_FLAGS1_OFFSET); + if (dev->caps.port_type[gen_or_port] == MLX4_PORT_TYPE_ETH) { + if (func_cap->flags1 & QUERY_FUNC_CAP_FLAGS1_FORCE_VLAN) { + mlx4_err(dev, "VLAN is enforced on this port\n"); + err = -EPROTONOSUPPORT; + goto out; + } + + if (func_cap->flags1 & QUERY_FUNC_CAP_FLAGS1_FORCE_MAC) { + mlx4_err(dev, "Force mac is enabled on this port\n"); + err = -EPROTONOSUPPORT; + goto out; + } + } else if (dev->caps.port_type[gen_or_port] == MLX4_PORT_TYPE_IB) { + MLX4_GET(field, outbox, QUERY_FUNC_CAP_FLAGS0_OFFSET); + if (field & QUERY_FUNC_CAP_FLAGS0_FORCE_PHY_WQE_GID) { + mlx4_err(dev, "phy_wqe_gid is enforced on this ib port\n"); + err = -EPROTONOSUPPORT; + goto out; + } + } + + MLX4_GET(field, outbox, QUERY_FUNC_CAP_PHYS_PORT_OFFSET); + func_cap->physical_port = field; + if (func_cap->physical_port != gen_or_port) { + err = -ENOSYS; + goto out; + } + + if (func_cap->flags1 & QUERY_FUNC_CAP_VF_ENABLE_QP0) { + MLX4_GET(qkey, outbox, QUERY_FUNC_CAP_PRIV_VF_QKEY_OFFSET); + func_cap->qp0_qkey = qkey; + } else { + func_cap->qp0_qkey = 0; + } + + MLX4_GET(size, outbox, QUERY_FUNC_CAP_QP0_TUNNEL); + func_cap->qp0_tunnel_qpn = size & 0xFFFFFF; + + MLX4_GET(size, outbox, QUERY_FUNC_CAP_QP0_PROXY); + func_cap->qp0_proxy_qpn = size & 0xFFFFFF; + + MLX4_GET(size, outbox, QUERY_FUNC_CAP_QP1_TUNNEL); + func_cap->qp1_tunnel_qpn = size & 0xFFFFFF; + + MLX4_GET(size, outbox, QUERY_FUNC_CAP_QP1_PROXY); + func_cap->qp1_proxy_qpn = size & 0xFFFFFF; + + if (func_cap->flags1 & QUERY_FUNC_CAP_FLAGS1_NIC_INFO) + MLX4_GET(func_cap->phys_port_id, outbox, + QUERY_FUNC_CAP_PHYS_PORT_ID); + + /* All other resources are allocated by the master, but we still report + * 'num' and 'reserved' capabilities as follows: + * - num remains the maximum resource index + * - 'num - reserved' is the total available objects of a resource, but + * resource indices may be less than 'reserved' + * TODO: set per-resource quotas */ + +out: + mlx4_free_cmd_mailbox(dev, mailbox); + + return err; +} + +int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) +{ + struct mlx4_cmd_mailbox *mailbox; + u32 *outbox; + u8 field; + u32 field32, flags, ext_flags; + u16 size; + u16 stat_rate; + int err; + int i; + +#define QUERY_DEV_CAP_OUT_SIZE 0x100 +#define QUERY_DEV_CAP_MAX_SRQ_SZ_OFFSET 0x10 +#define QUERY_DEV_CAP_MAX_QP_SZ_OFFSET 0x11 +#define QUERY_DEV_CAP_RSVD_QP_OFFSET 0x12 +#define QUERY_DEV_CAP_MAX_QP_OFFSET 0x13 +#define QUERY_DEV_CAP_RSVD_SRQ_OFFSET 0x14 +#define QUERY_DEV_CAP_MAX_SRQ_OFFSET 0x15 +#define QUERY_DEV_CAP_RSVD_EEC_OFFSET 0x16 +#define QUERY_DEV_CAP_MAX_EEC_OFFSET 0x17 +#define QUERY_DEV_CAP_MAX_CQ_SZ_OFFSET 0x19 +#define QUERY_DEV_CAP_RSVD_CQ_OFFSET 0x1a +#define QUERY_DEV_CAP_MAX_CQ_OFFSET 0x1b +#define QUERY_DEV_CAP_MAX_MPT_OFFSET 0x1d +#define QUERY_DEV_CAP_RSVD_EQ_OFFSET 0x1e +#define QUERY_DEV_CAP_MAX_EQ_OFFSET 0x1f +#define QUERY_DEV_CAP_RSVD_MTT_OFFSET 0x20 +#define QUERY_DEV_CAP_MAX_MRW_SZ_OFFSET 0x21 +#define QUERY_DEV_CAP_RSVD_MRW_OFFSET 0x22 +#define QUERY_DEV_CAP_MAX_MTT_SEG_OFFSET 0x23 +#define QUERY_DEV_CAP_MAX_AV_OFFSET 0x27 +#define QUERY_DEV_CAP_MAX_REQ_QP_OFFSET 0x29 +#define QUERY_DEV_CAP_MAX_RES_QP_OFFSET 0x2b +#define QUERY_DEV_CAP_MAX_GSO_OFFSET 0x2d +#define QUERY_DEV_CAP_RSS_OFFSET 0x2e +#define QUERY_DEV_CAP_MAX_RDMA_OFFSET 0x2f +#define QUERY_DEV_CAP_RSZ_SRQ_OFFSET 0x33 +#define QUERY_DEV_CAP_ACK_DELAY_OFFSET 0x35 +#define QUERY_DEV_CAP_MTU_WIDTH_OFFSET 0x36 +#define QUERY_DEV_CAP_VL_PORT_OFFSET 0x37 +#define QUERY_DEV_CAP_MAX_MSG_SZ_OFFSET 0x38 +#define QUERY_DEV_CAP_MAX_GID_OFFSET 0x3b +#define QUERY_DEV_CAP_RATE_SUPPORT_OFFSET 0x3c +#define QUERY_DEV_CAP_CQ_TS_SUPPORT_OFFSET 0x3e +#define QUERY_DEV_CAP_MAX_PKEY_OFFSET 0x3f +#define QUERY_DEV_CAP_EXT_FLAGS_OFFSET 0x40 +#define QUERY_DEV_CAP_FLAGS_OFFSET 0x44 +#define QUERY_DEV_CAP_RSVD_UAR_OFFSET 0x48 +#define QUERY_DEV_CAP_UAR_SZ_OFFSET 0x49 +#define QUERY_DEV_CAP_PAGE_SZ_OFFSET 0x4b +#define QUERY_DEV_CAP_BF_OFFSET 0x4c +#define QUERY_DEV_CAP_LOG_BF_REG_SZ_OFFSET 0x4d +#define QUERY_DEV_CAP_LOG_MAX_BF_REGS_PER_PAGE_OFFSET 0x4e +#define QUERY_DEV_CAP_LOG_MAX_BF_PAGES_OFFSET 0x4f +#define QUERY_DEV_CAP_MAX_SG_SQ_OFFSET 0x51 +#define QUERY_DEV_CAP_MAX_DESC_SZ_SQ_OFFSET 0x52 +#define QUERY_DEV_CAP_MAX_SG_RQ_OFFSET 0x55 +#define QUERY_DEV_CAP_MAX_DESC_SZ_RQ_OFFSET 0x56 +#define QUERY_DEV_CAP_MAX_QP_MCG_OFFSET 0x61 +#define QUERY_DEV_CAP_RSVD_MCG_OFFSET 0x62 +#define QUERY_DEV_CAP_MAX_MCG_OFFSET 0x63 +#define QUERY_DEV_CAP_RSVD_PD_OFFSET 0x64 +#define QUERY_DEV_CAP_MAX_PD_OFFSET 0x65 +#define QUERY_DEV_CAP_RSVD_XRC_OFFSET 0x66 +#define QUERY_DEV_CAP_MAX_XRC_OFFSET 0x67 +#define QUERY_DEV_CAP_MAX_COUNTERS_OFFSET 0x68 +#define QUERY_DEV_CAP_EXT_2_FLAGS_OFFSET 0x70 +#define QUERY_DEV_CAP_FLOW_STEERING_IPOIB_OFFSET 0x74 +#define QUERY_DEV_CAP_FLOW_STEERING_RANGE_EN_OFFSET 0x76 +#define QUERY_DEV_CAP_FLOW_STEERING_MAX_QP_OFFSET 0x77 +#define QUERY_DEV_CAP_CQ_EQ_CACHE_LINE_STRIDE 0x7a +#define QUERY_DEV_CAP_RDMARC_ENTRY_SZ_OFFSET 0x80 +#define QUERY_DEV_CAP_QPC_ENTRY_SZ_OFFSET 0x82 +#define QUERY_DEV_CAP_AUX_ENTRY_SZ_OFFSET 0x84 +#define QUERY_DEV_CAP_ALTC_ENTRY_SZ_OFFSET 0x86 +#define QUERY_DEV_CAP_EQC_ENTRY_SZ_OFFSET 0x88 +#define QUERY_DEV_CAP_CQC_ENTRY_SZ_OFFSET 0x8a +#define QUERY_DEV_CAP_SRQ_ENTRY_SZ_OFFSET 0x8c +#define QUERY_DEV_CAP_C_MPT_ENTRY_SZ_OFFSET 0x8e +#define QUERY_DEV_CAP_MTT_ENTRY_SZ_OFFSET 0x90 +#define QUERY_DEV_CAP_D_MPT_ENTRY_SZ_OFFSET 0x92 +#define QUERY_DEV_CAP_BMME_FLAGS_OFFSET 0x94 +#define QUERY_DEV_CAP_RSVD_LKEY_OFFSET 0x98 +#define QUERY_DEV_CAP_MAX_ICM_SZ_OFFSET 0xa0 +#define QUERY_DEV_CAP_FW_REASSIGN_MAC 0x9d +#define QUERY_DEV_CAP_VXLAN 0x9e +#define QUERY_DEV_CAP_MAD_DEMUX_OFFSET 0xb0 + + dev_cap->flags2 = 0; + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + outbox = mailbox->buf; + + err = mlx4_cmd_box(dev, 0, mailbox->dma, 0, 0, MLX4_CMD_QUERY_DEV_CAP, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); + if (err) + goto out; + + MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_QP_OFFSET); + dev_cap->reserved_qps = 1 << (field & 0xf); + MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_QP_OFFSET); + dev_cap->max_qps = 1 << (field & 0x1f); + MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_SRQ_OFFSET); + dev_cap->reserved_srqs = 1 << (field >> 4); + MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_SRQ_OFFSET); + dev_cap->max_srqs = 1 << (field & 0x1f); + MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_CQ_SZ_OFFSET); + dev_cap->max_cq_sz = 1 << field; + MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_CQ_OFFSET); + dev_cap->reserved_cqs = 1 << (field & 0xf); + MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_CQ_OFFSET); + dev_cap->max_cqs = 1 << (field & 0x1f); + MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_MPT_OFFSET); + dev_cap->max_mpts = 1 << (field & 0x3f); + MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_EQ_OFFSET); + dev_cap->reserved_eqs = field & 0xf; + MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_EQ_OFFSET); + dev_cap->max_eqs = 1 << (field & 0xf); + MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_MTT_OFFSET); + dev_cap->reserved_mtts = 1 << (field >> 4); + MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_MRW_SZ_OFFSET); + dev_cap->max_mrw_sz = 1 << field; + MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_MRW_OFFSET); + dev_cap->reserved_mrws = 1 << (field & 0xf); + MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_MTT_SEG_OFFSET); + dev_cap->max_mtt_seg = 1 << (field & 0x3f); + MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_REQ_QP_OFFSET); + dev_cap->max_requester_per_qp = 1 << (field & 0x3f); + MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_RES_QP_OFFSET); + dev_cap->max_responder_per_qp = 1 << (field & 0x3f); + MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_GSO_OFFSET); + field &= 0x1f; + if (!field) + dev_cap->max_gso_sz = 0; + else + dev_cap->max_gso_sz = 1 << field; + + MLX4_GET(field, outbox, QUERY_DEV_CAP_RSS_OFFSET); + if (field & 0x20) + dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_RSS_XOR; + if (field & 0x10) + dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_RSS_TOP; + field &= 0xf; + if (field) { + dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_RSS; + dev_cap->max_rss_tbl_sz = 1 << field; + } else + dev_cap->max_rss_tbl_sz = 0; + MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_RDMA_OFFSET); + dev_cap->max_rdma_global = 1 << (field & 0x3f); + MLX4_GET(field, outbox, QUERY_DEV_CAP_ACK_DELAY_OFFSET); + dev_cap->local_ca_ack_delay = field & 0x1f; + MLX4_GET(field, outbox, QUERY_DEV_CAP_VL_PORT_OFFSET); + dev_cap->num_ports = field & 0xf; + MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_MSG_SZ_OFFSET); + dev_cap->max_msg_sz = 1 << (field & 0x1f); + MLX4_GET(field, outbox, QUERY_DEV_CAP_FLOW_STEERING_RANGE_EN_OFFSET); + if (field & 0x80) + dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_FS_EN; + dev_cap->fs_log_max_ucast_qp_range_size = field & 0x1f; + MLX4_GET(field, outbox, QUERY_DEV_CAP_FLOW_STEERING_IPOIB_OFFSET); + if (field & 0x80) + dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_DMFS_IPOIB; + MLX4_GET(field, outbox, QUERY_DEV_CAP_FLOW_STEERING_MAX_QP_OFFSET); + dev_cap->fs_max_num_qp_per_entry = field; + MLX4_GET(stat_rate, outbox, QUERY_DEV_CAP_RATE_SUPPORT_OFFSET); + dev_cap->stat_rate_support = stat_rate; + MLX4_GET(field, outbox, QUERY_DEV_CAP_CQ_TS_SUPPORT_OFFSET); + if (field & 0x80) + dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_TS; + MLX4_GET(ext_flags, outbox, QUERY_DEV_CAP_EXT_FLAGS_OFFSET); + MLX4_GET(flags, outbox, QUERY_DEV_CAP_FLAGS_OFFSET); + dev_cap->flags = flags | (u64)ext_flags << 32; + MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_UAR_OFFSET); + dev_cap->reserved_uars = field >> 4; + MLX4_GET(field, outbox, QUERY_DEV_CAP_UAR_SZ_OFFSET); + dev_cap->uar_size = 1 << ((field & 0x3f) + 20); + MLX4_GET(field, outbox, QUERY_DEV_CAP_PAGE_SZ_OFFSET); + dev_cap->min_page_sz = 1 << field; + + MLX4_GET(field, outbox, QUERY_DEV_CAP_BF_OFFSET); + if (field & 0x80) { + MLX4_GET(field, outbox, QUERY_DEV_CAP_LOG_BF_REG_SZ_OFFSET); + dev_cap->bf_reg_size = 1 << (field & 0x1f); + MLX4_GET(field, outbox, QUERY_DEV_CAP_LOG_MAX_BF_REGS_PER_PAGE_OFFSET); + if ((1 << (field & 0x3f)) > (PAGE_SIZE / dev_cap->bf_reg_size)) + field = 3; + dev_cap->bf_regs_per_page = 1 << (field & 0x3f); + mlx4_dbg(dev, "BlueFlame available (reg size %d, regs/page %d)\n", + dev_cap->bf_reg_size, dev_cap->bf_regs_per_page); + } else { + dev_cap->bf_reg_size = 0; + mlx4_dbg(dev, "BlueFlame not available\n"); + } + + MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_SG_SQ_OFFSET); + dev_cap->max_sq_sg = field; + MLX4_GET(size, outbox, QUERY_DEV_CAP_MAX_DESC_SZ_SQ_OFFSET); + dev_cap->max_sq_desc_sz = size; + + MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_QP_MCG_OFFSET); + dev_cap->max_qp_per_mcg = 1 << field; + MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_MCG_OFFSET); + dev_cap->reserved_mgms = field & 0xf; + MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_MCG_OFFSET); + dev_cap->max_mcgs = 1 << field; + MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_PD_OFFSET); + dev_cap->reserved_pds = field >> 4; + MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_PD_OFFSET); + dev_cap->max_pds = 1 << (field & 0x3f); + MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_XRC_OFFSET); + dev_cap->reserved_xrcds = field >> 4; + MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_XRC_OFFSET); + dev_cap->max_xrcds = 1 << (field & 0x1f); + + MLX4_GET(size, outbox, QUERY_DEV_CAP_RDMARC_ENTRY_SZ_OFFSET); + dev_cap->rdmarc_entry_sz = size; + MLX4_GET(size, outbox, QUERY_DEV_CAP_QPC_ENTRY_SZ_OFFSET); + dev_cap->qpc_entry_sz = size; + MLX4_GET(size, outbox, QUERY_DEV_CAP_AUX_ENTRY_SZ_OFFSET); + dev_cap->aux_entry_sz = size; + MLX4_GET(size, outbox, QUERY_DEV_CAP_ALTC_ENTRY_SZ_OFFSET); + dev_cap->altc_entry_sz = size; + MLX4_GET(size, outbox, QUERY_DEV_CAP_EQC_ENTRY_SZ_OFFSET); + dev_cap->eqc_entry_sz = size; + MLX4_GET(size, outbox, QUERY_DEV_CAP_CQC_ENTRY_SZ_OFFSET); + dev_cap->cqc_entry_sz = size; + MLX4_GET(size, outbox, QUERY_DEV_CAP_SRQ_ENTRY_SZ_OFFSET); + dev_cap->srq_entry_sz = size; + MLX4_GET(size, outbox, QUERY_DEV_CAP_C_MPT_ENTRY_SZ_OFFSET); + dev_cap->cmpt_entry_sz = size; + MLX4_GET(size, outbox, QUERY_DEV_CAP_MTT_ENTRY_SZ_OFFSET); + dev_cap->mtt_entry_sz = size; + MLX4_GET(size, outbox, QUERY_DEV_CAP_D_MPT_ENTRY_SZ_OFFSET); + dev_cap->dmpt_entry_sz = size; + + MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_SRQ_SZ_OFFSET); + dev_cap->max_srq_sz = 1 << field; + MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_QP_SZ_OFFSET); + dev_cap->max_qp_sz = 1 << field; + MLX4_GET(field, outbox, QUERY_DEV_CAP_RSZ_SRQ_OFFSET); + dev_cap->resize_srq = field & 1; + MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_SG_RQ_OFFSET); + dev_cap->max_rq_sg = field; + MLX4_GET(size, outbox, QUERY_DEV_CAP_MAX_DESC_SZ_RQ_OFFSET); + dev_cap->max_rq_desc_sz = size; + MLX4_GET(field, outbox, QUERY_DEV_CAP_CQ_EQ_CACHE_LINE_STRIDE); + if (field & (1 << 6)) + dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_CQE_STRIDE; + if (field & (1 << 7)) + dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_EQE_STRIDE; + + MLX4_GET(dev_cap->bmme_flags, outbox, + QUERY_DEV_CAP_BMME_FLAGS_OFFSET); + MLX4_GET(dev_cap->reserved_lkey, outbox, + QUERY_DEV_CAP_RSVD_LKEY_OFFSET); + MLX4_GET(field, outbox, QUERY_DEV_CAP_FW_REASSIGN_MAC); + if (field & 1<<6) + dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_REASSIGN_MAC_EN; + MLX4_GET(field, outbox, QUERY_DEV_CAP_VXLAN); + if (field & 1<<3) + dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_VXLAN_OFFLOADS; + MLX4_GET(dev_cap->max_icm_sz, outbox, + QUERY_DEV_CAP_MAX_ICM_SZ_OFFSET); + if (dev_cap->flags & MLX4_DEV_CAP_FLAG_COUNTERS) + MLX4_GET(dev_cap->max_counters, outbox, + QUERY_DEV_CAP_MAX_COUNTERS_OFFSET); + + MLX4_GET(field32, outbox, + QUERY_DEV_CAP_MAD_DEMUX_OFFSET); + if (field32 & (1 << 0)) + dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_MAD_DEMUX; + + MLX4_GET(field32, outbox, QUERY_DEV_CAP_EXT_2_FLAGS_OFFSET); + if (field32 & (1 << 16)) + dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_UPDATE_QP; + if (field32 & (1 << 26)) + dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_VLAN_CONTROL; + if (field32 & (1 << 20)) + dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_FSM; + + if (dev->flags & MLX4_FLAG_OLD_PORT_CMDS) { + for (i = 1; i <= dev_cap->num_ports; ++i) { + MLX4_GET(field, outbox, QUERY_DEV_CAP_VL_PORT_OFFSET); + dev_cap->max_vl[i] = field >> 4; + MLX4_GET(field, outbox, QUERY_DEV_CAP_MTU_WIDTH_OFFSET); + dev_cap->ib_mtu[i] = field >> 4; + dev_cap->max_port_width[i] = field & 0xf; + MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_GID_OFFSET); + dev_cap->max_gids[i] = 1 << (field & 0xf); + MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_PKEY_OFFSET); + dev_cap->max_pkeys[i] = 1 << (field & 0xf); + } + } else { +#define QUERY_PORT_SUPPORTED_TYPE_OFFSET 0x00 +#define QUERY_PORT_MTU_OFFSET 0x01 +#define QUERY_PORT_ETH_MTU_OFFSET 0x02 +#define QUERY_PORT_WIDTH_OFFSET 0x06 +#define QUERY_PORT_MAX_GID_PKEY_OFFSET 0x07 +#define QUERY_PORT_MAX_MACVLAN_OFFSET 0x0a +#define QUERY_PORT_MAX_VL_OFFSET 0x0b +#define QUERY_PORT_MAC_OFFSET 0x10 +#define QUERY_PORT_TRANS_VENDOR_OFFSET 0x18 +#define QUERY_PORT_WAVELENGTH_OFFSET 0x1c +#define QUERY_PORT_TRANS_CODE_OFFSET 0x20 + + for (i = 1; i <= dev_cap->num_ports; ++i) { + err = mlx4_cmd_box(dev, 0, mailbox->dma, i, 0, MLX4_CMD_QUERY_PORT, + MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE); + if (err) + goto out; + + MLX4_GET(field, outbox, QUERY_PORT_SUPPORTED_TYPE_OFFSET); + dev_cap->supported_port_types[i] = field & 3; + dev_cap->suggested_type[i] = (field >> 3) & 1; + dev_cap->default_sense[i] = (field >> 4) & 1; + MLX4_GET(field, outbox, QUERY_PORT_MTU_OFFSET); + dev_cap->ib_mtu[i] = field & 0xf; + MLX4_GET(field, outbox, QUERY_PORT_WIDTH_OFFSET); + dev_cap->max_port_width[i] = field & 0xf; + MLX4_GET(field, outbox, QUERY_PORT_MAX_GID_PKEY_OFFSET); + dev_cap->max_gids[i] = 1 << (field >> 4); + dev_cap->max_pkeys[i] = 1 << (field & 0xf); + MLX4_GET(field, outbox, QUERY_PORT_MAX_VL_OFFSET); + dev_cap->max_vl[i] = field & 0xf; + MLX4_GET(field, outbox, QUERY_PORT_MAX_MACVLAN_OFFSET); + dev_cap->log_max_macs[i] = field & 0xf; + dev_cap->log_max_vlans[i] = field >> 4; + MLX4_GET(dev_cap->eth_mtu[i], outbox, QUERY_PORT_ETH_MTU_OFFSET); + MLX4_GET(dev_cap->def_mac[i], outbox, QUERY_PORT_MAC_OFFSET); + MLX4_GET(field32, outbox, QUERY_PORT_TRANS_VENDOR_OFFSET); + dev_cap->trans_type[i] = field32 >> 24; + dev_cap->vendor_oui[i] = field32 & 0xffffff; + MLX4_GET(dev_cap->wavelength[i], outbox, QUERY_PORT_WAVELENGTH_OFFSET); + MLX4_GET(dev_cap->trans_code[i], outbox, QUERY_PORT_TRANS_CODE_OFFSET); + } + } + + mlx4_dbg(dev, "Base MM extensions: flags %08x, rsvd L_Key %08x\n", + dev_cap->bmme_flags, dev_cap->reserved_lkey); + + /* + * Each UAR has 4 EQ doorbells; so if a UAR is reserved, then + * we can't use any EQs whose doorbell falls on that page, + * even if the EQ itself isn't reserved. + */ + dev_cap->reserved_eqs = max(dev_cap->reserved_uars * 4, + dev_cap->reserved_eqs); + + mlx4_dbg(dev, "Max ICM size %lld MB\n", + (unsigned long long) dev_cap->max_icm_sz >> 20); + mlx4_dbg(dev, "Max QPs: %d, reserved QPs: %d, entry size: %d\n", + dev_cap->max_qps, dev_cap->reserved_qps, dev_cap->qpc_entry_sz); + mlx4_dbg(dev, "Max SRQs: %d, reserved SRQs: %d, entry size: %d\n", + dev_cap->max_srqs, dev_cap->reserved_srqs, dev_cap->srq_entry_sz); + mlx4_dbg(dev, "Max CQs: %d, reserved CQs: %d, entry size: %d\n", + dev_cap->max_cqs, dev_cap->reserved_cqs, dev_cap->cqc_entry_sz); + mlx4_dbg(dev, "Max EQs: %d, reserved EQs: %d, entry size: %d\n", + dev_cap->max_eqs, dev_cap->reserved_eqs, dev_cap->eqc_entry_sz); + mlx4_dbg(dev, "reserved MPTs: %d, reserved MTTs: %d\n", + dev_cap->reserved_mrws, dev_cap->reserved_mtts); + mlx4_dbg(dev, "Max PDs: %d, reserved PDs: %d, reserved UARs: %d\n", + dev_cap->max_pds, dev_cap->reserved_pds, dev_cap->reserved_uars); + mlx4_dbg(dev, "Max QP/MCG: %d, reserved MGMs: %d\n", + dev_cap->max_pds, dev_cap->reserved_mgms); + mlx4_dbg(dev, "Max CQEs: %d, max WQEs: %d, max SRQ WQEs: %d\n", + dev_cap->max_cq_sz, dev_cap->max_qp_sz, dev_cap->max_srq_sz); + mlx4_dbg(dev, "Local CA ACK delay: %d, max MTU: %d, port width cap: %d\n", + dev_cap->local_ca_ack_delay, 128 << dev_cap->ib_mtu[1], + dev_cap->max_port_width[1]); + mlx4_dbg(dev, "Max SQ desc size: %d, max SQ S/G: %d\n", + dev_cap->max_sq_desc_sz, dev_cap->max_sq_sg); + mlx4_dbg(dev, "Max RQ desc size: %d, max RQ S/G: %d\n", + dev_cap->max_rq_desc_sz, dev_cap->max_rq_sg); + mlx4_dbg(dev, "Max GSO size: %d\n", dev_cap->max_gso_sz); + mlx4_dbg(dev, "Max counters: %d\n", dev_cap->max_counters); + mlx4_dbg(dev, "Max RSS Table size: %d\n", dev_cap->max_rss_tbl_sz); + + dump_dev_cap_flags(dev, dev_cap->flags); + dump_dev_cap_flags2(dev, dev_cap->flags2); + +out: + mlx4_free_cmd_mailbox(dev, mailbox); + return err; +} + +int mlx4_QUERY_DEV_CAP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + u64 flags; + int err = 0; + u8 field; + u32 bmme_flags; + int real_port; + int slave_port; + int first_port; + struct mlx4_active_ports actv_ports; + + err = mlx4_cmd_box(dev, 0, outbox->dma, 0, 0, MLX4_CMD_QUERY_DEV_CAP, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); + if (err) + return err; + + /* add port mng change event capability and disable mw type 1 + * unconditionally to slaves + */ + MLX4_GET(flags, outbox->buf, QUERY_DEV_CAP_EXT_FLAGS_OFFSET); + flags |= MLX4_DEV_CAP_FLAG_PORT_MNG_CHG_EV; + flags &= ~MLX4_DEV_CAP_FLAG_MEM_WINDOW; + actv_ports = mlx4_get_active_ports(dev, slave); + first_port = find_first_bit(actv_ports.ports, dev->caps.num_ports); + for (slave_port = 0, real_port = first_port; + real_port < first_port + + bitmap_weight(actv_ports.ports, dev->caps.num_ports); + ++real_port, ++slave_port) { + if (flags & (MLX4_DEV_CAP_FLAG_WOL_PORT1 << real_port)) + flags |= MLX4_DEV_CAP_FLAG_WOL_PORT1 << slave_port; + else + flags &= ~(MLX4_DEV_CAP_FLAG_WOL_PORT1 << slave_port); + } + for (; slave_port < dev->caps.num_ports; ++slave_port) + flags &= ~(MLX4_DEV_CAP_FLAG_WOL_PORT1 << slave_port); + MLX4_PUT(outbox->buf, flags, QUERY_DEV_CAP_EXT_FLAGS_OFFSET); + + MLX4_GET(field, outbox->buf, QUERY_DEV_CAP_VL_PORT_OFFSET); + field &= ~0x0F; + field |= bitmap_weight(actv_ports.ports, dev->caps.num_ports) & 0x0F; + MLX4_PUT(outbox->buf, field, QUERY_DEV_CAP_VL_PORT_OFFSET); + + /* For guests, disable timestamp */ + MLX4_GET(field, outbox->buf, QUERY_DEV_CAP_CQ_TS_SUPPORT_OFFSET); + field &= 0x7f; + MLX4_PUT(outbox->buf, field, QUERY_DEV_CAP_CQ_TS_SUPPORT_OFFSET); + + /* For guests, disable vxlan tunneling */ + MLX4_GET(field, outbox->buf, QUERY_DEV_CAP_VXLAN); + field &= 0xf7; + MLX4_PUT(outbox->buf, field, QUERY_DEV_CAP_VXLAN); + + /* For guests, report Blueflame disabled */ + MLX4_GET(field, outbox->buf, QUERY_DEV_CAP_BF_OFFSET); + field &= 0x7f; + MLX4_PUT(outbox->buf, field, QUERY_DEV_CAP_BF_OFFSET); + + /* For guests, disable mw type 2 */ + MLX4_GET(bmme_flags, outbox->buf, QUERY_DEV_CAP_BMME_FLAGS_OFFSET); + bmme_flags &= ~MLX4_BMME_FLAG_TYPE_2_WIN; + MLX4_PUT(outbox->buf, bmme_flags, QUERY_DEV_CAP_BMME_FLAGS_OFFSET); + + /* turn off device-managed steering capability if not enabled */ + if (dev->caps.steering_mode != MLX4_STEERING_MODE_DEVICE_MANAGED) { + MLX4_GET(field, outbox->buf, + QUERY_DEV_CAP_FLOW_STEERING_RANGE_EN_OFFSET); + field &= 0x7f; + MLX4_PUT(outbox->buf, field, + QUERY_DEV_CAP_FLOW_STEERING_RANGE_EN_OFFSET); + } + + /* turn off ipoib managed steering for guests */ + MLX4_GET(field, outbox->buf, QUERY_DEV_CAP_FLOW_STEERING_IPOIB_OFFSET); + field &= ~0x80; + MLX4_PUT(outbox->buf, field, QUERY_DEV_CAP_FLOW_STEERING_IPOIB_OFFSET); + + return 0; +} + +int mlx4_QUERY_PORT_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + u64 def_mac; + u8 port_type; + u16 short_field; + int err; + int admin_link_state; + int port = mlx4_slave_convert_port(dev, slave, + vhcr->in_modifier & 0xFF); + +#define MLX4_VF_PORT_NO_LINK_SENSE_MASK 0xE0 +#define MLX4_PORT_LINK_UP_MASK 0x80 +#define QUERY_PORT_CUR_MAX_PKEY_OFFSET 0x0c +#define QUERY_PORT_CUR_MAX_GID_OFFSET 0x0e + + if (port < 0) + return -EINVAL; + + /* Protect against untrusted guests: enforce that this is the + * QUERY_PORT general query. + */ + if (vhcr->op_modifier || vhcr->in_modifier & ~0xFF) + return -EINVAL; + + vhcr->in_modifier = port; + + err = mlx4_cmd_box(dev, 0, outbox->dma, vhcr->in_modifier, 0, + MLX4_CMD_QUERY_PORT, MLX4_CMD_TIME_CLASS_B, + MLX4_CMD_NATIVE); + + if (!err && dev->caps.function != slave) { + def_mac = priv->mfunc.master.vf_oper[slave].vport[vhcr->in_modifier].state.mac; + MLX4_PUT(outbox->buf, def_mac, QUERY_PORT_MAC_OFFSET); + + /* get port type - currently only eth is enabled */ + MLX4_GET(port_type, outbox->buf, + QUERY_PORT_SUPPORTED_TYPE_OFFSET); + + /* No link sensing allowed */ + port_type &= MLX4_VF_PORT_NO_LINK_SENSE_MASK; + /* set port type to currently operating port type */ + port_type |= (dev->caps.port_type[vhcr->in_modifier] & 0x3); + + admin_link_state = priv->mfunc.master.vf_oper[slave].vport[vhcr->in_modifier].state.link_state; + if (IFLA_VF_LINK_STATE_ENABLE == admin_link_state) + port_type |= MLX4_PORT_LINK_UP_MASK; + else if (IFLA_VF_LINK_STATE_DISABLE == admin_link_state) + port_type &= ~MLX4_PORT_LINK_UP_MASK; + + MLX4_PUT(outbox->buf, port_type, + QUERY_PORT_SUPPORTED_TYPE_OFFSET); + + if (dev->caps.port_type[vhcr->in_modifier] == MLX4_PORT_TYPE_ETH) + short_field = mlx4_get_slave_num_gids(dev, slave, port); + else + short_field = 1; /* slave max gids */ + MLX4_PUT(outbox->buf, short_field, + QUERY_PORT_CUR_MAX_GID_OFFSET); + + short_field = dev->caps.pkey_table_len[vhcr->in_modifier]; + MLX4_PUT(outbox->buf, short_field, + QUERY_PORT_CUR_MAX_PKEY_OFFSET); + } + + return err; +} + +int mlx4_get_slave_pkey_gid_tbl_len(struct mlx4_dev *dev, u8 port, + int *gid_tbl_len, int *pkey_tbl_len) +{ + struct mlx4_cmd_mailbox *mailbox; + u32 *outbox; + u16 field; + int err; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + + err = mlx4_cmd_box(dev, 0, mailbox->dma, port, 0, + MLX4_CMD_QUERY_PORT, MLX4_CMD_TIME_CLASS_B, + MLX4_CMD_WRAPPED); + if (err) + goto out; + + outbox = mailbox->buf; + + MLX4_GET(field, outbox, QUERY_PORT_CUR_MAX_GID_OFFSET); + *gid_tbl_len = field; + + MLX4_GET(field, outbox, QUERY_PORT_CUR_MAX_PKEY_OFFSET); + *pkey_tbl_len = field; + +out: + mlx4_free_cmd_mailbox(dev, mailbox); + return err; +} +EXPORT_SYMBOL(mlx4_get_slave_pkey_gid_tbl_len); + +int mlx4_map_cmd(struct mlx4_dev *dev, u16 op, struct mlx4_icm *icm, u64 virt) +{ + struct mlx4_cmd_mailbox *mailbox; + struct mlx4_icm_iter iter; + __be64 *pages; + int lg; + int nent = 0; + int i; + int err = 0; + int ts = 0, tc = 0; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + pages = mailbox->buf; + + for (mlx4_icm_first(icm, &iter); + !mlx4_icm_last(&iter); + mlx4_icm_next(&iter)) { + /* + * We have to pass pages that are aligned to their + * size, so find the least significant 1 in the + * address or size and use that as our log2 size. + */ + lg = ffs(mlx4_icm_addr(&iter) | mlx4_icm_size(&iter)) - 1; + if (lg < MLX4_ICM_PAGE_SHIFT) { + mlx4_warn(dev, "Got FW area not aligned to %d (%llx/%lx)\n", + MLX4_ICM_PAGE_SIZE, + (unsigned long long) mlx4_icm_addr(&iter), + mlx4_icm_size(&iter)); + err = -EINVAL; + goto out; + } + + for (i = 0; i < mlx4_icm_size(&iter) >> lg; ++i) { + if (virt != -1) { + pages[nent * 2] = cpu_to_be64(virt); + virt += 1 << lg; + } + + pages[nent * 2 + 1] = + cpu_to_be64((mlx4_icm_addr(&iter) + (i << lg)) | + (lg - MLX4_ICM_PAGE_SHIFT)); + ts += 1 << (lg - 10); + ++tc; + + if (++nent == MLX4_MAILBOX_SIZE / 16) { + err = mlx4_cmd(dev, mailbox->dma, nent, 0, op, + MLX4_CMD_TIME_CLASS_B, + MLX4_CMD_NATIVE); + if (err) + goto out; + nent = 0; + } + } + } + + if (nent) + err = mlx4_cmd(dev, mailbox->dma, nent, 0, op, + MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE); + if (err) + goto out; + + switch (op) { + case MLX4_CMD_MAP_FA: + mlx4_dbg(dev, "Mapped %d chunks/%d KB for FW\n", tc, ts); + break; + case MLX4_CMD_MAP_ICM_AUX: + mlx4_dbg(dev, "Mapped %d chunks/%d KB for ICM aux\n", tc, ts); + break; + case MLX4_CMD_MAP_ICM: + mlx4_dbg(dev, "Mapped %d chunks/%d KB at %llx for ICM\n", + tc, ts, (unsigned long long) virt - (ts << 10)); + break; + } + +out: + mlx4_free_cmd_mailbox(dev, mailbox); + return err; +} + +int mlx4_MAP_FA(struct mlx4_dev *dev, struct mlx4_icm *icm) +{ + return mlx4_map_cmd(dev, MLX4_CMD_MAP_FA, icm, -1); +} + +int mlx4_UNMAP_FA(struct mlx4_dev *dev) +{ + return mlx4_cmd(dev, 0, 0, 0, MLX4_CMD_UNMAP_FA, + MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE); +} + + +int mlx4_RUN_FW(struct mlx4_dev *dev) +{ + return mlx4_cmd(dev, 0, 0, 0, MLX4_CMD_RUN_FW, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); +} + +int mlx4_QUERY_FW(struct mlx4_dev *dev) +{ + struct mlx4_fw *fw = &mlx4_priv(dev)->fw; + struct mlx4_cmd *cmd = &mlx4_priv(dev)->cmd; + struct mlx4_cmd_mailbox *mailbox; + u32 *outbox; + int err = 0; + u64 fw_ver; + u16 cmd_if_rev; + u8 lg; + +#define QUERY_FW_OUT_SIZE 0x100 +#define QUERY_FW_VER_OFFSET 0x00 +#define QUERY_FW_PPF_ID 0x09 +#define QUERY_FW_CMD_IF_REV_OFFSET 0x0a +#define QUERY_FW_MAX_CMD_OFFSET 0x0f +#define QUERY_FW_ERR_START_OFFSET 0x30 +#define QUERY_FW_ERR_SIZE_OFFSET 0x38 +#define QUERY_FW_ERR_BAR_OFFSET 0x3c + +#define QUERY_FW_SIZE_OFFSET 0x00 +#define QUERY_FW_CLR_INT_BASE_OFFSET 0x20 +#define QUERY_FW_CLR_INT_BAR_OFFSET 0x28 + +#define QUERY_FW_COMM_BASE_OFFSET 0x40 +#define QUERY_FW_COMM_BAR_OFFSET 0x48 + +#define QUERY_FW_CLOCK_OFFSET 0x50 +#define QUERY_FW_CLOCK_BAR 0x58 + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + outbox = mailbox->buf; + + err = mlx4_cmd_box(dev, 0, mailbox->dma, 0, 0, MLX4_CMD_QUERY_FW, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); + if (err) + goto out; + + MLX4_GET(fw_ver, outbox, QUERY_FW_VER_OFFSET); + /* + * FW subminor version is at more significant bits than minor + * version, so swap here. + */ + dev->caps.fw_ver = (fw_ver & 0xffff00000000ull) | + ((fw_ver & 0xffff0000ull) >> 16) | + ((fw_ver & 0x0000ffffull) << 16); + + MLX4_GET(lg, outbox, QUERY_FW_PPF_ID); + dev->caps.function = lg; + + if (mlx4_is_slave(dev)) + goto out; + + + MLX4_GET(cmd_if_rev, outbox, QUERY_FW_CMD_IF_REV_OFFSET); + if (cmd_if_rev < MLX4_COMMAND_INTERFACE_MIN_REV || + cmd_if_rev > MLX4_COMMAND_INTERFACE_MAX_REV) { + mlx4_err(dev, "Installed FW has unsupported command interface revision %d\n", + cmd_if_rev); + mlx4_err(dev, "(Installed FW version is %d.%d.%03d)\n", + (int) (dev->caps.fw_ver >> 32), + (int) (dev->caps.fw_ver >> 16) & 0xffff, + (int) dev->caps.fw_ver & 0xffff); + mlx4_err(dev, "This driver version supports only revisions %d to %d\n", + MLX4_COMMAND_INTERFACE_MIN_REV, MLX4_COMMAND_INTERFACE_MAX_REV); + err = -ENODEV; + goto out; + } + + if (cmd_if_rev < MLX4_COMMAND_INTERFACE_NEW_PORT_CMDS) + dev->flags |= MLX4_FLAG_OLD_PORT_CMDS; + + MLX4_GET(lg, outbox, QUERY_FW_MAX_CMD_OFFSET); + cmd->max_cmds = 1 << lg; + + mlx4_dbg(dev, "FW version %d.%d.%03d (cmd intf rev %d), max commands %d\n", + (int) (dev->caps.fw_ver >> 32), + (int) (dev->caps.fw_ver >> 16) & 0xffff, + (int) dev->caps.fw_ver & 0xffff, + cmd_if_rev, cmd->max_cmds); + + MLX4_GET(fw->catas_offset, outbox, QUERY_FW_ERR_START_OFFSET); + MLX4_GET(fw->catas_size, outbox, QUERY_FW_ERR_SIZE_OFFSET); + MLX4_GET(fw->catas_bar, outbox, QUERY_FW_ERR_BAR_OFFSET); + fw->catas_bar = (fw->catas_bar >> 6) * 2; + + mlx4_dbg(dev, "Catastrophic error buffer at 0x%llx, size 0x%x, BAR %d\n", + (unsigned long long) fw->catas_offset, fw->catas_size, fw->catas_bar); + + MLX4_GET(fw->fw_pages, outbox, QUERY_FW_SIZE_OFFSET); + MLX4_GET(fw->clr_int_base, outbox, QUERY_FW_CLR_INT_BASE_OFFSET); + MLX4_GET(fw->clr_int_bar, outbox, QUERY_FW_CLR_INT_BAR_OFFSET); + fw->clr_int_bar = (fw->clr_int_bar >> 6) * 2; + + MLX4_GET(fw->comm_base, outbox, QUERY_FW_COMM_BASE_OFFSET); + MLX4_GET(fw->comm_bar, outbox, QUERY_FW_COMM_BAR_OFFSET); + fw->comm_bar = (fw->comm_bar >> 6) * 2; + mlx4_dbg(dev, "Communication vector bar:%d offset:0x%llx\n", + fw->comm_bar, fw->comm_base); + mlx4_dbg(dev, "FW size %d KB\n", fw->fw_pages >> 2); + + MLX4_GET(fw->clock_offset, outbox, QUERY_FW_CLOCK_OFFSET); + MLX4_GET(fw->clock_bar, outbox, QUERY_FW_CLOCK_BAR); + fw->clock_bar = (fw->clock_bar >> 6) * 2; + mlx4_dbg(dev, "Internal clock bar:%d offset:0x%llx\n", + fw->clock_bar, fw->clock_offset); + + /* + * Round up number of system pages needed in case + * MLX4_ICM_PAGE_SIZE < PAGE_SIZE. + */ + fw->fw_pages = + ALIGN(fw->fw_pages, PAGE_SIZE / MLX4_ICM_PAGE_SIZE) >> + (PAGE_SHIFT - MLX4_ICM_PAGE_SHIFT); + + mlx4_dbg(dev, "Clear int @ %llx, BAR %d\n", + (unsigned long long) fw->clr_int_base, fw->clr_int_bar); + +out: + mlx4_free_cmd_mailbox(dev, mailbox); + return err; +} + +int mlx4_QUERY_FW_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + u8 *outbuf; + int err; + + outbuf = outbox->buf; + err = mlx4_cmd_box(dev, 0, outbox->dma, 0, 0, MLX4_CMD_QUERY_FW, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); + if (err) + return err; + + /* for slaves, set pci PPF ID to invalid and zero out everything + * else except FW version */ + outbuf[0] = outbuf[1] = 0; + memset(&outbuf[8], 0, QUERY_FW_OUT_SIZE - 8); + outbuf[QUERY_FW_PPF_ID] = MLX4_INVALID_SLAVE_ID; + + return 0; +} + +static void get_board_id(void *vsd, char *board_id) +{ + int i; + +#define VSD_OFFSET_SIG1 0x00 +#define VSD_OFFSET_SIG2 0xde +#define VSD_OFFSET_MLX_BOARD_ID 0xd0 +#define VSD_OFFSET_TS_BOARD_ID 0x20 + +#define VSD_SIGNATURE_TOPSPIN 0x5ad + + memset(board_id, 0, MLX4_BOARD_ID_LEN); + + if (be16_to_cpup(vsd + VSD_OFFSET_SIG1) == VSD_SIGNATURE_TOPSPIN && + be16_to_cpup(vsd + VSD_OFFSET_SIG2) == VSD_SIGNATURE_TOPSPIN) { + strlcpy(board_id, vsd + VSD_OFFSET_TS_BOARD_ID, MLX4_BOARD_ID_LEN); + } else { + /* + * The board ID is a string but the firmware byte + * swaps each 4-byte word before passing it back to + * us. Therefore we need to swab it before printing. + */ + for (i = 0; i < 4; ++i) + ((u32 *) board_id)[i] = + swab32(*(u32 *) (vsd + VSD_OFFSET_MLX_BOARD_ID + i * 4)); + } +} + +int mlx4_QUERY_ADAPTER(struct mlx4_dev *dev, struct mlx4_adapter *adapter) +{ + struct mlx4_cmd_mailbox *mailbox; + u32 *outbox; + int err; + +#define QUERY_ADAPTER_OUT_SIZE 0x100 +#define QUERY_ADAPTER_INTA_PIN_OFFSET 0x10 +#define QUERY_ADAPTER_VSD_OFFSET 0x20 + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + outbox = mailbox->buf; + + err = mlx4_cmd_box(dev, 0, mailbox->dma, 0, 0, MLX4_CMD_QUERY_ADAPTER, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); + if (err) + goto out; + + MLX4_GET(adapter->inta_pin, outbox, QUERY_ADAPTER_INTA_PIN_OFFSET); + + get_board_id(outbox + QUERY_ADAPTER_VSD_OFFSET / 4, + adapter->board_id); + +out: + mlx4_free_cmd_mailbox(dev, mailbox); + return err; +} + +int mlx4_INIT_HCA(struct mlx4_dev *dev, struct mlx4_init_hca_param *param) +{ + struct mlx4_cmd_mailbox *mailbox; + __be32 *inbox; + int err; + +#define INIT_HCA_IN_SIZE 0x200 +#define INIT_HCA_VERSION_OFFSET 0x000 +#define INIT_HCA_VERSION 2 +#define INIT_HCA_VXLAN_OFFSET 0x0c +#define INIT_HCA_CACHELINE_SZ_OFFSET 0x0e +#define INIT_HCA_FLAGS_OFFSET 0x014 +#define INIT_HCA_QPC_OFFSET 0x020 +#define INIT_HCA_QPC_BASE_OFFSET (INIT_HCA_QPC_OFFSET + 0x10) +#define INIT_HCA_LOG_QP_OFFSET (INIT_HCA_QPC_OFFSET + 0x17) +#define INIT_HCA_SRQC_BASE_OFFSET (INIT_HCA_QPC_OFFSET + 0x28) +#define INIT_HCA_LOG_SRQ_OFFSET (INIT_HCA_QPC_OFFSET + 0x2f) +#define INIT_HCA_CQC_BASE_OFFSET (INIT_HCA_QPC_OFFSET + 0x30) +#define INIT_HCA_LOG_CQ_OFFSET (INIT_HCA_QPC_OFFSET + 0x37) +#define INIT_HCA_EQE_CQE_OFFSETS (INIT_HCA_QPC_OFFSET + 0x38) +#define INIT_HCA_EQE_CQE_STRIDE_OFFSET (INIT_HCA_QPC_OFFSET + 0x3b) +#define INIT_HCA_ALTC_BASE_OFFSET (INIT_HCA_QPC_OFFSET + 0x40) +#define INIT_HCA_AUXC_BASE_OFFSET (INIT_HCA_QPC_OFFSET + 0x50) +#define INIT_HCA_EQC_BASE_OFFSET (INIT_HCA_QPC_OFFSET + 0x60) +#define INIT_HCA_LOG_EQ_OFFSET (INIT_HCA_QPC_OFFSET + 0x67) +#define INIT_HCA_RDMARC_BASE_OFFSET (INIT_HCA_QPC_OFFSET + 0x70) +#define INIT_HCA_LOG_RD_OFFSET (INIT_HCA_QPC_OFFSET + 0x77) +#define INIT_HCA_MCAST_OFFSET 0x0c0 +#define INIT_HCA_MC_BASE_OFFSET (INIT_HCA_MCAST_OFFSET + 0x00) +#define INIT_HCA_LOG_MC_ENTRY_SZ_OFFSET (INIT_HCA_MCAST_OFFSET + 0x12) +#define INIT_HCA_LOG_MC_HASH_SZ_OFFSET (INIT_HCA_MCAST_OFFSET + 0x16) +#define INIT_HCA_UC_STEERING_OFFSET (INIT_HCA_MCAST_OFFSET + 0x18) +#define INIT_HCA_LOG_MC_TABLE_SZ_OFFSET (INIT_HCA_MCAST_OFFSET + 0x1b) +#define INIT_HCA_DEVICE_MANAGED_FLOW_STEERING_EN 0x6 +#define INIT_HCA_FS_PARAM_OFFSET 0x1d0 +#define INIT_HCA_FS_BASE_OFFSET (INIT_HCA_FS_PARAM_OFFSET + 0x00) +#define INIT_HCA_FS_LOG_ENTRY_SZ_OFFSET (INIT_HCA_FS_PARAM_OFFSET + 0x12) +#define INIT_HCA_FS_LOG_TABLE_SZ_OFFSET (INIT_HCA_FS_PARAM_OFFSET + 0x1b) +#define INIT_HCA_FS_ETH_BITS_OFFSET (INIT_HCA_FS_PARAM_OFFSET + 0x21) +#define INIT_HCA_FS_ETH_NUM_ADDRS_OFFSET (INIT_HCA_FS_PARAM_OFFSET + 0x22) +#define INIT_HCA_FS_IB_BITS_OFFSET (INIT_HCA_FS_PARAM_OFFSET + 0x25) +#define INIT_HCA_FS_IB_NUM_ADDRS_OFFSET (INIT_HCA_FS_PARAM_OFFSET + 0x26) +#define INIT_HCA_TPT_OFFSET 0x0f0 +#define INIT_HCA_DMPT_BASE_OFFSET (INIT_HCA_TPT_OFFSET + 0x00) +#define INIT_HCA_TPT_MW_OFFSET (INIT_HCA_TPT_OFFSET + 0x08) +#define INIT_HCA_LOG_MPT_SZ_OFFSET (INIT_HCA_TPT_OFFSET + 0x0b) +#define INIT_HCA_MTT_BASE_OFFSET (INIT_HCA_TPT_OFFSET + 0x10) +#define INIT_HCA_CMPT_BASE_OFFSET (INIT_HCA_TPT_OFFSET + 0x18) +#define INIT_HCA_UAR_OFFSET 0x120 +#define INIT_HCA_LOG_UAR_SZ_OFFSET (INIT_HCA_UAR_OFFSET + 0x0a) +#define INIT_HCA_UAR_PAGE_SZ_OFFSET (INIT_HCA_UAR_OFFSET + 0x0b) + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + inbox = mailbox->buf; + + *((u8 *) mailbox->buf + INIT_HCA_VERSION_OFFSET) = INIT_HCA_VERSION; + + *((u8 *) mailbox->buf + INIT_HCA_CACHELINE_SZ_OFFSET) = + (ilog2(cache_line_size()) - 4) << 5; + +#if defined(__LITTLE_ENDIAN) + *(inbox + INIT_HCA_FLAGS_OFFSET / 4) &= ~cpu_to_be32(1 << 1); +#elif defined(__BIG_ENDIAN) + *(inbox + INIT_HCA_FLAGS_OFFSET / 4) |= cpu_to_be32(1 << 1); +#else +#error Host endianness not defined +#endif + /* Check port for UD address vector: */ + *(inbox + INIT_HCA_FLAGS_OFFSET / 4) |= cpu_to_be32(1); + + /* Enable IPoIB checksumming if we can: */ + if (dev->caps.flags & MLX4_DEV_CAP_FLAG_IPOIB_CSUM) + *(inbox + INIT_HCA_FLAGS_OFFSET / 4) |= cpu_to_be32(1 << 3); + + /* Enable QoS support if module parameter set */ + if (enable_qos) + *(inbox + INIT_HCA_FLAGS_OFFSET / 4) |= cpu_to_be32(1 << 2); + + /* enable counters */ + if (dev->caps.flags & MLX4_DEV_CAP_FLAG_COUNTERS) + *(inbox + INIT_HCA_FLAGS_OFFSET / 4) |= cpu_to_be32(1 << 4); + + /* CX3 is capable of extending CQEs/EQEs from 32 to 64 bytes */ + if (dev->caps.flags & MLX4_DEV_CAP_FLAG_64B_EQE) { + *(inbox + INIT_HCA_EQE_CQE_OFFSETS / 4) |= cpu_to_be32(1 << 29); + dev->caps.eqe_size = 64; + dev->caps.eqe_factor = 1; + } else { + dev->caps.eqe_size = 32; + dev->caps.eqe_factor = 0; + } + + if (dev->caps.flags & MLX4_DEV_CAP_FLAG_64B_CQE) { + *(inbox + INIT_HCA_EQE_CQE_OFFSETS / 4) |= cpu_to_be32(1 << 30); + dev->caps.cqe_size = 64; + dev->caps.userspace_caps |= MLX4_USER_DEV_CAP_LARGE_CQE; + } else { + dev->caps.cqe_size = 32; + } + + /* CX3 is capable of extending CQEs\EQEs to strides larger than 64B */ + if ((dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_EQE_STRIDE) && + (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_CQE_STRIDE)) { + dev->caps.eqe_size = cache_line_size(); + dev->caps.cqe_size = cache_line_size(); + dev->caps.eqe_factor = 0; + MLX4_PUT(inbox, (u8)((ilog2(dev->caps.eqe_size) - 5) << 4 | + (ilog2(dev->caps.eqe_size) - 5)), + INIT_HCA_EQE_CQE_STRIDE_OFFSET); + + /* User still need to know to support CQE > 32B */ + dev->caps.userspace_caps |= MLX4_USER_DEV_CAP_LARGE_CQE; + } + + /* QPC/EEC/CQC/EQC/RDMARC attributes */ + + MLX4_PUT(inbox, param->qpc_base, INIT_HCA_QPC_BASE_OFFSET); + MLX4_PUT(inbox, param->log_num_qps, INIT_HCA_LOG_QP_OFFSET); + MLX4_PUT(inbox, param->srqc_base, INIT_HCA_SRQC_BASE_OFFSET); + MLX4_PUT(inbox, param->log_num_srqs, INIT_HCA_LOG_SRQ_OFFSET); + MLX4_PUT(inbox, param->cqc_base, INIT_HCA_CQC_BASE_OFFSET); + MLX4_PUT(inbox, param->log_num_cqs, INIT_HCA_LOG_CQ_OFFSET); + MLX4_PUT(inbox, param->altc_base, INIT_HCA_ALTC_BASE_OFFSET); + MLX4_PUT(inbox, param->auxc_base, INIT_HCA_AUXC_BASE_OFFSET); + MLX4_PUT(inbox, param->eqc_base, INIT_HCA_EQC_BASE_OFFSET); + MLX4_PUT(inbox, param->log_num_eqs, INIT_HCA_LOG_EQ_OFFSET); + MLX4_PUT(inbox, param->rdmarc_base, INIT_HCA_RDMARC_BASE_OFFSET); + MLX4_PUT(inbox, param->log_rd_per_qp, INIT_HCA_LOG_RD_OFFSET); + + /* steering attributes */ + if (dev->caps.steering_mode == + MLX4_STEERING_MODE_DEVICE_MANAGED) { + *(inbox + INIT_HCA_FLAGS_OFFSET / 4) |= + cpu_to_be32(1 << + INIT_HCA_DEVICE_MANAGED_FLOW_STEERING_EN); + + MLX4_PUT(inbox, param->mc_base, INIT_HCA_FS_BASE_OFFSET); + MLX4_PUT(inbox, param->log_mc_entry_sz, + INIT_HCA_FS_LOG_ENTRY_SZ_OFFSET); + MLX4_PUT(inbox, param->log_mc_table_sz, + INIT_HCA_FS_LOG_TABLE_SZ_OFFSET); + /* Enable Ethernet flow steering + * with udp unicast and tcp unicast + */ + MLX4_PUT(inbox, (u8) (MLX4_FS_UDP_UC_EN | MLX4_FS_TCP_UC_EN), + INIT_HCA_FS_ETH_BITS_OFFSET); + MLX4_PUT(inbox, (u16) MLX4_FS_NUM_OF_L2_ADDR, + INIT_HCA_FS_ETH_NUM_ADDRS_OFFSET); + /* Enable IPoIB flow steering + * with udp unicast and tcp unicast + */ + MLX4_PUT(inbox, (u8) (MLX4_FS_UDP_UC_EN | MLX4_FS_TCP_UC_EN), + INIT_HCA_FS_IB_BITS_OFFSET); + MLX4_PUT(inbox, (u16) MLX4_FS_NUM_OF_L2_ADDR, + INIT_HCA_FS_IB_NUM_ADDRS_OFFSET); + } else { + MLX4_PUT(inbox, param->mc_base, INIT_HCA_MC_BASE_OFFSET); + MLX4_PUT(inbox, param->log_mc_entry_sz, + INIT_HCA_LOG_MC_ENTRY_SZ_OFFSET); + MLX4_PUT(inbox, param->log_mc_hash_sz, + INIT_HCA_LOG_MC_HASH_SZ_OFFSET); + MLX4_PUT(inbox, param->log_mc_table_sz, + INIT_HCA_LOG_MC_TABLE_SZ_OFFSET); + if (dev->caps.steering_mode == MLX4_STEERING_MODE_B0) + MLX4_PUT(inbox, (u8) (1 << 3), + INIT_HCA_UC_STEERING_OFFSET); + } + + /* TPT attributes */ + + MLX4_PUT(inbox, param->dmpt_base, INIT_HCA_DMPT_BASE_OFFSET); + MLX4_PUT(inbox, param->mw_enabled, INIT_HCA_TPT_MW_OFFSET); + MLX4_PUT(inbox, param->log_mpt_sz, INIT_HCA_LOG_MPT_SZ_OFFSET); + MLX4_PUT(inbox, param->mtt_base, INIT_HCA_MTT_BASE_OFFSET); + MLX4_PUT(inbox, param->cmpt_base, INIT_HCA_CMPT_BASE_OFFSET); + + /* UAR attributes */ + + MLX4_PUT(inbox, param->uar_page_sz, INIT_HCA_UAR_PAGE_SZ_OFFSET); + MLX4_PUT(inbox, param->log_uar_sz, INIT_HCA_LOG_UAR_SZ_OFFSET); + + /* set parser VXLAN attributes */ + if (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_VXLAN_OFFLOADS) { + u8 parser_params = 0; + MLX4_PUT(inbox, parser_params, INIT_HCA_VXLAN_OFFSET); + } + + err = mlx4_cmd(dev, mailbox->dma, 0, 0, MLX4_CMD_INIT_HCA, 10000, + MLX4_CMD_NATIVE); + + if (err) + mlx4_err(dev, "INIT_HCA returns %d\n", err); + + mlx4_free_cmd_mailbox(dev, mailbox); + return err; +} + +int mlx4_QUERY_HCA(struct mlx4_dev *dev, + struct mlx4_init_hca_param *param) +{ + struct mlx4_cmd_mailbox *mailbox; + __be32 *outbox; + u32 dword_field; + int err; + u8 byte_field; + +#define QUERY_HCA_GLOBAL_CAPS_OFFSET 0x04 +#define QUERY_HCA_CORE_CLOCK_OFFSET 0x0c + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + outbox = mailbox->buf; + + err = mlx4_cmd_box(dev, 0, mailbox->dma, 0, 0, + MLX4_CMD_QUERY_HCA, + MLX4_CMD_TIME_CLASS_B, + !mlx4_is_slave(dev)); + if (err) + goto out; + + MLX4_GET(param->global_caps, outbox, QUERY_HCA_GLOBAL_CAPS_OFFSET); + MLX4_GET(param->hca_core_clock, outbox, QUERY_HCA_CORE_CLOCK_OFFSET); + + /* QPC/EEC/CQC/EQC/RDMARC attributes */ + + MLX4_GET(param->qpc_base, outbox, INIT_HCA_QPC_BASE_OFFSET); + MLX4_GET(param->log_num_qps, outbox, INIT_HCA_LOG_QP_OFFSET); + MLX4_GET(param->srqc_base, outbox, INIT_HCA_SRQC_BASE_OFFSET); + MLX4_GET(param->log_num_srqs, outbox, INIT_HCA_LOG_SRQ_OFFSET); + MLX4_GET(param->cqc_base, outbox, INIT_HCA_CQC_BASE_OFFSET); + MLX4_GET(param->log_num_cqs, outbox, INIT_HCA_LOG_CQ_OFFSET); + MLX4_GET(param->altc_base, outbox, INIT_HCA_ALTC_BASE_OFFSET); + MLX4_GET(param->auxc_base, outbox, INIT_HCA_AUXC_BASE_OFFSET); + MLX4_GET(param->eqc_base, outbox, INIT_HCA_EQC_BASE_OFFSET); + MLX4_GET(param->log_num_eqs, outbox, INIT_HCA_LOG_EQ_OFFSET); + MLX4_GET(param->rdmarc_base, outbox, INIT_HCA_RDMARC_BASE_OFFSET); + MLX4_GET(param->log_rd_per_qp, outbox, INIT_HCA_LOG_RD_OFFSET); + + MLX4_GET(dword_field, outbox, INIT_HCA_FLAGS_OFFSET); + if (dword_field & (1 << INIT_HCA_DEVICE_MANAGED_FLOW_STEERING_EN)) { + param->steering_mode = MLX4_STEERING_MODE_DEVICE_MANAGED; + } else { + MLX4_GET(byte_field, outbox, INIT_HCA_UC_STEERING_OFFSET); + if (byte_field & 0x8) + param->steering_mode = MLX4_STEERING_MODE_B0; + else + param->steering_mode = MLX4_STEERING_MODE_A0; + } + /* steering attributes */ + if (param->steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED) { + MLX4_GET(param->mc_base, outbox, INIT_HCA_FS_BASE_OFFSET); + MLX4_GET(param->log_mc_entry_sz, outbox, + INIT_HCA_FS_LOG_ENTRY_SZ_OFFSET); + MLX4_GET(param->log_mc_table_sz, outbox, + INIT_HCA_FS_LOG_TABLE_SZ_OFFSET); + } else { + MLX4_GET(param->mc_base, outbox, INIT_HCA_MC_BASE_OFFSET); + MLX4_GET(param->log_mc_entry_sz, outbox, + INIT_HCA_LOG_MC_ENTRY_SZ_OFFSET); + MLX4_GET(param->log_mc_hash_sz, outbox, + INIT_HCA_LOG_MC_HASH_SZ_OFFSET); + MLX4_GET(param->log_mc_table_sz, outbox, + INIT_HCA_LOG_MC_TABLE_SZ_OFFSET); + } + + /* CX3 is capable of extending CQEs/EQEs from 32 to 64 bytes */ + MLX4_GET(byte_field, outbox, INIT_HCA_EQE_CQE_OFFSETS); + if (byte_field & 0x20) /* 64-bytes eqe enabled */ + param->dev_cap_enabled |= MLX4_DEV_CAP_64B_EQE_ENABLED; + if (byte_field & 0x40) /* 64-bytes cqe enabled */ + param->dev_cap_enabled |= MLX4_DEV_CAP_64B_CQE_ENABLED; + + /* CX3 is capable of extending CQEs\EQEs to strides larger than 64B */ + MLX4_GET(byte_field, outbox, INIT_HCA_EQE_CQE_STRIDE_OFFSET); + if (byte_field) { + param->dev_cap_enabled |= MLX4_DEV_CAP_64B_EQE_ENABLED; + param->dev_cap_enabled |= MLX4_DEV_CAP_64B_CQE_ENABLED; + param->cqe_size = 1 << ((byte_field & + MLX4_CQE_SIZE_MASK_STRIDE) + 5); + param->eqe_size = 1 << (((byte_field & + MLX4_EQE_SIZE_MASK_STRIDE) >> 4) + 5); + } + + /* TPT attributes */ + + MLX4_GET(param->dmpt_base, outbox, INIT_HCA_DMPT_BASE_OFFSET); + MLX4_GET(param->mw_enabled, outbox, INIT_HCA_TPT_MW_OFFSET); + MLX4_GET(param->log_mpt_sz, outbox, INIT_HCA_LOG_MPT_SZ_OFFSET); + MLX4_GET(param->mtt_base, outbox, INIT_HCA_MTT_BASE_OFFSET); + MLX4_GET(param->cmpt_base, outbox, INIT_HCA_CMPT_BASE_OFFSET); + + /* UAR attributes */ + + MLX4_GET(param->uar_page_sz, outbox, INIT_HCA_UAR_PAGE_SZ_OFFSET); + MLX4_GET(param->log_uar_sz, outbox, INIT_HCA_LOG_UAR_SZ_OFFSET); + +out: + mlx4_free_cmd_mailbox(dev, mailbox); + + return err; +} + +/* for IB-type ports only in SRIOV mode. Checks that both proxy QP0 + * and real QP0 are active, so that the paravirtualized QP0 is ready + * to operate */ +static int check_qp0_state(struct mlx4_dev *dev, int function, int port) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + /* irrelevant if not infiniband */ + if (priv->mfunc.master.qp0_state[port].proxy_qp0_active && + priv->mfunc.master.qp0_state[port].qp0_active) + return 1; + return 0; +} + +int mlx4_INIT_PORT_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + int port = mlx4_slave_convert_port(dev, slave, vhcr->in_modifier); + int err; + + if (port < 0) + return -EINVAL; + + if (priv->mfunc.master.slave_state[slave].init_port_mask & (1 << port)) + return 0; + + if (dev->caps.port_mask[port] != MLX4_PORT_TYPE_IB) { + /* Enable port only if it was previously disabled */ + if (!priv->mfunc.master.init_port_ref[port]) { + err = mlx4_cmd(dev, 0, port, 0, MLX4_CMD_INIT_PORT, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); + if (err) + return err; + } + priv->mfunc.master.slave_state[slave].init_port_mask |= (1 << port); + } else { + if (slave == mlx4_master_func_num(dev)) { + if (check_qp0_state(dev, slave, port) && + !priv->mfunc.master.qp0_state[port].port_active) { + err = mlx4_cmd(dev, 0, port, 0, MLX4_CMD_INIT_PORT, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); + if (err) + return err; + priv->mfunc.master.qp0_state[port].port_active = 1; + priv->mfunc.master.slave_state[slave].init_port_mask |= (1 << port); + } + } else + priv->mfunc.master.slave_state[slave].init_port_mask |= (1 << port); + } + ++priv->mfunc.master.init_port_ref[port]; + return 0; +} + +int mlx4_INIT_PORT(struct mlx4_dev *dev, int port) +{ + struct mlx4_cmd_mailbox *mailbox; + u32 *inbox; + int err; + u32 flags; + u16 field; + + if (dev->flags & MLX4_FLAG_OLD_PORT_CMDS) { +#define INIT_PORT_IN_SIZE 256 +#define INIT_PORT_FLAGS_OFFSET 0x00 +#define INIT_PORT_FLAG_SIG (1 << 18) +#define INIT_PORT_FLAG_NG (1 << 17) +#define INIT_PORT_FLAG_G0 (1 << 16) +#define INIT_PORT_VL_SHIFT 4 +#define INIT_PORT_PORT_WIDTH_SHIFT 8 +#define INIT_PORT_MTU_OFFSET 0x04 +#define INIT_PORT_MAX_GID_OFFSET 0x06 +#define INIT_PORT_MAX_PKEY_OFFSET 0x0a +#define INIT_PORT_GUID0_OFFSET 0x10 +#define INIT_PORT_NODE_GUID_OFFSET 0x18 +#define INIT_PORT_SI_GUID_OFFSET 0x20 + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + inbox = mailbox->buf; + + flags = 0; + flags |= (dev->caps.vl_cap[port] & 0xf) << INIT_PORT_VL_SHIFT; + flags |= (dev->caps.port_width_cap[port] & 0xf) << INIT_PORT_PORT_WIDTH_SHIFT; + MLX4_PUT(inbox, flags, INIT_PORT_FLAGS_OFFSET); + + field = 128 << dev->caps.ib_mtu_cap[port]; + MLX4_PUT(inbox, field, INIT_PORT_MTU_OFFSET); + field = dev->caps.gid_table_len[port]; + MLX4_PUT(inbox, field, INIT_PORT_MAX_GID_OFFSET); + field = dev->caps.pkey_table_len[port]; + MLX4_PUT(inbox, field, INIT_PORT_MAX_PKEY_OFFSET); + + err = mlx4_cmd(dev, mailbox->dma, port, 0, MLX4_CMD_INIT_PORT, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); + + mlx4_free_cmd_mailbox(dev, mailbox); + } else + err = mlx4_cmd(dev, 0, port, 0, MLX4_CMD_INIT_PORT, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED); + + return err; +} +EXPORT_SYMBOL_GPL(mlx4_INIT_PORT); + +int mlx4_CLOSE_PORT_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + int port = mlx4_slave_convert_port(dev, slave, vhcr->in_modifier); + int err; + + if (port < 0) + return -EINVAL; + + if (!(priv->mfunc.master.slave_state[slave].init_port_mask & + (1 << port))) + return 0; + + if (dev->caps.port_mask[port] != MLX4_PORT_TYPE_IB) { + if (priv->mfunc.master.init_port_ref[port] == 1) { + err = mlx4_cmd(dev, 0, port, 0, MLX4_CMD_CLOSE_PORT, + 1000, MLX4_CMD_NATIVE); + if (err) + return err; + } + priv->mfunc.master.slave_state[slave].init_port_mask &= ~(1 << port); + } else { + /* infiniband port */ + if (slave == mlx4_master_func_num(dev)) { + if (!priv->mfunc.master.qp0_state[port].qp0_active && + priv->mfunc.master.qp0_state[port].port_active) { + err = mlx4_cmd(dev, 0, port, 0, MLX4_CMD_CLOSE_PORT, + 1000, MLX4_CMD_NATIVE); + if (err) + return err; + priv->mfunc.master.slave_state[slave].init_port_mask &= ~(1 << port); + priv->mfunc.master.qp0_state[port].port_active = 0; + } + } else + priv->mfunc.master.slave_state[slave].init_port_mask &= ~(1 << port); + } + --priv->mfunc.master.init_port_ref[port]; + return 0; +} + +int mlx4_CLOSE_PORT(struct mlx4_dev *dev, int port) +{ + return mlx4_cmd(dev, 0, port, 0, MLX4_CMD_CLOSE_PORT, 1000, + MLX4_CMD_WRAPPED); +} +EXPORT_SYMBOL_GPL(mlx4_CLOSE_PORT); + +int mlx4_CLOSE_HCA(struct mlx4_dev *dev, int panic) +{ + return mlx4_cmd(dev, 0, 0, panic, MLX4_CMD_CLOSE_HCA, 1000, + MLX4_CMD_NATIVE); +} + +struct mlx4_config_dev { + __be32 update_flags; + __be32 rsdv1[3]; + __be16 vxlan_udp_dport; + __be16 rsvd2; +}; + +#define MLX4_VXLAN_UDP_DPORT (1 << 0) + +static int mlx4_CONFIG_DEV(struct mlx4_dev *dev, struct mlx4_config_dev *config_dev) +{ + int err; + struct mlx4_cmd_mailbox *mailbox; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + + memcpy(mailbox->buf, config_dev, sizeof(*config_dev)); + + err = mlx4_cmd(dev, mailbox->dma, 0, 0, MLX4_CMD_CONFIG_DEV, + MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE); + + mlx4_free_cmd_mailbox(dev, mailbox); + return err; +} + +int mlx4_config_vxlan_port(struct mlx4_dev *dev, __be16 udp_port) +{ + struct mlx4_config_dev config_dev; + + memset(&config_dev, 0, sizeof(config_dev)); + config_dev.update_flags = cpu_to_be32(MLX4_VXLAN_UDP_DPORT); + config_dev.vxlan_udp_dport = udp_port; + + return mlx4_CONFIG_DEV(dev, &config_dev); +} +EXPORT_SYMBOL_GPL(mlx4_config_vxlan_port); + + +int mlx4_SET_ICM_SIZE(struct mlx4_dev *dev, u64 icm_size, u64 *aux_pages) +{ + int ret = mlx4_cmd_imm(dev, icm_size, aux_pages, 0, 0, + MLX4_CMD_SET_ICM_SIZE, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); + if (ret) + return ret; + + /* + * Round up number of system pages needed in case + * MLX4_ICM_PAGE_SIZE < PAGE_SIZE. + */ + *aux_pages = ALIGN(*aux_pages, PAGE_SIZE / MLX4_ICM_PAGE_SIZE) >> + (PAGE_SHIFT - MLX4_ICM_PAGE_SHIFT); + + return 0; +} + +int mlx4_NOP(struct mlx4_dev *dev) +{ + /* Input modifier of 0x1f means "finish as soon as possible." */ + return mlx4_cmd(dev, 0, 0x1f, 0, MLX4_CMD_NOP, 100, MLX4_CMD_NATIVE); +} + +int mlx4_get_phys_port_id(struct mlx4_dev *dev) +{ + u8 port; + u32 *outbox; + struct mlx4_cmd_mailbox *mailbox; + u32 in_mod; + u32 guid_hi, guid_lo; + int err, ret = 0; +#define MOD_STAT_CFG_PORT_OFFSET 8 +#define MOD_STAT_CFG_GUID_H 0X14 +#define MOD_STAT_CFG_GUID_L 0X1c + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + outbox = mailbox->buf; + + for (port = 1; port <= dev->caps.num_ports; port++) { + in_mod = port << MOD_STAT_CFG_PORT_OFFSET; + err = mlx4_cmd_box(dev, 0, mailbox->dma, in_mod, 0x2, + MLX4_CMD_MOD_STAT_CFG, MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_NATIVE); + if (err) { + mlx4_err(dev, "Fail to get port %d uplink guid\n", + port); + ret = err; + } else { + MLX4_GET(guid_hi, outbox, MOD_STAT_CFG_GUID_H); + MLX4_GET(guid_lo, outbox, MOD_STAT_CFG_GUID_L); + dev->caps.phys_port_id[port] = (u64)guid_lo | + (u64)guid_hi << 32; + } + } + mlx4_free_cmd_mailbox(dev, mailbox); + return ret; +} + +#define MLX4_WOL_SETUP_MODE (5 << 28) +int mlx4_wol_read(struct mlx4_dev *dev, u64 *config, int port) +{ + u32 in_mod = MLX4_WOL_SETUP_MODE | port << 8; + + return mlx4_cmd_imm(dev, 0, config, in_mod, 0x3, + MLX4_CMD_MOD_STAT_CFG, MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_NATIVE); +} +EXPORT_SYMBOL_GPL(mlx4_wol_read); + +int mlx4_wol_write(struct mlx4_dev *dev, u64 config, int port) +{ + u32 in_mod = MLX4_WOL_SETUP_MODE | port << 8; + + return mlx4_cmd(dev, config, in_mod, 0x1, MLX4_CMD_MOD_STAT_CFG, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); +} +EXPORT_SYMBOL_GPL(mlx4_wol_write); + +enum { + ADD_TO_MCG = 0x26, +}; + + +void mlx4_opreq_action(struct work_struct *work) +{ + struct mlx4_priv *priv = container_of(work, struct mlx4_priv, + opreq_task); + struct mlx4_dev *dev = &priv->dev; + int num_tasks = atomic_read(&priv->opreq_count); + struct mlx4_cmd_mailbox *mailbox; + struct mlx4_mgm *mgm; + u32 *outbox; + u32 modifier; + u16 token; + u16 type; + int err; + u32 num_qps; + struct mlx4_qp qp; + int i; + u8 rem_mcg; + u8 prot; + +#define GET_OP_REQ_MODIFIER_OFFSET 0x08 +#define GET_OP_REQ_TOKEN_OFFSET 0x14 +#define GET_OP_REQ_TYPE_OFFSET 0x1a +#define GET_OP_REQ_DATA_OFFSET 0x20 + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) { + mlx4_err(dev, "Failed to allocate mailbox for GET_OP_REQ\n"); + return; + } + outbox = mailbox->buf; + + while (num_tasks) { + err = mlx4_cmd_box(dev, 0, mailbox->dma, 0, 0, + MLX4_CMD_GET_OP_REQ, MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_NATIVE); + if (err) { + mlx4_err(dev, "Failed to retrieve required operation: %d\n", + err); + return; + } + MLX4_GET(modifier, outbox, GET_OP_REQ_MODIFIER_OFFSET); + MLX4_GET(token, outbox, GET_OP_REQ_TOKEN_OFFSET); + MLX4_GET(type, outbox, GET_OP_REQ_TYPE_OFFSET); + type &= 0xfff; + + switch (type) { + case ADD_TO_MCG: + if (dev->caps.steering_mode == + MLX4_STEERING_MODE_DEVICE_MANAGED) { + mlx4_warn(dev, "ADD MCG operation is not supported in DEVICE_MANAGED steering mode\n"); + err = EPERM; + break; + } + mgm = (struct mlx4_mgm *)((u8 *)(outbox) + + GET_OP_REQ_DATA_OFFSET); + num_qps = be32_to_cpu(mgm->members_count) & + MGM_QPN_MASK; + rem_mcg = ((u8 *)(&mgm->members_count))[0] & 1; + prot = ((u8 *)(&mgm->members_count))[0] >> 6; + + for (i = 0; i < num_qps; i++) { + qp.qpn = be32_to_cpu(mgm->qp[i]); + if (rem_mcg) + err = mlx4_multicast_detach(dev, &qp, + mgm->gid, + prot, 0); + else + err = mlx4_multicast_attach(dev, &qp, + mgm->gid, + mgm->gid[5] + , 0, prot, + NULL); + if (err) + break; + } + break; + default: + mlx4_warn(dev, "Bad type for required operation\n"); + err = EINVAL; + break; + } + err = mlx4_cmd(dev, 0, ((u32) err | + (__force u32)cpu_to_be32(token) << 16), + 1, MLX4_CMD_GET_OP_REQ, MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_NATIVE); + if (err) { + mlx4_err(dev, "Failed to acknowledge required request: %d\n", + err); + goto out; + } + memset(outbox, 0, 0xffc); + num_tasks = atomic_dec_return(&priv->opreq_count); + } + +out: + mlx4_free_cmd_mailbox(dev, mailbox); +} + +static int mlx4_check_smp_firewall_active(struct mlx4_dev *dev, + struct mlx4_cmd_mailbox *mailbox) +{ +#define MLX4_CMD_MAD_DEMUX_SET_ATTR_OFFSET 0x10 +#define MLX4_CMD_MAD_DEMUX_GETRESP_ATTR_OFFSET 0x20 +#define MLX4_CMD_MAD_DEMUX_TRAP_ATTR_OFFSET 0x40 +#define MLX4_CMD_MAD_DEMUX_TRAP_REPRESS_ATTR_OFFSET 0x70 + + u32 set_attr_mask, getresp_attr_mask; + u32 trap_attr_mask, traprepress_attr_mask; + + MLX4_GET(set_attr_mask, mailbox->buf, + MLX4_CMD_MAD_DEMUX_SET_ATTR_OFFSET); + mlx4_dbg(dev, "SMP firewall set_attribute_mask = 0x%x\n", + set_attr_mask); + + MLX4_GET(getresp_attr_mask, mailbox->buf, + MLX4_CMD_MAD_DEMUX_GETRESP_ATTR_OFFSET); + mlx4_dbg(dev, "SMP firewall getresp_attribute_mask = 0x%x\n", + getresp_attr_mask); + + MLX4_GET(trap_attr_mask, mailbox->buf, + MLX4_CMD_MAD_DEMUX_TRAP_ATTR_OFFSET); + mlx4_dbg(dev, "SMP firewall trap_attribute_mask = 0x%x\n", + trap_attr_mask); + + MLX4_GET(traprepress_attr_mask, mailbox->buf, + MLX4_CMD_MAD_DEMUX_TRAP_REPRESS_ATTR_OFFSET); + mlx4_dbg(dev, "SMP firewall traprepress_attribute_mask = 0x%x\n", + traprepress_attr_mask); + + if (set_attr_mask && getresp_attr_mask && trap_attr_mask && + traprepress_attr_mask) + return 1; + + return 0; +} + +int mlx4_config_mad_demux(struct mlx4_dev *dev) +{ + struct mlx4_cmd_mailbox *mailbox; + int secure_host_active; + int err; + + /* Check if mad_demux is supported */ + if (!(dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_MAD_DEMUX)) + return 0; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) { + mlx4_warn(dev, "Failed to allocate mailbox for cmd MAD_DEMUX"); + return -ENOMEM; + } + + /* Query mad_demux to find out which MADs are handled by internal sma */ + err = mlx4_cmd_box(dev, 0, mailbox->dma, 0x01 /* subn mgmt class */, + MLX4_CMD_MAD_DEMUX_QUERY_RESTR, MLX4_CMD_MAD_DEMUX, + MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE); + if (err) { + mlx4_warn(dev, "MLX4_CMD_MAD_DEMUX: query restrictions failed (%d)\n", + err); + goto out; + } + + secure_host_active = mlx4_check_smp_firewall_active(dev, mailbox); + + /* Config mad_demux to handle all MADs returned by the query above */ + err = mlx4_cmd(dev, mailbox->dma, 0x01 /* subn mgmt class */, + MLX4_CMD_MAD_DEMUX_CONFIG, MLX4_CMD_MAD_DEMUX, + MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE); + if (err) { + mlx4_warn(dev, "MLX4_CMD_MAD_DEMUX: configure failed (%d)\n", err); + goto out; + } + + if (secure_host_active) + mlx4_warn(dev, "HCA operating in secure-host mode. SMP firewall activated.\n"); +out: + mlx4_free_cmd_mailbox(dev, mailbox); + return err; +} diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.h b/drivers/net/ethernet/mellanox/mlx4/fw.h new file mode 100644 index 0000000..9b835ae --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4/fw.h @@ -0,0 +1,230 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005, 2006, 2007, 2008 Mellanox Technologies. All rights reserved. + * Copyright (c) 2006, 2007 Cisco Systems. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef MLX4_FW_H +#define MLX4_FW_H + +#include "mlx4.h" +#include "icm.h" + +struct mlx4_mod_stat_cfg { + u8 log_pg_sz; + u8 log_pg_sz_m; +}; + +struct mlx4_dev_cap { + int max_srq_sz; + int max_qp_sz; + int reserved_qps; + int max_qps; + int reserved_srqs; + int max_srqs; + int max_cq_sz; + int reserved_cqs; + int max_cqs; + int max_mpts; + int reserved_eqs; + int max_eqs; + int reserved_mtts; + int max_mrw_sz; + int reserved_mrws; + int max_mtt_seg; + int max_requester_per_qp; + int max_responder_per_qp; + int max_rdma_global; + int local_ca_ack_delay; + int num_ports; + u32 max_msg_sz; + int ib_mtu[MLX4_MAX_PORTS + 1]; + int max_port_width[MLX4_MAX_PORTS + 1]; + int max_vl[MLX4_MAX_PORTS + 1]; + int max_gids[MLX4_MAX_PORTS + 1]; + int max_pkeys[MLX4_MAX_PORTS + 1]; + u64 def_mac[MLX4_MAX_PORTS + 1]; + u16 eth_mtu[MLX4_MAX_PORTS + 1]; + int trans_type[MLX4_MAX_PORTS + 1]; + int vendor_oui[MLX4_MAX_PORTS + 1]; + u16 wavelength[MLX4_MAX_PORTS + 1]; + u64 trans_code[MLX4_MAX_PORTS + 1]; + u16 stat_rate_support; + int fs_log_max_ucast_qp_range_size; + int fs_max_num_qp_per_entry; + u64 flags; + u64 flags2; + int reserved_uars; + int uar_size; + int min_page_sz; + int bf_reg_size; + int bf_regs_per_page; + int max_sq_sg; + int max_sq_desc_sz; + int max_rq_sg; + int max_rq_desc_sz; + int max_qp_per_mcg; + int reserved_mgms; + int max_mcgs; + int reserved_pds; + int max_pds; + int reserved_xrcds; + int max_xrcds; + int qpc_entry_sz; + int rdmarc_entry_sz; + int altc_entry_sz; + int aux_entry_sz; + int srq_entry_sz; + int cqc_entry_sz; + int eqc_entry_sz; + int dmpt_entry_sz; + int cmpt_entry_sz; + int mtt_entry_sz; + int resize_srq; + u32 bmme_flags; + u32 reserved_lkey; + u64 max_icm_sz; + int max_gso_sz; + int max_rss_tbl_sz; + u8 supported_port_types[MLX4_MAX_PORTS + 1]; + u8 suggested_type[MLX4_MAX_PORTS + 1]; + u8 default_sense[MLX4_MAX_PORTS + 1]; + u8 log_max_macs[MLX4_MAX_PORTS + 1]; + u8 log_max_vlans[MLX4_MAX_PORTS + 1]; + u32 max_counters; +}; + +struct mlx4_func_cap { + u8 num_ports; + u8 flags; + u32 pf_context_behaviour; + int qp_quota; + int cq_quota; + int srq_quota; + int mpt_quota; + int mtt_quota; + int max_eq; + int reserved_eq; + int mcg_quota; + u32 qp0_qkey; + u32 qp0_tunnel_qpn; + u32 qp0_proxy_qpn; + u32 qp1_tunnel_qpn; + u32 qp1_proxy_qpn; + u8 physical_port; + u8 port_flags; + u8 flags1; + u64 phys_port_id; +}; + +struct mlx4_adapter { + char board_id[MLX4_BOARD_ID_LEN]; + u8 inta_pin; +}; + +struct mlx4_init_hca_param { + u64 qpc_base; + u64 rdmarc_base; + u64 auxc_base; + u64 altc_base; + u64 srqc_base; + u64 cqc_base; + u64 eqc_base; + u64 mc_base; + u64 dmpt_base; + u64 cmpt_base; + u64 mtt_base; + u64 global_caps; + u16 log_mc_entry_sz; + u16 log_mc_hash_sz; + u16 hca_core_clock; /* Internal Clock Frequency (in MHz) */ + u8 log_num_qps; + u8 log_num_srqs; + u8 log_num_cqs; + u8 log_num_eqs; + u8 log_rd_per_qp; + u8 log_mc_table_sz; + u8 log_mpt_sz; + u8 log_uar_sz; + u8 mw_enabled; /* Enable memory windows */ + u8 uar_page_sz; /* log pg sz in 4k chunks */ + u8 steering_mode; /* for QUERY_HCA */ + u64 dev_cap_enabled; + u16 cqe_size; /* For use only when CQE stride feature enabled */ + u16 eqe_size; /* For use only when EQE stride feature enabled */ +}; + +struct mlx4_init_ib_param { + int port_width; + int vl_cap; + int mtu_cap; + u16 gid_cap; + u16 pkey_cap; + int set_guid0; + u64 guid0; + int set_node_guid; + u64 node_guid; + int set_si_guid; + u64 si_guid; +}; + +struct mlx4_set_ib_param { + int set_si_guid; + int reset_qkey_viol; + u64 si_guid; + u32 cap_mask; +}; + +int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap); +int mlx4_QUERY_FUNC_CAP(struct mlx4_dev *dev, u32 gen_or_port, + struct mlx4_func_cap *func_cap); +int mlx4_QUERY_FUNC_CAP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_MAP_FA(struct mlx4_dev *dev, struct mlx4_icm *icm); +int mlx4_UNMAP_FA(struct mlx4_dev *dev); +int mlx4_RUN_FW(struct mlx4_dev *dev); +int mlx4_QUERY_FW(struct mlx4_dev *dev); +int mlx4_QUERY_ADAPTER(struct mlx4_dev *dev, struct mlx4_adapter *adapter); +int mlx4_INIT_HCA(struct mlx4_dev *dev, struct mlx4_init_hca_param *param); +int mlx4_QUERY_HCA(struct mlx4_dev *dev, struct mlx4_init_hca_param *param); +int mlx4_CLOSE_HCA(struct mlx4_dev *dev, int panic); +int mlx4_map_cmd(struct mlx4_dev *dev, u16 op, struct mlx4_icm *icm, u64 virt); +int mlx4_SET_ICM_SIZE(struct mlx4_dev *dev, u64 icm_size, u64 *aux_pages); +int mlx4_MAP_ICM_AUX(struct mlx4_dev *dev, struct mlx4_icm *icm); +int mlx4_UNMAP_ICM_AUX(struct mlx4_dev *dev); +int mlx4_NOP(struct mlx4_dev *dev); +int mlx4_MOD_STAT_CFG(struct mlx4_dev *dev, struct mlx4_mod_stat_cfg *cfg); +void mlx4_opreq_action(struct work_struct *work); + +#endif /* MLX4_FW_H */ diff --git a/drivers/net/ethernet/mellanox/mlx4/icm.c b/drivers/net/ethernet/mellanox/mlx4/icm.c new file mode 100644 index 0000000..97c9b1d --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4/icm.c @@ -0,0 +1,461 @@ +/* + * Copyright (c) 2005, 2006, 2007, 2008 Mellanox Technologies. All rights reserved. + * Copyright (c) 2006, 2007 Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include + +#include + +#include "mlx4.h" +#include "icm.h" +#include "fw.h" + +/* + * We allocate in as big chunks as we can, up to a maximum of 256 KB + * per chunk. + */ +enum { + MLX4_ICM_ALLOC_SIZE = 1 << 18, + MLX4_TABLE_CHUNK_SIZE = 1 << 18 +}; + +static void mlx4_free_icm_pages(struct mlx4_dev *dev, struct mlx4_icm_chunk *chunk) +{ + int i; + + if (chunk->nsg > 0) + pci_unmap_sg(dev->pdev, chunk->mem, chunk->npages, + PCI_DMA_BIDIRECTIONAL); + + for (i = 0; i < chunk->npages; ++i) + __free_pages(sg_page(&chunk->mem[i]), + get_order(chunk->mem[i].length)); +} + +static void mlx4_free_icm_coherent(struct mlx4_dev *dev, struct mlx4_icm_chunk *chunk) +{ + int i; + + for (i = 0; i < chunk->npages; ++i) + dma_free_coherent(&dev->pdev->dev, chunk->mem[i].length, + lowmem_page_address(sg_page(&chunk->mem[i])), + sg_dma_address(&chunk->mem[i])); +} + +void mlx4_free_icm(struct mlx4_dev *dev, struct mlx4_icm *icm, int coherent) +{ + struct mlx4_icm_chunk *chunk, *tmp; + + if (!icm) + return; + + list_for_each_entry_safe(chunk, tmp, &icm->chunk_list, list) { + if (coherent) + mlx4_free_icm_coherent(dev, chunk); + else + mlx4_free_icm_pages(dev, chunk); + + kfree(chunk); + } + + kfree(icm); +} + +static int mlx4_alloc_icm_pages(struct scatterlist *mem, int order, + gfp_t gfp_mask, int node) +{ + struct page *page; + + page = alloc_pages_node(node, gfp_mask, order); + if (!page) { + page = alloc_pages(gfp_mask, order); + if (!page) + return -ENOMEM; + } + + sg_set_page(mem, page, PAGE_SIZE << order, 0); + return 0; +} + +static int mlx4_alloc_icm_coherent(struct device *dev, struct scatterlist *mem, + int order, gfp_t gfp_mask) +{ + void *buf = dma_alloc_coherent(dev, PAGE_SIZE << order, + &sg_dma_address(mem), gfp_mask); + if (!buf) + return -ENOMEM; + + sg_set_buf(mem, buf, PAGE_SIZE << order); + BUG_ON(mem->offset); + sg_dma_len(mem) = PAGE_SIZE << order; + return 0; +} + +struct mlx4_icm *mlx4_alloc_icm(struct mlx4_dev *dev, int npages, + gfp_t gfp_mask, int coherent) +{ + struct mlx4_icm *icm; + struct mlx4_icm_chunk *chunk = NULL; + int cur_order; + int ret; + + /* We use sg_set_buf for coherent allocs, which assumes low memory */ + BUG_ON(coherent && (gfp_mask & __GFP_HIGHMEM)); + + icm = kmalloc_node(sizeof(*icm), + gfp_mask & ~(__GFP_HIGHMEM | __GFP_NOWARN), + dev->numa_node); + if (!icm) { + icm = kmalloc(sizeof(*icm), + gfp_mask & ~(__GFP_HIGHMEM | __GFP_NOWARN)); + if (!icm) + return NULL; + } + + icm->refcount = 0; + INIT_LIST_HEAD(&icm->chunk_list); + + cur_order = get_order(MLX4_ICM_ALLOC_SIZE); + + while (npages > 0) { + if (!chunk) { + chunk = kmalloc_node(sizeof(*chunk), + gfp_mask & ~(__GFP_HIGHMEM | + __GFP_NOWARN), + dev->numa_node); + if (!chunk) { + chunk = kmalloc(sizeof(*chunk), + gfp_mask & ~(__GFP_HIGHMEM | + __GFP_NOWARN)); + if (!chunk) + goto fail; + } + + sg_init_table(chunk->mem, MLX4_ICM_CHUNK_LEN); + chunk->npages = 0; + chunk->nsg = 0; + list_add_tail(&chunk->list, &icm->chunk_list); + } + + while (1 << cur_order > npages) + --cur_order; + + if (coherent) + ret = mlx4_alloc_icm_coherent(&dev->pdev->dev, + &chunk->mem[chunk->npages], + cur_order, gfp_mask); + else + ret = mlx4_alloc_icm_pages(&chunk->mem[chunk->npages], + cur_order, gfp_mask, + dev->numa_node); + + if (ret) { + if (--cur_order < 0) + goto fail; + else + continue; + } + + ++chunk->npages; + + if (coherent) + ++chunk->nsg; + else if (chunk->npages == MLX4_ICM_CHUNK_LEN) { + chunk->nsg = pci_map_sg(dev->pdev, chunk->mem, + chunk->npages, + PCI_DMA_BIDIRECTIONAL); + + if (chunk->nsg <= 0) + goto fail; + } + + if (chunk->npages == MLX4_ICM_CHUNK_LEN) + chunk = NULL; + + npages -= 1 << cur_order; + } + + if (!coherent && chunk) { + chunk->nsg = pci_map_sg(dev->pdev, chunk->mem, + chunk->npages, + PCI_DMA_BIDIRECTIONAL); + + if (chunk->nsg <= 0) + goto fail; + } + + return icm; + +fail: + mlx4_free_icm(dev, icm, coherent); + return NULL; +} + +static int mlx4_MAP_ICM(struct mlx4_dev *dev, struct mlx4_icm *icm, u64 virt) +{ + return mlx4_map_cmd(dev, MLX4_CMD_MAP_ICM, icm, virt); +} + +static int mlx4_UNMAP_ICM(struct mlx4_dev *dev, u64 virt, u32 page_count) +{ + return mlx4_cmd(dev, virt, page_count, 0, MLX4_CMD_UNMAP_ICM, + MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE); +} + +int mlx4_MAP_ICM_AUX(struct mlx4_dev *dev, struct mlx4_icm *icm) +{ + return mlx4_map_cmd(dev, MLX4_CMD_MAP_ICM_AUX, icm, -1); +} + +int mlx4_UNMAP_ICM_AUX(struct mlx4_dev *dev) +{ + return mlx4_cmd(dev, 0, 0, 0, MLX4_CMD_UNMAP_ICM_AUX, + MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE); +} + +int mlx4_table_get(struct mlx4_dev *dev, struct mlx4_icm_table *table, u32 obj, + gfp_t gfp) +{ + u32 i = (obj & (table->num_obj - 1)) / + (MLX4_TABLE_CHUNK_SIZE / table->obj_size); + int ret = 0; + + mutex_lock(&table->mutex); + + if (table->icm[i]) { + ++table->icm[i]->refcount; + goto out; + } + + table->icm[i] = mlx4_alloc_icm(dev, MLX4_TABLE_CHUNK_SIZE >> PAGE_SHIFT, + (table->lowmem ? gfp : GFP_HIGHUSER) | + __GFP_NOWARN, table->coherent); + if (!table->icm[i]) { + ret = -ENOMEM; + goto out; + } + + if (mlx4_MAP_ICM(dev, table->icm[i], table->virt + + (u64) i * MLX4_TABLE_CHUNK_SIZE)) { + mlx4_free_icm(dev, table->icm[i], table->coherent); + table->icm[i] = NULL; + ret = -ENOMEM; + goto out; + } + + ++table->icm[i]->refcount; + +out: + mutex_unlock(&table->mutex); + return ret; +} + +void mlx4_table_put(struct mlx4_dev *dev, struct mlx4_icm_table *table, u32 obj) +{ + u32 i; + u64 offset; + + i = (obj & (table->num_obj - 1)) / (MLX4_TABLE_CHUNK_SIZE / table->obj_size); + + mutex_lock(&table->mutex); + + if (--table->icm[i]->refcount == 0) { + offset = (u64) i * MLX4_TABLE_CHUNK_SIZE; + mlx4_UNMAP_ICM(dev, table->virt + offset, + MLX4_TABLE_CHUNK_SIZE / MLX4_ICM_PAGE_SIZE); + mlx4_free_icm(dev, table->icm[i], table->coherent); + table->icm[i] = NULL; + } + + mutex_unlock(&table->mutex); +} + +void *mlx4_table_find(struct mlx4_icm_table *table, u32 obj, + dma_addr_t *dma_handle) +{ + int offset, dma_offset, i; + u64 idx; + struct mlx4_icm_chunk *chunk; + struct mlx4_icm *icm; + struct page *page = NULL; + + if (!table->lowmem) + return NULL; + + mutex_lock(&table->mutex); + + idx = (u64) (obj & (table->num_obj - 1)) * table->obj_size; + icm = table->icm[idx / MLX4_TABLE_CHUNK_SIZE]; + dma_offset = offset = idx % MLX4_TABLE_CHUNK_SIZE; + + if (!icm) + goto out; + + list_for_each_entry(chunk, &icm->chunk_list, list) { + for (i = 0; i < chunk->npages; ++i) { + if (dma_handle && dma_offset >= 0) { + if (sg_dma_len(&chunk->mem[i]) > dma_offset) + *dma_handle = sg_dma_address(&chunk->mem[i]) + + dma_offset; + dma_offset -= sg_dma_len(&chunk->mem[i]); + } + /* + * DMA mapping can merge pages but not split them, + * so if we found the page, dma_handle has already + * been assigned to. + */ + if (chunk->mem[i].length > offset) { + page = sg_page(&chunk->mem[i]); + goto out; + } + offset -= chunk->mem[i].length; + } + } + +out: + mutex_unlock(&table->mutex); + return page ? lowmem_page_address(page) + offset : NULL; +} + +int mlx4_table_get_range(struct mlx4_dev *dev, struct mlx4_icm_table *table, + u32 start, u32 end) +{ + int inc = MLX4_TABLE_CHUNK_SIZE / table->obj_size; + int err; + u32 i; + + for (i = start; i <= end; i += inc) { + err = mlx4_table_get(dev, table, i, GFP_KERNEL); + if (err) + goto fail; + } + + return 0; + +fail: + while (i > start) { + i -= inc; + mlx4_table_put(dev, table, i); + } + + return err; +} + +void mlx4_table_put_range(struct mlx4_dev *dev, struct mlx4_icm_table *table, + u32 start, u32 end) +{ + u32 i; + + for (i = start; i <= end; i += MLX4_TABLE_CHUNK_SIZE / table->obj_size) + mlx4_table_put(dev, table, i); +} + +int mlx4_init_icm_table(struct mlx4_dev *dev, struct mlx4_icm_table *table, + u64 virt, int obj_size, u32 nobj, int reserved, + int use_lowmem, int use_coherent) +{ + int obj_per_chunk; + int num_icm; + unsigned chunk_size; + int i; + u64 size; + + obj_per_chunk = MLX4_TABLE_CHUNK_SIZE / obj_size; + num_icm = (nobj + obj_per_chunk - 1) / obj_per_chunk; + + table->icm = kcalloc(num_icm, sizeof *table->icm, GFP_KERNEL); + if (!table->icm) + return -ENOMEM; + table->virt = virt; + table->num_icm = num_icm; + table->num_obj = nobj; + table->obj_size = obj_size; + table->lowmem = use_lowmem; + table->coherent = use_coherent; + mutex_init(&table->mutex); + + size = (u64) nobj * obj_size; + for (i = 0; i * MLX4_TABLE_CHUNK_SIZE < reserved * obj_size; ++i) { + chunk_size = MLX4_TABLE_CHUNK_SIZE; + if ((i + 1) * MLX4_TABLE_CHUNK_SIZE > size) + chunk_size = PAGE_ALIGN(size - + i * MLX4_TABLE_CHUNK_SIZE); + + table->icm[i] = mlx4_alloc_icm(dev, chunk_size >> PAGE_SHIFT, + (use_lowmem ? GFP_KERNEL : GFP_HIGHUSER) | + __GFP_NOWARN, use_coherent); + if (!table->icm[i]) + goto err; + if (mlx4_MAP_ICM(dev, table->icm[i], virt + i * MLX4_TABLE_CHUNK_SIZE)) { + mlx4_free_icm(dev, table->icm[i], use_coherent); + table->icm[i] = NULL; + goto err; + } + + /* + * Add a reference to this ICM chunk so that it never + * gets freed (since it contains reserved firmware objects). + */ + ++table->icm[i]->refcount; + } + + return 0; + +err: + for (i = 0; i < num_icm; ++i) + if (table->icm[i]) { + mlx4_UNMAP_ICM(dev, virt + i * MLX4_TABLE_CHUNK_SIZE, + MLX4_TABLE_CHUNK_SIZE / MLX4_ICM_PAGE_SIZE); + mlx4_free_icm(dev, table->icm[i], use_coherent); + } + + kfree(table->icm); + + return -ENOMEM; +} + +void mlx4_cleanup_icm_table(struct mlx4_dev *dev, struct mlx4_icm_table *table) +{ + int i; + + for (i = 0; i < table->num_icm; ++i) + if (table->icm[i]) { + mlx4_UNMAP_ICM(dev, table->virt + i * MLX4_TABLE_CHUNK_SIZE, + MLX4_TABLE_CHUNK_SIZE / MLX4_ICM_PAGE_SIZE); + mlx4_free_icm(dev, table->icm[i], table->coherent); + } + + kfree(table->icm); +} diff --git a/drivers/net/ethernet/mellanox/mlx4/icm.h b/drivers/net/ethernet/mellanox/mlx4/icm.h new file mode 100644 index 0000000..0c73645 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4/icm.h @@ -0,0 +1,129 @@ +/* + * Copyright (c) 2005, 2006, 2007, 2008 Mellanox Technologies. All rights reserved. + * Copyright (c) 2006, 2007 Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef MLX4_ICM_H +#define MLX4_ICM_H + +#include +#include +#include + +#define MLX4_ICM_CHUNK_LEN \ + ((256 - sizeof (struct list_head) - 2 * sizeof (int)) / \ + (sizeof (struct scatterlist))) + +enum { + MLX4_ICM_PAGE_SHIFT = 12, + MLX4_ICM_PAGE_SIZE = 1 << MLX4_ICM_PAGE_SHIFT, +}; + +struct mlx4_icm_chunk { + struct list_head list; + int npages; + int nsg; + struct scatterlist mem[MLX4_ICM_CHUNK_LEN]; +}; + +struct mlx4_icm { + struct list_head chunk_list; + int refcount; +}; + +struct mlx4_icm_iter { + struct mlx4_icm *icm; + struct mlx4_icm_chunk *chunk; + int page_idx; +}; + +struct mlx4_dev; + +struct mlx4_icm *mlx4_alloc_icm(struct mlx4_dev *dev, int npages, + gfp_t gfp_mask, int coherent); +void mlx4_free_icm(struct mlx4_dev *dev, struct mlx4_icm *icm, int coherent); + +int mlx4_table_get(struct mlx4_dev *dev, struct mlx4_icm_table *table, u32 obj, + gfp_t gfp); +void mlx4_table_put(struct mlx4_dev *dev, struct mlx4_icm_table *table, u32 obj); +int mlx4_table_get_range(struct mlx4_dev *dev, struct mlx4_icm_table *table, + u32 start, u32 end); +void mlx4_table_put_range(struct mlx4_dev *dev, struct mlx4_icm_table *table, + u32 start, u32 end); +int mlx4_init_icm_table(struct mlx4_dev *dev, struct mlx4_icm_table *table, + u64 virt, int obj_size, u32 nobj, int reserved, + int use_lowmem, int use_coherent); +void mlx4_cleanup_icm_table(struct mlx4_dev *dev, struct mlx4_icm_table *table); +void *mlx4_table_find(struct mlx4_icm_table *table, u32 obj, dma_addr_t *dma_handle); + +static inline void mlx4_icm_first(struct mlx4_icm *icm, + struct mlx4_icm_iter *iter) +{ + iter->icm = icm; + iter->chunk = list_empty(&icm->chunk_list) ? + NULL : list_entry(icm->chunk_list.next, + struct mlx4_icm_chunk, list); + iter->page_idx = 0; +} + +static inline int mlx4_icm_last(struct mlx4_icm_iter *iter) +{ + return !iter->chunk; +} + +static inline void mlx4_icm_next(struct mlx4_icm_iter *iter) +{ + if (++iter->page_idx >= iter->chunk->nsg) { + if (iter->chunk->list.next == &iter->icm->chunk_list) { + iter->chunk = NULL; + return; + } + + iter->chunk = list_entry(iter->chunk->list.next, + struct mlx4_icm_chunk, list); + iter->page_idx = 0; + } +} + +static inline dma_addr_t mlx4_icm_addr(struct mlx4_icm_iter *iter) +{ + return sg_dma_address(&iter->chunk->mem[iter->page_idx]); +} + +static inline unsigned long mlx4_icm_size(struct mlx4_icm_iter *iter) +{ + return sg_dma_len(&iter->chunk->mem[iter->page_idx]); +} + +int mlx4_MAP_ICM_AUX(struct mlx4_dev *dev, struct mlx4_icm *icm); +int mlx4_UNMAP_ICM_AUX(struct mlx4_dev *dev); + +#endif /* MLX4_ICM_H */ diff --git a/drivers/net/ethernet/mellanox/mlx4/intf.c b/drivers/net/ethernet/mellanox/mlx4/intf.c new file mode 100644 index 0000000..116895a --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4/intf.c @@ -0,0 +1,188 @@ +/* + * Copyright (c) 2006, 2007 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include + +#include "mlx4.h" + +struct mlx4_device_context { + struct list_head list; + struct mlx4_interface *intf; + void *context; +}; + +static LIST_HEAD(intf_list); +static LIST_HEAD(dev_list); +static DEFINE_MUTEX(intf_mutex); + +static void mlx4_add_device(struct mlx4_interface *intf, struct mlx4_priv *priv) +{ + struct mlx4_device_context *dev_ctx; + + dev_ctx = kmalloc(sizeof *dev_ctx, GFP_KERNEL); + if (!dev_ctx) + return; + + dev_ctx->intf = intf; + dev_ctx->context = intf->add(&priv->dev); + + if (dev_ctx->context) { + spin_lock_irq(&priv->ctx_lock); + list_add_tail(&dev_ctx->list, &priv->ctx_list); + spin_unlock_irq(&priv->ctx_lock); + } else + kfree(dev_ctx); +} + +static void mlx4_remove_device(struct mlx4_interface *intf, struct mlx4_priv *priv) +{ + struct mlx4_device_context *dev_ctx; + + list_for_each_entry(dev_ctx, &priv->ctx_list, list) + if (dev_ctx->intf == intf) { + spin_lock_irq(&priv->ctx_lock); + list_del(&dev_ctx->list); + spin_unlock_irq(&priv->ctx_lock); + + intf->remove(&priv->dev, dev_ctx->context); + kfree(dev_ctx); + return; + } +} + +int mlx4_register_interface(struct mlx4_interface *intf) +{ + struct mlx4_priv *priv; + + if (!intf->add || !intf->remove) + return -EINVAL; + + mutex_lock(&intf_mutex); + + list_add_tail(&intf->list, &intf_list); + list_for_each_entry(priv, &dev_list, dev_list) + mlx4_add_device(intf, priv); + + mutex_unlock(&intf_mutex); + + return 0; +} +EXPORT_SYMBOL_GPL(mlx4_register_interface); + +void mlx4_unregister_interface(struct mlx4_interface *intf) +{ + struct mlx4_priv *priv; + + mutex_lock(&intf_mutex); + + list_for_each_entry(priv, &dev_list, dev_list) + mlx4_remove_device(intf, priv); + + list_del(&intf->list); + + mutex_unlock(&intf_mutex); +} +EXPORT_SYMBOL_GPL(mlx4_unregister_interface); + +void mlx4_dispatch_event(struct mlx4_dev *dev, enum mlx4_dev_event type, + unsigned long param) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_device_context *dev_ctx; + unsigned long flags; + + spin_lock_irqsave(&priv->ctx_lock, flags); + + list_for_each_entry(dev_ctx, &priv->ctx_list, list) + if (dev_ctx->intf->event) + dev_ctx->intf->event(dev, dev_ctx->context, type, param); + + spin_unlock_irqrestore(&priv->ctx_lock, flags); +} + +int mlx4_register_device(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_interface *intf; + + mutex_lock(&intf_mutex); + + list_add_tail(&priv->dev_list, &dev_list); + list_for_each_entry(intf, &intf_list, list) + mlx4_add_device(intf, priv); + + mutex_unlock(&intf_mutex); + if (!mlx4_is_slave(dev)) + mlx4_start_catas_poll(dev); + + return 0; +} + +void mlx4_unregister_device(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_interface *intf; + + if (!mlx4_is_slave(dev)) + mlx4_stop_catas_poll(dev); + mutex_lock(&intf_mutex); + + list_for_each_entry(intf, &intf_list, list) + mlx4_remove_device(intf, priv); + + list_del(&priv->dev_list); + + mutex_unlock(&intf_mutex); +} + +void *mlx4_get_protocol_dev(struct mlx4_dev *dev, enum mlx4_protocol proto, int port) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_device_context *dev_ctx; + unsigned long flags; + void *result = NULL; + + spin_lock_irqsave(&priv->ctx_lock, flags); + + list_for_each_entry(dev_ctx, &priv->ctx_list, list) + if (dev_ctx->intf->protocol == proto && dev_ctx->intf->get_dev) { + result = dev_ctx->intf->get_dev(dev, dev_ctx->context, port); + break; + } + + spin_unlock_irqrestore(&priv->ctx_lock, flags); + + return result; +} +EXPORT_SYMBOL_GPL(mlx4_get_protocol_dev); diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c new file mode 100644 index 0000000..90de6e1 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4/main.c @@ -0,0 +1,2998 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2005, 2006, 2007, 2008 Mellanox Technologies. All rights reserved. + * Copyright (c) 2006, 2007 Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "mlx4.h" +#include "fw.h" +#include "icm.h" + +MODULE_AUTHOR("Roland Dreier"); +MODULE_DESCRIPTION("Mellanox ConnectX HCA low-level driver"); +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_VERSION(DRV_VERSION); + +struct workqueue_struct *mlx4_wq; + +#ifdef CONFIG_MLX4_DEBUG + +int mlx4_debug_level = 0; +module_param_named(debug_level, mlx4_debug_level, int, 0644); +MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0"); + +#endif /* CONFIG_MLX4_DEBUG */ + +#ifdef CONFIG_PCI_MSI + +static int msi_x = 1; +module_param(msi_x, int, 0444); +MODULE_PARM_DESC(msi_x, "attempt to use MSI-X if nonzero"); + +#else /* CONFIG_PCI_MSI */ + +#define msi_x (0) + +#endif /* CONFIG_PCI_MSI */ + +static uint8_t num_vfs[3] = {0, 0, 0}; +static int num_vfs_argc; +module_param_array(num_vfs, byte , &num_vfs_argc, 0444); +MODULE_PARM_DESC(num_vfs, "enable #num_vfs functions if num_vfs > 0\n" + "num_vfs=port1,port2,port1+2"); + +static uint8_t probe_vf[3] = {0, 0, 0}; +static int probe_vfs_argc; +module_param_array(probe_vf, byte, &probe_vfs_argc, 0444); +MODULE_PARM_DESC(probe_vf, "number of vfs to probe by pf driver (num_vfs > 0)\n" + "probe_vf=port1,port2,port1+2"); + +int mlx4_log_num_mgm_entry_size = MLX4_DEFAULT_MGM_LOG_ENTRY_SIZE; +module_param_named(log_num_mgm_entry_size, + mlx4_log_num_mgm_entry_size, int, 0444); +MODULE_PARM_DESC(log_num_mgm_entry_size, "log mgm size, that defines the num" + " of qp per mcg, for example:" + " 10 gives 248.range: 7 <=" + " log_num_mgm_entry_size <= 12." + " To activate device managed" + " flow steering when available, set to -1"); + +static bool enable_64b_cqe_eqe = true; +module_param(enable_64b_cqe_eqe, bool, 0444); +MODULE_PARM_DESC(enable_64b_cqe_eqe, + "Enable 64 byte CQEs/EQEs when the FW supports this (default: True)"); + +#define PF_CONTEXT_BEHAVIOUR_MASK (MLX4_FUNC_CAP_64B_EQE_CQE | \ + MLX4_FUNC_CAP_EQE_CQE_STRIDE) + +static char mlx4_version[] = + DRV_NAME ": Mellanox ConnectX core driver v" + DRV_VERSION " (" DRV_RELDATE ")\n"; + +static struct mlx4_profile default_profile = { + .num_qp = 1 << 18, + .num_srq = 1 << 16, + .rdmarc_per_qp = 1 << 4, + .num_cq = 1 << 16, + .num_mcg = 1 << 13, + .num_mpt = 1 << 19, + .num_mtt = 1 << 20, /* It is really num mtt segements */ +}; + +static struct mlx4_profile low_mem_profile = { + .num_qp = 1 << 17, + .num_srq = 1 << 6, + .rdmarc_per_qp = 1 << 4, + .num_cq = 1 << 8, + .num_mcg = 1 << 8, + .num_mpt = 1 << 9, + .num_mtt = 1 << 7, +}; + +static int log_num_mac = 7; +module_param_named(log_num_mac, log_num_mac, int, 0444); +MODULE_PARM_DESC(log_num_mac, "Log2 max number of MACs per ETH port (1-7)"); + +static int log_num_vlan; +module_param_named(log_num_vlan, log_num_vlan, int, 0444); +MODULE_PARM_DESC(log_num_vlan, "Log2 max number of VLANs per ETH port (0-7)"); +/* Log2 max number of VLANs per ETH port (0-7) */ +#define MLX4_LOG_NUM_VLANS 7 +#define MLX4_MIN_LOG_NUM_VLANS 0 +#define MLX4_MIN_LOG_NUM_MAC 1 + +static bool use_prio; +module_param_named(use_prio, use_prio, bool, 0444); +MODULE_PARM_DESC(use_prio, "Enable steering by VLAN priority on ETH ports (deprecated)"); + +int log_mtts_per_seg = ilog2(MLX4_MTT_ENTRY_PER_SEG); +module_param_named(log_mtts_per_seg, log_mtts_per_seg, int, 0444); +MODULE_PARM_DESC(log_mtts_per_seg, "Log2 number of MTT entries per segment (1-7)"); + +static int port_type_array[2] = {MLX4_PORT_TYPE_NONE, MLX4_PORT_TYPE_NONE}; +static int arr_argc = 2; +module_param_array(port_type_array, int, &arr_argc, 0444); +MODULE_PARM_DESC(port_type_array, "Array of port types: HW_DEFAULT (0) is default " + "1 for IB, 2 for Ethernet"); + +struct mlx4_port_config { + struct list_head list; + enum mlx4_port_type port_type[MLX4_MAX_PORTS + 1]; + struct pci_dev *pdev; +}; + +static atomic_t pf_loading = ATOMIC_INIT(0); + +int mlx4_check_port_params(struct mlx4_dev *dev, + enum mlx4_port_type *port_type) +{ + int i; + + for (i = 0; i < dev->caps.num_ports - 1; i++) { + if (port_type[i] != port_type[i + 1]) { + if (!(dev->caps.flags & MLX4_DEV_CAP_FLAG_DPDP)) { + mlx4_err(dev, "Only same port types supported on this HCA, aborting\n"); + return -EINVAL; + } + } + } + + for (i = 0; i < dev->caps.num_ports; i++) { + if (!(port_type[i] & dev->caps.supported_type[i+1])) { + mlx4_err(dev, "Requested port type for port %d is not supported on this HCA\n", + i + 1); + return -EINVAL; + } + } + return 0; +} + +static void mlx4_set_port_mask(struct mlx4_dev *dev) +{ + int i; + + for (i = 1; i <= dev->caps.num_ports; ++i) + dev->caps.port_mask[i] = dev->caps.port_type[i]; +} + +static void mlx4_enable_cqe_eqe_stride(struct mlx4_dev *dev) +{ + struct mlx4_caps *dev_cap = &dev->caps; + + /* FW not supporting or cancelled by user */ + if (!(dev_cap->flags2 & MLX4_DEV_CAP_FLAG2_EQE_STRIDE) || + !(dev_cap->flags2 & MLX4_DEV_CAP_FLAG2_CQE_STRIDE)) + return; + + /* Must have 64B CQE_EQE enabled by FW to use bigger stride + * When FW has NCSI it may decide not to report 64B CQE/EQEs + */ + if (!(dev_cap->flags & MLX4_DEV_CAP_FLAG_64B_EQE) || + !(dev_cap->flags & MLX4_DEV_CAP_FLAG_64B_CQE)) { + dev_cap->flags2 &= ~MLX4_DEV_CAP_FLAG2_CQE_STRIDE; + dev_cap->flags2 &= ~MLX4_DEV_CAP_FLAG2_EQE_STRIDE; + return; + } + + if (cache_line_size() == 128 || cache_line_size() == 256) { + mlx4_dbg(dev, "Enabling CQE stride cacheLine supported\n"); + /* Changing the real data inside CQE size to 32B */ + dev_cap->flags &= ~MLX4_DEV_CAP_FLAG_64B_CQE; + dev_cap->flags &= ~MLX4_DEV_CAP_FLAG_64B_EQE; + + if (mlx4_is_master(dev)) + dev_cap->function_caps |= MLX4_FUNC_CAP_EQE_CQE_STRIDE; + } else { + mlx4_dbg(dev, "Disabling CQE stride cacheLine unsupported\n"); + dev_cap->flags2 &= ~MLX4_DEV_CAP_FLAG2_CQE_STRIDE; + dev_cap->flags2 &= ~MLX4_DEV_CAP_FLAG2_EQE_STRIDE; + } +} + +static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) +{ + int err; + int i; + + err = mlx4_QUERY_DEV_CAP(dev, dev_cap); + if (err) { + mlx4_err(dev, "QUERY_DEV_CAP command failed, aborting\n"); + return err; + } + + if (dev_cap->min_page_sz > PAGE_SIZE) { + mlx4_err(dev, "HCA minimum page size of %d bigger than kernel PAGE_SIZE of %ld, aborting\n", + dev_cap->min_page_sz, PAGE_SIZE); + return -ENODEV; + } + if (dev_cap->num_ports > MLX4_MAX_PORTS) { + mlx4_err(dev, "HCA has %d ports, but we only support %d, aborting\n", + dev_cap->num_ports, MLX4_MAX_PORTS); + return -ENODEV; + } + + if (dev_cap->uar_size > pci_resource_len(dev->pdev, 2)) { + mlx4_err(dev, "HCA reported UAR size of 0x%x bigger than PCI resource 2 size of 0x%llx, aborting\n", + dev_cap->uar_size, + (unsigned long long) pci_resource_len(dev->pdev, 2)); + return -ENODEV; + } + + dev->caps.num_ports = dev_cap->num_ports; + dev->phys_caps.num_phys_eqs = MLX4_MAX_EQ_NUM; + for (i = 1; i <= dev->caps.num_ports; ++i) { + dev->caps.vl_cap[i] = dev_cap->max_vl[i]; + dev->caps.ib_mtu_cap[i] = dev_cap->ib_mtu[i]; + dev->phys_caps.gid_phys_table_len[i] = dev_cap->max_gids[i]; + dev->phys_caps.pkey_phys_table_len[i] = dev_cap->max_pkeys[i]; + /* set gid and pkey table operating lengths by default + * to non-sriov values */ + dev->caps.gid_table_len[i] = dev_cap->max_gids[i]; + dev->caps.pkey_table_len[i] = dev_cap->max_pkeys[i]; + dev->caps.port_width_cap[i] = dev_cap->max_port_width[i]; + dev->caps.eth_mtu_cap[i] = dev_cap->eth_mtu[i]; + dev->caps.def_mac[i] = dev_cap->def_mac[i]; + dev->caps.supported_type[i] = dev_cap->supported_port_types[i]; + dev->caps.suggested_type[i] = dev_cap->suggested_type[i]; + dev->caps.default_sense[i] = dev_cap->default_sense[i]; + dev->caps.trans_type[i] = dev_cap->trans_type[i]; + dev->caps.vendor_oui[i] = dev_cap->vendor_oui[i]; + dev->caps.wavelength[i] = dev_cap->wavelength[i]; + dev->caps.trans_code[i] = dev_cap->trans_code[i]; + } + + dev->caps.uar_page_size = PAGE_SIZE; + dev->caps.num_uars = dev_cap->uar_size / PAGE_SIZE; + dev->caps.local_ca_ack_delay = dev_cap->local_ca_ack_delay; + dev->caps.bf_reg_size = dev_cap->bf_reg_size; + dev->caps.bf_regs_per_page = dev_cap->bf_regs_per_page; + dev->caps.max_sq_sg = dev_cap->max_sq_sg; + dev->caps.max_rq_sg = dev_cap->max_rq_sg; + dev->caps.max_wqes = dev_cap->max_qp_sz; + dev->caps.max_qp_init_rdma = dev_cap->max_requester_per_qp; + dev->caps.max_srq_wqes = dev_cap->max_srq_sz; + dev->caps.max_srq_sge = dev_cap->max_rq_sg - 1; + dev->caps.reserved_srqs = dev_cap->reserved_srqs; + dev->caps.max_sq_desc_sz = dev_cap->max_sq_desc_sz; + dev->caps.max_rq_desc_sz = dev_cap->max_rq_desc_sz; + /* + * Subtract 1 from the limit because we need to allocate a + * spare CQE so the HCA HW can tell the difference between an + * empty CQ and a full CQ. + */ + dev->caps.max_cqes = dev_cap->max_cq_sz - 1; + dev->caps.reserved_cqs = dev_cap->reserved_cqs; + dev->caps.reserved_eqs = dev_cap->reserved_eqs; + dev->caps.reserved_mtts = dev_cap->reserved_mtts; + dev->caps.reserved_mrws = dev_cap->reserved_mrws; + + /* The first 128 UARs are used for EQ doorbells */ + dev->caps.reserved_uars = max_t(int, 128, dev_cap->reserved_uars); + dev->caps.reserved_pds = dev_cap->reserved_pds; + dev->caps.reserved_xrcds = (dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC) ? + dev_cap->reserved_xrcds : 0; + dev->caps.max_xrcds = (dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC) ? + dev_cap->max_xrcds : 0; + dev->caps.mtt_entry_sz = dev_cap->mtt_entry_sz; + + dev->caps.max_msg_sz = dev_cap->max_msg_sz; + dev->caps.page_size_cap = ~(u32) (dev_cap->min_page_sz - 1); + dev->caps.flags = dev_cap->flags; + dev->caps.flags2 = dev_cap->flags2; + dev->caps.bmme_flags = dev_cap->bmme_flags; + dev->caps.reserved_lkey = dev_cap->reserved_lkey; + dev->caps.stat_rate_support = dev_cap->stat_rate_support; + dev->caps.max_gso_sz = dev_cap->max_gso_sz; + dev->caps.max_rss_tbl_sz = dev_cap->max_rss_tbl_sz; + + /* Sense port always allowed on supported devices for ConnectX-1 and -2 */ + if (mlx4_priv(dev)->pci_dev_data & MLX4_PCI_DEV_FORCE_SENSE_PORT) + dev->caps.flags |= MLX4_DEV_CAP_FLAG_SENSE_SUPPORT; + /* Don't do sense port on multifunction devices (for now at least) */ + if (mlx4_is_mfunc(dev)) + dev->caps.flags &= ~MLX4_DEV_CAP_FLAG_SENSE_SUPPORT; + + if (mlx4_low_memory_profile()) { + dev->caps.log_num_macs = MLX4_MIN_LOG_NUM_MAC; + dev->caps.log_num_vlans = MLX4_MIN_LOG_NUM_VLANS; + } else { + dev->caps.log_num_macs = log_num_mac; + dev->caps.log_num_vlans = MLX4_LOG_NUM_VLANS; + } + + for (i = 1; i <= dev->caps.num_ports; ++i) { + dev->caps.port_type[i] = MLX4_PORT_TYPE_NONE; + if (dev->caps.supported_type[i]) { + /* if only ETH is supported - assign ETH */ + if (dev->caps.supported_type[i] == MLX4_PORT_TYPE_ETH) + dev->caps.port_type[i] = MLX4_PORT_TYPE_ETH; + /* if only IB is supported, assign IB */ + else if (dev->caps.supported_type[i] == + MLX4_PORT_TYPE_IB) + dev->caps.port_type[i] = MLX4_PORT_TYPE_IB; + else { + /* if IB and ETH are supported, we set the port + * type according to user selection of port type; + * if user selected none, take the FW hint */ + if (port_type_array[i - 1] == MLX4_PORT_TYPE_NONE) + dev->caps.port_type[i] = dev->caps.suggested_type[i] ? + MLX4_PORT_TYPE_ETH : MLX4_PORT_TYPE_IB; + else + dev->caps.port_type[i] = port_type_array[i - 1]; + } + } + /* + * Link sensing is allowed on the port if 3 conditions are true: + * 1. Both protocols are supported on the port. + * 2. Different types are supported on the port + * 3. FW declared that it supports link sensing + */ + mlx4_priv(dev)->sense.sense_allowed[i] = + ((dev->caps.supported_type[i] == MLX4_PORT_TYPE_AUTO) && + (dev->caps.flags & MLX4_DEV_CAP_FLAG_DPDP) && + (dev->caps.flags & MLX4_DEV_CAP_FLAG_SENSE_SUPPORT)); + + /* + * If "default_sense" bit is set, we move the port to "AUTO" mode + * and perform sense_port FW command to try and set the correct + * port type from beginning + */ + if (mlx4_priv(dev)->sense.sense_allowed[i] && dev->caps.default_sense[i]) { + enum mlx4_port_type sensed_port = MLX4_PORT_TYPE_NONE; + dev->caps.possible_type[i] = MLX4_PORT_TYPE_AUTO; + mlx4_SENSE_PORT(dev, i, &sensed_port); + if (sensed_port != MLX4_PORT_TYPE_NONE) + dev->caps.port_type[i] = sensed_port; + } else { + dev->caps.possible_type[i] = dev->caps.port_type[i]; + } + + if (dev->caps.log_num_macs > dev_cap->log_max_macs[i]) { + dev->caps.log_num_macs = dev_cap->log_max_macs[i]; + mlx4_warn(dev, "Requested number of MACs is too much for port %d, reducing to %d\n", + i, 1 << dev->caps.log_num_macs); + } + if (dev->caps.log_num_vlans > dev_cap->log_max_vlans[i]) { + dev->caps.log_num_vlans = dev_cap->log_max_vlans[i]; + mlx4_warn(dev, "Requested number of VLANs is too much for port %d, reducing to %d\n", + i, 1 << dev->caps.log_num_vlans); + } + } + + dev->caps.max_counters = 1 << ilog2(dev_cap->max_counters); + + dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW] = dev_cap->reserved_qps; + dev->caps.reserved_qps_cnt[MLX4_QP_REGION_ETH_ADDR] = + dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FC_ADDR] = + (1 << dev->caps.log_num_macs) * + (1 << dev->caps.log_num_vlans) * + dev->caps.num_ports; + dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FC_EXCH] = MLX4_NUM_FEXCH; + + dev->caps.reserved_qps = dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW] + + dev->caps.reserved_qps_cnt[MLX4_QP_REGION_ETH_ADDR] + + dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FC_ADDR] + + dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FC_EXCH]; + + dev->caps.sqp_demux = (mlx4_is_master(dev)) ? MLX4_MAX_NUM_SLAVES : 0; + + if (!enable_64b_cqe_eqe && !mlx4_is_slave(dev)) { + if (dev_cap->flags & + (MLX4_DEV_CAP_FLAG_64B_CQE | MLX4_DEV_CAP_FLAG_64B_EQE)) { + mlx4_warn(dev, "64B EQEs/CQEs supported by the device but not enabled\n"); + dev->caps.flags &= ~MLX4_DEV_CAP_FLAG_64B_CQE; + dev->caps.flags &= ~MLX4_DEV_CAP_FLAG_64B_EQE; + } + + if (dev_cap->flags2 & + (MLX4_DEV_CAP_FLAG2_CQE_STRIDE | + MLX4_DEV_CAP_FLAG2_EQE_STRIDE)) { + mlx4_warn(dev, "Disabling EQE/CQE stride per user request\n"); + dev_cap->flags2 &= ~MLX4_DEV_CAP_FLAG2_CQE_STRIDE; + dev_cap->flags2 &= ~MLX4_DEV_CAP_FLAG2_EQE_STRIDE; + } + } + + if ((dev->caps.flags & + (MLX4_DEV_CAP_FLAG_64B_CQE | MLX4_DEV_CAP_FLAG_64B_EQE)) && + mlx4_is_master(dev)) + dev->caps.function_caps |= MLX4_FUNC_CAP_64B_EQE_CQE; + + if (!mlx4_is_slave(dev)) + mlx4_enable_cqe_eqe_stride(dev); + + return 0; +} + +static int mlx4_get_pcie_dev_link_caps(struct mlx4_dev *dev, + enum pci_bus_speed *speed, + enum pcie_link_width *width) +{ + u32 lnkcap1, lnkcap2; + int err1, err2; + +#define PCIE_MLW_CAP_SHIFT 4 /* start of MLW mask in link capabilities */ + + *speed = PCI_SPEED_UNKNOWN; + *width = PCIE_LNK_WIDTH_UNKNOWN; + + err1 = pcie_capability_read_dword(dev->pdev, PCI_EXP_LNKCAP, &lnkcap1); + err2 = pcie_capability_read_dword(dev->pdev, PCI_EXP_LNKCAP2, &lnkcap2); + if (!err2 && lnkcap2) { /* PCIe r3.0-compliant */ + if (lnkcap2 & PCI_EXP_LNKCAP2_SLS_8_0GB) + *speed = PCIE_SPEED_8_0GT; + else if (lnkcap2 & PCI_EXP_LNKCAP2_SLS_5_0GB) + *speed = PCIE_SPEED_5_0GT; + else if (lnkcap2 & PCI_EXP_LNKCAP2_SLS_2_5GB) + *speed = PCIE_SPEED_2_5GT; + } + if (!err1) { + *width = (lnkcap1 & PCI_EXP_LNKCAP_MLW) >> PCIE_MLW_CAP_SHIFT; + if (!lnkcap2) { /* pre-r3.0 */ + if (lnkcap1 & PCI_EXP_LNKCAP_SLS_5_0GB) + *speed = PCIE_SPEED_5_0GT; + else if (lnkcap1 & PCI_EXP_LNKCAP_SLS_2_5GB) + *speed = PCIE_SPEED_2_5GT; + } + } + + if (*speed == PCI_SPEED_UNKNOWN || *width == PCIE_LNK_WIDTH_UNKNOWN) { + return err1 ? err1 : + err2 ? err2 : -EINVAL; + } + return 0; +} + +static void mlx4_check_pcie_caps(struct mlx4_dev *dev) +{ + enum pcie_link_width width, width_cap; + enum pci_bus_speed speed, speed_cap; + int err; + +#define PCIE_SPEED_STR(speed) \ + (speed == PCIE_SPEED_8_0GT ? "8.0GT/s" : \ + speed == PCIE_SPEED_5_0GT ? "5.0GT/s" : \ + speed == PCIE_SPEED_2_5GT ? "2.5GT/s" : \ + "Unknown") + + err = mlx4_get_pcie_dev_link_caps(dev, &speed_cap, &width_cap); + if (err) { + mlx4_warn(dev, + "Unable to determine PCIe device BW capabilities\n"); + return; + } + + err = pcie_get_minimum_link(dev->pdev, &speed, &width); + if (err || speed == PCI_SPEED_UNKNOWN || + width == PCIE_LNK_WIDTH_UNKNOWN) { + mlx4_warn(dev, + "Unable to determine PCI device chain minimum BW\n"); + return; + } + + if (width != width_cap || speed != speed_cap) + mlx4_warn(dev, + "PCIe BW is different than device's capability\n"); + + mlx4_info(dev, "PCIe link speed is %s, device supports %s\n", + PCIE_SPEED_STR(speed), PCIE_SPEED_STR(speed_cap)); + mlx4_info(dev, "PCIe link width is x%d, device supports x%d\n", + width, width_cap); + return; +} + +/*The function checks if there are live vf, return the num of them*/ +static int mlx4_how_many_lives_vf(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_slave_state *s_state; + int i; + int ret = 0; + + for (i = 1/*the ppf is 0*/; i < dev->num_slaves; ++i) { + s_state = &priv->mfunc.master.slave_state[i]; + if (s_state->active && s_state->last_cmd != + MLX4_COMM_CMD_RESET) { + mlx4_warn(dev, "%s: slave: %d is still active\n", + __func__, i); + ret++; + } + } + return ret; +} + +int mlx4_get_parav_qkey(struct mlx4_dev *dev, u32 qpn, u32 *qkey) +{ + u32 qk = MLX4_RESERVED_QKEY_BASE; + + if (qpn >= dev->phys_caps.base_tunnel_sqpn + 8 * MLX4_MFUNC_MAX || + qpn < dev->phys_caps.base_proxy_sqpn) + return -EINVAL; + + if (qpn >= dev->phys_caps.base_tunnel_sqpn) + /* tunnel qp */ + qk += qpn - dev->phys_caps.base_tunnel_sqpn; + else + qk += qpn - dev->phys_caps.base_proxy_sqpn; + *qkey = qk; + return 0; +} +EXPORT_SYMBOL(mlx4_get_parav_qkey); + +void mlx4_sync_pkey_table(struct mlx4_dev *dev, int slave, int port, int i, int val) +{ + struct mlx4_priv *priv = container_of(dev, struct mlx4_priv, dev); + + if (!mlx4_is_master(dev)) + return; + + priv->virt2phys_pkey[slave][port - 1][i] = val; +} +EXPORT_SYMBOL(mlx4_sync_pkey_table); + +void mlx4_put_slave_node_guid(struct mlx4_dev *dev, int slave, __be64 guid) +{ + struct mlx4_priv *priv = container_of(dev, struct mlx4_priv, dev); + + if (!mlx4_is_master(dev)) + return; + + priv->slave_node_guids[slave] = guid; +} +EXPORT_SYMBOL(mlx4_put_slave_node_guid); + +__be64 mlx4_get_slave_node_guid(struct mlx4_dev *dev, int slave) +{ + struct mlx4_priv *priv = container_of(dev, struct mlx4_priv, dev); + + if (!mlx4_is_master(dev)) + return 0; + + return priv->slave_node_guids[slave]; +} +EXPORT_SYMBOL(mlx4_get_slave_node_guid); + +int mlx4_is_slave_active(struct mlx4_dev *dev, int slave) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_slave_state *s_slave; + + if (!mlx4_is_master(dev)) + return 0; + + s_slave = &priv->mfunc.master.slave_state[slave]; + return !!s_slave->active; +} +EXPORT_SYMBOL(mlx4_is_slave_active); + +static void slave_adjust_steering_mode(struct mlx4_dev *dev, + struct mlx4_dev_cap *dev_cap, + struct mlx4_init_hca_param *hca_param) +{ + dev->caps.steering_mode = hca_param->steering_mode; + if (dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED) { + dev->caps.num_qp_per_mgm = dev_cap->fs_max_num_qp_per_entry; + dev->caps.fs_log_max_ucast_qp_range_size = + dev_cap->fs_log_max_ucast_qp_range_size; + } else + dev->caps.num_qp_per_mgm = + 4 * ((1 << hca_param->log_mc_entry_sz)/16 - 2); + + mlx4_dbg(dev, "Steering mode is: %s\n", + mlx4_steering_mode_str(dev->caps.steering_mode)); +} + +static int mlx4_slave_cap(struct mlx4_dev *dev) +{ + int err; + u32 page_size; + struct mlx4_dev_cap dev_cap; + struct mlx4_func_cap func_cap; + struct mlx4_init_hca_param hca_param; + int i; + + memset(&hca_param, 0, sizeof(hca_param)); + err = mlx4_QUERY_HCA(dev, &hca_param); + if (err) { + mlx4_err(dev, "QUERY_HCA command failed, aborting\n"); + return err; + } + + /* fail if the hca has an unknown global capability + * at this time global_caps should be always zeroed + */ + if (hca_param.global_caps) { + mlx4_err(dev, "Unknown hca global capabilities\n"); + return -ENOSYS; + } + + mlx4_log_num_mgm_entry_size = hca_param.log_mc_entry_sz; + + dev->caps.hca_core_clock = hca_param.hca_core_clock; + + memset(&dev_cap, 0, sizeof(dev_cap)); + dev->caps.max_qp_dest_rdma = 1 << hca_param.log_rd_per_qp; + err = mlx4_dev_cap(dev, &dev_cap); + if (err) { + mlx4_err(dev, "QUERY_DEV_CAP command failed, aborting\n"); + return err; + } + + err = mlx4_QUERY_FW(dev); + if (err) + mlx4_err(dev, "QUERY_FW command failed: could not get FW version\n"); + + page_size = ~dev->caps.page_size_cap + 1; + mlx4_warn(dev, "HCA minimum page size:%d\n", page_size); + if (page_size > PAGE_SIZE) { + mlx4_err(dev, "HCA minimum page size of %d bigger than kernel PAGE_SIZE of %ld, aborting\n", + page_size, PAGE_SIZE); + return -ENODEV; + } + + /* slave gets uar page size from QUERY_HCA fw command */ + dev->caps.uar_page_size = 1 << (hca_param.uar_page_sz + 12); + + /* TODO: relax this assumption */ + if (dev->caps.uar_page_size != PAGE_SIZE) { + mlx4_err(dev, "UAR size:%d != kernel PAGE_SIZE of %ld\n", + dev->caps.uar_page_size, PAGE_SIZE); + return -ENODEV; + } + + memset(&func_cap, 0, sizeof(func_cap)); + err = mlx4_QUERY_FUNC_CAP(dev, 0, &func_cap); + if (err) { + mlx4_err(dev, "QUERY_FUNC_CAP general command failed, aborting (%d)\n", + err); + return err; + } + + if ((func_cap.pf_context_behaviour | PF_CONTEXT_BEHAVIOUR_MASK) != + PF_CONTEXT_BEHAVIOUR_MASK) { + mlx4_err(dev, "Unknown pf context behaviour\n"); + return -ENOSYS; + } + + dev->caps.num_ports = func_cap.num_ports; + dev->quotas.qp = func_cap.qp_quota; + dev->quotas.srq = func_cap.srq_quota; + dev->quotas.cq = func_cap.cq_quota; + dev->quotas.mpt = func_cap.mpt_quota; + dev->quotas.mtt = func_cap.mtt_quota; + dev->caps.num_qps = 1 << hca_param.log_num_qps; + dev->caps.num_srqs = 1 << hca_param.log_num_srqs; + dev->caps.num_cqs = 1 << hca_param.log_num_cqs; + dev->caps.num_mpts = 1 << hca_param.log_mpt_sz; + dev->caps.num_eqs = func_cap.max_eq; + dev->caps.reserved_eqs = func_cap.reserved_eq; + dev->caps.num_pds = MLX4_NUM_PDS; + dev->caps.num_mgms = 0; + dev->caps.num_amgms = 0; + + if (dev->caps.num_ports > MLX4_MAX_PORTS) { + mlx4_err(dev, "HCA has %d ports, but we only support %d, aborting\n", + dev->caps.num_ports, MLX4_MAX_PORTS); + return -ENODEV; + } + + dev->caps.qp0_qkey = kcalloc(dev->caps.num_ports, sizeof(u32), GFP_KERNEL); + dev->caps.qp0_tunnel = kcalloc(dev->caps.num_ports, sizeof (u32), GFP_KERNEL); + dev->caps.qp0_proxy = kcalloc(dev->caps.num_ports, sizeof (u32), GFP_KERNEL); + dev->caps.qp1_tunnel = kcalloc(dev->caps.num_ports, sizeof (u32), GFP_KERNEL); + dev->caps.qp1_proxy = kcalloc(dev->caps.num_ports, sizeof (u32), GFP_KERNEL); + + if (!dev->caps.qp0_tunnel || !dev->caps.qp0_proxy || + !dev->caps.qp1_tunnel || !dev->caps.qp1_proxy || + !dev->caps.qp0_qkey) { + err = -ENOMEM; + goto err_mem; + } + + for (i = 1; i <= dev->caps.num_ports; ++i) { + err = mlx4_QUERY_FUNC_CAP(dev, (u32) i, &func_cap); + if (err) { + mlx4_err(dev, "QUERY_FUNC_CAP port command failed for port %d, aborting (%d)\n", + i, err); + goto err_mem; + } + dev->caps.qp0_qkey[i - 1] = func_cap.qp0_qkey; + dev->caps.qp0_tunnel[i - 1] = func_cap.qp0_tunnel_qpn; + dev->caps.qp0_proxy[i - 1] = func_cap.qp0_proxy_qpn; + dev->caps.qp1_tunnel[i - 1] = func_cap.qp1_tunnel_qpn; + dev->caps.qp1_proxy[i - 1] = func_cap.qp1_proxy_qpn; + dev->caps.port_mask[i] = dev->caps.port_type[i]; + dev->caps.phys_port_id[i] = func_cap.phys_port_id; + if (mlx4_get_slave_pkey_gid_tbl_len(dev, i, + &dev->caps.gid_table_len[i], + &dev->caps.pkey_table_len[i])) + goto err_mem; + } + + if (dev->caps.uar_page_size * (dev->caps.num_uars - + dev->caps.reserved_uars) > + pci_resource_len(dev->pdev, 2)) { + mlx4_err(dev, "HCA reported UAR region size of 0x%x bigger than PCI resource 2 size of 0x%llx, aborting\n", + dev->caps.uar_page_size * dev->caps.num_uars, + (unsigned long long) pci_resource_len(dev->pdev, 2)); + goto err_mem; + } + + if (hca_param.dev_cap_enabled & MLX4_DEV_CAP_64B_EQE_ENABLED) { + dev->caps.eqe_size = 64; + dev->caps.eqe_factor = 1; + } else { + dev->caps.eqe_size = 32; + dev->caps.eqe_factor = 0; + } + + if (hca_param.dev_cap_enabled & MLX4_DEV_CAP_64B_CQE_ENABLED) { + dev->caps.cqe_size = 64; + dev->caps.userspace_caps |= MLX4_USER_DEV_CAP_LARGE_CQE; + } else { + dev->caps.cqe_size = 32; + } + + if (hca_param.dev_cap_enabled & MLX4_DEV_CAP_EQE_STRIDE_ENABLED) { + dev->caps.eqe_size = hca_param.eqe_size; + dev->caps.eqe_factor = 0; + } + + if (hca_param.dev_cap_enabled & MLX4_DEV_CAP_CQE_STRIDE_ENABLED) { + dev->caps.cqe_size = hca_param.cqe_size; + /* User still need to know when CQE > 32B */ + dev->caps.userspace_caps |= MLX4_USER_DEV_CAP_LARGE_CQE; + } + + dev->caps.flags2 &= ~MLX4_DEV_CAP_FLAG2_TS; + mlx4_warn(dev, "Timestamping is not supported in slave mode\n"); + + slave_adjust_steering_mode(dev, &dev_cap, &hca_param); + + return 0; + +err_mem: + kfree(dev->caps.qp0_qkey); + kfree(dev->caps.qp0_tunnel); + kfree(dev->caps.qp0_proxy); + kfree(dev->caps.qp1_tunnel); + kfree(dev->caps.qp1_proxy); + dev->caps.qp0_qkey = NULL; + dev->caps.qp0_tunnel = NULL; + dev->caps.qp0_proxy = NULL; + dev->caps.qp1_tunnel = NULL; + dev->caps.qp1_proxy = NULL; + + return err; +} + +static void mlx4_request_modules(struct mlx4_dev *dev) +{ + int port; + int has_ib_port = false; + int has_eth_port = false; +#define EN_DRV_NAME "mlx4_en" +#define IB_DRV_NAME "mlx4_ib" + + for (port = 1; port <= dev->caps.num_ports; port++) { + if (dev->caps.port_type[port] == MLX4_PORT_TYPE_IB) + has_ib_port = true; + else if (dev->caps.port_type[port] == MLX4_PORT_TYPE_ETH) + has_eth_port = true; + } + + if (has_eth_port) + request_module_nowait(EN_DRV_NAME); + if (has_ib_port || (dev->caps.flags & MLX4_DEV_CAP_FLAG_IBOE)) + request_module_nowait(IB_DRV_NAME); +} + +/* + * Change the port configuration of the device. + * Every user of this function must hold the port mutex. + */ +int mlx4_change_port_types(struct mlx4_dev *dev, + enum mlx4_port_type *port_types) +{ + int err = 0; + int change = 0; + int port; + + for (port = 0; port < dev->caps.num_ports; port++) { + /* Change the port type only if the new type is different + * from the current, and not set to Auto */ + if (port_types[port] != dev->caps.port_type[port + 1]) + change = 1; + } + if (change) { + mlx4_unregister_device(dev); + for (port = 1; port <= dev->caps.num_ports; port++) { + mlx4_CLOSE_PORT(dev, port); + dev->caps.port_type[port] = port_types[port - 1]; + err = mlx4_SET_PORT(dev, port, -1); + if (err) { + mlx4_err(dev, "Failed to set port %d, aborting\n", + port); + goto out; + } + } + mlx4_set_port_mask(dev); + err = mlx4_register_device(dev); + if (err) { + mlx4_err(dev, "Failed to register device\n"); + goto out; + } + mlx4_request_modules(dev); + } + +out: + return err; +} + +static ssize_t show_port_type(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct mlx4_port_info *info = container_of(attr, struct mlx4_port_info, + port_attr); + struct mlx4_dev *mdev = info->dev; + char type[8]; + + sprintf(type, "%s", + (mdev->caps.port_type[info->port] == MLX4_PORT_TYPE_IB) ? + "ib" : "eth"); + if (mdev->caps.possible_type[info->port] == MLX4_PORT_TYPE_AUTO) + sprintf(buf, "auto (%s)\n", type); + else + sprintf(buf, "%s\n", type); + + return strlen(buf); +} + +static ssize_t set_port_type(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct mlx4_port_info *info = container_of(attr, struct mlx4_port_info, + port_attr); + struct mlx4_dev *mdev = info->dev; + struct mlx4_priv *priv = mlx4_priv(mdev); + enum mlx4_port_type types[MLX4_MAX_PORTS]; + enum mlx4_port_type new_types[MLX4_MAX_PORTS]; + int i; + int err = 0; + + if (!strcmp(buf, "ib\n")) + info->tmp_type = MLX4_PORT_TYPE_IB; + else if (!strcmp(buf, "eth\n")) + info->tmp_type = MLX4_PORT_TYPE_ETH; + else if (!strcmp(buf, "auto\n")) + info->tmp_type = MLX4_PORT_TYPE_AUTO; + else { + mlx4_err(mdev, "%s is not supported port type\n", buf); + return -EINVAL; + } + + mlx4_stop_sense(mdev); + mutex_lock(&priv->port_mutex); + /* Possible type is always the one that was delivered */ + mdev->caps.possible_type[info->port] = info->tmp_type; + + for (i = 0; i < mdev->caps.num_ports; i++) { + types[i] = priv->port[i+1].tmp_type ? priv->port[i+1].tmp_type : + mdev->caps.possible_type[i+1]; + if (types[i] == MLX4_PORT_TYPE_AUTO) + types[i] = mdev->caps.port_type[i+1]; + } + + if (!(mdev->caps.flags & MLX4_DEV_CAP_FLAG_DPDP) && + !(mdev->caps.flags & MLX4_DEV_CAP_FLAG_SENSE_SUPPORT)) { + for (i = 1; i <= mdev->caps.num_ports; i++) { + if (mdev->caps.possible_type[i] == MLX4_PORT_TYPE_AUTO) { + mdev->caps.possible_type[i] = mdev->caps.port_type[i]; + err = -EINVAL; + } + } + } + if (err) { + mlx4_err(mdev, "Auto sensing is not supported on this HCA. Set only 'eth' or 'ib' for both ports (should be the same)\n"); + goto out; + } + + mlx4_do_sense_ports(mdev, new_types, types); + + err = mlx4_check_port_params(mdev, new_types); + if (err) + goto out; + + /* We are about to apply the changes after the configuration + * was verified, no need to remember the temporary types + * any more */ + for (i = 0; i < mdev->caps.num_ports; i++) + priv->port[i + 1].tmp_type = 0; + + err = mlx4_change_port_types(mdev, new_types); + +out: + mlx4_start_sense(mdev); + mutex_unlock(&priv->port_mutex); + return err ? err : count; +} + +enum ibta_mtu { + IB_MTU_256 = 1, + IB_MTU_512 = 2, + IB_MTU_1024 = 3, + IB_MTU_2048 = 4, + IB_MTU_4096 = 5 +}; + +static inline int int_to_ibta_mtu(int mtu) +{ + switch (mtu) { + case 256: return IB_MTU_256; + case 512: return IB_MTU_512; + case 1024: return IB_MTU_1024; + case 2048: return IB_MTU_2048; + case 4096: return IB_MTU_4096; + default: return -1; + } +} + +static inline int ibta_mtu_to_int(enum ibta_mtu mtu) +{ + switch (mtu) { + case IB_MTU_256: return 256; + case IB_MTU_512: return 512; + case IB_MTU_1024: return 1024; + case IB_MTU_2048: return 2048; + case IB_MTU_4096: return 4096; + default: return -1; + } +} + +static ssize_t show_port_ib_mtu(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct mlx4_port_info *info = container_of(attr, struct mlx4_port_info, + port_mtu_attr); + struct mlx4_dev *mdev = info->dev; + + if (mdev->caps.port_type[info->port] == MLX4_PORT_TYPE_ETH) + mlx4_warn(mdev, "port level mtu is only used for IB ports\n"); + + sprintf(buf, "%d\n", + ibta_mtu_to_int(mdev->caps.port_ib_mtu[info->port])); + return strlen(buf); +} + +static ssize_t set_port_ib_mtu(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct mlx4_port_info *info = container_of(attr, struct mlx4_port_info, + port_mtu_attr); + struct mlx4_dev *mdev = info->dev; + struct mlx4_priv *priv = mlx4_priv(mdev); + int err, port, mtu, ibta_mtu = -1; + + if (mdev->caps.port_type[info->port] == MLX4_PORT_TYPE_ETH) { + mlx4_warn(mdev, "port level mtu is only used for IB ports\n"); + return -EINVAL; + } + + err = kstrtoint(buf, 0, &mtu); + if (!err) + ibta_mtu = int_to_ibta_mtu(mtu); + + if (err || ibta_mtu < 0) { + mlx4_err(mdev, "%s is invalid IBTA mtu\n", buf); + return -EINVAL; + } + + mdev->caps.port_ib_mtu[info->port] = ibta_mtu; + + mlx4_stop_sense(mdev); + mutex_lock(&priv->port_mutex); + mlx4_unregister_device(mdev); + for (port = 1; port <= mdev->caps.num_ports; port++) { + mlx4_CLOSE_PORT(mdev, port); + err = mlx4_SET_PORT(mdev, port, -1); + if (err) { + mlx4_err(mdev, "Failed to set port %d, aborting\n", + port); + goto err_set_port; + } + } + err = mlx4_register_device(mdev); +err_set_port: + mutex_unlock(&priv->port_mutex); + mlx4_start_sense(mdev); + return err ? err : count; +} + +static int mlx4_load_fw(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + int err; + + priv->fw.fw_icm = mlx4_alloc_icm(dev, priv->fw.fw_pages, + GFP_HIGHUSER | __GFP_NOWARN, 0); + if (!priv->fw.fw_icm) { + mlx4_err(dev, "Couldn't allocate FW area, aborting\n"); + return -ENOMEM; + } + + err = mlx4_MAP_FA(dev, priv->fw.fw_icm); + if (err) { + mlx4_err(dev, "MAP_FA command failed, aborting\n"); + goto err_free; + } + + err = mlx4_RUN_FW(dev); + if (err) { + mlx4_err(dev, "RUN_FW command failed, aborting\n"); + goto err_unmap_fa; + } + + return 0; + +err_unmap_fa: + mlx4_UNMAP_FA(dev); + +err_free: + mlx4_free_icm(dev, priv->fw.fw_icm, 0); + return err; +} + +static int mlx4_init_cmpt_table(struct mlx4_dev *dev, u64 cmpt_base, + int cmpt_entry_sz) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + int err; + int num_eqs; + + err = mlx4_init_icm_table(dev, &priv->qp_table.cmpt_table, + cmpt_base + + ((u64) (MLX4_CMPT_TYPE_QP * + cmpt_entry_sz) << MLX4_CMPT_SHIFT), + cmpt_entry_sz, dev->caps.num_qps, + dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW], + 0, 0); + if (err) + goto err; + + err = mlx4_init_icm_table(dev, &priv->srq_table.cmpt_table, + cmpt_base + + ((u64) (MLX4_CMPT_TYPE_SRQ * + cmpt_entry_sz) << MLX4_CMPT_SHIFT), + cmpt_entry_sz, dev->caps.num_srqs, + dev->caps.reserved_srqs, 0, 0); + if (err) + goto err_qp; + + err = mlx4_init_icm_table(dev, &priv->cq_table.cmpt_table, + cmpt_base + + ((u64) (MLX4_CMPT_TYPE_CQ * + cmpt_entry_sz) << MLX4_CMPT_SHIFT), + cmpt_entry_sz, dev->caps.num_cqs, + dev->caps.reserved_cqs, 0, 0); + if (err) + goto err_srq; + + num_eqs = (mlx4_is_master(dev)) ? dev->phys_caps.num_phys_eqs : + dev->caps.num_eqs; + err = mlx4_init_icm_table(dev, &priv->eq_table.cmpt_table, + cmpt_base + + ((u64) (MLX4_CMPT_TYPE_EQ * + cmpt_entry_sz) << MLX4_CMPT_SHIFT), + cmpt_entry_sz, num_eqs, num_eqs, 0, 0); + if (err) + goto err_cq; + + return 0; + +err_cq: + mlx4_cleanup_icm_table(dev, &priv->cq_table.cmpt_table); + +err_srq: + mlx4_cleanup_icm_table(dev, &priv->srq_table.cmpt_table); + +err_qp: + mlx4_cleanup_icm_table(dev, &priv->qp_table.cmpt_table); + +err: + return err; +} + +static int mlx4_init_icm(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap, + struct mlx4_init_hca_param *init_hca, u64 icm_size) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + u64 aux_pages; + int num_eqs; + int err; + + err = mlx4_SET_ICM_SIZE(dev, icm_size, &aux_pages); + if (err) { + mlx4_err(dev, "SET_ICM_SIZE command failed, aborting\n"); + return err; + } + + mlx4_dbg(dev, "%lld KB of HCA context requires %lld KB aux memory\n", + (unsigned long long) icm_size >> 10, + (unsigned long long) aux_pages << 2); + + priv->fw.aux_icm = mlx4_alloc_icm(dev, aux_pages, + GFP_HIGHUSER | __GFP_NOWARN, 0); + if (!priv->fw.aux_icm) { + mlx4_err(dev, "Couldn't allocate aux memory, aborting\n"); + return -ENOMEM; + } + + err = mlx4_MAP_ICM_AUX(dev, priv->fw.aux_icm); + if (err) { + mlx4_err(dev, "MAP_ICM_AUX command failed, aborting\n"); + goto err_free_aux; + } + + err = mlx4_init_cmpt_table(dev, init_hca->cmpt_base, dev_cap->cmpt_entry_sz); + if (err) { + mlx4_err(dev, "Failed to map cMPT context memory, aborting\n"); + goto err_unmap_aux; + } + + + num_eqs = (mlx4_is_master(dev)) ? dev->phys_caps.num_phys_eqs : + dev->caps.num_eqs; + err = mlx4_init_icm_table(dev, &priv->eq_table.table, + init_hca->eqc_base, dev_cap->eqc_entry_sz, + num_eqs, num_eqs, 0, 0); + if (err) { + mlx4_err(dev, "Failed to map EQ context memory, aborting\n"); + goto err_unmap_cmpt; + } + + /* + * Reserved MTT entries must be aligned up to a cacheline + * boundary, since the FW will write to them, while the driver + * writes to all other MTT entries. (The variable + * dev->caps.mtt_entry_sz below is really the MTT segment + * size, not the raw entry size) + */ + dev->caps.reserved_mtts = + ALIGN(dev->caps.reserved_mtts * dev->caps.mtt_entry_sz, + dma_get_cache_alignment()) / dev->caps.mtt_entry_sz; + + err = mlx4_init_icm_table(dev, &priv->mr_table.mtt_table, + init_hca->mtt_base, + dev->caps.mtt_entry_sz, + dev->caps.num_mtts, + dev->caps.reserved_mtts, 1, 0); + if (err) { + mlx4_err(dev, "Failed to map MTT context memory, aborting\n"); + goto err_unmap_eq; + } + + err = mlx4_init_icm_table(dev, &priv->mr_table.dmpt_table, + init_hca->dmpt_base, + dev_cap->dmpt_entry_sz, + dev->caps.num_mpts, + dev->caps.reserved_mrws, 1, 1); + if (err) { + mlx4_err(dev, "Failed to map dMPT context memory, aborting\n"); + goto err_unmap_mtt; + } + + err = mlx4_init_icm_table(dev, &priv->qp_table.qp_table, + init_hca->qpc_base, + dev_cap->qpc_entry_sz, + dev->caps.num_qps, + dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW], + 0, 0); + if (err) { + mlx4_err(dev, "Failed to map QP context memory, aborting\n"); + goto err_unmap_dmpt; + } + + err = mlx4_init_icm_table(dev, &priv->qp_table.auxc_table, + init_hca->auxc_base, + dev_cap->aux_entry_sz, + dev->caps.num_qps, + dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW], + 0, 0); + if (err) { + mlx4_err(dev, "Failed to map AUXC context memory, aborting\n"); + goto err_unmap_qp; + } + + err = mlx4_init_icm_table(dev, &priv->qp_table.altc_table, + init_hca->altc_base, + dev_cap->altc_entry_sz, + dev->caps.num_qps, + dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW], + 0, 0); + if (err) { + mlx4_err(dev, "Failed to map ALTC context memory, aborting\n"); + goto err_unmap_auxc; + } + + err = mlx4_init_icm_table(dev, &priv->qp_table.rdmarc_table, + init_hca->rdmarc_base, + dev_cap->rdmarc_entry_sz << priv->qp_table.rdmarc_shift, + dev->caps.num_qps, + dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW], + 0, 0); + if (err) { + mlx4_err(dev, "Failed to map RDMARC context memory, aborting\n"); + goto err_unmap_altc; + } + + err = mlx4_init_icm_table(dev, &priv->cq_table.table, + init_hca->cqc_base, + dev_cap->cqc_entry_sz, + dev->caps.num_cqs, + dev->caps.reserved_cqs, 0, 0); + if (err) { + mlx4_err(dev, "Failed to map CQ context memory, aborting\n"); + goto err_unmap_rdmarc; + } + + err = mlx4_init_icm_table(dev, &priv->srq_table.table, + init_hca->srqc_base, + dev_cap->srq_entry_sz, + dev->caps.num_srqs, + dev->caps.reserved_srqs, 0, 0); + if (err) { + mlx4_err(dev, "Failed to map SRQ context memory, aborting\n"); + goto err_unmap_cq; + } + + /* + * For flow steering device managed mode it is required to use + * mlx4_init_icm_table. For B0 steering mode it's not strictly + * required, but for simplicity just map the whole multicast + * group table now. The table isn't very big and it's a lot + * easier than trying to track ref counts. + */ + err = mlx4_init_icm_table(dev, &priv->mcg_table.table, + init_hca->mc_base, + mlx4_get_mgm_entry_size(dev), + dev->caps.num_mgms + dev->caps.num_amgms, + dev->caps.num_mgms + dev->caps.num_amgms, + 0, 0); + if (err) { + mlx4_err(dev, "Failed to map MCG context memory, aborting\n"); + goto err_unmap_srq; + } + + return 0; + +err_unmap_srq: + mlx4_cleanup_icm_table(dev, &priv->srq_table.table); + +err_unmap_cq: + mlx4_cleanup_icm_table(dev, &priv->cq_table.table); + +err_unmap_rdmarc: + mlx4_cleanup_icm_table(dev, &priv->qp_table.rdmarc_table); + +err_unmap_altc: + mlx4_cleanup_icm_table(dev, &priv->qp_table.altc_table); + +err_unmap_auxc: + mlx4_cleanup_icm_table(dev, &priv->qp_table.auxc_table); + +err_unmap_qp: + mlx4_cleanup_icm_table(dev, &priv->qp_table.qp_table); + +err_unmap_dmpt: + mlx4_cleanup_icm_table(dev, &priv->mr_table.dmpt_table); + +err_unmap_mtt: + mlx4_cleanup_icm_table(dev, &priv->mr_table.mtt_table); + +err_unmap_eq: + mlx4_cleanup_icm_table(dev, &priv->eq_table.table); + +err_unmap_cmpt: + mlx4_cleanup_icm_table(dev, &priv->eq_table.cmpt_table); + mlx4_cleanup_icm_table(dev, &priv->cq_table.cmpt_table); + mlx4_cleanup_icm_table(dev, &priv->srq_table.cmpt_table); + mlx4_cleanup_icm_table(dev, &priv->qp_table.cmpt_table); + +err_unmap_aux: + mlx4_UNMAP_ICM_AUX(dev); + +err_free_aux: + mlx4_free_icm(dev, priv->fw.aux_icm, 0); + + return err; +} + +static void mlx4_free_icms(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + + mlx4_cleanup_icm_table(dev, &priv->mcg_table.table); + mlx4_cleanup_icm_table(dev, &priv->srq_table.table); + mlx4_cleanup_icm_table(dev, &priv->cq_table.table); + mlx4_cleanup_icm_table(dev, &priv->qp_table.rdmarc_table); + mlx4_cleanup_icm_table(dev, &priv->qp_table.altc_table); + mlx4_cleanup_icm_table(dev, &priv->qp_table.auxc_table); + mlx4_cleanup_icm_table(dev, &priv->qp_table.qp_table); + mlx4_cleanup_icm_table(dev, &priv->mr_table.dmpt_table); + mlx4_cleanup_icm_table(dev, &priv->mr_table.mtt_table); + mlx4_cleanup_icm_table(dev, &priv->eq_table.table); + mlx4_cleanup_icm_table(dev, &priv->eq_table.cmpt_table); + mlx4_cleanup_icm_table(dev, &priv->cq_table.cmpt_table); + mlx4_cleanup_icm_table(dev, &priv->srq_table.cmpt_table); + mlx4_cleanup_icm_table(dev, &priv->qp_table.cmpt_table); + + mlx4_UNMAP_ICM_AUX(dev); + mlx4_free_icm(dev, priv->fw.aux_icm, 0); +} + +static void mlx4_slave_exit(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + + mutex_lock(&priv->cmd.slave_cmd_mutex); + if (mlx4_comm_cmd(dev, MLX4_COMM_CMD_RESET, 0, MLX4_COMM_TIME)) + mlx4_warn(dev, "Failed to close slave function\n"); + mutex_unlock(&priv->cmd.slave_cmd_mutex); +} + +static int map_bf_area(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + resource_size_t bf_start; + resource_size_t bf_len; + int err = 0; + + if (!dev->caps.bf_reg_size) + return -ENXIO; + + bf_start = pci_resource_start(dev->pdev, 2) + + (dev->caps.num_uars << PAGE_SHIFT); + bf_len = pci_resource_len(dev->pdev, 2) - + (dev->caps.num_uars << PAGE_SHIFT); + priv->bf_mapping = io_mapping_create_wc(bf_start, bf_len); + if (!priv->bf_mapping) + err = -ENOMEM; + + return err; +} + +static void unmap_bf_area(struct mlx4_dev *dev) +{ + if (mlx4_priv(dev)->bf_mapping) + io_mapping_free(mlx4_priv(dev)->bf_mapping); +} + +cycle_t mlx4_read_clock(struct mlx4_dev *dev) +{ + u32 clockhi, clocklo, clockhi1; + cycle_t cycles; + int i; + struct mlx4_priv *priv = mlx4_priv(dev); + + for (i = 0; i < 10; i++) { + clockhi = swab32(readl(priv->clock_mapping)); + clocklo = swab32(readl(priv->clock_mapping + 4)); + clockhi1 = swab32(readl(priv->clock_mapping)); + if (clockhi == clockhi1) + break; + } + + cycles = (u64) clockhi << 32 | (u64) clocklo; + + return cycles; +} +EXPORT_SYMBOL_GPL(mlx4_read_clock); + + +static int map_internal_clock(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + + priv->clock_mapping = + ioremap(pci_resource_start(dev->pdev, priv->fw.clock_bar) + + priv->fw.clock_offset, MLX4_CLOCK_SIZE); + + if (!priv->clock_mapping) + return -ENOMEM; + + return 0; +} + +static void unmap_internal_clock(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + + if (priv->clock_mapping) + iounmap(priv->clock_mapping); +} + +static void mlx4_close_hca(struct mlx4_dev *dev) +{ + unmap_internal_clock(dev); + unmap_bf_area(dev); + if (mlx4_is_slave(dev)) + mlx4_slave_exit(dev); + else { + mlx4_CLOSE_HCA(dev, 0); + mlx4_free_icms(dev); + mlx4_UNMAP_FA(dev); + mlx4_free_icm(dev, mlx4_priv(dev)->fw.fw_icm, 0); + } +} + +static int mlx4_init_slave(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + u64 dma = (u64) priv->mfunc.vhcr_dma; + int ret_from_reset = 0; + u32 slave_read; + u32 cmd_channel_ver; + + if (atomic_read(&pf_loading)) { + mlx4_warn(dev, "PF is not ready - Deferring probe\n"); + return -EPROBE_DEFER; + } + + mutex_lock(&priv->cmd.slave_cmd_mutex); + priv->cmd.max_cmds = 1; + mlx4_warn(dev, "Sending reset\n"); + ret_from_reset = mlx4_comm_cmd(dev, MLX4_COMM_CMD_RESET, 0, + MLX4_COMM_TIME); + /* if we are in the middle of flr the slave will try + * NUM_OF_RESET_RETRIES times before leaving.*/ + if (ret_from_reset) { + if (MLX4_DELAY_RESET_SLAVE == ret_from_reset) { + mlx4_warn(dev, "slave is currently in the middle of FLR - Deferring probe\n"); + mutex_unlock(&priv->cmd.slave_cmd_mutex); + return -EPROBE_DEFER; + } else + goto err; + } + + /* check the driver version - the slave I/F revision + * must match the master's */ + slave_read = swab32(readl(&priv->mfunc.comm->slave_read)); + cmd_channel_ver = mlx4_comm_get_version(); + + if (MLX4_COMM_GET_IF_REV(cmd_channel_ver) != + MLX4_COMM_GET_IF_REV(slave_read)) { + mlx4_err(dev, "slave driver version is not supported by the master\n"); + goto err; + } + + mlx4_warn(dev, "Sending vhcr0\n"); + if (mlx4_comm_cmd(dev, MLX4_COMM_CMD_VHCR0, dma >> 48, + MLX4_COMM_TIME)) + goto err; + if (mlx4_comm_cmd(dev, MLX4_COMM_CMD_VHCR1, dma >> 32, + MLX4_COMM_TIME)) + goto err; + if (mlx4_comm_cmd(dev, MLX4_COMM_CMD_VHCR2, dma >> 16, + MLX4_COMM_TIME)) + goto err; + if (mlx4_comm_cmd(dev, MLX4_COMM_CMD_VHCR_EN, dma, MLX4_COMM_TIME)) + goto err; + + mutex_unlock(&priv->cmd.slave_cmd_mutex); + return 0; + +err: + mlx4_comm_cmd(dev, MLX4_COMM_CMD_RESET, 0, 0); + mutex_unlock(&priv->cmd.slave_cmd_mutex); + return -EIO; +} + +static void mlx4_parav_master_pf_caps(struct mlx4_dev *dev) +{ + int i; + + for (i = 1; i <= dev->caps.num_ports; i++) { + if (dev->caps.port_type[i] == MLX4_PORT_TYPE_ETH) + dev->caps.gid_table_len[i] = + mlx4_get_slave_num_gids(dev, 0, i); + else + dev->caps.gid_table_len[i] = 1; + dev->caps.pkey_table_len[i] = + dev->phys_caps.pkey_phys_table_len[i] - 1; + } +} + +static int choose_log_fs_mgm_entry_size(int qp_per_entry) +{ + int i = MLX4_MIN_MGM_LOG_ENTRY_SIZE; + + for (i = MLX4_MIN_MGM_LOG_ENTRY_SIZE; i <= MLX4_MAX_MGM_LOG_ENTRY_SIZE; + i++) { + if (qp_per_entry <= 4 * ((1 << i) / 16 - 2)) + break; + } + + return (i <= MLX4_MAX_MGM_LOG_ENTRY_SIZE) ? i : -1; +} + +static void choose_steering_mode(struct mlx4_dev *dev, + struct mlx4_dev_cap *dev_cap) +{ + if (mlx4_log_num_mgm_entry_size == -1 && + dev_cap->flags2 & MLX4_DEV_CAP_FLAG2_FS_EN && + (!mlx4_is_mfunc(dev) || + (dev_cap->fs_max_num_qp_per_entry >= (dev->num_vfs + 1))) && + choose_log_fs_mgm_entry_size(dev_cap->fs_max_num_qp_per_entry) >= + MLX4_MIN_MGM_LOG_ENTRY_SIZE) { + dev->oper_log_mgm_entry_size = + choose_log_fs_mgm_entry_size(dev_cap->fs_max_num_qp_per_entry); + dev->caps.steering_mode = MLX4_STEERING_MODE_DEVICE_MANAGED; + dev->caps.num_qp_per_mgm = dev_cap->fs_max_num_qp_per_entry; + dev->caps.fs_log_max_ucast_qp_range_size = + dev_cap->fs_log_max_ucast_qp_range_size; + } else { + if (dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_UC_STEER && + dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_MC_STEER) + dev->caps.steering_mode = MLX4_STEERING_MODE_B0; + else { + dev->caps.steering_mode = MLX4_STEERING_MODE_A0; + + if (dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_UC_STEER || + dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_MC_STEER) + mlx4_warn(dev, "Must have both UC_STEER and MC_STEER flags set to use B0 steering - falling back to A0 steering mode\n"); + } + dev->oper_log_mgm_entry_size = + mlx4_log_num_mgm_entry_size > 0 ? + mlx4_log_num_mgm_entry_size : + MLX4_DEFAULT_MGM_LOG_ENTRY_SIZE; + dev->caps.num_qp_per_mgm = mlx4_get_qp_per_mgm(dev); + } + mlx4_dbg(dev, "Steering mode is: %s, oper_log_mgm_entry_size = %d, modparam log_num_mgm_entry_size = %d\n", + mlx4_steering_mode_str(dev->caps.steering_mode), + dev->oper_log_mgm_entry_size, + mlx4_log_num_mgm_entry_size); +} + +static void choose_tunnel_offload_mode(struct mlx4_dev *dev, + struct mlx4_dev_cap *dev_cap) +{ + if (dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED && + dev_cap->flags2 & MLX4_DEV_CAP_FLAG2_VXLAN_OFFLOADS) + dev->caps.tunnel_offload_mode = MLX4_TUNNEL_OFFLOAD_MODE_VXLAN; + else + dev->caps.tunnel_offload_mode = MLX4_TUNNEL_OFFLOAD_MODE_NONE; + + mlx4_dbg(dev, "Tunneling offload mode is: %s\n", (dev->caps.tunnel_offload_mode + == MLX4_TUNNEL_OFFLOAD_MODE_VXLAN) ? "vxlan" : "none"); +} + +static int mlx4_init_hca(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_adapter adapter; + struct mlx4_dev_cap dev_cap; + struct mlx4_mod_stat_cfg mlx4_cfg; + struct mlx4_profile profile; + struct mlx4_init_hca_param init_hca; + u64 icm_size; + int err; + + if (!mlx4_is_slave(dev)) { + err = mlx4_QUERY_FW(dev); + if (err) { + if (err == -EACCES) + mlx4_info(dev, "non-primary physical function, skipping\n"); + else + mlx4_err(dev, "QUERY_FW command failed, aborting\n"); + return err; + } + + err = mlx4_load_fw(dev); + if (err) { + mlx4_err(dev, "Failed to start FW, aborting\n"); + return err; + } + + mlx4_cfg.log_pg_sz_m = 1; + mlx4_cfg.log_pg_sz = 0; + err = mlx4_MOD_STAT_CFG(dev, &mlx4_cfg); + if (err) + mlx4_warn(dev, "Failed to override log_pg_sz parameter\n"); + + err = mlx4_dev_cap(dev, &dev_cap); + if (err) { + mlx4_err(dev, "QUERY_DEV_CAP command failed, aborting\n"); + goto err_stop_fw; + } + + choose_steering_mode(dev, &dev_cap); + choose_tunnel_offload_mode(dev, &dev_cap); + + err = mlx4_get_phys_port_id(dev); + if (err) + mlx4_err(dev, "Fail to get physical port id\n"); + + if (mlx4_is_master(dev)) + mlx4_parav_master_pf_caps(dev); + + if (mlx4_low_memory_profile()) { + mlx4_info(dev, "Running from within kdump kernel. Using low memory profile\n"); + profile = low_mem_profile; + } else { + profile = default_profile; + } + if (dev->caps.steering_mode == + MLX4_STEERING_MODE_DEVICE_MANAGED) + profile.num_mcg = MLX4_FS_NUM_MCG; + + icm_size = mlx4_make_profile(dev, &profile, &dev_cap, + &init_hca); + if ((long long) icm_size < 0) { + err = icm_size; + goto err_stop_fw; + } + + dev->caps.max_fmr_maps = (1 << (32 - ilog2(dev->caps.num_mpts))) - 1; + + init_hca.log_uar_sz = ilog2(dev->caps.num_uars); + init_hca.uar_page_sz = PAGE_SHIFT - 12; + init_hca.mw_enabled = 0; + if (dev->caps.flags & MLX4_DEV_CAP_FLAG_MEM_WINDOW || + dev->caps.bmme_flags & MLX4_BMME_FLAG_TYPE_2_WIN) + init_hca.mw_enabled = INIT_HCA_TPT_MW_ENABLE; + + err = mlx4_init_icm(dev, &dev_cap, &init_hca, icm_size); + if (err) + goto err_stop_fw; + + err = mlx4_INIT_HCA(dev, &init_hca); + if (err) { + mlx4_err(dev, "INIT_HCA command failed, aborting\n"); + goto err_free_icm; + } + /* + * If TS is supported by FW + * read HCA frequency by QUERY_HCA command + */ + if (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_TS) { + memset(&init_hca, 0, sizeof(init_hca)); + err = mlx4_QUERY_HCA(dev, &init_hca); + if (err) { + mlx4_err(dev, "QUERY_HCA command failed, disable timestamp\n"); + dev->caps.flags2 &= ~MLX4_DEV_CAP_FLAG2_TS; + } else { + dev->caps.hca_core_clock = + init_hca.hca_core_clock; + } + + /* In case we got HCA frequency 0 - disable timestamping + * to avoid dividing by zero + */ + if (!dev->caps.hca_core_clock) { + dev->caps.flags2 &= ~MLX4_DEV_CAP_FLAG2_TS; + mlx4_err(dev, + "HCA frequency is 0 - timestamping is not supported\n"); + } else if (map_internal_clock(dev)) { + /* + * Map internal clock, + * in case of failure disable timestamping + */ + dev->caps.flags2 &= ~MLX4_DEV_CAP_FLAG2_TS; + mlx4_err(dev, "Failed to map internal clock. Timestamping is not supported\n"); + } + } + } else { + err = mlx4_init_slave(dev); + if (err) { + if (err != -EPROBE_DEFER) + mlx4_err(dev, "Failed to initialize slave\n"); + return err; + } + + err = mlx4_slave_cap(dev); + if (err) { + mlx4_err(dev, "Failed to obtain slave caps\n"); + goto err_close; + } + } + + if (map_bf_area(dev)) + mlx4_dbg(dev, "Failed to map blue flame area\n"); + + /*Only the master set the ports, all the rest got it from it.*/ + if (!mlx4_is_slave(dev)) + mlx4_set_port_mask(dev); + + err = mlx4_QUERY_ADAPTER(dev, &adapter); + if (err) { + mlx4_err(dev, "QUERY_ADAPTER command failed, aborting\n"); + goto unmap_bf; + } + + priv->eq_table.inta_pin = adapter.inta_pin; + memcpy(dev->board_id, adapter.board_id, sizeof dev->board_id); + + return 0; + +unmap_bf: + unmap_internal_clock(dev); + unmap_bf_area(dev); + + if (mlx4_is_slave(dev)) { + kfree(dev->caps.qp0_qkey); + kfree(dev->caps.qp0_tunnel); + kfree(dev->caps.qp0_proxy); + kfree(dev->caps.qp1_tunnel); + kfree(dev->caps.qp1_proxy); + } + +err_close: + if (mlx4_is_slave(dev)) + mlx4_slave_exit(dev); + else + mlx4_CLOSE_HCA(dev, 0); + +err_free_icm: + if (!mlx4_is_slave(dev)) + mlx4_free_icms(dev); + +err_stop_fw: + if (!mlx4_is_slave(dev)) { + mlx4_UNMAP_FA(dev); + mlx4_free_icm(dev, priv->fw.fw_icm, 0); + } + return err; +} + +static int mlx4_init_counters_table(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + int nent; + + if (!(dev->caps.flags & MLX4_DEV_CAP_FLAG_COUNTERS)) + return -ENOENT; + + nent = dev->caps.max_counters; + return mlx4_bitmap_init(&priv->counters_bitmap, nent, nent - 1, 0, 0); +} + +static void mlx4_cleanup_counters_table(struct mlx4_dev *dev) +{ + mlx4_bitmap_cleanup(&mlx4_priv(dev)->counters_bitmap); +} + +int __mlx4_counter_alloc(struct mlx4_dev *dev, u32 *idx) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + + if (!(dev->caps.flags & MLX4_DEV_CAP_FLAG_COUNTERS)) + return -ENOENT; + + *idx = mlx4_bitmap_alloc(&priv->counters_bitmap); + if (*idx == -1) + return -ENOMEM; + + return 0; +} + +int mlx4_counter_alloc(struct mlx4_dev *dev, u32 *idx) +{ + u64 out_param; + int err; + + if (mlx4_is_mfunc(dev)) { + err = mlx4_cmd_imm(dev, 0, &out_param, RES_COUNTER, + RES_OP_RESERVE, MLX4_CMD_ALLOC_RES, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED); + if (!err) + *idx = get_param_l(&out_param); + + return err; + } + return __mlx4_counter_alloc(dev, idx); +} +EXPORT_SYMBOL_GPL(mlx4_counter_alloc); + +void __mlx4_counter_free(struct mlx4_dev *dev, u32 idx) +{ + mlx4_bitmap_free(&mlx4_priv(dev)->counters_bitmap, idx, MLX4_USE_RR); + return; +} + +void mlx4_counter_free(struct mlx4_dev *dev, u32 idx) +{ + u64 in_param = 0; + + if (mlx4_is_mfunc(dev)) { + set_param_l(&in_param, idx); + mlx4_cmd(dev, in_param, RES_COUNTER, RES_OP_RESERVE, + MLX4_CMD_FREE_RES, MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_WRAPPED); + return; + } + __mlx4_counter_free(dev, idx); +} +EXPORT_SYMBOL_GPL(mlx4_counter_free); + +static int mlx4_setup_hca(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + int err; + int port; + __be32 ib_port_default_caps; + + err = mlx4_init_uar_table(dev); + if (err) { + mlx4_err(dev, "Failed to initialize user access region table, aborting\n"); + return err; + } + + err = mlx4_uar_alloc(dev, &priv->driver_uar); + if (err) { + mlx4_err(dev, "Failed to allocate driver access region, aborting\n"); + goto err_uar_table_free; + } + + priv->kar = ioremap((phys_addr_t) priv->driver_uar.pfn << PAGE_SHIFT, PAGE_SIZE); + if (!priv->kar) { + mlx4_err(dev, "Couldn't map kernel access region, aborting\n"); + err = -ENOMEM; + goto err_uar_free; + } + + err = mlx4_init_pd_table(dev); + if (err) { + mlx4_err(dev, "Failed to initialize protection domain table, aborting\n"); + goto err_kar_unmap; + } + + err = mlx4_init_xrcd_table(dev); + if (err) { + mlx4_err(dev, "Failed to initialize reliable connection domain table, aborting\n"); + goto err_pd_table_free; + } + + err = mlx4_init_mr_table(dev); + if (err) { + mlx4_err(dev, "Failed to initialize memory region table, aborting\n"); + goto err_xrcd_table_free; + } + + if (!mlx4_is_slave(dev)) { + err = mlx4_init_mcg_table(dev); + if (err) { + mlx4_err(dev, "Failed to initialize multicast group table, aborting\n"); + goto err_mr_table_free; + } + err = mlx4_config_mad_demux(dev); + if (err) { + mlx4_err(dev, "Failed in config_mad_demux, aborting\n"); + goto err_mcg_table_free; + } + } + + err = mlx4_init_eq_table(dev); + if (err) { + mlx4_err(dev, "Failed to initialize event queue table, aborting\n"); + goto err_mcg_table_free; + } + + err = mlx4_cmd_use_events(dev); + if (err) { + mlx4_err(dev, "Failed to switch to event-driven firmware commands, aborting\n"); + goto err_eq_table_free; + } + + err = mlx4_NOP(dev); + if (err) { + if (dev->flags & MLX4_FLAG_MSI_X) { + mlx4_warn(dev, "NOP command failed to generate MSI-X interrupt IRQ %d)\n", + priv->eq_table.eq[dev->caps.num_comp_vectors].irq); + mlx4_warn(dev, "Trying again without MSI-X\n"); + } else { + mlx4_err(dev, "NOP command failed to generate interrupt (IRQ %d), aborting\n", + priv->eq_table.eq[dev->caps.num_comp_vectors].irq); + mlx4_err(dev, "BIOS or ACPI interrupt routing problem?\n"); + } + + goto err_cmd_poll; + } + + mlx4_dbg(dev, "NOP command IRQ test passed\n"); + + err = mlx4_init_cq_table(dev); + if (err) { + mlx4_err(dev, "Failed to initialize completion queue table, aborting\n"); + goto err_cmd_poll; + } + + err = mlx4_init_srq_table(dev); + if (err) { + mlx4_err(dev, "Failed to initialize shared receive queue table, aborting\n"); + goto err_cq_table_free; + } + + err = mlx4_init_qp_table(dev); + if (err) { + mlx4_err(dev, "Failed to initialize queue pair table, aborting\n"); + goto err_srq_table_free; + } + + err = mlx4_init_counters_table(dev); + if (err && err != -ENOENT) { + mlx4_err(dev, "Failed to initialize counters table, aborting\n"); + goto err_qp_table_free; + } + + if (!mlx4_is_slave(dev)) { + for (port = 1; port <= dev->caps.num_ports; port++) { + ib_port_default_caps = 0; + err = mlx4_get_port_ib_caps(dev, port, + &ib_port_default_caps); + if (err) + mlx4_warn(dev, "failed to get port %d default ib capabilities (%d). Continuing with caps = 0\n", + port, err); + dev->caps.ib_port_def_cap[port] = ib_port_default_caps; + + /* initialize per-slave default ib port capabilities */ + if (mlx4_is_master(dev)) { + int i; + for (i = 0; i < dev->num_slaves; i++) { + if (i == mlx4_master_func_num(dev)) + continue; + priv->mfunc.master.slave_state[i].ib_cap_mask[port] = + ib_port_default_caps; + } + } + + if (mlx4_is_mfunc(dev)) + dev->caps.port_ib_mtu[port] = IB_MTU_2048; + else + dev->caps.port_ib_mtu[port] = IB_MTU_4096; + + err = mlx4_SET_PORT(dev, port, mlx4_is_master(dev) ? + dev->caps.pkey_table_len[port] : -1); + if (err) { + mlx4_err(dev, "Failed to set port %d, aborting\n", + port); + goto err_counters_table_free; + } + } + } + + return 0; + +err_counters_table_free: + mlx4_cleanup_counters_table(dev); + +err_qp_table_free: + mlx4_cleanup_qp_table(dev); + +err_srq_table_free: + mlx4_cleanup_srq_table(dev); + +err_cq_table_free: + mlx4_cleanup_cq_table(dev); + +err_cmd_poll: + mlx4_cmd_use_polling(dev); + +err_eq_table_free: + mlx4_cleanup_eq_table(dev); + +err_mcg_table_free: + if (!mlx4_is_slave(dev)) + mlx4_cleanup_mcg_table(dev); + +err_mr_table_free: + mlx4_cleanup_mr_table(dev); + +err_xrcd_table_free: + mlx4_cleanup_xrcd_table(dev); + +err_pd_table_free: + mlx4_cleanup_pd_table(dev); + +err_kar_unmap: + iounmap(priv->kar); + +err_uar_free: + mlx4_uar_free(dev, &priv->driver_uar); + +err_uar_table_free: + mlx4_cleanup_uar_table(dev); + return err; +} + +static void mlx4_enable_msi_x(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct msix_entry *entries; + int nreq = min_t(int, dev->caps.num_ports * + min_t(int, num_online_cpus() + 1, + MAX_MSIX_P_PORT) + MSIX_LEGACY_SZ, MAX_MSIX); + int i; + + if (msi_x) { + nreq = min_t(int, dev->caps.num_eqs - dev->caps.reserved_eqs, + nreq); + + entries = kcalloc(nreq, sizeof *entries, GFP_KERNEL); + if (!entries) + goto no_msi; + + for (i = 0; i < nreq; ++i) + entries[i].entry = i; + + nreq = pci_enable_msix_range(dev->pdev, entries, 2, nreq); + + if (nreq < 0) { + kfree(entries); + goto no_msi; + } else if (nreq < MSIX_LEGACY_SZ + + dev->caps.num_ports * MIN_MSIX_P_PORT) { + /*Working in legacy mode , all EQ's shared*/ + dev->caps.comp_pool = 0; + dev->caps.num_comp_vectors = nreq - 1; + } else { + dev->caps.comp_pool = nreq - MSIX_LEGACY_SZ; + dev->caps.num_comp_vectors = MSIX_LEGACY_SZ - 1; + } + for (i = 0; i < nreq; ++i) + priv->eq_table.eq[i].irq = entries[i].vector; + + dev->flags |= MLX4_FLAG_MSI_X; + + kfree(entries); + return; + } + +no_msi: + dev->caps.num_comp_vectors = 1; + dev->caps.comp_pool = 0; + + for (i = 0; i < 2; ++i) + priv->eq_table.eq[i].irq = dev->pdev->irq; +} + +static int mlx4_init_port_info(struct mlx4_dev *dev, int port) +{ + struct mlx4_port_info *info = &mlx4_priv(dev)->port[port]; + int err = 0; + + info->dev = dev; + info->port = port; + if (!mlx4_is_slave(dev)) { + mlx4_init_mac_table(dev, &info->mac_table); + mlx4_init_vlan_table(dev, &info->vlan_table); + mlx4_init_roce_gid_table(dev, &info->gid_table); + info->base_qpn = mlx4_get_base_qpn(dev, port); + } + + sprintf(info->dev_name, "mlx4_port%d", port); + info->port_attr.attr.name = info->dev_name; + if (mlx4_is_mfunc(dev)) + info->port_attr.attr.mode = S_IRUGO; + else { + info->port_attr.attr.mode = S_IRUGO | S_IWUSR; + info->port_attr.store = set_port_type; + } + info->port_attr.show = show_port_type; + sysfs_attr_init(&info->port_attr.attr); + + err = device_create_file(&dev->pdev->dev, &info->port_attr); + if (err) { + mlx4_err(dev, "Failed to create file for port %d\n", port); + info->port = -1; + } + + sprintf(info->dev_mtu_name, "mlx4_port%d_mtu", port); + info->port_mtu_attr.attr.name = info->dev_mtu_name; + if (mlx4_is_mfunc(dev)) + info->port_mtu_attr.attr.mode = S_IRUGO; + else { + info->port_mtu_attr.attr.mode = S_IRUGO | S_IWUSR; + info->port_mtu_attr.store = set_port_ib_mtu; + } + info->port_mtu_attr.show = show_port_ib_mtu; + sysfs_attr_init(&info->port_mtu_attr.attr); + + err = device_create_file(&dev->pdev->dev, &info->port_mtu_attr); + if (err) { + mlx4_err(dev, "Failed to create mtu file for port %d\n", port); + device_remove_file(&info->dev->pdev->dev, &info->port_attr); + info->port = -1; + } + + return err; +} + +static void mlx4_cleanup_port_info(struct mlx4_port_info *info) +{ + if (info->port < 0) + return; + + device_remove_file(&info->dev->pdev->dev, &info->port_attr); + device_remove_file(&info->dev->pdev->dev, &info->port_mtu_attr); +} + +static int mlx4_init_steering(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + int num_entries = dev->caps.num_ports; + int i, j; + + priv->steer = kzalloc(sizeof(struct mlx4_steer) * num_entries, GFP_KERNEL); + if (!priv->steer) + return -ENOMEM; + + for (i = 0; i < num_entries; i++) + for (j = 0; j < MLX4_NUM_STEERS; j++) { + INIT_LIST_HEAD(&priv->steer[i].promisc_qps[j]); + INIT_LIST_HEAD(&priv->steer[i].steer_entries[j]); + } + return 0; +} + +static void mlx4_clear_steering(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_steer_index *entry, *tmp_entry; + struct mlx4_promisc_qp *pqp, *tmp_pqp; + int num_entries = dev->caps.num_ports; + int i, j; + + for (i = 0; i < num_entries; i++) { + for (j = 0; j < MLX4_NUM_STEERS; j++) { + list_for_each_entry_safe(pqp, tmp_pqp, + &priv->steer[i].promisc_qps[j], + list) { + list_del(&pqp->list); + kfree(pqp); + } + list_for_each_entry_safe(entry, tmp_entry, + &priv->steer[i].steer_entries[j], + list) { + list_del(&entry->list); + list_for_each_entry_safe(pqp, tmp_pqp, + &entry->duplicates, + list) { + list_del(&pqp->list); + kfree(pqp); + } + kfree(entry); + } + } + } + kfree(priv->steer); +} + +static int extended_func_num(struct pci_dev *pdev) +{ + return PCI_SLOT(pdev->devfn) * 8 + PCI_FUNC(pdev->devfn); +} + +#define MLX4_OWNER_BASE 0x8069c +#define MLX4_OWNER_SIZE 4 + +static int mlx4_get_ownership(struct mlx4_dev *dev) +{ + void __iomem *owner; + u32 ret; + + if (pci_channel_offline(dev->pdev)) + return -EIO; + + owner = ioremap(pci_resource_start(dev->pdev, 0) + MLX4_OWNER_BASE, + MLX4_OWNER_SIZE); + if (!owner) { + mlx4_err(dev, "Failed to obtain ownership bit\n"); + return -ENOMEM; + } + + ret = readl(owner); + iounmap(owner); + return (int) !!ret; +} + +static void mlx4_free_ownership(struct mlx4_dev *dev) +{ + void __iomem *owner; + + if (pci_channel_offline(dev->pdev)) + return; + + owner = ioremap(pci_resource_start(dev->pdev, 0) + MLX4_OWNER_BASE, + MLX4_OWNER_SIZE); + if (!owner) { + mlx4_err(dev, "Failed to obtain ownership bit\n"); + return; + } + writel(0, owner); + msleep(1000); + iounmap(owner); +} + +static int mlx4_load_one(struct pci_dev *pdev, int pci_dev_data, + int total_vfs, int *nvfs, struct mlx4_priv *priv) +{ + struct mlx4_dev *dev; + unsigned sum = 0; + int err; + int port; + int i; + int existing_vfs = 0; + + dev = &priv->dev; + + INIT_LIST_HEAD(&priv->ctx_list); + spin_lock_init(&priv->ctx_lock); + + mutex_init(&priv->port_mutex); + + INIT_LIST_HEAD(&priv->pgdir_list); + mutex_init(&priv->pgdir_mutex); + + INIT_LIST_HEAD(&priv->bf_list); + mutex_init(&priv->bf_mutex); + + dev->rev_id = pdev->revision; + dev->numa_node = dev_to_node(&pdev->dev); + + /* Detect if this device is a virtual function */ + if (pci_dev_data & MLX4_PCI_DEV_IS_VF) { + mlx4_warn(dev, "Detected virtual function - running in slave mode\n"); + dev->flags |= MLX4_FLAG_SLAVE; + } else { + /* We reset the device and enable SRIOV only for physical + * devices. Try to claim ownership on the device; + * if already taken, skip -- do not allow multiple PFs */ + err = mlx4_get_ownership(dev); + if (err) { + if (err < 0) + return err; + else { + mlx4_warn(dev, "Multiple PFs not yet supported - Skipping PF\n"); + return -EINVAL; + } + } + + if (total_vfs) { + mlx4_warn(dev, "Enabling SR-IOV with %d VFs\n", + total_vfs); + dev->dev_vfs = kzalloc( + total_vfs * sizeof(*dev->dev_vfs), + GFP_KERNEL); + if (NULL == dev->dev_vfs) { + mlx4_err(dev, "Failed to allocate memory for VFs\n"); + err = -ENOMEM; + goto err_free_own; + } else { + atomic_inc(&pf_loading); + existing_vfs = pci_num_vf(pdev); + if (existing_vfs) { + err = 0; + if (existing_vfs != total_vfs) + mlx4_err(dev, "SR-IOV was already enabled, but with num_vfs (%d) different than requested (%d)\n", + existing_vfs, total_vfs); + } else { + err = pci_enable_sriov(pdev, total_vfs); + } + if (err) { + mlx4_err(dev, "Failed to enable SR-IOV, continuing without SR-IOV (err = %d)\n", + err); + atomic_dec(&pf_loading); + } else { + mlx4_warn(dev, "Running in master mode\n"); + dev->flags |= MLX4_FLAG_SRIOV | + MLX4_FLAG_MASTER; + dev->num_vfs = total_vfs; + } + } + } + + atomic_set(&priv->opreq_count, 0); + INIT_WORK(&priv->opreq_task, mlx4_opreq_action); + + /* + * Now reset the HCA before we touch the PCI capabilities or + * attempt a firmware command, since a boot ROM may have left + * the HCA in an undefined state. + */ + err = mlx4_reset(dev); + if (err) { + mlx4_err(dev, "Failed to reset HCA, aborting\n"); + goto err_sriov; + } + } + +slave_start: + err = mlx4_cmd_init(dev); + if (err) { + mlx4_err(dev, "Failed to init command interface, aborting\n"); + goto err_sriov; + } + + /* In slave functions, the communication channel must be initialized + * before posting commands. Also, init num_slaves before calling + * mlx4_init_hca */ + if (mlx4_is_mfunc(dev)) { + if (mlx4_is_master(dev)) + dev->num_slaves = MLX4_MAX_NUM_SLAVES; + else { + dev->num_slaves = 0; + err = mlx4_multi_func_init(dev); + if (err) { + mlx4_err(dev, "Failed to init slave mfunc interface, aborting\n"); + goto err_cmd; + } + } + } + + err = mlx4_init_hca(dev); + if (err) { + if (err == -EACCES) { + /* Not primary Physical function + * Running in slave mode */ + mlx4_cmd_cleanup(dev); + dev->flags |= MLX4_FLAG_SLAVE; + dev->flags &= ~MLX4_FLAG_MASTER; + goto slave_start; + } else + goto err_mfunc; + } + + /* check if the device is functioning at its maximum possible speed. + * No return code for this call, just warn the user in case of PCI + * express device capabilities are under-satisfied by the bus. + */ + if (!mlx4_is_slave(dev)) + mlx4_check_pcie_caps(dev); + + /* In master functions, the communication channel must be initialized + * after obtaining its address from fw */ + if (mlx4_is_master(dev)) { + int ib_ports = 0; + + mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB) + ib_ports++; + + if (ib_ports && + (num_vfs_argc > 1 || probe_vfs_argc > 1)) { + mlx4_err(dev, + "Invalid syntax of num_vfs/probe_vfs with IB port - single port VFs syntax is only supported when all ports are configured as ethernet\n"); + err = -EINVAL; + goto err_close; + } + if (dev->caps.num_ports < 2 && + num_vfs_argc > 1) { + err = -EINVAL; + mlx4_err(dev, + "Error: Trying to configure VFs on port 2, but HCA has only %d physical ports\n", + dev->caps.num_ports); + goto err_close; + } + memcpy(dev->nvfs, nvfs, sizeof(dev->nvfs)); + + for (i = 0; i < sizeof(dev->nvfs)/sizeof(dev->nvfs[0]); i++) { + unsigned j; + + for (j = 0; j < dev->nvfs[i]; ++sum, ++j) { + dev->dev_vfs[sum].min_port = i < 2 ? i + 1 : 1; + dev->dev_vfs[sum].n_ports = i < 2 ? 1 : + dev->caps.num_ports; + } + } + + /* In master functions, the communication channel + * must be initialized after obtaining its address from fw + */ + err = mlx4_multi_func_init(dev); + if (err) { + mlx4_err(dev, "Failed to init master mfunc interface, aborting.\n"); + goto err_close; + } + } + + err = mlx4_alloc_eq_table(dev); + if (err) + goto err_master_mfunc; + + priv->msix_ctl.pool_bm = 0; + mutex_init(&priv->msix_ctl.pool_lock); + + mlx4_enable_msi_x(dev); + if ((mlx4_is_mfunc(dev)) && + !(dev->flags & MLX4_FLAG_MSI_X)) { + err = -ENOSYS; + mlx4_err(dev, "INTx is not supported in multi-function mode, aborting\n"); + goto err_free_eq; + } + + if (!mlx4_is_slave(dev)) { + err = mlx4_init_steering(dev); + if (err) + goto err_disable_msix; + } + + err = mlx4_setup_hca(dev); + if (err == -EBUSY && (dev->flags & MLX4_FLAG_MSI_X) && + !mlx4_is_mfunc(dev)) { + dev->flags &= ~MLX4_FLAG_MSI_X; + dev->caps.num_comp_vectors = 1; + dev->caps.comp_pool = 0; + pci_disable_msix(pdev); + err = mlx4_setup_hca(dev); + } + + if (err) + goto err_steer; + + mlx4_init_quotas(dev); + + for (port = 1; port <= dev->caps.num_ports; port++) { + err = mlx4_init_port_info(dev, port); + if (err) + goto err_port; + } + + err = mlx4_register_device(dev); + if (err) + goto err_port; + + mlx4_request_modules(dev); + + mlx4_sense_init(dev); + mlx4_start_sense(dev); + + priv->removed = 0; + + if (mlx4_is_master(dev) && dev->num_vfs) + atomic_dec(&pf_loading); + + return 0; + +err_port: + for (--port; port >= 1; --port) + mlx4_cleanup_port_info(&priv->port[port]); + + mlx4_cleanup_counters_table(dev); + mlx4_cleanup_qp_table(dev); + mlx4_cleanup_srq_table(dev); + mlx4_cleanup_cq_table(dev); + mlx4_cmd_use_polling(dev); + mlx4_cleanup_eq_table(dev); + mlx4_cleanup_mcg_table(dev); + mlx4_cleanup_mr_table(dev); + mlx4_cleanup_xrcd_table(dev); + mlx4_cleanup_pd_table(dev); + mlx4_cleanup_uar_table(dev); + +err_steer: + if (!mlx4_is_slave(dev)) + mlx4_clear_steering(dev); + +err_disable_msix: + if (dev->flags & MLX4_FLAG_MSI_X) + pci_disable_msix(pdev); + +err_free_eq: + mlx4_free_eq_table(dev); + +err_master_mfunc: + if (mlx4_is_master(dev)) + mlx4_multi_func_cleanup(dev); + + if (mlx4_is_slave(dev)) { + kfree(dev->caps.qp0_qkey); + kfree(dev->caps.qp0_tunnel); + kfree(dev->caps.qp0_proxy); + kfree(dev->caps.qp1_tunnel); + kfree(dev->caps.qp1_proxy); + } + +err_close: + mlx4_close_hca(dev); + +err_mfunc: + if (mlx4_is_slave(dev)) + mlx4_multi_func_cleanup(dev); + +err_cmd: + mlx4_cmd_cleanup(dev); + +err_sriov: + if (dev->flags & MLX4_FLAG_SRIOV && !existing_vfs) + pci_disable_sriov(pdev); + + if (mlx4_is_master(dev) && dev->num_vfs) + atomic_dec(&pf_loading); + + kfree(priv->dev.dev_vfs); + +err_free_own: + if (!mlx4_is_slave(dev)) + mlx4_free_ownership(dev); + + return err; +} + +static int __mlx4_init_one(struct pci_dev *pdev, int pci_dev_data, + struct mlx4_priv *priv) +{ + int err; + int nvfs[MLX4_MAX_PORTS + 1] = {0, 0, 0}; + int prb_vf[MLX4_MAX_PORTS + 1] = {0, 0, 0}; + const int param_map[MLX4_MAX_PORTS + 1][MLX4_MAX_PORTS + 1] = { + {2, 0, 0}, {0, 1, 2}, {0, 1, 2} }; + unsigned total_vfs = 0; + unsigned int i; + + pr_info(DRV_NAME ": Initializing %s\n", pci_name(pdev)); + + err = pci_enable_device(pdev); + if (err) { + dev_err(&pdev->dev, "Cannot enable PCI device, aborting\n"); + return err; + } + + /* Due to requirement that all VFs and the PF are *guaranteed* 2 MACS + * per port, we must limit the number of VFs to 63 (since their are + * 128 MACs) + */ + for (i = 0; i < sizeof(nvfs)/sizeof(nvfs[0]) && i < num_vfs_argc; + total_vfs += nvfs[param_map[num_vfs_argc - 1][i]], i++) { + nvfs[param_map[num_vfs_argc - 1][i]] = num_vfs[i]; + if (nvfs[i] < 0) { + dev_err(&pdev->dev, "num_vfs module parameter cannot be negative\n"); + err = -EINVAL; + goto err_disable_pdev; + } + } + for (i = 0; i < sizeof(prb_vf)/sizeof(prb_vf[0]) && i < probe_vfs_argc; + i++) { + prb_vf[param_map[probe_vfs_argc - 1][i]] = probe_vf[i]; + if (prb_vf[i] < 0 || prb_vf[i] > nvfs[i]) { + dev_err(&pdev->dev, "probe_vf module parameter cannot be negative or greater than num_vfs\n"); + err = -EINVAL; + goto err_disable_pdev; + } + } + if (total_vfs >= MLX4_MAX_NUM_VF) { + dev_err(&pdev->dev, + "Requested more VF's (%d) than allowed (%d)\n", + total_vfs, MLX4_MAX_NUM_VF - 1); + err = -EINVAL; + goto err_disable_pdev; + } + + for (i = 0; i < MLX4_MAX_PORTS; i++) { + if (nvfs[i] + nvfs[2] >= MLX4_MAX_NUM_VF_P_PORT) { + dev_err(&pdev->dev, + "Requested more VF's (%d) for port (%d) than allowed (%d)\n", + nvfs[i] + nvfs[2], i + 1, + MLX4_MAX_NUM_VF_P_PORT - 1); + err = -EINVAL; + goto err_disable_pdev; + } + } + + /* Check for BARs. */ + if (!(pci_dev_data & MLX4_PCI_DEV_IS_VF) && + !(pci_resource_flags(pdev, 0) & IORESOURCE_MEM)) { + dev_err(&pdev->dev, "Missing DCS, aborting (driver_data: 0x%x, pci_resource_flags(pdev, 0):0x%lx)\n", + pci_dev_data, pci_resource_flags(pdev, 0)); + err = -ENODEV; + goto err_disable_pdev; + } + if (!(pci_resource_flags(pdev, 2) & IORESOURCE_MEM)) { + dev_err(&pdev->dev, "Missing UAR, aborting\n"); + err = -ENODEV; + goto err_disable_pdev; + } + + err = pci_request_regions(pdev, DRV_NAME); + if (err) { + dev_err(&pdev->dev, "Couldn't get PCI resources, aborting\n"); + goto err_disable_pdev; + } + + pci_set_master(pdev); + + err = pci_set_dma_mask(pdev, DMA_BIT_MASK(64)); + if (err) { + dev_warn(&pdev->dev, "Warning: couldn't set 64-bit PCI DMA mask\n"); + err = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); + if (err) { + dev_err(&pdev->dev, "Can't set PCI DMA mask, aborting\n"); + goto err_release_regions; + } + } + err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64)); + if (err) { + dev_warn(&pdev->dev, "Warning: couldn't set 64-bit consistent PCI DMA mask\n"); + err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32)); + if (err) { + dev_err(&pdev->dev, "Can't set consistent PCI DMA mask, aborting\n"); + goto err_release_regions; + } + } + + /* Allow large DMA segments, up to the firmware limit of 1 GB */ + dma_set_max_seg_size(&pdev->dev, 1024 * 1024 * 1024); + /* Detect if this device is a virtual function */ + if (pci_dev_data & MLX4_PCI_DEV_IS_VF) { + /* When acting as pf, we normally skip vfs unless explicitly + * requested to probe them. + */ + if (total_vfs) { + unsigned vfs_offset = 0; + + for (i = 0; i < sizeof(nvfs)/sizeof(nvfs[0]) && + vfs_offset + nvfs[i] < extended_func_num(pdev); + vfs_offset += nvfs[i], i++) + ; + if (i == sizeof(nvfs)/sizeof(nvfs[0])) { + err = -ENODEV; + goto err_release_regions; + } + if ((extended_func_num(pdev) - vfs_offset) + > prb_vf[i]) { + dev_warn(&pdev->dev, "Skipping virtual function:%d\n", + extended_func_num(pdev)); + err = -ENODEV; + goto err_release_regions; + } + } + } + + err = mlx4_load_one(pdev, pci_dev_data, total_vfs, nvfs, priv); + if (err) + goto err_release_regions; + return 0; + +err_release_regions: + pci_release_regions(pdev); + +err_disable_pdev: + pci_disable_device(pdev); + pci_set_drvdata(pdev, NULL); + return err; +} + +static int mlx4_init_one(struct pci_dev *pdev, const struct pci_device_id *id) +{ + struct mlx4_priv *priv; + struct mlx4_dev *dev; + int ret; + + printk_once(KERN_INFO "%s", mlx4_version); + + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + dev = &priv->dev; + dev->pdev = pdev; + pci_set_drvdata(pdev, dev); + priv->pci_dev_data = id->driver_data; + + ret = __mlx4_init_one(pdev, id->driver_data, priv); + if (ret) + kfree(priv); + + return ret; +} + +static void mlx4_unload_one(struct pci_dev *pdev) +{ + struct mlx4_dev *dev = pci_get_drvdata(pdev); + struct mlx4_priv *priv = mlx4_priv(dev); + int pci_dev_data; + int p; + int active_vfs = 0; + + if (priv->removed) + return; + + pci_dev_data = priv->pci_dev_data; + + /* Disabling SR-IOV is not allowed while there are active vf's */ + if (mlx4_is_master(dev)) { + active_vfs = mlx4_how_many_lives_vf(dev); + if (active_vfs) { + pr_warn("Removing PF when there are active VF's !!\n"); + pr_warn("Will not disable SR-IOV.\n"); + } + } + mlx4_stop_sense(dev); + mlx4_unregister_device(dev); + + for (p = 1; p <= dev->caps.num_ports; p++) { + mlx4_cleanup_port_info(&priv->port[p]); + mlx4_CLOSE_PORT(dev, p); + } + + if (mlx4_is_master(dev)) + mlx4_free_resource_tracker(dev, + RES_TR_FREE_SLAVES_ONLY); + + mlx4_cleanup_counters_table(dev); + mlx4_cleanup_qp_table(dev); + mlx4_cleanup_srq_table(dev); + mlx4_cleanup_cq_table(dev); + mlx4_cmd_use_polling(dev); + mlx4_cleanup_eq_table(dev); + mlx4_cleanup_mcg_table(dev); + mlx4_cleanup_mr_table(dev); + mlx4_cleanup_xrcd_table(dev); + mlx4_cleanup_pd_table(dev); + + if (mlx4_is_master(dev)) + mlx4_free_resource_tracker(dev, + RES_TR_FREE_STRUCTS_ONLY); + + iounmap(priv->kar); + mlx4_uar_free(dev, &priv->driver_uar); + mlx4_cleanup_uar_table(dev); + if (!mlx4_is_slave(dev)) + mlx4_clear_steering(dev); + mlx4_free_eq_table(dev); + if (mlx4_is_master(dev)) + mlx4_multi_func_cleanup(dev); + mlx4_close_hca(dev); + if (mlx4_is_slave(dev)) + mlx4_multi_func_cleanup(dev); + mlx4_cmd_cleanup(dev); + + if (dev->flags & MLX4_FLAG_MSI_X) + pci_disable_msix(pdev); + if (dev->flags & MLX4_FLAG_SRIOV && !active_vfs) { + mlx4_warn(dev, "Disabling SR-IOV\n"); + pci_disable_sriov(pdev); + dev->num_vfs = 0; + } + + if (!mlx4_is_slave(dev)) + mlx4_free_ownership(dev); + + kfree(dev->caps.qp0_qkey); + kfree(dev->caps.qp0_tunnel); + kfree(dev->caps.qp0_proxy); + kfree(dev->caps.qp1_tunnel); + kfree(dev->caps.qp1_proxy); + kfree(dev->dev_vfs); + + memset(priv, 0, sizeof(*priv)); + priv->pci_dev_data = pci_dev_data; + priv->removed = 1; +} + +static void mlx4_remove_one(struct pci_dev *pdev) +{ + struct mlx4_dev *dev = pci_get_drvdata(pdev); + struct mlx4_priv *priv = mlx4_priv(dev); + + mlx4_unload_one(pdev); + pci_release_regions(pdev); + pci_disable_device(pdev); + kfree(priv); + pci_set_drvdata(pdev, NULL); +} + +int mlx4_restart_one(struct pci_dev *pdev) +{ + struct mlx4_dev *dev = pci_get_drvdata(pdev); + struct mlx4_priv *priv = mlx4_priv(dev); + int nvfs[MLX4_MAX_PORTS + 1] = {0, 0, 0}; + int pci_dev_data, err, total_vfs; + + pci_dev_data = priv->pci_dev_data; + total_vfs = dev->num_vfs; + memcpy(nvfs, dev->nvfs, sizeof(dev->nvfs)); + + mlx4_unload_one(pdev); + err = mlx4_load_one(pdev, pci_dev_data, total_vfs, nvfs, priv); + if (err) { + mlx4_err(dev, "%s: ERROR: mlx4_load_one failed, pci_name=%s, err=%d\n", + __func__, pci_name(pdev), err); + return err; + } + + return err; +} + +static const struct pci_device_id mlx4_pci_table[] = { + /* MT25408 "Hermon" SDR */ + { PCI_VDEVICE(MELLANOX, 0x6340), MLX4_PCI_DEV_FORCE_SENSE_PORT }, + /* MT25408 "Hermon" DDR */ + { PCI_VDEVICE(MELLANOX, 0x634a), MLX4_PCI_DEV_FORCE_SENSE_PORT }, + /* MT25408 "Hermon" QDR */ + { PCI_VDEVICE(MELLANOX, 0x6354), MLX4_PCI_DEV_FORCE_SENSE_PORT }, + /* MT25408 "Hermon" DDR PCIe gen2 */ + { PCI_VDEVICE(MELLANOX, 0x6732), MLX4_PCI_DEV_FORCE_SENSE_PORT }, + /* MT25408 "Hermon" QDR PCIe gen2 */ + { PCI_VDEVICE(MELLANOX, 0x673c), MLX4_PCI_DEV_FORCE_SENSE_PORT }, + /* MT25408 "Hermon" EN 10GigE */ + { PCI_VDEVICE(MELLANOX, 0x6368), MLX4_PCI_DEV_FORCE_SENSE_PORT }, + /* MT25408 "Hermon" EN 10GigE PCIe gen2 */ + { PCI_VDEVICE(MELLANOX, 0x6750), MLX4_PCI_DEV_FORCE_SENSE_PORT }, + /* MT25458 ConnectX EN 10GBASE-T 10GigE */ + { PCI_VDEVICE(MELLANOX, 0x6372), MLX4_PCI_DEV_FORCE_SENSE_PORT }, + /* MT25458 ConnectX EN 10GBASE-T+Gen2 10GigE */ + { PCI_VDEVICE(MELLANOX, 0x675a), MLX4_PCI_DEV_FORCE_SENSE_PORT }, + /* MT26468 ConnectX EN 10GigE PCIe gen2*/ + { PCI_VDEVICE(MELLANOX, 0x6764), MLX4_PCI_DEV_FORCE_SENSE_PORT }, + /* MT26438 ConnectX EN 40GigE PCIe gen2 5GT/s */ + { PCI_VDEVICE(MELLANOX, 0x6746), MLX4_PCI_DEV_FORCE_SENSE_PORT }, + /* MT26478 ConnectX2 40GigE PCIe gen2 */ + { PCI_VDEVICE(MELLANOX, 0x676e), MLX4_PCI_DEV_FORCE_SENSE_PORT }, + /* MT25400 Family [ConnectX-2 Virtual Function] */ + { PCI_VDEVICE(MELLANOX, 0x1002), MLX4_PCI_DEV_IS_VF }, + /* MT27500 Family [ConnectX-3] */ + { PCI_VDEVICE(MELLANOX, 0x1003), 0 }, + /* MT27500 Family [ConnectX-3 Virtual Function] */ + { PCI_VDEVICE(MELLANOX, 0x1004), MLX4_PCI_DEV_IS_VF }, + { PCI_VDEVICE(MELLANOX, 0x1005), 0 }, /* MT27510 Family */ + { PCI_VDEVICE(MELLANOX, 0x1006), 0 }, /* MT27511 Family */ + { PCI_VDEVICE(MELLANOX, 0x1007), 0 }, /* MT27520 Family */ + { PCI_VDEVICE(MELLANOX, 0x1008), 0 }, /* MT27521 Family */ + { PCI_VDEVICE(MELLANOX, 0x1009), 0 }, /* MT27530 Family */ + { PCI_VDEVICE(MELLANOX, 0x100a), 0 }, /* MT27531 Family */ + { PCI_VDEVICE(MELLANOX, 0x100b), 0 }, /* MT27540 Family */ + { PCI_VDEVICE(MELLANOX, 0x100c), 0 }, /* MT27541 Family */ + { PCI_VDEVICE(MELLANOX, 0x100d), 0 }, /* MT27550 Family */ + { PCI_VDEVICE(MELLANOX, 0x100e), 0 }, /* MT27551 Family */ + { PCI_VDEVICE(MELLANOX, 0x100f), 0 }, /* MT27560 Family */ + { PCI_VDEVICE(MELLANOX, 0x1010), 0 }, /* MT27561 Family */ + { 0, } +}; + +MODULE_DEVICE_TABLE(pci, mlx4_pci_table); + +static pci_ers_result_t mlx4_pci_err_detected(struct pci_dev *pdev, + pci_channel_state_t state) +{ + mlx4_unload_one(pdev); + + return state == pci_channel_io_perm_failure ? + PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_NEED_RESET; +} + +static pci_ers_result_t mlx4_pci_slot_reset(struct pci_dev *pdev) +{ + struct mlx4_dev *dev = pci_get_drvdata(pdev); + struct mlx4_priv *priv = mlx4_priv(dev); + int ret; + + ret = __mlx4_init_one(pdev, priv->pci_dev_data, priv); + + return ret ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED; +} + +static const struct pci_error_handlers mlx4_err_handler = { + .error_detected = mlx4_pci_err_detected, + .slot_reset = mlx4_pci_slot_reset, +}; + +static struct pci_driver mlx4_driver = { + .name = DRV_NAME, + .id_table = mlx4_pci_table, + .probe = mlx4_init_one, + .shutdown = mlx4_unload_one, + .remove = mlx4_remove_one, + .err_handler = &mlx4_err_handler, +}; + +static int __init mlx4_verify_params(void) +{ + if ((log_num_mac < 0) || (log_num_mac > 7)) { + pr_warn("mlx4_core: bad num_mac: %d\n", log_num_mac); + return -1; + } + + if (log_num_vlan != 0) + pr_warn("mlx4_core: log_num_vlan - obsolete module param, using %d\n", + MLX4_LOG_NUM_VLANS); + + if (use_prio != 0) + pr_warn("mlx4_core: use_prio - obsolete module param, ignored\n"); + + if ((log_mtts_per_seg < 1) || (log_mtts_per_seg > 7)) { + pr_warn("mlx4_core: bad log_mtts_per_seg: %d\n", + log_mtts_per_seg); + return -1; + } + + /* Check if module param for ports type has legal combination */ + if (port_type_array[0] == false && port_type_array[1] == true) { + pr_warn("Module parameter configuration ETH/IB is not supported. Switching to default configuration IB/IB\n"); + port_type_array[0] = true; + } + + if (mlx4_log_num_mgm_entry_size != -1 && + (mlx4_log_num_mgm_entry_size < MLX4_MIN_MGM_LOG_ENTRY_SIZE || + mlx4_log_num_mgm_entry_size > MLX4_MAX_MGM_LOG_ENTRY_SIZE)) { + pr_warn("mlx4_core: mlx4_log_num_mgm_entry_size (%d) not in legal range (-1 or %d..%d)\n", + mlx4_log_num_mgm_entry_size, + MLX4_MIN_MGM_LOG_ENTRY_SIZE, + MLX4_MAX_MGM_LOG_ENTRY_SIZE); + return -1; + } + + return 0; +} + +static int __init mlx4_init(void) +{ + int ret; + + if (mlx4_verify_params()) + return -EINVAL; + + mlx4_catas_init(); + + mlx4_wq = create_singlethread_workqueue("mlx4"); + if (!mlx4_wq) + return -ENOMEM; + + ret = pci_register_driver(&mlx4_driver); + if (ret < 0) + destroy_workqueue(mlx4_wq); + return ret < 0 ? ret : 0; +} + +static void __exit mlx4_cleanup(void) +{ + pci_unregister_driver(&mlx4_driver); + destroy_workqueue(mlx4_wq); +} + +module_init(mlx4_init); +module_exit(mlx4_cleanup); diff --git a/drivers/net/ethernet/mellanox/mlx4/mcg.c b/drivers/net/ethernet/mellanox/mlx4/mcg.c new file mode 100644 index 0000000..8728431 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4/mcg.c @@ -0,0 +1,1613 @@ +/* + * Copyright (c) 2006, 2007 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include + +#include +#include + +#include "mlx4.h" + +static const u8 zero_gid[16]; /* automatically initialized to 0 */ + +int mlx4_get_mgm_entry_size(struct mlx4_dev *dev) +{ + return 1 << dev->oper_log_mgm_entry_size; +} + +int mlx4_get_qp_per_mgm(struct mlx4_dev *dev) +{ + return 4 * (mlx4_get_mgm_entry_size(dev) / 16 - 2); +} + +static int mlx4_QP_FLOW_STEERING_ATTACH(struct mlx4_dev *dev, + struct mlx4_cmd_mailbox *mailbox, + u32 size, + u64 *reg_id) +{ + u64 imm; + int err = 0; + + err = mlx4_cmd_imm(dev, mailbox->dma, &imm, size, 0, + MLX4_QP_FLOW_STEERING_ATTACH, MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_NATIVE); + if (err) + return err; + *reg_id = imm; + + return err; +} + +static int mlx4_QP_FLOW_STEERING_DETACH(struct mlx4_dev *dev, u64 regid) +{ + int err = 0; + + err = mlx4_cmd(dev, regid, 0, 0, + MLX4_QP_FLOW_STEERING_DETACH, MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_NATIVE); + + return err; +} + +static int mlx4_READ_ENTRY(struct mlx4_dev *dev, int index, + struct mlx4_cmd_mailbox *mailbox) +{ + return mlx4_cmd_box(dev, 0, mailbox->dma, index, 0, MLX4_CMD_READ_MCG, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); +} + +static int mlx4_WRITE_ENTRY(struct mlx4_dev *dev, int index, + struct mlx4_cmd_mailbox *mailbox) +{ + return mlx4_cmd(dev, mailbox->dma, index, 0, MLX4_CMD_WRITE_MCG, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); +} + +static int mlx4_WRITE_PROMISC(struct mlx4_dev *dev, u8 port, u8 steer, + struct mlx4_cmd_mailbox *mailbox) +{ + u32 in_mod; + + in_mod = (u32) port << 16 | steer << 1; + return mlx4_cmd(dev, mailbox->dma, in_mod, 0x1, + MLX4_CMD_WRITE_MCG, MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_NATIVE); +} + +static int mlx4_GID_HASH(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox, + u16 *hash, u8 op_mod) +{ + u64 imm; + int err; + + err = mlx4_cmd_imm(dev, mailbox->dma, &imm, 0, op_mod, + MLX4_CMD_MGID_HASH, MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_NATIVE); + + if (!err) + *hash = imm; + + return err; +} + +static struct mlx4_promisc_qp *get_promisc_qp(struct mlx4_dev *dev, u8 port, + enum mlx4_steer_type steer, + u32 qpn) +{ + struct mlx4_steer *s_steer; + struct mlx4_promisc_qp *pqp; + + if (port < 1 || port > dev->caps.num_ports) + return NULL; + + s_steer = &mlx4_priv(dev)->steer[port - 1]; + + list_for_each_entry(pqp, &s_steer->promisc_qps[steer], list) { + if (pqp->qpn == qpn) + return pqp; + } + /* not found */ + return NULL; +} + +/* + * Add new entry to steering data structure. + * All promisc QPs should be added as well + */ +static int new_steering_entry(struct mlx4_dev *dev, u8 port, + enum mlx4_steer_type steer, + unsigned int index, u32 qpn) +{ + struct mlx4_steer *s_steer; + struct mlx4_cmd_mailbox *mailbox; + struct mlx4_mgm *mgm; + u32 members_count; + struct mlx4_steer_index *new_entry; + struct mlx4_promisc_qp *pqp; + struct mlx4_promisc_qp *dqp = NULL; + u32 prot; + int err; + + if (port < 1 || port > dev->caps.num_ports) + return -EINVAL; + + s_steer = &mlx4_priv(dev)->steer[port - 1]; + new_entry = kzalloc(sizeof *new_entry, GFP_KERNEL); + if (!new_entry) + return -ENOMEM; + + INIT_LIST_HEAD(&new_entry->duplicates); + new_entry->index = index; + list_add_tail(&new_entry->list, &s_steer->steer_entries[steer]); + + /* If the given qpn is also a promisc qp, + * it should be inserted to duplicates list + */ + pqp = get_promisc_qp(dev, port, steer, qpn); + if (pqp) { + dqp = kmalloc(sizeof *dqp, GFP_KERNEL); + if (!dqp) { + err = -ENOMEM; + goto out_alloc; + } + dqp->qpn = qpn; + list_add_tail(&dqp->list, &new_entry->duplicates); + } + + /* if no promisc qps for this vep, we are done */ + if (list_empty(&s_steer->promisc_qps[steer])) + return 0; + + /* now need to add all the promisc qps to the new + * steering entry, as they should also receive the packets + * destined to this address */ + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) { + err = -ENOMEM; + goto out_alloc; + } + mgm = mailbox->buf; + + err = mlx4_READ_ENTRY(dev, index, mailbox); + if (err) + goto out_mailbox; + + members_count = be32_to_cpu(mgm->members_count) & 0xffffff; + prot = be32_to_cpu(mgm->members_count) >> 30; + list_for_each_entry(pqp, &s_steer->promisc_qps[steer], list) { + /* don't add already existing qpn */ + if (pqp->qpn == qpn) + continue; + if (members_count == dev->caps.num_qp_per_mgm) { + /* out of space */ + err = -ENOMEM; + goto out_mailbox; + } + + /* add the qpn */ + mgm->qp[members_count++] = cpu_to_be32(pqp->qpn & MGM_QPN_MASK); + } + /* update the qps count and update the entry with all the promisc qps*/ + mgm->members_count = cpu_to_be32(members_count | (prot << 30)); + err = mlx4_WRITE_ENTRY(dev, index, mailbox); + +out_mailbox: + mlx4_free_cmd_mailbox(dev, mailbox); + if (!err) + return 0; +out_alloc: + if (dqp) { + list_del(&dqp->list); + kfree(dqp); + } + list_del(&new_entry->list); + kfree(new_entry); + return err; +} + +/* update the data structures with existing steering entry */ +static int existing_steering_entry(struct mlx4_dev *dev, u8 port, + enum mlx4_steer_type steer, + unsigned int index, u32 qpn) +{ + struct mlx4_steer *s_steer; + struct mlx4_steer_index *tmp_entry, *entry = NULL; + struct mlx4_promisc_qp *pqp; + struct mlx4_promisc_qp *dqp; + + if (port < 1 || port > dev->caps.num_ports) + return -EINVAL; + + s_steer = &mlx4_priv(dev)->steer[port - 1]; + + pqp = get_promisc_qp(dev, port, steer, qpn); + if (!pqp) + return 0; /* nothing to do */ + + list_for_each_entry(tmp_entry, &s_steer->steer_entries[steer], list) { + if (tmp_entry->index == index) { + entry = tmp_entry; + break; + } + } + if (unlikely(!entry)) { + mlx4_warn(dev, "Steering entry at index %x is not registered\n", index); + return -EINVAL; + } + + /* the given qpn is listed as a promisc qpn + * we need to add it as a duplicate to this entry + * for future references */ + list_for_each_entry(dqp, &entry->duplicates, list) { + if (qpn == dqp->qpn) + return 0; /* qp is already duplicated */ + } + + /* add the qp as a duplicate on this index */ + dqp = kmalloc(sizeof *dqp, GFP_KERNEL); + if (!dqp) + return -ENOMEM; + dqp->qpn = qpn; + list_add_tail(&dqp->list, &entry->duplicates); + + return 0; +} + +/* Check whether a qpn is a duplicate on steering entry + * If so, it should not be removed from mgm */ +static bool check_duplicate_entry(struct mlx4_dev *dev, u8 port, + enum mlx4_steer_type steer, + unsigned int index, u32 qpn) +{ + struct mlx4_steer *s_steer; + struct mlx4_steer_index *tmp_entry, *entry = NULL; + struct mlx4_promisc_qp *dqp, *tmp_dqp; + + if (port < 1 || port > dev->caps.num_ports) + return NULL; + + s_steer = &mlx4_priv(dev)->steer[port - 1]; + + /* if qp is not promisc, it cannot be duplicated */ + if (!get_promisc_qp(dev, port, steer, qpn)) + return false; + + /* The qp is promisc qp so it is a duplicate on this index + * Find the index entry, and remove the duplicate */ + list_for_each_entry(tmp_entry, &s_steer->steer_entries[steer], list) { + if (tmp_entry->index == index) { + entry = tmp_entry; + break; + } + } + if (unlikely(!entry)) { + mlx4_warn(dev, "Steering entry for index %x is not registered\n", index); + return false; + } + list_for_each_entry_safe(dqp, tmp_dqp, &entry->duplicates, list) { + if (dqp->qpn == qpn) { + list_del(&dqp->list); + kfree(dqp); + } + } + return true; +} + +/* Returns true if all the QPs != tqpn contained in this entry + * are Promisc QPs. Returns false otherwise. + */ +static bool promisc_steering_entry(struct mlx4_dev *dev, u8 port, + enum mlx4_steer_type steer, + unsigned int index, u32 tqpn, + u32 *members_count) +{ + struct mlx4_cmd_mailbox *mailbox; + struct mlx4_mgm *mgm; + u32 m_count; + bool ret = false; + int i; + + if (port < 1 || port > dev->caps.num_ports) + return false; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return false; + mgm = mailbox->buf; + + if (mlx4_READ_ENTRY(dev, index, mailbox)) + goto out; + m_count = be32_to_cpu(mgm->members_count) & 0xffffff; + if (members_count) + *members_count = m_count; + + for (i = 0; i < m_count; i++) { + u32 qpn = be32_to_cpu(mgm->qp[i]) & MGM_QPN_MASK; + if (!get_promisc_qp(dev, port, steer, qpn) && qpn != tqpn) { + /* the qp is not promisc, the entry can't be removed */ + goto out; + } + } + ret = true; +out: + mlx4_free_cmd_mailbox(dev, mailbox); + return ret; +} + +/* IF a steering entry contains only promisc QPs, it can be removed. */ +static bool can_remove_steering_entry(struct mlx4_dev *dev, u8 port, + enum mlx4_steer_type steer, + unsigned int index, u32 tqpn) +{ + struct mlx4_steer *s_steer; + struct mlx4_steer_index *entry = NULL, *tmp_entry; + u32 members_count; + bool ret = false; + + if (port < 1 || port > dev->caps.num_ports) + return NULL; + + s_steer = &mlx4_priv(dev)->steer[port - 1]; + + if (!promisc_steering_entry(dev, port, steer, index, + tqpn, &members_count)) + goto out; + + /* All the qps currently registered for this entry are promiscuous, + * Checking for duplicates */ + ret = true; + list_for_each_entry_safe(entry, tmp_entry, &s_steer->steer_entries[steer], list) { + if (entry->index == index) { + if (list_empty(&entry->duplicates) || + members_count == 1) { + struct mlx4_promisc_qp *pqp, *tmp_pqp; + /* If there is only 1 entry in duplicates then + * this is the QP we want to delete, going over + * the list and deleting the entry. + */ + list_del(&entry->list); + list_for_each_entry_safe(pqp, tmp_pqp, + &entry->duplicates, + list) { + list_del(&pqp->list); + kfree(pqp); + } + kfree(entry); + } else { + /* This entry contains duplicates so it shouldn't be removed */ + ret = false; + goto out; + } + } + } + +out: + return ret; +} + +static int add_promisc_qp(struct mlx4_dev *dev, u8 port, + enum mlx4_steer_type steer, u32 qpn) +{ + struct mlx4_steer *s_steer; + struct mlx4_cmd_mailbox *mailbox; + struct mlx4_mgm *mgm; + struct mlx4_steer_index *entry; + struct mlx4_promisc_qp *pqp; + struct mlx4_promisc_qp *dqp; + u32 members_count; + u32 prot; + int i; + bool found; + int err; + struct mlx4_priv *priv = mlx4_priv(dev); + + if (port < 1 || port > dev->caps.num_ports) + return -EINVAL; + + s_steer = &mlx4_priv(dev)->steer[port - 1]; + + mutex_lock(&priv->mcg_table.mutex); + + if (get_promisc_qp(dev, port, steer, qpn)) { + err = 0; /* Noting to do, already exists */ + goto out_mutex; + } + + pqp = kmalloc(sizeof *pqp, GFP_KERNEL); + if (!pqp) { + err = -ENOMEM; + goto out_mutex; + } + pqp->qpn = qpn; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) { + err = -ENOMEM; + goto out_alloc; + } + mgm = mailbox->buf; + + if (!(mlx4_is_mfunc(dev) && steer == MLX4_UC_STEER)) { + /* The promisc QP needs to be added for each one of the steering + * entries. If it already exists, needs to be added as + * a duplicate for this entry. + */ + list_for_each_entry(entry, + &s_steer->steer_entries[steer], + list) { + err = mlx4_READ_ENTRY(dev, entry->index, mailbox); + if (err) + goto out_mailbox; + + members_count = be32_to_cpu(mgm->members_count) & + 0xffffff; + prot = be32_to_cpu(mgm->members_count) >> 30; + found = false; + for (i = 0; i < members_count; i++) { + if ((be32_to_cpu(mgm->qp[i]) & + MGM_QPN_MASK) == qpn) { + /* Entry already exists. + * Add to duplicates. + */ + dqp = kmalloc(sizeof(*dqp), GFP_KERNEL); + if (!dqp) { + err = -ENOMEM; + goto out_mailbox; + } + dqp->qpn = qpn; + list_add_tail(&dqp->list, + &entry->duplicates); + found = true; + } + } + if (!found) { + /* Need to add the qpn to mgm */ + if (members_count == + dev->caps.num_qp_per_mgm) { + /* entry is full */ + err = -ENOMEM; + goto out_mailbox; + } + mgm->qp[members_count++] = + cpu_to_be32(qpn & MGM_QPN_MASK); + mgm->members_count = + cpu_to_be32(members_count | + (prot << 30)); + err = mlx4_WRITE_ENTRY(dev, entry->index, + mailbox); + if (err) + goto out_mailbox; + } + } + } + + /* add the new qpn to list of promisc qps */ + list_add_tail(&pqp->list, &s_steer->promisc_qps[steer]); + /* now need to add all the promisc qps to default entry */ + memset(mgm, 0, sizeof *mgm); + members_count = 0; + list_for_each_entry(dqp, &s_steer->promisc_qps[steer], list) { + if (members_count == dev->caps.num_qp_per_mgm) { + /* entry is full */ + err = -ENOMEM; + goto out_list; + } + mgm->qp[members_count++] = cpu_to_be32(dqp->qpn & MGM_QPN_MASK); + } + mgm->members_count = cpu_to_be32(members_count | MLX4_PROT_ETH << 30); + + err = mlx4_WRITE_PROMISC(dev, port, steer, mailbox); + if (err) + goto out_list; + + mlx4_free_cmd_mailbox(dev, mailbox); + mutex_unlock(&priv->mcg_table.mutex); + return 0; + +out_list: + list_del(&pqp->list); +out_mailbox: + mlx4_free_cmd_mailbox(dev, mailbox); +out_alloc: + kfree(pqp); +out_mutex: + mutex_unlock(&priv->mcg_table.mutex); + return err; +} + +static int remove_promisc_qp(struct mlx4_dev *dev, u8 port, + enum mlx4_steer_type steer, u32 qpn) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_steer *s_steer; + struct mlx4_cmd_mailbox *mailbox; + struct mlx4_mgm *mgm; + struct mlx4_steer_index *entry, *tmp_entry; + struct mlx4_promisc_qp *pqp; + struct mlx4_promisc_qp *dqp; + u32 members_count; + bool found; + bool back_to_list = false; + int i; + int err; + + if (port < 1 || port > dev->caps.num_ports) + return -EINVAL; + + s_steer = &mlx4_priv(dev)->steer[port - 1]; + mutex_lock(&priv->mcg_table.mutex); + + pqp = get_promisc_qp(dev, port, steer, qpn); + if (unlikely(!pqp)) { + mlx4_warn(dev, "QP %x is not promiscuous QP\n", qpn); + /* nothing to do */ + err = 0; + goto out_mutex; + } + + /*remove from list of promisc qps */ + list_del(&pqp->list); + + /* set the default entry not to include the removed one */ + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) { + err = -ENOMEM; + back_to_list = true; + goto out_list; + } + mgm = mailbox->buf; + members_count = 0; + list_for_each_entry(dqp, &s_steer->promisc_qps[steer], list) + mgm->qp[members_count++] = cpu_to_be32(dqp->qpn & MGM_QPN_MASK); + mgm->members_count = cpu_to_be32(members_count | MLX4_PROT_ETH << 30); + + err = mlx4_WRITE_PROMISC(dev, port, steer, mailbox); + if (err) + goto out_mailbox; + + if (!(mlx4_is_mfunc(dev) && steer == MLX4_UC_STEER)) { + /* Remove the QP from all the steering entries */ + list_for_each_entry_safe(entry, tmp_entry, + &s_steer->steer_entries[steer], + list) { + found = false; + list_for_each_entry(dqp, &entry->duplicates, list) { + if (dqp->qpn == qpn) { + found = true; + break; + } + } + if (found) { + /* A duplicate, no need to change the MGM, + * only update the duplicates list + */ + list_del(&dqp->list); + kfree(dqp); + } else { + int loc = -1; + + err = mlx4_READ_ENTRY(dev, + entry->index, + mailbox); + if (err) + goto out_mailbox; + members_count = + be32_to_cpu(mgm->members_count) & + 0xffffff; + if (!members_count) { + mlx4_warn(dev, "QP %06x wasn't found in entry %x mcount=0. deleting entry...\n", + qpn, entry->index); + list_del(&entry->list); + kfree(entry); + continue; + } + + for (i = 0; i < members_count; ++i) + if ((be32_to_cpu(mgm->qp[i]) & + MGM_QPN_MASK) == qpn) { + loc = i; + break; + } + + if (loc < 0) { + mlx4_err(dev, "QP %06x wasn't found in entry %d\n", + qpn, entry->index); + err = -EINVAL; + goto out_mailbox; + } + + /* Copy the last QP in this MGM + * over removed QP + */ + mgm->qp[loc] = mgm->qp[members_count - 1]; + mgm->qp[members_count - 1] = 0; + mgm->members_count = + cpu_to_be32(--members_count | + (MLX4_PROT_ETH << 30)); + + err = mlx4_WRITE_ENTRY(dev, + entry->index, + mailbox); + if (err) + goto out_mailbox; + } + } + } + +out_mailbox: + mlx4_free_cmd_mailbox(dev, mailbox); +out_list: + if (back_to_list) + list_add_tail(&pqp->list, &s_steer->promisc_qps[steer]); + else + kfree(pqp); +out_mutex: + mutex_unlock(&priv->mcg_table.mutex); + return err; +} + +/* + * Caller must hold MCG table semaphore. gid and mgm parameters must + * be properly aligned for command interface. + * + * Returns 0 unless a firmware command error occurs. + * + * If GID is found in MGM or MGM is empty, *index = *hash, *prev = -1 + * and *mgm holds MGM entry. + * + * if GID is found in AMGM, *index = index in AMGM, *prev = index of + * previous entry in hash chain and *mgm holds AMGM entry. + * + * If no AMGM exists for given gid, *index = -1, *prev = index of last + * entry in hash chain and *mgm holds end of hash chain. + */ +static int find_entry(struct mlx4_dev *dev, u8 port, + u8 *gid, enum mlx4_protocol prot, + struct mlx4_cmd_mailbox *mgm_mailbox, + int *prev, int *index) +{ + struct mlx4_cmd_mailbox *mailbox; + struct mlx4_mgm *mgm = mgm_mailbox->buf; + u8 *mgid; + int err; + u16 hash; + u8 op_mod = (prot == MLX4_PROT_ETH) ? + !!(dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_MC_STEER) : 0; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return -ENOMEM; + mgid = mailbox->buf; + + memcpy(mgid, gid, 16); + + err = mlx4_GID_HASH(dev, mailbox, &hash, op_mod); + mlx4_free_cmd_mailbox(dev, mailbox); + if (err) + return err; + + if (0) + mlx4_dbg(dev, "Hash for %pI6 is %04x\n", gid, hash); + + *index = hash; + *prev = -1; + + do { + err = mlx4_READ_ENTRY(dev, *index, mgm_mailbox); + if (err) + return err; + + if (!(be32_to_cpu(mgm->members_count) & 0xffffff)) { + if (*index != hash) { + mlx4_err(dev, "Found zero MGID in AMGM\n"); + err = -EINVAL; + } + return err; + } + + if (!memcmp(mgm->gid, gid, 16) && + be32_to_cpu(mgm->members_count) >> 30 == prot) + return err; + + *prev = *index; + *index = be32_to_cpu(mgm->next_gid_index) >> 6; + } while (*index); + + *index = -1; + return err; +} + +static const u8 __promisc_mode[] = { + [MLX4_FS_REGULAR] = 0x0, + [MLX4_FS_ALL_DEFAULT] = 0x1, + [MLX4_FS_MC_DEFAULT] = 0x3, + [MLX4_FS_UC_SNIFFER] = 0x4, + [MLX4_FS_MC_SNIFFER] = 0x5, +}; + +int mlx4_map_sw_to_hw_steering_mode(struct mlx4_dev *dev, + enum mlx4_net_trans_promisc_mode flow_type) +{ + if (flow_type >= MLX4_FS_MODE_NUM) { + mlx4_err(dev, "Invalid flow type. type = %d\n", flow_type); + return -EINVAL; + } + return __promisc_mode[flow_type]; +} +EXPORT_SYMBOL_GPL(mlx4_map_sw_to_hw_steering_mode); + +static void trans_rule_ctrl_to_hw(struct mlx4_net_trans_rule *ctrl, + struct mlx4_net_trans_rule_hw_ctrl *hw) +{ + u8 flags = 0; + + flags = ctrl->queue_mode == MLX4_NET_TRANS_Q_LIFO ? 1 : 0; + flags |= ctrl->exclusive ? (1 << 2) : 0; + flags |= ctrl->allow_loopback ? (1 << 3) : 0; + + hw->flags = flags; + hw->type = __promisc_mode[ctrl->promisc_mode]; + hw->prio = cpu_to_be16(ctrl->priority); + hw->port = ctrl->port; + hw->qpn = cpu_to_be32(ctrl->qpn); +} + +const u16 __sw_id_hw[] = { + [MLX4_NET_TRANS_RULE_ID_ETH] = 0xE001, + [MLX4_NET_TRANS_RULE_ID_IB] = 0xE005, + [MLX4_NET_TRANS_RULE_ID_IPV6] = 0xE003, + [MLX4_NET_TRANS_RULE_ID_IPV4] = 0xE002, + [MLX4_NET_TRANS_RULE_ID_TCP] = 0xE004, + [MLX4_NET_TRANS_RULE_ID_UDP] = 0xE006, + [MLX4_NET_TRANS_RULE_ID_VXLAN] = 0xE008 +}; + +int mlx4_map_sw_to_hw_steering_id(struct mlx4_dev *dev, + enum mlx4_net_trans_rule_id id) +{ + if (id >= MLX4_NET_TRANS_RULE_NUM) { + mlx4_err(dev, "Invalid network rule id. id = %d\n", id); + return -EINVAL; + } + return __sw_id_hw[id]; +} +EXPORT_SYMBOL_GPL(mlx4_map_sw_to_hw_steering_id); + +static const int __rule_hw_sz[] = { + [MLX4_NET_TRANS_RULE_ID_ETH] = + sizeof(struct mlx4_net_trans_rule_hw_eth), + [MLX4_NET_TRANS_RULE_ID_IB] = + sizeof(struct mlx4_net_trans_rule_hw_ib), + [MLX4_NET_TRANS_RULE_ID_IPV6] = 0, + [MLX4_NET_TRANS_RULE_ID_IPV4] = + sizeof(struct mlx4_net_trans_rule_hw_ipv4), + [MLX4_NET_TRANS_RULE_ID_TCP] = + sizeof(struct mlx4_net_trans_rule_hw_tcp_udp), + [MLX4_NET_TRANS_RULE_ID_UDP] = + sizeof(struct mlx4_net_trans_rule_hw_tcp_udp), + [MLX4_NET_TRANS_RULE_ID_VXLAN] = + sizeof(struct mlx4_net_trans_rule_hw_vxlan) +}; + +int mlx4_hw_rule_sz(struct mlx4_dev *dev, + enum mlx4_net_trans_rule_id id) +{ + if (id >= MLX4_NET_TRANS_RULE_NUM) { + mlx4_err(dev, "Invalid network rule id. id = %d\n", id); + return -EINVAL; + } + + return __rule_hw_sz[id]; +} +EXPORT_SYMBOL_GPL(mlx4_hw_rule_sz); + +static int parse_trans_rule(struct mlx4_dev *dev, struct mlx4_spec_list *spec, + struct _rule_hw *rule_hw) +{ + if (mlx4_hw_rule_sz(dev, spec->id) < 0) + return -EINVAL; + memset(rule_hw, 0, mlx4_hw_rule_sz(dev, spec->id)); + rule_hw->id = cpu_to_be16(__sw_id_hw[spec->id]); + rule_hw->size = mlx4_hw_rule_sz(dev, spec->id) >> 2; + + switch (spec->id) { + case MLX4_NET_TRANS_RULE_ID_ETH: + memcpy(rule_hw->eth.dst_mac, spec->eth.dst_mac, ETH_ALEN); + memcpy(rule_hw->eth.dst_mac_msk, spec->eth.dst_mac_msk, + ETH_ALEN); + memcpy(rule_hw->eth.src_mac, spec->eth.src_mac, ETH_ALEN); + memcpy(rule_hw->eth.src_mac_msk, spec->eth.src_mac_msk, + ETH_ALEN); + if (spec->eth.ether_type_enable) { + rule_hw->eth.ether_type_enable = 1; + rule_hw->eth.ether_type = spec->eth.ether_type; + } + rule_hw->eth.vlan_tag = spec->eth.vlan_id; + rule_hw->eth.vlan_tag_msk = spec->eth.vlan_id_msk; + break; + + case MLX4_NET_TRANS_RULE_ID_IB: + rule_hw->ib.l3_qpn = spec->ib.l3_qpn; + rule_hw->ib.qpn_mask = spec->ib.qpn_msk; + memcpy(&rule_hw->ib.dst_gid, &spec->ib.dst_gid, 16); + memcpy(&rule_hw->ib.dst_gid_msk, &spec->ib.dst_gid_msk, 16); + break; + + case MLX4_NET_TRANS_RULE_ID_IPV6: + return -EOPNOTSUPP; + + case MLX4_NET_TRANS_RULE_ID_IPV4: + rule_hw->ipv4.src_ip = spec->ipv4.src_ip; + rule_hw->ipv4.src_ip_msk = spec->ipv4.src_ip_msk; + rule_hw->ipv4.dst_ip = spec->ipv4.dst_ip; + rule_hw->ipv4.dst_ip_msk = spec->ipv4.dst_ip_msk; + break; + + case MLX4_NET_TRANS_RULE_ID_TCP: + case MLX4_NET_TRANS_RULE_ID_UDP: + rule_hw->tcp_udp.dst_port = spec->tcp_udp.dst_port; + rule_hw->tcp_udp.dst_port_msk = spec->tcp_udp.dst_port_msk; + rule_hw->tcp_udp.src_port = spec->tcp_udp.src_port; + rule_hw->tcp_udp.src_port_msk = spec->tcp_udp.src_port_msk; + break; + + case MLX4_NET_TRANS_RULE_ID_VXLAN: + rule_hw->vxlan.vni = + cpu_to_be32(be32_to_cpu(spec->vxlan.vni) << 8); + rule_hw->vxlan.vni_mask = + cpu_to_be32(be32_to_cpu(spec->vxlan.vni_mask) << 8); + break; + + default: + return -EINVAL; + } + + return __rule_hw_sz[spec->id]; +} + +static void mlx4_err_rule(struct mlx4_dev *dev, char *str, + struct mlx4_net_trans_rule *rule) +{ +#define BUF_SIZE 256 + struct mlx4_spec_list *cur; + char buf[BUF_SIZE]; + int len = 0; + + mlx4_err(dev, "%s", str); + len += snprintf(buf + len, BUF_SIZE - len, + "port = %d prio = 0x%x qp = 0x%x ", + rule->port, rule->priority, rule->qpn); + + list_for_each_entry(cur, &rule->list, list) { + switch (cur->id) { + case MLX4_NET_TRANS_RULE_ID_ETH: + len += snprintf(buf + len, BUF_SIZE - len, + "dmac = %pM ", &cur->eth.dst_mac); + if (cur->eth.ether_type) + len += snprintf(buf + len, BUF_SIZE - len, + "ethertype = 0x%x ", + be16_to_cpu(cur->eth.ether_type)); + if (cur->eth.vlan_id) + len += snprintf(buf + len, BUF_SIZE - len, + "vlan-id = %d ", + be16_to_cpu(cur->eth.vlan_id)); + break; + + case MLX4_NET_TRANS_RULE_ID_IPV4: + if (cur->ipv4.src_ip) + len += snprintf(buf + len, BUF_SIZE - len, + "src-ip = %pI4 ", + &cur->ipv4.src_ip); + if (cur->ipv4.dst_ip) + len += snprintf(buf + len, BUF_SIZE - len, + "dst-ip = %pI4 ", + &cur->ipv4.dst_ip); + break; + + case MLX4_NET_TRANS_RULE_ID_TCP: + case MLX4_NET_TRANS_RULE_ID_UDP: + if (cur->tcp_udp.src_port) + len += snprintf(buf + len, BUF_SIZE - len, + "src-port = %d ", + be16_to_cpu(cur->tcp_udp.src_port)); + if (cur->tcp_udp.dst_port) + len += snprintf(buf + len, BUF_SIZE - len, + "dst-port = %d ", + be16_to_cpu(cur->tcp_udp.dst_port)); + break; + + case MLX4_NET_TRANS_RULE_ID_IB: + len += snprintf(buf + len, BUF_SIZE - len, + "dst-gid = %pI6\n", cur->ib.dst_gid); + len += snprintf(buf + len, BUF_SIZE - len, + "dst-gid-mask = %pI6\n", + cur->ib.dst_gid_msk); + break; + + case MLX4_NET_TRANS_RULE_ID_VXLAN: + len += snprintf(buf + len, BUF_SIZE - len, + "VNID = %d ", be32_to_cpu(cur->vxlan.vni)); + break; + case MLX4_NET_TRANS_RULE_ID_IPV6: + break; + + default: + break; + } + } + len += snprintf(buf + len, BUF_SIZE - len, "\n"); + mlx4_err(dev, "%s", buf); + + if (len >= BUF_SIZE) + mlx4_err(dev, "Network rule error message was truncated, print buffer is too small\n"); +} + +int mlx4_flow_attach(struct mlx4_dev *dev, + struct mlx4_net_trans_rule *rule, u64 *reg_id) +{ + struct mlx4_cmd_mailbox *mailbox; + struct mlx4_spec_list *cur; + u32 size = 0; + int ret; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + + trans_rule_ctrl_to_hw(rule, mailbox->buf); + + size += sizeof(struct mlx4_net_trans_rule_hw_ctrl); + + list_for_each_entry(cur, &rule->list, list) { + ret = parse_trans_rule(dev, cur, mailbox->buf + size); + if (ret < 0) { + mlx4_free_cmd_mailbox(dev, mailbox); + return ret; + } + size += ret; + } + + ret = mlx4_QP_FLOW_STEERING_ATTACH(dev, mailbox, size >> 2, reg_id); + if (ret == -ENOMEM) + mlx4_err_rule(dev, + "mcg table is full. Fail to register network rule\n", + rule); + else if (ret) + mlx4_err_rule(dev, "Fail to register network rule\n", rule); + + mlx4_free_cmd_mailbox(dev, mailbox); + + return ret; +} +EXPORT_SYMBOL_GPL(mlx4_flow_attach); + +int mlx4_flow_detach(struct mlx4_dev *dev, u64 reg_id) +{ + int err; + + err = mlx4_QP_FLOW_STEERING_DETACH(dev, reg_id); + if (err) + mlx4_err(dev, "Fail to detach network rule. registration id = 0x%llx\n", + reg_id); + return err; +} +EXPORT_SYMBOL_GPL(mlx4_flow_detach); + +int mlx4_tunnel_steer_add(struct mlx4_dev *dev, unsigned char *addr, + int port, int qpn, u16 prio, u64 *reg_id) +{ + int err; + struct mlx4_spec_list spec_eth_outer = { {NULL} }; + struct mlx4_spec_list spec_vxlan = { {NULL} }; + struct mlx4_spec_list spec_eth_inner = { {NULL} }; + + struct mlx4_net_trans_rule rule = { + .queue_mode = MLX4_NET_TRANS_Q_FIFO, + .exclusive = 0, + .allow_loopback = 1, + .promisc_mode = MLX4_FS_REGULAR, + }; + + __be64 mac_mask = cpu_to_be64(MLX4_MAC_MASK << 16); + + rule.port = port; + rule.qpn = qpn; + rule.priority = prio; + INIT_LIST_HEAD(&rule.list); + + spec_eth_outer.id = MLX4_NET_TRANS_RULE_ID_ETH; + memcpy(spec_eth_outer.eth.dst_mac, addr, ETH_ALEN); + memcpy(spec_eth_outer.eth.dst_mac_msk, &mac_mask, ETH_ALEN); + + spec_vxlan.id = MLX4_NET_TRANS_RULE_ID_VXLAN; /* any vxlan header */ + spec_eth_inner.id = MLX4_NET_TRANS_RULE_ID_ETH; /* any inner eth header */ + + list_add_tail(&spec_eth_outer.list, &rule.list); + list_add_tail(&spec_vxlan.list, &rule.list); + list_add_tail(&spec_eth_inner.list, &rule.list); + + err = mlx4_flow_attach(dev, &rule, reg_id); + return err; +} +EXPORT_SYMBOL(mlx4_tunnel_steer_add); + +int mlx4_FLOW_STEERING_IB_UC_QP_RANGE(struct mlx4_dev *dev, u32 min_range_qpn, + u32 max_range_qpn) +{ + int err; + u64 in_param; + + in_param = ((u64) min_range_qpn) << 32; + in_param |= ((u64) max_range_qpn) & 0xFFFFFFFF; + + err = mlx4_cmd(dev, in_param, 0, 0, + MLX4_FLOW_STEERING_IB_UC_QP_RANGE, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); + + return err; +} +EXPORT_SYMBOL_GPL(mlx4_FLOW_STEERING_IB_UC_QP_RANGE); + +int mlx4_qp_attach_common(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16], + int block_mcast_loopback, enum mlx4_protocol prot, + enum mlx4_steer_type steer) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_cmd_mailbox *mailbox; + struct mlx4_mgm *mgm; + u32 members_count; + int index, prev; + int link = 0; + int i; + int err; + u8 port = gid[5]; + u8 new_entry = 0; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + mgm = mailbox->buf; + + mutex_lock(&priv->mcg_table.mutex); + err = find_entry(dev, port, gid, prot, + mailbox, &prev, &index); + if (err) + goto out; + + if (index != -1) { + if (!(be32_to_cpu(mgm->members_count) & 0xffffff)) { + new_entry = 1; + memcpy(mgm->gid, gid, 16); + } + } else { + link = 1; + + index = mlx4_bitmap_alloc(&priv->mcg_table.bitmap); + if (index == -1) { + mlx4_err(dev, "No AMGM entries left\n"); + err = -ENOMEM; + goto out; + } + index += dev->caps.num_mgms; + + new_entry = 1; + memset(mgm, 0, sizeof *mgm); + memcpy(mgm->gid, gid, 16); + } + + members_count = be32_to_cpu(mgm->members_count) & 0xffffff; + if (members_count == dev->caps.num_qp_per_mgm) { + mlx4_err(dev, "MGM at index %x is full\n", index); + err = -ENOMEM; + goto out; + } + + for (i = 0; i < members_count; ++i) + if ((be32_to_cpu(mgm->qp[i]) & MGM_QPN_MASK) == qp->qpn) { + mlx4_dbg(dev, "QP %06x already a member of MGM\n", qp->qpn); + err = 0; + goto out; + } + + if (block_mcast_loopback) + mgm->qp[members_count++] = cpu_to_be32((qp->qpn & MGM_QPN_MASK) | + (1U << MGM_BLCK_LB_BIT)); + else + mgm->qp[members_count++] = cpu_to_be32(qp->qpn & MGM_QPN_MASK); + + mgm->members_count = cpu_to_be32(members_count | (u32) prot << 30); + + err = mlx4_WRITE_ENTRY(dev, index, mailbox); + if (err) + goto out; + + if (!link) + goto out; + + err = mlx4_READ_ENTRY(dev, prev, mailbox); + if (err) + goto out; + + mgm->next_gid_index = cpu_to_be32(index << 6); + + err = mlx4_WRITE_ENTRY(dev, prev, mailbox); + if (err) + goto out; + +out: + if (prot == MLX4_PROT_ETH) { + /* manage the steering entry for promisc mode */ + if (new_entry) + new_steering_entry(dev, port, steer, index, qp->qpn); + else + existing_steering_entry(dev, port, steer, + index, qp->qpn); + } + if (err && link && index != -1) { + if (index < dev->caps.num_mgms) + mlx4_warn(dev, "Got AMGM index %d < %d\n", + index, dev->caps.num_mgms); + else + mlx4_bitmap_free(&priv->mcg_table.bitmap, + index - dev->caps.num_mgms, MLX4_USE_RR); + } + mutex_unlock(&priv->mcg_table.mutex); + + mlx4_free_cmd_mailbox(dev, mailbox); + return err; +} + +int mlx4_qp_detach_common(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16], + enum mlx4_protocol prot, enum mlx4_steer_type steer) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_cmd_mailbox *mailbox; + struct mlx4_mgm *mgm; + u32 members_count; + int prev, index; + int i, loc = -1; + int err; + u8 port = gid[5]; + bool removed_entry = false; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + mgm = mailbox->buf; + + mutex_lock(&priv->mcg_table.mutex); + + err = find_entry(dev, port, gid, prot, + mailbox, &prev, &index); + if (err) + goto out; + + if (index == -1) { + mlx4_err(dev, "MGID %pI6 not found\n", gid); + err = -EINVAL; + goto out; + } + + /* If this QP is also a promisc QP, it shouldn't be removed only if + * at least one none promisc QP is also attached to this MCG + */ + if (prot == MLX4_PROT_ETH && + check_duplicate_entry(dev, port, steer, index, qp->qpn) && + !promisc_steering_entry(dev, port, steer, index, qp->qpn, NULL)) + goto out; + + members_count = be32_to_cpu(mgm->members_count) & 0xffffff; + for (i = 0; i < members_count; ++i) + if ((be32_to_cpu(mgm->qp[i]) & MGM_QPN_MASK) == qp->qpn) { + loc = i; + break; + } + + if (loc == -1) { + mlx4_err(dev, "QP %06x not found in MGM\n", qp->qpn); + err = -EINVAL; + goto out; + } + + /* copy the last QP in this MGM over removed QP */ + mgm->qp[loc] = mgm->qp[members_count - 1]; + mgm->qp[members_count - 1] = 0; + mgm->members_count = cpu_to_be32(--members_count | (u32) prot << 30); + + if (prot == MLX4_PROT_ETH) + removed_entry = can_remove_steering_entry(dev, port, steer, + index, qp->qpn); + if (members_count && (prot != MLX4_PROT_ETH || !removed_entry)) { + err = mlx4_WRITE_ENTRY(dev, index, mailbox); + goto out; + } + + /* We are going to delete the entry, members count should be 0 */ + mgm->members_count = cpu_to_be32((u32) prot << 30); + + if (prev == -1) { + /* Remove entry from MGM */ + int amgm_index = be32_to_cpu(mgm->next_gid_index) >> 6; + if (amgm_index) { + err = mlx4_READ_ENTRY(dev, amgm_index, mailbox); + if (err) + goto out; + } else + memset(mgm->gid, 0, 16); + + err = mlx4_WRITE_ENTRY(dev, index, mailbox); + if (err) + goto out; + + if (amgm_index) { + if (amgm_index < dev->caps.num_mgms) + mlx4_warn(dev, "MGM entry %d had AMGM index %d < %d\n", + index, amgm_index, dev->caps.num_mgms); + else + mlx4_bitmap_free(&priv->mcg_table.bitmap, + amgm_index - dev->caps.num_mgms, MLX4_USE_RR); + } + } else { + /* Remove entry from AMGM */ + int cur_next_index = be32_to_cpu(mgm->next_gid_index) >> 6; + err = mlx4_READ_ENTRY(dev, prev, mailbox); + if (err) + goto out; + + mgm->next_gid_index = cpu_to_be32(cur_next_index << 6); + + err = mlx4_WRITE_ENTRY(dev, prev, mailbox); + if (err) + goto out; + + if (index < dev->caps.num_mgms) + mlx4_warn(dev, "entry %d had next AMGM index %d < %d\n", + prev, index, dev->caps.num_mgms); + else + mlx4_bitmap_free(&priv->mcg_table.bitmap, + index - dev->caps.num_mgms, MLX4_USE_RR); + } + +out: + mutex_unlock(&priv->mcg_table.mutex); + + mlx4_free_cmd_mailbox(dev, mailbox); + return err; +} + +static int mlx4_QP_ATTACH(struct mlx4_dev *dev, struct mlx4_qp *qp, + u8 gid[16], u8 attach, u8 block_loopback, + enum mlx4_protocol prot) +{ + struct mlx4_cmd_mailbox *mailbox; + int err = 0; + int qpn; + + if (!mlx4_is_mfunc(dev)) + return -EBADF; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + + memcpy(mailbox->buf, gid, 16); + qpn = qp->qpn; + qpn |= (prot << 28); + if (attach && block_loopback) + qpn |= (1 << 31); + + err = mlx4_cmd(dev, mailbox->dma, qpn, attach, + MLX4_CMD_QP_ATTACH, MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_WRAPPED); + + mlx4_free_cmd_mailbox(dev, mailbox); + return err; +} + +int mlx4_trans_to_dmfs_attach(struct mlx4_dev *dev, struct mlx4_qp *qp, + u8 gid[16], u8 port, + int block_mcast_loopback, + enum mlx4_protocol prot, u64 *reg_id) +{ + struct mlx4_spec_list spec = { {NULL} }; + __be64 mac_mask = cpu_to_be64(MLX4_MAC_MASK << 16); + + struct mlx4_net_trans_rule rule = { + .queue_mode = MLX4_NET_TRANS_Q_FIFO, + .exclusive = 0, + .promisc_mode = MLX4_FS_REGULAR, + .priority = MLX4_DOMAIN_NIC, + }; + + rule.allow_loopback = !block_mcast_loopback; + rule.port = port; + rule.qpn = qp->qpn; + INIT_LIST_HEAD(&rule.list); + + switch (prot) { + case MLX4_PROT_ETH: + spec.id = MLX4_NET_TRANS_RULE_ID_ETH; + memcpy(spec.eth.dst_mac, &gid[10], ETH_ALEN); + memcpy(spec.eth.dst_mac_msk, &mac_mask, ETH_ALEN); + break; + + case MLX4_PROT_IB_IPV6: + spec.id = MLX4_NET_TRANS_RULE_ID_IB; + memcpy(spec.ib.dst_gid, gid, 16); + memset(&spec.ib.dst_gid_msk, 0xff, 16); + break; + default: + return -EINVAL; + } + list_add_tail(&spec.list, &rule.list); + + return mlx4_flow_attach(dev, &rule, reg_id); +} + +int mlx4_multicast_attach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16], + u8 port, int block_mcast_loopback, + enum mlx4_protocol prot, u64 *reg_id) +{ + switch (dev->caps.steering_mode) { + case MLX4_STEERING_MODE_A0: + if (prot == MLX4_PROT_ETH) + return 0; + + case MLX4_STEERING_MODE_B0: + if (prot == MLX4_PROT_ETH) + gid[7] |= (MLX4_MC_STEER << 1); + + if (mlx4_is_mfunc(dev)) + return mlx4_QP_ATTACH(dev, qp, gid, 1, + block_mcast_loopback, prot); + return mlx4_qp_attach_common(dev, qp, gid, + block_mcast_loopback, prot, + MLX4_MC_STEER); + + case MLX4_STEERING_MODE_DEVICE_MANAGED: + return mlx4_trans_to_dmfs_attach(dev, qp, gid, port, + block_mcast_loopback, + prot, reg_id); + default: + return -EINVAL; + } +} +EXPORT_SYMBOL_GPL(mlx4_multicast_attach); + +int mlx4_multicast_detach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16], + enum mlx4_protocol prot, u64 reg_id) +{ + switch (dev->caps.steering_mode) { + case MLX4_STEERING_MODE_A0: + if (prot == MLX4_PROT_ETH) + return 0; + + case MLX4_STEERING_MODE_B0: + if (prot == MLX4_PROT_ETH) + gid[7] |= (MLX4_MC_STEER << 1); + + if (mlx4_is_mfunc(dev)) + return mlx4_QP_ATTACH(dev, qp, gid, 0, 0, prot); + + return mlx4_qp_detach_common(dev, qp, gid, prot, + MLX4_MC_STEER); + + case MLX4_STEERING_MODE_DEVICE_MANAGED: + return mlx4_flow_detach(dev, reg_id); + + default: + return -EINVAL; + } +} +EXPORT_SYMBOL_GPL(mlx4_multicast_detach); + +int mlx4_flow_steer_promisc_add(struct mlx4_dev *dev, u8 port, + u32 qpn, enum mlx4_net_trans_promisc_mode mode) +{ + struct mlx4_net_trans_rule rule; + u64 *regid_p; + + switch (mode) { + case MLX4_FS_ALL_DEFAULT: + regid_p = &dev->regid_promisc_array[port]; + break; + case MLX4_FS_MC_DEFAULT: + regid_p = &dev->regid_allmulti_array[port]; + break; + default: + return -1; + } + + if (*regid_p != 0) + return -1; + + rule.promisc_mode = mode; + rule.port = port; + rule.qpn = qpn; + INIT_LIST_HEAD(&rule.list); + mlx4_err(dev, "going promisc on %x\n", port); + + return mlx4_flow_attach(dev, &rule, regid_p); +} +EXPORT_SYMBOL_GPL(mlx4_flow_steer_promisc_add); + +int mlx4_flow_steer_promisc_remove(struct mlx4_dev *dev, u8 port, + enum mlx4_net_trans_promisc_mode mode) +{ + int ret; + u64 *regid_p; + + switch (mode) { + case MLX4_FS_ALL_DEFAULT: + regid_p = &dev->regid_promisc_array[port]; + break; + case MLX4_FS_MC_DEFAULT: + regid_p = &dev->regid_allmulti_array[port]; + break; + default: + return -1; + } + + if (*regid_p == 0) + return -1; + + ret = mlx4_flow_detach(dev, *regid_p); + if (ret == 0) + *regid_p = 0; + + return ret; +} +EXPORT_SYMBOL_GPL(mlx4_flow_steer_promisc_remove); + +int mlx4_unicast_attach(struct mlx4_dev *dev, + struct mlx4_qp *qp, u8 gid[16], + int block_mcast_loopback, enum mlx4_protocol prot) +{ + if (prot == MLX4_PROT_ETH) + gid[7] |= (MLX4_UC_STEER << 1); + + if (mlx4_is_mfunc(dev)) + return mlx4_QP_ATTACH(dev, qp, gid, 1, + block_mcast_loopback, prot); + + return mlx4_qp_attach_common(dev, qp, gid, block_mcast_loopback, + prot, MLX4_UC_STEER); +} +EXPORT_SYMBOL_GPL(mlx4_unicast_attach); + +int mlx4_unicast_detach(struct mlx4_dev *dev, struct mlx4_qp *qp, + u8 gid[16], enum mlx4_protocol prot) +{ + if (prot == MLX4_PROT_ETH) + gid[7] |= (MLX4_UC_STEER << 1); + + if (mlx4_is_mfunc(dev)) + return mlx4_QP_ATTACH(dev, qp, gid, 0, 0, prot); + + return mlx4_qp_detach_common(dev, qp, gid, prot, MLX4_UC_STEER); +} +EXPORT_SYMBOL_GPL(mlx4_unicast_detach); + +int mlx4_PROMISC_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + u32 qpn = (u32) vhcr->in_param & 0xffffffff; + int port = mlx4_slave_convert_port(dev, slave, vhcr->in_param >> 62); + enum mlx4_steer_type steer = vhcr->in_modifier; + + if (port < 0) + return -EINVAL; + + /* Promiscuous unicast is not allowed in mfunc */ + if (mlx4_is_mfunc(dev) && steer == MLX4_UC_STEER) + return 0; + + if (vhcr->op_modifier) + return add_promisc_qp(dev, port, steer, qpn); + else + return remove_promisc_qp(dev, port, steer, qpn); +} + +static int mlx4_PROMISC(struct mlx4_dev *dev, u32 qpn, + enum mlx4_steer_type steer, u8 add, u8 port) +{ + return mlx4_cmd(dev, (u64) qpn | (u64) port << 62, (u32) steer, add, + MLX4_CMD_PROMISC, MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_WRAPPED); +} + +int mlx4_multicast_promisc_add(struct mlx4_dev *dev, u32 qpn, u8 port) +{ + if (mlx4_is_mfunc(dev)) + return mlx4_PROMISC(dev, qpn, MLX4_MC_STEER, 1, port); + + return add_promisc_qp(dev, port, MLX4_MC_STEER, qpn); +} +EXPORT_SYMBOL_GPL(mlx4_multicast_promisc_add); + +int mlx4_multicast_promisc_remove(struct mlx4_dev *dev, u32 qpn, u8 port) +{ + if (mlx4_is_mfunc(dev)) + return mlx4_PROMISC(dev, qpn, MLX4_MC_STEER, 0, port); + + return remove_promisc_qp(dev, port, MLX4_MC_STEER, qpn); +} +EXPORT_SYMBOL_GPL(mlx4_multicast_promisc_remove); + +int mlx4_unicast_promisc_add(struct mlx4_dev *dev, u32 qpn, u8 port) +{ + if (mlx4_is_mfunc(dev)) + return mlx4_PROMISC(dev, qpn, MLX4_UC_STEER, 1, port); + + return add_promisc_qp(dev, port, MLX4_UC_STEER, qpn); +} +EXPORT_SYMBOL_GPL(mlx4_unicast_promisc_add); + +int mlx4_unicast_promisc_remove(struct mlx4_dev *dev, u32 qpn, u8 port) +{ + if (mlx4_is_mfunc(dev)) + return mlx4_PROMISC(dev, qpn, MLX4_UC_STEER, 0, port); + + return remove_promisc_qp(dev, port, MLX4_UC_STEER, qpn); +} +EXPORT_SYMBOL_GPL(mlx4_unicast_promisc_remove); + +int mlx4_init_mcg_table(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + int err; + + /* No need for mcg_table when fw managed the mcg table*/ + if (dev->caps.steering_mode == + MLX4_STEERING_MODE_DEVICE_MANAGED) + return 0; + err = mlx4_bitmap_init(&priv->mcg_table.bitmap, dev->caps.num_amgms, + dev->caps.num_amgms - 1, 0, 0); + if (err) + return err; + + mutex_init(&priv->mcg_table.mutex); + + return 0; +} + +void mlx4_cleanup_mcg_table(struct mlx4_dev *dev) +{ + if (dev->caps.steering_mode != + MLX4_STEERING_MODE_DEVICE_MANAGED) + mlx4_bitmap_cleanup(&mlx4_priv(dev)->mcg_table.bitmap); +} diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4.h b/drivers/net/ethernet/mellanox/mlx4/mlx4.h new file mode 100644 index 0000000..de10dbb --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4/mlx4.h @@ -0,0 +1,1316 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2005, 2006, 2007 Cisco Systems. All rights reserved. + * Copyright (c) 2005, 2006, 2007, 2008 Mellanox Technologies. All rights reserved. + * Copyright (c) 2004 Voltaire, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef MLX4_H +#define MLX4_H + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#define DRV_NAME "mlx4_core" +#define PFX DRV_NAME ": " +#define DRV_VERSION "2.2-1" +#define DRV_RELDATE "Feb, 2014" + +#define MLX4_FS_UDP_UC_EN (1 << 1) +#define MLX4_FS_TCP_UC_EN (1 << 2) +#define MLX4_FS_NUM_OF_L2_ADDR 8 +#define MLX4_FS_MGM_LOG_ENTRY_SIZE 7 +#define MLX4_FS_NUM_MCG (1 << 17) + +#define INIT_HCA_TPT_MW_ENABLE (1 << 7) + +struct mlx4_set_port_prio2tc_context { + u8 prio2tc[4]; +}; + +struct mlx4_port_scheduler_tc_cfg_be { + __be16 pg; + __be16 bw_precentage; + __be16 max_bw_units; /* 3-100Mbps, 4-1Gbps, other values - reserved */ + __be16 max_bw_value; +}; + +struct mlx4_set_port_scheduler_context { + struct mlx4_port_scheduler_tc_cfg_be tc[MLX4_NUM_TC]; +}; + +enum { + MLX4_HCR_BASE = 0x80680, + MLX4_HCR_SIZE = 0x0001c, + MLX4_CLR_INT_SIZE = 0x00008, + MLX4_SLAVE_COMM_BASE = 0x0, + MLX4_COMM_PAGESIZE = 0x1000, + MLX4_CLOCK_SIZE = 0x00008 +}; + +enum { + MLX4_DEFAULT_MGM_LOG_ENTRY_SIZE = 10, + MLX4_MIN_MGM_LOG_ENTRY_SIZE = 7, + MLX4_MAX_MGM_LOG_ENTRY_SIZE = 12, + MLX4_MAX_QP_PER_MGM = 4 * ((1 << MLX4_MAX_MGM_LOG_ENTRY_SIZE) / 16 - 2), + MLX4_MTT_ENTRY_PER_SEG = 8, +}; + +enum { + MLX4_NUM_PDS = 1 << 15 +}; + +enum { + MLX4_CMPT_TYPE_QP = 0, + MLX4_CMPT_TYPE_SRQ = 1, + MLX4_CMPT_TYPE_CQ = 2, + MLX4_CMPT_TYPE_EQ = 3, + MLX4_CMPT_NUM_TYPE +}; + +enum { + MLX4_CMPT_SHIFT = 24, + MLX4_NUM_CMPTS = MLX4_CMPT_NUM_TYPE << MLX4_CMPT_SHIFT +}; + +enum mlx4_mpt_state { + MLX4_MPT_DISABLED = 0, + MLX4_MPT_EN_HW, + MLX4_MPT_EN_SW +}; + +#define MLX4_COMM_TIME 10000 +enum { + MLX4_COMM_CMD_RESET, + MLX4_COMM_CMD_VHCR0, + MLX4_COMM_CMD_VHCR1, + MLX4_COMM_CMD_VHCR2, + MLX4_COMM_CMD_VHCR_EN, + MLX4_COMM_CMD_VHCR_POST, + MLX4_COMM_CMD_FLR = 254 +}; + +enum { + MLX4_VF_SMI_DISABLED, + MLX4_VF_SMI_ENABLED +}; + +/*The flag indicates that the slave should delay the RESET cmd*/ +#define MLX4_DELAY_RESET_SLAVE 0xbbbbbbb +/*indicates how many retries will be done if we are in the middle of FLR*/ +#define NUM_OF_RESET_RETRIES 10 +#define SLEEP_TIME_IN_RESET (2 * 1000) +enum mlx4_resource { + RES_QP, + RES_CQ, + RES_SRQ, + RES_XRCD, + RES_MPT, + RES_MTT, + RES_MAC, + RES_VLAN, + RES_EQ, + RES_COUNTER, + RES_FS_RULE, + MLX4_NUM_OF_RESOURCE_TYPE +}; + +enum mlx4_alloc_mode { + RES_OP_RESERVE, + RES_OP_RESERVE_AND_MAP, + RES_OP_MAP_ICM, +}; + +enum mlx4_res_tracker_free_type { + RES_TR_FREE_ALL, + RES_TR_FREE_SLAVES_ONLY, + RES_TR_FREE_STRUCTS_ONLY, +}; + +/* + *Virtual HCR structures. + * mlx4_vhcr is the sw representation, in machine endianess + * + * mlx4_vhcr_cmd is the formalized structure, the one that is passed + * to FW to go through communication channel. + * It is big endian, and has the same structure as the physical HCR + * used by command interface + */ +struct mlx4_vhcr { + u64 in_param; + u64 out_param; + u32 in_modifier; + u32 errno; + u16 op; + u16 token; + u8 op_modifier; + u8 e_bit; +}; + +struct mlx4_vhcr_cmd { + __be64 in_param; + __be32 in_modifier; + __be64 out_param; + __be16 token; + u16 reserved; + u8 status; + u8 flags; + __be16 opcode; +}; + +struct mlx4_cmd_info { + u16 opcode; + bool has_inbox; + bool has_outbox; + bool out_is_imm; + bool encode_slave_id; + int (*verify)(struct mlx4_dev *dev, int slave, struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox); + int (*wrapper)(struct mlx4_dev *dev, int slave, struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +}; + +#ifdef CONFIG_MLX4_DEBUG +extern int mlx4_debug_level; +#else /* CONFIG_MLX4_DEBUG */ +#define mlx4_debug_level (0) +#endif /* CONFIG_MLX4_DEBUG */ + +#define mlx4_dbg(mdev, format, ...) \ +do { \ + if (mlx4_debug_level) \ + dev_printk(KERN_DEBUG, &(mdev)->pdev->dev, format, \ + ##__VA_ARGS__); \ +} while (0) + +#define mlx4_err(mdev, format, ...) \ + dev_err(&(mdev)->pdev->dev, format, ##__VA_ARGS__) +#define mlx4_info(mdev, format, ...) \ + dev_info(&(mdev)->pdev->dev, format, ##__VA_ARGS__) +#define mlx4_warn(mdev, format, ...) \ + dev_warn(&(mdev)->pdev->dev, format, ##__VA_ARGS__) + +extern int mlx4_log_num_mgm_entry_size; +extern int log_mtts_per_seg; + +#define MLX4_MAX_NUM_SLAVES (MLX4_MAX_NUM_PF + MLX4_MAX_NUM_VF) +#define ALL_SLAVES 0xff + +struct mlx4_bitmap { + u32 last; + u32 top; + u32 max; + u32 reserved_top; + u32 mask; + u32 avail; + spinlock_t lock; + unsigned long *table; +}; + +struct mlx4_buddy { + unsigned long **bits; + unsigned int *num_free; + u32 max_order; + spinlock_t lock; +}; + +struct mlx4_icm; + +struct mlx4_icm_table { + u64 virt; + int num_icm; + u32 num_obj; + int obj_size; + int lowmem; + int coherent; + struct mutex mutex; + struct mlx4_icm **icm; +}; + +#define MLX4_MPT_FLAG_SW_OWNS (0xfUL << 28) +#define MLX4_MPT_FLAG_FREE (0x3UL << 28) +#define MLX4_MPT_FLAG_MIO (1 << 17) +#define MLX4_MPT_FLAG_BIND_ENABLE (1 << 15) +#define MLX4_MPT_FLAG_PHYSICAL (1 << 9) +#define MLX4_MPT_FLAG_REGION (1 << 8) + +#define MLX4_MPT_PD_MASK (0x1FFFFUL) +#define MLX4_MPT_PD_VF_MASK (0xFE0000UL) +#define MLX4_MPT_PD_FLAG_FAST_REG (1 << 27) +#define MLX4_MPT_PD_FLAG_RAE (1 << 28) +#define MLX4_MPT_PD_FLAG_EN_INV (3 << 24) + +#define MLX4_MPT_QP_FLAG_BOUND_QP (1 << 7) + +#define MLX4_MPT_STATUS_SW 0xF0 +#define MLX4_MPT_STATUS_HW 0x00 + +#define MLX4_CQE_SIZE_MASK_STRIDE 0x3 +#define MLX4_EQE_SIZE_MASK_STRIDE 0x30 + +/* + * Must be packed because mtt_seg is 64 bits but only aligned to 32 bits. + */ +struct mlx4_mpt_entry { + __be32 flags; + __be32 qpn; + __be32 key; + __be32 pd_flags; + __be64 start; + __be64 length; + __be32 lkey; + __be32 win_cnt; + u8 reserved1[3]; + u8 mtt_rep; + __be64 mtt_addr; + __be32 mtt_sz; + __be32 entity_size; + __be32 first_byte_offset; +} __packed; + +/* + * Must be packed because start is 64 bits but only aligned to 32 bits. + */ +struct mlx4_eq_context { + __be32 flags; + u16 reserved1[3]; + __be16 page_offset; + u8 log_eq_size; + u8 reserved2[4]; + u8 eq_period; + u8 reserved3; + u8 eq_max_count; + u8 reserved4[3]; + u8 intr; + u8 log_page_size; + u8 reserved5[2]; + u8 mtt_base_addr_h; + __be32 mtt_base_addr_l; + u32 reserved6[2]; + __be32 consumer_index; + __be32 producer_index; + u32 reserved7[4]; +}; + +struct mlx4_cq_context { + __be32 flags; + u16 reserved1[3]; + __be16 page_offset; + __be32 logsize_usrpage; + __be16 cq_period; + __be16 cq_max_count; + u8 reserved2[3]; + u8 comp_eqn; + u8 log_page_size; + u8 reserved3[2]; + u8 mtt_base_addr_h; + __be32 mtt_base_addr_l; + __be32 last_notified_index; + __be32 solicit_producer_index; + __be32 consumer_index; + __be32 producer_index; + u32 reserved4[2]; + __be64 db_rec_addr; +}; + +struct mlx4_srq_context { + __be32 state_logsize_srqn; + u8 logstride; + u8 reserved1; + __be16 xrcd; + __be32 pg_offset_cqn; + u32 reserved2; + u8 log_page_size; + u8 reserved3[2]; + u8 mtt_base_addr_h; + __be32 mtt_base_addr_l; + __be32 pd; + __be16 limit_watermark; + __be16 wqe_cnt; + u16 reserved4; + __be16 wqe_counter; + u32 reserved5; + __be64 db_rec_addr; +}; + +struct mlx4_eq { + struct mlx4_dev *dev; + void __iomem *doorbell; + int eqn; + u32 cons_index; + u16 irq; + u16 have_irq; + int nent; + struct mlx4_buf_list *page_list; + struct mlx4_mtt mtt; +}; + +struct mlx4_slave_eqe { + u8 type; + u8 port; + u32 param; +}; + +struct mlx4_slave_event_eq_info { + int eqn; + u16 token; +}; + +struct mlx4_profile { + int num_qp; + int rdmarc_per_qp; + int num_srq; + int num_cq; + int num_mcg; + int num_mpt; + unsigned num_mtt; +}; + +struct mlx4_fw { + u64 clr_int_base; + u64 catas_offset; + u64 comm_base; + u64 clock_offset; + struct mlx4_icm *fw_icm; + struct mlx4_icm *aux_icm; + u32 catas_size; + u16 fw_pages; + u8 clr_int_bar; + u8 catas_bar; + u8 comm_bar; + u8 clock_bar; +}; + +struct mlx4_comm { + u32 slave_write; + u32 slave_read; +}; + +enum { + MLX4_MCAST_CONFIG = 0, + MLX4_MCAST_DISABLE = 1, + MLX4_MCAST_ENABLE = 2, +}; + +#define VLAN_FLTR_SIZE 128 + +struct mlx4_vlan_fltr { + __be32 entry[VLAN_FLTR_SIZE]; +}; + +struct mlx4_mcast_entry { + struct list_head list; + u64 addr; +}; + +struct mlx4_promisc_qp { + struct list_head list; + u32 qpn; +}; + +struct mlx4_steer_index { + struct list_head list; + unsigned int index; + struct list_head duplicates; +}; + +#define MLX4_EVENT_TYPES_NUM 64 + +struct mlx4_slave_state { + u8 comm_toggle; + u8 last_cmd; + u8 init_port_mask; + bool active; + bool old_vlan_api; + u8 function; + dma_addr_t vhcr_dma; + u16 mtu[MLX4_MAX_PORTS + 1]; + __be32 ib_cap_mask[MLX4_MAX_PORTS + 1]; + struct mlx4_slave_eqe eq[MLX4_MFUNC_MAX_EQES]; + struct list_head mcast_filters[MLX4_MAX_PORTS + 1]; + struct mlx4_vlan_fltr *vlan_filter[MLX4_MAX_PORTS + 1]; + /* event type to eq number lookup */ + struct mlx4_slave_event_eq_info event_eq[MLX4_EVENT_TYPES_NUM]; + u16 eq_pi; + u16 eq_ci; + spinlock_t lock; + /*initialized via the kzalloc*/ + u8 is_slave_going_down; + u32 cookie; + enum slave_port_state port_state[MLX4_MAX_PORTS + 1]; +}; + +#define MLX4_VGT 4095 +#define NO_INDX (-1) + +struct mlx4_vport_state { + u64 mac; + u16 default_vlan; + u8 default_qos; + u32 tx_rate; + bool spoofchk; + u32 link_state; +}; + +struct mlx4_vf_admin_state { + struct mlx4_vport_state vport[MLX4_MAX_PORTS + 1]; + u8 enable_smi[MLX4_MAX_PORTS + 1]; +}; + +struct mlx4_vport_oper_state { + struct mlx4_vport_state state; + int mac_idx; + int vlan_idx; +}; + +struct mlx4_vf_oper_state { + struct mlx4_vport_oper_state vport[MLX4_MAX_PORTS + 1]; + u8 smi_enabled[MLX4_MAX_PORTS + 1]; +}; + +struct slave_list { + struct mutex mutex; + struct list_head res_list[MLX4_NUM_OF_RESOURCE_TYPE]; +}; + +struct resource_allocator { + spinlock_t alloc_lock; /* protect quotas */ + union { + int res_reserved; + int res_port_rsvd[MLX4_MAX_PORTS]; + }; + union { + int res_free; + int res_port_free[MLX4_MAX_PORTS]; + }; + int *quota; + int *allocated; + int *guaranteed; +}; + +struct mlx4_resource_tracker { + spinlock_t lock; + /* tree for each resources */ + struct rb_root res_tree[MLX4_NUM_OF_RESOURCE_TYPE]; + /* num_of_slave's lists, one per slave */ + struct slave_list *slave_list; + struct resource_allocator res_alloc[MLX4_NUM_OF_RESOURCE_TYPE]; +}; + +#define SLAVE_EVENT_EQ_SIZE 128 +struct mlx4_slave_event_eq { + u32 eqn; + u32 cons; + u32 prod; + spinlock_t event_lock; + struct mlx4_eqe event_eqe[SLAVE_EVENT_EQ_SIZE]; +}; + +struct mlx4_master_qp0_state { + int proxy_qp0_active; + int qp0_active; + int port_active; +}; + +struct mlx4_mfunc_master_ctx { + struct mlx4_slave_state *slave_state; + struct mlx4_vf_admin_state *vf_admin; + struct mlx4_vf_oper_state *vf_oper; + struct mlx4_master_qp0_state qp0_state[MLX4_MAX_PORTS + 1]; + int init_port_ref[MLX4_MAX_PORTS + 1]; + u16 max_mtu[MLX4_MAX_PORTS + 1]; + int disable_mcast_ref[MLX4_MAX_PORTS + 1]; + struct mlx4_resource_tracker res_tracker; + struct workqueue_struct *comm_wq; + struct work_struct comm_work; + struct work_struct slave_event_work; + struct work_struct slave_flr_event_work; + spinlock_t slave_state_lock; + __be32 comm_arm_bit_vector[4]; + struct mlx4_eqe cmd_eqe; + struct mlx4_slave_event_eq slave_eq; + struct mutex gen_eqe_mutex[MLX4_MFUNC_MAX]; +}; + +struct mlx4_mfunc { + struct mlx4_comm __iomem *comm; + struct mlx4_vhcr_cmd *vhcr; + dma_addr_t vhcr_dma; + + struct mlx4_mfunc_master_ctx master; +}; + +#define MGM_QPN_MASK 0x00FFFFFF +#define MGM_BLCK_LB_BIT 30 + +struct mlx4_mgm { + __be32 next_gid_index; + __be32 members_count; + u32 reserved[2]; + u8 gid[16]; + __be32 qp[MLX4_MAX_QP_PER_MGM]; +}; + +struct mlx4_cmd { + struct pci_pool *pool; + void __iomem *hcr; + struct mutex hcr_mutex; + struct mutex slave_cmd_mutex; + struct semaphore poll_sem; + struct semaphore event_sem; + int max_cmds; + spinlock_t context_lock; + int free_head; + struct mlx4_cmd_context *context; + u16 token_mask; + u8 use_events; + u8 toggle; + u8 comm_toggle; +}; + +enum { + MLX4_VF_IMMED_VLAN_FLAG_VLAN = 1 << 0, + MLX4_VF_IMMED_VLAN_FLAG_QOS = 1 << 1, + MLX4_VF_IMMED_VLAN_FLAG_LINK_DISABLE = 1 << 2, +}; +struct mlx4_vf_immed_vlan_work { + struct work_struct work; + struct mlx4_priv *priv; + int flags; + int slave; + int vlan_ix; + int orig_vlan_ix; + u8 port; + u8 qos; + u16 vlan_id; + u16 orig_vlan_id; +}; + + +struct mlx4_uar_table { + struct mlx4_bitmap bitmap; +}; + +struct mlx4_mr_table { + struct mlx4_bitmap mpt_bitmap; + struct mlx4_buddy mtt_buddy; + u64 mtt_base; + u64 mpt_base; + struct mlx4_icm_table mtt_table; + struct mlx4_icm_table dmpt_table; +}; + +struct mlx4_cq_table { + struct mlx4_bitmap bitmap; + spinlock_t lock; + struct radix_tree_root tree; + struct mlx4_icm_table table; + struct mlx4_icm_table cmpt_table; +}; + +struct mlx4_eq_table { + struct mlx4_bitmap bitmap; + char *irq_names; + void __iomem *clr_int; + void __iomem **uar_map; + u32 clr_mask; + struct mlx4_eq *eq; + struct mlx4_icm_table table; + struct mlx4_icm_table cmpt_table; + int have_irq; + u8 inta_pin; +}; + +struct mlx4_srq_table { + struct mlx4_bitmap bitmap; + spinlock_t lock; + struct radix_tree_root tree; + struct mlx4_icm_table table; + struct mlx4_icm_table cmpt_table; +}; + +struct mlx4_qp_table { + struct mlx4_bitmap bitmap; + u32 rdmarc_base; + int rdmarc_shift; + spinlock_t lock; + struct mlx4_icm_table qp_table; + struct mlx4_icm_table auxc_table; + struct mlx4_icm_table altc_table; + struct mlx4_icm_table rdmarc_table; + struct mlx4_icm_table cmpt_table; +}; + +struct mlx4_mcg_table { + struct mutex mutex; + struct mlx4_bitmap bitmap; + struct mlx4_icm_table table; +}; + +struct mlx4_catas_err { + u32 __iomem *map; + struct timer_list timer; + struct list_head list; +}; + +#define MLX4_MAX_MAC_NUM 128 +#define MLX4_MAC_TABLE_SIZE (MLX4_MAX_MAC_NUM << 3) + +struct mlx4_mac_table { + __be64 entries[MLX4_MAX_MAC_NUM]; + int refs[MLX4_MAX_MAC_NUM]; + struct mutex mutex; + int total; + int max; +}; + +#define MLX4_ROCE_GID_ENTRY_SIZE 16 + +struct mlx4_roce_gid_entry { + u8 raw[MLX4_ROCE_GID_ENTRY_SIZE]; +}; + +struct mlx4_roce_gid_table { + struct mlx4_roce_gid_entry roce_gids[MLX4_ROCE_MAX_GIDS]; + struct mutex mutex; +}; + +#define MLX4_MAX_VLAN_NUM 128 +#define MLX4_VLAN_TABLE_SIZE (MLX4_MAX_VLAN_NUM << 2) + +struct mlx4_vlan_table { + __be32 entries[MLX4_MAX_VLAN_NUM]; + int refs[MLX4_MAX_VLAN_NUM]; + struct mutex mutex; + int total; + int max; +}; + +#define SET_PORT_GEN_ALL_VALID 0x7 +#define SET_PORT_PROMISC_SHIFT 31 +#define SET_PORT_MC_PROMISC_SHIFT 30 + +enum { + MCAST_DIRECT_ONLY = 0, + MCAST_DIRECT = 1, + MCAST_DEFAULT = 2 +}; + + +struct mlx4_set_port_general_context { + u8 reserved[3]; + u8 flags; + u16 reserved2; + __be16 mtu; + u8 pptx; + u8 pfctx; + u16 reserved3; + u8 pprx; + u8 pfcrx; + u16 reserved4; +}; + +struct mlx4_set_port_rqp_calc_context { + __be32 base_qpn; + u8 rererved; + u8 n_mac; + u8 n_vlan; + u8 n_prio; + u8 reserved2[3]; + u8 mac_miss; + u8 intra_no_vlan; + u8 no_vlan; + u8 intra_vlan_miss; + u8 vlan_miss; + u8 reserved3[3]; + u8 no_vlan_prio; + __be32 promisc; + __be32 mcast; +}; + +struct mlx4_port_info { + struct mlx4_dev *dev; + int port; + char dev_name[16]; + struct device_attribute port_attr; + enum mlx4_port_type tmp_type; + char dev_mtu_name[16]; + struct device_attribute port_mtu_attr; + struct mlx4_mac_table mac_table; + struct mlx4_vlan_table vlan_table; + struct mlx4_roce_gid_table gid_table; + int base_qpn; +}; + +struct mlx4_sense { + struct mlx4_dev *dev; + u8 do_sense_port[MLX4_MAX_PORTS + 1]; + u8 sense_allowed[MLX4_MAX_PORTS + 1]; + struct delayed_work sense_poll; +}; + +struct mlx4_msix_ctl { + u64 pool_bm; + struct mutex pool_lock; +}; + +struct mlx4_steer { + struct list_head promisc_qps[MLX4_NUM_STEERS]; + struct list_head steer_entries[MLX4_NUM_STEERS]; +}; + +enum { + MLX4_PCI_DEV_IS_VF = 1 << 0, + MLX4_PCI_DEV_FORCE_SENSE_PORT = 1 << 1, +}; + +enum { + MLX4_NO_RR = 0, + MLX4_USE_RR = 1, +}; + +struct mlx4_priv { + struct mlx4_dev dev; + + struct list_head dev_list; + struct list_head ctx_list; + spinlock_t ctx_lock; + + int pci_dev_data; + int removed; + + struct list_head pgdir_list; + struct mutex pgdir_mutex; + + struct mlx4_fw fw; + struct mlx4_cmd cmd; + struct mlx4_mfunc mfunc; + + struct mlx4_bitmap pd_bitmap; + struct mlx4_bitmap xrcd_bitmap; + struct mlx4_uar_table uar_table; + struct mlx4_mr_table mr_table; + struct mlx4_cq_table cq_table; + struct mlx4_eq_table eq_table; + struct mlx4_srq_table srq_table; + struct mlx4_qp_table qp_table; + struct mlx4_mcg_table mcg_table; + struct mlx4_bitmap counters_bitmap; + + struct mlx4_catas_err catas_err; + + void __iomem *clr_base; + + struct mlx4_uar driver_uar; + void __iomem *kar; + struct mlx4_port_info port[MLX4_MAX_PORTS + 1]; + struct mlx4_sense sense; + struct mutex port_mutex; + struct mlx4_msix_ctl msix_ctl; + struct mlx4_steer *steer; + struct list_head bf_list; + struct mutex bf_mutex; + struct io_mapping *bf_mapping; + void __iomem *clock_mapping; + int reserved_mtts; + int fs_hash_mode; + u8 virt2phys_pkey[MLX4_MFUNC_MAX][MLX4_MAX_PORTS][MLX4_MAX_PORT_PKEYS]; + __be64 slave_node_guids[MLX4_MFUNC_MAX]; + + atomic_t opreq_count; + struct work_struct opreq_task; +}; + +static inline struct mlx4_priv *mlx4_priv(struct mlx4_dev *dev) +{ + return container_of(dev, struct mlx4_priv, dev); +} + +#define MLX4_SENSE_RANGE (HZ * 3) + +extern struct workqueue_struct *mlx4_wq; + +u32 mlx4_bitmap_alloc(struct mlx4_bitmap *bitmap); +void mlx4_bitmap_free(struct mlx4_bitmap *bitmap, u32 obj, int use_rr); +u32 mlx4_bitmap_alloc_range(struct mlx4_bitmap *bitmap, int cnt, int align); +void mlx4_bitmap_free_range(struct mlx4_bitmap *bitmap, u32 obj, int cnt, + int use_rr); +u32 mlx4_bitmap_avail(struct mlx4_bitmap *bitmap); +int mlx4_bitmap_init(struct mlx4_bitmap *bitmap, u32 num, u32 mask, + u32 reserved_bot, u32 resetrved_top); +void mlx4_bitmap_cleanup(struct mlx4_bitmap *bitmap); + +int mlx4_reset(struct mlx4_dev *dev); + +int mlx4_alloc_eq_table(struct mlx4_dev *dev); +void mlx4_free_eq_table(struct mlx4_dev *dev); + +int mlx4_init_pd_table(struct mlx4_dev *dev); +int mlx4_init_xrcd_table(struct mlx4_dev *dev); +int mlx4_init_uar_table(struct mlx4_dev *dev); +int mlx4_init_mr_table(struct mlx4_dev *dev); +int mlx4_init_eq_table(struct mlx4_dev *dev); +int mlx4_init_cq_table(struct mlx4_dev *dev); +int mlx4_init_qp_table(struct mlx4_dev *dev); +int mlx4_init_srq_table(struct mlx4_dev *dev); +int mlx4_init_mcg_table(struct mlx4_dev *dev); + +void mlx4_cleanup_pd_table(struct mlx4_dev *dev); +void mlx4_cleanup_xrcd_table(struct mlx4_dev *dev); +void mlx4_cleanup_uar_table(struct mlx4_dev *dev); +void mlx4_cleanup_mr_table(struct mlx4_dev *dev); +void mlx4_cleanup_eq_table(struct mlx4_dev *dev); +void mlx4_cleanup_cq_table(struct mlx4_dev *dev); +void mlx4_cleanup_qp_table(struct mlx4_dev *dev); +void mlx4_cleanup_srq_table(struct mlx4_dev *dev); +void mlx4_cleanup_mcg_table(struct mlx4_dev *dev); +int __mlx4_qp_alloc_icm(struct mlx4_dev *dev, int qpn, gfp_t gfp); +void __mlx4_qp_free_icm(struct mlx4_dev *dev, int qpn); +int __mlx4_cq_alloc_icm(struct mlx4_dev *dev, int *cqn); +void __mlx4_cq_free_icm(struct mlx4_dev *dev, int cqn); +int __mlx4_srq_alloc_icm(struct mlx4_dev *dev, int *srqn); +void __mlx4_srq_free_icm(struct mlx4_dev *dev, int srqn); +int __mlx4_mpt_reserve(struct mlx4_dev *dev); +void __mlx4_mpt_release(struct mlx4_dev *dev, u32 index); +int __mlx4_mpt_alloc_icm(struct mlx4_dev *dev, u32 index, gfp_t gfp); +void __mlx4_mpt_free_icm(struct mlx4_dev *dev, u32 index); +u32 __mlx4_alloc_mtt_range(struct mlx4_dev *dev, int order); +void __mlx4_free_mtt_range(struct mlx4_dev *dev, u32 first_seg, int order); + +int mlx4_WRITE_MTT_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_SYNC_TPT_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_SW2HW_MPT_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_HW2SW_MPT_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_QUERY_MPT_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_SW2HW_EQ_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_DMA_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int __mlx4_qp_reserve_range(struct mlx4_dev *dev, int cnt, int align, + int *base); +void __mlx4_qp_release_range(struct mlx4_dev *dev, int base_qpn, int cnt); +int __mlx4_register_mac(struct mlx4_dev *dev, u8 port, u64 mac); +void __mlx4_unregister_mac(struct mlx4_dev *dev, u8 port, u64 mac); +int __mlx4_write_mtt(struct mlx4_dev *dev, struct mlx4_mtt *mtt, + int start_index, int npages, u64 *page_list); +int __mlx4_counter_alloc(struct mlx4_dev *dev, u32 *idx); +void __mlx4_counter_free(struct mlx4_dev *dev, u32 idx); +int __mlx4_xrcd_alloc(struct mlx4_dev *dev, u32 *xrcdn); +void __mlx4_xrcd_free(struct mlx4_dev *dev, u32 xrcdn); + +void mlx4_start_catas_poll(struct mlx4_dev *dev); +void mlx4_stop_catas_poll(struct mlx4_dev *dev); +void mlx4_catas_init(void); +int mlx4_restart_one(struct pci_dev *pdev); +int mlx4_register_device(struct mlx4_dev *dev); +void mlx4_unregister_device(struct mlx4_dev *dev); +void mlx4_dispatch_event(struct mlx4_dev *dev, enum mlx4_dev_event type, + unsigned long param); + +struct mlx4_dev_cap; +struct mlx4_init_hca_param; + +u64 mlx4_make_profile(struct mlx4_dev *dev, + struct mlx4_profile *request, + struct mlx4_dev_cap *dev_cap, + struct mlx4_init_hca_param *init_hca); +void mlx4_master_comm_channel(struct work_struct *work); +void mlx4_gen_slave_eqe(struct work_struct *work); +void mlx4_master_handle_slave_flr(struct work_struct *work); + +int mlx4_ALLOC_RES_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_FREE_RES_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_MAP_EQ_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_COMM_INT_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_HW2SW_EQ_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_QUERY_EQ_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_SW2HW_CQ_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_HW2SW_CQ_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_QUERY_CQ_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_MODIFY_CQ_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_SW2HW_SRQ_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_HW2SW_SRQ_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_QUERY_SRQ_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_ARM_SRQ_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_GEN_QP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_RST2INIT_QP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_INIT2INIT_QP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_INIT2RTR_QP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_RTR2RTS_QP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_RTS2RTS_QP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_SQERR2RTS_QP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_2ERR_QP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_RTS2SQD_QP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_SQD2SQD_QP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_SQD2RTS_QP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_2RST_QP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_QUERY_QP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); + +int mlx4_GEN_EQE(struct mlx4_dev *dev, int slave, struct mlx4_eqe *eqe); + +int mlx4_cmd_init(struct mlx4_dev *dev); +void mlx4_cmd_cleanup(struct mlx4_dev *dev); +int mlx4_multi_func_init(struct mlx4_dev *dev); +void mlx4_multi_func_cleanup(struct mlx4_dev *dev); +void mlx4_cmd_event(struct mlx4_dev *dev, u16 token, u8 status, u64 out_param); +int mlx4_cmd_use_events(struct mlx4_dev *dev); +void mlx4_cmd_use_polling(struct mlx4_dev *dev); + +int mlx4_comm_cmd(struct mlx4_dev *dev, u8 cmd, u16 param, + unsigned long timeout); + +void mlx4_cq_completion(struct mlx4_dev *dev, u32 cqn); +void mlx4_cq_event(struct mlx4_dev *dev, u32 cqn, int event_type); + +void mlx4_qp_event(struct mlx4_dev *dev, u32 qpn, int event_type); + +void mlx4_srq_event(struct mlx4_dev *dev, u32 srqn, int event_type); + +void mlx4_handle_catas_err(struct mlx4_dev *dev); + +int mlx4_SENSE_PORT(struct mlx4_dev *dev, int port, + enum mlx4_port_type *type); +void mlx4_do_sense_ports(struct mlx4_dev *dev, + enum mlx4_port_type *stype, + enum mlx4_port_type *defaults); +void mlx4_start_sense(struct mlx4_dev *dev); +void mlx4_stop_sense(struct mlx4_dev *dev); +void mlx4_sense_init(struct mlx4_dev *dev); +int mlx4_check_port_params(struct mlx4_dev *dev, + enum mlx4_port_type *port_type); +int mlx4_change_port_types(struct mlx4_dev *dev, + enum mlx4_port_type *port_types); + +void mlx4_init_mac_table(struct mlx4_dev *dev, struct mlx4_mac_table *table); +void mlx4_init_vlan_table(struct mlx4_dev *dev, struct mlx4_vlan_table *table); +void mlx4_init_roce_gid_table(struct mlx4_dev *dev, + struct mlx4_roce_gid_table *table); +void __mlx4_unregister_vlan(struct mlx4_dev *dev, u8 port, u16 vlan); +int __mlx4_register_vlan(struct mlx4_dev *dev, u8 port, u16 vlan, int *index); + +int mlx4_SET_PORT(struct mlx4_dev *dev, u8 port, int pkey_tbl_sz); +/* resource tracker functions*/ +int mlx4_get_slave_from_resource_id(struct mlx4_dev *dev, + enum mlx4_resource resource_type, + u64 resource_id, int *slave); +void mlx4_delete_all_resources_for_slave(struct mlx4_dev *dev, int slave_id); +void mlx4_reset_roce_gids(struct mlx4_dev *dev, int slave); +int mlx4_init_resource_tracker(struct mlx4_dev *dev); + +void mlx4_free_resource_tracker(struct mlx4_dev *dev, + enum mlx4_res_tracker_free_type type); + +int mlx4_QUERY_FW_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_SET_PORT_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_INIT_PORT_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_CLOSE_PORT_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_QUERY_DEV_CAP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_QUERY_PORT_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_get_port_ib_caps(struct mlx4_dev *dev, u8 port, __be32 *caps); + +int mlx4_get_slave_pkey_gid_tbl_len(struct mlx4_dev *dev, u8 port, + int *gid_tbl_len, int *pkey_tbl_len); + +int mlx4_QP_ATTACH_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); + +int mlx4_UPDATE_QP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); + +int mlx4_PROMISC_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_qp_detach_common(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16], + enum mlx4_protocol prot, enum mlx4_steer_type steer); +int mlx4_qp_attach_common(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16], + int block_mcast_loopback, enum mlx4_protocol prot, + enum mlx4_steer_type steer); +int mlx4_trans_to_dmfs_attach(struct mlx4_dev *dev, struct mlx4_qp *qp, + u8 gid[16], u8 port, + int block_mcast_loopback, + enum mlx4_protocol prot, u64 *reg_id); +int mlx4_SET_MCAST_FLTR_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_SET_VLAN_FLTR_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_common_set_vlan_fltr(struct mlx4_dev *dev, int function, + int port, void *buf); +int mlx4_common_dump_eth_stats(struct mlx4_dev *dev, int slave, u32 in_mod, + struct mlx4_cmd_mailbox *outbox); +int mlx4_DUMP_ETH_STATS_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_PKEY_TABLE_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_QUERY_IF_STAT_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_QP_FLOW_STEERING_ATTACH_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_QP_FLOW_STEERING_DETACH_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); + +int mlx4_get_mgm_entry_size(struct mlx4_dev *dev); +int mlx4_get_qp_per_mgm(struct mlx4_dev *dev); + +static inline void set_param_l(u64 *arg, u32 val) +{ + *arg = (*arg & 0xffffffff00000000ULL) | (u64) val; +} + +static inline void set_param_h(u64 *arg, u32 val) +{ + *arg = (*arg & 0xffffffff) | ((u64) val << 32); +} + +static inline u32 get_param_l(u64 *arg) +{ + return (u32) (*arg & 0xffffffff); +} + +static inline u32 get_param_h(u64 *arg) +{ + return (u32)(*arg >> 32); +} + +static inline spinlock_t *mlx4_tlock(struct mlx4_dev *dev) +{ + return &mlx4_priv(dev)->mfunc.master.res_tracker.lock; +} + +#define NOT_MASKED_PD_BITS 17 + +void mlx4_vf_immed_vlan_work_handler(struct work_struct *_work); + +void mlx4_init_quotas(struct mlx4_dev *dev); + +int mlx4_get_slave_num_gids(struct mlx4_dev *dev, int slave, int port); +/* Returns the VF index of slave */ +int mlx4_get_vf_indx(struct mlx4_dev *dev, int slave); +int mlx4_config_mad_demux(struct mlx4_dev *dev); + +#endif /* MLX4_H */ diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h new file mode 100644 index 0000000..8fef658 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h @@ -0,0 +1,881 @@ +/* + * Copyright (c) 2007 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef _MLX4_EN_H_ +#define _MLX4_EN_H_ + +#include +#include +#include +#include +#include +#include +#include +#ifdef CONFIG_MLX4_EN_DCB +#include +#endif +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "en_port.h" + +#define DRV_NAME "mlx4_en" +#define DRV_VERSION "2.2-1" +#define DRV_RELDATE "Feb 2014" + +#define MLX4_EN_MSG_LEVEL (NETIF_MSG_LINK | NETIF_MSG_IFDOWN) + +/* + * Device constants + */ + + +#define MLX4_EN_PAGE_SHIFT 12 +#define MLX4_EN_PAGE_SIZE (1 << MLX4_EN_PAGE_SHIFT) +#define DEF_RX_RINGS 16 +#define MAX_RX_RINGS 128 +#define MIN_RX_RINGS 4 +#define TXBB_SIZE 64 +#define HEADROOM (2048 / TXBB_SIZE + 1) +#define STAMP_STRIDE 64 +#define STAMP_DWORDS (STAMP_STRIDE / 4) +#define STAMP_SHIFT 31 +#define STAMP_VAL 0x7fffffff +#define STATS_DELAY (HZ / 4) +#define SERVICE_TASK_DELAY (HZ / 4) +#define MAX_NUM_OF_FS_RULES 256 + +#define MLX4_EN_FILTER_HASH_SHIFT 4 +#define MLX4_EN_FILTER_EXPIRY_QUOTA 60 + +/* Typical TSO descriptor with 16 gather entries is 352 bytes... */ +#define MAX_DESC_SIZE 512 +#define MAX_DESC_TXBBS (MAX_DESC_SIZE / TXBB_SIZE) + +/* + * OS related constants and tunables + */ + +#define MLX4_EN_PRIV_FLAGS_BLUEFLAME 1 + +#define MLX4_EN_WATCHDOG_TIMEOUT (15 * HZ) + +/* Use the maximum between 16384 and a single page */ +#define MLX4_EN_ALLOC_SIZE PAGE_ALIGN(16384) + +#define MLX4_EN_ALLOC_PREFER_ORDER PAGE_ALLOC_COSTLY_ORDER + +/* Receive fragment sizes; we use at most 3 fragments (for 9600 byte MTU + * and 4K allocations) */ +enum { + FRAG_SZ0 = 1536 - NET_IP_ALIGN, + FRAG_SZ1 = 4096, + FRAG_SZ2 = 4096, + FRAG_SZ3 = MLX4_EN_ALLOC_SIZE +}; +#define MLX4_EN_MAX_RX_FRAGS 4 + +/* Maximum ring sizes */ +#define MLX4_EN_MAX_TX_SIZE 8192 +#define MLX4_EN_MAX_RX_SIZE 8192 + +/* Minimum ring size for our page-allocation scheme to work */ +#define MLX4_EN_MIN_RX_SIZE (MLX4_EN_ALLOC_SIZE / SMP_CACHE_BYTES) +#define MLX4_EN_MIN_TX_SIZE (4096 / TXBB_SIZE) + +#define MLX4_EN_SMALL_PKT_SIZE 64 +#define MLX4_EN_MIN_TX_RING_P_UP 1 +#define MLX4_EN_MAX_TX_RING_P_UP 32 +#define MLX4_EN_NUM_UP 8 +#define MLX4_EN_DEF_TX_RING_SIZE 512 +#define MLX4_EN_DEF_RX_RING_SIZE 1024 +#define MAX_TX_RINGS (MLX4_EN_MAX_TX_RING_P_UP * \ + MLX4_EN_NUM_UP) + +#define MLX4_EN_DEFAULT_TX_WORK 256 + +/* Target number of packets to coalesce with interrupt moderation */ +#define MLX4_EN_RX_COAL_TARGET 44 +#define MLX4_EN_RX_COAL_TIME 0x10 + +#define MLX4_EN_TX_COAL_PKTS 16 +#define MLX4_EN_TX_COAL_TIME 0x10 + +#define MLX4_EN_RX_RATE_LOW 400000 +#define MLX4_EN_RX_COAL_TIME_LOW 0 +#define MLX4_EN_RX_RATE_HIGH 450000 +#define MLX4_EN_RX_COAL_TIME_HIGH 128 +#define MLX4_EN_RX_SIZE_THRESH 1024 +#define MLX4_EN_RX_RATE_THRESH (1000000 / MLX4_EN_RX_COAL_TIME_HIGH) +#define MLX4_EN_SAMPLE_INTERVAL 0 +#define MLX4_EN_AVG_PKT_SMALL 256 + +#define MLX4_EN_AUTO_CONF 0xffff + +#define MLX4_EN_DEF_RX_PAUSE 1 +#define MLX4_EN_DEF_TX_PAUSE 1 + +/* Interval between successive polls in the Tx routine when polling is used + instead of interrupts (in per-core Tx rings) - should be power of 2 */ +#define MLX4_EN_TX_POLL_MODER 16 +#define MLX4_EN_TX_POLL_TIMEOUT (HZ / 4) + +#define SMALL_PACKET_SIZE (256 - NET_IP_ALIGN) +#define HEADER_COPY_SIZE (128 - NET_IP_ALIGN) +#define MLX4_LOOPBACK_TEST_PAYLOAD (HEADER_COPY_SIZE - ETH_HLEN) + +#define MLX4_EN_MIN_MTU 46 +#define ETH_BCAST 0xffffffffffffULL + +#define MLX4_EN_LOOPBACK_RETRIES 5 +#define MLX4_EN_LOOPBACK_TIMEOUT 100 + +#ifdef MLX4_EN_PERF_STAT +/* Number of samples to 'average' */ +#define AVG_SIZE 128 +#define AVG_FACTOR 1024 +#define NUM_PERF_STATS NUM_PERF_COUNTERS + +#define INC_PERF_COUNTER(cnt) (++(cnt)) +#define ADD_PERF_COUNTER(cnt, add) ((cnt) += (add)) +#define AVG_PERF_COUNTER(cnt, sample) \ + ((cnt) = ((cnt) * (AVG_SIZE - 1) + (sample) * AVG_FACTOR) / AVG_SIZE) +#define GET_PERF_COUNTER(cnt) (cnt) +#define GET_AVG_PERF_COUNTER(cnt) ((cnt) / AVG_FACTOR) + +#else + +#define NUM_PERF_STATS 0 +#define INC_PERF_COUNTER(cnt) do {} while (0) +#define ADD_PERF_COUNTER(cnt, add) do {} while (0) +#define AVG_PERF_COUNTER(cnt, sample) do {} while (0) +#define GET_PERF_COUNTER(cnt) (0) +#define GET_AVG_PERF_COUNTER(cnt) (0) +#endif /* MLX4_EN_PERF_STAT */ + +/* Constants for TX flow */ +enum { + MAX_INLINE = 104, /* 128 - 16 - 4 - 4 */ + MAX_BF = 256, + MIN_PKT_LEN = 17, +}; + +/* + * Configurables + */ + +enum cq_type { + RX = 0, + TX = 1, +}; + + +/* + * Useful macros + */ +#define ROUNDUP_LOG2(x) ilog2(roundup_pow_of_two(x)) +#define XNOR(x, y) (!(x) == !(y)) + + +struct mlx4_en_tx_info { + struct sk_buff *skb; + dma_addr_t map0_dma; + u32 map0_byte_count; + u32 nr_txbb; + u32 nr_bytes; + u8 linear; + u8 data_offset; + u8 inl; + u8 ts_requested; + u8 nr_maps; +} ____cacheline_aligned_in_smp; + + +#define MLX4_EN_BIT_DESC_OWN 0x80000000 +#define CTRL_SIZE sizeof(struct mlx4_wqe_ctrl_seg) +#define MLX4_EN_MEMTYPE_PAD 0x100 +#define DS_SIZE sizeof(struct mlx4_wqe_data_seg) + + +struct mlx4_en_tx_desc { + struct mlx4_wqe_ctrl_seg ctrl; + union { + struct mlx4_wqe_data_seg data; /* at least one data segment */ + struct mlx4_wqe_lso_seg lso; + struct mlx4_wqe_inline_seg inl; + }; +}; + +#define MLX4_EN_USE_SRQ 0x01000000 + +#define MLX4_EN_CX3_LOW_ID 0x1000 +#define MLX4_EN_CX3_HIGH_ID 0x1005 + +struct mlx4_en_rx_alloc { + struct page *page; + dma_addr_t dma; + u32 page_offset; + u32 page_size; +}; + +struct mlx4_en_tx_ring { + /* cache line used and dirtied in tx completion + * (mlx4_en_free_tx_buf()) + */ + u32 last_nr_txbb; + u32 cons; + unsigned long wake_queue; + + /* cache line used and dirtied in mlx4_en_xmit() */ + u32 prod ____cacheline_aligned_in_smp; + unsigned long bytes; + unsigned long packets; + unsigned long tx_csum; + unsigned long tso_packets; + unsigned long xmit_more; + struct mlx4_bf bf; + unsigned long queue_stopped; + + /* Following part should be mostly read */ + cpumask_t affinity_mask; + struct mlx4_qp qp; + struct mlx4_hwq_resources wqres; + u32 size; /* number of TXBBs */ + u32 size_mask; + u16 stride; + u16 cqn; /* index of port CQ associated with this ring */ + u32 buf_size; + __be32 doorbell_qpn; + __be32 mr_key; + void *buf; + struct mlx4_en_tx_info *tx_info; + u8 *bounce_buf; + struct mlx4_qp_context context; + int qpn; + enum mlx4_qp_state qp_state; + u8 queue_index; + bool bf_enabled; + bool bf_alloced; + struct netdev_queue *tx_queue; + int hwtstamp_tx_type; +} ____cacheline_aligned_in_smp; + +struct mlx4_en_rx_desc { + /* actual number of entries depends on rx ring stride */ + struct mlx4_wqe_data_seg data[0]; +}; + +struct mlx4_en_rx_ring { + struct mlx4_hwq_resources wqres; + struct mlx4_en_rx_alloc page_alloc[MLX4_EN_MAX_RX_FRAGS]; + u32 size ; /* number of Rx descs*/ + u32 actual_size; + u32 size_mask; + u16 stride; + u16 log_stride; + u16 cqn; /* index of port CQ associated with this ring */ + u32 prod; + u32 cons; + u32 buf_size; + u8 fcs_del; + void *buf; + void *rx_info; + unsigned long bytes; + unsigned long packets; +#ifdef CONFIG_NET_RX_BUSY_POLL + unsigned long yields; + unsigned long misses; + unsigned long cleaned; +#endif + unsigned long csum_ok; + unsigned long csum_none; + int hwtstamp_rx_filter; + cpumask_var_t affinity_mask; +}; + +struct mlx4_en_cq { + struct mlx4_cq mcq; + struct mlx4_hwq_resources wqres; + int ring; + struct net_device *dev; + struct napi_struct napi; + int size; + int buf_size; + unsigned vector; + enum cq_type is_tx; + u16 moder_time; + u16 moder_cnt; + struct mlx4_cqe *buf; +#define MLX4_EN_OPCODE_ERROR 0x1e + +#ifdef CONFIG_NET_RX_BUSY_POLL + unsigned int state; +#define MLX4_EN_CQ_STATE_IDLE 0 +#define MLX4_EN_CQ_STATE_NAPI 1 /* NAPI owns this CQ */ +#define MLX4_EN_CQ_STATE_POLL 2 /* poll owns this CQ */ +#define MLX4_CQ_LOCKED (MLX4_EN_CQ_STATE_NAPI | MLX4_EN_CQ_STATE_POLL) +#define MLX4_EN_CQ_STATE_NAPI_YIELD 4 /* NAPI yielded this CQ */ +#define MLX4_EN_CQ_STATE_POLL_YIELD 8 /* poll yielded this CQ */ +#define CQ_YIELD (MLX4_EN_CQ_STATE_NAPI_YIELD | MLX4_EN_CQ_STATE_POLL_YIELD) +#define CQ_USER_PEND (MLX4_EN_CQ_STATE_POLL | MLX4_EN_CQ_STATE_POLL_YIELD) + spinlock_t poll_lock; /* protects from LLS/napi conflicts */ +#endif /* CONFIG_NET_RX_BUSY_POLL */ + struct irq_desc *irq_desc; +}; + +struct mlx4_en_port_profile { + u32 flags; + u32 tx_ring_num; + u32 rx_ring_num; + u32 tx_ring_size; + u32 rx_ring_size; + u8 rx_pause; + u8 rx_ppp; + u8 tx_pause; + u8 tx_ppp; + int rss_rings; + int inline_thold; +}; + +struct mlx4_en_profile { + int rss_xor; + int udp_rss; + u8 rss_mask; + u32 active_ports; + u32 small_pkt_int; + u8 no_reset; + u8 num_tx_rings_p_up; + struct mlx4_en_port_profile prof[MLX4_MAX_PORTS + 1]; +}; + +struct mlx4_en_dev { + struct mlx4_dev *dev; + struct pci_dev *pdev; + struct mutex state_lock; + struct net_device *pndev[MLX4_MAX_PORTS + 1]; + u32 port_cnt; + bool device_up; + struct mlx4_en_profile profile; + u32 LSO_support; + struct workqueue_struct *workqueue; + struct device *dma_device; + void __iomem *uar_map; + struct mlx4_uar priv_uar; + struct mlx4_mr mr; + u32 priv_pdn; + spinlock_t uar_lock; + u8 mac_removed[MLX4_MAX_PORTS + 1]; + rwlock_t clock_lock; + u32 nominal_c_mult; + struct cyclecounter cycles; + struct timecounter clock; + unsigned long last_overflow_check; + unsigned long overflow_period; + struct ptp_clock *ptp_clock; + struct ptp_clock_info ptp_clock_info; +}; + + +struct mlx4_en_rss_map { + int base_qpn; + struct mlx4_qp qps[MAX_RX_RINGS]; + enum mlx4_qp_state state[MAX_RX_RINGS]; + struct mlx4_qp indir_qp; + enum mlx4_qp_state indir_state; +}; + +struct mlx4_en_port_state { + int link_state; + int link_speed; + int transciver; +}; + +struct mlx4_en_pkt_stats { + unsigned long broadcast; + unsigned long rx_prio[8]; + unsigned long tx_prio[8]; +#define NUM_PKT_STATS 17 +}; + +struct mlx4_en_port_stats { + unsigned long tso_packets; + unsigned long xmit_more; + unsigned long queue_stopped; + unsigned long wake_queue; + unsigned long tx_timeout; + unsigned long rx_alloc_failed; + unsigned long rx_chksum_good; + unsigned long rx_chksum_none; + unsigned long tx_chksum_offload; +#define NUM_PORT_STATS 9 +}; + +struct mlx4_en_perf_stats { + u32 tx_poll; + u64 tx_pktsz_avg; + u32 inflight_avg; + u16 tx_coal_avg; + u16 rx_coal_avg; + u32 napi_quota; +#define NUM_PERF_COUNTERS 6 +}; + +enum mlx4_en_mclist_act { + MCLIST_NONE, + MCLIST_REM, + MCLIST_ADD, +}; + +struct mlx4_en_mc_list { + struct list_head list; + enum mlx4_en_mclist_act action; + u8 addr[ETH_ALEN]; + u64 reg_id; + u64 tunnel_reg_id; +}; + +struct mlx4_en_frag_info { + u16 frag_size; + u16 frag_prefix_size; + u16 frag_stride; + u16 frag_align; +}; + +#ifdef CONFIG_MLX4_EN_DCB +/* Minimal TC BW - setting to 0 will block traffic */ +#define MLX4_EN_BW_MIN 1 +#define MLX4_EN_BW_MAX 100 /* Utilize 100% of the line */ + +#define MLX4_EN_TC_ETS 7 + +#endif + +struct ethtool_flow_id { + struct list_head list; + struct ethtool_rx_flow_spec flow_spec; + u64 id; +}; + +enum { + MLX4_EN_FLAG_PROMISC = (1 << 0), + MLX4_EN_FLAG_MC_PROMISC = (1 << 1), + /* whether we need to enable hardware loopback by putting dmac + * in Tx WQE + */ + MLX4_EN_FLAG_ENABLE_HW_LOOPBACK = (1 << 2), + /* whether we need to drop packets that hardware loopback-ed */ + MLX4_EN_FLAG_RX_FILTER_NEEDED = (1 << 3), + MLX4_EN_FLAG_FORCE_PROMISC = (1 << 4) +}; + +#define MLX4_EN_MAC_HASH_SIZE (1 << BITS_PER_BYTE) +#define MLX4_EN_MAC_HASH_IDX 5 + +struct mlx4_en_priv { + struct mlx4_en_dev *mdev; + struct mlx4_en_port_profile *prof; + struct net_device *dev; + unsigned long active_vlans[BITS_TO_LONGS(VLAN_N_VID)]; + struct net_device_stats stats; + struct net_device_stats ret_stats; + struct mlx4_en_port_state port_state; + spinlock_t stats_lock; + struct ethtool_flow_id ethtool_rules[MAX_NUM_OF_FS_RULES]; + /* To allow rules removal while port is going down */ + struct list_head ethtool_list; + + unsigned long last_moder_packets[MAX_RX_RINGS]; + unsigned long last_moder_tx_packets; + unsigned long last_moder_bytes[MAX_RX_RINGS]; + unsigned long last_moder_jiffies; + int last_moder_time[MAX_RX_RINGS]; + u16 rx_usecs; + u16 rx_frames; + u16 tx_usecs; + u16 tx_frames; + u32 pkt_rate_low; + u16 rx_usecs_low; + u32 pkt_rate_high; + u16 rx_usecs_high; + u16 sample_interval; + u16 adaptive_rx_coal; + u32 msg_enable; + u32 loopback_ok; + u32 validate_loopback; + + struct mlx4_hwq_resources res; + int link_state; + int last_link_state; + bool port_up; + int port; + int registered; + int allocated; + int stride; + unsigned char current_mac[ETH_ALEN + 2]; + int mac_index; + unsigned max_mtu; + int base_qpn; + int cqe_factor; + int cqe_size; + + struct mlx4_en_rss_map rss_map; + __be32 ctrl_flags; + u32 flags; + u8 num_tx_rings_p_up; + u32 tx_work_limit; + u32 tx_ring_num; + u32 rx_ring_num; + u32 rx_skb_size; + struct mlx4_en_frag_info frag_info[MLX4_EN_MAX_RX_FRAGS]; + u16 num_frags; + u16 log_rx_info; + + struct mlx4_en_tx_ring **tx_ring; + struct mlx4_en_rx_ring *rx_ring[MAX_RX_RINGS]; + struct mlx4_en_cq **tx_cq; + struct mlx4_en_cq *rx_cq[MAX_RX_RINGS]; + struct mlx4_qp drop_qp; + struct work_struct rx_mode_task; + struct work_struct watchdog_task; + struct work_struct linkstate_task; + struct delayed_work stats_task; + struct delayed_work service_task; +#ifdef CONFIG_MLX4_EN_VXLAN + struct work_struct vxlan_add_task; + struct work_struct vxlan_del_task; +#endif + struct mlx4_en_perf_stats pstats; + struct mlx4_en_pkt_stats pkstats; + struct mlx4_en_port_stats port_stats; + u64 stats_bitmap; + struct list_head mc_list; + struct list_head curr_list; + u64 broadcast_id; + struct mlx4_en_stat_out_mbox hw_stats; + int vids[128]; + bool wol; + struct device *ddev; + int base_tx_qpn; + struct hlist_head mac_hash[MLX4_EN_MAC_HASH_SIZE]; + struct hwtstamp_config hwtstamp_config; + +#ifdef CONFIG_MLX4_EN_DCB + struct ieee_ets ets; + u16 maxrate[IEEE_8021QAZ_MAX_TCS]; +#endif +#ifdef CONFIG_RFS_ACCEL + spinlock_t filters_lock; + int last_filter_id; + struct list_head filters; + struct hlist_head filter_hash[1 << MLX4_EN_FILTER_HASH_SHIFT]; +#endif + u64 tunnel_reg_id; + __be16 vxlan_port; + + u32 pflags; +}; + +enum mlx4_en_wol { + MLX4_EN_WOL_MAGIC = (1ULL << 61), + MLX4_EN_WOL_ENABLED = (1ULL << 62), +}; + +struct mlx4_mac_entry { + struct hlist_node hlist; + unsigned char mac[ETH_ALEN + 2]; + u64 reg_id; + struct rcu_head rcu; +}; + +static inline struct mlx4_cqe *mlx4_en_get_cqe(void *buf, int idx, int cqe_sz) +{ + return buf + idx * cqe_sz; +} + +#ifdef CONFIG_NET_RX_BUSY_POLL +static inline void mlx4_en_cq_init_lock(struct mlx4_en_cq *cq) +{ + spin_lock_init(&cq->poll_lock); + cq->state = MLX4_EN_CQ_STATE_IDLE; +} + +/* called from the device poll rutine to get ownership of a cq */ +static inline bool mlx4_en_cq_lock_napi(struct mlx4_en_cq *cq) +{ + int rc = true; + spin_lock(&cq->poll_lock); + if (cq->state & MLX4_CQ_LOCKED) { + WARN_ON(cq->state & MLX4_EN_CQ_STATE_NAPI); + cq->state |= MLX4_EN_CQ_STATE_NAPI_YIELD; + rc = false; + } else + /* we don't care if someone yielded */ + cq->state = MLX4_EN_CQ_STATE_NAPI; + spin_unlock(&cq->poll_lock); + return rc; +} + +/* returns true is someone tried to get the cq while napi had it */ +static inline bool mlx4_en_cq_unlock_napi(struct mlx4_en_cq *cq) +{ + int rc = false; + spin_lock(&cq->poll_lock); + WARN_ON(cq->state & (MLX4_EN_CQ_STATE_POLL | + MLX4_EN_CQ_STATE_NAPI_YIELD)); + + if (cq->state & MLX4_EN_CQ_STATE_POLL_YIELD) + rc = true; + cq->state = MLX4_EN_CQ_STATE_IDLE; + spin_unlock(&cq->poll_lock); + return rc; +} + +/* called from mlx4_en_low_latency_poll() */ +static inline bool mlx4_en_cq_lock_poll(struct mlx4_en_cq *cq) +{ + int rc = true; + spin_lock_bh(&cq->poll_lock); + if ((cq->state & MLX4_CQ_LOCKED)) { + struct net_device *dev = cq->dev; + struct mlx4_en_priv *priv = netdev_priv(dev); + struct mlx4_en_rx_ring *rx_ring = priv->rx_ring[cq->ring]; + + cq->state |= MLX4_EN_CQ_STATE_POLL_YIELD; + rc = false; + rx_ring->yields++; + } else + /* preserve yield marks */ + cq->state |= MLX4_EN_CQ_STATE_POLL; + spin_unlock_bh(&cq->poll_lock); + return rc; +} + +/* returns true if someone tried to get the cq while it was locked */ +static inline bool mlx4_en_cq_unlock_poll(struct mlx4_en_cq *cq) +{ + int rc = false; + spin_lock_bh(&cq->poll_lock); + WARN_ON(cq->state & (MLX4_EN_CQ_STATE_NAPI)); + + if (cq->state & MLX4_EN_CQ_STATE_POLL_YIELD) + rc = true; + cq->state = MLX4_EN_CQ_STATE_IDLE; + spin_unlock_bh(&cq->poll_lock); + return rc; +} + +/* true if a socket is polling, even if it did not get the lock */ +static inline bool mlx4_en_cq_busy_polling(struct mlx4_en_cq *cq) +{ + WARN_ON(!(cq->state & MLX4_CQ_LOCKED)); + return cq->state & CQ_USER_PEND; +} +#else +static inline void mlx4_en_cq_init_lock(struct mlx4_en_cq *cq) +{ +} + +static inline bool mlx4_en_cq_lock_napi(struct mlx4_en_cq *cq) +{ + return true; +} + +static inline bool mlx4_en_cq_unlock_napi(struct mlx4_en_cq *cq) +{ + return false; +} + +static inline bool mlx4_en_cq_lock_poll(struct mlx4_en_cq *cq) +{ + return false; +} + +static inline bool mlx4_en_cq_unlock_poll(struct mlx4_en_cq *cq) +{ + return false; +} + +static inline bool mlx4_en_cq_busy_polling(struct mlx4_en_cq *cq) +{ + return false; +} +#endif /* CONFIG_NET_RX_BUSY_POLL */ + +#define MLX4_EN_WOL_DO_MODIFY (1ULL << 63) + +void mlx4_en_update_loopback_state(struct net_device *dev, + netdev_features_t features); + +void mlx4_en_destroy_netdev(struct net_device *dev); +int mlx4_en_init_netdev(struct mlx4_en_dev *mdev, int port, + struct mlx4_en_port_profile *prof); + +int mlx4_en_start_port(struct net_device *dev); +void mlx4_en_stop_port(struct net_device *dev, int detach); + +void mlx4_en_free_resources(struct mlx4_en_priv *priv); +int mlx4_en_alloc_resources(struct mlx4_en_priv *priv); + +int mlx4_en_create_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq **pcq, + int entries, int ring, enum cq_type mode, int node); +void mlx4_en_destroy_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq **pcq); +int mlx4_en_activate_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq, + int cq_idx); +void mlx4_en_deactivate_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq); +int mlx4_en_set_cq_moder(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq); +int mlx4_en_arm_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq); + +void mlx4_en_tx_irq(struct mlx4_cq *mcq); +u16 mlx4_en_select_queue(struct net_device *dev, struct sk_buff *skb, + void *accel_priv, select_queue_fallback_t fallback); +netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev); + +int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv, + struct mlx4_en_tx_ring **pring, + int qpn, u32 size, u16 stride, + int node, int queue_index); +void mlx4_en_destroy_tx_ring(struct mlx4_en_priv *priv, + struct mlx4_en_tx_ring **pring); +int mlx4_en_activate_tx_ring(struct mlx4_en_priv *priv, + struct mlx4_en_tx_ring *ring, + int cq, int user_prio); +void mlx4_en_deactivate_tx_ring(struct mlx4_en_priv *priv, + struct mlx4_en_tx_ring *ring); +void mlx4_en_set_num_rx_rings(struct mlx4_en_dev *mdev); +int mlx4_en_create_rx_ring(struct mlx4_en_priv *priv, + struct mlx4_en_rx_ring **pring, + u32 size, u16 stride, int node); +void mlx4_en_destroy_rx_ring(struct mlx4_en_priv *priv, + struct mlx4_en_rx_ring **pring, + u32 size, u16 stride); +int mlx4_en_activate_rx_rings(struct mlx4_en_priv *priv); +void mlx4_en_deactivate_rx_ring(struct mlx4_en_priv *priv, + struct mlx4_en_rx_ring *ring); +int mlx4_en_process_rx_cq(struct net_device *dev, + struct mlx4_en_cq *cq, + int budget); +int mlx4_en_poll_rx_cq(struct napi_struct *napi, int budget); +int mlx4_en_poll_tx_cq(struct napi_struct *napi, int budget); +void mlx4_en_fill_qp_context(struct mlx4_en_priv *priv, int size, int stride, + int is_tx, int rss, int qpn, int cqn, int user_prio, + struct mlx4_qp_context *context); +void mlx4_en_sqp_event(struct mlx4_qp *qp, enum mlx4_event event); +int mlx4_en_map_buffer(struct mlx4_buf *buf); +void mlx4_en_unmap_buffer(struct mlx4_buf *buf); + +void mlx4_en_calc_rx_buf(struct net_device *dev); +int mlx4_en_config_rss_steer(struct mlx4_en_priv *priv); +void mlx4_en_release_rss_steer(struct mlx4_en_priv *priv); +int mlx4_en_create_drop_qp(struct mlx4_en_priv *priv); +void mlx4_en_destroy_drop_qp(struct mlx4_en_priv *priv); +int mlx4_en_free_tx_buf(struct net_device *dev, struct mlx4_en_tx_ring *ring); +void mlx4_en_rx_irq(struct mlx4_cq *mcq); + +int mlx4_SET_MCAST_FLTR(struct mlx4_dev *dev, u8 port, u64 mac, u64 clear, u8 mode); +int mlx4_SET_VLAN_FLTR(struct mlx4_dev *dev, struct mlx4_en_priv *priv); + +int mlx4_en_DUMP_ETH_STATS(struct mlx4_en_dev *mdev, u8 port, u8 reset); +int mlx4_en_QUERY_PORT(struct mlx4_en_dev *mdev, u8 port); + +#ifdef CONFIG_MLX4_EN_DCB +extern const struct dcbnl_rtnl_ops mlx4_en_dcbnl_ops; +extern const struct dcbnl_rtnl_ops mlx4_en_dcbnl_pfc_ops; +#endif + +int mlx4_en_setup_tc(struct net_device *dev, u8 up); + +#ifdef CONFIG_RFS_ACCEL +void mlx4_en_cleanup_filters(struct mlx4_en_priv *priv); +#endif + +#define MLX4_EN_NUM_SELF_TEST 5 +void mlx4_en_ex_selftest(struct net_device *dev, u32 *flags, u64 *buf); +void mlx4_en_ptp_overflow_check(struct mlx4_en_dev *mdev); + +/* + * Functions for time stamping + */ +u64 mlx4_en_get_cqe_ts(struct mlx4_cqe *cqe); +void mlx4_en_fill_hwtstamps(struct mlx4_en_dev *mdev, + struct skb_shared_hwtstamps *hwts, + u64 timestamp); +void mlx4_en_init_timestamp(struct mlx4_en_dev *mdev); +void mlx4_en_remove_timestamp(struct mlx4_en_dev *mdev); +int mlx4_en_timestamp_config(struct net_device *dev, + int tx_type, + int rx_filter); + +/* Globals + */ +extern const struct ethtool_ops mlx4_en_ethtool_ops; + + + +/* + * printk / logging functions + */ + +__printf(3, 4) +void en_print(const char *level, const struct mlx4_en_priv *priv, + const char *format, ...); + +#define en_dbg(mlevel, priv, format, ...) \ +do { \ + if (NETIF_MSG_##mlevel & (priv)->msg_enable) \ + en_print(KERN_DEBUG, priv, format, ##__VA_ARGS__); \ +} while (0) +#define en_warn(priv, format, ...) \ + en_print(KERN_WARNING, priv, format, ##__VA_ARGS__) +#define en_err(priv, format, ...) \ + en_print(KERN_ERR, priv, format, ##__VA_ARGS__) +#define en_info(priv, format, ...) \ + en_print(KERN_INFO, priv, format, ##__VA_ARGS__) + +#define mlx4_err(mdev, format, ...) \ + pr_err(DRV_NAME " %s: " format, \ + dev_name(&(mdev)->pdev->dev), ##__VA_ARGS__) +#define mlx4_info(mdev, format, ...) \ + pr_info(DRV_NAME " %s: " format, \ + dev_name(&(mdev)->pdev->dev), ##__VA_ARGS__) +#define mlx4_warn(mdev, format, ...) \ + pr_warn(DRV_NAME " %s: " format, \ + dev_name(&(mdev)->pdev->dev), ##__VA_ARGS__) + +#endif diff --git a/drivers/net/ethernet/mellanox/mlx4/mr.c b/drivers/net/ethernet/mellanox/mlx4/mr.c new file mode 100644 index 0000000..193a6ad --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4/mr.c @@ -0,0 +1,1166 @@ +/* + * Copyright (c) 2004 Topspin Communications. All rights reserved. + * Copyright (c) 2005, 2006, 2007, 2008 Mellanox Technologies. All rights reserved. + * Copyright (c) 2006, 2007 Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include + +#include + +#include "mlx4.h" +#include "icm.h" + +static u32 mlx4_buddy_alloc(struct mlx4_buddy *buddy, int order) +{ + int o; + int m; + u32 seg; + + spin_lock(&buddy->lock); + + for (o = order; o <= buddy->max_order; ++o) + if (buddy->num_free[o]) { + m = 1 << (buddy->max_order - o); + seg = find_first_bit(buddy->bits[o], m); + if (seg < m) + goto found; + } + + spin_unlock(&buddy->lock); + return -1; + + found: + clear_bit(seg, buddy->bits[o]); + --buddy->num_free[o]; + + while (o > order) { + --o; + seg <<= 1; + set_bit(seg ^ 1, buddy->bits[o]); + ++buddy->num_free[o]; + } + + spin_unlock(&buddy->lock); + + seg <<= order; + + return seg; +} + +static void mlx4_buddy_free(struct mlx4_buddy *buddy, u32 seg, int order) +{ + seg >>= order; + + spin_lock(&buddy->lock); + + while (test_bit(seg ^ 1, buddy->bits[order])) { + clear_bit(seg ^ 1, buddy->bits[order]); + --buddy->num_free[order]; + seg >>= 1; + ++order; + } + + set_bit(seg, buddy->bits[order]); + ++buddy->num_free[order]; + + spin_unlock(&buddy->lock); +} + +static int mlx4_buddy_init(struct mlx4_buddy *buddy, int max_order) +{ + int i, s; + + buddy->max_order = max_order; + spin_lock_init(&buddy->lock); + + buddy->bits = kcalloc(buddy->max_order + 1, sizeof (long *), + GFP_KERNEL); + buddy->num_free = kcalloc((buddy->max_order + 1), sizeof *buddy->num_free, + GFP_KERNEL); + if (!buddy->bits || !buddy->num_free) + goto err_out; + + for (i = 0; i <= buddy->max_order; ++i) { + s = BITS_TO_LONGS(1 << (buddy->max_order - i)); + buddy->bits[i] = kcalloc(s, sizeof (long), GFP_KERNEL | __GFP_NOWARN); + if (!buddy->bits[i]) { + buddy->bits[i] = vzalloc(s * sizeof(long)); + if (!buddy->bits[i]) + goto err_out_free; + } + } + + set_bit(0, buddy->bits[buddy->max_order]); + buddy->num_free[buddy->max_order] = 1; + + return 0; + +err_out_free: + for (i = 0; i <= buddy->max_order; ++i) + if (buddy->bits[i] && is_vmalloc_addr(buddy->bits[i])) + vfree(buddy->bits[i]); + else + kfree(buddy->bits[i]); + +err_out: + kfree(buddy->bits); + kfree(buddy->num_free); + + return -ENOMEM; +} + +static void mlx4_buddy_cleanup(struct mlx4_buddy *buddy) +{ + int i; + + for (i = 0; i <= buddy->max_order; ++i) + if (is_vmalloc_addr(buddy->bits[i])) + vfree(buddy->bits[i]); + else + kfree(buddy->bits[i]); + + kfree(buddy->bits); + kfree(buddy->num_free); +} + +u32 __mlx4_alloc_mtt_range(struct mlx4_dev *dev, int order) +{ + struct mlx4_mr_table *mr_table = &mlx4_priv(dev)->mr_table; + u32 seg; + int seg_order; + u32 offset; + + seg_order = max_t(int, order - log_mtts_per_seg, 0); + + seg = mlx4_buddy_alloc(&mr_table->mtt_buddy, seg_order); + if (seg == -1) + return -1; + + offset = seg * (1 << log_mtts_per_seg); + + if (mlx4_table_get_range(dev, &mr_table->mtt_table, offset, + offset + (1 << order) - 1)) { + mlx4_buddy_free(&mr_table->mtt_buddy, seg, seg_order); + return -1; + } + + return offset; +} + +static u32 mlx4_alloc_mtt_range(struct mlx4_dev *dev, int order) +{ + u64 in_param = 0; + u64 out_param; + int err; + + if (mlx4_is_mfunc(dev)) { + set_param_l(&in_param, order); + err = mlx4_cmd_imm(dev, in_param, &out_param, RES_MTT, + RES_OP_RESERVE_AND_MAP, + MLX4_CMD_ALLOC_RES, + MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_WRAPPED); + if (err) + return -1; + return get_param_l(&out_param); + } + return __mlx4_alloc_mtt_range(dev, order); +} + +int mlx4_mtt_init(struct mlx4_dev *dev, int npages, int page_shift, + struct mlx4_mtt *mtt) +{ + int i; + + if (!npages) { + mtt->order = -1; + mtt->page_shift = MLX4_ICM_PAGE_SHIFT; + return 0; + } else + mtt->page_shift = page_shift; + + for (mtt->order = 0, i = 1; i < npages; i <<= 1) + ++mtt->order; + + mtt->offset = mlx4_alloc_mtt_range(dev, mtt->order); + if (mtt->offset == -1) + return -ENOMEM; + + return 0; +} +EXPORT_SYMBOL_GPL(mlx4_mtt_init); + +void __mlx4_free_mtt_range(struct mlx4_dev *dev, u32 offset, int order) +{ + u32 first_seg; + int seg_order; + struct mlx4_mr_table *mr_table = &mlx4_priv(dev)->mr_table; + + seg_order = max_t(int, order - log_mtts_per_seg, 0); + first_seg = offset / (1 << log_mtts_per_seg); + + mlx4_buddy_free(&mr_table->mtt_buddy, first_seg, seg_order); + mlx4_table_put_range(dev, &mr_table->mtt_table, offset, + offset + (1 << order) - 1); +} + +static void mlx4_free_mtt_range(struct mlx4_dev *dev, u32 offset, int order) +{ + u64 in_param = 0; + int err; + + if (mlx4_is_mfunc(dev)) { + set_param_l(&in_param, offset); + set_param_h(&in_param, order); + err = mlx4_cmd(dev, in_param, RES_MTT, RES_OP_RESERVE_AND_MAP, + MLX4_CMD_FREE_RES, + MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_WRAPPED); + if (err) + mlx4_warn(dev, "Failed to free mtt range at:%d order:%d\n", + offset, order); + return; + } + __mlx4_free_mtt_range(dev, offset, order); +} + +void mlx4_mtt_cleanup(struct mlx4_dev *dev, struct mlx4_mtt *mtt) +{ + if (mtt->order < 0) + return; + + mlx4_free_mtt_range(dev, mtt->offset, mtt->order); +} +EXPORT_SYMBOL_GPL(mlx4_mtt_cleanup); + +u64 mlx4_mtt_addr(struct mlx4_dev *dev, struct mlx4_mtt *mtt) +{ + return (u64) mtt->offset * dev->caps.mtt_entry_sz; +} +EXPORT_SYMBOL_GPL(mlx4_mtt_addr); + +static u32 hw_index_to_key(u32 ind) +{ + return (ind >> 24) | (ind << 8); +} + +static u32 key_to_hw_index(u32 key) +{ + return (key << 24) | (key >> 8); +} + +static int mlx4_SW2HW_MPT(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox, + int mpt_index) +{ + return mlx4_cmd(dev, mailbox->dma, mpt_index, + 0, MLX4_CMD_SW2HW_MPT, MLX4_CMD_TIME_CLASS_B, + MLX4_CMD_WRAPPED); +} + +static int mlx4_HW2SW_MPT(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox, + int mpt_index) +{ + return mlx4_cmd_box(dev, 0, mailbox ? mailbox->dma : 0, mpt_index, + !mailbox, MLX4_CMD_HW2SW_MPT, + MLX4_CMD_TIME_CLASS_B, MLX4_CMD_WRAPPED); +} + +/* Must protect against concurrent access */ +int mlx4_mr_hw_get_mpt(struct mlx4_dev *dev, struct mlx4_mr *mmr, + struct mlx4_mpt_entry ***mpt_entry) +{ + int err; + int key = key_to_hw_index(mmr->key) & (dev->caps.num_mpts - 1); + struct mlx4_cmd_mailbox *mailbox = NULL; + + if (mmr->enabled != MLX4_MPT_EN_HW) + return -EINVAL; + + err = mlx4_HW2SW_MPT(dev, NULL, key); + if (err) { + mlx4_warn(dev, "HW2SW_MPT failed (%d).", err); + mlx4_warn(dev, "Most likely the MR has MWs bound to it.\n"); + return err; + } + + mmr->enabled = MLX4_MPT_EN_SW; + + if (!mlx4_is_mfunc(dev)) { + **mpt_entry = mlx4_table_find( + &mlx4_priv(dev)->mr_table.dmpt_table, + key, NULL); + } else { + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR_OR_NULL(mailbox)) + return PTR_ERR(mailbox); + + err = mlx4_cmd_box(dev, 0, mailbox->dma, key, + 0, MLX4_CMD_QUERY_MPT, + MLX4_CMD_TIME_CLASS_B, + MLX4_CMD_WRAPPED); + if (err) + goto free_mailbox; + + *mpt_entry = (struct mlx4_mpt_entry **)&mailbox->buf; + } + + if (!(*mpt_entry) || !(**mpt_entry)) { + err = -ENOMEM; + goto free_mailbox; + } + + return 0; + +free_mailbox: + mlx4_free_cmd_mailbox(dev, mailbox); + return err; +} +EXPORT_SYMBOL_GPL(mlx4_mr_hw_get_mpt); + +int mlx4_mr_hw_write_mpt(struct mlx4_dev *dev, struct mlx4_mr *mmr, + struct mlx4_mpt_entry **mpt_entry) +{ + int err; + + if (!mlx4_is_mfunc(dev)) { + /* Make sure any changes to this entry are flushed */ + wmb(); + + *(u8 *)(*mpt_entry) = MLX4_MPT_STATUS_HW; + + /* Make sure the new status is written */ + wmb(); + + err = mlx4_SYNC_TPT(dev); + } else { + int key = key_to_hw_index(mmr->key) & (dev->caps.num_mpts - 1); + + struct mlx4_cmd_mailbox *mailbox = + container_of((void *)mpt_entry, struct mlx4_cmd_mailbox, + buf); + + err = mlx4_SW2HW_MPT(dev, mailbox, key); + } + + if (!err) { + mmr->pd = be32_to_cpu((*mpt_entry)->pd_flags) & MLX4_MPT_PD_MASK; + mmr->enabled = MLX4_MPT_EN_HW; + } + return err; +} +EXPORT_SYMBOL_GPL(mlx4_mr_hw_write_mpt); + +void mlx4_mr_hw_put_mpt(struct mlx4_dev *dev, + struct mlx4_mpt_entry **mpt_entry) +{ + if (mlx4_is_mfunc(dev)) { + struct mlx4_cmd_mailbox *mailbox = + container_of((void *)mpt_entry, struct mlx4_cmd_mailbox, + buf); + mlx4_free_cmd_mailbox(dev, mailbox); + } +} +EXPORT_SYMBOL_GPL(mlx4_mr_hw_put_mpt); + +int mlx4_mr_hw_change_pd(struct mlx4_dev *dev, struct mlx4_mpt_entry *mpt_entry, + u32 pdn) +{ + u32 pd_flags = be32_to_cpu(mpt_entry->pd_flags) & ~MLX4_MPT_PD_MASK; + /* The wrapper function will put the slave's id here */ + if (mlx4_is_mfunc(dev)) + pd_flags &= ~MLX4_MPT_PD_VF_MASK; + + mpt_entry->pd_flags = cpu_to_be32(pd_flags | + (pdn & MLX4_MPT_PD_MASK) + | MLX4_MPT_PD_FLAG_EN_INV); + return 0; +} +EXPORT_SYMBOL_GPL(mlx4_mr_hw_change_pd); + +int mlx4_mr_hw_change_access(struct mlx4_dev *dev, + struct mlx4_mpt_entry *mpt_entry, + u32 access) +{ + u32 flags = (be32_to_cpu(mpt_entry->flags) & ~MLX4_PERM_MASK) | + (access & MLX4_PERM_MASK); + + mpt_entry->flags = cpu_to_be32(flags); + return 0; +} +EXPORT_SYMBOL_GPL(mlx4_mr_hw_change_access); + +static int mlx4_mr_alloc_reserved(struct mlx4_dev *dev, u32 mridx, u32 pd, + u64 iova, u64 size, u32 access, int npages, + int page_shift, struct mlx4_mr *mr) +{ + mr->iova = iova; + mr->size = size; + mr->pd = pd; + mr->access = access; + mr->enabled = MLX4_MPT_DISABLED; + mr->key = hw_index_to_key(mridx); + + return mlx4_mtt_init(dev, npages, page_shift, &mr->mtt); +} + +static int mlx4_WRITE_MTT(struct mlx4_dev *dev, + struct mlx4_cmd_mailbox *mailbox, + int num_entries) +{ + return mlx4_cmd(dev, mailbox->dma, num_entries, 0, MLX4_CMD_WRITE_MTT, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED); +} + +int __mlx4_mpt_reserve(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + + return mlx4_bitmap_alloc(&priv->mr_table.mpt_bitmap); +} + +static int mlx4_mpt_reserve(struct mlx4_dev *dev) +{ + u64 out_param; + + if (mlx4_is_mfunc(dev)) { + if (mlx4_cmd_imm(dev, 0, &out_param, RES_MPT, RES_OP_RESERVE, + MLX4_CMD_ALLOC_RES, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED)) + return -1; + return get_param_l(&out_param); + } + return __mlx4_mpt_reserve(dev); +} + +void __mlx4_mpt_release(struct mlx4_dev *dev, u32 index) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + + mlx4_bitmap_free(&priv->mr_table.mpt_bitmap, index, MLX4_NO_RR); +} + +static void mlx4_mpt_release(struct mlx4_dev *dev, u32 index) +{ + u64 in_param = 0; + + if (mlx4_is_mfunc(dev)) { + set_param_l(&in_param, index); + if (mlx4_cmd(dev, in_param, RES_MPT, RES_OP_RESERVE, + MLX4_CMD_FREE_RES, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED)) + mlx4_warn(dev, "Failed to release mr index:%d\n", + index); + return; + } + __mlx4_mpt_release(dev, index); +} + +int __mlx4_mpt_alloc_icm(struct mlx4_dev *dev, u32 index, gfp_t gfp) +{ + struct mlx4_mr_table *mr_table = &mlx4_priv(dev)->mr_table; + + return mlx4_table_get(dev, &mr_table->dmpt_table, index, gfp); +} + +static int mlx4_mpt_alloc_icm(struct mlx4_dev *dev, u32 index, gfp_t gfp) +{ + u64 param = 0; + + if (mlx4_is_mfunc(dev)) { + set_param_l(¶m, index); + return mlx4_cmd_imm(dev, param, ¶m, RES_MPT, RES_OP_MAP_ICM, + MLX4_CMD_ALLOC_RES, + MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_WRAPPED); + } + return __mlx4_mpt_alloc_icm(dev, index, gfp); +} + +void __mlx4_mpt_free_icm(struct mlx4_dev *dev, u32 index) +{ + struct mlx4_mr_table *mr_table = &mlx4_priv(dev)->mr_table; + + mlx4_table_put(dev, &mr_table->dmpt_table, index); +} + +static void mlx4_mpt_free_icm(struct mlx4_dev *dev, u32 index) +{ + u64 in_param = 0; + + if (mlx4_is_mfunc(dev)) { + set_param_l(&in_param, index); + if (mlx4_cmd(dev, in_param, RES_MPT, RES_OP_MAP_ICM, + MLX4_CMD_FREE_RES, MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_WRAPPED)) + mlx4_warn(dev, "Failed to free icm of mr index:%d\n", + index); + return; + } + return __mlx4_mpt_free_icm(dev, index); +} + +int mlx4_mr_alloc(struct mlx4_dev *dev, u32 pd, u64 iova, u64 size, u32 access, + int npages, int page_shift, struct mlx4_mr *mr) +{ + u32 index; + int err; + + index = mlx4_mpt_reserve(dev); + if (index == -1) + return -ENOMEM; + + err = mlx4_mr_alloc_reserved(dev, index, pd, iova, size, + access, npages, page_shift, mr); + if (err) + mlx4_mpt_release(dev, index); + + return err; +} +EXPORT_SYMBOL_GPL(mlx4_mr_alloc); + +static int mlx4_mr_free_reserved(struct mlx4_dev *dev, struct mlx4_mr *mr) +{ + int err; + + if (mr->enabled == MLX4_MPT_EN_HW) { + err = mlx4_HW2SW_MPT(dev, NULL, + key_to_hw_index(mr->key) & + (dev->caps.num_mpts - 1)); + if (err) { + mlx4_warn(dev, "HW2SW_MPT failed (%d), MR has MWs bound to it\n", + err); + return err; + } + + mr->enabled = MLX4_MPT_EN_SW; + } + mlx4_mtt_cleanup(dev, &mr->mtt); + + return 0; +} + +int mlx4_mr_free(struct mlx4_dev *dev, struct mlx4_mr *mr) +{ + int ret; + + ret = mlx4_mr_free_reserved(dev, mr); + if (ret) + return ret; + if (mr->enabled) + mlx4_mpt_free_icm(dev, key_to_hw_index(mr->key)); + mlx4_mpt_release(dev, key_to_hw_index(mr->key)); + + return 0; +} +EXPORT_SYMBOL_GPL(mlx4_mr_free); + +void mlx4_mr_rereg_mem_cleanup(struct mlx4_dev *dev, struct mlx4_mr *mr) +{ + mlx4_mtt_cleanup(dev, &mr->mtt); +} +EXPORT_SYMBOL_GPL(mlx4_mr_rereg_mem_cleanup); + +int mlx4_mr_rereg_mem_write(struct mlx4_dev *dev, struct mlx4_mr *mr, + u64 iova, u64 size, int npages, + int page_shift, struct mlx4_mpt_entry *mpt_entry) +{ + int err; + + mpt_entry->start = cpu_to_be64(iova); + mpt_entry->length = cpu_to_be64(size); + mpt_entry->entity_size = cpu_to_be32(page_shift); + + err = mlx4_mtt_init(dev, npages, page_shift, &mr->mtt); + if (err) + return err; + + mpt_entry->pd_flags &= cpu_to_be32(MLX4_MPT_PD_MASK | + MLX4_MPT_PD_FLAG_EN_INV); + mpt_entry->flags &= cpu_to_be32(MLX4_MPT_FLAG_FREE | + MLX4_MPT_FLAG_SW_OWNS); + if (mr->mtt.order < 0) { + mpt_entry->flags |= cpu_to_be32(MLX4_MPT_FLAG_PHYSICAL); + mpt_entry->mtt_addr = 0; + } else { + mpt_entry->mtt_addr = cpu_to_be64(mlx4_mtt_addr(dev, + &mr->mtt)); + if (mr->mtt.page_shift == 0) + mpt_entry->mtt_sz = cpu_to_be32(1 << mr->mtt.order); + } + if (mr->mtt.order >= 0 && mr->mtt.page_shift == 0) { + /* fast register MR in free state */ + mpt_entry->flags |= cpu_to_be32(MLX4_MPT_FLAG_FREE); + mpt_entry->pd_flags |= cpu_to_be32(MLX4_MPT_PD_FLAG_FAST_REG | + MLX4_MPT_PD_FLAG_RAE); + } else { + mpt_entry->flags |= cpu_to_be32(MLX4_MPT_FLAG_SW_OWNS); + } + mr->enabled = MLX4_MPT_EN_SW; + + return 0; +} +EXPORT_SYMBOL_GPL(mlx4_mr_rereg_mem_write); + +int mlx4_mr_enable(struct mlx4_dev *dev, struct mlx4_mr *mr) +{ + struct mlx4_cmd_mailbox *mailbox; + struct mlx4_mpt_entry *mpt_entry; + int err; + + err = mlx4_mpt_alloc_icm(dev, key_to_hw_index(mr->key), GFP_KERNEL); + if (err) + return err; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) { + err = PTR_ERR(mailbox); + goto err_table; + } + mpt_entry = mailbox->buf; + mpt_entry->flags = cpu_to_be32(MLX4_MPT_FLAG_MIO | + MLX4_MPT_FLAG_REGION | + mr->access); + + mpt_entry->key = cpu_to_be32(key_to_hw_index(mr->key)); + mpt_entry->pd_flags = cpu_to_be32(mr->pd | MLX4_MPT_PD_FLAG_EN_INV); + mpt_entry->start = cpu_to_be64(mr->iova); + mpt_entry->length = cpu_to_be64(mr->size); + mpt_entry->entity_size = cpu_to_be32(mr->mtt.page_shift); + + if (mr->mtt.order < 0) { + mpt_entry->flags |= cpu_to_be32(MLX4_MPT_FLAG_PHYSICAL); + mpt_entry->mtt_addr = 0; + } else { + mpt_entry->mtt_addr = cpu_to_be64(mlx4_mtt_addr(dev, + &mr->mtt)); + } + + if (mr->mtt.order >= 0 && mr->mtt.page_shift == 0) { + /* fast register MR in free state */ + mpt_entry->flags |= cpu_to_be32(MLX4_MPT_FLAG_FREE); + mpt_entry->pd_flags |= cpu_to_be32(MLX4_MPT_PD_FLAG_FAST_REG | + MLX4_MPT_PD_FLAG_RAE); + mpt_entry->mtt_sz = cpu_to_be32(1 << mr->mtt.order); + } else { + mpt_entry->flags |= cpu_to_be32(MLX4_MPT_FLAG_SW_OWNS); + } + + err = mlx4_SW2HW_MPT(dev, mailbox, + key_to_hw_index(mr->key) & (dev->caps.num_mpts - 1)); + if (err) { + mlx4_warn(dev, "SW2HW_MPT failed (%d)\n", err); + goto err_cmd; + } + mr->enabled = MLX4_MPT_EN_HW; + + mlx4_free_cmd_mailbox(dev, mailbox); + + return 0; + +err_cmd: + mlx4_free_cmd_mailbox(dev, mailbox); + +err_table: + mlx4_mpt_free_icm(dev, key_to_hw_index(mr->key)); + return err; +} +EXPORT_SYMBOL_GPL(mlx4_mr_enable); + +static int mlx4_write_mtt_chunk(struct mlx4_dev *dev, struct mlx4_mtt *mtt, + int start_index, int npages, u64 *page_list) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + __be64 *mtts; + dma_addr_t dma_handle; + int i; + + mtts = mlx4_table_find(&priv->mr_table.mtt_table, mtt->offset + + start_index, &dma_handle); + + if (!mtts) + return -ENOMEM; + + dma_sync_single_for_cpu(&dev->pdev->dev, dma_handle, + npages * sizeof (u64), DMA_TO_DEVICE); + + for (i = 0; i < npages; ++i) + mtts[i] = cpu_to_be64(page_list[i] | MLX4_MTT_FLAG_PRESENT); + + dma_sync_single_for_device(&dev->pdev->dev, dma_handle, + npages * sizeof (u64), DMA_TO_DEVICE); + + return 0; +} + +int __mlx4_write_mtt(struct mlx4_dev *dev, struct mlx4_mtt *mtt, + int start_index, int npages, u64 *page_list) +{ + int err = 0; + int chunk; + int mtts_per_page; + int max_mtts_first_page; + + /* compute how may mtts fit in the first page */ + mtts_per_page = PAGE_SIZE / sizeof(u64); + max_mtts_first_page = mtts_per_page - (mtt->offset + start_index) + % mtts_per_page; + + chunk = min_t(int, max_mtts_first_page, npages); + + while (npages > 0) { + err = mlx4_write_mtt_chunk(dev, mtt, start_index, chunk, page_list); + if (err) + return err; + npages -= chunk; + start_index += chunk; + page_list += chunk; + + chunk = min_t(int, mtts_per_page, npages); + } + return err; +} + +int mlx4_write_mtt(struct mlx4_dev *dev, struct mlx4_mtt *mtt, + int start_index, int npages, u64 *page_list) +{ + struct mlx4_cmd_mailbox *mailbox = NULL; + __be64 *inbox = NULL; + int chunk; + int err = 0; + int i; + + if (mtt->order < 0) + return -EINVAL; + + if (mlx4_is_mfunc(dev)) { + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + inbox = mailbox->buf; + + while (npages > 0) { + chunk = min_t(int, MLX4_MAILBOX_SIZE / sizeof(u64) - 2, + npages); + inbox[0] = cpu_to_be64(mtt->offset + start_index); + inbox[1] = 0; + for (i = 0; i < chunk; ++i) + inbox[i + 2] = cpu_to_be64(page_list[i] | + MLX4_MTT_FLAG_PRESENT); + err = mlx4_WRITE_MTT(dev, mailbox, chunk); + if (err) { + mlx4_free_cmd_mailbox(dev, mailbox); + return err; + } + + npages -= chunk; + start_index += chunk; + page_list += chunk; + } + mlx4_free_cmd_mailbox(dev, mailbox); + return err; + } + + return __mlx4_write_mtt(dev, mtt, start_index, npages, page_list); +} +EXPORT_SYMBOL_GPL(mlx4_write_mtt); + +int mlx4_buf_write_mtt(struct mlx4_dev *dev, struct mlx4_mtt *mtt, + struct mlx4_buf *buf, gfp_t gfp) +{ + u64 *page_list; + int err; + int i; + + page_list = kmalloc(buf->npages * sizeof *page_list, + gfp); + if (!page_list) + return -ENOMEM; + + for (i = 0; i < buf->npages; ++i) + if (buf->nbufs == 1) + page_list[i] = buf->direct.map + (i << buf->page_shift); + else + page_list[i] = buf->page_list[i].map; + + err = mlx4_write_mtt(dev, mtt, 0, buf->npages, page_list); + + kfree(page_list); + return err; +} +EXPORT_SYMBOL_GPL(mlx4_buf_write_mtt); + +int mlx4_mw_alloc(struct mlx4_dev *dev, u32 pd, enum mlx4_mw_type type, + struct mlx4_mw *mw) +{ + u32 index; + + if ((type == MLX4_MW_TYPE_1 && + !(dev->caps.flags & MLX4_DEV_CAP_FLAG_MEM_WINDOW)) || + (type == MLX4_MW_TYPE_2 && + !(dev->caps.bmme_flags & MLX4_BMME_FLAG_TYPE_2_WIN))) + return -ENOTSUPP; + + index = mlx4_mpt_reserve(dev); + if (index == -1) + return -ENOMEM; + + mw->key = hw_index_to_key(index); + mw->pd = pd; + mw->type = type; + mw->enabled = MLX4_MPT_DISABLED; + + return 0; +} +EXPORT_SYMBOL_GPL(mlx4_mw_alloc); + +int mlx4_mw_enable(struct mlx4_dev *dev, struct mlx4_mw *mw) +{ + struct mlx4_cmd_mailbox *mailbox; + struct mlx4_mpt_entry *mpt_entry; + int err; + + err = mlx4_mpt_alloc_icm(dev, key_to_hw_index(mw->key), GFP_KERNEL); + if (err) + return err; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) { + err = PTR_ERR(mailbox); + goto err_table; + } + mpt_entry = mailbox->buf; + + /* Note that the MLX4_MPT_FLAG_REGION bit in mpt_entry->flags is turned + * off, thus creating a memory window and not a memory region. + */ + mpt_entry->key = cpu_to_be32(key_to_hw_index(mw->key)); + mpt_entry->pd_flags = cpu_to_be32(mw->pd); + if (mw->type == MLX4_MW_TYPE_2) { + mpt_entry->flags |= cpu_to_be32(MLX4_MPT_FLAG_FREE); + mpt_entry->qpn = cpu_to_be32(MLX4_MPT_QP_FLAG_BOUND_QP); + mpt_entry->pd_flags |= cpu_to_be32(MLX4_MPT_PD_FLAG_EN_INV); + } + + err = mlx4_SW2HW_MPT(dev, mailbox, + key_to_hw_index(mw->key) & + (dev->caps.num_mpts - 1)); + if (err) { + mlx4_warn(dev, "SW2HW_MPT failed (%d)\n", err); + goto err_cmd; + } + mw->enabled = MLX4_MPT_EN_HW; + + mlx4_free_cmd_mailbox(dev, mailbox); + + return 0; + +err_cmd: + mlx4_free_cmd_mailbox(dev, mailbox); + +err_table: + mlx4_mpt_free_icm(dev, key_to_hw_index(mw->key)); + return err; +} +EXPORT_SYMBOL_GPL(mlx4_mw_enable); + +void mlx4_mw_free(struct mlx4_dev *dev, struct mlx4_mw *mw) +{ + int err; + + if (mw->enabled == MLX4_MPT_EN_HW) { + err = mlx4_HW2SW_MPT(dev, NULL, + key_to_hw_index(mw->key) & + (dev->caps.num_mpts - 1)); + if (err) + mlx4_warn(dev, "xxx HW2SW_MPT failed (%d)\n", err); + + mw->enabled = MLX4_MPT_EN_SW; + } + if (mw->enabled) + mlx4_mpt_free_icm(dev, key_to_hw_index(mw->key)); + mlx4_mpt_release(dev, key_to_hw_index(mw->key)); +} +EXPORT_SYMBOL_GPL(mlx4_mw_free); + +int mlx4_init_mr_table(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_mr_table *mr_table = &priv->mr_table; + int err; + + /* Nothing to do for slaves - all MR handling is forwarded + * to the master */ + if (mlx4_is_slave(dev)) + return 0; + + if (!is_power_of_2(dev->caps.num_mpts)) + return -EINVAL; + + err = mlx4_bitmap_init(&mr_table->mpt_bitmap, dev->caps.num_mpts, + ~0, dev->caps.reserved_mrws, 0); + if (err) + return err; + + err = mlx4_buddy_init(&mr_table->mtt_buddy, + ilog2((u32)dev->caps.num_mtts / + (1 << log_mtts_per_seg))); + if (err) + goto err_buddy; + + if (dev->caps.reserved_mtts) { + priv->reserved_mtts = + mlx4_alloc_mtt_range(dev, + fls(dev->caps.reserved_mtts - 1)); + if (priv->reserved_mtts < 0) { + mlx4_warn(dev, "MTT table of order %u is too small\n", + mr_table->mtt_buddy.max_order); + err = -ENOMEM; + goto err_reserve_mtts; + } + } + + return 0; + +err_reserve_mtts: + mlx4_buddy_cleanup(&mr_table->mtt_buddy); + +err_buddy: + mlx4_bitmap_cleanup(&mr_table->mpt_bitmap); + + return err; +} + +void mlx4_cleanup_mr_table(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_mr_table *mr_table = &priv->mr_table; + + if (mlx4_is_slave(dev)) + return; + if (priv->reserved_mtts >= 0) + mlx4_free_mtt_range(dev, priv->reserved_mtts, + fls(dev->caps.reserved_mtts - 1)); + mlx4_buddy_cleanup(&mr_table->mtt_buddy); + mlx4_bitmap_cleanup(&mr_table->mpt_bitmap); +} + +static inline int mlx4_check_fmr(struct mlx4_fmr *fmr, u64 *page_list, + int npages, u64 iova) +{ + int i, page_mask; + + if (npages > fmr->max_pages) + return -EINVAL; + + page_mask = (1 << fmr->page_shift) - 1; + + /* We are getting page lists, so va must be page aligned. */ + if (iova & page_mask) + return -EINVAL; + + /* Trust the user not to pass misaligned data in page_list */ + if (0) + for (i = 0; i < npages; ++i) { + if (page_list[i] & ~page_mask) + return -EINVAL; + } + + if (fmr->maps >= fmr->max_maps) + return -EINVAL; + + return 0; +} + +int mlx4_map_phys_fmr(struct mlx4_dev *dev, struct mlx4_fmr *fmr, u64 *page_list, + int npages, u64 iova, u32 *lkey, u32 *rkey) +{ + u32 key; + int i, err; + + err = mlx4_check_fmr(fmr, page_list, npages, iova); + if (err) + return err; + + ++fmr->maps; + + key = key_to_hw_index(fmr->mr.key); + key += dev->caps.num_mpts; + *lkey = *rkey = fmr->mr.key = hw_index_to_key(key); + + *(u8 *) fmr->mpt = MLX4_MPT_STATUS_SW; + + /* Make sure MPT status is visible before writing MTT entries */ + wmb(); + + dma_sync_single_for_cpu(&dev->pdev->dev, fmr->dma_handle, + npages * sizeof(u64), DMA_TO_DEVICE); + + for (i = 0; i < npages; ++i) + fmr->mtts[i] = cpu_to_be64(page_list[i] | MLX4_MTT_FLAG_PRESENT); + + dma_sync_single_for_device(&dev->pdev->dev, fmr->dma_handle, + npages * sizeof(u64), DMA_TO_DEVICE); + + fmr->mpt->key = cpu_to_be32(key); + fmr->mpt->lkey = cpu_to_be32(key); + fmr->mpt->length = cpu_to_be64(npages * (1ull << fmr->page_shift)); + fmr->mpt->start = cpu_to_be64(iova); + + /* Make MTT entries are visible before setting MPT status */ + wmb(); + + *(u8 *) fmr->mpt = MLX4_MPT_STATUS_HW; + + /* Make sure MPT status is visible before consumer can use FMR */ + wmb(); + + return 0; +} +EXPORT_SYMBOL_GPL(mlx4_map_phys_fmr); + +int mlx4_fmr_alloc(struct mlx4_dev *dev, u32 pd, u32 access, int max_pages, + int max_maps, u8 page_shift, struct mlx4_fmr *fmr) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + int err = -ENOMEM; + + if (max_maps > dev->caps.max_fmr_maps) + return -EINVAL; + + if (page_shift < (ffs(dev->caps.page_size_cap) - 1) || page_shift >= 32) + return -EINVAL; + + /* All MTTs must fit in the same page */ + if (max_pages * sizeof *fmr->mtts > PAGE_SIZE) + return -EINVAL; + + fmr->page_shift = page_shift; + fmr->max_pages = max_pages; + fmr->max_maps = max_maps; + fmr->maps = 0; + + err = mlx4_mr_alloc(dev, pd, 0, 0, access, max_pages, + page_shift, &fmr->mr); + if (err) + return err; + + fmr->mtts = mlx4_table_find(&priv->mr_table.mtt_table, + fmr->mr.mtt.offset, + &fmr->dma_handle); + + if (!fmr->mtts) { + err = -ENOMEM; + goto err_free; + } + + return 0; + +err_free: + (void) mlx4_mr_free(dev, &fmr->mr); + return err; +} +EXPORT_SYMBOL_GPL(mlx4_fmr_alloc); + +int mlx4_fmr_enable(struct mlx4_dev *dev, struct mlx4_fmr *fmr) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + int err; + + err = mlx4_mr_enable(dev, &fmr->mr); + if (err) + return err; + + fmr->mpt = mlx4_table_find(&priv->mr_table.dmpt_table, + key_to_hw_index(fmr->mr.key), NULL); + if (!fmr->mpt) + return -ENOMEM; + + return 0; +} +EXPORT_SYMBOL_GPL(mlx4_fmr_enable); + +void mlx4_fmr_unmap(struct mlx4_dev *dev, struct mlx4_fmr *fmr, + u32 *lkey, u32 *rkey) +{ + struct mlx4_cmd_mailbox *mailbox; + int err; + + if (!fmr->maps) + return; + + fmr->maps = 0; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) { + err = PTR_ERR(mailbox); + pr_warn("mlx4_ib: mlx4_alloc_cmd_mailbox failed (%d)\n", err); + return; + } + + err = mlx4_HW2SW_MPT(dev, NULL, + key_to_hw_index(fmr->mr.key) & + (dev->caps.num_mpts - 1)); + mlx4_free_cmd_mailbox(dev, mailbox); + if (err) { + pr_warn("mlx4_ib: mlx4_HW2SW_MPT failed (%d)\n", err); + return; + } + fmr->mr.enabled = MLX4_MPT_EN_SW; +} +EXPORT_SYMBOL_GPL(mlx4_fmr_unmap); + +int mlx4_fmr_free(struct mlx4_dev *dev, struct mlx4_fmr *fmr) +{ + int ret; + + if (fmr->maps) + return -EBUSY; + + ret = mlx4_mr_free(dev, &fmr->mr); + if (ret) + return ret; + fmr->mr.enabled = MLX4_MPT_DISABLED; + + return 0; +} +EXPORT_SYMBOL_GPL(mlx4_fmr_free); + +int mlx4_SYNC_TPT(struct mlx4_dev *dev) +{ + return mlx4_cmd(dev, 0, 0, 0, MLX4_CMD_SYNC_TPT, 1000, + MLX4_CMD_NATIVE); +} +EXPORT_SYMBOL_GPL(mlx4_SYNC_TPT); diff --git a/drivers/net/ethernet/mellanox/mlx4/pd.c b/drivers/net/ethernet/mellanox/mlx4/pd.c new file mode 100644 index 0000000..7421607 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4/pd.c @@ -0,0 +1,286 @@ +/* + * Copyright (c) 2006, 2007 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include + +#include + +#include "mlx4.h" +#include "icm.h" + +enum { + MLX4_NUM_RESERVED_UARS = 8 +}; + +int mlx4_pd_alloc(struct mlx4_dev *dev, u32 *pdn) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + + *pdn = mlx4_bitmap_alloc(&priv->pd_bitmap); + if (*pdn == -1) + return -ENOMEM; + + return 0; +} +EXPORT_SYMBOL_GPL(mlx4_pd_alloc); + +void mlx4_pd_free(struct mlx4_dev *dev, u32 pdn) +{ + mlx4_bitmap_free(&mlx4_priv(dev)->pd_bitmap, pdn, MLX4_USE_RR); +} +EXPORT_SYMBOL_GPL(mlx4_pd_free); + +int __mlx4_xrcd_alloc(struct mlx4_dev *dev, u32 *xrcdn) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + + *xrcdn = mlx4_bitmap_alloc(&priv->xrcd_bitmap); + if (*xrcdn == -1) + return -ENOMEM; + + return 0; +} + +int mlx4_xrcd_alloc(struct mlx4_dev *dev, u32 *xrcdn) +{ + u64 out_param; + int err; + + if (mlx4_is_mfunc(dev)) { + err = mlx4_cmd_imm(dev, 0, &out_param, + RES_XRCD, RES_OP_RESERVE, + MLX4_CMD_ALLOC_RES, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED); + if (err) + return err; + + *xrcdn = get_param_l(&out_param); + return 0; + } + return __mlx4_xrcd_alloc(dev, xrcdn); +} +EXPORT_SYMBOL_GPL(mlx4_xrcd_alloc); + +void __mlx4_xrcd_free(struct mlx4_dev *dev, u32 xrcdn) +{ + mlx4_bitmap_free(&mlx4_priv(dev)->xrcd_bitmap, xrcdn, MLX4_USE_RR); +} + +void mlx4_xrcd_free(struct mlx4_dev *dev, u32 xrcdn) +{ + u64 in_param = 0; + int err; + + if (mlx4_is_mfunc(dev)) { + set_param_l(&in_param, xrcdn); + err = mlx4_cmd(dev, in_param, RES_XRCD, + RES_OP_RESERVE, MLX4_CMD_FREE_RES, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED); + if (err) + mlx4_warn(dev, "Failed to release xrcdn %d\n", xrcdn); + } else + __mlx4_xrcd_free(dev, xrcdn); +} +EXPORT_SYMBOL_GPL(mlx4_xrcd_free); + +int mlx4_init_pd_table(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + + return mlx4_bitmap_init(&priv->pd_bitmap, dev->caps.num_pds, + (1 << NOT_MASKED_PD_BITS) - 1, + dev->caps.reserved_pds, 0); +} + +void mlx4_cleanup_pd_table(struct mlx4_dev *dev) +{ + mlx4_bitmap_cleanup(&mlx4_priv(dev)->pd_bitmap); +} + +int mlx4_init_xrcd_table(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + + return mlx4_bitmap_init(&priv->xrcd_bitmap, (1 << 16), + (1 << 16) - 1, dev->caps.reserved_xrcds + 1, 0); +} + +void mlx4_cleanup_xrcd_table(struct mlx4_dev *dev) +{ + mlx4_bitmap_cleanup(&mlx4_priv(dev)->xrcd_bitmap); +} + +int mlx4_uar_alloc(struct mlx4_dev *dev, struct mlx4_uar *uar) +{ + int offset; + + uar->index = mlx4_bitmap_alloc(&mlx4_priv(dev)->uar_table.bitmap); + if (uar->index == -1) + return -ENOMEM; + + if (mlx4_is_slave(dev)) + offset = uar->index % ((int) pci_resource_len(dev->pdev, 2) / + dev->caps.uar_page_size); + else + offset = uar->index; + uar->pfn = (pci_resource_start(dev->pdev, 2) >> PAGE_SHIFT) + offset; + uar->map = NULL; + return 0; +} +EXPORT_SYMBOL_GPL(mlx4_uar_alloc); + +void mlx4_uar_free(struct mlx4_dev *dev, struct mlx4_uar *uar) +{ + mlx4_bitmap_free(&mlx4_priv(dev)->uar_table.bitmap, uar->index, MLX4_USE_RR); +} +EXPORT_SYMBOL_GPL(mlx4_uar_free); + +int mlx4_bf_alloc(struct mlx4_dev *dev, struct mlx4_bf *bf, int node) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_uar *uar; + int err = 0; + int idx; + + if (!priv->bf_mapping) + return -ENOMEM; + + mutex_lock(&priv->bf_mutex); + if (!list_empty(&priv->bf_list)) + uar = list_entry(priv->bf_list.next, struct mlx4_uar, bf_list); + else { + if (mlx4_bitmap_avail(&priv->uar_table.bitmap) < MLX4_NUM_RESERVED_UARS) { + err = -ENOMEM; + goto out; + } + uar = kmalloc_node(sizeof(*uar), GFP_KERNEL, node); + if (!uar) { + uar = kmalloc(sizeof(*uar), GFP_KERNEL); + if (!uar) { + err = -ENOMEM; + goto out; + } + } + err = mlx4_uar_alloc(dev, uar); + if (err) + goto free_kmalloc; + + uar->map = ioremap(uar->pfn << PAGE_SHIFT, PAGE_SIZE); + if (!uar->map) { + err = -ENOMEM; + goto free_uar; + } + + uar->bf_map = io_mapping_map_wc(priv->bf_mapping, uar->index << PAGE_SHIFT); + if (!uar->bf_map) { + err = -ENOMEM; + goto unamp_uar; + } + uar->free_bf_bmap = 0; + list_add(&uar->bf_list, &priv->bf_list); + } + + bf->uar = uar; + idx = ffz(uar->free_bf_bmap); + uar->free_bf_bmap |= 1 << idx; + bf->uar = uar; + bf->offset = 0; + bf->buf_size = dev->caps.bf_reg_size / 2; + bf->reg = uar->bf_map + idx * dev->caps.bf_reg_size; + if (uar->free_bf_bmap == (1 << dev->caps.bf_regs_per_page) - 1) + list_del_init(&uar->bf_list); + + goto out; + +unamp_uar: + bf->uar = NULL; + iounmap(uar->map); + +free_uar: + mlx4_uar_free(dev, uar); + +free_kmalloc: + kfree(uar); + +out: + mutex_unlock(&priv->bf_mutex); + return err; +} +EXPORT_SYMBOL_GPL(mlx4_bf_alloc); + +void mlx4_bf_free(struct mlx4_dev *dev, struct mlx4_bf *bf) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + int idx; + + if (!bf->uar || !bf->uar->bf_map) + return; + + mutex_lock(&priv->bf_mutex); + idx = (bf->reg - bf->uar->bf_map) / dev->caps.bf_reg_size; + bf->uar->free_bf_bmap &= ~(1 << idx); + if (!bf->uar->free_bf_bmap) { + if (!list_empty(&bf->uar->bf_list)) + list_del(&bf->uar->bf_list); + + io_mapping_unmap(bf->uar->bf_map); + iounmap(bf->uar->map); + mlx4_uar_free(dev, bf->uar); + kfree(bf->uar); + } else if (list_empty(&bf->uar->bf_list)) + list_add(&bf->uar->bf_list, &priv->bf_list); + + mutex_unlock(&priv->bf_mutex); +} +EXPORT_SYMBOL_GPL(mlx4_bf_free); + +int mlx4_init_uar_table(struct mlx4_dev *dev) +{ + if (dev->caps.num_uars <= 128) { + mlx4_err(dev, "Only %d UAR pages (need more than 128)\n", + dev->caps.num_uars); + mlx4_err(dev, "Increase firmware log2_uar_bar_megabytes?\n"); + return -ENODEV; + } + + return mlx4_bitmap_init(&mlx4_priv(dev)->uar_table.bitmap, + dev->caps.num_uars, dev->caps.num_uars - 1, + dev->caps.reserved_uars, 0); +} + +void mlx4_cleanup_uar_table(struct mlx4_dev *dev) +{ + mlx4_bitmap_cleanup(&mlx4_priv(dev)->uar_table.bitmap); +} diff --git a/drivers/net/ethernet/mellanox/mlx4/port.c b/drivers/net/ethernet/mellanox/mlx4/port.c new file mode 100644 index 0000000..94eeb2c --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4/port.c @@ -0,0 +1,1313 @@ +/* + * Copyright (c) 2007 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include + +#include + +#include "mlx4.h" + +#define MLX4_MAC_VALID (1ull << 63) + +#define MLX4_VLAN_VALID (1u << 31) +#define MLX4_VLAN_MASK 0xfff + +#define MLX4_STATS_TRAFFIC_COUNTERS_MASK 0xfULL +#define MLX4_STATS_TRAFFIC_DROPS_MASK 0xc0ULL +#define MLX4_STATS_ERROR_COUNTERS_MASK 0x1ffc30ULL +#define MLX4_STATS_PORT_COUNTERS_MASK 0x1fe00000ULL + +void mlx4_init_mac_table(struct mlx4_dev *dev, struct mlx4_mac_table *table) +{ + int i; + + mutex_init(&table->mutex); + for (i = 0; i < MLX4_MAX_MAC_NUM; i++) { + table->entries[i] = 0; + table->refs[i] = 0; + } + table->max = 1 << dev->caps.log_num_macs; + table->total = 0; +} + +void mlx4_init_vlan_table(struct mlx4_dev *dev, struct mlx4_vlan_table *table) +{ + int i; + + mutex_init(&table->mutex); + for (i = 0; i < MLX4_MAX_VLAN_NUM; i++) { + table->entries[i] = 0; + table->refs[i] = 0; + } + table->max = (1 << dev->caps.log_num_vlans) - MLX4_VLAN_REGULAR; + table->total = 0; +} + +void mlx4_init_roce_gid_table(struct mlx4_dev *dev, + struct mlx4_roce_gid_table *table) +{ + int i; + + mutex_init(&table->mutex); + for (i = 0; i < MLX4_ROCE_MAX_GIDS; i++) + memset(table->roce_gids[i].raw, 0, MLX4_ROCE_GID_ENTRY_SIZE); +} + +static int validate_index(struct mlx4_dev *dev, + struct mlx4_mac_table *table, int index) +{ + int err = 0; + + if (index < 0 || index >= table->max || !table->entries[index]) { + mlx4_warn(dev, "No valid Mac entry for the given index\n"); + err = -EINVAL; + } + return err; +} + +static int find_index(struct mlx4_dev *dev, + struct mlx4_mac_table *table, u64 mac) +{ + int i; + + for (i = 0; i < MLX4_MAX_MAC_NUM; i++) { + if (table->refs[i] && + (MLX4_MAC_MASK & mac) == + (MLX4_MAC_MASK & be64_to_cpu(table->entries[i]))) + return i; + } + /* Mac not found */ + return -EINVAL; +} + +static int mlx4_set_port_mac_table(struct mlx4_dev *dev, u8 port, + __be64 *entries) +{ + struct mlx4_cmd_mailbox *mailbox; + u32 in_mod; + int err; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + + memcpy(mailbox->buf, entries, MLX4_MAC_TABLE_SIZE); + + in_mod = MLX4_SET_PORT_MAC_TABLE << 8 | port; + + err = mlx4_cmd(dev, mailbox->dma, in_mod, 1, MLX4_CMD_SET_PORT, + MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE); + + mlx4_free_cmd_mailbox(dev, mailbox); + return err; +} + +int mlx4_find_cached_mac(struct mlx4_dev *dev, u8 port, u64 mac, int *idx) +{ + struct mlx4_port_info *info = &mlx4_priv(dev)->port[port]; + struct mlx4_mac_table *table = &info->mac_table; + int i; + + for (i = 0; i < MLX4_MAX_MAC_NUM; i++) { + if (!table->refs[i]) + continue; + + if (mac == (MLX4_MAC_MASK & be64_to_cpu(table->entries[i]))) { + *idx = i; + return 0; + } + } + + return -ENOENT; +} +EXPORT_SYMBOL_GPL(mlx4_find_cached_mac); + +int __mlx4_register_mac(struct mlx4_dev *dev, u8 port, u64 mac) +{ + struct mlx4_port_info *info = &mlx4_priv(dev)->port[port]; + struct mlx4_mac_table *table = &info->mac_table; + int i, err = 0; + int free = -1; + + mlx4_dbg(dev, "Registering MAC: 0x%llx for port %d\n", + (unsigned long long) mac, port); + + mutex_lock(&table->mutex); + for (i = 0; i < MLX4_MAX_MAC_NUM; i++) { + if (!table->refs[i]) { + if (free < 0) + free = i; + continue; + } + + if ((MLX4_MAC_MASK & mac) == + (MLX4_MAC_MASK & be64_to_cpu(table->entries[i]))) { + /* MAC already registered, increment ref count */ + err = i; + ++table->refs[i]; + goto out; + } + } + + mlx4_dbg(dev, "Free MAC index is %d\n", free); + + if (table->total == table->max) { + /* No free mac entries */ + err = -ENOSPC; + goto out; + } + + /* Register new MAC */ + table->entries[free] = cpu_to_be64(mac | MLX4_MAC_VALID); + + err = mlx4_set_port_mac_table(dev, port, table->entries); + if (unlikely(err)) { + mlx4_err(dev, "Failed adding MAC: 0x%llx\n", + (unsigned long long) mac); + table->entries[free] = 0; + goto out; + } + table->refs[free] = 1; + err = free; + ++table->total; +out: + mutex_unlock(&table->mutex); + return err; +} +EXPORT_SYMBOL_GPL(__mlx4_register_mac); + +int mlx4_register_mac(struct mlx4_dev *dev, u8 port, u64 mac) +{ + u64 out_param = 0; + int err = -EINVAL; + + if (mlx4_is_mfunc(dev)) { + if (!(dev->flags & MLX4_FLAG_OLD_REG_MAC)) { + err = mlx4_cmd_imm(dev, mac, &out_param, + ((u32) port) << 8 | (u32) RES_MAC, + RES_OP_RESERVE_AND_MAP, MLX4_CMD_ALLOC_RES, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED); + } + if (err && err == -EINVAL && mlx4_is_slave(dev)) { + /* retry using old REG_MAC format */ + set_param_l(&out_param, port); + err = mlx4_cmd_imm(dev, mac, &out_param, RES_MAC, + RES_OP_RESERVE_AND_MAP, MLX4_CMD_ALLOC_RES, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED); + if (!err) + dev->flags |= MLX4_FLAG_OLD_REG_MAC; + } + if (err) + return err; + + return get_param_l(&out_param); + } + return __mlx4_register_mac(dev, port, mac); +} +EXPORT_SYMBOL_GPL(mlx4_register_mac); + +int mlx4_get_base_qpn(struct mlx4_dev *dev, u8 port) +{ + return dev->caps.reserved_qps_base[MLX4_QP_REGION_ETH_ADDR] + + (port - 1) * (1 << dev->caps.log_num_macs); +} +EXPORT_SYMBOL_GPL(mlx4_get_base_qpn); + +void __mlx4_unregister_mac(struct mlx4_dev *dev, u8 port, u64 mac) +{ + struct mlx4_port_info *info; + struct mlx4_mac_table *table; + int index; + + if (port < 1 || port > dev->caps.num_ports) { + mlx4_warn(dev, "invalid port number (%d), aborting...\n", port); + return; + } + info = &mlx4_priv(dev)->port[port]; + table = &info->mac_table; + mutex_lock(&table->mutex); + index = find_index(dev, table, mac); + + if (validate_index(dev, table, index)) + goto out; + if (--table->refs[index]) { + mlx4_dbg(dev, "Have more references for index %d, no need to modify mac table\n", + index); + goto out; + } + + table->entries[index] = 0; + mlx4_set_port_mac_table(dev, port, table->entries); + --table->total; +out: + mutex_unlock(&table->mutex); +} +EXPORT_SYMBOL_GPL(__mlx4_unregister_mac); + +void mlx4_unregister_mac(struct mlx4_dev *dev, u8 port, u64 mac) +{ + u64 out_param = 0; + + if (mlx4_is_mfunc(dev)) { + if (!(dev->flags & MLX4_FLAG_OLD_REG_MAC)) { + (void) mlx4_cmd_imm(dev, mac, &out_param, + ((u32) port) << 8 | (u32) RES_MAC, + RES_OP_RESERVE_AND_MAP, MLX4_CMD_FREE_RES, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED); + } else { + /* use old unregister mac format */ + set_param_l(&out_param, port); + (void) mlx4_cmd_imm(dev, mac, &out_param, RES_MAC, + RES_OP_RESERVE_AND_MAP, MLX4_CMD_FREE_RES, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED); + } + return; + } + __mlx4_unregister_mac(dev, port, mac); + return; +} +EXPORT_SYMBOL_GPL(mlx4_unregister_mac); + +int __mlx4_replace_mac(struct mlx4_dev *dev, u8 port, int qpn, u64 new_mac) +{ + struct mlx4_port_info *info = &mlx4_priv(dev)->port[port]; + struct mlx4_mac_table *table = &info->mac_table; + int index = qpn - info->base_qpn; + int err = 0; + + /* CX1 doesn't support multi-functions */ + mutex_lock(&table->mutex); + + err = validate_index(dev, table, index); + if (err) + goto out; + + table->entries[index] = cpu_to_be64(new_mac | MLX4_MAC_VALID); + + err = mlx4_set_port_mac_table(dev, port, table->entries); + if (unlikely(err)) { + mlx4_err(dev, "Failed adding MAC: 0x%llx\n", + (unsigned long long) new_mac); + table->entries[index] = 0; + } +out: + mutex_unlock(&table->mutex); + return err; +} +EXPORT_SYMBOL_GPL(__mlx4_replace_mac); + +static int mlx4_set_port_vlan_table(struct mlx4_dev *dev, u8 port, + __be32 *entries) +{ + struct mlx4_cmd_mailbox *mailbox; + u32 in_mod; + int err; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + + memcpy(mailbox->buf, entries, MLX4_VLAN_TABLE_SIZE); + in_mod = MLX4_SET_PORT_VLAN_TABLE << 8 | port; + err = mlx4_cmd(dev, mailbox->dma, in_mod, 1, MLX4_CMD_SET_PORT, + MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE); + + mlx4_free_cmd_mailbox(dev, mailbox); + + return err; +} + +int mlx4_find_cached_vlan(struct mlx4_dev *dev, u8 port, u16 vid, int *idx) +{ + struct mlx4_vlan_table *table = &mlx4_priv(dev)->port[port].vlan_table; + int i; + + for (i = 0; i < MLX4_MAX_VLAN_NUM; ++i) { + if (table->refs[i] && + (vid == (MLX4_VLAN_MASK & + be32_to_cpu(table->entries[i])))) { + /* VLAN already registered, increase reference count */ + *idx = i; + return 0; + } + } + + return -ENOENT; +} +EXPORT_SYMBOL_GPL(mlx4_find_cached_vlan); + +int __mlx4_register_vlan(struct mlx4_dev *dev, u8 port, u16 vlan, + int *index) +{ + struct mlx4_vlan_table *table = &mlx4_priv(dev)->port[port].vlan_table; + int i, err = 0; + int free = -1; + + mutex_lock(&table->mutex); + + if (table->total == table->max) { + /* No free vlan entries */ + err = -ENOSPC; + goto out; + } + + for (i = MLX4_VLAN_REGULAR; i < MLX4_MAX_VLAN_NUM; i++) { + if (free < 0 && (table->refs[i] == 0)) { + free = i; + continue; + } + + if (table->refs[i] && + (vlan == (MLX4_VLAN_MASK & + be32_to_cpu(table->entries[i])))) { + /* Vlan already registered, increase references count */ + *index = i; + ++table->refs[i]; + goto out; + } + } + + if (free < 0) { + err = -ENOMEM; + goto out; + } + + /* Register new VLAN */ + table->refs[free] = 1; + table->entries[free] = cpu_to_be32(vlan | MLX4_VLAN_VALID); + + err = mlx4_set_port_vlan_table(dev, port, table->entries); + if (unlikely(err)) { + mlx4_warn(dev, "Failed adding vlan: %u\n", vlan); + table->refs[free] = 0; + table->entries[free] = 0; + goto out; + } + + *index = free; + ++table->total; +out: + mutex_unlock(&table->mutex); + return err; +} + +int mlx4_register_vlan(struct mlx4_dev *dev, u8 port, u16 vlan, int *index) +{ + u64 out_param = 0; + int err; + + if (vlan > 4095) + return -EINVAL; + + if (mlx4_is_mfunc(dev)) { + err = mlx4_cmd_imm(dev, vlan, &out_param, + ((u32) port) << 8 | (u32) RES_VLAN, + RES_OP_RESERVE_AND_MAP, MLX4_CMD_ALLOC_RES, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED); + if (!err) + *index = get_param_l(&out_param); + + return err; + } + return __mlx4_register_vlan(dev, port, vlan, index); +} +EXPORT_SYMBOL_GPL(mlx4_register_vlan); + +void __mlx4_unregister_vlan(struct mlx4_dev *dev, u8 port, u16 vlan) +{ + struct mlx4_vlan_table *table = &mlx4_priv(dev)->port[port].vlan_table; + int index; + + mutex_lock(&table->mutex); + if (mlx4_find_cached_vlan(dev, port, vlan, &index)) { + mlx4_warn(dev, "vlan 0x%x is not in the vlan table\n", vlan); + goto out; + } + + if (index < MLX4_VLAN_REGULAR) { + mlx4_warn(dev, "Trying to free special vlan index %d\n", index); + goto out; + } + + if (--table->refs[index]) { + mlx4_dbg(dev, "Have %d more references for index %d, no need to modify vlan table\n", + table->refs[index], index); + goto out; + } + table->entries[index] = 0; + mlx4_set_port_vlan_table(dev, port, table->entries); + --table->total; +out: + mutex_unlock(&table->mutex); +} + +void mlx4_unregister_vlan(struct mlx4_dev *dev, u8 port, u16 vlan) +{ + u64 out_param = 0; + + if (mlx4_is_mfunc(dev)) { + (void) mlx4_cmd_imm(dev, vlan, &out_param, + ((u32) port) << 8 | (u32) RES_VLAN, + RES_OP_RESERVE_AND_MAP, + MLX4_CMD_FREE_RES, MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_WRAPPED); + return; + } + __mlx4_unregister_vlan(dev, port, vlan); +} +EXPORT_SYMBOL_GPL(mlx4_unregister_vlan); + +int mlx4_get_port_ib_caps(struct mlx4_dev *dev, u8 port, __be32 *caps) +{ + struct mlx4_cmd_mailbox *inmailbox, *outmailbox; + u8 *inbuf, *outbuf; + int err; + + inmailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(inmailbox)) + return PTR_ERR(inmailbox); + + outmailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(outmailbox)) { + mlx4_free_cmd_mailbox(dev, inmailbox); + return PTR_ERR(outmailbox); + } + + inbuf = inmailbox->buf; + outbuf = outmailbox->buf; + inbuf[0] = 1; + inbuf[1] = 1; + inbuf[2] = 1; + inbuf[3] = 1; + *(__be16 *) (&inbuf[16]) = cpu_to_be16(0x0015); + *(__be32 *) (&inbuf[20]) = cpu_to_be32(port); + + err = mlx4_cmd_box(dev, inmailbox->dma, outmailbox->dma, port, 3, + MLX4_CMD_MAD_IFC, MLX4_CMD_TIME_CLASS_C, + MLX4_CMD_NATIVE); + if (!err) + *caps = *(__be32 *) (outbuf + 84); + mlx4_free_cmd_mailbox(dev, inmailbox); + mlx4_free_cmd_mailbox(dev, outmailbox); + return err; +} +static struct mlx4_roce_gid_entry zgid_entry; + +int mlx4_get_slave_num_gids(struct mlx4_dev *dev, int slave, int port) +{ + int vfs; + int slave_gid = slave; + unsigned i; + struct mlx4_slaves_pport slaves_pport; + struct mlx4_active_ports actv_ports; + unsigned max_port_p_one; + + if (slave == 0) + return MLX4_ROCE_PF_GIDS; + + /* Slave is a VF */ + slaves_pport = mlx4_phys_to_slaves_pport(dev, port); + actv_ports = mlx4_get_active_ports(dev, slave); + max_port_p_one = find_first_bit(actv_ports.ports, dev->caps.num_ports) + + bitmap_weight(actv_ports.ports, dev->caps.num_ports) + 1; + + for (i = 1; i < max_port_p_one; i++) { + struct mlx4_active_ports exclusive_ports; + struct mlx4_slaves_pport slaves_pport_actv; + bitmap_zero(exclusive_ports.ports, dev->caps.num_ports); + set_bit(i - 1, exclusive_ports.ports); + if (i == port) + continue; + slaves_pport_actv = mlx4_phys_to_slaves_pport_actv( + dev, &exclusive_ports); + slave_gid -= bitmap_weight(slaves_pport_actv.slaves, + dev->num_vfs + 1); + } + vfs = bitmap_weight(slaves_pport.slaves, dev->num_vfs + 1) - 1; + if (slave_gid <= ((MLX4_ROCE_MAX_GIDS - MLX4_ROCE_PF_GIDS) % vfs)) + return ((MLX4_ROCE_MAX_GIDS - MLX4_ROCE_PF_GIDS) / vfs) + 1; + return (MLX4_ROCE_MAX_GIDS - MLX4_ROCE_PF_GIDS) / vfs; +} + +int mlx4_get_base_gid_ix(struct mlx4_dev *dev, int slave, int port) +{ + int gids; + unsigned i; + int slave_gid = slave; + int vfs; + + struct mlx4_slaves_pport slaves_pport; + struct mlx4_active_ports actv_ports; + unsigned max_port_p_one; + + if (slave == 0) + return 0; + + slaves_pport = mlx4_phys_to_slaves_pport(dev, port); + actv_ports = mlx4_get_active_ports(dev, slave); + max_port_p_one = find_first_bit(actv_ports.ports, dev->caps.num_ports) + + bitmap_weight(actv_ports.ports, dev->caps.num_ports) + 1; + + for (i = 1; i < max_port_p_one; i++) { + struct mlx4_active_ports exclusive_ports; + struct mlx4_slaves_pport slaves_pport_actv; + bitmap_zero(exclusive_ports.ports, dev->caps.num_ports); + set_bit(i - 1, exclusive_ports.ports); + if (i == port) + continue; + slaves_pport_actv = mlx4_phys_to_slaves_pport_actv( + dev, &exclusive_ports); + slave_gid -= bitmap_weight(slaves_pport_actv.slaves, + dev->num_vfs + 1); + } + gids = MLX4_ROCE_MAX_GIDS - MLX4_ROCE_PF_GIDS; + vfs = bitmap_weight(slaves_pport.slaves, dev->num_vfs + 1) - 1; + if (slave_gid <= gids % vfs) + return MLX4_ROCE_PF_GIDS + ((gids / vfs) + 1) * (slave_gid - 1); + + return MLX4_ROCE_PF_GIDS + (gids % vfs) + + ((gids / vfs) * (slave_gid - 1)); +} +EXPORT_SYMBOL_GPL(mlx4_get_base_gid_ix); + +static int mlx4_reset_roce_port_gids(struct mlx4_dev *dev, int slave, + int port, struct mlx4_cmd_mailbox *mailbox) +{ + struct mlx4_roce_gid_entry *gid_entry_mbox; + struct mlx4_priv *priv = mlx4_priv(dev); + int num_gids, base, offset; + int i, err; + + num_gids = mlx4_get_slave_num_gids(dev, slave, port); + base = mlx4_get_base_gid_ix(dev, slave, port); + + memset(mailbox->buf, 0, MLX4_MAILBOX_SIZE); + + mutex_lock(&(priv->port[port].gid_table.mutex)); + /* Zero-out gids belonging to that slave in the port GID table */ + for (i = 0, offset = base; i < num_gids; offset++, i++) + memcpy(priv->port[port].gid_table.roce_gids[offset].raw, + zgid_entry.raw, MLX4_ROCE_GID_ENTRY_SIZE); + + /* Now, copy roce port gids table to mailbox for passing to FW */ + gid_entry_mbox = (struct mlx4_roce_gid_entry *)mailbox->buf; + for (i = 0; i < MLX4_ROCE_MAX_GIDS; gid_entry_mbox++, i++) + memcpy(gid_entry_mbox->raw, + priv->port[port].gid_table.roce_gids[i].raw, + MLX4_ROCE_GID_ENTRY_SIZE); + + err = mlx4_cmd(dev, mailbox->dma, + ((u32)port) | (MLX4_SET_PORT_GID_TABLE << 8), 1, + MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B, + MLX4_CMD_NATIVE); + mutex_unlock(&(priv->port[port].gid_table.mutex)); + return err; +} + + +void mlx4_reset_roce_gids(struct mlx4_dev *dev, int slave) +{ + struct mlx4_active_ports actv_ports; + struct mlx4_cmd_mailbox *mailbox; + int num_eth_ports, err; + int i; + + if (slave < 0 || slave > dev->num_vfs) + return; + + actv_ports = mlx4_get_active_ports(dev, slave); + + for (i = 0, num_eth_ports = 0; i < dev->caps.num_ports; i++) { + if (test_bit(i, actv_ports.ports)) { + if (dev->caps.port_type[i + 1] != MLX4_PORT_TYPE_ETH) + continue; + num_eth_ports++; + } + } + + if (!num_eth_ports) + return; + + /* have ETH ports. Alloc mailbox for SET_PORT command */ + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return; + + for (i = 0; i < dev->caps.num_ports; i++) { + if (test_bit(i, actv_ports.ports)) { + if (dev->caps.port_type[i + 1] != MLX4_PORT_TYPE_ETH) + continue; + err = mlx4_reset_roce_port_gids(dev, slave, i + 1, mailbox); + if (err) + mlx4_warn(dev, "Could not reset ETH port GID table for slave %d, port %d (%d)\n", + slave, i + 1, err); + } + } + + mlx4_free_cmd_mailbox(dev, mailbox); + return; +} + +static int mlx4_common_set_port(struct mlx4_dev *dev, int slave, u32 in_mod, + u8 op_mod, struct mlx4_cmd_mailbox *inbox) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_port_info *port_info; + struct mlx4_mfunc_master_ctx *master = &priv->mfunc.master; + struct mlx4_slave_state *slave_st = &master->slave_state[slave]; + struct mlx4_set_port_rqp_calc_context *qpn_context; + struct mlx4_set_port_general_context *gen_context; + struct mlx4_roce_gid_entry *gid_entry_tbl, *gid_entry_mbox, *gid_entry_mb1; + int reset_qkey_viols; + int port; + int is_eth; + int num_gids; + int base; + u32 in_modifier; + u32 promisc; + u16 mtu, prev_mtu; + int err; + int i, j; + int offset; + __be32 agg_cap_mask; + __be32 slave_cap_mask; + __be32 new_cap_mask; + + port = in_mod & 0xff; + in_modifier = in_mod >> 8; + is_eth = op_mod; + port_info = &priv->port[port]; + + /* Slaves cannot perform SET_PORT operations except changing MTU */ + if (is_eth) { + if (slave != dev->caps.function && + in_modifier != MLX4_SET_PORT_GENERAL && + in_modifier != MLX4_SET_PORT_GID_TABLE) { + mlx4_warn(dev, "denying SET_PORT for slave:%d\n", + slave); + return -EINVAL; + } + switch (in_modifier) { + case MLX4_SET_PORT_RQP_CALC: + qpn_context = inbox->buf; + qpn_context->base_qpn = + cpu_to_be32(port_info->base_qpn); + qpn_context->n_mac = 0x7; + promisc = be32_to_cpu(qpn_context->promisc) >> + SET_PORT_PROMISC_SHIFT; + qpn_context->promisc = cpu_to_be32( + promisc << SET_PORT_PROMISC_SHIFT | + port_info->base_qpn); + promisc = be32_to_cpu(qpn_context->mcast) >> + SET_PORT_MC_PROMISC_SHIFT; + qpn_context->mcast = cpu_to_be32( + promisc << SET_PORT_MC_PROMISC_SHIFT | + port_info->base_qpn); + break; + case MLX4_SET_PORT_GENERAL: + gen_context = inbox->buf; + /* Mtu is configured as the max MTU among all the + * the functions on the port. */ + mtu = be16_to_cpu(gen_context->mtu); + mtu = min_t(int, mtu, dev->caps.eth_mtu_cap[port] + + ETH_HLEN + VLAN_HLEN + ETH_FCS_LEN); + prev_mtu = slave_st->mtu[port]; + slave_st->mtu[port] = mtu; + if (mtu > master->max_mtu[port]) + master->max_mtu[port] = mtu; + if (mtu < prev_mtu && prev_mtu == + master->max_mtu[port]) { + slave_st->mtu[port] = mtu; + master->max_mtu[port] = mtu; + for (i = 0; i < dev->num_slaves; i++) { + master->max_mtu[port] = + max(master->max_mtu[port], + master->slave_state[i].mtu[port]); + } + } + + gen_context->mtu = cpu_to_be16(master->max_mtu[port]); + break; + case MLX4_SET_PORT_GID_TABLE: + /* change to MULTIPLE entries: number of guest's gids + * need a FOR-loop here over number of gids the guest has. + * 1. Check no duplicates in gids passed by slave + */ + num_gids = mlx4_get_slave_num_gids(dev, slave, port); + base = mlx4_get_base_gid_ix(dev, slave, port); + gid_entry_mbox = (struct mlx4_roce_gid_entry *)(inbox->buf); + for (i = 0; i < num_gids; gid_entry_mbox++, i++) { + if (!memcmp(gid_entry_mbox->raw, zgid_entry.raw, + sizeof(zgid_entry))) + continue; + gid_entry_mb1 = gid_entry_mbox + 1; + for (j = i + 1; j < num_gids; gid_entry_mb1++, j++) { + if (!memcmp(gid_entry_mb1->raw, + zgid_entry.raw, sizeof(zgid_entry))) + continue; + if (!memcmp(gid_entry_mb1->raw, gid_entry_mbox->raw, + sizeof(gid_entry_mbox->raw))) { + /* found duplicate */ + return -EINVAL; + } + } + } + + /* 2. Check that do not have duplicates in OTHER + * entries in the port GID table + */ + + mutex_lock(&(priv->port[port].gid_table.mutex)); + for (i = 0; i < MLX4_ROCE_MAX_GIDS; i++) { + if (i >= base && i < base + num_gids) + continue; /* don't compare to slave's current gids */ + gid_entry_tbl = &priv->port[port].gid_table.roce_gids[i]; + if (!memcmp(gid_entry_tbl->raw, zgid_entry.raw, sizeof(zgid_entry))) + continue; + gid_entry_mbox = (struct mlx4_roce_gid_entry *)(inbox->buf); + for (j = 0; j < num_gids; gid_entry_mbox++, j++) { + if (!memcmp(gid_entry_mbox->raw, zgid_entry.raw, + sizeof(zgid_entry))) + continue; + if (!memcmp(gid_entry_mbox->raw, gid_entry_tbl->raw, + sizeof(gid_entry_tbl->raw))) { + /* found duplicate */ + mlx4_warn(dev, "requested gid entry for slave:%d is a duplicate of gid at index %d\n", + slave, i); + mutex_unlock(&(priv->port[port].gid_table.mutex)); + return -EINVAL; + } + } + } + + /* insert slave GIDs with memcpy, starting at slave's base index */ + gid_entry_mbox = (struct mlx4_roce_gid_entry *)(inbox->buf); + for (i = 0, offset = base; i < num_gids; gid_entry_mbox++, offset++, i++) + memcpy(priv->port[port].gid_table.roce_gids[offset].raw, + gid_entry_mbox->raw, MLX4_ROCE_GID_ENTRY_SIZE); + + /* Now, copy roce port gids table to current mailbox for passing to FW */ + gid_entry_mbox = (struct mlx4_roce_gid_entry *)(inbox->buf); + for (i = 0; i < MLX4_ROCE_MAX_GIDS; gid_entry_mbox++, i++) + memcpy(gid_entry_mbox->raw, + priv->port[port].gid_table.roce_gids[i].raw, + MLX4_ROCE_GID_ENTRY_SIZE); + + err = mlx4_cmd(dev, inbox->dma, in_mod & 0xffff, op_mod, + MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B, + MLX4_CMD_NATIVE); + mutex_unlock(&(priv->port[port].gid_table.mutex)); + return err; + } + + return mlx4_cmd(dev, inbox->dma, in_mod & 0xffff, op_mod, + MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B, + MLX4_CMD_NATIVE); + } + + /* For IB, we only consider: + * - The capability mask, which is set to the aggregate of all + * slave function capabilities + * - The QKey violatin counter - reset according to each request. + */ + + if (dev->flags & MLX4_FLAG_OLD_PORT_CMDS) { + reset_qkey_viols = (*(u8 *) inbox->buf) & 0x40; + new_cap_mask = ((__be32 *) inbox->buf)[2]; + } else { + reset_qkey_viols = ((u8 *) inbox->buf)[3] & 0x1; + new_cap_mask = ((__be32 *) inbox->buf)[1]; + } + + /* slave may not set the IS_SM capability for the port */ + if (slave != mlx4_master_func_num(dev) && + (be32_to_cpu(new_cap_mask) & MLX4_PORT_CAP_IS_SM)) + return -EINVAL; + + /* No DEV_MGMT in multifunc mode */ + if (mlx4_is_mfunc(dev) && + (be32_to_cpu(new_cap_mask) & MLX4_PORT_CAP_DEV_MGMT_SUP)) + return -EINVAL; + + agg_cap_mask = 0; + slave_cap_mask = + priv->mfunc.master.slave_state[slave].ib_cap_mask[port]; + priv->mfunc.master.slave_state[slave].ib_cap_mask[port] = new_cap_mask; + for (i = 0; i < dev->num_slaves; i++) + agg_cap_mask |= + priv->mfunc.master.slave_state[i].ib_cap_mask[port]; + + /* only clear mailbox for guests. Master may be setting + * MTU or PKEY table size + */ + if (slave != dev->caps.function) + memset(inbox->buf, 0, 256); + if (dev->flags & MLX4_FLAG_OLD_PORT_CMDS) { + *(u8 *) inbox->buf |= !!reset_qkey_viols << 6; + ((__be32 *) inbox->buf)[2] = agg_cap_mask; + } else { + ((u8 *) inbox->buf)[3] |= !!reset_qkey_viols; + ((__be32 *) inbox->buf)[1] = agg_cap_mask; + } + + err = mlx4_cmd(dev, inbox->dma, port, is_eth, MLX4_CMD_SET_PORT, + MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE); + if (err) + priv->mfunc.master.slave_state[slave].ib_cap_mask[port] = + slave_cap_mask; + return err; +} + +int mlx4_SET_PORT_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int port = mlx4_slave_convert_port( + dev, slave, vhcr->in_modifier & 0xFF); + + if (port < 0) + return -EINVAL; + + vhcr->in_modifier = (vhcr->in_modifier & ~0xFF) | + (port & 0xFF); + + return mlx4_common_set_port(dev, slave, vhcr->in_modifier, + vhcr->op_modifier, inbox); +} + +/* bit locations for set port command with zero op modifier */ +enum { + MLX4_SET_PORT_VL_CAP = 4, /* bits 7:4 */ + MLX4_SET_PORT_MTU_CAP = 12, /* bits 15:12 */ + MLX4_CHANGE_PORT_PKEY_TBL_SZ = 20, + MLX4_CHANGE_PORT_VL_CAP = 21, + MLX4_CHANGE_PORT_MTU_CAP = 22, +}; + +int mlx4_SET_PORT(struct mlx4_dev *dev, u8 port, int pkey_tbl_sz) +{ + struct mlx4_cmd_mailbox *mailbox; + int err, vl_cap, pkey_tbl_flag = 0; + + if (dev->caps.port_type[port] == MLX4_PORT_TYPE_ETH) + return 0; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + + ((__be32 *) mailbox->buf)[1] = dev->caps.ib_port_def_cap[port]; + + if (pkey_tbl_sz >= 0 && mlx4_is_master(dev)) { + pkey_tbl_flag = 1; + ((__be16 *) mailbox->buf)[20] = cpu_to_be16(pkey_tbl_sz); + } + + /* IB VL CAP enum isn't used by the firmware, just numerical values */ + for (vl_cap = 8; vl_cap >= 1; vl_cap >>= 1) { + ((__be32 *) mailbox->buf)[0] = cpu_to_be32( + (1 << MLX4_CHANGE_PORT_MTU_CAP) | + (1 << MLX4_CHANGE_PORT_VL_CAP) | + (pkey_tbl_flag << MLX4_CHANGE_PORT_PKEY_TBL_SZ) | + (dev->caps.port_ib_mtu[port] << MLX4_SET_PORT_MTU_CAP) | + (vl_cap << MLX4_SET_PORT_VL_CAP)); + err = mlx4_cmd(dev, mailbox->dma, port, 0, MLX4_CMD_SET_PORT, + MLX4_CMD_TIME_CLASS_B, MLX4_CMD_WRAPPED); + if (err != -ENOMEM) + break; + } + + mlx4_free_cmd_mailbox(dev, mailbox); + return err; +} + +int mlx4_SET_PORT_general(struct mlx4_dev *dev, u8 port, int mtu, + u8 pptx, u8 pfctx, u8 pprx, u8 pfcrx) +{ + struct mlx4_cmd_mailbox *mailbox; + struct mlx4_set_port_general_context *context; + int err; + u32 in_mod; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + context = mailbox->buf; + context->flags = SET_PORT_GEN_ALL_VALID; + context->mtu = cpu_to_be16(mtu); + context->pptx = (pptx * (!pfctx)) << 7; + context->pfctx = pfctx; + context->pprx = (pprx * (!pfcrx)) << 7; + context->pfcrx = pfcrx; + + in_mod = MLX4_SET_PORT_GENERAL << 8 | port; + err = mlx4_cmd(dev, mailbox->dma, in_mod, 1, MLX4_CMD_SET_PORT, + MLX4_CMD_TIME_CLASS_B, MLX4_CMD_WRAPPED); + + mlx4_free_cmd_mailbox(dev, mailbox); + return err; +} +EXPORT_SYMBOL(mlx4_SET_PORT_general); + +int mlx4_SET_PORT_qpn_calc(struct mlx4_dev *dev, u8 port, u32 base_qpn, + u8 promisc) +{ + struct mlx4_cmd_mailbox *mailbox; + struct mlx4_set_port_rqp_calc_context *context; + int err; + u32 in_mod; + u32 m_promisc = (dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_MC_STEER) ? + MCAST_DIRECT : MCAST_DEFAULT; + + if (dev->caps.steering_mode != MLX4_STEERING_MODE_A0) + return 0; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + context = mailbox->buf; + context->base_qpn = cpu_to_be32(base_qpn); + context->n_mac = dev->caps.log_num_macs; + context->promisc = cpu_to_be32(promisc << SET_PORT_PROMISC_SHIFT | + base_qpn); + context->mcast = cpu_to_be32(m_promisc << SET_PORT_MC_PROMISC_SHIFT | + base_qpn); + context->intra_no_vlan = 0; + context->no_vlan = MLX4_NO_VLAN_IDX; + context->intra_vlan_miss = 0; + context->vlan_miss = MLX4_VLAN_MISS_IDX; + + in_mod = MLX4_SET_PORT_RQP_CALC << 8 | port; + err = mlx4_cmd(dev, mailbox->dma, in_mod, 1, MLX4_CMD_SET_PORT, + MLX4_CMD_TIME_CLASS_B, MLX4_CMD_WRAPPED); + + mlx4_free_cmd_mailbox(dev, mailbox); + return err; +} +EXPORT_SYMBOL(mlx4_SET_PORT_qpn_calc); + +int mlx4_SET_PORT_PRIO2TC(struct mlx4_dev *dev, u8 port, u8 *prio2tc) +{ + struct mlx4_cmd_mailbox *mailbox; + struct mlx4_set_port_prio2tc_context *context; + int err; + u32 in_mod; + int i; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + context = mailbox->buf; + for (i = 0; i < MLX4_NUM_UP; i += 2) + context->prio2tc[i >> 1] = prio2tc[i] << 4 | prio2tc[i + 1]; + + in_mod = MLX4_SET_PORT_PRIO2TC << 8 | port; + err = mlx4_cmd(dev, mailbox->dma, in_mod, 1, MLX4_CMD_SET_PORT, + MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE); + + mlx4_free_cmd_mailbox(dev, mailbox); + return err; +} +EXPORT_SYMBOL(mlx4_SET_PORT_PRIO2TC); + +int mlx4_SET_PORT_SCHEDULER(struct mlx4_dev *dev, u8 port, u8 *tc_tx_bw, + u8 *pg, u16 *ratelimit) +{ + struct mlx4_cmd_mailbox *mailbox; + struct mlx4_set_port_scheduler_context *context; + int err; + u32 in_mod; + int i; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + context = mailbox->buf; + + for (i = 0; i < MLX4_NUM_TC; i++) { + struct mlx4_port_scheduler_tc_cfg_be *tc = &context->tc[i]; + u16 r; + + if (ratelimit && ratelimit[i]) { + if (ratelimit[i] <= MLX4_MAX_100M_UNITS_VAL) { + r = ratelimit[i]; + tc->max_bw_units = + htons(MLX4_RATELIMIT_100M_UNITS); + } else { + r = ratelimit[i]/10; + tc->max_bw_units = + htons(MLX4_RATELIMIT_1G_UNITS); + } + tc->max_bw_value = htons(r); + } else { + tc->max_bw_value = htons(MLX4_RATELIMIT_DEFAULT); + tc->max_bw_units = htons(MLX4_RATELIMIT_1G_UNITS); + } + + tc->pg = htons(pg[i]); + tc->bw_precentage = htons(tc_tx_bw[i]); + } + + in_mod = MLX4_SET_PORT_SCHEDULER << 8 | port; + err = mlx4_cmd(dev, mailbox->dma, in_mod, 1, MLX4_CMD_SET_PORT, + MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE); + + mlx4_free_cmd_mailbox(dev, mailbox); + return err; +} +EXPORT_SYMBOL(mlx4_SET_PORT_SCHEDULER); + +enum { + VXLAN_ENABLE_MODIFY = 1 << 7, + VXLAN_STEERING_MODIFY = 1 << 6, + + VXLAN_ENABLE = 1 << 7, +}; + +struct mlx4_set_port_vxlan_context { + u32 reserved1; + u8 modify_flags; + u8 reserved2; + u8 enable_flags; + u8 steering; +}; + +int mlx4_SET_PORT_VXLAN(struct mlx4_dev *dev, u8 port, u8 steering, int enable) +{ + int err; + u32 in_mod; + struct mlx4_cmd_mailbox *mailbox; + struct mlx4_set_port_vxlan_context *context; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + context = mailbox->buf; + memset(context, 0, sizeof(*context)); + + context->modify_flags = VXLAN_ENABLE_MODIFY | VXLAN_STEERING_MODIFY; + if (enable) + context->enable_flags = VXLAN_ENABLE; + context->steering = steering; + + in_mod = MLX4_SET_PORT_VXLAN << 8 | port; + err = mlx4_cmd(dev, mailbox->dma, in_mod, 1, MLX4_CMD_SET_PORT, + MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE); + + mlx4_free_cmd_mailbox(dev, mailbox); + return err; +} +EXPORT_SYMBOL(mlx4_SET_PORT_VXLAN); + +int mlx4_SET_MCAST_FLTR_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int err = 0; + + return err; +} + +int mlx4_SET_MCAST_FLTR(struct mlx4_dev *dev, u8 port, + u64 mac, u64 clear, u8 mode) +{ + return mlx4_cmd(dev, (mac | (clear << 63)), port, mode, + MLX4_CMD_SET_MCAST_FLTR, MLX4_CMD_TIME_CLASS_B, + MLX4_CMD_WRAPPED); +} +EXPORT_SYMBOL(mlx4_SET_MCAST_FLTR); + +int mlx4_SET_VLAN_FLTR_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int err = 0; + + return err; +} + +int mlx4_common_dump_eth_stats(struct mlx4_dev *dev, int slave, + u32 in_mod, struct mlx4_cmd_mailbox *outbox) +{ + return mlx4_cmd_box(dev, 0, outbox->dma, in_mod, 0, + MLX4_CMD_DUMP_ETH_STATS, MLX4_CMD_TIME_CLASS_B, + MLX4_CMD_NATIVE); +} + +int mlx4_DUMP_ETH_STATS_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + if (slave != dev->caps.function) + return 0; + return mlx4_common_dump_eth_stats(dev, slave, + vhcr->in_modifier, outbox); +} + +void mlx4_set_stats_bitmap(struct mlx4_dev *dev, u64 *stats_bitmap) +{ + if (!mlx4_is_mfunc(dev)) { + *stats_bitmap = 0; + return; + } + + *stats_bitmap = (MLX4_STATS_TRAFFIC_COUNTERS_MASK | + MLX4_STATS_TRAFFIC_DROPS_MASK | + MLX4_STATS_PORT_COUNTERS_MASK); + + if (mlx4_is_master(dev)) + *stats_bitmap |= MLX4_STATS_ERROR_COUNTERS_MASK; +} +EXPORT_SYMBOL(mlx4_set_stats_bitmap); + +int mlx4_get_slave_from_roce_gid(struct mlx4_dev *dev, int port, u8 *gid, + int *slave_id) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + int i, found_ix = -1; + int vf_gids = MLX4_ROCE_MAX_GIDS - MLX4_ROCE_PF_GIDS; + struct mlx4_slaves_pport slaves_pport; + unsigned num_vfs; + int slave_gid; + + if (!mlx4_is_mfunc(dev)) + return -EINVAL; + + slaves_pport = mlx4_phys_to_slaves_pport(dev, port); + num_vfs = bitmap_weight(slaves_pport.slaves, dev->num_vfs + 1) - 1; + + for (i = 0; i < MLX4_ROCE_MAX_GIDS; i++) { + if (!memcmp(priv->port[port].gid_table.roce_gids[i].raw, gid, + MLX4_ROCE_GID_ENTRY_SIZE)) { + found_ix = i; + break; + } + } + + if (found_ix >= 0) { + /* Calculate a slave_gid which is the slave number in the gid + * table and not a globally unique slave number. + */ + if (found_ix < MLX4_ROCE_PF_GIDS) + slave_gid = 0; + else if (found_ix < MLX4_ROCE_PF_GIDS + (vf_gids % num_vfs) * + (vf_gids / num_vfs + 1)) + slave_gid = ((found_ix - MLX4_ROCE_PF_GIDS) / + (vf_gids / num_vfs + 1)) + 1; + else + slave_gid = + ((found_ix - MLX4_ROCE_PF_GIDS - + ((vf_gids % num_vfs) * ((vf_gids / num_vfs + 1)))) / + (vf_gids / num_vfs)) + vf_gids % num_vfs + 1; + + /* Calculate the globally unique slave id */ + if (slave_gid) { + struct mlx4_active_ports exclusive_ports; + struct mlx4_active_ports actv_ports; + struct mlx4_slaves_pport slaves_pport_actv; + unsigned max_port_p_one; + int num_vfs_before = 0; + int candidate_slave_gid; + + /* Calculate how many VFs are on the previous port, if exists */ + for (i = 1; i < port; i++) { + bitmap_zero(exclusive_ports.ports, dev->caps.num_ports); + set_bit(i - 1, exclusive_ports.ports); + slaves_pport_actv = + mlx4_phys_to_slaves_pport_actv( + dev, &exclusive_ports); + num_vfs_before += bitmap_weight( + slaves_pport_actv.slaves, + dev->num_vfs + 1); + } + + /* candidate_slave_gid isn't necessarily the correct slave, but + * it has the same number of ports and is assigned to the same + * ports as the real slave we're looking for. On dual port VF, + * slave_gid = [single port VFs on port ] + + * [offset of the current slave from the first dual port VF] + + * 1 (for the PF). + */ + candidate_slave_gid = slave_gid + num_vfs_before; + + actv_ports = mlx4_get_active_ports(dev, candidate_slave_gid); + max_port_p_one = find_first_bit( + actv_ports.ports, dev->caps.num_ports) + + bitmap_weight(actv_ports.ports, + dev->caps.num_ports) + 1; + + /* Calculate the real slave number */ + for (i = 1; i < max_port_p_one; i++) { + if (i == port) + continue; + bitmap_zero(exclusive_ports.ports, + dev->caps.num_ports); + set_bit(i - 1, exclusive_ports.ports); + slaves_pport_actv = + mlx4_phys_to_slaves_pport_actv( + dev, &exclusive_ports); + slave_gid += bitmap_weight( + slaves_pport_actv.slaves, + dev->num_vfs + 1); + } + } + *slave_id = slave_gid; + } + + return (found_ix >= 0) ? 0 : -EINVAL; +} +EXPORT_SYMBOL(mlx4_get_slave_from_roce_gid); + +int mlx4_get_roce_gid_from_slave(struct mlx4_dev *dev, int port, int slave_id, + u8 *gid) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + + if (!mlx4_is_master(dev)) + return -EINVAL; + + memcpy(gid, priv->port[port].gid_table.roce_gids[slave_id].raw, + MLX4_ROCE_GID_ENTRY_SIZE); + return 0; +} +EXPORT_SYMBOL(mlx4_get_roce_gid_from_slave); diff --git a/drivers/net/ethernet/mellanox/mlx4/profile.c b/drivers/net/ethernet/mellanox/mlx4/profile.c new file mode 100644 index 0000000..14089d9 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4/profile.c @@ -0,0 +1,266 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies. All rights reserved. + * Copyright (c) 2006, 2007 Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include + +#include "mlx4.h" +#include "fw.h" + +enum { + MLX4_RES_QP, + MLX4_RES_RDMARC, + MLX4_RES_ALTC, + MLX4_RES_AUXC, + MLX4_RES_SRQ, + MLX4_RES_CQ, + MLX4_RES_EQ, + MLX4_RES_DMPT, + MLX4_RES_CMPT, + MLX4_RES_MTT, + MLX4_RES_MCG, + MLX4_RES_NUM +}; + +static const char *res_name[] = { + [MLX4_RES_QP] = "QP", + [MLX4_RES_RDMARC] = "RDMARC", + [MLX4_RES_ALTC] = "ALTC", + [MLX4_RES_AUXC] = "AUXC", + [MLX4_RES_SRQ] = "SRQ", + [MLX4_RES_CQ] = "CQ", + [MLX4_RES_EQ] = "EQ", + [MLX4_RES_DMPT] = "DMPT", + [MLX4_RES_CMPT] = "CMPT", + [MLX4_RES_MTT] = "MTT", + [MLX4_RES_MCG] = "MCG", +}; + +u64 mlx4_make_profile(struct mlx4_dev *dev, + struct mlx4_profile *request, + struct mlx4_dev_cap *dev_cap, + struct mlx4_init_hca_param *init_hca) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_resource { + u64 size; + u64 start; + int type; + u32 num; + int log_num; + }; + + u64 total_size = 0; + struct mlx4_resource *profile; + struct mlx4_resource tmp; + struct sysinfo si; + int i, j; + + profile = kcalloc(MLX4_RES_NUM, sizeof(*profile), GFP_KERNEL); + if (!profile) + return -ENOMEM; + + /* + * We want to scale the number of MTTs with the size of the + * system memory, since it makes sense to register a lot of + * memory on a system with a lot of memory. As a heuristic, + * make sure we have enough MTTs to cover twice the system + * memory (with PAGE_SIZE entries). + * + * This number has to be a power of two and fit into 32 bits + * due to device limitations, so cap this at 2^31 as well. + * That limits us to 8TB of memory registration per HCA with + * 4KB pages, which is probably OK for the next few months. + */ + si_meminfo(&si); + request->num_mtt = + roundup_pow_of_two(max_t(unsigned, request->num_mtt, + min(1UL << (31 - log_mtts_per_seg), + si.totalram >> (log_mtts_per_seg - 1)))); + + profile[MLX4_RES_QP].size = dev_cap->qpc_entry_sz; + profile[MLX4_RES_RDMARC].size = dev_cap->rdmarc_entry_sz; + profile[MLX4_RES_ALTC].size = dev_cap->altc_entry_sz; + profile[MLX4_RES_AUXC].size = dev_cap->aux_entry_sz; + profile[MLX4_RES_SRQ].size = dev_cap->srq_entry_sz; + profile[MLX4_RES_CQ].size = dev_cap->cqc_entry_sz; + profile[MLX4_RES_EQ].size = dev_cap->eqc_entry_sz; + profile[MLX4_RES_DMPT].size = dev_cap->dmpt_entry_sz; + profile[MLX4_RES_CMPT].size = dev_cap->cmpt_entry_sz; + profile[MLX4_RES_MTT].size = dev_cap->mtt_entry_sz; + profile[MLX4_RES_MCG].size = mlx4_get_mgm_entry_size(dev); + + profile[MLX4_RES_QP].num = request->num_qp; + profile[MLX4_RES_RDMARC].num = request->num_qp * request->rdmarc_per_qp; + profile[MLX4_RES_ALTC].num = request->num_qp; + profile[MLX4_RES_AUXC].num = request->num_qp; + profile[MLX4_RES_SRQ].num = request->num_srq; + profile[MLX4_RES_CQ].num = request->num_cq; + profile[MLX4_RES_EQ].num = mlx4_is_mfunc(dev) ? + dev->phys_caps.num_phys_eqs : + min_t(unsigned, dev_cap->max_eqs, MAX_MSIX); + profile[MLX4_RES_DMPT].num = request->num_mpt; + profile[MLX4_RES_CMPT].num = MLX4_NUM_CMPTS; + profile[MLX4_RES_MTT].num = request->num_mtt * (1 << log_mtts_per_seg); + profile[MLX4_RES_MCG].num = request->num_mcg; + + for (i = 0; i < MLX4_RES_NUM; ++i) { + profile[i].type = i; + profile[i].num = roundup_pow_of_two(profile[i].num); + profile[i].log_num = ilog2(profile[i].num); + profile[i].size *= profile[i].num; + profile[i].size = max(profile[i].size, (u64) PAGE_SIZE); + } + + /* + * Sort the resources in decreasing order of size. Since they + * all have sizes that are powers of 2, we'll be able to keep + * resources aligned to their size and pack them without gaps + * using the sorted order. + */ + for (i = MLX4_RES_NUM; i > 0; --i) + for (j = 1; j < i; ++j) { + if (profile[j].size > profile[j - 1].size) { + tmp = profile[j]; + profile[j] = profile[j - 1]; + profile[j - 1] = tmp; + } + } + + for (i = 0; i < MLX4_RES_NUM; ++i) { + if (profile[i].size) { + profile[i].start = total_size; + total_size += profile[i].size; + } + + if (total_size > dev_cap->max_icm_sz) { + mlx4_err(dev, "Profile requires 0x%llx bytes; won't fit in 0x%llx bytes of context memory\n", + (unsigned long long) total_size, + (unsigned long long) dev_cap->max_icm_sz); + kfree(profile); + return -ENOMEM; + } + + if (profile[i].size) + mlx4_dbg(dev, " profile[%2d] (%6s): 2^%02d entries @ 0x%10llx, size 0x%10llx\n", + i, res_name[profile[i].type], + profile[i].log_num, + (unsigned long long) profile[i].start, + (unsigned long long) profile[i].size); + } + + mlx4_dbg(dev, "HCA context memory: reserving %d KB\n", + (int) (total_size >> 10)); + + for (i = 0; i < MLX4_RES_NUM; ++i) { + switch (profile[i].type) { + case MLX4_RES_QP: + dev->caps.num_qps = profile[i].num; + init_hca->qpc_base = profile[i].start; + init_hca->log_num_qps = profile[i].log_num; + break; + case MLX4_RES_RDMARC: + for (priv->qp_table.rdmarc_shift = 0; + request->num_qp << priv->qp_table.rdmarc_shift < profile[i].num; + ++priv->qp_table.rdmarc_shift) + ; /* nothing */ + dev->caps.max_qp_dest_rdma = 1 << priv->qp_table.rdmarc_shift; + priv->qp_table.rdmarc_base = (u32) profile[i].start; + init_hca->rdmarc_base = profile[i].start; + init_hca->log_rd_per_qp = priv->qp_table.rdmarc_shift; + break; + case MLX4_RES_ALTC: + init_hca->altc_base = profile[i].start; + break; + case MLX4_RES_AUXC: + init_hca->auxc_base = profile[i].start; + break; + case MLX4_RES_SRQ: + dev->caps.num_srqs = profile[i].num; + init_hca->srqc_base = profile[i].start; + init_hca->log_num_srqs = profile[i].log_num; + break; + case MLX4_RES_CQ: + dev->caps.num_cqs = profile[i].num; + init_hca->cqc_base = profile[i].start; + init_hca->log_num_cqs = profile[i].log_num; + break; + case MLX4_RES_EQ: + dev->caps.num_eqs = roundup_pow_of_two(min_t(unsigned, dev_cap->max_eqs, + MAX_MSIX)); + init_hca->eqc_base = profile[i].start; + init_hca->log_num_eqs = ilog2(dev->caps.num_eqs); + break; + case MLX4_RES_DMPT: + dev->caps.num_mpts = profile[i].num; + priv->mr_table.mpt_base = profile[i].start; + init_hca->dmpt_base = profile[i].start; + init_hca->log_mpt_sz = profile[i].log_num; + break; + case MLX4_RES_CMPT: + init_hca->cmpt_base = profile[i].start; + break; + case MLX4_RES_MTT: + dev->caps.num_mtts = profile[i].num; + priv->mr_table.mtt_base = profile[i].start; + init_hca->mtt_base = profile[i].start; + break; + case MLX4_RES_MCG: + init_hca->mc_base = profile[i].start; + init_hca->log_mc_entry_sz = + ilog2(mlx4_get_mgm_entry_size(dev)); + init_hca->log_mc_table_sz = profile[i].log_num; + if (dev->caps.steering_mode == + MLX4_STEERING_MODE_DEVICE_MANAGED) { + dev->caps.num_mgms = profile[i].num; + } else { + init_hca->log_mc_hash_sz = + profile[i].log_num - 1; + dev->caps.num_mgms = profile[i].num >> 1; + dev->caps.num_amgms = profile[i].num >> 1; + } + break; + default: + break; + } + } + + /* + * PDs don't take any HCA memory, but we assign them as part + * of the HCA profile anyway. + */ + dev->caps.num_pds = MLX4_NUM_PDS; + + kfree(profile); + return total_size; +} diff --git a/drivers/net/ethernet/mellanox/mlx4/qp.c b/drivers/net/ethernet/mellanox/mlx4/qp.c new file mode 100644 index 0000000..2301365 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4/qp.c @@ -0,0 +1,633 @@ +/* + * Copyright (c) 2004 Topspin Communications. All rights reserved. + * Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2005, 2006, 2007, 2008 Mellanox Technologies. All rights reserved. + * Copyright (c) 2004 Voltaire, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include + +#include +#include + +#include "mlx4.h" +#include "icm.h" + +void mlx4_qp_event(struct mlx4_dev *dev, u32 qpn, int event_type) +{ + struct mlx4_qp_table *qp_table = &mlx4_priv(dev)->qp_table; + struct mlx4_qp *qp; + + spin_lock(&qp_table->lock); + + qp = __mlx4_qp_lookup(dev, qpn); + if (qp) + atomic_inc(&qp->refcount); + + spin_unlock(&qp_table->lock); + + if (!qp) { + mlx4_dbg(dev, "Async event for none existent QP %08x\n", qpn); + return; + } + + qp->event(qp, event_type); + + if (atomic_dec_and_test(&qp->refcount)) + complete(&qp->free); +} + +/* used for INIT/CLOSE port logic */ +static int is_master_qp0(struct mlx4_dev *dev, struct mlx4_qp *qp, int *real_qp0, int *proxy_qp0) +{ + /* this procedure is called after we already know we are on the master */ + /* qp0 is either the proxy qp0, or the real qp0 */ + u32 pf_proxy_offset = dev->phys_caps.base_proxy_sqpn + 8 * mlx4_master_func_num(dev); + *proxy_qp0 = qp->qpn >= pf_proxy_offset && qp->qpn <= pf_proxy_offset + 1; + + *real_qp0 = qp->qpn >= dev->phys_caps.base_sqpn && + qp->qpn <= dev->phys_caps.base_sqpn + 1; + + return *real_qp0 || *proxy_qp0; +} + +static int __mlx4_qp_modify(struct mlx4_dev *dev, struct mlx4_mtt *mtt, + enum mlx4_qp_state cur_state, enum mlx4_qp_state new_state, + struct mlx4_qp_context *context, + enum mlx4_qp_optpar optpar, + int sqd_event, struct mlx4_qp *qp, int native) +{ + static const u16 op[MLX4_QP_NUM_STATE][MLX4_QP_NUM_STATE] = { + [MLX4_QP_STATE_RST] = { + [MLX4_QP_STATE_RST] = MLX4_CMD_2RST_QP, + [MLX4_QP_STATE_ERR] = MLX4_CMD_2ERR_QP, + [MLX4_QP_STATE_INIT] = MLX4_CMD_RST2INIT_QP, + }, + [MLX4_QP_STATE_INIT] = { + [MLX4_QP_STATE_RST] = MLX4_CMD_2RST_QP, + [MLX4_QP_STATE_ERR] = MLX4_CMD_2ERR_QP, + [MLX4_QP_STATE_INIT] = MLX4_CMD_INIT2INIT_QP, + [MLX4_QP_STATE_RTR] = MLX4_CMD_INIT2RTR_QP, + }, + [MLX4_QP_STATE_RTR] = { + [MLX4_QP_STATE_RST] = MLX4_CMD_2RST_QP, + [MLX4_QP_STATE_ERR] = MLX4_CMD_2ERR_QP, + [MLX4_QP_STATE_RTS] = MLX4_CMD_RTR2RTS_QP, + }, + [MLX4_QP_STATE_RTS] = { + [MLX4_QP_STATE_RST] = MLX4_CMD_2RST_QP, + [MLX4_QP_STATE_ERR] = MLX4_CMD_2ERR_QP, + [MLX4_QP_STATE_RTS] = MLX4_CMD_RTS2RTS_QP, + [MLX4_QP_STATE_SQD] = MLX4_CMD_RTS2SQD_QP, + }, + [MLX4_QP_STATE_SQD] = { + [MLX4_QP_STATE_RST] = MLX4_CMD_2RST_QP, + [MLX4_QP_STATE_ERR] = MLX4_CMD_2ERR_QP, + [MLX4_QP_STATE_RTS] = MLX4_CMD_SQD2RTS_QP, + [MLX4_QP_STATE_SQD] = MLX4_CMD_SQD2SQD_QP, + }, + [MLX4_QP_STATE_SQER] = { + [MLX4_QP_STATE_RST] = MLX4_CMD_2RST_QP, + [MLX4_QP_STATE_ERR] = MLX4_CMD_2ERR_QP, + [MLX4_QP_STATE_RTS] = MLX4_CMD_SQERR2RTS_QP, + }, + [MLX4_QP_STATE_ERR] = { + [MLX4_QP_STATE_RST] = MLX4_CMD_2RST_QP, + [MLX4_QP_STATE_ERR] = MLX4_CMD_2ERR_QP, + } + }; + + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_cmd_mailbox *mailbox; + int ret = 0; + int real_qp0 = 0; + int proxy_qp0 = 0; + u8 port; + + if (cur_state >= MLX4_QP_NUM_STATE || new_state >= MLX4_QP_NUM_STATE || + !op[cur_state][new_state]) + return -EINVAL; + + if (op[cur_state][new_state] == MLX4_CMD_2RST_QP) { + ret = mlx4_cmd(dev, 0, qp->qpn, 2, + MLX4_CMD_2RST_QP, MLX4_CMD_TIME_CLASS_A, native); + if (mlx4_is_master(dev) && cur_state != MLX4_QP_STATE_ERR && + cur_state != MLX4_QP_STATE_RST && + is_master_qp0(dev, qp, &real_qp0, &proxy_qp0)) { + port = (qp->qpn & 1) + 1; + if (proxy_qp0) + priv->mfunc.master.qp0_state[port].proxy_qp0_active = 0; + else + priv->mfunc.master.qp0_state[port].qp0_active = 0; + } + return ret; + } + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + + if (cur_state == MLX4_QP_STATE_RST && new_state == MLX4_QP_STATE_INIT) { + u64 mtt_addr = mlx4_mtt_addr(dev, mtt); + context->mtt_base_addr_h = mtt_addr >> 32; + context->mtt_base_addr_l = cpu_to_be32(mtt_addr & 0xffffffff); + context->log_page_size = mtt->page_shift - MLX4_ICM_PAGE_SHIFT; + } + + *(__be32 *) mailbox->buf = cpu_to_be32(optpar); + memcpy(mailbox->buf + 8, context, sizeof *context); + + ((struct mlx4_qp_context *) (mailbox->buf + 8))->local_qpn = + cpu_to_be32(qp->qpn); + + ret = mlx4_cmd(dev, mailbox->dma, + qp->qpn | (!!sqd_event << 31), + new_state == MLX4_QP_STATE_RST ? 2 : 0, + op[cur_state][new_state], MLX4_CMD_TIME_CLASS_C, native); + + if (mlx4_is_master(dev) && is_master_qp0(dev, qp, &real_qp0, &proxy_qp0)) { + port = (qp->qpn & 1) + 1; + if (cur_state != MLX4_QP_STATE_ERR && + cur_state != MLX4_QP_STATE_RST && + new_state == MLX4_QP_STATE_ERR) { + if (proxy_qp0) + priv->mfunc.master.qp0_state[port].proxy_qp0_active = 0; + else + priv->mfunc.master.qp0_state[port].qp0_active = 0; + } else if (new_state == MLX4_QP_STATE_RTR) { + if (proxy_qp0) + priv->mfunc.master.qp0_state[port].proxy_qp0_active = 1; + else + priv->mfunc.master.qp0_state[port].qp0_active = 1; + } + } + + mlx4_free_cmd_mailbox(dev, mailbox); + return ret; +} + +int mlx4_qp_modify(struct mlx4_dev *dev, struct mlx4_mtt *mtt, + enum mlx4_qp_state cur_state, enum mlx4_qp_state new_state, + struct mlx4_qp_context *context, + enum mlx4_qp_optpar optpar, + int sqd_event, struct mlx4_qp *qp) +{ + return __mlx4_qp_modify(dev, mtt, cur_state, new_state, context, + optpar, sqd_event, qp, 0); +} +EXPORT_SYMBOL_GPL(mlx4_qp_modify); + +int __mlx4_qp_reserve_range(struct mlx4_dev *dev, int cnt, int align, + int *base) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_qp_table *qp_table = &priv->qp_table; + + *base = mlx4_bitmap_alloc_range(&qp_table->bitmap, cnt, align); + if (*base == -1) + return -ENOMEM; + + return 0; +} + +int mlx4_qp_reserve_range(struct mlx4_dev *dev, int cnt, int align, int *base) +{ + u64 in_param = 0; + u64 out_param; + int err; + + if (mlx4_is_mfunc(dev)) { + set_param_l(&in_param, cnt); + set_param_h(&in_param, align); + err = mlx4_cmd_imm(dev, in_param, &out_param, + RES_QP, RES_OP_RESERVE, + MLX4_CMD_ALLOC_RES, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED); + if (err) + return err; + + *base = get_param_l(&out_param); + return 0; + } + return __mlx4_qp_reserve_range(dev, cnt, align, base); +} +EXPORT_SYMBOL_GPL(mlx4_qp_reserve_range); + +void __mlx4_qp_release_range(struct mlx4_dev *dev, int base_qpn, int cnt) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_qp_table *qp_table = &priv->qp_table; + + if (mlx4_is_qp_reserved(dev, (u32) base_qpn)) + return; + mlx4_bitmap_free_range(&qp_table->bitmap, base_qpn, cnt, MLX4_USE_RR); +} + +void mlx4_qp_release_range(struct mlx4_dev *dev, int base_qpn, int cnt) +{ + u64 in_param = 0; + int err; + + if (mlx4_is_mfunc(dev)) { + set_param_l(&in_param, base_qpn); + set_param_h(&in_param, cnt); + err = mlx4_cmd(dev, in_param, RES_QP, RES_OP_RESERVE, + MLX4_CMD_FREE_RES, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED); + if (err) { + mlx4_warn(dev, "Failed to release qp range base:%d cnt:%d\n", + base_qpn, cnt); + } + } else + __mlx4_qp_release_range(dev, base_qpn, cnt); +} +EXPORT_SYMBOL_GPL(mlx4_qp_release_range); + +int __mlx4_qp_alloc_icm(struct mlx4_dev *dev, int qpn, gfp_t gfp) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_qp_table *qp_table = &priv->qp_table; + int err; + + err = mlx4_table_get(dev, &qp_table->qp_table, qpn, gfp); + if (err) + goto err_out; + + err = mlx4_table_get(dev, &qp_table->auxc_table, qpn, gfp); + if (err) + goto err_put_qp; + + err = mlx4_table_get(dev, &qp_table->altc_table, qpn, gfp); + if (err) + goto err_put_auxc; + + err = mlx4_table_get(dev, &qp_table->rdmarc_table, qpn, gfp); + if (err) + goto err_put_altc; + + err = mlx4_table_get(dev, &qp_table->cmpt_table, qpn, gfp); + if (err) + goto err_put_rdmarc; + + return 0; + +err_put_rdmarc: + mlx4_table_put(dev, &qp_table->rdmarc_table, qpn); + +err_put_altc: + mlx4_table_put(dev, &qp_table->altc_table, qpn); + +err_put_auxc: + mlx4_table_put(dev, &qp_table->auxc_table, qpn); + +err_put_qp: + mlx4_table_put(dev, &qp_table->qp_table, qpn); + +err_out: + return err; +} + +static int mlx4_qp_alloc_icm(struct mlx4_dev *dev, int qpn, gfp_t gfp) +{ + u64 param = 0; + + if (mlx4_is_mfunc(dev)) { + set_param_l(¶m, qpn); + return mlx4_cmd_imm(dev, param, ¶m, RES_QP, RES_OP_MAP_ICM, + MLX4_CMD_ALLOC_RES, MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_WRAPPED); + } + return __mlx4_qp_alloc_icm(dev, qpn, gfp); +} + +void __mlx4_qp_free_icm(struct mlx4_dev *dev, int qpn) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_qp_table *qp_table = &priv->qp_table; + + mlx4_table_put(dev, &qp_table->cmpt_table, qpn); + mlx4_table_put(dev, &qp_table->rdmarc_table, qpn); + mlx4_table_put(dev, &qp_table->altc_table, qpn); + mlx4_table_put(dev, &qp_table->auxc_table, qpn); + mlx4_table_put(dev, &qp_table->qp_table, qpn); +} + +static void mlx4_qp_free_icm(struct mlx4_dev *dev, int qpn) +{ + u64 in_param = 0; + + if (mlx4_is_mfunc(dev)) { + set_param_l(&in_param, qpn); + if (mlx4_cmd(dev, in_param, RES_QP, RES_OP_MAP_ICM, + MLX4_CMD_FREE_RES, MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_WRAPPED)) + mlx4_warn(dev, "Failed to free icm of qp:%d\n", qpn); + } else + __mlx4_qp_free_icm(dev, qpn); +} + +int mlx4_qp_alloc(struct mlx4_dev *dev, int qpn, struct mlx4_qp *qp, gfp_t gfp) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_qp_table *qp_table = &priv->qp_table; + int err; + + if (!qpn) + return -EINVAL; + + qp->qpn = qpn; + + err = mlx4_qp_alloc_icm(dev, qpn, gfp); + if (err) + return err; + + spin_lock_irq(&qp_table->lock); + err = radix_tree_insert(&dev->qp_table_tree, qp->qpn & + (dev->caps.num_qps - 1), qp); + spin_unlock_irq(&qp_table->lock); + if (err) + goto err_icm; + + atomic_set(&qp->refcount, 1); + init_completion(&qp->free); + + return 0; + +err_icm: + mlx4_qp_free_icm(dev, qpn); + return err; +} + +EXPORT_SYMBOL_GPL(mlx4_qp_alloc); + +#define MLX4_UPDATE_QP_SUPPORTED_ATTRS MLX4_UPDATE_QP_SMAC +int mlx4_update_qp(struct mlx4_dev *dev, u32 qpn, + enum mlx4_update_qp_attr attr, + struct mlx4_update_qp_params *params) +{ + struct mlx4_cmd_mailbox *mailbox; + struct mlx4_update_qp_context *cmd; + u64 pri_addr_path_mask = 0; + u64 qp_mask = 0; + int err = 0; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + + cmd = (struct mlx4_update_qp_context *)mailbox->buf; + + if (!attr || (attr & ~MLX4_UPDATE_QP_SUPPORTED_ATTRS)) + return -EINVAL; + + if (attr & MLX4_UPDATE_QP_SMAC) { + pri_addr_path_mask |= 1ULL << MLX4_UPD_QP_PATH_MASK_MAC_INDEX; + cmd->qp_context.pri_path.grh_mylmc = params->smac_index; + } + + if (attr & MLX4_UPDATE_QP_VSD) { + qp_mask |= 1ULL << MLX4_UPD_QP_MASK_VSD; + if (params->flags & MLX4_UPDATE_QP_PARAMS_FLAGS_VSD_ENABLE) + cmd->qp_context.param3 |= cpu_to_be32(MLX4_STRIP_VLAN); + } + + cmd->primary_addr_path_mask = cpu_to_be64(pri_addr_path_mask); + cmd->qp_mask = cpu_to_be64(qp_mask); + + err = mlx4_cmd(dev, mailbox->dma, qpn & 0xffffff, 0, + MLX4_CMD_UPDATE_QP, MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_NATIVE); + + mlx4_free_cmd_mailbox(dev, mailbox); + return err; +} +EXPORT_SYMBOL_GPL(mlx4_update_qp); + +void mlx4_qp_remove(struct mlx4_dev *dev, struct mlx4_qp *qp) +{ + struct mlx4_qp_table *qp_table = &mlx4_priv(dev)->qp_table; + unsigned long flags; + + spin_lock_irqsave(&qp_table->lock, flags); + radix_tree_delete(&dev->qp_table_tree, qp->qpn & (dev->caps.num_qps - 1)); + spin_unlock_irqrestore(&qp_table->lock, flags); +} +EXPORT_SYMBOL_GPL(mlx4_qp_remove); + +void mlx4_qp_free(struct mlx4_dev *dev, struct mlx4_qp *qp) +{ + if (atomic_dec_and_test(&qp->refcount)) + complete(&qp->free); + wait_for_completion(&qp->free); + + mlx4_qp_free_icm(dev, qp->qpn); +} +EXPORT_SYMBOL_GPL(mlx4_qp_free); + +static int mlx4_CONF_SPECIAL_QP(struct mlx4_dev *dev, u32 base_qpn) +{ + return mlx4_cmd(dev, 0, base_qpn, 0, MLX4_CMD_CONF_SPECIAL_QP, + MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE); +} + +int mlx4_init_qp_table(struct mlx4_dev *dev) +{ + struct mlx4_qp_table *qp_table = &mlx4_priv(dev)->qp_table; + int err; + int reserved_from_top = 0; + int k; + + spin_lock_init(&qp_table->lock); + INIT_RADIX_TREE(&dev->qp_table_tree, GFP_ATOMIC); + if (mlx4_is_slave(dev)) + return 0; + + /* + * We reserve 2 extra QPs per port for the special QPs. The + * block of special QPs must be aligned to a multiple of 8, so + * round up. + * + * We also reserve the MSB of the 24-bit QP number to indicate + * that a QP is an XRC QP. + */ + dev->phys_caps.base_sqpn = + ALIGN(dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW], 8); + + { + int sort[MLX4_NUM_QP_REGION]; + int i, j, tmp; + int last_base = dev->caps.num_qps; + + for (i = 1; i < MLX4_NUM_QP_REGION; ++i) + sort[i] = i; + + for (i = MLX4_NUM_QP_REGION; i > 0; --i) { + for (j = 2; j < i; ++j) { + if (dev->caps.reserved_qps_cnt[sort[j]] > + dev->caps.reserved_qps_cnt[sort[j - 1]]) { + tmp = sort[j]; + sort[j] = sort[j - 1]; + sort[j - 1] = tmp; + } + } + } + + for (i = 1; i < MLX4_NUM_QP_REGION; ++i) { + last_base -= dev->caps.reserved_qps_cnt[sort[i]]; + dev->caps.reserved_qps_base[sort[i]] = last_base; + reserved_from_top += + dev->caps.reserved_qps_cnt[sort[i]]; + } + + } + + /* Reserve 8 real SQPs in both native and SRIOV modes. + * In addition, in SRIOV mode, reserve 8 proxy SQPs per function + * (for all PFs and VFs), and 8 corresponding tunnel QPs. + * Each proxy SQP works opposite its own tunnel QP. + * + * The QPs are arranged as follows: + * a. 8 real SQPs + * b. All the proxy SQPs (8 per function) + * c. All the tunnel QPs (8 per function) + */ + + err = mlx4_bitmap_init(&qp_table->bitmap, dev->caps.num_qps, + (1 << 23) - 1, mlx4_num_reserved_sqps(dev), + reserved_from_top); + if (err) + return err; + + if (mlx4_is_mfunc(dev)) { + /* for PPF use */ + dev->phys_caps.base_proxy_sqpn = dev->phys_caps.base_sqpn + 8; + dev->phys_caps.base_tunnel_sqpn = dev->phys_caps.base_sqpn + 8 + 8 * MLX4_MFUNC_MAX; + + /* In mfunc, calculate proxy and tunnel qp offsets for the PF here, + * since the PF does not call mlx4_slave_caps */ + dev->caps.qp0_tunnel = kcalloc(dev->caps.num_ports, sizeof (u32), GFP_KERNEL); + dev->caps.qp0_proxy = kcalloc(dev->caps.num_ports, sizeof (u32), GFP_KERNEL); + dev->caps.qp1_tunnel = kcalloc(dev->caps.num_ports, sizeof (u32), GFP_KERNEL); + dev->caps.qp1_proxy = kcalloc(dev->caps.num_ports, sizeof (u32), GFP_KERNEL); + + if (!dev->caps.qp0_tunnel || !dev->caps.qp0_proxy || + !dev->caps.qp1_tunnel || !dev->caps.qp1_proxy) { + err = -ENOMEM; + goto err_mem; + } + + for (k = 0; k < dev->caps.num_ports; k++) { + dev->caps.qp0_proxy[k] = dev->phys_caps.base_proxy_sqpn + + 8 * mlx4_master_func_num(dev) + k; + dev->caps.qp0_tunnel[k] = dev->caps.qp0_proxy[k] + 8 * MLX4_MFUNC_MAX; + dev->caps.qp1_proxy[k] = dev->phys_caps.base_proxy_sqpn + + 8 * mlx4_master_func_num(dev) + MLX4_MAX_PORTS + k; + dev->caps.qp1_tunnel[k] = dev->caps.qp1_proxy[k] + 8 * MLX4_MFUNC_MAX; + } + } + + + err = mlx4_CONF_SPECIAL_QP(dev, dev->phys_caps.base_sqpn); + if (err) + goto err_mem; + return 0; + +err_mem: + kfree(dev->caps.qp0_tunnel); + kfree(dev->caps.qp0_proxy); + kfree(dev->caps.qp1_tunnel); + kfree(dev->caps.qp1_proxy); + dev->caps.qp0_tunnel = dev->caps.qp0_proxy = + dev->caps.qp1_tunnel = dev->caps.qp1_proxy = NULL; + return err; +} + +void mlx4_cleanup_qp_table(struct mlx4_dev *dev) +{ + if (mlx4_is_slave(dev)) + return; + + mlx4_CONF_SPECIAL_QP(dev, 0); + mlx4_bitmap_cleanup(&mlx4_priv(dev)->qp_table.bitmap); +} + +int mlx4_qp_query(struct mlx4_dev *dev, struct mlx4_qp *qp, + struct mlx4_qp_context *context) +{ + struct mlx4_cmd_mailbox *mailbox; + int err; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + + err = mlx4_cmd_box(dev, 0, mailbox->dma, qp->qpn, 0, + MLX4_CMD_QUERY_QP, MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_WRAPPED); + if (!err) + memcpy(context, mailbox->buf + 8, sizeof *context); + + mlx4_free_cmd_mailbox(dev, mailbox); + return err; +} +EXPORT_SYMBOL_GPL(mlx4_qp_query); + +int mlx4_qp_to_ready(struct mlx4_dev *dev, struct mlx4_mtt *mtt, + struct mlx4_qp_context *context, + struct mlx4_qp *qp, enum mlx4_qp_state *qp_state) +{ + int err; + int i; + enum mlx4_qp_state states[] = { + MLX4_QP_STATE_RST, + MLX4_QP_STATE_INIT, + MLX4_QP_STATE_RTR, + MLX4_QP_STATE_RTS + }; + + for (i = 0; i < ARRAY_SIZE(states) - 1; i++) { + context->flags &= cpu_to_be32(~(0xf << 28)); + context->flags |= cpu_to_be32(states[i + 1] << 28); + err = mlx4_qp_modify(dev, mtt, states[i], states[i + 1], + context, 0, 0, qp); + if (err) { + mlx4_err(dev, "Failed to bring QP to state: %d with error: %d\n", + states[i + 1], err); + return err; + } + + *qp_state = states[i + 1]; + } + + return 0; +} +EXPORT_SYMBOL_GPL(mlx4_qp_to_ready); diff --git a/drivers/net/ethernet/mellanox/mlx4/reset.c b/drivers/net/ethernet/mellanox/mlx4/reset.c new file mode 100644 index 0000000..ea1c6d0 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4/reset.c @@ -0,0 +1,179 @@ +/* + * Copyright (c) 2006, 2007 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include + +#include "mlx4.h" + +int mlx4_reset(struct mlx4_dev *dev) +{ + void __iomem *reset; + u32 *hca_header = NULL; + int pcie_cap; + u16 devctl; + u16 linkctl; + u16 vendor; + unsigned long end; + u32 sem; + int i; + int err = 0; + +#define MLX4_RESET_BASE 0xf0000 +#define MLX4_RESET_SIZE 0x400 +#define MLX4_SEM_OFFSET 0x3fc +#define MLX4_RESET_OFFSET 0x10 +#define MLX4_RESET_VALUE swab32(1) + +#define MLX4_SEM_TIMEOUT_JIFFIES (10 * HZ) +#define MLX4_RESET_TIMEOUT_JIFFIES (2 * HZ) + + /* + * Reset the chip. This is somewhat ugly because we have to + * save off the PCI header before reset and then restore it + * after the chip reboots. We skip config space offsets 22 + * and 23 since those have a special meaning. + */ + + /* Do we need to save off the full 4K PCI Express header?? */ + hca_header = kmalloc(256, GFP_KERNEL); + if (!hca_header) { + err = -ENOMEM; + mlx4_err(dev, "Couldn't allocate memory to save HCA PCI header, aborting\n"); + goto out; + } + + pcie_cap = pci_pcie_cap(dev->pdev); + + for (i = 0; i < 64; ++i) { + if (i == 22 || i == 23) + continue; + if (pci_read_config_dword(dev->pdev, i * 4, hca_header + i)) { + err = -ENODEV; + mlx4_err(dev, "Couldn't save HCA PCI header, aborting\n"); + goto out; + } + } + + reset = ioremap(pci_resource_start(dev->pdev, 0) + MLX4_RESET_BASE, + MLX4_RESET_SIZE); + if (!reset) { + err = -ENOMEM; + mlx4_err(dev, "Couldn't map HCA reset register, aborting\n"); + goto out; + } + + /* grab HW semaphore to lock out flash updates */ + end = jiffies + MLX4_SEM_TIMEOUT_JIFFIES; + do { + sem = readl(reset + MLX4_SEM_OFFSET); + if (!sem) + break; + + msleep(1); + } while (time_before(jiffies, end)); + + if (sem) { + mlx4_err(dev, "Failed to obtain HW semaphore, aborting\n"); + err = -EAGAIN; + iounmap(reset); + goto out; + } + + /* actually hit reset */ + writel(MLX4_RESET_VALUE, reset + MLX4_RESET_OFFSET); + iounmap(reset); + + /* Docs say to wait one second before accessing device */ + msleep(1000); + + end = jiffies + MLX4_RESET_TIMEOUT_JIFFIES; + do { + if (!pci_read_config_word(dev->pdev, PCI_VENDOR_ID, &vendor) && + vendor != 0xffff) + break; + + msleep(1); + } while (time_before(jiffies, end)); + + if (vendor == 0xffff) { + err = -ENODEV; + mlx4_err(dev, "PCI device did not come back after reset, aborting\n"); + goto out; + } + + /* Now restore the PCI headers */ + if (pcie_cap) { + devctl = hca_header[(pcie_cap + PCI_EXP_DEVCTL) / 4]; + if (pcie_capability_write_word(dev->pdev, PCI_EXP_DEVCTL, + devctl)) { + err = -ENODEV; + mlx4_err(dev, "Couldn't restore HCA PCI Express Device Control register, aborting\n"); + goto out; + } + linkctl = hca_header[(pcie_cap + PCI_EXP_LNKCTL) / 4]; + if (pcie_capability_write_word(dev->pdev, PCI_EXP_LNKCTL, + linkctl)) { + err = -ENODEV; + mlx4_err(dev, "Couldn't restore HCA PCI Express Link control register, aborting\n"); + goto out; + } + } + + for (i = 0; i < 16; ++i) { + if (i * 4 == PCI_COMMAND) + continue; + + if (pci_write_config_dword(dev->pdev, i * 4, hca_header[i])) { + err = -ENODEV; + mlx4_err(dev, "Couldn't restore HCA reg %x, aborting\n", + i); + goto out; + } + } + + if (pci_write_config_dword(dev->pdev, PCI_COMMAND, + hca_header[PCI_COMMAND / 4])) { + err = -ENODEV; + mlx4_err(dev, "Couldn't restore HCA COMMAND, aborting\n"); + goto out; + } + +out: + kfree(hca_header); + + return err; +} diff --git a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c new file mode 100644 index 0000000..cd5cf6d --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c @@ -0,0 +1,4916 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005, 2006, 2007, 2008 Mellanox Technologies. + * All rights reserved. + * Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "mlx4.h" +#include "fw.h" + +#define MLX4_MAC_VALID (1ull << 63) + +struct mac_res { + struct list_head list; + u64 mac; + int ref_count; + u8 smac_index; + u8 port; +}; + +struct vlan_res { + struct list_head list; + u16 vlan; + int ref_count; + int vlan_index; + u8 port; +}; + +struct res_common { + struct list_head list; + struct rb_node node; + u64 res_id; + int owner; + int state; + int from_state; + int to_state; + int removing; +}; + +enum { + RES_ANY_BUSY = 1 +}; + +struct res_gid { + struct list_head list; + u8 gid[16]; + enum mlx4_protocol prot; + enum mlx4_steer_type steer; + u64 reg_id; +}; + +enum res_qp_states { + RES_QP_BUSY = RES_ANY_BUSY, + + /* QP number was allocated */ + RES_QP_RESERVED, + + /* ICM memory for QP context was mapped */ + RES_QP_MAPPED, + + /* QP is in hw ownership */ + RES_QP_HW +}; + +struct res_qp { + struct res_common com; + struct res_mtt *mtt; + struct res_cq *rcq; + struct res_cq *scq; + struct res_srq *srq; + struct list_head mcg_list; + spinlock_t mcg_spl; + int local_qpn; + atomic_t ref_count; + u32 qpc_flags; + /* saved qp params before VST enforcement in order to restore on VGT */ + u8 sched_queue; + __be32 param3; + u8 vlan_control; + u8 fvl_rx; + u8 pri_path_fl; + u8 vlan_index; + u8 feup; +}; + +enum res_mtt_states { + RES_MTT_BUSY = RES_ANY_BUSY, + RES_MTT_ALLOCATED, +}; + +static inline const char *mtt_states_str(enum res_mtt_states state) +{ + switch (state) { + case RES_MTT_BUSY: return "RES_MTT_BUSY"; + case RES_MTT_ALLOCATED: return "RES_MTT_ALLOCATED"; + default: return "Unknown"; + } +} + +struct res_mtt { + struct res_common com; + int order; + atomic_t ref_count; +}; + +enum res_mpt_states { + RES_MPT_BUSY = RES_ANY_BUSY, + RES_MPT_RESERVED, + RES_MPT_MAPPED, + RES_MPT_HW, +}; + +struct res_mpt { + struct res_common com; + struct res_mtt *mtt; + int key; +}; + +enum res_eq_states { + RES_EQ_BUSY = RES_ANY_BUSY, + RES_EQ_RESERVED, + RES_EQ_HW, +}; + +struct res_eq { + struct res_common com; + struct res_mtt *mtt; +}; + +enum res_cq_states { + RES_CQ_BUSY = RES_ANY_BUSY, + RES_CQ_ALLOCATED, + RES_CQ_HW, +}; + +struct res_cq { + struct res_common com; + struct res_mtt *mtt; + atomic_t ref_count; +}; + +enum res_srq_states { + RES_SRQ_BUSY = RES_ANY_BUSY, + RES_SRQ_ALLOCATED, + RES_SRQ_HW, +}; + +struct res_srq { + struct res_common com; + struct res_mtt *mtt; + struct res_cq *cq; + atomic_t ref_count; +}; + +enum res_counter_states { + RES_COUNTER_BUSY = RES_ANY_BUSY, + RES_COUNTER_ALLOCATED, +}; + +struct res_counter { + struct res_common com; + int port; +}; + +enum res_xrcdn_states { + RES_XRCD_BUSY = RES_ANY_BUSY, + RES_XRCD_ALLOCATED, +}; + +struct res_xrcdn { + struct res_common com; + int port; +}; + +enum res_fs_rule_states { + RES_FS_RULE_BUSY = RES_ANY_BUSY, + RES_FS_RULE_ALLOCATED, +}; + +struct res_fs_rule { + struct res_common com; + int qpn; +}; + +static int mlx4_is_eth(struct mlx4_dev *dev, int port) +{ + return dev->caps.port_mask[port] == MLX4_PORT_TYPE_IB ? 0 : 1; +} + +static void *res_tracker_lookup(struct rb_root *root, u64 res_id) +{ + struct rb_node *node = root->rb_node; + + while (node) { + struct res_common *res = container_of(node, struct res_common, + node); + + if (res_id < res->res_id) + node = node->rb_left; + else if (res_id > res->res_id) + node = node->rb_right; + else + return res; + } + return NULL; +} + +static int res_tracker_insert(struct rb_root *root, struct res_common *res) +{ + struct rb_node **new = &(root->rb_node), *parent = NULL; + + /* Figure out where to put new node */ + while (*new) { + struct res_common *this = container_of(*new, struct res_common, + node); + + parent = *new; + if (res->res_id < this->res_id) + new = &((*new)->rb_left); + else if (res->res_id > this->res_id) + new = &((*new)->rb_right); + else + return -EEXIST; + } + + /* Add new node and rebalance tree. */ + rb_link_node(&res->node, parent, new); + rb_insert_color(&res->node, root); + + return 0; +} + +enum qp_transition { + QP_TRANS_INIT2RTR, + QP_TRANS_RTR2RTS, + QP_TRANS_RTS2RTS, + QP_TRANS_SQERR2RTS, + QP_TRANS_SQD2SQD, + QP_TRANS_SQD2RTS +}; + +/* For Debug uses */ +static const char *resource_str(enum mlx4_resource rt) +{ + switch (rt) { + case RES_QP: return "RES_QP"; + case RES_CQ: return "RES_CQ"; + case RES_SRQ: return "RES_SRQ"; + case RES_MPT: return "RES_MPT"; + case RES_MTT: return "RES_MTT"; + case RES_MAC: return "RES_MAC"; + case RES_VLAN: return "RES_VLAN"; + case RES_EQ: return "RES_EQ"; + case RES_COUNTER: return "RES_COUNTER"; + case RES_FS_RULE: return "RES_FS_RULE"; + case RES_XRCD: return "RES_XRCD"; + default: return "Unknown resource type !!!"; + }; +} + +static void rem_slave_vlans(struct mlx4_dev *dev, int slave); +static inline int mlx4_grant_resource(struct mlx4_dev *dev, int slave, + enum mlx4_resource res_type, int count, + int port) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct resource_allocator *res_alloc = + &priv->mfunc.master.res_tracker.res_alloc[res_type]; + int err = -EINVAL; + int allocated, free, reserved, guaranteed, from_free; + int from_rsvd; + + if (slave > dev->num_vfs) + return -EINVAL; + + spin_lock(&res_alloc->alloc_lock); + allocated = (port > 0) ? + res_alloc->allocated[(port - 1) * (dev->num_vfs + 1) + slave] : + res_alloc->allocated[slave]; + free = (port > 0) ? res_alloc->res_port_free[port - 1] : + res_alloc->res_free; + reserved = (port > 0) ? res_alloc->res_port_rsvd[port - 1] : + res_alloc->res_reserved; + guaranteed = res_alloc->guaranteed[slave]; + + if (allocated + count > res_alloc->quota[slave]) { + mlx4_warn(dev, "VF %d port %d res %s: quota exceeded, count %d alloc %d quota %d\n", + slave, port, resource_str(res_type), count, + allocated, res_alloc->quota[slave]); + goto out; + } + + if (allocated + count <= guaranteed) { + err = 0; + from_rsvd = count; + } else { + /* portion may need to be obtained from free area */ + if (guaranteed - allocated > 0) + from_free = count - (guaranteed - allocated); + else + from_free = count; + + from_rsvd = count - from_free; + + if (free - from_free >= reserved) + err = 0; + else + mlx4_warn(dev, "VF %d port %d res %s: free pool empty, free %d from_free %d rsvd %d\n", + slave, port, resource_str(res_type), free, + from_free, reserved); + } + + if (!err) { + /* grant the request */ + if (port > 0) { + res_alloc->allocated[(port - 1) * (dev->num_vfs + 1) + slave] += count; + res_alloc->res_port_free[port - 1] -= count; + res_alloc->res_port_rsvd[port - 1] -= from_rsvd; + } else { + res_alloc->allocated[slave] += count; + res_alloc->res_free -= count; + res_alloc->res_reserved -= from_rsvd; + } + } + +out: + spin_unlock(&res_alloc->alloc_lock); + return err; +} + +static inline void mlx4_release_resource(struct mlx4_dev *dev, int slave, + enum mlx4_resource res_type, int count, + int port) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct resource_allocator *res_alloc = + &priv->mfunc.master.res_tracker.res_alloc[res_type]; + int allocated, guaranteed, from_rsvd; + + if (slave > dev->num_vfs) + return; + + spin_lock(&res_alloc->alloc_lock); + + allocated = (port > 0) ? + res_alloc->allocated[(port - 1) * (dev->num_vfs + 1) + slave] : + res_alloc->allocated[slave]; + guaranteed = res_alloc->guaranteed[slave]; + + if (allocated - count >= guaranteed) { + from_rsvd = 0; + } else { + /* portion may need to be returned to reserved area */ + if (allocated - guaranteed > 0) + from_rsvd = count - (allocated - guaranteed); + else + from_rsvd = count; + } + + if (port > 0) { + res_alloc->allocated[(port - 1) * (dev->num_vfs + 1) + slave] -= count; + res_alloc->res_port_free[port - 1] += count; + res_alloc->res_port_rsvd[port - 1] += from_rsvd; + } else { + res_alloc->allocated[slave] -= count; + res_alloc->res_free += count; + res_alloc->res_reserved += from_rsvd; + } + + spin_unlock(&res_alloc->alloc_lock); + return; +} + +static inline void initialize_res_quotas(struct mlx4_dev *dev, + struct resource_allocator *res_alloc, + enum mlx4_resource res_type, + int vf, int num_instances) +{ + res_alloc->guaranteed[vf] = num_instances / (2 * (dev->num_vfs + 1)); + res_alloc->quota[vf] = (num_instances / 2) + res_alloc->guaranteed[vf]; + if (vf == mlx4_master_func_num(dev)) { + res_alloc->res_free = num_instances; + if (res_type == RES_MTT) { + /* reserved mtts will be taken out of the PF allocation */ + res_alloc->res_free += dev->caps.reserved_mtts; + res_alloc->guaranteed[vf] += dev->caps.reserved_mtts; + res_alloc->quota[vf] += dev->caps.reserved_mtts; + } + } +} + +void mlx4_init_quotas(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + int pf; + + /* quotas for VFs are initialized in mlx4_slave_cap */ + if (mlx4_is_slave(dev)) + return; + + if (!mlx4_is_mfunc(dev)) { + dev->quotas.qp = dev->caps.num_qps - dev->caps.reserved_qps - + mlx4_num_reserved_sqps(dev); + dev->quotas.cq = dev->caps.num_cqs - dev->caps.reserved_cqs; + dev->quotas.srq = dev->caps.num_srqs - dev->caps.reserved_srqs; + dev->quotas.mtt = dev->caps.num_mtts - dev->caps.reserved_mtts; + dev->quotas.mpt = dev->caps.num_mpts - dev->caps.reserved_mrws; + return; + } + + pf = mlx4_master_func_num(dev); + dev->quotas.qp = + priv->mfunc.master.res_tracker.res_alloc[RES_QP].quota[pf]; + dev->quotas.cq = + priv->mfunc.master.res_tracker.res_alloc[RES_CQ].quota[pf]; + dev->quotas.srq = + priv->mfunc.master.res_tracker.res_alloc[RES_SRQ].quota[pf]; + dev->quotas.mtt = + priv->mfunc.master.res_tracker.res_alloc[RES_MTT].quota[pf]; + dev->quotas.mpt = + priv->mfunc.master.res_tracker.res_alloc[RES_MPT].quota[pf]; +} +int mlx4_init_resource_tracker(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + int i, j; + int t; + + priv->mfunc.master.res_tracker.slave_list = + kzalloc(dev->num_slaves * sizeof(struct slave_list), + GFP_KERNEL); + if (!priv->mfunc.master.res_tracker.slave_list) + return -ENOMEM; + + for (i = 0 ; i < dev->num_slaves; i++) { + for (t = 0; t < MLX4_NUM_OF_RESOURCE_TYPE; ++t) + INIT_LIST_HEAD(&priv->mfunc.master.res_tracker. + slave_list[i].res_list[t]); + mutex_init(&priv->mfunc.master.res_tracker.slave_list[i].mutex); + } + + mlx4_dbg(dev, "Started init_resource_tracker: %ld slaves\n", + dev->num_slaves); + for (i = 0 ; i < MLX4_NUM_OF_RESOURCE_TYPE; i++) + priv->mfunc.master.res_tracker.res_tree[i] = RB_ROOT; + + for (i = 0; i < MLX4_NUM_OF_RESOURCE_TYPE; i++) { + struct resource_allocator *res_alloc = + &priv->mfunc.master.res_tracker.res_alloc[i]; + res_alloc->quota = kmalloc((dev->num_vfs + 1) * sizeof(int), GFP_KERNEL); + res_alloc->guaranteed = kmalloc((dev->num_vfs + 1) * sizeof(int), GFP_KERNEL); + if (i == RES_MAC || i == RES_VLAN) + res_alloc->allocated = kzalloc(MLX4_MAX_PORTS * + (dev->num_vfs + 1) * sizeof(int), + GFP_KERNEL); + else + res_alloc->allocated = kzalloc((dev->num_vfs + 1) * sizeof(int), GFP_KERNEL); + + if (!res_alloc->quota || !res_alloc->guaranteed || + !res_alloc->allocated) + goto no_mem_err; + + spin_lock_init(&res_alloc->alloc_lock); + for (t = 0; t < dev->num_vfs + 1; t++) { + struct mlx4_active_ports actv_ports = + mlx4_get_active_ports(dev, t); + switch (i) { + case RES_QP: + initialize_res_quotas(dev, res_alloc, RES_QP, + t, dev->caps.num_qps - + dev->caps.reserved_qps - + mlx4_num_reserved_sqps(dev)); + break; + case RES_CQ: + initialize_res_quotas(dev, res_alloc, RES_CQ, + t, dev->caps.num_cqs - + dev->caps.reserved_cqs); + break; + case RES_SRQ: + initialize_res_quotas(dev, res_alloc, RES_SRQ, + t, dev->caps.num_srqs - + dev->caps.reserved_srqs); + break; + case RES_MPT: + initialize_res_quotas(dev, res_alloc, RES_MPT, + t, dev->caps.num_mpts - + dev->caps.reserved_mrws); + break; + case RES_MTT: + initialize_res_quotas(dev, res_alloc, RES_MTT, + t, dev->caps.num_mtts - + dev->caps.reserved_mtts); + break; + case RES_MAC: + if (t == mlx4_master_func_num(dev)) { + int max_vfs_pport = 0; + /* Calculate the max vfs per port for */ + /* both ports. */ + for (j = 0; j < dev->caps.num_ports; + j++) { + struct mlx4_slaves_pport slaves_pport = + mlx4_phys_to_slaves_pport(dev, j + 1); + unsigned current_slaves = + bitmap_weight(slaves_pport.slaves, + dev->caps.num_ports) - 1; + if (max_vfs_pport < current_slaves) + max_vfs_pport = + current_slaves; + } + res_alloc->quota[t] = + MLX4_MAX_MAC_NUM - + 2 * max_vfs_pport; + res_alloc->guaranteed[t] = 2; + for (j = 0; j < MLX4_MAX_PORTS; j++) + res_alloc->res_port_free[j] = + MLX4_MAX_MAC_NUM; + } else { + res_alloc->quota[t] = MLX4_MAX_MAC_NUM; + res_alloc->guaranteed[t] = 2; + } + break; + case RES_VLAN: + if (t == mlx4_master_func_num(dev)) { + res_alloc->quota[t] = MLX4_MAX_VLAN_NUM; + res_alloc->guaranteed[t] = MLX4_MAX_VLAN_NUM / 2; + for (j = 0; j < MLX4_MAX_PORTS; j++) + res_alloc->res_port_free[j] = + res_alloc->quota[t]; + } else { + res_alloc->quota[t] = MLX4_MAX_VLAN_NUM / 2; + res_alloc->guaranteed[t] = 0; + } + break; + case RES_COUNTER: + res_alloc->quota[t] = dev->caps.max_counters; + res_alloc->guaranteed[t] = 0; + if (t == mlx4_master_func_num(dev)) + res_alloc->res_free = res_alloc->quota[t]; + break; + default: + break; + } + if (i == RES_MAC || i == RES_VLAN) { + for (j = 0; j < dev->caps.num_ports; j++) + if (test_bit(j, actv_ports.ports)) + res_alloc->res_port_rsvd[j] += + res_alloc->guaranteed[t]; + } else { + res_alloc->res_reserved += res_alloc->guaranteed[t]; + } + } + } + spin_lock_init(&priv->mfunc.master.res_tracker.lock); + return 0; + +no_mem_err: + for (i = 0; i < MLX4_NUM_OF_RESOURCE_TYPE; i++) { + kfree(priv->mfunc.master.res_tracker.res_alloc[i].allocated); + priv->mfunc.master.res_tracker.res_alloc[i].allocated = NULL; + kfree(priv->mfunc.master.res_tracker.res_alloc[i].guaranteed); + priv->mfunc.master.res_tracker.res_alloc[i].guaranteed = NULL; + kfree(priv->mfunc.master.res_tracker.res_alloc[i].quota); + priv->mfunc.master.res_tracker.res_alloc[i].quota = NULL; + } + return -ENOMEM; +} + +void mlx4_free_resource_tracker(struct mlx4_dev *dev, + enum mlx4_res_tracker_free_type type) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + int i; + + if (priv->mfunc.master.res_tracker.slave_list) { + if (type != RES_TR_FREE_STRUCTS_ONLY) { + for (i = 0; i < dev->num_slaves; i++) { + if (type == RES_TR_FREE_ALL || + dev->caps.function != i) + mlx4_delete_all_resources_for_slave(dev, i); + } + /* free master's vlans */ + i = dev->caps.function; + mlx4_reset_roce_gids(dev, i); + mutex_lock(&priv->mfunc.master.res_tracker.slave_list[i].mutex); + rem_slave_vlans(dev, i); + mutex_unlock(&priv->mfunc.master.res_tracker.slave_list[i].mutex); + } + + if (type != RES_TR_FREE_SLAVES_ONLY) { + for (i = 0; i < MLX4_NUM_OF_RESOURCE_TYPE; i++) { + kfree(priv->mfunc.master.res_tracker.res_alloc[i].allocated); + priv->mfunc.master.res_tracker.res_alloc[i].allocated = NULL; + kfree(priv->mfunc.master.res_tracker.res_alloc[i].guaranteed); + priv->mfunc.master.res_tracker.res_alloc[i].guaranteed = NULL; + kfree(priv->mfunc.master.res_tracker.res_alloc[i].quota); + priv->mfunc.master.res_tracker.res_alloc[i].quota = NULL; + } + kfree(priv->mfunc.master.res_tracker.slave_list); + priv->mfunc.master.res_tracker.slave_list = NULL; + } + } +} + +static void update_pkey_index(struct mlx4_dev *dev, int slave, + struct mlx4_cmd_mailbox *inbox) +{ + u8 sched = *(u8 *)(inbox->buf + 64); + u8 orig_index = *(u8 *)(inbox->buf + 35); + u8 new_index; + struct mlx4_priv *priv = mlx4_priv(dev); + int port; + + port = (sched >> 6 & 1) + 1; + + new_index = priv->virt2phys_pkey[slave][port - 1][orig_index]; + *(u8 *)(inbox->buf + 35) = new_index; +} + +static void update_gid(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *inbox, + u8 slave) +{ + struct mlx4_qp_context *qp_ctx = inbox->buf + 8; + enum mlx4_qp_optpar optpar = be32_to_cpu(*(__be32 *) inbox->buf); + u32 ts = (be32_to_cpu(qp_ctx->flags) >> 16) & 0xff; + int port; + + if (MLX4_QP_ST_UD == ts) { + port = (qp_ctx->pri_path.sched_queue >> 6 & 1) + 1; + if (mlx4_is_eth(dev, port)) + qp_ctx->pri_path.mgid_index = + mlx4_get_base_gid_ix(dev, slave, port) | 0x80; + else + qp_ctx->pri_path.mgid_index = slave | 0x80; + + } else if (MLX4_QP_ST_RC == ts || MLX4_QP_ST_XRC == ts || MLX4_QP_ST_UC == ts) { + if (optpar & MLX4_QP_OPTPAR_PRIMARY_ADDR_PATH) { + port = (qp_ctx->pri_path.sched_queue >> 6 & 1) + 1; + if (mlx4_is_eth(dev, port)) { + qp_ctx->pri_path.mgid_index += + mlx4_get_base_gid_ix(dev, slave, port); + qp_ctx->pri_path.mgid_index &= 0x7f; + } else { + qp_ctx->pri_path.mgid_index = slave & 0x7F; + } + } + if (optpar & MLX4_QP_OPTPAR_ALT_ADDR_PATH) { + port = (qp_ctx->alt_path.sched_queue >> 6 & 1) + 1; + if (mlx4_is_eth(dev, port)) { + qp_ctx->alt_path.mgid_index += + mlx4_get_base_gid_ix(dev, slave, port); + qp_ctx->alt_path.mgid_index &= 0x7f; + } else { + qp_ctx->alt_path.mgid_index = slave & 0x7F; + } + } + } +} + +static int update_vport_qp_param(struct mlx4_dev *dev, + struct mlx4_cmd_mailbox *inbox, + u8 slave, u32 qpn) +{ + struct mlx4_qp_context *qpc = inbox->buf + 8; + struct mlx4_vport_oper_state *vp_oper; + struct mlx4_priv *priv; + u32 qp_type; + int port; + + port = (qpc->pri_path.sched_queue & 0x40) ? 2 : 1; + priv = mlx4_priv(dev); + vp_oper = &priv->mfunc.master.vf_oper[slave].vport[port]; + qp_type = (be32_to_cpu(qpc->flags) >> 16) & 0xff; + + if (MLX4_VGT != vp_oper->state.default_vlan) { + /* the reserved QPs (special, proxy, tunnel) + * do not operate over vlans + */ + if (mlx4_is_qp_reserved(dev, qpn)) + return 0; + + /* force strip vlan by clear vsd, MLX QP refers to Raw Ethernet */ + if (qp_type == MLX4_QP_ST_UD || + (qp_type == MLX4_QP_ST_MLX && mlx4_is_eth(dev, port))) { + if (dev->caps.bmme_flags & MLX4_BMME_FLAG_VSD_INIT2RTR) { + *(__be32 *)inbox->buf = + cpu_to_be32(be32_to_cpu(*(__be32 *)inbox->buf) | + MLX4_QP_OPTPAR_VLAN_STRIPPING); + qpc->param3 &= ~cpu_to_be32(MLX4_STRIP_VLAN); + } else { + struct mlx4_update_qp_params params = {.flags = 0}; + + mlx4_update_qp(dev, qpn, MLX4_UPDATE_QP_VSD, ¶ms); + } + } + + if (vp_oper->state.link_state == IFLA_VF_LINK_STATE_DISABLE && + dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_UPDATE_QP) { + qpc->pri_path.vlan_control = + MLX4_VLAN_CTRL_ETH_TX_BLOCK_TAGGED | + MLX4_VLAN_CTRL_ETH_TX_BLOCK_PRIO_TAGGED | + MLX4_VLAN_CTRL_ETH_TX_BLOCK_UNTAGGED | + MLX4_VLAN_CTRL_ETH_RX_BLOCK_PRIO_TAGGED | + MLX4_VLAN_CTRL_ETH_RX_BLOCK_UNTAGGED | + MLX4_VLAN_CTRL_ETH_RX_BLOCK_TAGGED; + } else if (0 != vp_oper->state.default_vlan) { + qpc->pri_path.vlan_control = + MLX4_VLAN_CTRL_ETH_TX_BLOCK_TAGGED | + MLX4_VLAN_CTRL_ETH_RX_BLOCK_PRIO_TAGGED | + MLX4_VLAN_CTRL_ETH_RX_BLOCK_UNTAGGED; + } else { /* priority tagged */ + qpc->pri_path.vlan_control = + MLX4_VLAN_CTRL_ETH_TX_BLOCK_TAGGED | + MLX4_VLAN_CTRL_ETH_RX_BLOCK_TAGGED; + } + + qpc->pri_path.fvl_rx |= MLX4_FVL_RX_FORCE_ETH_VLAN; + qpc->pri_path.vlan_index = vp_oper->vlan_idx; + qpc->pri_path.fl |= MLX4_FL_CV | MLX4_FL_ETH_HIDE_CQE_VLAN; + qpc->pri_path.feup |= MLX4_FEUP_FORCE_ETH_UP | MLX4_FVL_FORCE_ETH_VLAN; + qpc->pri_path.sched_queue &= 0xC7; + qpc->pri_path.sched_queue |= (vp_oper->state.default_qos) << 3; + } + if (vp_oper->state.spoofchk) { + qpc->pri_path.feup |= MLX4_FSM_FORCE_ETH_SRC_MAC; + qpc->pri_path.grh_mylmc = (0x80 & qpc->pri_path.grh_mylmc) + vp_oper->mac_idx; + } + return 0; +} + +static int mpt_mask(struct mlx4_dev *dev) +{ + return dev->caps.num_mpts - 1; +} + +static void *find_res(struct mlx4_dev *dev, u64 res_id, + enum mlx4_resource type) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + + return res_tracker_lookup(&priv->mfunc.master.res_tracker.res_tree[type], + res_id); +} + +static int get_res(struct mlx4_dev *dev, int slave, u64 res_id, + enum mlx4_resource type, + void *res) +{ + struct res_common *r; + int err = 0; + + spin_lock_irq(mlx4_tlock(dev)); + r = find_res(dev, res_id, type); + if (!r) { + err = -ENONET; + goto exit; + } + + if (r->state == RES_ANY_BUSY) { + err = -EBUSY; + goto exit; + } + + if (r->owner != slave) { + err = -EPERM; + goto exit; + } + + r->from_state = r->state; + r->state = RES_ANY_BUSY; + + if (res) + *((struct res_common **)res) = r; + +exit: + spin_unlock_irq(mlx4_tlock(dev)); + return err; +} + +int mlx4_get_slave_from_resource_id(struct mlx4_dev *dev, + enum mlx4_resource type, + u64 res_id, int *slave) +{ + + struct res_common *r; + int err = -ENOENT; + int id = res_id; + + if (type == RES_QP) + id &= 0x7fffff; + spin_lock(mlx4_tlock(dev)); + + r = find_res(dev, id, type); + if (r) { + *slave = r->owner; + err = 0; + } + spin_unlock(mlx4_tlock(dev)); + + return err; +} + +static void put_res(struct mlx4_dev *dev, int slave, u64 res_id, + enum mlx4_resource type) +{ + struct res_common *r; + + spin_lock_irq(mlx4_tlock(dev)); + r = find_res(dev, res_id, type); + if (r) + r->state = r->from_state; + spin_unlock_irq(mlx4_tlock(dev)); +} + +static struct res_common *alloc_qp_tr(int id) +{ + struct res_qp *ret; + + ret = kzalloc(sizeof *ret, GFP_KERNEL); + if (!ret) + return NULL; + + ret->com.res_id = id; + ret->com.state = RES_QP_RESERVED; + ret->local_qpn = id; + INIT_LIST_HEAD(&ret->mcg_list); + spin_lock_init(&ret->mcg_spl); + atomic_set(&ret->ref_count, 0); + + return &ret->com; +} + +static struct res_common *alloc_mtt_tr(int id, int order) +{ + struct res_mtt *ret; + + ret = kzalloc(sizeof *ret, GFP_KERNEL); + if (!ret) + return NULL; + + ret->com.res_id = id; + ret->order = order; + ret->com.state = RES_MTT_ALLOCATED; + atomic_set(&ret->ref_count, 0); + + return &ret->com; +} + +static struct res_common *alloc_mpt_tr(int id, int key) +{ + struct res_mpt *ret; + + ret = kzalloc(sizeof *ret, GFP_KERNEL); + if (!ret) + return NULL; + + ret->com.res_id = id; + ret->com.state = RES_MPT_RESERVED; + ret->key = key; + + return &ret->com; +} + +static struct res_common *alloc_eq_tr(int id) +{ + struct res_eq *ret; + + ret = kzalloc(sizeof *ret, GFP_KERNEL); + if (!ret) + return NULL; + + ret->com.res_id = id; + ret->com.state = RES_EQ_RESERVED; + + return &ret->com; +} + +static struct res_common *alloc_cq_tr(int id) +{ + struct res_cq *ret; + + ret = kzalloc(sizeof *ret, GFP_KERNEL); + if (!ret) + return NULL; + + ret->com.res_id = id; + ret->com.state = RES_CQ_ALLOCATED; + atomic_set(&ret->ref_count, 0); + + return &ret->com; +} + +static struct res_common *alloc_srq_tr(int id) +{ + struct res_srq *ret; + + ret = kzalloc(sizeof *ret, GFP_KERNEL); + if (!ret) + return NULL; + + ret->com.res_id = id; + ret->com.state = RES_SRQ_ALLOCATED; + atomic_set(&ret->ref_count, 0); + + return &ret->com; +} + +static struct res_common *alloc_counter_tr(int id) +{ + struct res_counter *ret; + + ret = kzalloc(sizeof *ret, GFP_KERNEL); + if (!ret) + return NULL; + + ret->com.res_id = id; + ret->com.state = RES_COUNTER_ALLOCATED; + + return &ret->com; +} + +static struct res_common *alloc_xrcdn_tr(int id) +{ + struct res_xrcdn *ret; + + ret = kzalloc(sizeof *ret, GFP_KERNEL); + if (!ret) + return NULL; + + ret->com.res_id = id; + ret->com.state = RES_XRCD_ALLOCATED; + + return &ret->com; +} + +static struct res_common *alloc_fs_rule_tr(u64 id, int qpn) +{ + struct res_fs_rule *ret; + + ret = kzalloc(sizeof *ret, GFP_KERNEL); + if (!ret) + return NULL; + + ret->com.res_id = id; + ret->com.state = RES_FS_RULE_ALLOCATED; + ret->qpn = qpn; + return &ret->com; +} + +static struct res_common *alloc_tr(u64 id, enum mlx4_resource type, int slave, + int extra) +{ + struct res_common *ret; + + switch (type) { + case RES_QP: + ret = alloc_qp_tr(id); + break; + case RES_MPT: + ret = alloc_mpt_tr(id, extra); + break; + case RES_MTT: + ret = alloc_mtt_tr(id, extra); + break; + case RES_EQ: + ret = alloc_eq_tr(id); + break; + case RES_CQ: + ret = alloc_cq_tr(id); + break; + case RES_SRQ: + ret = alloc_srq_tr(id); + break; + case RES_MAC: + pr_err("implementation missing\n"); + return NULL; + case RES_COUNTER: + ret = alloc_counter_tr(id); + break; + case RES_XRCD: + ret = alloc_xrcdn_tr(id); + break; + case RES_FS_RULE: + ret = alloc_fs_rule_tr(id, extra); + break; + default: + return NULL; + } + if (ret) + ret->owner = slave; + + return ret; +} + +static int add_res_range(struct mlx4_dev *dev, int slave, u64 base, int count, + enum mlx4_resource type, int extra) +{ + int i; + int err; + struct mlx4_priv *priv = mlx4_priv(dev); + struct res_common **res_arr; + struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker; + struct rb_root *root = &tracker->res_tree[type]; + + res_arr = kzalloc(count * sizeof *res_arr, GFP_KERNEL); + if (!res_arr) + return -ENOMEM; + + for (i = 0; i < count; ++i) { + res_arr[i] = alloc_tr(base + i, type, slave, extra); + if (!res_arr[i]) { + for (--i; i >= 0; --i) + kfree(res_arr[i]); + + kfree(res_arr); + return -ENOMEM; + } + } + + spin_lock_irq(mlx4_tlock(dev)); + for (i = 0; i < count; ++i) { + if (find_res(dev, base + i, type)) { + err = -EEXIST; + goto undo; + } + err = res_tracker_insert(root, res_arr[i]); + if (err) + goto undo; + list_add_tail(&res_arr[i]->list, + &tracker->slave_list[slave].res_list[type]); + } + spin_unlock_irq(mlx4_tlock(dev)); + kfree(res_arr); + + return 0; + +undo: + for (--i; i >= base; --i) + rb_erase(&res_arr[i]->node, root); + + spin_unlock_irq(mlx4_tlock(dev)); + + for (i = 0; i < count; ++i) + kfree(res_arr[i]); + + kfree(res_arr); + + return err; +} + +static int remove_qp_ok(struct res_qp *res) +{ + if (res->com.state == RES_QP_BUSY || atomic_read(&res->ref_count) || + !list_empty(&res->mcg_list)) { + pr_err("resource tracker: fail to remove qp, state %d, ref_count %d\n", + res->com.state, atomic_read(&res->ref_count)); + return -EBUSY; + } else if (res->com.state != RES_QP_RESERVED) { + return -EPERM; + } + + return 0; +} + +static int remove_mtt_ok(struct res_mtt *res, int order) +{ + if (res->com.state == RES_MTT_BUSY || + atomic_read(&res->ref_count)) { + pr_devel("%s-%d: state %s, ref_count %d\n", + __func__, __LINE__, + mtt_states_str(res->com.state), + atomic_read(&res->ref_count)); + return -EBUSY; + } else if (res->com.state != RES_MTT_ALLOCATED) + return -EPERM; + else if (res->order != order) + return -EINVAL; + + return 0; +} + +static int remove_mpt_ok(struct res_mpt *res) +{ + if (res->com.state == RES_MPT_BUSY) + return -EBUSY; + else if (res->com.state != RES_MPT_RESERVED) + return -EPERM; + + return 0; +} + +static int remove_eq_ok(struct res_eq *res) +{ + if (res->com.state == RES_MPT_BUSY) + return -EBUSY; + else if (res->com.state != RES_MPT_RESERVED) + return -EPERM; + + return 0; +} + +static int remove_counter_ok(struct res_counter *res) +{ + if (res->com.state == RES_COUNTER_BUSY) + return -EBUSY; + else if (res->com.state != RES_COUNTER_ALLOCATED) + return -EPERM; + + return 0; +} + +static int remove_xrcdn_ok(struct res_xrcdn *res) +{ + if (res->com.state == RES_XRCD_BUSY) + return -EBUSY; + else if (res->com.state != RES_XRCD_ALLOCATED) + return -EPERM; + + return 0; +} + +static int remove_fs_rule_ok(struct res_fs_rule *res) +{ + if (res->com.state == RES_FS_RULE_BUSY) + return -EBUSY; + else if (res->com.state != RES_FS_RULE_ALLOCATED) + return -EPERM; + + return 0; +} + +static int remove_cq_ok(struct res_cq *res) +{ + if (res->com.state == RES_CQ_BUSY) + return -EBUSY; + else if (res->com.state != RES_CQ_ALLOCATED) + return -EPERM; + + return 0; +} + +static int remove_srq_ok(struct res_srq *res) +{ + if (res->com.state == RES_SRQ_BUSY) + return -EBUSY; + else if (res->com.state != RES_SRQ_ALLOCATED) + return -EPERM; + + return 0; +} + +static int remove_ok(struct res_common *res, enum mlx4_resource type, int extra) +{ + switch (type) { + case RES_QP: + return remove_qp_ok((struct res_qp *)res); + case RES_CQ: + return remove_cq_ok((struct res_cq *)res); + case RES_SRQ: + return remove_srq_ok((struct res_srq *)res); + case RES_MPT: + return remove_mpt_ok((struct res_mpt *)res); + case RES_MTT: + return remove_mtt_ok((struct res_mtt *)res, extra); + case RES_MAC: + return -ENOSYS; + case RES_EQ: + return remove_eq_ok((struct res_eq *)res); + case RES_COUNTER: + return remove_counter_ok((struct res_counter *)res); + case RES_XRCD: + return remove_xrcdn_ok((struct res_xrcdn *)res); + case RES_FS_RULE: + return remove_fs_rule_ok((struct res_fs_rule *)res); + default: + return -EINVAL; + } +} + +static int rem_res_range(struct mlx4_dev *dev, int slave, u64 base, int count, + enum mlx4_resource type, int extra) +{ + u64 i; + int err; + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker; + struct res_common *r; + + spin_lock_irq(mlx4_tlock(dev)); + for (i = base; i < base + count; ++i) { + r = res_tracker_lookup(&tracker->res_tree[type], i); + if (!r) { + err = -ENOENT; + goto out; + } + if (r->owner != slave) { + err = -EPERM; + goto out; + } + err = remove_ok(r, type, extra); + if (err) + goto out; + } + + for (i = base; i < base + count; ++i) { + r = res_tracker_lookup(&tracker->res_tree[type], i); + rb_erase(&r->node, &tracker->res_tree[type]); + list_del(&r->list); + kfree(r); + } + err = 0; + +out: + spin_unlock_irq(mlx4_tlock(dev)); + + return err; +} + +static int qp_res_start_move_to(struct mlx4_dev *dev, int slave, int qpn, + enum res_qp_states state, struct res_qp **qp, + int alloc) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker; + struct res_qp *r; + int err = 0; + + spin_lock_irq(mlx4_tlock(dev)); + r = res_tracker_lookup(&tracker->res_tree[RES_QP], qpn); + if (!r) + err = -ENOENT; + else if (r->com.owner != slave) + err = -EPERM; + else { + switch (state) { + case RES_QP_BUSY: + mlx4_dbg(dev, "%s: failed RES_QP, 0x%llx\n", + __func__, r->com.res_id); + err = -EBUSY; + break; + + case RES_QP_RESERVED: + if (r->com.state == RES_QP_MAPPED && !alloc) + break; + + mlx4_dbg(dev, "failed RES_QP, 0x%llx\n", r->com.res_id); + err = -EINVAL; + break; + + case RES_QP_MAPPED: + if ((r->com.state == RES_QP_RESERVED && alloc) || + r->com.state == RES_QP_HW) + break; + else { + mlx4_dbg(dev, "failed RES_QP, 0x%llx\n", + r->com.res_id); + err = -EINVAL; + } + + break; + + case RES_QP_HW: + if (r->com.state != RES_QP_MAPPED) + err = -EINVAL; + break; + default: + err = -EINVAL; + } + + if (!err) { + r->com.from_state = r->com.state; + r->com.to_state = state; + r->com.state = RES_QP_BUSY; + if (qp) + *qp = r; + } + } + + spin_unlock_irq(mlx4_tlock(dev)); + + return err; +} + +static int mr_res_start_move_to(struct mlx4_dev *dev, int slave, int index, + enum res_mpt_states state, struct res_mpt **mpt) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker; + struct res_mpt *r; + int err = 0; + + spin_lock_irq(mlx4_tlock(dev)); + r = res_tracker_lookup(&tracker->res_tree[RES_MPT], index); + if (!r) + err = -ENOENT; + else if (r->com.owner != slave) + err = -EPERM; + else { + switch (state) { + case RES_MPT_BUSY: + err = -EINVAL; + break; + + case RES_MPT_RESERVED: + if (r->com.state != RES_MPT_MAPPED) + err = -EINVAL; + break; + + case RES_MPT_MAPPED: + if (r->com.state != RES_MPT_RESERVED && + r->com.state != RES_MPT_HW) + err = -EINVAL; + break; + + case RES_MPT_HW: + if (r->com.state != RES_MPT_MAPPED) + err = -EINVAL; + break; + default: + err = -EINVAL; + } + + if (!err) { + r->com.from_state = r->com.state; + r->com.to_state = state; + r->com.state = RES_MPT_BUSY; + if (mpt) + *mpt = r; + } + } + + spin_unlock_irq(mlx4_tlock(dev)); + + return err; +} + +static int eq_res_start_move_to(struct mlx4_dev *dev, int slave, int index, + enum res_eq_states state, struct res_eq **eq) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker; + struct res_eq *r; + int err = 0; + + spin_lock_irq(mlx4_tlock(dev)); + r = res_tracker_lookup(&tracker->res_tree[RES_EQ], index); + if (!r) + err = -ENOENT; + else if (r->com.owner != slave) + err = -EPERM; + else { + switch (state) { + case RES_EQ_BUSY: + err = -EINVAL; + break; + + case RES_EQ_RESERVED: + if (r->com.state != RES_EQ_HW) + err = -EINVAL; + break; + + case RES_EQ_HW: + if (r->com.state != RES_EQ_RESERVED) + err = -EINVAL; + break; + + default: + err = -EINVAL; + } + + if (!err) { + r->com.from_state = r->com.state; + r->com.to_state = state; + r->com.state = RES_EQ_BUSY; + if (eq) + *eq = r; + } + } + + spin_unlock_irq(mlx4_tlock(dev)); + + return err; +} + +static int cq_res_start_move_to(struct mlx4_dev *dev, int slave, int cqn, + enum res_cq_states state, struct res_cq **cq) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker; + struct res_cq *r; + int err; + + spin_lock_irq(mlx4_tlock(dev)); + r = res_tracker_lookup(&tracker->res_tree[RES_CQ], cqn); + if (!r) { + err = -ENOENT; + } else if (r->com.owner != slave) { + err = -EPERM; + } else if (state == RES_CQ_ALLOCATED) { + if (r->com.state != RES_CQ_HW) + err = -EINVAL; + else if (atomic_read(&r->ref_count)) + err = -EBUSY; + else + err = 0; + } else if (state != RES_CQ_HW || r->com.state != RES_CQ_ALLOCATED) { + err = -EINVAL; + } else { + err = 0; + } + + if (!err) { + r->com.from_state = r->com.state; + r->com.to_state = state; + r->com.state = RES_CQ_BUSY; + if (cq) + *cq = r; + } + + spin_unlock_irq(mlx4_tlock(dev)); + + return err; +} + +static int srq_res_start_move_to(struct mlx4_dev *dev, int slave, int index, + enum res_srq_states state, struct res_srq **srq) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker; + struct res_srq *r; + int err = 0; + + spin_lock_irq(mlx4_tlock(dev)); + r = res_tracker_lookup(&tracker->res_tree[RES_SRQ], index); + if (!r) { + err = -ENOENT; + } else if (r->com.owner != slave) { + err = -EPERM; + } else if (state == RES_SRQ_ALLOCATED) { + if (r->com.state != RES_SRQ_HW) + err = -EINVAL; + else if (atomic_read(&r->ref_count)) + err = -EBUSY; + } else if (state != RES_SRQ_HW || r->com.state != RES_SRQ_ALLOCATED) { + err = -EINVAL; + } + + if (!err) { + r->com.from_state = r->com.state; + r->com.to_state = state; + r->com.state = RES_SRQ_BUSY; + if (srq) + *srq = r; + } + + spin_unlock_irq(mlx4_tlock(dev)); + + return err; +} + +static void res_abort_move(struct mlx4_dev *dev, int slave, + enum mlx4_resource type, int id) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker; + struct res_common *r; + + spin_lock_irq(mlx4_tlock(dev)); + r = res_tracker_lookup(&tracker->res_tree[type], id); + if (r && (r->owner == slave)) + r->state = r->from_state; + spin_unlock_irq(mlx4_tlock(dev)); +} + +static void res_end_move(struct mlx4_dev *dev, int slave, + enum mlx4_resource type, int id) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker; + struct res_common *r; + + spin_lock_irq(mlx4_tlock(dev)); + r = res_tracker_lookup(&tracker->res_tree[type], id); + if (r && (r->owner == slave)) + r->state = r->to_state; + spin_unlock_irq(mlx4_tlock(dev)); +} + +static int valid_reserved(struct mlx4_dev *dev, int slave, int qpn) +{ + return mlx4_is_qp_reserved(dev, qpn) && + (mlx4_is_master(dev) || mlx4_is_guest_proxy(dev, slave, qpn)); +} + +static int fw_reserved(struct mlx4_dev *dev, int qpn) +{ + return qpn < dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW]; +} + +static int qp_alloc_res(struct mlx4_dev *dev, int slave, int op, int cmd, + u64 in_param, u64 *out_param) +{ + int err; + int count; + int align; + int base; + int qpn; + + switch (op) { + case RES_OP_RESERVE: + count = get_param_l(&in_param) & 0xffffff; + align = get_param_h(&in_param); + err = mlx4_grant_resource(dev, slave, RES_QP, count, 0); + if (err) + return err; + + err = __mlx4_qp_reserve_range(dev, count, align, &base); + if (err) { + mlx4_release_resource(dev, slave, RES_QP, count, 0); + return err; + } + + err = add_res_range(dev, slave, base, count, RES_QP, 0); + if (err) { + mlx4_release_resource(dev, slave, RES_QP, count, 0); + __mlx4_qp_release_range(dev, base, count); + return err; + } + set_param_l(out_param, base); + break; + case RES_OP_MAP_ICM: + qpn = get_param_l(&in_param) & 0x7fffff; + if (valid_reserved(dev, slave, qpn)) { + err = add_res_range(dev, slave, qpn, 1, RES_QP, 0); + if (err) + return err; + } + + err = qp_res_start_move_to(dev, slave, qpn, RES_QP_MAPPED, + NULL, 1); + if (err) + return err; + + if (!fw_reserved(dev, qpn)) { + err = __mlx4_qp_alloc_icm(dev, qpn, GFP_KERNEL); + if (err) { + res_abort_move(dev, slave, RES_QP, qpn); + return err; + } + } + + res_end_move(dev, slave, RES_QP, qpn); + break; + + default: + err = -EINVAL; + break; + } + return err; +} + +static int mtt_alloc_res(struct mlx4_dev *dev, int slave, int op, int cmd, + u64 in_param, u64 *out_param) +{ + int err = -EINVAL; + int base; + int order; + + if (op != RES_OP_RESERVE_AND_MAP) + return err; + + order = get_param_l(&in_param); + + err = mlx4_grant_resource(dev, slave, RES_MTT, 1 << order, 0); + if (err) + return err; + + base = __mlx4_alloc_mtt_range(dev, order); + if (base == -1) { + mlx4_release_resource(dev, slave, RES_MTT, 1 << order, 0); + return -ENOMEM; + } + + err = add_res_range(dev, slave, base, 1, RES_MTT, order); + if (err) { + mlx4_release_resource(dev, slave, RES_MTT, 1 << order, 0); + __mlx4_free_mtt_range(dev, base, order); + } else { + set_param_l(out_param, base); + } + + return err; +} + +static int mpt_alloc_res(struct mlx4_dev *dev, int slave, int op, int cmd, + u64 in_param, u64 *out_param) +{ + int err = -EINVAL; + int index; + int id; + struct res_mpt *mpt; + + switch (op) { + case RES_OP_RESERVE: + err = mlx4_grant_resource(dev, slave, RES_MPT, 1, 0); + if (err) + break; + + index = __mlx4_mpt_reserve(dev); + if (index == -1) { + mlx4_release_resource(dev, slave, RES_MPT, 1, 0); + break; + } + id = index & mpt_mask(dev); + + err = add_res_range(dev, slave, id, 1, RES_MPT, index); + if (err) { + mlx4_release_resource(dev, slave, RES_MPT, 1, 0); + __mlx4_mpt_release(dev, index); + break; + } + set_param_l(out_param, index); + break; + case RES_OP_MAP_ICM: + index = get_param_l(&in_param); + id = index & mpt_mask(dev); + err = mr_res_start_move_to(dev, slave, id, + RES_MPT_MAPPED, &mpt); + if (err) + return err; + + err = __mlx4_mpt_alloc_icm(dev, mpt->key, GFP_KERNEL); + if (err) { + res_abort_move(dev, slave, RES_MPT, id); + return err; + } + + res_end_move(dev, slave, RES_MPT, id); + break; + } + return err; +} + +static int cq_alloc_res(struct mlx4_dev *dev, int slave, int op, int cmd, + u64 in_param, u64 *out_param) +{ + int cqn; + int err; + + switch (op) { + case RES_OP_RESERVE_AND_MAP: + err = mlx4_grant_resource(dev, slave, RES_CQ, 1, 0); + if (err) + break; + + err = __mlx4_cq_alloc_icm(dev, &cqn); + if (err) { + mlx4_release_resource(dev, slave, RES_CQ, 1, 0); + break; + } + + err = add_res_range(dev, slave, cqn, 1, RES_CQ, 0); + if (err) { + mlx4_release_resource(dev, slave, RES_CQ, 1, 0); + __mlx4_cq_free_icm(dev, cqn); + break; + } + + set_param_l(out_param, cqn); + break; + + default: + err = -EINVAL; + } + + return err; +} + +static int srq_alloc_res(struct mlx4_dev *dev, int slave, int op, int cmd, + u64 in_param, u64 *out_param) +{ + int srqn; + int err; + + switch (op) { + case RES_OP_RESERVE_AND_MAP: + err = mlx4_grant_resource(dev, slave, RES_SRQ, 1, 0); + if (err) + break; + + err = __mlx4_srq_alloc_icm(dev, &srqn); + if (err) { + mlx4_release_resource(dev, slave, RES_SRQ, 1, 0); + break; + } + + err = add_res_range(dev, slave, srqn, 1, RES_SRQ, 0); + if (err) { + mlx4_release_resource(dev, slave, RES_SRQ, 1, 0); + __mlx4_srq_free_icm(dev, srqn); + break; + } + + set_param_l(out_param, srqn); + break; + + default: + err = -EINVAL; + } + + return err; +} + +static int mac_find_smac_ix_in_slave(struct mlx4_dev *dev, int slave, int port, + u8 smac_index, u64 *mac) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker; + struct list_head *mac_list = + &tracker->slave_list[slave].res_list[RES_MAC]; + struct mac_res *res, *tmp; + + list_for_each_entry_safe(res, tmp, mac_list, list) { + if (res->smac_index == smac_index && res->port == (u8) port) { + *mac = res->mac; + return 0; + } + } + return -ENOENT; +} + +static int mac_add_to_slave(struct mlx4_dev *dev, int slave, u64 mac, int port, u8 smac_index) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker; + struct list_head *mac_list = + &tracker->slave_list[slave].res_list[RES_MAC]; + struct mac_res *res, *tmp; + + list_for_each_entry_safe(res, tmp, mac_list, list) { + if (res->mac == mac && res->port == (u8) port) { + /* mac found. update ref count */ + ++res->ref_count; + return 0; + } + } + + if (mlx4_grant_resource(dev, slave, RES_MAC, 1, port)) + return -EINVAL; + res = kzalloc(sizeof *res, GFP_KERNEL); + if (!res) { + mlx4_release_resource(dev, slave, RES_MAC, 1, port); + return -ENOMEM; + } + res->mac = mac; + res->port = (u8) port; + res->smac_index = smac_index; + res->ref_count = 1; + list_add_tail(&res->list, + &tracker->slave_list[slave].res_list[RES_MAC]); + return 0; +} + +static void mac_del_from_slave(struct mlx4_dev *dev, int slave, u64 mac, + int port) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker; + struct list_head *mac_list = + &tracker->slave_list[slave].res_list[RES_MAC]; + struct mac_res *res, *tmp; + + list_for_each_entry_safe(res, tmp, mac_list, list) { + if (res->mac == mac && res->port == (u8) port) { + if (!--res->ref_count) { + list_del(&res->list); + mlx4_release_resource(dev, slave, RES_MAC, 1, port); + kfree(res); + } + break; + } + } +} + +static void rem_slave_macs(struct mlx4_dev *dev, int slave) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker; + struct list_head *mac_list = + &tracker->slave_list[slave].res_list[RES_MAC]; + struct mac_res *res, *tmp; + int i; + + list_for_each_entry_safe(res, tmp, mac_list, list) { + list_del(&res->list); + /* dereference the mac the num times the slave referenced it */ + for (i = 0; i < res->ref_count; i++) + __mlx4_unregister_mac(dev, res->port, res->mac); + mlx4_release_resource(dev, slave, RES_MAC, 1, res->port); + kfree(res); + } +} + +static int mac_alloc_res(struct mlx4_dev *dev, int slave, int op, int cmd, + u64 in_param, u64 *out_param, int in_port) +{ + int err = -EINVAL; + int port; + u64 mac; + u8 smac_index; + + if (op != RES_OP_RESERVE_AND_MAP) + return err; + + port = !in_port ? get_param_l(out_param) : in_port; + port = mlx4_slave_convert_port( + dev, slave, port); + + if (port < 0) + return -EINVAL; + mac = in_param; + + err = __mlx4_register_mac(dev, port, mac); + if (err >= 0) { + smac_index = err; + set_param_l(out_param, err); + err = 0; + } + + if (!err) { + err = mac_add_to_slave(dev, slave, mac, port, smac_index); + if (err) + __mlx4_unregister_mac(dev, port, mac); + } + return err; +} + +static int vlan_add_to_slave(struct mlx4_dev *dev, int slave, u16 vlan, + int port, int vlan_index) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker; + struct list_head *vlan_list = + &tracker->slave_list[slave].res_list[RES_VLAN]; + struct vlan_res *res, *tmp; + + list_for_each_entry_safe(res, tmp, vlan_list, list) { + if (res->vlan == vlan && res->port == (u8) port) { + /* vlan found. update ref count */ + ++res->ref_count; + return 0; + } + } + + if (mlx4_grant_resource(dev, slave, RES_VLAN, 1, port)) + return -EINVAL; + res = kzalloc(sizeof(*res), GFP_KERNEL); + if (!res) { + mlx4_release_resource(dev, slave, RES_VLAN, 1, port); + return -ENOMEM; + } + res->vlan = vlan; + res->port = (u8) port; + res->vlan_index = vlan_index; + res->ref_count = 1; + list_add_tail(&res->list, + &tracker->slave_list[slave].res_list[RES_VLAN]); + return 0; +} + + +static void vlan_del_from_slave(struct mlx4_dev *dev, int slave, u16 vlan, + int port) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker; + struct list_head *vlan_list = + &tracker->slave_list[slave].res_list[RES_VLAN]; + struct vlan_res *res, *tmp; + + list_for_each_entry_safe(res, tmp, vlan_list, list) { + if (res->vlan == vlan && res->port == (u8) port) { + if (!--res->ref_count) { + list_del(&res->list); + mlx4_release_resource(dev, slave, RES_VLAN, + 1, port); + kfree(res); + } + break; + } + } +} + +static void rem_slave_vlans(struct mlx4_dev *dev, int slave) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker; + struct list_head *vlan_list = + &tracker->slave_list[slave].res_list[RES_VLAN]; + struct vlan_res *res, *tmp; + int i; + + list_for_each_entry_safe(res, tmp, vlan_list, list) { + list_del(&res->list); + /* dereference the vlan the num times the slave referenced it */ + for (i = 0; i < res->ref_count; i++) + __mlx4_unregister_vlan(dev, res->port, res->vlan); + mlx4_release_resource(dev, slave, RES_VLAN, 1, res->port); + kfree(res); + } +} + +static int vlan_alloc_res(struct mlx4_dev *dev, int slave, int op, int cmd, + u64 in_param, u64 *out_param, int in_port) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_slave_state *slave_state = priv->mfunc.master.slave_state; + int err; + u16 vlan; + int vlan_index; + int port; + + port = !in_port ? get_param_l(out_param) : in_port; + + if (!port || op != RES_OP_RESERVE_AND_MAP) + return -EINVAL; + + port = mlx4_slave_convert_port( + dev, slave, port); + + if (port < 0) + return -EINVAL; + /* upstream kernels had NOP for reg/unreg vlan. Continue this. */ + if (!in_port && port > 0 && port <= dev->caps.num_ports) { + slave_state[slave].old_vlan_api = true; + return 0; + } + + vlan = (u16) in_param; + + err = __mlx4_register_vlan(dev, port, vlan, &vlan_index); + if (!err) { + set_param_l(out_param, (u32) vlan_index); + err = vlan_add_to_slave(dev, slave, vlan, port, vlan_index); + if (err) + __mlx4_unregister_vlan(dev, port, vlan); + } + return err; +} + +static int counter_alloc_res(struct mlx4_dev *dev, int slave, int op, int cmd, + u64 in_param, u64 *out_param) +{ + u32 index; + int err; + + if (op != RES_OP_RESERVE) + return -EINVAL; + + err = mlx4_grant_resource(dev, slave, RES_COUNTER, 1, 0); + if (err) + return err; + + err = __mlx4_counter_alloc(dev, &index); + if (err) { + mlx4_release_resource(dev, slave, RES_COUNTER, 1, 0); + return err; + } + + err = add_res_range(dev, slave, index, 1, RES_COUNTER, 0); + if (err) { + __mlx4_counter_free(dev, index); + mlx4_release_resource(dev, slave, RES_COUNTER, 1, 0); + } else { + set_param_l(out_param, index); + } + + return err; +} + +static int xrcdn_alloc_res(struct mlx4_dev *dev, int slave, int op, int cmd, + u64 in_param, u64 *out_param) +{ + u32 xrcdn; + int err; + + if (op != RES_OP_RESERVE) + return -EINVAL; + + err = __mlx4_xrcd_alloc(dev, &xrcdn); + if (err) + return err; + + err = add_res_range(dev, slave, xrcdn, 1, RES_XRCD, 0); + if (err) + __mlx4_xrcd_free(dev, xrcdn); + else + set_param_l(out_param, xrcdn); + + return err; +} + +int mlx4_ALLOC_RES_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int err; + int alop = vhcr->op_modifier; + + switch (vhcr->in_modifier & 0xFF) { + case RES_QP: + err = qp_alloc_res(dev, slave, vhcr->op_modifier, alop, + vhcr->in_param, &vhcr->out_param); + break; + + case RES_MTT: + err = mtt_alloc_res(dev, slave, vhcr->op_modifier, alop, + vhcr->in_param, &vhcr->out_param); + break; + + case RES_MPT: + err = mpt_alloc_res(dev, slave, vhcr->op_modifier, alop, + vhcr->in_param, &vhcr->out_param); + break; + + case RES_CQ: + err = cq_alloc_res(dev, slave, vhcr->op_modifier, alop, + vhcr->in_param, &vhcr->out_param); + break; + + case RES_SRQ: + err = srq_alloc_res(dev, slave, vhcr->op_modifier, alop, + vhcr->in_param, &vhcr->out_param); + break; + + case RES_MAC: + err = mac_alloc_res(dev, slave, vhcr->op_modifier, alop, + vhcr->in_param, &vhcr->out_param, + (vhcr->in_modifier >> 8) & 0xFF); + break; + + case RES_VLAN: + err = vlan_alloc_res(dev, slave, vhcr->op_modifier, alop, + vhcr->in_param, &vhcr->out_param, + (vhcr->in_modifier >> 8) & 0xFF); + break; + + case RES_COUNTER: + err = counter_alloc_res(dev, slave, vhcr->op_modifier, alop, + vhcr->in_param, &vhcr->out_param); + break; + + case RES_XRCD: + err = xrcdn_alloc_res(dev, slave, vhcr->op_modifier, alop, + vhcr->in_param, &vhcr->out_param); + break; + + default: + err = -EINVAL; + break; + } + + return err; +} + +static int qp_free_res(struct mlx4_dev *dev, int slave, int op, int cmd, + u64 in_param) +{ + int err; + int count; + int base; + int qpn; + + switch (op) { + case RES_OP_RESERVE: + base = get_param_l(&in_param) & 0x7fffff; + count = get_param_h(&in_param); + err = rem_res_range(dev, slave, base, count, RES_QP, 0); + if (err) + break; + mlx4_release_resource(dev, slave, RES_QP, count, 0); + __mlx4_qp_release_range(dev, base, count); + break; + case RES_OP_MAP_ICM: + qpn = get_param_l(&in_param) & 0x7fffff; + err = qp_res_start_move_to(dev, slave, qpn, RES_QP_RESERVED, + NULL, 0); + if (err) + return err; + + if (!fw_reserved(dev, qpn)) + __mlx4_qp_free_icm(dev, qpn); + + res_end_move(dev, slave, RES_QP, qpn); + + if (valid_reserved(dev, slave, qpn)) + err = rem_res_range(dev, slave, qpn, 1, RES_QP, 0); + break; + default: + err = -EINVAL; + break; + } + return err; +} + +static int mtt_free_res(struct mlx4_dev *dev, int slave, int op, int cmd, + u64 in_param, u64 *out_param) +{ + int err = -EINVAL; + int base; + int order; + + if (op != RES_OP_RESERVE_AND_MAP) + return err; + + base = get_param_l(&in_param); + order = get_param_h(&in_param); + err = rem_res_range(dev, slave, base, 1, RES_MTT, order); + if (!err) { + mlx4_release_resource(dev, slave, RES_MTT, 1 << order, 0); + __mlx4_free_mtt_range(dev, base, order); + } + return err; +} + +static int mpt_free_res(struct mlx4_dev *dev, int slave, int op, int cmd, + u64 in_param) +{ + int err = -EINVAL; + int index; + int id; + struct res_mpt *mpt; + + switch (op) { + case RES_OP_RESERVE: + index = get_param_l(&in_param); + id = index & mpt_mask(dev); + err = get_res(dev, slave, id, RES_MPT, &mpt); + if (err) + break; + index = mpt->key; + put_res(dev, slave, id, RES_MPT); + + err = rem_res_range(dev, slave, id, 1, RES_MPT, 0); + if (err) + break; + mlx4_release_resource(dev, slave, RES_MPT, 1, 0); + __mlx4_mpt_release(dev, index); + break; + case RES_OP_MAP_ICM: + index = get_param_l(&in_param); + id = index & mpt_mask(dev); + err = mr_res_start_move_to(dev, slave, id, + RES_MPT_RESERVED, &mpt); + if (err) + return err; + + __mlx4_mpt_free_icm(dev, mpt->key); + res_end_move(dev, slave, RES_MPT, id); + return err; + break; + default: + err = -EINVAL; + break; + } + return err; +} + +static int cq_free_res(struct mlx4_dev *dev, int slave, int op, int cmd, + u64 in_param, u64 *out_param) +{ + int cqn; + int err; + + switch (op) { + case RES_OP_RESERVE_AND_MAP: + cqn = get_param_l(&in_param); + err = rem_res_range(dev, slave, cqn, 1, RES_CQ, 0); + if (err) + break; + + mlx4_release_resource(dev, slave, RES_CQ, 1, 0); + __mlx4_cq_free_icm(dev, cqn); + break; + + default: + err = -EINVAL; + break; + } + + return err; +} + +static int srq_free_res(struct mlx4_dev *dev, int slave, int op, int cmd, + u64 in_param, u64 *out_param) +{ + int srqn; + int err; + + switch (op) { + case RES_OP_RESERVE_AND_MAP: + srqn = get_param_l(&in_param); + err = rem_res_range(dev, slave, srqn, 1, RES_SRQ, 0); + if (err) + break; + + mlx4_release_resource(dev, slave, RES_SRQ, 1, 0); + __mlx4_srq_free_icm(dev, srqn); + break; + + default: + err = -EINVAL; + break; + } + + return err; +} + +static int mac_free_res(struct mlx4_dev *dev, int slave, int op, int cmd, + u64 in_param, u64 *out_param, int in_port) +{ + int port; + int err = 0; + + switch (op) { + case RES_OP_RESERVE_AND_MAP: + port = !in_port ? get_param_l(out_param) : in_port; + port = mlx4_slave_convert_port( + dev, slave, port); + + if (port < 0) + return -EINVAL; + mac_del_from_slave(dev, slave, in_param, port); + __mlx4_unregister_mac(dev, port, in_param); + break; + default: + err = -EINVAL; + break; + } + + return err; + +} + +static int vlan_free_res(struct mlx4_dev *dev, int slave, int op, int cmd, + u64 in_param, u64 *out_param, int port) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_slave_state *slave_state = priv->mfunc.master.slave_state; + int err = 0; + + port = mlx4_slave_convert_port( + dev, slave, port); + + if (port < 0) + return -EINVAL; + switch (op) { + case RES_OP_RESERVE_AND_MAP: + if (slave_state[slave].old_vlan_api) + return 0; + if (!port) + return -EINVAL; + vlan_del_from_slave(dev, slave, in_param, port); + __mlx4_unregister_vlan(dev, port, in_param); + break; + default: + err = -EINVAL; + break; + } + + return err; +} + +static int counter_free_res(struct mlx4_dev *dev, int slave, int op, int cmd, + u64 in_param, u64 *out_param) +{ + int index; + int err; + + if (op != RES_OP_RESERVE) + return -EINVAL; + + index = get_param_l(&in_param); + err = rem_res_range(dev, slave, index, 1, RES_COUNTER, 0); + if (err) + return err; + + __mlx4_counter_free(dev, index); + mlx4_release_resource(dev, slave, RES_COUNTER, 1, 0); + + return err; +} + +static int xrcdn_free_res(struct mlx4_dev *dev, int slave, int op, int cmd, + u64 in_param, u64 *out_param) +{ + int xrcdn; + int err; + + if (op != RES_OP_RESERVE) + return -EINVAL; + + xrcdn = get_param_l(&in_param); + err = rem_res_range(dev, slave, xrcdn, 1, RES_XRCD, 0); + if (err) + return err; + + __mlx4_xrcd_free(dev, xrcdn); + + return err; +} + +int mlx4_FREE_RES_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int err = -EINVAL; + int alop = vhcr->op_modifier; + + switch (vhcr->in_modifier & 0xFF) { + case RES_QP: + err = qp_free_res(dev, slave, vhcr->op_modifier, alop, + vhcr->in_param); + break; + + case RES_MTT: + err = mtt_free_res(dev, slave, vhcr->op_modifier, alop, + vhcr->in_param, &vhcr->out_param); + break; + + case RES_MPT: + err = mpt_free_res(dev, slave, vhcr->op_modifier, alop, + vhcr->in_param); + break; + + case RES_CQ: + err = cq_free_res(dev, slave, vhcr->op_modifier, alop, + vhcr->in_param, &vhcr->out_param); + break; + + case RES_SRQ: + err = srq_free_res(dev, slave, vhcr->op_modifier, alop, + vhcr->in_param, &vhcr->out_param); + break; + + case RES_MAC: + err = mac_free_res(dev, slave, vhcr->op_modifier, alop, + vhcr->in_param, &vhcr->out_param, + (vhcr->in_modifier >> 8) & 0xFF); + break; + + case RES_VLAN: + err = vlan_free_res(dev, slave, vhcr->op_modifier, alop, + vhcr->in_param, &vhcr->out_param, + (vhcr->in_modifier >> 8) & 0xFF); + break; + + case RES_COUNTER: + err = counter_free_res(dev, slave, vhcr->op_modifier, alop, + vhcr->in_param, &vhcr->out_param); + break; + + case RES_XRCD: + err = xrcdn_free_res(dev, slave, vhcr->op_modifier, alop, + vhcr->in_param, &vhcr->out_param); + + default: + break; + } + return err; +} + +/* ugly but other choices are uglier */ +static int mr_phys_mpt(struct mlx4_mpt_entry *mpt) +{ + return (be32_to_cpu(mpt->flags) >> 9) & 1; +} + +static int mr_get_mtt_addr(struct mlx4_mpt_entry *mpt) +{ + return (int)be64_to_cpu(mpt->mtt_addr) & 0xfffffff8; +} + +static int mr_get_mtt_size(struct mlx4_mpt_entry *mpt) +{ + return be32_to_cpu(mpt->mtt_sz); +} + +static u32 mr_get_pd(struct mlx4_mpt_entry *mpt) +{ + return be32_to_cpu(mpt->pd_flags) & 0x00ffffff; +} + +static int mr_is_fmr(struct mlx4_mpt_entry *mpt) +{ + return be32_to_cpu(mpt->pd_flags) & MLX4_MPT_PD_FLAG_FAST_REG; +} + +static int mr_is_bind_enabled(struct mlx4_mpt_entry *mpt) +{ + return be32_to_cpu(mpt->flags) & MLX4_MPT_FLAG_BIND_ENABLE; +} + +static int mr_is_region(struct mlx4_mpt_entry *mpt) +{ + return be32_to_cpu(mpt->flags) & MLX4_MPT_FLAG_REGION; +} + +static int qp_get_mtt_addr(struct mlx4_qp_context *qpc) +{ + return be32_to_cpu(qpc->mtt_base_addr_l) & 0xfffffff8; +} + +static int srq_get_mtt_addr(struct mlx4_srq_context *srqc) +{ + return be32_to_cpu(srqc->mtt_base_addr_l) & 0xfffffff8; +} + +static int qp_get_mtt_size(struct mlx4_qp_context *qpc) +{ + int page_shift = (qpc->log_page_size & 0x3f) + 12; + int log_sq_size = (qpc->sq_size_stride >> 3) & 0xf; + int log_sq_sride = qpc->sq_size_stride & 7; + int log_rq_size = (qpc->rq_size_stride >> 3) & 0xf; + int log_rq_stride = qpc->rq_size_stride & 7; + int srq = (be32_to_cpu(qpc->srqn) >> 24) & 1; + int rss = (be32_to_cpu(qpc->flags) >> 13) & 1; + u32 ts = (be32_to_cpu(qpc->flags) >> 16) & 0xff; + int xrc = (ts == MLX4_QP_ST_XRC) ? 1 : 0; + int sq_size; + int rq_size; + int total_pages; + int total_mem; + int page_offset = (be32_to_cpu(qpc->params2) >> 6) & 0x3f; + + sq_size = 1 << (log_sq_size + log_sq_sride + 4); + rq_size = (srq|rss|xrc) ? 0 : (1 << (log_rq_size + log_rq_stride + 4)); + total_mem = sq_size + rq_size; + total_pages = + roundup_pow_of_two((total_mem + (page_offset << 6)) >> + page_shift); + + return total_pages; +} + +static int check_mtt_range(struct mlx4_dev *dev, int slave, int start, + int size, struct res_mtt *mtt) +{ + int res_start = mtt->com.res_id; + int res_size = (1 << mtt->order); + + if (start < res_start || start + size > res_start + res_size) + return -EPERM; + return 0; +} + +int mlx4_SW2HW_MPT_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int err; + int index = vhcr->in_modifier; + struct res_mtt *mtt; + struct res_mpt *mpt; + int mtt_base = mr_get_mtt_addr(inbox->buf) / dev->caps.mtt_entry_sz; + int phys; + int id; + u32 pd; + int pd_slave; + + id = index & mpt_mask(dev); + err = mr_res_start_move_to(dev, slave, id, RES_MPT_HW, &mpt); + if (err) + return err; + + /* Disable memory windows for VFs. */ + if (!mr_is_region(inbox->buf)) { + err = -EPERM; + goto ex_abort; + } + + /* Make sure that the PD bits related to the slave id are zeros. */ + pd = mr_get_pd(inbox->buf); + pd_slave = (pd >> 17) & 0x7f; + if (pd_slave != 0 && pd_slave != slave) { + err = -EPERM; + goto ex_abort; + } + + if (mr_is_fmr(inbox->buf)) { + /* FMR and Bind Enable are forbidden in slave devices. */ + if (mr_is_bind_enabled(inbox->buf)) { + err = -EPERM; + goto ex_abort; + } + /* FMR and Memory Windows are also forbidden. */ + if (!mr_is_region(inbox->buf)) { + err = -EPERM; + goto ex_abort; + } + } + + phys = mr_phys_mpt(inbox->buf); + if (!phys) { + err = get_res(dev, slave, mtt_base, RES_MTT, &mtt); + if (err) + goto ex_abort; + + err = check_mtt_range(dev, slave, mtt_base, + mr_get_mtt_size(inbox->buf), mtt); + if (err) + goto ex_put; + + mpt->mtt = mtt; + } + + err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd); + if (err) + goto ex_put; + + if (!phys) { + atomic_inc(&mtt->ref_count); + put_res(dev, slave, mtt->com.res_id, RES_MTT); + } + + res_end_move(dev, slave, RES_MPT, id); + return 0; + +ex_put: + if (!phys) + put_res(dev, slave, mtt->com.res_id, RES_MTT); +ex_abort: + res_abort_move(dev, slave, RES_MPT, id); + + return err; +} + +int mlx4_HW2SW_MPT_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int err; + int index = vhcr->in_modifier; + struct res_mpt *mpt; + int id; + + id = index & mpt_mask(dev); + err = mr_res_start_move_to(dev, slave, id, RES_MPT_MAPPED, &mpt); + if (err) + return err; + + err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd); + if (err) + goto ex_abort; + + if (mpt->mtt) + atomic_dec(&mpt->mtt->ref_count); + + res_end_move(dev, slave, RES_MPT, id); + return 0; + +ex_abort: + res_abort_move(dev, slave, RES_MPT, id); + + return err; +} + +int mlx4_QUERY_MPT_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int err; + int index = vhcr->in_modifier; + struct res_mpt *mpt; + int id; + + id = index & mpt_mask(dev); + err = get_res(dev, slave, id, RES_MPT, &mpt); + if (err) + return err; + + if (mpt->com.from_state == RES_MPT_MAPPED) { + /* In order to allow rereg in SRIOV, we need to alter the MPT entry. To do + * that, the VF must read the MPT. But since the MPT entry memory is not + * in the VF's virtual memory space, it must use QUERY_MPT to obtain the + * entry contents. To guarantee that the MPT cannot be changed, the driver + * must perform HW2SW_MPT before this query and return the MPT entry to HW + * ownership fofollowing the change. The change here allows the VF to + * perform QUERY_MPT also when the entry is in SW ownership. + */ + struct mlx4_mpt_entry *mpt_entry = mlx4_table_find( + &mlx4_priv(dev)->mr_table.dmpt_table, + mpt->key, NULL); + + if (NULL == mpt_entry || NULL == outbox->buf) { + err = -EINVAL; + goto out; + } + + memcpy(outbox->buf, mpt_entry, sizeof(*mpt_entry)); + + err = 0; + } else if (mpt->com.from_state == RES_MPT_HW) { + err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd); + } else { + err = -EBUSY; + goto out; + } + + +out: + put_res(dev, slave, id, RES_MPT); + return err; +} + +static int qp_get_rcqn(struct mlx4_qp_context *qpc) +{ + return be32_to_cpu(qpc->cqn_recv) & 0xffffff; +} + +static int qp_get_scqn(struct mlx4_qp_context *qpc) +{ + return be32_to_cpu(qpc->cqn_send) & 0xffffff; +} + +static u32 qp_get_srqn(struct mlx4_qp_context *qpc) +{ + return be32_to_cpu(qpc->srqn) & 0x1ffffff; +} + +static void adjust_proxy_tun_qkey(struct mlx4_dev *dev, struct mlx4_vhcr *vhcr, + struct mlx4_qp_context *context) +{ + u32 qpn = vhcr->in_modifier & 0xffffff; + u32 qkey = 0; + + if (mlx4_get_parav_qkey(dev, qpn, &qkey)) + return; + + /* adjust qkey in qp context */ + context->qkey = cpu_to_be32(qkey); +} + +int mlx4_RST2INIT_QP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int err; + int qpn = vhcr->in_modifier & 0x7fffff; + struct res_mtt *mtt; + struct res_qp *qp; + struct mlx4_qp_context *qpc = inbox->buf + 8; + int mtt_base = qp_get_mtt_addr(qpc) / dev->caps.mtt_entry_sz; + int mtt_size = qp_get_mtt_size(qpc); + struct res_cq *rcq; + struct res_cq *scq; + int rcqn = qp_get_rcqn(qpc); + int scqn = qp_get_scqn(qpc); + u32 srqn = qp_get_srqn(qpc) & 0xffffff; + int use_srq = (qp_get_srqn(qpc) >> 24) & 1; + struct res_srq *srq; + int local_qpn = be32_to_cpu(qpc->local_qpn) & 0xffffff; + + err = qp_res_start_move_to(dev, slave, qpn, RES_QP_HW, &qp, 0); + if (err) + return err; + qp->local_qpn = local_qpn; + qp->sched_queue = 0; + qp->param3 = 0; + qp->vlan_control = 0; + qp->fvl_rx = 0; + qp->pri_path_fl = 0; + qp->vlan_index = 0; + qp->feup = 0; + qp->qpc_flags = be32_to_cpu(qpc->flags); + + err = get_res(dev, slave, mtt_base, RES_MTT, &mtt); + if (err) + goto ex_abort; + + err = check_mtt_range(dev, slave, mtt_base, mtt_size, mtt); + if (err) + goto ex_put_mtt; + + err = get_res(dev, slave, rcqn, RES_CQ, &rcq); + if (err) + goto ex_put_mtt; + + if (scqn != rcqn) { + err = get_res(dev, slave, scqn, RES_CQ, &scq); + if (err) + goto ex_put_rcq; + } else + scq = rcq; + + if (use_srq) { + err = get_res(dev, slave, srqn, RES_SRQ, &srq); + if (err) + goto ex_put_scq; + } + + adjust_proxy_tun_qkey(dev, vhcr, qpc); + update_pkey_index(dev, slave, inbox); + err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd); + if (err) + goto ex_put_srq; + atomic_inc(&mtt->ref_count); + qp->mtt = mtt; + atomic_inc(&rcq->ref_count); + qp->rcq = rcq; + atomic_inc(&scq->ref_count); + qp->scq = scq; + + if (scqn != rcqn) + put_res(dev, slave, scqn, RES_CQ); + + if (use_srq) { + atomic_inc(&srq->ref_count); + put_res(dev, slave, srqn, RES_SRQ); + qp->srq = srq; + } + put_res(dev, slave, rcqn, RES_CQ); + put_res(dev, slave, mtt_base, RES_MTT); + res_end_move(dev, slave, RES_QP, qpn); + + return 0; + +ex_put_srq: + if (use_srq) + put_res(dev, slave, srqn, RES_SRQ); +ex_put_scq: + if (scqn != rcqn) + put_res(dev, slave, scqn, RES_CQ); +ex_put_rcq: + put_res(dev, slave, rcqn, RES_CQ); +ex_put_mtt: + put_res(dev, slave, mtt_base, RES_MTT); +ex_abort: + res_abort_move(dev, slave, RES_QP, qpn); + + return err; +} + +static int eq_get_mtt_addr(struct mlx4_eq_context *eqc) +{ + return be32_to_cpu(eqc->mtt_base_addr_l) & 0xfffffff8; +} + +static int eq_get_mtt_size(struct mlx4_eq_context *eqc) +{ + int log_eq_size = eqc->log_eq_size & 0x1f; + int page_shift = (eqc->log_page_size & 0x3f) + 12; + + if (log_eq_size + 5 < page_shift) + return 1; + + return 1 << (log_eq_size + 5 - page_shift); +} + +static int cq_get_mtt_addr(struct mlx4_cq_context *cqc) +{ + return be32_to_cpu(cqc->mtt_base_addr_l) & 0xfffffff8; +} + +static int cq_get_mtt_size(struct mlx4_cq_context *cqc) +{ + int log_cq_size = (be32_to_cpu(cqc->logsize_usrpage) >> 24) & 0x1f; + int page_shift = (cqc->log_page_size & 0x3f) + 12; + + if (log_cq_size + 5 < page_shift) + return 1; + + return 1 << (log_cq_size + 5 - page_shift); +} + +int mlx4_SW2HW_EQ_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int err; + int eqn = vhcr->in_modifier; + int res_id = (slave << 8) | eqn; + struct mlx4_eq_context *eqc = inbox->buf; + int mtt_base = eq_get_mtt_addr(eqc) / dev->caps.mtt_entry_sz; + int mtt_size = eq_get_mtt_size(eqc); + struct res_eq *eq; + struct res_mtt *mtt; + + err = add_res_range(dev, slave, res_id, 1, RES_EQ, 0); + if (err) + return err; + err = eq_res_start_move_to(dev, slave, res_id, RES_EQ_HW, &eq); + if (err) + goto out_add; + + err = get_res(dev, slave, mtt_base, RES_MTT, &mtt); + if (err) + goto out_move; + + err = check_mtt_range(dev, slave, mtt_base, mtt_size, mtt); + if (err) + goto out_put; + + err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd); + if (err) + goto out_put; + + atomic_inc(&mtt->ref_count); + eq->mtt = mtt; + put_res(dev, slave, mtt->com.res_id, RES_MTT); + res_end_move(dev, slave, RES_EQ, res_id); + return 0; + +out_put: + put_res(dev, slave, mtt->com.res_id, RES_MTT); +out_move: + res_abort_move(dev, slave, RES_EQ, res_id); +out_add: + rem_res_range(dev, slave, res_id, 1, RES_EQ, 0); + return err; +} + +static int get_containing_mtt(struct mlx4_dev *dev, int slave, int start, + int len, struct res_mtt **res) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker; + struct res_mtt *mtt; + int err = -EINVAL; + + spin_lock_irq(mlx4_tlock(dev)); + list_for_each_entry(mtt, &tracker->slave_list[slave].res_list[RES_MTT], + com.list) { + if (!check_mtt_range(dev, slave, start, len, mtt)) { + *res = mtt; + mtt->com.from_state = mtt->com.state; + mtt->com.state = RES_MTT_BUSY; + err = 0; + break; + } + } + spin_unlock_irq(mlx4_tlock(dev)); + + return err; +} + +static int verify_qp_parameters(struct mlx4_dev *dev, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + enum qp_transition transition, u8 slave) +{ + u32 qp_type; + u32 qpn; + struct mlx4_qp_context *qp_ctx; + enum mlx4_qp_optpar optpar; + int port; + int num_gids; + + qp_ctx = inbox->buf + 8; + qp_type = (be32_to_cpu(qp_ctx->flags) >> 16) & 0xff; + optpar = be32_to_cpu(*(__be32 *) inbox->buf); + + switch (qp_type) { + case MLX4_QP_ST_RC: + case MLX4_QP_ST_XRC: + case MLX4_QP_ST_UC: + switch (transition) { + case QP_TRANS_INIT2RTR: + case QP_TRANS_RTR2RTS: + case QP_TRANS_RTS2RTS: + case QP_TRANS_SQD2SQD: + case QP_TRANS_SQD2RTS: + if (slave != mlx4_master_func_num(dev)) + if (optpar & MLX4_QP_OPTPAR_PRIMARY_ADDR_PATH) { + port = (qp_ctx->pri_path.sched_queue >> 6 & 1) + 1; + if (dev->caps.port_mask[port] != MLX4_PORT_TYPE_IB) + num_gids = mlx4_get_slave_num_gids(dev, slave, port); + else + num_gids = 1; + if (qp_ctx->pri_path.mgid_index >= num_gids) + return -EINVAL; + } + if (optpar & MLX4_QP_OPTPAR_ALT_ADDR_PATH) { + port = (qp_ctx->alt_path.sched_queue >> 6 & 1) + 1; + if (dev->caps.port_mask[port] != MLX4_PORT_TYPE_IB) + num_gids = mlx4_get_slave_num_gids(dev, slave, port); + else + num_gids = 1; + if (qp_ctx->alt_path.mgid_index >= num_gids) + return -EINVAL; + } + break; + default: + break; + } + break; + + case MLX4_QP_ST_MLX: + qpn = vhcr->in_modifier & 0x7fffff; + port = (qp_ctx->pri_path.sched_queue >> 6 & 1) + 1; + if (transition == QP_TRANS_INIT2RTR && + slave != mlx4_master_func_num(dev) && + mlx4_is_qp_reserved(dev, qpn) && + !mlx4_vf_smi_enabled(dev, slave, port)) { + /* only enabled VFs may create MLX proxy QPs */ + mlx4_err(dev, "%s: unprivileged slave %d attempting to create an MLX proxy special QP on port %d\n", + __func__, slave, port); + return -EPERM; + } + break; + + default: + break; + } + + return 0; +} + +int mlx4_WRITE_MTT_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + struct mlx4_mtt mtt; + __be64 *page_list = inbox->buf; + u64 *pg_list = (u64 *)page_list; + int i; + struct res_mtt *rmtt = NULL; + int start = be64_to_cpu(page_list[0]); + int npages = vhcr->in_modifier; + int err; + + err = get_containing_mtt(dev, slave, start, npages, &rmtt); + if (err) + return err; + + /* Call the SW implementation of write_mtt: + * - Prepare a dummy mtt struct + * - Translate inbox contents to simple addresses in host endianess */ + mtt.offset = 0; /* TBD this is broken but I don't handle it since + we don't really use it */ + mtt.order = 0; + mtt.page_shift = 0; + for (i = 0; i < npages; ++i) + pg_list[i + 2] = (be64_to_cpu(page_list[i + 2]) & ~1ULL); + + err = __mlx4_write_mtt(dev, &mtt, be64_to_cpu(page_list[0]), npages, + ((u64 *)page_list + 2)); + + if (rmtt) + put_res(dev, slave, rmtt->com.res_id, RES_MTT); + + return err; +} + +int mlx4_HW2SW_EQ_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int eqn = vhcr->in_modifier; + int res_id = eqn | (slave << 8); + struct res_eq *eq; + int err; + + err = eq_res_start_move_to(dev, slave, res_id, RES_EQ_RESERVED, &eq); + if (err) + return err; + + err = get_res(dev, slave, eq->mtt->com.res_id, RES_MTT, NULL); + if (err) + goto ex_abort; + + err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd); + if (err) + goto ex_put; + + atomic_dec(&eq->mtt->ref_count); + put_res(dev, slave, eq->mtt->com.res_id, RES_MTT); + res_end_move(dev, slave, RES_EQ, res_id); + rem_res_range(dev, slave, res_id, 1, RES_EQ, 0); + + return 0; + +ex_put: + put_res(dev, slave, eq->mtt->com.res_id, RES_MTT); +ex_abort: + res_abort_move(dev, slave, RES_EQ, res_id); + + return err; +} + +int mlx4_GEN_EQE(struct mlx4_dev *dev, int slave, struct mlx4_eqe *eqe) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_slave_event_eq_info *event_eq; + struct mlx4_cmd_mailbox *mailbox; + u32 in_modifier = 0; + int err; + int res_id; + struct res_eq *req; + + if (!priv->mfunc.master.slave_state) + return -EINVAL; + + event_eq = &priv->mfunc.master.slave_state[slave].event_eq[eqe->type]; + + /* Create the event only if the slave is registered */ + if (event_eq->eqn < 0) + return 0; + + mutex_lock(&priv->mfunc.master.gen_eqe_mutex[slave]); + res_id = (slave << 8) | event_eq->eqn; + err = get_res(dev, slave, res_id, RES_EQ, &req); + if (err) + goto unlock; + + if (req->com.from_state != RES_EQ_HW) { + err = -EINVAL; + goto put; + } + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) { + err = PTR_ERR(mailbox); + goto put; + } + + if (eqe->type == MLX4_EVENT_TYPE_CMD) { + ++event_eq->token; + eqe->event.cmd.token = cpu_to_be16(event_eq->token); + } + + memcpy(mailbox->buf, (u8 *) eqe, 28); + + in_modifier = (slave & 0xff) | ((event_eq->eqn & 0xff) << 16); + + err = mlx4_cmd(dev, mailbox->dma, in_modifier, 0, + MLX4_CMD_GEN_EQE, MLX4_CMD_TIME_CLASS_B, + MLX4_CMD_NATIVE); + + put_res(dev, slave, res_id, RES_EQ); + mutex_unlock(&priv->mfunc.master.gen_eqe_mutex[slave]); + mlx4_free_cmd_mailbox(dev, mailbox); + return err; + +put: + put_res(dev, slave, res_id, RES_EQ); + +unlock: + mutex_unlock(&priv->mfunc.master.gen_eqe_mutex[slave]); + return err; +} + +int mlx4_QUERY_EQ_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int eqn = vhcr->in_modifier; + int res_id = eqn | (slave << 8); + struct res_eq *eq; + int err; + + err = get_res(dev, slave, res_id, RES_EQ, &eq); + if (err) + return err; + + if (eq->com.from_state != RES_EQ_HW) { + err = -EINVAL; + goto ex_put; + } + + err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd); + +ex_put: + put_res(dev, slave, res_id, RES_EQ); + return err; +} + +int mlx4_SW2HW_CQ_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int err; + int cqn = vhcr->in_modifier; + struct mlx4_cq_context *cqc = inbox->buf; + int mtt_base = cq_get_mtt_addr(cqc) / dev->caps.mtt_entry_sz; + struct res_cq *cq; + struct res_mtt *mtt; + + err = cq_res_start_move_to(dev, slave, cqn, RES_CQ_HW, &cq); + if (err) + return err; + err = get_res(dev, slave, mtt_base, RES_MTT, &mtt); + if (err) + goto out_move; + err = check_mtt_range(dev, slave, mtt_base, cq_get_mtt_size(cqc), mtt); + if (err) + goto out_put; + err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd); + if (err) + goto out_put; + atomic_inc(&mtt->ref_count); + cq->mtt = mtt; + put_res(dev, slave, mtt->com.res_id, RES_MTT); + res_end_move(dev, slave, RES_CQ, cqn); + return 0; + +out_put: + put_res(dev, slave, mtt->com.res_id, RES_MTT); +out_move: + res_abort_move(dev, slave, RES_CQ, cqn); + return err; +} + +int mlx4_HW2SW_CQ_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int err; + int cqn = vhcr->in_modifier; + struct res_cq *cq; + + err = cq_res_start_move_to(dev, slave, cqn, RES_CQ_ALLOCATED, &cq); + if (err) + return err; + err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd); + if (err) + goto out_move; + atomic_dec(&cq->mtt->ref_count); + res_end_move(dev, slave, RES_CQ, cqn); + return 0; + +out_move: + res_abort_move(dev, slave, RES_CQ, cqn); + return err; +} + +int mlx4_QUERY_CQ_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int cqn = vhcr->in_modifier; + struct res_cq *cq; + int err; + + err = get_res(dev, slave, cqn, RES_CQ, &cq); + if (err) + return err; + + if (cq->com.from_state != RES_CQ_HW) + goto ex_put; + + err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd); +ex_put: + put_res(dev, slave, cqn, RES_CQ); + + return err; +} + +static int handle_resize(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd, + struct res_cq *cq) +{ + int err; + struct res_mtt *orig_mtt; + struct res_mtt *mtt; + struct mlx4_cq_context *cqc = inbox->buf; + int mtt_base = cq_get_mtt_addr(cqc) / dev->caps.mtt_entry_sz; + + err = get_res(dev, slave, cq->mtt->com.res_id, RES_MTT, &orig_mtt); + if (err) + return err; + + if (orig_mtt != cq->mtt) { + err = -EINVAL; + goto ex_put; + } + + err = get_res(dev, slave, mtt_base, RES_MTT, &mtt); + if (err) + goto ex_put; + + err = check_mtt_range(dev, slave, mtt_base, cq_get_mtt_size(cqc), mtt); + if (err) + goto ex_put1; + err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd); + if (err) + goto ex_put1; + atomic_dec(&orig_mtt->ref_count); + put_res(dev, slave, orig_mtt->com.res_id, RES_MTT); + atomic_inc(&mtt->ref_count); + cq->mtt = mtt; + put_res(dev, slave, mtt->com.res_id, RES_MTT); + return 0; + +ex_put1: + put_res(dev, slave, mtt->com.res_id, RES_MTT); +ex_put: + put_res(dev, slave, orig_mtt->com.res_id, RES_MTT); + + return err; + +} + +int mlx4_MODIFY_CQ_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int cqn = vhcr->in_modifier; + struct res_cq *cq; + int err; + + err = get_res(dev, slave, cqn, RES_CQ, &cq); + if (err) + return err; + + if (cq->com.from_state != RES_CQ_HW) + goto ex_put; + + if (vhcr->op_modifier == 0) { + err = handle_resize(dev, slave, vhcr, inbox, outbox, cmd, cq); + goto ex_put; + } + + err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd); +ex_put: + put_res(dev, slave, cqn, RES_CQ); + + return err; +} + +static int srq_get_mtt_size(struct mlx4_srq_context *srqc) +{ + int log_srq_size = (be32_to_cpu(srqc->state_logsize_srqn) >> 24) & 0xf; + int log_rq_stride = srqc->logstride & 7; + int page_shift = (srqc->log_page_size & 0x3f) + 12; + + if (log_srq_size + log_rq_stride + 4 < page_shift) + return 1; + + return 1 << (log_srq_size + log_rq_stride + 4 - page_shift); +} + +int mlx4_SW2HW_SRQ_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int err; + int srqn = vhcr->in_modifier; + struct res_mtt *mtt; + struct res_srq *srq; + struct mlx4_srq_context *srqc = inbox->buf; + int mtt_base = srq_get_mtt_addr(srqc) / dev->caps.mtt_entry_sz; + + if (srqn != (be32_to_cpu(srqc->state_logsize_srqn) & 0xffffff)) + return -EINVAL; + + err = srq_res_start_move_to(dev, slave, srqn, RES_SRQ_HW, &srq); + if (err) + return err; + err = get_res(dev, slave, mtt_base, RES_MTT, &mtt); + if (err) + goto ex_abort; + err = check_mtt_range(dev, slave, mtt_base, srq_get_mtt_size(srqc), + mtt); + if (err) + goto ex_put_mtt; + + err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd); + if (err) + goto ex_put_mtt; + + atomic_inc(&mtt->ref_count); + srq->mtt = mtt; + put_res(dev, slave, mtt->com.res_id, RES_MTT); + res_end_move(dev, slave, RES_SRQ, srqn); + return 0; + +ex_put_mtt: + put_res(dev, slave, mtt->com.res_id, RES_MTT); +ex_abort: + res_abort_move(dev, slave, RES_SRQ, srqn); + + return err; +} + +int mlx4_HW2SW_SRQ_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int err; + int srqn = vhcr->in_modifier; + struct res_srq *srq; + + err = srq_res_start_move_to(dev, slave, srqn, RES_SRQ_ALLOCATED, &srq); + if (err) + return err; + err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd); + if (err) + goto ex_abort; + atomic_dec(&srq->mtt->ref_count); + if (srq->cq) + atomic_dec(&srq->cq->ref_count); + res_end_move(dev, slave, RES_SRQ, srqn); + + return 0; + +ex_abort: + res_abort_move(dev, slave, RES_SRQ, srqn); + + return err; +} + +int mlx4_QUERY_SRQ_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int err; + int srqn = vhcr->in_modifier; + struct res_srq *srq; + + err = get_res(dev, slave, srqn, RES_SRQ, &srq); + if (err) + return err; + if (srq->com.from_state != RES_SRQ_HW) { + err = -EBUSY; + goto out; + } + err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd); +out: + put_res(dev, slave, srqn, RES_SRQ); + return err; +} + +int mlx4_ARM_SRQ_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int err; + int srqn = vhcr->in_modifier; + struct res_srq *srq; + + err = get_res(dev, slave, srqn, RES_SRQ, &srq); + if (err) + return err; + + if (srq->com.from_state != RES_SRQ_HW) { + err = -EBUSY; + goto out; + } + + err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd); +out: + put_res(dev, slave, srqn, RES_SRQ); + return err; +} + +int mlx4_GEN_QP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int err; + int qpn = vhcr->in_modifier & 0x7fffff; + struct res_qp *qp; + + err = get_res(dev, slave, qpn, RES_QP, &qp); + if (err) + return err; + if (qp->com.from_state != RES_QP_HW) { + err = -EBUSY; + goto out; + } + + err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd); +out: + put_res(dev, slave, qpn, RES_QP); + return err; +} + +int mlx4_INIT2INIT_QP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + struct mlx4_qp_context *context = inbox->buf + 8; + adjust_proxy_tun_qkey(dev, vhcr, context); + update_pkey_index(dev, slave, inbox); + return mlx4_GEN_QP_wrapper(dev, slave, vhcr, inbox, outbox, cmd); +} + +static int adjust_qp_sched_queue(struct mlx4_dev *dev, int slave, + struct mlx4_qp_context *qpc, + struct mlx4_cmd_mailbox *inbox) +{ + enum mlx4_qp_optpar optpar = be32_to_cpu(*(__be32 *)inbox->buf); + u8 pri_sched_queue; + int port = mlx4_slave_convert_port( + dev, slave, (qpc->pri_path.sched_queue >> 6 & 1) + 1) - 1; + + if (port < 0) + return -EINVAL; + + pri_sched_queue = (qpc->pri_path.sched_queue & ~(1 << 6)) | + ((port & 1) << 6); + + if (optpar & MLX4_QP_OPTPAR_PRIMARY_ADDR_PATH || + mlx4_is_eth(dev, port + 1)) { + qpc->pri_path.sched_queue = pri_sched_queue; + } + + if (optpar & MLX4_QP_OPTPAR_ALT_ADDR_PATH) { + port = mlx4_slave_convert_port( + dev, slave, (qpc->alt_path.sched_queue >> 6 & 1) + + 1) - 1; + if (port < 0) + return -EINVAL; + qpc->alt_path.sched_queue = + (qpc->alt_path.sched_queue & ~(1 << 6)) | + (port & 1) << 6; + } + return 0; +} + +static int roce_verify_mac(struct mlx4_dev *dev, int slave, + struct mlx4_qp_context *qpc, + struct mlx4_cmd_mailbox *inbox) +{ + u64 mac; + int port; + u32 ts = (be32_to_cpu(qpc->flags) >> 16) & 0xff; + u8 sched = *(u8 *)(inbox->buf + 64); + u8 smac_ix; + + port = (sched >> 6 & 1) + 1; + if (mlx4_is_eth(dev, port) && (ts != MLX4_QP_ST_MLX)) { + smac_ix = qpc->pri_path.grh_mylmc & 0x7f; + if (mac_find_smac_ix_in_slave(dev, slave, port, smac_ix, &mac)) + return -ENOENT; + } + return 0; +} + +int mlx4_INIT2RTR_QP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int err; + struct mlx4_qp_context *qpc = inbox->buf + 8; + int qpn = vhcr->in_modifier & 0x7fffff; + struct res_qp *qp; + u8 orig_sched_queue; + __be32 orig_param3 = qpc->param3; + u8 orig_vlan_control = qpc->pri_path.vlan_control; + u8 orig_fvl_rx = qpc->pri_path.fvl_rx; + u8 orig_pri_path_fl = qpc->pri_path.fl; + u8 orig_vlan_index = qpc->pri_path.vlan_index; + u8 orig_feup = qpc->pri_path.feup; + + err = adjust_qp_sched_queue(dev, slave, qpc, inbox); + if (err) + return err; + err = verify_qp_parameters(dev, vhcr, inbox, QP_TRANS_INIT2RTR, slave); + if (err) + return err; + + if (roce_verify_mac(dev, slave, qpc, inbox)) + return -EINVAL; + + update_pkey_index(dev, slave, inbox); + update_gid(dev, inbox, (u8)slave); + adjust_proxy_tun_qkey(dev, vhcr, qpc); + orig_sched_queue = qpc->pri_path.sched_queue; + err = update_vport_qp_param(dev, inbox, slave, qpn); + if (err) + return err; + + err = get_res(dev, slave, qpn, RES_QP, &qp); + if (err) + return err; + if (qp->com.from_state != RES_QP_HW) { + err = -EBUSY; + goto out; + } + + err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd); +out: + /* if no error, save sched queue value passed in by VF. This is + * essentially the QOS value provided by the VF. This will be useful + * if we allow dynamic changes from VST back to VGT + */ + if (!err) { + qp->sched_queue = orig_sched_queue; + qp->param3 = orig_param3; + qp->vlan_control = orig_vlan_control; + qp->fvl_rx = orig_fvl_rx; + qp->pri_path_fl = orig_pri_path_fl; + qp->vlan_index = orig_vlan_index; + qp->feup = orig_feup; + } + put_res(dev, slave, qpn, RES_QP); + return err; +} + +int mlx4_RTR2RTS_QP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int err; + struct mlx4_qp_context *context = inbox->buf + 8; + + err = adjust_qp_sched_queue(dev, slave, context, inbox); + if (err) + return err; + err = verify_qp_parameters(dev, vhcr, inbox, QP_TRANS_RTR2RTS, slave); + if (err) + return err; + + update_pkey_index(dev, slave, inbox); + update_gid(dev, inbox, (u8)slave); + adjust_proxy_tun_qkey(dev, vhcr, context); + return mlx4_GEN_QP_wrapper(dev, slave, vhcr, inbox, outbox, cmd); +} + +int mlx4_RTS2RTS_QP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int err; + struct mlx4_qp_context *context = inbox->buf + 8; + + err = adjust_qp_sched_queue(dev, slave, context, inbox); + if (err) + return err; + err = verify_qp_parameters(dev, vhcr, inbox, QP_TRANS_RTS2RTS, slave); + if (err) + return err; + + update_pkey_index(dev, slave, inbox); + update_gid(dev, inbox, (u8)slave); + adjust_proxy_tun_qkey(dev, vhcr, context); + return mlx4_GEN_QP_wrapper(dev, slave, vhcr, inbox, outbox, cmd); +} + + +int mlx4_SQERR2RTS_QP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + struct mlx4_qp_context *context = inbox->buf + 8; + int err = adjust_qp_sched_queue(dev, slave, context, inbox); + if (err) + return err; + adjust_proxy_tun_qkey(dev, vhcr, context); + return mlx4_GEN_QP_wrapper(dev, slave, vhcr, inbox, outbox, cmd); +} + +int mlx4_SQD2SQD_QP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int err; + struct mlx4_qp_context *context = inbox->buf + 8; + + err = adjust_qp_sched_queue(dev, slave, context, inbox); + if (err) + return err; + err = verify_qp_parameters(dev, vhcr, inbox, QP_TRANS_SQD2SQD, slave); + if (err) + return err; + + adjust_proxy_tun_qkey(dev, vhcr, context); + update_gid(dev, inbox, (u8)slave); + update_pkey_index(dev, slave, inbox); + return mlx4_GEN_QP_wrapper(dev, slave, vhcr, inbox, outbox, cmd); +} + +int mlx4_SQD2RTS_QP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int err; + struct mlx4_qp_context *context = inbox->buf + 8; + + err = adjust_qp_sched_queue(dev, slave, context, inbox); + if (err) + return err; + err = verify_qp_parameters(dev, vhcr, inbox, QP_TRANS_SQD2RTS, slave); + if (err) + return err; + + adjust_proxy_tun_qkey(dev, vhcr, context); + update_gid(dev, inbox, (u8)slave); + update_pkey_index(dev, slave, inbox); + return mlx4_GEN_QP_wrapper(dev, slave, vhcr, inbox, outbox, cmd); +} + +int mlx4_2RST_QP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int err; + int qpn = vhcr->in_modifier & 0x7fffff; + struct res_qp *qp; + + err = qp_res_start_move_to(dev, slave, qpn, RES_QP_MAPPED, &qp, 0); + if (err) + return err; + err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd); + if (err) + goto ex_abort; + + atomic_dec(&qp->mtt->ref_count); + atomic_dec(&qp->rcq->ref_count); + atomic_dec(&qp->scq->ref_count); + if (qp->srq) + atomic_dec(&qp->srq->ref_count); + res_end_move(dev, slave, RES_QP, qpn); + return 0; + +ex_abort: + res_abort_move(dev, slave, RES_QP, qpn); + + return err; +} + +static struct res_gid *find_gid(struct mlx4_dev *dev, int slave, + struct res_qp *rqp, u8 *gid) +{ + struct res_gid *res; + + list_for_each_entry(res, &rqp->mcg_list, list) { + if (!memcmp(res->gid, gid, 16)) + return res; + } + return NULL; +} + +static int add_mcg_res(struct mlx4_dev *dev, int slave, struct res_qp *rqp, + u8 *gid, enum mlx4_protocol prot, + enum mlx4_steer_type steer, u64 reg_id) +{ + struct res_gid *res; + int err; + + res = kzalloc(sizeof *res, GFP_KERNEL); + if (!res) + return -ENOMEM; + + spin_lock_irq(&rqp->mcg_spl); + if (find_gid(dev, slave, rqp, gid)) { + kfree(res); + err = -EEXIST; + } else { + memcpy(res->gid, gid, 16); + res->prot = prot; + res->steer = steer; + res->reg_id = reg_id; + list_add_tail(&res->list, &rqp->mcg_list); + err = 0; + } + spin_unlock_irq(&rqp->mcg_spl); + + return err; +} + +static int rem_mcg_res(struct mlx4_dev *dev, int slave, struct res_qp *rqp, + u8 *gid, enum mlx4_protocol prot, + enum mlx4_steer_type steer, u64 *reg_id) +{ + struct res_gid *res; + int err; + + spin_lock_irq(&rqp->mcg_spl); + res = find_gid(dev, slave, rqp, gid); + if (!res || res->prot != prot || res->steer != steer) + err = -EINVAL; + else { + *reg_id = res->reg_id; + list_del(&res->list); + kfree(res); + err = 0; + } + spin_unlock_irq(&rqp->mcg_spl); + + return err; +} + +static int qp_attach(struct mlx4_dev *dev, int slave, struct mlx4_qp *qp, + u8 gid[16], int block_loopback, enum mlx4_protocol prot, + enum mlx4_steer_type type, u64 *reg_id) +{ + switch (dev->caps.steering_mode) { + case MLX4_STEERING_MODE_DEVICE_MANAGED: { + int port = mlx4_slave_convert_port(dev, slave, gid[5]); + if (port < 0) + return port; + return mlx4_trans_to_dmfs_attach(dev, qp, gid, port, + block_loopback, prot, + reg_id); + } + case MLX4_STEERING_MODE_B0: + if (prot == MLX4_PROT_ETH) { + int port = mlx4_slave_convert_port(dev, slave, gid[5]); + if (port < 0) + return port; + gid[5] = port; + } + return mlx4_qp_attach_common(dev, qp, gid, + block_loopback, prot, type); + default: + return -EINVAL; + } +} + +static int qp_detach(struct mlx4_dev *dev, struct mlx4_qp *qp, + u8 gid[16], enum mlx4_protocol prot, + enum mlx4_steer_type type, u64 reg_id) +{ + switch (dev->caps.steering_mode) { + case MLX4_STEERING_MODE_DEVICE_MANAGED: + return mlx4_flow_detach(dev, reg_id); + case MLX4_STEERING_MODE_B0: + return mlx4_qp_detach_common(dev, qp, gid, prot, type); + default: + return -EINVAL; + } +} + +static int mlx4_adjust_port(struct mlx4_dev *dev, int slave, + u8 *gid, enum mlx4_protocol prot) +{ + int real_port; + + if (prot != MLX4_PROT_ETH) + return 0; + + if (dev->caps.steering_mode == MLX4_STEERING_MODE_B0 || + dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED) { + real_port = mlx4_slave_convert_port(dev, slave, gid[5]); + if (real_port < 0) + return -EINVAL; + gid[5] = real_port; + } + + return 0; +} + +int mlx4_QP_ATTACH_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + struct mlx4_qp qp; /* dummy for calling attach/detach */ + u8 *gid = inbox->buf; + enum mlx4_protocol prot = (vhcr->in_modifier >> 28) & 0x7; + int err; + int qpn; + struct res_qp *rqp; + u64 reg_id = 0; + int attach = vhcr->op_modifier; + int block_loopback = vhcr->in_modifier >> 31; + u8 steer_type_mask = 2; + enum mlx4_steer_type type = (gid[7] & steer_type_mask) >> 1; + + qpn = vhcr->in_modifier & 0xffffff; + err = get_res(dev, slave, qpn, RES_QP, &rqp); + if (err) + return err; + + qp.qpn = qpn; + if (attach) { + err = qp_attach(dev, slave, &qp, gid, block_loopback, prot, + type, ®_id); + if (err) { + pr_err("Fail to attach rule to qp 0x%x\n", qpn); + goto ex_put; + } + err = add_mcg_res(dev, slave, rqp, gid, prot, type, reg_id); + if (err) + goto ex_detach; + } else { + err = mlx4_adjust_port(dev, slave, gid, prot); + if (err) + goto ex_put; + + err = rem_mcg_res(dev, slave, rqp, gid, prot, type, ®_id); + if (err) + goto ex_put; + + err = qp_detach(dev, &qp, gid, prot, type, reg_id); + if (err) + pr_err("Fail to detach rule from qp 0x%x reg_id = 0x%llx\n", + qpn, reg_id); + } + put_res(dev, slave, qpn, RES_QP); + return err; + +ex_detach: + qp_detach(dev, &qp, gid, prot, type, reg_id); +ex_put: + put_res(dev, slave, qpn, RES_QP); + return err; +} + +/* + * MAC validation for Flow Steering rules. + * VF can attach rules only with a mac address which is assigned to it. + */ +static int validate_eth_header_mac(int slave, struct _rule_hw *eth_header, + struct list_head *rlist) +{ + struct mac_res *res, *tmp; + __be64 be_mac; + + /* make sure it isn't multicast or broadcast mac*/ + if (!is_multicast_ether_addr(eth_header->eth.dst_mac) && + !is_broadcast_ether_addr(eth_header->eth.dst_mac)) { + list_for_each_entry_safe(res, tmp, rlist, list) { + be_mac = cpu_to_be64(res->mac << 16); + if (ether_addr_equal((u8 *)&be_mac, eth_header->eth.dst_mac)) + return 0; + } + pr_err("MAC %pM doesn't belong to VF %d, Steering rule rejected\n", + eth_header->eth.dst_mac, slave); + return -EINVAL; + } + return 0; +} + +/* + * In case of missing eth header, append eth header with a MAC address + * assigned to the VF. + */ +static int add_eth_header(struct mlx4_dev *dev, int slave, + struct mlx4_cmd_mailbox *inbox, + struct list_head *rlist, int header_id) +{ + struct mac_res *res, *tmp; + u8 port; + struct mlx4_net_trans_rule_hw_ctrl *ctrl; + struct mlx4_net_trans_rule_hw_eth *eth_header; + struct mlx4_net_trans_rule_hw_ipv4 *ip_header; + struct mlx4_net_trans_rule_hw_tcp_udp *l4_header; + __be64 be_mac = 0; + __be64 mac_msk = cpu_to_be64(MLX4_MAC_MASK << 16); + + ctrl = (struct mlx4_net_trans_rule_hw_ctrl *)inbox->buf; + port = ctrl->port; + eth_header = (struct mlx4_net_trans_rule_hw_eth *)(ctrl + 1); + + /* Clear a space in the inbox for eth header */ + switch (header_id) { + case MLX4_NET_TRANS_RULE_ID_IPV4: + ip_header = + (struct mlx4_net_trans_rule_hw_ipv4 *)(eth_header + 1); + memmove(ip_header, eth_header, + sizeof(*ip_header) + sizeof(*l4_header)); + break; + case MLX4_NET_TRANS_RULE_ID_TCP: + case MLX4_NET_TRANS_RULE_ID_UDP: + l4_header = (struct mlx4_net_trans_rule_hw_tcp_udp *) + (eth_header + 1); + memmove(l4_header, eth_header, sizeof(*l4_header)); + break; + default: + return -EINVAL; + } + list_for_each_entry_safe(res, tmp, rlist, list) { + if (port == res->port) { + be_mac = cpu_to_be64(res->mac << 16); + break; + } + } + if (!be_mac) { + pr_err("Failed adding eth header to FS rule, Can't find matching MAC for port %d\n", + port); + return -EINVAL; + } + + memset(eth_header, 0, sizeof(*eth_header)); + eth_header->size = sizeof(*eth_header) >> 2; + eth_header->id = cpu_to_be16(__sw_id_hw[MLX4_NET_TRANS_RULE_ID_ETH]); + memcpy(eth_header->dst_mac, &be_mac, ETH_ALEN); + memcpy(eth_header->dst_mac_msk, &mac_msk, ETH_ALEN); + + return 0; + +} + +#define MLX4_UPD_QP_PATH_MASK_SUPPORTED (1ULL << MLX4_UPD_QP_PATH_MASK_MAC_INDEX) +int mlx4_UPDATE_QP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd_info) +{ + int err; + u32 qpn = vhcr->in_modifier & 0xffffff; + struct res_qp *rqp; + u64 mac; + unsigned port; + u64 pri_addr_path_mask; + struct mlx4_update_qp_context *cmd; + int smac_index; + + cmd = (struct mlx4_update_qp_context *)inbox->buf; + + pri_addr_path_mask = be64_to_cpu(cmd->primary_addr_path_mask); + if (cmd->qp_mask || cmd->secondary_addr_path_mask || + (pri_addr_path_mask & ~MLX4_UPD_QP_PATH_MASK_SUPPORTED)) + return -EPERM; + + /* Just change the smac for the QP */ + err = get_res(dev, slave, qpn, RES_QP, &rqp); + if (err) { + mlx4_err(dev, "Updating qpn 0x%x for slave %d rejected\n", qpn, slave); + return err; + } + + port = (rqp->sched_queue >> 6 & 1) + 1; + + if (pri_addr_path_mask & (1ULL << MLX4_UPD_QP_PATH_MASK_MAC_INDEX)) { + smac_index = cmd->qp_context.pri_path.grh_mylmc; + err = mac_find_smac_ix_in_slave(dev, slave, port, + smac_index, &mac); + + if (err) { + mlx4_err(dev, "Failed to update qpn 0x%x, MAC is invalid. smac_ix: %d\n", + qpn, smac_index); + goto err_mac; + } + } + + err = mlx4_cmd(dev, inbox->dma, + vhcr->in_modifier, 0, + MLX4_CMD_UPDATE_QP, MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_NATIVE); + if (err) { + mlx4_err(dev, "Failed to update qpn on qpn 0x%x, command failed\n", qpn); + goto err_mac; + } + +err_mac: + put_res(dev, slave, qpn, RES_QP); + return err; +} + +int mlx4_QP_FLOW_STEERING_ATTACH_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker; + struct list_head *rlist = &tracker->slave_list[slave].res_list[RES_MAC]; + int err; + int qpn; + struct res_qp *rqp; + struct mlx4_net_trans_rule_hw_ctrl *ctrl; + struct _rule_hw *rule_header; + int header_id; + + if (dev->caps.steering_mode != + MLX4_STEERING_MODE_DEVICE_MANAGED) + return -EOPNOTSUPP; + + ctrl = (struct mlx4_net_trans_rule_hw_ctrl *)inbox->buf; + ctrl->port = mlx4_slave_convert_port(dev, slave, ctrl->port); + if (ctrl->port <= 0) + return -EINVAL; + qpn = be32_to_cpu(ctrl->qpn) & 0xffffff; + err = get_res(dev, slave, qpn, RES_QP, &rqp); + if (err) { + pr_err("Steering rule with qpn 0x%x rejected\n", qpn); + return err; + } + rule_header = (struct _rule_hw *)(ctrl + 1); + header_id = map_hw_to_sw_id(be16_to_cpu(rule_header->id)); + + switch (header_id) { + case MLX4_NET_TRANS_RULE_ID_ETH: + if (validate_eth_header_mac(slave, rule_header, rlist)) { + err = -EINVAL; + goto err_put; + } + break; + case MLX4_NET_TRANS_RULE_ID_IB: + break; + case MLX4_NET_TRANS_RULE_ID_IPV4: + case MLX4_NET_TRANS_RULE_ID_TCP: + case MLX4_NET_TRANS_RULE_ID_UDP: + pr_warn("Can't attach FS rule without L2 headers, adding L2 header\n"); + if (add_eth_header(dev, slave, inbox, rlist, header_id)) { + err = -EINVAL; + goto err_put; + } + vhcr->in_modifier += + sizeof(struct mlx4_net_trans_rule_hw_eth) >> 2; + break; + default: + pr_err("Corrupted mailbox\n"); + err = -EINVAL; + goto err_put; + } + + err = mlx4_cmd_imm(dev, inbox->dma, &vhcr->out_param, + vhcr->in_modifier, 0, + MLX4_QP_FLOW_STEERING_ATTACH, MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_NATIVE); + if (err) + goto err_put; + + err = add_res_range(dev, slave, vhcr->out_param, 1, RES_FS_RULE, qpn); + if (err) { + mlx4_err(dev, "Fail to add flow steering resources\n"); + /* detach rule*/ + mlx4_cmd(dev, vhcr->out_param, 0, 0, + MLX4_QP_FLOW_STEERING_DETACH, MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_NATIVE); + goto err_put; + } + atomic_inc(&rqp->ref_count); +err_put: + put_res(dev, slave, qpn, RES_QP); + return err; +} + +int mlx4_QP_FLOW_STEERING_DETACH_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int err; + struct res_qp *rqp; + struct res_fs_rule *rrule; + + if (dev->caps.steering_mode != + MLX4_STEERING_MODE_DEVICE_MANAGED) + return -EOPNOTSUPP; + + err = get_res(dev, slave, vhcr->in_param, RES_FS_RULE, &rrule); + if (err) + return err; + /* Release the rule form busy state before removal */ + put_res(dev, slave, vhcr->in_param, RES_FS_RULE); + err = get_res(dev, slave, rrule->qpn, RES_QP, &rqp); + if (err) + return err; + + err = rem_res_range(dev, slave, vhcr->in_param, 1, RES_FS_RULE, 0); + if (err) { + mlx4_err(dev, "Fail to remove flow steering resources\n"); + goto out; + } + + err = mlx4_cmd(dev, vhcr->in_param, 0, 0, + MLX4_QP_FLOW_STEERING_DETACH, MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_NATIVE); + if (!err) + atomic_dec(&rqp->ref_count); +out: + put_res(dev, slave, rrule->qpn, RES_QP); + return err; +} + +enum { + BUSY_MAX_RETRIES = 10 +}; + +int mlx4_QUERY_IF_STAT_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int err; + int index = vhcr->in_modifier & 0xffff; + + err = get_res(dev, slave, index, RES_COUNTER, NULL); + if (err) + return err; + + err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd); + put_res(dev, slave, index, RES_COUNTER); + return err; +} + +static void detach_qp(struct mlx4_dev *dev, int slave, struct res_qp *rqp) +{ + struct res_gid *rgid; + struct res_gid *tmp; + struct mlx4_qp qp; /* dummy for calling attach/detach */ + + list_for_each_entry_safe(rgid, tmp, &rqp->mcg_list, list) { + switch (dev->caps.steering_mode) { + case MLX4_STEERING_MODE_DEVICE_MANAGED: + mlx4_flow_detach(dev, rgid->reg_id); + break; + case MLX4_STEERING_MODE_B0: + qp.qpn = rqp->local_qpn; + (void) mlx4_qp_detach_common(dev, &qp, rgid->gid, + rgid->prot, rgid->steer); + break; + } + list_del(&rgid->list); + kfree(rgid); + } +} + +static int _move_all_busy(struct mlx4_dev *dev, int slave, + enum mlx4_resource type, int print) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_resource_tracker *tracker = + &priv->mfunc.master.res_tracker; + struct list_head *rlist = &tracker->slave_list[slave].res_list[type]; + struct res_common *r; + struct res_common *tmp; + int busy; + + busy = 0; + spin_lock_irq(mlx4_tlock(dev)); + list_for_each_entry_safe(r, tmp, rlist, list) { + if (r->owner == slave) { + if (!r->removing) { + if (r->state == RES_ANY_BUSY) { + if (print) + mlx4_dbg(dev, + "%s id 0x%llx is busy\n", + resource_str(type), + r->res_id); + ++busy; + } else { + r->from_state = r->state; + r->state = RES_ANY_BUSY; + r->removing = 1; + } + } + } + } + spin_unlock_irq(mlx4_tlock(dev)); + + return busy; +} + +static int move_all_busy(struct mlx4_dev *dev, int slave, + enum mlx4_resource type) +{ + unsigned long begin; + int busy; + + begin = jiffies; + do { + busy = _move_all_busy(dev, slave, type, 0); + if (time_after(jiffies, begin + 5 * HZ)) + break; + if (busy) + cond_resched(); + } while (busy); + + if (busy) + busy = _move_all_busy(dev, slave, type, 1); + + return busy; +} +static void rem_slave_qps(struct mlx4_dev *dev, int slave) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker; + struct list_head *qp_list = + &tracker->slave_list[slave].res_list[RES_QP]; + struct res_qp *qp; + struct res_qp *tmp; + int state; + u64 in_param; + int qpn; + int err; + + err = move_all_busy(dev, slave, RES_QP); + if (err) + mlx4_warn(dev, "rem_slave_qps: Could not move all qps to busy for slave %d\n", + slave); + + spin_lock_irq(mlx4_tlock(dev)); + list_for_each_entry_safe(qp, tmp, qp_list, com.list) { + spin_unlock_irq(mlx4_tlock(dev)); + if (qp->com.owner == slave) { + qpn = qp->com.res_id; + detach_qp(dev, slave, qp); + state = qp->com.from_state; + while (state != 0) { + switch (state) { + case RES_QP_RESERVED: + spin_lock_irq(mlx4_tlock(dev)); + rb_erase(&qp->com.node, + &tracker->res_tree[RES_QP]); + list_del(&qp->com.list); + spin_unlock_irq(mlx4_tlock(dev)); + if (!valid_reserved(dev, slave, qpn)) { + __mlx4_qp_release_range(dev, qpn, 1); + mlx4_release_resource(dev, slave, + RES_QP, 1, 0); + } + kfree(qp); + state = 0; + break; + case RES_QP_MAPPED: + if (!valid_reserved(dev, slave, qpn)) + __mlx4_qp_free_icm(dev, qpn); + state = RES_QP_RESERVED; + break; + case RES_QP_HW: + in_param = slave; + err = mlx4_cmd(dev, in_param, + qp->local_qpn, 2, + MLX4_CMD_2RST_QP, + MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_NATIVE); + if (err) + mlx4_dbg(dev, "rem_slave_qps: failed to move slave %d qpn %d to reset\n", + slave, qp->local_qpn); + atomic_dec(&qp->rcq->ref_count); + atomic_dec(&qp->scq->ref_count); + atomic_dec(&qp->mtt->ref_count); + if (qp->srq) + atomic_dec(&qp->srq->ref_count); + state = RES_QP_MAPPED; + break; + default: + state = 0; + } + } + } + spin_lock_irq(mlx4_tlock(dev)); + } + spin_unlock_irq(mlx4_tlock(dev)); +} + +static void rem_slave_srqs(struct mlx4_dev *dev, int slave) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker; + struct list_head *srq_list = + &tracker->slave_list[slave].res_list[RES_SRQ]; + struct res_srq *srq; + struct res_srq *tmp; + int state; + u64 in_param; + LIST_HEAD(tlist); + int srqn; + int err; + + err = move_all_busy(dev, slave, RES_SRQ); + if (err) + mlx4_warn(dev, "rem_slave_srqs: Could not move all srqs - too busy for slave %d\n", + slave); + + spin_lock_irq(mlx4_tlock(dev)); + list_for_each_entry_safe(srq, tmp, srq_list, com.list) { + spin_unlock_irq(mlx4_tlock(dev)); + if (srq->com.owner == slave) { + srqn = srq->com.res_id; + state = srq->com.from_state; + while (state != 0) { + switch (state) { + case RES_SRQ_ALLOCATED: + __mlx4_srq_free_icm(dev, srqn); + spin_lock_irq(mlx4_tlock(dev)); + rb_erase(&srq->com.node, + &tracker->res_tree[RES_SRQ]); + list_del(&srq->com.list); + spin_unlock_irq(mlx4_tlock(dev)); + mlx4_release_resource(dev, slave, + RES_SRQ, 1, 0); + kfree(srq); + state = 0; + break; + + case RES_SRQ_HW: + in_param = slave; + err = mlx4_cmd(dev, in_param, srqn, 1, + MLX4_CMD_HW2SW_SRQ, + MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_NATIVE); + if (err) + mlx4_dbg(dev, "rem_slave_srqs: failed to move slave %d srq %d to SW ownership\n", + slave, srqn); + + atomic_dec(&srq->mtt->ref_count); + if (srq->cq) + atomic_dec(&srq->cq->ref_count); + state = RES_SRQ_ALLOCATED; + break; + + default: + state = 0; + } + } + } + spin_lock_irq(mlx4_tlock(dev)); + } + spin_unlock_irq(mlx4_tlock(dev)); +} + +static void rem_slave_cqs(struct mlx4_dev *dev, int slave) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker; + struct list_head *cq_list = + &tracker->slave_list[slave].res_list[RES_CQ]; + struct res_cq *cq; + struct res_cq *tmp; + int state; + u64 in_param; + LIST_HEAD(tlist); + int cqn; + int err; + + err = move_all_busy(dev, slave, RES_CQ); + if (err) + mlx4_warn(dev, "rem_slave_cqs: Could not move all cqs - too busy for slave %d\n", + slave); + + spin_lock_irq(mlx4_tlock(dev)); + list_for_each_entry_safe(cq, tmp, cq_list, com.list) { + spin_unlock_irq(mlx4_tlock(dev)); + if (cq->com.owner == slave && !atomic_read(&cq->ref_count)) { + cqn = cq->com.res_id; + state = cq->com.from_state; + while (state != 0) { + switch (state) { + case RES_CQ_ALLOCATED: + __mlx4_cq_free_icm(dev, cqn); + spin_lock_irq(mlx4_tlock(dev)); + rb_erase(&cq->com.node, + &tracker->res_tree[RES_CQ]); + list_del(&cq->com.list); + spin_unlock_irq(mlx4_tlock(dev)); + mlx4_release_resource(dev, slave, + RES_CQ, 1, 0); + kfree(cq); + state = 0; + break; + + case RES_CQ_HW: + in_param = slave; + err = mlx4_cmd(dev, in_param, cqn, 1, + MLX4_CMD_HW2SW_CQ, + MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_NATIVE); + if (err) + mlx4_dbg(dev, "rem_slave_cqs: failed to move slave %d cq %d to SW ownership\n", + slave, cqn); + atomic_dec(&cq->mtt->ref_count); + state = RES_CQ_ALLOCATED; + break; + + default: + state = 0; + } + } + } + spin_lock_irq(mlx4_tlock(dev)); + } + spin_unlock_irq(mlx4_tlock(dev)); +} + +static void rem_slave_mrs(struct mlx4_dev *dev, int slave) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker; + struct list_head *mpt_list = + &tracker->slave_list[slave].res_list[RES_MPT]; + struct res_mpt *mpt; + struct res_mpt *tmp; + int state; + u64 in_param; + LIST_HEAD(tlist); + int mptn; + int err; + + err = move_all_busy(dev, slave, RES_MPT); + if (err) + mlx4_warn(dev, "rem_slave_mrs: Could not move all mpts - too busy for slave %d\n", + slave); + + spin_lock_irq(mlx4_tlock(dev)); + list_for_each_entry_safe(mpt, tmp, mpt_list, com.list) { + spin_unlock_irq(mlx4_tlock(dev)); + if (mpt->com.owner == slave) { + mptn = mpt->com.res_id; + state = mpt->com.from_state; + while (state != 0) { + switch (state) { + case RES_MPT_RESERVED: + __mlx4_mpt_release(dev, mpt->key); + spin_lock_irq(mlx4_tlock(dev)); + rb_erase(&mpt->com.node, + &tracker->res_tree[RES_MPT]); + list_del(&mpt->com.list); + spin_unlock_irq(mlx4_tlock(dev)); + mlx4_release_resource(dev, slave, + RES_MPT, 1, 0); + kfree(mpt); + state = 0; + break; + + case RES_MPT_MAPPED: + __mlx4_mpt_free_icm(dev, mpt->key); + state = RES_MPT_RESERVED; + break; + + case RES_MPT_HW: + in_param = slave; + err = mlx4_cmd(dev, in_param, mptn, 0, + MLX4_CMD_HW2SW_MPT, + MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_NATIVE); + if (err) + mlx4_dbg(dev, "rem_slave_mrs: failed to move slave %d mpt %d to SW ownership\n", + slave, mptn); + if (mpt->mtt) + atomic_dec(&mpt->mtt->ref_count); + state = RES_MPT_MAPPED; + break; + default: + state = 0; + } + } + } + spin_lock_irq(mlx4_tlock(dev)); + } + spin_unlock_irq(mlx4_tlock(dev)); +} + +static void rem_slave_mtts(struct mlx4_dev *dev, int slave) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_resource_tracker *tracker = + &priv->mfunc.master.res_tracker; + struct list_head *mtt_list = + &tracker->slave_list[slave].res_list[RES_MTT]; + struct res_mtt *mtt; + struct res_mtt *tmp; + int state; + LIST_HEAD(tlist); + int base; + int err; + + err = move_all_busy(dev, slave, RES_MTT); + if (err) + mlx4_warn(dev, "rem_slave_mtts: Could not move all mtts - too busy for slave %d\n", + slave); + + spin_lock_irq(mlx4_tlock(dev)); + list_for_each_entry_safe(mtt, tmp, mtt_list, com.list) { + spin_unlock_irq(mlx4_tlock(dev)); + if (mtt->com.owner == slave) { + base = mtt->com.res_id; + state = mtt->com.from_state; + while (state != 0) { + switch (state) { + case RES_MTT_ALLOCATED: + __mlx4_free_mtt_range(dev, base, + mtt->order); + spin_lock_irq(mlx4_tlock(dev)); + rb_erase(&mtt->com.node, + &tracker->res_tree[RES_MTT]); + list_del(&mtt->com.list); + spin_unlock_irq(mlx4_tlock(dev)); + mlx4_release_resource(dev, slave, RES_MTT, + 1 << mtt->order, 0); + kfree(mtt); + state = 0; + break; + + default: + state = 0; + } + } + } + spin_lock_irq(mlx4_tlock(dev)); + } + spin_unlock_irq(mlx4_tlock(dev)); +} + +static void rem_slave_fs_rule(struct mlx4_dev *dev, int slave) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_resource_tracker *tracker = + &priv->mfunc.master.res_tracker; + struct list_head *fs_rule_list = + &tracker->slave_list[slave].res_list[RES_FS_RULE]; + struct res_fs_rule *fs_rule; + struct res_fs_rule *tmp; + int state; + u64 base; + int err; + + err = move_all_busy(dev, slave, RES_FS_RULE); + if (err) + mlx4_warn(dev, "rem_slave_fs_rule: Could not move all mtts to busy for slave %d\n", + slave); + + spin_lock_irq(mlx4_tlock(dev)); + list_for_each_entry_safe(fs_rule, tmp, fs_rule_list, com.list) { + spin_unlock_irq(mlx4_tlock(dev)); + if (fs_rule->com.owner == slave) { + base = fs_rule->com.res_id; + state = fs_rule->com.from_state; + while (state != 0) { + switch (state) { + case RES_FS_RULE_ALLOCATED: + /* detach rule */ + err = mlx4_cmd(dev, base, 0, 0, + MLX4_QP_FLOW_STEERING_DETACH, + MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_NATIVE); + + spin_lock_irq(mlx4_tlock(dev)); + rb_erase(&fs_rule->com.node, + &tracker->res_tree[RES_FS_RULE]); + list_del(&fs_rule->com.list); + spin_unlock_irq(mlx4_tlock(dev)); + kfree(fs_rule); + state = 0; + break; + + default: + state = 0; + } + } + } + spin_lock_irq(mlx4_tlock(dev)); + } + spin_unlock_irq(mlx4_tlock(dev)); +} + +static void rem_slave_eqs(struct mlx4_dev *dev, int slave) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker; + struct list_head *eq_list = + &tracker->slave_list[slave].res_list[RES_EQ]; + struct res_eq *eq; + struct res_eq *tmp; + int err; + int state; + LIST_HEAD(tlist); + int eqn; + struct mlx4_cmd_mailbox *mailbox; + + err = move_all_busy(dev, slave, RES_EQ); + if (err) + mlx4_warn(dev, "rem_slave_eqs: Could not move all eqs - too busy for slave %d\n", + slave); + + spin_lock_irq(mlx4_tlock(dev)); + list_for_each_entry_safe(eq, tmp, eq_list, com.list) { + spin_unlock_irq(mlx4_tlock(dev)); + if (eq->com.owner == slave) { + eqn = eq->com.res_id; + state = eq->com.from_state; + while (state != 0) { + switch (state) { + case RES_EQ_RESERVED: + spin_lock_irq(mlx4_tlock(dev)); + rb_erase(&eq->com.node, + &tracker->res_tree[RES_EQ]); + list_del(&eq->com.list); + spin_unlock_irq(mlx4_tlock(dev)); + kfree(eq); + state = 0; + break; + + case RES_EQ_HW: + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) { + cond_resched(); + continue; + } + err = mlx4_cmd_box(dev, slave, 0, + eqn & 0xff, 0, + MLX4_CMD_HW2SW_EQ, + MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_NATIVE); + if (err) + mlx4_dbg(dev, "rem_slave_eqs: failed to move slave %d eqs %d to SW ownership\n", + slave, eqn); + mlx4_free_cmd_mailbox(dev, mailbox); + atomic_dec(&eq->mtt->ref_count); + state = RES_EQ_RESERVED; + break; + + default: + state = 0; + } + } + } + spin_lock_irq(mlx4_tlock(dev)); + } + spin_unlock_irq(mlx4_tlock(dev)); +} + +static void rem_slave_counters(struct mlx4_dev *dev, int slave) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker; + struct list_head *counter_list = + &tracker->slave_list[slave].res_list[RES_COUNTER]; + struct res_counter *counter; + struct res_counter *tmp; + int err; + int index; + + err = move_all_busy(dev, slave, RES_COUNTER); + if (err) + mlx4_warn(dev, "rem_slave_counters: Could not move all counters - too busy for slave %d\n", + slave); + + spin_lock_irq(mlx4_tlock(dev)); + list_for_each_entry_safe(counter, tmp, counter_list, com.list) { + if (counter->com.owner == slave) { + index = counter->com.res_id; + rb_erase(&counter->com.node, + &tracker->res_tree[RES_COUNTER]); + list_del(&counter->com.list); + kfree(counter); + __mlx4_counter_free(dev, index); + mlx4_release_resource(dev, slave, RES_COUNTER, 1, 0); + } + } + spin_unlock_irq(mlx4_tlock(dev)); +} + +static void rem_slave_xrcdns(struct mlx4_dev *dev, int slave) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker; + struct list_head *xrcdn_list = + &tracker->slave_list[slave].res_list[RES_XRCD]; + struct res_xrcdn *xrcd; + struct res_xrcdn *tmp; + int err; + int xrcdn; + + err = move_all_busy(dev, slave, RES_XRCD); + if (err) + mlx4_warn(dev, "rem_slave_xrcdns: Could not move all xrcdns - too busy for slave %d\n", + slave); + + spin_lock_irq(mlx4_tlock(dev)); + list_for_each_entry_safe(xrcd, tmp, xrcdn_list, com.list) { + if (xrcd->com.owner == slave) { + xrcdn = xrcd->com.res_id; + rb_erase(&xrcd->com.node, &tracker->res_tree[RES_XRCD]); + list_del(&xrcd->com.list); + kfree(xrcd); + __mlx4_xrcd_free(dev, xrcdn); + } + } + spin_unlock_irq(mlx4_tlock(dev)); +} + +void mlx4_delete_all_resources_for_slave(struct mlx4_dev *dev, int slave) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + mlx4_reset_roce_gids(dev, slave); + mutex_lock(&priv->mfunc.master.res_tracker.slave_list[slave].mutex); + rem_slave_vlans(dev, slave); + rem_slave_macs(dev, slave); + rem_slave_fs_rule(dev, slave); + rem_slave_qps(dev, slave); + rem_slave_srqs(dev, slave); + rem_slave_cqs(dev, slave); + rem_slave_mrs(dev, slave); + rem_slave_eqs(dev, slave); + rem_slave_mtts(dev, slave); + rem_slave_counters(dev, slave); + rem_slave_xrcdns(dev, slave); + mutex_unlock(&priv->mfunc.master.res_tracker.slave_list[slave].mutex); +} + +void mlx4_vf_immed_vlan_work_handler(struct work_struct *_work) +{ + struct mlx4_vf_immed_vlan_work *work = + container_of(_work, struct mlx4_vf_immed_vlan_work, work); + struct mlx4_cmd_mailbox *mailbox; + struct mlx4_update_qp_context *upd_context; + struct mlx4_dev *dev = &work->priv->dev; + struct mlx4_resource_tracker *tracker = + &work->priv->mfunc.master.res_tracker; + struct list_head *qp_list = + &tracker->slave_list[work->slave].res_list[RES_QP]; + struct res_qp *qp; + struct res_qp *tmp; + u64 qp_path_mask_vlan_ctrl = + ((1ULL << MLX4_UPD_QP_PATH_MASK_ETH_TX_BLOCK_UNTAGGED) | + (1ULL << MLX4_UPD_QP_PATH_MASK_ETH_TX_BLOCK_1P) | + (1ULL << MLX4_UPD_QP_PATH_MASK_ETH_TX_BLOCK_TAGGED) | + (1ULL << MLX4_UPD_QP_PATH_MASK_ETH_RX_BLOCK_UNTAGGED) | + (1ULL << MLX4_UPD_QP_PATH_MASK_ETH_RX_BLOCK_1P) | + (1ULL << MLX4_UPD_QP_PATH_MASK_ETH_RX_BLOCK_TAGGED)); + + u64 qp_path_mask = ((1ULL << MLX4_UPD_QP_PATH_MASK_VLAN_INDEX) | + (1ULL << MLX4_UPD_QP_PATH_MASK_FVL) | + (1ULL << MLX4_UPD_QP_PATH_MASK_CV) | + (1ULL << MLX4_UPD_QP_PATH_MASK_ETH_HIDE_CQE_VLAN) | + (1ULL << MLX4_UPD_QP_PATH_MASK_FEUP) | + (1ULL << MLX4_UPD_QP_PATH_MASK_FVL_RX) | + (1ULL << MLX4_UPD_QP_PATH_MASK_SCHED_QUEUE)); + + int err; + int port, errors = 0; + u8 vlan_control; + + if (mlx4_is_slave(dev)) { + mlx4_warn(dev, "Trying to update-qp in slave %d\n", + work->slave); + goto out; + } + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + goto out; + if (work->flags & MLX4_VF_IMMED_VLAN_FLAG_LINK_DISABLE) /* block all */ + vlan_control = MLX4_VLAN_CTRL_ETH_TX_BLOCK_TAGGED | + MLX4_VLAN_CTRL_ETH_TX_BLOCK_PRIO_TAGGED | + MLX4_VLAN_CTRL_ETH_TX_BLOCK_UNTAGGED | + MLX4_VLAN_CTRL_ETH_RX_BLOCK_PRIO_TAGGED | + MLX4_VLAN_CTRL_ETH_RX_BLOCK_UNTAGGED | + MLX4_VLAN_CTRL_ETH_RX_BLOCK_TAGGED; + else if (!work->vlan_id) + vlan_control = MLX4_VLAN_CTRL_ETH_TX_BLOCK_TAGGED | + MLX4_VLAN_CTRL_ETH_RX_BLOCK_TAGGED; + else + vlan_control = MLX4_VLAN_CTRL_ETH_TX_BLOCK_TAGGED | + MLX4_VLAN_CTRL_ETH_RX_BLOCK_PRIO_TAGGED | + MLX4_VLAN_CTRL_ETH_RX_BLOCK_UNTAGGED; + + upd_context = mailbox->buf; + upd_context->qp_mask = cpu_to_be64(1ULL << MLX4_UPD_QP_MASK_VSD); + + spin_lock_irq(mlx4_tlock(dev)); + list_for_each_entry_safe(qp, tmp, qp_list, com.list) { + spin_unlock_irq(mlx4_tlock(dev)); + if (qp->com.owner == work->slave) { + if (qp->com.from_state != RES_QP_HW || + !qp->sched_queue || /* no INIT2RTR trans yet */ + mlx4_is_qp_reserved(dev, qp->local_qpn) || + qp->qpc_flags & (1 << MLX4_RSS_QPC_FLAG_OFFSET)) { + spin_lock_irq(mlx4_tlock(dev)); + continue; + } + port = (qp->sched_queue >> 6 & 1) + 1; + if (port != work->port) { + spin_lock_irq(mlx4_tlock(dev)); + continue; + } + if (MLX4_QP_ST_RC == ((qp->qpc_flags >> 16) & 0xff)) + upd_context->primary_addr_path_mask = cpu_to_be64(qp_path_mask); + else + upd_context->primary_addr_path_mask = + cpu_to_be64(qp_path_mask | qp_path_mask_vlan_ctrl); + if (work->vlan_id == MLX4_VGT) { + upd_context->qp_context.param3 = qp->param3; + upd_context->qp_context.pri_path.vlan_control = qp->vlan_control; + upd_context->qp_context.pri_path.fvl_rx = qp->fvl_rx; + upd_context->qp_context.pri_path.vlan_index = qp->vlan_index; + upd_context->qp_context.pri_path.fl = qp->pri_path_fl; + upd_context->qp_context.pri_path.feup = qp->feup; + upd_context->qp_context.pri_path.sched_queue = + qp->sched_queue; + } else { + upd_context->qp_context.param3 = qp->param3 & ~cpu_to_be32(MLX4_STRIP_VLAN); + upd_context->qp_context.pri_path.vlan_control = vlan_control; + upd_context->qp_context.pri_path.vlan_index = work->vlan_ix; + upd_context->qp_context.pri_path.fvl_rx = + qp->fvl_rx | MLX4_FVL_RX_FORCE_ETH_VLAN; + upd_context->qp_context.pri_path.fl = + qp->pri_path_fl | MLX4_FL_CV | MLX4_FL_ETH_HIDE_CQE_VLAN; + upd_context->qp_context.pri_path.feup = + qp->feup | MLX4_FEUP_FORCE_ETH_UP | MLX4_FVL_FORCE_ETH_VLAN; + upd_context->qp_context.pri_path.sched_queue = + qp->sched_queue & 0xC7; + upd_context->qp_context.pri_path.sched_queue |= + ((work->qos & 0x7) << 3); + } + + err = mlx4_cmd(dev, mailbox->dma, + qp->local_qpn & 0xffffff, + 0, MLX4_CMD_UPDATE_QP, + MLX4_CMD_TIME_CLASS_C, MLX4_CMD_NATIVE); + if (err) { + mlx4_info(dev, "UPDATE_QP failed for slave %d, port %d, qpn %d (%d)\n", + work->slave, port, qp->local_qpn, err); + errors++; + } + } + spin_lock_irq(mlx4_tlock(dev)); + } + spin_unlock_irq(mlx4_tlock(dev)); + mlx4_free_cmd_mailbox(dev, mailbox); + + if (errors) + mlx4_err(dev, "%d UPDATE_QP failures for slave %d, port %d\n", + errors, work->slave, work->port); + + /* unregister previous vlan_id if needed and we had no errors + * while updating the QPs + */ + if (work->flags & MLX4_VF_IMMED_VLAN_FLAG_VLAN && !errors && + NO_INDX != work->orig_vlan_ix) + __mlx4_unregister_vlan(&work->priv->dev, work->port, + work->orig_vlan_id); +out: + kfree(work); + return; +} diff --git a/drivers/net/ethernet/mellanox/mlx4/sense.c b/drivers/net/ethernet/mellanox/mlx4/sense.c new file mode 100644 index 0000000..094773d --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4/sense.c @@ -0,0 +1,143 @@ +/* + * Copyright (c) 2007 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include +#include + +#include + +#include "mlx4.h" + +int mlx4_SENSE_PORT(struct mlx4_dev *dev, int port, + enum mlx4_port_type *type) +{ + u64 out_param; + int err = 0; + + err = mlx4_cmd_imm(dev, 0, &out_param, port, 0, + MLX4_CMD_SENSE_PORT, MLX4_CMD_TIME_CLASS_B, + MLX4_CMD_WRAPPED); + if (err) { + mlx4_err(dev, "Sense command failed for port: %d\n", port); + return err; + } + + if (out_param > 2) { + mlx4_err(dev, "Sense returned illegal value: 0x%llx\n", out_param); + return -EINVAL; + } + + *type = out_param; + return 0; +} + +void mlx4_do_sense_ports(struct mlx4_dev *dev, + enum mlx4_port_type *stype, + enum mlx4_port_type *defaults) +{ + struct mlx4_sense *sense = &mlx4_priv(dev)->sense; + int err; + int i; + + for (i = 1; i <= dev->caps.num_ports; i++) { + stype[i - 1] = 0; + if (sense->do_sense_port[i] && sense->sense_allowed[i] && + dev->caps.possible_type[i] == MLX4_PORT_TYPE_AUTO) { + err = mlx4_SENSE_PORT(dev, i, &stype[i - 1]); + if (err) + stype[i - 1] = defaults[i - 1]; + } else + stype[i - 1] = defaults[i - 1]; + } + + /* + * If sensed nothing, remain in current configuration. + */ + for (i = 0; i < dev->caps.num_ports; i++) + stype[i] = stype[i] ? stype[i] : defaults[i]; + +} + +static void mlx4_sense_port(struct work_struct *work) +{ + struct delayed_work *delay = to_delayed_work(work); + struct mlx4_sense *sense = container_of(delay, struct mlx4_sense, + sense_poll); + struct mlx4_dev *dev = sense->dev; + struct mlx4_priv *priv = mlx4_priv(dev); + enum mlx4_port_type stype[MLX4_MAX_PORTS]; + + mutex_lock(&priv->port_mutex); + mlx4_do_sense_ports(dev, stype, &dev->caps.port_type[1]); + + if (mlx4_check_port_params(dev, stype)) + goto sense_again; + + if (mlx4_change_port_types(dev, stype)) + mlx4_err(dev, "Failed to change port_types\n"); + +sense_again: + mutex_unlock(&priv->port_mutex); + queue_delayed_work(mlx4_wq , &sense->sense_poll, + round_jiffies_relative(MLX4_SENSE_RANGE)); +} + +void mlx4_start_sense(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_sense *sense = &priv->sense; + + if (!(dev->caps.flags & MLX4_DEV_CAP_FLAG_DPDP)) + return; + + queue_delayed_work(mlx4_wq , &sense->sense_poll, + round_jiffies_relative(MLX4_SENSE_RANGE)); +} + +void mlx4_stop_sense(struct mlx4_dev *dev) +{ + cancel_delayed_work_sync(&mlx4_priv(dev)->sense.sense_poll); +} + +void mlx4_sense_init(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_sense *sense = &priv->sense; + int port; + + sense->dev = dev; + for (port = 1; port <= dev->caps.num_ports; port++) + sense->do_sense_port[port] = 1; + + INIT_DEFERRABLE_WORK(&sense->sense_poll, mlx4_sense_port); +} diff --git a/drivers/net/ethernet/mellanox/mlx4/srq.c b/drivers/net/ethernet/mellanox/mlx4/srq.c new file mode 100644 index 0000000..6714662 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4/srq.c @@ -0,0 +1,313 @@ +/* + * Copyright (c) 2006, 2007 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#include +#include +#include +#include + +#include "mlx4.h" +#include "icm.h" + +void mlx4_srq_event(struct mlx4_dev *dev, u32 srqn, int event_type) +{ + struct mlx4_srq_table *srq_table = &mlx4_priv(dev)->srq_table; + struct mlx4_srq *srq; + + spin_lock(&srq_table->lock); + + srq = radix_tree_lookup(&srq_table->tree, srqn & (dev->caps.num_srqs - 1)); + if (srq) + atomic_inc(&srq->refcount); + + spin_unlock(&srq_table->lock); + + if (!srq) { + mlx4_warn(dev, "Async event for bogus SRQ %08x\n", srqn); + return; + } + + srq->event(srq, event_type); + + if (atomic_dec_and_test(&srq->refcount)) + complete(&srq->free); +} + +static int mlx4_SW2HW_SRQ(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox, + int srq_num) +{ + return mlx4_cmd(dev, mailbox->dma, srq_num, 0, + MLX4_CMD_SW2HW_SRQ, MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_WRAPPED); +} + +static int mlx4_HW2SW_SRQ(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox, + int srq_num) +{ + return mlx4_cmd_box(dev, 0, mailbox ? mailbox->dma : 0, srq_num, + mailbox ? 0 : 1, MLX4_CMD_HW2SW_SRQ, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED); +} + +static int mlx4_ARM_SRQ(struct mlx4_dev *dev, int srq_num, int limit_watermark) +{ + return mlx4_cmd(dev, limit_watermark, srq_num, 0, MLX4_CMD_ARM_SRQ, + MLX4_CMD_TIME_CLASS_B, MLX4_CMD_WRAPPED); +} + +static int mlx4_QUERY_SRQ(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox, + int srq_num) +{ + return mlx4_cmd_box(dev, 0, mailbox->dma, srq_num, 0, MLX4_CMD_QUERY_SRQ, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED); +} + +int __mlx4_srq_alloc_icm(struct mlx4_dev *dev, int *srqn) +{ + struct mlx4_srq_table *srq_table = &mlx4_priv(dev)->srq_table; + int err; + + + *srqn = mlx4_bitmap_alloc(&srq_table->bitmap); + if (*srqn == -1) + return -ENOMEM; + + err = mlx4_table_get(dev, &srq_table->table, *srqn, GFP_KERNEL); + if (err) + goto err_out; + + err = mlx4_table_get(dev, &srq_table->cmpt_table, *srqn, GFP_KERNEL); + if (err) + goto err_put; + return 0; + +err_put: + mlx4_table_put(dev, &srq_table->table, *srqn); + +err_out: + mlx4_bitmap_free(&srq_table->bitmap, *srqn, MLX4_NO_RR); + return err; +} + +static int mlx4_srq_alloc_icm(struct mlx4_dev *dev, int *srqn) +{ + u64 out_param; + int err; + + if (mlx4_is_mfunc(dev)) { + err = mlx4_cmd_imm(dev, 0, &out_param, RES_SRQ, + RES_OP_RESERVE_AND_MAP, + MLX4_CMD_ALLOC_RES, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED); + if (!err) + *srqn = get_param_l(&out_param); + + return err; + } + return __mlx4_srq_alloc_icm(dev, srqn); +} + +void __mlx4_srq_free_icm(struct mlx4_dev *dev, int srqn) +{ + struct mlx4_srq_table *srq_table = &mlx4_priv(dev)->srq_table; + + mlx4_table_put(dev, &srq_table->cmpt_table, srqn); + mlx4_table_put(dev, &srq_table->table, srqn); + mlx4_bitmap_free(&srq_table->bitmap, srqn, MLX4_NO_RR); +} + +static void mlx4_srq_free_icm(struct mlx4_dev *dev, int srqn) +{ + u64 in_param = 0; + + if (mlx4_is_mfunc(dev)) { + set_param_l(&in_param, srqn); + if (mlx4_cmd(dev, in_param, RES_SRQ, RES_OP_RESERVE_AND_MAP, + MLX4_CMD_FREE_RES, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED)) + mlx4_warn(dev, "Failed freeing cq:%d\n", srqn); + return; + } + __mlx4_srq_free_icm(dev, srqn); +} + +int mlx4_srq_alloc(struct mlx4_dev *dev, u32 pdn, u32 cqn, u16 xrcd, + struct mlx4_mtt *mtt, u64 db_rec, struct mlx4_srq *srq) +{ + struct mlx4_srq_table *srq_table = &mlx4_priv(dev)->srq_table; + struct mlx4_cmd_mailbox *mailbox; + struct mlx4_srq_context *srq_context; + u64 mtt_addr; + int err; + + err = mlx4_srq_alloc_icm(dev, &srq->srqn); + if (err) + return err; + + spin_lock_irq(&srq_table->lock); + err = radix_tree_insert(&srq_table->tree, srq->srqn, srq); + spin_unlock_irq(&srq_table->lock); + if (err) + goto err_icm; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) { + err = PTR_ERR(mailbox); + goto err_radix; + } + + srq_context = mailbox->buf; + srq_context->state_logsize_srqn = cpu_to_be32((ilog2(srq->max) << 24) | + srq->srqn); + srq_context->logstride = srq->wqe_shift - 4; + srq_context->xrcd = cpu_to_be16(xrcd); + srq_context->pg_offset_cqn = cpu_to_be32(cqn & 0xffffff); + srq_context->log_page_size = mtt->page_shift - MLX4_ICM_PAGE_SHIFT; + + mtt_addr = mlx4_mtt_addr(dev, mtt); + srq_context->mtt_base_addr_h = mtt_addr >> 32; + srq_context->mtt_base_addr_l = cpu_to_be32(mtt_addr & 0xffffffff); + srq_context->pd = cpu_to_be32(pdn); + srq_context->db_rec_addr = cpu_to_be64(db_rec); + + err = mlx4_SW2HW_SRQ(dev, mailbox, srq->srqn); + mlx4_free_cmd_mailbox(dev, mailbox); + if (err) + goto err_radix; + + atomic_set(&srq->refcount, 1); + init_completion(&srq->free); + + return 0; + +err_radix: + spin_lock_irq(&srq_table->lock); + radix_tree_delete(&srq_table->tree, srq->srqn); + spin_unlock_irq(&srq_table->lock); + +err_icm: + mlx4_srq_free_icm(dev, srq->srqn); + return err; +} +EXPORT_SYMBOL_GPL(mlx4_srq_alloc); + +void mlx4_srq_free(struct mlx4_dev *dev, struct mlx4_srq *srq) +{ + struct mlx4_srq_table *srq_table = &mlx4_priv(dev)->srq_table; + int err; + + err = mlx4_HW2SW_SRQ(dev, NULL, srq->srqn); + if (err) + mlx4_warn(dev, "HW2SW_SRQ failed (%d) for SRQN %06x\n", err, srq->srqn); + + spin_lock_irq(&srq_table->lock); + radix_tree_delete(&srq_table->tree, srq->srqn); + spin_unlock_irq(&srq_table->lock); + + if (atomic_dec_and_test(&srq->refcount)) + complete(&srq->free); + wait_for_completion(&srq->free); + + mlx4_srq_free_icm(dev, srq->srqn); +} +EXPORT_SYMBOL_GPL(mlx4_srq_free); + +int mlx4_srq_arm(struct mlx4_dev *dev, struct mlx4_srq *srq, int limit_watermark) +{ + return mlx4_ARM_SRQ(dev, srq->srqn, limit_watermark); +} +EXPORT_SYMBOL_GPL(mlx4_srq_arm); + +int mlx4_srq_query(struct mlx4_dev *dev, struct mlx4_srq *srq, int *limit_watermark) +{ + struct mlx4_cmd_mailbox *mailbox; + struct mlx4_srq_context *srq_context; + int err; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + + srq_context = mailbox->buf; + + err = mlx4_QUERY_SRQ(dev, mailbox, srq->srqn); + if (err) + goto err_out; + *limit_watermark = be16_to_cpu(srq_context->limit_watermark); + +err_out: + mlx4_free_cmd_mailbox(dev, mailbox); + return err; +} +EXPORT_SYMBOL_GPL(mlx4_srq_query); + +int mlx4_init_srq_table(struct mlx4_dev *dev) +{ + struct mlx4_srq_table *srq_table = &mlx4_priv(dev)->srq_table; + int err; + + spin_lock_init(&srq_table->lock); + INIT_RADIX_TREE(&srq_table->tree, GFP_ATOMIC); + if (mlx4_is_slave(dev)) + return 0; + + err = mlx4_bitmap_init(&srq_table->bitmap, dev->caps.num_srqs, + dev->caps.num_srqs - 1, dev->caps.reserved_srqs, 0); + if (err) + return err; + + return 0; +} + +void mlx4_cleanup_srq_table(struct mlx4_dev *dev) +{ + if (mlx4_is_slave(dev)) + return; + mlx4_bitmap_cleanup(&mlx4_priv(dev)->srq_table.bitmap); +} + +struct mlx4_srq *mlx4_srq_lookup(struct mlx4_dev *dev, u32 srqn) +{ + struct mlx4_srq_table *srq_table = &mlx4_priv(dev)->srq_table; + struct mlx4_srq *srq; + unsigned long flags; + + spin_lock_irqsave(&srq_table->lock, flags); + srq = radix_tree_lookup(&srq_table->tree, + srqn & (dev->caps.num_srqs - 1)); + spin_unlock_irqrestore(&srq_table->lock, flags); + + return srq; +} +EXPORT_SYMBOL_GPL(mlx4_srq_lookup); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Kconfig b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig new file mode 100644 index 0000000..8ff57e8 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig @@ -0,0 +1,8 @@ +# +# Mellanox driver configuration +# + +config MLX5_CORE + tristate + depends on PCI + default n diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile new file mode 100644 index 0000000..105780b --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile @@ -0,0 +1,5 @@ +obj-$(CONFIG_MLX5_CORE) += mlx5_core.o + +mlx5_core-y := main.o cmd.o debugfs.o fw.o eq.o uar.o pagealloc.o \ + health.o mcg.o cq.o srq.o alloc.o qp.o port.o mr.o pd.o \ + mad.o diff --git a/drivers/net/ethernet/mellanox/mlx5/core/alloc.c b/drivers/net/ethernet/mellanox/mlx5/core/alloc.c new file mode 100644 index 0000000..56779c1 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/alloc.c @@ -0,0 +1,238 @@ +/* + * Copyright (c) 2013, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "mlx5_core.h" + +/* Handling for queue buffers -- we allocate a bunch of memory and + * register it in a memory region at HCA virtual address 0. If the + * requested size is > max_direct, we split the allocation into + * multiple pages, so we don't require too much contiguous memory. + */ + +int mlx5_buf_alloc(struct mlx5_core_dev *dev, int size, int max_direct, + struct mlx5_buf *buf) +{ + dma_addr_t t; + + buf->size = size; + if (size <= max_direct) { + buf->nbufs = 1; + buf->npages = 1; + buf->page_shift = (u8)get_order(size) + PAGE_SHIFT; + buf->direct.buf = dma_zalloc_coherent(&dev->pdev->dev, + size, &t, GFP_KERNEL); + if (!buf->direct.buf) + return -ENOMEM; + + buf->direct.map = t; + + while (t & ((1 << buf->page_shift) - 1)) { + --buf->page_shift; + buf->npages *= 2; + } + } else { + int i; + + buf->direct.buf = NULL; + buf->nbufs = (size + PAGE_SIZE - 1) / PAGE_SIZE; + buf->npages = buf->nbufs; + buf->page_shift = PAGE_SHIFT; + buf->page_list = kcalloc(buf->nbufs, sizeof(*buf->page_list), + GFP_KERNEL); + if (!buf->page_list) + return -ENOMEM; + + for (i = 0; i < buf->nbufs; i++) { + buf->page_list[i].buf = + dma_zalloc_coherent(&dev->pdev->dev, PAGE_SIZE, + &t, GFP_KERNEL); + if (!buf->page_list[i].buf) + goto err_free; + + buf->page_list[i].map = t; + } + + if (BITS_PER_LONG == 64) { + struct page **pages; + pages = kmalloc(sizeof(*pages) * buf->nbufs, GFP_KERNEL); + if (!pages) + goto err_free; + for (i = 0; i < buf->nbufs; i++) + pages[i] = virt_to_page(buf->page_list[i].buf); + buf->direct.buf = vmap(pages, buf->nbufs, VM_MAP, PAGE_KERNEL); + kfree(pages); + if (!buf->direct.buf) + goto err_free; + } + } + + return 0; + +err_free: + mlx5_buf_free(dev, buf); + + return -ENOMEM; +} +EXPORT_SYMBOL_GPL(mlx5_buf_alloc); + +void mlx5_buf_free(struct mlx5_core_dev *dev, struct mlx5_buf *buf) +{ + int i; + + if (buf->nbufs == 1) + dma_free_coherent(&dev->pdev->dev, buf->size, buf->direct.buf, + buf->direct.map); + else { + if (BITS_PER_LONG == 64 && buf->direct.buf) + vunmap(buf->direct.buf); + + for (i = 0; i < buf->nbufs; i++) + if (buf->page_list[i].buf) + dma_free_coherent(&dev->pdev->dev, PAGE_SIZE, + buf->page_list[i].buf, + buf->page_list[i].map); + kfree(buf->page_list); + } +} +EXPORT_SYMBOL_GPL(mlx5_buf_free); + +static struct mlx5_db_pgdir *mlx5_alloc_db_pgdir(struct device *dma_device) +{ + struct mlx5_db_pgdir *pgdir; + + pgdir = kzalloc(sizeof(*pgdir), GFP_KERNEL); + if (!pgdir) + return NULL; + + bitmap_fill(pgdir->bitmap, MLX5_DB_PER_PAGE); + pgdir->db_page = dma_alloc_coherent(dma_device, PAGE_SIZE, + &pgdir->db_dma, GFP_KERNEL); + if (!pgdir->db_page) { + kfree(pgdir); + return NULL; + } + + return pgdir; +} + +static int mlx5_alloc_db_from_pgdir(struct mlx5_db_pgdir *pgdir, + struct mlx5_db *db) +{ + int offset; + int i; + + i = find_first_bit(pgdir->bitmap, MLX5_DB_PER_PAGE); + if (i >= MLX5_DB_PER_PAGE) + return -ENOMEM; + + __clear_bit(i, pgdir->bitmap); + + db->u.pgdir = pgdir; + db->index = i; + offset = db->index * L1_CACHE_BYTES; + db->db = pgdir->db_page + offset / sizeof(*pgdir->db_page); + db->dma = pgdir->db_dma + offset; + + return 0; +} + +int mlx5_db_alloc(struct mlx5_core_dev *dev, struct mlx5_db *db) +{ + struct mlx5_db_pgdir *pgdir; + int ret = 0; + + mutex_lock(&dev->priv.pgdir_mutex); + + list_for_each_entry(pgdir, &dev->priv.pgdir_list, list) + if (!mlx5_alloc_db_from_pgdir(pgdir, db)) + goto out; + + pgdir = mlx5_alloc_db_pgdir(&(dev->pdev->dev)); + if (!pgdir) { + ret = -ENOMEM; + goto out; + } + + list_add(&pgdir->list, &dev->priv.pgdir_list); + + /* This should never fail -- we just allocated an empty page: */ + WARN_ON(mlx5_alloc_db_from_pgdir(pgdir, db)); + +out: + mutex_unlock(&dev->priv.pgdir_mutex); + + return ret; +} +EXPORT_SYMBOL_GPL(mlx5_db_alloc); + +void mlx5_db_free(struct mlx5_core_dev *dev, struct mlx5_db *db) +{ + mutex_lock(&dev->priv.pgdir_mutex); + + __set_bit(db->index, db->u.pgdir->bitmap); + + if (bitmap_full(db->u.pgdir->bitmap, MLX5_DB_PER_PAGE)) { + dma_free_coherent(&(dev->pdev->dev), PAGE_SIZE, + db->u.pgdir->db_page, db->u.pgdir->db_dma); + list_del(&db->u.pgdir->list); + kfree(db->u.pgdir); + } + + mutex_unlock(&dev->priv.pgdir_mutex); +} +EXPORT_SYMBOL_GPL(mlx5_db_free); + + +void mlx5_fill_page_array(struct mlx5_buf *buf, __be64 *pas) +{ + u64 addr; + int i; + + for (i = 0; i < buf->npages; i++) { + if (buf->nbufs == 1) + addr = buf->direct.map + (i << buf->page_shift); + else + addr = buf->page_list[i].map; + + pas[i] = cpu_to_be64(addr); + } +} +EXPORT_SYMBOL_GPL(mlx5_fill_page_array); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c new file mode 100644 index 0000000..368c6c5 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c @@ -0,0 +1,1556 @@ +/* + * Copyright (c) 2013, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "mlx5_core.h" + +enum { + CMD_IF_REV = 5, +}; + +enum { + CMD_MODE_POLLING, + CMD_MODE_EVENTS +}; + +enum { + NUM_LONG_LISTS = 2, + NUM_MED_LISTS = 64, + LONG_LIST_SIZE = (2ULL * 1024 * 1024 * 1024 / PAGE_SIZE) * 8 + 16 + + MLX5_CMD_DATA_BLOCK_SIZE, + MED_LIST_SIZE = 16 + MLX5_CMD_DATA_BLOCK_SIZE, +}; + +enum { + MLX5_CMD_DELIVERY_STAT_OK = 0x0, + MLX5_CMD_DELIVERY_STAT_SIGNAT_ERR = 0x1, + MLX5_CMD_DELIVERY_STAT_TOK_ERR = 0x2, + MLX5_CMD_DELIVERY_STAT_BAD_BLK_NUM_ERR = 0x3, + MLX5_CMD_DELIVERY_STAT_OUT_PTR_ALIGN_ERR = 0x4, + MLX5_CMD_DELIVERY_STAT_IN_PTR_ALIGN_ERR = 0x5, + MLX5_CMD_DELIVERY_STAT_FW_ERR = 0x6, + MLX5_CMD_DELIVERY_STAT_IN_LENGTH_ERR = 0x7, + MLX5_CMD_DELIVERY_STAT_OUT_LENGTH_ERR = 0x8, + MLX5_CMD_DELIVERY_STAT_RES_FLD_NOT_CLR_ERR = 0x9, + MLX5_CMD_DELIVERY_STAT_CMD_DESCR_ERR = 0x10, +}; + +enum { + MLX5_CMD_STAT_OK = 0x0, + MLX5_CMD_STAT_INT_ERR = 0x1, + MLX5_CMD_STAT_BAD_OP_ERR = 0x2, + MLX5_CMD_STAT_BAD_PARAM_ERR = 0x3, + MLX5_CMD_STAT_BAD_SYS_STATE_ERR = 0x4, + MLX5_CMD_STAT_BAD_RES_ERR = 0x5, + MLX5_CMD_STAT_RES_BUSY = 0x6, + MLX5_CMD_STAT_LIM_ERR = 0x8, + MLX5_CMD_STAT_BAD_RES_STATE_ERR = 0x9, + MLX5_CMD_STAT_IX_ERR = 0xa, + MLX5_CMD_STAT_NO_RES_ERR = 0xf, + MLX5_CMD_STAT_BAD_INP_LEN_ERR = 0x50, + MLX5_CMD_STAT_BAD_OUTP_LEN_ERR = 0x51, + MLX5_CMD_STAT_BAD_QP_STATE_ERR = 0x10, + MLX5_CMD_STAT_BAD_PKT_ERR = 0x30, + MLX5_CMD_STAT_BAD_SIZE_OUTS_CQES_ERR = 0x40, +}; + +static struct mlx5_cmd_work_ent *alloc_cmd(struct mlx5_cmd *cmd, + struct mlx5_cmd_msg *in, + struct mlx5_cmd_msg *out, + void *uout, int uout_size, + mlx5_cmd_cbk_t cbk, + void *context, int page_queue) +{ + gfp_t alloc_flags = cbk ? GFP_ATOMIC : GFP_KERNEL; + struct mlx5_cmd_work_ent *ent; + + ent = kzalloc(sizeof(*ent), alloc_flags); + if (!ent) + return ERR_PTR(-ENOMEM); + + ent->in = in; + ent->out = out; + ent->uout = uout; + ent->uout_size = uout_size; + ent->callback = cbk; + ent->context = context; + ent->cmd = cmd; + ent->page_queue = page_queue; + + return ent; +} + +static u8 alloc_token(struct mlx5_cmd *cmd) +{ + u8 token; + + spin_lock(&cmd->token_lock); + token = cmd->token++ % 255 + 1; + spin_unlock(&cmd->token_lock); + + return token; +} + +static int alloc_ent(struct mlx5_cmd *cmd) +{ + unsigned long flags; + int ret; + + spin_lock_irqsave(&cmd->alloc_lock, flags); + ret = find_first_bit(&cmd->bitmask, cmd->max_reg_cmds); + if (ret < cmd->max_reg_cmds) + clear_bit(ret, &cmd->bitmask); + spin_unlock_irqrestore(&cmd->alloc_lock, flags); + + return ret < cmd->max_reg_cmds ? ret : -ENOMEM; +} + +static void free_ent(struct mlx5_cmd *cmd, int idx) +{ + unsigned long flags; + + spin_lock_irqsave(&cmd->alloc_lock, flags); + set_bit(idx, &cmd->bitmask); + spin_unlock_irqrestore(&cmd->alloc_lock, flags); +} + +static struct mlx5_cmd_layout *get_inst(struct mlx5_cmd *cmd, int idx) +{ + return cmd->cmd_buf + (idx << cmd->log_stride); +} + +static u8 xor8_buf(void *buf, int len) +{ + u8 *ptr = buf; + u8 sum = 0; + int i; + + for (i = 0; i < len; i++) + sum ^= ptr[i]; + + return sum; +} + +static int verify_block_sig(struct mlx5_cmd_prot_block *block) +{ + if (xor8_buf(block->rsvd0, sizeof(*block) - sizeof(block->data) - 1) != 0xff) + return -EINVAL; + + if (xor8_buf(block, sizeof(*block)) != 0xff) + return -EINVAL; + + return 0; +} + +static void calc_block_sig(struct mlx5_cmd_prot_block *block, u8 token, + int csum) +{ + block->token = token; + if (csum) { + block->ctrl_sig = ~xor8_buf(block->rsvd0, sizeof(*block) - + sizeof(block->data) - 2); + block->sig = ~xor8_buf(block, sizeof(*block) - 1); + } +} + +static void calc_chain_sig(struct mlx5_cmd_msg *msg, u8 token, int csum) +{ + struct mlx5_cmd_mailbox *next = msg->next; + + while (next) { + calc_block_sig(next->buf, token, csum); + next = next->next; + } +} + +static void set_signature(struct mlx5_cmd_work_ent *ent, int csum) +{ + ent->lay->sig = ~xor8_buf(ent->lay, sizeof(*ent->lay)); + calc_chain_sig(ent->in, ent->token, csum); + calc_chain_sig(ent->out, ent->token, csum); +} + +static void poll_timeout(struct mlx5_cmd_work_ent *ent) +{ + unsigned long poll_end = jiffies + msecs_to_jiffies(MLX5_CMD_TIMEOUT_MSEC + 1000); + u8 own; + + do { + own = ent->lay->status_own; + if (!(own & CMD_OWNER_HW)) { + ent->ret = 0; + return; + } + usleep_range(5000, 10000); + } while (time_before(jiffies, poll_end)); + + ent->ret = -ETIMEDOUT; +} + +static void free_cmd(struct mlx5_cmd_work_ent *ent) +{ + kfree(ent); +} + + +static int verify_signature(struct mlx5_cmd_work_ent *ent) +{ + struct mlx5_cmd_mailbox *next = ent->out->next; + int err; + u8 sig; + + sig = xor8_buf(ent->lay, sizeof(*ent->lay)); + if (sig != 0xff) + return -EINVAL; + + while (next) { + err = verify_block_sig(next->buf); + if (err) + return err; + + next = next->next; + } + + return 0; +} + +static void dump_buf(void *buf, int size, int data_only, int offset) +{ + __be32 *p = buf; + int i; + + for (i = 0; i < size; i += 16) { + pr_debug("%03x: %08x %08x %08x %08x\n", offset, be32_to_cpu(p[0]), + be32_to_cpu(p[1]), be32_to_cpu(p[2]), + be32_to_cpu(p[3])); + p += 4; + offset += 16; + } + if (!data_only) + pr_debug("\n"); +} + +const char *mlx5_command_str(int command) +{ + switch (command) { + case MLX5_CMD_OP_QUERY_HCA_CAP: + return "QUERY_HCA_CAP"; + + case MLX5_CMD_OP_SET_HCA_CAP: + return "SET_HCA_CAP"; + + case MLX5_CMD_OP_QUERY_ADAPTER: + return "QUERY_ADAPTER"; + + case MLX5_CMD_OP_INIT_HCA: + return "INIT_HCA"; + + case MLX5_CMD_OP_TEARDOWN_HCA: + return "TEARDOWN_HCA"; + + case MLX5_CMD_OP_ENABLE_HCA: + return "MLX5_CMD_OP_ENABLE_HCA"; + + case MLX5_CMD_OP_DISABLE_HCA: + return "MLX5_CMD_OP_DISABLE_HCA"; + + case MLX5_CMD_OP_QUERY_PAGES: + return "QUERY_PAGES"; + + case MLX5_CMD_OP_MANAGE_PAGES: + return "MANAGE_PAGES"; + + case MLX5_CMD_OP_CREATE_MKEY: + return "CREATE_MKEY"; + + case MLX5_CMD_OP_QUERY_MKEY: + return "QUERY_MKEY"; + + case MLX5_CMD_OP_DESTROY_MKEY: + return "DESTROY_MKEY"; + + case MLX5_CMD_OP_QUERY_SPECIAL_CONTEXTS: + return "QUERY_SPECIAL_CONTEXTS"; + + case MLX5_CMD_OP_CREATE_EQ: + return "CREATE_EQ"; + + case MLX5_CMD_OP_DESTROY_EQ: + return "DESTROY_EQ"; + + case MLX5_CMD_OP_QUERY_EQ: + return "QUERY_EQ"; + + case MLX5_CMD_OP_CREATE_CQ: + return "CREATE_CQ"; + + case MLX5_CMD_OP_DESTROY_CQ: + return "DESTROY_CQ"; + + case MLX5_CMD_OP_QUERY_CQ: + return "QUERY_CQ"; + + case MLX5_CMD_OP_MODIFY_CQ: + return "MODIFY_CQ"; + + case MLX5_CMD_OP_CREATE_QP: + return "CREATE_QP"; + + case MLX5_CMD_OP_DESTROY_QP: + return "DESTROY_QP"; + + case MLX5_CMD_OP_RST2INIT_QP: + return "RST2INIT_QP"; + + case MLX5_CMD_OP_INIT2RTR_QP: + return "INIT2RTR_QP"; + + case MLX5_CMD_OP_RTR2RTS_QP: + return "RTR2RTS_QP"; + + case MLX5_CMD_OP_RTS2RTS_QP: + return "RTS2RTS_QP"; + + case MLX5_CMD_OP_SQERR2RTS_QP: + return "SQERR2RTS_QP"; + + case MLX5_CMD_OP_2ERR_QP: + return "2ERR_QP"; + + case MLX5_CMD_OP_2RST_QP: + return "2RST_QP"; + + case MLX5_CMD_OP_QUERY_QP: + return "QUERY_QP"; + + case MLX5_CMD_OP_MAD_IFC: + return "MAD_IFC"; + + case MLX5_CMD_OP_INIT2INIT_QP: + return "INIT2INIT_QP"; + + case MLX5_CMD_OP_CREATE_PSV: + return "CREATE_PSV"; + + case MLX5_CMD_OP_DESTROY_PSV: + return "DESTROY_PSV"; + + case MLX5_CMD_OP_CREATE_SRQ: + return "CREATE_SRQ"; + + case MLX5_CMD_OP_DESTROY_SRQ: + return "DESTROY_SRQ"; + + case MLX5_CMD_OP_QUERY_SRQ: + return "QUERY_SRQ"; + + case MLX5_CMD_OP_ARM_RQ: + return "ARM_RQ"; + + case MLX5_CMD_OP_RESIZE_SRQ: + return "RESIZE_SRQ"; + + case MLX5_CMD_OP_ALLOC_PD: + return "ALLOC_PD"; + + case MLX5_CMD_OP_DEALLOC_PD: + return "DEALLOC_PD"; + + case MLX5_CMD_OP_ALLOC_UAR: + return "ALLOC_UAR"; + + case MLX5_CMD_OP_DEALLOC_UAR: + return "DEALLOC_UAR"; + + case MLX5_CMD_OP_ATTACH_TO_MCG: + return "ATTACH_TO_MCG"; + + case MLX5_CMD_OP_DETACH_FROM_MCG: + return "DETACH_FROM_MCG"; + + case MLX5_CMD_OP_ALLOC_XRCD: + return "ALLOC_XRCD"; + + case MLX5_CMD_OP_DEALLOC_XRCD: + return "DEALLOC_XRCD"; + + case MLX5_CMD_OP_ACCESS_REG: + return "MLX5_CMD_OP_ACCESS_REG"; + + default: return "unknown command opcode"; + } +} + +static void dump_command(struct mlx5_core_dev *dev, + struct mlx5_cmd_work_ent *ent, int input) +{ + u16 op = be16_to_cpu(((struct mlx5_inbox_hdr *)(ent->lay->in))->opcode); + struct mlx5_cmd_msg *msg = input ? ent->in : ent->out; + struct mlx5_cmd_mailbox *next = msg->next; + int data_only; + u32 offset = 0; + int dump_len; + + data_only = !!(mlx5_core_debug_mask & (1 << MLX5_CMD_DATA)); + + if (data_only) + mlx5_core_dbg_mask(dev, 1 << MLX5_CMD_DATA, + "dump command data %s(0x%x) %s\n", + mlx5_command_str(op), op, + input ? "INPUT" : "OUTPUT"); + else + mlx5_core_dbg(dev, "dump command %s(0x%x) %s\n", + mlx5_command_str(op), op, + input ? "INPUT" : "OUTPUT"); + + if (data_only) { + if (input) { + dump_buf(ent->lay->in, sizeof(ent->lay->in), 1, offset); + offset += sizeof(ent->lay->in); + } else { + dump_buf(ent->lay->out, sizeof(ent->lay->out), 1, offset); + offset += sizeof(ent->lay->out); + } + } else { + dump_buf(ent->lay, sizeof(*ent->lay), 0, offset); + offset += sizeof(*ent->lay); + } + + while (next && offset < msg->len) { + if (data_only) { + dump_len = min_t(int, MLX5_CMD_DATA_BLOCK_SIZE, msg->len - offset); + dump_buf(next->buf, dump_len, 1, offset); + offset += MLX5_CMD_DATA_BLOCK_SIZE; + } else { + mlx5_core_dbg(dev, "command block:\n"); + dump_buf(next->buf, sizeof(struct mlx5_cmd_prot_block), 0, offset); + offset += sizeof(struct mlx5_cmd_prot_block); + } + next = next->next; + } + + if (data_only) + pr_debug("\n"); +} + +static void cmd_work_handler(struct work_struct *work) +{ + struct mlx5_cmd_work_ent *ent = container_of(work, struct mlx5_cmd_work_ent, work); + struct mlx5_cmd *cmd = ent->cmd; + struct mlx5_core_dev *dev = container_of(cmd, struct mlx5_core_dev, cmd); + struct mlx5_cmd_layout *lay; + struct semaphore *sem; + + sem = ent->page_queue ? &cmd->pages_sem : &cmd->sem; + down(sem); + if (!ent->page_queue) { + ent->idx = alloc_ent(cmd); + if (ent->idx < 0) { + mlx5_core_err(dev, "failed to allocate command entry\n"); + up(sem); + return; + } + } else { + ent->idx = cmd->max_reg_cmds; + } + + ent->token = alloc_token(cmd); + cmd->ent_arr[ent->idx] = ent; + lay = get_inst(cmd, ent->idx); + ent->lay = lay; + memset(lay, 0, sizeof(*lay)); + memcpy(lay->in, ent->in->first.data, sizeof(lay->in)); + ent->op = be32_to_cpu(lay->in[0]) >> 16; + if (ent->in->next) + lay->in_ptr = cpu_to_be64(ent->in->next->dma); + lay->inlen = cpu_to_be32(ent->in->len); + if (ent->out->next) + lay->out_ptr = cpu_to_be64(ent->out->next->dma); + lay->outlen = cpu_to_be32(ent->out->len); + lay->type = MLX5_PCI_CMD_XPORT; + lay->token = ent->token; + lay->status_own = CMD_OWNER_HW; + set_signature(ent, !cmd->checksum_disabled); + dump_command(dev, ent, 1); + ent->ts1 = ktime_get_ns(); + + /* ring doorbell after the descriptor is valid */ + wmb(); + iowrite32be(1 << ent->idx, &dev->iseg->cmd_dbell); + mlx5_core_dbg(dev, "write 0x%x to command doorbell\n", 1 << ent->idx); + mmiowb(); + if (cmd->mode == CMD_MODE_POLLING) { + poll_timeout(ent); + /* make sure we read the descriptor after ownership is SW */ + rmb(); + mlx5_cmd_comp_handler(dev, 1UL << ent->idx); + } +} + +static const char *deliv_status_to_str(u8 status) +{ + switch (status) { + case MLX5_CMD_DELIVERY_STAT_OK: + return "no errors"; + case MLX5_CMD_DELIVERY_STAT_SIGNAT_ERR: + return "signature error"; + case MLX5_CMD_DELIVERY_STAT_TOK_ERR: + return "token error"; + case MLX5_CMD_DELIVERY_STAT_BAD_BLK_NUM_ERR: + return "bad block number"; + case MLX5_CMD_DELIVERY_STAT_OUT_PTR_ALIGN_ERR: + return "output pointer not aligned to block size"; + case MLX5_CMD_DELIVERY_STAT_IN_PTR_ALIGN_ERR: + return "input pointer not aligned to block size"; + case MLX5_CMD_DELIVERY_STAT_FW_ERR: + return "firmware internal error"; + case MLX5_CMD_DELIVERY_STAT_IN_LENGTH_ERR: + return "command input length error"; + case MLX5_CMD_DELIVERY_STAT_OUT_LENGTH_ERR: + return "command ouput length error"; + case MLX5_CMD_DELIVERY_STAT_RES_FLD_NOT_CLR_ERR: + return "reserved fields not cleared"; + case MLX5_CMD_DELIVERY_STAT_CMD_DESCR_ERR: + return "bad command descriptor type"; + default: + return "unknown status code"; + } +} + +static u16 msg_to_opcode(struct mlx5_cmd_msg *in) +{ + struct mlx5_inbox_hdr *hdr = (struct mlx5_inbox_hdr *)(in->first.data); + + return be16_to_cpu(hdr->opcode); +} + +static int wait_func(struct mlx5_core_dev *dev, struct mlx5_cmd_work_ent *ent) +{ + unsigned long timeout = msecs_to_jiffies(MLX5_CMD_TIMEOUT_MSEC); + struct mlx5_cmd *cmd = &dev->cmd; + int err; + + if (cmd->mode == CMD_MODE_POLLING) { + wait_for_completion(&ent->done); + err = ent->ret; + } else { + if (!wait_for_completion_timeout(&ent->done, timeout)) + err = -ETIMEDOUT; + else + err = 0; + } + if (err == -ETIMEDOUT) { + mlx5_core_warn(dev, "%s(0x%x) timeout. Will cause a leak of a command resource\n", + mlx5_command_str(msg_to_opcode(ent->in)), + msg_to_opcode(ent->in)); + } + mlx5_core_dbg(dev, "err %d, delivery status %s(%d)\n", + err, deliv_status_to_str(ent->status), ent->status); + + return err; +} + +/* Notes: + * 1. Callback functions may not sleep + * 2. page queue commands do not support asynchrous completion + */ +static int mlx5_cmd_invoke(struct mlx5_core_dev *dev, struct mlx5_cmd_msg *in, + struct mlx5_cmd_msg *out, void *uout, int uout_size, + mlx5_cmd_cbk_t callback, + void *context, int page_queue, u8 *status) +{ + struct mlx5_cmd *cmd = &dev->cmd; + struct mlx5_cmd_work_ent *ent; + struct mlx5_cmd_stats *stats; + int err = 0; + s64 ds; + u16 op; + + if (callback && page_queue) + return -EINVAL; + + ent = alloc_cmd(cmd, in, out, uout, uout_size, callback, context, + page_queue); + if (IS_ERR(ent)) + return PTR_ERR(ent); + + if (!callback) + init_completion(&ent->done); + + INIT_WORK(&ent->work, cmd_work_handler); + if (page_queue) { + cmd_work_handler(&ent->work); + } else if (!queue_work(cmd->wq, &ent->work)) { + mlx5_core_warn(dev, "failed to queue work\n"); + err = -ENOMEM; + goto out_free; + } + + if (!callback) { + err = wait_func(dev, ent); + if (err == -ETIMEDOUT) + goto out; + + ds = ent->ts2 - ent->ts1; + op = be16_to_cpu(((struct mlx5_inbox_hdr *)in->first.data)->opcode); + if (op < ARRAY_SIZE(cmd->stats)) { + stats = &cmd->stats[op]; + spin_lock_irq(&stats->lock); + stats->sum += ds; + ++stats->n; + spin_unlock_irq(&stats->lock); + } + mlx5_core_dbg_mask(dev, 1 << MLX5_CMD_TIME, + "fw exec time for %s is %lld nsec\n", + mlx5_command_str(op), ds); + *status = ent->status; + free_cmd(ent); + } + + return err; + +out_free: + free_cmd(ent); +out: + return err; +} + +static ssize_t dbg_write(struct file *filp, const char __user *buf, + size_t count, loff_t *pos) +{ + struct mlx5_core_dev *dev = filp->private_data; + struct mlx5_cmd_debug *dbg = &dev->cmd.dbg; + char lbuf[3]; + int err; + + if (!dbg->in_msg || !dbg->out_msg) + return -ENOMEM; + + if (copy_from_user(lbuf, buf, sizeof(lbuf))) + return -EFAULT; + + lbuf[sizeof(lbuf) - 1] = 0; + + if (strcmp(lbuf, "go")) + return -EINVAL; + + err = mlx5_cmd_exec(dev, dbg->in_msg, dbg->inlen, dbg->out_msg, dbg->outlen); + + return err ? err : count; +} + + +static const struct file_operations fops = { + .owner = THIS_MODULE, + .open = simple_open, + .write = dbg_write, +}; + +static int mlx5_copy_to_msg(struct mlx5_cmd_msg *to, void *from, int size) +{ + struct mlx5_cmd_prot_block *block; + struct mlx5_cmd_mailbox *next; + int copy; + + if (!to || !from) + return -ENOMEM; + + copy = min_t(int, size, sizeof(to->first.data)); + memcpy(to->first.data, from, copy); + size -= copy; + from += copy; + + next = to->next; + while (size) { + if (!next) { + /* this is a BUG */ + return -ENOMEM; + } + + copy = min_t(int, size, MLX5_CMD_DATA_BLOCK_SIZE); + block = next->buf; + memcpy(block->data, from, copy); + from += copy; + size -= copy; + next = next->next; + } + + return 0; +} + +static int mlx5_copy_from_msg(void *to, struct mlx5_cmd_msg *from, int size) +{ + struct mlx5_cmd_prot_block *block; + struct mlx5_cmd_mailbox *next; + int copy; + + if (!to || !from) + return -ENOMEM; + + copy = min_t(int, size, sizeof(from->first.data)); + memcpy(to, from->first.data, copy); + size -= copy; + to += copy; + + next = from->next; + while (size) { + if (!next) { + /* this is a BUG */ + return -ENOMEM; + } + + copy = min_t(int, size, MLX5_CMD_DATA_BLOCK_SIZE); + block = next->buf; + + memcpy(to, block->data, copy); + to += copy; + size -= copy; + next = next->next; + } + + return 0; +} + +static struct mlx5_cmd_mailbox *alloc_cmd_box(struct mlx5_core_dev *dev, + gfp_t flags) +{ + struct mlx5_cmd_mailbox *mailbox; + + mailbox = kmalloc(sizeof(*mailbox), flags); + if (!mailbox) + return ERR_PTR(-ENOMEM); + + mailbox->buf = pci_pool_alloc(dev->cmd.pool, flags, + &mailbox->dma); + if (!mailbox->buf) { + mlx5_core_dbg(dev, "failed allocation\n"); + kfree(mailbox); + return ERR_PTR(-ENOMEM); + } + memset(mailbox->buf, 0, sizeof(struct mlx5_cmd_prot_block)); + mailbox->next = NULL; + + return mailbox; +} + +static void free_cmd_box(struct mlx5_core_dev *dev, + struct mlx5_cmd_mailbox *mailbox) +{ + pci_pool_free(dev->cmd.pool, mailbox->buf, mailbox->dma); + kfree(mailbox); +} + +static struct mlx5_cmd_msg *mlx5_alloc_cmd_msg(struct mlx5_core_dev *dev, + gfp_t flags, int size) +{ + struct mlx5_cmd_mailbox *tmp, *head = NULL; + struct mlx5_cmd_prot_block *block; + struct mlx5_cmd_msg *msg; + int blen; + int err; + int n; + int i; + + msg = kzalloc(sizeof(*msg), flags); + if (!msg) + return ERR_PTR(-ENOMEM); + + blen = size - min_t(int, sizeof(msg->first.data), size); + n = (blen + MLX5_CMD_DATA_BLOCK_SIZE - 1) / MLX5_CMD_DATA_BLOCK_SIZE; + + for (i = 0; i < n; i++) { + tmp = alloc_cmd_box(dev, flags); + if (IS_ERR(tmp)) { + mlx5_core_warn(dev, "failed allocating block\n"); + err = PTR_ERR(tmp); + goto err_alloc; + } + + block = tmp->buf; + tmp->next = head; + block->next = cpu_to_be64(tmp->next ? tmp->next->dma : 0); + block->block_num = cpu_to_be32(n - i - 1); + head = tmp; + } + msg->next = head; + msg->len = size; + return msg; + +err_alloc: + while (head) { + tmp = head->next; + free_cmd_box(dev, head); + head = tmp; + } + kfree(msg); + + return ERR_PTR(err); +} + +static void mlx5_free_cmd_msg(struct mlx5_core_dev *dev, + struct mlx5_cmd_msg *msg) +{ + struct mlx5_cmd_mailbox *head = msg->next; + struct mlx5_cmd_mailbox *next; + + while (head) { + next = head->next; + free_cmd_box(dev, head); + head = next; + } + kfree(msg); +} + +static ssize_t data_write(struct file *filp, const char __user *buf, + size_t count, loff_t *pos) +{ + struct mlx5_core_dev *dev = filp->private_data; + struct mlx5_cmd_debug *dbg = &dev->cmd.dbg; + void *ptr; + int err; + + if (*pos != 0) + return -EINVAL; + + kfree(dbg->in_msg); + dbg->in_msg = NULL; + dbg->inlen = 0; + + ptr = kzalloc(count, GFP_KERNEL); + if (!ptr) + return -ENOMEM; + + if (copy_from_user(ptr, buf, count)) { + err = -EFAULT; + goto out; + } + dbg->in_msg = ptr; + dbg->inlen = count; + + *pos = count; + + return count; + +out: + kfree(ptr); + return err; +} + +static ssize_t data_read(struct file *filp, char __user *buf, size_t count, + loff_t *pos) +{ + struct mlx5_core_dev *dev = filp->private_data; + struct mlx5_cmd_debug *dbg = &dev->cmd.dbg; + int copy; + + if (*pos) + return 0; + + if (!dbg->out_msg) + return -ENOMEM; + + copy = min_t(int, count, dbg->outlen); + if (copy_to_user(buf, dbg->out_msg, copy)) + return -EFAULT; + + *pos += copy; + + return copy; +} + +static const struct file_operations dfops = { + .owner = THIS_MODULE, + .open = simple_open, + .write = data_write, + .read = data_read, +}; + +static ssize_t outlen_read(struct file *filp, char __user *buf, size_t count, + loff_t *pos) +{ + struct mlx5_core_dev *dev = filp->private_data; + struct mlx5_cmd_debug *dbg = &dev->cmd.dbg; + char outlen[8]; + int err; + + if (*pos) + return 0; + + err = snprintf(outlen, sizeof(outlen), "%d", dbg->outlen); + if (err < 0) + return err; + + if (copy_to_user(buf, &outlen, err)) + return -EFAULT; + + *pos += err; + + return err; +} + +static ssize_t outlen_write(struct file *filp, const char __user *buf, + size_t count, loff_t *pos) +{ + struct mlx5_core_dev *dev = filp->private_data; + struct mlx5_cmd_debug *dbg = &dev->cmd.dbg; + char outlen_str[8]; + int outlen; + void *ptr; + int err; + + if (*pos != 0 || count > 6) + return -EINVAL; + + kfree(dbg->out_msg); + dbg->out_msg = NULL; + dbg->outlen = 0; + + if (copy_from_user(outlen_str, buf, count)) + return -EFAULT; + + outlen_str[7] = 0; + + err = sscanf(outlen_str, "%d", &outlen); + if (err < 0) + return err; + + ptr = kzalloc(outlen, GFP_KERNEL); + if (!ptr) + return -ENOMEM; + + dbg->out_msg = ptr; + dbg->outlen = outlen; + + *pos = count; + + return count; +} + +static const struct file_operations olfops = { + .owner = THIS_MODULE, + .open = simple_open, + .write = outlen_write, + .read = outlen_read, +}; + +static void set_wqname(struct mlx5_core_dev *dev) +{ + struct mlx5_cmd *cmd = &dev->cmd; + + snprintf(cmd->wq_name, sizeof(cmd->wq_name), "mlx5_cmd_%s", + dev_name(&dev->pdev->dev)); +} + +static void clean_debug_files(struct mlx5_core_dev *dev) +{ + struct mlx5_cmd_debug *dbg = &dev->cmd.dbg; + + if (!mlx5_debugfs_root) + return; + + mlx5_cmdif_debugfs_cleanup(dev); + debugfs_remove_recursive(dbg->dbg_root); +} + +static int create_debugfs_files(struct mlx5_core_dev *dev) +{ + struct mlx5_cmd_debug *dbg = &dev->cmd.dbg; + int err = -ENOMEM; + + if (!mlx5_debugfs_root) + return 0; + + dbg->dbg_root = debugfs_create_dir("cmd", dev->priv.dbg_root); + if (!dbg->dbg_root) + return err; + + dbg->dbg_in = debugfs_create_file("in", 0400, dbg->dbg_root, + dev, &dfops); + if (!dbg->dbg_in) + goto err_dbg; + + dbg->dbg_out = debugfs_create_file("out", 0200, dbg->dbg_root, + dev, &dfops); + if (!dbg->dbg_out) + goto err_dbg; + + dbg->dbg_outlen = debugfs_create_file("out_len", 0600, dbg->dbg_root, + dev, &olfops); + if (!dbg->dbg_outlen) + goto err_dbg; + + dbg->dbg_status = debugfs_create_u8("status", 0600, dbg->dbg_root, + &dbg->status); + if (!dbg->dbg_status) + goto err_dbg; + + dbg->dbg_run = debugfs_create_file("run", 0200, dbg->dbg_root, dev, &fops); + if (!dbg->dbg_run) + goto err_dbg; + + mlx5_cmdif_debugfs_init(dev); + + return 0; + +err_dbg: + clean_debug_files(dev); + return err; +} + +void mlx5_cmd_use_events(struct mlx5_core_dev *dev) +{ + struct mlx5_cmd *cmd = &dev->cmd; + int i; + + for (i = 0; i < cmd->max_reg_cmds; i++) + down(&cmd->sem); + + down(&cmd->pages_sem); + + flush_workqueue(cmd->wq); + + cmd->mode = CMD_MODE_EVENTS; + + up(&cmd->pages_sem); + for (i = 0; i < cmd->max_reg_cmds; i++) + up(&cmd->sem); +} + +void mlx5_cmd_use_polling(struct mlx5_core_dev *dev) +{ + struct mlx5_cmd *cmd = &dev->cmd; + int i; + + for (i = 0; i < cmd->max_reg_cmds; i++) + down(&cmd->sem); + + down(&cmd->pages_sem); + + flush_workqueue(cmd->wq); + cmd->mode = CMD_MODE_POLLING; + + up(&cmd->pages_sem); + for (i = 0; i < cmd->max_reg_cmds; i++) + up(&cmd->sem); +} + +static void free_msg(struct mlx5_core_dev *dev, struct mlx5_cmd_msg *msg) +{ + unsigned long flags; + + if (msg->cache) { + spin_lock_irqsave(&msg->cache->lock, flags); + list_add_tail(&msg->list, &msg->cache->head); + spin_unlock_irqrestore(&msg->cache->lock, flags); + } else { + mlx5_free_cmd_msg(dev, msg); + } +} + +void mlx5_cmd_comp_handler(struct mlx5_core_dev *dev, unsigned long vector) +{ + struct mlx5_cmd *cmd = &dev->cmd; + struct mlx5_cmd_work_ent *ent; + mlx5_cmd_cbk_t callback; + void *context; + int err; + int i; + s64 ds; + struct mlx5_cmd_stats *stats; + unsigned long flags; + + for (i = 0; i < (1 << cmd->log_sz); i++) { + if (test_bit(i, &vector)) { + struct semaphore *sem; + + ent = cmd->ent_arr[i]; + if (ent->page_queue) + sem = &cmd->pages_sem; + else + sem = &cmd->sem; + ent->ts2 = ktime_get_ns(); + memcpy(ent->out->first.data, ent->lay->out, sizeof(ent->lay->out)); + dump_command(dev, ent, 0); + if (!ent->ret) { + if (!cmd->checksum_disabled) + ent->ret = verify_signature(ent); + else + ent->ret = 0; + ent->status = ent->lay->status_own >> 1; + mlx5_core_dbg(dev, "command completed. ret 0x%x, delivery status %s(0x%x)\n", + ent->ret, deliv_status_to_str(ent->status), ent->status); + } + free_ent(cmd, ent->idx); + if (ent->callback) { + ds = ent->ts2 - ent->ts1; + if (ent->op < ARRAY_SIZE(cmd->stats)) { + stats = &cmd->stats[ent->op]; + spin_lock_irqsave(&stats->lock, flags); + stats->sum += ds; + ++stats->n; + spin_unlock_irqrestore(&stats->lock, flags); + } + + callback = ent->callback; + context = ent->context; + err = ent->ret; + if (!err) + err = mlx5_copy_from_msg(ent->uout, + ent->out, + ent->uout_size); + + mlx5_free_cmd_msg(dev, ent->out); + free_msg(dev, ent->in); + + free_cmd(ent); + callback(err, context); + } else { + complete(&ent->done); + } + up(sem); + } + } +} +EXPORT_SYMBOL(mlx5_cmd_comp_handler); + +static int status_to_err(u8 status) +{ + return status ? -1 : 0; /* TBD more meaningful codes */ +} + +static struct mlx5_cmd_msg *alloc_msg(struct mlx5_core_dev *dev, int in_size, + gfp_t gfp) +{ + struct mlx5_cmd_msg *msg = ERR_PTR(-ENOMEM); + struct mlx5_cmd *cmd = &dev->cmd; + struct cache_ent *ent = NULL; + + if (in_size > MED_LIST_SIZE && in_size <= LONG_LIST_SIZE) + ent = &cmd->cache.large; + else if (in_size > 16 && in_size <= MED_LIST_SIZE) + ent = &cmd->cache.med; + + if (ent) { + spin_lock_irq(&ent->lock); + if (!list_empty(&ent->head)) { + msg = list_entry(ent->head.next, typeof(*msg), list); + /* For cached lists, we must explicitly state what is + * the real size + */ + msg->len = in_size; + list_del(&msg->list); + } + spin_unlock_irq(&ent->lock); + } + + if (IS_ERR(msg)) + msg = mlx5_alloc_cmd_msg(dev, gfp, in_size); + + return msg; +} + +static int is_manage_pages(struct mlx5_inbox_hdr *in) +{ + return be16_to_cpu(in->opcode) == MLX5_CMD_OP_MANAGE_PAGES; +} + +static int cmd_exec(struct mlx5_core_dev *dev, void *in, int in_size, void *out, + int out_size, mlx5_cmd_cbk_t callback, void *context) +{ + struct mlx5_cmd_msg *inb; + struct mlx5_cmd_msg *outb; + int pages_queue; + gfp_t gfp; + int err; + u8 status = 0; + + pages_queue = is_manage_pages(in); + gfp = callback ? GFP_ATOMIC : GFP_KERNEL; + + inb = alloc_msg(dev, in_size, gfp); + if (IS_ERR(inb)) { + err = PTR_ERR(inb); + return err; + } + + err = mlx5_copy_to_msg(inb, in, in_size); + if (err) { + mlx5_core_warn(dev, "err %d\n", err); + goto out_in; + } + + outb = mlx5_alloc_cmd_msg(dev, gfp, out_size); + if (IS_ERR(outb)) { + err = PTR_ERR(outb); + goto out_in; + } + + err = mlx5_cmd_invoke(dev, inb, outb, out, out_size, callback, context, + pages_queue, &status); + if (err) + goto out_out; + + mlx5_core_dbg(dev, "err %d, status %d\n", err, status); + if (status) { + err = status_to_err(status); + goto out_out; + } + + err = mlx5_copy_from_msg(out, outb, out_size); + +out_out: + if (!callback) + mlx5_free_cmd_msg(dev, outb); + +out_in: + if (!callback) + free_msg(dev, inb); + return err; +} + +int mlx5_cmd_exec(struct mlx5_core_dev *dev, void *in, int in_size, void *out, + int out_size) +{ + return cmd_exec(dev, in, in_size, out, out_size, NULL, NULL); +} +EXPORT_SYMBOL(mlx5_cmd_exec); + +int mlx5_cmd_exec_cb(struct mlx5_core_dev *dev, void *in, int in_size, + void *out, int out_size, mlx5_cmd_cbk_t callback, + void *context) +{ + return cmd_exec(dev, in, in_size, out, out_size, callback, context); +} +EXPORT_SYMBOL(mlx5_cmd_exec_cb); + +static void destroy_msg_cache(struct mlx5_core_dev *dev) +{ + struct mlx5_cmd *cmd = &dev->cmd; + struct mlx5_cmd_msg *msg; + struct mlx5_cmd_msg *n; + + list_for_each_entry_safe(msg, n, &cmd->cache.large.head, list) { + list_del(&msg->list); + mlx5_free_cmd_msg(dev, msg); + } + + list_for_each_entry_safe(msg, n, &cmd->cache.med.head, list) { + list_del(&msg->list); + mlx5_free_cmd_msg(dev, msg); + } +} + +static int create_msg_cache(struct mlx5_core_dev *dev) +{ + struct mlx5_cmd *cmd = &dev->cmd; + struct mlx5_cmd_msg *msg; + int err; + int i; + + spin_lock_init(&cmd->cache.large.lock); + INIT_LIST_HEAD(&cmd->cache.large.head); + spin_lock_init(&cmd->cache.med.lock); + INIT_LIST_HEAD(&cmd->cache.med.head); + + for (i = 0; i < NUM_LONG_LISTS; i++) { + msg = mlx5_alloc_cmd_msg(dev, GFP_KERNEL, LONG_LIST_SIZE); + if (IS_ERR(msg)) { + err = PTR_ERR(msg); + goto ex_err; + } + msg->cache = &cmd->cache.large; + list_add_tail(&msg->list, &cmd->cache.large.head); + } + + for (i = 0; i < NUM_MED_LISTS; i++) { + msg = mlx5_alloc_cmd_msg(dev, GFP_KERNEL, MED_LIST_SIZE); + if (IS_ERR(msg)) { + err = PTR_ERR(msg); + goto ex_err; + } + msg->cache = &cmd->cache.med; + list_add_tail(&msg->list, &cmd->cache.med.head); + } + + return 0; + +ex_err: + destroy_msg_cache(dev); + return err; +} + +int mlx5_cmd_init(struct mlx5_core_dev *dev) +{ + int size = sizeof(struct mlx5_cmd_prot_block); + int align = roundup_pow_of_two(size); + struct mlx5_cmd *cmd = &dev->cmd; + u32 cmd_h, cmd_l; + u16 cmd_if_rev; + int err; + int i; + + cmd_if_rev = cmdif_rev(dev); + if (cmd_if_rev != CMD_IF_REV) { + dev_err(&dev->pdev->dev, + "Driver cmdif rev(%d) differs from firmware's(%d)\n", + CMD_IF_REV, cmd_if_rev); + return -EINVAL; + } + + cmd->pool = pci_pool_create("mlx5_cmd", dev->pdev, size, align, 0); + if (!cmd->pool) + return -ENOMEM; + + cmd->cmd_buf = (void *)__get_free_pages(GFP_ATOMIC, 0); + if (!cmd->cmd_buf) { + err = -ENOMEM; + goto err_free_pool; + } + cmd->dma = dma_map_single(&dev->pdev->dev, cmd->cmd_buf, PAGE_SIZE, + DMA_BIDIRECTIONAL); + if (dma_mapping_error(&dev->pdev->dev, cmd->dma)) { + err = -ENOMEM; + goto err_free; + } + + cmd_l = ioread32be(&dev->iseg->cmdq_addr_l_sz) & 0xff; + cmd->log_sz = cmd_l >> 4 & 0xf; + cmd->log_stride = cmd_l & 0xf; + if (1 << cmd->log_sz > MLX5_MAX_COMMANDS) { + dev_err(&dev->pdev->dev, "firmware reports too many outstanding commands %d\n", + 1 << cmd->log_sz); + err = -EINVAL; + goto err_map; + } + + if (cmd->log_sz + cmd->log_stride > PAGE_SHIFT) { + dev_err(&dev->pdev->dev, "command queue size overflow\n"); + err = -EINVAL; + goto err_map; + } + + cmd->checksum_disabled = 1; + cmd->max_reg_cmds = (1 << cmd->log_sz) - 1; + cmd->bitmask = (1 << cmd->max_reg_cmds) - 1; + + cmd->cmdif_rev = ioread32be(&dev->iseg->cmdif_rev_fw_sub) >> 16; + if (cmd->cmdif_rev > CMD_IF_REV) { + dev_err(&dev->pdev->dev, "driver does not support command interface version. driver %d, firmware %d\n", + CMD_IF_REV, cmd->cmdif_rev); + err = -ENOTSUPP; + goto err_map; + } + + spin_lock_init(&cmd->alloc_lock); + spin_lock_init(&cmd->token_lock); + for (i = 0; i < ARRAY_SIZE(cmd->stats); i++) + spin_lock_init(&cmd->stats[i].lock); + + sema_init(&cmd->sem, cmd->max_reg_cmds); + sema_init(&cmd->pages_sem, 1); + + cmd_h = (u32)((u64)(cmd->dma) >> 32); + cmd_l = (u32)(cmd->dma); + if (cmd_l & 0xfff) { + dev_err(&dev->pdev->dev, "invalid command queue address\n"); + err = -ENOMEM; + goto err_map; + } + + iowrite32be(cmd_h, &dev->iseg->cmdq_addr_h); + iowrite32be(cmd_l, &dev->iseg->cmdq_addr_l_sz); + + /* Make sure firmware sees the complete address before we proceed */ + wmb(); + + mlx5_core_dbg(dev, "descriptor at dma 0x%llx\n", (unsigned long long)(cmd->dma)); + + cmd->mode = CMD_MODE_POLLING; + + err = create_msg_cache(dev); + if (err) { + dev_err(&dev->pdev->dev, "failed to create command cache\n"); + goto err_map; + } + + set_wqname(dev); + cmd->wq = create_singlethread_workqueue(cmd->wq_name); + if (!cmd->wq) { + dev_err(&dev->pdev->dev, "failed to create command workqueue\n"); + err = -ENOMEM; + goto err_cache; + } + + err = create_debugfs_files(dev); + if (err) { + err = -ENOMEM; + goto err_wq; + } + + return 0; + +err_wq: + destroy_workqueue(cmd->wq); + +err_cache: + destroy_msg_cache(dev); + +err_map: + dma_unmap_single(&dev->pdev->dev, cmd->dma, PAGE_SIZE, + DMA_BIDIRECTIONAL); +err_free: + free_pages((unsigned long)cmd->cmd_buf, 0); + +err_free_pool: + pci_pool_destroy(cmd->pool); + + return err; +} +EXPORT_SYMBOL(mlx5_cmd_init); + +void mlx5_cmd_cleanup(struct mlx5_core_dev *dev) +{ + struct mlx5_cmd *cmd = &dev->cmd; + + clean_debug_files(dev); + destroy_workqueue(cmd->wq); + destroy_msg_cache(dev); + dma_unmap_single(&dev->pdev->dev, cmd->dma, PAGE_SIZE, + DMA_BIDIRECTIONAL); + free_pages((unsigned long)cmd->cmd_buf, 0); + pci_pool_destroy(cmd->pool); +} +EXPORT_SYMBOL(mlx5_cmd_cleanup); + +static const char *cmd_status_str(u8 status) +{ + switch (status) { + case MLX5_CMD_STAT_OK: + return "OK"; + case MLX5_CMD_STAT_INT_ERR: + return "internal error"; + case MLX5_CMD_STAT_BAD_OP_ERR: + return "bad operation"; + case MLX5_CMD_STAT_BAD_PARAM_ERR: + return "bad parameter"; + case MLX5_CMD_STAT_BAD_SYS_STATE_ERR: + return "bad system state"; + case MLX5_CMD_STAT_BAD_RES_ERR: + return "bad resource"; + case MLX5_CMD_STAT_RES_BUSY: + return "resource busy"; + case MLX5_CMD_STAT_LIM_ERR: + return "limits exceeded"; + case MLX5_CMD_STAT_BAD_RES_STATE_ERR: + return "bad resource state"; + case MLX5_CMD_STAT_IX_ERR: + return "bad index"; + case MLX5_CMD_STAT_NO_RES_ERR: + return "no resources"; + case MLX5_CMD_STAT_BAD_INP_LEN_ERR: + return "bad input length"; + case MLX5_CMD_STAT_BAD_OUTP_LEN_ERR: + return "bad output length"; + case MLX5_CMD_STAT_BAD_QP_STATE_ERR: + return "bad QP state"; + case MLX5_CMD_STAT_BAD_PKT_ERR: + return "bad packet (discarded)"; + case MLX5_CMD_STAT_BAD_SIZE_OUTS_CQES_ERR: + return "bad size too many outstanding CQEs"; + default: + return "unknown status"; + } +} + +static int cmd_status_to_err(u8 status) +{ + switch (status) { + case MLX5_CMD_STAT_OK: return 0; + case MLX5_CMD_STAT_INT_ERR: return -EIO; + case MLX5_CMD_STAT_BAD_OP_ERR: return -EINVAL; + case MLX5_CMD_STAT_BAD_PARAM_ERR: return -EINVAL; + case MLX5_CMD_STAT_BAD_SYS_STATE_ERR: return -EIO; + case MLX5_CMD_STAT_BAD_RES_ERR: return -EINVAL; + case MLX5_CMD_STAT_RES_BUSY: return -EBUSY; + case MLX5_CMD_STAT_LIM_ERR: return -ENOMEM; + case MLX5_CMD_STAT_BAD_RES_STATE_ERR: return -EINVAL; + case MLX5_CMD_STAT_IX_ERR: return -EINVAL; + case MLX5_CMD_STAT_NO_RES_ERR: return -EAGAIN; + case MLX5_CMD_STAT_BAD_INP_LEN_ERR: return -EIO; + case MLX5_CMD_STAT_BAD_OUTP_LEN_ERR: return -EIO; + case MLX5_CMD_STAT_BAD_QP_STATE_ERR: return -EINVAL; + case MLX5_CMD_STAT_BAD_PKT_ERR: return -EINVAL; + case MLX5_CMD_STAT_BAD_SIZE_OUTS_CQES_ERR: return -EINVAL; + default: return -EIO; + } +} + +/* this will be available till all the commands use set/get macros */ +int mlx5_cmd_status_to_err(struct mlx5_outbox_hdr *hdr) +{ + if (!hdr->status) + return 0; + + pr_warn("command failed, status %s(0x%x), syndrome 0x%x\n", + cmd_status_str(hdr->status), hdr->status, + be32_to_cpu(hdr->syndrome)); + + return cmd_status_to_err(hdr->status); +} + +int mlx5_cmd_status_to_err_v2(void *ptr) +{ + u32 syndrome; + u8 status; + + status = be32_to_cpu(*(__be32 *)ptr) >> 24; + if (!status) + return 0; + + syndrome = be32_to_cpu(*(__be32 *)(ptr + 4)); + + pr_warn("command failed, status %s(0x%x), syndrome 0x%x\n", + cmd_status_str(status), status, syndrome); + + return cmd_status_to_err(status); +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cq.c b/drivers/net/ethernet/mellanox/mlx5/core/cq.c new file mode 100644 index 0000000..43c5f48 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/cq.c @@ -0,0 +1,237 @@ +/* + * Copyright (c) 2013, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include "mlx5_core.h" + +void mlx5_cq_completion(struct mlx5_core_dev *dev, u32 cqn) +{ + struct mlx5_core_cq *cq; + struct mlx5_cq_table *table = &dev->priv.cq_table; + + spin_lock(&table->lock); + cq = radix_tree_lookup(&table->tree, cqn); + if (likely(cq)) + atomic_inc(&cq->refcount); + spin_unlock(&table->lock); + + if (!cq) { + mlx5_core_warn(dev, "Completion event for bogus CQ 0x%x\n", cqn); + return; + } + + ++cq->arm_sn; + + cq->comp(cq); + + if (atomic_dec_and_test(&cq->refcount)) + complete(&cq->free); +} + +void mlx5_cq_event(struct mlx5_core_dev *dev, u32 cqn, int event_type) +{ + struct mlx5_cq_table *table = &dev->priv.cq_table; + struct mlx5_core_cq *cq; + + spin_lock(&table->lock); + + cq = radix_tree_lookup(&table->tree, cqn); + if (cq) + atomic_inc(&cq->refcount); + + spin_unlock(&table->lock); + + if (!cq) { + mlx5_core_warn(dev, "Async event for bogus CQ 0x%x\n", cqn); + return; + } + + cq->event(cq, event_type); + + if (atomic_dec_and_test(&cq->refcount)) + complete(&cq->free); +} + + +int mlx5_core_create_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq, + struct mlx5_create_cq_mbox_in *in, int inlen) +{ + int err; + struct mlx5_cq_table *table = &dev->priv.cq_table; + struct mlx5_create_cq_mbox_out out; + struct mlx5_destroy_cq_mbox_in din; + struct mlx5_destroy_cq_mbox_out dout; + + in->hdr.opcode = cpu_to_be16(MLX5_CMD_OP_CREATE_CQ); + memset(&out, 0, sizeof(out)); + err = mlx5_cmd_exec(dev, in, inlen, &out, sizeof(out)); + if (err) + return err; + + if (out.hdr.status) + return mlx5_cmd_status_to_err(&out.hdr); + + cq->cqn = be32_to_cpu(out.cqn) & 0xffffff; + cq->cons_index = 0; + cq->arm_sn = 0; + atomic_set(&cq->refcount, 1); + init_completion(&cq->free); + + spin_lock_irq(&table->lock); + err = radix_tree_insert(&table->tree, cq->cqn, cq); + spin_unlock_irq(&table->lock); + if (err) + goto err_cmd; + + cq->pid = current->pid; + err = mlx5_debug_cq_add(dev, cq); + if (err) + mlx5_core_dbg(dev, "failed adding CP 0x%x to debug file system\n", + cq->cqn); + + return 0; + +err_cmd: + memset(&din, 0, sizeof(din)); + memset(&dout, 0, sizeof(dout)); + din.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_DESTROY_CQ); + mlx5_cmd_exec(dev, &din, sizeof(din), &dout, sizeof(dout)); + return err; +} +EXPORT_SYMBOL(mlx5_core_create_cq); + +int mlx5_core_destroy_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq) +{ + struct mlx5_cq_table *table = &dev->priv.cq_table; + struct mlx5_destroy_cq_mbox_in in; + struct mlx5_destroy_cq_mbox_out out; + struct mlx5_core_cq *tmp; + int err; + + spin_lock_irq(&table->lock); + tmp = radix_tree_delete(&table->tree, cq->cqn); + spin_unlock_irq(&table->lock); + if (!tmp) { + mlx5_core_warn(dev, "cq 0x%x not found in tree\n", cq->cqn); + return -EINVAL; + } + if (tmp != cq) { + mlx5_core_warn(dev, "corruption on srqn 0x%x\n", cq->cqn); + return -EINVAL; + } + + memset(&in, 0, sizeof(in)); + memset(&out, 0, sizeof(out)); + in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_DESTROY_CQ); + in.cqn = cpu_to_be32(cq->cqn); + err = mlx5_cmd_exec(dev, &in, sizeof(in), &out, sizeof(out)); + if (err) + return err; + + if (out.hdr.status) + return mlx5_cmd_status_to_err(&out.hdr); + + synchronize_irq(cq->irqn); + + mlx5_debug_cq_remove(dev, cq); + if (atomic_dec_and_test(&cq->refcount)) + complete(&cq->free); + wait_for_completion(&cq->free); + + return 0; +} +EXPORT_SYMBOL(mlx5_core_destroy_cq); + +int mlx5_core_query_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq, + struct mlx5_query_cq_mbox_out *out) +{ + struct mlx5_query_cq_mbox_in in; + int err; + + memset(&in, 0, sizeof(in)); + memset(out, 0, sizeof(*out)); + + in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_QUERY_CQ); + in.cqn = cpu_to_be32(cq->cqn); + err = mlx5_cmd_exec(dev, &in, sizeof(in), out, sizeof(*out)); + if (err) + return err; + + if (out->hdr.status) + return mlx5_cmd_status_to_err(&out->hdr); + + return err; +} +EXPORT_SYMBOL(mlx5_core_query_cq); + + +int mlx5_core_modify_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq, + struct mlx5_modify_cq_mbox_in *in, int in_sz) +{ + struct mlx5_modify_cq_mbox_out out; + int err; + + memset(&out, 0, sizeof(out)); + in->hdr.opcode = cpu_to_be16(MLX5_CMD_OP_MODIFY_CQ); + err = mlx5_cmd_exec(dev, in, in_sz, &out, sizeof(out)); + if (err) + return err; + + if (out.hdr.status) + return mlx5_cmd_status_to_err(&out.hdr); + + return 0; +} +EXPORT_SYMBOL(mlx5_core_modify_cq); + +int mlx5_init_cq_table(struct mlx5_core_dev *dev) +{ + struct mlx5_cq_table *table = &dev->priv.cq_table; + int err; + + spin_lock_init(&table->lock); + INIT_RADIX_TREE(&table->tree, GFP_ATOMIC); + err = mlx5_cq_debugfs_init(dev); + + return err; +} + +void mlx5_cleanup_cq_table(struct mlx5_core_dev *dev) +{ + mlx5_cq_debugfs_cleanup(dev); +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c b/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c new file mode 100644 index 0000000..10e1f1a --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c @@ -0,0 +1,610 @@ +/* + * Copyright (c) 2013, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include "mlx5_core.h" + +enum { + QP_PID, + QP_STATE, + QP_XPORT, + QP_MTU, + QP_N_RECV, + QP_RECV_SZ, + QP_N_SEND, + QP_LOG_PG_SZ, + QP_RQPN, +}; + +static char *qp_fields[] = { + [QP_PID] = "pid", + [QP_STATE] = "state", + [QP_XPORT] = "transport", + [QP_MTU] = "mtu", + [QP_N_RECV] = "num_recv", + [QP_RECV_SZ] = "rcv_wqe_sz", + [QP_N_SEND] = "num_send", + [QP_LOG_PG_SZ] = "log2_page_sz", + [QP_RQPN] = "remote_qpn", +}; + +enum { + EQ_NUM_EQES, + EQ_INTR, + EQ_LOG_PG_SZ, +}; + +static char *eq_fields[] = { + [EQ_NUM_EQES] = "num_eqes", + [EQ_INTR] = "intr", + [EQ_LOG_PG_SZ] = "log_page_size", +}; + +enum { + CQ_PID, + CQ_NUM_CQES, + CQ_LOG_PG_SZ, +}; + +static char *cq_fields[] = { + [CQ_PID] = "pid", + [CQ_NUM_CQES] = "num_cqes", + [CQ_LOG_PG_SZ] = "log_page_size", +}; + +struct dentry *mlx5_debugfs_root; +EXPORT_SYMBOL(mlx5_debugfs_root); + +void mlx5_register_debugfs(void) +{ + mlx5_debugfs_root = debugfs_create_dir("mlx5", NULL); + if (IS_ERR_OR_NULL(mlx5_debugfs_root)) + mlx5_debugfs_root = NULL; +} + +void mlx5_unregister_debugfs(void) +{ + debugfs_remove(mlx5_debugfs_root); +} + +int mlx5_qp_debugfs_init(struct mlx5_core_dev *dev) +{ + if (!mlx5_debugfs_root) + return 0; + + atomic_set(&dev->num_qps, 0); + + dev->priv.qp_debugfs = debugfs_create_dir("QPs", dev->priv.dbg_root); + if (!dev->priv.qp_debugfs) + return -ENOMEM; + + return 0; +} + +void mlx5_qp_debugfs_cleanup(struct mlx5_core_dev *dev) +{ + if (!mlx5_debugfs_root) + return; + + debugfs_remove_recursive(dev->priv.qp_debugfs); +} + +int mlx5_eq_debugfs_init(struct mlx5_core_dev *dev) +{ + if (!mlx5_debugfs_root) + return 0; + + dev->priv.eq_debugfs = debugfs_create_dir("EQs", dev->priv.dbg_root); + if (!dev->priv.eq_debugfs) + return -ENOMEM; + + return 0; +} + +void mlx5_eq_debugfs_cleanup(struct mlx5_core_dev *dev) +{ + if (!mlx5_debugfs_root) + return; + + debugfs_remove_recursive(dev->priv.eq_debugfs); +} + +static ssize_t average_read(struct file *filp, char __user *buf, size_t count, + loff_t *pos) +{ + struct mlx5_cmd_stats *stats; + u64 field = 0; + int ret; + char tbuf[22]; + + if (*pos) + return 0; + + stats = filp->private_data; + spin_lock_irq(&stats->lock); + if (stats->n) + field = div64_u64(stats->sum, stats->n); + spin_unlock_irq(&stats->lock); + ret = snprintf(tbuf, sizeof(tbuf), "%llu\n", field); + if (ret > 0) { + if (copy_to_user(buf, tbuf, ret)) + return -EFAULT; + } + + *pos += ret; + return ret; +} + + +static ssize_t average_write(struct file *filp, const char __user *buf, + size_t count, loff_t *pos) +{ + struct mlx5_cmd_stats *stats; + + stats = filp->private_data; + spin_lock_irq(&stats->lock); + stats->sum = 0; + stats->n = 0; + spin_unlock_irq(&stats->lock); + + *pos += count; + + return count; +} + +static const struct file_operations stats_fops = { + .owner = THIS_MODULE, + .open = simple_open, + .read = average_read, + .write = average_write, +}; + +int mlx5_cmdif_debugfs_init(struct mlx5_core_dev *dev) +{ + struct mlx5_cmd_stats *stats; + struct dentry **cmd; + const char *namep; + int err; + int i; + + if (!mlx5_debugfs_root) + return 0; + + cmd = &dev->priv.cmdif_debugfs; + *cmd = debugfs_create_dir("commands", dev->priv.dbg_root); + if (!*cmd) + return -ENOMEM; + + for (i = 0; i < ARRAY_SIZE(dev->cmd.stats); i++) { + stats = &dev->cmd.stats[i]; + namep = mlx5_command_str(i); + if (strcmp(namep, "unknown command opcode")) { + stats->root = debugfs_create_dir(namep, *cmd); + if (!stats->root) { + mlx5_core_warn(dev, "failed adding command %d\n", + i); + err = -ENOMEM; + goto out; + } + + stats->avg = debugfs_create_file("average", 0400, + stats->root, stats, + &stats_fops); + if (!stats->avg) { + mlx5_core_warn(dev, "failed creating debugfs file\n"); + err = -ENOMEM; + goto out; + } + + stats->count = debugfs_create_u64("n", 0400, + stats->root, + &stats->n); + if (!stats->count) { + mlx5_core_warn(dev, "failed creating debugfs file\n"); + err = -ENOMEM; + goto out; + } + } + } + + return 0; +out: + debugfs_remove_recursive(dev->priv.cmdif_debugfs); + return err; +} + +void mlx5_cmdif_debugfs_cleanup(struct mlx5_core_dev *dev) +{ + if (!mlx5_debugfs_root) + return; + + debugfs_remove_recursive(dev->priv.cmdif_debugfs); +} + +int mlx5_cq_debugfs_init(struct mlx5_core_dev *dev) +{ + if (!mlx5_debugfs_root) + return 0; + + dev->priv.cq_debugfs = debugfs_create_dir("CQs", dev->priv.dbg_root); + if (!dev->priv.cq_debugfs) + return -ENOMEM; + + return 0; +} + +void mlx5_cq_debugfs_cleanup(struct mlx5_core_dev *dev) +{ + if (!mlx5_debugfs_root) + return; + + debugfs_remove_recursive(dev->priv.cq_debugfs); +} + +static u64 qp_read_field(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp, + int index, int *is_str) +{ + struct mlx5_query_qp_mbox_out *out; + struct mlx5_qp_context *ctx; + u64 param = 0; + int err; + int no_sq; + + out = kzalloc(sizeof(*out), GFP_KERNEL); + if (!out) + return param; + + err = mlx5_core_qp_query(dev, qp, out, sizeof(*out)); + if (err) { + mlx5_core_warn(dev, "failed to query qp\n"); + goto out; + } + + *is_str = 0; + ctx = &out->ctx; + switch (index) { + case QP_PID: + param = qp->pid; + break; + case QP_STATE: + param = (u64)mlx5_qp_state_str(be32_to_cpu(ctx->flags) >> 28); + *is_str = 1; + break; + case QP_XPORT: + param = (u64)mlx5_qp_type_str((be32_to_cpu(ctx->flags) >> 16) & 0xff); + *is_str = 1; + break; + case QP_MTU: + switch (ctx->mtu_msgmax >> 5) { + case IB_MTU_256: + param = 256; + break; + case IB_MTU_512: + param = 512; + break; + case IB_MTU_1024: + param = 1024; + break; + case IB_MTU_2048: + param = 2048; + break; + case IB_MTU_4096: + param = 4096; + break; + default: + param = 0; + } + break; + case QP_N_RECV: + param = 1 << ((ctx->rq_size_stride >> 3) & 0xf); + break; + case QP_RECV_SZ: + param = 1 << ((ctx->rq_size_stride & 7) + 4); + break; + case QP_N_SEND: + no_sq = be16_to_cpu(ctx->sq_crq_size) >> 15; + if (!no_sq) + param = 1 << (be16_to_cpu(ctx->sq_crq_size) >> 11); + else + param = 0; + break; + case QP_LOG_PG_SZ: + param = (be32_to_cpu(ctx->log_pg_sz_remote_qpn) >> 24) & 0x1f; + param += 12; + break; + case QP_RQPN: + param = be32_to_cpu(ctx->log_pg_sz_remote_qpn) & 0xffffff; + break; + } + +out: + kfree(out); + return param; +} + +static u64 eq_read_field(struct mlx5_core_dev *dev, struct mlx5_eq *eq, + int index) +{ + struct mlx5_query_eq_mbox_out *out; + struct mlx5_eq_context *ctx; + u64 param = 0; + int err; + + out = kzalloc(sizeof(*out), GFP_KERNEL); + if (!out) + return param; + + ctx = &out->ctx; + + err = mlx5_core_eq_query(dev, eq, out, sizeof(*out)); + if (err) { + mlx5_core_warn(dev, "failed to query eq\n"); + goto out; + } + + switch (index) { + case EQ_NUM_EQES: + param = 1 << ((be32_to_cpu(ctx->log_sz_usr_page) >> 24) & 0x1f); + break; + case EQ_INTR: + param = ctx->intr; + break; + case EQ_LOG_PG_SZ: + param = (ctx->log_page_size & 0x1f) + 12; + break; + } + +out: + kfree(out); + return param; +} + +static u64 cq_read_field(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq, + int index) +{ + struct mlx5_query_cq_mbox_out *out; + struct mlx5_cq_context *ctx; + u64 param = 0; + int err; + + out = kzalloc(sizeof(*out), GFP_KERNEL); + if (!out) + return param; + + ctx = &out->ctx; + + err = mlx5_core_query_cq(dev, cq, out); + if (err) { + mlx5_core_warn(dev, "failed to query cq\n"); + goto out; + } + + switch (index) { + case CQ_PID: + param = cq->pid; + break; + case CQ_NUM_CQES: + param = 1 << ((be32_to_cpu(ctx->log_sz_usr_page) >> 24) & 0x1f); + break; + case CQ_LOG_PG_SZ: + param = (ctx->log_pg_sz & 0x1f) + 12; + break; + } + +out: + kfree(out); + return param; +} + +static ssize_t dbg_read(struct file *filp, char __user *buf, size_t count, + loff_t *pos) +{ + struct mlx5_field_desc *desc; + struct mlx5_rsc_debug *d; + char tbuf[18]; + int is_str = 0; + u64 field; + int ret; + + if (*pos) + return 0; + + desc = filp->private_data; + d = (void *)(desc - desc->i) - sizeof(*d); + switch (d->type) { + case MLX5_DBG_RSC_QP: + field = qp_read_field(d->dev, d->object, desc->i, &is_str); + break; + + case MLX5_DBG_RSC_EQ: + field = eq_read_field(d->dev, d->object, desc->i); + break; + + case MLX5_DBG_RSC_CQ: + field = cq_read_field(d->dev, d->object, desc->i); + break; + + default: + mlx5_core_warn(d->dev, "invalid resource type %d\n", d->type); + return -EINVAL; + } + + + if (is_str) + ret = snprintf(tbuf, sizeof(tbuf), "%s\n", (const char *)field); + else + ret = snprintf(tbuf, sizeof(tbuf), "0x%llx\n", field); + + if (ret > 0) { + if (copy_to_user(buf, tbuf, ret)) + return -EFAULT; + } + + *pos += ret; + return ret; +} + +static const struct file_operations fops = { + .owner = THIS_MODULE, + .open = simple_open, + .read = dbg_read, +}; + +static int add_res_tree(struct mlx5_core_dev *dev, enum dbg_rsc_type type, + struct dentry *root, struct mlx5_rsc_debug **dbg, + int rsn, char **field, int nfile, void *data) +{ + struct mlx5_rsc_debug *d; + char resn[32]; + int err; + int i; + + d = kzalloc(sizeof(*d) + nfile * sizeof(d->fields[0]), GFP_KERNEL); + if (!d) + return -ENOMEM; + + d->dev = dev; + d->object = data; + d->type = type; + sprintf(resn, "0x%x", rsn); + d->root = debugfs_create_dir(resn, root); + if (!d->root) { + err = -ENOMEM; + goto out_free; + } + + for (i = 0; i < nfile; i++) { + d->fields[i].i = i; + d->fields[i].dent = debugfs_create_file(field[i], 0400, + d->root, &d->fields[i], + &fops); + if (!d->fields[i].dent) { + err = -ENOMEM; + goto out_rem; + } + } + *dbg = d; + + return 0; +out_rem: + debugfs_remove_recursive(d->root); + +out_free: + kfree(d); + return err; +} + +static void rem_res_tree(struct mlx5_rsc_debug *d) +{ + debugfs_remove_recursive(d->root); + kfree(d); +} + +int mlx5_debug_qp_add(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp) +{ + int err; + + if (!mlx5_debugfs_root) + return 0; + + err = add_res_tree(dev, MLX5_DBG_RSC_QP, dev->priv.qp_debugfs, + &qp->dbg, qp->qpn, qp_fields, + ARRAY_SIZE(qp_fields), qp); + if (err) + qp->dbg = NULL; + + return err; +} + +void mlx5_debug_qp_remove(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp) +{ + if (!mlx5_debugfs_root) + return; + + if (qp->dbg) + rem_res_tree(qp->dbg); +} + + +int mlx5_debug_eq_add(struct mlx5_core_dev *dev, struct mlx5_eq *eq) +{ + int err; + + if (!mlx5_debugfs_root) + return 0; + + err = add_res_tree(dev, MLX5_DBG_RSC_EQ, dev->priv.eq_debugfs, + &eq->dbg, eq->eqn, eq_fields, + ARRAY_SIZE(eq_fields), eq); + if (err) + eq->dbg = NULL; + + return err; +} + +void mlx5_debug_eq_remove(struct mlx5_core_dev *dev, struct mlx5_eq *eq) +{ + if (!mlx5_debugfs_root) + return; + + if (eq->dbg) + rem_res_tree(eq->dbg); +} + +int mlx5_debug_cq_add(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq) +{ + int err; + + if (!mlx5_debugfs_root) + return 0; + + err = add_res_tree(dev, MLX5_DBG_RSC_CQ, dev->priv.cq_debugfs, + &cq->dbg, cq->cqn, cq_fields, + ARRAY_SIZE(cq_fields), cq); + if (err) + cq->dbg = NULL; + + return err; +} + +void mlx5_debug_cq_remove(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq) +{ + if (!mlx5_debugfs_root) + return; + + if (cq->dbg) + rem_res_tree(cq->dbg); +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c new file mode 100644 index 0000000..ad2c96a --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c @@ -0,0 +1,528 @@ +/* + * Copyright (c) 2013, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include "mlx5_core.h" + +enum { + MLX5_EQE_SIZE = sizeof(struct mlx5_eqe), + MLX5_EQE_OWNER_INIT_VAL = 0x1, +}; + +enum { + MLX5_EQ_STATE_ARMED = 0x9, + MLX5_EQ_STATE_FIRED = 0xa, + MLX5_EQ_STATE_ALWAYS_ARMED = 0xb, +}; + +enum { + MLX5_NUM_SPARE_EQE = 0x80, + MLX5_NUM_ASYNC_EQE = 0x100, + MLX5_NUM_CMD_EQE = 32, +}; + +enum { + MLX5_EQ_DOORBEL_OFFSET = 0x40, +}; + +#define MLX5_ASYNC_EVENT_MASK ((1ull << MLX5_EVENT_TYPE_PATH_MIG) | \ + (1ull << MLX5_EVENT_TYPE_COMM_EST) | \ + (1ull << MLX5_EVENT_TYPE_SQ_DRAINED) | \ + (1ull << MLX5_EVENT_TYPE_CQ_ERROR) | \ + (1ull << MLX5_EVENT_TYPE_WQ_CATAS_ERROR) | \ + (1ull << MLX5_EVENT_TYPE_PATH_MIG_FAILED) | \ + (1ull << MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR) | \ + (1ull << MLX5_EVENT_TYPE_WQ_ACCESS_ERROR) | \ + (1ull << MLX5_EVENT_TYPE_PORT_CHANGE) | \ + (1ull << MLX5_EVENT_TYPE_SRQ_CATAS_ERROR) | \ + (1ull << MLX5_EVENT_TYPE_SRQ_LAST_WQE) | \ + (1ull << MLX5_EVENT_TYPE_SRQ_RQ_LIMIT)) + +struct map_eq_in { + u64 mask; + u32 reserved; + u32 unmap_eqn; +}; + +struct cre_des_eq { + u8 reserved[15]; + u8 eqn; +}; + +static int mlx5_cmd_destroy_eq(struct mlx5_core_dev *dev, u8 eqn) +{ + struct mlx5_destroy_eq_mbox_in in; + struct mlx5_destroy_eq_mbox_out out; + int err; + + memset(&in, 0, sizeof(in)); + memset(&out, 0, sizeof(out)); + in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_DESTROY_EQ); + in.eqn = eqn; + err = mlx5_cmd_exec(dev, &in, sizeof(in), &out, sizeof(out)); + if (!err) + goto ex; + + if (out.hdr.status) + err = mlx5_cmd_status_to_err(&out.hdr); + +ex: + return err; +} + +static struct mlx5_eqe *get_eqe(struct mlx5_eq *eq, u32 entry) +{ + return mlx5_buf_offset(&eq->buf, entry * MLX5_EQE_SIZE); +} + +static struct mlx5_eqe *next_eqe_sw(struct mlx5_eq *eq) +{ + struct mlx5_eqe *eqe = get_eqe(eq, eq->cons_index & (eq->nent - 1)); + + return ((eqe->owner & 1) ^ !!(eq->cons_index & eq->nent)) ? NULL : eqe; +} + +static const char *eqe_type_str(u8 type) +{ + switch (type) { + case MLX5_EVENT_TYPE_COMP: + return "MLX5_EVENT_TYPE_COMP"; + case MLX5_EVENT_TYPE_PATH_MIG: + return "MLX5_EVENT_TYPE_PATH_MIG"; + case MLX5_EVENT_TYPE_COMM_EST: + return "MLX5_EVENT_TYPE_COMM_EST"; + case MLX5_EVENT_TYPE_SQ_DRAINED: + return "MLX5_EVENT_TYPE_SQ_DRAINED"; + case MLX5_EVENT_TYPE_SRQ_LAST_WQE: + return "MLX5_EVENT_TYPE_SRQ_LAST_WQE"; + case MLX5_EVENT_TYPE_SRQ_RQ_LIMIT: + return "MLX5_EVENT_TYPE_SRQ_RQ_LIMIT"; + case MLX5_EVENT_TYPE_CQ_ERROR: + return "MLX5_EVENT_TYPE_CQ_ERROR"; + case MLX5_EVENT_TYPE_WQ_CATAS_ERROR: + return "MLX5_EVENT_TYPE_WQ_CATAS_ERROR"; + case MLX5_EVENT_TYPE_PATH_MIG_FAILED: + return "MLX5_EVENT_TYPE_PATH_MIG_FAILED"; + case MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR: + return "MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR"; + case MLX5_EVENT_TYPE_WQ_ACCESS_ERROR: + return "MLX5_EVENT_TYPE_WQ_ACCESS_ERROR"; + case MLX5_EVENT_TYPE_SRQ_CATAS_ERROR: + return "MLX5_EVENT_TYPE_SRQ_CATAS_ERROR"; + case MLX5_EVENT_TYPE_INTERNAL_ERROR: + return "MLX5_EVENT_TYPE_INTERNAL_ERROR"; + case MLX5_EVENT_TYPE_PORT_CHANGE: + return "MLX5_EVENT_TYPE_PORT_CHANGE"; + case MLX5_EVENT_TYPE_GPIO_EVENT: + return "MLX5_EVENT_TYPE_GPIO_EVENT"; + case MLX5_EVENT_TYPE_REMOTE_CONFIG: + return "MLX5_EVENT_TYPE_REMOTE_CONFIG"; + case MLX5_EVENT_TYPE_DB_BF_CONGESTION: + return "MLX5_EVENT_TYPE_DB_BF_CONGESTION"; + case MLX5_EVENT_TYPE_STALL_EVENT: + return "MLX5_EVENT_TYPE_STALL_EVENT"; + case MLX5_EVENT_TYPE_CMD: + return "MLX5_EVENT_TYPE_CMD"; + case MLX5_EVENT_TYPE_PAGE_REQUEST: + return "MLX5_EVENT_TYPE_PAGE_REQUEST"; + default: + return "Unrecognized event"; + } +} + +static enum mlx5_dev_event port_subtype_event(u8 subtype) +{ + switch (subtype) { + case MLX5_PORT_CHANGE_SUBTYPE_DOWN: + return MLX5_DEV_EVENT_PORT_DOWN; + case MLX5_PORT_CHANGE_SUBTYPE_ACTIVE: + return MLX5_DEV_EVENT_PORT_UP; + case MLX5_PORT_CHANGE_SUBTYPE_INITIALIZED: + return MLX5_DEV_EVENT_PORT_INITIALIZED; + case MLX5_PORT_CHANGE_SUBTYPE_LID: + return MLX5_DEV_EVENT_LID_CHANGE; + case MLX5_PORT_CHANGE_SUBTYPE_PKEY: + return MLX5_DEV_EVENT_PKEY_CHANGE; + case MLX5_PORT_CHANGE_SUBTYPE_GUID: + return MLX5_DEV_EVENT_GUID_CHANGE; + case MLX5_PORT_CHANGE_SUBTYPE_CLIENT_REREG: + return MLX5_DEV_EVENT_CLIENT_REREG; + } + return -1; +} + +static void eq_update_ci(struct mlx5_eq *eq, int arm) +{ + __be32 __iomem *addr = eq->doorbell + (arm ? 0 : 2); + u32 val = (eq->cons_index & 0xffffff) | (eq->eqn << 24); + __raw_writel((__force u32) cpu_to_be32(val), addr); + /* We still want ordering, just not swabbing, so add a barrier */ + mb(); +} + +static int mlx5_eq_int(struct mlx5_core_dev *dev, struct mlx5_eq *eq) +{ + struct mlx5_eqe *eqe; + int eqes_found = 0; + int set_ci = 0; + u32 cqn; + u32 rsn; + u8 port; + + while ((eqe = next_eqe_sw(eq))) { + /* + * Make sure we read EQ entry contents after we've + * checked the ownership bit. + */ + rmb(); + + mlx5_core_dbg(eq->dev, "eqn %d, eqe type %s\n", + eq->eqn, eqe_type_str(eqe->type)); + switch (eqe->type) { + case MLX5_EVENT_TYPE_COMP: + cqn = be32_to_cpu(eqe->data.comp.cqn) & 0xffffff; + mlx5_cq_completion(dev, cqn); + break; + + case MLX5_EVENT_TYPE_PATH_MIG: + case MLX5_EVENT_TYPE_COMM_EST: + case MLX5_EVENT_TYPE_SQ_DRAINED: + case MLX5_EVENT_TYPE_SRQ_LAST_WQE: + case MLX5_EVENT_TYPE_WQ_CATAS_ERROR: + case MLX5_EVENT_TYPE_PATH_MIG_FAILED: + case MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR: + case MLX5_EVENT_TYPE_WQ_ACCESS_ERROR: + rsn = be32_to_cpu(eqe->data.qp_srq.qp_srq_n) & 0xffffff; + mlx5_core_dbg(dev, "event %s(%d) arrived\n", + eqe_type_str(eqe->type), eqe->type); + mlx5_rsc_event(dev, rsn, eqe->type); + break; + + case MLX5_EVENT_TYPE_SRQ_RQ_LIMIT: + case MLX5_EVENT_TYPE_SRQ_CATAS_ERROR: + rsn = be32_to_cpu(eqe->data.qp_srq.qp_srq_n) & 0xffffff; + mlx5_core_dbg(dev, "SRQ event %s(%d): srqn 0x%x\n", + eqe_type_str(eqe->type), eqe->type, rsn); + mlx5_srq_event(dev, rsn, eqe->type); + break; + + case MLX5_EVENT_TYPE_CMD: + mlx5_cmd_comp_handler(dev, be32_to_cpu(eqe->data.cmd.vector)); + break; + + case MLX5_EVENT_TYPE_PORT_CHANGE: + port = (eqe->data.port.port >> 4) & 0xf; + switch (eqe->sub_type) { + case MLX5_PORT_CHANGE_SUBTYPE_DOWN: + case MLX5_PORT_CHANGE_SUBTYPE_ACTIVE: + case MLX5_PORT_CHANGE_SUBTYPE_LID: + case MLX5_PORT_CHANGE_SUBTYPE_PKEY: + case MLX5_PORT_CHANGE_SUBTYPE_GUID: + case MLX5_PORT_CHANGE_SUBTYPE_CLIENT_REREG: + case MLX5_PORT_CHANGE_SUBTYPE_INITIALIZED: + if (dev->event) + dev->event(dev, port_subtype_event(eqe->sub_type), + (unsigned long)port); + break; + default: + mlx5_core_warn(dev, "Port event with unrecognized subtype: port %d, sub_type %d\n", + port, eqe->sub_type); + } + break; + case MLX5_EVENT_TYPE_CQ_ERROR: + cqn = be32_to_cpu(eqe->data.cq_err.cqn) & 0xffffff; + mlx5_core_warn(dev, "CQ error on CQN 0x%x, syndrom 0x%x\n", + cqn, eqe->data.cq_err.syndrome); + mlx5_cq_event(dev, cqn, eqe->type); + break; + + case MLX5_EVENT_TYPE_PAGE_REQUEST: + { + u16 func_id = be16_to_cpu(eqe->data.req_pages.func_id); + s32 npages = be32_to_cpu(eqe->data.req_pages.num_pages); + + mlx5_core_dbg(dev, "page request for func 0x%x, npages %d\n", + func_id, npages); + mlx5_core_req_pages_handler(dev, func_id, npages); + } + break; + + + default: + mlx5_core_warn(dev, "Unhandled event 0x%x on EQ 0x%x\n", + eqe->type, eq->eqn); + break; + } + + ++eq->cons_index; + eqes_found = 1; + ++set_ci; + + /* The HCA will think the queue has overflowed if we + * don't tell it we've been processing events. We + * create our EQs with MLX5_NUM_SPARE_EQE extra + * entries, so we must update our consumer index at + * least that often. + */ + if (unlikely(set_ci >= MLX5_NUM_SPARE_EQE)) { + eq_update_ci(eq, 0); + set_ci = 0; + } + } + + eq_update_ci(eq, 1); + + return eqes_found; +} + +static irqreturn_t mlx5_msix_handler(int irq, void *eq_ptr) +{ + struct mlx5_eq *eq = eq_ptr; + struct mlx5_core_dev *dev = eq->dev; + + mlx5_eq_int(dev, eq); + + /* MSI-X vectors always belong to us */ + return IRQ_HANDLED; +} + +static void init_eq_buf(struct mlx5_eq *eq) +{ + struct mlx5_eqe *eqe; + int i; + + for (i = 0; i < eq->nent; i++) { + eqe = get_eqe(eq, i); + eqe->owner = MLX5_EQE_OWNER_INIT_VAL; + } +} + +int mlx5_create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq, u8 vecidx, + int nent, u64 mask, const char *name, struct mlx5_uar *uar) +{ + struct mlx5_eq_table *table = &dev->priv.eq_table; + struct mlx5_create_eq_mbox_in *in; + struct mlx5_create_eq_mbox_out out; + int err; + int inlen; + + eq->nent = roundup_pow_of_two(nent + MLX5_NUM_SPARE_EQE); + err = mlx5_buf_alloc(dev, eq->nent * MLX5_EQE_SIZE, 2 * PAGE_SIZE, + &eq->buf); + if (err) + return err; + + init_eq_buf(eq); + + inlen = sizeof(*in) + sizeof(in->pas[0]) * eq->buf.npages; + in = mlx5_vzalloc(inlen); + if (!in) { + err = -ENOMEM; + goto err_buf; + } + memset(&out, 0, sizeof(out)); + + mlx5_fill_page_array(&eq->buf, in->pas); + + in->hdr.opcode = cpu_to_be16(MLX5_CMD_OP_CREATE_EQ); + in->ctx.log_sz_usr_page = cpu_to_be32(ilog2(eq->nent) << 24 | uar->index); + in->ctx.intr = vecidx; + in->ctx.log_page_size = eq->buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT; + in->events_mask = cpu_to_be64(mask); + + err = mlx5_cmd_exec(dev, in, inlen, &out, sizeof(out)); + if (err) + goto err_in; + + if (out.hdr.status) { + err = mlx5_cmd_status_to_err(&out.hdr); + goto err_in; + } + + snprintf(eq->name, MLX5_MAX_EQ_NAME, "%s@pci:%s", + name, pci_name(dev->pdev)); + eq->eqn = out.eq_number; + eq->irqn = vecidx; + eq->dev = dev; + eq->doorbell = uar->map + MLX5_EQ_DOORBEL_OFFSET; + err = request_irq(table->msix_arr[vecidx].vector, mlx5_msix_handler, 0, + eq->name, eq); + if (err) + goto err_eq; + + err = mlx5_debug_eq_add(dev, eq); + if (err) + goto err_irq; + + /* EQs are created in ARMED state + */ + eq_update_ci(eq, 1); + + mlx5_vfree(in); + return 0; + +err_irq: + free_irq(table->msix_arr[vecidx].vector, eq); + +err_eq: + mlx5_cmd_destroy_eq(dev, eq->eqn); + +err_in: + mlx5_vfree(in); + +err_buf: + mlx5_buf_free(dev, &eq->buf); + return err; +} +EXPORT_SYMBOL_GPL(mlx5_create_map_eq); + +int mlx5_destroy_unmap_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq) +{ + struct mlx5_eq_table *table = &dev->priv.eq_table; + int err; + + mlx5_debug_eq_remove(dev, eq); + free_irq(table->msix_arr[eq->irqn].vector, eq); + err = mlx5_cmd_destroy_eq(dev, eq->eqn); + if (err) + mlx5_core_warn(dev, "failed to destroy a previously created eq: eqn %d\n", + eq->eqn); + synchronize_irq(table->msix_arr[eq->irqn].vector); + mlx5_buf_free(dev, &eq->buf); + + return err; +} +EXPORT_SYMBOL_GPL(mlx5_destroy_unmap_eq); + +int mlx5_eq_init(struct mlx5_core_dev *dev) +{ + int err; + + spin_lock_init(&dev->priv.eq_table.lock); + + err = mlx5_eq_debugfs_init(dev); + + return err; +} + + +void mlx5_eq_cleanup(struct mlx5_core_dev *dev) +{ + mlx5_eq_debugfs_cleanup(dev); +} + +int mlx5_start_eqs(struct mlx5_core_dev *dev) +{ + struct mlx5_eq_table *table = &dev->priv.eq_table; + int err; + + err = mlx5_create_map_eq(dev, &table->cmd_eq, MLX5_EQ_VEC_CMD, + MLX5_NUM_CMD_EQE, 1ull << MLX5_EVENT_TYPE_CMD, + "mlx5_cmd_eq", &dev->priv.uuari.uars[0]); + if (err) { + mlx5_core_warn(dev, "failed to create cmd EQ %d\n", err); + return err; + } + + mlx5_cmd_use_events(dev); + + err = mlx5_create_map_eq(dev, &table->async_eq, MLX5_EQ_VEC_ASYNC, + MLX5_NUM_ASYNC_EQE, MLX5_ASYNC_EVENT_MASK, + "mlx5_async_eq", &dev->priv.uuari.uars[0]); + if (err) { + mlx5_core_warn(dev, "failed to create async EQ %d\n", err); + goto err1; + } + + err = mlx5_create_map_eq(dev, &table->pages_eq, + MLX5_EQ_VEC_PAGES, + dev->caps.gen.max_vf + 1, + 1 << MLX5_EVENT_TYPE_PAGE_REQUEST, "mlx5_pages_eq", + &dev->priv.uuari.uars[0]); + if (err) { + mlx5_core_warn(dev, "failed to create pages EQ %d\n", err); + goto err2; + } + + return err; + +err2: + mlx5_destroy_unmap_eq(dev, &table->async_eq); + +err1: + mlx5_cmd_use_polling(dev); + mlx5_destroy_unmap_eq(dev, &table->cmd_eq); + return err; +} + +int mlx5_stop_eqs(struct mlx5_core_dev *dev) +{ + struct mlx5_eq_table *table = &dev->priv.eq_table; + int err; + + err = mlx5_destroy_unmap_eq(dev, &table->pages_eq); + if (err) + return err; + + mlx5_destroy_unmap_eq(dev, &table->async_eq); + mlx5_cmd_use_polling(dev); + + err = mlx5_destroy_unmap_eq(dev, &table->cmd_eq); + if (err) + mlx5_cmd_use_events(dev); + + return err; +} + +int mlx5_core_eq_query(struct mlx5_core_dev *dev, struct mlx5_eq *eq, + struct mlx5_query_eq_mbox_out *out, int outlen) +{ + struct mlx5_query_eq_mbox_in in; + int err; + + memset(&in, 0, sizeof(in)); + memset(out, 0, outlen); + in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_QUERY_EQ); + in.eqn = eq->eqn; + err = mlx5_cmd_exec(dev, &in, sizeof(in), out, outlen); + if (err) + return err; + + if (out->hdr.status) + err = mlx5_cmd_status_to_err(&out->hdr); + + return err; +} +EXPORT_SYMBOL_GPL(mlx5_core_eq_query); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fw.c b/drivers/net/ethernet/mellanox/mlx5/core/fw.c new file mode 100644 index 0000000..087c4c7 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/fw.c @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2013, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include "mlx5_core.h" + +int mlx5_cmd_query_adapter(struct mlx5_core_dev *dev) +{ + struct mlx5_cmd_query_adapter_mbox_out *out; + struct mlx5_cmd_query_adapter_mbox_in in; + int err; + + out = kzalloc(sizeof(*out), GFP_KERNEL); + if (!out) + return -ENOMEM; + + memset(&in, 0, sizeof(in)); + in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_QUERY_ADAPTER); + err = mlx5_cmd_exec(dev, &in, sizeof(in), out, sizeof(*out)); + if (err) + goto out_out; + + if (out->hdr.status) { + err = mlx5_cmd_status_to_err(&out->hdr); + goto out_out; + } + + memcpy(dev->board_id, out->vsd_psid, sizeof(out->vsd_psid)); + +out_out: + kfree(out); + + return err; +} + +int mlx5_cmd_query_hca_cap(struct mlx5_core_dev *dev, struct mlx5_caps *caps) +{ + return mlx5_core_get_caps(dev, caps, HCA_CAP_OPMOD_GET_CUR); +} + +int mlx5_cmd_init_hca(struct mlx5_core_dev *dev) +{ + struct mlx5_cmd_init_hca_mbox_in in; + struct mlx5_cmd_init_hca_mbox_out out; + int err; + + memset(&in, 0, sizeof(in)); + memset(&out, 0, sizeof(out)); + in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_INIT_HCA); + err = mlx5_cmd_exec(dev, &in, sizeof(in), &out, sizeof(out)); + if (err) + return err; + + if (out.hdr.status) + err = mlx5_cmd_status_to_err(&out.hdr); + + return err; +} + +int mlx5_cmd_teardown_hca(struct mlx5_core_dev *dev) +{ + struct mlx5_cmd_teardown_hca_mbox_in in; + struct mlx5_cmd_teardown_hca_mbox_out out; + int err; + + memset(&in, 0, sizeof(in)); + memset(&out, 0, sizeof(out)); + in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_TEARDOWN_HCA); + err = mlx5_cmd_exec(dev, &in, sizeof(in), &out, sizeof(out)); + if (err) + return err; + + if (out.hdr.status) + err = mlx5_cmd_status_to_err(&out.hdr); + + return err; +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c new file mode 100644 index 0000000..3e6670c --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c @@ -0,0 +1,200 @@ +/* + * Copyright (c) 2013, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include "mlx5_core.h" + +enum { + MLX5_HEALTH_POLL_INTERVAL = 2 * HZ, + MAX_MISSES = 3, +}; + +enum { + MLX5_HEALTH_SYNDR_FW_ERR = 0x1, + MLX5_HEALTH_SYNDR_IRISC_ERR = 0x7, + MLX5_HEALTH_SYNDR_CRC_ERR = 0x9, + MLX5_HEALTH_SYNDR_FETCH_PCI_ERR = 0xa, + MLX5_HEALTH_SYNDR_HW_FTL_ERR = 0xb, + MLX5_HEALTH_SYNDR_ASYNC_EQ_OVERRUN_ERR = 0xc, + MLX5_HEALTH_SYNDR_EQ_ERR = 0xd, + MLX5_HEALTH_SYNDR_FFSER_ERR = 0xf, +}; + +static DEFINE_SPINLOCK(health_lock); +static LIST_HEAD(health_list); +static struct work_struct health_work; + +static void health_care(struct work_struct *work) +{ + struct mlx5_core_health *health, *n; + struct mlx5_core_dev *dev; + struct mlx5_priv *priv; + LIST_HEAD(tlist); + + spin_lock_irq(&health_lock); + list_splice_init(&health_list, &tlist); + + spin_unlock_irq(&health_lock); + + list_for_each_entry_safe(health, n, &tlist, list) { + priv = container_of(health, struct mlx5_priv, health); + dev = container_of(priv, struct mlx5_core_dev, priv); + mlx5_core_warn(dev, "handling bad device here\n"); + /* nothing yet */ + spin_lock_irq(&health_lock); + list_del_init(&health->list); + spin_unlock_irq(&health_lock); + } +} + +static const char *hsynd_str(u8 synd) +{ + switch (synd) { + case MLX5_HEALTH_SYNDR_FW_ERR: + return "firmware internal error"; + case MLX5_HEALTH_SYNDR_IRISC_ERR: + return "irisc not responding"; + case MLX5_HEALTH_SYNDR_CRC_ERR: + return "firmware CRC error"; + case MLX5_HEALTH_SYNDR_FETCH_PCI_ERR: + return "ICM fetch PCI error"; + case MLX5_HEALTH_SYNDR_HW_FTL_ERR: + return "HW fatal error\n"; + case MLX5_HEALTH_SYNDR_ASYNC_EQ_OVERRUN_ERR: + return "async EQ buffer overrun"; + case MLX5_HEALTH_SYNDR_EQ_ERR: + return "EQ error"; + case MLX5_HEALTH_SYNDR_FFSER_ERR: + return "FFSER error"; + default: + return "unrecognized error"; + } +} + +static u16 read_be16(__be16 __iomem *p) +{ + return swab16(readl((__force u16 __iomem *) p)); +} + +static u32 read_be32(__be32 __iomem *p) +{ + return swab32(readl((__force u32 __iomem *) p)); +} + +static void print_health_info(struct mlx5_core_dev *dev) +{ + struct mlx5_core_health *health = &dev->priv.health; + struct health_buffer __iomem *h = health->health; + int i; + + for (i = 0; i < ARRAY_SIZE(h->assert_var); i++) + pr_info("assert_var[%d] 0x%08x\n", i, read_be32(h->assert_var + i)); + + pr_info("assert_exit_ptr 0x%08x\n", read_be32(&h->assert_exit_ptr)); + pr_info("assert_callra 0x%08x\n", read_be32(&h->assert_callra)); + pr_info("fw_ver 0x%08x\n", read_be32(&h->fw_ver)); + pr_info("hw_id 0x%08x\n", read_be32(&h->hw_id)); + pr_info("irisc_index %d\n", readb(&h->irisc_index)); + pr_info("synd 0x%x: %s\n", readb(&h->synd), hsynd_str(readb(&h->synd))); + pr_info("ext_sync 0x%04x\n", read_be16(&h->ext_sync)); +} + +static void poll_health(unsigned long data) +{ + struct mlx5_core_dev *dev = (struct mlx5_core_dev *)data; + struct mlx5_core_health *health = &dev->priv.health; + unsigned long next; + u32 count; + + count = ioread32be(health->health_counter); + if (count == health->prev) + ++health->miss_counter; + else + health->miss_counter = 0; + + health->prev = count; + if (health->miss_counter == MAX_MISSES) { + mlx5_core_err(dev, "device's health compromised\n"); + print_health_info(dev); + spin_lock_irq(&health_lock); + list_add_tail(&health->list, &health_list); + spin_unlock_irq(&health_lock); + + queue_work(mlx5_core_wq, &health_work); + } else { + get_random_bytes(&next, sizeof(next)); + next %= HZ; + next += jiffies + MLX5_HEALTH_POLL_INTERVAL; + mod_timer(&health->timer, next); + } +} + +void mlx5_start_health_poll(struct mlx5_core_dev *dev) +{ + struct mlx5_core_health *health = &dev->priv.health; + + INIT_LIST_HEAD(&health->list); + init_timer(&health->timer); + health->health = &dev->iseg->health; + health->health_counter = &dev->iseg->health_counter; + + health->timer.data = (unsigned long)dev; + health->timer.function = poll_health; + health->timer.expires = round_jiffies(jiffies + MLX5_HEALTH_POLL_INTERVAL); + add_timer(&health->timer); +} + +void mlx5_stop_health_poll(struct mlx5_core_dev *dev) +{ + struct mlx5_core_health *health = &dev->priv.health; + + del_timer_sync(&health->timer); + + spin_lock_irq(&health_lock); + if (!list_empty(&health->list)) + list_del_init(&health->list); + spin_unlock_irq(&health_lock); +} + +void mlx5_health_cleanup(void) +{ +} + +void __init mlx5_health_init(void) +{ + INIT_WORK(&health_work, health_care); +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mad.c b/drivers/net/ethernet/mellanox/mlx5/core/mad.c new file mode 100644 index 0000000..fd80ecf --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/mad.c @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2013, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include "mlx5_core.h" + +int mlx5_core_mad_ifc(struct mlx5_core_dev *dev, void *inb, void *outb, + u16 opmod, u8 port) +{ + struct mlx5_mad_ifc_mbox_in *in = NULL; + struct mlx5_mad_ifc_mbox_out *out = NULL; + int err; + + in = kzalloc(sizeof(*in), GFP_KERNEL); + if (!in) + return -ENOMEM; + + out = kzalloc(sizeof(*out), GFP_KERNEL); + if (!out) { + err = -ENOMEM; + goto out; + } + + in->hdr.opcode = cpu_to_be16(MLX5_CMD_OP_MAD_IFC); + in->hdr.opmod = cpu_to_be16(opmod); + in->port = port; + + memcpy(in->data, inb, sizeof(in->data)); + + err = mlx5_cmd_exec(dev, in, sizeof(*in), out, sizeof(*out)); + if (err) + goto out; + + if (out->hdr.status) { + err = mlx5_cmd_status_to_err(&out->hdr); + goto out; + } + + memcpy(outb, out->data, sizeof(out->data)); + +out: + kfree(out); + kfree(in); + return err; +} +EXPORT_SYMBOL_GPL(mlx5_core_mad_ifc); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c new file mode 100644 index 0000000..71b10b2 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -0,0 +1,948 @@ +/* + * Copyright (c) 2013, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "mlx5_core.h" + +#define DRIVER_NAME "mlx5_core" +#define DRIVER_VERSION "2.2-1" +#define DRIVER_RELDATE "Feb 2014" + +MODULE_AUTHOR("Eli Cohen "); +MODULE_DESCRIPTION("Mellanox ConnectX-IB HCA core library"); +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_VERSION(DRIVER_VERSION); + +int mlx5_core_debug_mask; +module_param_named(debug_mask, mlx5_core_debug_mask, int, 0644); +MODULE_PARM_DESC(debug_mask, "debug mask: 1 = dump cmd data, 2 = dump cmd exec time, 3 = both. Default=0"); + +#define MLX5_DEFAULT_PROF 2 +static int prof_sel = MLX5_DEFAULT_PROF; +module_param_named(prof_sel, prof_sel, int, 0444); +MODULE_PARM_DESC(prof_sel, "profile selector. Valid range 0 - 2"); + +struct workqueue_struct *mlx5_core_wq; +static LIST_HEAD(intf_list); +static LIST_HEAD(dev_list); +static DEFINE_MUTEX(intf_mutex); + +struct mlx5_device_context { + struct list_head list; + struct mlx5_interface *intf; + void *context; +}; + +static struct mlx5_profile profile[] = { + [0] = { + .mask = 0, + }, + [1] = { + .mask = MLX5_PROF_MASK_QP_SIZE, + .log_max_qp = 12, + }, + [2] = { + .mask = MLX5_PROF_MASK_QP_SIZE | + MLX5_PROF_MASK_MR_CACHE, + .log_max_qp = 17, + .mr_cache[0] = { + .size = 500, + .limit = 250 + }, + .mr_cache[1] = { + .size = 500, + .limit = 250 + }, + .mr_cache[2] = { + .size = 500, + .limit = 250 + }, + .mr_cache[3] = { + .size = 500, + .limit = 250 + }, + .mr_cache[4] = { + .size = 500, + .limit = 250 + }, + .mr_cache[5] = { + .size = 500, + .limit = 250 + }, + .mr_cache[6] = { + .size = 500, + .limit = 250 + }, + .mr_cache[7] = { + .size = 500, + .limit = 250 + }, + .mr_cache[8] = { + .size = 500, + .limit = 250 + }, + .mr_cache[9] = { + .size = 500, + .limit = 250 + }, + .mr_cache[10] = { + .size = 500, + .limit = 250 + }, + .mr_cache[11] = { + .size = 500, + .limit = 250 + }, + .mr_cache[12] = { + .size = 64, + .limit = 32 + }, + .mr_cache[13] = { + .size = 32, + .limit = 16 + }, + .mr_cache[14] = { + .size = 16, + .limit = 8 + }, + .mr_cache[15] = { + .size = 8, + .limit = 4 + }, + }, +}; + +static int set_dma_caps(struct pci_dev *pdev) +{ + int err; + + err = pci_set_dma_mask(pdev, DMA_BIT_MASK(64)); + if (err) { + dev_warn(&pdev->dev, "Warning: couldn't set 64-bit PCI DMA mask\n"); + err = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); + if (err) { + dev_err(&pdev->dev, "Can't set PCI DMA mask, aborting\n"); + return err; + } + } + + err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64)); + if (err) { + dev_warn(&pdev->dev, + "Warning: couldn't set 64-bit consistent PCI DMA mask\n"); + err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32)); + if (err) { + dev_err(&pdev->dev, + "Can't set consistent PCI DMA mask, aborting\n"); + return err; + } + } + + dma_set_max_seg_size(&pdev->dev, 2u * 1024 * 1024 * 1024); + return err; +} + +static int request_bar(struct pci_dev *pdev) +{ + int err = 0; + + if (!(pci_resource_flags(pdev, 0) & IORESOURCE_MEM)) { + dev_err(&pdev->dev, "Missing registers BAR, aborting\n"); + return -ENODEV; + } + + err = pci_request_regions(pdev, DRIVER_NAME); + if (err) + dev_err(&pdev->dev, "Couldn't get PCI resources, aborting\n"); + + return err; +} + +static void release_bar(struct pci_dev *pdev) +{ + pci_release_regions(pdev); +} + +static int mlx5_enable_msix(struct mlx5_core_dev *dev) +{ + struct mlx5_eq_table *table = &dev->priv.eq_table; + int num_eqs = 1 << dev->caps.gen.log_max_eq; + int nvec; + int i; + + nvec = dev->caps.gen.num_ports * num_online_cpus() + MLX5_EQ_VEC_COMP_BASE; + nvec = min_t(int, nvec, num_eqs); + if (nvec <= MLX5_EQ_VEC_COMP_BASE) + return -ENOMEM; + + table->msix_arr = kzalloc(nvec * sizeof(*table->msix_arr), GFP_KERNEL); + if (!table->msix_arr) + return -ENOMEM; + + for (i = 0; i < nvec; i++) + table->msix_arr[i].entry = i; + + nvec = pci_enable_msix_range(dev->pdev, table->msix_arr, + MLX5_EQ_VEC_COMP_BASE, nvec); + if (nvec < 0) + return nvec; + + table->num_comp_vectors = nvec - MLX5_EQ_VEC_COMP_BASE; + + return 0; +} + +static void mlx5_disable_msix(struct mlx5_core_dev *dev) +{ + struct mlx5_eq_table *table = &dev->priv.eq_table; + + pci_disable_msix(dev->pdev); + kfree(table->msix_arr); +} + +struct mlx5_reg_host_endianess { + u8 he; + u8 rsvd[15]; +}; + + +#define CAP_MASK(pos, size) ((u64)((1 << (size)) - 1) << (pos)) + +enum { + MLX5_CAP_BITS_RW_MASK = CAP_MASK(MLX5_CAP_OFF_CMDIF_CSUM, 2) | + MLX5_DEV_CAP_FLAG_DCT, +}; + +static u16 to_fw_pkey_sz(u32 size) +{ + switch (size) { + case 128: + return 0; + case 256: + return 1; + case 512: + return 2; + case 1024: + return 3; + case 2048: + return 4; + case 4096: + return 5; + default: + pr_warn("invalid pkey table size %d\n", size); + return 0; + } +} + +/* selectively copy writable fields clearing any reserved area + */ +static void copy_rw_fields(void *to, struct mlx5_caps *from) +{ + __be64 *flags_off = (__be64 *)MLX5_ADDR_OF(cmd_hca_cap, to, reserved_22); + u64 v64; + + MLX5_SET(cmd_hca_cap, to, log_max_qp, from->gen.log_max_qp); + MLX5_SET(cmd_hca_cap, to, log_max_ra_req_qp, from->gen.log_max_ra_req_qp); + MLX5_SET(cmd_hca_cap, to, log_max_ra_res_qp, from->gen.log_max_ra_res_qp); + MLX5_SET(cmd_hca_cap, to, pkey_table_size, from->gen.pkey_table_size); + MLX5_SET(cmd_hca_cap, to, log_max_ra_req_dc, from->gen.log_max_ra_req_dc); + MLX5_SET(cmd_hca_cap, to, log_max_ra_res_dc, from->gen.log_max_ra_res_dc); + MLX5_SET(cmd_hca_cap, to, pkey_table_size, to_fw_pkey_sz(from->gen.pkey_table_size)); + v64 = from->gen.flags & MLX5_CAP_BITS_RW_MASK; + *flags_off = cpu_to_be64(v64); +} + +static u16 get_pkey_table_size(int pkey) +{ + if (pkey > MLX5_MAX_LOG_PKEY_TABLE) + return 0; + + return MLX5_MIN_PKEY_TABLE_SIZE << pkey; +} + +static void fw2drv_caps(struct mlx5_caps *caps, void *out) +{ + struct mlx5_general_caps *gen = &caps->gen; + + gen->max_srq_wqes = 1 << MLX5_GET_PR(cmd_hca_cap, out, log_max_srq_sz); + gen->max_wqes = 1 << MLX5_GET_PR(cmd_hca_cap, out, log_max_qp_sz); + gen->log_max_qp = MLX5_GET_PR(cmd_hca_cap, out, log_max_qp); + gen->log_max_strq = MLX5_GET_PR(cmd_hca_cap, out, log_max_strq_sz); + gen->log_max_srq = MLX5_GET_PR(cmd_hca_cap, out, log_max_srqs); + gen->max_cqes = 1 << MLX5_GET_PR(cmd_hca_cap, out, log_max_cq_sz); + gen->log_max_cq = MLX5_GET_PR(cmd_hca_cap, out, log_max_cq); + gen->max_eqes = 1 << MLX5_GET_PR(cmd_hca_cap, out, log_max_eq_sz); + gen->log_max_mkey = MLX5_GET_PR(cmd_hca_cap, out, log_max_mkey); + gen->log_max_eq = MLX5_GET_PR(cmd_hca_cap, out, log_max_eq); + gen->max_indirection = MLX5_GET_PR(cmd_hca_cap, out, max_indirection); + gen->log_max_mrw_sz = MLX5_GET_PR(cmd_hca_cap, out, log_max_mrw_sz); + gen->log_max_bsf_list_size = MLX5_GET_PR(cmd_hca_cap, out, log_max_bsf_list_size); + gen->log_max_klm_list_size = MLX5_GET_PR(cmd_hca_cap, out, log_max_klm_list_size); + gen->log_max_ra_req_dc = MLX5_GET_PR(cmd_hca_cap, out, log_max_ra_req_dc); + gen->log_max_ra_res_dc = MLX5_GET_PR(cmd_hca_cap, out, log_max_ra_res_dc); + gen->log_max_ra_req_qp = MLX5_GET_PR(cmd_hca_cap, out, log_max_ra_req_qp); + gen->log_max_ra_res_qp = MLX5_GET_PR(cmd_hca_cap, out, log_max_ra_res_qp); + gen->max_qp_counters = MLX5_GET_PR(cmd_hca_cap, out, max_qp_cnt); + gen->pkey_table_size = get_pkey_table_size(MLX5_GET_PR(cmd_hca_cap, out, pkey_table_size)); + gen->local_ca_ack_delay = MLX5_GET_PR(cmd_hca_cap, out, local_ca_ack_delay); + gen->num_ports = MLX5_GET_PR(cmd_hca_cap, out, num_ports); + gen->log_max_msg = MLX5_GET_PR(cmd_hca_cap, out, log_max_msg); + gen->stat_rate_support = MLX5_GET_PR(cmd_hca_cap, out, stat_rate_support); + gen->flags = be64_to_cpu(*(__be64 *)MLX5_ADDR_OF(cmd_hca_cap, out, reserved_22)); + pr_debug("flags = 0x%llx\n", gen->flags); + gen->uar_sz = MLX5_GET_PR(cmd_hca_cap, out, uar_sz); + gen->min_log_pg_sz = MLX5_GET_PR(cmd_hca_cap, out, log_pg_sz); + gen->bf_reg_size = MLX5_GET_PR(cmd_hca_cap, out, bf); + gen->bf_reg_size = 1 << MLX5_GET_PR(cmd_hca_cap, out, log_bf_reg_size); + gen->max_sq_desc_sz = MLX5_GET_PR(cmd_hca_cap, out, max_wqe_sz_sq); + gen->max_rq_desc_sz = MLX5_GET_PR(cmd_hca_cap, out, max_wqe_sz_rq); + gen->max_dc_sq_desc_sz = MLX5_GET_PR(cmd_hca_cap, out, max_wqe_sz_sq_dc); + gen->max_qp_mcg = MLX5_GET_PR(cmd_hca_cap, out, max_qp_mcg); + gen->log_max_pd = MLX5_GET_PR(cmd_hca_cap, out, log_max_pd); + gen->log_max_xrcd = MLX5_GET_PR(cmd_hca_cap, out, log_max_xrcd); + gen->log_uar_page_sz = MLX5_GET_PR(cmd_hca_cap, out, log_uar_page_sz); +} + +static const char *caps_opmod_str(u16 opmod) +{ + switch (opmod) { + case HCA_CAP_OPMOD_GET_MAX: + return "GET_MAX"; + case HCA_CAP_OPMOD_GET_CUR: + return "GET_CUR"; + default: + return "Invalid"; + } +} + +int mlx5_core_get_caps(struct mlx5_core_dev *dev, struct mlx5_caps *caps, + u16 opmod) +{ + u8 in[MLX5_ST_SZ_BYTES(query_hca_cap_in)]; + int out_sz = MLX5_ST_SZ_BYTES(query_hca_cap_out); + void *out; + int err; + + memset(in, 0, sizeof(in)); + out = kzalloc(out_sz, GFP_KERNEL); + if (!out) + return -ENOMEM; + MLX5_SET(query_hca_cap_in, in, opcode, MLX5_CMD_OP_QUERY_HCA_CAP); + MLX5_SET(query_hca_cap_in, in, op_mod, opmod); + err = mlx5_cmd_exec(dev, in, sizeof(in), out, out_sz); + if (err) + goto query_ex; + + err = mlx5_cmd_status_to_err_v2(out); + if (err) { + mlx5_core_warn(dev, "query max hca cap failed, %d\n", err); + goto query_ex; + } + mlx5_core_dbg(dev, "%s\n", caps_opmod_str(opmod)); + fw2drv_caps(caps, MLX5_ADDR_OF(query_hca_cap_out, out, capability_struct)); + +query_ex: + kfree(out); + return err; +} + +static int set_caps(struct mlx5_core_dev *dev, void *in, int in_sz) +{ + u32 out[MLX5_ST_SZ_DW(set_hca_cap_out)]; + int err; + + memset(out, 0, sizeof(out)); + + MLX5_SET(set_hca_cap_in, in, opcode, MLX5_CMD_OP_SET_HCA_CAP); + err = mlx5_cmd_exec(dev, in, in_sz, out, sizeof(out)); + if (err) + return err; + + err = mlx5_cmd_status_to_err_v2(out); + + return err; +} + +static int handle_hca_cap(struct mlx5_core_dev *dev) +{ + void *set_ctx = NULL; + struct mlx5_profile *prof = dev->profile; + struct mlx5_caps *cur_caps = NULL; + struct mlx5_caps *max_caps = NULL; + int err = -ENOMEM; + int set_sz = MLX5_ST_SZ_BYTES(set_hca_cap_in); + + set_ctx = kzalloc(set_sz, GFP_KERNEL); + if (!set_ctx) + goto query_ex; + + max_caps = kzalloc(sizeof(*max_caps), GFP_KERNEL); + if (!max_caps) + goto query_ex; + + cur_caps = kzalloc(sizeof(*cur_caps), GFP_KERNEL); + if (!cur_caps) + goto query_ex; + + err = mlx5_core_get_caps(dev, max_caps, HCA_CAP_OPMOD_GET_MAX); + if (err) + goto query_ex; + + err = mlx5_core_get_caps(dev, cur_caps, HCA_CAP_OPMOD_GET_CUR); + if (err) + goto query_ex; + + /* we limit the size of the pkey table to 128 entries for now */ + cur_caps->gen.pkey_table_size = 128; + + if (prof->mask & MLX5_PROF_MASK_QP_SIZE) + cur_caps->gen.log_max_qp = prof->log_max_qp; + + /* disable checksum */ + cur_caps->gen.flags &= ~MLX5_DEV_CAP_FLAG_CMDIF_CSUM; + + copy_rw_fields(MLX5_ADDR_OF(set_hca_cap_in, set_ctx, hca_capability_struct), + cur_caps); + err = set_caps(dev, set_ctx, set_sz); + +query_ex: + kfree(cur_caps); + kfree(max_caps); + kfree(set_ctx); + + return err; +} + +static int set_hca_ctrl(struct mlx5_core_dev *dev) +{ + struct mlx5_reg_host_endianess he_in; + struct mlx5_reg_host_endianess he_out; + int err; + + memset(&he_in, 0, sizeof(he_in)); + he_in.he = MLX5_SET_HOST_ENDIANNESS; + err = mlx5_core_access_reg(dev, &he_in, sizeof(he_in), + &he_out, sizeof(he_out), + MLX5_REG_HOST_ENDIANNESS, 0, 1); + return err; +} + +static int mlx5_core_enable_hca(struct mlx5_core_dev *dev) +{ + int err; + struct mlx5_enable_hca_mbox_in in; + struct mlx5_enable_hca_mbox_out out; + + memset(&in, 0, sizeof(in)); + memset(&out, 0, sizeof(out)); + in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_ENABLE_HCA); + err = mlx5_cmd_exec(dev, &in, sizeof(in), &out, sizeof(out)); + if (err) + return err; + + if (out.hdr.status) + return mlx5_cmd_status_to_err(&out.hdr); + + return 0; +} + +static int mlx5_core_disable_hca(struct mlx5_core_dev *dev) +{ + int err; + struct mlx5_disable_hca_mbox_in in; + struct mlx5_disable_hca_mbox_out out; + + memset(&in, 0, sizeof(in)); + memset(&out, 0, sizeof(out)); + in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_DISABLE_HCA); + err = mlx5_cmd_exec(dev, &in, sizeof(in), &out, sizeof(out)); + if (err) + return err; + + if (out.hdr.status) + return mlx5_cmd_status_to_err(&out.hdr); + + return 0; +} + +static int mlx5_dev_init(struct mlx5_core_dev *dev, struct pci_dev *pdev) +{ + struct mlx5_priv *priv = &dev->priv; + int err; + + dev->pdev = pdev; + pci_set_drvdata(dev->pdev, dev); + strncpy(priv->name, dev_name(&pdev->dev), MLX5_MAX_NAME_LEN); + priv->name[MLX5_MAX_NAME_LEN - 1] = 0; + + mutex_init(&priv->pgdir_mutex); + INIT_LIST_HEAD(&priv->pgdir_list); + spin_lock_init(&priv->mkey_lock); + + priv->dbg_root = debugfs_create_dir(dev_name(&pdev->dev), mlx5_debugfs_root); + if (!priv->dbg_root) + return -ENOMEM; + + err = pci_enable_device(pdev); + if (err) { + dev_err(&pdev->dev, "Cannot enable PCI device, aborting\n"); + goto err_dbg; + } + + err = request_bar(pdev); + if (err) { + dev_err(&pdev->dev, "error requesting BARs, aborting\n"); + goto err_disable; + } + + pci_set_master(pdev); + + err = set_dma_caps(pdev); + if (err) { + dev_err(&pdev->dev, "Failed setting DMA capabilities mask, aborting\n"); + goto err_clr_master; + } + + dev->iseg_base = pci_resource_start(dev->pdev, 0); + dev->iseg = ioremap(dev->iseg_base, sizeof(*dev->iseg)); + if (!dev->iseg) { + err = -ENOMEM; + dev_err(&pdev->dev, "Failed mapping initialization segment, aborting\n"); + goto err_clr_master; + } + dev_info(&pdev->dev, "firmware version: %d.%d.%d\n", fw_rev_maj(dev), + fw_rev_min(dev), fw_rev_sub(dev)); + + err = mlx5_cmd_init(dev); + if (err) { + dev_err(&pdev->dev, "Failed initializing command interface, aborting\n"); + goto err_unmap; + } + + mlx5_pagealloc_init(dev); + + err = mlx5_core_enable_hca(dev); + if (err) { + dev_err(&pdev->dev, "enable hca failed\n"); + goto err_pagealloc_cleanup; + } + + err = mlx5_satisfy_startup_pages(dev, 1); + if (err) { + dev_err(&pdev->dev, "failed to allocate boot pages\n"); + goto err_disable_hca; + } + + err = set_hca_ctrl(dev); + if (err) { + dev_err(&pdev->dev, "set_hca_ctrl failed\n"); + goto reclaim_boot_pages; + } + + err = handle_hca_cap(dev); + if (err) { + dev_err(&pdev->dev, "handle_hca_cap failed\n"); + goto reclaim_boot_pages; + } + + err = mlx5_satisfy_startup_pages(dev, 0); + if (err) { + dev_err(&pdev->dev, "failed to allocate init pages\n"); + goto reclaim_boot_pages; + } + + err = mlx5_pagealloc_start(dev); + if (err) { + dev_err(&pdev->dev, "mlx5_pagealloc_start failed\n"); + goto reclaim_boot_pages; + } + + err = mlx5_cmd_init_hca(dev); + if (err) { + dev_err(&pdev->dev, "init hca failed\n"); + goto err_pagealloc_stop; + } + + mlx5_start_health_poll(dev); + + err = mlx5_cmd_query_hca_cap(dev, &dev->caps); + if (err) { + dev_err(&pdev->dev, "query hca failed\n"); + goto err_stop_poll; + } + + err = mlx5_cmd_query_adapter(dev); + if (err) { + dev_err(&pdev->dev, "query adapter failed\n"); + goto err_stop_poll; + } + + err = mlx5_enable_msix(dev); + if (err) { + dev_err(&pdev->dev, "enable msix failed\n"); + goto err_stop_poll; + } + + err = mlx5_eq_init(dev); + if (err) { + dev_err(&pdev->dev, "failed to initialize eq\n"); + goto disable_msix; + } + + err = mlx5_alloc_uuars(dev, &priv->uuari); + if (err) { + dev_err(&pdev->dev, "Failed allocating uar, aborting\n"); + goto err_eq_cleanup; + } + + err = mlx5_start_eqs(dev); + if (err) { + dev_err(&pdev->dev, "Failed to start pages and async EQs\n"); + goto err_free_uar; + } + + MLX5_INIT_DOORBELL_LOCK(&priv->cq_uar_lock); + + mlx5_init_cq_table(dev); + mlx5_init_qp_table(dev); + mlx5_init_srq_table(dev); + mlx5_init_mr_table(dev); + + return 0; + +err_free_uar: + mlx5_free_uuars(dev, &priv->uuari); + +err_eq_cleanup: + mlx5_eq_cleanup(dev); + +disable_msix: + mlx5_disable_msix(dev); + +err_stop_poll: + mlx5_stop_health_poll(dev); + if (mlx5_cmd_teardown_hca(dev)) { + dev_err(&dev->pdev->dev, "tear_down_hca failed, skip cleanup\n"); + return err; + } + +err_pagealloc_stop: + mlx5_pagealloc_stop(dev); + +reclaim_boot_pages: + mlx5_reclaim_startup_pages(dev); + +err_disable_hca: + mlx5_core_disable_hca(dev); + +err_pagealloc_cleanup: + mlx5_pagealloc_cleanup(dev); + mlx5_cmd_cleanup(dev); + +err_unmap: + iounmap(dev->iseg); + +err_clr_master: + pci_clear_master(dev->pdev); + release_bar(dev->pdev); + +err_disable: + pci_disable_device(dev->pdev); + +err_dbg: + debugfs_remove(priv->dbg_root); + return err; +} +EXPORT_SYMBOL(mlx5_dev_init); + +static void mlx5_dev_cleanup(struct mlx5_core_dev *dev) +{ + struct mlx5_priv *priv = &dev->priv; + + mlx5_cleanup_srq_table(dev); + mlx5_cleanup_qp_table(dev); + mlx5_cleanup_cq_table(dev); + mlx5_stop_eqs(dev); + mlx5_free_uuars(dev, &priv->uuari); + mlx5_eq_cleanup(dev); + mlx5_disable_msix(dev); + mlx5_stop_health_poll(dev); + if (mlx5_cmd_teardown_hca(dev)) { + dev_err(&dev->pdev->dev, "tear_down_hca failed, skip cleanup\n"); + return; + } + mlx5_pagealloc_stop(dev); + mlx5_reclaim_startup_pages(dev); + mlx5_core_disable_hca(dev); + mlx5_pagealloc_cleanup(dev); + mlx5_cmd_cleanup(dev); + iounmap(dev->iseg); + pci_clear_master(dev->pdev); + release_bar(dev->pdev); + pci_disable_device(dev->pdev); + debugfs_remove(priv->dbg_root); +} + +static void mlx5_add_device(struct mlx5_interface *intf, struct mlx5_priv *priv) +{ + struct mlx5_device_context *dev_ctx; + struct mlx5_core_dev *dev = container_of(priv, struct mlx5_core_dev, priv); + + dev_ctx = kmalloc(sizeof(*dev_ctx), GFP_KERNEL); + if (!dev_ctx) { + pr_warn("mlx5_add_device: alloc context failed\n"); + return; + } + + dev_ctx->intf = intf; + dev_ctx->context = intf->add(dev); + + if (dev_ctx->context) { + spin_lock_irq(&priv->ctx_lock); + list_add_tail(&dev_ctx->list, &priv->ctx_list); + spin_unlock_irq(&priv->ctx_lock); + } else { + kfree(dev_ctx); + } +} + +static void mlx5_remove_device(struct mlx5_interface *intf, struct mlx5_priv *priv) +{ + struct mlx5_device_context *dev_ctx; + struct mlx5_core_dev *dev = container_of(priv, struct mlx5_core_dev, priv); + + list_for_each_entry(dev_ctx, &priv->ctx_list, list) + if (dev_ctx->intf == intf) { + spin_lock_irq(&priv->ctx_lock); + list_del(&dev_ctx->list); + spin_unlock_irq(&priv->ctx_lock); + + intf->remove(dev, dev_ctx->context); + kfree(dev_ctx); + return; + } +} +static int mlx5_register_device(struct mlx5_core_dev *dev) +{ + struct mlx5_priv *priv = &dev->priv; + struct mlx5_interface *intf; + + mutex_lock(&intf_mutex); + list_add_tail(&priv->dev_list, &dev_list); + list_for_each_entry(intf, &intf_list, list) + mlx5_add_device(intf, priv); + mutex_unlock(&intf_mutex); + + return 0; +} +static void mlx5_unregister_device(struct mlx5_core_dev *dev) +{ + struct mlx5_priv *priv = &dev->priv; + struct mlx5_interface *intf; + + mutex_lock(&intf_mutex); + list_for_each_entry(intf, &intf_list, list) + mlx5_remove_device(intf, priv); + list_del(&priv->dev_list); + mutex_unlock(&intf_mutex); +} + +int mlx5_register_interface(struct mlx5_interface *intf) +{ + struct mlx5_priv *priv; + + if (!intf->add || !intf->remove) + return -EINVAL; + + mutex_lock(&intf_mutex); + list_add_tail(&intf->list, &intf_list); + list_for_each_entry(priv, &dev_list, dev_list) + mlx5_add_device(intf, priv); + mutex_unlock(&intf_mutex); + + return 0; +} +EXPORT_SYMBOL(mlx5_register_interface); + +void mlx5_unregister_interface(struct mlx5_interface *intf) +{ + struct mlx5_priv *priv; + + mutex_lock(&intf_mutex); + list_for_each_entry(priv, &dev_list, dev_list) + mlx5_remove_device(intf, priv); + list_del(&intf->list); + mutex_unlock(&intf_mutex); +} +EXPORT_SYMBOL(mlx5_unregister_interface); + +static void mlx5_core_event(struct mlx5_core_dev *dev, enum mlx5_dev_event event, + unsigned long param) +{ + struct mlx5_priv *priv = &dev->priv; + struct mlx5_device_context *dev_ctx; + unsigned long flags; + + spin_lock_irqsave(&priv->ctx_lock, flags); + + list_for_each_entry(dev_ctx, &priv->ctx_list, list) + if (dev_ctx->intf->event) + dev_ctx->intf->event(dev, dev_ctx->context, event, param); + + spin_unlock_irqrestore(&priv->ctx_lock, flags); +} + +struct mlx5_core_event_handler { + void (*event)(struct mlx5_core_dev *dev, + enum mlx5_dev_event event, + void *data); +}; + +static int init_one(struct pci_dev *pdev, + const struct pci_device_id *id) +{ + struct mlx5_core_dev *dev; + struct mlx5_priv *priv; + int err; + + dev = kzalloc(sizeof(*dev), GFP_KERNEL); + if (!dev) { + dev_err(&pdev->dev, "kzalloc failed\n"); + return -ENOMEM; + } + priv = &dev->priv; + + pci_set_drvdata(pdev, dev); + + if (prof_sel < 0 || prof_sel >= ARRAY_SIZE(profile)) { + pr_warn("selected profile out of range, selecting default (%d)\n", + MLX5_DEFAULT_PROF); + prof_sel = MLX5_DEFAULT_PROF; + } + dev->profile = &profile[prof_sel]; + dev->event = mlx5_core_event; + + INIT_LIST_HEAD(&priv->ctx_list); + spin_lock_init(&priv->ctx_lock); + err = mlx5_dev_init(dev, pdev); + if (err) { + dev_err(&pdev->dev, "mlx5_dev_init failed %d\n", err); + goto out; + } + + err = mlx5_register_device(dev); + if (err) { + dev_err(&pdev->dev, "mlx5_register_device failed %d\n", err); + goto out_init; + } + + return 0; + +out_init: + mlx5_dev_cleanup(dev); +out: + kfree(dev); + return err; +} +static void remove_one(struct pci_dev *pdev) +{ + struct mlx5_core_dev *dev = pci_get_drvdata(pdev); + + mlx5_unregister_device(dev); + mlx5_dev_cleanup(dev); + kfree(dev); +} + +static const struct pci_device_id mlx5_core_pci_table[] = { + { PCI_VDEVICE(MELLANOX, 4113) }, /* MT4113 Connect-IB */ + { PCI_VDEVICE(MELLANOX, 4115) }, /* ConnectX-4 */ + { 0, } +}; + +MODULE_DEVICE_TABLE(pci, mlx5_core_pci_table); + +static struct pci_driver mlx5_core_driver = { + .name = DRIVER_NAME, + .id_table = mlx5_core_pci_table, + .probe = init_one, + .remove = remove_one +}; + +static int __init init(void) +{ + int err; + + mlx5_register_debugfs(); + mlx5_core_wq = create_singlethread_workqueue("mlx5_core_wq"); + if (!mlx5_core_wq) { + err = -ENOMEM; + goto err_debug; + } + mlx5_health_init(); + + err = pci_register_driver(&mlx5_core_driver); + if (err) + goto err_health; + + return 0; + +err_health: + mlx5_health_cleanup(); + destroy_workqueue(mlx5_core_wq); +err_debug: + mlx5_unregister_debugfs(); + return err; +} + +static void __exit cleanup(void) +{ + pci_unregister_driver(&mlx5_core_driver); + mlx5_health_cleanup(); + destroy_workqueue(mlx5_core_wq); + mlx5_unregister_debugfs(); +} + +module_init(init); +module_exit(cleanup); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mcg.c b/drivers/net/ethernet/mellanox/mlx5/core/mcg.c new file mode 100644 index 0000000..4483764 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/mcg.c @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2013, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include "mlx5_core.h" + +struct mlx5_attach_mcg_mbox_in { + struct mlx5_inbox_hdr hdr; + __be32 qpn; + __be32 rsvd; + u8 gid[16]; +}; + +struct mlx5_attach_mcg_mbox_out { + struct mlx5_outbox_hdr hdr; + u8 rsvf[8]; +}; + +struct mlx5_detach_mcg_mbox_in { + struct mlx5_inbox_hdr hdr; + __be32 qpn; + __be32 rsvd; + u8 gid[16]; +}; + +struct mlx5_detach_mcg_mbox_out { + struct mlx5_outbox_hdr hdr; + u8 rsvf[8]; +}; + +int mlx5_core_attach_mcg(struct mlx5_core_dev *dev, union ib_gid *mgid, u32 qpn) +{ + struct mlx5_attach_mcg_mbox_in in; + struct mlx5_attach_mcg_mbox_out out; + int err; + + memset(&in, 0, sizeof(in)); + memset(&out, 0, sizeof(out)); + in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_ATTACH_TO_MCG); + memcpy(in.gid, mgid, sizeof(*mgid)); + in.qpn = cpu_to_be32(qpn); + err = mlx5_cmd_exec(dev, &in, sizeof(in), &out, sizeof(out)); + if (err) + return err; + + if (out.hdr.status) + err = mlx5_cmd_status_to_err(&out.hdr); + + return err; +} +EXPORT_SYMBOL(mlx5_core_attach_mcg); + +int mlx5_core_detach_mcg(struct mlx5_core_dev *dev, union ib_gid *mgid, u32 qpn) +{ + struct mlx5_detach_mcg_mbox_in in; + struct mlx5_detach_mcg_mbox_out out; + int err; + + memset(&in, 0, sizeof(in)); + memset(&out, 0, sizeof(out)); + in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_DETACH_FROM_MCG); + memcpy(in.gid, mgid, sizeof(*mgid)); + in.qpn = cpu_to_be32(qpn); + err = mlx5_cmd_exec(dev, &in, sizeof(in), &out, sizeof(out)); + if (err) + return err; + + if (out.hdr.status) + err = mlx5_cmd_status_to_err(&out.hdr); + + return err; +} +EXPORT_SYMBOL(mlx5_core_detach_mcg); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h new file mode 100644 index 0000000..f0c9f9a --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2013, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __MLX5_CORE_H__ +#define __MLX5_CORE_H__ + +#include +#include +#include + +extern int mlx5_core_debug_mask; + +#define mlx5_core_dbg(dev, format, ...) \ + pr_debug("%s:%s:%d:(pid %d): " format, \ + (dev)->priv.name, __func__, __LINE__, current->pid, \ + ##__VA_ARGS__) + +#define mlx5_core_dbg_mask(dev, mask, format, ...) \ +do { \ + if ((mask) & mlx5_core_debug_mask) \ + mlx5_core_dbg(dev, format, ##__VA_ARGS__); \ +} while (0) + +#define mlx5_core_err(dev, format, ...) \ + pr_err("%s:%s:%d:(pid %d): " format, \ + (dev)->priv.name, __func__, __LINE__, current->pid, \ + ##__VA_ARGS__) + +#define mlx5_core_warn(dev, format, ...) \ + pr_warn("%s:%s:%d:(pid %d): " format, \ + (dev)->priv.name, __func__, __LINE__, current->pid, \ + ##__VA_ARGS__) + +enum { + MLX5_CMD_DATA, /* print command payload only */ + MLX5_CMD_TIME, /* print command execution time */ +}; + + +int mlx5_cmd_query_hca_cap(struct mlx5_core_dev *dev, + struct mlx5_caps *caps); +int mlx5_cmd_query_adapter(struct mlx5_core_dev *dev); +int mlx5_cmd_init_hca(struct mlx5_core_dev *dev); +int mlx5_cmd_teardown_hca(struct mlx5_core_dev *dev); + +#endif /* __MLX5_CORE_H__ */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mr.c b/drivers/net/ethernet/mellanox/mlx5/core/mr.c new file mode 100644 index 0000000..184c361 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/mr.c @@ -0,0 +1,248 @@ +/* + * Copyright (c) 2013, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include "mlx5_core.h" + +void mlx5_init_mr_table(struct mlx5_core_dev *dev) +{ + struct mlx5_mr_table *table = &dev->priv.mr_table; + + rwlock_init(&table->lock); + INIT_RADIX_TREE(&table->tree, GFP_ATOMIC); +} + +void mlx5_cleanup_mr_table(struct mlx5_core_dev *dev) +{ +} + +int mlx5_core_create_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mr *mr, + struct mlx5_create_mkey_mbox_in *in, int inlen, + mlx5_cmd_cbk_t callback, void *context, + struct mlx5_create_mkey_mbox_out *out) +{ + struct mlx5_mr_table *table = &dev->priv.mr_table; + struct mlx5_create_mkey_mbox_out lout; + int err; + u8 key; + + memset(&lout, 0, sizeof(lout)); + spin_lock_irq(&dev->priv.mkey_lock); + key = dev->priv.mkey_key++; + spin_unlock_irq(&dev->priv.mkey_lock); + in->seg.qpn_mkey7_0 |= cpu_to_be32(key); + in->hdr.opcode = cpu_to_be16(MLX5_CMD_OP_CREATE_MKEY); + if (callback) { + err = mlx5_cmd_exec_cb(dev, in, inlen, out, sizeof(*out), + callback, context); + return err; + } else { + err = mlx5_cmd_exec(dev, in, inlen, &lout, sizeof(lout)); + } + + if (err) { + mlx5_core_dbg(dev, "cmd exec failed %d\n", err); + return err; + } + + if (lout.hdr.status) { + mlx5_core_dbg(dev, "status %d\n", lout.hdr.status); + return mlx5_cmd_status_to_err(&lout.hdr); + } + + mr->iova = be64_to_cpu(in->seg.start_addr); + mr->size = be64_to_cpu(in->seg.len); + mr->key = mlx5_idx_to_mkey(be32_to_cpu(lout.mkey) & 0xffffff) | key; + mr->pd = be32_to_cpu(in->seg.flags_pd) & 0xffffff; + + mlx5_core_dbg(dev, "out 0x%x, key 0x%x, mkey 0x%x\n", + be32_to_cpu(lout.mkey), key, mr->key); + + /* connect to MR tree */ + write_lock_irq(&table->lock); + err = radix_tree_insert(&table->tree, mlx5_base_mkey(mr->key), mr); + write_unlock_irq(&table->lock); + if (err) { + mlx5_core_warn(dev, "failed radix tree insert of mr 0x%x, %d\n", + mlx5_base_mkey(mr->key), err); + mlx5_core_destroy_mkey(dev, mr); + } + + return err; +} +EXPORT_SYMBOL(mlx5_core_create_mkey); + +int mlx5_core_destroy_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mr *mr) +{ + struct mlx5_mr_table *table = &dev->priv.mr_table; + struct mlx5_destroy_mkey_mbox_in in; + struct mlx5_destroy_mkey_mbox_out out; + struct mlx5_core_mr *deleted_mr; + unsigned long flags; + int err; + + memset(&in, 0, sizeof(in)); + memset(&out, 0, sizeof(out)); + + write_lock_irqsave(&table->lock, flags); + deleted_mr = radix_tree_delete(&table->tree, mlx5_base_mkey(mr->key)); + write_unlock_irqrestore(&table->lock, flags); + if (!deleted_mr) { + mlx5_core_warn(dev, "failed radix tree delete of mr 0x%x\n", + mlx5_base_mkey(mr->key)); + return -ENOENT; + } + + in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_DESTROY_MKEY); + in.mkey = cpu_to_be32(mlx5_mkey_to_idx(mr->key)); + err = mlx5_cmd_exec(dev, &in, sizeof(in), &out, sizeof(out)); + if (err) + return err; + + if (out.hdr.status) + return mlx5_cmd_status_to_err(&out.hdr); + + return err; +} +EXPORT_SYMBOL(mlx5_core_destroy_mkey); + +int mlx5_core_query_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mr *mr, + struct mlx5_query_mkey_mbox_out *out, int outlen) +{ + struct mlx5_destroy_mkey_mbox_in in; + int err; + + memset(&in, 0, sizeof(in)); + memset(out, 0, outlen); + + in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_QUERY_MKEY); + in.mkey = cpu_to_be32(mlx5_mkey_to_idx(mr->key)); + err = mlx5_cmd_exec(dev, &in, sizeof(in), out, outlen); + if (err) + return err; + + if (out->hdr.status) + return mlx5_cmd_status_to_err(&out->hdr); + + return err; +} +EXPORT_SYMBOL(mlx5_core_query_mkey); + +int mlx5_core_dump_fill_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mr *mr, + u32 *mkey) +{ + struct mlx5_query_special_ctxs_mbox_in in; + struct mlx5_query_special_ctxs_mbox_out out; + int err; + + memset(&in, 0, sizeof(in)); + memset(&out, 0, sizeof(out)); + + in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_QUERY_SPECIAL_CONTEXTS); + err = mlx5_cmd_exec(dev, &in, sizeof(in), &out, sizeof(out)); + if (err) + return err; + + if (out.hdr.status) + return mlx5_cmd_status_to_err(&out.hdr); + + *mkey = be32_to_cpu(out.dump_fill_mkey); + + return err; +} +EXPORT_SYMBOL(mlx5_core_dump_fill_mkey); + +int mlx5_core_create_psv(struct mlx5_core_dev *dev, u32 pdn, + int npsvs, u32 *sig_index) +{ + struct mlx5_allocate_psv_in in; + struct mlx5_allocate_psv_out out; + int i, err; + + if (npsvs > MLX5_MAX_PSVS) + return -EINVAL; + + memset(&in, 0, sizeof(in)); + memset(&out, 0, sizeof(out)); + + in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_CREATE_PSV); + in.npsv_pd = cpu_to_be32((npsvs << 28) | pdn); + err = mlx5_cmd_exec(dev, &in, sizeof(in), &out, sizeof(out)); + if (err) { + mlx5_core_err(dev, "cmd exec failed %d\n", err); + return err; + } + + if (out.hdr.status) { + mlx5_core_err(dev, "create_psv bad status %d\n", + out.hdr.status); + return mlx5_cmd_status_to_err(&out.hdr); + } + + for (i = 0; i < npsvs; i++) + sig_index[i] = be32_to_cpu(out.psv_idx[i]) & 0xffffff; + + return err; +} +EXPORT_SYMBOL(mlx5_core_create_psv); + +int mlx5_core_destroy_psv(struct mlx5_core_dev *dev, int psv_num) +{ + struct mlx5_destroy_psv_in in; + struct mlx5_destroy_psv_out out; + int err; + + memset(&in, 0, sizeof(in)); + memset(&out, 0, sizeof(out)); + + in.psv_number = cpu_to_be32(psv_num); + in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_DESTROY_PSV); + err = mlx5_cmd_exec(dev, &in, sizeof(in), &out, sizeof(out)); + if (err) { + mlx5_core_err(dev, "destroy_psv cmd exec failed %d\n", err); + goto out; + } + + if (out.hdr.status) { + mlx5_core_err(dev, "destroy_psv bad status %d\n", + out.hdr.status); + err = mlx5_cmd_status_to_err(&out.hdr); + goto out; + } + +out: + return err; +} +EXPORT_SYMBOL(mlx5_core_destroy_psv); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c b/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c new file mode 100644 index 0000000..d476918 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c @@ -0,0 +1,531 @@ +/* + * Copyright (c) 2013, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include "mlx5_core.h" + +enum { + MLX5_PAGES_CANT_GIVE = 0, + MLX5_PAGES_GIVE = 1, + MLX5_PAGES_TAKE = 2 +}; + +enum { + MLX5_BOOT_PAGES = 1, + MLX5_INIT_PAGES = 2, + MLX5_POST_INIT_PAGES = 3 +}; + +struct mlx5_pages_req { + struct mlx5_core_dev *dev; + u16 func_id; + s32 npages; + struct work_struct work; +}; + +struct fw_page { + struct rb_node rb_node; + u64 addr; + struct page *page; + u16 func_id; + unsigned long bitmask; + struct list_head list; + unsigned free_count; +}; + +struct mlx5_query_pages_inbox { + struct mlx5_inbox_hdr hdr; + u8 rsvd[8]; +}; + +struct mlx5_query_pages_outbox { + struct mlx5_outbox_hdr hdr; + __be16 rsvd; + __be16 func_id; + __be32 num_pages; +}; + +struct mlx5_manage_pages_inbox { + struct mlx5_inbox_hdr hdr; + __be16 rsvd; + __be16 func_id; + __be32 num_entries; + __be64 pas[0]; +}; + +struct mlx5_manage_pages_outbox { + struct mlx5_outbox_hdr hdr; + __be32 num_entries; + u8 rsvd[4]; + __be64 pas[0]; +}; + +enum { + MAX_RECLAIM_TIME_MSECS = 5000, +}; + +enum { + MLX5_MAX_RECLAIM_TIME_MILI = 5000, + MLX5_NUM_4K_IN_PAGE = PAGE_SIZE / MLX5_ADAPTER_PAGE_SIZE, +}; + +static int insert_page(struct mlx5_core_dev *dev, u64 addr, struct page *page, u16 func_id) +{ + struct rb_root *root = &dev->priv.page_root; + struct rb_node **new = &root->rb_node; + struct rb_node *parent = NULL; + struct fw_page *nfp; + struct fw_page *tfp; + int i; + + while (*new) { + parent = *new; + tfp = rb_entry(parent, struct fw_page, rb_node); + if (tfp->addr < addr) + new = &parent->rb_left; + else if (tfp->addr > addr) + new = &parent->rb_right; + else + return -EEXIST; + } + + nfp = kzalloc(sizeof(*nfp), GFP_KERNEL); + if (!nfp) + return -ENOMEM; + + nfp->addr = addr; + nfp->page = page; + nfp->func_id = func_id; + nfp->free_count = MLX5_NUM_4K_IN_PAGE; + for (i = 0; i < MLX5_NUM_4K_IN_PAGE; i++) + set_bit(i, &nfp->bitmask); + + rb_link_node(&nfp->rb_node, parent, new); + rb_insert_color(&nfp->rb_node, root); + list_add(&nfp->list, &dev->priv.free_list); + + return 0; +} + +static struct fw_page *find_fw_page(struct mlx5_core_dev *dev, u64 addr) +{ + struct rb_root *root = &dev->priv.page_root; + struct rb_node *tmp = root->rb_node; + struct fw_page *result = NULL; + struct fw_page *tfp; + + while (tmp) { + tfp = rb_entry(tmp, struct fw_page, rb_node); + if (tfp->addr < addr) { + tmp = tmp->rb_left; + } else if (tfp->addr > addr) { + tmp = tmp->rb_right; + } else { + result = tfp; + break; + } + } + + return result; +} + +static int mlx5_cmd_query_pages(struct mlx5_core_dev *dev, u16 *func_id, + s32 *npages, int boot) +{ + struct mlx5_query_pages_inbox in; + struct mlx5_query_pages_outbox out; + int err; + + memset(&in, 0, sizeof(in)); + memset(&out, 0, sizeof(out)); + in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_QUERY_PAGES); + in.hdr.opmod = boot ? cpu_to_be16(MLX5_BOOT_PAGES) : cpu_to_be16(MLX5_INIT_PAGES); + + err = mlx5_cmd_exec(dev, &in, sizeof(in), &out, sizeof(out)); + if (err) + return err; + + if (out.hdr.status) + return mlx5_cmd_status_to_err(&out.hdr); + + *npages = be32_to_cpu(out.num_pages); + *func_id = be16_to_cpu(out.func_id); + + return err; +} + +static int alloc_4k(struct mlx5_core_dev *dev, u64 *addr) +{ + struct fw_page *fp; + unsigned n; + + if (list_empty(&dev->priv.free_list)) + return -ENOMEM; + + fp = list_entry(dev->priv.free_list.next, struct fw_page, list); + n = find_first_bit(&fp->bitmask, 8 * sizeof(fp->bitmask)); + if (n >= MLX5_NUM_4K_IN_PAGE) { + mlx5_core_warn(dev, "alloc 4k bug\n"); + return -ENOENT; + } + clear_bit(n, &fp->bitmask); + fp->free_count--; + if (!fp->free_count) + list_del(&fp->list); + + *addr = fp->addr + n * MLX5_ADAPTER_PAGE_SIZE; + + return 0; +} + +static void free_4k(struct mlx5_core_dev *dev, u64 addr) +{ + struct fw_page *fwp; + int n; + + fwp = find_fw_page(dev, addr & PAGE_MASK); + if (!fwp) { + mlx5_core_warn(dev, "page not found\n"); + return; + } + + n = (addr & ~PAGE_MASK) >> MLX5_ADAPTER_PAGE_SHIFT; + fwp->free_count++; + set_bit(n, &fwp->bitmask); + if (fwp->free_count == MLX5_NUM_4K_IN_PAGE) { + rb_erase(&fwp->rb_node, &dev->priv.page_root); + if (fwp->free_count != 1) + list_del(&fwp->list); + dma_unmap_page(&dev->pdev->dev, addr & PAGE_MASK, PAGE_SIZE, + DMA_BIDIRECTIONAL); + __free_page(fwp->page); + kfree(fwp); + } else if (fwp->free_count == 1) { + list_add(&fwp->list, &dev->priv.free_list); + } +} + +static int alloc_system_page(struct mlx5_core_dev *dev, u16 func_id) +{ + struct page *page; + u64 addr; + int err; + + page = alloc_page(GFP_HIGHUSER); + if (!page) { + mlx5_core_warn(dev, "failed to allocate page\n"); + return -ENOMEM; + } + addr = dma_map_page(&dev->pdev->dev, page, 0, + PAGE_SIZE, DMA_BIDIRECTIONAL); + if (dma_mapping_error(&dev->pdev->dev, addr)) { + mlx5_core_warn(dev, "failed dma mapping page\n"); + err = -ENOMEM; + goto out_alloc; + } + err = insert_page(dev, addr, page, func_id); + if (err) { + mlx5_core_err(dev, "failed to track allocated page\n"); + goto out_mapping; + } + + return 0; + +out_mapping: + dma_unmap_page(&dev->pdev->dev, addr, PAGE_SIZE, DMA_BIDIRECTIONAL); + +out_alloc: + __free_page(page); + + return err; +} +static int give_pages(struct mlx5_core_dev *dev, u16 func_id, int npages, + int notify_fail) +{ + struct mlx5_manage_pages_inbox *in; + struct mlx5_manage_pages_outbox out; + struct mlx5_manage_pages_inbox *nin; + int inlen; + u64 addr; + int err; + int i; + + inlen = sizeof(*in) + npages * sizeof(in->pas[0]); + in = mlx5_vzalloc(inlen); + if (!in) { + mlx5_core_warn(dev, "vzalloc failed %d\n", inlen); + return -ENOMEM; + } + memset(&out, 0, sizeof(out)); + + for (i = 0; i < npages; i++) { +retry: + err = alloc_4k(dev, &addr); + if (err) { + if (err == -ENOMEM) + err = alloc_system_page(dev, func_id); + if (err) + goto out_4k; + + goto retry; + } + in->pas[i] = cpu_to_be64(addr); + } + + in->hdr.opcode = cpu_to_be16(MLX5_CMD_OP_MANAGE_PAGES); + in->hdr.opmod = cpu_to_be16(MLX5_PAGES_GIVE); + in->func_id = cpu_to_be16(func_id); + in->num_entries = cpu_to_be32(npages); + err = mlx5_cmd_exec(dev, in, inlen, &out, sizeof(out)); + if (err) { + mlx5_core_warn(dev, "func_id 0x%x, npages %d, err %d\n", + func_id, npages, err); + goto out_alloc; + } + dev->priv.fw_pages += npages; + + if (out.hdr.status) { + err = mlx5_cmd_status_to_err(&out.hdr); + if (err) { + mlx5_core_warn(dev, "func_id 0x%x, npages %d, status %d\n", + func_id, npages, out.hdr.status); + goto out_alloc; + } + } + + mlx5_core_dbg(dev, "err %d\n", err); + + goto out_free; + +out_alloc: + if (notify_fail) { + nin = kzalloc(sizeof(*nin), GFP_KERNEL); + if (!nin) { + mlx5_core_warn(dev, "allocation failed\n"); + goto out_4k; + } + memset(&out, 0, sizeof(out)); + nin->hdr.opcode = cpu_to_be16(MLX5_CMD_OP_MANAGE_PAGES); + nin->hdr.opmod = cpu_to_be16(MLX5_PAGES_CANT_GIVE); + if (mlx5_cmd_exec(dev, nin, sizeof(*nin), &out, sizeof(out))) + mlx5_core_warn(dev, "page notify failed\n"); + kfree(nin); + } + +out_4k: + for (i--; i >= 0; i--) + free_4k(dev, be64_to_cpu(in->pas[i])); +out_free: + mlx5_vfree(in); + return err; +} + +static int reclaim_pages(struct mlx5_core_dev *dev, u32 func_id, int npages, + int *nclaimed) +{ + struct mlx5_manage_pages_inbox in; + struct mlx5_manage_pages_outbox *out; + int num_claimed; + int outlen; + u64 addr; + int err; + int i; + + if (nclaimed) + *nclaimed = 0; + + memset(&in, 0, sizeof(in)); + outlen = sizeof(*out) + npages * sizeof(out->pas[0]); + out = mlx5_vzalloc(outlen); + if (!out) + return -ENOMEM; + + in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_MANAGE_PAGES); + in.hdr.opmod = cpu_to_be16(MLX5_PAGES_TAKE); + in.func_id = cpu_to_be16(func_id); + in.num_entries = cpu_to_be32(npages); + mlx5_core_dbg(dev, "npages %d, outlen %d\n", npages, outlen); + err = mlx5_cmd_exec(dev, &in, sizeof(in), out, outlen); + if (err) { + mlx5_core_err(dev, "failed reclaiming pages\n"); + goto out_free; + } + dev->priv.fw_pages -= npages; + + if (out->hdr.status) { + err = mlx5_cmd_status_to_err(&out->hdr); + goto out_free; + } + + num_claimed = be32_to_cpu(out->num_entries); + if (nclaimed) + *nclaimed = num_claimed; + + for (i = 0; i < num_claimed; i++) { + addr = be64_to_cpu(out->pas[i]); + free_4k(dev, addr); + } + +out_free: + mlx5_vfree(out); + return err; +} + +static void pages_work_handler(struct work_struct *work) +{ + struct mlx5_pages_req *req = container_of(work, struct mlx5_pages_req, work); + struct mlx5_core_dev *dev = req->dev; + int err = 0; + + if (req->npages < 0) + err = reclaim_pages(dev, req->func_id, -1 * req->npages, NULL); + else if (req->npages > 0) + err = give_pages(dev, req->func_id, req->npages, 1); + + if (err) + mlx5_core_warn(dev, "%s fail %d\n", + req->npages < 0 ? "reclaim" : "give", err); + + kfree(req); +} + +void mlx5_core_req_pages_handler(struct mlx5_core_dev *dev, u16 func_id, + s32 npages) +{ + struct mlx5_pages_req *req; + + req = kzalloc(sizeof(*req), GFP_ATOMIC); + if (!req) { + mlx5_core_warn(dev, "failed to allocate pages request\n"); + return; + } + + req->dev = dev; + req->func_id = func_id; + req->npages = npages; + INIT_WORK(&req->work, pages_work_handler); + queue_work(dev->priv.pg_wq, &req->work); +} + +int mlx5_satisfy_startup_pages(struct mlx5_core_dev *dev, int boot) +{ + u16 uninitialized_var(func_id); + s32 uninitialized_var(npages); + int err; + + err = mlx5_cmd_query_pages(dev, &func_id, &npages, boot); + if (err) + return err; + + mlx5_core_dbg(dev, "requested %d %s pages for func_id 0x%x\n", + npages, boot ? "boot" : "init", func_id); + + return give_pages(dev, func_id, npages, 0); +} + +enum { + MLX5_BLKS_FOR_RECLAIM_PAGES = 12 +}; + +static int optimal_reclaimed_pages(void) +{ + struct mlx5_cmd_prot_block *block; + struct mlx5_cmd_layout *lay; + int ret; + + ret = (sizeof(lay->out) + MLX5_BLKS_FOR_RECLAIM_PAGES * sizeof(block->data) - + sizeof(struct mlx5_manage_pages_outbox)) / + FIELD_SIZEOF(struct mlx5_manage_pages_outbox, pas[0]); + + return ret; +} + +int mlx5_reclaim_startup_pages(struct mlx5_core_dev *dev) +{ + unsigned long end = jiffies + msecs_to_jiffies(MAX_RECLAIM_TIME_MSECS); + struct fw_page *fwp; + struct rb_node *p; + int nclaimed = 0; + int err; + + do { + p = rb_first(&dev->priv.page_root); + if (p) { + fwp = rb_entry(p, struct fw_page, rb_node); + err = reclaim_pages(dev, fwp->func_id, + optimal_reclaimed_pages(), + &nclaimed); + if (err) { + mlx5_core_warn(dev, "failed reclaiming pages (%d)\n", + err); + return err; + } + if (nclaimed) + end = jiffies + msecs_to_jiffies(MAX_RECLAIM_TIME_MSECS); + } + if (time_after(jiffies, end)) { + mlx5_core_warn(dev, "FW did not return all pages. giving up...\n"); + break; + } + } while (p); + + return 0; +} + +void mlx5_pagealloc_init(struct mlx5_core_dev *dev) +{ + dev->priv.page_root = RB_ROOT; + INIT_LIST_HEAD(&dev->priv.free_list); +} + +void mlx5_pagealloc_cleanup(struct mlx5_core_dev *dev) +{ + /* nothing */ +} + +int mlx5_pagealloc_start(struct mlx5_core_dev *dev) +{ + dev->priv.pg_wq = create_singlethread_workqueue("mlx5_page_allocator"); + if (!dev->priv.pg_wq) + return -ENOMEM; + + return 0; +} + +void mlx5_pagealloc_stop(struct mlx5_core_dev *dev) +{ + destroy_workqueue(dev->priv.pg_wq); +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/pd.c b/drivers/net/ethernet/mellanox/mlx5/core/pd.c new file mode 100644 index 0000000..790da5c --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/pd.c @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2013, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include "mlx5_core.h" + +struct mlx5_alloc_pd_mbox_in { + struct mlx5_inbox_hdr hdr; + u8 rsvd[8]; +}; + +struct mlx5_alloc_pd_mbox_out { + struct mlx5_outbox_hdr hdr; + __be32 pdn; + u8 rsvd[4]; +}; + +struct mlx5_dealloc_pd_mbox_in { + struct mlx5_inbox_hdr hdr; + __be32 pdn; + u8 rsvd[4]; +}; + +struct mlx5_dealloc_pd_mbox_out { + struct mlx5_outbox_hdr hdr; + u8 rsvd[8]; +}; + +int mlx5_core_alloc_pd(struct mlx5_core_dev *dev, u32 *pdn) +{ + struct mlx5_alloc_pd_mbox_in in; + struct mlx5_alloc_pd_mbox_out out; + int err; + + memset(&in, 0, sizeof(in)); + memset(&out, 0, sizeof(out)); + in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_ALLOC_PD); + err = mlx5_cmd_exec(dev, &in, sizeof(in), &out, sizeof(out)); + if (err) + return err; + + if (out.hdr.status) + return mlx5_cmd_status_to_err(&out.hdr); + + *pdn = be32_to_cpu(out.pdn) & 0xffffff; + return err; +} +EXPORT_SYMBOL(mlx5_core_alloc_pd); + +int mlx5_core_dealloc_pd(struct mlx5_core_dev *dev, u32 pdn) +{ + struct mlx5_dealloc_pd_mbox_in in; + struct mlx5_dealloc_pd_mbox_out out; + int err; + + memset(&in, 0, sizeof(in)); + memset(&out, 0, sizeof(out)); + in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_DEALLOC_PD); + in.pdn = cpu_to_be32(pdn); + err = mlx5_cmd_exec(dev, &in, sizeof(in), &out, sizeof(out)); + if (err) + return err; + + if (out.hdr.status) + return mlx5_cmd_status_to_err(&out.hdr); + + return err; +} +EXPORT_SYMBOL(mlx5_core_dealloc_pd); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/port.c b/drivers/net/ethernet/mellanox/mlx5/core/port.c new file mode 100644 index 0000000..3139658 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/port.c @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2013, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include "mlx5_core.h" + +int mlx5_core_access_reg(struct mlx5_core_dev *dev, void *data_in, + int size_in, void *data_out, int size_out, + u16 reg_num, int arg, int write) +{ + struct mlx5_access_reg_mbox_in *in = NULL; + struct mlx5_access_reg_mbox_out *out = NULL; + int err = -ENOMEM; + + in = mlx5_vzalloc(sizeof(*in) + size_in); + if (!in) + return -ENOMEM; + + out = mlx5_vzalloc(sizeof(*out) + size_out); + if (!out) + goto ex1; + + memcpy(in->data, data_in, size_in); + in->hdr.opcode = cpu_to_be16(MLX5_CMD_OP_ACCESS_REG); + in->hdr.opmod = cpu_to_be16(!write); + in->arg = cpu_to_be32(arg); + in->register_id = cpu_to_be16(reg_num); + err = mlx5_cmd_exec(dev, in, sizeof(*in) + size_in, out, + sizeof(*out) + size_out); + if (err) + goto ex2; + + if (out->hdr.status) + err = mlx5_cmd_status_to_err(&out->hdr); + + if (!err) + memcpy(data_out, out->data, size_out); + +ex2: + mlx5_vfree(out); +ex1: + mlx5_vfree(in); + return err; +} +EXPORT_SYMBOL_GPL(mlx5_core_access_reg); + + +struct mlx5_reg_pcap { + u8 rsvd0; + u8 port_num; + u8 rsvd1[2]; + __be32 caps_127_96; + __be32 caps_95_64; + __be32 caps_63_32; + __be32 caps_31_0; +}; + +int mlx5_set_port_caps(struct mlx5_core_dev *dev, u8 port_num, u32 caps) +{ + struct mlx5_reg_pcap in; + struct mlx5_reg_pcap out; + int err; + + memset(&in, 0, sizeof(in)); + in.caps_127_96 = cpu_to_be32(caps); + in.port_num = port_num; + + err = mlx5_core_access_reg(dev, &in, sizeof(in), &out, + sizeof(out), MLX5_REG_PCAP, 0, 1); + + return err; +} +EXPORT_SYMBOL_GPL(mlx5_set_port_caps); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/qp.c b/drivers/net/ethernet/mellanox/mlx5/core/qp.c new file mode 100644 index 0000000..5261a2b --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/qp.c @@ -0,0 +1,324 @@ +/* + * Copyright (c) 2013, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#include +#include +#include +#include +#include + +#include "mlx5_core.h" + +static struct mlx5_core_rsc_common *mlx5_get_rsc(struct mlx5_core_dev *dev, + u32 rsn) +{ + struct mlx5_qp_table *table = &dev->priv.qp_table; + struct mlx5_core_rsc_common *common; + + spin_lock(&table->lock); + + common = radix_tree_lookup(&table->tree, rsn); + if (common) + atomic_inc(&common->refcount); + + spin_unlock(&table->lock); + + if (!common) { + mlx5_core_warn(dev, "Async event for bogus resource 0x%x\n", + rsn); + return NULL; + } + return common; +} + +void mlx5_core_put_rsc(struct mlx5_core_rsc_common *common) +{ + if (atomic_dec_and_test(&common->refcount)) + complete(&common->free); +} + +void mlx5_rsc_event(struct mlx5_core_dev *dev, u32 rsn, int event_type) +{ + struct mlx5_core_rsc_common *common = mlx5_get_rsc(dev, rsn); + struct mlx5_core_qp *qp; + + if (!common) + return; + + switch (common->res) { + case MLX5_RES_QP: + qp = (struct mlx5_core_qp *)common; + qp->event(qp, event_type); + break; + + default: + mlx5_core_warn(dev, "invalid resource type for 0x%x\n", rsn); + } + + mlx5_core_put_rsc(common); +} + +int mlx5_core_create_qp(struct mlx5_core_dev *dev, + struct mlx5_core_qp *qp, + struct mlx5_create_qp_mbox_in *in, + int inlen) +{ + struct mlx5_qp_table *table = &dev->priv.qp_table; + struct mlx5_create_qp_mbox_out out; + struct mlx5_destroy_qp_mbox_in din; + struct mlx5_destroy_qp_mbox_out dout; + int err; + + memset(&out, 0, sizeof(out)); + in->hdr.opcode = cpu_to_be16(MLX5_CMD_OP_CREATE_QP); + + err = mlx5_cmd_exec(dev, in, inlen, &out, sizeof(out)); + if (err) { + mlx5_core_warn(dev, "ret %d\n", err); + return err; + } + + if (out.hdr.status) { + mlx5_core_warn(dev, "current num of QPs 0x%x\n", + atomic_read(&dev->num_qps)); + return mlx5_cmd_status_to_err(&out.hdr); + } + + qp->qpn = be32_to_cpu(out.qpn) & 0xffffff; + mlx5_core_dbg(dev, "qpn = 0x%x\n", qp->qpn); + + qp->common.res = MLX5_RES_QP; + spin_lock_irq(&table->lock); + err = radix_tree_insert(&table->tree, qp->qpn, qp); + spin_unlock_irq(&table->lock); + if (err) { + mlx5_core_warn(dev, "err %d\n", err); + goto err_cmd; + } + + err = mlx5_debug_qp_add(dev, qp); + if (err) + mlx5_core_dbg(dev, "failed adding QP 0x%x to debug file system\n", + qp->qpn); + + qp->pid = current->pid; + atomic_set(&qp->common.refcount, 1); + atomic_inc(&dev->num_qps); + init_completion(&qp->common.free); + + return 0; + +err_cmd: + memset(&din, 0, sizeof(din)); + memset(&dout, 0, sizeof(dout)); + din.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_DESTROY_QP); + din.qpn = cpu_to_be32(qp->qpn); + mlx5_cmd_exec(dev, &din, sizeof(din), &out, sizeof(dout)); + + return err; +} +EXPORT_SYMBOL_GPL(mlx5_core_create_qp); + +int mlx5_core_destroy_qp(struct mlx5_core_dev *dev, + struct mlx5_core_qp *qp) +{ + struct mlx5_destroy_qp_mbox_in in; + struct mlx5_destroy_qp_mbox_out out; + struct mlx5_qp_table *table = &dev->priv.qp_table; + unsigned long flags; + int err; + + mlx5_debug_qp_remove(dev, qp); + + spin_lock_irqsave(&table->lock, flags); + radix_tree_delete(&table->tree, qp->qpn); + spin_unlock_irqrestore(&table->lock, flags); + + mlx5_core_put_rsc((struct mlx5_core_rsc_common *)qp); + wait_for_completion(&qp->common.free); + + memset(&in, 0, sizeof(in)); + memset(&out, 0, sizeof(out)); + in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_DESTROY_QP); + in.qpn = cpu_to_be32(qp->qpn); + err = mlx5_cmd_exec(dev, &in, sizeof(in), &out, sizeof(out)); + if (err) + return err; + + if (out.hdr.status) + return mlx5_cmd_status_to_err(&out.hdr); + + atomic_dec(&dev->num_qps); + return 0; +} +EXPORT_SYMBOL_GPL(mlx5_core_destroy_qp); + +int mlx5_core_qp_modify(struct mlx5_core_dev *dev, enum mlx5_qp_state cur_state, + enum mlx5_qp_state new_state, + struct mlx5_modify_qp_mbox_in *in, int sqd_event, + struct mlx5_core_qp *qp) +{ + static const u16 optab[MLX5_QP_NUM_STATE][MLX5_QP_NUM_STATE] = { + [MLX5_QP_STATE_RST] = { + [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP, + [MLX5_QP_STATE_ERR] = MLX5_CMD_OP_2ERR_QP, + [MLX5_QP_STATE_INIT] = MLX5_CMD_OP_RST2INIT_QP, + }, + [MLX5_QP_STATE_INIT] = { + [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP, + [MLX5_QP_STATE_ERR] = MLX5_CMD_OP_2ERR_QP, + [MLX5_QP_STATE_INIT] = MLX5_CMD_OP_INIT2INIT_QP, + [MLX5_QP_STATE_RTR] = MLX5_CMD_OP_INIT2RTR_QP, + }, + [MLX5_QP_STATE_RTR] = { + [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP, + [MLX5_QP_STATE_ERR] = MLX5_CMD_OP_2ERR_QP, + [MLX5_QP_STATE_RTS] = MLX5_CMD_OP_RTR2RTS_QP, + }, + [MLX5_QP_STATE_RTS] = { + [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP, + [MLX5_QP_STATE_ERR] = MLX5_CMD_OP_2ERR_QP, + [MLX5_QP_STATE_RTS] = MLX5_CMD_OP_RTS2RTS_QP, + }, + [MLX5_QP_STATE_SQD] = { + [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP, + [MLX5_QP_STATE_ERR] = MLX5_CMD_OP_2ERR_QP, + }, + [MLX5_QP_STATE_SQER] = { + [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP, + [MLX5_QP_STATE_ERR] = MLX5_CMD_OP_2ERR_QP, + [MLX5_QP_STATE_RTS] = MLX5_CMD_OP_SQERR2RTS_QP, + }, + [MLX5_QP_STATE_ERR] = { + [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP, + [MLX5_QP_STATE_ERR] = MLX5_CMD_OP_2ERR_QP, + } + }; + + struct mlx5_modify_qp_mbox_out out; + int err = 0; + u16 op; + + if (cur_state >= MLX5_QP_NUM_STATE || new_state >= MLX5_QP_NUM_STATE || + !optab[cur_state][new_state]) + return -EINVAL; + + memset(&out, 0, sizeof(out)); + op = optab[cur_state][new_state]; + in->hdr.opcode = cpu_to_be16(op); + in->qpn = cpu_to_be32(qp->qpn); + err = mlx5_cmd_exec(dev, in, sizeof(*in), &out, sizeof(out)); + if (err) + return err; + + return mlx5_cmd_status_to_err(&out.hdr); +} +EXPORT_SYMBOL_GPL(mlx5_core_qp_modify); + +void mlx5_init_qp_table(struct mlx5_core_dev *dev) +{ + struct mlx5_qp_table *table = &dev->priv.qp_table; + + spin_lock_init(&table->lock); + INIT_RADIX_TREE(&table->tree, GFP_ATOMIC); + mlx5_qp_debugfs_init(dev); +} + +void mlx5_cleanup_qp_table(struct mlx5_core_dev *dev) +{ + mlx5_qp_debugfs_cleanup(dev); +} + +int mlx5_core_qp_query(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp, + struct mlx5_query_qp_mbox_out *out, int outlen) +{ + struct mlx5_query_qp_mbox_in in; + int err; + + memset(&in, 0, sizeof(in)); + memset(out, 0, outlen); + in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_QUERY_QP); + in.qpn = cpu_to_be32(qp->qpn); + err = mlx5_cmd_exec(dev, &in, sizeof(in), out, outlen); + if (err) + return err; + + if (out->hdr.status) + return mlx5_cmd_status_to_err(&out->hdr); + + return err; +} +EXPORT_SYMBOL_GPL(mlx5_core_qp_query); + +int mlx5_core_xrcd_alloc(struct mlx5_core_dev *dev, u32 *xrcdn) +{ + struct mlx5_alloc_xrcd_mbox_in in; + struct mlx5_alloc_xrcd_mbox_out out; + int err; + + memset(&in, 0, sizeof(in)); + memset(&out, 0, sizeof(out)); + in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_ALLOC_XRCD); + err = mlx5_cmd_exec(dev, &in, sizeof(in), &out, sizeof(out)); + if (err) + return err; + + if (out.hdr.status) + err = mlx5_cmd_status_to_err(&out.hdr); + else + *xrcdn = be32_to_cpu(out.xrcdn); + + return err; +} +EXPORT_SYMBOL_GPL(mlx5_core_xrcd_alloc); + +int mlx5_core_xrcd_dealloc(struct mlx5_core_dev *dev, u32 xrcdn) +{ + struct mlx5_dealloc_xrcd_mbox_in in; + struct mlx5_dealloc_xrcd_mbox_out out; + int err; + + memset(&in, 0, sizeof(in)); + memset(&out, 0, sizeof(out)); + in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_DEALLOC_XRCD); + in.xrcdn = cpu_to_be32(xrcdn); + err = mlx5_cmd_exec(dev, &in, sizeof(in), &out, sizeof(out)); + if (err) + return err; + + if (out.hdr.status) + err = mlx5_cmd_status_to_err(&out.hdr); + + return err; +} +EXPORT_SYMBOL_GPL(mlx5_core_xrcd_dealloc); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/srq.c b/drivers/net/ethernet/mellanox/mlx5/core/srq.c new file mode 100644 index 0000000..38bce93 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/srq.c @@ -0,0 +1,223 @@ +/* + * Copyright (c) 2013, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include "mlx5_core.h" + +void mlx5_srq_event(struct mlx5_core_dev *dev, u32 srqn, int event_type) +{ + struct mlx5_srq_table *table = &dev->priv.srq_table; + struct mlx5_core_srq *srq; + + spin_lock(&table->lock); + + srq = radix_tree_lookup(&table->tree, srqn); + if (srq) + atomic_inc(&srq->refcount); + + spin_unlock(&table->lock); + + if (!srq) { + mlx5_core_warn(dev, "Async event for bogus SRQ 0x%08x\n", srqn); + return; + } + + srq->event(srq, event_type); + + if (atomic_dec_and_test(&srq->refcount)) + complete(&srq->free); +} + +struct mlx5_core_srq *mlx5_core_get_srq(struct mlx5_core_dev *dev, u32 srqn) +{ + struct mlx5_srq_table *table = &dev->priv.srq_table; + struct mlx5_core_srq *srq; + + spin_lock(&table->lock); + + srq = radix_tree_lookup(&table->tree, srqn); + if (srq) + atomic_inc(&srq->refcount); + + spin_unlock(&table->lock); + + return srq; +} +EXPORT_SYMBOL(mlx5_core_get_srq); + +int mlx5_core_create_srq(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq, + struct mlx5_create_srq_mbox_in *in, int inlen) +{ + struct mlx5_create_srq_mbox_out out; + struct mlx5_srq_table *table = &dev->priv.srq_table; + struct mlx5_destroy_srq_mbox_in din; + struct mlx5_destroy_srq_mbox_out dout; + int err; + + memset(&out, 0, sizeof(out)); + in->hdr.opcode = cpu_to_be16(MLX5_CMD_OP_CREATE_SRQ); + err = mlx5_cmd_exec(dev, in, inlen, &out, sizeof(out)); + if (err) + return err; + + if (out.hdr.status) + return mlx5_cmd_status_to_err(&out.hdr); + + srq->srqn = be32_to_cpu(out.srqn) & 0xffffff; + + atomic_set(&srq->refcount, 1); + init_completion(&srq->free); + + spin_lock_irq(&table->lock); + err = radix_tree_insert(&table->tree, srq->srqn, srq); + spin_unlock_irq(&table->lock); + if (err) { + mlx5_core_warn(dev, "err %d, srqn 0x%x\n", err, srq->srqn); + goto err_cmd; + } + + return 0; + +err_cmd: + memset(&din, 0, sizeof(din)); + memset(&dout, 0, sizeof(dout)); + din.srqn = cpu_to_be32(srq->srqn); + din.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_DESTROY_SRQ); + mlx5_cmd_exec(dev, &din, sizeof(din), &dout, sizeof(dout)); + return err; +} +EXPORT_SYMBOL(mlx5_core_create_srq); + +int mlx5_core_destroy_srq(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq) +{ + struct mlx5_destroy_srq_mbox_in in; + struct mlx5_destroy_srq_mbox_out out; + struct mlx5_srq_table *table = &dev->priv.srq_table; + struct mlx5_core_srq *tmp; + int err; + + spin_lock_irq(&table->lock); + tmp = radix_tree_delete(&table->tree, srq->srqn); + spin_unlock_irq(&table->lock); + if (!tmp) { + mlx5_core_warn(dev, "srq 0x%x not found in tree\n", srq->srqn); + return -EINVAL; + } + if (tmp != srq) { + mlx5_core_warn(dev, "corruption on srqn 0x%x\n", srq->srqn); + return -EINVAL; + } + + memset(&in, 0, sizeof(in)); + memset(&out, 0, sizeof(out)); + in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_DESTROY_SRQ); + in.srqn = cpu_to_be32(srq->srqn); + err = mlx5_cmd_exec(dev, &in, sizeof(in), &out, sizeof(out)); + if (err) + return err; + + if (out.hdr.status) + return mlx5_cmd_status_to_err(&out.hdr); + + if (atomic_dec_and_test(&srq->refcount)) + complete(&srq->free); + wait_for_completion(&srq->free); + + return 0; +} +EXPORT_SYMBOL(mlx5_core_destroy_srq); + +int mlx5_core_query_srq(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq, + struct mlx5_query_srq_mbox_out *out) +{ + struct mlx5_query_srq_mbox_in in; + int err; + + memset(&in, 0, sizeof(in)); + memset(out, 0, sizeof(*out)); + + in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_QUERY_SRQ); + in.srqn = cpu_to_be32(srq->srqn); + err = mlx5_cmd_exec(dev, &in, sizeof(in), out, sizeof(*out)); + if (err) + return err; + + if (out->hdr.status) + return mlx5_cmd_status_to_err(&out->hdr); + + return err; +} +EXPORT_SYMBOL(mlx5_core_query_srq); + +int mlx5_core_arm_srq(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq, + u16 lwm, int is_srq) +{ + struct mlx5_arm_srq_mbox_in in; + struct mlx5_arm_srq_mbox_out out; + int err; + + memset(&in, 0, sizeof(in)); + memset(&out, 0, sizeof(out)); + + in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_ARM_RQ); + in.hdr.opmod = cpu_to_be16(!!is_srq); + in.srqn = cpu_to_be32(srq->srqn); + in.lwm = cpu_to_be16(lwm); + + err = mlx5_cmd_exec(dev, &in, sizeof(in), &out, sizeof(out)); + if (err) + return err; + + if (out.hdr.status) + return mlx5_cmd_status_to_err(&out.hdr); + + return err; +} +EXPORT_SYMBOL(mlx5_core_arm_srq); + +void mlx5_init_srq_table(struct mlx5_core_dev *dev) +{ + struct mlx5_srq_table *table = &dev->priv.srq_table; + + spin_lock_init(&table->lock); + INIT_RADIX_TREE(&table->tree, GFP_ATOMIC); +} + +void mlx5_cleanup_srq_table(struct mlx5_core_dev *dev) +{ + /* nothing */ +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/uar.c b/drivers/net/ethernet/mellanox/mlx5/core/uar.c new file mode 100644 index 0000000..0a6348c --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/uar.c @@ -0,0 +1,224 @@ +/* + * Copyright (c) 2013, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include "mlx5_core.h" + +enum { + NUM_DRIVER_UARS = 4, + NUM_LOW_LAT_UUARS = 4, +}; + + +struct mlx5_alloc_uar_mbox_in { + struct mlx5_inbox_hdr hdr; + u8 rsvd[8]; +}; + +struct mlx5_alloc_uar_mbox_out { + struct mlx5_outbox_hdr hdr; + __be32 uarn; + u8 rsvd[4]; +}; + +struct mlx5_free_uar_mbox_in { + struct mlx5_inbox_hdr hdr; + __be32 uarn; + u8 rsvd[4]; +}; + +struct mlx5_free_uar_mbox_out { + struct mlx5_outbox_hdr hdr; + u8 rsvd[8]; +}; + +int mlx5_cmd_alloc_uar(struct mlx5_core_dev *dev, u32 *uarn) +{ + struct mlx5_alloc_uar_mbox_in in; + struct mlx5_alloc_uar_mbox_out out; + int err; + + memset(&in, 0, sizeof(in)); + memset(&out, 0, sizeof(out)); + in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_ALLOC_UAR); + err = mlx5_cmd_exec(dev, &in, sizeof(in), &out, sizeof(out)); + if (err) + goto ex; + + if (out.hdr.status) { + err = mlx5_cmd_status_to_err(&out.hdr); + goto ex; + } + + *uarn = be32_to_cpu(out.uarn) & 0xffffff; + +ex: + return err; +} +EXPORT_SYMBOL(mlx5_cmd_alloc_uar); + +int mlx5_cmd_free_uar(struct mlx5_core_dev *dev, u32 uarn) +{ + struct mlx5_free_uar_mbox_in in; + struct mlx5_free_uar_mbox_out out; + int err; + + memset(&in, 0, sizeof(in)); + in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_DEALLOC_UAR); + in.uarn = cpu_to_be32(uarn); + err = mlx5_cmd_exec(dev, &in, sizeof(in), &out, sizeof(out)); + if (err) + goto ex; + + if (out.hdr.status) + err = mlx5_cmd_status_to_err(&out.hdr); + +ex: + return err; +} +EXPORT_SYMBOL(mlx5_cmd_free_uar); + +static int need_uuar_lock(int uuarn) +{ + int tot_uuars = NUM_DRIVER_UARS * MLX5_BF_REGS_PER_PAGE; + + if (uuarn == 0 || tot_uuars - NUM_LOW_LAT_UUARS) + return 0; + + return 1; +} + +int mlx5_alloc_uuars(struct mlx5_core_dev *dev, struct mlx5_uuar_info *uuari) +{ + int tot_uuars = NUM_DRIVER_UARS * MLX5_BF_REGS_PER_PAGE; + struct mlx5_bf *bf; + phys_addr_t addr; + int err; + int i; + + uuari->num_uars = NUM_DRIVER_UARS; + uuari->num_low_latency_uuars = NUM_LOW_LAT_UUARS; + + mutex_init(&uuari->lock); + uuari->uars = kcalloc(uuari->num_uars, sizeof(*uuari->uars), GFP_KERNEL); + if (!uuari->uars) + return -ENOMEM; + + uuari->bfs = kcalloc(tot_uuars, sizeof(*uuari->bfs), GFP_KERNEL); + if (!uuari->bfs) { + err = -ENOMEM; + goto out_uars; + } + + uuari->bitmap = kcalloc(BITS_TO_LONGS(tot_uuars), sizeof(*uuari->bitmap), + GFP_KERNEL); + if (!uuari->bitmap) { + err = -ENOMEM; + goto out_bfs; + } + + uuari->count = kcalloc(tot_uuars, sizeof(*uuari->count), GFP_KERNEL); + if (!uuari->count) { + err = -ENOMEM; + goto out_bitmap; + } + + for (i = 0; i < uuari->num_uars; i++) { + err = mlx5_cmd_alloc_uar(dev, &uuari->uars[i].index); + if (err) + goto out_count; + + addr = dev->iseg_base + ((phys_addr_t)(uuari->uars[i].index) << PAGE_SHIFT); + uuari->uars[i].map = ioremap(addr, PAGE_SIZE); + if (!uuari->uars[i].map) { + mlx5_cmd_free_uar(dev, uuari->uars[i].index); + err = -ENOMEM; + goto out_count; + } + mlx5_core_dbg(dev, "allocated uar index 0x%x, mmaped at %p\n", + uuari->uars[i].index, uuari->uars[i].map); + } + + for (i = 0; i < tot_uuars; i++) { + bf = &uuari->bfs[i]; + + bf->buf_size = dev->caps.gen.bf_reg_size / 2; + bf->uar = &uuari->uars[i / MLX5_BF_REGS_PER_PAGE]; + bf->regreg = uuari->uars[i / MLX5_BF_REGS_PER_PAGE].map; + bf->reg = NULL; /* Add WC support */ + bf->offset = (i % MLX5_BF_REGS_PER_PAGE) * dev->caps.gen.bf_reg_size + + MLX5_BF_OFFSET; + bf->need_lock = need_uuar_lock(i); + spin_lock_init(&bf->lock); + spin_lock_init(&bf->lock32); + bf->uuarn = i; + } + + return 0; + +out_count: + for (i--; i >= 0; i--) { + iounmap(uuari->uars[i].map); + mlx5_cmd_free_uar(dev, uuari->uars[i].index); + } + kfree(uuari->count); + +out_bitmap: + kfree(uuari->bitmap); + +out_bfs: + kfree(uuari->bfs); + +out_uars: + kfree(uuari->uars); + return err; +} + +int mlx5_free_uuars(struct mlx5_core_dev *dev, struct mlx5_uuar_info *uuari) +{ + int i = uuari->num_uars; + + for (i--; i >= 0; i--) { + iounmap(uuari->uars[i].map); + mlx5_cmd_free_uar(dev, uuari->uars[i].index); + } + + kfree(uuari->count); + kfree(uuari->bitmap); + kfree(uuari->bfs); + kfree(uuari->uars); + + return 0; +} diff --git a/drivers/scsi/Makefile b/drivers/scsi/Makefile new file mode 100644 index 0000000..59f1ce6 --- /dev/null +++ b/drivers/scsi/Makefile @@ -0,0 +1,199 @@ +# +# Makefile for linux/drivers/scsi +# +# 30 May 2000, Christoph Hellwig +# Rewritten to use lists instead of if-statements. +# +# 20 Sep 2000, Torben Mathiasen +# Changed link order to reflect new scsi initialization. +# +# *!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*! +# The link order must be, SCSI Core, SCSI HBA drivers, and +# lastly SCSI peripheral drivers (disk/tape/cdrom/etc.) to +# satisfy certain initialization assumptions in the SCSI layer. +# *!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*! + + +CFLAGS_aha152x.o = -DAHA152X_STAT -DAUTOCONF +CFLAGS_gdth.o = # -DDEBUG_GDTH=2 -D__SERIAL__ -D__COM2__ -DGDTH_STATISTICS + +obj-$(CONFIG_PCMCIA) += pcmcia/ + +obj-$(CONFIG_SCSI) += scsi_mod.o + +obj-$(CONFIG_RAID_ATTRS) += raid_class.o + +# --- NOTE ORDERING HERE --- +# For kernel non-modular link, transport attributes need to +# be initialised before drivers +# -------------------------- +obj-$(CONFIG_SCSI_SPI_ATTRS) += scsi_transport_spi.o +obj-$(CONFIG_SCSI_FC_ATTRS) += scsi_transport_fc.o +obj-$(CONFIG_SCSI_ISCSI_ATTRS) += scsi_transport_iscsi.o +obj-$(CONFIG_SCSI_SAS_ATTRS) += scsi_transport_sas.o +obj-$(CONFIG_SCSI_SAS_LIBSAS) += libsas/ +obj-$(CONFIG_SCSI_SRP_ATTRS) += scsi_transport_srp.o +obj-$(CONFIG_SCSI_DH) += device_handler/ + +obj-$(CONFIG_LIBFC) += libfc/ +obj-$(CONFIG_LIBFCOE) += fcoe/ +obj-$(CONFIG_FCOE) += fcoe/ +obj-$(CONFIG_FCOE_FNIC) += fnic/ +obj-$(CONFIG_SCSI_BNX2X_FCOE) += libfc/ fcoe/ bnx2fc/ +obj-$(CONFIG_ISCSI_TCP) += libiscsi.o libiscsi_tcp.o iscsi_tcp.o +obj-$(CONFIG_INFINIBAND_ISER) += libiscsi.o +obj-$(CONFIG_ISCSI_BOOT_SYSFS) += iscsi_boot_sysfs.o +obj-$(CONFIG_SCSI_A4000T) += 53c700.o a4000t.o +obj-$(CONFIG_SCSI_ZORRO7XX) += 53c700.o zorro7xx.o +obj-$(CONFIG_A3000_SCSI) += a3000.o wd33c93.o +obj-$(CONFIG_A2091_SCSI) += a2091.o wd33c93.o +obj-$(CONFIG_GVP11_SCSI) += gvp11.o wd33c93.o +obj-$(CONFIG_MVME147_SCSI) += mvme147.o wd33c93.o +obj-$(CONFIG_SGIWD93_SCSI) += sgiwd93.o wd33c93.o +obj-$(CONFIG_ATARI_SCSI) += atari_scsi.o +obj-$(CONFIG_MAC_SCSI) += mac_scsi.o +obj-$(CONFIG_SCSI_MAC_ESP) += esp_scsi.o mac_esp.o +obj-$(CONFIG_SUN3_SCSI) += sun3_scsi.o sun3_scsi_vme.o +obj-$(CONFIG_MVME16x_SCSI) += 53c700.o mvme16x_scsi.o +obj-$(CONFIG_BVME6000_SCSI) += 53c700.o bvme6000_scsi.o +obj-$(CONFIG_SCSI_SIM710) += 53c700.o sim710.o +obj-$(CONFIG_SCSI_ADVANSYS) += advansys.o +obj-$(CONFIG_SCSI_BUSLOGIC) += BusLogic.o +obj-$(CONFIG_SCSI_DPT_I2O) += dpt_i2o.o +obj-$(CONFIG_SCSI_U14_34F) += u14-34f.o +obj-$(CONFIG_SCSI_ARCMSR) += arcmsr/ +obj-$(CONFIG_SCSI_ULTRASTOR) += ultrastor.o +obj-$(CONFIG_SCSI_AHA152X) += aha152x.o +obj-$(CONFIG_SCSI_AHA1542) += aha1542.o +obj-$(CONFIG_SCSI_AHA1740) += aha1740.o +obj-$(CONFIG_SCSI_AIC7XXX) += aic7xxx/ +obj-$(CONFIG_SCSI_AIC79XX) += aic7xxx/ +obj-$(CONFIG_SCSI_AACRAID) += aacraid/ +obj-$(CONFIG_SCSI_AIC94XX) += aic94xx/ +obj-$(CONFIG_SCSI_PM8001) += pm8001/ +obj-$(CONFIG_SCSI_ISCI) += isci/ +obj-$(CONFIG_SCSI_IPS) += ips.o +obj-$(CONFIG_SCSI_FUTURE_DOMAIN)+= fdomain.o +obj-$(CONFIG_SCSI_IN2000) += in2000.o +obj-$(CONFIG_SCSI_GENERIC_NCR5380) += g_NCR5380.o +obj-$(CONFIG_SCSI_GENERIC_NCR5380_MMIO) += g_NCR5380_mmio.o +obj-$(CONFIG_SCSI_NCR53C406A) += NCR53c406a.o +obj-$(CONFIG_SCSI_NCR_D700) += 53c700.o NCR_D700.o +obj-$(CONFIG_SCSI_NCR_Q720) += NCR_Q720_mod.o +obj-$(CONFIG_SCSI_SYM53C416) += sym53c416.o +obj-$(CONFIG_SCSI_QLOGIC_FAS) += qlogicfas408.o qlogicfas.o +obj-$(CONFIG_PCMCIA_QLOGIC) += qlogicfas408.o +obj-$(CONFIG_SCSI_QLOGIC_1280) += qla1280.o +obj-$(CONFIG_SCSI_QLA_FC) += qla2xxx/ +obj-$(CONFIG_SCSI_QLA_ISCSI) += libiscsi.o qla4xxx/ +obj-$(CONFIG_SCSI_LPFC) += lpfc/ +obj-$(CONFIG_SCSI_BFA_FC) += bfa/ +obj-$(CONFIG_SCSI_CHELSIO_FCOE) += csiostor/ +obj-$(CONFIG_SCSI_PAS16) += pas16.o +obj-$(CONFIG_SCSI_T128) += t128.o +obj-$(CONFIG_SCSI_DMX3191D) += dmx3191d.o +obj-$(CONFIG_SCSI_HPSA) += hpsa.o +obj-$(CONFIG_SCSI_DTC3280) += dtc.o +obj-$(CONFIG_SCSI_SYM53C8XX_2) += sym53c8xx_2/ +obj-$(CONFIG_SCSI_ZALON) += zalon7xx.o +obj-$(CONFIG_SCSI_EATA_PIO) += eata_pio.o +obj-$(CONFIG_SCSI_7000FASST) += wd7000.o +obj-$(CONFIG_SCSI_EATA) += eata.o +obj-$(CONFIG_SCSI_DC395x) += dc395x.o +obj-$(CONFIG_SCSI_DC390T) += tmscsim.o +obj-$(CONFIG_MEGARAID_LEGACY) += megaraid.o +obj-$(CONFIG_MEGARAID_NEWGEN) += megaraid/ +obj-$(CONFIG_MEGARAID_SAS) += megaraid/ +obj-$(CONFIG_SCSI_MPT2SAS) += mpt2sas/ +obj-$(CONFIG_SCSI_MPT3SAS) += mpt3sas/ +obj-$(CONFIG_SCSI_UFSHCD) += ufs/ +obj-$(CONFIG_SCSI_ACARD) += atp870u.o +obj-$(CONFIG_SCSI_SUNESP) += esp_scsi.o sun_esp.o +obj-$(CONFIG_SCSI_GDTH) += gdth.o +obj-$(CONFIG_SCSI_INITIO) += initio.o +obj-$(CONFIG_SCSI_INIA100) += a100u2w.o +obj-$(CONFIG_SCSI_QLOGICPTI) += qlogicpti.o +obj-$(CONFIG_SCSI_MESH) += mesh.o +obj-$(CONFIG_SCSI_MAC53C94) += mac53c94.o +obj-$(CONFIG_BLK_DEV_3W_XXXX_RAID) += 3w-xxxx.o +obj-$(CONFIG_SCSI_3W_9XXX) += 3w-9xxx.o +obj-$(CONFIG_SCSI_3W_SAS) += 3w-sas.o +obj-$(CONFIG_SCSI_PPA) += ppa.o +obj-$(CONFIG_SCSI_IMM) += imm.o +obj-$(CONFIG_JAZZ_ESP) += esp_scsi.o jazz_esp.o +obj-$(CONFIG_SUN3X_ESP) += esp_scsi.o sun3x_esp.o +obj-$(CONFIG_SCSI_LASI700) += 53c700.o lasi700.o +obj-$(CONFIG_SCSI_SNI_53C710) += 53c700.o sni_53c710.o +obj-$(CONFIG_SCSI_NSP32) += nsp32.o +obj-$(CONFIG_SCSI_IPR) += ipr.o +obj-$(CONFIG_SCSI_IBMVSCSI) += ibmvscsi/ +obj-$(CONFIG_SCSI_IBMVFC) += ibmvscsi/ +obj-$(CONFIG_SCSI_HPTIOP) += hptiop.o +obj-$(CONFIG_SCSI_STEX) += stex.o +obj-$(CONFIG_SCSI_MVSAS) += mvsas/ +obj-$(CONFIG_SCSI_MVUMI) += mvumi.o +obj-$(CONFIG_PS3_ROM) += ps3rom.o +obj-$(CONFIG_SCSI_CXGB3_ISCSI) += libiscsi.o libiscsi_tcp.o cxgbi/ +obj-$(CONFIG_SCSI_CXGB4_ISCSI) += libiscsi.o libiscsi_tcp.o cxgbi/ +obj-$(CONFIG_SCSI_BNX2_ISCSI) += libiscsi.o bnx2i/ +obj-$(CONFIG_BE2ISCSI) += libiscsi.o be2iscsi/ +obj-$(CONFIG_SCSI_ESAS2R) += esas2r/ +obj-$(CONFIG_SCSI_PMCRAID) += pmcraid.o +obj-$(CONFIG_SCSI_VIRTIO) += virtio_scsi.o +obj-$(CONFIG_VMWARE_PVSCSI) += vmw_pvscsi.o +obj-$(CONFIG_XEN_SCSI_FRONTEND) += xen-scsifront.o +obj-$(CONFIG_HYPERV_STORAGE) += hv_storvsc.o + +obj-$(CONFIG_ARM) += arm/ + +obj-$(CONFIG_CHR_DEV_ST) += st.o +obj-$(CONFIG_CHR_DEV_OSST) += osst.o +obj-$(CONFIG_BLK_DEV_SD) += sd_mod.o +obj-$(CONFIG_BLK_DEV_SR) += sr_mod.o +obj-$(CONFIG_CHR_DEV_SG) += sg.o +obj-$(CONFIG_CHR_DEV_SCH) += ch.o +obj-$(CONFIG_SCSI_ENCLOSURE) += ses.o + +obj-$(CONFIG_SCSI_OSD_INITIATOR) += osd/ + +# This goes last, so that "real" scsi devices probe earlier +obj-$(CONFIG_SCSI_DEBUG) += scsi_debug.o + +scsi_mod-y += scsi.o hosts.o scsi_ioctl.o constants.o \ + scsicam.o scsi_error.o scsi_lib.o +scsi_mod-$(CONFIG_SCSI_DMA) += scsi_lib_dma.o +scsi_mod-y += scsi_scan.o scsi_sysfs.o scsi_devinfo.o +scsi_mod-$(CONFIG_SCSI_NETLINK) += scsi_netlink.o +scsi_mod-$(CONFIG_SYSCTL) += scsi_sysctl.o +scsi_mod-$(CONFIG_SCSI_PROC_FS) += scsi_proc.o +scsi_mod-y += scsi_trace.o +scsi_mod-$(CONFIG_PM) += scsi_pm.o + +hv_storvsc-y := storvsc_drv.o + +sd_mod-objs := sd.o +sd_mod-$(CONFIG_BLK_DEV_INTEGRITY) += sd_dif.o + +sr_mod-objs := sr.o sr_ioctl.o sr_vendor.o +ncr53c8xx-flags-$(CONFIG_SCSI_ZALON) \ + := -DCONFIG_NCR53C8XX_PREFETCH -DSCSI_NCR_BIG_ENDIAN \ + -DCONFIG_SCSI_NCR53C8XX_NO_WORD_TRANSFERS +CFLAGS_ncr53c8xx.o := $(ncr53c8xx-flags-y) $(ncr53c8xx-flags-m) +zalon7xx-objs := zalon.o ncr53c8xx.o +NCR_Q720_mod-objs := NCR_Q720.o ncr53c8xx.o +oktagon_esp_mod-objs := oktagon_esp.o oktagon_io.o + +# Files generated that shall be removed upon make clean +clean-files := 53c700_d.h 53c700_u.h + +$(obj)/53c700.o $(MODVERDIR)/$(obj)/53c700.ver: $(obj)/53c700_d.h + +# If you want to play with the firmware, uncomment +# GENERATE_FIRMWARE := 1 + +ifdef GENERATE_FIRMWARE + +$(obj)/53c700_d.h: $(src)/53c700.scr $(src)/script_asm.pl + $(PERL) -s $(src)/script_asm.pl -ncr7x0_family $@ $(@:_d.h=_u.h) < $< + +endif diff --git a/drivers/scsi/cxgbi/Kconfig b/drivers/scsi/cxgbi/Kconfig new file mode 100644 index 0000000..17eb5d5 --- /dev/null +++ b/drivers/scsi/cxgbi/Kconfig @@ -0,0 +1,2 @@ +source "drivers/scsi/cxgbi/cxgb3i/Kconfig" +source "drivers/scsi/cxgbi/cxgb4i/Kconfig" diff --git a/drivers/scsi/cxgbi/Makefile b/drivers/scsi/cxgbi/Makefile new file mode 100644 index 0000000..86007e3 --- /dev/null +++ b/drivers/scsi/cxgbi/Makefile @@ -0,0 +1,2 @@ +obj-$(CONFIG_SCSI_CXGB3_ISCSI) += libcxgbi.o cxgb3i/ +obj-$(CONFIG_SCSI_CXGB4_ISCSI) += libcxgbi.o cxgb4i/ diff --git a/drivers/scsi/cxgbi/cxgb3i/Kbuild b/drivers/scsi/cxgbi/cxgb3i/Kbuild new file mode 100644 index 0000000..6f095e2 --- /dev/null +++ b/drivers/scsi/cxgbi/cxgb3i/Kbuild @@ -0,0 +1,3 @@ +EXTRA_CFLAGS += -I$(srctree)/drivers/net/ethernet/chelsio/cxgb3 + +obj-$(CONFIG_SCSI_CXGB3_ISCSI) += cxgb3i.o diff --git a/drivers/scsi/cxgbi/cxgb3i/Kconfig b/drivers/scsi/cxgbi/cxgb3i/Kconfig new file mode 100644 index 0000000..e460398 --- /dev/null +++ b/drivers/scsi/cxgbi/cxgb3i/Kconfig @@ -0,0 +1,10 @@ +config SCSI_CXGB3_ISCSI + tristate "Chelsio T3 iSCSI support" + depends on PCI && INET && (IPV6 || IPV6=n) + select NETDEVICES + select ETHERNET + select NET_VENDOR_CHELSIO + select CHELSIO_T3 + select SCSI_ISCSI_ATTRS + ---help--- + This driver supports iSCSI offload for the Chelsio T3 devices. diff --git a/drivers/scsi/cxgbi/cxgb3i/cxgb3i.c b/drivers/scsi/cxgbi/cxgb3i/cxgb3i.c new file mode 100644 index 0000000..49692a1 --- /dev/null +++ b/drivers/scsi/cxgbi/cxgb3i/cxgb3i.c @@ -0,0 +1,1413 @@ +/* + * cxgb3i_offload.c: Chelsio S3xx iscsi offloaded tcp connection management + * + * Copyright (C) 2003-2008 Chelsio Communications. All rights reserved. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the LICENSE file included in this + * release for licensing terms and conditions. + * + * Written by: Dimitris Michailidis (dm@chelsio.com) + * Karen Xie (kxie@chelsio.com) + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ":%s: " fmt, __func__ + +#include +#include +#include + +#include "common.h" +#include "t3_cpl.h" +#include "t3cdev.h" +#include "cxgb3_defs.h" +#include "cxgb3_ctl_defs.h" +#include "cxgb3_offload.h" +#include "firmware_exports.h" +#include "cxgb3i.h" + +static unsigned int dbg_level; +#include "../libcxgbi.h" + +#define DRV_MODULE_NAME "cxgb3i" +#define DRV_MODULE_DESC "Chelsio T3 iSCSI Driver" +#define DRV_MODULE_VERSION "2.0.0" +#define DRV_MODULE_RELDATE "Jun. 2010" + +static char version[] = + DRV_MODULE_DESC " " DRV_MODULE_NAME + " v" DRV_MODULE_VERSION " (" DRV_MODULE_RELDATE ")\n"; + +MODULE_AUTHOR("Chelsio Communications, Inc."); +MODULE_DESCRIPTION(DRV_MODULE_DESC); +MODULE_VERSION(DRV_MODULE_VERSION); +MODULE_LICENSE("GPL"); + +module_param(dbg_level, uint, 0644); +MODULE_PARM_DESC(dbg_level, "debug flag (default=0)"); + +static int cxgb3i_rcv_win = 256 * 1024; +module_param(cxgb3i_rcv_win, int, 0644); +MODULE_PARM_DESC(cxgb3i_rcv_win, "TCP receive window in bytes (default=256KB)"); + +static int cxgb3i_snd_win = 128 * 1024; +module_param(cxgb3i_snd_win, int, 0644); +MODULE_PARM_DESC(cxgb3i_snd_win, "TCP send window in bytes (default=128KB)"); + +static int cxgb3i_rx_credit_thres = 10 * 1024; +module_param(cxgb3i_rx_credit_thres, int, 0644); +MODULE_PARM_DESC(rx_credit_thres, + "RX credits return threshold in bytes (default=10KB)"); + +static unsigned int cxgb3i_max_connect = 8 * 1024; +module_param(cxgb3i_max_connect, uint, 0644); +MODULE_PARM_DESC(cxgb3i_max_connect, "Max. # of connections (default=8092)"); + +static unsigned int cxgb3i_sport_base = 20000; +module_param(cxgb3i_sport_base, uint, 0644); +MODULE_PARM_DESC(cxgb3i_sport_base, "starting port number (default=20000)"); + +static void cxgb3i_dev_open(struct t3cdev *); +static void cxgb3i_dev_close(struct t3cdev *); +static void cxgb3i_dev_event_handler(struct t3cdev *, u32, u32); + +static struct cxgb3_client t3_client = { + .name = DRV_MODULE_NAME, + .handlers = cxgb3i_cpl_handlers, + .add = cxgb3i_dev_open, + .remove = cxgb3i_dev_close, + .event_handler = cxgb3i_dev_event_handler, +}; + +static struct scsi_host_template cxgb3i_host_template = { + .module = THIS_MODULE, + .name = DRV_MODULE_NAME, + .proc_name = DRV_MODULE_NAME, + .can_queue = CXGB3I_SCSI_HOST_QDEPTH, + .queuecommand = iscsi_queuecommand, + .change_queue_depth = iscsi_change_queue_depth, + .sg_tablesize = SG_ALL, + .max_sectors = 0xFFFF, + .cmd_per_lun = ISCSI_DEF_CMD_PER_LUN, + .eh_abort_handler = iscsi_eh_abort, + .eh_device_reset_handler = iscsi_eh_device_reset, + .eh_target_reset_handler = iscsi_eh_recover_target, + .target_alloc = iscsi_target_alloc, + .use_clustering = DISABLE_CLUSTERING, + .this_id = -1, +}; + +static struct iscsi_transport cxgb3i_iscsi_transport = { + .owner = THIS_MODULE, + .name = DRV_MODULE_NAME, + /* owner and name should be set already */ + .caps = CAP_RECOVERY_L0 | CAP_MULTI_R2T | CAP_HDRDGST + | CAP_DATADGST | CAP_DIGEST_OFFLOAD | + CAP_PADDING_OFFLOAD | CAP_TEXT_NEGO, + .attr_is_visible = cxgbi_attr_is_visible, + .get_host_param = cxgbi_get_host_param, + .set_host_param = cxgbi_set_host_param, + /* session management */ + .create_session = cxgbi_create_session, + .destroy_session = cxgbi_destroy_session, + .get_session_param = iscsi_session_get_param, + /* connection management */ + .create_conn = cxgbi_create_conn, + .bind_conn = cxgbi_bind_conn, + .destroy_conn = iscsi_tcp_conn_teardown, + .start_conn = iscsi_conn_start, + .stop_conn = iscsi_conn_stop, + .get_conn_param = iscsi_conn_get_param, + .set_param = cxgbi_set_conn_param, + .get_stats = cxgbi_get_conn_stats, + /* pdu xmit req from user space */ + .send_pdu = iscsi_conn_send_pdu, + /* task */ + .init_task = iscsi_tcp_task_init, + .xmit_task = iscsi_tcp_task_xmit, + .cleanup_task = cxgbi_cleanup_task, + /* pdu */ + .alloc_pdu = cxgbi_conn_alloc_pdu, + .init_pdu = cxgbi_conn_init_pdu, + .xmit_pdu = cxgbi_conn_xmit_pdu, + .parse_pdu_itt = cxgbi_parse_pdu_itt, + /* TCP connect/disconnect */ + .get_ep_param = cxgbi_get_ep_param, + .ep_connect = cxgbi_ep_connect, + .ep_poll = cxgbi_ep_poll, + .ep_disconnect = cxgbi_ep_disconnect, + /* Error recovery timeout call */ + .session_recovery_timedout = iscsi_session_recovery_timedout, +}; + +static struct scsi_transport_template *cxgb3i_stt; + +/* + * CPL (Chelsio Protocol Language) defines a message passing interface between + * the host driver and Chelsio asic. + * The section below implments CPLs that related to iscsi tcp connection + * open/close/abort and data send/receive. + */ + +static int push_tx_frames(struct cxgbi_sock *csk, int req_completion); + +static void send_act_open_req(struct cxgbi_sock *csk, struct sk_buff *skb, + const struct l2t_entry *e) +{ + unsigned int wscale = cxgbi_sock_compute_wscale(cxgb3i_rcv_win); + struct cpl_act_open_req *req = (struct cpl_act_open_req *)skb->head; + + skb->priority = CPL_PRIORITY_SETUP; + + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, csk->atid)); + req->local_port = csk->saddr.sin_port; + req->peer_port = csk->daddr.sin_port; + req->local_ip = csk->saddr.sin_addr.s_addr; + req->peer_ip = csk->daddr.sin_addr.s_addr; + + req->opt0h = htonl(V_KEEP_ALIVE(1) | F_TCAM_BYPASS | + V_WND_SCALE(wscale) | V_MSS_IDX(csk->mss_idx) | + V_L2T_IDX(e->idx) | V_TX_CHANNEL(e->smt_idx)); + req->opt0l = htonl(V_ULP_MODE(ULP2_MODE_ISCSI) | + V_RCV_BUFSIZ(cxgb3i_rcv_win>>10)); + + log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_SOCK, + "csk 0x%p,%u,0x%lx,%u, %pI4:%u-%pI4:%u, %u,%u,%u.\n", + csk, csk->state, csk->flags, csk->atid, + &req->local_ip, ntohs(req->local_port), + &req->peer_ip, ntohs(req->peer_port), + csk->mss_idx, e->idx, e->smt_idx); + + l2t_send(csk->cdev->lldev, skb, csk->l2t); +} + +static inline void act_open_arp_failure(struct t3cdev *dev, struct sk_buff *skb) +{ + cxgbi_sock_act_open_req_arp_failure(NULL, skb); +} + +/* + * CPL connection close request: host -> + * + * Close a connection by sending a CPL_CLOSE_CON_REQ message and queue it to + * the write queue (i.e., after any unsent txt data). + */ +static void send_close_req(struct cxgbi_sock *csk) +{ + struct sk_buff *skb = csk->cpl_close; + struct cpl_close_con_req *req = (struct cpl_close_con_req *)skb->head; + unsigned int tid = csk->tid; + + log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_SOCK, + "csk 0x%p,%u,0x%lx,%u.\n", + csk, csk->state, csk->flags, csk->tid); + + csk->cpl_close = NULL; + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_CLOSE_CON)); + req->wr.wr_lo = htonl(V_WR_TID(tid)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid)); + req->rsvd = htonl(csk->write_seq); + + cxgbi_sock_skb_entail(csk, skb); + if (csk->state >= CTP_ESTABLISHED) + push_tx_frames(csk, 1); +} + +/* + * CPL connection abort request: host -> + * + * Send an ABORT_REQ message. Makes sure we do not send multiple ABORT_REQs + * for the same connection and also that we do not try to send a message + * after the connection has closed. + */ +static void abort_arp_failure(struct t3cdev *tdev, struct sk_buff *skb) +{ + struct cpl_abort_req *req = cplhdr(skb); + + log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_SOCK, + "t3dev 0x%p, tid %u, skb 0x%p.\n", + tdev, GET_TID(req), skb); + req->cmd = CPL_ABORT_NO_RST; + cxgb3_ofld_send(tdev, skb); +} + +static void send_abort_req(struct cxgbi_sock *csk) +{ + struct sk_buff *skb = csk->cpl_abort_req; + struct cpl_abort_req *req; + + if (unlikely(csk->state == CTP_ABORTING || !skb)) + return; + cxgbi_sock_set_state(csk, CTP_ABORTING); + cxgbi_sock_set_flag(csk, CTPF_ABORT_RPL_PENDING); + /* Purge the send queue so we don't send anything after an abort. */ + cxgbi_sock_purge_write_queue(csk); + + csk->cpl_abort_req = NULL; + req = (struct cpl_abort_req *)skb->head; + skb->priority = CPL_PRIORITY_DATA; + set_arp_failure_handler(skb, abort_arp_failure); + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_REQ)); + req->wr.wr_lo = htonl(V_WR_TID(csk->tid)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ABORT_REQ, csk->tid)); + req->rsvd0 = htonl(csk->snd_nxt); + req->rsvd1 = !cxgbi_sock_flag(csk, CTPF_TX_DATA_SENT); + req->cmd = CPL_ABORT_SEND_RST; + + log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_SOCK, + "csk 0x%p,%u,0x%lx,%u, snd_nxt %u, 0x%x.\n", + csk, csk->state, csk->flags, csk->tid, csk->snd_nxt, + req->rsvd1); + + l2t_send(csk->cdev->lldev, skb, csk->l2t); +} + +/* + * CPL connection abort reply: host -> + * + * Send an ABORT_RPL message in response of the ABORT_REQ received. + */ +static void send_abort_rpl(struct cxgbi_sock *csk, int rst_status) +{ + struct sk_buff *skb = csk->cpl_abort_rpl; + struct cpl_abort_rpl *rpl = (struct cpl_abort_rpl *)skb->head; + + log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_SOCK, + "csk 0x%p,%u,0x%lx,%u, status %d.\n", + csk, csk->state, csk->flags, csk->tid, rst_status); + + csk->cpl_abort_rpl = NULL; + skb->priority = CPL_PRIORITY_DATA; + rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_RPL)); + rpl->wr.wr_lo = htonl(V_WR_TID(csk->tid)); + OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_ABORT_RPL, csk->tid)); + rpl->cmd = rst_status; + cxgb3_ofld_send(csk->cdev->lldev, skb); +} + +/* + * CPL connection rx data ack: host -> + * Send RX credits through an RX_DATA_ACK CPL message. Returns the number of + * credits sent. + */ +static u32 send_rx_credits(struct cxgbi_sock *csk, u32 credits) +{ + struct sk_buff *skb; + struct cpl_rx_data_ack *req; + u32 dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1); + + log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_PDU_RX, + "csk 0x%p,%u,0x%lx,%u, credit %u, dack %u.\n", + csk, csk->state, csk->flags, csk->tid, credits, dack); + + skb = alloc_wr(sizeof(*req), 0, GFP_ATOMIC); + if (!skb) { + pr_info("csk 0x%p, credit %u, OOM.\n", csk, credits); + return 0; + } + req = (struct cpl_rx_data_ack *)skb->head; + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, csk->tid)); + req->credit_dack = htonl(F_RX_DACK_CHANGE | V_RX_DACK_MODE(1) | + V_RX_CREDITS(credits)); + skb->priority = CPL_PRIORITY_ACK; + cxgb3_ofld_send(csk->cdev->lldev, skb); + return credits; +} + +/* + * CPL connection tx data: host -> + * + * Send iscsi PDU via TX_DATA CPL message. Returns the number of + * credits sent. + * Each TX_DATA consumes work request credit (wrs), so we need to keep track of + * how many we've used so far and how many are pending (i.e., yet ack'ed by T3). + */ + +static unsigned int wrlen __read_mostly; +static unsigned int skb_wrs[SKB_WR_LIST_SIZE] __read_mostly; + +static void init_wr_tab(unsigned int wr_len) +{ + int i; + + if (skb_wrs[1]) /* already initialized */ + return; + for (i = 1; i < SKB_WR_LIST_SIZE; i++) { + int sgl_len = (3 * i) / 2 + (i & 1); + + sgl_len += 3; + skb_wrs[i] = (sgl_len <= wr_len + ? 1 : 1 + (sgl_len - 2) / (wr_len - 1)); + } + wrlen = wr_len * 8; +} + +static inline void make_tx_data_wr(struct cxgbi_sock *csk, struct sk_buff *skb, + int len, int req_completion) +{ + struct tx_data_wr *req; + struct l2t_entry *l2t = csk->l2t; + + skb_reset_transport_header(skb); + req = (struct tx_data_wr *)__skb_push(skb, sizeof(*req)); + req->wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA) | + (req_completion ? F_WR_COMPL : 0)); + req->wr_lo = htonl(V_WR_TID(csk->tid)); + /* len includes the length of any HW ULP additions */ + req->len = htonl(len); + /* V_TX_ULP_SUBMODE sets both the mode and submode */ + req->flags = htonl(V_TX_ULP_SUBMODE(cxgbi_skcb_ulp_mode(skb)) | + V_TX_SHOVE((skb_peek(&csk->write_queue) ? 0 : 1))); + req->sndseq = htonl(csk->snd_nxt); + req->param = htonl(V_TX_PORT(l2t->smt_idx)); + + if (!cxgbi_sock_flag(csk, CTPF_TX_DATA_SENT)) { + req->flags |= htonl(V_TX_ACK_PAGES(2) | F_TX_INIT | + V_TX_CPU_IDX(csk->rss_qid)); + /* sendbuffer is in units of 32KB. */ + req->param |= htonl(V_TX_SNDBUF(cxgb3i_snd_win >> 15)); + cxgbi_sock_set_flag(csk, CTPF_TX_DATA_SENT); + } +} + +/** + * push_tx_frames -- start transmit + * @c3cn: the offloaded connection + * @req_completion: request wr_ack or not + * + * Prepends TX_DATA_WR or CPL_CLOSE_CON_REQ headers to buffers waiting in a + * connection's send queue and sends them on to T3. Must be called with the + * connection's lock held. Returns the amount of send buffer space that was + * freed as a result of sending queued data to T3. + */ + +static void arp_failure_skb_discard(struct t3cdev *dev, struct sk_buff *skb) +{ + kfree_skb(skb); +} + +static int push_tx_frames(struct cxgbi_sock *csk, int req_completion) +{ + int total_size = 0; + struct sk_buff *skb; + + if (unlikely(csk->state < CTP_ESTABLISHED || + csk->state == CTP_CLOSE_WAIT_1 || csk->state >= CTP_ABORTING)) { + log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_PDU_TX, + "csk 0x%p,%u,0x%lx,%u, in closing state.\n", + csk, csk->state, csk->flags, csk->tid); + return 0; + } + + while (csk->wr_cred && (skb = skb_peek(&csk->write_queue)) != NULL) { + int len = skb->len; /* length before skb_push */ + int frags = skb_shinfo(skb)->nr_frags + (len != skb->data_len); + int wrs_needed = skb_wrs[frags]; + + if (wrs_needed > 1 && len + sizeof(struct tx_data_wr) <= wrlen) + wrs_needed = 1; + + WARN_ON(frags >= SKB_WR_LIST_SIZE || wrs_needed < 1); + + if (csk->wr_cred < wrs_needed) { + log_debug(1 << CXGBI_DBG_PDU_TX, + "csk 0x%p, skb len %u/%u, frag %u, wr %d<%u.\n", + csk, skb->len, skb->data_len, frags, + wrs_needed, csk->wr_cred); + break; + } + + __skb_unlink(skb, &csk->write_queue); + skb->priority = CPL_PRIORITY_DATA; + skb->csum = wrs_needed; /* remember this until the WR_ACK */ + csk->wr_cred -= wrs_needed; + csk->wr_una_cred += wrs_needed; + cxgbi_sock_enqueue_wr(csk, skb); + + log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_PDU_TX, + "csk 0x%p, enqueue, skb len %u/%u, frag %u, wr %d, " + "left %u, unack %u.\n", + csk, skb->len, skb->data_len, frags, skb->csum, + csk->wr_cred, csk->wr_una_cred); + + if (likely(cxgbi_skcb_test_flag(skb, SKCBF_TX_NEED_HDR))) { + if ((req_completion && + csk->wr_una_cred == wrs_needed) || + csk->wr_una_cred >= csk->wr_max_cred / 2) { + req_completion = 1; + csk->wr_una_cred = 0; + } + len += cxgbi_ulp_extra_len(cxgbi_skcb_ulp_mode(skb)); + make_tx_data_wr(csk, skb, len, req_completion); + csk->snd_nxt += len; + cxgbi_skcb_clear_flag(skb, SKCBF_TX_NEED_HDR); + } + total_size += skb->truesize; + log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_PDU_TX, + "csk 0x%p, tid 0x%x, send skb 0x%p.\n", + csk, csk->tid, skb); + set_arp_failure_handler(skb, arp_failure_skb_discard); + l2t_send(csk->cdev->lldev, skb, csk->l2t); + } + return total_size; +} + +/* + * Process a CPL_ACT_ESTABLISH message: -> host + * Updates connection state from an active establish CPL message. Runs with + * the connection lock held. + */ + +static inline void free_atid(struct cxgbi_sock *csk) +{ + if (cxgbi_sock_flag(csk, CTPF_HAS_ATID)) { + cxgb3_free_atid(csk->cdev->lldev, csk->atid); + cxgbi_sock_clear_flag(csk, CTPF_HAS_ATID); + cxgbi_sock_put(csk); + } +} + +static int do_act_establish(struct t3cdev *tdev, struct sk_buff *skb, void *ctx) +{ + struct cxgbi_sock *csk = ctx; + struct cpl_act_establish *req = cplhdr(skb); + unsigned int tid = GET_TID(req); + unsigned int atid = G_PASS_OPEN_TID(ntohl(req->tos_tid)); + u32 rcv_isn = ntohl(req->rcv_isn); /* real RCV_ISN + 1 */ + + log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_SOCK, + "atid 0x%x,tid 0x%x, csk 0x%p,%u,0x%lx, isn %u.\n", + atid, atid, csk, csk->state, csk->flags, rcv_isn); + + cxgbi_sock_get(csk); + cxgbi_sock_set_flag(csk, CTPF_HAS_TID); + csk->tid = tid; + cxgb3_insert_tid(csk->cdev->lldev, &t3_client, csk, tid); + + free_atid(csk); + + csk->rss_qid = G_QNUM(ntohs(skb->csum)); + + spin_lock_bh(&csk->lock); + if (csk->retry_timer.function) { + del_timer(&csk->retry_timer); + csk->retry_timer.function = NULL; + } + + if (unlikely(csk->state != CTP_ACTIVE_OPEN)) + pr_info("csk 0x%p,%u,0x%lx,%u, got EST.\n", + csk, csk->state, csk->flags, csk->tid); + + csk->copied_seq = csk->rcv_wup = csk->rcv_nxt = rcv_isn; + if (cxgb3i_rcv_win > (M_RCV_BUFSIZ << 10)) + csk->rcv_wup -= cxgb3i_rcv_win - (M_RCV_BUFSIZ << 10); + + cxgbi_sock_established(csk, ntohl(req->snd_isn), ntohs(req->tcp_opt)); + + if (unlikely(cxgbi_sock_flag(csk, CTPF_ACTIVE_CLOSE_NEEDED))) + /* upper layer has requested closing */ + send_abort_req(csk); + else { + if (skb_queue_len(&csk->write_queue)) + push_tx_frames(csk, 1); + cxgbi_conn_tx_open(csk); + } + + spin_unlock_bh(&csk->lock); + __kfree_skb(skb); + return 0; +} + +/* + * Process a CPL_ACT_OPEN_RPL message: -> host + * Handle active open failures. + */ +static int act_open_rpl_status_to_errno(int status) +{ + switch (status) { + case CPL_ERR_CONN_RESET: + return -ECONNREFUSED; + case CPL_ERR_ARP_MISS: + return -EHOSTUNREACH; + case CPL_ERR_CONN_TIMEDOUT: + return -ETIMEDOUT; + case CPL_ERR_TCAM_FULL: + return -ENOMEM; + case CPL_ERR_CONN_EXIST: + return -EADDRINUSE; + default: + return -EIO; + } +} + +static void act_open_retry_timer(unsigned long data) +{ + struct sk_buff *skb; + struct cxgbi_sock *csk = (struct cxgbi_sock *)data; + + log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_SOCK, + "csk 0x%p,%u,0x%lx,%u.\n", + csk, csk->state, csk->flags, csk->tid); + + cxgbi_sock_get(csk); + spin_lock_bh(&csk->lock); + skb = alloc_wr(sizeof(struct cpl_act_open_req), 0, GFP_ATOMIC); + if (!skb) + cxgbi_sock_fail_act_open(csk, -ENOMEM); + else { + skb->sk = (struct sock *)csk; + set_arp_failure_handler(skb, act_open_arp_failure); + send_act_open_req(csk, skb, csk->l2t); + } + spin_unlock_bh(&csk->lock); + cxgbi_sock_put(csk); +} + +static int do_act_open_rpl(struct t3cdev *tdev, struct sk_buff *skb, void *ctx) +{ + struct cxgbi_sock *csk = ctx; + struct cpl_act_open_rpl *rpl = cplhdr(skb); + + pr_info("csk 0x%p,%u,0x%lx,%u, status %u, %pI4:%u-%pI4:%u.\n", + csk, csk->state, csk->flags, csk->atid, rpl->status, + &csk->saddr.sin_addr.s_addr, ntohs(csk->saddr.sin_port), + &csk->daddr.sin_addr.s_addr, ntohs(csk->daddr.sin_port)); + + if (rpl->status != CPL_ERR_TCAM_FULL && + rpl->status != CPL_ERR_CONN_EXIST && + rpl->status != CPL_ERR_ARP_MISS) + cxgb3_queue_tid_release(tdev, GET_TID(rpl)); + + cxgbi_sock_get(csk); + spin_lock_bh(&csk->lock); + if (rpl->status == CPL_ERR_CONN_EXIST && + csk->retry_timer.function != act_open_retry_timer) { + csk->retry_timer.function = act_open_retry_timer; + mod_timer(&csk->retry_timer, jiffies + HZ / 2); + } else + cxgbi_sock_fail_act_open(csk, + act_open_rpl_status_to_errno(rpl->status)); + + spin_unlock_bh(&csk->lock); + cxgbi_sock_put(csk); + __kfree_skb(skb); + return 0; +} + +/* + * Process PEER_CLOSE CPL messages: -> host + * Handle peer FIN. + */ +static int do_peer_close(struct t3cdev *cdev, struct sk_buff *skb, void *ctx) +{ + struct cxgbi_sock *csk = ctx; + + log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_SOCK, + "csk 0x%p,%u,0x%lx,%u.\n", + csk, csk->state, csk->flags, csk->tid); + + cxgbi_sock_rcv_peer_close(csk); + __kfree_skb(skb); + return 0; +} + +/* + * Process CLOSE_CONN_RPL CPL message: -> host + * Process a peer ACK to our FIN. + */ +static int do_close_con_rpl(struct t3cdev *cdev, struct sk_buff *skb, + void *ctx) +{ + struct cxgbi_sock *csk = ctx; + struct cpl_close_con_rpl *rpl = cplhdr(skb); + + log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_SOCK, + "csk 0x%p,%u,0x%lx,%u, snxt %u.\n", + csk, csk->state, csk->flags, csk->tid, ntohl(rpl->snd_nxt)); + + cxgbi_sock_rcv_close_conn_rpl(csk, ntohl(rpl->snd_nxt)); + __kfree_skb(skb); + return 0; +} + +/* + * Process ABORT_REQ_RSS CPL message: -> host + * Process abort requests. If we are waiting for an ABORT_RPL we ignore this + * request except that we need to reply to it. + */ + +static int abort_status_to_errno(struct cxgbi_sock *csk, int abort_reason, + int *need_rst) +{ + switch (abort_reason) { + case CPL_ERR_BAD_SYN: /* fall through */ + case CPL_ERR_CONN_RESET: + return csk->state > CTP_ESTABLISHED ? -EPIPE : -ECONNRESET; + case CPL_ERR_XMIT_TIMEDOUT: + case CPL_ERR_PERSIST_TIMEDOUT: + case CPL_ERR_FINWAIT2_TIMEDOUT: + case CPL_ERR_KEEPALIVE_TIMEDOUT: + return -ETIMEDOUT; + default: + return -EIO; + } +} + +static int do_abort_req(struct t3cdev *cdev, struct sk_buff *skb, void *ctx) +{ + const struct cpl_abort_req_rss *req = cplhdr(skb); + struct cxgbi_sock *csk = ctx; + int rst_status = CPL_ABORT_NO_RST; + + log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_SOCK, + "csk 0x%p,%u,0x%lx,%u.\n", + csk, csk->state, csk->flags, csk->tid); + + if (req->status == CPL_ERR_RTX_NEG_ADVICE || + req->status == CPL_ERR_PERSIST_NEG_ADVICE) { + goto done; + } + + cxgbi_sock_get(csk); + spin_lock_bh(&csk->lock); + + if (!cxgbi_sock_flag(csk, CTPF_ABORT_REQ_RCVD)) { + cxgbi_sock_set_flag(csk, CTPF_ABORT_REQ_RCVD); + cxgbi_sock_set_state(csk, CTP_ABORTING); + goto out; + } + + cxgbi_sock_clear_flag(csk, CTPF_ABORT_REQ_RCVD); + send_abort_rpl(csk, rst_status); + + if (!cxgbi_sock_flag(csk, CTPF_ABORT_RPL_PENDING)) { + csk->err = abort_status_to_errno(csk, req->status, &rst_status); + cxgbi_sock_closed(csk); + } + +out: + spin_unlock_bh(&csk->lock); + cxgbi_sock_put(csk); +done: + __kfree_skb(skb); + return 0; +} + +/* + * Process ABORT_RPL_RSS CPL message: -> host + * Process abort replies. We only process these messages if we anticipate + * them as the coordination between SW and HW in this area is somewhat lacking + * and sometimes we get ABORT_RPLs after we are done with the connection that + * originated the ABORT_REQ. + */ +static int do_abort_rpl(struct t3cdev *cdev, struct sk_buff *skb, void *ctx) +{ + struct cpl_abort_rpl_rss *rpl = cplhdr(skb); + struct cxgbi_sock *csk = ctx; + + log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_SOCK, + "status 0x%x, csk 0x%p, s %u, 0x%lx.\n", + rpl->status, csk, csk ? csk->state : 0, + csk ? csk->flags : 0UL); + /* + * Ignore replies to post-close aborts indicating that the abort was + * requested too late. These connections are terminated when we get + * PEER_CLOSE or CLOSE_CON_RPL and by the time the abort_rpl_rss + * arrives the TID is either no longer used or it has been recycled. + */ + if (rpl->status == CPL_ERR_ABORT_FAILED) + goto rel_skb; + /* + * Sometimes we've already closed the connection, e.g., a post-close + * abort races with ABORT_REQ_RSS, the latter frees the connection + * expecting the ABORT_REQ will fail with CPL_ERR_ABORT_FAILED, + * but FW turns the ABORT_REQ into a regular one and so we get + * ABORT_RPL_RSS with status 0 and no connection. + */ + if (csk) + cxgbi_sock_rcv_abort_rpl(csk); +rel_skb: + __kfree_skb(skb); + return 0; +} + +/* + * Process RX_ISCSI_HDR CPL message: -> host + * Handle received PDUs, the payload could be DDP'ed. If not, the payload + * follow after the bhs. + */ +static int do_iscsi_hdr(struct t3cdev *t3dev, struct sk_buff *skb, void *ctx) +{ + struct cxgbi_sock *csk = ctx; + struct cpl_iscsi_hdr *hdr_cpl = cplhdr(skb); + struct cpl_iscsi_hdr_norss data_cpl; + struct cpl_rx_data_ddp_norss ddp_cpl; + unsigned int hdr_len, data_len, status; + unsigned int len; + int err; + + log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_PDU_RX, + "csk 0x%p,%u,0x%lx,%u, skb 0x%p,%u.\n", + csk, csk->state, csk->flags, csk->tid, skb, skb->len); + + spin_lock_bh(&csk->lock); + + if (unlikely(csk->state >= CTP_PASSIVE_CLOSE)) { + log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_SOCK, + "csk 0x%p,%u,0x%lx,%u, bad state.\n", + csk, csk->state, csk->flags, csk->tid); + if (csk->state != CTP_ABORTING) + goto abort_conn; + else + goto discard; + } + + cxgbi_skcb_tcp_seq(skb) = ntohl(hdr_cpl->seq); + cxgbi_skcb_flags(skb) = 0; + + skb_reset_transport_header(skb); + __skb_pull(skb, sizeof(struct cpl_iscsi_hdr)); + + len = hdr_len = ntohs(hdr_cpl->len); + /* msg coalesce is off or not enough data received */ + if (skb->len <= hdr_len) { + pr_err("%s: tid %u, CPL_ISCSI_HDR, skb len %u < %u.\n", + csk->cdev->ports[csk->port_id]->name, csk->tid, + skb->len, hdr_len); + goto abort_conn; + } + cxgbi_skcb_set_flag(skb, SKCBF_RX_COALESCED); + + err = skb_copy_bits(skb, skb->len - sizeof(ddp_cpl), &ddp_cpl, + sizeof(ddp_cpl)); + if (err < 0) { + pr_err("%s: tid %u, copy cpl_ddp %u-%zu failed %d.\n", + csk->cdev->ports[csk->port_id]->name, csk->tid, + skb->len, sizeof(ddp_cpl), err); + goto abort_conn; + } + + cxgbi_skcb_set_flag(skb, SKCBF_RX_STATUS); + cxgbi_skcb_rx_pdulen(skb) = ntohs(ddp_cpl.len); + cxgbi_skcb_rx_ddigest(skb) = ntohl(ddp_cpl.ulp_crc); + status = ntohl(ddp_cpl.ddp_status); + + log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_PDU_RX, + "csk 0x%p, skb 0x%p,%u, pdulen %u, status 0x%x.\n", + csk, skb, skb->len, cxgbi_skcb_rx_pdulen(skb), status); + + if (status & (1 << CPL_RX_DDP_STATUS_HCRC_SHIFT)) + cxgbi_skcb_set_flag(skb, SKCBF_RX_HCRC_ERR); + if (status & (1 << CPL_RX_DDP_STATUS_DCRC_SHIFT)) + cxgbi_skcb_set_flag(skb, SKCBF_RX_DCRC_ERR); + if (status & (1 << CPL_RX_DDP_STATUS_PAD_SHIFT)) + cxgbi_skcb_set_flag(skb, SKCBF_RX_PAD_ERR); + + if (skb->len > (hdr_len + sizeof(ddp_cpl))) { + err = skb_copy_bits(skb, hdr_len, &data_cpl, sizeof(data_cpl)); + if (err < 0) { + pr_err("%s: tid %u, cp %zu/%u failed %d.\n", + csk->cdev->ports[csk->port_id]->name, + csk->tid, sizeof(data_cpl), skb->len, err); + goto abort_conn; + } + data_len = ntohs(data_cpl.len); + log_debug(1 << CXGBI_DBG_DDP | 1 << CXGBI_DBG_PDU_RX, + "skb 0x%p, pdu not ddp'ed %u/%u, status 0x%x.\n", + skb, data_len, cxgbi_skcb_rx_pdulen(skb), status); + len += sizeof(data_cpl) + data_len; + } else if (status & (1 << CPL_RX_DDP_STATUS_DDP_SHIFT)) + cxgbi_skcb_set_flag(skb, SKCBF_RX_DATA_DDPD); + + csk->rcv_nxt = ntohl(ddp_cpl.seq) + cxgbi_skcb_rx_pdulen(skb); + __pskb_trim(skb, len); + __skb_queue_tail(&csk->receive_queue, skb); + cxgbi_conn_pdu_ready(csk); + + spin_unlock_bh(&csk->lock); + return 0; + +abort_conn: + send_abort_req(csk); +discard: + spin_unlock_bh(&csk->lock); + __kfree_skb(skb); + return 0; +} + +/* + * Process TX_DATA_ACK CPL messages: -> host + * Process an acknowledgment of WR completion. Advance snd_una and send the + * next batch of work requests from the write queue. + */ +static int do_wr_ack(struct t3cdev *cdev, struct sk_buff *skb, void *ctx) +{ + struct cxgbi_sock *csk = ctx; + struct cpl_wr_ack *hdr = cplhdr(skb); + + log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_PDU_RX, + "csk 0x%p,%u,0x%lx,%u, cr %u.\n", + csk, csk->state, csk->flags, csk->tid, ntohs(hdr->credits)); + + cxgbi_sock_rcv_wr_ack(csk, ntohs(hdr->credits), ntohl(hdr->snd_una), 1); + __kfree_skb(skb); + return 0; +} + +/* + * for each connection, pre-allocate skbs needed for close/abort requests. So + * that we can service the request right away. + */ +static int alloc_cpls(struct cxgbi_sock *csk) +{ + csk->cpl_close = alloc_wr(sizeof(struct cpl_close_con_req), 0, + GFP_KERNEL); + if (!csk->cpl_close) + return -ENOMEM; + csk->cpl_abort_req = alloc_wr(sizeof(struct cpl_abort_req), 0, + GFP_KERNEL); + if (!csk->cpl_abort_req) + goto free_cpl_skbs; + + csk->cpl_abort_rpl = alloc_wr(sizeof(struct cpl_abort_rpl), 0, + GFP_KERNEL); + if (!csk->cpl_abort_rpl) + goto free_cpl_skbs; + + return 0; + +free_cpl_skbs: + cxgbi_sock_free_cpl_skbs(csk); + return -ENOMEM; +} + +/** + * release_offload_resources - release offload resource + * @c3cn: the offloaded iscsi tcp connection. + * Release resources held by an offload connection (TID, L2T entry, etc.) + */ +static void l2t_put(struct cxgbi_sock *csk) +{ + struct t3cdev *t3dev = (struct t3cdev *)csk->cdev->lldev; + + if (csk->l2t) { + l2t_release(t3dev, csk->l2t); + csk->l2t = NULL; + cxgbi_sock_put(csk); + } +} + +static void release_offload_resources(struct cxgbi_sock *csk) +{ + struct t3cdev *t3dev = (struct t3cdev *)csk->cdev->lldev; + + log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_SOCK, + "csk 0x%p,%u,0x%lx,%u.\n", + csk, csk->state, csk->flags, csk->tid); + + csk->rss_qid = 0; + cxgbi_sock_free_cpl_skbs(csk); + + if (csk->wr_cred != csk->wr_max_cred) { + cxgbi_sock_purge_wr_queue(csk); + cxgbi_sock_reset_wr_list(csk); + } + l2t_put(csk); + if (cxgbi_sock_flag(csk, CTPF_HAS_ATID)) + free_atid(csk); + else if (cxgbi_sock_flag(csk, CTPF_HAS_TID)) { + cxgb3_remove_tid(t3dev, (void *)csk, csk->tid); + cxgbi_sock_clear_flag(csk, CTPF_HAS_TID); + cxgbi_sock_put(csk); + } + csk->dst = NULL; + csk->cdev = NULL; +} + +static void update_address(struct cxgbi_hba *chba) +{ + if (chba->ipv4addr) { + if (chba->vdev && + chba->ipv4addr != cxgb3i_get_private_ipv4addr(chba->vdev)) { + cxgb3i_set_private_ipv4addr(chba->vdev, chba->ipv4addr); + cxgb3i_set_private_ipv4addr(chba->ndev, 0); + pr_info("%s set %pI4.\n", + chba->vdev->name, &chba->ipv4addr); + } else if (chba->ipv4addr != + cxgb3i_get_private_ipv4addr(chba->ndev)) { + cxgb3i_set_private_ipv4addr(chba->ndev, chba->ipv4addr); + pr_info("%s set %pI4.\n", + chba->ndev->name, &chba->ipv4addr); + } + } else if (cxgb3i_get_private_ipv4addr(chba->ndev)) { + if (chba->vdev) + cxgb3i_set_private_ipv4addr(chba->vdev, 0); + cxgb3i_set_private_ipv4addr(chba->ndev, 0); + } +} + +static int init_act_open(struct cxgbi_sock *csk) +{ + struct dst_entry *dst = csk->dst; + struct cxgbi_device *cdev = csk->cdev; + struct t3cdev *t3dev = (struct t3cdev *)cdev->lldev; + struct net_device *ndev = cdev->ports[csk->port_id]; + struct cxgbi_hba *chba = cdev->hbas[csk->port_id]; + struct sk_buff *skb = NULL; + + log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_SOCK, + "csk 0x%p,%u,0x%lx.\n", csk, csk->state, csk->flags); + + update_address(chba); + if (chba->ipv4addr) + csk->saddr.sin_addr.s_addr = chba->ipv4addr; + + csk->rss_qid = 0; + csk->l2t = t3_l2t_get(t3dev, dst, ndev, + &csk->daddr.sin_addr.s_addr); + if (!csk->l2t) { + pr_err("NO l2t available.\n"); + return -EINVAL; + } + cxgbi_sock_get(csk); + + csk->atid = cxgb3_alloc_atid(t3dev, &t3_client, csk); + if (csk->atid < 0) { + pr_err("NO atid available.\n"); + goto rel_resource; + } + cxgbi_sock_set_flag(csk, CTPF_HAS_ATID); + cxgbi_sock_get(csk); + + skb = alloc_wr(sizeof(struct cpl_act_open_req), 0, GFP_KERNEL); + if (!skb) + goto rel_resource; + skb->sk = (struct sock *)csk; + set_arp_failure_handler(skb, act_open_arp_failure); + + csk->wr_max_cred = csk->wr_cred = T3C_DATA(t3dev)->max_wrs - 1; + csk->wr_una_cred = 0; + csk->mss_idx = cxgbi_sock_select_mss(csk, dst_mtu(dst)); + cxgbi_sock_reset_wr_list(csk); + csk->err = 0; + + log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_SOCK, + "csk 0x%p,%u,0x%lx, %pI4:%u-%pI4:%u.\n", + csk, csk->state, csk->flags, + &csk->saddr.sin_addr.s_addr, ntohs(csk->saddr.sin_port), + &csk->daddr.sin_addr.s_addr, ntohs(csk->daddr.sin_port)); + + cxgbi_sock_set_state(csk, CTP_ACTIVE_OPEN); + send_act_open_req(csk, skb, csk->l2t); + return 0; + +rel_resource: + if (skb) + __kfree_skb(skb); + return -EINVAL; +} + +cxgb3_cpl_handler_func cxgb3i_cpl_handlers[NUM_CPL_CMDS] = { + [CPL_ACT_ESTABLISH] = do_act_establish, + [CPL_ACT_OPEN_RPL] = do_act_open_rpl, + [CPL_PEER_CLOSE] = do_peer_close, + [CPL_ABORT_REQ_RSS] = do_abort_req, + [CPL_ABORT_RPL_RSS] = do_abort_rpl, + [CPL_CLOSE_CON_RPL] = do_close_con_rpl, + [CPL_TX_DMA_ACK] = do_wr_ack, + [CPL_ISCSI_HDR] = do_iscsi_hdr, +}; + +/** + * cxgb3i_ofld_init - allocate and initialize resources for each adapter found + * @cdev: cxgbi adapter + */ +int cxgb3i_ofld_init(struct cxgbi_device *cdev) +{ + struct t3cdev *t3dev = (struct t3cdev *)cdev->lldev; + struct adap_ports port; + struct ofld_page_info rx_page_info; + unsigned int wr_len; + int rc; + + if (t3dev->ctl(t3dev, GET_WR_LEN, &wr_len) < 0 || + t3dev->ctl(t3dev, GET_PORTS, &port) < 0 || + t3dev->ctl(t3dev, GET_RX_PAGE_INFO, &rx_page_info) < 0) { + pr_warn("t3 0x%p, offload up, ioctl failed.\n", t3dev); + return -EINVAL; + } + + if (cxgb3i_max_connect > CXGBI_MAX_CONN) + cxgb3i_max_connect = CXGBI_MAX_CONN; + + rc = cxgbi_device_portmap_create(cdev, cxgb3i_sport_base, + cxgb3i_max_connect); + if (rc < 0) + return rc; + + init_wr_tab(wr_len); + cdev->csk_release_offload_resources = release_offload_resources; + cdev->csk_push_tx_frames = push_tx_frames; + cdev->csk_send_abort_req = send_abort_req; + cdev->csk_send_close_req = send_close_req; + cdev->csk_send_rx_credits = send_rx_credits; + cdev->csk_alloc_cpls = alloc_cpls; + cdev->csk_init_act_open = init_act_open; + + pr_info("cdev 0x%p, offload up, added.\n", cdev); + return 0; +} + +/* + * functions to program the pagepod in h/w + */ +static inline void ulp_mem_io_set_hdr(struct sk_buff *skb, unsigned int addr) +{ + struct ulp_mem_io *req = (struct ulp_mem_io *)skb->head; + + memset(req, 0, sizeof(*req)); + + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS)); + req->cmd_lock_addr = htonl(V_ULP_MEMIO_ADDR(addr >> 5) | + V_ULPTX_CMD(ULP_MEM_WRITE)); + req->len = htonl(V_ULP_MEMIO_DATA_LEN(PPOD_SIZE >> 5) | + V_ULPTX_NFLITS((PPOD_SIZE >> 3) + 1)); +} + +static int ddp_set_map(struct cxgbi_sock *csk, struct cxgbi_pagepod_hdr *hdr, + unsigned int idx, unsigned int npods, + struct cxgbi_gather_list *gl) +{ + struct cxgbi_device *cdev = csk->cdev; + struct cxgbi_ddp_info *ddp = cdev->ddp; + unsigned int pm_addr = (idx << PPOD_SIZE_SHIFT) + ddp->llimit; + int i; + + log_debug(1 << CXGBI_DBG_DDP, + "csk 0x%p, idx %u, npods %u, gl 0x%p.\n", + csk, idx, npods, gl); + + for (i = 0; i < npods; i++, idx++, pm_addr += PPOD_SIZE) { + struct sk_buff *skb = alloc_wr(sizeof(struct ulp_mem_io) + + PPOD_SIZE, 0, GFP_ATOMIC); + + if (!skb) + return -ENOMEM; + + ulp_mem_io_set_hdr(skb, pm_addr); + cxgbi_ddp_ppod_set((struct cxgbi_pagepod *)(skb->head + + sizeof(struct ulp_mem_io)), + hdr, gl, i * PPOD_PAGES_MAX); + skb->priority = CPL_PRIORITY_CONTROL; + cxgb3_ofld_send(cdev->lldev, skb); + } + return 0; +} + +static void ddp_clear_map(struct cxgbi_hba *chba, unsigned int tag, + unsigned int idx, unsigned int npods) +{ + struct cxgbi_device *cdev = chba->cdev; + struct cxgbi_ddp_info *ddp = cdev->ddp; + unsigned int pm_addr = (idx << PPOD_SIZE_SHIFT) + ddp->llimit; + int i; + + log_debug(1 << CXGBI_DBG_DDP, + "cdev 0x%p, idx %u, npods %u, tag 0x%x.\n", + cdev, idx, npods, tag); + + for (i = 0; i < npods; i++, idx++, pm_addr += PPOD_SIZE) { + struct sk_buff *skb = alloc_wr(sizeof(struct ulp_mem_io) + + PPOD_SIZE, 0, GFP_ATOMIC); + + if (!skb) { + pr_err("tag 0x%x, 0x%x, %d/%u, skb OOM.\n", + tag, idx, i, npods); + continue; + } + ulp_mem_io_set_hdr(skb, pm_addr); + skb->priority = CPL_PRIORITY_CONTROL; + cxgb3_ofld_send(cdev->lldev, skb); + } +} + +static int ddp_setup_conn_pgidx(struct cxgbi_sock *csk, + unsigned int tid, int pg_idx, bool reply) +{ + struct sk_buff *skb = alloc_wr(sizeof(struct cpl_set_tcb_field), 0, + GFP_KERNEL); + struct cpl_set_tcb_field *req; + u64 val = pg_idx < DDP_PGIDX_MAX ? pg_idx : 0; + + log_debug(1 << CXGBI_DBG_DDP, + "csk 0x%p, tid %u, pg_idx %d.\n", csk, tid, pg_idx); + if (!skb) + return -ENOMEM; + + /* set up ulp submode and page size */ + req = (struct cpl_set_tcb_field *)skb->head; + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, tid)); + req->reply = V_NO_REPLY(reply ? 0 : 1); + req->cpu_idx = 0; + req->word = htons(31); + req->mask = cpu_to_be64(0xF0000000); + req->val = cpu_to_be64(val << 28); + skb->priority = CPL_PRIORITY_CONTROL; + + cxgb3_ofld_send(csk->cdev->lldev, skb); + return 0; +} + +/** + * cxgb3i_setup_conn_digest - setup conn. digest setting + * @csk: cxgb tcp socket + * @tid: connection id + * @hcrc: header digest enabled + * @dcrc: data digest enabled + * @reply: request reply from h/w + * set up the iscsi digest settings for a connection identified by tid + */ +static int ddp_setup_conn_digest(struct cxgbi_sock *csk, unsigned int tid, + int hcrc, int dcrc, int reply) +{ + struct sk_buff *skb = alloc_wr(sizeof(struct cpl_set_tcb_field), 0, + GFP_KERNEL); + struct cpl_set_tcb_field *req; + u64 val = (hcrc ? 1 : 0) | (dcrc ? 2 : 0); + + log_debug(1 << CXGBI_DBG_DDP, + "csk 0x%p, tid %u, crc %d,%d.\n", csk, tid, hcrc, dcrc); + if (!skb) + return -ENOMEM; + + /* set up ulp submode and page size */ + req = (struct cpl_set_tcb_field *)skb->head; + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, tid)); + req->reply = V_NO_REPLY(reply ? 0 : 1); + req->cpu_idx = 0; + req->word = htons(31); + req->mask = cpu_to_be64(0x0F000000); + req->val = cpu_to_be64(val << 24); + skb->priority = CPL_PRIORITY_CONTROL; + + cxgb3_ofld_send(csk->cdev->lldev, skb); + return 0; +} + +/** + * t3_ddp_cleanup - release the cxgb3 adapter's ddp resource + * @cdev: cxgb3i adapter + * release all the resource held by the ddp pagepod manager for a given + * adapter if needed + */ + +static void t3_ddp_cleanup(struct cxgbi_device *cdev) +{ + struct t3cdev *tdev = (struct t3cdev *)cdev->lldev; + + if (cxgbi_ddp_cleanup(cdev)) { + pr_info("t3dev 0x%p, ulp_iscsi no more user.\n", tdev); + tdev->ulp_iscsi = NULL; + } +} + +/** + * ddp_init - initialize the cxgb3 adapter's ddp resource + * @cdev: cxgb3i adapter + * initialize the ddp pagepod manager for a given adapter + */ +static int cxgb3i_ddp_init(struct cxgbi_device *cdev) +{ + struct t3cdev *tdev = (struct t3cdev *)cdev->lldev; + struct cxgbi_ddp_info *ddp = tdev->ulp_iscsi; + struct ulp_iscsi_info uinfo; + unsigned int pgsz_factor[4]; + int i, err; + + if (ddp) { + kref_get(&ddp->refcnt); + pr_warn("t3dev 0x%p, ddp 0x%p already set up.\n", + tdev, tdev->ulp_iscsi); + cdev->ddp = ddp; + return -EALREADY; + } + + err = tdev->ctl(tdev, ULP_ISCSI_GET_PARAMS, &uinfo); + if (err < 0) { + pr_err("%s, failed to get iscsi param err=%d.\n", + tdev->name, err); + return err; + } + + err = cxgbi_ddp_init(cdev, uinfo.llimit, uinfo.ulimit, + uinfo.max_txsz, uinfo.max_rxsz); + if (err < 0) + return err; + + ddp = cdev->ddp; + + uinfo.tagmask = ddp->idx_mask << PPOD_IDX_SHIFT; + cxgbi_ddp_page_size_factor(pgsz_factor); + for (i = 0; i < 4; i++) + uinfo.pgsz_factor[i] = pgsz_factor[i]; + uinfo.ulimit = uinfo.llimit + (ddp->nppods << PPOD_SIZE_SHIFT); + + err = tdev->ctl(tdev, ULP_ISCSI_SET_PARAMS, &uinfo); + if (err < 0) { + pr_warn("%s unable to set iscsi param err=%d, ddp disabled.\n", + tdev->name, err); + cxgbi_ddp_cleanup(cdev); + return err; + } + tdev->ulp_iscsi = ddp; + + cdev->csk_ddp_setup_digest = ddp_setup_conn_digest; + cdev->csk_ddp_setup_pgidx = ddp_setup_conn_pgidx; + cdev->csk_ddp_set = ddp_set_map; + cdev->csk_ddp_clear = ddp_clear_map; + + pr_info("tdev 0x%p, nppods %u, bits %u, mask 0x%x,0x%x pkt %u/%u, " + "%u/%u.\n", + tdev, ddp->nppods, ddp->idx_bits, ddp->idx_mask, + ddp->rsvd_tag_mask, ddp->max_txsz, uinfo.max_txsz, + ddp->max_rxsz, uinfo.max_rxsz); + return 0; +} + +static void cxgb3i_dev_close(struct t3cdev *t3dev) +{ + struct cxgbi_device *cdev = cxgbi_device_find_by_lldev(t3dev); + + if (!cdev || cdev->flags & CXGBI_FLAG_ADAPTER_RESET) { + pr_info("0x%p close, f 0x%x.\n", cdev, cdev ? cdev->flags : 0); + return; + } + + cxgbi_device_unregister(cdev); +} + +/** + * cxgb3i_dev_open - init a t3 adapter structure and any h/w settings + * @t3dev: t3cdev adapter + */ +static void cxgb3i_dev_open(struct t3cdev *t3dev) +{ + struct cxgbi_device *cdev = cxgbi_device_find_by_lldev(t3dev); + struct adapter *adapter = tdev2adap(t3dev); + int i, err; + + if (cdev) { + pr_info("0x%p, updating.\n", cdev); + return; + } + + cdev = cxgbi_device_register(0, adapter->params.nports); + if (!cdev) { + pr_warn("device 0x%p register failed.\n", t3dev); + return; + } + + cdev->flags = CXGBI_FLAG_DEV_T3 | CXGBI_FLAG_IPV4_SET; + cdev->lldev = t3dev; + cdev->pdev = adapter->pdev; + cdev->ports = adapter->port; + cdev->nports = adapter->params.nports; + cdev->mtus = adapter->params.mtus; + cdev->nmtus = NMTUS; + cdev->snd_win = cxgb3i_snd_win; + cdev->rcv_win = cxgb3i_rcv_win; + cdev->rx_credit_thres = cxgb3i_rx_credit_thres; + cdev->skb_tx_rsvd = CXGB3I_TX_HEADER_LEN; + cdev->skb_rx_extra = sizeof(struct cpl_iscsi_hdr_norss); + cdev->dev_ddp_cleanup = t3_ddp_cleanup; + cdev->itp = &cxgb3i_iscsi_transport; + + err = cxgb3i_ddp_init(cdev); + if (err) { + pr_info("0x%p ddp init failed\n", cdev); + goto err_out; + } + + err = cxgb3i_ofld_init(cdev); + if (err) { + pr_info("0x%p offload init failed\n", cdev); + goto err_out; + } + + err = cxgbi_hbas_add(cdev, CXGB3I_MAX_LUN, CXGBI_MAX_CONN, + &cxgb3i_host_template, cxgb3i_stt); + if (err) + goto err_out; + + for (i = 0; i < cdev->nports; i++) + cdev->hbas[i]->ipv4addr = + cxgb3i_get_private_ipv4addr(cdev->ports[i]); + + pr_info("cdev 0x%p, f 0x%x, t3dev 0x%p open, err %d.\n", + cdev, cdev ? cdev->flags : 0, t3dev, err); + return; + +err_out: + cxgbi_device_unregister(cdev); +} + +static void cxgb3i_dev_event_handler(struct t3cdev *t3dev, u32 event, u32 port) +{ + struct cxgbi_device *cdev = cxgbi_device_find_by_lldev(t3dev); + + log_debug(1 << CXGBI_DBG_TOE, + "0x%p, cdev 0x%p, event 0x%x, port 0x%x.\n", + t3dev, cdev, event, port); + if (!cdev) + return; + + switch (event) { + case OFFLOAD_STATUS_DOWN: + cdev->flags |= CXGBI_FLAG_ADAPTER_RESET; + break; + case OFFLOAD_STATUS_UP: + cdev->flags &= ~CXGBI_FLAG_ADAPTER_RESET; + break; + } +} + +/** + * cxgb3i_init_module - module init entry point + * + * initialize any driver wide global data structures and register itself + * with the cxgb3 module + */ +static int __init cxgb3i_init_module(void) +{ + int rc; + + printk(KERN_INFO "%s", version); + + rc = cxgbi_iscsi_init(&cxgb3i_iscsi_transport, &cxgb3i_stt); + if (rc < 0) + return rc; + + cxgb3_register_client(&t3_client); + return 0; +} + +/** + * cxgb3i_exit_module - module cleanup/exit entry point + * + * go through the driver hba list and for each hba, release any resource held. + * and unregisters iscsi transport and the cxgb3 module + */ +static void __exit cxgb3i_exit_module(void) +{ + cxgb3_unregister_client(&t3_client); + cxgbi_device_unregister_all(CXGBI_FLAG_DEV_T3); + cxgbi_iscsi_cleanup(&cxgb3i_iscsi_transport, &cxgb3i_stt); +} + +module_init(cxgb3i_init_module); +module_exit(cxgb3i_exit_module); diff --git a/drivers/scsi/cxgbi/cxgb3i/cxgb3i.h b/drivers/scsi/cxgbi/cxgb3i/cxgb3i.h new file mode 100644 index 0000000..20593fd --- /dev/null +++ b/drivers/scsi/cxgbi/cxgb3i/cxgb3i.h @@ -0,0 +1,62 @@ +/* + * cxgb3i.h: Chelsio S3xx iSCSI driver. + * + * Copyright (c) 2008 Chelsio Communications, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation. + * + * Written by: Karen Xie (kxie@chelsio.com) + */ + +#ifndef __CXGB3I_H__ +#define __CXGB3I_H__ + +#define CXGB3I_SCSI_HOST_QDEPTH 1024 +#define CXGB3I_MAX_LUN 512 +#define ISCSI_PDU_NONPAYLOAD_MAX \ + (sizeof(struct iscsi_hdr) + ISCSI_MAX_AHS_SIZE + 2*ISCSI_DIGEST_SIZE) + +/*for TX: a skb must have a headroom of at least TX_HEADER_LEN bytes */ +#define CXGB3I_TX_HEADER_LEN \ + (sizeof(struct tx_data_wr) + sizeof(struct sge_opaque_hdr)) + +extern cxgb3_cpl_handler_func cxgb3i_cpl_handlers[NUM_CPL_CMDS]; + +static inline unsigned int cxgb3i_get_private_ipv4addr(struct net_device *ndev) +{ + return ((struct port_info *)(netdev_priv(ndev)))->iscsi_ipv4addr; +} + +static inline void cxgb3i_set_private_ipv4addr(struct net_device *ndev, + unsigned int addr) +{ + struct port_info *pi = (struct port_info *)netdev_priv(ndev); + + pi->iscsic.flags = addr ? 1 : 0; + pi->iscsi_ipv4addr = addr; + if (addr) + memcpy(pi->iscsic.mac_addr, ndev->dev_addr, ETH_ALEN); +} + +struct cpl_iscsi_hdr_norss { + union opcode_tid ot; + u16 pdu_len_ddp; + u16 len; + u32 seq; + u16 urg; + u8 rsvd; + u8 status; +}; + +struct cpl_rx_data_ddp_norss { + union opcode_tid ot; + u16 urg; + u16 len; + u32 seq; + u32 nxt_seq; + u32 ulp_crc; + u32 ddp_status; +}; +#endif diff --git a/drivers/scsi/cxgbi/cxgb4i/Kbuild b/drivers/scsi/cxgbi/cxgb4i/Kbuild new file mode 100644 index 0000000..8290cda --- /dev/null +++ b/drivers/scsi/cxgbi/cxgb4i/Kbuild @@ -0,0 +1,3 @@ +EXTRA_CFLAGS += -I$(srctree)/drivers/net/ethernet/chelsio/cxgb4 + +obj-$(CONFIG_SCSI_CXGB4_ISCSI) += cxgb4i.o diff --git a/drivers/scsi/cxgbi/cxgb4i/Kconfig b/drivers/scsi/cxgbi/cxgb4i/Kconfig new file mode 100644 index 0000000..8c4e423 --- /dev/null +++ b/drivers/scsi/cxgbi/cxgb4i/Kconfig @@ -0,0 +1,10 @@ +config SCSI_CXGB4_ISCSI + tristate "Chelsio T4 iSCSI support" + depends on PCI && INET && (IPV6 || IPV6=n) + select NETDEVICES + select ETHERNET + select NET_VENDOR_CHELSIO + select CHELSIO_T4 + select SCSI_ISCSI_ATTRS + ---help--- + This driver supports iSCSI offload for the Chelsio T4 devices. diff --git a/drivers/scsi/cxgbi/cxgb4i/cxgb4i.c b/drivers/scsi/cxgbi/cxgb4i/cxgb4i.c new file mode 100644 index 0000000..1508125 --- /dev/null +++ b/drivers/scsi/cxgbi/cxgb4i/cxgb4i.c @@ -0,0 +1,1805 @@ +/* + * cxgb4i.c: Chelsio T4 iSCSI driver. + * + * Copyright (c) 2010 Chelsio Communications, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation. + * + * Written by: Karen Xie (kxie@chelsio.com) + * Rakesh Ranjan (rranjan@chelsio.com) + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ":%s: " fmt, __func__ + +#include +#include +#include +#include +#include +#include +#include + +#include "t4_regs.h" +#include "t4_msg.h" +#include "cxgb4.h" +#include "cxgb4_uld.h" +#include "t4fw_api.h" +#include "l2t.h" +#include "cxgb4i.h" + +static unsigned int dbg_level; + +#include "../libcxgbi.h" + +#define DRV_MODULE_NAME "cxgb4i" +#define DRV_MODULE_DESC "Chelsio T4/T5 iSCSI Driver" +#define DRV_MODULE_VERSION "0.9.4" + +static char version[] = + DRV_MODULE_DESC " " DRV_MODULE_NAME + " v" DRV_MODULE_VERSION "\n"; + +MODULE_AUTHOR("Chelsio Communications, Inc."); +MODULE_DESCRIPTION(DRV_MODULE_DESC); +MODULE_VERSION(DRV_MODULE_VERSION); +MODULE_LICENSE("GPL"); + +module_param(dbg_level, uint, 0644); +MODULE_PARM_DESC(dbg_level, "Debug flag (default=0)"); + +static int cxgb4i_rcv_win = 256 * 1024; +module_param(cxgb4i_rcv_win, int, 0644); +MODULE_PARM_DESC(cxgb4i_rcv_win, "TCP reveive window in bytes"); + +static int cxgb4i_snd_win = 128 * 1024; +module_param(cxgb4i_snd_win, int, 0644); +MODULE_PARM_DESC(cxgb4i_snd_win, "TCP send window in bytes"); + +static int cxgb4i_rx_credit_thres = 10 * 1024; +module_param(cxgb4i_rx_credit_thres, int, 0644); +MODULE_PARM_DESC(cxgb4i_rx_credit_thres, + "RX credits return threshold in bytes (default=10KB)"); + +static unsigned int cxgb4i_max_connect = (8 * 1024); +module_param(cxgb4i_max_connect, uint, 0644); +MODULE_PARM_DESC(cxgb4i_max_connect, "Maximum number of connections"); + +static unsigned short cxgb4i_sport_base = 20000; +module_param(cxgb4i_sport_base, ushort, 0644); +MODULE_PARM_DESC(cxgb4i_sport_base, "Starting port number (default 20000)"); + +typedef void (*cxgb4i_cplhandler_func)(struct cxgbi_device *, struct sk_buff *); + +static void *t4_uld_add(const struct cxgb4_lld_info *); +static int t4_uld_rx_handler(void *, const __be64 *, const struct pkt_gl *); +static int t4_uld_state_change(void *, enum cxgb4_state state); + +static const struct cxgb4_uld_info cxgb4i_uld_info = { + .name = DRV_MODULE_NAME, + .add = t4_uld_add, + .rx_handler = t4_uld_rx_handler, + .state_change = t4_uld_state_change, +}; + +static struct scsi_host_template cxgb4i_host_template = { + .module = THIS_MODULE, + .name = DRV_MODULE_NAME, + .proc_name = DRV_MODULE_NAME, + .can_queue = CXGB4I_SCSI_HOST_QDEPTH, + .queuecommand = iscsi_queuecommand, + .change_queue_depth = iscsi_change_queue_depth, + .sg_tablesize = SG_ALL, + .max_sectors = 0xFFFF, + .cmd_per_lun = ISCSI_DEF_CMD_PER_LUN, + .eh_abort_handler = iscsi_eh_abort, + .eh_device_reset_handler = iscsi_eh_device_reset, + .eh_target_reset_handler = iscsi_eh_recover_target, + .target_alloc = iscsi_target_alloc, + .use_clustering = DISABLE_CLUSTERING, + .this_id = -1, +}; + +static struct iscsi_transport cxgb4i_iscsi_transport = { + .owner = THIS_MODULE, + .name = DRV_MODULE_NAME, + .caps = CAP_RECOVERY_L0 | CAP_MULTI_R2T | CAP_HDRDGST | + CAP_DATADGST | CAP_DIGEST_OFFLOAD | + CAP_PADDING_OFFLOAD | CAP_TEXT_NEGO, + .attr_is_visible = cxgbi_attr_is_visible, + .get_host_param = cxgbi_get_host_param, + .set_host_param = cxgbi_set_host_param, + /* session management */ + .create_session = cxgbi_create_session, + .destroy_session = cxgbi_destroy_session, + .get_session_param = iscsi_session_get_param, + /* connection management */ + .create_conn = cxgbi_create_conn, + .bind_conn = cxgbi_bind_conn, + .destroy_conn = iscsi_tcp_conn_teardown, + .start_conn = iscsi_conn_start, + .stop_conn = iscsi_conn_stop, + .get_conn_param = iscsi_conn_get_param, + .set_param = cxgbi_set_conn_param, + .get_stats = cxgbi_get_conn_stats, + /* pdu xmit req from user space */ + .send_pdu = iscsi_conn_send_pdu, + /* task */ + .init_task = iscsi_tcp_task_init, + .xmit_task = iscsi_tcp_task_xmit, + .cleanup_task = cxgbi_cleanup_task, + /* pdu */ + .alloc_pdu = cxgbi_conn_alloc_pdu, + .init_pdu = cxgbi_conn_init_pdu, + .xmit_pdu = cxgbi_conn_xmit_pdu, + .parse_pdu_itt = cxgbi_parse_pdu_itt, + /* TCP connect/disconnect */ + .get_ep_param = cxgbi_get_ep_param, + .ep_connect = cxgbi_ep_connect, + .ep_poll = cxgbi_ep_poll, + .ep_disconnect = cxgbi_ep_disconnect, + /* Error recovery timeout call */ + .session_recovery_timedout = iscsi_session_recovery_timedout, +}; + +static struct scsi_transport_template *cxgb4i_stt; + +/* + * CPL (Chelsio Protocol Language) defines a message passing interface between + * the host driver and Chelsio asic. + * The section below implments CPLs that related to iscsi tcp connection + * open/close/abort and data send/receive. + */ + +#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d)) +#define RCV_BUFSIZ_MASK 0x3FFU +#define MAX_IMM_TX_PKT_LEN 128 + +static inline void set_queue(struct sk_buff *skb, unsigned int queue, + const struct cxgbi_sock *csk) +{ + skb->queue_mapping = queue; +} + +static int push_tx_frames(struct cxgbi_sock *, int); + +/* + * is_ofld_imm - check whether a packet can be sent as immediate data + * @skb: the packet + * + * Returns true if a packet can be sent as an offload WR with immediate + * data. We currently use the same limit as for Ethernet packets. + */ +static inline int is_ofld_imm(const struct sk_buff *skb) +{ + return skb->len <= (MAX_IMM_TX_PKT_LEN - + sizeof(struct fw_ofld_tx_data_wr)); +} + +static void send_act_open_req(struct cxgbi_sock *csk, struct sk_buff *skb, + struct l2t_entry *e) +{ + struct cxgb4_lld_info *lldi = cxgbi_cdev_priv(csk->cdev); + int t4 = is_t4(lldi->adapter_type); + int wscale = cxgbi_sock_compute_wscale(csk->mss_idx); + unsigned long long opt0; + unsigned int opt2; + unsigned int qid_atid = ((unsigned int)csk->atid) | + (((unsigned int)csk->rss_qid) << 14); + + opt0 = KEEP_ALIVE(1) | + WND_SCALE(wscale) | + MSS_IDX(csk->mss_idx) | + L2T_IDX(((struct l2t_entry *)csk->l2t)->idx) | + TX_CHAN(csk->tx_chan) | + SMAC_SEL(csk->smac_idx) | + ULP_MODE(ULP_MODE_ISCSI) | + RCV_BUFSIZ(cxgb4i_rcv_win >> 10); + opt2 = RX_CHANNEL(0) | + RSS_QUEUE_VALID | + (1 << 20) | + RSS_QUEUE(csk->rss_qid); + + if (is_t4(lldi->adapter_type)) { + struct cpl_act_open_req *req = + (struct cpl_act_open_req *)skb->head; + + INIT_TP_WR(req, 0); + OPCODE_TID(req) = cpu_to_be32(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, + qid_atid)); + req->local_port = csk->saddr.sin_port; + req->peer_port = csk->daddr.sin_port; + req->local_ip = csk->saddr.sin_addr.s_addr; + req->peer_ip = csk->daddr.sin_addr.s_addr; + req->opt0 = cpu_to_be64(opt0); + req->params = cpu_to_be32(cxgb4_select_ntuple( + csk->cdev->ports[csk->port_id], + csk->l2t)); + opt2 |= 1 << 22; + req->opt2 = cpu_to_be32(opt2); + + log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_SOCK, + "csk t4 0x%p, %pI4:%u-%pI4:%u, atid %d, qid %u.\n", + csk, &req->local_ip, ntohs(req->local_port), + &req->peer_ip, ntohs(req->peer_port), + csk->atid, csk->rss_qid); + } else { + struct cpl_t5_act_open_req *req = + (struct cpl_t5_act_open_req *)skb->head; + + INIT_TP_WR(req, 0); + OPCODE_TID(req) = cpu_to_be32(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, + qid_atid)); + req->local_port = csk->saddr.sin_port; + req->peer_port = csk->daddr.sin_port; + req->local_ip = csk->saddr.sin_addr.s_addr; + req->peer_ip = csk->daddr.sin_addr.s_addr; + req->opt0 = cpu_to_be64(opt0); + req->params = cpu_to_be64(V_FILTER_TUPLE( + cxgb4_select_ntuple( + csk->cdev->ports[csk->port_id], + csk->l2t))); + opt2 |= 1 << 31; + req->opt2 = cpu_to_be32(opt2); + + log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_SOCK, + "csk t5 0x%p, %pI4:%u-%pI4:%u, atid %d, qid %u.\n", + csk, &req->local_ip, ntohs(req->local_port), + &req->peer_ip, ntohs(req->peer_port), + csk->atid, csk->rss_qid); + } + + set_wr_txq(skb, CPL_PRIORITY_SETUP, csk->port_id); + + pr_info_ipaddr("t%d csk 0x%p,%u,0x%lx,%u, rss_qid %u.\n", + (&csk->saddr), (&csk->daddr), t4 ? 4 : 5, csk, + csk->state, csk->flags, csk->atid, csk->rss_qid); + + cxgb4_l2t_send(csk->cdev->ports[csk->port_id], skb, csk->l2t); +} + +#if IS_ENABLED(CONFIG_IPV6) +static void send_act_open_req6(struct cxgbi_sock *csk, struct sk_buff *skb, + struct l2t_entry *e) +{ + struct cxgb4_lld_info *lldi = cxgbi_cdev_priv(csk->cdev); + int t4 = is_t4(lldi->adapter_type); + int wscale = cxgbi_sock_compute_wscale(csk->mss_idx); + unsigned long long opt0; + unsigned int opt2; + unsigned int qid_atid = ((unsigned int)csk->atid) | + (((unsigned int)csk->rss_qid) << 14); + + opt0 = KEEP_ALIVE(1) | + WND_SCALE(wscale) | + MSS_IDX(csk->mss_idx) | + L2T_IDX(((struct l2t_entry *)csk->l2t)->idx) | + TX_CHAN(csk->tx_chan) | + SMAC_SEL(csk->smac_idx) | + ULP_MODE(ULP_MODE_ISCSI) | + RCV_BUFSIZ(cxgb4i_rcv_win >> 10); + + opt2 = RX_CHANNEL(0) | + RSS_QUEUE_VALID | + RX_FC_DISABLE | + RSS_QUEUE(csk->rss_qid); + + if (t4) { + struct cpl_act_open_req6 *req = + (struct cpl_act_open_req6 *)skb->head; + + INIT_TP_WR(req, 0); + OPCODE_TID(req) = cpu_to_be32(MK_OPCODE_TID(CPL_ACT_OPEN_REQ6, + qid_atid)); + req->local_port = csk->saddr6.sin6_port; + req->peer_port = csk->daddr6.sin6_port; + + req->local_ip_hi = *(__be64 *)(csk->saddr6.sin6_addr.s6_addr); + req->local_ip_lo = *(__be64 *)(csk->saddr6.sin6_addr.s6_addr + + 8); + req->peer_ip_hi = *(__be64 *)(csk->daddr6.sin6_addr.s6_addr); + req->peer_ip_lo = *(__be64 *)(csk->daddr6.sin6_addr.s6_addr + + 8); + + req->opt0 = cpu_to_be64(opt0); + + opt2 |= RX_FC_VALID; + req->opt2 = cpu_to_be32(opt2); + + req->params = cpu_to_be32(cxgb4_select_ntuple( + csk->cdev->ports[csk->port_id], + csk->l2t)); + } else { + struct cpl_t5_act_open_req6 *req = + (struct cpl_t5_act_open_req6 *)skb->head; + + INIT_TP_WR(req, 0); + OPCODE_TID(req) = cpu_to_be32(MK_OPCODE_TID(CPL_ACT_OPEN_REQ6, + qid_atid)); + req->local_port = csk->saddr6.sin6_port; + req->peer_port = csk->daddr6.sin6_port; + req->local_ip_hi = *(__be64 *)(csk->saddr6.sin6_addr.s6_addr); + req->local_ip_lo = *(__be64 *)(csk->saddr6.sin6_addr.s6_addr + + 8); + req->peer_ip_hi = *(__be64 *)(csk->daddr6.sin6_addr.s6_addr); + req->peer_ip_lo = *(__be64 *)(csk->daddr6.sin6_addr.s6_addr + + 8); + req->opt0 = cpu_to_be64(opt0); + + opt2 |= T5_OPT_2_VALID; + req->opt2 = cpu_to_be32(opt2); + + req->params = cpu_to_be64(V_FILTER_TUPLE(cxgb4_select_ntuple( + csk->cdev->ports[csk->port_id], + csk->l2t))); + } + + set_wr_txq(skb, CPL_PRIORITY_SETUP, csk->port_id); + + pr_info("t%d csk 0x%p,%u,0x%lx,%u, [%pI6]:%u-[%pI6]:%u, rss_qid %u.\n", + t4 ? 4 : 5, csk, csk->state, csk->flags, csk->atid, + &csk->saddr6.sin6_addr, ntohs(csk->saddr.sin_port), + &csk->daddr6.sin6_addr, ntohs(csk->daddr.sin_port), + csk->rss_qid); + + cxgb4_l2t_send(csk->cdev->ports[csk->port_id], skb, csk->l2t); +} +#endif + +static void send_close_req(struct cxgbi_sock *csk) +{ + struct sk_buff *skb = csk->cpl_close; + struct cpl_close_con_req *req = (struct cpl_close_con_req *)skb->head; + unsigned int tid = csk->tid; + + log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_SOCK, + "csk 0x%p,%u,0x%lx, tid %u.\n", + csk, csk->state, csk->flags, csk->tid); + csk->cpl_close = NULL; + set_wr_txq(skb, CPL_PRIORITY_DATA, csk->port_id); + INIT_TP_WR(req, tid); + OPCODE_TID(req) = cpu_to_be32(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid)); + req->rsvd = 0; + + cxgbi_sock_skb_entail(csk, skb); + if (csk->state >= CTP_ESTABLISHED) + push_tx_frames(csk, 1); +} + +static void abort_arp_failure(void *handle, struct sk_buff *skb) +{ + struct cxgbi_sock *csk = (struct cxgbi_sock *)handle; + struct cpl_abort_req *req; + + log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_SOCK, + "csk 0x%p,%u,0x%lx, tid %u, abort.\n", + csk, csk->state, csk->flags, csk->tid); + req = (struct cpl_abort_req *)skb->data; + req->cmd = CPL_ABORT_NO_RST; + cxgb4_ofld_send(csk->cdev->ports[csk->port_id], skb); +} + +static void send_abort_req(struct cxgbi_sock *csk) +{ + struct cpl_abort_req *req; + struct sk_buff *skb = csk->cpl_abort_req; + + if (unlikely(csk->state == CTP_ABORTING) || !skb || !csk->cdev) + return; + cxgbi_sock_set_state(csk, CTP_ABORTING); + cxgbi_sock_set_flag(csk, CTPF_ABORT_RPL_PENDING); + cxgbi_sock_purge_write_queue(csk); + + csk->cpl_abort_req = NULL; + req = (struct cpl_abort_req *)skb->head; + set_queue(skb, CPL_PRIORITY_DATA, csk); + req->cmd = CPL_ABORT_SEND_RST; + t4_set_arp_err_handler(skb, csk, abort_arp_failure); + INIT_TP_WR(req, csk->tid); + OPCODE_TID(req) = cpu_to_be32(MK_OPCODE_TID(CPL_ABORT_REQ, csk->tid)); + req->rsvd0 = htonl(csk->snd_nxt); + req->rsvd1 = !cxgbi_sock_flag(csk, CTPF_TX_DATA_SENT); + + log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_SOCK, + "csk 0x%p,%u,0x%lx,%u, snd_nxt %u, 0x%x.\n", + csk, csk->state, csk->flags, csk->tid, csk->snd_nxt, + req->rsvd1); + + cxgb4_l2t_send(csk->cdev->ports[csk->port_id], skb, csk->l2t); +} + +static void send_abort_rpl(struct cxgbi_sock *csk, int rst_status) +{ + struct sk_buff *skb = csk->cpl_abort_rpl; + struct cpl_abort_rpl *rpl = (struct cpl_abort_rpl *)skb->head; + + log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_SOCK, + "csk 0x%p,%u,0x%lx,%u, status %d.\n", + csk, csk->state, csk->flags, csk->tid, rst_status); + + csk->cpl_abort_rpl = NULL; + set_queue(skb, CPL_PRIORITY_DATA, csk); + INIT_TP_WR(rpl, csk->tid); + OPCODE_TID(rpl) = cpu_to_be32(MK_OPCODE_TID(CPL_ABORT_RPL, csk->tid)); + rpl->cmd = rst_status; + cxgb4_ofld_send(csk->cdev->ports[csk->port_id], skb); +} + +/* + * CPL connection rx data ack: host -> + * Send RX credits through an RX_DATA_ACK CPL message. Returns the number of + * credits sent. + */ +static u32 send_rx_credits(struct cxgbi_sock *csk, u32 credits) +{ + struct sk_buff *skb; + struct cpl_rx_data_ack *req; + + log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_PDU_RX, + "csk 0x%p,%u,0x%lx,%u, credit %u.\n", + csk, csk->state, csk->flags, csk->tid, credits); + + skb = alloc_wr(sizeof(*req), 0, GFP_ATOMIC); + if (!skb) { + pr_info("csk 0x%p, credit %u, OOM.\n", csk, credits); + return 0; + } + req = (struct cpl_rx_data_ack *)skb->head; + + set_wr_txq(skb, CPL_PRIORITY_ACK, csk->port_id); + INIT_TP_WR(req, csk->tid); + OPCODE_TID(req) = cpu_to_be32(MK_OPCODE_TID(CPL_RX_DATA_ACK, + csk->tid)); + req->credit_dack = cpu_to_be32(RX_CREDITS(credits) | RX_FORCE_ACK(1)); + cxgb4_ofld_send(csk->cdev->ports[csk->port_id], skb); + return credits; +} + +/* + * sgl_len - calculates the size of an SGL of the given capacity + * @n: the number of SGL entries + * Calculates the number of flits needed for a scatter/gather list that + * can hold the given number of entries. + */ +static inline unsigned int sgl_len(unsigned int n) +{ + n--; + return (3 * n) / 2 + (n & 1) + 2; +} + +/* + * calc_tx_flits_ofld - calculate # of flits for an offload packet + * @skb: the packet + * + * Returns the number of flits needed for the given offload packet. + * These packets are already fully constructed and no additional headers + * will be added. + */ +static inline unsigned int calc_tx_flits_ofld(const struct sk_buff *skb) +{ + unsigned int flits, cnt; + + if (is_ofld_imm(skb)) + return DIV_ROUND_UP(skb->len, 8); + flits = skb_transport_offset(skb) / 8; + cnt = skb_shinfo(skb)->nr_frags; + if (skb_tail_pointer(skb) != skb_transport_header(skb)) + cnt++; + return flits + sgl_len(cnt); +} + +static inline void send_tx_flowc_wr(struct cxgbi_sock *csk) +{ + struct sk_buff *skb; + struct fw_flowc_wr *flowc; + int flowclen, i; + + flowclen = 80; + skb = alloc_wr(flowclen, 0, GFP_ATOMIC); + flowc = (struct fw_flowc_wr *)skb->head; + flowc->op_to_nparams = + htonl(FW_WR_OP(FW_FLOWC_WR) | FW_FLOWC_WR_NPARAMS(8)); + flowc->flowid_len16 = + htonl(FW_WR_LEN16(DIV_ROUND_UP(72, 16)) | + FW_WR_FLOWID(csk->tid)); + flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_PFNVFN; + flowc->mnemval[0].val = htonl(csk->cdev->pfvf); + flowc->mnemval[1].mnemonic = FW_FLOWC_MNEM_CH; + flowc->mnemval[1].val = htonl(csk->tx_chan); + flowc->mnemval[2].mnemonic = FW_FLOWC_MNEM_PORT; + flowc->mnemval[2].val = htonl(csk->tx_chan); + flowc->mnemval[3].mnemonic = FW_FLOWC_MNEM_IQID; + flowc->mnemval[3].val = htonl(csk->rss_qid); + flowc->mnemval[4].mnemonic = FW_FLOWC_MNEM_SNDNXT; + flowc->mnemval[4].val = htonl(csk->snd_nxt); + flowc->mnemval[5].mnemonic = FW_FLOWC_MNEM_RCVNXT; + flowc->mnemval[5].val = htonl(csk->rcv_nxt); + flowc->mnemval[6].mnemonic = FW_FLOWC_MNEM_SNDBUF; + flowc->mnemval[6].val = htonl(cxgb4i_snd_win); + flowc->mnemval[7].mnemonic = FW_FLOWC_MNEM_MSS; + flowc->mnemval[7].val = htonl(csk->advmss); + flowc->mnemval[8].mnemonic = 0; + flowc->mnemval[8].val = 0; + for (i = 0; i < 9; i++) { + flowc->mnemval[i].r4[0] = 0; + flowc->mnemval[i].r4[1] = 0; + flowc->mnemval[i].r4[2] = 0; + } + set_queue(skb, CPL_PRIORITY_DATA, csk); + + log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_SOCK, + "csk 0x%p, tid 0x%x, %u,%u,%u,%u,%u,%u,%u.\n", + csk, csk->tid, 0, csk->tx_chan, csk->rss_qid, + csk->snd_nxt, csk->rcv_nxt, cxgb4i_snd_win, + csk->advmss); + + cxgb4_ofld_send(csk->cdev->ports[csk->port_id], skb); +} + +static inline void make_tx_data_wr(struct cxgbi_sock *csk, struct sk_buff *skb, + int dlen, int len, u32 credits, int compl) +{ + struct fw_ofld_tx_data_wr *req; + unsigned int submode = cxgbi_skcb_ulp_mode(skb) & 3; + unsigned int wr_ulp_mode = 0; + + req = (struct fw_ofld_tx_data_wr *)__skb_push(skb, sizeof(*req)); + + if (is_ofld_imm(skb)) { + req->op_to_immdlen = htonl(FW_WR_OP(FW_OFLD_TX_DATA_WR) | + FW_WR_COMPL(1) | + FW_WR_IMMDLEN(dlen)); + req->flowid_len16 = htonl(FW_WR_FLOWID(csk->tid) | + FW_WR_LEN16(credits)); + } else { + req->op_to_immdlen = + cpu_to_be32(FW_WR_OP(FW_OFLD_TX_DATA_WR) | + FW_WR_COMPL(1) | + FW_WR_IMMDLEN(0)); + req->flowid_len16 = + cpu_to_be32(FW_WR_FLOWID(csk->tid) | + FW_WR_LEN16(credits)); + } + if (submode) + wr_ulp_mode = FW_OFLD_TX_DATA_WR_ULPMODE(ULP2_MODE_ISCSI) | + FW_OFLD_TX_DATA_WR_ULPSUBMODE(submode); + req->tunnel_to_proxy = htonl(wr_ulp_mode | + FW_OFLD_TX_DATA_WR_SHOVE(skb_peek(&csk->write_queue) ? 0 : 1)); + req->plen = htonl(len); + if (!cxgbi_sock_flag(csk, CTPF_TX_DATA_SENT)) + cxgbi_sock_set_flag(csk, CTPF_TX_DATA_SENT); +} + +static void arp_failure_skb_discard(void *handle, struct sk_buff *skb) +{ + kfree_skb(skb); +} + +static int push_tx_frames(struct cxgbi_sock *csk, int req_completion) +{ + int total_size = 0; + struct sk_buff *skb; + + if (unlikely(csk->state < CTP_ESTABLISHED || + csk->state == CTP_CLOSE_WAIT_1 || csk->state >= CTP_ABORTING)) { + log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_SOCK | + 1 << CXGBI_DBG_PDU_TX, + "csk 0x%p,%u,0x%lx,%u, in closing state.\n", + csk, csk->state, csk->flags, csk->tid); + return 0; + } + + while (csk->wr_cred && (skb = skb_peek(&csk->write_queue)) != NULL) { + int dlen = skb->len; + int len = skb->len; + unsigned int credits_needed; + + skb_reset_transport_header(skb); + if (is_ofld_imm(skb)) + credits_needed = DIV_ROUND_UP(dlen + + sizeof(struct fw_ofld_tx_data_wr), 16); + else + credits_needed = DIV_ROUND_UP(8*calc_tx_flits_ofld(skb) + + sizeof(struct fw_ofld_tx_data_wr), + 16); + + if (csk->wr_cred < credits_needed) { + log_debug(1 << CXGBI_DBG_PDU_TX, + "csk 0x%p, skb %u/%u, wr %d < %u.\n", + csk, skb->len, skb->data_len, + credits_needed, csk->wr_cred); + break; + } + __skb_unlink(skb, &csk->write_queue); + set_queue(skb, CPL_PRIORITY_DATA, csk); + skb->csum = credits_needed; + csk->wr_cred -= credits_needed; + csk->wr_una_cred += credits_needed; + cxgbi_sock_enqueue_wr(csk, skb); + + log_debug(1 << CXGBI_DBG_PDU_TX, + "csk 0x%p, skb %u/%u, wr %d, left %u, unack %u.\n", + csk, skb->len, skb->data_len, credits_needed, + csk->wr_cred, csk->wr_una_cred); + + if (likely(cxgbi_skcb_test_flag(skb, SKCBF_TX_NEED_HDR))) { + if (!cxgbi_sock_flag(csk, CTPF_TX_DATA_SENT)) { + send_tx_flowc_wr(csk); + skb->csum += 5; + csk->wr_cred -= 5; + csk->wr_una_cred += 5; + } + len += cxgbi_ulp_extra_len(cxgbi_skcb_ulp_mode(skb)); + make_tx_data_wr(csk, skb, dlen, len, credits_needed, + req_completion); + csk->snd_nxt += len; + cxgbi_skcb_clear_flag(skb, SKCBF_TX_NEED_HDR); + } + total_size += skb->truesize; + t4_set_arp_err_handler(skb, csk, arp_failure_skb_discard); + + log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_PDU_TX, + "csk 0x%p,%u,0x%lx,%u, skb 0x%p, %u.\n", + csk, csk->state, csk->flags, csk->tid, skb, len); + + cxgb4_l2t_send(csk->cdev->ports[csk->port_id], skb, csk->l2t); + } + return total_size; +} + +static inline void free_atid(struct cxgbi_sock *csk) +{ + struct cxgb4_lld_info *lldi = cxgbi_cdev_priv(csk->cdev); + + if (cxgbi_sock_flag(csk, CTPF_HAS_ATID)) { + cxgb4_free_atid(lldi->tids, csk->atid); + cxgbi_sock_clear_flag(csk, CTPF_HAS_ATID); + cxgbi_sock_put(csk); + } +} + +static void do_act_establish(struct cxgbi_device *cdev, struct sk_buff *skb) +{ + struct cxgbi_sock *csk; + struct cpl_act_establish *req = (struct cpl_act_establish *)skb->data; + unsigned short tcp_opt = ntohs(req->tcp_opt); + unsigned int tid = GET_TID(req); + unsigned int atid = GET_TID_TID(ntohl(req->tos_atid)); + struct cxgb4_lld_info *lldi = cxgbi_cdev_priv(cdev); + struct tid_info *t = lldi->tids; + u32 rcv_isn = be32_to_cpu(req->rcv_isn); + + csk = lookup_atid(t, atid); + if (unlikely(!csk)) { + pr_err("NO conn. for atid %u, cdev 0x%p.\n", atid, cdev); + goto rel_skb; + } + + if (csk->atid != atid) { + pr_err("bad conn atid %u, csk 0x%p,%u,0x%lx,tid %u, atid %u.\n", + atid, csk, csk->state, csk->flags, csk->tid, csk->atid); + goto rel_skb; + } + + pr_info_ipaddr("atid 0x%x, tid 0x%x, csk 0x%p,%u,0x%lx, isn %u.\n", + (&csk->saddr), (&csk->daddr), + atid, tid, csk, csk->state, csk->flags, rcv_isn); + + module_put(THIS_MODULE); + + cxgbi_sock_get(csk); + csk->tid = tid; + cxgb4_insert_tid(lldi->tids, csk, tid); + cxgbi_sock_set_flag(csk, CTPF_HAS_TID); + + free_atid(csk); + + spin_lock_bh(&csk->lock); + if (unlikely(csk->state != CTP_ACTIVE_OPEN)) + pr_info("csk 0x%p,%u,0x%lx,%u, got EST.\n", + csk, csk->state, csk->flags, csk->tid); + + if (csk->retry_timer.function) { + del_timer(&csk->retry_timer); + csk->retry_timer.function = NULL; + } + + csk->copied_seq = csk->rcv_wup = csk->rcv_nxt = rcv_isn; + /* + * Causes the first RX_DATA_ACK to supply any Rx credits we couldn't + * pass through opt0. + */ + if (cxgb4i_rcv_win > (RCV_BUFSIZ_MASK << 10)) + csk->rcv_wup -= cxgb4i_rcv_win - (RCV_BUFSIZ_MASK << 10); + + csk->advmss = lldi->mtus[GET_TCPOPT_MSS(tcp_opt)] - 40; + if (GET_TCPOPT_TSTAMP(tcp_opt)) + csk->advmss -= 12; + if (csk->advmss < 128) + csk->advmss = 128; + + log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_SOCK, + "csk 0x%p, mss_idx %u, advmss %u.\n", + csk, GET_TCPOPT_MSS(tcp_opt), csk->advmss); + + cxgbi_sock_established(csk, ntohl(req->snd_isn), ntohs(req->tcp_opt)); + + if (unlikely(cxgbi_sock_flag(csk, CTPF_ACTIVE_CLOSE_NEEDED))) + send_abort_req(csk); + else { + if (skb_queue_len(&csk->write_queue)) + push_tx_frames(csk, 0); + cxgbi_conn_tx_open(csk); + } + spin_unlock_bh(&csk->lock); + +rel_skb: + __kfree_skb(skb); +} + +static int act_open_rpl_status_to_errno(int status) +{ + switch (status) { + case CPL_ERR_CONN_RESET: + return -ECONNREFUSED; + case CPL_ERR_ARP_MISS: + return -EHOSTUNREACH; + case CPL_ERR_CONN_TIMEDOUT: + return -ETIMEDOUT; + case CPL_ERR_TCAM_FULL: + return -ENOMEM; + case CPL_ERR_CONN_EXIST: + return -EADDRINUSE; + default: + return -EIO; + } +} + +static void csk_act_open_retry_timer(unsigned long data) +{ + struct sk_buff *skb = NULL; + struct cxgbi_sock *csk = (struct cxgbi_sock *)data; + struct cxgb4_lld_info *lldi = cxgbi_cdev_priv(csk->cdev); + void (*send_act_open_func)(struct cxgbi_sock *, struct sk_buff *, + struct l2t_entry *); + int t4 = is_t4(lldi->adapter_type), size, size6; + + log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_SOCK, + "csk 0x%p,%u,0x%lx,%u.\n", + csk, csk->state, csk->flags, csk->tid); + + cxgbi_sock_get(csk); + spin_lock_bh(&csk->lock); + + if (t4) { + size = sizeof(struct cpl_act_open_req); + size6 = sizeof(struct cpl_act_open_req6); + } else { + size = sizeof(struct cpl_t5_act_open_req); + size6 = sizeof(struct cpl_t5_act_open_req6); + } + + if (csk->csk_family == AF_INET) { + send_act_open_func = send_act_open_req; + skb = alloc_wr(size, 0, GFP_ATOMIC); +#if IS_ENABLED(CONFIG_IPV6) + } else { + send_act_open_func = send_act_open_req6; + skb = alloc_wr(size6, 0, GFP_ATOMIC); +#endif + } + + if (!skb) + cxgbi_sock_fail_act_open(csk, -ENOMEM); + else { + skb->sk = (struct sock *)csk; + t4_set_arp_err_handler(skb, csk, + cxgbi_sock_act_open_req_arp_failure); + send_act_open_func(csk, skb, csk->l2t); + } + + spin_unlock_bh(&csk->lock); + cxgbi_sock_put(csk); + +} + +static void do_act_open_rpl(struct cxgbi_device *cdev, struct sk_buff *skb) +{ + struct cxgbi_sock *csk; + struct cpl_act_open_rpl *rpl = (struct cpl_act_open_rpl *)skb->data; + unsigned int tid = GET_TID(rpl); + unsigned int atid = + GET_TID_TID(GET_AOPEN_ATID(be32_to_cpu(rpl->atid_status))); + unsigned int status = GET_AOPEN_STATUS(be32_to_cpu(rpl->atid_status)); + struct cxgb4_lld_info *lldi = cxgbi_cdev_priv(cdev); + struct tid_info *t = lldi->tids; + + csk = lookup_atid(t, atid); + if (unlikely(!csk)) { + pr_err("NO matching conn. atid %u, tid %u.\n", atid, tid); + goto rel_skb; + } + + pr_info_ipaddr("tid %u/%u, status %u.\n" + "csk 0x%p,%u,0x%lx. ", (&csk->saddr), (&csk->daddr), + atid, tid, status, csk, csk->state, csk->flags); + + if (status == CPL_ERR_RTX_NEG_ADVICE) + goto rel_skb; + + module_put(THIS_MODULE); + + if (status && status != CPL_ERR_TCAM_FULL && + status != CPL_ERR_CONN_EXIST && + status != CPL_ERR_ARP_MISS) + cxgb4_remove_tid(lldi->tids, csk->port_id, GET_TID(rpl)); + + cxgbi_sock_get(csk); + spin_lock_bh(&csk->lock); + + if (status == CPL_ERR_CONN_EXIST && + csk->retry_timer.function != csk_act_open_retry_timer) { + csk->retry_timer.function = csk_act_open_retry_timer; + mod_timer(&csk->retry_timer, jiffies + HZ / 2); + } else + cxgbi_sock_fail_act_open(csk, + act_open_rpl_status_to_errno(status)); + + spin_unlock_bh(&csk->lock); + cxgbi_sock_put(csk); +rel_skb: + __kfree_skb(skb); +} + +static void do_peer_close(struct cxgbi_device *cdev, struct sk_buff *skb) +{ + struct cxgbi_sock *csk; + struct cpl_peer_close *req = (struct cpl_peer_close *)skb->data; + unsigned int tid = GET_TID(req); + struct cxgb4_lld_info *lldi = cxgbi_cdev_priv(cdev); + struct tid_info *t = lldi->tids; + + csk = lookup_tid(t, tid); + if (unlikely(!csk)) { + pr_err("can't find connection for tid %u.\n", tid); + goto rel_skb; + } + pr_info_ipaddr("csk 0x%p,%u,0x%lx,%u.\n", + (&csk->saddr), (&csk->daddr), + csk, csk->state, csk->flags, csk->tid); + cxgbi_sock_rcv_peer_close(csk); +rel_skb: + __kfree_skb(skb); +} + +static void do_close_con_rpl(struct cxgbi_device *cdev, struct sk_buff *skb) +{ + struct cxgbi_sock *csk; + struct cpl_close_con_rpl *rpl = (struct cpl_close_con_rpl *)skb->data; + unsigned int tid = GET_TID(rpl); + struct cxgb4_lld_info *lldi = cxgbi_cdev_priv(cdev); + struct tid_info *t = lldi->tids; + + csk = lookup_tid(t, tid); + if (unlikely(!csk)) { + pr_err("can't find connection for tid %u.\n", tid); + goto rel_skb; + } + pr_info_ipaddr("csk 0x%p,%u,0x%lx,%u.\n", + (&csk->saddr), (&csk->daddr), + csk, csk->state, csk->flags, csk->tid); + cxgbi_sock_rcv_close_conn_rpl(csk, ntohl(rpl->snd_nxt)); +rel_skb: + __kfree_skb(skb); +} + +static int abort_status_to_errno(struct cxgbi_sock *csk, int abort_reason, + int *need_rst) +{ + switch (abort_reason) { + case CPL_ERR_BAD_SYN: /* fall through */ + case CPL_ERR_CONN_RESET: + return csk->state > CTP_ESTABLISHED ? + -EPIPE : -ECONNRESET; + case CPL_ERR_XMIT_TIMEDOUT: + case CPL_ERR_PERSIST_TIMEDOUT: + case CPL_ERR_FINWAIT2_TIMEDOUT: + case CPL_ERR_KEEPALIVE_TIMEDOUT: + return -ETIMEDOUT; + default: + return -EIO; + } +} + +static void do_abort_req_rss(struct cxgbi_device *cdev, struct sk_buff *skb) +{ + struct cxgbi_sock *csk; + struct cpl_abort_req_rss *req = (struct cpl_abort_req_rss *)skb->data; + unsigned int tid = GET_TID(req); + struct cxgb4_lld_info *lldi = cxgbi_cdev_priv(cdev); + struct tid_info *t = lldi->tids; + int rst_status = CPL_ABORT_NO_RST; + + csk = lookup_tid(t, tid); + if (unlikely(!csk)) { + pr_err("can't find connection for tid %u.\n", tid); + goto rel_skb; + } + + pr_info_ipaddr("csk 0x%p,%u,0x%lx,%u, status %u.\n", + (&csk->saddr), (&csk->daddr), + csk, csk->state, csk->flags, csk->tid, req->status); + + if (req->status == CPL_ERR_RTX_NEG_ADVICE || + req->status == CPL_ERR_PERSIST_NEG_ADVICE) + goto rel_skb; + + cxgbi_sock_get(csk); + spin_lock_bh(&csk->lock); + + cxgbi_sock_clear_flag(csk, CTPF_ABORT_REQ_RCVD); + + if (!cxgbi_sock_flag(csk, CTPF_TX_DATA_SENT)) { + send_tx_flowc_wr(csk); + cxgbi_sock_set_flag(csk, CTPF_TX_DATA_SENT); + } + + cxgbi_sock_set_flag(csk, CTPF_ABORT_REQ_RCVD); + cxgbi_sock_set_state(csk, CTP_ABORTING); + + send_abort_rpl(csk, rst_status); + + if (!cxgbi_sock_flag(csk, CTPF_ABORT_RPL_PENDING)) { + csk->err = abort_status_to_errno(csk, req->status, &rst_status); + cxgbi_sock_closed(csk); + } + + spin_unlock_bh(&csk->lock); + cxgbi_sock_put(csk); +rel_skb: + __kfree_skb(skb); +} + +static void do_abort_rpl_rss(struct cxgbi_device *cdev, struct sk_buff *skb) +{ + struct cxgbi_sock *csk; + struct cpl_abort_rpl_rss *rpl = (struct cpl_abort_rpl_rss *)skb->data; + unsigned int tid = GET_TID(rpl); + struct cxgb4_lld_info *lldi = cxgbi_cdev_priv(cdev); + struct tid_info *t = lldi->tids; + + csk = lookup_tid(t, tid); + if (!csk) + goto rel_skb; + + if (csk) + pr_info_ipaddr("csk 0x%p,%u,0x%lx,%u, status %u.\n", + (&csk->saddr), (&csk->daddr), csk, + csk->state, csk->flags, csk->tid, rpl->status); + + if (rpl->status == CPL_ERR_ABORT_FAILED) + goto rel_skb; + + cxgbi_sock_rcv_abort_rpl(csk); +rel_skb: + __kfree_skb(skb); +} + +static void do_rx_iscsi_hdr(struct cxgbi_device *cdev, struct sk_buff *skb) +{ + struct cxgbi_sock *csk; + struct cpl_iscsi_hdr *cpl = (struct cpl_iscsi_hdr *)skb->data; + unsigned short pdu_len_ddp = be16_to_cpu(cpl->pdu_len_ddp); + unsigned int tid = GET_TID(cpl); + struct cxgb4_lld_info *lldi = cxgbi_cdev_priv(cdev); + struct tid_info *t = lldi->tids; + + csk = lookup_tid(t, tid); + if (unlikely(!csk)) { + pr_err("can't find conn. for tid %u.\n", tid); + goto rel_skb; + } + + log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_PDU_RX, + "csk 0x%p,%u,0x%lx, tid %u, skb 0x%p,%u, 0x%x.\n", + csk, csk->state, csk->flags, csk->tid, skb, skb->len, + pdu_len_ddp); + + spin_lock_bh(&csk->lock); + + if (unlikely(csk->state >= CTP_PASSIVE_CLOSE)) { + log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_SOCK, + "csk 0x%p,%u,0x%lx,%u, bad state.\n", + csk, csk->state, csk->flags, csk->tid); + if (csk->state != CTP_ABORTING) + goto abort_conn; + else + goto discard; + } + + cxgbi_skcb_tcp_seq(skb) = ntohl(cpl->seq); + cxgbi_skcb_flags(skb) = 0; + + skb_reset_transport_header(skb); + __skb_pull(skb, sizeof(*cpl)); + __pskb_trim(skb, ntohs(cpl->len)); + + if (!csk->skb_ulp_lhdr) { + unsigned char *bhs; + unsigned int hlen, dlen, plen; + + log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_PDU_RX, + "csk 0x%p,%u,0x%lx, tid %u, skb 0x%p header.\n", + csk, csk->state, csk->flags, csk->tid, skb); + csk->skb_ulp_lhdr = skb; + cxgbi_skcb_set_flag(skb, SKCBF_RX_HDR); + + if (cxgbi_skcb_tcp_seq(skb) != csk->rcv_nxt) { + pr_info("tid %u, CPL_ISCSI_HDR, bad seq, 0x%x/0x%x.\n", + csk->tid, cxgbi_skcb_tcp_seq(skb), + csk->rcv_nxt); + goto abort_conn; + } + + bhs = skb->data; + hlen = ntohs(cpl->len); + dlen = ntohl(*(unsigned int *)(bhs + 4)) & 0xFFFFFF; + + plen = ISCSI_PDU_LEN(pdu_len_ddp); + if (is_t4(lldi->adapter_type)) + plen -= 40; + + if ((hlen + dlen) != plen) { + pr_info("tid 0x%x, CPL_ISCSI_HDR, pdu len " + "mismatch %u != %u + %u, seq 0x%x.\n", + csk->tid, plen, hlen, dlen, + cxgbi_skcb_tcp_seq(skb)); + goto abort_conn; + } + + cxgbi_skcb_rx_pdulen(skb) = (hlen + dlen + 3) & (~0x3); + if (dlen) + cxgbi_skcb_rx_pdulen(skb) += csk->dcrc_len; + csk->rcv_nxt += cxgbi_skcb_rx_pdulen(skb); + + log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_PDU_RX, + "csk 0x%p, skb 0x%p, 0x%x,%u+%u,0x%x,0x%x.\n", + csk, skb, *bhs, hlen, dlen, + ntohl(*((unsigned int *)(bhs + 16))), + ntohl(*((unsigned int *)(bhs + 24)))); + + } else { + struct sk_buff *lskb = csk->skb_ulp_lhdr; + + cxgbi_skcb_set_flag(lskb, SKCBF_RX_DATA); + log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_PDU_RX, + "csk 0x%p,%u,0x%lx, skb 0x%p data, 0x%p.\n", + csk, csk->state, csk->flags, skb, lskb); + } + + __skb_queue_tail(&csk->receive_queue, skb); + spin_unlock_bh(&csk->lock); + return; + +abort_conn: + send_abort_req(csk); +discard: + spin_unlock_bh(&csk->lock); +rel_skb: + __kfree_skb(skb); +} + +static void do_rx_data_ddp(struct cxgbi_device *cdev, + struct sk_buff *skb) +{ + struct cxgbi_sock *csk; + struct sk_buff *lskb; + struct cpl_rx_data_ddp *rpl = (struct cpl_rx_data_ddp *)skb->data; + unsigned int tid = GET_TID(rpl); + struct cxgb4_lld_info *lldi = cxgbi_cdev_priv(cdev); + struct tid_info *t = lldi->tids; + unsigned int status = ntohl(rpl->ddpvld); + + csk = lookup_tid(t, tid); + if (unlikely(!csk)) { + pr_err("can't find connection for tid %u.\n", tid); + goto rel_skb; + } + + log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_PDU_RX, + "csk 0x%p,%u,0x%lx, skb 0x%p,0x%x, lhdr 0x%p.\n", + csk, csk->state, csk->flags, skb, status, csk->skb_ulp_lhdr); + + spin_lock_bh(&csk->lock); + + if (unlikely(csk->state >= CTP_PASSIVE_CLOSE)) { + log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_SOCK, + "csk 0x%p,%u,0x%lx,%u, bad state.\n", + csk, csk->state, csk->flags, csk->tid); + if (csk->state != CTP_ABORTING) + goto abort_conn; + else + goto discard; + } + + if (!csk->skb_ulp_lhdr) { + pr_err("tid 0x%x, rcv RX_DATA_DDP w/o pdu bhs.\n", csk->tid); + goto abort_conn; + } + + lskb = csk->skb_ulp_lhdr; + csk->skb_ulp_lhdr = NULL; + + cxgbi_skcb_rx_ddigest(lskb) = ntohl(rpl->ulp_crc); + + if (ntohs(rpl->len) != cxgbi_skcb_rx_pdulen(lskb)) + pr_info("tid 0x%x, RX_DATA_DDP pdulen %u != %u.\n", + csk->tid, ntohs(rpl->len), cxgbi_skcb_rx_pdulen(lskb)); + + if (status & (1 << CPL_RX_DDP_STATUS_HCRC_SHIFT)) { + pr_info("csk 0x%p, lhdr 0x%p, status 0x%x, hcrc bad 0x%lx.\n", + csk, lskb, status, cxgbi_skcb_flags(lskb)); + cxgbi_skcb_set_flag(lskb, SKCBF_RX_HCRC_ERR); + } + if (status & (1 << CPL_RX_DDP_STATUS_DCRC_SHIFT)) { + pr_info("csk 0x%p, lhdr 0x%p, status 0x%x, dcrc bad 0x%lx.\n", + csk, lskb, status, cxgbi_skcb_flags(lskb)); + cxgbi_skcb_set_flag(lskb, SKCBF_RX_DCRC_ERR); + } + if (status & (1 << CPL_RX_DDP_STATUS_PAD_SHIFT)) { + log_debug(1 << CXGBI_DBG_PDU_RX, + "csk 0x%p, lhdr 0x%p, status 0x%x, pad bad.\n", + csk, lskb, status); + cxgbi_skcb_set_flag(lskb, SKCBF_RX_PAD_ERR); + } + if ((status & (1 << CPL_RX_DDP_STATUS_DDP_SHIFT)) && + !cxgbi_skcb_test_flag(lskb, SKCBF_RX_DATA)) { + log_debug(1 << CXGBI_DBG_PDU_RX, + "csk 0x%p, lhdr 0x%p, 0x%x, data ddp'ed.\n", + csk, lskb, status); + cxgbi_skcb_set_flag(lskb, SKCBF_RX_DATA_DDPD); + } + log_debug(1 << CXGBI_DBG_PDU_RX, + "csk 0x%p, lskb 0x%p, f 0x%lx.\n", + csk, lskb, cxgbi_skcb_flags(lskb)); + + cxgbi_skcb_set_flag(lskb, SKCBF_RX_STATUS); + cxgbi_conn_pdu_ready(csk); + spin_unlock_bh(&csk->lock); + goto rel_skb; + +abort_conn: + send_abort_req(csk); +discard: + spin_unlock_bh(&csk->lock); +rel_skb: + __kfree_skb(skb); +} + +static void do_fw4_ack(struct cxgbi_device *cdev, struct sk_buff *skb) +{ + struct cxgbi_sock *csk; + struct cpl_fw4_ack *rpl = (struct cpl_fw4_ack *)skb->data; + unsigned int tid = GET_TID(rpl); + struct cxgb4_lld_info *lldi = cxgbi_cdev_priv(cdev); + struct tid_info *t = lldi->tids; + + csk = lookup_tid(t, tid); + if (unlikely(!csk)) + pr_err("can't find connection for tid %u.\n", tid); + else { + log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_SOCK, + "csk 0x%p,%u,0x%lx,%u.\n", + csk, csk->state, csk->flags, csk->tid); + cxgbi_sock_rcv_wr_ack(csk, rpl->credits, ntohl(rpl->snd_una), + rpl->seq_vld); + } + __kfree_skb(skb); +} + +static void do_set_tcb_rpl(struct cxgbi_device *cdev, struct sk_buff *skb) +{ + struct cpl_set_tcb_rpl *rpl = (struct cpl_set_tcb_rpl *)skb->data; + unsigned int tid = GET_TID(rpl); + struct cxgb4_lld_info *lldi = cxgbi_cdev_priv(cdev); + struct tid_info *t = lldi->tids; + struct cxgbi_sock *csk; + + csk = lookup_tid(t, tid); + if (!csk) + pr_err("can't find conn. for tid %u.\n", tid); + + log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_SOCK, + "csk 0x%p,%u,%lx,%u, status 0x%x.\n", + csk, csk->state, csk->flags, csk->tid, rpl->status); + + if (rpl->status != CPL_ERR_NONE) + pr_err("csk 0x%p,%u, SET_TCB_RPL status %u.\n", + csk, tid, rpl->status); + + __kfree_skb(skb); +} + +static int alloc_cpls(struct cxgbi_sock *csk) +{ + csk->cpl_close = alloc_wr(sizeof(struct cpl_close_con_req), + 0, GFP_KERNEL); + if (!csk->cpl_close) + return -ENOMEM; + + csk->cpl_abort_req = alloc_wr(sizeof(struct cpl_abort_req), + 0, GFP_KERNEL); + if (!csk->cpl_abort_req) + goto free_cpls; + + csk->cpl_abort_rpl = alloc_wr(sizeof(struct cpl_abort_rpl), + 0, GFP_KERNEL); + if (!csk->cpl_abort_rpl) + goto free_cpls; + return 0; + +free_cpls: + cxgbi_sock_free_cpl_skbs(csk); + return -ENOMEM; +} + +static inline void l2t_put(struct cxgbi_sock *csk) +{ + if (csk->l2t) { + cxgb4_l2t_release(csk->l2t); + csk->l2t = NULL; + cxgbi_sock_put(csk); + } +} + +static void release_offload_resources(struct cxgbi_sock *csk) +{ + struct cxgb4_lld_info *lldi; + + log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_SOCK, + "csk 0x%p,%u,0x%lx,%u.\n", + csk, csk->state, csk->flags, csk->tid); + + cxgbi_sock_free_cpl_skbs(csk); + if (csk->wr_cred != csk->wr_max_cred) { + cxgbi_sock_purge_wr_queue(csk); + cxgbi_sock_reset_wr_list(csk); + } + + l2t_put(csk); + if (cxgbi_sock_flag(csk, CTPF_HAS_ATID)) + free_atid(csk); + else if (cxgbi_sock_flag(csk, CTPF_HAS_TID)) { + lldi = cxgbi_cdev_priv(csk->cdev); + cxgb4_remove_tid(lldi->tids, 0, csk->tid); + cxgbi_sock_clear_flag(csk, CTPF_HAS_TID); + cxgbi_sock_put(csk); + } + csk->dst = NULL; + csk->cdev = NULL; +} + +static int init_act_open(struct cxgbi_sock *csk) +{ + struct cxgbi_device *cdev = csk->cdev; + struct cxgb4_lld_info *lldi = cxgbi_cdev_priv(cdev); + struct net_device *ndev = cdev->ports[csk->port_id]; + struct sk_buff *skb = NULL; + struct neighbour *n = NULL; + void *daddr; + unsigned int step; + unsigned int size, size6; + int t4 = is_t4(lldi->adapter_type); + + log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_SOCK, + "csk 0x%p,%u,0x%lx,%u.\n", + csk, csk->state, csk->flags, csk->tid); + + if (csk->csk_family == AF_INET) + daddr = &csk->daddr.sin_addr.s_addr; +#if IS_ENABLED(CONFIG_IPV6) + else if (csk->csk_family == AF_INET6) + daddr = &csk->daddr6.sin6_addr; +#endif + else { + pr_err("address family 0x%x not supported\n", csk->csk_family); + goto rel_resource; + } + + n = dst_neigh_lookup(csk->dst, daddr); + + if (!n) { + pr_err("%s, can't get neighbour of csk->dst.\n", ndev->name); + goto rel_resource; + } + + csk->atid = cxgb4_alloc_atid(lldi->tids, csk); + if (csk->atid < 0) { + pr_err("%s, NO atid available.\n", ndev->name); + return -EINVAL; + } + cxgbi_sock_set_flag(csk, CTPF_HAS_ATID); + cxgbi_sock_get(csk); + + csk->l2t = cxgb4_l2t_get(lldi->l2t, n, ndev, 0); + if (!csk->l2t) { + pr_err("%s, cannot alloc l2t.\n", ndev->name); + goto rel_resource; + } + cxgbi_sock_get(csk); + + if (t4) { + size = sizeof(struct cpl_act_open_req); + size6 = sizeof(struct cpl_act_open_req6); + } else { + size = sizeof(struct cpl_t5_act_open_req); + size6 = sizeof(struct cpl_t5_act_open_req6); + } + + if (csk->csk_family == AF_INET) + skb = alloc_wr(size, 0, GFP_NOIO); +#if IS_ENABLED(CONFIG_IPV6) + else + skb = alloc_wr(size6, 0, GFP_NOIO); +#endif + + if (!skb) + goto rel_resource; + skb->sk = (struct sock *)csk; + t4_set_arp_err_handler(skb, csk, cxgbi_sock_act_open_req_arp_failure); + + if (!csk->mtu) + csk->mtu = dst_mtu(csk->dst); + cxgb4_best_mtu(lldi->mtus, csk->mtu, &csk->mss_idx); + csk->tx_chan = cxgb4_port_chan(ndev); + /* SMT two entries per row */ + csk->smac_idx = ((cxgb4_port_viid(ndev) & 0x7F)) << 1; + step = lldi->ntxq / lldi->nchan; + csk->txq_idx = cxgb4_port_idx(ndev) * step; + step = lldi->nrxq / lldi->nchan; + csk->rss_qid = lldi->rxq_ids[cxgb4_port_idx(ndev) * step]; + csk->wr_cred = lldi->wr_cred - + DIV_ROUND_UP(sizeof(struct cpl_abort_req), 16); + csk->wr_max_cred = csk->wr_cred; + csk->wr_una_cred = 0; + cxgbi_sock_reset_wr_list(csk); + csk->err = 0; + + pr_info_ipaddr("csk 0x%p,%u,0x%lx,%u,%u,%u, mtu %u,%u, smac %u.\n", + (&csk->saddr), (&csk->daddr), csk, csk->state, + csk->flags, csk->tx_chan, csk->txq_idx, csk->rss_qid, + csk->mtu, csk->mss_idx, csk->smac_idx); + + /* must wait for either a act_open_rpl or act_open_establish */ + try_module_get(THIS_MODULE); + cxgbi_sock_set_state(csk, CTP_ACTIVE_OPEN); + if (csk->csk_family == AF_INET) + send_act_open_req(csk, skb, csk->l2t); +#if IS_ENABLED(CONFIG_IPV6) + else + send_act_open_req6(csk, skb, csk->l2t); +#endif + neigh_release(n); + + return 0; + +rel_resource: + if (n) + neigh_release(n); + if (skb) + __kfree_skb(skb); + return -EINVAL; +} + +cxgb4i_cplhandler_func cxgb4i_cplhandlers[NUM_CPL_CMDS] = { + [CPL_ACT_ESTABLISH] = do_act_establish, + [CPL_ACT_OPEN_RPL] = do_act_open_rpl, + [CPL_PEER_CLOSE] = do_peer_close, + [CPL_ABORT_REQ_RSS] = do_abort_req_rss, + [CPL_ABORT_RPL_RSS] = do_abort_rpl_rss, + [CPL_CLOSE_CON_RPL] = do_close_con_rpl, + [CPL_FW4_ACK] = do_fw4_ack, + [CPL_ISCSI_HDR] = do_rx_iscsi_hdr, + [CPL_ISCSI_DATA] = do_rx_iscsi_hdr, + [CPL_SET_TCB_RPL] = do_set_tcb_rpl, + [CPL_RX_DATA_DDP] = do_rx_data_ddp, + [CPL_RX_ISCSI_DDP] = do_rx_data_ddp, +}; + +int cxgb4i_ofld_init(struct cxgbi_device *cdev) +{ + int rc; + + if (cxgb4i_max_connect > CXGB4I_MAX_CONN) + cxgb4i_max_connect = CXGB4I_MAX_CONN; + + rc = cxgbi_device_portmap_create(cdev, cxgb4i_sport_base, + cxgb4i_max_connect); + if (rc < 0) + return rc; + + cdev->csk_release_offload_resources = release_offload_resources; + cdev->csk_push_tx_frames = push_tx_frames; + cdev->csk_send_abort_req = send_abort_req; + cdev->csk_send_close_req = send_close_req; + cdev->csk_send_rx_credits = send_rx_credits; + cdev->csk_alloc_cpls = alloc_cpls; + cdev->csk_init_act_open = init_act_open; + + pr_info("cdev 0x%p, offload up, added.\n", cdev); + return 0; +} + +/* + * functions to program the pagepod in h/w + */ +#define ULPMEM_IDATA_MAX_NPPODS 4 /* 256/PPOD_SIZE */ +static inline void ulp_mem_io_set_hdr(struct cxgb4_lld_info *lldi, + struct ulp_mem_io *req, + unsigned int wr_len, unsigned int dlen, + unsigned int pm_addr) +{ + struct ulptx_idata *idata = (struct ulptx_idata *)(req + 1); + + INIT_ULPTX_WR(req, wr_len, 0, 0); + if (is_t4(lldi->adapter_type)) + req->cmd = htonl(ULPTX_CMD(ULP_TX_MEM_WRITE) | + (ULP_MEMIO_ORDER(1))); + else + req->cmd = htonl(ULPTX_CMD(ULP_TX_MEM_WRITE) | + (V_T5_ULP_MEMIO_IMM(1))); + req->dlen = htonl(ULP_MEMIO_DATA_LEN(dlen >> 5)); + req->lock_addr = htonl(ULP_MEMIO_ADDR(pm_addr >> 5)); + req->len16 = htonl(DIV_ROUND_UP(wr_len - sizeof(req->wr), 16)); + + idata->cmd_more = htonl(ULPTX_CMD(ULP_TX_SC_IMM)); + idata->len = htonl(dlen); +} + +static int ddp_ppod_write_idata(struct cxgbi_device *cdev, unsigned int port_id, + struct cxgbi_pagepod_hdr *hdr, unsigned int idx, + unsigned int npods, + struct cxgbi_gather_list *gl, + unsigned int gl_pidx) +{ + struct cxgbi_ddp_info *ddp = cdev->ddp; + struct cxgb4_lld_info *lldi = cxgbi_cdev_priv(cdev); + struct sk_buff *skb; + struct ulp_mem_io *req; + struct ulptx_idata *idata; + struct cxgbi_pagepod *ppod; + unsigned int pm_addr = idx * PPOD_SIZE + ddp->llimit; + unsigned int dlen = PPOD_SIZE * npods; + unsigned int wr_len = roundup(sizeof(struct ulp_mem_io) + + sizeof(struct ulptx_idata) + dlen, 16); + unsigned int i; + + skb = alloc_wr(wr_len, 0, GFP_ATOMIC); + if (!skb) { + pr_err("cdev 0x%p, idx %u, npods %u, OOM.\n", + cdev, idx, npods); + return -ENOMEM; + } + req = (struct ulp_mem_io *)skb->head; + set_queue(skb, CPL_PRIORITY_CONTROL, NULL); + + ulp_mem_io_set_hdr(lldi, req, wr_len, dlen, pm_addr); + idata = (struct ulptx_idata *)(req + 1); + ppod = (struct cxgbi_pagepod *)(idata + 1); + + for (i = 0; i < npods; i++, ppod++, gl_pidx += PPOD_PAGES_MAX) { + if (!hdr && !gl) + cxgbi_ddp_ppod_clear(ppod); + else + cxgbi_ddp_ppod_set(ppod, hdr, gl, gl_pidx); + } + + cxgb4_ofld_send(cdev->ports[port_id], skb); + return 0; +} + +static int ddp_set_map(struct cxgbi_sock *csk, struct cxgbi_pagepod_hdr *hdr, + unsigned int idx, unsigned int npods, + struct cxgbi_gather_list *gl) +{ + unsigned int i, cnt; + int err = 0; + + for (i = 0; i < npods; i += cnt, idx += cnt) { + cnt = npods - i; + if (cnt > ULPMEM_IDATA_MAX_NPPODS) + cnt = ULPMEM_IDATA_MAX_NPPODS; + err = ddp_ppod_write_idata(csk->cdev, csk->port_id, hdr, + idx, cnt, gl, 4 * i); + if (err < 0) + break; + } + return err; +} + +static void ddp_clear_map(struct cxgbi_hba *chba, unsigned int tag, + unsigned int idx, unsigned int npods) +{ + unsigned int i, cnt; + int err; + + for (i = 0; i < npods; i += cnt, idx += cnt) { + cnt = npods - i; + if (cnt > ULPMEM_IDATA_MAX_NPPODS) + cnt = ULPMEM_IDATA_MAX_NPPODS; + err = ddp_ppod_write_idata(chba->cdev, chba->port_id, NULL, + idx, cnt, NULL, 0); + if (err < 0) + break; + } +} + +static int ddp_setup_conn_pgidx(struct cxgbi_sock *csk, unsigned int tid, + int pg_idx, bool reply) +{ + struct sk_buff *skb; + struct cpl_set_tcb_field *req; + + if (!pg_idx || pg_idx >= DDP_PGIDX_MAX) + return 0; + + skb = alloc_wr(sizeof(*req), 0, GFP_KERNEL); + if (!skb) + return -ENOMEM; + + /* set up ulp page size */ + req = (struct cpl_set_tcb_field *)skb->head; + INIT_TP_WR(req, csk->tid); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, csk->tid)); + req->reply_ctrl = htons(NO_REPLY(reply) | QUEUENO(csk->rss_qid)); + req->word_cookie = htons(0); + req->mask = cpu_to_be64(0x3 << 8); + req->val = cpu_to_be64(pg_idx << 8); + set_wr_txq(skb, CPL_PRIORITY_CONTROL, csk->port_id); + + log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_SOCK, + "csk 0x%p, tid 0x%x, pg_idx %u.\n", csk, csk->tid, pg_idx); + + cxgb4_ofld_send(csk->cdev->ports[csk->port_id], skb); + return 0; +} + +static int ddp_setup_conn_digest(struct cxgbi_sock *csk, unsigned int tid, + int hcrc, int dcrc, int reply) +{ + struct sk_buff *skb; + struct cpl_set_tcb_field *req; + + if (!hcrc && !dcrc) + return 0; + + skb = alloc_wr(sizeof(*req), 0, GFP_KERNEL); + if (!skb) + return -ENOMEM; + + csk->hcrc_len = (hcrc ? 4 : 0); + csk->dcrc_len = (dcrc ? 4 : 0); + /* set up ulp submode */ + req = (struct cpl_set_tcb_field *)skb->head; + INIT_TP_WR(req, tid); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, tid)); + req->reply_ctrl = htons(NO_REPLY(reply) | QUEUENO(csk->rss_qid)); + req->word_cookie = htons(0); + req->mask = cpu_to_be64(0x3 << 4); + req->val = cpu_to_be64(((hcrc ? ULP_CRC_HEADER : 0) | + (dcrc ? ULP_CRC_DATA : 0)) << 4); + set_wr_txq(skb, CPL_PRIORITY_CONTROL, csk->port_id); + + log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_SOCK, + "csk 0x%p, tid 0x%x, crc %d,%d.\n", csk, csk->tid, hcrc, dcrc); + + cxgb4_ofld_send(csk->cdev->ports[csk->port_id], skb); + return 0; +} + +static int cxgb4i_ddp_init(struct cxgbi_device *cdev) +{ + struct cxgb4_lld_info *lldi = cxgbi_cdev_priv(cdev); + struct cxgbi_ddp_info *ddp = cdev->ddp; + unsigned int tagmask, pgsz_factor[4]; + int err; + + if (ddp) { + kref_get(&ddp->refcnt); + pr_warn("cdev 0x%p, ddp 0x%p already set up.\n", + cdev, cdev->ddp); + return -EALREADY; + } + + err = cxgbi_ddp_init(cdev, lldi->vr->iscsi.start, + lldi->vr->iscsi.start + lldi->vr->iscsi.size - 1, + lldi->iscsi_iolen, lldi->iscsi_iolen); + if (err < 0) + return err; + + ddp = cdev->ddp; + + tagmask = ddp->idx_mask << PPOD_IDX_SHIFT; + cxgbi_ddp_page_size_factor(pgsz_factor); + cxgb4_iscsi_init(lldi->ports[0], tagmask, pgsz_factor); + + cdev->csk_ddp_setup_digest = ddp_setup_conn_digest; + cdev->csk_ddp_setup_pgidx = ddp_setup_conn_pgidx; + cdev->csk_ddp_set = ddp_set_map; + cdev->csk_ddp_clear = ddp_clear_map; + + pr_info("cxgb4i 0x%p tag: sw %u, rsvd %u,%u, mask 0x%x.\n", + cdev, cdev->tag_format.sw_bits, cdev->tag_format.rsvd_bits, + cdev->tag_format.rsvd_shift, cdev->tag_format.rsvd_mask); + pr_info("cxgb4i 0x%p, nppods %u, bits %u, mask 0x%x,0x%x pkt %u/%u, " + " %u/%u.\n", + cdev, ddp->nppods, ddp->idx_bits, ddp->idx_mask, + ddp->rsvd_tag_mask, ddp->max_txsz, lldi->iscsi_iolen, + ddp->max_rxsz, lldi->iscsi_iolen); + pr_info("cxgb4i 0x%p max payload size: %u/%u, %u/%u.\n", + cdev, cdev->tx_max_size, ddp->max_txsz, cdev->rx_max_size, + ddp->max_rxsz); + return 0; +} + +static void *t4_uld_add(const struct cxgb4_lld_info *lldi) +{ + struct cxgbi_device *cdev; + struct port_info *pi; + int i, rc; + + cdev = cxgbi_device_register(sizeof(*lldi), lldi->nports); + if (!cdev) { + pr_info("t4 device 0x%p, register failed.\n", lldi); + return NULL; + } + pr_info("0x%p,0x%x, ports %u,%s, chan %u, q %u,%u, wr %u.\n", + cdev, lldi->adapter_type, lldi->nports, + lldi->ports[0]->name, lldi->nchan, lldi->ntxq, + lldi->nrxq, lldi->wr_cred); + for (i = 0; i < lldi->nrxq; i++) + log_debug(1 << CXGBI_DBG_DEV, + "t4 0x%p, rxq id #%d: %u.\n", + cdev, i, lldi->rxq_ids[i]); + + memcpy(cxgbi_cdev_priv(cdev), lldi, sizeof(*lldi)); + cdev->flags = CXGBI_FLAG_DEV_T4; + cdev->pdev = lldi->pdev; + cdev->ports = lldi->ports; + cdev->nports = lldi->nports; + cdev->mtus = lldi->mtus; + cdev->nmtus = NMTUS; + cdev->snd_win = cxgb4i_snd_win; + cdev->rcv_win = cxgb4i_rcv_win; + cdev->rx_credit_thres = cxgb4i_rx_credit_thres; + cdev->skb_tx_rsvd = CXGB4I_TX_HEADER_LEN; + cdev->skb_rx_extra = sizeof(struct cpl_iscsi_hdr); + cdev->itp = &cxgb4i_iscsi_transport; + + cdev->pfvf = FW_VIID_PFN_GET(cxgb4_port_viid(lldi->ports[0])) << 8; + pr_info("cdev 0x%p,%s, pfvf %u.\n", + cdev, lldi->ports[0]->name, cdev->pfvf); + + rc = cxgb4i_ddp_init(cdev); + if (rc) { + pr_info("t4 0x%p ddp init failed.\n", cdev); + goto err_out; + } + rc = cxgb4i_ofld_init(cdev); + if (rc) { + pr_info("t4 0x%p ofld init failed.\n", cdev); + goto err_out; + } + + rc = cxgbi_hbas_add(cdev, CXGB4I_MAX_LUN, CXGBI_MAX_CONN, + &cxgb4i_host_template, cxgb4i_stt); + if (rc) + goto err_out; + + for (i = 0; i < cdev->nports; i++) { + pi = netdev_priv(lldi->ports[i]); + cdev->hbas[i]->port_id = pi->port_id; + } + return cdev; + +err_out: + cxgbi_device_unregister(cdev); + return ERR_PTR(-ENOMEM); +} + +#define RX_PULL_LEN 128 +static int t4_uld_rx_handler(void *handle, const __be64 *rsp, + const struct pkt_gl *pgl) +{ + const struct cpl_act_establish *rpl; + struct sk_buff *skb; + unsigned int opc; + struct cxgbi_device *cdev = handle; + + if (pgl == NULL) { + unsigned int len = 64 - sizeof(struct rsp_ctrl) - 8; + + skb = alloc_wr(len, 0, GFP_ATOMIC); + if (!skb) + goto nomem; + skb_copy_to_linear_data(skb, &rsp[1], len); + } else { + if (unlikely(*(u8 *)rsp != *(u8 *)pgl->va)) { + pr_info("? FL 0x%p,RSS%#llx,FL %#llx,len %u.\n", + pgl->va, be64_to_cpu(*rsp), + be64_to_cpu(*(u64 *)pgl->va), + pgl->tot_len); + return 0; + } + skb = cxgb4_pktgl_to_skb(pgl, RX_PULL_LEN, RX_PULL_LEN); + if (unlikely(!skb)) + goto nomem; + } + + rpl = (struct cpl_act_establish *)skb->data; + opc = rpl->ot.opcode; + log_debug(1 << CXGBI_DBG_TOE, + "cdev %p, opcode 0x%x(0x%x,0x%x), skb %p.\n", + cdev, opc, rpl->ot.opcode_tid, ntohl(rpl->ot.opcode_tid), skb); + if (cxgb4i_cplhandlers[opc]) + cxgb4i_cplhandlers[opc](cdev, skb); + else { + pr_err("No handler for opcode 0x%x.\n", opc); + __kfree_skb(skb); + } + return 0; +nomem: + log_debug(1 << CXGBI_DBG_TOE, "OOM bailing out.\n"); + return 1; +} + +static int t4_uld_state_change(void *handle, enum cxgb4_state state) +{ + struct cxgbi_device *cdev = handle; + + switch (state) { + case CXGB4_STATE_UP: + pr_info("cdev 0x%p, UP.\n", cdev); + break; + case CXGB4_STATE_START_RECOVERY: + pr_info("cdev 0x%p, RECOVERY.\n", cdev); + /* close all connections */ + break; + case CXGB4_STATE_DOWN: + pr_info("cdev 0x%p, DOWN.\n", cdev); + break; + case CXGB4_STATE_DETACH: + pr_info("cdev 0x%p, DETACH.\n", cdev); + cxgbi_device_unregister(cdev); + break; + default: + pr_info("cdev 0x%p, unknown state %d.\n", cdev, state); + break; + } + return 0; +} + +static int __init cxgb4i_init_module(void) +{ + int rc; + + printk(KERN_INFO "%s", version); + + rc = cxgbi_iscsi_init(&cxgb4i_iscsi_transport, &cxgb4i_stt); + if (rc < 0) + return rc; + cxgb4_register_uld(CXGB4_ULD_ISCSI, &cxgb4i_uld_info); + + return 0; +} + +static void __exit cxgb4i_exit_module(void) +{ + cxgb4_unregister_uld(CXGB4_ULD_ISCSI); + cxgbi_device_unregister_all(CXGBI_FLAG_DEV_T4); + cxgbi_iscsi_cleanup(&cxgb4i_iscsi_transport, &cxgb4i_stt); +} + +module_init(cxgb4i_init_module); +module_exit(cxgb4i_exit_module); diff --git a/drivers/scsi/cxgbi/cxgb4i/cxgb4i.h b/drivers/scsi/cxgbi/cxgb4i/cxgb4i.h new file mode 100644 index 0000000..1096026 --- /dev/null +++ b/drivers/scsi/cxgbi/cxgb4i/cxgb4i.h @@ -0,0 +1,43 @@ +/* + * cxgb4i.h: Chelsio T4 iSCSI driver. + * + * Copyright (c) 2010 Chelsio Communications, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation. + * + * Written by: Karen Xie (kxie@chelsio.com) + * Written by: Rakesh Ranjan (rranjan@chelsio.com) + */ + +#ifndef __CXGB4I_H__ +#define __CXGB4I_H__ + +#define CXGB4I_SCSI_HOST_QDEPTH 1024 +#define CXGB4I_MAX_CONN 16384 +#define CXGB4I_MAX_TARGET CXGB4I_MAX_CONN +#define CXGB4I_MAX_LUN 0x1000 + +/* for TX: a skb must have a headroom of at least TX_HEADER_LEN bytes */ +#define CXGB4I_TX_HEADER_LEN \ + (sizeof(struct fw_ofld_tx_data_wr) + sizeof(struct sge_opaque_hdr)) + +struct ulptx_idata { + __be32 cmd_more; + __be32 len; +}; + +struct cpl_rx_data_ddp { + union opcode_tid ot; + __be16 urg; + __be16 len; + __be32 seq; + union { + __be32 nxt_seq; + __be32 ddp_report; + }; + __be32 ulp_crc; + __be32 ddpvld; +}; +#endif /* __CXGB4I_H__ */ diff --git a/drivers/scsi/cxgbi/libcxgbi.c b/drivers/scsi/cxgbi/libcxgbi.c new file mode 100644 index 0000000..7da59c3 --- /dev/null +++ b/drivers/scsi/cxgbi/libcxgbi.c @@ -0,0 +1,2929 @@ +/* + * libcxgbi.c: Chelsio common library for T3/T4 iSCSI driver. + * + * Copyright (c) 2010 Chelsio Communications, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation. + * + * Written by: Karen Xie (kxie@chelsio.com) + * Written by: Rakesh Ranjan (rranjan@chelsio.com) + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ":%s: " fmt, __func__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include /* ip_dev_find */ +#include +#include + +static unsigned int dbg_level; + +#include "libcxgbi.h" + +#define DRV_MODULE_NAME "libcxgbi" +#define DRV_MODULE_DESC "Chelsio iSCSI driver library" +#define DRV_MODULE_VERSION "0.9.0" +#define DRV_MODULE_RELDATE "Jun. 2010" + +MODULE_AUTHOR("Chelsio Communications, Inc."); +MODULE_DESCRIPTION(DRV_MODULE_DESC); +MODULE_VERSION(DRV_MODULE_VERSION); +MODULE_LICENSE("GPL"); + +module_param(dbg_level, uint, 0644); +MODULE_PARM_DESC(dbg_level, "libiscsi debug level (default=0)"); + + +/* + * cxgbi device management + * maintains a list of the cxgbi devices + */ +static LIST_HEAD(cdev_list); +static DEFINE_MUTEX(cdev_mutex); + +static LIST_HEAD(cdev_rcu_list); +static DEFINE_SPINLOCK(cdev_rcu_lock); + +int cxgbi_device_portmap_create(struct cxgbi_device *cdev, unsigned int base, + unsigned int max_conn) +{ + struct cxgbi_ports_map *pmap = &cdev->pmap; + + pmap->port_csk = cxgbi_alloc_big_mem(max_conn * + sizeof(struct cxgbi_sock *), + GFP_KERNEL); + if (!pmap->port_csk) { + pr_warn("cdev 0x%p, portmap OOM %u.\n", cdev, max_conn); + return -ENOMEM; + } + + pmap->max_connect = max_conn; + pmap->sport_base = base; + spin_lock_init(&pmap->lock); + return 0; +} +EXPORT_SYMBOL_GPL(cxgbi_device_portmap_create); + +void cxgbi_device_portmap_cleanup(struct cxgbi_device *cdev) +{ + struct cxgbi_ports_map *pmap = &cdev->pmap; + struct cxgbi_sock *csk; + int i; + + for (i = 0; i < pmap->max_connect; i++) { + if (pmap->port_csk[i]) { + csk = pmap->port_csk[i]; + pmap->port_csk[i] = NULL; + log_debug(1 << CXGBI_DBG_SOCK, + "csk 0x%p, cdev 0x%p, offload down.\n", + csk, cdev); + spin_lock_bh(&csk->lock); + cxgbi_sock_set_flag(csk, CTPF_OFFLOAD_DOWN); + cxgbi_sock_closed(csk); + spin_unlock_bh(&csk->lock); + cxgbi_sock_put(csk); + } + } +} +EXPORT_SYMBOL_GPL(cxgbi_device_portmap_cleanup); + +static inline void cxgbi_device_destroy(struct cxgbi_device *cdev) +{ + log_debug(1 << CXGBI_DBG_DEV, + "cdev 0x%p, p# %u.\n", cdev, cdev->nports); + cxgbi_hbas_remove(cdev); + cxgbi_device_portmap_cleanup(cdev); + if (cdev->dev_ddp_cleanup) + cdev->dev_ddp_cleanup(cdev); + else + cxgbi_ddp_cleanup(cdev); + if (cdev->ddp) + cxgbi_ddp_cleanup(cdev); + if (cdev->pmap.max_connect) + cxgbi_free_big_mem(cdev->pmap.port_csk); + kfree(cdev); +} + +struct cxgbi_device *cxgbi_device_register(unsigned int extra, + unsigned int nports) +{ + struct cxgbi_device *cdev; + + cdev = kzalloc(sizeof(*cdev) + extra + nports * + (sizeof(struct cxgbi_hba *) + + sizeof(struct net_device *)), + GFP_KERNEL); + if (!cdev) { + pr_warn("nport %d, OOM.\n", nports); + return NULL; + } + cdev->ports = (struct net_device **)(cdev + 1); + cdev->hbas = (struct cxgbi_hba **)(((char*)cdev->ports) + nports * + sizeof(struct net_device *)); + if (extra) + cdev->dd_data = ((char *)cdev->hbas) + + nports * sizeof(struct cxgbi_hba *); + spin_lock_init(&cdev->pmap.lock); + + mutex_lock(&cdev_mutex); + list_add_tail(&cdev->list_head, &cdev_list); + mutex_unlock(&cdev_mutex); + + spin_lock(&cdev_rcu_lock); + list_add_tail_rcu(&cdev->rcu_node, &cdev_rcu_list); + spin_unlock(&cdev_rcu_lock); + + log_debug(1 << CXGBI_DBG_DEV, + "cdev 0x%p, p# %u.\n", cdev, nports); + return cdev; +} +EXPORT_SYMBOL_GPL(cxgbi_device_register); + +void cxgbi_device_unregister(struct cxgbi_device *cdev) +{ + log_debug(1 << CXGBI_DBG_DEV, + "cdev 0x%p, p# %u,%s.\n", + cdev, cdev->nports, cdev->nports ? cdev->ports[0]->name : ""); + + mutex_lock(&cdev_mutex); + list_del(&cdev->list_head); + mutex_unlock(&cdev_mutex); + + spin_lock(&cdev_rcu_lock); + list_del_rcu(&cdev->rcu_node); + spin_unlock(&cdev_rcu_lock); + synchronize_rcu(); + + cxgbi_device_destroy(cdev); +} +EXPORT_SYMBOL_GPL(cxgbi_device_unregister); + +void cxgbi_device_unregister_all(unsigned int flag) +{ + struct cxgbi_device *cdev, *tmp; + + mutex_lock(&cdev_mutex); + list_for_each_entry_safe(cdev, tmp, &cdev_list, list_head) { + if ((cdev->flags & flag) == flag) { + mutex_unlock(&cdev_mutex); + cxgbi_device_unregister(cdev); + mutex_lock(&cdev_mutex); + } + } + mutex_unlock(&cdev_mutex); +} +EXPORT_SYMBOL_GPL(cxgbi_device_unregister_all); + +struct cxgbi_device *cxgbi_device_find_by_lldev(void *lldev) +{ + struct cxgbi_device *cdev, *tmp; + + mutex_lock(&cdev_mutex); + list_for_each_entry_safe(cdev, tmp, &cdev_list, list_head) { + if (cdev->lldev == lldev) { + mutex_unlock(&cdev_mutex); + return cdev; + } + } + mutex_unlock(&cdev_mutex); + + log_debug(1 << CXGBI_DBG_DEV, + "lldev 0x%p, NO match found.\n", lldev); + return NULL; +} +EXPORT_SYMBOL_GPL(cxgbi_device_find_by_lldev); + +struct cxgbi_device *cxgbi_device_find_by_netdev(struct net_device *ndev, + int *port) +{ + struct net_device *vdev = NULL; + struct cxgbi_device *cdev, *tmp; + int i; + + if (ndev->priv_flags & IFF_802_1Q_VLAN) { + vdev = ndev; + ndev = vlan_dev_real_dev(ndev); + log_debug(1 << CXGBI_DBG_DEV, + "vlan dev %s -> %s.\n", vdev->name, ndev->name); + } + + mutex_lock(&cdev_mutex); + list_for_each_entry_safe(cdev, tmp, &cdev_list, list_head) { + for (i = 0; i < cdev->nports; i++) { + if (ndev == cdev->ports[i]) { + cdev->hbas[i]->vdev = vdev; + mutex_unlock(&cdev_mutex); + if (port) + *port = i; + return cdev; + } + } + } + mutex_unlock(&cdev_mutex); + log_debug(1 << CXGBI_DBG_DEV, + "ndev 0x%p, %s, NO match found.\n", ndev, ndev->name); + return NULL; +} +EXPORT_SYMBOL_GPL(cxgbi_device_find_by_netdev); + +struct cxgbi_device *cxgbi_device_find_by_netdev_rcu(struct net_device *ndev, + int *port) +{ + struct net_device *vdev = NULL; + struct cxgbi_device *cdev; + int i; + + if (ndev->priv_flags & IFF_802_1Q_VLAN) { + vdev = ndev; + ndev = vlan_dev_real_dev(ndev); + pr_info("vlan dev %s -> %s.\n", vdev->name, ndev->name); + } + + rcu_read_lock(); + list_for_each_entry_rcu(cdev, &cdev_rcu_list, rcu_node) { + for (i = 0; i < cdev->nports; i++) { + if (ndev == cdev->ports[i]) { + cdev->hbas[i]->vdev = vdev; + rcu_read_unlock(); + if (port) + *port = i; + return cdev; + } + } + } + rcu_read_unlock(); + + log_debug(1 << CXGBI_DBG_DEV, + "ndev 0x%p, %s, NO match found.\n", ndev, ndev->name); + return NULL; +} +EXPORT_SYMBOL_GPL(cxgbi_device_find_by_netdev_rcu); + +#if IS_ENABLED(CONFIG_IPV6) +static struct cxgbi_device *cxgbi_device_find_by_mac(struct net_device *ndev, + int *port) +{ + struct net_device *vdev = NULL; + struct cxgbi_device *cdev, *tmp; + int i; + + if (ndev->priv_flags & IFF_802_1Q_VLAN) { + vdev = ndev; + ndev = vlan_dev_real_dev(ndev); + pr_info("vlan dev %s -> %s.\n", vdev->name, ndev->name); + } + + mutex_lock(&cdev_mutex); + list_for_each_entry_safe(cdev, tmp, &cdev_list, list_head) { + for (i = 0; i < cdev->nports; i++) { + if (!memcmp(ndev->dev_addr, cdev->ports[i]->dev_addr, + MAX_ADDR_LEN)) { + cdev->hbas[i]->vdev = vdev; + mutex_unlock(&cdev_mutex); + if (port) + *port = i; + return cdev; + } + } + } + mutex_unlock(&cdev_mutex); + log_debug(1 << CXGBI_DBG_DEV, + "ndev 0x%p, %s, NO match mac found.\n", + ndev, ndev->name); + return NULL; +} +#endif + +void cxgbi_hbas_remove(struct cxgbi_device *cdev) +{ + int i; + struct cxgbi_hba *chba; + + log_debug(1 << CXGBI_DBG_DEV, + "cdev 0x%p, p#%u.\n", cdev, cdev->nports); + + for (i = 0; i < cdev->nports; i++) { + chba = cdev->hbas[i]; + if (chba) { + cdev->hbas[i] = NULL; + iscsi_host_remove(chba->shost); + pci_dev_put(cdev->pdev); + iscsi_host_free(chba->shost); + } + } +} +EXPORT_SYMBOL_GPL(cxgbi_hbas_remove); + +int cxgbi_hbas_add(struct cxgbi_device *cdev, u64 max_lun, + unsigned int max_id, struct scsi_host_template *sht, + struct scsi_transport_template *stt) +{ + struct cxgbi_hba *chba; + struct Scsi_Host *shost; + int i, err; + + log_debug(1 << CXGBI_DBG_DEV, "cdev 0x%p, p#%u.\n", cdev, cdev->nports); + + for (i = 0; i < cdev->nports; i++) { + shost = iscsi_host_alloc(sht, sizeof(*chba), 1); + if (!shost) { + pr_info("0x%p, p%d, %s, host alloc failed.\n", + cdev, i, cdev->ports[i]->name); + err = -ENOMEM; + goto err_out; + } + + shost->transportt = stt; + shost->max_lun = max_lun; + shost->max_id = max_id; + shost->max_channel = 0; + shost->max_cmd_len = 16; + + chba = iscsi_host_priv(shost); + chba->cdev = cdev; + chba->ndev = cdev->ports[i]; + chba->shost = shost; + + log_debug(1 << CXGBI_DBG_DEV, + "cdev 0x%p, p#%d %s: chba 0x%p.\n", + cdev, i, cdev->ports[i]->name, chba); + + pci_dev_get(cdev->pdev); + err = iscsi_host_add(shost, &cdev->pdev->dev); + if (err) { + pr_info("cdev 0x%p, p#%d %s, host add failed.\n", + cdev, i, cdev->ports[i]->name); + pci_dev_put(cdev->pdev); + scsi_host_put(shost); + goto err_out; + } + + cdev->hbas[i] = chba; + } + + return 0; + +err_out: + cxgbi_hbas_remove(cdev); + return err; +} +EXPORT_SYMBOL_GPL(cxgbi_hbas_add); + +/* + * iSCSI offload + * + * - source port management + * To find a free source port in the port allocation map we use a very simple + * rotor scheme to look for the next free port. + * + * If a source port has been specified make sure that it doesn't collide with + * our normal source port allocation map. If it's outside the range of our + * allocation/deallocation scheme just let them use it. + * + * If the source port is outside our allocation range, the caller is + * responsible for keeping track of their port usage. + */ + +static struct cxgbi_sock *find_sock_on_port(struct cxgbi_device *cdev, + unsigned char port_id) +{ + struct cxgbi_ports_map *pmap = &cdev->pmap; + unsigned int i; + unsigned int used; + + if (!pmap->max_connect || !pmap->used) + return NULL; + + spin_lock_bh(&pmap->lock); + used = pmap->used; + for (i = 0; used && i < pmap->max_connect; i++) { + struct cxgbi_sock *csk = pmap->port_csk[i]; + + if (csk) { + if (csk->port_id == port_id) { + spin_unlock_bh(&pmap->lock); + return csk; + } + used--; + } + } + spin_unlock_bh(&pmap->lock); + + return NULL; +} + +static int sock_get_port(struct cxgbi_sock *csk) +{ + struct cxgbi_device *cdev = csk->cdev; + struct cxgbi_ports_map *pmap = &cdev->pmap; + unsigned int start; + int idx; + __be16 *port; + + if (!pmap->max_connect) { + pr_err("cdev 0x%p, p#%u %s, NO port map.\n", + cdev, csk->port_id, cdev->ports[csk->port_id]->name); + return -EADDRNOTAVAIL; + } + + if (csk->csk_family == AF_INET) + port = &csk->saddr.sin_port; + else /* ipv6 */ + port = &csk->saddr6.sin6_port; + + if (*port) { + pr_err("source port NON-ZERO %u.\n", + ntohs(*port)); + return -EADDRINUSE; + } + + spin_lock_bh(&pmap->lock); + if (pmap->used >= pmap->max_connect) { + spin_unlock_bh(&pmap->lock); + pr_info("cdev 0x%p, p#%u %s, ALL ports used.\n", + cdev, csk->port_id, cdev->ports[csk->port_id]->name); + return -EADDRNOTAVAIL; + } + + start = idx = pmap->next; + do { + if (++idx >= pmap->max_connect) + idx = 0; + if (!pmap->port_csk[idx]) { + pmap->used++; + *port = htons(pmap->sport_base + idx); + pmap->next = idx; + pmap->port_csk[idx] = csk; + spin_unlock_bh(&pmap->lock); + cxgbi_sock_get(csk); + log_debug(1 << CXGBI_DBG_SOCK, + "cdev 0x%p, p#%u %s, p %u, %u.\n", + cdev, csk->port_id, + cdev->ports[csk->port_id]->name, + pmap->sport_base + idx, pmap->next); + return 0; + } + } while (idx != start); + spin_unlock_bh(&pmap->lock); + + /* should not happen */ + pr_warn("cdev 0x%p, p#%u %s, next %u?\n", + cdev, csk->port_id, cdev->ports[csk->port_id]->name, + pmap->next); + return -EADDRNOTAVAIL; +} + +static void sock_put_port(struct cxgbi_sock *csk) +{ + struct cxgbi_device *cdev = csk->cdev; + struct cxgbi_ports_map *pmap = &cdev->pmap; + __be16 *port; + + if (csk->csk_family == AF_INET) + port = &csk->saddr.sin_port; + else /* ipv6 */ + port = &csk->saddr6.sin6_port; + + if (*port) { + int idx = ntohs(*port) - pmap->sport_base; + + *port = 0; + if (idx < 0 || idx >= pmap->max_connect) { + pr_err("cdev 0x%p, p#%u %s, port %u OOR.\n", + cdev, csk->port_id, + cdev->ports[csk->port_id]->name, + ntohs(*port)); + return; + } + + spin_lock_bh(&pmap->lock); + pmap->port_csk[idx] = NULL; + pmap->used--; + spin_unlock_bh(&pmap->lock); + + log_debug(1 << CXGBI_DBG_SOCK, + "cdev 0x%p, p#%u %s, release %u.\n", + cdev, csk->port_id, cdev->ports[csk->port_id]->name, + pmap->sport_base + idx); + + cxgbi_sock_put(csk); + } +} + +/* + * iscsi tcp connection + */ +void cxgbi_sock_free_cpl_skbs(struct cxgbi_sock *csk) +{ + if (csk->cpl_close) { + kfree_skb(csk->cpl_close); + csk->cpl_close = NULL; + } + if (csk->cpl_abort_req) { + kfree_skb(csk->cpl_abort_req); + csk->cpl_abort_req = NULL; + } + if (csk->cpl_abort_rpl) { + kfree_skb(csk->cpl_abort_rpl); + csk->cpl_abort_rpl = NULL; + } +} +EXPORT_SYMBOL_GPL(cxgbi_sock_free_cpl_skbs); + +static struct cxgbi_sock *cxgbi_sock_create(struct cxgbi_device *cdev) +{ + struct cxgbi_sock *csk = kzalloc(sizeof(*csk), GFP_NOIO); + + if (!csk) { + pr_info("alloc csk %zu failed.\n", sizeof(*csk)); + return NULL; + } + + if (cdev->csk_alloc_cpls(csk) < 0) { + pr_info("csk 0x%p, alloc cpls failed.\n", csk); + kfree(csk); + return NULL; + } + + spin_lock_init(&csk->lock); + kref_init(&csk->refcnt); + skb_queue_head_init(&csk->receive_queue); + skb_queue_head_init(&csk->write_queue); + setup_timer(&csk->retry_timer, NULL, (unsigned long)csk); + rwlock_init(&csk->callback_lock); + csk->cdev = cdev; + csk->flags = 0; + cxgbi_sock_set_state(csk, CTP_CLOSED); + + log_debug(1 << CXGBI_DBG_SOCK, "cdev 0x%p, new csk 0x%p.\n", cdev, csk); + + return csk; +} + +static struct rtable *find_route_ipv4(struct flowi4 *fl4, + __be32 saddr, __be32 daddr, + __be16 sport, __be16 dport, u8 tos) +{ + struct rtable *rt; + + rt = ip_route_output_ports(&init_net, fl4, NULL, daddr, saddr, + dport, sport, IPPROTO_TCP, tos, 0); + if (IS_ERR(rt)) + return NULL; + + return rt; +} + +static struct cxgbi_sock *cxgbi_check_route(struct sockaddr *dst_addr) +{ + struct sockaddr_in *daddr = (struct sockaddr_in *)dst_addr; + struct dst_entry *dst; + struct net_device *ndev; + struct cxgbi_device *cdev; + struct rtable *rt = NULL; + struct neighbour *n; + struct flowi4 fl4; + struct cxgbi_sock *csk = NULL; + unsigned int mtu = 0; + int port = 0xFFFF; + int err = 0; + + rt = find_route_ipv4(&fl4, 0, daddr->sin_addr.s_addr, 0, daddr->sin_port, 0); + if (!rt) { + pr_info("no route to ipv4 0x%x, port %u.\n", + be32_to_cpu(daddr->sin_addr.s_addr), + be16_to_cpu(daddr->sin_port)); + err = -ENETUNREACH; + goto err_out; + } + dst = &rt->dst; + n = dst_neigh_lookup(dst, &daddr->sin_addr.s_addr); + if (!n) { + err = -ENODEV; + goto rel_rt; + } + ndev = n->dev; + + if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) { + pr_info("multi-cast route %pI4, port %u, dev %s.\n", + &daddr->sin_addr.s_addr, ntohs(daddr->sin_port), + ndev->name); + err = -ENETUNREACH; + goto rel_neigh; + } + + if (ndev->flags & IFF_LOOPBACK) { + ndev = ip_dev_find(&init_net, daddr->sin_addr.s_addr); + mtu = ndev->mtu; + pr_info("rt dev %s, loopback -> %s, mtu %u.\n", + n->dev->name, ndev->name, mtu); + } + + cdev = cxgbi_device_find_by_netdev(ndev, &port); + if (!cdev) { + pr_info("dst %pI4, %s, NOT cxgbi device.\n", + &daddr->sin_addr.s_addr, ndev->name); + err = -ENETUNREACH; + goto rel_neigh; + } + log_debug(1 << CXGBI_DBG_SOCK, + "route to %pI4 :%u, ndev p#%d,%s, cdev 0x%p.\n", + &daddr->sin_addr.s_addr, ntohs(daddr->sin_port), + port, ndev->name, cdev); + + csk = cxgbi_sock_create(cdev); + if (!csk) { + err = -ENOMEM; + goto rel_neigh; + } + csk->cdev = cdev; + csk->port_id = port; + csk->mtu = mtu; + csk->dst = dst; + + csk->csk_family = AF_INET; + csk->daddr.sin_addr.s_addr = daddr->sin_addr.s_addr; + csk->daddr.sin_port = daddr->sin_port; + csk->daddr.sin_family = daddr->sin_family; + csk->saddr.sin_family = daddr->sin_family; + csk->saddr.sin_addr.s_addr = fl4.saddr; + neigh_release(n); + + return csk; + +rel_neigh: + neigh_release(n); + +rel_rt: + ip_rt_put(rt); + if (csk) + cxgbi_sock_closed(csk); +err_out: + return ERR_PTR(err); +} + +#if IS_ENABLED(CONFIG_IPV6) +static struct rt6_info *find_route_ipv6(const struct in6_addr *saddr, + const struct in6_addr *daddr) +{ + struct flowi6 fl; + + if (saddr) + memcpy(&fl.saddr, saddr, sizeof(struct in6_addr)); + if (daddr) + memcpy(&fl.daddr, daddr, sizeof(struct in6_addr)); + return (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl); +} + +static struct cxgbi_sock *cxgbi_check_route6(struct sockaddr *dst_addr) +{ + struct sockaddr_in6 *daddr6 = (struct sockaddr_in6 *)dst_addr; + struct dst_entry *dst; + struct net_device *ndev; + struct cxgbi_device *cdev; + struct rt6_info *rt = NULL; + struct neighbour *n; + struct in6_addr pref_saddr; + struct cxgbi_sock *csk = NULL; + unsigned int mtu = 0; + int port = 0xFFFF; + int err = 0; + + rt = find_route_ipv6(NULL, &daddr6->sin6_addr); + + if (!rt) { + pr_info("no route to ipv6 %pI6 port %u\n", + daddr6->sin6_addr.s6_addr, + be16_to_cpu(daddr6->sin6_port)); + err = -ENETUNREACH; + goto err_out; + } + + dst = &rt->dst; + + n = dst_neigh_lookup(dst, &daddr6->sin6_addr); + + if (!n) { + pr_info("%pI6, port %u, dst no neighbour.\n", + daddr6->sin6_addr.s6_addr, + be16_to_cpu(daddr6->sin6_port)); + err = -ENETUNREACH; + goto rel_rt; + } + ndev = n->dev; + + if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) { + pr_info("multi-cast route %pI6 port %u, dev %s.\n", + daddr6->sin6_addr.s6_addr, + ntohs(daddr6->sin6_port), ndev->name); + err = -ENETUNREACH; + goto rel_rt; + } + + cdev = cxgbi_device_find_by_netdev(ndev, &port); + if (!cdev) + cdev = cxgbi_device_find_by_mac(ndev, &port); + if (!cdev) { + pr_info("dst %pI6 %s, NOT cxgbi device.\n", + daddr6->sin6_addr.s6_addr, ndev->name); + err = -ENETUNREACH; + goto rel_rt; + } + log_debug(1 << CXGBI_DBG_SOCK, + "route to %pI6 :%u, ndev p#%d,%s, cdev 0x%p.\n", + daddr6->sin6_addr.s6_addr, ntohs(daddr6->sin6_port), port, + ndev->name, cdev); + + csk = cxgbi_sock_create(cdev); + if (!csk) { + err = -ENOMEM; + goto rel_rt; + } + csk->cdev = cdev; + csk->port_id = port; + csk->mtu = mtu; + csk->dst = dst; + + if (ipv6_addr_any(&rt->rt6i_prefsrc.addr)) { + struct inet6_dev *idev = ip6_dst_idev((struct dst_entry *)rt); + + err = ipv6_dev_get_saddr(&init_net, idev ? idev->dev : NULL, + &daddr6->sin6_addr, 0, &pref_saddr); + if (err) { + pr_info("failed to get source address to reach %pI6\n", + &daddr6->sin6_addr); + goto rel_rt; + } + } else { + pref_saddr = rt->rt6i_prefsrc.addr; + } + + csk->csk_family = AF_INET6; + csk->daddr6.sin6_addr = daddr6->sin6_addr; + csk->daddr6.sin6_port = daddr6->sin6_port; + csk->daddr6.sin6_family = daddr6->sin6_family; + csk->saddr6.sin6_family = daddr6->sin6_family; + csk->saddr6.sin6_addr = pref_saddr; + + neigh_release(n); + return csk; + +rel_rt: + if (n) + neigh_release(n); + + ip6_rt_put(rt); + if (csk) + cxgbi_sock_closed(csk); +err_out: + return ERR_PTR(err); +} +#endif /* IS_ENABLED(CONFIG_IPV6) */ + +void cxgbi_sock_established(struct cxgbi_sock *csk, unsigned int snd_isn, + unsigned int opt) +{ + csk->write_seq = csk->snd_nxt = csk->snd_una = snd_isn; + dst_confirm(csk->dst); + smp_mb(); + cxgbi_sock_set_state(csk, CTP_ESTABLISHED); +} +EXPORT_SYMBOL_GPL(cxgbi_sock_established); + +static void cxgbi_inform_iscsi_conn_closing(struct cxgbi_sock *csk) +{ + log_debug(1 << CXGBI_DBG_SOCK, + "csk 0x%p, state %u, flags 0x%lx, conn 0x%p.\n", + csk, csk->state, csk->flags, csk->user_data); + + if (csk->state != CTP_ESTABLISHED) { + read_lock_bh(&csk->callback_lock); + if (csk->user_data) + iscsi_conn_failure(csk->user_data, + ISCSI_ERR_TCP_CONN_CLOSE); + read_unlock_bh(&csk->callback_lock); + } +} + +void cxgbi_sock_closed(struct cxgbi_sock *csk) +{ + log_debug(1 << CXGBI_DBG_SOCK, "csk 0x%p,%u,0x%lx,%u.\n", + csk, (csk)->state, (csk)->flags, (csk)->tid); + cxgbi_sock_set_flag(csk, CTPF_ACTIVE_CLOSE_NEEDED); + if (csk->state == CTP_ACTIVE_OPEN || csk->state == CTP_CLOSED) + return; + if (csk->saddr.sin_port) + sock_put_port(csk); + if (csk->dst) + dst_release(csk->dst); + csk->cdev->csk_release_offload_resources(csk); + cxgbi_sock_set_state(csk, CTP_CLOSED); + cxgbi_inform_iscsi_conn_closing(csk); + cxgbi_sock_put(csk); +} +EXPORT_SYMBOL_GPL(cxgbi_sock_closed); + +static void need_active_close(struct cxgbi_sock *csk) +{ + int data_lost; + int close_req = 0; + + log_debug(1 << CXGBI_DBG_SOCK, "csk 0x%p,%u,0x%lx,%u.\n", + csk, (csk)->state, (csk)->flags, (csk)->tid); + spin_lock_bh(&csk->lock); + dst_confirm(csk->dst); + data_lost = skb_queue_len(&csk->receive_queue); + __skb_queue_purge(&csk->receive_queue); + + if (csk->state == CTP_ACTIVE_OPEN) + cxgbi_sock_set_flag(csk, CTPF_ACTIVE_CLOSE_NEEDED); + else if (csk->state == CTP_ESTABLISHED) { + close_req = 1; + cxgbi_sock_set_state(csk, CTP_ACTIVE_CLOSE); + } else if (csk->state == CTP_PASSIVE_CLOSE) { + close_req = 1; + cxgbi_sock_set_state(csk, CTP_CLOSE_WAIT_2); + } + + if (close_req) { + if (data_lost) + csk->cdev->csk_send_abort_req(csk); + else + csk->cdev->csk_send_close_req(csk); + } + + spin_unlock_bh(&csk->lock); +} + +void cxgbi_sock_fail_act_open(struct cxgbi_sock *csk, int errno) +{ + pr_info("csk 0x%p,%u,%lx, %pI4:%u-%pI4:%u, err %d.\n", + csk, csk->state, csk->flags, + &csk->saddr.sin_addr.s_addr, csk->saddr.sin_port, + &csk->daddr.sin_addr.s_addr, csk->daddr.sin_port, + errno); + + cxgbi_sock_set_state(csk, CTP_CONNECTING); + csk->err = errno; + cxgbi_sock_closed(csk); +} +EXPORT_SYMBOL_GPL(cxgbi_sock_fail_act_open); + +void cxgbi_sock_act_open_req_arp_failure(void *handle, struct sk_buff *skb) +{ + struct cxgbi_sock *csk = (struct cxgbi_sock *)skb->sk; + + log_debug(1 << CXGBI_DBG_SOCK, "csk 0x%p,%u,0x%lx,%u.\n", + csk, (csk)->state, (csk)->flags, (csk)->tid); + cxgbi_sock_get(csk); + spin_lock_bh(&csk->lock); + if (csk->state == CTP_ACTIVE_OPEN) + cxgbi_sock_fail_act_open(csk, -EHOSTUNREACH); + spin_unlock_bh(&csk->lock); + cxgbi_sock_put(csk); + __kfree_skb(skb); +} +EXPORT_SYMBOL_GPL(cxgbi_sock_act_open_req_arp_failure); + +void cxgbi_sock_rcv_abort_rpl(struct cxgbi_sock *csk) +{ + cxgbi_sock_get(csk); + spin_lock_bh(&csk->lock); + + cxgbi_sock_set_flag(csk, CTPF_ABORT_RPL_RCVD); + if (cxgbi_sock_flag(csk, CTPF_ABORT_RPL_PENDING)) { + cxgbi_sock_clear_flag(csk, CTPF_ABORT_RPL_PENDING); + if (cxgbi_sock_flag(csk, CTPF_ABORT_REQ_RCVD)) + pr_err("csk 0x%p,%u,0x%lx,%u,ABT_RPL_RSS.\n", + csk, csk->state, csk->flags, csk->tid); + cxgbi_sock_closed(csk); + } + + spin_unlock_bh(&csk->lock); + cxgbi_sock_put(csk); +} +EXPORT_SYMBOL_GPL(cxgbi_sock_rcv_abort_rpl); + +void cxgbi_sock_rcv_peer_close(struct cxgbi_sock *csk) +{ + log_debug(1 << CXGBI_DBG_SOCK, "csk 0x%p,%u,0x%lx,%u.\n", + csk, (csk)->state, (csk)->flags, (csk)->tid); + cxgbi_sock_get(csk); + spin_lock_bh(&csk->lock); + + if (cxgbi_sock_flag(csk, CTPF_ABORT_RPL_PENDING)) + goto done; + + switch (csk->state) { + case CTP_ESTABLISHED: + cxgbi_sock_set_state(csk, CTP_PASSIVE_CLOSE); + break; + case CTP_ACTIVE_CLOSE: + cxgbi_sock_set_state(csk, CTP_CLOSE_WAIT_2); + break; + case CTP_CLOSE_WAIT_1: + cxgbi_sock_closed(csk); + break; + case CTP_ABORTING: + break; + default: + pr_err("csk 0x%p,%u,0x%lx,%u, bad state.\n", + csk, csk->state, csk->flags, csk->tid); + } + cxgbi_inform_iscsi_conn_closing(csk); +done: + spin_unlock_bh(&csk->lock); + cxgbi_sock_put(csk); +} +EXPORT_SYMBOL_GPL(cxgbi_sock_rcv_peer_close); + +void cxgbi_sock_rcv_close_conn_rpl(struct cxgbi_sock *csk, u32 snd_nxt) +{ + log_debug(1 << CXGBI_DBG_SOCK, "csk 0x%p,%u,0x%lx,%u.\n", + csk, (csk)->state, (csk)->flags, (csk)->tid); + cxgbi_sock_get(csk); + spin_lock_bh(&csk->lock); + + csk->snd_una = snd_nxt - 1; + if (cxgbi_sock_flag(csk, CTPF_ABORT_RPL_PENDING)) + goto done; + + switch (csk->state) { + case CTP_ACTIVE_CLOSE: + cxgbi_sock_set_state(csk, CTP_CLOSE_WAIT_1); + break; + case CTP_CLOSE_WAIT_1: + case CTP_CLOSE_WAIT_2: + cxgbi_sock_closed(csk); + break; + case CTP_ABORTING: + break; + default: + pr_err("csk 0x%p,%u,0x%lx,%u, bad state.\n", + csk, csk->state, csk->flags, csk->tid); + } +done: + spin_unlock_bh(&csk->lock); + cxgbi_sock_put(csk); +} +EXPORT_SYMBOL_GPL(cxgbi_sock_rcv_close_conn_rpl); + +void cxgbi_sock_rcv_wr_ack(struct cxgbi_sock *csk, unsigned int credits, + unsigned int snd_una, int seq_chk) +{ + log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_SOCK, + "csk 0x%p,%u,0x%lx,%u, cr %u,%u+%u, snd_una %u,%d.\n", + csk, csk->state, csk->flags, csk->tid, credits, + csk->wr_cred, csk->wr_una_cred, snd_una, seq_chk); + + spin_lock_bh(&csk->lock); + + csk->wr_cred += credits; + if (csk->wr_una_cred > csk->wr_max_cred - csk->wr_cred) + csk->wr_una_cred = csk->wr_max_cred - csk->wr_cred; + + while (credits) { + struct sk_buff *p = cxgbi_sock_peek_wr(csk); + + if (unlikely(!p)) { + pr_err("csk 0x%p,%u,0x%lx,%u, cr %u,%u+%u, empty.\n", + csk, csk->state, csk->flags, csk->tid, credits, + csk->wr_cred, csk->wr_una_cred); + break; + } + + if (unlikely(credits < p->csum)) { + pr_warn("csk 0x%p,%u,0x%lx,%u, cr %u,%u+%u, < %u.\n", + csk, csk->state, csk->flags, csk->tid, + credits, csk->wr_cred, csk->wr_una_cred, + p->csum); + p->csum -= credits; + break; + } else { + cxgbi_sock_dequeue_wr(csk); + credits -= p->csum; + kfree_skb(p); + } + } + + cxgbi_sock_check_wr_invariants(csk); + + if (seq_chk) { + if (unlikely(before(snd_una, csk->snd_una))) { + pr_warn("csk 0x%p,%u,0x%lx,%u, snd_una %u/%u.", + csk, csk->state, csk->flags, csk->tid, snd_una, + csk->snd_una); + goto done; + } + + if (csk->snd_una != snd_una) { + csk->snd_una = snd_una; + dst_confirm(csk->dst); + } + } + + if (skb_queue_len(&csk->write_queue)) { + if (csk->cdev->csk_push_tx_frames(csk, 0)) + cxgbi_conn_tx_open(csk); + } else + cxgbi_conn_tx_open(csk); +done: + spin_unlock_bh(&csk->lock); +} +EXPORT_SYMBOL_GPL(cxgbi_sock_rcv_wr_ack); + +static unsigned int cxgbi_sock_find_best_mtu(struct cxgbi_sock *csk, + unsigned short mtu) +{ + int i = 0; + + while (i < csk->cdev->nmtus - 1 && csk->cdev->mtus[i + 1] <= mtu) + ++i; + + return i; +} + +unsigned int cxgbi_sock_select_mss(struct cxgbi_sock *csk, unsigned int pmtu) +{ + unsigned int idx; + struct dst_entry *dst = csk->dst; + + csk->advmss = dst_metric_advmss(dst); + + if (csk->advmss > pmtu - 40) + csk->advmss = pmtu - 40; + if (csk->advmss < csk->cdev->mtus[0] - 40) + csk->advmss = csk->cdev->mtus[0] - 40; + idx = cxgbi_sock_find_best_mtu(csk, csk->advmss + 40); + + return idx; +} +EXPORT_SYMBOL_GPL(cxgbi_sock_select_mss); + +void cxgbi_sock_skb_entail(struct cxgbi_sock *csk, struct sk_buff *skb) +{ + cxgbi_skcb_tcp_seq(skb) = csk->write_seq; + __skb_queue_tail(&csk->write_queue, skb); +} +EXPORT_SYMBOL_GPL(cxgbi_sock_skb_entail); + +void cxgbi_sock_purge_wr_queue(struct cxgbi_sock *csk) +{ + struct sk_buff *skb; + + while ((skb = cxgbi_sock_dequeue_wr(csk)) != NULL) + kfree_skb(skb); +} +EXPORT_SYMBOL_GPL(cxgbi_sock_purge_wr_queue); + +void cxgbi_sock_check_wr_invariants(const struct cxgbi_sock *csk) +{ + int pending = cxgbi_sock_count_pending_wrs(csk); + + if (unlikely(csk->wr_cred + pending != csk->wr_max_cred)) + pr_err("csk 0x%p, tid %u, credit %u + %u != %u.\n", + csk, csk->tid, csk->wr_cred, pending, csk->wr_max_cred); +} +EXPORT_SYMBOL_GPL(cxgbi_sock_check_wr_invariants); + +static int cxgbi_sock_send_pdus(struct cxgbi_sock *csk, struct sk_buff *skb) +{ + struct cxgbi_device *cdev = csk->cdev; + struct sk_buff *next; + int err, copied = 0; + + spin_lock_bh(&csk->lock); + + if (csk->state != CTP_ESTABLISHED) { + log_debug(1 << CXGBI_DBG_PDU_TX, + "csk 0x%p,%u,0x%lx,%u, EAGAIN.\n", + csk, csk->state, csk->flags, csk->tid); + err = -EAGAIN; + goto out_err; + } + + if (csk->err) { + log_debug(1 << CXGBI_DBG_PDU_TX, + "csk 0x%p,%u,0x%lx,%u, EPIPE %d.\n", + csk, csk->state, csk->flags, csk->tid, csk->err); + err = -EPIPE; + goto out_err; + } + + if (csk->write_seq - csk->snd_una >= cdev->snd_win) { + log_debug(1 << CXGBI_DBG_PDU_TX, + "csk 0x%p,%u,0x%lx,%u, FULL %u-%u >= %u.\n", + csk, csk->state, csk->flags, csk->tid, csk->write_seq, + csk->snd_una, cdev->snd_win); + err = -ENOBUFS; + goto out_err; + } + + while (skb) { + int frags = skb_shinfo(skb)->nr_frags + + (skb->len != skb->data_len); + + if (unlikely(skb_headroom(skb) < cdev->skb_tx_rsvd)) { + pr_err("csk 0x%p, skb head %u < %u.\n", + csk, skb_headroom(skb), cdev->skb_tx_rsvd); + err = -EINVAL; + goto out_err; + } + + if (frags >= SKB_WR_LIST_SIZE) { + pr_err("csk 0x%p, frags %d, %u,%u >%u.\n", + csk, skb_shinfo(skb)->nr_frags, skb->len, + skb->data_len, (uint)(SKB_WR_LIST_SIZE)); + err = -EINVAL; + goto out_err; + } + + next = skb->next; + skb->next = NULL; + cxgbi_skcb_set_flag(skb, SKCBF_TX_NEED_HDR); + cxgbi_sock_skb_entail(csk, skb); + copied += skb->len; + csk->write_seq += skb->len + + cxgbi_ulp_extra_len(cxgbi_skcb_ulp_mode(skb)); + skb = next; + } +done: + if (likely(skb_queue_len(&csk->write_queue))) + cdev->csk_push_tx_frames(csk, 1); + spin_unlock_bh(&csk->lock); + return copied; + +out_err: + if (copied == 0 && err == -EPIPE) + copied = csk->err ? csk->err : -EPIPE; + else + copied = err; + goto done; +} + +/* + * Direct Data Placement - + * Directly place the iSCSI Data-In or Data-Out PDU's payload into pre-posted + * final destination host-memory buffers based on the Initiator Task Tag (ITT) + * in Data-In or Target Task Tag (TTT) in Data-Out PDUs. + * The host memory address is programmed into h/w in the format of pagepod + * entries. + * The location of the pagepod entry is encoded into ddp tag which is used as + * the base for ITT/TTT. + */ + +static unsigned char ddp_page_order[DDP_PGIDX_MAX] = {0, 1, 2, 4}; +static unsigned char ddp_page_shift[DDP_PGIDX_MAX] = {12, 13, 14, 16}; +static unsigned char page_idx = DDP_PGIDX_MAX; + +static unsigned char sw_tag_idx_bits; +static unsigned char sw_tag_age_bits; + +/* + * Direct-Data Placement page size adjustment + */ +static int ddp_adjust_page_table(void) +{ + int i; + unsigned int base_order, order; + + if (PAGE_SIZE < (1UL << ddp_page_shift[0])) { + pr_info("PAGE_SIZE 0x%lx too small, min 0x%lx\n", + PAGE_SIZE, 1UL << ddp_page_shift[0]); + return -EINVAL; + } + + base_order = get_order(1UL << ddp_page_shift[0]); + order = get_order(1UL << PAGE_SHIFT); + + for (i = 0; i < DDP_PGIDX_MAX; i++) { + /* first is the kernel page size, then just doubling */ + ddp_page_order[i] = order - base_order + i; + ddp_page_shift[i] = PAGE_SHIFT + i; + } + return 0; +} + +static int ddp_find_page_index(unsigned long pgsz) +{ + int i; + + for (i = 0; i < DDP_PGIDX_MAX; i++) { + if (pgsz == (1UL << ddp_page_shift[i])) + return i; + } + pr_info("ddp page size %lu not supported.\n", pgsz); + return DDP_PGIDX_MAX; +} + +static void ddp_setup_host_page_size(void) +{ + if (page_idx == DDP_PGIDX_MAX) { + page_idx = ddp_find_page_index(PAGE_SIZE); + + if (page_idx == DDP_PGIDX_MAX) { + pr_info("system PAGE %lu, update hw.\n", PAGE_SIZE); + if (ddp_adjust_page_table() < 0) { + pr_info("PAGE %lu, disable ddp.\n", PAGE_SIZE); + return; + } + page_idx = ddp_find_page_index(PAGE_SIZE); + } + pr_info("system PAGE %lu, ddp idx %u.\n", PAGE_SIZE, page_idx); + } +} + +void cxgbi_ddp_page_size_factor(int *pgsz_factor) +{ + int i; + + for (i = 0; i < DDP_PGIDX_MAX; i++) + pgsz_factor[i] = ddp_page_order[i]; +} +EXPORT_SYMBOL_GPL(cxgbi_ddp_page_size_factor); + +/* + * DDP setup & teardown + */ + +void cxgbi_ddp_ppod_set(struct cxgbi_pagepod *ppod, + struct cxgbi_pagepod_hdr *hdr, + struct cxgbi_gather_list *gl, unsigned int gidx) +{ + int i; + + memcpy(ppod, hdr, sizeof(*hdr)); + for (i = 0; i < (PPOD_PAGES_MAX + 1); i++, gidx++) { + ppod->addr[i] = gidx < gl->nelem ? + cpu_to_be64(gl->phys_addr[gidx]) : 0ULL; + } +} +EXPORT_SYMBOL_GPL(cxgbi_ddp_ppod_set); + +void cxgbi_ddp_ppod_clear(struct cxgbi_pagepod *ppod) +{ + memset(ppod, 0, sizeof(*ppod)); +} +EXPORT_SYMBOL_GPL(cxgbi_ddp_ppod_clear); + +static inline int ddp_find_unused_entries(struct cxgbi_ddp_info *ddp, + unsigned int start, unsigned int max, + unsigned int count, + struct cxgbi_gather_list *gl) +{ + unsigned int i, j, k; + + /* not enough entries */ + if ((max - start) < count) { + log_debug(1 << CXGBI_DBG_DDP, + "NOT enough entries %u+%u < %u.\n", start, count, max); + return -EBUSY; + } + + max -= count; + spin_lock(&ddp->map_lock); + for (i = start; i < max;) { + for (j = 0, k = i; j < count; j++, k++) { + if (ddp->gl_map[k]) + break; + } + if (j == count) { + for (j = 0, k = i; j < count; j++, k++) + ddp->gl_map[k] = gl; + spin_unlock(&ddp->map_lock); + return i; + } + i += j + 1; + } + spin_unlock(&ddp->map_lock); + log_debug(1 << CXGBI_DBG_DDP, + "NO suitable entries %u available.\n", count); + return -EBUSY; +} + +static inline void ddp_unmark_entries(struct cxgbi_ddp_info *ddp, + int start, int count) +{ + spin_lock(&ddp->map_lock); + memset(&ddp->gl_map[start], 0, + count * sizeof(struct cxgbi_gather_list *)); + spin_unlock(&ddp->map_lock); +} + +static inline void ddp_gl_unmap(struct pci_dev *pdev, + struct cxgbi_gather_list *gl) +{ + int i; + + for (i = 0; i < gl->nelem; i++) + dma_unmap_page(&pdev->dev, gl->phys_addr[i], PAGE_SIZE, + PCI_DMA_FROMDEVICE); +} + +static inline int ddp_gl_map(struct pci_dev *pdev, + struct cxgbi_gather_list *gl) +{ + int i; + + for (i = 0; i < gl->nelem; i++) { + gl->phys_addr[i] = dma_map_page(&pdev->dev, gl->pages[i], 0, + PAGE_SIZE, + PCI_DMA_FROMDEVICE); + if (unlikely(dma_mapping_error(&pdev->dev, gl->phys_addr[i]))) { + log_debug(1 << CXGBI_DBG_DDP, + "page %d 0x%p, 0x%p dma mapping err.\n", + i, gl->pages[i], pdev); + goto unmap; + } + } + return i; +unmap: + if (i) { + unsigned int nelem = gl->nelem; + + gl->nelem = i; + ddp_gl_unmap(pdev, gl); + gl->nelem = nelem; + } + return -EINVAL; +} + +static void ddp_release_gl(struct cxgbi_gather_list *gl, + struct pci_dev *pdev) +{ + ddp_gl_unmap(pdev, gl); + kfree(gl); +} + +static struct cxgbi_gather_list *ddp_make_gl(unsigned int xferlen, + struct scatterlist *sgl, + unsigned int sgcnt, + struct pci_dev *pdev, + gfp_t gfp) +{ + struct cxgbi_gather_list *gl; + struct scatterlist *sg = sgl; + struct page *sgpage = sg_page(sg); + unsigned int sglen = sg->length; + unsigned int sgoffset = sg->offset; + unsigned int npages = (xferlen + sgoffset + PAGE_SIZE - 1) >> + PAGE_SHIFT; + int i = 1, j = 0; + + if (xferlen < DDP_THRESHOLD) { + log_debug(1 << CXGBI_DBG_DDP, + "xfer %u < threshold %u, no ddp.\n", + xferlen, DDP_THRESHOLD); + return NULL; + } + + gl = kzalloc(sizeof(struct cxgbi_gather_list) + + npages * (sizeof(dma_addr_t) + + sizeof(struct page *)), gfp); + if (!gl) { + log_debug(1 << CXGBI_DBG_DDP, + "xfer %u, %u pages, OOM.\n", xferlen, npages); + return NULL; + } + + log_debug(1 << CXGBI_DBG_DDP, + "xfer %u, sgl %u, gl max %u.\n", xferlen, sgcnt, npages); + + gl->pages = (struct page **)&gl->phys_addr[npages]; + gl->nelem = npages; + gl->length = xferlen; + gl->offset = sgoffset; + gl->pages[0] = sgpage; + + for (i = 1, sg = sg_next(sgl), j = 0; i < sgcnt; + i++, sg = sg_next(sg)) { + struct page *page = sg_page(sg); + + if (sgpage == page && sg->offset == sgoffset + sglen) + sglen += sg->length; + else { + /* make sure the sgl is fit for ddp: + * each has the same page size, and + * all of the middle pages are used completely + */ + if ((j && sgoffset) || ((i != sgcnt - 1) && + ((sglen + sgoffset) & ~PAGE_MASK))) { + log_debug(1 << CXGBI_DBG_DDP, + "page %d/%u, %u + %u.\n", + i, sgcnt, sgoffset, sglen); + goto error_out; + } + + j++; + if (j == gl->nelem || sg->offset) { + log_debug(1 << CXGBI_DBG_DDP, + "page %d/%u, offset %u.\n", + j, gl->nelem, sg->offset); + goto error_out; + } + gl->pages[j] = page; + sglen = sg->length; + sgoffset = sg->offset; + sgpage = page; + } + } + gl->nelem = ++j; + + if (ddp_gl_map(pdev, gl) < 0) + goto error_out; + + return gl; + +error_out: + kfree(gl); + return NULL; +} + +static void ddp_tag_release(struct cxgbi_hba *chba, u32 tag) +{ + struct cxgbi_device *cdev = chba->cdev; + struct cxgbi_ddp_info *ddp = cdev->ddp; + u32 idx; + + idx = (tag >> PPOD_IDX_SHIFT) & ddp->idx_mask; + if (idx < ddp->nppods) { + struct cxgbi_gather_list *gl = ddp->gl_map[idx]; + unsigned int npods; + + if (!gl || !gl->nelem) { + pr_warn("tag 0x%x, idx %u, gl 0x%p, %u.\n", + tag, idx, gl, gl ? gl->nelem : 0); + return; + } + npods = (gl->nelem + PPOD_PAGES_MAX - 1) >> PPOD_PAGES_SHIFT; + log_debug(1 << CXGBI_DBG_DDP, + "tag 0x%x, release idx %u, npods %u.\n", + tag, idx, npods); + cdev->csk_ddp_clear(chba, tag, idx, npods); + ddp_unmark_entries(ddp, idx, npods); + ddp_release_gl(gl, ddp->pdev); + } else + pr_warn("tag 0x%x, idx %u > max %u.\n", tag, idx, ddp->nppods); +} + +static int ddp_tag_reserve(struct cxgbi_sock *csk, unsigned int tid, + u32 sw_tag, u32 *tagp, struct cxgbi_gather_list *gl, + gfp_t gfp) +{ + struct cxgbi_device *cdev = csk->cdev; + struct cxgbi_ddp_info *ddp = cdev->ddp; + struct cxgbi_tag_format *tformat = &cdev->tag_format; + struct cxgbi_pagepod_hdr hdr; + unsigned int npods; + int idx = -1; + int err = -ENOMEM; + u32 tag; + + npods = (gl->nelem + PPOD_PAGES_MAX - 1) >> PPOD_PAGES_SHIFT; + if (ddp->idx_last == ddp->nppods) + idx = ddp_find_unused_entries(ddp, 0, ddp->nppods, + npods, gl); + else { + idx = ddp_find_unused_entries(ddp, ddp->idx_last + 1, + ddp->nppods, npods, + gl); + if (idx < 0 && ddp->idx_last >= npods) { + idx = ddp_find_unused_entries(ddp, 0, + min(ddp->idx_last + npods, ddp->nppods), + npods, gl); + } + } + if (idx < 0) { + log_debug(1 << CXGBI_DBG_DDP, + "xferlen %u, gl %u, npods %u NO DDP.\n", + gl->length, gl->nelem, npods); + return idx; + } + + tag = cxgbi_ddp_tag_base(tformat, sw_tag); + tag |= idx << PPOD_IDX_SHIFT; + + hdr.rsvd = 0; + hdr.vld_tid = htonl(PPOD_VALID_FLAG | PPOD_TID(tid)); + hdr.pgsz_tag_clr = htonl(tag & ddp->rsvd_tag_mask); + hdr.max_offset = htonl(gl->length); + hdr.page_offset = htonl(gl->offset); + + err = cdev->csk_ddp_set(csk, &hdr, idx, npods, gl); + if (err < 0) + goto unmark_entries; + + ddp->idx_last = idx; + log_debug(1 << CXGBI_DBG_DDP, + "xfer %u, gl %u,%u, tid 0x%x, tag 0x%x->0x%x(%u,%u).\n", + gl->length, gl->nelem, gl->offset, tid, sw_tag, tag, idx, + npods); + *tagp = tag; + return 0; + +unmark_entries: + ddp_unmark_entries(ddp, idx, npods); + return err; +} + +int cxgbi_ddp_reserve(struct cxgbi_sock *csk, unsigned int *tagp, + unsigned int sw_tag, unsigned int xferlen, + struct scatterlist *sgl, unsigned int sgcnt, gfp_t gfp) +{ + struct cxgbi_device *cdev = csk->cdev; + struct cxgbi_tag_format *tformat = &cdev->tag_format; + struct cxgbi_gather_list *gl; + int err; + + if (page_idx >= DDP_PGIDX_MAX || !cdev->ddp || + xferlen < DDP_THRESHOLD) { + log_debug(1 << CXGBI_DBG_DDP, + "pgidx %u, xfer %u, NO ddp.\n", page_idx, xferlen); + return -EINVAL; + } + + if (!cxgbi_sw_tag_usable(tformat, sw_tag)) { + log_debug(1 << CXGBI_DBG_DDP, + "sw_tag 0x%x NOT usable.\n", sw_tag); + return -EINVAL; + } + + gl = ddp_make_gl(xferlen, sgl, sgcnt, cdev->pdev, gfp); + if (!gl) + return -ENOMEM; + + err = ddp_tag_reserve(csk, csk->tid, sw_tag, tagp, gl, gfp); + if (err < 0) + ddp_release_gl(gl, cdev->pdev); + + return err; +} + +static void ddp_destroy(struct kref *kref) +{ + struct cxgbi_ddp_info *ddp = container_of(kref, + struct cxgbi_ddp_info, + refcnt); + struct cxgbi_device *cdev = ddp->cdev; + int i = 0; + + pr_info("kref 0, destroy ddp 0x%p, cdev 0x%p.\n", ddp, cdev); + + while (i < ddp->nppods) { + struct cxgbi_gather_list *gl = ddp->gl_map[i]; + + if (gl) { + int npods = (gl->nelem + PPOD_PAGES_MAX - 1) + >> PPOD_PAGES_SHIFT; + pr_info("cdev 0x%p, ddp %d + %d.\n", cdev, i, npods); + kfree(gl); + i += npods; + } else + i++; + } + cxgbi_free_big_mem(ddp); +} + +int cxgbi_ddp_cleanup(struct cxgbi_device *cdev) +{ + struct cxgbi_ddp_info *ddp = cdev->ddp; + + log_debug(1 << CXGBI_DBG_DDP, + "cdev 0x%p, release ddp 0x%p.\n", cdev, ddp); + cdev->ddp = NULL; + if (ddp) + return kref_put(&ddp->refcnt, ddp_destroy); + return 0; +} +EXPORT_SYMBOL_GPL(cxgbi_ddp_cleanup); + +int cxgbi_ddp_init(struct cxgbi_device *cdev, + unsigned int llimit, unsigned int ulimit, + unsigned int max_txsz, unsigned int max_rxsz) +{ + struct cxgbi_ddp_info *ddp; + unsigned int ppmax, bits; + + ppmax = (ulimit - llimit + 1) >> PPOD_SIZE_SHIFT; + bits = __ilog2_u32(ppmax) + 1; + if (bits > PPOD_IDX_MAX_SIZE) + bits = PPOD_IDX_MAX_SIZE; + ppmax = (1 << (bits - 1)) - 1; + + ddp = cxgbi_alloc_big_mem(sizeof(struct cxgbi_ddp_info) + + ppmax * (sizeof(struct cxgbi_gather_list *) + + sizeof(struct sk_buff *)), + GFP_KERNEL); + if (!ddp) { + pr_warn("cdev 0x%p, ddp ppmax %u OOM.\n", cdev, ppmax); + return -ENOMEM; + } + ddp->gl_map = (struct cxgbi_gather_list **)(ddp + 1); + cdev->ddp = ddp; + + spin_lock_init(&ddp->map_lock); + kref_init(&ddp->refcnt); + + ddp->cdev = cdev; + ddp->pdev = cdev->pdev; + ddp->llimit = llimit; + ddp->ulimit = ulimit; + ddp->max_txsz = min_t(unsigned int, max_txsz, ULP2_MAX_PKT_SIZE); + ddp->max_rxsz = min_t(unsigned int, max_rxsz, ULP2_MAX_PKT_SIZE); + ddp->nppods = ppmax; + ddp->idx_last = ppmax; + ddp->idx_bits = bits; + ddp->idx_mask = (1 << bits) - 1; + ddp->rsvd_tag_mask = (1 << (bits + PPOD_IDX_SHIFT)) - 1; + + cdev->tag_format.sw_bits = sw_tag_idx_bits + sw_tag_age_bits; + cdev->tag_format.rsvd_bits = ddp->idx_bits; + cdev->tag_format.rsvd_shift = PPOD_IDX_SHIFT; + cdev->tag_format.rsvd_mask = (1 << cdev->tag_format.rsvd_bits) - 1; + + pr_info("%s tag format, sw %u, rsvd %u,%u, mask 0x%x.\n", + cdev->ports[0]->name, cdev->tag_format.sw_bits, + cdev->tag_format.rsvd_bits, cdev->tag_format.rsvd_shift, + cdev->tag_format.rsvd_mask); + + cdev->tx_max_size = min_t(unsigned int, ULP2_MAX_PDU_PAYLOAD, + ddp->max_txsz - ISCSI_PDU_NONPAYLOAD_LEN); + cdev->rx_max_size = min_t(unsigned int, ULP2_MAX_PDU_PAYLOAD, + ddp->max_rxsz - ISCSI_PDU_NONPAYLOAD_LEN); + + log_debug(1 << CXGBI_DBG_DDP, + "%s max payload size: %u/%u, %u/%u.\n", + cdev->ports[0]->name, cdev->tx_max_size, ddp->max_txsz, + cdev->rx_max_size, ddp->max_rxsz); + return 0; +} +EXPORT_SYMBOL_GPL(cxgbi_ddp_init); + +/* + * APIs interacting with open-iscsi libraries + */ + +static unsigned char padding[4]; + +static void task_release_itt(struct iscsi_task *task, itt_t hdr_itt) +{ + struct scsi_cmnd *sc = task->sc; + struct iscsi_tcp_conn *tcp_conn = task->conn->dd_data; + struct cxgbi_conn *cconn = tcp_conn->dd_data; + struct cxgbi_hba *chba = cconn->chba; + struct cxgbi_tag_format *tformat = &chba->cdev->tag_format; + u32 tag = ntohl((__force u32)hdr_itt); + + log_debug(1 << CXGBI_DBG_DDP, + "cdev 0x%p, release tag 0x%x.\n", chba->cdev, tag); + if (sc && + (scsi_bidi_cmnd(sc) || sc->sc_data_direction == DMA_FROM_DEVICE) && + cxgbi_is_ddp_tag(tformat, tag)) + ddp_tag_release(chba, tag); +} + +static int task_reserve_itt(struct iscsi_task *task, itt_t *hdr_itt) +{ + struct scsi_cmnd *sc = task->sc; + struct iscsi_conn *conn = task->conn; + struct iscsi_session *sess = conn->session; + struct iscsi_tcp_conn *tcp_conn = conn->dd_data; + struct cxgbi_conn *cconn = tcp_conn->dd_data; + struct cxgbi_hba *chba = cconn->chba; + struct cxgbi_tag_format *tformat = &chba->cdev->tag_format; + u32 sw_tag = (sess->age << cconn->task_idx_bits) | task->itt; + u32 tag = 0; + int err = -EINVAL; + + if (sc && + (scsi_bidi_cmnd(sc) || sc->sc_data_direction == DMA_FROM_DEVICE)) { + err = cxgbi_ddp_reserve(cconn->cep->csk, &tag, sw_tag, + scsi_in(sc)->length, + scsi_in(sc)->table.sgl, + scsi_in(sc)->table.nents, + GFP_ATOMIC); + if (err < 0) + log_debug(1 << CXGBI_DBG_DDP, + "csk 0x%p, R task 0x%p, %u,%u, no ddp.\n", + cconn->cep->csk, task, scsi_in(sc)->length, + scsi_in(sc)->table.nents); + } + + if (err < 0) + tag = cxgbi_set_non_ddp_tag(tformat, sw_tag); + /* the itt need to sent in big-endian order */ + *hdr_itt = (__force itt_t)htonl(tag); + + log_debug(1 << CXGBI_DBG_DDP, + "cdev 0x%p, task 0x%p, 0x%x(0x%x,0x%x)->0x%x/0x%x.\n", + chba->cdev, task, sw_tag, task->itt, sess->age, tag, *hdr_itt); + return 0; +} + +void cxgbi_parse_pdu_itt(struct iscsi_conn *conn, itt_t itt, int *idx, int *age) +{ + struct iscsi_tcp_conn *tcp_conn = conn->dd_data; + struct cxgbi_conn *cconn = tcp_conn->dd_data; + struct cxgbi_device *cdev = cconn->chba->cdev; + u32 tag = ntohl((__force u32) itt); + u32 sw_bits; + + sw_bits = cxgbi_tag_nonrsvd_bits(&cdev->tag_format, tag); + if (idx) + *idx = sw_bits & ((1 << cconn->task_idx_bits) - 1); + if (age) + *age = (sw_bits >> cconn->task_idx_bits) & ISCSI_AGE_MASK; + + log_debug(1 << CXGBI_DBG_DDP, + "cdev 0x%p, tag 0x%x/0x%x, -> 0x%x(0x%x,0x%x).\n", + cdev, tag, itt, sw_bits, idx ? *idx : 0xFFFFF, + age ? *age : 0xFF); +} +EXPORT_SYMBOL_GPL(cxgbi_parse_pdu_itt); + +void cxgbi_conn_tx_open(struct cxgbi_sock *csk) +{ + struct iscsi_conn *conn = csk->user_data; + + if (conn) { + log_debug(1 << CXGBI_DBG_SOCK, + "csk 0x%p, cid %d.\n", csk, conn->id); + iscsi_conn_queue_work(conn); + } +} +EXPORT_SYMBOL_GPL(cxgbi_conn_tx_open); + +/* + * pdu receive, interact with libiscsi_tcp + */ +static inline int read_pdu_skb(struct iscsi_conn *conn, + struct sk_buff *skb, + unsigned int offset, + int offloaded) +{ + int status = 0; + int bytes_read; + + bytes_read = iscsi_tcp_recv_skb(conn, skb, offset, offloaded, &status); + switch (status) { + case ISCSI_TCP_CONN_ERR: + pr_info("skb 0x%p, off %u, %d, TCP_ERR.\n", + skb, offset, offloaded); + return -EIO; + case ISCSI_TCP_SUSPENDED: + log_debug(1 << CXGBI_DBG_PDU_RX, + "skb 0x%p, off %u, %d, TCP_SUSPEND, rc %d.\n", + skb, offset, offloaded, bytes_read); + /* no transfer - just have caller flush queue */ + return bytes_read; + case ISCSI_TCP_SKB_DONE: + pr_info("skb 0x%p, off %u, %d, TCP_SKB_DONE.\n", + skb, offset, offloaded); + /* + * pdus should always fit in the skb and we should get + * segment done notifcation. + */ + iscsi_conn_printk(KERN_ERR, conn, "Invalid pdu or skb."); + return -EFAULT; + case ISCSI_TCP_SEGMENT_DONE: + log_debug(1 << CXGBI_DBG_PDU_RX, + "skb 0x%p, off %u, %d, TCP_SEG_DONE, rc %d.\n", + skb, offset, offloaded, bytes_read); + return bytes_read; + default: + pr_info("skb 0x%p, off %u, %d, invalid status %d.\n", + skb, offset, offloaded, status); + return -EINVAL; + } +} + +static int skb_read_pdu_bhs(struct iscsi_conn *conn, struct sk_buff *skb) +{ + struct iscsi_tcp_conn *tcp_conn = conn->dd_data; + + log_debug(1 << CXGBI_DBG_PDU_RX, + "conn 0x%p, skb 0x%p, len %u, flag 0x%lx.\n", + conn, skb, skb->len, cxgbi_skcb_flags(skb)); + + if (!iscsi_tcp_recv_segment_is_hdr(tcp_conn)) { + pr_info("conn 0x%p, skb 0x%p, not hdr.\n", conn, skb); + iscsi_conn_failure(conn, ISCSI_ERR_PROTO); + return -EIO; + } + + if (conn->hdrdgst_en && + cxgbi_skcb_test_flag(skb, SKCBF_RX_HCRC_ERR)) { + pr_info("conn 0x%p, skb 0x%p, hcrc.\n", conn, skb); + iscsi_conn_failure(conn, ISCSI_ERR_HDR_DGST); + return -EIO; + } + + return read_pdu_skb(conn, skb, 0, 0); +} + +static int skb_read_pdu_data(struct iscsi_conn *conn, struct sk_buff *lskb, + struct sk_buff *skb, unsigned int offset) +{ + struct iscsi_tcp_conn *tcp_conn = conn->dd_data; + bool offloaded = 0; + int opcode = tcp_conn->in.hdr->opcode & ISCSI_OPCODE_MASK; + + log_debug(1 << CXGBI_DBG_PDU_RX, + "conn 0x%p, skb 0x%p, len %u, flag 0x%lx.\n", + conn, skb, skb->len, cxgbi_skcb_flags(skb)); + + if (conn->datadgst_en && + cxgbi_skcb_test_flag(lskb, SKCBF_RX_DCRC_ERR)) { + pr_info("conn 0x%p, skb 0x%p, dcrc 0x%lx.\n", + conn, lskb, cxgbi_skcb_flags(lskb)); + iscsi_conn_failure(conn, ISCSI_ERR_DATA_DGST); + return -EIO; + } + + if (iscsi_tcp_recv_segment_is_hdr(tcp_conn)) + return 0; + + /* coalesced, add header digest length */ + if (lskb == skb && conn->hdrdgst_en) + offset += ISCSI_DIGEST_SIZE; + + if (cxgbi_skcb_test_flag(lskb, SKCBF_RX_DATA_DDPD)) + offloaded = 1; + + if (opcode == ISCSI_OP_SCSI_DATA_IN) + log_debug(1 << CXGBI_DBG_PDU_RX, + "skb 0x%p, op 0x%x, itt 0x%x, %u %s ddp'ed.\n", + skb, opcode, ntohl(tcp_conn->in.hdr->itt), + tcp_conn->in.datalen, offloaded ? "is" : "not"); + + return read_pdu_skb(conn, skb, offset, offloaded); +} + +static void csk_return_rx_credits(struct cxgbi_sock *csk, int copied) +{ + struct cxgbi_device *cdev = csk->cdev; + int must_send; + u32 credits; + + log_debug(1 << CXGBI_DBG_PDU_RX, + "csk 0x%p,%u,0x%lx,%u, seq %u, wup %u, thre %u, %u.\n", + csk, csk->state, csk->flags, csk->tid, csk->copied_seq, + csk->rcv_wup, cdev->rx_credit_thres, + cdev->rcv_win); + + if (csk->state != CTP_ESTABLISHED) + return; + + credits = csk->copied_seq - csk->rcv_wup; + if (unlikely(!credits)) + return; + if (unlikely(cdev->rx_credit_thres == 0)) + return; + + must_send = credits + 16384 >= cdev->rcv_win; + if (must_send || credits >= cdev->rx_credit_thres) + csk->rcv_wup += cdev->csk_send_rx_credits(csk, credits); +} + +void cxgbi_conn_pdu_ready(struct cxgbi_sock *csk) +{ + struct cxgbi_device *cdev = csk->cdev; + struct iscsi_conn *conn = csk->user_data; + struct sk_buff *skb; + unsigned int read = 0; + int err = 0; + + log_debug(1 << CXGBI_DBG_PDU_RX, + "csk 0x%p, conn 0x%p.\n", csk, conn); + + if (unlikely(!conn || conn->suspend_rx)) { + log_debug(1 << CXGBI_DBG_PDU_RX, + "csk 0x%p, conn 0x%p, id %d, suspend_rx %lu!\n", + csk, conn, conn ? conn->id : 0xFF, + conn ? conn->suspend_rx : 0xFF); + return; + } + + while (!err) { + skb = skb_peek(&csk->receive_queue); + if (!skb || + !(cxgbi_skcb_test_flag(skb, SKCBF_RX_STATUS))) { + if (skb) + log_debug(1 << CXGBI_DBG_PDU_RX, + "skb 0x%p, NOT ready 0x%lx.\n", + skb, cxgbi_skcb_flags(skb)); + break; + } + __skb_unlink(skb, &csk->receive_queue); + + read += cxgbi_skcb_rx_pdulen(skb); + log_debug(1 << CXGBI_DBG_PDU_RX, + "csk 0x%p, skb 0x%p,%u,f 0x%lx, pdu len %u.\n", + csk, skb, skb->len, cxgbi_skcb_flags(skb), + cxgbi_skcb_rx_pdulen(skb)); + + if (cxgbi_skcb_test_flag(skb, SKCBF_RX_COALESCED)) { + err = skb_read_pdu_bhs(conn, skb); + if (err < 0) { + pr_err("coalesced bhs, csk 0x%p, skb 0x%p,%u, " + "f 0x%lx, plen %u.\n", + csk, skb, skb->len, + cxgbi_skcb_flags(skb), + cxgbi_skcb_rx_pdulen(skb)); + goto skb_done; + } + err = skb_read_pdu_data(conn, skb, skb, + err + cdev->skb_rx_extra); + if (err < 0) + pr_err("coalesced data, csk 0x%p, skb 0x%p,%u, " + "f 0x%lx, plen %u.\n", + csk, skb, skb->len, + cxgbi_skcb_flags(skb), + cxgbi_skcb_rx_pdulen(skb)); + } else { + err = skb_read_pdu_bhs(conn, skb); + if (err < 0) { + pr_err("bhs, csk 0x%p, skb 0x%p,%u, " + "f 0x%lx, plen %u.\n", + csk, skb, skb->len, + cxgbi_skcb_flags(skb), + cxgbi_skcb_rx_pdulen(skb)); + goto skb_done; + } + + if (cxgbi_skcb_test_flag(skb, SKCBF_RX_DATA)) { + struct sk_buff *dskb; + + dskb = skb_peek(&csk->receive_queue); + if (!dskb) { + pr_err("csk 0x%p, skb 0x%p,%u, f 0x%lx," + " plen %u, NO data.\n", + csk, skb, skb->len, + cxgbi_skcb_flags(skb), + cxgbi_skcb_rx_pdulen(skb)); + err = -EIO; + goto skb_done; + } + __skb_unlink(dskb, &csk->receive_queue); + + err = skb_read_pdu_data(conn, skb, dskb, 0); + if (err < 0) + pr_err("data, csk 0x%p, skb 0x%p,%u, " + "f 0x%lx, plen %u, dskb 0x%p," + "%u.\n", + csk, skb, skb->len, + cxgbi_skcb_flags(skb), + cxgbi_skcb_rx_pdulen(skb), + dskb, dskb->len); + __kfree_skb(dskb); + } else + err = skb_read_pdu_data(conn, skb, skb, 0); + } +skb_done: + __kfree_skb(skb); + + if (err < 0) + break; + } + + log_debug(1 << CXGBI_DBG_PDU_RX, "csk 0x%p, read %u.\n", csk, read); + if (read) { + csk->copied_seq += read; + csk_return_rx_credits(csk, read); + conn->rxdata_octets += read; + } + + if (err < 0) { + pr_info("csk 0x%p, 0x%p, rx failed %d, read %u.\n", + csk, conn, err, read); + iscsi_conn_failure(conn, ISCSI_ERR_CONN_FAILED); + } +} +EXPORT_SYMBOL_GPL(cxgbi_conn_pdu_ready); + +static int sgl_seek_offset(struct scatterlist *sgl, unsigned int sgcnt, + unsigned int offset, unsigned int *off, + struct scatterlist **sgp) +{ + int i; + struct scatterlist *sg; + + for_each_sg(sgl, sg, sgcnt, i) { + if (offset < sg->length) { + *off = offset; + *sgp = sg; + return 0; + } + offset -= sg->length; + } + return -EFAULT; +} + +static int sgl_read_to_frags(struct scatterlist *sg, unsigned int sgoffset, + unsigned int dlen, struct page_frag *frags, + int frag_max) +{ + unsigned int datalen = dlen; + unsigned int sglen = sg->length - sgoffset; + struct page *page = sg_page(sg); + int i; + + i = 0; + do { + unsigned int copy; + + if (!sglen) { + sg = sg_next(sg); + if (!sg) { + pr_warn("sg %d NULL, len %u/%u.\n", + i, datalen, dlen); + return -EINVAL; + } + sgoffset = 0; + sglen = sg->length; + page = sg_page(sg); + + } + copy = min(datalen, sglen); + if (i && page == frags[i - 1].page && + sgoffset + sg->offset == + frags[i - 1].offset + frags[i - 1].size) { + frags[i - 1].size += copy; + } else { + if (i >= frag_max) { + pr_warn("too many pages %u, dlen %u.\n", + frag_max, dlen); + return -EINVAL; + } + + frags[i].page = page; + frags[i].offset = sg->offset + sgoffset; + frags[i].size = copy; + i++; + } + datalen -= copy; + sgoffset += copy; + sglen -= copy; + } while (datalen); + + return i; +} + +int cxgbi_conn_alloc_pdu(struct iscsi_task *task, u8 opcode) +{ + struct iscsi_tcp_conn *tcp_conn = task->conn->dd_data; + struct cxgbi_conn *cconn = tcp_conn->dd_data; + struct cxgbi_device *cdev = cconn->chba->cdev; + struct iscsi_conn *conn = task->conn; + struct iscsi_tcp_task *tcp_task = task->dd_data; + struct cxgbi_task_data *tdata = iscsi_task_cxgbi_data(task); + struct scsi_cmnd *sc = task->sc; + int headroom = SKB_TX_ISCSI_PDU_HEADER_MAX; + + tcp_task->dd_data = tdata; + task->hdr = NULL; + + if (SKB_MAX_HEAD(cdev->skb_tx_rsvd) > (512 * MAX_SKB_FRAGS) && + (opcode == ISCSI_OP_SCSI_DATA_OUT || + (opcode == ISCSI_OP_SCSI_CMD && + (scsi_bidi_cmnd(sc) || sc->sc_data_direction == DMA_TO_DEVICE)))) + /* data could goes into skb head */ + headroom += min_t(unsigned int, + SKB_MAX_HEAD(cdev->skb_tx_rsvd), + conn->max_xmit_dlength); + + tdata->skb = alloc_skb(cdev->skb_tx_rsvd + headroom, GFP_ATOMIC); + if (!tdata->skb) { + struct cxgbi_sock *csk = cconn->cep->csk; + struct net_device *ndev = cdev->ports[csk->port_id]; + ndev->stats.tx_dropped++; + return -ENOMEM; + } + + skb_reserve(tdata->skb, cdev->skb_tx_rsvd); + task->hdr = (struct iscsi_hdr *)tdata->skb->data; + task->hdr_max = SKB_TX_ISCSI_PDU_HEADER_MAX; /* BHS + AHS */ + + /* data_out uses scsi_cmd's itt */ + if (opcode != ISCSI_OP_SCSI_DATA_OUT) + task_reserve_itt(task, &task->hdr->itt); + + log_debug(1 << CXGBI_DBG_ISCSI | 1 << CXGBI_DBG_PDU_TX, + "task 0x%p, op 0x%x, skb 0x%p,%u+%u/%u, itt 0x%x.\n", + task, opcode, tdata->skb, cdev->skb_tx_rsvd, headroom, + conn->max_xmit_dlength, ntohl(task->hdr->itt)); + + return 0; +} +EXPORT_SYMBOL_GPL(cxgbi_conn_alloc_pdu); + +static inline void tx_skb_setmode(struct sk_buff *skb, int hcrc, int dcrc) +{ + if (hcrc || dcrc) { + u8 submode = 0; + + if (hcrc) + submode |= 1; + if (dcrc) + submode |= 2; + cxgbi_skcb_ulp_mode(skb) = (ULP2_MODE_ISCSI << 4) | submode; + } else + cxgbi_skcb_ulp_mode(skb) = 0; +} + +int cxgbi_conn_init_pdu(struct iscsi_task *task, unsigned int offset, + unsigned int count) +{ + struct iscsi_conn *conn = task->conn; + struct cxgbi_task_data *tdata = iscsi_task_cxgbi_data(task); + struct sk_buff *skb = tdata->skb; + unsigned int datalen = count; + int i, padlen = iscsi_padding(count); + struct page *pg; + + log_debug(1 << CXGBI_DBG_ISCSI | 1 << CXGBI_DBG_PDU_TX, + "task 0x%p,0x%p, skb 0x%p, 0x%x,0x%x,0x%x, %u+%u.\n", + task, task->sc, skb, (*skb->data) & ISCSI_OPCODE_MASK, + ntohl(task->cmdsn), ntohl(task->hdr->itt), offset, count); + + skb_put(skb, task->hdr_len); + tx_skb_setmode(skb, conn->hdrdgst_en, datalen ? conn->datadgst_en : 0); + if (!count) + return 0; + + if (task->sc) { + struct scsi_data_buffer *sdb = scsi_out(task->sc); + struct scatterlist *sg = NULL; + int err; + + tdata->offset = offset; + tdata->count = count; + err = sgl_seek_offset( + sdb->table.sgl, sdb->table.nents, + tdata->offset, &tdata->sgoffset, &sg); + if (err < 0) { + pr_warn("tpdu, sgl %u, bad offset %u/%u.\n", + sdb->table.nents, tdata->offset, sdb->length); + return err; + } + err = sgl_read_to_frags(sg, tdata->sgoffset, tdata->count, + tdata->frags, MAX_PDU_FRAGS); + if (err < 0) { + pr_warn("tpdu, sgl %u, bad offset %u + %u.\n", + sdb->table.nents, tdata->offset, tdata->count); + return err; + } + tdata->nr_frags = err; + + if (tdata->nr_frags > MAX_SKB_FRAGS || + (padlen && tdata->nr_frags == MAX_SKB_FRAGS)) { + char *dst = skb->data + task->hdr_len; + struct page_frag *frag = tdata->frags; + + /* data fits in the skb's headroom */ + for (i = 0; i < tdata->nr_frags; i++, frag++) { + char *src = kmap_atomic(frag->page); + + memcpy(dst, src+frag->offset, frag->size); + dst += frag->size; + kunmap_atomic(src); + } + if (padlen) { + memset(dst, 0, padlen); + padlen = 0; + } + skb_put(skb, count + padlen); + } else { + /* data fit into frag_list */ + for (i = 0; i < tdata->nr_frags; i++) { + __skb_fill_page_desc(skb, i, + tdata->frags[i].page, + tdata->frags[i].offset, + tdata->frags[i].size); + skb_frag_ref(skb, i); + } + skb_shinfo(skb)->nr_frags = tdata->nr_frags; + skb->len += count; + skb->data_len += count; + skb->truesize += count; + } + + } else { + pg = virt_to_page(task->data); + + get_page(pg); + skb_fill_page_desc(skb, 0, pg, offset_in_page(task->data), + count); + skb->len += count; + skb->data_len += count; + skb->truesize += count; + } + + if (padlen) { + i = skb_shinfo(skb)->nr_frags; + skb_fill_page_desc(skb, skb_shinfo(skb)->nr_frags, + virt_to_page(padding), offset_in_page(padding), + padlen); + + skb->data_len += padlen; + skb->truesize += padlen; + skb->len += padlen; + } + + return 0; +} +EXPORT_SYMBOL_GPL(cxgbi_conn_init_pdu); + +int cxgbi_conn_xmit_pdu(struct iscsi_task *task) +{ + struct iscsi_tcp_conn *tcp_conn = task->conn->dd_data; + struct cxgbi_conn *cconn = tcp_conn->dd_data; + struct cxgbi_task_data *tdata = iscsi_task_cxgbi_data(task); + struct sk_buff *skb = tdata->skb; + unsigned int datalen; + int err; + + if (!skb) { + log_debug(1 << CXGBI_DBG_ISCSI | 1 << CXGBI_DBG_PDU_TX, + "task 0x%p, skb NULL.\n", task); + return 0; + } + + datalen = skb->data_len; + tdata->skb = NULL; + err = cxgbi_sock_send_pdus(cconn->cep->csk, skb); + if (err > 0) { + int pdulen = err; + + log_debug(1 << CXGBI_DBG_PDU_TX, + "task 0x%p,0x%p, skb 0x%p, len %u/%u, rv %d.\n", + task, task->sc, skb, skb->len, skb->data_len, err); + + if (task->conn->hdrdgst_en) + pdulen += ISCSI_DIGEST_SIZE; + + if (datalen && task->conn->datadgst_en) + pdulen += ISCSI_DIGEST_SIZE; + + task->conn->txdata_octets += pdulen; + return 0; + } + + if (err == -EAGAIN || err == -ENOBUFS) { + log_debug(1 << CXGBI_DBG_PDU_TX, + "task 0x%p, skb 0x%p, len %u/%u, %d EAGAIN.\n", + task, skb, skb->len, skb->data_len, err); + /* reset skb to send when we are called again */ + tdata->skb = skb; + return err; + } + + kfree_skb(skb); + log_debug(1 << CXGBI_DBG_ISCSI | 1 << CXGBI_DBG_PDU_TX, + "itt 0x%x, skb 0x%p, len %u/%u, xmit err %d.\n", + task->itt, skb, skb->len, skb->data_len, err); + iscsi_conn_printk(KERN_ERR, task->conn, "xmit err %d.\n", err); + iscsi_conn_failure(task->conn, ISCSI_ERR_XMIT_FAILED); + return err; +} +EXPORT_SYMBOL_GPL(cxgbi_conn_xmit_pdu); + +void cxgbi_cleanup_task(struct iscsi_task *task) +{ + struct cxgbi_task_data *tdata = iscsi_task_cxgbi_data(task); + + log_debug(1 << CXGBI_DBG_ISCSI, + "task 0x%p, skb 0x%p, itt 0x%x.\n", + task, tdata->skb, task->hdr_itt); + + /* never reached the xmit task callout */ + if (tdata->skb) + __kfree_skb(tdata->skb); + memset(tdata, 0, sizeof(*tdata)); + + task_release_itt(task, task->hdr_itt); + iscsi_tcp_cleanup_task(task); +} +EXPORT_SYMBOL_GPL(cxgbi_cleanup_task); + +void cxgbi_get_conn_stats(struct iscsi_cls_conn *cls_conn, + struct iscsi_stats *stats) +{ + struct iscsi_conn *conn = cls_conn->dd_data; + + stats->txdata_octets = conn->txdata_octets; + stats->rxdata_octets = conn->rxdata_octets; + stats->scsicmd_pdus = conn->scsicmd_pdus_cnt; + stats->dataout_pdus = conn->dataout_pdus_cnt; + stats->scsirsp_pdus = conn->scsirsp_pdus_cnt; + stats->datain_pdus = conn->datain_pdus_cnt; + stats->r2t_pdus = conn->r2t_pdus_cnt; + stats->tmfcmd_pdus = conn->tmfcmd_pdus_cnt; + stats->tmfrsp_pdus = conn->tmfrsp_pdus_cnt; + stats->digest_err = 0; + stats->timeout_err = 0; + stats->custom_length = 1; + strcpy(stats->custom[0].desc, "eh_abort_cnt"); + stats->custom[0].value = conn->eh_abort_cnt; +} +EXPORT_SYMBOL_GPL(cxgbi_get_conn_stats); + +static int cxgbi_conn_max_xmit_dlength(struct iscsi_conn *conn) +{ + struct iscsi_tcp_conn *tcp_conn = conn->dd_data; + struct cxgbi_conn *cconn = tcp_conn->dd_data; + struct cxgbi_device *cdev = cconn->chba->cdev; + unsigned int headroom = SKB_MAX_HEAD(cdev->skb_tx_rsvd); + unsigned int max_def = 512 * MAX_SKB_FRAGS; + unsigned int max = max(max_def, headroom); + + max = min(cconn->chba->cdev->tx_max_size, max); + if (conn->max_xmit_dlength) + conn->max_xmit_dlength = min(conn->max_xmit_dlength, max); + else + conn->max_xmit_dlength = max; + cxgbi_align_pdu_size(conn->max_xmit_dlength); + + return 0; +} + +static int cxgbi_conn_max_recv_dlength(struct iscsi_conn *conn) +{ + struct iscsi_tcp_conn *tcp_conn = conn->dd_data; + struct cxgbi_conn *cconn = tcp_conn->dd_data; + unsigned int max = cconn->chba->cdev->rx_max_size; + + cxgbi_align_pdu_size(max); + + if (conn->max_recv_dlength) { + if (conn->max_recv_dlength > max) { + pr_err("MaxRecvDataSegmentLength %u > %u.\n", + conn->max_recv_dlength, max); + return -EINVAL; + } + conn->max_recv_dlength = min(conn->max_recv_dlength, max); + cxgbi_align_pdu_size(conn->max_recv_dlength); + } else + conn->max_recv_dlength = max; + + return 0; +} + +int cxgbi_set_conn_param(struct iscsi_cls_conn *cls_conn, + enum iscsi_param param, char *buf, int buflen) +{ + struct iscsi_conn *conn = cls_conn->dd_data; + struct iscsi_tcp_conn *tcp_conn = conn->dd_data; + struct cxgbi_conn *cconn = tcp_conn->dd_data; + struct cxgbi_sock *csk = cconn->cep->csk; + int err; + + log_debug(1 << CXGBI_DBG_ISCSI, + "cls_conn 0x%p, param %d, buf(%d) %s.\n", + cls_conn, param, buflen, buf); + + switch (param) { + case ISCSI_PARAM_HDRDGST_EN: + err = iscsi_set_param(cls_conn, param, buf, buflen); + if (!err && conn->hdrdgst_en) + err = csk->cdev->csk_ddp_setup_digest(csk, csk->tid, + conn->hdrdgst_en, + conn->datadgst_en, 0); + break; + case ISCSI_PARAM_DATADGST_EN: + err = iscsi_set_param(cls_conn, param, buf, buflen); + if (!err && conn->datadgst_en) + err = csk->cdev->csk_ddp_setup_digest(csk, csk->tid, + conn->hdrdgst_en, + conn->datadgst_en, 0); + break; + case ISCSI_PARAM_MAX_R2T: + return iscsi_tcp_set_max_r2t(conn, buf); + case ISCSI_PARAM_MAX_RECV_DLENGTH: + err = iscsi_set_param(cls_conn, param, buf, buflen); + if (!err) + err = cxgbi_conn_max_recv_dlength(conn); + break; + case ISCSI_PARAM_MAX_XMIT_DLENGTH: + err = iscsi_set_param(cls_conn, param, buf, buflen); + if (!err) + err = cxgbi_conn_max_xmit_dlength(conn); + break; + default: + return iscsi_set_param(cls_conn, param, buf, buflen); + } + return err; +} +EXPORT_SYMBOL_GPL(cxgbi_set_conn_param); + +static inline int csk_print_port(struct cxgbi_sock *csk, char *buf) +{ + int len; + + cxgbi_sock_get(csk); + len = sprintf(buf, "%hu\n", ntohs(csk->daddr.sin_port)); + cxgbi_sock_put(csk); + + return len; +} + +static inline int csk_print_ip(struct cxgbi_sock *csk, char *buf) +{ + int len; + + cxgbi_sock_get(csk); + if (csk->csk_family == AF_INET) + len = sprintf(buf, "%pI4", + &csk->daddr.sin_addr.s_addr); + else + len = sprintf(buf, "%pI6", + &csk->daddr6.sin6_addr); + + cxgbi_sock_put(csk); + + return len; +} + +int cxgbi_get_ep_param(struct iscsi_endpoint *ep, enum iscsi_param param, + char *buf) +{ + struct cxgbi_endpoint *cep = ep->dd_data; + struct cxgbi_sock *csk; + int len; + + log_debug(1 << CXGBI_DBG_ISCSI, + "cls_conn 0x%p, param %d.\n", ep, param); + + switch (param) { + case ISCSI_PARAM_CONN_PORT: + case ISCSI_PARAM_CONN_ADDRESS: + if (!cep) + return -ENOTCONN; + + csk = cep->csk; + if (!csk) + return -ENOTCONN; + + return iscsi_conn_get_addr_param((struct sockaddr_storage *) + &csk->daddr, param, buf); + default: + return -ENOSYS; + } + return len; +} +EXPORT_SYMBOL_GPL(cxgbi_get_ep_param); + +struct iscsi_cls_conn * +cxgbi_create_conn(struct iscsi_cls_session *cls_session, u32 cid) +{ + struct iscsi_cls_conn *cls_conn; + struct iscsi_conn *conn; + struct iscsi_tcp_conn *tcp_conn; + struct cxgbi_conn *cconn; + + cls_conn = iscsi_tcp_conn_setup(cls_session, sizeof(*cconn), cid); + if (!cls_conn) + return NULL; + + conn = cls_conn->dd_data; + tcp_conn = conn->dd_data; + cconn = tcp_conn->dd_data; + cconn->iconn = conn; + + log_debug(1 << CXGBI_DBG_ISCSI, + "cid %u(0x%x), cls 0x%p,0x%p, conn 0x%p,0x%p,0x%p.\n", + cid, cid, cls_session, cls_conn, conn, tcp_conn, cconn); + + return cls_conn; +} +EXPORT_SYMBOL_GPL(cxgbi_create_conn); + +int cxgbi_bind_conn(struct iscsi_cls_session *cls_session, + struct iscsi_cls_conn *cls_conn, + u64 transport_eph, int is_leading) +{ + struct iscsi_conn *conn = cls_conn->dd_data; + struct iscsi_tcp_conn *tcp_conn = conn->dd_data; + struct cxgbi_conn *cconn = tcp_conn->dd_data; + struct iscsi_endpoint *ep; + struct cxgbi_endpoint *cep; + struct cxgbi_sock *csk; + int err; + + ep = iscsi_lookup_endpoint(transport_eph); + if (!ep) + return -EINVAL; + + /* setup ddp pagesize */ + cep = ep->dd_data; + csk = cep->csk; + err = csk->cdev->csk_ddp_setup_pgidx(csk, csk->tid, page_idx, 0); + if (err < 0) + return err; + + err = iscsi_conn_bind(cls_session, cls_conn, is_leading); + if (err) + return -EINVAL; + + /* calculate the tag idx bits needed for this conn based on cmds_max */ + cconn->task_idx_bits = (__ilog2_u32(conn->session->cmds_max - 1)) + 1; + + write_lock_bh(&csk->callback_lock); + csk->user_data = conn; + cconn->chba = cep->chba; + cconn->cep = cep; + cep->cconn = cconn; + write_unlock_bh(&csk->callback_lock); + + cxgbi_conn_max_xmit_dlength(conn); + cxgbi_conn_max_recv_dlength(conn); + + log_debug(1 << CXGBI_DBG_ISCSI, + "cls 0x%p,0x%p, ep 0x%p, cconn 0x%p, csk 0x%p.\n", + cls_session, cls_conn, ep, cconn, csk); + /* init recv engine */ + iscsi_tcp_hdr_recv_prep(tcp_conn); + + return 0; +} +EXPORT_SYMBOL_GPL(cxgbi_bind_conn); + +struct iscsi_cls_session *cxgbi_create_session(struct iscsi_endpoint *ep, + u16 cmds_max, u16 qdepth, + u32 initial_cmdsn) +{ + struct cxgbi_endpoint *cep; + struct cxgbi_hba *chba; + struct Scsi_Host *shost; + struct iscsi_cls_session *cls_session; + struct iscsi_session *session; + + if (!ep) { + pr_err("missing endpoint.\n"); + return NULL; + } + + cep = ep->dd_data; + chba = cep->chba; + shost = chba->shost; + + BUG_ON(chba != iscsi_host_priv(shost)); + + cls_session = iscsi_session_setup(chba->cdev->itp, shost, + cmds_max, 0, + sizeof(struct iscsi_tcp_task) + + sizeof(struct cxgbi_task_data), + initial_cmdsn, ISCSI_MAX_TARGET); + if (!cls_session) + return NULL; + + session = cls_session->dd_data; + if (iscsi_tcp_r2tpool_alloc(session)) + goto remove_session; + + log_debug(1 << CXGBI_DBG_ISCSI, + "ep 0x%p, cls sess 0x%p.\n", ep, cls_session); + return cls_session; + +remove_session: + iscsi_session_teardown(cls_session); + return NULL; +} +EXPORT_SYMBOL_GPL(cxgbi_create_session); + +void cxgbi_destroy_session(struct iscsi_cls_session *cls_session) +{ + log_debug(1 << CXGBI_DBG_ISCSI, + "cls sess 0x%p.\n", cls_session); + + iscsi_tcp_r2tpool_free(cls_session->dd_data); + iscsi_session_teardown(cls_session); +} +EXPORT_SYMBOL_GPL(cxgbi_destroy_session); + +int cxgbi_set_host_param(struct Scsi_Host *shost, enum iscsi_host_param param, + char *buf, int buflen) +{ + struct cxgbi_hba *chba = iscsi_host_priv(shost); + + if (!chba->ndev) { + shost_printk(KERN_ERR, shost, "Could not get host param. " + "netdev for host not set.\n"); + return -ENODEV; + } + + log_debug(1 << CXGBI_DBG_ISCSI, + "shost 0x%p, hba 0x%p,%s, param %d, buf(%d) %s.\n", + shost, chba, chba->ndev->name, param, buflen, buf); + + switch (param) { + case ISCSI_HOST_PARAM_IPADDRESS: + { + __be32 addr = in_aton(buf); + log_debug(1 << CXGBI_DBG_ISCSI, + "hba %s, req. ipv4 %pI4.\n", chba->ndev->name, &addr); + cxgbi_set_iscsi_ipv4(chba, addr); + return 0; + } + case ISCSI_HOST_PARAM_HWADDRESS: + case ISCSI_HOST_PARAM_NETDEV_NAME: + return 0; + default: + return iscsi_host_set_param(shost, param, buf, buflen); + } +} +EXPORT_SYMBOL_GPL(cxgbi_set_host_param); + +int cxgbi_get_host_param(struct Scsi_Host *shost, enum iscsi_host_param param, + char *buf) +{ + struct cxgbi_hba *chba = iscsi_host_priv(shost); + int len = 0; + + if (!chba->ndev) { + shost_printk(KERN_ERR, shost, "Could not get host param. " + "netdev for host not set.\n"); + return -ENODEV; + } + + log_debug(1 << CXGBI_DBG_ISCSI, + "shost 0x%p, hba 0x%p,%s, param %d.\n", + shost, chba, chba->ndev->name, param); + + switch (param) { + case ISCSI_HOST_PARAM_HWADDRESS: + len = sysfs_format_mac(buf, chba->ndev->dev_addr, 6); + break; + case ISCSI_HOST_PARAM_NETDEV_NAME: + len = sprintf(buf, "%s\n", chba->ndev->name); + break; + case ISCSI_HOST_PARAM_IPADDRESS: + { + struct cxgbi_sock *csk = find_sock_on_port(chba->cdev, + chba->port_id); + if (csk) { + len = sprintf(buf, "%pIS", + (struct sockaddr *)&csk->saddr); + } + log_debug(1 << CXGBI_DBG_ISCSI, + "hba %s, addr %s.\n", chba->ndev->name, buf); + break; + } + default: + return iscsi_host_get_param(shost, param, buf); + } + + return len; +} +EXPORT_SYMBOL_GPL(cxgbi_get_host_param); + +struct iscsi_endpoint *cxgbi_ep_connect(struct Scsi_Host *shost, + struct sockaddr *dst_addr, + int non_blocking) +{ + struct iscsi_endpoint *ep; + struct cxgbi_endpoint *cep; + struct cxgbi_hba *hba = NULL; + struct cxgbi_sock *csk; + int err = -EINVAL; + + log_debug(1 << CXGBI_DBG_ISCSI | 1 << CXGBI_DBG_SOCK, + "shost 0x%p, non_blocking %d, dst_addr 0x%p.\n", + shost, non_blocking, dst_addr); + + if (shost) { + hba = iscsi_host_priv(shost); + if (!hba) { + pr_info("shost 0x%p, priv NULL.\n", shost); + goto err_out; + } + } + + if (dst_addr->sa_family == AF_INET) { + csk = cxgbi_check_route(dst_addr); +#if IS_ENABLED(CONFIG_IPV6) + } else if (dst_addr->sa_family == AF_INET6) { + csk = cxgbi_check_route6(dst_addr); +#endif + } else { + pr_info("address family 0x%x NOT supported.\n", + dst_addr->sa_family); + err = -EAFNOSUPPORT; + return (struct iscsi_endpoint *)ERR_PTR(err); + } + + if (IS_ERR(csk)) + return (struct iscsi_endpoint *)csk; + cxgbi_sock_get(csk); + + if (!hba) + hba = csk->cdev->hbas[csk->port_id]; + else if (hba != csk->cdev->hbas[csk->port_id]) { + pr_info("Could not connect through requested host %u" + "hba 0x%p != 0x%p (%u).\n", + shost->host_no, hba, + csk->cdev->hbas[csk->port_id], csk->port_id); + err = -ENOSPC; + goto release_conn; + } + + err = sock_get_port(csk); + if (err) + goto release_conn; + + cxgbi_sock_set_state(csk, CTP_CONNECTING); + err = csk->cdev->csk_init_act_open(csk); + if (err) + goto release_conn; + + if (cxgbi_sock_is_closing(csk)) { + err = -ENOSPC; + pr_info("csk 0x%p is closing.\n", csk); + goto release_conn; + } + + ep = iscsi_create_endpoint(sizeof(*cep)); + if (!ep) { + err = -ENOMEM; + pr_info("iscsi alloc ep, OOM.\n"); + goto release_conn; + } + + cep = ep->dd_data; + cep->csk = csk; + cep->chba = hba; + + log_debug(1 << CXGBI_DBG_ISCSI | 1 << CXGBI_DBG_SOCK, + "ep 0x%p, cep 0x%p, csk 0x%p, hba 0x%p,%s.\n", + ep, cep, csk, hba, hba->ndev->name); + return ep; + +release_conn: + cxgbi_sock_put(csk); + cxgbi_sock_closed(csk); +err_out: + return ERR_PTR(err); +} +EXPORT_SYMBOL_GPL(cxgbi_ep_connect); + +int cxgbi_ep_poll(struct iscsi_endpoint *ep, int timeout_ms) +{ + struct cxgbi_endpoint *cep = ep->dd_data; + struct cxgbi_sock *csk = cep->csk; + + if (!cxgbi_sock_is_established(csk)) + return 0; + return 1; +} +EXPORT_SYMBOL_GPL(cxgbi_ep_poll); + +void cxgbi_ep_disconnect(struct iscsi_endpoint *ep) +{ + struct cxgbi_endpoint *cep = ep->dd_data; + struct cxgbi_conn *cconn = cep->cconn; + struct cxgbi_sock *csk = cep->csk; + + log_debug(1 << CXGBI_DBG_ISCSI | 1 << CXGBI_DBG_SOCK, + "ep 0x%p, cep 0x%p, cconn 0x%p, csk 0x%p,%u,0x%lx.\n", + ep, cep, cconn, csk, csk->state, csk->flags); + + if (cconn && cconn->iconn) { + iscsi_suspend_tx(cconn->iconn); + write_lock_bh(&csk->callback_lock); + cep->csk->user_data = NULL; + cconn->cep = NULL; + write_unlock_bh(&csk->callback_lock); + } + iscsi_destroy_endpoint(ep); + + if (likely(csk->state >= CTP_ESTABLISHED)) + need_active_close(csk); + else + cxgbi_sock_closed(csk); + + cxgbi_sock_put(csk); +} +EXPORT_SYMBOL_GPL(cxgbi_ep_disconnect); + +int cxgbi_iscsi_init(struct iscsi_transport *itp, + struct scsi_transport_template **stt) +{ + *stt = iscsi_register_transport(itp); + if (*stt == NULL) { + pr_err("unable to register %s transport 0x%p.\n", + itp->name, itp); + return -ENODEV; + } + log_debug(1 << CXGBI_DBG_ISCSI, + "%s, registered iscsi transport 0x%p.\n", + itp->name, stt); + return 0; +} +EXPORT_SYMBOL_GPL(cxgbi_iscsi_init); + +void cxgbi_iscsi_cleanup(struct iscsi_transport *itp, + struct scsi_transport_template **stt) +{ + if (*stt) { + log_debug(1 << CXGBI_DBG_ISCSI, + "de-register transport 0x%p, %s, stt 0x%p.\n", + itp, itp->name, *stt); + *stt = NULL; + iscsi_unregister_transport(itp); + } +} +EXPORT_SYMBOL_GPL(cxgbi_iscsi_cleanup); + +umode_t cxgbi_attr_is_visible(int param_type, int param) +{ + switch (param_type) { + case ISCSI_HOST_PARAM: + switch (param) { + case ISCSI_HOST_PARAM_NETDEV_NAME: + case ISCSI_HOST_PARAM_HWADDRESS: + case ISCSI_HOST_PARAM_IPADDRESS: + case ISCSI_HOST_PARAM_INITIATOR_NAME: + return S_IRUGO; + default: + return 0; + } + case ISCSI_PARAM: + switch (param) { + case ISCSI_PARAM_MAX_RECV_DLENGTH: + case ISCSI_PARAM_MAX_XMIT_DLENGTH: + case ISCSI_PARAM_HDRDGST_EN: + case ISCSI_PARAM_DATADGST_EN: + case ISCSI_PARAM_CONN_ADDRESS: + case ISCSI_PARAM_CONN_PORT: + case ISCSI_PARAM_EXP_STATSN: + case ISCSI_PARAM_PERSISTENT_ADDRESS: + case ISCSI_PARAM_PERSISTENT_PORT: + case ISCSI_PARAM_PING_TMO: + case ISCSI_PARAM_RECV_TMO: + case ISCSI_PARAM_INITIAL_R2T_EN: + case ISCSI_PARAM_MAX_R2T: + case ISCSI_PARAM_IMM_DATA_EN: + case ISCSI_PARAM_FIRST_BURST: + case ISCSI_PARAM_MAX_BURST: + case ISCSI_PARAM_PDU_INORDER_EN: + case ISCSI_PARAM_DATASEQ_INORDER_EN: + case ISCSI_PARAM_ERL: + case ISCSI_PARAM_TARGET_NAME: + case ISCSI_PARAM_TPGT: + case ISCSI_PARAM_USERNAME: + case ISCSI_PARAM_PASSWORD: + case ISCSI_PARAM_USERNAME_IN: + case ISCSI_PARAM_PASSWORD_IN: + case ISCSI_PARAM_FAST_ABORT: + case ISCSI_PARAM_ABORT_TMO: + case ISCSI_PARAM_LU_RESET_TMO: + case ISCSI_PARAM_TGT_RESET_TMO: + case ISCSI_PARAM_IFACE_NAME: + case ISCSI_PARAM_INITIATOR_NAME: + return S_IRUGO; + default: + return 0; + } + } + + return 0; +} +EXPORT_SYMBOL_GPL(cxgbi_attr_is_visible); + +static int __init libcxgbi_init_module(void) +{ + sw_tag_idx_bits = (__ilog2_u32(ISCSI_ITT_MASK)) + 1; + sw_tag_age_bits = (__ilog2_u32(ISCSI_AGE_MASK)) + 1; + + pr_info("tag itt 0x%x, %u bits, age 0x%x, %u bits.\n", + ISCSI_ITT_MASK, sw_tag_idx_bits, + ISCSI_AGE_MASK, sw_tag_age_bits); + + ddp_setup_host_page_size(); + return 0; +} + +static void __exit libcxgbi_exit_module(void) +{ + cxgbi_device_unregister_all(0xFF); + return; +} + +module_init(libcxgbi_init_module); +module_exit(libcxgbi_exit_module); diff --git a/drivers/scsi/cxgbi/libcxgbi.h b/drivers/scsi/cxgbi/libcxgbi.h new file mode 100644 index 0000000..2c7cb1c --- /dev/null +++ b/drivers/scsi/cxgbi/libcxgbi.h @@ -0,0 +1,758 @@ +/* + * libcxgbi.h: Chelsio common library for T3/T4 iSCSI driver. + * + * Copyright (c) 2010 Chelsio Communications, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation. + * + * Written by: Karen Xie (kxie@chelsio.com) + * Written by: Rakesh Ranjan (rranjan@chelsio.com) + */ + +#ifndef __LIBCXGBI_H__ +#define __LIBCXGBI_H__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +enum cxgbi_dbg_flag { + CXGBI_DBG_ISCSI, + CXGBI_DBG_DDP, + CXGBI_DBG_TOE, + CXGBI_DBG_SOCK, + + CXGBI_DBG_PDU_TX, + CXGBI_DBG_PDU_RX, + CXGBI_DBG_DEV, +}; + +#define log_debug(level, fmt, ...) \ + do { \ + if (dbg_level & (level)) \ + pr_info(fmt, ##__VA_ARGS__); \ + } while (0) + +#define pr_info_ipaddr(fmt_trail, \ + addr1, addr2, args_trail...) \ +do { \ + if (!((1 << CXGBI_DBG_SOCK) & dbg_level)) \ + break; \ + pr_info("%pISpc - %pISpc, " fmt_trail, \ + addr1, addr2, args_trail); \ +} while (0) + +/* max. connections per adapter */ +#define CXGBI_MAX_CONN 16384 + +/* always allocate rooms for AHS */ +#define SKB_TX_ISCSI_PDU_HEADER_MAX \ + (sizeof(struct iscsi_hdr) + ISCSI_MAX_AHS_SIZE) + +#define ISCSI_PDU_NONPAYLOAD_LEN 312 /* bhs(48) + ahs(256) + digest(8)*/ + +/* + * align pdu size to multiple of 512 for better performance + */ +#define cxgbi_align_pdu_size(n) do { n = (n) & (~511); } while (0) + +#define ULP2_MODE_ISCSI 2 + +#define ULP2_MAX_PKT_SIZE 16224 +#define ULP2_MAX_PDU_PAYLOAD \ + (ULP2_MAX_PKT_SIZE - ISCSI_PDU_NONPAYLOAD_LEN) + +/* + * For iscsi connections HW may inserts digest bytes into the pdu. Those digest + * bytes are not sent by the host but are part of the TCP payload and therefore + * consume TCP sequence space. + */ +static const unsigned int ulp2_extra_len[] = { 0, 4, 4, 8 }; +static inline unsigned int cxgbi_ulp_extra_len(int submode) +{ + return ulp2_extra_len[submode & 3]; +} + +/* + * struct pagepod_hdr, pagepod - pagepod format + */ + +#define CPL_RX_DDP_STATUS_DDP_SHIFT 16 /* ddp'able */ +#define CPL_RX_DDP_STATUS_PAD_SHIFT 19 /* pad error */ +#define CPL_RX_DDP_STATUS_HCRC_SHIFT 20 /* hcrc error */ +#define CPL_RX_DDP_STATUS_DCRC_SHIFT 21 /* dcrc error */ + +struct cxgbi_pagepod_hdr { + u32 vld_tid; + u32 pgsz_tag_clr; + u32 max_offset; + u32 page_offset; + u64 rsvd; +}; + +#define PPOD_PAGES_MAX 4 +struct cxgbi_pagepod { + struct cxgbi_pagepod_hdr hdr; + u64 addr[PPOD_PAGES_MAX + 1]; +}; + +struct cxgbi_tag_format { + unsigned char sw_bits; + unsigned char rsvd_bits; + unsigned char rsvd_shift; + unsigned char filler[1]; + u32 rsvd_mask; +}; + +struct cxgbi_gather_list { + unsigned int tag; + unsigned int length; + unsigned int offset; + unsigned int nelem; + struct page **pages; + dma_addr_t phys_addr[0]; +}; + +struct cxgbi_ddp_info { + struct kref refcnt; + struct cxgbi_device *cdev; + struct pci_dev *pdev; + unsigned int max_txsz; + unsigned int max_rxsz; + unsigned int llimit; + unsigned int ulimit; + unsigned int nppods; + unsigned int idx_last; + unsigned char idx_bits; + unsigned char filler[3]; + unsigned int idx_mask; + unsigned int rsvd_tag_mask; + spinlock_t map_lock; + struct cxgbi_gather_list **gl_map; +}; + +#define DDP_PGIDX_MAX 4 +#define DDP_THRESHOLD 2048 + +#define PPOD_PAGES_SHIFT 2 /* 4 pages per pod */ + +#define PPOD_SIZE sizeof(struct cxgbi_pagepod) /* 64 */ +#define PPOD_SIZE_SHIFT 6 + +#define ULPMEM_DSGL_MAX_NPPODS 16 /* 1024/PPOD_SIZE */ +#define ULPMEM_IDATA_MAX_NPPODS 4 /* 256/PPOD_SIZE */ +#define PCIE_MEMWIN_MAX_NPPODS 16 /* 1024/PPOD_SIZE */ + +#define PPOD_COLOR_SHIFT 0 +#define PPOD_COLOR(x) ((x) << PPOD_COLOR_SHIFT) + +#define PPOD_IDX_SHIFT 6 +#define PPOD_IDX_MAX_SIZE 24 + +#define PPOD_TID_SHIFT 0 +#define PPOD_TID(x) ((x) << PPOD_TID_SHIFT) + +#define PPOD_TAG_SHIFT 6 +#define PPOD_TAG(x) ((x) << PPOD_TAG_SHIFT) + +#define PPOD_VALID_SHIFT 24 +#define PPOD_VALID(x) ((x) << PPOD_VALID_SHIFT) +#define PPOD_VALID_FLAG PPOD_VALID(1U) + +/* + * sge_opaque_hdr - + * Opaque version of structure the SGE stores at skb->head of TX_DATA packets + * and for which we must reserve space. + */ +struct sge_opaque_hdr { + void *dev; + dma_addr_t addr[MAX_SKB_FRAGS + 1]; +}; + +struct cxgbi_sock { + struct cxgbi_device *cdev; + + int tid; + int atid; + unsigned long flags; + unsigned int mtu; + unsigned short rss_qid; + unsigned short txq_idx; + unsigned short advmss; + unsigned int tx_chan; + unsigned int rx_chan; + unsigned int mss_idx; + unsigned int smac_idx; + unsigned char port_id; + int wr_max_cred; + int wr_cred; + int wr_una_cred; + unsigned char hcrc_len; + unsigned char dcrc_len; + + void *l2t; + struct sk_buff *wr_pending_head; + struct sk_buff *wr_pending_tail; + struct sk_buff *cpl_close; + struct sk_buff *cpl_abort_req; + struct sk_buff *cpl_abort_rpl; + struct sk_buff *skb_ulp_lhdr; + spinlock_t lock; + struct kref refcnt; + unsigned int state; + unsigned int csk_family; + union { + struct sockaddr_in saddr; + struct sockaddr_in6 saddr6; + }; + union { + struct sockaddr_in daddr; + struct sockaddr_in6 daddr6; + }; + struct dst_entry *dst; + struct sk_buff_head receive_queue; + struct sk_buff_head write_queue; + struct timer_list retry_timer; + int err; + rwlock_t callback_lock; + void *user_data; + + u32 rcv_nxt; + u32 copied_seq; + u32 rcv_wup; + u32 snd_nxt; + u32 snd_una; + u32 write_seq; +}; + +/* + * connection states + */ +enum cxgbi_sock_states{ + CTP_CLOSED, + CTP_CONNECTING, + CTP_ACTIVE_OPEN, + CTP_ESTABLISHED, + CTP_ACTIVE_CLOSE, + CTP_PASSIVE_CLOSE, + CTP_CLOSE_WAIT_1, + CTP_CLOSE_WAIT_2, + CTP_ABORTING, +}; + +/* + * Connection flags -- many to track some close related events. + */ +enum cxgbi_sock_flags { + CTPF_ABORT_RPL_RCVD, /*received one ABORT_RPL_RSS message */ + CTPF_ABORT_REQ_RCVD, /*received one ABORT_REQ_RSS message */ + CTPF_ABORT_RPL_PENDING, /* expecting an abort reply */ + CTPF_TX_DATA_SENT, /* already sent a TX_DATA WR */ + CTPF_ACTIVE_CLOSE_NEEDED,/* need to be closed */ + CTPF_HAS_ATID, /* reserved atid */ + CTPF_HAS_TID, /* reserved hw tid */ + CTPF_OFFLOAD_DOWN, /* offload function off */ +}; + +struct cxgbi_skb_rx_cb { + __u32 ddigest; + __u32 pdulen; +}; + +struct cxgbi_skb_tx_cb { + void *l2t; + struct sk_buff *wr_next; +}; + +enum cxgbi_skcb_flags { + SKCBF_TX_NEED_HDR, /* packet needs a header */ + SKCBF_RX_COALESCED, /* received whole pdu */ + SKCBF_RX_HDR, /* received pdu header */ + SKCBF_RX_DATA, /* received pdu payload */ + SKCBF_RX_STATUS, /* received ddp status */ + SKCBF_RX_DATA_DDPD, /* pdu payload ddp'd */ + SKCBF_RX_HCRC_ERR, /* header digest error */ + SKCBF_RX_DCRC_ERR, /* data digest error */ + SKCBF_RX_PAD_ERR, /* padding byte error */ +}; + +struct cxgbi_skb_cb { + unsigned char ulp_mode; + unsigned long flags; + unsigned int seq; + union { + struct cxgbi_skb_rx_cb rx; + struct cxgbi_skb_tx_cb tx; + }; +}; + +#define CXGBI_SKB_CB(skb) ((struct cxgbi_skb_cb *)&((skb)->cb[0])) +#define cxgbi_skcb_flags(skb) (CXGBI_SKB_CB(skb)->flags) +#define cxgbi_skcb_ulp_mode(skb) (CXGBI_SKB_CB(skb)->ulp_mode) +#define cxgbi_skcb_tcp_seq(skb) (CXGBI_SKB_CB(skb)->seq) +#define cxgbi_skcb_rx_ddigest(skb) (CXGBI_SKB_CB(skb)->rx.ddigest) +#define cxgbi_skcb_rx_pdulen(skb) (CXGBI_SKB_CB(skb)->rx.pdulen) +#define cxgbi_skcb_tx_wr_next(skb) (CXGBI_SKB_CB(skb)->tx.wr_next) + +static inline void cxgbi_skcb_set_flag(struct sk_buff *skb, + enum cxgbi_skcb_flags flag) +{ + __set_bit(flag, &(cxgbi_skcb_flags(skb))); +} + +static inline void cxgbi_skcb_clear_flag(struct sk_buff *skb, + enum cxgbi_skcb_flags flag) +{ + __clear_bit(flag, &(cxgbi_skcb_flags(skb))); +} + +static inline int cxgbi_skcb_test_flag(struct sk_buff *skb, + enum cxgbi_skcb_flags flag) +{ + return test_bit(flag, &(cxgbi_skcb_flags(skb))); +} + +static inline void cxgbi_sock_set_flag(struct cxgbi_sock *csk, + enum cxgbi_sock_flags flag) +{ + __set_bit(flag, &csk->flags); + log_debug(1 << CXGBI_DBG_SOCK, + "csk 0x%p,%u,0x%lx, bit %d.\n", + csk, csk->state, csk->flags, flag); +} + +static inline void cxgbi_sock_clear_flag(struct cxgbi_sock *csk, + enum cxgbi_sock_flags flag) +{ + __clear_bit(flag, &csk->flags); + log_debug(1 << CXGBI_DBG_SOCK, + "csk 0x%p,%u,0x%lx, bit %d.\n", + csk, csk->state, csk->flags, flag); +} + +static inline int cxgbi_sock_flag(struct cxgbi_sock *csk, + enum cxgbi_sock_flags flag) +{ + if (csk == NULL) + return 0; + return test_bit(flag, &csk->flags); +} + +static inline void cxgbi_sock_set_state(struct cxgbi_sock *csk, int state) +{ + log_debug(1 << CXGBI_DBG_SOCK, + "csk 0x%p,%u,0x%lx, state -> %u.\n", + csk, csk->state, csk->flags, state); + csk->state = state; +} + +static inline void cxgbi_sock_free(struct kref *kref) +{ + struct cxgbi_sock *csk = container_of(kref, + struct cxgbi_sock, + refcnt); + if (csk) { + log_debug(1 << CXGBI_DBG_SOCK, + "free csk 0x%p, state %u, flags 0x%lx\n", + csk, csk->state, csk->flags); + kfree(csk); + } +} + +static inline void __cxgbi_sock_put(const char *fn, struct cxgbi_sock *csk) +{ + log_debug(1 << CXGBI_DBG_SOCK, + "%s, put csk 0x%p, ref %u-1.\n", + fn, csk, atomic_read(&csk->refcnt.refcount)); + kref_put(&csk->refcnt, cxgbi_sock_free); +} +#define cxgbi_sock_put(csk) __cxgbi_sock_put(__func__, csk) + +static inline void __cxgbi_sock_get(const char *fn, struct cxgbi_sock *csk) +{ + log_debug(1 << CXGBI_DBG_SOCK, + "%s, get csk 0x%p, ref %u+1.\n", + fn, csk, atomic_read(&csk->refcnt.refcount)); + kref_get(&csk->refcnt); +} +#define cxgbi_sock_get(csk) __cxgbi_sock_get(__func__, csk) + +static inline int cxgbi_sock_is_closing(struct cxgbi_sock *csk) +{ + return csk->state >= CTP_ACTIVE_CLOSE; +} + +static inline int cxgbi_sock_is_established(struct cxgbi_sock *csk) +{ + return csk->state == CTP_ESTABLISHED; +} + +static inline void cxgbi_sock_purge_write_queue(struct cxgbi_sock *csk) +{ + struct sk_buff *skb; + + while ((skb = __skb_dequeue(&csk->write_queue))) + __kfree_skb(skb); +} + +static inline unsigned int cxgbi_sock_compute_wscale(unsigned int win) +{ + unsigned int wscale = 0; + + while (wscale < 14 && (65535 << wscale) < win) + wscale++; + return wscale; +} + +static inline struct sk_buff *alloc_wr(int wrlen, int dlen, gfp_t gfp) +{ + struct sk_buff *skb = alloc_skb(wrlen + dlen, gfp); + + if (skb) { + __skb_put(skb, wrlen); + memset(skb->head, 0, wrlen + dlen); + } else + pr_info("alloc cpl wr skb %u+%u, OOM.\n", wrlen, dlen); + return skb; +} + + +/* + * The number of WRs needed for an skb depends on the number of fragments + * in the skb and whether it has any payload in its main body. This maps the + * length of the gather list represented by an skb into the # of necessary WRs. + * The extra two fragments are for iscsi bhs and payload padding. + */ +#define SKB_WR_LIST_SIZE (MAX_SKB_FRAGS + 2) + +static inline void cxgbi_sock_reset_wr_list(struct cxgbi_sock *csk) +{ + csk->wr_pending_head = csk->wr_pending_tail = NULL; +} + +static inline void cxgbi_sock_enqueue_wr(struct cxgbi_sock *csk, + struct sk_buff *skb) +{ + cxgbi_skcb_tx_wr_next(skb) = NULL; + /* + * We want to take an extra reference since both us and the driver + * need to free the packet before it's really freed. We know there's + * just one user currently so we use atomic_set rather than skb_get + * to avoid the atomic op. + */ + atomic_set(&skb->users, 2); + + if (!csk->wr_pending_head) + csk->wr_pending_head = skb; + else + cxgbi_skcb_tx_wr_next(csk->wr_pending_tail) = skb; + csk->wr_pending_tail = skb; +} + +static inline int cxgbi_sock_count_pending_wrs(const struct cxgbi_sock *csk) +{ + int n = 0; + const struct sk_buff *skb = csk->wr_pending_head; + + while (skb) { + n += skb->csum; + skb = cxgbi_skcb_tx_wr_next(skb); + } + return n; +} + +static inline struct sk_buff *cxgbi_sock_peek_wr(const struct cxgbi_sock *csk) +{ + return csk->wr_pending_head; +} + +static inline struct sk_buff *cxgbi_sock_dequeue_wr(struct cxgbi_sock *csk) +{ + struct sk_buff *skb = csk->wr_pending_head; + + if (likely(skb)) { + csk->wr_pending_head = cxgbi_skcb_tx_wr_next(skb); + cxgbi_skcb_tx_wr_next(skb) = NULL; + } + return skb; +} + +void cxgbi_sock_check_wr_invariants(const struct cxgbi_sock *); +void cxgbi_sock_purge_wr_queue(struct cxgbi_sock *); +void cxgbi_sock_skb_entail(struct cxgbi_sock *, struct sk_buff *); +void cxgbi_sock_fail_act_open(struct cxgbi_sock *, int); +void cxgbi_sock_act_open_req_arp_failure(void *, struct sk_buff *); +void cxgbi_sock_closed(struct cxgbi_sock *); +void cxgbi_sock_established(struct cxgbi_sock *, unsigned int, unsigned int); +void cxgbi_sock_rcv_abort_rpl(struct cxgbi_sock *); +void cxgbi_sock_rcv_peer_close(struct cxgbi_sock *); +void cxgbi_sock_rcv_close_conn_rpl(struct cxgbi_sock *, u32); +void cxgbi_sock_rcv_wr_ack(struct cxgbi_sock *, unsigned int, unsigned int, + int); +unsigned int cxgbi_sock_select_mss(struct cxgbi_sock *, unsigned int); +void cxgbi_sock_free_cpl_skbs(struct cxgbi_sock *); + +struct cxgbi_hba { + struct net_device *ndev; + struct net_device *vdev; /* vlan dev */ + struct Scsi_Host *shost; + struct cxgbi_device *cdev; + __be32 ipv4addr; + unsigned char port_id; +}; + +struct cxgbi_ports_map { + unsigned int max_connect; + unsigned int used; + unsigned short sport_base; + spinlock_t lock; + unsigned int next; + struct cxgbi_sock **port_csk; +}; + +#define CXGBI_FLAG_DEV_T3 0x1 +#define CXGBI_FLAG_DEV_T4 0x2 +#define CXGBI_FLAG_ADAPTER_RESET 0x4 +#define CXGBI_FLAG_IPV4_SET 0x10 +struct cxgbi_device { + struct list_head list_head; + struct list_head rcu_node; + unsigned int flags; + struct net_device **ports; + void *lldev; + struct cxgbi_hba **hbas; + const unsigned short *mtus; + unsigned char nmtus; + unsigned char nports; + struct pci_dev *pdev; + struct dentry *debugfs_root; + struct iscsi_transport *itp; + + unsigned int pfvf; + unsigned int snd_win; + unsigned int rcv_win; + unsigned int rx_credit_thres; + unsigned int skb_tx_rsvd; + unsigned int skb_rx_extra; /* for msg coalesced mode */ + unsigned int tx_max_size; + unsigned int rx_max_size; + struct cxgbi_ports_map pmap; + struct cxgbi_tag_format tag_format; + struct cxgbi_ddp_info *ddp; + + void (*dev_ddp_cleanup)(struct cxgbi_device *); + int (*csk_ddp_set)(struct cxgbi_sock *, struct cxgbi_pagepod_hdr *, + unsigned int, unsigned int, + struct cxgbi_gather_list *); + void (*csk_ddp_clear)(struct cxgbi_hba *, + unsigned int, unsigned int, unsigned int); + int (*csk_ddp_setup_digest)(struct cxgbi_sock *, + unsigned int, int, int, int); + int (*csk_ddp_setup_pgidx)(struct cxgbi_sock *, + unsigned int, int, bool); + + void (*csk_release_offload_resources)(struct cxgbi_sock *); + int (*csk_rx_pdu_ready)(struct cxgbi_sock *, struct sk_buff *); + u32 (*csk_send_rx_credits)(struct cxgbi_sock *, u32); + int (*csk_push_tx_frames)(struct cxgbi_sock *, int); + void (*csk_send_abort_req)(struct cxgbi_sock *); + void (*csk_send_close_req)(struct cxgbi_sock *); + int (*csk_alloc_cpls)(struct cxgbi_sock *); + int (*csk_init_act_open)(struct cxgbi_sock *); + + void *dd_data; +}; +#define cxgbi_cdev_priv(cdev) ((cdev)->dd_data) + +struct cxgbi_conn { + struct cxgbi_endpoint *cep; + struct iscsi_conn *iconn; + struct cxgbi_hba *chba; + u32 task_idx_bits; +}; + +struct cxgbi_endpoint { + struct cxgbi_conn *cconn; + struct cxgbi_hba *chba; + struct cxgbi_sock *csk; +}; + +#define MAX_PDU_FRAGS ((ULP2_MAX_PDU_PAYLOAD + 512 - 1) / 512) +struct cxgbi_task_data { + unsigned short nr_frags; + struct page_frag frags[MAX_PDU_FRAGS]; + struct sk_buff *skb; + unsigned int offset; + unsigned int count; + unsigned int sgoffset; +}; +#define iscsi_task_cxgbi_data(task) \ + ((task)->dd_data + sizeof(struct iscsi_tcp_task)) + +static inline int cxgbi_is_ddp_tag(struct cxgbi_tag_format *tformat, u32 tag) +{ + return !(tag & (1 << (tformat->rsvd_bits + tformat->rsvd_shift - 1))); +} + +static inline int cxgbi_sw_tag_usable(struct cxgbi_tag_format *tformat, + u32 sw_tag) +{ + sw_tag >>= (32 - tformat->rsvd_bits); + return !sw_tag; +} + +static inline u32 cxgbi_set_non_ddp_tag(struct cxgbi_tag_format *tformat, + u32 sw_tag) +{ + unsigned char shift = tformat->rsvd_bits + tformat->rsvd_shift - 1; + u32 mask = (1 << shift) - 1; + + if (sw_tag && (sw_tag & ~mask)) { + u32 v1 = sw_tag & ((1 << shift) - 1); + u32 v2 = (sw_tag >> (shift - 1)) << shift; + + return v2 | v1 | 1 << shift; + } + + return sw_tag | 1 << shift; +} + +static inline u32 cxgbi_ddp_tag_base(struct cxgbi_tag_format *tformat, + u32 sw_tag) +{ + u32 mask = (1 << tformat->rsvd_shift) - 1; + + if (sw_tag && (sw_tag & ~mask)) { + u32 v1 = sw_tag & mask; + u32 v2 = sw_tag >> tformat->rsvd_shift; + + v2 <<= tformat->rsvd_bits + tformat->rsvd_shift; + + return v2 | v1; + } + + return sw_tag; +} + +static inline u32 cxgbi_tag_rsvd_bits(struct cxgbi_tag_format *tformat, + u32 tag) +{ + if (cxgbi_is_ddp_tag(tformat, tag)) + return (tag >> tformat->rsvd_shift) & tformat->rsvd_mask; + + return 0; +} + +static inline u32 cxgbi_tag_nonrsvd_bits(struct cxgbi_tag_format *tformat, + u32 tag) +{ + unsigned char shift = tformat->rsvd_bits + tformat->rsvd_shift - 1; + u32 v1, v2; + + if (cxgbi_is_ddp_tag(tformat, tag)) { + v1 = tag & ((1 << tformat->rsvd_shift) - 1); + v2 = (tag >> (shift + 1)) << tformat->rsvd_shift; + } else { + u32 mask = (1 << shift) - 1; + tag &= ~(1 << shift); + v1 = tag & mask; + v2 = (tag >> 1) & ~mask; + } + return v1 | v2; +} + +static inline void *cxgbi_alloc_big_mem(unsigned int size, + gfp_t gfp) +{ + void *p = kzalloc(size, gfp | __GFP_NOWARN); + + if (!p) + p = vzalloc(size); + + return p; +} + +static inline void cxgbi_free_big_mem(void *addr) +{ + if (is_vmalloc_addr(addr)) + vfree(addr); + else + kfree(addr); +} + +static inline void cxgbi_set_iscsi_ipv4(struct cxgbi_hba *chba, __be32 ipaddr) +{ + if (chba->cdev->flags & CXGBI_FLAG_IPV4_SET) + chba->ipv4addr = ipaddr; + else + pr_info("set iscsi ipv4 NOT supported, using %s ipv4.\n", + chba->ndev->name); +} + +struct cxgbi_device *cxgbi_device_register(unsigned int, unsigned int); +void cxgbi_device_unregister(struct cxgbi_device *); +void cxgbi_device_unregister_all(unsigned int flag); +struct cxgbi_device *cxgbi_device_find_by_lldev(void *); +struct cxgbi_device *cxgbi_device_find_by_netdev(struct net_device *, int *); +struct cxgbi_device *cxgbi_device_find_by_netdev_rcu(struct net_device *, + int *); +int cxgbi_hbas_add(struct cxgbi_device *, u64, unsigned int, + struct scsi_host_template *, + struct scsi_transport_template *); +void cxgbi_hbas_remove(struct cxgbi_device *); + +int cxgbi_device_portmap_create(struct cxgbi_device *cdev, unsigned int base, + unsigned int max_conn); +void cxgbi_device_portmap_cleanup(struct cxgbi_device *cdev); + +void cxgbi_conn_tx_open(struct cxgbi_sock *); +void cxgbi_conn_pdu_ready(struct cxgbi_sock *); +int cxgbi_conn_alloc_pdu(struct iscsi_task *, u8); +int cxgbi_conn_init_pdu(struct iscsi_task *, unsigned int , unsigned int); +int cxgbi_conn_xmit_pdu(struct iscsi_task *); + +void cxgbi_cleanup_task(struct iscsi_task *task); + +umode_t cxgbi_attr_is_visible(int param_type, int param); +void cxgbi_get_conn_stats(struct iscsi_cls_conn *, struct iscsi_stats *); +int cxgbi_set_conn_param(struct iscsi_cls_conn *, + enum iscsi_param, char *, int); +int cxgbi_get_ep_param(struct iscsi_endpoint *ep, enum iscsi_param, char *); +struct iscsi_cls_conn *cxgbi_create_conn(struct iscsi_cls_session *, u32); +int cxgbi_bind_conn(struct iscsi_cls_session *, + struct iscsi_cls_conn *, u64, int); +void cxgbi_destroy_session(struct iscsi_cls_session *); +struct iscsi_cls_session *cxgbi_create_session(struct iscsi_endpoint *, + u16, u16, u32); +int cxgbi_set_host_param(struct Scsi_Host *, + enum iscsi_host_param, char *, int); +int cxgbi_get_host_param(struct Scsi_Host *, enum iscsi_host_param, char *); +struct iscsi_endpoint *cxgbi_ep_connect(struct Scsi_Host *, + struct sockaddr *, int); +int cxgbi_ep_poll(struct iscsi_endpoint *, int); +void cxgbi_ep_disconnect(struct iscsi_endpoint *); + +int cxgbi_iscsi_init(struct iscsi_transport *, + struct scsi_transport_template **); +void cxgbi_iscsi_cleanup(struct iscsi_transport *, + struct scsi_transport_template **); +void cxgbi_parse_pdu_itt(struct iscsi_conn *, itt_t, int *, int *); +int cxgbi_ddp_init(struct cxgbi_device *, unsigned int, unsigned int, + unsigned int, unsigned int); +int cxgbi_ddp_cleanup(struct cxgbi_device *); +void cxgbi_ddp_page_size_factor(int *); +void cxgbi_ddp_ppod_clear(struct cxgbi_pagepod *); +void cxgbi_ddp_ppod_set(struct cxgbi_pagepod *, struct cxgbi_pagepod_hdr *, + struct cxgbi_gather_list *, unsigned int); +#endif /*__LIBCXGBI_H__*/ diff --git a/drivers/scsi/scsi_priv.h b/drivers/scsi/scsi_priv.h new file mode 100644 index 0000000..12b8e1b --- /dev/null +++ b/drivers/scsi/scsi_priv.h @@ -0,0 +1,186 @@ +#ifndef _SCSI_PRIV_H +#define _SCSI_PRIV_H + +#include +#include +#include + +struct request_queue; +struct request; +struct scsi_cmnd; +struct scsi_device; +struct scsi_target; +struct scsi_host_template; +struct Scsi_Host; +struct scsi_nl_hdr; + + +/* + * Scsi Error Handler Flags + */ +#define SCSI_EH_CANCEL_CMD 0x0001 /* Cancel this cmd */ +#define SCSI_EH_ABORT_SCHEDULED 0x0002 /* Abort has been scheduled */ + +#define SCSI_SENSE_VALID(scmd) \ + (((scmd)->sense_buffer[0] & 0x70) == 0x70) + +/* hosts.c */ +extern int scsi_init_hosts(void); +extern void scsi_exit_hosts(void); + +/* scsi.c */ +extern int scsi_dispatch_cmd(struct scsi_cmnd *cmd); +extern int scsi_setup_command_freelist(struct Scsi_Host *shost); +extern void scsi_destroy_command_freelist(struct Scsi_Host *shost); +#ifdef CONFIG_SCSI_LOGGING +void scsi_log_send(struct scsi_cmnd *cmd); +void scsi_log_completion(struct scsi_cmnd *cmd, int disposition); +#else +static inline void scsi_log_send(struct scsi_cmnd *cmd) + { }; +static inline void scsi_log_completion(struct scsi_cmnd *cmd, int disposition) + { }; +#endif + +/* scsi_devinfo.c */ + +/* list of keys for the lists */ +enum { + SCSI_DEVINFO_GLOBAL = 0, + SCSI_DEVINFO_SPI, +}; + +extern int scsi_get_device_flags(struct scsi_device *sdev, + const unsigned char *vendor, + const unsigned char *model); +extern int scsi_get_device_flags_keyed(struct scsi_device *sdev, + const unsigned char *vendor, + const unsigned char *model, int key); +extern int scsi_dev_info_list_add_keyed(int compatible, char *vendor, + char *model, char *strflags, + int flags, int key); +extern int scsi_dev_info_list_del_keyed(char *vendor, char *model, int key); +extern int scsi_dev_info_add_list(int key, const char *name); +extern int scsi_dev_info_remove_list(int key); + +extern int __init scsi_init_devinfo(void); +extern void scsi_exit_devinfo(void); + +/* scsi_error.c */ +extern void scmd_eh_abort_handler(struct work_struct *work); +extern enum blk_eh_timer_return scsi_times_out(struct request *req); +extern int scsi_error_handler(void *host); +extern int scsi_decide_disposition(struct scsi_cmnd *cmd); +extern void scsi_eh_wakeup(struct Scsi_Host *shost); +extern int scsi_eh_scmd_add(struct scsi_cmnd *, int); +void scsi_eh_ready_devs(struct Scsi_Host *shost, + struct list_head *work_q, + struct list_head *done_q); +int scsi_eh_get_sense(struct list_head *work_q, + struct list_head *done_q); +int scsi_noretry_cmd(struct scsi_cmnd *scmd); + +/* scsi_lib.c */ +extern int scsi_maybe_unblock_host(struct scsi_device *sdev); +extern void scsi_device_unbusy(struct scsi_device *sdev); +extern void scsi_queue_insert(struct scsi_cmnd *cmd, int reason); +extern void scsi_next_command(struct scsi_cmnd *cmd); +extern void scsi_io_completion(struct scsi_cmnd *, unsigned int); +extern void scsi_run_host_queues(struct Scsi_Host *shost); +extern struct request_queue *scsi_alloc_queue(struct scsi_device *sdev); +extern struct request_queue *scsi_mq_alloc_queue(struct scsi_device *sdev); +extern int scsi_mq_setup_tags(struct Scsi_Host *shost); +extern void scsi_mq_destroy_tags(struct Scsi_Host *shost); +extern int scsi_init_queue(void); +extern void scsi_exit_queue(void); +struct request_queue; +struct request; +extern struct kmem_cache *scsi_sdb_cache; + +/* scsi_proc.c */ +#ifdef CONFIG_SCSI_PROC_FS +extern void scsi_proc_hostdir_add(struct scsi_host_template *); +extern void scsi_proc_hostdir_rm(struct scsi_host_template *); +extern void scsi_proc_host_add(struct Scsi_Host *); +extern void scsi_proc_host_rm(struct Scsi_Host *); +extern int scsi_init_procfs(void); +extern void scsi_exit_procfs(void); +#else +# define scsi_proc_hostdir_add(sht) do { } while (0) +# define scsi_proc_hostdir_rm(sht) do { } while (0) +# define scsi_proc_host_add(shost) do { } while (0) +# define scsi_proc_host_rm(shost) do { } while (0) +# define scsi_init_procfs() (0) +# define scsi_exit_procfs() do { } while (0) +#endif /* CONFIG_PROC_FS */ + +/* scsi_scan.c */ +extern char scsi_scan_type[]; +extern int scsi_complete_async_scans(void); +extern int scsi_scan_host_selected(struct Scsi_Host *, unsigned int, + unsigned int, u64, int); +extern void scsi_forget_host(struct Scsi_Host *); +extern void scsi_rescan_device(struct device *); + +/* scsi_sysctl.c */ +#ifdef CONFIG_SYSCTL +extern int scsi_init_sysctl(void); +extern void scsi_exit_sysctl(void); +#else +# define scsi_init_sysctl() (0) +# define scsi_exit_sysctl() do { } while (0) +#endif /* CONFIG_SYSCTL */ + +/* scsi_sysfs.c */ +extern int scsi_sysfs_add_sdev(struct scsi_device *); +extern int scsi_sysfs_add_host(struct Scsi_Host *); +extern int scsi_sysfs_register(void); +extern void scsi_sysfs_unregister(void); +extern void scsi_sysfs_device_initialize(struct scsi_device *); +extern int scsi_sysfs_target_initialize(struct scsi_device *); +extern struct scsi_transport_template blank_transport_template; +extern void __scsi_remove_device(struct scsi_device *); + +extern struct bus_type scsi_bus_type; +extern const struct attribute_group *scsi_sysfs_shost_attr_groups[]; + +/* scsi_netlink.c */ +#ifdef CONFIG_SCSI_NETLINK +extern void scsi_netlink_init(void); +extern void scsi_netlink_exit(void); +extern struct sock *scsi_nl_sock; +#else +static inline void scsi_netlink_init(void) {} +static inline void scsi_netlink_exit(void) {} +#endif + +/* scsi_pm.c */ +#ifdef CONFIG_PM +extern const struct dev_pm_ops scsi_bus_pm_ops; +#endif +#ifdef CONFIG_PM_RUNTIME +extern void scsi_autopm_get_target(struct scsi_target *); +extern void scsi_autopm_put_target(struct scsi_target *); +extern int scsi_autopm_get_host(struct Scsi_Host *); +extern void scsi_autopm_put_host(struct Scsi_Host *); +#else +static inline void scsi_autopm_get_target(struct scsi_target *t) {} +static inline void scsi_autopm_put_target(struct scsi_target *t) {} +static inline int scsi_autopm_get_host(struct Scsi_Host *h) { return 0; } +static inline void scsi_autopm_put_host(struct Scsi_Host *h) {} +#endif /* CONFIG_PM_RUNTIME */ + +extern struct async_domain scsi_sd_pm_domain; +extern struct async_domain scsi_sd_probe_domain; + +/* + * internal scsi timeout functions: for use by mid-layer and transport + * classes. + */ + +#define SCSI_DEVICE_BLOCK_MAX_TIMEOUT 600 /* units in seconds */ +extern int scsi_internal_device_block(struct scsi_device *sdev); +extern int scsi_internal_device_unblock(struct scsi_device *sdev, + enum scsi_device_state new_state); + +#endif /* _SCSI_PRIV_H */ diff --git a/drivers/scsi/scsi_transport_srp.c b/drivers/scsi/scsi_transport_srp.c new file mode 100644 index 0000000..ae45bd9 --- /dev/null +++ b/drivers/scsi/scsi_transport_srp.c @@ -0,0 +1,927 @@ +/* + * SCSI RDMA (SRP) transport class + * + * Copyright (C) 2007 FUJITA Tomonori + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation, version 2 of the + * License. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA + * 02110-1301 USA + */ +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include "scsi_priv.h" + +struct srp_host_attrs { + atomic_t next_port_id; +}; +#define to_srp_host_attrs(host) ((struct srp_host_attrs *)(host)->shost_data) + +#define SRP_HOST_ATTRS 0 +#define SRP_RPORT_ATTRS 8 + +struct srp_internal { + struct scsi_transport_template t; + struct srp_function_template *f; + + struct device_attribute *host_attrs[SRP_HOST_ATTRS + 1]; + + struct device_attribute *rport_attrs[SRP_RPORT_ATTRS + 1]; + struct transport_container rport_attr_cont; +}; + +#define to_srp_internal(tmpl) container_of(tmpl, struct srp_internal, t) + +#define dev_to_rport(d) container_of(d, struct srp_rport, dev) +#define transport_class_to_srp_rport(dev) dev_to_rport((dev)->parent) +static inline struct Scsi_Host *rport_to_shost(struct srp_rport *r) +{ + return dev_to_shost(r->dev.parent); +} + +/** + * srp_tmo_valid() - check timeout combination validity + * @reconnect_delay: Reconnect delay in seconds. + * @fast_io_fail_tmo: Fast I/O fail timeout in seconds. + * @dev_loss_tmo: Device loss timeout in seconds. + * + * The combination of the timeout parameters must be such that SCSI commands + * are finished in a reasonable time. Hence do not allow the fast I/O fail + * timeout to exceed SCSI_DEVICE_BLOCK_MAX_TIMEOUT nor allow dev_loss_tmo to + * exceed that limit if failing I/O fast has been disabled. Furthermore, these + * parameters must be such that multipath can detect failed paths timely. + * Hence do not allow all three parameters to be disabled simultaneously. + */ +int srp_tmo_valid(int reconnect_delay, int fast_io_fail_tmo, int dev_loss_tmo) +{ + if (reconnect_delay < 0 && fast_io_fail_tmo < 0 && dev_loss_tmo < 0) + return -EINVAL; + if (reconnect_delay == 0) + return -EINVAL; + if (fast_io_fail_tmo > SCSI_DEVICE_BLOCK_MAX_TIMEOUT) + return -EINVAL; + if (fast_io_fail_tmo < 0 && + dev_loss_tmo > SCSI_DEVICE_BLOCK_MAX_TIMEOUT) + return -EINVAL; + if (dev_loss_tmo >= LONG_MAX / HZ) + return -EINVAL; + if (fast_io_fail_tmo >= 0 && dev_loss_tmo >= 0 && + fast_io_fail_tmo >= dev_loss_tmo) + return -EINVAL; + return 0; +} +EXPORT_SYMBOL_GPL(srp_tmo_valid); + +static int srp_host_setup(struct transport_container *tc, struct device *dev, + struct device *cdev) +{ + struct Scsi_Host *shost = dev_to_shost(dev); + struct srp_host_attrs *srp_host = to_srp_host_attrs(shost); + + atomic_set(&srp_host->next_port_id, 0); + return 0; +} + +static DECLARE_TRANSPORT_CLASS(srp_host_class, "srp_host", srp_host_setup, + NULL, NULL); + +static DECLARE_TRANSPORT_CLASS(srp_rport_class, "srp_remote_ports", + NULL, NULL, NULL); + +#define SRP_PID(p) \ + (p)->port_id[0], (p)->port_id[1], (p)->port_id[2], (p)->port_id[3], \ + (p)->port_id[4], (p)->port_id[5], (p)->port_id[6], (p)->port_id[7], \ + (p)->port_id[8], (p)->port_id[9], (p)->port_id[10], (p)->port_id[11], \ + (p)->port_id[12], (p)->port_id[13], (p)->port_id[14], (p)->port_id[15] + +#define SRP_PID_FMT "%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:" \ + "%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x" + +static ssize_t +show_srp_rport_id(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct srp_rport *rport = transport_class_to_srp_rport(dev); + return sprintf(buf, SRP_PID_FMT "\n", SRP_PID(rport)); +} + +static DEVICE_ATTR(port_id, S_IRUGO, show_srp_rport_id, NULL); + +static const struct { + u32 value; + char *name; +} srp_rport_role_names[] = { + {SRP_RPORT_ROLE_INITIATOR, "SRP Initiator"}, + {SRP_RPORT_ROLE_TARGET, "SRP Target"}, +}; + +static ssize_t +show_srp_rport_roles(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct srp_rport *rport = transport_class_to_srp_rport(dev); + int i; + char *name = NULL; + + for (i = 0; i < ARRAY_SIZE(srp_rport_role_names); i++) + if (srp_rport_role_names[i].value == rport->roles) { + name = srp_rport_role_names[i].name; + break; + } + return sprintf(buf, "%s\n", name ? : "unknown"); +} + +static DEVICE_ATTR(roles, S_IRUGO, show_srp_rport_roles, NULL); + +static ssize_t store_srp_rport_delete(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct srp_rport *rport = transport_class_to_srp_rport(dev); + struct Scsi_Host *shost = dev_to_shost(dev); + struct srp_internal *i = to_srp_internal(shost->transportt); + + if (i->f->rport_delete) { + i->f->rport_delete(rport); + return count; + } else { + return -ENOSYS; + } +} + +static DEVICE_ATTR(delete, S_IWUSR, NULL, store_srp_rport_delete); + +static ssize_t show_srp_rport_state(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + static const char *const state_name[] = { + [SRP_RPORT_RUNNING] = "running", + [SRP_RPORT_BLOCKED] = "blocked", + [SRP_RPORT_FAIL_FAST] = "fail-fast", + [SRP_RPORT_LOST] = "lost", + }; + struct srp_rport *rport = transport_class_to_srp_rport(dev); + enum srp_rport_state state = rport->state; + + return sprintf(buf, "%s\n", + (unsigned)state < ARRAY_SIZE(state_name) ? + state_name[state] : "???"); +} + +static DEVICE_ATTR(state, S_IRUGO, show_srp_rport_state, NULL); + +static ssize_t srp_show_tmo(char *buf, int tmo) +{ + return tmo >= 0 ? sprintf(buf, "%d\n", tmo) : sprintf(buf, "off\n"); +} + +static int srp_parse_tmo(int *tmo, const char *buf) +{ + int res = 0; + + if (strncmp(buf, "off", 3) != 0) + res = kstrtoint(buf, 0, tmo); + else + *tmo = -1; + + return res; +} + +static ssize_t show_reconnect_delay(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct srp_rport *rport = transport_class_to_srp_rport(dev); + + return srp_show_tmo(buf, rport->reconnect_delay); +} + +static ssize_t store_reconnect_delay(struct device *dev, + struct device_attribute *attr, + const char *buf, const size_t count) +{ + struct srp_rport *rport = transport_class_to_srp_rport(dev); + int res, delay; + + res = srp_parse_tmo(&delay, buf); + if (res) + goto out; + res = srp_tmo_valid(delay, rport->fast_io_fail_tmo, + rport->dev_loss_tmo); + if (res) + goto out; + + if (rport->reconnect_delay <= 0 && delay > 0 && + rport->state != SRP_RPORT_RUNNING) { + queue_delayed_work(system_long_wq, &rport->reconnect_work, + delay * HZ); + } else if (delay <= 0) { + cancel_delayed_work(&rport->reconnect_work); + } + rport->reconnect_delay = delay; + res = count; + +out: + return res; +} + +static DEVICE_ATTR(reconnect_delay, S_IRUGO | S_IWUSR, show_reconnect_delay, + store_reconnect_delay); + +static ssize_t show_failed_reconnects(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct srp_rport *rport = transport_class_to_srp_rport(dev); + + return sprintf(buf, "%d\n", rport->failed_reconnects); +} + +static DEVICE_ATTR(failed_reconnects, S_IRUGO, show_failed_reconnects, NULL); + +static ssize_t show_srp_rport_fast_io_fail_tmo(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct srp_rport *rport = transport_class_to_srp_rport(dev); + + return srp_show_tmo(buf, rport->fast_io_fail_tmo); +} + +static ssize_t store_srp_rport_fast_io_fail_tmo(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct srp_rport *rport = transport_class_to_srp_rport(dev); + int res; + int fast_io_fail_tmo; + + res = srp_parse_tmo(&fast_io_fail_tmo, buf); + if (res) + goto out; + res = srp_tmo_valid(rport->reconnect_delay, fast_io_fail_tmo, + rport->dev_loss_tmo); + if (res) + goto out; + rport->fast_io_fail_tmo = fast_io_fail_tmo; + res = count; + +out: + return res; +} + +static DEVICE_ATTR(fast_io_fail_tmo, S_IRUGO | S_IWUSR, + show_srp_rport_fast_io_fail_tmo, + store_srp_rport_fast_io_fail_tmo); + +static ssize_t show_srp_rport_dev_loss_tmo(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct srp_rport *rport = transport_class_to_srp_rport(dev); + + return srp_show_tmo(buf, rport->dev_loss_tmo); +} + +static ssize_t store_srp_rport_dev_loss_tmo(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct srp_rport *rport = transport_class_to_srp_rport(dev); + int res; + int dev_loss_tmo; + + res = srp_parse_tmo(&dev_loss_tmo, buf); + if (res) + goto out; + res = srp_tmo_valid(rport->reconnect_delay, rport->fast_io_fail_tmo, + dev_loss_tmo); + if (res) + goto out; + rport->dev_loss_tmo = dev_loss_tmo; + res = count; + +out: + return res; +} + +static DEVICE_ATTR(dev_loss_tmo, S_IRUGO | S_IWUSR, + show_srp_rport_dev_loss_tmo, + store_srp_rport_dev_loss_tmo); + +static int srp_rport_set_state(struct srp_rport *rport, + enum srp_rport_state new_state) +{ + enum srp_rport_state old_state = rport->state; + + lockdep_assert_held(&rport->mutex); + + switch (new_state) { + case SRP_RPORT_RUNNING: + switch (old_state) { + case SRP_RPORT_LOST: + goto invalid; + default: + break; + } + break; + case SRP_RPORT_BLOCKED: + switch (old_state) { + case SRP_RPORT_RUNNING: + break; + default: + goto invalid; + } + break; + case SRP_RPORT_FAIL_FAST: + switch (old_state) { + case SRP_RPORT_LOST: + goto invalid; + default: + break; + } + break; + case SRP_RPORT_LOST: + break; + } + rport->state = new_state; + return 0; + +invalid: + return -EINVAL; +} + +/** + * srp_reconnect_work() - reconnect and schedule a new attempt if necessary + * @work: Work structure used for scheduling this operation. + */ +static void srp_reconnect_work(struct work_struct *work) +{ + struct srp_rport *rport = container_of(to_delayed_work(work), + struct srp_rport, reconnect_work); + struct Scsi_Host *shost = rport_to_shost(rport); + int delay, res; + + res = srp_reconnect_rport(rport); + if (res != 0) { + shost_printk(KERN_ERR, shost, + "reconnect attempt %d failed (%d)\n", + ++rport->failed_reconnects, res); + delay = rport->reconnect_delay * + min(100, max(1, rport->failed_reconnects - 10)); + if (delay > 0) + queue_delayed_work(system_long_wq, + &rport->reconnect_work, delay * HZ); + } +} + +static void __rport_fail_io_fast(struct srp_rport *rport) +{ + struct Scsi_Host *shost = rport_to_shost(rport); + struct srp_internal *i; + + lockdep_assert_held(&rport->mutex); + + if (srp_rport_set_state(rport, SRP_RPORT_FAIL_FAST)) + return; + scsi_target_unblock(rport->dev.parent, SDEV_TRANSPORT_OFFLINE); + + /* Involve the LLD if possible to terminate all I/O on the rport. */ + i = to_srp_internal(shost->transportt); + if (i->f->terminate_rport_io) + i->f->terminate_rport_io(rport); +} + +/** + * rport_fast_io_fail_timedout() - fast I/O failure timeout handler + * @work: Work structure used for scheduling this operation. + */ +static void rport_fast_io_fail_timedout(struct work_struct *work) +{ + struct srp_rport *rport = container_of(to_delayed_work(work), + struct srp_rport, fast_io_fail_work); + struct Scsi_Host *shost = rport_to_shost(rport); + + pr_info("fast_io_fail_tmo expired for SRP %s / %s.\n", + dev_name(&rport->dev), dev_name(&shost->shost_gendev)); + + mutex_lock(&rport->mutex); + if (rport->state == SRP_RPORT_BLOCKED) + __rport_fail_io_fast(rport); + mutex_unlock(&rport->mutex); +} + +/** + * rport_dev_loss_timedout() - device loss timeout handler + * @work: Work structure used for scheduling this operation. + */ +static void rport_dev_loss_timedout(struct work_struct *work) +{ + struct srp_rport *rport = container_of(to_delayed_work(work), + struct srp_rport, dev_loss_work); + struct Scsi_Host *shost = rport_to_shost(rport); + struct srp_internal *i = to_srp_internal(shost->transportt); + + pr_info("dev_loss_tmo expired for SRP %s / %s.\n", + dev_name(&rport->dev), dev_name(&shost->shost_gendev)); + + mutex_lock(&rport->mutex); + WARN_ON(srp_rport_set_state(rport, SRP_RPORT_LOST) != 0); + scsi_target_unblock(rport->dev.parent, SDEV_TRANSPORT_OFFLINE); + mutex_unlock(&rport->mutex); + + i->f->rport_delete(rport); +} + +static void __srp_start_tl_fail_timers(struct srp_rport *rport) +{ + struct Scsi_Host *shost = rport_to_shost(rport); + int delay, fast_io_fail_tmo, dev_loss_tmo; + + lockdep_assert_held(&rport->mutex); + + delay = rport->reconnect_delay; + fast_io_fail_tmo = rport->fast_io_fail_tmo; + dev_loss_tmo = rport->dev_loss_tmo; + pr_debug("%s current state: %d\n", dev_name(&shost->shost_gendev), + rport->state); + + if (rport->state == SRP_RPORT_LOST) + return; + if (delay > 0) + queue_delayed_work(system_long_wq, &rport->reconnect_work, + 1UL * delay * HZ); + if ((fast_io_fail_tmo >= 0 || dev_loss_tmo >= 0) && + srp_rport_set_state(rport, SRP_RPORT_BLOCKED) == 0) { + pr_debug("%s new state: %d\n", dev_name(&shost->shost_gendev), + rport->state); + scsi_target_block(&shost->shost_gendev); + if (fast_io_fail_tmo >= 0) + queue_delayed_work(system_long_wq, + &rport->fast_io_fail_work, + 1UL * fast_io_fail_tmo * HZ); + if (dev_loss_tmo >= 0) + queue_delayed_work(system_long_wq, + &rport->dev_loss_work, + 1UL * dev_loss_tmo * HZ); + } +} + +/** + * srp_start_tl_fail_timers() - start the transport layer failure timers + * @rport: SRP target port. + * + * Start the transport layer fast I/O failure and device loss timers. Do not + * modify a timer that was already started. + */ +void srp_start_tl_fail_timers(struct srp_rport *rport) +{ + mutex_lock(&rport->mutex); + __srp_start_tl_fail_timers(rport); + mutex_unlock(&rport->mutex); +} +EXPORT_SYMBOL(srp_start_tl_fail_timers); + +/** + * scsi_request_fn_active() - number of kernel threads inside scsi_request_fn() + * @shost: SCSI host for which to count the number of scsi_request_fn() callers. + */ +static int scsi_request_fn_active(struct Scsi_Host *shost) +{ + struct scsi_device *sdev; + struct request_queue *q; + int request_fn_active = 0; + + shost_for_each_device(sdev, shost) { + q = sdev->request_queue; + + spin_lock_irq(q->queue_lock); + request_fn_active += q->request_fn_active; + spin_unlock_irq(q->queue_lock); + } + + return request_fn_active; +} + +/** + * srp_reconnect_rport() - reconnect to an SRP target port + * @rport: SRP target port. + * + * Blocks SCSI command queueing before invoking reconnect() such that + * queuecommand() won't be invoked concurrently with reconnect() from outside + * the SCSI EH. This is important since a reconnect() implementation may + * reallocate resources needed by queuecommand(). + * + * Notes: + * - This function neither waits until outstanding requests have finished nor + * tries to abort these. It is the responsibility of the reconnect() + * function to finish outstanding commands before reconnecting to the target + * port. + * - It is the responsibility of the caller to ensure that the resources + * reallocated by the reconnect() function won't be used while this function + * is in progress. One possible strategy is to invoke this function from + * the context of the SCSI EH thread only. Another possible strategy is to + * lock the rport mutex inside each SCSI LLD callback that can be invoked by + * the SCSI EH (the scsi_host_template.eh_*() functions and also the + * scsi_host_template.queuecommand() function). + */ +int srp_reconnect_rport(struct srp_rport *rport) +{ + struct Scsi_Host *shost = rport_to_shost(rport); + struct srp_internal *i = to_srp_internal(shost->transportt); + struct scsi_device *sdev; + int res; + + pr_debug("SCSI host %s\n", dev_name(&shost->shost_gendev)); + + res = mutex_lock_interruptible(&rport->mutex); + if (res) + goto out; + scsi_target_block(&shost->shost_gendev); + while (scsi_request_fn_active(shost)) + msleep(20); + res = rport->state != SRP_RPORT_LOST ? i->f->reconnect(rport) : -ENODEV; + pr_debug("%s (state %d): transport.reconnect() returned %d\n", + dev_name(&shost->shost_gendev), rport->state, res); + if (res == 0) { + cancel_delayed_work(&rport->fast_io_fail_work); + cancel_delayed_work(&rport->dev_loss_work); + + rport->failed_reconnects = 0; + srp_rport_set_state(rport, SRP_RPORT_RUNNING); + scsi_target_unblock(&shost->shost_gendev, SDEV_RUNNING); + /* + * If the SCSI error handler has offlined one or more devices, + * invoking scsi_target_unblock() won't change the state of + * these devices into running so do that explicitly. + */ + spin_lock_irq(shost->host_lock); + __shost_for_each_device(sdev, shost) + if (sdev->sdev_state == SDEV_OFFLINE) + sdev->sdev_state = SDEV_RUNNING; + spin_unlock_irq(shost->host_lock); + } else if (rport->state == SRP_RPORT_RUNNING) { + /* + * srp_reconnect_rport() has been invoked with fast_io_fail + * and dev_loss off. Mark the port as failed and start the TL + * failure timers if these had not yet been started. + */ + __rport_fail_io_fast(rport); + scsi_target_unblock(&shost->shost_gendev, + SDEV_TRANSPORT_OFFLINE); + __srp_start_tl_fail_timers(rport); + } else if (rport->state != SRP_RPORT_BLOCKED) { + scsi_target_unblock(&shost->shost_gendev, + SDEV_TRANSPORT_OFFLINE); + } + mutex_unlock(&rport->mutex); + +out: + return res; +} +EXPORT_SYMBOL(srp_reconnect_rport); + +/** + * srp_timed_out() - SRP transport intercept of the SCSI timeout EH + * @scmd: SCSI command. + * + * If a timeout occurs while an rport is in the blocked state, ask the SCSI + * EH to continue waiting (BLK_EH_RESET_TIMER). Otherwise let the SCSI core + * handle the timeout (BLK_EH_NOT_HANDLED). + * + * Note: This function is called from soft-IRQ context and with the request + * queue lock held. + */ +static enum blk_eh_timer_return srp_timed_out(struct scsi_cmnd *scmd) +{ + struct scsi_device *sdev = scmd->device; + struct Scsi_Host *shost = sdev->host; + struct srp_internal *i = to_srp_internal(shost->transportt); + + pr_debug("timeout for sdev %s\n", dev_name(&sdev->sdev_gendev)); + return i->f->reset_timer_if_blocked && scsi_device_blocked(sdev) ? + BLK_EH_RESET_TIMER : BLK_EH_NOT_HANDLED; +} + +static void srp_rport_release(struct device *dev) +{ + struct srp_rport *rport = dev_to_rport(dev); + + put_device(dev->parent); + kfree(rport); +} + +static int scsi_is_srp_rport(const struct device *dev) +{ + return dev->release == srp_rport_release; +} + +static int srp_rport_match(struct attribute_container *cont, + struct device *dev) +{ + struct Scsi_Host *shost; + struct srp_internal *i; + + if (!scsi_is_srp_rport(dev)) + return 0; + + shost = dev_to_shost(dev->parent); + if (!shost->transportt) + return 0; + if (shost->transportt->host_attrs.ac.class != &srp_host_class.class) + return 0; + + i = to_srp_internal(shost->transportt); + return &i->rport_attr_cont.ac == cont; +} + +static int srp_host_match(struct attribute_container *cont, struct device *dev) +{ + struct Scsi_Host *shost; + struct srp_internal *i; + + if (!scsi_is_host_device(dev)) + return 0; + + shost = dev_to_shost(dev); + if (!shost->transportt) + return 0; + if (shost->transportt->host_attrs.ac.class != &srp_host_class.class) + return 0; + + i = to_srp_internal(shost->transportt); + return &i->t.host_attrs.ac == cont; +} + +/** + * srp_rport_get() - increment rport reference count + * @rport: SRP target port. + */ +void srp_rport_get(struct srp_rport *rport) +{ + get_device(&rport->dev); +} +EXPORT_SYMBOL(srp_rport_get); + +/** + * srp_rport_put() - decrement rport reference count + * @rport: SRP target port. + */ +void srp_rport_put(struct srp_rport *rport) +{ + put_device(&rport->dev); +} +EXPORT_SYMBOL(srp_rport_put); + +/** + * srp_rport_add - add a SRP remote port to the device hierarchy + * @shost: scsi host the remote port is connected to. + * @ids: The port id for the remote port. + * + * Publishes a port to the rest of the system. + */ +struct srp_rport *srp_rport_add(struct Scsi_Host *shost, + struct srp_rport_identifiers *ids) +{ + struct srp_rport *rport; + struct device *parent = &shost->shost_gendev; + struct srp_internal *i = to_srp_internal(shost->transportt); + int id, ret; + + rport = kzalloc(sizeof(*rport), GFP_KERNEL); + if (!rport) + return ERR_PTR(-ENOMEM); + + mutex_init(&rport->mutex); + + device_initialize(&rport->dev); + + rport->dev.parent = get_device(parent); + rport->dev.release = srp_rport_release; + + memcpy(rport->port_id, ids->port_id, sizeof(rport->port_id)); + rport->roles = ids->roles; + + if (i->f->reconnect) + rport->reconnect_delay = i->f->reconnect_delay ? + *i->f->reconnect_delay : 10; + INIT_DELAYED_WORK(&rport->reconnect_work, srp_reconnect_work); + rport->fast_io_fail_tmo = i->f->fast_io_fail_tmo ? + *i->f->fast_io_fail_tmo : 15; + rport->dev_loss_tmo = i->f->dev_loss_tmo ? *i->f->dev_loss_tmo : 60; + INIT_DELAYED_WORK(&rport->fast_io_fail_work, + rport_fast_io_fail_timedout); + INIT_DELAYED_WORK(&rport->dev_loss_work, rport_dev_loss_timedout); + + id = atomic_inc_return(&to_srp_host_attrs(shost)->next_port_id); + dev_set_name(&rport->dev, "port-%d:%d", shost->host_no, id); + + transport_setup_device(&rport->dev); + + ret = device_add(&rport->dev); + if (ret) { + transport_destroy_device(&rport->dev); + put_device(&rport->dev); + return ERR_PTR(ret); + } + + transport_add_device(&rport->dev); + transport_configure_device(&rport->dev); + + return rport; +} +EXPORT_SYMBOL_GPL(srp_rport_add); + +/** + * srp_rport_del - remove a SRP remote port + * @rport: SRP remote port to remove + * + * Removes the specified SRP remote port. + */ +void srp_rport_del(struct srp_rport *rport) +{ + struct device *dev = &rport->dev; + + transport_remove_device(dev); + device_del(dev); + transport_destroy_device(dev); + + put_device(dev); +} +EXPORT_SYMBOL_GPL(srp_rport_del); + +static int do_srp_rport_del(struct device *dev, void *data) +{ + if (scsi_is_srp_rport(dev)) + srp_rport_del(dev_to_rport(dev)); + return 0; +} + +/** + * srp_remove_host - tear down a Scsi_Host's SRP data structures + * @shost: Scsi Host that is torn down + * + * Removes all SRP remote ports for a given Scsi_Host. + * Must be called just before scsi_remove_host for SRP HBAs. + */ +void srp_remove_host(struct Scsi_Host *shost) +{ + device_for_each_child(&shost->shost_gendev, NULL, do_srp_rport_del); +} +EXPORT_SYMBOL_GPL(srp_remove_host); + +/** + * srp_stop_rport_timers - stop the transport layer recovery timers + * @rport: SRP remote port for which to stop the timers. + * + * Must be called after srp_remove_host() and scsi_remove_host(). The caller + * must hold a reference on the rport (rport->dev) and on the SCSI host + * (rport->dev.parent). + */ +void srp_stop_rport_timers(struct srp_rport *rport) +{ + mutex_lock(&rport->mutex); + if (rport->state == SRP_RPORT_BLOCKED) + __rport_fail_io_fast(rport); + srp_rport_set_state(rport, SRP_RPORT_LOST); + mutex_unlock(&rport->mutex); + + cancel_delayed_work_sync(&rport->reconnect_work); + cancel_delayed_work_sync(&rport->fast_io_fail_work); + cancel_delayed_work_sync(&rport->dev_loss_work); +} +EXPORT_SYMBOL_GPL(srp_stop_rport_timers); + +static int srp_tsk_mgmt_response(struct Scsi_Host *shost, u64 nexus, u64 tm_id, + int result) +{ + struct srp_internal *i = to_srp_internal(shost->transportt); + return i->f->tsk_mgmt_response(shost, nexus, tm_id, result); +} + +static int srp_it_nexus_response(struct Scsi_Host *shost, u64 nexus, int result) +{ + struct srp_internal *i = to_srp_internal(shost->transportt); + return i->f->it_nexus_response(shost, nexus, result); +} + +/** + * srp_attach_transport - instantiate SRP transport template + * @ft: SRP transport class function template + */ +struct scsi_transport_template * +srp_attach_transport(struct srp_function_template *ft) +{ + int count; + struct srp_internal *i; + + i = kzalloc(sizeof(*i), GFP_KERNEL); + if (!i) + return NULL; + + i->t.eh_timed_out = srp_timed_out; + + i->t.tsk_mgmt_response = srp_tsk_mgmt_response; + i->t.it_nexus_response = srp_it_nexus_response; + + i->t.host_size = sizeof(struct srp_host_attrs); + i->t.host_attrs.ac.attrs = &i->host_attrs[0]; + i->t.host_attrs.ac.class = &srp_host_class.class; + i->t.host_attrs.ac.match = srp_host_match; + i->host_attrs[0] = NULL; + transport_container_register(&i->t.host_attrs); + + i->rport_attr_cont.ac.attrs = &i->rport_attrs[0]; + i->rport_attr_cont.ac.class = &srp_rport_class.class; + i->rport_attr_cont.ac.match = srp_rport_match; + + count = 0; + i->rport_attrs[count++] = &dev_attr_port_id; + i->rport_attrs[count++] = &dev_attr_roles; + if (ft->has_rport_state) { + i->rport_attrs[count++] = &dev_attr_state; + i->rport_attrs[count++] = &dev_attr_fast_io_fail_tmo; + i->rport_attrs[count++] = &dev_attr_dev_loss_tmo; + } + if (ft->reconnect) { + i->rport_attrs[count++] = &dev_attr_reconnect_delay; + i->rport_attrs[count++] = &dev_attr_failed_reconnects; + } + if (ft->rport_delete) + i->rport_attrs[count++] = &dev_attr_delete; + i->rport_attrs[count++] = NULL; + BUG_ON(count > ARRAY_SIZE(i->rport_attrs)); + + transport_container_register(&i->rport_attr_cont); + + i->f = ft; + + return &i->t; +} +EXPORT_SYMBOL_GPL(srp_attach_transport); + +/** + * srp_release_transport - release SRP transport template instance + * @t: transport template instance + */ +void srp_release_transport(struct scsi_transport_template *t) +{ + struct srp_internal *i = to_srp_internal(t); + + transport_container_unregister(&i->t.host_attrs); + transport_container_unregister(&i->rport_attr_cont); + + kfree(i); +} +EXPORT_SYMBOL_GPL(srp_release_transport); + +static __init int srp_transport_init(void) +{ + int ret; + + ret = transport_class_register(&srp_host_class); + if (ret) + return ret; + ret = transport_class_register(&srp_rport_class); + if (ret) + goto unregister_host_class; + + return 0; +unregister_host_class: + transport_class_unregister(&srp_host_class); + return ret; +} + +static void __exit srp_transport_exit(void) +{ + transport_class_unregister(&srp_host_class); + transport_class_unregister(&srp_rport_class); +} + +MODULE_AUTHOR("FUJITA Tomonori"); +MODULE_DESCRIPTION("SRP Transport Attributes"); +MODULE_LICENSE("GPL"); + +module_init(srp_transport_init); +module_exit(srp_transport_exit); diff --git a/include/linux/mlx4/cmd.h b/include/linux/mlx4/cmd.h new file mode 100644 index 0000000..379c026 --- /dev/null +++ b/include/linux/mlx4/cmd.h @@ -0,0 +1,261 @@ +/* + * Copyright (c) 2006 Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef MLX4_CMD_H +#define MLX4_CMD_H + +#include +#include + +enum { + /* initialization and general commands */ + MLX4_CMD_SYS_EN = 0x1, + MLX4_CMD_SYS_DIS = 0x2, + MLX4_CMD_MAP_FA = 0xfff, + MLX4_CMD_UNMAP_FA = 0xffe, + MLX4_CMD_RUN_FW = 0xff6, + MLX4_CMD_MOD_STAT_CFG = 0x34, + MLX4_CMD_QUERY_DEV_CAP = 0x3, + MLX4_CMD_QUERY_FW = 0x4, + MLX4_CMD_ENABLE_LAM = 0xff8, + MLX4_CMD_DISABLE_LAM = 0xff7, + MLX4_CMD_QUERY_DDR = 0x5, + MLX4_CMD_QUERY_ADAPTER = 0x6, + MLX4_CMD_INIT_HCA = 0x7, + MLX4_CMD_CLOSE_HCA = 0x8, + MLX4_CMD_INIT_PORT = 0x9, + MLX4_CMD_CLOSE_PORT = 0xa, + MLX4_CMD_QUERY_HCA = 0xb, + MLX4_CMD_QUERY_PORT = 0x43, + MLX4_CMD_SENSE_PORT = 0x4d, + MLX4_CMD_HW_HEALTH_CHECK = 0x50, + MLX4_CMD_SET_PORT = 0xc, + MLX4_CMD_SET_NODE = 0x5a, + MLX4_CMD_QUERY_FUNC = 0x56, + MLX4_CMD_ACCESS_DDR = 0x2e, + MLX4_CMD_MAP_ICM = 0xffa, + MLX4_CMD_UNMAP_ICM = 0xff9, + MLX4_CMD_MAP_ICM_AUX = 0xffc, + MLX4_CMD_UNMAP_ICM_AUX = 0xffb, + MLX4_CMD_SET_ICM_SIZE = 0xffd, + /*master notify fw on finish for slave's flr*/ + MLX4_CMD_INFORM_FLR_DONE = 0x5b, + MLX4_CMD_GET_OP_REQ = 0x59, + + /* TPT commands */ + MLX4_CMD_SW2HW_MPT = 0xd, + MLX4_CMD_QUERY_MPT = 0xe, + MLX4_CMD_HW2SW_MPT = 0xf, + MLX4_CMD_READ_MTT = 0x10, + MLX4_CMD_WRITE_MTT = 0x11, + MLX4_CMD_SYNC_TPT = 0x2f, + + /* EQ commands */ + MLX4_CMD_MAP_EQ = 0x12, + MLX4_CMD_SW2HW_EQ = 0x13, + MLX4_CMD_HW2SW_EQ = 0x14, + MLX4_CMD_QUERY_EQ = 0x15, + + /* CQ commands */ + MLX4_CMD_SW2HW_CQ = 0x16, + MLX4_CMD_HW2SW_CQ = 0x17, + MLX4_CMD_QUERY_CQ = 0x18, + MLX4_CMD_MODIFY_CQ = 0x2c, + + /* SRQ commands */ + MLX4_CMD_SW2HW_SRQ = 0x35, + MLX4_CMD_HW2SW_SRQ = 0x36, + MLX4_CMD_QUERY_SRQ = 0x37, + MLX4_CMD_ARM_SRQ = 0x40, + + /* QP/EE commands */ + MLX4_CMD_RST2INIT_QP = 0x19, + MLX4_CMD_INIT2RTR_QP = 0x1a, + MLX4_CMD_RTR2RTS_QP = 0x1b, + MLX4_CMD_RTS2RTS_QP = 0x1c, + MLX4_CMD_SQERR2RTS_QP = 0x1d, + MLX4_CMD_2ERR_QP = 0x1e, + MLX4_CMD_RTS2SQD_QP = 0x1f, + MLX4_CMD_SQD2SQD_QP = 0x38, + MLX4_CMD_SQD2RTS_QP = 0x20, + MLX4_CMD_2RST_QP = 0x21, + MLX4_CMD_QUERY_QP = 0x22, + MLX4_CMD_INIT2INIT_QP = 0x2d, + MLX4_CMD_SUSPEND_QP = 0x32, + MLX4_CMD_UNSUSPEND_QP = 0x33, + MLX4_CMD_UPDATE_QP = 0x61, + /* special QP and management commands */ + MLX4_CMD_CONF_SPECIAL_QP = 0x23, + MLX4_CMD_MAD_IFC = 0x24, + MLX4_CMD_MAD_DEMUX = 0x203, + + /* multicast commands */ + MLX4_CMD_READ_MCG = 0x25, + MLX4_CMD_WRITE_MCG = 0x26, + MLX4_CMD_MGID_HASH = 0x27, + + /* miscellaneous commands */ + MLX4_CMD_DIAG_RPRT = 0x30, + MLX4_CMD_NOP = 0x31, + MLX4_CMD_CONFIG_DEV = 0x3a, + MLX4_CMD_ACCESS_MEM = 0x2e, + MLX4_CMD_SET_VEP = 0x52, + + /* Ethernet specific commands */ + MLX4_CMD_SET_VLAN_FLTR = 0x47, + MLX4_CMD_SET_MCAST_FLTR = 0x48, + MLX4_CMD_DUMP_ETH_STATS = 0x49, + + /* Communication channel commands */ + MLX4_CMD_ARM_COMM_CHANNEL = 0x57, + MLX4_CMD_GEN_EQE = 0x58, + + /* virtual commands */ + MLX4_CMD_ALLOC_RES = 0xf00, + MLX4_CMD_FREE_RES = 0xf01, + MLX4_CMD_MCAST_ATTACH = 0xf05, + MLX4_CMD_UCAST_ATTACH = 0xf06, + MLX4_CMD_PROMISC = 0xf08, + MLX4_CMD_QUERY_FUNC_CAP = 0xf0a, + MLX4_CMD_QP_ATTACH = 0xf0b, + + /* debug commands */ + MLX4_CMD_QUERY_DEBUG_MSG = 0x2a, + MLX4_CMD_SET_DEBUG_MSG = 0x2b, + + /* statistics commands */ + MLX4_CMD_QUERY_IF_STAT = 0X54, + MLX4_CMD_SET_IF_STAT = 0X55, + + /* register/delete flow steering network rules */ + MLX4_QP_FLOW_STEERING_ATTACH = 0x65, + MLX4_QP_FLOW_STEERING_DETACH = 0x66, + MLX4_FLOW_STEERING_IB_UC_QP_RANGE = 0x64, +}; + +enum { + MLX4_CMD_TIME_CLASS_A = 10000, + MLX4_CMD_TIME_CLASS_B = 10000, + MLX4_CMD_TIME_CLASS_C = 10000, +}; + +enum { + MLX4_MAILBOX_SIZE = 4096, + MLX4_ACCESS_MEM_ALIGN = 256, +}; + +enum { + /* set port opcode modifiers */ + MLX4_SET_PORT_GENERAL = 0x0, + MLX4_SET_PORT_RQP_CALC = 0x1, + MLX4_SET_PORT_MAC_TABLE = 0x2, + MLX4_SET_PORT_VLAN_TABLE = 0x3, + MLX4_SET_PORT_PRIO_MAP = 0x4, + MLX4_SET_PORT_GID_TABLE = 0x5, + MLX4_SET_PORT_PRIO2TC = 0x8, + MLX4_SET_PORT_SCHEDULER = 0x9, + MLX4_SET_PORT_VXLAN = 0xB +}; + +enum { + MLX4_CMD_MAD_DEMUX_CONFIG = 0, + MLX4_CMD_MAD_DEMUX_QUERY_STATE = 1, + MLX4_CMD_MAD_DEMUX_QUERY_RESTR = 2, /* Query mad demux restrictions */ +}; + +enum { + MLX4_CMD_WRAPPED, + MLX4_CMD_NATIVE +}; + +struct mlx4_dev; + +struct mlx4_cmd_mailbox { + void *buf; + dma_addr_t dma; +}; + +int __mlx4_cmd(struct mlx4_dev *dev, u64 in_param, u64 *out_param, + int out_is_imm, u32 in_modifier, u8 op_modifier, + u16 op, unsigned long timeout, int native); + +/* Invoke a command with no output parameter */ +static inline int mlx4_cmd(struct mlx4_dev *dev, u64 in_param, u32 in_modifier, + u8 op_modifier, u16 op, unsigned long timeout, + int native) +{ + return __mlx4_cmd(dev, in_param, NULL, 0, in_modifier, + op_modifier, op, timeout, native); +} + +/* Invoke a command with an output mailbox */ +static inline int mlx4_cmd_box(struct mlx4_dev *dev, u64 in_param, u64 out_param, + u32 in_modifier, u8 op_modifier, u16 op, + unsigned long timeout, int native) +{ + return __mlx4_cmd(dev, in_param, &out_param, 0, in_modifier, + op_modifier, op, timeout, native); +} + +/* + * Invoke a command with an immediate output parameter (and copy the + * output into the caller's out_param pointer after the command + * executes). + */ +static inline int mlx4_cmd_imm(struct mlx4_dev *dev, u64 in_param, u64 *out_param, + u32 in_modifier, u8 op_modifier, u16 op, + unsigned long timeout, int native) +{ + return __mlx4_cmd(dev, in_param, out_param, 1, in_modifier, + op_modifier, op, timeout, native); +} + +struct mlx4_cmd_mailbox *mlx4_alloc_cmd_mailbox(struct mlx4_dev *dev); +void mlx4_free_cmd_mailbox(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox); + +u32 mlx4_comm_get_version(void); +int mlx4_set_vf_mac(struct mlx4_dev *dev, int port, int vf, u64 mac); +int mlx4_set_vf_vlan(struct mlx4_dev *dev, int port, int vf, u16 vlan, u8 qos); +int mlx4_set_vf_spoofchk(struct mlx4_dev *dev, int port, int vf, bool setting); +int mlx4_get_vf_config(struct mlx4_dev *dev, int port, int vf, struct ifla_vf_info *ivf); +int mlx4_set_vf_link_state(struct mlx4_dev *dev, int port, int vf, int link_state); +/* + * mlx4_get_slave_default_vlan - + * return true if VST ( default vlan) + * if VST, will return vlan & qos (if not NULL) + */ +bool mlx4_get_slave_default_vlan(struct mlx4_dev *dev, int port, int slave, + u16 *vlan, u8 *qos); + +#define MLX4_COMM_GET_IF_REV(cmd_chan_ver) (u8)((cmd_chan_ver) >> 8) + +#endif /* MLX4_CMD_H */ diff --git a/include/linux/mlx4/cq.h b/include/linux/mlx4/cq.h new file mode 100644 index 0000000..e7ecc12 --- /dev/null +++ b/include/linux/mlx4/cq.h @@ -0,0 +1,178 @@ +/* + * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef MLX4_CQ_H +#define MLX4_CQ_H + +#include +#include + +#include +#include + +struct mlx4_cqe { + __be32 vlan_my_qpn; + __be32 immed_rss_invalid; + __be32 g_mlpath_rqpn; + __be16 sl_vid; + union { + struct { + __be16 rlid; + __be16 status; + u8 ipv6_ext_mask; + u8 badfcs_enc; + }; + u8 smac[ETH_ALEN]; + }; + __be32 byte_cnt; + __be16 wqe_index; + __be16 checksum; + u8 reserved[3]; + u8 owner_sr_opcode; +}; + +struct mlx4_err_cqe { + __be32 my_qpn; + u32 reserved1[5]; + __be16 wqe_index; + u8 vendor_err_syndrome; + u8 syndrome; + u8 reserved2[3]; + u8 owner_sr_opcode; +}; + +struct mlx4_ts_cqe { + __be32 vlan_my_qpn; + __be32 immed_rss_invalid; + __be32 g_mlpath_rqpn; + __be32 timestamp_hi; + __be16 status; + u8 ipv6_ext_mask; + u8 badfcs_enc; + __be32 byte_cnt; + __be16 wqe_index; + __be16 checksum; + u8 reserved; + __be16 timestamp_lo; + u8 owner_sr_opcode; +} __packed; + +enum { + MLX4_CQE_L2_TUNNEL_IPOK = 1 << 31, + MLX4_CQE_VLAN_PRESENT_MASK = 1 << 29, + MLX4_CQE_L2_TUNNEL = 1 << 27, + MLX4_CQE_L2_TUNNEL_CSUM = 1 << 26, + MLX4_CQE_L2_TUNNEL_IPV4 = 1 << 25, + + MLX4_CQE_QPN_MASK = 0xffffff, + MLX4_CQE_VID_MASK = 0xfff, +}; + +enum { + MLX4_CQE_OWNER_MASK = 0x80, + MLX4_CQE_IS_SEND_MASK = 0x40, + MLX4_CQE_OPCODE_MASK = 0x1f +}; + +enum { + MLX4_CQE_SYNDROME_LOCAL_LENGTH_ERR = 0x01, + MLX4_CQE_SYNDROME_LOCAL_QP_OP_ERR = 0x02, + MLX4_CQE_SYNDROME_LOCAL_PROT_ERR = 0x04, + MLX4_CQE_SYNDROME_WR_FLUSH_ERR = 0x05, + MLX4_CQE_SYNDROME_MW_BIND_ERR = 0x06, + MLX4_CQE_SYNDROME_BAD_RESP_ERR = 0x10, + MLX4_CQE_SYNDROME_LOCAL_ACCESS_ERR = 0x11, + MLX4_CQE_SYNDROME_REMOTE_INVAL_REQ_ERR = 0x12, + MLX4_CQE_SYNDROME_REMOTE_ACCESS_ERR = 0x13, + MLX4_CQE_SYNDROME_REMOTE_OP_ERR = 0x14, + MLX4_CQE_SYNDROME_TRANSPORT_RETRY_EXC_ERR = 0x15, + MLX4_CQE_SYNDROME_RNR_RETRY_EXC_ERR = 0x16, + MLX4_CQE_SYNDROME_REMOTE_ABORTED_ERR = 0x22, +}; + +enum { + MLX4_CQE_STATUS_IPV4 = 1 << 6, + MLX4_CQE_STATUS_IPV4F = 1 << 7, + MLX4_CQE_STATUS_IPV6 = 1 << 8, + MLX4_CQE_STATUS_IPV4OPT = 1 << 9, + MLX4_CQE_STATUS_TCP = 1 << 10, + MLX4_CQE_STATUS_UDP = 1 << 11, + MLX4_CQE_STATUS_IPOK = 1 << 12, +}; + +enum { + MLX4_CQE_LLC = 1, + MLX4_CQE_SNAP = 1 << 1, + MLX4_CQE_BAD_FCS = 1 << 4, +}; + +static inline void mlx4_cq_arm(struct mlx4_cq *cq, u32 cmd, + void __iomem *uar_page, + spinlock_t *doorbell_lock) +{ + __be32 doorbell[2]; + u32 sn; + u32 ci; + + sn = cq->arm_sn & 3; + ci = cq->cons_index & 0xffffff; + + *cq->arm_db = cpu_to_be32(sn << 28 | cmd | ci); + + /* + * Make sure that the doorbell record in host memory is + * written before ringing the doorbell via PCI MMIO. + */ + wmb(); + + doorbell[0] = cpu_to_be32(sn << 28 | cmd | cq->cqn); + doorbell[1] = cpu_to_be32(ci); + + mlx4_write64(doorbell, uar_page + MLX4_CQ_DOORBELL, doorbell_lock); +} + +static inline void mlx4_cq_set_ci(struct mlx4_cq *cq) +{ + *cq->set_ci_db = cpu_to_be32(cq->cons_index & 0xffffff); +} + +enum { + MLX4_CQ_DB_REQ_NOT_SOL = 1 << 24, + MLX4_CQ_DB_REQ_NOT = 2 << 24 +}; + +int mlx4_cq_modify(struct mlx4_dev *dev, struct mlx4_cq *cq, + u16 count, u16 period); +int mlx4_cq_resize(struct mlx4_dev *dev, struct mlx4_cq *cq, + int entries, struct mlx4_mtt *mtt); + +#endif /* MLX4_CQ_H */ diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h new file mode 100644 index 0000000..37e4404 --- /dev/null +++ b/include/linux/mlx4/device.h @@ -0,0 +1,1292 @@ +/* + * Copyright (c) 2006, 2007 Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef MLX4_DEVICE_H +#define MLX4_DEVICE_H + +#include +#include +#include +#include +#include +#include + +#include + +#include + +#define MAX_MSIX_P_PORT 17 +#define MAX_MSIX 64 +#define MSIX_LEGACY_SZ 4 +#define MIN_MSIX_P_PORT 5 + +#define MLX4_NUM_UP 8 +#define MLX4_NUM_TC 8 +#define MLX4_MAX_100M_UNITS_VAL 255 /* + * work around: can't set values + * greater then this value when + * using 100 Mbps units. + */ +#define MLX4_RATELIMIT_100M_UNITS 3 /* 100 Mbps */ +#define MLX4_RATELIMIT_1G_UNITS 4 /* 1 Gbps */ +#define MLX4_RATELIMIT_DEFAULT 0x00ff + +#define MLX4_ROCE_MAX_GIDS 128 +#define MLX4_ROCE_PF_GIDS 16 + +enum { + MLX4_FLAG_MSI_X = 1 << 0, + MLX4_FLAG_OLD_PORT_CMDS = 1 << 1, + MLX4_FLAG_MASTER = 1 << 2, + MLX4_FLAG_SLAVE = 1 << 3, + MLX4_FLAG_SRIOV = 1 << 4, + MLX4_FLAG_OLD_REG_MAC = 1 << 6, +}; + +enum { + MLX4_PORT_CAP_IS_SM = 1 << 1, + MLX4_PORT_CAP_DEV_MGMT_SUP = 1 << 19, +}; + +enum { + MLX4_MAX_PORTS = 2, + MLX4_MAX_PORT_PKEYS = 128 +}; + +/* base qkey for use in sriov tunnel-qp/proxy-qp communication. + * These qkeys must not be allowed for general use. This is a 64k range, + * and to test for violation, we use the mask (protect against future chg). + */ +#define MLX4_RESERVED_QKEY_BASE (0xFFFF0000) +#define MLX4_RESERVED_QKEY_MASK (0xFFFF0000) + +enum { + MLX4_BOARD_ID_LEN = 64 +}; + +enum { + MLX4_MAX_NUM_PF = 16, + MLX4_MAX_NUM_VF = 64, + MLX4_MAX_NUM_VF_P_PORT = 64, + MLX4_MFUNC_MAX = 80, + MLX4_MAX_EQ_NUM = 1024, + MLX4_MFUNC_EQ_NUM = 4, + MLX4_MFUNC_MAX_EQES = 8, + MLX4_MFUNC_EQE_MASK = (MLX4_MFUNC_MAX_EQES - 1) +}; + +/* Driver supports 3 diffrent device methods to manage traffic steering: + * -device managed - High level API for ib and eth flow steering. FW is + * managing flow steering tables. + * - B0 steering mode - Common low level API for ib and (if supported) eth. + * - A0 steering mode - Limited low level API for eth. In case of IB, + * B0 mode is in use. + */ +enum { + MLX4_STEERING_MODE_A0, + MLX4_STEERING_MODE_B0, + MLX4_STEERING_MODE_DEVICE_MANAGED +}; + +static inline const char *mlx4_steering_mode_str(int steering_mode) +{ + switch (steering_mode) { + case MLX4_STEERING_MODE_A0: + return "A0 steering"; + + case MLX4_STEERING_MODE_B0: + return "B0 steering"; + + case MLX4_STEERING_MODE_DEVICE_MANAGED: + return "Device managed flow steering"; + + default: + return "Unrecognize steering mode"; + } +} + +enum { + MLX4_TUNNEL_OFFLOAD_MODE_NONE, + MLX4_TUNNEL_OFFLOAD_MODE_VXLAN +}; + +enum { + MLX4_DEV_CAP_FLAG_RC = 1LL << 0, + MLX4_DEV_CAP_FLAG_UC = 1LL << 1, + MLX4_DEV_CAP_FLAG_UD = 1LL << 2, + MLX4_DEV_CAP_FLAG_XRC = 1LL << 3, + MLX4_DEV_CAP_FLAG_SRQ = 1LL << 6, + MLX4_DEV_CAP_FLAG_IPOIB_CSUM = 1LL << 7, + MLX4_DEV_CAP_FLAG_BAD_PKEY_CNTR = 1LL << 8, + MLX4_DEV_CAP_FLAG_BAD_QKEY_CNTR = 1LL << 9, + MLX4_DEV_CAP_FLAG_DPDP = 1LL << 12, + MLX4_DEV_CAP_FLAG_BLH = 1LL << 15, + MLX4_DEV_CAP_FLAG_MEM_WINDOW = 1LL << 16, + MLX4_DEV_CAP_FLAG_APM = 1LL << 17, + MLX4_DEV_CAP_FLAG_ATOMIC = 1LL << 18, + MLX4_DEV_CAP_FLAG_RAW_MCAST = 1LL << 19, + MLX4_DEV_CAP_FLAG_UD_AV_PORT = 1LL << 20, + MLX4_DEV_CAP_FLAG_UD_MCAST = 1LL << 21, + MLX4_DEV_CAP_FLAG_IBOE = 1LL << 30, + MLX4_DEV_CAP_FLAG_UC_LOOPBACK = 1LL << 32, + MLX4_DEV_CAP_FLAG_FCS_KEEP = 1LL << 34, + MLX4_DEV_CAP_FLAG_WOL_PORT1 = 1LL << 37, + MLX4_DEV_CAP_FLAG_WOL_PORT2 = 1LL << 38, + MLX4_DEV_CAP_FLAG_UDP_RSS = 1LL << 40, + MLX4_DEV_CAP_FLAG_VEP_UC_STEER = 1LL << 41, + MLX4_DEV_CAP_FLAG_VEP_MC_STEER = 1LL << 42, + MLX4_DEV_CAP_FLAG_COUNTERS = 1LL << 48, + MLX4_DEV_CAP_FLAG_SET_ETH_SCHED = 1LL << 53, + MLX4_DEV_CAP_FLAG_SENSE_SUPPORT = 1LL << 55, + MLX4_DEV_CAP_FLAG_PORT_MNG_CHG_EV = 1LL << 59, + MLX4_DEV_CAP_FLAG_64B_EQE = 1LL << 61, + MLX4_DEV_CAP_FLAG_64B_CQE = 1LL << 62 +}; + +enum { + MLX4_DEV_CAP_FLAG2_RSS = 1LL << 0, + MLX4_DEV_CAP_FLAG2_RSS_TOP = 1LL << 1, + MLX4_DEV_CAP_FLAG2_RSS_XOR = 1LL << 2, + MLX4_DEV_CAP_FLAG2_FS_EN = 1LL << 3, + MLX4_DEV_CAP_FLAG2_REASSIGN_MAC_EN = 1LL << 4, + MLX4_DEV_CAP_FLAG2_TS = 1LL << 5, + MLX4_DEV_CAP_FLAG2_VLAN_CONTROL = 1LL << 6, + MLX4_DEV_CAP_FLAG2_FSM = 1LL << 7, + MLX4_DEV_CAP_FLAG2_UPDATE_QP = 1LL << 8, + MLX4_DEV_CAP_FLAG2_DMFS_IPOIB = 1LL << 9, + MLX4_DEV_CAP_FLAG2_VXLAN_OFFLOADS = 1LL << 10, + MLX4_DEV_CAP_FLAG2_MAD_DEMUX = 1LL << 11, + MLX4_DEV_CAP_FLAG2_CQE_STRIDE = 1LL << 12, + MLX4_DEV_CAP_FLAG2_EQE_STRIDE = 1LL << 13 +}; + +enum { + MLX4_DEV_CAP_64B_EQE_ENABLED = 1LL << 0, + MLX4_DEV_CAP_64B_CQE_ENABLED = 1LL << 1, + MLX4_DEV_CAP_CQE_STRIDE_ENABLED = 1LL << 2, + MLX4_DEV_CAP_EQE_STRIDE_ENABLED = 1LL << 3 +}; + +enum { + MLX4_USER_DEV_CAP_LARGE_CQE = 1L << 0 +}; + +enum { + MLX4_FUNC_CAP_64B_EQE_CQE = 1L << 0, + MLX4_FUNC_CAP_EQE_CQE_STRIDE = 1L << 1 +}; + + +#define MLX4_ATTR_EXTENDED_PORT_INFO cpu_to_be16(0xff90) + +enum { + MLX4_BMME_FLAG_WIN_TYPE_2B = 1 << 1, + MLX4_BMME_FLAG_LOCAL_INV = 1 << 6, + MLX4_BMME_FLAG_REMOTE_INV = 1 << 7, + MLX4_BMME_FLAG_TYPE_2_WIN = 1 << 9, + MLX4_BMME_FLAG_RESERVED_LKEY = 1 << 10, + MLX4_BMME_FLAG_FAST_REG_WR = 1 << 11, + MLX4_BMME_FLAG_VSD_INIT2RTR = 1 << 28, +}; + +enum mlx4_event { + MLX4_EVENT_TYPE_COMP = 0x00, + MLX4_EVENT_TYPE_PATH_MIG = 0x01, + MLX4_EVENT_TYPE_COMM_EST = 0x02, + MLX4_EVENT_TYPE_SQ_DRAINED = 0x03, + MLX4_EVENT_TYPE_SRQ_QP_LAST_WQE = 0x13, + MLX4_EVENT_TYPE_SRQ_LIMIT = 0x14, + MLX4_EVENT_TYPE_CQ_ERROR = 0x04, + MLX4_EVENT_TYPE_WQ_CATAS_ERROR = 0x05, + MLX4_EVENT_TYPE_EEC_CATAS_ERROR = 0x06, + MLX4_EVENT_TYPE_PATH_MIG_FAILED = 0x07, + MLX4_EVENT_TYPE_WQ_INVAL_REQ_ERROR = 0x10, + MLX4_EVENT_TYPE_WQ_ACCESS_ERROR = 0x11, + MLX4_EVENT_TYPE_SRQ_CATAS_ERROR = 0x12, + MLX4_EVENT_TYPE_LOCAL_CATAS_ERROR = 0x08, + MLX4_EVENT_TYPE_PORT_CHANGE = 0x09, + MLX4_EVENT_TYPE_EQ_OVERFLOW = 0x0f, + MLX4_EVENT_TYPE_ECC_DETECT = 0x0e, + MLX4_EVENT_TYPE_CMD = 0x0a, + MLX4_EVENT_TYPE_VEP_UPDATE = 0x19, + MLX4_EVENT_TYPE_COMM_CHANNEL = 0x18, + MLX4_EVENT_TYPE_OP_REQUIRED = 0x1a, + MLX4_EVENT_TYPE_FATAL_WARNING = 0x1b, + MLX4_EVENT_TYPE_FLR_EVENT = 0x1c, + MLX4_EVENT_TYPE_PORT_MNG_CHG_EVENT = 0x1d, + MLX4_EVENT_TYPE_NONE = 0xff, +}; + +enum { + MLX4_PORT_CHANGE_SUBTYPE_DOWN = 1, + MLX4_PORT_CHANGE_SUBTYPE_ACTIVE = 4 +}; + +enum { + MLX4_FATAL_WARNING_SUBTYPE_WARMING = 0, +}; + +enum slave_port_state { + SLAVE_PORT_DOWN = 0, + SLAVE_PENDING_UP, + SLAVE_PORT_UP, +}; + +enum slave_port_gen_event { + SLAVE_PORT_GEN_EVENT_DOWN = 0, + SLAVE_PORT_GEN_EVENT_UP, + SLAVE_PORT_GEN_EVENT_NONE, +}; + +enum slave_port_state_event { + MLX4_PORT_STATE_DEV_EVENT_PORT_DOWN, + MLX4_PORT_STATE_DEV_EVENT_PORT_UP, + MLX4_PORT_STATE_IB_PORT_STATE_EVENT_GID_VALID, + MLX4_PORT_STATE_IB_EVENT_GID_INVALID, +}; + +enum { + MLX4_PERM_LOCAL_READ = 1 << 10, + MLX4_PERM_LOCAL_WRITE = 1 << 11, + MLX4_PERM_REMOTE_READ = 1 << 12, + MLX4_PERM_REMOTE_WRITE = 1 << 13, + MLX4_PERM_ATOMIC = 1 << 14, + MLX4_PERM_BIND_MW = 1 << 15, + MLX4_PERM_MASK = 0xFC00 +}; + +enum { + MLX4_OPCODE_NOP = 0x00, + MLX4_OPCODE_SEND_INVAL = 0x01, + MLX4_OPCODE_RDMA_WRITE = 0x08, + MLX4_OPCODE_RDMA_WRITE_IMM = 0x09, + MLX4_OPCODE_SEND = 0x0a, + MLX4_OPCODE_SEND_IMM = 0x0b, + MLX4_OPCODE_LSO = 0x0e, + MLX4_OPCODE_RDMA_READ = 0x10, + MLX4_OPCODE_ATOMIC_CS = 0x11, + MLX4_OPCODE_ATOMIC_FA = 0x12, + MLX4_OPCODE_MASKED_ATOMIC_CS = 0x14, + MLX4_OPCODE_MASKED_ATOMIC_FA = 0x15, + MLX4_OPCODE_BIND_MW = 0x18, + MLX4_OPCODE_FMR = 0x19, + MLX4_OPCODE_LOCAL_INVAL = 0x1b, + MLX4_OPCODE_CONFIG_CMD = 0x1f, + + MLX4_RECV_OPCODE_RDMA_WRITE_IMM = 0x00, + MLX4_RECV_OPCODE_SEND = 0x01, + MLX4_RECV_OPCODE_SEND_IMM = 0x02, + MLX4_RECV_OPCODE_SEND_INVAL = 0x03, + + MLX4_CQE_OPCODE_ERROR = 0x1e, + MLX4_CQE_OPCODE_RESIZE = 0x16, +}; + +enum { + MLX4_STAT_RATE_OFFSET = 5 +}; + +enum mlx4_protocol { + MLX4_PROT_IB_IPV6 = 0, + MLX4_PROT_ETH, + MLX4_PROT_IB_IPV4, + MLX4_PROT_FCOE +}; + +enum { + MLX4_MTT_FLAG_PRESENT = 1 +}; + +enum mlx4_qp_region { + MLX4_QP_REGION_FW = 0, + MLX4_QP_REGION_ETH_ADDR, + MLX4_QP_REGION_FC_ADDR, + MLX4_QP_REGION_FC_EXCH, + MLX4_NUM_QP_REGION +}; + +enum mlx4_port_type { + MLX4_PORT_TYPE_NONE = 0, + MLX4_PORT_TYPE_IB = 1, + MLX4_PORT_TYPE_ETH = 2, + MLX4_PORT_TYPE_AUTO = 3 +}; + +enum mlx4_special_vlan_idx { + MLX4_NO_VLAN_IDX = 0, + MLX4_VLAN_MISS_IDX, + MLX4_VLAN_REGULAR +}; + +enum mlx4_steer_type { + MLX4_MC_STEER = 0, + MLX4_UC_STEER, + MLX4_NUM_STEERS +}; + +enum { + MLX4_NUM_FEXCH = 64 * 1024, +}; + +enum { + MLX4_MAX_FAST_REG_PAGES = 511, +}; + +enum { + MLX4_DEV_PMC_SUBTYPE_GUID_INFO = 0x14, + MLX4_DEV_PMC_SUBTYPE_PORT_INFO = 0x15, + MLX4_DEV_PMC_SUBTYPE_PKEY_TABLE = 0x16, +}; + +/* Port mgmt change event handling */ +enum { + MLX4_EQ_PORT_INFO_MSTR_SM_LID_CHANGE_MASK = 1 << 0, + MLX4_EQ_PORT_INFO_GID_PFX_CHANGE_MASK = 1 << 1, + MLX4_EQ_PORT_INFO_LID_CHANGE_MASK = 1 << 2, + MLX4_EQ_PORT_INFO_CLIENT_REREG_MASK = 1 << 3, + MLX4_EQ_PORT_INFO_MSTR_SM_SL_CHANGE_MASK = 1 << 4, +}; + +#define MSTR_SM_CHANGE_MASK (MLX4_EQ_PORT_INFO_MSTR_SM_SL_CHANGE_MASK | \ + MLX4_EQ_PORT_INFO_MSTR_SM_LID_CHANGE_MASK) + +static inline u64 mlx4_fw_ver(u64 major, u64 minor, u64 subminor) +{ + return (major << 32) | (minor << 16) | subminor; +} + +struct mlx4_phys_caps { + u32 gid_phys_table_len[MLX4_MAX_PORTS + 1]; + u32 pkey_phys_table_len[MLX4_MAX_PORTS + 1]; + u32 num_phys_eqs; + u32 base_sqpn; + u32 base_proxy_sqpn; + u32 base_tunnel_sqpn; +}; + +struct mlx4_caps { + u64 fw_ver; + u32 function; + int num_ports; + int vl_cap[MLX4_MAX_PORTS + 1]; + int ib_mtu_cap[MLX4_MAX_PORTS + 1]; + __be32 ib_port_def_cap[MLX4_MAX_PORTS + 1]; + u64 def_mac[MLX4_MAX_PORTS + 1]; + int eth_mtu_cap[MLX4_MAX_PORTS + 1]; + int gid_table_len[MLX4_MAX_PORTS + 1]; + int pkey_table_len[MLX4_MAX_PORTS + 1]; + int trans_type[MLX4_MAX_PORTS + 1]; + int vendor_oui[MLX4_MAX_PORTS + 1]; + int wavelength[MLX4_MAX_PORTS + 1]; + u64 trans_code[MLX4_MAX_PORTS + 1]; + int local_ca_ack_delay; + int num_uars; + u32 uar_page_size; + int bf_reg_size; + int bf_regs_per_page; + int max_sq_sg; + int max_rq_sg; + int num_qps; + int max_wqes; + int max_sq_desc_sz; + int max_rq_desc_sz; + int max_qp_init_rdma; + int max_qp_dest_rdma; + u32 *qp0_qkey; + u32 *qp0_proxy; + u32 *qp1_proxy; + u32 *qp0_tunnel; + u32 *qp1_tunnel; + int num_srqs; + int max_srq_wqes; + int max_srq_sge; + int reserved_srqs; + int num_cqs; + int max_cqes; + int reserved_cqs; + int num_eqs; + int reserved_eqs; + int num_comp_vectors; + int comp_pool; + int num_mpts; + int max_fmr_maps; + int num_mtts; + int fmr_reserved_mtts; + int reserved_mtts; + int reserved_mrws; + int reserved_uars; + int num_mgms; + int num_amgms; + int reserved_mcgs; + int num_qp_per_mgm; + int steering_mode; + int fs_log_max_ucast_qp_range_size; + int num_pds; + int reserved_pds; + int max_xrcds; + int reserved_xrcds; + int mtt_entry_sz; + u32 max_msg_sz; + u32 page_size_cap; + u64 flags; + u64 flags2; + u32 bmme_flags; + u32 reserved_lkey; + u16 stat_rate_support; + u8 port_width_cap[MLX4_MAX_PORTS + 1]; + int max_gso_sz; + int max_rss_tbl_sz; + int reserved_qps_cnt[MLX4_NUM_QP_REGION]; + int reserved_qps; + int reserved_qps_base[MLX4_NUM_QP_REGION]; + int log_num_macs; + int log_num_vlans; + enum mlx4_port_type port_type[MLX4_MAX_PORTS + 1]; + u8 supported_type[MLX4_MAX_PORTS + 1]; + u8 suggested_type[MLX4_MAX_PORTS + 1]; + u8 default_sense[MLX4_MAX_PORTS + 1]; + u32 port_mask[MLX4_MAX_PORTS + 1]; + enum mlx4_port_type possible_type[MLX4_MAX_PORTS + 1]; + u32 max_counters; + u8 port_ib_mtu[MLX4_MAX_PORTS + 1]; + u16 sqp_demux; + u32 eqe_size; + u32 cqe_size; + u8 eqe_factor; + u32 userspace_caps; /* userspace must be aware of these */ + u32 function_caps; /* VFs must be aware of these */ + u16 hca_core_clock; + u64 phys_port_id[MLX4_MAX_PORTS + 1]; + int tunnel_offload_mode; +}; + +struct mlx4_buf_list { + void *buf; + dma_addr_t map; +}; + +struct mlx4_buf { + struct mlx4_buf_list direct; + struct mlx4_buf_list *page_list; + int nbufs; + int npages; + int page_shift; +}; + +struct mlx4_mtt { + u32 offset; + int order; + int page_shift; +}; + +enum { + MLX4_DB_PER_PAGE = PAGE_SIZE / 4 +}; + +struct mlx4_db_pgdir { + struct list_head list; + DECLARE_BITMAP(order0, MLX4_DB_PER_PAGE); + DECLARE_BITMAP(order1, MLX4_DB_PER_PAGE / 2); + unsigned long *bits[2]; + __be32 *db_page; + dma_addr_t db_dma; +}; + +struct mlx4_ib_user_db_page; + +struct mlx4_db { + __be32 *db; + union { + struct mlx4_db_pgdir *pgdir; + struct mlx4_ib_user_db_page *user_page; + } u; + dma_addr_t dma; + int index; + int order; +}; + +struct mlx4_hwq_resources { + struct mlx4_db db; + struct mlx4_mtt mtt; + struct mlx4_buf buf; +}; + +struct mlx4_mr { + struct mlx4_mtt mtt; + u64 iova; + u64 size; + u32 key; + u32 pd; + u32 access; + int enabled; +}; + +enum mlx4_mw_type { + MLX4_MW_TYPE_1 = 1, + MLX4_MW_TYPE_2 = 2, +}; + +struct mlx4_mw { + u32 key; + u32 pd; + enum mlx4_mw_type type; + int enabled; +}; + +struct mlx4_fmr { + struct mlx4_mr mr; + struct mlx4_mpt_entry *mpt; + __be64 *mtts; + dma_addr_t dma_handle; + int max_pages; + int max_maps; + int maps; + u8 page_shift; +}; + +struct mlx4_uar { + unsigned long pfn; + int index; + struct list_head bf_list; + unsigned free_bf_bmap; + void __iomem *map; + void __iomem *bf_map; +}; + +struct mlx4_bf { + unsigned int offset; + int buf_size; + struct mlx4_uar *uar; + void __iomem *reg; +}; + +struct mlx4_cq { + void (*comp) (struct mlx4_cq *); + void (*event) (struct mlx4_cq *, enum mlx4_event); + + struct mlx4_uar *uar; + + u32 cons_index; + + u16 irq; + __be32 *set_ci_db; + __be32 *arm_db; + int arm_sn; + + int cqn; + unsigned vector; + + atomic_t refcount; + struct completion free; +}; + +struct mlx4_qp { + void (*event) (struct mlx4_qp *, enum mlx4_event); + + int qpn; + + atomic_t refcount; + struct completion free; +}; + +struct mlx4_srq { + void (*event) (struct mlx4_srq *, enum mlx4_event); + + int srqn; + int max; + int max_gs; + int wqe_shift; + + atomic_t refcount; + struct completion free; +}; + +struct mlx4_av { + __be32 port_pd; + u8 reserved1; + u8 g_slid; + __be16 dlid; + u8 reserved2; + u8 gid_index; + u8 stat_rate; + u8 hop_limit; + __be32 sl_tclass_flowlabel; + u8 dgid[16]; +}; + +struct mlx4_eth_av { + __be32 port_pd; + u8 reserved1; + u8 smac_idx; + u16 reserved2; + u8 reserved3; + u8 gid_index; + u8 stat_rate; + u8 hop_limit; + __be32 sl_tclass_flowlabel; + u8 dgid[16]; + u8 s_mac[6]; + u8 reserved4[2]; + __be16 vlan; + u8 mac[ETH_ALEN]; +}; + +union mlx4_ext_av { + struct mlx4_av ib; + struct mlx4_eth_av eth; +}; + +struct mlx4_counter { + u8 reserved1[3]; + u8 counter_mode; + __be32 num_ifc; + u32 reserved2[2]; + __be64 rx_frames; + __be64 rx_bytes; + __be64 tx_frames; + __be64 tx_bytes; +}; + +struct mlx4_quotas { + int qp; + int cq; + int srq; + int mpt; + int mtt; + int counter; + int xrcd; +}; + +struct mlx4_vf_dev { + u8 min_port; + u8 n_ports; +}; + +struct mlx4_dev { + struct pci_dev *pdev; + unsigned long flags; + unsigned long num_slaves; + struct mlx4_caps caps; + struct mlx4_phys_caps phys_caps; + struct mlx4_quotas quotas; + struct radix_tree_root qp_table_tree; + u8 rev_id; + char board_id[MLX4_BOARD_ID_LEN]; + int num_vfs; + int numa_node; + int oper_log_mgm_entry_size; + u64 regid_promisc_array[MLX4_MAX_PORTS + 1]; + u64 regid_allmulti_array[MLX4_MAX_PORTS + 1]; + struct mlx4_vf_dev *dev_vfs; + int nvfs[MLX4_MAX_PORTS + 1]; +}; + +struct mlx4_eqe { + u8 reserved1; + u8 type; + u8 reserved2; + u8 subtype; + union { + u32 raw[6]; + struct { + __be32 cqn; + } __packed comp; + struct { + u16 reserved1; + __be16 token; + u32 reserved2; + u8 reserved3[3]; + u8 status; + __be64 out_param; + } __packed cmd; + struct { + __be32 qpn; + } __packed qp; + struct { + __be32 srqn; + } __packed srq; + struct { + __be32 cqn; + u32 reserved1; + u8 reserved2[3]; + u8 syndrome; + } __packed cq_err; + struct { + u32 reserved1[2]; + __be32 port; + } __packed port_change; + struct { + #define COMM_CHANNEL_BIT_ARRAY_SIZE 4 + u32 reserved; + u32 bit_vec[COMM_CHANNEL_BIT_ARRAY_SIZE]; + } __packed comm_channel_arm; + struct { + u8 port; + u8 reserved[3]; + __be64 mac; + } __packed mac_update; + struct { + __be32 slave_id; + } __packed flr_event; + struct { + __be16 current_temperature; + __be16 warning_threshold; + } __packed warming; + struct { + u8 reserved[3]; + u8 port; + union { + struct { + __be16 mstr_sm_lid; + __be16 port_lid; + __be32 changed_attr; + u8 reserved[3]; + u8 mstr_sm_sl; + __be64 gid_prefix; + } __packed port_info; + struct { + __be32 block_ptr; + __be32 tbl_entries_mask; + } __packed tbl_change_info; + } params; + } __packed port_mgmt_change; + } event; + u8 slave_id; + u8 reserved3[2]; + u8 owner; +} __packed; + +struct mlx4_init_port_param { + int set_guid0; + int set_node_guid; + int set_si_guid; + u16 mtu; + int port_width_cap; + u16 vl_cap; + u16 max_gid; + u16 max_pkey; + u64 guid0; + u64 node_guid; + u64 si_guid; +}; + +#define mlx4_foreach_port(port, dev, type) \ + for ((port) = 1; (port) <= (dev)->caps.num_ports; (port)++) \ + if ((type) == (dev)->caps.port_mask[(port)]) + +#define mlx4_foreach_non_ib_transport_port(port, dev) \ + for ((port) = 1; (port) <= (dev)->caps.num_ports; (port)++) \ + if (((dev)->caps.port_mask[port] != MLX4_PORT_TYPE_IB)) + +#define mlx4_foreach_ib_transport_port(port, dev) \ + for ((port) = 1; (port) <= (dev)->caps.num_ports; (port)++) \ + if (((dev)->caps.port_mask[port] == MLX4_PORT_TYPE_IB) || \ + ((dev)->caps.flags & MLX4_DEV_CAP_FLAG_IBOE)) + +#define MLX4_INVALID_SLAVE_ID 0xFF + +void handle_port_mgmt_change_event(struct work_struct *work); + +static inline int mlx4_master_func_num(struct mlx4_dev *dev) +{ + return dev->caps.function; +} + +static inline int mlx4_is_master(struct mlx4_dev *dev) +{ + return dev->flags & MLX4_FLAG_MASTER; +} + +static inline int mlx4_num_reserved_sqps(struct mlx4_dev *dev) +{ + return dev->phys_caps.base_sqpn + 8 + + 16 * MLX4_MFUNC_MAX * !!mlx4_is_master(dev); +} + +static inline int mlx4_is_qp_reserved(struct mlx4_dev *dev, u32 qpn) +{ + return (qpn < dev->phys_caps.base_sqpn + 8 + + 16 * MLX4_MFUNC_MAX * !!mlx4_is_master(dev)); +} + +static inline int mlx4_is_guest_proxy(struct mlx4_dev *dev, int slave, u32 qpn) +{ + int guest_proxy_base = dev->phys_caps.base_proxy_sqpn + slave * 8; + + if (qpn >= guest_proxy_base && qpn < guest_proxy_base + 8) + return 1; + + return 0; +} + +static inline int mlx4_is_mfunc(struct mlx4_dev *dev) +{ + return dev->flags & (MLX4_FLAG_SLAVE | MLX4_FLAG_MASTER); +} + +static inline int mlx4_is_slave(struct mlx4_dev *dev) +{ + return dev->flags & MLX4_FLAG_SLAVE; +} + +int mlx4_buf_alloc(struct mlx4_dev *dev, int size, int max_direct, + struct mlx4_buf *buf, gfp_t gfp); +void mlx4_buf_free(struct mlx4_dev *dev, int size, struct mlx4_buf *buf); +static inline void *mlx4_buf_offset(struct mlx4_buf *buf, int offset) +{ + if (BITS_PER_LONG == 64 || buf->nbufs == 1) + return buf->direct.buf + offset; + else + return buf->page_list[offset >> PAGE_SHIFT].buf + + (offset & (PAGE_SIZE - 1)); +} + +int mlx4_pd_alloc(struct mlx4_dev *dev, u32 *pdn); +void mlx4_pd_free(struct mlx4_dev *dev, u32 pdn); +int mlx4_xrcd_alloc(struct mlx4_dev *dev, u32 *xrcdn); +void mlx4_xrcd_free(struct mlx4_dev *dev, u32 xrcdn); + +int mlx4_uar_alloc(struct mlx4_dev *dev, struct mlx4_uar *uar); +void mlx4_uar_free(struct mlx4_dev *dev, struct mlx4_uar *uar); +int mlx4_bf_alloc(struct mlx4_dev *dev, struct mlx4_bf *bf, int node); +void mlx4_bf_free(struct mlx4_dev *dev, struct mlx4_bf *bf); + +int mlx4_mtt_init(struct mlx4_dev *dev, int npages, int page_shift, + struct mlx4_mtt *mtt); +void mlx4_mtt_cleanup(struct mlx4_dev *dev, struct mlx4_mtt *mtt); +u64 mlx4_mtt_addr(struct mlx4_dev *dev, struct mlx4_mtt *mtt); + +int mlx4_mr_alloc(struct mlx4_dev *dev, u32 pd, u64 iova, u64 size, u32 access, + int npages, int page_shift, struct mlx4_mr *mr); +int mlx4_mr_free(struct mlx4_dev *dev, struct mlx4_mr *mr); +int mlx4_mr_enable(struct mlx4_dev *dev, struct mlx4_mr *mr); +int mlx4_mw_alloc(struct mlx4_dev *dev, u32 pd, enum mlx4_mw_type type, + struct mlx4_mw *mw); +void mlx4_mw_free(struct mlx4_dev *dev, struct mlx4_mw *mw); +int mlx4_mw_enable(struct mlx4_dev *dev, struct mlx4_mw *mw); +int mlx4_write_mtt(struct mlx4_dev *dev, struct mlx4_mtt *mtt, + int start_index, int npages, u64 *page_list); +int mlx4_buf_write_mtt(struct mlx4_dev *dev, struct mlx4_mtt *mtt, + struct mlx4_buf *buf, gfp_t gfp); + +int mlx4_db_alloc(struct mlx4_dev *dev, struct mlx4_db *db, int order, + gfp_t gfp); +void mlx4_db_free(struct mlx4_dev *dev, struct mlx4_db *db); + +int mlx4_alloc_hwq_res(struct mlx4_dev *dev, struct mlx4_hwq_resources *wqres, + int size, int max_direct); +void mlx4_free_hwq_res(struct mlx4_dev *mdev, struct mlx4_hwq_resources *wqres, + int size); + +int mlx4_cq_alloc(struct mlx4_dev *dev, int nent, struct mlx4_mtt *mtt, + struct mlx4_uar *uar, u64 db_rec, struct mlx4_cq *cq, + unsigned vector, int collapsed, int timestamp_en); +void mlx4_cq_free(struct mlx4_dev *dev, struct mlx4_cq *cq); + +int mlx4_qp_reserve_range(struct mlx4_dev *dev, int cnt, int align, int *base); +void mlx4_qp_release_range(struct mlx4_dev *dev, int base_qpn, int cnt); + +int mlx4_qp_alloc(struct mlx4_dev *dev, int qpn, struct mlx4_qp *qp, + gfp_t gfp); +void mlx4_qp_free(struct mlx4_dev *dev, struct mlx4_qp *qp); + +int mlx4_srq_alloc(struct mlx4_dev *dev, u32 pdn, u32 cqn, u16 xrcdn, + struct mlx4_mtt *mtt, u64 db_rec, struct mlx4_srq *srq); +void mlx4_srq_free(struct mlx4_dev *dev, struct mlx4_srq *srq); +int mlx4_srq_arm(struct mlx4_dev *dev, struct mlx4_srq *srq, int limit_watermark); +int mlx4_srq_query(struct mlx4_dev *dev, struct mlx4_srq *srq, int *limit_watermark); + +int mlx4_INIT_PORT(struct mlx4_dev *dev, int port); +int mlx4_CLOSE_PORT(struct mlx4_dev *dev, int port); + +int mlx4_unicast_attach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16], + int block_mcast_loopback, enum mlx4_protocol prot); +int mlx4_unicast_detach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16], + enum mlx4_protocol prot); +int mlx4_multicast_attach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16], + u8 port, int block_mcast_loopback, + enum mlx4_protocol protocol, u64 *reg_id); +int mlx4_multicast_detach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16], + enum mlx4_protocol protocol, u64 reg_id); + +enum { + MLX4_DOMAIN_UVERBS = 0x1000, + MLX4_DOMAIN_ETHTOOL = 0x2000, + MLX4_DOMAIN_RFS = 0x3000, + MLX4_DOMAIN_NIC = 0x5000, +}; + +enum mlx4_net_trans_rule_id { + MLX4_NET_TRANS_RULE_ID_ETH = 0, + MLX4_NET_TRANS_RULE_ID_IB, + MLX4_NET_TRANS_RULE_ID_IPV6, + MLX4_NET_TRANS_RULE_ID_IPV4, + MLX4_NET_TRANS_RULE_ID_TCP, + MLX4_NET_TRANS_RULE_ID_UDP, + MLX4_NET_TRANS_RULE_ID_VXLAN, + MLX4_NET_TRANS_RULE_NUM, /* should be last */ +}; + +extern const u16 __sw_id_hw[]; + +static inline int map_hw_to_sw_id(u16 header_id) +{ + + int i; + for (i = 0; i < MLX4_NET_TRANS_RULE_NUM; i++) { + if (header_id == __sw_id_hw[i]) + return i; + } + return -EINVAL; +} + +enum mlx4_net_trans_promisc_mode { + MLX4_FS_REGULAR = 1, + MLX4_FS_ALL_DEFAULT, + MLX4_FS_MC_DEFAULT, + MLX4_FS_UC_SNIFFER, + MLX4_FS_MC_SNIFFER, + MLX4_FS_MODE_NUM, /* should be last */ +}; + +struct mlx4_spec_eth { + u8 dst_mac[ETH_ALEN]; + u8 dst_mac_msk[ETH_ALEN]; + u8 src_mac[ETH_ALEN]; + u8 src_mac_msk[ETH_ALEN]; + u8 ether_type_enable; + __be16 ether_type; + __be16 vlan_id_msk; + __be16 vlan_id; +}; + +struct mlx4_spec_tcp_udp { + __be16 dst_port; + __be16 dst_port_msk; + __be16 src_port; + __be16 src_port_msk; +}; + +struct mlx4_spec_ipv4 { + __be32 dst_ip; + __be32 dst_ip_msk; + __be32 src_ip; + __be32 src_ip_msk; +}; + +struct mlx4_spec_ib { + __be32 l3_qpn; + __be32 qpn_msk; + u8 dst_gid[16]; + u8 dst_gid_msk[16]; +}; + +struct mlx4_spec_vxlan { + __be32 vni; + __be32 vni_mask; + +}; + +struct mlx4_spec_list { + struct list_head list; + enum mlx4_net_trans_rule_id id; + union { + struct mlx4_spec_eth eth; + struct mlx4_spec_ib ib; + struct mlx4_spec_ipv4 ipv4; + struct mlx4_spec_tcp_udp tcp_udp; + struct mlx4_spec_vxlan vxlan; + }; +}; + +enum mlx4_net_trans_hw_rule_queue { + MLX4_NET_TRANS_Q_FIFO, + MLX4_NET_TRANS_Q_LIFO, +}; + +struct mlx4_net_trans_rule { + struct list_head list; + enum mlx4_net_trans_hw_rule_queue queue_mode; + bool exclusive; + bool allow_loopback; + enum mlx4_net_trans_promisc_mode promisc_mode; + u8 port; + u16 priority; + u32 qpn; +}; + +struct mlx4_net_trans_rule_hw_ctrl { + __be16 prio; + u8 type; + u8 flags; + u8 rsvd1; + u8 funcid; + u8 vep; + u8 port; + __be32 qpn; + __be32 rsvd2; +}; + +struct mlx4_net_trans_rule_hw_ib { + u8 size; + u8 rsvd1; + __be16 id; + u32 rsvd2; + __be32 l3_qpn; + __be32 qpn_mask; + u8 dst_gid[16]; + u8 dst_gid_msk[16]; +} __packed; + +struct mlx4_net_trans_rule_hw_eth { + u8 size; + u8 rsvd; + __be16 id; + u8 rsvd1[6]; + u8 dst_mac[6]; + u16 rsvd2; + u8 dst_mac_msk[6]; + u16 rsvd3; + u8 src_mac[6]; + u16 rsvd4; + u8 src_mac_msk[6]; + u8 rsvd5; + u8 ether_type_enable; + __be16 ether_type; + __be16 vlan_tag_msk; + __be16 vlan_tag; +} __packed; + +struct mlx4_net_trans_rule_hw_tcp_udp { + u8 size; + u8 rsvd; + __be16 id; + __be16 rsvd1[3]; + __be16 dst_port; + __be16 rsvd2; + __be16 dst_port_msk; + __be16 rsvd3; + __be16 src_port; + __be16 rsvd4; + __be16 src_port_msk; +} __packed; + +struct mlx4_net_trans_rule_hw_ipv4 { + u8 size; + u8 rsvd; + __be16 id; + __be32 rsvd1; + __be32 dst_ip; + __be32 dst_ip_msk; + __be32 src_ip; + __be32 src_ip_msk; +} __packed; + +struct mlx4_net_trans_rule_hw_vxlan { + u8 size; + u8 rsvd; + __be16 id; + __be32 rsvd1; + __be32 vni; + __be32 vni_mask; +} __packed; + +struct _rule_hw { + union { + struct { + u8 size; + u8 rsvd; + __be16 id; + }; + struct mlx4_net_trans_rule_hw_eth eth; + struct mlx4_net_trans_rule_hw_ib ib; + struct mlx4_net_trans_rule_hw_ipv4 ipv4; + struct mlx4_net_trans_rule_hw_tcp_udp tcp_udp; + struct mlx4_net_trans_rule_hw_vxlan vxlan; + }; +}; + +enum { + VXLAN_STEER_BY_OUTER_MAC = 1 << 0, + VXLAN_STEER_BY_OUTER_VLAN = 1 << 1, + VXLAN_STEER_BY_VSID_VNI = 1 << 2, + VXLAN_STEER_BY_INNER_MAC = 1 << 3, + VXLAN_STEER_BY_INNER_VLAN = 1 << 4, +}; + + +int mlx4_flow_steer_promisc_add(struct mlx4_dev *dev, u8 port, u32 qpn, + enum mlx4_net_trans_promisc_mode mode); +int mlx4_flow_steer_promisc_remove(struct mlx4_dev *dev, u8 port, + enum mlx4_net_trans_promisc_mode mode); +int mlx4_multicast_promisc_add(struct mlx4_dev *dev, u32 qpn, u8 port); +int mlx4_multicast_promisc_remove(struct mlx4_dev *dev, u32 qpn, u8 port); +int mlx4_unicast_promisc_add(struct mlx4_dev *dev, u32 qpn, u8 port); +int mlx4_unicast_promisc_remove(struct mlx4_dev *dev, u32 qpn, u8 port); +int mlx4_SET_MCAST_FLTR(struct mlx4_dev *dev, u8 port, u64 mac, u64 clear, u8 mode); + +int mlx4_register_mac(struct mlx4_dev *dev, u8 port, u64 mac); +void mlx4_unregister_mac(struct mlx4_dev *dev, u8 port, u64 mac); +int mlx4_get_base_qpn(struct mlx4_dev *dev, u8 port); +int __mlx4_replace_mac(struct mlx4_dev *dev, u8 port, int qpn, u64 new_mac); +void mlx4_set_stats_bitmap(struct mlx4_dev *dev, u64 *stats_bitmap); +int mlx4_SET_PORT_general(struct mlx4_dev *dev, u8 port, int mtu, + u8 pptx, u8 pfctx, u8 pprx, u8 pfcrx); +int mlx4_SET_PORT_qpn_calc(struct mlx4_dev *dev, u8 port, u32 base_qpn, + u8 promisc); +int mlx4_SET_PORT_PRIO2TC(struct mlx4_dev *dev, u8 port, u8 *prio2tc); +int mlx4_SET_PORT_SCHEDULER(struct mlx4_dev *dev, u8 port, u8 *tc_tx_bw, + u8 *pg, u16 *ratelimit); +int mlx4_SET_PORT_VXLAN(struct mlx4_dev *dev, u8 port, u8 steering, int enable); +int mlx4_find_cached_mac(struct mlx4_dev *dev, u8 port, u64 mac, int *idx); +int mlx4_find_cached_vlan(struct mlx4_dev *dev, u8 port, u16 vid, int *idx); +int mlx4_register_vlan(struct mlx4_dev *dev, u8 port, u16 vlan, int *index); +void mlx4_unregister_vlan(struct mlx4_dev *dev, u8 port, u16 vlan); + +int mlx4_map_phys_fmr(struct mlx4_dev *dev, struct mlx4_fmr *fmr, u64 *page_list, + int npages, u64 iova, u32 *lkey, u32 *rkey); +int mlx4_fmr_alloc(struct mlx4_dev *dev, u32 pd, u32 access, int max_pages, + int max_maps, u8 page_shift, struct mlx4_fmr *fmr); +int mlx4_fmr_enable(struct mlx4_dev *dev, struct mlx4_fmr *fmr); +void mlx4_fmr_unmap(struct mlx4_dev *dev, struct mlx4_fmr *fmr, + u32 *lkey, u32 *rkey); +int mlx4_fmr_free(struct mlx4_dev *dev, struct mlx4_fmr *fmr); +int mlx4_SYNC_TPT(struct mlx4_dev *dev); +int mlx4_test_interrupts(struct mlx4_dev *dev); +int mlx4_assign_eq(struct mlx4_dev *dev, char *name, struct cpu_rmap *rmap, + int *vector); +void mlx4_release_eq(struct mlx4_dev *dev, int vec); + +int mlx4_eq_get_irq(struct mlx4_dev *dev, int vec); + +int mlx4_get_phys_port_id(struct mlx4_dev *dev); +int mlx4_wol_read(struct mlx4_dev *dev, u64 *config, int port); +int mlx4_wol_write(struct mlx4_dev *dev, u64 config, int port); + +int mlx4_counter_alloc(struct mlx4_dev *dev, u32 *idx); +void mlx4_counter_free(struct mlx4_dev *dev, u32 idx); + +int mlx4_flow_attach(struct mlx4_dev *dev, + struct mlx4_net_trans_rule *rule, u64 *reg_id); +int mlx4_flow_detach(struct mlx4_dev *dev, u64 reg_id); +int mlx4_map_sw_to_hw_steering_mode(struct mlx4_dev *dev, + enum mlx4_net_trans_promisc_mode flow_type); +int mlx4_map_sw_to_hw_steering_id(struct mlx4_dev *dev, + enum mlx4_net_trans_rule_id id); +int mlx4_hw_rule_sz(struct mlx4_dev *dev, enum mlx4_net_trans_rule_id id); + +int mlx4_tunnel_steer_add(struct mlx4_dev *dev, unsigned char *addr, + int port, int qpn, u16 prio, u64 *reg_id); + +void mlx4_sync_pkey_table(struct mlx4_dev *dev, int slave, int port, + int i, int val); + +int mlx4_get_parav_qkey(struct mlx4_dev *dev, u32 qpn, u32 *qkey); + +int mlx4_is_slave_active(struct mlx4_dev *dev, int slave); +int mlx4_gen_pkey_eqe(struct mlx4_dev *dev, int slave, u8 port); +int mlx4_gen_guid_change_eqe(struct mlx4_dev *dev, int slave, u8 port); +int mlx4_gen_slaves_port_mgt_ev(struct mlx4_dev *dev, u8 port, int attr); +int mlx4_gen_port_state_change_eqe(struct mlx4_dev *dev, int slave, u8 port, u8 port_subtype_change); +enum slave_port_state mlx4_get_slave_port_state(struct mlx4_dev *dev, int slave, u8 port); +int set_and_calc_slave_port_state(struct mlx4_dev *dev, int slave, u8 port, int event, enum slave_port_gen_event *gen_event); + +void mlx4_put_slave_node_guid(struct mlx4_dev *dev, int slave, __be64 guid); +__be64 mlx4_get_slave_node_guid(struct mlx4_dev *dev, int slave); + +int mlx4_get_slave_from_roce_gid(struct mlx4_dev *dev, int port, u8 *gid, + int *slave_id); +int mlx4_get_roce_gid_from_slave(struct mlx4_dev *dev, int port, int slave_id, + u8 *gid); + +int mlx4_FLOW_STEERING_IB_UC_QP_RANGE(struct mlx4_dev *dev, u32 min_range_qpn, + u32 max_range_qpn); + +cycle_t mlx4_read_clock(struct mlx4_dev *dev); + +struct mlx4_active_ports { + DECLARE_BITMAP(ports, MLX4_MAX_PORTS); +}; +/* Returns a bitmap of the physical ports which are assigned to slave */ +struct mlx4_active_ports mlx4_get_active_ports(struct mlx4_dev *dev, int slave); + +/* Returns the physical port that represents the virtual port of the slave, */ +/* or a value < 0 in case of an error. If a slave has 2 ports, the identity */ +/* mapping is returned. */ +int mlx4_slave_convert_port(struct mlx4_dev *dev, int slave, int port); + +struct mlx4_slaves_pport { + DECLARE_BITMAP(slaves, MLX4_MFUNC_MAX); +}; +/* Returns a bitmap of all slaves that are assigned to port. */ +struct mlx4_slaves_pport mlx4_phys_to_slaves_pport(struct mlx4_dev *dev, + int port); + +/* Returns a bitmap of all slaves that are assigned exactly to all the */ +/* the ports that are set in crit_ports. */ +struct mlx4_slaves_pport mlx4_phys_to_slaves_pport_actv( + struct mlx4_dev *dev, + const struct mlx4_active_ports *crit_ports); + +/* Returns the slave's virtual port that represents the physical port. */ +int mlx4_phys_to_slave_port(struct mlx4_dev *dev, int slave, int port); + +int mlx4_get_base_gid_ix(struct mlx4_dev *dev, int slave, int port); + +int mlx4_config_vxlan_port(struct mlx4_dev *dev, __be16 udp_port); +int mlx4_vf_smi_enabled(struct mlx4_dev *dev, int slave, int port); +int mlx4_vf_get_enable_smi_admin(struct mlx4_dev *dev, int slave, int port); +int mlx4_vf_set_enable_smi_admin(struct mlx4_dev *dev, int slave, int port, + int enable); +int mlx4_mr_hw_get_mpt(struct mlx4_dev *dev, struct mlx4_mr *mmr, + struct mlx4_mpt_entry ***mpt_entry); +int mlx4_mr_hw_write_mpt(struct mlx4_dev *dev, struct mlx4_mr *mmr, + struct mlx4_mpt_entry **mpt_entry); +int mlx4_mr_hw_change_pd(struct mlx4_dev *dev, struct mlx4_mpt_entry *mpt_entry, + u32 pdn); +int mlx4_mr_hw_change_access(struct mlx4_dev *dev, + struct mlx4_mpt_entry *mpt_entry, + u32 access); +void mlx4_mr_hw_put_mpt(struct mlx4_dev *dev, + struct mlx4_mpt_entry **mpt_entry); +void mlx4_mr_rereg_mem_cleanup(struct mlx4_dev *dev, struct mlx4_mr *mr); +int mlx4_mr_rereg_mem_write(struct mlx4_dev *dev, struct mlx4_mr *mr, + u64 iova, u64 size, int npages, + int page_shift, struct mlx4_mpt_entry *mpt_entry); + +/* Returns true if running in low memory profile (kdump kernel) */ +static inline bool mlx4_low_memory_profile(void) +{ + return is_kdump_kernel(); +} + +#endif /* MLX4_DEVICE_H */ diff --git a/include/linux/mlx4/doorbell.h b/include/linux/mlx4/doorbell.h new file mode 100644 index 0000000..f31bba2 --- /dev/null +++ b/include/linux/mlx4/doorbell.h @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2004 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef MLX4_DOORBELL_H +#define MLX4_DOORBELL_H + +#include +#include + +#define MLX4_SEND_DOORBELL 0x14 +#define MLX4_CQ_DOORBELL 0x20 + +#if BITS_PER_LONG == 64 +/* + * Assume that we can just write a 64-bit doorbell atomically. s390 + * actually doesn't have writeq() but S/390 systems don't even have + * PCI so we won't worry about it. + */ + +#define MLX4_DECLARE_DOORBELL_LOCK(name) +#define MLX4_INIT_DOORBELL_LOCK(ptr) do { } while (0) +#define MLX4_GET_DOORBELL_LOCK(ptr) (NULL) + +static inline void mlx4_write64(__be32 val[2], void __iomem *dest, + spinlock_t *doorbell_lock) +{ + __raw_writeq(*(u64 *) val, dest); +} + +#else + +/* + * Just fall back to a spinlock to protect the doorbell if + * BITS_PER_LONG is 32 -- there's no portable way to do atomic 64-bit + * MMIO writes. + */ + +#define MLX4_DECLARE_DOORBELL_LOCK(name) spinlock_t name; +#define MLX4_INIT_DOORBELL_LOCK(ptr) spin_lock_init(ptr) +#define MLX4_GET_DOORBELL_LOCK(ptr) (ptr) + +static inline void mlx4_write64(__be32 val[2], void __iomem *dest, + spinlock_t *doorbell_lock) +{ + unsigned long flags; + + spin_lock_irqsave(doorbell_lock, flags); + __raw_writel((__force u32) val[0], dest); + __raw_writel((__force u32) val[1], dest + 4); + spin_unlock_irqrestore(doorbell_lock, flags); +} + +#endif + +#endif /* MLX4_DOORBELL_H */ diff --git a/include/linux/mlx4/driver.h b/include/linux/mlx4/driver.h new file mode 100644 index 0000000..022055c --- /dev/null +++ b/include/linux/mlx4/driver.h @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2006 Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef MLX4_DRIVER_H +#define MLX4_DRIVER_H + +#include + +struct mlx4_dev; + +#define MLX4_MAC_MASK 0xffffffffffffULL + +enum mlx4_dev_event { + MLX4_DEV_EVENT_CATASTROPHIC_ERROR, + MLX4_DEV_EVENT_PORT_UP, + MLX4_DEV_EVENT_PORT_DOWN, + MLX4_DEV_EVENT_PORT_REINIT, + MLX4_DEV_EVENT_PORT_MGMT_CHANGE, + MLX4_DEV_EVENT_SLAVE_INIT, + MLX4_DEV_EVENT_SLAVE_SHUTDOWN, +}; + +struct mlx4_interface { + void * (*add) (struct mlx4_dev *dev); + void (*remove)(struct mlx4_dev *dev, void *context); + void (*event) (struct mlx4_dev *dev, void *context, + enum mlx4_dev_event event, unsigned long param); + void * (*get_dev)(struct mlx4_dev *dev, void *context, u8 port); + struct list_head list; + enum mlx4_protocol protocol; +}; + +int mlx4_register_interface(struct mlx4_interface *intf); +void mlx4_unregister_interface(struct mlx4_interface *intf); + +void *mlx4_get_protocol_dev(struct mlx4_dev *dev, enum mlx4_protocol proto, int port); + +static inline u64 mlx4_mac_to_u64(u8 *addr) +{ + u64 mac = 0; + int i; + + for (i = 0; i < ETH_ALEN; i++) { + mac <<= 8; + mac |= addr[i]; + } + return mac; +} + +#endif /* MLX4_DRIVER_H */ diff --git a/include/linux/mlx4/qp.h b/include/linux/mlx4/qp.h new file mode 100644 index 0000000..5f4e36c --- /dev/null +++ b/include/linux/mlx4/qp.h @@ -0,0 +1,462 @@ +/* + * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef MLX4_QP_H +#define MLX4_QP_H + +#include +#include + +#include + +#define MLX4_INVALID_LKEY 0x100 + +enum mlx4_qp_optpar { + MLX4_QP_OPTPAR_ALT_ADDR_PATH = 1 << 0, + MLX4_QP_OPTPAR_RRE = 1 << 1, + MLX4_QP_OPTPAR_RAE = 1 << 2, + MLX4_QP_OPTPAR_RWE = 1 << 3, + MLX4_QP_OPTPAR_PKEY_INDEX = 1 << 4, + MLX4_QP_OPTPAR_Q_KEY = 1 << 5, + MLX4_QP_OPTPAR_RNR_TIMEOUT = 1 << 6, + MLX4_QP_OPTPAR_PRIMARY_ADDR_PATH = 1 << 7, + MLX4_QP_OPTPAR_SRA_MAX = 1 << 8, + MLX4_QP_OPTPAR_RRA_MAX = 1 << 9, + MLX4_QP_OPTPAR_PM_STATE = 1 << 10, + MLX4_QP_OPTPAR_RETRY_COUNT = 1 << 12, + MLX4_QP_OPTPAR_RNR_RETRY = 1 << 13, + MLX4_QP_OPTPAR_ACK_TIMEOUT = 1 << 14, + MLX4_QP_OPTPAR_SCHED_QUEUE = 1 << 16, + MLX4_QP_OPTPAR_COUNTER_INDEX = 1 << 20, + MLX4_QP_OPTPAR_VLAN_STRIPPING = 1 << 21, +}; + +enum mlx4_qp_state { + MLX4_QP_STATE_RST = 0, + MLX4_QP_STATE_INIT = 1, + MLX4_QP_STATE_RTR = 2, + MLX4_QP_STATE_RTS = 3, + MLX4_QP_STATE_SQER = 4, + MLX4_QP_STATE_SQD = 5, + MLX4_QP_STATE_ERR = 6, + MLX4_QP_STATE_SQ_DRAINING = 7, + MLX4_QP_NUM_STATE +}; + +enum { + MLX4_QP_ST_RC = 0x0, + MLX4_QP_ST_UC = 0x1, + MLX4_QP_ST_RD = 0x2, + MLX4_QP_ST_UD = 0x3, + MLX4_QP_ST_XRC = 0x6, + MLX4_QP_ST_MLX = 0x7 +}; + +enum { + MLX4_QP_PM_MIGRATED = 0x3, + MLX4_QP_PM_ARMED = 0x0, + MLX4_QP_PM_REARM = 0x1 +}; + +enum { + /* params1 */ + MLX4_QP_BIT_SRE = 1 << 15, + MLX4_QP_BIT_SWE = 1 << 14, + MLX4_QP_BIT_SAE = 1 << 13, + /* params2 */ + MLX4_QP_BIT_RRE = 1 << 15, + MLX4_QP_BIT_RWE = 1 << 14, + MLX4_QP_BIT_RAE = 1 << 13, + MLX4_QP_BIT_RIC = 1 << 4, +}; + +enum { + MLX4_RSS_HASH_XOR = 0, + MLX4_RSS_HASH_TOP = 1, + + MLX4_RSS_UDP_IPV6 = 1 << 0, + MLX4_RSS_UDP_IPV4 = 1 << 1, + MLX4_RSS_TCP_IPV6 = 1 << 2, + MLX4_RSS_IPV6 = 1 << 3, + MLX4_RSS_TCP_IPV4 = 1 << 4, + MLX4_RSS_IPV4 = 1 << 5, + + MLX4_RSS_BY_OUTER_HEADERS = 0 << 6, + MLX4_RSS_BY_INNER_HEADERS = 2 << 6, + MLX4_RSS_BY_INNER_HEADERS_IPONLY = 3 << 6, + + /* offset of mlx4_rss_context within mlx4_qp_context.pri_path */ + MLX4_RSS_OFFSET_IN_QPC_PRI_PATH = 0x24, + /* offset of being RSS indirection QP within mlx4_qp_context.flags */ + MLX4_RSS_QPC_FLAG_OFFSET = 13, +}; + +struct mlx4_rss_context { + __be32 base_qpn; + __be32 default_qpn; + u16 reserved; + u8 hash_fn; + u8 flags; + __be32 rss_key[10]; + __be32 base_qpn_udp; +}; + +struct mlx4_qp_path { + u8 fl; + u8 vlan_control; + u8 disable_pkey_check; + u8 pkey_index; + u8 counter_index; + u8 grh_mylmc; + __be16 rlid; + u8 ackto; + u8 mgid_index; + u8 static_rate; + u8 hop_limit; + __be32 tclass_flowlabel; + u8 rgid[16]; + u8 sched_queue; + u8 vlan_index; + u8 feup; + u8 fvl_rx; + u8 reserved4[2]; + u8 dmac[ETH_ALEN]; +}; + +enum { /* fl */ + MLX4_FL_CV = 1 << 6, + MLX4_FL_ETH_HIDE_CQE_VLAN = 1 << 2 +}; +enum { /* vlan_control */ + MLX4_VLAN_CTRL_ETH_TX_BLOCK_TAGGED = 1 << 6, + MLX4_VLAN_CTRL_ETH_TX_BLOCK_PRIO_TAGGED = 1 << 5, /* 802.1p priority tag */ + MLX4_VLAN_CTRL_ETH_TX_BLOCK_UNTAGGED = 1 << 4, + MLX4_VLAN_CTRL_ETH_RX_BLOCK_TAGGED = 1 << 2, + MLX4_VLAN_CTRL_ETH_RX_BLOCK_PRIO_TAGGED = 1 << 1, /* 802.1p priority tag */ + MLX4_VLAN_CTRL_ETH_RX_BLOCK_UNTAGGED = 1 << 0 +}; + +enum { /* feup */ + MLX4_FEUP_FORCE_ETH_UP = 1 << 6, /* force Eth UP */ + MLX4_FSM_FORCE_ETH_SRC_MAC = 1 << 5, /* force Source MAC */ + MLX4_FVL_FORCE_ETH_VLAN = 1 << 3 /* force Eth vlan */ +}; + +enum { /* fvl_rx */ + MLX4_FVL_RX_FORCE_ETH_VLAN = 1 << 0 /* enforce Eth rx vlan */ +}; + +struct mlx4_qp_context { + __be32 flags; + __be32 pd; + u8 mtu_msgmax; + u8 rq_size_stride; + u8 sq_size_stride; + u8 rlkey; + __be32 usr_page; + __be32 local_qpn; + __be32 remote_qpn; + struct mlx4_qp_path pri_path; + struct mlx4_qp_path alt_path; + __be32 params1; + u32 reserved1; + __be32 next_send_psn; + __be32 cqn_send; + u32 reserved2[2]; + __be32 last_acked_psn; + __be32 ssn; + __be32 params2; + __be32 rnr_nextrecvpsn; + __be32 xrcd; + __be32 cqn_recv; + __be64 db_rec_addr; + __be32 qkey; + __be32 srqn; + __be32 msn; + __be16 rq_wqe_counter; + __be16 sq_wqe_counter; + u32 reserved3[2]; + __be32 param3; + __be32 nummmcpeers_basemkey; + u8 log_page_size; + u8 reserved4[2]; + u8 mtt_base_addr_h; + __be32 mtt_base_addr_l; + u32 reserved5[10]; +}; + +struct mlx4_update_qp_context { + __be64 qp_mask; + __be64 primary_addr_path_mask; + __be64 secondary_addr_path_mask; + u64 reserved1; + struct mlx4_qp_context qp_context; + u64 reserved2[58]; +}; + +enum { + MLX4_UPD_QP_MASK_PM_STATE = 32, + MLX4_UPD_QP_MASK_VSD = 33, +}; + +enum { + MLX4_UPD_QP_PATH_MASK_PKEY_INDEX = 0 + 32, + MLX4_UPD_QP_PATH_MASK_FSM = 1 + 32, + MLX4_UPD_QP_PATH_MASK_MAC_INDEX = 2 + 32, + MLX4_UPD_QP_PATH_MASK_FVL = 3 + 32, + MLX4_UPD_QP_PATH_MASK_CV = 4 + 32, + MLX4_UPD_QP_PATH_MASK_VLAN_INDEX = 5 + 32, + MLX4_UPD_QP_PATH_MASK_ETH_HIDE_CQE_VLAN = 6 + 32, + MLX4_UPD_QP_PATH_MASK_ETH_TX_BLOCK_UNTAGGED = 7 + 32, + MLX4_UPD_QP_PATH_MASK_ETH_TX_BLOCK_1P = 8 + 32, + MLX4_UPD_QP_PATH_MASK_ETH_TX_BLOCK_TAGGED = 9 + 32, + MLX4_UPD_QP_PATH_MASK_ETH_RX_BLOCK_UNTAGGED = 10 + 32, + MLX4_UPD_QP_PATH_MASK_ETH_RX_BLOCK_1P = 11 + 32, + MLX4_UPD_QP_PATH_MASK_ETH_RX_BLOCK_TAGGED = 12 + 32, + MLX4_UPD_QP_PATH_MASK_FEUP = 13 + 32, + MLX4_UPD_QP_PATH_MASK_SCHED_QUEUE = 14 + 32, + MLX4_UPD_QP_PATH_MASK_IF_COUNTER_INDEX = 15 + 32, + MLX4_UPD_QP_PATH_MASK_FVL_RX = 16 + 32, +}; + +enum { /* param3 */ + MLX4_STRIP_VLAN = 1 << 30 +}; + +/* Which firmware version adds support for NEC (NoErrorCompletion) bit */ +#define MLX4_FW_VER_WQE_CTRL_NEC mlx4_fw_ver(2, 2, 232) + +enum { + MLX4_WQE_CTRL_NEC = 1 << 29, + MLX4_WQE_CTRL_IIP = 1 << 28, + MLX4_WQE_CTRL_ILP = 1 << 27, + MLX4_WQE_CTRL_FENCE = 1 << 6, + MLX4_WQE_CTRL_CQ_UPDATE = 3 << 2, + MLX4_WQE_CTRL_SOLICITED = 1 << 1, + MLX4_WQE_CTRL_IP_CSUM = 1 << 4, + MLX4_WQE_CTRL_TCP_UDP_CSUM = 1 << 5, + MLX4_WQE_CTRL_INS_VLAN = 1 << 6, + MLX4_WQE_CTRL_STRONG_ORDER = 1 << 7, + MLX4_WQE_CTRL_FORCE_LOOPBACK = 1 << 0, +}; + +struct mlx4_wqe_ctrl_seg { + __be32 owner_opcode; + union { + struct { + __be16 vlan_tag; + u8 ins_vlan; + u8 fence_size; + }; + __be32 bf_qpn; + }; + /* + * High 24 bits are SRC remote buffer; low 8 bits are flags: + * [7] SO (strong ordering) + * [5] TCP/UDP checksum + * [4] IP checksum + * [3:2] C (generate completion queue entry) + * [1] SE (solicited event) + * [0] FL (force loopback) + */ + union { + __be32 srcrb_flags; + __be16 srcrb_flags16[2]; + }; + /* + * imm is immediate data for send/RDMA write w/ immediate; + * also invalidation key for send with invalidate; input + * modifier for WQEs on CCQs. + */ + __be32 imm; +}; + +enum { + MLX4_WQE_MLX_VL15 = 1 << 17, + MLX4_WQE_MLX_SLR = 1 << 16 +}; + +struct mlx4_wqe_mlx_seg { + u8 owner; + u8 reserved1[2]; + u8 opcode; + __be16 sched_prio; + u8 reserved2; + u8 size; + /* + * [17] VL15 + * [16] SLR + * [15:12] static rate + * [11:8] SL + * [4] ICRC + * [3:2] C + * [0] FL (force loopback) + */ + __be32 flags; + __be16 rlid; + u16 reserved3; +}; + +struct mlx4_wqe_datagram_seg { + __be32 av[8]; + __be32 dqpn; + __be32 qkey; + __be16 vlan; + u8 mac[ETH_ALEN]; +}; + +struct mlx4_wqe_lso_seg { + __be32 mss_hdr_size; + __be32 header[0]; +}; + +enum mlx4_wqe_bind_seg_flags2 { + MLX4_WQE_BIND_ZERO_BASED = (1 << 30), + MLX4_WQE_BIND_TYPE_2 = (1 << 31), +}; + +struct mlx4_wqe_bind_seg { + __be32 flags1; + __be32 flags2; + __be32 new_rkey; + __be32 lkey; + __be64 addr; + __be64 length; +}; + +enum { + MLX4_WQE_FMR_PERM_LOCAL_READ = 1 << 27, + MLX4_WQE_FMR_PERM_LOCAL_WRITE = 1 << 28, + MLX4_WQE_FMR_AND_BIND_PERM_REMOTE_READ = 1 << 29, + MLX4_WQE_FMR_AND_BIND_PERM_REMOTE_WRITE = 1 << 30, + MLX4_WQE_FMR_AND_BIND_PERM_ATOMIC = 1 << 31 +}; + +struct mlx4_wqe_fmr_seg { + __be32 flags; + __be32 mem_key; + __be64 buf_list; + __be64 start_addr; + __be64 reg_len; + __be32 offset; + __be32 page_size; + u32 reserved[2]; +}; + +struct mlx4_wqe_fmr_ext_seg { + u8 flags; + u8 reserved; + __be16 app_mask; + __be16 wire_app_tag; + __be16 mem_app_tag; + __be32 wire_ref_tag_base; + __be32 mem_ref_tag_base; +}; + +struct mlx4_wqe_local_inval_seg { + u64 reserved1; + __be32 mem_key; + u32 reserved2; + u64 reserved3[2]; +}; + +struct mlx4_wqe_raddr_seg { + __be64 raddr; + __be32 rkey; + u32 reserved; +}; + +struct mlx4_wqe_atomic_seg { + __be64 swap_add; + __be64 compare; +}; + +struct mlx4_wqe_masked_atomic_seg { + __be64 swap_add; + __be64 compare; + __be64 swap_add_mask; + __be64 compare_mask; +}; + +struct mlx4_wqe_data_seg { + __be32 byte_count; + __be32 lkey; + __be64 addr; +}; + +enum { + MLX4_INLINE_ALIGN = 64, + MLX4_INLINE_SEG = 1 << 31, +}; + +struct mlx4_wqe_inline_seg { + __be32 byte_count; +}; + +enum mlx4_update_qp_attr { + MLX4_UPDATE_QP_SMAC = 1 << 0, + MLX4_UPDATE_QP_VSD = 1 << 2, + MLX4_UPDATE_QP_SUPPORTED_ATTRS = (1 << 2) - 1 +}; + +enum mlx4_update_qp_params_flags { + MLX4_UPDATE_QP_PARAMS_FLAGS_VSD_ENABLE = 1 << 0, +}; + +struct mlx4_update_qp_params { + u8 smac_index; + u32 flags; +}; + +int mlx4_update_qp(struct mlx4_dev *dev, u32 qpn, + enum mlx4_update_qp_attr attr, + struct mlx4_update_qp_params *params); +int mlx4_qp_modify(struct mlx4_dev *dev, struct mlx4_mtt *mtt, + enum mlx4_qp_state cur_state, enum mlx4_qp_state new_state, + struct mlx4_qp_context *context, enum mlx4_qp_optpar optpar, + int sqd_event, struct mlx4_qp *qp); + +int mlx4_qp_query(struct mlx4_dev *dev, struct mlx4_qp *qp, + struct mlx4_qp_context *context); + +int mlx4_qp_to_ready(struct mlx4_dev *dev, struct mlx4_mtt *mtt, + struct mlx4_qp_context *context, + struct mlx4_qp *qp, enum mlx4_qp_state *qp_state); + +static inline struct mlx4_qp *__mlx4_qp_lookup(struct mlx4_dev *dev, u32 qpn) +{ + return radix_tree_lookup(&dev->qp_table_tree, qpn & (dev->caps.num_qps - 1)); +} + +void mlx4_qp_remove(struct mlx4_dev *dev, struct mlx4_qp *qp); + +#endif /* MLX4_QP_H */ diff --git a/include/linux/mlx4/srq.h b/include/linux/mlx4/srq.h new file mode 100644 index 0000000..192e0f7 --- /dev/null +++ b/include/linux/mlx4/srq.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef MLX4_SRQ_H +#define MLX4_SRQ_H + +struct mlx4_wqe_srq_next_seg { + u16 reserved1; + __be16 next_wqe_index; + u32 reserved2[3]; +}; + +struct mlx4_srq *mlx4_srq_lookup(struct mlx4_dev *dev, u32 srqn); + +#endif /* MLX4_SRQ_H */ diff --git a/include/linux/mlx5/cmd.h b/include/linux/mlx5/cmd.h new file mode 100644 index 0000000..2826a4b --- /dev/null +++ b/include/linux/mlx5/cmd.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2013, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef MLX5_CMD_H +#define MLX5_CMD_H + +#include + +struct manage_pages_layout { + u64 ptr; + u32 reserved; + u16 num_entries; + u16 func_id; +}; + + +struct mlx5_cmd_alloc_uar_imm_out { + u32 rsvd[3]; + u32 uarn; +}; + +#endif /* MLX5_CMD_H */ diff --git a/include/linux/mlx5/cq.h b/include/linux/mlx5/cq.h new file mode 100644 index 0000000..f6b17ac --- /dev/null +++ b/include/linux/mlx5/cq.h @@ -0,0 +1,174 @@ +/* + * Copyright (c) 2013, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef MLX5_CORE_CQ_H +#define MLX5_CORE_CQ_H + +#include +#include + + +struct mlx5_core_cq { + u32 cqn; + int cqe_sz; + __be32 *set_ci_db; + __be32 *arm_db; + atomic_t refcount; + struct completion free; + unsigned vector; + int irqn; + void (*comp) (struct mlx5_core_cq *); + void (*event) (struct mlx5_core_cq *, enum mlx5_event); + struct mlx5_uar *uar; + u32 cons_index; + unsigned arm_sn; + struct mlx5_rsc_debug *dbg; + int pid; +}; + + +enum { + MLX5_CQE_SYNDROME_LOCAL_LENGTH_ERR = 0x01, + MLX5_CQE_SYNDROME_LOCAL_QP_OP_ERR = 0x02, + MLX5_CQE_SYNDROME_LOCAL_PROT_ERR = 0x04, + MLX5_CQE_SYNDROME_WR_FLUSH_ERR = 0x05, + MLX5_CQE_SYNDROME_MW_BIND_ERR = 0x06, + MLX5_CQE_SYNDROME_BAD_RESP_ERR = 0x10, + MLX5_CQE_SYNDROME_LOCAL_ACCESS_ERR = 0x11, + MLX5_CQE_SYNDROME_REMOTE_INVAL_REQ_ERR = 0x12, + MLX5_CQE_SYNDROME_REMOTE_ACCESS_ERR = 0x13, + MLX5_CQE_SYNDROME_REMOTE_OP_ERR = 0x14, + MLX5_CQE_SYNDROME_TRANSPORT_RETRY_EXC_ERR = 0x15, + MLX5_CQE_SYNDROME_RNR_RETRY_EXC_ERR = 0x16, + MLX5_CQE_SYNDROME_REMOTE_ABORTED_ERR = 0x22, +}; + +enum { + MLX5_CQE_OWNER_MASK = 1, + MLX5_CQE_REQ = 0, + MLX5_CQE_RESP_WR_IMM = 1, + MLX5_CQE_RESP_SEND = 2, + MLX5_CQE_RESP_SEND_IMM = 3, + MLX5_CQE_RESP_SEND_INV = 4, + MLX5_CQE_RESIZE_CQ = 5, + MLX5_CQE_SIG_ERR = 12, + MLX5_CQE_REQ_ERR = 13, + MLX5_CQE_RESP_ERR = 14, + MLX5_CQE_INVALID = 15, +}; + +enum { + MLX5_CQ_MODIFY_PERIOD = 1 << 0, + MLX5_CQ_MODIFY_COUNT = 1 << 1, + MLX5_CQ_MODIFY_OVERRUN = 1 << 2, +}; + +enum { + MLX5_CQ_OPMOD_RESIZE = 1, + MLX5_MODIFY_CQ_MASK_LOG_SIZE = 1 << 0, + MLX5_MODIFY_CQ_MASK_PG_OFFSET = 1 << 1, + MLX5_MODIFY_CQ_MASK_PG_SIZE = 1 << 2, +}; + +struct mlx5_cq_modify_params { + int type; + union { + struct { + u32 page_offset; + u8 log_cq_size; + } resize; + + struct { + } moder; + + struct { + } mapping; + } params; +}; + +enum { + CQE_SIZE_64 = 0, + CQE_SIZE_128 = 1, +}; + +static inline int cqe_sz_to_mlx_sz(u8 size) +{ + return size == 64 ? CQE_SIZE_64 : CQE_SIZE_128; +} + +static inline void mlx5_cq_set_ci(struct mlx5_core_cq *cq) +{ + *cq->set_ci_db = cpu_to_be32(cq->cons_index & 0xffffff); +} + +enum { + MLX5_CQ_DB_REQ_NOT_SOL = 1 << 24, + MLX5_CQ_DB_REQ_NOT = 0 << 24 +}; + +static inline void mlx5_cq_arm(struct mlx5_core_cq *cq, u32 cmd, + void __iomem *uar_page, + spinlock_t *doorbell_lock) +{ + __be32 doorbell[2]; + u32 sn; + u32 ci; + + sn = cq->arm_sn & 3; + ci = cq->cons_index & 0xffffff; + + *cq->arm_db = cpu_to_be32(sn << 28 | cmd | ci); + + /* Make sure that the doorbell record in host memory is + * written before ringing the doorbell via PCI MMIO. + */ + wmb(); + + doorbell[0] = cpu_to_be32(sn << 28 | cmd | ci); + doorbell[1] = cpu_to_be32(cq->cqn); + + mlx5_write64(doorbell, uar_page + MLX5_CQ_DOORBELL, doorbell_lock); +} + +int mlx5_init_cq_table(struct mlx5_core_dev *dev); +void mlx5_cleanup_cq_table(struct mlx5_core_dev *dev); +int mlx5_core_create_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq, + struct mlx5_create_cq_mbox_in *in, int inlen); +int mlx5_core_destroy_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq); +int mlx5_core_query_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq, + struct mlx5_query_cq_mbox_out *out); +int mlx5_core_modify_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq, + struct mlx5_modify_cq_mbox_in *in, int in_sz); +int mlx5_debug_cq_add(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq); +void mlx5_debug_cq_remove(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq); + +#endif /* MLX5_CORE_CQ_H */ diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h new file mode 100644 index 0000000..1d67fd3 --- /dev/null +++ b/include/linux/mlx5/device.h @@ -0,0 +1,937 @@ +/* + * Copyright (c) 2013, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef MLX5_DEVICE_H +#define MLX5_DEVICE_H + +#include +#include + +#if defined(__LITTLE_ENDIAN) +#define MLX5_SET_HOST_ENDIANNESS 0 +#elif defined(__BIG_ENDIAN) +#define MLX5_SET_HOST_ENDIANNESS 0x80 +#else +#error Host endianness not defined +#endif + +/* helper macros */ +#define __mlx5_nullp(typ) ((struct mlx5_ifc_##typ##_bits *)0) +#define __mlx5_bit_sz(typ, fld) sizeof(__mlx5_nullp(typ)->fld) +#define __mlx5_bit_off(typ, fld) ((unsigned)(unsigned long)(&(__mlx5_nullp(typ)->fld))) +#define __mlx5_dw_off(typ, fld) (__mlx5_bit_off(typ, fld) / 32) +#define __mlx5_64_off(typ, fld) (__mlx5_bit_off(typ, fld) / 64) +#define __mlx5_dw_bit_off(typ, fld) (32 - __mlx5_bit_sz(typ, fld) - (__mlx5_bit_off(typ, fld) & 0x1f)) +#define __mlx5_mask(typ, fld) ((u32)((1ull << __mlx5_bit_sz(typ, fld)) - 1)) +#define __mlx5_dw_mask(typ, fld) (__mlx5_mask(typ, fld) << __mlx5_dw_bit_off(typ, fld)) +#define __mlx5_st_sz_bits(typ) sizeof(struct mlx5_ifc_##typ##_bits) + +#define MLX5_FLD_SZ_BYTES(typ, fld) (__mlx5_bit_sz(typ, fld) / 8) +#define MLX5_ST_SZ_BYTES(typ) (sizeof(struct mlx5_ifc_##typ##_bits) / 8) +#define MLX5_ST_SZ_DW(typ) (sizeof(struct mlx5_ifc_##typ##_bits) / 32) +#define MLX5_BYTE_OFF(typ, fld) (__mlx5_bit_off(typ, fld) / 8) +#define MLX5_ADDR_OF(typ, p, fld) ((char *)(p) + MLX5_BYTE_OFF(typ, fld)) + +/* insert a value to a struct */ +#define MLX5_SET(typ, p, fld, v) do { \ + BUILD_BUG_ON(__mlx5_st_sz_bits(typ) % 32); \ + *((__be32 *)(p) + __mlx5_dw_off(typ, fld)) = \ + cpu_to_be32((be32_to_cpu(*((__be32 *)(p) + __mlx5_dw_off(typ, fld))) & \ + (~__mlx5_dw_mask(typ, fld))) | (((v) & __mlx5_mask(typ, fld)) \ + << __mlx5_dw_bit_off(typ, fld))); \ +} while (0) + +#define MLX5_GET(typ, p, fld) ((be32_to_cpu(*((__be32 *)(p) +\ +__mlx5_dw_off(typ, fld))) >> __mlx5_dw_bit_off(typ, fld)) & \ +__mlx5_mask(typ, fld)) + +#define MLX5_GET_PR(typ, p, fld) ({ \ + u32 ___t = MLX5_GET(typ, p, fld); \ + pr_debug(#fld " = 0x%x\n", ___t); \ + ___t; \ +}) + +#define MLX5_SET64(typ, p, fld, v) do { \ + BUILD_BUG_ON(__mlx5_bit_sz(typ, fld) != 64); \ + BUILD_BUG_ON(__mlx5_bit_off(typ, fld) % 64); \ + *((__be64 *)(p) + __mlx5_64_off(typ, fld)) = cpu_to_be64(v); \ +} while (0) + +#define MLX5_GET64(typ, p, fld) be64_to_cpu(*((__be64 *)(p) + __mlx5_64_off(typ, fld))) + +enum { + MLX5_MAX_COMMANDS = 32, + MLX5_CMD_DATA_BLOCK_SIZE = 512, + MLX5_PCI_CMD_XPORT = 7, + MLX5_MKEY_BSF_OCTO_SIZE = 4, + MLX5_MAX_PSVS = 4, +}; + +enum { + MLX5_EXTENDED_UD_AV = 0x80000000, +}; + +enum { + MLX5_CQ_STATE_ARMED = 9, + MLX5_CQ_STATE_ALWAYS_ARMED = 0xb, + MLX5_CQ_STATE_FIRED = 0xa, +}; + +enum { + MLX5_STAT_RATE_OFFSET = 5, +}; + +enum { + MLX5_INLINE_SEG = 0x80000000, +}; + +enum { + MLX5_MIN_PKEY_TABLE_SIZE = 128, + MLX5_MAX_LOG_PKEY_TABLE = 5, +}; + +enum { + MLX5_PERM_LOCAL_READ = 1 << 2, + MLX5_PERM_LOCAL_WRITE = 1 << 3, + MLX5_PERM_REMOTE_READ = 1 << 4, + MLX5_PERM_REMOTE_WRITE = 1 << 5, + MLX5_PERM_ATOMIC = 1 << 6, + MLX5_PERM_UMR_EN = 1 << 7, +}; + +enum { + MLX5_PCIE_CTRL_SMALL_FENCE = 1 << 0, + MLX5_PCIE_CTRL_RELAXED_ORDERING = 1 << 2, + MLX5_PCIE_CTRL_NO_SNOOP = 1 << 3, + MLX5_PCIE_CTRL_TLP_PROCE_EN = 1 << 6, + MLX5_PCIE_CTRL_TPH_MASK = 3 << 4, +}; + +enum { + MLX5_ACCESS_MODE_PA = 0, + MLX5_ACCESS_MODE_MTT = 1, + MLX5_ACCESS_MODE_KLM = 2 +}; + +enum { + MLX5_MKEY_REMOTE_INVAL = 1 << 24, + MLX5_MKEY_FLAG_SYNC_UMR = 1 << 29, + MLX5_MKEY_BSF_EN = 1 << 30, + MLX5_MKEY_LEN64 = 1 << 31, +}; + +enum { + MLX5_EN_RD = (u64)1, + MLX5_EN_WR = (u64)2 +}; + +enum { + MLX5_BF_REGS_PER_PAGE = 4, + MLX5_MAX_UAR_PAGES = 1 << 8, + MLX5_NON_FP_BF_REGS_PER_PAGE = 2, + MLX5_MAX_UUARS = MLX5_MAX_UAR_PAGES * MLX5_NON_FP_BF_REGS_PER_PAGE, +}; + +enum { + MLX5_MKEY_MASK_LEN = 1ull << 0, + MLX5_MKEY_MASK_PAGE_SIZE = 1ull << 1, + MLX5_MKEY_MASK_START_ADDR = 1ull << 6, + MLX5_MKEY_MASK_PD = 1ull << 7, + MLX5_MKEY_MASK_EN_RINVAL = 1ull << 8, + MLX5_MKEY_MASK_EN_SIGERR = 1ull << 9, + MLX5_MKEY_MASK_BSF_EN = 1ull << 12, + MLX5_MKEY_MASK_KEY = 1ull << 13, + MLX5_MKEY_MASK_QPN = 1ull << 14, + MLX5_MKEY_MASK_LR = 1ull << 17, + MLX5_MKEY_MASK_LW = 1ull << 18, + MLX5_MKEY_MASK_RR = 1ull << 19, + MLX5_MKEY_MASK_RW = 1ull << 20, + MLX5_MKEY_MASK_A = 1ull << 21, + MLX5_MKEY_MASK_SMALL_FENCE = 1ull << 23, + MLX5_MKEY_MASK_FREE = 1ull << 29, +}; + +enum mlx5_event { + MLX5_EVENT_TYPE_COMP = 0x0, + + MLX5_EVENT_TYPE_PATH_MIG = 0x01, + MLX5_EVENT_TYPE_COMM_EST = 0x02, + MLX5_EVENT_TYPE_SQ_DRAINED = 0x03, + MLX5_EVENT_TYPE_SRQ_LAST_WQE = 0x13, + MLX5_EVENT_TYPE_SRQ_RQ_LIMIT = 0x14, + + MLX5_EVENT_TYPE_CQ_ERROR = 0x04, + MLX5_EVENT_TYPE_WQ_CATAS_ERROR = 0x05, + MLX5_EVENT_TYPE_PATH_MIG_FAILED = 0x07, + MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR = 0x10, + MLX5_EVENT_TYPE_WQ_ACCESS_ERROR = 0x11, + MLX5_EVENT_TYPE_SRQ_CATAS_ERROR = 0x12, + + MLX5_EVENT_TYPE_INTERNAL_ERROR = 0x08, + MLX5_EVENT_TYPE_PORT_CHANGE = 0x09, + MLX5_EVENT_TYPE_GPIO_EVENT = 0x15, + MLX5_EVENT_TYPE_REMOTE_CONFIG = 0x19, + + MLX5_EVENT_TYPE_DB_BF_CONGESTION = 0x1a, + MLX5_EVENT_TYPE_STALL_EVENT = 0x1b, + + MLX5_EVENT_TYPE_CMD = 0x0a, + MLX5_EVENT_TYPE_PAGE_REQUEST = 0xb, +}; + +enum { + MLX5_PORT_CHANGE_SUBTYPE_DOWN = 1, + MLX5_PORT_CHANGE_SUBTYPE_ACTIVE = 4, + MLX5_PORT_CHANGE_SUBTYPE_INITIALIZED = 5, + MLX5_PORT_CHANGE_SUBTYPE_LID = 6, + MLX5_PORT_CHANGE_SUBTYPE_PKEY = 7, + MLX5_PORT_CHANGE_SUBTYPE_GUID = 8, + MLX5_PORT_CHANGE_SUBTYPE_CLIENT_REREG = 9, +}; + +enum { + MLX5_DEV_CAP_FLAG_RC = 1LL << 0, + MLX5_DEV_CAP_FLAG_UC = 1LL << 1, + MLX5_DEV_CAP_FLAG_UD = 1LL << 2, + MLX5_DEV_CAP_FLAG_XRC = 1LL << 3, + MLX5_DEV_CAP_FLAG_SRQ = 1LL << 6, + MLX5_DEV_CAP_FLAG_BAD_PKEY_CNTR = 1LL << 8, + MLX5_DEV_CAP_FLAG_BAD_QKEY_CNTR = 1LL << 9, + MLX5_DEV_CAP_FLAG_APM = 1LL << 17, + MLX5_DEV_CAP_FLAG_ATOMIC = 1LL << 18, + MLX5_DEV_CAP_FLAG_BLOCK_MCAST = 1LL << 23, + MLX5_DEV_CAP_FLAG_ON_DMND_PG = 1LL << 24, + MLX5_DEV_CAP_FLAG_CQ_MODER = 1LL << 29, + MLX5_DEV_CAP_FLAG_RESIZE_CQ = 1LL << 30, + MLX5_DEV_CAP_FLAG_RESIZE_SRQ = 1LL << 32, + MLX5_DEV_CAP_FLAG_DCT = 1LL << 37, + MLX5_DEV_CAP_FLAG_REMOTE_FENCE = 1LL << 38, + MLX5_DEV_CAP_FLAG_TLP_HINTS = 1LL << 39, + MLX5_DEV_CAP_FLAG_SIG_HAND_OVER = 1LL << 40, + MLX5_DEV_CAP_FLAG_CMDIF_CSUM = 3LL << 46, +}; + +enum { + MLX5_OPCODE_NOP = 0x00, + MLX5_OPCODE_SEND_INVAL = 0x01, + MLX5_OPCODE_RDMA_WRITE = 0x08, + MLX5_OPCODE_RDMA_WRITE_IMM = 0x09, + MLX5_OPCODE_SEND = 0x0a, + MLX5_OPCODE_SEND_IMM = 0x0b, + MLX5_OPCODE_RDMA_READ = 0x10, + MLX5_OPCODE_ATOMIC_CS = 0x11, + MLX5_OPCODE_ATOMIC_FA = 0x12, + MLX5_OPCODE_ATOMIC_MASKED_CS = 0x14, + MLX5_OPCODE_ATOMIC_MASKED_FA = 0x15, + MLX5_OPCODE_BIND_MW = 0x18, + MLX5_OPCODE_CONFIG_CMD = 0x1f, + + MLX5_RECV_OPCODE_RDMA_WRITE_IMM = 0x00, + MLX5_RECV_OPCODE_SEND = 0x01, + MLX5_RECV_OPCODE_SEND_IMM = 0x02, + MLX5_RECV_OPCODE_SEND_INVAL = 0x03, + + MLX5_CQE_OPCODE_ERROR = 0x1e, + MLX5_CQE_OPCODE_RESIZE = 0x16, + + MLX5_OPCODE_SET_PSV = 0x20, + MLX5_OPCODE_GET_PSV = 0x21, + MLX5_OPCODE_CHECK_PSV = 0x22, + MLX5_OPCODE_RGET_PSV = 0x26, + MLX5_OPCODE_RCHECK_PSV = 0x27, + + MLX5_OPCODE_UMR = 0x25, + +}; + +enum { + MLX5_SET_PORT_RESET_QKEY = 0, + MLX5_SET_PORT_GUID0 = 16, + MLX5_SET_PORT_NODE_GUID = 17, + MLX5_SET_PORT_SYS_GUID = 18, + MLX5_SET_PORT_GID_TABLE = 19, + MLX5_SET_PORT_PKEY_TABLE = 20, +}; + +enum { + MLX5_MAX_PAGE_SHIFT = 31 +}; + +enum { + MLX5_ADAPTER_PAGE_SHIFT = 12, + MLX5_ADAPTER_PAGE_SIZE = 1 << MLX5_ADAPTER_PAGE_SHIFT, +}; + +enum { + MLX5_CAP_OFF_CMDIF_CSUM = 46, +}; + +enum { + HCA_CAP_OPMOD_GET_MAX = 0, + HCA_CAP_OPMOD_GET_CUR = 1, +}; + +struct mlx5_inbox_hdr { + __be16 opcode; + u8 rsvd[4]; + __be16 opmod; +}; + +struct mlx5_outbox_hdr { + u8 status; + u8 rsvd[3]; + __be32 syndrome; +}; + +struct mlx5_cmd_query_adapter_mbox_in { + struct mlx5_inbox_hdr hdr; + u8 rsvd[8]; +}; + +struct mlx5_cmd_query_adapter_mbox_out { + struct mlx5_outbox_hdr hdr; + u8 rsvd0[24]; + u8 intapin; + u8 rsvd1[13]; + __be16 vsd_vendor_id; + u8 vsd[208]; + u8 vsd_psid[16]; +}; + +struct mlx5_cmd_init_hca_mbox_in { + struct mlx5_inbox_hdr hdr; + u8 rsvd0[2]; + __be16 profile; + u8 rsvd1[4]; +}; + +struct mlx5_cmd_init_hca_mbox_out { + struct mlx5_outbox_hdr hdr; + u8 rsvd[8]; +}; + +struct mlx5_cmd_teardown_hca_mbox_in { + struct mlx5_inbox_hdr hdr; + u8 rsvd0[2]; + __be16 profile; + u8 rsvd1[4]; +}; + +struct mlx5_cmd_teardown_hca_mbox_out { + struct mlx5_outbox_hdr hdr; + u8 rsvd[8]; +}; + +struct mlx5_cmd_layout { + u8 type; + u8 rsvd0[3]; + __be32 inlen; + __be64 in_ptr; + __be32 in[4]; + __be32 out[4]; + __be64 out_ptr; + __be32 outlen; + u8 token; + u8 sig; + u8 rsvd1; + u8 status_own; +}; + + +struct health_buffer { + __be32 assert_var[5]; + __be32 rsvd0[3]; + __be32 assert_exit_ptr; + __be32 assert_callra; + __be32 rsvd1[2]; + __be32 fw_ver; + __be32 hw_id; + __be32 rsvd2; + u8 irisc_index; + u8 synd; + __be16 ext_sync; +}; + +struct mlx5_init_seg { + __be32 fw_rev; + __be32 cmdif_rev_fw_sub; + __be32 rsvd0[2]; + __be32 cmdq_addr_h; + __be32 cmdq_addr_l_sz; + __be32 cmd_dbell; + __be32 rsvd1[121]; + struct health_buffer health; + __be32 rsvd2[884]; + __be32 health_counter; + __be32 rsvd3[1019]; + __be64 ieee1588_clk; + __be32 ieee1588_clk_type; + __be32 clr_intx; +}; + +struct mlx5_eqe_comp { + __be32 reserved[6]; + __be32 cqn; +}; + +struct mlx5_eqe_qp_srq { + __be32 reserved[6]; + __be32 qp_srq_n; +}; + +struct mlx5_eqe_cq_err { + __be32 cqn; + u8 reserved1[7]; + u8 syndrome; +}; + +struct mlx5_eqe_port_state { + u8 reserved0[8]; + u8 port; +}; + +struct mlx5_eqe_gpio { + __be32 reserved0[2]; + __be64 gpio_event; +}; + +struct mlx5_eqe_congestion { + u8 type; + u8 rsvd0; + u8 congestion_level; +}; + +struct mlx5_eqe_stall_vl { + u8 rsvd0[3]; + u8 port_vl; +}; + +struct mlx5_eqe_cmd { + __be32 vector; + __be32 rsvd[6]; +}; + +struct mlx5_eqe_page_req { + u8 rsvd0[2]; + __be16 func_id; + __be32 num_pages; + __be32 rsvd1[5]; +}; + +union ev_data { + __be32 raw[7]; + struct mlx5_eqe_cmd cmd; + struct mlx5_eqe_comp comp; + struct mlx5_eqe_qp_srq qp_srq; + struct mlx5_eqe_cq_err cq_err; + struct mlx5_eqe_port_state port; + struct mlx5_eqe_gpio gpio; + struct mlx5_eqe_congestion cong; + struct mlx5_eqe_stall_vl stall_vl; + struct mlx5_eqe_page_req req_pages; +} __packed; + +struct mlx5_eqe { + u8 rsvd0; + u8 type; + u8 rsvd1; + u8 sub_type; + __be32 rsvd2[7]; + union ev_data data; + __be16 rsvd3; + u8 signature; + u8 owner; +} __packed; + +struct mlx5_cmd_prot_block { + u8 data[MLX5_CMD_DATA_BLOCK_SIZE]; + u8 rsvd0[48]; + __be64 next; + __be32 block_num; + u8 rsvd1; + u8 token; + u8 ctrl_sig; + u8 sig; +}; + +struct mlx5_err_cqe { + u8 rsvd0[32]; + __be32 srqn; + u8 rsvd1[18]; + u8 vendor_err_synd; + u8 syndrome; + __be32 s_wqe_opcode_qpn; + __be16 wqe_counter; + u8 signature; + u8 op_own; +}; + +struct mlx5_cqe64 { + u8 rsvd0[17]; + u8 ml_path; + u8 rsvd20[4]; + __be16 slid; + __be32 flags_rqpn; + u8 rsvd28[4]; + __be32 srqn; + __be32 imm_inval_pkey; + u8 rsvd40[4]; + __be32 byte_cnt; + __be64 timestamp; + __be32 sop_drop_qpn; + __be16 wqe_counter; + u8 signature; + u8 op_own; +}; + +struct mlx5_sig_err_cqe { + u8 rsvd0[16]; + __be32 expected_trans_sig; + __be32 actual_trans_sig; + __be32 expected_reftag; + __be32 actual_reftag; + __be16 syndrome; + u8 rsvd22[2]; + __be32 mkey; + __be64 err_offset; + u8 rsvd30[8]; + __be32 qpn; + u8 rsvd38[2]; + u8 signature; + u8 op_own; +}; + +struct mlx5_wqe_srq_next_seg { + u8 rsvd0[2]; + __be16 next_wqe_index; + u8 signature; + u8 rsvd1[11]; +}; + +union mlx5_ext_cqe { + struct ib_grh grh; + u8 inl[64]; +}; + +struct mlx5_cqe128 { + union mlx5_ext_cqe inl_grh; + struct mlx5_cqe64 cqe64; +}; + +struct mlx5_srq_ctx { + u8 state_log_sz; + u8 rsvd0[3]; + __be32 flags_xrcd; + __be32 pgoff_cqn; + u8 rsvd1[4]; + u8 log_pg_sz; + u8 rsvd2[7]; + __be32 pd; + __be16 lwm; + __be16 wqe_cnt; + u8 rsvd3[8]; + __be64 db_record; +}; + +struct mlx5_create_srq_mbox_in { + struct mlx5_inbox_hdr hdr; + __be32 input_srqn; + u8 rsvd0[4]; + struct mlx5_srq_ctx ctx; + u8 rsvd1[208]; + __be64 pas[0]; +}; + +struct mlx5_create_srq_mbox_out { + struct mlx5_outbox_hdr hdr; + __be32 srqn; + u8 rsvd[4]; +}; + +struct mlx5_destroy_srq_mbox_in { + struct mlx5_inbox_hdr hdr; + __be32 srqn; + u8 rsvd[4]; +}; + +struct mlx5_destroy_srq_mbox_out { + struct mlx5_outbox_hdr hdr; + u8 rsvd[8]; +}; + +struct mlx5_query_srq_mbox_in { + struct mlx5_inbox_hdr hdr; + __be32 srqn; + u8 rsvd0[4]; +}; + +struct mlx5_query_srq_mbox_out { + struct mlx5_outbox_hdr hdr; + u8 rsvd0[8]; + struct mlx5_srq_ctx ctx; + u8 rsvd1[32]; + __be64 pas[0]; +}; + +struct mlx5_arm_srq_mbox_in { + struct mlx5_inbox_hdr hdr; + __be32 srqn; + __be16 rsvd; + __be16 lwm; +}; + +struct mlx5_arm_srq_mbox_out { + struct mlx5_outbox_hdr hdr; + u8 rsvd[8]; +}; + +struct mlx5_cq_context { + u8 status; + u8 cqe_sz_flags; + u8 st; + u8 rsvd3; + u8 rsvd4[6]; + __be16 page_offset; + __be32 log_sz_usr_page; + __be16 cq_period; + __be16 cq_max_count; + __be16 rsvd20; + __be16 c_eqn; + u8 log_pg_sz; + u8 rsvd25[7]; + __be32 last_notified_index; + __be32 solicit_producer_index; + __be32 consumer_counter; + __be32 producer_counter; + u8 rsvd48[8]; + __be64 db_record_addr; +}; + +struct mlx5_create_cq_mbox_in { + struct mlx5_inbox_hdr hdr; + __be32 input_cqn; + u8 rsvdx[4]; + struct mlx5_cq_context ctx; + u8 rsvd6[192]; + __be64 pas[0]; +}; + +struct mlx5_create_cq_mbox_out { + struct mlx5_outbox_hdr hdr; + __be32 cqn; + u8 rsvd0[4]; +}; + +struct mlx5_destroy_cq_mbox_in { + struct mlx5_inbox_hdr hdr; + __be32 cqn; + u8 rsvd0[4]; +}; + +struct mlx5_destroy_cq_mbox_out { + struct mlx5_outbox_hdr hdr; + u8 rsvd0[8]; +}; + +struct mlx5_query_cq_mbox_in { + struct mlx5_inbox_hdr hdr; + __be32 cqn; + u8 rsvd0[4]; +}; + +struct mlx5_query_cq_mbox_out { + struct mlx5_outbox_hdr hdr; + u8 rsvd0[8]; + struct mlx5_cq_context ctx; + u8 rsvd6[16]; + __be64 pas[0]; +}; + +struct mlx5_modify_cq_mbox_in { + struct mlx5_inbox_hdr hdr; + __be32 cqn; + __be32 field_select; + struct mlx5_cq_context ctx; + u8 rsvd[192]; + __be64 pas[0]; +}; + +struct mlx5_modify_cq_mbox_out { + struct mlx5_outbox_hdr hdr; + u8 rsvd[8]; +}; + +struct mlx5_enable_hca_mbox_in { + struct mlx5_inbox_hdr hdr; + u8 rsvd[8]; +}; + +struct mlx5_enable_hca_mbox_out { + struct mlx5_outbox_hdr hdr; + u8 rsvd[8]; +}; + +struct mlx5_disable_hca_mbox_in { + struct mlx5_inbox_hdr hdr; + u8 rsvd[8]; +}; + +struct mlx5_disable_hca_mbox_out { + struct mlx5_outbox_hdr hdr; + u8 rsvd[8]; +}; + +struct mlx5_eq_context { + u8 status; + u8 ec_oi; + u8 st; + u8 rsvd2[7]; + __be16 page_pffset; + __be32 log_sz_usr_page; + u8 rsvd3[7]; + u8 intr; + u8 log_page_size; + u8 rsvd4[15]; + __be32 consumer_counter; + __be32 produser_counter; + u8 rsvd5[16]; +}; + +struct mlx5_create_eq_mbox_in { + struct mlx5_inbox_hdr hdr; + u8 rsvd0[3]; + u8 input_eqn; + u8 rsvd1[4]; + struct mlx5_eq_context ctx; + u8 rsvd2[8]; + __be64 events_mask; + u8 rsvd3[176]; + __be64 pas[0]; +}; + +struct mlx5_create_eq_mbox_out { + struct mlx5_outbox_hdr hdr; + u8 rsvd0[3]; + u8 eq_number; + u8 rsvd1[4]; +}; + +struct mlx5_destroy_eq_mbox_in { + struct mlx5_inbox_hdr hdr; + u8 rsvd0[3]; + u8 eqn; + u8 rsvd1[4]; +}; + +struct mlx5_destroy_eq_mbox_out { + struct mlx5_outbox_hdr hdr; + u8 rsvd[8]; +}; + +struct mlx5_map_eq_mbox_in { + struct mlx5_inbox_hdr hdr; + __be64 mask; + u8 mu; + u8 rsvd0[2]; + u8 eqn; + u8 rsvd1[24]; +}; + +struct mlx5_map_eq_mbox_out { + struct mlx5_outbox_hdr hdr; + u8 rsvd[8]; +}; + +struct mlx5_query_eq_mbox_in { + struct mlx5_inbox_hdr hdr; + u8 rsvd0[3]; + u8 eqn; + u8 rsvd1[4]; +}; + +struct mlx5_query_eq_mbox_out { + struct mlx5_outbox_hdr hdr; + u8 rsvd[8]; + struct mlx5_eq_context ctx; +}; + +struct mlx5_mkey_seg { + /* This is a two bit field occupying bits 31-30. + * bit 31 is always 0, + * bit 30 is zero for regular MRs and 1 (e.g free) for UMRs that do not have tanslation + */ + u8 status; + u8 pcie_control; + u8 flags; + u8 version; + __be32 qpn_mkey7_0; + u8 rsvd1[4]; + __be32 flags_pd; + __be64 start_addr; + __be64 len; + __be32 bsfs_octo_size; + u8 rsvd2[16]; + __be32 xlt_oct_size; + u8 rsvd3[3]; + u8 log2_page_size; + u8 rsvd4[4]; +}; + +struct mlx5_query_special_ctxs_mbox_in { + struct mlx5_inbox_hdr hdr; + u8 rsvd[8]; +}; + +struct mlx5_query_special_ctxs_mbox_out { + struct mlx5_outbox_hdr hdr; + __be32 dump_fill_mkey; + __be32 reserved_lkey; +}; + +struct mlx5_create_mkey_mbox_in { + struct mlx5_inbox_hdr hdr; + __be32 input_mkey_index; + u8 rsvd0[4]; + struct mlx5_mkey_seg seg; + u8 rsvd1[16]; + __be32 xlat_oct_act_size; + __be32 rsvd2; + u8 rsvd3[168]; + __be64 pas[0]; +}; + +struct mlx5_create_mkey_mbox_out { + struct mlx5_outbox_hdr hdr; + __be32 mkey; + u8 rsvd[4]; +}; + +struct mlx5_destroy_mkey_mbox_in { + struct mlx5_inbox_hdr hdr; + __be32 mkey; + u8 rsvd[4]; +}; + +struct mlx5_destroy_mkey_mbox_out { + struct mlx5_outbox_hdr hdr; + u8 rsvd[8]; +}; + +struct mlx5_query_mkey_mbox_in { + struct mlx5_inbox_hdr hdr; + __be32 mkey; +}; + +struct mlx5_query_mkey_mbox_out { + struct mlx5_outbox_hdr hdr; + __be64 pas[0]; +}; + +struct mlx5_modify_mkey_mbox_in { + struct mlx5_inbox_hdr hdr; + __be32 mkey; + __be64 pas[0]; +}; + +struct mlx5_modify_mkey_mbox_out { + struct mlx5_outbox_hdr hdr; + u8 rsvd[8]; +}; + +struct mlx5_dump_mkey_mbox_in { + struct mlx5_inbox_hdr hdr; +}; + +struct mlx5_dump_mkey_mbox_out { + struct mlx5_outbox_hdr hdr; + __be32 mkey; +}; + +struct mlx5_mad_ifc_mbox_in { + struct mlx5_inbox_hdr hdr; + __be16 remote_lid; + u8 rsvd0; + u8 port; + u8 rsvd1[4]; + u8 data[256]; +}; + +struct mlx5_mad_ifc_mbox_out { + struct mlx5_outbox_hdr hdr; + u8 rsvd[8]; + u8 data[256]; +}; + +struct mlx5_access_reg_mbox_in { + struct mlx5_inbox_hdr hdr; + u8 rsvd0[2]; + __be16 register_id; + __be32 arg; + __be32 data[0]; +}; + +struct mlx5_access_reg_mbox_out { + struct mlx5_outbox_hdr hdr; + u8 rsvd[8]; + __be32 data[0]; +}; + +#define MLX5_ATTR_EXTENDED_PORT_INFO cpu_to_be16(0xff90) + +enum { + MLX_EXT_PORT_CAP_FLAG_EXTENDED_PORT_INFO = 1 << 0 +}; + +struct mlx5_allocate_psv_in { + struct mlx5_inbox_hdr hdr; + __be32 npsv_pd; + __be32 rsvd_psv0; +}; + +struct mlx5_allocate_psv_out { + struct mlx5_outbox_hdr hdr; + u8 rsvd[8]; + __be32 psv_idx[4]; +}; + +struct mlx5_destroy_psv_in { + struct mlx5_inbox_hdr hdr; + __be32 psv_number; + u8 rsvd[4]; +}; + +struct mlx5_destroy_psv_out { + struct mlx5_outbox_hdr hdr; + u8 rsvd[8]; +}; + +#endif /* MLX5_DEVICE_H */ diff --git a/include/linux/mlx5/doorbell.h b/include/linux/mlx5/doorbell.h new file mode 100644 index 0000000..163a818 --- /dev/null +++ b/include/linux/mlx5/doorbell.h @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2013, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef MLX5_DOORBELL_H +#define MLX5_DOORBELL_H + +#define MLX5_BF_OFFSET 0x800 +#define MLX5_CQ_DOORBELL 0x20 + +#if BITS_PER_LONG == 64 +/* Assume that we can just write a 64-bit doorbell atomically. s390 + * actually doesn't have writeq() but S/390 systems don't even have + * PCI so we won't worry about it. + */ + +#define MLX5_DECLARE_DOORBELL_LOCK(name) +#define MLX5_INIT_DOORBELL_LOCK(ptr) do { } while (0) +#define MLX5_GET_DOORBELL_LOCK(ptr) (NULL) + +static inline void mlx5_write64(__be32 val[2], void __iomem *dest, + spinlock_t *doorbell_lock) +{ + __raw_writeq(*(u64 *)val, dest); +} + +#else + +/* Just fall back to a spinlock to protect the doorbell if + * BITS_PER_LONG is 32 -- there's no portable way to do atomic 64-bit + * MMIO writes. + */ + +#define MLX5_DECLARE_DOORBELL_LOCK(name) spinlock_t name; +#define MLX5_INIT_DOORBELL_LOCK(ptr) spin_lock_init(ptr) +#define MLX5_GET_DOORBELL_LOCK(ptr) (ptr) + +static inline void mlx5_write64(__be32 val[2], void __iomem *dest, + spinlock_t *doorbell_lock) +{ + unsigned long flags; + + spin_lock_irqsave(doorbell_lock, flags); + __raw_writel((__force u32) val[0], dest); + __raw_writel((__force u32) val[1], dest + 4); + spin_unlock_irqrestore(doorbell_lock, flags); +} + +#endif + +#endif /* MLX5_DOORBELL_H */ diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h new file mode 100644 index 0000000..246310d --- /dev/null +++ b/include/linux/mlx5/driver.h @@ -0,0 +1,796 @@ +/* + * Copyright (c) 2013, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef MLX5_DRIVER_H +#define MLX5_DRIVER_H + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +enum { + MLX5_BOARD_ID_LEN = 64, + MLX5_MAX_NAME_LEN = 16, +}; + +enum { + /* one minute for the sake of bringup. Generally, commands must always + * complete and we may need to increase this timeout value + */ + MLX5_CMD_TIMEOUT_MSEC = 7200 * 1000, + MLX5_CMD_WQ_MAX_NAME = 32, +}; + +enum { + CMD_OWNER_SW = 0x0, + CMD_OWNER_HW = 0x1, + CMD_STATUS_SUCCESS = 0, +}; + +enum mlx5_sqp_t { + MLX5_SQP_SMI = 0, + MLX5_SQP_GSI = 1, + MLX5_SQP_IEEE_1588 = 2, + MLX5_SQP_SNIFFER = 3, + MLX5_SQP_SYNC_UMR = 4, +}; + +enum { + MLX5_MAX_PORTS = 2, +}; + +enum { + MLX5_EQ_VEC_PAGES = 0, + MLX5_EQ_VEC_CMD = 1, + MLX5_EQ_VEC_ASYNC = 2, + MLX5_EQ_VEC_COMP_BASE, +}; + +enum { + MLX5_MAX_EQ_NAME = 32 +}; + +enum { + MLX5_ATOMIC_MODE_IB_COMP = 1 << 16, + MLX5_ATOMIC_MODE_CX = 2 << 16, + MLX5_ATOMIC_MODE_8B = 3 << 16, + MLX5_ATOMIC_MODE_16B = 4 << 16, + MLX5_ATOMIC_MODE_32B = 5 << 16, + MLX5_ATOMIC_MODE_64B = 6 << 16, + MLX5_ATOMIC_MODE_128B = 7 << 16, + MLX5_ATOMIC_MODE_256B = 8 << 16, +}; + +enum { + MLX5_REG_PCAP = 0x5001, + MLX5_REG_PMTU = 0x5003, + MLX5_REG_PTYS = 0x5004, + MLX5_REG_PAOS = 0x5006, + MLX5_REG_PMAOS = 0x5012, + MLX5_REG_PUDE = 0x5009, + MLX5_REG_PMPE = 0x5010, + MLX5_REG_PELC = 0x500e, + MLX5_REG_PMLP = 0, /* TBD */ + MLX5_REG_NODE_DESC = 0x6001, + MLX5_REG_HOST_ENDIANNESS = 0x7004, +}; + +enum dbg_rsc_type { + MLX5_DBG_RSC_QP, + MLX5_DBG_RSC_EQ, + MLX5_DBG_RSC_CQ, +}; + +struct mlx5_field_desc { + struct dentry *dent; + int i; +}; + +struct mlx5_rsc_debug { + struct mlx5_core_dev *dev; + void *object; + enum dbg_rsc_type type; + struct dentry *root; + struct mlx5_field_desc fields[0]; +}; + +enum mlx5_dev_event { + MLX5_DEV_EVENT_SYS_ERROR, + MLX5_DEV_EVENT_PORT_UP, + MLX5_DEV_EVENT_PORT_DOWN, + MLX5_DEV_EVENT_PORT_INITIALIZED, + MLX5_DEV_EVENT_LID_CHANGE, + MLX5_DEV_EVENT_PKEY_CHANGE, + MLX5_DEV_EVENT_GUID_CHANGE, + MLX5_DEV_EVENT_CLIENT_REREG, +}; + +struct mlx5_uuar_info { + struct mlx5_uar *uars; + int num_uars; + int num_low_latency_uuars; + unsigned long *bitmap; + unsigned int *count; + struct mlx5_bf *bfs; + + /* + * protect uuar allocation data structs + */ + struct mutex lock; + u32 ver; +}; + +struct mlx5_bf { + void __iomem *reg; + void __iomem *regreg; + int buf_size; + struct mlx5_uar *uar; + unsigned long offset; + int need_lock; + /* protect blue flame buffer selection when needed + */ + spinlock_t lock; + + /* serialize 64 bit writes when done as two 32 bit accesses + */ + spinlock_t lock32; + int uuarn; +}; + +struct mlx5_cmd_first { + __be32 data[4]; +}; + +struct mlx5_cmd_msg { + struct list_head list; + struct cache_ent *cache; + u32 len; + struct mlx5_cmd_first first; + struct mlx5_cmd_mailbox *next; +}; + +struct mlx5_cmd_debug { + struct dentry *dbg_root; + struct dentry *dbg_in; + struct dentry *dbg_out; + struct dentry *dbg_outlen; + struct dentry *dbg_status; + struct dentry *dbg_run; + void *in_msg; + void *out_msg; + u8 status; + u16 inlen; + u16 outlen; +}; + +struct cache_ent { + /* protect block chain allocations + */ + spinlock_t lock; + struct list_head head; +}; + +struct cmd_msg_cache { + struct cache_ent large; + struct cache_ent med; + +}; + +struct mlx5_cmd_stats { + u64 sum; + u64 n; + struct dentry *root; + struct dentry *avg; + struct dentry *count; + /* protect command average calculations */ + spinlock_t lock; +}; + +struct mlx5_cmd { + void *cmd_buf; + dma_addr_t dma; + u16 cmdif_rev; + u8 log_sz; + u8 log_stride; + int max_reg_cmds; + int events; + u32 __iomem *vector; + + /* protect command queue allocations + */ + spinlock_t alloc_lock; + + /* protect token allocations + */ + spinlock_t token_lock; + u8 token; + unsigned long bitmask; + char wq_name[MLX5_CMD_WQ_MAX_NAME]; + struct workqueue_struct *wq; + struct semaphore sem; + struct semaphore pages_sem; + int mode; + struct mlx5_cmd_work_ent *ent_arr[MLX5_MAX_COMMANDS]; + struct pci_pool *pool; + struct mlx5_cmd_debug dbg; + struct cmd_msg_cache cache; + int checksum_disabled; + struct mlx5_cmd_stats stats[MLX5_CMD_OP_MAX]; +}; + +struct mlx5_port_caps { + int gid_table_len; + int pkey_table_len; +}; + +struct mlx5_general_caps { + u8 log_max_eq; + u8 log_max_cq; + u8 log_max_qp; + u8 log_max_mkey; + u8 log_max_pd; + u8 log_max_srq; + u8 log_max_strq; + u8 log_max_mrw_sz; + u8 log_max_bsf_list_size; + u8 log_max_klm_list_size; + u32 max_cqes; + int max_wqes; + u32 max_eqes; + u32 max_indirection; + int max_sq_desc_sz; + int max_rq_desc_sz; + int max_dc_sq_desc_sz; + u64 flags; + u16 stat_rate_support; + int log_max_msg; + int num_ports; + u8 log_max_ra_res_qp; + u8 log_max_ra_req_qp; + int max_srq_wqes; + int bf_reg_size; + int bf_regs_per_page; + struct mlx5_port_caps port[MLX5_MAX_PORTS]; + u8 ext_port_cap[MLX5_MAX_PORTS]; + int max_vf; + u32 reserved_lkey; + u8 local_ca_ack_delay; + u8 log_max_mcg; + u32 max_qp_mcg; + int min_page_sz; + int pd_cap; + u32 max_qp_counters; + u32 pkey_table_size; + u8 log_max_ra_req_dc; + u8 log_max_ra_res_dc; + u32 uar_sz; + u8 min_log_pg_sz; + u8 log_max_xrcd; + u16 log_uar_page_sz; +}; + +struct mlx5_caps { + struct mlx5_general_caps gen; +}; + +struct mlx5_cmd_mailbox { + void *buf; + dma_addr_t dma; + struct mlx5_cmd_mailbox *next; +}; + +struct mlx5_buf_list { + void *buf; + dma_addr_t map; +}; + +struct mlx5_buf { + struct mlx5_buf_list direct; + struct mlx5_buf_list *page_list; + int nbufs; + int npages; + int size; + u8 page_shift; +}; + +struct mlx5_eq { + struct mlx5_core_dev *dev; + __be32 __iomem *doorbell; + u32 cons_index; + struct mlx5_buf buf; + int size; + u8 irqn; + u8 eqn; + int nent; + u64 mask; + char name[MLX5_MAX_EQ_NAME]; + struct list_head list; + int index; + struct mlx5_rsc_debug *dbg; +}; + +struct mlx5_core_psv { + u32 psv_idx; + struct psv_layout { + u32 pd; + u16 syndrome; + u16 reserved; + u16 bg; + u16 app_tag; + u32 ref_tag; + } psv; +}; + +struct mlx5_core_sig_ctx { + struct mlx5_core_psv psv_memory; + struct mlx5_core_psv psv_wire; + struct ib_sig_err err_item; + bool sig_status_checked; + bool sig_err_exists; + u32 sigerr_count; +}; + +struct mlx5_core_mr { + u64 iova; + u64 size; + u32 key; + u32 pd; +}; + +enum mlx5_res_type { + MLX5_RES_QP, +}; + +struct mlx5_core_rsc_common { + enum mlx5_res_type res; + atomic_t refcount; + struct completion free; +}; + +struct mlx5_core_srq { + u32 srqn; + int max; + int max_gs; + int max_avail_gather; + int wqe_shift; + void (*event) (struct mlx5_core_srq *, enum mlx5_event); + + atomic_t refcount; + struct completion free; +}; + +struct mlx5_eq_table { + void __iomem *update_ci; + void __iomem *update_arm_ci; + struct list_head *comp_eq_head; + struct mlx5_eq pages_eq; + struct mlx5_eq async_eq; + struct mlx5_eq cmd_eq; + struct msix_entry *msix_arr; + int num_comp_vectors; + /* protect EQs list + */ + spinlock_t lock; +}; + +struct mlx5_uar { + u32 index; + struct list_head bf_list; + unsigned free_bf_bmap; + void __iomem *wc_map; + void __iomem *map; +}; + + +struct mlx5_core_health { + struct health_buffer __iomem *health; + __be32 __iomem *health_counter; + struct timer_list timer; + struct list_head list; + u32 prev; + int miss_counter; +}; + +struct mlx5_cq_table { + /* protect radix tree + */ + spinlock_t lock; + struct radix_tree_root tree; +}; + +struct mlx5_qp_table { + /* protect radix tree + */ + spinlock_t lock; + struct radix_tree_root tree; +}; + +struct mlx5_srq_table { + /* protect radix tree + */ + spinlock_t lock; + struct radix_tree_root tree; +}; + +struct mlx5_mr_table { + /* protect radix tree + */ + rwlock_t lock; + struct radix_tree_root tree; +}; + +struct mlx5_priv { + char name[MLX5_MAX_NAME_LEN]; + struct mlx5_eq_table eq_table; + struct mlx5_uuar_info uuari; + MLX5_DECLARE_DOORBELL_LOCK(cq_uar_lock); + + /* pages stuff */ + struct workqueue_struct *pg_wq; + struct rb_root page_root; + int fw_pages; + int reg_pages; + struct list_head free_list; + + struct mlx5_core_health health; + + struct mlx5_srq_table srq_table; + + /* start: qp staff */ + struct mlx5_qp_table qp_table; + struct dentry *qp_debugfs; + struct dentry *eq_debugfs; + struct dentry *cq_debugfs; + struct dentry *cmdif_debugfs; + /* end: qp staff */ + + /* start: cq staff */ + struct mlx5_cq_table cq_table; + /* end: cq staff */ + + /* start: mr staff */ + struct mlx5_mr_table mr_table; + /* end: mr staff */ + + /* start: alloc staff */ + struct mutex pgdir_mutex; + struct list_head pgdir_list; + /* end: alloc staff */ + struct dentry *dbg_root; + + /* protect mkey key part */ + spinlock_t mkey_lock; + u8 mkey_key; + + struct list_head dev_list; + struct list_head ctx_list; + spinlock_t ctx_lock; +}; + +struct mlx5_core_dev { + struct pci_dev *pdev; + u8 rev_id; + char board_id[MLX5_BOARD_ID_LEN]; + struct mlx5_cmd cmd; + struct mlx5_caps caps; + phys_addr_t iseg_base; + struct mlx5_init_seg __iomem *iseg; + void (*event) (struct mlx5_core_dev *dev, + enum mlx5_dev_event event, + unsigned long param); + struct mlx5_priv priv; + struct mlx5_profile *profile; + atomic_t num_qps; +}; + +struct mlx5_db { + __be32 *db; + union { + struct mlx5_db_pgdir *pgdir; + struct mlx5_ib_user_db_page *user_page; + } u; + dma_addr_t dma; + int index; +}; + +enum { + MLX5_DB_PER_PAGE = PAGE_SIZE / L1_CACHE_BYTES, +}; + +enum { + MLX5_COMP_EQ_SIZE = 1024, +}; + +struct mlx5_db_pgdir { + struct list_head list; + DECLARE_BITMAP(bitmap, MLX5_DB_PER_PAGE); + __be32 *db_page; + dma_addr_t db_dma; +}; + +typedef void (*mlx5_cmd_cbk_t)(int status, void *context); + +struct mlx5_cmd_work_ent { + struct mlx5_cmd_msg *in; + struct mlx5_cmd_msg *out; + void *uout; + int uout_size; + mlx5_cmd_cbk_t callback; + void *context; + int idx; + struct completion done; + struct mlx5_cmd *cmd; + struct work_struct work; + struct mlx5_cmd_layout *lay; + int ret; + int page_queue; + u8 status; + u8 token; + u64 ts1; + u64 ts2; + u16 op; +}; + +struct mlx5_pas { + u64 pa; + u8 log_sz; +}; + +static inline void *mlx5_buf_offset(struct mlx5_buf *buf, int offset) +{ + if (likely(BITS_PER_LONG == 64 || buf->nbufs == 1)) + return buf->direct.buf + offset; + else + return buf->page_list[offset >> PAGE_SHIFT].buf + + (offset & (PAGE_SIZE - 1)); +} + +extern struct workqueue_struct *mlx5_core_wq; + +#define STRUCT_FIELD(header, field) \ + .struct_offset_bytes = offsetof(struct ib_unpacked_ ## header, field), \ + .struct_size_bytes = sizeof((struct ib_unpacked_ ## header *)0)->field + +struct ib_field { + size_t struct_offset_bytes; + size_t struct_size_bytes; + int offset_bits; + int size_bits; +}; + +static inline struct mlx5_core_dev *pci2mlx5_core_dev(struct pci_dev *pdev) +{ + return pci_get_drvdata(pdev); +} + +extern struct dentry *mlx5_debugfs_root; + +static inline u16 fw_rev_maj(struct mlx5_core_dev *dev) +{ + return ioread32be(&dev->iseg->fw_rev) & 0xffff; +} + +static inline u16 fw_rev_min(struct mlx5_core_dev *dev) +{ + return ioread32be(&dev->iseg->fw_rev) >> 16; +} + +static inline u16 fw_rev_sub(struct mlx5_core_dev *dev) +{ + return ioread32be(&dev->iseg->cmdif_rev_fw_sub) & 0xffff; +} + +static inline u16 cmdif_rev(struct mlx5_core_dev *dev) +{ + return ioread32be(&dev->iseg->cmdif_rev_fw_sub) >> 16; +} + +static inline void *mlx5_vzalloc(unsigned long size) +{ + void *rtn; + + rtn = kzalloc(size, GFP_KERNEL | __GFP_NOWARN); + if (!rtn) + rtn = vzalloc(size); + return rtn; +} + +static inline void mlx5_vfree(const void *addr) +{ + if (addr && is_vmalloc_addr(addr)) + vfree(addr); + else + kfree(addr); +} + +static inline u32 mlx5_base_mkey(const u32 key) +{ + return key & 0xffffff00u; +} + +int mlx5_cmd_init(struct mlx5_core_dev *dev); +void mlx5_cmd_cleanup(struct mlx5_core_dev *dev); +void mlx5_cmd_use_events(struct mlx5_core_dev *dev); +void mlx5_cmd_use_polling(struct mlx5_core_dev *dev); +int mlx5_cmd_status_to_err(struct mlx5_outbox_hdr *hdr); +int mlx5_cmd_status_to_err_v2(void *ptr); +int mlx5_core_get_caps(struct mlx5_core_dev *dev, struct mlx5_caps *caps, + u16 opmod); +int mlx5_cmd_exec(struct mlx5_core_dev *dev, void *in, int in_size, void *out, + int out_size); +int mlx5_cmd_exec_cb(struct mlx5_core_dev *dev, void *in, int in_size, + void *out, int out_size, mlx5_cmd_cbk_t callback, + void *context); +int mlx5_cmd_alloc_uar(struct mlx5_core_dev *dev, u32 *uarn); +int mlx5_cmd_free_uar(struct mlx5_core_dev *dev, u32 uarn); +int mlx5_alloc_uuars(struct mlx5_core_dev *dev, struct mlx5_uuar_info *uuari); +int mlx5_free_uuars(struct mlx5_core_dev *dev, struct mlx5_uuar_info *uuari); +void mlx5_health_cleanup(void); +void __init mlx5_health_init(void); +void mlx5_start_health_poll(struct mlx5_core_dev *dev); +void mlx5_stop_health_poll(struct mlx5_core_dev *dev); +int mlx5_buf_alloc(struct mlx5_core_dev *dev, int size, int max_direct, + struct mlx5_buf *buf); +void mlx5_buf_free(struct mlx5_core_dev *dev, struct mlx5_buf *buf); +struct mlx5_cmd_mailbox *mlx5_alloc_cmd_mailbox_chain(struct mlx5_core_dev *dev, + gfp_t flags, int npages); +void mlx5_free_cmd_mailbox_chain(struct mlx5_core_dev *dev, + struct mlx5_cmd_mailbox *head); +int mlx5_core_create_srq(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq, + struct mlx5_create_srq_mbox_in *in, int inlen); +int mlx5_core_destroy_srq(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq); +int mlx5_core_query_srq(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq, + struct mlx5_query_srq_mbox_out *out); +int mlx5_core_arm_srq(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq, + u16 lwm, int is_srq); +void mlx5_init_mr_table(struct mlx5_core_dev *dev); +void mlx5_cleanup_mr_table(struct mlx5_core_dev *dev); +int mlx5_core_create_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mr *mr, + struct mlx5_create_mkey_mbox_in *in, int inlen, + mlx5_cmd_cbk_t callback, void *context, + struct mlx5_create_mkey_mbox_out *out); +int mlx5_core_destroy_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mr *mr); +int mlx5_core_query_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mr *mr, + struct mlx5_query_mkey_mbox_out *out, int outlen); +int mlx5_core_dump_fill_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mr *mr, + u32 *mkey); +int mlx5_core_alloc_pd(struct mlx5_core_dev *dev, u32 *pdn); +int mlx5_core_dealloc_pd(struct mlx5_core_dev *dev, u32 pdn); +int mlx5_core_mad_ifc(struct mlx5_core_dev *dev, void *inb, void *outb, + u16 opmod, u8 port); +void mlx5_pagealloc_init(struct mlx5_core_dev *dev); +void mlx5_pagealloc_cleanup(struct mlx5_core_dev *dev); +int mlx5_pagealloc_start(struct mlx5_core_dev *dev); +void mlx5_pagealloc_stop(struct mlx5_core_dev *dev); +void mlx5_core_req_pages_handler(struct mlx5_core_dev *dev, u16 func_id, + s32 npages); +int mlx5_satisfy_startup_pages(struct mlx5_core_dev *dev, int boot); +int mlx5_reclaim_startup_pages(struct mlx5_core_dev *dev); +void mlx5_register_debugfs(void); +void mlx5_unregister_debugfs(void); +int mlx5_eq_init(struct mlx5_core_dev *dev); +void mlx5_eq_cleanup(struct mlx5_core_dev *dev); +void mlx5_fill_page_array(struct mlx5_buf *buf, __be64 *pas); +void mlx5_cq_completion(struct mlx5_core_dev *dev, u32 cqn); +void mlx5_rsc_event(struct mlx5_core_dev *dev, u32 rsn, int event_type); +void mlx5_srq_event(struct mlx5_core_dev *dev, u32 srqn, int event_type); +struct mlx5_core_srq *mlx5_core_get_srq(struct mlx5_core_dev *dev, u32 srqn); +void mlx5_cmd_comp_handler(struct mlx5_core_dev *dev, unsigned long vector); +void mlx5_cq_event(struct mlx5_core_dev *dev, u32 cqn, int event_type); +int mlx5_create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq, u8 vecidx, + int nent, u64 mask, const char *name, struct mlx5_uar *uar); +int mlx5_destroy_unmap_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq); +int mlx5_start_eqs(struct mlx5_core_dev *dev); +int mlx5_stop_eqs(struct mlx5_core_dev *dev); +int mlx5_core_attach_mcg(struct mlx5_core_dev *dev, union ib_gid *mgid, u32 qpn); +int mlx5_core_detach_mcg(struct mlx5_core_dev *dev, union ib_gid *mgid, u32 qpn); + +int mlx5_qp_debugfs_init(struct mlx5_core_dev *dev); +void mlx5_qp_debugfs_cleanup(struct mlx5_core_dev *dev); +int mlx5_core_access_reg(struct mlx5_core_dev *dev, void *data_in, + int size_in, void *data_out, int size_out, + u16 reg_num, int arg, int write); +int mlx5_set_port_caps(struct mlx5_core_dev *dev, u8 port_num, u32 caps); + +int mlx5_debug_eq_add(struct mlx5_core_dev *dev, struct mlx5_eq *eq); +void mlx5_debug_eq_remove(struct mlx5_core_dev *dev, struct mlx5_eq *eq); +int mlx5_core_eq_query(struct mlx5_core_dev *dev, struct mlx5_eq *eq, + struct mlx5_query_eq_mbox_out *out, int outlen); +int mlx5_eq_debugfs_init(struct mlx5_core_dev *dev); +void mlx5_eq_debugfs_cleanup(struct mlx5_core_dev *dev); +int mlx5_cq_debugfs_init(struct mlx5_core_dev *dev); +void mlx5_cq_debugfs_cleanup(struct mlx5_core_dev *dev); +int mlx5_db_alloc(struct mlx5_core_dev *dev, struct mlx5_db *db); +void mlx5_db_free(struct mlx5_core_dev *dev, struct mlx5_db *db); + +const char *mlx5_command_str(int command); +int mlx5_cmdif_debugfs_init(struct mlx5_core_dev *dev); +void mlx5_cmdif_debugfs_cleanup(struct mlx5_core_dev *dev); +int mlx5_core_create_psv(struct mlx5_core_dev *dev, u32 pdn, + int npsvs, u32 *sig_index); +int mlx5_core_destroy_psv(struct mlx5_core_dev *dev, int psv_num); +void mlx5_core_put_rsc(struct mlx5_core_rsc_common *common); + +static inline u32 mlx5_mkey_to_idx(u32 mkey) +{ + return mkey >> 8; +} + +static inline u32 mlx5_idx_to_mkey(u32 mkey_idx) +{ + return mkey_idx << 8; +} + +static inline u8 mlx5_mkey_variant(u32 mkey) +{ + return mkey & 0xff; +} + +enum { + MLX5_PROF_MASK_QP_SIZE = (u64)1 << 0, + MLX5_PROF_MASK_MR_CACHE = (u64)1 << 1, +}; + +enum { + MAX_MR_CACHE_ENTRIES = 16, +}; + +struct mlx5_interface { + void * (*add)(struct mlx5_core_dev *dev); + void (*remove)(struct mlx5_core_dev *dev, void *context); + void (*event)(struct mlx5_core_dev *dev, void *context, + enum mlx5_dev_event event, unsigned long param); + struct list_head list; +}; + +int mlx5_register_interface(struct mlx5_interface *intf); +void mlx5_unregister_interface(struct mlx5_interface *intf); + +struct mlx5_profile { + u64 mask; + u8 log_max_qp; + struct { + int size; + int limit; + } mr_cache[MAX_MR_CACHE_ENTRIES]; +}; + +#endif /* MLX5_DRIVER_H */ diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h new file mode 100644 index 0000000..5f48b8f --- /dev/null +++ b/include/linux/mlx5/mlx5_ifc.h @@ -0,0 +1,349 @@ +/* + * Copyright (c) 2014, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef MLX5_IFC_H +#define MLX5_IFC_H + +enum { + MLX5_CMD_OP_QUERY_HCA_CAP = 0x100, + MLX5_CMD_OP_QUERY_ADAPTER = 0x101, + MLX5_CMD_OP_INIT_HCA = 0x102, + MLX5_CMD_OP_TEARDOWN_HCA = 0x103, + MLX5_CMD_OP_ENABLE_HCA = 0x104, + MLX5_CMD_OP_DISABLE_HCA = 0x105, + MLX5_CMD_OP_QUERY_PAGES = 0x107, + MLX5_CMD_OP_MANAGE_PAGES = 0x108, + MLX5_CMD_OP_SET_HCA_CAP = 0x109, + MLX5_CMD_OP_CREATE_MKEY = 0x200, + MLX5_CMD_OP_QUERY_MKEY = 0x201, + MLX5_CMD_OP_DESTROY_MKEY = 0x202, + MLX5_CMD_OP_QUERY_SPECIAL_CONTEXTS = 0x203, + MLX5_CMD_OP_PAGE_FAULT_RESUME = 0x204, + MLX5_CMD_OP_CREATE_EQ = 0x301, + MLX5_CMD_OP_DESTROY_EQ = 0x302, + MLX5_CMD_OP_QUERY_EQ = 0x303, + MLX5_CMD_OP_GEN_EQE = 0x304, + MLX5_CMD_OP_CREATE_CQ = 0x400, + MLX5_CMD_OP_DESTROY_CQ = 0x401, + MLX5_CMD_OP_QUERY_CQ = 0x402, + MLX5_CMD_OP_MODIFY_CQ = 0x403, + MLX5_CMD_OP_CREATE_QP = 0x500, + MLX5_CMD_OP_DESTROY_QP = 0x501, + MLX5_CMD_OP_RST2INIT_QP = 0x502, + MLX5_CMD_OP_INIT2RTR_QP = 0x503, + MLX5_CMD_OP_RTR2RTS_QP = 0x504, + MLX5_CMD_OP_RTS2RTS_QP = 0x505, + MLX5_CMD_OP_SQERR2RTS_QP = 0x506, + MLX5_CMD_OP_2ERR_QP = 0x507, + MLX5_CMD_OP_2RST_QP = 0x50a, + MLX5_CMD_OP_QUERY_QP = 0x50b, + MLX5_CMD_OP_INIT2INIT_QP = 0x50e, + MLX5_CMD_OP_CREATE_PSV = 0x600, + MLX5_CMD_OP_DESTROY_PSV = 0x601, + MLX5_CMD_OP_CREATE_SRQ = 0x700, + MLX5_CMD_OP_DESTROY_SRQ = 0x701, + MLX5_CMD_OP_QUERY_SRQ = 0x702, + MLX5_CMD_OP_ARM_RQ = 0x703, + MLX5_CMD_OP_RESIZE_SRQ = 0x704, + MLX5_CMD_OP_CREATE_DCT = 0x710, + MLX5_CMD_OP_DESTROY_DCT = 0x711, + MLX5_CMD_OP_DRAIN_DCT = 0x712, + MLX5_CMD_OP_QUERY_DCT = 0x713, + MLX5_CMD_OP_ARM_DCT_FOR_KEY_VIOLATION = 0x714, + MLX5_CMD_OP_QUERY_VPORT_STATE = 0x750, + MLX5_CMD_OP_MODIFY_VPORT_STATE = 0x751, + MLX5_CMD_OP_QUERY_ESW_VPORT_CONTEXT = 0x752, + MLX5_CMD_OP_MODIFY_ESW_VPORT_CONTEXT = 0x753, + MLX5_CMD_OP_QUERY_NIC_VPORT_CONTEXT = 0x754, + MLX5_CMD_OP_MODIFY_NIC_VPORT_CONTEXT = 0x755, + MLX5_CMD_OP_QUERY_RCOE_ADDRESS = 0x760, + MLX5_CMD_OP_SET_ROCE_ADDRESS = 0x761, + MLX5_CMD_OP_QUERY_VPORT_COUNTER = 0x770, + MLX5_CMD_OP_ALLOC_Q_COUNTER = 0x771, + MLX5_CMD_OP_DEALLOC_Q_COUNTER = 0x772, + MLX5_CMD_OP_QUERY_Q_COUNTER = 0x773, + MLX5_CMD_OP_ALLOC_PD = 0x800, + MLX5_CMD_OP_DEALLOC_PD = 0x801, + MLX5_CMD_OP_ALLOC_UAR = 0x802, + MLX5_CMD_OP_DEALLOC_UAR = 0x803, + MLX5_CMD_OP_CONFIG_INT_MODERATION = 0x804, + MLX5_CMD_OP_ACCESS_REG = 0x805, + MLX5_CMD_OP_ATTACH_TO_MCG = 0x806, + MLX5_CMD_OP_DETACH_FROM_MCG = 0x807, + MLX5_CMD_OP_GET_DROPPED_PACKET_LOG = 0x80a, + MLX5_CMD_OP_MAD_IFC = 0x50d, + MLX5_CMD_OP_QUERY_MAD_DEMUX = 0x80b, + MLX5_CMD_OP_SET_MAD_DEMUX = 0x80c, + MLX5_CMD_OP_NOP = 0x80d, + MLX5_CMD_OP_ALLOC_XRCD = 0x80e, + MLX5_CMD_OP_DEALLOC_XRCD = 0x80f, + MLX5_CMD_OP_SET_BURST_SIZE = 0x812, + MLX5_CMD_OP_QUERY_BURST_SZIE = 0x813, + MLX5_CMD_OP_ACTIVATE_TRACER = 0x814, + MLX5_CMD_OP_DEACTIVATE_TRACER = 0x815, + MLX5_CMD_OP_CREATE_SNIFFER_RULE = 0x820, + MLX5_CMD_OP_DESTROY_SNIFFER_RULE = 0x821, + MLX5_CMD_OP_QUERY_CONG_PARAMS = 0x822, + MLX5_CMD_OP_MODIFY_CONG_PARAMS = 0x823, + MLX5_CMD_OP_QUERY_CONG_STATISTICS = 0x824, + MLX5_CMD_OP_CREATE_TIR = 0x900, + MLX5_CMD_OP_MODIFY_TIR = 0x901, + MLX5_CMD_OP_DESTROY_TIR = 0x902, + MLX5_CMD_OP_QUERY_TIR = 0x903, + MLX5_CMD_OP_CREATE_TIS = 0x912, + MLX5_CMD_OP_MODIFY_TIS = 0x913, + MLX5_CMD_OP_DESTROY_TIS = 0x914, + MLX5_CMD_OP_QUERY_TIS = 0x915, + MLX5_CMD_OP_CREATE_SQ = 0x904, + MLX5_CMD_OP_MODIFY_SQ = 0x905, + MLX5_CMD_OP_DESTROY_SQ = 0x906, + MLX5_CMD_OP_QUERY_SQ = 0x907, + MLX5_CMD_OP_CREATE_RQ = 0x908, + MLX5_CMD_OP_MODIFY_RQ = 0x909, + MLX5_CMD_OP_DESTROY_RQ = 0x90a, + MLX5_CMD_OP_QUERY_RQ = 0x90b, + MLX5_CMD_OP_CREATE_RMP = 0x90c, + MLX5_CMD_OP_MODIFY_RMP = 0x90d, + MLX5_CMD_OP_DESTROY_RMP = 0x90e, + MLX5_CMD_OP_QUERY_RMP = 0x90f, + MLX5_CMD_OP_SET_FLOW_TABLE_ENTRY = 0x910, + MLX5_CMD_OP_QUERY_FLOW_TABLE_ENTRY = 0x911, + MLX5_CMD_OP_MAX = 0x911 +}; + +struct mlx5_ifc_cmd_hca_cap_bits { + u8 reserved_0[0x80]; + + u8 log_max_srq_sz[0x8]; + u8 log_max_qp_sz[0x8]; + u8 reserved_1[0xb]; + u8 log_max_qp[0x5]; + + u8 log_max_strq_sz[0x8]; + u8 reserved_2[0x3]; + u8 log_max_srqs[0x5]; + u8 reserved_3[0x10]; + + u8 reserved_4[0x8]; + u8 log_max_cq_sz[0x8]; + u8 reserved_5[0xb]; + u8 log_max_cq[0x5]; + + u8 log_max_eq_sz[0x8]; + u8 reserved_6[0x2]; + u8 log_max_mkey[0x6]; + u8 reserved_7[0xc]; + u8 log_max_eq[0x4]; + + u8 max_indirection[0x8]; + u8 reserved_8[0x1]; + u8 log_max_mrw_sz[0x7]; + u8 reserved_9[0x2]; + u8 log_max_bsf_list_size[0x6]; + u8 reserved_10[0x2]; + u8 log_max_klm_list_size[0x6]; + + u8 reserved_11[0xa]; + u8 log_max_ra_req_dc[0x6]; + u8 reserved_12[0xa]; + u8 log_max_ra_res_dc[0x6]; + + u8 reserved_13[0xa]; + u8 log_max_ra_req_qp[0x6]; + u8 reserved_14[0xa]; + u8 log_max_ra_res_qp[0x6]; + + u8 pad_cap[0x1]; + u8 cc_query_allowed[0x1]; + u8 cc_modify_allowed[0x1]; + u8 reserved_15[0x1d]; + + u8 reserved_16[0x6]; + u8 max_qp_cnt[0xa]; + u8 pkey_table_size[0x10]; + + u8 eswitch_owner[0x1]; + u8 reserved_17[0xa]; + u8 local_ca_ack_delay[0x5]; + u8 reserved_18[0x8]; + u8 num_ports[0x8]; + + u8 reserved_19[0x3]; + u8 log_max_msg[0x5]; + u8 reserved_20[0x18]; + + u8 stat_rate_support[0x10]; + u8 reserved_21[0x10]; + + u8 reserved_22[0x10]; + u8 cmdif_checksum[0x2]; + u8 sigerr_cqe[0x1]; + u8 reserved_23[0x1]; + u8 wq_signature[0x1]; + u8 sctr_data_cqe[0x1]; + u8 reserved_24[0x1]; + u8 sho[0x1]; + u8 tph[0x1]; + u8 rf[0x1]; + u8 dc[0x1]; + u8 reserved_25[0x2]; + u8 roce[0x1]; + u8 atomic[0x1]; + u8 rsz_srq[0x1]; + + u8 cq_oi[0x1]; + u8 cq_resize[0x1]; + u8 cq_moderation[0x1]; + u8 sniffer_rule_flow[0x1]; + u8 sniffer_rule_vport[0x1]; + u8 sniffer_rule_phy[0x1]; + u8 reserved_26[0x1]; + u8 pg[0x1]; + u8 block_lb_mc[0x1]; + u8 reserved_27[0x3]; + u8 cd[0x1]; + u8 reserved_28[0x1]; + u8 apm[0x1]; + u8 reserved_29[0x7]; + u8 qkv[0x1]; + u8 pkv[0x1]; + u8 reserved_30[0x4]; + u8 xrc[0x1]; + u8 ud[0x1]; + u8 uc[0x1]; + u8 rc[0x1]; + + u8 reserved_31[0xa]; + u8 uar_sz[0x6]; + u8 reserved_32[0x8]; + u8 log_pg_sz[0x8]; + + u8 bf[0x1]; + u8 reserved_33[0xa]; + u8 log_bf_reg_size[0x5]; + u8 reserved_34[0x10]; + + u8 reserved_35[0x10]; + u8 max_wqe_sz_sq[0x10]; + + u8 reserved_36[0x10]; + u8 max_wqe_sz_rq[0x10]; + + u8 reserved_37[0x10]; + u8 max_wqe_sz_sq_dc[0x10]; + + u8 reserved_38[0x7]; + u8 max_qp_mcg[0x19]; + + u8 reserved_39[0x18]; + u8 log_max_mcg[0x8]; + + u8 reserved_40[0xb]; + u8 log_max_pd[0x5]; + u8 reserved_41[0xb]; + u8 log_max_xrcd[0x5]; + + u8 reserved_42[0x20]; + + u8 reserved_43[0x3]; + u8 log_max_rq[0x5]; + u8 reserved_44[0x3]; + u8 log_max_sq[0x5]; + u8 reserved_45[0x3]; + u8 log_max_tir[0x5]; + u8 reserved_46[0x3]; + u8 log_max_tis[0x5]; + + u8 reserved_47[0x13]; + u8 log_max_rq_per_tir[0x5]; + u8 reserved_48[0x3]; + u8 log_max_tis_per_sq[0x5]; + + u8 reserved_49[0xe0]; + + u8 reserved_50[0x10]; + u8 log_uar_page_sz[0x10]; + + u8 reserved_51[0x100]; + + u8 reserved_52[0x1f]; + u8 cqe_zip[0x1]; + + u8 cqe_zip_timeout[0x10]; + u8 cqe_zip_max_num[0x10]; + + u8 reserved_53[0x220]; +}; + +struct mlx5_ifc_set_hca_cap_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0x40]; + + struct mlx5_ifc_cmd_hca_cap_bits hca_capability_struct; +}; + +struct mlx5_ifc_query_hca_cap_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0x40]; +}; + +struct mlx5_ifc_query_hca_cap_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x40]; + + u8 capability_struct[256][0x8]; +}; + +struct mlx5_ifc_set_hca_cap_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x40]; +}; + +#endif /* MLX5_IFC_H */ diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h new file mode 100644 index 0000000..3fa075d --- /dev/null +++ b/include/linux/mlx5/qp.h @@ -0,0 +1,598 @@ +/* + * Copyright (c) 2013, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef MLX5_QP_H +#define MLX5_QP_H + +#include +#include + +#define MLX5_INVALID_LKEY 0x100 +#define MLX5_SIG_WQE_SIZE (MLX5_SEND_WQE_BB * 5) +#define MLX5_DIF_SIZE 8 +#define MLX5_STRIDE_BLOCK_OP 0x400 +#define MLX5_CPY_GRD_MASK 0xc0 +#define MLX5_CPY_APP_MASK 0x30 +#define MLX5_CPY_REF_MASK 0x0f +#define MLX5_BSF_INC_REFTAG (1 << 6) +#define MLX5_BSF_INL_VALID (1 << 15) +#define MLX5_BSF_REFRESH_DIF (1 << 14) +#define MLX5_BSF_REPEAT_BLOCK (1 << 7) +#define MLX5_BSF_APPTAG_ESCAPE 0x1 +#define MLX5_BSF_APPREF_ESCAPE 0x2 + +enum mlx5_qp_optpar { + MLX5_QP_OPTPAR_ALT_ADDR_PATH = 1 << 0, + MLX5_QP_OPTPAR_RRE = 1 << 1, + MLX5_QP_OPTPAR_RAE = 1 << 2, + MLX5_QP_OPTPAR_RWE = 1 << 3, + MLX5_QP_OPTPAR_PKEY_INDEX = 1 << 4, + MLX5_QP_OPTPAR_Q_KEY = 1 << 5, + MLX5_QP_OPTPAR_RNR_TIMEOUT = 1 << 6, + MLX5_QP_OPTPAR_PRIMARY_ADDR_PATH = 1 << 7, + MLX5_QP_OPTPAR_SRA_MAX = 1 << 8, + MLX5_QP_OPTPAR_RRA_MAX = 1 << 9, + MLX5_QP_OPTPAR_PM_STATE = 1 << 10, + MLX5_QP_OPTPAR_RETRY_COUNT = 1 << 12, + MLX5_QP_OPTPAR_RNR_RETRY = 1 << 13, + MLX5_QP_OPTPAR_ACK_TIMEOUT = 1 << 14, + MLX5_QP_OPTPAR_PRI_PORT = 1 << 16, + MLX5_QP_OPTPAR_SRQN = 1 << 18, + MLX5_QP_OPTPAR_CQN_RCV = 1 << 19, + MLX5_QP_OPTPAR_DC_HS = 1 << 20, + MLX5_QP_OPTPAR_DC_KEY = 1 << 21, +}; + +enum mlx5_qp_state { + MLX5_QP_STATE_RST = 0, + MLX5_QP_STATE_INIT = 1, + MLX5_QP_STATE_RTR = 2, + MLX5_QP_STATE_RTS = 3, + MLX5_QP_STATE_SQER = 4, + MLX5_QP_STATE_SQD = 5, + MLX5_QP_STATE_ERR = 6, + MLX5_QP_STATE_SQ_DRAINING = 7, + MLX5_QP_STATE_SUSPENDED = 9, + MLX5_QP_NUM_STATE +}; + +enum { + MLX5_QP_ST_RC = 0x0, + MLX5_QP_ST_UC = 0x1, + MLX5_QP_ST_UD = 0x2, + MLX5_QP_ST_XRC = 0x3, + MLX5_QP_ST_MLX = 0x4, + MLX5_QP_ST_DCI = 0x5, + MLX5_QP_ST_DCT = 0x6, + MLX5_QP_ST_QP0 = 0x7, + MLX5_QP_ST_QP1 = 0x8, + MLX5_QP_ST_RAW_ETHERTYPE = 0x9, + MLX5_QP_ST_RAW_IPV6 = 0xa, + MLX5_QP_ST_SNIFFER = 0xb, + MLX5_QP_ST_SYNC_UMR = 0xe, + MLX5_QP_ST_PTP_1588 = 0xd, + MLX5_QP_ST_REG_UMR = 0xc, + MLX5_QP_ST_MAX +}; + +enum { + MLX5_QP_PM_MIGRATED = 0x3, + MLX5_QP_PM_ARMED = 0x0, + MLX5_QP_PM_REARM = 0x1 +}; + +enum { + MLX5_NON_ZERO_RQ = 0 << 24, + MLX5_SRQ_RQ = 1 << 24, + MLX5_CRQ_RQ = 2 << 24, + MLX5_ZERO_LEN_RQ = 3 << 24 +}; + +enum { + /* params1 */ + MLX5_QP_BIT_SRE = 1 << 15, + MLX5_QP_BIT_SWE = 1 << 14, + MLX5_QP_BIT_SAE = 1 << 13, + /* params2 */ + MLX5_QP_BIT_RRE = 1 << 15, + MLX5_QP_BIT_RWE = 1 << 14, + MLX5_QP_BIT_RAE = 1 << 13, + MLX5_QP_BIT_RIC = 1 << 4, +}; + +enum { + MLX5_WQE_CTRL_CQ_UPDATE = 2 << 2, + MLX5_WQE_CTRL_SOLICITED = 1 << 1, +}; + +enum { + MLX5_SEND_WQE_BB = 64, +}; + +enum { + MLX5_WQE_FMR_PERM_LOCAL_READ = 1 << 27, + MLX5_WQE_FMR_PERM_LOCAL_WRITE = 1 << 28, + MLX5_WQE_FMR_PERM_REMOTE_READ = 1 << 29, + MLX5_WQE_FMR_PERM_REMOTE_WRITE = 1 << 30, + MLX5_WQE_FMR_PERM_ATOMIC = 1 << 31 +}; + +enum { + MLX5_FENCE_MODE_NONE = 0 << 5, + MLX5_FENCE_MODE_INITIATOR_SMALL = 1 << 5, + MLX5_FENCE_MODE_STRONG_ORDERING = 3 << 5, + MLX5_FENCE_MODE_SMALL_AND_FENCE = 4 << 5, +}; + +enum { + MLX5_QP_LAT_SENSITIVE = 1 << 28, + MLX5_QP_BLOCK_MCAST = 1 << 30, + MLX5_QP_ENABLE_SIG = 1 << 31, +}; + +enum { + MLX5_RCV_DBR = 0, + MLX5_SND_DBR = 1, +}; + +enum { + MLX5_FLAGS_INLINE = 1<<7, + MLX5_FLAGS_CHECK_FREE = 1<<5, +}; + +struct mlx5_wqe_fmr_seg { + __be32 flags; + __be32 mem_key; + __be64 buf_list; + __be64 start_addr; + __be64 reg_len; + __be32 offset; + __be32 page_size; + u32 reserved[2]; +}; + +struct mlx5_wqe_ctrl_seg { + __be32 opmod_idx_opcode; + __be32 qpn_ds; + u8 signature; + u8 rsvd[2]; + u8 fm_ce_se; + __be32 imm; +}; + +struct mlx5_wqe_xrc_seg { + __be32 xrc_srqn; + u8 rsvd[12]; +}; + +struct mlx5_wqe_masked_atomic_seg { + __be64 swap_add; + __be64 compare; + __be64 swap_add_mask; + __be64 compare_mask; +}; + +struct mlx5_av { + union { + struct { + __be32 qkey; + __be32 reserved; + } qkey; + __be64 dc_key; + } key; + __be32 dqp_dct; + u8 stat_rate_sl; + u8 fl_mlid; + __be16 rlid; + u8 reserved0[10]; + u8 tclass; + u8 hop_limit; + __be32 grh_gid_fl; + u8 rgid[16]; +}; + +struct mlx5_wqe_datagram_seg { + struct mlx5_av av; +}; + +struct mlx5_wqe_raddr_seg { + __be64 raddr; + __be32 rkey; + u32 reserved; +}; + +struct mlx5_wqe_atomic_seg { + __be64 swap_add; + __be64 compare; +}; + +struct mlx5_wqe_data_seg { + __be32 byte_count; + __be32 lkey; + __be64 addr; +}; + +struct mlx5_wqe_umr_ctrl_seg { + u8 flags; + u8 rsvd0[3]; + __be16 klm_octowords; + __be16 bsf_octowords; + __be64 mkey_mask; + u8 rsvd1[32]; +}; + +struct mlx5_seg_set_psv { + __be32 psv_num; + __be16 syndrome; + __be16 status; + __be32 transient_sig; + __be32 ref_tag; +}; + +struct mlx5_seg_get_psv { + u8 rsvd[19]; + u8 num_psv; + __be32 l_key; + __be64 va; + __be32 psv_index[4]; +}; + +struct mlx5_seg_check_psv { + u8 rsvd0[2]; + __be16 err_coalescing_op; + u8 rsvd1[2]; + __be16 xport_err_op; + u8 rsvd2[2]; + __be16 xport_err_mask; + u8 rsvd3[7]; + u8 num_psv; + __be32 l_key; + __be64 va; + __be32 psv_index[4]; +}; + +struct mlx5_rwqe_sig { + u8 rsvd0[4]; + u8 signature; + u8 rsvd1[11]; +}; + +struct mlx5_wqe_signature_seg { + u8 rsvd0[4]; + u8 signature; + u8 rsvd1[11]; +}; + +struct mlx5_wqe_inline_seg { + __be32 byte_count; +}; + +enum mlx5_sig_type { + MLX5_DIF_CRC = 0x1, + MLX5_DIF_IPCS = 0x2, +}; + +struct mlx5_bsf_inl { + __be16 vld_refresh; + __be16 dif_apptag; + __be32 dif_reftag; + u8 sig_type; + u8 rp_inv_seed; + u8 rsvd[3]; + u8 dif_inc_ref_guard_check; + __be16 dif_app_bitmask_check; +}; + +struct mlx5_bsf { + struct mlx5_bsf_basic { + u8 bsf_size_sbs; + u8 check_byte_mask; + union { + u8 copy_byte_mask; + u8 bs_selector; + u8 rsvd_wflags; + } wire; + union { + u8 bs_selector; + u8 rsvd_mflags; + } mem; + __be32 raw_data_size; + __be32 w_bfs_psv; + __be32 m_bfs_psv; + } basic; + struct mlx5_bsf_ext { + __be32 t_init_gen_pro_size; + __be32 rsvd_epi_size; + __be32 w_tfs_psv; + __be32 m_tfs_psv; + } ext; + struct mlx5_bsf_inl w_inl; + struct mlx5_bsf_inl m_inl; +}; + +struct mlx5_klm { + __be32 bcount; + __be32 key; + __be64 va; +}; + +struct mlx5_stride_block_entry { + __be16 stride; + __be16 bcount; + __be32 key; + __be64 va; +}; + +struct mlx5_stride_block_ctrl_seg { + __be32 bcount_per_cycle; + __be32 op; + __be32 repeat_count; + u16 rsvd; + __be16 num_entries; +}; + +struct mlx5_core_qp { + struct mlx5_core_rsc_common common; /* must be first */ + void (*event) (struct mlx5_core_qp *, int); + int qpn; + struct mlx5_rsc_debug *dbg; + int pid; +}; + +struct mlx5_qp_path { + u8 fl; + u8 rsvd3; + u8 free_ar; + u8 pkey_index; + u8 rsvd0; + u8 grh_mlid; + __be16 rlid; + u8 ackto_lt; + u8 mgid_index; + u8 static_rate; + u8 hop_limit; + __be32 tclass_flowlabel; + u8 rgid[16]; + u8 rsvd1[4]; + u8 sl; + u8 port; + u8 rsvd2[6]; +}; + +struct mlx5_qp_context { + __be32 flags; + __be32 flags_pd; + u8 mtu_msgmax; + u8 rq_size_stride; + __be16 sq_crq_size; + __be32 qp_counter_set_usr_page; + __be32 wire_qpn; + __be32 log_pg_sz_remote_qpn; + struct mlx5_qp_path pri_path; + struct mlx5_qp_path alt_path; + __be32 params1; + u8 reserved2[4]; + __be32 next_send_psn; + __be32 cqn_send; + u8 reserved3[8]; + __be32 last_acked_psn; + __be32 ssn; + __be32 params2; + __be32 rnr_nextrecvpsn; + __be32 xrcd; + __be32 cqn_recv; + __be64 db_rec_addr; + __be32 qkey; + __be32 rq_type_srqn; + __be32 rmsn; + __be16 hw_sq_wqe_counter; + __be16 sw_sq_wqe_counter; + __be16 hw_rcyclic_byte_counter; + __be16 hw_rq_counter; + __be16 sw_rcyclic_byte_counter; + __be16 sw_rq_counter; + u8 rsvd0[5]; + u8 cgs; + u8 cs_req; + u8 cs_res; + __be64 dc_access_key; + u8 rsvd1[24]; +}; + +struct mlx5_create_qp_mbox_in { + struct mlx5_inbox_hdr hdr; + __be32 input_qpn; + u8 rsvd0[4]; + __be32 opt_param_mask; + u8 rsvd1[4]; + struct mlx5_qp_context ctx; + u8 rsvd3[16]; + __be64 pas[0]; +}; + +struct mlx5_create_qp_mbox_out { + struct mlx5_outbox_hdr hdr; + __be32 qpn; + u8 rsvd0[4]; +}; + +struct mlx5_destroy_qp_mbox_in { + struct mlx5_inbox_hdr hdr; + __be32 qpn; + u8 rsvd0[4]; +}; + +struct mlx5_destroy_qp_mbox_out { + struct mlx5_outbox_hdr hdr; + u8 rsvd0[8]; +}; + +struct mlx5_modify_qp_mbox_in { + struct mlx5_inbox_hdr hdr; + __be32 qpn; + u8 rsvd1[4]; + __be32 optparam; + u8 rsvd0[4]; + struct mlx5_qp_context ctx; +}; + +struct mlx5_modify_qp_mbox_out { + struct mlx5_outbox_hdr hdr; + u8 rsvd0[8]; +}; + +struct mlx5_query_qp_mbox_in { + struct mlx5_inbox_hdr hdr; + __be32 qpn; + u8 rsvd[4]; +}; + +struct mlx5_query_qp_mbox_out { + struct mlx5_outbox_hdr hdr; + u8 rsvd1[8]; + __be32 optparam; + u8 rsvd0[4]; + struct mlx5_qp_context ctx; + u8 rsvd2[16]; + __be64 pas[0]; +}; + +struct mlx5_conf_sqp_mbox_in { + struct mlx5_inbox_hdr hdr; + __be32 qpn; + u8 rsvd[3]; + u8 type; +}; + +struct mlx5_conf_sqp_mbox_out { + struct mlx5_outbox_hdr hdr; + u8 rsvd[8]; +}; + +struct mlx5_alloc_xrcd_mbox_in { + struct mlx5_inbox_hdr hdr; + u8 rsvd[8]; +}; + +struct mlx5_alloc_xrcd_mbox_out { + struct mlx5_outbox_hdr hdr; + __be32 xrcdn; + u8 rsvd[4]; +}; + +struct mlx5_dealloc_xrcd_mbox_in { + struct mlx5_inbox_hdr hdr; + __be32 xrcdn; + u8 rsvd[4]; +}; + +struct mlx5_dealloc_xrcd_mbox_out { + struct mlx5_outbox_hdr hdr; + u8 rsvd[8]; +}; + +static inline struct mlx5_core_qp *__mlx5_qp_lookup(struct mlx5_core_dev *dev, u32 qpn) +{ + return radix_tree_lookup(&dev->priv.qp_table.tree, qpn); +} + +static inline struct mlx5_core_mr *__mlx5_mr_lookup(struct mlx5_core_dev *dev, u32 key) +{ + return radix_tree_lookup(&dev->priv.mr_table.tree, key); +} + +int mlx5_core_create_qp(struct mlx5_core_dev *dev, + struct mlx5_core_qp *qp, + struct mlx5_create_qp_mbox_in *in, + int inlen); +int mlx5_core_qp_modify(struct mlx5_core_dev *dev, enum mlx5_qp_state cur_state, + enum mlx5_qp_state new_state, + struct mlx5_modify_qp_mbox_in *in, int sqd_event, + struct mlx5_core_qp *qp); +int mlx5_core_destroy_qp(struct mlx5_core_dev *dev, + struct mlx5_core_qp *qp); +int mlx5_core_qp_query(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp, + struct mlx5_query_qp_mbox_out *out, int outlen); + +int mlx5_core_xrcd_alloc(struct mlx5_core_dev *dev, u32 *xrcdn); +int mlx5_core_xrcd_dealloc(struct mlx5_core_dev *dev, u32 xrcdn); +void mlx5_init_qp_table(struct mlx5_core_dev *dev); +void mlx5_cleanup_qp_table(struct mlx5_core_dev *dev); +int mlx5_debug_qp_add(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp); +void mlx5_debug_qp_remove(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp); + +static inline const char *mlx5_qp_type_str(int type) +{ + switch (type) { + case MLX5_QP_ST_RC: return "RC"; + case MLX5_QP_ST_UC: return "C"; + case MLX5_QP_ST_UD: return "UD"; + case MLX5_QP_ST_XRC: return "XRC"; + case MLX5_QP_ST_MLX: return "MLX"; + case MLX5_QP_ST_QP0: return "QP0"; + case MLX5_QP_ST_QP1: return "QP1"; + case MLX5_QP_ST_RAW_ETHERTYPE: return "RAW_ETHERTYPE"; + case MLX5_QP_ST_RAW_IPV6: return "RAW_IPV6"; + case MLX5_QP_ST_SNIFFER: return "SNIFFER"; + case MLX5_QP_ST_SYNC_UMR: return "SYNC_UMR"; + case MLX5_QP_ST_PTP_1588: return "PTP_1588"; + case MLX5_QP_ST_REG_UMR: return "REG_UMR"; + default: return "Invalid transport type"; + } +} + +static inline const char *mlx5_qp_state_str(int state) +{ + switch (state) { + case MLX5_QP_STATE_RST: + return "RST"; + case MLX5_QP_STATE_INIT: + return "INIT"; + case MLX5_QP_STATE_RTR: + return "RTR"; + case MLX5_QP_STATE_RTS: + return "RTS"; + case MLX5_QP_STATE_SQER: + return "SQER"; + case MLX5_QP_STATE_SQD: + return "SQD"; + case MLX5_QP_STATE_ERR: + return "ERR"; + case MLX5_QP_STATE_SQ_DRAINING: + return "SQ_DRAINING"; + case MLX5_QP_STATE_SUSPENDED: + return "SUSPENDED"; + default: return "Invalid QP state"; + } +} + +#endif /* MLX5_QP_H */ diff --git a/include/linux/mlx5/srq.h b/include/linux/mlx5/srq.h new file mode 100644 index 0000000..e1a363a --- /dev/null +++ b/include/linux/mlx5/srq.h @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2013, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef MLX5_SRQ_H +#define MLX5_SRQ_H + +#include + +void mlx5_init_srq_table(struct mlx5_core_dev *dev); +void mlx5_cleanup_srq_table(struct mlx5_core_dev *dev); + +#endif /* MLX5_SRQ_H */ diff --git a/include/rdma/ib.h b/include/rdma/ib.h new file mode 100644 index 0000000..cf8f9e7 --- /dev/null +++ b/include/rdma/ib.h @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2010 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#if !defined(_RDMA_IB_H) +#define _RDMA_IB_H + +#include + +struct ib_addr { + union { + __u8 uib_addr8[16]; + __be16 uib_addr16[8]; + __be32 uib_addr32[4]; + __be64 uib_addr64[2]; + } ib_u; +#define sib_addr8 ib_u.uib_addr8 +#define sib_addr16 ib_u.uib_addr16 +#define sib_addr32 ib_u.uib_addr32 +#define sib_addr64 ib_u.uib_addr64 +#define sib_raw ib_u.uib_addr8 +#define sib_subnet_prefix ib_u.uib_addr64[0] +#define sib_interface_id ib_u.uib_addr64[1] +}; + +static inline int ib_addr_any(const struct ib_addr *a) +{ + return ((a->sib_addr64[0] | a->sib_addr64[1]) == 0); +} + +static inline int ib_addr_loopback(const struct ib_addr *a) +{ + return ((a->sib_addr32[0] | a->sib_addr32[1] | + a->sib_addr32[2] | (a->sib_addr32[3] ^ htonl(1))) == 0); +} + +static inline void ib_addr_set(struct ib_addr *addr, + __be32 w1, __be32 w2, __be32 w3, __be32 w4) +{ + addr->sib_addr32[0] = w1; + addr->sib_addr32[1] = w2; + addr->sib_addr32[2] = w3; + addr->sib_addr32[3] = w4; +} + +static inline int ib_addr_cmp(const struct ib_addr *a1, const struct ib_addr *a2) +{ + return memcmp(a1, a2, sizeof(struct ib_addr)); +} + +struct sockaddr_ib { + unsigned short int sib_family; /* AF_IB */ + __be16 sib_pkey; + __be32 sib_flowinfo; + struct ib_addr sib_addr; + __be64 sib_sid; + __be64 sib_sid_mask; + __u64 sib_scope_id; +}; + +#endif /* _RDMA_IB_H */ diff --git a/include/rdma/ib_addr.h b/include/rdma/ib_addr.h new file mode 100644 index 0000000..ce55906 --- /dev/null +++ b/include/rdma/ib_addr.h @@ -0,0 +1,312 @@ +/* + * Copyright (c) 2005 Voltaire Inc. All rights reserved. + * Copyright (c) 2005 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#if !defined(IB_ADDR_H) +#define IB_ADDR_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct rdma_addr_client { + atomic_t refcount; + struct completion comp; +}; + +/** + * rdma_addr_register_client - Register an address client. + */ +void rdma_addr_register_client(struct rdma_addr_client *client); + +/** + * rdma_addr_unregister_client - Deregister an address client. + * @client: Client object to deregister. + */ +void rdma_addr_unregister_client(struct rdma_addr_client *client); + +struct rdma_dev_addr { + unsigned char src_dev_addr[MAX_ADDR_LEN]; + unsigned char dst_dev_addr[MAX_ADDR_LEN]; + unsigned char broadcast[MAX_ADDR_LEN]; + unsigned short dev_type; + int bound_dev_if; + enum rdma_transport_type transport; +}; + +/** + * rdma_translate_ip - Translate a local IP address to an RDMA hardware + * address. + */ +int rdma_translate_ip(struct sockaddr *addr, struct rdma_dev_addr *dev_addr, + u16 *vlan_id); + +/** + * rdma_resolve_ip - Resolve source and destination IP addresses to + * RDMA hardware addresses. + * @client: Address client associated with request. + * @src_addr: An optional source address to use in the resolution. If a + * source address is not provided, a usable address will be returned via + * the callback. + * @dst_addr: The destination address to resolve. + * @addr: A reference to a data location that will receive the resolved + * addresses. The data location must remain valid until the callback has + * been invoked. + * @timeout_ms: Amount of time to wait for the address resolution to complete. + * @callback: Call invoked once address resolution has completed, timed out, + * or been canceled. A status of 0 indicates success. + * @context: User-specified context associated with the call. + */ +int rdma_resolve_ip(struct rdma_addr_client *client, + struct sockaddr *src_addr, struct sockaddr *dst_addr, + struct rdma_dev_addr *addr, int timeout_ms, + void (*callback)(int status, struct sockaddr *src_addr, + struct rdma_dev_addr *addr, void *context), + void *context); + +void rdma_addr_cancel(struct rdma_dev_addr *addr); + +int rdma_copy_addr(struct rdma_dev_addr *dev_addr, struct net_device *dev, + const unsigned char *dst_dev_addr); + +int rdma_addr_size(struct sockaddr *addr); + +int rdma_addr_find_smac_by_sgid(union ib_gid *sgid, u8 *smac, u16 *vlan_id); +int rdma_addr_find_dmac_by_grh(union ib_gid *sgid, union ib_gid *dgid, u8 *smac, + u16 *vlan_id); + +static inline u16 ib_addr_get_pkey(struct rdma_dev_addr *dev_addr) +{ + return ((u16)dev_addr->broadcast[8] << 8) | (u16)dev_addr->broadcast[9]; +} + +static inline void ib_addr_set_pkey(struct rdma_dev_addr *dev_addr, u16 pkey) +{ + dev_addr->broadcast[8] = pkey >> 8; + dev_addr->broadcast[9] = (unsigned char) pkey; +} + +static inline void ib_addr_get_mgid(struct rdma_dev_addr *dev_addr, + union ib_gid *gid) +{ + memcpy(gid, dev_addr->broadcast + 4, sizeof *gid); +} + +static inline int rdma_addr_gid_offset(struct rdma_dev_addr *dev_addr) +{ + return dev_addr->dev_type == ARPHRD_INFINIBAND ? 4 : 0; +} + +static inline u16 rdma_vlan_dev_vlan_id(const struct net_device *dev) +{ + return dev->priv_flags & IFF_802_1Q_VLAN ? + vlan_dev_vlan_id(dev) : 0xffff; +} + +static inline int rdma_ip2gid(struct sockaddr *addr, union ib_gid *gid) +{ + switch (addr->sa_family) { + case AF_INET: + ipv6_addr_set_v4mapped(((struct sockaddr_in *) + addr)->sin_addr.s_addr, + (struct in6_addr *)gid); + break; + case AF_INET6: + memcpy(gid->raw, &((struct sockaddr_in6 *)addr)->sin6_addr, 16); + break; + default: + return -EINVAL; + } + return 0; +} + +/* Important - sockaddr should be a union of sockaddr_in and sockaddr_in6 */ +static inline int rdma_gid2ip(struct sockaddr *out, union ib_gid *gid) +{ + if (ipv6_addr_v4mapped((struct in6_addr *)gid)) { + struct sockaddr_in *out_in = (struct sockaddr_in *)out; + memset(out_in, 0, sizeof(*out_in)); + out_in->sin_family = AF_INET; + memcpy(&out_in->sin_addr.s_addr, gid->raw + 12, 4); + } else { + struct sockaddr_in6 *out_in = (struct sockaddr_in6 *)out; + memset(out_in, 0, sizeof(*out_in)); + out_in->sin6_family = AF_INET6; + memcpy(&out_in->sin6_addr.s6_addr, gid->raw, 16); + } + return 0; +} + +static inline void iboe_addr_get_sgid(struct rdma_dev_addr *dev_addr, + union ib_gid *gid) +{ + struct net_device *dev; + struct in_device *ip4; + + dev = dev_get_by_index(&init_net, dev_addr->bound_dev_if); + if (dev) { + ip4 = (struct in_device *)dev->ip_ptr; + if (ip4 && ip4->ifa_list && ip4->ifa_list->ifa_address) + ipv6_addr_set_v4mapped(ip4->ifa_list->ifa_address, + (struct in6_addr *)gid); + dev_put(dev); + } +} + +static inline void rdma_addr_get_sgid(struct rdma_dev_addr *dev_addr, union ib_gid *gid) +{ + if (dev_addr->transport == RDMA_TRANSPORT_IB && + dev_addr->dev_type != ARPHRD_INFINIBAND) + iboe_addr_get_sgid(dev_addr, gid); + else + memcpy(gid, dev_addr->src_dev_addr + + rdma_addr_gid_offset(dev_addr), sizeof *gid); +} + +static inline void rdma_addr_set_sgid(struct rdma_dev_addr *dev_addr, union ib_gid *gid) +{ + memcpy(dev_addr->src_dev_addr + rdma_addr_gid_offset(dev_addr), gid, sizeof *gid); +} + +static inline void rdma_addr_get_dgid(struct rdma_dev_addr *dev_addr, union ib_gid *gid) +{ + memcpy(gid, dev_addr->dst_dev_addr + rdma_addr_gid_offset(dev_addr), sizeof *gid); +} + +static inline void rdma_addr_set_dgid(struct rdma_dev_addr *dev_addr, union ib_gid *gid) +{ + memcpy(dev_addr->dst_dev_addr + rdma_addr_gid_offset(dev_addr), gid, sizeof *gid); +} + +static inline enum ib_mtu iboe_get_mtu(int mtu) +{ + /* + * reduce IB headers from effective IBoE MTU. 28 stands for + * atomic header which is the biggest possible header after BTH + */ + mtu = mtu - IB_GRH_BYTES - IB_BTH_BYTES - 28; + + if (mtu >= ib_mtu_enum_to_int(IB_MTU_4096)) + return IB_MTU_4096; + else if (mtu >= ib_mtu_enum_to_int(IB_MTU_2048)) + return IB_MTU_2048; + else if (mtu >= ib_mtu_enum_to_int(IB_MTU_1024)) + return IB_MTU_1024; + else if (mtu >= ib_mtu_enum_to_int(IB_MTU_512)) + return IB_MTU_512; + else if (mtu >= ib_mtu_enum_to_int(IB_MTU_256)) + return IB_MTU_256; + else + return 0; +} + +static inline int iboe_get_rate(struct net_device *dev) +{ + struct ethtool_cmd cmd; + u32 speed; + int err; + + rtnl_lock(); + err = __ethtool_get_settings(dev, &cmd); + rtnl_unlock(); + if (err) + return IB_RATE_PORT_CURRENT; + + speed = ethtool_cmd_speed(&cmd); + if (speed >= 40000) + return IB_RATE_40_GBPS; + else if (speed >= 30000) + return IB_RATE_30_GBPS; + else if (speed >= 20000) + return IB_RATE_20_GBPS; + else if (speed >= 10000) + return IB_RATE_10_GBPS; + else + return IB_RATE_PORT_CURRENT; +} + +static inline int rdma_link_local_addr(struct in6_addr *addr) +{ + if (addr->s6_addr32[0] == htonl(0xfe800000) && + addr->s6_addr32[1] == 0) + return 1; + + return 0; +} + +static inline void rdma_get_ll_mac(struct in6_addr *addr, u8 *mac) +{ + memcpy(mac, &addr->s6_addr[8], 3); + memcpy(mac + 3, &addr->s6_addr[13], 3); + mac[0] ^= 2; +} + +static inline int rdma_is_multicast_addr(struct in6_addr *addr) +{ + return addr->s6_addr[0] == 0xff; +} + +static inline void rdma_get_mcast_mac(struct in6_addr *addr, u8 *mac) +{ + int i; + + mac[0] = 0x33; + mac[1] = 0x33; + for (i = 2; i < 6; ++i) + mac[i] = addr->s6_addr[i + 10]; +} + +static inline u16 rdma_get_vlan_id(union ib_gid *dgid) +{ + u16 vid; + + vid = dgid->raw[11] << 8 | dgid->raw[12]; + return vid < 0x1000 ? vid : 0xffff; +} + +static inline struct net_device *rdma_vlan_dev_real_dev(const struct net_device *dev) +{ + return dev->priv_flags & IFF_802_1Q_VLAN ? + vlan_dev_real_dev(dev) : NULL; +} + +#endif /* IB_ADDR_H */ diff --git a/include/rdma/ib_cache.h b/include/rdma/ib_cache.h new file mode 100644 index 0000000..ad9a3c2 --- /dev/null +++ b/include/rdma/ib_cache.h @@ -0,0 +1,132 @@ +/* + * Copyright (c) 2004 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Intel Corporation. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _IB_CACHE_H +#define _IB_CACHE_H + +#include + +/** + * ib_get_cached_gid - Returns a cached GID table entry + * @device: The device to query. + * @port_num: The port number of the device to query. + * @index: The index into the cached GID table to query. + * @gid: The GID value found at the specified index. + * + * ib_get_cached_gid() fetches the specified GID table entry stored in + * the local software cache. + */ +int ib_get_cached_gid(struct ib_device *device, + u8 port_num, + int index, + union ib_gid *gid); + +/** + * ib_find_cached_gid - Returns the port number and GID table index where + * a specified GID value occurs. + * @device: The device to query. + * @gid: The GID value to search for. + * @port_num: The port number of the device where the GID value was found. + * @index: The index into the cached GID table where the GID was found. This + * parameter may be NULL. + * + * ib_find_cached_gid() searches for the specified GID value in + * the local software cache. + */ +int ib_find_cached_gid(struct ib_device *device, + union ib_gid *gid, + u8 *port_num, + u16 *index); + +/** + * ib_get_cached_pkey - Returns a cached PKey table entry + * @device: The device to query. + * @port_num: The port number of the device to query. + * @index: The index into the cached PKey table to query. + * @pkey: The PKey value found at the specified index. + * + * ib_get_cached_pkey() fetches the specified PKey table entry stored in + * the local software cache. + */ +int ib_get_cached_pkey(struct ib_device *device_handle, + u8 port_num, + int index, + u16 *pkey); + +/** + * ib_find_cached_pkey - Returns the PKey table index where a specified + * PKey value occurs. + * @device: The device to query. + * @port_num: The port number of the device to search for the PKey. + * @pkey: The PKey value to search for. + * @index: The index into the cached PKey table where the PKey was found. + * + * ib_find_cached_pkey() searches the specified PKey table in + * the local software cache. + */ +int ib_find_cached_pkey(struct ib_device *device, + u8 port_num, + u16 pkey, + u16 *index); + +/** + * ib_find_exact_cached_pkey - Returns the PKey table index where a specified + * PKey value occurs. Comparison uses the FULL 16 bits (incl membership bit) + * @device: The device to query. + * @port_num: The port number of the device to search for the PKey. + * @pkey: The PKey value to search for. + * @index: The index into the cached PKey table where the PKey was found. + * + * ib_find_exact_cached_pkey() searches the specified PKey table in + * the local software cache. + */ +int ib_find_exact_cached_pkey(struct ib_device *device, + u8 port_num, + u16 pkey, + u16 *index); + +/** + * ib_get_cached_lmc - Returns a cached lmc table entry + * @device: The device to query. + * @port_num: The port number of the device to query. + * @lmc: The lmc value for the specified port for that device. + * + * ib_get_cached_lmc() fetches the specified lmc table entry stored in + * the local software cache. + */ +int ib_get_cached_lmc(struct ib_device *device, + u8 port_num, + u8 *lmc); + +#endif /* _IB_CACHE_H */ diff --git a/include/rdma/ib_cm.h b/include/rdma/ib_cm.h new file mode 100644 index 0000000..0e3ff30 --- /dev/null +++ b/include/rdma/ib_cm.h @@ -0,0 +1,604 @@ +/* + * Copyright (c) 2004, 2005 Intel Corporation. All rights reserved. + * Copyright (c) 2004 Topspin Corporation. All rights reserved. + * Copyright (c) 2004 Voltaire Corporation. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#if !defined(IB_CM_H) +#define IB_CM_H + +#include +#include + +/* ib_cm and ib_user_cm modules share /sys/class/infiniband_cm */ +extern struct class cm_class; + +enum ib_cm_state { + IB_CM_IDLE, + IB_CM_LISTEN, + IB_CM_REQ_SENT, + IB_CM_REQ_RCVD, + IB_CM_MRA_REQ_SENT, + IB_CM_MRA_REQ_RCVD, + IB_CM_REP_SENT, + IB_CM_REP_RCVD, + IB_CM_MRA_REP_SENT, + IB_CM_MRA_REP_RCVD, + IB_CM_ESTABLISHED, + IB_CM_DREQ_SENT, + IB_CM_DREQ_RCVD, + IB_CM_TIMEWAIT, + IB_CM_SIDR_REQ_SENT, + IB_CM_SIDR_REQ_RCVD +}; + +enum ib_cm_lap_state { + IB_CM_LAP_UNINIT, + IB_CM_LAP_IDLE, + IB_CM_LAP_SENT, + IB_CM_LAP_RCVD, + IB_CM_MRA_LAP_SENT, + IB_CM_MRA_LAP_RCVD, +}; + +enum ib_cm_event_type { + IB_CM_REQ_ERROR, + IB_CM_REQ_RECEIVED, + IB_CM_REP_ERROR, + IB_CM_REP_RECEIVED, + IB_CM_RTU_RECEIVED, + IB_CM_USER_ESTABLISHED, + IB_CM_DREQ_ERROR, + IB_CM_DREQ_RECEIVED, + IB_CM_DREP_RECEIVED, + IB_CM_TIMEWAIT_EXIT, + IB_CM_MRA_RECEIVED, + IB_CM_REJ_RECEIVED, + IB_CM_LAP_ERROR, + IB_CM_LAP_RECEIVED, + IB_CM_APR_RECEIVED, + IB_CM_SIDR_REQ_ERROR, + IB_CM_SIDR_REQ_RECEIVED, + IB_CM_SIDR_REP_RECEIVED +}; + +enum ib_cm_data_size { + IB_CM_REQ_PRIVATE_DATA_SIZE = 92, + IB_CM_MRA_PRIVATE_DATA_SIZE = 222, + IB_CM_REJ_PRIVATE_DATA_SIZE = 148, + IB_CM_REP_PRIVATE_DATA_SIZE = 196, + IB_CM_RTU_PRIVATE_DATA_SIZE = 224, + IB_CM_DREQ_PRIVATE_DATA_SIZE = 220, + IB_CM_DREP_PRIVATE_DATA_SIZE = 224, + IB_CM_REJ_ARI_LENGTH = 72, + IB_CM_LAP_PRIVATE_DATA_SIZE = 168, + IB_CM_APR_PRIVATE_DATA_SIZE = 148, + IB_CM_APR_INFO_LENGTH = 72, + IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE = 216, + IB_CM_SIDR_REP_PRIVATE_DATA_SIZE = 136, + IB_CM_SIDR_REP_INFO_LENGTH = 72, + IB_CM_COMPARE_SIZE = 64 +}; + +struct ib_cm_id; + +struct ib_cm_req_event_param { + struct ib_cm_id *listen_id; + u8 port; + + struct ib_sa_path_rec *primary_path; + struct ib_sa_path_rec *alternate_path; + + __be64 remote_ca_guid; + u32 remote_qkey; + u32 remote_qpn; + enum ib_qp_type qp_type; + + u32 starting_psn; + u8 responder_resources; + u8 initiator_depth; + unsigned int local_cm_response_timeout:5; + unsigned int flow_control:1; + unsigned int remote_cm_response_timeout:5; + unsigned int retry_count:3; + unsigned int rnr_retry_count:3; + unsigned int srq:1; +}; + +struct ib_cm_rep_event_param { + __be64 remote_ca_guid; + u32 remote_qkey; + u32 remote_qpn; + u32 starting_psn; + u8 responder_resources; + u8 initiator_depth; + unsigned int target_ack_delay:5; + unsigned int failover_accepted:2; + unsigned int flow_control:1; + unsigned int rnr_retry_count:3; + unsigned int srq:1; +}; + +enum ib_cm_rej_reason { + IB_CM_REJ_NO_QP = 1, + IB_CM_REJ_NO_EEC = 2, + IB_CM_REJ_NO_RESOURCES = 3, + IB_CM_REJ_TIMEOUT = 4, + IB_CM_REJ_UNSUPPORTED = 5, + IB_CM_REJ_INVALID_COMM_ID = 6, + IB_CM_REJ_INVALID_COMM_INSTANCE = 7, + IB_CM_REJ_INVALID_SERVICE_ID = 8, + IB_CM_REJ_INVALID_TRANSPORT_TYPE = 9, + IB_CM_REJ_STALE_CONN = 10, + IB_CM_REJ_RDC_NOT_EXIST = 11, + IB_CM_REJ_INVALID_GID = 12, + IB_CM_REJ_INVALID_LID = 13, + IB_CM_REJ_INVALID_SL = 14, + IB_CM_REJ_INVALID_TRAFFIC_CLASS = 15, + IB_CM_REJ_INVALID_HOP_LIMIT = 16, + IB_CM_REJ_INVALID_PACKET_RATE = 17, + IB_CM_REJ_INVALID_ALT_GID = 18, + IB_CM_REJ_INVALID_ALT_LID = 19, + IB_CM_REJ_INVALID_ALT_SL = 20, + IB_CM_REJ_INVALID_ALT_TRAFFIC_CLASS = 21, + IB_CM_REJ_INVALID_ALT_HOP_LIMIT = 22, + IB_CM_REJ_INVALID_ALT_PACKET_RATE = 23, + IB_CM_REJ_PORT_CM_REDIRECT = 24, + IB_CM_REJ_PORT_REDIRECT = 25, + IB_CM_REJ_INVALID_MTU = 26, + IB_CM_REJ_INSUFFICIENT_RESP_RESOURCES = 27, + IB_CM_REJ_CONSUMER_DEFINED = 28, + IB_CM_REJ_INVALID_RNR_RETRY = 29, + IB_CM_REJ_DUPLICATE_LOCAL_COMM_ID = 30, + IB_CM_REJ_INVALID_CLASS_VERSION = 31, + IB_CM_REJ_INVALID_FLOW_LABEL = 32, + IB_CM_REJ_INVALID_ALT_FLOW_LABEL = 33 +}; + +struct ib_cm_rej_event_param { + enum ib_cm_rej_reason reason; + void *ari; + u8 ari_length; +}; + +struct ib_cm_mra_event_param { + u8 service_timeout; +}; + +struct ib_cm_lap_event_param { + struct ib_sa_path_rec *alternate_path; +}; + +enum ib_cm_apr_status { + IB_CM_APR_SUCCESS, + IB_CM_APR_INVALID_COMM_ID, + IB_CM_APR_UNSUPPORTED, + IB_CM_APR_REJECT, + IB_CM_APR_REDIRECT, + IB_CM_APR_IS_CURRENT, + IB_CM_APR_INVALID_QPN_EECN, + IB_CM_APR_INVALID_LID, + IB_CM_APR_INVALID_GID, + IB_CM_APR_INVALID_FLOW_LABEL, + IB_CM_APR_INVALID_TCLASS, + IB_CM_APR_INVALID_HOP_LIMIT, + IB_CM_APR_INVALID_PACKET_RATE, + IB_CM_APR_INVALID_SL +}; + +struct ib_cm_apr_event_param { + enum ib_cm_apr_status ap_status; + void *apr_info; + u8 info_len; +}; + +struct ib_cm_sidr_req_event_param { + struct ib_cm_id *listen_id; + u8 port; + u16 pkey; +}; + +enum ib_cm_sidr_status { + IB_SIDR_SUCCESS, + IB_SIDR_UNSUPPORTED, + IB_SIDR_REJECT, + IB_SIDR_NO_QP, + IB_SIDR_REDIRECT, + IB_SIDR_UNSUPPORTED_VERSION +}; + +struct ib_cm_sidr_rep_event_param { + enum ib_cm_sidr_status status; + u32 qkey; + u32 qpn; + void *info; + u8 info_len; +}; + +struct ib_cm_event { + enum ib_cm_event_type event; + union { + struct ib_cm_req_event_param req_rcvd; + struct ib_cm_rep_event_param rep_rcvd; + /* No data for RTU received events. */ + struct ib_cm_rej_event_param rej_rcvd; + struct ib_cm_mra_event_param mra_rcvd; + struct ib_cm_lap_event_param lap_rcvd; + struct ib_cm_apr_event_param apr_rcvd; + /* No data for DREQ/DREP received events. */ + struct ib_cm_sidr_req_event_param sidr_req_rcvd; + struct ib_cm_sidr_rep_event_param sidr_rep_rcvd; + enum ib_wc_status send_status; + } param; + + void *private_data; +}; + +#define CM_REQ_ATTR_ID cpu_to_be16(0x0010) +#define CM_MRA_ATTR_ID cpu_to_be16(0x0011) +#define CM_REJ_ATTR_ID cpu_to_be16(0x0012) +#define CM_REP_ATTR_ID cpu_to_be16(0x0013) +#define CM_RTU_ATTR_ID cpu_to_be16(0x0014) +#define CM_DREQ_ATTR_ID cpu_to_be16(0x0015) +#define CM_DREP_ATTR_ID cpu_to_be16(0x0016) +#define CM_SIDR_REQ_ATTR_ID cpu_to_be16(0x0017) +#define CM_SIDR_REP_ATTR_ID cpu_to_be16(0x0018) +#define CM_LAP_ATTR_ID cpu_to_be16(0x0019) +#define CM_APR_ATTR_ID cpu_to_be16(0x001A) + +/** + * ib_cm_handler - User-defined callback to process communication events. + * @cm_id: Communication identifier associated with the reported event. + * @event: Information about the communication event. + * + * IB_CM_REQ_RECEIVED and IB_CM_SIDR_REQ_RECEIVED communication events + * generated as a result of listen requests result in the allocation of a + * new @cm_id. The new @cm_id is returned to the user through this callback. + * Clients are responsible for destroying the new @cm_id. For peer-to-peer + * IB_CM_REQ_RECEIVED and all other events, the returned @cm_id corresponds + * to a user's existing communication identifier. + * + * Users may not call ib_destroy_cm_id while in the context of this callback; + * however, returning a non-zero value instructs the communication manager to + * destroy the @cm_id after the callback completes. + */ +typedef int (*ib_cm_handler)(struct ib_cm_id *cm_id, + struct ib_cm_event *event); + +struct ib_cm_id { + ib_cm_handler cm_handler; + void *context; + struct ib_device *device; + __be64 service_id; + __be64 service_mask; + enum ib_cm_state state; /* internal CM/debug use */ + enum ib_cm_lap_state lap_state; /* internal CM/debug use */ + __be32 local_id; + __be32 remote_id; + u32 remote_cm_qpn; /* 1 unless redirected */ +}; + +/** + * ib_create_cm_id - Allocate a communication identifier. + * @device: Device associated with the cm_id. All related communication will + * be associated with the specified device. + * @cm_handler: Callback invoked to notify the user of CM events. + * @context: User specified context associated with the communication + * identifier. + * + * Communication identifiers are used to track connection states, service + * ID resolution requests, and listen requests. + */ +struct ib_cm_id *ib_create_cm_id(struct ib_device *device, + ib_cm_handler cm_handler, + void *context); + +/** + * ib_destroy_cm_id - Destroy a connection identifier. + * @cm_id: Connection identifier to destroy. + * + * This call blocks until the connection identifier is destroyed. + */ +void ib_destroy_cm_id(struct ib_cm_id *cm_id); + +#define IB_SERVICE_ID_AGN_MASK cpu_to_be64(0xFF00000000000000ULL) +#define IB_CM_ASSIGN_SERVICE_ID cpu_to_be64(0x0200000000000000ULL) +#define IB_CMA_SERVICE_ID cpu_to_be64(0x0000000001000000ULL) +#define IB_CMA_SERVICE_ID_MASK cpu_to_be64(0xFFFFFFFFFF000000ULL) +#define IB_SDP_SERVICE_ID cpu_to_be64(0x0000000000010000ULL) +#define IB_SDP_SERVICE_ID_MASK cpu_to_be64(0xFFFFFFFFFFFF0000ULL) + +struct ib_cm_compare_data { + u8 data[IB_CM_COMPARE_SIZE]; + u8 mask[IB_CM_COMPARE_SIZE]; +}; + +/** + * ib_cm_listen - Initiates listening on the specified service ID for + * connection and service ID resolution requests. + * @cm_id: Connection identifier associated with the listen request. + * @service_id: Service identifier matched against incoming connection + * and service ID resolution requests. The service ID should be specified + * network-byte order. If set to IB_CM_ASSIGN_SERVICE_ID, the CM will + * assign a service ID to the caller. + * @service_mask: Mask applied to service ID used to listen across a + * range of service IDs. If set to 0, the service ID is matched + * exactly. This parameter is ignored if %service_id is set to + * IB_CM_ASSIGN_SERVICE_ID. + * @compare_data: This parameter is optional. It specifies data that must + * appear in the private data of a connection request for the specified + * listen request. + */ +int ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id, __be64 service_mask, + struct ib_cm_compare_data *compare_data); + +struct ib_cm_req_param { + struct ib_sa_path_rec *primary_path; + struct ib_sa_path_rec *alternate_path; + __be64 service_id; + u32 qp_num; + enum ib_qp_type qp_type; + u32 starting_psn; + const void *private_data; + u8 private_data_len; + u8 peer_to_peer; + u8 responder_resources; + u8 initiator_depth; + u8 remote_cm_response_timeout; + u8 flow_control; + u8 local_cm_response_timeout; + u8 retry_count; + u8 rnr_retry_count; + u8 max_cm_retries; + u8 srq; +}; + +/** + * ib_send_cm_req - Sends a connection request to the remote node. + * @cm_id: Connection identifier that will be associated with the + * connection request. + * @param: Connection request information needed to establish the + * connection. + */ +int ib_send_cm_req(struct ib_cm_id *cm_id, + struct ib_cm_req_param *param); + +struct ib_cm_rep_param { + u32 qp_num; + u32 starting_psn; + const void *private_data; + u8 private_data_len; + u8 responder_resources; + u8 initiator_depth; + u8 failover_accepted; + u8 flow_control; + u8 rnr_retry_count; + u8 srq; +}; + +/** + * ib_send_cm_rep - Sends a connection reply in response to a connection + * request. + * @cm_id: Connection identifier that will be associated with the + * connection request. + * @param: Connection reply information needed to establish the + * connection. + */ +int ib_send_cm_rep(struct ib_cm_id *cm_id, + struct ib_cm_rep_param *param); + +/** + * ib_send_cm_rtu - Sends a connection ready to use message in response + * to a connection reply message. + * @cm_id: Connection identifier associated with the connection request. + * @private_data: Optional user-defined private data sent with the + * ready to use message. + * @private_data_len: Size of the private data buffer, in bytes. + */ +int ib_send_cm_rtu(struct ib_cm_id *cm_id, + const void *private_data, + u8 private_data_len); + +/** + * ib_send_cm_dreq - Sends a disconnection request for an existing + * connection. + * @cm_id: Connection identifier associated with the connection being + * released. + * @private_data: Optional user-defined private data sent with the + * disconnection request message. + * @private_data_len: Size of the private data buffer, in bytes. + */ +int ib_send_cm_dreq(struct ib_cm_id *cm_id, + const void *private_data, + u8 private_data_len); + +/** + * ib_send_cm_drep - Sends a disconnection reply to a disconnection request. + * @cm_id: Connection identifier associated with the connection being + * released. + * @private_data: Optional user-defined private data sent with the + * disconnection reply message. + * @private_data_len: Size of the private data buffer, in bytes. + * + * If the cm_id is in the correct state, the CM will transition the connection + * to the timewait state, even if an error occurs sending the DREP message. + */ +int ib_send_cm_drep(struct ib_cm_id *cm_id, + const void *private_data, + u8 private_data_len); + +/** + * ib_cm_notify - Notifies the CM of an event reported to the consumer. + * @cm_id: Connection identifier to transition to established. + * @event: Type of event. + * + * This routine should be invoked by users to notify the CM of relevant + * communication events. Events that should be reported to the CM and + * when to report them are: + * + * IB_EVENT_COMM_EST - Used when a message is received on a connected + * QP before an RTU has been received. + * IB_EVENT_PATH_MIG - Notifies the CM that the connection has failed over + * to the alternate path. + */ +int ib_cm_notify(struct ib_cm_id *cm_id, enum ib_event_type event); + +/** + * ib_send_cm_rej - Sends a connection rejection message to the + * remote node. + * @cm_id: Connection identifier associated with the connection being + * rejected. + * @reason: Reason for the connection request rejection. + * @ari: Optional additional rejection information. + * @ari_length: Size of the additional rejection information, in bytes. + * @private_data: Optional user-defined private data sent with the + * rejection message. + * @private_data_len: Size of the private data buffer, in bytes. + */ +int ib_send_cm_rej(struct ib_cm_id *cm_id, + enum ib_cm_rej_reason reason, + void *ari, + u8 ari_length, + const void *private_data, + u8 private_data_len); + +#define IB_CM_MRA_FLAG_DELAY 0x80 /* Send MRA only after a duplicate msg */ + +/** + * ib_send_cm_mra - Sends a message receipt acknowledgement to a connection + * message. + * @cm_id: Connection identifier associated with the connection message. + * @service_timeout: The lower 5-bits specify the maximum time required for + * the sender to reply to the connection message. The upper 3-bits + * specify additional control flags. + * @private_data: Optional user-defined private data sent with the + * message receipt acknowledgement. + * @private_data_len: Size of the private data buffer, in bytes. + */ +int ib_send_cm_mra(struct ib_cm_id *cm_id, + u8 service_timeout, + const void *private_data, + u8 private_data_len); + +/** + * ib_send_cm_lap - Sends a load alternate path request. + * @cm_id: Connection identifier associated with the load alternate path + * message. + * @alternate_path: A path record that identifies the alternate path to + * load. + * @private_data: Optional user-defined private data sent with the + * load alternate path message. + * @private_data_len: Size of the private data buffer, in bytes. + */ +int ib_send_cm_lap(struct ib_cm_id *cm_id, + struct ib_sa_path_rec *alternate_path, + const void *private_data, + u8 private_data_len); + +/** + * ib_cm_init_qp_attr - Initializes the QP attributes for use in transitioning + * to a specified QP state. + * @cm_id: Communication identifier associated with the QP attributes to + * initialize. + * @qp_attr: On input, specifies the desired QP state. On output, the + * mandatory and desired optional attributes will be set in order to + * modify the QP to the specified state. + * @qp_attr_mask: The QP attribute mask that may be used to transition the + * QP to the specified state. + * + * Users must set the @qp_attr->qp_state to the desired QP state. This call + * will set all required attributes for the given transition, along with + * known optional attributes. Users may override the attributes returned from + * this call before calling ib_modify_qp. + */ +int ib_cm_init_qp_attr(struct ib_cm_id *cm_id, + struct ib_qp_attr *qp_attr, + int *qp_attr_mask); + +/** + * ib_send_cm_apr - Sends an alternate path response message in response to + * a load alternate path request. + * @cm_id: Connection identifier associated with the alternate path response. + * @status: Reply status sent with the alternate path response. + * @info: Optional additional information sent with the alternate path + * response. + * @info_length: Size of the additional information, in bytes. + * @private_data: Optional user-defined private data sent with the + * alternate path response message. + * @private_data_len: Size of the private data buffer, in bytes. + */ +int ib_send_cm_apr(struct ib_cm_id *cm_id, + enum ib_cm_apr_status status, + void *info, + u8 info_length, + const void *private_data, + u8 private_data_len); + +struct ib_cm_sidr_req_param { + struct ib_sa_path_rec *path; + __be64 service_id; + int timeout_ms; + const void *private_data; + u8 private_data_len; + u8 max_cm_retries; +}; + +/** + * ib_send_cm_sidr_req - Sends a service ID resolution request to the + * remote node. + * @cm_id: Communication identifier that will be associated with the + * service ID resolution request. + * @param: Service ID resolution request information. + */ +int ib_send_cm_sidr_req(struct ib_cm_id *cm_id, + struct ib_cm_sidr_req_param *param); + +struct ib_cm_sidr_rep_param { + u32 qp_num; + u32 qkey; + enum ib_cm_sidr_status status; + const void *info; + u8 info_length; + const void *private_data; + u8 private_data_len; +}; + +/** + * ib_send_cm_sidr_rep - Sends a service ID resolution reply to the + * remote node. + * @cm_id: Communication identifier associated with the received service ID + * resolution request. + * @param: Service ID resolution reply information. + */ +int ib_send_cm_sidr_rep(struct ib_cm_id *cm_id, + struct ib_cm_sidr_rep_param *param); + +#endif /* IB_CM_H */ diff --git a/include/rdma/ib_fmr_pool.h b/include/rdma/ib_fmr_pool.h new file mode 100644 index 0000000..f62b842 --- /dev/null +++ b/include/rdma/ib_fmr_pool.h @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2004 Topspin Corporation. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#if !defined(IB_FMR_POOL_H) +#define IB_FMR_POOL_H + +#include + +struct ib_fmr_pool; + +/** + * struct ib_fmr_pool_param - Parameters for creating FMR pool + * @max_pages_per_fmr:Maximum number of pages per map request. + * @page_shift: Log2 of sizeof "pages" mapped by this fmr + * @access:Access flags for FMRs in pool. + * @pool_size:Number of FMRs to allocate for pool. + * @dirty_watermark:Flush is triggered when @dirty_watermark dirty + * FMRs are present. + * @flush_function:Callback called when unmapped FMRs are flushed and + * more FMRs are possibly available for mapping + * @flush_arg:Context passed to user's flush function. + * @cache:If set, FMRs may be reused after unmapping for identical map + * requests. + */ +struct ib_fmr_pool_param { + int max_pages_per_fmr; + int page_shift; + enum ib_access_flags access; + int pool_size; + int dirty_watermark; + void (*flush_function)(struct ib_fmr_pool *pool, + void *arg); + void *flush_arg; + unsigned cache:1; +}; + +struct ib_pool_fmr { + struct ib_fmr *fmr; + struct ib_fmr_pool *pool; + struct list_head list; + struct hlist_node cache_node; + int ref_count; + int remap_count; + u64 io_virtual_address; + int page_list_len; + u64 page_list[0]; +}; + +struct ib_fmr_pool *ib_create_fmr_pool(struct ib_pd *pd, + struct ib_fmr_pool_param *params); + +void ib_destroy_fmr_pool(struct ib_fmr_pool *pool); + +int ib_flush_fmr_pool(struct ib_fmr_pool *pool); + +struct ib_pool_fmr *ib_fmr_pool_map_phys(struct ib_fmr_pool *pool_handle, + u64 *page_list, + int list_len, + u64 io_virtual_address); + +int ib_fmr_pool_unmap(struct ib_pool_fmr *fmr); + +#endif /* IB_FMR_POOL_H */ diff --git a/include/rdma/ib_mad.h b/include/rdma/ib_mad.h new file mode 100644 index 0000000..9bb99e9 --- /dev/null +++ b/include/rdma/ib_mad.h @@ -0,0 +1,680 @@ +/* + * Copyright (c) 2004 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2004 Infinicon Corporation. All rights reserved. + * Copyright (c) 2004 Intel Corporation. All rights reserved. + * Copyright (c) 2004 Topspin Corporation. All rights reserved. + * Copyright (c) 2004-2006 Voltaire Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#if !defined(IB_MAD_H) +#define IB_MAD_H + +#include + +#include +#include + +/* Management base version */ +#define IB_MGMT_BASE_VERSION 1 + +/* Management classes */ +#define IB_MGMT_CLASS_SUBN_LID_ROUTED 0x01 +#define IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE 0x81 +#define IB_MGMT_CLASS_SUBN_ADM 0x03 +#define IB_MGMT_CLASS_PERF_MGMT 0x04 +#define IB_MGMT_CLASS_BM 0x05 +#define IB_MGMT_CLASS_DEVICE_MGMT 0x06 +#define IB_MGMT_CLASS_CM 0x07 +#define IB_MGMT_CLASS_SNMP 0x08 +#define IB_MGMT_CLASS_DEVICE_ADM 0x10 +#define IB_MGMT_CLASS_BOOT_MGMT 0x11 +#define IB_MGMT_CLASS_BIS 0x12 +#define IB_MGMT_CLASS_CONG_MGMT 0x21 +#define IB_MGMT_CLASS_VENDOR_RANGE2_START 0x30 +#define IB_MGMT_CLASS_VENDOR_RANGE2_END 0x4F + +#define IB_OPENIB_OUI (0x001405) + +/* Management methods */ +#define IB_MGMT_METHOD_GET 0x01 +#define IB_MGMT_METHOD_SET 0x02 +#define IB_MGMT_METHOD_GET_RESP 0x81 +#define IB_MGMT_METHOD_SEND 0x03 +#define IB_MGMT_METHOD_TRAP 0x05 +#define IB_MGMT_METHOD_REPORT 0x06 +#define IB_MGMT_METHOD_REPORT_RESP 0x86 +#define IB_MGMT_METHOD_TRAP_REPRESS 0x07 + +#define IB_MGMT_METHOD_RESP 0x80 +#define IB_BM_ATTR_MOD_RESP cpu_to_be32(1) + +#define IB_MGMT_MAX_METHODS 128 + +/* MAD Status field bit masks */ +#define IB_MGMT_MAD_STATUS_SUCCESS 0x0000 +#define IB_MGMT_MAD_STATUS_BUSY 0x0001 +#define IB_MGMT_MAD_STATUS_REDIRECT_REQD 0x0002 +#define IB_MGMT_MAD_STATUS_BAD_VERSION 0x0004 +#define IB_MGMT_MAD_STATUS_UNSUPPORTED_METHOD 0x0008 +#define IB_MGMT_MAD_STATUS_UNSUPPORTED_METHOD_ATTRIB 0x000c +#define IB_MGMT_MAD_STATUS_INVALID_ATTRIB_VALUE 0x001c + +/* RMPP information */ +#define IB_MGMT_RMPP_VERSION 1 + +#define IB_MGMT_RMPP_TYPE_DATA 1 +#define IB_MGMT_RMPP_TYPE_ACK 2 +#define IB_MGMT_RMPP_TYPE_STOP 3 +#define IB_MGMT_RMPP_TYPE_ABORT 4 + +#define IB_MGMT_RMPP_FLAG_ACTIVE 1 +#define IB_MGMT_RMPP_FLAG_FIRST (1<<1) +#define IB_MGMT_RMPP_FLAG_LAST (1<<2) + +#define IB_MGMT_RMPP_NO_RESPTIME 0x1F + +#define IB_MGMT_RMPP_STATUS_SUCCESS 0 +#define IB_MGMT_RMPP_STATUS_RESX 1 +#define IB_MGMT_RMPP_STATUS_ABORT_MIN 118 +#define IB_MGMT_RMPP_STATUS_T2L 118 +#define IB_MGMT_RMPP_STATUS_BAD_LEN 119 +#define IB_MGMT_RMPP_STATUS_BAD_SEG 120 +#define IB_MGMT_RMPP_STATUS_BADT 121 +#define IB_MGMT_RMPP_STATUS_W2S 122 +#define IB_MGMT_RMPP_STATUS_S2B 123 +#define IB_MGMT_RMPP_STATUS_BAD_STATUS 124 +#define IB_MGMT_RMPP_STATUS_UNV 125 +#define IB_MGMT_RMPP_STATUS_TMR 126 +#define IB_MGMT_RMPP_STATUS_UNSPEC 127 +#define IB_MGMT_RMPP_STATUS_ABORT_MAX 127 + +#define IB_QP0 0 +#define IB_QP1 cpu_to_be32(1) +#define IB_QP1_QKEY 0x80010000 +#define IB_QP_SET_QKEY 0x80000000 + +#define IB_DEFAULT_PKEY_PARTIAL 0x7FFF +#define IB_DEFAULT_PKEY_FULL 0xFFFF + +enum { + IB_MGMT_MAD_HDR = 24, + IB_MGMT_MAD_DATA = 232, + IB_MGMT_RMPP_HDR = 36, + IB_MGMT_RMPP_DATA = 220, + IB_MGMT_VENDOR_HDR = 40, + IB_MGMT_VENDOR_DATA = 216, + IB_MGMT_SA_HDR = 56, + IB_MGMT_SA_DATA = 200, + IB_MGMT_DEVICE_HDR = 64, + IB_MGMT_DEVICE_DATA = 192, +}; + +struct ib_mad_hdr { + u8 base_version; + u8 mgmt_class; + u8 class_version; + u8 method; + __be16 status; + __be16 class_specific; + __be64 tid; + __be16 attr_id; + __be16 resv; + __be32 attr_mod; +}; + +struct ib_rmpp_hdr { + u8 rmpp_version; + u8 rmpp_type; + u8 rmpp_rtime_flags; + u8 rmpp_status; + __be32 seg_num; + __be32 paylen_newwin; +}; + +typedef u64 __bitwise ib_sa_comp_mask; + +#define IB_SA_COMP_MASK(n) ((__force ib_sa_comp_mask) cpu_to_be64(1ull << (n))) + +/* + * ib_sa_hdr and ib_sa_mad structures must be packed because they have + * 64-bit fields that are only 32-bit aligned. 64-bit architectures will + * lay them out wrong otherwise. (And unfortunately they are sent on + * the wire so we can't change the layout) + */ +struct ib_sa_hdr { + __be64 sm_key; + __be16 attr_offset; + __be16 reserved; + ib_sa_comp_mask comp_mask; +} __attribute__ ((packed)); + +struct ib_mad { + struct ib_mad_hdr mad_hdr; + u8 data[IB_MGMT_MAD_DATA]; +}; + +struct ib_rmpp_mad { + struct ib_mad_hdr mad_hdr; + struct ib_rmpp_hdr rmpp_hdr; + u8 data[IB_MGMT_RMPP_DATA]; +}; + +struct ib_sa_mad { + struct ib_mad_hdr mad_hdr; + struct ib_rmpp_hdr rmpp_hdr; + struct ib_sa_hdr sa_hdr; + u8 data[IB_MGMT_SA_DATA]; +} __attribute__ ((packed)); + +struct ib_vendor_mad { + struct ib_mad_hdr mad_hdr; + struct ib_rmpp_hdr rmpp_hdr; + u8 reserved; + u8 oui[3]; + u8 data[IB_MGMT_VENDOR_DATA]; +}; + +struct ib_class_port_info { + u8 base_version; + u8 class_version; + __be16 capability_mask; + u8 reserved[3]; + u8 resp_time_value; + u8 redirect_gid[16]; + __be32 redirect_tcslfl; + __be16 redirect_lid; + __be16 redirect_pkey; + __be32 redirect_qp; + __be32 redirect_qkey; + u8 trap_gid[16]; + __be32 trap_tcslfl; + __be16 trap_lid; + __be16 trap_pkey; + __be32 trap_hlqp; + __be32 trap_qkey; +}; + +/** + * ib_mad_send_buf - MAD data buffer and work request for sends. + * @next: A pointer used to chain together MADs for posting. + * @mad: References an allocated MAD data buffer for MADs that do not have + * RMPP active. For MADs using RMPP, references the common and management + * class specific headers. + * @mad_agent: MAD agent that allocated the buffer. + * @ah: The address handle to use when sending the MAD. + * @context: User-controlled context fields. + * @hdr_len: Indicates the size of the data header of the MAD. This length + * includes the common MAD, RMPP, and class specific headers. + * @data_len: Indicates the total size of user-transferred data. + * @seg_count: The number of RMPP segments allocated for this send. + * @seg_size: Size of each RMPP segment. + * @timeout_ms: Time to wait for a response. + * @retries: Number of times to retry a request for a response. For MADs + * using RMPP, this applies per window. On completion, returns the number + * of retries needed to complete the transfer. + * + * Users are responsible for initializing the MAD buffer itself, with the + * exception of any RMPP header. Additional segment buffer space allocated + * beyond data_len is padding. + */ +struct ib_mad_send_buf { + struct ib_mad_send_buf *next; + void *mad; + struct ib_mad_agent *mad_agent; + struct ib_ah *ah; + void *context[2]; + int hdr_len; + int data_len; + int seg_count; + int seg_size; + int timeout_ms; + int retries; +}; + +/** + * ib_response_mad - Returns if the specified MAD has been generated in + * response to a sent request or trap. + */ +int ib_response_mad(struct ib_mad *mad); + +/** + * ib_get_rmpp_resptime - Returns the RMPP response time. + * @rmpp_hdr: An RMPP header. + */ +static inline u8 ib_get_rmpp_resptime(struct ib_rmpp_hdr *rmpp_hdr) +{ + return rmpp_hdr->rmpp_rtime_flags >> 3; +} + +/** + * ib_get_rmpp_flags - Returns the RMPP flags. + * @rmpp_hdr: An RMPP header. + */ +static inline u8 ib_get_rmpp_flags(struct ib_rmpp_hdr *rmpp_hdr) +{ + return rmpp_hdr->rmpp_rtime_flags & 0x7; +} + +/** + * ib_set_rmpp_resptime - Sets the response time in an RMPP header. + * @rmpp_hdr: An RMPP header. + * @rtime: The response time to set. + */ +static inline void ib_set_rmpp_resptime(struct ib_rmpp_hdr *rmpp_hdr, u8 rtime) +{ + rmpp_hdr->rmpp_rtime_flags = ib_get_rmpp_flags(rmpp_hdr) | (rtime << 3); +} + +/** + * ib_set_rmpp_flags - Sets the flags in an RMPP header. + * @rmpp_hdr: An RMPP header. + * @flags: The flags to set. + */ +static inline void ib_set_rmpp_flags(struct ib_rmpp_hdr *rmpp_hdr, u8 flags) +{ + rmpp_hdr->rmpp_rtime_flags = (rmpp_hdr->rmpp_rtime_flags & 0xF8) | + (flags & 0x7); +} + +struct ib_mad_agent; +struct ib_mad_send_wc; +struct ib_mad_recv_wc; + +/** + * ib_mad_send_handler - callback handler for a sent MAD. + * @mad_agent: MAD agent that sent the MAD. + * @mad_send_wc: Send work completion information on the sent MAD. + */ +typedef void (*ib_mad_send_handler)(struct ib_mad_agent *mad_agent, + struct ib_mad_send_wc *mad_send_wc); + +/** + * ib_mad_snoop_handler - Callback handler for snooping sent MADs. + * @mad_agent: MAD agent that snooped the MAD. + * @send_wr: Work request information on the sent MAD. + * @mad_send_wc: Work completion information on the sent MAD. Valid + * only for snooping that occurs on a send completion. + * + * Clients snooping MADs should not modify data referenced by the @send_wr + * or @mad_send_wc. + */ +typedef void (*ib_mad_snoop_handler)(struct ib_mad_agent *mad_agent, + struct ib_mad_send_buf *send_buf, + struct ib_mad_send_wc *mad_send_wc); + +/** + * ib_mad_recv_handler - callback handler for a received MAD. + * @mad_agent: MAD agent requesting the received MAD. + * @mad_recv_wc: Received work completion information on the received MAD. + * + * MADs received in response to a send request operation will be handed to + * the user before the send operation completes. All data buffers given + * to registered agents through this routine are owned by the receiving + * client, except for snooping agents. Clients snooping MADs should not + * modify the data referenced by @mad_recv_wc. + */ +typedef void (*ib_mad_recv_handler)(struct ib_mad_agent *mad_agent, + struct ib_mad_recv_wc *mad_recv_wc); + +/** + * ib_mad_agent - Used to track MAD registration with the access layer. + * @device: Reference to device registration is on. + * @qp: Reference to QP used for sending and receiving MADs. + * @mr: Memory region for system memory usable for DMA. + * @recv_handler: Callback handler for a received MAD. + * @send_handler: Callback handler for a sent MAD. + * @snoop_handler: Callback handler for snooped sent MADs. + * @context: User-specified context associated with this registration. + * @hi_tid: Access layer assigned transaction ID for this client. + * Unsolicited MADs sent by this client will have the upper 32-bits + * of their TID set to this value. + * @flags: registration flags + * @port_num: Port number on which QP is registered + * @rmpp_version: If set, indicates the RMPP version used by this agent. + */ +enum { + IB_MAD_USER_RMPP = IB_USER_MAD_USER_RMPP, +}; +struct ib_mad_agent { + struct ib_device *device; + struct ib_qp *qp; + struct ib_mr *mr; + ib_mad_recv_handler recv_handler; + ib_mad_send_handler send_handler; + ib_mad_snoop_handler snoop_handler; + void *context; + u32 hi_tid; + u32 flags; + u8 port_num; + u8 rmpp_version; +}; + +/** + * ib_mad_send_wc - MAD send completion information. + * @send_buf: Send MAD data buffer associated with the send MAD request. + * @status: Completion status. + * @vendor_err: Optional vendor error information returned with a failed + * request. + */ +struct ib_mad_send_wc { + struct ib_mad_send_buf *send_buf; + enum ib_wc_status status; + u32 vendor_err; +}; + +/** + * ib_mad_recv_buf - received MAD buffer information. + * @list: Reference to next data buffer for a received RMPP MAD. + * @grh: References a data buffer containing the global route header. + * The data refereced by this buffer is only valid if the GRH is + * valid. + * @mad: References the start of the received MAD. + */ +struct ib_mad_recv_buf { + struct list_head list; + struct ib_grh *grh; + struct ib_mad *mad; +}; + +/** + * ib_mad_recv_wc - received MAD information. + * @wc: Completion information for the received data. + * @recv_buf: Specifies the location of the received data buffer(s). + * @rmpp_list: Specifies a list of RMPP reassembled received MAD buffers. + * @mad_len: The length of the received MAD, without duplicated headers. + * + * For received response, the wr_id contains a pointer to the ib_mad_send_buf + * for the corresponding send request. + */ +struct ib_mad_recv_wc { + struct ib_wc *wc; + struct ib_mad_recv_buf recv_buf; + struct list_head rmpp_list; + int mad_len; +}; + +/** + * ib_mad_reg_req - MAD registration request + * @mgmt_class: Indicates which management class of MADs should be receive + * by the caller. This field is only required if the user wishes to + * receive unsolicited MADs, otherwise it should be 0. + * @mgmt_class_version: Indicates which version of MADs for the given + * management class to receive. + * @oui: Indicates IEEE OUI when mgmt_class is a vendor class + * in the range from 0x30 to 0x4f. Otherwise not used. + * @method_mask: The caller will receive unsolicited MADs for any method + * where @method_mask = 1. + * + */ +struct ib_mad_reg_req { + u8 mgmt_class; + u8 mgmt_class_version; + u8 oui[3]; + DECLARE_BITMAP(method_mask, IB_MGMT_MAX_METHODS); +}; + +/** + * ib_register_mad_agent - Register to send/receive MADs. + * @device: The device to register with. + * @port_num: The port on the specified device to use. + * @qp_type: Specifies which QP to access. Must be either + * IB_QPT_SMI or IB_QPT_GSI. + * @mad_reg_req: Specifies which unsolicited MADs should be received + * by the caller. This parameter may be NULL if the caller only + * wishes to receive solicited responses. + * @rmpp_version: If set, indicates that the client will send + * and receive MADs that contain the RMPP header for the given version. + * If set to 0, indicates that RMPP is not used by this client. + * @send_handler: The completion callback routine invoked after a send + * request has completed. + * @recv_handler: The completion callback routine invoked for a received + * MAD. + * @context: User specified context associated with the registration. + * @registration_flags: Registration flags to set for this agent + */ +struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device, + u8 port_num, + enum ib_qp_type qp_type, + struct ib_mad_reg_req *mad_reg_req, + u8 rmpp_version, + ib_mad_send_handler send_handler, + ib_mad_recv_handler recv_handler, + void *context, + u32 registration_flags); + +enum ib_mad_snoop_flags { + /*IB_MAD_SNOOP_POSTED_SENDS = 1,*/ + /*IB_MAD_SNOOP_RMPP_SENDS = (1<<1),*/ + IB_MAD_SNOOP_SEND_COMPLETIONS = (1<<2), + /*IB_MAD_SNOOP_RMPP_SEND_COMPLETIONS = (1<<3),*/ + IB_MAD_SNOOP_RECVS = (1<<4) + /*IB_MAD_SNOOP_RMPP_RECVS = (1<<5),*/ + /*IB_MAD_SNOOP_REDIRECTED_QPS = (1<<6)*/ +}; + +/** + * ib_register_mad_snoop - Register to snoop sent and received MADs. + * @device: The device to register with. + * @port_num: The port on the specified device to use. + * @qp_type: Specifies which QP traffic to snoop. Must be either + * IB_QPT_SMI or IB_QPT_GSI. + * @mad_snoop_flags: Specifies information where snooping occurs. + * @send_handler: The callback routine invoked for a snooped send. + * @recv_handler: The callback routine invoked for a snooped receive. + * @context: User specified context associated with the registration. + */ +struct ib_mad_agent *ib_register_mad_snoop(struct ib_device *device, + u8 port_num, + enum ib_qp_type qp_type, + int mad_snoop_flags, + ib_mad_snoop_handler snoop_handler, + ib_mad_recv_handler recv_handler, + void *context); + +/** + * ib_unregister_mad_agent - Unregisters a client from using MAD services. + * @mad_agent: Corresponding MAD registration request to deregister. + * + * After invoking this routine, MAD services are no longer usable by the + * client on the associated QP. + */ +int ib_unregister_mad_agent(struct ib_mad_agent *mad_agent); + +/** + * ib_post_send_mad - Posts MAD(s) to the send queue of the QP associated + * with the registered client. + * @send_buf: Specifies the information needed to send the MAD(s). + * @bad_send_buf: Specifies the MAD on which an error was encountered. This + * parameter is optional if only a single MAD is posted. + * + * Sent MADs are not guaranteed to complete in the order that they were posted. + * + * If the MAD requires RMPP, the data buffer should contain a single copy + * of the common MAD, RMPP, and class specific headers, followed by the class + * defined data. If the class defined data would not divide evenly into + * RMPP segments, then space must be allocated at the end of the referenced + * buffer for any required padding. To indicate the amount of class defined + * data being transferred, the paylen_newwin field in the RMPP header should + * be set to the size of the class specific header plus the amount of class + * defined data being transferred. The paylen_newwin field should be + * specified in network-byte order. + */ +int ib_post_send_mad(struct ib_mad_send_buf *send_buf, + struct ib_mad_send_buf **bad_send_buf); + + +/** + * ib_free_recv_mad - Returns data buffers used to receive a MAD. + * @mad_recv_wc: Work completion information for a received MAD. + * + * Clients receiving MADs through their ib_mad_recv_handler must call this + * routine to return the work completion buffers to the access layer. + */ +void ib_free_recv_mad(struct ib_mad_recv_wc *mad_recv_wc); + +/** + * ib_cancel_mad - Cancels an outstanding send MAD operation. + * @mad_agent: Specifies the registration associated with sent MAD. + * @send_buf: Indicates the MAD to cancel. + * + * MADs will be returned to the user through the corresponding + * ib_mad_send_handler. + */ +void ib_cancel_mad(struct ib_mad_agent *mad_agent, + struct ib_mad_send_buf *send_buf); + +/** + * ib_modify_mad - Modifies an outstanding send MAD operation. + * @mad_agent: Specifies the registration associated with sent MAD. + * @send_buf: Indicates the MAD to modify. + * @timeout_ms: New timeout value for sent MAD. + * + * This call will reset the timeout value for a sent MAD to the specified + * value. + */ +int ib_modify_mad(struct ib_mad_agent *mad_agent, + struct ib_mad_send_buf *send_buf, u32 timeout_ms); + +/** + * ib_redirect_mad_qp - Registers a QP for MAD services. + * @qp: Reference to a QP that requires MAD services. + * @rmpp_version: If set, indicates that the client will send + * and receive MADs that contain the RMPP header for the given version. + * If set to 0, indicates that RMPP is not used by this client. + * @send_handler: The completion callback routine invoked after a send + * request has completed. + * @recv_handler: The completion callback routine invoked for a received + * MAD. + * @context: User specified context associated with the registration. + * + * Use of this call allows clients to use MAD services, such as RMPP, + * on user-owned QPs. After calling this routine, users may send + * MADs on the specified QP by calling ib_mad_post_send. + */ +struct ib_mad_agent *ib_redirect_mad_qp(struct ib_qp *qp, + u8 rmpp_version, + ib_mad_send_handler send_handler, + ib_mad_recv_handler recv_handler, + void *context); + +/** + * ib_process_mad_wc - Processes a work completion associated with a + * MAD sent or received on a redirected QP. + * @mad_agent: Specifies the registered MAD service using the redirected QP. + * @wc: References a work completion associated with a sent or received + * MAD segment. + * + * This routine is used to complete or continue processing on a MAD request. + * If the work completion is associated with a send operation, calling + * this routine is required to continue an RMPP transfer or to wait for a + * corresponding response, if it is a request. If the work completion is + * associated with a receive operation, calling this routine is required to + * process an inbound or outbound RMPP transfer, or to match a response MAD + * with its corresponding request. + */ +int ib_process_mad_wc(struct ib_mad_agent *mad_agent, + struct ib_wc *wc); + +/** + * ib_create_send_mad - Allocate and initialize a data buffer and work request + * for sending a MAD. + * @mad_agent: Specifies the registered MAD service to associate with the MAD. + * @remote_qpn: Specifies the QPN of the receiving node. + * @pkey_index: Specifies which PKey the MAD will be sent using. This field + * is valid only if the remote_qpn is QP 1. + * @rmpp_active: Indicates if the send will enable RMPP. + * @hdr_len: Indicates the size of the data header of the MAD. This length + * should include the common MAD header, RMPP header, plus any class + * specific header. + * @data_len: Indicates the size of any user-transferred data. The call will + * automatically adjust the allocated buffer size to account for any + * additional padding that may be necessary. + * @gfp_mask: GFP mask used for the memory allocation. + * + * This routine allocates a MAD for sending. The returned MAD send buffer + * will reference a data buffer usable for sending a MAD, along + * with an initialized work request structure. Users may modify the returned + * MAD data buffer before posting the send. + * + * The returned MAD header, class specific headers, and any padding will be + * cleared. Users are responsible for initializing the common MAD header, + * any class specific header, and MAD data area. + * If @rmpp_active is set, the RMPP header will be initialized for sending. + */ +struct ib_mad_send_buf *ib_create_send_mad(struct ib_mad_agent *mad_agent, + u32 remote_qpn, u16 pkey_index, + int rmpp_active, + int hdr_len, int data_len, + gfp_t gfp_mask); + +/** + * ib_is_mad_class_rmpp - returns whether given management class + * supports RMPP. + * @mgmt_class: management class + * + * This routine returns whether the management class supports RMPP. + */ +int ib_is_mad_class_rmpp(u8 mgmt_class); + +/** + * ib_get_mad_data_offset - returns the data offset for a given + * management class. + * @mgmt_class: management class + * + * This routine returns the data offset in the MAD for the management + * class requested. + */ +int ib_get_mad_data_offset(u8 mgmt_class); + +/** + * ib_get_rmpp_segment - returns the data buffer for a given RMPP segment. + * @send_buf: Previously allocated send data buffer. + * @seg_num: number of segment to return + * + * This routine returns a pointer to the data buffer of an RMPP MAD. + * Users must provide synchronization to @send_buf around this call. + */ +void *ib_get_rmpp_segment(struct ib_mad_send_buf *send_buf, int seg_num); + +/** + * ib_free_send_mad - Returns data buffers used to send a MAD. + * @send_buf: Previously allocated send data buffer. + */ +void ib_free_send_mad(struct ib_mad_send_buf *send_buf); + +/** + * ib_mad_kernel_rmpp_agent - Returns if the agent is performing RMPP. + * @agent: the agent in question + * @return: true if agent is performing rmpp, false otherwise. + */ +int ib_mad_kernel_rmpp_agent(struct ib_mad_agent *agent); + +#endif /* IB_MAD_H */ diff --git a/include/rdma/ib_marshall.h b/include/rdma/ib_marshall.h new file mode 100644 index 0000000..db03720 --- /dev/null +++ b/include/rdma/ib_marshall.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2005-2006 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#if !defined(IB_USER_MARSHALL_H) +#define IB_USER_MARSHALL_H + +#include +#include +#include +#include + +void ib_copy_qp_attr_to_user(struct ib_uverbs_qp_attr *dst, + struct ib_qp_attr *src); + +void ib_copy_ah_attr_to_user(struct ib_uverbs_ah_attr *dst, + struct ib_ah_attr *src); + +void ib_copy_path_rec_to_user(struct ib_user_path_rec *dst, + struct ib_sa_path_rec *src); + +void ib_copy_path_rec_from_user(struct ib_sa_path_rec *dst, + struct ib_user_path_rec *src); + +#endif /* IB_USER_MARSHALL_H */ diff --git a/include/rdma/ib_pack.h b/include/rdma/ib_pack.h new file mode 100644 index 0000000..b1f7592 --- /dev/null +++ b/include/rdma/ib_pack.h @@ -0,0 +1,268 @@ +/* + * Copyright (c) 2004 Topspin Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef IB_PACK_H +#define IB_PACK_H + +#include +#include + +enum { + IB_LRH_BYTES = 8, + IB_ETH_BYTES = 14, + IB_VLAN_BYTES = 4, + IB_GRH_BYTES = 40, + IB_BTH_BYTES = 12, + IB_DETH_BYTES = 8 +}; + +struct ib_field { + size_t struct_offset_bytes; + size_t struct_size_bytes; + int offset_words; + int offset_bits; + int size_bits; + char *field_name; +}; + +#define RESERVED \ + .field_name = "reserved" + +/* + * This macro cleans up the definitions of constants for BTH opcodes. + * It is used to define constants such as IB_OPCODE_UD_SEND_ONLY, + * which becomes IB_OPCODE_UD + IB_OPCODE_SEND_ONLY, and this gives + * the correct value. + * + * In short, user code should use the constants defined using the + * macro rather than worrying about adding together other constants. +*/ +#define IB_OPCODE(transport, op) \ + IB_OPCODE_ ## transport ## _ ## op = \ + IB_OPCODE_ ## transport + IB_OPCODE_ ## op + +enum { + /* transport types -- just used to define real constants */ + IB_OPCODE_RC = 0x00, + IB_OPCODE_UC = 0x20, + IB_OPCODE_RD = 0x40, + IB_OPCODE_UD = 0x60, + + /* operations -- just used to define real constants */ + IB_OPCODE_SEND_FIRST = 0x00, + IB_OPCODE_SEND_MIDDLE = 0x01, + IB_OPCODE_SEND_LAST = 0x02, + IB_OPCODE_SEND_LAST_WITH_IMMEDIATE = 0x03, + IB_OPCODE_SEND_ONLY = 0x04, + IB_OPCODE_SEND_ONLY_WITH_IMMEDIATE = 0x05, + IB_OPCODE_RDMA_WRITE_FIRST = 0x06, + IB_OPCODE_RDMA_WRITE_MIDDLE = 0x07, + IB_OPCODE_RDMA_WRITE_LAST = 0x08, + IB_OPCODE_RDMA_WRITE_LAST_WITH_IMMEDIATE = 0x09, + IB_OPCODE_RDMA_WRITE_ONLY = 0x0a, + IB_OPCODE_RDMA_WRITE_ONLY_WITH_IMMEDIATE = 0x0b, + IB_OPCODE_RDMA_READ_REQUEST = 0x0c, + IB_OPCODE_RDMA_READ_RESPONSE_FIRST = 0x0d, + IB_OPCODE_RDMA_READ_RESPONSE_MIDDLE = 0x0e, + IB_OPCODE_RDMA_READ_RESPONSE_LAST = 0x0f, + IB_OPCODE_RDMA_READ_RESPONSE_ONLY = 0x10, + IB_OPCODE_ACKNOWLEDGE = 0x11, + IB_OPCODE_ATOMIC_ACKNOWLEDGE = 0x12, + IB_OPCODE_COMPARE_SWAP = 0x13, + IB_OPCODE_FETCH_ADD = 0x14, + + /* real constants follow -- see comment about above IB_OPCODE() + macro for more details */ + + /* RC */ + IB_OPCODE(RC, SEND_FIRST), + IB_OPCODE(RC, SEND_MIDDLE), + IB_OPCODE(RC, SEND_LAST), + IB_OPCODE(RC, SEND_LAST_WITH_IMMEDIATE), + IB_OPCODE(RC, SEND_ONLY), + IB_OPCODE(RC, SEND_ONLY_WITH_IMMEDIATE), + IB_OPCODE(RC, RDMA_WRITE_FIRST), + IB_OPCODE(RC, RDMA_WRITE_MIDDLE), + IB_OPCODE(RC, RDMA_WRITE_LAST), + IB_OPCODE(RC, RDMA_WRITE_LAST_WITH_IMMEDIATE), + IB_OPCODE(RC, RDMA_WRITE_ONLY), + IB_OPCODE(RC, RDMA_WRITE_ONLY_WITH_IMMEDIATE), + IB_OPCODE(RC, RDMA_READ_REQUEST), + IB_OPCODE(RC, RDMA_READ_RESPONSE_FIRST), + IB_OPCODE(RC, RDMA_READ_RESPONSE_MIDDLE), + IB_OPCODE(RC, RDMA_READ_RESPONSE_LAST), + IB_OPCODE(RC, RDMA_READ_RESPONSE_ONLY), + IB_OPCODE(RC, ACKNOWLEDGE), + IB_OPCODE(RC, ATOMIC_ACKNOWLEDGE), + IB_OPCODE(RC, COMPARE_SWAP), + IB_OPCODE(RC, FETCH_ADD), + + /* UC */ + IB_OPCODE(UC, SEND_FIRST), + IB_OPCODE(UC, SEND_MIDDLE), + IB_OPCODE(UC, SEND_LAST), + IB_OPCODE(UC, SEND_LAST_WITH_IMMEDIATE), + IB_OPCODE(UC, SEND_ONLY), + IB_OPCODE(UC, SEND_ONLY_WITH_IMMEDIATE), + IB_OPCODE(UC, RDMA_WRITE_FIRST), + IB_OPCODE(UC, RDMA_WRITE_MIDDLE), + IB_OPCODE(UC, RDMA_WRITE_LAST), + IB_OPCODE(UC, RDMA_WRITE_LAST_WITH_IMMEDIATE), + IB_OPCODE(UC, RDMA_WRITE_ONLY), + IB_OPCODE(UC, RDMA_WRITE_ONLY_WITH_IMMEDIATE), + + /* RD */ + IB_OPCODE(RD, SEND_FIRST), + IB_OPCODE(RD, SEND_MIDDLE), + IB_OPCODE(RD, SEND_LAST), + IB_OPCODE(RD, SEND_LAST_WITH_IMMEDIATE), + IB_OPCODE(RD, SEND_ONLY), + IB_OPCODE(RD, SEND_ONLY_WITH_IMMEDIATE), + IB_OPCODE(RD, RDMA_WRITE_FIRST), + IB_OPCODE(RD, RDMA_WRITE_MIDDLE), + IB_OPCODE(RD, RDMA_WRITE_LAST), + IB_OPCODE(RD, RDMA_WRITE_LAST_WITH_IMMEDIATE), + IB_OPCODE(RD, RDMA_WRITE_ONLY), + IB_OPCODE(RD, RDMA_WRITE_ONLY_WITH_IMMEDIATE), + IB_OPCODE(RD, RDMA_READ_REQUEST), + IB_OPCODE(RD, RDMA_READ_RESPONSE_FIRST), + IB_OPCODE(RD, RDMA_READ_RESPONSE_MIDDLE), + IB_OPCODE(RD, RDMA_READ_RESPONSE_LAST), + IB_OPCODE(RD, RDMA_READ_RESPONSE_ONLY), + IB_OPCODE(RD, ACKNOWLEDGE), + IB_OPCODE(RD, ATOMIC_ACKNOWLEDGE), + IB_OPCODE(RD, COMPARE_SWAP), + IB_OPCODE(RD, FETCH_ADD), + + /* UD */ + IB_OPCODE(UD, SEND_ONLY), + IB_OPCODE(UD, SEND_ONLY_WITH_IMMEDIATE) +}; + +enum { + IB_LNH_RAW = 0, + IB_LNH_IP = 1, + IB_LNH_IBA_LOCAL = 2, + IB_LNH_IBA_GLOBAL = 3 +}; + +struct ib_unpacked_lrh { + u8 virtual_lane; + u8 link_version; + u8 service_level; + u8 link_next_header; + __be16 destination_lid; + __be16 packet_length; + __be16 source_lid; +}; + +struct ib_unpacked_grh { + u8 ip_version; + u8 traffic_class; + __be32 flow_label; + __be16 payload_length; + u8 next_header; + u8 hop_limit; + union ib_gid source_gid; + union ib_gid destination_gid; +}; + +struct ib_unpacked_bth { + u8 opcode; + u8 solicited_event; + u8 mig_req; + u8 pad_count; + u8 transport_header_version; + __be16 pkey; + __be32 destination_qpn; + u8 ack_req; + __be32 psn; +}; + +struct ib_unpacked_deth { + __be32 qkey; + __be32 source_qpn; +}; + +struct ib_unpacked_eth { + u8 dmac_h[4]; + u8 dmac_l[2]; + u8 smac_h[2]; + u8 smac_l[4]; + __be16 type; +}; + +struct ib_unpacked_vlan { + __be16 tag; + __be16 type; +}; + +struct ib_ud_header { + int lrh_present; + struct ib_unpacked_lrh lrh; + int eth_present; + struct ib_unpacked_eth eth; + int vlan_present; + struct ib_unpacked_vlan vlan; + int grh_present; + struct ib_unpacked_grh grh; + struct ib_unpacked_bth bth; + struct ib_unpacked_deth deth; + int immediate_present; + __be32 immediate_data; +}; + +void ib_pack(const struct ib_field *desc, + int desc_len, + void *structure, + void *buf); + +void ib_unpack(const struct ib_field *desc, + int desc_len, + void *buf, + void *structure); + +void ib_ud_header_init(int payload_bytes, + int lrh_present, + int eth_present, + int vlan_present, + int grh_present, + int immediate_present, + struct ib_ud_header *header); + +int ib_ud_header_pack(struct ib_ud_header *header, + void *buf); + +int ib_ud_header_unpack(void *buf, + struct ib_ud_header *header); + +#endif /* IB_PACK_H */ diff --git a/include/rdma/ib_pma.h b/include/rdma/ib_pma.h new file mode 100644 index 0000000..a5889f1 --- /dev/null +++ b/include/rdma/ib_pma.h @@ -0,0 +1,156 @@ +/* + * Copyright (c) 2006, 2007, 2008, 2009, 2010 QLogic Corporation. + * All rights reserved. + * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#if !defined(IB_PMA_H) +#define IB_PMA_H + +#include + +/* + * PMA class portinfo capability mask bits + */ +#define IB_PMA_CLASS_CAP_ALLPORTSELECT cpu_to_be16(1 << 8) +#define IB_PMA_CLASS_CAP_EXT_WIDTH cpu_to_be16(1 << 9) +#define IB_PMA_CLASS_CAP_XMIT_WAIT cpu_to_be16(1 << 12) + +#define IB_PMA_CLASS_PORT_INFO cpu_to_be16(0x0001) +#define IB_PMA_PORT_SAMPLES_CONTROL cpu_to_be16(0x0010) +#define IB_PMA_PORT_SAMPLES_RESULT cpu_to_be16(0x0011) +#define IB_PMA_PORT_COUNTERS cpu_to_be16(0x0012) +#define IB_PMA_PORT_COUNTERS_EXT cpu_to_be16(0x001D) +#define IB_PMA_PORT_SAMPLES_RESULT_EXT cpu_to_be16(0x001E) + +struct ib_pma_mad { + struct ib_mad_hdr mad_hdr; + u8 reserved[40]; + u8 data[192]; +} __packed; + +struct ib_pma_portsamplescontrol { + u8 opcode; + u8 port_select; + u8 tick; + u8 counter_width; /* resv: 7:3, counter width: 2:0 */ + __be32 counter_mask0_9; /* 2, 10 3-bit fields */ + __be16 counter_mask10_14; /* 1, 5 3-bit fields */ + u8 sample_mechanisms; + u8 sample_status; /* only lower 2 bits */ + __be64 option_mask; + __be64 vendor_mask; + __be32 sample_start; + __be32 sample_interval; + __be16 tag; + __be16 counter_select[15]; + __be32 reserved1; + __be64 samples_only_option_mask; + __be32 reserved2[28]; +}; + +struct ib_pma_portsamplesresult { + __be16 tag; + __be16 sample_status; /* only lower 2 bits */ + __be32 counter[15]; +}; + +struct ib_pma_portsamplesresult_ext { + __be16 tag; + __be16 sample_status; /* only lower 2 bits */ + __be32 extended_width; /* only upper 2 bits */ + __be64 counter[15]; +}; + +struct ib_pma_portcounters { + u8 reserved; + u8 port_select; + __be16 counter_select; + __be16 symbol_error_counter; + u8 link_error_recovery_counter; + u8 link_downed_counter; + __be16 port_rcv_errors; + __be16 port_rcv_remphys_errors; + __be16 port_rcv_switch_relay_errors; + __be16 port_xmit_discards; + u8 port_xmit_constraint_errors; + u8 port_rcv_constraint_errors; + u8 reserved1; + u8 link_overrun_errors; /* LocalLink: 7:4, BufferOverrun: 3:0 */ + __be16 reserved2; + __be16 vl15_dropped; + __be32 port_xmit_data; + __be32 port_rcv_data; + __be32 port_xmit_packets; + __be32 port_rcv_packets; + __be32 port_xmit_wait; +} __packed; + + +#define IB_PMA_SEL_SYMBOL_ERROR cpu_to_be16(0x0001) +#define IB_PMA_SEL_LINK_ERROR_RECOVERY cpu_to_be16(0x0002) +#define IB_PMA_SEL_LINK_DOWNED cpu_to_be16(0x0004) +#define IB_PMA_SEL_PORT_RCV_ERRORS cpu_to_be16(0x0008) +#define IB_PMA_SEL_PORT_RCV_REMPHYS_ERRORS cpu_to_be16(0x0010) +#define IB_PMA_SEL_PORT_XMIT_DISCARDS cpu_to_be16(0x0040) +#define IB_PMA_SEL_LOCAL_LINK_INTEGRITY_ERRORS cpu_to_be16(0x0200) +#define IB_PMA_SEL_EXCESSIVE_BUFFER_OVERRUNS cpu_to_be16(0x0400) +#define IB_PMA_SEL_PORT_VL15_DROPPED cpu_to_be16(0x0800) +#define IB_PMA_SEL_PORT_XMIT_DATA cpu_to_be16(0x1000) +#define IB_PMA_SEL_PORT_RCV_DATA cpu_to_be16(0x2000) +#define IB_PMA_SEL_PORT_XMIT_PACKETS cpu_to_be16(0x4000) +#define IB_PMA_SEL_PORT_RCV_PACKETS cpu_to_be16(0x8000) + +struct ib_pma_portcounters_ext { + u8 reserved; + u8 port_select; + __be16 counter_select; + __be32 reserved1; + __be64 port_xmit_data; + __be64 port_rcv_data; + __be64 port_xmit_packets; + __be64 port_rcv_packets; + __be64 port_unicast_xmit_packets; + __be64 port_unicast_rcv_packets; + __be64 port_multicast_xmit_packets; + __be64 port_multicast_rcv_packets; +} __packed; + +#define IB_PMA_SELX_PORT_XMIT_DATA cpu_to_be16(0x0001) +#define IB_PMA_SELX_PORT_RCV_DATA cpu_to_be16(0x0002) +#define IB_PMA_SELX_PORT_XMIT_PACKETS cpu_to_be16(0x0004) +#define IB_PMA_SELX_PORT_RCV_PACKETS cpu_to_be16(0x0008) +#define IB_PMA_SELX_PORT_UNI_XMIT_PACKETS cpu_to_be16(0x0010) +#define IB_PMA_SELX_PORT_UNI_RCV_PACKETS cpu_to_be16(0x0020) +#define IB_PMA_SELX_PORT_MULTI_XMIT_PACKETS cpu_to_be16(0x0040) +#define IB_PMA_SELX_PORT_MULTI_RCV_PACKETS cpu_to_be16(0x0080) + +#endif /* IB_PMA_H */ diff --git a/include/rdma/ib_sa.h b/include/rdma/ib_sa.h new file mode 100644 index 0000000..7e071a6 --- /dev/null +++ b/include/rdma/ib_sa.h @@ -0,0 +1,431 @@ +/* + * Copyright (c) 2004 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Voltaire, Inc. All rights reserved. + * Copyright (c) 2006 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef IB_SA_H +#define IB_SA_H + +#include +#include + +#include + +#include +#include + +enum { + IB_SA_CLASS_VERSION = 2, /* IB spec version 1.1/1.2 */ + + IB_SA_METHOD_GET_TABLE = 0x12, + IB_SA_METHOD_GET_TABLE_RESP = 0x92, + IB_SA_METHOD_DELETE = 0x15, + IB_SA_METHOD_DELETE_RESP = 0x95, + IB_SA_METHOD_GET_MULTI = 0x14, + IB_SA_METHOD_GET_MULTI_RESP = 0x94, + IB_SA_METHOD_GET_TRACE_TBL = 0x13 +}; + +enum { + IB_SA_ATTR_CLASS_PORTINFO = 0x01, + IB_SA_ATTR_NOTICE = 0x02, + IB_SA_ATTR_INFORM_INFO = 0x03, + IB_SA_ATTR_NODE_REC = 0x11, + IB_SA_ATTR_PORT_INFO_REC = 0x12, + IB_SA_ATTR_SL2VL_REC = 0x13, + IB_SA_ATTR_SWITCH_REC = 0x14, + IB_SA_ATTR_LINEAR_FDB_REC = 0x15, + IB_SA_ATTR_RANDOM_FDB_REC = 0x16, + IB_SA_ATTR_MCAST_FDB_REC = 0x17, + IB_SA_ATTR_SM_INFO_REC = 0x18, + IB_SA_ATTR_LINK_REC = 0x20, + IB_SA_ATTR_GUID_INFO_REC = 0x30, + IB_SA_ATTR_SERVICE_REC = 0x31, + IB_SA_ATTR_PARTITION_REC = 0x33, + IB_SA_ATTR_PATH_REC = 0x35, + IB_SA_ATTR_VL_ARB_REC = 0x36, + IB_SA_ATTR_MC_MEMBER_REC = 0x38, + IB_SA_ATTR_TRACE_REC = 0x39, + IB_SA_ATTR_MULTI_PATH_REC = 0x3a, + IB_SA_ATTR_SERVICE_ASSOC_REC = 0x3b, + IB_SA_ATTR_INFORM_INFO_REC = 0xf3 +}; + +enum ib_sa_selector { + IB_SA_GT = 0, + IB_SA_LT = 1, + IB_SA_EQ = 2, + /* + * The meaning of "best" depends on the attribute: for + * example, for MTU best will return the largest available + * MTU, while for packet life time, best will return the + * smallest available life time. + */ + IB_SA_BEST = 3 +}; + +/* + * Structures for SA records are named "struct ib_sa_xxx_rec." No + * attempt is made to pack structures to match the physical layout of + * SA records in SA MADs; all packing and unpacking is handled by the + * SA query code. + * + * For a record with structure ib_sa_xxx_rec, the naming convention + * for the component mask value for field yyy is IB_SA_XXX_REC_YYY (we + * never use different abbreviations or otherwise change the spelling + * of xxx/yyy between ib_sa_xxx_rec.yyy and IB_SA_XXX_REC_YYY). + * + * Reserved rows are indicated with comments to help maintainability. + */ + +#define IB_SA_PATH_REC_SERVICE_ID (IB_SA_COMP_MASK( 0) |\ + IB_SA_COMP_MASK( 1)) +#define IB_SA_PATH_REC_DGID IB_SA_COMP_MASK( 2) +#define IB_SA_PATH_REC_SGID IB_SA_COMP_MASK( 3) +#define IB_SA_PATH_REC_DLID IB_SA_COMP_MASK( 4) +#define IB_SA_PATH_REC_SLID IB_SA_COMP_MASK( 5) +#define IB_SA_PATH_REC_RAW_TRAFFIC IB_SA_COMP_MASK( 6) +/* reserved: 7 */ +#define IB_SA_PATH_REC_FLOW_LABEL IB_SA_COMP_MASK( 8) +#define IB_SA_PATH_REC_HOP_LIMIT IB_SA_COMP_MASK( 9) +#define IB_SA_PATH_REC_TRAFFIC_CLASS IB_SA_COMP_MASK(10) +#define IB_SA_PATH_REC_REVERSIBLE IB_SA_COMP_MASK(11) +#define IB_SA_PATH_REC_NUMB_PATH IB_SA_COMP_MASK(12) +#define IB_SA_PATH_REC_PKEY IB_SA_COMP_MASK(13) +#define IB_SA_PATH_REC_QOS_CLASS IB_SA_COMP_MASK(14) +#define IB_SA_PATH_REC_SL IB_SA_COMP_MASK(15) +#define IB_SA_PATH_REC_MTU_SELECTOR IB_SA_COMP_MASK(16) +#define IB_SA_PATH_REC_MTU IB_SA_COMP_MASK(17) +#define IB_SA_PATH_REC_RATE_SELECTOR IB_SA_COMP_MASK(18) +#define IB_SA_PATH_REC_RATE IB_SA_COMP_MASK(19) +#define IB_SA_PATH_REC_PACKET_LIFE_TIME_SELECTOR IB_SA_COMP_MASK(20) +#define IB_SA_PATH_REC_PACKET_LIFE_TIME IB_SA_COMP_MASK(21) +#define IB_SA_PATH_REC_PREFERENCE IB_SA_COMP_MASK(22) + +struct ib_sa_path_rec { + __be64 service_id; + union ib_gid dgid; + union ib_gid sgid; + __be16 dlid; + __be16 slid; + int raw_traffic; + /* reserved */ + __be32 flow_label; + u8 hop_limit; + u8 traffic_class; + int reversible; + u8 numb_path; + __be16 pkey; + __be16 qos_class; + u8 sl; + u8 mtu_selector; + u8 mtu; + u8 rate_selector; + u8 rate; + u8 packet_life_time_selector; + u8 packet_life_time; + u8 preference; + u8 smac[ETH_ALEN]; + u8 dmac[ETH_ALEN]; + u16 vlan_id; +}; + +#define IB_SA_MCMEMBER_REC_MGID IB_SA_COMP_MASK( 0) +#define IB_SA_MCMEMBER_REC_PORT_GID IB_SA_COMP_MASK( 1) +#define IB_SA_MCMEMBER_REC_QKEY IB_SA_COMP_MASK( 2) +#define IB_SA_MCMEMBER_REC_MLID IB_SA_COMP_MASK( 3) +#define IB_SA_MCMEMBER_REC_MTU_SELECTOR IB_SA_COMP_MASK( 4) +#define IB_SA_MCMEMBER_REC_MTU IB_SA_COMP_MASK( 5) +#define IB_SA_MCMEMBER_REC_TRAFFIC_CLASS IB_SA_COMP_MASK( 6) +#define IB_SA_MCMEMBER_REC_PKEY IB_SA_COMP_MASK( 7) +#define IB_SA_MCMEMBER_REC_RATE_SELECTOR IB_SA_COMP_MASK( 8) +#define IB_SA_MCMEMBER_REC_RATE IB_SA_COMP_MASK( 9) +#define IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME_SELECTOR IB_SA_COMP_MASK(10) +#define IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME IB_SA_COMP_MASK(11) +#define IB_SA_MCMEMBER_REC_SL IB_SA_COMP_MASK(12) +#define IB_SA_MCMEMBER_REC_FLOW_LABEL IB_SA_COMP_MASK(13) +#define IB_SA_MCMEMBER_REC_HOP_LIMIT IB_SA_COMP_MASK(14) +#define IB_SA_MCMEMBER_REC_SCOPE IB_SA_COMP_MASK(15) +#define IB_SA_MCMEMBER_REC_JOIN_STATE IB_SA_COMP_MASK(16) +#define IB_SA_MCMEMBER_REC_PROXY_JOIN IB_SA_COMP_MASK(17) + +struct ib_sa_mcmember_rec { + union ib_gid mgid; + union ib_gid port_gid; + __be32 qkey; + __be16 mlid; + u8 mtu_selector; + u8 mtu; + u8 traffic_class; + __be16 pkey; + u8 rate_selector; + u8 rate; + u8 packet_life_time_selector; + u8 packet_life_time; + u8 sl; + __be32 flow_label; + u8 hop_limit; + u8 scope; + u8 join_state; + int proxy_join; +}; + +/* Service Record Component Mask Sec 15.2.5.14 Ver 1.1 */ +#define IB_SA_SERVICE_REC_SERVICE_ID IB_SA_COMP_MASK( 0) +#define IB_SA_SERVICE_REC_SERVICE_GID IB_SA_COMP_MASK( 1) +#define IB_SA_SERVICE_REC_SERVICE_PKEY IB_SA_COMP_MASK( 2) +/* reserved: 3 */ +#define IB_SA_SERVICE_REC_SERVICE_LEASE IB_SA_COMP_MASK( 4) +#define IB_SA_SERVICE_REC_SERVICE_KEY IB_SA_COMP_MASK( 5) +#define IB_SA_SERVICE_REC_SERVICE_NAME IB_SA_COMP_MASK( 6) +#define IB_SA_SERVICE_REC_SERVICE_DATA8_0 IB_SA_COMP_MASK( 7) +#define IB_SA_SERVICE_REC_SERVICE_DATA8_1 IB_SA_COMP_MASK( 8) +#define IB_SA_SERVICE_REC_SERVICE_DATA8_2 IB_SA_COMP_MASK( 9) +#define IB_SA_SERVICE_REC_SERVICE_DATA8_3 IB_SA_COMP_MASK(10) +#define IB_SA_SERVICE_REC_SERVICE_DATA8_4 IB_SA_COMP_MASK(11) +#define IB_SA_SERVICE_REC_SERVICE_DATA8_5 IB_SA_COMP_MASK(12) +#define IB_SA_SERVICE_REC_SERVICE_DATA8_6 IB_SA_COMP_MASK(13) +#define IB_SA_SERVICE_REC_SERVICE_DATA8_7 IB_SA_COMP_MASK(14) +#define IB_SA_SERVICE_REC_SERVICE_DATA8_8 IB_SA_COMP_MASK(15) +#define IB_SA_SERVICE_REC_SERVICE_DATA8_9 IB_SA_COMP_MASK(16) +#define IB_SA_SERVICE_REC_SERVICE_DATA8_10 IB_SA_COMP_MASK(17) +#define IB_SA_SERVICE_REC_SERVICE_DATA8_11 IB_SA_COMP_MASK(18) +#define IB_SA_SERVICE_REC_SERVICE_DATA8_12 IB_SA_COMP_MASK(19) +#define IB_SA_SERVICE_REC_SERVICE_DATA8_13 IB_SA_COMP_MASK(20) +#define IB_SA_SERVICE_REC_SERVICE_DATA8_14 IB_SA_COMP_MASK(21) +#define IB_SA_SERVICE_REC_SERVICE_DATA8_15 IB_SA_COMP_MASK(22) +#define IB_SA_SERVICE_REC_SERVICE_DATA16_0 IB_SA_COMP_MASK(23) +#define IB_SA_SERVICE_REC_SERVICE_DATA16_1 IB_SA_COMP_MASK(24) +#define IB_SA_SERVICE_REC_SERVICE_DATA16_2 IB_SA_COMP_MASK(25) +#define IB_SA_SERVICE_REC_SERVICE_DATA16_3 IB_SA_COMP_MASK(26) +#define IB_SA_SERVICE_REC_SERVICE_DATA16_4 IB_SA_COMP_MASK(27) +#define IB_SA_SERVICE_REC_SERVICE_DATA16_5 IB_SA_COMP_MASK(28) +#define IB_SA_SERVICE_REC_SERVICE_DATA16_6 IB_SA_COMP_MASK(29) +#define IB_SA_SERVICE_REC_SERVICE_DATA16_7 IB_SA_COMP_MASK(30) +#define IB_SA_SERVICE_REC_SERVICE_DATA32_0 IB_SA_COMP_MASK(31) +#define IB_SA_SERVICE_REC_SERVICE_DATA32_1 IB_SA_COMP_MASK(32) +#define IB_SA_SERVICE_REC_SERVICE_DATA32_2 IB_SA_COMP_MASK(33) +#define IB_SA_SERVICE_REC_SERVICE_DATA32_3 IB_SA_COMP_MASK(34) +#define IB_SA_SERVICE_REC_SERVICE_DATA64_0 IB_SA_COMP_MASK(35) +#define IB_SA_SERVICE_REC_SERVICE_DATA64_1 IB_SA_COMP_MASK(36) + +#define IB_DEFAULT_SERVICE_LEASE 0xFFFFFFFF + +struct ib_sa_service_rec { + u64 id; + union ib_gid gid; + __be16 pkey; + /* reserved */ + u32 lease; + u8 key[16]; + u8 name[64]; + u8 data8[16]; + u16 data16[8]; + u32 data32[4]; + u64 data64[2]; +}; + +#define IB_SA_GUIDINFO_REC_LID IB_SA_COMP_MASK(0) +#define IB_SA_GUIDINFO_REC_BLOCK_NUM IB_SA_COMP_MASK(1) +#define IB_SA_GUIDINFO_REC_RES1 IB_SA_COMP_MASK(2) +#define IB_SA_GUIDINFO_REC_RES2 IB_SA_COMP_MASK(3) +#define IB_SA_GUIDINFO_REC_GID0 IB_SA_COMP_MASK(4) +#define IB_SA_GUIDINFO_REC_GID1 IB_SA_COMP_MASK(5) +#define IB_SA_GUIDINFO_REC_GID2 IB_SA_COMP_MASK(6) +#define IB_SA_GUIDINFO_REC_GID3 IB_SA_COMP_MASK(7) +#define IB_SA_GUIDINFO_REC_GID4 IB_SA_COMP_MASK(8) +#define IB_SA_GUIDINFO_REC_GID5 IB_SA_COMP_MASK(9) +#define IB_SA_GUIDINFO_REC_GID6 IB_SA_COMP_MASK(10) +#define IB_SA_GUIDINFO_REC_GID7 IB_SA_COMP_MASK(11) + +struct ib_sa_guidinfo_rec { + __be16 lid; + u8 block_num; + /* reserved */ + u8 res1; + __be32 res2; + u8 guid_info_list[64]; +}; + +struct ib_sa_client { + atomic_t users; + struct completion comp; +}; + +/** + * ib_sa_register_client - Register an SA client. + */ +void ib_sa_register_client(struct ib_sa_client *client); + +/** + * ib_sa_unregister_client - Deregister an SA client. + * @client: Client object to deregister. + */ +void ib_sa_unregister_client(struct ib_sa_client *client); + +struct ib_sa_query; + +void ib_sa_cancel_query(int id, struct ib_sa_query *query); + +int ib_sa_path_rec_get(struct ib_sa_client *client, + struct ib_device *device, u8 port_num, + struct ib_sa_path_rec *rec, + ib_sa_comp_mask comp_mask, + int timeout_ms, gfp_t gfp_mask, + void (*callback)(int status, + struct ib_sa_path_rec *resp, + void *context), + void *context, + struct ib_sa_query **query); + +int ib_sa_service_rec_query(struct ib_sa_client *client, + struct ib_device *device, u8 port_num, + u8 method, + struct ib_sa_service_rec *rec, + ib_sa_comp_mask comp_mask, + int timeout_ms, gfp_t gfp_mask, + void (*callback)(int status, + struct ib_sa_service_rec *resp, + void *context), + void *context, + struct ib_sa_query **sa_query); + +struct ib_sa_multicast { + struct ib_sa_mcmember_rec rec; + ib_sa_comp_mask comp_mask; + int (*callback)(int status, + struct ib_sa_multicast *multicast); + void *context; +}; + +/** + * ib_sa_join_multicast - Initiates a join request to the specified multicast + * group. + * @client: SA client + * @device: Device associated with the multicast group. + * @port_num: Port on the specified device to associate with the multicast + * group. + * @rec: SA multicast member record specifying group attributes. + * @comp_mask: Component mask indicating which group attributes of %rec are + * valid. + * @gfp_mask: GFP mask for memory allocations. + * @callback: User callback invoked once the join operation completes. + * @context: User specified context stored with the ib_sa_multicast structure. + * + * This call initiates a multicast join request with the SA for the specified + * multicast group. If the join operation is started successfully, it returns + * an ib_sa_multicast structure that is used to track the multicast operation. + * Users must free this structure by calling ib_free_multicast, even if the + * join operation later fails. (The callback status is non-zero.) + * + * If the join operation fails; status will be non-zero, with the following + * failures possible: + * -ETIMEDOUT: The request timed out. + * -EIO: An error occurred sending the query. + * -EINVAL: The MCMemberRecord values differed from the existing group's. + * -ENETRESET: Indicates that an fatal error has occurred on the multicast + * group, and the user must rejoin the group to continue using it. + */ +struct ib_sa_multicast *ib_sa_join_multicast(struct ib_sa_client *client, + struct ib_device *device, u8 port_num, + struct ib_sa_mcmember_rec *rec, + ib_sa_comp_mask comp_mask, gfp_t gfp_mask, + int (*callback)(int status, + struct ib_sa_multicast + *multicast), + void *context); + +/** + * ib_free_multicast - Frees the multicast tracking structure, and releases + * any reference on the multicast group. + * @multicast: Multicast tracking structure allocated by ib_join_multicast. + * + * This call blocks until the multicast identifier is destroyed. It may + * not be called from within the multicast callback; however, returning a non- + * zero value from the callback will result in destroying the multicast + * tracking structure. + */ +void ib_sa_free_multicast(struct ib_sa_multicast *multicast); + +/** + * ib_get_mcmember_rec - Looks up a multicast member record by its MGID and + * returns it if found. + * @device: Device associated with the multicast group. + * @port_num: Port on the specified device to associate with the multicast + * group. + * @mgid: MGID of multicast group. + * @rec: Location to copy SA multicast member record. + */ +int ib_sa_get_mcmember_rec(struct ib_device *device, u8 port_num, + union ib_gid *mgid, struct ib_sa_mcmember_rec *rec); + +/** + * ib_init_ah_from_mcmember - Initialize address handle attributes based on + * an SA multicast member record. + */ +int ib_init_ah_from_mcmember(struct ib_device *device, u8 port_num, + struct ib_sa_mcmember_rec *rec, + struct ib_ah_attr *ah_attr); + +/** + * ib_init_ah_from_path - Initialize address handle attributes based on an SA + * path record. + */ +int ib_init_ah_from_path(struct ib_device *device, u8 port_num, + struct ib_sa_path_rec *rec, + struct ib_ah_attr *ah_attr); + +/** + * ib_sa_pack_path - Conert a path record from struct ib_sa_path_rec + * to IB MAD wire format. + */ +void ib_sa_pack_path(struct ib_sa_path_rec *rec, void *attribute); + +/** + * ib_sa_unpack_path - Convert a path record from MAD format to struct + * ib_sa_path_rec. + */ +void ib_sa_unpack_path(void *attribute, struct ib_sa_path_rec *rec); + +/* Support GuidInfoRecord */ +int ib_sa_guid_info_rec_query(struct ib_sa_client *client, + struct ib_device *device, u8 port_num, + struct ib_sa_guidinfo_rec *rec, + ib_sa_comp_mask comp_mask, u8 method, + int timeout_ms, gfp_t gfp_mask, + void (*callback)(int status, + struct ib_sa_guidinfo_rec *resp, + void *context), + void *context, + struct ib_sa_query **sa_query); + +#endif /* IB_SA_H */ diff --git a/include/rdma/ib_smi.h b/include/rdma/ib_smi.h new file mode 100644 index 0000000..98b9086 --- /dev/null +++ b/include/rdma/ib_smi.h @@ -0,0 +1,128 @@ +/* + * Copyright (c) 2004 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2004 Infinicon Corporation. All rights reserved. + * Copyright (c) 2004 Intel Corporation. All rights reserved. + * Copyright (c) 2004 Topspin Corporation. All rights reserved. + * Copyright (c) 2004 Voltaire Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#if !defined(IB_SMI_H) +#define IB_SMI_H + +#include + +#define IB_SMP_DATA_SIZE 64 +#define IB_SMP_MAX_PATH_HOPS 64 + +struct ib_smp { + u8 base_version; + u8 mgmt_class; + u8 class_version; + u8 method; + __be16 status; + u8 hop_ptr; + u8 hop_cnt; + __be64 tid; + __be16 attr_id; + __be16 resv; + __be32 attr_mod; + __be64 mkey; + __be16 dr_slid; + __be16 dr_dlid; + u8 reserved[28]; + u8 data[IB_SMP_DATA_SIZE]; + u8 initial_path[IB_SMP_MAX_PATH_HOPS]; + u8 return_path[IB_SMP_MAX_PATH_HOPS]; +} __attribute__ ((packed)); + +#define IB_SMP_DIRECTION cpu_to_be16(0x8000) + +/* Subnet management attributes */ +#define IB_SMP_ATTR_NOTICE cpu_to_be16(0x0002) +#define IB_SMP_ATTR_NODE_DESC cpu_to_be16(0x0010) +#define IB_SMP_ATTR_NODE_INFO cpu_to_be16(0x0011) +#define IB_SMP_ATTR_SWITCH_INFO cpu_to_be16(0x0012) +#define IB_SMP_ATTR_GUID_INFO cpu_to_be16(0x0014) +#define IB_SMP_ATTR_PORT_INFO cpu_to_be16(0x0015) +#define IB_SMP_ATTR_PKEY_TABLE cpu_to_be16(0x0016) +#define IB_SMP_ATTR_SL_TO_VL_TABLE cpu_to_be16(0x0017) +#define IB_SMP_ATTR_VL_ARB_TABLE cpu_to_be16(0x0018) +#define IB_SMP_ATTR_LINEAR_FORWARD_TABLE cpu_to_be16(0x0019) +#define IB_SMP_ATTR_RANDOM_FORWARD_TABLE cpu_to_be16(0x001A) +#define IB_SMP_ATTR_MCAST_FORWARD_TABLE cpu_to_be16(0x001B) +#define IB_SMP_ATTR_SM_INFO cpu_to_be16(0x0020) +#define IB_SMP_ATTR_VENDOR_DIAG cpu_to_be16(0x0030) +#define IB_SMP_ATTR_LED_INFO cpu_to_be16(0x0031) +#define IB_SMP_ATTR_VENDOR_MASK cpu_to_be16(0xFF00) + +struct ib_port_info { + __be64 mkey; + __be64 gid_prefix; + __be16 lid; + __be16 sm_lid; + __be32 cap_mask; + __be16 diag_code; + __be16 mkey_lease_period; + u8 local_port_num; + u8 link_width_enabled; + u8 link_width_supported; + u8 link_width_active; + u8 linkspeed_portstate; /* 4 bits, 4 bits */ + u8 portphysstate_linkdown; /* 4 bits, 4 bits */ + u8 mkeyprot_resv_lmc; /* 2 bits, 3, 3 */ + u8 linkspeedactive_enabled; /* 4 bits, 4 bits */ + u8 neighbormtu_mastersmsl; /* 4 bits, 4 bits */ + u8 vlcap_inittype; /* 4 bits, 4 bits */ + u8 vl_high_limit; + u8 vl_arb_high_cap; + u8 vl_arb_low_cap; + u8 inittypereply_mtucap; /* 4 bits, 4 bits */ + u8 vlstallcnt_hoqlife; /* 3 bits, 5 bits */ + u8 operationalvl_pei_peo_fpi_fpo; /* 4 bits, 1, 1, 1, 1 */ + __be16 mkey_violations; + __be16 pkey_violations; + __be16 qkey_violations; + u8 guid_cap; + u8 clientrereg_resv_subnetto; /* 1 bit, 2 bits, 5 */ + u8 resv_resptimevalue; /* 3 bits, 5 bits */ + u8 localphyerrors_overrunerrors; /* 4 bits, 4 bits */ + __be16 max_credit_hint; + u8 resv; + u8 link_roundtrip_latency[3]; +}; + +static inline u8 +ib_get_smp_direction(struct ib_smp *smp) +{ + return ((smp->status & IB_SMP_DIRECTION) == IB_SMP_DIRECTION); +} + +#endif /* IB_SMI_H */ diff --git a/include/rdma/ib_umem.h b/include/rdma/ib_umem.h new file mode 100644 index 0000000..a2bf41e --- /dev/null +++ b/include/rdma/ib_umem.h @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2007 Cisco Systems. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef IB_UMEM_H +#define IB_UMEM_H + +#include +#include +#include + +struct ib_ucontext; + +struct ib_umem { + struct ib_ucontext *context; + size_t length; + int offset; + int page_size; + int writable; + int hugetlb; + struct work_struct work; + struct pid *pid; + struct mm_struct *mm; + unsigned long diff; + struct sg_table sg_head; + int nmap; + int npages; +}; + +#ifdef CONFIG_INFINIBAND_USER_MEM + +struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, + size_t size, int access, int dmasync); +void ib_umem_release(struct ib_umem *umem); +int ib_umem_page_count(struct ib_umem *umem); + +#else /* CONFIG_INFINIBAND_USER_MEM */ + +#include + +static inline struct ib_umem *ib_umem_get(struct ib_ucontext *context, + unsigned long addr, size_t size, + int access, int dmasync) { + return ERR_PTR(-EINVAL); +} +static inline void ib_umem_release(struct ib_umem *umem) { } +static inline int ib_umem_page_count(struct ib_umem *umem) { return 0; } + +#endif /* CONFIG_INFINIBAND_USER_MEM */ + +#endif /* IB_UMEM_H */ diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h new file mode 100644 index 0000000..470a011 --- /dev/null +++ b/include/rdma/ib_verbs.h @@ -0,0 +1,2626 @@ +/* + * Copyright (c) 2004 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2004 Infinicon Corporation. All rights reserved. + * Copyright (c) 2004 Intel Corporation. All rights reserved. + * Copyright (c) 2004 Topspin Corporation. All rights reserved. + * Copyright (c) 2004 Voltaire Corporation. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2005, 2006, 2007 Cisco Systems. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#if !defined(IB_VERBS_H) +#define IB_VERBS_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +extern struct workqueue_struct *ib_wq; + +union ib_gid { + u8 raw[16]; + struct { + __be64 subnet_prefix; + __be64 interface_id; + } global; +}; + +enum rdma_node_type { + /* IB values map to NodeInfo:NodeType. */ + RDMA_NODE_IB_CA = 1, + RDMA_NODE_IB_SWITCH, + RDMA_NODE_IB_ROUTER, + RDMA_NODE_RNIC, + RDMA_NODE_USNIC, + RDMA_NODE_USNIC_UDP, +}; + +enum rdma_transport_type { + RDMA_TRANSPORT_IB, + RDMA_TRANSPORT_IWARP, + RDMA_TRANSPORT_USNIC, + RDMA_TRANSPORT_USNIC_UDP +}; + +__attribute_const__ enum rdma_transport_type +rdma_node_get_transport(enum rdma_node_type node_type); + +enum rdma_link_layer { + IB_LINK_LAYER_UNSPECIFIED, + IB_LINK_LAYER_INFINIBAND, + IB_LINK_LAYER_ETHERNET, +}; + +enum ib_device_cap_flags { + IB_DEVICE_RESIZE_MAX_WR = 1, + IB_DEVICE_BAD_PKEY_CNTR = (1<<1), + IB_DEVICE_BAD_QKEY_CNTR = (1<<2), + IB_DEVICE_RAW_MULTI = (1<<3), + IB_DEVICE_AUTO_PATH_MIG = (1<<4), + IB_DEVICE_CHANGE_PHY_PORT = (1<<5), + IB_DEVICE_UD_AV_PORT_ENFORCE = (1<<6), + IB_DEVICE_CURR_QP_STATE_MOD = (1<<7), + IB_DEVICE_SHUTDOWN_PORT = (1<<8), + IB_DEVICE_INIT_TYPE = (1<<9), + IB_DEVICE_PORT_ACTIVE_EVENT = (1<<10), + IB_DEVICE_SYS_IMAGE_GUID = (1<<11), + IB_DEVICE_RC_RNR_NAK_GEN = (1<<12), + IB_DEVICE_SRQ_RESIZE = (1<<13), + IB_DEVICE_N_NOTIFY_CQ = (1<<14), + IB_DEVICE_LOCAL_DMA_LKEY = (1<<15), + IB_DEVICE_RESERVED = (1<<16), /* old SEND_W_INV */ + IB_DEVICE_MEM_WINDOW = (1<<17), + /* + * Devices should set IB_DEVICE_UD_IP_SUM if they support + * insertion of UDP and TCP checksum on outgoing UD IPoIB + * messages and can verify the validity of checksum for + * incoming messages. Setting this flag implies that the + * IPoIB driver may set NETIF_F_IP_CSUM for datagram mode. + */ + IB_DEVICE_UD_IP_CSUM = (1<<18), + IB_DEVICE_UD_TSO = (1<<19), + IB_DEVICE_XRC = (1<<20), + IB_DEVICE_MEM_MGT_EXTENSIONS = (1<<21), + IB_DEVICE_BLOCK_MULTICAST_LOOPBACK = (1<<22), + IB_DEVICE_MEM_WINDOW_TYPE_2A = (1<<23), + IB_DEVICE_MEM_WINDOW_TYPE_2B = (1<<24), + IB_DEVICE_MANAGED_FLOW_STEERING = (1<<29), + IB_DEVICE_SIGNATURE_HANDOVER = (1<<30) +}; + +enum ib_signature_prot_cap { + IB_PROT_T10DIF_TYPE_1 = 1, + IB_PROT_T10DIF_TYPE_2 = 1 << 1, + IB_PROT_T10DIF_TYPE_3 = 1 << 2, +}; + +enum ib_signature_guard_cap { + IB_GUARD_T10DIF_CRC = 1, + IB_GUARD_T10DIF_CSUM = 1 << 1, +}; + +enum ib_atomic_cap { + IB_ATOMIC_NONE, + IB_ATOMIC_HCA, + IB_ATOMIC_GLOB +}; + +struct ib_device_attr { + u64 fw_ver; + __be64 sys_image_guid; + u64 max_mr_size; + u64 page_size_cap; + u32 vendor_id; + u32 vendor_part_id; + u32 hw_ver; + int max_qp; + int max_qp_wr; + int device_cap_flags; + int max_sge; + int max_sge_rd; + int max_cq; + int max_cqe; + int max_mr; + int max_pd; + int max_qp_rd_atom; + int max_ee_rd_atom; + int max_res_rd_atom; + int max_qp_init_rd_atom; + int max_ee_init_rd_atom; + enum ib_atomic_cap atomic_cap; + enum ib_atomic_cap masked_atomic_cap; + int max_ee; + int max_rdd; + int max_mw; + int max_raw_ipv6_qp; + int max_raw_ethy_qp; + int max_mcast_grp; + int max_mcast_qp_attach; + int max_total_mcast_qp_attach; + int max_ah; + int max_fmr; + int max_map_per_fmr; + int max_srq; + int max_srq_wr; + int max_srq_sge; + unsigned int max_fast_reg_page_list_len; + u16 max_pkeys; + u8 local_ca_ack_delay; + int sig_prot_cap; + int sig_guard_cap; +}; + +enum ib_mtu { + IB_MTU_256 = 1, + IB_MTU_512 = 2, + IB_MTU_1024 = 3, + IB_MTU_2048 = 4, + IB_MTU_4096 = 5 +}; + +static inline int ib_mtu_enum_to_int(enum ib_mtu mtu) +{ + switch (mtu) { + case IB_MTU_256: return 256; + case IB_MTU_512: return 512; + case IB_MTU_1024: return 1024; + case IB_MTU_2048: return 2048; + case IB_MTU_4096: return 4096; + default: return -1; + } +} + +enum ib_port_state { + IB_PORT_NOP = 0, + IB_PORT_DOWN = 1, + IB_PORT_INIT = 2, + IB_PORT_ARMED = 3, + IB_PORT_ACTIVE = 4, + IB_PORT_ACTIVE_DEFER = 5 +}; + +enum ib_port_cap_flags { + IB_PORT_SM = 1 << 1, + IB_PORT_NOTICE_SUP = 1 << 2, + IB_PORT_TRAP_SUP = 1 << 3, + IB_PORT_OPT_IPD_SUP = 1 << 4, + IB_PORT_AUTO_MIGR_SUP = 1 << 5, + IB_PORT_SL_MAP_SUP = 1 << 6, + IB_PORT_MKEY_NVRAM = 1 << 7, + IB_PORT_PKEY_NVRAM = 1 << 8, + IB_PORT_LED_INFO_SUP = 1 << 9, + IB_PORT_SM_DISABLED = 1 << 10, + IB_PORT_SYS_IMAGE_GUID_SUP = 1 << 11, + IB_PORT_PKEY_SW_EXT_PORT_TRAP_SUP = 1 << 12, + IB_PORT_EXTENDED_SPEEDS_SUP = 1 << 14, + IB_PORT_CM_SUP = 1 << 16, + IB_PORT_SNMP_TUNNEL_SUP = 1 << 17, + IB_PORT_REINIT_SUP = 1 << 18, + IB_PORT_DEVICE_MGMT_SUP = 1 << 19, + IB_PORT_VENDOR_CLASS_SUP = 1 << 20, + IB_PORT_DR_NOTICE_SUP = 1 << 21, + IB_PORT_CAP_MASK_NOTICE_SUP = 1 << 22, + IB_PORT_BOOT_MGMT_SUP = 1 << 23, + IB_PORT_LINK_LATENCY_SUP = 1 << 24, + IB_PORT_CLIENT_REG_SUP = 1 << 25, + IB_PORT_IP_BASED_GIDS = 1 << 26 +}; + +enum ib_port_width { + IB_WIDTH_1X = 1, + IB_WIDTH_4X = 2, + IB_WIDTH_8X = 4, + IB_WIDTH_12X = 8 +}; + +static inline int ib_width_enum_to_int(enum ib_port_width width) +{ + switch (width) { + case IB_WIDTH_1X: return 1; + case IB_WIDTH_4X: return 4; + case IB_WIDTH_8X: return 8; + case IB_WIDTH_12X: return 12; + default: return -1; + } +} + +enum ib_port_speed { + IB_SPEED_SDR = 1, + IB_SPEED_DDR = 2, + IB_SPEED_QDR = 4, + IB_SPEED_FDR10 = 8, + IB_SPEED_FDR = 16, + IB_SPEED_EDR = 32 +}; + +struct ib_protocol_stats { + /* TBD... */ +}; + +struct iw_protocol_stats { + u64 ipInReceives; + u64 ipInHdrErrors; + u64 ipInTooBigErrors; + u64 ipInNoRoutes; + u64 ipInAddrErrors; + u64 ipInUnknownProtos; + u64 ipInTruncatedPkts; + u64 ipInDiscards; + u64 ipInDelivers; + u64 ipOutForwDatagrams; + u64 ipOutRequests; + u64 ipOutDiscards; + u64 ipOutNoRoutes; + u64 ipReasmTimeout; + u64 ipReasmReqds; + u64 ipReasmOKs; + u64 ipReasmFails; + u64 ipFragOKs; + u64 ipFragFails; + u64 ipFragCreates; + u64 ipInMcastPkts; + u64 ipOutMcastPkts; + u64 ipInBcastPkts; + u64 ipOutBcastPkts; + + u64 tcpRtoAlgorithm; + u64 tcpRtoMin; + u64 tcpRtoMax; + u64 tcpMaxConn; + u64 tcpActiveOpens; + u64 tcpPassiveOpens; + u64 tcpAttemptFails; + u64 tcpEstabResets; + u64 tcpCurrEstab; + u64 tcpInSegs; + u64 tcpOutSegs; + u64 tcpRetransSegs; + u64 tcpInErrs; + u64 tcpOutRsts; +}; + +union rdma_protocol_stats { + struct ib_protocol_stats ib; + struct iw_protocol_stats iw; +}; + +struct ib_port_attr { + enum ib_port_state state; + enum ib_mtu max_mtu; + enum ib_mtu active_mtu; + int gid_tbl_len; + u32 port_cap_flags; + u32 max_msg_sz; + u32 bad_pkey_cntr; + u32 qkey_viol_cntr; + u16 pkey_tbl_len; + u16 lid; + u16 sm_lid; + u8 lmc; + u8 max_vl_num; + u8 sm_sl; + u8 subnet_timeout; + u8 init_type_reply; + u8 active_width; + u8 active_speed; + u8 phys_state; +}; + +enum ib_device_modify_flags { + IB_DEVICE_MODIFY_SYS_IMAGE_GUID = 1 << 0, + IB_DEVICE_MODIFY_NODE_DESC = 1 << 1 +}; + +struct ib_device_modify { + u64 sys_image_guid; + char node_desc[64]; +}; + +enum ib_port_modify_flags { + IB_PORT_SHUTDOWN = 1, + IB_PORT_INIT_TYPE = (1<<2), + IB_PORT_RESET_QKEY_CNTR = (1<<3) +}; + +struct ib_port_modify { + u32 set_port_cap_mask; + u32 clr_port_cap_mask; + u8 init_type; +}; + +enum ib_event_type { + IB_EVENT_CQ_ERR, + IB_EVENT_QP_FATAL, + IB_EVENT_QP_REQ_ERR, + IB_EVENT_QP_ACCESS_ERR, + IB_EVENT_COMM_EST, + IB_EVENT_SQ_DRAINED, + IB_EVENT_PATH_MIG, + IB_EVENT_PATH_MIG_ERR, + IB_EVENT_DEVICE_FATAL, + IB_EVENT_PORT_ACTIVE, + IB_EVENT_PORT_ERR, + IB_EVENT_LID_CHANGE, + IB_EVENT_PKEY_CHANGE, + IB_EVENT_SM_CHANGE, + IB_EVENT_SRQ_ERR, + IB_EVENT_SRQ_LIMIT_REACHED, + IB_EVENT_QP_LAST_WQE_REACHED, + IB_EVENT_CLIENT_REREGISTER, + IB_EVENT_GID_CHANGE, +}; + +struct ib_event { + struct ib_device *device; + union { + struct ib_cq *cq; + struct ib_qp *qp; + struct ib_srq *srq; + u8 port_num; + } element; + enum ib_event_type event; +}; + +struct ib_event_handler { + struct ib_device *device; + void (*handler)(struct ib_event_handler *, struct ib_event *); + struct list_head list; +}; + +#define INIT_IB_EVENT_HANDLER(_ptr, _device, _handler) \ + do { \ + (_ptr)->device = _device; \ + (_ptr)->handler = _handler; \ + INIT_LIST_HEAD(&(_ptr)->list); \ + } while (0) + +struct ib_global_route { + union ib_gid dgid; + u32 flow_label; + u8 sgid_index; + u8 hop_limit; + u8 traffic_class; +}; + +struct ib_grh { + __be32 version_tclass_flow; + __be16 paylen; + u8 next_hdr; + u8 hop_limit; + union ib_gid sgid; + union ib_gid dgid; +}; + +enum { + IB_MULTICAST_QPN = 0xffffff +}; + +#define IB_LID_PERMISSIVE cpu_to_be16(0xFFFF) + +enum ib_ah_flags { + IB_AH_GRH = 1 +}; + +enum ib_rate { + IB_RATE_PORT_CURRENT = 0, + IB_RATE_2_5_GBPS = 2, + IB_RATE_5_GBPS = 5, + IB_RATE_10_GBPS = 3, + IB_RATE_20_GBPS = 6, + IB_RATE_30_GBPS = 4, + IB_RATE_40_GBPS = 7, + IB_RATE_60_GBPS = 8, + IB_RATE_80_GBPS = 9, + IB_RATE_120_GBPS = 10, + IB_RATE_14_GBPS = 11, + IB_RATE_56_GBPS = 12, + IB_RATE_112_GBPS = 13, + IB_RATE_168_GBPS = 14, + IB_RATE_25_GBPS = 15, + IB_RATE_100_GBPS = 16, + IB_RATE_200_GBPS = 17, + IB_RATE_300_GBPS = 18 +}; + +/** + * ib_rate_to_mult - Convert the IB rate enum to a multiple of the + * base rate of 2.5 Gbit/sec. For example, IB_RATE_5_GBPS will be + * converted to 2, since 5 Gbit/sec is 2 * 2.5 Gbit/sec. + * @rate: rate to convert. + */ +__attribute_const__ int ib_rate_to_mult(enum ib_rate rate); + +/** + * ib_rate_to_mbps - Convert the IB rate enum to Mbps. + * For example, IB_RATE_2_5_GBPS will be converted to 2500. + * @rate: rate to convert. + */ +__attribute_const__ int ib_rate_to_mbps(enum ib_rate rate); + +enum ib_mr_create_flags { + IB_MR_SIGNATURE_EN = 1, +}; + +/** + * ib_mr_init_attr - Memory region init attributes passed to routine + * ib_create_mr. + * @max_reg_descriptors: max number of registration descriptors that + * may be used with registration work requests. + * @flags: MR creation flags bit mask. + */ +struct ib_mr_init_attr { + int max_reg_descriptors; + u32 flags; +}; + +/** + * Signature types + * IB_SIG_TYPE_NONE: Unprotected. + * IB_SIG_TYPE_T10_DIF: Type T10-DIF + */ +enum ib_signature_type { + IB_SIG_TYPE_NONE, + IB_SIG_TYPE_T10_DIF, +}; + +/** + * Signature T10-DIF block-guard types + * IB_T10DIF_CRC: Corresponds to T10-PI mandated CRC checksum rules. + * IB_T10DIF_CSUM: Corresponds to IP checksum rules. + */ +enum ib_t10_dif_bg_type { + IB_T10DIF_CRC, + IB_T10DIF_CSUM +}; + +/** + * struct ib_t10_dif_domain - Parameters specific for T10-DIF + * domain. + * @bg_type: T10-DIF block guard type (CRC|CSUM) + * @pi_interval: protection information interval. + * @bg: seed of guard computation. + * @app_tag: application tag of guard block + * @ref_tag: initial guard block reference tag. + * @ref_remap: Indicate wethear the reftag increments each block + * @app_escape: Indicate to skip block check if apptag=0xffff + * @ref_escape: Indicate to skip block check if reftag=0xffffffff + * @apptag_check_mask: check bitmask of application tag. + */ +struct ib_t10_dif_domain { + enum ib_t10_dif_bg_type bg_type; + u16 pi_interval; + u16 bg; + u16 app_tag; + u32 ref_tag; + bool ref_remap; + bool app_escape; + bool ref_escape; + u16 apptag_check_mask; +}; + +/** + * struct ib_sig_domain - Parameters for signature domain + * @sig_type: specific signauture type + * @sig: union of all signature domain attributes that may + * be used to set domain layout. + */ +struct ib_sig_domain { + enum ib_signature_type sig_type; + union { + struct ib_t10_dif_domain dif; + } sig; +}; + +/** + * struct ib_sig_attrs - Parameters for signature handover operation + * @check_mask: bitmask for signature byte check (8 bytes) + * @mem: memory domain layout desciptor. + * @wire: wire domain layout desciptor. + */ +struct ib_sig_attrs { + u8 check_mask; + struct ib_sig_domain mem; + struct ib_sig_domain wire; +}; + +enum ib_sig_err_type { + IB_SIG_BAD_GUARD, + IB_SIG_BAD_REFTAG, + IB_SIG_BAD_APPTAG, +}; + +/** + * struct ib_sig_err - signature error descriptor + */ +struct ib_sig_err { + enum ib_sig_err_type err_type; + u32 expected; + u32 actual; + u64 sig_err_offset; + u32 key; +}; + +enum ib_mr_status_check { + IB_MR_CHECK_SIG_STATUS = 1, +}; + +/** + * struct ib_mr_status - Memory region status container + * + * @fail_status: Bitmask of MR checks status. For each + * failed check a corresponding status bit is set. + * @sig_err: Additional info for IB_MR_CEHCK_SIG_STATUS + * failure. + */ +struct ib_mr_status { + u32 fail_status; + struct ib_sig_err sig_err; +}; + +/** + * mult_to_ib_rate - Convert a multiple of 2.5 Gbit/sec to an IB rate + * enum. + * @mult: multiple to convert. + */ +__attribute_const__ enum ib_rate mult_to_ib_rate(int mult); + +struct ib_ah_attr { + struct ib_global_route grh; + u16 dlid; + u8 sl; + u8 src_path_bits; + u8 static_rate; + u8 ah_flags; + u8 port_num; + u8 dmac[ETH_ALEN]; + u16 vlan_id; +}; + +enum ib_wc_status { + IB_WC_SUCCESS, + IB_WC_LOC_LEN_ERR, + IB_WC_LOC_QP_OP_ERR, + IB_WC_LOC_EEC_OP_ERR, + IB_WC_LOC_PROT_ERR, + IB_WC_WR_FLUSH_ERR, + IB_WC_MW_BIND_ERR, + IB_WC_BAD_RESP_ERR, + IB_WC_LOC_ACCESS_ERR, + IB_WC_REM_INV_REQ_ERR, + IB_WC_REM_ACCESS_ERR, + IB_WC_REM_OP_ERR, + IB_WC_RETRY_EXC_ERR, + IB_WC_RNR_RETRY_EXC_ERR, + IB_WC_LOC_RDD_VIOL_ERR, + IB_WC_REM_INV_RD_REQ_ERR, + IB_WC_REM_ABORT_ERR, + IB_WC_INV_EECN_ERR, + IB_WC_INV_EEC_STATE_ERR, + IB_WC_FATAL_ERR, + IB_WC_RESP_TIMEOUT_ERR, + IB_WC_GENERAL_ERR +}; + +enum ib_wc_opcode { + IB_WC_SEND, + IB_WC_RDMA_WRITE, + IB_WC_RDMA_READ, + IB_WC_COMP_SWAP, + IB_WC_FETCH_ADD, + IB_WC_BIND_MW, + IB_WC_LSO, + IB_WC_LOCAL_INV, + IB_WC_FAST_REG_MR, + IB_WC_MASKED_COMP_SWAP, + IB_WC_MASKED_FETCH_ADD, +/* + * Set value of IB_WC_RECV so consumers can test if a completion is a + * receive by testing (opcode & IB_WC_RECV). + */ + IB_WC_RECV = 1 << 7, + IB_WC_RECV_RDMA_WITH_IMM +}; + +enum ib_wc_flags { + IB_WC_GRH = 1, + IB_WC_WITH_IMM = (1<<1), + IB_WC_WITH_INVALIDATE = (1<<2), + IB_WC_IP_CSUM_OK = (1<<3), + IB_WC_WITH_SMAC = (1<<4), + IB_WC_WITH_VLAN = (1<<5), +}; + +struct ib_wc { + u64 wr_id; + enum ib_wc_status status; + enum ib_wc_opcode opcode; + u32 vendor_err; + u32 byte_len; + struct ib_qp *qp; + union { + __be32 imm_data; + u32 invalidate_rkey; + } ex; + u32 src_qp; + int wc_flags; + u16 pkey_index; + u16 slid; + u8 sl; + u8 dlid_path_bits; + u8 port_num; /* valid only for DR SMPs on switches */ + u8 smac[ETH_ALEN]; + u16 vlan_id; +}; + +enum ib_cq_notify_flags { + IB_CQ_SOLICITED = 1 << 0, + IB_CQ_NEXT_COMP = 1 << 1, + IB_CQ_SOLICITED_MASK = IB_CQ_SOLICITED | IB_CQ_NEXT_COMP, + IB_CQ_REPORT_MISSED_EVENTS = 1 << 2, +}; + +enum ib_srq_type { + IB_SRQT_BASIC, + IB_SRQT_XRC +}; + +enum ib_srq_attr_mask { + IB_SRQ_MAX_WR = 1 << 0, + IB_SRQ_LIMIT = 1 << 1, +}; + +struct ib_srq_attr { + u32 max_wr; + u32 max_sge; + u32 srq_limit; +}; + +struct ib_srq_init_attr { + void (*event_handler)(struct ib_event *, void *); + void *srq_context; + struct ib_srq_attr attr; + enum ib_srq_type srq_type; + + union { + struct { + struct ib_xrcd *xrcd; + struct ib_cq *cq; + } xrc; + } ext; +}; + +struct ib_qp_cap { + u32 max_send_wr; + u32 max_recv_wr; + u32 max_send_sge; + u32 max_recv_sge; + u32 max_inline_data; +}; + +enum ib_sig_type { + IB_SIGNAL_ALL_WR, + IB_SIGNAL_REQ_WR +}; + +enum ib_qp_type { + /* + * IB_QPT_SMI and IB_QPT_GSI have to be the first two entries + * here (and in that order) since the MAD layer uses them as + * indices into a 2-entry table. + */ + IB_QPT_SMI, + IB_QPT_GSI, + + IB_QPT_RC, + IB_QPT_UC, + IB_QPT_UD, + IB_QPT_RAW_IPV6, + IB_QPT_RAW_ETHERTYPE, + IB_QPT_RAW_PACKET = 8, + IB_QPT_XRC_INI = 9, + IB_QPT_XRC_TGT, + IB_QPT_MAX, + /* Reserve a range for qp types internal to the low level driver. + * These qp types will not be visible at the IB core layer, so the + * IB_QPT_MAX usages should not be affected in the core layer + */ + IB_QPT_RESERVED1 = 0x1000, + IB_QPT_RESERVED2, + IB_QPT_RESERVED3, + IB_QPT_RESERVED4, + IB_QPT_RESERVED5, + IB_QPT_RESERVED6, + IB_QPT_RESERVED7, + IB_QPT_RESERVED8, + IB_QPT_RESERVED9, + IB_QPT_RESERVED10, +}; + +enum ib_qp_create_flags { + IB_QP_CREATE_IPOIB_UD_LSO = 1 << 0, + IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK = 1 << 1, + IB_QP_CREATE_NETIF_QP = 1 << 5, + IB_QP_CREATE_SIGNATURE_EN = 1 << 6, + IB_QP_CREATE_USE_GFP_NOIO = 1 << 7, + /* reserve bits 26-31 for low level drivers' internal use */ + IB_QP_CREATE_RESERVED_START = 1 << 26, + IB_QP_CREATE_RESERVED_END = 1 << 31, +}; + + +/* + * Note: users may not call ib_close_qp or ib_destroy_qp from the event_handler + * callback to destroy the passed in QP. + */ + +struct ib_qp_init_attr { + void (*event_handler)(struct ib_event *, void *); + void *qp_context; + struct ib_cq *send_cq; + struct ib_cq *recv_cq; + struct ib_srq *srq; + struct ib_xrcd *xrcd; /* XRC TGT QPs only */ + struct ib_qp_cap cap; + enum ib_sig_type sq_sig_type; + enum ib_qp_type qp_type; + enum ib_qp_create_flags create_flags; + u8 port_num; /* special QP types only */ +}; + +struct ib_qp_open_attr { + void (*event_handler)(struct ib_event *, void *); + void *qp_context; + u32 qp_num; + enum ib_qp_type qp_type; +}; + +enum ib_rnr_timeout { + IB_RNR_TIMER_655_36 = 0, + IB_RNR_TIMER_000_01 = 1, + IB_RNR_TIMER_000_02 = 2, + IB_RNR_TIMER_000_03 = 3, + IB_RNR_TIMER_000_04 = 4, + IB_RNR_TIMER_000_06 = 5, + IB_RNR_TIMER_000_08 = 6, + IB_RNR_TIMER_000_12 = 7, + IB_RNR_TIMER_000_16 = 8, + IB_RNR_TIMER_000_24 = 9, + IB_RNR_TIMER_000_32 = 10, + IB_RNR_TIMER_000_48 = 11, + IB_RNR_TIMER_000_64 = 12, + IB_RNR_TIMER_000_96 = 13, + IB_RNR_TIMER_001_28 = 14, + IB_RNR_TIMER_001_92 = 15, + IB_RNR_TIMER_002_56 = 16, + IB_RNR_TIMER_003_84 = 17, + IB_RNR_TIMER_005_12 = 18, + IB_RNR_TIMER_007_68 = 19, + IB_RNR_TIMER_010_24 = 20, + IB_RNR_TIMER_015_36 = 21, + IB_RNR_TIMER_020_48 = 22, + IB_RNR_TIMER_030_72 = 23, + IB_RNR_TIMER_040_96 = 24, + IB_RNR_TIMER_061_44 = 25, + IB_RNR_TIMER_081_92 = 26, + IB_RNR_TIMER_122_88 = 27, + IB_RNR_TIMER_163_84 = 28, + IB_RNR_TIMER_245_76 = 29, + IB_RNR_TIMER_327_68 = 30, + IB_RNR_TIMER_491_52 = 31 +}; + +enum ib_qp_attr_mask { + IB_QP_STATE = 1, + IB_QP_CUR_STATE = (1<<1), + IB_QP_EN_SQD_ASYNC_NOTIFY = (1<<2), + IB_QP_ACCESS_FLAGS = (1<<3), + IB_QP_PKEY_INDEX = (1<<4), + IB_QP_PORT = (1<<5), + IB_QP_QKEY = (1<<6), + IB_QP_AV = (1<<7), + IB_QP_PATH_MTU = (1<<8), + IB_QP_TIMEOUT = (1<<9), + IB_QP_RETRY_CNT = (1<<10), + IB_QP_RNR_RETRY = (1<<11), + IB_QP_RQ_PSN = (1<<12), + IB_QP_MAX_QP_RD_ATOMIC = (1<<13), + IB_QP_ALT_PATH = (1<<14), + IB_QP_MIN_RNR_TIMER = (1<<15), + IB_QP_SQ_PSN = (1<<16), + IB_QP_MAX_DEST_RD_ATOMIC = (1<<17), + IB_QP_PATH_MIG_STATE = (1<<18), + IB_QP_CAP = (1<<19), + IB_QP_DEST_QPN = (1<<20), + IB_QP_SMAC = (1<<21), + IB_QP_ALT_SMAC = (1<<22), + IB_QP_VID = (1<<23), + IB_QP_ALT_VID = (1<<24), +}; + +enum ib_qp_state { + IB_QPS_RESET, + IB_QPS_INIT, + IB_QPS_RTR, + IB_QPS_RTS, + IB_QPS_SQD, + IB_QPS_SQE, + IB_QPS_ERR +}; + +enum ib_mig_state { + IB_MIG_MIGRATED, + IB_MIG_REARM, + IB_MIG_ARMED +}; + +enum ib_mw_type { + IB_MW_TYPE_1 = 1, + IB_MW_TYPE_2 = 2 +}; + +struct ib_qp_attr { + enum ib_qp_state qp_state; + enum ib_qp_state cur_qp_state; + enum ib_mtu path_mtu; + enum ib_mig_state path_mig_state; + u32 qkey; + u32 rq_psn; + u32 sq_psn; + u32 dest_qp_num; + int qp_access_flags; + struct ib_qp_cap cap; + struct ib_ah_attr ah_attr; + struct ib_ah_attr alt_ah_attr; + u16 pkey_index; + u16 alt_pkey_index; + u8 en_sqd_async_notify; + u8 sq_draining; + u8 max_rd_atomic; + u8 max_dest_rd_atomic; + u8 min_rnr_timer; + u8 port_num; + u8 timeout; + u8 retry_cnt; + u8 rnr_retry; + u8 alt_port_num; + u8 alt_timeout; + u8 smac[ETH_ALEN]; + u8 alt_smac[ETH_ALEN]; + u16 vlan_id; + u16 alt_vlan_id; +}; + +enum ib_wr_opcode { + IB_WR_RDMA_WRITE, + IB_WR_RDMA_WRITE_WITH_IMM, + IB_WR_SEND, + IB_WR_SEND_WITH_IMM, + IB_WR_RDMA_READ, + IB_WR_ATOMIC_CMP_AND_SWP, + IB_WR_ATOMIC_FETCH_AND_ADD, + IB_WR_LSO, + IB_WR_SEND_WITH_INV, + IB_WR_RDMA_READ_WITH_INV, + IB_WR_LOCAL_INV, + IB_WR_FAST_REG_MR, + IB_WR_MASKED_ATOMIC_CMP_AND_SWP, + IB_WR_MASKED_ATOMIC_FETCH_AND_ADD, + IB_WR_BIND_MW, + IB_WR_REG_SIG_MR, + /* reserve values for low level drivers' internal use. + * These values will not be used at all in the ib core layer. + */ + IB_WR_RESERVED1 = 0xf0, + IB_WR_RESERVED2, + IB_WR_RESERVED3, + IB_WR_RESERVED4, + IB_WR_RESERVED5, + IB_WR_RESERVED6, + IB_WR_RESERVED7, + IB_WR_RESERVED8, + IB_WR_RESERVED9, + IB_WR_RESERVED10, +}; + +enum ib_send_flags { + IB_SEND_FENCE = 1, + IB_SEND_SIGNALED = (1<<1), + IB_SEND_SOLICITED = (1<<2), + IB_SEND_INLINE = (1<<3), + IB_SEND_IP_CSUM = (1<<4), + + /* reserve bits 26-31 for low level drivers' internal use */ + IB_SEND_RESERVED_START = (1 << 26), + IB_SEND_RESERVED_END = (1 << 31), +}; + +struct ib_sge { + u64 addr; + u32 length; + u32 lkey; +}; + +struct ib_fast_reg_page_list { + struct ib_device *device; + u64 *page_list; + unsigned int max_page_list_len; +}; + +/** + * struct ib_mw_bind_info - Parameters for a memory window bind operation. + * @mr: A memory region to bind the memory window to. + * @addr: The address where the memory window should begin. + * @length: The length of the memory window, in bytes. + * @mw_access_flags: Access flags from enum ib_access_flags for the window. + * + * This struct contains the shared parameters for type 1 and type 2 + * memory window bind operations. + */ +struct ib_mw_bind_info { + struct ib_mr *mr; + u64 addr; + u64 length; + int mw_access_flags; +}; + +struct ib_send_wr { + struct ib_send_wr *next; + u64 wr_id; + struct ib_sge *sg_list; + int num_sge; + enum ib_wr_opcode opcode; + int send_flags; + union { + __be32 imm_data; + u32 invalidate_rkey; + } ex; + union { + struct { + u64 remote_addr; + u32 rkey; + } rdma; + struct { + u64 remote_addr; + u64 compare_add; + u64 swap; + u64 compare_add_mask; + u64 swap_mask; + u32 rkey; + } atomic; + struct { + struct ib_ah *ah; + void *header; + int hlen; + int mss; + u32 remote_qpn; + u32 remote_qkey; + u16 pkey_index; /* valid for GSI only */ + u8 port_num; /* valid for DR SMPs on switch only */ + } ud; + struct { + u64 iova_start; + struct ib_fast_reg_page_list *page_list; + unsigned int page_shift; + unsigned int page_list_len; + u32 length; + int access_flags; + u32 rkey; + } fast_reg; + struct { + struct ib_mw *mw; + /* The new rkey for the memory window. */ + u32 rkey; + struct ib_mw_bind_info bind_info; + } bind_mw; + struct { + struct ib_sig_attrs *sig_attrs; + struct ib_mr *sig_mr; + int access_flags; + struct ib_sge *prot; + } sig_handover; + } wr; + u32 xrc_remote_srq_num; /* XRC TGT QPs only */ +}; + +struct ib_recv_wr { + struct ib_recv_wr *next; + u64 wr_id; + struct ib_sge *sg_list; + int num_sge; +}; + +enum ib_access_flags { + IB_ACCESS_LOCAL_WRITE = 1, + IB_ACCESS_REMOTE_WRITE = (1<<1), + IB_ACCESS_REMOTE_READ = (1<<2), + IB_ACCESS_REMOTE_ATOMIC = (1<<3), + IB_ACCESS_MW_BIND = (1<<4), + IB_ZERO_BASED = (1<<5) +}; + +struct ib_phys_buf { + u64 addr; + u64 size; +}; + +struct ib_mr_attr { + struct ib_pd *pd; + u64 device_virt_addr; + u64 size; + int mr_access_flags; + u32 lkey; + u32 rkey; +}; + +enum ib_mr_rereg_flags { + IB_MR_REREG_TRANS = 1, + IB_MR_REREG_PD = (1<<1), + IB_MR_REREG_ACCESS = (1<<2), + IB_MR_REREG_SUPPORTED = ((IB_MR_REREG_ACCESS << 1) - 1) +}; + +/** + * struct ib_mw_bind - Parameters for a type 1 memory window bind operation. + * @wr_id: Work request id. + * @send_flags: Flags from ib_send_flags enum. + * @bind_info: More parameters of the bind operation. + */ +struct ib_mw_bind { + u64 wr_id; + int send_flags; + struct ib_mw_bind_info bind_info; +}; + +struct ib_fmr_attr { + int max_pages; + int max_maps; + u8 page_shift; +}; + +struct ib_ucontext { + struct ib_device *device; + struct list_head pd_list; + struct list_head mr_list; + struct list_head mw_list; + struct list_head cq_list; + struct list_head qp_list; + struct list_head srq_list; + struct list_head ah_list; + struct list_head xrcd_list; + struct list_head rule_list; + int closing; +}; + +struct ib_uobject { + u64 user_handle; /* handle given to us by userspace */ + struct ib_ucontext *context; /* associated user context */ + void *object; /* containing object */ + struct list_head list; /* link to context's list */ + int id; /* index into kernel idr */ + struct kref ref; + struct rw_semaphore mutex; /* protects .live */ + int live; +}; + +struct ib_udata { + const void __user *inbuf; + void __user *outbuf; + size_t inlen; + size_t outlen; +}; + +struct ib_pd { + struct ib_device *device; + struct ib_uobject *uobject; + atomic_t usecnt; /* count all resources */ +}; + +struct ib_xrcd { + struct ib_device *device; + atomic_t usecnt; /* count all exposed resources */ + struct inode *inode; + + struct mutex tgt_qp_mutex; + struct list_head tgt_qp_list; +}; + +struct ib_ah { + struct ib_device *device; + struct ib_pd *pd; + struct ib_uobject *uobject; +}; + +typedef void (*ib_comp_handler)(struct ib_cq *cq, void *cq_context); + +struct ib_cq { + struct ib_device *device; + struct ib_uobject *uobject; + ib_comp_handler comp_handler; + void (*event_handler)(struct ib_event *, void *); + void *cq_context; + int cqe; + atomic_t usecnt; /* count number of work queues */ +}; + +struct ib_srq { + struct ib_device *device; + struct ib_pd *pd; + struct ib_uobject *uobject; + void (*event_handler)(struct ib_event *, void *); + void *srq_context; + enum ib_srq_type srq_type; + atomic_t usecnt; + + union { + struct { + struct ib_xrcd *xrcd; + struct ib_cq *cq; + u32 srq_num; + } xrc; + } ext; +}; + +struct ib_qp { + struct ib_device *device; + struct ib_pd *pd; + struct ib_cq *send_cq; + struct ib_cq *recv_cq; + struct ib_srq *srq; + struct ib_xrcd *xrcd; /* XRC TGT QPs only */ + struct list_head xrcd_list; + /* count times opened, mcast attaches, flow attaches */ + atomic_t usecnt; + struct list_head open_list; + struct ib_qp *real_qp; + struct ib_uobject *uobject; + void (*event_handler)(struct ib_event *, void *); + void *qp_context; + u32 qp_num; + enum ib_qp_type qp_type; +}; + +struct ib_mr { + struct ib_device *device; + struct ib_pd *pd; + struct ib_uobject *uobject; + u32 lkey; + u32 rkey; + atomic_t usecnt; /* count number of MWs */ +}; + +struct ib_mw { + struct ib_device *device; + struct ib_pd *pd; + struct ib_uobject *uobject; + u32 rkey; + enum ib_mw_type type; +}; + +struct ib_fmr { + struct ib_device *device; + struct ib_pd *pd; + struct list_head list; + u32 lkey; + u32 rkey; +}; + +/* Supported steering options */ +enum ib_flow_attr_type { + /* steering according to rule specifications */ + IB_FLOW_ATTR_NORMAL = 0x0, + /* default unicast and multicast rule - + * receive all Eth traffic which isn't steered to any QP + */ + IB_FLOW_ATTR_ALL_DEFAULT = 0x1, + /* default multicast rule - + * receive all Eth multicast traffic which isn't steered to any QP + */ + IB_FLOW_ATTR_MC_DEFAULT = 0x2, + /* sniffer rule - receive all port traffic */ + IB_FLOW_ATTR_SNIFFER = 0x3 +}; + +/* Supported steering header types */ +enum ib_flow_spec_type { + /* L2 headers*/ + IB_FLOW_SPEC_ETH = 0x20, + IB_FLOW_SPEC_IB = 0x22, + /* L3 header*/ + IB_FLOW_SPEC_IPV4 = 0x30, + /* L4 headers*/ + IB_FLOW_SPEC_TCP = 0x40, + IB_FLOW_SPEC_UDP = 0x41 +}; +#define IB_FLOW_SPEC_LAYER_MASK 0xF0 +#define IB_FLOW_SPEC_SUPPORT_LAYERS 4 + +/* Flow steering rule priority is set according to it's domain. + * Lower domain value means higher priority. + */ +enum ib_flow_domain { + IB_FLOW_DOMAIN_USER, + IB_FLOW_DOMAIN_ETHTOOL, + IB_FLOW_DOMAIN_RFS, + IB_FLOW_DOMAIN_NIC, + IB_FLOW_DOMAIN_NUM /* Must be last */ +}; + +struct ib_flow_eth_filter { + u8 dst_mac[6]; + u8 src_mac[6]; + __be16 ether_type; + __be16 vlan_tag; +}; + +struct ib_flow_spec_eth { + enum ib_flow_spec_type type; + u16 size; + struct ib_flow_eth_filter val; + struct ib_flow_eth_filter mask; +}; + +struct ib_flow_ib_filter { + __be16 dlid; + __u8 sl; +}; + +struct ib_flow_spec_ib { + enum ib_flow_spec_type type; + u16 size; + struct ib_flow_ib_filter val; + struct ib_flow_ib_filter mask; +}; + +struct ib_flow_ipv4_filter { + __be32 src_ip; + __be32 dst_ip; +}; + +struct ib_flow_spec_ipv4 { + enum ib_flow_spec_type type; + u16 size; + struct ib_flow_ipv4_filter val; + struct ib_flow_ipv4_filter mask; +}; + +struct ib_flow_tcp_udp_filter { + __be16 dst_port; + __be16 src_port; +}; + +struct ib_flow_spec_tcp_udp { + enum ib_flow_spec_type type; + u16 size; + struct ib_flow_tcp_udp_filter val; + struct ib_flow_tcp_udp_filter mask; +}; + +union ib_flow_spec { + struct { + enum ib_flow_spec_type type; + u16 size; + }; + struct ib_flow_spec_eth eth; + struct ib_flow_spec_ib ib; + struct ib_flow_spec_ipv4 ipv4; + struct ib_flow_spec_tcp_udp tcp_udp; +}; + +struct ib_flow_attr { + enum ib_flow_attr_type type; + u16 size; + u16 priority; + u32 flags; + u8 num_of_specs; + u8 port; + /* Following are the optional layers according to user request + * struct ib_flow_spec_xxx + * struct ib_flow_spec_yyy + */ +}; + +struct ib_flow { + struct ib_qp *qp; + struct ib_uobject *uobject; +}; + +struct ib_mad; +struct ib_grh; + +enum ib_process_mad_flags { + IB_MAD_IGNORE_MKEY = 1, + IB_MAD_IGNORE_BKEY = 2, + IB_MAD_IGNORE_ALL = IB_MAD_IGNORE_MKEY | IB_MAD_IGNORE_BKEY +}; + +enum ib_mad_result { + IB_MAD_RESULT_FAILURE = 0, /* (!SUCCESS is the important flag) */ + IB_MAD_RESULT_SUCCESS = 1 << 0, /* MAD was successfully processed */ + IB_MAD_RESULT_REPLY = 1 << 1, /* Reply packet needs to be sent */ + IB_MAD_RESULT_CONSUMED = 1 << 2 /* Packet consumed: stop processing */ +}; + +#define IB_DEVICE_NAME_MAX 64 + +struct ib_cache { + rwlock_t lock; + struct ib_event_handler event_handler; + struct ib_pkey_cache **pkey_cache; + struct ib_gid_cache **gid_cache; + u8 *lmc_cache; +}; + +struct ib_dma_mapping_ops { + int (*mapping_error)(struct ib_device *dev, + u64 dma_addr); + u64 (*map_single)(struct ib_device *dev, + void *ptr, size_t size, + enum dma_data_direction direction); + void (*unmap_single)(struct ib_device *dev, + u64 addr, size_t size, + enum dma_data_direction direction); + u64 (*map_page)(struct ib_device *dev, + struct page *page, unsigned long offset, + size_t size, + enum dma_data_direction direction); + void (*unmap_page)(struct ib_device *dev, + u64 addr, size_t size, + enum dma_data_direction direction); + int (*map_sg)(struct ib_device *dev, + struct scatterlist *sg, int nents, + enum dma_data_direction direction); + void (*unmap_sg)(struct ib_device *dev, + struct scatterlist *sg, int nents, + enum dma_data_direction direction); + void (*sync_single_for_cpu)(struct ib_device *dev, + u64 dma_handle, + size_t size, + enum dma_data_direction dir); + void (*sync_single_for_device)(struct ib_device *dev, + u64 dma_handle, + size_t size, + enum dma_data_direction dir); + void *(*alloc_coherent)(struct ib_device *dev, + size_t size, + u64 *dma_handle, + gfp_t flag); + void (*free_coherent)(struct ib_device *dev, + size_t size, void *cpu_addr, + u64 dma_handle); +}; + +struct iw_cm_verbs; + +struct ib_device { + struct device *dma_device; + + char name[IB_DEVICE_NAME_MAX]; + + struct list_head event_handler_list; + spinlock_t event_handler_lock; + + spinlock_t client_data_lock; + struct list_head core_list; + struct list_head client_data_list; + + struct ib_cache cache; + int *pkey_tbl_len; + int *gid_tbl_len; + + int num_comp_vectors; + + struct iw_cm_verbs *iwcm; + + int (*get_protocol_stats)(struct ib_device *device, + union rdma_protocol_stats *stats); + int (*query_device)(struct ib_device *device, + struct ib_device_attr *device_attr); + int (*query_port)(struct ib_device *device, + u8 port_num, + struct ib_port_attr *port_attr); + enum rdma_link_layer (*get_link_layer)(struct ib_device *device, + u8 port_num); + int (*query_gid)(struct ib_device *device, + u8 port_num, int index, + union ib_gid *gid); + int (*query_pkey)(struct ib_device *device, + u8 port_num, u16 index, u16 *pkey); + int (*modify_device)(struct ib_device *device, + int device_modify_mask, + struct ib_device_modify *device_modify); + int (*modify_port)(struct ib_device *device, + u8 port_num, int port_modify_mask, + struct ib_port_modify *port_modify); + struct ib_ucontext * (*alloc_ucontext)(struct ib_device *device, + struct ib_udata *udata); + int (*dealloc_ucontext)(struct ib_ucontext *context); + int (*mmap)(struct ib_ucontext *context, + struct vm_area_struct *vma); + struct ib_pd * (*alloc_pd)(struct ib_device *device, + struct ib_ucontext *context, + struct ib_udata *udata); + int (*dealloc_pd)(struct ib_pd *pd); + struct ib_ah * (*create_ah)(struct ib_pd *pd, + struct ib_ah_attr *ah_attr); + int (*modify_ah)(struct ib_ah *ah, + struct ib_ah_attr *ah_attr); + int (*query_ah)(struct ib_ah *ah, + struct ib_ah_attr *ah_attr); + int (*destroy_ah)(struct ib_ah *ah); + struct ib_srq * (*create_srq)(struct ib_pd *pd, + struct ib_srq_init_attr *srq_init_attr, + struct ib_udata *udata); + int (*modify_srq)(struct ib_srq *srq, + struct ib_srq_attr *srq_attr, + enum ib_srq_attr_mask srq_attr_mask, + struct ib_udata *udata); + int (*query_srq)(struct ib_srq *srq, + struct ib_srq_attr *srq_attr); + int (*destroy_srq)(struct ib_srq *srq); + int (*post_srq_recv)(struct ib_srq *srq, + struct ib_recv_wr *recv_wr, + struct ib_recv_wr **bad_recv_wr); + struct ib_qp * (*create_qp)(struct ib_pd *pd, + struct ib_qp_init_attr *qp_init_attr, + struct ib_udata *udata); + int (*modify_qp)(struct ib_qp *qp, + struct ib_qp_attr *qp_attr, + int qp_attr_mask, + struct ib_udata *udata); + int (*query_qp)(struct ib_qp *qp, + struct ib_qp_attr *qp_attr, + int qp_attr_mask, + struct ib_qp_init_attr *qp_init_attr); + int (*destroy_qp)(struct ib_qp *qp); + int (*post_send)(struct ib_qp *qp, + struct ib_send_wr *send_wr, + struct ib_send_wr **bad_send_wr); + int (*post_recv)(struct ib_qp *qp, + struct ib_recv_wr *recv_wr, + struct ib_recv_wr **bad_recv_wr); + struct ib_cq * (*create_cq)(struct ib_device *device, int cqe, + int comp_vector, + struct ib_ucontext *context, + struct ib_udata *udata); + int (*modify_cq)(struct ib_cq *cq, u16 cq_count, + u16 cq_period); + int (*destroy_cq)(struct ib_cq *cq); + int (*resize_cq)(struct ib_cq *cq, int cqe, + struct ib_udata *udata); + int (*poll_cq)(struct ib_cq *cq, int num_entries, + struct ib_wc *wc); + int (*peek_cq)(struct ib_cq *cq, int wc_cnt); + int (*req_notify_cq)(struct ib_cq *cq, + enum ib_cq_notify_flags flags); + int (*req_ncomp_notif)(struct ib_cq *cq, + int wc_cnt); + struct ib_mr * (*get_dma_mr)(struct ib_pd *pd, + int mr_access_flags); + struct ib_mr * (*reg_phys_mr)(struct ib_pd *pd, + struct ib_phys_buf *phys_buf_array, + int num_phys_buf, + int mr_access_flags, + u64 *iova_start); + struct ib_mr * (*reg_user_mr)(struct ib_pd *pd, + u64 start, u64 length, + u64 virt_addr, + int mr_access_flags, + struct ib_udata *udata); + int (*rereg_user_mr)(struct ib_mr *mr, + int flags, + u64 start, u64 length, + u64 virt_addr, + int mr_access_flags, + struct ib_pd *pd, + struct ib_udata *udata); + int (*query_mr)(struct ib_mr *mr, + struct ib_mr_attr *mr_attr); + int (*dereg_mr)(struct ib_mr *mr); + int (*destroy_mr)(struct ib_mr *mr); + struct ib_mr * (*create_mr)(struct ib_pd *pd, + struct ib_mr_init_attr *mr_init_attr); + struct ib_mr * (*alloc_fast_reg_mr)(struct ib_pd *pd, + int max_page_list_len); + struct ib_fast_reg_page_list * (*alloc_fast_reg_page_list)(struct ib_device *device, + int page_list_len); + void (*free_fast_reg_page_list)(struct ib_fast_reg_page_list *page_list); + int (*rereg_phys_mr)(struct ib_mr *mr, + int mr_rereg_mask, + struct ib_pd *pd, + struct ib_phys_buf *phys_buf_array, + int num_phys_buf, + int mr_access_flags, + u64 *iova_start); + struct ib_mw * (*alloc_mw)(struct ib_pd *pd, + enum ib_mw_type type); + int (*bind_mw)(struct ib_qp *qp, + struct ib_mw *mw, + struct ib_mw_bind *mw_bind); + int (*dealloc_mw)(struct ib_mw *mw); + struct ib_fmr * (*alloc_fmr)(struct ib_pd *pd, + int mr_access_flags, + struct ib_fmr_attr *fmr_attr); + int (*map_phys_fmr)(struct ib_fmr *fmr, + u64 *page_list, int list_len, + u64 iova); + int (*unmap_fmr)(struct list_head *fmr_list); + int (*dealloc_fmr)(struct ib_fmr *fmr); + int (*attach_mcast)(struct ib_qp *qp, + union ib_gid *gid, + u16 lid); + int (*detach_mcast)(struct ib_qp *qp, + union ib_gid *gid, + u16 lid); + int (*process_mad)(struct ib_device *device, + int process_mad_flags, + u8 port_num, + struct ib_wc *in_wc, + struct ib_grh *in_grh, + struct ib_mad *in_mad, + struct ib_mad *out_mad); + struct ib_xrcd * (*alloc_xrcd)(struct ib_device *device, + struct ib_ucontext *ucontext, + struct ib_udata *udata); + int (*dealloc_xrcd)(struct ib_xrcd *xrcd); + struct ib_flow * (*create_flow)(struct ib_qp *qp, + struct ib_flow_attr + *flow_attr, + int domain); + int (*destroy_flow)(struct ib_flow *flow_id); + int (*check_mr_status)(struct ib_mr *mr, u32 check_mask, + struct ib_mr_status *mr_status); + + struct ib_dma_mapping_ops *dma_ops; + + struct module *owner; + struct device dev; + struct kobject *ports_parent; + struct list_head port_list; + + enum { + IB_DEV_UNINITIALIZED, + IB_DEV_REGISTERED, + IB_DEV_UNREGISTERED + } reg_state; + + int uverbs_abi_ver; + u64 uverbs_cmd_mask; + u64 uverbs_ex_cmd_mask; + + char node_desc[64]; + __be64 node_guid; + u32 local_dma_lkey; + u8 node_type; + u8 phys_port_cnt; +}; + +struct ib_client { + char *name; + void (*add) (struct ib_device *); + void (*remove)(struct ib_device *); + + struct list_head list; +}; + +struct ib_device *ib_alloc_device(size_t size); +void ib_dealloc_device(struct ib_device *device); + +int ib_register_device(struct ib_device *device, + int (*port_callback)(struct ib_device *, + u8, struct kobject *)); +void ib_unregister_device(struct ib_device *device); + +int ib_register_client (struct ib_client *client); +void ib_unregister_client(struct ib_client *client); + +void *ib_get_client_data(struct ib_device *device, struct ib_client *client); +void ib_set_client_data(struct ib_device *device, struct ib_client *client, + void *data); + +static inline int ib_copy_from_udata(void *dest, struct ib_udata *udata, size_t len) +{ + return copy_from_user(dest, udata->inbuf, len) ? -EFAULT : 0; +} + +static inline int ib_copy_to_udata(struct ib_udata *udata, void *src, size_t len) +{ + return copy_to_user(udata->outbuf, src, len) ? -EFAULT : 0; +} + +/** + * ib_modify_qp_is_ok - Check that the supplied attribute mask + * contains all required attributes and no attributes not allowed for + * the given QP state transition. + * @cur_state: Current QP state + * @next_state: Next QP state + * @type: QP type + * @mask: Mask of supplied QP attributes + * @ll : link layer of port + * + * This function is a helper function that a low-level driver's + * modify_qp method can use to validate the consumer's input. It + * checks that cur_state and next_state are valid QP states, that a + * transition from cur_state to next_state is allowed by the IB spec, + * and that the attribute mask supplied is allowed for the transition. + */ +int ib_modify_qp_is_ok(enum ib_qp_state cur_state, enum ib_qp_state next_state, + enum ib_qp_type type, enum ib_qp_attr_mask mask, + enum rdma_link_layer ll); + +int ib_register_event_handler (struct ib_event_handler *event_handler); +int ib_unregister_event_handler(struct ib_event_handler *event_handler); +void ib_dispatch_event(struct ib_event *event); + +int ib_query_device(struct ib_device *device, + struct ib_device_attr *device_attr); + +int ib_query_port(struct ib_device *device, + u8 port_num, struct ib_port_attr *port_attr); + +enum rdma_link_layer rdma_port_get_link_layer(struct ib_device *device, + u8 port_num); + +int ib_query_gid(struct ib_device *device, + u8 port_num, int index, union ib_gid *gid); + +int ib_query_pkey(struct ib_device *device, + u8 port_num, u16 index, u16 *pkey); + +int ib_modify_device(struct ib_device *device, + int device_modify_mask, + struct ib_device_modify *device_modify); + +int ib_modify_port(struct ib_device *device, + u8 port_num, int port_modify_mask, + struct ib_port_modify *port_modify); + +int ib_find_gid(struct ib_device *device, union ib_gid *gid, + u8 *port_num, u16 *index); + +int ib_find_pkey(struct ib_device *device, + u8 port_num, u16 pkey, u16 *index); + +/** + * ib_alloc_pd - Allocates an unused protection domain. + * @device: The device on which to allocate the protection domain. + * + * A protection domain object provides an association between QPs, shared + * receive queues, address handles, memory regions, and memory windows. + */ +struct ib_pd *ib_alloc_pd(struct ib_device *device); + +/** + * ib_dealloc_pd - Deallocates a protection domain. + * @pd: The protection domain to deallocate. + */ +int ib_dealloc_pd(struct ib_pd *pd); + +/** + * ib_create_ah - Creates an address handle for the given address vector. + * @pd: The protection domain associated with the address handle. + * @ah_attr: The attributes of the address vector. + * + * The address handle is used to reference a local or global destination + * in all UD QP post sends. + */ +struct ib_ah *ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr); + +/** + * ib_init_ah_from_wc - Initializes address handle attributes from a + * work completion. + * @device: Device on which the received message arrived. + * @port_num: Port on which the received message arrived. + * @wc: Work completion associated with the received message. + * @grh: References the received global route header. This parameter is + * ignored unless the work completion indicates that the GRH is valid. + * @ah_attr: Returned attributes that can be used when creating an address + * handle for replying to the message. + */ +int ib_init_ah_from_wc(struct ib_device *device, u8 port_num, struct ib_wc *wc, + struct ib_grh *grh, struct ib_ah_attr *ah_attr); + +/** + * ib_create_ah_from_wc - Creates an address handle associated with the + * sender of the specified work completion. + * @pd: The protection domain associated with the address handle. + * @wc: Work completion information associated with a received message. + * @grh: References the received global route header. This parameter is + * ignored unless the work completion indicates that the GRH is valid. + * @port_num: The outbound port number to associate with the address. + * + * The address handle is used to reference a local or global destination + * in all UD QP post sends. + */ +struct ib_ah *ib_create_ah_from_wc(struct ib_pd *pd, struct ib_wc *wc, + struct ib_grh *grh, u8 port_num); + +/** + * ib_modify_ah - Modifies the address vector associated with an address + * handle. + * @ah: The address handle to modify. + * @ah_attr: The new address vector attributes to associate with the + * address handle. + */ +int ib_modify_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr); + +/** + * ib_query_ah - Queries the address vector associated with an address + * handle. + * @ah: The address handle to query. + * @ah_attr: The address vector attributes associated with the address + * handle. + */ +int ib_query_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr); + +/** + * ib_destroy_ah - Destroys an address handle. + * @ah: The address handle to destroy. + */ +int ib_destroy_ah(struct ib_ah *ah); + +/** + * ib_create_srq - Creates a SRQ associated with the specified protection + * domain. + * @pd: The protection domain associated with the SRQ. + * @srq_init_attr: A list of initial attributes required to create the + * SRQ. If SRQ creation succeeds, then the attributes are updated to + * the actual capabilities of the created SRQ. + * + * srq_attr->max_wr and srq_attr->max_sge are read the determine the + * requested size of the SRQ, and set to the actual values allocated + * on return. If ib_create_srq() succeeds, then max_wr and max_sge + * will always be at least as large as the requested values. + */ +struct ib_srq *ib_create_srq(struct ib_pd *pd, + struct ib_srq_init_attr *srq_init_attr); + +/** + * ib_modify_srq - Modifies the attributes for the specified SRQ. + * @srq: The SRQ to modify. + * @srq_attr: On input, specifies the SRQ attributes to modify. On output, + * the current values of selected SRQ attributes are returned. + * @srq_attr_mask: A bit-mask used to specify which attributes of the SRQ + * are being modified. + * + * The mask may contain IB_SRQ_MAX_WR to resize the SRQ and/or + * IB_SRQ_LIMIT to set the SRQ's limit and request notification when + * the number of receives queued drops below the limit. + */ +int ib_modify_srq(struct ib_srq *srq, + struct ib_srq_attr *srq_attr, + enum ib_srq_attr_mask srq_attr_mask); + +/** + * ib_query_srq - Returns the attribute list and current values for the + * specified SRQ. + * @srq: The SRQ to query. + * @srq_attr: The attributes of the specified SRQ. + */ +int ib_query_srq(struct ib_srq *srq, + struct ib_srq_attr *srq_attr); + +/** + * ib_destroy_srq - Destroys the specified SRQ. + * @srq: The SRQ to destroy. + */ +int ib_destroy_srq(struct ib_srq *srq); + +/** + * ib_post_srq_recv - Posts a list of work requests to the specified SRQ. + * @srq: The SRQ to post the work request on. + * @recv_wr: A list of work requests to post on the receive queue. + * @bad_recv_wr: On an immediate failure, this parameter will reference + * the work request that failed to be posted on the QP. + */ +static inline int ib_post_srq_recv(struct ib_srq *srq, + struct ib_recv_wr *recv_wr, + struct ib_recv_wr **bad_recv_wr) +{ + return srq->device->post_srq_recv(srq, recv_wr, bad_recv_wr); +} + +/** + * ib_create_qp - Creates a QP associated with the specified protection + * domain. + * @pd: The protection domain associated with the QP. + * @qp_init_attr: A list of initial attributes required to create the + * QP. If QP creation succeeds, then the attributes are updated to + * the actual capabilities of the created QP. + */ +struct ib_qp *ib_create_qp(struct ib_pd *pd, + struct ib_qp_init_attr *qp_init_attr); + +/** + * ib_modify_qp - Modifies the attributes for the specified QP and then + * transitions the QP to the given state. + * @qp: The QP to modify. + * @qp_attr: On input, specifies the QP attributes to modify. On output, + * the current values of selected QP attributes are returned. + * @qp_attr_mask: A bit-mask used to specify which attributes of the QP + * are being modified. + */ +int ib_modify_qp(struct ib_qp *qp, + struct ib_qp_attr *qp_attr, + int qp_attr_mask); + +/** + * ib_query_qp - Returns the attribute list and current values for the + * specified QP. + * @qp: The QP to query. + * @qp_attr: The attributes of the specified QP. + * @qp_attr_mask: A bit-mask used to select specific attributes to query. + * @qp_init_attr: Additional attributes of the selected QP. + * + * The qp_attr_mask may be used to limit the query to gathering only the + * selected attributes. + */ +int ib_query_qp(struct ib_qp *qp, + struct ib_qp_attr *qp_attr, + int qp_attr_mask, + struct ib_qp_init_attr *qp_init_attr); + +/** + * ib_destroy_qp - Destroys the specified QP. + * @qp: The QP to destroy. + */ +int ib_destroy_qp(struct ib_qp *qp); + +/** + * ib_open_qp - Obtain a reference to an existing sharable QP. + * @xrcd - XRC domain + * @qp_open_attr: Attributes identifying the QP to open. + * + * Returns a reference to a sharable QP. + */ +struct ib_qp *ib_open_qp(struct ib_xrcd *xrcd, + struct ib_qp_open_attr *qp_open_attr); + +/** + * ib_close_qp - Release an external reference to a QP. + * @qp: The QP handle to release + * + * The opened QP handle is released by the caller. The underlying + * shared QP is not destroyed until all internal references are released. + */ +int ib_close_qp(struct ib_qp *qp); + +/** + * ib_post_send - Posts a list of work requests to the send queue of + * the specified QP. + * @qp: The QP to post the work request on. + * @send_wr: A list of work requests to post on the send queue. + * @bad_send_wr: On an immediate failure, this parameter will reference + * the work request that failed to be posted on the QP. + * + * While IBA Vol. 1 section 11.4.1.1 specifies that if an immediate + * error is returned, the QP state shall not be affected, + * ib_post_send() will return an immediate error after queueing any + * earlier work requests in the list. + */ +static inline int ib_post_send(struct ib_qp *qp, + struct ib_send_wr *send_wr, + struct ib_send_wr **bad_send_wr) +{ + return qp->device->post_send(qp, send_wr, bad_send_wr); +} + +/** + * ib_post_recv - Posts a list of work requests to the receive queue of + * the specified QP. + * @qp: The QP to post the work request on. + * @recv_wr: A list of work requests to post on the receive queue. + * @bad_recv_wr: On an immediate failure, this parameter will reference + * the work request that failed to be posted on the QP. + */ +static inline int ib_post_recv(struct ib_qp *qp, + struct ib_recv_wr *recv_wr, + struct ib_recv_wr **bad_recv_wr) +{ + return qp->device->post_recv(qp, recv_wr, bad_recv_wr); +} + +/** + * ib_create_cq - Creates a CQ on the specified device. + * @device: The device on which to create the CQ. + * @comp_handler: A user-specified callback that is invoked when a + * completion event occurs on the CQ. + * @event_handler: A user-specified callback that is invoked when an + * asynchronous event not associated with a completion occurs on the CQ. + * @cq_context: Context associated with the CQ returned to the user via + * the associated completion and event handlers. + * @cqe: The minimum size of the CQ. + * @comp_vector - Completion vector used to signal completion events. + * Must be >= 0 and < context->num_comp_vectors. + * + * Users can examine the cq structure to determine the actual CQ size. + */ +struct ib_cq *ib_create_cq(struct ib_device *device, + ib_comp_handler comp_handler, + void (*event_handler)(struct ib_event *, void *), + void *cq_context, int cqe, int comp_vector); + +/** + * ib_resize_cq - Modifies the capacity of the CQ. + * @cq: The CQ to resize. + * @cqe: The minimum size of the CQ. + * + * Users can examine the cq structure to determine the actual CQ size. + */ +int ib_resize_cq(struct ib_cq *cq, int cqe); + +/** + * ib_modify_cq - Modifies moderation params of the CQ + * @cq: The CQ to modify. + * @cq_count: number of CQEs that will trigger an event + * @cq_period: max period of time in usec before triggering an event + * + */ +int ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period); + +/** + * ib_destroy_cq - Destroys the specified CQ. + * @cq: The CQ to destroy. + */ +int ib_destroy_cq(struct ib_cq *cq); + +/** + * ib_poll_cq - poll a CQ for completion(s) + * @cq:the CQ being polled + * @num_entries:maximum number of completions to return + * @wc:array of at least @num_entries &struct ib_wc where completions + * will be returned + * + * Poll a CQ for (possibly multiple) completions. If the return value + * is < 0, an error occurred. If the return value is >= 0, it is the + * number of completions returned. If the return value is + * non-negative and < num_entries, then the CQ was emptied. + */ +static inline int ib_poll_cq(struct ib_cq *cq, int num_entries, + struct ib_wc *wc) +{ + return cq->device->poll_cq(cq, num_entries, wc); +} + +/** + * ib_peek_cq - Returns the number of unreaped completions currently + * on the specified CQ. + * @cq: The CQ to peek. + * @wc_cnt: A minimum number of unreaped completions to check for. + * + * If the number of unreaped completions is greater than or equal to wc_cnt, + * this function returns wc_cnt, otherwise, it returns the actual number of + * unreaped completions. + */ +int ib_peek_cq(struct ib_cq *cq, int wc_cnt); + +/** + * ib_req_notify_cq - Request completion notification on a CQ. + * @cq: The CQ to generate an event for. + * @flags: + * Must contain exactly one of %IB_CQ_SOLICITED or %IB_CQ_NEXT_COMP + * to request an event on the next solicited event or next work + * completion at any type, respectively. %IB_CQ_REPORT_MISSED_EVENTS + * may also be |ed in to request a hint about missed events, as + * described below. + * + * Return Value: + * < 0 means an error occurred while requesting notification + * == 0 means notification was requested successfully, and if + * IB_CQ_REPORT_MISSED_EVENTS was passed in, then no events + * were missed and it is safe to wait for another event. In + * this case is it guaranteed that any work completions added + * to the CQ since the last CQ poll will trigger a completion + * notification event. + * > 0 is only returned if IB_CQ_REPORT_MISSED_EVENTS was passed + * in. It means that the consumer must poll the CQ again to + * make sure it is empty to avoid missing an event because of a + * race between requesting notification and an entry being + * added to the CQ. This return value means it is possible + * (but not guaranteed) that a work completion has been added + * to the CQ since the last poll without triggering a + * completion notification event. + */ +static inline int ib_req_notify_cq(struct ib_cq *cq, + enum ib_cq_notify_flags flags) +{ + return cq->device->req_notify_cq(cq, flags); +} + +/** + * ib_req_ncomp_notif - Request completion notification when there are + * at least the specified number of unreaped completions on the CQ. + * @cq: The CQ to generate an event for. + * @wc_cnt: The number of unreaped completions that should be on the + * CQ before an event is generated. + */ +static inline int ib_req_ncomp_notif(struct ib_cq *cq, int wc_cnt) +{ + return cq->device->req_ncomp_notif ? + cq->device->req_ncomp_notif(cq, wc_cnt) : + -ENOSYS; +} + +/** + * ib_get_dma_mr - Returns a memory region for system memory that is + * usable for DMA. + * @pd: The protection domain associated with the memory region. + * @mr_access_flags: Specifies the memory access rights. + * + * Note that the ib_dma_*() functions defined below must be used + * to create/destroy addresses used with the Lkey or Rkey returned + * by ib_get_dma_mr(). + */ +struct ib_mr *ib_get_dma_mr(struct ib_pd *pd, int mr_access_flags); + +/** + * ib_dma_mapping_error - check a DMA addr for error + * @dev: The device for which the dma_addr was created + * @dma_addr: The DMA address to check + */ +static inline int ib_dma_mapping_error(struct ib_device *dev, u64 dma_addr) +{ + if (dev->dma_ops) + return dev->dma_ops->mapping_error(dev, dma_addr); + return dma_mapping_error(dev->dma_device, dma_addr); +} + +/** + * ib_dma_map_single - Map a kernel virtual address to DMA address + * @dev: The device for which the dma_addr is to be created + * @cpu_addr: The kernel virtual address + * @size: The size of the region in bytes + * @direction: The direction of the DMA + */ +static inline u64 ib_dma_map_single(struct ib_device *dev, + void *cpu_addr, size_t size, + enum dma_data_direction direction) +{ + if (dev->dma_ops) + return dev->dma_ops->map_single(dev, cpu_addr, size, direction); + return dma_map_single(dev->dma_device, cpu_addr, size, direction); +} + +/** + * ib_dma_unmap_single - Destroy a mapping created by ib_dma_map_single() + * @dev: The device for which the DMA address was created + * @addr: The DMA address + * @size: The size of the region in bytes + * @direction: The direction of the DMA + */ +static inline void ib_dma_unmap_single(struct ib_device *dev, + u64 addr, size_t size, + enum dma_data_direction direction) +{ + if (dev->dma_ops) + dev->dma_ops->unmap_single(dev, addr, size, direction); + else + dma_unmap_single(dev->dma_device, addr, size, direction); +} + +static inline u64 ib_dma_map_single_attrs(struct ib_device *dev, + void *cpu_addr, size_t size, + enum dma_data_direction direction, + struct dma_attrs *attrs) +{ + return dma_map_single_attrs(dev->dma_device, cpu_addr, size, + direction, attrs); +} + +static inline void ib_dma_unmap_single_attrs(struct ib_device *dev, + u64 addr, size_t size, + enum dma_data_direction direction, + struct dma_attrs *attrs) +{ + return dma_unmap_single_attrs(dev->dma_device, addr, size, + direction, attrs); +} + +/** + * ib_dma_map_page - Map a physical page to DMA address + * @dev: The device for which the dma_addr is to be created + * @page: The page to be mapped + * @offset: The offset within the page + * @size: The size of the region in bytes + * @direction: The direction of the DMA + */ +static inline u64 ib_dma_map_page(struct ib_device *dev, + struct page *page, + unsigned long offset, + size_t size, + enum dma_data_direction direction) +{ + if (dev->dma_ops) + return dev->dma_ops->map_page(dev, page, offset, size, direction); + return dma_map_page(dev->dma_device, page, offset, size, direction); +} + +/** + * ib_dma_unmap_page - Destroy a mapping created by ib_dma_map_page() + * @dev: The device for which the DMA address was created + * @addr: The DMA address + * @size: The size of the region in bytes + * @direction: The direction of the DMA + */ +static inline void ib_dma_unmap_page(struct ib_device *dev, + u64 addr, size_t size, + enum dma_data_direction direction) +{ + if (dev->dma_ops) + dev->dma_ops->unmap_page(dev, addr, size, direction); + else + dma_unmap_page(dev->dma_device, addr, size, direction); +} + +/** + * ib_dma_map_sg - Map a scatter/gather list to DMA addresses + * @dev: The device for which the DMA addresses are to be created + * @sg: The array of scatter/gather entries + * @nents: The number of scatter/gather entries + * @direction: The direction of the DMA + */ +static inline int ib_dma_map_sg(struct ib_device *dev, + struct scatterlist *sg, int nents, + enum dma_data_direction direction) +{ + if (dev->dma_ops) + return dev->dma_ops->map_sg(dev, sg, nents, direction); + return dma_map_sg(dev->dma_device, sg, nents, direction); +} + +/** + * ib_dma_unmap_sg - Unmap a scatter/gather list of DMA addresses + * @dev: The device for which the DMA addresses were created + * @sg: The array of scatter/gather entries + * @nents: The number of scatter/gather entries + * @direction: The direction of the DMA + */ +static inline void ib_dma_unmap_sg(struct ib_device *dev, + struct scatterlist *sg, int nents, + enum dma_data_direction direction) +{ + if (dev->dma_ops) + dev->dma_ops->unmap_sg(dev, sg, nents, direction); + else + dma_unmap_sg(dev->dma_device, sg, nents, direction); +} + +static inline int ib_dma_map_sg_attrs(struct ib_device *dev, + struct scatterlist *sg, int nents, + enum dma_data_direction direction, + struct dma_attrs *attrs) +{ + return dma_map_sg_attrs(dev->dma_device, sg, nents, direction, attrs); +} + +static inline void ib_dma_unmap_sg_attrs(struct ib_device *dev, + struct scatterlist *sg, int nents, + enum dma_data_direction direction, + struct dma_attrs *attrs) +{ + dma_unmap_sg_attrs(dev->dma_device, sg, nents, direction, attrs); +} +/** + * ib_sg_dma_address - Return the DMA address from a scatter/gather entry + * @dev: The device for which the DMA addresses were created + * @sg: The scatter/gather entry + * + * Note: this function is obsolete. To do: change all occurrences of + * ib_sg_dma_address() into sg_dma_address(). + */ +static inline u64 ib_sg_dma_address(struct ib_device *dev, + struct scatterlist *sg) +{ + return sg_dma_address(sg); +} + +/** + * ib_sg_dma_len - Return the DMA length from a scatter/gather entry + * @dev: The device for which the DMA addresses were created + * @sg: The scatter/gather entry + * + * Note: this function is obsolete. To do: change all occurrences of + * ib_sg_dma_len() into sg_dma_len(). + */ +static inline unsigned int ib_sg_dma_len(struct ib_device *dev, + struct scatterlist *sg) +{ + return sg_dma_len(sg); +} + +/** + * ib_dma_sync_single_for_cpu - Prepare DMA region to be accessed by CPU + * @dev: The device for which the DMA address was created + * @addr: The DMA address + * @size: The size of the region in bytes + * @dir: The direction of the DMA + */ +static inline void ib_dma_sync_single_for_cpu(struct ib_device *dev, + u64 addr, + size_t size, + enum dma_data_direction dir) +{ + if (dev->dma_ops) + dev->dma_ops->sync_single_for_cpu(dev, addr, size, dir); + else + dma_sync_single_for_cpu(dev->dma_device, addr, size, dir); +} + +/** + * ib_dma_sync_single_for_device - Prepare DMA region to be accessed by device + * @dev: The device for which the DMA address was created + * @addr: The DMA address + * @size: The size of the region in bytes + * @dir: The direction of the DMA + */ +static inline void ib_dma_sync_single_for_device(struct ib_device *dev, + u64 addr, + size_t size, + enum dma_data_direction dir) +{ + if (dev->dma_ops) + dev->dma_ops->sync_single_for_device(dev, addr, size, dir); + else + dma_sync_single_for_device(dev->dma_device, addr, size, dir); +} + +/** + * ib_dma_alloc_coherent - Allocate memory and map it for DMA + * @dev: The device for which the DMA address is requested + * @size: The size of the region to allocate in bytes + * @dma_handle: A pointer for returning the DMA address of the region + * @flag: memory allocator flags + */ +static inline void *ib_dma_alloc_coherent(struct ib_device *dev, + size_t size, + u64 *dma_handle, + gfp_t flag) +{ + if (dev->dma_ops) + return dev->dma_ops->alloc_coherent(dev, size, dma_handle, flag); + else { + dma_addr_t handle; + void *ret; + + ret = dma_alloc_coherent(dev->dma_device, size, &handle, flag); + *dma_handle = handle; + return ret; + } +} + +/** + * ib_dma_free_coherent - Free memory allocated by ib_dma_alloc_coherent() + * @dev: The device for which the DMA addresses were allocated + * @size: The size of the region + * @cpu_addr: the address returned by ib_dma_alloc_coherent() + * @dma_handle: the DMA address returned by ib_dma_alloc_coherent() + */ +static inline void ib_dma_free_coherent(struct ib_device *dev, + size_t size, void *cpu_addr, + u64 dma_handle) +{ + if (dev->dma_ops) + dev->dma_ops->free_coherent(dev, size, cpu_addr, dma_handle); + else + dma_free_coherent(dev->dma_device, size, cpu_addr, dma_handle); +} + +/** + * ib_reg_phys_mr - Prepares a virtually addressed memory region for use + * by an HCA. + * @pd: The protection domain associated assigned to the registered region. + * @phys_buf_array: Specifies a list of physical buffers to use in the + * memory region. + * @num_phys_buf: Specifies the size of the phys_buf_array. + * @mr_access_flags: Specifies the memory access rights. + * @iova_start: The offset of the region's starting I/O virtual address. + */ +struct ib_mr *ib_reg_phys_mr(struct ib_pd *pd, + struct ib_phys_buf *phys_buf_array, + int num_phys_buf, + int mr_access_flags, + u64 *iova_start); + +/** + * ib_rereg_phys_mr - Modifies the attributes of an existing memory region. + * Conceptually, this call performs the functions deregister memory region + * followed by register physical memory region. Where possible, + * resources are reused instead of deallocated and reallocated. + * @mr: The memory region to modify. + * @mr_rereg_mask: A bit-mask used to indicate which of the following + * properties of the memory region are being modified. + * @pd: If %IB_MR_REREG_PD is set in mr_rereg_mask, this field specifies + * the new protection domain to associated with the memory region, + * otherwise, this parameter is ignored. + * @phys_buf_array: If %IB_MR_REREG_TRANS is set in mr_rereg_mask, this + * field specifies a list of physical buffers to use in the new + * translation, otherwise, this parameter is ignored. + * @num_phys_buf: If %IB_MR_REREG_TRANS is set in mr_rereg_mask, this + * field specifies the size of the phys_buf_array, otherwise, this + * parameter is ignored. + * @mr_access_flags: If %IB_MR_REREG_ACCESS is set in mr_rereg_mask, this + * field specifies the new memory access rights, otherwise, this + * parameter is ignored. + * @iova_start: The offset of the region's starting I/O virtual address. + */ +int ib_rereg_phys_mr(struct ib_mr *mr, + int mr_rereg_mask, + struct ib_pd *pd, + struct ib_phys_buf *phys_buf_array, + int num_phys_buf, + int mr_access_flags, + u64 *iova_start); + +/** + * ib_query_mr - Retrieves information about a specific memory region. + * @mr: The memory region to retrieve information about. + * @mr_attr: The attributes of the specified memory region. + */ +int ib_query_mr(struct ib_mr *mr, struct ib_mr_attr *mr_attr); + +/** + * ib_dereg_mr - Deregisters a memory region and removes it from the + * HCA translation table. + * @mr: The memory region to deregister. + * + * This function can fail, if the memory region has memory windows bound to it. + */ +int ib_dereg_mr(struct ib_mr *mr); + + +/** + * ib_create_mr - Allocates a memory region that may be used for + * signature handover operations. + * @pd: The protection domain associated with the region. + * @mr_init_attr: memory region init attributes. + */ +struct ib_mr *ib_create_mr(struct ib_pd *pd, + struct ib_mr_init_attr *mr_init_attr); + +/** + * ib_destroy_mr - Destroys a memory region that was created using + * ib_create_mr and removes it from HW translation tables. + * @mr: The memory region to destroy. + * + * This function can fail, if the memory region has memory windows bound to it. + */ +int ib_destroy_mr(struct ib_mr *mr); + +/** + * ib_alloc_fast_reg_mr - Allocates memory region usable with the + * IB_WR_FAST_REG_MR send work request. + * @pd: The protection domain associated with the region. + * @max_page_list_len: requested max physical buffer list length to be + * used with fast register work requests for this MR. + */ +struct ib_mr *ib_alloc_fast_reg_mr(struct ib_pd *pd, int max_page_list_len); + +/** + * ib_alloc_fast_reg_page_list - Allocates a page list array + * @device - ib device pointer. + * @page_list_len - size of the page list array to be allocated. + * + * This allocates and returns a struct ib_fast_reg_page_list * and a + * page_list array that is at least page_list_len in size. The actual + * size is returned in max_page_list_len. The caller is responsible + * for initializing the contents of the page_list array before posting + * a send work request with the IB_WC_FAST_REG_MR opcode. + * + * The page_list array entries must be translated using one of the + * ib_dma_*() functions just like the addresses passed to + * ib_map_phys_fmr(). Once the ib_post_send() is issued, the struct + * ib_fast_reg_page_list must not be modified by the caller until the + * IB_WC_FAST_REG_MR work request completes. + */ +struct ib_fast_reg_page_list *ib_alloc_fast_reg_page_list( + struct ib_device *device, int page_list_len); + +/** + * ib_free_fast_reg_page_list - Deallocates a previously allocated + * page list array. + * @page_list - struct ib_fast_reg_page_list pointer to be deallocated. + */ +void ib_free_fast_reg_page_list(struct ib_fast_reg_page_list *page_list); + +/** + * ib_update_fast_reg_key - updates the key portion of the fast_reg MR + * R_Key and L_Key. + * @mr - struct ib_mr pointer to be updated. + * @newkey - new key to be used. + */ +static inline void ib_update_fast_reg_key(struct ib_mr *mr, u8 newkey) +{ + mr->lkey = (mr->lkey & 0xffffff00) | newkey; + mr->rkey = (mr->rkey & 0xffffff00) | newkey; +} + +/** + * ib_inc_rkey - increments the key portion of the given rkey. Can be used + * for calculating a new rkey for type 2 memory windows. + * @rkey - the rkey to increment. + */ +static inline u32 ib_inc_rkey(u32 rkey) +{ + const u32 mask = 0x000000ff; + return ((rkey + 1) & mask) | (rkey & ~mask); +} + +/** + * ib_alloc_mw - Allocates a memory window. + * @pd: The protection domain associated with the memory window. + * @type: The type of the memory window (1 or 2). + */ +struct ib_mw *ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type); + +/** + * ib_bind_mw - Posts a work request to the send queue of the specified + * QP, which binds the memory window to the given address range and + * remote access attributes. + * @qp: QP to post the bind work request on. + * @mw: The memory window to bind. + * @mw_bind: Specifies information about the memory window, including + * its address range, remote access rights, and associated memory region. + * + * If there is no immediate error, the function will update the rkey member + * of the mw parameter to its new value. The bind operation can still fail + * asynchronously. + */ +static inline int ib_bind_mw(struct ib_qp *qp, + struct ib_mw *mw, + struct ib_mw_bind *mw_bind) +{ + /* XXX reference counting in corresponding MR? */ + return mw->device->bind_mw ? + mw->device->bind_mw(qp, mw, mw_bind) : + -ENOSYS; +} + +/** + * ib_dealloc_mw - Deallocates a memory window. + * @mw: The memory window to deallocate. + */ +int ib_dealloc_mw(struct ib_mw *mw); + +/** + * ib_alloc_fmr - Allocates a unmapped fast memory region. + * @pd: The protection domain associated with the unmapped region. + * @mr_access_flags: Specifies the memory access rights. + * @fmr_attr: Attributes of the unmapped region. + * + * A fast memory region must be mapped before it can be used as part of + * a work request. + */ +struct ib_fmr *ib_alloc_fmr(struct ib_pd *pd, + int mr_access_flags, + struct ib_fmr_attr *fmr_attr); + +/** + * ib_map_phys_fmr - Maps a list of physical pages to a fast memory region. + * @fmr: The fast memory region to associate with the pages. + * @page_list: An array of physical pages to map to the fast memory region. + * @list_len: The number of pages in page_list. + * @iova: The I/O virtual address to use with the mapped region. + */ +static inline int ib_map_phys_fmr(struct ib_fmr *fmr, + u64 *page_list, int list_len, + u64 iova) +{ + return fmr->device->map_phys_fmr(fmr, page_list, list_len, iova); +} + +/** + * ib_unmap_fmr - Removes the mapping from a list of fast memory regions. + * @fmr_list: A linked list of fast memory regions to unmap. + */ +int ib_unmap_fmr(struct list_head *fmr_list); + +/** + * ib_dealloc_fmr - Deallocates a fast memory region. + * @fmr: The fast memory region to deallocate. + */ +int ib_dealloc_fmr(struct ib_fmr *fmr); + +/** + * ib_attach_mcast - Attaches the specified QP to a multicast group. + * @qp: QP to attach to the multicast group. The QP must be type + * IB_QPT_UD. + * @gid: Multicast group GID. + * @lid: Multicast group LID in host byte order. + * + * In order to send and receive multicast packets, subnet + * administration must have created the multicast group and configured + * the fabric appropriately. The port associated with the specified + * QP must also be a member of the multicast group. + */ +int ib_attach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid); + +/** + * ib_detach_mcast - Detaches the specified QP from a multicast group. + * @qp: QP to detach from the multicast group. + * @gid: Multicast group GID. + * @lid: Multicast group LID in host byte order. + */ +int ib_detach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid); + +/** + * ib_alloc_xrcd - Allocates an XRC domain. + * @device: The device on which to allocate the XRC domain. + */ +struct ib_xrcd *ib_alloc_xrcd(struct ib_device *device); + +/** + * ib_dealloc_xrcd - Deallocates an XRC domain. + * @xrcd: The XRC domain to deallocate. + */ +int ib_dealloc_xrcd(struct ib_xrcd *xrcd); + +struct ib_flow *ib_create_flow(struct ib_qp *qp, + struct ib_flow_attr *flow_attr, int domain); +int ib_destroy_flow(struct ib_flow *flow_id); + +static inline int ib_check_mr_access(int flags) +{ + /* + * Local write permission is required if remote write or + * remote atomic permission is also requested. + */ + if (flags & (IB_ACCESS_REMOTE_ATOMIC | IB_ACCESS_REMOTE_WRITE) && + !(flags & IB_ACCESS_LOCAL_WRITE)) + return -EINVAL; + + return 0; +} + +/** + * ib_check_mr_status: lightweight check of MR status. + * This routine may provide status checks on a selected + * ib_mr. first use is for signature status check. + * + * @mr: A memory region. + * @check_mask: Bitmask of which checks to perform from + * ib_mr_status_check enumeration. + * @mr_status: The container of relevant status checks. + * failed checks will be indicated in the status bitmask + * and the relevant info shall be in the error item. + */ +int ib_check_mr_status(struct ib_mr *mr, u32 check_mask, + struct ib_mr_status *mr_status); + +#endif /* IB_VERBS_H */ diff --git a/include/rdma/iw_cm.h b/include/rdma/iw_cm.h new file mode 100644 index 0000000..1017e0b --- /dev/null +++ b/include/rdma/iw_cm.h @@ -0,0 +1,251 @@ +/* + * Copyright (c) 2005 Network Appliance, Inc. All rights reserved. + * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef IW_CM_H +#define IW_CM_H + +#include +#include + +struct iw_cm_id; + +enum iw_cm_event_type { + IW_CM_EVENT_CONNECT_REQUEST = 1, /* connect request received */ + IW_CM_EVENT_CONNECT_REPLY, /* reply from active connect request */ + IW_CM_EVENT_ESTABLISHED, /* passive side accept successful */ + IW_CM_EVENT_DISCONNECT, /* orderly shutdown */ + IW_CM_EVENT_CLOSE /* close complete */ +}; + +struct iw_cm_event { + enum iw_cm_event_type event; + int status; + struct sockaddr_storage local_addr; + struct sockaddr_storage remote_addr; + void *private_data; + void *provider_data; + u8 private_data_len; + u8 ord; + u8 ird; +}; + +/** + * iw_cm_handler - Function to be called by the IW CM when delivering events + * to the client. + * + * @cm_id: The IW CM identifier associated with the event. + * @event: Pointer to the event structure. + */ +typedef int (*iw_cm_handler)(struct iw_cm_id *cm_id, + struct iw_cm_event *event); + +/** + * iw_event_handler - Function called by the provider when delivering provider + * events to the IW CM. Returns either 0 indicating the event was processed + * or -errno if the event could not be processed. + * + * @cm_id: The IW CM identifier associated with the event. + * @event: Pointer to the event structure. + */ +typedef int (*iw_event_handler)(struct iw_cm_id *cm_id, + struct iw_cm_event *event); + +struct iw_cm_id { + iw_cm_handler cm_handler; /* client callback function */ + void *context; /* client cb context */ + struct ib_device *device; + struct sockaddr_storage local_addr; + struct sockaddr_storage remote_addr; + void *provider_data; /* provider private data */ + iw_event_handler event_handler; /* cb for provider + events */ + /* Used by provider to add and remove refs on IW cm_id */ + void (*add_ref)(struct iw_cm_id *); + void (*rem_ref)(struct iw_cm_id *); +}; + +struct iw_cm_conn_param { + const void *private_data; + u16 private_data_len; + u32 ord; + u32 ird; + u32 qpn; +}; + +struct iw_cm_verbs { + void (*add_ref)(struct ib_qp *qp); + + void (*rem_ref)(struct ib_qp *qp); + + struct ib_qp * (*get_qp)(struct ib_device *device, + int qpn); + + int (*connect)(struct iw_cm_id *cm_id, + struct iw_cm_conn_param *conn_param); + + int (*accept)(struct iw_cm_id *cm_id, + struct iw_cm_conn_param *conn_param); + + int (*reject)(struct iw_cm_id *cm_id, + const void *pdata, u8 pdata_len); + + int (*create_listen)(struct iw_cm_id *cm_id, + int backlog); + + int (*destroy_listen)(struct iw_cm_id *cm_id); +}; + +/** + * iw_create_cm_id - Create an IW CM identifier. + * + * @device: The IB device on which to create the IW CM identier. + * @event_handler: User callback invoked to report events associated with the + * returned IW CM identifier. + * @context: User specified context associated with the id. + */ +struct iw_cm_id *iw_create_cm_id(struct ib_device *device, + iw_cm_handler cm_handler, void *context); + +/** + * iw_destroy_cm_id - Destroy an IW CM identifier. + * + * @cm_id: The previously created IW CM identifier to destroy. + * + * The client can assume that no events will be delivered for the CM ID after + * this function returns. + */ +void iw_destroy_cm_id(struct iw_cm_id *cm_id); + +/** + * iw_cm_bind_qp - Unbind the specified IW CM identifier and QP + * + * @cm_id: The IW CM idenfier to unbind from the QP. + * @qp: The QP + * + * This is called by the provider when destroying the QP to ensure + * that any references held by the IWCM are released. It may also + * be called by the IWCM when destroying a CM_ID to that any + * references held by the provider are released. + */ +void iw_cm_unbind_qp(struct iw_cm_id *cm_id, struct ib_qp *qp); + +/** + * iw_cm_get_qp - Return the ib_qp associated with a QPN + * + * @ib_device: The IB device + * @qpn: The queue pair number + */ +struct ib_qp *iw_cm_get_qp(struct ib_device *device, int qpn); + +/** + * iw_cm_listen - Listen for incoming connection requests on the + * specified IW CM id. + * + * @cm_id: The IW CM identifier. + * @backlog: The maximum number of outstanding un-accepted inbound listen + * requests to queue. + * + * The source address and port number are specified in the IW CM identifier + * structure. + */ +int iw_cm_listen(struct iw_cm_id *cm_id, int backlog); + +/** + * iw_cm_accept - Called to accept an incoming connect request. + * + * @cm_id: The IW CM identifier associated with the connection request. + * @iw_param: Pointer to a structure containing connection establishment + * parameters. + * + * The specified cm_id will have been provided in the event data for a + * CONNECT_REQUEST event. Subsequent events related to this connection will be + * delivered to the specified IW CM identifier prior and may occur prior to + * the return of this function. If this function returns a non-zero value, the + * client can assume that no events will be delivered to the specified IW CM + * identifier. + */ +int iw_cm_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param); + +/** + * iw_cm_reject - Reject an incoming connection request. + * + * @cm_id: Connection identifier associated with the request. + * @private_daa: Pointer to data to deliver to the remote peer as part of the + * reject message. + * @private_data_len: The number of bytes in the private_data parameter. + * + * The client can assume that no events will be delivered to the specified IW + * CM identifier following the return of this function. The private_data + * buffer is available for reuse when this function returns. + */ +int iw_cm_reject(struct iw_cm_id *cm_id, const void *private_data, + u8 private_data_len); + +/** + * iw_cm_connect - Called to request a connection to a remote peer. + * + * @cm_id: The IW CM identifier for the connection. + * @iw_param: Pointer to a structure containing connection establishment + * parameters. + * + * Events may be delivered to the specified IW CM identifier prior to the + * return of this function. If this function returns a non-zero value, the + * client can assume that no events will be delivered to the specified IW CM + * identifier. + */ +int iw_cm_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param); + +/** + * iw_cm_disconnect - Close the specified connection. + * + * @cm_id: The IW CM identifier to close. + * @abrupt: If 0, the connection will be closed gracefully, otherwise, the + * connection will be reset. + * + * The IW CM identifier is still active until the IW_CM_EVENT_CLOSE event is + * delivered. + */ +int iw_cm_disconnect(struct iw_cm_id *cm_id, int abrupt); + +/** + * iw_cm_init_qp_attr - Called to initialize the attributes of the QP + * associated with a IW CM identifier. + * + * @cm_id: The IW CM identifier associated with the QP + * @qp_attr: Pointer to the QP attributes structure. + * @qp_attr_mask: Pointer to a bit vector specifying which QP attributes are + * valid. + */ +int iw_cm_init_qp_attr(struct iw_cm_id *cm_id, struct ib_qp_attr *qp_attr, + int *qp_attr_mask); + +#endif /* IW_CM_H */ diff --git a/include/rdma/iw_portmap.h b/include/rdma/iw_portmap.h new file mode 100644 index 0000000..928b277 --- /dev/null +++ b/include/rdma/iw_portmap.h @@ -0,0 +1,199 @@ +/* + * Copyright (c) 2014 Intel Corporation. All rights reserved. + * Copyright (c) 2014 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef _IW_PORTMAP_H +#define _IW_PORTMAP_H + +#define IWPM_ULIBNAME_SIZE 32 +#define IWPM_DEVNAME_SIZE 32 +#define IWPM_IFNAME_SIZE 16 +#define IWPM_IPADDR_SIZE 16 + +enum { + IWPM_INVALID_NLMSG_ERR = 10, + IWPM_CREATE_MAPPING_ERR, + IWPM_DUPLICATE_MAPPING_ERR, + IWPM_UNKNOWN_MAPPING_ERR, + IWPM_CLIENT_DEV_INFO_ERR, + IWPM_USER_LIB_INFO_ERR, + IWPM_REMOTE_QUERY_REJECT +}; + +struct iwpm_dev_data { + char dev_name[IWPM_DEVNAME_SIZE]; + char if_name[IWPM_IFNAME_SIZE]; +}; + +struct iwpm_sa_data { + struct sockaddr_storage loc_addr; + struct sockaddr_storage mapped_loc_addr; + struct sockaddr_storage rem_addr; + struct sockaddr_storage mapped_rem_addr; +}; + +/** + * iwpm_init - Allocate resources for the iwarp port mapper + * + * Should be called when network interface goes up. + */ +int iwpm_init(u8); + +/** + * iwpm_exit - Deallocate resources for the iwarp port mapper + * + * Should be called when network interface goes down. + */ +int iwpm_exit(u8); + +/** + * iwpm_valid_pid - Check if the userspace iwarp port mapper pid is valid + * + * Returns true if the pid is greater than zero, otherwise returns false + */ +int iwpm_valid_pid(void); + +/** + * iwpm_register_pid - Send a netlink query to userspace + * to get the iwarp port mapper pid + * @pm_msg: Contains driver info to send to the userspace port mapper + * @nl_client: The index of the netlink client + */ +int iwpm_register_pid(struct iwpm_dev_data *pm_msg, u8 nl_client); + +/** + * iwpm_add_mapping - Send a netlink add mapping request to + * the userspace port mapper + * @pm_msg: Contains the local ip/tcp address info to send + * @nl_client: The index of the netlink client + * + * If the request is successful, the pm_msg stores + * the port mapper response (mapped address info) + */ +int iwpm_add_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client); + +/** + * iwpm_add_and_query_mapping - Send a netlink add and query mapping request + * to the userspace port mapper + * @pm_msg: Contains the local and remote ip/tcp address info to send + * @nl_client: The index of the netlink client + * + * If the request is successful, the pm_msg stores the + * port mapper response (mapped local and remote address info) + */ +int iwpm_add_and_query_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client); + +/** + * iwpm_remove_mapping - Send a netlink remove mapping request + * to the userspace port mapper + * + * @local_addr: Local ip/tcp address to remove + * @nl_client: The index of the netlink client + */ +int iwpm_remove_mapping(struct sockaddr_storage *local_addr, u8 nl_client); + +/** + * iwpm_register_pid_cb - Process the port mapper response to + * iwpm_register_pid query + * @skb: + * @cb: Contains the received message (payload and netlink header) + * + * If successful, the function receives the userspace port mapper pid + * which is used in future communication with the port mapper + */ +int iwpm_register_pid_cb(struct sk_buff *, struct netlink_callback *); + +/** + * iwpm_add_mapping_cb - Process the port mapper response to + * iwpm_add_mapping request + * @skb: + * @cb: Contains the received message (payload and netlink header) + */ +int iwpm_add_mapping_cb(struct sk_buff *, struct netlink_callback *); + +/** + * iwpm_add_and_query_mapping_cb - Process the port mapper response to + * iwpm_add_and_query_mapping request + * @skb: + * @cb: Contains the received message (payload and netlink header) + */ +int iwpm_add_and_query_mapping_cb(struct sk_buff *, struct netlink_callback *); + +/** + * iwpm_mapping_error_cb - Process port mapper notification for error + * + * @skb: + * @cb: Contains the received message (payload and netlink header) + */ +int iwpm_mapping_error_cb(struct sk_buff *, struct netlink_callback *); + +/** + * iwpm_mapping_info_cb - Process a notification that the userspace + * port mapper daemon is started + * @skb: + * @cb: Contains the received message (payload and netlink header) + * + * Using the received port mapper pid, send all the local mapping + * info records to the userspace port mapper + */ +int iwpm_mapping_info_cb(struct sk_buff *, struct netlink_callback *); + +/** + * iwpm_ack_mapping_info_cb - Process the port mapper ack for + * the provided local mapping info records + * @skb: + * @cb: Contains the received message (payload and netlink header) + */ +int iwpm_ack_mapping_info_cb(struct sk_buff *, struct netlink_callback *); + +/** + * iwpm_create_mapinfo - Store local and mapped IPv4/IPv6 address + * info in a hash table + * @local_addr: Local ip/tcp address + * @mapped_addr: Mapped local ip/tcp address + * @nl_client: The index of the netlink client + */ +int iwpm_create_mapinfo(struct sockaddr_storage *local_addr, + struct sockaddr_storage *mapped_addr, u8 nl_client); + +/** + * iwpm_remove_mapinfo - Remove local and mapped IPv4/IPv6 address + * info from the hash table + * @local_addr: Local ip/tcp address + * @mapped_addr: Mapped local ip/tcp address + * + * Returns err code if mapping info is not found in the hash table, + * otherwise returns 0 + */ +int iwpm_remove_mapinfo(struct sockaddr_storage *local_addr, + struct sockaddr_storage *mapped_addr); + +#endif /* _IW_PORTMAP_H */ diff --git a/include/rdma/rdma_cm.h b/include/rdma/rdma_cm.h new file mode 100644 index 0000000..1ed2088 --- /dev/null +++ b/include/rdma/rdma_cm.h @@ -0,0 +1,383 @@ +/* + * Copyright (c) 2005 Voltaire Inc. All rights reserved. + * Copyright (c) 2005 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#if !defined(RDMA_CM_H) +#define RDMA_CM_H + +#include +#include +#include +#include + +/* + * Upon receiving a device removal event, users must destroy the associated + * RDMA identifier and release all resources allocated with the device. + */ +enum rdma_cm_event_type { + RDMA_CM_EVENT_ADDR_RESOLVED, + RDMA_CM_EVENT_ADDR_ERROR, + RDMA_CM_EVENT_ROUTE_RESOLVED, + RDMA_CM_EVENT_ROUTE_ERROR, + RDMA_CM_EVENT_CONNECT_REQUEST, + RDMA_CM_EVENT_CONNECT_RESPONSE, + RDMA_CM_EVENT_CONNECT_ERROR, + RDMA_CM_EVENT_UNREACHABLE, + RDMA_CM_EVENT_REJECTED, + RDMA_CM_EVENT_ESTABLISHED, + RDMA_CM_EVENT_DISCONNECTED, + RDMA_CM_EVENT_DEVICE_REMOVAL, + RDMA_CM_EVENT_MULTICAST_JOIN, + RDMA_CM_EVENT_MULTICAST_ERROR, + RDMA_CM_EVENT_ADDR_CHANGE, + RDMA_CM_EVENT_TIMEWAIT_EXIT +}; + +enum rdma_port_space { + RDMA_PS_SDP = 0x0001, + RDMA_PS_IPOIB = 0x0002, + RDMA_PS_IB = 0x013F, + RDMA_PS_TCP = 0x0106, + RDMA_PS_UDP = 0x0111, +}; + +#define RDMA_IB_IP_PS_MASK 0xFFFFFFFFFFFF0000ULL +#define RDMA_IB_IP_PS_TCP 0x0000000001060000ULL +#define RDMA_IB_IP_PS_UDP 0x0000000001110000ULL +#define RDMA_IB_IP_PS_IB 0x00000000013F0000ULL + +struct rdma_addr { + struct sockaddr_storage src_addr; + struct sockaddr_storage dst_addr; + struct rdma_dev_addr dev_addr; +}; + +struct rdma_route { + struct rdma_addr addr; + struct ib_sa_path_rec *path_rec; + int num_paths; +}; + +struct rdma_conn_param { + const void *private_data; + u8 private_data_len; + u8 responder_resources; + u8 initiator_depth; + u8 flow_control; + u8 retry_count; /* ignored when accepting */ + u8 rnr_retry_count; + /* Fields below ignored if a QP is created on the rdma_cm_id. */ + u8 srq; + u32 qp_num; + u32 qkey; +}; + +struct rdma_ud_param { + const void *private_data; + u8 private_data_len; + struct ib_ah_attr ah_attr; + u32 qp_num; + u32 qkey; +}; + +struct rdma_cm_event { + enum rdma_cm_event_type event; + int status; + union { + struct rdma_conn_param conn; + struct rdma_ud_param ud; + } param; +}; + +enum rdma_cm_state { + RDMA_CM_IDLE, + RDMA_CM_ADDR_QUERY, + RDMA_CM_ADDR_RESOLVED, + RDMA_CM_ROUTE_QUERY, + RDMA_CM_ROUTE_RESOLVED, + RDMA_CM_CONNECT, + RDMA_CM_DISCONNECT, + RDMA_CM_ADDR_BOUND, + RDMA_CM_LISTEN, + RDMA_CM_DEVICE_REMOVAL, + RDMA_CM_DESTROYING +}; + +struct rdma_cm_id; + +/** + * rdma_cm_event_handler - Callback used to report user events. + * + * Notes: Users may not call rdma_destroy_id from this callback to destroy + * the passed in id, or a corresponding listen id. Returning a + * non-zero value from the callback will destroy the passed in id. + */ +typedef int (*rdma_cm_event_handler)(struct rdma_cm_id *id, + struct rdma_cm_event *event); + +struct rdma_cm_id { + struct ib_device *device; + void *context; + struct ib_qp *qp; + rdma_cm_event_handler event_handler; + struct rdma_route route; + enum rdma_port_space ps; + enum ib_qp_type qp_type; + u8 port_num; +}; + +/** + * rdma_create_id - Create an RDMA identifier. + * + * @event_handler: User callback invoked to report events associated with the + * returned rdma_id. + * @context: User specified context associated with the id. + * @ps: RDMA port space. + * @qp_type: type of queue pair associated with the id. + */ +struct rdma_cm_id *rdma_create_id(rdma_cm_event_handler event_handler, + void *context, enum rdma_port_space ps, + enum ib_qp_type qp_type); + +/** + * rdma_destroy_id - Destroys an RDMA identifier. + * + * @id: RDMA identifier. + * + * Note: calling this function has the effect of canceling in-flight + * asynchronous operations associated with the id. + */ +void rdma_destroy_id(struct rdma_cm_id *id); + +/** + * rdma_bind_addr - Bind an RDMA identifier to a source address and + * associated RDMA device, if needed. + * + * @id: RDMA identifier. + * @addr: Local address information. Wildcard values are permitted. + * + * This associates a source address with the RDMA identifier before calling + * rdma_listen. If a specific local address is given, the RDMA identifier will + * be bound to a local RDMA device. + */ +int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr); + +/** + * rdma_resolve_addr - Resolve destination and optional source addresses + * from IP addresses to an RDMA address. If successful, the specified + * rdma_cm_id will be bound to a local device. + * + * @id: RDMA identifier. + * @src_addr: Source address information. This parameter may be NULL. + * @dst_addr: Destination address information. + * @timeout_ms: Time to wait for resolution to complete. + */ +int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, + struct sockaddr *dst_addr, int timeout_ms); + +/** + * rdma_resolve_route - Resolve the RDMA address bound to the RDMA identifier + * into route information needed to establish a connection. + * + * This is called on the client side of a connection. + * Users must have first called rdma_resolve_addr to resolve a dst_addr + * into an RDMA address before calling this routine. + */ +int rdma_resolve_route(struct rdma_cm_id *id, int timeout_ms); + +/** + * rdma_create_qp - Allocate a QP and associate it with the specified RDMA + * identifier. + * + * QPs allocated to an rdma_cm_id will automatically be transitioned by the CMA + * through their states. + */ +int rdma_create_qp(struct rdma_cm_id *id, struct ib_pd *pd, + struct ib_qp_init_attr *qp_init_attr); + +/** + * rdma_destroy_qp - Deallocate the QP associated with the specified RDMA + * identifier. + * + * Users must destroy any QP associated with an RDMA identifier before + * destroying the RDMA ID. + */ +void rdma_destroy_qp(struct rdma_cm_id *id); + +/** + * rdma_init_qp_attr - Initializes the QP attributes for use in transitioning + * to a specified QP state. + * @id: Communication identifier associated with the QP attributes to + * initialize. + * @qp_attr: On input, specifies the desired QP state. On output, the + * mandatory and desired optional attributes will be set in order to + * modify the QP to the specified state. + * @qp_attr_mask: The QP attribute mask that may be used to transition the + * QP to the specified state. + * + * Users must set the @qp_attr->qp_state to the desired QP state. This call + * will set all required attributes for the given transition, along with + * known optional attributes. Users may override the attributes returned from + * this call before calling ib_modify_qp. + * + * Users that wish to have their QP automatically transitioned through its + * states can associate a QP with the rdma_cm_id by calling rdma_create_qp(). + */ +int rdma_init_qp_attr(struct rdma_cm_id *id, struct ib_qp_attr *qp_attr, + int *qp_attr_mask); + +/** + * rdma_connect - Initiate an active connection request. + * @id: Connection identifier to connect. + * @conn_param: Connection information used for connected QPs. + * + * Users must have resolved a route for the rdma_cm_id to connect with + * by having called rdma_resolve_route before calling this routine. + * + * This call will either connect to a remote QP or obtain remote QP + * information for unconnected rdma_cm_id's. The actual operation is + * based on the rdma_cm_id's port space. + */ +int rdma_connect(struct rdma_cm_id *id, struct rdma_conn_param *conn_param); + +/** + * rdma_listen - This function is called by the passive side to + * listen for incoming connection requests. + * + * Users must have bound the rdma_cm_id to a local address by calling + * rdma_bind_addr before calling this routine. + */ +int rdma_listen(struct rdma_cm_id *id, int backlog); + +/** + * rdma_accept - Called to accept a connection request or response. + * @id: Connection identifier associated with the request. + * @conn_param: Information needed to establish the connection. This must be + * provided if accepting a connection request. If accepting a connection + * response, this parameter must be NULL. + * + * Typically, this routine is only called by the listener to accept a connection + * request. It must also be called on the active side of a connection if the + * user is performing their own QP transitions. + * + * In the case of error, a reject message is sent to the remote side and the + * state of the qp associated with the id is modified to error, such that any + * previously posted receive buffers would be flushed. + */ +int rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param); + +/** + * rdma_notify - Notifies the RDMA CM of an asynchronous event that has + * occurred on the connection. + * @id: Connection identifier to transition to established. + * @event: Asynchronous event. + * + * This routine should be invoked by users to notify the CM of relevant + * communication events. Events that should be reported to the CM and + * when to report them are: + * + * IB_EVENT_COMM_EST - Used when a message is received on a connected + * QP before an RTU has been received. + */ +int rdma_notify(struct rdma_cm_id *id, enum ib_event_type event); + +/** + * rdma_reject - Called to reject a connection request or response. + */ +int rdma_reject(struct rdma_cm_id *id, const void *private_data, + u8 private_data_len); + +/** + * rdma_disconnect - This function disconnects the associated QP and + * transitions it into the error state. + */ +int rdma_disconnect(struct rdma_cm_id *id); + +/** + * rdma_join_multicast - Join the multicast group specified by the given + * address. + * @id: Communication identifier associated with the request. + * @addr: Multicast address identifying the group to join. + * @context: User-defined context associated with the join request, returned + * to the user through the private_data pointer in multicast events. + */ +int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr, + void *context); + +/** + * rdma_leave_multicast - Leave the multicast group specified by the given + * address. + */ +void rdma_leave_multicast(struct rdma_cm_id *id, struct sockaddr *addr); + +/** + * rdma_set_service_type - Set the type of service associated with a + * connection identifier. + * @id: Communication identifier to associated with service type. + * @tos: Type of service. + * + * The type of service is interpretted as a differentiated service + * field (RFC 2474). The service type should be specified before + * performing route resolution, as existing communication on the + * connection identifier may be unaffected. The type of service + * requested may not be supported by the network to all destinations. + */ +void rdma_set_service_type(struct rdma_cm_id *id, int tos); + +/** + * rdma_set_reuseaddr - Allow the reuse of local addresses when binding + * the rdma_cm_id. + * @id: Communication identifier to configure. + * @reuse: Value indicating if the bound address is reusable. + * + * Reuse must be set before an address is bound to the id. + */ +int rdma_set_reuseaddr(struct rdma_cm_id *id, int reuse); + +/** + * rdma_set_afonly - Specify that listens are restricted to the + * bound address family only. + * @id: Communication identifer to configure. + * @afonly: Value indicating if listens are restricted. + * + * Must be set before identifier is in the listening state. + */ +int rdma_set_afonly(struct rdma_cm_id *id, int afonly); + + /** + * rdma_get_service_id - Return the IB service ID for a specified address. + * @id: Communication identifier associated with the address. + * @addr: Address for the service ID. + */ +__be64 rdma_get_service_id(struct rdma_cm_id *id, struct sockaddr *addr); + +#endif /* RDMA_CM_H */ diff --git a/include/rdma/rdma_cm_ib.h b/include/rdma/rdma_cm_ib.h new file mode 100644 index 0000000..2389c3b --- /dev/null +++ b/include/rdma/rdma_cm_ib.h @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2006 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#if !defined(RDMA_CM_IB_H) +#define RDMA_CM_IB_H + +#include + +/** + * rdma_set_ib_paths - Manually sets the path records used to establish a + * connection. + * @id: Connection identifier associated with the request. + * @path_rec: Reference to the path record + * + * This call permits a user to specify routing information for rdma_cm_id's + * bound to Infiniband devices. It is called on the client side of a + * connection and replaces the call to rdma_resolve_route. + */ +int rdma_set_ib_paths(struct rdma_cm_id *id, + struct ib_sa_path_rec *path_rec, int num_paths); + +/* Global qkey for UDP QPs and multicast groups. */ +#define RDMA_UDP_QKEY 0x01234567 + +#endif /* RDMA_CM_IB_H */ diff --git a/include/rdma/rdma_netlink.h b/include/rdma/rdma_netlink.h new file mode 100644 index 0000000..0790882 --- /dev/null +++ b/include/rdma/rdma_netlink.h @@ -0,0 +1,80 @@ +#ifndef _RDMA_NETLINK_H +#define _RDMA_NETLINK_H + + +#include +#include + +struct ibnl_client_cbs { + int (*dump)(struct sk_buff *skb, struct netlink_callback *nlcb); + struct module *module; +}; + +int ibnl_init(void); +void ibnl_cleanup(void); + +/** + * Add a a client to the list of IB netlink exporters. + * @index: Index of the added client + * @nops: Number of supported ops by the added client. + * @cb_table: A table for op->callback + * + * Returns 0 on success or a negative error code. + */ +int ibnl_add_client(int index, int nops, + const struct ibnl_client_cbs cb_table[]); + +/** + * Remove a client from IB netlink. + * @index: Index of the removed IB client. + * + * Returns 0 on success or a negative error code. + */ +int ibnl_remove_client(int index); + +/** + * Put a new message in a supplied skb. + * @skb: The netlink skb. + * @nlh: Pointer to put the header of the new netlink message. + * @seq: The message sequence number. + * @len: The requested message length to allocate. + * @client: Calling IB netlink client. + * @op: message content op. + * Returns the allocated buffer on success and NULL on failure. + */ +void *ibnl_put_msg(struct sk_buff *skb, struct nlmsghdr **nlh, int seq, + int len, int client, int op, int flags); +/** + * Put a new attribute in a supplied skb. + * @skb: The netlink skb. + * @nlh: Header of the netlink message to append the attribute to. + * @len: The length of the attribute data. + * @data: The attribute data to put. + * @type: The attribute type. + * Returns the 0 and a negative error code on failure. + */ +int ibnl_put_attr(struct sk_buff *skb, struct nlmsghdr *nlh, + int len, void *data, int type); + +/** + * Send the supplied skb to a specific userspace PID. + * @skb: The netlink skb + * @nlh: Header of the netlink message to send + * @pid: Userspace netlink process ID + * Returns 0 on success or a negative error code. + */ +int ibnl_unicast(struct sk_buff *skb, struct nlmsghdr *nlh, + __u32 pid); + +/** + * Send the supplied skb to a netlink group. + * @skb: The netlink skb + * @nlh: Header of the netlink message to send + * @group: Netlink group ID + * @flags: allocation flags + * Returns 0 on success or a negative error code. + */ +int ibnl_multicast(struct sk_buff *skb, struct nlmsghdr *nlh, + unsigned int group, gfp_t flags); + +#endif /* _RDMA_NETLINK_H */ diff --git a/include/scsi/scsi_transport_srp.h b/include/scsi/scsi_transport_srp.h new file mode 100644 index 0000000..cdb05dd --- /dev/null +++ b/include/scsi/scsi_transport_srp.h @@ -0,0 +1,149 @@ +#ifndef SCSI_TRANSPORT_SRP_H +#define SCSI_TRANSPORT_SRP_H + +#include +#include +#include + +#define SRP_RPORT_ROLE_INITIATOR 0 +#define SRP_RPORT_ROLE_TARGET 1 + +struct srp_rport_identifiers { + u8 port_id[16]; + u8 roles; +}; + +/** + * enum srp_rport_state - SRP transport layer state + * @SRP_RPORT_RUNNING: Transport layer operational. + * @SRP_RPORT_BLOCKED: Transport layer not operational; fast I/O fail timer + * is running and I/O has been blocked. + * @SRP_RPORT_FAIL_FAST: Fast I/O fail timer has expired; fail I/O fast. + * @SRP_RPORT_LOST: Port is being removed. + */ +enum srp_rport_state { + SRP_RPORT_RUNNING, + SRP_RPORT_BLOCKED, + SRP_RPORT_FAIL_FAST, + SRP_RPORT_LOST, +}; + +/** + * struct srp_rport - SRP initiator or target port + * + * Fields that are relevant for SRP initiator and SRP target drivers: + * @dev: Device associated with this rport. + * @port_id: 16-byte port identifier. + * @roles: Role of this port - initiator or target. + * + * Fields that are only relevant for SRP initiator drivers: + * @lld_data: LLD private data. + * @mutex: Protects against concurrent rport reconnect / + * fast_io_fail / dev_loss_tmo activity. + * @state: rport state. + * @reconnect_delay: Reconnect delay in seconds. + * @failed_reconnects: Number of failed reconnect attempts. + * @reconnect_work: Work structure used for scheduling reconnect attempts. + * @fast_io_fail_tmo: Fast I/O fail timeout in seconds. + * @dev_loss_tmo: Device loss timeout in seconds. + * @fast_io_fail_work: Work structure used for scheduling fast I/O fail work. + * @dev_loss_work: Work structure used for scheduling device loss work. + */ +struct srp_rport { + /* for initiator and target drivers */ + + struct device dev; + + u8 port_id[16]; + u8 roles; + + /* for initiator drivers */ + + void *lld_data; + + struct mutex mutex; + enum srp_rport_state state; + int reconnect_delay; + int failed_reconnects; + struct delayed_work reconnect_work; + int fast_io_fail_tmo; + int dev_loss_tmo; + struct delayed_work fast_io_fail_work; + struct delayed_work dev_loss_work; +}; + +/** + * struct srp_function_template + * + * Fields that are only relevant for SRP initiator drivers: + * @has_rport_state: Whether or not to create the state, fast_io_fail_tmo and + * dev_loss_tmo sysfs attribute for an rport. + * @reset_timer_if_blocked: Whether or srp_timed_out() should reset the command + * timer if the device on which it has been queued is blocked. + * @reconnect_delay: If not NULL, points to the default reconnect_delay value. + * @fast_io_fail_tmo: If not NULL, points to the default fast_io_fail_tmo value. + * @dev_loss_tmo: If not NULL, points to the default dev_loss_tmo value. + * @reconnect: Callback function for reconnecting to the target. See also + * srp_reconnect_rport(). + * @terminate_rport_io: Callback function for terminating all outstanding I/O + * requests for an rport. + * @rport_delete: Callback function that deletes an rport. + * + * Fields that are only relevant for SRP target drivers: + * @tsk_mgmt_response: Callback function for sending a task management response. + * @it_nexus_response: Callback function for processing an IT nexus response. + */ +struct srp_function_template { + /* for initiator drivers */ + bool has_rport_state; + bool reset_timer_if_blocked; + int *reconnect_delay; + int *fast_io_fail_tmo; + int *dev_loss_tmo; + int (*reconnect)(struct srp_rport *rport); + void (*terminate_rport_io)(struct srp_rport *rport); + void (*rport_delete)(struct srp_rport *rport); + /* for target drivers */ + int (* tsk_mgmt_response)(struct Scsi_Host *, u64, u64, int); + int (* it_nexus_response)(struct Scsi_Host *, u64, int); +}; + +extern struct scsi_transport_template * +srp_attach_transport(struct srp_function_template *); +extern void srp_release_transport(struct scsi_transport_template *); + +extern void srp_rport_get(struct srp_rport *rport); +extern void srp_rport_put(struct srp_rport *rport); +extern struct srp_rport *srp_rport_add(struct Scsi_Host *, + struct srp_rport_identifiers *); +extern void srp_rport_del(struct srp_rport *); +extern int srp_tmo_valid(int reconnect_delay, int fast_io_fail_tmo, + int dev_loss_tmo); +extern int srp_reconnect_rport(struct srp_rport *rport); +extern void srp_start_tl_fail_timers(struct srp_rport *rport); +extern void srp_remove_host(struct Scsi_Host *); +extern void srp_stop_rport_timers(struct srp_rport *rport); + +/** + * srp_chkready() - evaluate the transport layer state before I/O + * @rport: SRP target port pointer. + * + * Returns a SCSI result code that can be returned by the LLD queuecommand() + * implementation. The role of this function is similar to that of + * fc_remote_port_chkready(). + */ +static inline int srp_chkready(struct srp_rport *rport) +{ + switch (rport->state) { + case SRP_RPORT_RUNNING: + case SRP_RPORT_BLOCKED: + default: + return 0; + case SRP_RPORT_FAIL_FAST: + return DID_TRANSPORT_FAILFAST << 16; + case SRP_RPORT_LOST: + return DID_NO_CONNECT << 16; + } +} + +#endif diff --git a/include/scsi/srp.h b/include/scsi/srp.h new file mode 100644 index 0000000..1ae84db --- /dev/null +++ b/include/scsi/srp.h @@ -0,0 +1,280 @@ +/* + * Copyright (c) 2005 Cisco Systems. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * $Id$ + */ + +#ifndef SCSI_SRP_H +#define SCSI_SRP_H + +/* + * Structures and constants for the SCSI RDMA Protocol (SRP) as + * defined by the INCITS T10 committee. This file was written using + * draft Revision 16a of the SRP standard. + */ + +#include + +enum { + SRP_LOGIN_REQ = 0x00, + SRP_TSK_MGMT = 0x01, + SRP_CMD = 0x02, + SRP_I_LOGOUT = 0x03, + SRP_LOGIN_RSP = 0xc0, + SRP_RSP = 0xc1, + SRP_LOGIN_REJ = 0xc2, + SRP_T_LOGOUT = 0x80, + SRP_CRED_REQ = 0x81, + SRP_AER_REQ = 0x82, + SRP_CRED_RSP = 0x41, + SRP_AER_RSP = 0x42 +}; + +enum { + SRP_BUF_FORMAT_DIRECT = 1 << 1, + SRP_BUF_FORMAT_INDIRECT = 1 << 2 +}; + +enum { + SRP_NO_DATA_DESC = 0, + SRP_DATA_DESC_DIRECT = 1, + SRP_DATA_DESC_INDIRECT = 2 +}; + +enum { + SRP_TSK_ABORT_TASK = 0x01, + SRP_TSK_ABORT_TASK_SET = 0x02, + SRP_TSK_CLEAR_TASK_SET = 0x04, + SRP_TSK_LUN_RESET = 0x08, + SRP_TSK_CLEAR_ACA = 0x40 +}; + +enum srp_login_rej_reason { + SRP_LOGIN_REJ_UNABLE_ESTABLISH_CHANNEL = 0x00010000, + SRP_LOGIN_REJ_INSUFFICIENT_RESOURCES = 0x00010001, + SRP_LOGIN_REJ_REQ_IT_IU_LENGTH_TOO_LARGE = 0x00010002, + SRP_LOGIN_REJ_UNABLE_ASSOCIATE_CHANNEL = 0x00010003, + SRP_LOGIN_REJ_UNSUPPORTED_DESCRIPTOR_FMT = 0x00010004, + SRP_LOGIN_REJ_MULTI_CHANNEL_UNSUPPORTED = 0x00010005, + SRP_LOGIN_REJ_CHANNEL_LIMIT_REACHED = 0x00010006 +}; + +enum { + SRP_REV10_IB_IO_CLASS = 0xff00, + SRP_REV16A_IB_IO_CLASS = 0x0100 +}; + +struct srp_direct_buf { + __be64 va; + __be32 key; + __be32 len; +}; + +/* + * We need the packed attribute because the SRP spec puts the list of + * descriptors at an offset of 20, which is not aligned to the size of + * struct srp_direct_buf. The whole structure must be packed to avoid + * having the 20-byte structure padded to 24 bytes on 64-bit architectures. + */ +struct srp_indirect_buf { + struct srp_direct_buf table_desc; + __be32 len; + struct srp_direct_buf desc_list[0]; +} __attribute__((packed)); + +enum { + SRP_MULTICHAN_SINGLE = 0, + SRP_MULTICHAN_MULTI = 1 +}; + +struct srp_login_req { + u8 opcode; + u8 reserved1[7]; + u64 tag; + __be32 req_it_iu_len; + u8 reserved2[4]; + __be16 req_buf_fmt; + u8 req_flags; + u8 reserved3[5]; + u8 initiator_port_id[16]; + u8 target_port_id[16]; +}; + +/* + * The SRP spec defines the size of the LOGIN_RSP structure to be 52 + * bytes, so it needs to be packed to avoid having it padded to 56 + * bytes on 64-bit architectures. + */ +struct srp_login_rsp { + u8 opcode; + u8 reserved1[3]; + __be32 req_lim_delta; + u64 tag; + __be32 max_it_iu_len; + __be32 max_ti_iu_len; + __be16 buf_fmt; + u8 rsp_flags; + u8 reserved2[25]; +} __attribute__((packed)); + +struct srp_login_rej { + u8 opcode; + u8 reserved1[3]; + __be32 reason; + u64 tag; + u8 reserved2[8]; + __be16 buf_fmt; + u8 reserved3[6]; +}; + +struct srp_i_logout { + u8 opcode; + u8 reserved[7]; + u64 tag; +}; + +struct srp_t_logout { + u8 opcode; + u8 sol_not; + u8 reserved[2]; + __be32 reason; + u64 tag; +}; + +/* + * We need the packed attribute because the SRP spec only aligns the + * 8-byte LUN field to 4 bytes. + */ +struct srp_tsk_mgmt { + u8 opcode; + u8 sol_not; + u8 reserved1[6]; + u64 tag; + u8 reserved2[4]; + __be64 lun __attribute__((packed)); + u8 reserved3[2]; + u8 tsk_mgmt_func; + u8 reserved4; + u64 task_tag; + u8 reserved5[8]; +}; + +/* + * We need the packed attribute because the SRP spec only aligns the + * 8-byte LUN field to 4 bytes. + */ +struct srp_cmd { + u8 opcode; + u8 sol_not; + u8 reserved1[3]; + u8 buf_fmt; + u8 data_out_desc_cnt; + u8 data_in_desc_cnt; + u64 tag; + u8 reserved2[4]; + __be64 lun __attribute__((packed)); + u8 reserved3; + u8 task_attr; + u8 reserved4; + u8 add_cdb_len; + u8 cdb[16]; + u8 add_data[0]; +}; + +enum { + SRP_RSP_FLAG_RSPVALID = 1 << 0, + SRP_RSP_FLAG_SNSVALID = 1 << 1, + SRP_RSP_FLAG_DOOVER = 1 << 2, + SRP_RSP_FLAG_DOUNDER = 1 << 3, + SRP_RSP_FLAG_DIOVER = 1 << 4, + SRP_RSP_FLAG_DIUNDER = 1 << 5 +}; + +/* + * The SRP spec defines the size of the RSP structure to be 36 bytes, + * so it needs to be packed to avoid having it padded to 40 bytes on + * 64-bit architectures. + */ +struct srp_rsp { + u8 opcode; + u8 sol_not; + u8 reserved1[2]; + __be32 req_lim_delta; + u64 tag; + u8 reserved2[2]; + u8 flags; + u8 status; + __be32 data_out_res_cnt; + __be32 data_in_res_cnt; + __be32 sense_data_len; + __be32 resp_data_len; + u8 data[0]; +} __attribute__((packed)); + +struct srp_cred_req { + u8 opcode; + u8 sol_not; + u8 reserved[2]; + __be32 req_lim_delta; + u64 tag; +}; + +struct srp_cred_rsp { + u8 opcode; + u8 reserved[7]; + u64 tag; +}; + +/* + * The SRP spec defines the fixed portion of the AER_REQ structure to be + * 36 bytes, so it needs to be packed to avoid having it padded to 40 bytes + * on 64-bit architectures. + */ +struct srp_aer_req { + u8 opcode; + u8 sol_not; + u8 reserved[2]; + __be32 req_lim_delta; + u64 tag; + u32 reserved2; + __be64 lun; + __be32 sense_data_len; + u32 reserved3; + u8 sense_data[0]; +} __attribute__((packed)); + +struct srp_aer_rsp { + u8 opcode; + u8 reserved[7]; + u64 tag; +}; + +#endif /* SCSI_SRP_H */ diff --git a/include/uapi/linux/rds.h b/include/uapi/linux/rds.h new file mode 100644 index 0000000..9195095 --- /dev/null +++ b/include/uapi/linux/rds.h @@ -0,0 +1,285 @@ +/* + * Copyright (c) 2008 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef _LINUX_RDS_H +#define _LINUX_RDS_H + +#include + +#define RDS_IB_ABI_VERSION 0x301 + +/* + * setsockopt/getsockopt for SOL_RDS + */ +#define RDS_CANCEL_SENT_TO 1 +#define RDS_GET_MR 2 +#define RDS_FREE_MR 3 +/* deprecated: RDS_BARRIER 4 */ +#define RDS_RECVERR 5 +#define RDS_CONG_MONITOR 6 +#define RDS_GET_MR_FOR_DEST 7 + +/* + * Control message types for SOL_RDS. + * + * CMSG_RDMA_ARGS (sendmsg) + * Request a RDMA transfer to/from the specified + * memory ranges. + * The cmsg_data is a struct rds_rdma_args. + * RDS_CMSG_RDMA_DEST (recvmsg, sendmsg) + * Kernel informs application about intended + * source/destination of a RDMA transfer + * RDS_CMSG_RDMA_MAP (sendmsg) + * Application asks kernel to map the given + * memory range into a IB MR, and send the + * R_Key along in an RDS extension header. + * The cmsg_data is a struct rds_get_mr_args, + * the same as for the GET_MR setsockopt. + * RDS_CMSG_RDMA_STATUS (recvmsg) + * Returns the status of a completed RDMA operation. + */ +#define RDS_CMSG_RDMA_ARGS 1 +#define RDS_CMSG_RDMA_DEST 2 +#define RDS_CMSG_RDMA_MAP 3 +#define RDS_CMSG_RDMA_STATUS 4 +#define RDS_CMSG_CONG_UPDATE 5 +#define RDS_CMSG_ATOMIC_FADD 6 +#define RDS_CMSG_ATOMIC_CSWP 7 +#define RDS_CMSG_MASKED_ATOMIC_FADD 8 +#define RDS_CMSG_MASKED_ATOMIC_CSWP 9 + +#define RDS_INFO_FIRST 10000 +#define RDS_INFO_COUNTERS 10000 +#define RDS_INFO_CONNECTIONS 10001 +/* 10002 aka RDS_INFO_FLOWS is deprecated */ +#define RDS_INFO_SEND_MESSAGES 10003 +#define RDS_INFO_RETRANS_MESSAGES 10004 +#define RDS_INFO_RECV_MESSAGES 10005 +#define RDS_INFO_SOCKETS 10006 +#define RDS_INFO_TCP_SOCKETS 10007 +#define RDS_INFO_IB_CONNECTIONS 10008 +#define RDS_INFO_CONNECTION_STATS 10009 +#define RDS_INFO_IWARP_CONNECTIONS 10010 +#define RDS_INFO_LAST 10010 + +struct rds_info_counter { + uint8_t name[32]; + uint64_t value; +} __attribute__((packed)); + +#define RDS_INFO_CONNECTION_FLAG_SENDING 0x01 +#define RDS_INFO_CONNECTION_FLAG_CONNECTING 0x02 +#define RDS_INFO_CONNECTION_FLAG_CONNECTED 0x04 + +#define TRANSNAMSIZ 16 + +struct rds_info_connection { + uint64_t next_tx_seq; + uint64_t next_rx_seq; + __be32 laddr; + __be32 faddr; + uint8_t transport[TRANSNAMSIZ]; /* null term ascii */ + uint8_t flags; +} __attribute__((packed)); + +#define RDS_INFO_MESSAGE_FLAG_ACK 0x01 +#define RDS_INFO_MESSAGE_FLAG_FAST_ACK 0x02 + +struct rds_info_message { + uint64_t seq; + uint32_t len; + __be32 laddr; + __be32 faddr; + __be16 lport; + __be16 fport; + uint8_t flags; +} __attribute__((packed)); + +struct rds_info_socket { + uint32_t sndbuf; + __be32 bound_addr; + __be32 connected_addr; + __be16 bound_port; + __be16 connected_port; + uint32_t rcvbuf; + uint64_t inum; +} __attribute__((packed)); + +struct rds_info_tcp_socket { + __be32 local_addr; + __be16 local_port; + __be32 peer_addr; + __be16 peer_port; + uint64_t hdr_rem; + uint64_t data_rem; + uint32_t last_sent_nxt; + uint32_t last_expected_una; + uint32_t last_seen_una; +} __attribute__((packed)); + +#define RDS_IB_GID_LEN 16 +struct rds_info_rdma_connection { + __be32 src_addr; + __be32 dst_addr; + uint8_t src_gid[RDS_IB_GID_LEN]; + uint8_t dst_gid[RDS_IB_GID_LEN]; + + uint32_t max_send_wr; + uint32_t max_recv_wr; + uint32_t max_send_sge; + uint32_t rdma_mr_max; + uint32_t rdma_mr_size; +}; + +/* + * Congestion monitoring. + * Congestion control in RDS happens at the host connection + * level by exchanging a bitmap marking congested ports. + * By default, a process sleeping in poll() is always woken + * up when the congestion map is updated. + * With explicit monitoring, an application can have more + * fine-grained control. + * The application installs a 64bit mask value in the socket, + * where each bit corresponds to a group of ports. + * When a congestion update arrives, RDS checks the set of + * ports that are now uncongested against the list bit mask + * installed in the socket, and if they overlap, we queue a + * cong_notification on the socket. + * + * To install the congestion monitor bitmask, use RDS_CONG_MONITOR + * with the 64bit mask. + * Congestion updates are received via RDS_CMSG_CONG_UPDATE + * control messages. + * + * The correspondence between bits and ports is + * 1 << (portnum % 64) + */ +#define RDS_CONG_MONITOR_SIZE 64 +#define RDS_CONG_MONITOR_BIT(port) (((unsigned int) port) % RDS_CONG_MONITOR_SIZE) +#define RDS_CONG_MONITOR_MASK(port) (1ULL << RDS_CONG_MONITOR_BIT(port)) + +/* + * RDMA related types + */ + +/* + * This encapsulates a remote memory location. + * In the current implementation, it contains the R_Key + * of the remote memory region, and the offset into it + * (so that the application does not have to worry about + * alignment). + */ +typedef uint64_t rds_rdma_cookie_t; + +struct rds_iovec { + uint64_t addr; + uint64_t bytes; +}; + +struct rds_get_mr_args { + struct rds_iovec vec; + uint64_t cookie_addr; + uint64_t flags; +}; + +struct rds_get_mr_for_dest_args { + struct sockaddr_storage dest_addr; + struct rds_iovec vec; + uint64_t cookie_addr; + uint64_t flags; +}; + +struct rds_free_mr_args { + rds_rdma_cookie_t cookie; + uint64_t flags; +}; + +struct rds_rdma_args { + rds_rdma_cookie_t cookie; + struct rds_iovec remote_vec; + uint64_t local_vec_addr; + uint64_t nr_local; + uint64_t flags; + uint64_t user_token; +}; + +struct rds_atomic_args { + rds_rdma_cookie_t cookie; + uint64_t local_addr; + uint64_t remote_addr; + union { + struct { + uint64_t compare; + uint64_t swap; + } cswp; + struct { + uint64_t add; + } fadd; + struct { + uint64_t compare; + uint64_t swap; + uint64_t compare_mask; + uint64_t swap_mask; + } m_cswp; + struct { + uint64_t add; + uint64_t nocarry_mask; + } m_fadd; + }; + uint64_t flags; + uint64_t user_token; +}; + +struct rds_rdma_notify { + uint64_t user_token; + int32_t status; +}; + +#define RDS_RDMA_SUCCESS 0 +#define RDS_RDMA_REMOTE_ERROR 1 +#define RDS_RDMA_CANCELED 2 +#define RDS_RDMA_DROPPED 3 +#define RDS_RDMA_OTHER_ERROR 4 + +/* + * Common set of flags for all RDMA related structs + */ +#define RDS_RDMA_READWRITE 0x0001 +#define RDS_RDMA_FENCE 0x0002 /* use FENCE for immediate send */ +#define RDS_RDMA_INVALIDATE 0x0004 /* invalidate R_Key after freeing MR */ +#define RDS_RDMA_USE_ONCE 0x0008 /* free MR after use */ +#define RDS_RDMA_DONTWAIT 0x0010 /* Don't wait in SET_BARRIER */ +#define RDS_RDMA_NOTIFY_ME 0x0020 /* Notify when operation completes */ +#define RDS_RDMA_SILENT 0x0040 /* Do not interrupt remote */ + +#endif /* IB_RDS_H */ diff --git a/include/uapi/rdma/Kbuild b/include/uapi/rdma/Kbuild new file mode 100644 index 0000000..687ae33 --- /dev/null +++ b/include/uapi/rdma/Kbuild @@ -0,0 +1,7 @@ +# UAPI Header export list +header-y += ib_user_cm.h +header-y += ib_user_mad.h +header-y += ib_user_sa.h +header-y += ib_user_verbs.h +header-y += rdma_netlink.h +header-y += rdma_user_cm.h diff --git a/include/uapi/rdma/ib_user_cm.h b/include/uapi/rdma/ib_user_cm.h new file mode 100644 index 0000000..f79014a --- /dev/null +++ b/include/uapi/rdma/ib_user_cm.h @@ -0,0 +1,325 @@ +/* + * Copyright (c) 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef IB_USER_CM_H +#define IB_USER_CM_H + +#include +#include + +#define IB_USER_CM_ABI_VERSION 5 + +enum { + IB_USER_CM_CMD_CREATE_ID, + IB_USER_CM_CMD_DESTROY_ID, + IB_USER_CM_CMD_ATTR_ID, + + IB_USER_CM_CMD_LISTEN, + IB_USER_CM_CMD_NOTIFY, + + IB_USER_CM_CMD_SEND_REQ, + IB_USER_CM_CMD_SEND_REP, + IB_USER_CM_CMD_SEND_RTU, + IB_USER_CM_CMD_SEND_DREQ, + IB_USER_CM_CMD_SEND_DREP, + IB_USER_CM_CMD_SEND_REJ, + IB_USER_CM_CMD_SEND_MRA, + IB_USER_CM_CMD_SEND_LAP, + IB_USER_CM_CMD_SEND_APR, + IB_USER_CM_CMD_SEND_SIDR_REQ, + IB_USER_CM_CMD_SEND_SIDR_REP, + + IB_USER_CM_CMD_EVENT, + IB_USER_CM_CMD_INIT_QP_ATTR, +}; +/* + * command ABI structures. + */ +struct ib_ucm_cmd_hdr { + __u32 cmd; + __u16 in; + __u16 out; +}; + +struct ib_ucm_create_id { + __u64 uid; + __u64 response; +}; + +struct ib_ucm_create_id_resp { + __u32 id; +}; + +struct ib_ucm_destroy_id { + __u64 response; + __u32 id; + __u32 reserved; +}; + +struct ib_ucm_destroy_id_resp { + __u32 events_reported; +}; + +struct ib_ucm_attr_id { + __u64 response; + __u32 id; + __u32 reserved; +}; + +struct ib_ucm_attr_id_resp { + __be64 service_id; + __be64 service_mask; + __be32 local_id; + __be32 remote_id; +}; + +struct ib_ucm_init_qp_attr { + __u64 response; + __u32 id; + __u32 qp_state; +}; + +struct ib_ucm_listen { + __be64 service_id; + __be64 service_mask; + __u32 id; + __u32 reserved; +}; + +struct ib_ucm_notify { + __u32 id; + __u32 event; +}; + +struct ib_ucm_private_data { + __u64 data; + __u32 id; + __u8 len; + __u8 reserved[3]; +}; + +struct ib_ucm_req { + __u32 id; + __u32 qpn; + __u32 qp_type; + __u32 psn; + __be64 sid; + __u64 data; + __u64 primary_path; + __u64 alternate_path; + __u8 len; + __u8 peer_to_peer; + __u8 responder_resources; + __u8 initiator_depth; + __u8 remote_cm_response_timeout; + __u8 flow_control; + __u8 local_cm_response_timeout; + __u8 retry_count; + __u8 rnr_retry_count; + __u8 max_cm_retries; + __u8 srq; + __u8 reserved[5]; +}; + +struct ib_ucm_rep { + __u64 uid; + __u64 data; + __u32 id; + __u32 qpn; + __u32 psn; + __u8 len; + __u8 responder_resources; + __u8 initiator_depth; + __u8 target_ack_delay; + __u8 failover_accepted; + __u8 flow_control; + __u8 rnr_retry_count; + __u8 srq; + __u8 reserved[4]; +}; + +struct ib_ucm_info { + __u32 id; + __u32 status; + __u64 info; + __u64 data; + __u8 info_len; + __u8 data_len; + __u8 reserved[6]; +}; + +struct ib_ucm_mra { + __u64 data; + __u32 id; + __u8 len; + __u8 timeout; + __u8 reserved[2]; +}; + +struct ib_ucm_lap { + __u64 path; + __u64 data; + __u32 id; + __u8 len; + __u8 reserved[3]; +}; + +struct ib_ucm_sidr_req { + __u32 id; + __u32 timeout; + __be64 sid; + __u64 data; + __u64 path; + __u16 reserved_pkey; + __u8 len; + __u8 max_cm_retries; + __u8 reserved[4]; +}; + +struct ib_ucm_sidr_rep { + __u32 id; + __u32 qpn; + __u32 qkey; + __u32 status; + __u64 info; + __u64 data; + __u8 info_len; + __u8 data_len; + __u8 reserved[6]; +}; +/* + * event notification ABI structures. + */ +struct ib_ucm_event_get { + __u64 response; + __u64 data; + __u64 info; + __u8 data_len; + __u8 info_len; + __u8 reserved[6]; +}; + +struct ib_ucm_req_event_resp { + struct ib_user_path_rec primary_path; + struct ib_user_path_rec alternate_path; + __be64 remote_ca_guid; + __u32 remote_qkey; + __u32 remote_qpn; + __u32 qp_type; + __u32 starting_psn; + __u8 responder_resources; + __u8 initiator_depth; + __u8 local_cm_response_timeout; + __u8 flow_control; + __u8 remote_cm_response_timeout; + __u8 retry_count; + __u8 rnr_retry_count; + __u8 srq; + __u8 port; + __u8 reserved[7]; +}; + +struct ib_ucm_rep_event_resp { + __be64 remote_ca_guid; + __u32 remote_qkey; + __u32 remote_qpn; + __u32 starting_psn; + __u8 responder_resources; + __u8 initiator_depth; + __u8 target_ack_delay; + __u8 failover_accepted; + __u8 flow_control; + __u8 rnr_retry_count; + __u8 srq; + __u8 reserved[5]; +}; + +struct ib_ucm_rej_event_resp { + __u32 reason; + /* ari in ib_ucm_event_get info field. */ +}; + +struct ib_ucm_mra_event_resp { + __u8 timeout; + __u8 reserved[3]; +}; + +struct ib_ucm_lap_event_resp { + struct ib_user_path_rec path; +}; + +struct ib_ucm_apr_event_resp { + __u32 status; + /* apr info in ib_ucm_event_get info field. */ +}; + +struct ib_ucm_sidr_req_event_resp { + __u16 pkey; + __u8 port; + __u8 reserved; +}; + +struct ib_ucm_sidr_rep_event_resp { + __u32 status; + __u32 qkey; + __u32 qpn; + /* info in ib_ucm_event_get info field. */ +}; + +#define IB_UCM_PRES_DATA 0x01 +#define IB_UCM_PRES_INFO 0x02 +#define IB_UCM_PRES_PRIMARY 0x04 +#define IB_UCM_PRES_ALTERNATE 0x08 + +struct ib_ucm_event_resp { + __u64 uid; + __u32 id; + __u32 event; + __u32 present; + __u32 reserved; + union { + struct ib_ucm_req_event_resp req_resp; + struct ib_ucm_rep_event_resp rep_resp; + struct ib_ucm_rej_event_resp rej_resp; + struct ib_ucm_mra_event_resp mra_resp; + struct ib_ucm_lap_event_resp lap_resp; + struct ib_ucm_apr_event_resp apr_resp; + + struct ib_ucm_sidr_req_event_resp sidr_req_resp; + struct ib_ucm_sidr_rep_event_resp sidr_rep_resp; + + __u32 send_status; + } u; +}; + +#endif /* IB_USER_CM_H */ diff --git a/include/uapi/rdma/ib_user_mad.h b/include/uapi/rdma/ib_user_mad.h new file mode 100644 index 0000000..09f809f --- /dev/null +++ b/include/uapi/rdma/ib_user_mad.h @@ -0,0 +1,245 @@ +/* + * Copyright (c) 2004 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Voltaire, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef IB_USER_MAD_H +#define IB_USER_MAD_H + +#include +#include + +/* + * Increment this value if any changes that break userspace ABI + * compatibility are made. + */ +#define IB_USER_MAD_ABI_VERSION 5 + +/* + * Make sure that all structs defined in this file remain laid out so + * that they pack the same way on 32-bit and 64-bit architectures (to + * avoid incompatibility between 32-bit userspace and 64-bit kernels). + */ + +/** + * ib_user_mad_hdr_old - Old version of MAD packet header without pkey_index + * @id - ID of agent MAD received with/to be sent with + * @status - 0 on successful receive, ETIMEDOUT if no response + * received (transaction ID in data[] will be set to TID of original + * request) (ignored on send) + * @timeout_ms - Milliseconds to wait for response (unset on receive) + * @retries - Number of automatic retries to attempt + * @qpn - Remote QP number received from/to be sent to + * @qkey - Remote Q_Key to be sent with (unset on receive) + * @lid - Remote lid received from/to be sent to + * @sl - Service level received with/to be sent with + * @path_bits - Local path bits received with/to be sent with + * @grh_present - If set, GRH was received/should be sent + * @gid_index - Local GID index to send with (unset on receive) + * @hop_limit - Hop limit in GRH + * @traffic_class - Traffic class in GRH + * @gid - Remote GID in GRH + * @flow_label - Flow label in GRH + */ +struct ib_user_mad_hdr_old { + __u32 id; + __u32 status; + __u32 timeout_ms; + __u32 retries; + __u32 length; + __be32 qpn; + __be32 qkey; + __be16 lid; + __u8 sl; + __u8 path_bits; + __u8 grh_present; + __u8 gid_index; + __u8 hop_limit; + __u8 traffic_class; + __u8 gid[16]; + __be32 flow_label; +}; + +/** + * ib_user_mad_hdr - MAD packet header + * This layout allows specifying/receiving the P_Key index. To use + * this capability, an application must call the + * IB_USER_MAD_ENABLE_PKEY ioctl on the user MAD file handle before + * any other actions with the file handle. + * @id - ID of agent MAD received with/to be sent with + * @status - 0 on successful receive, ETIMEDOUT if no response + * received (transaction ID in data[] will be set to TID of original + * request) (ignored on send) + * @timeout_ms - Milliseconds to wait for response (unset on receive) + * @retries - Number of automatic retries to attempt + * @qpn - Remote QP number received from/to be sent to + * @qkey - Remote Q_Key to be sent with (unset on receive) + * @lid - Remote lid received from/to be sent to + * @sl - Service level received with/to be sent with + * @path_bits - Local path bits received with/to be sent with + * @grh_present - If set, GRH was received/should be sent + * @gid_index - Local GID index to send with (unset on receive) + * @hop_limit - Hop limit in GRH + * @traffic_class - Traffic class in GRH + * @gid - Remote GID in GRH + * @flow_label - Flow label in GRH + * @pkey_index - P_Key index + */ +struct ib_user_mad_hdr { + __u32 id; + __u32 status; + __u32 timeout_ms; + __u32 retries; + __u32 length; + __be32 qpn; + __be32 qkey; + __be16 lid; + __u8 sl; + __u8 path_bits; + __u8 grh_present; + __u8 gid_index; + __u8 hop_limit; + __u8 traffic_class; + __u8 gid[16]; + __be32 flow_label; + __u16 pkey_index; + __u8 reserved[6]; +}; + +/** + * ib_user_mad - MAD packet + * @hdr - MAD packet header + * @data - Contents of MAD + * + */ +struct ib_user_mad { + struct ib_user_mad_hdr hdr; + __u64 data[0]; +}; + +/* + * Earlier versions of this interface definition declared the + * method_mask[] member as an array of __u32 but treated it as a + * bitmap made up of longs in the kernel. This ambiguity meant that + * 32-bit big-endian applications that can run on both 32-bit and + * 64-bit kernels had no consistent ABI to rely on, and 64-bit + * big-endian applications that treated method_mask as being made up + * of 32-bit words would have their bitmap misinterpreted. + * + * To clear up this confusion, we change the declaration of + * method_mask[] to use unsigned long and handle the conversion from + * 32-bit userspace to 64-bit kernel for big-endian systems in the + * compat_ioctl method. Unfortunately, to keep the structure layout + * the same, we need the method_mask[] array to be aligned only to 4 + * bytes even when long is 64 bits, which forces us into this ugly + * typedef. + */ +typedef unsigned long __attribute__((aligned(4))) packed_ulong; +#define IB_USER_MAD_LONGS_PER_METHOD_MASK (128 / (8 * sizeof (long))) + +/** + * ib_user_mad_reg_req - MAD registration request + * @id - Set by the kernel; used to identify agent in future requests. + * @qpn - Queue pair number; must be 0 or 1. + * @method_mask - The caller will receive unsolicited MADs for any method + * where @method_mask = 1. + * @mgmt_class - Indicates which management class of MADs should be receive + * by the caller. This field is only required if the user wishes to + * receive unsolicited MADs, otherwise it should be 0. + * @mgmt_class_version - Indicates which version of MADs for the given + * management class to receive. + * @oui: Indicates IEEE OUI when mgmt_class is a vendor class + * in the range from 0x30 to 0x4f. Otherwise not used. + * @rmpp_version: If set, indicates the RMPP version used. + * + */ +struct ib_user_mad_reg_req { + __u32 id; + packed_ulong method_mask[IB_USER_MAD_LONGS_PER_METHOD_MASK]; + __u8 qpn; + __u8 mgmt_class; + __u8 mgmt_class_version; + __u8 oui[3]; + __u8 rmpp_version; +}; + +/** + * ib_user_mad_reg_req2 - MAD registration request + * + * @id - Set by the _kernel_; used by userspace to identify the + * registered agent in future requests. + * @qpn - Queue pair number; must be 0 or 1. + * @mgmt_class - Indicates which management class of MADs should be + * receive by the caller. This field is only required if + * the user wishes to receive unsolicited MADs, otherwise + * it should be 0. + * @mgmt_class_version - Indicates which version of MADs for the given + * management class to receive. + * @res - Ignored. + * @flags - additional registration flags; Must be in the set of + * flags defined in IB_USER_MAD_REG_FLAGS_CAP + * @method_mask - The caller wishes to receive unsolicited MADs for the + * methods whose bit(s) is(are) set. + * @oui - Indicates IEEE OUI to use when mgmt_class is a vendor + * class in the range from 0x30 to 0x4f. Otherwise not + * used. + * @rmpp_version - If set, indicates the RMPP version to use. + */ +enum { + IB_USER_MAD_USER_RMPP = (1 << 0), +}; +#define IB_USER_MAD_REG_FLAGS_CAP (IB_USER_MAD_USER_RMPP) +struct ib_user_mad_reg_req2 { + __u32 id; + __u32 qpn; + __u8 mgmt_class; + __u8 mgmt_class_version; + __u16 res; + __u32 flags; + __u64 method_mask[2]; + __u32 oui; + __u8 rmpp_version; + __u8 reserved[3]; +}; + +#define IB_IOCTL_MAGIC 0x1b + +#define IB_USER_MAD_REGISTER_AGENT _IOWR(IB_IOCTL_MAGIC, 1, \ + struct ib_user_mad_reg_req) + +#define IB_USER_MAD_UNREGISTER_AGENT _IOW(IB_IOCTL_MAGIC, 2, __u32) + +#define IB_USER_MAD_ENABLE_PKEY _IO(IB_IOCTL_MAGIC, 3) + +#define IB_USER_MAD_REGISTER_AGENT2 _IOWR(IB_IOCTL_MAGIC, 4, \ + struct ib_user_mad_reg_req2) + +#endif /* IB_USER_MAD_H */ diff --git a/include/uapi/rdma/ib_user_sa.h b/include/uapi/rdma/ib_user_sa.h new file mode 100644 index 0000000..cfc7c9b --- /dev/null +++ b/include/uapi/rdma/ib_user_sa.h @@ -0,0 +1,76 @@ +/* + * Copyright (c) 2005 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef IB_USER_SA_H +#define IB_USER_SA_H + +#include + +enum { + IB_PATH_GMP = 1, + IB_PATH_PRIMARY = (1<<1), + IB_PATH_ALTERNATE = (1<<2), + IB_PATH_OUTBOUND = (1<<3), + IB_PATH_INBOUND = (1<<4), + IB_PATH_INBOUND_REVERSE = (1<<5), + IB_PATH_BIDIRECTIONAL = IB_PATH_OUTBOUND | IB_PATH_INBOUND_REVERSE +}; + +struct ib_path_rec_data { + __u32 flags; + __u32 reserved; + __u32 path_rec[16]; +}; + +struct ib_user_path_rec { + __u8 dgid[16]; + __u8 sgid[16]; + __be16 dlid; + __be16 slid; + __u32 raw_traffic; + __be32 flow_label; + __u32 reversible; + __u32 mtu; + __be16 pkey; + __u8 hop_limit; + __u8 traffic_class; + __u8 numb_path; + __u8 sl; + __u8 mtu_selector; + __u8 rate_selector; + __u8 rate; + __u8 packet_life_time_selector; + __u8 packet_life_time; + __u8 preference; +}; + +#endif /* IB_USER_SA_H */ diff --git a/include/uapi/rdma/ib_user_verbs.h b/include/uapi/rdma/ib_user_verbs.h new file mode 100644 index 0000000..26daf55 --- /dev/null +++ b/include/uapi/rdma/ib_user_verbs.h @@ -0,0 +1,880 @@ +/* + * Copyright (c) 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved. + * Copyright (c) 2005 PathScale, Inc. All rights reserved. + * Copyright (c) 2006 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef IB_USER_VERBS_H +#define IB_USER_VERBS_H + +#include + +/* + * Increment this value if any changes that break userspace ABI + * compatibility are made. + */ +#define IB_USER_VERBS_ABI_VERSION 6 +#define IB_USER_VERBS_CMD_THRESHOLD 50 + +enum { + IB_USER_VERBS_CMD_GET_CONTEXT, + IB_USER_VERBS_CMD_QUERY_DEVICE, + IB_USER_VERBS_CMD_QUERY_PORT, + IB_USER_VERBS_CMD_ALLOC_PD, + IB_USER_VERBS_CMD_DEALLOC_PD, + IB_USER_VERBS_CMD_CREATE_AH, + IB_USER_VERBS_CMD_MODIFY_AH, + IB_USER_VERBS_CMD_QUERY_AH, + IB_USER_VERBS_CMD_DESTROY_AH, + IB_USER_VERBS_CMD_REG_MR, + IB_USER_VERBS_CMD_REG_SMR, + IB_USER_VERBS_CMD_REREG_MR, + IB_USER_VERBS_CMD_QUERY_MR, + IB_USER_VERBS_CMD_DEREG_MR, + IB_USER_VERBS_CMD_ALLOC_MW, + IB_USER_VERBS_CMD_BIND_MW, + IB_USER_VERBS_CMD_DEALLOC_MW, + IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL, + IB_USER_VERBS_CMD_CREATE_CQ, + IB_USER_VERBS_CMD_RESIZE_CQ, + IB_USER_VERBS_CMD_DESTROY_CQ, + IB_USER_VERBS_CMD_POLL_CQ, + IB_USER_VERBS_CMD_PEEK_CQ, + IB_USER_VERBS_CMD_REQ_NOTIFY_CQ, + IB_USER_VERBS_CMD_CREATE_QP, + IB_USER_VERBS_CMD_QUERY_QP, + IB_USER_VERBS_CMD_MODIFY_QP, + IB_USER_VERBS_CMD_DESTROY_QP, + IB_USER_VERBS_CMD_POST_SEND, + IB_USER_VERBS_CMD_POST_RECV, + IB_USER_VERBS_CMD_ATTACH_MCAST, + IB_USER_VERBS_CMD_DETACH_MCAST, + IB_USER_VERBS_CMD_CREATE_SRQ, + IB_USER_VERBS_CMD_MODIFY_SRQ, + IB_USER_VERBS_CMD_QUERY_SRQ, + IB_USER_VERBS_CMD_DESTROY_SRQ, + IB_USER_VERBS_CMD_POST_SRQ_RECV, + IB_USER_VERBS_CMD_OPEN_XRCD, + IB_USER_VERBS_CMD_CLOSE_XRCD, + IB_USER_VERBS_CMD_CREATE_XSRQ, + IB_USER_VERBS_CMD_OPEN_QP, +}; + +enum { + IB_USER_VERBS_EX_CMD_CREATE_FLOW = IB_USER_VERBS_CMD_THRESHOLD, + IB_USER_VERBS_EX_CMD_DESTROY_FLOW +}; + +/* + * Make sure that all structs defined in this file remain laid out so + * that they pack the same way on 32-bit and 64-bit architectures (to + * avoid incompatibility between 32-bit userspace and 64-bit kernels). + * Specifically: + * - Do not use pointer types -- pass pointers in __u64 instead. + * - Make sure that any structure larger than 4 bytes is padded to a + * multiple of 8 bytes. Otherwise the structure size will be + * different between 32-bit and 64-bit architectures. + */ + +struct ib_uverbs_async_event_desc { + __u64 element; + __u32 event_type; /* enum ib_event_type */ + __u32 reserved; +}; + +struct ib_uverbs_comp_event_desc { + __u64 cq_handle; +}; + +/* + * All commands from userspace should start with a __u32 command field + * followed by __u16 in_words and out_words fields (which give the + * length of the command block and response buffer if any in 32-bit + * words). The kernel driver will read these fields first and read + * the rest of the command struct based on these value. + */ + +#define IB_USER_VERBS_CMD_COMMAND_MASK 0xff +#define IB_USER_VERBS_CMD_FLAGS_MASK 0xff000000u +#define IB_USER_VERBS_CMD_FLAGS_SHIFT 24 + +#define IB_USER_VERBS_CMD_FLAG_EXTENDED 0x80 + +struct ib_uverbs_cmd_hdr { + __u32 command; + __u16 in_words; + __u16 out_words; +}; + +struct ib_uverbs_ex_cmd_hdr { + __u64 response; + __u16 provider_in_words; + __u16 provider_out_words; + __u32 cmd_hdr_reserved; +}; + +struct ib_uverbs_get_context { + __u64 response; + __u64 driver_data[0]; +}; + +struct ib_uverbs_get_context_resp { + __u32 async_fd; + __u32 num_comp_vectors; +}; + +struct ib_uverbs_query_device { + __u64 response; + __u64 driver_data[0]; +}; + +struct ib_uverbs_query_device_resp { + __u64 fw_ver; + __be64 node_guid; + __be64 sys_image_guid; + __u64 max_mr_size; + __u64 page_size_cap; + __u32 vendor_id; + __u32 vendor_part_id; + __u32 hw_ver; + __u32 max_qp; + __u32 max_qp_wr; + __u32 device_cap_flags; + __u32 max_sge; + __u32 max_sge_rd; + __u32 max_cq; + __u32 max_cqe; + __u32 max_mr; + __u32 max_pd; + __u32 max_qp_rd_atom; + __u32 max_ee_rd_atom; + __u32 max_res_rd_atom; + __u32 max_qp_init_rd_atom; + __u32 max_ee_init_rd_atom; + __u32 atomic_cap; + __u32 max_ee; + __u32 max_rdd; + __u32 max_mw; + __u32 max_raw_ipv6_qp; + __u32 max_raw_ethy_qp; + __u32 max_mcast_grp; + __u32 max_mcast_qp_attach; + __u32 max_total_mcast_qp_attach; + __u32 max_ah; + __u32 max_fmr; + __u32 max_map_per_fmr; + __u32 max_srq; + __u32 max_srq_wr; + __u32 max_srq_sge; + __u16 max_pkeys; + __u8 local_ca_ack_delay; + __u8 phys_port_cnt; + __u8 reserved[4]; +}; + +struct ib_uverbs_query_port { + __u64 response; + __u8 port_num; + __u8 reserved[7]; + __u64 driver_data[0]; +}; + +struct ib_uverbs_query_port_resp { + __u32 port_cap_flags; + __u32 max_msg_sz; + __u32 bad_pkey_cntr; + __u32 qkey_viol_cntr; + __u32 gid_tbl_len; + __u16 pkey_tbl_len; + __u16 lid; + __u16 sm_lid; + __u8 state; + __u8 max_mtu; + __u8 active_mtu; + __u8 lmc; + __u8 max_vl_num; + __u8 sm_sl; + __u8 subnet_timeout; + __u8 init_type_reply; + __u8 active_width; + __u8 active_speed; + __u8 phys_state; + __u8 link_layer; + __u8 reserved[2]; +}; + +struct ib_uverbs_alloc_pd { + __u64 response; + __u64 driver_data[0]; +}; + +struct ib_uverbs_alloc_pd_resp { + __u32 pd_handle; +}; + +struct ib_uverbs_dealloc_pd { + __u32 pd_handle; +}; + +struct ib_uverbs_open_xrcd { + __u64 response; + __u32 fd; + __u32 oflags; + __u64 driver_data[0]; +}; + +struct ib_uverbs_open_xrcd_resp { + __u32 xrcd_handle; +}; + +struct ib_uverbs_close_xrcd { + __u32 xrcd_handle; +}; + +struct ib_uverbs_reg_mr { + __u64 response; + __u64 start; + __u64 length; + __u64 hca_va; + __u32 pd_handle; + __u32 access_flags; + __u64 driver_data[0]; +}; + +struct ib_uverbs_reg_mr_resp { + __u32 mr_handle; + __u32 lkey; + __u32 rkey; +}; + +struct ib_uverbs_rereg_mr { + __u64 response; + __u32 mr_handle; + __u32 flags; + __u64 start; + __u64 length; + __u64 hca_va; + __u32 pd_handle; + __u32 access_flags; +}; + +struct ib_uverbs_rereg_mr_resp { + __u32 lkey; + __u32 rkey; +}; + +struct ib_uverbs_dereg_mr { + __u32 mr_handle; +}; + +struct ib_uverbs_alloc_mw { + __u64 response; + __u32 pd_handle; + __u8 mw_type; + __u8 reserved[3]; +}; + +struct ib_uverbs_alloc_mw_resp { + __u32 mw_handle; + __u32 rkey; +}; + +struct ib_uverbs_dealloc_mw { + __u32 mw_handle; +}; + +struct ib_uverbs_create_comp_channel { + __u64 response; +}; + +struct ib_uverbs_create_comp_channel_resp { + __u32 fd; +}; + +struct ib_uverbs_create_cq { + __u64 response; + __u64 user_handle; + __u32 cqe; + __u32 comp_vector; + __s32 comp_channel; + __u32 reserved; + __u64 driver_data[0]; +}; + +struct ib_uverbs_create_cq_resp { + __u32 cq_handle; + __u32 cqe; +}; + +struct ib_uverbs_resize_cq { + __u64 response; + __u32 cq_handle; + __u32 cqe; + __u64 driver_data[0]; +}; + +struct ib_uverbs_resize_cq_resp { + __u32 cqe; + __u32 reserved; + __u64 driver_data[0]; +}; + +struct ib_uverbs_poll_cq { + __u64 response; + __u32 cq_handle; + __u32 ne; +}; + +struct ib_uverbs_wc { + __u64 wr_id; + __u32 status; + __u32 opcode; + __u32 vendor_err; + __u32 byte_len; + union { + __u32 imm_data; + __u32 invalidate_rkey; + } ex; + __u32 qp_num; + __u32 src_qp; + __u32 wc_flags; + __u16 pkey_index; + __u16 slid; + __u8 sl; + __u8 dlid_path_bits; + __u8 port_num; + __u8 reserved; +}; + +struct ib_uverbs_poll_cq_resp { + __u32 count; + __u32 reserved; + struct ib_uverbs_wc wc[0]; +}; + +struct ib_uverbs_req_notify_cq { + __u32 cq_handle; + __u32 solicited_only; +}; + +struct ib_uverbs_destroy_cq { + __u64 response; + __u32 cq_handle; + __u32 reserved; +}; + +struct ib_uverbs_destroy_cq_resp { + __u32 comp_events_reported; + __u32 async_events_reported; +}; + +struct ib_uverbs_global_route { + __u8 dgid[16]; + __u32 flow_label; + __u8 sgid_index; + __u8 hop_limit; + __u8 traffic_class; + __u8 reserved; +}; + +struct ib_uverbs_ah_attr { + struct ib_uverbs_global_route grh; + __u16 dlid; + __u8 sl; + __u8 src_path_bits; + __u8 static_rate; + __u8 is_global; + __u8 port_num; + __u8 reserved; +}; + +struct ib_uverbs_qp_attr { + __u32 qp_attr_mask; + __u32 qp_state; + __u32 cur_qp_state; + __u32 path_mtu; + __u32 path_mig_state; + __u32 qkey; + __u32 rq_psn; + __u32 sq_psn; + __u32 dest_qp_num; + __u32 qp_access_flags; + + struct ib_uverbs_ah_attr ah_attr; + struct ib_uverbs_ah_attr alt_ah_attr; + + /* ib_qp_cap */ + __u32 max_send_wr; + __u32 max_recv_wr; + __u32 max_send_sge; + __u32 max_recv_sge; + __u32 max_inline_data; + + __u16 pkey_index; + __u16 alt_pkey_index; + __u8 en_sqd_async_notify; + __u8 sq_draining; + __u8 max_rd_atomic; + __u8 max_dest_rd_atomic; + __u8 min_rnr_timer; + __u8 port_num; + __u8 timeout; + __u8 retry_cnt; + __u8 rnr_retry; + __u8 alt_port_num; + __u8 alt_timeout; + __u8 reserved[5]; +}; + +struct ib_uverbs_create_qp { + __u64 response; + __u64 user_handle; + __u32 pd_handle; + __u32 send_cq_handle; + __u32 recv_cq_handle; + __u32 srq_handle; + __u32 max_send_wr; + __u32 max_recv_wr; + __u32 max_send_sge; + __u32 max_recv_sge; + __u32 max_inline_data; + __u8 sq_sig_all; + __u8 qp_type; + __u8 is_srq; + __u8 reserved; + __u64 driver_data[0]; +}; + +struct ib_uverbs_open_qp { + __u64 response; + __u64 user_handle; + __u32 pd_handle; + __u32 qpn; + __u8 qp_type; + __u8 reserved[7]; + __u64 driver_data[0]; +}; + +/* also used for open response */ +struct ib_uverbs_create_qp_resp { + __u32 qp_handle; + __u32 qpn; + __u32 max_send_wr; + __u32 max_recv_wr; + __u32 max_send_sge; + __u32 max_recv_sge; + __u32 max_inline_data; + __u32 reserved; +}; + +/* + * This struct needs to remain a multiple of 8 bytes to keep the + * alignment of the modify QP parameters. + */ +struct ib_uverbs_qp_dest { + __u8 dgid[16]; + __u32 flow_label; + __u16 dlid; + __u16 reserved; + __u8 sgid_index; + __u8 hop_limit; + __u8 traffic_class; + __u8 sl; + __u8 src_path_bits; + __u8 static_rate; + __u8 is_global; + __u8 port_num; +}; + +struct ib_uverbs_query_qp { + __u64 response; + __u32 qp_handle; + __u32 attr_mask; + __u64 driver_data[0]; +}; + +struct ib_uverbs_query_qp_resp { + struct ib_uverbs_qp_dest dest; + struct ib_uverbs_qp_dest alt_dest; + __u32 max_send_wr; + __u32 max_recv_wr; + __u32 max_send_sge; + __u32 max_recv_sge; + __u32 max_inline_data; + __u32 qkey; + __u32 rq_psn; + __u32 sq_psn; + __u32 dest_qp_num; + __u32 qp_access_flags; + __u16 pkey_index; + __u16 alt_pkey_index; + __u8 qp_state; + __u8 cur_qp_state; + __u8 path_mtu; + __u8 path_mig_state; + __u8 sq_draining; + __u8 max_rd_atomic; + __u8 max_dest_rd_atomic; + __u8 min_rnr_timer; + __u8 port_num; + __u8 timeout; + __u8 retry_cnt; + __u8 rnr_retry; + __u8 alt_port_num; + __u8 alt_timeout; + __u8 sq_sig_all; + __u8 reserved[5]; + __u64 driver_data[0]; +}; + +struct ib_uverbs_modify_qp { + struct ib_uverbs_qp_dest dest; + struct ib_uverbs_qp_dest alt_dest; + __u32 qp_handle; + __u32 attr_mask; + __u32 qkey; + __u32 rq_psn; + __u32 sq_psn; + __u32 dest_qp_num; + __u32 qp_access_flags; + __u16 pkey_index; + __u16 alt_pkey_index; + __u8 qp_state; + __u8 cur_qp_state; + __u8 path_mtu; + __u8 path_mig_state; + __u8 en_sqd_async_notify; + __u8 max_rd_atomic; + __u8 max_dest_rd_atomic; + __u8 min_rnr_timer; + __u8 port_num; + __u8 timeout; + __u8 retry_cnt; + __u8 rnr_retry; + __u8 alt_port_num; + __u8 alt_timeout; + __u8 reserved[2]; + __u64 driver_data[0]; +}; + +struct ib_uverbs_modify_qp_resp { +}; + +struct ib_uverbs_destroy_qp { + __u64 response; + __u32 qp_handle; + __u32 reserved; +}; + +struct ib_uverbs_destroy_qp_resp { + __u32 events_reported; +}; + +/* + * The ib_uverbs_sge structure isn't used anywhere, since we assume + * the ib_sge structure is packed the same way on 32-bit and 64-bit + * architectures in both kernel and user space. It's just here to + * document the ABI. + */ +struct ib_uverbs_sge { + __u64 addr; + __u32 length; + __u32 lkey; +}; + +struct ib_uverbs_send_wr { + __u64 wr_id; + __u32 num_sge; + __u32 opcode; + __u32 send_flags; + union { + __u32 imm_data; + __u32 invalidate_rkey; + } ex; + union { + struct { + __u64 remote_addr; + __u32 rkey; + __u32 reserved; + } rdma; + struct { + __u64 remote_addr; + __u64 compare_add; + __u64 swap; + __u32 rkey; + __u32 reserved; + } atomic; + struct { + __u32 ah; + __u32 remote_qpn; + __u32 remote_qkey; + __u32 reserved; + } ud; + } wr; +}; + +struct ib_uverbs_post_send { + __u64 response; + __u32 qp_handle; + __u32 wr_count; + __u32 sge_count; + __u32 wqe_size; + struct ib_uverbs_send_wr send_wr[0]; +}; + +struct ib_uverbs_post_send_resp { + __u32 bad_wr; +}; + +struct ib_uverbs_recv_wr { + __u64 wr_id; + __u32 num_sge; + __u32 reserved; +}; + +struct ib_uverbs_post_recv { + __u64 response; + __u32 qp_handle; + __u32 wr_count; + __u32 sge_count; + __u32 wqe_size; + struct ib_uverbs_recv_wr recv_wr[0]; +}; + +struct ib_uverbs_post_recv_resp { + __u32 bad_wr; +}; + +struct ib_uverbs_post_srq_recv { + __u64 response; + __u32 srq_handle; + __u32 wr_count; + __u32 sge_count; + __u32 wqe_size; + struct ib_uverbs_recv_wr recv[0]; +}; + +struct ib_uverbs_post_srq_recv_resp { + __u32 bad_wr; +}; + +struct ib_uverbs_create_ah { + __u64 response; + __u64 user_handle; + __u32 pd_handle; + __u32 reserved; + struct ib_uverbs_ah_attr attr; +}; + +struct ib_uverbs_create_ah_resp { + __u32 ah_handle; +}; + +struct ib_uverbs_destroy_ah { + __u32 ah_handle; +}; + +struct ib_uverbs_attach_mcast { + __u8 gid[16]; + __u32 qp_handle; + __u16 mlid; + __u16 reserved; + __u64 driver_data[0]; +}; + +struct ib_uverbs_detach_mcast { + __u8 gid[16]; + __u32 qp_handle; + __u16 mlid; + __u16 reserved; + __u64 driver_data[0]; +}; + +struct ib_uverbs_flow_spec_hdr { + __u32 type; + __u16 size; + __u16 reserved; + /* followed by flow_spec */ + __u64 flow_spec_data[0]; +}; + +struct ib_uverbs_flow_eth_filter { + __u8 dst_mac[6]; + __u8 src_mac[6]; + __be16 ether_type; + __be16 vlan_tag; +}; + +struct ib_uverbs_flow_spec_eth { + union { + struct ib_uverbs_flow_spec_hdr hdr; + struct { + __u32 type; + __u16 size; + __u16 reserved; + }; + }; + struct ib_uverbs_flow_eth_filter val; + struct ib_uverbs_flow_eth_filter mask; +}; + +struct ib_uverbs_flow_ipv4_filter { + __be32 src_ip; + __be32 dst_ip; +}; + +struct ib_uverbs_flow_spec_ipv4 { + union { + struct ib_uverbs_flow_spec_hdr hdr; + struct { + __u32 type; + __u16 size; + __u16 reserved; + }; + }; + struct ib_uverbs_flow_ipv4_filter val; + struct ib_uverbs_flow_ipv4_filter mask; +}; + +struct ib_uverbs_flow_tcp_udp_filter { + __be16 dst_port; + __be16 src_port; +}; + +struct ib_uverbs_flow_spec_tcp_udp { + union { + struct ib_uverbs_flow_spec_hdr hdr; + struct { + __u32 type; + __u16 size; + __u16 reserved; + }; + }; + struct ib_uverbs_flow_tcp_udp_filter val; + struct ib_uverbs_flow_tcp_udp_filter mask; +}; + +struct ib_uverbs_flow_attr { + __u32 type; + __u16 size; + __u16 priority; + __u8 num_of_specs; + __u8 reserved[2]; + __u8 port; + __u32 flags; + /* Following are the optional layers according to user request + * struct ib_flow_spec_xxx + * struct ib_flow_spec_yyy + */ + struct ib_uverbs_flow_spec_hdr flow_specs[0]; +}; + +struct ib_uverbs_create_flow { + __u32 comp_mask; + __u32 qp_handle; + struct ib_uverbs_flow_attr flow_attr; +}; + +struct ib_uverbs_create_flow_resp { + __u32 comp_mask; + __u32 flow_handle; +}; + +struct ib_uverbs_destroy_flow { + __u32 comp_mask; + __u32 flow_handle; +}; + +struct ib_uverbs_create_srq { + __u64 response; + __u64 user_handle; + __u32 pd_handle; + __u32 max_wr; + __u32 max_sge; + __u32 srq_limit; + __u64 driver_data[0]; +}; + +struct ib_uverbs_create_xsrq { + __u64 response; + __u64 user_handle; + __u32 srq_type; + __u32 pd_handle; + __u32 max_wr; + __u32 max_sge; + __u32 srq_limit; + __u32 reserved; + __u32 xrcd_handle; + __u32 cq_handle; + __u64 driver_data[0]; +}; + +struct ib_uverbs_create_srq_resp { + __u32 srq_handle; + __u32 max_wr; + __u32 max_sge; + __u32 srqn; +}; + +struct ib_uverbs_modify_srq { + __u32 srq_handle; + __u32 attr_mask; + __u32 max_wr; + __u32 srq_limit; + __u64 driver_data[0]; +}; + +struct ib_uverbs_query_srq { + __u64 response; + __u32 srq_handle; + __u32 reserved; + __u64 driver_data[0]; +}; + +struct ib_uverbs_query_srq_resp { + __u32 max_wr; + __u32 max_sge; + __u32 srq_limit; + __u32 reserved; +}; + +struct ib_uverbs_destroy_srq { + __u64 response; + __u32 srq_handle; + __u32 reserved; +}; + +struct ib_uverbs_destroy_srq_resp { + __u32 events_reported; +}; + +#endif /* IB_USER_VERBS_H */ diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h new file mode 100644 index 0000000..de69170 --- /dev/null +++ b/include/uapi/rdma/rdma_netlink.h @@ -0,0 +1,131 @@ +#ifndef _UAPI_RDMA_NETLINK_H +#define _UAPI_RDMA_NETLINK_H + +#include + +enum { + RDMA_NL_RDMA_CM = 1, + RDMA_NL_NES, + RDMA_NL_C4IW, + RDMA_NL_NUM_CLIENTS +}; + +enum { + RDMA_NL_GROUP_CM = 1, + RDMA_NL_GROUP_IWPM, + RDMA_NL_NUM_GROUPS +}; + +#define RDMA_NL_GET_CLIENT(type) ((type & (((1 << 6) - 1) << 10)) >> 10) +#define RDMA_NL_GET_OP(type) (type & ((1 << 10) - 1)) +#define RDMA_NL_GET_TYPE(client, op) ((client << 10) + op) + +enum { + RDMA_NL_RDMA_CM_ID_STATS = 0, + RDMA_NL_RDMA_CM_NUM_OPS +}; + +enum { + RDMA_NL_RDMA_CM_ATTR_SRC_ADDR = 1, + RDMA_NL_RDMA_CM_ATTR_DST_ADDR, + RDMA_NL_RDMA_CM_NUM_ATTR, +}; + +/* iwarp port mapper op-codes */ +enum { + RDMA_NL_IWPM_REG_PID = 0, + RDMA_NL_IWPM_ADD_MAPPING, + RDMA_NL_IWPM_QUERY_MAPPING, + RDMA_NL_IWPM_REMOVE_MAPPING, + RDMA_NL_IWPM_HANDLE_ERR, + RDMA_NL_IWPM_MAPINFO, + RDMA_NL_IWPM_MAPINFO_NUM, + RDMA_NL_IWPM_NUM_OPS +}; + +struct rdma_cm_id_stats { + __u32 qp_num; + __u32 bound_dev_if; + __u32 port_space; + __s32 pid; + __u8 cm_state; + __u8 node_type; + __u8 port_num; + __u8 qp_type; +}; + +enum { + IWPM_NLA_REG_PID_UNSPEC = 0, + IWPM_NLA_REG_PID_SEQ, + IWPM_NLA_REG_IF_NAME, + IWPM_NLA_REG_IBDEV_NAME, + IWPM_NLA_REG_ULIB_NAME, + IWPM_NLA_REG_PID_MAX +}; + +enum { + IWPM_NLA_RREG_PID_UNSPEC = 0, + IWPM_NLA_RREG_PID_SEQ, + IWPM_NLA_RREG_IBDEV_NAME, + IWPM_NLA_RREG_ULIB_NAME, + IWPM_NLA_RREG_ULIB_VER, + IWPM_NLA_RREG_PID_ERR, + IWPM_NLA_RREG_PID_MAX + +}; + +enum { + IWPM_NLA_MANAGE_MAPPING_UNSPEC = 0, + IWPM_NLA_MANAGE_MAPPING_SEQ, + IWPM_NLA_MANAGE_ADDR, + IWPM_NLA_MANAGE_MAPPED_LOC_ADDR, + IWPM_NLA_RMANAGE_MAPPING_ERR, + IWPM_NLA_RMANAGE_MAPPING_MAX +}; + +#define IWPM_NLA_MANAGE_MAPPING_MAX 3 +#define IWPM_NLA_QUERY_MAPPING_MAX 4 +#define IWPM_NLA_MAPINFO_SEND_MAX 3 + +enum { + IWPM_NLA_QUERY_MAPPING_UNSPEC = 0, + IWPM_NLA_QUERY_MAPPING_SEQ, + IWPM_NLA_QUERY_LOCAL_ADDR, + IWPM_NLA_QUERY_REMOTE_ADDR, + IWPM_NLA_RQUERY_MAPPED_LOC_ADDR, + IWPM_NLA_RQUERY_MAPPED_REM_ADDR, + IWPM_NLA_RQUERY_MAPPING_ERR, + IWPM_NLA_RQUERY_MAPPING_MAX +}; + +enum { + IWPM_NLA_MAPINFO_REQ_UNSPEC = 0, + IWPM_NLA_MAPINFO_ULIB_NAME, + IWPM_NLA_MAPINFO_ULIB_VER, + IWPM_NLA_MAPINFO_REQ_MAX +}; + +enum { + IWPM_NLA_MAPINFO_UNSPEC = 0, + IWPM_NLA_MAPINFO_LOCAL_ADDR, + IWPM_NLA_MAPINFO_MAPPED_ADDR, + IWPM_NLA_MAPINFO_MAX +}; + +enum { + IWPM_NLA_MAPINFO_NUM_UNSPEC = 0, + IWPM_NLA_MAPINFO_SEQ, + IWPM_NLA_MAPINFO_SEND_NUM, + IWPM_NLA_MAPINFO_ACK_NUM, + IWPM_NLA_MAPINFO_NUM_MAX +}; + +enum { + IWPM_NLA_ERR_UNSPEC = 0, + IWPM_NLA_ERR_SEQ, + IWPM_NLA_ERR_CODE, + IWPM_NLA_ERR_MAX +}; + + +#endif /* _UAPI_RDMA_NETLINK_H */ diff --git a/include/uapi/rdma/rdma_user_cm.h b/include/uapi/rdma/rdma_user_cm.h new file mode 100644 index 0000000..3066718 --- /dev/null +++ b/include/uapi/rdma/rdma_user_cm.h @@ -0,0 +1,303 @@ +/* + * Copyright (c) 2005-2006 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef RDMA_USER_CM_H +#define RDMA_USER_CM_H + +#include +#include +#include +#include +#include + +#define RDMA_USER_CM_ABI_VERSION 4 + +#define RDMA_MAX_PRIVATE_DATA 256 + +enum { + RDMA_USER_CM_CMD_CREATE_ID, + RDMA_USER_CM_CMD_DESTROY_ID, + RDMA_USER_CM_CMD_BIND_IP, + RDMA_USER_CM_CMD_RESOLVE_IP, + RDMA_USER_CM_CMD_RESOLVE_ROUTE, + RDMA_USER_CM_CMD_QUERY_ROUTE, + RDMA_USER_CM_CMD_CONNECT, + RDMA_USER_CM_CMD_LISTEN, + RDMA_USER_CM_CMD_ACCEPT, + RDMA_USER_CM_CMD_REJECT, + RDMA_USER_CM_CMD_DISCONNECT, + RDMA_USER_CM_CMD_INIT_QP_ATTR, + RDMA_USER_CM_CMD_GET_EVENT, + RDMA_USER_CM_CMD_GET_OPTION, + RDMA_USER_CM_CMD_SET_OPTION, + RDMA_USER_CM_CMD_NOTIFY, + RDMA_USER_CM_CMD_JOIN_IP_MCAST, + RDMA_USER_CM_CMD_LEAVE_MCAST, + RDMA_USER_CM_CMD_MIGRATE_ID, + RDMA_USER_CM_CMD_QUERY, + RDMA_USER_CM_CMD_BIND, + RDMA_USER_CM_CMD_RESOLVE_ADDR, + RDMA_USER_CM_CMD_JOIN_MCAST +}; + +/* + * command ABI structures. + */ +struct rdma_ucm_cmd_hdr { + __u32 cmd; + __u16 in; + __u16 out; +}; + +struct rdma_ucm_create_id { + __u64 uid; + __u64 response; + __u16 ps; + __u8 qp_type; + __u8 reserved[5]; +}; + +struct rdma_ucm_create_id_resp { + __u32 id; +}; + +struct rdma_ucm_destroy_id { + __u64 response; + __u32 id; + __u32 reserved; +}; + +struct rdma_ucm_destroy_id_resp { + __u32 events_reported; +}; + +struct rdma_ucm_bind_ip { + __u64 response; + struct sockaddr_in6 addr; + __u32 id; +}; + +struct rdma_ucm_bind { + __u32 id; + __u16 addr_size; + __u16 reserved; + struct sockaddr_storage addr; +}; + +struct rdma_ucm_resolve_ip { + struct sockaddr_in6 src_addr; + struct sockaddr_in6 dst_addr; + __u32 id; + __u32 timeout_ms; +}; + +struct rdma_ucm_resolve_addr { + __u32 id; + __u32 timeout_ms; + __u16 src_size; + __u16 dst_size; + __u32 reserved; + struct sockaddr_storage src_addr; + struct sockaddr_storage dst_addr; +}; + +struct rdma_ucm_resolve_route { + __u32 id; + __u32 timeout_ms; +}; + +enum { + RDMA_USER_CM_QUERY_ADDR, + RDMA_USER_CM_QUERY_PATH, + RDMA_USER_CM_QUERY_GID +}; + +struct rdma_ucm_query { + __u64 response; + __u32 id; + __u32 option; +}; + +struct rdma_ucm_query_route_resp { + __u64 node_guid; + struct ib_user_path_rec ib_route[2]; + struct sockaddr_in6 src_addr; + struct sockaddr_in6 dst_addr; + __u32 num_paths; + __u8 port_num; + __u8 reserved[3]; +}; + +struct rdma_ucm_query_addr_resp { + __u64 node_guid; + __u8 port_num; + __u8 reserved; + __u16 pkey; + __u16 src_size; + __u16 dst_size; + struct sockaddr_storage src_addr; + struct sockaddr_storage dst_addr; +}; + +struct rdma_ucm_query_path_resp { + __u32 num_paths; + __u32 reserved; + struct ib_path_rec_data path_data[0]; +}; + +struct rdma_ucm_conn_param { + __u32 qp_num; + __u32 qkey; + __u8 private_data[RDMA_MAX_PRIVATE_DATA]; + __u8 private_data_len; + __u8 srq; + __u8 responder_resources; + __u8 initiator_depth; + __u8 flow_control; + __u8 retry_count; + __u8 rnr_retry_count; + __u8 valid; +}; + +struct rdma_ucm_ud_param { + __u32 qp_num; + __u32 qkey; + struct ib_uverbs_ah_attr ah_attr; + __u8 private_data[RDMA_MAX_PRIVATE_DATA]; + __u8 private_data_len; + __u8 reserved[7]; +}; + +struct rdma_ucm_connect { + struct rdma_ucm_conn_param conn_param; + __u32 id; + __u32 reserved; +}; + +struct rdma_ucm_listen { + __u32 id; + __u32 backlog; +}; + +struct rdma_ucm_accept { + __u64 uid; + struct rdma_ucm_conn_param conn_param; + __u32 id; + __u32 reserved; +}; + +struct rdma_ucm_reject { + __u32 id; + __u8 private_data_len; + __u8 reserved[3]; + __u8 private_data[RDMA_MAX_PRIVATE_DATA]; +}; + +struct rdma_ucm_disconnect { + __u32 id; +}; + +struct rdma_ucm_init_qp_attr { + __u64 response; + __u32 id; + __u32 qp_state; +}; + +struct rdma_ucm_notify { + __u32 id; + __u32 event; +}; + +struct rdma_ucm_join_ip_mcast { + __u64 response; /* rdma_ucm_create_id_resp */ + __u64 uid; + struct sockaddr_in6 addr; + __u32 id; +}; + +struct rdma_ucm_join_mcast { + __u64 response; /* rdma_ucma_create_id_resp */ + __u64 uid; + __u32 id; + __u16 addr_size; + __u16 reserved; + struct sockaddr_storage addr; +}; + +struct rdma_ucm_get_event { + __u64 response; +}; + +struct rdma_ucm_event_resp { + __u64 uid; + __u32 id; + __u32 event; + __u32 status; + union { + struct rdma_ucm_conn_param conn; + struct rdma_ucm_ud_param ud; + } param; +}; + +/* Option levels */ +enum { + RDMA_OPTION_ID = 0, + RDMA_OPTION_IB = 1 +}; + +/* Option details */ +enum { + RDMA_OPTION_ID_TOS = 0, + RDMA_OPTION_ID_REUSEADDR = 1, + RDMA_OPTION_ID_AFONLY = 2, + RDMA_OPTION_IB_PATH = 1 +}; + +struct rdma_ucm_set_option { + __u64 optval; + __u32 id; + __u32 level; + __u32 optname; + __u32 optlen; +}; + +struct rdma_ucm_migrate_id { + __u64 response; + __u32 id; + __u32 fd; +}; + +struct rdma_ucm_migrate_resp { + __u32 events_reported; +}; + +#endif /* RDMA_USER_CM_H */ diff --git a/net/rds/Kconfig b/net/rds/Kconfig new file mode 100644 index 0000000..f2c670b --- /dev/null +++ b/net/rds/Kconfig @@ -0,0 +1,28 @@ + +config RDS + tristate "The RDS Protocol" + depends on INET + ---help--- + The RDS (Reliable Datagram Sockets) protocol provides reliable, + sequenced delivery of datagrams over Infiniband, iWARP, + or TCP. + +config RDS_RDMA + tristate "RDS over Infiniband and iWARP" + depends on RDS && INFINIBAND && INFINIBAND_ADDR_TRANS + ---help--- + Allow RDS to use Infiniband and iWARP as a transport. + This transport supports RDMA operations. + +config RDS_TCP + tristate "RDS over TCP" + depends on RDS + ---help--- + Allow RDS to use TCP as a transport. + This transport does not support RDMA operations. + +config RDS_DEBUG + bool "RDS debugging messages" + depends on RDS + default n + diff --git a/net/rds/Makefile b/net/rds/Makefile new file mode 100644 index 0000000..56d3f60 --- /dev/null +++ b/net/rds/Makefile @@ -0,0 +1,19 @@ +obj-$(CONFIG_RDS) += rds.o +rds-y := af_rds.o bind.o cong.o connection.o info.o message.o \ + recv.o send.o stats.o sysctl.o threads.o transport.o \ + loop.o page.o rdma.o + +obj-$(CONFIG_RDS_RDMA) += rds_rdma.o +rds_rdma-y := rdma_transport.o \ + ib.o ib_cm.o ib_recv.o ib_ring.o ib_send.o ib_stats.o \ + ib_sysctl.o ib_rdma.o \ + iw.o iw_cm.o iw_recv.o iw_ring.o iw_send.o iw_stats.o \ + iw_sysctl.o iw_rdma.o + + +obj-$(CONFIG_RDS_TCP) += rds_tcp.o +rds_tcp-y := tcp.o tcp_connect.o tcp_listen.o tcp_recv.o \ + tcp_send.o tcp_stats.o + +ccflags-$(CONFIG_RDS_DEBUG) := -DDEBUG + diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c new file mode 100644 index 0000000..1044337 --- /dev/null +++ b/net/rds/af_rds.c @@ -0,0 +1,599 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include +#include +#include +#include +#include +#include +#include + +#include "rds.h" + +char *rds_str_array(char **array, size_t elements, size_t index) +{ + if ((index < elements) && array[index]) + return array[index]; + else + return "unknown"; +} +EXPORT_SYMBOL(rds_str_array); + +/* this is just used for stats gathering :/ */ +static DEFINE_SPINLOCK(rds_sock_lock); +static unsigned long rds_sock_count; +static LIST_HEAD(rds_sock_list); +DECLARE_WAIT_QUEUE_HEAD(rds_poll_waitq); + +/* + * This is called as the final descriptor referencing this socket is closed. + * We have to unbind the socket so that another socket can be bound to the + * address it was using. + * + * We have to be careful about racing with the incoming path. sock_orphan() + * sets SOCK_DEAD and we use that as an indicator to the rx path that new + * messages shouldn't be queued. + */ +static int rds_release(struct socket *sock) +{ + struct sock *sk = sock->sk; + struct rds_sock *rs; + + if (!sk) + goto out; + + rs = rds_sk_to_rs(sk); + + sock_orphan(sk); + /* Note - rds_clear_recv_queue grabs rs_recv_lock, so + * that ensures the recv path has completed messing + * with the socket. */ + rds_clear_recv_queue(rs); + rds_cong_remove_socket(rs); + + /* + * the binding lookup hash uses rcu, we need to + * make sure we synchronize_rcu before we free our + * entry + */ + rds_remove_bound(rs); + synchronize_rcu(); + + rds_send_drop_to(rs, NULL); + rds_rdma_drop_keys(rs); + rds_notify_queue_get(rs, NULL); + + spin_lock_bh(&rds_sock_lock); + list_del_init(&rs->rs_item); + rds_sock_count--; + spin_unlock_bh(&rds_sock_lock); + + rds_trans_put(rs->rs_transport); + + sock->sk = NULL; + sock_put(sk); +out: + return 0; +} + +/* + * Careful not to race with rds_release -> sock_orphan which clears sk_sleep. + * _bh() isn't OK here, we're called from interrupt handlers. It's probably OK + * to wake the waitqueue after sk_sleep is clear as we hold a sock ref, but + * this seems more conservative. + * NB - normally, one would use sk_callback_lock for this, but we can + * get here from interrupts, whereas the network code grabs sk_callback_lock + * with _lock_bh only - so relying on sk_callback_lock introduces livelocks. + */ +void rds_wake_sk_sleep(struct rds_sock *rs) +{ + unsigned long flags; + + read_lock_irqsave(&rs->rs_recv_lock, flags); + __rds_wake_sk_sleep(rds_rs_to_sk(rs)); + read_unlock_irqrestore(&rs->rs_recv_lock, flags); +} + +static int rds_getname(struct socket *sock, struct sockaddr *uaddr, + int *uaddr_len, int peer) +{ + struct sockaddr_in *sin = (struct sockaddr_in *)uaddr; + struct rds_sock *rs = rds_sk_to_rs(sock->sk); + + memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); + + /* racey, don't care */ + if (peer) { + if (!rs->rs_conn_addr) + return -ENOTCONN; + + sin->sin_port = rs->rs_conn_port; + sin->sin_addr.s_addr = rs->rs_conn_addr; + } else { + sin->sin_port = rs->rs_bound_port; + sin->sin_addr.s_addr = rs->rs_bound_addr; + } + + sin->sin_family = AF_INET; + + *uaddr_len = sizeof(*sin); + return 0; +} + +/* + * RDS' poll is without a doubt the least intuitive part of the interface, + * as POLLIN and POLLOUT do not behave entirely as you would expect from + * a network protocol. + * + * POLLIN is asserted if + * - there is data on the receive queue. + * - to signal that a previously congested destination may have become + * uncongested + * - A notification has been queued to the socket (this can be a congestion + * update, or a RDMA completion). + * + * POLLOUT is asserted if there is room on the send queue. This does not mean + * however, that the next sendmsg() call will succeed. If the application tries + * to send to a congested destination, the system call may still fail (and + * return ENOBUFS). + */ +static unsigned int rds_poll(struct file *file, struct socket *sock, + poll_table *wait) +{ + struct sock *sk = sock->sk; + struct rds_sock *rs = rds_sk_to_rs(sk); + unsigned int mask = 0; + unsigned long flags; + + poll_wait(file, sk_sleep(sk), wait); + + if (rs->rs_seen_congestion) + poll_wait(file, &rds_poll_waitq, wait); + + read_lock_irqsave(&rs->rs_recv_lock, flags); + if (!rs->rs_cong_monitor) { + /* When a congestion map was updated, we signal POLLIN for + * "historical" reasons. Applications can also poll for + * WRBAND instead. */ + if (rds_cong_updated_since(&rs->rs_cong_track)) + mask |= (POLLIN | POLLRDNORM | POLLWRBAND); + } else { + spin_lock(&rs->rs_lock); + if (rs->rs_cong_notify) + mask |= (POLLIN | POLLRDNORM); + spin_unlock(&rs->rs_lock); + } + if (!list_empty(&rs->rs_recv_queue) || + !list_empty(&rs->rs_notify_queue)) + mask |= (POLLIN | POLLRDNORM); + if (rs->rs_snd_bytes < rds_sk_sndbuf(rs)) + mask |= (POLLOUT | POLLWRNORM); + read_unlock_irqrestore(&rs->rs_recv_lock, flags); + + /* clear state any time we wake a seen-congested socket */ + if (mask) + rs->rs_seen_congestion = 0; + + return mask; +} + +static int rds_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +{ + return -ENOIOCTLCMD; +} + +static int rds_cancel_sent_to(struct rds_sock *rs, char __user *optval, + int len) +{ + struct sockaddr_in sin; + int ret = 0; + + /* racing with another thread binding seems ok here */ + if (rs->rs_bound_addr == 0) { + ret = -ENOTCONN; /* XXX not a great errno */ + goto out; + } + + if (len < sizeof(struct sockaddr_in)) { + ret = -EINVAL; + goto out; + } + + if (copy_from_user(&sin, optval, sizeof(sin))) { + ret = -EFAULT; + goto out; + } + + rds_send_drop_to(rs, &sin); +out: + return ret; +} + +static int rds_set_bool_option(unsigned char *optvar, char __user *optval, + int optlen) +{ + int value; + + if (optlen < sizeof(int)) + return -EINVAL; + if (get_user(value, (int __user *) optval)) + return -EFAULT; + *optvar = !!value; + return 0; +} + +static int rds_cong_monitor(struct rds_sock *rs, char __user *optval, + int optlen) +{ + int ret; + + ret = rds_set_bool_option(&rs->rs_cong_monitor, optval, optlen); + if (ret == 0) { + if (rs->rs_cong_monitor) { + rds_cong_add_socket(rs); + } else { + rds_cong_remove_socket(rs); + rs->rs_cong_mask = 0; + rs->rs_cong_notify = 0; + } + } + return ret; +} + +static int rds_setsockopt(struct socket *sock, int level, int optname, + char __user *optval, unsigned int optlen) +{ + struct rds_sock *rs = rds_sk_to_rs(sock->sk); + int ret; + + if (level != SOL_RDS) { + ret = -ENOPROTOOPT; + goto out; + } + + switch (optname) { + case RDS_CANCEL_SENT_TO: + ret = rds_cancel_sent_to(rs, optval, optlen); + break; + case RDS_GET_MR: + ret = rds_get_mr(rs, optval, optlen); + break; + case RDS_GET_MR_FOR_DEST: + ret = rds_get_mr_for_dest(rs, optval, optlen); + break; + case RDS_FREE_MR: + ret = rds_free_mr(rs, optval, optlen); + break; + case RDS_RECVERR: + ret = rds_set_bool_option(&rs->rs_recverr, optval, optlen); + break; + case RDS_CONG_MONITOR: + ret = rds_cong_monitor(rs, optval, optlen); + break; + default: + ret = -ENOPROTOOPT; + } +out: + return ret; +} + +static int rds_getsockopt(struct socket *sock, int level, int optname, + char __user *optval, int __user *optlen) +{ + struct rds_sock *rs = rds_sk_to_rs(sock->sk); + int ret = -ENOPROTOOPT, len; + + if (level != SOL_RDS) + goto out; + + if (get_user(len, optlen)) { + ret = -EFAULT; + goto out; + } + + switch (optname) { + case RDS_INFO_FIRST ... RDS_INFO_LAST: + ret = rds_info_getsockopt(sock, optname, optval, + optlen); + break; + + case RDS_RECVERR: + if (len < sizeof(int)) + ret = -EINVAL; + else + if (put_user(rs->rs_recverr, (int __user *) optval) || + put_user(sizeof(int), optlen)) + ret = -EFAULT; + else + ret = 0; + break; + default: + break; + } + +out: + return ret; + +} + +static int rds_connect(struct socket *sock, struct sockaddr *uaddr, + int addr_len, int flags) +{ + struct sock *sk = sock->sk; + struct sockaddr_in *sin = (struct sockaddr_in *)uaddr; + struct rds_sock *rs = rds_sk_to_rs(sk); + int ret = 0; + + lock_sock(sk); + + if (addr_len != sizeof(struct sockaddr_in)) { + ret = -EINVAL; + goto out; + } + + if (sin->sin_family != AF_INET) { + ret = -EAFNOSUPPORT; + goto out; + } + + if (sin->sin_addr.s_addr == htonl(INADDR_ANY)) { + ret = -EDESTADDRREQ; + goto out; + } + + rs->rs_conn_addr = sin->sin_addr.s_addr; + rs->rs_conn_port = sin->sin_port; + +out: + release_sock(sk); + return ret; +} + +static struct proto rds_proto = { + .name = "RDS", + .owner = THIS_MODULE, + .obj_size = sizeof(struct rds_sock), +}; + +static const struct proto_ops rds_proto_ops = { + .family = AF_RDS, + .owner = THIS_MODULE, + .release = rds_release, + .bind = rds_bind, + .connect = rds_connect, + .socketpair = sock_no_socketpair, + .accept = sock_no_accept, + .getname = rds_getname, + .poll = rds_poll, + .ioctl = rds_ioctl, + .listen = sock_no_listen, + .shutdown = sock_no_shutdown, + .setsockopt = rds_setsockopt, + .getsockopt = rds_getsockopt, + .sendmsg = rds_sendmsg, + .recvmsg = rds_recvmsg, + .mmap = sock_no_mmap, + .sendpage = sock_no_sendpage, +}; + +static int __rds_create(struct socket *sock, struct sock *sk, int protocol) +{ + struct rds_sock *rs; + + sock_init_data(sock, sk); + sock->ops = &rds_proto_ops; + sk->sk_protocol = protocol; + + rs = rds_sk_to_rs(sk); + spin_lock_init(&rs->rs_lock); + rwlock_init(&rs->rs_recv_lock); + INIT_LIST_HEAD(&rs->rs_send_queue); + INIT_LIST_HEAD(&rs->rs_recv_queue); + INIT_LIST_HEAD(&rs->rs_notify_queue); + INIT_LIST_HEAD(&rs->rs_cong_list); + spin_lock_init(&rs->rs_rdma_lock); + rs->rs_rdma_keys = RB_ROOT; + + spin_lock_bh(&rds_sock_lock); + list_add_tail(&rs->rs_item, &rds_sock_list); + rds_sock_count++; + spin_unlock_bh(&rds_sock_lock); + + return 0; +} + +static int rds_create(struct net *net, struct socket *sock, int protocol, + int kern) +{ + struct sock *sk; + + if (sock->type != SOCK_SEQPACKET || protocol) + return -ESOCKTNOSUPPORT; + + sk = sk_alloc(net, AF_RDS, GFP_ATOMIC, &rds_proto); + if (!sk) + return -ENOMEM; + + return __rds_create(sock, sk, protocol); +} + +void rds_sock_addref(struct rds_sock *rs) +{ + sock_hold(rds_rs_to_sk(rs)); +} + +void rds_sock_put(struct rds_sock *rs) +{ + sock_put(rds_rs_to_sk(rs)); +} + +static const struct net_proto_family rds_family_ops = { + .family = AF_RDS, + .create = rds_create, + .owner = THIS_MODULE, +}; + +static void rds_sock_inc_info(struct socket *sock, unsigned int len, + struct rds_info_iterator *iter, + struct rds_info_lengths *lens) +{ + struct rds_sock *rs; + struct rds_incoming *inc; + unsigned int total = 0; + + len /= sizeof(struct rds_info_message); + + spin_lock_bh(&rds_sock_lock); + + list_for_each_entry(rs, &rds_sock_list, rs_item) { + read_lock(&rs->rs_recv_lock); + + /* XXX too lazy to maintain counts.. */ + list_for_each_entry(inc, &rs->rs_recv_queue, i_item) { + total++; + if (total <= len) + rds_inc_info_copy(inc, iter, inc->i_saddr, + rs->rs_bound_addr, 1); + } + + read_unlock(&rs->rs_recv_lock); + } + + spin_unlock_bh(&rds_sock_lock); + + lens->nr = total; + lens->each = sizeof(struct rds_info_message); +} + +static void rds_sock_info(struct socket *sock, unsigned int len, + struct rds_info_iterator *iter, + struct rds_info_lengths *lens) +{ + struct rds_info_socket sinfo; + struct rds_sock *rs; + + len /= sizeof(struct rds_info_socket); + + spin_lock_bh(&rds_sock_lock); + + if (len < rds_sock_count) + goto out; + + list_for_each_entry(rs, &rds_sock_list, rs_item) { + sinfo.sndbuf = rds_sk_sndbuf(rs); + sinfo.rcvbuf = rds_sk_rcvbuf(rs); + sinfo.bound_addr = rs->rs_bound_addr; + sinfo.connected_addr = rs->rs_conn_addr; + sinfo.bound_port = rs->rs_bound_port; + sinfo.connected_port = rs->rs_conn_port; + sinfo.inum = sock_i_ino(rds_rs_to_sk(rs)); + + rds_info_copy(iter, &sinfo, sizeof(sinfo)); + } + +out: + lens->nr = rds_sock_count; + lens->each = sizeof(struct rds_info_socket); + + spin_unlock_bh(&rds_sock_lock); +} + +static void rds_exit(void) +{ + sock_unregister(rds_family_ops.family); + proto_unregister(&rds_proto); + rds_conn_exit(); + rds_cong_exit(); + rds_sysctl_exit(); + rds_threads_exit(); + rds_stats_exit(); + rds_page_exit(); + rds_info_deregister_func(RDS_INFO_SOCKETS, rds_sock_info); + rds_info_deregister_func(RDS_INFO_RECV_MESSAGES, rds_sock_inc_info); +} +module_exit(rds_exit); + +static int rds_init(void) +{ + int ret; + + ret = rds_conn_init(); + if (ret) + goto out; + ret = rds_threads_init(); + if (ret) + goto out_conn; + ret = rds_sysctl_init(); + if (ret) + goto out_threads; + ret = rds_stats_init(); + if (ret) + goto out_sysctl; + ret = proto_register(&rds_proto, 1); + if (ret) + goto out_stats; + ret = sock_register(&rds_family_ops); + if (ret) + goto out_proto; + + rds_info_register_func(RDS_INFO_SOCKETS, rds_sock_info); + rds_info_register_func(RDS_INFO_RECV_MESSAGES, rds_sock_inc_info); + + goto out; + +out_proto: + proto_unregister(&rds_proto); +out_stats: + rds_stats_exit(); +out_sysctl: + rds_sysctl_exit(); +out_threads: + rds_threads_exit(); +out_conn: + rds_conn_exit(); + rds_cong_exit(); + rds_page_exit(); +out: + return ret; +} +module_init(rds_init); + +#define DRV_VERSION "4.0" +#define DRV_RELDATE "Feb 12, 2009" + +MODULE_AUTHOR("Oracle Corporation "); +MODULE_DESCRIPTION("RDS: Reliable Datagram Sockets" + " v" DRV_VERSION " (" DRV_RELDATE ")"); +MODULE_VERSION(DRV_VERSION); +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_ALIAS_NETPROTO(PF_RDS); diff --git a/net/rds/bind.c b/net/rds/bind.c new file mode 100644 index 0000000..a2e6562 --- /dev/null +++ b/net/rds/bind.c @@ -0,0 +1,203 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include +#include +#include +#include +#include +#include +#include "rds.h" + +#define BIND_HASH_SIZE 1024 +static struct hlist_head bind_hash_table[BIND_HASH_SIZE]; +static DEFINE_SPINLOCK(rds_bind_lock); + +static struct hlist_head *hash_to_bucket(__be32 addr, __be16 port) +{ + return bind_hash_table + (jhash_2words((u32)addr, (u32)port, 0) & + (BIND_HASH_SIZE - 1)); +} + +static struct rds_sock *rds_bind_lookup(__be32 addr, __be16 port, + struct rds_sock *insert) +{ + struct rds_sock *rs; + struct hlist_head *head = hash_to_bucket(addr, port); + u64 cmp; + u64 needle = ((u64)be32_to_cpu(addr) << 32) | be16_to_cpu(port); + + rcu_read_lock(); + hlist_for_each_entry_rcu(rs, head, rs_bound_node) { + cmp = ((u64)be32_to_cpu(rs->rs_bound_addr) << 32) | + be16_to_cpu(rs->rs_bound_port); + + if (cmp == needle) { + rcu_read_unlock(); + return rs; + } + } + rcu_read_unlock(); + + if (insert) { + /* + * make sure our addr and port are set before + * we are added to the list, other people + * in rcu will find us as soon as the + * hlist_add_head_rcu is done + */ + insert->rs_bound_addr = addr; + insert->rs_bound_port = port; + rds_sock_addref(insert); + + hlist_add_head_rcu(&insert->rs_bound_node, head); + } + return NULL; +} + +/* + * Return the rds_sock bound at the given local address. + * + * The rx path can race with rds_release. We notice if rds_release() has + * marked this socket and don't return a rs ref to the rx path. + */ +struct rds_sock *rds_find_bound(__be32 addr, __be16 port) +{ + struct rds_sock *rs; + + rs = rds_bind_lookup(addr, port, NULL); + + if (rs && !sock_flag(rds_rs_to_sk(rs), SOCK_DEAD)) + rds_sock_addref(rs); + else + rs = NULL; + + rdsdebug("returning rs %p for %pI4:%u\n", rs, &addr, + ntohs(port)); + return rs; +} + +/* returns -ve errno or +ve port */ +static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port) +{ + unsigned long flags; + int ret = -EADDRINUSE; + u16 rover, last; + + if (*port != 0) { + rover = be16_to_cpu(*port); + last = rover; + } else { + rover = max_t(u16, prandom_u32(), 2); + last = rover - 1; + } + + spin_lock_irqsave(&rds_bind_lock, flags); + + do { + if (rover == 0) + rover++; + if (!rds_bind_lookup(addr, cpu_to_be16(rover), rs)) { + *port = rs->rs_bound_port; + ret = 0; + rdsdebug("rs %p binding to %pI4:%d\n", + rs, &addr, (int)ntohs(*port)); + break; + } + } while (rover++ != last); + + spin_unlock_irqrestore(&rds_bind_lock, flags); + + return ret; +} + +void rds_remove_bound(struct rds_sock *rs) +{ + unsigned long flags; + + spin_lock_irqsave(&rds_bind_lock, flags); + + if (rs->rs_bound_addr) { + rdsdebug("rs %p unbinding from %pI4:%d\n", + rs, &rs->rs_bound_addr, + ntohs(rs->rs_bound_port)); + + hlist_del_init_rcu(&rs->rs_bound_node); + rds_sock_put(rs); + rs->rs_bound_addr = 0; + } + + spin_unlock_irqrestore(&rds_bind_lock, flags); +} + +int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) +{ + struct sock *sk = sock->sk; + struct sockaddr_in *sin = (struct sockaddr_in *)uaddr; + struct rds_sock *rs = rds_sk_to_rs(sk); + struct rds_transport *trans; + int ret = 0; + + lock_sock(sk); + + if (addr_len != sizeof(struct sockaddr_in) || + sin->sin_family != AF_INET || + rs->rs_bound_addr || + sin->sin_addr.s_addr == htonl(INADDR_ANY)) { + ret = -EINVAL; + goto out; + } + + ret = rds_add_bound(rs, sin->sin_addr.s_addr, &sin->sin_port); + if (ret) + goto out; + + trans = rds_trans_get_preferred(sin->sin_addr.s_addr); + if (!trans) { + ret = -EADDRNOTAVAIL; + rds_remove_bound(rs); + printk_ratelimited(KERN_INFO "RDS: rds_bind() could not find a transport, " + "load rds_tcp or rds_rdma?\n"); + goto out; + } + + rs->rs_transport = trans; + ret = 0; + +out: + release_sock(sk); + + /* we might have called rds_remove_bound on error */ + if (ret) + synchronize_rcu(); + return ret; +} diff --git a/net/rds/cong.c b/net/rds/cong.c new file mode 100644 index 0000000..e5b65ac --- /dev/null +++ b/net/rds/cong.c @@ -0,0 +1,406 @@ +/* + * Copyright (c) 2007 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include +#include +#include +#include +#include + +#include "rds.h" + +/* + * This file implements the receive side of the unconventional congestion + * management in RDS. + * + * Messages waiting in the receive queue on the receiving socket are accounted + * against the sockets SO_RCVBUF option value. Only the payload bytes in the + * message are accounted for. If the number of bytes queued equals or exceeds + * rcvbuf then the socket is congested. All sends attempted to this socket's + * address should return block or return -EWOULDBLOCK. + * + * Applications are expected to be reasonably tuned such that this situation + * very rarely occurs. An application encountering this "back-pressure" is + * considered a bug. + * + * This is implemented by having each node maintain bitmaps which indicate + * which ports on bound addresses are congested. As the bitmap changes it is + * sent through all the connections which terminate in the local address of the + * bitmap which changed. + * + * The bitmaps are allocated as connections are brought up. This avoids + * allocation in the interrupt handling path which queues messages on sockets. + * The dense bitmaps let transports send the entire bitmap on any bitmap change + * reasonably efficiently. This is much easier to implement than some + * finer-grained communication of per-port congestion. The sender does a very + * inexpensive bit test to test if the port it's about to send to is congested + * or not. + */ + +/* + * Interaction with poll is a tad tricky. We want all processes stuck in + * poll to wake up and check whether a congested destination became uncongested. + * The really sad thing is we have no idea which destinations the application + * wants to send to - we don't even know which rds_connections are involved. + * So until we implement a more flexible rds poll interface, we have to make + * do with this: + * We maintain a global counter that is incremented each time a congestion map + * update is received. Each rds socket tracks this value, and if rds_poll + * finds that the saved generation number is smaller than the global generation + * number, it wakes up the process. + */ +static atomic_t rds_cong_generation = ATOMIC_INIT(0); + +/* + * Congestion monitoring + */ +static LIST_HEAD(rds_cong_monitor); +static DEFINE_RWLOCK(rds_cong_monitor_lock); + +/* + * Yes, a global lock. It's used so infrequently that it's worth keeping it + * global to simplify the locking. It's only used in the following + * circumstances: + * + * - on connection buildup to associate a conn with its maps + * - on map changes to inform conns of a new map to send + * + * It's sadly ordered under the socket callback lock and the connection lock. + * Receive paths can mark ports congested from interrupt context so the + * lock masks interrupts. + */ +static DEFINE_SPINLOCK(rds_cong_lock); +static struct rb_root rds_cong_tree = RB_ROOT; + +static struct rds_cong_map *rds_cong_tree_walk(__be32 addr, + struct rds_cong_map *insert) +{ + struct rb_node **p = &rds_cong_tree.rb_node; + struct rb_node *parent = NULL; + struct rds_cong_map *map; + + while (*p) { + parent = *p; + map = rb_entry(parent, struct rds_cong_map, m_rb_node); + + if (addr < map->m_addr) + p = &(*p)->rb_left; + else if (addr > map->m_addr) + p = &(*p)->rb_right; + else + return map; + } + + if (insert) { + rb_link_node(&insert->m_rb_node, parent, p); + rb_insert_color(&insert->m_rb_node, &rds_cong_tree); + } + return NULL; +} + +/* + * There is only ever one bitmap for any address. Connections try and allocate + * these bitmaps in the process getting pointers to them. The bitmaps are only + * ever freed as the module is removed after all connections have been freed. + */ +static struct rds_cong_map *rds_cong_from_addr(__be32 addr) +{ + struct rds_cong_map *map; + struct rds_cong_map *ret = NULL; + unsigned long zp; + unsigned long i; + unsigned long flags; + + map = kzalloc(sizeof(struct rds_cong_map), GFP_KERNEL); + if (!map) + return NULL; + + map->m_addr = addr; + init_waitqueue_head(&map->m_waitq); + INIT_LIST_HEAD(&map->m_conn_list); + + for (i = 0; i < RDS_CONG_MAP_PAGES; i++) { + zp = get_zeroed_page(GFP_KERNEL); + if (zp == 0) + goto out; + map->m_page_addrs[i] = zp; + } + + spin_lock_irqsave(&rds_cong_lock, flags); + ret = rds_cong_tree_walk(addr, map); + spin_unlock_irqrestore(&rds_cong_lock, flags); + + if (!ret) { + ret = map; + map = NULL; + } + +out: + if (map) { + for (i = 0; i < RDS_CONG_MAP_PAGES && map->m_page_addrs[i]; i++) + free_page(map->m_page_addrs[i]); + kfree(map); + } + + rdsdebug("map %p for addr %x\n", ret, be32_to_cpu(addr)); + + return ret; +} + +/* + * Put the conn on its local map's list. This is called when the conn is + * really added to the hash. It's nested under the rds_conn_lock, sadly. + */ +void rds_cong_add_conn(struct rds_connection *conn) +{ + unsigned long flags; + + rdsdebug("conn %p now on map %p\n", conn, conn->c_lcong); + spin_lock_irqsave(&rds_cong_lock, flags); + list_add_tail(&conn->c_map_item, &conn->c_lcong->m_conn_list); + spin_unlock_irqrestore(&rds_cong_lock, flags); +} + +void rds_cong_remove_conn(struct rds_connection *conn) +{ + unsigned long flags; + + rdsdebug("removing conn %p from map %p\n", conn, conn->c_lcong); + spin_lock_irqsave(&rds_cong_lock, flags); + list_del_init(&conn->c_map_item); + spin_unlock_irqrestore(&rds_cong_lock, flags); +} + +int rds_cong_get_maps(struct rds_connection *conn) +{ + conn->c_lcong = rds_cong_from_addr(conn->c_laddr); + conn->c_fcong = rds_cong_from_addr(conn->c_faddr); + + if (!(conn->c_lcong && conn->c_fcong)) + return -ENOMEM; + + return 0; +} + +void rds_cong_queue_updates(struct rds_cong_map *map) +{ + struct rds_connection *conn; + unsigned long flags; + + spin_lock_irqsave(&rds_cong_lock, flags); + + list_for_each_entry(conn, &map->m_conn_list, c_map_item) { + if (!test_and_set_bit(0, &conn->c_map_queued)) { + rds_stats_inc(s_cong_update_queued); + rds_send_xmit(conn); + } + } + + spin_unlock_irqrestore(&rds_cong_lock, flags); +} + +void rds_cong_map_updated(struct rds_cong_map *map, uint64_t portmask) +{ + rdsdebug("waking map %p for %pI4\n", + map, &map->m_addr); + rds_stats_inc(s_cong_update_received); + atomic_inc(&rds_cong_generation); + if (waitqueue_active(&map->m_waitq)) + wake_up(&map->m_waitq); + if (waitqueue_active(&rds_poll_waitq)) + wake_up_all(&rds_poll_waitq); + + if (portmask && !list_empty(&rds_cong_monitor)) { + unsigned long flags; + struct rds_sock *rs; + + read_lock_irqsave(&rds_cong_monitor_lock, flags); + list_for_each_entry(rs, &rds_cong_monitor, rs_cong_list) { + spin_lock(&rs->rs_lock); + rs->rs_cong_notify |= (rs->rs_cong_mask & portmask); + rs->rs_cong_mask &= ~portmask; + spin_unlock(&rs->rs_lock); + if (rs->rs_cong_notify) + rds_wake_sk_sleep(rs); + } + read_unlock_irqrestore(&rds_cong_monitor_lock, flags); + } +} +EXPORT_SYMBOL_GPL(rds_cong_map_updated); + +int rds_cong_updated_since(unsigned long *recent) +{ + unsigned long gen = atomic_read(&rds_cong_generation); + + if (likely(*recent == gen)) + return 0; + *recent = gen; + return 1; +} + +/* + * We're called under the locking that protects the sockets receive buffer + * consumption. This makes it a lot easier for the caller to only call us + * when it knows that an existing set bit needs to be cleared, and vice versa. + * We can't block and we need to deal with concurrent sockets working against + * the same per-address map. + */ +void rds_cong_set_bit(struct rds_cong_map *map, __be16 port) +{ + unsigned long i; + unsigned long off; + + rdsdebug("setting congestion for %pI4:%u in map %p\n", + &map->m_addr, ntohs(port), map); + + i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS; + off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS; + + __set_bit_le(off, (void *)map->m_page_addrs[i]); +} + +void rds_cong_clear_bit(struct rds_cong_map *map, __be16 port) +{ + unsigned long i; + unsigned long off; + + rdsdebug("clearing congestion for %pI4:%u in map %p\n", + &map->m_addr, ntohs(port), map); + + i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS; + off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS; + + __clear_bit_le(off, (void *)map->m_page_addrs[i]); +} + +static int rds_cong_test_bit(struct rds_cong_map *map, __be16 port) +{ + unsigned long i; + unsigned long off; + + i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS; + off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS; + + return test_bit_le(off, (void *)map->m_page_addrs[i]); +} + +void rds_cong_add_socket(struct rds_sock *rs) +{ + unsigned long flags; + + write_lock_irqsave(&rds_cong_monitor_lock, flags); + if (list_empty(&rs->rs_cong_list)) + list_add(&rs->rs_cong_list, &rds_cong_monitor); + write_unlock_irqrestore(&rds_cong_monitor_lock, flags); +} + +void rds_cong_remove_socket(struct rds_sock *rs) +{ + unsigned long flags; + struct rds_cong_map *map; + + write_lock_irqsave(&rds_cong_monitor_lock, flags); + list_del_init(&rs->rs_cong_list); + write_unlock_irqrestore(&rds_cong_monitor_lock, flags); + + /* update congestion map for now-closed port */ + spin_lock_irqsave(&rds_cong_lock, flags); + map = rds_cong_tree_walk(rs->rs_bound_addr, NULL); + spin_unlock_irqrestore(&rds_cong_lock, flags); + + if (map && rds_cong_test_bit(map, rs->rs_bound_port)) { + rds_cong_clear_bit(map, rs->rs_bound_port); + rds_cong_queue_updates(map); + } +} + +int rds_cong_wait(struct rds_cong_map *map, __be16 port, int nonblock, + struct rds_sock *rs) +{ + if (!rds_cong_test_bit(map, port)) + return 0; + if (nonblock) { + if (rs && rs->rs_cong_monitor) { + unsigned long flags; + + /* It would have been nice to have an atomic set_bit on + * a uint64_t. */ + spin_lock_irqsave(&rs->rs_lock, flags); + rs->rs_cong_mask |= RDS_CONG_MONITOR_MASK(ntohs(port)); + spin_unlock_irqrestore(&rs->rs_lock, flags); + + /* Test again - a congestion update may have arrived in + * the meantime. */ + if (!rds_cong_test_bit(map, port)) + return 0; + } + rds_stats_inc(s_cong_send_error); + return -ENOBUFS; + } + + rds_stats_inc(s_cong_send_blocked); + rdsdebug("waiting on map %p for port %u\n", map, be16_to_cpu(port)); + + return wait_event_interruptible(map->m_waitq, + !rds_cong_test_bit(map, port)); +} + +void rds_cong_exit(void) +{ + struct rb_node *node; + struct rds_cong_map *map; + unsigned long i; + + while ((node = rb_first(&rds_cong_tree))) { + map = rb_entry(node, struct rds_cong_map, m_rb_node); + rdsdebug("freeing map %p\n", map); + rb_erase(&map->m_rb_node, &rds_cong_tree); + for (i = 0; i < RDS_CONG_MAP_PAGES && map->m_page_addrs[i]; i++) + free_page(map->m_page_addrs[i]); + kfree(map); + } +} + +/* + * Allocate a RDS message containing a congestion update. + */ +struct rds_message *rds_cong_update_alloc(struct rds_connection *conn) +{ + struct rds_cong_map *map = conn->c_lcong; + struct rds_message *rm; + + rm = rds_message_map_pages(map->m_page_addrs, RDS_CONG_MAP_BYTES); + if (!IS_ERR(rm)) + rm->m_inc.i_hdr.h_flags = RDS_FLAG_CONG_BITMAP; + + return rm; +} diff --git a/net/rds/connection.c b/net/rds/connection.c new file mode 100644 index 0000000..378c3a6 --- /dev/null +++ b/net/rds/connection.c @@ -0,0 +1,577 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include +#include +#include +#include +#include + +#include "rds.h" +#include "loop.h" + +#define RDS_CONNECTION_HASH_BITS 12 +#define RDS_CONNECTION_HASH_ENTRIES (1 << RDS_CONNECTION_HASH_BITS) +#define RDS_CONNECTION_HASH_MASK (RDS_CONNECTION_HASH_ENTRIES - 1) + +/* converting this to RCU is a chore for another day.. */ +static DEFINE_SPINLOCK(rds_conn_lock); +static unsigned long rds_conn_count; +static struct hlist_head rds_conn_hash[RDS_CONNECTION_HASH_ENTRIES]; +static struct kmem_cache *rds_conn_slab; + +static struct hlist_head *rds_conn_bucket(__be32 laddr, __be32 faddr) +{ + static u32 rds_hash_secret __read_mostly; + + unsigned long hash; + + net_get_random_once(&rds_hash_secret, sizeof(rds_hash_secret)); + + /* Pass NULL, don't need struct net for hash */ + hash = __inet_ehashfn(be32_to_cpu(laddr), 0, + be32_to_cpu(faddr), 0, + rds_hash_secret); + return &rds_conn_hash[hash & RDS_CONNECTION_HASH_MASK]; +} + +#define rds_conn_info_set(var, test, suffix) do { \ + if (test) \ + var |= RDS_INFO_CONNECTION_FLAG_##suffix; \ +} while (0) + +/* rcu read lock must be held or the connection spinlock */ +static struct rds_connection *rds_conn_lookup(struct hlist_head *head, + __be32 laddr, __be32 faddr, + struct rds_transport *trans) +{ + struct rds_connection *conn, *ret = NULL; + + hlist_for_each_entry_rcu(conn, head, c_hash_node) { + if (conn->c_faddr == faddr && conn->c_laddr == laddr && + conn->c_trans == trans) { + ret = conn; + break; + } + } + rdsdebug("returning conn %p for %pI4 -> %pI4\n", ret, + &laddr, &faddr); + return ret; +} + +/* + * This is called by transports as they're bringing down a connection. + * It clears partial message state so that the transport can start sending + * and receiving over this connection again in the future. It is up to + * the transport to have serialized this call with its send and recv. + */ +static void rds_conn_reset(struct rds_connection *conn) +{ + rdsdebug("connection %pI4 to %pI4 reset\n", + &conn->c_laddr, &conn->c_faddr); + + rds_stats_inc(s_conn_reset); + rds_send_reset(conn); + conn->c_flags = 0; + + /* Do not clear next_rx_seq here, else we cannot distinguish + * retransmitted packets from new packets, and will hand all + * of them to the application. That is not consistent with the + * reliability guarantees of RDS. */ +} + +/* + * There is only every one 'conn' for a given pair of addresses in the + * system at a time. They contain messages to be retransmitted and so + * span the lifetime of the actual underlying transport connections. + * + * For now they are not garbage collected once they're created. They + * are torn down as the module is removed, if ever. + */ +static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr, + struct rds_transport *trans, gfp_t gfp, + int is_outgoing) +{ + struct rds_connection *conn, *parent = NULL; + struct hlist_head *head = rds_conn_bucket(laddr, faddr); + struct rds_transport *loop_trans; + unsigned long flags; + int ret; + + rcu_read_lock(); + conn = rds_conn_lookup(head, laddr, faddr, trans); + if (conn && conn->c_loopback && conn->c_trans != &rds_loop_transport && + !is_outgoing) { + /* This is a looped back IB connection, and we're + * called by the code handling the incoming connect. + * We need a second connection object into which we + * can stick the other QP. */ + parent = conn; + conn = parent->c_passive; + } + rcu_read_unlock(); + if (conn) + goto out; + + conn = kmem_cache_zalloc(rds_conn_slab, gfp); + if (!conn) { + conn = ERR_PTR(-ENOMEM); + goto out; + } + + INIT_HLIST_NODE(&conn->c_hash_node); + conn->c_laddr = laddr; + conn->c_faddr = faddr; + spin_lock_init(&conn->c_lock); + conn->c_next_tx_seq = 1; + + init_waitqueue_head(&conn->c_waitq); + INIT_LIST_HEAD(&conn->c_send_queue); + INIT_LIST_HEAD(&conn->c_retrans); + + ret = rds_cong_get_maps(conn); + if (ret) { + kmem_cache_free(rds_conn_slab, conn); + conn = ERR_PTR(ret); + goto out; + } + + /* + * This is where a connection becomes loopback. If *any* RDS sockets + * can bind to the destination address then we'd rather the messages + * flow through loopback rather than either transport. + */ + loop_trans = rds_trans_get_preferred(faddr); + if (loop_trans) { + rds_trans_put(loop_trans); + conn->c_loopback = 1; + if (is_outgoing && trans->t_prefer_loopback) { + /* "outgoing" connection - and the transport + * says it wants the connection handled by the + * loopback transport. This is what TCP does. + */ + trans = &rds_loop_transport; + } + } + + conn->c_trans = trans; + + ret = trans->conn_alloc(conn, gfp); + if (ret) { + kmem_cache_free(rds_conn_slab, conn); + conn = ERR_PTR(ret); + goto out; + } + + atomic_set(&conn->c_state, RDS_CONN_DOWN); + conn->c_reconnect_jiffies = 0; + INIT_DELAYED_WORK(&conn->c_send_w, rds_send_worker); + INIT_DELAYED_WORK(&conn->c_recv_w, rds_recv_worker); + INIT_DELAYED_WORK(&conn->c_conn_w, rds_connect_worker); + INIT_WORK(&conn->c_down_w, rds_shutdown_worker); + mutex_init(&conn->c_cm_lock); + conn->c_flags = 0; + + rdsdebug("allocated conn %p for %pI4 -> %pI4 over %s %s\n", + conn, &laddr, &faddr, + trans->t_name ? trans->t_name : "[unknown]", + is_outgoing ? "(outgoing)" : ""); + + /* + * Since we ran without holding the conn lock, someone could + * have created the same conn (either normal or passive) in the + * interim. We check while holding the lock. If we won, we complete + * init and return our conn. If we lost, we rollback and return the + * other one. + */ + spin_lock_irqsave(&rds_conn_lock, flags); + if (parent) { + /* Creating passive conn */ + if (parent->c_passive) { + trans->conn_free(conn->c_transport_data); + kmem_cache_free(rds_conn_slab, conn); + conn = parent->c_passive; + } else { + parent->c_passive = conn; + rds_cong_add_conn(conn); + rds_conn_count++; + } + } else { + /* Creating normal conn */ + struct rds_connection *found; + + found = rds_conn_lookup(head, laddr, faddr, trans); + if (found) { + trans->conn_free(conn->c_transport_data); + kmem_cache_free(rds_conn_slab, conn); + conn = found; + } else { + hlist_add_head_rcu(&conn->c_hash_node, head); + rds_cong_add_conn(conn); + rds_conn_count++; + } + } + spin_unlock_irqrestore(&rds_conn_lock, flags); + +out: + return conn; +} + +struct rds_connection *rds_conn_create(__be32 laddr, __be32 faddr, + struct rds_transport *trans, gfp_t gfp) +{ + return __rds_conn_create(laddr, faddr, trans, gfp, 0); +} +EXPORT_SYMBOL_GPL(rds_conn_create); + +struct rds_connection *rds_conn_create_outgoing(__be32 laddr, __be32 faddr, + struct rds_transport *trans, gfp_t gfp) +{ + return __rds_conn_create(laddr, faddr, trans, gfp, 1); +} +EXPORT_SYMBOL_GPL(rds_conn_create_outgoing); + +void rds_conn_shutdown(struct rds_connection *conn) +{ + /* shut it down unless it's down already */ + if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_DOWN)) { + /* + * Quiesce the connection mgmt handlers before we start tearing + * things down. We don't hold the mutex for the entire + * duration of the shutdown operation, else we may be + * deadlocking with the CM handler. Instead, the CM event + * handler is supposed to check for state DISCONNECTING + */ + mutex_lock(&conn->c_cm_lock); + if (!rds_conn_transition(conn, RDS_CONN_UP, RDS_CONN_DISCONNECTING) + && !rds_conn_transition(conn, RDS_CONN_ERROR, RDS_CONN_DISCONNECTING)) { + rds_conn_error(conn, "shutdown called in state %d\n", + atomic_read(&conn->c_state)); + mutex_unlock(&conn->c_cm_lock); + return; + } + mutex_unlock(&conn->c_cm_lock); + + wait_event(conn->c_waitq, + !test_bit(RDS_IN_XMIT, &conn->c_flags)); + + conn->c_trans->conn_shutdown(conn); + rds_conn_reset(conn); + + if (!rds_conn_transition(conn, RDS_CONN_DISCONNECTING, RDS_CONN_DOWN)) { + /* This can happen - eg when we're in the middle of tearing + * down the connection, and someone unloads the rds module. + * Quite reproduceable with loopback connections. + * Mostly harmless. + */ + rds_conn_error(conn, + "%s: failed to transition to state DOWN, " + "current state is %d\n", + __func__, + atomic_read(&conn->c_state)); + return; + } + } + + /* Then reconnect if it's still live. + * The passive side of an IB loopback connection is never added + * to the conn hash, so we never trigger a reconnect on this + * conn - the reconnect is always triggered by the active peer. */ + cancel_delayed_work_sync(&conn->c_conn_w); + rcu_read_lock(); + if (!hlist_unhashed(&conn->c_hash_node)) { + rcu_read_unlock(); + rds_queue_reconnect(conn); + } else { + rcu_read_unlock(); + } +} + +/* + * Stop and free a connection. + * + * This can only be used in very limited circumstances. It assumes that once + * the conn has been shutdown that no one else is referencing the connection. + * We can only ensure this in the rmmod path in the current code. + */ +void rds_conn_destroy(struct rds_connection *conn) +{ + struct rds_message *rm, *rtmp; + unsigned long flags; + + rdsdebug("freeing conn %p for %pI4 -> " + "%pI4\n", conn, &conn->c_laddr, + &conn->c_faddr); + + /* Ensure conn will not be scheduled for reconnect */ + spin_lock_irq(&rds_conn_lock); + hlist_del_init_rcu(&conn->c_hash_node); + spin_unlock_irq(&rds_conn_lock); + synchronize_rcu(); + + /* shut the connection down */ + rds_conn_drop(conn); + flush_work(&conn->c_down_w); + + /* make sure lingering queued work won't try to ref the conn */ + cancel_delayed_work_sync(&conn->c_send_w); + cancel_delayed_work_sync(&conn->c_recv_w); + + /* tear down queued messages */ + list_for_each_entry_safe(rm, rtmp, + &conn->c_send_queue, + m_conn_item) { + list_del_init(&rm->m_conn_item); + BUG_ON(!list_empty(&rm->m_sock_item)); + rds_message_put(rm); + } + if (conn->c_xmit_rm) + rds_message_put(conn->c_xmit_rm); + + conn->c_trans->conn_free(conn->c_transport_data); + + /* + * The congestion maps aren't freed up here. They're + * freed by rds_cong_exit() after all the connections + * have been freed. + */ + rds_cong_remove_conn(conn); + + BUG_ON(!list_empty(&conn->c_retrans)); + kmem_cache_free(rds_conn_slab, conn); + + spin_lock_irqsave(&rds_conn_lock, flags); + rds_conn_count--; + spin_unlock_irqrestore(&rds_conn_lock, flags); +} +EXPORT_SYMBOL_GPL(rds_conn_destroy); + +static void rds_conn_message_info(struct socket *sock, unsigned int len, + struct rds_info_iterator *iter, + struct rds_info_lengths *lens, + int want_send) +{ + struct hlist_head *head; + struct list_head *list; + struct rds_connection *conn; + struct rds_message *rm; + unsigned int total = 0; + unsigned long flags; + size_t i; + + len /= sizeof(struct rds_info_message); + + rcu_read_lock(); + + for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash); + i++, head++) { + hlist_for_each_entry_rcu(conn, head, c_hash_node) { + if (want_send) + list = &conn->c_send_queue; + else + list = &conn->c_retrans; + + spin_lock_irqsave(&conn->c_lock, flags); + + /* XXX too lazy to maintain counts.. */ + list_for_each_entry(rm, list, m_conn_item) { + total++; + if (total <= len) + rds_inc_info_copy(&rm->m_inc, iter, + conn->c_laddr, + conn->c_faddr, 0); + } + + spin_unlock_irqrestore(&conn->c_lock, flags); + } + } + rcu_read_unlock(); + + lens->nr = total; + lens->each = sizeof(struct rds_info_message); +} + +static void rds_conn_message_info_send(struct socket *sock, unsigned int len, + struct rds_info_iterator *iter, + struct rds_info_lengths *lens) +{ + rds_conn_message_info(sock, len, iter, lens, 1); +} + +static void rds_conn_message_info_retrans(struct socket *sock, + unsigned int len, + struct rds_info_iterator *iter, + struct rds_info_lengths *lens) +{ + rds_conn_message_info(sock, len, iter, lens, 0); +} + +void rds_for_each_conn_info(struct socket *sock, unsigned int len, + struct rds_info_iterator *iter, + struct rds_info_lengths *lens, + int (*visitor)(struct rds_connection *, void *), + size_t item_len) +{ + uint64_t buffer[(item_len + 7) / 8]; + struct hlist_head *head; + struct rds_connection *conn; + size_t i; + + rcu_read_lock(); + + lens->nr = 0; + lens->each = item_len; + + for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash); + i++, head++) { + hlist_for_each_entry_rcu(conn, head, c_hash_node) { + + /* XXX no c_lock usage.. */ + if (!visitor(conn, buffer)) + continue; + + /* We copy as much as we can fit in the buffer, + * but we count all items so that the caller + * can resize the buffer. */ + if (len >= item_len) { + rds_info_copy(iter, buffer, item_len); + len -= item_len; + } + lens->nr++; + } + } + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(rds_for_each_conn_info); + +static int rds_conn_info_visitor(struct rds_connection *conn, + void *buffer) +{ + struct rds_info_connection *cinfo = buffer; + + cinfo->next_tx_seq = conn->c_next_tx_seq; + cinfo->next_rx_seq = conn->c_next_rx_seq; + cinfo->laddr = conn->c_laddr; + cinfo->faddr = conn->c_faddr; + strncpy(cinfo->transport, conn->c_trans->t_name, + sizeof(cinfo->transport)); + cinfo->flags = 0; + + rds_conn_info_set(cinfo->flags, test_bit(RDS_IN_XMIT, &conn->c_flags), + SENDING); + /* XXX Future: return the state rather than these funky bits */ + rds_conn_info_set(cinfo->flags, + atomic_read(&conn->c_state) == RDS_CONN_CONNECTING, + CONNECTING); + rds_conn_info_set(cinfo->flags, + atomic_read(&conn->c_state) == RDS_CONN_UP, + CONNECTED); + return 1; +} + +static void rds_conn_info(struct socket *sock, unsigned int len, + struct rds_info_iterator *iter, + struct rds_info_lengths *lens) +{ + rds_for_each_conn_info(sock, len, iter, lens, + rds_conn_info_visitor, + sizeof(struct rds_info_connection)); +} + +int rds_conn_init(void) +{ + rds_conn_slab = kmem_cache_create("rds_connection", + sizeof(struct rds_connection), + 0, 0, NULL); + if (!rds_conn_slab) + return -ENOMEM; + + rds_info_register_func(RDS_INFO_CONNECTIONS, rds_conn_info); + rds_info_register_func(RDS_INFO_SEND_MESSAGES, + rds_conn_message_info_send); + rds_info_register_func(RDS_INFO_RETRANS_MESSAGES, + rds_conn_message_info_retrans); + + return 0; +} + +void rds_conn_exit(void) +{ + rds_loop_exit(); + + WARN_ON(!hlist_empty(rds_conn_hash)); + + kmem_cache_destroy(rds_conn_slab); + + rds_info_deregister_func(RDS_INFO_CONNECTIONS, rds_conn_info); + rds_info_deregister_func(RDS_INFO_SEND_MESSAGES, + rds_conn_message_info_send); + rds_info_deregister_func(RDS_INFO_RETRANS_MESSAGES, + rds_conn_message_info_retrans); +} + +/* + * Force a disconnect + */ +void rds_conn_drop(struct rds_connection *conn) +{ + atomic_set(&conn->c_state, RDS_CONN_ERROR); + queue_work(rds_wq, &conn->c_down_w); +} +EXPORT_SYMBOL_GPL(rds_conn_drop); + +/* + * If the connection is down, trigger a connect. We may have scheduled a + * delayed reconnect however - in this case we should not interfere. + */ +void rds_conn_connect_if_down(struct rds_connection *conn) +{ + if (rds_conn_state(conn) == RDS_CONN_DOWN && + !test_and_set_bit(RDS_RECONNECT_PENDING, &conn->c_flags)) + queue_delayed_work(rds_wq, &conn->c_conn_w, 0); +} +EXPORT_SYMBOL_GPL(rds_conn_connect_if_down); + +/* + * An error occurred on the connection + */ +void +__rds_conn_error(struct rds_connection *conn, const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + vprintk(fmt, ap); + va_end(ap); + + rds_conn_drop(conn); +} diff --git a/net/rds/ib.c b/net/rds/ib.c new file mode 100644 index 0000000..ba2dffe --- /dev/null +++ b/net/rds/ib.c @@ -0,0 +1,435 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "rds.h" +#include "ib.h" + +static unsigned int fmr_pool_size = RDS_FMR_POOL_SIZE; +unsigned int fmr_message_size = RDS_FMR_SIZE + 1; /* +1 allows for unaligned MRs */ +unsigned int rds_ib_retry_count = RDS_IB_DEFAULT_RETRY_COUNT; + +module_param(fmr_pool_size, int, 0444); +MODULE_PARM_DESC(fmr_pool_size, " Max number of fmr per HCA"); +module_param(fmr_message_size, int, 0444); +MODULE_PARM_DESC(fmr_message_size, " Max size of a RDMA transfer"); +module_param(rds_ib_retry_count, int, 0444); +MODULE_PARM_DESC(rds_ib_retry_count, " Number of hw retries before reporting an error"); + +/* + * we have a clumsy combination of RCU and a rwsem protecting this list + * because it is used both in the get_mr fast path and while blocking in + * the FMR flushing path. + */ +DECLARE_RWSEM(rds_ib_devices_lock); +struct list_head rds_ib_devices; + +/* NOTE: if also grabbing ibdev lock, grab this first */ +DEFINE_SPINLOCK(ib_nodev_conns_lock); +LIST_HEAD(ib_nodev_conns); + +static void rds_ib_nodev_connect(void) +{ + struct rds_ib_connection *ic; + + spin_lock(&ib_nodev_conns_lock); + list_for_each_entry(ic, &ib_nodev_conns, ib_node) + rds_conn_connect_if_down(ic->conn); + spin_unlock(&ib_nodev_conns_lock); +} + +static void rds_ib_dev_shutdown(struct rds_ib_device *rds_ibdev) +{ + struct rds_ib_connection *ic; + unsigned long flags; + + spin_lock_irqsave(&rds_ibdev->spinlock, flags); + list_for_each_entry(ic, &rds_ibdev->conn_list, ib_node) + rds_conn_drop(ic->conn); + spin_unlock_irqrestore(&rds_ibdev->spinlock, flags); +} + +/* + * rds_ib_destroy_mr_pool() blocks on a few things and mrs drop references + * from interrupt context so we push freing off into a work struct in krdsd. + */ +static void rds_ib_dev_free(struct work_struct *work) +{ + struct rds_ib_ipaddr *i_ipaddr, *i_next; + struct rds_ib_device *rds_ibdev = container_of(work, + struct rds_ib_device, free_work); + + if (rds_ibdev->mr_pool) + rds_ib_destroy_mr_pool(rds_ibdev->mr_pool); + if (rds_ibdev->mr) + ib_dereg_mr(rds_ibdev->mr); + if (rds_ibdev->pd) + ib_dealloc_pd(rds_ibdev->pd); + + list_for_each_entry_safe(i_ipaddr, i_next, &rds_ibdev->ipaddr_list, list) { + list_del(&i_ipaddr->list); + kfree(i_ipaddr); + } + + kfree(rds_ibdev); +} + +void rds_ib_dev_put(struct rds_ib_device *rds_ibdev) +{ + BUG_ON(atomic_read(&rds_ibdev->refcount) <= 0); + if (atomic_dec_and_test(&rds_ibdev->refcount)) + queue_work(rds_wq, &rds_ibdev->free_work); +} + +static void rds_ib_add_one(struct ib_device *device) +{ + struct rds_ib_device *rds_ibdev; + struct ib_device_attr *dev_attr; + + /* Only handle IB (no iWARP) devices */ + if (device->node_type != RDMA_NODE_IB_CA) + return; + + dev_attr = kmalloc(sizeof *dev_attr, GFP_KERNEL); + if (!dev_attr) + return; + + if (ib_query_device(device, dev_attr)) { + rdsdebug("Query device failed for %s\n", device->name); + goto free_attr; + } + + rds_ibdev = kzalloc_node(sizeof(struct rds_ib_device), GFP_KERNEL, + ibdev_to_node(device)); + if (!rds_ibdev) + goto free_attr; + + spin_lock_init(&rds_ibdev->spinlock); + atomic_set(&rds_ibdev->refcount, 1); + INIT_WORK(&rds_ibdev->free_work, rds_ib_dev_free); + + rds_ibdev->max_wrs = dev_attr->max_qp_wr; + rds_ibdev->max_sge = min(dev_attr->max_sge, RDS_IB_MAX_SGE); + + rds_ibdev->fmr_max_remaps = dev_attr->max_map_per_fmr?: 32; + rds_ibdev->max_fmrs = dev_attr->max_fmr ? + min_t(unsigned int, dev_attr->max_fmr, fmr_pool_size) : + fmr_pool_size; + + rds_ibdev->max_initiator_depth = dev_attr->max_qp_init_rd_atom; + rds_ibdev->max_responder_resources = dev_attr->max_qp_rd_atom; + + rds_ibdev->dev = device; + rds_ibdev->pd = ib_alloc_pd(device); + if (IS_ERR(rds_ibdev->pd)) { + rds_ibdev->pd = NULL; + goto put_dev; + } + + rds_ibdev->mr = ib_get_dma_mr(rds_ibdev->pd, IB_ACCESS_LOCAL_WRITE); + if (IS_ERR(rds_ibdev->mr)) { + rds_ibdev->mr = NULL; + goto put_dev; + } + + rds_ibdev->mr_pool = rds_ib_create_mr_pool(rds_ibdev); + if (IS_ERR(rds_ibdev->mr_pool)) { + rds_ibdev->mr_pool = NULL; + goto put_dev; + } + + INIT_LIST_HEAD(&rds_ibdev->ipaddr_list); + INIT_LIST_HEAD(&rds_ibdev->conn_list); + + down_write(&rds_ib_devices_lock); + list_add_tail_rcu(&rds_ibdev->list, &rds_ib_devices); + up_write(&rds_ib_devices_lock); + atomic_inc(&rds_ibdev->refcount); + + ib_set_client_data(device, &rds_ib_client, rds_ibdev); + atomic_inc(&rds_ibdev->refcount); + + rds_ib_nodev_connect(); + +put_dev: + rds_ib_dev_put(rds_ibdev); +free_attr: + kfree(dev_attr); +} + +/* + * New connections use this to find the device to associate with the + * connection. It's not in the fast path so we're not concerned about the + * performance of the IB call. (As of this writing, it uses an interrupt + * blocking spinlock to serialize walking a per-device list of all registered + * clients.) + * + * RCU is used to handle incoming connections racing with device teardown. + * Rather than use a lock to serialize removal from the client_data and + * getting a new reference, we use an RCU grace period. The destruction + * path removes the device from client_data and then waits for all RCU + * readers to finish. + * + * A new connection can get NULL from this if its arriving on a + * device that is in the process of being removed. + */ +struct rds_ib_device *rds_ib_get_client_data(struct ib_device *device) +{ + struct rds_ib_device *rds_ibdev; + + rcu_read_lock(); + rds_ibdev = ib_get_client_data(device, &rds_ib_client); + if (rds_ibdev) + atomic_inc(&rds_ibdev->refcount); + rcu_read_unlock(); + return rds_ibdev; +} + +/* + * The IB stack is letting us know that a device is going away. This can + * happen if the underlying HCA driver is removed or if PCI hotplug is removing + * the pci function, for example. + * + * This can be called at any time and can be racing with any other RDS path. + */ +static void rds_ib_remove_one(struct ib_device *device) +{ + struct rds_ib_device *rds_ibdev; + + rds_ibdev = ib_get_client_data(device, &rds_ib_client); + if (!rds_ibdev) + return; + + rds_ib_dev_shutdown(rds_ibdev); + + /* stop connection attempts from getting a reference to this device. */ + ib_set_client_data(device, &rds_ib_client, NULL); + + down_write(&rds_ib_devices_lock); + list_del_rcu(&rds_ibdev->list); + up_write(&rds_ib_devices_lock); + + /* + * This synchronize rcu is waiting for readers of both the ib + * client data and the devices list to finish before we drop + * both of those references. + */ + synchronize_rcu(); + rds_ib_dev_put(rds_ibdev); + rds_ib_dev_put(rds_ibdev); +} + +struct ib_client rds_ib_client = { + .name = "rds_ib", + .add = rds_ib_add_one, + .remove = rds_ib_remove_one +}; + +static int rds_ib_conn_info_visitor(struct rds_connection *conn, + void *buffer) +{ + struct rds_info_rdma_connection *iinfo = buffer; + struct rds_ib_connection *ic; + + /* We will only ever look at IB transports */ + if (conn->c_trans != &rds_ib_transport) + return 0; + + iinfo->src_addr = conn->c_laddr; + iinfo->dst_addr = conn->c_faddr; + + memset(&iinfo->src_gid, 0, sizeof(iinfo->src_gid)); + memset(&iinfo->dst_gid, 0, sizeof(iinfo->dst_gid)); + if (rds_conn_state(conn) == RDS_CONN_UP) { + struct rds_ib_device *rds_ibdev; + struct rdma_dev_addr *dev_addr; + + ic = conn->c_transport_data; + dev_addr = &ic->i_cm_id->route.addr.dev_addr; + + rdma_addr_get_sgid(dev_addr, (union ib_gid *) &iinfo->src_gid); + rdma_addr_get_dgid(dev_addr, (union ib_gid *) &iinfo->dst_gid); + + rds_ibdev = ic->rds_ibdev; + iinfo->max_send_wr = ic->i_send_ring.w_nr; + iinfo->max_recv_wr = ic->i_recv_ring.w_nr; + iinfo->max_send_sge = rds_ibdev->max_sge; + rds_ib_get_mr_info(rds_ibdev, iinfo); + } + return 1; +} + +static void rds_ib_ic_info(struct socket *sock, unsigned int len, + struct rds_info_iterator *iter, + struct rds_info_lengths *lens) +{ + rds_for_each_conn_info(sock, len, iter, lens, + rds_ib_conn_info_visitor, + sizeof(struct rds_info_rdma_connection)); +} + + +/* + * Early RDS/IB was built to only bind to an address if there is an IPoIB + * device with that address set. + * + * If it were me, I'd advocate for something more flexible. Sending and + * receiving should be device-agnostic. Transports would try and maintain + * connections between peers who have messages queued. Userspace would be + * allowed to influence which paths have priority. We could call userspace + * asserting this policy "routing". + */ +static int rds_ib_laddr_check(__be32 addr) +{ + int ret; + struct rdma_cm_id *cm_id; + struct sockaddr_in sin; + + /* Create a CMA ID and try to bind it. This catches both + * IB and iWARP capable NICs. + */ + cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP, IB_QPT_RC); + if (IS_ERR(cm_id)) + return PTR_ERR(cm_id); + + memset(&sin, 0, sizeof(sin)); + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = addr; + + /* rdma_bind_addr will only succeed for IB & iWARP devices */ + ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin); + /* due to this, we will claim to support iWARP devices unless we + check node_type. */ + if (ret || !cm_id->device || + cm_id->device->node_type != RDMA_NODE_IB_CA) + ret = -EADDRNOTAVAIL; + + rdsdebug("addr %pI4 ret %d node type %d\n", + &addr, ret, + cm_id->device ? cm_id->device->node_type : -1); + + rdma_destroy_id(cm_id); + + return ret; +} + +static void rds_ib_unregister_client(void) +{ + ib_unregister_client(&rds_ib_client); + /* wait for rds_ib_dev_free() to complete */ + flush_workqueue(rds_wq); +} + +void rds_ib_exit(void) +{ + rds_info_deregister_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info); + rds_ib_unregister_client(); + rds_ib_destroy_nodev_conns(); + rds_ib_sysctl_exit(); + rds_ib_recv_exit(); + rds_trans_unregister(&rds_ib_transport); +} + +struct rds_transport rds_ib_transport = { + .laddr_check = rds_ib_laddr_check, + .xmit_complete = rds_ib_xmit_complete, + .xmit = rds_ib_xmit, + .xmit_rdma = rds_ib_xmit_rdma, + .xmit_atomic = rds_ib_xmit_atomic, + .recv = rds_ib_recv, + .conn_alloc = rds_ib_conn_alloc, + .conn_free = rds_ib_conn_free, + .conn_connect = rds_ib_conn_connect, + .conn_shutdown = rds_ib_conn_shutdown, + .inc_copy_to_user = rds_ib_inc_copy_to_user, + .inc_free = rds_ib_inc_free, + .cm_initiate_connect = rds_ib_cm_initiate_connect, + .cm_handle_connect = rds_ib_cm_handle_connect, + .cm_connect_complete = rds_ib_cm_connect_complete, + .stats_info_copy = rds_ib_stats_info_copy, + .exit = rds_ib_exit, + .get_mr = rds_ib_get_mr, + .sync_mr = rds_ib_sync_mr, + .free_mr = rds_ib_free_mr, + .flush_mrs = rds_ib_flush_mrs, + .t_owner = THIS_MODULE, + .t_name = "infiniband", + .t_type = RDS_TRANS_IB +}; + +int rds_ib_init(void) +{ + int ret; + + INIT_LIST_HEAD(&rds_ib_devices); + + ret = ib_register_client(&rds_ib_client); + if (ret) + goto out; + + ret = rds_ib_sysctl_init(); + if (ret) + goto out_ibreg; + + ret = rds_ib_recv_init(); + if (ret) + goto out_sysctl; + + ret = rds_trans_register(&rds_ib_transport); + if (ret) + goto out_recv; + + rds_info_register_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info); + + goto out; + +out_recv: + rds_ib_recv_exit(); +out_sysctl: + rds_ib_sysctl_exit(); +out_ibreg: + rds_ib_unregister_client(); +out: + return ret; +} + +MODULE_LICENSE("GPL"); + diff --git a/net/rds/ib.h b/net/rds/ib.h new file mode 100644 index 0000000..7280ab8 --- /dev/null +++ b/net/rds/ib.h @@ -0,0 +1,373 @@ +#ifndef _RDS_IB_H +#define _RDS_IB_H + +#include +#include +#include +#include +#include +#include "rds.h" +#include "rdma_transport.h" + +#define RDS_FMR_SIZE 256 +#define RDS_FMR_POOL_SIZE 8192 + +#define RDS_IB_MAX_SGE 8 +#define RDS_IB_RECV_SGE 2 + +#define RDS_IB_DEFAULT_RECV_WR 1024 +#define RDS_IB_DEFAULT_SEND_WR 256 + +#define RDS_IB_DEFAULT_RETRY_COUNT 2 + +#define RDS_IB_SUPPORTED_PROTOCOLS 0x00000003 /* minor versions supported */ + +#define RDS_IB_RECYCLE_BATCH_COUNT 32 + +extern struct rw_semaphore rds_ib_devices_lock; +extern struct list_head rds_ib_devices; + +/* + * IB posts RDS_FRAG_SIZE fragments of pages to the receive queues to + * try and minimize the amount of memory tied up both the device and + * socket receive queues. + */ +struct rds_page_frag { + struct list_head f_item; + struct list_head f_cache_entry; + struct scatterlist f_sg; +}; + +struct rds_ib_incoming { + struct list_head ii_frags; + struct list_head ii_cache_entry; + struct rds_incoming ii_inc; +}; + +struct rds_ib_cache_head { + struct list_head *first; + unsigned long count; +}; + +struct rds_ib_refill_cache { + struct rds_ib_cache_head __percpu *percpu; + struct list_head *xfer; + struct list_head *ready; +}; + +struct rds_ib_connect_private { + /* Add new fields at the end, and don't permute existing fields. */ + __be32 dp_saddr; + __be32 dp_daddr; + u8 dp_protocol_major; + u8 dp_protocol_minor; + __be16 dp_protocol_minor_mask; /* bitmask */ + __be32 dp_reserved1; + __be64 dp_ack_seq; + __be32 dp_credit; /* non-zero enables flow ctl */ +}; + +struct rds_ib_send_work { + void *s_op; + struct ib_send_wr s_wr; + struct ib_sge s_sge[RDS_IB_MAX_SGE]; + unsigned long s_queued; +}; + +struct rds_ib_recv_work { + struct rds_ib_incoming *r_ibinc; + struct rds_page_frag *r_frag; + struct ib_recv_wr r_wr; + struct ib_sge r_sge[2]; +}; + +struct rds_ib_work_ring { + u32 w_nr; + u32 w_alloc_ptr; + u32 w_alloc_ctr; + u32 w_free_ptr; + atomic_t w_free_ctr; +}; + +struct rds_ib_device; + +struct rds_ib_connection { + + struct list_head ib_node; + struct rds_ib_device *rds_ibdev; + struct rds_connection *conn; + + /* alphabet soup, IBTA style */ + struct rdma_cm_id *i_cm_id; + struct ib_pd *i_pd; + struct ib_mr *i_mr; + struct ib_cq *i_send_cq; + struct ib_cq *i_recv_cq; + + /* tx */ + struct rds_ib_work_ring i_send_ring; + struct rm_data_op *i_data_op; + struct rds_header *i_send_hdrs; + u64 i_send_hdrs_dma; + struct rds_ib_send_work *i_sends; + atomic_t i_signaled_sends; + + /* rx */ + struct tasklet_struct i_recv_tasklet; + struct mutex i_recv_mutex; + struct rds_ib_work_ring i_recv_ring; + struct rds_ib_incoming *i_ibinc; + u32 i_recv_data_rem; + struct rds_header *i_recv_hdrs; + u64 i_recv_hdrs_dma; + struct rds_ib_recv_work *i_recvs; + u64 i_ack_recv; /* last ACK received */ + struct rds_ib_refill_cache i_cache_incs; + struct rds_ib_refill_cache i_cache_frags; + + /* sending acks */ + unsigned long i_ack_flags; +#ifdef KERNEL_HAS_ATOMIC64 + atomic64_t i_ack_next; /* next ACK to send */ +#else + spinlock_t i_ack_lock; /* protect i_ack_next */ + u64 i_ack_next; /* next ACK to send */ +#endif + struct rds_header *i_ack; + struct ib_send_wr i_ack_wr; + struct ib_sge i_ack_sge; + u64 i_ack_dma; + unsigned long i_ack_queued; + + /* Flow control related information + * + * Our algorithm uses a pair variables that we need to access + * atomically - one for the send credits, and one posted + * recv credits we need to transfer to remote. + * Rather than protect them using a slow spinlock, we put both into + * a single atomic_t and update it using cmpxchg + */ + atomic_t i_credits; + + /* Protocol version specific information */ + unsigned int i_flowctl:1; /* enable/disable flow ctl */ + + /* Batched completions */ + unsigned int i_unsignaled_wrs; +}; + +/* This assumes that atomic_t is at least 32 bits */ +#define IB_GET_SEND_CREDITS(v) ((v) & 0xffff) +#define IB_GET_POST_CREDITS(v) ((v) >> 16) +#define IB_SET_SEND_CREDITS(v) ((v) & 0xffff) +#define IB_SET_POST_CREDITS(v) ((v) << 16) + +struct rds_ib_ipaddr { + struct list_head list; + __be32 ipaddr; +}; + +struct rds_ib_device { + struct list_head list; + struct list_head ipaddr_list; + struct list_head conn_list; + struct ib_device *dev; + struct ib_pd *pd; + struct ib_mr *mr; + struct rds_ib_mr_pool *mr_pool; + unsigned int fmr_max_remaps; + unsigned int max_fmrs; + int max_sge; + unsigned int max_wrs; + unsigned int max_initiator_depth; + unsigned int max_responder_resources; + spinlock_t spinlock; /* protect the above */ + atomic_t refcount; + struct work_struct free_work; +}; + +#define ibdev_to_node(ibdev) dev_to_node(ibdev->dma_device) +#define rdsibdev_to_node(rdsibdev) ibdev_to_node(rdsibdev->dev) + +/* bits for i_ack_flags */ +#define IB_ACK_IN_FLIGHT 0 +#define IB_ACK_REQUESTED 1 + +/* Magic WR_ID for ACKs */ +#define RDS_IB_ACK_WR_ID (~(u64) 0) + +struct rds_ib_statistics { + uint64_t s_ib_connect_raced; + uint64_t s_ib_listen_closed_stale; + uint64_t s_ib_tx_cq_call; + uint64_t s_ib_tx_cq_event; + uint64_t s_ib_tx_ring_full; + uint64_t s_ib_tx_throttle; + uint64_t s_ib_tx_sg_mapping_failure; + uint64_t s_ib_tx_stalled; + uint64_t s_ib_tx_credit_updates; + uint64_t s_ib_rx_cq_call; + uint64_t s_ib_rx_cq_event; + uint64_t s_ib_rx_ring_empty; + uint64_t s_ib_rx_refill_from_cq; + uint64_t s_ib_rx_refill_from_thread; + uint64_t s_ib_rx_alloc_limit; + uint64_t s_ib_rx_credit_updates; + uint64_t s_ib_ack_sent; + uint64_t s_ib_ack_send_failure; + uint64_t s_ib_ack_send_delayed; + uint64_t s_ib_ack_send_piggybacked; + uint64_t s_ib_ack_received; + uint64_t s_ib_rdma_mr_alloc; + uint64_t s_ib_rdma_mr_free; + uint64_t s_ib_rdma_mr_used; + uint64_t s_ib_rdma_mr_pool_flush; + uint64_t s_ib_rdma_mr_pool_wait; + uint64_t s_ib_rdma_mr_pool_depleted; + uint64_t s_ib_atomic_cswp; + uint64_t s_ib_atomic_fadd; +}; + +extern struct workqueue_struct *rds_ib_wq; + +/* + * Fake ib_dma_sync_sg_for_{cpu,device} as long as ib_verbs.h + * doesn't define it. + */ +static inline void rds_ib_dma_sync_sg_for_cpu(struct ib_device *dev, + struct scatterlist *sg, unsigned int sg_dma_len, int direction) +{ + unsigned int i; + + for (i = 0; i < sg_dma_len; ++i) { + ib_dma_sync_single_for_cpu(dev, + ib_sg_dma_address(dev, &sg[i]), + ib_sg_dma_len(dev, &sg[i]), + direction); + } +} +#define ib_dma_sync_sg_for_cpu rds_ib_dma_sync_sg_for_cpu + +static inline void rds_ib_dma_sync_sg_for_device(struct ib_device *dev, + struct scatterlist *sg, unsigned int sg_dma_len, int direction) +{ + unsigned int i; + + for (i = 0; i < sg_dma_len; ++i) { + ib_dma_sync_single_for_device(dev, + ib_sg_dma_address(dev, &sg[i]), + ib_sg_dma_len(dev, &sg[i]), + direction); + } +} +#define ib_dma_sync_sg_for_device rds_ib_dma_sync_sg_for_device + + +/* ib.c */ +extern struct rds_transport rds_ib_transport; +struct rds_ib_device *rds_ib_get_client_data(struct ib_device *device); +void rds_ib_dev_put(struct rds_ib_device *rds_ibdev); +extern struct ib_client rds_ib_client; + +extern unsigned int fmr_message_size; +extern unsigned int rds_ib_retry_count; + +extern spinlock_t ib_nodev_conns_lock; +extern struct list_head ib_nodev_conns; + +/* ib_cm.c */ +int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp); +void rds_ib_conn_free(void *arg); +int rds_ib_conn_connect(struct rds_connection *conn); +void rds_ib_conn_shutdown(struct rds_connection *conn); +void rds_ib_state_change(struct sock *sk); +int rds_ib_listen_init(void); +void rds_ib_listen_stop(void); +void __rds_ib_conn_error(struct rds_connection *conn, const char *, ...); +int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id, + struct rdma_cm_event *event); +int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id); +void rds_ib_cm_connect_complete(struct rds_connection *conn, + struct rdma_cm_event *event); + + +#define rds_ib_conn_error(conn, fmt...) \ + __rds_ib_conn_error(conn, KERN_WARNING "RDS/IB: " fmt) + +/* ib_rdma.c */ +int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr); +void rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn); +void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn); +void rds_ib_destroy_nodev_conns(void); +struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *); +void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_connection *iinfo); +void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *); +void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, + struct rds_sock *rs, u32 *key_ret); +void rds_ib_sync_mr(void *trans_private, int dir); +void rds_ib_free_mr(void *trans_private, int invalidate); +void rds_ib_flush_mrs(void); + +/* ib_recv.c */ +int rds_ib_recv_init(void); +void rds_ib_recv_exit(void); +int rds_ib_recv(struct rds_connection *conn); +int rds_ib_recv_alloc_caches(struct rds_ib_connection *ic); +void rds_ib_recv_free_caches(struct rds_ib_connection *ic); +void rds_ib_recv_refill(struct rds_connection *conn, int prefill); +void rds_ib_inc_free(struct rds_incoming *inc); +int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov, + size_t size); +void rds_ib_recv_cq_comp_handler(struct ib_cq *cq, void *context); +void rds_ib_recv_tasklet_fn(unsigned long data); +void rds_ib_recv_init_ring(struct rds_ib_connection *ic); +void rds_ib_recv_clear_ring(struct rds_ib_connection *ic); +void rds_ib_recv_init_ack(struct rds_ib_connection *ic); +void rds_ib_attempt_ack(struct rds_ib_connection *ic); +void rds_ib_ack_send_complete(struct rds_ib_connection *ic); +u64 rds_ib_piggyb_ack(struct rds_ib_connection *ic); + +/* ib_ring.c */ +void rds_ib_ring_init(struct rds_ib_work_ring *ring, u32 nr); +void rds_ib_ring_resize(struct rds_ib_work_ring *ring, u32 nr); +u32 rds_ib_ring_alloc(struct rds_ib_work_ring *ring, u32 val, u32 *pos); +void rds_ib_ring_free(struct rds_ib_work_ring *ring, u32 val); +void rds_ib_ring_unalloc(struct rds_ib_work_ring *ring, u32 val); +int rds_ib_ring_empty(struct rds_ib_work_ring *ring); +int rds_ib_ring_low(struct rds_ib_work_ring *ring); +u32 rds_ib_ring_oldest(struct rds_ib_work_ring *ring); +u32 rds_ib_ring_completed(struct rds_ib_work_ring *ring, u32 wr_id, u32 oldest); +extern wait_queue_head_t rds_ib_ring_empty_wait; + +/* ib_send.c */ +char *rds_ib_wc_status_str(enum ib_wc_status status); +void rds_ib_xmit_complete(struct rds_connection *conn); +int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, + unsigned int hdr_off, unsigned int sg, unsigned int off); +void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context); +void rds_ib_send_init_ring(struct rds_ib_connection *ic); +void rds_ib_send_clear_ring(struct rds_ib_connection *ic); +int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op); +void rds_ib_send_add_credits(struct rds_connection *conn, unsigned int credits); +void rds_ib_advertise_credits(struct rds_connection *conn, unsigned int posted); +int rds_ib_send_grab_credits(struct rds_ib_connection *ic, u32 wanted, + u32 *adv_credits, int need_posted, int max_posted); +int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op); + +/* ib_stats.c */ +DECLARE_PER_CPU(struct rds_ib_statistics, rds_ib_stats); +#define rds_ib_stats_inc(member) rds_stats_inc_which(rds_ib_stats, member) +unsigned int rds_ib_stats_info_copy(struct rds_info_iterator *iter, + unsigned int avail); + +/* ib_sysctl.c */ +int rds_ib_sysctl_init(void); +void rds_ib_sysctl_exit(void); +extern unsigned long rds_ib_sysctl_max_send_wr; +extern unsigned long rds_ib_sysctl_max_recv_wr; +extern unsigned long rds_ib_sysctl_max_unsig_wrs; +extern unsigned long rds_ib_sysctl_max_unsig_bytes; +extern unsigned long rds_ib_sysctl_max_recv_allocation; +extern unsigned int rds_ib_sysctl_flow_control; + +#endif diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c new file mode 100644 index 0000000..31b74f5 --- /dev/null +++ b/net/rds/ib_cm.c @@ -0,0 +1,829 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include +#include +#include +#include +#include + +#include "rds.h" +#include "ib.h" + +static char *rds_ib_event_type_strings[] = { +#define RDS_IB_EVENT_STRING(foo) \ + [IB_EVENT_##foo] = __stringify(IB_EVENT_##foo) + RDS_IB_EVENT_STRING(CQ_ERR), + RDS_IB_EVENT_STRING(QP_FATAL), + RDS_IB_EVENT_STRING(QP_REQ_ERR), + RDS_IB_EVENT_STRING(QP_ACCESS_ERR), + RDS_IB_EVENT_STRING(COMM_EST), + RDS_IB_EVENT_STRING(SQ_DRAINED), + RDS_IB_EVENT_STRING(PATH_MIG), + RDS_IB_EVENT_STRING(PATH_MIG_ERR), + RDS_IB_EVENT_STRING(DEVICE_FATAL), + RDS_IB_EVENT_STRING(PORT_ACTIVE), + RDS_IB_EVENT_STRING(PORT_ERR), + RDS_IB_EVENT_STRING(LID_CHANGE), + RDS_IB_EVENT_STRING(PKEY_CHANGE), + RDS_IB_EVENT_STRING(SM_CHANGE), + RDS_IB_EVENT_STRING(SRQ_ERR), + RDS_IB_EVENT_STRING(SRQ_LIMIT_REACHED), + RDS_IB_EVENT_STRING(QP_LAST_WQE_REACHED), + RDS_IB_EVENT_STRING(CLIENT_REREGISTER), +#undef RDS_IB_EVENT_STRING +}; + +static char *rds_ib_event_str(enum ib_event_type type) +{ + return rds_str_array(rds_ib_event_type_strings, + ARRAY_SIZE(rds_ib_event_type_strings), type); +}; + +/* + * Set the selected protocol version + */ +static void rds_ib_set_protocol(struct rds_connection *conn, unsigned int version) +{ + conn->c_version = version; +} + +/* + * Set up flow control + */ +static void rds_ib_set_flow_control(struct rds_connection *conn, u32 credits) +{ + struct rds_ib_connection *ic = conn->c_transport_data; + + if (rds_ib_sysctl_flow_control && credits != 0) { + /* We're doing flow control */ + ic->i_flowctl = 1; + rds_ib_send_add_credits(conn, credits); + } else { + ic->i_flowctl = 0; + } +} + +/* + * Tune RNR behavior. Without flow control, we use a rather + * low timeout, but not the absolute minimum - this should + * be tunable. + * + * We already set the RNR retry count to 7 (which is the + * smallest infinite number :-) above. + * If flow control is off, we want to change this back to 0 + * so that we learn quickly when our credit accounting is + * buggy. + * + * Caller passes in a qp_attr pointer - don't waste stack spacv + * by allocation this twice. + */ +static void +rds_ib_tune_rnr(struct rds_ib_connection *ic, struct ib_qp_attr *attr) +{ + int ret; + + attr->min_rnr_timer = IB_RNR_TIMER_000_32; + ret = ib_modify_qp(ic->i_cm_id->qp, attr, IB_QP_MIN_RNR_TIMER); + if (ret) + printk(KERN_NOTICE "ib_modify_qp(IB_QP_MIN_RNR_TIMER): err=%d\n", -ret); +} + +/* + * Connection established. + * We get here for both outgoing and incoming connection. + */ +void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_event *event) +{ + const struct rds_ib_connect_private *dp = NULL; + struct rds_ib_connection *ic = conn->c_transport_data; + struct ib_qp_attr qp_attr; + int err; + + if (event->param.conn.private_data_len >= sizeof(*dp)) { + dp = event->param.conn.private_data; + + /* make sure it isn't empty data */ + if (dp->dp_protocol_major) { + rds_ib_set_protocol(conn, + RDS_PROTOCOL(dp->dp_protocol_major, + dp->dp_protocol_minor)); + rds_ib_set_flow_control(conn, be32_to_cpu(dp->dp_credit)); + } + } + + if (conn->c_version < RDS_PROTOCOL(3,1)) { + printk(KERN_NOTICE "RDS/IB: Connection to %pI4 version %u.%u failed," + " no longer supported\n", + &conn->c_faddr, + RDS_PROTOCOL_MAJOR(conn->c_version), + RDS_PROTOCOL_MINOR(conn->c_version)); + rds_conn_destroy(conn); + return; + } else { + printk(KERN_NOTICE "RDS/IB: connected to %pI4 version %u.%u%s\n", + &conn->c_faddr, + RDS_PROTOCOL_MAJOR(conn->c_version), + RDS_PROTOCOL_MINOR(conn->c_version), + ic->i_flowctl ? ", flow control" : ""); + } + + /* + * Init rings and fill recv. this needs to wait until protocol negotiation + * is complete, since ring layout is different from 3.0 to 3.1. + */ + rds_ib_send_init_ring(ic); + rds_ib_recv_init_ring(ic); + /* Post receive buffers - as a side effect, this will update + * the posted credit count. */ + rds_ib_recv_refill(conn, 1); + + /* Tune RNR behavior */ + rds_ib_tune_rnr(ic, &qp_attr); + + qp_attr.qp_state = IB_QPS_RTS; + err = ib_modify_qp(ic->i_cm_id->qp, &qp_attr, IB_QP_STATE); + if (err) + printk(KERN_NOTICE "ib_modify_qp(IB_QP_STATE, RTS): err=%d\n", err); + + /* update ib_device with this local ipaddr */ + err = rds_ib_update_ipaddr(ic->rds_ibdev, conn->c_laddr); + if (err) + printk(KERN_ERR "rds_ib_update_ipaddr failed (%d)\n", + err); + + /* If the peer gave us the last packet it saw, process this as if + * we had received a regular ACK. */ + if (dp && dp->dp_ack_seq) + rds_send_drop_acked(conn, be64_to_cpu(dp->dp_ack_seq), NULL); + + rds_connect_complete(conn); +} + +static void rds_ib_cm_fill_conn_param(struct rds_connection *conn, + struct rdma_conn_param *conn_param, + struct rds_ib_connect_private *dp, + u32 protocol_version, + u32 max_responder_resources, + u32 max_initiator_depth) +{ + struct rds_ib_connection *ic = conn->c_transport_data; + struct rds_ib_device *rds_ibdev = ic->rds_ibdev; + + memset(conn_param, 0, sizeof(struct rdma_conn_param)); + + conn_param->responder_resources = + min_t(u32, rds_ibdev->max_responder_resources, max_responder_resources); + conn_param->initiator_depth = + min_t(u32, rds_ibdev->max_initiator_depth, max_initiator_depth); + conn_param->retry_count = min_t(unsigned int, rds_ib_retry_count, 7); + conn_param->rnr_retry_count = 7; + + if (dp) { + memset(dp, 0, sizeof(*dp)); + dp->dp_saddr = conn->c_laddr; + dp->dp_daddr = conn->c_faddr; + dp->dp_protocol_major = RDS_PROTOCOL_MAJOR(protocol_version); + dp->dp_protocol_minor = RDS_PROTOCOL_MINOR(protocol_version); + dp->dp_protocol_minor_mask = cpu_to_be16(RDS_IB_SUPPORTED_PROTOCOLS); + dp->dp_ack_seq = rds_ib_piggyb_ack(ic); + + /* Advertise flow control */ + if (ic->i_flowctl) { + unsigned int credits; + + credits = IB_GET_POST_CREDITS(atomic_read(&ic->i_credits)); + dp->dp_credit = cpu_to_be32(credits); + atomic_sub(IB_SET_POST_CREDITS(credits), &ic->i_credits); + } + + conn_param->private_data = dp; + conn_param->private_data_len = sizeof(*dp); + } +} + +static void rds_ib_cq_event_handler(struct ib_event *event, void *data) +{ + rdsdebug("event %u (%s) data %p\n", + event->event, rds_ib_event_str(event->event), data); +} + +static void rds_ib_qp_event_handler(struct ib_event *event, void *data) +{ + struct rds_connection *conn = data; + struct rds_ib_connection *ic = conn->c_transport_data; + + rdsdebug("conn %p ic %p event %u (%s)\n", conn, ic, event->event, + rds_ib_event_str(event->event)); + + switch (event->event) { + case IB_EVENT_COMM_EST: + rdma_notify(ic->i_cm_id, IB_EVENT_COMM_EST); + break; + default: + rdsdebug("Fatal QP Event %u (%s) " + "- connection %pI4->%pI4, reconnecting\n", + event->event, rds_ib_event_str(event->event), + &conn->c_laddr, &conn->c_faddr); + rds_conn_drop(conn); + break; + } +} + +/* + * This needs to be very careful to not leave IS_ERR pointers around for + * cleanup to trip over. + */ +static int rds_ib_setup_qp(struct rds_connection *conn) +{ + struct rds_ib_connection *ic = conn->c_transport_data; + struct ib_device *dev = ic->i_cm_id->device; + struct ib_qp_init_attr attr; + struct rds_ib_device *rds_ibdev; + int ret; + + /* + * It's normal to see a null device if an incoming connection races + * with device removal, so we don't print a warning. + */ + rds_ibdev = rds_ib_get_client_data(dev); + if (!rds_ibdev) + return -EOPNOTSUPP; + + /* add the conn now so that connection establishment has the dev */ + rds_ib_add_conn(rds_ibdev, conn); + + if (rds_ibdev->max_wrs < ic->i_send_ring.w_nr + 1) + rds_ib_ring_resize(&ic->i_send_ring, rds_ibdev->max_wrs - 1); + if (rds_ibdev->max_wrs < ic->i_recv_ring.w_nr + 1) + rds_ib_ring_resize(&ic->i_recv_ring, rds_ibdev->max_wrs - 1); + + /* Protection domain and memory range */ + ic->i_pd = rds_ibdev->pd; + ic->i_mr = rds_ibdev->mr; + + ic->i_send_cq = ib_create_cq(dev, rds_ib_send_cq_comp_handler, + rds_ib_cq_event_handler, conn, + ic->i_send_ring.w_nr + 1, 0); + if (IS_ERR(ic->i_send_cq)) { + ret = PTR_ERR(ic->i_send_cq); + ic->i_send_cq = NULL; + rdsdebug("ib_create_cq send failed: %d\n", ret); + goto out; + } + + ic->i_recv_cq = ib_create_cq(dev, rds_ib_recv_cq_comp_handler, + rds_ib_cq_event_handler, conn, + ic->i_recv_ring.w_nr, 0); + if (IS_ERR(ic->i_recv_cq)) { + ret = PTR_ERR(ic->i_recv_cq); + ic->i_recv_cq = NULL; + rdsdebug("ib_create_cq recv failed: %d\n", ret); + goto out; + } + + ret = ib_req_notify_cq(ic->i_send_cq, IB_CQ_NEXT_COMP); + if (ret) { + rdsdebug("ib_req_notify_cq send failed: %d\n", ret); + goto out; + } + + ret = ib_req_notify_cq(ic->i_recv_cq, IB_CQ_SOLICITED); + if (ret) { + rdsdebug("ib_req_notify_cq recv failed: %d\n", ret); + goto out; + } + + /* XXX negotiate max send/recv with remote? */ + memset(&attr, 0, sizeof(attr)); + attr.event_handler = rds_ib_qp_event_handler; + attr.qp_context = conn; + /* + 1 to allow for the single ack message */ + attr.cap.max_send_wr = ic->i_send_ring.w_nr + 1; + attr.cap.max_recv_wr = ic->i_recv_ring.w_nr + 1; + attr.cap.max_send_sge = rds_ibdev->max_sge; + attr.cap.max_recv_sge = RDS_IB_RECV_SGE; + attr.sq_sig_type = IB_SIGNAL_REQ_WR; + attr.qp_type = IB_QPT_RC; + attr.send_cq = ic->i_send_cq; + attr.recv_cq = ic->i_recv_cq; + + /* + * XXX this can fail if max_*_wr is too large? Are we supposed + * to back off until we get a value that the hardware can support? + */ + ret = rdma_create_qp(ic->i_cm_id, ic->i_pd, &attr); + if (ret) { + rdsdebug("rdma_create_qp failed: %d\n", ret); + goto out; + } + + ic->i_send_hdrs = ib_dma_alloc_coherent(dev, + ic->i_send_ring.w_nr * + sizeof(struct rds_header), + &ic->i_send_hdrs_dma, GFP_KERNEL); + if (!ic->i_send_hdrs) { + ret = -ENOMEM; + rdsdebug("ib_dma_alloc_coherent send failed\n"); + goto out; + } + + ic->i_recv_hdrs = ib_dma_alloc_coherent(dev, + ic->i_recv_ring.w_nr * + sizeof(struct rds_header), + &ic->i_recv_hdrs_dma, GFP_KERNEL); + if (!ic->i_recv_hdrs) { + ret = -ENOMEM; + rdsdebug("ib_dma_alloc_coherent recv failed\n"); + goto out; + } + + ic->i_ack = ib_dma_alloc_coherent(dev, sizeof(struct rds_header), + &ic->i_ack_dma, GFP_KERNEL); + if (!ic->i_ack) { + ret = -ENOMEM; + rdsdebug("ib_dma_alloc_coherent ack failed\n"); + goto out; + } + + ic->i_sends = vzalloc_node(ic->i_send_ring.w_nr * sizeof(struct rds_ib_send_work), + ibdev_to_node(dev)); + if (!ic->i_sends) { + ret = -ENOMEM; + rdsdebug("send allocation failed\n"); + goto out; + } + + ic->i_recvs = vzalloc_node(ic->i_recv_ring.w_nr * sizeof(struct rds_ib_recv_work), + ibdev_to_node(dev)); + if (!ic->i_recvs) { + ret = -ENOMEM; + rdsdebug("recv allocation failed\n"); + goto out; + } + + rds_ib_recv_init_ack(ic); + + rdsdebug("conn %p pd %p mr %p cq %p %p\n", conn, ic->i_pd, ic->i_mr, + ic->i_send_cq, ic->i_recv_cq); + +out: + rds_ib_dev_put(rds_ibdev); + return ret; +} + +static u32 rds_ib_protocol_compatible(struct rdma_cm_event *event) +{ + const struct rds_ib_connect_private *dp = event->param.conn.private_data; + u16 common; + u32 version = 0; + + /* + * rdma_cm private data is odd - when there is any private data in the + * request, we will be given a pretty large buffer without telling us the + * original size. The only way to tell the difference is by looking at + * the contents, which are initialized to zero. + * If the protocol version fields aren't set, this is a connection attempt + * from an older version. This could could be 3.0 or 2.0 - we can't tell. + * We really should have changed this for OFED 1.3 :-( + */ + + /* Be paranoid. RDS always has privdata */ + if (!event->param.conn.private_data_len) { + printk(KERN_NOTICE "RDS incoming connection has no private data, " + "rejecting\n"); + return 0; + } + + /* Even if len is crap *now* I still want to check it. -ASG */ + if (event->param.conn.private_data_len < sizeof (*dp) || + dp->dp_protocol_major == 0) + return RDS_PROTOCOL_3_0; + + common = be16_to_cpu(dp->dp_protocol_minor_mask) & RDS_IB_SUPPORTED_PROTOCOLS; + if (dp->dp_protocol_major == 3 && common) { + version = RDS_PROTOCOL_3_0; + while ((common >>= 1) != 0) + version++; + } else + printk_ratelimited(KERN_NOTICE "RDS: Connection from %pI4 using incompatible protocol version %u.%u\n", + &dp->dp_saddr, + dp->dp_protocol_major, + dp->dp_protocol_minor); + return version; +} + +int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id, + struct rdma_cm_event *event) +{ + __be64 lguid = cm_id->route.path_rec->sgid.global.interface_id; + __be64 fguid = cm_id->route.path_rec->dgid.global.interface_id; + const struct rds_ib_connect_private *dp = event->param.conn.private_data; + struct rds_ib_connect_private dp_rep; + struct rds_connection *conn = NULL; + struct rds_ib_connection *ic = NULL; + struct rdma_conn_param conn_param; + u32 version; + int err = 1, destroy = 1; + + /* Check whether the remote protocol version matches ours. */ + version = rds_ib_protocol_compatible(event); + if (!version) + goto out; + + rdsdebug("saddr %pI4 daddr %pI4 RDSv%u.%u lguid 0x%llx fguid " + "0x%llx\n", &dp->dp_saddr, &dp->dp_daddr, + RDS_PROTOCOL_MAJOR(version), RDS_PROTOCOL_MINOR(version), + (unsigned long long)be64_to_cpu(lguid), + (unsigned long long)be64_to_cpu(fguid)); + + conn = rds_conn_create(dp->dp_daddr, dp->dp_saddr, &rds_ib_transport, + GFP_KERNEL); + if (IS_ERR(conn)) { + rdsdebug("rds_conn_create failed (%ld)\n", PTR_ERR(conn)); + conn = NULL; + goto out; + } + + /* + * The connection request may occur while the + * previous connection exist, e.g. in case of failover. + * But as connections may be initiated simultaneously + * by both hosts, we have a random backoff mechanism - + * see the comment above rds_queue_reconnect() + */ + mutex_lock(&conn->c_cm_lock); + if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING)) { + if (rds_conn_state(conn) == RDS_CONN_UP) { + rdsdebug("incoming connect while connecting\n"); + rds_conn_drop(conn); + rds_ib_stats_inc(s_ib_listen_closed_stale); + } else + if (rds_conn_state(conn) == RDS_CONN_CONNECTING) { + /* Wait and see - our connect may still be succeeding */ + rds_ib_stats_inc(s_ib_connect_raced); + } + goto out; + } + + ic = conn->c_transport_data; + + rds_ib_set_protocol(conn, version); + rds_ib_set_flow_control(conn, be32_to_cpu(dp->dp_credit)); + + /* If the peer gave us the last packet it saw, process this as if + * we had received a regular ACK. */ + if (dp->dp_ack_seq) + rds_send_drop_acked(conn, be64_to_cpu(dp->dp_ack_seq), NULL); + + BUG_ON(cm_id->context); + BUG_ON(ic->i_cm_id); + + ic->i_cm_id = cm_id; + cm_id->context = conn; + + /* We got halfway through setting up the ib_connection, if we + * fail now, we have to take the long route out of this mess. */ + destroy = 0; + + err = rds_ib_setup_qp(conn); + if (err) { + rds_ib_conn_error(conn, "rds_ib_setup_qp failed (%d)\n", err); + goto out; + } + + rds_ib_cm_fill_conn_param(conn, &conn_param, &dp_rep, version, + event->param.conn.responder_resources, + event->param.conn.initiator_depth); + + /* rdma_accept() calls rdma_reject() internally if it fails */ + err = rdma_accept(cm_id, &conn_param); + if (err) + rds_ib_conn_error(conn, "rdma_accept failed (%d)\n", err); + +out: + if (conn) + mutex_unlock(&conn->c_cm_lock); + if (err) + rdma_reject(cm_id, NULL, 0); + return destroy; +} + + +int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id) +{ + struct rds_connection *conn = cm_id->context; + struct rds_ib_connection *ic = conn->c_transport_data; + struct rdma_conn_param conn_param; + struct rds_ib_connect_private dp; + int ret; + + /* If the peer doesn't do protocol negotiation, we must + * default to RDSv3.0 */ + rds_ib_set_protocol(conn, RDS_PROTOCOL_3_0); + ic->i_flowctl = rds_ib_sysctl_flow_control; /* advertise flow control */ + + ret = rds_ib_setup_qp(conn); + if (ret) { + rds_ib_conn_error(conn, "rds_ib_setup_qp failed (%d)\n", ret); + goto out; + } + + rds_ib_cm_fill_conn_param(conn, &conn_param, &dp, RDS_PROTOCOL_VERSION, + UINT_MAX, UINT_MAX); + ret = rdma_connect(cm_id, &conn_param); + if (ret) + rds_ib_conn_error(conn, "rdma_connect failed (%d)\n", ret); + +out: + /* Beware - returning non-zero tells the rdma_cm to destroy + * the cm_id. We should certainly not do it as long as we still + * "own" the cm_id. */ + if (ret) { + if (ic->i_cm_id == cm_id) + ret = 0; + } + return ret; +} + +int rds_ib_conn_connect(struct rds_connection *conn) +{ + struct rds_ib_connection *ic = conn->c_transport_data; + struct sockaddr_in src, dest; + int ret; + + /* XXX I wonder what affect the port space has */ + /* delegate cm event handler to rdma_transport */ + ic->i_cm_id = rdma_create_id(rds_rdma_cm_event_handler, conn, + RDMA_PS_TCP, IB_QPT_RC); + if (IS_ERR(ic->i_cm_id)) { + ret = PTR_ERR(ic->i_cm_id); + ic->i_cm_id = NULL; + rdsdebug("rdma_create_id() failed: %d\n", ret); + goto out; + } + + rdsdebug("created cm id %p for conn %p\n", ic->i_cm_id, conn); + + src.sin_family = AF_INET; + src.sin_addr.s_addr = (__force u32)conn->c_laddr; + src.sin_port = (__force u16)htons(0); + + dest.sin_family = AF_INET; + dest.sin_addr.s_addr = (__force u32)conn->c_faddr; + dest.sin_port = (__force u16)htons(RDS_PORT); + + ret = rdma_resolve_addr(ic->i_cm_id, (struct sockaddr *)&src, + (struct sockaddr *)&dest, + RDS_RDMA_RESOLVE_TIMEOUT_MS); + if (ret) { + rdsdebug("addr resolve failed for cm id %p: %d\n", ic->i_cm_id, + ret); + rdma_destroy_id(ic->i_cm_id); + ic->i_cm_id = NULL; + } + +out: + return ret; +} + +/* + * This is so careful about only cleaning up resources that were built up + * so that it can be called at any point during startup. In fact it + * can be called multiple times for a given connection. + */ +void rds_ib_conn_shutdown(struct rds_connection *conn) +{ + struct rds_ib_connection *ic = conn->c_transport_data; + int err = 0; + + rdsdebug("cm %p pd %p cq %p %p qp %p\n", ic->i_cm_id, + ic->i_pd, ic->i_send_cq, ic->i_recv_cq, + ic->i_cm_id ? ic->i_cm_id->qp : NULL); + + if (ic->i_cm_id) { + struct ib_device *dev = ic->i_cm_id->device; + + rdsdebug("disconnecting cm %p\n", ic->i_cm_id); + err = rdma_disconnect(ic->i_cm_id); + if (err) { + /* Actually this may happen quite frequently, when + * an outgoing connect raced with an incoming connect. + */ + rdsdebug("failed to disconnect, cm: %p err %d\n", + ic->i_cm_id, err); + } + + /* + * We want to wait for tx and rx completion to finish + * before we tear down the connection, but we have to be + * careful not to get stuck waiting on a send ring that + * only has unsignaled sends in it. We've shutdown new + * sends before getting here so by waiting for signaled + * sends to complete we're ensured that there will be no + * more tx processing. + */ + wait_event(rds_ib_ring_empty_wait, + rds_ib_ring_empty(&ic->i_recv_ring) && + (atomic_read(&ic->i_signaled_sends) == 0)); + tasklet_kill(&ic->i_recv_tasklet); + + if (ic->i_send_hdrs) + ib_dma_free_coherent(dev, + ic->i_send_ring.w_nr * + sizeof(struct rds_header), + ic->i_send_hdrs, + ic->i_send_hdrs_dma); + + if (ic->i_recv_hdrs) + ib_dma_free_coherent(dev, + ic->i_recv_ring.w_nr * + sizeof(struct rds_header), + ic->i_recv_hdrs, + ic->i_recv_hdrs_dma); + + if (ic->i_ack) + ib_dma_free_coherent(dev, sizeof(struct rds_header), + ic->i_ack, ic->i_ack_dma); + + if (ic->i_sends) + rds_ib_send_clear_ring(ic); + if (ic->i_recvs) + rds_ib_recv_clear_ring(ic); + + if (ic->i_cm_id->qp) + rdma_destroy_qp(ic->i_cm_id); + if (ic->i_send_cq) + ib_destroy_cq(ic->i_send_cq); + if (ic->i_recv_cq) + ib_destroy_cq(ic->i_recv_cq); + rdma_destroy_id(ic->i_cm_id); + + /* + * Move connection back to the nodev list. + */ + if (ic->rds_ibdev) + rds_ib_remove_conn(ic->rds_ibdev, conn); + + ic->i_cm_id = NULL; + ic->i_pd = NULL; + ic->i_mr = NULL; + ic->i_send_cq = NULL; + ic->i_recv_cq = NULL; + ic->i_send_hdrs = NULL; + ic->i_recv_hdrs = NULL; + ic->i_ack = NULL; + } + BUG_ON(ic->rds_ibdev); + + /* Clear pending transmit */ + if (ic->i_data_op) { + struct rds_message *rm; + + rm = container_of(ic->i_data_op, struct rds_message, data); + rds_message_put(rm); + ic->i_data_op = NULL; + } + + /* Clear the ACK state */ + clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags); +#ifdef KERNEL_HAS_ATOMIC64 + atomic64_set(&ic->i_ack_next, 0); +#else + ic->i_ack_next = 0; +#endif + ic->i_ack_recv = 0; + + /* Clear flow control state */ + ic->i_flowctl = 0; + atomic_set(&ic->i_credits, 0); + + rds_ib_ring_init(&ic->i_send_ring, rds_ib_sysctl_max_send_wr); + rds_ib_ring_init(&ic->i_recv_ring, rds_ib_sysctl_max_recv_wr); + + if (ic->i_ibinc) { + rds_inc_put(&ic->i_ibinc->ii_inc); + ic->i_ibinc = NULL; + } + + vfree(ic->i_sends); + ic->i_sends = NULL; + vfree(ic->i_recvs); + ic->i_recvs = NULL; +} + +int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp) +{ + struct rds_ib_connection *ic; + unsigned long flags; + int ret; + + /* XXX too lazy? */ + ic = kzalloc(sizeof(struct rds_ib_connection), gfp); + if (!ic) + return -ENOMEM; + + ret = rds_ib_recv_alloc_caches(ic); + if (ret) { + kfree(ic); + return ret; + } + + INIT_LIST_HEAD(&ic->ib_node); + tasklet_init(&ic->i_recv_tasklet, rds_ib_recv_tasklet_fn, + (unsigned long) ic); + mutex_init(&ic->i_recv_mutex); +#ifndef KERNEL_HAS_ATOMIC64 + spin_lock_init(&ic->i_ack_lock); +#endif + atomic_set(&ic->i_signaled_sends, 0); + + /* + * rds_ib_conn_shutdown() waits for these to be emptied so they + * must be initialized before it can be called. + */ + rds_ib_ring_init(&ic->i_send_ring, rds_ib_sysctl_max_send_wr); + rds_ib_ring_init(&ic->i_recv_ring, rds_ib_sysctl_max_recv_wr); + + ic->conn = conn; + conn->c_transport_data = ic; + + spin_lock_irqsave(&ib_nodev_conns_lock, flags); + list_add_tail(&ic->ib_node, &ib_nodev_conns); + spin_unlock_irqrestore(&ib_nodev_conns_lock, flags); + + + rdsdebug("conn %p conn ic %p\n", conn, conn->c_transport_data); + return 0; +} + +/* + * Free a connection. Connection must be shut down and not set for reconnect. + */ +void rds_ib_conn_free(void *arg) +{ + struct rds_ib_connection *ic = arg; + spinlock_t *lock_ptr; + + rdsdebug("ic %p\n", ic); + + /* + * Conn is either on a dev's list or on the nodev list. + * A race with shutdown() or connect() would cause problems + * (since rds_ibdev would change) but that should never happen. + */ + lock_ptr = ic->rds_ibdev ? &ic->rds_ibdev->spinlock : &ib_nodev_conns_lock; + + spin_lock_irq(lock_ptr); + list_del(&ic->ib_node); + spin_unlock_irq(lock_ptr); + + rds_ib_recv_free_caches(ic); + + kfree(ic); +} + + +/* + * An error occurred on the connection + */ +void +__rds_ib_conn_error(struct rds_connection *conn, const char *fmt, ...) +{ + va_list ap; + + rds_conn_drop(conn); + + va_start(ap, fmt); + vprintk(fmt, ap); + va_end(ap); +} diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c new file mode 100644 index 0000000..273b8bf --- /dev/null +++ b/net/rds/ib_rdma.c @@ -0,0 +1,784 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include +#include +#include +#include + +#include "rds.h" +#include "ib.h" + +static DEFINE_PER_CPU(unsigned long, clean_list_grace); +#define CLEAN_LIST_BUSY_BIT 0 + +/* + * This is stored as mr->r_trans_private. + */ +struct rds_ib_mr { + struct rds_ib_device *device; + struct rds_ib_mr_pool *pool; + struct ib_fmr *fmr; + + struct llist_node llnode; + + /* unmap_list is for freeing */ + struct list_head unmap_list; + unsigned int remap_count; + + struct scatterlist *sg; + unsigned int sg_len; + u64 *dma; + int sg_dma_len; +}; + +/* + * Our own little FMR pool + */ +struct rds_ib_mr_pool { + struct mutex flush_lock; /* serialize fmr invalidate */ + struct delayed_work flush_worker; /* flush worker */ + + atomic_t item_count; /* total # of MRs */ + atomic_t dirty_count; /* # dirty of MRs */ + + struct llist_head drop_list; /* MRs that have reached their max_maps limit */ + struct llist_head free_list; /* unused MRs */ + struct llist_head clean_list; /* global unused & unamapped MRs */ + wait_queue_head_t flush_wait; + + atomic_t free_pinned; /* memory pinned by free MRs */ + unsigned long max_items; + unsigned long max_items_soft; + unsigned long max_free_pinned; + struct ib_fmr_attr fmr_attr; +}; + +static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all, struct rds_ib_mr **); +static void rds_ib_teardown_mr(struct rds_ib_mr *ibmr); +static void rds_ib_mr_pool_flush_worker(struct work_struct *work); + +static struct rds_ib_device *rds_ib_get_device(__be32 ipaddr) +{ + struct rds_ib_device *rds_ibdev; + struct rds_ib_ipaddr *i_ipaddr; + + rcu_read_lock(); + list_for_each_entry_rcu(rds_ibdev, &rds_ib_devices, list) { + list_for_each_entry_rcu(i_ipaddr, &rds_ibdev->ipaddr_list, list) { + if (i_ipaddr->ipaddr == ipaddr) { + atomic_inc(&rds_ibdev->refcount); + rcu_read_unlock(); + return rds_ibdev; + } + } + } + rcu_read_unlock(); + + return NULL; +} + +static int rds_ib_add_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr) +{ + struct rds_ib_ipaddr *i_ipaddr; + + i_ipaddr = kmalloc(sizeof *i_ipaddr, GFP_KERNEL); + if (!i_ipaddr) + return -ENOMEM; + + i_ipaddr->ipaddr = ipaddr; + + spin_lock_irq(&rds_ibdev->spinlock); + list_add_tail_rcu(&i_ipaddr->list, &rds_ibdev->ipaddr_list); + spin_unlock_irq(&rds_ibdev->spinlock); + + return 0; +} + +static void rds_ib_remove_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr) +{ + struct rds_ib_ipaddr *i_ipaddr; + struct rds_ib_ipaddr *to_free = NULL; + + + spin_lock_irq(&rds_ibdev->spinlock); + list_for_each_entry_rcu(i_ipaddr, &rds_ibdev->ipaddr_list, list) { + if (i_ipaddr->ipaddr == ipaddr) { + list_del_rcu(&i_ipaddr->list); + to_free = i_ipaddr; + break; + } + } + spin_unlock_irq(&rds_ibdev->spinlock); + + if (to_free) { + synchronize_rcu(); + kfree(to_free); + } +} + +int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr) +{ + struct rds_ib_device *rds_ibdev_old; + + rds_ibdev_old = rds_ib_get_device(ipaddr); + if (rds_ibdev_old) { + rds_ib_remove_ipaddr(rds_ibdev_old, ipaddr); + rds_ib_dev_put(rds_ibdev_old); + } + + return rds_ib_add_ipaddr(rds_ibdev, ipaddr); +} + +void rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn) +{ + struct rds_ib_connection *ic = conn->c_transport_data; + + /* conn was previously on the nodev_conns_list */ + spin_lock_irq(&ib_nodev_conns_lock); + BUG_ON(list_empty(&ib_nodev_conns)); + BUG_ON(list_empty(&ic->ib_node)); + list_del(&ic->ib_node); + + spin_lock(&rds_ibdev->spinlock); + list_add_tail(&ic->ib_node, &rds_ibdev->conn_list); + spin_unlock(&rds_ibdev->spinlock); + spin_unlock_irq(&ib_nodev_conns_lock); + + ic->rds_ibdev = rds_ibdev; + atomic_inc(&rds_ibdev->refcount); +} + +void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn) +{ + struct rds_ib_connection *ic = conn->c_transport_data; + + /* place conn on nodev_conns_list */ + spin_lock(&ib_nodev_conns_lock); + + spin_lock_irq(&rds_ibdev->spinlock); + BUG_ON(list_empty(&ic->ib_node)); + list_del(&ic->ib_node); + spin_unlock_irq(&rds_ibdev->spinlock); + + list_add_tail(&ic->ib_node, &ib_nodev_conns); + + spin_unlock(&ib_nodev_conns_lock); + + ic->rds_ibdev = NULL; + rds_ib_dev_put(rds_ibdev); +} + +void rds_ib_destroy_nodev_conns(void) +{ + struct rds_ib_connection *ic, *_ic; + LIST_HEAD(tmp_list); + + /* avoid calling conn_destroy with irqs off */ + spin_lock_irq(&ib_nodev_conns_lock); + list_splice(&ib_nodev_conns, &tmp_list); + spin_unlock_irq(&ib_nodev_conns_lock); + + list_for_each_entry_safe(ic, _ic, &tmp_list, ib_node) + rds_conn_destroy(ic->conn); +} + +struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev) +{ + struct rds_ib_mr_pool *pool; + + pool = kzalloc(sizeof(*pool), GFP_KERNEL); + if (!pool) + return ERR_PTR(-ENOMEM); + + init_llist_head(&pool->free_list); + init_llist_head(&pool->drop_list); + init_llist_head(&pool->clean_list); + mutex_init(&pool->flush_lock); + init_waitqueue_head(&pool->flush_wait); + INIT_DELAYED_WORK(&pool->flush_worker, rds_ib_mr_pool_flush_worker); + + pool->fmr_attr.max_pages = fmr_message_size; + pool->fmr_attr.max_maps = rds_ibdev->fmr_max_remaps; + pool->fmr_attr.page_shift = PAGE_SHIFT; + pool->max_free_pinned = rds_ibdev->max_fmrs * fmr_message_size / 4; + + /* We never allow more than max_items MRs to be allocated. + * When we exceed more than max_items_soft, we start freeing + * items more aggressively. + * Make sure that max_items > max_items_soft > max_items / 2 + */ + pool->max_items_soft = rds_ibdev->max_fmrs * 3 / 4; + pool->max_items = rds_ibdev->max_fmrs; + + return pool; +} + +void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_connection *iinfo) +{ + struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool; + + iinfo->rdma_mr_max = pool->max_items; + iinfo->rdma_mr_size = pool->fmr_attr.max_pages; +} + +void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *pool) +{ + cancel_delayed_work_sync(&pool->flush_worker); + rds_ib_flush_mr_pool(pool, 1, NULL); + WARN_ON(atomic_read(&pool->item_count)); + WARN_ON(atomic_read(&pool->free_pinned)); + kfree(pool); +} + +static inline struct rds_ib_mr *rds_ib_reuse_fmr(struct rds_ib_mr_pool *pool) +{ + struct rds_ib_mr *ibmr = NULL; + struct llist_node *ret; + unsigned long *flag; + + preempt_disable(); + flag = this_cpu_ptr(&clean_list_grace); + set_bit(CLEAN_LIST_BUSY_BIT, flag); + ret = llist_del_first(&pool->clean_list); + if (ret) + ibmr = llist_entry(ret, struct rds_ib_mr, llnode); + + clear_bit(CLEAN_LIST_BUSY_BIT, flag); + preempt_enable(); + return ibmr; +} + +static inline void wait_clean_list_grace(void) +{ + int cpu; + unsigned long *flag; + + for_each_online_cpu(cpu) { + flag = &per_cpu(clean_list_grace, cpu); + while (test_bit(CLEAN_LIST_BUSY_BIT, flag)) + cpu_relax(); + } +} + +static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev) +{ + struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool; + struct rds_ib_mr *ibmr = NULL; + int err = 0, iter = 0; + + if (atomic_read(&pool->dirty_count) >= pool->max_items / 10) + schedule_delayed_work(&pool->flush_worker, 10); + + while (1) { + ibmr = rds_ib_reuse_fmr(pool); + if (ibmr) + return ibmr; + + /* No clean MRs - now we have the choice of either + * allocating a fresh MR up to the limit imposed by the + * driver, or flush any dirty unused MRs. + * We try to avoid stalling in the send path if possible, + * so we allocate as long as we're allowed to. + * + * We're fussy with enforcing the FMR limit, though. If the driver + * tells us we can't use more than N fmrs, we shouldn't start + * arguing with it */ + if (atomic_inc_return(&pool->item_count) <= pool->max_items) + break; + + atomic_dec(&pool->item_count); + + if (++iter > 2) { + rds_ib_stats_inc(s_ib_rdma_mr_pool_depleted); + return ERR_PTR(-EAGAIN); + } + + /* We do have some empty MRs. Flush them out. */ + rds_ib_stats_inc(s_ib_rdma_mr_pool_wait); + rds_ib_flush_mr_pool(pool, 0, &ibmr); + if (ibmr) + return ibmr; + } + + ibmr = kzalloc_node(sizeof(*ibmr), GFP_KERNEL, rdsibdev_to_node(rds_ibdev)); + if (!ibmr) { + err = -ENOMEM; + goto out_no_cigar; + } + + memset(ibmr, 0, sizeof(*ibmr)); + + ibmr->fmr = ib_alloc_fmr(rds_ibdev->pd, + (IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_READ | + IB_ACCESS_REMOTE_WRITE| + IB_ACCESS_REMOTE_ATOMIC), + &pool->fmr_attr); + if (IS_ERR(ibmr->fmr)) { + err = PTR_ERR(ibmr->fmr); + ibmr->fmr = NULL; + printk(KERN_WARNING "RDS/IB: ib_alloc_fmr failed (err=%d)\n", err); + goto out_no_cigar; + } + + rds_ib_stats_inc(s_ib_rdma_mr_alloc); + return ibmr; + +out_no_cigar: + if (ibmr) { + if (ibmr->fmr) + ib_dealloc_fmr(ibmr->fmr); + kfree(ibmr); + } + atomic_dec(&pool->item_count); + return ERR_PTR(err); +} + +static int rds_ib_map_fmr(struct rds_ib_device *rds_ibdev, struct rds_ib_mr *ibmr, + struct scatterlist *sg, unsigned int nents) +{ + struct ib_device *dev = rds_ibdev->dev; + struct scatterlist *scat = sg; + u64 io_addr = 0; + u64 *dma_pages; + u32 len; + int page_cnt, sg_dma_len; + int i, j; + int ret; + + sg_dma_len = ib_dma_map_sg(dev, sg, nents, + DMA_BIDIRECTIONAL); + if (unlikely(!sg_dma_len)) { + printk(KERN_WARNING "RDS/IB: dma_map_sg failed!\n"); + return -EBUSY; + } + + len = 0; + page_cnt = 0; + + for (i = 0; i < sg_dma_len; ++i) { + unsigned int dma_len = ib_sg_dma_len(dev, &scat[i]); + u64 dma_addr = ib_sg_dma_address(dev, &scat[i]); + + if (dma_addr & ~PAGE_MASK) { + if (i > 0) + return -EINVAL; + else + ++page_cnt; + } + if ((dma_addr + dma_len) & ~PAGE_MASK) { + if (i < sg_dma_len - 1) + return -EINVAL; + else + ++page_cnt; + } + + len += dma_len; + } + + page_cnt += len >> PAGE_SHIFT; + if (page_cnt > fmr_message_size) + return -EINVAL; + + dma_pages = kmalloc_node(sizeof(u64) * page_cnt, GFP_ATOMIC, + rdsibdev_to_node(rds_ibdev)); + if (!dma_pages) + return -ENOMEM; + + page_cnt = 0; + for (i = 0; i < sg_dma_len; ++i) { + unsigned int dma_len = ib_sg_dma_len(dev, &scat[i]); + u64 dma_addr = ib_sg_dma_address(dev, &scat[i]); + + for (j = 0; j < dma_len; j += PAGE_SIZE) + dma_pages[page_cnt++] = + (dma_addr & PAGE_MASK) + j; + } + + ret = ib_map_phys_fmr(ibmr->fmr, + dma_pages, page_cnt, io_addr); + if (ret) + goto out; + + /* Success - we successfully remapped the MR, so we can + * safely tear down the old mapping. */ + rds_ib_teardown_mr(ibmr); + + ibmr->sg = scat; + ibmr->sg_len = nents; + ibmr->sg_dma_len = sg_dma_len; + ibmr->remap_count++; + + rds_ib_stats_inc(s_ib_rdma_mr_used); + ret = 0; + +out: + kfree(dma_pages); + + return ret; +} + +void rds_ib_sync_mr(void *trans_private, int direction) +{ + struct rds_ib_mr *ibmr = trans_private; + struct rds_ib_device *rds_ibdev = ibmr->device; + + switch (direction) { + case DMA_FROM_DEVICE: + ib_dma_sync_sg_for_cpu(rds_ibdev->dev, ibmr->sg, + ibmr->sg_dma_len, DMA_BIDIRECTIONAL); + break; + case DMA_TO_DEVICE: + ib_dma_sync_sg_for_device(rds_ibdev->dev, ibmr->sg, + ibmr->sg_dma_len, DMA_BIDIRECTIONAL); + break; + } +} + +static void __rds_ib_teardown_mr(struct rds_ib_mr *ibmr) +{ + struct rds_ib_device *rds_ibdev = ibmr->device; + + if (ibmr->sg_dma_len) { + ib_dma_unmap_sg(rds_ibdev->dev, + ibmr->sg, ibmr->sg_len, + DMA_BIDIRECTIONAL); + ibmr->sg_dma_len = 0; + } + + /* Release the s/g list */ + if (ibmr->sg_len) { + unsigned int i; + + for (i = 0; i < ibmr->sg_len; ++i) { + struct page *page = sg_page(&ibmr->sg[i]); + + /* FIXME we need a way to tell a r/w MR + * from a r/o MR */ + BUG_ON(irqs_disabled()); + set_page_dirty(page); + put_page(page); + } + kfree(ibmr->sg); + + ibmr->sg = NULL; + ibmr->sg_len = 0; + } +} + +static void rds_ib_teardown_mr(struct rds_ib_mr *ibmr) +{ + unsigned int pinned = ibmr->sg_len; + + __rds_ib_teardown_mr(ibmr); + if (pinned) { + struct rds_ib_device *rds_ibdev = ibmr->device; + struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool; + + atomic_sub(pinned, &pool->free_pinned); + } +} + +static inline unsigned int rds_ib_flush_goal(struct rds_ib_mr_pool *pool, int free_all) +{ + unsigned int item_count; + + item_count = atomic_read(&pool->item_count); + if (free_all) + return item_count; + + return 0; +} + +/* + * given an llist of mrs, put them all into the list_head for more processing + */ +static void llist_append_to_list(struct llist_head *llist, struct list_head *list) +{ + struct rds_ib_mr *ibmr; + struct llist_node *node; + struct llist_node *next; + + node = llist_del_all(llist); + while (node) { + next = node->next; + ibmr = llist_entry(node, struct rds_ib_mr, llnode); + list_add_tail(&ibmr->unmap_list, list); + node = next; + } +} + +/* + * this takes a list head of mrs and turns it into linked llist nodes + * of clusters. Each cluster has linked llist nodes of + * MR_CLUSTER_SIZE mrs that are ready for reuse. + */ +static void list_to_llist_nodes(struct rds_ib_mr_pool *pool, + struct list_head *list, + struct llist_node **nodes_head, + struct llist_node **nodes_tail) +{ + struct rds_ib_mr *ibmr; + struct llist_node *cur = NULL; + struct llist_node **next = nodes_head; + + list_for_each_entry(ibmr, list, unmap_list) { + cur = &ibmr->llnode; + *next = cur; + next = &cur->next; + } + *next = NULL; + *nodes_tail = cur; +} + +/* + * Flush our pool of MRs. + * At a minimum, all currently unused MRs are unmapped. + * If the number of MRs allocated exceeds the limit, we also try + * to free as many MRs as needed to get back to this limit. + */ +static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, + int free_all, struct rds_ib_mr **ibmr_ret) +{ + struct rds_ib_mr *ibmr, *next; + struct llist_node *clean_nodes; + struct llist_node *clean_tail; + LIST_HEAD(unmap_list); + LIST_HEAD(fmr_list); + unsigned long unpinned = 0; + unsigned int nfreed = 0, ncleaned = 0, free_goal; + int ret = 0; + + rds_ib_stats_inc(s_ib_rdma_mr_pool_flush); + + if (ibmr_ret) { + DEFINE_WAIT(wait); + while(!mutex_trylock(&pool->flush_lock)) { + ibmr = rds_ib_reuse_fmr(pool); + if (ibmr) { + *ibmr_ret = ibmr; + finish_wait(&pool->flush_wait, &wait); + goto out_nolock; + } + + prepare_to_wait(&pool->flush_wait, &wait, + TASK_UNINTERRUPTIBLE); + if (llist_empty(&pool->clean_list)) + schedule(); + + ibmr = rds_ib_reuse_fmr(pool); + if (ibmr) { + *ibmr_ret = ibmr; + finish_wait(&pool->flush_wait, &wait); + goto out_nolock; + } + } + finish_wait(&pool->flush_wait, &wait); + } else + mutex_lock(&pool->flush_lock); + + if (ibmr_ret) { + ibmr = rds_ib_reuse_fmr(pool); + if (ibmr) { + *ibmr_ret = ibmr; + goto out; + } + } + + /* Get the list of all MRs to be dropped. Ordering matters - + * we want to put drop_list ahead of free_list. + */ + llist_append_to_list(&pool->drop_list, &unmap_list); + llist_append_to_list(&pool->free_list, &unmap_list); + if (free_all) + llist_append_to_list(&pool->clean_list, &unmap_list); + + free_goal = rds_ib_flush_goal(pool, free_all); + + if (list_empty(&unmap_list)) + goto out; + + /* String all ib_mr's onto one list and hand them to ib_unmap_fmr */ + list_for_each_entry(ibmr, &unmap_list, unmap_list) + list_add(&ibmr->fmr->list, &fmr_list); + + ret = ib_unmap_fmr(&fmr_list); + if (ret) + printk(KERN_WARNING "RDS/IB: ib_unmap_fmr failed (err=%d)\n", ret); + + /* Now we can destroy the DMA mapping and unpin any pages */ + list_for_each_entry_safe(ibmr, next, &unmap_list, unmap_list) { + unpinned += ibmr->sg_len; + __rds_ib_teardown_mr(ibmr); + if (nfreed < free_goal || ibmr->remap_count >= pool->fmr_attr.max_maps) { + rds_ib_stats_inc(s_ib_rdma_mr_free); + list_del(&ibmr->unmap_list); + ib_dealloc_fmr(ibmr->fmr); + kfree(ibmr); + nfreed++; + } + ncleaned++; + } + + if (!list_empty(&unmap_list)) { + /* we have to make sure that none of the things we're about + * to put on the clean list would race with other cpus trying + * to pull items off. The llist would explode if we managed to + * remove something from the clean list and then add it back again + * while another CPU was spinning on that same item in llist_del_first. + * + * This is pretty unlikely, but just in case wait for an llist grace period + * here before adding anything back into the clean list. + */ + wait_clean_list_grace(); + + list_to_llist_nodes(pool, &unmap_list, &clean_nodes, &clean_tail); + if (ibmr_ret) + *ibmr_ret = llist_entry(clean_nodes, struct rds_ib_mr, llnode); + + /* more than one entry in llist nodes */ + if (clean_nodes->next) + llist_add_batch(clean_nodes->next, clean_tail, &pool->clean_list); + + } + + atomic_sub(unpinned, &pool->free_pinned); + atomic_sub(ncleaned, &pool->dirty_count); + atomic_sub(nfreed, &pool->item_count); + +out: + mutex_unlock(&pool->flush_lock); + if (waitqueue_active(&pool->flush_wait)) + wake_up(&pool->flush_wait); +out_nolock: + return ret; +} + +static void rds_ib_mr_pool_flush_worker(struct work_struct *work) +{ + struct rds_ib_mr_pool *pool = container_of(work, struct rds_ib_mr_pool, flush_worker.work); + + rds_ib_flush_mr_pool(pool, 0, NULL); +} + +void rds_ib_free_mr(void *trans_private, int invalidate) +{ + struct rds_ib_mr *ibmr = trans_private; + struct rds_ib_device *rds_ibdev = ibmr->device; + struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool; + + rdsdebug("RDS/IB: free_mr nents %u\n", ibmr->sg_len); + + /* Return it to the pool's free list */ + if (ibmr->remap_count >= pool->fmr_attr.max_maps) + llist_add(&ibmr->llnode, &pool->drop_list); + else + llist_add(&ibmr->llnode, &pool->free_list); + + atomic_add(ibmr->sg_len, &pool->free_pinned); + atomic_inc(&pool->dirty_count); + + /* If we've pinned too many pages, request a flush */ + if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned || + atomic_read(&pool->dirty_count) >= pool->max_items / 10) + schedule_delayed_work(&pool->flush_worker, 10); + + if (invalidate) { + if (likely(!in_interrupt())) { + rds_ib_flush_mr_pool(pool, 0, NULL); + } else { + /* We get here if the user created a MR marked + * as use_once and invalidate at the same time. */ + schedule_delayed_work(&pool->flush_worker, 10); + } + } + + rds_ib_dev_put(rds_ibdev); +} + +void rds_ib_flush_mrs(void) +{ + struct rds_ib_device *rds_ibdev; + + down_read(&rds_ib_devices_lock); + list_for_each_entry(rds_ibdev, &rds_ib_devices, list) { + struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool; + + if (pool) + rds_ib_flush_mr_pool(pool, 0, NULL); + } + up_read(&rds_ib_devices_lock); +} + +void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, + struct rds_sock *rs, u32 *key_ret) +{ + struct rds_ib_device *rds_ibdev; + struct rds_ib_mr *ibmr = NULL; + int ret; + + rds_ibdev = rds_ib_get_device(rs->rs_bound_addr); + if (!rds_ibdev) { + ret = -ENODEV; + goto out; + } + + if (!rds_ibdev->mr_pool) { + ret = -ENODEV; + goto out; + } + + ibmr = rds_ib_alloc_fmr(rds_ibdev); + if (IS_ERR(ibmr)) + return ibmr; + + ret = rds_ib_map_fmr(rds_ibdev, ibmr, sg, nents); + if (ret == 0) + *key_ret = ibmr->fmr->rkey; + else + printk(KERN_WARNING "RDS/IB: map_fmr failed (errno=%d)\n", ret); + + ibmr->device = rds_ibdev; + rds_ibdev = NULL; + + out: + if (ret) { + if (ibmr) + rds_ib_free_mr(ibmr, 0); + ibmr = ERR_PTR(ret); + } + if (rds_ibdev) + rds_ib_dev_put(rds_ibdev); + return ibmr; +} + diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c new file mode 100644 index 0000000..d67de45 --- /dev/null +++ b/net/rds/ib_recv.c @@ -0,0 +1,1079 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include +#include +#include +#include +#include + +#include "rds.h" +#include "ib.h" + +static struct kmem_cache *rds_ib_incoming_slab; +static struct kmem_cache *rds_ib_frag_slab; +static atomic_t rds_ib_allocation = ATOMIC_INIT(0); + +void rds_ib_recv_init_ring(struct rds_ib_connection *ic) +{ + struct rds_ib_recv_work *recv; + u32 i; + + for (i = 0, recv = ic->i_recvs; i < ic->i_recv_ring.w_nr; i++, recv++) { + struct ib_sge *sge; + + recv->r_ibinc = NULL; + recv->r_frag = NULL; + + recv->r_wr.next = NULL; + recv->r_wr.wr_id = i; + recv->r_wr.sg_list = recv->r_sge; + recv->r_wr.num_sge = RDS_IB_RECV_SGE; + + sge = &recv->r_sge[0]; + sge->addr = ic->i_recv_hdrs_dma + (i * sizeof(struct rds_header)); + sge->length = sizeof(struct rds_header); + sge->lkey = ic->i_mr->lkey; + + sge = &recv->r_sge[1]; + sge->addr = 0; + sge->length = RDS_FRAG_SIZE; + sge->lkey = ic->i_mr->lkey; + } +} + +/* + * The entire 'from' list, including the from element itself, is put on + * to the tail of the 'to' list. + */ +static void list_splice_entire_tail(struct list_head *from, + struct list_head *to) +{ + struct list_head *from_last = from->prev; + + list_splice_tail(from_last, to); + list_add_tail(from_last, to); +} + +static void rds_ib_cache_xfer_to_ready(struct rds_ib_refill_cache *cache) +{ + struct list_head *tmp; + + tmp = xchg(&cache->xfer, NULL); + if (tmp) { + if (cache->ready) + list_splice_entire_tail(tmp, cache->ready); + else + cache->ready = tmp; + } +} + +static int rds_ib_recv_alloc_cache(struct rds_ib_refill_cache *cache) +{ + struct rds_ib_cache_head *head; + int cpu; + + cache->percpu = alloc_percpu(struct rds_ib_cache_head); + if (!cache->percpu) + return -ENOMEM; + + for_each_possible_cpu(cpu) { + head = per_cpu_ptr(cache->percpu, cpu); + head->first = NULL; + head->count = 0; + } + cache->xfer = NULL; + cache->ready = NULL; + + return 0; +} + +int rds_ib_recv_alloc_caches(struct rds_ib_connection *ic) +{ + int ret; + + ret = rds_ib_recv_alloc_cache(&ic->i_cache_incs); + if (!ret) { + ret = rds_ib_recv_alloc_cache(&ic->i_cache_frags); + if (ret) + free_percpu(ic->i_cache_incs.percpu); + } + + return ret; +} + +static void rds_ib_cache_splice_all_lists(struct rds_ib_refill_cache *cache, + struct list_head *caller_list) +{ + struct rds_ib_cache_head *head; + int cpu; + + for_each_possible_cpu(cpu) { + head = per_cpu_ptr(cache->percpu, cpu); + if (head->first) { + list_splice_entire_tail(head->first, caller_list); + head->first = NULL; + } + } + + if (cache->ready) { + list_splice_entire_tail(cache->ready, caller_list); + cache->ready = NULL; + } +} + +void rds_ib_recv_free_caches(struct rds_ib_connection *ic) +{ + struct rds_ib_incoming *inc; + struct rds_ib_incoming *inc_tmp; + struct rds_page_frag *frag; + struct rds_page_frag *frag_tmp; + LIST_HEAD(list); + + rds_ib_cache_xfer_to_ready(&ic->i_cache_incs); + rds_ib_cache_splice_all_lists(&ic->i_cache_incs, &list); + free_percpu(ic->i_cache_incs.percpu); + + list_for_each_entry_safe(inc, inc_tmp, &list, ii_cache_entry) { + list_del(&inc->ii_cache_entry); + WARN_ON(!list_empty(&inc->ii_frags)); + kmem_cache_free(rds_ib_incoming_slab, inc); + } + + rds_ib_cache_xfer_to_ready(&ic->i_cache_frags); + rds_ib_cache_splice_all_lists(&ic->i_cache_frags, &list); + free_percpu(ic->i_cache_frags.percpu); + + list_for_each_entry_safe(frag, frag_tmp, &list, f_cache_entry) { + list_del(&frag->f_cache_entry); + WARN_ON(!list_empty(&frag->f_item)); + kmem_cache_free(rds_ib_frag_slab, frag); + } +} + +/* fwd decl */ +static void rds_ib_recv_cache_put(struct list_head *new_item, + struct rds_ib_refill_cache *cache); +static struct list_head *rds_ib_recv_cache_get(struct rds_ib_refill_cache *cache); + + +/* Recycle frag and attached recv buffer f_sg */ +static void rds_ib_frag_free(struct rds_ib_connection *ic, + struct rds_page_frag *frag) +{ + rdsdebug("frag %p page %p\n", frag, sg_page(&frag->f_sg)); + + rds_ib_recv_cache_put(&frag->f_cache_entry, &ic->i_cache_frags); +} + +/* Recycle inc after freeing attached frags */ +void rds_ib_inc_free(struct rds_incoming *inc) +{ + struct rds_ib_incoming *ibinc; + struct rds_page_frag *frag; + struct rds_page_frag *pos; + struct rds_ib_connection *ic = inc->i_conn->c_transport_data; + + ibinc = container_of(inc, struct rds_ib_incoming, ii_inc); + + /* Free attached frags */ + list_for_each_entry_safe(frag, pos, &ibinc->ii_frags, f_item) { + list_del_init(&frag->f_item); + rds_ib_frag_free(ic, frag); + } + BUG_ON(!list_empty(&ibinc->ii_frags)); + + rdsdebug("freeing ibinc %p inc %p\n", ibinc, inc); + rds_ib_recv_cache_put(&ibinc->ii_cache_entry, &ic->i_cache_incs); +} + +static void rds_ib_recv_clear_one(struct rds_ib_connection *ic, + struct rds_ib_recv_work *recv) +{ + if (recv->r_ibinc) { + rds_inc_put(&recv->r_ibinc->ii_inc); + recv->r_ibinc = NULL; + } + if (recv->r_frag) { + ib_dma_unmap_sg(ic->i_cm_id->device, &recv->r_frag->f_sg, 1, DMA_FROM_DEVICE); + rds_ib_frag_free(ic, recv->r_frag); + recv->r_frag = NULL; + } +} + +void rds_ib_recv_clear_ring(struct rds_ib_connection *ic) +{ + u32 i; + + for (i = 0; i < ic->i_recv_ring.w_nr; i++) + rds_ib_recv_clear_one(ic, &ic->i_recvs[i]); +} + +static struct rds_ib_incoming *rds_ib_refill_one_inc(struct rds_ib_connection *ic, + gfp_t slab_mask) +{ + struct rds_ib_incoming *ibinc; + struct list_head *cache_item; + int avail_allocs; + + cache_item = rds_ib_recv_cache_get(&ic->i_cache_incs); + if (cache_item) { + ibinc = container_of(cache_item, struct rds_ib_incoming, ii_cache_entry); + } else { + avail_allocs = atomic_add_unless(&rds_ib_allocation, + 1, rds_ib_sysctl_max_recv_allocation); + if (!avail_allocs) { + rds_ib_stats_inc(s_ib_rx_alloc_limit); + return NULL; + } + ibinc = kmem_cache_alloc(rds_ib_incoming_slab, slab_mask); + if (!ibinc) { + atomic_dec(&rds_ib_allocation); + return NULL; + } + } + INIT_LIST_HEAD(&ibinc->ii_frags); + rds_inc_init(&ibinc->ii_inc, ic->conn, ic->conn->c_faddr); + + return ibinc; +} + +static struct rds_page_frag *rds_ib_refill_one_frag(struct rds_ib_connection *ic, + gfp_t slab_mask, gfp_t page_mask) +{ + struct rds_page_frag *frag; + struct list_head *cache_item; + int ret; + + cache_item = rds_ib_recv_cache_get(&ic->i_cache_frags); + if (cache_item) { + frag = container_of(cache_item, struct rds_page_frag, f_cache_entry); + } else { + frag = kmem_cache_alloc(rds_ib_frag_slab, slab_mask); + if (!frag) + return NULL; + + sg_init_table(&frag->f_sg, 1); + ret = rds_page_remainder_alloc(&frag->f_sg, + RDS_FRAG_SIZE, page_mask); + if (ret) { + kmem_cache_free(rds_ib_frag_slab, frag); + return NULL; + } + } + + INIT_LIST_HEAD(&frag->f_item); + + return frag; +} + +static int rds_ib_recv_refill_one(struct rds_connection *conn, + struct rds_ib_recv_work *recv, int prefill) +{ + struct rds_ib_connection *ic = conn->c_transport_data; + struct ib_sge *sge; + int ret = -ENOMEM; + gfp_t slab_mask = GFP_NOWAIT; + gfp_t page_mask = GFP_NOWAIT; + + if (prefill) { + slab_mask = GFP_KERNEL; + page_mask = GFP_HIGHUSER; + } + + if (!ic->i_cache_incs.ready) + rds_ib_cache_xfer_to_ready(&ic->i_cache_incs); + if (!ic->i_cache_frags.ready) + rds_ib_cache_xfer_to_ready(&ic->i_cache_frags); + + /* + * ibinc was taken from recv if recv contained the start of a message. + * recvs that were continuations will still have this allocated. + */ + if (!recv->r_ibinc) { + recv->r_ibinc = rds_ib_refill_one_inc(ic, slab_mask); + if (!recv->r_ibinc) + goto out; + } + + WARN_ON(recv->r_frag); /* leak! */ + recv->r_frag = rds_ib_refill_one_frag(ic, slab_mask, page_mask); + if (!recv->r_frag) + goto out; + + ret = ib_dma_map_sg(ic->i_cm_id->device, &recv->r_frag->f_sg, + 1, DMA_FROM_DEVICE); + WARN_ON(ret != 1); + + sge = &recv->r_sge[0]; + sge->addr = ic->i_recv_hdrs_dma + (recv - ic->i_recvs) * sizeof(struct rds_header); + sge->length = sizeof(struct rds_header); + + sge = &recv->r_sge[1]; + sge->addr = ib_sg_dma_address(ic->i_cm_id->device, &recv->r_frag->f_sg); + sge->length = ib_sg_dma_len(ic->i_cm_id->device, &recv->r_frag->f_sg); + + ret = 0; +out: + return ret; +} + +/* + * This tries to allocate and post unused work requests after making sure that + * they have all the allocations they need to queue received fragments into + * sockets. + * + * -1 is returned if posting fails due to temporary resource exhaustion. + */ +void rds_ib_recv_refill(struct rds_connection *conn, int prefill) +{ + struct rds_ib_connection *ic = conn->c_transport_data; + struct rds_ib_recv_work *recv; + struct ib_recv_wr *failed_wr; + unsigned int posted = 0; + int ret = 0; + u32 pos; + + while ((prefill || rds_conn_up(conn)) && + rds_ib_ring_alloc(&ic->i_recv_ring, 1, &pos)) { + if (pos >= ic->i_recv_ring.w_nr) { + printk(KERN_NOTICE "Argh - ring alloc returned pos=%u\n", + pos); + break; + } + + recv = &ic->i_recvs[pos]; + ret = rds_ib_recv_refill_one(conn, recv, prefill); + if (ret) { + break; + } + + /* XXX when can this fail? */ + ret = ib_post_recv(ic->i_cm_id->qp, &recv->r_wr, &failed_wr); + rdsdebug("recv %p ibinc %p page %p addr %lu ret %d\n", recv, + recv->r_ibinc, sg_page(&recv->r_frag->f_sg), + (long) ib_sg_dma_address( + ic->i_cm_id->device, + &recv->r_frag->f_sg), + ret); + if (ret) { + rds_ib_conn_error(conn, "recv post on " + "%pI4 returned %d, disconnecting and " + "reconnecting\n", &conn->c_faddr, + ret); + break; + } + + posted++; + } + + /* We're doing flow control - update the window. */ + if (ic->i_flowctl && posted) + rds_ib_advertise_credits(conn, posted); + + if (ret) + rds_ib_ring_unalloc(&ic->i_recv_ring, 1); +} + +/* + * We want to recycle several types of recv allocations, like incs and frags. + * To use this, the *_free() function passes in the ptr to a list_head within + * the recyclee, as well as the cache to put it on. + * + * First, we put the memory on a percpu list. When this reaches a certain size, + * We move it to an intermediate non-percpu list in a lockless manner, with some + * xchg/compxchg wizardry. + * + * N.B. Instead of a list_head as the anchor, we use a single pointer, which can + * be NULL and xchg'd. The list is actually empty when the pointer is NULL, and + * list_empty() will return true with one element is actually present. + */ +static void rds_ib_recv_cache_put(struct list_head *new_item, + struct rds_ib_refill_cache *cache) +{ + unsigned long flags; + struct list_head *old, *chpfirst; + + local_irq_save(flags); + + chpfirst = __this_cpu_read(cache->percpu->first); + if (!chpfirst) + INIT_LIST_HEAD(new_item); + else /* put on front */ + list_add_tail(new_item, chpfirst); + + __this_cpu_write(cache->percpu->first, new_item); + __this_cpu_inc(cache->percpu->count); + + if (__this_cpu_read(cache->percpu->count) < RDS_IB_RECYCLE_BATCH_COUNT) + goto end; + + /* + * Return our per-cpu first list to the cache's xfer by atomically + * grabbing the current xfer list, appending it to our per-cpu list, + * and then atomically returning that entire list back to the + * cache's xfer list as long as it's still empty. + */ + do { + old = xchg(&cache->xfer, NULL); + if (old) + list_splice_entire_tail(old, chpfirst); + old = cmpxchg(&cache->xfer, NULL, chpfirst); + } while (old); + + + __this_cpu_write(cache->percpu->first, NULL); + __this_cpu_write(cache->percpu->count, 0); +end: + local_irq_restore(flags); +} + +static struct list_head *rds_ib_recv_cache_get(struct rds_ib_refill_cache *cache) +{ + struct list_head *head = cache->ready; + + if (head) { + if (!list_empty(head)) { + cache->ready = head->next; + list_del_init(head); + } else + cache->ready = NULL; + } + + return head; +} + +int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iovec *first_iov, + size_t size) +{ + struct rds_ib_incoming *ibinc; + struct rds_page_frag *frag; + struct iovec *iov = first_iov; + unsigned long to_copy; + unsigned long frag_off = 0; + unsigned long iov_off = 0; + int copied = 0; + int ret; + u32 len; + + ibinc = container_of(inc, struct rds_ib_incoming, ii_inc); + frag = list_entry(ibinc->ii_frags.next, struct rds_page_frag, f_item); + len = be32_to_cpu(inc->i_hdr.h_len); + + while (copied < size && copied < len) { + if (frag_off == RDS_FRAG_SIZE) { + frag = list_entry(frag->f_item.next, + struct rds_page_frag, f_item); + frag_off = 0; + } + while (iov_off == iov->iov_len) { + iov_off = 0; + iov++; + } + + to_copy = min(iov->iov_len - iov_off, RDS_FRAG_SIZE - frag_off); + to_copy = min_t(size_t, to_copy, size - copied); + to_copy = min_t(unsigned long, to_copy, len - copied); + + rdsdebug("%lu bytes to user [%p, %zu] + %lu from frag " + "[%p, %u] + %lu\n", + to_copy, iov->iov_base, iov->iov_len, iov_off, + sg_page(&frag->f_sg), frag->f_sg.offset, frag_off); + + /* XXX needs + offset for multiple recvs per page */ + ret = rds_page_copy_to_user(sg_page(&frag->f_sg), + frag->f_sg.offset + frag_off, + iov->iov_base + iov_off, + to_copy); + if (ret) { + copied = ret; + break; + } + + iov_off += to_copy; + frag_off += to_copy; + copied += to_copy; + } + + return copied; +} + +/* ic starts out kzalloc()ed */ +void rds_ib_recv_init_ack(struct rds_ib_connection *ic) +{ + struct ib_send_wr *wr = &ic->i_ack_wr; + struct ib_sge *sge = &ic->i_ack_sge; + + sge->addr = ic->i_ack_dma; + sge->length = sizeof(struct rds_header); + sge->lkey = ic->i_mr->lkey; + + wr->sg_list = sge; + wr->num_sge = 1; + wr->opcode = IB_WR_SEND; + wr->wr_id = RDS_IB_ACK_WR_ID; + wr->send_flags = IB_SEND_SIGNALED | IB_SEND_SOLICITED; +} + +/* + * You'd think that with reliable IB connections you wouldn't need to ack + * messages that have been received. The problem is that IB hardware generates + * an ack message before it has DMAed the message into memory. This creates a + * potential message loss if the HCA is disabled for any reason between when it + * sends the ack and before the message is DMAed and processed. This is only a + * potential issue if another HCA is available for fail-over. + * + * When the remote host receives our ack they'll free the sent message from + * their send queue. To decrease the latency of this we always send an ack + * immediately after we've received messages. + * + * For simplicity, we only have one ack in flight at a time. This puts + * pressure on senders to have deep enough send queues to absorb the latency of + * a single ack frame being in flight. This might not be good enough. + * + * This is implemented by have a long-lived send_wr and sge which point to a + * statically allocated ack frame. This ack wr does not fall under the ring + * accounting that the tx and rx wrs do. The QP attribute specifically makes + * room for it beyond the ring size. Send completion notices its special + * wr_id and avoids working with the ring in that case. + */ +#ifndef KERNEL_HAS_ATOMIC64 +static void rds_ib_set_ack(struct rds_ib_connection *ic, u64 seq, + int ack_required) +{ + unsigned long flags; + + spin_lock_irqsave(&ic->i_ack_lock, flags); + ic->i_ack_next = seq; + if (ack_required) + set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); + spin_unlock_irqrestore(&ic->i_ack_lock, flags); +} + +static u64 rds_ib_get_ack(struct rds_ib_connection *ic) +{ + unsigned long flags; + u64 seq; + + clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); + + spin_lock_irqsave(&ic->i_ack_lock, flags); + seq = ic->i_ack_next; + spin_unlock_irqrestore(&ic->i_ack_lock, flags); + + return seq; +} +#else +static void rds_ib_set_ack(struct rds_ib_connection *ic, u64 seq, + int ack_required) +{ + atomic64_set(&ic->i_ack_next, seq); + if (ack_required) { + smp_mb__before_atomic(); + set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); + } +} + +static u64 rds_ib_get_ack(struct rds_ib_connection *ic) +{ + clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); + smp_mb__after_atomic(); + + return atomic64_read(&ic->i_ack_next); +} +#endif + + +static void rds_ib_send_ack(struct rds_ib_connection *ic, unsigned int adv_credits) +{ + struct rds_header *hdr = ic->i_ack; + struct ib_send_wr *failed_wr; + u64 seq; + int ret; + + seq = rds_ib_get_ack(ic); + + rdsdebug("send_ack: ic %p ack %llu\n", ic, (unsigned long long) seq); + rds_message_populate_header(hdr, 0, 0, 0); + hdr->h_ack = cpu_to_be64(seq); + hdr->h_credit = adv_credits; + rds_message_make_checksum(hdr); + ic->i_ack_queued = jiffies; + + ret = ib_post_send(ic->i_cm_id->qp, &ic->i_ack_wr, &failed_wr); + if (unlikely(ret)) { + /* Failed to send. Release the WR, and + * force another ACK. + */ + clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags); + set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); + + rds_ib_stats_inc(s_ib_ack_send_failure); + + rds_ib_conn_error(ic->conn, "sending ack failed\n"); + } else + rds_ib_stats_inc(s_ib_ack_sent); +} + +/* + * There are 3 ways of getting acknowledgements to the peer: + * 1. We call rds_ib_attempt_ack from the recv completion handler + * to send an ACK-only frame. + * However, there can be only one such frame in the send queue + * at any time, so we may have to postpone it. + * 2. When another (data) packet is transmitted while there's + * an ACK in the queue, we piggyback the ACK sequence number + * on the data packet. + * 3. If the ACK WR is done sending, we get called from the + * send queue completion handler, and check whether there's + * another ACK pending (postponed because the WR was on the + * queue). If so, we transmit it. + * + * We maintain 2 variables: + * - i_ack_flags, which keeps track of whether the ACK WR + * is currently in the send queue or not (IB_ACK_IN_FLIGHT) + * - i_ack_next, which is the last sequence number we received + * + * Potentially, send queue and receive queue handlers can run concurrently. + * It would be nice to not have to use a spinlock to synchronize things, + * but the one problem that rules this out is that 64bit updates are + * not atomic on all platforms. Things would be a lot simpler if + * we had atomic64 or maybe cmpxchg64 everywhere. + * + * Reconnecting complicates this picture just slightly. When we + * reconnect, we may be seeing duplicate packets. The peer + * is retransmitting them, because it hasn't seen an ACK for + * them. It is important that we ACK these. + * + * ACK mitigation adds a header flag "ACK_REQUIRED"; any packet with + * this flag set *MUST* be acknowledged immediately. + */ + +/* + * When we get here, we're called from the recv queue handler. + * Check whether we ought to transmit an ACK. + */ +void rds_ib_attempt_ack(struct rds_ib_connection *ic) +{ + unsigned int adv_credits; + + if (!test_bit(IB_ACK_REQUESTED, &ic->i_ack_flags)) + return; + + if (test_and_set_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags)) { + rds_ib_stats_inc(s_ib_ack_send_delayed); + return; + } + + /* Can we get a send credit? */ + if (!rds_ib_send_grab_credits(ic, 1, &adv_credits, 0, RDS_MAX_ADV_CREDIT)) { + rds_ib_stats_inc(s_ib_tx_throttle); + clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags); + return; + } + + clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); + rds_ib_send_ack(ic, adv_credits); +} + +/* + * We get here from the send completion handler, when the + * adapter tells us the ACK frame was sent. + */ +void rds_ib_ack_send_complete(struct rds_ib_connection *ic) +{ + clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags); + rds_ib_attempt_ack(ic); +} + +/* + * This is called by the regular xmit code when it wants to piggyback + * an ACK on an outgoing frame. + */ +u64 rds_ib_piggyb_ack(struct rds_ib_connection *ic) +{ + if (test_and_clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags)) + rds_ib_stats_inc(s_ib_ack_send_piggybacked); + return rds_ib_get_ack(ic); +} + +/* + * It's kind of lame that we're copying from the posted receive pages into + * long-lived bitmaps. We could have posted the bitmaps and rdma written into + * them. But receiving new congestion bitmaps should be a *rare* event, so + * hopefully we won't need to invest that complexity in making it more + * efficient. By copying we can share a simpler core with TCP which has to + * copy. + */ +static void rds_ib_cong_recv(struct rds_connection *conn, + struct rds_ib_incoming *ibinc) +{ + struct rds_cong_map *map; + unsigned int map_off; + unsigned int map_page; + struct rds_page_frag *frag; + unsigned long frag_off; + unsigned long to_copy; + unsigned long copied; + uint64_t uncongested = 0; + void *addr; + + /* catch completely corrupt packets */ + if (be32_to_cpu(ibinc->ii_inc.i_hdr.h_len) != RDS_CONG_MAP_BYTES) + return; + + map = conn->c_fcong; + map_page = 0; + map_off = 0; + + frag = list_entry(ibinc->ii_frags.next, struct rds_page_frag, f_item); + frag_off = 0; + + copied = 0; + + while (copied < RDS_CONG_MAP_BYTES) { + uint64_t *src, *dst; + unsigned int k; + + to_copy = min(RDS_FRAG_SIZE - frag_off, PAGE_SIZE - map_off); + BUG_ON(to_copy & 7); /* Must be 64bit aligned. */ + + addr = kmap_atomic(sg_page(&frag->f_sg)); + + src = addr + frag_off; + dst = (void *)map->m_page_addrs[map_page] + map_off; + for (k = 0; k < to_copy; k += 8) { + /* Record ports that became uncongested, ie + * bits that changed from 0 to 1. */ + uncongested |= ~(*src) & *dst; + *dst++ = *src++; + } + kunmap_atomic(addr); + + copied += to_copy; + + map_off += to_copy; + if (map_off == PAGE_SIZE) { + map_off = 0; + map_page++; + } + + frag_off += to_copy; + if (frag_off == RDS_FRAG_SIZE) { + frag = list_entry(frag->f_item.next, + struct rds_page_frag, f_item); + frag_off = 0; + } + } + + /* the congestion map is in little endian order */ + uncongested = le64_to_cpu(uncongested); + + rds_cong_map_updated(map, uncongested); +} + +/* + * Rings are posted with all the allocations they'll need to queue the + * incoming message to the receiving socket so this can't fail. + * All fragments start with a header, so we can make sure we're not receiving + * garbage, and we can tell a small 8 byte fragment from an ACK frame. + */ +struct rds_ib_ack_state { + u64 ack_next; + u64 ack_recv; + unsigned int ack_required:1; + unsigned int ack_next_valid:1; + unsigned int ack_recv_valid:1; +}; + +static void rds_ib_process_recv(struct rds_connection *conn, + struct rds_ib_recv_work *recv, u32 data_len, + struct rds_ib_ack_state *state) +{ + struct rds_ib_connection *ic = conn->c_transport_data; + struct rds_ib_incoming *ibinc = ic->i_ibinc; + struct rds_header *ihdr, *hdr; + + /* XXX shut down the connection if port 0,0 are seen? */ + + rdsdebug("ic %p ibinc %p recv %p byte len %u\n", ic, ibinc, recv, + data_len); + + if (data_len < sizeof(struct rds_header)) { + rds_ib_conn_error(conn, "incoming message " + "from %pI4 didn't include a " + "header, disconnecting and " + "reconnecting\n", + &conn->c_faddr); + return; + } + data_len -= sizeof(struct rds_header); + + ihdr = &ic->i_recv_hdrs[recv - ic->i_recvs]; + + /* Validate the checksum. */ + if (!rds_message_verify_checksum(ihdr)) { + rds_ib_conn_error(conn, "incoming message " + "from %pI4 has corrupted header - " + "forcing a reconnect\n", + &conn->c_faddr); + rds_stats_inc(s_recv_drop_bad_checksum); + return; + } + + /* Process the ACK sequence which comes with every packet */ + state->ack_recv = be64_to_cpu(ihdr->h_ack); + state->ack_recv_valid = 1; + + /* Process the credits update if there was one */ + if (ihdr->h_credit) + rds_ib_send_add_credits(conn, ihdr->h_credit); + + if (ihdr->h_sport == 0 && ihdr->h_dport == 0 && data_len == 0) { + /* This is an ACK-only packet. The fact that it gets + * special treatment here is that historically, ACKs + * were rather special beasts. + */ + rds_ib_stats_inc(s_ib_ack_received); + + /* + * Usually the frags make their way on to incs and are then freed as + * the inc is freed. We don't go that route, so we have to drop the + * page ref ourselves. We can't just leave the page on the recv + * because that confuses the dma mapping of pages and each recv's use + * of a partial page. + * + * FIXME: Fold this into the code path below. + */ + rds_ib_frag_free(ic, recv->r_frag); + recv->r_frag = NULL; + return; + } + + /* + * If we don't already have an inc on the connection then this + * fragment has a header and starts a message.. copy its header + * into the inc and save the inc so we can hang upcoming fragments + * off its list. + */ + if (!ibinc) { + ibinc = recv->r_ibinc; + recv->r_ibinc = NULL; + ic->i_ibinc = ibinc; + + hdr = &ibinc->ii_inc.i_hdr; + memcpy(hdr, ihdr, sizeof(*hdr)); + ic->i_recv_data_rem = be32_to_cpu(hdr->h_len); + + rdsdebug("ic %p ibinc %p rem %u flag 0x%x\n", ic, ibinc, + ic->i_recv_data_rem, hdr->h_flags); + } else { + hdr = &ibinc->ii_inc.i_hdr; + /* We can't just use memcmp here; fragments of a + * single message may carry different ACKs */ + if (hdr->h_sequence != ihdr->h_sequence || + hdr->h_len != ihdr->h_len || + hdr->h_sport != ihdr->h_sport || + hdr->h_dport != ihdr->h_dport) { + rds_ib_conn_error(conn, + "fragment header mismatch; forcing reconnect\n"); + return; + } + } + + list_add_tail(&recv->r_frag->f_item, &ibinc->ii_frags); + recv->r_frag = NULL; + + if (ic->i_recv_data_rem > RDS_FRAG_SIZE) + ic->i_recv_data_rem -= RDS_FRAG_SIZE; + else { + ic->i_recv_data_rem = 0; + ic->i_ibinc = NULL; + + if (ibinc->ii_inc.i_hdr.h_flags == RDS_FLAG_CONG_BITMAP) + rds_ib_cong_recv(conn, ibinc); + else { + rds_recv_incoming(conn, conn->c_faddr, conn->c_laddr, + &ibinc->ii_inc, GFP_ATOMIC); + state->ack_next = be64_to_cpu(hdr->h_sequence); + state->ack_next_valid = 1; + } + + /* Evaluate the ACK_REQUIRED flag *after* we received + * the complete frame, and after bumping the next_rx + * sequence. */ + if (hdr->h_flags & RDS_FLAG_ACK_REQUIRED) { + rds_stats_inc(s_recv_ack_required); + state->ack_required = 1; + } + + rds_inc_put(&ibinc->ii_inc); + } +} + +/* + * Plucking the oldest entry from the ring can be done concurrently with + * the thread refilling the ring. Each ring operation is protected by + * spinlocks and the transient state of refilling doesn't change the + * recording of which entry is oldest. + * + * This relies on IB only calling one cq comp_handler for each cq so that + * there will only be one caller of rds_recv_incoming() per RDS connection. + */ +void rds_ib_recv_cq_comp_handler(struct ib_cq *cq, void *context) +{ + struct rds_connection *conn = context; + struct rds_ib_connection *ic = conn->c_transport_data; + + rdsdebug("conn %p cq %p\n", conn, cq); + + rds_ib_stats_inc(s_ib_rx_cq_call); + + tasklet_schedule(&ic->i_recv_tasklet); +} + +static inline void rds_poll_cq(struct rds_ib_connection *ic, + struct rds_ib_ack_state *state) +{ + struct rds_connection *conn = ic->conn; + struct ib_wc wc; + struct rds_ib_recv_work *recv; + + while (ib_poll_cq(ic->i_recv_cq, 1, &wc) > 0) { + rdsdebug("wc wr_id 0x%llx status %u (%s) byte_len %u imm_data %u\n", + (unsigned long long)wc.wr_id, wc.status, + rds_ib_wc_status_str(wc.status), wc.byte_len, + be32_to_cpu(wc.ex.imm_data)); + rds_ib_stats_inc(s_ib_rx_cq_event); + + recv = &ic->i_recvs[rds_ib_ring_oldest(&ic->i_recv_ring)]; + + ib_dma_unmap_sg(ic->i_cm_id->device, &recv->r_frag->f_sg, 1, DMA_FROM_DEVICE); + + /* + * Also process recvs in connecting state because it is possible + * to get a recv completion _before_ the rdmacm ESTABLISHED + * event is processed. + */ + if (wc.status == IB_WC_SUCCESS) { + rds_ib_process_recv(conn, recv, wc.byte_len, state); + } else { + /* We expect errors as the qp is drained during shutdown */ + if (rds_conn_up(conn) || rds_conn_connecting(conn)) + rds_ib_conn_error(conn, "recv completion on %pI4 had " + "status %u (%s), disconnecting and " + "reconnecting\n", &conn->c_faddr, + wc.status, + rds_ib_wc_status_str(wc.status)); + } + + /* + * It's very important that we only free this ring entry if we've truly + * freed the resources allocated to the entry. The refilling path can + * leak if we don't. + */ + rds_ib_ring_free(&ic->i_recv_ring, 1); + } +} + +void rds_ib_recv_tasklet_fn(unsigned long data) +{ + struct rds_ib_connection *ic = (struct rds_ib_connection *) data; + struct rds_connection *conn = ic->conn; + struct rds_ib_ack_state state = { 0, }; + + rds_poll_cq(ic, &state); + ib_req_notify_cq(ic->i_recv_cq, IB_CQ_SOLICITED); + rds_poll_cq(ic, &state); + + if (state.ack_next_valid) + rds_ib_set_ack(ic, state.ack_next, state.ack_required); + if (state.ack_recv_valid && state.ack_recv > ic->i_ack_recv) { + rds_send_drop_acked(conn, state.ack_recv, NULL); + ic->i_ack_recv = state.ack_recv; + } + if (rds_conn_up(conn)) + rds_ib_attempt_ack(ic); + + /* If we ever end up with a really empty receive ring, we're + * in deep trouble, as the sender will definitely see RNR + * timeouts. */ + if (rds_ib_ring_empty(&ic->i_recv_ring)) + rds_ib_stats_inc(s_ib_rx_ring_empty); + + if (rds_ib_ring_low(&ic->i_recv_ring)) + rds_ib_recv_refill(conn, 0); +} + +int rds_ib_recv(struct rds_connection *conn) +{ + struct rds_ib_connection *ic = conn->c_transport_data; + int ret = 0; + + rdsdebug("conn %p\n", conn); + if (rds_conn_up(conn)) + rds_ib_attempt_ack(ic); + + return ret; +} + +int rds_ib_recv_init(void) +{ + struct sysinfo si; + int ret = -ENOMEM; + + /* Default to 30% of all available RAM for recv memory */ + si_meminfo(&si); + rds_ib_sysctl_max_recv_allocation = si.totalram / 3 * PAGE_SIZE / RDS_FRAG_SIZE; + + rds_ib_incoming_slab = kmem_cache_create("rds_ib_incoming", + sizeof(struct rds_ib_incoming), + 0, SLAB_HWCACHE_ALIGN, NULL); + if (!rds_ib_incoming_slab) + goto out; + + rds_ib_frag_slab = kmem_cache_create("rds_ib_frag", + sizeof(struct rds_page_frag), + 0, SLAB_HWCACHE_ALIGN, NULL); + if (!rds_ib_frag_slab) + kmem_cache_destroy(rds_ib_incoming_slab); + else + ret = 0; +out: + return ret; +} + +void rds_ib_recv_exit(void) +{ + kmem_cache_destroy(rds_ib_incoming_slab); + kmem_cache_destroy(rds_ib_frag_slab); +} diff --git a/net/rds/ib_ring.c b/net/rds/ib_ring.c new file mode 100644 index 0000000..ff97e8e --- /dev/null +++ b/net/rds/ib_ring.c @@ -0,0 +1,168 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include + +#include "rds.h" +#include "ib.h" + +/* + * Locking for IB rings. + * We assume that allocation is always protected by a mutex + * in the caller (this is a valid assumption for the current + * implementation). + * + * Freeing always happens in an interrupt, and hence only + * races with allocations, but not with other free()s. + * + * The interaction between allocation and freeing is that + * the alloc code has to determine the number of free entries. + * To this end, we maintain two counters; an allocation counter + * and a free counter. Both are allowed to run freely, and wrap + * around. + * The number of used entries is always (alloc_ctr - free_ctr) % NR. + * + * The current implementation makes free_ctr atomic. When the + * caller finds an allocation fails, it should set an "alloc fail" + * bit and retry the allocation. The "alloc fail" bit essentially tells + * the CQ completion handlers to wake it up after freeing some + * more entries. + */ + +/* + * This only happens on shutdown. + */ +DECLARE_WAIT_QUEUE_HEAD(rds_ib_ring_empty_wait); + +void rds_ib_ring_init(struct rds_ib_work_ring *ring, u32 nr) +{ + memset(ring, 0, sizeof(*ring)); + ring->w_nr = nr; + rdsdebug("ring %p nr %u\n", ring, ring->w_nr); +} + +static inline u32 __rds_ib_ring_used(struct rds_ib_work_ring *ring) +{ + u32 diff; + + /* This assumes that atomic_t has at least as many bits as u32 */ + diff = ring->w_alloc_ctr - (u32) atomic_read(&ring->w_free_ctr); + BUG_ON(diff > ring->w_nr); + + return diff; +} + +void rds_ib_ring_resize(struct rds_ib_work_ring *ring, u32 nr) +{ + /* We only ever get called from the connection setup code, + * prior to creating the QP. */ + BUG_ON(__rds_ib_ring_used(ring)); + ring->w_nr = nr; +} + +static int __rds_ib_ring_empty(struct rds_ib_work_ring *ring) +{ + return __rds_ib_ring_used(ring) == 0; +} + +u32 rds_ib_ring_alloc(struct rds_ib_work_ring *ring, u32 val, u32 *pos) +{ + u32 ret = 0, avail; + + avail = ring->w_nr - __rds_ib_ring_used(ring); + + rdsdebug("ring %p val %u next %u free %u\n", ring, val, + ring->w_alloc_ptr, avail); + + if (val && avail) { + ret = min(val, avail); + *pos = ring->w_alloc_ptr; + + ring->w_alloc_ptr = (ring->w_alloc_ptr + ret) % ring->w_nr; + ring->w_alloc_ctr += ret; + } + + return ret; +} + +void rds_ib_ring_free(struct rds_ib_work_ring *ring, u32 val) +{ + ring->w_free_ptr = (ring->w_free_ptr + val) % ring->w_nr; + atomic_add(val, &ring->w_free_ctr); + + if (__rds_ib_ring_empty(ring) && + waitqueue_active(&rds_ib_ring_empty_wait)) + wake_up(&rds_ib_ring_empty_wait); +} + +void rds_ib_ring_unalloc(struct rds_ib_work_ring *ring, u32 val) +{ + ring->w_alloc_ptr = (ring->w_alloc_ptr - val) % ring->w_nr; + ring->w_alloc_ctr -= val; +} + +int rds_ib_ring_empty(struct rds_ib_work_ring *ring) +{ + return __rds_ib_ring_empty(ring); +} + +int rds_ib_ring_low(struct rds_ib_work_ring *ring) +{ + return __rds_ib_ring_used(ring) <= (ring->w_nr >> 1); +} + +/* + * returns the oldest alloced ring entry. This will be the next one + * freed. This can't be called if there are none allocated. + */ +u32 rds_ib_ring_oldest(struct rds_ib_work_ring *ring) +{ + return ring->w_free_ptr; +} + +/* + * returns the number of completed work requests. + */ + +u32 rds_ib_ring_completed(struct rds_ib_work_ring *ring, u32 wr_id, u32 oldest) +{ + u32 ret; + + if (oldest <= (unsigned long long)wr_id) + ret = (unsigned long long)wr_id - oldest + 1; + else + ret = ring->w_nr - oldest + (unsigned long long)wr_id + 1; + + rdsdebug("ring %p ret %u wr_id %u oldest %u\n", ring, ret, + wr_id, oldest); + return ret; +} diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c new file mode 100644 index 0000000..1dde91e --- /dev/null +++ b/net/rds/ib_send.c @@ -0,0 +1,1020 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include +#include +#include +#include +#include + +#include "rds.h" +#include "ib.h" + +static char *rds_ib_wc_status_strings[] = { +#define RDS_IB_WC_STATUS_STR(foo) \ + [IB_WC_##foo] = __stringify(IB_WC_##foo) + RDS_IB_WC_STATUS_STR(SUCCESS), + RDS_IB_WC_STATUS_STR(LOC_LEN_ERR), + RDS_IB_WC_STATUS_STR(LOC_QP_OP_ERR), + RDS_IB_WC_STATUS_STR(LOC_EEC_OP_ERR), + RDS_IB_WC_STATUS_STR(LOC_PROT_ERR), + RDS_IB_WC_STATUS_STR(WR_FLUSH_ERR), + RDS_IB_WC_STATUS_STR(MW_BIND_ERR), + RDS_IB_WC_STATUS_STR(BAD_RESP_ERR), + RDS_IB_WC_STATUS_STR(LOC_ACCESS_ERR), + RDS_IB_WC_STATUS_STR(REM_INV_REQ_ERR), + RDS_IB_WC_STATUS_STR(REM_ACCESS_ERR), + RDS_IB_WC_STATUS_STR(REM_OP_ERR), + RDS_IB_WC_STATUS_STR(RETRY_EXC_ERR), + RDS_IB_WC_STATUS_STR(RNR_RETRY_EXC_ERR), + RDS_IB_WC_STATUS_STR(LOC_RDD_VIOL_ERR), + RDS_IB_WC_STATUS_STR(REM_INV_RD_REQ_ERR), + RDS_IB_WC_STATUS_STR(REM_ABORT_ERR), + RDS_IB_WC_STATUS_STR(INV_EECN_ERR), + RDS_IB_WC_STATUS_STR(INV_EEC_STATE_ERR), + RDS_IB_WC_STATUS_STR(FATAL_ERR), + RDS_IB_WC_STATUS_STR(RESP_TIMEOUT_ERR), + RDS_IB_WC_STATUS_STR(GENERAL_ERR), +#undef RDS_IB_WC_STATUS_STR +}; + +char *rds_ib_wc_status_str(enum ib_wc_status status) +{ + return rds_str_array(rds_ib_wc_status_strings, + ARRAY_SIZE(rds_ib_wc_status_strings), status); +} + +/* + * Convert IB-specific error message to RDS error message and call core + * completion handler. + */ +static void rds_ib_send_complete(struct rds_message *rm, + int wc_status, + void (*complete)(struct rds_message *rm, int status)) +{ + int notify_status; + + switch (wc_status) { + case IB_WC_WR_FLUSH_ERR: + return; + + case IB_WC_SUCCESS: + notify_status = RDS_RDMA_SUCCESS; + break; + + case IB_WC_REM_ACCESS_ERR: + notify_status = RDS_RDMA_REMOTE_ERROR; + break; + + default: + notify_status = RDS_RDMA_OTHER_ERROR; + break; + } + complete(rm, notify_status); +} + +static void rds_ib_send_unmap_data(struct rds_ib_connection *ic, + struct rm_data_op *op, + int wc_status) +{ + if (op->op_nents) + ib_dma_unmap_sg(ic->i_cm_id->device, + op->op_sg, op->op_nents, + DMA_TO_DEVICE); +} + +static void rds_ib_send_unmap_rdma(struct rds_ib_connection *ic, + struct rm_rdma_op *op, + int wc_status) +{ + if (op->op_mapped) { + ib_dma_unmap_sg(ic->i_cm_id->device, + op->op_sg, op->op_nents, + op->op_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE); + op->op_mapped = 0; + } + + /* If the user asked for a completion notification on this + * message, we can implement three different semantics: + * 1. Notify when we received the ACK on the RDS message + * that was queued with the RDMA. This provides reliable + * notification of RDMA status at the expense of a one-way + * packet delay. + * 2. Notify when the IB stack gives us the completion event for + * the RDMA operation. + * 3. Notify when the IB stack gives us the completion event for + * the accompanying RDS messages. + * Here, we implement approach #3. To implement approach #2, + * we would need to take an event for the rdma WR. To implement #1, + * don't call rds_rdma_send_complete at all, and fall back to the notify + * handling in the ACK processing code. + * + * Note: There's no need to explicitly sync any RDMA buffers using + * ib_dma_sync_sg_for_cpu - the completion for the RDMA + * operation itself unmapped the RDMA buffers, which takes care + * of synching. + */ + rds_ib_send_complete(container_of(op, struct rds_message, rdma), + wc_status, rds_rdma_send_complete); + + if (op->op_write) + rds_stats_add(s_send_rdma_bytes, op->op_bytes); + else + rds_stats_add(s_recv_rdma_bytes, op->op_bytes); +} + +static void rds_ib_send_unmap_atomic(struct rds_ib_connection *ic, + struct rm_atomic_op *op, + int wc_status) +{ + /* unmap atomic recvbuf */ + if (op->op_mapped) { + ib_dma_unmap_sg(ic->i_cm_id->device, op->op_sg, 1, + DMA_FROM_DEVICE); + op->op_mapped = 0; + } + + rds_ib_send_complete(container_of(op, struct rds_message, atomic), + wc_status, rds_atomic_send_complete); + + if (op->op_type == RDS_ATOMIC_TYPE_CSWP) + rds_ib_stats_inc(s_ib_atomic_cswp); + else + rds_ib_stats_inc(s_ib_atomic_fadd); +} + +/* + * Unmap the resources associated with a struct send_work. + * + * Returns the rm for no good reason other than it is unobtainable + * other than by switching on wr.opcode, currently, and the caller, + * the event handler, needs it. + */ +static struct rds_message *rds_ib_send_unmap_op(struct rds_ib_connection *ic, + struct rds_ib_send_work *send, + int wc_status) +{ + struct rds_message *rm = NULL; + + /* In the error case, wc.opcode sometimes contains garbage */ + switch (send->s_wr.opcode) { + case IB_WR_SEND: + if (send->s_op) { + rm = container_of(send->s_op, struct rds_message, data); + rds_ib_send_unmap_data(ic, send->s_op, wc_status); + } + break; + case IB_WR_RDMA_WRITE: + case IB_WR_RDMA_READ: + if (send->s_op) { + rm = container_of(send->s_op, struct rds_message, rdma); + rds_ib_send_unmap_rdma(ic, send->s_op, wc_status); + } + break; + case IB_WR_ATOMIC_FETCH_AND_ADD: + case IB_WR_ATOMIC_CMP_AND_SWP: + if (send->s_op) { + rm = container_of(send->s_op, struct rds_message, atomic); + rds_ib_send_unmap_atomic(ic, send->s_op, wc_status); + } + break; + default: + printk_ratelimited(KERN_NOTICE + "RDS/IB: %s: unexpected opcode 0x%x in WR!\n", + __func__, send->s_wr.opcode); + break; + } + + send->s_wr.opcode = 0xdead; + + return rm; +} + +void rds_ib_send_init_ring(struct rds_ib_connection *ic) +{ + struct rds_ib_send_work *send; + u32 i; + + for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) { + struct ib_sge *sge; + + send->s_op = NULL; + + send->s_wr.wr_id = i; + send->s_wr.sg_list = send->s_sge; + send->s_wr.ex.imm_data = 0; + + sge = &send->s_sge[0]; + sge->addr = ic->i_send_hdrs_dma + (i * sizeof(struct rds_header)); + sge->length = sizeof(struct rds_header); + sge->lkey = ic->i_mr->lkey; + + send->s_sge[1].lkey = ic->i_mr->lkey; + } +} + +void rds_ib_send_clear_ring(struct rds_ib_connection *ic) +{ + struct rds_ib_send_work *send; + u32 i; + + for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) { + if (send->s_op && send->s_wr.opcode != 0xdead) + rds_ib_send_unmap_op(ic, send, IB_WC_WR_FLUSH_ERR); + } +} + +/* + * The only fast path caller always has a non-zero nr, so we don't + * bother testing nr before performing the atomic sub. + */ +static void rds_ib_sub_signaled(struct rds_ib_connection *ic, int nr) +{ + if ((atomic_sub_return(nr, &ic->i_signaled_sends) == 0) && + waitqueue_active(&rds_ib_ring_empty_wait)) + wake_up(&rds_ib_ring_empty_wait); + BUG_ON(atomic_read(&ic->i_signaled_sends) < 0); +} + +/* + * The _oldest/_free ring operations here race cleanly with the alloc/unalloc + * operations performed in the send path. As the sender allocs and potentially + * unallocs the next free entry in the ring it doesn't alter which is + * the next to be freed, which is what this is concerned with. + */ +void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context) +{ + struct rds_connection *conn = context; + struct rds_ib_connection *ic = conn->c_transport_data; + struct rds_message *rm = NULL; + struct ib_wc wc; + struct rds_ib_send_work *send; + u32 completed; + u32 oldest; + u32 i = 0; + int ret; + int nr_sig = 0; + + rdsdebug("cq %p conn %p\n", cq, conn); + rds_ib_stats_inc(s_ib_tx_cq_call); + ret = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); + if (ret) + rdsdebug("ib_req_notify_cq send failed: %d\n", ret); + + while (ib_poll_cq(cq, 1, &wc) > 0) { + rdsdebug("wc wr_id 0x%llx status %u (%s) byte_len %u imm_data %u\n", + (unsigned long long)wc.wr_id, wc.status, + rds_ib_wc_status_str(wc.status), wc.byte_len, + be32_to_cpu(wc.ex.imm_data)); + rds_ib_stats_inc(s_ib_tx_cq_event); + + if (wc.wr_id == RDS_IB_ACK_WR_ID) { + if (time_after(jiffies, ic->i_ack_queued + HZ/2)) + rds_ib_stats_inc(s_ib_tx_stalled); + rds_ib_ack_send_complete(ic); + continue; + } + + oldest = rds_ib_ring_oldest(&ic->i_send_ring); + + completed = rds_ib_ring_completed(&ic->i_send_ring, wc.wr_id, oldest); + + for (i = 0; i < completed; i++) { + send = &ic->i_sends[oldest]; + if (send->s_wr.send_flags & IB_SEND_SIGNALED) + nr_sig++; + + rm = rds_ib_send_unmap_op(ic, send, wc.status); + + if (time_after(jiffies, send->s_queued + HZ/2)) + rds_ib_stats_inc(s_ib_tx_stalled); + + if (send->s_op) { + if (send->s_op == rm->m_final_op) { + /* If anyone waited for this message to get flushed out, wake + * them up now */ + rds_message_unmapped(rm); + } + rds_message_put(rm); + send->s_op = NULL; + } + + oldest = (oldest + 1) % ic->i_send_ring.w_nr; + } + + rds_ib_ring_free(&ic->i_send_ring, completed); + rds_ib_sub_signaled(ic, nr_sig); + nr_sig = 0; + + if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags) || + test_bit(0, &conn->c_map_queued)) + queue_delayed_work(rds_wq, &conn->c_send_w, 0); + + /* We expect errors as the qp is drained during shutdown */ + if (wc.status != IB_WC_SUCCESS && rds_conn_up(conn)) { + rds_ib_conn_error(conn, "send completion on %pI4 had status " + "%u (%s), disconnecting and reconnecting\n", + &conn->c_faddr, wc.status, + rds_ib_wc_status_str(wc.status)); + } + } +} + +/* + * This is the main function for allocating credits when sending + * messages. + * + * Conceptually, we have two counters: + * - send credits: this tells us how many WRs we're allowed + * to submit without overruning the receiver's queue. For + * each SEND WR we post, we decrement this by one. + * + * - posted credits: this tells us how many WRs we recently + * posted to the receive queue. This value is transferred + * to the peer as a "credit update" in a RDS header field. + * Every time we transmit credits to the peer, we subtract + * the amount of transferred credits from this counter. + * + * It is essential that we avoid situations where both sides have + * exhausted their send credits, and are unable to send new credits + * to the peer. We achieve this by requiring that we send at least + * one credit update to the peer before exhausting our credits. + * When new credits arrive, we subtract one credit that is withheld + * until we've posted new buffers and are ready to transmit these + * credits (see rds_ib_send_add_credits below). + * + * The RDS send code is essentially single-threaded; rds_send_xmit + * sets RDS_IN_XMIT to ensure exclusive access to the send ring. + * However, the ACK sending code is independent and can race with + * message SENDs. + * + * In the send path, we need to update the counters for send credits + * and the counter of posted buffers atomically - when we use the + * last available credit, we cannot allow another thread to race us + * and grab the posted credits counter. Hence, we have to use a + * spinlock to protect the credit counter, or use atomics. + * + * Spinlocks shared between the send and the receive path are bad, + * because they create unnecessary delays. An early implementation + * using a spinlock showed a 5% degradation in throughput at some + * loads. + * + * This implementation avoids spinlocks completely, putting both + * counters into a single atomic, and updating that atomic using + * atomic_add (in the receive path, when receiving fresh credits), + * and using atomic_cmpxchg when updating the two counters. + */ +int rds_ib_send_grab_credits(struct rds_ib_connection *ic, + u32 wanted, u32 *adv_credits, int need_posted, int max_posted) +{ + unsigned int avail, posted, got = 0, advertise; + long oldval, newval; + + *adv_credits = 0; + if (!ic->i_flowctl) + return wanted; + +try_again: + advertise = 0; + oldval = newval = atomic_read(&ic->i_credits); + posted = IB_GET_POST_CREDITS(oldval); + avail = IB_GET_SEND_CREDITS(oldval); + + rdsdebug("rds_ib_send_grab_credits(%u): credits=%u posted=%u\n", + wanted, avail, posted); + + /* The last credit must be used to send a credit update. */ + if (avail && !posted) + avail--; + + if (avail < wanted) { + struct rds_connection *conn = ic->i_cm_id->context; + + /* Oops, there aren't that many credits left! */ + set_bit(RDS_LL_SEND_FULL, &conn->c_flags); + got = avail; + } else { + /* Sometimes you get what you want, lalala. */ + got = wanted; + } + newval -= IB_SET_SEND_CREDITS(got); + + /* + * If need_posted is non-zero, then the caller wants + * the posted regardless of whether any send credits are + * available. + */ + if (posted && (got || need_posted)) { + advertise = min_t(unsigned int, posted, max_posted); + newval -= IB_SET_POST_CREDITS(advertise); + } + + /* Finally bill everything */ + if (atomic_cmpxchg(&ic->i_credits, oldval, newval) != oldval) + goto try_again; + + *adv_credits = advertise; + return got; +} + +void rds_ib_send_add_credits(struct rds_connection *conn, unsigned int credits) +{ + struct rds_ib_connection *ic = conn->c_transport_data; + + if (credits == 0) + return; + + rdsdebug("rds_ib_send_add_credits(%u): current=%u%s\n", + credits, + IB_GET_SEND_CREDITS(atomic_read(&ic->i_credits)), + test_bit(RDS_LL_SEND_FULL, &conn->c_flags) ? ", ll_send_full" : ""); + + atomic_add(IB_SET_SEND_CREDITS(credits), &ic->i_credits); + if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags)) + queue_delayed_work(rds_wq, &conn->c_send_w, 0); + + WARN_ON(IB_GET_SEND_CREDITS(credits) >= 16384); + + rds_ib_stats_inc(s_ib_rx_credit_updates); +} + +void rds_ib_advertise_credits(struct rds_connection *conn, unsigned int posted) +{ + struct rds_ib_connection *ic = conn->c_transport_data; + + if (posted == 0) + return; + + atomic_add(IB_SET_POST_CREDITS(posted), &ic->i_credits); + + /* Decide whether to send an update to the peer now. + * If we would send a credit update for every single buffer we + * post, we would end up with an ACK storm (ACK arrives, + * consumes buffer, we refill the ring, send ACK to remote + * advertising the newly posted buffer... ad inf) + * + * Performance pretty much depends on how often we send + * credit updates - too frequent updates mean lots of ACKs. + * Too infrequent updates, and the peer will run out of + * credits and has to throttle. + * For the time being, 16 seems to be a good compromise. + */ + if (IB_GET_POST_CREDITS(atomic_read(&ic->i_credits)) >= 16) + set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); +} + +static inline int rds_ib_set_wr_signal_state(struct rds_ib_connection *ic, + struct rds_ib_send_work *send, + bool notify) +{ + /* + * We want to delay signaling completions just enough to get + * the batching benefits but not so much that we create dead time + * on the wire. + */ + if (ic->i_unsignaled_wrs-- == 0 || notify) { + ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs; + send->s_wr.send_flags |= IB_SEND_SIGNALED; + return 1; + } + return 0; +} + +/* + * This can be called multiple times for a given message. The first time + * we see a message we map its scatterlist into the IB device so that + * we can provide that mapped address to the IB scatter gather entries + * in the IB work requests. We translate the scatterlist into a series + * of work requests that fragment the message. These work requests complete + * in order so we pass ownership of the message to the completion handler + * once we send the final fragment. + * + * The RDS core uses the c_send_lock to only enter this function once + * per connection. This makes sure that the tx ring alloc/unalloc pairs + * don't get out of sync and confuse the ring. + */ +int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, + unsigned int hdr_off, unsigned int sg, unsigned int off) +{ + struct rds_ib_connection *ic = conn->c_transport_data; + struct ib_device *dev = ic->i_cm_id->device; + struct rds_ib_send_work *send = NULL; + struct rds_ib_send_work *first; + struct rds_ib_send_work *prev; + struct ib_send_wr *failed_wr; + struct scatterlist *scat; + u32 pos; + u32 i; + u32 work_alloc; + u32 credit_alloc = 0; + u32 posted; + u32 adv_credits = 0; + int send_flags = 0; + int bytes_sent = 0; + int ret; + int flow_controlled = 0; + int nr_sig = 0; + + BUG_ON(off % RDS_FRAG_SIZE); + BUG_ON(hdr_off != 0 && hdr_off != sizeof(struct rds_header)); + + /* Do not send cong updates to IB loopback */ + if (conn->c_loopback + && rm->m_inc.i_hdr.h_flags & RDS_FLAG_CONG_BITMAP) { + rds_cong_map_updated(conn->c_fcong, ~(u64) 0); + scat = &rm->data.op_sg[sg]; + ret = max_t(int, RDS_CONG_MAP_BYTES, scat->length); + return sizeof(struct rds_header) + ret; + } + + /* FIXME we may overallocate here */ + if (be32_to_cpu(rm->m_inc.i_hdr.h_len) == 0) + i = 1; + else + i = ceil(be32_to_cpu(rm->m_inc.i_hdr.h_len), RDS_FRAG_SIZE); + + work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, i, &pos); + if (work_alloc == 0) { + set_bit(RDS_LL_SEND_FULL, &conn->c_flags); + rds_ib_stats_inc(s_ib_tx_ring_full); + ret = -ENOMEM; + goto out; + } + + if (ic->i_flowctl) { + credit_alloc = rds_ib_send_grab_credits(ic, work_alloc, &posted, 0, RDS_MAX_ADV_CREDIT); + adv_credits += posted; + if (credit_alloc < work_alloc) { + rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - credit_alloc); + work_alloc = credit_alloc; + flow_controlled = 1; + } + if (work_alloc == 0) { + set_bit(RDS_LL_SEND_FULL, &conn->c_flags); + rds_ib_stats_inc(s_ib_tx_throttle); + ret = -ENOMEM; + goto out; + } + } + + /* map the message the first time we see it */ + if (!ic->i_data_op) { + if (rm->data.op_nents) { + rm->data.op_count = ib_dma_map_sg(dev, + rm->data.op_sg, + rm->data.op_nents, + DMA_TO_DEVICE); + rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->data.op_count); + if (rm->data.op_count == 0) { + rds_ib_stats_inc(s_ib_tx_sg_mapping_failure); + rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); + ret = -ENOMEM; /* XXX ? */ + goto out; + } + } else { + rm->data.op_count = 0; + } + + rds_message_addref(rm); + ic->i_data_op = &rm->data; + + /* Finalize the header */ + if (test_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags)) + rm->m_inc.i_hdr.h_flags |= RDS_FLAG_ACK_REQUIRED; + if (test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags)) + rm->m_inc.i_hdr.h_flags |= RDS_FLAG_RETRANSMITTED; + + /* If it has a RDMA op, tell the peer we did it. This is + * used by the peer to release use-once RDMA MRs. */ + if (rm->rdma.op_active) { + struct rds_ext_header_rdma ext_hdr; + + ext_hdr.h_rdma_rkey = cpu_to_be32(rm->rdma.op_rkey); + rds_message_add_extension(&rm->m_inc.i_hdr, + RDS_EXTHDR_RDMA, &ext_hdr, sizeof(ext_hdr)); + } + if (rm->m_rdma_cookie) { + rds_message_add_rdma_dest_extension(&rm->m_inc.i_hdr, + rds_rdma_cookie_key(rm->m_rdma_cookie), + rds_rdma_cookie_offset(rm->m_rdma_cookie)); + } + + /* Note - rds_ib_piggyb_ack clears the ACK_REQUIRED bit, so + * we should not do this unless we have a chance of at least + * sticking the header into the send ring. Which is why we + * should call rds_ib_ring_alloc first. */ + rm->m_inc.i_hdr.h_ack = cpu_to_be64(rds_ib_piggyb_ack(ic)); + rds_message_make_checksum(&rm->m_inc.i_hdr); + + /* + * Update adv_credits since we reset the ACK_REQUIRED bit. + */ + if (ic->i_flowctl) { + rds_ib_send_grab_credits(ic, 0, &posted, 1, RDS_MAX_ADV_CREDIT - adv_credits); + adv_credits += posted; + BUG_ON(adv_credits > 255); + } + } + + /* Sometimes you want to put a fence between an RDMA + * READ and the following SEND. + * We could either do this all the time + * or when requested by the user. Right now, we let + * the application choose. + */ + if (rm->rdma.op_active && rm->rdma.op_fence) + send_flags = IB_SEND_FENCE; + + /* Each frag gets a header. Msgs may be 0 bytes */ + send = &ic->i_sends[pos]; + first = send; + prev = NULL; + scat = &ic->i_data_op->op_sg[sg]; + i = 0; + do { + unsigned int len = 0; + + /* Set up the header */ + send->s_wr.send_flags = send_flags; + send->s_wr.opcode = IB_WR_SEND; + send->s_wr.num_sge = 1; + send->s_wr.next = NULL; + send->s_queued = jiffies; + send->s_op = NULL; + + send->s_sge[0].addr = ic->i_send_hdrs_dma + + (pos * sizeof(struct rds_header)); + send->s_sge[0].length = sizeof(struct rds_header); + + memcpy(&ic->i_send_hdrs[pos], &rm->m_inc.i_hdr, sizeof(struct rds_header)); + + /* Set up the data, if present */ + if (i < work_alloc + && scat != &rm->data.op_sg[rm->data.op_count]) { + len = min(RDS_FRAG_SIZE, ib_sg_dma_len(dev, scat) - off); + send->s_wr.num_sge = 2; + + send->s_sge[1].addr = ib_sg_dma_address(dev, scat) + off; + send->s_sge[1].length = len; + + bytes_sent += len; + off += len; + if (off == ib_sg_dma_len(dev, scat)) { + scat++; + off = 0; + } + } + + rds_ib_set_wr_signal_state(ic, send, 0); + + /* + * Always signal the last one if we're stopping due to flow control. + */ + if (ic->i_flowctl && flow_controlled && i == (work_alloc-1)) + send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; + + if (send->s_wr.send_flags & IB_SEND_SIGNALED) + nr_sig++; + + rdsdebug("send %p wr %p num_sge %u next %p\n", send, + &send->s_wr, send->s_wr.num_sge, send->s_wr.next); + + if (ic->i_flowctl && adv_credits) { + struct rds_header *hdr = &ic->i_send_hdrs[pos]; + + /* add credit and redo the header checksum */ + hdr->h_credit = adv_credits; + rds_message_make_checksum(hdr); + adv_credits = 0; + rds_ib_stats_inc(s_ib_tx_credit_updates); + } + + if (prev) + prev->s_wr.next = &send->s_wr; + prev = send; + + pos = (pos + 1) % ic->i_send_ring.w_nr; + send = &ic->i_sends[pos]; + i++; + + } while (i < work_alloc + && scat != &rm->data.op_sg[rm->data.op_count]); + + /* Account the RDS header in the number of bytes we sent, but just once. + * The caller has no concept of fragmentation. */ + if (hdr_off == 0) + bytes_sent += sizeof(struct rds_header); + + /* if we finished the message then send completion owns it */ + if (scat == &rm->data.op_sg[rm->data.op_count]) { + prev->s_op = ic->i_data_op; + prev->s_wr.send_flags |= IB_SEND_SOLICITED; + ic->i_data_op = NULL; + } + + /* Put back wrs & credits we didn't use */ + if (i < work_alloc) { + rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - i); + work_alloc = i; + } + if (ic->i_flowctl && i < credit_alloc) + rds_ib_send_add_credits(conn, credit_alloc - i); + + if (nr_sig) + atomic_add(nr_sig, &ic->i_signaled_sends); + + /* XXX need to worry about failed_wr and partial sends. */ + failed_wr = &first->s_wr; + ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr); + rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic, + first, &first->s_wr, ret, failed_wr); + BUG_ON(failed_wr != &first->s_wr); + if (ret) { + printk(KERN_WARNING "RDS/IB: ib_post_send to %pI4 " + "returned %d\n", &conn->c_faddr, ret); + rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); + rds_ib_sub_signaled(ic, nr_sig); + if (prev->s_op) { + ic->i_data_op = prev->s_op; + prev->s_op = NULL; + } + + rds_ib_conn_error(ic->conn, "ib_post_send failed\n"); + goto out; + } + + ret = bytes_sent; +out: + BUG_ON(adv_credits); + return ret; +} + +/* + * Issue atomic operation. + * A simplified version of the rdma case, we always map 1 SG, and + * only 8 bytes, for the return value from the atomic operation. + */ +int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op) +{ + struct rds_ib_connection *ic = conn->c_transport_data; + struct rds_ib_send_work *send = NULL; + struct ib_send_wr *failed_wr; + struct rds_ib_device *rds_ibdev; + u32 pos; + u32 work_alloc; + int ret; + int nr_sig = 0; + + rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client); + + work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, 1, &pos); + if (work_alloc != 1) { + rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); + rds_ib_stats_inc(s_ib_tx_ring_full); + ret = -ENOMEM; + goto out; + } + + /* address of send request in ring */ + send = &ic->i_sends[pos]; + send->s_queued = jiffies; + + if (op->op_type == RDS_ATOMIC_TYPE_CSWP) { + send->s_wr.opcode = IB_WR_MASKED_ATOMIC_CMP_AND_SWP; + send->s_wr.wr.atomic.compare_add = op->op_m_cswp.compare; + send->s_wr.wr.atomic.swap = op->op_m_cswp.swap; + send->s_wr.wr.atomic.compare_add_mask = op->op_m_cswp.compare_mask; + send->s_wr.wr.atomic.swap_mask = op->op_m_cswp.swap_mask; + } else { /* FADD */ + send->s_wr.opcode = IB_WR_MASKED_ATOMIC_FETCH_AND_ADD; + send->s_wr.wr.atomic.compare_add = op->op_m_fadd.add; + send->s_wr.wr.atomic.swap = 0; + send->s_wr.wr.atomic.compare_add_mask = op->op_m_fadd.nocarry_mask; + send->s_wr.wr.atomic.swap_mask = 0; + } + nr_sig = rds_ib_set_wr_signal_state(ic, send, op->op_notify); + send->s_wr.num_sge = 1; + send->s_wr.next = NULL; + send->s_wr.wr.atomic.remote_addr = op->op_remote_addr; + send->s_wr.wr.atomic.rkey = op->op_rkey; + send->s_op = op; + rds_message_addref(container_of(send->s_op, struct rds_message, atomic)); + + /* map 8 byte retval buffer to the device */ + ret = ib_dma_map_sg(ic->i_cm_id->device, op->op_sg, 1, DMA_FROM_DEVICE); + rdsdebug("ic %p mapping atomic op %p. mapped %d pg\n", ic, op, ret); + if (ret != 1) { + rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); + rds_ib_stats_inc(s_ib_tx_sg_mapping_failure); + ret = -ENOMEM; /* XXX ? */ + goto out; + } + + /* Convert our struct scatterlist to struct ib_sge */ + send->s_sge[0].addr = ib_sg_dma_address(ic->i_cm_id->device, op->op_sg); + send->s_sge[0].length = ib_sg_dma_len(ic->i_cm_id->device, op->op_sg); + send->s_sge[0].lkey = ic->i_mr->lkey; + + rdsdebug("rva %Lx rpa %Lx len %u\n", op->op_remote_addr, + send->s_sge[0].addr, send->s_sge[0].length); + + if (nr_sig) + atomic_add(nr_sig, &ic->i_signaled_sends); + + failed_wr = &send->s_wr; + ret = ib_post_send(ic->i_cm_id->qp, &send->s_wr, &failed_wr); + rdsdebug("ic %p send %p (wr %p) ret %d wr %p\n", ic, + send, &send->s_wr, ret, failed_wr); + BUG_ON(failed_wr != &send->s_wr); + if (ret) { + printk(KERN_WARNING "RDS/IB: atomic ib_post_send to %pI4 " + "returned %d\n", &conn->c_faddr, ret); + rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); + rds_ib_sub_signaled(ic, nr_sig); + goto out; + } + + if (unlikely(failed_wr != &send->s_wr)) { + printk(KERN_WARNING "RDS/IB: atomic ib_post_send() rc=%d, but failed_wqe updated!\n", ret); + BUG_ON(failed_wr != &send->s_wr); + } + +out: + return ret; +} + +int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op) +{ + struct rds_ib_connection *ic = conn->c_transport_data; + struct rds_ib_send_work *send = NULL; + struct rds_ib_send_work *first; + struct rds_ib_send_work *prev; + struct ib_send_wr *failed_wr; + struct scatterlist *scat; + unsigned long len; + u64 remote_addr = op->op_remote_addr; + u32 max_sge = ic->rds_ibdev->max_sge; + u32 pos; + u32 work_alloc; + u32 i; + u32 j; + int sent; + int ret; + int num_sge; + int nr_sig = 0; + + /* map the op the first time we see it */ + if (!op->op_mapped) { + op->op_count = ib_dma_map_sg(ic->i_cm_id->device, + op->op_sg, op->op_nents, (op->op_write) ? + DMA_TO_DEVICE : DMA_FROM_DEVICE); + rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->op_count); + if (op->op_count == 0) { + rds_ib_stats_inc(s_ib_tx_sg_mapping_failure); + ret = -ENOMEM; /* XXX ? */ + goto out; + } + + op->op_mapped = 1; + } + + /* + * Instead of knowing how to return a partial rdma read/write we insist that there + * be enough work requests to send the entire message. + */ + i = ceil(op->op_count, max_sge); + + work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, i, &pos); + if (work_alloc != i) { + rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); + rds_ib_stats_inc(s_ib_tx_ring_full); + ret = -ENOMEM; + goto out; + } + + send = &ic->i_sends[pos]; + first = send; + prev = NULL; + scat = &op->op_sg[0]; + sent = 0; + num_sge = op->op_count; + + for (i = 0; i < work_alloc && scat != &op->op_sg[op->op_count]; i++) { + send->s_wr.send_flags = 0; + send->s_queued = jiffies; + send->s_op = NULL; + + nr_sig += rds_ib_set_wr_signal_state(ic, send, op->op_notify); + + send->s_wr.opcode = op->op_write ? IB_WR_RDMA_WRITE : IB_WR_RDMA_READ; + send->s_wr.wr.rdma.remote_addr = remote_addr; + send->s_wr.wr.rdma.rkey = op->op_rkey; + + if (num_sge > max_sge) { + send->s_wr.num_sge = max_sge; + num_sge -= max_sge; + } else { + send->s_wr.num_sge = num_sge; + } + + send->s_wr.next = NULL; + + if (prev) + prev->s_wr.next = &send->s_wr; + + for (j = 0; j < send->s_wr.num_sge && scat != &op->op_sg[op->op_count]; j++) { + len = ib_sg_dma_len(ic->i_cm_id->device, scat); + send->s_sge[j].addr = + ib_sg_dma_address(ic->i_cm_id->device, scat); + send->s_sge[j].length = len; + send->s_sge[j].lkey = ic->i_mr->lkey; + + sent += len; + rdsdebug("ic %p sent %d remote_addr %llu\n", ic, sent, remote_addr); + + remote_addr += len; + scat++; + } + + rdsdebug("send %p wr %p num_sge %u next %p\n", send, + &send->s_wr, send->s_wr.num_sge, send->s_wr.next); + + prev = send; + if (++send == &ic->i_sends[ic->i_send_ring.w_nr]) + send = ic->i_sends; + } + + /* give a reference to the last op */ + if (scat == &op->op_sg[op->op_count]) { + prev->s_op = op; + rds_message_addref(container_of(op, struct rds_message, rdma)); + } + + if (i < work_alloc) { + rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - i); + work_alloc = i; + } + + if (nr_sig) + atomic_add(nr_sig, &ic->i_signaled_sends); + + failed_wr = &first->s_wr; + ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr); + rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic, + first, &first->s_wr, ret, failed_wr); + BUG_ON(failed_wr != &first->s_wr); + if (ret) { + printk(KERN_WARNING "RDS/IB: rdma ib_post_send to %pI4 " + "returned %d\n", &conn->c_faddr, ret); + rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); + rds_ib_sub_signaled(ic, nr_sig); + goto out; + } + + if (unlikely(failed_wr != &first->s_wr)) { + printk(KERN_WARNING "RDS/IB: ib_post_send() rc=%d, but failed_wqe updated!\n", ret); + BUG_ON(failed_wr != &first->s_wr); + } + + +out: + return ret; +} + +void rds_ib_xmit_complete(struct rds_connection *conn) +{ + struct rds_ib_connection *ic = conn->c_transport_data; + + /* We may have a pending ACK or window update we were unable + * to send previously (due to flow control). Try again. */ + rds_ib_attempt_ack(ic); +} diff --git a/net/rds/ib_stats.c b/net/rds/ib_stats.c new file mode 100644 index 0000000..2d5965d --- /dev/null +++ b/net/rds/ib_stats.c @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include +#include +#include + +#include "rds.h" +#include "ib.h" + +DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_ib_statistics, rds_ib_stats); + +static const char *const rds_ib_stat_names[] = { + "ib_connect_raced", + "ib_listen_closed_stale", + "ib_tx_cq_call", + "ib_tx_cq_event", + "ib_tx_ring_full", + "ib_tx_throttle", + "ib_tx_sg_mapping_failure", + "ib_tx_stalled", + "ib_tx_credit_updates", + "ib_rx_cq_call", + "ib_rx_cq_event", + "ib_rx_ring_empty", + "ib_rx_refill_from_cq", + "ib_rx_refill_from_thread", + "ib_rx_alloc_limit", + "ib_rx_credit_updates", + "ib_ack_sent", + "ib_ack_send_failure", + "ib_ack_send_delayed", + "ib_ack_send_piggybacked", + "ib_ack_received", + "ib_rdma_mr_alloc", + "ib_rdma_mr_free", + "ib_rdma_mr_used", + "ib_rdma_mr_pool_flush", + "ib_rdma_mr_pool_wait", + "ib_rdma_mr_pool_depleted", + "ib_atomic_cswp", + "ib_atomic_fadd", +}; + +unsigned int rds_ib_stats_info_copy(struct rds_info_iterator *iter, + unsigned int avail) +{ + struct rds_ib_statistics stats = {0, }; + uint64_t *src; + uint64_t *sum; + size_t i; + int cpu; + + if (avail < ARRAY_SIZE(rds_ib_stat_names)) + goto out; + + for_each_online_cpu(cpu) { + src = (uint64_t *)&(per_cpu(rds_ib_stats, cpu)); + sum = (uint64_t *)&stats; + for (i = 0; i < sizeof(stats) / sizeof(uint64_t); i++) + *(sum++) += *(src++); + } + + rds_stats_info_copy(iter, (uint64_t *)&stats, rds_ib_stat_names, + ARRAY_SIZE(rds_ib_stat_names)); +out: + return ARRAY_SIZE(rds_ib_stat_names); +} diff --git a/net/rds/ib_sysctl.c b/net/rds/ib_sysctl.c new file mode 100644 index 0000000..e4e41b3 --- /dev/null +++ b/net/rds/ib_sysctl.c @@ -0,0 +1,121 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include +#include +#include + +#include "ib.h" + +static struct ctl_table_header *rds_ib_sysctl_hdr; + +unsigned long rds_ib_sysctl_max_send_wr = RDS_IB_DEFAULT_SEND_WR; +unsigned long rds_ib_sysctl_max_recv_wr = RDS_IB_DEFAULT_RECV_WR; +unsigned long rds_ib_sysctl_max_recv_allocation = (128 * 1024 * 1024) / RDS_FRAG_SIZE; +static unsigned long rds_ib_sysctl_max_wr_min = 1; +/* hardware will fail CQ creation long before this */ +static unsigned long rds_ib_sysctl_max_wr_max = (u32)~0; + +unsigned long rds_ib_sysctl_max_unsig_wrs = 16; +static unsigned long rds_ib_sysctl_max_unsig_wr_min = 1; +static unsigned long rds_ib_sysctl_max_unsig_wr_max = 64; + +/* + * This sysctl does nothing. + * + * Backwards compatibility with RDS 3.0 wire protocol + * disables initial FC credit exchange. + * If it's ever possible to drop 3.0 support, + * setting this to 1 and moving init/refill of send/recv + * rings from ib_cm_connect_complete() back into ib_setup_qp() + * will cause credits to be added before protocol negotiation. + */ +unsigned int rds_ib_sysctl_flow_control = 0; + +static struct ctl_table rds_ib_sysctl_table[] = { + { + .procname = "max_send_wr", + .data = &rds_ib_sysctl_max_send_wr, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + .extra1 = &rds_ib_sysctl_max_wr_min, + .extra2 = &rds_ib_sysctl_max_wr_max, + }, + { + .procname = "max_recv_wr", + .data = &rds_ib_sysctl_max_recv_wr, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + .extra1 = &rds_ib_sysctl_max_wr_min, + .extra2 = &rds_ib_sysctl_max_wr_max, + }, + { + .procname = "max_unsignaled_wr", + .data = &rds_ib_sysctl_max_unsig_wrs, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + .extra1 = &rds_ib_sysctl_max_unsig_wr_min, + .extra2 = &rds_ib_sysctl_max_unsig_wr_max, + }, + { + .procname = "max_recv_allocation", + .data = &rds_ib_sysctl_max_recv_allocation, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, + { + .procname = "flow_control", + .data = &rds_ib_sysctl_flow_control, + .maxlen = sizeof(rds_ib_sysctl_flow_control), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { } +}; + +void rds_ib_sysctl_exit(void) +{ + if (rds_ib_sysctl_hdr) + unregister_net_sysctl_table(rds_ib_sysctl_hdr); +} + +int rds_ib_sysctl_init(void) +{ + rds_ib_sysctl_hdr = register_net_sysctl(&init_net, "net/rds/ib", rds_ib_sysctl_table); + if (!rds_ib_sysctl_hdr) + return -ENOMEM; + return 0; +} diff --git a/net/rds/info.c b/net/rds/info.c new file mode 100644 index 0000000..9a6b4f6 --- /dev/null +++ b/net/rds/info.c @@ -0,0 +1,243 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include +#include +#include +#include +#include + +#include "rds.h" + +/* + * This file implements a getsockopt() call which copies a set of fixed + * sized structs into a user-specified buffer as a means of providing + * read-only information about RDS. + * + * For a given information source there are a given number of fixed sized + * structs at a given time. The structs are only copied if the user-specified + * buffer is big enough. The destination pages that make up the buffer + * are pinned for the duration of the copy. + * + * This gives us the following benefits: + * + * - simple implementation, no copy "position" across multiple calls + * - consistent snapshot of an info source + * - atomic copy works well with whatever locking info source has + * - one portable tool to get rds info across implementations + * - long-lived tool can get info without allocating + * + * at the following costs: + * + * - info source copy must be pinned, may be "large" + */ + +struct rds_info_iterator { + struct page **pages; + void *addr; + unsigned long offset; +}; + +static DEFINE_SPINLOCK(rds_info_lock); +static rds_info_func rds_info_funcs[RDS_INFO_LAST - RDS_INFO_FIRST + 1]; + +void rds_info_register_func(int optname, rds_info_func func) +{ + int offset = optname - RDS_INFO_FIRST; + + BUG_ON(optname < RDS_INFO_FIRST || optname > RDS_INFO_LAST); + + spin_lock(&rds_info_lock); + BUG_ON(rds_info_funcs[offset]); + rds_info_funcs[offset] = func; + spin_unlock(&rds_info_lock); +} +EXPORT_SYMBOL_GPL(rds_info_register_func); + +void rds_info_deregister_func(int optname, rds_info_func func) +{ + int offset = optname - RDS_INFO_FIRST; + + BUG_ON(optname < RDS_INFO_FIRST || optname > RDS_INFO_LAST); + + spin_lock(&rds_info_lock); + BUG_ON(rds_info_funcs[offset] != func); + rds_info_funcs[offset] = NULL; + spin_unlock(&rds_info_lock); +} +EXPORT_SYMBOL_GPL(rds_info_deregister_func); + +/* + * Typically we hold an atomic kmap across multiple rds_info_copy() calls + * because the kmap is so expensive. This must be called before using blocking + * operations while holding the mapping and as the iterator is torn down. + */ +void rds_info_iter_unmap(struct rds_info_iterator *iter) +{ + if (iter->addr) { + kunmap_atomic(iter->addr); + iter->addr = NULL; + } +} + +/* + * get_user_pages() called flush_dcache_page() on the pages for us. + */ +void rds_info_copy(struct rds_info_iterator *iter, void *data, + unsigned long bytes) +{ + unsigned long this; + + while (bytes) { + if (!iter->addr) + iter->addr = kmap_atomic(*iter->pages); + + this = min(bytes, PAGE_SIZE - iter->offset); + + rdsdebug("page %p addr %p offset %lu this %lu data %p " + "bytes %lu\n", *iter->pages, iter->addr, + iter->offset, this, data, bytes); + + memcpy(iter->addr + iter->offset, data, this); + + data += this; + bytes -= this; + iter->offset += this; + + if (iter->offset == PAGE_SIZE) { + kunmap_atomic(iter->addr); + iter->addr = NULL; + iter->offset = 0; + iter->pages++; + } + } +} +EXPORT_SYMBOL_GPL(rds_info_copy); + +/* + * @optval points to the userspace buffer that the information snapshot + * will be copied into. + * + * @optlen on input is the size of the buffer in userspace. @optlen + * on output is the size of the requested snapshot in bytes. + * + * This function returns -errno if there is a failure, particularly -ENOSPC + * if the given userspace buffer was not large enough to fit the snapshot. + * On success it returns the positive number of bytes of each array element + * in the snapshot. + */ +int rds_info_getsockopt(struct socket *sock, int optname, char __user *optval, + int __user *optlen) +{ + struct rds_info_iterator iter; + struct rds_info_lengths lens; + unsigned long nr_pages = 0; + unsigned long start; + unsigned long i; + rds_info_func func; + struct page **pages = NULL; + int ret; + int len; + int total; + + if (get_user(len, optlen)) { + ret = -EFAULT; + goto out; + } + + /* check for all kinds of wrapping and the like */ + start = (unsigned long)optval; + if (len < 0 || len + PAGE_SIZE - 1 < len || start + len < start) { + ret = -EINVAL; + goto out; + } + + /* a 0 len call is just trying to probe its length */ + if (len == 0) + goto call_func; + + nr_pages = (PAGE_ALIGN(start + len) - (start & PAGE_MASK)) + >> PAGE_SHIFT; + + pages = kmalloc(nr_pages * sizeof(struct page *), GFP_KERNEL); + if (!pages) { + ret = -ENOMEM; + goto out; + } + ret = get_user_pages_fast(start, nr_pages, 1, pages); + if (ret != nr_pages) { + if (ret > 0) + nr_pages = ret; + else + nr_pages = 0; + ret = -EAGAIN; /* XXX ? */ + goto out; + } + + rdsdebug("len %d nr_pages %lu\n", len, nr_pages); + +call_func: + func = rds_info_funcs[optname - RDS_INFO_FIRST]; + if (!func) { + ret = -ENOPROTOOPT; + goto out; + } + + iter.pages = pages; + iter.addr = NULL; + iter.offset = start & (PAGE_SIZE - 1); + + func(sock, len, &iter, &lens); + BUG_ON(lens.each == 0); + + total = lens.nr * lens.each; + + rds_info_iter_unmap(&iter); + + if (total > len) { + len = total; + ret = -ENOSPC; + } else { + len = total; + ret = lens.each; + } + + if (put_user(len, optlen)) + ret = -EFAULT; + +out: + for (i = 0; pages && i < nr_pages; i++) + put_page(pages[i]); + kfree(pages); + + return ret; +} diff --git a/net/rds/info.h b/net/rds/info.h new file mode 100644 index 0000000..b6c052c --- /dev/null +++ b/net/rds/info.h @@ -0,0 +1,30 @@ +#ifndef _RDS_INFO_H +#define _RDS_INFO_H + +struct rds_info_lengths { + unsigned int nr; + unsigned int each; +}; + +struct rds_info_iterator; + +/* + * These functions must fill in the fields of @lens to reflect the size + * of the available info source. If the snapshot fits in @len then it + * should be copied using @iter. The caller will deduce if it was copied + * or not by comparing the lengths. + */ +typedef void (*rds_info_func)(struct socket *sock, unsigned int len, + struct rds_info_iterator *iter, + struct rds_info_lengths *lens); + +void rds_info_register_func(int optname, rds_info_func func); +void rds_info_deregister_func(int optname, rds_info_func func); +int rds_info_getsockopt(struct socket *sock, int optname, char __user *optval, + int __user *optlen); +void rds_info_copy(struct rds_info_iterator *iter, void *data, + unsigned long bytes); +void rds_info_iter_unmap(struct rds_info_iterator *iter); + + +#endif diff --git a/net/rds/iw.c b/net/rds/iw.c new file mode 100644 index 0000000..5899356 --- /dev/null +++ b/net/rds/iw.c @@ -0,0 +1,329 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "rds.h" +#include "iw.h" + +unsigned int fastreg_pool_size = RDS_FASTREG_POOL_SIZE; +unsigned int fastreg_message_size = RDS_FASTREG_SIZE + 1; /* +1 allows for unaligned MRs */ + +module_param(fastreg_pool_size, int, 0444); +MODULE_PARM_DESC(fastreg_pool_size, " Max number of fastreg MRs per device"); +module_param(fastreg_message_size, int, 0444); +MODULE_PARM_DESC(fastreg_message_size, " Max size of a RDMA transfer (fastreg MRs)"); + +struct list_head rds_iw_devices; + +/* NOTE: if also grabbing iwdev lock, grab this first */ +DEFINE_SPINLOCK(iw_nodev_conns_lock); +LIST_HEAD(iw_nodev_conns); + +static void rds_iw_add_one(struct ib_device *device) +{ + struct rds_iw_device *rds_iwdev; + struct ib_device_attr *dev_attr; + + /* Only handle iwarp devices */ + if (device->node_type != RDMA_NODE_RNIC) + return; + + dev_attr = kmalloc(sizeof *dev_attr, GFP_KERNEL); + if (!dev_attr) + return; + + if (ib_query_device(device, dev_attr)) { + rdsdebug("Query device failed for %s\n", device->name); + goto free_attr; + } + + rds_iwdev = kmalloc(sizeof *rds_iwdev, GFP_KERNEL); + if (!rds_iwdev) + goto free_attr; + + spin_lock_init(&rds_iwdev->spinlock); + + rds_iwdev->dma_local_lkey = !!(dev_attr->device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY); + rds_iwdev->max_wrs = dev_attr->max_qp_wr; + rds_iwdev->max_sge = min(dev_attr->max_sge, RDS_IW_MAX_SGE); + + rds_iwdev->dev = device; + rds_iwdev->pd = ib_alloc_pd(device); + if (IS_ERR(rds_iwdev->pd)) + goto free_dev; + + if (!rds_iwdev->dma_local_lkey) { + rds_iwdev->mr = ib_get_dma_mr(rds_iwdev->pd, + IB_ACCESS_REMOTE_READ | + IB_ACCESS_REMOTE_WRITE | + IB_ACCESS_LOCAL_WRITE); + if (IS_ERR(rds_iwdev->mr)) + goto err_pd; + } else + rds_iwdev->mr = NULL; + + rds_iwdev->mr_pool = rds_iw_create_mr_pool(rds_iwdev); + if (IS_ERR(rds_iwdev->mr_pool)) { + rds_iwdev->mr_pool = NULL; + goto err_mr; + } + + INIT_LIST_HEAD(&rds_iwdev->cm_id_list); + INIT_LIST_HEAD(&rds_iwdev->conn_list); + list_add_tail(&rds_iwdev->list, &rds_iw_devices); + + ib_set_client_data(device, &rds_iw_client, rds_iwdev); + + goto free_attr; + +err_mr: + if (rds_iwdev->mr) + ib_dereg_mr(rds_iwdev->mr); +err_pd: + ib_dealloc_pd(rds_iwdev->pd); +free_dev: + kfree(rds_iwdev); +free_attr: + kfree(dev_attr); +} + +static void rds_iw_remove_one(struct ib_device *device) +{ + struct rds_iw_device *rds_iwdev; + struct rds_iw_cm_id *i_cm_id, *next; + + rds_iwdev = ib_get_client_data(device, &rds_iw_client); + if (!rds_iwdev) + return; + + spin_lock_irq(&rds_iwdev->spinlock); + list_for_each_entry_safe(i_cm_id, next, &rds_iwdev->cm_id_list, list) { + list_del(&i_cm_id->list); + kfree(i_cm_id); + } + spin_unlock_irq(&rds_iwdev->spinlock); + + rds_iw_destroy_conns(rds_iwdev); + + if (rds_iwdev->mr_pool) + rds_iw_destroy_mr_pool(rds_iwdev->mr_pool); + + if (rds_iwdev->mr) + ib_dereg_mr(rds_iwdev->mr); + + while (ib_dealloc_pd(rds_iwdev->pd)) { + rdsdebug("Failed to dealloc pd %p\n", rds_iwdev->pd); + msleep(1); + } + + list_del(&rds_iwdev->list); + kfree(rds_iwdev); +} + +struct ib_client rds_iw_client = { + .name = "rds_iw", + .add = rds_iw_add_one, + .remove = rds_iw_remove_one +}; + +static int rds_iw_conn_info_visitor(struct rds_connection *conn, + void *buffer) +{ + struct rds_info_rdma_connection *iinfo = buffer; + struct rds_iw_connection *ic; + + /* We will only ever look at IB transports */ + if (conn->c_trans != &rds_iw_transport) + return 0; + + iinfo->src_addr = conn->c_laddr; + iinfo->dst_addr = conn->c_faddr; + + memset(&iinfo->src_gid, 0, sizeof(iinfo->src_gid)); + memset(&iinfo->dst_gid, 0, sizeof(iinfo->dst_gid)); + if (rds_conn_state(conn) == RDS_CONN_UP) { + struct rds_iw_device *rds_iwdev; + struct rdma_dev_addr *dev_addr; + + ic = conn->c_transport_data; + dev_addr = &ic->i_cm_id->route.addr.dev_addr; + + rdma_addr_get_sgid(dev_addr, (union ib_gid *) &iinfo->src_gid); + rdma_addr_get_dgid(dev_addr, (union ib_gid *) &iinfo->dst_gid); + + rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client); + iinfo->max_send_wr = ic->i_send_ring.w_nr; + iinfo->max_recv_wr = ic->i_recv_ring.w_nr; + iinfo->max_send_sge = rds_iwdev->max_sge; + rds_iw_get_mr_info(rds_iwdev, iinfo); + } + return 1; +} + +static void rds_iw_ic_info(struct socket *sock, unsigned int len, + struct rds_info_iterator *iter, + struct rds_info_lengths *lens) +{ + rds_for_each_conn_info(sock, len, iter, lens, + rds_iw_conn_info_visitor, + sizeof(struct rds_info_rdma_connection)); +} + + +/* + * Early RDS/IB was built to only bind to an address if there is an IPoIB + * device with that address set. + * + * If it were me, I'd advocate for something more flexible. Sending and + * receiving should be device-agnostic. Transports would try and maintain + * connections between peers who have messages queued. Userspace would be + * allowed to influence which paths have priority. We could call userspace + * asserting this policy "routing". + */ +static int rds_iw_laddr_check(__be32 addr) +{ + int ret; + struct rdma_cm_id *cm_id; + struct sockaddr_in sin; + + /* Create a CMA ID and try to bind it. This catches both + * IB and iWARP capable NICs. + */ + cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP, IB_QPT_RC); + if (IS_ERR(cm_id)) + return PTR_ERR(cm_id); + + memset(&sin, 0, sizeof(sin)); + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = addr; + + /* rdma_bind_addr will only succeed for IB & iWARP devices */ + ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin); + /* due to this, we will claim to support IB devices unless we + check node_type. */ + if (ret || !cm_id->device || + cm_id->device->node_type != RDMA_NODE_RNIC) + ret = -EADDRNOTAVAIL; + + rdsdebug("addr %pI4 ret %d node type %d\n", + &addr, ret, + cm_id->device ? cm_id->device->node_type : -1); + + rdma_destroy_id(cm_id); + + return ret; +} + +void rds_iw_exit(void) +{ + rds_info_deregister_func(RDS_INFO_IWARP_CONNECTIONS, rds_iw_ic_info); + rds_iw_destroy_nodev_conns(); + ib_unregister_client(&rds_iw_client); + rds_iw_sysctl_exit(); + rds_iw_recv_exit(); + rds_trans_unregister(&rds_iw_transport); +} + +struct rds_transport rds_iw_transport = { + .laddr_check = rds_iw_laddr_check, + .xmit_complete = rds_iw_xmit_complete, + .xmit = rds_iw_xmit, + .xmit_rdma = rds_iw_xmit_rdma, + .recv = rds_iw_recv, + .conn_alloc = rds_iw_conn_alloc, + .conn_free = rds_iw_conn_free, + .conn_connect = rds_iw_conn_connect, + .conn_shutdown = rds_iw_conn_shutdown, + .inc_copy_to_user = rds_iw_inc_copy_to_user, + .inc_free = rds_iw_inc_free, + .cm_initiate_connect = rds_iw_cm_initiate_connect, + .cm_handle_connect = rds_iw_cm_handle_connect, + .cm_connect_complete = rds_iw_cm_connect_complete, + .stats_info_copy = rds_iw_stats_info_copy, + .exit = rds_iw_exit, + .get_mr = rds_iw_get_mr, + .sync_mr = rds_iw_sync_mr, + .free_mr = rds_iw_free_mr, + .flush_mrs = rds_iw_flush_mrs, + .t_owner = THIS_MODULE, + .t_name = "iwarp", + .t_type = RDS_TRANS_IWARP, + .t_prefer_loopback = 1, +}; + +int rds_iw_init(void) +{ + int ret; + + INIT_LIST_HEAD(&rds_iw_devices); + + ret = ib_register_client(&rds_iw_client); + if (ret) + goto out; + + ret = rds_iw_sysctl_init(); + if (ret) + goto out_ibreg; + + ret = rds_iw_recv_init(); + if (ret) + goto out_sysctl; + + ret = rds_trans_register(&rds_iw_transport); + if (ret) + goto out_recv; + + rds_info_register_func(RDS_INFO_IWARP_CONNECTIONS, rds_iw_ic_info); + + goto out; + +out_recv: + rds_iw_recv_exit(); +out_sysctl: + rds_iw_sysctl_exit(); +out_ibreg: + ib_unregister_client(&rds_iw_client); +out: + return ret; +} + +MODULE_LICENSE("GPL"); + diff --git a/net/rds/iw.h b/net/rds/iw.h new file mode 100644 index 0000000..04ce3b1 --- /dev/null +++ b/net/rds/iw.h @@ -0,0 +1,396 @@ +#ifndef _RDS_IW_H +#define _RDS_IW_H + +#include +#include +#include +#include "rds.h" +#include "rdma_transport.h" + +#define RDS_FASTREG_SIZE 20 +#define RDS_FASTREG_POOL_SIZE 2048 + +#define RDS_IW_MAX_SGE 8 +#define RDS_IW_RECV_SGE 2 + +#define RDS_IW_DEFAULT_RECV_WR 1024 +#define RDS_IW_DEFAULT_SEND_WR 256 + +#define RDS_IW_SUPPORTED_PROTOCOLS 0x00000003 /* minor versions supported */ + +extern struct list_head rds_iw_devices; + +/* + * IB posts RDS_FRAG_SIZE fragments of pages to the receive queues to + * try and minimize the amount of memory tied up both the device and + * socket receive queues. + */ +/* page offset of the final full frag that fits in the page */ +#define RDS_PAGE_LAST_OFF (((PAGE_SIZE / RDS_FRAG_SIZE) - 1) * RDS_FRAG_SIZE) +struct rds_page_frag { + struct list_head f_item; + struct page *f_page; + unsigned long f_offset; + dma_addr_t f_mapped; +}; + +struct rds_iw_incoming { + struct list_head ii_frags; + struct rds_incoming ii_inc; +}; + +struct rds_iw_connect_private { + /* Add new fields at the end, and don't permute existing fields. */ + __be32 dp_saddr; + __be32 dp_daddr; + u8 dp_protocol_major; + u8 dp_protocol_minor; + __be16 dp_protocol_minor_mask; /* bitmask */ + __be32 dp_reserved1; + __be64 dp_ack_seq; + __be32 dp_credit; /* non-zero enables flow ctl */ +}; + +struct rds_iw_scatterlist { + struct scatterlist *list; + unsigned int len; + int dma_len; + unsigned int dma_npages; + unsigned int bytes; +}; + +struct rds_iw_mapping { + spinlock_t m_lock; /* protect the mapping struct */ + struct list_head m_list; + struct rds_iw_mr *m_mr; + uint32_t m_rkey; + struct rds_iw_scatterlist m_sg; +}; + +struct rds_iw_send_work { + struct rds_message *s_rm; + + /* We should really put these into a union: */ + struct rm_rdma_op *s_op; + struct rds_iw_mapping *s_mapping; + struct ib_mr *s_mr; + struct ib_fast_reg_page_list *s_page_list; + unsigned char s_remap_count; + + struct ib_send_wr s_wr; + struct ib_sge s_sge[RDS_IW_MAX_SGE]; + unsigned long s_queued; +}; + +struct rds_iw_recv_work { + struct rds_iw_incoming *r_iwinc; + struct rds_page_frag *r_frag; + struct ib_recv_wr r_wr; + struct ib_sge r_sge[2]; +}; + +struct rds_iw_work_ring { + u32 w_nr; + u32 w_alloc_ptr; + u32 w_alloc_ctr; + u32 w_free_ptr; + atomic_t w_free_ctr; +}; + +struct rds_iw_device; + +struct rds_iw_connection { + + struct list_head iw_node; + struct rds_iw_device *rds_iwdev; + struct rds_connection *conn; + + /* alphabet soup, IBTA style */ + struct rdma_cm_id *i_cm_id; + struct ib_pd *i_pd; + struct ib_mr *i_mr; + struct ib_cq *i_send_cq; + struct ib_cq *i_recv_cq; + + /* tx */ + struct rds_iw_work_ring i_send_ring; + struct rds_message *i_rm; + struct rds_header *i_send_hdrs; + u64 i_send_hdrs_dma; + struct rds_iw_send_work *i_sends; + + /* rx */ + struct tasklet_struct i_recv_tasklet; + struct mutex i_recv_mutex; + struct rds_iw_work_ring i_recv_ring; + struct rds_iw_incoming *i_iwinc; + u32 i_recv_data_rem; + struct rds_header *i_recv_hdrs; + u64 i_recv_hdrs_dma; + struct rds_iw_recv_work *i_recvs; + struct rds_page_frag i_frag; + u64 i_ack_recv; /* last ACK received */ + + /* sending acks */ + unsigned long i_ack_flags; +#ifdef KERNEL_HAS_ATOMIC64 + atomic64_t i_ack_next; /* next ACK to send */ +#else + spinlock_t i_ack_lock; /* protect i_ack_next */ + u64 i_ack_next; /* next ACK to send */ +#endif + struct rds_header *i_ack; + struct ib_send_wr i_ack_wr; + struct ib_sge i_ack_sge; + u64 i_ack_dma; + unsigned long i_ack_queued; + + /* Flow control related information + * + * Our algorithm uses a pair variables that we need to access + * atomically - one for the send credits, and one posted + * recv credits we need to transfer to remote. + * Rather than protect them using a slow spinlock, we put both into + * a single atomic_t and update it using cmpxchg + */ + atomic_t i_credits; + + /* Protocol version specific information */ + unsigned int i_flowctl:1; /* enable/disable flow ctl */ + unsigned int i_dma_local_lkey:1; + unsigned int i_fastreg_posted:1; /* fastreg posted on this connection */ + /* Batched completions */ + unsigned int i_unsignaled_wrs; + long i_unsignaled_bytes; +}; + +/* This assumes that atomic_t is at least 32 bits */ +#define IB_GET_SEND_CREDITS(v) ((v) & 0xffff) +#define IB_GET_POST_CREDITS(v) ((v) >> 16) +#define IB_SET_SEND_CREDITS(v) ((v) & 0xffff) +#define IB_SET_POST_CREDITS(v) ((v) << 16) + +struct rds_iw_cm_id { + struct list_head list; + struct rdma_cm_id *cm_id; +}; + +struct rds_iw_device { + struct list_head list; + struct list_head cm_id_list; + struct list_head conn_list; + struct ib_device *dev; + struct ib_pd *pd; + struct ib_mr *mr; + struct rds_iw_mr_pool *mr_pool; + int max_sge; + unsigned int max_wrs; + unsigned int dma_local_lkey:1; + spinlock_t spinlock; /* protect the above */ +}; + +/* bits for i_ack_flags */ +#define IB_ACK_IN_FLIGHT 0 +#define IB_ACK_REQUESTED 1 + +/* Magic WR_ID for ACKs */ +#define RDS_IW_ACK_WR_ID ((u64)0xffffffffffffffffULL) +#define RDS_IW_FAST_REG_WR_ID ((u64)0xefefefefefefefefULL) +#define RDS_IW_LOCAL_INV_WR_ID ((u64)0xdfdfdfdfdfdfdfdfULL) + +struct rds_iw_statistics { + uint64_t s_iw_connect_raced; + uint64_t s_iw_listen_closed_stale; + uint64_t s_iw_tx_cq_call; + uint64_t s_iw_tx_cq_event; + uint64_t s_iw_tx_ring_full; + uint64_t s_iw_tx_throttle; + uint64_t s_iw_tx_sg_mapping_failure; + uint64_t s_iw_tx_stalled; + uint64_t s_iw_tx_credit_updates; + uint64_t s_iw_rx_cq_call; + uint64_t s_iw_rx_cq_event; + uint64_t s_iw_rx_ring_empty; + uint64_t s_iw_rx_refill_from_cq; + uint64_t s_iw_rx_refill_from_thread; + uint64_t s_iw_rx_alloc_limit; + uint64_t s_iw_rx_credit_updates; + uint64_t s_iw_ack_sent; + uint64_t s_iw_ack_send_failure; + uint64_t s_iw_ack_send_delayed; + uint64_t s_iw_ack_send_piggybacked; + uint64_t s_iw_ack_received; + uint64_t s_iw_rdma_mr_alloc; + uint64_t s_iw_rdma_mr_free; + uint64_t s_iw_rdma_mr_used; + uint64_t s_iw_rdma_mr_pool_flush; + uint64_t s_iw_rdma_mr_pool_wait; + uint64_t s_iw_rdma_mr_pool_depleted; +}; + +extern struct workqueue_struct *rds_iw_wq; + +/* + * Fake ib_dma_sync_sg_for_{cpu,device} as long as ib_verbs.h + * doesn't define it. + */ +static inline void rds_iw_dma_sync_sg_for_cpu(struct ib_device *dev, + struct scatterlist *sg, unsigned int sg_dma_len, int direction) +{ + unsigned int i; + + for (i = 0; i < sg_dma_len; ++i) { + ib_dma_sync_single_for_cpu(dev, + ib_sg_dma_address(dev, &sg[i]), + ib_sg_dma_len(dev, &sg[i]), + direction); + } +} +#define ib_dma_sync_sg_for_cpu rds_iw_dma_sync_sg_for_cpu + +static inline void rds_iw_dma_sync_sg_for_device(struct ib_device *dev, + struct scatterlist *sg, unsigned int sg_dma_len, int direction) +{ + unsigned int i; + + for (i = 0; i < sg_dma_len; ++i) { + ib_dma_sync_single_for_device(dev, + ib_sg_dma_address(dev, &sg[i]), + ib_sg_dma_len(dev, &sg[i]), + direction); + } +} +#define ib_dma_sync_sg_for_device rds_iw_dma_sync_sg_for_device + +static inline u32 rds_iw_local_dma_lkey(struct rds_iw_connection *ic) +{ + return ic->i_dma_local_lkey ? ic->i_cm_id->device->local_dma_lkey : ic->i_mr->lkey; +} + +/* ib.c */ +extern struct rds_transport rds_iw_transport; +extern struct ib_client rds_iw_client; + +extern unsigned int fastreg_pool_size; +extern unsigned int fastreg_message_size; + +extern spinlock_t iw_nodev_conns_lock; +extern struct list_head iw_nodev_conns; + +/* ib_cm.c */ +int rds_iw_conn_alloc(struct rds_connection *conn, gfp_t gfp); +void rds_iw_conn_free(void *arg); +int rds_iw_conn_connect(struct rds_connection *conn); +void rds_iw_conn_shutdown(struct rds_connection *conn); +void rds_iw_state_change(struct sock *sk); +int rds_iw_listen_init(void); +void rds_iw_listen_stop(void); +void __rds_iw_conn_error(struct rds_connection *conn, const char *, ...); +int rds_iw_cm_handle_connect(struct rdma_cm_id *cm_id, + struct rdma_cm_event *event); +int rds_iw_cm_initiate_connect(struct rdma_cm_id *cm_id); +void rds_iw_cm_connect_complete(struct rds_connection *conn, + struct rdma_cm_event *event); + + +#define rds_iw_conn_error(conn, fmt...) \ + __rds_iw_conn_error(conn, KERN_WARNING "RDS/IW: " fmt) + +/* ib_rdma.c */ +int rds_iw_update_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id); +void rds_iw_add_conn(struct rds_iw_device *rds_iwdev, struct rds_connection *conn); +void rds_iw_remove_conn(struct rds_iw_device *rds_iwdev, struct rds_connection *conn); +void __rds_iw_destroy_conns(struct list_head *list, spinlock_t *list_lock); +static inline void rds_iw_destroy_nodev_conns(void) +{ + __rds_iw_destroy_conns(&iw_nodev_conns, &iw_nodev_conns_lock); +} +static inline void rds_iw_destroy_conns(struct rds_iw_device *rds_iwdev) +{ + __rds_iw_destroy_conns(&rds_iwdev->conn_list, &rds_iwdev->spinlock); +} +struct rds_iw_mr_pool *rds_iw_create_mr_pool(struct rds_iw_device *); +void rds_iw_get_mr_info(struct rds_iw_device *rds_iwdev, struct rds_info_rdma_connection *iinfo); +void rds_iw_destroy_mr_pool(struct rds_iw_mr_pool *); +void *rds_iw_get_mr(struct scatterlist *sg, unsigned long nents, + struct rds_sock *rs, u32 *key_ret); +void rds_iw_sync_mr(void *trans_private, int dir); +void rds_iw_free_mr(void *trans_private, int invalidate); +void rds_iw_flush_mrs(void); + +/* ib_recv.c */ +int rds_iw_recv_init(void); +void rds_iw_recv_exit(void); +int rds_iw_recv(struct rds_connection *conn); +int rds_iw_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp, + gfp_t page_gfp, int prefill); +void rds_iw_inc_free(struct rds_incoming *inc); +int rds_iw_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov, + size_t size); +void rds_iw_recv_cq_comp_handler(struct ib_cq *cq, void *context); +void rds_iw_recv_tasklet_fn(unsigned long data); +void rds_iw_recv_init_ring(struct rds_iw_connection *ic); +void rds_iw_recv_clear_ring(struct rds_iw_connection *ic); +void rds_iw_recv_init_ack(struct rds_iw_connection *ic); +void rds_iw_attempt_ack(struct rds_iw_connection *ic); +void rds_iw_ack_send_complete(struct rds_iw_connection *ic); +u64 rds_iw_piggyb_ack(struct rds_iw_connection *ic); + +/* ib_ring.c */ +void rds_iw_ring_init(struct rds_iw_work_ring *ring, u32 nr); +void rds_iw_ring_resize(struct rds_iw_work_ring *ring, u32 nr); +u32 rds_iw_ring_alloc(struct rds_iw_work_ring *ring, u32 val, u32 *pos); +void rds_iw_ring_free(struct rds_iw_work_ring *ring, u32 val); +void rds_iw_ring_unalloc(struct rds_iw_work_ring *ring, u32 val); +int rds_iw_ring_empty(struct rds_iw_work_ring *ring); +int rds_iw_ring_low(struct rds_iw_work_ring *ring); +u32 rds_iw_ring_oldest(struct rds_iw_work_ring *ring); +u32 rds_iw_ring_completed(struct rds_iw_work_ring *ring, u32 wr_id, u32 oldest); +extern wait_queue_head_t rds_iw_ring_empty_wait; + +/* ib_send.c */ +void rds_iw_xmit_complete(struct rds_connection *conn); +int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm, + unsigned int hdr_off, unsigned int sg, unsigned int off); +void rds_iw_send_cq_comp_handler(struct ib_cq *cq, void *context); +void rds_iw_send_init_ring(struct rds_iw_connection *ic); +void rds_iw_send_clear_ring(struct rds_iw_connection *ic); +int rds_iw_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op); +void rds_iw_send_add_credits(struct rds_connection *conn, unsigned int credits); +void rds_iw_advertise_credits(struct rds_connection *conn, unsigned int posted); +int rds_iw_send_grab_credits(struct rds_iw_connection *ic, u32 wanted, + u32 *adv_credits, int need_posted, int max_posted); + +/* ib_stats.c */ +DECLARE_PER_CPU(struct rds_iw_statistics, rds_iw_stats); +#define rds_iw_stats_inc(member) rds_stats_inc_which(rds_iw_stats, member) +unsigned int rds_iw_stats_info_copy(struct rds_info_iterator *iter, + unsigned int avail); + +/* ib_sysctl.c */ +int rds_iw_sysctl_init(void); +void rds_iw_sysctl_exit(void); +extern unsigned long rds_iw_sysctl_max_send_wr; +extern unsigned long rds_iw_sysctl_max_recv_wr; +extern unsigned long rds_iw_sysctl_max_unsig_wrs; +extern unsigned long rds_iw_sysctl_max_unsig_bytes; +extern unsigned long rds_iw_sysctl_max_recv_allocation; +extern unsigned int rds_iw_sysctl_flow_control; + +/* + * Helper functions for getting/setting the header and data SGEs in + * RDS packets (not RDMA) + */ +static inline struct ib_sge * +rds_iw_header_sge(struct rds_iw_connection *ic, struct ib_sge *sge) +{ + return &sge[0]; +} + +static inline struct ib_sge * +rds_iw_data_sge(struct rds_iw_connection *ic, struct ib_sge *sge) +{ + return &sge[1]; +} + +#endif diff --git a/net/rds/iw_cm.c b/net/rds/iw_cm.c new file mode 100644 index 0000000..a91e1db --- /dev/null +++ b/net/rds/iw_cm.c @@ -0,0 +1,765 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include +#include +#include +#include +#include + +#include "rds.h" +#include "iw.h" + +/* + * Set the selected protocol version + */ +static void rds_iw_set_protocol(struct rds_connection *conn, unsigned int version) +{ + conn->c_version = version; +} + +/* + * Set up flow control + */ +static void rds_iw_set_flow_control(struct rds_connection *conn, u32 credits) +{ + struct rds_iw_connection *ic = conn->c_transport_data; + + if (rds_iw_sysctl_flow_control && credits != 0) { + /* We're doing flow control */ + ic->i_flowctl = 1; + rds_iw_send_add_credits(conn, credits); + } else { + ic->i_flowctl = 0; + } +} + +/* + * Connection established. + * We get here for both outgoing and incoming connection. + */ +void rds_iw_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_event *event) +{ + const struct rds_iw_connect_private *dp = NULL; + struct rds_iw_connection *ic = conn->c_transport_data; + struct rds_iw_device *rds_iwdev; + int err; + + if (event->param.conn.private_data_len) { + dp = event->param.conn.private_data; + + rds_iw_set_protocol(conn, + RDS_PROTOCOL(dp->dp_protocol_major, + dp->dp_protocol_minor)); + rds_iw_set_flow_control(conn, be32_to_cpu(dp->dp_credit)); + } + + /* update ib_device with this local ipaddr & conn */ + rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client); + err = rds_iw_update_cm_id(rds_iwdev, ic->i_cm_id); + if (err) + printk(KERN_ERR "rds_iw_update_ipaddr failed (%d)\n", err); + rds_iw_add_conn(rds_iwdev, conn); + + /* If the peer gave us the last packet it saw, process this as if + * we had received a regular ACK. */ + if (dp && dp->dp_ack_seq) + rds_send_drop_acked(conn, be64_to_cpu(dp->dp_ack_seq), NULL); + + printk(KERN_NOTICE "RDS/IW: connected to %pI4<->%pI4 version %u.%u%s\n", + &conn->c_laddr, &conn->c_faddr, + RDS_PROTOCOL_MAJOR(conn->c_version), + RDS_PROTOCOL_MINOR(conn->c_version), + ic->i_flowctl ? ", flow control" : ""); + + rds_connect_complete(conn); +} + +static void rds_iw_cm_fill_conn_param(struct rds_connection *conn, + struct rdma_conn_param *conn_param, + struct rds_iw_connect_private *dp, + u32 protocol_version) +{ + struct rds_iw_connection *ic = conn->c_transport_data; + + memset(conn_param, 0, sizeof(struct rdma_conn_param)); + /* XXX tune these? */ + conn_param->responder_resources = 1; + conn_param->initiator_depth = 1; + + if (dp) { + memset(dp, 0, sizeof(*dp)); + dp->dp_saddr = conn->c_laddr; + dp->dp_daddr = conn->c_faddr; + dp->dp_protocol_major = RDS_PROTOCOL_MAJOR(protocol_version); + dp->dp_protocol_minor = RDS_PROTOCOL_MINOR(protocol_version); + dp->dp_protocol_minor_mask = cpu_to_be16(RDS_IW_SUPPORTED_PROTOCOLS); + dp->dp_ack_seq = rds_iw_piggyb_ack(ic); + + /* Advertise flow control */ + if (ic->i_flowctl) { + unsigned int credits; + + credits = IB_GET_POST_CREDITS(atomic_read(&ic->i_credits)); + dp->dp_credit = cpu_to_be32(credits); + atomic_sub(IB_SET_POST_CREDITS(credits), &ic->i_credits); + } + + conn_param->private_data = dp; + conn_param->private_data_len = sizeof(*dp); + } +} + +static void rds_iw_cq_event_handler(struct ib_event *event, void *data) +{ + rdsdebug("event %u data %p\n", event->event, data); +} + +static void rds_iw_qp_event_handler(struct ib_event *event, void *data) +{ + struct rds_connection *conn = data; + struct rds_iw_connection *ic = conn->c_transport_data; + + rdsdebug("conn %p ic %p event %u\n", conn, ic, event->event); + + switch (event->event) { + case IB_EVENT_COMM_EST: + rdma_notify(ic->i_cm_id, IB_EVENT_COMM_EST); + break; + case IB_EVENT_QP_REQ_ERR: + case IB_EVENT_QP_FATAL: + default: + rdsdebug("Fatal QP Event %u " + "- connection %pI4->%pI4, reconnecting\n", + event->event, &conn->c_laddr, + &conn->c_faddr); + rds_conn_drop(conn); + break; + } +} + +/* + * Create a QP + */ +static int rds_iw_init_qp_attrs(struct ib_qp_init_attr *attr, + struct rds_iw_device *rds_iwdev, + struct rds_iw_work_ring *send_ring, + void (*send_cq_handler)(struct ib_cq *, void *), + struct rds_iw_work_ring *recv_ring, + void (*recv_cq_handler)(struct ib_cq *, void *), + void *context) +{ + struct ib_device *dev = rds_iwdev->dev; + unsigned int send_size, recv_size; + int ret; + + /* The offset of 1 is to accommodate the additional ACK WR. */ + send_size = min_t(unsigned int, rds_iwdev->max_wrs, rds_iw_sysctl_max_send_wr + 1); + recv_size = min_t(unsigned int, rds_iwdev->max_wrs, rds_iw_sysctl_max_recv_wr + 1); + rds_iw_ring_resize(send_ring, send_size - 1); + rds_iw_ring_resize(recv_ring, recv_size - 1); + + memset(attr, 0, sizeof(*attr)); + attr->event_handler = rds_iw_qp_event_handler; + attr->qp_context = context; + attr->cap.max_send_wr = send_size; + attr->cap.max_recv_wr = recv_size; + attr->cap.max_send_sge = rds_iwdev->max_sge; + attr->cap.max_recv_sge = RDS_IW_RECV_SGE; + attr->sq_sig_type = IB_SIGNAL_REQ_WR; + attr->qp_type = IB_QPT_RC; + + attr->send_cq = ib_create_cq(dev, send_cq_handler, + rds_iw_cq_event_handler, + context, send_size, 0); + if (IS_ERR(attr->send_cq)) { + ret = PTR_ERR(attr->send_cq); + attr->send_cq = NULL; + rdsdebug("ib_create_cq send failed: %d\n", ret); + goto out; + } + + attr->recv_cq = ib_create_cq(dev, recv_cq_handler, + rds_iw_cq_event_handler, + context, recv_size, 0); + if (IS_ERR(attr->recv_cq)) { + ret = PTR_ERR(attr->recv_cq); + attr->recv_cq = NULL; + rdsdebug("ib_create_cq send failed: %d\n", ret); + goto out; + } + + ret = ib_req_notify_cq(attr->send_cq, IB_CQ_NEXT_COMP); + if (ret) { + rdsdebug("ib_req_notify_cq send failed: %d\n", ret); + goto out; + } + + ret = ib_req_notify_cq(attr->recv_cq, IB_CQ_SOLICITED); + if (ret) { + rdsdebug("ib_req_notify_cq recv failed: %d\n", ret); + goto out; + } + +out: + if (ret) { + if (attr->send_cq) + ib_destroy_cq(attr->send_cq); + if (attr->recv_cq) + ib_destroy_cq(attr->recv_cq); + } + return ret; +} + +/* + * This needs to be very careful to not leave IS_ERR pointers around for + * cleanup to trip over. + */ +static int rds_iw_setup_qp(struct rds_connection *conn) +{ + struct rds_iw_connection *ic = conn->c_transport_data; + struct ib_device *dev = ic->i_cm_id->device; + struct ib_qp_init_attr attr; + struct rds_iw_device *rds_iwdev; + int ret; + + /* rds_iw_add_one creates a rds_iw_device object per IB device, + * and allocates a protection domain, memory range and MR pool + * for each. If that fails for any reason, it will not register + * the rds_iwdev at all. + */ + rds_iwdev = ib_get_client_data(dev, &rds_iw_client); + if (!rds_iwdev) { + printk_ratelimited(KERN_NOTICE "RDS/IW: No client_data for device %s\n", + dev->name); + return -EOPNOTSUPP; + } + + /* Protection domain and memory range */ + ic->i_pd = rds_iwdev->pd; + ic->i_mr = rds_iwdev->mr; + + ret = rds_iw_init_qp_attrs(&attr, rds_iwdev, + &ic->i_send_ring, rds_iw_send_cq_comp_handler, + &ic->i_recv_ring, rds_iw_recv_cq_comp_handler, + conn); + if (ret < 0) + goto out; + + ic->i_send_cq = attr.send_cq; + ic->i_recv_cq = attr.recv_cq; + + /* + * XXX this can fail if max_*_wr is too large? Are we supposed + * to back off until we get a value that the hardware can support? + */ + ret = rdma_create_qp(ic->i_cm_id, ic->i_pd, &attr); + if (ret) { + rdsdebug("rdma_create_qp failed: %d\n", ret); + goto out; + } + + ic->i_send_hdrs = ib_dma_alloc_coherent(dev, + ic->i_send_ring.w_nr * + sizeof(struct rds_header), + &ic->i_send_hdrs_dma, GFP_KERNEL); + if (!ic->i_send_hdrs) { + ret = -ENOMEM; + rdsdebug("ib_dma_alloc_coherent send failed\n"); + goto out; + } + + ic->i_recv_hdrs = ib_dma_alloc_coherent(dev, + ic->i_recv_ring.w_nr * + sizeof(struct rds_header), + &ic->i_recv_hdrs_dma, GFP_KERNEL); + if (!ic->i_recv_hdrs) { + ret = -ENOMEM; + rdsdebug("ib_dma_alloc_coherent recv failed\n"); + goto out; + } + + ic->i_ack = ib_dma_alloc_coherent(dev, sizeof(struct rds_header), + &ic->i_ack_dma, GFP_KERNEL); + if (!ic->i_ack) { + ret = -ENOMEM; + rdsdebug("ib_dma_alloc_coherent ack failed\n"); + goto out; + } + + ic->i_sends = vmalloc(ic->i_send_ring.w_nr * sizeof(struct rds_iw_send_work)); + if (!ic->i_sends) { + ret = -ENOMEM; + rdsdebug("send allocation failed\n"); + goto out; + } + rds_iw_send_init_ring(ic); + + ic->i_recvs = vmalloc(ic->i_recv_ring.w_nr * sizeof(struct rds_iw_recv_work)); + if (!ic->i_recvs) { + ret = -ENOMEM; + rdsdebug("recv allocation failed\n"); + goto out; + } + + rds_iw_recv_init_ring(ic); + rds_iw_recv_init_ack(ic); + + /* Post receive buffers - as a side effect, this will update + * the posted credit count. */ + rds_iw_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 1); + + rdsdebug("conn %p pd %p mr %p cq %p %p\n", conn, ic->i_pd, ic->i_mr, + ic->i_send_cq, ic->i_recv_cq); + +out: + return ret; +} + +static u32 rds_iw_protocol_compatible(const struct rds_iw_connect_private *dp) +{ + u16 common; + u32 version = 0; + + /* rdma_cm private data is odd - when there is any private data in the + * request, we will be given a pretty large buffer without telling us the + * original size. The only way to tell the difference is by looking at + * the contents, which are initialized to zero. + * If the protocol version fields aren't set, this is a connection attempt + * from an older version. This could could be 3.0 or 2.0 - we can't tell. + * We really should have changed this for OFED 1.3 :-( */ + if (dp->dp_protocol_major == 0) + return RDS_PROTOCOL_3_0; + + common = be16_to_cpu(dp->dp_protocol_minor_mask) & RDS_IW_SUPPORTED_PROTOCOLS; + if (dp->dp_protocol_major == 3 && common) { + version = RDS_PROTOCOL_3_0; + while ((common >>= 1) != 0) + version++; + } + printk_ratelimited(KERN_NOTICE "RDS: Connection from %pI4 using " + "incompatible protocol version %u.%u\n", + &dp->dp_saddr, + dp->dp_protocol_major, + dp->dp_protocol_minor); + return version; +} + +int rds_iw_cm_handle_connect(struct rdma_cm_id *cm_id, + struct rdma_cm_event *event) +{ + const struct rds_iw_connect_private *dp = event->param.conn.private_data; + struct rds_iw_connect_private dp_rep; + struct rds_connection *conn = NULL; + struct rds_iw_connection *ic = NULL; + struct rdma_conn_param conn_param; + struct rds_iw_device *rds_iwdev; + u32 version; + int err, destroy = 1; + + /* Check whether the remote protocol version matches ours. */ + version = rds_iw_protocol_compatible(dp); + if (!version) + goto out; + + rdsdebug("saddr %pI4 daddr %pI4 RDSv%u.%u\n", + &dp->dp_saddr, &dp->dp_daddr, + RDS_PROTOCOL_MAJOR(version), RDS_PROTOCOL_MINOR(version)); + + conn = rds_conn_create(dp->dp_daddr, dp->dp_saddr, &rds_iw_transport, + GFP_KERNEL); + if (IS_ERR(conn)) { + rdsdebug("rds_conn_create failed (%ld)\n", PTR_ERR(conn)); + conn = NULL; + goto out; + } + + /* + * The connection request may occur while the + * previous connection exist, e.g. in case of failover. + * But as connections may be initiated simultaneously + * by both hosts, we have a random backoff mechanism - + * see the comment above rds_queue_reconnect() + */ + mutex_lock(&conn->c_cm_lock); + if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING)) { + if (rds_conn_state(conn) == RDS_CONN_UP) { + rdsdebug("incoming connect while connecting\n"); + rds_conn_drop(conn); + rds_iw_stats_inc(s_iw_listen_closed_stale); + } else + if (rds_conn_state(conn) == RDS_CONN_CONNECTING) { + /* Wait and see - our connect may still be succeeding */ + rds_iw_stats_inc(s_iw_connect_raced); + } + mutex_unlock(&conn->c_cm_lock); + goto out; + } + + ic = conn->c_transport_data; + + rds_iw_set_protocol(conn, version); + rds_iw_set_flow_control(conn, be32_to_cpu(dp->dp_credit)); + + /* If the peer gave us the last packet it saw, process this as if + * we had received a regular ACK. */ + if (dp->dp_ack_seq) + rds_send_drop_acked(conn, be64_to_cpu(dp->dp_ack_seq), NULL); + + BUG_ON(cm_id->context); + BUG_ON(ic->i_cm_id); + + ic->i_cm_id = cm_id; + cm_id->context = conn; + + rds_iwdev = ib_get_client_data(cm_id->device, &rds_iw_client); + ic->i_dma_local_lkey = rds_iwdev->dma_local_lkey; + + /* We got halfway through setting up the ib_connection, if we + * fail now, we have to take the long route out of this mess. */ + destroy = 0; + + err = rds_iw_setup_qp(conn); + if (err) { + rds_iw_conn_error(conn, "rds_iw_setup_qp failed (%d)\n", err); + mutex_unlock(&conn->c_cm_lock); + goto out; + } + + rds_iw_cm_fill_conn_param(conn, &conn_param, &dp_rep, version); + + /* rdma_accept() calls rdma_reject() internally if it fails */ + err = rdma_accept(cm_id, &conn_param); + mutex_unlock(&conn->c_cm_lock); + if (err) { + rds_iw_conn_error(conn, "rdma_accept failed (%d)\n", err); + goto out; + } + + return 0; + +out: + rdma_reject(cm_id, NULL, 0); + return destroy; +} + + +int rds_iw_cm_initiate_connect(struct rdma_cm_id *cm_id) +{ + struct rds_connection *conn = cm_id->context; + struct rds_iw_connection *ic = conn->c_transport_data; + struct rdma_conn_param conn_param; + struct rds_iw_connect_private dp; + int ret; + + /* If the peer doesn't do protocol negotiation, we must + * default to RDSv3.0 */ + rds_iw_set_protocol(conn, RDS_PROTOCOL_3_0); + ic->i_flowctl = rds_iw_sysctl_flow_control; /* advertise flow control */ + + ret = rds_iw_setup_qp(conn); + if (ret) { + rds_iw_conn_error(conn, "rds_iw_setup_qp failed (%d)\n", ret); + goto out; + } + + rds_iw_cm_fill_conn_param(conn, &conn_param, &dp, RDS_PROTOCOL_VERSION); + + ret = rdma_connect(cm_id, &conn_param); + if (ret) + rds_iw_conn_error(conn, "rdma_connect failed (%d)\n", ret); + +out: + /* Beware - returning non-zero tells the rdma_cm to destroy + * the cm_id. We should certainly not do it as long as we still + * "own" the cm_id. */ + if (ret) { + struct rds_iw_connection *ic = conn->c_transport_data; + + if (ic->i_cm_id == cm_id) + ret = 0; + } + return ret; +} + +int rds_iw_conn_connect(struct rds_connection *conn) +{ + struct rds_iw_connection *ic = conn->c_transport_data; + struct rds_iw_device *rds_iwdev; + struct sockaddr_in src, dest; + int ret; + + /* XXX I wonder what affect the port space has */ + /* delegate cm event handler to rdma_transport */ + ic->i_cm_id = rdma_create_id(rds_rdma_cm_event_handler, conn, + RDMA_PS_TCP, IB_QPT_RC); + if (IS_ERR(ic->i_cm_id)) { + ret = PTR_ERR(ic->i_cm_id); + ic->i_cm_id = NULL; + rdsdebug("rdma_create_id() failed: %d\n", ret); + goto out; + } + + rdsdebug("created cm id %p for conn %p\n", ic->i_cm_id, conn); + + src.sin_family = AF_INET; + src.sin_addr.s_addr = (__force u32)conn->c_laddr; + src.sin_port = (__force u16)htons(0); + + /* First, bind to the local address and device. */ + ret = rdma_bind_addr(ic->i_cm_id, (struct sockaddr *) &src); + if (ret) { + rdsdebug("rdma_bind_addr(%pI4) failed: %d\n", + &conn->c_laddr, ret); + rdma_destroy_id(ic->i_cm_id); + ic->i_cm_id = NULL; + goto out; + } + + rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client); + ic->i_dma_local_lkey = rds_iwdev->dma_local_lkey; + + dest.sin_family = AF_INET; + dest.sin_addr.s_addr = (__force u32)conn->c_faddr; + dest.sin_port = (__force u16)htons(RDS_PORT); + + ret = rdma_resolve_addr(ic->i_cm_id, (struct sockaddr *)&src, + (struct sockaddr *)&dest, + RDS_RDMA_RESOLVE_TIMEOUT_MS); + if (ret) { + rdsdebug("addr resolve failed for cm id %p: %d\n", ic->i_cm_id, + ret); + rdma_destroy_id(ic->i_cm_id); + ic->i_cm_id = NULL; + } + +out: + return ret; +} + +/* + * This is so careful about only cleaning up resources that were built up + * so that it can be called at any point during startup. In fact it + * can be called multiple times for a given connection. + */ +void rds_iw_conn_shutdown(struct rds_connection *conn) +{ + struct rds_iw_connection *ic = conn->c_transport_data; + int err = 0; + struct ib_qp_attr qp_attr; + + rdsdebug("cm %p pd %p cq %p %p qp %p\n", ic->i_cm_id, + ic->i_pd, ic->i_send_cq, ic->i_recv_cq, + ic->i_cm_id ? ic->i_cm_id->qp : NULL); + + if (ic->i_cm_id) { + struct ib_device *dev = ic->i_cm_id->device; + + rdsdebug("disconnecting cm %p\n", ic->i_cm_id); + err = rdma_disconnect(ic->i_cm_id); + if (err) { + /* Actually this may happen quite frequently, when + * an outgoing connect raced with an incoming connect. + */ + rdsdebug("rds_iw_conn_shutdown: failed to disconnect," + " cm: %p err %d\n", ic->i_cm_id, err); + } + + if (ic->i_cm_id->qp) { + qp_attr.qp_state = IB_QPS_ERR; + ib_modify_qp(ic->i_cm_id->qp, &qp_attr, IB_QP_STATE); + } + + wait_event(rds_iw_ring_empty_wait, + rds_iw_ring_empty(&ic->i_send_ring) && + rds_iw_ring_empty(&ic->i_recv_ring)); + + if (ic->i_send_hdrs) + ib_dma_free_coherent(dev, + ic->i_send_ring.w_nr * + sizeof(struct rds_header), + ic->i_send_hdrs, + ic->i_send_hdrs_dma); + + if (ic->i_recv_hdrs) + ib_dma_free_coherent(dev, + ic->i_recv_ring.w_nr * + sizeof(struct rds_header), + ic->i_recv_hdrs, + ic->i_recv_hdrs_dma); + + if (ic->i_ack) + ib_dma_free_coherent(dev, sizeof(struct rds_header), + ic->i_ack, ic->i_ack_dma); + + if (ic->i_sends) + rds_iw_send_clear_ring(ic); + if (ic->i_recvs) + rds_iw_recv_clear_ring(ic); + + if (ic->i_cm_id->qp) + rdma_destroy_qp(ic->i_cm_id); + if (ic->i_send_cq) + ib_destroy_cq(ic->i_send_cq); + if (ic->i_recv_cq) + ib_destroy_cq(ic->i_recv_cq); + + /* + * If associated with an rds_iw_device: + * Move connection back to the nodev list. + * Remove cm_id from the device cm_id list. + */ + if (ic->rds_iwdev) + rds_iw_remove_conn(ic->rds_iwdev, conn); + + rdma_destroy_id(ic->i_cm_id); + + ic->i_cm_id = NULL; + ic->i_pd = NULL; + ic->i_mr = NULL; + ic->i_send_cq = NULL; + ic->i_recv_cq = NULL; + ic->i_send_hdrs = NULL; + ic->i_recv_hdrs = NULL; + ic->i_ack = NULL; + } + BUG_ON(ic->rds_iwdev); + + /* Clear pending transmit */ + if (ic->i_rm) { + rds_message_put(ic->i_rm); + ic->i_rm = NULL; + } + + /* Clear the ACK state */ + clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags); +#ifdef KERNEL_HAS_ATOMIC64 + atomic64_set(&ic->i_ack_next, 0); +#else + ic->i_ack_next = 0; +#endif + ic->i_ack_recv = 0; + + /* Clear flow control state */ + ic->i_flowctl = 0; + atomic_set(&ic->i_credits, 0); + + rds_iw_ring_init(&ic->i_send_ring, rds_iw_sysctl_max_send_wr); + rds_iw_ring_init(&ic->i_recv_ring, rds_iw_sysctl_max_recv_wr); + + if (ic->i_iwinc) { + rds_inc_put(&ic->i_iwinc->ii_inc); + ic->i_iwinc = NULL; + } + + vfree(ic->i_sends); + ic->i_sends = NULL; + vfree(ic->i_recvs); + ic->i_recvs = NULL; + rdsdebug("shutdown complete\n"); +} + +int rds_iw_conn_alloc(struct rds_connection *conn, gfp_t gfp) +{ + struct rds_iw_connection *ic; + unsigned long flags; + + /* XXX too lazy? */ + ic = kzalloc(sizeof(struct rds_iw_connection), gfp); + if (!ic) + return -ENOMEM; + + INIT_LIST_HEAD(&ic->iw_node); + tasklet_init(&ic->i_recv_tasklet, rds_iw_recv_tasklet_fn, + (unsigned long) ic); + mutex_init(&ic->i_recv_mutex); +#ifndef KERNEL_HAS_ATOMIC64 + spin_lock_init(&ic->i_ack_lock); +#endif + + /* + * rds_iw_conn_shutdown() waits for these to be emptied so they + * must be initialized before it can be called. + */ + rds_iw_ring_init(&ic->i_send_ring, rds_iw_sysctl_max_send_wr); + rds_iw_ring_init(&ic->i_recv_ring, rds_iw_sysctl_max_recv_wr); + + ic->conn = conn; + conn->c_transport_data = ic; + + spin_lock_irqsave(&iw_nodev_conns_lock, flags); + list_add_tail(&ic->iw_node, &iw_nodev_conns); + spin_unlock_irqrestore(&iw_nodev_conns_lock, flags); + + + rdsdebug("conn %p conn ic %p\n", conn, conn->c_transport_data); + return 0; +} + +/* + * Free a connection. Connection must be shut down and not set for reconnect. + */ +void rds_iw_conn_free(void *arg) +{ + struct rds_iw_connection *ic = arg; + spinlock_t *lock_ptr; + + rdsdebug("ic %p\n", ic); + + /* + * Conn is either on a dev's list or on the nodev list. + * A race with shutdown() or connect() would cause problems + * (since rds_iwdev would change) but that should never happen. + */ + lock_ptr = ic->rds_iwdev ? &ic->rds_iwdev->spinlock : &iw_nodev_conns_lock; + + spin_lock_irq(lock_ptr); + list_del(&ic->iw_node); + spin_unlock_irq(lock_ptr); + + kfree(ic); +} + +/* + * An error occurred on the connection + */ +void +__rds_iw_conn_error(struct rds_connection *conn, const char *fmt, ...) +{ + va_list ap; + + rds_conn_drop(conn); + + va_start(ap, fmt); + vprintk(fmt, ap); + va_end(ap); +} diff --git a/net/rds/iw_rdma.c b/net/rds/iw_rdma.c new file mode 100644 index 0000000..a817705 --- /dev/null +++ b/net/rds/iw_rdma.c @@ -0,0 +1,871 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include +#include +#include + +#include "rds.h" +#include "iw.h" + + +/* + * This is stored as mr->r_trans_private. + */ +struct rds_iw_mr { + struct rds_iw_device *device; + struct rds_iw_mr_pool *pool; + struct rdma_cm_id *cm_id; + + struct ib_mr *mr; + struct ib_fast_reg_page_list *page_list; + + struct rds_iw_mapping mapping; + unsigned char remap_count; +}; + +/* + * Our own little MR pool + */ +struct rds_iw_mr_pool { + struct rds_iw_device *device; /* back ptr to the device that owns us */ + + struct mutex flush_lock; /* serialize fmr invalidate */ + struct work_struct flush_worker; /* flush worker */ + + spinlock_t list_lock; /* protect variables below */ + atomic_t item_count; /* total # of MRs */ + atomic_t dirty_count; /* # dirty of MRs */ + struct list_head dirty_list; /* dirty mappings */ + struct list_head clean_list; /* unused & unamapped MRs */ + atomic_t free_pinned; /* memory pinned by free MRs */ + unsigned long max_message_size; /* in pages */ + unsigned long max_items; + unsigned long max_items_soft; + unsigned long max_free_pinned; + int max_pages; +}; + +static int rds_iw_flush_mr_pool(struct rds_iw_mr_pool *pool, int free_all); +static void rds_iw_mr_pool_flush_worker(struct work_struct *work); +static int rds_iw_init_fastreg(struct rds_iw_mr_pool *pool, struct rds_iw_mr *ibmr); +static int rds_iw_map_fastreg(struct rds_iw_mr_pool *pool, + struct rds_iw_mr *ibmr, + struct scatterlist *sg, unsigned int nents); +static void rds_iw_free_fastreg(struct rds_iw_mr_pool *pool, struct rds_iw_mr *ibmr); +static unsigned int rds_iw_unmap_fastreg_list(struct rds_iw_mr_pool *pool, + struct list_head *unmap_list, + struct list_head *kill_list, + int *unpinned); +static void rds_iw_destroy_fastreg(struct rds_iw_mr_pool *pool, struct rds_iw_mr *ibmr); + +static int rds_iw_get_device(struct rds_sock *rs, struct rds_iw_device **rds_iwdev, struct rdma_cm_id **cm_id) +{ + struct rds_iw_device *iwdev; + struct rds_iw_cm_id *i_cm_id; + + *rds_iwdev = NULL; + *cm_id = NULL; + + list_for_each_entry(iwdev, &rds_iw_devices, list) { + spin_lock_irq(&iwdev->spinlock); + list_for_each_entry(i_cm_id, &iwdev->cm_id_list, list) { + struct sockaddr_in *src_addr, *dst_addr; + + src_addr = (struct sockaddr_in *)&i_cm_id->cm_id->route.addr.src_addr; + dst_addr = (struct sockaddr_in *)&i_cm_id->cm_id->route.addr.dst_addr; + + rdsdebug("local ipaddr = %x port %d, " + "remote ipaddr = %x port %d" + "..looking for %x port %d, " + "remote ipaddr = %x port %d\n", + src_addr->sin_addr.s_addr, + src_addr->sin_port, + dst_addr->sin_addr.s_addr, + dst_addr->sin_port, + rs->rs_bound_addr, + rs->rs_bound_port, + rs->rs_conn_addr, + rs->rs_conn_port); +#ifdef WORKING_TUPLE_DETECTION + if (src_addr->sin_addr.s_addr == rs->rs_bound_addr && + src_addr->sin_port == rs->rs_bound_port && + dst_addr->sin_addr.s_addr == rs->rs_conn_addr && + dst_addr->sin_port == rs->rs_conn_port) { +#else + /* FIXME - needs to compare the local and remote + * ipaddr/port tuple, but the ipaddr is the only + * available information in the rds_sock (as the rest are + * zero'ed. It doesn't appear to be properly populated + * during connection setup... + */ + if (src_addr->sin_addr.s_addr == rs->rs_bound_addr) { +#endif + spin_unlock_irq(&iwdev->spinlock); + *rds_iwdev = iwdev; + *cm_id = i_cm_id->cm_id; + return 0; + } + } + spin_unlock_irq(&iwdev->spinlock); + } + + return 1; +} + +static int rds_iw_add_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id) +{ + struct rds_iw_cm_id *i_cm_id; + + i_cm_id = kmalloc(sizeof *i_cm_id, GFP_KERNEL); + if (!i_cm_id) + return -ENOMEM; + + i_cm_id->cm_id = cm_id; + + spin_lock_irq(&rds_iwdev->spinlock); + list_add_tail(&i_cm_id->list, &rds_iwdev->cm_id_list); + spin_unlock_irq(&rds_iwdev->spinlock); + + return 0; +} + +static void rds_iw_remove_cm_id(struct rds_iw_device *rds_iwdev, + struct rdma_cm_id *cm_id) +{ + struct rds_iw_cm_id *i_cm_id; + + spin_lock_irq(&rds_iwdev->spinlock); + list_for_each_entry(i_cm_id, &rds_iwdev->cm_id_list, list) { + if (i_cm_id->cm_id == cm_id) { + list_del(&i_cm_id->list); + kfree(i_cm_id); + break; + } + } + spin_unlock_irq(&rds_iwdev->spinlock); +} + + +int rds_iw_update_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id) +{ + struct sockaddr_in *src_addr, *dst_addr; + struct rds_iw_device *rds_iwdev_old; + struct rds_sock rs; + struct rdma_cm_id *pcm_id; + int rc; + + src_addr = (struct sockaddr_in *)&cm_id->route.addr.src_addr; + dst_addr = (struct sockaddr_in *)&cm_id->route.addr.dst_addr; + + rs.rs_bound_addr = src_addr->sin_addr.s_addr; + rs.rs_bound_port = src_addr->sin_port; + rs.rs_conn_addr = dst_addr->sin_addr.s_addr; + rs.rs_conn_port = dst_addr->sin_port; + + rc = rds_iw_get_device(&rs, &rds_iwdev_old, &pcm_id); + if (rc) + rds_iw_remove_cm_id(rds_iwdev, cm_id); + + return rds_iw_add_cm_id(rds_iwdev, cm_id); +} + +void rds_iw_add_conn(struct rds_iw_device *rds_iwdev, struct rds_connection *conn) +{ + struct rds_iw_connection *ic = conn->c_transport_data; + + /* conn was previously on the nodev_conns_list */ + spin_lock_irq(&iw_nodev_conns_lock); + BUG_ON(list_empty(&iw_nodev_conns)); + BUG_ON(list_empty(&ic->iw_node)); + list_del(&ic->iw_node); + + spin_lock(&rds_iwdev->spinlock); + list_add_tail(&ic->iw_node, &rds_iwdev->conn_list); + spin_unlock(&rds_iwdev->spinlock); + spin_unlock_irq(&iw_nodev_conns_lock); + + ic->rds_iwdev = rds_iwdev; +} + +void rds_iw_remove_conn(struct rds_iw_device *rds_iwdev, struct rds_connection *conn) +{ + struct rds_iw_connection *ic = conn->c_transport_data; + + /* place conn on nodev_conns_list */ + spin_lock(&iw_nodev_conns_lock); + + spin_lock_irq(&rds_iwdev->spinlock); + BUG_ON(list_empty(&ic->iw_node)); + list_del(&ic->iw_node); + spin_unlock_irq(&rds_iwdev->spinlock); + + list_add_tail(&ic->iw_node, &iw_nodev_conns); + + spin_unlock(&iw_nodev_conns_lock); + + rds_iw_remove_cm_id(ic->rds_iwdev, ic->i_cm_id); + ic->rds_iwdev = NULL; +} + +void __rds_iw_destroy_conns(struct list_head *list, spinlock_t *list_lock) +{ + struct rds_iw_connection *ic, *_ic; + LIST_HEAD(tmp_list); + + /* avoid calling conn_destroy with irqs off */ + spin_lock_irq(list_lock); + list_splice(list, &tmp_list); + INIT_LIST_HEAD(list); + spin_unlock_irq(list_lock); + + list_for_each_entry_safe(ic, _ic, &tmp_list, iw_node) + rds_conn_destroy(ic->conn); +} + +static void rds_iw_set_scatterlist(struct rds_iw_scatterlist *sg, + struct scatterlist *list, unsigned int sg_len) +{ + sg->list = list; + sg->len = sg_len; + sg->dma_len = 0; + sg->dma_npages = 0; + sg->bytes = 0; +} + +static u64 *rds_iw_map_scatterlist(struct rds_iw_device *rds_iwdev, + struct rds_iw_scatterlist *sg) +{ + struct ib_device *dev = rds_iwdev->dev; + u64 *dma_pages = NULL; + int i, j, ret; + + WARN_ON(sg->dma_len); + + sg->dma_len = ib_dma_map_sg(dev, sg->list, sg->len, DMA_BIDIRECTIONAL); + if (unlikely(!sg->dma_len)) { + printk(KERN_WARNING "RDS/IW: dma_map_sg failed!\n"); + return ERR_PTR(-EBUSY); + } + + sg->bytes = 0; + sg->dma_npages = 0; + + ret = -EINVAL; + for (i = 0; i < sg->dma_len; ++i) { + unsigned int dma_len = ib_sg_dma_len(dev, &sg->list[i]); + u64 dma_addr = ib_sg_dma_address(dev, &sg->list[i]); + u64 end_addr; + + sg->bytes += dma_len; + + end_addr = dma_addr + dma_len; + if (dma_addr & PAGE_MASK) { + if (i > 0) + goto out_unmap; + dma_addr &= ~PAGE_MASK; + } + if (end_addr & PAGE_MASK) { + if (i < sg->dma_len - 1) + goto out_unmap; + end_addr = (end_addr + PAGE_MASK) & ~PAGE_MASK; + } + + sg->dma_npages += (end_addr - dma_addr) >> PAGE_SHIFT; + } + + /* Now gather the dma addrs into one list */ + if (sg->dma_npages > fastreg_message_size) + goto out_unmap; + + dma_pages = kmalloc(sizeof(u64) * sg->dma_npages, GFP_ATOMIC); + if (!dma_pages) { + ret = -ENOMEM; + goto out_unmap; + } + + for (i = j = 0; i < sg->dma_len; ++i) { + unsigned int dma_len = ib_sg_dma_len(dev, &sg->list[i]); + u64 dma_addr = ib_sg_dma_address(dev, &sg->list[i]); + u64 end_addr; + + end_addr = dma_addr + dma_len; + dma_addr &= ~PAGE_MASK; + for (; dma_addr < end_addr; dma_addr += PAGE_SIZE) + dma_pages[j++] = dma_addr; + BUG_ON(j > sg->dma_npages); + } + + return dma_pages; + +out_unmap: + ib_dma_unmap_sg(rds_iwdev->dev, sg->list, sg->len, DMA_BIDIRECTIONAL); + sg->dma_len = 0; + kfree(dma_pages); + return ERR_PTR(ret); +} + + +struct rds_iw_mr_pool *rds_iw_create_mr_pool(struct rds_iw_device *rds_iwdev) +{ + struct rds_iw_mr_pool *pool; + + pool = kzalloc(sizeof(*pool), GFP_KERNEL); + if (!pool) { + printk(KERN_WARNING "RDS/IW: rds_iw_create_mr_pool alloc error\n"); + return ERR_PTR(-ENOMEM); + } + + pool->device = rds_iwdev; + INIT_LIST_HEAD(&pool->dirty_list); + INIT_LIST_HEAD(&pool->clean_list); + mutex_init(&pool->flush_lock); + spin_lock_init(&pool->list_lock); + INIT_WORK(&pool->flush_worker, rds_iw_mr_pool_flush_worker); + + pool->max_message_size = fastreg_message_size; + pool->max_items = fastreg_pool_size; + pool->max_free_pinned = pool->max_items * pool->max_message_size / 4; + pool->max_pages = fastreg_message_size; + + /* We never allow more than max_items MRs to be allocated. + * When we exceed more than max_items_soft, we start freeing + * items more aggressively. + * Make sure that max_items > max_items_soft > max_items / 2 + */ + pool->max_items_soft = pool->max_items * 3 / 4; + + return pool; +} + +void rds_iw_get_mr_info(struct rds_iw_device *rds_iwdev, struct rds_info_rdma_connection *iinfo) +{ + struct rds_iw_mr_pool *pool = rds_iwdev->mr_pool; + + iinfo->rdma_mr_max = pool->max_items; + iinfo->rdma_mr_size = pool->max_pages; +} + +void rds_iw_destroy_mr_pool(struct rds_iw_mr_pool *pool) +{ + flush_workqueue(rds_wq); + rds_iw_flush_mr_pool(pool, 1); + BUG_ON(atomic_read(&pool->item_count)); + BUG_ON(atomic_read(&pool->free_pinned)); + kfree(pool); +} + +static inline struct rds_iw_mr *rds_iw_reuse_fmr(struct rds_iw_mr_pool *pool) +{ + struct rds_iw_mr *ibmr = NULL; + unsigned long flags; + + spin_lock_irqsave(&pool->list_lock, flags); + if (!list_empty(&pool->clean_list)) { + ibmr = list_entry(pool->clean_list.next, struct rds_iw_mr, mapping.m_list); + list_del_init(&ibmr->mapping.m_list); + } + spin_unlock_irqrestore(&pool->list_lock, flags); + + return ibmr; +} + +static struct rds_iw_mr *rds_iw_alloc_mr(struct rds_iw_device *rds_iwdev) +{ + struct rds_iw_mr_pool *pool = rds_iwdev->mr_pool; + struct rds_iw_mr *ibmr = NULL; + int err = 0, iter = 0; + + while (1) { + ibmr = rds_iw_reuse_fmr(pool); + if (ibmr) + return ibmr; + + /* No clean MRs - now we have the choice of either + * allocating a fresh MR up to the limit imposed by the + * driver, or flush any dirty unused MRs. + * We try to avoid stalling in the send path if possible, + * so we allocate as long as we're allowed to. + * + * We're fussy with enforcing the FMR limit, though. If the driver + * tells us we can't use more than N fmrs, we shouldn't start + * arguing with it */ + if (atomic_inc_return(&pool->item_count) <= pool->max_items) + break; + + atomic_dec(&pool->item_count); + + if (++iter > 2) { + rds_iw_stats_inc(s_iw_rdma_mr_pool_depleted); + return ERR_PTR(-EAGAIN); + } + + /* We do have some empty MRs. Flush them out. */ + rds_iw_stats_inc(s_iw_rdma_mr_pool_wait); + rds_iw_flush_mr_pool(pool, 0); + } + + ibmr = kzalloc(sizeof(*ibmr), GFP_KERNEL); + if (!ibmr) { + err = -ENOMEM; + goto out_no_cigar; + } + + spin_lock_init(&ibmr->mapping.m_lock); + INIT_LIST_HEAD(&ibmr->mapping.m_list); + ibmr->mapping.m_mr = ibmr; + + err = rds_iw_init_fastreg(pool, ibmr); + if (err) + goto out_no_cigar; + + rds_iw_stats_inc(s_iw_rdma_mr_alloc); + return ibmr; + +out_no_cigar: + if (ibmr) { + rds_iw_destroy_fastreg(pool, ibmr); + kfree(ibmr); + } + atomic_dec(&pool->item_count); + return ERR_PTR(err); +} + +void rds_iw_sync_mr(void *trans_private, int direction) +{ + struct rds_iw_mr *ibmr = trans_private; + struct rds_iw_device *rds_iwdev = ibmr->device; + + switch (direction) { + case DMA_FROM_DEVICE: + ib_dma_sync_sg_for_cpu(rds_iwdev->dev, ibmr->mapping.m_sg.list, + ibmr->mapping.m_sg.dma_len, DMA_BIDIRECTIONAL); + break; + case DMA_TO_DEVICE: + ib_dma_sync_sg_for_device(rds_iwdev->dev, ibmr->mapping.m_sg.list, + ibmr->mapping.m_sg.dma_len, DMA_BIDIRECTIONAL); + break; + } +} + +/* + * Flush our pool of MRs. + * At a minimum, all currently unused MRs are unmapped. + * If the number of MRs allocated exceeds the limit, we also try + * to free as many MRs as needed to get back to this limit. + */ +static int rds_iw_flush_mr_pool(struct rds_iw_mr_pool *pool, int free_all) +{ + struct rds_iw_mr *ibmr, *next; + LIST_HEAD(unmap_list); + LIST_HEAD(kill_list); + unsigned long flags; + unsigned int nfreed = 0, ncleaned = 0, unpinned = 0; + int ret = 0; + + rds_iw_stats_inc(s_iw_rdma_mr_pool_flush); + + mutex_lock(&pool->flush_lock); + + spin_lock_irqsave(&pool->list_lock, flags); + /* Get the list of all mappings to be destroyed */ + list_splice_init(&pool->dirty_list, &unmap_list); + if (free_all) + list_splice_init(&pool->clean_list, &kill_list); + spin_unlock_irqrestore(&pool->list_lock, flags); + + /* Batched invalidate of dirty MRs. + * For FMR based MRs, the mappings on the unmap list are + * actually members of an ibmr (ibmr->mapping). They either + * migrate to the kill_list, or have been cleaned and should be + * moved to the clean_list. + * For fastregs, they will be dynamically allocated, and + * will be destroyed by the unmap function. + */ + if (!list_empty(&unmap_list)) { + ncleaned = rds_iw_unmap_fastreg_list(pool, &unmap_list, + &kill_list, &unpinned); + /* If we've been asked to destroy all MRs, move those + * that were simply cleaned to the kill list */ + if (free_all) + list_splice_init(&unmap_list, &kill_list); + } + + /* Destroy any MRs that are past their best before date */ + list_for_each_entry_safe(ibmr, next, &kill_list, mapping.m_list) { + rds_iw_stats_inc(s_iw_rdma_mr_free); + list_del(&ibmr->mapping.m_list); + rds_iw_destroy_fastreg(pool, ibmr); + kfree(ibmr); + nfreed++; + } + + /* Anything that remains are laundered ibmrs, which we can add + * back to the clean list. */ + if (!list_empty(&unmap_list)) { + spin_lock_irqsave(&pool->list_lock, flags); + list_splice(&unmap_list, &pool->clean_list); + spin_unlock_irqrestore(&pool->list_lock, flags); + } + + atomic_sub(unpinned, &pool->free_pinned); + atomic_sub(ncleaned, &pool->dirty_count); + atomic_sub(nfreed, &pool->item_count); + + mutex_unlock(&pool->flush_lock); + return ret; +} + +static void rds_iw_mr_pool_flush_worker(struct work_struct *work) +{ + struct rds_iw_mr_pool *pool = container_of(work, struct rds_iw_mr_pool, flush_worker); + + rds_iw_flush_mr_pool(pool, 0); +} + +void rds_iw_free_mr(void *trans_private, int invalidate) +{ + struct rds_iw_mr *ibmr = trans_private; + struct rds_iw_mr_pool *pool = ibmr->device->mr_pool; + + rdsdebug("RDS/IW: free_mr nents %u\n", ibmr->mapping.m_sg.len); + if (!pool) + return; + + /* Return it to the pool's free list */ + rds_iw_free_fastreg(pool, ibmr); + + /* If we've pinned too many pages, request a flush */ + if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned || + atomic_read(&pool->dirty_count) >= pool->max_items / 10) + queue_work(rds_wq, &pool->flush_worker); + + if (invalidate) { + if (likely(!in_interrupt())) { + rds_iw_flush_mr_pool(pool, 0); + } else { + /* We get here if the user created a MR marked + * as use_once and invalidate at the same time. */ + queue_work(rds_wq, &pool->flush_worker); + } + } +} + +void rds_iw_flush_mrs(void) +{ + struct rds_iw_device *rds_iwdev; + + list_for_each_entry(rds_iwdev, &rds_iw_devices, list) { + struct rds_iw_mr_pool *pool = rds_iwdev->mr_pool; + + if (pool) + rds_iw_flush_mr_pool(pool, 0); + } +} + +void *rds_iw_get_mr(struct scatterlist *sg, unsigned long nents, + struct rds_sock *rs, u32 *key_ret) +{ + struct rds_iw_device *rds_iwdev; + struct rds_iw_mr *ibmr = NULL; + struct rdma_cm_id *cm_id; + int ret; + + ret = rds_iw_get_device(rs, &rds_iwdev, &cm_id); + if (ret || !cm_id) { + ret = -ENODEV; + goto out; + } + + if (!rds_iwdev->mr_pool) { + ret = -ENODEV; + goto out; + } + + ibmr = rds_iw_alloc_mr(rds_iwdev); + if (IS_ERR(ibmr)) + return ibmr; + + ibmr->cm_id = cm_id; + ibmr->device = rds_iwdev; + + ret = rds_iw_map_fastreg(rds_iwdev->mr_pool, ibmr, sg, nents); + if (ret == 0) + *key_ret = ibmr->mr->rkey; + else + printk(KERN_WARNING "RDS/IW: failed to map mr (errno=%d)\n", ret); + +out: + if (ret) { + if (ibmr) + rds_iw_free_mr(ibmr, 0); + ibmr = ERR_PTR(ret); + } + return ibmr; +} + +/* + * iWARP fastreg handling + * + * The life cycle of a fastreg registration is a bit different from + * FMRs. + * The idea behind fastreg is to have one MR, to which we bind different + * mappings over time. To avoid stalling on the expensive map and invalidate + * operations, these operations are pipelined on the same send queue on + * which we want to send the message containing the r_key. + * + * This creates a bit of a problem for us, as we do not have the destination + * IP in GET_MR, so the connection must be setup prior to the GET_MR call for + * RDMA to be correctly setup. If a fastreg request is present, rds_iw_xmit + * will try to queue a LOCAL_INV (if needed) and a FAST_REG_MR work request + * before queuing the SEND. When completions for these arrive, they are + * dispatched to the MR has a bit set showing that RDMa can be performed. + * + * There is another interesting aspect that's related to invalidation. + * The application can request that a mapping is invalidated in FREE_MR. + * The expectation there is that this invalidation step includes ALL + * PREVIOUSLY FREED MRs. + */ +static int rds_iw_init_fastreg(struct rds_iw_mr_pool *pool, + struct rds_iw_mr *ibmr) +{ + struct rds_iw_device *rds_iwdev = pool->device; + struct ib_fast_reg_page_list *page_list = NULL; + struct ib_mr *mr; + int err; + + mr = ib_alloc_fast_reg_mr(rds_iwdev->pd, pool->max_message_size); + if (IS_ERR(mr)) { + err = PTR_ERR(mr); + + printk(KERN_WARNING "RDS/IW: ib_alloc_fast_reg_mr failed (err=%d)\n", err); + return err; + } + + /* FIXME - this is overkill, but mapping->m_sg.dma_len/mapping->m_sg.dma_npages + * is not filled in. + */ + page_list = ib_alloc_fast_reg_page_list(rds_iwdev->dev, pool->max_message_size); + if (IS_ERR(page_list)) { + err = PTR_ERR(page_list); + + printk(KERN_WARNING "RDS/IW: ib_alloc_fast_reg_page_list failed (err=%d)\n", err); + ib_dereg_mr(mr); + return err; + } + + ibmr->page_list = page_list; + ibmr->mr = mr; + return 0; +} + +static int rds_iw_rdma_build_fastreg(struct rds_iw_mapping *mapping) +{ + struct rds_iw_mr *ibmr = mapping->m_mr; + struct ib_send_wr f_wr, *failed_wr; + int ret; + + /* + * Perform a WR for the fast_reg_mr. Each individual page + * in the sg list is added to the fast reg page list and placed + * inside the fast_reg_mr WR. The key used is a rolling 8bit + * counter, which should guarantee uniqueness. + */ + ib_update_fast_reg_key(ibmr->mr, ibmr->remap_count++); + mapping->m_rkey = ibmr->mr->rkey; + + memset(&f_wr, 0, sizeof(f_wr)); + f_wr.wr_id = RDS_IW_FAST_REG_WR_ID; + f_wr.opcode = IB_WR_FAST_REG_MR; + f_wr.wr.fast_reg.length = mapping->m_sg.bytes; + f_wr.wr.fast_reg.rkey = mapping->m_rkey; + f_wr.wr.fast_reg.page_list = ibmr->page_list; + f_wr.wr.fast_reg.page_list_len = mapping->m_sg.dma_len; + f_wr.wr.fast_reg.page_shift = PAGE_SHIFT; + f_wr.wr.fast_reg.access_flags = IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_READ | + IB_ACCESS_REMOTE_WRITE; + f_wr.wr.fast_reg.iova_start = 0; + f_wr.send_flags = IB_SEND_SIGNALED; + + failed_wr = &f_wr; + ret = ib_post_send(ibmr->cm_id->qp, &f_wr, &failed_wr); + BUG_ON(failed_wr != &f_wr); + if (ret) + printk_ratelimited(KERN_WARNING "RDS/IW: %s:%d ib_post_send returned %d\n", + __func__, __LINE__, ret); + return ret; +} + +static int rds_iw_rdma_fastreg_inv(struct rds_iw_mr *ibmr) +{ + struct ib_send_wr s_wr, *failed_wr; + int ret = 0; + + if (!ibmr->cm_id->qp || !ibmr->mr) + goto out; + + memset(&s_wr, 0, sizeof(s_wr)); + s_wr.wr_id = RDS_IW_LOCAL_INV_WR_ID; + s_wr.opcode = IB_WR_LOCAL_INV; + s_wr.ex.invalidate_rkey = ibmr->mr->rkey; + s_wr.send_flags = IB_SEND_SIGNALED; + + failed_wr = &s_wr; + ret = ib_post_send(ibmr->cm_id->qp, &s_wr, &failed_wr); + if (ret) { + printk_ratelimited(KERN_WARNING "RDS/IW: %s:%d ib_post_send returned %d\n", + __func__, __LINE__, ret); + goto out; + } +out: + return ret; +} + +static int rds_iw_map_fastreg(struct rds_iw_mr_pool *pool, + struct rds_iw_mr *ibmr, + struct scatterlist *sg, + unsigned int sg_len) +{ + struct rds_iw_device *rds_iwdev = pool->device; + struct rds_iw_mapping *mapping = &ibmr->mapping; + u64 *dma_pages; + int i, ret = 0; + + rds_iw_set_scatterlist(&mapping->m_sg, sg, sg_len); + + dma_pages = rds_iw_map_scatterlist(rds_iwdev, &mapping->m_sg); + if (IS_ERR(dma_pages)) { + ret = PTR_ERR(dma_pages); + dma_pages = NULL; + goto out; + } + + if (mapping->m_sg.dma_len > pool->max_message_size) { + ret = -EMSGSIZE; + goto out; + } + + for (i = 0; i < mapping->m_sg.dma_npages; ++i) + ibmr->page_list->page_list[i] = dma_pages[i]; + + ret = rds_iw_rdma_build_fastreg(mapping); + if (ret) + goto out; + + rds_iw_stats_inc(s_iw_rdma_mr_used); + +out: + kfree(dma_pages); + + return ret; +} + +/* + * "Free" a fastreg MR. + */ +static void rds_iw_free_fastreg(struct rds_iw_mr_pool *pool, + struct rds_iw_mr *ibmr) +{ + unsigned long flags; + int ret; + + if (!ibmr->mapping.m_sg.dma_len) + return; + + ret = rds_iw_rdma_fastreg_inv(ibmr); + if (ret) + return; + + /* Try to post the LOCAL_INV WR to the queue. */ + spin_lock_irqsave(&pool->list_lock, flags); + + list_add_tail(&ibmr->mapping.m_list, &pool->dirty_list); + atomic_add(ibmr->mapping.m_sg.len, &pool->free_pinned); + atomic_inc(&pool->dirty_count); + + spin_unlock_irqrestore(&pool->list_lock, flags); +} + +static unsigned int rds_iw_unmap_fastreg_list(struct rds_iw_mr_pool *pool, + struct list_head *unmap_list, + struct list_head *kill_list, + int *unpinned) +{ + struct rds_iw_mapping *mapping, *next; + unsigned int ncleaned = 0; + LIST_HEAD(laundered); + + /* Batched invalidation of fastreg MRs. + * Why do we do it this way, even though we could pipeline unmap + * and remap? The reason is the application semantics - when the + * application requests an invalidation of MRs, it expects all + * previously released R_Keys to become invalid. + * + * If we implement MR reuse naively, we risk memory corruption + * (this has actually been observed). So the default behavior + * requires that a MR goes through an explicit unmap operation before + * we can reuse it again. + * + * We could probably improve on this a little, by allowing immediate + * reuse of a MR on the same socket (eg you could add small + * cache of unused MRs to strct rds_socket - GET_MR could grab one + * of these without requiring an explicit invalidate). + */ + while (!list_empty(unmap_list)) { + unsigned long flags; + + spin_lock_irqsave(&pool->list_lock, flags); + list_for_each_entry_safe(mapping, next, unmap_list, m_list) { + *unpinned += mapping->m_sg.len; + list_move(&mapping->m_list, &laundered); + ncleaned++; + } + spin_unlock_irqrestore(&pool->list_lock, flags); + } + + /* Move all laundered mappings back to the unmap list. + * We do not kill any WRs right now - it doesn't seem the + * fastreg API has a max_remap limit. */ + list_splice_init(&laundered, unmap_list); + + return ncleaned; +} + +static void rds_iw_destroy_fastreg(struct rds_iw_mr_pool *pool, + struct rds_iw_mr *ibmr) +{ + if (ibmr->page_list) + ib_free_fast_reg_page_list(ibmr->page_list); + if (ibmr->mr) + ib_dereg_mr(ibmr->mr); +} diff --git a/net/rds/iw_recv.c b/net/rds/iw_recv.c new file mode 100644 index 0000000..aa8bf67 --- /dev/null +++ b/net/rds/iw_recv.c @@ -0,0 +1,919 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include +#include +#include +#include +#include + +#include "rds.h" +#include "iw.h" + +static struct kmem_cache *rds_iw_incoming_slab; +static struct kmem_cache *rds_iw_frag_slab; +static atomic_t rds_iw_allocation = ATOMIC_INIT(0); + +static void rds_iw_frag_drop_page(struct rds_page_frag *frag) +{ + rdsdebug("frag %p page %p\n", frag, frag->f_page); + __free_page(frag->f_page); + frag->f_page = NULL; +} + +static void rds_iw_frag_free(struct rds_page_frag *frag) +{ + rdsdebug("frag %p page %p\n", frag, frag->f_page); + BUG_ON(frag->f_page); + kmem_cache_free(rds_iw_frag_slab, frag); +} + +/* + * We map a page at a time. Its fragments are posted in order. This + * is called in fragment order as the fragments get send completion events. + * Only the last frag in the page performs the unmapping. + * + * It's OK for ring cleanup to call this in whatever order it likes because + * DMA is not in flight and so we can unmap while other ring entries still + * hold page references in their frags. + */ +static void rds_iw_recv_unmap_page(struct rds_iw_connection *ic, + struct rds_iw_recv_work *recv) +{ + struct rds_page_frag *frag = recv->r_frag; + + rdsdebug("recv %p frag %p page %p\n", recv, frag, frag->f_page); + if (frag->f_mapped) + ib_dma_unmap_page(ic->i_cm_id->device, + frag->f_mapped, + RDS_FRAG_SIZE, DMA_FROM_DEVICE); + frag->f_mapped = 0; +} + +void rds_iw_recv_init_ring(struct rds_iw_connection *ic) +{ + struct rds_iw_recv_work *recv; + u32 i; + + for (i = 0, recv = ic->i_recvs; i < ic->i_recv_ring.w_nr; i++, recv++) { + struct ib_sge *sge; + + recv->r_iwinc = NULL; + recv->r_frag = NULL; + + recv->r_wr.next = NULL; + recv->r_wr.wr_id = i; + recv->r_wr.sg_list = recv->r_sge; + recv->r_wr.num_sge = RDS_IW_RECV_SGE; + + sge = rds_iw_data_sge(ic, recv->r_sge); + sge->addr = 0; + sge->length = RDS_FRAG_SIZE; + sge->lkey = 0; + + sge = rds_iw_header_sge(ic, recv->r_sge); + sge->addr = ic->i_recv_hdrs_dma + (i * sizeof(struct rds_header)); + sge->length = sizeof(struct rds_header); + sge->lkey = 0; + } +} + +static void rds_iw_recv_clear_one(struct rds_iw_connection *ic, + struct rds_iw_recv_work *recv) +{ + if (recv->r_iwinc) { + rds_inc_put(&recv->r_iwinc->ii_inc); + recv->r_iwinc = NULL; + } + if (recv->r_frag) { + rds_iw_recv_unmap_page(ic, recv); + if (recv->r_frag->f_page) + rds_iw_frag_drop_page(recv->r_frag); + rds_iw_frag_free(recv->r_frag); + recv->r_frag = NULL; + } +} + +void rds_iw_recv_clear_ring(struct rds_iw_connection *ic) +{ + u32 i; + + for (i = 0; i < ic->i_recv_ring.w_nr; i++) + rds_iw_recv_clear_one(ic, &ic->i_recvs[i]); + + if (ic->i_frag.f_page) + rds_iw_frag_drop_page(&ic->i_frag); +} + +static int rds_iw_recv_refill_one(struct rds_connection *conn, + struct rds_iw_recv_work *recv, + gfp_t kptr_gfp, gfp_t page_gfp) +{ + struct rds_iw_connection *ic = conn->c_transport_data; + dma_addr_t dma_addr; + struct ib_sge *sge; + int ret = -ENOMEM; + + if (!recv->r_iwinc) { + if (!atomic_add_unless(&rds_iw_allocation, 1, rds_iw_sysctl_max_recv_allocation)) { + rds_iw_stats_inc(s_iw_rx_alloc_limit); + goto out; + } + recv->r_iwinc = kmem_cache_alloc(rds_iw_incoming_slab, + kptr_gfp); + if (!recv->r_iwinc) { + atomic_dec(&rds_iw_allocation); + goto out; + } + INIT_LIST_HEAD(&recv->r_iwinc->ii_frags); + rds_inc_init(&recv->r_iwinc->ii_inc, conn, conn->c_faddr); + } + + if (!recv->r_frag) { + recv->r_frag = kmem_cache_alloc(rds_iw_frag_slab, kptr_gfp); + if (!recv->r_frag) + goto out; + INIT_LIST_HEAD(&recv->r_frag->f_item); + recv->r_frag->f_page = NULL; + } + + if (!ic->i_frag.f_page) { + ic->i_frag.f_page = alloc_page(page_gfp); + if (!ic->i_frag.f_page) + goto out; + ic->i_frag.f_offset = 0; + } + + dma_addr = ib_dma_map_page(ic->i_cm_id->device, + ic->i_frag.f_page, + ic->i_frag.f_offset, + RDS_FRAG_SIZE, + DMA_FROM_DEVICE); + if (ib_dma_mapping_error(ic->i_cm_id->device, dma_addr)) + goto out; + + /* + * Once we get the RDS_PAGE_LAST_OFF frag then rds_iw_frag_unmap() + * must be called on this recv. This happens as completions hit + * in order or on connection shutdown. + */ + recv->r_frag->f_page = ic->i_frag.f_page; + recv->r_frag->f_offset = ic->i_frag.f_offset; + recv->r_frag->f_mapped = dma_addr; + + sge = rds_iw_data_sge(ic, recv->r_sge); + sge->addr = dma_addr; + sge->length = RDS_FRAG_SIZE; + + sge = rds_iw_header_sge(ic, recv->r_sge); + sge->addr = ic->i_recv_hdrs_dma + (recv - ic->i_recvs) * sizeof(struct rds_header); + sge->length = sizeof(struct rds_header); + + get_page(recv->r_frag->f_page); + + if (ic->i_frag.f_offset < RDS_PAGE_LAST_OFF) { + ic->i_frag.f_offset += RDS_FRAG_SIZE; + } else { + put_page(ic->i_frag.f_page); + ic->i_frag.f_page = NULL; + ic->i_frag.f_offset = 0; + } + + ret = 0; +out: + return ret; +} + +/* + * This tries to allocate and post unused work requests after making sure that + * they have all the allocations they need to queue received fragments into + * sockets. The i_recv_mutex is held here so that ring_alloc and _unalloc + * pairs don't go unmatched. + * + * -1 is returned if posting fails due to temporary resource exhaustion. + */ +int rds_iw_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp, + gfp_t page_gfp, int prefill) +{ + struct rds_iw_connection *ic = conn->c_transport_data; + struct rds_iw_recv_work *recv; + struct ib_recv_wr *failed_wr; + unsigned int posted = 0; + int ret = 0; + u32 pos; + + while ((prefill || rds_conn_up(conn)) && + rds_iw_ring_alloc(&ic->i_recv_ring, 1, &pos)) { + if (pos >= ic->i_recv_ring.w_nr) { + printk(KERN_NOTICE "Argh - ring alloc returned pos=%u\n", + pos); + ret = -EINVAL; + break; + } + + recv = &ic->i_recvs[pos]; + ret = rds_iw_recv_refill_one(conn, recv, kptr_gfp, page_gfp); + if (ret) { + ret = -1; + break; + } + + /* XXX when can this fail? */ + ret = ib_post_recv(ic->i_cm_id->qp, &recv->r_wr, &failed_wr); + rdsdebug("recv %p iwinc %p page %p addr %lu ret %d\n", recv, + recv->r_iwinc, recv->r_frag->f_page, + (long) recv->r_frag->f_mapped, ret); + if (ret) { + rds_iw_conn_error(conn, "recv post on " + "%pI4 returned %d, disconnecting and " + "reconnecting\n", &conn->c_faddr, + ret); + ret = -1; + break; + } + + posted++; + } + + /* We're doing flow control - update the window. */ + if (ic->i_flowctl && posted) + rds_iw_advertise_credits(conn, posted); + + if (ret) + rds_iw_ring_unalloc(&ic->i_recv_ring, 1); + return ret; +} + +static void rds_iw_inc_purge(struct rds_incoming *inc) +{ + struct rds_iw_incoming *iwinc; + struct rds_page_frag *frag; + struct rds_page_frag *pos; + + iwinc = container_of(inc, struct rds_iw_incoming, ii_inc); + rdsdebug("purging iwinc %p inc %p\n", iwinc, inc); + + list_for_each_entry_safe(frag, pos, &iwinc->ii_frags, f_item) { + list_del_init(&frag->f_item); + rds_iw_frag_drop_page(frag); + rds_iw_frag_free(frag); + } +} + +void rds_iw_inc_free(struct rds_incoming *inc) +{ + struct rds_iw_incoming *iwinc; + + iwinc = container_of(inc, struct rds_iw_incoming, ii_inc); + + rds_iw_inc_purge(inc); + rdsdebug("freeing iwinc %p inc %p\n", iwinc, inc); + BUG_ON(!list_empty(&iwinc->ii_frags)); + kmem_cache_free(rds_iw_incoming_slab, iwinc); + atomic_dec(&rds_iw_allocation); + BUG_ON(atomic_read(&rds_iw_allocation) < 0); +} + +int rds_iw_inc_copy_to_user(struct rds_incoming *inc, struct iovec *first_iov, + size_t size) +{ + struct rds_iw_incoming *iwinc; + struct rds_page_frag *frag; + struct iovec *iov = first_iov; + unsigned long to_copy; + unsigned long frag_off = 0; + unsigned long iov_off = 0; + int copied = 0; + int ret; + u32 len; + + iwinc = container_of(inc, struct rds_iw_incoming, ii_inc); + frag = list_entry(iwinc->ii_frags.next, struct rds_page_frag, f_item); + len = be32_to_cpu(inc->i_hdr.h_len); + + while (copied < size && copied < len) { + if (frag_off == RDS_FRAG_SIZE) { + frag = list_entry(frag->f_item.next, + struct rds_page_frag, f_item); + frag_off = 0; + } + while (iov_off == iov->iov_len) { + iov_off = 0; + iov++; + } + + to_copy = min(iov->iov_len - iov_off, RDS_FRAG_SIZE - frag_off); + to_copy = min_t(size_t, to_copy, size - copied); + to_copy = min_t(unsigned long, to_copy, len - copied); + + rdsdebug("%lu bytes to user [%p, %zu] + %lu from frag " + "[%p, %lu] + %lu\n", + to_copy, iov->iov_base, iov->iov_len, iov_off, + frag->f_page, frag->f_offset, frag_off); + + /* XXX needs + offset for multiple recvs per page */ + ret = rds_page_copy_to_user(frag->f_page, + frag->f_offset + frag_off, + iov->iov_base + iov_off, + to_copy); + if (ret) { + copied = ret; + break; + } + + iov_off += to_copy; + frag_off += to_copy; + copied += to_copy; + } + + return copied; +} + +/* ic starts out kzalloc()ed */ +void rds_iw_recv_init_ack(struct rds_iw_connection *ic) +{ + struct ib_send_wr *wr = &ic->i_ack_wr; + struct ib_sge *sge = &ic->i_ack_sge; + + sge->addr = ic->i_ack_dma; + sge->length = sizeof(struct rds_header); + sge->lkey = rds_iw_local_dma_lkey(ic); + + wr->sg_list = sge; + wr->num_sge = 1; + wr->opcode = IB_WR_SEND; + wr->wr_id = RDS_IW_ACK_WR_ID; + wr->send_flags = IB_SEND_SIGNALED | IB_SEND_SOLICITED; +} + +/* + * You'd think that with reliable IB connections you wouldn't need to ack + * messages that have been received. The problem is that IB hardware generates + * an ack message before it has DMAed the message into memory. This creates a + * potential message loss if the HCA is disabled for any reason between when it + * sends the ack and before the message is DMAed and processed. This is only a + * potential issue if another HCA is available for fail-over. + * + * When the remote host receives our ack they'll free the sent message from + * their send queue. To decrease the latency of this we always send an ack + * immediately after we've received messages. + * + * For simplicity, we only have one ack in flight at a time. This puts + * pressure on senders to have deep enough send queues to absorb the latency of + * a single ack frame being in flight. This might not be good enough. + * + * This is implemented by have a long-lived send_wr and sge which point to a + * statically allocated ack frame. This ack wr does not fall under the ring + * accounting that the tx and rx wrs do. The QP attribute specifically makes + * room for it beyond the ring size. Send completion notices its special + * wr_id and avoids working with the ring in that case. + */ +#ifndef KERNEL_HAS_ATOMIC64 +static void rds_iw_set_ack(struct rds_iw_connection *ic, u64 seq, + int ack_required) +{ + unsigned long flags; + + spin_lock_irqsave(&ic->i_ack_lock, flags); + ic->i_ack_next = seq; + if (ack_required) + set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); + spin_unlock_irqrestore(&ic->i_ack_lock, flags); +} + +static u64 rds_iw_get_ack(struct rds_iw_connection *ic) +{ + unsigned long flags; + u64 seq; + + clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); + + spin_lock_irqsave(&ic->i_ack_lock, flags); + seq = ic->i_ack_next; + spin_unlock_irqrestore(&ic->i_ack_lock, flags); + + return seq; +} +#else +static void rds_iw_set_ack(struct rds_iw_connection *ic, u64 seq, + int ack_required) +{ + atomic64_set(&ic->i_ack_next, seq); + if (ack_required) { + smp_mb__before_atomic(); + set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); + } +} + +static u64 rds_iw_get_ack(struct rds_iw_connection *ic) +{ + clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); + smp_mb__after_atomic(); + + return atomic64_read(&ic->i_ack_next); +} +#endif + + +static void rds_iw_send_ack(struct rds_iw_connection *ic, unsigned int adv_credits) +{ + struct rds_header *hdr = ic->i_ack; + struct ib_send_wr *failed_wr; + u64 seq; + int ret; + + seq = rds_iw_get_ack(ic); + + rdsdebug("send_ack: ic %p ack %llu\n", ic, (unsigned long long) seq); + rds_message_populate_header(hdr, 0, 0, 0); + hdr->h_ack = cpu_to_be64(seq); + hdr->h_credit = adv_credits; + rds_message_make_checksum(hdr); + ic->i_ack_queued = jiffies; + + ret = ib_post_send(ic->i_cm_id->qp, &ic->i_ack_wr, &failed_wr); + if (unlikely(ret)) { + /* Failed to send. Release the WR, and + * force another ACK. + */ + clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags); + set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); + + rds_iw_stats_inc(s_iw_ack_send_failure); + + rds_iw_conn_error(ic->conn, "sending ack failed\n"); + } else + rds_iw_stats_inc(s_iw_ack_sent); +} + +/* + * There are 3 ways of getting acknowledgements to the peer: + * 1. We call rds_iw_attempt_ack from the recv completion handler + * to send an ACK-only frame. + * However, there can be only one such frame in the send queue + * at any time, so we may have to postpone it. + * 2. When another (data) packet is transmitted while there's + * an ACK in the queue, we piggyback the ACK sequence number + * on the data packet. + * 3. If the ACK WR is done sending, we get called from the + * send queue completion handler, and check whether there's + * another ACK pending (postponed because the WR was on the + * queue). If so, we transmit it. + * + * We maintain 2 variables: + * - i_ack_flags, which keeps track of whether the ACK WR + * is currently in the send queue or not (IB_ACK_IN_FLIGHT) + * - i_ack_next, which is the last sequence number we received + * + * Potentially, send queue and receive queue handlers can run concurrently. + * It would be nice to not have to use a spinlock to synchronize things, + * but the one problem that rules this out is that 64bit updates are + * not atomic on all platforms. Things would be a lot simpler if + * we had atomic64 or maybe cmpxchg64 everywhere. + * + * Reconnecting complicates this picture just slightly. When we + * reconnect, we may be seeing duplicate packets. The peer + * is retransmitting them, because it hasn't seen an ACK for + * them. It is important that we ACK these. + * + * ACK mitigation adds a header flag "ACK_REQUIRED"; any packet with + * this flag set *MUST* be acknowledged immediately. + */ + +/* + * When we get here, we're called from the recv queue handler. + * Check whether we ought to transmit an ACK. + */ +void rds_iw_attempt_ack(struct rds_iw_connection *ic) +{ + unsigned int adv_credits; + + if (!test_bit(IB_ACK_REQUESTED, &ic->i_ack_flags)) + return; + + if (test_and_set_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags)) { + rds_iw_stats_inc(s_iw_ack_send_delayed); + return; + } + + /* Can we get a send credit? */ + if (!rds_iw_send_grab_credits(ic, 1, &adv_credits, 0, RDS_MAX_ADV_CREDIT)) { + rds_iw_stats_inc(s_iw_tx_throttle); + clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags); + return; + } + + clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); + rds_iw_send_ack(ic, adv_credits); +} + +/* + * We get here from the send completion handler, when the + * adapter tells us the ACK frame was sent. + */ +void rds_iw_ack_send_complete(struct rds_iw_connection *ic) +{ + clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags); + rds_iw_attempt_ack(ic); +} + +/* + * This is called by the regular xmit code when it wants to piggyback + * an ACK on an outgoing frame. + */ +u64 rds_iw_piggyb_ack(struct rds_iw_connection *ic) +{ + if (test_and_clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags)) + rds_iw_stats_inc(s_iw_ack_send_piggybacked); + return rds_iw_get_ack(ic); +} + +/* + * It's kind of lame that we're copying from the posted receive pages into + * long-lived bitmaps. We could have posted the bitmaps and rdma written into + * them. But receiving new congestion bitmaps should be a *rare* event, so + * hopefully we won't need to invest that complexity in making it more + * efficient. By copying we can share a simpler core with TCP which has to + * copy. + */ +static void rds_iw_cong_recv(struct rds_connection *conn, + struct rds_iw_incoming *iwinc) +{ + struct rds_cong_map *map; + unsigned int map_off; + unsigned int map_page; + struct rds_page_frag *frag; + unsigned long frag_off; + unsigned long to_copy; + unsigned long copied; + uint64_t uncongested = 0; + void *addr; + + /* catch completely corrupt packets */ + if (be32_to_cpu(iwinc->ii_inc.i_hdr.h_len) != RDS_CONG_MAP_BYTES) + return; + + map = conn->c_fcong; + map_page = 0; + map_off = 0; + + frag = list_entry(iwinc->ii_frags.next, struct rds_page_frag, f_item); + frag_off = 0; + + copied = 0; + + while (copied < RDS_CONG_MAP_BYTES) { + uint64_t *src, *dst; + unsigned int k; + + to_copy = min(RDS_FRAG_SIZE - frag_off, PAGE_SIZE - map_off); + BUG_ON(to_copy & 7); /* Must be 64bit aligned. */ + + addr = kmap_atomic(frag->f_page); + + src = addr + frag_off; + dst = (void *)map->m_page_addrs[map_page] + map_off; + for (k = 0; k < to_copy; k += 8) { + /* Record ports that became uncongested, ie + * bits that changed from 0 to 1. */ + uncongested |= ~(*src) & *dst; + *dst++ = *src++; + } + kunmap_atomic(addr); + + copied += to_copy; + + map_off += to_copy; + if (map_off == PAGE_SIZE) { + map_off = 0; + map_page++; + } + + frag_off += to_copy; + if (frag_off == RDS_FRAG_SIZE) { + frag = list_entry(frag->f_item.next, + struct rds_page_frag, f_item); + frag_off = 0; + } + } + + /* the congestion map is in little endian order */ + uncongested = le64_to_cpu(uncongested); + + rds_cong_map_updated(map, uncongested); +} + +/* + * Rings are posted with all the allocations they'll need to queue the + * incoming message to the receiving socket so this can't fail. + * All fragments start with a header, so we can make sure we're not receiving + * garbage, and we can tell a small 8 byte fragment from an ACK frame. + */ +struct rds_iw_ack_state { + u64 ack_next; + u64 ack_recv; + unsigned int ack_required:1; + unsigned int ack_next_valid:1; + unsigned int ack_recv_valid:1; +}; + +static void rds_iw_process_recv(struct rds_connection *conn, + struct rds_iw_recv_work *recv, u32 byte_len, + struct rds_iw_ack_state *state) +{ + struct rds_iw_connection *ic = conn->c_transport_data; + struct rds_iw_incoming *iwinc = ic->i_iwinc; + struct rds_header *ihdr, *hdr; + + /* XXX shut down the connection if port 0,0 are seen? */ + + rdsdebug("ic %p iwinc %p recv %p byte len %u\n", ic, iwinc, recv, + byte_len); + + if (byte_len < sizeof(struct rds_header)) { + rds_iw_conn_error(conn, "incoming message " + "from %pI4 didn't include a " + "header, disconnecting and " + "reconnecting\n", + &conn->c_faddr); + return; + } + byte_len -= sizeof(struct rds_header); + + ihdr = &ic->i_recv_hdrs[recv - ic->i_recvs]; + + /* Validate the checksum. */ + if (!rds_message_verify_checksum(ihdr)) { + rds_iw_conn_error(conn, "incoming message " + "from %pI4 has corrupted header - " + "forcing a reconnect\n", + &conn->c_faddr); + rds_stats_inc(s_recv_drop_bad_checksum); + return; + } + + /* Process the ACK sequence which comes with every packet */ + state->ack_recv = be64_to_cpu(ihdr->h_ack); + state->ack_recv_valid = 1; + + /* Process the credits update if there was one */ + if (ihdr->h_credit) + rds_iw_send_add_credits(conn, ihdr->h_credit); + + if (ihdr->h_sport == 0 && ihdr->h_dport == 0 && byte_len == 0) { + /* This is an ACK-only packet. The fact that it gets + * special treatment here is that historically, ACKs + * were rather special beasts. + */ + rds_iw_stats_inc(s_iw_ack_received); + + /* + * Usually the frags make their way on to incs and are then freed as + * the inc is freed. We don't go that route, so we have to drop the + * page ref ourselves. We can't just leave the page on the recv + * because that confuses the dma mapping of pages and each recv's use + * of a partial page. We can leave the frag, though, it will be + * reused. + * + * FIXME: Fold this into the code path below. + */ + rds_iw_frag_drop_page(recv->r_frag); + return; + } + + /* + * If we don't already have an inc on the connection then this + * fragment has a header and starts a message.. copy its header + * into the inc and save the inc so we can hang upcoming fragments + * off its list. + */ + if (!iwinc) { + iwinc = recv->r_iwinc; + recv->r_iwinc = NULL; + ic->i_iwinc = iwinc; + + hdr = &iwinc->ii_inc.i_hdr; + memcpy(hdr, ihdr, sizeof(*hdr)); + ic->i_recv_data_rem = be32_to_cpu(hdr->h_len); + + rdsdebug("ic %p iwinc %p rem %u flag 0x%x\n", ic, iwinc, + ic->i_recv_data_rem, hdr->h_flags); + } else { + hdr = &iwinc->ii_inc.i_hdr; + /* We can't just use memcmp here; fragments of a + * single message may carry different ACKs */ + if (hdr->h_sequence != ihdr->h_sequence || + hdr->h_len != ihdr->h_len || + hdr->h_sport != ihdr->h_sport || + hdr->h_dport != ihdr->h_dport) { + rds_iw_conn_error(conn, + "fragment header mismatch; forcing reconnect\n"); + return; + } + } + + list_add_tail(&recv->r_frag->f_item, &iwinc->ii_frags); + recv->r_frag = NULL; + + if (ic->i_recv_data_rem > RDS_FRAG_SIZE) + ic->i_recv_data_rem -= RDS_FRAG_SIZE; + else { + ic->i_recv_data_rem = 0; + ic->i_iwinc = NULL; + + if (iwinc->ii_inc.i_hdr.h_flags == RDS_FLAG_CONG_BITMAP) + rds_iw_cong_recv(conn, iwinc); + else { + rds_recv_incoming(conn, conn->c_faddr, conn->c_laddr, + &iwinc->ii_inc, GFP_ATOMIC); + state->ack_next = be64_to_cpu(hdr->h_sequence); + state->ack_next_valid = 1; + } + + /* Evaluate the ACK_REQUIRED flag *after* we received + * the complete frame, and after bumping the next_rx + * sequence. */ + if (hdr->h_flags & RDS_FLAG_ACK_REQUIRED) { + rds_stats_inc(s_recv_ack_required); + state->ack_required = 1; + } + + rds_inc_put(&iwinc->ii_inc); + } +} + +/* + * Plucking the oldest entry from the ring can be done concurrently with + * the thread refilling the ring. Each ring operation is protected by + * spinlocks and the transient state of refilling doesn't change the + * recording of which entry is oldest. + * + * This relies on IB only calling one cq comp_handler for each cq so that + * there will only be one caller of rds_recv_incoming() per RDS connection. + */ +void rds_iw_recv_cq_comp_handler(struct ib_cq *cq, void *context) +{ + struct rds_connection *conn = context; + struct rds_iw_connection *ic = conn->c_transport_data; + + rdsdebug("conn %p cq %p\n", conn, cq); + + rds_iw_stats_inc(s_iw_rx_cq_call); + + tasklet_schedule(&ic->i_recv_tasklet); +} + +static inline void rds_poll_cq(struct rds_iw_connection *ic, + struct rds_iw_ack_state *state) +{ + struct rds_connection *conn = ic->conn; + struct ib_wc wc; + struct rds_iw_recv_work *recv; + + while (ib_poll_cq(ic->i_recv_cq, 1, &wc) > 0) { + rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n", + (unsigned long long)wc.wr_id, wc.status, wc.byte_len, + be32_to_cpu(wc.ex.imm_data)); + rds_iw_stats_inc(s_iw_rx_cq_event); + + recv = &ic->i_recvs[rds_iw_ring_oldest(&ic->i_recv_ring)]; + + rds_iw_recv_unmap_page(ic, recv); + + /* + * Also process recvs in connecting state because it is possible + * to get a recv completion _before_ the rdmacm ESTABLISHED + * event is processed. + */ + if (rds_conn_up(conn) || rds_conn_connecting(conn)) { + /* We expect errors as the qp is drained during shutdown */ + if (wc.status == IB_WC_SUCCESS) { + rds_iw_process_recv(conn, recv, wc.byte_len, state); + } else { + rds_iw_conn_error(conn, "recv completion on " + "%pI4 had status %u, disconnecting and " + "reconnecting\n", &conn->c_faddr, + wc.status); + } + } + + rds_iw_ring_free(&ic->i_recv_ring, 1); + } +} + +void rds_iw_recv_tasklet_fn(unsigned long data) +{ + struct rds_iw_connection *ic = (struct rds_iw_connection *) data; + struct rds_connection *conn = ic->conn; + struct rds_iw_ack_state state = { 0, }; + + rds_poll_cq(ic, &state); + ib_req_notify_cq(ic->i_recv_cq, IB_CQ_SOLICITED); + rds_poll_cq(ic, &state); + + if (state.ack_next_valid) + rds_iw_set_ack(ic, state.ack_next, state.ack_required); + if (state.ack_recv_valid && state.ack_recv > ic->i_ack_recv) { + rds_send_drop_acked(conn, state.ack_recv, NULL); + ic->i_ack_recv = state.ack_recv; + } + if (rds_conn_up(conn)) + rds_iw_attempt_ack(ic); + + /* If we ever end up with a really empty receive ring, we're + * in deep trouble, as the sender will definitely see RNR + * timeouts. */ + if (rds_iw_ring_empty(&ic->i_recv_ring)) + rds_iw_stats_inc(s_iw_rx_ring_empty); + + /* + * If the ring is running low, then schedule the thread to refill. + */ + if (rds_iw_ring_low(&ic->i_recv_ring)) + queue_delayed_work(rds_wq, &conn->c_recv_w, 0); +} + +int rds_iw_recv(struct rds_connection *conn) +{ + struct rds_iw_connection *ic = conn->c_transport_data; + int ret = 0; + + rdsdebug("conn %p\n", conn); + + /* + * If we get a temporary posting failure in this context then + * we're really low and we want the caller to back off for a bit. + */ + mutex_lock(&ic->i_recv_mutex); + if (rds_iw_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 0)) + ret = -ENOMEM; + else + rds_iw_stats_inc(s_iw_rx_refill_from_thread); + mutex_unlock(&ic->i_recv_mutex); + + if (rds_conn_up(conn)) + rds_iw_attempt_ack(ic); + + return ret; +} + +int rds_iw_recv_init(void) +{ + struct sysinfo si; + int ret = -ENOMEM; + + /* Default to 30% of all available RAM for recv memory */ + si_meminfo(&si); + rds_iw_sysctl_max_recv_allocation = si.totalram / 3 * PAGE_SIZE / RDS_FRAG_SIZE; + + rds_iw_incoming_slab = kmem_cache_create("rds_iw_incoming", + sizeof(struct rds_iw_incoming), + 0, 0, NULL); + if (!rds_iw_incoming_slab) + goto out; + + rds_iw_frag_slab = kmem_cache_create("rds_iw_frag", + sizeof(struct rds_page_frag), + 0, 0, NULL); + if (!rds_iw_frag_slab) + kmem_cache_destroy(rds_iw_incoming_slab); + else + ret = 0; +out: + return ret; +} + +void rds_iw_recv_exit(void) +{ + kmem_cache_destroy(rds_iw_incoming_slab); + kmem_cache_destroy(rds_iw_frag_slab); +} diff --git a/net/rds/iw_ring.c b/net/rds/iw_ring.c new file mode 100644 index 0000000..da8e3b6 --- /dev/null +++ b/net/rds/iw_ring.c @@ -0,0 +1,169 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include + +#include "rds.h" +#include "iw.h" + +/* + * Locking for IB rings. + * We assume that allocation is always protected by a mutex + * in the caller (this is a valid assumption for the current + * implementation). + * + * Freeing always happens in an interrupt, and hence only + * races with allocations, but not with other free()s. + * + * The interaction between allocation and freeing is that + * the alloc code has to determine the number of free entries. + * To this end, we maintain two counters; an allocation counter + * and a free counter. Both are allowed to run freely, and wrap + * around. + * The number of used entries is always (alloc_ctr - free_ctr) % NR. + * + * The current implementation makes free_ctr atomic. When the + * caller finds an allocation fails, it should set an "alloc fail" + * bit and retry the allocation. The "alloc fail" bit essentially tells + * the CQ completion handlers to wake it up after freeing some + * more entries. + */ + +/* + * This only happens on shutdown. + */ +DECLARE_WAIT_QUEUE_HEAD(rds_iw_ring_empty_wait); + +void rds_iw_ring_init(struct rds_iw_work_ring *ring, u32 nr) +{ + memset(ring, 0, sizeof(*ring)); + ring->w_nr = nr; + rdsdebug("ring %p nr %u\n", ring, ring->w_nr); +} + +static inline u32 __rds_iw_ring_used(struct rds_iw_work_ring *ring) +{ + u32 diff; + + /* This assumes that atomic_t has at least as many bits as u32 */ + diff = ring->w_alloc_ctr - (u32) atomic_read(&ring->w_free_ctr); + BUG_ON(diff > ring->w_nr); + + return diff; +} + +void rds_iw_ring_resize(struct rds_iw_work_ring *ring, u32 nr) +{ + /* We only ever get called from the connection setup code, + * prior to creating the QP. */ + BUG_ON(__rds_iw_ring_used(ring)); + ring->w_nr = nr; +} + +static int __rds_iw_ring_empty(struct rds_iw_work_ring *ring) +{ + return __rds_iw_ring_used(ring) == 0; +} + +u32 rds_iw_ring_alloc(struct rds_iw_work_ring *ring, u32 val, u32 *pos) +{ + u32 ret = 0, avail; + + avail = ring->w_nr - __rds_iw_ring_used(ring); + + rdsdebug("ring %p val %u next %u free %u\n", ring, val, + ring->w_alloc_ptr, avail); + + if (val && avail) { + ret = min(val, avail); + *pos = ring->w_alloc_ptr; + + ring->w_alloc_ptr = (ring->w_alloc_ptr + ret) % ring->w_nr; + ring->w_alloc_ctr += ret; + } + + return ret; +} + +void rds_iw_ring_free(struct rds_iw_work_ring *ring, u32 val) +{ + ring->w_free_ptr = (ring->w_free_ptr + val) % ring->w_nr; + atomic_add(val, &ring->w_free_ctr); + + if (__rds_iw_ring_empty(ring) && + waitqueue_active(&rds_iw_ring_empty_wait)) + wake_up(&rds_iw_ring_empty_wait); +} + +void rds_iw_ring_unalloc(struct rds_iw_work_ring *ring, u32 val) +{ + ring->w_alloc_ptr = (ring->w_alloc_ptr - val) % ring->w_nr; + ring->w_alloc_ctr -= val; +} + +int rds_iw_ring_empty(struct rds_iw_work_ring *ring) +{ + return __rds_iw_ring_empty(ring); +} + +int rds_iw_ring_low(struct rds_iw_work_ring *ring) +{ + return __rds_iw_ring_used(ring) <= (ring->w_nr >> 1); +} + + +/* + * returns the oldest alloced ring entry. This will be the next one + * freed. This can't be called if there are none allocated. + */ +u32 rds_iw_ring_oldest(struct rds_iw_work_ring *ring) +{ + return ring->w_free_ptr; +} + +/* + * returns the number of completed work requests. + */ + +u32 rds_iw_ring_completed(struct rds_iw_work_ring *ring, u32 wr_id, u32 oldest) +{ + u32 ret; + + if (oldest <= (unsigned long long)wr_id) + ret = (unsigned long long)wr_id - oldest + 1; + else + ret = ring->w_nr - oldest + (unsigned long long)wr_id + 1; + + rdsdebug("ring %p ret %u wr_id %u oldest %u\n", ring, ret, + wr_id, oldest); + return ret; +} diff --git a/net/rds/iw_send.c b/net/rds/iw_send.c new file mode 100644 index 0000000..9105ea0 --- /dev/null +++ b/net/rds/iw_send.c @@ -0,0 +1,974 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include +#include +#include +#include +#include + +#include "rds.h" +#include "iw.h" + +static void rds_iw_send_rdma_complete(struct rds_message *rm, + int wc_status) +{ + int notify_status; + + switch (wc_status) { + case IB_WC_WR_FLUSH_ERR: + return; + + case IB_WC_SUCCESS: + notify_status = RDS_RDMA_SUCCESS; + break; + + case IB_WC_REM_ACCESS_ERR: + notify_status = RDS_RDMA_REMOTE_ERROR; + break; + + default: + notify_status = RDS_RDMA_OTHER_ERROR; + break; + } + rds_rdma_send_complete(rm, notify_status); +} + +static void rds_iw_send_unmap_rdma(struct rds_iw_connection *ic, + struct rm_rdma_op *op) +{ + if (op->op_mapped) { + ib_dma_unmap_sg(ic->i_cm_id->device, + op->op_sg, op->op_nents, + op->op_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE); + op->op_mapped = 0; + } +} + +static void rds_iw_send_unmap_rm(struct rds_iw_connection *ic, + struct rds_iw_send_work *send, + int wc_status) +{ + struct rds_message *rm = send->s_rm; + + rdsdebug("ic %p send %p rm %p\n", ic, send, rm); + + ib_dma_unmap_sg(ic->i_cm_id->device, + rm->data.op_sg, rm->data.op_nents, + DMA_TO_DEVICE); + + if (rm->rdma.op_active) { + rds_iw_send_unmap_rdma(ic, &rm->rdma); + + /* If the user asked for a completion notification on this + * message, we can implement three different semantics: + * 1. Notify when we received the ACK on the RDS message + * that was queued with the RDMA. This provides reliable + * notification of RDMA status at the expense of a one-way + * packet delay. + * 2. Notify when the IB stack gives us the completion event for + * the RDMA operation. + * 3. Notify when the IB stack gives us the completion event for + * the accompanying RDS messages. + * Here, we implement approach #3. To implement approach #2, + * call rds_rdma_send_complete from the cq_handler. To implement #1, + * don't call rds_rdma_send_complete at all, and fall back to the notify + * handling in the ACK processing code. + * + * Note: There's no need to explicitly sync any RDMA buffers using + * ib_dma_sync_sg_for_cpu - the completion for the RDMA + * operation itself unmapped the RDMA buffers, which takes care + * of synching. + */ + rds_iw_send_rdma_complete(rm, wc_status); + + if (rm->rdma.op_write) + rds_stats_add(s_send_rdma_bytes, rm->rdma.op_bytes); + else + rds_stats_add(s_recv_rdma_bytes, rm->rdma.op_bytes); + } + + /* If anyone waited for this message to get flushed out, wake + * them up now */ + rds_message_unmapped(rm); + + rds_message_put(rm); + send->s_rm = NULL; +} + +void rds_iw_send_init_ring(struct rds_iw_connection *ic) +{ + struct rds_iw_send_work *send; + u32 i; + + for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) { + struct ib_sge *sge; + + send->s_rm = NULL; + send->s_op = NULL; + send->s_mapping = NULL; + + send->s_wr.next = NULL; + send->s_wr.wr_id = i; + send->s_wr.sg_list = send->s_sge; + send->s_wr.num_sge = 1; + send->s_wr.opcode = IB_WR_SEND; + send->s_wr.send_flags = 0; + send->s_wr.ex.imm_data = 0; + + sge = rds_iw_data_sge(ic, send->s_sge); + sge->lkey = 0; + + sge = rds_iw_header_sge(ic, send->s_sge); + sge->addr = ic->i_send_hdrs_dma + (i * sizeof(struct rds_header)); + sge->length = sizeof(struct rds_header); + sge->lkey = 0; + + send->s_mr = ib_alloc_fast_reg_mr(ic->i_pd, fastreg_message_size); + if (IS_ERR(send->s_mr)) { + printk(KERN_WARNING "RDS/IW: ib_alloc_fast_reg_mr failed\n"); + break; + } + + send->s_page_list = ib_alloc_fast_reg_page_list( + ic->i_cm_id->device, fastreg_message_size); + if (IS_ERR(send->s_page_list)) { + printk(KERN_WARNING "RDS/IW: ib_alloc_fast_reg_page_list failed\n"); + break; + } + } +} + +void rds_iw_send_clear_ring(struct rds_iw_connection *ic) +{ + struct rds_iw_send_work *send; + u32 i; + + for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) { + BUG_ON(!send->s_mr); + ib_dereg_mr(send->s_mr); + BUG_ON(!send->s_page_list); + ib_free_fast_reg_page_list(send->s_page_list); + if (send->s_wr.opcode == 0xdead) + continue; + if (send->s_rm) + rds_iw_send_unmap_rm(ic, send, IB_WC_WR_FLUSH_ERR); + if (send->s_op) + rds_iw_send_unmap_rdma(ic, send->s_op); + } +} + +/* + * The _oldest/_free ring operations here race cleanly with the alloc/unalloc + * operations performed in the send path. As the sender allocs and potentially + * unallocs the next free entry in the ring it doesn't alter which is + * the next to be freed, which is what this is concerned with. + */ +void rds_iw_send_cq_comp_handler(struct ib_cq *cq, void *context) +{ + struct rds_connection *conn = context; + struct rds_iw_connection *ic = conn->c_transport_data; + struct ib_wc wc; + struct rds_iw_send_work *send; + u32 completed; + u32 oldest; + u32 i; + int ret; + + rdsdebug("cq %p conn %p\n", cq, conn); + rds_iw_stats_inc(s_iw_tx_cq_call); + ret = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); + if (ret) + rdsdebug("ib_req_notify_cq send failed: %d\n", ret); + + while (ib_poll_cq(cq, 1, &wc) > 0) { + rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n", + (unsigned long long)wc.wr_id, wc.status, wc.byte_len, + be32_to_cpu(wc.ex.imm_data)); + rds_iw_stats_inc(s_iw_tx_cq_event); + + if (wc.status != IB_WC_SUCCESS) { + printk(KERN_ERR "WC Error: status = %d opcode = %d\n", wc.status, wc.opcode); + break; + } + + if (wc.opcode == IB_WC_LOCAL_INV && wc.wr_id == RDS_IW_LOCAL_INV_WR_ID) { + ic->i_fastreg_posted = 0; + continue; + } + + if (wc.opcode == IB_WC_FAST_REG_MR && wc.wr_id == RDS_IW_FAST_REG_WR_ID) { + ic->i_fastreg_posted = 1; + continue; + } + + if (wc.wr_id == RDS_IW_ACK_WR_ID) { + if (time_after(jiffies, ic->i_ack_queued + HZ/2)) + rds_iw_stats_inc(s_iw_tx_stalled); + rds_iw_ack_send_complete(ic); + continue; + } + + oldest = rds_iw_ring_oldest(&ic->i_send_ring); + + completed = rds_iw_ring_completed(&ic->i_send_ring, wc.wr_id, oldest); + + for (i = 0; i < completed; i++) { + send = &ic->i_sends[oldest]; + + /* In the error case, wc.opcode sometimes contains garbage */ + switch (send->s_wr.opcode) { + case IB_WR_SEND: + if (send->s_rm) + rds_iw_send_unmap_rm(ic, send, wc.status); + break; + case IB_WR_FAST_REG_MR: + case IB_WR_RDMA_WRITE: + case IB_WR_RDMA_READ: + case IB_WR_RDMA_READ_WITH_INV: + /* Nothing to be done - the SG list will be unmapped + * when the SEND completes. */ + break; + default: + printk_ratelimited(KERN_NOTICE + "RDS/IW: %s: unexpected opcode 0x%x in WR!\n", + __func__, send->s_wr.opcode); + break; + } + + send->s_wr.opcode = 0xdead; + send->s_wr.num_sge = 1; + if (time_after(jiffies, send->s_queued + HZ/2)) + rds_iw_stats_inc(s_iw_tx_stalled); + + /* If a RDMA operation produced an error, signal this right + * away. If we don't, the subsequent SEND that goes with this + * RDMA will be canceled with ERR_WFLUSH, and the application + * never learn that the RDMA failed. */ + if (unlikely(wc.status == IB_WC_REM_ACCESS_ERR && send->s_op)) { + struct rds_message *rm; + + rm = rds_send_get_message(conn, send->s_op); + if (rm) + rds_iw_send_rdma_complete(rm, wc.status); + } + + oldest = (oldest + 1) % ic->i_send_ring.w_nr; + } + + rds_iw_ring_free(&ic->i_send_ring, completed); + + if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags) || + test_bit(0, &conn->c_map_queued)) + queue_delayed_work(rds_wq, &conn->c_send_w, 0); + + /* We expect errors as the qp is drained during shutdown */ + if (wc.status != IB_WC_SUCCESS && rds_conn_up(conn)) { + rds_iw_conn_error(conn, + "send completion on %pI4 " + "had status %u, disconnecting and reconnecting\n", + &conn->c_faddr, wc.status); + } + } +} + +/* + * This is the main function for allocating credits when sending + * messages. + * + * Conceptually, we have two counters: + * - send credits: this tells us how many WRs we're allowed + * to submit without overruning the receiver's queue. For + * each SEND WR we post, we decrement this by one. + * + * - posted credits: this tells us how many WRs we recently + * posted to the receive queue. This value is transferred + * to the peer as a "credit update" in a RDS header field. + * Every time we transmit credits to the peer, we subtract + * the amount of transferred credits from this counter. + * + * It is essential that we avoid situations where both sides have + * exhausted their send credits, and are unable to send new credits + * to the peer. We achieve this by requiring that we send at least + * one credit update to the peer before exhausting our credits. + * When new credits arrive, we subtract one credit that is withheld + * until we've posted new buffers and are ready to transmit these + * credits (see rds_iw_send_add_credits below). + * + * The RDS send code is essentially single-threaded; rds_send_xmit + * grabs c_send_lock to ensure exclusive access to the send ring. + * However, the ACK sending code is independent and can race with + * message SENDs. + * + * In the send path, we need to update the counters for send credits + * and the counter of posted buffers atomically - when we use the + * last available credit, we cannot allow another thread to race us + * and grab the posted credits counter. Hence, we have to use a + * spinlock to protect the credit counter, or use atomics. + * + * Spinlocks shared between the send and the receive path are bad, + * because they create unnecessary delays. An early implementation + * using a spinlock showed a 5% degradation in throughput at some + * loads. + * + * This implementation avoids spinlocks completely, putting both + * counters into a single atomic, and updating that atomic using + * atomic_add (in the receive path, when receiving fresh credits), + * and using atomic_cmpxchg when updating the two counters. + */ +int rds_iw_send_grab_credits(struct rds_iw_connection *ic, + u32 wanted, u32 *adv_credits, int need_posted, int max_posted) +{ + unsigned int avail, posted, got = 0, advertise; + long oldval, newval; + + *adv_credits = 0; + if (!ic->i_flowctl) + return wanted; + +try_again: + advertise = 0; + oldval = newval = atomic_read(&ic->i_credits); + posted = IB_GET_POST_CREDITS(oldval); + avail = IB_GET_SEND_CREDITS(oldval); + + rdsdebug("rds_iw_send_grab_credits(%u): credits=%u posted=%u\n", + wanted, avail, posted); + + /* The last credit must be used to send a credit update. */ + if (avail && !posted) + avail--; + + if (avail < wanted) { + struct rds_connection *conn = ic->i_cm_id->context; + + /* Oops, there aren't that many credits left! */ + set_bit(RDS_LL_SEND_FULL, &conn->c_flags); + got = avail; + } else { + /* Sometimes you get what you want, lalala. */ + got = wanted; + } + newval -= IB_SET_SEND_CREDITS(got); + + /* + * If need_posted is non-zero, then the caller wants + * the posted regardless of whether any send credits are + * available. + */ + if (posted && (got || need_posted)) { + advertise = min_t(unsigned int, posted, max_posted); + newval -= IB_SET_POST_CREDITS(advertise); + } + + /* Finally bill everything */ + if (atomic_cmpxchg(&ic->i_credits, oldval, newval) != oldval) + goto try_again; + + *adv_credits = advertise; + return got; +} + +void rds_iw_send_add_credits(struct rds_connection *conn, unsigned int credits) +{ + struct rds_iw_connection *ic = conn->c_transport_data; + + if (credits == 0) + return; + + rdsdebug("rds_iw_send_add_credits(%u): current=%u%s\n", + credits, + IB_GET_SEND_CREDITS(atomic_read(&ic->i_credits)), + test_bit(RDS_LL_SEND_FULL, &conn->c_flags) ? ", ll_send_full" : ""); + + atomic_add(IB_SET_SEND_CREDITS(credits), &ic->i_credits); + if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags)) + queue_delayed_work(rds_wq, &conn->c_send_w, 0); + + WARN_ON(IB_GET_SEND_CREDITS(credits) >= 16384); + + rds_iw_stats_inc(s_iw_rx_credit_updates); +} + +void rds_iw_advertise_credits(struct rds_connection *conn, unsigned int posted) +{ + struct rds_iw_connection *ic = conn->c_transport_data; + + if (posted == 0) + return; + + atomic_add(IB_SET_POST_CREDITS(posted), &ic->i_credits); + + /* Decide whether to send an update to the peer now. + * If we would send a credit update for every single buffer we + * post, we would end up with an ACK storm (ACK arrives, + * consumes buffer, we refill the ring, send ACK to remote + * advertising the newly posted buffer... ad inf) + * + * Performance pretty much depends on how often we send + * credit updates - too frequent updates mean lots of ACKs. + * Too infrequent updates, and the peer will run out of + * credits and has to throttle. + * For the time being, 16 seems to be a good compromise. + */ + if (IB_GET_POST_CREDITS(atomic_read(&ic->i_credits)) >= 16) + set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); +} + +static inline void +rds_iw_xmit_populate_wr(struct rds_iw_connection *ic, + struct rds_iw_send_work *send, unsigned int pos, + unsigned long buffer, unsigned int length, + int send_flags) +{ + struct ib_sge *sge; + + WARN_ON(pos != send - ic->i_sends); + + send->s_wr.send_flags = send_flags; + send->s_wr.opcode = IB_WR_SEND; + send->s_wr.num_sge = 2; + send->s_wr.next = NULL; + send->s_queued = jiffies; + send->s_op = NULL; + + if (length != 0) { + sge = rds_iw_data_sge(ic, send->s_sge); + sge->addr = buffer; + sge->length = length; + sge->lkey = rds_iw_local_dma_lkey(ic); + + sge = rds_iw_header_sge(ic, send->s_sge); + } else { + /* We're sending a packet with no payload. There is only + * one SGE */ + send->s_wr.num_sge = 1; + sge = &send->s_sge[0]; + } + + sge->addr = ic->i_send_hdrs_dma + (pos * sizeof(struct rds_header)); + sge->length = sizeof(struct rds_header); + sge->lkey = rds_iw_local_dma_lkey(ic); +} + +/* + * This can be called multiple times for a given message. The first time + * we see a message we map its scatterlist into the IB device so that + * we can provide that mapped address to the IB scatter gather entries + * in the IB work requests. We translate the scatterlist into a series + * of work requests that fragment the message. These work requests complete + * in order so we pass ownership of the message to the completion handler + * once we send the final fragment. + * + * The RDS core uses the c_send_lock to only enter this function once + * per connection. This makes sure that the tx ring alloc/unalloc pairs + * don't get out of sync and confuse the ring. + */ +int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm, + unsigned int hdr_off, unsigned int sg, unsigned int off) +{ + struct rds_iw_connection *ic = conn->c_transport_data; + struct ib_device *dev = ic->i_cm_id->device; + struct rds_iw_send_work *send = NULL; + struct rds_iw_send_work *first; + struct rds_iw_send_work *prev; + struct ib_send_wr *failed_wr; + struct scatterlist *scat; + u32 pos; + u32 i; + u32 work_alloc; + u32 credit_alloc; + u32 posted; + u32 adv_credits = 0; + int send_flags = 0; + int sent; + int ret; + int flow_controlled = 0; + + BUG_ON(off % RDS_FRAG_SIZE); + BUG_ON(hdr_off != 0 && hdr_off != sizeof(struct rds_header)); + + /* Fastreg support */ + if (rds_rdma_cookie_key(rm->m_rdma_cookie) && !ic->i_fastreg_posted) { + ret = -EAGAIN; + goto out; + } + + /* FIXME we may overallocate here */ + if (be32_to_cpu(rm->m_inc.i_hdr.h_len) == 0) + i = 1; + else + i = ceil(be32_to_cpu(rm->m_inc.i_hdr.h_len), RDS_FRAG_SIZE); + + work_alloc = rds_iw_ring_alloc(&ic->i_send_ring, i, &pos); + if (work_alloc == 0) { + set_bit(RDS_LL_SEND_FULL, &conn->c_flags); + rds_iw_stats_inc(s_iw_tx_ring_full); + ret = -ENOMEM; + goto out; + } + + credit_alloc = work_alloc; + if (ic->i_flowctl) { + credit_alloc = rds_iw_send_grab_credits(ic, work_alloc, &posted, 0, RDS_MAX_ADV_CREDIT); + adv_credits += posted; + if (credit_alloc < work_alloc) { + rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc - credit_alloc); + work_alloc = credit_alloc; + flow_controlled++; + } + if (work_alloc == 0) { + set_bit(RDS_LL_SEND_FULL, &conn->c_flags); + rds_iw_stats_inc(s_iw_tx_throttle); + ret = -ENOMEM; + goto out; + } + } + + /* map the message the first time we see it */ + if (!ic->i_rm) { + /* + printk(KERN_NOTICE "rds_iw_xmit prep msg dport=%u flags=0x%x len=%d\n", + be16_to_cpu(rm->m_inc.i_hdr.h_dport), + rm->m_inc.i_hdr.h_flags, + be32_to_cpu(rm->m_inc.i_hdr.h_len)); + */ + if (rm->data.op_nents) { + rm->data.op_count = ib_dma_map_sg(dev, + rm->data.op_sg, + rm->data.op_nents, + DMA_TO_DEVICE); + rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->data.op_count); + if (rm->data.op_count == 0) { + rds_iw_stats_inc(s_iw_tx_sg_mapping_failure); + rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc); + ret = -ENOMEM; /* XXX ? */ + goto out; + } + } else { + rm->data.op_count = 0; + } + + ic->i_unsignaled_wrs = rds_iw_sysctl_max_unsig_wrs; + ic->i_unsignaled_bytes = rds_iw_sysctl_max_unsig_bytes; + rds_message_addref(rm); + ic->i_rm = rm; + + /* Finalize the header */ + if (test_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags)) + rm->m_inc.i_hdr.h_flags |= RDS_FLAG_ACK_REQUIRED; + if (test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags)) + rm->m_inc.i_hdr.h_flags |= RDS_FLAG_RETRANSMITTED; + + /* If it has a RDMA op, tell the peer we did it. This is + * used by the peer to release use-once RDMA MRs. */ + if (rm->rdma.op_active) { + struct rds_ext_header_rdma ext_hdr; + + ext_hdr.h_rdma_rkey = cpu_to_be32(rm->rdma.op_rkey); + rds_message_add_extension(&rm->m_inc.i_hdr, + RDS_EXTHDR_RDMA, &ext_hdr, sizeof(ext_hdr)); + } + if (rm->m_rdma_cookie) { + rds_message_add_rdma_dest_extension(&rm->m_inc.i_hdr, + rds_rdma_cookie_key(rm->m_rdma_cookie), + rds_rdma_cookie_offset(rm->m_rdma_cookie)); + } + + /* Note - rds_iw_piggyb_ack clears the ACK_REQUIRED bit, so + * we should not do this unless we have a chance of at least + * sticking the header into the send ring. Which is why we + * should call rds_iw_ring_alloc first. */ + rm->m_inc.i_hdr.h_ack = cpu_to_be64(rds_iw_piggyb_ack(ic)); + rds_message_make_checksum(&rm->m_inc.i_hdr); + + /* + * Update adv_credits since we reset the ACK_REQUIRED bit. + */ + rds_iw_send_grab_credits(ic, 0, &posted, 1, RDS_MAX_ADV_CREDIT - adv_credits); + adv_credits += posted; + BUG_ON(adv_credits > 255); + } + + send = &ic->i_sends[pos]; + first = send; + prev = NULL; + scat = &rm->data.op_sg[sg]; + sent = 0; + i = 0; + + /* Sometimes you want to put a fence between an RDMA + * READ and the following SEND. + * We could either do this all the time + * or when requested by the user. Right now, we let + * the application choose. + */ + if (rm->rdma.op_active && rm->rdma.op_fence) + send_flags = IB_SEND_FENCE; + + /* + * We could be copying the header into the unused tail of the page. + * That would need to be changed in the future when those pages might + * be mapped userspace pages or page cache pages. So instead we always + * use a second sge and our long-lived ring of mapped headers. We send + * the header after the data so that the data payload can be aligned on + * the receiver. + */ + + /* handle a 0-len message */ + if (be32_to_cpu(rm->m_inc.i_hdr.h_len) == 0) { + rds_iw_xmit_populate_wr(ic, send, pos, 0, 0, send_flags); + goto add_header; + } + + /* if there's data reference it with a chain of work reqs */ + for (; i < work_alloc && scat != &rm->data.op_sg[rm->data.op_count]; i++) { + unsigned int len; + + send = &ic->i_sends[pos]; + + len = min(RDS_FRAG_SIZE, ib_sg_dma_len(dev, scat) - off); + rds_iw_xmit_populate_wr(ic, send, pos, + ib_sg_dma_address(dev, scat) + off, len, + send_flags); + + /* + * We want to delay signaling completions just enough to get + * the batching benefits but not so much that we create dead time + * on the wire. + */ + if (ic->i_unsignaled_wrs-- == 0) { + ic->i_unsignaled_wrs = rds_iw_sysctl_max_unsig_wrs; + send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; + } + + ic->i_unsignaled_bytes -= len; + if (ic->i_unsignaled_bytes <= 0) { + ic->i_unsignaled_bytes = rds_iw_sysctl_max_unsig_bytes; + send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; + } + + /* + * Always signal the last one if we're stopping due to flow control. + */ + if (flow_controlled && i == (work_alloc-1)) + send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; + + rdsdebug("send %p wr %p num_sge %u next %p\n", send, + &send->s_wr, send->s_wr.num_sge, send->s_wr.next); + + sent += len; + off += len; + if (off == ib_sg_dma_len(dev, scat)) { + scat++; + off = 0; + } + +add_header: + /* Tack on the header after the data. The header SGE should already + * have been set up to point to the right header buffer. */ + memcpy(&ic->i_send_hdrs[pos], &rm->m_inc.i_hdr, sizeof(struct rds_header)); + + if (0) { + struct rds_header *hdr = &ic->i_send_hdrs[pos]; + + printk(KERN_NOTICE "send WR dport=%u flags=0x%x len=%d\n", + be16_to_cpu(hdr->h_dport), + hdr->h_flags, + be32_to_cpu(hdr->h_len)); + } + if (adv_credits) { + struct rds_header *hdr = &ic->i_send_hdrs[pos]; + + /* add credit and redo the header checksum */ + hdr->h_credit = adv_credits; + rds_message_make_checksum(hdr); + adv_credits = 0; + rds_iw_stats_inc(s_iw_tx_credit_updates); + } + + if (prev) + prev->s_wr.next = &send->s_wr; + prev = send; + + pos = (pos + 1) % ic->i_send_ring.w_nr; + } + + /* Account the RDS header in the number of bytes we sent, but just once. + * The caller has no concept of fragmentation. */ + if (hdr_off == 0) + sent += sizeof(struct rds_header); + + /* if we finished the message then send completion owns it */ + if (scat == &rm->data.op_sg[rm->data.op_count]) { + prev->s_rm = ic->i_rm; + prev->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; + ic->i_rm = NULL; + } + + if (i < work_alloc) { + rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc - i); + work_alloc = i; + } + if (ic->i_flowctl && i < credit_alloc) + rds_iw_send_add_credits(conn, credit_alloc - i); + + /* XXX need to worry about failed_wr and partial sends. */ + failed_wr = &first->s_wr; + ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr); + rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic, + first, &first->s_wr, ret, failed_wr); + BUG_ON(failed_wr != &first->s_wr); + if (ret) { + printk(KERN_WARNING "RDS/IW: ib_post_send to %pI4 " + "returned %d\n", &conn->c_faddr, ret); + rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc); + if (prev->s_rm) { + ic->i_rm = prev->s_rm; + prev->s_rm = NULL; + } + goto out; + } + + ret = sent; +out: + BUG_ON(adv_credits); + return ret; +} + +static void rds_iw_build_send_fastreg(struct rds_iw_device *rds_iwdev, struct rds_iw_connection *ic, struct rds_iw_send_work *send, int nent, int len, u64 sg_addr) +{ + BUG_ON(nent > send->s_page_list->max_page_list_len); + /* + * Perform a WR for the fast_reg_mr. Each individual page + * in the sg list is added to the fast reg page list and placed + * inside the fast_reg_mr WR. + */ + send->s_wr.opcode = IB_WR_FAST_REG_MR; + send->s_wr.wr.fast_reg.length = len; + send->s_wr.wr.fast_reg.rkey = send->s_mr->rkey; + send->s_wr.wr.fast_reg.page_list = send->s_page_list; + send->s_wr.wr.fast_reg.page_list_len = nent; + send->s_wr.wr.fast_reg.page_shift = PAGE_SHIFT; + send->s_wr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE; + send->s_wr.wr.fast_reg.iova_start = sg_addr; + + ib_update_fast_reg_key(send->s_mr, send->s_remap_count++); +} + +int rds_iw_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op) +{ + struct rds_iw_connection *ic = conn->c_transport_data; + struct rds_iw_send_work *send = NULL; + struct rds_iw_send_work *first; + struct rds_iw_send_work *prev; + struct ib_send_wr *failed_wr; + struct rds_iw_device *rds_iwdev; + struct scatterlist *scat; + unsigned long len; + u64 remote_addr = op->op_remote_addr; + u32 pos, fr_pos; + u32 work_alloc; + u32 i; + u32 j; + int sent; + int ret; + int num_sge; + + rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client); + + /* map the message the first time we see it */ + if (!op->op_mapped) { + op->op_count = ib_dma_map_sg(ic->i_cm_id->device, + op->op_sg, op->op_nents, (op->op_write) ? + DMA_TO_DEVICE : DMA_FROM_DEVICE); + rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->op_count); + if (op->op_count == 0) { + rds_iw_stats_inc(s_iw_tx_sg_mapping_failure); + ret = -ENOMEM; /* XXX ? */ + goto out; + } + + op->op_mapped = 1; + } + + if (!op->op_write) { + /* Alloc space on the send queue for the fastreg */ + work_alloc = rds_iw_ring_alloc(&ic->i_send_ring, 1, &fr_pos); + if (work_alloc != 1) { + rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc); + rds_iw_stats_inc(s_iw_tx_ring_full); + ret = -ENOMEM; + goto out; + } + } + + /* + * Instead of knowing how to return a partial rdma read/write we insist that there + * be enough work requests to send the entire message. + */ + i = ceil(op->op_count, rds_iwdev->max_sge); + + work_alloc = rds_iw_ring_alloc(&ic->i_send_ring, i, &pos); + if (work_alloc != i) { + rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc); + rds_iw_stats_inc(s_iw_tx_ring_full); + ret = -ENOMEM; + goto out; + } + + send = &ic->i_sends[pos]; + if (!op->op_write) { + first = prev = &ic->i_sends[fr_pos]; + } else { + first = send; + prev = NULL; + } + scat = &op->op_sg[0]; + sent = 0; + num_sge = op->op_count; + + for (i = 0; i < work_alloc && scat != &op->op_sg[op->op_count]; i++) { + send->s_wr.send_flags = 0; + send->s_queued = jiffies; + + /* + * We want to delay signaling completions just enough to get + * the batching benefits but not so much that we create dead time on the wire. + */ + if (ic->i_unsignaled_wrs-- == 0) { + ic->i_unsignaled_wrs = rds_iw_sysctl_max_unsig_wrs; + send->s_wr.send_flags = IB_SEND_SIGNALED; + } + + /* To avoid the need to have the plumbing to invalidate the fastreg_mr used + * for local access after RDS is finished with it, using + * IB_WR_RDMA_READ_WITH_INV will invalidate it after the read has completed. + */ + if (op->op_write) + send->s_wr.opcode = IB_WR_RDMA_WRITE; + else + send->s_wr.opcode = IB_WR_RDMA_READ_WITH_INV; + + send->s_wr.wr.rdma.remote_addr = remote_addr; + send->s_wr.wr.rdma.rkey = op->op_rkey; + send->s_op = op; + + if (num_sge > rds_iwdev->max_sge) { + send->s_wr.num_sge = rds_iwdev->max_sge; + num_sge -= rds_iwdev->max_sge; + } else + send->s_wr.num_sge = num_sge; + + send->s_wr.next = NULL; + + if (prev) + prev->s_wr.next = &send->s_wr; + + for (j = 0; j < send->s_wr.num_sge && scat != &op->op_sg[op->op_count]; j++) { + len = ib_sg_dma_len(ic->i_cm_id->device, scat); + + if (send->s_wr.opcode == IB_WR_RDMA_READ_WITH_INV) + send->s_page_list->page_list[j] = ib_sg_dma_address(ic->i_cm_id->device, scat); + else { + send->s_sge[j].addr = ib_sg_dma_address(ic->i_cm_id->device, scat); + send->s_sge[j].length = len; + send->s_sge[j].lkey = rds_iw_local_dma_lkey(ic); + } + + sent += len; + rdsdebug("ic %p sent %d remote_addr %llu\n", ic, sent, remote_addr); + remote_addr += len; + + scat++; + } + + if (send->s_wr.opcode == IB_WR_RDMA_READ_WITH_INV) { + send->s_wr.num_sge = 1; + send->s_sge[0].addr = conn->c_xmit_rm->m_rs->rs_user_addr; + send->s_sge[0].length = conn->c_xmit_rm->m_rs->rs_user_bytes; + send->s_sge[0].lkey = ic->i_sends[fr_pos].s_mr->lkey; + } + + rdsdebug("send %p wr %p num_sge %u next %p\n", send, + &send->s_wr, send->s_wr.num_sge, send->s_wr.next); + + prev = send; + if (++send == &ic->i_sends[ic->i_send_ring.w_nr]) + send = ic->i_sends; + } + + /* if we finished the message then send completion owns it */ + if (scat == &op->op_sg[op->op_count]) + first->s_wr.send_flags = IB_SEND_SIGNALED; + + if (i < work_alloc) { + rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc - i); + work_alloc = i; + } + + /* On iWARP, local memory access by a remote system (ie, RDMA Read) is not + * recommended. Putting the lkey on the wire is a security hole, as it can + * allow for memory access to all of memory on the remote system. Some + * adapters do not allow using the lkey for this at all. To bypass this use a + * fastreg_mr (or possibly a dma_mr) + */ + if (!op->op_write) { + rds_iw_build_send_fastreg(rds_iwdev, ic, &ic->i_sends[fr_pos], + op->op_count, sent, conn->c_xmit_rm->m_rs->rs_user_addr); + work_alloc++; + } + + failed_wr = &first->s_wr; + ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr); + rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic, + first, &first->s_wr, ret, failed_wr); + BUG_ON(failed_wr != &first->s_wr); + if (ret) { + printk(KERN_WARNING "RDS/IW: rdma ib_post_send to %pI4 " + "returned %d\n", &conn->c_faddr, ret); + rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc); + goto out; + } + +out: + return ret; +} + +void rds_iw_xmit_complete(struct rds_connection *conn) +{ + struct rds_iw_connection *ic = conn->c_transport_data; + + /* We may have a pending ACK or window update we were unable + * to send previously (due to flow control). Try again. */ + rds_iw_attempt_ack(ic); +} diff --git a/net/rds/iw_stats.c b/net/rds/iw_stats.c new file mode 100644 index 0000000..5fe67f6 --- /dev/null +++ b/net/rds/iw_stats.c @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include +#include +#include + +#include "rds.h" +#include "iw.h" + +DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_iw_statistics, rds_iw_stats); + +static const char *const rds_iw_stat_names[] = { + "iw_connect_raced", + "iw_listen_closed_stale", + "iw_tx_cq_call", + "iw_tx_cq_event", + "iw_tx_ring_full", + "iw_tx_throttle", + "iw_tx_sg_mapping_failure", + "iw_tx_stalled", + "iw_tx_credit_updates", + "iw_rx_cq_call", + "iw_rx_cq_event", + "iw_rx_ring_empty", + "iw_rx_refill_from_cq", + "iw_rx_refill_from_thread", + "iw_rx_alloc_limit", + "iw_rx_credit_updates", + "iw_ack_sent", + "iw_ack_send_failure", + "iw_ack_send_delayed", + "iw_ack_send_piggybacked", + "iw_ack_received", + "iw_rdma_mr_alloc", + "iw_rdma_mr_free", + "iw_rdma_mr_used", + "iw_rdma_mr_pool_flush", + "iw_rdma_mr_pool_wait", + "iw_rdma_mr_pool_depleted", +}; + +unsigned int rds_iw_stats_info_copy(struct rds_info_iterator *iter, + unsigned int avail) +{ + struct rds_iw_statistics stats = {0, }; + uint64_t *src; + uint64_t *sum; + size_t i; + int cpu; + + if (avail < ARRAY_SIZE(rds_iw_stat_names)) + goto out; + + for_each_online_cpu(cpu) { + src = (uint64_t *)&(per_cpu(rds_iw_stats, cpu)); + sum = (uint64_t *)&stats; + for (i = 0; i < sizeof(stats) / sizeof(uint64_t); i++) + *(sum++) += *(src++); + } + + rds_stats_info_copy(iter, (uint64_t *)&stats, rds_iw_stat_names, + ARRAY_SIZE(rds_iw_stat_names)); +out: + return ARRAY_SIZE(rds_iw_stat_names); +} diff --git a/net/rds/iw_sysctl.c b/net/rds/iw_sysctl.c new file mode 100644 index 0000000..139239d --- /dev/null +++ b/net/rds/iw_sysctl.c @@ -0,0 +1,123 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include +#include +#include + +#include "iw.h" + +static struct ctl_table_header *rds_iw_sysctl_hdr; + +unsigned long rds_iw_sysctl_max_send_wr = RDS_IW_DEFAULT_SEND_WR; +unsigned long rds_iw_sysctl_max_recv_wr = RDS_IW_DEFAULT_RECV_WR; +unsigned long rds_iw_sysctl_max_recv_allocation = (128 * 1024 * 1024) / RDS_FRAG_SIZE; +static unsigned long rds_iw_sysctl_max_wr_min = 1; +/* hardware will fail CQ creation long before this */ +static unsigned long rds_iw_sysctl_max_wr_max = (u32)~0; + +unsigned long rds_iw_sysctl_max_unsig_wrs = 16; +static unsigned long rds_iw_sysctl_max_unsig_wr_min = 1; +static unsigned long rds_iw_sysctl_max_unsig_wr_max = 64; + +unsigned long rds_iw_sysctl_max_unsig_bytes = (16 << 20); +static unsigned long rds_iw_sysctl_max_unsig_bytes_min = 1; +static unsigned long rds_iw_sysctl_max_unsig_bytes_max = ~0UL; + +unsigned int rds_iw_sysctl_flow_control = 1; + +static struct ctl_table rds_iw_sysctl_table[] = { + { + .procname = "max_send_wr", + .data = &rds_iw_sysctl_max_send_wr, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + .extra1 = &rds_iw_sysctl_max_wr_min, + .extra2 = &rds_iw_sysctl_max_wr_max, + }, + { + .procname = "max_recv_wr", + .data = &rds_iw_sysctl_max_recv_wr, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + .extra1 = &rds_iw_sysctl_max_wr_min, + .extra2 = &rds_iw_sysctl_max_wr_max, + }, + { + .procname = "max_unsignaled_wr", + .data = &rds_iw_sysctl_max_unsig_wrs, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + .extra1 = &rds_iw_sysctl_max_unsig_wr_min, + .extra2 = &rds_iw_sysctl_max_unsig_wr_max, + }, + { + .procname = "max_unsignaled_bytes", + .data = &rds_iw_sysctl_max_unsig_bytes, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + .extra1 = &rds_iw_sysctl_max_unsig_bytes_min, + .extra2 = &rds_iw_sysctl_max_unsig_bytes_max, + }, + { + .procname = "max_recv_allocation", + .data = &rds_iw_sysctl_max_recv_allocation, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, + { + .procname = "flow_control", + .data = &rds_iw_sysctl_flow_control, + .maxlen = sizeof(rds_iw_sysctl_flow_control), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { } +}; + +void rds_iw_sysctl_exit(void) +{ + unregister_net_sysctl_table(rds_iw_sysctl_hdr); +} + +int rds_iw_sysctl_init(void) +{ + rds_iw_sysctl_hdr = register_net_sysctl(&init_net, "net/rds/iw", rds_iw_sysctl_table); + if (!rds_iw_sysctl_hdr) + return -ENOMEM; + return 0; +} diff --git a/net/rds/loop.c b/net/rds/loop.c new file mode 100644 index 0000000..6b12b68 --- /dev/null +++ b/net/rds/loop.c @@ -0,0 +1,194 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include +#include +#include + +#include "rds.h" +#include "loop.h" + +static DEFINE_SPINLOCK(loop_conns_lock); +static LIST_HEAD(loop_conns); + +/* + * This 'loopback' transport is a special case for flows that originate + * and terminate on the same machine. + * + * Connection build-up notices if the destination address is thought of + * as a local address by a transport. At that time it decides to use the + * loopback transport instead of the bound transport of the sending socket. + * + * The loopback transport's sending path just hands the sent rds_message + * straight to the receiving path via an embedded rds_incoming. + */ + +/* + * Usually a message transits both the sender and receiver's conns as it + * flows to the receiver. In the loopback case, though, the receive path + * is handed the sending conn so the sense of the addresses is reversed. + */ +static int rds_loop_xmit(struct rds_connection *conn, struct rds_message *rm, + unsigned int hdr_off, unsigned int sg, + unsigned int off) +{ + struct scatterlist *sgp = &rm->data.op_sg[sg]; + int ret = sizeof(struct rds_header) + + be32_to_cpu(rm->m_inc.i_hdr.h_len); + + /* Do not send cong updates to loopback */ + if (rm->m_inc.i_hdr.h_flags & RDS_FLAG_CONG_BITMAP) { + rds_cong_map_updated(conn->c_fcong, ~(u64) 0); + ret = min_t(int, ret, sgp->length - conn->c_xmit_data_off); + goto out; + } + + BUG_ON(hdr_off || sg || off); + + rds_inc_init(&rm->m_inc, conn, conn->c_laddr); + /* For the embedded inc. Matching put is in loop_inc_free() */ + rds_message_addref(rm); + + rds_recv_incoming(conn, conn->c_laddr, conn->c_faddr, &rm->m_inc, + GFP_KERNEL); + + rds_send_drop_acked(conn, be64_to_cpu(rm->m_inc.i_hdr.h_sequence), + NULL); + + rds_inc_put(&rm->m_inc); +out: + return ret; +} + +/* + * See rds_loop_xmit(). Since our inc is embedded in the rm, we + * make sure the rm lives at least until the inc is done. + */ +static void rds_loop_inc_free(struct rds_incoming *inc) +{ + struct rds_message *rm = container_of(inc, struct rds_message, m_inc); + rds_message_put(rm); +} + +/* we need to at least give the thread something to succeed */ +static int rds_loop_recv(struct rds_connection *conn) +{ + return 0; +} + +struct rds_loop_connection { + struct list_head loop_node; + struct rds_connection *conn; +}; + +/* + * Even the loopback transport needs to keep track of its connections, + * so it can call rds_conn_destroy() on them on exit. N.B. there are + * 1+ loopback addresses (127.*.*.*) so it's not a bug to have + * multiple loopback conns allocated, although rather useless. + */ +static int rds_loop_conn_alloc(struct rds_connection *conn, gfp_t gfp) +{ + struct rds_loop_connection *lc; + unsigned long flags; + + lc = kzalloc(sizeof(struct rds_loop_connection), gfp); + if (!lc) + return -ENOMEM; + + INIT_LIST_HEAD(&lc->loop_node); + lc->conn = conn; + conn->c_transport_data = lc; + + spin_lock_irqsave(&loop_conns_lock, flags); + list_add_tail(&lc->loop_node, &loop_conns); + spin_unlock_irqrestore(&loop_conns_lock, flags); + + return 0; +} + +static void rds_loop_conn_free(void *arg) +{ + struct rds_loop_connection *lc = arg; + unsigned long flags; + + rdsdebug("lc %p\n", lc); + spin_lock_irqsave(&loop_conns_lock, flags); + list_del(&lc->loop_node); + spin_unlock_irqrestore(&loop_conns_lock, flags); + kfree(lc); +} + +static int rds_loop_conn_connect(struct rds_connection *conn) +{ + rds_connect_complete(conn); + return 0; +} + +static void rds_loop_conn_shutdown(struct rds_connection *conn) +{ +} + +void rds_loop_exit(void) +{ + struct rds_loop_connection *lc, *_lc; + LIST_HEAD(tmp_list); + + /* avoid calling conn_destroy with irqs off */ + spin_lock_irq(&loop_conns_lock); + list_splice(&loop_conns, &tmp_list); + INIT_LIST_HEAD(&loop_conns); + spin_unlock_irq(&loop_conns_lock); + + list_for_each_entry_safe(lc, _lc, &tmp_list, loop_node) { + WARN_ON(lc->conn->c_passive); + rds_conn_destroy(lc->conn); + } +} + +/* + * This is missing .xmit_* because loop doesn't go through generic + * rds_send_xmit() and doesn't call rds_recv_incoming(). .listen_stop and + * .laddr_check are missing because transport.c doesn't iterate over + * rds_loop_transport. + */ +struct rds_transport rds_loop_transport = { + .xmit = rds_loop_xmit, + .recv = rds_loop_recv, + .conn_alloc = rds_loop_conn_alloc, + .conn_free = rds_loop_conn_free, + .conn_connect = rds_loop_conn_connect, + .conn_shutdown = rds_loop_conn_shutdown, + .inc_copy_to_user = rds_message_inc_copy_to_user, + .inc_free = rds_loop_inc_free, + .t_name = "loopback", +}; diff --git a/net/rds/loop.h b/net/rds/loop.h new file mode 100644 index 0000000..f32b093 --- /dev/null +++ b/net/rds/loop.h @@ -0,0 +1,9 @@ +#ifndef _RDS_LOOP_H +#define _RDS_LOOP_H + +/* loop.c */ +extern struct rds_transport rds_loop_transport; + +void rds_loop_exit(void); + +#endif diff --git a/net/rds/message.c b/net/rds/message.c new file mode 100644 index 0000000..aba232f --- /dev/null +++ b/net/rds/message.c @@ -0,0 +1,402 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include +#include +#include + +#include "rds.h" + +static unsigned int rds_exthdr_size[__RDS_EXTHDR_MAX] = { +[RDS_EXTHDR_NONE] = 0, +[RDS_EXTHDR_VERSION] = sizeof(struct rds_ext_header_version), +[RDS_EXTHDR_RDMA] = sizeof(struct rds_ext_header_rdma), +[RDS_EXTHDR_RDMA_DEST] = sizeof(struct rds_ext_header_rdma_dest), +}; + + +void rds_message_addref(struct rds_message *rm) +{ + rdsdebug("addref rm %p ref %d\n", rm, atomic_read(&rm->m_refcount)); + atomic_inc(&rm->m_refcount); +} +EXPORT_SYMBOL_GPL(rds_message_addref); + +/* + * This relies on dma_map_sg() not touching sg[].page during merging. + */ +static void rds_message_purge(struct rds_message *rm) +{ + unsigned long i; + + if (unlikely(test_bit(RDS_MSG_PAGEVEC, &rm->m_flags))) + return; + + for (i = 0; i < rm->data.op_nents; i++) { + rdsdebug("putting data page %p\n", (void *)sg_page(&rm->data.op_sg[i])); + /* XXX will have to put_page for page refs */ + __free_page(sg_page(&rm->data.op_sg[i])); + } + rm->data.op_nents = 0; + + if (rm->rdma.op_active) + rds_rdma_free_op(&rm->rdma); + if (rm->rdma.op_rdma_mr) + rds_mr_put(rm->rdma.op_rdma_mr); + + if (rm->atomic.op_active) + rds_atomic_free_op(&rm->atomic); + if (rm->atomic.op_rdma_mr) + rds_mr_put(rm->atomic.op_rdma_mr); +} + +void rds_message_put(struct rds_message *rm) +{ + rdsdebug("put rm %p ref %d\n", rm, atomic_read(&rm->m_refcount)); + WARN(!atomic_read(&rm->m_refcount), "danger refcount zero on %p\n", rm); + if (atomic_dec_and_test(&rm->m_refcount)) { + BUG_ON(!list_empty(&rm->m_sock_item)); + BUG_ON(!list_empty(&rm->m_conn_item)); + rds_message_purge(rm); + + kfree(rm); + } +} +EXPORT_SYMBOL_GPL(rds_message_put); + +void rds_message_populate_header(struct rds_header *hdr, __be16 sport, + __be16 dport, u64 seq) +{ + hdr->h_flags = 0; + hdr->h_sport = sport; + hdr->h_dport = dport; + hdr->h_sequence = cpu_to_be64(seq); + hdr->h_exthdr[0] = RDS_EXTHDR_NONE; +} +EXPORT_SYMBOL_GPL(rds_message_populate_header); + +int rds_message_add_extension(struct rds_header *hdr, unsigned int type, + const void *data, unsigned int len) +{ + unsigned int ext_len = sizeof(u8) + len; + unsigned char *dst; + + /* For now, refuse to add more than one extension header */ + if (hdr->h_exthdr[0] != RDS_EXTHDR_NONE) + return 0; + + if (type >= __RDS_EXTHDR_MAX || len != rds_exthdr_size[type]) + return 0; + + if (ext_len >= RDS_HEADER_EXT_SPACE) + return 0; + dst = hdr->h_exthdr; + + *dst++ = type; + memcpy(dst, data, len); + + dst[len] = RDS_EXTHDR_NONE; + return 1; +} +EXPORT_SYMBOL_GPL(rds_message_add_extension); + +/* + * If a message has extension headers, retrieve them here. + * Call like this: + * + * unsigned int pos = 0; + * + * while (1) { + * buflen = sizeof(buffer); + * type = rds_message_next_extension(hdr, &pos, buffer, &buflen); + * if (type == RDS_EXTHDR_NONE) + * break; + * ... + * } + */ +int rds_message_next_extension(struct rds_header *hdr, + unsigned int *pos, void *buf, unsigned int *buflen) +{ + unsigned int offset, ext_type, ext_len; + u8 *src = hdr->h_exthdr; + + offset = *pos; + if (offset >= RDS_HEADER_EXT_SPACE) + goto none; + + /* Get the extension type and length. For now, the + * length is implied by the extension type. */ + ext_type = src[offset++]; + + if (ext_type == RDS_EXTHDR_NONE || ext_type >= __RDS_EXTHDR_MAX) + goto none; + ext_len = rds_exthdr_size[ext_type]; + if (offset + ext_len > RDS_HEADER_EXT_SPACE) + goto none; + + *pos = offset + ext_len; + if (ext_len < *buflen) + *buflen = ext_len; + memcpy(buf, src + offset, *buflen); + return ext_type; + +none: + *pos = RDS_HEADER_EXT_SPACE; + *buflen = 0; + return RDS_EXTHDR_NONE; +} + +int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 offset) +{ + struct rds_ext_header_rdma_dest ext_hdr; + + ext_hdr.h_rdma_rkey = cpu_to_be32(r_key); + ext_hdr.h_rdma_offset = cpu_to_be32(offset); + return rds_message_add_extension(hdr, RDS_EXTHDR_RDMA_DEST, &ext_hdr, sizeof(ext_hdr)); +} +EXPORT_SYMBOL_GPL(rds_message_add_rdma_dest_extension); + +/* + * Each rds_message is allocated with extra space for the scatterlist entries + * rds ops will need. This is to minimize memory allocation count. Then, each rds op + * can grab SGs when initializing its part of the rds_message. + */ +struct rds_message *rds_message_alloc(unsigned int extra_len, gfp_t gfp) +{ + struct rds_message *rm; + + if (extra_len > KMALLOC_MAX_SIZE - sizeof(struct rds_message)) + return NULL; + + rm = kzalloc(sizeof(struct rds_message) + extra_len, gfp); + if (!rm) + goto out; + + rm->m_used_sgs = 0; + rm->m_total_sgs = extra_len / sizeof(struct scatterlist); + + atomic_set(&rm->m_refcount, 1); + INIT_LIST_HEAD(&rm->m_sock_item); + INIT_LIST_HEAD(&rm->m_conn_item); + spin_lock_init(&rm->m_rs_lock); + init_waitqueue_head(&rm->m_flush_wait); + +out: + return rm; +} + +/* + * RDS ops use this to grab SG entries from the rm's sg pool. + */ +struct scatterlist *rds_message_alloc_sgs(struct rds_message *rm, int nents) +{ + struct scatterlist *sg_first = (struct scatterlist *) &rm[1]; + struct scatterlist *sg_ret; + + WARN_ON(rm->m_used_sgs + nents > rm->m_total_sgs); + WARN_ON(!nents); + + if (rm->m_used_sgs + nents > rm->m_total_sgs) + return NULL; + + sg_ret = &sg_first[rm->m_used_sgs]; + sg_init_table(sg_ret, nents); + rm->m_used_sgs += nents; + + return sg_ret; +} + +struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned int total_len) +{ + struct rds_message *rm; + unsigned int i; + int num_sgs = ceil(total_len, PAGE_SIZE); + int extra_bytes = num_sgs * sizeof(struct scatterlist); + + rm = rds_message_alloc(extra_bytes, GFP_NOWAIT); + if (!rm) + return ERR_PTR(-ENOMEM); + + set_bit(RDS_MSG_PAGEVEC, &rm->m_flags); + rm->m_inc.i_hdr.h_len = cpu_to_be32(total_len); + rm->data.op_nents = ceil(total_len, PAGE_SIZE); + rm->data.op_sg = rds_message_alloc_sgs(rm, num_sgs); + if (!rm->data.op_sg) { + rds_message_put(rm); + return ERR_PTR(-ENOMEM); + } + + for (i = 0; i < rm->data.op_nents; ++i) { + sg_set_page(&rm->data.op_sg[i], + virt_to_page(page_addrs[i]), + PAGE_SIZE, 0); + } + + return rm; +} + +int rds_message_copy_from_user(struct rds_message *rm, struct iovec *first_iov, + size_t total_len) +{ + unsigned long to_copy; + unsigned long iov_off; + unsigned long sg_off; + struct iovec *iov; + struct scatterlist *sg; + int ret = 0; + + rm->m_inc.i_hdr.h_len = cpu_to_be32(total_len); + + /* + * now allocate and copy in the data payload. + */ + sg = rm->data.op_sg; + iov = first_iov; + iov_off = 0; + sg_off = 0; /* Dear gcc, sg->page will be null from kzalloc. */ + + while (total_len) { + if (!sg_page(sg)) { + ret = rds_page_remainder_alloc(sg, total_len, + GFP_HIGHUSER); + if (ret) + goto out; + rm->data.op_nents++; + sg_off = 0; + } + + while (iov_off == iov->iov_len) { + iov_off = 0; + iov++; + } + + to_copy = min(iov->iov_len - iov_off, sg->length - sg_off); + to_copy = min_t(size_t, to_copy, total_len); + + rdsdebug("copying %lu bytes from user iov [%p, %zu] + %lu to " + "sg [%p, %u, %u] + %lu\n", + to_copy, iov->iov_base, iov->iov_len, iov_off, + (void *)sg_page(sg), sg->offset, sg->length, sg_off); + + ret = rds_page_copy_from_user(sg_page(sg), sg->offset + sg_off, + iov->iov_base + iov_off, + to_copy); + if (ret) + goto out; + + iov_off += to_copy; + total_len -= to_copy; + sg_off += to_copy; + + if (sg_off == sg->length) + sg++; + } + +out: + return ret; +} + +int rds_message_inc_copy_to_user(struct rds_incoming *inc, + struct iovec *first_iov, size_t size) +{ + struct rds_message *rm; + struct iovec *iov; + struct scatterlist *sg; + unsigned long to_copy; + unsigned long iov_off; + unsigned long vec_off; + int copied; + int ret; + u32 len; + + rm = container_of(inc, struct rds_message, m_inc); + len = be32_to_cpu(rm->m_inc.i_hdr.h_len); + + iov = first_iov; + iov_off = 0; + sg = rm->data.op_sg; + vec_off = 0; + copied = 0; + + while (copied < size && copied < len) { + while (iov_off == iov->iov_len) { + iov_off = 0; + iov++; + } + + to_copy = min(iov->iov_len - iov_off, sg->length - vec_off); + to_copy = min_t(size_t, to_copy, size - copied); + to_copy = min_t(unsigned long, to_copy, len - copied); + + rdsdebug("copying %lu bytes to user iov [%p, %zu] + %lu to " + "sg [%p, %u, %u] + %lu\n", + to_copy, iov->iov_base, iov->iov_len, iov_off, + sg_page(sg), sg->offset, sg->length, vec_off); + + ret = rds_page_copy_to_user(sg_page(sg), sg->offset + vec_off, + iov->iov_base + iov_off, + to_copy); + if (ret) { + copied = ret; + break; + } + + iov_off += to_copy; + vec_off += to_copy; + copied += to_copy; + + if (vec_off == sg->length) { + vec_off = 0; + sg++; + } + } + + return copied; +} + +/* + * If the message is still on the send queue, wait until the transport + * is done with it. This is particularly important for RDMA operations. + */ +void rds_message_wait(struct rds_message *rm) +{ + wait_event_interruptible(rm->m_flush_wait, + !test_bit(RDS_MSG_MAPPED, &rm->m_flags)); +} + +void rds_message_unmapped(struct rds_message *rm) +{ + clear_bit(RDS_MSG_MAPPED, &rm->m_flags); + wake_up_interruptible(&rm->m_flush_wait); +} +EXPORT_SYMBOL_GPL(rds_message_unmapped); + diff --git a/net/rds/page.c b/net/rds/page.c new file mode 100644 index 0000000..9005a2c --- /dev/null +++ b/net/rds/page.c @@ -0,0 +1,215 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include +#include +#include +#include + +#include "rds.h" + +struct rds_page_remainder { + struct page *r_page; + unsigned long r_offset; +}; + +static DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_page_remainder, + rds_page_remainders); + +/* + * returns 0 on success or -errno on failure. + * + * We don't have to worry about flush_dcache_page() as this only works + * with private pages. If, say, we were to do directed receive to pinned + * user pages we'd have to worry more about cache coherence. (Though + * the flush_dcache_page() in get_user_pages() would probably be enough). + */ +int rds_page_copy_user(struct page *page, unsigned long offset, + void __user *ptr, unsigned long bytes, + int to_user) +{ + unsigned long ret; + void *addr; + + addr = kmap(page); + if (to_user) { + rds_stats_add(s_copy_to_user, bytes); + ret = copy_to_user(ptr, addr + offset, bytes); + } else { + rds_stats_add(s_copy_from_user, bytes); + ret = copy_from_user(addr + offset, ptr, bytes); + } + kunmap(page); + + return ret ? -EFAULT : 0; +} +EXPORT_SYMBOL_GPL(rds_page_copy_user); + +/** + * rds_page_remainder_alloc - build up regions of a message. + * + * @scat: Scatter list for message + * @bytes: the number of bytes needed. + * @gfp: the waiting behaviour of the allocation + * + * @gfp is always ored with __GFP_HIGHMEM. Callers must be prepared to + * kmap the pages, etc. + * + * If @bytes is at least a full page then this just returns a page from + * alloc_page(). + * + * If @bytes is a partial page then this stores the unused region of the + * page in a per-cpu structure. Future partial-page allocations may be + * satisfied from that cached region. This lets us waste less memory on + * small allocations with minimal complexity. It works because the transmit + * path passes read-only page regions down to devices. They hold a page + * reference until they are done with the region. + */ +int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes, + gfp_t gfp) +{ + struct rds_page_remainder *rem; + unsigned long flags; + struct page *page; + int ret; + + gfp |= __GFP_HIGHMEM; + + /* jump straight to allocation if we're trying for a huge page */ + if (bytes >= PAGE_SIZE) { + page = alloc_page(gfp); + if (!page) { + ret = -ENOMEM; + } else { + sg_set_page(scat, page, PAGE_SIZE, 0); + ret = 0; + } + goto out; + } + + rem = &per_cpu(rds_page_remainders, get_cpu()); + local_irq_save(flags); + + while (1) { + /* avoid a tiny region getting stuck by tossing it */ + if (rem->r_page && bytes > (PAGE_SIZE - rem->r_offset)) { + rds_stats_inc(s_page_remainder_miss); + __free_page(rem->r_page); + rem->r_page = NULL; + } + + /* hand out a fragment from the cached page */ + if (rem->r_page && bytes <= (PAGE_SIZE - rem->r_offset)) { + sg_set_page(scat, rem->r_page, bytes, rem->r_offset); + get_page(sg_page(scat)); + + if (rem->r_offset != 0) + rds_stats_inc(s_page_remainder_hit); + + rem->r_offset += bytes; + if (rem->r_offset == PAGE_SIZE) { + __free_page(rem->r_page); + rem->r_page = NULL; + } + ret = 0; + break; + } + + /* alloc if there is nothing for us to use */ + local_irq_restore(flags); + put_cpu(); + + page = alloc_page(gfp); + + rem = &per_cpu(rds_page_remainders, get_cpu()); + local_irq_save(flags); + + if (!page) { + ret = -ENOMEM; + break; + } + + /* did someone race to fill the remainder before us? */ + if (rem->r_page) { + __free_page(page); + continue; + } + + /* otherwise install our page and loop around to alloc */ + rem->r_page = page; + rem->r_offset = 0; + } + + local_irq_restore(flags); + put_cpu(); +out: + rdsdebug("bytes %lu ret %d %p %u %u\n", bytes, ret, + ret ? NULL : sg_page(scat), ret ? 0 : scat->offset, + ret ? 0 : scat->length); + return ret; +} +EXPORT_SYMBOL_GPL(rds_page_remainder_alloc); + +static int rds_page_remainder_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + struct rds_page_remainder *rem; + long cpu = (long)hcpu; + + rem = &per_cpu(rds_page_remainders, cpu); + + rdsdebug("cpu %ld action 0x%lx\n", cpu, action); + + switch (action) { + case CPU_DEAD: + if (rem->r_page) + __free_page(rem->r_page); + rem->r_page = NULL; + break; + } + + return 0; +} + +static struct notifier_block rds_page_remainder_nb = { + .notifier_call = rds_page_remainder_cpu_notify, +}; + +void rds_page_exit(void) +{ + int i; + + for_each_possible_cpu(i) + rds_page_remainder_cpu_notify(&rds_page_remainder_nb, + (unsigned long)CPU_DEAD, + (void *)(long)i); +} diff --git a/net/rds/rdma.c b/net/rds/rdma.c new file mode 100644 index 0000000..40084d8 --- /dev/null +++ b/net/rds/rdma.c @@ -0,0 +1,859 @@ +/* + * Copyright (c) 2007 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include +#include +#include +#include /* for DMA_*_DEVICE */ + +#include "rds.h" + +/* + * XXX + * - build with sparse + * - should we limit the size of a mr region? let transport return failure? + * - should we detect duplicate keys on a socket? hmm. + * - an rdma is an mlock, apply rlimit? + */ + +/* + * get the number of pages by looking at the page indices that the start and + * end addresses fall in. + * + * Returns 0 if the vec is invalid. It is invalid if the number of bytes + * causes the address to wrap or overflows an unsigned int. This comes + * from being stored in the 'length' member of 'struct scatterlist'. + */ +static unsigned int rds_pages_in_vec(struct rds_iovec *vec) +{ + if ((vec->addr + vec->bytes <= vec->addr) || + (vec->bytes > (u64)UINT_MAX)) + return 0; + + return ((vec->addr + vec->bytes + PAGE_SIZE - 1) >> PAGE_SHIFT) - + (vec->addr >> PAGE_SHIFT); +} + +static struct rds_mr *rds_mr_tree_walk(struct rb_root *root, u64 key, + struct rds_mr *insert) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + struct rds_mr *mr; + + while (*p) { + parent = *p; + mr = rb_entry(parent, struct rds_mr, r_rb_node); + + if (key < mr->r_key) + p = &(*p)->rb_left; + else if (key > mr->r_key) + p = &(*p)->rb_right; + else + return mr; + } + + if (insert) { + rb_link_node(&insert->r_rb_node, parent, p); + rb_insert_color(&insert->r_rb_node, root); + atomic_inc(&insert->r_refcount); + } + return NULL; +} + +/* + * Destroy the transport-specific part of a MR. + */ +static void rds_destroy_mr(struct rds_mr *mr) +{ + struct rds_sock *rs = mr->r_sock; + void *trans_private = NULL; + unsigned long flags; + + rdsdebug("RDS: destroy mr key is %x refcnt %u\n", + mr->r_key, atomic_read(&mr->r_refcount)); + + if (test_and_set_bit(RDS_MR_DEAD, &mr->r_state)) + return; + + spin_lock_irqsave(&rs->rs_rdma_lock, flags); + if (!RB_EMPTY_NODE(&mr->r_rb_node)) + rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys); + trans_private = mr->r_trans_private; + mr->r_trans_private = NULL; + spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); + + if (trans_private) + mr->r_trans->free_mr(trans_private, mr->r_invalidate); +} + +void __rds_put_mr_final(struct rds_mr *mr) +{ + rds_destroy_mr(mr); + kfree(mr); +} + +/* + * By the time this is called we can't have any more ioctls called on + * the socket so we don't need to worry about racing with others. + */ +void rds_rdma_drop_keys(struct rds_sock *rs) +{ + struct rds_mr *mr; + struct rb_node *node; + unsigned long flags; + + /* Release any MRs associated with this socket */ + spin_lock_irqsave(&rs->rs_rdma_lock, flags); + while ((node = rb_first(&rs->rs_rdma_keys))) { + mr = container_of(node, struct rds_mr, r_rb_node); + if (mr->r_trans == rs->rs_transport) + mr->r_invalidate = 0; + rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys); + RB_CLEAR_NODE(&mr->r_rb_node); + spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); + rds_destroy_mr(mr); + rds_mr_put(mr); + spin_lock_irqsave(&rs->rs_rdma_lock, flags); + } + spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); + + if (rs->rs_transport && rs->rs_transport->flush_mrs) + rs->rs_transport->flush_mrs(); +} + +/* + * Helper function to pin user pages. + */ +static int rds_pin_pages(unsigned long user_addr, unsigned int nr_pages, + struct page **pages, int write) +{ + int ret; + + ret = get_user_pages_fast(user_addr, nr_pages, write, pages); + + if (ret >= 0 && ret < nr_pages) { + while (ret--) + put_page(pages[ret]); + ret = -EFAULT; + } + + return ret; +} + +static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args, + u64 *cookie_ret, struct rds_mr **mr_ret) +{ + struct rds_mr *mr = NULL, *found; + unsigned int nr_pages; + struct page **pages = NULL; + struct scatterlist *sg; + void *trans_private; + unsigned long flags; + rds_rdma_cookie_t cookie; + unsigned int nents; + long i; + int ret; + + if (rs->rs_bound_addr == 0) { + ret = -ENOTCONN; /* XXX not a great errno */ + goto out; + } + + if (!rs->rs_transport->get_mr) { + ret = -EOPNOTSUPP; + goto out; + } + + nr_pages = rds_pages_in_vec(&args->vec); + if (nr_pages == 0) { + ret = -EINVAL; + goto out; + } + + rdsdebug("RDS: get_mr addr %llx len %llu nr_pages %u\n", + args->vec.addr, args->vec.bytes, nr_pages); + + /* XXX clamp nr_pages to limit the size of this alloc? */ + pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL); + if (!pages) { + ret = -ENOMEM; + goto out; + } + + mr = kzalloc(sizeof(struct rds_mr), GFP_KERNEL); + if (!mr) { + ret = -ENOMEM; + goto out; + } + + atomic_set(&mr->r_refcount, 1); + RB_CLEAR_NODE(&mr->r_rb_node); + mr->r_trans = rs->rs_transport; + mr->r_sock = rs; + + if (args->flags & RDS_RDMA_USE_ONCE) + mr->r_use_once = 1; + if (args->flags & RDS_RDMA_INVALIDATE) + mr->r_invalidate = 1; + if (args->flags & RDS_RDMA_READWRITE) + mr->r_write = 1; + + /* + * Pin the pages that make up the user buffer and transfer the page + * pointers to the mr's sg array. We check to see if we've mapped + * the whole region after transferring the partial page references + * to the sg array so that we can have one page ref cleanup path. + * + * For now we have no flag that tells us whether the mapping is + * r/o or r/w. We need to assume r/w, or we'll do a lot of RDMA to + * the zero page. + */ + ret = rds_pin_pages(args->vec.addr, nr_pages, pages, 1); + if (ret < 0) + goto out; + + nents = ret; + sg = kcalloc(nents, sizeof(*sg), GFP_KERNEL); + if (!sg) { + ret = -ENOMEM; + goto out; + } + WARN_ON(!nents); + sg_init_table(sg, nents); + + /* Stick all pages into the scatterlist */ + for (i = 0 ; i < nents; i++) + sg_set_page(&sg[i], pages[i], PAGE_SIZE, 0); + + rdsdebug("RDS: trans_private nents is %u\n", nents); + + /* Obtain a transport specific MR. If this succeeds, the + * s/g list is now owned by the MR. + * Note that dma_map() implies that pending writes are + * flushed to RAM, so no dma_sync is needed here. */ + trans_private = rs->rs_transport->get_mr(sg, nents, rs, + &mr->r_key); + + if (IS_ERR(trans_private)) { + for (i = 0 ; i < nents; i++) + put_page(sg_page(&sg[i])); + kfree(sg); + ret = PTR_ERR(trans_private); + goto out; + } + + mr->r_trans_private = trans_private; + + rdsdebug("RDS: get_mr put_user key is %x cookie_addr %p\n", + mr->r_key, (void *)(unsigned long) args->cookie_addr); + + /* The user may pass us an unaligned address, but we can only + * map page aligned regions. So we keep the offset, and build + * a 64bit cookie containing and pass that + * around. */ + cookie = rds_rdma_make_cookie(mr->r_key, args->vec.addr & ~PAGE_MASK); + if (cookie_ret) + *cookie_ret = cookie; + + if (args->cookie_addr && put_user(cookie, (u64 __user *)(unsigned long) args->cookie_addr)) { + ret = -EFAULT; + goto out; + } + + /* Inserting the new MR into the rbtree bumps its + * reference count. */ + spin_lock_irqsave(&rs->rs_rdma_lock, flags); + found = rds_mr_tree_walk(&rs->rs_rdma_keys, mr->r_key, mr); + spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); + + BUG_ON(found && found != mr); + + rdsdebug("RDS: get_mr key is %x\n", mr->r_key); + if (mr_ret) { + atomic_inc(&mr->r_refcount); + *mr_ret = mr; + } + + ret = 0; +out: + kfree(pages); + if (mr) + rds_mr_put(mr); + return ret; +} + +int rds_get_mr(struct rds_sock *rs, char __user *optval, int optlen) +{ + struct rds_get_mr_args args; + + if (optlen != sizeof(struct rds_get_mr_args)) + return -EINVAL; + + if (copy_from_user(&args, (struct rds_get_mr_args __user *)optval, + sizeof(struct rds_get_mr_args))) + return -EFAULT; + + return __rds_rdma_map(rs, &args, NULL, NULL); +} + +int rds_get_mr_for_dest(struct rds_sock *rs, char __user *optval, int optlen) +{ + struct rds_get_mr_for_dest_args args; + struct rds_get_mr_args new_args; + + if (optlen != sizeof(struct rds_get_mr_for_dest_args)) + return -EINVAL; + + if (copy_from_user(&args, (struct rds_get_mr_for_dest_args __user *)optval, + sizeof(struct rds_get_mr_for_dest_args))) + return -EFAULT; + + /* + * Initially, just behave like get_mr(). + * TODO: Implement get_mr as wrapper around this + * and deprecate it. + */ + new_args.vec = args.vec; + new_args.cookie_addr = args.cookie_addr; + new_args.flags = args.flags; + + return __rds_rdma_map(rs, &new_args, NULL, NULL); +} + +/* + * Free the MR indicated by the given R_Key + */ +int rds_free_mr(struct rds_sock *rs, char __user *optval, int optlen) +{ + struct rds_free_mr_args args; + struct rds_mr *mr; + unsigned long flags; + + if (optlen != sizeof(struct rds_free_mr_args)) + return -EINVAL; + + if (copy_from_user(&args, (struct rds_free_mr_args __user *)optval, + sizeof(struct rds_free_mr_args))) + return -EFAULT; + + /* Special case - a null cookie means flush all unused MRs */ + if (args.cookie == 0) { + if (!rs->rs_transport || !rs->rs_transport->flush_mrs) + return -EINVAL; + rs->rs_transport->flush_mrs(); + return 0; + } + + /* Look up the MR given its R_key and remove it from the rbtree + * so nobody else finds it. + * This should also prevent races with rds_rdma_unuse. + */ + spin_lock_irqsave(&rs->rs_rdma_lock, flags); + mr = rds_mr_tree_walk(&rs->rs_rdma_keys, rds_rdma_cookie_key(args.cookie), NULL); + if (mr) { + rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys); + RB_CLEAR_NODE(&mr->r_rb_node); + if (args.flags & RDS_RDMA_INVALIDATE) + mr->r_invalidate = 1; + } + spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); + + if (!mr) + return -EINVAL; + + /* + * call rds_destroy_mr() ourselves so that we're sure it's done by the time + * we return. If we let rds_mr_put() do it it might not happen until + * someone else drops their ref. + */ + rds_destroy_mr(mr); + rds_mr_put(mr); + return 0; +} + +/* + * This is called when we receive an extension header that + * tells us this MR was used. It allows us to implement + * use_once semantics + */ +void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force) +{ + struct rds_mr *mr; + unsigned long flags; + int zot_me = 0; + + spin_lock_irqsave(&rs->rs_rdma_lock, flags); + mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL); + if (!mr) { + printk(KERN_ERR "rds: trying to unuse MR with unknown r_key %u!\n", r_key); + spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); + return; + } + + if (mr->r_use_once || force) { + rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys); + RB_CLEAR_NODE(&mr->r_rb_node); + zot_me = 1; + } + spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); + + /* May have to issue a dma_sync on this memory region. + * Note we could avoid this if the operation was a RDMA READ, + * but at this point we can't tell. */ + if (mr->r_trans->sync_mr) + mr->r_trans->sync_mr(mr->r_trans_private, DMA_FROM_DEVICE); + + /* If the MR was marked as invalidate, this will + * trigger an async flush. */ + if (zot_me) + rds_destroy_mr(mr); + rds_mr_put(mr); +} + +void rds_rdma_free_op(struct rm_rdma_op *ro) +{ + unsigned int i; + + for (i = 0; i < ro->op_nents; i++) { + struct page *page = sg_page(&ro->op_sg[i]); + + /* Mark page dirty if it was possibly modified, which + * is the case for a RDMA_READ which copies from remote + * to local memory */ + if (!ro->op_write) { + BUG_ON(irqs_disabled()); + set_page_dirty(page); + } + put_page(page); + } + + kfree(ro->op_notifier); + ro->op_notifier = NULL; + ro->op_active = 0; +} + +void rds_atomic_free_op(struct rm_atomic_op *ao) +{ + struct page *page = sg_page(ao->op_sg); + + /* Mark page dirty if it was possibly modified, which + * is the case for a RDMA_READ which copies from remote + * to local memory */ + set_page_dirty(page); + put_page(page); + + kfree(ao->op_notifier); + ao->op_notifier = NULL; + ao->op_active = 0; +} + + +/* + * Count the number of pages needed to describe an incoming iovec array. + */ +static int rds_rdma_pages(struct rds_iovec iov[], int nr_iovecs) +{ + int tot_pages = 0; + unsigned int nr_pages; + unsigned int i; + + /* figure out the number of pages in the vector */ + for (i = 0; i < nr_iovecs; i++) { + nr_pages = rds_pages_in_vec(&iov[i]); + if (nr_pages == 0) + return -EINVAL; + + tot_pages += nr_pages; + + /* + * nr_pages for one entry is limited to (UINT_MAX>>PAGE_SHIFT)+1, + * so tot_pages cannot overflow without first going negative. + */ + if (tot_pages < 0) + return -EINVAL; + } + + return tot_pages; +} + +int rds_rdma_extra_size(struct rds_rdma_args *args) +{ + struct rds_iovec vec; + struct rds_iovec __user *local_vec; + int tot_pages = 0; + unsigned int nr_pages; + unsigned int i; + + local_vec = (struct rds_iovec __user *)(unsigned long) args->local_vec_addr; + + /* figure out the number of pages in the vector */ + for (i = 0; i < args->nr_local; i++) { + if (copy_from_user(&vec, &local_vec[i], + sizeof(struct rds_iovec))) + return -EFAULT; + + nr_pages = rds_pages_in_vec(&vec); + if (nr_pages == 0) + return -EINVAL; + + tot_pages += nr_pages; + + /* + * nr_pages for one entry is limited to (UINT_MAX>>PAGE_SHIFT)+1, + * so tot_pages cannot overflow without first going negative. + */ + if (tot_pages < 0) + return -EINVAL; + } + + return tot_pages * sizeof(struct scatterlist); +} + +/* + * The application asks for a RDMA transfer. + * Extract all arguments and set up the rdma_op + */ +int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, + struct cmsghdr *cmsg) +{ + struct rds_rdma_args *args; + struct rm_rdma_op *op = &rm->rdma; + int nr_pages; + unsigned int nr_bytes; + struct page **pages = NULL; + struct rds_iovec iovstack[UIO_FASTIOV], *iovs = iovstack; + int iov_size; + unsigned int i, j; + int ret = 0; + + if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_rdma_args)) + || rm->rdma.op_active) + return -EINVAL; + + args = CMSG_DATA(cmsg); + + if (rs->rs_bound_addr == 0) { + ret = -ENOTCONN; /* XXX not a great errno */ + goto out_ret; + } + + if (args->nr_local > UIO_MAXIOV) { + ret = -EMSGSIZE; + goto out_ret; + } + + /* Check whether to allocate the iovec area */ + iov_size = args->nr_local * sizeof(struct rds_iovec); + if (args->nr_local > UIO_FASTIOV) { + iovs = sock_kmalloc(rds_rs_to_sk(rs), iov_size, GFP_KERNEL); + if (!iovs) { + ret = -ENOMEM; + goto out_ret; + } + } + + if (copy_from_user(iovs, (struct rds_iovec __user *)(unsigned long) args->local_vec_addr, iov_size)) { + ret = -EFAULT; + goto out; + } + + nr_pages = rds_rdma_pages(iovs, args->nr_local); + if (nr_pages < 0) { + ret = -EINVAL; + goto out; + } + + pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL); + if (!pages) { + ret = -ENOMEM; + goto out; + } + + op->op_write = !!(args->flags & RDS_RDMA_READWRITE); + op->op_fence = !!(args->flags & RDS_RDMA_FENCE); + op->op_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME); + op->op_silent = !!(args->flags & RDS_RDMA_SILENT); + op->op_active = 1; + op->op_recverr = rs->rs_recverr; + WARN_ON(!nr_pages); + op->op_sg = rds_message_alloc_sgs(rm, nr_pages); + if (!op->op_sg) { + ret = -ENOMEM; + goto out; + } + + if (op->op_notify || op->op_recverr) { + /* We allocate an uninitialized notifier here, because + * we don't want to do that in the completion handler. We + * would have to use GFP_ATOMIC there, and don't want to deal + * with failed allocations. + */ + op->op_notifier = kmalloc(sizeof(struct rds_notifier), GFP_KERNEL); + if (!op->op_notifier) { + ret = -ENOMEM; + goto out; + } + op->op_notifier->n_user_token = args->user_token; + op->op_notifier->n_status = RDS_RDMA_SUCCESS; + } + + /* The cookie contains the R_Key of the remote memory region, and + * optionally an offset into it. This is how we implement RDMA into + * unaligned memory. + * When setting up the RDMA, we need to add that offset to the + * destination address (which is really an offset into the MR) + * FIXME: We may want to move this into ib_rdma.c + */ + op->op_rkey = rds_rdma_cookie_key(args->cookie); + op->op_remote_addr = args->remote_vec.addr + rds_rdma_cookie_offset(args->cookie); + + nr_bytes = 0; + + rdsdebug("RDS: rdma prepare nr_local %llu rva %llx rkey %x\n", + (unsigned long long)args->nr_local, + (unsigned long long)args->remote_vec.addr, + op->op_rkey); + + for (i = 0; i < args->nr_local; i++) { + struct rds_iovec *iov = &iovs[i]; + /* don't need to check, rds_rdma_pages() verified nr will be +nonzero */ + unsigned int nr = rds_pages_in_vec(iov); + + rs->rs_user_addr = iov->addr; + rs->rs_user_bytes = iov->bytes; + + /* If it's a WRITE operation, we want to pin the pages for reading. + * If it's a READ operation, we need to pin the pages for writing. + */ + ret = rds_pin_pages(iov->addr, nr, pages, !op->op_write); + if (ret < 0) + goto out; + + rdsdebug("RDS: nr_bytes %u nr %u iov->bytes %llu iov->addr %llx\n", + nr_bytes, nr, iov->bytes, iov->addr); + + nr_bytes += iov->bytes; + + for (j = 0; j < nr; j++) { + unsigned int offset = iov->addr & ~PAGE_MASK; + struct scatterlist *sg; + + sg = &op->op_sg[op->op_nents + j]; + sg_set_page(sg, pages[j], + min_t(unsigned int, iov->bytes, PAGE_SIZE - offset), + offset); + + rdsdebug("RDS: sg->offset %x sg->len %x iov->addr %llx iov->bytes %llu\n", + sg->offset, sg->length, iov->addr, iov->bytes); + + iov->addr += sg->length; + iov->bytes -= sg->length; + } + + op->op_nents += nr; + } + + if (nr_bytes > args->remote_vec.bytes) { + rdsdebug("RDS nr_bytes %u remote_bytes %u do not match\n", + nr_bytes, + (unsigned int) args->remote_vec.bytes); + ret = -EINVAL; + goto out; + } + op->op_bytes = nr_bytes; + +out: + if (iovs != iovstack) + sock_kfree_s(rds_rs_to_sk(rs), iovs, iov_size); + kfree(pages); +out_ret: + if (ret) + rds_rdma_free_op(op); + else + rds_stats_inc(s_send_rdma); + + return ret; +} + +/* + * The application wants us to pass an RDMA destination (aka MR) + * to the remote + */ +int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm, + struct cmsghdr *cmsg) +{ + unsigned long flags; + struct rds_mr *mr; + u32 r_key; + int err = 0; + + if (cmsg->cmsg_len < CMSG_LEN(sizeof(rds_rdma_cookie_t)) || + rm->m_rdma_cookie != 0) + return -EINVAL; + + memcpy(&rm->m_rdma_cookie, CMSG_DATA(cmsg), sizeof(rm->m_rdma_cookie)); + + /* We are reusing a previously mapped MR here. Most likely, the + * application has written to the buffer, so we need to explicitly + * flush those writes to RAM. Otherwise the HCA may not see them + * when doing a DMA from that buffer. + */ + r_key = rds_rdma_cookie_key(rm->m_rdma_cookie); + + spin_lock_irqsave(&rs->rs_rdma_lock, flags); + mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL); + if (!mr) + err = -EINVAL; /* invalid r_key */ + else + atomic_inc(&mr->r_refcount); + spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); + + if (mr) { + mr->r_trans->sync_mr(mr->r_trans_private, DMA_TO_DEVICE); + rm->rdma.op_rdma_mr = mr; + } + return err; +} + +/* + * The application passes us an address range it wants to enable RDMA + * to/from. We map the area, and save the pair + * in rm->m_rdma_cookie. This causes it to be sent along to the peer + * in an extension header. + */ +int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm, + struct cmsghdr *cmsg) +{ + if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_get_mr_args)) || + rm->m_rdma_cookie != 0) + return -EINVAL; + + return __rds_rdma_map(rs, CMSG_DATA(cmsg), &rm->m_rdma_cookie, &rm->rdma.op_rdma_mr); +} + +/* + * Fill in rds_message for an atomic request. + */ +int rds_cmsg_atomic(struct rds_sock *rs, struct rds_message *rm, + struct cmsghdr *cmsg) +{ + struct page *page = NULL; + struct rds_atomic_args *args; + int ret = 0; + + if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_atomic_args)) + || rm->atomic.op_active) + return -EINVAL; + + args = CMSG_DATA(cmsg); + + /* Nonmasked & masked cmsg ops converted to masked hw ops */ + switch (cmsg->cmsg_type) { + case RDS_CMSG_ATOMIC_FADD: + rm->atomic.op_type = RDS_ATOMIC_TYPE_FADD; + rm->atomic.op_m_fadd.add = args->fadd.add; + rm->atomic.op_m_fadd.nocarry_mask = 0; + break; + case RDS_CMSG_MASKED_ATOMIC_FADD: + rm->atomic.op_type = RDS_ATOMIC_TYPE_FADD; + rm->atomic.op_m_fadd.add = args->m_fadd.add; + rm->atomic.op_m_fadd.nocarry_mask = args->m_fadd.nocarry_mask; + break; + case RDS_CMSG_ATOMIC_CSWP: + rm->atomic.op_type = RDS_ATOMIC_TYPE_CSWP; + rm->atomic.op_m_cswp.compare = args->cswp.compare; + rm->atomic.op_m_cswp.swap = args->cswp.swap; + rm->atomic.op_m_cswp.compare_mask = ~0; + rm->atomic.op_m_cswp.swap_mask = ~0; + break; + case RDS_CMSG_MASKED_ATOMIC_CSWP: + rm->atomic.op_type = RDS_ATOMIC_TYPE_CSWP; + rm->atomic.op_m_cswp.compare = args->m_cswp.compare; + rm->atomic.op_m_cswp.swap = args->m_cswp.swap; + rm->atomic.op_m_cswp.compare_mask = args->m_cswp.compare_mask; + rm->atomic.op_m_cswp.swap_mask = args->m_cswp.swap_mask; + break; + default: + BUG(); /* should never happen */ + } + + rm->atomic.op_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME); + rm->atomic.op_silent = !!(args->flags & RDS_RDMA_SILENT); + rm->atomic.op_active = 1; + rm->atomic.op_recverr = rs->rs_recverr; + rm->atomic.op_sg = rds_message_alloc_sgs(rm, 1); + if (!rm->atomic.op_sg) { + ret = -ENOMEM; + goto err; + } + + /* verify 8 byte-aligned */ + if (args->local_addr & 0x7) { + ret = -EFAULT; + goto err; + } + + ret = rds_pin_pages(args->local_addr, 1, &page, 1); + if (ret != 1) + goto err; + ret = 0; + + sg_set_page(rm->atomic.op_sg, page, 8, offset_in_page(args->local_addr)); + + if (rm->atomic.op_notify || rm->atomic.op_recverr) { + /* We allocate an uninitialized notifier here, because + * we don't want to do that in the completion handler. We + * would have to use GFP_ATOMIC there, and don't want to deal + * with failed allocations. + */ + rm->atomic.op_notifier = kmalloc(sizeof(*rm->atomic.op_notifier), GFP_KERNEL); + if (!rm->atomic.op_notifier) { + ret = -ENOMEM; + goto err; + } + + rm->atomic.op_notifier->n_user_token = args->user_token; + rm->atomic.op_notifier->n_status = RDS_RDMA_SUCCESS; + } + + rm->atomic.op_rkey = rds_rdma_cookie_key(args->cookie); + rm->atomic.op_remote_addr = args->remote_addr + rds_rdma_cookie_offset(args->cookie); + + return ret; +err: + if (page) + put_page(page); + kfree(rm->atomic.op_notifier); + + return ret; +} diff --git a/net/rds/rdma_transport.c b/net/rds/rdma_transport.c new file mode 100644 index 0000000..6cd9d1d --- /dev/null +++ b/net/rds/rdma_transport.c @@ -0,0 +1,251 @@ +/* + * Copyright (c) 2009 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include +#include + +#include "rdma_transport.h" + +static struct rdma_cm_id *rds_rdma_listen_id; + +static char *rds_cm_event_strings[] = { +#define RDS_CM_EVENT_STRING(foo) \ + [RDMA_CM_EVENT_##foo] = __stringify(RDMA_CM_EVENT_##foo) + RDS_CM_EVENT_STRING(ADDR_RESOLVED), + RDS_CM_EVENT_STRING(ADDR_ERROR), + RDS_CM_EVENT_STRING(ROUTE_RESOLVED), + RDS_CM_EVENT_STRING(ROUTE_ERROR), + RDS_CM_EVENT_STRING(CONNECT_REQUEST), + RDS_CM_EVENT_STRING(CONNECT_RESPONSE), + RDS_CM_EVENT_STRING(CONNECT_ERROR), + RDS_CM_EVENT_STRING(UNREACHABLE), + RDS_CM_EVENT_STRING(REJECTED), + RDS_CM_EVENT_STRING(ESTABLISHED), + RDS_CM_EVENT_STRING(DISCONNECTED), + RDS_CM_EVENT_STRING(DEVICE_REMOVAL), + RDS_CM_EVENT_STRING(MULTICAST_JOIN), + RDS_CM_EVENT_STRING(MULTICAST_ERROR), + RDS_CM_EVENT_STRING(ADDR_CHANGE), + RDS_CM_EVENT_STRING(TIMEWAIT_EXIT), +#undef RDS_CM_EVENT_STRING +}; + +static char *rds_cm_event_str(enum rdma_cm_event_type type) +{ + return rds_str_array(rds_cm_event_strings, + ARRAY_SIZE(rds_cm_event_strings), type); +}; + +int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, + struct rdma_cm_event *event) +{ + /* this can be null in the listening path */ + struct rds_connection *conn = cm_id->context; + struct rds_transport *trans; + int ret = 0; + + rdsdebug("conn %p id %p handling event %u (%s)\n", conn, cm_id, + event->event, rds_cm_event_str(event->event)); + + if (cm_id->device->node_type == RDMA_NODE_RNIC) + trans = &rds_iw_transport; + else + trans = &rds_ib_transport; + + /* Prevent shutdown from tearing down the connection + * while we're executing. */ + if (conn) { + mutex_lock(&conn->c_cm_lock); + + /* If the connection is being shut down, bail out + * right away. We return 0 so cm_id doesn't get + * destroyed prematurely */ + if (rds_conn_state(conn) == RDS_CONN_DISCONNECTING) { + /* Reject incoming connections while we're tearing + * down an existing one. */ + if (event->event == RDMA_CM_EVENT_CONNECT_REQUEST) + ret = 1; + goto out; + } + } + + switch (event->event) { + case RDMA_CM_EVENT_CONNECT_REQUEST: + ret = trans->cm_handle_connect(cm_id, event); + break; + + case RDMA_CM_EVENT_ADDR_RESOLVED: + /* XXX do we need to clean up if this fails? */ + ret = rdma_resolve_route(cm_id, + RDS_RDMA_RESOLVE_TIMEOUT_MS); + break; + + case RDMA_CM_EVENT_ROUTE_RESOLVED: + /* XXX worry about racing with listen acceptance */ + ret = trans->cm_initiate_connect(cm_id); + break; + + case RDMA_CM_EVENT_ESTABLISHED: + trans->cm_connect_complete(conn, event); + break; + + case RDMA_CM_EVENT_ADDR_ERROR: + case RDMA_CM_EVENT_ROUTE_ERROR: + case RDMA_CM_EVENT_CONNECT_ERROR: + case RDMA_CM_EVENT_UNREACHABLE: + case RDMA_CM_EVENT_REJECTED: + case RDMA_CM_EVENT_DEVICE_REMOVAL: + case RDMA_CM_EVENT_ADDR_CHANGE: + if (conn) + rds_conn_drop(conn); + break; + + case RDMA_CM_EVENT_DISCONNECTED: + rdsdebug("DISCONNECT event - dropping connection " + "%pI4->%pI4\n", &conn->c_laddr, + &conn->c_faddr); + rds_conn_drop(conn); + break; + + default: + /* things like device disconnect? */ + printk(KERN_ERR "RDS: unknown event %u (%s)!\n", + event->event, rds_cm_event_str(event->event)); + break; + } + +out: + if (conn) + mutex_unlock(&conn->c_cm_lock); + + rdsdebug("id %p event %u (%s) handling ret %d\n", cm_id, event->event, + rds_cm_event_str(event->event), ret); + + return ret; +} + +static int rds_rdma_listen_init(void) +{ + struct sockaddr_in sin; + struct rdma_cm_id *cm_id; + int ret; + + cm_id = rdma_create_id(rds_rdma_cm_event_handler, NULL, RDMA_PS_TCP, + IB_QPT_RC); + if (IS_ERR(cm_id)) { + ret = PTR_ERR(cm_id); + printk(KERN_ERR "RDS/RDMA: failed to setup listener, " + "rdma_create_id() returned %d\n", ret); + return ret; + } + + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = (__force u32)htonl(INADDR_ANY); + sin.sin_port = (__force u16)htons(RDS_PORT); + + /* + * XXX I bet this binds the cm_id to a device. If we want to support + * fail-over we'll have to take this into consideration. + */ + ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin); + if (ret) { + printk(KERN_ERR "RDS/RDMA: failed to setup listener, " + "rdma_bind_addr() returned %d\n", ret); + goto out; + } + + ret = rdma_listen(cm_id, 128); + if (ret) { + printk(KERN_ERR "RDS/RDMA: failed to setup listener, " + "rdma_listen() returned %d\n", ret); + goto out; + } + + rdsdebug("cm %p listening on port %u\n", cm_id, RDS_PORT); + + rds_rdma_listen_id = cm_id; + cm_id = NULL; +out: + if (cm_id) + rdma_destroy_id(cm_id); + return ret; +} + +static void rds_rdma_listen_stop(void) +{ + if (rds_rdma_listen_id) { + rdsdebug("cm %p\n", rds_rdma_listen_id); + rdma_destroy_id(rds_rdma_listen_id); + rds_rdma_listen_id = NULL; + } +} + +static int rds_rdma_init(void) +{ + int ret; + + ret = rds_rdma_listen_init(); + if (ret) + goto out; + + ret = rds_iw_init(); + if (ret) + goto err_iw_init; + + ret = rds_ib_init(); + if (ret) + goto err_ib_init; + + goto out; + +err_ib_init: + rds_iw_exit(); +err_iw_init: + rds_rdma_listen_stop(); +out: + return ret; +} +module_init(rds_rdma_init); + +static void rds_rdma_exit(void) +{ + /* stop listening first to ensure no new connections are attempted */ + rds_rdma_listen_stop(); + rds_ib_exit(); + rds_iw_exit(); +} +module_exit(rds_rdma_exit); + +MODULE_AUTHOR("Oracle Corporation "); +MODULE_DESCRIPTION("RDS: IB/iWARP transport"); +MODULE_LICENSE("Dual BSD/GPL"); + diff --git a/net/rds/rdma_transport.h b/net/rds/rdma_transport.h new file mode 100644 index 0000000..faba4e3 --- /dev/null +++ b/net/rds/rdma_transport.h @@ -0,0 +1,24 @@ +#ifndef _RDMA_TRANSPORT_H +#define _RDMA_TRANSPORT_H + +#include +#include +#include "rds.h" + +#define RDS_RDMA_RESOLVE_TIMEOUT_MS 5000 + +int rds_rdma_conn_connect(struct rds_connection *conn); +int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, + struct rdma_cm_event *event); + +/* from ib.c */ +extern struct rds_transport rds_ib_transport; +int rds_ib_init(void); +void rds_ib_exit(void); + +/* from iw.c */ +extern struct rds_transport rds_iw_transport; +int rds_iw_init(void); +void rds_iw_exit(void); + +#endif diff --git a/net/rds/rds.h b/net/rds/rds.h new file mode 100644 index 0000000..48f8ffc --- /dev/null +++ b/net/rds/rds.h @@ -0,0 +1,812 @@ +#ifndef _RDS_RDS_H +#define _RDS_RDS_H + +#include +#include +#include +#include +#include +#include + +#include "info.h" + +/* + * RDS Network protocol version + */ +#define RDS_PROTOCOL_3_0 0x0300 +#define RDS_PROTOCOL_3_1 0x0301 +#define RDS_PROTOCOL_VERSION RDS_PROTOCOL_3_1 +#define RDS_PROTOCOL_MAJOR(v) ((v) >> 8) +#define RDS_PROTOCOL_MINOR(v) ((v) & 255) +#define RDS_PROTOCOL(maj, min) (((maj) << 8) | min) + +/* + * XXX randomly chosen, but at least seems to be unused: + * # 18464-18768 Unassigned + * We should do better. We want a reserved port to discourage unpriv'ed + * userspace from listening. + */ +#define RDS_PORT 18634 + +#ifdef ATOMIC64_INIT +#define KERNEL_HAS_ATOMIC64 +#endif + +#ifdef DEBUG +#define rdsdebug(fmt, args...) pr_debug("%s(): " fmt, __func__ , ##args) +#else +/* sigh, pr_debug() causes unused variable warnings */ +static inline __printf(1, 2) +void rdsdebug(char *fmt, ...) +{ +} +#endif + +/* XXX is there one of these somewhere? */ +#define ceil(x, y) \ + ({ unsigned long __x = (x), __y = (y); (__x + __y - 1) / __y; }) + +#define RDS_FRAG_SHIFT 12 +#define RDS_FRAG_SIZE ((unsigned int)(1 << RDS_FRAG_SHIFT)) + +#define RDS_CONG_MAP_BYTES (65536 / 8) +#define RDS_CONG_MAP_PAGES (PAGE_ALIGN(RDS_CONG_MAP_BYTES) / PAGE_SIZE) +#define RDS_CONG_MAP_PAGE_BITS (PAGE_SIZE * 8) + +struct rds_cong_map { + struct rb_node m_rb_node; + __be32 m_addr; + wait_queue_head_t m_waitq; + struct list_head m_conn_list; + unsigned long m_page_addrs[RDS_CONG_MAP_PAGES]; +}; + + +/* + * This is how we will track the connection state: + * A connection is always in one of the following + * states. Updates to the state are atomic and imply + * a memory barrier. + */ +enum { + RDS_CONN_DOWN = 0, + RDS_CONN_CONNECTING, + RDS_CONN_DISCONNECTING, + RDS_CONN_UP, + RDS_CONN_ERROR, +}; + +/* Bits for c_flags */ +#define RDS_LL_SEND_FULL 0 +#define RDS_RECONNECT_PENDING 1 +#define RDS_IN_XMIT 2 + +struct rds_connection { + struct hlist_node c_hash_node; + __be32 c_laddr; + __be32 c_faddr; + unsigned int c_loopback:1; + struct rds_connection *c_passive; + + struct rds_cong_map *c_lcong; + struct rds_cong_map *c_fcong; + + struct rds_message *c_xmit_rm; + unsigned long c_xmit_sg; + unsigned int c_xmit_hdr_off; + unsigned int c_xmit_data_off; + unsigned int c_xmit_atomic_sent; + unsigned int c_xmit_rdma_sent; + unsigned int c_xmit_data_sent; + + spinlock_t c_lock; /* protect msg queues */ + u64 c_next_tx_seq; + struct list_head c_send_queue; + struct list_head c_retrans; + + u64 c_next_rx_seq; + + struct rds_transport *c_trans; + void *c_transport_data; + + atomic_t c_state; + unsigned long c_flags; + unsigned long c_reconnect_jiffies; + struct delayed_work c_send_w; + struct delayed_work c_recv_w; + struct delayed_work c_conn_w; + struct work_struct c_down_w; + struct mutex c_cm_lock; /* protect conn state & cm */ + wait_queue_head_t c_waitq; + + struct list_head c_map_item; + unsigned long c_map_queued; + + unsigned int c_unacked_packets; + unsigned int c_unacked_bytes; + + /* Protocol version */ + unsigned int c_version; +}; + +#define RDS_FLAG_CONG_BITMAP 0x01 +#define RDS_FLAG_ACK_REQUIRED 0x02 +#define RDS_FLAG_RETRANSMITTED 0x04 +#define RDS_MAX_ADV_CREDIT 255 + +/* + * Maximum space available for extension headers. + */ +#define RDS_HEADER_EXT_SPACE 16 + +struct rds_header { + __be64 h_sequence; + __be64 h_ack; + __be32 h_len; + __be16 h_sport; + __be16 h_dport; + u8 h_flags; + u8 h_credit; + u8 h_padding[4]; + __sum16 h_csum; + + u8 h_exthdr[RDS_HEADER_EXT_SPACE]; +}; + +/* + * Reserved - indicates end of extensions + */ +#define RDS_EXTHDR_NONE 0 + +/* + * This extension header is included in the very + * first message that is sent on a new connection, + * and identifies the protocol level. This will help + * rolling updates if a future change requires breaking + * the protocol. + * NB: This is no longer true for IB, where we do a version + * negotiation during the connection setup phase (protocol + * version information is included in the RDMA CM private data). + */ +#define RDS_EXTHDR_VERSION 1 +struct rds_ext_header_version { + __be32 h_version; +}; + +/* + * This extension header is included in the RDS message + * chasing an RDMA operation. + */ +#define RDS_EXTHDR_RDMA 2 +struct rds_ext_header_rdma { + __be32 h_rdma_rkey; +}; + +/* + * This extension header tells the peer about the + * destination of the requested RDMA + * operation. + */ +#define RDS_EXTHDR_RDMA_DEST 3 +struct rds_ext_header_rdma_dest { + __be32 h_rdma_rkey; + __be32 h_rdma_offset; +}; + +#define __RDS_EXTHDR_MAX 16 /* for now */ + +struct rds_incoming { + atomic_t i_refcount; + struct list_head i_item; + struct rds_connection *i_conn; + struct rds_header i_hdr; + unsigned long i_rx_jiffies; + __be32 i_saddr; + + rds_rdma_cookie_t i_rdma_cookie; +}; + +struct rds_mr { + struct rb_node r_rb_node; + atomic_t r_refcount; + u32 r_key; + + /* A copy of the creation flags */ + unsigned int r_use_once:1; + unsigned int r_invalidate:1; + unsigned int r_write:1; + + /* This is for RDS_MR_DEAD. + * It would be nice & consistent to make this part of the above + * bit field here, but we need to use test_and_set_bit. + */ + unsigned long r_state; + struct rds_sock *r_sock; /* back pointer to the socket that owns us */ + struct rds_transport *r_trans; + void *r_trans_private; +}; + +/* Flags for mr->r_state */ +#define RDS_MR_DEAD 0 + +static inline rds_rdma_cookie_t rds_rdma_make_cookie(u32 r_key, u32 offset) +{ + return r_key | (((u64) offset) << 32); +} + +static inline u32 rds_rdma_cookie_key(rds_rdma_cookie_t cookie) +{ + return cookie; +} + +static inline u32 rds_rdma_cookie_offset(rds_rdma_cookie_t cookie) +{ + return cookie >> 32; +} + +/* atomic operation types */ +#define RDS_ATOMIC_TYPE_CSWP 0 +#define RDS_ATOMIC_TYPE_FADD 1 + +/* + * m_sock_item and m_conn_item are on lists that are serialized under + * conn->c_lock. m_sock_item has additional meaning in that once it is empty + * the message will not be put back on the retransmit list after being sent. + * messages that are canceled while being sent rely on this. + * + * m_inc is used by loopback so that it can pass an incoming message straight + * back up into the rx path. It embeds a wire header which is also used by + * the send path, which is kind of awkward. + * + * m_sock_item indicates the message's presence on a socket's send or receive + * queue. m_rs will point to that socket. + * + * m_daddr is used by cancellation to prune messages to a given destination. + * + * The RDS_MSG_ON_SOCK and RDS_MSG_ON_CONN flags are used to avoid lock + * nesting. As paths iterate over messages on a sock, or conn, they must + * also lock the conn, or sock, to remove the message from those lists too. + * Testing the flag to determine if the message is still on the lists lets + * us avoid testing the list_head directly. That means each path can use + * the message's list_head to keep it on a local list while juggling locks + * without confusing the other path. + * + * m_ack_seq is an optional field set by transports who need a different + * sequence number range to invalidate. They can use this in a callback + * that they pass to rds_send_drop_acked() to see if each message has been + * acked. The HAS_ACK_SEQ flag can be used to detect messages which haven't + * had ack_seq set yet. + */ +#define RDS_MSG_ON_SOCK 1 +#define RDS_MSG_ON_CONN 2 +#define RDS_MSG_HAS_ACK_SEQ 3 +#define RDS_MSG_ACK_REQUIRED 4 +#define RDS_MSG_RETRANSMITTED 5 +#define RDS_MSG_MAPPED 6 +#define RDS_MSG_PAGEVEC 7 + +struct rds_message { + atomic_t m_refcount; + struct list_head m_sock_item; + struct list_head m_conn_item; + struct rds_incoming m_inc; + u64 m_ack_seq; + __be32 m_daddr; + unsigned long m_flags; + + /* Never access m_rs without holding m_rs_lock. + * Lock nesting is + * rm->m_rs_lock + * -> rs->rs_lock + */ + spinlock_t m_rs_lock; + wait_queue_head_t m_flush_wait; + + struct rds_sock *m_rs; + + /* cookie to send to remote, in rds header */ + rds_rdma_cookie_t m_rdma_cookie; + + unsigned int m_used_sgs; + unsigned int m_total_sgs; + + void *m_final_op; + + struct { + struct rm_atomic_op { + int op_type; + union { + struct { + uint64_t compare; + uint64_t swap; + uint64_t compare_mask; + uint64_t swap_mask; + } op_m_cswp; + struct { + uint64_t add; + uint64_t nocarry_mask; + } op_m_fadd; + }; + + u32 op_rkey; + u64 op_remote_addr; + unsigned int op_notify:1; + unsigned int op_recverr:1; + unsigned int op_mapped:1; + unsigned int op_silent:1; + unsigned int op_active:1; + struct scatterlist *op_sg; + struct rds_notifier *op_notifier; + + struct rds_mr *op_rdma_mr; + } atomic; + struct rm_rdma_op { + u32 op_rkey; + u64 op_remote_addr; + unsigned int op_write:1; + unsigned int op_fence:1; + unsigned int op_notify:1; + unsigned int op_recverr:1; + unsigned int op_mapped:1; + unsigned int op_silent:1; + unsigned int op_active:1; + unsigned int op_bytes; + unsigned int op_nents; + unsigned int op_count; + struct scatterlist *op_sg; + struct rds_notifier *op_notifier; + + struct rds_mr *op_rdma_mr; + } rdma; + struct rm_data_op { + unsigned int op_active:1; + unsigned int op_nents; + unsigned int op_count; + struct scatterlist *op_sg; + } data; + }; +}; + +/* + * The RDS notifier is used (optionally) to tell the application about + * completed RDMA operations. Rather than keeping the whole rds message + * around on the queue, we allocate a small notifier that is put on the + * socket's notifier_list. Notifications are delivered to the application + * through control messages. + */ +struct rds_notifier { + struct list_head n_list; + uint64_t n_user_token; + int n_status; +}; + +/** + * struct rds_transport - transport specific behavioural hooks + * + * @xmit: .xmit is called by rds_send_xmit() to tell the transport to send + * part of a message. The caller serializes on the send_sem so this + * doesn't need to be reentrant for a given conn. The header must be + * sent before the data payload. .xmit must be prepared to send a + * message with no data payload. .xmit should return the number of + * bytes that were sent down the connection, including header bytes. + * Returning 0 tells the caller that it doesn't need to perform any + * additional work now. This is usually the case when the transport has + * filled the sending queue for its connection and will handle + * triggering the rds thread to continue the send when space becomes + * available. Returning -EAGAIN tells the caller to retry the send + * immediately. Returning -ENOMEM tells the caller to retry the send at + * some point in the future. + * + * @conn_shutdown: conn_shutdown stops traffic on the given connection. Once + * it returns the connection can not call rds_recv_incoming(). + * This will only be called once after conn_connect returns + * non-zero success and will The caller serializes this with + * the send and connecting paths (xmit_* and conn_*). The + * transport is responsible for other serialization, including + * rds_recv_incoming(). This is called in process context but + * should try hard not to block. + */ + +#define RDS_TRANS_IB 0 +#define RDS_TRANS_IWARP 1 +#define RDS_TRANS_TCP 2 +#define RDS_TRANS_COUNT 3 + +struct rds_transport { + char t_name[TRANSNAMSIZ]; + struct list_head t_item; + struct module *t_owner; + unsigned int t_prefer_loopback:1; + unsigned int t_type; + + int (*laddr_check)(__be32 addr); + int (*conn_alloc)(struct rds_connection *conn, gfp_t gfp); + void (*conn_free)(void *data); + int (*conn_connect)(struct rds_connection *conn); + void (*conn_shutdown)(struct rds_connection *conn); + void (*xmit_prepare)(struct rds_connection *conn); + void (*xmit_complete)(struct rds_connection *conn); + int (*xmit)(struct rds_connection *conn, struct rds_message *rm, + unsigned int hdr_off, unsigned int sg, unsigned int off); + int (*xmit_rdma)(struct rds_connection *conn, struct rm_rdma_op *op); + int (*xmit_atomic)(struct rds_connection *conn, struct rm_atomic_op *op); + int (*recv)(struct rds_connection *conn); + int (*inc_copy_to_user)(struct rds_incoming *inc, struct iovec *iov, + size_t size); + void (*inc_free)(struct rds_incoming *inc); + + int (*cm_handle_connect)(struct rdma_cm_id *cm_id, + struct rdma_cm_event *event); + int (*cm_initiate_connect)(struct rdma_cm_id *cm_id); + void (*cm_connect_complete)(struct rds_connection *conn, + struct rdma_cm_event *event); + + unsigned int (*stats_info_copy)(struct rds_info_iterator *iter, + unsigned int avail); + void (*exit)(void); + void *(*get_mr)(struct scatterlist *sg, unsigned long nr_sg, + struct rds_sock *rs, u32 *key_ret); + void (*sync_mr)(void *trans_private, int direction); + void (*free_mr)(void *trans_private, int invalidate); + void (*flush_mrs)(void); +}; + +struct rds_sock { + struct sock rs_sk; + + u64 rs_user_addr; + u64 rs_user_bytes; + + /* + * bound_addr used for both incoming and outgoing, no INADDR_ANY + * support. + */ + struct hlist_node rs_bound_node; + __be32 rs_bound_addr; + __be32 rs_conn_addr; + __be16 rs_bound_port; + __be16 rs_conn_port; + struct rds_transport *rs_transport; + + /* + * rds_sendmsg caches the conn it used the last time around. + * This helps avoid costly lookups. + */ + struct rds_connection *rs_conn; + + /* flag indicating we were congested or not */ + int rs_congested; + /* seen congestion (ENOBUFS) when sending? */ + int rs_seen_congestion; + + /* rs_lock protects all these adjacent members before the newline */ + spinlock_t rs_lock; + struct list_head rs_send_queue; + u32 rs_snd_bytes; + int rs_rcv_bytes; + struct list_head rs_notify_queue; /* currently used for failed RDMAs */ + + /* Congestion wake_up. If rs_cong_monitor is set, we use cong_mask + * to decide whether the application should be woken up. + * If not set, we use rs_cong_track to find out whether a cong map + * update arrived. + */ + uint64_t rs_cong_mask; + uint64_t rs_cong_notify; + struct list_head rs_cong_list; + unsigned long rs_cong_track; + + /* + * rs_recv_lock protects the receive queue, and is + * used to serialize with rds_release. + */ + rwlock_t rs_recv_lock; + struct list_head rs_recv_queue; + + /* just for stats reporting */ + struct list_head rs_item; + + /* these have their own lock */ + spinlock_t rs_rdma_lock; + struct rb_root rs_rdma_keys; + + /* Socket options - in case there will be more */ + unsigned char rs_recverr, + rs_cong_monitor; +}; + +static inline struct rds_sock *rds_sk_to_rs(const struct sock *sk) +{ + return container_of(sk, struct rds_sock, rs_sk); +} +static inline struct sock *rds_rs_to_sk(struct rds_sock *rs) +{ + return &rs->rs_sk; +} + +/* + * The stack assigns sk_sndbuf and sk_rcvbuf to twice the specified value + * to account for overhead. We don't account for overhead, we just apply + * the number of payload bytes to the specified value. + */ +static inline int rds_sk_sndbuf(struct rds_sock *rs) +{ + return rds_rs_to_sk(rs)->sk_sndbuf / 2; +} +static inline int rds_sk_rcvbuf(struct rds_sock *rs) +{ + return rds_rs_to_sk(rs)->sk_rcvbuf / 2; +} + +struct rds_statistics { + uint64_t s_conn_reset; + uint64_t s_recv_drop_bad_checksum; + uint64_t s_recv_drop_old_seq; + uint64_t s_recv_drop_no_sock; + uint64_t s_recv_drop_dead_sock; + uint64_t s_recv_deliver_raced; + uint64_t s_recv_delivered; + uint64_t s_recv_queued; + uint64_t s_recv_immediate_retry; + uint64_t s_recv_delayed_retry; + uint64_t s_recv_ack_required; + uint64_t s_recv_rdma_bytes; + uint64_t s_recv_ping; + uint64_t s_send_queue_empty; + uint64_t s_send_queue_full; + uint64_t s_send_lock_contention; + uint64_t s_send_lock_queue_raced; + uint64_t s_send_immediate_retry; + uint64_t s_send_delayed_retry; + uint64_t s_send_drop_acked; + uint64_t s_send_ack_required; + uint64_t s_send_queued; + uint64_t s_send_rdma; + uint64_t s_send_rdma_bytes; + uint64_t s_send_pong; + uint64_t s_page_remainder_hit; + uint64_t s_page_remainder_miss; + uint64_t s_copy_to_user; + uint64_t s_copy_from_user; + uint64_t s_cong_update_queued; + uint64_t s_cong_update_received; + uint64_t s_cong_send_error; + uint64_t s_cong_send_blocked; +}; + +/* af_rds.c */ +char *rds_str_array(char **array, size_t elements, size_t index); +void rds_sock_addref(struct rds_sock *rs); +void rds_sock_put(struct rds_sock *rs); +void rds_wake_sk_sleep(struct rds_sock *rs); +static inline void __rds_wake_sk_sleep(struct sock *sk) +{ + wait_queue_head_t *waitq = sk_sleep(sk); + + if (!sock_flag(sk, SOCK_DEAD) && waitq) + wake_up(waitq); +} +extern wait_queue_head_t rds_poll_waitq; + + +/* bind.c */ +int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len); +void rds_remove_bound(struct rds_sock *rs); +struct rds_sock *rds_find_bound(__be32 addr, __be16 port); + +/* cong.c */ +int rds_cong_get_maps(struct rds_connection *conn); +void rds_cong_add_conn(struct rds_connection *conn); +void rds_cong_remove_conn(struct rds_connection *conn); +void rds_cong_set_bit(struct rds_cong_map *map, __be16 port); +void rds_cong_clear_bit(struct rds_cong_map *map, __be16 port); +int rds_cong_wait(struct rds_cong_map *map, __be16 port, int nonblock, struct rds_sock *rs); +void rds_cong_queue_updates(struct rds_cong_map *map); +void rds_cong_map_updated(struct rds_cong_map *map, uint64_t); +int rds_cong_updated_since(unsigned long *recent); +void rds_cong_add_socket(struct rds_sock *); +void rds_cong_remove_socket(struct rds_sock *); +void rds_cong_exit(void); +struct rds_message *rds_cong_update_alloc(struct rds_connection *conn); + +/* conn.c */ +int rds_conn_init(void); +void rds_conn_exit(void); +struct rds_connection *rds_conn_create(__be32 laddr, __be32 faddr, + struct rds_transport *trans, gfp_t gfp); +struct rds_connection *rds_conn_create_outgoing(__be32 laddr, __be32 faddr, + struct rds_transport *trans, gfp_t gfp); +void rds_conn_shutdown(struct rds_connection *conn); +void rds_conn_destroy(struct rds_connection *conn); +void rds_conn_drop(struct rds_connection *conn); +void rds_conn_connect_if_down(struct rds_connection *conn); +void rds_for_each_conn_info(struct socket *sock, unsigned int len, + struct rds_info_iterator *iter, + struct rds_info_lengths *lens, + int (*visitor)(struct rds_connection *, void *), + size_t item_len); +__printf(2, 3) +void __rds_conn_error(struct rds_connection *conn, const char *, ...); +#define rds_conn_error(conn, fmt...) \ + __rds_conn_error(conn, KERN_WARNING "RDS: " fmt) + +static inline int +rds_conn_transition(struct rds_connection *conn, int old, int new) +{ + return atomic_cmpxchg(&conn->c_state, old, new) == old; +} + +static inline int +rds_conn_state(struct rds_connection *conn) +{ + return atomic_read(&conn->c_state); +} + +static inline int +rds_conn_up(struct rds_connection *conn) +{ + return atomic_read(&conn->c_state) == RDS_CONN_UP; +} + +static inline int +rds_conn_connecting(struct rds_connection *conn) +{ + return atomic_read(&conn->c_state) == RDS_CONN_CONNECTING; +} + +/* message.c */ +struct rds_message *rds_message_alloc(unsigned int nents, gfp_t gfp); +struct scatterlist *rds_message_alloc_sgs(struct rds_message *rm, int nents); +int rds_message_copy_from_user(struct rds_message *rm, struct iovec *first_iov, + size_t total_len); +struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned int total_len); +void rds_message_populate_header(struct rds_header *hdr, __be16 sport, + __be16 dport, u64 seq); +int rds_message_add_extension(struct rds_header *hdr, + unsigned int type, const void *data, unsigned int len); +int rds_message_next_extension(struct rds_header *hdr, + unsigned int *pos, void *buf, unsigned int *buflen); +int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 offset); +int rds_message_inc_copy_to_user(struct rds_incoming *inc, + struct iovec *first_iov, size_t size); +void rds_message_inc_free(struct rds_incoming *inc); +void rds_message_addref(struct rds_message *rm); +void rds_message_put(struct rds_message *rm); +void rds_message_wait(struct rds_message *rm); +void rds_message_unmapped(struct rds_message *rm); + +static inline void rds_message_make_checksum(struct rds_header *hdr) +{ + hdr->h_csum = 0; + hdr->h_csum = ip_fast_csum((void *) hdr, sizeof(*hdr) >> 2); +} + +static inline int rds_message_verify_checksum(const struct rds_header *hdr) +{ + return !hdr->h_csum || ip_fast_csum((void *) hdr, sizeof(*hdr) >> 2) == 0; +} + + +/* page.c */ +int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes, + gfp_t gfp); +int rds_page_copy_user(struct page *page, unsigned long offset, + void __user *ptr, unsigned long bytes, + int to_user); +#define rds_page_copy_to_user(page, offset, ptr, bytes) \ + rds_page_copy_user(page, offset, ptr, bytes, 1) +#define rds_page_copy_from_user(page, offset, ptr, bytes) \ + rds_page_copy_user(page, offset, ptr, bytes, 0) +void rds_page_exit(void); + +/* recv.c */ +void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn, + __be32 saddr); +void rds_inc_put(struct rds_incoming *inc); +void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr, + struct rds_incoming *inc, gfp_t gfp); +int rds_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, + size_t size, int msg_flags); +void rds_clear_recv_queue(struct rds_sock *rs); +int rds_notify_queue_get(struct rds_sock *rs, struct msghdr *msg); +void rds_inc_info_copy(struct rds_incoming *inc, + struct rds_info_iterator *iter, + __be32 saddr, __be32 daddr, int flip); + +/* send.c */ +int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, + size_t payload_len); +void rds_send_reset(struct rds_connection *conn); +int rds_send_xmit(struct rds_connection *conn); +struct sockaddr_in; +void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest); +typedef int (*is_acked_func)(struct rds_message *rm, uint64_t ack); +void rds_send_drop_acked(struct rds_connection *conn, u64 ack, + is_acked_func is_acked); +int rds_send_pong(struct rds_connection *conn, __be16 dport); +struct rds_message *rds_send_get_message(struct rds_connection *, + struct rm_rdma_op *); + +/* rdma.c */ +void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force); +int rds_get_mr(struct rds_sock *rs, char __user *optval, int optlen); +int rds_get_mr_for_dest(struct rds_sock *rs, char __user *optval, int optlen); +int rds_free_mr(struct rds_sock *rs, char __user *optval, int optlen); +void rds_rdma_drop_keys(struct rds_sock *rs); +int rds_rdma_extra_size(struct rds_rdma_args *args); +int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, + struct cmsghdr *cmsg); +int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm, + struct cmsghdr *cmsg); +int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, + struct cmsghdr *cmsg); +int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm, + struct cmsghdr *cmsg); +void rds_rdma_free_op(struct rm_rdma_op *ro); +void rds_atomic_free_op(struct rm_atomic_op *ao); +void rds_rdma_send_complete(struct rds_message *rm, int wc_status); +void rds_atomic_send_complete(struct rds_message *rm, int wc_status); +int rds_cmsg_atomic(struct rds_sock *rs, struct rds_message *rm, + struct cmsghdr *cmsg); + +void __rds_put_mr_final(struct rds_mr *mr); +static inline void rds_mr_put(struct rds_mr *mr) +{ + if (atomic_dec_and_test(&mr->r_refcount)) + __rds_put_mr_final(mr); +} + +/* stats.c */ +DECLARE_PER_CPU_SHARED_ALIGNED(struct rds_statistics, rds_stats); +#define rds_stats_inc_which(which, member) do { \ + per_cpu(which, get_cpu()).member++; \ + put_cpu(); \ +} while (0) +#define rds_stats_inc(member) rds_stats_inc_which(rds_stats, member) +#define rds_stats_add_which(which, member, count) do { \ + per_cpu(which, get_cpu()).member += count; \ + put_cpu(); \ +} while (0) +#define rds_stats_add(member, count) rds_stats_add_which(rds_stats, member, count) +int rds_stats_init(void); +void rds_stats_exit(void); +void rds_stats_info_copy(struct rds_info_iterator *iter, + uint64_t *values, const char *const *names, + size_t nr); + +/* sysctl.c */ +int rds_sysctl_init(void); +void rds_sysctl_exit(void); +extern unsigned long rds_sysctl_sndbuf_min; +extern unsigned long rds_sysctl_sndbuf_default; +extern unsigned long rds_sysctl_sndbuf_max; +extern unsigned long rds_sysctl_reconnect_min_jiffies; +extern unsigned long rds_sysctl_reconnect_max_jiffies; +extern unsigned int rds_sysctl_max_unacked_packets; +extern unsigned int rds_sysctl_max_unacked_bytes; +extern unsigned int rds_sysctl_ping_enable; +extern unsigned long rds_sysctl_trace_flags; +extern unsigned int rds_sysctl_trace_level; + +/* threads.c */ +int rds_threads_init(void); +void rds_threads_exit(void); +extern struct workqueue_struct *rds_wq; +void rds_queue_reconnect(struct rds_connection *conn); +void rds_connect_worker(struct work_struct *); +void rds_shutdown_worker(struct work_struct *); +void rds_send_worker(struct work_struct *); +void rds_recv_worker(struct work_struct *); +void rds_connect_complete(struct rds_connection *conn); + +/* transport.c */ +int rds_trans_register(struct rds_transport *trans); +void rds_trans_unregister(struct rds_transport *trans); +struct rds_transport *rds_trans_get_preferred(__be32 addr); +void rds_trans_put(struct rds_transport *trans); +unsigned int rds_trans_stats_info_copy(struct rds_info_iterator *iter, + unsigned int avail); +int rds_trans_init(void); +void rds_trans_exit(void); + +#endif diff --git a/net/rds/recv.c b/net/rds/recv.c new file mode 100644 index 0000000..bd82522 --- /dev/null +++ b/net/rds/recv.c @@ -0,0 +1,547 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include +#include +#include +#include +#include + +#include "rds.h" + +void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn, + __be32 saddr) +{ + atomic_set(&inc->i_refcount, 1); + INIT_LIST_HEAD(&inc->i_item); + inc->i_conn = conn; + inc->i_saddr = saddr; + inc->i_rdma_cookie = 0; +} +EXPORT_SYMBOL_GPL(rds_inc_init); + +static void rds_inc_addref(struct rds_incoming *inc) +{ + rdsdebug("addref inc %p ref %d\n", inc, atomic_read(&inc->i_refcount)); + atomic_inc(&inc->i_refcount); +} + +void rds_inc_put(struct rds_incoming *inc) +{ + rdsdebug("put inc %p ref %d\n", inc, atomic_read(&inc->i_refcount)); + if (atomic_dec_and_test(&inc->i_refcount)) { + BUG_ON(!list_empty(&inc->i_item)); + + inc->i_conn->c_trans->inc_free(inc); + } +} +EXPORT_SYMBOL_GPL(rds_inc_put); + +static void rds_recv_rcvbuf_delta(struct rds_sock *rs, struct sock *sk, + struct rds_cong_map *map, + int delta, __be16 port) +{ + int now_congested; + + if (delta == 0) + return; + + rs->rs_rcv_bytes += delta; + now_congested = rs->rs_rcv_bytes > rds_sk_rcvbuf(rs); + + rdsdebug("rs %p (%pI4:%u) recv bytes %d buf %d " + "now_cong %d delta %d\n", + rs, &rs->rs_bound_addr, + ntohs(rs->rs_bound_port), rs->rs_rcv_bytes, + rds_sk_rcvbuf(rs), now_congested, delta); + + /* wasn't -> am congested */ + if (!rs->rs_congested && now_congested) { + rs->rs_congested = 1; + rds_cong_set_bit(map, port); + rds_cong_queue_updates(map); + } + /* was -> aren't congested */ + /* Require more free space before reporting uncongested to prevent + bouncing cong/uncong state too often */ + else if (rs->rs_congested && (rs->rs_rcv_bytes < (rds_sk_rcvbuf(rs)/2))) { + rs->rs_congested = 0; + rds_cong_clear_bit(map, port); + rds_cong_queue_updates(map); + } + + /* do nothing if no change in cong state */ +} + +/* + * Process all extension headers that come with this message. + */ +static void rds_recv_incoming_exthdrs(struct rds_incoming *inc, struct rds_sock *rs) +{ + struct rds_header *hdr = &inc->i_hdr; + unsigned int pos = 0, type, len; + union { + struct rds_ext_header_version version; + struct rds_ext_header_rdma rdma; + struct rds_ext_header_rdma_dest rdma_dest; + } buffer; + + while (1) { + len = sizeof(buffer); + type = rds_message_next_extension(hdr, &pos, &buffer, &len); + if (type == RDS_EXTHDR_NONE) + break; + /* Process extension header here */ + switch (type) { + case RDS_EXTHDR_RDMA: + rds_rdma_unuse(rs, be32_to_cpu(buffer.rdma.h_rdma_rkey), 0); + break; + + case RDS_EXTHDR_RDMA_DEST: + /* We ignore the size for now. We could stash it + * somewhere and use it for error checking. */ + inc->i_rdma_cookie = rds_rdma_make_cookie( + be32_to_cpu(buffer.rdma_dest.h_rdma_rkey), + be32_to_cpu(buffer.rdma_dest.h_rdma_offset)); + + break; + } + } +} + +/* + * The transport must make sure that this is serialized against other + * rx and conn reset on this specific conn. + * + * We currently assert that only one fragmented message will be sent + * down a connection at a time. This lets us reassemble in the conn + * instead of per-flow which means that we don't have to go digging through + * flows to tear down partial reassembly progress on conn failure and + * we save flow lookup and locking for each frag arrival. It does mean + * that small messages will wait behind large ones. Fragmenting at all + * is only to reduce the memory consumption of pre-posted buffers. + * + * The caller passes in saddr and daddr instead of us getting it from the + * conn. This lets loopback, who only has one conn for both directions, + * tell us which roles the addrs in the conn are playing for this message. + */ +void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr, + struct rds_incoming *inc, gfp_t gfp) +{ + struct rds_sock *rs = NULL; + struct sock *sk; + unsigned long flags; + + inc->i_conn = conn; + inc->i_rx_jiffies = jiffies; + + rdsdebug("conn %p next %llu inc %p seq %llu len %u sport %u dport %u " + "flags 0x%x rx_jiffies %lu\n", conn, + (unsigned long long)conn->c_next_rx_seq, + inc, + (unsigned long long)be64_to_cpu(inc->i_hdr.h_sequence), + be32_to_cpu(inc->i_hdr.h_len), + be16_to_cpu(inc->i_hdr.h_sport), + be16_to_cpu(inc->i_hdr.h_dport), + inc->i_hdr.h_flags, + inc->i_rx_jiffies); + + /* + * Sequence numbers should only increase. Messages get their + * sequence number as they're queued in a sending conn. They + * can be dropped, though, if the sending socket is closed before + * they hit the wire. So sequence numbers can skip forward + * under normal operation. They can also drop back in the conn + * failover case as previously sent messages are resent down the + * new instance of a conn. We drop those, otherwise we have + * to assume that the next valid seq does not come after a + * hole in the fragment stream. + * + * The headers don't give us a way to realize if fragments of + * a message have been dropped. We assume that frags that arrive + * to a flow are part of the current message on the flow that is + * being reassembled. This means that senders can't drop messages + * from the sending conn until all their frags are sent. + * + * XXX we could spend more on the wire to get more robust failure + * detection, arguably worth it to avoid data corruption. + */ + if (be64_to_cpu(inc->i_hdr.h_sequence) < conn->c_next_rx_seq && + (inc->i_hdr.h_flags & RDS_FLAG_RETRANSMITTED)) { + rds_stats_inc(s_recv_drop_old_seq); + goto out; + } + conn->c_next_rx_seq = be64_to_cpu(inc->i_hdr.h_sequence) + 1; + + if (rds_sysctl_ping_enable && inc->i_hdr.h_dport == 0) { + rds_stats_inc(s_recv_ping); + rds_send_pong(conn, inc->i_hdr.h_sport); + goto out; + } + + rs = rds_find_bound(daddr, inc->i_hdr.h_dport); + if (!rs) { + rds_stats_inc(s_recv_drop_no_sock); + goto out; + } + + /* Process extension headers */ + rds_recv_incoming_exthdrs(inc, rs); + + /* We can be racing with rds_release() which marks the socket dead. */ + sk = rds_rs_to_sk(rs); + + /* serialize with rds_release -> sock_orphan */ + write_lock_irqsave(&rs->rs_recv_lock, flags); + if (!sock_flag(sk, SOCK_DEAD)) { + rdsdebug("adding inc %p to rs %p's recv queue\n", inc, rs); + rds_stats_inc(s_recv_queued); + rds_recv_rcvbuf_delta(rs, sk, inc->i_conn->c_lcong, + be32_to_cpu(inc->i_hdr.h_len), + inc->i_hdr.h_dport); + rds_inc_addref(inc); + list_add_tail(&inc->i_item, &rs->rs_recv_queue); + __rds_wake_sk_sleep(sk); + } else { + rds_stats_inc(s_recv_drop_dead_sock); + } + write_unlock_irqrestore(&rs->rs_recv_lock, flags); + +out: + if (rs) + rds_sock_put(rs); +} +EXPORT_SYMBOL_GPL(rds_recv_incoming); + +/* + * be very careful here. This is being called as the condition in + * wait_event_*() needs to cope with being called many times. + */ +static int rds_next_incoming(struct rds_sock *rs, struct rds_incoming **inc) +{ + unsigned long flags; + + if (!*inc) { + read_lock_irqsave(&rs->rs_recv_lock, flags); + if (!list_empty(&rs->rs_recv_queue)) { + *inc = list_entry(rs->rs_recv_queue.next, + struct rds_incoming, + i_item); + rds_inc_addref(*inc); + } + read_unlock_irqrestore(&rs->rs_recv_lock, flags); + } + + return *inc != NULL; +} + +static int rds_still_queued(struct rds_sock *rs, struct rds_incoming *inc, + int drop) +{ + struct sock *sk = rds_rs_to_sk(rs); + int ret = 0; + unsigned long flags; + + write_lock_irqsave(&rs->rs_recv_lock, flags); + if (!list_empty(&inc->i_item)) { + ret = 1; + if (drop) { + /* XXX make sure this i_conn is reliable */ + rds_recv_rcvbuf_delta(rs, sk, inc->i_conn->c_lcong, + -be32_to_cpu(inc->i_hdr.h_len), + inc->i_hdr.h_dport); + list_del_init(&inc->i_item); + rds_inc_put(inc); + } + } + write_unlock_irqrestore(&rs->rs_recv_lock, flags); + + rdsdebug("inc %p rs %p still %d dropped %d\n", inc, rs, ret, drop); + return ret; +} + +/* + * Pull errors off the error queue. + * If msghdr is NULL, we will just purge the error queue. + */ +int rds_notify_queue_get(struct rds_sock *rs, struct msghdr *msghdr) +{ + struct rds_notifier *notifier; + struct rds_rdma_notify cmsg = { 0 }; /* fill holes with zero */ + unsigned int count = 0, max_messages = ~0U; + unsigned long flags; + LIST_HEAD(copy); + int err = 0; + + + /* put_cmsg copies to user space and thus may sleep. We can't do this + * with rs_lock held, so first grab as many notifications as we can stuff + * in the user provided cmsg buffer. We don't try to copy more, to avoid + * losing notifications - except when the buffer is so small that it wouldn't + * even hold a single notification. Then we give him as much of this single + * msg as we can squeeze in, and set MSG_CTRUNC. + */ + if (msghdr) { + max_messages = msghdr->msg_controllen / CMSG_SPACE(sizeof(cmsg)); + if (!max_messages) + max_messages = 1; + } + + spin_lock_irqsave(&rs->rs_lock, flags); + while (!list_empty(&rs->rs_notify_queue) && count < max_messages) { + notifier = list_entry(rs->rs_notify_queue.next, + struct rds_notifier, n_list); + list_move(¬ifier->n_list, ©); + count++; + } + spin_unlock_irqrestore(&rs->rs_lock, flags); + + if (!count) + return 0; + + while (!list_empty(©)) { + notifier = list_entry(copy.next, struct rds_notifier, n_list); + + if (msghdr) { + cmsg.user_token = notifier->n_user_token; + cmsg.status = notifier->n_status; + + err = put_cmsg(msghdr, SOL_RDS, RDS_CMSG_RDMA_STATUS, + sizeof(cmsg), &cmsg); + if (err) + break; + } + + list_del_init(¬ifier->n_list); + kfree(notifier); + } + + /* If we bailed out because of an error in put_cmsg, + * we may be left with one or more notifications that we + * didn't process. Return them to the head of the list. */ + if (!list_empty(©)) { + spin_lock_irqsave(&rs->rs_lock, flags); + list_splice(©, &rs->rs_notify_queue); + spin_unlock_irqrestore(&rs->rs_lock, flags); + } + + return err; +} + +/* + * Queue a congestion notification + */ +static int rds_notify_cong(struct rds_sock *rs, struct msghdr *msghdr) +{ + uint64_t notify = rs->rs_cong_notify; + unsigned long flags; + int err; + + err = put_cmsg(msghdr, SOL_RDS, RDS_CMSG_CONG_UPDATE, + sizeof(notify), ¬ify); + if (err) + return err; + + spin_lock_irqsave(&rs->rs_lock, flags); + rs->rs_cong_notify &= ~notify; + spin_unlock_irqrestore(&rs->rs_lock, flags); + + return 0; +} + +/* + * Receive any control messages. + */ +static int rds_cmsg_recv(struct rds_incoming *inc, struct msghdr *msg) +{ + int ret = 0; + + if (inc->i_rdma_cookie) { + ret = put_cmsg(msg, SOL_RDS, RDS_CMSG_RDMA_DEST, + sizeof(inc->i_rdma_cookie), &inc->i_rdma_cookie); + if (ret) + return ret; + } + + return 0; +} + +int rds_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, + size_t size, int msg_flags) +{ + struct sock *sk = sock->sk; + struct rds_sock *rs = rds_sk_to_rs(sk); + long timeo; + int ret = 0, nonblock = msg_flags & MSG_DONTWAIT; + DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name); + struct rds_incoming *inc = NULL; + + /* udp_recvmsg()->sock_recvtimeo() gets away without locking too.. */ + timeo = sock_rcvtimeo(sk, nonblock); + + rdsdebug("size %zu flags 0x%x timeo %ld\n", size, msg_flags, timeo); + + if (msg_flags & MSG_OOB) + goto out; + + while (1) { + /* If there are pending notifications, do those - and nothing else */ + if (!list_empty(&rs->rs_notify_queue)) { + ret = rds_notify_queue_get(rs, msg); + break; + } + + if (rs->rs_cong_notify) { + ret = rds_notify_cong(rs, msg); + break; + } + + if (!rds_next_incoming(rs, &inc)) { + if (nonblock) { + ret = -EAGAIN; + break; + } + + timeo = wait_event_interruptible_timeout(*sk_sleep(sk), + (!list_empty(&rs->rs_notify_queue) || + rs->rs_cong_notify || + rds_next_incoming(rs, &inc)), timeo); + rdsdebug("recvmsg woke inc %p timeo %ld\n", inc, + timeo); + if (timeo > 0 || timeo == MAX_SCHEDULE_TIMEOUT) + continue; + + ret = timeo; + if (ret == 0) + ret = -ETIMEDOUT; + break; + } + + rdsdebug("copying inc %p from %pI4:%u to user\n", inc, + &inc->i_conn->c_faddr, + ntohs(inc->i_hdr.h_sport)); + ret = inc->i_conn->c_trans->inc_copy_to_user(inc, msg->msg_iov, + size); + if (ret < 0) + break; + + /* + * if the message we just copied isn't at the head of the + * recv queue then someone else raced us to return it, try + * to get the next message. + */ + if (!rds_still_queued(rs, inc, !(msg_flags & MSG_PEEK))) { + rds_inc_put(inc); + inc = NULL; + rds_stats_inc(s_recv_deliver_raced); + continue; + } + + if (ret < be32_to_cpu(inc->i_hdr.h_len)) { + if (msg_flags & MSG_TRUNC) + ret = be32_to_cpu(inc->i_hdr.h_len); + msg->msg_flags |= MSG_TRUNC; + } + + if (rds_cmsg_recv(inc, msg)) { + ret = -EFAULT; + goto out; + } + + rds_stats_inc(s_recv_delivered); + + if (sin) { + sin->sin_family = AF_INET; + sin->sin_port = inc->i_hdr.h_sport; + sin->sin_addr.s_addr = inc->i_saddr; + memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); + msg->msg_namelen = sizeof(*sin); + } + break; + } + + if (inc) + rds_inc_put(inc); + +out: + return ret; +} + +/* + * The socket is being shut down and we're asked to drop messages that were + * queued for recvmsg. The caller has unbound the socket so the receive path + * won't queue any more incoming fragments or messages on the socket. + */ +void rds_clear_recv_queue(struct rds_sock *rs) +{ + struct sock *sk = rds_rs_to_sk(rs); + struct rds_incoming *inc, *tmp; + unsigned long flags; + + write_lock_irqsave(&rs->rs_recv_lock, flags); + list_for_each_entry_safe(inc, tmp, &rs->rs_recv_queue, i_item) { + rds_recv_rcvbuf_delta(rs, sk, inc->i_conn->c_lcong, + -be32_to_cpu(inc->i_hdr.h_len), + inc->i_hdr.h_dport); + list_del_init(&inc->i_item); + rds_inc_put(inc); + } + write_unlock_irqrestore(&rs->rs_recv_lock, flags); +} + +/* + * inc->i_saddr isn't used here because it is only set in the receive + * path. + */ +void rds_inc_info_copy(struct rds_incoming *inc, + struct rds_info_iterator *iter, + __be32 saddr, __be32 daddr, int flip) +{ + struct rds_info_message minfo; + + minfo.seq = be64_to_cpu(inc->i_hdr.h_sequence); + minfo.len = be32_to_cpu(inc->i_hdr.h_len); + + if (flip) { + minfo.laddr = daddr; + minfo.faddr = saddr; + minfo.lport = inc->i_hdr.h_dport; + minfo.fport = inc->i_hdr.h_sport; + } else { + minfo.laddr = saddr; + minfo.faddr = daddr; + minfo.lport = inc->i_hdr.h_sport; + minfo.fport = inc->i_hdr.h_dport; + } + + rds_info_copy(iter, &minfo, sizeof(minfo)); +} diff --git a/net/rds/send.c b/net/rds/send.c new file mode 100644 index 0000000..0a64541 --- /dev/null +++ b/net/rds/send.c @@ -0,0 +1,1137 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "rds.h" + +/* When transmitting messages in rds_send_xmit, we need to emerge from + * time to time and briefly release the CPU. Otherwise the softlock watchdog + * will kick our shin. + * Also, it seems fairer to not let one busy connection stall all the + * others. + * + * send_batch_count is the number of times we'll loop in send_xmit. Setting + * it to 0 will restore the old behavior (where we looped until we had + * drained the queue). + */ +static int send_batch_count = 64; +module_param(send_batch_count, int, 0444); +MODULE_PARM_DESC(send_batch_count, " batch factor when working the send queue"); + +static void rds_send_remove_from_sock(struct list_head *messages, int status); + +/* + * Reset the send state. Callers must ensure that this doesn't race with + * rds_send_xmit(). + */ +void rds_send_reset(struct rds_connection *conn) +{ + struct rds_message *rm, *tmp; + unsigned long flags; + + if (conn->c_xmit_rm) { + rm = conn->c_xmit_rm; + conn->c_xmit_rm = NULL; + /* Tell the user the RDMA op is no longer mapped by the + * transport. This isn't entirely true (it's flushed out + * independently) but as the connection is down, there's + * no ongoing RDMA to/from that memory */ + rds_message_unmapped(rm); + rds_message_put(rm); + } + + conn->c_xmit_sg = 0; + conn->c_xmit_hdr_off = 0; + conn->c_xmit_data_off = 0; + conn->c_xmit_atomic_sent = 0; + conn->c_xmit_rdma_sent = 0; + conn->c_xmit_data_sent = 0; + + conn->c_map_queued = 0; + + conn->c_unacked_packets = rds_sysctl_max_unacked_packets; + conn->c_unacked_bytes = rds_sysctl_max_unacked_bytes; + + /* Mark messages as retransmissions, and move them to the send q */ + spin_lock_irqsave(&conn->c_lock, flags); + list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) { + set_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags); + set_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags); + } + list_splice_init(&conn->c_retrans, &conn->c_send_queue); + spin_unlock_irqrestore(&conn->c_lock, flags); +} + +static int acquire_in_xmit(struct rds_connection *conn) +{ + return test_and_set_bit(RDS_IN_XMIT, &conn->c_flags) == 0; +} + +static void release_in_xmit(struct rds_connection *conn) +{ + clear_bit(RDS_IN_XMIT, &conn->c_flags); + smp_mb__after_atomic(); + /* + * We don't use wait_on_bit()/wake_up_bit() because our waking is in a + * hot path and finding waiters is very rare. We don't want to walk + * the system-wide hashed waitqueue buckets in the fast path only to + * almost never find waiters. + */ + if (waitqueue_active(&conn->c_waitq)) + wake_up_all(&conn->c_waitq); +} + +/* + * We're making the conscious trade-off here to only send one message + * down the connection at a time. + * Pro: + * - tx queueing is a simple fifo list + * - reassembly is optional and easily done by transports per conn + * - no per flow rx lookup at all, straight to the socket + * - less per-frag memory and wire overhead + * Con: + * - queued acks can be delayed behind large messages + * Depends: + * - small message latency is higher behind queued large messages + * - large message latency isn't starved by intervening small sends + */ +int rds_send_xmit(struct rds_connection *conn) +{ + struct rds_message *rm; + unsigned long flags; + unsigned int tmp; + struct scatterlist *sg; + int ret = 0; + LIST_HEAD(to_be_dropped); + +restart: + + /* + * sendmsg calls here after having queued its message on the send + * queue. We only have one task feeding the connection at a time. If + * another thread is already feeding the queue then we back off. This + * avoids blocking the caller and trading per-connection data between + * caches per message. + */ + if (!acquire_in_xmit(conn)) { + rds_stats_inc(s_send_lock_contention); + ret = -ENOMEM; + goto out; + } + + /* + * rds_conn_shutdown() sets the conn state and then tests RDS_IN_XMIT, + * we do the opposite to avoid races. + */ + if (!rds_conn_up(conn)) { + release_in_xmit(conn); + ret = 0; + goto out; + } + + if (conn->c_trans->xmit_prepare) + conn->c_trans->xmit_prepare(conn); + + /* + * spin trying to push headers and data down the connection until + * the connection doesn't make forward progress. + */ + while (1) { + + rm = conn->c_xmit_rm; + + /* + * If between sending messages, we can send a pending congestion + * map update. + */ + if (!rm && test_and_clear_bit(0, &conn->c_map_queued)) { + rm = rds_cong_update_alloc(conn); + if (IS_ERR(rm)) { + ret = PTR_ERR(rm); + break; + } + rm->data.op_active = 1; + + conn->c_xmit_rm = rm; + } + + /* + * If not already working on one, grab the next message. + * + * c_xmit_rm holds a ref while we're sending this message down + * the connction. We can use this ref while holding the + * send_sem.. rds_send_reset() is serialized with it. + */ + if (!rm) { + unsigned int len; + + spin_lock_irqsave(&conn->c_lock, flags); + + if (!list_empty(&conn->c_send_queue)) { + rm = list_entry(conn->c_send_queue.next, + struct rds_message, + m_conn_item); + rds_message_addref(rm); + + /* + * Move the message from the send queue to the retransmit + * list right away. + */ + list_move_tail(&rm->m_conn_item, &conn->c_retrans); + } + + spin_unlock_irqrestore(&conn->c_lock, flags); + + if (!rm) + break; + + /* Unfortunately, the way Infiniband deals with + * RDMA to a bad MR key is by moving the entire + * queue pair to error state. We cold possibly + * recover from that, but right now we drop the + * connection. + * Therefore, we never retransmit messages with RDMA ops. + */ + if (rm->rdma.op_active && + test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags)) { + spin_lock_irqsave(&conn->c_lock, flags); + if (test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) + list_move(&rm->m_conn_item, &to_be_dropped); + spin_unlock_irqrestore(&conn->c_lock, flags); + continue; + } + + /* Require an ACK every once in a while */ + len = ntohl(rm->m_inc.i_hdr.h_len); + if (conn->c_unacked_packets == 0 || + conn->c_unacked_bytes < len) { + __set_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags); + + conn->c_unacked_packets = rds_sysctl_max_unacked_packets; + conn->c_unacked_bytes = rds_sysctl_max_unacked_bytes; + rds_stats_inc(s_send_ack_required); + } else { + conn->c_unacked_bytes -= len; + conn->c_unacked_packets--; + } + + conn->c_xmit_rm = rm; + } + + /* The transport either sends the whole rdma or none of it */ + if (rm->rdma.op_active && !conn->c_xmit_rdma_sent) { + rm->m_final_op = &rm->rdma; + ret = conn->c_trans->xmit_rdma(conn, &rm->rdma); + if (ret) + break; + conn->c_xmit_rdma_sent = 1; + + /* The transport owns the mapped memory for now. + * You can't unmap it while it's on the send queue */ + set_bit(RDS_MSG_MAPPED, &rm->m_flags); + } + + if (rm->atomic.op_active && !conn->c_xmit_atomic_sent) { + rm->m_final_op = &rm->atomic; + ret = conn->c_trans->xmit_atomic(conn, &rm->atomic); + if (ret) + break; + conn->c_xmit_atomic_sent = 1; + + /* The transport owns the mapped memory for now. + * You can't unmap it while it's on the send queue */ + set_bit(RDS_MSG_MAPPED, &rm->m_flags); + } + + /* + * A number of cases require an RDS header to be sent + * even if there is no data. + * We permit 0-byte sends; rds-ping depends on this. + * However, if there are exclusively attached silent ops, + * we skip the hdr/data send, to enable silent operation. + */ + if (rm->data.op_nents == 0) { + int ops_present; + int all_ops_are_silent = 1; + + ops_present = (rm->atomic.op_active || rm->rdma.op_active); + if (rm->atomic.op_active && !rm->atomic.op_silent) + all_ops_are_silent = 0; + if (rm->rdma.op_active && !rm->rdma.op_silent) + all_ops_are_silent = 0; + + if (ops_present && all_ops_are_silent + && !rm->m_rdma_cookie) + rm->data.op_active = 0; + } + + if (rm->data.op_active && !conn->c_xmit_data_sent) { + rm->m_final_op = &rm->data; + ret = conn->c_trans->xmit(conn, rm, + conn->c_xmit_hdr_off, + conn->c_xmit_sg, + conn->c_xmit_data_off); + if (ret <= 0) + break; + + if (conn->c_xmit_hdr_off < sizeof(struct rds_header)) { + tmp = min_t(int, ret, + sizeof(struct rds_header) - + conn->c_xmit_hdr_off); + conn->c_xmit_hdr_off += tmp; + ret -= tmp; + } + + sg = &rm->data.op_sg[conn->c_xmit_sg]; + while (ret) { + tmp = min_t(int, ret, sg->length - + conn->c_xmit_data_off); + conn->c_xmit_data_off += tmp; + ret -= tmp; + if (conn->c_xmit_data_off == sg->length) { + conn->c_xmit_data_off = 0; + sg++; + conn->c_xmit_sg++; + BUG_ON(ret != 0 && + conn->c_xmit_sg == rm->data.op_nents); + } + } + + if (conn->c_xmit_hdr_off == sizeof(struct rds_header) && + (conn->c_xmit_sg == rm->data.op_nents)) + conn->c_xmit_data_sent = 1; + } + + /* + * A rm will only take multiple times through this loop + * if there is a data op. Thus, if the data is sent (or there was + * none), then we're done with the rm. + */ + if (!rm->data.op_active || conn->c_xmit_data_sent) { + conn->c_xmit_rm = NULL; + conn->c_xmit_sg = 0; + conn->c_xmit_hdr_off = 0; + conn->c_xmit_data_off = 0; + conn->c_xmit_rdma_sent = 0; + conn->c_xmit_atomic_sent = 0; + conn->c_xmit_data_sent = 0; + + rds_message_put(rm); + } + } + + if (conn->c_trans->xmit_complete) + conn->c_trans->xmit_complete(conn); + + release_in_xmit(conn); + + /* Nuke any messages we decided not to retransmit. */ + if (!list_empty(&to_be_dropped)) { + /* irqs on here, so we can put(), unlike above */ + list_for_each_entry(rm, &to_be_dropped, m_conn_item) + rds_message_put(rm); + rds_send_remove_from_sock(&to_be_dropped, RDS_RDMA_DROPPED); + } + + /* + * Other senders can queue a message after we last test the send queue + * but before we clear RDS_IN_XMIT. In that case they'd back off and + * not try and send their newly queued message. We need to check the + * send queue after having cleared RDS_IN_XMIT so that their message + * doesn't get stuck on the send queue. + * + * If the transport cannot continue (i.e ret != 0), then it must + * call us when more room is available, such as from the tx + * completion handler. + */ + if (ret == 0) { + smp_mb(); + if (!list_empty(&conn->c_send_queue)) { + rds_stats_inc(s_send_lock_queue_raced); + goto restart; + } + } +out: + return ret; +} + +static void rds_send_sndbuf_remove(struct rds_sock *rs, struct rds_message *rm) +{ + u32 len = be32_to_cpu(rm->m_inc.i_hdr.h_len); + + assert_spin_locked(&rs->rs_lock); + + BUG_ON(rs->rs_snd_bytes < len); + rs->rs_snd_bytes -= len; + + if (rs->rs_snd_bytes == 0) + rds_stats_inc(s_send_queue_empty); +} + +static inline int rds_send_is_acked(struct rds_message *rm, u64 ack, + is_acked_func is_acked) +{ + if (is_acked) + return is_acked(rm, ack); + return be64_to_cpu(rm->m_inc.i_hdr.h_sequence) <= ack; +} + +/* + * This is pretty similar to what happens below in the ACK + * handling code - except that we call here as soon as we get + * the IB send completion on the RDMA op and the accompanying + * message. + */ +void rds_rdma_send_complete(struct rds_message *rm, int status) +{ + struct rds_sock *rs = NULL; + struct rm_rdma_op *ro; + struct rds_notifier *notifier; + unsigned long flags; + + spin_lock_irqsave(&rm->m_rs_lock, flags); + + ro = &rm->rdma; + if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags) && + ro->op_active && ro->op_notify && ro->op_notifier) { + notifier = ro->op_notifier; + rs = rm->m_rs; + sock_hold(rds_rs_to_sk(rs)); + + notifier->n_status = status; + spin_lock(&rs->rs_lock); + list_add_tail(¬ifier->n_list, &rs->rs_notify_queue); + spin_unlock(&rs->rs_lock); + + ro->op_notifier = NULL; + } + + spin_unlock_irqrestore(&rm->m_rs_lock, flags); + + if (rs) { + rds_wake_sk_sleep(rs); + sock_put(rds_rs_to_sk(rs)); + } +} +EXPORT_SYMBOL_GPL(rds_rdma_send_complete); + +/* + * Just like above, except looks at atomic op + */ +void rds_atomic_send_complete(struct rds_message *rm, int status) +{ + struct rds_sock *rs = NULL; + struct rm_atomic_op *ao; + struct rds_notifier *notifier; + unsigned long flags; + + spin_lock_irqsave(&rm->m_rs_lock, flags); + + ao = &rm->atomic; + if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags) + && ao->op_active && ao->op_notify && ao->op_notifier) { + notifier = ao->op_notifier; + rs = rm->m_rs; + sock_hold(rds_rs_to_sk(rs)); + + notifier->n_status = status; + spin_lock(&rs->rs_lock); + list_add_tail(¬ifier->n_list, &rs->rs_notify_queue); + spin_unlock(&rs->rs_lock); + + ao->op_notifier = NULL; + } + + spin_unlock_irqrestore(&rm->m_rs_lock, flags); + + if (rs) { + rds_wake_sk_sleep(rs); + sock_put(rds_rs_to_sk(rs)); + } +} +EXPORT_SYMBOL_GPL(rds_atomic_send_complete); + +/* + * This is the same as rds_rdma_send_complete except we + * don't do any locking - we have all the ingredients (message, + * socket, socket lock) and can just move the notifier. + */ +static inline void +__rds_send_complete(struct rds_sock *rs, struct rds_message *rm, int status) +{ + struct rm_rdma_op *ro; + struct rm_atomic_op *ao; + + ro = &rm->rdma; + if (ro->op_active && ro->op_notify && ro->op_notifier) { + ro->op_notifier->n_status = status; + list_add_tail(&ro->op_notifier->n_list, &rs->rs_notify_queue); + ro->op_notifier = NULL; + } + + ao = &rm->atomic; + if (ao->op_active && ao->op_notify && ao->op_notifier) { + ao->op_notifier->n_status = status; + list_add_tail(&ao->op_notifier->n_list, &rs->rs_notify_queue); + ao->op_notifier = NULL; + } + + /* No need to wake the app - caller does this */ +} + +/* + * This is called from the IB send completion when we detect + * a RDMA operation that failed with remote access error. + * So speed is not an issue here. + */ +struct rds_message *rds_send_get_message(struct rds_connection *conn, + struct rm_rdma_op *op) +{ + struct rds_message *rm, *tmp, *found = NULL; + unsigned long flags; + + spin_lock_irqsave(&conn->c_lock, flags); + + list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) { + if (&rm->rdma == op) { + atomic_inc(&rm->m_refcount); + found = rm; + goto out; + } + } + + list_for_each_entry_safe(rm, tmp, &conn->c_send_queue, m_conn_item) { + if (&rm->rdma == op) { + atomic_inc(&rm->m_refcount); + found = rm; + break; + } + } + +out: + spin_unlock_irqrestore(&conn->c_lock, flags); + + return found; +} +EXPORT_SYMBOL_GPL(rds_send_get_message); + +/* + * This removes messages from the socket's list if they're on it. The list + * argument must be private to the caller, we must be able to modify it + * without locks. The messages must have a reference held for their + * position on the list. This function will drop that reference after + * removing the messages from the 'messages' list regardless of if it found + * the messages on the socket list or not. + */ +static void rds_send_remove_from_sock(struct list_head *messages, int status) +{ + unsigned long flags; + struct rds_sock *rs = NULL; + struct rds_message *rm; + + while (!list_empty(messages)) { + int was_on_sock = 0; + + rm = list_entry(messages->next, struct rds_message, + m_conn_item); + list_del_init(&rm->m_conn_item); + + /* + * If we see this flag cleared then we're *sure* that someone + * else beat us to removing it from the sock. If we race + * with their flag update we'll get the lock and then really + * see that the flag has been cleared. + * + * The message spinlock makes sure nobody clears rm->m_rs + * while we're messing with it. It does not prevent the + * message from being removed from the socket, though. + */ + spin_lock_irqsave(&rm->m_rs_lock, flags); + if (!test_bit(RDS_MSG_ON_SOCK, &rm->m_flags)) + goto unlock_and_drop; + + if (rs != rm->m_rs) { + if (rs) { + rds_wake_sk_sleep(rs); + sock_put(rds_rs_to_sk(rs)); + } + rs = rm->m_rs; + if (rs) + sock_hold(rds_rs_to_sk(rs)); + } + if (!rs) + goto unlock_and_drop; + spin_lock(&rs->rs_lock); + + if (test_and_clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags)) { + struct rm_rdma_op *ro = &rm->rdma; + struct rds_notifier *notifier; + + list_del_init(&rm->m_sock_item); + rds_send_sndbuf_remove(rs, rm); + + if (ro->op_active && ro->op_notifier && + (ro->op_notify || (ro->op_recverr && status))) { + notifier = ro->op_notifier; + list_add_tail(¬ifier->n_list, + &rs->rs_notify_queue); + if (!notifier->n_status) + notifier->n_status = status; + rm->rdma.op_notifier = NULL; + } + was_on_sock = 1; + rm->m_rs = NULL; + } + spin_unlock(&rs->rs_lock); + +unlock_and_drop: + spin_unlock_irqrestore(&rm->m_rs_lock, flags); + rds_message_put(rm); + if (was_on_sock) + rds_message_put(rm); + } + + if (rs) { + rds_wake_sk_sleep(rs); + sock_put(rds_rs_to_sk(rs)); + } +} + +/* + * Transports call here when they've determined that the receiver queued + * messages up to, and including, the given sequence number. Messages are + * moved to the retrans queue when rds_send_xmit picks them off the send + * queue. This means that in the TCP case, the message may not have been + * assigned the m_ack_seq yet - but that's fine as long as tcp_is_acked + * checks the RDS_MSG_HAS_ACK_SEQ bit. + */ +void rds_send_drop_acked(struct rds_connection *conn, u64 ack, + is_acked_func is_acked) +{ + struct rds_message *rm, *tmp; + unsigned long flags; + LIST_HEAD(list); + + spin_lock_irqsave(&conn->c_lock, flags); + + list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) { + if (!rds_send_is_acked(rm, ack, is_acked)) + break; + + list_move(&rm->m_conn_item, &list); + clear_bit(RDS_MSG_ON_CONN, &rm->m_flags); + } + + /* order flag updates with spin locks */ + if (!list_empty(&list)) + smp_mb__after_atomic(); + + spin_unlock_irqrestore(&conn->c_lock, flags); + + /* now remove the messages from the sock list as needed */ + rds_send_remove_from_sock(&list, RDS_RDMA_SUCCESS); +} +EXPORT_SYMBOL_GPL(rds_send_drop_acked); + +void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest) +{ + struct rds_message *rm, *tmp; + struct rds_connection *conn; + unsigned long flags; + LIST_HEAD(list); + + /* get all the messages we're dropping under the rs lock */ + spin_lock_irqsave(&rs->rs_lock, flags); + + list_for_each_entry_safe(rm, tmp, &rs->rs_send_queue, m_sock_item) { + if (dest && (dest->sin_addr.s_addr != rm->m_daddr || + dest->sin_port != rm->m_inc.i_hdr.h_dport)) + continue; + + list_move(&rm->m_sock_item, &list); + rds_send_sndbuf_remove(rs, rm); + clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags); + } + + /* order flag updates with the rs lock */ + smp_mb__after_atomic(); + + spin_unlock_irqrestore(&rs->rs_lock, flags); + + if (list_empty(&list)) + return; + + /* Remove the messages from the conn */ + list_for_each_entry(rm, &list, m_sock_item) { + + conn = rm->m_inc.i_conn; + + spin_lock_irqsave(&conn->c_lock, flags); + /* + * Maybe someone else beat us to removing rm from the conn. + * If we race with their flag update we'll get the lock and + * then really see that the flag has been cleared. + */ + if (!test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) { + spin_unlock_irqrestore(&conn->c_lock, flags); + spin_lock_irqsave(&rm->m_rs_lock, flags); + rm->m_rs = NULL; + spin_unlock_irqrestore(&rm->m_rs_lock, flags); + continue; + } + list_del_init(&rm->m_conn_item); + spin_unlock_irqrestore(&conn->c_lock, flags); + + /* + * Couldn't grab m_rs_lock in top loop (lock ordering), + * but we can now. + */ + spin_lock_irqsave(&rm->m_rs_lock, flags); + + spin_lock(&rs->rs_lock); + __rds_send_complete(rs, rm, RDS_RDMA_CANCELED); + spin_unlock(&rs->rs_lock); + + rm->m_rs = NULL; + spin_unlock_irqrestore(&rm->m_rs_lock, flags); + + rds_message_put(rm); + } + + rds_wake_sk_sleep(rs); + + while (!list_empty(&list)) { + rm = list_entry(list.next, struct rds_message, m_sock_item); + list_del_init(&rm->m_sock_item); + + rds_message_wait(rm); + rds_message_put(rm); + } +} + +/* + * we only want this to fire once so we use the callers 'queued'. It's + * possible that another thread can race with us and remove the + * message from the flow with RDS_CANCEL_SENT_TO. + */ +static int rds_send_queue_rm(struct rds_sock *rs, struct rds_connection *conn, + struct rds_message *rm, __be16 sport, + __be16 dport, int *queued) +{ + unsigned long flags; + u32 len; + + if (*queued) + goto out; + + len = be32_to_cpu(rm->m_inc.i_hdr.h_len); + + /* this is the only place which holds both the socket's rs_lock + * and the connection's c_lock */ + spin_lock_irqsave(&rs->rs_lock, flags); + + /* + * If there is a little space in sndbuf, we don't queue anything, + * and userspace gets -EAGAIN. But poll() indicates there's send + * room. This can lead to bad behavior (spinning) if snd_bytes isn't + * freed up by incoming acks. So we check the *old* value of + * rs_snd_bytes here to allow the last msg to exceed the buffer, + * and poll() now knows no more data can be sent. + */ + if (rs->rs_snd_bytes < rds_sk_sndbuf(rs)) { + rs->rs_snd_bytes += len; + + /* let recv side know we are close to send space exhaustion. + * This is probably not the optimal way to do it, as this + * means we set the flag on *all* messages as soon as our + * throughput hits a certain threshold. + */ + if (rs->rs_snd_bytes >= rds_sk_sndbuf(rs) / 2) + __set_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags); + + list_add_tail(&rm->m_sock_item, &rs->rs_send_queue); + set_bit(RDS_MSG_ON_SOCK, &rm->m_flags); + rds_message_addref(rm); + rm->m_rs = rs; + + /* The code ordering is a little weird, but we're + trying to minimize the time we hold c_lock */ + rds_message_populate_header(&rm->m_inc.i_hdr, sport, dport, 0); + rm->m_inc.i_conn = conn; + rds_message_addref(rm); + + spin_lock(&conn->c_lock); + rm->m_inc.i_hdr.h_sequence = cpu_to_be64(conn->c_next_tx_seq++); + list_add_tail(&rm->m_conn_item, &conn->c_send_queue); + set_bit(RDS_MSG_ON_CONN, &rm->m_flags); + spin_unlock(&conn->c_lock); + + rdsdebug("queued msg %p len %d, rs %p bytes %d seq %llu\n", + rm, len, rs, rs->rs_snd_bytes, + (unsigned long long)be64_to_cpu(rm->m_inc.i_hdr.h_sequence)); + + *queued = 1; + } + + spin_unlock_irqrestore(&rs->rs_lock, flags); +out: + return *queued; +} + +/* + * rds_message is getting to be quite complicated, and we'd like to allocate + * it all in one go. This figures out how big it needs to be up front. + */ +static int rds_rm_size(struct msghdr *msg, int data_len) +{ + struct cmsghdr *cmsg; + int size = 0; + int cmsg_groups = 0; + int retval; + + for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) { + if (!CMSG_OK(msg, cmsg)) + return -EINVAL; + + if (cmsg->cmsg_level != SOL_RDS) + continue; + + switch (cmsg->cmsg_type) { + case RDS_CMSG_RDMA_ARGS: + cmsg_groups |= 1; + retval = rds_rdma_extra_size(CMSG_DATA(cmsg)); + if (retval < 0) + return retval; + size += retval; + + break; + + case RDS_CMSG_RDMA_DEST: + case RDS_CMSG_RDMA_MAP: + cmsg_groups |= 2; + /* these are valid but do no add any size */ + break; + + case RDS_CMSG_ATOMIC_CSWP: + case RDS_CMSG_ATOMIC_FADD: + case RDS_CMSG_MASKED_ATOMIC_CSWP: + case RDS_CMSG_MASKED_ATOMIC_FADD: + cmsg_groups |= 1; + size += sizeof(struct scatterlist); + break; + + default: + return -EINVAL; + } + + } + + size += ceil(data_len, PAGE_SIZE) * sizeof(struct scatterlist); + + /* Ensure (DEST, MAP) are never used with (ARGS, ATOMIC) */ + if (cmsg_groups == 3) + return -EINVAL; + + return size; +} + +static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm, + struct msghdr *msg, int *allocated_mr) +{ + struct cmsghdr *cmsg; + int ret = 0; + + for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) { + if (!CMSG_OK(msg, cmsg)) + return -EINVAL; + + if (cmsg->cmsg_level != SOL_RDS) + continue; + + /* As a side effect, RDMA_DEST and RDMA_MAP will set + * rm->rdma.m_rdma_cookie and rm->rdma.m_rdma_mr. + */ + switch (cmsg->cmsg_type) { + case RDS_CMSG_RDMA_ARGS: + ret = rds_cmsg_rdma_args(rs, rm, cmsg); + break; + + case RDS_CMSG_RDMA_DEST: + ret = rds_cmsg_rdma_dest(rs, rm, cmsg); + break; + + case RDS_CMSG_RDMA_MAP: + ret = rds_cmsg_rdma_map(rs, rm, cmsg); + if (!ret) + *allocated_mr = 1; + break; + case RDS_CMSG_ATOMIC_CSWP: + case RDS_CMSG_ATOMIC_FADD: + case RDS_CMSG_MASKED_ATOMIC_CSWP: + case RDS_CMSG_MASKED_ATOMIC_FADD: + ret = rds_cmsg_atomic(rs, rm, cmsg); + break; + + default: + return -EINVAL; + } + + if (ret) + break; + } + + return ret; +} + +int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, + size_t payload_len) +{ + struct sock *sk = sock->sk; + struct rds_sock *rs = rds_sk_to_rs(sk); + DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name); + __be32 daddr; + __be16 dport; + struct rds_message *rm = NULL; + struct rds_connection *conn; + int ret = 0; + int queued = 0, allocated_mr = 0; + int nonblock = msg->msg_flags & MSG_DONTWAIT; + long timeo = sock_sndtimeo(sk, nonblock); + + /* Mirror Linux UDP mirror of BSD error message compatibility */ + /* XXX: Perhaps MSG_MORE someday */ + if (msg->msg_flags & ~(MSG_DONTWAIT | MSG_CMSG_COMPAT)) { + ret = -EOPNOTSUPP; + goto out; + } + + if (msg->msg_namelen) { + /* XXX fail non-unicast destination IPs? */ + if (msg->msg_namelen < sizeof(*usin) || usin->sin_family != AF_INET) { + ret = -EINVAL; + goto out; + } + daddr = usin->sin_addr.s_addr; + dport = usin->sin_port; + } else { + /* We only care about consistency with ->connect() */ + lock_sock(sk); + daddr = rs->rs_conn_addr; + dport = rs->rs_conn_port; + release_sock(sk); + } + + /* racing with another thread binding seems ok here */ + if (daddr == 0 || rs->rs_bound_addr == 0) { + ret = -ENOTCONN; /* XXX not a great errno */ + goto out; + } + + /* size of rm including all sgs */ + ret = rds_rm_size(msg, payload_len); + if (ret < 0) + goto out; + + rm = rds_message_alloc(ret, GFP_KERNEL); + if (!rm) { + ret = -ENOMEM; + goto out; + } + + /* Attach data to the rm */ + if (payload_len) { + rm->data.op_sg = rds_message_alloc_sgs(rm, ceil(payload_len, PAGE_SIZE)); + if (!rm->data.op_sg) { + ret = -ENOMEM; + goto out; + } + ret = rds_message_copy_from_user(rm, msg->msg_iov, payload_len); + if (ret) + goto out; + } + rm->data.op_active = 1; + + rm->m_daddr = daddr; + + /* rds_conn_create has a spinlock that runs with IRQ off. + * Caching the conn in the socket helps a lot. */ + if (rs->rs_conn && rs->rs_conn->c_faddr == daddr) + conn = rs->rs_conn; + else { + conn = rds_conn_create_outgoing(rs->rs_bound_addr, daddr, + rs->rs_transport, + sock->sk->sk_allocation); + if (IS_ERR(conn)) { + ret = PTR_ERR(conn); + goto out; + } + rs->rs_conn = conn; + } + + /* Parse any control messages the user may have included. */ + ret = rds_cmsg_send(rs, rm, msg, &allocated_mr); + if (ret) + goto out; + + if (rm->rdma.op_active && !conn->c_trans->xmit_rdma) { + printk_ratelimited(KERN_NOTICE "rdma_op %p conn xmit_rdma %p\n", + &rm->rdma, conn->c_trans->xmit_rdma); + ret = -EOPNOTSUPP; + goto out; + } + + if (rm->atomic.op_active && !conn->c_trans->xmit_atomic) { + printk_ratelimited(KERN_NOTICE "atomic_op %p conn xmit_atomic %p\n", + &rm->atomic, conn->c_trans->xmit_atomic); + ret = -EOPNOTSUPP; + goto out; + } + + rds_conn_connect_if_down(conn); + + ret = rds_cong_wait(conn->c_fcong, dport, nonblock, rs); + if (ret) { + rs->rs_seen_congestion = 1; + goto out; + } + + while (!rds_send_queue_rm(rs, conn, rm, rs->rs_bound_port, + dport, &queued)) { + rds_stats_inc(s_send_queue_full); + /* XXX make sure this is reasonable */ + if (payload_len > rds_sk_sndbuf(rs)) { + ret = -EMSGSIZE; + goto out; + } + if (nonblock) { + ret = -EAGAIN; + goto out; + } + + timeo = wait_event_interruptible_timeout(*sk_sleep(sk), + rds_send_queue_rm(rs, conn, rm, + rs->rs_bound_port, + dport, + &queued), + timeo); + rdsdebug("sendmsg woke queued %d timeo %ld\n", queued, timeo); + if (timeo > 0 || timeo == MAX_SCHEDULE_TIMEOUT) + continue; + + ret = timeo; + if (ret == 0) + ret = -ETIMEDOUT; + goto out; + } + + /* + * By now we've committed to the send. We reuse rds_send_worker() + * to retry sends in the rds thread if the transport asks us to. + */ + rds_stats_inc(s_send_queued); + + if (!test_bit(RDS_LL_SEND_FULL, &conn->c_flags)) + rds_send_xmit(conn); + + rds_message_put(rm); + return payload_len; + +out: + /* If the user included a RDMA_MAP cmsg, we allocated a MR on the fly. + * If the sendmsg goes through, we keep the MR. If it fails with EAGAIN + * or in any other way, we need to destroy the MR again */ + if (allocated_mr) + rds_rdma_unuse(rs, rds_rdma_cookie_key(rm->m_rdma_cookie), 1); + + if (rm) + rds_message_put(rm); + return ret; +} + +/* + * Reply to a ping packet. + */ +int +rds_send_pong(struct rds_connection *conn, __be16 dport) +{ + struct rds_message *rm; + unsigned long flags; + int ret = 0; + + rm = rds_message_alloc(0, GFP_ATOMIC); + if (!rm) { + ret = -ENOMEM; + goto out; + } + + rm->m_daddr = conn->c_faddr; + rm->data.op_active = 1; + + rds_conn_connect_if_down(conn); + + ret = rds_cong_wait(conn->c_fcong, dport, 1, NULL); + if (ret) + goto out; + + spin_lock_irqsave(&conn->c_lock, flags); + list_add_tail(&rm->m_conn_item, &conn->c_send_queue); + set_bit(RDS_MSG_ON_CONN, &rm->m_flags); + rds_message_addref(rm); + rm->m_inc.i_conn = conn; + + rds_message_populate_header(&rm->m_inc.i_hdr, 0, dport, + conn->c_next_tx_seq); + conn->c_next_tx_seq++; + spin_unlock_irqrestore(&conn->c_lock, flags); + + rds_stats_inc(s_send_queued); + rds_stats_inc(s_send_pong); + + if (!test_bit(RDS_LL_SEND_FULL, &conn->c_flags)) + queue_delayed_work(rds_wq, &conn->c_send_w, 0); + + rds_message_put(rm); + return 0; + +out: + if (rm) + rds_message_put(rm); + return ret; +} diff --git a/net/rds/stats.c b/net/rds/stats.c new file mode 100644 index 0000000..73be187 --- /dev/null +++ b/net/rds/stats.c @@ -0,0 +1,152 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include +#include +#include +#include + +#include "rds.h" + +DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_statistics, rds_stats); +EXPORT_PER_CPU_SYMBOL_GPL(rds_stats); + +/* :.,$s/unsigned long\>.*\= sizeof(ctr.name)); + strncpy(ctr.name, names[i], sizeof(ctr.name) - 1); + ctr.name[sizeof(ctr.name) - 1] = '\0'; + ctr.value = values[i]; + + rds_info_copy(iter, &ctr, sizeof(ctr)); + } +} +EXPORT_SYMBOL_GPL(rds_stats_info_copy); + +/* + * This gives global counters across all the transports. The strings + * are copied in so that the tool doesn't need knowledge of the specific + * stats that we're exporting. Some are pretty implementation dependent + * and may change over time. That doesn't stop them from being useful. + * + * This is the only function in the chain that knows about the byte granular + * length in userspace. It converts it to number of stat entries that the + * rest of the functions operate in. + */ +static void rds_stats_info(struct socket *sock, unsigned int len, + struct rds_info_iterator *iter, + struct rds_info_lengths *lens) +{ + struct rds_statistics stats = {0, }; + uint64_t *src; + uint64_t *sum; + size_t i; + int cpu; + unsigned int avail; + + avail = len / sizeof(struct rds_info_counter); + + if (avail < ARRAY_SIZE(rds_stat_names)) { + avail = 0; + goto trans; + } + + for_each_online_cpu(cpu) { + src = (uint64_t *)&(per_cpu(rds_stats, cpu)); + sum = (uint64_t *)&stats; + for (i = 0; i < sizeof(stats) / sizeof(uint64_t); i++) + *(sum++) += *(src++); + } + + rds_stats_info_copy(iter, (uint64_t *)&stats, rds_stat_names, + ARRAY_SIZE(rds_stat_names)); + avail -= ARRAY_SIZE(rds_stat_names); + +trans: + lens->each = sizeof(struct rds_info_counter); + lens->nr = rds_trans_stats_info_copy(iter, avail) + + ARRAY_SIZE(rds_stat_names); +} + +void rds_stats_exit(void) +{ + rds_info_deregister_func(RDS_INFO_COUNTERS, rds_stats_info); +} + +int rds_stats_init(void) +{ + rds_info_register_func(RDS_INFO_COUNTERS, rds_stats_info); + return 0; +} diff --git a/net/rds/sysctl.c b/net/rds/sysctl.c new file mode 100644 index 0000000..c3b0cd4 --- /dev/null +++ b/net/rds/sysctl.c @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include +#include +#include + +#include "rds.h" + +static struct ctl_table_header *rds_sysctl_reg_table; + +static unsigned long rds_sysctl_reconnect_min = 1; +static unsigned long rds_sysctl_reconnect_max = ~0UL; + +unsigned long rds_sysctl_reconnect_min_jiffies; +unsigned long rds_sysctl_reconnect_max_jiffies = HZ; + +unsigned int rds_sysctl_max_unacked_packets = 8; +unsigned int rds_sysctl_max_unacked_bytes = (16 << 20); + +unsigned int rds_sysctl_ping_enable = 1; + +static struct ctl_table rds_sysctl_rds_table[] = { + { + .procname = "reconnect_min_delay_ms", + .data = &rds_sysctl_reconnect_min_jiffies, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_doulongvec_ms_jiffies_minmax, + .extra1 = &rds_sysctl_reconnect_min, + .extra2 = &rds_sysctl_reconnect_max_jiffies, + }, + { + .procname = "reconnect_max_delay_ms", + .data = &rds_sysctl_reconnect_max_jiffies, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_doulongvec_ms_jiffies_minmax, + .extra1 = &rds_sysctl_reconnect_min_jiffies, + .extra2 = &rds_sysctl_reconnect_max, + }, + { + .procname = "max_unacked_packets", + .data = &rds_sysctl_max_unacked_packets, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "max_unacked_bytes", + .data = &rds_sysctl_max_unacked_bytes, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "ping_enable", + .data = &rds_sysctl_ping_enable, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { } +}; + +void rds_sysctl_exit(void) +{ + unregister_net_sysctl_table(rds_sysctl_reg_table); +} + +int rds_sysctl_init(void) +{ + rds_sysctl_reconnect_min = msecs_to_jiffies(1); + rds_sysctl_reconnect_min_jiffies = rds_sysctl_reconnect_min; + + rds_sysctl_reg_table = register_net_sysctl(&init_net,"net/rds", rds_sysctl_rds_table); + if (!rds_sysctl_reg_table) + return -ENOMEM; + return 0; +} diff --git a/net/rds/tcp.c b/net/rds/tcp.c new file mode 100644 index 0000000..edac9ef --- /dev/null +++ b/net/rds/tcp.c @@ -0,0 +1,326 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include +#include +#include +#include +#include + +#include "rds.h" +#include "tcp.h" + +/* only for info exporting */ +static DEFINE_SPINLOCK(rds_tcp_tc_list_lock); +static LIST_HEAD(rds_tcp_tc_list); +static unsigned int rds_tcp_tc_count; + +/* Track rds_tcp_connection structs so they can be cleaned up */ +static DEFINE_SPINLOCK(rds_tcp_conn_lock); +static LIST_HEAD(rds_tcp_conn_list); + +static struct kmem_cache *rds_tcp_conn_slab; + +#define RDS_TCP_DEFAULT_BUFSIZE (128 * 1024) + +/* doing it this way avoids calling tcp_sk() */ +void rds_tcp_nonagle(struct socket *sock) +{ + mm_segment_t oldfs = get_fs(); + int val = 1; + + set_fs(KERNEL_DS); + sock->ops->setsockopt(sock, SOL_TCP, TCP_NODELAY, (char __user *)&val, + sizeof(val)); + set_fs(oldfs); +} + +void rds_tcp_tune(struct socket *sock) +{ + struct sock *sk = sock->sk; + + rds_tcp_nonagle(sock); + + /* + * We're trying to saturate gigabit with the default, + * see svc_sock_setbufsize(). + */ + lock_sock(sk); + sk->sk_sndbuf = RDS_TCP_DEFAULT_BUFSIZE; + sk->sk_rcvbuf = RDS_TCP_DEFAULT_BUFSIZE; + sk->sk_userlocks |= SOCK_SNDBUF_LOCK|SOCK_RCVBUF_LOCK; + release_sock(sk); +} + +u32 rds_tcp_snd_nxt(struct rds_tcp_connection *tc) +{ + return tcp_sk(tc->t_sock->sk)->snd_nxt; +} + +u32 rds_tcp_snd_una(struct rds_tcp_connection *tc) +{ + return tcp_sk(tc->t_sock->sk)->snd_una; +} + +void rds_tcp_restore_callbacks(struct socket *sock, + struct rds_tcp_connection *tc) +{ + rdsdebug("restoring sock %p callbacks from tc %p\n", sock, tc); + write_lock_bh(&sock->sk->sk_callback_lock); + + /* done under the callback_lock to serialize with write_space */ + spin_lock(&rds_tcp_tc_list_lock); + list_del_init(&tc->t_list_item); + rds_tcp_tc_count--; + spin_unlock(&rds_tcp_tc_list_lock); + + tc->t_sock = NULL; + + sock->sk->sk_write_space = tc->t_orig_write_space; + sock->sk->sk_data_ready = tc->t_orig_data_ready; + sock->sk->sk_state_change = tc->t_orig_state_change; + sock->sk->sk_user_data = NULL; + + write_unlock_bh(&sock->sk->sk_callback_lock); +} + +/* + * This is the only path that sets tc->t_sock. Send and receive trust that + * it is set. The RDS_CONN_CONNECTED bit protects those paths from being + * called while it isn't set. + */ +void rds_tcp_set_callbacks(struct socket *sock, struct rds_connection *conn) +{ + struct rds_tcp_connection *tc = conn->c_transport_data; + + rdsdebug("setting sock %p callbacks to tc %p\n", sock, tc); + write_lock_bh(&sock->sk->sk_callback_lock); + + /* done under the callback_lock to serialize with write_space */ + spin_lock(&rds_tcp_tc_list_lock); + list_add_tail(&tc->t_list_item, &rds_tcp_tc_list); + rds_tcp_tc_count++; + spin_unlock(&rds_tcp_tc_list_lock); + + /* accepted sockets need our listen data ready undone */ + if (sock->sk->sk_data_ready == rds_tcp_listen_data_ready) + sock->sk->sk_data_ready = sock->sk->sk_user_data; + + tc->t_sock = sock; + tc->conn = conn; + tc->t_orig_data_ready = sock->sk->sk_data_ready; + tc->t_orig_write_space = sock->sk->sk_write_space; + tc->t_orig_state_change = sock->sk->sk_state_change; + + sock->sk->sk_user_data = conn; + sock->sk->sk_data_ready = rds_tcp_data_ready; + sock->sk->sk_write_space = rds_tcp_write_space; + sock->sk->sk_state_change = rds_tcp_state_change; + + write_unlock_bh(&sock->sk->sk_callback_lock); +} + +static void rds_tcp_tc_info(struct socket *sock, unsigned int len, + struct rds_info_iterator *iter, + struct rds_info_lengths *lens) +{ + struct rds_info_tcp_socket tsinfo; + struct rds_tcp_connection *tc; + unsigned long flags; + struct sockaddr_in sin; + int sinlen; + + spin_lock_irqsave(&rds_tcp_tc_list_lock, flags); + + if (len / sizeof(tsinfo) < rds_tcp_tc_count) + goto out; + + list_for_each_entry(tc, &rds_tcp_tc_list, t_list_item) { + + sock->ops->getname(sock, (struct sockaddr *)&sin, &sinlen, 0); + tsinfo.local_addr = sin.sin_addr.s_addr; + tsinfo.local_port = sin.sin_port; + sock->ops->getname(sock, (struct sockaddr *)&sin, &sinlen, 1); + tsinfo.peer_addr = sin.sin_addr.s_addr; + tsinfo.peer_port = sin.sin_port; + + tsinfo.hdr_rem = tc->t_tinc_hdr_rem; + tsinfo.data_rem = tc->t_tinc_data_rem; + tsinfo.last_sent_nxt = tc->t_last_sent_nxt; + tsinfo.last_expected_una = tc->t_last_expected_una; + tsinfo.last_seen_una = tc->t_last_seen_una; + + rds_info_copy(iter, &tsinfo, sizeof(tsinfo)); + } + +out: + lens->nr = rds_tcp_tc_count; + lens->each = sizeof(tsinfo); + + spin_unlock_irqrestore(&rds_tcp_tc_list_lock, flags); +} + +static int rds_tcp_laddr_check(__be32 addr) +{ + if (inet_addr_type(&init_net, addr) == RTN_LOCAL) + return 0; + return -EADDRNOTAVAIL; +} + +static int rds_tcp_conn_alloc(struct rds_connection *conn, gfp_t gfp) +{ + struct rds_tcp_connection *tc; + + tc = kmem_cache_alloc(rds_tcp_conn_slab, gfp); + if (!tc) + return -ENOMEM; + + tc->t_sock = NULL; + tc->t_tinc = NULL; + tc->t_tinc_hdr_rem = sizeof(struct rds_header); + tc->t_tinc_data_rem = 0; + + conn->c_transport_data = tc; + + spin_lock_irq(&rds_tcp_conn_lock); + list_add_tail(&tc->t_tcp_node, &rds_tcp_conn_list); + spin_unlock_irq(&rds_tcp_conn_lock); + + rdsdebug("alloced tc %p\n", conn->c_transport_data); + return 0; +} + +static void rds_tcp_conn_free(void *arg) +{ + struct rds_tcp_connection *tc = arg; + unsigned long flags; + rdsdebug("freeing tc %p\n", tc); + + spin_lock_irqsave(&rds_tcp_conn_lock, flags); + list_del(&tc->t_tcp_node); + spin_unlock_irqrestore(&rds_tcp_conn_lock, flags); + + kmem_cache_free(rds_tcp_conn_slab, tc); +} + +static void rds_tcp_destroy_conns(void) +{ + struct rds_tcp_connection *tc, *_tc; + LIST_HEAD(tmp_list); + + /* avoid calling conn_destroy with irqs off */ + spin_lock_irq(&rds_tcp_conn_lock); + list_splice(&rds_tcp_conn_list, &tmp_list); + INIT_LIST_HEAD(&rds_tcp_conn_list); + spin_unlock_irq(&rds_tcp_conn_lock); + + list_for_each_entry_safe(tc, _tc, &tmp_list, t_tcp_node) { + if (tc->conn->c_passive) + rds_conn_destroy(tc->conn->c_passive); + rds_conn_destroy(tc->conn); + } +} + +static void rds_tcp_exit(void) +{ + rds_info_deregister_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info); + rds_tcp_listen_stop(); + rds_tcp_destroy_conns(); + rds_trans_unregister(&rds_tcp_transport); + rds_tcp_recv_exit(); + kmem_cache_destroy(rds_tcp_conn_slab); +} +module_exit(rds_tcp_exit); + +struct rds_transport rds_tcp_transport = { + .laddr_check = rds_tcp_laddr_check, + .xmit_prepare = rds_tcp_xmit_prepare, + .xmit_complete = rds_tcp_xmit_complete, + .xmit = rds_tcp_xmit, + .recv = rds_tcp_recv, + .conn_alloc = rds_tcp_conn_alloc, + .conn_free = rds_tcp_conn_free, + .conn_connect = rds_tcp_conn_connect, + .conn_shutdown = rds_tcp_conn_shutdown, + .inc_copy_to_user = rds_tcp_inc_copy_to_user, + .inc_free = rds_tcp_inc_free, + .stats_info_copy = rds_tcp_stats_info_copy, + .exit = rds_tcp_exit, + .t_owner = THIS_MODULE, + .t_name = "tcp", + .t_type = RDS_TRANS_TCP, + .t_prefer_loopback = 1, +}; + +static int rds_tcp_init(void) +{ + int ret; + + rds_tcp_conn_slab = kmem_cache_create("rds_tcp_connection", + sizeof(struct rds_tcp_connection), + 0, 0, NULL); + if (!rds_tcp_conn_slab) { + ret = -ENOMEM; + goto out; + } + + ret = rds_tcp_recv_init(); + if (ret) + goto out_slab; + + ret = rds_trans_register(&rds_tcp_transport); + if (ret) + goto out_recv; + + ret = rds_tcp_listen_init(); + if (ret) + goto out_register; + + rds_info_register_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info); + + goto out; + +out_register: + rds_trans_unregister(&rds_tcp_transport); +out_recv: + rds_tcp_recv_exit(); +out_slab: + kmem_cache_destroy(rds_tcp_conn_slab); +out: + return ret; +} +module_init(rds_tcp_init); + +MODULE_AUTHOR("Oracle Corporation "); +MODULE_DESCRIPTION("RDS: TCP transport"); +MODULE_LICENSE("Dual BSD/GPL"); + diff --git a/net/rds/tcp.h b/net/rds/tcp.h new file mode 100644 index 0000000..6563749 --- /dev/null +++ b/net/rds/tcp.h @@ -0,0 +1,88 @@ +#ifndef _RDS_TCP_H +#define _RDS_TCP_H + +#define RDS_TCP_PORT 16385 + +struct rds_tcp_incoming { + struct rds_incoming ti_inc; + struct sk_buff_head ti_skb_list; +}; + +struct rds_tcp_connection { + + struct list_head t_tcp_node; + struct rds_connection *conn; + struct socket *t_sock; + void *t_orig_write_space; + void *t_orig_data_ready; + void *t_orig_state_change; + + struct rds_tcp_incoming *t_tinc; + size_t t_tinc_hdr_rem; + size_t t_tinc_data_rem; + + /* XXX error report? */ + struct work_struct t_conn_w; + struct work_struct t_send_w; + struct work_struct t_down_w; + struct work_struct t_recv_w; + + /* for info exporting only */ + struct list_head t_list_item; + u32 t_last_sent_nxt; + u32 t_last_expected_una; + u32 t_last_seen_una; +}; + +struct rds_tcp_statistics { + uint64_t s_tcp_data_ready_calls; + uint64_t s_tcp_write_space_calls; + uint64_t s_tcp_sndbuf_full; + uint64_t s_tcp_connect_raced; + uint64_t s_tcp_listen_closed_stale; +}; + +/* tcp.c */ +void rds_tcp_tune(struct socket *sock); +void rds_tcp_nonagle(struct socket *sock); +void rds_tcp_set_callbacks(struct socket *sock, struct rds_connection *conn); +void rds_tcp_restore_callbacks(struct socket *sock, + struct rds_tcp_connection *tc); +u32 rds_tcp_snd_nxt(struct rds_tcp_connection *tc); +u32 rds_tcp_snd_una(struct rds_tcp_connection *tc); +u64 rds_tcp_map_seq(struct rds_tcp_connection *tc, u32 seq); +extern struct rds_transport rds_tcp_transport; + +/* tcp_connect.c */ +int rds_tcp_conn_connect(struct rds_connection *conn); +void rds_tcp_conn_shutdown(struct rds_connection *conn); +void rds_tcp_state_change(struct sock *sk); + +/* tcp_listen.c */ +int rds_tcp_listen_init(void); +void rds_tcp_listen_stop(void); +void rds_tcp_listen_data_ready(struct sock *sk); + +/* tcp_recv.c */ +int rds_tcp_recv_init(void); +void rds_tcp_recv_exit(void); +void rds_tcp_data_ready(struct sock *sk); +int rds_tcp_recv(struct rds_connection *conn); +void rds_tcp_inc_free(struct rds_incoming *inc); +int rds_tcp_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov, + size_t size); + +/* tcp_send.c */ +void rds_tcp_xmit_prepare(struct rds_connection *conn); +void rds_tcp_xmit_complete(struct rds_connection *conn); +int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm, + unsigned int hdr_off, unsigned int sg, unsigned int off); +void rds_tcp_write_space(struct sock *sk); + +/* tcp_stats.c */ +DECLARE_PER_CPU(struct rds_tcp_statistics, rds_tcp_stats); +#define rds_tcp_stats_inc(member) rds_stats_inc_which(rds_tcp_stats, member) +unsigned int rds_tcp_stats_info_copy(struct rds_info_iterator *iter, + unsigned int avail); + +#endif diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c new file mode 100644 index 0000000..f9f564a --- /dev/null +++ b/net/rds/tcp_connect.c @@ -0,0 +1,155 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include +#include +#include + +#include "rds.h" +#include "tcp.h" + +void rds_tcp_state_change(struct sock *sk) +{ + void (*state_change)(struct sock *sk); + struct rds_connection *conn; + struct rds_tcp_connection *tc; + + read_lock(&sk->sk_callback_lock); + conn = sk->sk_user_data; + if (!conn) { + state_change = sk->sk_state_change; + goto out; + } + tc = conn->c_transport_data; + state_change = tc->t_orig_state_change; + + rdsdebug("sock %p state_change to %d\n", tc->t_sock, sk->sk_state); + + switch(sk->sk_state) { + /* ignore connecting sockets as they make progress */ + case TCP_SYN_SENT: + case TCP_SYN_RECV: + break; + case TCP_ESTABLISHED: + rds_connect_complete(conn); + break; + case TCP_CLOSE: + rds_conn_drop(conn); + default: + break; + } +out: + read_unlock(&sk->sk_callback_lock); + state_change(sk); +} + +int rds_tcp_conn_connect(struct rds_connection *conn) +{ + struct socket *sock = NULL; + struct sockaddr_in src, dest; + int ret; + + ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock); + if (ret < 0) + goto out; + + rds_tcp_tune(sock); + + src.sin_family = AF_INET; + src.sin_addr.s_addr = (__force u32)conn->c_laddr; + src.sin_port = (__force u16)htons(0); + + ret = sock->ops->bind(sock, (struct sockaddr *)&src, sizeof(src)); + if (ret) { + rdsdebug("bind failed with %d at address %pI4\n", + ret, &conn->c_laddr); + goto out; + } + + dest.sin_family = AF_INET; + dest.sin_addr.s_addr = (__force u32)conn->c_faddr; + dest.sin_port = (__force u16)htons(RDS_TCP_PORT); + + /* + * once we call connect() we can start getting callbacks and they + * own the socket + */ + rds_tcp_set_callbacks(sock, conn); + ret = sock->ops->connect(sock, (struct sockaddr *)&dest, sizeof(dest), + O_NONBLOCK); + + rdsdebug("connect to address %pI4 returned %d\n", &conn->c_faddr, ret); + if (ret == -EINPROGRESS) + ret = 0; + if (ret == 0) + sock = NULL; + else + rds_tcp_restore_callbacks(sock, conn->c_transport_data); + +out: + if (sock) + sock_release(sock); + return ret; +} + +/* + * Before killing the tcp socket this needs to serialize with callbacks. The + * caller has already grabbed the sending sem so we're serialized with other + * senders. + * + * TCP calls the callbacks with the sock lock so we hold it while we reset the + * callbacks to those set by TCP. Our callbacks won't execute again once we + * hold the sock lock. + */ +void rds_tcp_conn_shutdown(struct rds_connection *conn) +{ + struct rds_tcp_connection *tc = conn->c_transport_data; + struct socket *sock = tc->t_sock; + + rdsdebug("shutting down conn %p tc %p sock %p\n", conn, tc, sock); + + if (sock) { + sock->ops->shutdown(sock, RCV_SHUTDOWN | SEND_SHUTDOWN); + lock_sock(sock->sk); + rds_tcp_restore_callbacks(sock, tc); /* tc->tc_sock = NULL */ + + release_sock(sock->sk); + sock_release(sock); + } + + if (tc->t_tinc) { + rds_inc_put(&tc->t_tinc->ti_inc); + tc->t_tinc = NULL; + } + tc->t_tinc_hdr_rem = sizeof(struct rds_header); + tc->t_tinc_data_rem = 0; +} diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c new file mode 100644 index 0000000..23ab4dc --- /dev/null +++ b/net/rds/tcp_listen.c @@ -0,0 +1,200 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include +#include +#include +#include + +#include "rds.h" +#include "tcp.h" + +/* + * cheesy, but simple.. + */ +static void rds_tcp_accept_worker(struct work_struct *work); +static DECLARE_WORK(rds_tcp_listen_work, rds_tcp_accept_worker); +static struct socket *rds_tcp_listen_sock; + +static int rds_tcp_accept_one(struct socket *sock) +{ + struct socket *new_sock = NULL; + struct rds_connection *conn; + int ret; + struct inet_sock *inet; + + ret = sock_create_lite(sock->sk->sk_family, sock->sk->sk_type, + sock->sk->sk_protocol, &new_sock); + if (ret) + goto out; + + new_sock->type = sock->type; + new_sock->ops = sock->ops; + ret = sock->ops->accept(sock, new_sock, O_NONBLOCK); + if (ret < 0) + goto out; + + rds_tcp_tune(new_sock); + + inet = inet_sk(new_sock->sk); + + rdsdebug("accepted tcp %pI4:%u -> %pI4:%u\n", + &inet->inet_saddr, ntohs(inet->inet_sport), + &inet->inet_daddr, ntohs(inet->inet_dport)); + + conn = rds_conn_create(inet->inet_saddr, inet->inet_daddr, + &rds_tcp_transport, GFP_KERNEL); + if (IS_ERR(conn)) { + ret = PTR_ERR(conn); + goto out; + } + + /* + * see the comment above rds_queue_delayed_reconnect() + */ + if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING)) { + if (rds_conn_state(conn) == RDS_CONN_UP) + rds_tcp_stats_inc(s_tcp_listen_closed_stale); + else + rds_tcp_stats_inc(s_tcp_connect_raced); + rds_conn_drop(conn); + ret = 0; + goto out; + } + + rds_tcp_set_callbacks(new_sock, conn); + rds_connect_complete(conn); + new_sock = NULL; + ret = 0; + +out: + if (new_sock) + sock_release(new_sock); + return ret; +} + +static void rds_tcp_accept_worker(struct work_struct *work) +{ + while (rds_tcp_accept_one(rds_tcp_listen_sock) == 0) + cond_resched(); +} + +void rds_tcp_listen_data_ready(struct sock *sk) +{ + void (*ready)(struct sock *sk); + + rdsdebug("listen data ready sk %p\n", sk); + + read_lock(&sk->sk_callback_lock); + ready = sk->sk_user_data; + if (!ready) { /* check for teardown race */ + ready = sk->sk_data_ready; + goto out; + } + + /* + * ->sk_data_ready is also called for a newly established child socket + * before it has been accepted and the accepter has set up their + * data_ready.. we only want to queue listen work for our listening + * socket + */ + if (sk->sk_state == TCP_LISTEN) + queue_work(rds_wq, &rds_tcp_listen_work); + +out: + read_unlock(&sk->sk_callback_lock); + ready(sk); +} + +int rds_tcp_listen_init(void) +{ + struct sockaddr_in sin; + struct socket *sock = NULL; + int ret; + + ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock); + if (ret < 0) + goto out; + + sock->sk->sk_reuse = SK_CAN_REUSE; + rds_tcp_nonagle(sock); + + write_lock_bh(&sock->sk->sk_callback_lock); + sock->sk->sk_user_data = sock->sk->sk_data_ready; + sock->sk->sk_data_ready = rds_tcp_listen_data_ready; + write_unlock_bh(&sock->sk->sk_callback_lock); + + sin.sin_family = PF_INET; + sin.sin_addr.s_addr = (__force u32)htonl(INADDR_ANY); + sin.sin_port = (__force u16)htons(RDS_TCP_PORT); + + ret = sock->ops->bind(sock, (struct sockaddr *)&sin, sizeof(sin)); + if (ret < 0) + goto out; + + ret = sock->ops->listen(sock, 64); + if (ret < 0) + goto out; + + rds_tcp_listen_sock = sock; + sock = NULL; +out: + if (sock) + sock_release(sock); + return ret; +} + +void rds_tcp_listen_stop(void) +{ + struct socket *sock = rds_tcp_listen_sock; + struct sock *sk; + + if (!sock) + return; + + sk = sock->sk; + + /* serialize with and prevent further callbacks */ + lock_sock(sk); + write_lock_bh(&sk->sk_callback_lock); + if (sk->sk_user_data) { + sk->sk_data_ready = sk->sk_user_data; + sk->sk_user_data = NULL; + } + write_unlock_bh(&sk->sk_callback_lock); + release_sock(sk); + + /* wait for accepts to stop and close the socket */ + flush_workqueue(rds_wq); + sock_release(sock); + rds_tcp_listen_sock = NULL; +} diff --git a/net/rds/tcp_recv.c b/net/rds/tcp_recv.c new file mode 100644 index 0000000..9ae6e0a --- /dev/null +++ b/net/rds/tcp_recv.c @@ -0,0 +1,356 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include +#include +#include + +#include "rds.h" +#include "tcp.h" + +static struct kmem_cache *rds_tcp_incoming_slab; + +static void rds_tcp_inc_purge(struct rds_incoming *inc) +{ + struct rds_tcp_incoming *tinc; + tinc = container_of(inc, struct rds_tcp_incoming, ti_inc); + rdsdebug("purging tinc %p inc %p\n", tinc, inc); + skb_queue_purge(&tinc->ti_skb_list); +} + +void rds_tcp_inc_free(struct rds_incoming *inc) +{ + struct rds_tcp_incoming *tinc; + tinc = container_of(inc, struct rds_tcp_incoming, ti_inc); + rds_tcp_inc_purge(inc); + rdsdebug("freeing tinc %p inc %p\n", tinc, inc); + kmem_cache_free(rds_tcp_incoming_slab, tinc); +} + +/* + * this is pretty lame, but, whatever. + */ +int rds_tcp_inc_copy_to_user(struct rds_incoming *inc, struct iovec *first_iov, + size_t size) +{ + struct rds_tcp_incoming *tinc; + struct iovec *iov, tmp; + struct sk_buff *skb; + unsigned long to_copy, skb_off; + int ret = 0; + + if (size == 0) + goto out; + + tinc = container_of(inc, struct rds_tcp_incoming, ti_inc); + iov = first_iov; + tmp = *iov; + + skb_queue_walk(&tinc->ti_skb_list, skb) { + skb_off = 0; + while (skb_off < skb->len) { + while (tmp.iov_len == 0) { + iov++; + tmp = *iov; + } + + to_copy = min(tmp.iov_len, size); + to_copy = min(to_copy, skb->len - skb_off); + + rdsdebug("ret %d size %zu skb %p skb_off %lu " + "skblen %d iov_base %p iov_len %zu cpy %lu\n", + ret, size, skb, skb_off, skb->len, + tmp.iov_base, tmp.iov_len, to_copy); + + /* modifies tmp as it copies */ + if (skb_copy_datagram_iovec(skb, skb_off, &tmp, + to_copy)) { + ret = -EFAULT; + goto out; + } + + rds_stats_add(s_copy_to_user, to_copy); + size -= to_copy; + ret += to_copy; + skb_off += to_copy; + if (size == 0) + goto out; + } + } +out: + return ret; +} + +/* + * We have a series of skbs that have fragmented pieces of the congestion + * bitmap. They must add up to the exact size of the congestion bitmap. We + * use the skb helpers to copy those into the pages that make up the in-memory + * congestion bitmap for the remote address of this connection. We then tell + * the congestion core that the bitmap has been changed so that it can wake up + * sleepers. + * + * This is racing with sending paths which are using test_bit to see if the + * bitmap indicates that their recipient is congested. + */ + +static void rds_tcp_cong_recv(struct rds_connection *conn, + struct rds_tcp_incoming *tinc) +{ + struct sk_buff *skb; + unsigned int to_copy, skb_off; + unsigned int map_off; + unsigned int map_page; + struct rds_cong_map *map; + int ret; + + /* catch completely corrupt packets */ + if (be32_to_cpu(tinc->ti_inc.i_hdr.h_len) != RDS_CONG_MAP_BYTES) + return; + + map_page = 0; + map_off = 0; + map = conn->c_fcong; + + skb_queue_walk(&tinc->ti_skb_list, skb) { + skb_off = 0; + while (skb_off < skb->len) { + to_copy = min_t(unsigned int, PAGE_SIZE - map_off, + skb->len - skb_off); + + BUG_ON(map_page >= RDS_CONG_MAP_PAGES); + + /* only returns 0 or -error */ + ret = skb_copy_bits(skb, skb_off, + (void *)map->m_page_addrs[map_page] + map_off, + to_copy); + BUG_ON(ret != 0); + + skb_off += to_copy; + map_off += to_copy; + if (map_off == PAGE_SIZE) { + map_off = 0; + map_page++; + } + } + } + + rds_cong_map_updated(map, ~(u64) 0); +} + +struct rds_tcp_desc_arg { + struct rds_connection *conn; + gfp_t gfp; +}; + +static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb, + unsigned int offset, size_t len) +{ + struct rds_tcp_desc_arg *arg = desc->arg.data; + struct rds_connection *conn = arg->conn; + struct rds_tcp_connection *tc = conn->c_transport_data; + struct rds_tcp_incoming *tinc = tc->t_tinc; + struct sk_buff *clone; + size_t left = len, to_copy; + + rdsdebug("tcp data tc %p skb %p offset %u len %zu\n", tc, skb, offset, + len); + + /* + * tcp_read_sock() interprets partial progress as an indication to stop + * processing. + */ + while (left) { + if (!tinc) { + tinc = kmem_cache_alloc(rds_tcp_incoming_slab, + arg->gfp); + if (!tinc) { + desc->error = -ENOMEM; + goto out; + } + tc->t_tinc = tinc; + rdsdebug("alloced tinc %p\n", tinc); + rds_inc_init(&tinc->ti_inc, conn, conn->c_faddr); + /* + * XXX * we might be able to use the __ variants when + * we've already serialized at a higher level. + */ + skb_queue_head_init(&tinc->ti_skb_list); + } + + if (left && tc->t_tinc_hdr_rem) { + to_copy = min(tc->t_tinc_hdr_rem, left); + rdsdebug("copying %zu header from skb %p\n", to_copy, + skb); + skb_copy_bits(skb, offset, + (char *)&tinc->ti_inc.i_hdr + + sizeof(struct rds_header) - + tc->t_tinc_hdr_rem, + to_copy); + tc->t_tinc_hdr_rem -= to_copy; + left -= to_copy; + offset += to_copy; + + if (tc->t_tinc_hdr_rem == 0) { + /* could be 0 for a 0 len message */ + tc->t_tinc_data_rem = + be32_to_cpu(tinc->ti_inc.i_hdr.h_len); + } + } + + if (left && tc->t_tinc_data_rem) { + clone = skb_clone(skb, arg->gfp); + if (!clone) { + desc->error = -ENOMEM; + goto out; + } + + to_copy = min(tc->t_tinc_data_rem, left); + pskb_pull(clone, offset); + pskb_trim(clone, to_copy); + skb_queue_tail(&tinc->ti_skb_list, clone); + + rdsdebug("skb %p data %p len %d off %u to_copy %zu -> " + "clone %p data %p len %d\n", + skb, skb->data, skb->len, offset, to_copy, + clone, clone->data, clone->len); + + tc->t_tinc_data_rem -= to_copy; + left -= to_copy; + offset += to_copy; + } + + if (tc->t_tinc_hdr_rem == 0 && tc->t_tinc_data_rem == 0) { + if (tinc->ti_inc.i_hdr.h_flags == RDS_FLAG_CONG_BITMAP) + rds_tcp_cong_recv(conn, tinc); + else + rds_recv_incoming(conn, conn->c_faddr, + conn->c_laddr, &tinc->ti_inc, + arg->gfp); + + tc->t_tinc_hdr_rem = sizeof(struct rds_header); + tc->t_tinc_data_rem = 0; + tc->t_tinc = NULL; + rds_inc_put(&tinc->ti_inc); + tinc = NULL; + } + } +out: + rdsdebug("returning len %zu left %zu skb len %d rx queue depth %d\n", + len, left, skb->len, + skb_queue_len(&tc->t_sock->sk->sk_receive_queue)); + return len - left; +} + +/* the caller has to hold the sock lock */ +static int rds_tcp_read_sock(struct rds_connection *conn, gfp_t gfp) +{ + struct rds_tcp_connection *tc = conn->c_transport_data; + struct socket *sock = tc->t_sock; + read_descriptor_t desc; + struct rds_tcp_desc_arg arg; + + /* It's like glib in the kernel! */ + arg.conn = conn; + arg.gfp = gfp; + desc.arg.data = &arg; + desc.error = 0; + desc.count = 1; /* give more than one skb per call */ + + tcp_read_sock(sock->sk, &desc, rds_tcp_data_recv); + rdsdebug("tcp_read_sock for tc %p gfp 0x%x returned %d\n", tc, gfp, + desc.error); + + return desc.error; +} + +/* + * We hold the sock lock to serialize our rds_tcp_recv->tcp_read_sock from + * data_ready. + * + * if we fail to allocate we're in trouble.. blindly wait some time before + * trying again to see if the VM can free up something for us. + */ +int rds_tcp_recv(struct rds_connection *conn) +{ + struct rds_tcp_connection *tc = conn->c_transport_data; + struct socket *sock = tc->t_sock; + int ret = 0; + + rdsdebug("recv worker conn %p tc %p sock %p\n", conn, tc, sock); + + lock_sock(sock->sk); + ret = rds_tcp_read_sock(conn, GFP_KERNEL); + release_sock(sock->sk); + + return ret; +} + +void rds_tcp_data_ready(struct sock *sk) +{ + void (*ready)(struct sock *sk); + struct rds_connection *conn; + struct rds_tcp_connection *tc; + + rdsdebug("data ready sk %p\n", sk); + + read_lock(&sk->sk_callback_lock); + conn = sk->sk_user_data; + if (!conn) { /* check for teardown race */ + ready = sk->sk_data_ready; + goto out; + } + + tc = conn->c_transport_data; + ready = tc->t_orig_data_ready; + rds_tcp_stats_inc(s_tcp_data_ready_calls); + + if (rds_tcp_read_sock(conn, GFP_ATOMIC) == -ENOMEM) + queue_delayed_work(rds_wq, &conn->c_recv_w, 0); +out: + read_unlock(&sk->sk_callback_lock); + ready(sk); +} + +int rds_tcp_recv_init(void) +{ + rds_tcp_incoming_slab = kmem_cache_create("rds_tcp_incoming", + sizeof(struct rds_tcp_incoming), + 0, 0, NULL); + if (!rds_tcp_incoming_slab) + return -ENOMEM; + return 0; +} + +void rds_tcp_recv_exit(void) +{ + kmem_cache_destroy(rds_tcp_incoming_slab); +} diff --git a/net/rds/tcp_send.c b/net/rds/tcp_send.c new file mode 100644 index 0000000..53b17ca --- /dev/null +++ b/net/rds/tcp_send.c @@ -0,0 +1,215 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include +#include +#include + +#include "rds.h" +#include "tcp.h" + +static void rds_tcp_cork(struct socket *sock, int val) +{ + mm_segment_t oldfs; + + oldfs = get_fs(); + set_fs(KERNEL_DS); + sock->ops->setsockopt(sock, SOL_TCP, TCP_CORK, (char __user *)&val, + sizeof(val)); + set_fs(oldfs); +} + +void rds_tcp_xmit_prepare(struct rds_connection *conn) +{ + struct rds_tcp_connection *tc = conn->c_transport_data; + + rds_tcp_cork(tc->t_sock, 1); +} + +void rds_tcp_xmit_complete(struct rds_connection *conn) +{ + struct rds_tcp_connection *tc = conn->c_transport_data; + + rds_tcp_cork(tc->t_sock, 0); +} + +/* the core send_sem serializes this with other xmit and shutdown */ +static int rds_tcp_sendmsg(struct socket *sock, void *data, unsigned int len) +{ + struct kvec vec = { + .iov_base = data, + .iov_len = len, + }; + struct msghdr msg = { + .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL, + }; + + return kernel_sendmsg(sock, &msg, &vec, 1, vec.iov_len); +} + +/* the core send_sem serializes this with other xmit and shutdown */ +int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm, + unsigned int hdr_off, unsigned int sg, unsigned int off) +{ + struct rds_tcp_connection *tc = conn->c_transport_data; + int done = 0; + int ret = 0; + + if (hdr_off == 0) { + /* + * m_ack_seq is set to the sequence number of the last byte of + * header and data. see rds_tcp_is_acked(). + */ + tc->t_last_sent_nxt = rds_tcp_snd_nxt(tc); + rm->m_ack_seq = tc->t_last_sent_nxt + + sizeof(struct rds_header) + + be32_to_cpu(rm->m_inc.i_hdr.h_len) - 1; + smp_mb__before_atomic(); + set_bit(RDS_MSG_HAS_ACK_SEQ, &rm->m_flags); + tc->t_last_expected_una = rm->m_ack_seq + 1; + + rdsdebug("rm %p tcp nxt %u ack_seq %llu\n", + rm, rds_tcp_snd_nxt(tc), + (unsigned long long)rm->m_ack_seq); + } + + if (hdr_off < sizeof(struct rds_header)) { + /* see rds_tcp_write_space() */ + set_bit(SOCK_NOSPACE, &tc->t_sock->sk->sk_socket->flags); + + ret = rds_tcp_sendmsg(tc->t_sock, + (void *)&rm->m_inc.i_hdr + hdr_off, + sizeof(rm->m_inc.i_hdr) - hdr_off); + if (ret < 0) + goto out; + done += ret; + if (hdr_off + done != sizeof(struct rds_header)) + goto out; + } + + while (sg < rm->data.op_nents) { + ret = tc->t_sock->ops->sendpage(tc->t_sock, + sg_page(&rm->data.op_sg[sg]), + rm->data.op_sg[sg].offset + off, + rm->data.op_sg[sg].length - off, + MSG_DONTWAIT|MSG_NOSIGNAL); + rdsdebug("tcp sendpage %p:%u:%u ret %d\n", (void *)sg_page(&rm->data.op_sg[sg]), + rm->data.op_sg[sg].offset + off, rm->data.op_sg[sg].length - off, + ret); + if (ret <= 0) + break; + + off += ret; + done += ret; + if (off == rm->data.op_sg[sg].length) { + off = 0; + sg++; + } + } + +out: + if (ret <= 0) { + /* write_space will hit after EAGAIN, all else fatal */ + if (ret == -EAGAIN) { + rds_tcp_stats_inc(s_tcp_sndbuf_full); + ret = 0; + } else { + printk(KERN_WARNING "RDS/tcp: send to %pI4 " + "returned %d, disconnecting and reconnecting\n", + &conn->c_faddr, ret); + rds_conn_drop(conn); + } + } + if (done == 0) + done = ret; + return done; +} + +/* + * rm->m_ack_seq is set to the tcp sequence number that corresponds to the + * last byte of the message, including the header. This means that the + * entire message has been received if rm->m_ack_seq is "before" the next + * unacked byte of the TCP sequence space. We have to do very careful + * wrapping 32bit comparisons here. + */ +static int rds_tcp_is_acked(struct rds_message *rm, uint64_t ack) +{ + if (!test_bit(RDS_MSG_HAS_ACK_SEQ, &rm->m_flags)) + return 0; + return (__s32)((u32)rm->m_ack_seq - (u32)ack) < 0; +} + +void rds_tcp_write_space(struct sock *sk) +{ + void (*write_space)(struct sock *sk); + struct rds_connection *conn; + struct rds_tcp_connection *tc; + + read_lock(&sk->sk_callback_lock); + conn = sk->sk_user_data; + if (!conn) { + write_space = sk->sk_write_space; + goto out; + } + + tc = conn->c_transport_data; + rdsdebug("write_space for tc %p\n", tc); + write_space = tc->t_orig_write_space; + rds_tcp_stats_inc(s_tcp_write_space_calls); + + rdsdebug("tcp una %u\n", rds_tcp_snd_una(tc)); + tc->t_last_seen_una = rds_tcp_snd_una(tc); + rds_send_drop_acked(conn, rds_tcp_snd_una(tc), rds_tcp_is_acked); + + if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) + queue_delayed_work(rds_wq, &conn->c_send_w, 0); + +out: + read_unlock(&sk->sk_callback_lock); + + /* + * write_space is only called when data leaves tcp's send queue if + * SOCK_NOSPACE is set. We set SOCK_NOSPACE every time we put + * data in tcp's send queue because we use write_space to parse the + * sequence numbers and notice that rds messages have been fully + * received. + * + * tcp's write_space clears SOCK_NOSPACE if the send queue has more + * than a certain amount of space. So we need to set it again *after* + * we call tcp's write_space or else we might only get called on the + * first of a series of incoming tcp acks. + */ + write_space(sk); + + if (sk->sk_socket) + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); +} diff --git a/net/rds/tcp_stats.c b/net/rds/tcp_stats.c new file mode 100644 index 0000000..f8a7954 --- /dev/null +++ b/net/rds/tcp_stats.c @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include +#include +#include + +#include "rds.h" +#include "tcp.h" + +DEFINE_PER_CPU(struct rds_tcp_statistics, rds_tcp_stats) + ____cacheline_aligned; + +static const char * const rds_tcp_stat_names[] = { + "tcp_data_ready_calls", + "tcp_write_space_calls", + "tcp_sndbuf_full", + "tcp_connect_raced", + "tcp_listen_closed_stale", +}; + +unsigned int rds_tcp_stats_info_copy(struct rds_info_iterator *iter, + unsigned int avail) +{ + struct rds_tcp_statistics stats = {0, }; + uint64_t *src; + uint64_t *sum; + size_t i; + int cpu; + + if (avail < ARRAY_SIZE(rds_tcp_stat_names)) + goto out; + + for_each_online_cpu(cpu) { + src = (uint64_t *)&(per_cpu(rds_tcp_stats, cpu)); + sum = (uint64_t *)&stats; + for (i = 0; i < sizeof(stats) / sizeof(uint64_t); i++) + *(sum++) += *(src++); + } + + rds_stats_info_copy(iter, (uint64_t *)&stats, rds_tcp_stat_names, + ARRAY_SIZE(rds_tcp_stat_names)); +out: + return ARRAY_SIZE(rds_tcp_stat_names); +} diff --git a/net/rds/threads.c b/net/rds/threads.c new file mode 100644 index 0000000..dc2402e --- /dev/null +++ b/net/rds/threads.c @@ -0,0 +1,222 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include +#include +#include + +#include "rds.h" + +/* + * All of connection management is simplified by serializing it through + * work queues that execute in a connection managing thread. + * + * TCP wants to send acks through sendpage() in response to data_ready(), + * but it needs a process context to do so. + * + * The receive paths need to allocate but can't drop packets (!) so we have + * a thread around to block allocating if the receive fast path sees an + * allocation failure. + */ + +/* Grand Unified Theory of connection life cycle: + * At any point in time, the connection can be in one of these states: + * DOWN, CONNECTING, UP, DISCONNECTING, ERROR + * + * The following transitions are possible: + * ANY -> ERROR + * UP -> DISCONNECTING + * ERROR -> DISCONNECTING + * DISCONNECTING -> DOWN + * DOWN -> CONNECTING + * CONNECTING -> UP + * + * Transition to state DISCONNECTING/DOWN: + * - Inside the shutdown worker; synchronizes with xmit path + * through RDS_IN_XMIT, and with connection management callbacks + * via c_cm_lock. + * + * For receive callbacks, we rely on the underlying transport + * (TCP, IB/RDMA) to provide the necessary synchronisation. + */ +struct workqueue_struct *rds_wq; +EXPORT_SYMBOL_GPL(rds_wq); + +void rds_connect_complete(struct rds_connection *conn) +{ + if (!rds_conn_transition(conn, RDS_CONN_CONNECTING, RDS_CONN_UP)) { + printk(KERN_WARNING "%s: Cannot transition to state UP, " + "current state is %d\n", + __func__, + atomic_read(&conn->c_state)); + rds_conn_drop(conn); + return; + } + + rdsdebug("conn %p for %pI4 to %pI4 complete\n", + conn, &conn->c_laddr, &conn->c_faddr); + + conn->c_reconnect_jiffies = 0; + set_bit(0, &conn->c_map_queued); + queue_delayed_work(rds_wq, &conn->c_send_w, 0); + queue_delayed_work(rds_wq, &conn->c_recv_w, 0); +} +EXPORT_SYMBOL_GPL(rds_connect_complete); + +/* + * This random exponential backoff is relied on to eventually resolve racing + * connects. + * + * If connect attempts race then both parties drop both connections and come + * here to wait for a random amount of time before trying again. Eventually + * the backoff range will be so much greater than the time it takes to + * establish a connection that one of the pair will establish the connection + * before the other's random delay fires. + * + * Connection attempts that arrive while a connection is already established + * are also considered to be racing connects. This lets a connection from + * a rebooted machine replace an existing stale connection before the transport + * notices that the connection has failed. + * + * We should *always* start with a random backoff; otherwise a broken connection + * will always take several iterations to be re-established. + */ +void rds_queue_reconnect(struct rds_connection *conn) +{ + unsigned long rand; + + rdsdebug("conn %p for %pI4 to %pI4 reconnect jiffies %lu\n", + conn, &conn->c_laddr, &conn->c_faddr, + conn->c_reconnect_jiffies); + + set_bit(RDS_RECONNECT_PENDING, &conn->c_flags); + if (conn->c_reconnect_jiffies == 0) { + conn->c_reconnect_jiffies = rds_sysctl_reconnect_min_jiffies; + queue_delayed_work(rds_wq, &conn->c_conn_w, 0); + return; + } + + get_random_bytes(&rand, sizeof(rand)); + rdsdebug("%lu delay %lu ceil conn %p for %pI4 -> %pI4\n", + rand % conn->c_reconnect_jiffies, conn->c_reconnect_jiffies, + conn, &conn->c_laddr, &conn->c_faddr); + queue_delayed_work(rds_wq, &conn->c_conn_w, + rand % conn->c_reconnect_jiffies); + + conn->c_reconnect_jiffies = min(conn->c_reconnect_jiffies * 2, + rds_sysctl_reconnect_max_jiffies); +} + +void rds_connect_worker(struct work_struct *work) +{ + struct rds_connection *conn = container_of(work, struct rds_connection, c_conn_w.work); + int ret; + + clear_bit(RDS_RECONNECT_PENDING, &conn->c_flags); + if (rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING)) { + ret = conn->c_trans->conn_connect(conn); + rdsdebug("conn %p for %pI4 to %pI4 dispatched, ret %d\n", + conn, &conn->c_laddr, &conn->c_faddr, ret); + + if (ret) { + if (rds_conn_transition(conn, RDS_CONN_CONNECTING, RDS_CONN_DOWN)) + rds_queue_reconnect(conn); + else + rds_conn_error(conn, "RDS: connect failed\n"); + } + } +} + +void rds_send_worker(struct work_struct *work) +{ + struct rds_connection *conn = container_of(work, struct rds_connection, c_send_w.work); + int ret; + + if (rds_conn_state(conn) == RDS_CONN_UP) { + ret = rds_send_xmit(conn); + rdsdebug("conn %p ret %d\n", conn, ret); + switch (ret) { + case -EAGAIN: + rds_stats_inc(s_send_immediate_retry); + queue_delayed_work(rds_wq, &conn->c_send_w, 0); + break; + case -ENOMEM: + rds_stats_inc(s_send_delayed_retry); + queue_delayed_work(rds_wq, &conn->c_send_w, 2); + default: + break; + } + } +} + +void rds_recv_worker(struct work_struct *work) +{ + struct rds_connection *conn = container_of(work, struct rds_connection, c_recv_w.work); + int ret; + + if (rds_conn_state(conn) == RDS_CONN_UP) { + ret = conn->c_trans->recv(conn); + rdsdebug("conn %p ret %d\n", conn, ret); + switch (ret) { + case -EAGAIN: + rds_stats_inc(s_recv_immediate_retry); + queue_delayed_work(rds_wq, &conn->c_recv_w, 0); + break; + case -ENOMEM: + rds_stats_inc(s_recv_delayed_retry); + queue_delayed_work(rds_wq, &conn->c_recv_w, 2); + default: + break; + } + } +} + +void rds_shutdown_worker(struct work_struct *work) +{ + struct rds_connection *conn = container_of(work, struct rds_connection, c_down_w); + + rds_conn_shutdown(conn); +} + +void rds_threads_exit(void) +{ + destroy_workqueue(rds_wq); +} + +int rds_threads_init(void) +{ + rds_wq = create_singlethread_workqueue("krdsd"); + if (!rds_wq) + return -ENOMEM; + + return 0; +} diff --git a/net/rds/transport.c b/net/rds/transport.c new file mode 100644 index 0000000..7f2ac4f --- /dev/null +++ b/net/rds/transport.c @@ -0,0 +1,137 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include +#include +#include + +#include "rds.h" +#include "loop.h" + +static struct rds_transport *transports[RDS_TRANS_COUNT]; +static DECLARE_RWSEM(rds_trans_sem); + +int rds_trans_register(struct rds_transport *trans) +{ + BUG_ON(strlen(trans->t_name) + 1 > TRANSNAMSIZ); + + down_write(&rds_trans_sem); + + if (transports[trans->t_type]) + printk(KERN_ERR "RDS Transport type %d already registered\n", + trans->t_type); + else { + transports[trans->t_type] = trans; + printk(KERN_INFO "Registered RDS/%s transport\n", trans->t_name); + } + + up_write(&rds_trans_sem); + + return 0; +} +EXPORT_SYMBOL_GPL(rds_trans_register); + +void rds_trans_unregister(struct rds_transport *trans) +{ + down_write(&rds_trans_sem); + + transports[trans->t_type] = NULL; + printk(KERN_INFO "Unregistered RDS/%s transport\n", trans->t_name); + + up_write(&rds_trans_sem); +} +EXPORT_SYMBOL_GPL(rds_trans_unregister); + +void rds_trans_put(struct rds_transport *trans) +{ + if (trans && trans->t_owner) + module_put(trans->t_owner); +} + +struct rds_transport *rds_trans_get_preferred(__be32 addr) +{ + struct rds_transport *ret = NULL; + struct rds_transport *trans; + unsigned int i; + + if (IN_LOOPBACK(ntohl(addr))) + return &rds_loop_transport; + + down_read(&rds_trans_sem); + for (i = 0; i < RDS_TRANS_COUNT; i++) { + trans = transports[i]; + + if (trans && (trans->laddr_check(addr) == 0) && + (!trans->t_owner || try_module_get(trans->t_owner))) { + ret = trans; + break; + } + } + up_read(&rds_trans_sem); + + return ret; +} + +/* + * This returns the number of stats entries in the snapshot and only + * copies them using the iter if there is enough space for them. The + * caller passes in the global stats so that we can size and copy while + * holding the lock. + */ +unsigned int rds_trans_stats_info_copy(struct rds_info_iterator *iter, + unsigned int avail) + +{ + struct rds_transport *trans; + unsigned int total = 0; + unsigned int part; + int i; + + rds_info_iter_unmap(iter); + down_read(&rds_trans_sem); + + for (i = 0; i < RDS_TRANS_COUNT; i++) + { + trans = transports[i]; + if (!trans || !trans->stats_info_copy) + continue; + + part = trans->stats_info_copy(iter, avail); + avail -= min(avail, part); + total += part; + } + + up_read(&rds_trans_sem); + + return total; +} + diff --git a/net/sunrpc/xprtrdma/Makefile b/net/sunrpc/xprtrdma/Makefile new file mode 100644 index 0000000..da5136f --- /dev/null +++ b/net/sunrpc/xprtrdma/Makefile @@ -0,0 +1,8 @@ +obj-$(CONFIG_SUNRPC_XPRT_RDMA_CLIENT) += xprtrdma.o + +xprtrdma-y := transport.o rpc_rdma.o verbs.o + +obj-$(CONFIG_SUNRPC_XPRT_RDMA_SERVER) += svcrdma.o + +svcrdma-y := svc_rdma.o svc_rdma_transport.o \ + svc_rdma_marshal.o svc_rdma_sendto.o svc_rdma_recvfrom.o diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c new file mode 100644 index 0000000..6166c98 --- /dev/null +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -0,0 +1,875 @@ +/* + * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the BSD-type + * license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * Neither the name of the Network Appliance, Inc. nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * rpc_rdma.c + * + * This file contains the guts of the RPC RDMA protocol, and + * does marshaling/unmarshaling, etc. It is also where interfacing + * to the Linux RPC framework lives. + */ + +#include "xprt_rdma.h" + +#include + +#ifdef RPC_DEBUG +# define RPCDBG_FACILITY RPCDBG_TRANS +#endif + +#ifdef RPC_DEBUG +static const char transfertypes[][12] = { + "pure inline", /* no chunks */ + " read chunk", /* some argument via rdma read */ + "*read chunk", /* entire request via rdma read */ + "write chunk", /* some result via rdma write */ + "reply chunk" /* entire reply via rdma write */ +}; +#endif + +/* + * Chunk assembly from upper layer xdr_buf. + * + * Prepare the passed-in xdr_buf into representation as RPC/RDMA chunk + * elements. Segments are then coalesced when registered, if possible + * within the selected memreg mode. + * + * Returns positive number of segments converted, or a negative errno. + */ + +static int +rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos, + enum rpcrdma_chunktype type, struct rpcrdma_mr_seg *seg, int nsegs) +{ + int len, n = 0, p; + int page_base; + struct page **ppages; + + if (pos == 0 && xdrbuf->head[0].iov_len) { + seg[n].mr_page = NULL; + seg[n].mr_offset = xdrbuf->head[0].iov_base; + seg[n].mr_len = xdrbuf->head[0].iov_len; + ++n; + } + + len = xdrbuf->page_len; + ppages = xdrbuf->pages + (xdrbuf->page_base >> PAGE_SHIFT); + page_base = xdrbuf->page_base & ~PAGE_MASK; + p = 0; + while (len && n < nsegs) { + if (!ppages[p]) { + /* alloc the pagelist for receiving buffer */ + ppages[p] = alloc_page(GFP_ATOMIC); + if (!ppages[p]) + return -ENOMEM; + } + seg[n].mr_page = ppages[p]; + seg[n].mr_offset = (void *)(unsigned long) page_base; + seg[n].mr_len = min_t(u32, PAGE_SIZE - page_base, len); + if (seg[n].mr_len > PAGE_SIZE) + return -EIO; + len -= seg[n].mr_len; + ++n; + ++p; + page_base = 0; /* page offset only applies to first page */ + } + + /* Message overflows the seg array */ + if (len && n == nsegs) + return -EIO; + + if (xdrbuf->tail[0].iov_len) { + /* the rpcrdma protocol allows us to omit any trailing + * xdr pad bytes, saving the server an RDMA operation. */ + if (xdrbuf->tail[0].iov_len < 4 && xprt_rdma_pad_optimize) + return n; + if (n == nsegs) + /* Tail remains, but we're out of segments */ + return -EIO; + seg[n].mr_page = NULL; + seg[n].mr_offset = xdrbuf->tail[0].iov_base; + seg[n].mr_len = xdrbuf->tail[0].iov_len; + ++n; + } + + return n; +} + +/* + * Create read/write chunk lists, and reply chunks, for RDMA + * + * Assume check against THRESHOLD has been done, and chunks are required. + * Assume only encoding one list entry for read|write chunks. The NFSv3 + * protocol is simple enough to allow this as it only has a single "bulk + * result" in each procedure - complicated NFSv4 COMPOUNDs are not. (The + * RDMA/Sessions NFSv4 proposal addresses this for future v4 revs.) + * + * When used for a single reply chunk (which is a special write + * chunk used for the entire reply, rather than just the data), it + * is used primarily for READDIR and READLINK which would otherwise + * be severely size-limited by a small rdma inline read max. The server + * response will come back as an RDMA Write, followed by a message + * of type RDMA_NOMSG carrying the xid and length. As a result, reply + * chunks do not provide data alignment, however they do not require + * "fixup" (moving the response to the upper layer buffer) either. + * + * Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64): + * + * Read chunklist (a linked list): + * N elements, position P (same P for all chunks of same arg!): + * 1 - PHLOO - 1 - PHLOO - ... - 1 - PHLOO - 0 + * + * Write chunklist (a list of (one) counted array): + * N elements: + * 1 - N - HLOO - HLOO - ... - HLOO - 0 + * + * Reply chunk (a counted array): + * N elements: + * 1 - N - HLOO - HLOO - ... - HLOO + * + * Returns positive RPC/RDMA header size, or negative errno. + */ + +static ssize_t +rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target, + struct rpcrdma_msg *headerp, enum rpcrdma_chunktype type) +{ + struct rpcrdma_req *req = rpcr_to_rdmar(rqst); + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt); + int n, nsegs, nchunks = 0; + unsigned int pos; + struct rpcrdma_mr_seg *seg = req->rl_segments; + struct rpcrdma_read_chunk *cur_rchunk = NULL; + struct rpcrdma_write_array *warray = NULL; + struct rpcrdma_write_chunk *cur_wchunk = NULL; + __be32 *iptr = headerp->rm_body.rm_chunks; + + if (type == rpcrdma_readch || type == rpcrdma_areadch) { + /* a read chunk - server will RDMA Read our memory */ + cur_rchunk = (struct rpcrdma_read_chunk *) iptr; + } else { + /* a write or reply chunk - server will RDMA Write our memory */ + *iptr++ = xdr_zero; /* encode a NULL read chunk list */ + if (type == rpcrdma_replych) + *iptr++ = xdr_zero; /* a NULL write chunk list */ + warray = (struct rpcrdma_write_array *) iptr; + cur_wchunk = (struct rpcrdma_write_chunk *) (warray + 1); + } + + if (type == rpcrdma_replych || type == rpcrdma_areadch) + pos = 0; + else + pos = target->head[0].iov_len; + + nsegs = rpcrdma_convert_iovs(target, pos, type, seg, RPCRDMA_MAX_SEGS); + if (nsegs < 0) + return nsegs; + + do { + n = rpcrdma_register_external(seg, nsegs, + cur_wchunk != NULL, r_xprt); + if (n <= 0) + goto out; + if (cur_rchunk) { /* read */ + cur_rchunk->rc_discrim = xdr_one; + /* all read chunks have the same "position" */ + cur_rchunk->rc_position = htonl(pos); + cur_rchunk->rc_target.rs_handle = htonl(seg->mr_rkey); + cur_rchunk->rc_target.rs_length = htonl(seg->mr_len); + xdr_encode_hyper( + (__be32 *)&cur_rchunk->rc_target.rs_offset, + seg->mr_base); + dprintk("RPC: %s: read chunk " + "elem %d@0x%llx:0x%x pos %u (%s)\n", __func__, + seg->mr_len, (unsigned long long)seg->mr_base, + seg->mr_rkey, pos, n < nsegs ? "more" : "last"); + cur_rchunk++; + r_xprt->rx_stats.read_chunk_count++; + } else { /* write/reply */ + cur_wchunk->wc_target.rs_handle = htonl(seg->mr_rkey); + cur_wchunk->wc_target.rs_length = htonl(seg->mr_len); + xdr_encode_hyper( + (__be32 *)&cur_wchunk->wc_target.rs_offset, + seg->mr_base); + dprintk("RPC: %s: %s chunk " + "elem %d@0x%llx:0x%x (%s)\n", __func__, + (type == rpcrdma_replych) ? "reply" : "write", + seg->mr_len, (unsigned long long)seg->mr_base, + seg->mr_rkey, n < nsegs ? "more" : "last"); + cur_wchunk++; + if (type == rpcrdma_replych) + r_xprt->rx_stats.reply_chunk_count++; + else + r_xprt->rx_stats.write_chunk_count++; + r_xprt->rx_stats.total_rdma_request += seg->mr_len; + } + nchunks++; + seg += n; + nsegs -= n; + } while (nsegs); + + /* success. all failures return above */ + req->rl_nchunks = nchunks; + + /* + * finish off header. If write, marshal discrim and nchunks. + */ + if (cur_rchunk) { + iptr = (__be32 *) cur_rchunk; + *iptr++ = xdr_zero; /* finish the read chunk list */ + *iptr++ = xdr_zero; /* encode a NULL write chunk list */ + *iptr++ = xdr_zero; /* encode a NULL reply chunk */ + } else { + warray->wc_discrim = xdr_one; + warray->wc_nchunks = htonl(nchunks); + iptr = (__be32 *) cur_wchunk; + if (type == rpcrdma_writech) { + *iptr++ = xdr_zero; /* finish the write chunk list */ + *iptr++ = xdr_zero; /* encode a NULL reply chunk */ + } + } + + /* + * Return header size. + */ + return (unsigned char *)iptr - (unsigned char *)headerp; + +out: + if (r_xprt->rx_ia.ri_memreg_strategy != RPCRDMA_FRMR) { + for (pos = 0; nchunks--;) + pos += rpcrdma_deregister_external( + &req->rl_segments[pos], r_xprt); + } + return n; +} + +/* + * Marshal chunks. This routine returns the header length + * consumed by marshaling. + * + * Returns positive RPC/RDMA header size, or negative errno. + */ + +ssize_t +rpcrdma_marshal_chunks(struct rpc_rqst *rqst, ssize_t result) +{ + struct rpcrdma_req *req = rpcr_to_rdmar(rqst); + struct rpcrdma_msg *headerp = (struct rpcrdma_msg *)req->rl_base; + + if (req->rl_rtype != rpcrdma_noch) + result = rpcrdma_create_chunks(rqst, &rqst->rq_snd_buf, + headerp, req->rl_rtype); + else if (req->rl_wtype != rpcrdma_noch) + result = rpcrdma_create_chunks(rqst, &rqst->rq_rcv_buf, + headerp, req->rl_wtype); + return result; +} + +/* + * Copy write data inline. + * This function is used for "small" requests. Data which is passed + * to RPC via iovecs (or page list) is copied directly into the + * pre-registered memory buffer for this request. For small amounts + * of data, this is efficient. The cutoff value is tunable. + */ +static int +rpcrdma_inline_pullup(struct rpc_rqst *rqst, int pad) +{ + int i, npages, curlen; + int copy_len; + unsigned char *srcp, *destp; + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt); + int page_base; + struct page **ppages; + + destp = rqst->rq_svec[0].iov_base; + curlen = rqst->rq_svec[0].iov_len; + destp += curlen; + /* + * Do optional padding where it makes sense. Alignment of write + * payload can help the server, if our setting is accurate. + */ + pad -= (curlen + 36/*sizeof(struct rpcrdma_msg_padded)*/); + if (pad < 0 || rqst->rq_slen - curlen < RPCRDMA_INLINE_PAD_THRESH) + pad = 0; /* don't pad this request */ + + dprintk("RPC: %s: pad %d destp 0x%p len %d hdrlen %d\n", + __func__, pad, destp, rqst->rq_slen, curlen); + + copy_len = rqst->rq_snd_buf.page_len; + + if (rqst->rq_snd_buf.tail[0].iov_len) { + curlen = rqst->rq_snd_buf.tail[0].iov_len; + if (destp + copy_len != rqst->rq_snd_buf.tail[0].iov_base) { + memmove(destp + copy_len, + rqst->rq_snd_buf.tail[0].iov_base, curlen); + r_xprt->rx_stats.pullup_copy_count += curlen; + } + dprintk("RPC: %s: tail destp 0x%p len %d\n", + __func__, destp + copy_len, curlen); + rqst->rq_svec[0].iov_len += curlen; + } + r_xprt->rx_stats.pullup_copy_count += copy_len; + + page_base = rqst->rq_snd_buf.page_base; + ppages = rqst->rq_snd_buf.pages + (page_base >> PAGE_SHIFT); + page_base &= ~PAGE_MASK; + npages = PAGE_ALIGN(page_base+copy_len) >> PAGE_SHIFT; + for (i = 0; copy_len && i < npages; i++) { + curlen = PAGE_SIZE - page_base; + if (curlen > copy_len) + curlen = copy_len; + dprintk("RPC: %s: page %d destp 0x%p len %d curlen %d\n", + __func__, i, destp, copy_len, curlen); + srcp = kmap_atomic(ppages[i]); + memcpy(destp, srcp+page_base, curlen); + kunmap_atomic(srcp); + rqst->rq_svec[0].iov_len += curlen; + destp += curlen; + copy_len -= curlen; + page_base = 0; + } + /* header now contains entire send message */ + return pad; +} + +/* + * Marshal a request: the primary job of this routine is to choose + * the transfer modes. See comments below. + * + * Uses multiple RDMA IOVs for a request: + * [0] -- RPC RDMA header, which uses memory from the *start* of the + * preregistered buffer that already holds the RPC data in + * its middle. + * [1] -- the RPC header/data, marshaled by RPC and the NFS protocol. + * [2] -- optional padding. + * [3] -- if padded, header only in [1] and data here. + * + * Returns zero on success, otherwise a negative errno. + */ + +int +rpcrdma_marshal_req(struct rpc_rqst *rqst) +{ + struct rpc_xprt *xprt = rqst->rq_xprt; + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); + struct rpcrdma_req *req = rpcr_to_rdmar(rqst); + char *base; + size_t rpclen, padlen; + ssize_t hdrlen; + struct rpcrdma_msg *headerp; + + /* + * rpclen gets amount of data in first buffer, which is the + * pre-registered buffer. + */ + base = rqst->rq_svec[0].iov_base; + rpclen = rqst->rq_svec[0].iov_len; + + /* build RDMA header in private area at front */ + headerp = (struct rpcrdma_msg *) req->rl_base; + /* don't htonl XID, it's already done in request */ + headerp->rm_xid = rqst->rq_xid; + headerp->rm_vers = xdr_one; + headerp->rm_credit = htonl(r_xprt->rx_buf.rb_max_requests); + headerp->rm_type = htonl(RDMA_MSG); + + /* + * Chunks needed for results? + * + * o If the expected result is under the inline threshold, all ops + * return as inline (but see later). + * o Large non-read ops return as a single reply chunk. + * o Large read ops return data as write chunk(s), header as inline. + * + * Note: the NFS code sending down multiple result segments implies + * the op is one of read, readdir[plus], readlink or NFSv4 getacl. + */ + + /* + * This code can handle read chunks, write chunks OR reply + * chunks -- only one type. If the request is too big to fit + * inline, then we will choose read chunks. If the request is + * a READ, then use write chunks to separate the file data + * into pages; otherwise use reply chunks. + */ + if (rqst->rq_rcv_buf.buflen <= RPCRDMA_INLINE_READ_THRESHOLD(rqst)) + req->rl_wtype = rpcrdma_noch; + else if (rqst->rq_rcv_buf.page_len == 0) + req->rl_wtype = rpcrdma_replych; + else if (rqst->rq_rcv_buf.flags & XDRBUF_READ) + req->rl_wtype = rpcrdma_writech; + else + req->rl_wtype = rpcrdma_replych; + + /* + * Chunks needed for arguments? + * + * o If the total request is under the inline threshold, all ops + * are sent as inline. + * o Large non-write ops are sent with the entire message as a + * single read chunk (protocol 0-position special case). + * o Large write ops transmit data as read chunk(s), header as + * inline. + * + * Note: the NFS code sending down multiple argument segments + * implies the op is a write. + * TBD check NFSv4 setacl + */ + if (rqst->rq_snd_buf.len <= RPCRDMA_INLINE_WRITE_THRESHOLD(rqst)) + req->rl_rtype = rpcrdma_noch; + else if (rqst->rq_snd_buf.page_len == 0) + req->rl_rtype = rpcrdma_areadch; + else + req->rl_rtype = rpcrdma_readch; + + /* The following simplification is not true forever */ + if (req->rl_rtype != rpcrdma_noch && req->rl_wtype == rpcrdma_replych) + req->rl_wtype = rpcrdma_noch; + if (req->rl_rtype != rpcrdma_noch && req->rl_wtype != rpcrdma_noch) { + dprintk("RPC: %s: cannot marshal multiple chunk lists\n", + __func__); + return -EIO; + } + + hdrlen = 28; /*sizeof *headerp;*/ + padlen = 0; + + /* + * Pull up any extra send data into the preregistered buffer. + * When padding is in use and applies to the transfer, insert + * it and change the message type. + */ + if (req->rl_rtype == rpcrdma_noch) { + + padlen = rpcrdma_inline_pullup(rqst, + RPCRDMA_INLINE_PAD_VALUE(rqst)); + + if (padlen) { + headerp->rm_type = htonl(RDMA_MSGP); + headerp->rm_body.rm_padded.rm_align = + htonl(RPCRDMA_INLINE_PAD_VALUE(rqst)); + headerp->rm_body.rm_padded.rm_thresh = + htonl(RPCRDMA_INLINE_PAD_THRESH); + headerp->rm_body.rm_padded.rm_pempty[0] = xdr_zero; + headerp->rm_body.rm_padded.rm_pempty[1] = xdr_zero; + headerp->rm_body.rm_padded.rm_pempty[2] = xdr_zero; + hdrlen += 2 * sizeof(u32); /* extra words in padhdr */ + if (req->rl_wtype != rpcrdma_noch) { + dprintk("RPC: %s: invalid chunk list\n", + __func__); + return -EIO; + } + } else { + headerp->rm_body.rm_nochunks.rm_empty[0] = xdr_zero; + headerp->rm_body.rm_nochunks.rm_empty[1] = xdr_zero; + headerp->rm_body.rm_nochunks.rm_empty[2] = xdr_zero; + /* new length after pullup */ + rpclen = rqst->rq_svec[0].iov_len; + /* + * Currently we try to not actually use read inline. + * Reply chunks have the desirable property that + * they land, packed, directly in the target buffers + * without headers, so they require no fixup. The + * additional RDMA Write op sends the same amount + * of data, streams on-the-wire and adds no overhead + * on receive. Therefore, we request a reply chunk + * for non-writes wherever feasible and efficient. + */ + if (req->rl_wtype == rpcrdma_noch) + req->rl_wtype = rpcrdma_replych; + } + } + + hdrlen = rpcrdma_marshal_chunks(rqst, hdrlen); + if (hdrlen < 0) + return hdrlen; + + dprintk("RPC: %s: %s: hdrlen %zd rpclen %zd padlen %zd" + " headerp 0x%p base 0x%p lkey 0x%x\n", + __func__, transfertypes[req->rl_wtype], hdrlen, rpclen, padlen, + headerp, base, req->rl_iov.lkey); + + /* + * initialize send_iov's - normally only two: rdma chunk header and + * single preregistered RPC header buffer, but if padding is present, + * then use a preregistered (and zeroed) pad buffer between the RPC + * header and any write data. In all non-rdma cases, any following + * data has been copied into the RPC header buffer. + */ + req->rl_send_iov[0].addr = req->rl_iov.addr; + req->rl_send_iov[0].length = hdrlen; + req->rl_send_iov[0].lkey = req->rl_iov.lkey; + + req->rl_send_iov[1].addr = req->rl_iov.addr + (base - req->rl_base); + req->rl_send_iov[1].length = rpclen; + req->rl_send_iov[1].lkey = req->rl_iov.lkey; + + req->rl_niovs = 2; + + if (padlen) { + struct rpcrdma_ep *ep = &r_xprt->rx_ep; + + req->rl_send_iov[2].addr = ep->rep_pad.addr; + req->rl_send_iov[2].length = padlen; + req->rl_send_iov[2].lkey = ep->rep_pad.lkey; + + req->rl_send_iov[3].addr = req->rl_send_iov[1].addr + rpclen; + req->rl_send_iov[3].length = rqst->rq_slen - rpclen; + req->rl_send_iov[3].lkey = req->rl_iov.lkey; + + req->rl_niovs = 4; + } + + return 0; +} + +/* + * Chase down a received write or reply chunklist to get length + * RDMA'd by server. See map at rpcrdma_create_chunks()! :-) + */ +static int +rpcrdma_count_chunks(struct rpcrdma_rep *rep, unsigned int max, int wrchunk, __be32 **iptrp) +{ + unsigned int i, total_len; + struct rpcrdma_write_chunk *cur_wchunk; + + i = ntohl(**iptrp); /* get array count */ + if (i > max) + return -1; + cur_wchunk = (struct rpcrdma_write_chunk *) (*iptrp + 1); + total_len = 0; + while (i--) { + struct rpcrdma_segment *seg = &cur_wchunk->wc_target; + ifdebug(FACILITY) { + u64 off; + xdr_decode_hyper((__be32 *)&seg->rs_offset, &off); + dprintk("RPC: %s: chunk %d@0x%llx:0x%x\n", + __func__, + ntohl(seg->rs_length), + (unsigned long long)off, + ntohl(seg->rs_handle)); + } + total_len += ntohl(seg->rs_length); + ++cur_wchunk; + } + /* check and adjust for properly terminated write chunk */ + if (wrchunk) { + __be32 *w = (__be32 *) cur_wchunk; + if (*w++ != xdr_zero) + return -1; + cur_wchunk = (struct rpcrdma_write_chunk *) w; + } + if ((char *) cur_wchunk > rep->rr_base + rep->rr_len) + return -1; + + *iptrp = (__be32 *) cur_wchunk; + return total_len; +} + +/* + * Scatter inline received data back into provided iov's. + */ +static void +rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad) +{ + int i, npages, curlen, olen; + char *destp; + struct page **ppages; + int page_base; + + curlen = rqst->rq_rcv_buf.head[0].iov_len; + if (curlen > copy_len) { /* write chunk header fixup */ + curlen = copy_len; + rqst->rq_rcv_buf.head[0].iov_len = curlen; + } + + dprintk("RPC: %s: srcp 0x%p len %d hdrlen %d\n", + __func__, srcp, copy_len, curlen); + + /* Shift pointer for first receive segment only */ + rqst->rq_rcv_buf.head[0].iov_base = srcp; + srcp += curlen; + copy_len -= curlen; + + olen = copy_len; + i = 0; + rpcx_to_rdmax(rqst->rq_xprt)->rx_stats.fixup_copy_count += olen; + page_base = rqst->rq_rcv_buf.page_base; + ppages = rqst->rq_rcv_buf.pages + (page_base >> PAGE_SHIFT); + page_base &= ~PAGE_MASK; + + if (copy_len && rqst->rq_rcv_buf.page_len) { + npages = PAGE_ALIGN(page_base + + rqst->rq_rcv_buf.page_len) >> PAGE_SHIFT; + for (; i < npages; i++) { + curlen = PAGE_SIZE - page_base; + if (curlen > copy_len) + curlen = copy_len; + dprintk("RPC: %s: page %d" + " srcp 0x%p len %d curlen %d\n", + __func__, i, srcp, copy_len, curlen); + destp = kmap_atomic(ppages[i]); + memcpy(destp + page_base, srcp, curlen); + flush_dcache_page(ppages[i]); + kunmap_atomic(destp); + srcp += curlen; + copy_len -= curlen; + if (copy_len == 0) + break; + page_base = 0; + } + } + + if (copy_len && rqst->rq_rcv_buf.tail[0].iov_len) { + curlen = copy_len; + if (curlen > rqst->rq_rcv_buf.tail[0].iov_len) + curlen = rqst->rq_rcv_buf.tail[0].iov_len; + if (rqst->rq_rcv_buf.tail[0].iov_base != srcp) + memmove(rqst->rq_rcv_buf.tail[0].iov_base, srcp, curlen); + dprintk("RPC: %s: tail srcp 0x%p len %d curlen %d\n", + __func__, srcp, copy_len, curlen); + rqst->rq_rcv_buf.tail[0].iov_len = curlen; + copy_len -= curlen; ++i; + } else + rqst->rq_rcv_buf.tail[0].iov_len = 0; + + if (pad) { + /* implicit padding on terminal chunk */ + unsigned char *p = rqst->rq_rcv_buf.tail[0].iov_base; + while (pad--) + p[rqst->rq_rcv_buf.tail[0].iov_len++] = 0; + } + + if (copy_len) + dprintk("RPC: %s: %d bytes in" + " %d extra segments (%d lost)\n", + __func__, olen, i, copy_len); + + /* TBD avoid a warning from call_decode() */ + rqst->rq_private_buf = rqst->rq_rcv_buf; +} + +void +rpcrdma_connect_worker(struct work_struct *work) +{ + struct rpcrdma_ep *ep = + container_of(work, struct rpcrdma_ep, rep_connect_worker.work); + struct rpc_xprt *xprt = ep->rep_xprt; + + spin_lock_bh(&xprt->transport_lock); + if (++xprt->connect_cookie == 0) /* maintain a reserved value */ + ++xprt->connect_cookie; + if (ep->rep_connected > 0) { + if (!xprt_test_and_set_connected(xprt)) + xprt_wake_pending_tasks(xprt, 0); + } else { + if (xprt_test_and_clear_connected(xprt)) + xprt_wake_pending_tasks(xprt, -ENOTCONN); + } + spin_unlock_bh(&xprt->transport_lock); +} + +/* + * This function is called when an async event is posted to + * the connection which changes the connection state. All it + * does at this point is mark the connection up/down, the rpc + * timers do the rest. + */ +void +rpcrdma_conn_func(struct rpcrdma_ep *ep) +{ + schedule_delayed_work(&ep->rep_connect_worker, 0); +} + +/* + * Called as a tasklet to do req/reply match and complete a request + * Errors must result in the RPC task either being awakened, or + * allowed to timeout, to discover the errors at that time. + */ +void +rpcrdma_reply_handler(struct rpcrdma_rep *rep) +{ + struct rpcrdma_msg *headerp; + struct rpcrdma_req *req; + struct rpc_rqst *rqst; + struct rpc_xprt *xprt = rep->rr_xprt; + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); + __be32 *iptr; + int rdmalen, status; + unsigned long cwnd; + + /* Check status. If bad, signal disconnect and return rep to pool */ + if (rep->rr_len == ~0U) { + rpcrdma_recv_buffer_put(rep); + if (r_xprt->rx_ep.rep_connected == 1) { + r_xprt->rx_ep.rep_connected = -EIO; + rpcrdma_conn_func(&r_xprt->rx_ep); + } + return; + } + if (rep->rr_len < 28) { + dprintk("RPC: %s: short/invalid reply\n", __func__); + goto repost; + } + headerp = (struct rpcrdma_msg *) rep->rr_base; + if (headerp->rm_vers != xdr_one) { + dprintk("RPC: %s: invalid version %d\n", + __func__, ntohl(headerp->rm_vers)); + goto repost; + } + + /* Get XID and try for a match. */ + spin_lock(&xprt->transport_lock); + rqst = xprt_lookup_rqst(xprt, headerp->rm_xid); + if (rqst == NULL) { + spin_unlock(&xprt->transport_lock); + dprintk("RPC: %s: reply 0x%p failed " + "to match any request xid 0x%08x len %d\n", + __func__, rep, headerp->rm_xid, rep->rr_len); +repost: + r_xprt->rx_stats.bad_reply_count++; + rep->rr_func = rpcrdma_reply_handler; + if (rpcrdma_ep_post_recv(&r_xprt->rx_ia, &r_xprt->rx_ep, rep)) + rpcrdma_recv_buffer_put(rep); + + return; + } + + /* get request object */ + req = rpcr_to_rdmar(rqst); + if (req->rl_reply) { + spin_unlock(&xprt->transport_lock); + dprintk("RPC: %s: duplicate reply 0x%p to RPC " + "request 0x%p: xid 0x%08x\n", __func__, rep, req, + headerp->rm_xid); + goto repost; + } + + dprintk("RPC: %s: reply 0x%p completes request 0x%p\n" + " RPC request 0x%p xid 0x%08x\n", + __func__, rep, req, rqst, headerp->rm_xid); + + /* from here on, the reply is no longer an orphan */ + req->rl_reply = rep; + xprt->reestablish_timeout = 0; + + /* check for expected message types */ + /* The order of some of these tests is important. */ + switch (headerp->rm_type) { + case htonl(RDMA_MSG): + /* never expect read chunks */ + /* never expect reply chunks (two ways to check) */ + /* never expect write chunks without having offered RDMA */ + if (headerp->rm_body.rm_chunks[0] != xdr_zero || + (headerp->rm_body.rm_chunks[1] == xdr_zero && + headerp->rm_body.rm_chunks[2] != xdr_zero) || + (headerp->rm_body.rm_chunks[1] != xdr_zero && + req->rl_nchunks == 0)) + goto badheader; + if (headerp->rm_body.rm_chunks[1] != xdr_zero) { + /* count any expected write chunks in read reply */ + /* start at write chunk array count */ + iptr = &headerp->rm_body.rm_chunks[2]; + rdmalen = rpcrdma_count_chunks(rep, + req->rl_nchunks, 1, &iptr); + /* check for validity, and no reply chunk after */ + if (rdmalen < 0 || *iptr++ != xdr_zero) + goto badheader; + rep->rr_len -= + ((unsigned char *)iptr - (unsigned char *)headerp); + status = rep->rr_len + rdmalen; + r_xprt->rx_stats.total_rdma_reply += rdmalen; + /* special case - last chunk may omit padding */ + if (rdmalen &= 3) { + rdmalen = 4 - rdmalen; + status += rdmalen; + } + } else { + /* else ordinary inline */ + rdmalen = 0; + iptr = (__be32 *)((unsigned char *)headerp + 28); + rep->rr_len -= 28; /*sizeof *headerp;*/ + status = rep->rr_len; + } + /* Fix up the rpc results for upper layer */ + rpcrdma_inline_fixup(rqst, (char *)iptr, rep->rr_len, rdmalen); + break; + + case htonl(RDMA_NOMSG): + /* never expect read or write chunks, always reply chunks */ + if (headerp->rm_body.rm_chunks[0] != xdr_zero || + headerp->rm_body.rm_chunks[1] != xdr_zero || + headerp->rm_body.rm_chunks[2] != xdr_one || + req->rl_nchunks == 0) + goto badheader; + iptr = (__be32 *)((unsigned char *)headerp + 28); + rdmalen = rpcrdma_count_chunks(rep, req->rl_nchunks, 0, &iptr); + if (rdmalen < 0) + goto badheader; + r_xprt->rx_stats.total_rdma_reply += rdmalen; + /* Reply chunk buffer already is the reply vector - no fixup. */ + status = rdmalen; + break; + +badheader: + default: + dprintk("%s: invalid rpcrdma reply header (type %d):" + " chunks[012] == %d %d %d" + " expected chunks <= %d\n", + __func__, ntohl(headerp->rm_type), + headerp->rm_body.rm_chunks[0], + headerp->rm_body.rm_chunks[1], + headerp->rm_body.rm_chunks[2], + req->rl_nchunks); + status = -EIO; + r_xprt->rx_stats.bad_reply_count++; + break; + } + + cwnd = xprt->cwnd; + xprt->cwnd = atomic_read(&r_xprt->rx_buf.rb_credits) << RPC_CWNDSHIFT; + if (xprt->cwnd > cwnd) + xprt_release_rqst_cong(rqst->rq_task); + + dprintk("RPC: %s: xprt_complete_rqst(0x%p, 0x%p, %d)\n", + __func__, xprt, rqst, status); + xprt_complete_rqst(rqst->rq_task, status); + spin_unlock(&xprt->transport_lock); +} diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c new file mode 100644 index 0000000..c1b6270 --- /dev/null +++ b/net/sunrpc/xprtrdma/svc_rdma.c @@ -0,0 +1,302 @@ +/* + * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the BSD-type + * license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * Neither the name of the Network Appliance, Inc. nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Author: Tom Tucker + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "xprt_rdma.h" + +#define RPCDBG_FACILITY RPCDBG_SVCXPRT + +/* RPC/RDMA parameters */ +unsigned int svcrdma_ord = RPCRDMA_ORD; +static unsigned int min_ord = 1; +static unsigned int max_ord = 4096; +unsigned int svcrdma_max_requests = RPCRDMA_MAX_REQUESTS; +static unsigned int min_max_requests = 4; +static unsigned int max_max_requests = 16384; +unsigned int svcrdma_max_req_size = RPCRDMA_MAX_REQ_SIZE; +static unsigned int min_max_inline = 4096; +static unsigned int max_max_inline = 65536; + +atomic_t rdma_stat_recv; +atomic_t rdma_stat_read; +atomic_t rdma_stat_write; +atomic_t rdma_stat_sq_starve; +atomic_t rdma_stat_rq_starve; +atomic_t rdma_stat_rq_poll; +atomic_t rdma_stat_rq_prod; +atomic_t rdma_stat_sq_poll; +atomic_t rdma_stat_sq_prod; + +/* Temporary NFS request map and context caches */ +struct kmem_cache *svc_rdma_map_cachep; +struct kmem_cache *svc_rdma_ctxt_cachep; + +struct workqueue_struct *svc_rdma_wq; + +/* + * This function implements reading and resetting an atomic_t stat + * variable through read/write to a proc file. Any write to the file + * resets the associated statistic to zero. Any read returns it's + * current value. + */ +static int read_reset_stat(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + atomic_t *stat = (atomic_t *)table->data; + + if (!stat) + return -EINVAL; + + if (write) + atomic_set(stat, 0); + else { + char str_buf[32]; + char *data; + int len = snprintf(str_buf, 32, "%d\n", atomic_read(stat)); + if (len >= 32) + return -EFAULT; + len = strlen(str_buf); + if (*ppos > len) { + *lenp = 0; + return 0; + } + data = &str_buf[*ppos]; + len -= *ppos; + if (len > *lenp) + len = *lenp; + if (len && copy_to_user(buffer, str_buf, len)) + return -EFAULT; + *lenp = len; + *ppos += len; + } + return 0; +} + +static struct ctl_table_header *svcrdma_table_header; +static struct ctl_table svcrdma_parm_table[] = { + { + .procname = "max_requests", + .data = &svcrdma_max_requests, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_max_requests, + .extra2 = &max_max_requests + }, + { + .procname = "max_req_size", + .data = &svcrdma_max_req_size, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_max_inline, + .extra2 = &max_max_inline + }, + { + .procname = "max_outbound_read_requests", + .data = &svcrdma_ord, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_ord, + .extra2 = &max_ord, + }, + + { + .procname = "rdma_stat_read", + .data = &rdma_stat_read, + .maxlen = sizeof(atomic_t), + .mode = 0644, + .proc_handler = read_reset_stat, + }, + { + .procname = "rdma_stat_recv", + .data = &rdma_stat_recv, + .maxlen = sizeof(atomic_t), + .mode = 0644, + .proc_handler = read_reset_stat, + }, + { + .procname = "rdma_stat_write", + .data = &rdma_stat_write, + .maxlen = sizeof(atomic_t), + .mode = 0644, + .proc_handler = read_reset_stat, + }, + { + .procname = "rdma_stat_sq_starve", + .data = &rdma_stat_sq_starve, + .maxlen = sizeof(atomic_t), + .mode = 0644, + .proc_handler = read_reset_stat, + }, + { + .procname = "rdma_stat_rq_starve", + .data = &rdma_stat_rq_starve, + .maxlen = sizeof(atomic_t), + .mode = 0644, + .proc_handler = read_reset_stat, + }, + { + .procname = "rdma_stat_rq_poll", + .data = &rdma_stat_rq_poll, + .maxlen = sizeof(atomic_t), + .mode = 0644, + .proc_handler = read_reset_stat, + }, + { + .procname = "rdma_stat_rq_prod", + .data = &rdma_stat_rq_prod, + .maxlen = sizeof(atomic_t), + .mode = 0644, + .proc_handler = read_reset_stat, + }, + { + .procname = "rdma_stat_sq_poll", + .data = &rdma_stat_sq_poll, + .maxlen = sizeof(atomic_t), + .mode = 0644, + .proc_handler = read_reset_stat, + }, + { + .procname = "rdma_stat_sq_prod", + .data = &rdma_stat_sq_prod, + .maxlen = sizeof(atomic_t), + .mode = 0644, + .proc_handler = read_reset_stat, + }, + { }, +}; + +static struct ctl_table svcrdma_table[] = { + { + .procname = "svc_rdma", + .mode = 0555, + .child = svcrdma_parm_table + }, + { }, +}; + +static struct ctl_table svcrdma_root_table[] = { + { + .procname = "sunrpc", + .mode = 0555, + .child = svcrdma_table + }, + { }, +}; + +void svc_rdma_cleanup(void) +{ + dprintk("SVCRDMA Module Removed, deregister RPC RDMA transport\n"); + destroy_workqueue(svc_rdma_wq); + if (svcrdma_table_header) { + unregister_sysctl_table(svcrdma_table_header); + svcrdma_table_header = NULL; + } + svc_unreg_xprt_class(&svc_rdma_class); + kmem_cache_destroy(svc_rdma_map_cachep); + kmem_cache_destroy(svc_rdma_ctxt_cachep); +} + +int svc_rdma_init(void) +{ + dprintk("SVCRDMA Module Init, register RPC RDMA transport\n"); + dprintk("\tsvcrdma_ord : %d\n", svcrdma_ord); + dprintk("\tmax_requests : %d\n", svcrdma_max_requests); + dprintk("\tsq_depth : %d\n", + svcrdma_max_requests * RPCRDMA_SQ_DEPTH_MULT); + dprintk("\tmax_inline : %d\n", svcrdma_max_req_size); + + svc_rdma_wq = alloc_workqueue("svc_rdma", 0, 0); + if (!svc_rdma_wq) + return -ENOMEM; + + if (!svcrdma_table_header) + svcrdma_table_header = + register_sysctl_table(svcrdma_root_table); + + /* Create the temporary map cache */ + svc_rdma_map_cachep = kmem_cache_create("svc_rdma_map_cache", + sizeof(struct svc_rdma_req_map), + 0, + SLAB_HWCACHE_ALIGN, + NULL); + if (!svc_rdma_map_cachep) { + printk(KERN_INFO "Could not allocate map cache.\n"); + goto err0; + } + + /* Create the temporary context cache */ + svc_rdma_ctxt_cachep = + kmem_cache_create("svc_rdma_ctxt_cache", + sizeof(struct svc_rdma_op_ctxt), + 0, + SLAB_HWCACHE_ALIGN, + NULL); + if (!svc_rdma_ctxt_cachep) { + printk(KERN_INFO "Could not allocate WR ctxt cache.\n"); + goto err1; + } + + /* Register RDMA with the SVC transport switch */ + svc_reg_xprt_class(&svc_rdma_class); + return 0; + err1: + kmem_cache_destroy(svc_rdma_map_cachep); + err0: + unregister_sysctl_table(svcrdma_table_header); + destroy_workqueue(svc_rdma_wq); + return -ENOMEM; +} +MODULE_AUTHOR("Tom Tucker "); +MODULE_DESCRIPTION("SVC RDMA Transport"); +MODULE_LICENSE("Dual BSD/GPL"); +module_init(svc_rdma_init); +module_exit(svc_rdma_cleanup); diff --git a/net/sunrpc/xprtrdma/svc_rdma_marshal.c b/net/sunrpc/xprtrdma/svc_rdma_marshal.c new file mode 100644 index 0000000..65b1462 --- /dev/null +++ b/net/sunrpc/xprtrdma/svc_rdma_marshal.c @@ -0,0 +1,386 @@ +/* + * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the BSD-type + * license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * Neither the name of the Network Appliance, Inc. nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Author: Tom Tucker + */ + +#include +#include +#include +#include +#include + +#define RPCDBG_FACILITY RPCDBG_SVCXPRT + +/* + * Decodes a read chunk list. The expected format is as follows: + * descrim : xdr_one + * position : u32 offset into XDR stream + * handle : u32 RKEY + * . . . + * end-of-list: xdr_zero + */ +static u32 *decode_read_list(u32 *va, u32 *vaend) +{ + struct rpcrdma_read_chunk *ch = (struct rpcrdma_read_chunk *)va; + + while (ch->rc_discrim != xdr_zero) { + if (((unsigned long)ch + sizeof(struct rpcrdma_read_chunk)) > + (unsigned long)vaend) { + dprintk("svcrdma: vaend=%p, ch=%p\n", vaend, ch); + return NULL; + } + ch++; + } + return (u32 *)&ch->rc_position; +} + +/* + * Determine number of chunks and total bytes in chunk list. The chunk + * list has already been verified to fit within the RPCRDMA header. + */ +void svc_rdma_rcl_chunk_counts(struct rpcrdma_read_chunk *ch, + int *ch_count, int *byte_count) +{ + /* compute the number of bytes represented by read chunks */ + *byte_count = 0; + *ch_count = 0; + for (; ch->rc_discrim != 0; ch++) { + *byte_count = *byte_count + ntohl(ch->rc_target.rs_length); + *ch_count = *ch_count + 1; + } +} + +/* + * Decodes a write chunk list. The expected format is as follows: + * descrim : xdr_one + * nchunks : + * handle : u32 RKEY ---+ + * length : u32 | + * offset : remove va + + * . . . | + * ---+ + */ +static u32 *decode_write_list(u32 *va, u32 *vaend) +{ + unsigned long start, end; + int nchunks; + + struct rpcrdma_write_array *ary = + (struct rpcrdma_write_array *)va; + + /* Check for not write-array */ + if (ary->wc_discrim == xdr_zero) + return (u32 *)&ary->wc_nchunks; + + if ((unsigned long)ary + sizeof(struct rpcrdma_write_array) > + (unsigned long)vaend) { + dprintk("svcrdma: ary=%p, vaend=%p\n", ary, vaend); + return NULL; + } + nchunks = ntohl(ary->wc_nchunks); + + start = (unsigned long)&ary->wc_array[0]; + end = (unsigned long)vaend; + if (nchunks < 0 || + nchunks > (SIZE_MAX - start) / sizeof(struct rpcrdma_write_chunk) || + (start + (sizeof(struct rpcrdma_write_chunk) * nchunks)) > end) { + dprintk("svcrdma: ary=%p, wc_nchunks=%d, vaend=%p\n", + ary, nchunks, vaend); + return NULL; + } + /* + * rs_length is the 2nd 4B field in wc_target and taking its + * address skips the list terminator + */ + return (u32 *)&ary->wc_array[nchunks].wc_target.rs_length; +} + +static u32 *decode_reply_array(u32 *va, u32 *vaend) +{ + unsigned long start, end; + int nchunks; + struct rpcrdma_write_array *ary = + (struct rpcrdma_write_array *)va; + + /* Check for no reply-array */ + if (ary->wc_discrim == xdr_zero) + return (u32 *)&ary->wc_nchunks; + + if ((unsigned long)ary + sizeof(struct rpcrdma_write_array) > + (unsigned long)vaend) { + dprintk("svcrdma: ary=%p, vaend=%p\n", ary, vaend); + return NULL; + } + nchunks = ntohl(ary->wc_nchunks); + + start = (unsigned long)&ary->wc_array[0]; + end = (unsigned long)vaend; + if (nchunks < 0 || + nchunks > (SIZE_MAX - start) / sizeof(struct rpcrdma_write_chunk) || + (start + (sizeof(struct rpcrdma_write_chunk) * nchunks)) > end) { + dprintk("svcrdma: ary=%p, wc_nchunks=%d, vaend=%p\n", + ary, nchunks, vaend); + return NULL; + } + return (u32 *)&ary->wc_array[nchunks]; +} + +int svc_rdma_xdr_decode_req(struct rpcrdma_msg **rdma_req, + struct svc_rqst *rqstp) +{ + struct rpcrdma_msg *rmsgp = NULL; + u32 *va; + u32 *vaend; + u32 hdr_len; + + rmsgp = (struct rpcrdma_msg *)rqstp->rq_arg.head[0].iov_base; + + /* Verify that there's enough bytes for header + something */ + if (rqstp->rq_arg.len <= RPCRDMA_HDRLEN_MIN) { + dprintk("svcrdma: header too short = %d\n", + rqstp->rq_arg.len); + return -EINVAL; + } + + /* Decode the header */ + rmsgp->rm_xid = ntohl(rmsgp->rm_xid); + rmsgp->rm_vers = ntohl(rmsgp->rm_vers); + rmsgp->rm_credit = ntohl(rmsgp->rm_credit); + rmsgp->rm_type = ntohl(rmsgp->rm_type); + + if (rmsgp->rm_vers != RPCRDMA_VERSION) + return -ENOSYS; + + /* Pull in the extra for the padded case and bump our pointer */ + if (rmsgp->rm_type == RDMA_MSGP) { + int hdrlen; + rmsgp->rm_body.rm_padded.rm_align = + ntohl(rmsgp->rm_body.rm_padded.rm_align); + rmsgp->rm_body.rm_padded.rm_thresh = + ntohl(rmsgp->rm_body.rm_padded.rm_thresh); + + va = &rmsgp->rm_body.rm_padded.rm_pempty[4]; + rqstp->rq_arg.head[0].iov_base = va; + hdrlen = (u32)((unsigned long)va - (unsigned long)rmsgp); + rqstp->rq_arg.head[0].iov_len -= hdrlen; + if (hdrlen > rqstp->rq_arg.len) + return -EINVAL; + return hdrlen; + } + + /* The chunk list may contain either a read chunk list or a write + * chunk list and a reply chunk list. + */ + va = &rmsgp->rm_body.rm_chunks[0]; + vaend = (u32 *)((unsigned long)rmsgp + rqstp->rq_arg.len); + va = decode_read_list(va, vaend); + if (!va) + return -EINVAL; + va = decode_write_list(va, vaend); + if (!va) + return -EINVAL; + va = decode_reply_array(va, vaend); + if (!va) + return -EINVAL; + + rqstp->rq_arg.head[0].iov_base = va; + hdr_len = (unsigned long)va - (unsigned long)rmsgp; + rqstp->rq_arg.head[0].iov_len -= hdr_len; + + *rdma_req = rmsgp; + return hdr_len; +} + +int svc_rdma_xdr_decode_deferred_req(struct svc_rqst *rqstp) +{ + struct rpcrdma_msg *rmsgp = NULL; + struct rpcrdma_read_chunk *ch; + struct rpcrdma_write_array *ary; + u32 *va; + u32 hdrlen; + + dprintk("svcrdma: processing deferred RDMA header on rqstp=%p\n", + rqstp); + rmsgp = (struct rpcrdma_msg *)rqstp->rq_arg.head[0].iov_base; + + /* Pull in the extra for the padded case and bump our pointer */ + if (rmsgp->rm_type == RDMA_MSGP) { + va = &rmsgp->rm_body.rm_padded.rm_pempty[4]; + rqstp->rq_arg.head[0].iov_base = va; + hdrlen = (u32)((unsigned long)va - (unsigned long)rmsgp); + rqstp->rq_arg.head[0].iov_len -= hdrlen; + return hdrlen; + } + + /* + * Skip all chunks to find RPC msg. These were previously processed + */ + va = &rmsgp->rm_body.rm_chunks[0]; + + /* Skip read-list */ + for (ch = (struct rpcrdma_read_chunk *)va; + ch->rc_discrim != xdr_zero; ch++); + va = (u32 *)&ch->rc_position; + + /* Skip write-list */ + ary = (struct rpcrdma_write_array *)va; + if (ary->wc_discrim == xdr_zero) + va = (u32 *)&ary->wc_nchunks; + else + /* + * rs_length is the 2nd 4B field in wc_target and taking its + * address skips the list terminator + */ + va = (u32 *)&ary->wc_array[ary->wc_nchunks].wc_target.rs_length; + + /* Skip reply-array */ + ary = (struct rpcrdma_write_array *)va; + if (ary->wc_discrim == xdr_zero) + va = (u32 *)&ary->wc_nchunks; + else + va = (u32 *)&ary->wc_array[ary->wc_nchunks]; + + rqstp->rq_arg.head[0].iov_base = va; + hdrlen = (unsigned long)va - (unsigned long)rmsgp; + rqstp->rq_arg.head[0].iov_len -= hdrlen; + + return hdrlen; +} + +int svc_rdma_xdr_encode_error(struct svcxprt_rdma *xprt, + struct rpcrdma_msg *rmsgp, + enum rpcrdma_errcode err, u32 *va) +{ + u32 *startp = va; + + *va++ = htonl(rmsgp->rm_xid); + *va++ = htonl(rmsgp->rm_vers); + *va++ = htonl(xprt->sc_max_requests); + *va++ = htonl(RDMA_ERROR); + *va++ = htonl(err); + if (err == ERR_VERS) { + *va++ = htonl(RPCRDMA_VERSION); + *va++ = htonl(RPCRDMA_VERSION); + } + + return (int)((unsigned long)va - (unsigned long)startp); +} + +int svc_rdma_xdr_get_reply_hdr_len(struct rpcrdma_msg *rmsgp) +{ + struct rpcrdma_write_array *wr_ary; + + /* There is no read-list in a reply */ + + /* skip write list */ + wr_ary = (struct rpcrdma_write_array *) + &rmsgp->rm_body.rm_chunks[1]; + if (wr_ary->wc_discrim) + wr_ary = (struct rpcrdma_write_array *) + &wr_ary->wc_array[ntohl(wr_ary->wc_nchunks)]. + wc_target.rs_length; + else + wr_ary = (struct rpcrdma_write_array *) + &wr_ary->wc_nchunks; + + /* skip reply array */ + if (wr_ary->wc_discrim) + wr_ary = (struct rpcrdma_write_array *) + &wr_ary->wc_array[ntohl(wr_ary->wc_nchunks)]; + else + wr_ary = (struct rpcrdma_write_array *) + &wr_ary->wc_nchunks; + + return (unsigned long) wr_ary - (unsigned long) rmsgp; +} + +void svc_rdma_xdr_encode_write_list(struct rpcrdma_msg *rmsgp, int chunks) +{ + struct rpcrdma_write_array *ary; + + /* no read-list */ + rmsgp->rm_body.rm_chunks[0] = xdr_zero; + + /* write-array discrim */ + ary = (struct rpcrdma_write_array *) + &rmsgp->rm_body.rm_chunks[1]; + ary->wc_discrim = xdr_one; + ary->wc_nchunks = htonl(chunks); + + /* write-list terminator */ + ary->wc_array[chunks].wc_target.rs_handle = xdr_zero; + + /* reply-array discriminator */ + ary->wc_array[chunks].wc_target.rs_length = xdr_zero; +} + +void svc_rdma_xdr_encode_reply_array(struct rpcrdma_write_array *ary, + int chunks) +{ + ary->wc_discrim = xdr_one; + ary->wc_nchunks = htonl(chunks); +} + +void svc_rdma_xdr_encode_array_chunk(struct rpcrdma_write_array *ary, + int chunk_no, + __be32 rs_handle, + __be64 rs_offset, + u32 write_len) +{ + struct rpcrdma_segment *seg = &ary->wc_array[chunk_no].wc_target; + seg->rs_handle = rs_handle; + seg->rs_offset = rs_offset; + seg->rs_length = htonl(write_len); +} + +void svc_rdma_xdr_encode_reply_header(struct svcxprt_rdma *xprt, + struct rpcrdma_msg *rdma_argp, + struct rpcrdma_msg *rdma_resp, + enum rpcrdma_proc rdma_type) +{ + rdma_resp->rm_xid = htonl(rdma_argp->rm_xid); + rdma_resp->rm_vers = htonl(rdma_argp->rm_vers); + rdma_resp->rm_credit = htonl(xprt->sc_max_requests); + rdma_resp->rm_type = htonl(rdma_type); + + /* Encode chunks lists */ + rdma_resp->rm_body.rm_chunks[0] = xdr_zero; + rdma_resp->rm_body.rm_chunks[1] = xdr_zero; + rdma_resp->rm_body.rm_chunks[2] = xdr_zero; +} diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c new file mode 100644 index 0000000..e011027 --- /dev/null +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -0,0 +1,614 @@ +/* + * Copyright (c) 2014 Open Grid Computing, Inc. All rights reserved. + * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the BSD-type + * license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * Neither the name of the Network Appliance, Inc. nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Author: Tom Tucker + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#define RPCDBG_FACILITY RPCDBG_SVCXPRT + +/* + * Replace the pages in the rq_argpages array with the pages from the SGE in + * the RDMA_RECV completion. The SGL should contain full pages up until the + * last one. + */ +static void rdma_build_arg_xdr(struct svc_rqst *rqstp, + struct svc_rdma_op_ctxt *ctxt, + u32 byte_count) +{ + struct page *page; + u32 bc; + int sge_no; + + /* Swap the page in the SGE with the page in argpages */ + page = ctxt->pages[0]; + put_page(rqstp->rq_pages[0]); + rqstp->rq_pages[0] = page; + + /* Set up the XDR head */ + rqstp->rq_arg.head[0].iov_base = page_address(page); + rqstp->rq_arg.head[0].iov_len = + min_t(size_t, byte_count, ctxt->sge[0].length); + rqstp->rq_arg.len = byte_count; + rqstp->rq_arg.buflen = byte_count; + + /* Compute bytes past head in the SGL */ + bc = byte_count - rqstp->rq_arg.head[0].iov_len; + + /* If data remains, store it in the pagelist */ + rqstp->rq_arg.page_len = bc; + rqstp->rq_arg.page_base = 0; + rqstp->rq_arg.pages = &rqstp->rq_pages[1]; + sge_no = 1; + while (bc && sge_no < ctxt->count) { + page = ctxt->pages[sge_no]; + put_page(rqstp->rq_pages[sge_no]); + rqstp->rq_pages[sge_no] = page; + bc -= min_t(u32, bc, ctxt->sge[sge_no].length); + rqstp->rq_arg.buflen += ctxt->sge[sge_no].length; + sge_no++; + } + rqstp->rq_respages = &rqstp->rq_pages[sge_no]; + rqstp->rq_next_page = rqstp->rq_respages + 1; + + /* We should never run out of SGE because the limit is defined to + * support the max allowed RPC data length + */ + BUG_ON(bc && (sge_no == ctxt->count)); + BUG_ON((rqstp->rq_arg.head[0].iov_len + rqstp->rq_arg.page_len) + != byte_count); + BUG_ON(rqstp->rq_arg.len != byte_count); + + /* If not all pages were used from the SGL, free the remaining ones */ + bc = sge_no; + while (sge_no < ctxt->count) { + page = ctxt->pages[sge_no++]; + put_page(page); + } + ctxt->count = bc; + + /* Set up tail */ + rqstp->rq_arg.tail[0].iov_base = NULL; + rqstp->rq_arg.tail[0].iov_len = 0; +} + +static int rdma_read_max_sge(struct svcxprt_rdma *xprt, int sge_count) +{ + if (rdma_node_get_transport(xprt->sc_cm_id->device->node_type) == + RDMA_TRANSPORT_IWARP) + return 1; + else + return min_t(int, sge_count, xprt->sc_max_sge); +} + +typedef int (*rdma_reader_fn)(struct svcxprt_rdma *xprt, + struct svc_rqst *rqstp, + struct svc_rdma_op_ctxt *head, + int *page_no, + u32 *page_offset, + u32 rs_handle, + u32 rs_length, + u64 rs_offset, + int last); + +/* Issue an RDMA_READ using the local lkey to map the data sink */ +static int rdma_read_chunk_lcl(struct svcxprt_rdma *xprt, + struct svc_rqst *rqstp, + struct svc_rdma_op_ctxt *head, + int *page_no, + u32 *page_offset, + u32 rs_handle, + u32 rs_length, + u64 rs_offset, + int last) +{ + struct ib_send_wr read_wr; + int pages_needed = PAGE_ALIGN(*page_offset + rs_length) >> PAGE_SHIFT; + struct svc_rdma_op_ctxt *ctxt = svc_rdma_get_context(xprt); + int ret, read, pno; + u32 pg_off = *page_offset; + u32 pg_no = *page_no; + + ctxt->direction = DMA_FROM_DEVICE; + ctxt->read_hdr = head; + pages_needed = + min_t(int, pages_needed, rdma_read_max_sge(xprt, pages_needed)); + read = min_t(int, pages_needed << PAGE_SHIFT, rs_length); + + for (pno = 0; pno < pages_needed; pno++) { + int len = min_t(int, rs_length, PAGE_SIZE - pg_off); + + head->arg.pages[pg_no] = rqstp->rq_arg.pages[pg_no]; + head->arg.page_len += len; + head->arg.len += len; + if (!pg_off) + head->count++; + rqstp->rq_respages = &rqstp->rq_arg.pages[pg_no+1]; + rqstp->rq_next_page = rqstp->rq_respages + 1; + ctxt->sge[pno].addr = + ib_dma_map_page(xprt->sc_cm_id->device, + head->arg.pages[pg_no], pg_off, + PAGE_SIZE - pg_off, + DMA_FROM_DEVICE); + ret = ib_dma_mapping_error(xprt->sc_cm_id->device, + ctxt->sge[pno].addr); + if (ret) + goto err; + atomic_inc(&xprt->sc_dma_used); + + /* The lkey here is either a local dma lkey or a dma_mr lkey */ + ctxt->sge[pno].lkey = xprt->sc_dma_lkey; + ctxt->sge[pno].length = len; + ctxt->count++; + + /* adjust offset and wrap to next page if needed */ + pg_off += len; + if (pg_off == PAGE_SIZE) { + pg_off = 0; + pg_no++; + } + rs_length -= len; + } + + if (last && rs_length == 0) + set_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags); + else + clear_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags); + + memset(&read_wr, 0, sizeof(read_wr)); + read_wr.wr_id = (unsigned long)ctxt; + read_wr.opcode = IB_WR_RDMA_READ; + ctxt->wr_op = read_wr.opcode; + read_wr.send_flags = IB_SEND_SIGNALED; + read_wr.wr.rdma.rkey = rs_handle; + read_wr.wr.rdma.remote_addr = rs_offset; + read_wr.sg_list = ctxt->sge; + read_wr.num_sge = pages_needed; + + ret = svc_rdma_send(xprt, &read_wr); + if (ret) { + pr_err("svcrdma: Error %d posting RDMA_READ\n", ret); + set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); + goto err; + } + + /* return current location in page array */ + *page_no = pg_no; + *page_offset = pg_off; + ret = read; + atomic_inc(&rdma_stat_read); + return ret; + err: + svc_rdma_unmap_dma(ctxt); + svc_rdma_put_context(ctxt, 0); + return ret; +} + +/* Issue an RDMA_READ using an FRMR to map the data sink */ +static int rdma_read_chunk_frmr(struct svcxprt_rdma *xprt, + struct svc_rqst *rqstp, + struct svc_rdma_op_ctxt *head, + int *page_no, + u32 *page_offset, + u32 rs_handle, + u32 rs_length, + u64 rs_offset, + int last) +{ + struct ib_send_wr read_wr; + struct ib_send_wr inv_wr; + struct ib_send_wr fastreg_wr; + u8 key; + int pages_needed = PAGE_ALIGN(*page_offset + rs_length) >> PAGE_SHIFT; + struct svc_rdma_op_ctxt *ctxt = svc_rdma_get_context(xprt); + struct svc_rdma_fastreg_mr *frmr = svc_rdma_get_frmr(xprt); + int ret, read, pno; + u32 pg_off = *page_offset; + u32 pg_no = *page_no; + + if (IS_ERR(frmr)) + return -ENOMEM; + + ctxt->direction = DMA_FROM_DEVICE; + ctxt->frmr = frmr; + pages_needed = min_t(int, pages_needed, xprt->sc_frmr_pg_list_len); + read = min_t(int, pages_needed << PAGE_SHIFT, rs_length); + + frmr->kva = page_address(rqstp->rq_arg.pages[pg_no]); + frmr->direction = DMA_FROM_DEVICE; + frmr->access_flags = (IB_ACCESS_LOCAL_WRITE|IB_ACCESS_REMOTE_WRITE); + frmr->map_len = pages_needed << PAGE_SHIFT; + frmr->page_list_len = pages_needed; + + for (pno = 0; pno < pages_needed; pno++) { + int len = min_t(int, rs_length, PAGE_SIZE - pg_off); + + head->arg.pages[pg_no] = rqstp->rq_arg.pages[pg_no]; + head->arg.page_len += len; + head->arg.len += len; + if (!pg_off) + head->count++; + rqstp->rq_respages = &rqstp->rq_arg.pages[pg_no+1]; + rqstp->rq_next_page = rqstp->rq_respages + 1; + frmr->page_list->page_list[pno] = + ib_dma_map_page(xprt->sc_cm_id->device, + head->arg.pages[pg_no], 0, + PAGE_SIZE, DMA_FROM_DEVICE); + ret = ib_dma_mapping_error(xprt->sc_cm_id->device, + frmr->page_list->page_list[pno]); + if (ret) + goto err; + atomic_inc(&xprt->sc_dma_used); + + /* adjust offset and wrap to next page if needed */ + pg_off += len; + if (pg_off == PAGE_SIZE) { + pg_off = 0; + pg_no++; + } + rs_length -= len; + } + + if (last && rs_length == 0) + set_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags); + else + clear_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags); + + /* Bump the key */ + key = (u8)(frmr->mr->lkey & 0x000000FF); + ib_update_fast_reg_key(frmr->mr, ++key); + + ctxt->sge[0].addr = (unsigned long)frmr->kva + *page_offset; + ctxt->sge[0].lkey = frmr->mr->lkey; + ctxt->sge[0].length = read; + ctxt->count = 1; + ctxt->read_hdr = head; + + /* Prepare FASTREG WR */ + memset(&fastreg_wr, 0, sizeof(fastreg_wr)); + fastreg_wr.opcode = IB_WR_FAST_REG_MR; + fastreg_wr.send_flags = IB_SEND_SIGNALED; + fastreg_wr.wr.fast_reg.iova_start = (unsigned long)frmr->kva; + fastreg_wr.wr.fast_reg.page_list = frmr->page_list; + fastreg_wr.wr.fast_reg.page_list_len = frmr->page_list_len; + fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT; + fastreg_wr.wr.fast_reg.length = frmr->map_len; + fastreg_wr.wr.fast_reg.access_flags = frmr->access_flags; + fastreg_wr.wr.fast_reg.rkey = frmr->mr->lkey; + fastreg_wr.next = &read_wr; + + /* Prepare RDMA_READ */ + memset(&read_wr, 0, sizeof(read_wr)); + read_wr.send_flags = IB_SEND_SIGNALED; + read_wr.wr.rdma.rkey = rs_handle; + read_wr.wr.rdma.remote_addr = rs_offset; + read_wr.sg_list = ctxt->sge; + read_wr.num_sge = 1; + if (xprt->sc_dev_caps & SVCRDMA_DEVCAP_READ_W_INV) { + read_wr.opcode = IB_WR_RDMA_READ_WITH_INV; + read_wr.wr_id = (unsigned long)ctxt; + read_wr.ex.invalidate_rkey = ctxt->frmr->mr->lkey; + } else { + read_wr.opcode = IB_WR_RDMA_READ; + read_wr.next = &inv_wr; + /* Prepare invalidate */ + memset(&inv_wr, 0, sizeof(inv_wr)); + inv_wr.wr_id = (unsigned long)ctxt; + inv_wr.opcode = IB_WR_LOCAL_INV; + inv_wr.send_flags = IB_SEND_SIGNALED | IB_SEND_FENCE; + inv_wr.ex.invalidate_rkey = frmr->mr->lkey; + } + ctxt->wr_op = read_wr.opcode; + + /* Post the chain */ + ret = svc_rdma_send(xprt, &fastreg_wr); + if (ret) { + pr_err("svcrdma: Error %d posting RDMA_READ\n", ret); + set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); + goto err; + } + + /* return current location in page array */ + *page_no = pg_no; + *page_offset = pg_off; + ret = read; + atomic_inc(&rdma_stat_read); + return ret; + err: + svc_rdma_unmap_dma(ctxt); + svc_rdma_put_context(ctxt, 0); + svc_rdma_put_frmr(xprt, frmr); + return ret; +} + +static int rdma_read_chunks(struct svcxprt_rdma *xprt, + struct rpcrdma_msg *rmsgp, + struct svc_rqst *rqstp, + struct svc_rdma_op_ctxt *head) +{ + int page_no, ch_count, ret; + struct rpcrdma_read_chunk *ch; + u32 page_offset, byte_count; + u64 rs_offset; + rdma_reader_fn reader; + + /* If no read list is present, return 0 */ + ch = svc_rdma_get_read_chunk(rmsgp); + if (!ch) + return 0; + + svc_rdma_rcl_chunk_counts(ch, &ch_count, &byte_count); + if (ch_count > RPCSVC_MAXPAGES) + return -EINVAL; + + /* The request is completed when the RDMA_READs complete. The + * head context keeps all the pages that comprise the + * request. + */ + head->arg.head[0] = rqstp->rq_arg.head[0]; + head->arg.tail[0] = rqstp->rq_arg.tail[0]; + head->arg.pages = &head->pages[head->count]; + head->hdr_count = head->count; + head->arg.page_base = 0; + head->arg.page_len = 0; + head->arg.len = rqstp->rq_arg.len; + head->arg.buflen = rqstp->rq_arg.buflen; + + /* Use FRMR if supported */ + if (xprt->sc_dev_caps & SVCRDMA_DEVCAP_FAST_REG) + reader = rdma_read_chunk_frmr; + else + reader = rdma_read_chunk_lcl; + + page_no = 0; page_offset = 0; + for (ch = (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0]; + ch->rc_discrim != 0; ch++) { + + xdr_decode_hyper((__be32 *)&ch->rc_target.rs_offset, + &rs_offset); + byte_count = ntohl(ch->rc_target.rs_length); + + while (byte_count > 0) { + ret = reader(xprt, rqstp, head, + &page_no, &page_offset, + ntohl(ch->rc_target.rs_handle), + byte_count, rs_offset, + ((ch+1)->rc_discrim == 0) /* last */ + ); + if (ret < 0) + goto err; + byte_count -= ret; + rs_offset += ret; + head->arg.buflen += ret; + } + } + ret = 1; + err: + /* Detach arg pages. svc_recv will replenish them */ + for (page_no = 0; + &rqstp->rq_pages[page_no] < rqstp->rq_respages; page_no++) + rqstp->rq_pages[page_no] = NULL; + + return ret; +} + +/* + * To avoid a separate RDMA READ just for a handful of zero bytes, + * RFC 5666 section 3.7 allows the client to omit the XDR zero pad + * in chunk lists. + */ +static void +rdma_fix_xdr_pad(struct xdr_buf *buf) +{ + unsigned int page_len = buf->page_len; + unsigned int size = (XDR_QUADLEN(page_len) << 2) - page_len; + unsigned int offset, pg_no; + char *p; + + if (size == 0) + return; + + pg_no = page_len >> PAGE_SHIFT; + offset = page_len & ~PAGE_MASK; + p = page_address(buf->pages[pg_no]); + memset(p + offset, 0, size); + + buf->page_len += size; + buf->buflen += size; + buf->len += size; +} + +static int rdma_read_complete(struct svc_rqst *rqstp, + struct svc_rdma_op_ctxt *head) +{ + int page_no; + int ret; + + BUG_ON(!head); + + /* Copy RPC pages */ + for (page_no = 0; page_no < head->count; page_no++) { + put_page(rqstp->rq_pages[page_no]); + rqstp->rq_pages[page_no] = head->pages[page_no]; + } + /* Point rq_arg.pages past header */ + rdma_fix_xdr_pad(&head->arg); + rqstp->rq_arg.pages = &rqstp->rq_pages[head->hdr_count]; + rqstp->rq_arg.page_len = head->arg.page_len; + rqstp->rq_arg.page_base = head->arg.page_base; + + /* rq_respages starts after the last arg page */ + rqstp->rq_respages = &rqstp->rq_arg.pages[page_no]; + rqstp->rq_next_page = rqstp->rq_respages + 1; + + /* Rebuild rq_arg head and tail. */ + rqstp->rq_arg.head[0] = head->arg.head[0]; + rqstp->rq_arg.tail[0] = head->arg.tail[0]; + rqstp->rq_arg.len = head->arg.len; + rqstp->rq_arg.buflen = head->arg.buflen; + + /* Free the context */ + svc_rdma_put_context(head, 0); + + /* XXX: What should this be? */ + rqstp->rq_prot = IPPROTO_MAX; + svc_xprt_copy_addrs(rqstp, rqstp->rq_xprt); + + ret = rqstp->rq_arg.head[0].iov_len + + rqstp->rq_arg.page_len + + rqstp->rq_arg.tail[0].iov_len; + dprintk("svcrdma: deferred read ret=%d, rq_arg.len =%d, " + "rq_arg.head[0].iov_base=%p, rq_arg.head[0].iov_len = %zd\n", + ret, rqstp->rq_arg.len, rqstp->rq_arg.head[0].iov_base, + rqstp->rq_arg.head[0].iov_len); + + return ret; +} + +/* + * Set up the rqstp thread context to point to the RQ buffer. If + * necessary, pull additional data from the client with an RDMA_READ + * request. + */ +int svc_rdma_recvfrom(struct svc_rqst *rqstp) +{ + struct svc_xprt *xprt = rqstp->rq_xprt; + struct svcxprt_rdma *rdma_xprt = + container_of(xprt, struct svcxprt_rdma, sc_xprt); + struct svc_rdma_op_ctxt *ctxt = NULL; + struct rpcrdma_msg *rmsgp; + int ret = 0; + int len; + + dprintk("svcrdma: rqstp=%p\n", rqstp); + + spin_lock_bh(&rdma_xprt->sc_rq_dto_lock); + if (!list_empty(&rdma_xprt->sc_read_complete_q)) { + ctxt = list_entry(rdma_xprt->sc_read_complete_q.next, + struct svc_rdma_op_ctxt, + dto_q); + list_del_init(&ctxt->dto_q); + spin_unlock_bh(&rdma_xprt->sc_rq_dto_lock); + return rdma_read_complete(rqstp, ctxt); + } else if (!list_empty(&rdma_xprt->sc_rq_dto_q)) { + ctxt = list_entry(rdma_xprt->sc_rq_dto_q.next, + struct svc_rdma_op_ctxt, + dto_q); + list_del_init(&ctxt->dto_q); + } else { + atomic_inc(&rdma_stat_rq_starve); + clear_bit(XPT_DATA, &xprt->xpt_flags); + ctxt = NULL; + } + spin_unlock_bh(&rdma_xprt->sc_rq_dto_lock); + if (!ctxt) { + /* This is the EAGAIN path. The svc_recv routine will + * return -EAGAIN, the nfsd thread will go to call into + * svc_recv again and we shouldn't be on the active + * transport list + */ + if (test_bit(XPT_CLOSE, &xprt->xpt_flags)) + goto close_out; + + goto out; + } + dprintk("svcrdma: processing ctxt=%p on xprt=%p, rqstp=%p, status=%d\n", + ctxt, rdma_xprt, rqstp, ctxt->wc_status); + BUG_ON(ctxt->wc_status != IB_WC_SUCCESS); + atomic_inc(&rdma_stat_recv); + + /* Build up the XDR from the receive buffers. */ + rdma_build_arg_xdr(rqstp, ctxt, ctxt->byte_len); + + /* Decode the RDMA header. */ + len = svc_rdma_xdr_decode_req(&rmsgp, rqstp); + rqstp->rq_xprt_hlen = len; + + /* If the request is invalid, reply with an error */ + if (len < 0) { + if (len == -ENOSYS) + svc_rdma_send_error(rdma_xprt, rmsgp, ERR_VERS); + goto close_out; + } + + /* Read read-list data. */ + ret = rdma_read_chunks(rdma_xprt, rmsgp, rqstp, ctxt); + if (ret > 0) { + /* read-list posted, defer until data received from client. */ + goto defer; + } else if (ret < 0) { + /* Post of read-list failed, free context. */ + svc_rdma_put_context(ctxt, 1); + return 0; + } + + ret = rqstp->rq_arg.head[0].iov_len + + rqstp->rq_arg.page_len + + rqstp->rq_arg.tail[0].iov_len; + svc_rdma_put_context(ctxt, 0); + out: + dprintk("svcrdma: ret = %d, rq_arg.len =%d, " + "rq_arg.head[0].iov_base=%p, rq_arg.head[0].iov_len = %zd\n", + ret, rqstp->rq_arg.len, + rqstp->rq_arg.head[0].iov_base, + rqstp->rq_arg.head[0].iov_len); + rqstp->rq_prot = IPPROTO_MAX; + svc_xprt_copy_addrs(rqstp, xprt); + return ret; + + close_out: + if (ctxt) + svc_rdma_put_context(ctxt, 1); + dprintk("svcrdma: transport %p is closing\n", xprt); + /* + * Set the close bit and enqueue it. svc_recv will see the + * close bit and call svc_xprt_delete + */ + set_bit(XPT_CLOSE, &xprt->xpt_flags); +defer: + return 0; +} diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c new file mode 100644 index 0000000..9f1b506 --- /dev/null +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -0,0 +1,554 @@ +/* + * Copyright (c) 2014 Open Grid Computing, Inc. All rights reserved. + * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the BSD-type + * license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * Neither the name of the Network Appliance, Inc. nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Author: Tom Tucker + */ + +#include +#include +#include +#include +#include +#include +#include + +#define RPCDBG_FACILITY RPCDBG_SVCXPRT + +static int map_xdr(struct svcxprt_rdma *xprt, + struct xdr_buf *xdr, + struct svc_rdma_req_map *vec) +{ + int sge_no; + u32 sge_bytes; + u32 page_bytes; + u32 page_off; + int page_no; + + BUG_ON(xdr->len != + (xdr->head[0].iov_len + xdr->page_len + xdr->tail[0].iov_len)); + + /* Skip the first sge, this is for the RPCRDMA header */ + sge_no = 1; + + /* Head SGE */ + vec->sge[sge_no].iov_base = xdr->head[0].iov_base; + vec->sge[sge_no].iov_len = xdr->head[0].iov_len; + sge_no++; + + /* pages SGE */ + page_no = 0; + page_bytes = xdr->page_len; + page_off = xdr->page_base; + while (page_bytes) { + vec->sge[sge_no].iov_base = + page_address(xdr->pages[page_no]) + page_off; + sge_bytes = min_t(u32, page_bytes, (PAGE_SIZE - page_off)); + page_bytes -= sge_bytes; + vec->sge[sge_no].iov_len = sge_bytes; + + sge_no++; + page_no++; + page_off = 0; /* reset for next time through loop */ + } + + /* Tail SGE */ + if (xdr->tail[0].iov_len) { + vec->sge[sge_no].iov_base = xdr->tail[0].iov_base; + vec->sge[sge_no].iov_len = xdr->tail[0].iov_len; + sge_no++; + } + + dprintk("svcrdma: map_xdr: sge_no %d page_no %d " + "page_base %u page_len %u head_len %zu tail_len %zu\n", + sge_no, page_no, xdr->page_base, xdr->page_len, + xdr->head[0].iov_len, xdr->tail[0].iov_len); + + vec->count = sge_no; + return 0; +} + +static dma_addr_t dma_map_xdr(struct svcxprt_rdma *xprt, + struct xdr_buf *xdr, + u32 xdr_off, size_t len, int dir) +{ + struct page *page; + dma_addr_t dma_addr; + if (xdr_off < xdr->head[0].iov_len) { + /* This offset is in the head */ + xdr_off += (unsigned long)xdr->head[0].iov_base & ~PAGE_MASK; + page = virt_to_page(xdr->head[0].iov_base); + } else { + xdr_off -= xdr->head[0].iov_len; + if (xdr_off < xdr->page_len) { + /* This offset is in the page list */ + xdr_off += xdr->page_base; + page = xdr->pages[xdr_off >> PAGE_SHIFT]; + xdr_off &= ~PAGE_MASK; + } else { + /* This offset is in the tail */ + xdr_off -= xdr->page_len; + xdr_off += (unsigned long) + xdr->tail[0].iov_base & ~PAGE_MASK; + page = virt_to_page(xdr->tail[0].iov_base); + } + } + dma_addr = ib_dma_map_page(xprt->sc_cm_id->device, page, xdr_off, + min_t(size_t, PAGE_SIZE, len), dir); + return dma_addr; +} + +/* Assumptions: + * - The specified write_len can be represented in sc_max_sge * PAGE_SIZE + */ +static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp, + u32 rmr, u64 to, + u32 xdr_off, int write_len, + struct svc_rdma_req_map *vec) +{ + struct ib_send_wr write_wr; + struct ib_sge *sge; + int xdr_sge_no; + int sge_no; + int sge_bytes; + int sge_off; + int bc; + struct svc_rdma_op_ctxt *ctxt; + + BUG_ON(vec->count > RPCSVC_MAXPAGES); + dprintk("svcrdma: RDMA_WRITE rmr=%x, to=%llx, xdr_off=%d, " + "write_len=%d, vec->sge=%p, vec->count=%lu\n", + rmr, (unsigned long long)to, xdr_off, + write_len, vec->sge, vec->count); + + ctxt = svc_rdma_get_context(xprt); + ctxt->direction = DMA_TO_DEVICE; + sge = ctxt->sge; + + /* Find the SGE associated with xdr_off */ + for (bc = xdr_off, xdr_sge_no = 1; bc && xdr_sge_no < vec->count; + xdr_sge_no++) { + if (vec->sge[xdr_sge_no].iov_len > bc) + break; + bc -= vec->sge[xdr_sge_no].iov_len; + } + + sge_off = bc; + bc = write_len; + sge_no = 0; + + /* Copy the remaining SGE */ + while (bc != 0) { + sge_bytes = min_t(size_t, + bc, vec->sge[xdr_sge_no].iov_len-sge_off); + sge[sge_no].length = sge_bytes; + sge[sge_no].addr = + dma_map_xdr(xprt, &rqstp->rq_res, xdr_off, + sge_bytes, DMA_TO_DEVICE); + xdr_off += sge_bytes; + if (ib_dma_mapping_error(xprt->sc_cm_id->device, + sge[sge_no].addr)) + goto err; + atomic_inc(&xprt->sc_dma_used); + sge[sge_no].lkey = xprt->sc_dma_lkey; + ctxt->count++; + sge_off = 0; + sge_no++; + xdr_sge_no++; + BUG_ON(xdr_sge_no > vec->count); + bc -= sge_bytes; + if (sge_no == xprt->sc_max_sge) + break; + } + + /* Prepare WRITE WR */ + memset(&write_wr, 0, sizeof write_wr); + ctxt->wr_op = IB_WR_RDMA_WRITE; + write_wr.wr_id = (unsigned long)ctxt; + write_wr.sg_list = &sge[0]; + write_wr.num_sge = sge_no; + write_wr.opcode = IB_WR_RDMA_WRITE; + write_wr.send_flags = IB_SEND_SIGNALED; + write_wr.wr.rdma.rkey = rmr; + write_wr.wr.rdma.remote_addr = to; + + /* Post It */ + atomic_inc(&rdma_stat_write); + if (svc_rdma_send(xprt, &write_wr)) + goto err; + return write_len - bc; + err: + svc_rdma_unmap_dma(ctxt); + svc_rdma_put_context(ctxt, 0); + /* Fatal error, close transport */ + return -EIO; +} + +static int send_write_chunks(struct svcxprt_rdma *xprt, + struct rpcrdma_msg *rdma_argp, + struct rpcrdma_msg *rdma_resp, + struct svc_rqst *rqstp, + struct svc_rdma_req_map *vec) +{ + u32 xfer_len = rqstp->rq_res.page_len + rqstp->rq_res.tail[0].iov_len; + int write_len; + u32 xdr_off; + int chunk_off; + int chunk_no; + struct rpcrdma_write_array *arg_ary; + struct rpcrdma_write_array *res_ary; + int ret; + + arg_ary = svc_rdma_get_write_array(rdma_argp); + if (!arg_ary) + return 0; + res_ary = (struct rpcrdma_write_array *) + &rdma_resp->rm_body.rm_chunks[1]; + + /* Write chunks start at the pagelist */ + for (xdr_off = rqstp->rq_res.head[0].iov_len, chunk_no = 0; + xfer_len && chunk_no < arg_ary->wc_nchunks; + chunk_no++) { + struct rpcrdma_segment *arg_ch; + u64 rs_offset; + + arg_ch = &arg_ary->wc_array[chunk_no].wc_target; + write_len = min(xfer_len, ntohl(arg_ch->rs_length)); + + /* Prepare the response chunk given the length actually + * written */ + xdr_decode_hyper((__be32 *)&arg_ch->rs_offset, &rs_offset); + svc_rdma_xdr_encode_array_chunk(res_ary, chunk_no, + arg_ch->rs_handle, + arg_ch->rs_offset, + write_len); + chunk_off = 0; + while (write_len) { + ret = send_write(xprt, rqstp, + ntohl(arg_ch->rs_handle), + rs_offset + chunk_off, + xdr_off, + write_len, + vec); + if (ret <= 0) { + dprintk("svcrdma: RDMA_WRITE failed, ret=%d\n", + ret); + return -EIO; + } + chunk_off += ret; + xdr_off += ret; + xfer_len -= ret; + write_len -= ret; + } + } + /* Update the req with the number of chunks actually used */ + svc_rdma_xdr_encode_write_list(rdma_resp, chunk_no); + + return rqstp->rq_res.page_len + rqstp->rq_res.tail[0].iov_len; +} + +static int send_reply_chunks(struct svcxprt_rdma *xprt, + struct rpcrdma_msg *rdma_argp, + struct rpcrdma_msg *rdma_resp, + struct svc_rqst *rqstp, + struct svc_rdma_req_map *vec) +{ + u32 xfer_len = rqstp->rq_res.len; + int write_len; + u32 xdr_off; + int chunk_no; + int chunk_off; + int nchunks; + struct rpcrdma_segment *ch; + struct rpcrdma_write_array *arg_ary; + struct rpcrdma_write_array *res_ary; + int ret; + + arg_ary = svc_rdma_get_reply_array(rdma_argp); + if (!arg_ary) + return 0; + /* XXX: need to fix when reply lists occur with read-list and or + * write-list */ + res_ary = (struct rpcrdma_write_array *) + &rdma_resp->rm_body.rm_chunks[2]; + + /* xdr offset starts at RPC message */ + nchunks = ntohl(arg_ary->wc_nchunks); + for (xdr_off = 0, chunk_no = 0; + xfer_len && chunk_no < nchunks; + chunk_no++) { + u64 rs_offset; + ch = &arg_ary->wc_array[chunk_no].wc_target; + write_len = min(xfer_len, htonl(ch->rs_length)); + + /* Prepare the reply chunk given the length actually + * written */ + xdr_decode_hyper((__be32 *)&ch->rs_offset, &rs_offset); + svc_rdma_xdr_encode_array_chunk(res_ary, chunk_no, + ch->rs_handle, ch->rs_offset, + write_len); + chunk_off = 0; + while (write_len) { + ret = send_write(xprt, rqstp, + ntohl(ch->rs_handle), + rs_offset + chunk_off, + xdr_off, + write_len, + vec); + if (ret <= 0) { + dprintk("svcrdma: RDMA_WRITE failed, ret=%d\n", + ret); + return -EIO; + } + chunk_off += ret; + xdr_off += ret; + xfer_len -= ret; + write_len -= ret; + } + } + /* Update the req with the number of chunks actually used */ + svc_rdma_xdr_encode_reply_array(res_ary, chunk_no); + + return rqstp->rq_res.len; +} + +/* This function prepares the portion of the RPCRDMA message to be + * sent in the RDMA_SEND. This function is called after data sent via + * RDMA has already been transmitted. There are three cases: + * - The RPCRDMA header, RPC header, and payload are all sent in a + * single RDMA_SEND. This is the "inline" case. + * - The RPCRDMA header and some portion of the RPC header and data + * are sent via this RDMA_SEND and another portion of the data is + * sent via RDMA. + * - The RPCRDMA header [NOMSG] is sent in this RDMA_SEND and the RPC + * header and data are all transmitted via RDMA. + * In all three cases, this function prepares the RPCRDMA header in + * sge[0], the 'type' parameter indicates the type to place in the + * RPCRDMA header, and the 'byte_count' field indicates how much of + * the XDR to include in this RDMA_SEND. NB: The offset of the payload + * to send is zero in the XDR. + */ +static int send_reply(struct svcxprt_rdma *rdma, + struct svc_rqst *rqstp, + struct page *page, + struct rpcrdma_msg *rdma_resp, + struct svc_rdma_op_ctxt *ctxt, + struct svc_rdma_req_map *vec, + int byte_count) +{ + struct ib_send_wr send_wr; + int sge_no; + int sge_bytes; + int page_no; + int pages; + int ret; + + /* Post a recv buffer to handle another request. */ + ret = svc_rdma_post_recv(rdma); + if (ret) { + printk(KERN_INFO + "svcrdma: could not post a receive buffer, err=%d." + "Closing transport %p.\n", ret, rdma); + set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags); + svc_rdma_put_context(ctxt, 0); + return -ENOTCONN; + } + + /* Prepare the context */ + ctxt->pages[0] = page; + ctxt->count = 1; + + /* Prepare the SGE for the RPCRDMA Header */ + ctxt->sge[0].lkey = rdma->sc_dma_lkey; + ctxt->sge[0].length = svc_rdma_xdr_get_reply_hdr_len(rdma_resp); + ctxt->sge[0].addr = + ib_dma_map_page(rdma->sc_cm_id->device, page, 0, + ctxt->sge[0].length, DMA_TO_DEVICE); + if (ib_dma_mapping_error(rdma->sc_cm_id->device, ctxt->sge[0].addr)) + goto err; + atomic_inc(&rdma->sc_dma_used); + + ctxt->direction = DMA_TO_DEVICE; + + /* Map the payload indicated by 'byte_count' */ + for (sge_no = 1; byte_count && sge_no < vec->count; sge_no++) { + int xdr_off = 0; + sge_bytes = min_t(size_t, vec->sge[sge_no].iov_len, byte_count); + byte_count -= sge_bytes; + ctxt->sge[sge_no].addr = + dma_map_xdr(rdma, &rqstp->rq_res, xdr_off, + sge_bytes, DMA_TO_DEVICE); + xdr_off += sge_bytes; + if (ib_dma_mapping_error(rdma->sc_cm_id->device, + ctxt->sge[sge_no].addr)) + goto err; + atomic_inc(&rdma->sc_dma_used); + ctxt->sge[sge_no].lkey = rdma->sc_dma_lkey; + ctxt->sge[sge_no].length = sge_bytes; + } + BUG_ON(byte_count != 0); + + /* Save all respages in the ctxt and remove them from the + * respages array. They are our pages until the I/O + * completes. + */ + pages = rqstp->rq_next_page - rqstp->rq_respages; + for (page_no = 0; page_no < pages; page_no++) { + ctxt->pages[page_no+1] = rqstp->rq_respages[page_no]; + ctxt->count++; + rqstp->rq_respages[page_no] = NULL; + /* + * If there are more pages than SGE, terminate SGE + * list so that svc_rdma_unmap_dma doesn't attempt to + * unmap garbage. + */ + if (page_no+1 >= sge_no) + ctxt->sge[page_no+1].length = 0; + } + rqstp->rq_next_page = rqstp->rq_respages + 1; + + BUG_ON(sge_no > rdma->sc_max_sge); + memset(&send_wr, 0, sizeof send_wr); + ctxt->wr_op = IB_WR_SEND; + send_wr.wr_id = (unsigned long)ctxt; + send_wr.sg_list = ctxt->sge; + send_wr.num_sge = sge_no; + send_wr.opcode = IB_WR_SEND; + send_wr.send_flags = IB_SEND_SIGNALED; + + ret = svc_rdma_send(rdma, &send_wr); + if (ret) + goto err; + + return 0; + + err: + svc_rdma_unmap_dma(ctxt); + svc_rdma_put_context(ctxt, 1); + return -EIO; +} + +void svc_rdma_prep_reply_hdr(struct svc_rqst *rqstp) +{ +} + +/* + * Return the start of an xdr buffer. + */ +static void *xdr_start(struct xdr_buf *xdr) +{ + return xdr->head[0].iov_base - + (xdr->len - + xdr->page_len - + xdr->tail[0].iov_len - + xdr->head[0].iov_len); +} + +int svc_rdma_sendto(struct svc_rqst *rqstp) +{ + struct svc_xprt *xprt = rqstp->rq_xprt; + struct svcxprt_rdma *rdma = + container_of(xprt, struct svcxprt_rdma, sc_xprt); + struct rpcrdma_msg *rdma_argp; + struct rpcrdma_msg *rdma_resp; + struct rpcrdma_write_array *reply_ary; + enum rpcrdma_proc reply_type; + int ret; + int inline_bytes; + struct page *res_page; + struct svc_rdma_op_ctxt *ctxt; + struct svc_rdma_req_map *vec; + + dprintk("svcrdma: sending response for rqstp=%p\n", rqstp); + + /* Get the RDMA request header. */ + rdma_argp = xdr_start(&rqstp->rq_arg); + + /* Build an req vec for the XDR */ + ctxt = svc_rdma_get_context(rdma); + ctxt->direction = DMA_TO_DEVICE; + vec = svc_rdma_get_req_map(); + ret = map_xdr(rdma, &rqstp->rq_res, vec); + if (ret) + goto err0; + inline_bytes = rqstp->rq_res.len; + + /* Create the RDMA response header */ + res_page = svc_rdma_get_page(); + rdma_resp = page_address(res_page); + reply_ary = svc_rdma_get_reply_array(rdma_argp); + if (reply_ary) + reply_type = RDMA_NOMSG; + else + reply_type = RDMA_MSG; + svc_rdma_xdr_encode_reply_header(rdma, rdma_argp, + rdma_resp, reply_type); + + /* Send any write-chunk data and build resp write-list */ + ret = send_write_chunks(rdma, rdma_argp, rdma_resp, + rqstp, vec); + if (ret < 0) { + printk(KERN_ERR "svcrdma: failed to send write chunks, rc=%d\n", + ret); + goto err1; + } + inline_bytes -= ret; + + /* Send any reply-list data and update resp reply-list */ + ret = send_reply_chunks(rdma, rdma_argp, rdma_resp, + rqstp, vec); + if (ret < 0) { + printk(KERN_ERR "svcrdma: failed to send reply chunks, rc=%d\n", + ret); + goto err1; + } + inline_bytes -= ret; + + ret = send_reply(rdma, rqstp, res_page, rdma_resp, ctxt, vec, + inline_bytes); + svc_rdma_put_req_map(vec); + dprintk("svcrdma: send_reply returns %d\n", ret); + return ret; + + err1: + put_page(res_page); + err0: + svc_rdma_put_req_map(vec); + svc_rdma_put_context(ctxt, 0); + return ret; +} diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c new file mode 100644 index 0000000..4e61880 --- /dev/null +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -0,0 +1,1355 @@ +/* + * Copyright (c) 2014 Open Grid Computing, Inc. All rights reserved. + * Copyright (c) 2005-2007 Network Appliance, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the BSD-type + * license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * Neither the name of the Network Appliance, Inc. nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Author: Tom Tucker + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "xprt_rdma.h" + +#define RPCDBG_FACILITY RPCDBG_SVCXPRT + +static struct svc_xprt *svc_rdma_create(struct svc_serv *serv, + struct net *net, + struct sockaddr *sa, int salen, + int flags); +static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt); +static void svc_rdma_release_rqst(struct svc_rqst *); +static void dto_tasklet_func(unsigned long data); +static void svc_rdma_detach(struct svc_xprt *xprt); +static void svc_rdma_free(struct svc_xprt *xprt); +static int svc_rdma_has_wspace(struct svc_xprt *xprt); +static int svc_rdma_secure_port(struct svc_rqst *); +static void rq_cq_reap(struct svcxprt_rdma *xprt); +static void sq_cq_reap(struct svcxprt_rdma *xprt); + +static DECLARE_TASKLET(dto_tasklet, dto_tasklet_func, 0UL); +static DEFINE_SPINLOCK(dto_lock); +static LIST_HEAD(dto_xprt_q); + +static struct svc_xprt_ops svc_rdma_ops = { + .xpo_create = svc_rdma_create, + .xpo_recvfrom = svc_rdma_recvfrom, + .xpo_sendto = svc_rdma_sendto, + .xpo_release_rqst = svc_rdma_release_rqst, + .xpo_detach = svc_rdma_detach, + .xpo_free = svc_rdma_free, + .xpo_prep_reply_hdr = svc_rdma_prep_reply_hdr, + .xpo_has_wspace = svc_rdma_has_wspace, + .xpo_accept = svc_rdma_accept, + .xpo_secure_port = svc_rdma_secure_port, +}; + +struct svc_xprt_class svc_rdma_class = { + .xcl_name = "rdma", + .xcl_owner = THIS_MODULE, + .xcl_ops = &svc_rdma_ops, + .xcl_max_payload = RPCSVC_MAXPAYLOAD_RDMA, + .xcl_ident = XPRT_TRANSPORT_RDMA, +}; + +struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt) +{ + struct svc_rdma_op_ctxt *ctxt; + + while (1) { + ctxt = kmem_cache_alloc(svc_rdma_ctxt_cachep, GFP_KERNEL); + if (ctxt) + break; + schedule_timeout_uninterruptible(msecs_to_jiffies(500)); + } + ctxt->xprt = xprt; + INIT_LIST_HEAD(&ctxt->dto_q); + ctxt->count = 0; + ctxt->frmr = NULL; + atomic_inc(&xprt->sc_ctxt_used); + return ctxt; +} + +void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt) +{ + struct svcxprt_rdma *xprt = ctxt->xprt; + int i; + for (i = 0; i < ctxt->count && ctxt->sge[i].length; i++) { + /* + * Unmap the DMA addr in the SGE if the lkey matches + * the sc_dma_lkey, otherwise, ignore it since it is + * an FRMR lkey and will be unmapped later when the + * last WR that uses it completes. + */ + if (ctxt->sge[i].lkey == xprt->sc_dma_lkey) { + atomic_dec(&xprt->sc_dma_used); + ib_dma_unmap_page(xprt->sc_cm_id->device, + ctxt->sge[i].addr, + ctxt->sge[i].length, + ctxt->direction); + } + } +} + +void svc_rdma_put_context(struct svc_rdma_op_ctxt *ctxt, int free_pages) +{ + struct svcxprt_rdma *xprt; + int i; + + BUG_ON(!ctxt); + xprt = ctxt->xprt; + if (free_pages) + for (i = 0; i < ctxt->count; i++) + put_page(ctxt->pages[i]); + + kmem_cache_free(svc_rdma_ctxt_cachep, ctxt); + atomic_dec(&xprt->sc_ctxt_used); +} + +/* + * Temporary NFS req mappings are shared across all transport + * instances. These are short lived and should be bounded by the number + * of concurrent server threads * depth of the SQ. + */ +struct svc_rdma_req_map *svc_rdma_get_req_map(void) +{ + struct svc_rdma_req_map *map; + while (1) { + map = kmem_cache_alloc(svc_rdma_map_cachep, GFP_KERNEL); + if (map) + break; + schedule_timeout_uninterruptible(msecs_to_jiffies(500)); + } + map->count = 0; + return map; +} + +void svc_rdma_put_req_map(struct svc_rdma_req_map *map) +{ + kmem_cache_free(svc_rdma_map_cachep, map); +} + +/* ib_cq event handler */ +static void cq_event_handler(struct ib_event *event, void *context) +{ + struct svc_xprt *xprt = context; + dprintk("svcrdma: received CQ event id=%d, context=%p\n", + event->event, context); + set_bit(XPT_CLOSE, &xprt->xpt_flags); +} + +/* QP event handler */ +static void qp_event_handler(struct ib_event *event, void *context) +{ + struct svc_xprt *xprt = context; + + switch (event->event) { + /* These are considered benign events */ + case IB_EVENT_PATH_MIG: + case IB_EVENT_COMM_EST: + case IB_EVENT_SQ_DRAINED: + case IB_EVENT_QP_LAST_WQE_REACHED: + dprintk("svcrdma: QP event %d received for QP=%p\n", + event->event, event->element.qp); + break; + /* These are considered fatal events */ + case IB_EVENT_PATH_MIG_ERR: + case IB_EVENT_QP_FATAL: + case IB_EVENT_QP_REQ_ERR: + case IB_EVENT_QP_ACCESS_ERR: + case IB_EVENT_DEVICE_FATAL: + default: + dprintk("svcrdma: QP ERROR event %d received for QP=%p, " + "closing transport\n", + event->event, event->element.qp); + set_bit(XPT_CLOSE, &xprt->xpt_flags); + break; + } +} + +/* + * Data Transfer Operation Tasklet + * + * Walks a list of transports with I/O pending, removing entries as + * they are added to the server's I/O pending list. Two bits indicate + * if SQ, RQ, or both have I/O pending. The dto_lock is an irqsave + * spinlock that serializes access to the transport list with the RQ + * and SQ interrupt handlers. + */ +static void dto_tasklet_func(unsigned long data) +{ + struct svcxprt_rdma *xprt; + unsigned long flags; + + spin_lock_irqsave(&dto_lock, flags); + while (!list_empty(&dto_xprt_q)) { + xprt = list_entry(dto_xprt_q.next, + struct svcxprt_rdma, sc_dto_q); + list_del_init(&xprt->sc_dto_q); + spin_unlock_irqrestore(&dto_lock, flags); + + rq_cq_reap(xprt); + sq_cq_reap(xprt); + + svc_xprt_put(&xprt->sc_xprt); + spin_lock_irqsave(&dto_lock, flags); + } + spin_unlock_irqrestore(&dto_lock, flags); +} + +/* + * Receive Queue Completion Handler + * + * Since an RQ completion handler is called on interrupt context, we + * need to defer the handling of the I/O to a tasklet + */ +static void rq_comp_handler(struct ib_cq *cq, void *cq_context) +{ + struct svcxprt_rdma *xprt = cq_context; + unsigned long flags; + + /* Guard against unconditional flush call for destroyed QP */ + if (atomic_read(&xprt->sc_xprt.xpt_ref.refcount)==0) + return; + + /* + * Set the bit regardless of whether or not it's on the list + * because it may be on the list already due to an SQ + * completion. + */ + set_bit(RDMAXPRT_RQ_PENDING, &xprt->sc_flags); + + /* + * If this transport is not already on the DTO transport queue, + * add it + */ + spin_lock_irqsave(&dto_lock, flags); + if (list_empty(&xprt->sc_dto_q)) { + svc_xprt_get(&xprt->sc_xprt); + list_add_tail(&xprt->sc_dto_q, &dto_xprt_q); + } + spin_unlock_irqrestore(&dto_lock, flags); + + /* Tasklet does all the work to avoid irqsave locks. */ + tasklet_schedule(&dto_tasklet); +} + +/* + * rq_cq_reap - Process the RQ CQ. + * + * Take all completing WC off the CQE and enqueue the associated DTO + * context on the dto_q for the transport. + * + * Note that caller must hold a transport reference. + */ +static void rq_cq_reap(struct svcxprt_rdma *xprt) +{ + int ret; + struct ib_wc wc; + struct svc_rdma_op_ctxt *ctxt = NULL; + + if (!test_and_clear_bit(RDMAXPRT_RQ_PENDING, &xprt->sc_flags)) + return; + + ib_req_notify_cq(xprt->sc_rq_cq, IB_CQ_NEXT_COMP); + atomic_inc(&rdma_stat_rq_poll); + + while ((ret = ib_poll_cq(xprt->sc_rq_cq, 1, &wc)) > 0) { + ctxt = (struct svc_rdma_op_ctxt *)(unsigned long)wc.wr_id; + ctxt->wc_status = wc.status; + ctxt->byte_len = wc.byte_len; + svc_rdma_unmap_dma(ctxt); + if (wc.status != IB_WC_SUCCESS) { + /* Close the transport */ + dprintk("svcrdma: transport closing putting ctxt %p\n", ctxt); + set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); + svc_rdma_put_context(ctxt, 1); + svc_xprt_put(&xprt->sc_xprt); + continue; + } + spin_lock_bh(&xprt->sc_rq_dto_lock); + list_add_tail(&ctxt->dto_q, &xprt->sc_rq_dto_q); + spin_unlock_bh(&xprt->sc_rq_dto_lock); + svc_xprt_put(&xprt->sc_xprt); + } + + if (ctxt) + atomic_inc(&rdma_stat_rq_prod); + + set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags); + /* + * If data arrived before established event, + * don't enqueue. This defers RPC I/O until the + * RDMA connection is complete. + */ + if (!test_bit(RDMAXPRT_CONN_PENDING, &xprt->sc_flags)) + svc_xprt_enqueue(&xprt->sc_xprt); +} + +/* + * Process a completion context + */ +static void process_context(struct svcxprt_rdma *xprt, + struct svc_rdma_op_ctxt *ctxt) +{ + svc_rdma_unmap_dma(ctxt); + + switch (ctxt->wr_op) { + case IB_WR_SEND: + BUG_ON(ctxt->frmr); + svc_rdma_put_context(ctxt, 1); + break; + + case IB_WR_RDMA_WRITE: + BUG_ON(ctxt->frmr); + svc_rdma_put_context(ctxt, 0); + break; + + case IB_WR_RDMA_READ: + case IB_WR_RDMA_READ_WITH_INV: + svc_rdma_put_frmr(xprt, ctxt->frmr); + if (test_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags)) { + struct svc_rdma_op_ctxt *read_hdr = ctxt->read_hdr; + BUG_ON(!read_hdr); + spin_lock_bh(&xprt->sc_rq_dto_lock); + set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags); + list_add_tail(&read_hdr->dto_q, + &xprt->sc_read_complete_q); + spin_unlock_bh(&xprt->sc_rq_dto_lock); + svc_xprt_enqueue(&xprt->sc_xprt); + } + svc_rdma_put_context(ctxt, 0); + break; + + default: + BUG_ON(1); + printk(KERN_ERR "svcrdma: unexpected completion type, " + "opcode=%d\n", + ctxt->wr_op); + break; + } +} + +/* + * Send Queue Completion Handler - potentially called on interrupt context. + * + * Note that caller must hold a transport reference. + */ +static void sq_cq_reap(struct svcxprt_rdma *xprt) +{ + struct svc_rdma_op_ctxt *ctxt = NULL; + struct ib_wc wc_a[6]; + struct ib_wc *wc; + struct ib_cq *cq = xprt->sc_sq_cq; + int ret; + + memset(wc_a, 0, sizeof(wc_a)); + + if (!test_and_clear_bit(RDMAXPRT_SQ_PENDING, &xprt->sc_flags)) + return; + + ib_req_notify_cq(xprt->sc_sq_cq, IB_CQ_NEXT_COMP); + atomic_inc(&rdma_stat_sq_poll); + while ((ret = ib_poll_cq(cq, ARRAY_SIZE(wc_a), wc_a)) > 0) { + int i; + + for (i = 0; i < ret; i++) { + wc = &wc_a[i]; + if (wc->status != IB_WC_SUCCESS) { + dprintk("svcrdma: sq wc err status %d\n", + wc->status); + + /* Close the transport */ + set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); + } + + /* Decrement used SQ WR count */ + atomic_dec(&xprt->sc_sq_count); + wake_up(&xprt->sc_send_wait); + + ctxt = (struct svc_rdma_op_ctxt *) + (unsigned long)wc->wr_id; + if (ctxt) + process_context(xprt, ctxt); + + svc_xprt_put(&xprt->sc_xprt); + } + } + + if (ctxt) + atomic_inc(&rdma_stat_sq_prod); +} + +static void sq_comp_handler(struct ib_cq *cq, void *cq_context) +{ + struct svcxprt_rdma *xprt = cq_context; + unsigned long flags; + + /* Guard against unconditional flush call for destroyed QP */ + if (atomic_read(&xprt->sc_xprt.xpt_ref.refcount)==0) + return; + + /* + * Set the bit regardless of whether or not it's on the list + * because it may be on the list already due to an RQ + * completion. + */ + set_bit(RDMAXPRT_SQ_PENDING, &xprt->sc_flags); + + /* + * If this transport is not already on the DTO transport queue, + * add it + */ + spin_lock_irqsave(&dto_lock, flags); + if (list_empty(&xprt->sc_dto_q)) { + svc_xprt_get(&xprt->sc_xprt); + list_add_tail(&xprt->sc_dto_q, &dto_xprt_q); + } + spin_unlock_irqrestore(&dto_lock, flags); + + /* Tasklet does all the work to avoid irqsave locks. */ + tasklet_schedule(&dto_tasklet); +} + +static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv, + int listener) +{ + struct svcxprt_rdma *cma_xprt = kzalloc(sizeof *cma_xprt, GFP_KERNEL); + + if (!cma_xprt) + return NULL; + svc_xprt_init(&init_net, &svc_rdma_class, &cma_xprt->sc_xprt, serv); + INIT_LIST_HEAD(&cma_xprt->sc_accept_q); + INIT_LIST_HEAD(&cma_xprt->sc_dto_q); + INIT_LIST_HEAD(&cma_xprt->sc_rq_dto_q); + INIT_LIST_HEAD(&cma_xprt->sc_read_complete_q); + INIT_LIST_HEAD(&cma_xprt->sc_frmr_q); + init_waitqueue_head(&cma_xprt->sc_send_wait); + + spin_lock_init(&cma_xprt->sc_lock); + spin_lock_init(&cma_xprt->sc_rq_dto_lock); + spin_lock_init(&cma_xprt->sc_frmr_q_lock); + + cma_xprt->sc_ord = svcrdma_ord; + + cma_xprt->sc_max_req_size = svcrdma_max_req_size; + cma_xprt->sc_max_requests = svcrdma_max_requests; + cma_xprt->sc_sq_depth = svcrdma_max_requests * RPCRDMA_SQ_DEPTH_MULT; + atomic_set(&cma_xprt->sc_sq_count, 0); + atomic_set(&cma_xprt->sc_ctxt_used, 0); + + if (listener) + set_bit(XPT_LISTENER, &cma_xprt->sc_xprt.xpt_flags); + + return cma_xprt; +} + +struct page *svc_rdma_get_page(void) +{ + struct page *page; + + while ((page = alloc_page(GFP_KERNEL)) == NULL) { + /* If we can't get memory, wait a bit and try again */ + printk(KERN_INFO "svcrdma: out of memory...retrying in 1s\n"); + schedule_timeout_uninterruptible(msecs_to_jiffies(1000)); + } + return page; +} + +int svc_rdma_post_recv(struct svcxprt_rdma *xprt) +{ + struct ib_recv_wr recv_wr, *bad_recv_wr; + struct svc_rdma_op_ctxt *ctxt; + struct page *page; + dma_addr_t pa; + int sge_no; + int buflen; + int ret; + + ctxt = svc_rdma_get_context(xprt); + buflen = 0; + ctxt->direction = DMA_FROM_DEVICE; + for (sge_no = 0; buflen < xprt->sc_max_req_size; sge_no++) { + BUG_ON(sge_no >= xprt->sc_max_sge); + page = svc_rdma_get_page(); + ctxt->pages[sge_no] = page; + pa = ib_dma_map_page(xprt->sc_cm_id->device, + page, 0, PAGE_SIZE, + DMA_FROM_DEVICE); + if (ib_dma_mapping_error(xprt->sc_cm_id->device, pa)) + goto err_put_ctxt; + atomic_inc(&xprt->sc_dma_used); + ctxt->sge[sge_no].addr = pa; + ctxt->sge[sge_no].length = PAGE_SIZE; + ctxt->sge[sge_no].lkey = xprt->sc_dma_lkey; + ctxt->count = sge_no + 1; + buflen += PAGE_SIZE; + } + recv_wr.next = NULL; + recv_wr.sg_list = &ctxt->sge[0]; + recv_wr.num_sge = ctxt->count; + recv_wr.wr_id = (u64)(unsigned long)ctxt; + + svc_xprt_get(&xprt->sc_xprt); + ret = ib_post_recv(xprt->sc_qp, &recv_wr, &bad_recv_wr); + if (ret) { + svc_rdma_unmap_dma(ctxt); + svc_rdma_put_context(ctxt, 1); + svc_xprt_put(&xprt->sc_xprt); + } + return ret; + + err_put_ctxt: + svc_rdma_unmap_dma(ctxt); + svc_rdma_put_context(ctxt, 1); + return -ENOMEM; +} + +/* + * This function handles the CONNECT_REQUEST event on a listening + * endpoint. It is passed the cma_id for the _new_ connection. The context in + * this cma_id is inherited from the listening cma_id and is the svc_xprt + * structure for the listening endpoint. + * + * This function creates a new xprt for the new connection and enqueues it on + * the accept queue for the listent xprt. When the listen thread is kicked, it + * will call the recvfrom method on the listen xprt which will accept the new + * connection. + */ +static void handle_connect_req(struct rdma_cm_id *new_cma_id, size_t client_ird) +{ + struct svcxprt_rdma *listen_xprt = new_cma_id->context; + struct svcxprt_rdma *newxprt; + struct sockaddr *sa; + + /* Create a new transport */ + newxprt = rdma_create_xprt(listen_xprt->sc_xprt.xpt_server, 0); + if (!newxprt) { + dprintk("svcrdma: failed to create new transport\n"); + return; + } + newxprt->sc_cm_id = new_cma_id; + new_cma_id->context = newxprt; + dprintk("svcrdma: Creating newxprt=%p, cm_id=%p, listenxprt=%p\n", + newxprt, newxprt->sc_cm_id, listen_xprt); + + /* Save client advertised inbound read limit for use later in accept. */ + newxprt->sc_ord = client_ird; + + /* Set the local and remote addresses in the transport */ + sa = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.dst_addr; + svc_xprt_set_remote(&newxprt->sc_xprt, sa, svc_addr_len(sa)); + sa = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.src_addr; + svc_xprt_set_local(&newxprt->sc_xprt, sa, svc_addr_len(sa)); + + /* + * Enqueue the new transport on the accept queue of the listening + * transport + */ + spin_lock_bh(&listen_xprt->sc_lock); + list_add_tail(&newxprt->sc_accept_q, &listen_xprt->sc_accept_q); + spin_unlock_bh(&listen_xprt->sc_lock); + + set_bit(XPT_CONN, &listen_xprt->sc_xprt.xpt_flags); + svc_xprt_enqueue(&listen_xprt->sc_xprt); +} + +/* + * Handles events generated on the listening endpoint. These events will be + * either be incoming connect requests or adapter removal events. + */ +static int rdma_listen_handler(struct rdma_cm_id *cma_id, + struct rdma_cm_event *event) +{ + struct svcxprt_rdma *xprt = cma_id->context; + int ret = 0; + + switch (event->event) { + case RDMA_CM_EVENT_CONNECT_REQUEST: + dprintk("svcrdma: Connect request on cma_id=%p, xprt = %p, " + "event=%d\n", cma_id, cma_id->context, event->event); + handle_connect_req(cma_id, + event->param.conn.initiator_depth); + break; + + case RDMA_CM_EVENT_ESTABLISHED: + /* Accept complete */ + dprintk("svcrdma: Connection completed on LISTEN xprt=%p, " + "cm_id=%p\n", xprt, cma_id); + break; + + case RDMA_CM_EVENT_DEVICE_REMOVAL: + dprintk("svcrdma: Device removal xprt=%p, cm_id=%p\n", + xprt, cma_id); + if (xprt) + set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); + break; + + default: + dprintk("svcrdma: Unexpected event on listening endpoint %p, " + "event=%d\n", cma_id, event->event); + break; + } + + return ret; +} + +static int rdma_cma_handler(struct rdma_cm_id *cma_id, + struct rdma_cm_event *event) +{ + struct svc_xprt *xprt = cma_id->context; + struct svcxprt_rdma *rdma = + container_of(xprt, struct svcxprt_rdma, sc_xprt); + switch (event->event) { + case RDMA_CM_EVENT_ESTABLISHED: + /* Accept complete */ + svc_xprt_get(xprt); + dprintk("svcrdma: Connection completed on DTO xprt=%p, " + "cm_id=%p\n", xprt, cma_id); + clear_bit(RDMAXPRT_CONN_PENDING, &rdma->sc_flags); + svc_xprt_enqueue(xprt); + break; + case RDMA_CM_EVENT_DISCONNECTED: + dprintk("svcrdma: Disconnect on DTO xprt=%p, cm_id=%p\n", + xprt, cma_id); + if (xprt) { + set_bit(XPT_CLOSE, &xprt->xpt_flags); + svc_xprt_enqueue(xprt); + svc_xprt_put(xprt); + } + break; + case RDMA_CM_EVENT_DEVICE_REMOVAL: + dprintk("svcrdma: Device removal cma_id=%p, xprt = %p, " + "event=%d\n", cma_id, xprt, event->event); + if (xprt) { + set_bit(XPT_CLOSE, &xprt->xpt_flags); + svc_xprt_enqueue(xprt); + } + break; + default: + dprintk("svcrdma: Unexpected event on DTO endpoint %p, " + "event=%d\n", cma_id, event->event); + break; + } + return 0; +} + +/* + * Create a listening RDMA service endpoint. + */ +static struct svc_xprt *svc_rdma_create(struct svc_serv *serv, + struct net *net, + struct sockaddr *sa, int salen, + int flags) +{ + struct rdma_cm_id *listen_id; + struct svcxprt_rdma *cma_xprt; + struct svc_xprt *xprt; + int ret; + + dprintk("svcrdma: Creating RDMA socket\n"); + if (sa->sa_family != AF_INET) { + dprintk("svcrdma: Address family %d is not supported.\n", sa->sa_family); + return ERR_PTR(-EAFNOSUPPORT); + } + cma_xprt = rdma_create_xprt(serv, 1); + if (!cma_xprt) + return ERR_PTR(-ENOMEM); + xprt = &cma_xprt->sc_xprt; + + listen_id = rdma_create_id(rdma_listen_handler, cma_xprt, RDMA_PS_TCP, + IB_QPT_RC); + if (IS_ERR(listen_id)) { + ret = PTR_ERR(listen_id); + dprintk("svcrdma: rdma_create_id failed = %d\n", ret); + goto err0; + } + + ret = rdma_bind_addr(listen_id, sa); + if (ret) { + dprintk("svcrdma: rdma_bind_addr failed = %d\n", ret); + goto err1; + } + cma_xprt->sc_cm_id = listen_id; + + ret = rdma_listen(listen_id, RPCRDMA_LISTEN_BACKLOG); + if (ret) { + dprintk("svcrdma: rdma_listen failed = %d\n", ret); + goto err1; + } + + /* + * We need to use the address from the cm_id in case the + * caller specified 0 for the port number. + */ + sa = (struct sockaddr *)&cma_xprt->sc_cm_id->route.addr.src_addr; + svc_xprt_set_local(&cma_xprt->sc_xprt, sa, salen); + + return &cma_xprt->sc_xprt; + + err1: + rdma_destroy_id(listen_id); + err0: + kfree(cma_xprt); + return ERR_PTR(ret); +} + +static struct svc_rdma_fastreg_mr *rdma_alloc_frmr(struct svcxprt_rdma *xprt) +{ + struct ib_mr *mr; + struct ib_fast_reg_page_list *pl; + struct svc_rdma_fastreg_mr *frmr; + + frmr = kmalloc(sizeof(*frmr), GFP_KERNEL); + if (!frmr) + goto err; + + mr = ib_alloc_fast_reg_mr(xprt->sc_pd, RPCSVC_MAXPAGES); + if (IS_ERR(mr)) + goto err_free_frmr; + + pl = ib_alloc_fast_reg_page_list(xprt->sc_cm_id->device, + RPCSVC_MAXPAGES); + if (IS_ERR(pl)) + goto err_free_mr; + + frmr->mr = mr; + frmr->page_list = pl; + INIT_LIST_HEAD(&frmr->frmr_list); + return frmr; + + err_free_mr: + ib_dereg_mr(mr); + err_free_frmr: + kfree(frmr); + err: + return ERR_PTR(-ENOMEM); +} + +static void rdma_dealloc_frmr_q(struct svcxprt_rdma *xprt) +{ + struct svc_rdma_fastreg_mr *frmr; + + while (!list_empty(&xprt->sc_frmr_q)) { + frmr = list_entry(xprt->sc_frmr_q.next, + struct svc_rdma_fastreg_mr, frmr_list); + list_del_init(&frmr->frmr_list); + ib_dereg_mr(frmr->mr); + ib_free_fast_reg_page_list(frmr->page_list); + kfree(frmr); + } +} + +struct svc_rdma_fastreg_mr *svc_rdma_get_frmr(struct svcxprt_rdma *rdma) +{ + struct svc_rdma_fastreg_mr *frmr = NULL; + + spin_lock_bh(&rdma->sc_frmr_q_lock); + if (!list_empty(&rdma->sc_frmr_q)) { + frmr = list_entry(rdma->sc_frmr_q.next, + struct svc_rdma_fastreg_mr, frmr_list); + list_del_init(&frmr->frmr_list); + frmr->map_len = 0; + frmr->page_list_len = 0; + } + spin_unlock_bh(&rdma->sc_frmr_q_lock); + if (frmr) + return frmr; + + return rdma_alloc_frmr(rdma); +} + +static void frmr_unmap_dma(struct svcxprt_rdma *xprt, + struct svc_rdma_fastreg_mr *frmr) +{ + int page_no; + for (page_no = 0; page_no < frmr->page_list_len; page_no++) { + dma_addr_t addr = frmr->page_list->page_list[page_no]; + if (ib_dma_mapping_error(frmr->mr->device, addr)) + continue; + atomic_dec(&xprt->sc_dma_used); + ib_dma_unmap_page(frmr->mr->device, addr, PAGE_SIZE, + frmr->direction); + } +} + +void svc_rdma_put_frmr(struct svcxprt_rdma *rdma, + struct svc_rdma_fastreg_mr *frmr) +{ + if (frmr) { + frmr_unmap_dma(rdma, frmr); + spin_lock_bh(&rdma->sc_frmr_q_lock); + BUG_ON(!list_empty(&frmr->frmr_list)); + list_add(&frmr->frmr_list, &rdma->sc_frmr_q); + spin_unlock_bh(&rdma->sc_frmr_q_lock); + } +} + +/* + * This is the xpo_recvfrom function for listening endpoints. Its + * purpose is to accept incoming connections. The CMA callback handler + * has already created a new transport and attached it to the new CMA + * ID. + * + * There is a queue of pending connections hung on the listening + * transport. This queue contains the new svc_xprt structure. This + * function takes svc_xprt structures off the accept_q and completes + * the connection. + */ +static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) +{ + struct svcxprt_rdma *listen_rdma; + struct svcxprt_rdma *newxprt = NULL; + struct rdma_conn_param conn_param; + struct ib_qp_init_attr qp_attr; + struct ib_device_attr devattr; + int uninitialized_var(dma_mr_acc); + int need_dma_mr; + int ret; + int i; + + listen_rdma = container_of(xprt, struct svcxprt_rdma, sc_xprt); + clear_bit(XPT_CONN, &xprt->xpt_flags); + /* Get the next entry off the accept list */ + spin_lock_bh(&listen_rdma->sc_lock); + if (!list_empty(&listen_rdma->sc_accept_q)) { + newxprt = list_entry(listen_rdma->sc_accept_q.next, + struct svcxprt_rdma, sc_accept_q); + list_del_init(&newxprt->sc_accept_q); + } + if (!list_empty(&listen_rdma->sc_accept_q)) + set_bit(XPT_CONN, &listen_rdma->sc_xprt.xpt_flags); + spin_unlock_bh(&listen_rdma->sc_lock); + if (!newxprt) + return NULL; + + dprintk("svcrdma: newxprt from accept queue = %p, cm_id=%p\n", + newxprt, newxprt->sc_cm_id); + + ret = ib_query_device(newxprt->sc_cm_id->device, &devattr); + if (ret) { + dprintk("svcrdma: could not query device attributes on " + "device %p, rc=%d\n", newxprt->sc_cm_id->device, ret); + goto errout; + } + + /* Qualify the transport resource defaults with the + * capabilities of this particular device */ + newxprt->sc_max_sge = min((size_t)devattr.max_sge, + (size_t)RPCSVC_MAXPAGES); + newxprt->sc_max_requests = min((size_t)devattr.max_qp_wr, + (size_t)svcrdma_max_requests); + newxprt->sc_sq_depth = RPCRDMA_SQ_DEPTH_MULT * newxprt->sc_max_requests; + + /* + * Limit ORD based on client limit, local device limit, and + * configured svcrdma limit. + */ + newxprt->sc_ord = min_t(size_t, devattr.max_qp_rd_atom, newxprt->sc_ord); + newxprt->sc_ord = min_t(size_t, svcrdma_ord, newxprt->sc_ord); + + newxprt->sc_pd = ib_alloc_pd(newxprt->sc_cm_id->device); + if (IS_ERR(newxprt->sc_pd)) { + dprintk("svcrdma: error creating PD for connect request\n"); + goto errout; + } + newxprt->sc_sq_cq = ib_create_cq(newxprt->sc_cm_id->device, + sq_comp_handler, + cq_event_handler, + newxprt, + newxprt->sc_sq_depth, + 0); + if (IS_ERR(newxprt->sc_sq_cq)) { + dprintk("svcrdma: error creating SQ CQ for connect request\n"); + goto errout; + } + newxprt->sc_rq_cq = ib_create_cq(newxprt->sc_cm_id->device, + rq_comp_handler, + cq_event_handler, + newxprt, + newxprt->sc_max_requests, + 0); + if (IS_ERR(newxprt->sc_rq_cq)) { + dprintk("svcrdma: error creating RQ CQ for connect request\n"); + goto errout; + } + + memset(&qp_attr, 0, sizeof qp_attr); + qp_attr.event_handler = qp_event_handler; + qp_attr.qp_context = &newxprt->sc_xprt; + qp_attr.cap.max_send_wr = newxprt->sc_sq_depth; + qp_attr.cap.max_recv_wr = newxprt->sc_max_requests; + qp_attr.cap.max_send_sge = newxprt->sc_max_sge; + qp_attr.cap.max_recv_sge = newxprt->sc_max_sge; + qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR; + qp_attr.qp_type = IB_QPT_RC; + qp_attr.send_cq = newxprt->sc_sq_cq; + qp_attr.recv_cq = newxprt->sc_rq_cq; + dprintk("svcrdma: newxprt->sc_cm_id=%p, newxprt->sc_pd=%p\n" + " cm_id->device=%p, sc_pd->device=%p\n" + " cap.max_send_wr = %d\n" + " cap.max_recv_wr = %d\n" + " cap.max_send_sge = %d\n" + " cap.max_recv_sge = %d\n", + newxprt->sc_cm_id, newxprt->sc_pd, + newxprt->sc_cm_id->device, newxprt->sc_pd->device, + qp_attr.cap.max_send_wr, + qp_attr.cap.max_recv_wr, + qp_attr.cap.max_send_sge, + qp_attr.cap.max_recv_sge); + + ret = rdma_create_qp(newxprt->sc_cm_id, newxprt->sc_pd, &qp_attr); + if (ret) { + dprintk("svcrdma: failed to create QP, ret=%d\n", ret); + goto errout; + } + newxprt->sc_qp = newxprt->sc_cm_id->qp; + + /* + * Use the most secure set of MR resources based on the + * transport type and available memory management features in + * the device. Here's the table implemented below: + * + * Fast Global DMA Remote WR + * Reg LKEY MR Access + * Sup'd Sup'd Needed Needed + * + * IWARP N N Y Y + * N Y Y Y + * Y N Y N + * Y Y N - + * + * IB N N Y N + * N Y N - + * Y N Y N + * Y Y N - + * + * NB: iWARP requires remote write access for the data sink + * of an RDMA_READ. IB does not. + */ + if (devattr.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) { + newxprt->sc_frmr_pg_list_len = + devattr.max_fast_reg_page_list_len; + newxprt->sc_dev_caps |= SVCRDMA_DEVCAP_FAST_REG; + } + + /* + * Determine if a DMA MR is required and if so, what privs are required + */ + switch (rdma_node_get_transport(newxprt->sc_cm_id->device->node_type)) { + case RDMA_TRANSPORT_IWARP: + newxprt->sc_dev_caps |= SVCRDMA_DEVCAP_READ_W_INV; + if (!(newxprt->sc_dev_caps & SVCRDMA_DEVCAP_FAST_REG)) { + need_dma_mr = 1; + dma_mr_acc = + (IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_WRITE); + } else if (!(devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY)) { + need_dma_mr = 1; + dma_mr_acc = IB_ACCESS_LOCAL_WRITE; + } else + need_dma_mr = 0; + break; + case RDMA_TRANSPORT_IB: + if (!(newxprt->sc_dev_caps & SVCRDMA_DEVCAP_FAST_REG)) { + need_dma_mr = 1; + dma_mr_acc = IB_ACCESS_LOCAL_WRITE; + } else if (!(devattr.device_cap_flags & + IB_DEVICE_LOCAL_DMA_LKEY)) { + need_dma_mr = 1; + dma_mr_acc = IB_ACCESS_LOCAL_WRITE; + } else + need_dma_mr = 0; + break; + default: + goto errout; + } + + /* Create the DMA MR if needed, otherwise, use the DMA LKEY */ + if (need_dma_mr) { + /* Register all of physical memory */ + newxprt->sc_phys_mr = + ib_get_dma_mr(newxprt->sc_pd, dma_mr_acc); + if (IS_ERR(newxprt->sc_phys_mr)) { + dprintk("svcrdma: Failed to create DMA MR ret=%d\n", + ret); + goto errout; + } + newxprt->sc_dma_lkey = newxprt->sc_phys_mr->lkey; + } else + newxprt->sc_dma_lkey = + newxprt->sc_cm_id->device->local_dma_lkey; + + /* Post receive buffers */ + for (i = 0; i < newxprt->sc_max_requests; i++) { + ret = svc_rdma_post_recv(newxprt); + if (ret) { + dprintk("svcrdma: failure posting receive buffers\n"); + goto errout; + } + } + + /* Swap out the handler */ + newxprt->sc_cm_id->event_handler = rdma_cma_handler; + + /* + * Arm the CQs for the SQ and RQ before accepting so we can't + * miss the first message + */ + ib_req_notify_cq(newxprt->sc_sq_cq, IB_CQ_NEXT_COMP); + ib_req_notify_cq(newxprt->sc_rq_cq, IB_CQ_NEXT_COMP); + + /* Accept Connection */ + set_bit(RDMAXPRT_CONN_PENDING, &newxprt->sc_flags); + memset(&conn_param, 0, sizeof conn_param); + conn_param.responder_resources = 0; + conn_param.initiator_depth = newxprt->sc_ord; + ret = rdma_accept(newxprt->sc_cm_id, &conn_param); + if (ret) { + dprintk("svcrdma: failed to accept new connection, ret=%d\n", + ret); + goto errout; + } + + dprintk("svcrdma: new connection %p accepted with the following " + "attributes:\n" + " local_ip : %pI4\n" + " local_port : %d\n" + " remote_ip : %pI4\n" + " remote_port : %d\n" + " max_sge : %d\n" + " sq_depth : %d\n" + " max_requests : %d\n" + " ord : %d\n", + newxprt, + &((struct sockaddr_in *)&newxprt->sc_cm_id-> + route.addr.src_addr)->sin_addr.s_addr, + ntohs(((struct sockaddr_in *)&newxprt->sc_cm_id-> + route.addr.src_addr)->sin_port), + &((struct sockaddr_in *)&newxprt->sc_cm_id-> + route.addr.dst_addr)->sin_addr.s_addr, + ntohs(((struct sockaddr_in *)&newxprt->sc_cm_id-> + route.addr.dst_addr)->sin_port), + newxprt->sc_max_sge, + newxprt->sc_sq_depth, + newxprt->sc_max_requests, + newxprt->sc_ord); + + return &newxprt->sc_xprt; + + errout: + dprintk("svcrdma: failure accepting new connection rc=%d.\n", ret); + /* Take a reference in case the DTO handler runs */ + svc_xprt_get(&newxprt->sc_xprt); + if (newxprt->sc_qp && !IS_ERR(newxprt->sc_qp)) + ib_destroy_qp(newxprt->sc_qp); + rdma_destroy_id(newxprt->sc_cm_id); + /* This call to put will destroy the transport */ + svc_xprt_put(&newxprt->sc_xprt); + return NULL; +} + +static void svc_rdma_release_rqst(struct svc_rqst *rqstp) +{ +} + +/* + * When connected, an svc_xprt has at least two references: + * + * - A reference held by the cm_id between the ESTABLISHED and + * DISCONNECTED events. If the remote peer disconnected first, this + * reference could be gone. + * + * - A reference held by the svc_recv code that called this function + * as part of close processing. + * + * At a minimum one references should still be held. + */ +static void svc_rdma_detach(struct svc_xprt *xprt) +{ + struct svcxprt_rdma *rdma = + container_of(xprt, struct svcxprt_rdma, sc_xprt); + dprintk("svc: svc_rdma_detach(%p)\n", xprt); + + /* Disconnect and flush posted WQE */ + rdma_disconnect(rdma->sc_cm_id); +} + +static void __svc_rdma_free(struct work_struct *work) +{ + struct svcxprt_rdma *rdma = + container_of(work, struct svcxprt_rdma, sc_work); + dprintk("svcrdma: svc_rdma_free(%p)\n", rdma); + + /* We should only be called from kref_put */ + BUG_ON(atomic_read(&rdma->sc_xprt.xpt_ref.refcount) != 0); + + /* + * Destroy queued, but not processed read completions. Note + * that this cleanup has to be done before destroying the + * cm_id because the device ptr is needed to unmap the dma in + * svc_rdma_put_context. + */ + while (!list_empty(&rdma->sc_read_complete_q)) { + struct svc_rdma_op_ctxt *ctxt; + ctxt = list_entry(rdma->sc_read_complete_q.next, + struct svc_rdma_op_ctxt, + dto_q); + list_del_init(&ctxt->dto_q); + svc_rdma_put_context(ctxt, 1); + } + + /* Destroy queued, but not processed recv completions */ + while (!list_empty(&rdma->sc_rq_dto_q)) { + struct svc_rdma_op_ctxt *ctxt; + ctxt = list_entry(rdma->sc_rq_dto_q.next, + struct svc_rdma_op_ctxt, + dto_q); + list_del_init(&ctxt->dto_q); + svc_rdma_put_context(ctxt, 1); + } + + /* Warn if we leaked a resource or under-referenced */ + WARN_ON(atomic_read(&rdma->sc_ctxt_used) != 0); + WARN_ON(atomic_read(&rdma->sc_dma_used) != 0); + + /* De-allocate fastreg mr */ + rdma_dealloc_frmr_q(rdma); + + /* Destroy the QP if present (not a listener) */ + if (rdma->sc_qp && !IS_ERR(rdma->sc_qp)) + ib_destroy_qp(rdma->sc_qp); + + if (rdma->sc_sq_cq && !IS_ERR(rdma->sc_sq_cq)) + ib_destroy_cq(rdma->sc_sq_cq); + + if (rdma->sc_rq_cq && !IS_ERR(rdma->sc_rq_cq)) + ib_destroy_cq(rdma->sc_rq_cq); + + if (rdma->sc_phys_mr && !IS_ERR(rdma->sc_phys_mr)) + ib_dereg_mr(rdma->sc_phys_mr); + + if (rdma->sc_pd && !IS_ERR(rdma->sc_pd)) + ib_dealloc_pd(rdma->sc_pd); + + /* Destroy the CM ID */ + rdma_destroy_id(rdma->sc_cm_id); + + kfree(rdma); +} + +static void svc_rdma_free(struct svc_xprt *xprt) +{ + struct svcxprt_rdma *rdma = + container_of(xprt, struct svcxprt_rdma, sc_xprt); + INIT_WORK(&rdma->sc_work, __svc_rdma_free); + queue_work(svc_rdma_wq, &rdma->sc_work); +} + +static int svc_rdma_has_wspace(struct svc_xprt *xprt) +{ + struct svcxprt_rdma *rdma = + container_of(xprt, struct svcxprt_rdma, sc_xprt); + + /* + * If there are already waiters on the SQ, + * return false. + */ + if (waitqueue_active(&rdma->sc_send_wait)) + return 0; + + /* Otherwise return true. */ + return 1; +} + +static int svc_rdma_secure_port(struct svc_rqst *rqstp) +{ + return 1; +} + +/* + * Attempt to register the kvec representing the RPC memory with the + * device. + * + * Returns: + * NULL : The device does not support fastreg or there were no more + * fastreg mr. + * frmr : The kvec register request was successfully posted. + * <0 : An error was encountered attempting to register the kvec. + */ +int svc_rdma_fastreg(struct svcxprt_rdma *xprt, + struct svc_rdma_fastreg_mr *frmr) +{ + struct ib_send_wr fastreg_wr; + u8 key; + + /* Bump the key */ + key = (u8)(frmr->mr->lkey & 0x000000FF); + ib_update_fast_reg_key(frmr->mr, ++key); + + /* Prepare FASTREG WR */ + memset(&fastreg_wr, 0, sizeof fastreg_wr); + fastreg_wr.opcode = IB_WR_FAST_REG_MR; + fastreg_wr.send_flags = IB_SEND_SIGNALED; + fastreg_wr.wr.fast_reg.iova_start = (unsigned long)frmr->kva; + fastreg_wr.wr.fast_reg.page_list = frmr->page_list; + fastreg_wr.wr.fast_reg.page_list_len = frmr->page_list_len; + fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT; + fastreg_wr.wr.fast_reg.length = frmr->map_len; + fastreg_wr.wr.fast_reg.access_flags = frmr->access_flags; + fastreg_wr.wr.fast_reg.rkey = frmr->mr->lkey; + return svc_rdma_send(xprt, &fastreg_wr); +} + +int svc_rdma_send(struct svcxprt_rdma *xprt, struct ib_send_wr *wr) +{ + struct ib_send_wr *bad_wr, *n_wr; + int wr_count; + int i; + int ret; + + if (test_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags)) + return -ENOTCONN; + + BUG_ON(wr->send_flags != IB_SEND_SIGNALED); + wr_count = 1; + for (n_wr = wr->next; n_wr; n_wr = n_wr->next) + wr_count++; + + /* If the SQ is full, wait until an SQ entry is available */ + while (1) { + spin_lock_bh(&xprt->sc_lock); + if (xprt->sc_sq_depth < atomic_read(&xprt->sc_sq_count) + wr_count) { + spin_unlock_bh(&xprt->sc_lock); + atomic_inc(&rdma_stat_sq_starve); + + /* See if we can opportunistically reap SQ WR to make room */ + sq_cq_reap(xprt); + + /* Wait until SQ WR available if SQ still full */ + wait_event(xprt->sc_send_wait, + atomic_read(&xprt->sc_sq_count) < + xprt->sc_sq_depth); + if (test_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags)) + return -ENOTCONN; + continue; + } + /* Take a transport ref for each WR posted */ + for (i = 0; i < wr_count; i++) + svc_xprt_get(&xprt->sc_xprt); + + /* Bump used SQ WR count and post */ + atomic_add(wr_count, &xprt->sc_sq_count); + ret = ib_post_send(xprt->sc_qp, wr, &bad_wr); + if (ret) { + set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); + atomic_sub(wr_count, &xprt->sc_sq_count); + for (i = 0; i < wr_count; i ++) + svc_xprt_put(&xprt->sc_xprt); + dprintk("svcrdma: failed to post SQ WR rc=%d, " + "sc_sq_count=%d, sc_sq_depth=%d\n", + ret, atomic_read(&xprt->sc_sq_count), + xprt->sc_sq_depth); + } + spin_unlock_bh(&xprt->sc_lock); + if (ret) + wake_up(&xprt->sc_send_wait); + break; + } + return ret; +} + +void svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp, + enum rpcrdma_errcode err) +{ + struct ib_send_wr err_wr; + struct page *p; + struct svc_rdma_op_ctxt *ctxt; + u32 *va; + int length; + int ret; + + p = svc_rdma_get_page(); + va = page_address(p); + + /* XDR encode error */ + length = svc_rdma_xdr_encode_error(xprt, rmsgp, err, va); + + ctxt = svc_rdma_get_context(xprt); + ctxt->direction = DMA_FROM_DEVICE; + ctxt->count = 1; + ctxt->pages[0] = p; + + /* Prepare SGE for local address */ + ctxt->sge[0].addr = ib_dma_map_page(xprt->sc_cm_id->device, + p, 0, length, DMA_FROM_DEVICE); + if (ib_dma_mapping_error(xprt->sc_cm_id->device, ctxt->sge[0].addr)) { + put_page(p); + svc_rdma_put_context(ctxt, 1); + return; + } + atomic_inc(&xprt->sc_dma_used); + ctxt->sge[0].lkey = xprt->sc_dma_lkey; + ctxt->sge[0].length = length; + + /* Prepare SEND WR */ + memset(&err_wr, 0, sizeof err_wr); + ctxt->wr_op = IB_WR_SEND; + err_wr.wr_id = (unsigned long)ctxt; + err_wr.sg_list = ctxt->sge; + err_wr.num_sge = 1; + err_wr.opcode = IB_WR_SEND; + err_wr.send_flags = IB_SEND_SIGNALED; + + /* Post It */ + ret = svc_rdma_send(xprt, &err_wr); + if (ret) { + dprintk("svcrdma: Error %d posting send for protocol error\n", + ret); + svc_rdma_unmap_dma(ctxt); + svc_rdma_put_context(ctxt, 1); + } +} diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c new file mode 100644 index 0000000..6a4615d --- /dev/null +++ b/net/sunrpc/xprtrdma/transport.c @@ -0,0 +1,747 @@ +/* + * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the BSD-type + * license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * Neither the name of the Network Appliance, Inc. nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * transport.c + * + * This file contains the top-level implementation of an RPC RDMA + * transport. + * + * Naming convention: functions beginning with xprt_ are part of the + * transport switch. All others are RPC RDMA internal. + */ + +#include +#include +#include +#include +#include + +#include "xprt_rdma.h" + +#ifdef RPC_DEBUG +# define RPCDBG_FACILITY RPCDBG_TRANS +#endif + +MODULE_LICENSE("Dual BSD/GPL"); + +MODULE_DESCRIPTION("RPC/RDMA Transport for Linux kernel NFS"); +MODULE_AUTHOR("Network Appliance, Inc."); + +/* + * tunables + */ + +static unsigned int xprt_rdma_slot_table_entries = RPCRDMA_DEF_SLOT_TABLE; +static unsigned int xprt_rdma_max_inline_read = RPCRDMA_DEF_INLINE; +static unsigned int xprt_rdma_max_inline_write = RPCRDMA_DEF_INLINE; +static unsigned int xprt_rdma_inline_write_padding; +static unsigned int xprt_rdma_memreg_strategy = RPCRDMA_FRMR; + int xprt_rdma_pad_optimize = 0; + +#ifdef RPC_DEBUG + +static unsigned int min_slot_table_size = RPCRDMA_MIN_SLOT_TABLE; +static unsigned int max_slot_table_size = RPCRDMA_MAX_SLOT_TABLE; +static unsigned int zero; +static unsigned int max_padding = PAGE_SIZE; +static unsigned int min_memreg = RPCRDMA_BOUNCEBUFFERS; +static unsigned int max_memreg = RPCRDMA_LAST - 1; + +static struct ctl_table_header *sunrpc_table_header; + +static struct ctl_table xr_tunables_table[] = { + { + .procname = "rdma_slot_table_entries", + .data = &xprt_rdma_slot_table_entries, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_slot_table_size, + .extra2 = &max_slot_table_size + }, + { + .procname = "rdma_max_inline_read", + .data = &xprt_rdma_max_inline_read, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "rdma_max_inline_write", + .data = &xprt_rdma_max_inline_write, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "rdma_inline_write_padding", + .data = &xprt_rdma_inline_write_padding, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &max_padding, + }, + { + .procname = "rdma_memreg_strategy", + .data = &xprt_rdma_memreg_strategy, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_memreg, + .extra2 = &max_memreg, + }, + { + .procname = "rdma_pad_optimize", + .data = &xprt_rdma_pad_optimize, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { }, +}; + +static struct ctl_table sunrpc_table[] = { + { + .procname = "sunrpc", + .mode = 0555, + .child = xr_tunables_table + }, + { }, +}; + +#endif + +#define RPCRDMA_BIND_TO (60U * HZ) +#define RPCRDMA_INIT_REEST_TO (5U * HZ) +#define RPCRDMA_MAX_REEST_TO (30U * HZ) +#define RPCRDMA_IDLE_DISC_TO (5U * 60 * HZ) + +static struct rpc_xprt_ops xprt_rdma_procs; /* forward reference */ + +static void +xprt_rdma_format_addresses(struct rpc_xprt *xprt) +{ + struct sockaddr *sap = (struct sockaddr *) + &rpcx_to_rdmad(xprt).addr; + struct sockaddr_in *sin = (struct sockaddr_in *)sap; + char buf[64]; + + (void)rpc_ntop(sap, buf, sizeof(buf)); + xprt->address_strings[RPC_DISPLAY_ADDR] = kstrdup(buf, GFP_KERNEL); + + snprintf(buf, sizeof(buf), "%u", rpc_get_port(sap)); + xprt->address_strings[RPC_DISPLAY_PORT] = kstrdup(buf, GFP_KERNEL); + + xprt->address_strings[RPC_DISPLAY_PROTO] = "rdma"; + + snprintf(buf, sizeof(buf), "%08x", ntohl(sin->sin_addr.s_addr)); + xprt->address_strings[RPC_DISPLAY_HEX_ADDR] = kstrdup(buf, GFP_KERNEL); + + snprintf(buf, sizeof(buf), "%4hx", rpc_get_port(sap)); + xprt->address_strings[RPC_DISPLAY_HEX_PORT] = kstrdup(buf, GFP_KERNEL); + + /* netid */ + xprt->address_strings[RPC_DISPLAY_NETID] = "rdma"; +} + +static void +xprt_rdma_free_addresses(struct rpc_xprt *xprt) +{ + unsigned int i; + + for (i = 0; i < RPC_DISPLAY_MAX; i++) + switch (i) { + case RPC_DISPLAY_PROTO: + case RPC_DISPLAY_NETID: + continue; + default: + kfree(xprt->address_strings[i]); + } +} + +static void +xprt_rdma_connect_worker(struct work_struct *work) +{ + struct rpcrdma_xprt *r_xprt = + container_of(work, struct rpcrdma_xprt, rdma_connect.work); + struct rpc_xprt *xprt = &r_xprt->xprt; + int rc = 0; + + xprt_clear_connected(xprt); + + dprintk("RPC: %s: %sconnect\n", __func__, + r_xprt->rx_ep.rep_connected != 0 ? "re" : ""); + rc = rpcrdma_ep_connect(&r_xprt->rx_ep, &r_xprt->rx_ia); + if (rc) + xprt_wake_pending_tasks(xprt, rc); + + dprintk("RPC: %s: exit\n", __func__); + xprt_clear_connecting(xprt); +} + +/* + * xprt_rdma_destroy + * + * Destroy the xprt. + * Free all memory associated with the object, including its own. + * NOTE: none of the *destroy methods free memory for their top-level + * objects, even though they may have allocated it (they do free + * private memory). It's up to the caller to handle it. In this + * case (RDMA transport), all structure memory is inlined with the + * struct rpcrdma_xprt. + */ +static void +xprt_rdma_destroy(struct rpc_xprt *xprt) +{ + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); + + dprintk("RPC: %s: called\n", __func__); + + cancel_delayed_work_sync(&r_xprt->rdma_connect); + + xprt_clear_connected(xprt); + + rpcrdma_buffer_destroy(&r_xprt->rx_buf); + rpcrdma_ep_destroy(&r_xprt->rx_ep, &r_xprt->rx_ia); + rpcrdma_ia_close(&r_xprt->rx_ia); + + xprt_rdma_free_addresses(xprt); + + xprt_free(xprt); + + dprintk("RPC: %s: returning\n", __func__); + + module_put(THIS_MODULE); +} + +static const struct rpc_timeout xprt_rdma_default_timeout = { + .to_initval = 60 * HZ, + .to_maxval = 60 * HZ, +}; + +/** + * xprt_setup_rdma - Set up transport to use RDMA + * + * @args: rpc transport arguments + */ +static struct rpc_xprt * +xprt_setup_rdma(struct xprt_create *args) +{ + struct rpcrdma_create_data_internal cdata; + struct rpc_xprt *xprt; + struct rpcrdma_xprt *new_xprt; + struct rpcrdma_ep *new_ep; + struct sockaddr_in *sin; + int rc; + + if (args->addrlen > sizeof(xprt->addr)) { + dprintk("RPC: %s: address too large\n", __func__); + return ERR_PTR(-EBADF); + } + + xprt = xprt_alloc(args->net, sizeof(struct rpcrdma_xprt), + xprt_rdma_slot_table_entries, + xprt_rdma_slot_table_entries); + if (xprt == NULL) { + dprintk("RPC: %s: couldn't allocate rpcrdma_xprt\n", + __func__); + return ERR_PTR(-ENOMEM); + } + + /* 60 second timeout, no retries */ + xprt->timeout = &xprt_rdma_default_timeout; + xprt->bind_timeout = RPCRDMA_BIND_TO; + xprt->reestablish_timeout = RPCRDMA_INIT_REEST_TO; + xprt->idle_timeout = RPCRDMA_IDLE_DISC_TO; + + xprt->resvport = 0; /* privileged port not needed */ + xprt->tsh_size = 0; /* RPC-RDMA handles framing */ + xprt->ops = &xprt_rdma_procs; + + /* + * Set up RDMA-specific connect data. + */ + + /* Put server RDMA address in local cdata */ + memcpy(&cdata.addr, args->dstaddr, args->addrlen); + + /* Ensure xprt->addr holds valid server TCP (not RDMA) + * address, for any side protocols which peek at it */ + xprt->prot = IPPROTO_TCP; + xprt->addrlen = args->addrlen; + memcpy(&xprt->addr, &cdata.addr, xprt->addrlen); + + sin = (struct sockaddr_in *)&cdata.addr; + if (ntohs(sin->sin_port) != 0) + xprt_set_bound(xprt); + + dprintk("RPC: %s: %pI4:%u\n", + __func__, &sin->sin_addr.s_addr, ntohs(sin->sin_port)); + + /* Set max requests */ + cdata.max_requests = xprt->max_reqs; + + /* Set some length limits */ + cdata.rsize = RPCRDMA_MAX_SEGS * PAGE_SIZE; /* RDMA write max */ + cdata.wsize = RPCRDMA_MAX_SEGS * PAGE_SIZE; /* RDMA read max */ + + cdata.inline_wsize = xprt_rdma_max_inline_write; + if (cdata.inline_wsize > cdata.wsize) + cdata.inline_wsize = cdata.wsize; + + cdata.inline_rsize = xprt_rdma_max_inline_read; + if (cdata.inline_rsize > cdata.rsize) + cdata.inline_rsize = cdata.rsize; + + cdata.padding = xprt_rdma_inline_write_padding; + + /* + * Create new transport instance, which includes initialized + * o ia + * o endpoint + * o buffers + */ + + new_xprt = rpcx_to_rdmax(xprt); + + rc = rpcrdma_ia_open(new_xprt, (struct sockaddr *) &cdata.addr, + xprt_rdma_memreg_strategy); + if (rc) + goto out1; + + /* + * initialize and create ep + */ + new_xprt->rx_data = cdata; + new_ep = &new_xprt->rx_ep; + new_ep->rep_remote_addr = cdata.addr; + + rc = rpcrdma_ep_create(&new_xprt->rx_ep, + &new_xprt->rx_ia, &new_xprt->rx_data); + if (rc) + goto out2; + + /* + * Allocate pre-registered send and receive buffers for headers and + * any inline data. Also specify any padding which will be provided + * from a preregistered zero buffer. + */ + rc = rpcrdma_buffer_create(&new_xprt->rx_buf, new_ep, &new_xprt->rx_ia, + &new_xprt->rx_data); + if (rc) + goto out3; + + /* + * Register a callback for connection events. This is necessary because + * connection loss notification is async. We also catch connection loss + * when reaping receives. + */ + INIT_DELAYED_WORK(&new_xprt->rdma_connect, xprt_rdma_connect_worker); + new_ep->rep_func = rpcrdma_conn_func; + new_ep->rep_xprt = xprt; + + xprt_rdma_format_addresses(xprt); + xprt->max_payload = rpcrdma_max_payload(new_xprt); + dprintk("RPC: %s: transport data payload maximum: %zu bytes\n", + __func__, xprt->max_payload); + + if (!try_module_get(THIS_MODULE)) + goto out4; + + return xprt; + +out4: + xprt_rdma_free_addresses(xprt); + rc = -EINVAL; +out3: + rpcrdma_ep_destroy(new_ep, &new_xprt->rx_ia); +out2: + rpcrdma_ia_close(&new_xprt->rx_ia); +out1: + xprt_free(xprt); + return ERR_PTR(rc); +} + +/* + * Close a connection, during shutdown or timeout/reconnect + */ +static void +xprt_rdma_close(struct rpc_xprt *xprt) +{ + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); + + dprintk("RPC: %s: closing\n", __func__); + if (r_xprt->rx_ep.rep_connected > 0) + xprt->reestablish_timeout = 0; + xprt_disconnect_done(xprt); + rpcrdma_ep_disconnect(&r_xprt->rx_ep, &r_xprt->rx_ia); +} + +static void +xprt_rdma_set_port(struct rpc_xprt *xprt, u16 port) +{ + struct sockaddr_in *sap; + + sap = (struct sockaddr_in *)&xprt->addr; + sap->sin_port = htons(port); + sap = (struct sockaddr_in *)&rpcx_to_rdmad(xprt).addr; + sap->sin_port = htons(port); + dprintk("RPC: %s: %u\n", __func__, port); +} + +static void +xprt_rdma_connect(struct rpc_xprt *xprt, struct rpc_task *task) +{ + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); + + if (r_xprt->rx_ep.rep_connected != 0) { + /* Reconnect */ + schedule_delayed_work(&r_xprt->rdma_connect, + xprt->reestablish_timeout); + xprt->reestablish_timeout <<= 1; + if (xprt->reestablish_timeout > RPCRDMA_MAX_REEST_TO) + xprt->reestablish_timeout = RPCRDMA_MAX_REEST_TO; + else if (xprt->reestablish_timeout < RPCRDMA_INIT_REEST_TO) + xprt->reestablish_timeout = RPCRDMA_INIT_REEST_TO; + } else { + schedule_delayed_work(&r_xprt->rdma_connect, 0); + if (!RPC_IS_ASYNC(task)) + flush_delayed_work(&r_xprt->rdma_connect); + } +} + +/* + * The RDMA allocate/free functions need the task structure as a place + * to hide the struct rpcrdma_req, which is necessary for the actual send/recv + * sequence. For this reason, the recv buffers are attached to send + * buffers for portions of the RPC. Note that the RPC layer allocates + * both send and receive buffers in the same call. We may register + * the receive buffer portion when using reply chunks. + */ +static void * +xprt_rdma_allocate(struct rpc_task *task, size_t size) +{ + struct rpc_xprt *xprt = task->tk_rqstp->rq_xprt; + struct rpcrdma_req *req, *nreq; + + req = rpcrdma_buffer_get(&rpcx_to_rdmax(xprt)->rx_buf); + if (req == NULL) + return NULL; + + if (size > req->rl_size) { + dprintk("RPC: %s: size %zd too large for buffer[%zd]: " + "prog %d vers %d proc %d\n", + __func__, size, req->rl_size, + task->tk_client->cl_prog, task->tk_client->cl_vers, + task->tk_msg.rpc_proc->p_proc); + /* + * Outgoing length shortage. Our inline write max must have + * been configured to perform direct i/o. + * + * This is therefore a large metadata operation, and the + * allocate call was made on the maximum possible message, + * e.g. containing long filename(s) or symlink data. In + * fact, while these metadata operations *might* carry + * large outgoing payloads, they rarely *do*. However, we + * have to commit to the request here, so reallocate and + * register it now. The data path will never require this + * reallocation. + * + * If the allocation or registration fails, the RPC framework + * will (doggedly) retry. + */ + if (task->tk_flags & RPC_TASK_SWAPPER) + nreq = kmalloc(sizeof *req + size, GFP_ATOMIC); + else + nreq = kmalloc(sizeof *req + size, GFP_NOFS); + if (nreq == NULL) + goto outfail; + + if (rpcrdma_register_internal(&rpcx_to_rdmax(xprt)->rx_ia, + nreq->rl_base, size + sizeof(struct rpcrdma_req) + - offsetof(struct rpcrdma_req, rl_base), + &nreq->rl_handle, &nreq->rl_iov)) { + kfree(nreq); + goto outfail; + } + rpcx_to_rdmax(xprt)->rx_stats.hardway_register_count += size; + nreq->rl_size = size; + nreq->rl_niovs = 0; + nreq->rl_nchunks = 0; + nreq->rl_buffer = (struct rpcrdma_buffer *)req; + nreq->rl_reply = req->rl_reply; + memcpy(nreq->rl_segments, + req->rl_segments, sizeof nreq->rl_segments); + /* flag the swap with an unused field */ + nreq->rl_iov.length = 0; + req->rl_reply = NULL; + req = nreq; + } + dprintk("RPC: %s: size %zd, request 0x%p\n", __func__, size, req); + req->rl_connect_cookie = 0; /* our reserved value */ + return req->rl_xdr_buf; + +outfail: + rpcrdma_buffer_put(req); + rpcx_to_rdmax(xprt)->rx_stats.failed_marshal_count++; + return NULL; +} + +/* + * This function returns all RDMA resources to the pool. + */ +static void +xprt_rdma_free(void *buffer) +{ + struct rpcrdma_req *req; + struct rpcrdma_xprt *r_xprt; + struct rpcrdma_rep *rep; + int i; + + if (buffer == NULL) + return; + + req = container_of(buffer, struct rpcrdma_req, rl_xdr_buf[0]); + if (req->rl_iov.length == 0) { /* see allocate above */ + r_xprt = container_of(((struct rpcrdma_req *) req->rl_buffer)->rl_buffer, + struct rpcrdma_xprt, rx_buf); + } else + r_xprt = container_of(req->rl_buffer, struct rpcrdma_xprt, rx_buf); + rep = req->rl_reply; + + dprintk("RPC: %s: called on 0x%p%s\n", + __func__, rep, (rep && rep->rr_func) ? " (with waiter)" : ""); + + /* + * Finish the deregistration. The process is considered + * complete when the rr_func vector becomes NULL - this + * was put in place during rpcrdma_reply_handler() - the wait + * call below will not block if the dereg is "done". If + * interrupted, our framework will clean up. + */ + for (i = 0; req->rl_nchunks;) { + --req->rl_nchunks; + i += rpcrdma_deregister_external( + &req->rl_segments[i], r_xprt); + } + + if (req->rl_iov.length == 0) { /* see allocate above */ + struct rpcrdma_req *oreq = (struct rpcrdma_req *)req->rl_buffer; + oreq->rl_reply = req->rl_reply; + (void) rpcrdma_deregister_internal(&r_xprt->rx_ia, + req->rl_handle, + &req->rl_iov); + kfree(req); + req = oreq; + } + + /* Put back request+reply buffers */ + rpcrdma_buffer_put(req); +} + +/* + * send_request invokes the meat of RPC RDMA. It must do the following: + * 1. Marshal the RPC request into an RPC RDMA request, which means + * putting a header in front of data, and creating IOVs for RDMA + * from those in the request. + * 2. In marshaling, detect opportunities for RDMA, and use them. + * 3. Post a recv message to set up asynch completion, then send + * the request (rpcrdma_ep_post). + * 4. No partial sends are possible in the RPC-RDMA protocol (as in UDP). + */ + +static int +xprt_rdma_send_request(struct rpc_task *task) +{ + struct rpc_rqst *rqst = task->tk_rqstp; + struct rpc_xprt *xprt = rqst->rq_xprt; + struct rpcrdma_req *req = rpcr_to_rdmar(rqst); + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); + int rc = 0; + + if (req->rl_niovs == 0) + rc = rpcrdma_marshal_req(rqst); + else if (r_xprt->rx_ia.ri_memreg_strategy == RPCRDMA_FRMR) + rc = rpcrdma_marshal_chunks(rqst, 0); + if (rc < 0) + goto failed_marshal; + + if (req->rl_reply == NULL) /* e.g. reconnection */ + rpcrdma_recv_buffer_get(req); + + if (req->rl_reply) { + req->rl_reply->rr_func = rpcrdma_reply_handler; + /* this need only be done once, but... */ + req->rl_reply->rr_xprt = xprt; + } + + /* Must suppress retransmit to maintain credits */ + if (req->rl_connect_cookie == xprt->connect_cookie) + goto drop_connection; + req->rl_connect_cookie = xprt->connect_cookie; + + if (rpcrdma_ep_post(&r_xprt->rx_ia, &r_xprt->rx_ep, req)) + goto drop_connection; + + rqst->rq_xmit_bytes_sent += rqst->rq_snd_buf.len; + rqst->rq_bytes_sent = 0; + return 0; + +failed_marshal: + r_xprt->rx_stats.failed_marshal_count++; + dprintk("RPC: %s: rpcrdma_marshal_req failed, status %i\n", + __func__, rc); + if (rc == -EIO) + return -EIO; +drop_connection: + xprt_disconnect_done(xprt); + return -ENOTCONN; /* implies disconnect */ +} + +static void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq) +{ + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); + long idle_time = 0; + + if (xprt_connected(xprt)) + idle_time = (long)(jiffies - xprt->last_used) / HZ; + + seq_printf(seq, + "\txprt:\trdma %u %lu %lu %lu %ld %lu %lu %lu %Lu %Lu " + "%lu %lu %lu %Lu %Lu %Lu %Lu %lu %lu %lu\n", + + 0, /* need a local port? */ + xprt->stat.bind_count, + xprt->stat.connect_count, + xprt->stat.connect_time, + idle_time, + xprt->stat.sends, + xprt->stat.recvs, + xprt->stat.bad_xids, + xprt->stat.req_u, + xprt->stat.bklog_u, + + r_xprt->rx_stats.read_chunk_count, + r_xprt->rx_stats.write_chunk_count, + r_xprt->rx_stats.reply_chunk_count, + r_xprt->rx_stats.total_rdma_request, + r_xprt->rx_stats.total_rdma_reply, + r_xprt->rx_stats.pullup_copy_count, + r_xprt->rx_stats.fixup_copy_count, + r_xprt->rx_stats.hardway_register_count, + r_xprt->rx_stats.failed_marshal_count, + r_xprt->rx_stats.bad_reply_count); +} + +/* + * Plumbing for rpc transport switch and kernel module + */ + +static struct rpc_xprt_ops xprt_rdma_procs = { + .reserve_xprt = xprt_reserve_xprt_cong, + .release_xprt = xprt_release_xprt_cong, /* sunrpc/xprt.c */ + .alloc_slot = xprt_alloc_slot, + .release_request = xprt_release_rqst_cong, /* ditto */ + .set_retrans_timeout = xprt_set_retrans_timeout_def, /* ditto */ + .rpcbind = rpcb_getport_async, /* sunrpc/rpcb_clnt.c */ + .set_port = xprt_rdma_set_port, + .connect = xprt_rdma_connect, + .buf_alloc = xprt_rdma_allocate, + .buf_free = xprt_rdma_free, + .send_request = xprt_rdma_send_request, + .close = xprt_rdma_close, + .destroy = xprt_rdma_destroy, + .print_stats = xprt_rdma_print_stats +}; + +static struct xprt_class xprt_rdma = { + .list = LIST_HEAD_INIT(xprt_rdma.list), + .name = "rdma", + .owner = THIS_MODULE, + .ident = XPRT_TRANSPORT_RDMA, + .setup = xprt_setup_rdma, +}; + +static void __exit xprt_rdma_cleanup(void) +{ + int rc; + + dprintk("RPCRDMA Module Removed, deregister RPC RDMA transport\n"); +#ifdef RPC_DEBUG + if (sunrpc_table_header) { + unregister_sysctl_table(sunrpc_table_header); + sunrpc_table_header = NULL; + } +#endif + rc = xprt_unregister_transport(&xprt_rdma); + if (rc) + dprintk("RPC: %s: xprt_unregister returned %i\n", + __func__, rc); +} + +static int __init xprt_rdma_init(void) +{ + int rc; + + rc = xprt_register_transport(&xprt_rdma); + + if (rc) + return rc; + + dprintk("RPCRDMA Module Init, register RPC RDMA transport\n"); + + dprintk("Defaults:\n"); + dprintk("\tSlots %d\n" + "\tMaxInlineRead %d\n\tMaxInlineWrite %d\n", + xprt_rdma_slot_table_entries, + xprt_rdma_max_inline_read, xprt_rdma_max_inline_write); + dprintk("\tPadding %d\n\tMemreg %d\n", + xprt_rdma_inline_write_padding, xprt_rdma_memreg_strategy); + +#ifdef RPC_DEBUG + if (!sunrpc_table_header) + sunrpc_table_header = register_sysctl_table(sunrpc_table); +#endif + return 0; +} + +module_init(xprt_rdma_init); +module_exit(xprt_rdma_cleanup); diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c new file mode 100644 index 0000000..61c4129 --- /dev/null +++ b/net/sunrpc/xprtrdma/verbs.c @@ -0,0 +1,2076 @@ +/* + * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the BSD-type + * license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * Neither the name of the Network Appliance, Inc. nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * verbs.c + * + * Encapsulates the major functions managing: + * o adapters + * o endpoints + * o connections + * o buffer memory + */ + +#include +#include +#include + +#include "xprt_rdma.h" + +/* + * Globals/Macros + */ + +#ifdef RPC_DEBUG +# define RPCDBG_FACILITY RPCDBG_TRANS +#endif + +static void rpcrdma_reset_frmrs(struct rpcrdma_ia *); + +/* + * internal functions + */ + +/* + * handle replies in tasklet context, using a single, global list + * rdma tasklet function -- just turn around and call the func + * for all replies on the list + */ + +static DEFINE_SPINLOCK(rpcrdma_tk_lock_g); +static LIST_HEAD(rpcrdma_tasklets_g); + +static void +rpcrdma_run_tasklet(unsigned long data) +{ + struct rpcrdma_rep *rep; + void (*func)(struct rpcrdma_rep *); + unsigned long flags; + + data = data; + spin_lock_irqsave(&rpcrdma_tk_lock_g, flags); + while (!list_empty(&rpcrdma_tasklets_g)) { + rep = list_entry(rpcrdma_tasklets_g.next, + struct rpcrdma_rep, rr_list); + list_del(&rep->rr_list); + func = rep->rr_func; + rep->rr_func = NULL; + spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags); + + if (func) + func(rep); + else + rpcrdma_recv_buffer_put(rep); + + spin_lock_irqsave(&rpcrdma_tk_lock_g, flags); + } + spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags); +} + +static DECLARE_TASKLET(rpcrdma_tasklet_g, rpcrdma_run_tasklet, 0UL); + +static void +rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context) +{ + struct rpcrdma_ep *ep = context; + + dprintk("RPC: %s: QP error %X on device %s ep %p\n", + __func__, event->event, event->device->name, context); + if (ep->rep_connected == 1) { + ep->rep_connected = -EIO; + ep->rep_func(ep); + wake_up_all(&ep->rep_connect_wait); + } +} + +static void +rpcrdma_cq_async_error_upcall(struct ib_event *event, void *context) +{ + struct rpcrdma_ep *ep = context; + + dprintk("RPC: %s: CQ error %X on device %s ep %p\n", + __func__, event->event, event->device->name, context); + if (ep->rep_connected == 1) { + ep->rep_connected = -EIO; + ep->rep_func(ep); + wake_up_all(&ep->rep_connect_wait); + } +} + +static void +rpcrdma_sendcq_process_wc(struct ib_wc *wc) +{ + struct rpcrdma_mw *frmr = (struct rpcrdma_mw *)(unsigned long)wc->wr_id; + + dprintk("RPC: %s: frmr %p status %X opcode %d\n", + __func__, frmr, wc->status, wc->opcode); + + if (wc->wr_id == 0ULL) + return; + if (wc->status != IB_WC_SUCCESS) + frmr->r.frmr.fr_state = FRMR_IS_STALE; +} + +static int +rpcrdma_sendcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep) +{ + struct ib_wc *wcs; + int budget, count, rc; + + budget = RPCRDMA_WC_BUDGET / RPCRDMA_POLLSIZE; + do { + wcs = ep->rep_send_wcs; + + rc = ib_poll_cq(cq, RPCRDMA_POLLSIZE, wcs); + if (rc <= 0) + return rc; + + count = rc; + while (count-- > 0) + rpcrdma_sendcq_process_wc(wcs++); + } while (rc == RPCRDMA_POLLSIZE && --budget); + return 0; +} + +/* + * Handle send, fast_reg_mr, and local_inv completions. + * + * Send events are typically suppressed and thus do not result + * in an upcall. Occasionally one is signaled, however. This + * prevents the provider's completion queue from wrapping and + * losing a completion. + */ +static void +rpcrdma_sendcq_upcall(struct ib_cq *cq, void *cq_context) +{ + struct rpcrdma_ep *ep = (struct rpcrdma_ep *)cq_context; + int rc; + + rc = rpcrdma_sendcq_poll(cq, ep); + if (rc) { + dprintk("RPC: %s: ib_poll_cq failed: %i\n", + __func__, rc); + return; + } + + rc = ib_req_notify_cq(cq, + IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); + if (rc == 0) + return; + if (rc < 0) { + dprintk("RPC: %s: ib_req_notify_cq failed: %i\n", + __func__, rc); + return; + } + + rpcrdma_sendcq_poll(cq, ep); +} + +static void +rpcrdma_recvcq_process_wc(struct ib_wc *wc, struct list_head *sched_list) +{ + struct rpcrdma_rep *rep = + (struct rpcrdma_rep *)(unsigned long)wc->wr_id; + + dprintk("RPC: %s: rep %p status %X opcode %X length %u\n", + __func__, rep, wc->status, wc->opcode, wc->byte_len); + + if (wc->status != IB_WC_SUCCESS) { + rep->rr_len = ~0U; + goto out_schedule; + } + if (wc->opcode != IB_WC_RECV) + return; + + rep->rr_len = wc->byte_len; + ib_dma_sync_single_for_cpu(rdmab_to_ia(rep->rr_buffer)->ri_id->device, + rep->rr_iov.addr, rep->rr_len, DMA_FROM_DEVICE); + + if (rep->rr_len >= 16) { + struct rpcrdma_msg *p = (struct rpcrdma_msg *)rep->rr_base; + unsigned int credits = ntohl(p->rm_credit); + + if (credits == 0) + credits = 1; /* don't deadlock */ + else if (credits > rep->rr_buffer->rb_max_requests) + credits = rep->rr_buffer->rb_max_requests; + atomic_set(&rep->rr_buffer->rb_credits, credits); + } + +out_schedule: + list_add_tail(&rep->rr_list, sched_list); +} + +static int +rpcrdma_recvcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep) +{ + struct list_head sched_list; + struct ib_wc *wcs; + int budget, count, rc; + unsigned long flags; + + INIT_LIST_HEAD(&sched_list); + budget = RPCRDMA_WC_BUDGET / RPCRDMA_POLLSIZE; + do { + wcs = ep->rep_recv_wcs; + + rc = ib_poll_cq(cq, RPCRDMA_POLLSIZE, wcs); + if (rc <= 0) + goto out_schedule; + + count = rc; + while (count-- > 0) + rpcrdma_recvcq_process_wc(wcs++, &sched_list); + } while (rc == RPCRDMA_POLLSIZE && --budget); + rc = 0; + +out_schedule: + spin_lock_irqsave(&rpcrdma_tk_lock_g, flags); + list_splice_tail(&sched_list, &rpcrdma_tasklets_g); + spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags); + tasklet_schedule(&rpcrdma_tasklet_g); + return rc; +} + +/* + * Handle receive completions. + * + * It is reentrant but processes single events in order to maintain + * ordering of receives to keep server credits. + * + * It is the responsibility of the scheduled tasklet to return + * recv buffers to the pool. NOTE: this affects synchronization of + * connection shutdown. That is, the structures required for + * the completion of the reply handler must remain intact until + * all memory has been reclaimed. + */ +static void +rpcrdma_recvcq_upcall(struct ib_cq *cq, void *cq_context) +{ + struct rpcrdma_ep *ep = (struct rpcrdma_ep *)cq_context; + int rc; + + rc = rpcrdma_recvcq_poll(cq, ep); + if (rc) { + dprintk("RPC: %s: ib_poll_cq failed: %i\n", + __func__, rc); + return; + } + + rc = ib_req_notify_cq(cq, + IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); + if (rc == 0) + return; + if (rc < 0) { + dprintk("RPC: %s: ib_req_notify_cq failed: %i\n", + __func__, rc); + return; + } + + rpcrdma_recvcq_poll(cq, ep); +} + +static void +rpcrdma_flush_cqs(struct rpcrdma_ep *ep) +{ + rpcrdma_recvcq_upcall(ep->rep_attr.recv_cq, ep); + rpcrdma_sendcq_upcall(ep->rep_attr.send_cq, ep); +} + +#ifdef RPC_DEBUG +static const char * const conn[] = { + "address resolved", + "address error", + "route resolved", + "route error", + "connect request", + "connect response", + "connect error", + "unreachable", + "rejected", + "established", + "disconnected", + "device removal", + "multicast join", + "multicast error", + "address change", + "timewait exit", +}; + +#define CONNECTION_MSG(status) \ + ((status) < ARRAY_SIZE(conn) ? \ + conn[(status)] : "unrecognized connection error") +#endif + +static int +rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event) +{ + struct rpcrdma_xprt *xprt = id->context; + struct rpcrdma_ia *ia = &xprt->rx_ia; + struct rpcrdma_ep *ep = &xprt->rx_ep; +#ifdef RPC_DEBUG + struct sockaddr_in *addr = (struct sockaddr_in *) &ep->rep_remote_addr; +#endif + struct ib_qp_attr attr; + struct ib_qp_init_attr iattr; + int connstate = 0; + + switch (event->event) { + case RDMA_CM_EVENT_ADDR_RESOLVED: + case RDMA_CM_EVENT_ROUTE_RESOLVED: + ia->ri_async_rc = 0; + complete(&ia->ri_done); + break; + case RDMA_CM_EVENT_ADDR_ERROR: + ia->ri_async_rc = -EHOSTUNREACH; + dprintk("RPC: %s: CM address resolution error, ep 0x%p\n", + __func__, ep); + complete(&ia->ri_done); + break; + case RDMA_CM_EVENT_ROUTE_ERROR: + ia->ri_async_rc = -ENETUNREACH; + dprintk("RPC: %s: CM route resolution error, ep 0x%p\n", + __func__, ep); + complete(&ia->ri_done); + break; + case RDMA_CM_EVENT_ESTABLISHED: + connstate = 1; + ib_query_qp(ia->ri_id->qp, &attr, + IB_QP_MAX_QP_RD_ATOMIC | IB_QP_MAX_DEST_RD_ATOMIC, + &iattr); + dprintk("RPC: %s: %d responder resources" + " (%d initiator)\n", + __func__, attr.max_dest_rd_atomic, attr.max_rd_atomic); + goto connected; + case RDMA_CM_EVENT_CONNECT_ERROR: + connstate = -ENOTCONN; + goto connected; + case RDMA_CM_EVENT_UNREACHABLE: + connstate = -ENETDOWN; + goto connected; + case RDMA_CM_EVENT_REJECTED: + connstate = -ECONNREFUSED; + goto connected; + case RDMA_CM_EVENT_DISCONNECTED: + connstate = -ECONNABORTED; + goto connected; + case RDMA_CM_EVENT_DEVICE_REMOVAL: + connstate = -ENODEV; +connected: + atomic_set(&rpcx_to_rdmax(ep->rep_xprt)->rx_buf.rb_credits, 1); + dprintk("RPC: %s: %sconnected\n", + __func__, connstate > 0 ? "" : "dis"); + ep->rep_connected = connstate; + ep->rep_func(ep); + wake_up_all(&ep->rep_connect_wait); + /*FALLTHROUGH*/ + default: + dprintk("RPC: %s: %pI4:%u (ep 0x%p): %s\n", + __func__, &addr->sin_addr.s_addr, + ntohs(addr->sin_port), ep, + CONNECTION_MSG(event->event)); + break; + } + +#ifdef RPC_DEBUG + if (connstate == 1) { + int ird = attr.max_dest_rd_atomic; + int tird = ep->rep_remote_cma.responder_resources; + printk(KERN_INFO "rpcrdma: connection to %pI4:%u " + "on %s, memreg %d slots %d ird %d%s\n", + &addr->sin_addr.s_addr, + ntohs(addr->sin_port), + ia->ri_id->device->name, + ia->ri_memreg_strategy, + xprt->rx_buf.rb_max_requests, + ird, ird < 4 && ird < tird / 2 ? " (low!)" : ""); + } else if (connstate < 0) { + printk(KERN_INFO "rpcrdma: connection to %pI4:%u closed (%d)\n", + &addr->sin_addr.s_addr, + ntohs(addr->sin_port), + connstate); + } +#endif + + return 0; +} + +static struct rdma_cm_id * +rpcrdma_create_id(struct rpcrdma_xprt *xprt, + struct rpcrdma_ia *ia, struct sockaddr *addr) +{ + struct rdma_cm_id *id; + int rc; + + init_completion(&ia->ri_done); + + id = rdma_create_id(rpcrdma_conn_upcall, xprt, RDMA_PS_TCP, IB_QPT_RC); + if (IS_ERR(id)) { + rc = PTR_ERR(id); + dprintk("RPC: %s: rdma_create_id() failed %i\n", + __func__, rc); + return id; + } + + ia->ri_async_rc = -ETIMEDOUT; + rc = rdma_resolve_addr(id, NULL, addr, RDMA_RESOLVE_TIMEOUT); + if (rc) { + dprintk("RPC: %s: rdma_resolve_addr() failed %i\n", + __func__, rc); + goto out; + } + wait_for_completion_interruptible_timeout(&ia->ri_done, + msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1); + rc = ia->ri_async_rc; + if (rc) + goto out; + + ia->ri_async_rc = -ETIMEDOUT; + rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT); + if (rc) { + dprintk("RPC: %s: rdma_resolve_route() failed %i\n", + __func__, rc); + goto out; + } + wait_for_completion_interruptible_timeout(&ia->ri_done, + msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1); + rc = ia->ri_async_rc; + if (rc) + goto out; + + return id; + +out: + rdma_destroy_id(id); + return ERR_PTR(rc); +} + +/* + * Drain any cq, prior to teardown. + */ +static void +rpcrdma_clean_cq(struct ib_cq *cq) +{ + struct ib_wc wc; + int count = 0; + + while (1 == ib_poll_cq(cq, 1, &wc)) + ++count; + + if (count) + dprintk("RPC: %s: flushed %d events (last 0x%x)\n", + __func__, count, wc.opcode); +} + +/* + * Exported functions. + */ + +/* + * Open and initialize an Interface Adapter. + * o initializes fields of struct rpcrdma_ia, including + * interface and provider attributes and protection zone. + */ +int +rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) +{ + int rc, mem_priv; + struct ib_device_attr devattr; + struct rpcrdma_ia *ia = &xprt->rx_ia; + + ia->ri_id = rpcrdma_create_id(xprt, ia, addr); + if (IS_ERR(ia->ri_id)) { + rc = PTR_ERR(ia->ri_id); + goto out1; + } + + ia->ri_pd = ib_alloc_pd(ia->ri_id->device); + if (IS_ERR(ia->ri_pd)) { + rc = PTR_ERR(ia->ri_pd); + dprintk("RPC: %s: ib_alloc_pd() failed %i\n", + __func__, rc); + goto out2; + } + + /* + * Query the device to determine if the requested memory + * registration strategy is supported. If it isn't, set the + * strategy to a globally supported model. + */ + rc = ib_query_device(ia->ri_id->device, &devattr); + if (rc) { + dprintk("RPC: %s: ib_query_device failed %d\n", + __func__, rc); + goto out2; + } + + if (devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) { + ia->ri_have_dma_lkey = 1; + ia->ri_dma_lkey = ia->ri_id->device->local_dma_lkey; + } + + if (memreg == RPCRDMA_FRMR) { + /* Requires both frmr reg and local dma lkey */ + if ((devattr.device_cap_flags & + (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) != + (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) { + dprintk("RPC: %s: FRMR registration " + "not supported by HCA\n", __func__); + memreg = RPCRDMA_MTHCAFMR; + } else { + /* Mind the ia limit on FRMR page list depth */ + ia->ri_max_frmr_depth = min_t(unsigned int, + RPCRDMA_MAX_DATA_SEGS, + devattr.max_fast_reg_page_list_len); + } + } + if (memreg == RPCRDMA_MTHCAFMR) { + if (!ia->ri_id->device->alloc_fmr) { + dprintk("RPC: %s: MTHCAFMR registration " + "not supported by HCA\n", __func__); + memreg = RPCRDMA_ALLPHYSICAL; + } + } + + /* + * Optionally obtain an underlying physical identity mapping in + * order to do a memory window-based bind. This base registration + * is protected from remote access - that is enabled only by binding + * for the specific bytes targeted during each RPC operation, and + * revoked after the corresponding completion similar to a storage + * adapter. + */ + switch (memreg) { + case RPCRDMA_FRMR: + break; + case RPCRDMA_ALLPHYSICAL: + mem_priv = IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_WRITE | + IB_ACCESS_REMOTE_READ; + goto register_setup; + case RPCRDMA_MTHCAFMR: + if (ia->ri_have_dma_lkey) + break; + mem_priv = IB_ACCESS_LOCAL_WRITE; + register_setup: + ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv); + if (IS_ERR(ia->ri_bind_mem)) { + printk(KERN_ALERT "%s: ib_get_dma_mr for " + "phys register failed with %lX\n", + __func__, PTR_ERR(ia->ri_bind_mem)); + rc = -ENOMEM; + goto out2; + } + break; + default: + printk(KERN_ERR "RPC: Unsupported memory " + "registration mode: %d\n", memreg); + rc = -ENOMEM; + goto out2; + } + dprintk("RPC: %s: memory registration strategy is %d\n", + __func__, memreg); + + /* Else will do memory reg/dereg for each chunk */ + ia->ri_memreg_strategy = memreg; + + rwlock_init(&ia->ri_qplock); + return 0; +out2: + rdma_destroy_id(ia->ri_id); + ia->ri_id = NULL; +out1: + return rc; +} + +/* + * Clean up/close an IA. + * o if event handles and PD have been initialized, free them. + * o close the IA + */ +void +rpcrdma_ia_close(struct rpcrdma_ia *ia) +{ + int rc; + + dprintk("RPC: %s: entering\n", __func__); + if (ia->ri_bind_mem != NULL) { + rc = ib_dereg_mr(ia->ri_bind_mem); + dprintk("RPC: %s: ib_dereg_mr returned %i\n", + __func__, rc); + } + if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) { + if (ia->ri_id->qp) + rdma_destroy_qp(ia->ri_id); + rdma_destroy_id(ia->ri_id); + ia->ri_id = NULL; + } + if (ia->ri_pd != NULL && !IS_ERR(ia->ri_pd)) { + rc = ib_dealloc_pd(ia->ri_pd); + dprintk("RPC: %s: ib_dealloc_pd returned %i\n", + __func__, rc); + } +} + +/* + * Create unconnected endpoint. + */ +int +rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, + struct rpcrdma_create_data_internal *cdata) +{ + struct ib_device_attr devattr; + struct ib_cq *sendcq, *recvcq; + int rc, err; + + rc = ib_query_device(ia->ri_id->device, &devattr); + if (rc) { + dprintk("RPC: %s: ib_query_device failed %d\n", + __func__, rc); + return rc; + } + + /* check provider's send/recv wr limits */ + if (cdata->max_requests > devattr.max_qp_wr) + cdata->max_requests = devattr.max_qp_wr; + + ep->rep_attr.event_handler = rpcrdma_qp_async_error_upcall; + ep->rep_attr.qp_context = ep; + /* send_cq and recv_cq initialized below */ + ep->rep_attr.srq = NULL; + ep->rep_attr.cap.max_send_wr = cdata->max_requests; + switch (ia->ri_memreg_strategy) { + case RPCRDMA_FRMR: { + int depth = 7; + + /* Add room for frmr register and invalidate WRs. + * 1. FRMR reg WR for head + * 2. FRMR invalidate WR for head + * 3. N FRMR reg WRs for pagelist + * 4. N FRMR invalidate WRs for pagelist + * 5. FRMR reg WR for tail + * 6. FRMR invalidate WR for tail + * 7. The RDMA_SEND WR + */ + + /* Calculate N if the device max FRMR depth is smaller than + * RPCRDMA_MAX_DATA_SEGS. + */ + if (ia->ri_max_frmr_depth < RPCRDMA_MAX_DATA_SEGS) { + int delta = RPCRDMA_MAX_DATA_SEGS - + ia->ri_max_frmr_depth; + + do { + depth += 2; /* FRMR reg + invalidate */ + delta -= ia->ri_max_frmr_depth; + } while (delta > 0); + + } + ep->rep_attr.cap.max_send_wr *= depth; + if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr) { + cdata->max_requests = devattr.max_qp_wr / depth; + if (!cdata->max_requests) + return -EINVAL; + ep->rep_attr.cap.max_send_wr = cdata->max_requests * + depth; + } + break; + } + default: + break; + } + ep->rep_attr.cap.max_recv_wr = cdata->max_requests; + ep->rep_attr.cap.max_send_sge = (cdata->padding ? 4 : 2); + ep->rep_attr.cap.max_recv_sge = 1; + ep->rep_attr.cap.max_inline_data = 0; + ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR; + ep->rep_attr.qp_type = IB_QPT_RC; + ep->rep_attr.port_num = ~0; + + dprintk("RPC: %s: requested max: dtos: send %d recv %d; " + "iovs: send %d recv %d\n", + __func__, + ep->rep_attr.cap.max_send_wr, + ep->rep_attr.cap.max_recv_wr, + ep->rep_attr.cap.max_send_sge, + ep->rep_attr.cap.max_recv_sge); + + /* set trigger for requesting send completion */ + ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 - 1; + if (ep->rep_cqinit <= 2) + ep->rep_cqinit = 0; + INIT_CQCOUNT(ep); + ep->rep_ia = ia; + init_waitqueue_head(&ep->rep_connect_wait); + INIT_DELAYED_WORK(&ep->rep_connect_worker, rpcrdma_connect_worker); + + sendcq = ib_create_cq(ia->ri_id->device, rpcrdma_sendcq_upcall, + rpcrdma_cq_async_error_upcall, ep, + ep->rep_attr.cap.max_send_wr + 1, 0); + if (IS_ERR(sendcq)) { + rc = PTR_ERR(sendcq); + dprintk("RPC: %s: failed to create send CQ: %i\n", + __func__, rc); + goto out1; + } + + rc = ib_req_notify_cq(sendcq, IB_CQ_NEXT_COMP); + if (rc) { + dprintk("RPC: %s: ib_req_notify_cq failed: %i\n", + __func__, rc); + goto out2; + } + + recvcq = ib_create_cq(ia->ri_id->device, rpcrdma_recvcq_upcall, + rpcrdma_cq_async_error_upcall, ep, + ep->rep_attr.cap.max_recv_wr + 1, 0); + if (IS_ERR(recvcq)) { + rc = PTR_ERR(recvcq); + dprintk("RPC: %s: failed to create recv CQ: %i\n", + __func__, rc); + goto out2; + } + + rc = ib_req_notify_cq(recvcq, IB_CQ_NEXT_COMP); + if (rc) { + dprintk("RPC: %s: ib_req_notify_cq failed: %i\n", + __func__, rc); + ib_destroy_cq(recvcq); + goto out2; + } + + ep->rep_attr.send_cq = sendcq; + ep->rep_attr.recv_cq = recvcq; + + /* Initialize cma parameters */ + + /* RPC/RDMA does not use private data */ + ep->rep_remote_cma.private_data = NULL; + ep->rep_remote_cma.private_data_len = 0; + + /* Client offers RDMA Read but does not initiate */ + ep->rep_remote_cma.initiator_depth = 0; + if (devattr.max_qp_rd_atom > 32) /* arbitrary but <= 255 */ + ep->rep_remote_cma.responder_resources = 32; + else + ep->rep_remote_cma.responder_resources = devattr.max_qp_rd_atom; + + ep->rep_remote_cma.retry_count = 7; + ep->rep_remote_cma.flow_control = 0; + ep->rep_remote_cma.rnr_retry_count = 0; + + return 0; + +out2: + err = ib_destroy_cq(sendcq); + if (err) + dprintk("RPC: %s: ib_destroy_cq returned %i\n", + __func__, err); +out1: + return rc; +} + +/* + * rpcrdma_ep_destroy + * + * Disconnect and destroy endpoint. After this, the only + * valid operations on the ep are to free it (if dynamically + * allocated) or re-create it. + */ +void +rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) +{ + int rc; + + dprintk("RPC: %s: entering, connected is %d\n", + __func__, ep->rep_connected); + + cancel_delayed_work_sync(&ep->rep_connect_worker); + + if (ia->ri_id->qp) { + rpcrdma_ep_disconnect(ep, ia); + rdma_destroy_qp(ia->ri_id); + ia->ri_id->qp = NULL; + } + + /* padding - could be done in rpcrdma_buffer_destroy... */ + if (ep->rep_pad_mr) { + rpcrdma_deregister_internal(ia, ep->rep_pad_mr, &ep->rep_pad); + ep->rep_pad_mr = NULL; + } + + rpcrdma_clean_cq(ep->rep_attr.recv_cq); + rc = ib_destroy_cq(ep->rep_attr.recv_cq); + if (rc) + dprintk("RPC: %s: ib_destroy_cq returned %i\n", + __func__, rc); + + rpcrdma_clean_cq(ep->rep_attr.send_cq); + rc = ib_destroy_cq(ep->rep_attr.send_cq); + if (rc) + dprintk("RPC: %s: ib_destroy_cq returned %i\n", + __func__, rc); +} + +/* + * Connect unconnected endpoint. + */ +int +rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) +{ + struct rdma_cm_id *id, *old; + int rc = 0; + int retry_count = 0; + + if (ep->rep_connected != 0) { + struct rpcrdma_xprt *xprt; +retry: + dprintk("RPC: %s: reconnecting...\n", __func__); + + rpcrdma_ep_disconnect(ep, ia); + rpcrdma_flush_cqs(ep); + + if (ia->ri_memreg_strategy == RPCRDMA_FRMR) + rpcrdma_reset_frmrs(ia); + + xprt = container_of(ia, struct rpcrdma_xprt, rx_ia); + id = rpcrdma_create_id(xprt, ia, + (struct sockaddr *)&xprt->rx_data.addr); + if (IS_ERR(id)) { + rc = -EHOSTUNREACH; + goto out; + } + /* TEMP TEMP TEMP - fail if new device: + * Deregister/remarshal *all* requests! + * Close and recreate adapter, pd, etc! + * Re-determine all attributes still sane! + * More stuff I haven't thought of! + * Rrrgh! + */ + if (ia->ri_id->device != id->device) { + printk("RPC: %s: can't reconnect on " + "different device!\n", __func__); + rdma_destroy_id(id); + rc = -ENETUNREACH; + goto out; + } + /* END TEMP */ + rc = rdma_create_qp(id, ia->ri_pd, &ep->rep_attr); + if (rc) { + dprintk("RPC: %s: rdma_create_qp failed %i\n", + __func__, rc); + rdma_destroy_id(id); + rc = -ENETUNREACH; + goto out; + } + + write_lock(&ia->ri_qplock); + old = ia->ri_id; + ia->ri_id = id; + write_unlock(&ia->ri_qplock); + + rdma_destroy_qp(old); + rdma_destroy_id(old); + } else { + dprintk("RPC: %s: connecting...\n", __func__); + rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr); + if (rc) { + dprintk("RPC: %s: rdma_create_qp failed %i\n", + __func__, rc); + /* do not update ep->rep_connected */ + return -ENETUNREACH; + } + } + + ep->rep_connected = 0; + + rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma); + if (rc) { + dprintk("RPC: %s: rdma_connect() failed with %i\n", + __func__, rc); + goto out; + } + + wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0); + + /* + * Check state. A non-peer reject indicates no listener + * (ECONNREFUSED), which may be a transient state. All + * others indicate a transport condition which has already + * undergone a best-effort. + */ + if (ep->rep_connected == -ECONNREFUSED && + ++retry_count <= RDMA_CONNECT_RETRY_MAX) { + dprintk("RPC: %s: non-peer_reject, retry\n", __func__); + goto retry; + } + if (ep->rep_connected <= 0) { + /* Sometimes, the only way to reliably connect to remote + * CMs is to use same nonzero values for ORD and IRD. */ + if (retry_count++ <= RDMA_CONNECT_RETRY_MAX + 1 && + (ep->rep_remote_cma.responder_resources == 0 || + ep->rep_remote_cma.initiator_depth != + ep->rep_remote_cma.responder_resources)) { + if (ep->rep_remote_cma.responder_resources == 0) + ep->rep_remote_cma.responder_resources = 1; + ep->rep_remote_cma.initiator_depth = + ep->rep_remote_cma.responder_resources; + goto retry; + } + rc = ep->rep_connected; + } else { + dprintk("RPC: %s: connected\n", __func__); + } + +out: + if (rc) + ep->rep_connected = rc; + return rc; +} + +/* + * rpcrdma_ep_disconnect + * + * This is separate from destroy to facilitate the ability + * to reconnect without recreating the endpoint. + * + * This call is not reentrant, and must not be made in parallel + * on the same endpoint. + */ +void +rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) +{ + int rc; + + rpcrdma_flush_cqs(ep); + rc = rdma_disconnect(ia->ri_id); + if (!rc) { + /* returns without wait if not connected */ + wait_event_interruptible(ep->rep_connect_wait, + ep->rep_connected != 1); + dprintk("RPC: %s: after wait, %sconnected\n", __func__, + (ep->rep_connected == 1) ? "still " : "dis"); + } else { + dprintk("RPC: %s: rdma_disconnect %i\n", __func__, rc); + ep->rep_connected = rc; + } +} + +static int +rpcrdma_init_fmrs(struct rpcrdma_ia *ia, struct rpcrdma_buffer *buf) +{ + int mr_access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ; + struct ib_fmr_attr fmr_attr = { + .max_pages = RPCRDMA_MAX_DATA_SEGS, + .max_maps = 1, + .page_shift = PAGE_SHIFT + }; + struct rpcrdma_mw *r; + int i, rc; + + i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS; + dprintk("RPC: %s: initalizing %d FMRs\n", __func__, i); + + while (i--) { + r = kzalloc(sizeof(*r), GFP_KERNEL); + if (r == NULL) + return -ENOMEM; + + r->r.fmr = ib_alloc_fmr(ia->ri_pd, mr_access_flags, &fmr_attr); + if (IS_ERR(r->r.fmr)) { + rc = PTR_ERR(r->r.fmr); + dprintk("RPC: %s: ib_alloc_fmr failed %i\n", + __func__, rc); + goto out_free; + } + + list_add(&r->mw_list, &buf->rb_mws); + list_add(&r->mw_all, &buf->rb_all); + } + return 0; + +out_free: + kfree(r); + return rc; +} + +static int +rpcrdma_init_frmrs(struct rpcrdma_ia *ia, struct rpcrdma_buffer *buf) +{ + struct rpcrdma_frmr *f; + struct rpcrdma_mw *r; + int i, rc; + + i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS; + dprintk("RPC: %s: initalizing %d FRMRs\n", __func__, i); + + while (i--) { + r = kzalloc(sizeof(*r), GFP_KERNEL); + if (r == NULL) + return -ENOMEM; + f = &r->r.frmr; + + f->fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd, + ia->ri_max_frmr_depth); + if (IS_ERR(f->fr_mr)) { + rc = PTR_ERR(f->fr_mr); + dprintk("RPC: %s: ib_alloc_fast_reg_mr " + "failed %i\n", __func__, rc); + goto out_free; + } + + f->fr_pgl = ib_alloc_fast_reg_page_list(ia->ri_id->device, + ia->ri_max_frmr_depth); + if (IS_ERR(f->fr_pgl)) { + rc = PTR_ERR(f->fr_pgl); + dprintk("RPC: %s: ib_alloc_fast_reg_page_list " + "failed %i\n", __func__, rc); + + ib_dereg_mr(f->fr_mr); + goto out_free; + } + + list_add(&r->mw_list, &buf->rb_mws); + list_add(&r->mw_all, &buf->rb_all); + } + + return 0; + +out_free: + kfree(r); + return rc; +} + +int +rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep, + struct rpcrdma_ia *ia, struct rpcrdma_create_data_internal *cdata) +{ + char *p; + size_t len, rlen, wlen; + int i, rc; + + buf->rb_max_requests = cdata->max_requests; + spin_lock_init(&buf->rb_lock); + atomic_set(&buf->rb_credits, 1); + + /* Need to allocate: + * 1. arrays for send and recv pointers + * 2. arrays of struct rpcrdma_req to fill in pointers + * 3. array of struct rpcrdma_rep for replies + * 4. padding, if any + * Send/recv buffers in req/rep need to be registered + */ + len = buf->rb_max_requests * + (sizeof(struct rpcrdma_req *) + sizeof(struct rpcrdma_rep *)); + len += cdata->padding; + + p = kzalloc(len, GFP_KERNEL); + if (p == NULL) { + dprintk("RPC: %s: req_t/rep_t/pad kzalloc(%zd) failed\n", + __func__, len); + rc = -ENOMEM; + goto out; + } + buf->rb_pool = p; /* for freeing it later */ + + buf->rb_send_bufs = (struct rpcrdma_req **) p; + p = (char *) &buf->rb_send_bufs[buf->rb_max_requests]; + buf->rb_recv_bufs = (struct rpcrdma_rep **) p; + p = (char *) &buf->rb_recv_bufs[buf->rb_max_requests]; + + /* + * Register the zeroed pad buffer, if any. + */ + if (cdata->padding) { + rc = rpcrdma_register_internal(ia, p, cdata->padding, + &ep->rep_pad_mr, &ep->rep_pad); + if (rc) + goto out; + } + p += cdata->padding; + + INIT_LIST_HEAD(&buf->rb_mws); + INIT_LIST_HEAD(&buf->rb_all); + switch (ia->ri_memreg_strategy) { + case RPCRDMA_FRMR: + rc = rpcrdma_init_frmrs(ia, buf); + if (rc) + goto out; + break; + case RPCRDMA_MTHCAFMR: + rc = rpcrdma_init_fmrs(ia, buf); + if (rc) + goto out; + break; + default: + break; + } + + /* + * Allocate/init the request/reply buffers. Doing this + * using kmalloc for now -- one for each buf. + */ + wlen = 1 << fls(cdata->inline_wsize + sizeof(struct rpcrdma_req)); + rlen = 1 << fls(cdata->inline_rsize + sizeof(struct rpcrdma_rep)); + dprintk("RPC: %s: wlen = %zu, rlen = %zu\n", + __func__, wlen, rlen); + + for (i = 0; i < buf->rb_max_requests; i++) { + struct rpcrdma_req *req; + struct rpcrdma_rep *rep; + + req = kmalloc(wlen, GFP_KERNEL); + if (req == NULL) { + dprintk("RPC: %s: request buffer %d alloc" + " failed\n", __func__, i); + rc = -ENOMEM; + goto out; + } + memset(req, 0, sizeof(struct rpcrdma_req)); + buf->rb_send_bufs[i] = req; + buf->rb_send_bufs[i]->rl_buffer = buf; + + rc = rpcrdma_register_internal(ia, req->rl_base, + wlen - offsetof(struct rpcrdma_req, rl_base), + &buf->rb_send_bufs[i]->rl_handle, + &buf->rb_send_bufs[i]->rl_iov); + if (rc) + goto out; + + buf->rb_send_bufs[i]->rl_size = wlen - + sizeof(struct rpcrdma_req); + + rep = kmalloc(rlen, GFP_KERNEL); + if (rep == NULL) { + dprintk("RPC: %s: reply buffer %d alloc failed\n", + __func__, i); + rc = -ENOMEM; + goto out; + } + memset(rep, 0, sizeof(struct rpcrdma_rep)); + buf->rb_recv_bufs[i] = rep; + buf->rb_recv_bufs[i]->rr_buffer = buf; + + rc = rpcrdma_register_internal(ia, rep->rr_base, + rlen - offsetof(struct rpcrdma_rep, rr_base), + &buf->rb_recv_bufs[i]->rr_handle, + &buf->rb_recv_bufs[i]->rr_iov); + if (rc) + goto out; + + } + dprintk("RPC: %s: max_requests %d\n", + __func__, buf->rb_max_requests); + /* done */ + return 0; +out: + rpcrdma_buffer_destroy(buf); + return rc; +} + +static void +rpcrdma_destroy_fmrs(struct rpcrdma_buffer *buf) +{ + struct rpcrdma_mw *r; + int rc; + + while (!list_empty(&buf->rb_all)) { + r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all); + list_del(&r->mw_all); + list_del(&r->mw_list); + + rc = ib_dealloc_fmr(r->r.fmr); + if (rc) + dprintk("RPC: %s: ib_dealloc_fmr failed %i\n", + __func__, rc); + + kfree(r); + } +} + +static void +rpcrdma_destroy_frmrs(struct rpcrdma_buffer *buf) +{ + struct rpcrdma_mw *r; + int rc; + + while (!list_empty(&buf->rb_all)) { + r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all); + list_del(&r->mw_all); + list_del(&r->mw_list); + + rc = ib_dereg_mr(r->r.frmr.fr_mr); + if (rc) + dprintk("RPC: %s: ib_dereg_mr failed %i\n", + __func__, rc); + ib_free_fast_reg_page_list(r->r.frmr.fr_pgl); + + kfree(r); + } +} + +void +rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) +{ + struct rpcrdma_ia *ia = rdmab_to_ia(buf); + int i; + + /* clean up in reverse order from create + * 1. recv mr memory (mr free, then kfree) + * 2. send mr memory (mr free, then kfree) + * 3. MWs + */ + dprintk("RPC: %s: entering\n", __func__); + + for (i = 0; i < buf->rb_max_requests; i++) { + if (buf->rb_recv_bufs && buf->rb_recv_bufs[i]) { + rpcrdma_deregister_internal(ia, + buf->rb_recv_bufs[i]->rr_handle, + &buf->rb_recv_bufs[i]->rr_iov); + kfree(buf->rb_recv_bufs[i]); + } + if (buf->rb_send_bufs && buf->rb_send_bufs[i]) { + rpcrdma_deregister_internal(ia, + buf->rb_send_bufs[i]->rl_handle, + &buf->rb_send_bufs[i]->rl_iov); + kfree(buf->rb_send_bufs[i]); + } + } + + switch (ia->ri_memreg_strategy) { + case RPCRDMA_FRMR: + rpcrdma_destroy_frmrs(buf); + break; + case RPCRDMA_MTHCAFMR: + rpcrdma_destroy_fmrs(buf); + break; + default: + break; + } + + kfree(buf->rb_pool); +} + +/* After a disconnect, a flushed FAST_REG_MR can leave an FRMR in + * an unusable state. Find FRMRs in this state and dereg / reg + * each. FRMRs that are VALID and attached to an rpcrdma_req are + * also torn down. + * + * This gives all in-use FRMRs a fresh rkey and leaves them INVALID. + * + * This is invoked only in the transport connect worker in order + * to serialize with rpcrdma_register_frmr_external(). + */ +static void +rpcrdma_reset_frmrs(struct rpcrdma_ia *ia) +{ + struct rpcrdma_xprt *r_xprt = + container_of(ia, struct rpcrdma_xprt, rx_ia); + struct rpcrdma_buffer *buf = &r_xprt->rx_buf; + struct list_head *pos; + struct rpcrdma_mw *r; + int rc; + + list_for_each(pos, &buf->rb_all) { + r = list_entry(pos, struct rpcrdma_mw, mw_all); + + if (r->r.frmr.fr_state == FRMR_IS_INVALID) + continue; + + rc = ib_dereg_mr(r->r.frmr.fr_mr); + if (rc) + dprintk("RPC: %s: ib_dereg_mr failed %i\n", + __func__, rc); + ib_free_fast_reg_page_list(r->r.frmr.fr_pgl); + + r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd, + ia->ri_max_frmr_depth); + if (IS_ERR(r->r.frmr.fr_mr)) { + rc = PTR_ERR(r->r.frmr.fr_mr); + dprintk("RPC: %s: ib_alloc_fast_reg_mr" + " failed %i\n", __func__, rc); + continue; + } + r->r.frmr.fr_pgl = ib_alloc_fast_reg_page_list( + ia->ri_id->device, + ia->ri_max_frmr_depth); + if (IS_ERR(r->r.frmr.fr_pgl)) { + rc = PTR_ERR(r->r.frmr.fr_pgl); + dprintk("RPC: %s: " + "ib_alloc_fast_reg_page_list " + "failed %i\n", __func__, rc); + + ib_dereg_mr(r->r.frmr.fr_mr); + continue; + } + r->r.frmr.fr_state = FRMR_IS_INVALID; + } +} + +/* "*mw" can be NULL when rpcrdma_buffer_get_mrs() fails, leaving + * some req segments uninitialized. + */ +static void +rpcrdma_buffer_put_mr(struct rpcrdma_mw **mw, struct rpcrdma_buffer *buf) +{ + if (*mw) { + list_add_tail(&(*mw)->mw_list, &buf->rb_mws); + *mw = NULL; + } +} + +/* Cycle mw's back in reverse order, and "spin" them. + * This delays and scrambles reuse as much as possible. + */ +static void +rpcrdma_buffer_put_mrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf) +{ + struct rpcrdma_mr_seg *seg = req->rl_segments; + struct rpcrdma_mr_seg *seg1 = seg; + int i; + + for (i = 1, seg++; i < RPCRDMA_MAX_SEGS; seg++, i++) + rpcrdma_buffer_put_mr(&seg->mr_chunk.rl_mw, buf); + rpcrdma_buffer_put_mr(&seg1->mr_chunk.rl_mw, buf); +} + +static void +rpcrdma_buffer_put_sendbuf(struct rpcrdma_req *req, struct rpcrdma_buffer *buf) +{ + buf->rb_send_bufs[--buf->rb_send_index] = req; + req->rl_niovs = 0; + if (req->rl_reply) { + buf->rb_recv_bufs[--buf->rb_recv_index] = req->rl_reply; + req->rl_reply->rr_func = NULL; + req->rl_reply = NULL; + } +} + +/* rpcrdma_unmap_one() was already done by rpcrdma_deregister_frmr_external(). + * Redo only the ib_post_send(). + */ +static void +rpcrdma_retry_local_inv(struct rpcrdma_mw *r, struct rpcrdma_ia *ia) +{ + struct rpcrdma_xprt *r_xprt = + container_of(ia, struct rpcrdma_xprt, rx_ia); + struct ib_send_wr invalidate_wr, *bad_wr; + int rc; + + dprintk("RPC: %s: FRMR %p is stale\n", __func__, r); + + /* When this FRMR is re-inserted into rb_mws, it is no longer stale */ + r->r.frmr.fr_state = FRMR_IS_INVALID; + + memset(&invalidate_wr, 0, sizeof(invalidate_wr)); + invalidate_wr.wr_id = (unsigned long)(void *)r; + invalidate_wr.opcode = IB_WR_LOCAL_INV; + invalidate_wr.ex.invalidate_rkey = r->r.frmr.fr_mr->rkey; + DECR_CQCOUNT(&r_xprt->rx_ep); + + dprintk("RPC: %s: frmr %p invalidating rkey %08x\n", + __func__, r, r->r.frmr.fr_mr->rkey); + + read_lock(&ia->ri_qplock); + rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr); + read_unlock(&ia->ri_qplock); + if (rc) { + /* Force rpcrdma_buffer_get() to retry */ + r->r.frmr.fr_state = FRMR_IS_STALE; + dprintk("RPC: %s: ib_post_send failed, %i\n", + __func__, rc); + } +} + +static void +rpcrdma_retry_flushed_linv(struct list_head *stale, + struct rpcrdma_buffer *buf) +{ + struct rpcrdma_ia *ia = rdmab_to_ia(buf); + struct list_head *pos; + struct rpcrdma_mw *r; + unsigned long flags; + + list_for_each(pos, stale) { + r = list_entry(pos, struct rpcrdma_mw, mw_list); + rpcrdma_retry_local_inv(r, ia); + } + + spin_lock_irqsave(&buf->rb_lock, flags); + list_splice_tail(stale, &buf->rb_mws); + spin_unlock_irqrestore(&buf->rb_lock, flags); +} + +static struct rpcrdma_req * +rpcrdma_buffer_get_frmrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf, + struct list_head *stale) +{ + struct rpcrdma_mw *r; + int i; + + i = RPCRDMA_MAX_SEGS - 1; + while (!list_empty(&buf->rb_mws)) { + r = list_entry(buf->rb_mws.next, + struct rpcrdma_mw, mw_list); + list_del(&r->mw_list); + if (r->r.frmr.fr_state == FRMR_IS_STALE) { + list_add(&r->mw_list, stale); + continue; + } + req->rl_segments[i].mr_chunk.rl_mw = r; + if (unlikely(i-- == 0)) + return req; /* Success */ + } + + /* Not enough entries on rb_mws for this req */ + rpcrdma_buffer_put_sendbuf(req, buf); + rpcrdma_buffer_put_mrs(req, buf); + return NULL; +} + +static struct rpcrdma_req * +rpcrdma_buffer_get_fmrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf) +{ + struct rpcrdma_mw *r; + int i; + + i = RPCRDMA_MAX_SEGS - 1; + while (!list_empty(&buf->rb_mws)) { + r = list_entry(buf->rb_mws.next, + struct rpcrdma_mw, mw_list); + list_del(&r->mw_list); + req->rl_segments[i].mr_chunk.rl_mw = r; + if (unlikely(i-- == 0)) + return req; /* Success */ + } + + /* Not enough entries on rb_mws for this req */ + rpcrdma_buffer_put_sendbuf(req, buf); + rpcrdma_buffer_put_mrs(req, buf); + return NULL; +} + +/* + * Get a set of request/reply buffers. + * + * Reply buffer (if needed) is attached to send buffer upon return. + * Rule: + * rb_send_index and rb_recv_index MUST always be pointing to the + * *next* available buffer (non-NULL). They are incremented after + * removing buffers, and decremented *before* returning them. + */ +struct rpcrdma_req * +rpcrdma_buffer_get(struct rpcrdma_buffer *buffers) +{ + struct rpcrdma_ia *ia = rdmab_to_ia(buffers); + struct list_head stale; + struct rpcrdma_req *req; + unsigned long flags; + + spin_lock_irqsave(&buffers->rb_lock, flags); + if (buffers->rb_send_index == buffers->rb_max_requests) { + spin_unlock_irqrestore(&buffers->rb_lock, flags); + dprintk("RPC: %s: out of request buffers\n", __func__); + return ((struct rpcrdma_req *)NULL); + } + + req = buffers->rb_send_bufs[buffers->rb_send_index]; + if (buffers->rb_send_index < buffers->rb_recv_index) { + dprintk("RPC: %s: %d extra receives outstanding (ok)\n", + __func__, + buffers->rb_recv_index - buffers->rb_send_index); + req->rl_reply = NULL; + } else { + req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index]; + buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL; + } + buffers->rb_send_bufs[buffers->rb_send_index++] = NULL; + + INIT_LIST_HEAD(&stale); + switch (ia->ri_memreg_strategy) { + case RPCRDMA_FRMR: + req = rpcrdma_buffer_get_frmrs(req, buffers, &stale); + break; + case RPCRDMA_MTHCAFMR: + req = rpcrdma_buffer_get_fmrs(req, buffers); + break; + default: + break; + } + spin_unlock_irqrestore(&buffers->rb_lock, flags); + if (!list_empty(&stale)) + rpcrdma_retry_flushed_linv(&stale, buffers); + return req; +} + +/* + * Put request/reply buffers back into pool. + * Pre-decrement counter/array index. + */ +void +rpcrdma_buffer_put(struct rpcrdma_req *req) +{ + struct rpcrdma_buffer *buffers = req->rl_buffer; + struct rpcrdma_ia *ia = rdmab_to_ia(buffers); + unsigned long flags; + + spin_lock_irqsave(&buffers->rb_lock, flags); + rpcrdma_buffer_put_sendbuf(req, buffers); + switch (ia->ri_memreg_strategy) { + case RPCRDMA_FRMR: + case RPCRDMA_MTHCAFMR: + rpcrdma_buffer_put_mrs(req, buffers); + break; + default: + break; + } + spin_unlock_irqrestore(&buffers->rb_lock, flags); +} + +/* + * Recover reply buffers from pool. + * This happens when recovering from error conditions. + * Post-increment counter/array index. + */ +void +rpcrdma_recv_buffer_get(struct rpcrdma_req *req) +{ + struct rpcrdma_buffer *buffers = req->rl_buffer; + unsigned long flags; + + if (req->rl_iov.length == 0) /* special case xprt_rdma_allocate() */ + buffers = ((struct rpcrdma_req *) buffers)->rl_buffer; + spin_lock_irqsave(&buffers->rb_lock, flags); + if (buffers->rb_recv_index < buffers->rb_max_requests) { + req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index]; + buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL; + } + spin_unlock_irqrestore(&buffers->rb_lock, flags); +} + +/* + * Put reply buffers back into pool when not attached to + * request. This happens in error conditions. + */ +void +rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep) +{ + struct rpcrdma_buffer *buffers = rep->rr_buffer; + unsigned long flags; + + rep->rr_func = NULL; + spin_lock_irqsave(&buffers->rb_lock, flags); + buffers->rb_recv_bufs[--buffers->rb_recv_index] = rep; + spin_unlock_irqrestore(&buffers->rb_lock, flags); +} + +/* + * Wrappers for internal-use kmalloc memory registration, used by buffer code. + */ + +int +rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len, + struct ib_mr **mrp, struct ib_sge *iov) +{ + struct ib_phys_buf ipb; + struct ib_mr *mr; + int rc; + + /* + * All memory passed here was kmalloc'ed, therefore phys-contiguous. + */ + iov->addr = ib_dma_map_single(ia->ri_id->device, + va, len, DMA_BIDIRECTIONAL); + if (ib_dma_mapping_error(ia->ri_id->device, iov->addr)) + return -ENOMEM; + + iov->length = len; + + if (ia->ri_have_dma_lkey) { + *mrp = NULL; + iov->lkey = ia->ri_dma_lkey; + return 0; + } else if (ia->ri_bind_mem != NULL) { + *mrp = NULL; + iov->lkey = ia->ri_bind_mem->lkey; + return 0; + } + + ipb.addr = iov->addr; + ipb.size = iov->length; + mr = ib_reg_phys_mr(ia->ri_pd, &ipb, 1, + IB_ACCESS_LOCAL_WRITE, &iov->addr); + + dprintk("RPC: %s: phys convert: 0x%llx " + "registered 0x%llx length %d\n", + __func__, (unsigned long long)ipb.addr, + (unsigned long long)iov->addr, len); + + if (IS_ERR(mr)) { + *mrp = NULL; + rc = PTR_ERR(mr); + dprintk("RPC: %s: failed with %i\n", __func__, rc); + } else { + *mrp = mr; + iov->lkey = mr->lkey; + rc = 0; + } + + return rc; +} + +int +rpcrdma_deregister_internal(struct rpcrdma_ia *ia, + struct ib_mr *mr, struct ib_sge *iov) +{ + int rc; + + ib_dma_unmap_single(ia->ri_id->device, + iov->addr, iov->length, DMA_BIDIRECTIONAL); + + if (NULL == mr) + return 0; + + rc = ib_dereg_mr(mr); + if (rc) + dprintk("RPC: %s: ib_dereg_mr failed %i\n", __func__, rc); + return rc; +} + +/* + * Wrappers for chunk registration, shared by read/write chunk code. + */ + +static void +rpcrdma_map_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg, int writing) +{ + seg->mr_dir = writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE; + seg->mr_dmalen = seg->mr_len; + if (seg->mr_page) + seg->mr_dma = ib_dma_map_page(ia->ri_id->device, + seg->mr_page, offset_in_page(seg->mr_offset), + seg->mr_dmalen, seg->mr_dir); + else + seg->mr_dma = ib_dma_map_single(ia->ri_id->device, + seg->mr_offset, + seg->mr_dmalen, seg->mr_dir); + if (ib_dma_mapping_error(ia->ri_id->device, seg->mr_dma)) { + dprintk("RPC: %s: mr_dma %llx mr_offset %p mr_dma_len %zu\n", + __func__, + (unsigned long long)seg->mr_dma, + seg->mr_offset, seg->mr_dmalen); + } +} + +static void +rpcrdma_unmap_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg) +{ + if (seg->mr_page) + ib_dma_unmap_page(ia->ri_id->device, + seg->mr_dma, seg->mr_dmalen, seg->mr_dir); + else + ib_dma_unmap_single(ia->ri_id->device, + seg->mr_dma, seg->mr_dmalen, seg->mr_dir); +} + +static int +rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg, + int *nsegs, int writing, struct rpcrdma_ia *ia, + struct rpcrdma_xprt *r_xprt) +{ + struct rpcrdma_mr_seg *seg1 = seg; + struct rpcrdma_mw *mw = seg1->mr_chunk.rl_mw; + struct rpcrdma_frmr *frmr = &mw->r.frmr; + struct ib_mr *mr = frmr->fr_mr; + struct ib_send_wr fastreg_wr, *bad_wr; + u8 key; + int len, pageoff; + int i, rc; + int seg_len; + u64 pa; + int page_no; + + pageoff = offset_in_page(seg1->mr_offset); + seg1->mr_offset -= pageoff; /* start of page */ + seg1->mr_len += pageoff; + len = -pageoff; + if (*nsegs > ia->ri_max_frmr_depth) + *nsegs = ia->ri_max_frmr_depth; + for (page_no = i = 0; i < *nsegs;) { + rpcrdma_map_one(ia, seg, writing); + pa = seg->mr_dma; + for (seg_len = seg->mr_len; seg_len > 0; seg_len -= PAGE_SIZE) { + frmr->fr_pgl->page_list[page_no++] = pa; + pa += PAGE_SIZE; + } + len += seg->mr_len; + ++seg; + ++i; + /* Check for holes */ + if ((i < *nsegs && offset_in_page(seg->mr_offset)) || + offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len)) + break; + } + dprintk("RPC: %s: Using frmr %p to map %d segments\n", + __func__, mw, i); + + frmr->fr_state = FRMR_IS_VALID; + + memset(&fastreg_wr, 0, sizeof(fastreg_wr)); + fastreg_wr.wr_id = (unsigned long)(void *)mw; + fastreg_wr.opcode = IB_WR_FAST_REG_MR; + fastreg_wr.wr.fast_reg.iova_start = seg1->mr_dma; + fastreg_wr.wr.fast_reg.page_list = frmr->fr_pgl; + fastreg_wr.wr.fast_reg.page_list_len = page_no; + fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT; + fastreg_wr.wr.fast_reg.length = page_no << PAGE_SHIFT; + if (fastreg_wr.wr.fast_reg.length < len) { + rc = -EIO; + goto out_err; + } + + /* Bump the key */ + key = (u8)(mr->rkey & 0x000000FF); + ib_update_fast_reg_key(mr, ++key); + + fastreg_wr.wr.fast_reg.access_flags = (writing ? + IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE : + IB_ACCESS_REMOTE_READ); + fastreg_wr.wr.fast_reg.rkey = mr->rkey; + DECR_CQCOUNT(&r_xprt->rx_ep); + + rc = ib_post_send(ia->ri_id->qp, &fastreg_wr, &bad_wr); + if (rc) { + dprintk("RPC: %s: failed ib_post_send for register," + " status %i\n", __func__, rc); + ib_update_fast_reg_key(mr, --key); + goto out_err; + } else { + seg1->mr_rkey = mr->rkey; + seg1->mr_base = seg1->mr_dma + pageoff; + seg1->mr_nsegs = i; + seg1->mr_len = len; + } + *nsegs = i; + return 0; +out_err: + frmr->fr_state = FRMR_IS_INVALID; + while (i--) + rpcrdma_unmap_one(ia, --seg); + return rc; +} + +static int +rpcrdma_deregister_frmr_external(struct rpcrdma_mr_seg *seg, + struct rpcrdma_ia *ia, struct rpcrdma_xprt *r_xprt) +{ + struct rpcrdma_mr_seg *seg1 = seg; + struct ib_send_wr invalidate_wr, *bad_wr; + int rc; + + seg1->mr_chunk.rl_mw->r.frmr.fr_state = FRMR_IS_INVALID; + + memset(&invalidate_wr, 0, sizeof invalidate_wr); + invalidate_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw; + invalidate_wr.opcode = IB_WR_LOCAL_INV; + invalidate_wr.ex.invalidate_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey; + DECR_CQCOUNT(&r_xprt->rx_ep); + + read_lock(&ia->ri_qplock); + while (seg1->mr_nsegs--) + rpcrdma_unmap_one(ia, seg++); + rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr); + read_unlock(&ia->ri_qplock); + if (rc) { + /* Force rpcrdma_buffer_get() to retry */ + seg1->mr_chunk.rl_mw->r.frmr.fr_state = FRMR_IS_STALE; + dprintk("RPC: %s: failed ib_post_send for invalidate," + " status %i\n", __func__, rc); + } + return rc; +} + +static int +rpcrdma_register_fmr_external(struct rpcrdma_mr_seg *seg, + int *nsegs, int writing, struct rpcrdma_ia *ia) +{ + struct rpcrdma_mr_seg *seg1 = seg; + u64 physaddrs[RPCRDMA_MAX_DATA_SEGS]; + int len, pageoff, i, rc; + + pageoff = offset_in_page(seg1->mr_offset); + seg1->mr_offset -= pageoff; /* start of page */ + seg1->mr_len += pageoff; + len = -pageoff; + if (*nsegs > RPCRDMA_MAX_DATA_SEGS) + *nsegs = RPCRDMA_MAX_DATA_SEGS; + for (i = 0; i < *nsegs;) { + rpcrdma_map_one(ia, seg, writing); + physaddrs[i] = seg->mr_dma; + len += seg->mr_len; + ++seg; + ++i; + /* Check for holes */ + if ((i < *nsegs && offset_in_page(seg->mr_offset)) || + offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len)) + break; + } + rc = ib_map_phys_fmr(seg1->mr_chunk.rl_mw->r.fmr, + physaddrs, i, seg1->mr_dma); + if (rc) { + dprintk("RPC: %s: failed ib_map_phys_fmr " + "%u@0x%llx+%i (%d)... status %i\n", __func__, + len, (unsigned long long)seg1->mr_dma, + pageoff, i, rc); + while (i--) + rpcrdma_unmap_one(ia, --seg); + } else { + seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.fmr->rkey; + seg1->mr_base = seg1->mr_dma + pageoff; + seg1->mr_nsegs = i; + seg1->mr_len = len; + } + *nsegs = i; + return rc; +} + +static int +rpcrdma_deregister_fmr_external(struct rpcrdma_mr_seg *seg, + struct rpcrdma_ia *ia) +{ + struct rpcrdma_mr_seg *seg1 = seg; + LIST_HEAD(l); + int rc; + + list_add(&seg1->mr_chunk.rl_mw->r.fmr->list, &l); + rc = ib_unmap_fmr(&l); + read_lock(&ia->ri_qplock); + while (seg1->mr_nsegs--) + rpcrdma_unmap_one(ia, seg++); + read_unlock(&ia->ri_qplock); + if (rc) + dprintk("RPC: %s: failed ib_unmap_fmr," + " status %i\n", __func__, rc); + return rc; +} + +int +rpcrdma_register_external(struct rpcrdma_mr_seg *seg, + int nsegs, int writing, struct rpcrdma_xprt *r_xprt) +{ + struct rpcrdma_ia *ia = &r_xprt->rx_ia; + int rc = 0; + + switch (ia->ri_memreg_strategy) { + + case RPCRDMA_ALLPHYSICAL: + rpcrdma_map_one(ia, seg, writing); + seg->mr_rkey = ia->ri_bind_mem->rkey; + seg->mr_base = seg->mr_dma; + seg->mr_nsegs = 1; + nsegs = 1; + break; + + /* Registration using frmr registration */ + case RPCRDMA_FRMR: + rc = rpcrdma_register_frmr_external(seg, &nsegs, writing, ia, r_xprt); + break; + + /* Registration using fmr memory registration */ + case RPCRDMA_MTHCAFMR: + rc = rpcrdma_register_fmr_external(seg, &nsegs, writing, ia); + break; + + default: + return -1; + } + if (rc) + return -1; + + return nsegs; +} + +int +rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg, + struct rpcrdma_xprt *r_xprt) +{ + struct rpcrdma_ia *ia = &r_xprt->rx_ia; + int nsegs = seg->mr_nsegs, rc; + + switch (ia->ri_memreg_strategy) { + + case RPCRDMA_ALLPHYSICAL: + read_lock(&ia->ri_qplock); + rpcrdma_unmap_one(ia, seg); + read_unlock(&ia->ri_qplock); + break; + + case RPCRDMA_FRMR: + rc = rpcrdma_deregister_frmr_external(seg, ia, r_xprt); + break; + + case RPCRDMA_MTHCAFMR: + rc = rpcrdma_deregister_fmr_external(seg, ia); + break; + + default: + break; + } + return nsegs; +} + +/* + * Prepost any receive buffer, then post send. + * + * Receive buffer is donated to hardware, reclaimed upon recv completion. + */ +int +rpcrdma_ep_post(struct rpcrdma_ia *ia, + struct rpcrdma_ep *ep, + struct rpcrdma_req *req) +{ + struct ib_send_wr send_wr, *send_wr_fail; + struct rpcrdma_rep *rep = req->rl_reply; + int rc; + + if (rep) { + rc = rpcrdma_ep_post_recv(ia, ep, rep); + if (rc) + goto out; + req->rl_reply = NULL; + } + + send_wr.next = NULL; + send_wr.wr_id = 0ULL; /* no send cookie */ + send_wr.sg_list = req->rl_send_iov; + send_wr.num_sge = req->rl_niovs; + send_wr.opcode = IB_WR_SEND; + if (send_wr.num_sge == 4) /* no need to sync any pad (constant) */ + ib_dma_sync_single_for_device(ia->ri_id->device, + req->rl_send_iov[3].addr, req->rl_send_iov[3].length, + DMA_TO_DEVICE); + ib_dma_sync_single_for_device(ia->ri_id->device, + req->rl_send_iov[1].addr, req->rl_send_iov[1].length, + DMA_TO_DEVICE); + ib_dma_sync_single_for_device(ia->ri_id->device, + req->rl_send_iov[0].addr, req->rl_send_iov[0].length, + DMA_TO_DEVICE); + + if (DECR_CQCOUNT(ep) > 0) + send_wr.send_flags = 0; + else { /* Provider must take a send completion every now and then */ + INIT_CQCOUNT(ep); + send_wr.send_flags = IB_SEND_SIGNALED; + } + + rc = ib_post_send(ia->ri_id->qp, &send_wr, &send_wr_fail); + if (rc) + dprintk("RPC: %s: ib_post_send returned %i\n", __func__, + rc); +out: + return rc; +} + +/* + * (Re)post a receive buffer. + */ +int +rpcrdma_ep_post_recv(struct rpcrdma_ia *ia, + struct rpcrdma_ep *ep, + struct rpcrdma_rep *rep) +{ + struct ib_recv_wr recv_wr, *recv_wr_fail; + int rc; + + recv_wr.next = NULL; + recv_wr.wr_id = (u64) (unsigned long) rep; + recv_wr.sg_list = &rep->rr_iov; + recv_wr.num_sge = 1; + + ib_dma_sync_single_for_cpu(ia->ri_id->device, + rep->rr_iov.addr, rep->rr_iov.length, DMA_BIDIRECTIONAL); + + rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail); + + if (rc) + dprintk("RPC: %s: ib_post_recv returned %i\n", __func__, + rc); + return rc; +} + +/* Physical mapping means one Read/Write list entry per-page. + * All list entries must fit within an inline buffer + * + * NB: The server must return a Write list for NFS READ, + * which has the same constraint. Factor in the inline + * rsize as well. + */ +static size_t +rpcrdma_physical_max_payload(struct rpcrdma_xprt *r_xprt) +{ + struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data; + unsigned int inline_size, pages; + + inline_size = min_t(unsigned int, + cdata->inline_wsize, cdata->inline_rsize); + inline_size -= RPCRDMA_HDRLEN_MIN; + pages = inline_size / sizeof(struct rpcrdma_segment); + return pages << PAGE_SHIFT; +} + +static size_t +rpcrdma_mr_max_payload(struct rpcrdma_xprt *r_xprt) +{ + return RPCRDMA_MAX_DATA_SEGS << PAGE_SHIFT; +} + +size_t +rpcrdma_max_payload(struct rpcrdma_xprt *r_xprt) +{ + size_t result; + + switch (r_xprt->rx_ia.ri_memreg_strategy) { + case RPCRDMA_ALLPHYSICAL: + result = rpcrdma_physical_max_payload(r_xprt); + break; + default: + result = rpcrdma_mr_max_payload(r_xprt); + } + return result; +} diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h new file mode 100644 index 0000000..ac7fc9a --- /dev/null +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -0,0 +1,402 @@ +/* + * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the BSD-type + * license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * Neither the name of the Network Appliance, Inc. nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _LINUX_SUNRPC_XPRT_RDMA_H +#define _LINUX_SUNRPC_XPRT_RDMA_H + +#include /* wait_queue_head_t, etc */ +#include /* spinlock_t, etc */ +#include /* atomic_t, etc */ +#include /* struct work_struct */ + +#include /* RDMA connection api */ +#include /* RDMA verbs api */ + +#include /* rpc_xprt */ +#include /* RPC/RDMA protocol */ +#include /* xprt parameters */ +#include /* RPCSVC_MAXPAYLOAD */ + +#define RDMA_RESOLVE_TIMEOUT (5000) /* 5 seconds */ +#define RDMA_CONNECT_RETRY_MAX (2) /* retries if no listener backlog */ + +/* + * Interface Adapter -- one per transport instance + */ +struct rpcrdma_ia { + rwlock_t ri_qplock; + struct rdma_cm_id *ri_id; + struct ib_pd *ri_pd; + struct ib_mr *ri_bind_mem; + u32 ri_dma_lkey; + int ri_have_dma_lkey; + struct completion ri_done; + int ri_async_rc; + enum rpcrdma_memreg ri_memreg_strategy; + unsigned int ri_max_frmr_depth; +}; + +/* + * RDMA Endpoint -- one per transport instance + */ + +#define RPCRDMA_WC_BUDGET (128) +#define RPCRDMA_POLLSIZE (16) + +struct rpcrdma_ep { + atomic_t rep_cqcount; + int rep_cqinit; + int rep_connected; + struct rpcrdma_ia *rep_ia; + struct ib_qp_init_attr rep_attr; + wait_queue_head_t rep_connect_wait; + struct ib_sge rep_pad; /* holds zeroed pad */ + struct ib_mr *rep_pad_mr; /* holds zeroed pad */ + void (*rep_func)(struct rpcrdma_ep *); + struct rpc_xprt *rep_xprt; /* for rep_func */ + struct rdma_conn_param rep_remote_cma; + struct sockaddr_storage rep_remote_addr; + struct delayed_work rep_connect_worker; + struct ib_wc rep_send_wcs[RPCRDMA_POLLSIZE]; + struct ib_wc rep_recv_wcs[RPCRDMA_POLLSIZE]; +}; + +#define INIT_CQCOUNT(ep) atomic_set(&(ep)->rep_cqcount, (ep)->rep_cqinit) +#define DECR_CQCOUNT(ep) atomic_sub_return(1, &(ep)->rep_cqcount) + +enum rpcrdma_chunktype { + rpcrdma_noch = 0, + rpcrdma_readch, + rpcrdma_areadch, + rpcrdma_writech, + rpcrdma_replych +}; + +/* + * struct rpcrdma_rep -- this structure encapsulates state required to recv + * and complete a reply, asychronously. It needs several pieces of + * state: + * o recv buffer (posted to provider) + * o ib_sge (also donated to provider) + * o status of reply (length, success or not) + * o bookkeeping state to get run by tasklet (list, etc) + * + * These are allocated during initialization, per-transport instance; + * however, the tasklet execution list itself is global, as it should + * always be pretty short. + * + * N of these are associated with a transport instance, and stored in + * struct rpcrdma_buffer. N is the max number of outstanding requests. + */ + +/* temporary static scatter/gather max */ +#define RPCRDMA_MAX_DATA_SEGS (64) /* max scatter/gather */ +#define RPCRDMA_MAX_SEGS (RPCRDMA_MAX_DATA_SEGS + 2) /* head+tail = 2 */ +#define MAX_RPCRDMAHDR (\ + /* max supported RPC/RDMA header */ \ + sizeof(struct rpcrdma_msg) + (2 * sizeof(u32)) + \ + (sizeof(struct rpcrdma_read_chunk) * RPCRDMA_MAX_SEGS) + sizeof(u32)) + +struct rpcrdma_buffer; + +struct rpcrdma_rep { + unsigned int rr_len; /* actual received reply length */ + struct rpcrdma_buffer *rr_buffer; /* home base for this structure */ + struct rpc_xprt *rr_xprt; /* needed for request/reply matching */ + void (*rr_func)(struct rpcrdma_rep *);/* called by tasklet in softint */ + struct list_head rr_list; /* tasklet list */ + struct ib_sge rr_iov; /* for posting */ + struct ib_mr *rr_handle; /* handle for mem in rr_iov */ + char rr_base[MAX_RPCRDMAHDR]; /* minimal inline receive buffer */ +}; + +/* + * struct rpcrdma_mw - external memory region metadata + * + * An external memory region is any buffer or page that is registered + * on the fly (ie, not pre-registered). + * + * Each rpcrdma_buffer has a list of free MWs anchored in rb_mws. During + * call_allocate, rpcrdma_buffer_get() assigns one to each segment in + * an rpcrdma_req. Then rpcrdma_register_external() grabs these to keep + * track of registration metadata while each RPC is pending. + * rpcrdma_deregister_external() uses this metadata to unmap and + * release these resources when an RPC is complete. + */ +enum rpcrdma_frmr_state { + FRMR_IS_INVALID, /* ready to be used */ + FRMR_IS_VALID, /* in use */ + FRMR_IS_STALE, /* failed completion */ +}; + +struct rpcrdma_frmr { + struct ib_fast_reg_page_list *fr_pgl; + struct ib_mr *fr_mr; + enum rpcrdma_frmr_state fr_state; +}; + +struct rpcrdma_mw { + union { + struct ib_fmr *fmr; + struct rpcrdma_frmr frmr; + } r; + struct list_head mw_list; + struct list_head mw_all; +}; + +/* + * struct rpcrdma_req -- structure central to the request/reply sequence. + * + * N of these are associated with a transport instance, and stored in + * struct rpcrdma_buffer. N is the max number of outstanding requests. + * + * It includes pre-registered buffer memory for send AND recv. + * The recv buffer, however, is not owned by this structure, and + * is "donated" to the hardware when a recv is posted. When a + * reply is handled, the recv buffer used is given back to the + * struct rpcrdma_req associated with the request. + * + * In addition to the basic memory, this structure includes an array + * of iovs for send operations. The reason is that the iovs passed to + * ib_post_{send,recv} must not be modified until the work request + * completes. + * + * NOTES: + * o RPCRDMA_MAX_SEGS is the max number of addressible chunk elements we + * marshal. The number needed varies depending on the iov lists that + * are passed to us, the memory registration mode we are in, and if + * physical addressing is used, the layout. + */ + +struct rpcrdma_mr_seg { /* chunk descriptors */ + union { /* chunk memory handles */ + struct ib_mr *rl_mr; /* if registered directly */ + struct rpcrdma_mw *rl_mw; /* if registered from region */ + } mr_chunk; + u64 mr_base; /* registration result */ + u32 mr_rkey; /* registration result */ + u32 mr_len; /* length of chunk or segment */ + int mr_nsegs; /* number of segments in chunk or 0 */ + enum dma_data_direction mr_dir; /* segment mapping direction */ + dma_addr_t mr_dma; /* segment mapping address */ + size_t mr_dmalen; /* segment mapping length */ + struct page *mr_page; /* owning page, if any */ + char *mr_offset; /* kva if no page, else offset */ +}; + +struct rpcrdma_req { + size_t rl_size; /* actual length of buffer */ + unsigned int rl_niovs; /* 0, 2 or 4 */ + unsigned int rl_nchunks; /* non-zero if chunks */ + unsigned int rl_connect_cookie; /* retry detection */ + enum rpcrdma_chunktype rl_rtype, rl_wtype; + struct rpcrdma_buffer *rl_buffer; /* home base for this structure */ + struct rpcrdma_rep *rl_reply;/* holder for reply buffer */ + struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS];/* chunk segments */ + struct ib_sge rl_send_iov[4]; /* for active requests */ + struct ib_sge rl_iov; /* for posting */ + struct ib_mr *rl_handle; /* handle for mem in rl_iov */ + char rl_base[MAX_RPCRDMAHDR]; /* start of actual buffer */ + __u32 rl_xdr_buf[0]; /* start of returned rpc rq_buffer */ +}; +#define rpcr_to_rdmar(r) \ + container_of((r)->rq_buffer, struct rpcrdma_req, rl_xdr_buf[0]) + +/* + * struct rpcrdma_buffer -- holds list/queue of pre-registered memory for + * inline requests/replies, and client/server credits. + * + * One of these is associated with a transport instance + */ +struct rpcrdma_buffer { + spinlock_t rb_lock; /* protects indexes */ + atomic_t rb_credits; /* most recent server credits */ + int rb_max_requests;/* client max requests */ + struct list_head rb_mws; /* optional memory windows/fmrs/frmrs */ + struct list_head rb_all; + int rb_send_index; + struct rpcrdma_req **rb_send_bufs; + int rb_recv_index; + struct rpcrdma_rep **rb_recv_bufs; + char *rb_pool; +}; +#define rdmab_to_ia(b) (&container_of((b), struct rpcrdma_xprt, rx_buf)->rx_ia) + +/* + * Internal structure for transport instance creation. This + * exists primarily for modularity. + * + * This data should be set with mount options + */ +struct rpcrdma_create_data_internal { + struct sockaddr_storage addr; /* RDMA server address */ + unsigned int max_requests; /* max requests (slots) in flight */ + unsigned int rsize; /* mount rsize - max read hdr+data */ + unsigned int wsize; /* mount wsize - max write hdr+data */ + unsigned int inline_rsize; /* max non-rdma read data payload */ + unsigned int inline_wsize; /* max non-rdma write data payload */ + unsigned int padding; /* non-rdma write header padding */ +}; + +#define RPCRDMA_INLINE_READ_THRESHOLD(rq) \ + (rpcx_to_rdmad(rq->rq_xprt).inline_rsize) + +#define RPCRDMA_INLINE_WRITE_THRESHOLD(rq)\ + (rpcx_to_rdmad(rq->rq_xprt).inline_wsize) + +#define RPCRDMA_INLINE_PAD_VALUE(rq)\ + rpcx_to_rdmad(rq->rq_xprt).padding + +/* + * Statistics for RPCRDMA + */ +struct rpcrdma_stats { + unsigned long read_chunk_count; + unsigned long write_chunk_count; + unsigned long reply_chunk_count; + + unsigned long long total_rdma_request; + unsigned long long total_rdma_reply; + + unsigned long long pullup_copy_count; + unsigned long long fixup_copy_count; + unsigned long hardway_register_count; + unsigned long failed_marshal_count; + unsigned long bad_reply_count; +}; + +/* + * RPCRDMA transport -- encapsulates the structures above for + * integration with RPC. + * + * The contained structures are embedded, not pointers, + * for convenience. This structure need not be visible externally. + * + * It is allocated and initialized during mount, and released + * during unmount. + */ +struct rpcrdma_xprt { + struct rpc_xprt xprt; + struct rpcrdma_ia rx_ia; + struct rpcrdma_ep rx_ep; + struct rpcrdma_buffer rx_buf; + struct rpcrdma_create_data_internal rx_data; + struct delayed_work rdma_connect; + struct rpcrdma_stats rx_stats; +}; + +#define rpcx_to_rdmax(x) container_of(x, struct rpcrdma_xprt, xprt) +#define rpcx_to_rdmad(x) (rpcx_to_rdmax(x)->rx_data) + +/* Setting this to 0 ensures interoperability with early servers. + * Setting this to 1 enhances certain unaligned read/write performance. + * Default is 0, see sysctl entry and rpc_rdma.c rpcrdma_convert_iovs() */ +extern int xprt_rdma_pad_optimize; + +/* + * Interface Adapter calls - xprtrdma/verbs.c + */ +int rpcrdma_ia_open(struct rpcrdma_xprt *, struct sockaddr *, int); +void rpcrdma_ia_close(struct rpcrdma_ia *); + +/* + * Endpoint calls - xprtrdma/verbs.c + */ +int rpcrdma_ep_create(struct rpcrdma_ep *, struct rpcrdma_ia *, + struct rpcrdma_create_data_internal *); +void rpcrdma_ep_destroy(struct rpcrdma_ep *, struct rpcrdma_ia *); +int rpcrdma_ep_connect(struct rpcrdma_ep *, struct rpcrdma_ia *); +void rpcrdma_ep_disconnect(struct rpcrdma_ep *, struct rpcrdma_ia *); + +int rpcrdma_ep_post(struct rpcrdma_ia *, struct rpcrdma_ep *, + struct rpcrdma_req *); +int rpcrdma_ep_post_recv(struct rpcrdma_ia *, struct rpcrdma_ep *, + struct rpcrdma_rep *); + +/* + * Buffer calls - xprtrdma/verbs.c + */ +int rpcrdma_buffer_create(struct rpcrdma_buffer *, struct rpcrdma_ep *, + struct rpcrdma_ia *, + struct rpcrdma_create_data_internal *); +void rpcrdma_buffer_destroy(struct rpcrdma_buffer *); + +struct rpcrdma_req *rpcrdma_buffer_get(struct rpcrdma_buffer *); +void rpcrdma_buffer_put(struct rpcrdma_req *); +void rpcrdma_recv_buffer_get(struct rpcrdma_req *); +void rpcrdma_recv_buffer_put(struct rpcrdma_rep *); + +int rpcrdma_register_internal(struct rpcrdma_ia *, void *, int, + struct ib_mr **, struct ib_sge *); +int rpcrdma_deregister_internal(struct rpcrdma_ia *, + struct ib_mr *, struct ib_sge *); + +int rpcrdma_register_external(struct rpcrdma_mr_seg *, + int, int, struct rpcrdma_xprt *); +int rpcrdma_deregister_external(struct rpcrdma_mr_seg *, + struct rpcrdma_xprt *); + +/* + * RPC/RDMA connection management calls - xprtrdma/rpc_rdma.c + */ +void rpcrdma_connect_worker(struct work_struct *); +void rpcrdma_conn_func(struct rpcrdma_ep *); +void rpcrdma_reply_handler(struct rpcrdma_rep *); + +/* + * RPC/RDMA protocol calls - xprtrdma/rpc_rdma.c + */ +ssize_t rpcrdma_marshal_chunks(struct rpc_rqst *, ssize_t); +int rpcrdma_marshal_req(struct rpc_rqst *); +size_t rpcrdma_max_payload(struct rpcrdma_xprt *); + +/* Temporary NFS request map cache. Created in svc_rdma.c */ +extern struct kmem_cache *svc_rdma_map_cachep; +/* WR context cache. Created in svc_rdma.c */ +extern struct kmem_cache *svc_rdma_ctxt_cachep; +/* Workqueue created in svc_rdma.c */ +extern struct workqueue_struct *svc_rdma_wq; + +#if RPCSVC_MAXPAYLOAD < (RPCRDMA_MAX_DATA_SEGS << PAGE_SHIFT) +#define RPCSVC_MAXPAYLOAD_RDMA RPCSVC_MAXPAYLOAD +#else +#define RPCSVC_MAXPAYLOAD_RDMA (RPCRDMA_MAX_DATA_SEGS << PAGE_SHIFT) +#endif + +#endif /* _LINUX_SUNRPC_XPRT_RDMA_H */